aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm/lib/Target
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target')
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64.h66
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64.td349
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp244
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp733
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp484
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp414
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp716
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp317
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h56
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h139
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td337
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp151
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp1108
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp438
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp913
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp149
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp965
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp5099
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp1200
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h79
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def173
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp3966
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp10666
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h607
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td404
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td9516
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp4173
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h318
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td6132
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp1161
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h47
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp204
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h30
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp1728
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp216
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h52
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h198
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp383
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h38
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h6591
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp566
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp179
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp577
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h66
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp450
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h106
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td635
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td293
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td664
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td544
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td869
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td26
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td133
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td2358
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td380
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td852
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64Schedule.td106
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp59
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h31
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp171
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp188
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h265
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td1018
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp489
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h76
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp72
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h47
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp643
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h139
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp371
-rw-r--r--contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp4625
-rw-r--r--contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp1588
-rw-r--r--contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h38
-rw-r--r--contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp222
-rw-r--r--contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h38
-rw-r--r--contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp1475
-rw-r--r--contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h188
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h803
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp612
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp346
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp217
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h26
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h76
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp100
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h38
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp603
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp145
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h167
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp169
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h87
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp431
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp41
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h42
-rw-r--r--contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp39
-rw-r--r--contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp122
-rw-r--r--contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h524
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPU.h175
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPU.td530
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp75
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp222
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp189
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp826
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h160
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp42
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h34
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td135
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp480
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp102
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h47
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp1632
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp3176
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h338
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp115
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h57
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td330
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td677
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp110
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h58
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td36
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp242
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h46
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp47
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h77
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp372
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h42
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp840
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp52
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h43
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td25
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h193
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp362
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h607
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp645
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h103
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp30
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h32
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp340
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h98
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp149
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp1745
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h658
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp3599
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td1350
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/CIInstructions.td15
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td222
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/DSInstructions.td906
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp609
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h130
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td663
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td530
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp502
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h71
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp312
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h54
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp1108
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h194
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp200
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp87
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp21
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h39
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h28
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp45
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h33
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp21
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h63
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp112
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h64
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp408
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h26
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp242
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h111
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp189
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp335
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td763
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Processors.td189
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp213
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp699
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Defines.h171
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp339
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp270
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp14
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h32
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp2202
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h104
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td495
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp1483
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h331
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Instructions.td1722
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td67
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp16
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h28
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp467
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h100
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp401
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp409
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp98
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h54
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td252
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Schedule.td49
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R700Instructions.td21
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp425
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp96
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIDefines.h393
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp88
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp462
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp622
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp400
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h64
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp4519
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h200
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp329
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp679
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td285
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp3642
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h794
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td1254
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstructions.td1089
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td209
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp531
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp468
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp161
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp226
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h500
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp1898
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h493
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp304
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp1476
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h282
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td465
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SISchedule.td138
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp512
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp156
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp730
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SMInstructions.td535
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td1229
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp37
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp69
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h31
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp461
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h216
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h152
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp181
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h39
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td20
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VIInstructions.td14
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td621
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td757
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td447
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td1144
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td350
-rw-r--r--contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp710
-rw-r--r--contrib/llvm/lib/Target/ARM/ARM.h59
-rw-r--r--contrib/llvm/lib/Target/ARM/ARM.td873
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp2165
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h161
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp4707
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h522
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp843
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h201
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h110
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp203
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallLowering.h42
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallingConv.h288
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallingConv.td310
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp72
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp2258
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp281
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h272
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp1686
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFastISel.cpp3045
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFeatures.h97
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp2321
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFrameLowering.h85
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp101
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h49
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp4682
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp13504
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMISelLowering.h720
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrFormats.td2531
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp136
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrInfo.h47
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrInfo.td5857
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrNEON.td8191
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrThumb.td1610
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td4673
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrVFP.td2308
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp109
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h39
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp44
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h29
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp2389
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp263
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp24
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h255
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp105
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h6591
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp127
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h41
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp19
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h31
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td430
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSchedule.td367
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleA8.td1075
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleA9.td2529
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleR52.td983
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td1046
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleV6.td300
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp261
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h69
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp382
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSubtarget.h661
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp544
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetMachine.h131
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp92
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h50
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp538
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h139
-rw-r--r--contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp10352
-rw-r--r--contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp5313
-rw-r--r--contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp1669
-rw-r--r--contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h238
-rwxr-xr-xcontrib/llvm/lib/Target/ARM/LICENSE.TXT47
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h762
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp1168
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h80
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h36
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h31
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h27
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h466
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp289
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp1403
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h120
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp115
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h56
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp1702
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp41
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h79
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp331
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h131
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp43
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp486
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp77
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp196
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h93
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp91
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp47
-rw-r--r--contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp395
-rw-r--r--contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp41
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp884
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h93
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp129
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h61
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp304
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp643
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h74
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp1106
-rw-r--r--contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp625
-rw-r--r--contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h66
-rw-r--r--contrib/llvm/lib/Target/AVR/AVR.h58
-rw-r--r--contrib/llvm/lib/Target/AVR/AVR.td81
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp184
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRCallingConv.td58
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRDevices.td491
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp1515
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp538
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRFrameLowering.h46
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp565
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp1937
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRISelLowering.h163
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrFormats.td579
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp498
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrInfo.h112
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrInfo.td2047
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp222
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp100
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRMCInstLower.h43
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h69
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp266
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h58
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td216
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp149
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h28
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp47
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRSubtarget.h119
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp118
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRTargetMachine.h51
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp41
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h33
-rw-r--r--contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp631
-rw-r--r--contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp156
-rw-r--r--contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp171
-rw-r--r--contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.h54
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp473
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h78
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp127
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp66
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h29
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h149
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp28
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h31
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp304
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h115
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp189
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h88
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp121
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h59
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp24
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h32
-rw-r--r--contrib/llvm/lib/Target/AVR/README.md8
-rw-r--r--contrib/llvm/lib/Target/AVR/TODO.md7
-rw-r--r--contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp23
-rw-r--r--contrib/llvm/lib/Target/BPF/BPF.h22
-rw-r--r--contrib/llvm/lib/Target/BPF/BPF.td44
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp61
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFCallingConv.td29
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFFrameLowering.cpp40
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFFrameLowering.h41
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp186
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp596
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFISelLowering.h93
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFInstrFormats.td33
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp174
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFInstrInfo.h61
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFInstrInfo.td578
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp76
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFMCInstLower.h43
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp103
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h40
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFRegisterInfo.td41
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp31
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFSubtarget.h64
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp82
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFTargetMachine.h44
-rw-r--r--contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp154
-rw-r--r--contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp94
-rw-r--r--contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h40
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp109
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp58
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h50
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp179
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp116
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h71
-rw-r--r--contrib/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp36
-rw-r--r--contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp2154
-rw-r--r--contrib/llvm/lib/Target/Hexagon/BitTracker.cpp1144
-rw-r--r--contrib/llvm/lib/Target/Hexagon/BitTracker.h438
-rw-r--r--contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp1624
-rw-r--r--contrib/llvm/lib/Target/Hexagon/Hexagon.h56
-rw-r--r--contrib/llvm/lib/Target/Hexagon/Hexagon.td295
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp604
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h62
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp2881
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp1191
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h64
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp483
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h244
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp219
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp253
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td35
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp1304
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp3149
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp887
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp1034
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp1283
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp194
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp2441
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h159
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp271
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp1601
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp351
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp538
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp1980
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp140
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h78
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp2002
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp3323
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h294
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td652
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td1019
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td445
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td160
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td238
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp4283
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h443
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td4799
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td215
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td3301
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td497
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td2068
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td69
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td1353
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td40
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td27
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td305
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td111
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td803
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td728
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp170
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp16
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h80
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp1031
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h254
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp701
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonOperands.td332
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp678
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp148
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td3347
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp338
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp330
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp272
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h84
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td286
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td24
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td203
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td194
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td301
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp63
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h35
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp115
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp1205
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp609
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp393
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h157
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td134
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp345
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h50
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp382
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h50
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h31
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp71
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h69
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp1683
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h117
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp209
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp753
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h308
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp295
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h138
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp227
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h92
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp37
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h31
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp585
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h217
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp824
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h75
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp425
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp1081
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp164
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h44
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp78
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h47
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp809
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h302
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp236
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h65
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp278
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h79
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp463
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h190
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp240
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFCopy.h55
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp232
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h67
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp1950
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFGraph.h997
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp1030
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFLiveness.h134
-rw-r--r--contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp23
-rw-r--r--contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp1213
-rw-r--r--contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp240
-rw-r--r--contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h41
-rw-r--r--contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp305
-rw-r--r--contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h65
-rw-r--r--contrib/llvm/lib/Target/Lanai/Lanai.h51
-rw-r--r--contrib/llvm/lib/Target/Lanai/Lanai.td47
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiAluCode.h148
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp243
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiCallingConv.td50
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiCondCode.h100
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp262
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp220
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.h57
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp337
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp1488
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h149
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td561
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp808
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h186
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td884
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp139
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.h47
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp23
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h58
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp425
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp287
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h63
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.td64
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiSchedule.td70
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp35
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h36
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiSubtarget.cpp47
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiSubtarget.h76
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp113
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h55
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp132
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h46
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h81
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp172
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h119
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp95
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h43
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp43
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h31
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp309
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp60
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h56
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp154
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h61
-rw-r--r--contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp25
-rw-r--r--contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp116
-rw-r--r--contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h43
-rw-r--r--contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp26
-rw-r--r--contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h31
-rw-r--r--contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp79
-rw-r--r--contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h38
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430.h47
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430.td60
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp159
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp257
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td37
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp301
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h54
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp469
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp1342
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h174
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td211
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp335
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h92
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td1211
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp157
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h47
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp14
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h54
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp161
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h46
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td81
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp37
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h69
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp80
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h49
-rw-r--r--contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp23
-rw-r--r--contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp6766
-rw-r--r--contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp2505
-rw-r--r--contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp294
-rw-r--r--contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h114
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp69
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h200
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp119
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h82
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp522
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h94
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h132
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp662
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp82
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h76
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h224
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp66
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h31
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp1171
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h278
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp287
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h91
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h30
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp201
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h84
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp268
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp95
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp1166
-rw-r--r--contrib/llvm/lib/Target/Mips/MSA.txt83
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td1094
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td1881
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td267
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td568
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td302
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td601
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td180
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td1049
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td1190
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips.h37
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips.td231
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp176
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h47
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp547
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp50
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h50
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp261
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h55
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp800
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h82
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td640
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp519
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h126
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td1910
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp148
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h48
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td578
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td1004
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td772
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td279
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp154
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h63
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp1073
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h147
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCCState.cpp136
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCCState.h136
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCallingConv.td406
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCondMov.td299
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp1694
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td369
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td1456
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp891
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td84
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td209
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsFastISel.cpp2081
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp159
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsFrameLowering.h54
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp160
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp270
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h144
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp4030
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelLowering.h614
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrFPU.td687
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrFormats.td968
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp503
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrInfo.h161
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrInfo.td2868
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp532
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp281
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMCInstLower.h48
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td455
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td3946
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp104
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMachineFunction.h132
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp50
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp301
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsOptionRecord.h78
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsOs16.cpp160
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp340
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h82
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td673
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp893
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h52
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp1067
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h146
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp3779
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h131
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp754
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h119
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp260
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h41
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSchedule.td674
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td1048
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td586
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp174
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSubtarget.h316
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp270
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetMachine.h96
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp150
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h48
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h324
-rw-r--r--contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp48
-rw-r--r--contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp286
-rw-r--r--contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h52
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h46
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp59
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h31
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp79
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h38
-rw-r--r--contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h48
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTX.h178
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTX.td96
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp70
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h23
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp2387
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h343
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp84
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp78
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h36
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp354
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp5259
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h100
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp4639
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h547
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp181
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp583
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td59
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp248
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h79
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td2807
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td7260
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp349
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h24
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp118
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp253
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp60
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h125
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h51
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp157
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp227
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp128
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h65
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td69
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp191
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXSection.h43
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp59
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h116
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp371
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h89
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h105
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp154
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h64
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp317
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h65
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXVector.td1479
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp152
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp219
-rw-r--r--contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp29
-rw-r--r--contrib/llvm/lib/Target/NVPTX/cl_common_defines.h122
-rw-r--r--contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp1966
-rw-r--r--contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp440
-rw-r--r--contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp506
-rw-r--r--contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h74
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp241
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp425
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h56
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp83
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h39
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp389
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp150
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h100
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp264
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h106
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp383
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp86
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h76
-rw-r--r--contrib/llvm/lib/Target/PowerPC/P9InstrResources.td808
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPC.h104
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPC.td468
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp1471
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp274
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp283
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCCCState.cpp36
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCCCState.h42
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp728
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h35
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td284
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp213
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp2374
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp2164
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h149
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp436
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h102
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp4529
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp12771
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h1031
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td1301
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td1451
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h43
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td1992
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td172
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp1933
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h297
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td4403
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td1216
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td447
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td2924
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp452
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp187
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp392
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp46
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h217
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h6591
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp166
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp1078
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h145
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td352
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCSchedule.td135
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td608
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td172
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td321
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td381
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td80
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td96
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td112
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td130
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td397
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td406
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td335
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp252
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h322
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp174
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp155
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp429
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h85
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp59
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h34
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h27
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp443
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h92
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp177
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp398
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp1035
-rw-r--r--contrib/llvm/lib/Target/PowerPC/README_P9.txt605
-rw-r--r--contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp37
-rw-r--r--contrib/llvm/lib/Target/PowerPC/p9-instrs.txt442
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp91
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp47
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp25
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h31
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp91
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp59
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h58
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCV.td27
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td152
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td55
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td90
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp58
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.h40
-rw-r--r--contrib/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp30
-rw-r--r--contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp1300
-rw-r--r--contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp512
-rw-r--r--contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp670
-rw-r--r--contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp197
-rw-r--r--contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h55
-rwxr-xr-xcontrib/llvm/lib/Target/Sparc/LeonFeatures.td82
-rwxr-xr-xcontrib/llvm/lib/Target/Sparc/LeonPasses.cpp374
-rwxr-xr-xcontrib/llvm/lib/Target/Sparc/LeonPasses.h115
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp306
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp140
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h97
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp70
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h38
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp229
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp221
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h112
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp170
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h62
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp46
-rw-r--r--contrib/llvm/lib/Target/Sparc/Sparc.h167
-rw-r--r--contrib/llvm/lib/Target/Sparc/Sparc.td159
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp449
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcCallingConv.td144
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp368
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h68
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp405
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp3574
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcISelLowering.h223
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td541
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td506
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td369
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp510
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h108
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td1692
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td263
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp108
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp14
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h56
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp237
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h50
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td377
-rwxr-xr-xcontrib/llvm/lib/Target/Sparc/SparcSchedule.td124
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp99
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcSubtarget.h122
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp197
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h79
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp42
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h35
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h49
-rw-r--r--contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp35
-rw-r--r--contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp1259
-rw-r--r--contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp451
-rw-r--r--contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp231
-rw-r--r--contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h75
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp126
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp29
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h26
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp284
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h32
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp164
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp246
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h108
-rw-r--r--contrib/llvm/lib/Target/SystemZ/README.txt154
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZ.h185
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZ.td75
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp527
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h42
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp21
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h130
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td122
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp52
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h58
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp575
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp153
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td171
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp549
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h64
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp337
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h128
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp1419
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp6294
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h595
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h46
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td507
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td4083
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp1752
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h309
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td1929
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td1200
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp146
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp465
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp103
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h44
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp17
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h79
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp153
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h112
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZOperands.td593
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZOperators.td684
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td169
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td35
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp159
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h67
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td306
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td77
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td1064
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td769
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td807
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp275
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h74
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp285
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp64
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h146
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp382
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp209
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h53
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp315
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h61
-rw-r--r--contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp23
-rw-r--r--contrib/llvm/lib/Target/Target.cpp141
-rw-r--r--contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp30
-rw-r--r--contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp326
-rw-r--r--contrib/llvm/lib/Target/TargetMachine.cpp221
-rw-r--r--contrib/llvm/lib/Target/TargetMachineC.cpp243
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp151
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp244
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h58
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp105
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp67
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp53
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h32
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp121
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp146
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h182
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp120
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h88
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/README.txt147
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp36
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssembly.h56
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssembly.td66
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp95
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp322
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp579
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp120
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp308
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp1274
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp296
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp257
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h57
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def25
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp118
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp707
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h98
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td47
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td130
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td114
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td111
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td101
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td102
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp204
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h63
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td222
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td97
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td686
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td19
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp128
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp1184
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp115
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h46
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp62
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h119
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp105
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp76
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp198
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp124
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp175
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp107
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp884
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp148
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h52
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td62
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp99
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp20
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h30
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp119
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp202
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp55
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h85
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp277
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h53
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp24
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h30
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp83
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h72
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp71
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h34
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt68
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp1077
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h68
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp3184
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h41
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h546
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp1083
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp1901
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h682
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h493
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp299
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h142
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp1197
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h30
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp260
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h162
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp881
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h784
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp303
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h40
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp171
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h61
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp1529
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp456
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h127
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp610
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp105
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp60
-rw-r--r--contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp29
-rw-r--r--contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp606
-rw-r--r--contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h162
-rw-r--r--contrib/llvm/lib/Target/X86/X86.h99
-rw-r--r--contrib/llvm/lib/Target/X86/X86.td856
-rw-r--r--contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp661
-rw-r--r--contrib/llvm/lib/Target/X86/X86AsmPrinter.h141
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp591
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallLowering.cpp46
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallLowering.h39
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallingConv.cpp208
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallingConv.h121
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallingConv.td1121
-rwxr-xr-xcontrib/llvm/lib/Target/X86/X86EvexToVex.cpp213
-rw-r--r--contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp297
-rw-r--r--contrib/llvm/lib/Target/X86/X86FastISel.cpp3933
-rw-r--r--contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp367
-rw-r--r--contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp418
-rw-r--r--contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp187
-rw-r--r--contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp1696
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.cpp2998
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.h218
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp2798
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelLowering.cpp34395
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelLowering.h1382
-rw-r--r--contrib/llvm/lib/Target/X86/X86Instr3DNow.td103
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrAVX512.td9181
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrArithmetic.td1375
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrBuilder.h233
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td112
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrCompiler.td1932
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrControl.td358
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrExtension.td186
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA.td443
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp285
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h315
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFPStack.td729
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFormats.td957
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td1099
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.cpp9731
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.h608
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.td3119
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrMMX.td675
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrMPX.td70
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSGX.td24
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSSE.td8711
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSVM.td62
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td970
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSystem.td622
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrTSX.td50
-rwxr-xr-xcontrib/llvm/lib/Target/X86/X86InstrTablesInfo.h1148
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrVMX.td66
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrXOP.td427
-rw-r--r--contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp221
-rw-r--r--contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h1794
-rw-r--r--contrib/llvm/lib/Target/X86/X86MCInstLower.cpp1795
-rw-r--r--contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp33
-rw-r--r--contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h185
-rw-r--r--contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp645
-rw-r--r--contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp219
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp758
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.h142
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.td530
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedHaswell.td2147
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td250
-rw-r--r--contrib/llvm/lib/Target/X86/X86Schedule.td661
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleAtom.td550
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td341
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleSLM.td233
-rw-r--r--contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp283
-rw-r--r--contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h50
-rw-r--r--contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp333
-rw-r--r--contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h52
-rw-r--r--contrib/llvm/lib/Target/X86/X86Subtarget.cpp365
-rw-r--r--contrib/llvm/lib/Target/X86/X86Subtarget.h633
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetMachine.cpp405
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetMachine.h48
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp184
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetObjectFile.h84
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp2250
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h116
-rw-r--r--contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp328
-rw-r--r--contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp295
-rw-r--r--contrib/llvm/lib/Target/X86/X86WinEHState.cpp796
-rw-r--r--contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp785
-rw-r--r--contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp88
-rw-r--r--contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h43
-rw-r--r--contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp33
-rw-r--r--contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h31
-rw-r--r--contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp151
-rw-r--r--contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h39
-rw-r--r--contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp22
-rw-r--r--contrib/llvm/lib/Target/XCore/XCore.h37
-rw-r--r--contrib/llvm/lib/Target/XCore/XCore.td47
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp300
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreCallingConv.td40
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp592
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h63
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp66
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp282
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp1948
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreISelLowering.h234
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td277
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp451
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h94
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td1312
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp234
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp114
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h41
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp72
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h106
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp330
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h55
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td59
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp51
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h35
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp31
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreSubtarget.h66
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp99
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h48
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp156
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h40
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h27
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h54
1278 files changed, 754706 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h
new file mode 100644
index 000000000000..fd106a8d9b0b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.h
@@ -0,0 +1,66 @@
+//==-- AArch64.h - Top-level interface for AArch64 --------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// AArch64 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64_H
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AArch64TargetMachine;
+class FunctionPass;
+class MachineFunctionPass;
+
+FunctionPass *createAArch64DeadRegisterDefinitions();
+FunctionPass *createAArch64RedundantCopyEliminationPass();
+FunctionPass *createAArch64ConditionalCompares();
+FunctionPass *createAArch64AdvSIMDScalar();
+FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+FunctionPass *createAArch64StorePairSuppressPass();
+FunctionPass *createAArch64ExpandPseudoPass();
+FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64VectorByElementOptPass();
+ModulePass *createAArch64PromoteConstantPass();
+FunctionPass *createAArch64ConditionOptimizerPass();
+FunctionPass *createAArch64AddressTypePromotionPass();
+FunctionPass *createAArch64A57FPLoadBalancing();
+FunctionPass *createAArch64A53Fix835769();
+
+FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
+
+FunctionPass *createAArch64CollectLOHPass();
+
+void initializeAArch64A53Fix835769Pass(PassRegistry&);
+void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
+void initializeAArch64AddressTypePromotionPass(PassRegistry&);
+void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
+void initializeAArch64CollectLOHPass(PassRegistry&);
+void initializeAArch64ConditionalComparesPass(PassRegistry&);
+void initializeAArch64ConditionOptimizerPass(PassRegistry&);
+void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
+void initializeAArch64ExpandPseudoPass(PassRegistry&);
+void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64VectorByElementOptPass(PassRegistry&);
+void initializeAArch64PromoteConstantPass(PassRegistry&);
+void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
+void initializeAArch64StorePairSuppressPass(PassRegistry&);
+void initializeLDTLSCleanupPass(PassRegistry&);
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
new file mode 100644
index 000000000000..c40391d5ad9d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -0,0 +1,349 @@
+//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing.
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// AArch64 Subtarget features.
+//
+
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
+ "Enable ARMv8 FP">;
+
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+ "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
+
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+ "Enable cryptographic instructions">;
+
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+ "Enable ARMv8 CRC-32 checksum instructions">;
+
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+ "Enable ARMv8 Reliability, Availability and Serviceability Extensions">;
+
+def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
+ "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
+
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+ "Enable ARMv8 PMUv3 Performance Monitors extension">;
+
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+ "Full FP16", [FeatureFPARMv8]>;
+
+def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
+ "Enable Statistical Profiling extension">;
+
+/// Cyclone has register move instructions which are "free".
+def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
+ "Has zero-cycle register moves">;
+
+/// Cyclone has instructions which zero registers for "free".
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+ "Has zero-cycle zeroing instructions">;
+
+def FeatureStrictAlign : SubtargetFeature<"strict-align",
+ "StrictAlign", "true",
+ "Disallow all unaligned memory "
+ "access">;
+
+def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
+ "Reserve X18, making it unavailable "
+ "as a GPR">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+ "true",
+ "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+ "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+ "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+ "CustomAsCheapAsMove", "true",
+ "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+ "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+ "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+ "AvoidQuadLdStPairs", "true",
+ "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+ "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+ "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureArithmeticBccFusion : SubtargetFeature<
+ "arith-bcc-fusion", "HasArithmeticBccFusion", "true",
+ "CPU fuses arithmetic+bcc operations">;
+
+def FeatureArithmeticCbzFusion : SubtargetFeature<
+ "arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
+ "CPU fuses arithmetic + cbz/cbnz operations">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+ "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+ "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+ "use-reciprocal-square-root", "UseRSqrt", "true",
+ "Use the reciprocal square root approximation">;
+
+//===----------------------------------------------------------------------===//
+// Architectures.
+//
+
+def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
+ "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE]>;
+
+def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+ "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "AArch64RegisterInfo.td"
+include "AArch64CallingConvention.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "AArch64Schedule.td"
+include "AArch64InstrInfo.td"
+
+def AArch64InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Named operands for MRS/MSR/TLBI/...
+//===----------------------------------------------------------------------===//
+
+include "AArch64SystemOperands.td"
+
+//===----------------------------------------------------------------------===//
+// AArch64 Processors supported.
+//
+include "AArch64SchedA53.td"
+include "AArch64SchedA57.td"
+include "AArch64SchedCyclone.td"
+include "AArch64SchedFalkor.td"
+include "AArch64SchedKryo.td"
+include "AArch64SchedM1.td"
+include "AArch64SchedVulcan.td"
+
+def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+ "Cortex-A35 ARM processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
+
+def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+ "Cortex-A53 ARM processors", [
+ FeatureBalanceFPOps,
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureUseAA
+ ]>;
+
+// Feature bundle "a57": tags the subtarget's ARMProcFamily as CortexA57 and
+// implies the features listed below, including FeatureBalanceFPOps and
+// FeaturePredictableSelectIsExpensive.
+def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+ "Cortex-A57 ARM processors", [
+ FeatureBalanceFPOps,
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive
+ ]>;
+
+// Feature bundle "a72": tags the subtarget's ARMProcFamily as CortexA72 and
+// implies the features listed below. The "cortex-a72" ProcessorModel pairs
+// this bundle with the Cortex-A57 scheduling model.
+def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
+ "Cortex-A72 ARM processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
+
+// Feature bundle "a73": tags the subtarget's ARMProcFamily as CortexA73 and
+// implies the features listed below. The "cortex-a73" ProcessorModel pairs
+// this bundle with the Cortex-A57 scheduling model.
+def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
+ "Cortex-A73 ARM processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
+
+// Feature bundle "cyclone": tags the subtarget's ARMProcFamily as Cyclone and
+// implies the features listed below. Unlike the other bundles in this file it
+// does not list FeatureCRC.
+def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
+ "Cyclone", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureCrypto,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeatureSlowMisaligned128Store,
+ FeatureZCRegMove,
+ FeatureZCZeroing
+ ]>;
+
+// Feature bundle "exynosm1": tags the subtarget's ARMProcFamily as ExynosM1
+// and implies the features listed below. It differs from ProcExynosM2 only by
+// additionally listing FeatureUseRSqrt.
+def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
+ "Samsung Exynos-M1 processors",
+ [FeatureAvoidQuadLdStPairs,
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureSlowMisaligned128Store,
+ FeatureUseRSqrt,
+ FeatureZCZeroing]>;
+
+// Feature bundle "exynosm2", shared by the "exynos-m2" and "exynos-m3"
+// processor models. Note that it sets ARMProcFamily to ExynosM1 (the same
+// family value as ProcExynosM1) and omits FeatureUseRSqrt.
+def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
+ "Samsung Exynos-M2/M3 processors",
+ [FeatureAvoidQuadLdStPairs,
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureSlowMisaligned128Store,
+ FeatureZCZeroing]>;
+
+// Feature bundle "kryo": tags the subtarget's ARMProcFamily as Kryo and
+// implies the features listed below. Used by the "kryo" ProcessorModel.
+def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
+ "Qualcomm Kryo processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureZCZeroing
+ ]>;
+
+// Feature bundle "falkor": tags the subtarget's ARMProcFamily as Falkor and
+// implies the features listed below. Used by the "falkor" ProcessorModel.
+def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
+ "Qualcomm Falkor processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
+
+// Feature bundle "vulcan": tags the subtarget's ARMProcFamily as Vulcan and
+// implies the features listed below. It is the only bundle in this file that
+// implies HasV8_1aOps, and the only one that does not list FeaturePerfMon.
+def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
+ "Broadcom Vulcan processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureArithmeticBccFusion,
+ FeatureNEON,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ HasV8_1aOps]>;
+
+// The "generic" processor: no scheduling model (NoSchedModel) and a feature
+// list given inline rather than through a Proc* bundle.
+def : ProcessorModel<"generic", NoSchedModel, [
+ FeatureCRC,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler
+ ]>;
+
+// Named processors: each entry binds a CPU name to a scheduling model and to
+// one of the Proc* feature bundles defined above.
+// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
+def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
+def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
+def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
+// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as a Cortex-A57.
+def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
+def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
+def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
+def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
+// exynos-m2 and exynos-m3 share both the ExynosM1 scheduling model and the
+// ProcExynosM2 feature bundle.
+def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
+def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>;
+def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
+def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
+def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;
+
+//===----------------------------------------------------------------------===//
+// Assembly parser
+//===----------------------------------------------------------------------===//
+
+// Assembly parser variant 0, named "generic"; "." is its break character.
+def GenericAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+ string Name = "generic";
+ string BreakCharacters = ".";
+}
+
+// Assembly parser variant 1, named "apple-neon"; it uses the same break
+// character (".") as the generic variant.
+def AppleAsmParserVariant : AsmParserVariant {
+ int Variant = 1;
+ string Name = "apple-neon";
+ string BreakCharacters = ".";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// AArch64 Uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+// Writer for variant 0: printer class "InstPrinter", with PassSubtarget set.
+def GenericAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ int PassSubtarget = 1;
+ int Variant = 0;
+ bit isMCAsmWriter = 1;
+}
+
+// Writer for variant 1: printer class "AppleInstPrinter", with PassSubtarget
+// set.
+// NOTE(review): unlike GenericAsmWriter this record sets AsmWriterClassName
+// with `let` and declares isMCAsmWriter as `int` rather than `bit`; confirm
+// whether the difference is intentional.
+def AppleAsmWriter : AsmWriter {
+ let AsmWriterClassName = "AppleInstPrinter";
+ int PassSubtarget = 1;
+ int Variant = 1;
+ int isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+// Top-level target record: ties together the instruction set and the two
+// parser/writer variant pairs (generic and Apple) defined above.
+def AArch64 : Target {
+ let InstructionSet = AArch64InstrInfo;
+ let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
+ let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
new file mode 100644
index 000000000000..e6afb42440a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -0,0 +1,244 @@
+//===-- AArch64A53Fix835769.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass changes code to work around Cortex-A53 erratum 835769.
+// It works around it by inserting a nop instruction in code sequences that
+// in some circumstances may trigger the erratum.
+// It inserts a nop instruction between a sequence of the following 2 classes
+// of instructions:
+// instr 1: mem-instr (including loads, stores and prefetches).
+// instr 2: non-SIMD integer multiply-accumulate writing 64-bit X registers.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-fix-cortex-a53-835769"
+
+STATISTIC(NumNopsAdded, "Number of Nops added to work around erratum 835769");
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Returns true when MI can be the leading instruction of the erratum
+// sequence, i.e. it is a memory instruction: one of the prefetch opcodes
+// tested below, or anything that MachineInstr reports as a possible load or
+// store.
+static bool isFirstInstructionInSequence(MachineInstr *MI) {
+  const unsigned Opcode = MI->getOpcode();
+
+  // Prefetches are matched explicitly by opcode.
+  const bool IsPrefetch =
+      Opcode == AArch64::PRFMl || Opcode == AArch64::PRFMroW ||
+      Opcode == AArch64::PRFMroX || Opcode == AArch64::PRFMui ||
+      Opcode == AArch64::PRFUMi;
+
+  // Every other opcode is classified by mayLoadOrStore().
+  return IsPrefetch || MI->mayLoadOrStore();
+}
+
+// Returns true when MI can be the trailing instruction of the erratum
+// sequence: a non-SIMD integer multiply-accumulate that writes a 64-bit
+// register and whose accumulator operand (operand 3) is not XZR.
+static bool isSecondInstructionInSequence(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  // The erratum cannot be triggered when the destination register is 32
+  // bits wide, so only these opcodes are candidates.
+  case AArch64::MSUBXrrr:
+  case AArch64::MADDXrrr:
+  case AArch64::SMADDLrrr:
+  case AArch64::SMSUBLrrr:
+  case AArch64::UMADDLrrr:
+  case AArch64::UMSUBLrrr:
+    break;
+  default:
+    return false;
+  }
+
+  // Only genuine multiply-adds can trigger the erratum; a plain
+  // non-accumulating multiply is encoded with Ra=XZR='11111'.
+  return MI->getOperand(3).getReg() != AArch64::XZR;
+}
+
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Machine-function pass implementing the Cortex-A53 erratum 835769
+/// workaround described in the file header: each basic block is scanned by
+/// runOnBasicBlock() and a NOP is inserted wherever the two-instruction
+/// pattern is found.
+class AArch64A53Fix835769 : public MachineFunctionPass {
+ // Instruction info of the function being processed; assigned at the start of
+ // runOnMachineFunction() and used by runOnBasicBlock().
+ const TargetInstrInfo *TII;
+
+public:
+ // Pass identification, passed to the MachineFunctionPass constructor.
+ static char ID;
+ // Registers the pass with the global PassRegistry on construction.
+ explicit AArch64A53Fix835769() : MachineFunctionPass(ID) {
+ initializeAArch64A53Fix835769Pass(*PassRegistry::getPassRegistry());
+ }
+
+ // Runs the workaround over every basic block of F; returns true if any
+ // block was changed.
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ // The pass requires machine functions with the NoVRegs property set.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ // Human-readable pass name.
+ StringRef getPassName() const override {
+ return "Workaround A53 erratum 835769 pass";
+ }
+
+ // Declares that the CFG is preserved, then defers to the base class.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ // Scans one block for the erratum pattern and inserts the NOPs; returns
+ // true if at least one NOP was inserted.
+ bool runOnBasicBlock(MachineBasicBlock &MBB);
+};
+// Storage for the pass ID declared in the class.
+char AArch64A53Fix835769::ID = 0;
+
+} // end anonymous namespace
+
+// Registers the pass under the command-line name
+// "aarch64-fix-cortex-a53-835769-pass"; the pass constructor calls the
+// resulting initializeAArch64A53Fix835769Pass().
+INITIALIZE_PASS(AArch64A53Fix835769, "aarch64-fix-cortex-a53-835769-pass",
+ "AArch64 fix for A53 erratum 835769", false, false)
+
+//===----------------------------------------------------------------------===//
+
+// Pass entry point: caches the subtarget's TargetInstrInfo, then applies the
+// per-block workaround to every basic block of F. Returns true if any block
+// was modified.
+bool AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) {
+  DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n");
+  TII = F.getSubtarget().getInstrInfo();
+
+  bool AnyBlockChanged = false;
+  for (MachineBasicBlock &Block : F)
+    if (runOnBasicBlock(Block))
+      AnyBlockChanged = true;
+
+  return AnyBlockChanged;
+}
+
+// Returns the block immediately preceding MBB in function layout when that
+// block is also a predecessor of MBB and analyzeBranch() returns false for
+// it while leaving both branch targets null; returns nullptr otherwise,
+// including when MBB is the first block of the function.
+static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
+                                             const TargetInstrInfo *TII) {
+  MachineFunction::iterator Pos(MBB);
+
+  // The first block has nothing in front of it to fall through from.
+  if (Pos == MBB->getParent()->begin())
+    return nullptr;
+
+  MachineBasicBlock *LayoutPrev = &*std::prev(Pos);
+  MachineBasicBlock *TakenBB = nullptr;
+  MachineBasicBlock *FalseBB = nullptr;
+  SmallVector<MachineOperand, 2> BranchCond;
+
+  for (MachineBasicBlock *Pred : MBB->predecessors()) {
+    // Only the layout predecessor can be fallen through from.
+    if (Pred != LayoutPrev)
+      continue;
+    if (TII->analyzeBranch(*LayoutPrev, TakenBB, FalseBB, BranchCond))
+      continue;
+    if (!TakenBB && !FalseBB)
+      return Pred;
+  }
+
+  return nullptr;
+}
+
+// Walks backwards through the chain of blocks that fall through into MBB and
+// returns the last non-pseudo instruction found in them, or nullptr when no
+// such instruction exists. MBB itself is never inspected; this helper is
+// only used to look at preceding blocks.
+static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB,
+                                      const TargetInstrInfo *TII) {
+  // Each step moves one fall-through block further back; a block holding
+  // only pseudo instructions is skipped over.
+  for (MachineBasicBlock *Pred = getBBFallenThrough(&MBB, TII); Pred;
+       Pred = getBBFallenThrough(Pred, TII)) {
+    for (auto It = Pred->rbegin(), End = Pred->rend(); It != End; ++It)
+      if (!It->isPseudo())
+        return &*It;
+  }
+
+  // No non-pseudo instruction in any fallen-through block.
+  return nullptr;
+}
+
+// Emits a NOP (HINT with immediate 0) so that it executes directly before MI
+// and bumps the NumNopsAdded statistic. When MI is the first instruction of
+// MBB the NOP is appended to the block holding the last non-pseudo
+// instruction of the fall-through chain instead of being placed in MBB.
+static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI,
+                                       const TargetInstrInfo *TII) {
+  const bool IsFirstInBlock = (MI == &MBB.front());
+
+  if (!IsFirstInBlock) {
+    // Ordinary case: place the NOP right in front of MI.
+    DebugLoc Loc = MI->getDebugLoc();
+    BuildMI(MBB, MI, Loc, TII->get(AArch64::HINT)).addImm(0);
+  } else {
+    // MI opens its block, so the NOP goes at the end of the preceding
+    // fall-through block and reuses that instruction's debug location.
+    MachineInstr *Prev = getLastNonPseudo(MBB, TII);
+    assert(Prev && "Expected instruction");
+    DebugLoc Loc = Prev->getDebugLoc();
+    BuildMI(Prev->getParent(), Loc, TII->get(AArch64::HINT)).addImm(0);
+  }
+
+  ++NumNopsAdded;
+}
+
+// Scans MBB for pairs (previous non-pseudo instruction, current instruction)
+// matching the erratum pattern and afterwards inserts a NOP in front of the
+// second instruction of every match. Returns true if any NOP was inserted.
+bool AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
+  DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+  // Phase 1: collect the second instruction of every matching pair. Nothing
+  // is inserted while the block is being iterated.
+  std::vector<MachineInstr*> NeedNopBefore;
+  unsigned Idx = 0;
+
+  // Seed the "previous" instruction with the last non-pseudo instruction of
+  // the blocks falling through into MBB; nullptr means there is none.
+  MachineInstr *PrevInstr = getLastNonPseudo(MBB, TII);
+
+  for (MachineInstr &MI : MBB) {
+    MachineInstr *CurrInstr = &MI;
+    DEBUG(dbgs() << " Examining: " << MI);
+
+    if (PrevInstr) {
+      DEBUG(dbgs() << " PrevInstr: " << *PrevInstr
+                   << " CurrInstr: " << *CurrInstr
+                   << " isFirstInstructionInSequence(PrevInstr): "
+                   << isFirstInstructionInSequence(PrevInstr) << "\n"
+                   << " isSecondInstructionInSequence(CurrInstr): "
+                   << isSecondInstructionInSequence(CurrInstr) << "\n");
+      if (isFirstInstructionInSequence(PrevInstr) &&
+          isSecondInstructionInSequence(CurrInstr)) {
+        DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n");
+        NeedNopBefore.push_back(CurrInstr);
+      }
+    }
+
+    // Pseudo instructions never become the "previous" instruction.
+    if (!CurrInstr->isPseudo())
+      PrevInstr = CurrInstr;
+    ++Idx;
+  }
+
+  DEBUG(dbgs() << "Scan complete, " << NeedNopBefore.size()
+               << " occurrences of pattern found.\n");
+
+  // Phase 2: separate each detected pair with a NOP.
+  for (MachineInstr *Second : NeedNopBefore)
+    insertNopBeforeInstruction(MBB, Second, TII);
+
+  return !NeedNopBefore.empty();
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to
+// the passmanager.
+// Returns a newly heap-allocated pass instance; the function itself does not
+// retain the pointer.
+FunctionPass *llvm::createAArch64A53Fix835769() {
+ return new AArch64A53Fix835769();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
new file mode 100644
index 000000000000..0aa597bcdc56
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -0,0 +1,733 @@
+//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// For best-case performance on Cortex-A57, we should try to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply or
+// multiply-accumulate operations.
+//
+// This pass attempts to detect situations where the register allocation may
+// adversely affect this load balancing and to change the registers used so as
+// to better utilize the CPU.
+//
+// Ideally we'd just take each multiply or multiply-accumulate in turn and
+// allocate it alternating even or odd registers. However, multiply-accumulates
+// are most efficiently performed in the same functional unit as their
+// accumulation operand. Therefore this pass tries to find maximal sequences
+// ("Chains") of multiply-accumulates linked via their accumulation operand,
+// and assign them all the same "color" (oddness/evenness).
+//
+// This optimization affects S-register and D-register floating point
+// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and
+// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are
+// not affected.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"
+
+// Enforce the algorithm to use the scavenged register even when the original
+// destination register is the correct color. Used for testing.
+// Hidden command-line flag "aarch64-a57-fp-load-balancing-force-all";
+// defaults to false.
+static cl::opt<bool>
+TransformAll("aarch64-a57-fp-load-balancing-force-all",
+ cl::desc("Always modify dest registers regardless of color"),
+ cl::init(false), cl::Hidden);
+
+// Never use the balance information obtained from chains - return a specific
+// color always. Used for testing.
+// Hidden command-line option "aarch64-a57-fp-load-balancing-override". The
+// default 0 leaves the override off; Chain::getPreferredColor() returns Even
+// for the value 1 and Odd for any other non-zero value.
+static cl::opt<unsigned>
+OverrideBalance("aarch64-a57-fp-load-balancing-override",
+ cl::desc("Ignore balance information, always return "
+ "(1: Even, 2: Odd)."),
+ cl::init(0), cl::Hidden);
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Returns true when MI is one of the scalar FP multiply opcodes handled by
+// this pass (FMUL/FNMUL on 32-bit or 64-bit FPRs).
+static bool isMul(MachineInstr *MI) {
+  const unsigned Opcode = MI->getOpcode();
+  return Opcode == AArch64::FMULSrr || Opcode == AArch64::FNMULSrr ||
+         Opcode == AArch64::FMULDrr || Opcode == AArch64::FNMULDrr;
+}
+
+// Returns true when MI is one of the scalar FP multiply-accumulate opcodes
+// handled by this pass (FMADD/FMSUB/FNMADD/FNMSUB on 32-bit or 64-bit FPRs).
+static bool isMla(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  // S-suffixed opcodes.
+  case AArch64::FMSUBSrrr:
+  case AArch64::FMADDSrrr:
+  case AArch64::FNMSUBSrrr:
+  case AArch64::FNMADDSrrr:
+  // D-suffixed opcodes.
+  case AArch64::FMSUBDrrr:
+  case AArch64::FMADDDrrr:
+  case AArch64::FNMSUBDrrr:
+  case AArch64::FNMADDDrrr:
+    return true;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A "color", which is either even or odd. Yes, these aren't really colors
+/// but the algorithm is conceptually doing two-color graph coloring.
+enum class Color { Even, Odd };
+#ifndef NDEBUG
+// Printable names for debug output, indexed by the integer value of Color.
+static const char *ColorNames[2] = { "Even", "Odd" };
+#endif
+
+// Defined further down in this file.
+class Chain;
+
+/// Machine-function pass that finds chains of FP multiply / multiply-accumulate
+/// instructions in each basic block and re-colors (even/odd) the registers
+/// they use, as described in the file header.
+class AArch64A57FPLoadBalancing : public MachineFunctionPass {
+ // Per-function state, (re)initialized at the start of runOnMachineFunction().
+ MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+ RegisterClassInfo RCI;
+
+public:
+ // Pass identification, passed to the MachineFunctionPass constructor.
+ static char ID;
+ // Registers the pass with the global PassRegistry on construction.
+ explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {
+ initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Runs the pass over every basic block of F unless the function is skipped
+ // or the subtarget does not request FP-op balancing; returns true if
+ // anything changed.
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ // The pass requires machine functions with the NoVRegs property set.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ // Human-readable pass name. Note that this differs from the description
+ // string given to INITIALIZE_PASS ("AArch64 A57 FP Load-Balancing").
+ StringRef getPassName() const override {
+ return "A57 FP Anti-dependency breaker";
+ }
+
+ // Declares that the CFG is preserved, then defers to the base class.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ // Builds the chains of one block, groups overlapping chains into sets and
+ // colors each set.
+ bool runOnBasicBlock(MachineBasicBlock &MBB);
+ // Colors all chains of one set, updating the running even/odd balance
+ // (the definition names this parameter Parity).
+ bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB,
+ int &Balance);
+ // Tries to move chain G onto a register of color C; returns false when no
+ // suitable register could be scavenged.
+ bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB);
+ // Returns a register of color C that stays available over the whole range
+ // of G, or -1 if there is none.
+ int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB);
+ // Called once per instruction, with its index in the block, while scanning
+ // for chains.
+ void scanInstruction(MachineInstr *MI, unsigned Idx,
+ std::map<unsigned, Chain*> &Active,
+ std::vector<std::unique_ptr<Chain>> &AllChains);
+ // NOTE(review): presumably ends the active chain affected by operand MO;
+ // confirm against the definition.
+ void maybeKillChain(MachineOperand &MO, unsigned Idx,
+ std::map<unsigned, Chain*> &RegChains);
+ // Maps a register to its color.
+ Color getColor(unsigned Register);
+ // Removes and returns the next chain to color from L, preferring chains
+ // whose preferred color is PreferredColor; returns nullptr if L is empty.
+ Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L);
+};
+}
+
+// Storage for the pass ID declared in the class.
+char AArch64A57FPLoadBalancing::ID = 0;
+
+// Registers the pass under the name DEBUG_TYPE
+// ("aarch64-a57-fp-load-balancing"); no dependencies are declared between
+// the BEGIN and END macros.
+INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE,
+ "AArch64 A57 FP Load-Balancing", false, false)
+INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE,
+ "AArch64 A57 FP Load-Balancing", false, false)
+
+namespace {
+/// A Chain is a sequence of instructions that are linked together by
+/// an accumulation operand. For example:
+///
+///   fmul d0<def>, ?
+///   fmla d1<def>, ?, ?, d0<kill>
+///   fmla d2<def>, ?, ?, d1<kill>
+///
+/// There may be other instructions interleaved in the sequence that
+/// do not belong to the chain. These other instructions must not use
+/// the "chain" register at any point.
+///
+/// We currently only support chains where the "chain" operand is killed
+/// at each link in the chain for simplicity.
+/// A chain has three important instructions - Start, Last and Kill.
+///   * The start instruction is the first instruction in the chain.
+///   * Last is the final instruction in the chain.
+///   * Kill may or may not be defined. If defined, Kill is the instruction
+///     where the outgoing value of the Last instruction is killed.
+///     This information is important as if we know the outgoing value is
+///     killed with no intervening uses, we can safely change its register.
+///
+/// Without a kill instruction, we must assume the outgoing value escapes
+/// beyond our model and either must not change its register or must
+/// create a fixup FMOV to keep the old register value consistent.
+///
+class Chain {
+public:
+  /// The important (marker) instructions.
+  MachineInstr *StartInst, *LastInst, *KillInst;
+  /// The index, from the start of the basic block, that each marker
+  /// appears. These are stored so we can do quick interval tests.
+  unsigned StartInstIdx, LastInstIdx, KillInstIdx;
+  /// All instructions in the chain.
+  std::set<MachineInstr*> Insts;
+  /// True if KillInst cannot be modified. If this is true,
+  /// we cannot change LastInst's outgoing register.
+  /// This will be true for tied values and regmasks.
+  /// Only meaningful once setKill() has been called; false until then.
+  bool KillIsImmutable;
+  /// The "color" of LastInst. This will be the preferred chain color,
+  /// as changing intermediate nodes is easy but changing the last
+  /// instruction can be more tricky.
+  Color LastColor;
+
+  /// Start a new chain consisting of the single instruction MI, located at
+  /// index Idx in its block, whose dest operand has color C. The chain has
+  /// no kill instruction yet.
+  Chain(MachineInstr *MI, unsigned Idx, Color C)
+      : StartInst(MI), LastInst(MI), KillInst(nullptr),
+        StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0),
+        // Give the flag a defined value so that isKillImmutable() never reads
+        // an indeterminate bool when no kill has been recorded.
+        KillIsImmutable(false),
+        LastColor(C) {
+    Insts.insert(MI);
+  }
+
+  /// Add a new instruction into the chain. The instruction's dest operand
+  /// has the given color.
+  void add(MachineInstr *MI, unsigned Idx, Color C) {
+    LastInst = MI;
+    LastInstIdx = Idx;
+    LastColor = C;
+    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
+           "Chain: broken invariant. A Chain can only be killed after its last "
+           "def");
+
+    Insts.insert(MI);
+  }
+
+  /// Return true if MI is a member of the chain.
+  bool contains(MachineInstr &MI) const { return Insts.count(&MI) > 0; }
+
+  /// Return the number of instructions in the chain.
+  unsigned size() const {
+    return Insts.size();
+  }
+
+  /// Inform the chain that its last active register (the dest register of
+  /// LastInst) is killed by MI with no intervening uses or defs.
+  void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) {
+    KillInst = MI;
+    KillInstIdx = Idx;
+    KillIsImmutable = Immutable;
+    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
+           "Chain: broken invariant. A Chain can only be killed after its last "
+           "def");
+  }
+
+  /// Return the first instruction in the chain.
+  MachineInstr *getStart() const { return StartInst; }
+  /// Return the last instruction in the chain.
+  MachineInstr *getLast() const { return LastInst; }
+  /// Return the "kill" instruction (as set with setKill()) or NULL.
+  MachineInstr *getKill() const { return KillInst; }
+  /// Return an instruction that can be used as an iterator for the end
+  /// of the chain. This is the maximum of KillInst (if set) and LastInst.
+  MachineBasicBlock::iterator end() const {
+    return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst);
+  }
+  MachineBasicBlock::iterator begin() const { return getStart(); }
+
+  /// Can the Kill instruction (assuming one exists) be modified?
+  bool isKillImmutable() const { return KillIsImmutable; }
+
+  /// Return the preferred color of this chain. A non-zero OverrideBalance
+  /// option takes precedence over the color of the last instruction.
+  Color getPreferredColor() const {
+    if (OverrideBalance != 0)
+      return OverrideBalance == 1 ? Color::Even : Color::Odd;
+    return LastColor;
+  }
+
+  /// Return true if this chain (StartInst..KillInst) overlaps with Other.
+  /// Both ranges are treated as closed intervals of instruction indices.
+  bool rangeOverlapsWith(const Chain &Other) const {
+    unsigned End = KillInst ? KillInstIdx : LastInstIdx;
+    unsigned OtherEnd = Other.KillInst ?
+      Other.KillInstIdx : Other.LastInstIdx;
+
+    return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End;
+  }
+
+  /// Return true if this chain starts before Other.
+  bool startsBefore(const Chain *Other) const {
+    return StartInstIdx < Other->StartInstIdx;
+  }
+
+  /// Return true if the group will require a fixup MOV at the end, i.e. when
+  /// there is no kill instruction or the kill instruction is immutable.
+  bool requiresFixup() const {
+    return (getKill() && isKillImmutable()) || !getKill();
+  }
+
+  /// Return a simple string representation of the chain.
+  std::string str() const {
+    std::string S;
+    raw_string_ostream OS(S);
+
+    OS << "{";
+    StartInst->print(OS, /* SkipOpers= */true);
+    OS << " -> ";
+    LastInst->print(OS, /* SkipOpers= */true);
+    if (KillInst) {
+      OS << " (kill @ ";
+      KillInst->print(OS, /* SkipOpers= */true);
+      OS << ")";
+    }
+    OS << "}";
+
+    return OS.str();
+  }
+
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+// Pass entry point. Does nothing when the function is to be skipped or when
+// the subtarget does not ask for FP-op balancing; otherwise refreshes the
+// cached register information and processes every basic block. Returns true
+// if any block was changed.
+bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
+  if (skipFunction(*F.getFunction()) ||
+      !F.getSubtarget<AArch64Subtarget>().balanceFPOps())
+    return false;
+
+  DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
+
+  // Per-function state used by the helpers below.
+  MRI = &F.getRegInfo();
+  TRI = F.getRegInfo().getTargetRegisterInfo();
+  RCI.runOnMachineFunction(F);
+
+  bool AnyBlockChanged = false;
+  for (MachineBasicBlock &Block : F)
+    AnyBlockChanged |= runOnBasicBlock(Block);
+
+  return AnyBlockChanged;
+}
+
+// Processes one basic block in three steps: (1) scan the instructions to
+// build chains, (2) group chains whose ranges overlap into sets, and
+// (3) color each set in order of its first chain's start. Returns true if
+// any register was changed.
+bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
+  DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+  // Step 1: scan the block, producing a set of chains.
+  //
+  // ActiveChains holds the chains that can still be extended because they
+  // have not been killed yet. It is keyed by register: a chain has exactly
+  // one "link" register between consecutive instructions.
+  std::map<unsigned, Chain*> ActiveChains;
+  std::vector<std::unique_ptr<Chain>> AllChains;
+  unsigned InstIdx = 0;
+  for (MachineInstr &MI : MBB) {
+    scanInstruction(&MI, InstIdx, ActiveChains, AllChains);
+    ++InstIdx;
+  }
+
+  DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n");
+
+  // Step 2: group the chains into disjoint sets based on their liveness
+  // range. This is a poor-man's version of graph coloring. Ideally we'd
+  // create an interference graph and perform full-on graph coloring on it,
+  // but:
+  //   (a) That's rather heavyweight for only two colors.
+  //   (b) We expect multiple disjoint interference regions - in practice the
+  //       live range of chains is quite small and they are clustered between
+  //       loads and stores.
+  EquivalenceClasses<Chain*> Groups;
+  for (const auto &Owned : AllChains)
+    Groups.insert(Owned.get());
+
+  for (const auto &Lhs : AllChains) {
+    for (const auto &Rhs : AllChains) {
+      if (Lhs == Rhs)
+        continue;
+      if (Lhs->rangeOverlapsWith(*Rhs))
+        Groups.unionSets(Lhs.get(), Rhs.get());
+    }
+  }
+  DEBUG(dbgs() << "Created " << Groups.getNumClasses() << " disjoint sets.\n");
+
+  // From here on every member of an equivalence class is assumed to
+  // interfere with every other member of that class, and with no member of
+  // any other class.
+
+  // Flatten the equivalence classes into a plain vector of vectors; entries
+  // that yield no members are dropped.
+  std::vector<std::vector<Chain*> > ChainSets;
+  for (auto ClassIt = Groups.begin(), ClassEnd = Groups.end();
+       ClassIt != ClassEnd; ++ClassIt) {
+    std::vector<Chain*> Members(Groups.member_begin(ClassIt),
+                                Groups.member_end());
+    if (!Members.empty())
+      ChainSets.push_back(std::move(Members));
+  }
+
+  // Order the sets by the start of their first chain so that they can be
+  // visited sequentially.
+  std::sort(ChainSets.begin(), ChainSets.end(),
+            [](const std::vector<Chain*> &A,
+               const std::vector<Chain*> &B) {
+              return A.front()->startsBefore(B.front());
+            });
+
+  // Step 3: color. As we only have two colors, we can track the global
+  // (BB-level) balance of odds versus evens. We aim to keep this near zero
+  // to keep both execution units fed.
+  // Positive means we're even-heavy, negative we're odd-heavy.
+  //
+  // FIXME: If chains have interdependencies, for example:
+  //   mul r0, r1, r2
+  //   mul r3, r0, r1
+  // We do not model this and may color each one differently, assuming we'll
+  // get ILP when we obviously can't. This hasn't been seen to be a problem
+  // in practice so far, so we simplify the algorithm by ignoring it.
+  int Parity = 0;
+  bool Changed = false;
+  for (auto &Set : ChainSets)
+    Changed |= colorChainSet(std::move(Set), MBB, Parity);
+
+  return Changed;
+}
+
+// Removes and returns the chain from L that should be colored next, or
+// nullptr when L is empty.
+//
+// L is ordered from larger to smaller chains. Large chains are best colored
+// first, so the search only considers chains within SizeFuzz of the largest
+// one and takes the first of them whose preferred color is PreferredColor.
+// If the size limit is crossed first, the chain just before the crossing
+// point is taken; if the whole list is walked without a match, the first
+// chain is taken. In both of the latter cases the chain may need recoloring.
+//
+// NOTE(review): because the limit test is "<= MinSize" with SizeFuzz == 1,
+// only chains as large as the first one are actually considered; confirm
+// whether that matches the "slightly smaller chains" intent.
+Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
+                                                  std::vector<Chain*> &L) {
+  if (L.empty())
+    return nullptr;
+
+  // FIXME: Does this need to be configurable?
+  const unsigned SizeFuzz = 1;
+  const unsigned MinSize = L.front()->size() - SizeFuzz;
+
+  // Fallback when nothing better is found: the first item.
+  auto Pick = L.begin();
+  for (auto It = L.begin(), End = L.end(); It != End; ++It) {
+    if ((*It)->size() <= MinSize) {
+      // Past the size limit: settle for the previous item.
+      Pick = It;
+      --Pick;
+      break;
+    }
+
+    if ((*It)->getPreferredColor() == PreferredColor) {
+      Pick = It;
+      break;
+    }
+  }
+
+  Chain *Chosen = *Pick;
+  L.erase(Pick);
+  return Chosen;
+}
+
+// Colors every chain of one interference set.
+//
+// GV     - the chains of the set (taken by value; it is sorted and consumed).
+// MBB    - the basic block the chains live in.
+// Parity - running balance for the block, updated in place: each chain adds
+//          its size when colored Even and subtracts it when colored Odd, so
+//          a positive value means even-heavy and a negative one odd-heavy.
+//
+// Returns true if colorChain() changed anything.
+bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
+                                              MachineBasicBlock &MBB,
+                                              int &Parity) {
+  bool Changed = false;
+  DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
+
+  // Sort by descending size order so that we allocate the most important
+  // sets first.
+  // Tie-break equivalent sizes by sorting chains requiring fixups before
+  // those without fixups. The logic here is that we should look at the
+  // chains that we cannot change before we look at those we can,
+  // so the parity counter is updated and we know what color we should
+  // change them to!
+  // Final tie-break with instruction order so pass output is stable (i.e. not
+  // dependent on malloc'd pointer values).
+  std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
+      if (G1->size() != G2->size())
+        return G1->size() > G2->size();
+      if (G1->requiresFixup() != G2->requiresFixup())
+        return G1->requiresFixup() > G2->requiresFixup();
+      // Make sure startsBefore() produces a stable final order.
+      assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) &&
+             "Starts before not total order!");
+      return G1->startsBefore(G2);
+    });
+
+  Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+  while (Chain *G = getAndEraseNext(PreferredColor, GV)) {
+    // Start off by assuming we'll color to our own preferred color.
+    Color C = PreferredColor;
+    if (Parity == 0)
+      // But if we really don't care, use the chain's preferred color.
+      C = G->getPreferredColor();
+
+    DEBUG(dbgs() << " - Parity=" << Parity << ", Color="
+          << ColorNames[(int)C] << "\n");
+
+    // If we'll need a fixup FMOV, don't bother. Testing has shown that this
+    // happens infrequently and when it does it has at least a 50% chance of
+    // slowing code down instead of speeding it up.
+    if (G->requiresFixup() && C != G->getPreferredColor()) {
+      C = G->getPreferredColor();
+      DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; "
+            "color remains " << ColorNames[(int)C] << "\n");
+    }
+
+    Changed |= colorChain(G, C, MBB);
+
+    // Do the balance arithmetic in a signed type: Chain::size() is unsigned,
+    // so negating it directly would wrap around and rely on an
+    // unsigned-to-int conversion to come out right.
+    const int ChainSize = static_cast<int>(G->size());
+    Parity += (C == Color::Even) ? ChainSize : -ChainSize;
+    PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+  }
+
+  return Changed;
+}
+
+// Looks for a register of color C, in the register class of the dest operand
+// of G's first instruction, that is available over the whole range of the
+// chain (G->begin() up to G->end()).
+//
+// Returns the register number, or -1 when no such register exists.
+int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
+                                                MachineBasicBlock &MBB) {
+  RegScavenger RS;
+  RS.enterBasicBlock(MBB);
+  RS.forward(MachineBasicBlock::iterator(G->getStart()));
+
+  // Can we find an appropriate register that is available throughout the life
+  // of the chain?
+  unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
+  // The register class does not change while walking the chain, so look it
+  // up once.
+  const auto *RC = TRI->getRegClass(RegClassID);
+  BitVector AvailableRegs = RS.getRegsAvailable(RC);
+  for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) {
+    RS.forward(I);
+    AvailableRegs &= RS.getRegsAvailable(RC);
+
+    // Remove any registers clobbered by a regmask or any def register that is
+    // immediately dead. Operands are only inspected, so iterate by reference
+    // instead of copying each one.
+    for (const MachineOperand &MO : I->operands()) {
+      if (MO.isRegMask())
+        AvailableRegs.clearBitsNotInMask(MO.getRegMask());
+
+      if (MO.isReg() && MO.isDef()) {
+        MCRegAliasIterator AI(MO.getReg(), TRI, /*IncludeSelf=*/true);
+        if (MO.isDead())
+          for (; AI.isValid(); ++AI)
+            AvailableRegs.reset(*AI);
+#ifndef NDEBUG
+        else
+          for (; AI.isValid(); ++AI)
+            assert(!AvailableRegs[*AI] &&
+                   "Non-dead def should have been removed by now!");
+#endif
+      }
+    }
+  }
+
+  // Make sure we allocate in-order, to get the cheapest registers first.
+  auto Ord = RCI.getOrder(RC);
+  for (auto Reg : Ord) {
+    if (!AvailableRegs[Reg])
+      continue;
+    if (C == getColor(Reg))
+      return Reg;
+  }
+
+  return -1;
+}
+
+/// Try to recolor chain \p G to color \p C by renaming registers in \p MBB.
+///
+/// A free register of the requested color is scavenged first; when none is
+/// available the chain is left untouched. Otherwise the defs of the chain's
+/// instructions are rewritten to the scavenged register and later uses of the
+/// replaced registers are redirected until they are killed or clobbered.
+///
+/// \returns true if at least one def was rewritten.
+bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
+                                           MachineBasicBlock &MBB) {
+  bool Changed = false;
+  DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
+               << ColorNames[(int)C] << ")\n");
+
+  // Try and obtain a free register of the right class. Without a register
+  // to play with we cannot continue.
+  int Reg = scavengeRegister(G, C, MBB);
+  if (Reg == -1) {
+    DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
+    return false;
+  }
+  DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n");
+
+  // Maps an original register to the register that now replaces it.
+  std::map<unsigned, unsigned> Substs;
+  for (MachineInstr &I : *G) {
+    if (!G->contains(I) && (&I != G->getKill() || G->isKillImmutable()))
+      continue;
+
+    // I is a member of G, or I is a mutable instruction that kills G.
+
+    std::vector<unsigned> ToErase;
+    for (auto &U : I.operands()) {
+      if (U.isRegMask()) {
+        // A regmask ends every substitution whose original register it
+        // clobbers.
+        for (const auto &Subst : Substs)
+          if (U.clobbersPhysReg(Subst.first))
+            ToErase.push_back(Subst.first);
+        continue;
+      }
+      if (!U.isReg() || !U.isUse())
+        continue;
+
+      // Single lookup: reuse the iterator for both the test and the rewrite.
+      auto SubstIt = Substs.find(U.getReg());
+      if (SubstIt == Substs.end())
+        continue;
+      U.setReg(SubstIt->second);
+      if (U.isKill())
+        // Don't erase straight away, because there may be other operands
+        // that also reference this substitution!
+        ToErase.push_back(SubstIt->first);
+    }
+    // Now it's safe to remove the substs identified earlier.
+    for (unsigned OrigReg : ToErase)
+      Substs.erase(OrigReg);
+
+    // Only change the def if this isn't the last instruction.
+    if (&I != G->getKill()) {
+      MachineOperand &MO = I.getOperand(0);
+
+      bool Change = TransformAll || getColor(MO.getReg()) != C;
+      if (G->requiresFixup() && &I == G->getLast())
+        Change = false;
+
+      if (Change) {
+        Substs[MO.getReg()] = Reg;
+        MO.setReg(Reg);
+
+        Changed = true;
+      }
+    }
+  }
+  assert(Substs.empty() && "No substitutions should be left active!");
+
+  if (G->getKill()) {
+    DEBUG(dbgs() << " - Kill instruction seen.\n");
+  } else {
+    // We didn't have a kill instruction, but we didn't seem to need to change
+    // the destination register anyway.
+    DEBUG(dbgs() << " - Destination register not changed.\n");
+  }
+  return Changed;
+}
+
+/// Update the chain bookkeeping for the instruction \p MI at index \p Idx.
+///
+/// \p ActiveChains is keyed by register and \p AllChains owns every chain
+/// created so far. A MUL always starts a new chain. An MLA extends the chain
+/// of its accumulator when that accumulator is killed, and starts a new chain
+/// otherwise. Any other instruction only ends the chains it touches.
+void AArch64A57FPLoadBalancing::scanInstruction(
+    MachineInstr *MI, unsigned Idx, std::map<unsigned, Chain *> &ActiveChains,
+    std::vector<std::unique_ptr<Chain>> &AllChains) {
+  // Ends every active chain referenced by one of MI's uses or defs.
+  auto KillTouchedChains = [&] {
+    for (auto &Op : MI->uses())
+      maybeKillChain(Op, Idx, ActiveChains);
+    for (auto &Op : MI->defs())
+      maybeKillChain(Op, Idx, ActiveChains);
+  };
+  // Opens a new chain starting at MI and registers it under Reg.
+  auto StartChain = [&](unsigned Reg) {
+    auto Fresh = llvm::make_unique<Chain>(MI, Idx, getColor(Reg));
+    ActiveChains[Reg] = Fresh.get();
+    AllChains.push_back(std::move(Fresh));
+  };
+
+  if (isMul(MI)) {
+    KillTouchedChains();
+
+    // Multiplies don't require forwarding so can go on any unit.
+    unsigned DestReg = MI->getOperand(0).getReg();
+    DEBUG(dbgs() << "New chain started for register "
+                 << TRI->getName(DestReg) << " at " << *MI);
+    StartChain(DestReg);
+    return;
+  }
+
+  if (!isMla(MI)) {
+    // Neither MUL nor MLA: invalidate any chain in the uses or defs lists.
+    KillTouchedChains();
+    return;
+  }
+
+  // MLA: it is beneficial to keep it on the same functional unit as its
+  // accumulator operand.
+  unsigned DestReg = MI->getOperand(0).getReg();
+  unsigned AccumReg = MI->getOperand(3).getReg();
+
+  maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
+  maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
+  if (DestReg != AccumReg)
+    maybeKillChain(MI->getOperand(0), Idx, ActiveChains);
+
+  auto Found = ActiveChains.find(AccumReg);
+  if (Found != ActiveChains.end()) {
+    DEBUG(dbgs() << "Chain found for accumulator register "
+                 << TRI->getName(AccumReg) << " in MI " << *MI);
+
+    // Only sequences in which the accumulator is killed by each instruction
+    // are chained, so no other uses of the rewritten registers need tracking.
+    //
+    // FIXME: We could extend to handle the non-kill cases for more coverage.
+    if (MI->getOperand(3).isKill()) {
+      DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
+      Chain *Existing = Found->second;
+      Existing->add(MI, Idx, getColor(DestReg));
+      // When the destination differs from the accumulator, re-key the chain.
+      if (DestReg != AccumReg) {
+        ActiveChains[DestReg] = Existing;
+        ActiveChains.erase(Found);
+      }
+      return;
+    }
+
+    DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't "
+                 << "marked <kill>!\n");
+    maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
+  }
+
+  DEBUG(dbgs() << "Creating new chain for dest register "
+               << TRI->getName(DestReg) << "\n");
+  StartChain(DestReg);
+}
+
+/// Given operand \p MO of the instruction at index \p Idx, end any chain in
+/// \p ActiveChains (keyed by register) that the operand terminates.
+///
+/// A register operand removes the chain keyed by that register, recording the
+/// kill first when the operand is marked as a kill. A regmask operand records
+/// an immutable kill on, and removes, every chain whose register it clobbers.
+void AArch64A57FPLoadBalancing::
+maybeKillChain(MachineOperand &MO, unsigned Idx,
+               std::map<unsigned, Chain*> &ActiveChains) {
+  MachineInstr *MI = MO.getParent();
+
+  if (MO.isReg()) {
+    auto Hit = ActiveChains.find(MO.getReg());
+    if (Hit == ActiveChains.end())
+      return;
+
+    // If this is a KILL of a current chain, record it.
+    if (MO.isKill()) {
+      DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg())
+                   << "\n");
+      Hit->second->setKill(MI, Idx, /*Immutable=*/MO.isTied());
+    }
+    ActiveChains.erase(Hit);
+    return;
+  }
+
+  if (!MO.isRegMask())
+    return;
+
+  for (auto Cur = ActiveChains.begin(); Cur != ActiveChains.end();) {
+    if (!MO.clobbersPhysReg(Cur->first)) {
+      ++Cur;
+      continue;
+    }
+    DEBUG(dbgs() << "Kill (regmask) seen for chain "
+                 << TRI->getName(Cur->first) << "\n");
+    Cur->second->setKill(MI, Idx, /*Immutable=*/true);
+    Cur = ActiveChains.erase(Cur);
+  }
+}
+
+/// Classify \p Reg as Even or Odd by the parity of its encoding value.
+Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) {
+  const bool IsEven = (TRI->getEncodingValue(Reg) % 2) == 0;
+  return IsEven ? Color::Even : Color::Odd;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to the pass manager; returns a newly allocated pass object.
+FunctionPass *llvm::createAArch64A57FPLoadBalancing() {
+ return new AArch64A57FPLoadBalancing();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
new file mode 100644
index 000000000000..0cbb2db1134a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -0,0 +1,484 @@
+//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to promote the computations used to obtain a sign-extended
+// value that is used in memory accesses.
+// E.g.
+// a = add nsw i32 b, 3
+// d = sext i32 a to i64
+// e = getelementptr ..., i64 d
+//
+// =>
+// f = sext i32 b to i64
+// a = add nsw i64 f, 3
+// e = getelementptr ..., i64 a
+//
+// This is legal to do if the computations are marked with either nsw or nuw
+// markers. Moreover, the current heuristic is simple: it does not create new
+// sext operations, i.e., it gives up when a sext would have forked (e.g., if a
+// = add i32 b, c, two sexts are required to promote the computation).
+//
+// FIXME: This pass may be useful for other targets too.
+// ===---------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-type-promotion"
+
+// Hidden command-line switch, on by default; when set, propagateSignExtension
+// calls mergeSExts to merge redundant sexts.
+static cl::opt<bool>
+EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
+ cl::desc("Enable merging of redundant sexts when one is dominating"
+ " the other."),
+ cl::init(true));
+
+#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion"
+
+//===----------------------------------------------------------------------===//
+// AArch64AddressTypePromotion
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64AddressTypePromotion : public FunctionPass {
+
+public:
+ static char ID;
+ AArch64AddressTypePromotion()
+ : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) {
+ initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return AARCH64_TYPE_PROMO_NAME; }
+
+ /// Iterate over the functions and promote the computation of interesting
+ /// sext instructions.
+ bool runOnFunction(Function &F) override;
+
+private:
+ /// The current function.
+ Function *Func;
+ /// Filter out all sexts that do not have this type.
+ /// Currently initialized with Int64Ty.
+ Type *ConsideredSExtType;
+
+ // This transformation requires dominator info.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ typedef SmallPtrSet<Instruction *, 32> SetOfInstructions;
+ typedef SmallVector<Instruction *, 16> Instructions;
+ typedef DenseMap<Value *, Instructions> ValueToInsts;
+
+ /// Check if it is profitable to move a sext through this instruction.
+ /// Currently, we consider it is profitable if:
+ /// - Inst is used only once (no need to insert truncate).
+ /// - Inst has only one operand that will require a sext operation (we
+ /// do not create new sext operations).
+ bool shouldGetThrough(const Instruction *Inst);
+
+ /// Check if it is possible and legal to move a sext through this
+ /// instruction.
+ /// Current heuristic considers that we can get through:
+ /// - Arithmetic operation marked with the nsw or nuw flag.
+ /// - Other sext operation.
+ /// - Truncate operation if it was just dropping sign extended bits.
+ bool canGetThrough(const Instruction *Inst);
+
+ /// Move sext operations through safe to sext instructions.
+ bool propagateSignExtension(Instructions &SExtInsts);
+
+ /// Should this sext be considered for code motion?
+ /// We look for sext with ConsideredSExtType and uses in at least one
+ /// GetElementPtrInst.
+ bool shouldConsiderSExt(const Instruction *SExt) const;
+
+ /// Collect all interesting sext operations, i.e., the ones with the right
+ /// type and used in memory accesses.
+ /// More precisely, a sext instruction is considered as interesting if it
+ /// is used in a "complex" getelementptr or there exists at least another
+ /// sext instruction that sign extended the same initial value.
+ /// A getelementptr is considered as "complex" if it has more than 2
+ /// operands.
+ void analyzeSExtension(Instructions &SExtInsts);
+
+ /// Merge redundant sign extension operations in common dominator.
+ void mergeSExts(ValueToInsts &ValToSExtendedUses,
+ SetOfInstructions &ToRemove);
+};
+} // end anonymous namespace.
+
+char AArch64AddressTypePromotion::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion",
+ AARCH64_TYPE_PROMO_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion",
+ AARCH64_TYPE_PROMO_NAME, false, false)
+
+/// Factory function: returns a newly allocated AArch64AddressTypePromotion pass.
+FunctionPass *llvm::createAArch64AddressTypePromotionPass() {
+ return new AArch64AddressTypePromotion();
+}
+
+/// Return true when a sext may legally be moved through \p Inst.
+///
+/// That is the case for another sext, for an overflowing binary operator
+/// flagged nuw or nsw, and for a truncate of a sext that passes the bit-width
+/// checks below.
+bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
+  if (isa<SExtInst>(Inst))
+    return true;
+
+  if (const auto *Arith = dyn_cast<BinaryOperator>(Inst))
+    if (isa<OverflowingBinaryOperator>(Arith) &&
+        (Arith->hasNoUnsignedWrap() || Arith->hasNoSignedWrap()))
+      return true;
+
+  // sext(trunc(sext)) --> sext
+  if (!isa<TruncInst>(Inst))
+    return false;
+  const auto *InnerSExt = dyn_cast<SExtInst>(Inst->getOperand(0));
+  if (!InnerSExt)
+    return false;
+
+  // The truncate result must be at least as wide as the value the inner sext
+  // started from, and the inner sext's result must be no wider than the
+  // considered type.
+  return Inst->getType()->getIntegerBitWidth() >=
+             InnerSExt->getOperand(0)->getType()->getIntegerBitWidth() &&
+         InnerSExt->getType()->getIntegerBitWidth() <=
+             ConsideredSExtType->getIntegerBitWidth();
+}
+
+/// Return true when moving a sext through \p Inst is considered profitable.
+bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
+  const bool SingleUse = Inst->hasOneUse();
+
+  // A sext of the considered type becomes useless once we get through it.
+  // A sext of another type is accepted only when it has a single use, since
+  // otherwise its original value would have to be preserved.
+  if (isa<SExtInst>(Inst) &&
+      (Inst->getType() == ConsideredSExtType || SingleUse))
+    return true;
+
+  // With more than one use we may need to insert truncate operations, which
+  // is not done at the moment.
+  if (!SingleUse)
+    return false;
+
+  // A truncate used only once becomes useless if we can get through it.
+  if (isa<TruncInst>(Inst))
+    return true;
+
+  // Each step must be profitable: only accept a binary operator whose second
+  // operand is a constant integer, so that the number of sexts does not grow
+  // even if that could pay off later on.
+  return isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1));
+}
+
+/// Every operand is a candidate for sign extension except operand 0 of a
+/// select instruction.
+static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
+  return !isa<SelectInst>(Inst) || OpIdx != 0;
+}
+
+/// Return true when \p SExt has the considered type and at least one of its
+/// users is a GetElementPtrInst.
+bool
+AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
+  bool UsedByGEP = false;
+  if (SExt->getType() == ConsideredSExtType) {
+    for (const User *Consumer : SExt->users()) {
+      if (isa<GetElementPtrInst>(Consumer)) {
+        UsedByGEP = true;
+        break;
+      }
+    }
+  }
+  return UsedByGEP;
+}
+
+// Input:
+// - SExtInsts contains all the sext instructions that are used directly in
+// GetElementPtrInst, i.e., access to memory.
+// Algorithm:
+// - For each sext operation in SExtInsts:
+// Let var be the operand of sext.
+// while it is profitable (see shouldGetThrough), legal, and safe
+// (see canGetThrough) to move sext through var's definition:
+// * promote the type of var's definition.
+// * fold var into sext uses.
+// * move sext above var's definition.
+// * update sext operand to use the operand of var that should be sign
+// extended (by construction there is only one).
+//
+// E.g.,
+// a = ... i32 c, 3
+// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a'
+// ...
+// = b
+// => Yes, update the code
+// b = sext i32 c to i64
+// a = ... i64 b, 3
+// ...
+// = a
+// Iterate on 'c'.
+bool
+AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
+ DEBUG(dbgs() << "*** Propagate Sign Extension ***\n");
+
+ bool LocalChange = false;
+ SetOfInstructions ToRemove;
+ ValueToInsts ValToSExtendedUses;
+ while (!SExtInsts.empty()) {
+ // Get through simple chain.
+ Instruction *SExt = SExtInsts.pop_back_val();
+
+ DEBUG(dbgs() << "Consider:\n" << *SExt << '\n');
+
+ // If this SExt has no uses left and is already marked for removal, skip it.
+ if (SExt->use_empty() && ToRemove.count(SExt)) {
+ DEBUG(dbgs() << "No uses => marked as delete\n");
+ continue;
+ }
+
+ // Now try to get through the chain of definitions.
+ while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) {
+ DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n');
+ if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) {
+ // Stop here: moving the sext through Inst is either not legal
+ // (canGetThrough) or not profitable (shouldGetThrough).
+ DEBUG(dbgs() << "Cannot get through\n");
+ break;
+ }
+
+ LocalChange = true;
+ // If this is a sign extend or a truncate, it becomes useless.
+ if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) {
+ DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n");
+ // We cannot use replaceAllUsesWith here because we may trigger some
+ // assertion on the type as all involved sext operation may have not
+ // been moved yet.
+ while (!Inst->use_empty()) {
+ Use &U = *Inst->use_begin();
+ Instruction *User = dyn_cast<Instruction>(U.getUser());
+ assert(User && "User of sext is not an Instruction!");
+ User->setOperand(U.getOperandNo(), SExt);
+ }
+ ToRemove.insert(Inst);
+ SExt->setOperand(0, Inst->getOperand(0));
+ SExt->moveBefore(Inst);
+ continue;
+ }
+
+ // Get through the Instruction:
+ // 1. Update its type.
+ // 2. Replace the uses of SExt by Inst.
+ // 3. Sign extend each operand that needs to be sign extended.
+
+ // Step #1.
+ Inst->mutateType(SExt->getType());
+ // Step #2.
+ SExt->replaceAllUsesWith(Inst);
+ // Step #3.
+ Instruction *SExtForOpnd = SExt;
+
+ DEBUG(dbgs() << "Propagate SExt to operands\n");
+ for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx;
+ ++OpIdx) {
+ DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n');
+ if (Inst->getOperand(OpIdx)->getType() == SExt->getType() ||
+ !shouldSExtOperand(Inst, OpIdx)) {
+ DEBUG(dbgs() << "No need to propagate\n");
+ continue;
+ }
+ // Check if we can statically sign extend the operand.
+ Value *Opnd = Inst->getOperand(OpIdx);
+ if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
+ DEBUG(dbgs() << "Statically sign extend\n");
+ Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(),
+ Cst->getSExtValue()));
+ continue;
+ }
+ // UndefValue are typed, so we have to statically sign extend them.
+ if (isa<UndefValue>(Opnd)) {
+ DEBUG(dbgs() << "Statically sign extend\n");
+ Inst->setOperand(OpIdx, UndefValue::get(SExt->getType()));
+ continue;
+ }
+
+ // Otherwise we have to explicitly sign extend it.
+ assert(SExtForOpnd &&
+ "Only one operand should have been sign extended");
+
+ SExtForOpnd->setOperand(0, Opnd);
+
+ DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n");
+ // Move the sign extension before the insertion point.
+ SExtForOpnd->moveBefore(Inst);
+ Inst->setOperand(OpIdx, SExtForOpnd);
+ // If more sext are required, new instructions will have to be created.
+ SExtForOpnd = nullptr;
+ }
+ if (SExtForOpnd == SExt) {
+ DEBUG(dbgs() << "Sign extension is useless now\n");
+ ToRemove.insert(SExt);
+ break;
+ }
+ }
+
+ // If the sext's operand is already of the right type, connect the sext's
+ // uses to its argument and delete it.
+ // This can happen for an Instruction all uses of which are sign extended.
+ if (!ToRemove.count(SExt) &&
+ SExt->getType() == SExt->getOperand(0)->getType()) {
+ DEBUG(dbgs() << "Sign extension is useless, attach its use to "
+ "its argument\n");
+ SExt->replaceAllUsesWith(SExt->getOperand(0));
+ ToRemove.insert(SExt);
+ } else
+ ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt);
+ }
+
+ if (EnableMerge)
+ mergeSExts(ValToSExtendedUses, ToRemove);
+
+ // Remove all instructions marked as ToRemove.
+ for (Instruction *I: ToRemove)
+ I->eraseFromParent();
+ return LocalChange;
+}
+
+/// For each value in \p ValToSExtendedUses, merge its recorded sexts when one
+/// dominates another. The dominated instruction has its uses redirected to
+/// the dominating one and is added to \p ToRemove.
+void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
+                                             SetOfInstructions &ToRemove) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  for (auto &Entry : ValToSExtendedUses) {
+    // Sexts kept so far for this value; none of them dominates another.
+    Instructions Kept;
+    for (Instruction *Candidate : Entry.second) {
+      if (ToRemove.count(Candidate))
+        continue;
+
+      bool Merged = false;
+      for (Instruction *&Rep : Kept) {
+        if (DT.dominates(Candidate, Rep)) {
+          // The candidate dominates a kept sext: it takes that slot.
+          DEBUG(dbgs() << "Replace all uses of:\n" << *Rep << "\nwith:\n"
+                       << *Candidate << '\n');
+          Rep->replaceAllUsesWith(Candidate);
+          ToRemove.insert(Rep);
+          Rep = Candidate;
+          Merged = true;
+        } else if (DT.dominates(Rep, Candidate)) {
+          // A kept sext dominates the candidate: the candidate is redundant.
+          DEBUG(dbgs() << "Replace all uses of:\n" << *Candidate << "\nwith:\n"
+                       << *Rep << '\n');
+          Candidate->replaceAllUsesWith(Rep);
+          ToRemove.insert(Candidate);
+          Merged = true;
+        }
+        // Otherwise neither dominates the other. Merging in a common
+        // dominator is not attempted, so try the next kept sext.
+        if (Merged)
+          break;
+      }
+      if (!Merged)
+        Kept.push_back(Candidate);
+    }
+  }
+}
+
+/// Scan the current function and append to \p SExtInsts every sext worth
+/// propagating.
+///
+/// A sext accepted by shouldConsiderSExt is selected when either:
+///  1. it is used by a getelementptr with more than 2 operands, or
+///  2. the head of its definition chain was already reached from another
+///     sext, so code sharing is possible.
+void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
+  DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
+
+  // Chain head -> first sext that reached it. The entry is reset to null once
+  // that sext has been pushed to SExtInsts.
+  DenseMap<Value *, Instruction *> SeenChains;
+
+  for (auto &BB : *Func) {
+    for (auto &II : BB) {
+      Instruction *SExt = &II;
+      if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt))
+        continue;
+
+      DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n');
+
+      // Case #1: look for a getelementptr user with more than 2 operands.
+      bool WantedByGEP = false;
+      for (const User *U : SExt->users()) {
+        const auto *GEP = dyn_cast<GetElementPtrInst>(U);
+        if (!GEP || GEP->getNumOperands() <= 2)
+          continue;
+        DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *GEP
+                     << '\n');
+        WantedByGEP = true;
+        break;
+      }
+
+      // Case #2: walk up to the head of the chain. At each step follow
+      // operand 1 of a binary operator whose operand 0 is a constant integer,
+      // and operand 0 otherwise.
+      Value *Head = nullptr;
+      for (Instruction *Cur = SExt; Cur;) {
+        const auto *BinOp = dyn_cast<BinaryOperator>(Cur);
+        const int OpdIdx =
+            (BinOp && isa<ConstantInt>(BinOp->getOperand(0))) ? 1 : 0;
+        Head = Cur->getOperand(OpdIdx);
+        Cur = dyn_cast<Instruction>(Head);
+        if (Cur && !(canGetThrough(Cur) && shouldGetThrough(Cur)))
+          break;
+      }
+
+      DEBUG(dbgs() << "Head of the chain:\n" << *Head << '\n');
+      auto Known = SeenChains.find(Head);
+      const bool SeenBefore = Known != SeenChains.end();
+
+      if (!WantedByGEP && !SeenBefore) {
+        DEBUG(dbgs() << "Record its chain membership\n");
+        SeenChains[Head] = SExt;
+        continue;
+      }
+
+      DEBUG(dbgs() << "Insert\n");
+      SExtInsts.push_back(SExt);
+      if (SeenBefore && Known->second != nullptr) {
+        // Also select the sext recorded earlier for this head, once.
+        DEBUG(dbgs() << "Insert chain member\n");
+        SExtInsts.push_back(Known->second);
+        Known->second = nullptr;
+      }
+    }
+  }
+}
+
+/// Pass entry point: collect the interesting sexts of \p F and propagate them.
+/// \returns true if propagateSignExtension reports a change.
+bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
+  if (skipFunction(F) || F.isDeclaration())
+    return false;
+
+  Func = &F;
+  ConsideredSExtType = Type::getInt64Ty(F.getContext());
+
+  DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n');
+
+  Instructions Candidates;
+  analyzeSExtension(Candidates);
+  return propagateSignExtension(Candidates);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
new file mode 100644
index 000000000000..bc2320dd20b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -0,0 +1,414 @@
+//===-- AArch64AdvSIMDScalar.cpp - Use AdvSIMD scalar for i64 ops --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When profitable, replace GPR targeting i64 instructions with their
+// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
+// as minimizing the number of cross-class register copies.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Graph based predicate heuristics.
+// Walking the instruction list linearly will get many, perhaps most, of
+// the cases, but to do a truly thorough job of this, we need a more
+// holistic approach.
+//
+// This optimization is very similar in spirit to the register allocator's
+// spill placement, only here we're determining where to place cross-class
+// register copies rather than spills. As such, a similar approach is
+// called for.
+//
+// We want to build up a set of graphs of all instructions which are candidates
+// for transformation along with instructions which generate their inputs and
+// consume their outputs. For each edge in the graph, we assign a weight
+// based on whether there is a copy required there (weight zero if not) and
+// the block frequency of the block containing the defining or using
+// instruction, whichever is less. Our optimization is then a graph problem
+// to minimize the total weight of all the graphs, then transform instructions
+// and add or remove copy instructions as called for to implement the
+// solution.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-simd-scalar"
+
+// Hidden option, off by default: force every i64 operation that has an
+// equivalent SIMD instruction to use it. For stress-testing the transformation function.
+static cl::opt<bool>
+TransformAll("aarch64-simd-scalar-force-all",
+ cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
+ cl::init(false), cl::Hidden);
+
+STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
+STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
+STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
+
+#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"
+
+namespace {
+class AArch64AdvSIMDScalar : public MachineFunctionPass {
+ MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+
+private:
+ // isProfitableToTransform - Predicate function to determine whether an
+ // instruction should be transformed to its equivalent AdvSIMD scalar
+ // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+ bool isProfitableToTransform(const MachineInstr &MI) const;
+
+ // transformInstruction - Perform the transformation of an instruction
+ // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
+ // to be the correct register class, minimizing cross-class copies.
+ void transformInstruction(MachineInstr &MI);
+
+ // processMachineBasicBlock - Main optimization loop.
+ bool processMachineBasicBlock(MachineBasicBlock *MBB);
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
+ initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ StringRef getPassName() const override { return AARCH64_ADVSIMD_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+char AArch64AdvSIMDScalar::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
+ AARCH64_ADVSIMD_NAME, false, false)
+
+// isGPR64 - Return true if \p Reg, referenced without a sub-register index,
+// is a 64-bit general purpose register. A virtual register is checked via its
+// register class; a physical register via GPR64 membership.
+static bool isGPR64(unsigned Reg, unsigned SubReg,
+                    const MachineRegisterInfo *MRI) {
+  if (SubReg != 0)
+    return false;
+  const bool IsVirtual = TargetRegisterInfo::isVirtualRegister(Reg);
+  return IsVirtual
+             ? MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass)
+             : AArch64::GPR64RegClass.contains(Reg);
+}
+
+// isFPR64 - Return true if the reference is either an FPR64 register with no
+// sub-register index, or the dsub sub-register of an FPR128 register.
+static bool isFPR64(unsigned Reg, unsigned SubReg,
+                    const MachineRegisterInfo *MRI) {
+  const bool NoSubReg = SubReg == 0;
+  const bool IsDSub = SubReg == AArch64::dsub;
+
+  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // Physical register references just check the register class directly.
+    return (AArch64::FPR64RegClass.contains(Reg) && NoSubReg) ||
+           (AArch64::FPR128RegClass.contains(Reg) && IsDSub);
+  }
+
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+  return (RC->hasSuperClassEq(&AArch64::FPR64RegClass) && NoSubReg) ||
+         (RC->hasSuperClassEq(&AArch64::FPR128RegClass) && IsDSub);
+}
+
+// getSrcFromCopy - If \p MI is a GPR64 <--> FPR64 copy, return a pointer to
+// its source operand (operand 1); otherwise return nullptr. \p SubReg is
+// always written: AArch64::dsub for a lane-zero UMOV, the source operand's
+// sub-register index for a COPY from FPR64 to GPR64, and 0 in all other cases.
+static MachineOperand *getSrcFromCopy(MachineInstr *MI,
+                                      const MachineRegisterInfo *MRI,
+                                      unsigned &SubReg) {
+  SubReg = 0;
+  const unsigned Opc = MI->getOpcode();
+
+  // The "FMOV Xd, Dn" instruction is the typical form.
+  if (Opc == AArch64::FMOVDXr || Opc == AArch64::FMOVXDr)
+    return &MI->getOperand(1);
+
+  // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
+  // these at this stage, but it's easy to check for.
+  if (Opc == AArch64::UMOVvi64) {
+    if (MI->getOperand(2).getImm() != 0)
+      return nullptr;
+    SubReg = AArch64::dsub;
+    return &MI->getOperand(1);
+  }
+
+  // Anything else has to be a plain COPY, either directly to/from FPR64 or
+  // through a dsub subreg reference to an FPR128.
+  if (Opc != AArch64::COPY)
+    return nullptr;
+
+  MachineOperand &Dst = MI->getOperand(0);
+  MachineOperand &Src = MI->getOperand(1);
+  if (isFPR64(Dst.getReg(), Dst.getSubReg(), MRI) &&
+      isGPR64(Src.getReg(), Src.getSubReg(), MRI))
+    return &Src;
+  if (isGPR64(Dst.getReg(), Dst.getSubReg(), MRI) &&
+      isFPR64(Src.getReg(), Src.getSubReg(), MRI)) {
+    SubReg = Src.getSubReg();
+    return &Src;
+  }
+
+  // A COPY between other register classes.
+  return nullptr;
+}
+
+// getTransformOpcode - Map \p Opc to the AdvSIMD opcode we consider
+// transforming it to. Opcodes without an AdvSIMD equivalent are returned
+// unchanged.
+static unsigned getTransformOpcode(unsigned Opc) {
+  // FIXME: Lots more possibilities.
+  switch (Opc) {
+  case AArch64::ADDXrr:
+    return AArch64::ADDv1i64;
+  case AArch64::SUBXrr:
+    return AArch64::SUBv1i64;
+  case AArch64::ANDXrr:
+    return AArch64::ANDv8i8;
+  case AArch64::EORXrr:
+    return AArch64::EORv8i8;
+  case AArch64::ORRXrr:
+    return AArch64::ORRv8i8;
+  default:
+    // No AdvSIMD equivalent, so just return the original opcode.
+    return Opc;
+  }
+}
+
+// isTransformable - Return true if getTransformOpcode maps \p MI's opcode to
+// a different opcode.
+static bool isTransformable(const MachineInstr &MI) {
+  return getTransformOpcode(MI.getOpcode()) != MI.getOpcode();
+}
+
+ // isProfitableToTransform - Decide whether MI should be rewritten to its
+ // AdvSIMD scalar form ("add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example).
+ //
+ // The decision compares the cross-class copies the rewrite would add against
+ // the ones it would make removable. Returns true when the rewrite does not
+ // increase the number of copies, or unconditionally for transformable
+ // instructions when TransformAll is set.
+ bool AArch64AdvSIMDScalar::isProfitableToTransform(
+     const MachineInstr &MI) const {
+   // The common case is an instruction with no SIMD equivalent; bail early.
+   if (!isTransformable(MI))
+     return false;
+
+   // Worst case: one copy per source operand plus one for the result.
+   unsigned NumNewCopies = 3;
+   unsigned NumRemovableCopies = 0;
+
+   // Account for one source register: if its (single) definition is a copy
+   // recognized by getSrcFromCopy, no new copy is needed for it, and if MI is
+   // its only non-debug user that defining copy can be deleted as well.
+   auto AccountForSource = [&](unsigned SrcReg) {
+     if (MRI->def_empty(SrcReg))
+       return;
+     MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(SrcReg);
+     assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+     unsigned SrcSubReg;
+     if (!getSrcFromCopy(&*Def, MRI, SrcSubReg))
+       return;
+     --NumNewCopies;
+     if (MRI->hasOneNonDBGUse(SrcReg))
+       ++NumRemovableCopies;
+   };
+   AccountForSource(MI.getOperand(1).getReg());
+   AccountForSource(MI.getOperand(2).getReg());
+
+   // If any of the uses of the original instruction is a cross class copy,
+   // that's a copy that will be removable if we transform. Likewise, if any
+   // of the uses is a transformable instruction, it's likely the transforms
+   // will chain, enabling us to save a copy there, too. This is an aggressive
+   // heuristic that approximates the graph based cost analysis described above.
+   const unsigned Dst = MI.getOperand(0).getReg();
+   bool AllUsesAreCopies = true;
+   for (auto UI = MRI->use_instr_nodbg_begin(Dst),
+             UE = MRI->use_instr_nodbg_end();
+        UI != UE; ++UI) {
+     unsigned UseSubReg;
+     if (getSrcFromCopy(&*UI, MRI, UseSubReg) || isTransformable(*UI)) {
+       ++NumRemovableCopies;
+       continue;
+     }
+     // An INSERT_SUBREG can still use the FPR64 directly, so it does not
+     // invalidate AllUsesAreCopies. It's preferable to have it use the FPR64
+     // in most cases, as if the source vector is an IMPLICIT_DEF, the
+     // INSERT_SUBREG just goes away entirely. Ditto for a lane insert.
+     const unsigned UseOpc = UI->getOpcode();
+     if (UseOpc != AArch64::INSERT_SUBREG && UseOpc != AArch64::INSvi64gpr)
+       AllUsesAreCopies = false;
+   }
+
+   // If all of the uses of the original destination register are copies to
+   // FPR64, then we won't end up having a new copy back to GPR64 either.
+   if (AllUsesAreCopies)
+     --NumNewCopies;
+
+   // Transform when that does not increase the number of cross-class copies,
+   // or when transformation of everything is being forced.
+   return NumNewCopies <= NumRemovableCopies || TransformAll;
+ }
+
+ // insertCopy - Build "Dst = COPY Src" immediately before MI, reusing MI's
+ // debug location, and count it in NumCopiesInserted. IsKill controls whether
+ // Src is marked killed by the copy. Returns the new COPY instruction.
+ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
+                                 unsigned Dst, unsigned Src, bool IsKill) {
+   MachineBasicBlock &ParentMBB = *MI.getParent();
+   const unsigned SrcFlags = getKillRegState(IsKill);
+   MachineInstrBuilder Copy =
+       BuildMI(ParentMBB, MI, MI.getDebugLoc(), TII->get(AArch64::COPY), Dst)
+           .addReg(Src, SrcFlags);
+   DEBUG(dbgs() << " adding copy: " << *Copy);
+   ++NumCopiesInserted;
+   return Copy;
+ }
+
+ // transformInstruction - Perform the transformation of an instruction
+ // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
+ // to be the correct register class, minimizing cross-class copies.
+ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
+ DEBUG(dbgs() << "Scalar transform: " << MI);
+
+ MachineBasicBlock *MBB = MI.getParent();
+ unsigned OldOpc = MI.getOpcode();
+ unsigned NewOpc = getTransformOpcode(OldOpc);
+ assert(OldOpc != NewOpc && "transform an instruction to itself?!");
+
+ // Check if we need a copy for the source registers.
+ unsigned OrigSrc0 = MI.getOperand(1).getReg();
+ unsigned OrigSrc1 = MI.getOperand(2).getReg();
+ unsigned Src0 = 0, SubReg0; // Src0 == 0 means no reusable source was found.
+ unsigned Src1 = 0, SubReg1; // Likewise for Src1.
+ bool KillSrc0 = false, KillSrc1 = false;
+ if (!MRI->def_empty(OrigSrc0)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc0);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+ // If the first source is defined by a copy that getSrcFromCopy recognizes,
+ // use that copy's source operand directly instead of inserting a new copy.
+ if (MOSrc0) {
+ Src0 = MOSrc0->getReg();
+ KillSrc0 = MOSrc0->isKill();
+ // Src0 is going to be reused, thus, it cannot be killed anymore.
+ MOSrc0->setIsKill(false);
+ if (MRI->hasOneNonDBGUse(OrigSrc0)) { // MI is the only user: delete the copy.
+ assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
+ }
+ }
+ if (!MRI->def_empty(OrigSrc1)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc1);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+ // If the second source is defined by a copy that getSrcFromCopy recognizes,
+ // use that copy's source operand directly instead of inserting a new copy.
+ if (MOSrc1) {
+ Src1 = MOSrc1->getReg();
+ KillSrc1 = MOSrc1->isKill();
+ // Src1 is going to be reused, thus, it cannot be killed anymore.
+ MOSrc1->setIsKill(false);
+ if (MRI->hasOneNonDBGUse(OrigSrc1)) { // MI is the only user: delete the copy.
+ assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
+ }
+ }
+ // If we weren't able to reference the original source directly, create a
+ // copy.
+ if (!Src0) {
+ SubReg0 = 0;
+ Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+ insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0); // KillSrc0 is still false here.
+ KillSrc0 = true; // The new vreg is only read by the instruction built below.
+ }
+ if (!Src1) {
+ SubReg1 = 0;
+ Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+ insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1); // KillSrc1 is still false here.
+ KillSrc1 = true; // The new vreg is only read by the instruction built below.
+ }
+
+ // Create a vreg for the destination.
+ // FIXME: No need to do this if the ultimate user expects an FPR64.
+ // Check for that and avoid the copy if possible.
+ unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+
+ // For now, all of the new instructions have the same simple three-register
+ // form, so no need to special case based on what instruction we're
+ // building.
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
+ .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
+ .addReg(Src1, getKillRegState(KillSrc1), SubReg1);
+
+ // Now copy the result back out to a GPR.
+ // FIXME: Try to avoid this if all uses could actually just use the FPR64
+ // directly.
+ insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);
+
+ // Erase the old instruction.
+ MI.eraseFromParent();
+
+ ++NumScalarInsnsUsed;
+ }
+
+ // processMachineBasicBlock - Main optimization loop. Visits every instruction
+ // of MBB once and rewrites the ones deemed profitable. Returns true if any
+ // instruction was transformed.
+ bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
+   bool MadeChange = false;
+   MachineBasicBlock::iterator Cur = MBB->begin();
+   MachineBasicBlock::iterator End = MBB->end();
+   while (Cur != End) {
+     // Step past the instruction first: transformInstruction() erases it.
+     MachineInstr &MI = *Cur++;
+     if (!isProfitableToTransform(MI))
+       continue;
+     transformInstruction(MI);
+     MadeChange = true;
+   }
+   return MadeChange;
+ }
+
+ // runOnMachineFunction - Pass entry point from PassManager. Caches the
+ // function's register info and instruction info, then processes each basic
+ // block independently. Returns true if anything was changed.
+ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
+   bool Changed = false;
+   DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
+
+   if (skipFunction(*mf.getFunction()))
+     return false;
+
+   MRI = &mf.getRegInfo();
+   TII = mf.getSubtarget().getInstrInfo();
+
+   // Just check things on a one-block-at-a-time basis.
+   for (MachineBasicBlock &MBB : mf)
+     Changed |= processMachineBasicBlock(&MBB);
+   return Changed;
+ }
+
+ // createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
+ // to add the pass to the PassManager. Returns a newly allocated pass object.
+ FunctionPass *llvm::createAArch64AdvSIMDScalar() {
+   FunctionPass *Pass = new AArch64AdvSIMDScalar();
+   return Pass;
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
new file mode 100644
index 000000000000..b2d96a32fd3a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -0,0 +1,716 @@
+//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the AArch64 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "AArch64.h"
+#include "AArch64MCInstLower.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
+#include "InstPrinter/AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+
+ /// AsmPrinter for AArch64: lowers MachineInstrs to MCInsts and hands them to
+ /// the streamer, with manual lowering for stack maps, patch points, XRay
+ /// sleds and a few pseudo instructions.
+ class AArch64AsmPrinter : public AsmPrinter {
+   AArch64MCInstLower MCInstLowering;
+   StackMaps SM;
+   /// Subtarget of the function currently being emitted. Null until
+   /// runOnMachineFunction() has been entered.
+   const AArch64Subtarget *STI;
+
+ public:
+   AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+       : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this),
+         SM(*this), STI(nullptr), AArch64FI(nullptr) {}
+
+   StringRef getPassName() const override { return "AArch64 Assembly Printer"; }
+
+   /// \brief Wrapper for MCInstLowering.lowerOperand() for the
+   /// tblgen'erated pseudo lowering.
+   bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+     return MCInstLowering.lowerOperand(MO, MCOp);
+   }
+
+   void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                      const MachineInstr &MI);
+   void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                        const MachineInstr &MI);
+
+   void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
+   void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
+   void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+
+   void EmitXRayTable();
+   void EmitSled(const MachineInstr &MI, SledKind Kind);
+
+   /// \brief tblgen'erated driver function for lowering simple MI->MC
+   /// pseudo instructions.
+   bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                    const MachineInstr *MI);
+
+   void EmitInstruction(const MachineInstr *MI) override;
+
+   void getAnalysisUsage(AnalysisUsage &AU) const override {
+     AsmPrinter::getAnalysisUsage(AU);
+     AU.setPreservesAll();
+   }
+
+   /// Caches the per-function info and subtarget, runs the generic emission,
+   /// then emits the XRay table for the sleds recorded along the way.
+   bool runOnMachineFunction(MachineFunction &F) override {
+     AArch64FI = F.getInfo<AArch64FunctionInfo>();
+     STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget());
+     bool Result = AsmPrinter::runOnMachineFunction(F);
+     EmitXRayTable();
+     return Result;
+   }
+
+ private:
+   void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
+   bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+   bool printAsmRegInClass(const MachineOperand &MO,
+                           const TargetRegisterClass *RC, bool isVector,
+                           raw_ostream &O);
+
+   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                        unsigned AsmVariant, const char *ExtraCode,
+                        raw_ostream &O) override;
+   bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                              unsigned AsmVariant, const char *ExtraCode,
+                              raw_ostream &O) override;
+
+   void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+   void EmitFunctionBodyEnd() override;
+
+   MCSymbol *GetCPISymbol(unsigned CPID) const override;
+   void EmitEndOfAsmFile(Module &M) override;
+   /// Function info of the function currently being emitted. Null until
+   /// runOnMachineFunction() has been entered.
+   AArch64FunctionInfo *AArch64FI;
+
+   /// \brief Emit the LOHs contained in AArch64FI.
+   void EmitLOHs();
+
+   /// Emit instruction to set float register to zero.
+   void EmitFMov0(const MachineInstr &MI);
+
+   /// Labels emitted in front of LOH related instructions, keyed by
+   /// instruction; filled by EmitInstruction() and read by EmitLOHs().
+   typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
+   MInstToMCSymbol LOHInstToLabel;
+ };
+
+} // end of anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+ /// Lower PATCHABLE_FUNCTION_ENTER by emitting a function-entry XRay sled.
+ void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
+ {
+   const SledKind Kind = SledKind::FUNCTION_ENTER;
+   EmitSled(MI, Kind);
+ }
+
+ /// Lower PATCHABLE_FUNCTION_EXIT by emitting a function-exit XRay sled.
+ void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI)
+ {
+   const SledKind Kind = SledKind::FUNCTION_EXIT;
+   EmitSled(MI, Kind);
+ }
+
+ /// Lower PATCHABLE_TAIL_CALL by emitting a tail-call XRay sled.
+ void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
+ {
+   const SledKind Kind = SledKind::TAIL_CALL;
+   EmitSled(MI, Kind);
+ }
+
+ /// Emit one "xray_instr_map" record per sled recorded for the current
+ /// function, then forget the sleds. Does nothing when no sled was recorded.
+ /// Each record is 32 bytes: sled address (8), function address (8), sled
+ /// kind (1), always-instrument flag (1) and 14 bytes of zero padding.
+ void AArch64AsmPrinter::EmitXRayTable()
+ {
+   //TODO: merge the logic for ELF XRay sleds at a higher level, so to avoid
+   // code duplication as it is now for x86_64, ARM32 and AArch64.
+   if (Sleds.empty())
+     return;
+
+   auto Fn = MF->getFunction();
+   auto PrevSection = OutStreamer->getCurrentSectionOnly();
+
+   // Pick the object-format specific section that holds the map.
+   MCSection *InstrMap = nullptr;
+   if (STI->isTargetELF()) {
+     // A function in a comdat gets its map entries in the same group.
+     InstrMap = Fn->hasComdat()
+                    ? OutContext.getELFSection("xray_instr_map",
+                                               ELF::SHT_PROGBITS,
+                                               ELF::SHF_ALLOC | ELF::SHF_GROUP,
+                                               0, Fn->getComdat()->getName())
+                    : OutContext.getELFSection("xray_instr_map",
+                                               ELF::SHT_PROGBITS,
+                                               ELF::SHF_ALLOC);
+   } else if (STI->isTargetMachO()) {
+     InstrMap = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+                                           SectionKind::getReadOnlyWithRel());
+   } else {
+     llvm_unreachable("Unsupported target");
+   }
+
+   // Before we switch over, we force a reference to a label inside the
+   // xray_instr_map section. Since EmitXRayTable() is always called just
+   // before the function's end, we assume that this is happening after the
+   // last return instruction.
+   //
+   // We then align the reference to 16 byte boundaries, which we determined
+   // experimentally to be beneficial to avoid causing decoder stalls.
+   MCSymbol *MapAnchor = OutContext.createTempSymbol("xray_synthetic_", true);
+   OutStreamer->EmitCodeAlignment(16);
+   OutStreamer->EmitSymbolValue(MapAnchor, 8, false);
+   OutStreamer->SwitchSection(InstrMap);
+   OutStreamer->EmitLabel(MapAnchor);
+   for (const auto &Entry : Sleds) {
+     OutStreamer->EmitSymbolValue(Entry.Sled, 8);
+     OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
+     // The kind and the always-instrument flag are written as one raw byte
+     // each.
+     const auto KindByte = static_cast<uint8_t>(Entry.Kind);
+     const char *KindPtr = reinterpret_cast<const char *>(&KindByte);
+     const char *AlwaysPtr =
+         reinterpret_cast<const char *>(&Entry.AlwaysInstrument);
+     OutStreamer->EmitBytes(StringRef(KindPtr, 1));
+     OutStreamer->EmitBytes(StringRef(AlwaysPtr, 1));
+     OutStreamer->EmitZeros(14);
+   }
+   OutStreamer->SwitchSection(PrevSection);
+
+   Sleds.clear();
+ }
+
+ /// Emit an XRay sled for MI and record it with the given Kind.
+ ///
+ /// The emitted pattern is:
+ ///
+ ///   .Lxray_sled_N:
+ ///     ALIGN
+ ///     B #32
+ ///     ; 7 NOP instructions (28 bytes)
+ ///   .tmpN
+ ///
+ /// The 28 bytes (7 instructions) are needed because at runtime the full
+ /// 32 bytes (8 instructions) get patched over with the following pattern:
+ ///
+ ///   STP X0, X30, [SP, #-16]! ; push X0 and the link register to the stack
+ ///   LDR W0, #12 ; W0 := function ID
+ ///   LDR X16,#12 ; X16 := addr of __xray_FunctionEntry or __xray_FunctionExit
+ ///   BLR X16 ; call the tracing trampoline
+ ///   ;DATA: 32 bits of function ID
+ ///   ;DATA: lower 32 bits of the address of the trampoline
+ ///   ;DATA: higher 32 bits of the address of the trampoline
+ ///   LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack
+ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
+ {
+   static const int8_t NoopsInSledCount = 7;
+
+   OutStreamer->EmitCodeAlignment(4);
+   auto SledLabel = OutContext.createTempSymbol("xray_sled_", true);
+   OutStreamer->EmitLabel(SledLabel);
+   auto AfterSled = OutContext.createTempSymbol();
+
+   // Emit "B #32" instruction, which jumps over the next 28 bytes.
+   // The operand has to be the number of 4-byte instructions to jump over,
+   // including the current instruction.
+   EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::B).addImm(8));
+
+   // Fill the rest of the sled with NOPs (HINT #0).
+   for (int8_t Remaining = NoopsInSledCount; Remaining > 0; --Remaining)
+     EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+
+   OutStreamer->EmitLabel(AfterSled);
+   recordSled(SledLabel, MI, Kind);
+ }
+
+ /// End-of-module hook: emits the Darwin subsections-via-symbols flag on MachO
+ /// and serializes the stack map records collected by LowerSTACKMAP() and
+ /// LowerPATCHPOINT().
+ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+   const Triple &TT = TM.getTargetTriple();
+   if (TT.isOSBinFormatMachO()) {
+     // Funny Darwin hack: This flag tells the linker that no global symbols
+     // contain code that falls through to other global symbols (e.g. the obvious
+     // implementation of multiple entry points). If this doesn't occur, the
+     // linker can safely perform dead code stripping. Since LLVM never
+     // generates code that does this, it is always safe to set.
+     OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+     SM.serializeToStackMapSection();
+   } else if (TT.isOSBinFormatELF()) {
+     // Stack maps are recorded regardless of the object format, so they have
+     // to be written out for ELF as well; otherwise they are silently lost.
+     SM.serializeToStackMapSection();
+   }
+ }
+
+ /// Emit one LOH directive per entry of the function's LOH container, using
+ /// the labels that EmitInstruction() attached to the involved instructions.
+ void AArch64AsmPrinter::EmitLOHs() {
+   for (const auto &Directive : AArch64FI->getLOHContainer()) {
+     // Translate the directive's instructions into their labels.
+     SmallVector<MCSymbol *, 3> Labels;
+     for (const MachineInstr *Arg : Directive.getArgs()) {
+       auto Found = LOHInstToLabel.find(Arg);
+       assert(Found != LOHInstToLabel.end() &&
+              "Label hasn't been inserted for LOH related instruction");
+       Labels.push_back(Found->second);
+     }
+     OutStreamer->EmitLOHDirective(Directive.getKind(), Labels);
+   }
+ }
+
+ /// End-of-body hook: emit the LOH directives if the function has any LOH
+ /// related instructions.
+ void AArch64AsmPrinter::EmitFunctionBodyEnd() {
+   if (AArch64FI->getLOHRelated().empty())
+     return;
+   EmitLOHs();
+ }
+
+ /// GetCPISymbol - Return the symbol for the specified constant pool entry,
+ /// named <prefix>CPI<function number>_<CPID>.
+ MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
+   // Darwin uses a linker-private symbol name for constant-pools (to
+   // avoid addends on the relocation?), ELF has no such concept and
+   // uses a normal private symbol.
+   const auto &DL = getDataLayout();
+   StringRef Prefix = DL.getLinkerPrivateGlobalPrefix();
+   if (Prefix.empty())
+     Prefix = DL.getPrivateGlobalPrefix();
+
+   return OutContext.getOrCreateSymbol(Twine(Prefix) + "CPI" +
+                                       Twine(getFunctionNumber()) + "_" +
+                                       Twine(CPID));
+ }
+
+ /// Print operand OpNum of MI to O. Only register, immediate and global
+ /// address operands are supported; anything else is unreachable.
+ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+   const MachineOperand &Operand = MI->getOperand(OpNum);
+   switch (Operand.getType()) {
+   case MachineOperand::MO_Immediate:
+     // Immediates are printed with a leading '#'.
+     O << '#' << Operand.getImm();
+     break;
+   case MachineOperand::MO_Register: {
+     const unsigned PhysReg = Operand.getReg();
+     assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
+     assert(!Operand.getSubReg() && "Subregs should be eliminated!");
+     O << AArch64InstPrinter::getRegisterName(PhysReg);
+     break;
+   }
+   case MachineOperand::MO_GlobalAddress: {
+     MCSymbol *GlobalSym = getSymbol(Operand.getGlobal());
+
+     // FIXME: Can we get anything other than a plain symbol here?
+     assert(!Operand.getTargetFlags() && "Unknown operand target flag!");
+
+     GlobalSym->print(O, MAI);
+     printOffset(Operand.getOffset(), O);
+     break;
+   }
+   default:
+     llvm_unreachable("<unknown operand type>");
+   }
+ }
+
+ /// Print the register in MO as its W ('w') or X ('x') variant. Returns true
+ /// (error) for any other Mode, false on success.
+ bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+                                           raw_ostream &O) {
+   unsigned Reg = MO.getReg();
+   if (Mode == 'w')
+     Reg = getWRegFromXReg(Reg);
+   else if (Mode == 'x')
+     Reg = getXRegFromWReg(Reg);
+   else
+     return true; // Unknown mode.
+
+   O << AArch64InstPrinter::getRegisterName(Reg);
+   return false;
+ }
+
+ // Prints the register of class RC that has the same encoding value as the
+ // register in MO, optionally using the vector register name. This should not
+ // be used for cross class printing. Always returns false (success).
+ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
+                                            const TargetRegisterClass *RC,
+                                            bool isVector, raw_ostream &O) {
+   assert(MO.isReg() && "Should only get here with a register!");
+   const TargetRegisterInfo *TRI = STI->getRegisterInfo();
+   const unsigned SrcReg = MO.getReg();
+   const unsigned Aliased = RC->getRegister(TRI->getEncodingValue(SrcReg));
+   assert(TRI->regsOverlap(Aliased, SrcReg));
+   const unsigned AltIdx = isVector ? AArch64::vreg : AArch64::NoRegAltName;
+   O << AArch64InstPrinter::getRegisterName(Aliased, AltIdx);
+   return false;
+ }
+
+ /// Print inline-asm operand OpNum of MI, honoring the single-letter modifier
+ /// in ExtraCode if present. Returns false on success and true for an unknown
+ /// modifier.
+ ///
+ /// Supported modifiers: 'w'/'x' print a W/X register (an immediate zero
+ /// prints as WZR/XZR), 'b'/'h'/'s'/'d'/'q' print the register of that FP
+ /// size with the same encoding. Without a modifier, GPRs print as X registers
+ /// and other registers as V registers.
+ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                         unsigned AsmVariant,
+                                         const char *ExtraCode, raw_ostream &O) {
+   const MachineOperand &MO = MI->getOperand(OpNum);
+
+   // First try the generic code, which knows about modifiers like 'c' and 'n'.
+   if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+     return false;
+
+   // Does this asm operand have a single letter operand modifier?
+   if (ExtraCode && ExtraCode[0]) {
+     if (ExtraCode[1] != 0)
+       return true; // Unknown modifier.
+
+     const char Modifier = ExtraCode[0];
+     const TargetRegisterClass *FPClass = nullptr;
+     switch (Modifier) {
+     case 'w': // Print W register
+     case 'x': // Print X register
+       if (MO.isReg())
+         return printAsmMRegister(MO, Modifier, O);
+       if (MO.isImm() && MO.getImm() == 0) {
+         const unsigned ZeroReg = Modifier == 'w' ? AArch64::WZR : AArch64::XZR;
+         O << AArch64InstPrinter::getRegisterName(ZeroReg);
+         return false;
+       }
+       printOperand(MI, OpNum, O);
+       return false;
+     case 'b': // Print B register.
+       FPClass = &AArch64::FPR8RegClass;
+       break;
+     case 'h': // Print H register.
+       FPClass = &AArch64::FPR16RegClass;
+       break;
+     case 's': // Print S register.
+       FPClass = &AArch64::FPR32RegClass;
+       break;
+     case 'd': // Print D register.
+       FPClass = &AArch64::FPR64RegClass;
+       break;
+     case 'q': // Print Q register.
+       FPClass = &AArch64::FPR128RegClass;
+       break;
+     default:
+       return true; // Unknown modifier.
+     }
+
+     // Only the FP size modifiers get here.
+     if (MO.isReg())
+       return printAsmRegInClass(MO, FPClass, false /* vector */, O);
+     printOperand(MI, OpNum, O);
+     return false;
+   }
+
+   // According to ARM, we should emit x and v registers unless we have a
+   // modifier.
+   if (!MO.isReg()) {
+     printOperand(MI, OpNum, O);
+     return false;
+   }
+
+   // If this is a w or x register, print an x register.
+   const unsigned Reg = MO.getReg();
+   if (AArch64::GPR32allRegClass.contains(Reg) ||
+       AArch64::GPR64allRegClass.contains(Reg))
+     return printAsmMRegister(MO, 'x', O);
+
+   // If this is a b, h, s, d, or q register, print it as a v register.
+   return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
+                             O);
+ }
+
+ /// Print an inline-asm memory operand as "[<register>]". No modifiers are
+ /// supported: any non-empty ExtraCode yields true (error).
+ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                               unsigned OpNum,
+                                               unsigned AsmVariant,
+                                               const char *ExtraCode,
+                                               raw_ostream &O) {
+   const bool HasModifier = ExtraCode && ExtraCode[0];
+   if (HasModifier)
+     return true; // Unknown modifier.
+
+   const MachineOperand &Base = MI->getOperand(OpNum);
+   assert(Base.isReg() && "unexpected inline asm memory operand");
+   O << "[" << AArch64InstPrinter::getRegisterName(Base.getReg()) << "]";
+   return false;
+ }
+
+ /// Write a "DEBUG_VALUE: <variable> <- [<reg>+<imm>]+..." comment for a
+ /// four-operand DBG_VALUE to OS.
+ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                                raw_ostream &OS) {
+   const unsigned NumOperands = MI->getNumOperands();
+   assert(NumOperands == 4);
+   const unsigned VarIdx = NumOperands - 2;
+
+   OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+   // The variable name comes from the metadata operand.
+   OS << cast<DILocalVariable>(MI->getOperand(VarIdx).getMetadata())->getName()
+      << " <- ";
+
+   // Frame address. Currently handles register +- offset only.
+   assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+   OS << '[';
+   printOperand(MI, 0, OS);
+   OS << '+';
+   printOperand(MI, 1, OS);
+   OS << ']' << "+";
+   // NOTE(review): VarIdx is the operand read as metadata above, while
+   // printOperand() only handles register, immediate and global address
+   // operands -- confirm what is meant to be printed here.
+   printOperand(MI, VarIdx, OS);
+ }
+
+ /// Record the stack map for MI and emit its NOP shadow. The requested number
+ /// of patch bytes is reduced by 4 for every following instruction in the
+ /// block, up to the first call, DBG_VALUE, PATCHPOINT, STACKMAP or the end of
+ /// the block; only the remainder is emitted as NOPs.
+ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                                       const MachineInstr &MI) {
+   unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();
+
+   SM.recordStackMap(MI);
+   assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+
+   const MachineBasicBlock &MBB = *MI.getParent();
+   // True when the instruction at It must not be counted as part of the shadow.
+   auto EndsShadow = [&MBB](MachineBasicBlock::const_iterator It) -> bool {
+     if (It == MBB.end() || It->isCall())
+       return true;
+     const unsigned Opc = It->getOpcode();
+     return Opc == AArch64::DBG_VALUE || Opc == TargetOpcode::PATCHPOINT ||
+            Opc == TargetOpcode::STACKMAP;
+   };
+
+   // Scan ahead to trim the shadow.
+   MachineBasicBlock::const_iterator Next(MI);
+   ++Next;
+   for (; NumNOPBytes > 0; ++Next, NumNOPBytes -= 4)
+     if (EndsShadow(Next))
+       break;
+
+   // Emit nops.
+   for (unsigned Emitted = 0; Emitted < NumNOPBytes; Emitted += 4)
+     EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+ }
+
+ // Lower a patchpoint of the form:
+ // [<def>], <id>, <numBytes>, <target>, <numArgs>
+ //
+ // Records the patch point, then, for a non-zero call target, materializes the
+ // 48-bit address into the scratch register with MOVZ/MOVK/MOVK and calls it
+ // with BLR (16 bytes). The rest of <numBytes> is filled with NOPs.
+ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                                         const MachineInstr &MI) {
+   SM.recordPatchPoint(MI);
+
+   PatchPointOpers Opers(&MI);
+
+   unsigned EncodedBytes = 0;
+   if (int64_t CallTarget = Opers.getCallTarget().getImm()) {
+     assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+            "High 16 bits of call target should be zero.");
+     unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+     EncodedBytes = 16;
+
+     // Materialize the jump address, 16 bits at a time from the top.
+     const int64_t Bits47To32 = (CallTarget >> 32) & 0xFFFF;
+     const int64_t Bits31To16 = (CallTarget >> 16) & 0xFFFF;
+     const int64_t Bits15To0 = CallTarget & 0xFFFF;
+     EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
+                                     .addReg(ScratchReg)
+                                     .addImm(Bits47To32)
+                                     .addImm(32));
+     EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
+                                     .addReg(ScratchReg)
+                                     .addReg(ScratchReg)
+                                     .addImm(Bits31To16)
+                                     .addImm(16));
+     EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
+                                     .addReg(ScratchReg)
+                                     .addReg(ScratchReg)
+                                     .addImm(Bits15To0)
+                                     .addImm(0));
+     EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg));
+   }
+
+   // Emit padding.
+   const unsigned NumBytes = Opers.getNumPatchBytes();
+   assert(NumBytes >= EncodedBytes &&
+          "Patchpoint can't request size less than the length of a call.");
+   assert((NumBytes - EncodedBytes) % 4 == 0 &&
+          "Invalid number of NOP bytes requested!");
+   for (unsigned Filled = EncodedBytes; Filled < NumBytes; Filled += 4)
+     EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+ }
+
+ /// Emit the instruction that zeroes the FP register defined by an FMOVS0 or
+ /// FMOVD0 pseudo. With zero-cycle zeroing this is "MOVIv2d_ns Qn, #0" on the
+ /// Q register matching the S/D destination; otherwise it is an FMOV from
+ /// WZR/XZR.
+ void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
+   unsigned DestReg = MI.getOperand(0).getReg();
+
+   if (STI->hasZeroCycleZeroing()) {
+     // Convert S/D register to corresponding Q register
+     const bool IsSReg = AArch64::S0 <= DestReg && DestReg <= AArch64::S31;
+     if (IsSReg) {
+       DestReg = AArch64::Q0 + (DestReg - AArch64::S0);
+     } else {
+       assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
+       DestReg = AArch64::Q0 + (DestReg - AArch64::D0);
+     }
+     EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::MOVIv2d_ns)
+                                      .addReg(DestReg)
+                                      .addImm(0));
+     return;
+   }
+
+   // Pick the GPR-to-FPR move and the zero register of matching width.
+   unsigned MoveOpc;
+   unsigned ZeroReg;
+   switch (MI.getOpcode()) {
+   default: llvm_unreachable("Unexpected opcode");
+   case AArch64::FMOVS0:
+     MoveOpc = AArch64::FMOVWSr;
+     ZeroReg = AArch64::WZR;
+     break;
+   case AArch64::FMOVD0:
+     MoveOpc = AArch64::FMOVXDr;
+     ZeroReg = AArch64::XZR;
+     break;
+   }
+   EmitToStreamer(*OutStreamer,
+                  MCInstBuilder(MoveOpc).addReg(DestReg).addReg(ZeroReg));
+ }
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "AArch64GenMCPseudoLowering.inc"
+
+ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ // Do any auto-generated pseudo lowerings; if one applied, nothing else is emitted for MI.
+ if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ return;
+
+ if (AArch64FI->getLOHRelated().count(MI)) {
+ // Generate a label for LOH related instruction; EmitLOHs() looks it up later.
+ MCSymbol *LOHLabel = createTempSymbol("loh");
+ // Associate the instruction with the label
+ LOHInstToLabel[MI] = LOHLabel;
+ OutStreamer->EmitLabel(LOHLabel);
+ }
+
+ // Do any manual lowerings.
+ switch (MI->getOpcode()) {
+ default:
+ break; // Not special-cased: handled by the generic lowering after the switch.
+ case AArch64::DBG_VALUE: {
+ if (isVerbose() && OutStreamer->hasRawTextSupport()) { // Only a comment in verbose text output.
+ SmallString<128> TmpStr;
+ raw_svector_ostream OS(TmpStr);
+ PrintDebugValueComment(MI, OS);
+ OutStreamer->EmitRawText(StringRef(OS.str()));
+ }
+ return;
+ }
+
+ // Tail calls use pseudo instructions so they have the proper code-gen
+ // attributes (isCall, isReturn, etc.). We lower them to the real
+ // instruction here.
+ case AArch64::TCRETURNri: { // Register target: emitted as BR.
+ MCInst TmpInst;
+ TmpInst.setOpcode(AArch64::BR);
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case AArch64::TCRETURNdi: { // Direct target: emitted as B.
+ MCOperand Dest;
+ MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
+ MCInst TmpInst;
+ TmpInst.setOpcode(AArch64::B);
+ TmpInst.addOperand(Dest);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case AArch64::TLSDESC_CALLSEQ: {
+ /// lower this to:
+ /// adrp x0, :tlsdesc:var
+ /// ldr x1, [x0, #:tlsdesc_lo12:var]
+ /// add x0, x0, #:tlsdesc_lo12:var
+ /// .tlsdesccall var
+ /// blr x1
+ /// (TPIDR_EL0 offset now in x0)
+ const MachineOperand &MO_Sym = MI->getOperand(0);
+ MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym); // Copies that get their own target flags.
+ MCOperand Sym, SymTLSDescLo12, SymTLSDesc;
+ MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
+ MCInstLowering.lowerOperand(MO_Sym, Sym);
+ MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
+ MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc);
+
+ MCInst Adrp;
+ Adrp.setOpcode(AArch64::ADRP);
+ Adrp.addOperand(MCOperand::createReg(AArch64::X0));
+ Adrp.addOperand(SymTLSDesc);
+ EmitToStreamer(*OutStreamer, Adrp);
+
+ MCInst Ldr;
+ Ldr.setOpcode(AArch64::LDRXui);
+ Ldr.addOperand(MCOperand::createReg(AArch64::X1));
+ Ldr.addOperand(MCOperand::createReg(AArch64::X0));
+ Ldr.addOperand(SymTLSDescLo12);
+ Ldr.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, Ldr);
+
+ MCInst Add;
+ Add.setOpcode(AArch64::ADDXri);
+ Add.addOperand(MCOperand::createReg(AArch64::X0));
+ Add.addOperand(MCOperand::createReg(AArch64::X0));
+ Add.addOperand(SymTLSDescLo12);
+ Add.addOperand(MCOperand::createImm(AArch64_AM::getShiftValue(0)));
+ EmitToStreamer(*OutStreamer, Add);
+
+ // Emit a relocation-annotation. This expands to no code, but requests
+ // the following instruction gets an R_AARCH64_TLSDESC_CALL.
+ MCInst TLSDescCall;
+ TLSDescCall.setOpcode(AArch64::TLSDESCCALL);
+ TLSDescCall.addOperand(Sym);
+ EmitToStreamer(*OutStreamer, TLSDescCall);
+
+ MCInst Blr;
+ Blr.setOpcode(AArch64::BLR);
+ Blr.addOperand(MCOperand::createReg(AArch64::X1));
+ EmitToStreamer(*OutStreamer, Blr);
+
+ return;
+ }
+
+ case AArch64::FMOVS0:
+ case AArch64::FMOVD0:
+ EmitFMov0(*MI); // Zeroing pseudos: the real instruction depends on the subtarget.
+ return;
+
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(*OutStreamer, SM, *MI);
+
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(*OutStreamer, SM, *MI);
+
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+ LowerPATCHABLE_FUNCTION_ENTER(*MI);
+ return;
+
+ case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
+ LowerPATCHABLE_FUNCTION_EXIT(*MI);
+ return;
+
+ case TargetOpcode::PATCHABLE_TAIL_CALL:
+ LowerPATCHABLE_TAIL_CALL(*MI);
+ return;
+ }
+
+ // Finally, do the automated lowerings for everything else.
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ }
+
+ // Force static initialization: register this printer for the little-endian
+ // AArch64, big-endian AArch64 and ARM64 targets.
+ extern "C" void LLVMInitializeAArch64AsmPrinter() {
+   RegisterAsmPrinter<AArch64AsmPrinter> RegisterLE(getTheAArch64leTarget());
+   RegisterAsmPrinter<AArch64AsmPrinter> RegisterBE(getTheAArch64beTarget());
+   RegisterAsmPrinter<AArch64AsmPrinter> RegisterARM64(getTheARM64Target());
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
new file mode 100644
index 000000000000..a4950af32097
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -0,0 +1,317 @@
+//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64CallLowering.h"
+#include "AArch64ISelLowering.h"
+
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+/// Construct the AArch64 call lowering, forwarding the address of \p TLI to the
+/// generic CallLowering base class.
+AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
+    : CallLowering(&TLI) {}
+
+// The value handlers below are implementation details of this file. They live
+// in an anonymous namespace so their (fairly generic) names get internal
+// linkage and cannot collide with same-named types in other translation units.
+namespace {
+
+/// Shared logic for values flowing *into* the code being built: formal
+/// parameters and the results of calls. Subclasses only decide how a physical
+/// register that carries such a value is recorded.
+struct IncomingArgHandler : public CallLowering::ValueHandler {
+  IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+      : ValueHandler(MIRBuilder, MRI) {}
+
+  /// Create a fixed stack object of \p Size bytes at \p Offset, describe it in
+  /// \p MPO, and return a fresh 64-bit pointer vreg holding its frame-index
+  /// address.
+  unsigned getStackAddress(uint64_t Size, int64_t Offset,
+                           MachinePointerInfo &MPO) override {
+    auto &MFI = MIRBuilder.getMF().getFrameInfo();
+    int FI = MFI.CreateFixedObject(Size, Offset, true);
+    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+    unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
+    MIRBuilder.buildFrameIndex(AddrReg, FI);
+    return AddrReg;
+  }
+
+  /// Record \p PhysReg as used (subclass-specific) and copy it into
+  /// \p ValVReg.
+  void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+                        CCValAssign &VA) override {
+    markPhysRegUsed(PhysReg);
+    MIRBuilder.buildCopy(ValVReg, PhysReg);
+    // FIXME: assert extension
+  }
+
+  /// Load \p Size bytes from \p Addr into \p ValVReg. The memory operand is
+  /// flagged as an invariant load.
+  void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+                            MachinePointerInfo &MPO, CCValAssign &VA) override {
+    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
+        0);
+    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+  }
+
+  /// How the physical register gets marked varies between formal
+  /// parameters (it's a basic-block live-in), and a call instruction
+  /// (it's an implicit-def of the BL).
+  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+};
+
+/// Incoming handler for formal parameters: argument registers become live-ins
+/// of the block currently being built.
+struct FormalArgHandler : public IncomingArgHandler {
+  FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+      : IncomingArgHandler(MIRBuilder, MRI) {}
+
+  void markPhysRegUsed(unsigned PhysReg) override {
+    MIRBuilder.getMBB().addLiveIn(PhysReg);
+  }
+};
+
+/// Incoming handler for call results: result registers become implicit defs
+/// of the call instruction referred to by MIB.
+struct CallReturnHandler : public IncomingArgHandler {
+  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                    MachineInstrBuilder MIB)
+      : IncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+  void markPhysRegUsed(unsigned PhysReg) override {
+    MIB.addDef(PhysReg, RegState::Implicit);
+  }
+
+  /// The call instruction that receives the implicit defs.
+  MachineInstrBuilder MIB;
+};
+
+/// Handler for values flowing *out* of the code being built: call arguments
+/// and return values. Registers carrying them become implicit uses of MIB.
+struct OutgoingArgHandler : public CallLowering::ValueHandler {
+  OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                     MachineInstrBuilder MIB)
+      : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+  /// Compute SP + \p Offset into a fresh pointer vreg, describe the slot in
+  /// \p MPO, and return that vreg. \p Size is not used here.
+  unsigned getStackAddress(uint64_t Size, int64_t Offset,
+                           MachinePointerInfo &MPO) override {
+    LLT p0 = LLT::pointer(0, 64);
+    LLT s64 = LLT::scalar(64);
+    unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+    MIRBuilder.buildCopy(SPReg, AArch64::SP);
+
+    unsigned OffsetReg = MRI.createGenericVirtualRegister(s64);
+    MIRBuilder.buildConstant(OffsetReg, Offset);
+
+    unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+    MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+
+    MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+    return AddrReg;
+  }
+
+  /// Add \p PhysReg as an implicit use of MIB, then copy the (possibly
+  /// extended, as directed by \p VA) value into it.
+  void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+                        CCValAssign &VA) override {
+    MIB.addUse(PhysReg, RegState::Implicit);
+    unsigned ExtReg = extendRegister(ValVReg, VA);
+    MIRBuilder.buildCopy(PhysReg, ExtReg);
+  }
+
+  /// Store \p Size bytes of \p ValVReg to \p Addr.
+  void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+                            MachinePointerInfo &MPO, CCValAssign &VA) override {
+    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+        MPO, MachineMemOperand::MOStore, Size, 0);
+    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
+  }
+
+  /// The instruction (call or return) that receives the implicit uses.
+  MachineInstrBuilder MIB;
+};
+
+} // end anonymous namespace
+
+/// Break \p OrigArg into the value types ComputeValueVTs reports for its IR
+/// type and append one ArgInfo per piece to \p SplitArgs.
+///
+/// - No pieces (e.g. an empty aggregate): nothing is appended and
+///   \p PerformArgSplit is not called.
+/// - One piece: the original register is reused with the piece's type, and
+///   \p PerformArgSplit is not called.
+/// - Several pieces: a fresh generic vreg is created for each one, then
+///   \p PerformArgSplit is invoked once with those vregs and the bit offset of
+///   each piece so the caller can emit the extract/sequence tying them to the
+///   original register.
+void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+                                            SmallVectorImpl<ArgInfo> &SplitArgs,
+                                            const DataLayout &DL,
+                                            MachineRegisterInfo &MRI,
+                                            SplitArgTy PerformArgSplit) const {
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  LLVMContext &Ctx = OrigArg.Ty->getContext();
+
+  SmallVector<EVT, 4> SplitVTs;
+  SmallVector<uint64_t, 4> Offsets;
+  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+
+  // Nothing to pass. Bail out before the multi-piece path, which would
+  // otherwise index SplitArgs one past its end and hand empty lists to
+  // PerformArgSplit.
+  if (SplitVTs.empty())
+    return;
+
+  if (SplitVTs.size() == 1) {
+    // No splitting to do, but we want to replace the original type (e.g. [1 x
+    // double] -> double).
+    SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
+                           OrigArg.Flags);
+    return;
+  }
+
+  // Create one vreg per piece, remembering the registers as we go rather than
+  // re-reading them from SplitArgs afterwards.
+  SmallVector<unsigned, 8> SplitRegs;
+  for (EVT SplitVT : SplitVTs) {
+    // FIXME: set split flags if they're actually used (e.g. i128 on AAPCS).
+    Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
+    unsigned Reg = MRI.createGenericVirtualRegister(LLT{*SplitTy, DL});
+    SplitArgs.push_back(ArgInfo{Reg, SplitTy, OrigArg.Flags});
+    SplitRegs.push_back(Reg);
+  }
+
+  // ComputeValueVTs reports byte offsets; the callback expects bit offsets.
+  SmallVector<uint64_t, 4> BitOffsets;
+  for (auto Offset : Offsets)
+    BitOffsets.push_back(Offset * 8);
+
+  PerformArgSplit(SplitRegs, BitOffsets);
+}
+
+/// Emit a RET_ReallyLR, first placing \p VReg (the vreg holding \p Val) where
+/// the return calling convention wants it. \p Val and \p VReg must either both
+/// be set or both be null/zero. Returns false if the value could not be
+/// assigned; the return instruction is inserted either way.
+bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+                                      const Value *Val, unsigned VReg) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = *MF.getFunction();
+
+  // Keep the return detached for now: the copies into the return registers
+  // have to be emitted ahead of it, and the handler adds implicit uses to it.
+  auto RetMI = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR);
+  assert(!Val == !VReg && "Return value without a vreg");
+
+  if (!VReg) {
+    // Void return: nothing to marshal.
+    MIRBuilder.insertInstr(RetMI);
+    return true;
+  }
+
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &DL = F.getParent()->getDataLayout();
+
+  ArgInfo RetInfo{VReg, Val->getType()};
+  setArgFlags(RetInfo, AttributeSet::ReturnIndex, DL, F);
+
+  // Pull the individual pieces out of the returned vreg when it is split.
+  auto ExtractPieces = [&](ArrayRef<unsigned> Regs,
+                           ArrayRef<uint64_t> Offsets) {
+    MIRBuilder.buildExtract(Regs, Offsets, VReg);
+  };
+
+  SmallVector<ArgInfo, 8> Pieces;
+  splitToValueTypes(RetInfo, Pieces, DL, MRI, ExtractPieces);
+
+  OutgoingArgHandler Handler(MIRBuilder, MRI, RetMI);
+  const bool Assigned =
+      handleAssignments(MIRBuilder, RetAssignFn, Pieces, Handler);
+
+  MIRBuilder.insertInstr(RetMI);
+  return Assigned;
+}
+
+/// Bind the formal parameters of \p F to \p VRegs (one vreg per IR argument, in
+/// order), copying or loading each from where the calling convention places
+/// it. Returns false if any argument could not be assigned.
+bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+                                               const Function &F,
+                                               ArrayRef<unsigned> VRegs) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  MachineBasicBlock &MBB = MIRBuilder.getMBB();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &DL = F.getParent()->getDataLayout();
+
+  SmallVector<ArgInfo, 8> Pieces;
+  unsigned ArgNo = 0;
+  for (auto &Arg : F.getArgumentList()) {
+    const unsigned ArgVReg = VRegs[ArgNo];
+    ArgInfo OrigArg{ArgVReg, Arg.getType()};
+    // Attribute indices for parameters are 1-based.
+    setArgFlags(OrigArg, ArgNo + 1, DL, F);
+
+    // For a split argument, rebuild the original vreg from its pieces.
+    auto Reassemble = [&](ArrayRef<unsigned> Regs,
+                          ArrayRef<uint64_t> Offsets) {
+      MIRBuilder.buildSequence(ArgVReg, Regs, Offsets);
+    };
+    splitToValueTypes(OrigArg, Pieces, DL, MRI, Reassemble);
+    ++ArgNo;
+  }
+
+  // The copies/loads that define the pieces go in front of whatever the block
+  // already contains (including any sequences emitted above).
+  if (!MBB.empty())
+    MIRBuilder.setInstr(*MBB.begin());
+
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  CCAssignFn *AssignFn =
+      TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+
+  FormalArgHandler Handler(MIRBuilder, MRI);
+  const bool Assigned =
+      handleAssignments(MIRBuilder, AssignFn, Pieces, Handler);
+  if (!Assigned)
+    return false;
+
+  // Restore the insertion point to the end of the block.
+  MIRBuilder.setMBB(MBB);
+  return true;
+}
+
+/// Lower a call to \p Callee: marshal \p OrigArgs into their ABI locations,
+/// emit a BLR (register callee) or BL (any other callee operand), and copy the
+/// result, if any, back into \p OrigRet.Reg. Returns false if an argument or
+/// the return value could not be assigned.
+///
+/// NOTE(review): both assignment functions and the call-preserved mask are
+/// selected from the *calling* function's calling convention
+/// (F.getCallingConv()), and IsVarArg is hard-coded to false -- confirm that
+/// this is intended.
+bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+                                    const MachineOperand &Callee,
+                                    const ArgInfo &OrigRet,
+                                    ArrayRef<ArgInfo> OrigArgs) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = *MF.getFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &DL = F.getParent()->getDataLayout();
+
+  // Split every argument into ABI-sized pieces, extracting the pieces from
+  // the original vreg where an argument needs more than one.
+  SmallVector<ArgInfo, 8> SplitArgs;
+  for (auto &OrigArg : OrigArgs) {
+    splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
+                        MIRBuilder.buildExtract(Regs, Offsets, OrigArg.Reg);
+                      });
+  }
+
+  // Find out which ABI gets to decide where things go.
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  CCAssignFn *CallAssignFn =
+      TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+
+  // Create a temporarily-floating call instruction so we can add the implicit
+  // uses of arg registers.
+  auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
+                                                          : AArch64::BL);
+  MIB.addOperand(Callee);
+
+  // Tell the call which registers are clobbered.
+  auto TRI = MF.getSubtarget().getRegisterInfo();
+  MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv()));
+
+  // Do the actual argument marshalling.
+  OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
+  if (!handleAssignments(MIRBuilder, CallAssignFn, SplitArgs, Handler))
+    return false;
+
+  // Now we can add the actual call instruction to the correct basic block.
+  MIRBuilder.insertInstr(MIB);
+
+  // If Callee is a reg, since it is used by a target specific
+  // instruction, it must have a register class matching the
+  // constraint of that instruction.
+  if (Callee.isReg())
+    MIB->getOperand(0).setReg(constrainOperandRegClass(
+        MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+        *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(),
+        Callee.getReg(), 0));
+
+  // Finally we can copy the returned value back into its virtual-register. In
+  // symmetry with the arguments, the physical register must be an
+  // implicit-define of the call instruction.
+  CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
+  if (OrigRet.Reg) {
+    SplitArgs.clear();
+
+    // The split callback only records the pieces; the sequence rebuilding
+    // OrigRet.Reg is emitted further down, after the copies out of the
+    // physical registers.
+    SmallVector<uint64_t, 8> RegOffsets;
+    SmallVector<unsigned, 8> SplitRegs;
+    splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
+                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
+                        std::copy(Offsets.begin(), Offsets.end(),
+                                  std::back_inserter(RegOffsets));
+                        std::copy(Regs.begin(), Regs.end(),
+                                  std::back_inserter(SplitRegs));
+                      });
+
+    CallReturnHandler Handler(MIRBuilder, MRI, MIB);
+    if (!handleAssignments(MIRBuilder, RetAssignFn, SplitArgs, Handler))
+      return false;
+
+    // RegOffsets stays empty when the return value was not split.
+    if (!RegOffsets.empty())
+      MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
new file mode 100644
index 000000000000..ce6676249df6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
@@ -0,0 +1,56 @@
+//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+class AArch64TargetLowering;
+
+/// AArch64 implementation of the GlobalISel CallLowering interface.
+class AArch64CallLowering : public CallLowering {
+public:
+  AArch64CallLowering(const AArch64TargetLowering &TLI);
+
+  /// Lower the return of \p Val, held in \p VReg. Both are null/zero for a
+  /// void return.
+  bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+                   unsigned VReg) const override;
+
+  /// Bind the formal arguments of \p F to \p VRegs, one vreg per argument.
+  bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+                            ArrayRef<unsigned> VRegs) const override;
+
+  /// Lower a call to \p Callee passing \p OrigArgs and receiving \p OrigRet.
+  bool lowerCall(MachineIRBuilder &MIRBuilder, const MachineOperand &Callee,
+                 const ArgInfo &OrigRet,
+                 ArrayRef<ArgInfo> OrigArgs) const override;
+
+private:
+  // NOTE(review): RegHandler and MemHandler do not appear to be referenced by
+  // AArch64CallLowering.cpp -- confirm before removing.
+  typedef std::function<void(MachineIRBuilder &, Type *, unsigned,
+                             CCValAssign &)>
+      RegHandler;
+
+  typedef std::function<void(MachineIRBuilder &, int, CCValAssign &)>
+      MemHandler;
+
+  /// Callback given the vregs created for the pieces of a split argument and
+  /// the bit offset of each piece.
+  typedef std::function<void(ArrayRef<unsigned>, ArrayRef<uint64_t>)>
+      SplitArgTy;
+
+  /// Append to \p SplitArgs one ArgInfo per value type making up
+  /// \p OrigArgInfo, calling \p SplitArg when more than one piece is produced.
+  void splitToValueTypes(const ArgInfo &OrigArgInfo,
+                         SmallVectorImpl<ArgInfo> &SplitArgs,
+                         const DataLayout &DL, MachineRegisterInfo &MRI,
+                         SplitArgTy SplitArg) const;
+};
+} // End of namespace llvm;
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h
new file mode 100644
index 000000000000..bc44bc5f2461
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h
@@ -0,0 +1,139 @@
+//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the AArch64 Calling Convention
+// that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace {
+using namespace llvm;
+
+static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
+ AArch64::X3, AArch64::X4, AArch64::X5,
+ AArch64::X6, AArch64::X7}; // Candidates for i64 block members.
+static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
+ AArch64::H3, AArch64::H4, AArch64::H5,
+ AArch64::H6, AArch64::H7}; // Candidates for f16 block members.
+static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
+ AArch64::S3, AArch64::S4, AArch64::S5,
+ AArch64::S6, AArch64::S7}; // Candidates for f32 / 32-bit vector members.
+static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
+ AArch64::D3, AArch64::D4, AArch64::D5,
+ AArch64::D6, AArch64::D7}; // Candidates for f64 / 64-bit vector members.
+static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+ AArch64::Q3, AArch64::Q4, AArch64::Q5,
+ AArch64::Q6, AArch64::Q7}; // Candidates for f128 / 128-bit vector members.
+
+/// Give every entry of \p PendingMembers a stack slot of LocVT's size, record
+/// it in \p State, and empty the list. The first slot is aligned to at least
+/// \p SlotAlign; later slots only to the argument's own alignment, which is
+/// capped at the stack alignment. Always returns true.
+static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
+                             MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
+                             CCState &State, unsigned SlotAlign) {
+  const unsigned MemberBytes = LocVT.getSizeInBits() / 8;
+  const unsigned StackAlign =
+      State.getMachineFunction().getDataLayout().getStackAlignment();
+  const unsigned MemberAlign = std::min(ArgFlags.getOrigAlign(), StackAlign);
+
+  // Only the first member observes the caller-requested slot alignment.
+  unsigned MinAlign = SlotAlign;
+  for (CCValAssign &Member : PendingMembers) {
+    const unsigned Offset =
+        State.AllocateStack(MemberBytes, std::max(MemberAlign, MinAlign));
+    Member.convertToMem(Offset);
+    State.addLoc(Member);
+    MinAlign = 1;
+  }
+
+  // Every pending member now has a location.
+  PendingMembers.clear();
+  return true;
+}
+
+/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots,
+/// yet an [N x Ty] value still has to be laid out contiguously. Members are
+/// therefore queued until the last one of the block arrives, at which point
+/// the whole block is placed on the stack in one go.
+static bool CC_AArch64_Custom_Stack_Block(
+    unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+    ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  SmallVectorImpl<CCValAssign> &Pending = State.getPendingLocs();
+
+  // Queue this member; its location is decided once the block is complete.
+  Pending.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (ArgFlags.isInConsecutiveRegsLast())
+    return finishStackBlock(Pending, LocVT, ArgFlags, State, 8);
+
+  // More members of this block are still to come.
+  return true;
+}
+
+/// Assign an [N x Ty] block to a run of consecutive registers. When no such
+/// run is free, every register of the matching list is marked as used and the
+/// block goes on the stack instead. Returns false, without queuing anything,
+/// for location types that have no register list here.
+static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // Select the register list whose elements are the right size for a member.
+  ArrayRef<MCPhysReg> Candidates;
+  const auto Ty = LocVT.SimpleTy;
+  if (Ty == MVT::i64)
+    Candidates = XRegList;
+  else if (Ty == MVT::f16)
+    Candidates = HRegList;
+  else if (Ty == MVT::f32 || LocVT.is32BitVector())
+    Candidates = SRegList;
+  else if (Ty == MVT::f64 || LocVT.is64BitVector())
+    Candidates = DRegList;
+  else if (Ty == MVT::f128 || LocVT.is128BitVector())
+    Candidates = QRegList;
+  else
+    return false; // Not an array we want to split up after all.
+
+  // Queue this member; the block is placed when its last member arrives.
+  SmallVectorImpl<CCValAssign> &Pending = State.getPendingLocs();
+  Pending.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  // Preferred outcome: one register per member, in ascending order.
+  if (unsigned NextReg = State.AllocateRegBlock(Candidates, Pending.size())) {
+    for (CCValAssign &Member : Pending) {
+      Member.convertToReg(NextReg++);
+      State.addLoc(Member);
+    }
+    Pending.clear();
+    return true;
+  }
+
+  // No room: mark all regs in the class as unavailable.
+  for (MCPhysReg Reg : Candidates)
+    State.AllocateReg(Reg);
+
+  // The minimum alignment of the first stack slot depends on the platform.
+  const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+      State.getMachineFunction().getSubtarget());
+  const unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
+
+  return finishStackBlock(Pending, LocVT, ArgFlags, State, SlotAlign);
+}
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
new file mode 100644
index 000000000000..9058617768dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -0,0 +1,337 @@
+//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for AArch64 architecture.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match if the original alignment of the arg equals Align.
+class CCIfAlign<string Align, CCAction A> :
+ CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+/// CCIfBigEndian - Match only if the function's DataLayout is big endian.
+class CCIfBigEndian<CCAction A> :
+ CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_AArch64_AAPCS : CallingConv<[
+ CCIfType<[iPTR], CCBitConvertToType<i64>>,
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+ // Big endian vectors must be passed as if they were 1-element vectors so that
+ // their lanes are in a consistent order.
+ CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+ CCBitConvertToType<f64>>>,
+ CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+ CCBitConvertToType<f128>>>,
+
+ // An SRet is passed in X8, not X0 like a normal pointer parameter.
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+ // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+ // slot is 64-bit.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // The 'nest' parameter, if any, is passed in X18.
+ // Darwin uses X18 as the platform register and hence 'nest' isn't currently
+ // supported there.
+ CCIfNest<CCAssignToReg<[X18]>>,
+
+ // Pass SwiftSelf in X20, a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
+ // Handle integer, floating-point and vector types by passing in registers,
+ // up to eight each of GPR and FPR.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ // i128 is split to two i64s; a split i64 only goes in X0, X2, X4 or X6.
+ CCIfType<[i64], CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6],
+ [X0, X1, X3, X5]>>>,
+
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+ // If more than will fit in registers, pass them on the stack instead.
+ CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
+ CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToStack<16, 16>>
+]>;
+
+def RetCC_AArch64_AAPCS : CallingConv<[
+ CCIfType<[iPTR], CCBitConvertToType<i64>>,
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+ CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+
+ // Big endian vectors must be returned as if they were 1-element vectors so
+ // that their lanes are in a consistent order.
+ CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+ CCBitConvertToType<f64>>>,
+ CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+ CCBitConvertToType<f128>>>,
+
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+
+// Darwin uses a calling convention which differs in only two ways
+// from the standard one at this level:
+// + i128s (i.e. split i64s) don't need even registers.
+// + Stack slots are sized as needed rather than being at least 64-bit.
+def CC_AArch64_DarwinPCS : CallingConv<[
+ CCIfType<[iPTR], CCBitConvertToType<i64>>,
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ // An SRet is passed in X8, not X0 like a normal pointer parameter.
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+ // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+ // slot is 64-bit.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Pass SwiftSelf in X20, a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+
+ // A SwiftError is passed in X19.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
+ // Handle integer, floating-point and vector types by passing in registers,
+ // up to eight each of GPR and FPR.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ // i128 is split to two i64s; a split i64 may use X0-X6 but never X7.
+ CCIfType<[i64],
+ CCIfSplit<CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6],
+ [W0, W1, W2, W3, W4, W5, W6]>>>,
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+ // If more than will fit in registers, pass them on the stack instead, in
+ // slots sized by the value type.
+ CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
+ CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToStack<16, 16>>
+]>;
+
+def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
+ CCIfType<[iPTR], CCBitConvertToType<i64>>,
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Stack_Block">>,
+
+ // Handle all scalar types as either i64 or f64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+ CCIfType<[f16, f32], CCPromoteToType<f64>>,
+
+ // Everything is on the stack; no rule in this list assigns a register.
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToStack<16, 16>>
+]>;
+
+// The WebKit_JS calling convention only passes the first argument (the callee)
+// in a register and the remaining arguments on the stack. We allow 32-bit stack
+// slots, so that WebKit can write partial values in the stack and define the
+// other 32-bit quantity as undef.
+def CC_AArch64_WebKit_JS : CallingConv<[
+ // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
+
+ // Pass the remaining arguments on the stack instead: 4-byte slots for
+ // i32/f32 and 8-byte slots for i64/f64.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_AArch64_WebKit_JS : CallingConv<[ // i32/i64 in GPRs, f32/f64 in FPRs.
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM64 Calling Convention for GHC
+//===----------------------------------------------------------------------===//
+
+// This calling convention is specific to the Glasgow Haskell Compiler.
+// The only documentation is the GHC source code, specifically the C header
+// file:
+//
+// https://github.com/ghc/ghc/blob/master/includes/stg/MachRegs.h
+//
+// which defines the registers for the Spineless Tagless G-Machine (STG) that
+// GHC uses to implement lazy evaluation. The generic STG machine has a set of
+// registers which are mapped to appropriate set of architecture specific
+// registers for each CPU architecture.
+//
+// The STG Machine is documented here:
+//
+// https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/GeneratedCode
+//
+// The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI
+// register mapping".
+
+def CC_AArch64_GHC : CallingConv<[
+ CCIfType<[iPTR], CCBitConvertToType<i64>>,
+
+ // Handle all vector types (and f128) as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+ CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>,
+ CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>,
+
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+ CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
+]>;
+
+// FIXME: LR is only callee-saved in the sense that *we* preserve it and are
+// presumably a callee to someone. External functions may not do so, but this
+// is currently safe since BL has LR as an implicit-def and what happens after a
+// tail call doesn't matter.
+//
+// It would be better to model its preservation semantics properly (create a
+// vreg on entry, use it in RET & tail call generation; make that vreg def if we
+// end up saving LR as part of a call frame). Watch this space...
+def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+ X23, X24, X25, X26, X27, X28,
+ D8, D9, D10, D11,
+ D12, D13, D14, D15)>; // FP regs are listed as D (64-bit) registers only.
+
+// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI. Since
+// 'this' and the pointer return value are both passed in X0 in these cases,
+// this can be partially modelled by treating X0 as a callee-saved register.
+// Only the resulting RegMask is used; the SaveList is ignored.
+//
+// (For generic ARM 64-bit ABI code, clang will not generate constructors or
+// destructors with 'this' returns, so this RegMask will not be used in that
+// case.)
+def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
+
+def CSR_AArch64_AAPCS_SwiftError // The AAPCS set with X19 taken out.
+ : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X19)>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// guarantees more than a normal AAPCS function. X16 and X17 are used on the
+// fast path for calculation, but other registers except X0 (argument/return)
+// and LR (it is a call, after all) are preserved.
+def CSR_AArch64_TLS_Darwin
+ : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
+ FP,
+ (sequence "Q%u", 0, 31))>;
+
+// We can only handle a register pair with adjacent registers, and the pair
+// should belong to the same class as well. Since the access function on the
+// fast path calls a function that follows CSR_AArch64_TLS_Darwin,
+// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin.
+def CSR_AArch64_CXX_TLS_Darwin
+ : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+ (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
+ (sequence "D%u", 0, 31))>;
+
+// CSRs that are handled by the prologue and epilogue.
+def CSR_AArch64_CXX_TLS_Darwin_PE
+ : CalleeSavedRegs<(add LR, FP)>;
+
+// CSRs that are handled explicitly via copies: all but LR and FP.
+def CSR_AArch64_CXX_TLS_Darwin_ViaCopy
+ : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>;
+
+// The ELF stub used for TLS-descriptor access saves every feasible
+// register. Only X0 and LR are clobbered, so neither appears in this list.
+def CSR_AArch64_TLS_ELF
+ : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP,
+ (sequence "Q%u", 0, 31))>;
+
+def CSR_AArch64_AllRegs // W/X GPR views, SP/WSP, and B/H/S/D/Q FP views.
+ : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP,
+ (sequence "X%u", 0, 28), FP, LR, SP,
+ (sequence "B%u", 0, 31), (sequence "H%u", 0, 31),
+ (sequence "S%u", 0, 31), (sequence "D%u", 0, 31),
+ (sequence "Q%u", 0, 31))>;
+
+def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>; // Empty list.
+
+def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+ (sequence "X%u", 9, 15))>; // The AAPCS set plus X9-X15.
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
new file mode 100644
index 000000000000..6f8dd3e3ac0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -0,0 +1,151 @@
+//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Local-dynamic access to thread-local variables proceeds in three stages.
+//
+// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated
+// in much the same way as a general-dynamic TLS-descriptor access against
+// the special symbol _TLS_MODULE_BASE_.
+// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using
+// instructions with "dtprel" modifiers.
+// 3. These two are added, together with TPIDR_EL0, to obtain the variable's
+// true address.
+//
+// This is only better than general-dynamic access to the variable if two or
+// more of the first stage TLS-descriptor calculations can be combined. This
+// pass looks through a function and performs such combinations.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+#define TLSCLEANUP_PASS_NAME "AArch64 Local Dynamic TLS Access Clean-up"
+
+namespace {
+/// Machine-function pass that makes every local-dynamic TLS access of a
+/// function reuse the first _TLS_MODULE_BASE_ descriptor computation that
+/// dominates it, instead of repeating the TLSDESC call sequence.
+struct LDTLSCleanup : public MachineFunctionPass {
+  static char ID;
+  LDTLSCleanup() : MachineFunctionPass(ID) {
+    initializeLDTLSCleanupPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Run the clean-up on \p MF.
+  /// \return true if any instruction was inserted or removed.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(*MF.getFunction()))
+      return false;
+
+    // With fewer than two local-dynamic accesses there is nothing to share.
+    AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+    if (FuncInfo->getNumLocalDynamicTLSAccesses() < 2)
+      return false;
+
+    MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
+    return VisitNode(DomTree.getRootNode(), 0);
+  }
+
+  /// Walk the dominator subtree rooted at \p Node in pre-order.
+  ///
+  /// \param TLSBaseAddrReg register already holding the TLS base address, or
+  /// 0 if none has been created yet. While it is 0, the first matching
+  /// instruction encountered creates the register; every later match on the
+  /// same dominator path is replaced by a copy from it.
+  /// \return true if the subtree was modified.
+  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+    MachineBasicBlock *MBB = Node->getBlock();
+    bool Modified = false;
+
+    // Scan the instructions of this block.
+    for (MachineBasicBlock::iterator MII = MBB->begin(), MIE = MBB->end();
+         MII != MIE; ++MII) {
+      if (MII->getOpcode() != AArch64::TLSDESC_CALLSEQ)
+        continue;
+
+      // Only calls against the module-base symbol are local-dynamic accesses.
+      if (!MII->getOperand(0).isSymbol() ||
+          strcmp(MII->getOperand(0).getSymbolName(), "_TLS_MODULE_BASE_"))
+        continue;
+
+      // Both helpers return the instruction from which the scan resumes.
+      if (TLSBaseAddrReg)
+        MII = replaceTLSBaseAddrCall(*MII, TLSBaseAddrReg);
+      else
+        MII = setRegister(*MII, &TLSBaseAddrReg);
+      Modified = true;
+    }
+
+    // Recurse into the blocks immediately dominated by this one.
+    for (MachineDomTreeNode *Child : *Node)
+      Modified |= VisitNode(Child, TLSBaseAddrReg);
+
+    return Modified;
+  }
+
+  /// Replace the TLS base-address computation \p I by a copy of
+  /// \p TLSBaseAddrReg into X0 and erase \p I.
+  /// \return the newly inserted copy.
+  MachineInstr *replaceTLSBaseAddrCall(MachineInstr &I,
+                                       unsigned TLSBaseAddrReg) {
+    MachineBasicBlock &MBB = *I.getParent();
+    const TargetInstrInfo *TII =
+        MBB.getParent()->getSubtarget().getInstrInfo();
+
+    // The code following the call sequence expects the address in x0.
+    MachineInstr *CopyToX0 = BuildMI(MBB, I, I.getDebugLoc(),
+                                     TII->get(TargetOpcode::COPY), AArch64::X0)
+                                 .addReg(TLSBaseAddrReg);
+
+    // The original computation is now redundant.
+    I.eraseFromParent();
+
+    return CopyToX0;
+  }
+
+  /// Create a virtual register, store it in \p *TLSBaseAddrReg, and fill it
+  /// with a copy of X0 inserted right after \p I.
+  /// \return the newly inserted copy.
+  MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
+    MachineBasicBlock &MBB = *I.getParent();
+    MachineFunction *MF = MBB.getParent();
+    const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+    // Fresh GPR64 virtual register that will carry the TLS base address.
+    *TLSBaseAddrReg =
+        MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+
+    // Save X0 into that register for the later accesses.
+    MachineInstr *CopyFromX0 =
+        BuildMI(MBB, ++I.getIterator(), I.getDebugLoc(),
+                TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+            .addReg(AArch64::X0);
+
+    return CopyFromX0;
+  }
+
+  StringRef getPassName() const override { return TLSCLEANUP_PASS_NAME; }
+
+  /// The pass keeps the CFG intact and needs the machine dominator tree.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+}
+
+// Pass registration data: command-line name
+// "aarch64-local-dynamic-tls-cleanup" and the human-readable pass name.
+// NOTE(review): presumably this macro is what provides the
+// initializeLDTLSCleanupPass function called by the constructor - confirm.
+INITIALIZE_PASS(LDTLSCleanup, "aarch64-local-dynamic-tls-cleanup",
+ TLSCLEANUP_PASS_NAME, false, false)
+
+char LDTLSCleanup::ID = 0;
+/// Factory for the clean-up pass: returns a newly allocated LDTLSCleanup.
+FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() {
+ return new LDTLSCleanup();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
new file mode 100644
index 000000000000..7666011f75b6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -0,0 +1,1108 @@
+//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that collects the Linker Optimization Hints (LOH).
+// This pass should be run at the very end of the compilation flow, just before
+// assembly printer.
+// To be useful for the linker, the LOH must be printed into the assembly file.
+//
+// A LOH describes a sequence of instructions that may be optimized by the
+// linker.
+// This same sequence cannot be optimized by the compiler because some of
+// the information will be known at link time.
+// For instance, consider the following sequence:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// L3: ldr xC, [xB, #imm]
+// This sequence can be turned into:
+// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB:
+// L3: ldr xC, sym+#imm
+// It may also be turned into either of the following more efficient
+// code sequences:
+// - If sym@PAGEOFF + #imm fits the encoding space of L3.
+// L1: adrp xA, sym@PAGE
+// L3: ldr xC, [xB, sym@PAGEOFF + #imm]
+// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB:
+// L1: adr xA, sym
+// L3: ldr xC, [xB, #imm]
+//
+// To be valid a LOH must meet all the requirements needed by all the related
+// possible linker transformations.
+// For instance, using the running example, the constraints to emit
+// ".loh AdrpAddLdr" are:
+// - L1, L2, and L3 instructions are of the expected type, i.e.,
+// respectively ADRP, ADD (immediate), and LD.
+// - The result of L1 is used only by L2.
+// - The register argument (xA) used in the ADD instruction is defined
+// only by L1.
+// - The result of L2 is used only by L3.
+// - The base address (xB) in L3 is defined only by L2.
+// - The ADRP in L1 and the ADD in L2 must reference the same symbol using
+// @PAGE/@PAGEOFF with no additional constants
+//
+// Currently supported LOHs are:
+// * So called non-ADRP-related:
+// - .loh AdrpAddLdr L1, L2, L3:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// L3: ldr xC, [xB, #imm]
+// - .loh AdrpLdrGotLdr L1, L2, L3:
+// L1: adrp xA, sym@GOTPAGE
+// L2: ldr xB, [xA, sym@GOTPAGEOFF]
+// L3: ldr xC, [xB, #imm]
+// - .loh AdrpLdr L1, L3:
+// L1: adrp xA, sym@PAGE
+// L3: ldr xC, [xA, sym@PAGEOFF]
+// - .loh AdrpAddStr L1, L2, L3:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// L3: str xC, [xB, #imm]
+// - .loh AdrpLdrGotStr L1, L2, L3:
+// L1: adrp xA, sym@GOTPAGE
+// L2: ldr xB, [xA, sym@GOTPAGEOFF]
+// L3: str xC, [xB, #imm]
+// - .loh AdrpAdd L1, L2:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// For all these LOHs, L1, L2, L3 form a simple chain:
+// L1 result is used only by L2 and L2 result by L3.
+// L3 LOH-related argument is defined only by L2 and L2 LOH-related argument
+// by L1.
+// All these LOHs aim at using more efficient load/store patterns by folding
+// some instructions used to compute the address directly into the load/store.
+//
+// * So called ADRP-related:
+// - .loh AdrpAdrp L2, L1:
+// L2: ADRP xA, sym1@PAGE
+// L1: ADRP xA, sym2@PAGE
+// L2 dominates L1 and xA is not redefined between L2 and L1
+// This LOH aims at getting rid of redundant ADRP instructions.
+//
+// The overall design for emitting the LOHs is:
+// 1. AArch64CollectLOH (this pass) records the LOHs in the AArch64FunctionInfo.
+// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and it:
+// 1. Associates them a label.
+// 2. Emits them in a MCStreamer (EmitLOHDirective).
+// - The MCMachOStreamer records them into the MCAssembler.
+// - The MCAsmStreamer prints them.
+// - Other MCStreamers ignore them.
+// 3. Closes the MCStreamer:
+// - The MachObjectWriter gets them from the MCAssembler and writes
+// them in the object file.
+// - Other ObjectWriters ignore them.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-collect-loh"
+
+// Hidden boolean command-line option, enabled by default.
+static cl::opt<bool>
+PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden,
+ cl::desc("Restrict analysis to registers invovled"
+ " in LOHs"),
+ cl::init(true));
+
+// Hidden boolean command-line option, enabled by default.
+static cl::opt<bool>
+BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden,
+ cl::desc("Restrict analysis at basic block scope"),
+ cl::init(true));
+
+// Counters of this pass. The ones between "#ifndef NDEBUG" and "#endif" are
+// only declared when NDEBUG is not defined; the others are always declared.
+STATISTIC(NumADRPSimpleCandidate,
+ "Number of simplifiable ADRP dominate by another");
+#ifndef NDEBUG
+STATISTIC(NumADRPComplexCandidate2,
+ "Number of simplifiable ADRP reachable by 2 defs");
+STATISTIC(NumADRPComplexCandidate3,
+ "Number of simplifiable ADRP reachable by 3 defs");
+STATISTIC(NumADRPComplexCandidateOther,
+ "Number of simplifiable ADRP reachable by 4 or more defs");
+STATISTIC(NumADDToSTRWithImm,
+ "Number of simplifiable STR with imm reachable by ADD");
+STATISTIC(NumLDRToSTRWithImm,
+ "Number of simplifiable STR with imm reachable by LDR");
+STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD");
+STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR");
+STATISTIC(NumADDToLDRWithImm,
+ "Number of simplifiable LDR with imm reachable by ADD");
+STATISTIC(NumLDRToLDRWithImm,
+ "Number of simplifiable LDR with imm reachable by LDR");
+STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD");
+STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR");
+#endif // NDEBUG
+STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP");
+#ifndef NDEBUG
+STATISTIC(NumCplxLvl1, "Number of complex case of level 1");
+STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1");
+STATISTIC(NumCplxLvl2, "Number of complex case of level 2");
+STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2");
+#endif // NDEBUG
+STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD");
+STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD");
+
+#define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)"
+
+namespace {
+/// Machine-function pass that collects Linker Optimization Hints. It
+/// requires the machine dominator tree and declares that it preserves all
+/// analyses.
+struct AArch64CollectLOH : public MachineFunctionPass {
+  static char ID;
+
+  AArch64CollectLOH() : MachineFunctionPass(ID) {
+    PassRegistry &Registry = *PassRegistry::getPassRegistry();
+    initializeAArch64CollectLOHPass(Registry);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  /// The function must carry the NoVRegs property when this pass runs.
+  MachineFunctionProperties getRequiredProperties() const override {
+    MachineFunctionProperties Required;
+    Required.set(MachineFunctionProperties::Property::NoVRegs);
+    return Required;
+  }
+
+  StringRef getPassName() const override { return AARCH64_COLLECT_LOH_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineDominatorTree>();
+  }
+};
+
+/// Insertion-ordered set of machine instructions.
+using SetOfMachineInstr = SetVector<const MachineInstr *>;
+
+/// Per basic block, an array holding one instruction set per register.
+/// Used for the exposed uses of a block, register by register.
+using BlockToSetOfInstrsPerColor =
+    MapVector<const MachineBasicBlock *, std::unique_ptr<SetOfMachineInstr[]>>;
+
+/// Per basic block, an array holding one instruction per register.
+/// Used for the live-out definition of a block, register by register.
+using BlockToInstrPerColor =
+    MapVector<const MachineBasicBlock *,
+              std::unique_ptr<const MachineInstr *[]>>;
+
+/// Instruction to set-of-instructions map: either a definition with the uses
+/// it reaches, or a use with its definitions.
+using InstrToInstrs = MapVector<const MachineInstr *, SetOfMachineInstr>;
+
+/// Per basic block, the registers it kills, as a BitVector.
+using BlockToRegSet = MapVector<const MachineBasicBlock *, BitVector>;
+
+/// Register number to dense id.
+using MapRegToId = DenseMap<unsigned, unsigned>;
+
+/// Dense id back to register number. Used for debug purposes.
+using MapIdToReg = SmallVector<unsigned, 32>;
+} // end anonymous namespace.
+
+// Storage for the static pass identifier declared in the struct.
+char AArch64CollectLOH::ID = 0;
+
+// Pass registration data: command-line name "aarch64-collect-loh", the
+// human-readable name, and a declared dependency on MachineDominatorTree.
+INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh",
+ AARCH64_COLLECT_LOH_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh",
+ AARCH64_COLLECT_LOH_NAME, false, false)
+
+/// Return the instruction set stored in \p sets for the pair (MBB, reg).
+/// When \p MBB has no entry yet, an array of \p nbRegs empty sets is
+/// allocated for it first, so the returned set is then empty.
+/// \param nbRegs size of the per-block array allocated on first access. It
+/// must be consistent with the way sets is used.
+static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
+                                 const MachineBasicBlock &MBB, unsigned reg,
+                                 unsigned nbRegs) {
+  BlockToSetOfInstrsPerColor::iterator Pos = sets.find(&MBB);
+  if (Pos != sets.end())
+    return Pos->second.get()[reg];
+
+  // First request for this block: create its per-register array.
+  SetOfMachineInstr *PerReg =
+      (sets[&MBB] = make_unique<SetOfMachineInstr[]>(nbRegs)).get();
+  return PerReg[reg];
+}
+
+/// Return the set of uses recorded in \p sets for the definition \p MI of
+/// register id \p reg, i.e., MI defines reg.
+/// If nothing was recorded for that pair yet, an empty set is created in
+/// "sets[reg]" and returned.
+/// \pre set[reg] is valid.
+static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg,
+                                  const MachineInstr &MI) {
+  InstrToInstrs &DefToUses = sets[reg];
+  return DefToUses[&MI];
+}
+
+/// Read-only lookup of the uses recorded for the pair (reg, MI); the map
+/// \p sets is never modified.
+/// \return NULL if the couple (reg, MI) is not in sets.
+static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg,
+                                        const MachineInstr &MI) {
+  const InstrToInstrs &DefToUses = sets[reg];
+  InstrToInstrs::const_iterator Pos = DefToUses.find(&MI);
+  return Pos == DefToUses.end() ? nullptr : &Pos->second;
+}
+
+/// Initialize the reaching definition algorithm:
+/// For each basic block BB in MF, record:
+/// - its kill set.
+/// - its reachable uses (uses that are exposed to BB's predecessors).
+/// - its generated definitions.
+/// Uses that follow a definition inside the same block are recorded directly
+/// in \p ColorOpToReachedUses.
+/// \param ColorOpToReachedUses array indexed by dense register id; receives
+/// the def -> uses pairs found within a single block.
+/// \param[out] Gen per block, the last definition seen for each register id.
+/// \param[out] Kill per block, the register ids defined or clobbered.
+/// \param[out] ReachableUses per block, the uses seen before any definition
+/// or clobber of the register in that block.
+/// \param RegToId maps the registers of interest to dense ids; operands whose
+/// register is not in the map are ignored.
+/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to
+/// the list of uses of exposed definitions.
+/// \param ADRPMode specifies to only consider ADRP instructions for generated
+/// definition. It also considers definitions of ADRP instructions as uses and
+/// ignores other uses. The ADRPMode is used to collect the information for
+/// LOHs that involve ADRP operations only.
+static void initReachingDef(const MachineFunction &MF,
+ InstrToInstrs *ColorOpToReachedUses,
+ BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+ BlockToSetOfInstrsPerColor &ReachableUses,
+ const MapRegToId &RegToId,
+ const MachineInstr *DummyOp, bool ADRPMode) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ unsigned NbReg = RegToId.size();
+
+ for (const MachineBasicBlock &MBB : MF) {
+ // Start every block with no generated definition for any register id.
+ auto &BBGen = Gen[&MBB];
+ BBGen = make_unique<const MachineInstr *[]>(NbReg);
+ std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr);
+
+ BitVector &BBKillSet = Kill[&MBB];
+ BBKillSet.resize(NbReg);
+ for (const MachineInstr &MI : MBB) {
+ bool IsADRP = MI.getOpcode() == AArch64::ADRP;
+
+ // Process uses first.
+ if (IsADRP || !ADRPMode)
+ for (const MachineOperand &MO : MI.operands()) {
+ // Treat ADRP def as use, as the goal of the analysis is to find
+ // ADRP defs reached by other ADRP defs.
+ if (!MO.isReg() || (!ADRPMode && !MO.isUse()) ||
+ (ADRPMode && (!IsADRP || !MO.isDef())))
+ continue;
+ unsigned CurReg = MO.getReg();
+ MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+ if (ItCurRegId == RegToId.end())
+ continue;
+ // From here on CurReg is the dense id, not the register number.
+ CurReg = ItCurRegId->second;
+
+ // if CurReg has not been defined, this use is reachable.
+ if (!BBGen[CurReg] && !BBKillSet.test(CurReg))
+ getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI);
+ // current basic block definition for this color, if any, is in Gen.
+ if (BBGen[CurReg])
+ getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI);
+ }
+
+ // Process clobbers.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isRegMask())
+ continue;
+ // Clobbers kill the related colors.
+ const uint32_t *PreservedRegs = MO.getRegMask();
+
+ // Set generated regs.
+ for (const auto &Entry : RegToId) {
+ unsigned Reg = Entry.second;
+ // Use the global register ID when querying APIs external to this
+ // pass.
+ if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) {
+ // Do not register clobbered definition for no ADRP.
+ // This definition is not used anyway (otherwise register
+ // allocation is wrong).
+ BBGen[Reg] = ADRPMode ? &MI : nullptr;
+ BBKillSet.set(Reg);
+ }
+ }
+ }
+
+ // Process register defs.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned CurReg = MO.getReg();
+ MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+ if (ItCurRegId == RegToId.end())
+ continue;
+
+ // A definition of CurReg also redefines every recorded alias of it.
+ for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) {
+ MapRegToId::const_iterator ItRegId = RegToId.find(*AI);
+ // If this alias has not been recorded, then it is not interesting
+ // for the current analysis.
+ // We can end up in this situation because of tuple registers.
+ // E.g., Let say we are interested in S1. When we register
+ // S1, we will also register its aliases and in particular
+ // the tuple Q1_Q2.
+ // Now, when we encounter Q1_Q2, we will look through its aliases
+ // and will find that S2 is not registered.
+ if (ItRegId == RegToId.end())
+ continue;
+
+ BBKillSet.set(ItRegId->second);
+ BBGen[ItRegId->second] = &MI;
+ }
+ // Record MI as the latest definition of the defined register itself.
+ BBGen[ItCurRegId->second] = &MI;
+ }
+ }
+
+ // If we restrict our analysis to basic block scope, conservatively add a
+ // dummy use for each generated value. This is skipped in ADRPMode and for
+ // blocks without successors.
+ if (!ADRPMode && DummyOp && !MBB.succ_empty())
+ for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg)
+ if (BBGen[CurReg])
+ getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp);
+ }
+}
+
+/// Core of the reaching definition analysis, iterated to a fixed point:
+///   repeat until no Out set grows
+///     for each bb
+///       for each color
+///         In[bb][color] = U Out[bb.predecessors][color]
+///         add reachableUses[bb][color] to the reached uses of every
+///         definition in In[bb][color]
+///         Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+static void reachingDefAlgorithm(const MachineFunction &MF,
+                                 InstrToInstrs *ColorOpToReachedUses,
+                                 BlockToSetOfInstrsPerColor &In,
+                                 BlockToSetOfInstrsPerColor &Out,
+                                 BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+                                 BlockToSetOfInstrsPerColor &ReachableUses,
+                                 unsigned NbReg) {
+  bool Converged = false;
+  while (!Converged) {
+    Converged = true;
+    for (const MachineBasicBlock &MBB : MF) {
+      for (unsigned Color = 0; Color != NbReg; ++Color) {
+        SetOfMachineInstr &InDefs = getSet(In, MBB, Color, NbReg);
+        SetOfMachineInstr &ExposedUses =
+            getSet(ReachableUses, MBB, Color, NbReg);
+        SetOfMachineInstr &OutDefs = getSet(Out, MBB, Color, NbReg);
+        const unsigned OldOutSize = OutDefs.size();
+
+        // Merge what flows out of every predecessor into this block's In.
+        for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+          SetOfMachineInstr &PredOut = getSet(Out, *Pred, Color, NbReg);
+          InDefs.insert(PredOut.begin(), PredOut.end());
+        }
+
+        // Every incoming definition reaches the uses exposed by this block.
+        for (const MachineInstr *Def : InDefs)
+          getUses(ColorOpToReachedUses, Color, *Def)
+              .insert(ExposedUses.begin(), ExposedUses.end());
+
+        // Incoming definitions survive only if the block does not kill the
+        // color; the block's own definition, if any, always flows out.
+        if (!Kill[&MBB].test(Color))
+          OutDefs.insert(InDefs.begin(), InDefs.end());
+        if (Gen[&MBB][Color])
+          OutDefs.insert(Gen[&MBB][Color]);
+
+        // Sets only grow, so a size change means another round is needed.
+        if (OutDefs.size() != OldOutSize)
+          Converged = false;
+      }
+    }
+  }
+}
+
+/// Driver of the reaching definition analysis.
+/// \param MF function on which the algorithm will operate.
+/// \param[out] ColorOpToReachedUses will contain the result of the reaching
+/// def algorithm.
+/// \param RegToId maps the analysed registers to dense ids; its size gives
+/// the number of colors.
+/// \param ADRPMode specify whether the reaching def algorithm should be tuned
+/// for ADRP optimization. \see initReachingDef for more details.
+/// \param DummyOp if not NULL, the algorithm will work at
+/// basic block scope and will set for every exposed definition a use to
+/// @p DummyOp.
+/// \pre ColorOpToReachedUses is an array of at least number of registers of
+/// InstrToInstrs.
+static void reachingDef(const MachineFunction &MF,
+                        InstrToInstrs *ColorOpToReachedUses,
+                        const MapRegToId &RegToId, bool ADRPMode = false,
+                        const MachineInstr *DummyOp = nullptr) {
+  // Per-block working data:
+  //   LiveOut:      per color, the definitions reaching the end of the block.
+  //   LiveIn:       per color, the definitions reaching its beginning.
+  //   ExposedUses:  per color, the uses reachable by the "LiveIn"
+  //                 definitions.
+  //   GenDefs:      the definition generated by the block, one per color.
+  //   KillSets:     the colors killed by the block.
+  BlockToSetOfInstrsPerColor LiveOut, LiveIn, ExposedUses;
+  BlockToInstrPerColor GenDefs;
+  BlockToRegSet KillSets;
+
+  // Local (per-block) information first.
+  initReachingDef(MF, ColorOpToReachedUses, GenDefs, KillSets, ExposedUses,
+                  RegToId, DummyOp, ADRPMode);
+
+  // With a dummy operation the analysis stays at basic block scope, so the
+  // inter-block propagation is skipped.
+  if (DummyOp)
+    return;
+
+  reachingDefAlgorithm(MF, ColorOpToReachedUses, LiveIn, LiveOut, GenDefs,
+                       KillSets, ExposedUses, RegToId.size());
+}
+
+#ifndef NDEBUG
+/// Debug dump of the reaching definition result: for every register id that
+/// has at least one recorded definition, print each definition followed by
+/// the uses it reaches.
+static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses,
+                             unsigned NbReg, const TargetRegisterInfo *TRI,
+                             const MapIdToReg &IdToReg) {
+  for (unsigned Color = 0; Color != NbReg; ++Color) {
+    const InstrToInstrs &DefToUses = ColorOpToReachedUses[Color];
+    if (DefToUses.empty())
+      continue;
+    DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[Color], TRI) << " ***\n");
+
+    for (const auto &DefAndUses : DefToUses) {
+      DEBUG(dbgs() << "Def:\n");
+      DEBUG(DefAndUses.first->print(dbgs()));
+      DEBUG(dbgs() << "Reachable uses:\n");
+      for (const MachineInstr *UseMI : DefAndUses.second)
+        DEBUG(UseMI->print(dbgs()));
+    }
+  }
+}
+#endif // NDEBUG
+
+/// Tell whether \p Def may be one of the defining instructions of a LOH.
+/// Accepted are ADRP, ADDXri whose operand 2 is a global address, jump table
+/// index, constant pool index or block address, and LDRXui whose operand 2
+/// is a global address.
+static bool canDefBePartOfLOH(const MachineInstr *Def) {
+  switch (Def->getOpcode()) {
+  case AArch64::ADRP:
+    return true;
+  case AArch64::ADDXri: {
+    // The "immediate" has to be an address.
+    const auto Kind = Def->getOperand(2).getType();
+    return Kind == MachineOperand::MO_GlobalAddress ||
+           Kind == MachineOperand::MO_JumpTableIndex ||
+           Kind == MachineOperand::MO_ConstantPoolIndex ||
+           Kind == MachineOperand::MO_BlockAddress;
+  }
+  case AArch64::LDRXui:
+    // The "immediate" has to be a global address.
+    return Def->getOperand(2).getType() == MachineOperand::MO_GlobalAddress;
+  default:
+    break;
+  }
+  return false;
+}
+
+/// Check whether the given instruction can be the end of a LOH chain
+/// involving a store.
+static bool isCandidateStore(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  case AArch64::STRBBui:
+  case AArch64::STRHHui:
+  case AArch64::STRBui:
+  case AArch64::STRHui:
+  case AArch64::STRWui:
+  case AArch64::STRXui:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+    break;
+  default:
+    return false;
+  }
+  // "str xA, [xA, #imm]" uses xA twice, as stored value and as base. Folding
+  // would risk storing a wrong xA, even if #imm == 0, so such stores are
+  // rejected.
+  return Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg();
+}
+
+/// Invert the def -> reached uses result held in ColorOpToReachedUses into a
+/// use -> reaching defs map, and drop the obvious non-LOH candidates.
+/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions.
+/// In non-ADRPMode, non-LOH candidates are "uses" with several definitions,
+/// i.e., no simple chain, "uses" whose definition cannot be part of a LOH,
+/// and candidate stores that do not use the defined register as base.
+/// Rejected uses stay in \p UseToReachingDefs with an empty set.
+/// \param ADRPMode -- \see initReachingDef.
+static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs,
+                              const InstrToInstrs *ColorOpToReachedUses,
+                              const MapRegToId &RegToId,
+                              bool ADRPMode = false) {
+  SetOfMachineInstr Rejected;
+  const unsigned NumColors = RegToId.size();
+  const MapRegToId::const_iterator NoId = RegToId.end();
+
+  for (unsigned Color = 0; Color != NumColors; ++Color) {
+    const InstrToInstrs &DefToUses = ColorOpToReachedUses[Color];
+    // Nothing to do for a color that is never defined.
+    if (DefToUses.empty())
+      continue;
+
+    for (const auto &DefAndUses : DefToUses) {
+      const MachineInstr *Def = DefAndUses.first;
+      for (const MachineInstr *UseMI : DefAndUses.second) {
+        bool Discard;
+        if (ADRPMode) {
+          // Only ADRP definitions are of interest in this mode.
+          Discard = Def->getOpcode() != AArch64::ADRP;
+        } else {
+          Discard = !canDefBePartOfLOH(Def);
+          // A store is a LOH candidate iff the end of the chain is used as
+          // its base.
+          if (!Discard && isCandidateStore(UseMI)) {
+            MapRegToId::const_iterator BaseId =
+                RegToId.find(UseMI->getOperand(1).getReg());
+            Discard = BaseId == NoId || BaseId->second != Color;
+          }
+        }
+        if (Discard) {
+          Rejected.insert(UseMI);
+          continue;
+        }
+
+        // An ADRP reaching itself is not a simplifiable case.
+        if (ADRPMode && UseMI == Def)
+          continue;
+
+        SetOfMachineInstr &Defs = UseToReachingDefs[UseMI];
+        Defs.insert(Def);
+        // Outside ADRPMode a use with several reaching definitions cannot be
+        // simplified.
+        if (!ADRPMode && Defs.size() > 1)
+          Rejected.insert(UseMI);
+      }
+    }
+  }
+
+  for (const MachineInstr *Elem : Rejected) {
+    DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n");
+    // The entry is emptied rather than removed from the map, so later
+    // analyses have to skip entries whose set is empty.
+    UseToReachingDefs[Elem].clear();
+  }
+}
+
+/// From the use -> defs information computed in ADRPMode, record an AdrpAdrp
+/// LOH for every ADRP that is reached by exactly one other ADRP which also
+/// dominates it.
+static void computeADRP(const InstrToInstrs &UseToDefs,
+                        AArch64FunctionInfo &AArch64FI,
+                        const MachineDominatorTree *MDT) {
+  DEBUG(dbgs() << "*** Compute LOH for ADRP\n");
+  for (const auto &Entry : UseToDefs) {
+    const unsigned NumDefs = Entry.second.size();
+
+    // Emptied entries are leftovers of the candidate filtering.
+    if (NumDefs == 0)
+      continue;
+
+    // Several reaching definitions: only counted, never recorded.
+    if (NumDefs != 1) {
+#ifndef NDEBUG
+      if (NumDefs == 2)
+        ++NumADRPComplexCandidate2;
+      else if (NumDefs == 3)
+        ++NumADRPComplexCandidate3;
+      else
+        ++NumADRPComplexCandidateOther;
+#endif
+      continue;
+    }
+
+    const MachineInstr *L1 = Entry.first;
+    const MachineInstr *L2 = *Entry.second.begin();
+    if (!MDT->dominates(L2, L1)) {
+      DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1
+                   << '\n');
+      continue;
+    }
+    DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n');
+    AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1});
+    ++NumADRPSimpleCandidate;
+  }
+}
+
+/// Check whether the given instruction can be the end of a LOH chain
+/// involving a load: one of the listed unsigned-immediate loads whose
+/// operand 2 does not carry the MO_GOT target flag.
+static bool isCandidateLoad(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  case AArch64::LDRSBWui:
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHWui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::LDRBui:
+  case AArch64::LDRHui:
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    break;
+  default:
+    return false;
+  }
+  return !(Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT);
+}
+
+/// Check whether the given instruction can load a literal.
+static bool supportLoadFromLiteral(const MachineInstr *Instr) {
+  const unsigned Opc = Instr->getOpcode();
+  return Opc == AArch64::LDRSWui || Opc == AArch64::LDRWui ||
+         Opc == AArch64::LDRXui || Opc == AArch64::LDRSui ||
+         Opc == AArch64::LDRDui || Opc == AArch64::LDRQui;
+}
+
+/// Check whether the given instruction is a LOH candidate, i.e., a candidate
+/// load or store sitting at the end of a simple chain whose top is an ADRP.
+/// \param UseToDefs is used to check that Instr is at the end of LOH supported
+/// chain.
+/// \param MDT used to check that each node of the chain dominates the next.
+/// \pre UseToDefs contains only one def per use, i.e., obvious non candidates
+/// have already been filtered out.
+static bool isCandidate(const MachineInstr *Instr,
+                        const InstrToInstrs &UseToDefs,
+                        const MachineDominatorTree *MDT) {
+  if (!isCandidateLoad(Instr) && !isCandidateStore(Instr))
+    return false;
+
+  const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin();
+  if (Def->getOpcode() != AArch64::ADRP) {
+    // Def is then an ADDXri or LDRXui on the right type of symbol: uses
+    // defined by anything else (besides ADRP) were filtered out earlier.
+
+    // Simple chain property: the intermediate node dominates the end.
+    if (!MDT->dominates(Def, Instr))
+      return false;
+
+    // Climb one node up the chain. Missing entries and emptied (garbage)
+    // entries both end the search.
+    InstrToInstrs::const_iterator Upper = UseToDefs.find(Def);
+    if (Upper == UseToDefs.end() || Upper->second.empty())
+      return false;
+    Instr = Def;
+    Def = *Upper->second.begin();
+  }
+
+  // The top of the chain must be an ADRP that dominates the node below it.
+  return Def->getOpcode() == AArch64::ADRP && MDT->dominates(Def, Instr);
+}
+
+/// Try to record an ADRP -> ADD (AdrpAdd) or ADRP -> LDR GOTPAGEOFF
+/// (AdrpLdrGot) LOH ending at \p Use.
+/// \param UseToDefs use -> reaching definitions map; may hold emptied
+/// entries, which are ignored.
+/// \param DefsPerColorToUses per register id, definition -> reached uses.
+/// \param InvolvedInLOHs if not NULL, set of the instructions already part
+/// of a LOH, used for sanity checking through assertions only.
+/// \return true if a LOH directive was added to \p AArch64FI.
+static bool registerADRCandidate(const MachineInstr &Use,
+                                 const InstrToInstrs &UseToDefs,
+                                 const InstrToInstrs *DefsPerColorToUses,
+                                 AArch64FunctionInfo &AArch64FI,
+                                 SetOfMachineInstr *InvolvedInLOHs,
+                                 const MapRegToId &RegToId) {
+  // Only an ADD or a load through the GOT can end such a pattern.
+  const bool IsAdd = Use.getOpcode() == AArch64::ADDXri;
+  const bool IsGotLoad =
+      Use.getOpcode() == AArch64::LDRXui &&
+      (Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT);
+  if (!IsAdd && !IsGotLoad)
+    return false;
+
+  // Missing and emptied entries of the map are both ignored.
+  InstrToInstrs::const_iterator DefsPos = UseToDefs.find(&Use);
+  if (DefsPos == UseToDefs.end() || DefsPos->second.empty())
+    return false;
+
+  const MachineInstr &Def = **DefsPos->second.begin();
+  if (Def.getOpcode() != AArch64::ADRP)
+    return false;
+
+  // Give up when the ADRP has more than one user.
+  const SetOfMachineInstr *Users =
+      getUses(DefsPerColorToUses,
+              RegToId.find(Def.getOperand(0).getReg())->second, Def);
+  if (Users->size() > 1) {
+    ++NumADRComplexCandidate;
+    return false;
+  }
+  ++NumADRSimpleCandidate;
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) &&
+         "ADRP already involved in LOH.");
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) &&
+         "ADD already involved in LOH.");
+  DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n');
+
+  AArch64FI.addLOHDirective(IsAdd ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot,
+                            {&Def, &Use});
+  return true;
+}
+
+ /// Based on the use-to-defs information (in non-ADRP mode), record through
+ /// AArch64FI the LOH opportunities that are not ADRP -> ADRP related.
+ static void computeOthers(const InstrToInstrs &UseToDefs,
+ const InstrToInstrs *DefsPerColorToUses,
+ AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId,
+ const MachineDominatorTree *MDT) {
+ SetOfMachineInstr *InvolvedInLOHs = nullptr; // Only set in !NDEBUG builds.
+ #ifndef NDEBUG
+ SetOfMachineInstr InvolvedInLOHsStorage;
+ InvolvedInLOHs = &InvolvedInLOHsStorage;
+ #endif // NDEBUG
+ DEBUG(dbgs() << "*** Compute LOH for Others\n");
+ // ADRP -> ADD/LDR -> LDR/STR pattern.
+ // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern.
+
+ // FIXME: When the statistics are not important,
+ // This initial filtering loop can be merged into the next loop.
+ // Currently, we didn't do it to have the same code for both DEBUG and
+ // NDEBUG builds. Indeed, the iterator of the second loop would need
+ // to be changed.
+ SetOfMachineInstr PotentialCandidates;
+ SetOfMachineInstr PotentialADROpportunities;
+ for (auto &Use : UseToDefs) {
+ // If no definition is available, this is a non candidate.
+ if (Use.second.empty())
+ continue;
+ // Keep only instructions that are load or store and at the end of
+ // an ADRP -> ADD/LDR/Nothing chain.
+ // We already filtered out the no-chain cases.
+ if (!isCandidate(Use.first, UseToDefs, MDT)) {
+ PotentialADROpportunities.insert(Use.first);
+ continue;
+ }
+ PotentialCandidates.insert(Use.first);
+ }
+
+ // Make the following distinctions for statistics as the linker does
+ // know how to decode instructions:
+ // - ADD/LDR/Nothing make three different patterns.
+ // - LDR/STR make two different patterns.
+ // Hence, 6 - 1 base patterns.
+ // (because ADRP-> Nothing -> STR is not simplifiable)
+
+ // The linker is only able to have a simple semantic, i.e., if pattern A
+ // do B.
+ // However, we want to see the opportunity we may miss if we were able to
+ // catch more complex cases.
+
+ // PotentialCandidates are the result of a chain ADRP -> ADD/LDR -> LDR/STR.
+ // A potential candidate becomes a candidate, if its current immediate
+ // operand is zero and all nodes of the chain have respectively only one user
+ #ifndef NDEBUG
+ SetOfMachineInstr DefsOfPotentialCandidates;
+ #endif
+ for (const MachineInstr *Candidate : PotentialCandidates) {
+ // Get the definition of the candidate i.e., ADD or LDR.
+ const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin();
+ // Record the elements of the chain.
+ const MachineInstr *L1 = Def;
+ const MachineInstr *L2 = nullptr;
+ unsigned ImmediateDefOpc = Def->getOpcode();
+ if (Def->getOpcode() != AArch64::ADRP) {
+ // Check the number of users of this node.
+ const SetOfMachineInstr *Users =
+ getUses(DefsPerColorToUses,
+ RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+ if (Users->size() > 1) {
+ #ifndef NDEBUG
+ // if all the uses of this def are in potential candidate, this is
+ // a complex candidate of level 2.
+ bool IsLevel2 = true;
+ for (const MachineInstr *MI : *Users) {
+ if (!PotentialCandidates.count(MI)) {
+ ++NumTooCplxLvl2;
+ IsLevel2 = false;
+ break;
+ }
+ }
+ if (IsLevel2)
+ ++NumCplxLvl2;
+ #endif // NDEBUG
+ PotentialADROpportunities.insert(Def);
+ continue;
+ }
+ L2 = Def;
+ Def = *UseToDefs.find(Def)->second.begin();
+ L1 = Def;
+ } // else the element in the middle of the chain is nothing, thus
+ // Def already contains the first element of the chain.
+
+ // Check the number of users of the first node in the chain, i.e., ADRP
+ const SetOfMachineInstr *Users =
+ getUses(DefsPerColorToUses,
+ RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+ if (Users->size() > 1) {
+ #ifndef NDEBUG
+ // if all the uses of this def are in the defs of the potential candidate,
+ // this is a complex candidate of level 1
+ if (DefsOfPotentialCandidates.empty()) {
+ // lazy init
+ DefsOfPotentialCandidates = PotentialCandidates;
+ for (const MachineInstr *Candidate : PotentialCandidates) {
+ if (!UseToDefs.find(Candidate)->second.empty())
+ DefsOfPotentialCandidates.insert(
+ *UseToDefs.find(Candidate)->second.begin());
+ }
+ }
+ bool Found = false;
+ for (auto &Use : *Users) {
+ if (!DefsOfPotentialCandidates.count(Use)) {
+ ++NumTooCplxLvl1;
+ Found = true;
+ break;
+ }
+ }
+ if (!Found)
+ ++NumCplxLvl1;
+ #endif // NDEBUG
+ continue;
+ }
+
+ bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri);
+ // If the chain is three instructions long and ldr is the second element,
+ // then this ldr must load from GOT, otherwise this is not a correct chain.
+ if (L2 && !IsL2Add &&
+ !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT))
+ continue;
+ SmallVector<const MachineInstr *, 3> Args;
+ MCLOHType Kind;
+ if (isCandidateLoad(Candidate)) {
+ if (!L2) {
+ // At this point, the candidate LOH indicates that the ldr instruction
+ // may use a direct access to the symbol. There is no such encoding
+ // for loads of byte and half.
+ if (!supportLoadFromLiteral(Candidate))
+ continue;
+
+ DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate
+ << '\n');
+ Kind = MCLOH_AdrpLdr;
+ Args.push_back(L1);
+ Args.push_back(Candidate);
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+ "L1 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+ "Candidate already involved in LOH.");
+ ++NumADRPToLDR;
+ } else {
+ DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+ << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+ << '\n');
+
+ Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr;
+ Args.push_back(L1);
+ Args.push_back(L2);
+ Args.push_back(Candidate);
+
+ PotentialADROpportunities.remove(L2); // L2 is covered by the bigger pattern.
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+ "L1 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+ "L2 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+ "Candidate already involved in LOH.");
+ #ifndef NDEBUG
+ // get the immediate of the load
+ if (Candidate->getOperand(2).getImm() == 0)
+ if (ImmediateDefOpc == AArch64::ADDXri)
+ ++NumADDToLDR;
+ else
+ ++NumLDRToLDR;
+ else if (ImmediateDefOpc == AArch64::ADDXri)
+ ++NumADDToLDRWithImm;
+ else
+ ++NumLDRToLDRWithImm;
+ #endif // NDEBUG
+ }
+ } else {
+ if (ImmediateDefOpc == AArch64::ADRP)
+ continue; // ADRP -> Nothing -> STR is not simplifiable.
+ else {
+
+ DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+ << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+ << '\n');
+
+ Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr;
+ Args.push_back(L1);
+ Args.push_back(L2);
+ Args.push_back(Candidate);
+
+ PotentialADROpportunities.remove(L2); // L2 is covered by the bigger pattern.
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+ "L1 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+ "L2 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+ "Candidate already involved in LOH.");
+ #ifndef NDEBUG
+ // get the immediate of the store
+ if (Candidate->getOperand(2).getImm() == 0)
+ if (ImmediateDefOpc == AArch64::ADDXri)
+ ++NumADDToSTR;
+ else
+ ++NumLDRToSTR;
+ else if (ImmediateDefOpc == AArch64::ADDXri)
+ ++NumADDToSTRWithImm;
+ else
+ ++NumLDRToSTRWithImm;
+ #endif // NDEBUG
+ }
+ }
+ AArch64FI.addLOHDirective(Kind, Args);
+ }
+
+ // Now, we grabbed all the big patterns, check ADR opportunities.
+ for (const MachineInstr *Candidate : PotentialADROpportunities)
+ registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI,
+ InvolvedInLOHs, RegToId);
+ }
+
+ /// Assign a dense id to every register that a potential LOH candidate may
+ /// define.
+ ///
+ /// @p RegToId receives the register -> id mapping; @p IdToReg receives the
+ /// reverse mapping, but is populated only in DEBUG mode.
+ /// When PreCollectRegister is off, every target register simply maps to
+ /// itself and the function body is not scanned.
+ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId,
+                                MapIdToReg &IdToReg,
+                                const TargetRegisterInfo *TRI) {
+   if (!PreCollectRegister) {
+     // Identity mapping over all the registers of the target.
+     for (unsigned Reg = 0, NumRegs = TRI->getNumRegs(); Reg < NumRegs; ++Reg) {
+       RegToId[Reg] = Reg;
+       DEBUG(IdToReg.push_back(Reg));
+       DEBUG(assert(IdToReg[Reg] == Reg && "Reg index mismatches"));
+     }
+     return;
+   }
+
+   DEBUG(dbgs() << "** Collect Involved Register\n");
+   unsigned NextId = 0;
+   for (const auto &MBB : MF) {
+     for (const MachineInstr &MI : MBB) {
+       // Only instructions that can take part in a LOH are of interest.
+       const bool MayBeInLOH = canDefBePartOfLOH(&MI) ||
+                               isCandidateLoad(&MI) || isCandidateStore(&MI);
+       if (!MayBeInLOH)
+         continue;
+
+       // Walk the register definitions of the instruction.
+       for (MachineInstr::const_mop_iterator Op = MI.operands_begin(),
+                                             OpEnd = MI.operands_end();
+            Op != OpEnd; ++Op) {
+         if (!(Op->isReg() && Op->isDef()))
+           continue;
+         const unsigned DefReg = Op->getReg();
+         // Give an id to the register and to each of its aliases, unless they
+         // already have one.
+         for (MCRegAliasIterator Alias(DefReg, TRI, true); Alias.isValid();
+              ++Alias) {
+           if (RegToId.find(*Alias) != RegToId.end())
+             continue;
+           DEBUG(IdToReg.push_back(*Alias);
+                 assert(IdToReg[NextId] == *Alias &&
+                        "Reg index mismatches insertion index."));
+           RegToId[*Alias] = NextId++;
+           DEBUG(dbgs() << "Register: " << PrintReg(*Alias, TRI) << '\n');
+         }
+       }
+     }
+   }
+ }
+
+ /// Pass entry point. Runs two reaching-definition analyses over \p MF, first
+ /// in ADRP mode and then in regular mode, and records the LOH directives found
+ /// by each one in the function's AArch64FunctionInfo.
+ /// Always returns false: no statement here reports a modification.
+ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
+   if (skipFunction(*MF.getFunction()))
+     return false;
+
+   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+   const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+
+   MapRegToId RegToId;
+   MapIdToReg IdToReg;
+   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+   assert(FuncInfo && "No MachineFunctionInfo for this function!");
+
+   DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n');
+
+   // No involved register means there is nothing to analyze.
+   collectInvolvedReg(MF, RegToId, IdToReg, TRI);
+   if (RegToId.empty())
+     return false;
+
+   // For local analysis, a dummy operation records the uses that are not
+   // local.
+   MachineInstr *NonLocalUseMarker = nullptr;
+   if (BasicBlockScopeOnly) {
+     const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+     NonLocalUseMarker =
+         MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());
+   }
+
+   const unsigned NumColors = RegToId.size();
+
+   // --- Step 1: ADRP. ---
+   // Reaching defs in ADRP mode, meaning ADRP definitions are first considered
+   // as uses.
+   InstrToInstrs *AdrpReachedUses = new InstrToInstrs[NumColors];
+   reachingDef(MF, AdrpReachedUses, RegToId, true, NonLocalUseMarker);
+   DEBUG(dbgs() << "ADRP reaching defs\n");
+   DEBUG(printReachingDef(AdrpReachedUses, NumColors, TRI, IdToReg));
+
+   // Flip the definition -> uses map into a use -> definitions map to ease
+   // statistic computation, then compute the LOH for ADRP.
+   InstrToInstrs ADRPToReachingDefs;
+   reachedUsesToDefs(ADRPToReachingDefs, AdrpReachedUses, RegToId, true);
+   computeADRP(ADRPToReachingDefs, *FuncInfo, MDT);
+   delete[] AdrpReachedUses;
+
+   // --- Step 2: general ADRP -> ADD/LDR -> LDR/STR pattern. ---
+   // This time a regular reaching def analysis is performed.
+   InstrToInstrs *AllReachedUses = new InstrToInstrs[NumColors];
+   reachingDef(MF, AllReachedUses, RegToId, false, NonLocalUseMarker);
+   DEBUG(dbgs() << "All reaching defs\n");
+   DEBUG(printReachingDef(AllReachedUses, NumColors, TRI, IdToReg));
+
+   // Same flip as above, then compute every LOH other than AdrpAdrp.
+   InstrToInstrs UsesToReachingDefs;
+   reachedUsesToDefs(UsesToReachingDefs, AllReachedUses, RegToId, false);
+   computeOthers(UsesToReachingDefs, AllReachedUses, *FuncInfo, RegToId, MDT);
+   delete[] AllReachedUses;
+
+   if (BasicBlockScopeOnly)
+     MF.DeleteMachineInstr(NonLocalUseMarker);
+
+   return false;
+ }
+
+ /// createAArch64CollectLOHPass - allocate a new AArch64CollectLOH pass and
+ /// hand it back through the generic FunctionPass interface.
+ FunctionPass *llvm::createAArch64CollectLOHPass() {
+   FunctionPass *const Pass = new AArch64CollectLOH();
+   return Pass;
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
new file mode 100644
index 000000000000..8b186328d125
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -0,0 +1,438 @@
+//=- AArch64ConditionOptimizer.cpp - Remove useless comparisons for AArch64 -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to make consecutive compares of values use same operands to
+// allow CSE pass to remove duplicated instructions. For this it analyzes
+// branches and adjusts comparisons with immediate values by converting:
+// * GE -> GT
+// * GT -> GE
+// * LT -> LE
+// * LE -> LT
+// and adjusting immediate values appropriately. It basically corrects two
+// immediate values towards each other to make them equal.
+//
+// Consider the following example in C:
+//
+ //   if ((a < 5 && ...) || (a > 5 && ...)) {
+ //        ~~~~~             ~~~~~
+ //          ^                 ^
+ //          x                 y
+//
+// Here both "x" and "y" expressions compare "a" with "5". When "x" evaluates
+// to "false", "y" can just check flags set by the first comparison. As a
+// result of the canonicalization employed by
+// SelectionDAGBuilder::visitSwitchCase, DAGCombine, and other target-specific
+// code, assembly ends up in the form that is not CSE friendly:
+//
+// ...
+// cmp w8, #4
+// b.gt .LBB0_3
+// ...
+// .LBB0_3:
+// cmp w8, #6
+// b.lt .LBB0_6
+// ...
+//
+// Same assembly after the pass:
+//
+// ...
+// cmp w8, #5
+// b.ge .LBB0_3
+// ...
+// .LBB0_3:
+// cmp w8, #5 // <-- CSE pass removes this instruction
+// b.le .LBB0_6
+// ...
+//
+// Currently only SUBS and ADDS followed by b.?? are supported.
+//
+// TODO: maybe handle TBNZ/TBZ the same way as CMP when used instead for "a < 0"
+// TODO: handle other conditional instructions (e.g. CSET)
+// TODO: allow second branching to be anything if it doesn't require adjusting
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cstdlib>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-condopt"
+
+STATISTIC(NumConditionsAdjusted, "Number of conditions adjusted");
+
+ namespace {
+ /// Machine function pass that adjusts the immediates and branch conditions of
+ /// consecutive compares so that they use the same operands (see the file
+ /// header for the motivation and an example).
+ class AArch64ConditionOptimizer : public MachineFunctionPass {
+   // Per-function state, assigned at the start of runOnMachineFunction().
+   // Initialized to null so that a helper called before that point does not
+   // read an indeterminate pointer.
+   const TargetInstrInfo *TII = nullptr;
+   MachineDominatorTree *DomTree = nullptr;
+   const MachineRegisterInfo *MRI = nullptr;
+
+ public:
+   // Stores immediate, compare instruction opcode and branch condition (in this
+   // order) of adjusted comparison.
+   using CmpInfo = std::tuple<int, unsigned, AArch64CC::CondCode>;
+
+   static char ID;
+   AArch64ConditionOptimizer() : MachineFunctionPass(ID) {
+     initializeAArch64ConditionOptimizerPass(*PassRegistry::getPassRegistry());
+   }
+   void getAnalysisUsage(AnalysisUsage &AU) const override;
+   // Returns the compare controlling the Bcc terminator of MBB, or nullptr.
+   MachineInstr *findSuitableCompare(MachineBasicBlock *MBB);
+   // Computes the adjusted form of CmpMI/Cmp without modifying anything.
+   CmpInfo adjustCmp(MachineInstr *CmpMI, AArch64CC::CondCode Cmp);
+   // Rewrites CmpMI and the branch of its block according to Info.
+   void modifyCmp(MachineInstr *CmpMI, const CmpInfo &Info);
+   // Adjusts CmpMI only if the result matches To's opcode and ToImm.
+   bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To,
+                 int ToImm);
+   bool runOnMachineFunction(MachineFunction &MF) override;
+   StringRef getPassName() const override {
+     return "AArch64 Condition Optimizer";
+   }
+ };
+ } // end anonymous namespace
+
+ char AArch64ConditionOptimizer::ID = 0; // Handed to the MachineFunctionPass base by the constructor.
+
+ INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt",
+ "AArch64 CondOpt Pass", false, false)
+ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) // Mirrors addRequired<> in getAnalysisUsage().
+ INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt",
+ "AArch64 CondOpt Pass", false, false)
+
+ // Factory: allocates a new AArch64ConditionOptimizer and returns it through
+ // the generic FunctionPass interface.
+ FunctionPass *llvm::createAArch64ConditionOptimizerPass() {
+   FunctionPass *const Pass = new AArch64ConditionOptimizer();
+   return Pass;
+ }
+
+ // Declares the analyses used by this pass: the machine dominator tree is both
+ // required and preserved, and the rest is delegated to MachineFunctionPass.
+ void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
+   using DomTreeAnalysis = MachineDominatorTree;
+   AU.addRequired<DomTreeAnalysis>();
+   AU.addPreserved<DomTreeAnalysis>();
+   MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ // Looks for the compare instruction that feeds the conditional branch ending
+ // MBB. The result is nullptr when the first terminator is not a Bcc, when NZCV
+ // is live into a successor, when something between the compare and the branch
+ // reads NZCV, or when the instruction found is not an ADDS/SUBS with an
+ // in-range immediate and a dead destination.
+ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
+     MachineBasicBlock *MBB) {
+   MachineBasicBlock::iterator Pos = MBB->getFirstTerminator();
+   if (Pos == MBB->end() || Pos->getOpcode() != AArch64::Bcc)
+     return nullptr;
+
+   // Since we may modify cmp of this MBB, make sure NZCV does not live out.
+   for (auto Succ : MBB->successors())
+     if (Succ->isLiveIn(AArch64::NZCV))
+       return nullptr;
+
+   // Scan backwards from the terminator for the instruction controlling it.
+   MachineBasicBlock::iterator Begin = MBB->begin();
+   while (Pos != Begin) {
+     --Pos;
+     assert(!Pos->isTerminator() && "Spurious terminator");
+     // Any use of NZCV between CMP and Bcc disqualifies the block.
+     if (Pos->readsRegister(AArch64::NZCV))
+       return nullptr;
+
+     switch (Pos->getOpcode()) {
+     default:
+       // Unrelated instruction: keep scanning.
+       break;
+
+     // cmp is an alias for subs with a dead destination register.
+     case AArch64::SUBSWri:
+     case AArch64::SUBSXri:
+     // cmn is an alias for adds with a dead destination register.
+     case AArch64::ADDSWri:
+     case AArch64::ADDSXri: {
+       const unsigned ShiftAmt =
+           AArch64_AM::getShiftValue(Pos->getOperand(3).getImm());
+       const MachineOperand &ImmOp = Pos->getOperand(2);
+       if (!ImmOp.isImm()) {
+         DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *Pos << '\n');
+         return nullptr;
+       }
+       if (ImmOp.getImm() << ShiftAmt >= 0xfff) {
+         DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *Pos
+                      << '\n');
+         return nullptr;
+       }
+       if (!MRI->use_empty(Pos->getOperand(0).getReg())) {
+         DEBUG(dbgs() << "Destination of cmp is not dead, " << *Pos << '\n');
+         return nullptr;
+       }
+       return &*Pos;
+     }
+
+     // Prevent false positive case like:
+     // cmp w19, #0
+     // cinc w0, w19, gt
+     // ...
+     // fcmp d8, #0.0
+     // b.gt .LBB0_5
+     case AArch64::FCMPDri:
+     case AArch64::FCMPSri:
+     case AArch64::FCMPESri:
+     case AArch64::FCMPEDri:
+     // Comparison instructions without immediate operands are skipped too.
+     case AArch64::SUBSWrr:
+     case AArch64::SUBSXrr:
+     case AArch64::ADDSWrr:
+     case AArch64::ADDSXrr:
+     case AArch64::FCMPSrr:
+     case AArch64::FCMPDrr:
+     case AArch64::FCMPESrr:
+     case AArch64::FCMPEDrr:
+       return nullptr;
+     }
+   }
+
+   DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+   return nullptr;
+ }
+
+ // Maps an immediate ADDS opcode to the SUBS opcode of the same register width
+ // and vice versa. Any other opcode is a programming error.
+ static int getComplementOpc(int Opc) {
+   if (Opc == AArch64::ADDSWri)
+     return AArch64::SUBSWri;
+   if (Opc == AArch64::ADDSXri)
+     return AArch64::SUBSXri;
+   if (Opc == AArch64::SUBSWri)
+     return AArch64::ADDSWri;
+   if (Opc == AArch64::SUBSXri)
+     return AArch64::ADDSXri;
+   llvm_unreachable("Unexpected opcode");
+ }
+
+ // Toggles a signed ordering condition between its exclusive and inclusive
+ // form: GT <-> GE and LT <-> LE. Any other condition is a programming error.
+ static AArch64CC::CondCode getAdjustedCmp(AArch64CC::CondCode Cmp) {
+   if (Cmp == AArch64CC::GT)
+     return AArch64CC::GE;
+   if (Cmp == AArch64CC::GE)
+     return AArch64CC::GT;
+   if (Cmp == AArch64CC::LT)
+     return AArch64CC::LE;
+   if (Cmp == AArch64CC::LE)
+     return AArch64CC::LT;
+   llvm_unreachable("Unexpected condition code");
+ }
+
+ // Computes, without touching any instruction, what CmpMI and its condition
+ // become after a GT -> GE, GE -> GT, LT -> LE or LE -> LT transformation.
+ // The result holds the new immediate, the new compare opcode and the new
+ // condition code, in this order.
+ AArch64ConditionOptimizer::CmpInfo AArch64ConditionOptimizer::adjustCmp(
+     MachineInstr *CmpMI, AArch64CC::CondCode Cmp) {
+   const unsigned OrigOpc = CmpMI->getOpcode();
+
+   // CMN (compare with negative immediate) is an alias to ADDS (as
+   // "operand - negative" == "operand + positive").
+   const bool IsCmn =
+       OrigOpc == AArch64::ADDSWri || OrigOpc == AArch64::ADDSXri;
+
+   // The immediate moves by +1 for GT and by -1 otherwise; the direction is
+   // reversed for CMN.
+   int Delta = (Cmp == AArch64CC::GT) ? 1 : -1;
+   if (IsCmn)
+     Delta = -Delta;
+
+   const int OldImm = static_cast<int>(CmpMI->getOperand(2).getImm());
+   const int NewImm = std::abs(OldImm + Delta);
+
+   // +0 -> -1 and -0 -> +1 (CMN with 0 immediate) transitions switch between
+   // ADDS and SUBS.
+   const bool CrossesZero = OldImm == 0 && Delta == (IsCmn ? 1 : -1);
+   unsigned NewOpc = OrigOpc;
+   if (CrossesZero)
+     NewOpc = getComplementOpc(OrigOpc);
+
+   return CmpInfo(NewImm, NewOpc, getAdjustedCmp(Cmp));
+ }
+
+ // Rewrites CmpMI and the conditional branch of its block as described by Info
+ // (the result of adjustCmp()): both instructions are rebuilt in place and the
+ // old ones erased. Bumps the NumConditionsAdjusted statistic.
+ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
+                                           const CmpInfo &Info) {
+   const int NewImm = std::get<0>(Info);
+   const unsigned NewOpc = std::get<1>(Info);
+   const AArch64CC::CondCode NewCC = std::get<2>(Info);
+
+   MachineBasicBlock *const Block = CmpMI->getParent();
+
+   // New ADDS/SUBS: same operands as before except for the immediate.
+   BuildMI(*Block, CmpMI, CmpMI->getDebugLoc(), TII->get(NewOpc))
+       .addOperand(CmpMI->getOperand(0))
+       .addOperand(CmpMI->getOperand(1))
+       .addImm(NewImm)
+       .addOperand(CmpMI->getOperand(3));
+   CmpMI->eraseFromParent();
+
+   // The fact that this comparison was picked ensures that it's related to the
+   // first terminator instruction.
+   MachineInstr &OldBranch = *Block->getFirstTerminator();
+
+   // New Bcc: same target as before, new condition.
+   BuildMI(*Block, OldBranch, OldBranch.getDebugLoc(), TII->get(AArch64::Bcc))
+       .addImm(NewCC)
+       .addOperand(OldBranch.getOperand(1));
+   OldBranch.eraseFromParent();
+
+   Block->updateTerminator();
+
+   ++NumConditionsAdjusted;
+ }
+
+ // Extracts into CC the condition code, corresponding to TBB, from a condition
+ // array returned by AnalyzeBranch.
+ // Returns false, leaving CC untouched, when the first element is -1; returns
+ // true otherwise.
+ static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+   if (Cond[0].getImm() == -1)
+     return false;
+
+   // A normal br.cond simply has the condition code.
+   assert(Cond.size() == 1 && "Unknown Cond array format");
+   CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+   return true;
+ }
+
+ // Adjusts CmpMI only when doing so makes it use the same opcode as To and the
+ // immediate ToImm, which is what allows CSE. Returns true if the compare
+ // instruction was changed, false if it was left alone.
+ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
+   AArch64CC::CondCode Cmp, MachineInstr *To, int ToImm)
+ {
+   const CmpInfo Adjusted = adjustCmp(CmpMI, Cmp);
+   const bool MatchesTarget = std::get<0>(Adjusted) == ToImm &&
+                              std::get<1>(Adjusted) == To->getOpcode();
+   if (!MatchesTarget)
+     return false;
+
+   modifyCmp(CmpMI, Adjusted);
+   return true;
+ }
+
+ // Pass entry point. Walks the dominator tree and, for every block ending in a
+ // conditional branch whose taken successor also ends in one, tries to adjust
+ // the two immediate compares so that they end up with the same opcode and
+ // immediate. Returns true if at least one compare was rewritten.
+ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
+   // The banner names this pass (it used to carry the name of the conditional
+   // compares pass).
+   DEBUG(dbgs() << "********** AArch64 Condition Optimizer **********\n"
+                << "********** Function: " << MF.getName() << '\n');
+   if (skipFunction(*MF.getFunction()))
+     return false;
+
+   TII = MF.getSubtarget().getInstrInfo();
+   DomTree = &getAnalysis<MachineDominatorTree>();
+   MRI = &MF.getRegInfo();
+
+   bool Changed = false;
+
+   // Visit blocks in dominator tree pre-order. The pre-order enables multiple
+   // cmp-conversions from the same head block.
+   // NOTE(review): an earlier remark here mentioned updateDomTree() modifying
+   // the children of the visited node; nothing in this function calls it.
+   for (MachineDomTreeNode *I : depth_first(DomTree)) {
+     MachineBasicBlock *HBB = I->getBlock();
+
+     SmallVector<MachineOperand, 4> HeadCond;
+     MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+     if (TII->analyzeBranch(*HBB, TBB, FBB, HeadCond)) {
+       continue;
+     }
+
+     // Equivalence check is to skip loops.
+     if (!TBB || TBB == HBB) {
+       continue;
+     }
+
+     SmallVector<MachineOperand, 4> TrueCond;
+     MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr;
+     if (TII->analyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) {
+       continue;
+     }
+
+     // Both the head block and its taken successor need a usable compare.
+     MachineInstr *HeadCmpMI = findSuitableCompare(HBB);
+     if (!HeadCmpMI) {
+       continue;
+     }
+
+     MachineInstr *TrueCmpMI = findSuitableCompare(TBB);
+     if (!TrueCmpMI) {
+       continue;
+     }
+
+     AArch64CC::CondCode HeadCmp;
+     if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) {
+       continue;
+     }
+
+     AArch64CC::CondCode TrueCmp;
+     if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) {
+       continue;
+     }
+
+     const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm();
+     const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm();
+
+     DEBUG(dbgs() << "Head branch:\n");
+     DEBUG(dbgs() << "\tcondition: "
+                  << AArch64CC::getCondCodeName(HeadCmp) << '\n');
+     DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n');
+
+     DEBUG(dbgs() << "True branch:\n");
+     DEBUG(dbgs() << "\tcondition: "
+                  << AArch64CC::getCondCodeName(TrueCmp) << '\n');
+     DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n');
+
+     if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) ||
+          (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) &&
+         std::abs(TrueImm - HeadImm) == 2) {
+       // This branch transforms machine instructions that correspond to
+       //
+       // 1) (a > {TrueImm} && ...) || (a < {HeadImm} && ...)
+       // 2) (a < {TrueImm} && ...) || (a > {HeadImm} && ...)
+       //
+       // into
+       //
+       // 1) (a >= {NewImm} && ...) || (a <= {NewImm} && ...)
+       // 2) (a <= {NewImm} && ...) || (a >= {NewImm} && ...)
+
+       // Both compares move towards each other; commit only if they meet on
+       // the same immediate and opcode.
+       CmpInfo HeadCmpInfo = adjustCmp(HeadCmpMI, HeadCmp);
+       CmpInfo TrueCmpInfo = adjustCmp(TrueCmpMI, TrueCmp);
+       if (std::get<0>(HeadCmpInfo) == std::get<0>(TrueCmpInfo) &&
+           std::get<1>(HeadCmpInfo) == std::get<1>(TrueCmpInfo)) {
+         modifyCmp(HeadCmpMI, HeadCmpInfo);
+         modifyCmp(TrueCmpMI, TrueCmpInfo);
+         Changed = true;
+       }
+     } else if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::GT) ||
+                 (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::LT)) &&
+                std::abs(TrueImm - HeadImm) == 1) {
+       // This branch transforms machine instructions that correspond to
+       //
+       // 1) (a > {TrueImm} && ...) || (a > {HeadImm} && ...)
+       // 2) (a < {TrueImm} && ...) || (a < {HeadImm} && ...)
+       //
+       // into
+       //
+       // 1) (a <= {NewImm} && ...) || (a > {NewImm} && ...)
+       // 2) (a < {NewImm} && ...) || (a >= {NewImm} && ...)
+
+       // GT -> GE transformation increases immediate value, so picking the
+       // smaller one; LT -> LE decreases immediate value so invert the choice.
+       bool adjustHeadCond = (HeadImm < TrueImm);
+       if (HeadCmp == AArch64CC::LT) {
+         adjustHeadCond = !adjustHeadCond;
+       }
+
+       // Only one of the two compares is adjusted, towards the other one.
+       if (adjustHeadCond) {
+         Changed |= adjustTo(HeadCmpMI, HeadCmp, TrueCmpMI, TrueImm);
+       } else {
+         Changed |= adjustTo(TrueCmpMI, TrueCmp, HeadCmpMI, HeadImm);
+       }
+     }
+     // Other transformation cases almost never occur due to generation of < or >
+     // comparisons instead of <= and >=.
+   }
+
+   return Changed;
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
new file mode 100644
index 000000000000..da09b36cac9c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -0,0 +1,913 @@
+//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64ConditionalCompares pass which reduces
+// branching and code size by using the conditional compare instructions CCMP,
+// CCMN, and FCCMP.
+//
+// The CFG transformations for forming conditional compares are very similar to
+// if-conversion, and this pass should run immediately before the early
+// if-conversion pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ccmp"
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned> BlockInstrLimit(
+ "aarch64-ccmp-limit", cl::init(30), cl::Hidden,
+ cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-ccmp", cl::Hidden,
+ cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumConsidered, "Number of ccmps considered");
+STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)");
+STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)");
+STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)");
+STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)");
+STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)");
+STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)");
+STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)");
+STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)");
+STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)");
+STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)");
+
+STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)");
+
+STATISTIC(NumConverted, "Number of ccmp instructions created");
+STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted");
+
+//===----------------------------------------------------------------------===//
+// SSACCmpConv
+//===----------------------------------------------------------------------===//
+//
+// The SSACCmpConv class performs ccmp-conversion on SSA form machine code
+// after determining if it is possible. The class contains no heuristics;
+// external code should be used to determine when ccmp-conversion is a good
+// idea.
+//
+// CCmp-formation works on a CFG representing chained conditions, typically
+// from C's short-circuit || and && operators:
+//
+// From: Head To: Head
+// / | CmpBB
+// / | / |
+// | CmpBB / |
+// | / | Tail |
+// | / | | |
+// Tail | | |
+// | | | |
+// ... ... ... ...
+//
+// The Head block is terminated by a br.cond instruction, and the CmpBB block
+// contains compare + br.cond. Tail must be a successor of both.
+//
+// The cmp-conversion turns the compare instruction in CmpBB into a conditional
+// compare, and merges CmpBB into Head, speculatively executing its
+// instructions. The AArch64 conditional compare instructions have an immediate
+// operand that specifies the NZCV flag values when the condition is false and
+// the compare isn't executed. This makes it possible to chain compares with
+// different condition codes.
+//
+// Example:
+//
+// if (a == 5 || b == 17)
+// foo();
+//
+// Head:
+// cmp w0, #5
+// b.eq Tail
+// CmpBB:
+// cmp w1, #17
+// b.eq Tail
+// ...
+// Tail:
+// bl _foo
+//
+// Becomes:
+//
+// Head:
+// cmp w0, #5
+// ccmp w1, #17, 4, ne ; 4 = nZcv
+// b.eq Tail
+// ...
+// Tail:
+// bl _foo
+//
+// The ccmp condition code is the one that would cause the Head terminator to
+// branch to CmpBB.
+//
+// FIXME: It should also be possible to speculate a block on the critical edge
+// between Head and Tail, just like if-converting a diamond.
+//
+// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion).
+
+namespace {
+class SSACCmpConv {
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ /// The first block containing a conditional branch, dominating everything
+ /// else.
+ MachineBasicBlock *Head;
+
+ /// The block containing cmp+br.cond with a successor shared with Head.
+ MachineBasicBlock *CmpBB;
+
+ /// The common successor for Head and CmpBB.
+ MachineBasicBlock *Tail;
+
+ /// The compare instruction in CmpBB that can be converted to a ccmp.
+ MachineInstr *CmpMI;
+
+private:
+ /// The branch condition in Head as determined by analyzeBranch.
+ SmallVector<MachineOperand, 4> HeadCond;
+
+ /// The condition code that makes Head branch to CmpBB.
+ AArch64CC::CondCode HeadCmpBBCC;
+
+ /// The branch condition in CmpBB.
+ SmallVector<MachineOperand, 4> CmpBBCond;
+
+ /// The condition code that makes CmpBB branch to Tail.
+ AArch64CC::CondCode CmpBBTailCC;
+
+ /// Check if the Tail PHIs are trivially convertible.
+ bool trivialTailPHIs();
+
+ /// Remove CmpBB from the Tail PHIs.
+ void updateTailPHIs();
+
+ /// Check if an operand defining DstReg is dead.
+ bool isDeadDef(unsigned DstReg);
+
+ /// Find the compare instruction in MBB that controls the conditional branch.
+ /// Return nullptr if a convertible instruction can't be found.
+ MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB);
+
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// speculated.
+ bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI);
+
+public:
+ /// Cache the per-function MF, TII, TRI and MRI pointers used by later queries.
+ void runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ }
+
+ /// If the sub-CFG headed by MBB can be cmp-converted, initialize the
+ /// internal state, and return true.
+ bool canConvert(MachineBasicBlock *MBB);
+
+ /// Cmp-convert the sub-CFG last accepted by canConvert(), assuming
+ /// it is possible. Add any erased blocks to RemovedBlocks.
+ void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks);
+
+ /// Return the expected code size delta if the conversion into a
+ /// conditional compare is performed.
+ int expectedCodeSizeDelta() const;
+};
+} // end anonymous namespace
+
+// Check that every PHI in Tail selects the same value for its Head and CmpBB
+// inputs. If so, no if-conversion is required when merging CmpBB into Head.
+bool SSACCmpConv::trivialTailPHIs() {
+ for (auto &I : *Tail) {
+ if (!I.isPHI())
+ break;
+ unsigned HeadReg = 0, CmpBBReg = 0;
+ // PHI operands come in (VReg, MBB) pairs, starting at operand index 1.
+ for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
+ MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
+ unsigned Reg = I.getOperand(oi).getReg();
+ if (MBB == Head) {
+ assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
+ HeadReg = Reg;
+ }
+ if (MBB == CmpBB) {
+ assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands");
+ CmpBBReg = Reg;
+ }
+ }
+ if (HeadReg != CmpBBReg)
+ return false;
+ }
+ return true;
+}
+
+// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply
+// removing the CmpBB operands. The Head operands will be identical.
+void SSACCmpConv::updateTailPHIs() {
+ for (auto &I : *Tail) {
+ if (!I.isPHI())
+ break;
+ // I is a PHI. It can have multiple entries for CmpBB; walk the pairs backwards so removals don't shift the ones still to visit.
+ for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) {
+ // PHI operands are (Reg, MBB) at (oi-2, oi-1).
+ if (I.getOperand(oi - 1).getMBB() == CmpBB) {
+ I.RemoveOperand(oi - 1);
+ I.RemoveOperand(oi - 2);
+ }
+ }
+ }
+}
+
+// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares
+// are still writing virtual registers without any uses.
+bool SSACCmpConv::isDeadDef(unsigned DstReg) {
+ // Writes to the zero register (WZR / XZR) are dead.
+ if (DstReg == AArch64::WZR || DstReg == AArch64::XZR)
+ return true;
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ return false;
+ // A virtual register def without any non-debug uses will be marked dead
+ // later, and eventually replaced by the zero register.
+ return MRI->use_nodbg_empty(DstReg);
+}
+
+// Parse a condition code returned by analyzeBranch, and compute the CondCode
+// corresponding to TBB.
+// Return true on success, or false if the branch kind is unsupported (e.g. tbz / tbnz).
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+ // A normal br.cond simply has the condition code.
+ if (Cond[0].getImm() != -1) {
+ assert(Cond.size() == 1 && "Unknown Cond array format");
+ CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+ return true;
+ }
+ // Cond[0] == -1 marks a branch with a built-in compare/test; the opcode is next.
+ switch (Cond[1].getImm()) {
+ default:
+ // This includes tbz / tbnz branches which can't be converted to
+ // ccmp + br.cond.
+ return false;
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ assert(Cond.size() == 3 && "Unknown Cond array format");
+ CC = AArch64CC::EQ;
+ return true;
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ assert(Cond.size() == 3 && "Unknown Cond array format");
+ CC = AArch64CC::NE;
+ return true;
+ }
+}
+
+MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
+ MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+ if (I == MBB->end())
+ return nullptr;
+ // The terminator must be controlled by the flags.
+ if (!I->readsRegister(AArch64::NZCV)) {
+ switch (I->getOpcode()) {
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ // These can be converted into a ccmp against #0.
+ return &*I;
+ }
+ ++NumCmpTermRejs;
+ DEBUG(dbgs() << "Flags not used by terminator: " << *I);
+ return nullptr;
+ }
+
+ // Now scan backwards from the terminator for the instruction that sets NZCV.
+ for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+ --I;
+ assert(!I->isTerminator() && "Spurious terminator");
+ switch (I->getOpcode()) {
+ // cmp is an alias for subs with a dead destination register.
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri:
+ // cmn is an alias for adds with a dead destination register.
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXri:
+ // Check that the immediate is unshifted and within range; ccmp wants a uimm5.
+ // Rd = SUBSri Rn, imm, shift
+ if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
+ DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
+ ++NumImmRangeRejs;
+ return nullptr;
+ }
+ LLVM_FALLTHROUGH;
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXrr:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXrr:
+ if (isDeadDef(I->getOperand(0).getReg()))
+ return &*I;
+ DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
+ ++NumLiveDstRejs;
+ return nullptr;
+ case AArch64::FCMPSrr:
+ case AArch64::FCMPDrr:
+ case AArch64::FCMPESrr:
+ case AArch64::FCMPEDrr:
+ return &*I;
+ }
+
+ // Check for flag reads and clobbers.
+ MIOperands::PhysRegInfo PRI =
+ MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI);
+
+ if (PRI.Read) {
+ // The ccmp doesn't produce exactly the same flags as the original
+ // compare, so reject the transform if there are uses of the flags
+ // besides the terminators.
+ DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
+ ++NumMultNZCVUses;
+ return nullptr;
+ }
+
+ if (PRI.Defined || PRI.Clobbered) {
+ DEBUG(dbgs() << "Not convertible compare: " << *I);
+ ++NumUnknNZCVDefs;
+ return nullptr;
+ }
+ }
+ DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+ return nullptr;
+}
+
+/// Determine if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// Only CmpMI is allowed to clobber the flags.
+///
+bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
+ const MachineInstr *CmpMI) {
+ // Reject any live-in physregs. It's probably NZCV, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) {
+ if (I.isDebugValue())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (I.isPHI()) {
+ DEBUG(dbgs() << "Can't hoist: " << I);
+ return false;
+ }
+
+ // Don't speculate loads. Note that it may be possible and desirable to
+ // speculate GOT or constant pool loads that are guaranteed not to trap,
+ // but we don't support that for now.
+ if (I.mayLoad()) {
+ DEBUG(dbgs() << "Won't speculate load: " << I);
+ return false;
+ }
+
+ // We never speculate stores, so an AA pointer isn't necessary.
+ bool DontMoveAcrossStore = true;
+ if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+ DEBUG(dbgs() << "Can't speculate: " << I);
+ return false;
+ }
+
+ // Only CmpMI is allowed to clobber the flags.
+ if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) {
+ DEBUG(dbgs() << "Clobbers flags: " << I);
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential
+/// candidate for cmp-conversion. Fill out the internal state.
+///
+bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
+ Head = MBB;
+ Tail = CmpBB = nullptr;
+
+ if (Head->succ_size() != 2)
+ return false;
+ MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+ MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+ // CmpBB can only have a single predecessor. Tail is allowed many.
+ if (Succ0->pred_size() != 1)
+ std::swap(Succ0, Succ1);
+
+ // Succ0 is our candidate for CmpBB.
+ if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2)
+ return false;
+
+ CmpBB = Succ0;
+ Tail = Succ1;
+
+ if (!CmpBB->isSuccessor(Tail))
+ return false;
+
+ // The CFG topology checks out.
+ DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#"
+ << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n');
+ ++NumConsidered;
+
+ // Tail is allowed to have many predecessors, but we can't handle PHIs yet.
+ //
+ // FIXME: Real PHIs could be if-converted as long as the CmpBB values are
+ // defined before the CmpBB cmp clobbers the flags. Alternatively, it should
+ // always be safe to sink the ccmp down to immediately before the CmpBB
+ // terminators.
+ if (!trivialTailPHIs()) {
+ DEBUG(dbgs() << "Can't handle phis in Tail.\n");
+ ++NumPhiRejs;
+ return false;
+ }
+
+ if (!Tail->livein_empty()) {
+ DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
+ ++NumPhysRejs;
+ return false;
+ }
+
+ // CmpBB should never have PHIs since Head is its only predecessor.
+ // FIXME: Clean them up if it happens.
+ if (!CmpBB->empty() && CmpBB->front().isPHI()) {
+ DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
+ ++NumPhi2Rejs;
+ return false;
+ }
+
+ if (!CmpBB->livein_empty()) {
+ DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
+ ++NumPhysRejs;
+ return false;
+ }
+
+ // The branch we're looking to eliminate must be analyzable.
+ HeadCond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) {
+ DEBUG(dbgs() << "Head branch not analyzable.\n");
+ ++NumHeadBranchRejs;
+ return false;
+ }
+
+ // This is weird, probably some sort of degenerate CFG, or an edge to a
+ // landing pad.
+ if (!TBB || HeadCond.empty()) {
+ DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+ ++NumHeadBranchRejs;
+ return false;
+ }
+
+ if (!parseCond(HeadCond, HeadCmpBBCC)) {
+ DEBUG(dbgs() << "Unsupported branch type on Head\n");
+ ++NumHeadBranchRejs;
+ return false;
+ }
+
+ // Make sure the branch direction is right.
+ if (TBB != CmpBB) {
+ assert(TBB == Tail && "Unexpected TBB");
+ HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC);
+ }
+
+ CmpBBCond.clear();
+ TBB = FBB = nullptr;
+ if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
+ DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
+ ++NumCmpBranchRejs;
+ return false;
+ }
+
+ if (!TBB || CmpBBCond.empty()) {
+ DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+ ++NumCmpBranchRejs;
+ return false;
+ }
+
+ if (!parseCond(CmpBBCond, CmpBBTailCC)) {
+ DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
+ ++NumCmpBranchRejs;
+ return false;
+ }
+
+ if (TBB != Tail)
+ CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC);
+
+ DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC)
+ << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC)
+ << '\n');
+
+ CmpMI = findConvertibleCompare(CmpBB);
+ if (!CmpMI)
+ return false;
+
+ if (!canSpeculateInstrs(CmpBB, CmpMI)) {
+ ++NumSpeculateRejs;
+ return false;
+ }
+ return true;
+}
+
+void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
+ DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#"
+ << Head->getNumber() << ":\n" << *CmpBB);
+
+ // All CmpBB instructions are moved into Head, and CmpBB is deleted.
+ // Update the CFG first.
+ updateTailPHIs();
+ Head->removeSuccessor(CmpBB, true);
+ CmpBB->removeSuccessor(Tail, true);
+ Head->transferSuccessorsAndUpdatePHIs(CmpBB);
+ DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
+ TII->removeBranch(*Head);
+
+ // If the Head terminator was one of the cbz / cbnz branches with built-in
+ // compare, we need to insert an explicit compare instruction in its place.
+ if (HeadCond[0].getImm() == -1) {
+ ++NumCompBranches;
+ unsigned Opc = 0;
+ switch (HeadCond[1].getImm()) {
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ Opc = AArch64::SUBSWri;
+ break;
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ Opc = AArch64::SUBSXri;
+ break;
+ default:
+ llvm_unreachable("Cannot convert Head branch");
+ }
+ const MCInstrDesc &MCID = TII->get(Opc);
+ // Create a dummy virtual register for the SUBS def.
+ unsigned DestReg =
+ MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF));
+ // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
+ BuildMI(*Head, Head->end(), TermDL, MCID)
+ .addReg(DestReg, RegState::Define | RegState::Dead)
+ .addOperand(HeadCond[2])
+ .addImm(0)
+ .addImm(0);
+ // SUBS uses the GPR*sp register classes.
+ MRI->constrainRegClass(HeadCond[2].getReg(),
+ TII->getRegClass(MCID, 1, TRI, *MF));
+ }
+
+ Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end());
+
+ // Now replace CmpMI with a ccmp instruction that also considers the incoming
+ // flags.
+ unsigned Opc = 0;
+ unsigned FirstOp = 1; // First CmpMI operand to copy.
+ bool isZBranch = false; // CmpMI is a cbz/cbnz instruction.
+ switch (CmpMI->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown compare opcode");
+ case AArch64::SUBSWri: Opc = AArch64::CCMPWi; break;
+ case AArch64::SUBSWrr: Opc = AArch64::CCMPWr; break;
+ case AArch64::SUBSXri: Opc = AArch64::CCMPXi; break;
+ case AArch64::SUBSXrr: Opc = AArch64::CCMPXr; break;
+ case AArch64::ADDSWri: Opc = AArch64::CCMNWi; break;
+ case AArch64::ADDSWrr: Opc = AArch64::CCMNWr; break;
+ case AArch64::ADDSXri: Opc = AArch64::CCMNXi; break;
+ case AArch64::ADDSXrr: Opc = AArch64::CCMNXr; break;
+ case AArch64::FCMPSrr: Opc = AArch64::FCCMPSrr; FirstOp = 0; break;
+ case AArch64::FCMPDrr: Opc = AArch64::FCCMPDrr; FirstOp = 0; break;
+ case AArch64::FCMPESrr: Opc = AArch64::FCCMPESrr; FirstOp = 0; break;
+ case AArch64::FCMPEDrr: Opc = AArch64::FCCMPEDrr; FirstOp = 0; break;
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ Opc = AArch64::CCMPWi;
+ FirstOp = 0;
+ isZBranch = true;
+ break;
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ Opc = AArch64::CCMPXi;
+ FirstOp = 0;
+ isZBranch = true;
+ break;
+ }
+
+ // The ccmp instruction should set the flags according to the comparison when
+ // Head would have branched to CmpBB.
+ // The NZCV immediate operand should provide flags for the case where Head
+ // would have branched to Tail. These flags should cause the new Head
+ // terminator to branch to tail.
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
+ const MCInstrDesc &MCID = TII->get(Opc);
+ MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
+ TII->getRegClass(MCID, 0, TRI, *MF));
+ if (CmpMI->getOperand(FirstOp + 1).isReg())
+ MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
+ TII->getRegClass(MCID, 1, TRI, *MF));
+ MachineInstrBuilder MIB =
+ BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+ .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+ if (isZBranch)
+ MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
+ else
+ MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+ MIB.addImm(NZCV).addImm(HeadCmpBBCC);
+
+ // If CmpMI was a terminator, we need a new conditional branch to replace it.
+ // This now becomes a Head terminator.
+ if (isZBranch) {
+ bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW ||
+ CmpMI->getOpcode() == AArch64::CBNZX;
+ BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
+ .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
+ .addOperand(CmpMI->getOperand(1)); // Branch target.
+ }
+ CmpMI->eraseFromParent();
+ Head->updateTerminator();
+
+ RemovedBlocks.push_back(CmpBB);
+ CmpBB->eraseFromParent();
+ DEBUG(dbgs() << "Result:\n" << *Head);
+ ++NumConverted;
+}
+
+int SSACCmpConv::expectedCodeSizeDelta() const {
+ int delta = 0;
+ // If the Head terminator was one of the cbz / cbnz branches with built-in
+ // compare, we need to insert an explicit compare instruction in its place
+ // plus a branch instruction.
+ if (HeadCond[0].getImm() == -1) {
+ switch (HeadCond[1].getImm()) {
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ // Therefore delta += 1
+ delta = 1;
+ break;
+ default:
+ llvm_unreachable("Cannot convert Head branch");
+ }
+ }
+ // If the Cmp terminator was one of the cbz / cbnz branches with
+ // built-in compare, it will be turned into a compare instruction
+ // in Head, but we do not save any instruction.
+ // Otherwise, we save the branch instruction.
+ switch (CmpMI->getOpcode()) {
+ default:
+ --delta;
+ break;
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ break;
+ }
+ return delta;
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64ConditionalCompares Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64ConditionalCompares : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MCSchedModel SchedModel;
+ // True if the function being processed is optimized for minimum size.
+ bool MinSize;
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *Loops;
+ MachineTraceMetrics *Traces;
+ MachineTraceMetrics::Ensemble *MinInstr;
+ SSACCmpConv CmpConv;
+
+public:
+ static char ID;
+ AArch64ConditionalCompares() : MachineFunctionPass(ID) {
+ initializeAArch64ConditionalComparesPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override {
+ return "AArch64 Conditional Compares";
+ }
+
+private:
+ bool tryConvert(MachineBasicBlock *);
+ void updateDomTree(ArrayRef<MachineBasicBlock *> Removed);
+ void updateLoops(ArrayRef<MachineBasicBlock *> Removed);
+ void invalidateTraces();
+ bool shouldConvert();
+};
+} // end anonymous namespace
+
+char AArch64ConditionalCompares::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
+ "AArch64 CCMP Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
+ "AArch64 CCMP Pass", false, false)
+
+FunctionPass *llvm::createAArch64ConditionalCompares() {
+ return new AArch64ConditionalCompares();
+}
+
+void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Update the dominator tree after cmp-conversion erased some blocks.
+void AArch64ConditionalCompares::updateDomTree(
+ ArrayRef<MachineBasicBlock *> Removed) {
+ // convert() removes CmpBB which was previously dominated by Head.
+ // CmpBB children should be transferred to Head.
+ MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
+ for (MachineBasicBlock *RemovedMBB : Removed) {
+ MachineDomTreeNode *Node = DomTree->getNode(RemovedMBB);
+ assert(Node != HeadNode && "Cannot erase the head node");
+ assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
+ while (Node->getNumChildren())
+ DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+ DomTree->eraseNode(RemovedMBB);
+ }
+}
+
+/// Update LoopInfo (when available) after cmp-conversion by dropping the removed blocks.
+void
+AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
+ if (!Loops)
+ return;
+ for (MachineBasicBlock *RemovedMBB : Removed)
+ Loops->removeBlock(RemovedMBB);
+}
+
+/// Invalidate MachineTraceMetrics for Head and CmpBB before cmp-conversion.
+void AArch64ConditionalCompares::invalidateTraces() {
+ Traces->invalidate(CmpConv.Head);
+ Traces->invalidate(CmpConv.CmpBB);
+}
+
+/// Apply cost model and heuristics to the cmp-conversion in CmpConv.
+/// Return true if the conversion is a good idea.
+///
+bool AArch64ConditionalCompares::shouldConvert() {
+ // Stress testing mode disables all cost considerations.
+ if (Stress)
+ return true;
+ if (!MinInstr)
+ MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+ // Head dominates CmpBB, so it is always included in its trace.
+ MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB);
+
+ // If code size is the main concern
+ if (MinSize) {
+ int CodeSizeDelta = CmpConv.expectedCodeSizeDelta();
+ DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n');
+ // If we are minimizing the code size, do the conversion whatever
+ // the cost is.
+ if (CodeSizeDelta < 0)
+ return true;
+ if (CodeSizeDelta > 0) {
+ DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
+ return false;
+ }
+ // CodeSizeDelta == 0, continue with the regular heuristics
+ }
+
+ // Heuristic: The compare conversion delays the execution of the branch
+ // instruction because we must wait for the inputs to the second compare as
+ // well. The branch has no dependent instructions, but delaying it increases
+ // the cost of a misprediction.
+ //
+ // Set a limit on the delay we will accept.
+ unsigned DelayLimit = SchedModel.MispredictPenalty * 3 / 4;
+
+ // Instruction depths can be computed for all trace instructions above CmpBB.
+ unsigned HeadDepth =
+ Trace.getInstrCycles(*CmpConv.Head->getFirstTerminator()).Depth;
+ unsigned CmpBBDepth =
+ Trace.getInstrCycles(*CmpConv.CmpBB->getFirstTerminator()).Depth;
+ DEBUG(dbgs() << "Head depth: " << HeadDepth
+ << "\nCmpBB depth: " << CmpBBDepth << '\n');
+ if (CmpBBDepth > HeadDepth + DelayLimit) {
+ DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
+ << " cycles.\n");
+ return false;
+ }
+
+ // Check the resource depth at the bottom of CmpBB - these instructions will
+ // be speculated.
+ unsigned ResDepth = Trace.getResourceDepth(true);
+ DEBUG(dbgs() << "Resources: " << ResDepth << '\n');
+
+ // Heuristic: The speculatively executed instructions must all be able to
+ // merge into the Head block. The Head critical path should dominate the
+ // resource cost of the speculated instructions.
+ if (ResDepth > HeadDepth) {
+ DEBUG(dbgs() << "Too many instructions to speculate.\n");
+ return false;
+ }
+ return true;
+}
+
+bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ while (CmpConv.canConvert(MBB) && shouldConvert()) { // Retry: one head may absorb several CmpBBs.
+ invalidateTraces(); // Reads CmpConv.CmpBB, so run before convert() erases that block.
+ SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
+ CmpConv.convert(RemovedBlocks);
+ Changed = true;
+ updateDomTree(RemovedBlocks);
+ updateLoops(RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ SchedModel = MF.getSubtarget().getSchedModel();
+ MRI = &MF.getRegInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = nullptr; // Fetched lazily by shouldConvert().
+ MinSize = MF.getFunction()->optForMinSize();
+
+ bool Changed = false;
+ CmpConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree pre-order. The pre-order enables multiple
+ // cmp-conversions from the same head block.
+ // Note that updateDomTree() modifies the children of the DomTree node
+ // currently being visited. The df_iterator supports that; it doesn't look at
+ // child_begin() / child_end() until after a node has been visited.
+ for (auto *I : depth_first(DomTree))
+ if (tryConvert(I->getBlock()))
+ Changed = true;
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
new file mode 100644
index 000000000000..30e2b2310456
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -0,0 +1,149 @@
+//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file When allowed by the instruction, replace a dead definition of a GPR
+/// with the zero register. This makes the code a bit friendlier towards the
+/// hardware's register renamer.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-dead-defs"
+
+STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
+
+#define AARCH64_DEAD_REG_DEF_NAME "AArch64 Dead register definitions"
+
+namespace {
+/// Machine function pass that rewrites dead register definitions to use the
+/// zero register (see processMachineBasicBlock()).
+class AArch64DeadRegisterDefinitions : public MachineFunctionPass {
+private:
+ // Per-function state, assigned at the start of runOnMachineFunction().
+ const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+ // Set by processMachineBasicBlock() whenever it rewrites an operand.
+ bool Changed;
+ void processMachineBasicBlock(MachineBasicBlock &MBB);
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {
+ initializeAArch64DeadRegisterDefinitionsPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ StringRef getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; }
+
+ // Declares the CFG preserved, then defers to the base class for the rest.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+char AArch64DeadRegisterDefinitions::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64DeadRegisterDefinitions, "aarch64-dead-defs",
+ AARCH64_DEAD_REG_DEF_NAME, false, false)
+
+/// Returns true if any use operand of \p MI is a frame index.
+static bool usesFrameIndex(const MachineInstr &MI) {
+  auto Uses = MI.uses();
+  auto It = Uses.begin();
+  const auto End = Uses.end();
+  while (It != End) {
+    if (It->isFI())
+      return true;
+    ++It;
+  }
+  return false;
+}
+
+/// Scan \p MBB and, for each instruction, retarget at most one dead virtual
+/// register definition to WZR or XZR. Sets Changed when an operand is
+/// rewritten.
+void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
+ MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ for (MachineInstr &MI : MBB) {
+ if (usesFrameIndex(MI)) {
+ // We need to skip this instruction because while it appears to have a
+ // dead def it uses a frame index which might expand into a multi
+ // instruction sequence during EPI.
+ DEBUG(dbgs() << " Ignoring, operand is frame index\n");
+ continue;
+ }
+ if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR)) {
+ // It is not allowed to write to the same register (not even the zero
+ // register) twice in a single instruction.
+ DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n");
+ continue;
+ }
+ const MCInstrDesc &Desc = MI.getDesc();
+ // Only the explicit defs counted by the instruction descriptor are examined.
+ for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ // We should not have any relevant physreg defs that are replaceable by
+ // zero before register allocation. So we just check for dead vreg defs.
+ // A def is treated as dead when it is flagged dead or when the register
+ // has no non-debug uses.
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+ (!MO.isDead() && !MRI->use_nodbg_empty(Reg)))
+ continue;
+ assert(!MO.isImplicit() && "Unexpected implicit def!");
+ DEBUG(dbgs() << " Dead def operand #" << I << " in:\n ";
+ MI.print(dbgs()));
+ // Be careful not to change the register if it's a tied operand.
+ if (MI.isRegTiedToUseOperand(I)) {
+ DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
+ continue;
+ }
+ // Pick the zero register that belongs to the operand's register class;
+ // operands whose class contains neither WZR nor XZR are left alone.
+ const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI, MF);
+ unsigned NewReg;
+ if (RC == nullptr) {
+ DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
+ continue;
+ } else if (RC->contains(AArch64::WZR))
+ NewReg = AArch64::WZR;
+ else if (RC->contains(AArch64::XZR))
+ NewReg = AArch64::XZR;
+ else {
+ DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
+ continue;
+ }
+ DEBUG(dbgs() << " Replacing with zero register. New:\n ");
+ MO.setReg(NewReg);
+ MO.setIsDead();
+ DEBUG(MI.print(dbgs()));
+ ++NumDeadDefsReplaced;
+ Changed = true;
+ // Only replace one dead register, see check for zero register above.
+ break;
+ }
+ }
+}
+
+// Walk every basic block of the function looking for instructions with a dead
+// register definition, and swap that register for the zero register whenever
+// processMachineBasicBlock() permits it. Returns true if anything was changed.
+bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const auto &STI = MF.getSubtarget();
+  TRI = STI.getRegisterInfo();
+  TII = STI.getInstrInfo();
+  MRI = &MF.getRegInfo();
+  DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
+
+  Changed = false;
+  for (MachineFunction::iterator BB = MF.begin(), BBEnd = MF.end();
+       BB != BBEnd; ++BB)
+    processMachineBasicBlock(*BB);
+  return Changed;
+}
+
+/// Factory for the dead-register-definitions pass; returns a newly allocated
+/// AArch64DeadRegisterDefinitions instance.
+FunctionPass *llvm::createAArch64DeadRegisterDefinitions() {
+ return new AArch64DeadRegisterDefinitions();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
new file mode 100644
index 000000000000..fe1c0beee0eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -0,0 +1,965 @@
+//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling and other late optimizations. This
+// pass should be run after register allocation but before the post-regalloc
+// scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass"
+
+namespace {
+/// Machine function pass that expands AArch64 pseudo instructions into real
+/// target instructions.
+class AArch64ExpandPseudo : public MachineFunctionPass {
+public:
+ static char ID;
+ AArch64ExpandPseudo() : MachineFunctionPass(ID) {
+ initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Instruction info the expand* helpers use to look up opcode descriptors.
+ const AArch64InstrInfo *TII;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override { return AARCH64_EXPAND_PSEUDO_NAME; }
+
+private:
+ bool expandMBB(MachineBasicBlock &MBB);
+ // Expands the instruction at MBBI if it is a handled pseudo; NextMBBI is an
+ // in/out iterator that the compare-and-swap expansions overwrite.
+ bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ // Materializes a MOVi32imm/MOVi64imm immediate; BitSize selects the width.
+ bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ unsigned BitSize);
+
+ // Expands a compare-and-swap pseudo into a load/compare block, a store block
+ // and a continuation block, using the opcodes supplied by the caller.
+ bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
+ unsigned ExtendImm, unsigned ZeroReg,
+ MachineBasicBlock::iterator &NextMBBI);
+ // 128-bit variant operating on lo/hi register pairs via LDAXP/STLXP.
+ bool expandCMP_SWAP_128(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+};
+char AArch64ExpandPseudo::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo",
+ AARCH64_EXPAND_PSEUDO_NAME, false, false)
+
+/// \brief Copy the operands of \p OldMI that lie beyond those described by its
+/// MCInstrDesc onto the instructions produced by the expansion: register uses
+/// are appended to \p UseMI, all other operands to \p DefMI.
+static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
+                           MachineInstrBuilder &DefMI) {
+  const unsigned FirstExtraOp = OldMI.getDesc().getNumOperands();
+  const unsigned NumOps = OldMI.getNumOperands();
+  for (unsigned OpIdx = FirstExtraOp; OpIdx != NumOps; ++OpIdx) {
+    const MachineOperand &MO = OldMI.getOperand(OpIdx);
+    assert(MO.isReg() && MO.getReg());
+    MachineInstrBuilder &Receiver = MO.isUse() ? UseMI : DefMI;
+    Receiver.addOperand(MO);
+  }
+}
+
+/// \brief Helper function which extracts the specified 16-bit chunk from a
+/// 64-bit value.
+///
+/// \param Imm      The 64-bit value to read from.
+/// \param ChunkIdx Index of the 16-bit chunk; 0 selects the least significant
+///                 chunk. Must be less than 4.
+/// \returns The selected chunk in the low 16 bits of the result.
+static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
+  assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+
+  return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
+}
+
+/// \brief Helper function which replicates a 16-bit chunk within a 64-bit
+/// value. Indices correspond to element numbers in a v4i16.
+///
+/// \param Imm     The 64-bit value to modify.
+/// \param FromIdx Index of the chunk to copy. Must be less than 4.
+/// \param ToIdx   Index of the chunk to overwrite. Must be less than 4.
+/// \returns \p Imm with chunk \p ToIdx replaced by the value of chunk
+///          \p FromIdx; all other chunks are unchanged.
+static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
+  assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
+  const unsigned ShiftAmt = ToIdx * 16;
+  // Build the mask from an unsigned 64-bit value. Shifting a signed literal
+  // left by 48 (ToIdx == 3) would move a one into the sign bit.
+  const uint64_t DestMask = static_cast<uint64_t>(0xFFFF) << ShiftAmt;
+
+  // Replicate the source chunk to the destination position.
+  const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
+  // Clear the destination chunk, then insert the replicated chunk.
+  return (Imm & ~DestMask) | Chunk;
+}
+
+/// \brief Try to materialize the 64-bit value \p UImm as an ORR-immediate of
+/// \p OrrImm followed by a MOVK that patches chunk \p ChunkIdx.
+///
+/// \returns false, without emitting anything, when \p OrrImm cannot be encoded
+/// as a logical immediate. Otherwise both instructions are inserted before
+/// \p MBBI, \p MI is erased and true is returned.
+static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
+                       MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator &MBBI,
+                       const AArch64InstrInfo *TII, unsigned ChunkIdx) {
+  assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+
+  uint64_t Encoding;
+  if (!AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding))
+    return false;
+
+  // Create the ORR-immediate instruction.
+  MachineInstrBuilder OrrMI =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+          .addOperand(MI.getOperand(0))
+          .addReg(AArch64::XZR)
+          .addImm(Encoding);
+
+  // Create the MOVK instruction that overwrites the patched chunk.
+  const unsigned PatchShift = ChunkIdx * 16;
+  const unsigned Imm16 = getChunk(UImm, ChunkIdx);
+  const MachineOperand &Dst = MI.getOperand(0);
+  const unsigned DstReg = Dst.getReg();
+  const unsigned DstFlags = RegState::Define | getDeadRegState(Dst.isDead());
+  MachineInstrBuilder MovkMI =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+          .addReg(DstReg, DstFlags)
+          .addReg(DstReg)
+          .addImm(Imm16)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, PatchShift));
+
+  transferImpOps(MI, OrrMI, MovkMI);
+  MI.eraseFromParent();
+  return true;
+}
+
+/// \brief Replicate \p Chunk into all four 16-bit lanes of a 64-bit value and
+/// report whether that value is encodable as a 64-bit logical immediate; the
+/// encoding computed by processLogicalImmediate is passed back in \p Encoding.
+static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
+  uint64_t Splat = 0;
+  for (unsigned Lane = 0; Lane != 4; ++Lane)
+    Splat |= Chunk << (Lane * 16);
+
+  return AArch64_AM::processLogicalImmediate(Splat, 64, Encoding);
+}
+
+/// \brief Check for identical 16-bit chunks within the constant and if so
+/// materialize them with a single ORR instruction. The remaining one or two
+/// 16-bit chunks will be materialized with MOVK instructions.
+///
+/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
+/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
+/// an ORR instruction.
+///
+/// \returns true if the sequence was emitted before \p MBBI and \p MI was
+/// erased; false if no chunk value qualified, in which case nothing is emitted.
+///
+static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const AArch64InstrInfo *TII) {
+ typedef DenseMap<uint64_t, unsigned> CountMap;
+ CountMap Counts;
+
+ // Scan the constant and count how often every chunk occurs.
+ for (unsigned Idx = 0; Idx < 4; ++Idx)
+ ++Counts[getChunk(UImm, Idx)];
+
+ // Traverse the chunks to find one which occurs more than once.
+ for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
+ Chunk != End; ++Chunk) {
+ const uint64_t ChunkVal = Chunk->first;
+ const unsigned Count = Chunk->second;
+
+ uint64_t Encoding = 0;
+
+ // We are looking for chunks which have two or three instances and can be
+ // materialized with an ORR instruction.
+ if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
+ continue;
+
+ const bool CountThree = Count == 3;
+ // Create the ORR-immediate instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(AArch64::XZR)
+ .addImm(Encoding);
+
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+
+ unsigned ShiftAmt = 0;
+ uint64_t Imm16 = 0;
+ // Find the first chunk not materialized with the ORR instruction.
+ for (; ShiftAmt < 64; ShiftAmt += 16) {
+ Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+ if (Imm16 != ChunkVal)
+ break;
+ }
+
+ // Create the first MOVK instruction.
+ // It only carries the dead flag when it is also the last instruction of
+ // the sequence (three matching chunks).
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg,
+ RegState::Define | getDeadRegState(DstIsDead && CountThree))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+ // In case we have three instances the whole constant is now materialized
+ // and we can exit.
+ if (CountThree) {
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Find the remaining chunk which needs to be materialized.
+ // The search resumes one chunk past the one handled by the first MOVK.
+ for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
+ Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+ if (Imm16 != ChunkVal)
+ break;
+ }
+
+ // Create the second MOVK instruction.
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+ transferImpOps(MI, MIB, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+/// \brief Check whether this chunk matches the pattern '1...0...', i.e. its
+/// complement is a low-bit mask. Such a chunk starts a contiguous sequence of
+/// ones when the bits are read from the LSB towards the MSB. All-zero and
+/// all-one chunks never match.
+static bool isStartChunk(uint64_t Chunk) {
+  const bool IsUniform = Chunk == 0 || Chunk == UINT64_MAX;
+  return !IsUniform && isMask_64(~Chunk);
+}
+
+/// \brief Check whether this chunk matches the pattern '0...1...', i.e. it is
+/// a low-bit mask. Such a chunk ends a contiguous sequence of ones when the
+/// bits are read from the LSB towards the MSB. All-zero and all-one chunks
+/// never match.
+static bool isEndChunk(uint64_t Chunk) {
+  const bool IsUniform = Chunk == 0 || Chunk == UINT64_MAX;
+  return !IsUniform && isMask_64(Chunk);
+}
+
+/// \brief Return \p Imm with every bit of the 16-bit chunk at index \p Idx
+/// cleared (when \p Clear is true) or set (when \p Clear is false).
+static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
+  const uint64_t Mask = 0xFFFF;
+  const uint64_t ChunkBits = Mask << (Idx * 16);
+
+  return Clear ? (Imm & ~ChunkBits) : (Imm | ChunkBits);
+}
+
+/// \brief Check whether the constant contains a sequence of contiguous ones,
+/// which might be interrupted by one or two chunks. If so, materialize the
+/// sequence of contiguous ones with an ORR instruction.
+/// Materialize the chunks which are either interrupting the sequence or outside
+/// of the sequence with a MOVK instruction.
+///
+/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
+/// which ends the sequence (0...1...). Then we are looking for constants which
+/// contain at least one S and E chunk.
+/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
+///
+/// We are also looking for constants like |S|A|B|E| where the contiguous
+/// sequence of ones wraps around the MSB into the LSB.
+///
+/// \returns false, without emitting anything, when no start or no end chunk is
+/// found; otherwise the sequence is emitted before \p MBBI, \p MI is erased and
+/// true is returned.
+///
+static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const AArch64InstrInfo *TII) {
+ const int NotSet = -1;
+ const uint64_t Mask = 0xFFFF;
+
+ int StartIdx = NotSet;
+ int EndIdx = NotSet;
+ // Try to find the chunks which start/end a contiguous sequence of ones.
+ // When several chunks qualify, the one with the highest index is kept.
+ for (int Idx = 0; Idx < 4; ++Idx) {
+ int64_t Chunk = getChunk(UImm, Idx);
+ // Sign extend the 16-bit chunk to 64-bit.
+ Chunk = (Chunk << 48) >> 48;
+
+ if (isStartChunk(Chunk))
+ StartIdx = Idx;
+ else if (isEndChunk(Chunk))
+ EndIdx = Idx;
+ }
+
+ // Early exit in case we can't find a start/end chunk.
+ if (StartIdx == NotSet || EndIdx == NotSet)
+ return false;
+
+ // Outside of the contiguous sequence of ones everything needs to be zero.
+ uint64_t Outside = 0;
+ // Chunks between the start and end chunk need to have all their bits set.
+ uint64_t Inside = Mask;
+
+ // If our contiguous sequence of ones wraps around from the MSB into the LSB,
+ // just swap indices and pretend we are materializing a contiguous sequence
+ // of zeros surrounded by a contiguous sequence of ones.
+ if (StartIdx > EndIdx) {
+ std::swap(StartIdx, EndIdx);
+ std::swap(Outside, Inside);
+ }
+
+ uint64_t OrrImm = UImm;
+ int FirstMovkIdx = NotSet;
+ int SecondMovkIdx = NotSet;
+
+ // Find out which chunks we need to patch up to obtain a contiguous sequence
+ // of ones.
+ for (int Idx = 0; Idx < 4; ++Idx) {
+ const uint64_t Chunk = getChunk(UImm, Idx);
+
+ // Check whether we are looking at a chunk which is not part of the
+ // contiguous sequence of ones.
+ if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
+ OrrImm = updateImm(OrrImm, Idx, Outside == 0);
+
+ // Remember the index we need to patch.
+ if (FirstMovkIdx == NotSet)
+ FirstMovkIdx = Idx;
+ else
+ SecondMovkIdx = Idx;
+
+ // Check whether we are looking at a chunk which is part of the contiguous
+ // sequence of ones.
+ } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
+ OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
+
+ // Remember the index we need to patch.
+ if (FirstMovkIdx == NotSet)
+ FirstMovkIdx = Idx;
+ else
+ SecondMovkIdx = Idx;
+ }
+ }
+ assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
+
+ // Create the ORR-immediate instruction.
+ // NOTE(review): the result of processLogicalImmediate is not checked here;
+ // presumably the patched OrrImm is always encodable - confirm.
+ uint64_t Encoding = 0;
+ AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(AArch64::XZR)
+ .addImm(Encoding);
+
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+
+ const bool SingleMovk = SecondMovkIdx == NotSet;
+ // Create the first MOVK instruction.
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg,
+ RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
+ .addReg(DstReg)
+ .addImm(getChunk(UImm, FirstMovkIdx))
+ .addImm(
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));
+
+ // Early exit in case we only need to emit a single MOVK instruction.
+ if (SingleMovk) {
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Create the second MOVK instruction.
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(getChunk(UImm, SecondMovkIdx))
+ .addImm(
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));
+
+ transferImpOps(MI, MIB, MIB2);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
+/// real move-immediate instructions to synthesize the immediate.
+///
+/// \param MBB     Block containing the pseudo instruction.
+/// \param MBBI    Iterator to the pseudo; new instructions go before it.
+/// \param BitSize Width of the immediate; the code distinguishes 32 and 64.
+/// \returns true on every path visible here; the pseudo is always erased.
+bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned BitSize) {
+ MachineInstr &MI = *MBBI;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ uint64_t Imm = MI.getOperand(1).getImm();
+ const unsigned Mask = 0xFFFF;
+
+ if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
+ // Useless def, and we don't want to risk creating an invalid ORR (which
+ // would really write to sp).
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Try a MOVI instruction (aka ORR-immediate with the zero register).
+ // UImm is Imm with everything above BitSize bits cleared.
+ uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+ .addOperand(MI.getOperand(0))
+ .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
+ .addImm(Encoding);
+ transferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Scan the immediate and count the number of 16-bit chunks which are either
+ // all ones or all zeros.
+ unsigned OneChunks = 0;
+ unsigned ZeroChunks = 0;
+ for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+ const unsigned Chunk = (Imm >> Shift) & Mask;
+ if (Chunk == Mask)
+ OneChunks++;
+ else if (Chunk == 0)
+ ZeroChunks++;
+ }
+
+ // Since we can't materialize the constant with a single ORR instruction,
+ // let's see whether we can materialize 3/4 of the constant with an ORR
+ // instruction and use an additional MOVK instruction to materialize the
+ // remaining 1/4.
+ //
+ // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
+ //
+ // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
+ // we would create the following instruction sequence:
+ //
+ // ORR x0, xzr, |A|X|A|X|
+ // MOVK x0, |B|, LSL #16
+ //
+ // Only look at 64-bit constants which can't be materialized with a single
+ // instruction e.g. which have less than either three all zero or all one
+ // chunks.
+ //
+ // Ignore 32-bit constants here, they always can be materialized with a
+ // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
+ // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
+ // Thus we fall back to the default code below which in the best case creates
+ // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
+ //
+ if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
+ // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
+ // identical?
+ if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 3 into element 1.
+ uint64_t OrrImm = replicateChunk(UImm, 3, 1);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
+ return true;
+
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 1 into element 3.
+ OrrImm = replicateChunk(UImm, 1, 3);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
+ return true;
+
+ // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
+ // identical?
+ } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 2 into element 0.
+ uint64_t OrrImm = replicateChunk(UImm, 2, 0);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
+ return true;
+
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 0 into element 2.
+ OrrImm = replicateChunk(UImm, 0, 2);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
+ return true;
+ }
+ }
+
+ // Check for identical 16-bit chunks within the constant and if so materialize
+ // them with a single ORR instruction. The remaining one or two 16-bit chunks
+ // will be materialized with MOVK instructions.
+ if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
+ return true;
+
+ // Check whether the constant contains a sequence of contiguous ones, which
+ // might be interrupted by one or two chunks. If so, materialize the sequence
+ // of contiguous ones with an ORR instruction. Materialize the chunks which
+ // are either interrupting the sequence or outside of the sequence with a
+ // MOVK instruction.
+ if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
+ return true;
+
+ // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
+ // more MOVK instructions to insert additional 16-bit portions into the
+ // lower bits.
+ bool isNeg = false;
+
+ // Use MOVN to materialize the high bits if we have more all one chunks
+ // than all zero chunks.
+ if (OneChunks > ZeroChunks) {
+ isNeg = true;
+ Imm = ~Imm;
+ }
+
+ unsigned FirstOpc;
+ if (BitSize == 32) {
+ Imm &= (1LL << 32) - 1;
+ FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
+ } else {
+ FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
+ }
+ unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN
+ unsigned LastShift = 0; // LSL amount for last MOVK
+ if (Imm != 0) {
+ // Round the positions of the highest and lowest set bits down to their
+ // 16-bit chunk boundaries.
+ unsigned LZ = countLeadingZeros(Imm);
+ unsigned TZ = countTrailingZeros(Imm);
+ Shift = ((63 - LZ) / 16) * 16;
+ LastShift = (TZ / 16) * 16;
+ }
+ unsigned Imm16 = (Imm >> Shift) & Mask;
+ bool DstIsDead = MI.getOperand(0).isDead();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
+ .addReg(DstReg, RegState::Define |
+ getDeadRegState(DstIsDead && Shift == LastShift))
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
+
+ // If a MOVN was used for the high bits of a negative value, flip the rest
+ // of the bits back for use with MOVK.
+ if (isNeg)
+ Imm = ~Imm;
+
+ // A single MOVZ/MOVN was enough; no MOVK is needed.
+ if (Shift == LastShift) {
+ transferImpOps(MI, MIB1, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ MachineInstrBuilder MIB2;
+ unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+ // Walk down from the chunk below the MOVZ/MOVN to the lowest chunk that needs
+ // a MOVK; MIB2 ends up referring to the last MOVK emitted.
+ while (Shift != LastShift) {
+ Shift -= 16;
+ Imm16 = (Imm >> Shift) & Mask;
+ if (Imm16 == (isNeg ? Mask : 0))
+ continue; // This 16-bit portion is already set correctly.
+ MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(DstIsDead && Shift == LastShift))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
+ }
+
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Mark every register contained in \p LiveRegs as live-in to \p MBB.
+static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
+  for (auto Reg : LiveRegs)
+    MBB->addLiveIn(Reg);
+}
+
+/// Expand a compare-and-swap pseudo at \p MBBI into a load/compare block, a
+/// store block and a continuation block inserted after \p MBB.
+///
+/// The pseudo's operands are read as: 0 = destination, 1 = status register,
+/// 2 = address, 3 = desired value, 4 = new value. \p LdarOp, \p StlrOp and
+/// \p CmpOp are the opcodes used for the load, the store and the compare;
+/// \p ExtendImm is the trailing immediate of the compare and \p ZeroReg is the
+/// register the compare writes its result to.
+///
+/// \param NextMBBI Set to MBB.end() before returning.
+/// \returns true always.
+bool AArch64ExpandPseudo::expandCMP_SWAP(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
+ unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ unsigned StatusReg = MI.getOperand(1).getReg();
+ MachineOperand &Addr = MI.getOperand(2);
+ MachineOperand &Desired = MI.getOperand(3);
+ MachineOperand &New = MI.getOperand(4);
+
+ // Compute the registers live right after MI: start from the block's
+ // live-outs and step backward over every instruction that follows MI.
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ // Lay the new blocks out as MBB, LoadCmpBB, StoreBB, DoneBB.
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ // .Lloadcmp:
+ // ldaxr xDest, [xAddr]
+ // cmp xDest, xDesired
+ // b.ne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(Dest.getReg());
+ LoadCmpBB->addLiveIn(Desired.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
+ .addReg(Addr.getReg());
+ BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
+ .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+ .addOperand(Desired)
+ .addImm(ExtendImm);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
+ .addImm(AArch64CC::NE)
+ .addMBB(DoneBB)
+ .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // stlxr wStatus, xNew, [xAddr]
+ // cbnz wStatus, .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(New.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+
+ BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
+ .addOperand(New)
+ .addOperand(Addr);
+ BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
+ .addReg(StatusReg, RegState::Kill)
+ .addMBB(LoadCmpBB);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ // Move MI and everything after it into DoneBB, which also takes over MBB's
+ // successors; MBB then falls through into the load/compare block.
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ // MBB no longer holds anything past the expansion point; the pseudo itself
+ // (now in DoneBB) is erased last.
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Expand a 128-bit compare-and-swap pseudo at \p MBBI into a load/compare
+/// block, a store block and a continuation block inserted after \p MBB.
+///
+/// The pseudo's operands are read as: 0/1 = destination lo/hi, 2 = status
+/// register, 3 = address, 4/5 = desired lo/hi, 6/7 = new lo/hi.
+///
+/// \param NextMBBI Set to MBB.end() before returning.
+/// \returns true always.
+bool AArch64ExpandPseudo::expandCMP_SWAP_128(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &DestLo = MI.getOperand(0);
+ MachineOperand &DestHi = MI.getOperand(1);
+ unsigned StatusReg = MI.getOperand(2).getReg();
+ MachineOperand &Addr = MI.getOperand(3);
+ MachineOperand &DesiredLo = MI.getOperand(4);
+ MachineOperand &DesiredHi = MI.getOperand(5);
+ MachineOperand &NewLo = MI.getOperand(6);
+ MachineOperand &NewHi = MI.getOperand(7);
+
+ // Compute the registers live right after MI: start from the block's
+ // live-outs and step backward over every instruction that follows MI.
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ // Lay the new blocks out as MBB, LoadCmpBB, StoreBB, DoneBB.
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ // .Lloadcmp:
+ // ldaxp xDestLo, xDestHi, [xAddr]
+ // cmp xDestLo, xDesiredLo
+ // csinc wStatus, wzr, wzr, eq
+ // cmp xDestHi, xDesiredHi
+ // csinc wStatus, wStatus, wStatus, eq
+ // cbnz wStatus, .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(DestLo.getReg());
+ LoadCmpBB->addLiveIn(DestHi.getReg());
+ LoadCmpBB->addLiveIn(DesiredLo.getReg());
+ LoadCmpBB->addLiveIn(DesiredHi.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
+ .addReg(DestLo.getReg(), RegState::Define)
+ .addReg(DestHi.getReg(), RegState::Define)
+ .addReg(Addr.getReg());
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
+ .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
+ .addOperand(DesiredLo)
+ .addImm(0);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
+ .addUse(AArch64::WZR)
+ .addUse(AArch64::WZR)
+ .addImm(AArch64CC::EQ);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
+ .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
+ .addOperand(DesiredHi)
+ .addImm(0);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
+ .addUse(StatusReg, RegState::Kill)
+ .addUse(StatusReg, RegState::Kill)
+ .addImm(AArch64CC::EQ);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
+ .addUse(StatusReg, RegState::Kill)
+ .addMBB(DoneBB);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // stlxp wStatus, xNewLo, xNewHi, [xAddr]
+ // cbnz wStatus, .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(NewLo.getReg());
+ StoreBB->addLiveIn(NewHi.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+ BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
+ .addOperand(NewLo)
+ .addOperand(NewHi)
+ .addOperand(Addr);
+ BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
+ .addReg(StatusReg, RegState::Kill)
+ .addMBB(LoadCmpBB);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ // Move MI and everything after it into DoneBB, which also takes over MBB's
+ // successors; MBB then falls through into the load/compare block.
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ // MBB no longer holds anything past the expansion point; the pseudo itself
+ // (now in DoneBB) is erased last.
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// do the expansion and return true. Otherwise return false.
+///
+/// The expansions written inline below build their replacement instructions
+/// at \p MBBI and then erase the pseudo. \p NextMBBI is only forwarded, by
+/// reference, to the CMP_SWAP expansions.
+bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+
+ // Register-register ALU forms: rewritten to the matching shifted-register
+ // ("rs") opcode with an LSL #0 shifter immediate appended.
+ case AArch64::ADDWrr:
+ case AArch64::SUBWrr:
+ case AArch64::ADDXrr:
+ case AArch64::SUBXrr:
+ case AArch64::ADDSWrr:
+ case AArch64::SUBSWrr:
+ case AArch64::ADDSXrr:
+ case AArch64::SUBSXrr:
+ case AArch64::ANDWrr:
+ case AArch64::ANDXrr:
+ case AArch64::BICWrr:
+ case AArch64::BICXrr:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSXrr:
+ case AArch64::BICSWrr:
+ case AArch64::BICSXrr:
+ case AArch64::EONWrr:
+ case AArch64::EONXrr:
+ case AArch64::EORWrr:
+ case AArch64::EORXrr:
+ case AArch64::ORNWrr:
+ case AArch64::ORNXrr:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXrr: {
+ // Note: this local shadows the outer Opcode for the rest of this case.
+ unsigned Opcode;
+ switch (MI.getOpcode()) {
+ default:
+ // The inner labels mirror the enclosing case labels one-to-one, so this
+ // default is not expected to be taken.
+ return false;
+ case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break;
+ case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break;
+ case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break;
+ case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break;
+ case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break;
+ case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break;
+ case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break;
+ case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break;
+ case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break;
+ case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break;
+ case AArch64::BICWrr: Opcode = AArch64::BICWrs; break;
+ case AArch64::BICXrr: Opcode = AArch64::BICXrs; break;
+ case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break;
+ case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break;
+ case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break;
+ case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break;
+ case AArch64::EONWrr: Opcode = AArch64::EONWrs; break;
+ case AArch64::EONXrr: Opcode = AArch64::EONXrs; break;
+ case AArch64::EORWrr: Opcode = AArch64::EORWrs; break;
+ case AArch64::EORXrr: Opcode = AArch64::EORXrs; break;
+ case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break;
+ case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break;
+ case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break;
+ case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break;
+ }
+ // Same destination and source operands as the pseudo, plus the LSL #0
+ // shifter immediate required by the shifted-register form.
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
+ MI.getOperand(0).getReg())
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ transferImpOps(MI, MIB1, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case AArch64::LOADgot: {
+ // Expand into ADRP + LDR.
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const MachineOperand &MO1 = MI.getOperand(1);
+ unsigned Flags = MO1.getTargetFlags();
+ // MIB1 is the ADRP, MIB2 the LDRXui; the symbol operand is appended to
+ // both below, once its kind is known.
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
+ .addOperand(MI.getOperand(0))
+ .addReg(DstReg);
+
+ // The ADRP gets the operand tagged MO_PAGE, the load gets it tagged
+ // MO_PAGEOFF | MO_NC; the pseudo's own target flags are kept on both.
+ if (MO1.isGlobal()) {
+ MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
+ MIB2.addGlobalAddress(MO1.getGlobal(), 0,
+ Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ } else if (MO1.isSymbol()) {
+ MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE);
+ MIB2.addExternalSymbol(MO1.getSymbolName(),
+ Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ } else {
+ assert(MO1.isCPI() &&
+ "Only expect globals, externalsymbols, or constant pools");
+ MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+ Flags | AArch64II::MO_PAGE);
+ MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+ Flags | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ }
+
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case AArch64::MOVaddr:
+ case AArch64::MOVaddrJT:
+ case AArch64::MOVaddrCP:
+ case AArch64::MOVaddrBA:
+ case AArch64::MOVaddrTLS:
+ case AArch64::MOVaddrEXT: {
+ // Expand into ADRP + ADD.
+ unsigned DstReg = MI.getOperand(0).getReg();
+ // Operand 1 of the pseudo feeds the ADRP, operand 2 feeds the ADD.
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
+ .addOperand(MI.getOperand(1));
+
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(DstReg)
+ .addOperand(MI.getOperand(2))
+ .addImm(0);
+
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Immediate moves are delegated to expandMOVImm with the register width in
+ // bits.
+ case AArch64::MOVi32imm:
+ return expandMOVImm(MBB, MBBI, 32);
+ case AArch64::MOVi64imm:
+ return expandMOVImm(MBB, MBBI, 64);
+ case AArch64::RET_ReallyLR: {
+ // Hiding the LR use with RET_ReallyLR may lead to extra kills in the
+ // function and missing live-ins. We are fine in practice because callee
+ // saved register handling ensures the register value is restored before
+ // RET, but we need the undef flag here to appease the MachineVerifier
+ // liveness checks.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET))
+ .addReg(AArch64::LR, RegState::Undef);
+ transferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
+ // Compare-and-swap pseudos. NOTE(review): the expandCMP_SWAP arguments are
+ // presumably (load-exclusive opcode, store-exclusive opcode, compare opcode,
+ // extend/shift immediate for the compare, zero register) -- confirm against
+ // the expandCMP_SWAP definition.
+ case AArch64::CMP_SWAP_8:
+ return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB,
+ AArch64::SUBSWrx,
+ AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0),
+ AArch64::WZR, NextMBBI);
+ case AArch64::CMP_SWAP_16:
+ return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH,
+ AArch64::SUBSWrx,
+ AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0),
+ AArch64::WZR, NextMBBI);
+ case AArch64::CMP_SWAP_32:
+ return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW,
+ AArch64::SUBSWrs,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
+ AArch64::WZR, NextMBBI);
+ case AArch64::CMP_SWAP_64:
+ return expandCMP_SWAP(MBB, MBBI,
+ AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
+ AArch64::XZR, NextMBBI);
+ case AArch64::CMP_SWAP_128:
+ return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
+ }
+ // Not a pseudo handled by this pass.
+ return false;
+}
+
+/// \brief Walk every instruction of \p MBB and expand the pseudo instructions
+/// found along the way. Returns true when at least one expansion happened.
+bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  // The successor is computed before expanding, since an expansion may erase
+  // the current instruction; expandMI receives it by reference and may
+  // redirect it.
+  MachineBasicBlock::iterator End = MBB.end();
+  for (MachineBasicBlock::iterator Cur = MBB.begin(); Cur != End;) {
+    MachineBasicBlock::iterator Next = std::next(Cur);
+    if (expandMI(MBB, Cur, Next))
+      Changed = true;
+    Cur = Next;
+  }
+
+  return Changed;
+}
+
+/// \brief Pass entry point: caches the AArch64 instruction info of \p MF's
+/// subtarget in TII and expands the pseudos of every basic block. Returns true
+/// if any block was modified.
+bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  bool Changed = false;
+  for (auto &Block : MF) {
+    if (expandMBB(Block))
+      Changed = true;
+  }
+  return Changed;
+}
+
+/// \brief Factory for the AArch64 pseudo instruction expansion pass; the
+/// returned pass object is allocated with new.
+FunctionPass *llvm::createAArch64ExpandPseudoPass() {
+  FunctionPass *ExpandPass = new AArch64ExpandPseudo();
+  return ExpandPass;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
new file mode 100644
index 000000000000..fe2c2d4550a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -0,0 +1,5099 @@
+//===-- AArch64FastISel.cpp - AArch64 FastISel implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// AArch64GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64CallingConvention.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCSymbol.h"
+using namespace llvm;
+
+namespace {
+
+/// AArch64 implementation of FastISel: declares the per-instruction selection
+/// routines, the emit helpers they share, and the constant/alloca
+/// materialization hooks.
+class AArch64FastISel final : public FastISel {
+  /// Description of a memory address under construction: a base (either a
+  /// register or a frame index), an optional offset register with its
+  /// extend/shift kind and shift amount, a constant offset, and an optional
+  /// global value.
+  class Address {
+  public:
+    /// Selects which member of the Base union is active.
+    enum BaseKind {
+      RegBase,
+      FrameIndexBase
+    };
+
+  private:
+    BaseKind Kind;
+    AArch64_AM::ShiftExtendType ExtType;
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+    unsigned OffsetReg;
+    unsigned Shift;
+    int64_t Offset;
+    const GlobalValue *GV;
+
+  public:
+    /// Starts out as a register-based address with register 0, no offset
+    /// register, no shift, zero offset and no global value.
+    Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend),
+      OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; }
+
+    void setKind(BaseKind K) { Kind = K; }
+    BaseKind getKind() const { return Kind; }
+    void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; }
+    AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; }
+    bool isRegBase() const { return Kind == RegBase; }
+    bool isFIBase() const { return Kind == FrameIndexBase; }
+
+    // The base accessors assert that the requested union member matches the
+    // current kind.
+    void setReg(unsigned Reg) {
+      assert(isRegBase() && "Invalid base register access!");
+      Base.Reg = Reg;
+    }
+    unsigned getReg() const {
+      assert(isRegBase() && "Invalid base register access!");
+      return Base.Reg;
+    }
+    void setOffsetReg(unsigned Reg) {
+      OffsetReg = Reg;
+    }
+    unsigned getOffsetReg() const {
+      return OffsetReg;
+    }
+    void setFI(unsigned FI) {
+      assert(isFIBase() && "Invalid base frame index access!");
+      Base.FI = FI;
+    }
+    unsigned getFI() const {
+      assert(isFIBase() && "Invalid base frame index access!");
+      return Base.FI;
+    }
+
+    // These getters are const like the ones above, so that they can be used
+    // through a const Address as well.
+    void setOffset(int64_t O) { Offset = O; }
+    int64_t getOffset() const { return Offset; }
+    void setShift(unsigned S) { Shift = S; }
+    unsigned getShift() const { return Shift; }
+
+    void setGlobalValue(const GlobalValue *G) { GV = G; }
+    const GlobalValue *getGlobalValue() const { return GV; }
+  };
+
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
+  /// LLVM context of the function being selected (set in the constructor).
+  LLVMContext *Context;
+
+  bool fastLowerArguments() override;
+  bool fastLowerCall(CallLoweringInfo &CLI) override;
+  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
+private:
+  // Selection routines.
+  bool selectAddSub(const Instruction *I);
+  bool selectLogicalOp(const Instruction *I);
+  bool selectLoad(const Instruction *I);
+  bool selectStore(const Instruction *I);
+  bool selectBranch(const Instruction *I);
+  bool selectIndirectBr(const Instruction *I);
+  bool selectCmp(const Instruction *I);
+  bool selectSelect(const Instruction *I);
+  bool selectFPExt(const Instruction *I);
+  bool selectFPTrunc(const Instruction *I);
+  bool selectFPToInt(const Instruction *I, bool Signed);
+  bool selectIntToFP(const Instruction *I, bool Signed);
+  bool selectRem(const Instruction *I, unsigned ISDOpcode);
+  bool selectRet(const Instruction *I);
+  bool selectTrunc(const Instruction *I);
+  bool selectIntExt(const Instruction *I);
+  bool selectMul(const Instruction *I);
+  bool selectShift(const Instruction *I);
+  bool selectBitCast(const Instruction *I);
+  bool selectFRem(const Instruction *I);
+  bool selectSDiv(const Instruction *I);
+  bool selectGetElementPtr(const Instruction *I);
+  bool selectAtomicCmpXchg(const AtomicCmpXchgInst *I);
+
+  // Utility helper routines.
+  bool isTypeLegal(Type *Ty, MVT &VT);
+  bool isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed = false);
+  bool isValueAvailable(const Value *V) const;
+  bool computeAddress(const Value *Obj, Address &Addr, Type *Ty = nullptr);
+  bool computeCallAddress(const Value *V, Address &Addr);
+  bool simplifyAddress(Address &Addr, MVT VT);
+  void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
+                            MachineMemOperand::Flags Flags,
+                            unsigned ScaleFactor, MachineMemOperand *MMO);
+  bool isMemCpySmall(uint64_t Len, unsigned Alignment);
+  bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+                          unsigned Alignment);
+  bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I,
+                         const Value *Cond);
+  bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT);
+  bool optimizeSelect(const SelectInst *SI);
+  std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx);
+
+  // Emit helper routines.
+  unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+                      const Value *RHS, bool SetFlags = false,
+                      bool WantResult = true, bool IsZExt = false);
+  unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                         bool SetFlags = false, bool WantResult = true);
+  unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, uint64_t Imm, bool SetFlags = false,
+                         bool WantResult = true);
+  unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                         AArch64_AM::ShiftExtendType ShiftType,
+                         uint64_t ShiftImm, bool SetFlags = false,
+                         bool WantResult = true);
+  unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                         AArch64_AM::ShiftExtendType ExtType,
+                         uint64_t ShiftImm, bool SetFlags = false,
+                         bool WantResult = true);
+
+  // Emit functions.
+  bool emitCompareAndBranch(const BranchInst *BI);
+  bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt);
+  bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt);
+  bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+  bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS);
+  unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true,
+                    MachineMemOperand *MMO = nullptr);
+  bool emitStore(MVT VT, unsigned SrcReg, Address Addr,
+                 MachineMemOperand *MMO = nullptr);
+  bool emitStoreRelease(MVT VT, unsigned SrcReg, unsigned AddrReg,
+                        MachineMemOperand *MMO = nullptr);
+  unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+  unsigned emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+  unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+                   bool SetFlags = false, bool WantResult = true,
+                   bool IsZExt = false);
+  unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm);
+  unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+                   bool SetFlags = false, bool WantResult = true,
+                   bool IsZExt = false);
+  unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                       unsigned RHSReg, bool RHSIsKill, bool WantResult = true);
+  unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                       unsigned RHSReg, bool RHSIsKill,
+                       AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm,
+                       bool WantResult = true);
+  unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS,
+                         const Value *RHS);
+  unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+                            bool LHSIsKill, uint64_t Imm);
+  unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+                            bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                            uint64_t ShiftImm);
+  unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+  unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                      unsigned Op1, bool Op1IsKill);
+  unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                        unsigned Op1, bool Op1IsKill);
+  unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                        unsigned Op1, bool Op1IsKill);
+  unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                      unsigned Op1Reg, bool Op1IsKill);
+  unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+                      uint64_t Imm, bool IsZExt = true);
+  unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                      unsigned Op1Reg, bool Op1IsKill);
+  unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+                      uint64_t Imm, bool IsZExt = true);
+  unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                      unsigned Op1Reg, bool Op1IsKill);
+  unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+                      uint64_t Imm, bool IsZExt = false);
+
+  // Constant materialization; each returns the result register, or 0 when the
+  // constant is not handled.
+  unsigned materializeInt(const ConstantInt *CI, MVT VT);
+  unsigned materializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned materializeGV(const GlobalValue *GV);
+
+  // Call handling routines.
+private:
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
+  bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
+                       unsigned &NumBytes);
+  bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
+
+public:
+  // Backend specific FastISel code.
+  unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+  unsigned fastMaterializeConstant(const Constant *C) override;
+  unsigned fastMaterializeFloatZero(const ConstantFP* CF) override;
+
+  /// Caches the AArch64 subtarget of the machine function and the LLVM context
+  /// of the IR function; the base class is constructed with
+  /// SkipTargetIndependentISel set.
+  explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo,
+                           const TargetLibraryInfo *LibInfo)
+      : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
+    Subtarget =
+        &static_cast<const AArch64Subtarget &>(FuncInfo.MF->getSubtarget());
+    Context = &FuncInfo.Fn->getContext();
+  }
+
+  bool fastSelectInstruction(const Instruction *I) override;
+
+#include "AArch64GenFastISel.inc"
+};
+
+} // end anonymous namespace
+
+#include "AArch64GenCallingConv.inc"
+
+/// \brief Tell whether the given sign-/zero-extend instruction is expected to
+/// be a noop: its source is either a load with a single use, or a function
+/// argument already carrying the matching zeroext/signext attribute.
+static bool isIntExtFree(const Instruction *I) {
+  assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+         "Unexpected integer extend instruction.");
+  assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() &&
+         "Unexpected value type.");
+
+  const Value *Src = I->getOperand(0);
+
+  if (const auto *Load = dyn_cast<LoadInst>(Src)) {
+    if (Load->hasOneUse())
+      return true;
+  }
+
+  if (const auto *Arg = dyn_cast<Argument>(Src)) {
+    // Only the attribute matching the kind of extension counts.
+    if (isa<ZExtInst>(I) ? Arg->hasZExtAttr() : Arg->hasSExtAttr())
+      return true;
+  }
+
+  return false;
+}
+
+/// \brief Return the implicit scale factor a memory operation applies for the
+/// value type \p VT, or 0 when the type has none defined here.
+static unsigned getImplicitScaleFactor(MVT VT) {
+  switch (VT.SimpleTy) {
+  case MVT::i1:
+  case MVT::i8:
+    return 1;
+  case MVT::i16:
+    return 2;
+  case MVT::i32:
+  case MVT::f32:
+    return 4;
+  case MVT::i64:
+  case MVT::f64:
+    return 8;
+  default:
+    // Invalid: no scale factor for any other type.
+    return 0;
+  }
+}
+
+/// Pick the calling-convention assignment function for \p CC: WebKit_JS and
+/// GHC have dedicated ones, everything else uses the Darwin PCS on Darwin
+/// targets and AAPCS otherwise.
+CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
+  switch (CC) {
+  case CallingConv::WebKit_JS:
+    return CC_AArch64_WebKit_JS;
+  case CallingConv::GHC:
+    return CC_AArch64_GHC;
+  default:
+    break;
+  }
+
+  if (Subtarget->isTargetDarwin())
+    return CC_AArch64_DarwinPCS;
+  return CC_AArch64_AAPCS;
+}
+
+/// Materialize the address of the static alloca \p AI into a fresh register
+/// by adding zero to its frame index. Returns the result register, or 0 when
+/// \p AI is not in the static alloca map.
+unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i64 &&
+         "Alloca should always return a pointer.");
+
+  // Don't handle dynamic allocas: only static ones have an entry in the map.
+  auto Slot = FuncInfo.StaticAllocaMap.find(AI);
+  if (Slot == FuncInfo.StaticAllocaMap.end())
+    return 0;
+
+  unsigned AddrReg = createResultReg(&AArch64::GPR64spRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+          AddrReg)
+      .addFrameIndex(Slot->second)
+      .addImm(0)
+      .addImm(0);
+  return AddrReg;
+}
+
+/// Materialize the integer constant \p CI of type \p VT into a register.
+/// Returns 0 for types above i64. Non-zero values go through fastEmit_i; zero
+/// is produced by copying from the zero register.
+unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) {
+  if (VT > MVT::i64)
+    return 0;
+
+  if (CI->isZero()) {
+    // Create a copy from the zero register to materialize a "0" value.
+    const bool Is64Bit = (VT == MVT::i64);
+    const TargetRegisterClass *RC =
+        Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+    unsigned ZeroReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(ZeroReg, getKillRegState(true));
+    return ResultReg;
+  }
+
+  return fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+}
+
+/// Materialize the floating-point constant \p CFP of type \p VT (f32 or f64)
+/// into a register. Tried in order: fmov from the zero register for +0.0,
+/// fmov immediate, an integer move plus copy for the MachO large code model,
+/// and finally a load from the constant pool. Returns 0 for other types.
+unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
+  // Positive zero (+0.0) has to be materialized with a fmov from the zero
+  // register, because the immediate version of fmov cannot encode zero.
+  if (CFP->isNullValue())
+    return fastMaterializeFloatZero(CFP);
+
+  const bool IsDouble = (VT == MVT::f64);
+  if (!IsDouble && VT != MVT::f32)
+    return 0;
+
+  const APFloat &Val = CFP->getValueAPF();
+
+  // Use an FMOV immediate when the value is encodable; otherwise fall through
+  // to the slower strategies below.
+  if (TLI.isFPImmLegal(Val, VT)) {
+    int Imm;
+    if (IsDouble)
+      Imm = AArch64_AM::getFP64Imm(Val);
+    else
+      Imm = AArch64_AM::getFP32Imm(Val);
+    assert((Imm != -1) && "Cannot encode floating-point constant.");
+    unsigned FMovOpc = IsDouble ? AArch64::FMOVDi : AArch64::FMOVSi;
+    return fastEmitInst_i(FMovOpc, TLI.getRegClassFor(VT), Imm);
+  }
+
+  // For the MachO large code model materialize the FP constant in code: move
+  // its bit pattern into a GPR, then copy that into an FP register.
+  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+    const TargetRegisterClass *IntRC =
+        IsDouble ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+    unsigned MovOpc = IsDouble ? AArch64::MOVi64imm : AArch64::MOVi32imm;
+
+    unsigned BitsReg = createResultReg(IntRC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovOpc), BitsReg)
+        .addImm(Val.bitcastToAPInt().getZExtValue());
+
+    unsigned FPReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), FPReg)
+        .addReg(BitsReg, getKillRegState(true));
+
+    return FPReg;
+  }
+
+  // Materialize via constant pool. MachineConstantPool wants an explicit
+  // alignment, so fall back to the allocation size when no preferred
+  // alignment is reported.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  if (Align == 0)
+    Align = DL.getTypeAllocSize(CFP->getType());
+  unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+
+  // ADRP of the constant pool entry's page ...
+  unsigned PageReg = createResultReg(&AArch64::GPR64commonRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+          PageReg)
+      .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE);
+
+  // ... followed by a load using the page offset.
+  unsigned LoadOpc = IsDouble ? AArch64::LDRDui : AArch64::LDRSui;
+  unsigned LoadedReg = createResultReg(TLI.getRegClassFor(VT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LoadOpc),
+          LoadedReg)
+      .addReg(PageReg)
+      .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+  return LoadedReg;
+}
+
+/// Materialize the address of the global \p GV into a register, using
+/// ADRP + LDR for GOT references and ADRP + ADD otherwise. Returns 0 for
+/// thread-local globals, for non-small code models on non-MachO targets, and
+/// when the pointer type is not a simple value type.
+unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
+  // We can't handle thread-local variables quickly yet.
+  if (GV->isThreadLocal())
+    return 0;
+
+  // MachO still uses GOT for large code-model accesses, but ELF requires
+  // movz/movk sequences, which FastISel doesn't handle yet.
+  if (!(TM.getCodeModel() == CodeModel::Small || Subtarget->isTargetMachO()))
+    return 0;
+
+  unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+  EVT DestEVT = TLI.getValueType(DL, GV->getType(), true);
+  if (!DestEVT.isSimple())
+    return 0;
+
+  unsigned PageReg = createResultReg(&AArch64::GPR64commonRegClass);
+
+  if ((OpFlags & AArch64II::MO_GOT) == 0) {
+    // Direct reference: ADRP + ADDX
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+            PageReg)
+        .addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+
+    unsigned AddrReg = createResultReg(&AArch64::GPR64spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+            AddrReg)
+        .addReg(PageReg)
+        .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
+        .addImm(0);
+    return AddrReg;
+  }
+
+  // GOT reference: ADRP + LDRX
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+          PageReg)
+      .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+  unsigned LoadedReg = createResultReg(&AArch64::GPR64RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
+          LoadedReg)
+      .addReg(PageReg)
+      .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+                                   AArch64II::MO_NC);
+  return LoadedReg;
+}
+
+/// Dispatch constant materialization by constant kind (integer,
+/// floating-point, global value). Returns the result register, or 0 when the
+/// type is not simple or the constant kind is not handled.
+unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) {
+  // Only handle simple types.
+  EVT CEVT = TLI.getValueType(DL, C->getType(), true);
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const auto *IntC = dyn_cast<ConstantInt>(C))
+    return materializeInt(IntC, VT);
+
+  if (const auto *FPC = dyn_cast<ConstantFP>(C))
+    return materializeFP(FPC, VT);
+
+  if (const auto *Global = dyn_cast<GlobalValue>(C))
+    return materializeGV(Global);
+
+  return 0;
+}
+
+/// Materialize +0.0 of type f32 or f64 with an fmov from the integer zero
+/// register of matching width. Returns 0 for illegal or other types.
+unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) {
+  assert(CFP->isNullValue() &&
+         "Floating-point constant is not a positive zero.");
+
+  MVT VT;
+  if (!isTypeLegal(CFP->getType(), VT))
+    return 0;
+
+  unsigned ZeroReg;
+  unsigned MovOpc;
+  if (VT == MVT::f64) {
+    ZeroReg = AArch64::XZR;
+    MovOpc = AArch64::FMOVXDr;
+  } else if (VT == MVT::f32) {
+    ZeroReg = AArch64::WZR;
+    MovOpc = AArch64::FMOVWSr;
+  } else {
+    return 0;
+  }
+
+  return fastEmitInst_r(MovOpc, TLI.getRegClassFor(VT), ZeroReg,
+                        /*IsKill=*/true);
+}
+
+/// \brief Return true when \p I is a multiply with a power-of-2 constant
+/// integer as either of its two operands.
+static bool isMulPowOf2(const Value *I) {
+  const auto *Mul = dyn_cast<MulOperator>(I);
+  if (!Mul)
+    return false;
+
+  // Operand 0 is inspected first, then operand 1.
+  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
+    const auto *C = dyn_cast<ConstantInt>(Mul->getOperand(OpIdx));
+    if (C && C->getValue().isPowerOf2())
+      return true;
+  }
+  return false;
+}
+
+// Computes the address to get to an object.
+bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
+{
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ // Don't walk into other basic blocks unless the object is an alloca from
+ // another block, otherwise it may not have a virtual register assigned.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (auto *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::BitCast: {
+ // Look through bitcasts.
+ return computeAddress(U->getOperand(0), Addr, Ty);
+ }
+ case Instruction::IntToPtr: {
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return computeAddress(U->getOperand(0), Addr, Ty);
+ break;
+ }
+ case Instruction::PtrToInt: {
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return computeAddress(U->getOperand(0), Addr, Ty);
+ break;
+ }
+ case Instruction::GetElementPtr: {
+ Address SavedAddr = Addr;
+ uint64_t TmpOffset = Addr.getOffset();
+
+ // Iterate through the GEP folding the constants into offsets where
+ // we can.
+ for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U);
+ GTI != E; ++GTI) {
+ const Value *Op = GTI.getOperand();
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ TmpOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ TmpOffset += CI->getSExtValue() * S;
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ TmpOffset += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ // Unsupported
+ goto unsupported_gep;
+ }
+ }
+ }
+
+ // Try to grab the base operand now.
+ Addr.setOffset(TmpOffset);
+ if (computeAddress(U->getOperand(0), Addr, Ty))
+ return true;
+
+ // We failed, restore everything and try the other options.
+ Addr = SavedAddr;
+
+ unsupported_gep:
+ break;
+ }
+ case Instruction::Alloca: {
+ const AllocaInst *AI = cast<AllocaInst>(Obj);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ Addr.setKind(Address::FrameIndexBase);
+ Addr.setFI(SI->second);
+ return true;
+ }
+ break;
+ }
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (isa<ConstantInt>(LHS))
+ std::swap(LHS, RHS);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() + CI->getSExtValue());
+ return computeAddress(LHS, Addr, Ty);
+ }
+
+ Address Backup = Addr;
+ if (computeAddress(LHS, Addr, Ty) && computeAddress(RHS, Addr, Ty))
+ return true;
+ Addr = Backup;
+
+ break;
+ }
+ case Instruction::Sub: {
+ // Subs of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() - CI->getSExtValue());
+ return computeAddress(LHS, Addr, Ty);
+ }
+ break;
+ }
+ case Instruction::Shl: {
+ if (Addr.getOffsetReg())
+ break;
+
+ const auto *CI = dyn_cast<ConstantInt>(U->getOperand(1));
+ if (!CI)
+ break;
+
+ unsigned Val = CI->getZExtValue();
+ if (Val < 1 || Val > 3)
+ break;
+
+ uint64_t NumBytes = 0;
+ if (Ty && Ty->isSized()) {
+ uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+ NumBytes = NumBits / 8;
+ if (!isPowerOf2_64(NumBits))
+ NumBytes = 0;
+ }
+
+ if (NumBytes != (1ULL << Val))
+ break;
+
+ Addr.setShift(Val);
+ Addr.setExtendType(AArch64_AM::LSL);
+
+ const Value *Src = U->getOperand(0);
+ if (const auto *I = dyn_cast<Instruction>(Src)) {
+ if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ // Fold the zext or sext when it won't become a noop.
+ if (const auto *ZE = dyn_cast<ZExtInst>(I)) {
+ if (!isIntExtFree(ZE) &&
+ ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ Src = ZE->getOperand(0);
+ }
+ } else if (const auto *SE = dyn_cast<SExtInst>(I)) {
+ if (!isIntExtFree(SE) &&
+ SE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::SXTW);
+ Src = SE->getOperand(0);
+ }
+ }
+ }
+ }
+
+ if (const auto *AI = dyn_cast<BinaryOperator>(Src))
+ if (AI->getOpcode() == Instruction::And) {
+ const Value *LHS = AI->getOperand(0);
+ const Value *RHS = AI->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(LHS))
+ if (C->getValue() == 0xffffffff)
+ std::swap(LHS, RHS);
+
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 0xffffffff) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ unsigned Reg = getRegForValue(LHS);
+ if (!Reg)
+ return false;
+ bool RegIsKill = hasTrivialKill(LHS);
+ Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+ AArch64::sub_32);
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ }
+
+ unsigned Reg = getRegForValue(Src);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ case Instruction::Mul: {
+ if (Addr.getOffsetReg())
+ break;
+
+ if (!isMulPowOf2(U))
+ break;
+
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ // Canonicalize power-of-2 value to the RHS.
+ if (const auto *C = dyn_cast<ConstantInt>(LHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(LHS, RHS);
+
+ assert(isa<ConstantInt>(RHS) && "Expected an ConstantInt.");
+ const auto *C = cast<ConstantInt>(RHS);
+ unsigned Val = C->getValue().logBase2();
+ if (Val < 1 || Val > 3)
+ break;
+
+ uint64_t NumBytes = 0;
+ if (Ty && Ty->isSized()) {
+ uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+ NumBytes = NumBits / 8;
+ if (!isPowerOf2_64(NumBits))
+ NumBytes = 0;
+ }
+
+ if (NumBytes != (1ULL << Val))
+ break;
+
+ Addr.setShift(Val);
+ Addr.setExtendType(AArch64_AM::LSL);
+
+ const Value *Src = LHS;
+ if (const auto *I = dyn_cast<Instruction>(Src)) {
+ if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ // Fold the zext or sext when it won't become a noop.
+ if (const auto *ZE = dyn_cast<ZExtInst>(I)) {
+ if (!isIntExtFree(ZE) &&
+ ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ Src = ZE->getOperand(0);
+ }
+ } else if (const auto *SE = dyn_cast<SExtInst>(I)) {
+ if (!isIntExtFree(SE) &&
+ SE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::SXTW);
+ Src = SE->getOperand(0);
+ }
+ }
+ }
+ }
+
+ unsigned Reg = getRegForValue(Src);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ case Instruction::And: {
+ if (Addr.getOffsetReg())
+ break;
+
+ if (!Ty || DL.getTypeSizeInBits(Ty) != 8)
+ break;
+
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(LHS))
+ if (C->getValue() == 0xffffffff)
+ std::swap(LHS, RHS);
+
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 0xffffffff) {
+ Addr.setShift(0);
+ Addr.setExtendType(AArch64_AM::LSL);
+ Addr.setExtendType(AArch64_AM::UXTW);
+
+ unsigned Reg = getRegForValue(LHS);
+ if (!Reg)
+ return false;
+ bool RegIsKill = hasTrivialKill(LHS);
+ Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+ AArch64::sub_32);
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ break;
+ }
+ case Instruction::SExt:
+ case Instruction::ZExt: {
+ if (!Addr.getReg() || Addr.getOffsetReg())
+ break;
+
+ const Value *Src = nullptr;
+ // Fold the zext or sext when it won't become a noop.
+ if (const auto *ZE = dyn_cast<ZExtInst>(U)) {
+ if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ Src = ZE->getOperand(0);
+ }
+ } else if (const auto *SE = dyn_cast<SExtInst>(U)) {
+ if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::SXTW);
+ Src = SE->getOperand(0);
+ }
+ }
+
+ if (!Src)
+ break;
+
+ Addr.setShift(0);
+ unsigned Reg = getRegForValue(Src);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ } // end switch
+
+ if (Addr.isRegBase() && !Addr.getReg()) {
+ unsigned Reg = getRegForValue(Obj);
+ if (!Reg)
+ return false;
+ Addr.setReg(Reg);
+ return true;
+ }
+
+ if (!Addr.getOffsetReg()) {
+ unsigned Reg = getRegForValue(Obj);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+
+ return false;
+}
+
+ /// Compute the address of the call target \p V into \p Addr.
+ ///
+ /// Bitcasts and pointer-sized inttoptr/ptrtoint casts are looked through when
+ /// they live in the block currently being selected. A GlobalValue is recorded
+ /// directly; anything else is materialized into a register.
+ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) {
+   const User *Usr = nullptr;
+   unsigned Opc = Instruction::UserOp1;
+   bool SameBlock = true;
+
+   if (const auto *Inst = dyn_cast<Instruction>(V)) {
+     Usr = Inst;
+     Opc = Inst->getOpcode();
+     SameBlock = Inst->getParent() == FuncInfo.MBB->getBasicBlock();
+   } else if (const auto *CE = dyn_cast<ConstantExpr>(V)) {
+     Usr = CE;
+     Opc = CE->getOpcode();
+   }
+
+   // Casts are only looked through if their operand is in the same BB.
+   if (SameBlock) {
+     bool IsTransparentCast = false;
+     switch (Opc) {
+     default:
+       break;
+     case Instruction::BitCast:
+       IsTransparentCast = true;
+       break;
+     case Instruction::IntToPtr:
+       // Only a no-op inttoptr (pointer-sized source) qualifies.
+       IsTransparentCast =
+           TLI.getValueType(DL, Usr->getOperand(0)->getType()) ==
+           TLI.getPointerTy(DL);
+       break;
+     case Instruction::PtrToInt:
+       // Only a no-op ptrtoint (pointer-sized result) qualifies.
+       IsTransparentCast =
+           TLI.getValueType(DL, Usr->getType()) == TLI.getPointerTy(DL);
+       break;
+     }
+     if (IsTransparentCast)
+       return computeCallAddress(Usr->getOperand(0), Addr);
+   }
+
+   if (const auto *Global = dyn_cast<GlobalValue>(V)) {
+     Addr.setGlobalValue(Global);
+     return true;
+   }
+
+   // A global value was already recorded; there is nothing more to try.
+   if (Addr.getGlobalValue())
+     return false;
+
+   // If all else fails, try to materialize the value in a register.
+   Addr.setReg(getRegForValue(V));
+   return Addr.getReg() != 0;
+ }
+
+
+ /// Map \p Ty to a simple value type in \p VT and report whether the target
+ /// considers it legal. \p VT is written whenever the type is simple, even if
+ /// the function then returns false.
+ bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
+   const EVT ValueTy = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
+
+   // Anything that is not a simple type is out of scope.
+   if (!ValueTy.isSimple() || ValueTy == MVT::Other)
+     return false;
+   VT = ValueTy.getSimpleVT();
+
+   // f128 is a legal type, but not one that fast-isel handles. Every other
+   // type is accepted if a register can hold it directly.
+   return VT != MVT::f128 && TLI.isTypeLegal(VT);
+ }
+
+ /// \brief Determine if the value type is supported by FastISel.
+ ///
+ /// FastISel for AArch64 can handle more value types than are legal: on top of
+ /// the legal types it also accepts the simple value types i1, i8, and i16.
+ /// Vector types are rejected unless \p IsVectorAllowed is set.
+ bool AArch64FastISel::isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed) {
+   if (!IsVectorAllowed && Ty->isVectorTy())
+     return false;
+
+   if (isTypeLegal(Ty, VT))
+     return true;
+
+   // Types that can be sign- or zero-extended to a basic operation are
+   // accepted as well.
+   switch (VT.SimpleTy) {
+   case MVT::i1:
+   case MVT::i8:
+   case MVT::i16:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// Return true if \p V is not an instruction, or is an instruction whose
+ /// parent block maps to the machine basic block currently being selected.
+ bool AArch64FastISel::isValueAvailable(const Value *V) const {
+   const auto *Inst = dyn_cast<Instruction>(V);
+   return !Inst || FuncInfo.MBBMap[Inst->getParent()] == FuncInfo.MBB;
+ }
+
+ /// Rewrite \p Addr into a form a load/store of type \p VT can encode, emitting
+ /// extra instructions where the immediate offset or the offset register
+ /// cannot be folded.
+ ///
+ /// \returns false if \p VT has no implicit scale factor or if one of the
+ /// helper instructions could not be emitted.
+ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
+   const unsigned ScaleFactor = getImplicitScaleFactor(VT);
+   if (!ScaleFactor)
+     return false;
+
+   const int64_t Offset = Addr.getOffset();
+
+   // Negative or unaligned offsets must fit a signed 9-bit immediate; positive
+   // aligned offsets must fit an unsigned 12-bit immediate once scaled.
+   const bool IsUnscaledOffset = (Offset < 0) || (Offset & (ScaleFactor - 1));
+   const bool ImmNeedsLowering =
+       IsUnscaledOffset ? !isInt<9>(Offset)
+                        : (Offset > 0 && !isUInt<12>(Offset / ScaleFactor));
+
+   // An offset register and an immediate offset cannot be encoded in the same
+   // instruction: the immediate is folded into the load/store and an extra add
+   // takes care of the offset register. Likewise, the zero register cannot be
+   // encoded as the base.
+   const bool RegNeedsLowering =
+       (!ImmNeedsLowering && Offset && Addr.getOffsetReg()) ||
+       (Addr.isRegBase() && Addr.getOffsetReg() && !Addr.getReg());
+
+   // For a frame-index base whose offset needs simplifying, put the alloca
+   // address into a register and switch the base kind back to register. This
+   // should almost never happen.
+   if (Addr.isFIBase() && (ImmNeedsLowering || Addr.getOffsetReg())) {
+     unsigned FIReg = createResultReg(&AArch64::GPR64spRegClass);
+     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+             FIReg)
+         .addFrameIndex(Addr.getFI())
+         .addImm(0)
+         .addImm(0);
+     Addr.setKind(Address::RegBase);
+     Addr.setReg(FIReg);
+   }
+
+   if (RegNeedsLowering) {
+     const auto ExtTy = Addr.getExtendType();
+     const bool IsWordExtend =
+         ExtTy == AArch64_AM::SXTW || ExtTy == AArch64_AM::UXTW;
+
+     unsigned SumReg = 0;
+     if (!Addr.getReg()) {
+       // No base register: the (extended, shifted) offset is the address.
+       if (IsWordExtend)
+         SumReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
+                             /*Op0IsKill=*/false, Addr.getShift(),
+                             /*IsZExt=*/ExtTy == AArch64_AM::UXTW);
+       else
+         SumReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(),
+                             /*Op0IsKill=*/false, Addr.getShift());
+     } else if (IsWordExtend) {
+       SumReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+                              /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+                              /*TODO:IsKill=*/false, ExtTy, Addr.getShift());
+     } else {
+       SumReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+                              /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+                              /*TODO:IsKill=*/false, AArch64_AM::LSL,
+                              Addr.getShift());
+     }
+     if (!SumReg)
+       return false;
+
+     // The offset register is now part of the base.
+     Addr.setReg(SumReg);
+     Addr.setOffsetReg(0);
+     Addr.setShift(0);
+     Addr.setExtendType(AArch64_AM::InvalidShiftExtend);
+   }
+
+   // The offset is too large for the load/store instruction, so compute
+   // reg+offset into a register instead.
+   if (ImmNeedsLowering) {
+     // With a base register, try to fold the immediate into an add; without
+     // one, materialize the offset as a constant.
+     const unsigned BaseReg =
+         Addr.getReg()
+             ? emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset)
+             : fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset);
+     if (!BaseReg)
+       return false;
+     Addr.setReg(BaseReg);
+     Addr.setOffset(0);
+   }
+   return true;
+ }
+
+ /// Append the address operands described by \p Addr to the load/store being
+ /// built by \p MIB, followed by the memory operand if there is one.
+ ///
+ /// For a frame-index base a new memory operand is created and replaces
+ /// \p MMO. For a register base the base and offset registers are constrained
+ /// to the register classes the instruction expects and written back to
+ /// \p Addr.
+ void AArch64FastISel::addLoadStoreOperands(Address &Addr,
+                                            const MachineInstrBuilder &MIB,
+                                            MachineMemOperand::Flags Flags,
+                                            unsigned ScaleFactor,
+                                            MachineMemOperand *MMO) {
+   const int64_t Offset = Addr.getOffset() / ScaleFactor;
+
+   if (Addr.isFIBase()) {
+     // Frame base works a bit differently. Handle it separately.
+     const int FI = Addr.getFI();
+     // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size
+     // and alignment should be based on the VT.
+     MMO = FuncInfo.MF->getMachineMemOperand(
+         MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
+         MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+     // Now add the rest of the operands.
+     MIB.addFrameIndex(FI).addImm(Offset);
+   } else {
+     assert(Addr.isRegBase() && "Unexpected address kind.");
+     const MCInstrDesc &Desc = MIB->getDesc();
+     // For a store the address operands sit one slot further along than for
+     // a load.
+     const unsigned StoreSkip = (Flags & MachineMemOperand::MOStore) ? 1 : 0;
+     const unsigned BaseOpIdx = Desc.getNumDefs() + StoreSkip;
+     Addr.setReg(constrainOperandRegClass(Desc, Addr.getReg(), BaseOpIdx));
+     Addr.setOffsetReg(
+         constrainOperandRegClass(Desc, Addr.getOffsetReg(), BaseOpIdx + 1));
+
+     if (!Addr.getOffsetReg()) {
+       MIB.addReg(Addr.getReg());
+       MIB.addImm(Offset);
+     } else {
+       assert(Addr.getOffset() == 0 && "Unexpected offset");
+       const bool IsSigned = Addr.getExtendType() == AArch64_AM::SXTW ||
+                             Addr.getExtendType() == AArch64_AM::SXTX;
+       MIB.addReg(Addr.getReg())
+           .addReg(Addr.getOffsetReg())
+           .addImm(IsSigned)
+           .addImm(Addr.getShift() != 0);
+     }
+   }
+
+   if (MMO)
+     MIB.addMemOperand(MMO);
+ }
+
+ /// Emit an add (\p UseAdd) or sub of \p LHS and \p RHS of type \p RetVT,
+ /// choosing between the immediate, extended-register, shifted-register and
+ /// plain register forms.
+ ///
+ /// i1, i8 and i16 operands are widened to i32 first. \returns the result
+ /// register, or 0 if the type is unsupported or an operand could not be
+ /// materialized.
+ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+                                      const Value *RHS, bool SetFlags,
+                                      bool WantResult, bool IsZExt) {
+   const MVT SrcVT = RetVT;
+   bool NeedExtend = false;
+   AArch64_AM::ShiftExtendType ExtendType = AArch64_AM::InvalidShiftExtend;
+   switch (SrcVT.SimpleTy) {
+   case MVT::i32:
+   case MVT::i64:
+     break;
+   case MVT::i1:
+     NeedExtend = true;
+     break;
+   case MVT::i8:
+     NeedExtend = true;
+     ExtendType = IsZExt ? AArch64_AM::UXTB : AArch64_AM::SXTB;
+     break;
+   case MVT::i16:
+     NeedExtend = true;
+     ExtendType = IsZExt ? AArch64_AM::UXTH : AArch64_AM::SXTH;
+     break;
+   default:
+     return 0;
+   }
+   // The operation itself is performed on at least 32 bits.
+   RetVT.SimpleTy = std::max(RetVT.SimpleTy, MVT::i32);
+
+   // Operands may only be swapped for add.
+   if (UseAdd) {
+     // Canonicalize immediates to the RHS first.
+     if (isa<Constant>(LHS) && !isa<Constant>(RHS))
+       std::swap(LHS, RHS);
+
+     // Canonicalize mul by power of 2 to the RHS.
+     if (LHS->hasOneUse() && isValueAvailable(LHS) && isMulPowOf2(LHS))
+       std::swap(LHS, RHS);
+
+     // Canonicalize shift immediate to the RHS.
+     if (LHS->hasOneUse() && isValueAvailable(LHS)) {
+       const auto *Shift = dyn_cast<BinaryOperator>(LHS);
+       if (Shift && isa<ConstantInt>(Shift->getOperand(1))) {
+         switch (Shift->getOpcode()) {
+         case Instruction::Shl:
+         case Instruction::LShr:
+         case Instruction::AShr:
+           std::swap(LHS, RHS);
+           break;
+         default:
+           break;
+         }
+       }
+     }
+   }
+
+   unsigned LHSReg = getRegForValue(LHS);
+   if (!LHSReg)
+     return 0;
+   const bool LHSIsKill = hasTrivialKill(LHS);
+
+   if (NeedExtend)
+     LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt);
+
+   // Try the immediate form first.
+   unsigned ResultReg = 0;
+   if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
+     const uint64_t Imm = IsZExt ? CI->getZExtValue() : CI->getSExtValue();
+     // A negative immediate is handled by flipping add/sub and negating it.
+     const bool Negate = CI->isNegative();
+     ResultReg = emitAddSub_ri(UseAdd != Negate, RetVT, LHSReg, LHSIsKill,
+                               Negate ? -Imm : Imm, SetFlags, WantResult);
+   } else if (const auto *Cst = dyn_cast<Constant>(RHS)) {
+     if (Cst->isNullValue())
+       ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags,
+                                 WantResult);
+   }
+
+   if (ResultReg)
+     return ResultReg;
+
+   // Only extend the RHS within the instruction if there is a valid extend type.
+   if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() &&
+       isValueAvailable(RHS)) {
+     // A left shift by less than 4 can be folded in together with the extend.
+     const Value *ExtSrc = RHS;
+     uint64_t ExtShift = 0;
+     if (const auto *Shift = dyn_cast<BinaryOperator>(RHS))
+       if (const auto *Amt = dyn_cast<ConstantInt>(Shift->getOperand(1)))
+         if (Shift->getOpcode() == Instruction::Shl &&
+             Amt->getZExtValue() < 4) {
+           ExtSrc = Shift->getOperand(0);
+           ExtShift = Amt->getZExtValue();
+         }
+
+     const unsigned ExtReg = getRegForValue(ExtSrc);
+     if (!ExtReg)
+       return 0;
+     const bool ExtIsKill = hasTrivialKill(ExtSrc);
+     return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, ExtReg, ExtIsKill,
+                          ExtendType, ExtShift, SetFlags, WantResult);
+   }
+
+   // Check if the mul can be folded into the instruction.
+   if (RHS->hasOneUse() && isValueAvailable(RHS) && isMulPowOf2(RHS)) {
+     const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+     const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+     // Put the power-of-2 constant on the right-hand side.
+     if (const auto *CI = dyn_cast<ConstantInt>(MulLHS))
+       if (CI->getValue().isPowerOf2())
+         std::swap(MulLHS, MulRHS);
+
+     assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+     const uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+     const unsigned MulReg = getRegForValue(MulLHS);
+     if (!MulReg)
+       return 0;
+     const bool MulIsKill = hasTrivialKill(MulLHS);
+     ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, MulReg,
+                               MulIsKill, AArch64_AM::LSL, ShiftVal, SetFlags,
+                               WantResult);
+     if (ResultReg)
+       return ResultReg;
+   }
+
+   // Check if the shift can be folded into the instruction.
+   if (RHS->hasOneUse() && isValueAvailable(RHS)) {
+     const auto *Shift = dyn_cast<BinaryOperator>(RHS);
+     const auto *Amt =
+         Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : nullptr;
+     if (Amt) {
+       AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend;
+       switch (Shift->getOpcode()) {
+       case Instruction::Shl:
+         ShiftType = AArch64_AM::LSL;
+         break;
+       case Instruction::LShr:
+         ShiftType = AArch64_AM::LSR;
+         break;
+       case Instruction::AShr:
+         ShiftType = AArch64_AM::ASR;
+         break;
+       default:
+         break;
+       }
+       const uint64_t ShiftVal = Amt->getZExtValue();
+       if (ShiftType != AArch64_AM::InvalidShiftExtend) {
+         const unsigned ShiftReg = getRegForValue(Shift->getOperand(0));
+         if (!ShiftReg)
+           return 0;
+         const bool ShiftIsKill = hasTrivialKill(Shift->getOperand(0));
+         ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, ShiftReg,
+                                   ShiftIsKill, ShiftType, ShiftVal, SetFlags,
+                                   WantResult);
+         if (ResultReg)
+           return ResultReg;
+       }
+     }
+   }
+
+   // Nothing could be folded: fall back to the register-register form.
+   unsigned RHSReg = getRegForValue(RHS);
+   if (!RHSReg)
+     return 0;
+   const bool RHSIsKill = hasTrivialKill(RHS);
+
+   if (NeedExtend)
+     RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt);
+
+   return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                        SetFlags, WantResult);
+ }
+
+ /// Emit a register-register ADD/SUB, or the flag-setting ADDS/SUBS form when
+ /// \p SetFlags is set.
+ ///
+ /// \returns the destination register (the zero register when \p WantResult
+ /// is false), or 0 if \p RetVT is neither i32 nor i64.
+ unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                         bool LHSIsKill, unsigned RHSReg,
+                                         bool RHSIsKill, bool SetFlags,
+                                         bool WantResult) {
+   assert(LHSReg && RHSReg && "Invalid register number.");
+
+   const bool Is64Bit = RetVT == MVT::i64;
+   if (!Is64Bit && RetVT != MVT::i32)
+     return 0;
+
+   // Indexed as [SetFlags][UseAdd][Is64Bit].
+   static const unsigned OpcTable[2][2][2] = {
+     { { AArch64::SUBWrr,  AArch64::SUBXrr  },
+       { AArch64::ADDWrr,  AArch64::ADDXrr  } },
+     { { AArch64::SUBSWrr, AArch64::SUBSXrr },
+       { AArch64::ADDSWrr, AArch64::ADDSXrr } }
+   };
+
+   // Without a wanted result the value is written to the zero register.
+   unsigned DstReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+   if (WantResult)
+     DstReg = createResultReg(Is64Bit ? &AArch64::GPR64RegClass
+                                      : &AArch64::GPR32RegClass);
+
+   const MCInstrDesc &Desc = TII.get(OpcTable[SetFlags][UseAdd][Is64Bit]);
+   LHSReg = constrainOperandRegClass(Desc, LHSReg, Desc.getNumDefs());
+   RHSReg = constrainOperandRegClass(Desc, RHSReg, Desc.getNumDefs() + 1);
+   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc, DstReg)
+       .addReg(LHSReg, getKillRegState(LHSIsKill))
+       .addReg(RHSReg, getKillRegState(RHSIsKill));
+   return DstReg;
+ }
+
+ /// Emit a register-immediate ADD/SUB, or ADDS/SUBS when \p SetFlags is set.
+ ///
+ /// \p Imm must either fit in 12 bits or be a 12-bit value shifted left by 12.
+ /// \returns the destination register (the zero register when \p WantResult
+ /// is false), or 0 if \p RetVT is neither i32 nor i64 or the immediate
+ /// cannot be encoded.
+ unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                         bool LHSIsKill, uint64_t Imm,
+                                         bool SetFlags, bool WantResult) {
+   assert(LHSReg && "Invalid register number.");
+
+   const bool Is64Bit = RetVT == MVT::i64;
+   if (!Is64Bit && RetVT != MVT::i32)
+     return 0;
+
+   // Pick between the plain and the "LSL #12" immediate encodings.
+   unsigned ShiftImm = 0;
+   if (!isUInt<12>(Imm)) {
+     if ((Imm & 0xfff000) != Imm)
+       return 0;
+     ShiftImm = 12;
+     Imm >>= 12;
+   }
+
+   // Indexed as [SetFlags][UseAdd][Is64Bit].
+   static const unsigned OpcTable[2][2][2] = {
+     { { AArch64::SUBWri,  AArch64::SUBXri  },
+       { AArch64::ADDWri,  AArch64::ADDXri  } },
+     { { AArch64::SUBSWri, AArch64::SUBSXri },
+       { AArch64::ADDSWri, AArch64::ADDSXri } }
+   };
+
+   // The flag-setting forms use the plain GPR classes for the result; the
+   // others use the "sp" classes.
+   const TargetRegisterClass *RC = nullptr;
+   if (Is64Bit)
+     RC = SetFlags ? &AArch64::GPR64RegClass : &AArch64::GPR64spRegClass;
+   else
+     RC = SetFlags ? &AArch64::GPR32RegClass : &AArch64::GPR32spRegClass;
+
+   unsigned DstReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+   if (WantResult)
+     DstReg = createResultReg(RC);
+
+   const MCInstrDesc &Desc = TII.get(OpcTable[SetFlags][UseAdd][Is64Bit]);
+   LHSReg = constrainOperandRegClass(Desc, LHSReg, Desc.getNumDefs());
+   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc, DstReg)
+       .addReg(LHSReg, getKillRegState(LHSIsKill))
+       .addImm(Imm)
+       .addImm(getShifterImm(AArch64_AM::LSL, ShiftImm));
+   return DstReg;
+ }
+
+ /// Emit a shifted-register ADD/SUB, or ADDS/SUBS when \p SetFlags is set,
+ /// where \p RHSReg is shifted by \p ShiftImm using \p ShiftType.
+ ///
+ /// \returns the destination register (the zero register when \p WantResult
+ /// is false), or 0 if \p RetVT is neither i32 nor i64 or the shift amount is
+ /// not smaller than the type width.
+ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                         bool LHSIsKill, unsigned RHSReg,
+                                         bool RHSIsKill,
+                                         AArch64_AM::ShiftExtendType ShiftType,
+                                         uint64_t ShiftImm, bool SetFlags,
+                                         bool WantResult) {
+   assert(LHSReg && RHSReg && "Invalid register number.");
+
+   const bool Is64Bit = RetVT == MVT::i64;
+   if (!Is64Bit && RetVT != MVT::i32)
+     return 0;
+
+   // Don't deal with undefined shifts.
+   if (ShiftImm >= RetVT.getSizeInBits())
+     return 0;
+
+   // Indexed as [SetFlags][UseAdd][Is64Bit].
+   static const unsigned OpcTable[2][2][2] = {
+     { { AArch64::SUBWrs,  AArch64::SUBXrs  },
+       { AArch64::ADDWrs,  AArch64::ADDXrs  } },
+     { { AArch64::SUBSWrs, AArch64::SUBSXrs },
+       { AArch64::ADDSWrs, AArch64::ADDSXrs } }
+   };
+
+   unsigned DstReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+   if (WantResult)
+     DstReg = createResultReg(Is64Bit ? &AArch64::GPR64RegClass
+                                      : &AArch64::GPR32RegClass);
+
+   const MCInstrDesc &Desc = TII.get(OpcTable[SetFlags][UseAdd][Is64Bit]);
+   LHSReg = constrainOperandRegClass(Desc, LHSReg, Desc.getNumDefs());
+   RHSReg = constrainOperandRegClass(Desc, RHSReg, Desc.getNumDefs() + 1);
+   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc, DstReg)
+       .addReg(LHSReg, getKillRegState(LHSIsKill))
+       .addReg(RHSReg, getKillRegState(RHSIsKill))
+       .addImm(getShifterImm(ShiftType, ShiftImm));
+   return DstReg;
+ }
+
+ /// Emit an extended-register ADD/SUB, or ADDS/SUBS when \p SetFlags is set,
+ /// where \p RHSReg is extended with \p ExtType and shifted left by
+ /// \p ShiftImm.
+ ///
+ /// \returns the destination register (the zero register when \p WantResult
+ /// is false), or 0 if \p RetVT is neither i32 nor i64 or \p ShiftImm is 4 or
+ /// more.
+ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                         bool LHSIsKill, unsigned RHSReg,
+                                         bool RHSIsKill,
+                                         AArch64_AM::ShiftExtendType ExtType,
+                                         uint64_t ShiftImm, bool SetFlags,
+                                         bool WantResult) {
+   assert(LHSReg && RHSReg && "Invalid register number.");
+
+   const bool Is64Bit = RetVT == MVT::i64;
+   if (!Is64Bit && RetVT != MVT::i32)
+     return 0;
+
+   if (ShiftImm >= 4)
+     return 0;
+
+   // Indexed as [SetFlags][UseAdd][Is64Bit].
+   static const unsigned OpcTable[2][2][2] = {
+     { { AArch64::SUBWrx,  AArch64::SUBXrx  },
+       { AArch64::ADDWrx,  AArch64::ADDXrx  } },
+     { { AArch64::SUBSWrx, AArch64::SUBSXrx },
+       { AArch64::ADDSWrx, AArch64::ADDSXrx } }
+   };
+
+   // The flag-setting forms use the plain GPR classes for the result; the
+   // others use the "sp" classes.
+   const TargetRegisterClass *RC = nullptr;
+   if (Is64Bit)
+     RC = SetFlags ? &AArch64::GPR64RegClass : &AArch64::GPR64spRegClass;
+   else
+     RC = SetFlags ? &AArch64::GPR32RegClass : &AArch64::GPR32spRegClass;
+
+   unsigned DstReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+   if (WantResult)
+     DstReg = createResultReg(RC);
+
+   const MCInstrDesc &Desc = TII.get(OpcTable[SetFlags][UseAdd][Is64Bit]);
+   LHSReg = constrainOperandRegClass(Desc, LHSReg, Desc.getNumDefs());
+   RHSReg = constrainOperandRegClass(Desc, RHSReg, Desc.getNumDefs() + 1);
+   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc, DstReg)
+       .addReg(LHSReg, getKillRegState(LHSIsKill))
+       .addReg(RHSReg, getKillRegState(RHSIsKill))
+       .addImm(getArithExtendImm(ExtType, ShiftImm));
+   return DstReg;
+ }
+
+ /// Emit a compare of \p LHS and \p RHS, dispatching on the type of \p LHS:
+ /// i1 through i64 go to emitICmp, f32 and f64 go to emitFCmp.
+ ///
+ /// \returns false for any other type or if the compare could not be emitted.
+ bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) {
+   const EVT CmpVT = TLI.getValueType(DL, LHS->getType(), true);
+   if (!CmpVT.isSimple())
+     return false;
+
+   const MVT VT = CmpVT.getSimpleVT();
+   switch (VT.SimpleTy) {
+   case MVT::f32:
+   case MVT::f64:
+     return emitFCmp(VT, LHS, RHS);
+   case MVT::i1:
+   case MVT::i8:
+   case MVT::i16:
+   case MVT::i32:
+   case MVT::i64:
+     return emitICmp(VT, LHS, RHS, IsZExt);
+   default:
+     return false;
+   }
+ }
+
+ /// Emit an integer compare as a flag-setting subtract whose result is not
+ /// wanted. \returns true if the subtract was emitted.
+ bool AArch64FastISel::emitICmp(MVT RetVT, const Value *LHS, const Value *RHS,
+                                bool IsZExt) {
+   const unsigned CmpReg = emitSub(RetVT, LHS, RHS, /*SetFlags=*/true,
+                                   /*WantResult=*/false, IsZExt);
+   return CmpReg != 0;
+ }
+
+ /// Emit an integer compare of \p LHSReg against the immediate \p Imm as a
+ /// flag-setting subtract whose result is not wanted. \returns true if the
+ /// subtract was emitted.
+ bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                                   uint64_t Imm) {
+   const unsigned CmpReg =
+       emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, Imm,
+                     /*SetFlags=*/true, /*WantResult=*/false);
+   return CmpReg != 0;
+ }
+
+ /// Emit a floating-point compare of \p LHS and \p RHS for f32 or f64.
+ ///
+ /// A positive-zero constant on the right-hand side uses the compare form
+ /// that takes only the left-hand register. \returns false for other types
+ /// or if an operand could not be materialized.
+ bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) {
+   const bool IsDouble = RetVT == MVT::f64;
+   if (!IsDouble && RetVT != MVT::f32)
+     return false;
+
+   // Check to see if the 2nd operand is a constant that we can encode directly
+   // in the compare.
+   const auto *CFP = dyn_cast<ConstantFP>(RHS);
+   const bool CmpWithZero = CFP && CFP->isZero() && !CFP->isNegative();
+
+   const unsigned LHSReg = getRegForValue(LHS);
+   if (!LHSReg)
+     return false;
+   const bool LHSIsKill = hasTrivialKill(LHS);
+
+   if (CmpWithZero) {
+     const unsigned Opc = IsDouble ? AArch64::FCMPDri : AArch64::FCMPSri;
+     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+         .addReg(LHSReg, getKillRegState(LHSIsKill));
+     return true;
+   }
+
+   const unsigned RHSReg = getRegForValue(RHS);
+   if (!RHSReg)
+     return false;
+   const bool RHSIsKill = hasTrivialKill(RHS);
+
+   const unsigned Opc = IsDouble ? AArch64::FCMPDrr : AArch64::FCMPSrr;
+   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+       .addReg(LHSReg, getKillRegState(LHSIsKill))
+       .addReg(RHSReg, getKillRegState(RHSIsKill));
+   return true;
+ }
+
+ /// Emit an add of \p LHS and \p RHS by forwarding to emitAddSub.
+ unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+                                   bool SetFlags, bool WantResult, bool IsZExt) {
+   const bool UseAdd = true;
+   return emitAddSub(UseAdd, RetVT, LHS, RHS, SetFlags, WantResult, IsZExt);
+ }
+
+ /// \brief This method is a wrapper to simplify add emission.
+ ///
+ /// First try to emit an add with an immediate operand using emitAddSub_ri; a
+ /// negative \p Imm is emitted as a subtract of its magnitude. If that fails,
+ /// then try to materialize the immediate into a register and use
+ /// emitAddSub_rr instead.
+ ///
+ /// \returns the result register, or 0 if the immediate could not be
+ /// materialized.
+ unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill,
+                                       int64_t Imm) {
+   unsigned ResultReg;
+   if (Imm < 0)
+     // Negate in the unsigned domain: "-Imm" on the signed value would be
+     // undefined behavior when Imm is the most negative int64_t.
+     ResultReg = emitAddSub_ri(/*UseAdd=*/false, VT, Op0, Op0IsKill,
+                               -static_cast<uint64_t>(Imm));
+   else
+     ResultReg = emitAddSub_ri(/*UseAdd=*/true, VT, Op0, Op0IsKill, Imm);
+
+   if (ResultReg)
+     return ResultReg;
+
+   // The immediate cannot be encoded; materialize it and add the registers.
+   unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm);
+   if (!CReg)
+     return 0;
+
+   ResultReg = emitAddSub_rr(/*UseAdd=*/true, VT, Op0, Op0IsKill, CReg,
+                             /*RHSIsKill=*/true);
+   return ResultReg;
+ }
+
+ /// Emit a subtract of \p RHS from \p LHS by forwarding to emitAddSub.
+ unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+                                   bool SetFlags, bool WantResult, bool IsZExt) {
+   const bool UseAdd = false;
+   return emitAddSub(UseAdd, RetVT, LHS, RHS, SetFlags, WantResult, IsZExt);
+ }
+
+ /// Emit a flag-setting register-register subtract via emitAddSub_rr.
+ unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg,
+                                       bool LHSIsKill, unsigned RHSReg,
+                                       bool RHSIsKill, bool WantResult) {
+   const bool UseAdd = false;
+   const bool SetFlags = true;
+   return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                        SetFlags, WantResult);
+ }
+
+ /// Emit a flag-setting shifted-register subtract via emitAddSub_rs.
+ unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg,
+                                       bool LHSIsKill, unsigned RHSReg,
+                                       bool RHSIsKill,
+                                       AArch64_AM::ShiftExtendType ShiftType,
+                                       uint64_t ShiftImm, bool WantResult) {
+   const bool UseAdd = false;
+   const bool SetFlags = true;
+   return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                        ShiftType, ShiftImm, SetFlags, WantResult);
+ }
+
+ /// Emit the logical operation \p ISDOpc on \p LHS and \p RHS, trying the
+ /// immediate form first, then the shifted-register form for a foldable
+ /// multiply or left shift, and finally the plain register-register form.
+ ///
+ /// \returns the result register, or 0 if an operand could not be
+ /// materialized.
+ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
+                                         const Value *LHS, const Value *RHS) {
+   // Canonicalize immediates to the RHS first.
+   if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS))
+     std::swap(LHS, RHS);
+
+   // Canonicalize mul by power-of-2 to the RHS.
+   if (LHS->hasOneUse() && isValueAvailable(LHS) && isMulPowOf2(LHS))
+     std::swap(LHS, RHS);
+
+   // Canonicalize shift immediate to the RHS.
+   if (LHS->hasOneUse() && isValueAvailable(LHS)) {
+     const auto *Shl = dyn_cast<ShlOperator>(LHS);
+     if (Shl && isa<ConstantInt>(Shl->getOperand(1)))
+       std::swap(LHS, RHS);
+   }
+
+   const unsigned LHSReg = getRegForValue(LHS);
+   if (!LHSReg)
+     return 0;
+   const bool LHSIsKill = hasTrivialKill(LHS);
+
+   // Try the immediate form.
+   if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
+     const uint64_t Imm = CI->getZExtValue();
+     if (unsigned ImmResult =
+             emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm))
+       return ImmResult;
+   }
+
+   // Check if the mul can be folded into the instruction.
+   if (RHS->hasOneUse() && isValueAvailable(RHS) && isMulPowOf2(RHS)) {
+     const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+     const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+     // Put the power-of-2 constant on the right-hand side.
+     if (const auto *CI = dyn_cast<ConstantInt>(MulLHS))
+       if (CI->getValue().isPowerOf2())
+         std::swap(MulLHS, MulRHS);
+
+     assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+     const uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+
+     const unsigned MulReg = getRegForValue(MulLHS);
+     if (!MulReg)
+       return 0;
+     const bool MulIsKill = hasTrivialKill(MulLHS);
+     if (unsigned Folded = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill,
+                                            MulReg, MulIsKill, ShiftVal))
+       return Folded;
+   }
+
+   // Check if the shift can be folded into the instruction.
+   if (RHS->hasOneUse() && isValueAvailable(RHS)) {
+     const auto *Shl = dyn_cast<ShlOperator>(RHS);
+     const auto *Amt =
+         Shl ? dyn_cast<ConstantInt>(Shl->getOperand(1)) : nullptr;
+     if (Amt) {
+       const uint64_t ShiftVal = Amt->getZExtValue();
+       const unsigned ShlReg = getRegForValue(Shl->getOperand(0));
+       if (!ShlReg)
+         return 0;
+       const bool ShlIsKill = hasTrivialKill(Shl->getOperand(0));
+       if (unsigned Folded = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill,
+                                              ShlReg, ShlIsKill, ShiftVal))
+         return Folded;
+     }
+   }
+
+   // Nothing could be folded: use the register-register form.
+   const unsigned RHSReg = getRegForValue(RHS);
+   if (!RHSReg)
+     return 0;
+   const bool RHSIsKill = hasTrivialKill(RHS);
+
+   // The operation is performed on at least 32 bits.
+   MVT VT = std::max(MVT::i32, RetVT.SimpleTy);
+   unsigned ResultReg =
+       fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+   // Mask i8/i16 results back down to their width.
+   if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+     const uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+     ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+   }
+   return ResultReg;
+ }
+
+ /// Emit the register-immediate form of the logical operation \p ISDOpc
+ /// (AND, OR or XOR).
+ ///
+ /// \returns the result register, or 0 if \p RetVT is not i1/i8/i16/i32/i64
+ /// or \p Imm is not a logical immediate for the register size.
+ unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT,
+                                            unsigned LHSReg, bool LHSIsKill,
+                                            uint64_t Imm) {
+   static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
+                 "ISD nodes are not consecutive!");
+   // Indexed as [ISDOpc - ISD::AND][Is64Bit].
+   static const unsigned OpcTable[3][2] = {
+     { AArch64::ANDWri, AArch64::ANDXri },
+     { AArch64::ORRWri, AArch64::ORRXri },
+     { AArch64::EORWri, AArch64::EORXri }
+   };
+
+   bool Is64Bit = false;
+   switch (RetVT.SimpleTy) {
+   case MVT::i1:
+   case MVT::i8:
+   case MVT::i16:
+   case MVT::i32:
+     break;
+   case MVT::i64:
+     Is64Bit = true;
+     break;
+   default:
+     return 0;
+   }
+
+   const unsigned Opc = OpcTable[ISDOpc - ISD::AND][Is64Bit];
+   const unsigned RegSize = Is64Bit ? 64 : 32;
+   const TargetRegisterClass *RC =
+       Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass;
+
+   if (!AArch64_AM::isLogicalImmediate(Imm, RegSize))
+     return 0;
+
+   unsigned ResultReg =
+       fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill,
+                       AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
+   // Mask i8/i16 results of OR and XOR back down to their width; AND is
+   // left as is.
+   if (ISDOpc != ISD::AND && RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+     const uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+     ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+   }
+   return ResultReg;
+ }
+
+ /// Emit the shifted-register form of the logical operation \p ISDOpc (AND,
+ /// OR or XOR), with \p RHSReg shifted left by \p ShiftImm.
+ ///
+ /// \returns the result register, or 0 if the shift amount is not smaller
+ /// than the type width or \p RetVT is not i1/i8/i16/i32/i64.
+ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
+                                            unsigned LHSReg, bool LHSIsKill,
+                                            unsigned RHSReg, bool RHSIsKill,
+                                            uint64_t ShiftImm) {
+   static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
+                 "ISD nodes are not consecutive!");
+   // Indexed as [ISDOpc - ISD::AND][Is64Bit].
+   static const unsigned OpcTable[3][2] = {
+     { AArch64::ANDWrs, AArch64::ANDXrs },
+     { AArch64::ORRWrs, AArch64::ORRXrs },
+     { AArch64::EORWrs, AArch64::EORXrs }
+   };
+
+   // Don't deal with undefined shifts.
+   if (ShiftImm >= RetVT.getSizeInBits())
+     return 0;
+
+   bool Is64Bit = false;
+   switch (RetVT.SimpleTy) {
+   case MVT::i1:
+   case MVT::i8:
+   case MVT::i16:
+   case MVT::i32:
+     break;
+   case MVT::i64:
+     Is64Bit = true;
+     break;
+   default:
+     return 0;
+   }
+
+   const unsigned Opc = OpcTable[ISDOpc - ISD::AND][Is64Bit];
+   const TargetRegisterClass *RC =
+       Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+   unsigned ResultReg =
+       fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                        AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm));
+   // Mask i8/i16 results back down to their width.
+   if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+     const uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+     ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+   }
+   return ResultReg;
+ }
+
+ /// Emit an AND of \p LHSReg with the immediate \p Imm via emitLogicalOp_ri.
+ unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                                      uint64_t Imm) {
+   const unsigned AndOpc = ISD::AND;
+   return emitLogicalOp_ri(AndOpc, RetVT, LHSReg, LHSIsKill, Imm);
+ }
+
+unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr,
+ bool WantZExt, MachineMemOperand *MMO) { /*
+ * Emit a load of a value of memory type VT from Addr and return the virtual
+ * register holding the result. Returns 0 when the access is rejected by
+ * allowsMisalignedMemoryAccesses() or when simplifyAddress() fails.
+ *
+ * RetVT is only ever compared against MVT::i64: it selects the 64-bit rows
+ * of the integer opcode table and, for sign-extending loads of i1/i8/i16/i32,
+ * the GPR64 result register class.
+ * WantZExt selects the zero-extending half of the integer opcode table;
+ * otherwise the sign-extending half is used.
+ * MMO is forwarded unchanged to addLoadStoreOperands().
+ *
+ * Post-processing: an i1 load is masked with AND #1, and a zero-extending
+ * load with RetVT == i64 and VT <= i32 is done as a 32-bit load whose result
+ * is wrapped in SUBREG_TO_REG to obtain a 64-bit register.
+ */
+ if (!TLI.allowsMisalignedMemoryAccesses(VT))
+ return 0;
+
+ // Simplify this down to something we can handle.
+ if (!simplifyAddress(Addr, VT))
+ return 0;
+
+ unsigned ScaleFactor = getImplicitScaleFactor(VT);
+ if (!ScaleFactor)
+ llvm_unreachable("Unexpected value type.");
+
+ // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+ // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+ bool UseScaled = true;
+ if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) { // negative, or not a multiple of the access size
+ UseScaled = false;
+ ScaleFactor = 1;
+ }
+
+ static const unsigned GPOpcTable[2][8][4] = { // [WantZExt][2 * Idx + IsRet64Bit][column]; columns: 0 = i1/i8, 1 = i16, 2 = i32, 3 = i64
+ // Sign-extend.
+ { { AArch64::LDURSBWi, AArch64::LDURSHWi, AArch64::LDURWi, // row 0: Idx 0 (unscaled immediate), 32-bit result
+ AArch64::LDURXi },
+ { AArch64::LDURSBXi, AArch64::LDURSHXi, AArch64::LDURSWi, // row 1: Idx 0, 64-bit result
+ AArch64::LDURXi },
+ { AArch64::LDRSBWui, AArch64::LDRSHWui, AArch64::LDRWui, // row 2: Idx 1 (scaled immediate), 32-bit result
+ AArch64::LDRXui },
+ { AArch64::LDRSBXui, AArch64::LDRSHXui, AArch64::LDRSWui, // row 3: Idx 1, 64-bit result
+ AArch64::LDRXui },
+ { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX, // row 4: Idx 2 (roX register offset), 32-bit result
+ AArch64::LDRXroX },
+ { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX, // row 5: Idx 2, 64-bit result
+ AArch64::LDRXroX },
+ { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW, // row 6: Idx 3 (roW register offset), 32-bit result
+ AArch64::LDRXroW },
+ { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW, // row 7: Idx 3, 64-bit result
+ AArch64::LDRXroW }
+ },
+ // Zero-extend.
+ { { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, // rows come in identical pairs: the 32- and 64-bit variants use the same opcode
+ AArch64::LDURXi },
+ { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi,
+ AArch64::LDURXi },
+ { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui,
+ AArch64::LDRXui },
+ { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui,
+ AArch64::LDRXui },
+ { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW,
+ AArch64::LDRXroW },
+ { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW,
+ AArch64::LDRXroW }
+ }
+ };
+
+ static const unsigned FPOpcTable[4][2] = { // [Idx][column]; columns: 0 = f32, 1 = f64
+ { AArch64::LDURSi, AArch64::LDURDi },
+ { AArch64::LDRSui, AArch64::LDRDui },
+ { AArch64::LDRSroX, AArch64::LDRDroX },
+ { AArch64::LDRSroW, AArch64::LDRDroW }
+ };
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() && // register base, zero immediate, and both base and offset registers set
+ Addr.getOffsetReg();
+ unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0; // 2 = register offset, 1 = scaled immediate, 0 = unscaled immediate
+ if (Addr.getExtendType() == AArch64_AM::UXTW ||
+ Addr.getExtendType() == AArch64_AM::SXTW)
+ Idx++; // NOTE(review): presumably only reached with Idx == 2 (selecting the roW forms) - confirm that extends imply a register offset
+
+ bool IsRet64Bit = RetVT == MVT::i64;
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected value type.");
+ case MVT::i1: // Intentional fall-through.
+ case MVT::i8:
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0];
+ RC = (IsRet64Bit && !WantZExt) ? // only a sign-extending load defines a 64-bit register directly
+ &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
+ break;
+ case MVT::i16:
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1];
+ RC = (IsRet64Bit && !WantZExt) ?
+ &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
+ break;
+ case MVT::i32:
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2];
+ RC = (IsRet64Bit && !WantZExt) ?
+ &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
+ break;
+ case MVT::i64:
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3];
+ RC = &AArch64::GPR64RegClass;
+ break;
+ case MVT::f32:
+ Opc = FPOpcTable[Idx][0];
+ RC = &AArch64::FPR32RegClass;
+ break;
+ case MVT::f64:
+ Opc = FPOpcTable[Idx][1];
+ RC = &AArch64::FPR64RegClass;
+ break;
+ }
+
+ // Create the base instruction, then add the operands.
+ unsigned ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg);
+ addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO);
+
+ // Loading an i1 requires special handling.
+ if (VT == MVT::i1) {
+ unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1); // keep only bit 0 of the loaded byte
+ assert(ANDReg && "Unexpected AND instruction emission failure.");
+ ResultReg = ANDReg;
+ }
+
+ // For zero-extending loads to 64bit we emit a 32bit load and then convert
+ // the 32bit reg to a 64bit reg.
+ if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) {
+ unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), Reg64)
+ .addImm(0)
+ .addReg(ResultReg, getKillRegState(true))
+ .addImm(AArch64::sub_32);
+ ResultReg = Reg64; // selectLoad() relies on this SUBREG_TO_REG being the last instruction emitted here
+ }
+ return ResultReg;
+}
+
+/// Select an IR add or sub instruction.
+///
+/// Vector operations are handed to selectOperator(); scalar ones are lowered
+/// through emitAdd()/emitSub(). Returns false when the type is unsupported or
+/// no result register could be produced.
+bool AArch64FastISel::selectAddSub(const Instruction *I) {
+  MVT OpVT;
+  const bool TypeOk =
+      isTypeSupported(I->getType(), OpVT, /*IsVectorAllowed=*/true);
+  if (!TypeOk)
+    return false;
+
+  const unsigned IROpcode = I->getOpcode();
+  if (OpVT.isVector())
+    return selectOperator(I, IROpcode);
+
+  const Value *Lhs = I->getOperand(0);
+  const Value *Rhs = I->getOperand(1);
+
+  unsigned Reg = 0;
+  if (IROpcode == Instruction::Add)
+    Reg = emitAdd(OpVT, Lhs, Rhs);
+  else if (IROpcode == Instruction::Sub)
+    Reg = emitSub(OpVT, Lhs, Rhs);
+  else
+    llvm_unreachable("Unexpected instruction.");
+
+  if (Reg == 0)
+    return false;
+
+  updateValueMap(I, Reg);
+  return true;
+}
+
+/// Select an IR and/or/xor instruction.
+///
+/// Vector operations are handed to selectOperator(); scalar ones are lowered
+/// through emitLogicalOp() with the matching ISD opcode. Returns false when
+/// the type is unsupported or no result register could be produced.
+bool AArch64FastISel::selectLogicalOp(const Instruction *I) {
+  MVT OpVT;
+  const bool TypeOk =
+      isTypeSupported(I->getType(), OpVT, /*IsVectorAllowed=*/true);
+  if (!TypeOk)
+    return false;
+
+  const unsigned IROpcode = I->getOpcode();
+  if (OpVT.isVector())
+    return selectOperator(I, IROpcode);
+
+  // Translate the IR opcode into the ISD opcode emitLogicalOp() expects.
+  unsigned ISDOpc;
+  switch (IROpcode) {
+  case Instruction::And:
+    ISDOpc = ISD::AND;
+    break;
+  case Instruction::Or:
+    ISDOpc = ISD::OR;
+    break;
+  case Instruction::Xor:
+    ISDOpc = ISD::XOR;
+    break;
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  }
+
+  const unsigned Reg =
+      emitLogicalOp(ISDOpc, OpVT, I->getOperand(0), I->getOperand(1));
+  if (Reg == 0)
+    return false;
+
+  updateValueMap(I, Reg);
+  return true;
+}
+
+bool AArch64FastISel::selectLoad(const Instruction *I) { /*
+ * Select an IR load instruction. Returns false when the load is not handled
+ * here: unsupported type, atomic load, a pointer that is a swifterror
+ * argument/alloca (when the target supports swifterror), an address that
+ * computeAddress() rejects, or a failure inside emitLoad().
+ *
+ * If the load has exactly one use and that user is a zext/sext to a
+ * supported type, the extend is folded into the load: emitLoad() is asked
+ * for the extended type (RetVT) and the bookkeeping after the call
+ * reconciles the value map depending on whether the extend has already been
+ * emitted.
+ */
+ MVT VT;
+ // Verify we have a legal type before going any further. Currently, we handle
+ // simple types that will directly fit in a register (i32/f32/i64/f64) or
+ // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true) ||
+ cast<LoadInst>(I)->isAtomic())
+ return false;
+
+ const Value *SV = I->getOperand(0); // the pointer operand of the load
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!computeAddress(I->getOperand(0), Addr, I->getType()))
+ return false;
+
+ // Fold the following sign-/zero-extend into the load instruction.
+ bool WantZExt = true;
+ MVT RetVT = VT;
+ const Value *IntExtVal = nullptr; // the zext/sext user being folded, if any
+ if (I->hasOneUse()) {
+ if (const auto *ZE = dyn_cast<ZExtInst>(I->use_begin()->getUser())) {
+ if (isTypeSupported(ZE->getType(), RetVT))
+ IntExtVal = ZE;
+ else
+ RetVT = VT; // restore RetVT after the failed isTypeSupported() call, which takes it by reference
+ } else if (const auto *SE = dyn_cast<SExtInst>(I->use_begin()->getUser())) {
+ if (isTypeSupported(SE->getType(), RetVT))
+ IntExtVal = SE;
+ else
+ RetVT = VT;
+ WantZExt = false; // applies to every sext user, whether or not it is folded
+ }
+ }
+
+ unsigned ResultReg =
+ emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I));
+ if (!ResultReg)
+ return false;
+
+ // There are a few different cases we have to handle, because the load or the
+ // sign-/zero-extend might not be selected by FastISel if we fall-back to
+ // SelectionDAG. There is also an ordering issue when both instructions are in
+ // different basic blocks.
+ // 1.) The load instruction is selected by FastISel, but the integer extend
+ // not. This usually happens when the integer extend is in a different
+ // basic block and SelectionDAG took over for that basic block.
+ // 2.) The load instruction is selected before the integer extend. This only
+ // happens when the integer extend is in a different basic block.
+ // 3.) The load instruction is selected by SelectionDAG and the integer extend
+ // by FastISel. This happens if there are instructions between the load
+ // and the integer extend that couldn't be selected by FastISel.
+ if (IntExtVal) {
+ // The integer extend hasn't been emitted yet. FastISel or SelectionDAG
+ // could select it. Emit a copy to subreg if necessary. FastISel will remove
+ // it when it selects the integer extend.
+ unsigned Reg = lookUpRegForValue(IntExtVal);
+ auto *MI = MRI.getUniqueVRegDef(Reg);
+ if (!MI) {
+ if (RetVT == MVT::i64 && VT <= MVT::i32) {
+ if (WantZExt) {
+ // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG).
+ std::prev(FuncInfo.InsertPt)->eraseFromParent();
+ ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg(); // def of the instruction that preceded the erased SUBREG_TO_REG
+ } else
+ ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg,
+ /*IsKill=*/true,
+ AArch64::sub_32);
+ }
+ updateValueMap(I, ResultReg); // the load, not the extend, is mapped to the register
+ return true;
+ }
+
+ // The integer extend has already been emitted - delete all the instructions
+ // that have been emitted by the integer extend lowering code and use the
+ // result from the load instruction directly.
+ while (MI) {
+ Reg = 0;
+ for (auto &Opnd : MI->uses()) { // find the first register use so the chain can be followed to its def
+ if (Opnd.isReg()) {
+ Reg = Opnd.getReg();
+ break;
+ }
+ }
+ MI->eraseFromParent();
+ MI = nullptr;
+ if (Reg)
+ MI = MRI.getUniqueVRegDef(Reg);
+ }
+ updateValueMap(IntExtVal, ResultReg); // the extend is mapped directly to the load's (extended) result
+ return true;
+ }
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+/// Emit a store-release (STLRB/STLRH/STLRW/STLRX) of SrcReg to the address
+/// held in AddrReg.
+///
+/// Only i8, i16, i32 and i64 are handled; any other type returns false
+/// without emitting anything. MMO is attached to the new instruction.
+bool AArch64FastISel::emitStoreRelease(MVT VT, unsigned SrcReg,
+                                       unsigned AddrReg,
+                                       MachineMemOperand *MMO) {
+  const auto Ty = VT.SimpleTy;
+  unsigned StlrOpc;
+  if (Ty == MVT::i8)
+    StlrOpc = AArch64::STLRB;
+  else if (Ty == MVT::i16)
+    StlrOpc = AArch64::STLRH;
+  else if (Ty == MVT::i32)
+    StlrOpc = AArch64::STLRW;
+  else if (Ty == MVT::i64)
+    StlrOpc = AArch64::STLRX;
+  else
+    return false;
+
+  // Operand 0 is the stored value, operand 1 the base address register.
+  const MCInstrDesc &Desc = TII.get(StlrOpc);
+  const unsigned ValueReg = constrainOperandRegClass(Desc, SrcReg, 0);
+  const unsigned BaseReg = constrainOperandRegClass(Desc, AddrReg, 1);
+
+  MachineInstrBuilder Store =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
+  Store.addReg(ValueReg);
+  Store.addReg(BaseReg);
+  Store.addMemOperand(MMO);
+  return true;
+}
+
+/// Emit a store of SrcReg, interpreted as a value of type VT, to Addr.
+///
+/// Returns false when the access is rejected by
+/// allowsMisalignedMemoryAccesses() or when simplifyAddress() fails. An i1
+/// value is masked with AND #1 before it is stored, unless the source is WZR.
+/// MMO is forwarded to addLoadStoreOperands().
+bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr,
+                                MachineMemOperand *MMO) {
+  // Reject the access, or reduce the address to a form we can encode.
+  if (!TLI.allowsMisalignedMemoryAccesses(VT) || !simplifyAddress(Addr, VT))
+    return false;
+
+  unsigned Scale = getImplicitScaleFactor(VT);
+  if (Scale == 0)
+    llvm_unreachable("Unexpected value type.");
+
+  // A negative or unaligned offset needs the unscaled, 9-bit, signed
+  // immediate form; everything else can use the scaled, 12-bit, unsigned
+  // immediate form.
+  const bool NeedsUnscaled =
+      Addr.getOffset() < 0 || (Addr.getOffset() & (Scale - 1)) != 0;
+  if (NeedsUnscaled)
+    Scale = 1;
+
+  // Rows: unscaled imm, scaled imm, register offset (X), register offset (W).
+  // Columns: i1/i8, i16, i32, i64, f32, f64.
+  static const unsigned StoreOpcodes[4][6] = {
+    { AArch64::STURBBi,  AArch64::STURHHi,  AArch64::STURWi,
+      AArch64::STURXi,   AArch64::STURSi,   AArch64::STURDi },
+    { AArch64::STRBBui,  AArch64::STRHHui,  AArch64::STRWui,
+      AArch64::STRXui,   AArch64::STRSui,   AArch64::STRDui },
+    { AArch64::STRBBroX, AArch64::STRHHroX, AArch64::STRWroX,
+      AArch64::STRXroX,  AArch64::STRSroX,  AArch64::STRDroX },
+    { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW,
+      AArch64::STRXroW,  AArch64::STRSroW,  AArch64::STRDroW }
+  };
+
+  const bool HasRegOffset = Addr.isRegBase() && !Addr.getOffset() &&
+                            Addr.getReg() && Addr.getOffsetReg();
+  unsigned Row;
+  if (HasRegOffset)
+    Row = 2;
+  else if (!NeedsUnscaled)
+    Row = 1;
+  else
+    Row = 0;
+  const auto ExtKind = Addr.getExtendType();
+  if (ExtKind == AArch64_AM::UXTW || ExtKind == AArch64_AM::SXTW)
+    ++Row;
+
+  unsigned Column;
+  switch (VT.SimpleTy) {
+  case MVT::i1: // Stored as a byte; masked below.
+  case MVT::i8:
+    Column = 0;
+    break;
+  case MVT::i16:
+    Column = 1;
+    break;
+  case MVT::i32:
+    Column = 2;
+    break;
+  case MVT::i64:
+    Column = 3;
+    break;
+  case MVT::f32:
+    Column = 4;
+    break;
+  case MVT::f64:
+    Column = 5;
+    break;
+  default:
+    llvm_unreachable("Unexpected value type.");
+  }
+  const unsigned Opc = StoreOpcodes[Row][Column];
+
+  // Storing an i1 requires special handling: only bit 0 is meaningful.
+  const bool IsI1 = VT.SimpleTy == MVT::i1;
+  if (IsI1 && SrcReg != AArch64::WZR) {
+    unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+    assert(ANDReg && "Unexpected AND instruction emission failure.");
+    SrcReg = ANDReg;
+  }
+
+  // Build the store, then append the address operands.
+  const MCInstrDesc &Desc = TII.get(Opc);
+  SrcReg = constrainOperandRegClass(Desc, SrcReg, Desc.getNumDefs());
+  MachineInstrBuilder Store =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
+  Store.addReg(SrcReg);
+  addLoadStoreOperands(Addr, Store, MachineMemOperand::MOStore, Scale, MMO);
+
+  return true;
+}
+
+/// Select an IR store instruction.
+///
+/// Returns false for unsupported value types, for stores through a
+/// swifterror argument/alloca when the target supports swifterror, and
+/// whenever the stored value, the address, or the store itself cannot be
+/// emitted. Integer zero and positive floating-point zero are stored straight
+/// from WZR/XZR. Atomic stores with release-or-stronger ordering go through
+/// emitStoreRelease(); every other store goes through emitStore().
+bool AArch64FastISel::selectStore(const Instruction *I) {
+  MVT VT;
+  const Value *Op0 = I->getOperand(0);
+  // Verify we have a legal type before going any further. Currently, we handle
+  // simple types that will directly fit in a register (i32/f32/i64/f64) or
+  // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+  if (!isTypeSupported(Op0->getType(), VT, /*IsVectorAllowed=*/true))
+    return false;
+
+  const Value *PtrV = I->getOperand(1);
+  if (TLI.supportSwiftError()) {
+    // Swifterror values can come from either a function parameter with
+    // swifterror attribute or an alloca with swifterror attribute.
+    if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+      if (Arg->hasSwiftErrorAttr())
+        return false;
+    }
+
+    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+      if (Alloca->isSwiftError())
+        return false;
+    }
+  }
+
+  // Get the value to be stored into a register. Use the zero register directly
+  // when possible to avoid an unnecessary copy and a wasted register.
+  unsigned SrcReg = 0;
+  if (const auto *CI = dyn_cast<ConstantInt>(Op0)) {
+    if (CI->isZero())
+      SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+  } else if (const auto *CF = dyn_cast<ConstantFP>(Op0)) {
+    if (CF->isZero() && !CF->isNegative()) {
+      // +0.0 has an all-zero bit pattern, so store it as an integer of the
+      // same width from the zero register.
+      VT = MVT::getIntegerVT(VT.getSizeInBits());
+      SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+    }
+  }
+
+  if (!SrcReg)
+    SrcReg = getRegForValue(Op0);
+
+  if (!SrcReg)
+    return false;
+
+  auto *SI = cast<StoreInst>(I);
+
+  // Try to emit a STLR for seq_cst/release.
+  if (SI->isAtomic()) {
+    AtomicOrdering Ord = SI->getOrdering();
+    // The non-atomic instructions are sufficient for relaxed stores.
+    if (isReleaseOrStronger(Ord)) {
+      // The STLR addressing mode only supports a base reg; pass that directly.
+      unsigned AddrReg = getRegForValue(PtrV);
+      // Bail out if the pointer could not be materialized; otherwise an STLR
+      // with a null base register would be built.
+      if (!AddrReg)
+        return false;
+      return emitStoreRelease(VT, SrcReg, AddrReg,
+                              createMachineMemOperandFor(I));
+    }
+  }
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!computeAddress(PtrV, Addr, Op0->getType()))
+    return false;
+
+  return emitStore(VT, SrcReg, Addr, createMachineMemOperandFor(I));
+}
+
+/// Map an IR compare predicate onto the AArch64 condition code that tests it.
+///
+/// AArch64CC::AL is returned as a "no single condition code" marker: that is
+/// the case for FCMP_ONE and FCMP_UEQ, which need more than one compare, and
+/// for any predicate not listed here.
+static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
+  switch (Pred) {
+  // Equality.
+  case CmpInst::ICMP_EQ:
+  case CmpInst::FCMP_OEQ:
+    return AArch64CC::EQ;
+  case CmpInst::ICMP_NE:
+  case CmpInst::FCMP_UNE:
+    return AArch64CC::NE;
+
+  // Signed-style conditions.
+  case CmpInst::ICMP_SGT:
+  case CmpInst::FCMP_OGT:
+    return AArch64CC::GT;
+  case CmpInst::ICMP_SGE:
+  case CmpInst::FCMP_OGE:
+    return AArch64CC::GE;
+  case CmpInst::ICMP_SLT:
+  case CmpInst::FCMP_ULT:
+    return AArch64CC::LT;
+  case CmpInst::ICMP_SLE:
+  case CmpInst::FCMP_ULE:
+    return AArch64CC::LE;
+
+  // Unsigned-style conditions.
+  case CmpInst::ICMP_UGT:
+  case CmpInst::FCMP_UGT:
+    return AArch64CC::HI;
+  case CmpInst::ICMP_UGE:
+    return AArch64CC::HS;
+  case CmpInst::ICMP_ULT:
+    return AArch64CC::LO;
+  case CmpInst::ICMP_ULE:
+  case CmpInst::FCMP_OLE:
+    return AArch64CC::LS;
+
+  // Floating-point only.
+  case CmpInst::FCMP_OLT:
+    return AArch64CC::MI;
+  case CmpInst::FCMP_UGE:
+    return AArch64CC::PL;
+  case CmpInst::FCMP_ORD:
+    return AArch64CC::VC;
+  case CmpInst::FCMP_UNO:
+    return AArch64CC::VS;
+
+  // AL is our "false" for now. FCMP_ONE and FCMP_UEQ need more compares.
+  case CmpInst::FCMP_ONE:
+  case CmpInst::FCMP_UEQ:
+  default:
+    return AArch64CC::AL;
+  }
+}
+
+/// \brief Try to emit a combined compare-and-branch instruction.
+bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { /*
+ * Tries to lower the branch BI, whose condition must be a CmpInst, to a
+ * single CBZ/CBNZ or TBZ/TBNZ. Returns false when the pattern does not match
+ * or the source register cannot be obtained.
+ *
+ * Patterns handled (after optional inversion for fallthrough):
+ *  - icmp eq/ne X, null-value : CBZ/CBNZ on X. If X is an available `and`
+ *    with a power-of-two constant, or the type is i1, a single-bit
+ *    TBZ/TBNZ is used instead.
+ *  - icmp slt/sge X, null-value : TBNZ/TBZ on the sign bit (BW - 1).
+ *  - icmp sgt/sle X, -1         : TBZ/TBNZ on the sign bit (BW - 1).
+ */
+ assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction");
+ const CmpInst *CI = cast<CmpInst>(BI->getCondition());
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ const Value *LHS = CI->getOperand(0);
+ const Value *RHS = CI->getOperand(1);
+
+ MVT VT;
+ if (!isTypeSupported(LHS->getType(), VT))
+ return false;
+
+ unsigned BW = VT.getSizeInBits();
+ if (BW > 64)
+ return false;
+
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ // Try to take advantage of fallthrough opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ int TestBit = -1; // -1 means "no single-bit test"; otherwise the bit index for TBZ/TBNZ
+ bool IsCmpNE; // assigned in every switch case that does not return
+ switch (Predicate) {
+ default:
+ return false;
+ case CmpInst::ICMP_EQ:
+ case CmpInst::ICMP_NE:
+ if (isa<Constant>(LHS) && cast<Constant>(LHS)->isNullValue())
+ std::swap(LHS, RHS); // canonicalize the null constant to the right-hand side
+
+ if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
+ return false;
+
+ if (const auto *AI = dyn_cast<BinaryOperator>(LHS))
+ if (AI->getOpcode() == Instruction::And && isValueAvailable(AI)) {
+ const Value *AndLHS = AI->getOperand(0);
+ const Value *AndRHS = AI->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(AndLHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(AndLHS, AndRHS); // put a power-of-two mask on the right
+
+ if (const auto *C = dyn_cast<ConstantInt>(AndRHS))
+ if (C->getValue().isPowerOf2()) {
+ TestBit = C->getValue().logBase2();
+ LHS = AndLHS; // test the bit on the un-masked operand directly
+ }
+ }
+
+ if (VT == MVT::i1)
+ TestBit = 0;
+
+ IsCmpNE = Predicate == CmpInst::ICMP_NE;
+ break;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SGE:
+ if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
+ return false;
+
+ TestBit = BW - 1; // sign bit
+ IsCmpNE = Predicate == CmpInst::ICMP_SLT; // X < 0 exactly when the sign bit is set
+ break;
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SLE:
+ if (!isa<ConstantInt>(RHS))
+ return false;
+
+ if (cast<ConstantInt>(RHS)->getValue() != APInt(BW, -1, true))
+ return false;
+
+ TestBit = BW - 1; // sign bit
+ IsCmpNE = Predicate == CmpInst::ICMP_SLE; // X <= -1 exactly when the sign bit is set
+ break;
+ } // end switch
+
+ static const unsigned OpcTable[2][2][2] = { // [IsBitTest][IsCmpNE][Is64Bit]
+ { {AArch64::CBZW, AArch64::CBZX },
+ {AArch64::CBNZW, AArch64::CBNZX} },
+ { {AArch64::TBZW, AArch64::TBZX },
+ {AArch64::TBNZW, AArch64::TBNZX} }
+ };
+
+ bool IsBitTest = TestBit != -1;
+ bool Is64Bit = BW == 64;
+ if (TestBit < 32 && TestBit >= 0)
+ Is64Bit = false; // a bit below 32 is tested with the W form, on the low 32-bit subregister if needed
+
+ unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit];
+ const MCInstrDesc &II = TII.get(Opc);
+
+ unsigned SrcReg = getRegForValue(LHS);
+ if (!SrcReg)
+ return false;
+ bool SrcIsKill = hasTrivialKill(LHS);
+
+ if (BW == 64 && !Is64Bit)
+ SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
+ AArch64::sub_32);
+
+ if ((BW < 32) && !IsBitTest)
+ SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true); // CBZ/CBNZ compare the whole W register, so zero-extend narrow values first
+
+ // Emit the combined compare and branch instruction.
+ SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs());
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
+ if (IsBitTest)
+ MIB.addImm(TestBit);
+ MIB.addMBB(TBB);
+
+ finishCondBranch(BI->getParent(), TBB, FBB);
+ return true;
+}
+
+bool AArch64FastISel::selectBranch(const Instruction *I) { /*
+ * Select an IR branch instruction. Strategies, in the order they are tried:
+ *  - Unconditional branch: fastEmitBranch() to the single successor.
+ *  - Condition is a single-use, available CmpInst: FCMP_FALSE/FCMP_TRUE fold
+ *    to an unconditional branch; otherwise emitCompareAndBranch() is tried;
+ *    otherwise the compare is emitted followed by Bcc (two Bcc instructions
+ *    for FCMP_UEQ and FCMP_ONE).
+ *  - Condition is a ConstantInt: an unconditional B to the taken successor.
+ *  - Condition folds via foldXALUIntrinsic(): Bcc on the condition code it
+ *    returns.
+ *  - Anything else (including compares that are multi-use or unavailable):
+ *    bit 0 of the condition register is tested with TBNZ/TBZ.
+ * Returns false when a required register or compare cannot be emitted.
+ */
+ const BranchInst *BI = cast<BranchInst>(I);
+ if (BI->isUnconditional()) {
+ MachineBasicBlock *MSucc = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ fastEmitBranch(MSucc, BI->getDebugLoc());
+ return true;
+ }
+
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse() && isValueAvailable(CI)) {
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_FALSE:
+ fastEmitBranch(FBB, DbgLoc);
+ return true;
+ case CmpInst::FCMP_TRUE:
+ fastEmitBranch(TBB, DbgLoc);
+ return true;
+ }
+
+ // Try to emit a combined compare-and-branch first.
+ if (emitCompareAndBranch(BI))
+ return true;
+
+ // Try to take advantage of fallthrough opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ // Emit the cmp.
+ if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ return false;
+
+ // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch
+ // instruction.
+ AArch64CC::CondCode CC = getCompareCC(Predicate);
+ AArch64CC::CondCode ExtraCC = AArch64CC::AL; // AL here means "no extra branch needed"
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_UEQ:
+ ExtraCC = AArch64CC::EQ;
+ CC = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_ONE:
+ ExtraCC = AArch64CC::MI;
+ CC = AArch64CC::GT;
+ break;
+ }
+ assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+
+ // Emit the extra branch for FCMP_UEQ and FCMP_ONE.
+ if (ExtraCC != AArch64CC::AL) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(ExtraCC)
+ .addMBB(TBB); // both branches target TBB, so the branch is taken if either condition holds
+ }
+
+ // Emit the branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+
+ finishCondBranch(BI->getParent(), TBB, FBB);
+ return true;
+ }
+ } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
+ uint64_t Imm = CI->getZExtValue();
+ MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; // constant condition: the destination is known statically
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
+ .addMBB(Target);
+
+ // Obtain the branch probability and add the target to the successor list.
+ if (FuncInfo.BPI) {
+ auto BranchProbability = FuncInfo.BPI->getEdgeProbability(
+ BI->getParent(), Target->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(Target, BranchProbability);
+ } else
+ FuncInfo.MBB->addSuccessorWithoutProb(Target);
+ return true;
+ } else {
+ AArch64CC::CondCode CC = AArch64CC::NE;
+ if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
+ // Fake request the condition, otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned CondReg = getRegForValue(BI->getCondition());
+ if (!CondReg)
+ return false;
+
+ // Emit the branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+
+ finishCondBranch(BI->getParent(), TBB, FBB);
+ return true;
+ }
+ }
+
+ unsigned CondReg = getRegForValue(BI->getCondition()); // generic fallback: branch on the materialized condition value
+ if (CondReg == 0)
+ return false;
+ bool CondRegIsKill = hasTrivialKill(BI->getCondition());
+
+ // i1 conditions come as i32 values, test the lowest bit with tb(n)z.
+ unsigned Opcode = AArch64::TBNZW;
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ Opcode = AArch64::TBZW; // inverted test so the original true block can be the fallthrough
+ }
+
+ const MCInstrDesc &II = TII.get(Opcode);
+ unsigned ConstrainedCondReg
+ = constrainOperandRegClass(II, CondReg, II.getNumDefs());
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill))
+ .addImm(0)
+ .addMBB(TBB);
+
+ finishCondBranch(BI->getParent(), TBB, FBB);
+ return true;
+}
+
+/// Select an IR indirectbr instruction as a BR through the address register.
+///
+/// Every IR successor is added to the current machine basic block's
+/// successor list. Returns false when the address cannot be materialized.
+bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
+  const auto *IBI = cast<IndirectBrInst>(I);
+
+  unsigned TargetReg = getRegForValue(IBI->getOperand(0));
+  if (!TargetReg)
+    return false;
+
+  // Emit the indirect branch itself.
+  const MCInstrDesc &BrDesc = TII.get(AArch64::BR);
+  TargetReg = constrainOperandRegClass(BrDesc, TargetReg, BrDesc.getNumDefs());
+  MachineInstrBuilder Br =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, BrDesc);
+  Br.addReg(TargetReg);
+
+  // Keep the machine CFG in sync with the IR successors.
+  for (unsigned Idx = 0, End = IBI->getNumSuccessors(); Idx != End; ++Idx)
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IBI->getSuccessor(Idx)]);
+
+  return true;
+}
+
+/// Select an IR icmp/fcmp whose result is materialized in a 32-bit GPR.
+///
+/// FCMP_FALSE/FCMP_TRUE fold to constants. Otherwise the compare is emitted
+/// and the result is produced with CSINC from WZR; FCMP_UEQ and FCMP_ONE need
+/// two chained CSINC instructions. Returns false for vector compares and when
+/// the compare cannot be emitted.
+bool AArch64FastISel::selectCmp(const Instruction *I) {
+  const auto *Cmp = cast<CmpInst>(I);
+
+  // Vectors of i1 are weird: bail out.
+  if (Cmp->getType()->isVectorTy())
+    return false;
+
+  // Try to optimize or fold the cmp.
+  const CmpInst::Predicate Pred = optimizeCmpPredicate(Cmp);
+
+  unsigned FoldedReg = 0;
+  if (Pred == CmpInst::FCMP_FALSE) {
+    FoldedReg = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), FoldedReg)
+        .addReg(AArch64::WZR, getKillRegState(true));
+  } else if (Pred == CmpInst::FCMP_TRUE) {
+    FoldedReg = fastEmit_i(MVT::i32, MVT::i32, ISD::Constant, 1);
+  }
+  if (FoldedReg) {
+    updateValueMap(I, FoldedReg);
+    return true;
+  }
+
+  // Emit the cmp; everything below consumes the flags it sets.
+  if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned()))
+    return false;
+
+  const unsigned DstReg = createResultReg(&AArch64::GPR32RegClass);
+  const unsigned Killed = getKillRegState(true);
+
+  // FCMP_UEQ and FCMP_ONE cannot be checked with a single instruction. The
+  // condition codes below are inverted, because they are used by CSINC.
+  const bool IsUEQ = Pred == CmpInst::FCMP_UEQ;
+  const bool IsONE = Pred == CmpInst::FCMP_ONE;
+  if (IsUEQ || IsONE) {
+    const unsigned FirstCC = IsUEQ ? AArch64CC::NE : AArch64CC::PL;
+    const unsigned SecondCC = IsUEQ ? AArch64CC::VC : AArch64CC::LE;
+
+    const unsigned TmpReg = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::CSINCWr), TmpReg)
+        .addReg(AArch64::WZR, Killed)
+        .addReg(AArch64::WZR, Killed)
+        .addImm(FirstCC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::CSINCWr), DstReg)
+        .addReg(TmpReg, Killed)
+        .addReg(AArch64::WZR, Killed)
+        .addImm(SecondCC);
+
+    updateValueMap(I, DstReg);
+    return true;
+  }
+
+  // Now set a register based on the comparison.
+  AArch64CC::CondCode CC = getCompareCC(Pred);
+  assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+          DstReg)
+      .addReg(AArch64::WZR, Killed)
+      .addReg(AArch64::WZR, Killed)
+      .addImm(getInvertedCondCode(CC));
+
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+/// \brief Lower an i1 select that has a constant arm to one logical operation.
+///
+///   select c, true,  f  ->  ORR c, f
+///   select c, false, f  ->  BIC f, c
+///   select c, t, true   ->  ORR (c XOR 1), t
+///   select c, t, false  ->  AND c, t
+///
+/// Returns false when the select is not i1, when neither arm is a
+/// ConstantInt, or when an operand register cannot be obtained.
+bool AArch64FastISel::optimizeSelect(const SelectInst *SI) {
+  if (!SI->getType()->isIntegerTy(1))
+    return false;
+
+  const Value *CondV = SI->getCondition();
+  const Value *TrueV = SI->getTrueValue();
+  const Value *FalseV = SI->getFalseValue();
+
+  const Value *FirstVal = nullptr;
+  const Value *SecondVal = nullptr;
+  unsigned LogicOpc = 0;
+  bool InvertFirst = false;
+
+  if (auto *CI = dyn_cast<ConstantInt>(TrueV)) {
+    if (CI->isOne()) {
+      FirstVal = CondV;
+      SecondVal = FalseV;
+      LogicOpc = AArch64::ORRWrr;
+    } else {
+      assert(CI->isZero());
+      FirstVal = FalseV;
+      SecondVal = CondV;
+      LogicOpc = AArch64::BICWrr;
+    }
+  } else if (auto *CI = dyn_cast<ConstantInt>(FalseV)) {
+    FirstVal = CondV;
+    SecondVal = TrueV;
+    if (CI->isOne()) {
+      LogicOpc = AArch64::ORRWrr;
+      InvertFirst = true;
+    } else {
+      assert(CI->isZero());
+      LogicOpc = AArch64::ANDWrr;
+    }
+  }
+
+  // Neither arm is a constant: nothing to optimize.
+  if (LogicOpc == 0)
+    return false;
+
+  unsigned FirstReg = getRegForValue(FirstVal);
+  if (FirstReg == 0)
+    return false;
+  bool FirstIsKill = hasTrivialKill(FirstVal);
+
+  const unsigned SecondReg = getRegForValue(SecondVal);
+  if (SecondReg == 0)
+    return false;
+  const bool SecondIsKill = hasTrivialKill(SecondVal);
+
+  if (InvertFirst) {
+    // Flip bit 0 of the condition; the temporary dies at the next use.
+    FirstReg = emitLogicalOp_ri(ISD::XOR, MVT::i32, FirstReg, FirstIsKill, 1);
+    FirstIsKill = true;
+  }
+
+  const unsigned DstReg =
+      fastEmitInst_rr(LogicOpc, &AArch64::GPR32RegClass, FirstReg, FirstIsKill,
+                      SecondReg, SecondIsKill);
+  updateValueMap(SI, DstReg);
+  return true;
+}
+
+bool AArch64FastISel::selectSelect(const Instruction *I) { /*
+ * Select an IR select instruction as CSEL (integer) or FCSEL (f32/f64).
+ * Returns false for unsupported types and whenever a needed register or
+ * compare cannot be emitted.
+ *
+ * The condition flags are obtained in one of three ways:
+ *  - foldXALUIntrinsic() succeeds: reuse the flags of the XALU intrinsic.
+ *  - the condition is a single-use, available CmpInst: emit the compare
+ *    (FCMP_FALSE/FCMP_TRUE instead fold the select to one of its operands).
+ *  - otherwise: TST the condition register against 1 and select on NE.
+ * optimizeSelect() is tried first and short-circuits all of the above.
+ */
+ assert(isa<SelectInst>(I) && "Expected a select instruction.");
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT))
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ Opc = AArch64::CSELWr;
+ RC = &AArch64::GPR32RegClass;
+ break;
+ case MVT::i64:
+ Opc = AArch64::CSELXr;
+ RC = &AArch64::GPR64RegClass;
+ break;
+ case MVT::f32:
+ Opc = AArch64::FCSELSrrr;
+ RC = &AArch64::FPR32RegClass;
+ break;
+ case MVT::f64:
+ Opc = AArch64::FCSELDrrr;
+ RC = &AArch64::FPR64RegClass;
+ break;
+ }
+
+ const SelectInst *SI = cast<SelectInst>(I);
+ const Value *Cond = SI->getCondition();
+ AArch64CC::CondCode CC = AArch64CC::NE; // default used by the TST fallback below
+ AArch64CC::CondCode ExtraCC = AArch64CC::AL; // AL means "no second select needed"
+
+ if (optimizeSelect(SI))
+ return true;
+
+ // Try to pickup the flags, so we don't have to emit another compare.
+ if (foldXALUIntrinsic(CC, I, Cond)) {
+ // Fake request the condition to force emission of the XALU intrinsic.
+ unsigned CondReg = getRegForValue(Cond);
+ if (!CondReg)
+ return false;
+ } else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() &&
+ isValueAvailable(Cond)) {
+ const auto *Cmp = cast<CmpInst>(Cond);
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(Cmp);
+ const Value *FoldSelect = nullptr; // the operand the select collapses to when the predicate is constant
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_FALSE:
+ FoldSelect = SI->getFalseValue();
+ break;
+ case CmpInst::FCMP_TRUE:
+ FoldSelect = SI->getTrueValue();
+ break;
+ }
+
+ if (FoldSelect) {
+ unsigned SrcReg = getRegForValue(FoldSelect);
+ if (!SrcReg)
+ return false;
+ unsigned UseReg = lookUpRegForValue(SI);
+ if (UseReg)
+ MRI.clearKillFlags(UseReg);
+
+ updateValueMap(I, SrcReg);
+ return true;
+ }
+
+ // Emit the cmp.
+ if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned()))
+ return false;
+
+ // FCMP_UEQ and FCMP_ONE cannot be checked with a single select instruction.
+ CC = getCompareCC(Predicate);
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_UEQ:
+ ExtraCC = AArch64CC::EQ;
+ CC = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_ONE:
+ ExtraCC = AArch64CC::MI;
+ CC = AArch64CC::GT;
+ break;
+ }
+ assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+ } else {
+ unsigned CondReg = getRegForValue(Cond);
+ if (!CondReg)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+
+ const MCInstrDesc &II = TII.get(AArch64::ANDSWri);
+ CondReg = constrainOperandRegClass(II, CondReg, 1);
+
+ // Emit a TST instruction (ANDS wzr, reg, #imm).
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
+ AArch64::WZR)
+ .addReg(CondReg, getKillRegState(CondIsKill))
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ }
+
+ unsigned Src1Reg = getRegForValue(SI->getTrueValue());
+ bool Src1IsKill = hasTrivialKill(SI->getTrueValue());
+
+ unsigned Src2Reg = getRegForValue(SI->getFalseValue());
+ bool Src2IsKill = hasTrivialKill(SI->getFalseValue());
+
+ if (!Src1Reg || !Src2Reg)
+ return false;
+
+ if (ExtraCC != AArch64CC::AL) {
+ Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, // first select on ExtraCC; its result becomes the false operand of the second
+ Src2IsKill, ExtraCC);
+ Src2IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
+ Src2IsKill, CC);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+/// Select an IR fpext from float to double as FCVTDSr.
+///
+/// Any other type combination, or a source value that cannot be placed in a
+/// register, returns false.
+bool AArch64FastISel::selectFPExt(const Instruction *I) {
+  Value *Src = I->getOperand(0);
+  const bool IsFloatToDouble =
+      I->getType()->isDoubleTy() && Src->getType()->isFloatTy();
+  if (!IsFloatToDouble)
+    return false;
+
+  const unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  const unsigned DstReg = createResultReg(&AArch64::FPR64RegClass);
+  MachineInstrBuilder Cvt = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(AArch64::FCVTDSr), DstReg);
+  Cvt.addReg(SrcReg);
+
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+/// Select an IR fptrunc from double to float as FCVTSDr.
+///
+/// Any other type combination, or a source value that cannot be placed in a
+/// register, returns false.
+bool AArch64FastISel::selectFPTrunc(const Instruction *I) {
+  Value *Src = I->getOperand(0);
+  const bool IsDoubleToFloat =
+      I->getType()->isFloatTy() && Src->getType()->isDoubleTy();
+  if (!IsDoubleToFloat)
+    return false;
+
+  const unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  const unsigned DstReg = createResultReg(&AArch64::FPR32RegClass);
+  MachineInstrBuilder Cvt = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(AArch64::FCVTSDr), DstReg);
+  Cvt.addReg(SrcReg);
+
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+// FPToUI and FPToSI: convert a floating-point value to an integer using the
+// FCVTZS/FCVTZU family. Signed selects the signed variants. Returns false for
+// illegal or vector destination types, f128 sources, and when the source
+// cannot be placed in a register.
+bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
+  MVT DestVT;
+  if (!isTypeLegal(I->getType(), DestVT))
+    return false;
+  if (DestVT.isVector())
+    return false;
+
+  const unsigned SrcReg = getRegForValue(I->getOperand(0));
+  if (!SrcReg)
+    return false;
+
+  const EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true);
+  if (SrcVT == MVT::f128)
+    return false;
+
+  // Indexed as [source is f64][signed][destination is i32].
+  static const unsigned CvtOpcodes[2][2][2] = {
+    { { AArch64::FCVTZUUXSr, AArch64::FCVTZUUWSr },
+      { AArch64::FCVTZSUXSr, AArch64::FCVTZSUWSr } },
+    { { AArch64::FCVTZUUXDr, AArch64::FCVTZUUWDr },
+      { AArch64::FCVTZSUXDr, AArch64::FCVTZSUWDr } }
+  };
+  const bool SrcIsF64 = SrcVT == MVT::f64;
+  const bool DstIsI32 = DestVT == MVT::i32;
+  const unsigned Opc = CvtOpcodes[SrcIsF64][Signed][DstIsI32];
+
+  const TargetRegisterClass *DstRC =
+      DstIsI32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
+  const unsigned DstReg = createResultReg(DstRC);
+  MachineInstrBuilder Cvt =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DstReg);
+  Cvt.addReg(SrcReg);
+
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+/// Select an IR sitofp/uitofp using the SCVTF/UCVTF family.
+///
+/// Signed selects the signed variants. i1/i8/i16 sources are first extended
+/// to i32 (sign-extended when Signed, zero-extended otherwise). Returns false
+/// for illegal or vector destination types and when the source register or
+/// the extension cannot be produced.
+bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
+  MVT DestVT;
+  if (!isTypeLegal(I->getType(), DestVT))
+    return false;
+  if (DestVT.isVector())
+    return false;
+  assert ((DestVT == MVT::f32 || DestVT == MVT::f64) &&
+          "Unexpected value type.");
+
+  const Value *Src = I->getOperand(0);
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+  bool SrcIsKill = hasTrivialKill(Src);
+
+  const EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+
+  // Widen narrow integers to i32 before converting.
+  const bool IsNarrow =
+      SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1;
+  if (IsNarrow) {
+    SrcReg =
+        emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
+    if (SrcReg == 0)
+      return false;
+    SrcIsKill = true;
+  }
+
+  // Indexed as [source is i64][signed][destination is f32].
+  static const unsigned CvtOpcodes[2][2][2] = {
+    { { AArch64::UCVTFUWDri, AArch64::UCVTFUWSri },
+      { AArch64::SCVTFUWDri, AArch64::SCVTFUWSri } },
+    { { AArch64::UCVTFUXDri, AArch64::UCVTFUXSri },
+      { AArch64::SCVTFUXDri, AArch64::SCVTFUXSri } }
+  };
+  const bool SrcIsI64 = SrcVT == MVT::i64;
+  const bool DstIsF32 = DestVT == MVT::f32;
+  const unsigned Opc = CvtOpcodes[SrcIsI64][Signed][DstIsF32];
+
+  const unsigned DstReg =
+      fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg, SrcIsKill);
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+/// Lower the formal arguments of the current function straight from their
+/// argument registers.
+///
+/// Only the simple case is handled: a non-variadic C or Swift function whose
+/// arguments are scalars or 64/128-bit vectors, carry none of the special
+/// passing attributes, and fit in at most eight general-purpose and eight
+/// floating-point/vector registers. Anything else returns false before any
+/// code is emitted.
+///
+/// \returns true if every argument was mapped to a virtual register.
+bool AArch64FastISel::fastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  const CallingConv::ID CC = F->getCallingConv();
+  if (!(CC == CallingConv::C || CC == CallingConv::Swift))
+    return false;
+
+  // Parameter attributes that this fast path does not handle.
+  static const Attribute::AttrKind UnsupportedAttrs[] = {
+      Attribute::ByVal,     Attribute::InReg,      Attribute::StructRet,
+      Attribute::SwiftSelf, Attribute::SwiftError, Attribute::Nest};
+
+  // Pass 1: validate every argument and count register usage. Nothing is
+  // emitted here, so bailing out is free.
+  unsigned NumGPRArgs = 0;
+  unsigned NumFPRArgs = 0;
+  unsigned AttrIdx = 0;
+  for (auto const &Arg : F->args()) {
+    // Attribute indices are 1-based for arguments.
+    ++AttrIdx;
+    for (auto Kind : UnsupportedAttrs)
+      if (F->getAttributes().hasAttribute(AttrIdx, Kind))
+        return false;
+
+    Type *ArgTy = Arg.getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(DL, ArgTy);
+    if (!ArgVT.isSimple())
+      return false;
+
+    MVT VT = ArgVT.getSimpleVT().SimpleTy;
+    if (VT.isFloatingPoint() && !Subtarget->hasFPARMv8())
+      return false;
+
+    // Vector arguments need NEON and are only handled for little-endian.
+    if (VT.isVector() &&
+        !(Subtarget->hasNEON() && Subtarget->isLittleEndian()))
+      return false;
+
+    if (VT >= MVT::i1 && VT <= MVT::i64) {
+      ++NumGPRArgs;
+    } else if ((VT >= MVT::f16 && VT <= MVT::f64) || VT.is64BitVector() ||
+               VT.is128BitVector()) {
+      ++NumFPRArgs;
+    } else {
+      return false;
+    }
+
+    if (NumGPRArgs > 8 || NumFPRArgs > 8)
+      return false;
+  }
+
+  // Argument registers, one table per register width.
+  static const MCPhysReg WArgRegs[8] = {AArch64::W0, AArch64::W1, AArch64::W2,
+                                        AArch64::W3, AArch64::W4, AArch64::W5,
+                                        AArch64::W6, AArch64::W7};
+  static const MCPhysReg XArgRegs[8] = {AArch64::X0, AArch64::X1, AArch64::X2,
+                                        AArch64::X3, AArch64::X4, AArch64::X5,
+                                        AArch64::X6, AArch64::X7};
+  static const MCPhysReg HArgRegs[8] = {AArch64::H0, AArch64::H1, AArch64::H2,
+                                        AArch64::H3, AArch64::H4, AArch64::H5,
+                                        AArch64::H6, AArch64::H7};
+  static const MCPhysReg SArgRegs[8] = {AArch64::S0, AArch64::S1, AArch64::S2,
+                                        AArch64::S3, AArch64::S4, AArch64::S5,
+                                        AArch64::S6, AArch64::S7};
+  static const MCPhysReg DArgRegs[8] = {AArch64::D0, AArch64::D1, AArch64::D2,
+                                        AArch64::D3, AArch64::D4, AArch64::D5,
+                                        AArch64::D6, AArch64::D7};
+  static const MCPhysReg QArgRegs[8] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+                                        AArch64::Q3, AArch64::Q4, AArch64::Q5,
+                                        AArch64::Q6, AArch64::Q7};
+
+  // Pass 2: assign registers in order. GPR and FPR arguments are numbered
+  // independently of each other.
+  unsigned NextGPR = 0;
+  unsigned NextFPR = 0;
+  for (auto const &Arg : F->args()) {
+    const MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
+    unsigned PhysReg;
+    const TargetRegisterClass *RC;
+    if (VT >= MVT::i1 && VT <= MVT::i32) {
+      // Integers up to 32 bits all arrive in a W register.
+      PhysReg = WArgRegs[NextGPR++];
+      RC = &AArch64::GPR32RegClass;
+    } else if (VT == MVT::i64) {
+      PhysReg = XArgRegs[NextGPR++];
+      RC = &AArch64::GPR64RegClass;
+    } else if (VT == MVT::f16) {
+      PhysReg = HArgRegs[NextFPR++];
+      RC = &AArch64::FPR16RegClass;
+    } else if (VT == MVT::f32) {
+      PhysReg = SArgRegs[NextFPR++];
+      RC = &AArch64::FPR32RegClass;
+    } else if ((VT == MVT::f64) || VT.is64BitVector()) {
+      PhysReg = DArgRegs[NextFPR++];
+      RC = &AArch64::FPR64RegClass;
+    } else if (VT.is128BitVector()) {
+      PhysReg = QArgRegs[NextFPR++];
+      RC = &AArch64::FPR128RegClass;
+    } else
+      llvm_unreachable("Unexpected value type.");
+
+    const unsigned LiveInReg = FuncInfo.MF->addLiveIn(PhysReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    const unsigned ArgReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ArgReg)
+        .addReg(LiveInReg, getKillRegState(true));
+    updateValueMap(&Arg, ArgReg);
+  }
+  return true;
+}
+
+/// Emit the start of a call sequence and move every outgoing argument to the
+/// location chosen by the calling convention.
+///
+/// \param CLI      call being lowered; CLI.OutRegs receives each physical
+///                 register an argument was copied into.
+/// \param OutVTs   value type of each outgoing argument, indexed like
+///                 CLI.OutVals.
+/// \param NumBytes [out] number of stack bytes the arguments occupy.
+/// \returns false if an argument cannot be materialized, extended or stored,
+/// or if it needs custom handling. Note that CALLSEQ_START (and possibly some
+/// argument copies) have already been emitted when that happens.
+bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
+                                      SmallVectorImpl<MVT> &OutVTs,
+                                      unsigned &NumBytes) {
+  const CallingConv::ID CC = CLI.CallConv;
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
+
+  // Size of the outgoing stack argument area.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Open the call sequence.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TII.getCallFrameSetupOpcode()))
+      .addImm(NumBytes);
+
+  for (CCValAssign &VA : ArgLocs) {
+    const unsigned ValNo = VA.getValNo();
+    const Value *ArgVal = CLI.OutVals[ValNo];
+    const MVT ArgVT = OutVTs[ValNo];
+
+    unsigned ArgReg = getRegForValue(ArgVal);
+    if (!ArgReg)
+      return false;
+
+    // Promote the argument if the convention asks for it. An "any" extension
+    // is implemented as a zero extension.
+    const auto LocInfo = VA.getLocInfo();
+    switch (LocInfo) {
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+    case CCValAssign::AExt:
+    case CCValAssign::ZExt: {
+      const bool ZeroExtend = LocInfo != CCValAssign::SExt;
+      ArgReg = emitIntExt(ArgVT, ArgReg, VA.getLocVT(), /*isZExt=*/ZeroExtend);
+      if (!ArgReg)
+        return false;
+      break;
+    }
+    default:
+      llvm_unreachable("Unknown arg promotion!");
+    }
+
+    // FIXME: Handle custom args.
+    if (VA.needsCustom())
+      return false;
+
+    // Register argument: a plain copy into the assigned physical register.
+    if (VA.isRegLoc()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg())
+          .addReg(ArgReg);
+      CLI.OutRegs.push_back(VA.getLocReg());
+      continue;
+    }
+
+    assert(VA.isMemLoc() && "Assuming store on stack.");
+
+    // Don't emit stores for undef values.
+    if (isa<UndefValue>(ArgVal))
+      continue;
+
+    // Stack argument. On big-endian targets a value smaller than 8 bytes is
+    // placed at the high-address end of its 8-byte slot.
+    const unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8;
+    const unsigned BEAlign =
+        (ArgSize < 8 && !Subtarget->isLittleEndian()) ? 8 - ArgSize : 0;
+
+    Address Addr;
+    Addr.setKind(Address::RegBase);
+    Addr.setReg(AArch64::SP);
+    Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+    const unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
+        MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+
+    if (!emitStore(ArgVT, ArgReg, Addr, MMO))
+      return false;
+  }
+  return true;
+}
+
+/// Close the call sequence and, for a non-void call, copy the returned value
+/// out of its physical register.
+///
+/// \param CLI      call being lowered; on success with a result, CLI.InRegs,
+///                 CLI.ResultReg and CLI.NumResultRegs are filled in.
+/// \param RetVT    type of the call result, or MVT::isVoid.
+/// \param NumBytes stack bytes used by the outgoing arguments.
+/// \returns false if the result is split over several locations, or is a
+/// vector on a big-endian target. CALLSEQ_END has already been emitted then.
+bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
+                                 unsigned NumBytes) {
+  // Issue CALLSEQ_END.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TII.getCallFrameDestroyOpcode()))
+      .addImm(NumBytes)
+      .addImm(0);
+
+  // Nothing more to do for a void call.
+  if (RetVT == MVT::isVoid)
+    return true;
+
+  const CallingConv::ID CC = CLI.CallConv;
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+  CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
+
+  // Only a single return location is supported.
+  if (RVLocs.size() != 1)
+    return false;
+
+  const CCValAssign &RetLoc = RVLocs[0];
+  const MVT CopyVT = RetLoc.getValVT();
+
+  // TODO: Handle big-endian results
+  if (CopyVT.isVector() && !Subtarget->isLittleEndian())
+    return false;
+
+  // Copy the result out of its physical register into a fresh virtual one.
+  const unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TargetOpcode::COPY), ResultReg)
+      .addReg(RetLoc.getLocReg());
+  CLI.InRegs.push_back(RetLoc.getLocReg());
+
+  CLI.ResultReg = ResultReg;
+  CLI.NumResultRegs = 1;
+  return true;
+}
+
+/// Lower a simple call described by \p CLI.
+///
+/// Handled: non-tail, non-variadic calls under the small code model (or the
+/// large code model on MachO) whose return type is void or legal and whose
+/// arguments are scalars of at most 64 bits without special passing flags.
+/// The callee may be given as an IR value or as an MCSymbol.
+///
+/// \returns true if the call (and the copy of its result, if any) was emitted;
+/// false if any of the restrictions above is violated or a helper fails.
+bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  const CallingConv::ID CC = CLI.CallConv;
+  const Value *Callee = CLI.Callee;
+  MCSymbol *Symbol = CLI.Symbol;
+
+  // There has to be something to call.
+  if (!Callee && !Symbol)
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (CLI.IsTailCall)
+    return false;
+
+  // Only support the small and large code model.
+  const CodeModel::Model CM = TM.getCodeModel();
+  const bool IsSmallCM = CM == CodeModel::Small;
+  if (!IsSmallCM && CM != CodeModel::Large)
+    return false;
+
+  // FIXME: Add large code model support for ELF.
+  if (!IsSmallCM && !Subtarget->isTargetMachO())
+    return false;
+
+  // Let SDISel handle vararg functions.
+  if (CLI.IsVarArg)
+    return false;
+
+  // FIXME: Only handle *simple* calls for now.
+  MVT RetVT;
+  if (CLI.RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(CLI.RetTy, RetVT))
+    return false;
+
+  // Reject arguments with special passing requirements.
+  for (auto Flag : CLI.OutFlags) {
+    const bool IsSpecial = Flag.isInReg() || Flag.isSRet() || Flag.isNest() ||
+                           Flag.isByVal() || Flag.isSwiftSelf() ||
+                           Flag.isSwiftError();
+    if (IsSpecial)
+      return false;
+  }
+
+  // Collect the argument value types.
+  SmallVector<MVT, 16> OutVTs;
+  OutVTs.reserve(CLI.OutVals.size());
+
+  for (auto *Val : CLI.OutVals) {
+    MVT VT;
+    if (!isTypeLegal(Val->getType(), VT)) {
+      // Small integers are not legal but are still accepted here.
+      if (VT != MVT::i1 && VT != MVT::i8 && VT != MVT::i16)
+        return false;
+    }
+
+    // We don't handle vector parameters yet.
+    if (VT.isVector() || VT.getSizeInBits() > 64)
+      return false;
+
+    OutVTs.push_back(VT);
+  }
+
+  Address Addr;
+  if (Callee && !computeCallAddress(Callee, Addr))
+    return false;
+
+  // Emit the call-sequence start and move the arguments into place.
+  unsigned NumBytes;
+  if (!processCallArgs(CLI, OutVTs, NumBytes))
+    return false;
+
+  // Issue the call.
+  MachineInstrBuilder MIB;
+  if (IsSmallCM) {
+    // Direct BL for a symbol or global, BLR when the target is in a register.
+    const MCInstrDesc &CallDesc =
+        TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, CallDesc);
+    if (Symbol) {
+      MIB.addSym(Symbol, 0);
+    } else if (Addr.getGlobalValue()) {
+      MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0);
+    } else if (Addr.getReg()) {
+      const unsigned TargetReg =
+          constrainOperandRegClass(CallDesc, Addr.getReg(), 0);
+      MIB.addReg(TargetReg);
+    } else {
+      return false;
+    }
+  } else {
+    // Large code model: always call through a register.
+    unsigned CallReg = 0;
+    if (Symbol) {
+      // Load the callee address from the GOT: ADRP + LDR.
+      const unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+              ADRPReg)
+          .addSym(Symbol, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+      CallReg = createResultReg(&AArch64::GPR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::LDRXui), CallReg)
+          .addReg(ADRPReg)
+          .addSym(Symbol,
+                  AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    } else if (Addr.getGlobalValue()) {
+      CallReg = materializeGV(Addr.getGlobalValue());
+    } else if (Addr.getReg()) {
+      CallReg = Addr.getReg();
+    }
+
+    if (!CallReg)
+      return false;
+
+    const MCInstrDesc &CallDesc = TII.get(AArch64::BLR);
+    CallReg = constrainOperandRegClass(CallDesc, CallReg, 0);
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, CallDesc)
+              .addReg(CallReg);
+  }
+
+  // Add implicit physical register uses to the call.
+  for (auto ArgPhysReg : CLI.OutRegs)
+    MIB.addReg(ArgPhysReg, RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+  CLI.Call = MIB;
+
+  // Finish off the call including any return values.
+  return finishCall(CLI, RetVT, NumBytes);
+}
+
+/// Decide whether a memcpy of \p Len bytes is small enough to expand inline.
+///
+/// With a known (non-zero) \p Alignment the copy must take at most four
+/// alignment-sized units; with an unknown alignment it must be shorter than
+/// 32 bytes.
+bool AArch64FastISel::isMemCpySmall(uint64_t Len, unsigned Alignment) {
+  return Alignment ? (Len / Alignment <= 4) : (Len < 32);
+}
+
+/// Expand a small memcpy into a sequence of load/store pairs.
+///
+/// Each step copies the widest integer chunk allowed by the remaining length
+/// and by \p Alignment (0 meaning unknown), then advances both addresses.
+///
+/// \returns false if the copy is not small (see isMemCpySmall) or if a load
+/// or store could not be emitted; earlier pairs may already have been emitted
+/// in the latter case.
+bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src,
+                                         uint64_t Len, unsigned Alignment) {
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!isMemCpySmall(Len, Alignment))
+    return false;
+
+  const int64_t DestBase = Dest.getOffset();
+  const int64_t SrcBase = Src.getOffset();
+
+  for (int64_t Copied = 0; Len != 0;) {
+    // Pick the chunk type; a single byte is the fallback.
+    MVT VT = MVT::i8;
+    if (!Alignment || Alignment >= 8) {
+      // Unknown or generous alignment: bounded by the remaining length only.
+      if (Len >= 8)
+        VT = MVT::i64;
+      else if (Len >= 4)
+        VT = MVT::i32;
+      else if (Len >= 2)
+        VT = MVT::i16;
+    } else if (Alignment == 4 && Len >= 4) {
+      // Bound based on alignment.
+      VT = MVT::i32;
+    } else if (Alignment == 2 && Len >= 2) {
+      VT = MVT::i16;
+    }
+
+    const unsigned TmpReg = emitLoad(VT, VT, Src);
+    if (!TmpReg)
+      return false;
+
+    if (!emitStore(VT, TmpReg, Dest))
+      return false;
+
+    const int64_t Size = VT.getSizeInBits() / 8;
+    Len -= Size;
+    Copied += Size;
+
+    // Offsets are always recomputed from the original base offsets.
+    Dest.setOffset(DestBase + Copied);
+    Src.setOffset(SrcBase + Copied);
+  }
+
+  return true;
+}
+
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+///
+/// \param CC   [out] condition code that is true when the intrinsic overflows.
+/// \param I    the instruction that consumes \p Cond.
+/// \param Cond candidate condition; must be an extractvalue of a supported
+///             *.with.overflow intrinsic producing an i32 or i64 result.
+/// \returns true if only extractvalues of that same intrinsic sit between the
+/// intrinsic call and \p I, so the flags it sets are still usable at \p I.
+bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
+                                        const Instruction *I,
+                                        const Value *Cond) {
+  const auto *EV = dyn_cast<ExtractValueInst>(Cond);
+  if (!EV)
+    return false;
+
+  const auto *II = dyn_cast<IntrinsicInst>(EV->getAggregateOperand());
+  if (!II)
+    return false;
+
+  // The arithmetic result is element 0 of the returned struct.
+  const Function *Callee = II->getCalledFunction();
+  Type *RetTy = cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+  MVT RetVT;
+  if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  if (!(RetVT == MVT::i32 || RetVT == MVT::i64))
+    return false;
+
+  const Value *LHS = II->getArgOperand(0);
+  const Value *RHS = II->getArgOperand(1);
+
+  // Canonicalize immediate to the RHS.
+  if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+      isCommutativeIntrinsic(II))
+    std::swap(LHS, RHS);
+
+  // A multiply by two is treated as the matching add for the purpose of
+  // choosing the condition code.
+  Intrinsic::ID IID = II->getIntrinsicID();
+  if (IID == Intrinsic::smul_with_overflow ||
+      IID == Intrinsic::umul_with_overflow) {
+    const auto *C = dyn_cast<ConstantInt>(RHS);
+    if (C && C->getValue() == 2)
+      IID = (IID == Intrinsic::smul_with_overflow)
+                ? Intrinsic::sadd_with_overflow
+                : Intrinsic::uadd_with_overflow;
+  }
+
+  // Map the intrinsic to the condition that signals overflow.
+  AArch64CC::CondCode OverflowCC;
+  switch (IID) {
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+    OverflowCC = AArch64CC::VS;
+    break;
+  case Intrinsic::uadd_with_overflow:
+    OverflowCC = AArch64CC::HS;
+    break;
+  case Intrinsic::usub_with_overflow:
+    OverflowCC = AArch64CC::LO;
+    break;
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    OverflowCC = AArch64CC::NE;
+    break;
+  default:
+    return false;
+  }
+
+  // Check if both instructions are in the same basic block.
+  if (!isValueAvailable(II))
+    return false;
+
+  // Walk backwards from the instruction before I up to the intrinsic; only
+  // extractvalues of the intrinsic itself may sit in between.
+  BasicBlock::const_iterator UserIt(I);
+  BasicBlock::const_iterator IntrinsicIt(II);
+  for (auto It = std::prev(UserIt); It != IntrinsicIt; --It) {
+    if (!isa<ExtractValueInst>(It))
+      return false;
+
+    const auto *Between = cast<ExtractValueInst>(It);
+    if (Between->getAggregateOperand() != II)
+      return false;
+  }
+
+  CC = OverflowCC;
+  return true;
+}
+
+/// Lower a call to one of the intrinsics fast-isel knows about.
+///
+/// Handled intrinsics:
+///  - frameaddress: chain of loads starting from the frame register.
+///  - memcpy / memmove / memset: small constant-length memcpy is expanded
+///    inline, everything else becomes a call to the C library function.
+///  - sin / cos / pow (f32, f64): runtime library call.
+///  - fabs (f32, f64), sqrt, trap: a single instruction.
+///  - {s,u}{add,sub,mul}.with.overflow (i32, i64): flag-setting arithmetic
+///    followed by a CSINC that materializes the overflow bit.
+///
+/// \returns true if the intrinsic was lowered; false for any other intrinsic
+/// or when an operand/type is not supported.
+bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+  // FIXME: Handle more intrinsics.
+  switch (II->getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::frameaddress: {
+    MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
+    MFI.setFrameAddressIsTaken(true);
+
+    const AArch64RegisterInfo *RegInfo =
+        static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo());
+    unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+    unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr);
+    // Recursively load frame address
+    // ldr x0, [fp]
+    // ldr x0, [x0]
+    // ldr x0, [x0]
+    // ...
+    unsigned DestReg;
+    unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+    while (Depth--) {
+      DestReg = fastEmitInst_ri(AArch64::LDRXui, &AArch64::GPR64RegClass,
+                                SrcReg, /*IsKill=*/true, 0);
+      assert(DestReg && "Unexpected LDR instruction emission failure.");
+      SrcReg = DestReg;
+    }
+
+    // With a depth of zero this is just the copy of the frame register.
+    updateValueMap(II, SrcReg);
+    return true;
+  }
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    const auto *MTI = cast<MemTransferInst>(II);
+    // Don't handle volatile.
+    if (MTI->isVolatile())
+      return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress.  Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool IsMemCpy = (II->getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI->getLength()) && IsMemCpy) {
+      // Small memcpy's are common enough that we want to do them without a call
+      // if possible.
+      uint64_t Len = cast<ConstantInt>(MTI->getLength())->getZExtValue();
+      unsigned Alignment = MTI->getAlignment();
+      if (isMemCpySmall(Len, Alignment)) {
+        Address Dest, Src;
+        if (!computeAddress(MTI->getRawDest(), Dest) ||
+            !computeAddress(MTI->getRawSource(), Src))
+          return false;
+        if (tryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+          return true;
+      }
+    }
+
+    // Fall back to a library call; it takes a 64-bit length.
+    if (!MTI->getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MTI->getSourceAddressSpace() > 255 || MTI->getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    // The trailing two intrinsic operands are not passed to the libc function.
+    const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
+    return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst *MSI = cast<MemSetInst>(II);
+    // Don't handle volatile.
+    if (MSI->isVolatile())
+      return false;
+
+    if (!MSI->getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MSI->getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    // The trailing two intrinsic operands are not passed to the libc function.
+    return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::pow: {
+    MVT RetVT;
+    if (!isTypeLegal(II->getType(), RetVT))
+      return false;
+
+    if (RetVT != MVT::f32 && RetVT != MVT::f64)
+      return false;
+
+    // Rows: sin, cos, pow. Columns: f32, f64.
+    static const RTLIB::Libcall LibCallTable[3][2] = {
+      { RTLIB::SIN_F32, RTLIB::SIN_F64 },
+      { RTLIB::COS_F32, RTLIB::COS_F64 },
+      { RTLIB::POW_F32, RTLIB::POW_F64 }
+    };
+    RTLIB::Libcall LC;
+    bool Is64Bit = RetVT == MVT::f64;
+    switch (II->getIntrinsicID()) {
+    default:
+      llvm_unreachable("Unexpected intrinsic.");
+    case Intrinsic::sin:
+      LC = LibCallTable[0][Is64Bit];
+      break;
+    case Intrinsic::cos:
+      LC = LibCallTable[1][Is64Bit];
+      break;
+    case Intrinsic::pow:
+      LC = LibCallTable[2][Is64Bit];
+      break;
+    }
+
+    ArgListTy Args;
+    Args.reserve(II->getNumArgOperands());
+
+    // Populate the argument list.
+    for (auto &Arg : II->arg_operands()) {
+      ArgListEntry Entry;
+      Entry.Val = Arg;
+      Entry.Ty = Arg->getType();
+      Args.push_back(Entry);
+    }
+
+    CallLoweringInfo CLI;
+    MCContext &Ctx = MF->getContext();
+    CLI.setCallee(DL, Ctx, TLI.getLibcallCallingConv(LC), II->getType(),
+                  TLI.getLibcallName(LC), std::move(Args));
+    if (!lowerCallTo(CLI))
+      return false;
+    updateValueMap(II, CLI.ResultReg);
+    return true;
+  }
+  case Intrinsic::fabs: {
+    MVT VT;
+    if (!isTypeLegal(II->getType(), VT))
+      return false;
+
+    unsigned Opc;
+    switch (VT.SimpleTy) {
+    default:
+      return false;
+    case MVT::f32:
+      Opc = AArch64::FABSSr;
+      break;
+    case MVT::f64:
+      Opc = AArch64::FABSDr;
+      break;
+    }
+    unsigned SrcReg = getRegForValue(II->getOperand(0));
+    if (!SrcReg)
+      return false;
+    bool SrcRegIsKill = hasTrivialKill(II->getOperand(0));
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(SrcReg, getKillRegState(SrcRegIsKill));
+    updateValueMap(II, ResultReg);
+    return true;
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
+        .addImm(1);
+    return true;
+  }
+  case Intrinsic::sqrt: {
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    unsigned Op0Reg = getRegForValue(II->getOperand(0));
+    if (!Op0Reg)
+      return false;
+    bool Op0IsKill = hasTrivialKill(II->getOperand(0));
+
+    unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill);
+    if (!ResultReg)
+      return false;
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow: {
+    // This implements the basic lowering of the xalu with overflow intrinsics.
+    const Function *Callee = II->getCalledFunction();
+    auto *Ty = cast<StructType>(Callee->getReturnType());
+    Type *RetTy = Ty->getTypeAtIndex(0U);
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    if (VT != MVT::i32 && VT != MVT::i64)
+      return false;
+
+    const Value *LHS = II->getArgOperand(0);
+    const Value *RHS = II->getArgOperand(1);
+    // Canonicalize immediate to the RHS.
+    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+        isCommutativeIntrinsic(II))
+      std::swap(LHS, RHS);
+
+    // Simplify multiplies: x * 2 is lowered as x + x.
+    Intrinsic::ID IID = II->getIntrinsicID();
+    switch (IID) {
+    default:
+      break;
+    case Intrinsic::smul_with_overflow:
+      if (const auto *C = dyn_cast<ConstantInt>(RHS))
+        if (C->getValue() == 2) {
+          IID = Intrinsic::sadd_with_overflow;
+          RHS = LHS;
+        }
+      break;
+    case Intrinsic::umul_with_overflow:
+      if (const auto *C = dyn_cast<ConstantInt>(RHS))
+        if (C->getValue() == 2) {
+          IID = Intrinsic::uadd_with_overflow;
+          RHS = LHS;
+        }
+      break;
+    }
+
+    // ResultReg1 receives the arithmetic result, ResultReg2 the overflow bit.
+    // CC is the condition that holds when the operation overflowed.
+    unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0;
+    AArch64CC::CondCode CC = AArch64CC::Invalid;
+    switch (IID) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::sadd_with_overflow:
+      ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::VS;
+      break;
+    case Intrinsic::uadd_with_overflow:
+      ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::HS;
+      break;
+    case Intrinsic::ssub_with_overflow:
+      ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::VS;
+      break;
+    case Intrinsic::usub_with_overflow:
+      ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::LO;
+      break;
+    case Intrinsic::smul_with_overflow: {
+      CC = AArch64CC::NE;
+      unsigned LHSReg = getRegForValue(LHS);
+      if (!LHSReg)
+        return false;
+      bool LHSIsKill = hasTrivialKill(LHS);
+
+      unsigned RHSReg = getRegForValue(RHS);
+      if (!RHSReg)
+        return false;
+      bool RHSIsKill = hasTrivialKill(RHS);
+
+      if (VT == MVT::i32) {
+        // Form the full 64-bit product, then compare its upper half against
+        // the sign-extension of the lower half (lower >> 31, arithmetic).
+        MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+        unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg,
+                                       /*IsKill=*/false, 32);
+        MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+                                            AArch64::sub_32);
+        ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true,
+                                              AArch64::sub_32);
+        emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+                    AArch64_AM::ASR, 31, /*WantResult=*/false);
+      } else {
+        assert(VT == MVT::i64 && "Unexpected value type.");
+        // LHSReg and RHSReg cannot be killed by this Mul, since they are
+        // reused in the next instruction.
+        MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg,
+                            /*IsKill=*/false);
+        unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill,
+                                        RHSReg, RHSIsKill);
+        emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+                    AArch64_AM::ASR, 63, /*WantResult=*/false);
+      }
+      break;
+    }
+    case Intrinsic::umul_with_overflow: {
+      CC = AArch64CC::NE;
+      unsigned LHSReg = getRegForValue(LHS);
+      if (!LHSReg)
+        return false;
+      bool LHSIsKill = hasTrivialKill(LHS);
+
+      unsigned RHSReg = getRegForValue(RHS);
+      if (!RHSReg)
+        return false;
+      bool RHSIsKill = hasTrivialKill(RHS);
+
+      if (VT == MVT::i32) {
+        // Form the full 64-bit product and test whether its upper 32 bits are
+        // non-zero (0 - (product >> 32) sets NE exactly in that case).
+        MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+        emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg,
+                    /*IsKill=*/false, AArch64_AM::LSR, 32,
+                    /*WantResult=*/false);
+        MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+                                            AArch64::sub_32);
+      } else {
+        assert(VT == MVT::i64 && "Unexpected value type.");
+        // LHSReg and RHSReg cannot be killed by this Mul, since they are
+        // reused in the next instruction.
+        MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg,
+                            /*IsKill=*/false);
+        unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill,
+                                        RHSReg, RHSIsKill);
+        emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg,
+                    /*IsKill=*/false, /*WantResult=*/false);
+      }
+      break;
+    }
+    }
+
+    if (MulReg) {
+      ResultReg1 = createResultReg(TLI.getRegClassFor(VT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg1).addReg(MulReg);
+    }
+
+    // emitAdd/emitSub report failure by returning 0. Give up here instead of
+    // emitting the CSINC and recording register 0 in the value map; the assert
+    // below would not catch this in a release build.
+    if (!ResultReg1)
+      return false;
+
+    // Materialize the overflow bit: CSINC with the inverted condition yields
+    // 1 when CC holds and 0 otherwise.
+    ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass,
+                                  AArch64::WZR, /*IsKill=*/true, AArch64::WZR,
+                                  /*IsKill=*/true, getInvertedCondCode(CC));
+    (void)ResultReg2;
+    // The two results are registered as one value occupying two consecutive
+    // virtual registers.
+    assert((ResultReg1 + 1) == ResultReg2 &&
+           "Nonconsecutive result registers.");
+    updateValueMap(II, ResultReg1, 2);
+    return true;
+  }
+  }
+  return false;
+}
+
+/// Select a 'ret' instruction.
+///
+/// Handles a void return, or a single value returned in one register (with an
+/// optional zero/sign extension of an i1/i8/i16 value). Variadic functions,
+/// swifterror functions, split-CSR functions, f128 values, multi-lane vectors
+/// on big-endian targets and anything needing more than one location are
+/// rejected.
+///
+/// \returns true if the return sequence was emitted.
+bool AArch64FastISel::selectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn || F.isVarArg())
+    return false;
+
+  if (TLI.supportSwiftError() &&
+      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return false;
+
+  if (TLI.supportSplitCSR(FuncInfo.MF))
+    return false;
+
+  // Physical registers carrying the return value; attached to the RET below.
+  SmallVector<unsigned, 4> RetRegs;
+
+  if (Ret->getNumOperands() > 0) {
+    const CallingConv::ID CC = F.getCallingConv();
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+    // Ask the calling convention where the returned value goes.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
+    CCAssignFn *RetCC = (CC == CallingConv::WebKit_JS)
+                            ? RetCC_AArch64_WebKit_JS
+                            : RetCC_AArch64_AAPCS;
+    CCInfo.AnalyzeReturn(Outs, RetCC);
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+    const Value *RV = Ret->getOperand(0);
+
+    // Don't bother handling odd stuff for now.
+    const auto LocInfo = VA.getLocInfo();
+    if (!(LocInfo == CCValAssign::Full || LocInfo == CCValAssign::BCvt))
+      return false;
+
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    const unsigned ValueReg = getRegForValue(RV);
+    if (ValueReg == 0)
+      return false;
+
+    unsigned SrcReg = ValueReg + VA.getValNo();
+    const unsigned DestReg = VA.getLocReg();
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+      return false;
+
+    const EVT RVEVT = TLI.getValueType(DL, RV->getType());
+    if (!RVEVT.isSimple())
+      return false;
+
+    // Vectors (of > 1 lane) in big endian need tricky handling.
+    if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 &&
+        !Subtarget->isLittleEndian())
+      return false;
+
+    const MVT RVVT = RVEVT.getSimpleVT();
+    if (RVVT == MVT::f128)
+      return false;
+
+    // A narrower value than the return location means a small integer that
+    // has to be extended as requested by the zeroext/signext attribute.
+    const MVT DestVT = VA.getValVT();
+    if (RVVT != DestVT) {
+      const bool IsSmallInt =
+          RVVT == MVT::i1 || RVVT == MVT::i8 || RVVT == MVT::i16;
+      if (!IsSmallInt)
+        return false;
+
+      const bool IsZExt = Outs[0].Flags.isZExt();
+      if (!IsZExt && !Outs[0].Flags.isSExt())
+        return false;
+
+      SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt);
+      if (SrcReg == 0)
+        return false;
+    }
+
+    // Make the copy.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), DestReg)
+        .addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(AArch64::RET_ReallyLR));
+  for (unsigned PhysReg : RetRegs)
+    MIB.addReg(PhysReg, RegState::Implicit);
+  return true;
+}
+
+/// Select an integer 'trunc' instruction.
+///
+/// Sources of i8..i64 and destinations of i1..i32 are accepted. A truncate
+/// from i64 to i1/i8/i16 extracts the low 32 bits and masks them; a truncate
+/// from a narrower source is a plain register copy. i64 -> i32 is left to the
+/// target-independent code (returns false).
+///
+/// \returns true if the truncate was emitted and mapped to \p I.
+bool AArch64FastISel::selectTrunc(const Instruction *I) {
+  Value *Op = I->getOperand(0);
+  Type *SrcTy = Op->getType();
+  Type *DestTy = I->getType();
+
+  const EVT SrcEVT = TLI.getValueType(DL, SrcTy, true);
+  const EVT DestEVT = TLI.getValueType(DL, DestTy, true);
+  if (!SrcEVT.isSimple() || !DestEVT.isSimple())
+    return false;
+
+  const MVT SrcVT = SrcEVT.getSimpleVT();
+  const MVT DestVT = DestEVT.getSimpleVT();
+
+  const bool SrcOK = SrcVT == MVT::i64 || SrcVT == MVT::i32 ||
+                     SrcVT == MVT::i16 || SrcVT == MVT::i8;
+  if (!SrcOK)
+    return false;
+  const bool DestOK = DestVT == MVT::i32 || DestVT == MVT::i16 ||
+                      DestVT == MVT::i8 || DestVT == MVT::i1;
+  if (!DestOK)
+    return false;
+
+  const unsigned SrcReg = getRegForValue(Op);
+  if (!SrcReg)
+    return false;
+  const bool SrcIsKill = hasTrivialKill(Op);
+
+  // If we're truncating from i64 to a smaller non-legal type then generate an
+  // AND. Otherwise, we know the high bits are undefined and a truncate only
+  // generate a COPY. We cannot mark the source register also as result
+  // register, because this can incorrectly transfer the kill flag onto the
+  // source register.
+  unsigned ResultReg;
+  if (SrcVT != MVT::i64) {
+    ResultReg = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SrcReg, getKillRegState(SrcIsKill));
+  } else {
+    uint64_t Mask;
+    switch (DestVT.SimpleTy) {
+    case MVT::i1:
+      Mask = 0x1;
+      break;
+    case MVT::i8:
+      Mask = 0xff;
+      break;
+    case MVT::i16:
+      Mask = 0xffff;
+      break;
+    default:
+      // Trunc i64 to i32 is handled by the target-independent fast-isel.
+      return false;
+    }
+    // Issue an extract_subreg to get the lower 32-bits.
+    const unsigned Low32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg,
+                                                      SrcIsKill,
+                                                      AArch64::sub_32);
+    // Create the AND instruction which performs the actual truncation.
+    ResultReg = emitAnd_ri(MVT::i32, Low32, /*IsKill=*/true, Mask);
+    assert(ResultReg && "Unexpected AND instruction emission failure.");
+  }
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// Extend an i1 held in \p SrcReg to \p DestVT (i8, i16, i32 or i64).
+///
+/// i8 and i16 destinations are produced as 32-bit values. Zero extension is
+/// an AND with 1 (plus SUBREG_TO_REG for i64); sign extension is a one-bit
+/// signed bitfield extract.
+///
+/// \returns the register holding the extended value, or 0 for the unsupported
+/// sign extension to i64.
+unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) {
+  assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
+          DestVT == MVT::i64) &&
+         "Unexpected value type.");
+  // i8 and i16 are handled as i32, so only i64 needs special treatment.
+  const bool Is64Bit = DestVT == MVT::i64;
+
+  if (!IsZExt) {
+    // FIXME: We're SExt i1 to i64.
+    if (Is64Bit)
+      return 0;
+    return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg,
+                            /*TODO:IsKill=*/false, 0, 0);
+  }
+
+  const unsigned MaskedReg =
+      emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+  assert(MaskedReg && "Unexpected AND instruction emission failure.");
+  if (!Is64Bit)
+    return MaskedReg;
+
+  // We're ZExt i1 to i64.  The ANDWri Wd, Ws, #1 implicitly clears the
+  // upper 32 bits.  Emit a SUBREG_TO_REG to extend from Wd to Xd.
+  const unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(AArch64::SUBREG_TO_REG), Reg64)
+      .addImm(0)
+      .addReg(MaskedReg)
+      .addImm(AArch64::sub_32);
+  return Reg64;
+}
+
+/// Emit an integer multiply of \p Op0 and \p Op1 as a multiply-add with the
+/// zero register as addend.
+///
+/// i8, i16 and i32 use the 32-bit form, i64 the 64-bit form.
+///
+/// \returns the result register, or 0 for any other \p RetVT.
+unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                                     unsigned Op1, bool Op1IsKill) {
+  switch (RetVT.SimpleTy) {
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    return fastEmitInst_rrr(AArch64::MADDWrrr, &AArch64::GPR32RegClass, Op0,
+                            Op0IsKill, Op1, Op1IsKill, AArch64::WZR,
+                            /*IsKill=*/true);
+  case MVT::i64:
+    return fastEmitInst_rrr(AArch64::MADDXrrr, &AArch64::GPR64RegClass, Op0,
+                            Op0IsKill, Op1, Op1IsKill, AArch64::XZR,
+                            /*IsKill=*/true);
+  default:
+    return 0;
+  }
+}
+
+/// Emit SMADDL of \p Op0 and \p Op1 with XZR as the third source operand,
+/// producing a 64-bit result. Returns 0 unless \p RetVT is i64.
+unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill) {
+ // Only a 64-bit result is supported.
+ const bool Is64BitResult = (RetVT == MVT::i64);
+ return Is64BitResult
+ ? fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass, Op0,
+ Op0IsKill, Op1, Op1IsKill, AArch64::XZR,
+ /*IsKill=*/true)
+ : 0;
+}
+
+/// Emit UMADDL of \p Op0 and \p Op1 with XZR as the third source operand,
+/// producing a 64-bit result. Returns 0 unless \p RetVT is i64.
+unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill) {
+ // Only a 64-bit result is supported.
+ const bool Is64BitResult = (RetVT == MVT::i64);
+ return Is64BitResult
+ ? fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass, Op0,
+ Op0IsKill, Op1, Op1IsKill, AArch64::XZR,
+ /*IsKill=*/true)
+ : 0;
+}
+
+/// Emit a left shift of \p Op0Reg by the amount held in \p Op1Reg. For i8 and
+/// i16 the shift amount is masked to the type width before the shift and the
+/// result is masked again afterwards. Returns 0 for any type other than
+/// i8/i16/i32/i64.
+unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill) {
+ const bool IsI8 = (RetVT == MVT::i8);
+ const bool IsI16 = (RetVT == MVT::i16);
+ const bool IsI64 = (RetVT == MVT::i64);
+ if (!IsI8 && !IsI16 && !IsI64 && RetVT != MVT::i32)
+ return 0;
+
+ // Sub-word shifts are performed in a 32-bit register and need masking.
+ const bool Narrow = IsI8 || IsI16;
+ const uint64_t NarrowMask = IsI8 ? 0xff : 0xffff;
+ const unsigned ShiftOpc = IsI64 ? AArch64::LSLVXr : AArch64::LSLVWr;
+ const TargetRegisterClass *RegClass =
+ IsI64 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ if (Narrow) {
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, NarrowMask);
+ Op1IsKill = true;
+ }
+ unsigned ShiftedReg = fastEmitInst_rr(ShiftOpc, RegClass, Op0Reg, Op0IsKill,
+ Op1Reg, Op1IsKill);
+ if (!Narrow)
+ return ShiftedReg;
+ return emitAnd_ri(MVT::i32, ShiftedReg, /*IsKill=*/true, NarrowMask);
+}
+
+/// Emit a left shift of \p Op0 by the constant \p Shift, folding an extension
+/// from \p SrcVT to \p RetVT (zero-extension if \p IsZExt, sign-extension
+/// otherwise) into a single {S|U}BFM instruction. A shift of zero becomes a
+/// COPY when the types match, or a plain emitIntExt otherwise. Returns the
+/// result register, or 0 when \p Shift is not smaller than the width of
+/// \p RetVT.
+unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+ bool Op0IsKill, uint64_t Shift,
+ bool IsZExt) {
+ assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+ "Unexpected source/return type pair.");
+ assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+ "Unexpected source value type.");
+ assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+ RetVT == MVT::i64) && "Unexpected return value type.");
+
+ bool Is64Bit = (RetVT == MVT::i64);
+ unsigned RegSize = Is64Bit ? 64 : 32;
+ unsigned DstBits = RetVT.getSizeInBits();
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ // Just emit a copy for "zero" shifts.
+ if (Shift == 0) {
+ if (RetVT == SrcVT) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ return ResultReg;
+ } else
+ return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ }
+
+ // Don't deal with undefined shifts.
+ if (Shift >= DstBits)
+ return 0;
+
+ // For immediate shifts we can fold the zero-/sign-extension into the shift.
+ // {S|U}BFM Wd, Wn, #r, #s
+ // Wd<32+s-r,32-r> = Wn<s:0> when r > s
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = shl i16 %1, 4
+ // Wd<32+7-28,32-28> = Wn<7:0> <- clamp s to 7
+ // 0b1111_1111_1111_1111__1111_1010_1010_0000 sext
+ // 0b0000_0000_0000_0000__0000_0101_0101_0000 sext | zext
+ // 0b0000_0000_0000_0000__0000_1010_1010_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = shl i16 %1, 8
+ // Wd<32+7-24,32-24> = Wn<7:0>
+ // 0b1111_1111_1111_1111__1010_1010_0000_0000 sext
+ // 0b0000_0000_0000_0000__0101_0101_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__1010_1010_0000_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = shl i16 %1, 12
+ // Wd<32+3-20,32-20> = Wn<3:0>
+ // 0b1111_1111_1111_1111__1010_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0101_0000_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__1010_0000_0000_0000 zext
+
+ unsigned ImmR = RegSize - Shift;
+ // Limit the width to the length of the source type.
+ unsigned ImmS = std::min<unsigned>(SrcBits - 1, DstBits - 1 - Shift);
+ // Indexed as [IsZExt][Is64Bit].
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::SBFMWri, AArch64::SBFMXri},
+ {AArch64::UBFMWri, AArch64::UBFMXri}
+ };
+ unsigned Opc = OpcTable[IsZExt][Is64Bit];
+ // A source of at most 32 bits is first placed into a 64-bit register with
+ // SUBREG_TO_REG so that it can feed the X-form instruction.
+ if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+ unsigned TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+ .addImm(0)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(AArch64::sub_32);
+ Op0 = TmpReg;
+ Op0IsKill = true;
+ }
+ return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
+/// Emit a logical right shift of \p Op0Reg by the amount held in \p Op1Reg.
+/// For i8 and i16 both operands are masked to the type width before the shift
+/// and the result is masked again afterwards. Returns 0 for any type other
+/// than i8/i16/i32/i64.
+unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill) {
+ const bool IsI8 = (RetVT == MVT::i8);
+ const bool IsI16 = (RetVT == MVT::i16);
+ const bool IsI64 = (RetVT == MVT::i64);
+ if (!IsI8 && !IsI16 && !IsI64 && RetVT != MVT::i32)
+ return 0;
+
+ // Sub-word shifts are performed in a 32-bit register and need masking.
+ const bool Narrow = IsI8 || IsI16;
+ const uint64_t NarrowMask = IsI8 ? 0xff : 0xffff;
+ const unsigned ShiftOpc = IsI64 ? AArch64::LSRVXr : AArch64::LSRVWr;
+ const TargetRegisterClass *RegClass =
+ IsI64 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ if (Narrow) {
+ Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, NarrowMask);
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, NarrowMask);
+ Op0IsKill = true;
+ Op1IsKill = true;
+ }
+ unsigned ShiftedReg = fastEmitInst_rr(ShiftOpc, RegClass, Op0Reg, Op0IsKill,
+ Op1Reg, Op1IsKill);
+ if (!Narrow)
+ return ShiftedReg;
+ return emitAnd_ri(MVT::i32, ShiftedReg, /*IsKill=*/true, NarrowMask);
+}
+
+/// Emit a logical right shift of \p Op0 by the constant \p Shift, where the
+/// shifted value is \p Op0 extended from \p SrcVT to \p RetVT. A zero-extension
+/// is folded into the UBFM instruction; a sign-extension is emitted separately
+/// first. A shift of zero becomes a COPY when the types match, or a plain
+/// emitIntExt otherwise. Returns the result register, or 0 when \p Shift is
+/// not smaller than the width of \p RetVT or the sign-extension fails.
+unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+ bool Op0IsKill, uint64_t Shift,
+ bool IsZExt) {
+ assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+ "Unexpected source/return type pair.");
+ assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+ "Unexpected source value type.");
+ assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+ RetVT == MVT::i64) && "Unexpected return value type.");
+
+ bool Is64Bit = (RetVT == MVT::i64);
+ unsigned RegSize = Is64Bit ? 64 : 32;
+ unsigned DstBits = RetVT.getSizeInBits();
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ // Just emit a copy for "zero" shifts.
+ if (Shift == 0) {
+ if (RetVT == SrcVT) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ return ResultReg;
+ } else
+ return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ }
+
+ // Don't deal with undefined shifts.
+ if (Shift >= DstBits)
+ return 0;
+
+ // For immediate shifts we can fold the zero-/sign-extension into the shift.
+ // {S|U}BFM Wd, Wn, #r, #s
+ // Wd<s-r:0> = Wn<s:r> when r <= s
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = lshr i16 %1, 4
+ // Wd<7-4:0> = Wn<7:4>
+ // 0b0000_0000_0000_0000__0000_1111_1111_1010 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = lshr i16 %1, 8
+ // Wd<7-7,0> = Wn<7:7>
+ // 0b0000_0000_0000_0000__0000_0000_1111_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = lshr i16 %1, 12
+ // Wd<7-7,0> = Wn<7:7> <- clamp r to 7
+ // 0b0000_0000_0000_0000__0000_0000_0000_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ // Shifting a zero-extended value by at least the source width leaves only
+ // zero bits, so materialize the constant 0 instead.
+ if (Shift >= SrcBits && IsZExt)
+ return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+ // It is not possible to fold a sign-extend into the LShr instruction. In this
+ // case emit a sign-extend.
+ if (!IsZExt) {
+ Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ if (!Op0)
+ return 0;
+ Op0IsKill = true;
+ // From here on the value is treated as a RetVT-wide value that needs no
+ // further extension.
+ SrcVT = RetVT;
+ SrcBits = SrcVT.getSizeInBits();
+ IsZExt = true;
+ }
+
+ unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+ unsigned ImmS = SrcBits - 1;
+ // Indexed as [IsZExt][Is64Bit].
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::SBFMWri, AArch64::SBFMXri},
+ {AArch64::UBFMWri, AArch64::UBFMXri}
+ };
+ unsigned Opc = OpcTable[IsZExt][Is64Bit];
+ // A source of at most 32 bits is first placed into a 64-bit register with
+ // SUBREG_TO_REG so that it can feed the X-form instruction.
+ if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+ unsigned TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+ .addImm(0)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(AArch64::sub_32);
+ Op0 = TmpReg;
+ Op0IsKill = true;
+ }
+ return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
+/// Emit an arithmetic right shift of \p Op0Reg by the amount held in
+/// \p Op1Reg. For i8 and i16 the shifted value is first sign-extended to i32,
+/// the shift amount is masked to the type width, and the result is masked
+/// again afterwards. Returns 0 for any type other than i8/i16/i32/i64.
+unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill) {
+ const bool IsI8 = (RetVT == MVT::i8);
+ const bool IsI16 = (RetVT == MVT::i16);
+ const bool IsI64 = (RetVT == MVT::i64);
+ if (!IsI8 && !IsI16 && !IsI64 && RetVT != MVT::i32)
+ return 0;
+
+ // Sub-word shifts are performed in a 32-bit register.
+ const bool Narrow = IsI8 || IsI16;
+ const uint64_t NarrowMask = IsI8 ? 0xff : 0xffff;
+ const unsigned ShiftOpc = IsI64 ? AArch64::ASRVXr : AArch64::ASRVWr;
+ const TargetRegisterClass *RegClass =
+ IsI64 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ if (Narrow) {
+ Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false);
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, NarrowMask);
+ Op0IsKill = true;
+ Op1IsKill = true;
+ }
+ unsigned ShiftedReg = fastEmitInst_rr(ShiftOpc, RegClass, Op0Reg, Op0IsKill,
+ Op1Reg, Op1IsKill);
+ if (!Narrow)
+ return ShiftedReg;
+ return emitAnd_ri(MVT::i32, ShiftedReg, /*IsKill=*/true, NarrowMask);
+}
+
+/// Emit an arithmetic right shift of \p Op0 by the constant \p Shift, folding
+/// an extension from \p SrcVT to \p RetVT (zero-extension if \p IsZExt,
+/// sign-extension otherwise) into a single {S|U}BFM instruction. A shift of
+/// zero becomes a COPY when the types match, or a plain emitIntExt otherwise.
+/// Returns the result register, or 0 when \p Shift is not smaller than the
+/// width of \p RetVT.
+unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+ bool Op0IsKill, uint64_t Shift,
+ bool IsZExt) {
+ assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+ "Unexpected source/return type pair.");
+ assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+ "Unexpected source value type.");
+ assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+ RetVT == MVT::i64) && "Unexpected return value type.");
+
+ bool Is64Bit = (RetVT == MVT::i64);
+ unsigned RegSize = Is64Bit ? 64 : 32;
+ unsigned DstBits = RetVT.getSizeInBits();
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ // Just emit a copy for "zero" shifts.
+ if (Shift == 0) {
+ if (RetVT == SrcVT) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ return ResultReg;
+ } else
+ return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ }
+
+ // Don't deal with undefined shifts.
+ if (Shift >= DstBits)
+ return 0;
+
+ // For immediate shifts we can fold the zero-/sign-extension into the shift.
+ // {S|U}BFM Wd, Wn, #r, #s
+ // Wd<s-r:0> = Wn<s:r> when r <= s
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = ashr i16 %1, 4
+ // Wd<7-4:0> = Wn<7:4>
+ // 0b1111_1111_1111_1111__1111_1111_1111_1010 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = ashr i16 %1, 8
+ // Wd<7-7,0> = Wn<7:7>
+ // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = ashr i16 %1, 12
+ // Wd<7-7,0> = Wn<7:7> <- clamp r to 7
+ // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ // Shifting a zero-extended value by at least the source width leaves only
+ // zero bits, so materialize the constant 0 instead.
+ if (Shift >= SrcBits && IsZExt)
+ return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+ unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+ unsigned ImmS = SrcBits - 1;
+ // Indexed as [IsZExt][Is64Bit].
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::SBFMWri, AArch64::SBFMXri},
+ {AArch64::UBFMWri, AArch64::UBFMXri}
+ };
+ unsigned Opc = OpcTable[IsZExt][Is64Bit];
+ // A source of at most 32 bits is first placed into a 64-bit register with
+ // SUBREG_TO_REG so that it can feed the X-form instruction.
+ if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+ unsigned TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+ .addImm(0)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(AArch64::sub_32);
+ Op0 = TmpReg;
+ Op0IsKill = true;
+ }
+ return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
+/// Emit a zero- or sign-extension of \p SrcReg from \p SrcVT to \p DestVT
+/// using a {U|S}BFM instruction (i1 sources are delegated to emiti1Ext).
+/// Returns the result register, or 0 when the type pair is not supported.
+unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ bool IsZExt) {
+ assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
+
+ // FastISel does not have plumbing to deal with extensions where the SrcVT or
+ // DestVT are odd things, so test to make sure that they are both types we can
+ // handle (i1/i8/i16/i32 for SrcVT and i8/i16/i32/i64 for DestVT), otherwise
+ // bail out to SelectionDAG.
+ if (((DestVT != MVT::i8) && (DestVT != MVT::i16) &&
+ (DestVT != MVT::i32) && (DestVT != MVT::i64)) ||
+ ((SrcVT != MVT::i1) && (SrcVT != MVT::i8) &&
+ (SrcVT != MVT::i16) && (SrcVT != MVT::i32)))
+ return 0;
+
+ unsigned Opc;
+ // Index of the most significant source bit; passed as the last immediate of
+ // the bitfield-move instruction below.
+ unsigned Imm = 0;
+
+ switch (SrcVT.SimpleTy) {
+ default:
+ return 0;
+ case MVT::i1:
+ return emiti1Ext(SrcReg, DestVT, IsZExt);
+ case MVT::i8:
+ if (DestVT == MVT::i64)
+ Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ else
+ Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+ Imm = 7;
+ break;
+ case MVT::i16:
+ if (DestVT == MVT::i64)
+ Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ else
+ Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+ Imm = 15;
+ break;
+ case MVT::i32:
+ assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
+ Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ Imm = 31;
+ break;
+ }
+
+ // Handle i8 and i16 as i32.
+ if (DestVT == MVT::i8 || DestVT == MVT::i16)
+ DestVT = MVT::i32;
+ else if (DestVT == MVT::i64) {
+ // The X-form instruction needs a 64-bit source register, so wrap the 32-bit
+ // source with SUBREG_TO_REG first.
+ unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), Src64)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(AArch64::sub_32);
+ SrcReg = Src64;
+ }
+
+ const TargetRegisterClass *RC =
+ (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm);
+}
+
+/// Returns true if the opcode of \p LI is one of the LDUR/LDR BB, HH or W load
+/// forms listed below, which this file treats as zero-extending loads.
+static bool isZExtLoad(const MachineInstr *LI) {
+ const unsigned Opc = LI->getOpcode();
+ switch (Opc) {
+ // LDUR* forms.
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURWi:
+ // LDR*ui forms.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRWui:
+ // LDR*roX forms.
+ case AArch64::LDRBBroX:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRWroX:
+ // LDR*roW forms.
+ case AArch64::LDRBBroW:
+ case AArch64::LDRHHroW:
+ case AArch64::LDRWroW:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Returns true if the opcode of \p LI is one of the LDURS/LDRS load forms
+/// listed below, which this file treats as sign-extending loads.
+static bool isSExtLoad(const MachineInstr *LI) {
+ const unsigned Opc = LI->getOpcode();
+ switch (Opc) {
+ // LDURS* forms.
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSWi:
+ // LDRS*ui forms.
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSWui:
+ // LDRS*roX forms.
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSWroX:
+ // LDRS*roW forms.
+ case AArch64::LDRSBWroW:
+ case AArch64::LDRSHWroW:
+ case AArch64::LDRSBXroW:
+ case AArch64::LDRSHXroW:
+ case AArch64::LDRSWroW:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Try to reuse the result of an already selected load as the result of the
+/// zero-/sign-extend instruction \p I. This succeeds only when operand 0 of
+/// \p I is a single-use load that already has a register, and the machine
+/// instruction defining that register is a load whose extension kind matches
+/// \p I. Returns true if \p I was mapped to a register, false if the caller
+/// must emit the extension itself.
+bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
+ MVT SrcVT) {
+ const auto *LI = dyn_cast<LoadInst>(I->getOperand(0));
+ if (!LI || !LI->hasOneUse())
+ return false;
+
+ // Check if the load instruction has already been selected.
+ unsigned Reg = lookUpRegForValue(LI);
+ if (!Reg)
+ return false;
+
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ if (!MI)
+ return false;
+
+ // Check if the correct load instruction has been emitted - SelectionDAG might
+ // have emitted a zero-extending load, but we need a sign-extending load.
+ bool IsZExt = isa<ZExtInst>(I);
+ const auto *LoadMI = MI;
+ // Look through a COPY of the sub_32 sub-register to find the defining load.
+ if (LoadMI->getOpcode() == TargetOpcode::COPY &&
+ LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) {
+ unsigned LoadReg = MI->getOperand(1).getReg();
+ LoadMI = MRI.getUniqueVRegDef(LoadReg);
+ assert(LoadMI && "Expected valid instruction");
+ }
+ if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI)))
+ return false;
+
+ // Nothing to be done.
+ if (RetVT != MVT::i64 || SrcVT > MVT::i32) {
+ updateValueMap(I, Reg);
+ return true;
+ }
+
+ if (IsZExt) {
+ // Widen the 32-bit load result to a 64-bit register with SUBREG_TO_REG.
+ unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), Reg64)
+ .addImm(0)
+ .addReg(Reg, getKillRegState(true))
+ .addImm(AArch64::sub_32);
+ Reg = Reg64;
+ } else {
+ // Use the source register of the sub_32 COPY directly and delete the COPY.
+ assert((MI->getOpcode() == TargetOpcode::COPY &&
+ MI->getOperand(1).getSubReg() == AArch64::sub_32) &&
+ "Expected copy instruction");
+ Reg = MI->getOperand(1).getReg();
+ MI->eraseFromParent();
+ }
+ updateValueMap(I, Reg);
+ return true;
+}
+
+/// Select a zext or sext instruction. Extensions of matching extending loads
+/// and of function arguments that already carry the matching zeroext/signext
+/// attribute are turned into (at most) a SUBREG_TO_REG; everything else goes
+/// through emitIntExt. Returns false if the instruction cannot be selected.
+bool AArch64FastISel::selectIntExt(const Instruction *I) {
+ assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+ "Unexpected integer extend instruction.");
+ MVT RetVT;
+ MVT SrcVT;
+ if (!isTypeSupported(I->getType(), RetVT))
+ return false;
+
+ if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT))
+ return false;
+
+ // Try to optimize already sign-/zero-extended values from load instructions.
+ if (optimizeIntExtLoad(I, RetVT, SrcVT))
+ return true;
+
+ unsigned SrcReg = getRegForValue(I->getOperand(0));
+ if (!SrcReg)
+ return false;
+ bool SrcIsKill = hasTrivialKill(I->getOperand(0));
+
+ // Try to optimize already sign-/zero-extended values from function arguments.
+ bool IsZExt = isa<ZExtInst>(I);
+ if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) {
+ if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) {
+ if (RetVT == MVT::i64 && SrcVT != MVT::i64) {
+ unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), ResultReg)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(SrcIsKill))
+ .addImm(AArch64::sub_32);
+ SrcReg = ResultReg;
+ }
+ // Conservatively clear all kill flags from all uses, because we are
+ // replacing a sign-/zero-extend instruction at IR level with a nop at MI
+ // level. The result of the instruction at IR level might have been
+ // trivially dead, which is no longer true.
+ unsigned UseReg = lookUpRegForValue(I);
+ if (UseReg)
+ MRI.clearKillFlags(UseReg);
+
+ updateValueMap(I, SrcReg);
+ return true;
+ }
+ }
+
+ unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+/// Select an i32 or i64 srem/urem (chosen by \p ISDOpcode) as a divide
+/// followed by an MSUB. Returns false for any other type or opcode, or when an
+/// operand has no register.
+bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) {
+ EVT DestEVT = TLI.getValueType(DL, I->getType(), true);
+ if (!DestEVT.isSimple())
+ return false;
+
+ const MVT DestVT = DestEVT.getSimpleVT();
+ const bool Is64bit = (DestVT == MVT::i64);
+ if (!Is64bit && DestVT != MVT::i32)
+ return false;
+
+ // Pick the signed or unsigned divide of the right width.
+ unsigned DivOpc;
+ if (ISDOpcode == ISD::SREM)
+ DivOpc = Is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
+ else if (ISDOpcode == ISD::UREM)
+ DivOpc = Is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
+ else
+ return false;
+
+ const unsigned MSubOpc = Is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
+ const TargetRegisterClass *RC =
+ Is64bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ const Value *Numerator = I->getOperand(0);
+ const Value *Denominator = I->getOperand(1);
+
+ unsigned Src0Reg = getRegForValue(Numerator);
+ if (!Src0Reg)
+ return false;
+ bool Src0IsKill = hasTrivialKill(Numerator);
+
+ unsigned Src1Reg = getRegForValue(Denominator);
+ if (!Src1Reg)
+ return false;
+ bool Src1IsKill = hasTrivialKill(Denominator);
+
+ // Both sources are used again by the MSUB, so the divide must not kill them.
+ unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false,
+ Src1Reg, /*IsKill=*/false);
+ assert(QuotReg && "Unexpected DIV instruction emission failure.");
+ // The remainder is computed as numerator - (quotient * denominator) using the
+ // MSUB instruction.
+ unsigned RemReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true,
+ Src1Reg, Src1IsKill, Src0Reg,
+ Src0IsKill);
+ updateValueMap(I, RemReg);
+ return true;
+}
+
+/// Select a mul instruction. Vector multiplies are forwarded to
+/// selectBinaryOp. A scalar multiply by a power-of-two constant is emitted as
+/// an immediate left shift (folding a zext/sext of the other operand into the
+/// shift where possible); everything else is emitted with emitMul_rr. Returns
+/// false if the instruction cannot be selected.
+bool AArch64FastISel::selectMul(const Instruction *I) {
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+ return false;
+
+ if (VT.isVector())
+ return selectBinaryOp(I, ISD::MUL);
+
+ // Canonicalize a power-of-two constant to the right-hand side.
+ const Value *Src0 = I->getOperand(0);
+ const Value *Src1 = I->getOperand(1);
+ if (const auto *C = dyn_cast<ConstantInt>(Src0))
+ if (C->getValue().isPowerOf2())
+ std::swap(Src0, Src1);
+
+ // Try to simplify to a shift instruction.
+ if (const auto *C = dyn_cast<ConstantInt>(Src1))
+ if (C->getValue().isPowerOf2()) {
+ uint64_t ShiftVal = C->getValue().logBase2();
+ MVT SrcVT = VT;
+ bool IsZExt = true;
+ // If the shifted value is itself an extension that is not free, shift the
+ // un-extended source and let emitLSL_ri fold the extension. TmpVT is a
+ // scratch output distinct from the result type VT.
+ if (const auto *ZExt = dyn_cast<ZExtInst>(Src0)) {
+ if (!isIntExtFree(ZExt)) {
+ MVT TmpVT;
+ if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) {
+ SrcVT = TmpVT;
+ IsZExt = true;
+ Src0 = ZExt->getOperand(0);
+ }
+ }
+ } else if (const auto *SExt = dyn_cast<SExtInst>(Src0)) {
+ if (!isIntExtFree(SExt)) {
+ MVT TmpVT;
+ if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) {
+ SrcVT = TmpVT;
+ IsZExt = false;
+ Src0 = SExt->getOperand(0);
+ }
+ }
+ }
+
+ unsigned Src0Reg = getRegForValue(Src0);
+ if (!Src0Reg)
+ return false;
+ bool Src0IsKill = hasTrivialKill(Src0);
+
+ unsigned ResultReg =
+ emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt);
+
+ // On failure fall through to the generic multiply below.
+ if (ResultReg) {
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ if (!Src0Reg)
+ return false;
+ bool Src0IsKill = hasTrivialKill(I->getOperand(0));
+
+ unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ if (!Src1Reg)
+ return false;
+ bool Src1IsKill = hasTrivialKill(I->getOperand(1));
+
+ unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill);
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+/// Select a shl, lshr or ashr instruction. Vector shifts are forwarded to
+/// selectOperator. A shift by a constant uses the emit{LSL|LSR|ASR}_ri helpers
+/// (folding a zext/sext of the shifted value where possible); a shift by a
+/// register uses the corresponding _rr helpers. Returns false if the
+/// instruction cannot be selected.
+bool AArch64FastISel::selectShift(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeSupported(I->getType(), RetVT, /*IsVectorAllowed=*/true))
+ return false;
+
+ if (RetVT.isVector())
+ return selectOperator(I, I->getOpcode());
+
+ if (const auto *C = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ unsigned ResultReg = 0;
+ uint64_t ShiftVal = C->getZExtValue();
+ MVT SrcVT = RetVT;
+ bool IsZExt = I->getOpcode() != Instruction::AShr;
+ const Value *Op0 = I->getOperand(0);
+ // If the shifted value is itself an extension that is not free, shift the
+ // un-extended source and let the _ri helper fold the extension.
+ if (const auto *ZExt = dyn_cast<ZExtInst>(Op0)) {
+ if (!isIntExtFree(ZExt)) {
+ MVT TmpVT;
+ if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) {
+ SrcVT = TmpVT;
+ IsZExt = true;
+ Op0 = ZExt->getOperand(0);
+ }
+ }
+ } else if (const auto *SExt = dyn_cast<SExtInst>(Op0)) {
+ if (!isIntExtFree(SExt)) {
+ MVT TmpVT;
+ if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) {
+ SrcVT = TmpVT;
+ IsZExt = false;
+ Op0 = SExt->getOperand(0);
+ }
+ }
+ }
+
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(Op0);
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected instruction.");
+ case Instruction::Shl:
+ ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ break;
+ case Instruction::AShr:
+ ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ break;
+ case Instruction::LShr:
+ ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ break;
+ }
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ // The shift amount is not a constant: shift by register.
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (!Op1Reg)
+ return false;
+ bool Op1IsKill = hasTrivialKill(I->getOperand(1));
+
+ unsigned ResultReg = 0;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected instruction.");
+ case Instruction::Shl:
+ ResultReg = emitLSL_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ break;
+ case Instruction::AShr:
+ ResultReg = emitASR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ break;
+ case Instruction::LShr:
+ ResultReg = emitLSR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ break;
+ }
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+/// Select a bitcast between an integer and a floating-point type of the same
+/// width (i32<->f32, i64<->f64) as a single FMOV. Returns false for any other
+/// type pair or when the operand has no register.
+bool AArch64FastISel::selectBitCast(const Instruction *I) {
+ const Value *Src = I->getOperand(0);
+
+ MVT RetVT, SrcVT;
+ if (!isTypeLegal(Src->getType(), SrcVT))
+ return false;
+ if (!isTypeLegal(I->getType(), RetVT))
+ return false;
+
+ // Choose the FMOV variant for the source/result pair.
+ unsigned Opc;
+ if (SrcVT == MVT::i32 && RetVT == MVT::f32)
+ Opc = AArch64::FMOVWSr;
+ else if (SrcVT == MVT::i64 && RetVT == MVT::f64)
+ Opc = AArch64::FMOVXDr;
+ else if (SrcVT == MVT::f32 && RetVT == MVT::i32)
+ Opc = AArch64::FMOVSWr;
+ else if (SrcVT == MVT::f64 && RetVT == MVT::i64)
+ Opc = AArch64::FMOVDXr;
+ else
+ return false;
+
+ // The result register class follows the result type.
+ const TargetRegisterClass *RC = nullptr;
+ switch (RetVT.SimpleTy) {
+ case MVT::f32:
+ RC = &AArch64::FPR32RegClass;
+ break;
+ case MVT::f64:
+ RC = &AArch64::FPR64RegClass;
+ break;
+ case MVT::i32:
+ RC = &AArch64::GPR32RegClass;
+ break;
+ case MVT::i64:
+ RC = &AArch64::GPR64RegClass;
+ break;
+ default:
+ llvm_unreachable("Unexpected value type.");
+ }
+
+ unsigned SrcReg = getRegForValue(Src);
+ if (!SrcReg)
+ return false;
+ bool SrcIsKill = hasTrivialKill(Src);
+
+ unsigned ResultReg = fastEmitInst_r(Opc, RC, SrcReg, SrcIsKill);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+/// Select an f32 or f64 frem by lowering it to a call to the REM_F32/REM_F64
+/// runtime library routine. Returns false for any other type or when the call
+/// cannot be lowered.
+bool AArch64FastISel::selectFRem(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeLegal(I->getType(), RetVT))
+ return false;
+
+ // Only f32 and f64 have a libcall selected here.
+ RTLIB::Libcall LC;
+ if (RetVT == MVT::f32)
+ LC = RTLIB::REM_F32;
+ else if (RetVT == MVT::f64)
+ LC = RTLIB::REM_F64;
+ else
+ return false;
+
+ // Forward every operand of the instruction as a call argument.
+ ArgListTy Args;
+ Args.reserve(I->getNumOperands());
+ for (auto &Operand : I->operands()) {
+ ArgListEntry Entry;
+ Entry.Val = Operand;
+ Entry.Ty = Operand->getType();
+ Args.push_back(Entry);
+ }
+
+ CallLoweringInfo CLI;
+ CLI.setCallee(DL, MF->getContext(), TLI.getLibcallCallingConv(LC),
+ I->getType(), TLI.getLibcallName(LC), std::move(Args));
+ if (!lowerCallTo(CLI))
+ return false;
+
+ updateValueMap(I, CLI.ResultReg);
+ return true;
+}
+
+/// Select an sdiv instruction. An i32/i64 division by a non-zero constant C
+/// where C or -C is a power of two is emitted as an arithmetic shift (with a
+/// rounding fix-up for negative dividends unless the division is exact);
+/// every other case is forwarded to selectBinaryOp. Returns false if the
+/// instruction cannot be selected.
+bool AArch64FastISel::selectSDiv(const Instruction *I) {
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ if (!isa<ConstantInt>(I->getOperand(1)))
+ return selectBinaryOp(I, ISD::SDIV);
+
+ const APInt &C = cast<ConstantInt>(I->getOperand(1))->getValue();
+ if ((VT != MVT::i32 && VT != MVT::i64) || !C ||
+ !(C.isPowerOf2() || (-C).isPowerOf2()))
+ return selectBinaryOp(I, ISD::SDIV);
+
+ // log2 of the magnitude of the divisor.
+ unsigned Lg2 = C.countTrailingZeros();
+ unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ if (!Src0Reg)
+ return false;
+ bool Src0IsKill = hasTrivialKill(I->getOperand(0));
+
+ // An exact division needs no rounding fix-up: emit the shift directly.
+ if (cast<BinaryOperator>(I)->isExact()) {
+ unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2);
+ if (!ResultReg)
+ return false;
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ // Src0 + (Pow2 - 1); Src0 is still needed by the compare and select below,
+ // so it is not killed here.
+ int64_t Pow2MinusOne = (1ULL << Lg2) - 1;
+ unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne);
+ if (!AddReg)
+ return false;
+
+ // (Src0 < 0) ? Pow2 - 1 : 0;
+ if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0))
+ return false;
+
+ unsigned SelectOpc;
+ const TargetRegisterClass *RC;
+ if (VT == MVT::i64) {
+ SelectOpc = AArch64::CSELXr;
+ RC = &AArch64::GPR64RegClass;
+ } else {
+ SelectOpc = AArch64::CSELWr;
+ RC = &AArch64::GPR32RegClass;
+ }
+ // Pick the biased value when Src0 is negative (LT), the original otherwise.
+ unsigned SelectReg =
+ fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg,
+ Src0IsKill, AArch64CC::LT);
+ if (!SelectReg)
+ return false;
+
+ // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also
+ // negate the result.
+ unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+ unsigned ResultReg;
+ if (C.isNegative())
+ ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true,
+ SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2);
+ else
+ ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2);
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+/// AArch64 version of the generic FastISel getRegForGEPIndex; it is duplicated
+/// here because the generic code would fail during the sign-extend emission.
+/// Returns the register holding \p Idx (sign-extended to pointer width if it is
+/// narrower) together with its kill flag, or {0, false} if \p Idx has no
+/// register.
+std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
+ unsigned IndexReg = getRegForValue(Idx);
+ // Unhandled operand. Halt "fast" selection and bail.
+ if (!IndexReg)
+ return {0, false};
+
+ bool IndexIsKill = hasTrivialKill(Idx);
+
+ // If the index is smaller or larger than intptr_t, truncate or extend it.
+ MVT PtrVT = TLI.getPointerTy(DL);
+ EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
+ if (IdxVT.bitsGT(PtrVT))
+ llvm_unreachable("AArch64 FastISel doesn't support types larger than i64");
+ if (IdxVT.bitsLT(PtrVT)) {
+ IndexReg =
+ emitIntExt(IdxVT.getSimpleVT(), IndexReg, PtrVT, /*IsZExt=*/false);
+ IndexIsKill = true;
+ }
+ return {IndexReg, IndexIsKill};
+}
+
+/// This is mostly a copy of the existing FastISel GEP code, but we have to
+/// duplicate it for AArch64, because otherwise we would bail out even for
+/// simple cases. This is because the standard fastEmit functions don't cover
+/// MUL at all and ADD is lowered very inefficiently.
+///
+/// Constant struct-field and array offsets are accumulated and added with a
+/// single emitAdd_ri_; each variable index is scaled by its element size and
+/// added separately. Returns false if any operand or emitted instruction
+/// fails.
+bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
+ unsigned N = getRegForValue(I->getOperand(0));
+ if (!N)
+ return false;
+ bool NIsKill = hasTrivialKill(I->getOperand(0));
+
+ // Keep a running tab of the total offset to coalesce multiple N = N + Offset
+ // into a single N = N + TotalOffset.
+ uint64_t TotalOffs = 0;
+ MVT VT = TLI.getPointerTy(DL);
+ for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (auto *StTy = GTI.getStructTypeOrNull()) {
+ unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+ // N = N + Offset
+ if (Field)
+ TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
+ } else {
+ Type *Ty = GTI.getIndexedType();
+
+ // If this is a constant subscript, handle it quickly.
+ if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+ if (CI->isZero())
+ continue;
+ // N = N + Offset
+ TotalOffs += DL.getTypeAllocSize(Ty) * CI->getSExtValue();
+ continue;
+ }
+ // Flush the accumulated constant offset before adding a variable index.
+ if (TotalOffs) {
+ N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+ if (!N)
+ return false;
+ NIsKill = true;
+ TotalOffs = 0;
+ }
+
+ // N = N + Idx * ElementSize;
+ uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+ std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
+ unsigned IdxN = Pair.first;
+ bool IdxNIsKill = Pair.second;
+ if (!IdxN)
+ return false;
+
+ if (ElementSize != 1) {
+ unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize);
+ if (!C)
+ return false;
+ IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true);
+ if (!IdxN)
+ return false;
+ IdxNIsKill = true;
+ }
+ N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
+ if (!N)
+ return false;
+ }
+ }
+ // Add whatever constant offset is still pending.
+ if (TotalOffs) {
+ N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+ if (!N)
+ return false;
+ }
+ updateValueMap(I, N);
+ return true;
+}
+
+/// Select an i32 or i64 cmpxchg instruction. Emits the CMP_SWAP_32/64
+/// pseudo-instruction, then a SUBS of the loaded value against the expected
+/// value and a CSINC to produce the i1 status, and maps the two results to
+/// consecutive registers. Returns false for any other value type.
+bool AArch64FastISel::selectAtomicCmpXchg(const AtomicCmpXchgInst *I) {
+ assert(TM.getOptLevel() == CodeGenOpt::None &&
+ "cmpxchg survived AtomicExpand at optlevel > -O0");
+
+ auto *RetPairTy = cast<StructType>(I->getType());
+ Type *RetTy = RetPairTy->getTypeAtIndex(0U);
+ assert(RetPairTy->getTypeAtIndex(1U)->isIntegerTy(1) &&
+ "cmpxchg has a non-i1 status result");
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ const TargetRegisterClass *ResRC;
+ unsigned Opc, CmpOpc;
+ // This only supports i32/i64, because i8/i16 aren't legal, and the generic
+ // extractvalue selection doesn't support that.
+ if (VT == MVT::i32) {
+ Opc = AArch64::CMP_SWAP_32;
+ CmpOpc = AArch64::SUBSWrs;
+ ResRC = &AArch64::GPR32RegClass;
+ } else if (VT == MVT::i64) {
+ Opc = AArch64::CMP_SWAP_64;
+ CmpOpc = AArch64::SUBSXrs;
+ ResRC = &AArch64::GPR64RegClass;
+ } else {
+ return false;
+ }
+
+ const MCInstrDesc &II = TII.get(Opc);
+
+ // The three source operands follow the defs of the pseudo-instruction.
+ // NOTE(review): the getRegForValue results are not checked for 0 before
+ // being constrained - confirm this cannot fail for these operands.
+ const unsigned AddrReg = constrainOperandRegClass(
+ II, getRegForValue(I->getPointerOperand()), II.getNumDefs());
+ const unsigned DesiredReg = constrainOperandRegClass(
+ II, getRegForValue(I->getCompareOperand()), II.getNumDefs() + 1);
+ const unsigned NewReg = constrainOperandRegClass(
+ II, getRegForValue(I->getNewValOperand()), II.getNumDefs() + 2);
+
+ // ResultReg1 receives the loaded value and ResultReg2 the status; they are
+ // created back to back because both are mapped to I together below.
+ const unsigned ResultReg1 = createResultReg(ResRC);
+ const unsigned ResultReg2 = createResultReg(&AArch64::GPR32RegClass);
+ const unsigned ScratchReg = createResultReg(&AArch64::GPR32RegClass);
+
+ // FIXME: MachineMemOperand doesn't support cmpxchg yet.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addDef(ResultReg1)
+ .addDef(ScratchReg)
+ .addUse(AddrReg)
+ .addUse(DesiredReg)
+ .addUse(NewReg);
+
+ // Compare the loaded value with the expected one; the result is written to
+ // the zero register.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addDef(VT == MVT::i32 ? AArch64::WZR : AArch64::XZR)
+ .addUse(ResultReg1)
+ .addUse(DesiredReg)
+ .addImm(0);
+
+ // CSINC of WZR, WZR on NE: produces the status value in ResultReg2.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr))
+ .addDef(ResultReg2)
+ .addUse(AArch64::WZR)
+ .addUse(AArch64::WZR)
+ .addImm(AArch64CC::NE);
+
+ assert((ResultReg1 + 1) == ResultReg2 && "Nonconsecutive result registers.");
+ updateValueMap(I, ResultReg1, 2);
+ return true;
+}
+
+/// Dispatch \p I to the AArch64-specific selection routine for its opcode.
+/// Opcodes without a dedicated case are handed to selectOperator().
+///
+/// \returns true if the instruction was selected.
+bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
+  const unsigned Opcode = I->getOpcode();
+
+  switch (Opcode) {
+  // Integer arithmetic.
+  case Instruction::Add:
+  case Instruction::Sub:
+    return selectAddSub(I);
+  case Instruction::Mul:
+    return selectMul(I);
+  case Instruction::SDiv:
+    return selectSDiv(I);
+  // For the remainders the generic binary-op path is tried first; the custom
+  // routine only runs when that path reports failure.
+  case Instruction::SRem:
+    return selectBinaryOp(I, ISD::SREM) || selectRem(I, ISD::SREM);
+  case Instruction::URem:
+    return selectBinaryOp(I, ISD::UREM) || selectRem(I, ISD::UREM);
+
+  // Shifts and bitwise logic.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return selectShift(I);
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return selectLogicalOp(I);
+
+  // Control flow.
+  case Instruction::Br:
+    return selectBranch(I);
+  case Instruction::IndirectBr:
+    return selectIndirectBr(I);
+  case Instruction::Ret:
+    return selectRet(I);
+
+  // Casts and conversions; where a generic routine exists it gets the first
+  // attempt and the target routine is the fallback.
+  case Instruction::BitCast:
+    return FastISel::selectBitCast(I) || selectBitCast(I);
+  case Instruction::FPToSI:
+    return selectCast(I, ISD::FP_TO_SINT) ||
+           selectFPToInt(I, /*Signed=*/true);
+  case Instruction::FPToUI:
+    return selectFPToInt(I, /*Signed=*/false);
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return selectIntExt(I);
+  case Instruction::Trunc:
+    return selectCast(I, ISD::TRUNCATE) || selectTrunc(I);
+  case Instruction::FPExt:
+    return selectFPExt(I);
+  case Instruction::FPTrunc:
+    return selectFPTrunc(I);
+  case Instruction::SIToFP:
+    return selectCast(I, ISD::SINT_TO_FP) ||
+           selectIntToFP(I, /*Signed=*/true);
+  case Instruction::UIToFP:
+    return selectIntToFP(I, /*Signed=*/false);
+
+  // Memory.
+  case Instruction::Load:
+    return selectLoad(I);
+  case Instruction::Store:
+    return selectStore(I);
+  case Instruction::GetElementPtr:
+    return selectGetElementPtr(I);
+  case Instruction::AtomicCmpXchg:
+    return selectAtomicCmpXchg(cast<AtomicCmpXchgInst>(I));
+
+  // Comparisons, selects and the remaining FP operation.
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+    return selectCmp(I);
+  case Instruction::Select:
+    return selectSelect(I);
+  case Instruction::FRem:
+    return selectFRem(I);
+
+  default:
+    break;
+  }
+
+  // fall-back to target-independent instruction selection.
+  return selectOperator(I, Opcode);
+  // Silence warnings. (Deliberately placed after the return: the statement
+  // only has to reference the symbol, never to execute.)
+  (void)&CC_AArch64_DarwinPCS_VarArg;
+}
+
+namespace llvm {
+// Factory for the AArch64 fast instruction selector: returns a newly
+// heap-allocated AArch64FastISel constructed from FuncInfo and LibInfo.
+// The object is allocated with plain `new`; this function does not free it.
+llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo) {
+ return new AArch64FastISel(FuncInfo, LibInfo);
+}
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
new file mode 100644
index 000000000000..f5b8c35375f8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -0,0 +1,1200 @@
+//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of TargetFrameLowering class.
+//
+// On AArch64, stack frames are structured as follows:
+//
+// The stack grows downward.
+//
+// All of the individual frame areas on the frame below are optional, i.e. it's
+// possible to create a function so that the particular area isn't present
+// in the frame.
+//
+// At function entry, the "frame" looks as follows:
+//
+// | | Higher address
+// |-----------------------------------|
+// | |
+// | arguments passed on the stack |
+// | |
+// |-----------------------------------| <- sp
+// | | Lower address
+//
+//
+// After the prologue has run, the frame has the following general structure.
+// Note that this doesn't depict the case where a red-zone is used. Also,
+// technically the last frame area (VLAs) doesn't get created until in the
+// main function body, after the prologue is run. However, it's depicted here
+// for completeness.
+//
+// | | Higher address
+// |-----------------------------------|
+// | |
+// | arguments passed on the stack |
+// | |
+// |-----------------------------------|
+// | |
+// | prev_fp, prev_lr |
+// | (a.k.a. "frame record") |
+// |-----------------------------------| <- fp(=x29)
+// | |
+// | other callee-saved registers |
+// | |
+// |-----------------------------------|
+// |.empty.space.to.make.part.below....|
+// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
+// |.the.standard.16-byte.alignment....| compile time; if present)
+// |-----------------------------------|
+// | |
+// | local variables of fixed size |
+// | including spill slots |
+// |-----------------------------------| <- bp(not defined by ABI,
+// |.variable-sized.local.variables....| LLVM chooses X19)
+// |.(VLAs)............................| (size of this area is unknown at
+// |...................................| compile time)
+// |-----------------------------------| <- sp
+// | | Lower address
+//
+//
+// To access data in the frame, its offset from one of the pointers (fp, bp,
+// sp) must be a constant that is computable at compile time. The size of the
+// areas with a dotted background cannot be computed at compile time if they
+// are present, so all three of fp, bp and sp must be set up in order to
+// access the contents of every frame area, assuming all of the frame areas
+// are non-empty.
+//
+// For most functions, some of the frame areas are empty. For those functions,
+// it may not be necessary to set up fp or bp:
+// * A base pointer is definitely needed when there are both VLAs and local
+// variables with more-than-default alignment requirements.
+// * A frame pointer is definitely needed when there are local variables with
+// more-than-default alignment requirements.
+//
+// In some cases when a base pointer is not strictly needed, it is generated
+// anyway when offsets from the frame pointer to access local variables become
+// so large that the offset can't be encoded in the immediate fields of loads
+// or stores.
+//
+// FIXME: also explain the redzone concept.
+// FIXME: also explain the concept of reserved call frames.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64FrameLowering.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "frame-info"
+
+// Hidden command-line flag "aarch64-redzone"; defaults to false. It is the
+// first condition checked by canUseRedZone().
+static cl::opt<bool> EnableRedZone("aarch64-redzone",
+ cl::desc("enable use of redzone on AArch64"),
+ cl::init(false), cl::Hidden);
+
+// Statistic counter; incremented in emitPrologue when a function without a
+// stack frame keeps its locals in the red zone.
+STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+
+/// Returns true if \p MF may leave its locals in the red zone: the feature
+/// must be enabled on the command line, the function must not carry the
+/// NoRedZone attribute, it must make no calls and need no frame pointer, and
+/// its local stack size must not exceed 128 bytes.
+bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
+  // The red zone is opt-in.
+  if (!EnableRedZone)
+    return false;
+
+  // Don't use the red zone if the function explicitly asks us not to.
+  // This is typically used for kernel code.
+  if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone))
+    return false;
+
+  // Functions that make calls or need a frame pointer are excluded.
+  if (MF.getFrameInfo().hasCalls() || hasFP(MF))
+    return false;
+
+  // Finally, the locals have to fit into 128 bytes.
+  const unsigned LocalBytes =
+      MF.getInfo<AArch64FunctionInfo>()->getLocalStackSize();
+  return LocalBytes <= 128;
+}
+
+/// hasFP - Decide whether \p MF needs a dedicated frame pointer register.
+/// Any one of the conditions checked below is sufficient.
+bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+  // Frame-pointer elimination disabled. This is only honoured for functions
+  // that make calls, which retains the behavior of always omitting the FP for
+  // leaf functions when possible.
+  if (FrameInfo.hasCalls() &&
+      MF.getTarget().Options.DisableFramePointerElim(MF))
+    return true;
+
+  // Variable-sized objects, or the frame address being taken.
+  if (FrameInfo.hasVarSizedObjects() || FrameInfo.isFrameAddressTaken())
+    return true;
+
+  // Stack maps and patch points.
+  if (FrameInfo.hasStackMap() || FrameInfo.hasPatchPoint())
+    return true;
+
+  // Otherwise an FP is needed only if the stack has to be realigned.
+  return MF.getSubtarget().getRegisterInfo()->needsStackRealignment(MF);
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function. This eliminates the need for
+/// add/sub sp brackets around call sites. Returns true if the call frame is
+/// included as part of the stack frame.
+///
+/// In this implementation the call frame is reserved exactly when the
+/// function has no variable-sized stack objects.
+bool
+AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+/// Replace the call-frame setup/destroy pseudo instruction at \p I with the
+/// real SP adjustment it stands for (if any) and erase the pseudo.
+///
+/// - Without a reserved call frame, SP is moved by the aligned call-frame
+///   size (down for setup, up for destroy), unless the callee pops a non-zero
+///   amount itself.
+/// - With a reserved call frame, only a non-zero callee-pop amount needs
+///   compensating: SP is moved back down by that amount.
+///
+/// \returns the iterator produced by erasing \p I from \p MBB.
+MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  DebugLoc DL = I->getDebugLoc();
+  unsigned Opc = I->getOpcode();
+  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+  // Only the destroy pseudo is read for a callee-pop amount (operand 1).
+  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  if (!TFI->hasReservedCallFrame(MF)) {
+    unsigned Align = getStackAlignment();
+
+    int64_t Amount = I->getOperand(0).getImm();
+    Amount = alignTo(Amount, Align);
+    if (!IsDestroy)
+      Amount = -Amount;
+
+    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+    // doesn't have to pop anything), then the first operand will be zero too so
+    // this adjustment is a no-op.
+    if (CalleePopAmount == 0) {
+      // FIXME: in-function stack adjustment for calls is limited to 24-bits
+      // because there's no guaranteed temporary register available.
+      //
+      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
+      // 1) For offset <= 12-bit, we use LSL #0
+      // 2) For 12-bit < offset <= 24-bit, we use two instructions. One uses
+      // LSL #0, and the other uses LSL #12.
+      //
+      // Most call frames will be allocated at the start of a function so
+      // this is OK, but it is a limitation that needs dealing with.
+      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
+      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
+    }
+  } else if (CalleePopAmount != 0) {
+    // If the calling convention demands that the callee pops arguments from the
+    // stack, we want to add it back if we have a reserved call frame.
+    assert(CalleePopAmount < 0xffffff && "call frame too large");
+    // Negate as a signed value. Negating the uint64_t directly would produce a
+    // huge unsigned number and rely on the implicit narrowing conversion to
+    // turn it back into the intended negative offset.
+    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
+                    -static_cast<int64_t>(CalleePopAmount), TII);
+  }
+  return MBB.erase(I);
+}
+
+/// Insert, before \p MBBI, one CFI_INSTRUCTION per callee-saved register
+/// recorded in the frame info. Each one describes the register's save-slot
+/// offset (object offset minus the offset of the local area) and is flagged
+/// FrameSetup. Nothing is emitted when no callee-saved info is recorded.
+void AArch64FrameLowering::emitCalleeSavedFrameMoves(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+  MachineFunction &Func = *MBB.getParent();
+  MachineFrameInfo &FrameInfo = Func.getFrameInfo();
+  const TargetSubtargetInfo &Subtarget = Func.getSubtarget();
+  const MCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const TargetInstrInfo *InstrInfo = Subtarget.getInstrInfo();
+  DebugLoc Loc = MBB.findDebugLoc(MBBI);
+
+  // Add callee saved registers to move list. An empty list simply results in
+  // zero iterations.
+  for (const CalleeSavedInfo &Saved : FrameInfo.getCalleeSavedInfo()) {
+    const unsigned SavedReg = Saved.getReg();
+    const int64_t SlotOffset =
+        FrameInfo.getObjectOffset(Saved.getFrameIdx()) - getOffsetOfLocalArea();
+    const unsigned DwarfRegNum = RegInfo->getDwarfRegNum(SavedReg, true);
+
+    const unsigned CFIIndex = Func.addFrameInst(
+        MCCFIInstruction::createOffset(nullptr, DwarfRegNum, SlotOffset));
+    BuildMI(MBB, MBBI, Loc, InstrInfo->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+}
+
+// Pick a scratch register usable at the start of the prologue in MBB for
+// re-aligning the stack pointer. Returns AArch64::NoRegister if none is free.
+//
+// Callee-save registers are never chosen: they may appear to be free when this
+// is called from canUseAsPrologue (during shrink wrapping), but then no longer
+// be free when this is called from emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+  MachineFunction *Func = MBB->getParent();
+
+  // The entry block always uses X9.
+  if (MBB == &Func->front())
+    return AArch64::X9;
+
+  // Start from the registers that are live into the block ...
+  const TargetRegisterInfo &TRI = *Func->getSubtarget().getRegisterInfo();
+  LivePhysRegs Unavailable(&TRI);
+  Unavailable.addLiveIns(*MBB);
+
+  // ... and treat every callee-saved register as taken as well. The list is
+  // terminated by a zero entry.
+  const AArch64RegisterInfo *AArch64RI =
+      Func->getSubtarget<AArch64Subtarget>().getRegisterInfo();
+  for (const MCPhysReg *CSR = AArch64RI->getCalleeSavedRegs(Func); *CSR; ++CSR)
+    Unavailable.addReg(*CSR);
+
+  const MachineRegisterInfo &MRI = Func->getRegInfo();
+
+  // Prefer X9 since it was historically used for the prologue scratch reg.
+  if (Unavailable.available(MRI, AArch64::X9))
+    return AArch64::X9;
+
+  // Otherwise take the first free 64-bit GPR in register-class order.
+  for (unsigned Candidate : AArch64::GPR64RegClass)
+    if (Unavailable.available(MRI, Candidate))
+      return Candidate;
+
+  return AArch64::NoRegister;
+}
+
+/// Returns true if the prologue may be placed in \p MBB. A block is rejected
+/// only when the stack must be realigned and no scratch register is available
+/// in that block.
+bool AArch64FrameLowering::canUseAsPrologue(
+    const MachineBasicBlock &MBB) const {
+  const MachineFunction *Func = MBB.getParent();
+  const AArch64RegisterInfo *AArch64RI =
+      Func->getSubtarget<AArch64Subtarget>().getRegisterInfo();
+
+  if (RegInfoNeedsScratch:
+      AArch64RI->needsStackRealignment(*Func)) {
+    // Realignment needs a scratch register, so the block qualifies only if
+    // one can be found. The helper takes a non-const block pointer.
+    MachineBasicBlock *MutableMBB = const_cast<MachineBasicBlock *>(&MBB);
+    return findScratchNonCalleeSaveRegister(MutableMBB) != AArch64::NoRegister;
+  }
+
+  // Don't need a scratch register if we're not going to re-align the stack.
+  return true;
+}
+
+/// Returns true if the callee-save area and the local area of \p MF can be
+/// allocated/deallocated with a single SP adjustment of \p StackBumpBytes.
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
+    MachineFunction &MF, unsigned StackBumpBytes) const {
+  // Without locals there is nothing to fold into the callee-save bump.
+  if (MF.getInfo<AArch64FunctionInfo>()->getLocalStackSize() == 0)
+    return false;
+
+  // 512 is the maximum immediate for stp/ldp that will be used for
+  // callee-save save/restores
+  if (StackBumpBytes >= 512)
+    return false;
+
+  // Variable-sized objects and stack realignment both rule combining out.
+  const AArch64RegisterInfo *AArch64RI =
+      MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+  const bool HasVLAsOrRealign = MF.getFrameInfo().hasVarSizedObjects() ||
+                                AArch64RI->needsStackRealignment(MF);
+  if (HasVLAsOrRealign)
+    return false;
+
+  // This isn't strictly necessary, but it simplifies things a bit since the
+  // current RedZone handling code assumes the SP is adjusted by the
+  // callee-save save/restore code.
+  return !canUseRedZone(MF);
+}
+
+// Convert callee-save register save/restore instruction to do stack pointer
+// decrement/increment to allocate/deallocate the callee-save stack area by
+// converting store/load to use pre/post increment version.
+//
+// MBBI must point at one of the STP/STR/LDP/LDR forms handled by the switch
+// below, with SP as base register and an immediate offset of 0 (both are
+// asserted). CSStackSizeInc is the SP adjustment in bytes and must be a
+// multiple of 8; emitPrologue passes the negated callee-save size and
+// emitEpilogue the positive one.
+//
+// The original instruction is erased. The returned iterator refers to the
+// newly built instruction (the one just before the erase result).
+static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
+
+ unsigned NewOpc;
+ // Set for the single-register forms, whose immediate is emitted in bytes;
+ // the pair forms get the immediate divided by 8 (see below).
+ bool NewIsUnscaled = false;
+ switch (MBBI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected callee-save save/restore opcode!");
+ case AArch64::STPXi:
+ NewOpc = AArch64::STPXpre;
+ break;
+ case AArch64::STPDi:
+ NewOpc = AArch64::STPDpre;
+ break;
+ case AArch64::STRXui:
+ NewOpc = AArch64::STRXpre;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::STRDui:
+ NewOpc = AArch64::STRDpre;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::LDPXi:
+ NewOpc = AArch64::LDPXpost;
+ break;
+ case AArch64::LDPDi:
+ NewOpc = AArch64::LDPDpost;
+ break;
+ case AArch64::LDRXui:
+ NewOpc = AArch64::LDRXpost;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::LDRDui:
+ NewOpc = AArch64::LDRDpost;
+ NewIsUnscaled = true;
+ break;
+ }
+
+ // The replacement is inserted in front of the original and additionally
+ // defines SP as its first operand.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+ MIB.addReg(AArch64::SP, RegState::Define);
+
+ // Copy all operands other than the immediate offset.
+ // OpndIdx is deliberately declared outside the loop: after the loop it
+ // indexes the last operand (the immediate) and is used by the asserts.
+ unsigned OpndIdx = 0;
+ for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
+ ++OpndIdx)
+ MIB.addOperand(MBBI->getOperand(OpndIdx));
+
+ assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
+ "Unexpected immediate offset in first/last callee-save save/restore "
+ "instruction!");
+ assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
+ "Unexpected base register in callee-save save/restore instruction!");
+ // Last operand is immediate offset that needs fixing.
+ assert(CSStackSizeInc % 8 == 0);
+ int64_t CSStackSizeIncImm = CSStackSizeInc;
+ if (!NewIsUnscaled)
+ CSStackSizeIncImm /= 8;
+ MIB.addImm(CSStackSizeIncImm);
+
+ // Carry over the flags and memory operands of the instruction being
+ // replaced.
+ MIB.setMIFlags(MBBI->getFlags());
+ MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end());
+
+ return std::prev(MBB.erase(MBBI));
+}
+
+// Adjust a callee-save save/restore instruction for a combined SP bump: the
+// local stack size (in bytes, a multiple of 8) is added, scaled down by 8, to
+// the instruction's immediate offset. MI must be one of the scaled-offset
+// STP/STR/LDP/LDR forms listed in the assertion and must use SP as its base.
+static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
+                                              unsigned LocalStackSize) {
+  const unsigned Opcode = MI.getOpcode();
+  (void)Opcode; // Only read by the assertion below.
+  assert((Opcode == AArch64::STPXi || Opcode == AArch64::STPDi ||
+          Opcode == AArch64::STRXui || Opcode == AArch64::STRDui ||
+          Opcode == AArch64::LDPXi || Opcode == AArch64::LDPDi ||
+          Opcode == AArch64::LDRXui || Opcode == AArch64::LDRDui) &&
+         "Unexpected callee-save save/restore opcode!");
+
+  // The immediate is the last explicit operand; the base register precedes it.
+  const unsigned ImmIdx = MI.getNumExplicitOperands() - 1;
+  assert(MI.getOperand(ImmIdx - 1).getReg() == AArch64::SP &&
+         "Unexpected base register in callee-save save/restore instruction!");
+
+  // All generated opcodes have scaled offsets.
+  assert(LocalStackSize % 8 == 0);
+  MachineOperand &ImmOpnd = MI.getOperand(ImmIdx);
+  const int64_t AdjustedImm = ImmOpnd.getImm() + LocalStackSize / 8;
+  ImmOpnd.setImm(AdjustedImm);
+}
+
+// Emit the prologue of MF into MBB. In order, this:
+// * returns immediately for the GHC calling convention;
+// * for functions without a stack frame, allocates the locals with a single
+// SP adjustment (or leaves them in the red zone) and returns;
+// * otherwise allocates the callee-save area, either folded into one SP bump
+// together with the locals or by turning the first callee-save store into
+// a pre-indexed store;
+// * sets up FP when hasFP(MF) holds, allocates the remaining locals,
+// realigns SP when required and sets up the base pointer when required;
+// * emits CFI instructions when debug info or an unwind table is needed.
+void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const Function *Fn = MF.getFunction();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
+ bool HasFP = hasFP(MF);
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
+ int NumBytes = (int)MFI.getStackSize();
+ if (!AFI->hasStackFrame()) {
+ assert(!HasFP && "unexpected function without stack frame but with FP");
+
+ // All of the stack allocation is for locals.
+ AFI->setLocalStackSize(NumBytes);
+
+ if (!NumBytes)
+ return;
+ // REDZONE: If the stack size is less than 128 bytes, we don't need
+ // to actually allocate.
+ if (canUseRedZone(MF))
+ ++NumRedZoneFunctions;
+ else {
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
+
+ // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+ // NOTE(review): unlike the CFI emission at the end of this function,
+ // this path is not guarded by needsFrameMoves - confirm that is intended.
+ MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+ // Encode the stack size of the leaf function.
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ return;
+ }
+
+ auto CSStackSize = AFI->getCalleeSavedStackSize();
+ // All of the remaining stack allocations are for locals.
+ AFI->setLocalStackSize(NumBytes - CSStackSize);
+
+ // Either allocate callee-saves and locals with one SP adjustment, or let the
+ // first callee-save store allocate the callee-save area via pre-indexing.
+ // NumBytes afterwards holds what is still left to allocate.
+ bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+ if (CombineSPBump) {
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
+ NumBytes = 0;
+ } else if (CSStackSize != 0) {
+ MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
+ -CSStackSize);
+ NumBytes -= CSStackSize;
+ }
+ assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+ // Move past the saves of the callee-saved registers, fixing up the offsets
+ // and pre-inc if we decided to combine the callee-save and local stack
+ // pointer bump above.
+ MachineBasicBlock::iterator End = MBB.end();
+ while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
+ if (CombineSPBump)
+ fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize());
+ ++MBBI;
+ }
+ if (HasFP) {
+ // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
+ // NOTE(review): FPOffset is handed to emitFrameOffset as a non-negated
+ // offset from SP, so this presumably yields "add fp, sp, #FPOffset" (as in
+ // the example prologue further below) rather than a sub - confirm.
+ int FPOffset = CSStackSize - 16;
+ if (CombineSPBump)
+ FPOffset += AFI->getLocalStackSize();
+
+ // Issue sub fp, sp, FPOffset or
+ // mov fp,sp when FPOffset is zero.
+ // Note: All stores of callee-saved registers are marked as "FrameSetup".
+ // This code marks the instruction(s) that set the FP also.
+ emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
+ MachineInstr::FrameSetup);
+ }
+
+ // Allocate space for the rest of the frame.
+ if (NumBytes) {
+ const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
+ // Destination of the SP adjustment: SP itself, or a scratch register when
+ // the result still has to be masked for realignment.
+ unsigned scratchSPReg = AArch64::SP;
+
+ if (NeedsRealignment) {
+ scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(scratchSPReg != AArch64::NoRegister);
+ }
+
+ // If we're a leaf function, try using the red zone.
+ if (!canUseRedZone(MF))
+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
+ // the correct value here, as NumBytes also includes padding bytes,
+ // which shouldn't be counted here.
+ emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
+
+ if (NeedsRealignment) {
+ const unsigned Alignment = MFI.getMaxAlignment();
+ const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+ assert(NrBitsToZero > 1);
+ assert(scratchSPReg != AArch64::SP);
+
+ // SUB X9, SP, NumBytes
+ // -- X9 is temporary register, so shouldn't contain any live data here,
+ // -- free to use. This is already produced by emitFrameOffset above.
+ // AND SP, X9, 0b11111...0000
+ // The logical immediates have a non-trivial encoding. The following
+ // formula computes the encoded immediate with all ones but
+ // NrBitsToZero zero bits as least significant bits.
+ uint32_t andMaskEncoded = (1 << 12) // = N
+ | ((64 - NrBitsToZero) << 6) // immr
+ | ((64 - NrBitsToZero - 1) << 0); // imms
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
+ .addReg(scratchSPReg, RegState::Kill)
+ .addImm(andMaskEncoded);
+ // Recorded so that emitEpilogue restores SP from FP.
+ AFI->setStackRealigned(true);
+ }
+ }
+
+ // If we need a base pointer, set it up here. It's whatever the value of the
+ // stack pointer is at this point. Any variable size objects will be allocated
+ // after this, so we can still use the base pointer to reference locals.
+ //
+ // FIXME: Clarify FrameSetup flags here.
+ // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
+ // needed.
+ if (RegInfo->hasBasePointer(MF)) {
+ TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
+ false);
+ }
+
+ if (needsFrameMoves) {
+ const DataLayout &TD = MF.getDataLayout();
+ // Negative pointer size; 2 * StackGrowth is used as the CFA offset from
+ // FP below.
+ const int StackGrowth = -TD.getPointerSize(0);
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ // An example of the prologue:
+ //
+ // .globl __foo
+ // .align 2
+ // __foo:
+ // Ltmp0:
+ // .cfi_startproc
+ // .cfi_personality 155, ___gxx_personality_v0
+ // Leh_func_begin:
+ // .cfi_lsda 16, Lexception33
+ //
+ // stp xa,bx, [sp, -#offset]!
+ // ...
+ // stp x28, x27, [sp, #offset-32]
+ // stp fp, lr, [sp, #offset-16]
+ // add fp, sp, #offset - 16
+ // sub sp, sp, #1360
+ //
+ // The Stack:
+ // +-------------------------------------------+
+ // 10000 | ........ | ........ | ........ | ........ |
+ // 10004 | ........ | ........ | ........ | ........ |
+ // +-------------------------------------------+
+ // 10008 | ........ | ........ | ........ | ........ |
+ // 1000c | ........ | ........ | ........ | ........ |
+ // +===========================================+
+ // 10010 | X28 Register |
+ // 10014 | X28 Register |
+ // +-------------------------------------------+
+ // 10018 | X27 Register |
+ // 1001c | X27 Register |
+ // +===========================================+
+ // 10020 | Frame Pointer |
+ // 10024 | Frame Pointer |
+ // +-------------------------------------------+
+ // 10028 | Link Register |
+ // 1002c | Link Register |
+ // +===========================================+
+ // 10030 | ........ | ........ | ........ | ........ |
+ // 10034 | ........ | ........ | ........ | ........ |
+ // +-------------------------------------------+
+ // 10038 | ........ | ........ | ........ | ........ |
+ // 1003c | ........ | ........ | ........ | ........ |
+ // +-------------------------------------------+
+ //
+ // [sp] = 10030 :: >>initial value<<
+ // sp = 10020 :: stp fp, lr, [sp, #-16]!
+ // fp = sp == 10020 :: mov fp, sp
+ // [sp] == 10020 :: stp x28, x27, [sp, #-16]!
+ // sp == 10010 :: >>final value<<
+ //
+ // The frame pointer (w29) points to address 10020. If we use an offset of
+ // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
+ // for w27, and -32 for w28:
+ //
+ // Ltmp1:
+ // .cfi_def_cfa w29, 16
+ // Ltmp2:
+ // .cfi_offset w30, -8
+ // Ltmp3:
+ // .cfi_offset w29, -16
+ // Ltmp4:
+ // .cfi_offset w27, -24
+ // Ltmp5:
+ // .cfi_offset w28, -32
+
+ if (HasFP) {
+ // Define the current CFA rule to use the provided FP.
+ unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ } else {
+ // Encode the stack size of the leaf function.
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+
+ // Now emit the moves for whatever callee saved regs we have (including FP,
+ // LR if those are saved).
+ emitCalleeSavedFrameMoves(MBB, MBBI);
+ }
+}
+
+// Emit the epilogue of MF into MBB: deallocate the local area and the
+// callee-save area (mirroring the choices made in emitPrologue) and pop any
+// argument space the function is responsible for. Does nothing for the GHC
+// calling convention.
+void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL;
+ bool IsTailCallReturn = false;
+ // Take the debug location from the block's last non-debug instruction and
+ // note whether that instruction is a tail-call return pseudo.
+ if (MBB.end() != MBBI) {
+ DL = MBBI->getDebugLoc();
+ unsigned RetOpcode = MBBI->getOpcode();
+ IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
+ RetOpcode == AArch64::TCRETURNri;
+ }
+ int NumBytes = MFI.getStackSize();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
+ // Initial and residual are named for consistency with the prologue. Note that
+ // in the epilogue, the residual adjustment is executed first.
+ uint64_t ArgumentPopSize = 0;
+ if (IsTailCallReturn) {
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+ // For a tail-call in a callee-pops-arguments environment, some or all of
+ // the stack may actually be in use for the call's arguments, this is
+ // calculated during LowerCall and consumed here...
+ ArgumentPopSize = StackAdjust.getImm();
+ } else {
+ // ... otherwise the amount to pop is *all* of the argument space,
+ // conveniently stored in the MachineFunctionInfo by
+ // LowerFormalArguments. This will, of course, be zero for the C calling
+ // convention.
+ ArgumentPopSize = AFI->getArgumentStackToRestore();
+ }
+
+ // The stack frame should be like below,
+ //
+ // ---------------------- ---
+ // | | |
+ // | BytesInStackArgArea| CalleeArgStackSize
+ // | (NumReusableBytes) | (of tail call)
+ // | | ---
+ // | | |
+ // ---------------------| --- |
+ // | | | |
+ // | CalleeSavedReg | | |
+ // | (CalleeSavedStackSize)| | |
+ // | | | |
+ // ---------------------| | NumBytes
+ // | | StackSize (StackAdjustUp)
+ // | LocalStackSize | | |
+ // | (covering callee | | |
+ // | args) | | |
+ // | | | |
+ // ---------------------- --- ---
+ //
+ // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
+ // = StackSize + ArgumentPopSize
+ //
+ // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
+ // it as the 2nd argument of AArch64ISD::TC_RETURN.
+
+ auto CSStackSize = AFI->getCalleeSavedStackSize();
+ bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+
+ // When the bumps are not combined, the instruction immediately before the
+ // first terminator is converted to a post-indexed load that deallocates
+ // the callee-save area.
+ if (!CombineSPBump && CSStackSize != 0)
+ convertCalleeSaveRestoreToSPPrePostIncDec(
+ MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize);
+
+ // Move past the restores of the callee-saved registers.
+ // The walk goes backwards from the first terminator; afterwards LastPopI
+ // points at the first FrameDestroy instruction of that run (or at
+ // MBB.begin() if the run reaches the start of the block).
+ MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator Begin = MBB.begin();
+ while (LastPopI != Begin) {
+ --LastPopI;
+ if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
+ ++LastPopI;
+ break;
+ } else if (CombineSPBump)
+ fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize());
+ }
+
+ // If there is a single SP update, insert it before the ret and we're done.
+ if (CombineSPBump) {
+ emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+ NumBytes + ArgumentPopSize, TII,
+ MachineInstr::FrameDestroy);
+ return;
+ }
+
+ // From here on NumBytes is the size of the local area only.
+ NumBytes -= CSStackSize;
+ assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+ if (!hasFP(MF)) {
+ bool RedZone = canUseRedZone(MF);
+ // If this was a redzone leaf function, we don't need to restore the
+ // stack pointer (but we may need to pop stack args for fastcc).
+ if (RedZone && ArgumentPopSize == 0)
+ return;
+
+ bool NoCalleeSaveRestore = CSStackSize == 0;
+ int StackRestoreBytes = RedZone ? 0 : NumBytes;
+ // Without callee-save restores in between, the argument pop can be folded
+ // into the same SP adjustment as the locals.
+ if (NoCalleeSaveRestore)
+ StackRestoreBytes += ArgumentPopSize;
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+ // If we were able to combine the local stack pop with the argument pop,
+ // then we're done.
+ if (NoCalleeSaveRestore || ArgumentPopSize == 0)
+ return;
+ NumBytes = 0;
+ }
+
+ // Restore the original stack pointer.
+ // FIXME: Rather than doing the math here, we should instead just use
+ // non-post-indexed loads for the restores if we aren't actually going to
+ // be able to save any instructions.
+ // With variable-sized objects or a realigned stack, SP is recomputed from
+ // FP instead of being adjusted by a constant.
+ if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
+ -CSStackSize + 16, TII, MachineInstr::FrameDestroy);
+ else if (NumBytes)
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
+ MachineInstr::FrameDestroy);
+
+ // This must be placed after the callee-save restore code because that code
+ // assumes the SP is at the same location as it was after the callee-save save
+ // code in the prologue.
+ if (ArgumentPopSize)
+ emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+ ArgumentPopSize, TII, MachineInstr::FrameDestroy);
+}
+
+/// Report where frame index \p FI lives as a base register plus offset, for
+/// use by debug info. For now this simply reuses the code-gen resolution done
+/// by resolveFrameIndexReference(), called with its default PreferFP setting.
+/// FIXME: This can go wrong when references are SP-relative and simple call
+/// frames aren't used.
+int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                 int FI,
+                                                 unsigned &FrameReg) const {
+  const int ByteOffset = resolveFrameIndexReference(MF, FI, FrameReg);
+  return ByteOffset;
+}
+
+/// Resolve frame index \p FI to a base register plus byte offset.
+///
+/// On return \p FrameReg holds the chosen base: the frame register, the base
+/// pointer register, or SP. The returned value is the offset from that base.
+/// \p PreferFP biases the choice toward the frame pointer for non-fixed
+/// objects in functions that have a frame pointer, no base pointer and no
+/// stack realignment.
+int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
+ int FI, unsigned &FrameReg,
+ bool PreferFP) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ // Two candidate offsets for the same object: FPOffset is returned when the
+ // frame pointer is chosen, Offset when SP or the base pointer is chosen.
+ // NOTE(review): the +16 presumably accounts for the saved FP/LR frame
+ // record -- confirm against emitPrologue.
+ int FPOffset = MFI.getObjectOffset(FI) + 16;
+ int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
+ bool isFixed = MFI.isFixedObjectIndex(FI);
+
+ // Use frame pointer to reference fixed objects. Use it for locals if
+ // there are VLAs or a dynamically realigned SP (and thus the SP isn't
+ // reliable as a base). Make sure useFPForScavengingIndex() does the
+ // right thing for the emergency spill slot.
+ bool UseFP = false;
+ if (AFI->hasStackFrame()) {
+ // Note: Keeping the following as multiple 'if' statements rather than
+ // merging to a single expression for readability.
+ //
+ // Argument access should always use the FP.
+ if (isFixed) {
+ UseFP = hasFP(MF);
+ } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF) &&
+ !RegInfo->needsStackRealignment(MF)) {
+ // Use SP or FP, whichever gives us the best chance of the offset
+ // being in range for direct access. If the FPOffset is positive,
+ // that'll always be best, as the SP will be even further away.
+ // If the FPOffset is negative, we have to keep in mind that the
+ // available offset range for negative offsets is smaller than for
+ // positive ones. If we have variable sized objects, we're stuck with
+ // using the FP regardless, though, as the SP offset is unknown
+ // and we don't have a base pointer available. If an offset is
+ // available via the FP and the SP, use whichever is closest.
+ if (PreferFP || MFI.hasVarSizedObjects() || FPOffset >= 0 ||
+ (FPOffset >= -256 && Offset > -FPOffset))
+ UseFP = true;
+ }
+ }
+
+ // Sanity check: with a realigned stack only fixed objects may go via FP.
+ assert((isFixed || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
+ "In the presence of dynamic stack pointer realignment, "
+ "non-argument objects cannot be accessed through the frame pointer");
+
+ if (UseFP) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FPOffset;
+ }
+
+ // Use the base pointer if we have one.
+ if (RegInfo->hasBasePointer(MF))
+ FrameReg = RegInfo->getBaseRegister();
+ else {
+ FrameReg = AArch64::SP;
+ // If we're using the red zone for this function, the SP won't actually
+ // be adjusted, so the offsets will be negative. They're also all
+ // within range of the signed 9-bit immediate instructions.
+ if (canUseRedZone(MF))
+ Offset -= AFI->getLocalStackSize();
+ }
+
+ return Offset;
+}
+
+/// Return the kill-state flag to attach to \p Reg when it is stored in the
+/// prologue.
+static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
+  // A register that is also a function live-in must not get a kill flag; this
+  // happens with the @llvm.returnaddress intrinsic and with arguments passed
+  // in callee-saved registers. Leaving the flag off is conservatively correct
+  // even when the live-in value turns out to be unused.
+  const bool MayKill = !MF.getRegInfo().isLiveIn(Reg);
+  return getKillRegState(MayKill);
+}
+
+/// Return true when \p MF targets MachO, unless the target lowering supports
+/// swifterror and the function carries a SwiftError attribute somewhere.
+static bool produceCompactUnwindFrame(MachineFunction &MF) {
+  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
+  if (!ST.isTargetMachO())
+    return false;
+  if (!ST.getTargetLowering()->supportSwiftError())
+    return true;
+  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+  return !FnAttrs.hasAttrSomewhere(Attribute::SwiftError);
+}
+
+namespace {
+/// Describes one callee-save spill/restore unit: either a single register or
+/// two registers handled by one paired store/load.
+struct RegPairInfo {
+  // Initialize every field so a default-constructed entry never carries
+  // indeterminate values, even if a caller forgets to fill one in.
+  RegPairInfo()
+      : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister), FrameIdx(0),
+        Offset(0), IsGPR(false) {}
+  unsigned Reg1; // First (or only) register of the unit.
+  unsigned Reg2; // Second register; NoRegister when the unit is unpaired.
+  int FrameIdx;  // Frame index of Reg1's spill slot.
+  int Offset;    // SP-relative offset, in 8-byte units.
+  bool IsGPR;    // True when Reg1 is in GPR64; otherwise it is in FPR64.
+  /// True when this unit holds two registers.
+  bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+};
+} // end anonymous namespace
+
+/// Group the callee-saved registers in \p CSI into RegPairInfo entries and
+/// append them to \p RegPairs.
+///
+/// Adjacent list entries of the same class (GPR64 or FPR64) are paired. Each
+/// entry's Offset is expressed in 8-byte units and counts down from the
+/// callee-save area size recorded in AArch64FunctionInfo. When an unpaired
+/// register has to be padded to 16 bytes, its frame object alignment is raised
+/// to 16 and the function is flagged as having free callee-save stack space.
+/// Does nothing when \p CSI is empty. \p TRI is not used here.
+static void computeCalleeSaveRegisterPairs(
+ MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) {
+
+ if (CSI.empty())
+ return;
+
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
+ unsigned Count = CSI.size();
+ // CC is only read inside asserts; silence unused-variable warnings in
+ // builds where asserts compile away.
+ (void)CC;
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs.
+ assert((!produceCompactUnwindFrame(MF) ||
+ CC == CallingConv::PreserveMost ||
+ (Count & 1) == 0) &&
+ "Odd number of callee-saved regs to spill!");
+ // Running byte offset; starts at the top of the callee-save area and is
+ // decremented as each entry is assigned its slot.
+ unsigned Offset = AFI->getCalleeSavedStackSize();
+
+ for (unsigned i = 0; i < Count; ++i) {
+ RegPairInfo RPI;
+ RPI.Reg1 = CSI[i].getReg();
+
+ assert(AArch64::GPR64RegClass.contains(RPI.Reg1) ||
+ AArch64::FPR64RegClass.contains(RPI.Reg1));
+ RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1);
+
+ // Add the next reg to the pair if it is in the same register class.
+ if (i + 1 < Count) {
+ unsigned NextReg = CSI[i + 1].getReg();
+ if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) ||
+ (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg)))
+ RPI.Reg2 = NextReg;
+ }
+
+ // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+ // list to come in sorted by frame index so that we can issue the store
+ // pair instructions directly. Assert if we see anything otherwise.
+ //
+ // The order of the registers in the list is controlled by
+ // getCalleeSavedRegs(), so they will always be in-order, as well.
+ assert((!RPI.isPaired() ||
+ (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
+ "Out of order callee saved regs!");
+
+ // MachO's compact unwind format relies on all registers being stored in
+ // adjacent register pairs.
+ assert((!produceCompactUnwindFrame(MF) ||
+ CC == CallingConv::PreserveMost ||
+ (RPI.isPaired() &&
+ ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
+ RPI.Reg1 + 1 == RPI.Reg2))) &&
+ "Callee-save registers not saved as adjacent register pair!");
+
+ RPI.FrameIdx = CSI[i].getFrameIdx();
+
+ // Count * 8 differs from the recorded area size exactly when the area was
+ // rounded up, i.e. when there is padding to hand to an unpaired register.
+ if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) {
+ // Round up size of non-pair to pair size if we need to pad the
+ // callee-save area to ensure 16-byte alignment.
+ Offset -= 16;
+ assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
+ MFI.setObjectAlignment(RPI.FrameIdx, 16);
+ AFI->setCalleeSaveStackHasFreeSpace(true);
+ } else
+ Offset -= RPI.isPaired() ? 16 : 8;
+ assert(Offset % 8 == 0);
+ // Store the offset scaled down to 8-byte units.
+ RPI.Offset = Offset / 8;
+ assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
+ "Offset out of bounds for LDP/STP immediate");
+
+ RegPairs.push_back(RPI);
+ // A pair consumed two CSI entries; skip the partner.
+ if (RPI.isPaired())
+ ++i;
+ }
+}
+
+/// Emit stores of the callee-saved registers in \p CSI before \p MI in
+/// \p MBB.
+///
+/// The registers are grouped by computeCalleeSaveRegisterPairs() and the
+/// groups are stored in reverse order, each as an SP-relative STP/STR flagged
+/// FrameSetup. Every spilled register is added to the block's live-ins.
+/// Always returns true.
+bool AArch64FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ SmallVector<RegPairInfo, 8> RegPairs;
+
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+
+ // Walk the groups back to front.
+ for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
+ ++RPII) {
+ RegPairInfo RPI = *RPII;
+ unsigned Reg1 = RPI.Reg1;
+ unsigned Reg2 = RPI.Reg2;
+ unsigned StrOpc;
+
+ // Issue sequence of spills for cs regs. The first spill may be converted
+ // to a pre-decrement store later by emitPrologue if the callee-save stack
+ // area allocation can't be combined with the local stack area allocation.
+ // For example:
+ // stp x22, x21, [sp, #0] // addImm(+0)
+ // stp x20, x19, [sp, #16] // addImm(+2)
+ // stp fp, lr, [sp, #32] // addImm(+4)
+ // Rationale: This sequence saves uop updates compared to a sequence of
+ // pre-decrement spills like stp xi,xj,[sp,#-16]!
+ // Note: Similar rationale and sequence for restores in epilog.
+ if (RPI.IsGPR)
+ StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
+ else
+ StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
+ DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1);
+ if (RPI.isPaired())
+ dbgs() << ", " << TRI->getName(Reg2);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+ dbgs() << ", " << RPI.FrameIdx+1;
+ dbgs() << ")\n");
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+ MBB.addLiveIn(Reg1);
+ // For a pair, Reg2 is added as the first register operand, ahead of Reg1.
+ if (RPI.isPaired()) {
+ MBB.addLiveIn(Reg2);
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+ MachineMemOperand::MOStore, 8, 8));
+ }
+ MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
+ .addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit
+ .setMIFlag(MachineInstr::FrameSetup);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+ MachineMemOperand::MOStore, 8, 8));
+ }
+ return true;
+}
+
+/// Emit reloads of the callee-saved registers in \p CSI before \p MI in
+/// \p MBB.
+///
+/// The registers are grouped by computeCalleeSaveRegisterPairs() and the
+/// groups are walked front to back, each producing an SP-relative LDP/LDR
+/// flagged FrameDestroy. The debug location is taken from \p MI when it is not
+/// the end of the block. Always returns true.
+bool AArch64FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ SmallVector<RegPairInfo, 8> RegPairs;
+
+ if (MI != MBB.end())
+ DL = MI->getDebugLoc();
+
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+
+ for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
+ ++RPII) {
+ RegPairInfo RPI = *RPII;
+ unsigned Reg1 = RPI.Reg1;
+ unsigned Reg2 = RPI.Reg2;
+
+ // Issue sequence of restores for cs regs. The last restore may be converted
+ // to a post-increment load later by emitEpilogue if the callee-save stack
+ // area allocation can't be combined with the local stack area allocation.
+ // For example:
+ // ldp fp, lr, [sp, #32] // addImm(+4)
+ // ldp x20, x19, [sp, #16] // addImm(+2)
+ // ldp x22, x21, [sp, #0] // addImm(+0)
+ // Note: see comment in spillCalleeSavedRegisters()
+ unsigned LdrOpc;
+ if (RPI.IsGPR)
+ LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
+ else
+ LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
+ DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1);
+ if (RPI.isPaired())
+ dbgs() << ", " << TRI->getName(Reg2);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+ dbgs() << ", " << RPI.FrameIdx+1;
+ dbgs() << ")\n");
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+ // For a pair, Reg2 is added as the first defined register, ahead of Reg1.
+ if (RPI.isPaired()) {
+ MIB.addReg(Reg2, getDefRegState(true));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+ MachineMemOperand::MOLoad, 8, 8));
+ }
+ MIB.addReg(Reg1, getDefRegState(true))
+ .addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit
+ .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+ MachineMemOperand::MOLoad, 8, 8));
+ }
+ return true;
+}
+
+/// Decide which callee-saved registers must be spilled and record the
+/// resulting callee-save area size in AArch64FunctionInfo.
+///
+/// Starts from the generic TargetFrameLowering result in \p SavedRegs, then
+/// adds FP and LR when the function has a frame pointer, adds the base
+/// pointer register if it is callee-saved, and, when a compact unwind frame
+/// is produced, adds the list partner of every saved register. If the
+/// estimated frame is 256 bytes or larger, one more unused GPR is spilled as a
+/// scratch register, or failing that an emergency spill slot is registered
+/// with \p RS. Returns immediately, changing nothing, for the GHC calling
+/// convention.
+void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+ const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ // Last unused, non-reserved callee-saved GPR seen in the scan below, plus
+ // its list partner; candidates for the extra scratch-register spill.
+ unsigned UnspilledCSGPR = AArch64::NoRegister;
+ unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
+
+ // The frame record needs to be created by saving the appropriate registers
+ if (hasFP(MF)) {
+ SavedRegs.set(AArch64::FP);
+ SavedRegs.set(AArch64::LR);
+ }
+
+ unsigned BasePointerReg = AArch64::NoRegister;
+ if (RegInfo->hasBasePointer(MF))
+ BasePointerReg = RegInfo->getBaseRegister();
+
+ bool ExtraCSSpill = false;
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+ // Figure out which callee-saved registers to save/restore.
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ const unsigned Reg = CSRegs[i];
+
+ // Add the base pointer register to SavedRegs if it is callee-save.
+ if (Reg == BasePointerReg)
+ SavedRegs.set(Reg);
+
+ bool RegUsed = SavedRegs.test(Reg);
+ // i ^ 1 flips the low bit of the index, so list entries 2k and 2k+1 are
+ // treated as partners of each other.
+ unsigned PairedReg = CSRegs[i ^ 1];
+ if (!RegUsed) {
+ if (AArch64::GPR64RegClass.contains(Reg) &&
+ !RegInfo->isReservedReg(MF, Reg)) {
+ UnspilledCSGPR = Reg;
+ UnspilledCSGPRPaired = PairedReg;
+ }
+ continue;
+ }
+
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs.
+ // FIXME: the usual format is actually better if unwinding isn't needed.
+ if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) {
+ SavedRegs.set(PairedReg);
+ if (AArch64::GPR64RegClass.contains(PairedReg) &&
+ !RegInfo->isReservedReg(MF, PairedReg))
+ ExtraCSSpill = true;
+ }
+ }
+
+ DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
+ for (int Reg = SavedRegs.find_first(); Reg != -1;
+ Reg = SavedRegs.find_next(Reg))
+ dbgs() << ' ' << PrintReg(Reg, RegInfo);
+ dbgs() << "\n";);
+
+ // If any callee-saved registers are used, the frame cannot be eliminated.
+ unsigned NumRegsSpilled = SavedRegs.count();
+ bool CanEliminateFrame = NumRegsSpilled == 0;
+
+ // FIXME: Set BigStack if any stack slot references may be out of range.
+ // For now, just conservatively guesstimate based on unscaled indexing
+ // range. We'll end up allocating an unnecessary spill slot a lot, but
+ // realistically that's not a big deal at this stage of the game.
+ // The CSR spill slots have not been allocated yet, so estimateStackSize
+ // won't include them.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
+ DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
+ bool BigStack = (CFSize >= 256);
+ if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
+ AFI->setHasStackFrame(true);
+
+ // Estimate if we might need to scavenge a register at some point in order
+ // to materialize a stack offset. If so, either spill one additional
+ // callee-saved register or reserve a special spill slot to facilitate
+ // register scavenging. If we already spilled an extra callee-saved register
+ // above to keep the number of spills even, we don't need to do anything else
+ // here.
+ if (BigStack && !ExtraCSSpill) {
+ if (UnspilledCSGPR != AArch64::NoRegister) {
+ DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo)
+ << " to get a scratch register.\n");
+ SavedRegs.set(UnspilledCSGPR);
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs, so if we need to spill one extra for BigStack, then we need to
+ // store the pair.
+ if (produceCompactUnwindFrame(MF))
+ SavedRegs.set(UnspilledCSGPRPaired);
+ ExtraCSSpill = true;
+ // Recount: one or two registers were just added.
+ NumRegsSpilled = SavedRegs.count();
+ }
+
+ // If we didn't find an extra callee-saved register to spill, create
+ // an emergency spill slot.
+ if (!ExtraCSSpill) {
+ const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
+ int FI = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+ RS->addScavengingFrameIndex(FI);
+ DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
+ << " as the emergency spill slot.\n");
+ }
+ }
+
+ // Round up to register pair alignment to avoid additional SP adjustment
+ // instructions.
+ AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
+}
+
+/// Stack slot scavenging is enabled exactly when the function's
+/// AArch64FunctionInfo reports free space in the callee-save area.
+bool AArch64FrameLowering::enableStackSlotScavenging(
+    const MachineFunction &MF) const {
+  return MF.getInfo<AArch64FunctionInfo>()->hasCalleeSaveStackFreeSpace();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
new file mode 100644
index 000000000000..f254ea9b70aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -0,0 +1,79 @@
+//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements the AArch64-specific parts of TargetFrameLowering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// AArch64 implementation of the TargetFrameLowering interface: prologue and
+/// epilogue emission, callee-saved register spilling/restoring and frame
+/// index resolution.
+class AArch64FrameLowering : public TargetFrameLowering {
+public:
+ // The stack grows down. NOTE(review): the numeric arguments are presumably
+ // stack alignment (16), local area offset (0) and transient stack alignment
+ // (16) -- confirm against the TargetFrameLowering constructor.
+ explicit AArch64FrameLowering()
+ : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
+ true /*StackRealignable*/) {}
+
+ void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+ /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ /// the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+
+ /// Resolve \p FI to a base register (returned in \p FrameReg) plus offset.
+ /// Forwards to resolveFrameIndexReference() with the default PreferFP.
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+ /// Same resolution as getFrameIndexReference(), with \p PreferFP letting the
+ /// caller bias the base-register choice toward the frame pointer.
+ int resolveFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg,
+ bool PreferFP = false) const;
+ /// Emit the stores that spill the callee-saved registers in \p CSI before
+ /// \p MI.
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// Emit the loads that restore the callee-saved registers in \p CSI before
+ /// \p MI.
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// \brief Can this function use the red zone for local allocations.
+ bool canUseRedZone(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+ /// Compute the set of callee-saved registers to spill into \p SavedRegs.
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
+
+ /// Returns true if the target will correctly handle shrink wrapping.
+ /// This implementation returns true unconditionally.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ bool enableStackSlotScavenging(const MachineFunction &MF) const override;
+
+private:
+ // NOTE(review): judging by the name, decides whether the callee-save and
+ // local stack adjustments can be folded into one SP update -- the definition
+ // is not visible here; confirm in the .cpp file.
+ bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
+ unsigned StackBumpBytes) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
new file mode 100644
index 000000000000..e927d58ad612
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -0,0 +1,173 @@
+//===- AArch64GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines all the static objects used by AArch64RegisterBankInfo.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+namespace llvm {
+namespace AArch64 {
+
+// The register bank objects, one each for the GPR, FPR and CCR banks.
+RegisterBank GPRRegBank;
+RegisterBank FPRRegBank;
+RegisterBank CCRRegBank;
+
+// Pointers to the banks above, in GPR, FPR, CCR order.
+RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank};
+
+// PartialMappings.
+// Symbolic indices for the entries of PartMappings. The values start at 1
+// (PMI_Min == PMI_GPR32 == 1), so the array slot for PMI_X is PMI_X - PMI_Min.
+// PMI_None (-1) stands for "no mapping".
+enum PartialMappingIdx {
+ PMI_None = -1,
+ PMI_GPR32 = 1,
+ PMI_GPR64,
+ PMI_FPR32,
+ PMI_FPR64,
+ PMI_FPR128,
+ PMI_FPR256,
+ PMI_FPR512,
+ // Range markers for each bank's run of entries.
+ PMI_FirstGPR = PMI_GPR32,
+ PMI_LastGPR = PMI_GPR64,
+ PMI_FirstFPR = PMI_FPR32,
+ PMI_LastFPR = PMI_FPR512,
+ // Smallest valid index; subtracted to obtain a zero-based array slot.
+ PMI_Min = PMI_FirstGPR,
+};
+
+/// Map a bit width \p Size to its position within a register bank's run of
+/// mappings: 32 (or anything smaller) gives 0, 64 gives 1, 128 gives 2, and
+/// so on. \p Size must be non-zero.
+static unsigned getRegBankBaseIdxOffset(unsigned Size) {
+  assert(Size && "0-sized type!!");
+  // Round the width up to a whole number of 32-bit words, so that anything
+  // smaller than 32 is treated as 32.
+  const unsigned NumWords = (Size + 31) / 32;
+  const unsigned RoundedBits = NumWords * 32;
+  // Subtract Log2_32(32) == 5 so that a 32-bit width maps to 0.
+  return Log2_32(RoundedBits) - 5;
+}
+
+// Table of partial mappings, one per PartialMappingIdx value in enum order.
+// The entry for PMI_X lives at slot PMI_X - PMI_Min; the number in each
+// comment below is that zero-based slot.
+RegisterBankInfo::PartialMapping PartMappings[] {
+ /* StartIdx, Length, RegBank */
+ // 0: GPR 32-bit value.
+ {0, 32, GPRRegBank},
+ // 1: GPR 64-bit value.
+ {0, 64, GPRRegBank},
+ // 2: FPR 32-bit value.
+ {0, 32, FPRRegBank},
+ // 3: FPR 64-bit value.
+ {0, 64, FPRRegBank},
+ // 4: FPR 128-bit value.
+ {0, 128, FPRRegBank},
+ // 5: FPR 256-bit value.
+ {0, 256, FPRRegBank},
+ // 6: FPR 512-bit value.
+ {0, 512, FPRRegBank}
+};
+
+// Landmarks inside ValMappings. These constants must be kept in sync with the
+// layout of that table.
+enum ValueMappingIdx {
+ // Slot of the first entry of the first 3-operand group.
+ First3OpsIdx = 0,
+ // Slot of the first entry of the last 3-operand group.
+ Last3OpsIdx = 18,
+ // Number of entries in each 3-operand group.
+ DistanceBetweenRegBanks = 3,
+ // Slot of the first entry of the first cross-bank copy group.
+ FirstCrossRegCpyIdx = 21,
+ // Slot of the first entry of the last cross-bank copy group.
+ LastCrossRegCpyIdx = 27,
+ // Number of entries in each cross-bank copy group.
+ DistanceBetweenCrossRegCpy = 2
+};
+
+// ValueMappings.
+// Slots 0-20 hold seven groups of three identical entries, one group per
+// PartMappings entry. Slots 21-28 hold four groups of two entries for copies
+// that cross between the GPR and FPR banks.
+// NOTE(review): getCopyMapping() picks the groups at slots 21/23 when the
+// destination is a GPR and 25/27 when it is an FPR; confirm that the
+// "X to Y" wording on the cross-bank groups below matches the operand order.
+RegisterBankInfo::ValueMapping ValMappings[]{
+ /* BreakDown, NumBreakDowns */
+ // 3-operands instructions (all binary operations should end up with one of
+ // those mapping).
+ // 0: GPR 32-bit value. <-- This must match First3OpsIdx.
+ {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+ // 3: GPR 64-bit value.
+ {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+ // 6: FPR 32-bit value.
+ {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+ // 9: FPR 64-bit value.
+ {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+ // 12: FPR 128-bit value.
+ {&PartMappings[PMI_FPR128 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR128 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR128 - PMI_Min], 1},
+ // 15: FPR 256-bit value.
+ {&PartMappings[PMI_FPR256 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR256 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR256 - PMI_Min], 1},
+ // 18: FPR 512-bit value. <-- This must match Last3OpsIdx.
+ {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+ // Cross register bank copies.
+ // 21: GPR 32-bit value to FPR 32-bit value. <-- This must match
+ // FirstCrossRegCpyIdx.
+ {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+ // 23: GPR 64-bit value to FPR 64-bit value.
+ {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+ // 25: FPR 32-bit value to GPR 32-bit value.
+ {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+ // 27: FPR 64-bit value to GPR 64-bit value. <-- This must match
+ // LastCrossRegCpyIdx.
+ {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&PartMappings[PMI_GPR64 - PMI_Min], 1}
+};
+
+/// Look up the ValueMapping for a value of \p Size bits living in the
+/// register bank whose first partial-mapping index is \p RBIdx.
+///
+/// The result points at the first of three identical consecutive entries, so
+/// it serves instructions with up to 3 operands of the same kind.
+///
+/// \pre \p RBIdx != PartialMappingIdx::PMI_None
+const RegisterBankInfo::ValueMapping *
+getValueMapping(PartialMappingIdx RBIdx, unsigned Size) {
+  assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
+  // Zero-based position of the (bank, size) combination among the groups.
+  const unsigned GroupNo = RBIdx - AArch64::PartialMappingIdx::PMI_Min +
+                           getRegBankBaseIdxOffset(Size);
+  // Each group occupies DistanceBetweenRegBanks consecutive slots.
+  const unsigned ValMappingIdx =
+      First3OpsIdx + GroupNo * ValueMappingIdx::DistanceBetweenRegBanks;
+  assert(ValMappingIdx >= AArch64::First3OpsIdx &&
+         ValMappingIdx <= AArch64::Last3OpsIdx && "Mapping out of bound");
+
+  return &ValMappings[ValMappingIdx];
+}
+
+/// Look up the ValueMapping for the operands of a \p Size-bit copy between
+/// GPR and/or FPR registers.
+///
+/// \p DstIsGPR says whether the destination is on GPR (otherwise FPR), and
+/// \p SrcIsGPR says the same for the source. A copy that stays within one
+/// bank is answered by getValueMapping(); a cross-bank copy selects one of
+/// the two-entry groups that start at FirstCrossRegCpyIdx.
+const RegisterBankInfo::ValueMapping *
+getCopyMapping(bool DstIsGPR, bool SrcIsGPR, unsigned Size) {
+  const PartialMappingIdx DstRBIdx = DstIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
+
+  // Source and destination share a bank: the ordinary mapping applies.
+  if (DstIsGPR == SrcIsGPR)
+    return getValueMapping(DstRBIdx, Size);
+
+  assert(Size <= 64 && "GPR cannot handle that size");
+  // Zero-based position of the (destination bank, size) combination.
+  const unsigned GroupNo = DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(Size);
+  const unsigned ValMappingIdx =
+      FirstCrossRegCpyIdx +
+      GroupNo * ValueMappingIdx::DistanceBetweenCrossRegCpy;
+  assert(ValMappingIdx >= AArch64::FirstCrossRegCpyIdx &&
+         ValMappingIdx <= AArch64::LastCrossRegCpyIdx &&
+         "Mapping out of bound");
+  return &ValMappings[ValMappingIdx];
+}
+
+} // End AArch64 namespace.
+} // End llvm namespace.
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..3099383e5b32
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -0,0 +1,3966 @@
+//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the AArch64 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h" // To access function attributes.
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-isel"
+
+//===--------------------------------------------------------------------===//
+/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+
+/// SelectionDAG-based instruction selector for AArch64. Besides the generic
+/// Select() entry point it declares the operand/addressing-mode matchers that
+/// are defined later in this file.
+class AArch64DAGToDAGISel : public SelectionDAGISel {
+
+ /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const AArch64Subtarget *Subtarget;
+
+ /// Cached result of Function::optForSize() for the function currently being
+ /// selected; consulted by isWorthFolding().
+ bool ForCodeSize;
+
+public:
+ explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
+ CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
+ ForCodeSize(false) {}
+
+ StringRef getPassName() const override {
+ return "AArch64 Instruction Selection";
+ }
+
+ /// Refresh the per-function state (ForCodeSize, Subtarget) and then hand
+ /// over to the generic SelectionDAGISel implementation.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ ForCodeSize = MF.getFunction()->optForSize();
+ Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
+ void Select(SDNode *Node) override;
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+ bool tryMLAV64LaneV128(SDNode *N);
+ bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
+ bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
+ bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+ bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+ // Shifted-register matchers: the arithmetic form passes AllowROR = false,
+ // the logical form passes AllowROR = true.
+ bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+ return SelectShiftedRegister(N, false, Reg, Shift);
+ }
+ bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+ return SelectShiftedRegister(N, true, Reg, Shift);
+ }
+ // Fixed-width wrappers around the generic address-mode matchers. The numeric
+ // suffix is the access width in bits; the generic matcher receives the
+ // corresponding size in bytes (8 -> 1, 16 -> 2, ..., 128 -> 16).
+ bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 16, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 16, Base, OffImm);
+ }
+
+ // Register-offset matchers parameterized on the access width in bits; the
+ // width is converted to bytes before calling the non-template overload.
+ template<int Width>
+ bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &SignExtend, SDValue &DoShift) {
+ return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+ }
+
+ template<int Width>
+ bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &SignExtend, SDValue &DoShift) {
+ return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+ }
+
+
+ /// Form sequences of consecutive 64/128-bit registers for use in NEON
+ /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
+ /// between 1 and 4 elements. If it contains a single element that is returned
+ /// unchanged; otherwise a REG_SEQUENCE value is returned.
+ SDValue createDTuple(ArrayRef<SDValue> Vecs);
+ SDValue createQTuple(ArrayRef<SDValue> Vecs);
+
+ /// Generic helper for the createDTuple/createQTuple
+ /// functions. Those should almost always be called instead.
+ SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
+ const unsigned SubRegs[]);
+
+ void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
+
+ bool tryIndexedLoad(SDNode *N);
+
+ void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ unsigned SubRegIdx);
+ void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ unsigned SubRegIdx);
+ void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+ void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+ bool tryBitfieldExtractOp(SDNode *N);
+ bool tryBitfieldExtractOpFromSExt(SDNode *N);
+ bool tryBitfieldInsertOp(SDNode *N);
+ bool tryBitfieldInsertInZeroOp(SDNode *N);
+
+ bool tryReadRegister(SDNode *N);
+ bool tryWriteRegister(SDNode *N);
+
+// Include the pieces autogenerated from the target description.
+#include "AArch64GenDAGISel.inc"
+
+private:
+ // Generic matchers behind the fixed-width public wrappers above. Where a
+ // Size parameter is present it is the access size in bytes.
+ bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
+ SDValue &Shift);
+ bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &Offset, SDValue &SignExtend,
+ SDValue &DoShift);
+ bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &Offset, SDValue &SignExtend,
+ SDValue &DoShift);
+ bool isWorthFolding(SDValue V) const;
+ bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
+ SDValue &Offset, SDValue &SignExtend);
+
+ // Forwards the compile-time register width to the runtime overload below.
+ template<unsigned RegWidth>
+ bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
+ return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
+ }
+
+ bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
+
+ void SelectCMP_SWAP(SDNode *N);
+
+};
+} // end anonymous namespace
+
+/// isIntImmediate - Test whether \p N is a constant integer node. On success
+/// \p Imm receives the constant's zero-extended value and true is returned;
+/// otherwise \p Imm is left untouched and false is returned.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+  const ConstantSDNode *Const = dyn_cast<const ConstantSDNode>(N);
+  if (!Const)
+    return false;
+
+  Imm = Const->getZExtValue();
+  return true;
+}
+
+// isIntImmediate - SDValue convenience overload: tests whether the node behind
+// \p N is a constant operand and, if so, stores its value in \p Imm.
+static bool isIntImmediate(SDValue N, uint64_t &Imm) {
+  const SDNode *Node = N.getNode();
+  return isIntImmediate(Node, Imm);
+}
+
+// isOpcWithIntImmediate - Test whether \p N has opcode \p Opc and a constant
+// integer as its right-hand (second) operand. If so, \p Imm receives the
+// zero-extended value of that constant.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+                                  uint64_t &Imm) {
+  if (N->getOpcode() != Opc)
+    return false;
+  return isIntImmediate(N->getOperand(1).getNode(), Imm);
+}
+
+/// Select the operands for an inline-asm memory constraint. For the handled
+/// constraints (i, m, Q) the address \p Op is appended to \p OutOps unchanged
+/// and false is returned; any other constraint ID is treated as unreachable.
+// NOTE(review): false presumably signals success to the caller, as in the
+// SelectionDAGISel hook this overrides - confirm against the base class.
+bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+ switch(ConstraintID) {
+ default:
+ llvm_unreachable("Unexpected asm memory constraint");
+ case InlineAsm::Constraint_i:
+ case InlineAsm::Constraint_m:
+ case InlineAsm::Constraint_Q:
+ // Require the address to be in a register. That is safe for all AArch64
+ // variants and it is hard to do anything much smarter without knowing
+ // how the operand is used.
+ OutOps.push_back(Op);
+ return false;
+ }
+ // Not reached: every path through the switch above returns or is marked
+ // unreachable.
+ return true;
+}
+
+/// SelectArithImmed - Match a constant that is encodable as a 12-bit value,
+/// optionally shifted left by 12. On success returns true with \p Val holding
+/// the 12-bit payload and \p Shift holding the encoded "LSL #0/#12" shifter.
+bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
+                                           SDValue &Shift) {
+  // The addsub_shifted_imm ComplexPattern lists [imm] as its root opcodes,
+  // but that list is only honoured for root-level matching, so the operand
+  // still has to be checked for being a constant here.
+  const auto *Const = dyn_cast<ConstantSDNode>(N.getNode());
+  if (!Const)
+    return false;
+
+  uint64_t Immed = Const->getZExtValue();
+  unsigned ShiftAmt = 0;
+
+  if (Immed >> 12 != 0) {
+    // Too wide for the unshifted form: only acceptable when the low 12 bits
+    // are clear and nothing is set above bit 23.
+    if ((Immed & 0xfff) != 0 || Immed >> 24 != 0)
+      return false;
+    ShiftAmt = 12;
+    Immed >>= 12;
+  }
+
+  unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+  SDLoc dl(N);
+  Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32);
+  Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32);
+  return true;
+}
+
+/// SelectNegArithImmed - Like SelectArithImmed, but the constant is negated
+/// (in the width of the operand) before the match is attempted.
+bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
+                                              SDValue &Shift) {
+  // The ComplexPattern's opcode list ([imm]) is only used for root-level
+  // matching, so the operand must still be checked for being a constant.
+  const auto *Const = dyn_cast<ConstantSDNode>(N.getNode());
+  if (!Const)
+    return false;
+
+  uint64_t Immed = Const->getZExtValue();
+
+  // Negating is almost always valid, but "cmp wN, #0" and "cmn wN, #0" have
+  // the opposite effect on the C flag, so zero must never match.
+  if (Immed == 0)
+    return false;
+
+  // Two's-complement negate; for i32 the result is kept within 32 bits.
+  const bool Is32Bit = N.getValueType() == MVT::i32;
+  Immed = Is32Bit
+              ? static_cast<uint64_t>(~static_cast<uint32_t>(Immed) + 1)
+              : ~Immed + 1ULL;
+
+  // The negated value must be a 24-bit zero-extended immediate.
+  if ((Immed >> 24) != 0)
+    return false;
+
+  SDValue Negated = CurDAG->getConstant(Immed, SDLoc(N), MVT::i32);
+  return SelectArithImmed(Negated, Val, Shift);
+}
+
+/// getShiftTypeForNode - Map the opcode of a shift/rotate node onto the
+/// matching AArch64 ShiftExtendType, or InvalidShiftExtend for any other
+/// opcode.
+static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
+  const unsigned Opcode = N.getOpcode();
+  if (Opcode == ISD::SHL)
+    return AArch64_AM::LSL;
+  if (Opcode == ISD::SRL)
+    return AArch64_AM::LSR;
+  if (Opcode == ISD::SRA)
+    return AArch64_AM::ASR;
+  if (Opcode == ISD::ROTR)
+    return AArch64_AM::ROR;
+  return AArch64_AM::InvalidShiftExtend;
+}
+
+/// \brief Decide whether folding \p V into an extended/shifted register
+/// operand is profitable.
+bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
+  // When optimizing for code size folding is always accepted.
+  if (ForCodeSize)
+    return true;
+  // Otherwise folding hurts as soon as the value has a second use.
+  return V.hasOneUse();
+}
+
+/// SelectShiftedRegister - Match a "shifted register" operand, i.e. a shift
+/// or rotate of a register by a constant amount. Logical instructions accept
+/// a rotate (ROR) while arithmetic ones do not; \p AllowROR tells which rule
+/// applies. On a match \p Reg is the shifted value and \p Shift the encoded
+/// shifter immediate; the result additionally depends on isWorthFolding().
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
+                                                SDValue &Reg, SDValue &Shift) {
+  const AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
+  const bool IsDisallowedROR = !AllowROR && ShType == AArch64_AM::ROR;
+  if (ShType == AArch64_AM::InvalidShiftExtend || IsDisallowedROR)
+    return false;
+
+  // Only constant shift amounts can be encoded.
+  auto *AmountC = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!AmountC)
+    return false;
+
+  // Reduce the amount modulo the width of the value.
+  const unsigned BitSize = N.getValueSizeInBits();
+  const unsigned Amount = AmountC->getZExtValue() & (BitSize - 1);
+  const unsigned ShVal = AArch64_AM::getShifterImm(ShType, Amount);
+
+  Reg = N.getOperand(0);
+  Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
+  return isWorthFolding(N);
+}
+
+/// getExtendTypeForNode - Classify \p N as one of the AArch64 extend kinds.
+/// Sign/zero/any extends and ANDs with an all-ones low mask are recognized.
+/// When \p IsLoadStore is set only the 32-bit forms (SXTW/UXTW) are accepted.
+/// Returns InvalidShiftExtend if \p N is not a recognized extend.
+static AArch64_AM::ShiftExtendType
+getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
+  switch (N.getOpcode()) {
+  case ISD::SIGN_EXTEND:
+  case ISD::SIGN_EXTEND_INREG: {
+    // For SIGN_EXTEND_INREG the source type is carried by operand 1.
+    const EVT SrcVT = N.getOpcode() == ISD::SIGN_EXTEND_INREG
+                          ? cast<VTSDNode>(N.getOperand(1))->getVT()
+                          : N.getOperand(0).getValueType();
+    if (SrcVT == MVT::i32)
+      return AArch64_AM::SXTW;
+    if (!IsLoadStore) {
+      if (SrcVT == MVT::i8)
+        return AArch64_AM::SXTB;
+      if (SrcVT == MVT::i16)
+        return AArch64_AM::SXTH;
+    }
+    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+    return AArch64_AM::InvalidShiftExtend;
+  }
+
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: {
+    const EVT SrcVT = N.getOperand(0).getValueType();
+    if (SrcVT == MVT::i32)
+      return AArch64_AM::UXTW;
+    if (!IsLoadStore) {
+      if (SrcVT == MVT::i8)
+        return AArch64_AM::UXTB;
+      if (SrcVT == MVT::i16)
+        return AArch64_AM::UXTH;
+    }
+    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+    return AArch64_AM::InvalidShiftExtend;
+  }
+
+  case ISD::AND: {
+    // An AND with 0xFF / 0xFFFF / 0xFFFFFFFF acts as a zero extend.
+    const auto *MaskC = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!MaskC)
+      return AArch64_AM::InvalidShiftExtend;
+
+    const uint64_t AndMask = MaskC->getZExtValue();
+    if (AndMask == 0xFFFFFFFF)
+      return AArch64_AM::UXTW;
+    if (!IsLoadStore && AndMask == 0xFF)
+      return AArch64_AM::UXTB;
+    if (!IsLoadStore && AndMask == 0xFFFF)
+      return AArch64_AM::UXTH;
+    return AArch64_AM::InvalidShiftExtend;
+  }
+
+  default:
+    return AArch64_AM::InvalidShiftExtend;
+  }
+}
+
+// Helper for checkV64LaneV128 - Recognize a DUPLANE16/DUPLANE32 whose vector
+// operand is an INSERT_SUBVECTOR of an EXTRACT_SUBVECTOR. On a match, LaneOp
+// is the vector the subvector was extracted from and LaneIdx is the sum of
+// the duplicate index and the extract index.
+static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
+  const unsigned Opcode = DL->getOpcode();
+  if (Opcode != AArch64ISD::DUPLANE16 && Opcode != AArch64ISD::DUPLANE32)
+    return false;
+
+  SDValue Inserted = DL->getOperand(0);
+  if (Inserted.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return false;
+
+  SDValue Extracted = Inserted.getOperand(1);
+  if (Extracted.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return false;
+
+  const auto *DupIdx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
+  const auto *ExtIdx = cast<ConstantSDNode>(Extracted.getOperand(1).getNode());
+  LaneIdx = DupIdx->getSExtValue() + ExtIdx->getSExtValue();
+  LaneOp = Extracted.getOperand(0);
+
+  return true;
+}
+
+// Helper for the *V64LaneV128 selectors - Recognize a pair of operands where
+// one of them is a high lane extract. On success StdOp is the other
+// (ordinary) operand and LaneOp/LaneIdx describe the lane access. Op0 is
+// tried first, then Op1.
+static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
+                             SDValue &LaneOp, int &LaneIdx) {
+  if (checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
+    StdOp = Op1;
+    return true;
+  }
+  if (checkHighLaneIndex(Op1.getNode(), LaneOp, LaneIdx)) {
+    StdOp = Op0;
+    return true;
+  }
+  return false;
+}
+
+/// tryMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand is
+/// a lane in the upper half of a 128-bit vector. One of \p N's two operands
+/// must be a multiply of that shape; the other operand is passed through as
+/// the first operand of the indexed MLA. Selecting this directly avoids
+/// emitting unnecessary lane extracts. Returns true if \p N was replaced.
+bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
+  SDLoc dl(N);
+  SDValue MLAOp1;   // Ordinary multiplicand.
+  SDValue MLAOp2;   // Vector the lane is read from.
+  int LaneIdx = -1; // Lane index within MLAOp2.
+
+  auto IsLaneMul = [&](SDValue V) {
+    return V.getOpcode() == ISD::MUL &&
+           checkV64LaneV128(V.getOperand(0), V.getOperand(1), MLAOp1, MLAOp2,
+                            LaneIdx);
+  };
+
+  // The multiply may sit on either side; try operand 1 first, then operand 0.
+  SDValue Addend = N->getOperand(0);
+  SDValue Product = N->getOperand(1);
+  if (!IsLaneMul(Product)) {
+    std::swap(Addend, Product);
+    if (!IsLaneMul(Product))
+      return false;
+  }
+
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
+  SDValue Ops[] = { Addend, MLAOp1, MLAOp2, LaneIdxVal };
+
+  // Pick the indexed MLA variant matching the result vector type.
+  unsigned MLAOpc = ~0U;
+  switch (N->getSimpleValueType(0).SimpleTy) {
+  case MVT::v4i16:
+    MLAOpc = AArch64::MLAv4i16_indexed;
+    break;
+  case MVT::v8i16:
+    MLAOpc = AArch64::MLAv8i16_indexed;
+    break;
+  case MVT::v2i32:
+    MLAOpc = AArch64::MLAv2i32_indexed;
+    break;
+  case MVT::v4i32:
+    MLAOpc = AArch64::MLAv4i32_indexed;
+    break;
+  default:
+    llvm_unreachable("Unrecognized MLA.");
+  }
+
+  ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
+  return true;
+}
+
+/// tryMULLV64LaneV128 - Select an smull/umull intrinsic (\p IntNo) whose
+/// operands 1 and 2 contain a high lane extract into the corresponding
+/// indexed SMULL/UMULL instruction. Returns false, leaving \p N untouched,
+/// when neither operand is such a lane extract; returns true once \p N has
+/// been replaced.
+bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
+  SDLoc dl(N);
+  SDValue SMULLOp0; // Ordinary multiplicand.
+  SDValue SMULLOp1; // Vector the lane is read from.
+  // Initialized like in tryMLAV64LaneV128 so the value is never
+  // indeterminate; it is overwritten whenever the match below succeeds.
+  int LaneIdx = -1;
+
+  if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
+                        LaneIdx))
+    return false;
+
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
+
+  SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
+
+  unsigned SMULLOpc = ~0U;
+
+  if (IntNo == Intrinsic::aarch64_neon_smull) {
+    switch (N->getSimpleValueType(0).SimpleTy) {
+    default:
+      llvm_unreachable("Unrecognized SMULL.");
+    case MVT::v4i32:
+      SMULLOpc = AArch64::SMULLv4i16_indexed;
+      break;
+    case MVT::v2i64:
+      SMULLOpc = AArch64::SMULLv2i32_indexed;
+      break;
+    }
+  } else if (IntNo == Intrinsic::aarch64_neon_umull) {
+    switch (N->getSimpleValueType(0).SimpleTy) {
+    default:
+      // Report the unsigned variant by its own name.
+      llvm_unreachable("Unrecognized UMULL.");
+    case MVT::v4i32:
+      SMULLOpc = AArch64::UMULLv4i16_indexed;
+      break;
+    case MVT::v2i64:
+      SMULLOpc = AArch64::UMULLv2i32_indexed;
+      break;
+    }
+  } else
+    llvm_unreachable("Unrecognized intrinsic.");
+
+  ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
+  return true;
+}
+
+/// Instructions taking an extend modifier such as UXTW want the extended
+/// register to be a GPR32, while the incoming DAG may operate on a GPR64
+/// (either via SEXT_INREG or AND). Return \p N unchanged if it is already
+/// i32; otherwise return an EXTRACT_SUBREG of its sub_32 part.
+static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
+  if (N.getValueType() != MVT::i32) {
+    SDLoc dl(N);
+    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
+    MachineSDNode *Low32 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                                  dl, MVT::i32, N, SubReg);
+    return SDValue(Low32, 0);
+  }
+
+  return N;
+}
+
+
+/// SelectArithExtendedRegister - Select a "extended register" operand. This
+/// operand folds in an extend followed by an optional left shift.
+///
+/// On a match \p Reg is the (narrowed) value being extended and \p Shift is
+/// the encoded extend-plus-shift immediate. Note that both outputs are written
+/// before the final isWorthFolding() check, so they may be set even when false
+/// is returned.
+bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
+ SDValue &Shift) {
+ unsigned ShiftVal = 0;
+ AArch64_AM::ShiftExtendType Ext;
+
+ if (N.getOpcode() == ISD::SHL) {
+ // Extend followed by a constant left shift of at most 4.
+ ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!CSD)
+ return false;
+ ShiftVal = CSD->getZExtValue();
+ if (ShiftVal > 4)
+ return false;
+
+ Ext = getExtendTypeForNode(N.getOperand(0));
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return false;
+
+ // Look through both the shift and the extend.
+ Reg = N.getOperand(0).getOperand(0);
+ } else {
+ // Plain extend with no shift (ShiftVal stays 0).
+ Ext = getExtendTypeForNode(N);
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return false;
+
+ Reg = N.getOperand(0);
+
+ // Don't match if free 32-bit -> 64-bit zext can be used instead.
+ if (Ext == AArch64_AM::UXTW &&
+ Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
+ return false;
+ }
+
+ // AArch64 mandates that the RHS of the operation must use the smallest
+ // register class that could contain the size being extended from. Thus,
+ // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
+ // there might not be an actual 32-bit value in the program. We can
+ // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
+ assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
+ Reg = narrowIfNeeded(CurDAG, Reg);
+ Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
+ MVT::i32);
+ return isWorthFolding(N);
+}
+
+/// Decide whether an ADDlow should be folded into the memory operations that
+/// use it. If any use is not itself a load/store, a real ADD has to be created
+/// anyway and folding gains nothing. In theory that would not matter, but
+/// there is a single pseudo-instruction for an ADRP/ADD pair, so
+/// over-aggressive folding leads to duplicated ADRP instructions.
+static bool isWorthFoldingADDlow(SDValue N) {
+  for (SDNode *User : N->uses()) {
+    switch (User->getOpcode()) {
+    case ISD::LOAD:
+    case ISD::STORE:
+    case ISD::ATOMIC_LOAD:
+    case ISD::ATOMIC_STORE:
+      break;
+    default:
+      return false;
+    }
+
+    // ldar and stlr have much more restrictive addressing modes (just a
+    // register), so accesses stronger than monotonic cannot take the fold.
+    if (isStrongerThanMonotonic(cast<MemSDNode>(User)->getOrdering()))
+      return false;
+  }
+
+  return true;
+}
+
+/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
+/// immediate" address. \p Size is the size in bytes of the memory reference
+/// and determines the scale. This matcher always succeeds: when no offset can
+/// be folded, \p N itself becomes the base with a zero offset.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
+                                                  SDValue &Base,
+                                                  SDValue &OffImm) {
+  SDLoc dl(N);
+  const DataLayout &DL = CurDAG->getDataLayout();
+  const TargetLowering *TLI = getTargetLowering();
+
+  // Turns a frame index node into its target form.
+  auto ToTargetFrameIndex = [&](SDValue V) {
+    int FI = cast<FrameIndexSDNode>(V)->getIndex();
+    return CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+  };
+
+  if (N.getOpcode() == ISD::FrameIndex) {
+    Base = ToTargetFrameIndex(N);
+    OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+    return true;
+  }
+
+  // As opposed to the (12-bit) Indexed addressing mode, the 7-bit signed form
+  // selected here doesn't support labels/immediates, only base+offset.
+  ConstantSDNode *OffsetC = nullptr;
+  if (CurDAG->isBaseWithConstantOffset(N))
+    OffsetC = dyn_cast<ConstantSDNode>(N.getOperand(1));
+
+  if (OffsetC) {
+    const int64_t RHSC = OffsetC->getSExtValue();
+    const unsigned Scale = Log2_32(Size);
+    const int64_t Limit = 0x40 << Scale;
+    const bool IsAligned = (RHSC & (Size - 1)) == 0;
+    if (IsAligned && RHSC >= -Limit && RHSC < Limit) {
+      SDValue Addr = N.getOperand(0);
+      Base = Addr.getOpcode() == ISD::FrameIndex ? ToTargetFrameIndex(Addr)
+                                                 : Addr;
+      OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+      return true;
+    }
+  }
+
+  // Base only. The address will be materialized into a register before
+  // the memory is accessed.
+  //     add x0, Xbase, #offset
+  //     stp x1, x2, [x0]
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+  return true;
+}
+
+/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
+/// immediate" address. The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
+///
+/// Returns false only when the unscaled addressing mode can handle \p N; in
+/// every other case it succeeds, falling back to using \p N as the base with
+/// a zero offset.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
+ SDValue &Base, SDValue &OffImm) {
+ SDLoc dl(N);
+ const DataLayout &DL = CurDAG->getDataLayout();
+ const TargetLowering *TLI = getTargetLowering();
+ // A bare frame index: use it as the base with offset 0.
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+ return true;
+ }
+
+ // ADDlow whose uses are all foldable memory operations: use its two operands
+ // as base and offset directly. For a global address this is only accepted
+ // when the global's alignment is at least the access size; otherwise control
+ // falls through to the cases below (Base/OffImm are overwritten there).
+ if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
+ GlobalAddressSDNode *GAN =
+ dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
+ Base = N.getOperand(0);
+ OffImm = N.getOperand(1);
+ if (!GAN)
+ return true;
+
+ const GlobalValue *GV = GAN->getGlobal();
+ unsigned Alignment = GV->getAlignment();
+ Type *Ty = GV->getValueType();
+ // No explicit alignment: use the ABI alignment of the value type.
+ if (Alignment == 0 && Ty->isSized())
+ Alignment = DL.getABITypeAlignment(Ty);
+
+ if (Alignment >= Size)
+ return true;
+ }
+
+ // Base plus a constant that is a non-negative multiple of Size and below
+ // 0x1000 * Size: encode the offset divided by Size.
+ if (CurDAG->isBaseWithConstantOffset(N)) {
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int64_t RHSC = (int64_t)RHS->getZExtValue();
+ unsigned Scale = Log2_32(Size);
+ if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+ return true;
+ }
+ }
+ }
+
+ // Before falling back to our general case, check if the unscaled
+ // instructions can handle this. If so, that's preferable.
+ if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
+ return false;
+
+ // Base only. The address will be materialized into a register before
+ // the memory is accessed.
+ // add x0, Xbase, #offset
+ // ldr x0, [x0]
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+ return true;
+}
+
+/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
+/// immediate" address. This only matches when the offset is not valid for the
+/// scaled immediate addressing mode; \p Size (the size in bytes of the memory
+/// reference) is needed to know what the scaled form would accept.
+bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
+                                                 SDValue &Base,
+                                                 SDValue &OffImm) {
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  auto *OffsetC = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!OffsetC)
+    return false;
+
+  const int64_t RHSC = OffsetC->getSExtValue();
+
+  // If the offset is valid as a scaled immediate, don't match here.
+  if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
+      RHSC < (0x1000 << Log2_32(Size)))
+    return false;
+
+  // Outside the signed 9-bit range: not encodable.
+  if (RHSC < -256 || RHSC >= 256)
+    return false;
+
+  Base = N.getOperand(0);
+  if (Base.getOpcode() == ISD::FrameIndex) {
+    const int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+    const TargetLowering *TLI = getTargetLowering();
+    Base = CurDAG->getTargetFrameIndex(
+        FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+  }
+  OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
+  return true;
+}
+
+/// Wrap \p N in an i64 value by inserting it as the sub_32 subregister of an
+/// IMPLICIT_DEF.
+static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
+  SDLoc dl(N);
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
+
+  MachineSDNode *Undef =
+      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64);
+  SDValue ImpDef(Undef, 0);
+
+  MachineSDNode *Inserted = CurDAG->getMachineNode(
+      TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
+  return SDValue(Inserted, 0);
+}
+
+/// \brief Check whether the SHL node \p N can be used as the (optionally
+/// extended) shifted offset register of an addressing mode for an access of
+/// \p Size bytes. With \p WantExtend the shifted value must itself be a
+/// 32-bit extend, whose source becomes \p Offset; otherwise the shifted value
+/// is used as \p Offset directly. \p SignExtend is set to 1 only for SXTW.
+bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
+                                            bool WantExtend, SDValue &Offset,
+                                            SDValue &SignExtend) {
+  assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
+
+  // The shift amount must be a constant in the range [0, 7].
+  ConstantSDNode *AmountC = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!AmountC)
+    return false;
+  const uint64_t RawAmount = AmountC->getZExtValue();
+  if ((RawAmount & 0x7) != RawAmount)
+    return false;
+
+  SDLoc dl(N);
+  if (!WantExtend) {
+    Offset = N.getOperand(0);
+    SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
+  } else {
+    const AArch64_AM::ShiftExtendType Ext =
+        getExtendTypeForNode(N.getOperand(0), true);
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
+                                           MVT::i32);
+  }
+
+  // Only "no shift" or a shift by log2(Size) is encodable.
+  const unsigned LegalShiftVal = Log2_32(Size);
+  const unsigned ShiftVal = static_cast<unsigned>(RawAmount);
+  if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
+    return false;
+
+  return isWorthFolding(N);
+}
+
+/// Select a "base register plus extended 32-bit offset register" address from
+/// an ADD. \p Size is the access size in bytes. On success \p Base and
+/// \p Offset are the two registers, \p SignExtend is 1 for a sign extend
+/// (SXTW) and 0 otherwise, and \p DoShift tells whether the offset is shifted.
+/// The output parameters may be written even when false is returned.
+bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
+ SDValue &Base, SDValue &Offset,
+ SDValue &SignExtend,
+ SDValue &DoShift) {
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+ SDValue LHS = N.getOperand(0);
+ SDValue RHS = N.getOperand(1);
+ SDLoc dl(N);
+
+ // We don't want to match immediate adds here, because they are better lowered
+ // to the register-immediate addressing modes.
+ if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
+ return false;
+
+ // Check if this particular node is reused in any non-memory related
+ // operation. If yes, do not try to fold this node into the address
+ // computation, since the computation will be kept.
+ const SDNode *Node = N.getNode();
+ for (SDNode *UI : Node->uses()) {
+ if (!isa<MemSDNode>(*UI))
+ return false;
+ }
+
+ // Remember if it is worth folding N when it produces extended register.
+ bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+ // Try to match a shifted extend on the RHS.
+ if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
+ Base = LHS;
+ DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
+ return true;
+ }
+
+ // Try to match a shifted extend on the LHS.
+ if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
+ Base = RHS;
+ DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
+ return true;
+ }
+
+ // There was no shift, whatever else we find.
+ DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
+
+ AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
+ // Try to match an unshifted extend on the LHS.
+ // The outputs are written first; if folding LHS is not worthwhile, control
+ // falls through and the RHS attempt below may overwrite them.
+ if (IsExtendedRegisterWorthFolding &&
+ (Ext = getExtendTypeForNode(LHS, true)) !=
+ AArch64_AM::InvalidShiftExtend) {
+ Base = RHS;
+ Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
+ SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
+ MVT::i32);
+ if (isWorthFolding(LHS))
+ return true;
+ }
+
+ // Try to match an unshifted extend on the RHS.
+ if (IsExtendedRegisterWorthFolding &&
+ (Ext = getExtendTypeForNode(RHS, true)) !=
+ AArch64_AM::InvalidShiftExtend) {
+ Base = LHS;
+ Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
+ SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
+ MVT::i32);
+ if (isWorthFolding(RHS))
+ return true;
+ }
+
+ // No extended-register form matched.
+ return false;
+}
+
+// Check if the given immediate is preferred by ADD. Returns true when the
+// immediate fits a plain ADD (range [0x0, 0xfff]), or when it fits an
+// "ADD LSL #12" and can not be produced by one MOVZ instead.
+static bool isPreferredADD(int64_t ImmOff) {
+  const uint64_t Bits = static_cast<uint64_t>(ImmOff);
+
+  // Constant in [0x0, 0xfff] can be encoded in ADD.
+  if ((Bits >> 12) == 0)
+    return true;
+
+  // "ADD LSL #12" needs the low 12 bits clear and nothing above bit 23.
+  if ((Bits & 0xfffULL) != 0 || (Bits >> 24) != 0)
+    return false;
+
+  // As a single MOVZ is faster than an "ADD of LSL #12", ignore constants
+  // whose set bits all lie within bits [12,15] or all within bits [16,23].
+  const bool HasBits12To15 = (Bits & 0xf000ULL) != 0;
+  const bool HasBits16To23 = (Bits & 0xff0000ULL) != 0;
+  return HasBits12To15 && HasBits16To23;
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
+ SDValue &Base, SDValue &Offset,
+ SDValue &SignExtend,
+ SDValue &DoShift) {
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+ SDValue LHS = N.getOperand(0);
+ SDValue RHS = N.getOperand(1);
+ SDLoc DL(N);
+
+ // Check if this particular node is reused in any non-memory related
+ // operation. If yes, do not try to fold this node into the address
+ // computation, since the computation will be kept.
+ const SDNode *Node = N.getNode();
+ for (SDNode *UI : Node->uses()) {
+ if (!isa<MemSDNode>(*UI))
+ return false;
+ }
+
+ // Watch out if RHS is a wide immediate, it can not be selected into
+ // [BaseReg+Imm] addressing mode. Also it may not be able to be encoded into
+ // ADD/SUB. Instead it will use [BaseReg + 0] address mode and generate
+ // instructions like:
+ // MOV X0, WideImmediate
+ // ADD X1, BaseReg, X0
+ // LDR X2, [X1, 0]
+ // For such situation, using [BaseReg, XReg] addressing mode can save one
+ // ADD/SUB:
+ // MOV X0, WideImmediate
+ // LDR X2, [BaseReg, X0]
+ if (isa<ConstantSDNode>(RHS)) {
+ int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
+ unsigned Scale = Log2_32(Size);
+ // Skip the immediate can be selected by load/store addressing mode.
+ // Also skip the immediate can be encoded by a single ADD (SUB is also
+ // checked by using -ImmOff).
+ if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
+ isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
+ return false;
+
+ SDValue Ops[] = { RHS };
+ SDNode *MOVI =
+ CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
+ SDValue MOVIV = SDValue(MOVI, 0);
+ // This ADD of two X register will be selected into [Reg+Reg] mode.
+ N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
+ }
+
+ // Remember if it is worth folding N when it produces extended register.
+ bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+ // Try to match a shifted extend on the RHS.
+ if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
+ Base = LHS;
+ DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
+ return true;
+ }
+
+ // Try to match a shifted extend on the LHS.
+ if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
+ Base = RHS;
+ DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
+ return true;
+ }
+
+ // Match any non-shifted, non-extend, non-immediate add expression.
+ Base = LHS;
+ Offset = RHS;
+ SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
+ DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
+ // Reg1 + Reg2 is free: no check needed.
+ return true;
+}
+
+SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
+ static const unsigned RegClassIDs[] = {
+ AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
+ static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
+ AArch64::dsub2, AArch64::dsub3};
+
+ return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
+ static const unsigned RegClassIDs[] = {
+ AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
+ static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3};
+
+ return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
+ const unsigned RegClassIDs[],
+ const unsigned SubRegs[]) {
+ // There's no special register-class for a vector-list of 1 element: it's just
+ // a vector.
+ if (Regs.size() == 1)
+ return Regs[0];
+
+ assert(Regs.size() >= 2 && Regs.size() <= 4);
+
+ SDLoc DL(Regs[0]);
+
+ SmallVector<SDValue, 4> Ops;
+
+ // First operand of REG_SEQUENCE is the desired RegClass.
+ Ops.push_back(
+ CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
+
+ // Then we get pairs of source & subregister-position for the components.
+ for (unsigned i = 0; i < Regs.size(); ++i) {
+ Ops.push_back(Regs[i]);
+ Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
+ }
+
+ SDNode *N =
+ CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
+ return SDValue(N, 0);
+}
+
+void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
+ bool isExt) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ unsigned ExtOff = isExt;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ unsigned Vec0Off = ExtOff + 1;
+ SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
+ N->op_begin() + Vec0Off + NumVecs);
+ SDValue RegSeq = createQTuple(Regs);
+
+ SmallVector<SDValue, 6> Ops;
+ if (isExt)
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(RegSeq);
+ Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
+}
+
+bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (LD->isUnindexed())
+ return false;
+ EVT VT = LD->getMemoryVT();
+ EVT DstVT = N->getValueType(0);
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
+
+ // We're not doing validity checking here. That was done when checking
+ // if we should mark the load as indexed or not. We're just selecting
+ // the right instruction.
+ unsigned Opcode = 0;
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ bool InsertTo64 = false;
+ if (VT == MVT::i64)
+ Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
+ else if (VT == MVT::i32) {
+ if (ExtType == ISD::NON_EXTLOAD)
+ Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+ else if (ExtType == ISD::SEXTLOAD)
+ Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
+ else {
+ Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+ InsertTo64 = true;
+ // The result of the load is only i32. It's the subreg_to_reg that makes
+ // it into an i64.
+ DstVT = MVT::i32;
+ }
+ } else if (VT == MVT::i16) {
+ if (ExtType == ISD::SEXTLOAD) {
+ if (DstVT == MVT::i64)
+ Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
+ else
+ Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
+ } else {
+ Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
+ InsertTo64 = DstVT == MVT::i64;
+ // The result of the load is only i32. It's the subreg_to_reg that makes
+ // it into an i64.
+ DstVT = MVT::i32;
+ }
+ } else if (VT == MVT::i8) {
+ if (ExtType == ISD::SEXTLOAD) {
+ if (DstVT == MVT::i64)
+ Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
+ else
+ Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
+ } else {
+ Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
+ InsertTo64 = DstVT == MVT::i64;
+ // The result of the load is only i32. It's the subreg_to_reg that makes
+ // it into an i64.
+ DstVT = MVT::i32;
+ }
+ } else if (VT == MVT::f16) {
+ Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
+ } else if (VT == MVT::f32) {
+ Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
+ } else if (VT == MVT::f64 || VT.is64BitVector()) {
+ Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
+ } else if (VT.is128BitVector()) {
+ Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
+ } else
+ return false;
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
+ int OffsetVal = (int)OffsetOp->getZExtValue();
+ SDLoc dl(N);
+ SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
+ SDValue Ops[] = { Base, Offset, Chain };
+ SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
+ MVT::Other, Ops);
+ // Either way, we're replacing the node, so tell the caller that.
+ SDValue LoadedVal = SDValue(Res, 1);
+ if (InsertTo64) {
+ SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
+ LoadedVal =
+ SDValue(CurDAG->getMachineNode(
+ AArch64::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
+ SubReg),
+ 0);
+ }
+
+ ReplaceUses(SDValue(N, 0), LoadedVal);
+ ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
+ ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
+ CurDAG->RemoveDeadNode(N);
+ return true;
+}
+
+void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ unsigned SubRegIdx) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Chain = N->getOperand(0);
+
+ SDValue Ops[] = {N->getOperand(2), // Mem operand;
+ Chain};
+
+ const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ SDValue SuperReg = SDValue(Ld, 0);
+ for (unsigned i = 0; i < NumVecs; ++i)
+ ReplaceUses(SDValue(N, i),
+ CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+ CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
+ unsigned Opc, unsigned SubRegIdx) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Chain = N->getOperand(0);
+
+ SDValue Ops[] = {N->getOperand(1), // Mem operand
+ N->getOperand(2), // Incremental
+ Chain};
+
+ const EVT ResTys[] = {MVT::i64, // Type of the write back register
+ MVT::Untyped, MVT::Other};
+
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Update uses of write back register
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+ // Update uses of vector list
+ SDValue SuperReg = SDValue(Ld, 1);
+ if (NumVecs == 1)
+ ReplaceUses(SDValue(N, 0), SuperReg);
+ else
+ for (unsigned i = 0; i < NumVecs; ++i)
+ ReplaceUses(SDValue(N, i),
+ CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+
+ // Update the chain
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+ CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
+
+ // Form a REG_SEQUENCE to force register allocation.
+ bool Is128Bit = VT.getSizeInBits() == 128;
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+ SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
+
+ SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceNode(N, St);
+}
+
+void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
+ const EVT ResTys[] = {MVT::i64, // Type of the write back register
+ MVT::Other}; // Type for the Chain
+
+ // Form a REG_SEQUENCE to force register allocation.
+ bool Is128Bit = VT.getSizeInBits() == 128;
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+ SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
+
+ SDValue Ops[] = {RegSeq,
+ N->getOperand(NumVecs + 1), // base register
+ N->getOperand(NumVecs + 2), // Incremental
+ N->getOperand(0)}; // Chain
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ ReplaceNode(N, St);
+}
+
+namespace {
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+class WidenVector {
+ SelectionDAG &DAG;
+
+public:
+ WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
+
+ SDValue operator()(SDValue V64Reg) {
+ EVT VT = V64Reg.getValueType();
+ unsigned NarrowSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+ SDLoc DL(V64Reg);
+
+ SDValue Undef =
+ SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
+ return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
+ }
+};
+} // namespace
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+ EVT VT = V128Reg.getValueType();
+ unsigned WideSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+
+ return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
+ V128Reg);
+}
+
+void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+ if (Narrow)
+ transform(Regs, Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+ SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
+ N->getOperand(NumVecs + 3), N->getOperand(0)};
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ SDValue SuperReg = SDValue(Ld, 0);
+
+ EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+ static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3 };
+ for (unsigned i = 0; i < NumVecs; ++i) {
+ SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
+ if (Narrow)
+ NV = NarrowVector(NV, *CurDAG);
+ ReplaceUses(SDValue(N, i), NV);
+ }
+
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+ CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+ if (Narrow)
+ transform(Regs, Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ const EVT ResTys[] = {MVT::i64, // Type of the write back register
+ RegSeq->getValueType(0), MVT::Other};
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+ SDValue Ops[] = {RegSeq,
+ CurDAG->getTargetConstant(LaneNo, dl,
+ MVT::i64), // Lane Number
+ N->getOperand(NumVecs + 2), // Base register
+ N->getOperand(NumVecs + 3), // Incremental
+ N->getOperand(0)};
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Update uses of the write back register
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+ // Update uses of the vector list
+ SDValue SuperReg = SDValue(Ld, 1);
+ if (NumVecs == 1) {
+ ReplaceUses(SDValue(N, 0),
+ Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
+ } else {
+ EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+ static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3 };
+ for (unsigned i = 0; i < NumVecs; ++i) {
+ SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
+ SuperReg);
+ if (Narrow)
+ NV = NarrowVector(NV, *CurDAG);
+ ReplaceUses(SDValue(N, i), NV);
+ }
+ }
+
+ // Update the Chain
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+ CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+ if (Narrow)
+ transform(Regs, Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+ SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
+ N->getOperand(NumVecs + 3), N->getOperand(0)};
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceNode(N, St);
+}
+
+void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+ if (Narrow)
+ transform(Regs, Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ const EVT ResTys[] = {MVT::i64, // Type of the write back register
+ MVT::Other};
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+ SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
+ N->getOperand(NumVecs + 2), // Base Register
+ N->getOperand(NumVecs + 3), // Incremental
+ N->getOperand(0)};
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceNode(N, St);
+}
+
+static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
+ unsigned &Opc, SDValue &Opd0,
+ unsigned &LSB, unsigned &MSB,
+ unsigned NumberOfIgnoredLowBits,
+ bool BiggerPattern) {
+ assert(N->getOpcode() == ISD::AND &&
+ "N must be a AND operation to call this function");
+
+ EVT VT = N->getValueType(0);
+
+ // Here we can test the type of VT and return false when the type does not
+ // match, but since it is done prior to that call in the current context
+ // we turned that into an assert to avoid redundant code.
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Type checking must have been done before calling this function");
+
+ // FIXME: simplify-demanded-bits in DAGCombine will probably have
+ // changed the AND node to a 32-bit mask operation. We'll have to
+ // undo that as part of the transform here if we want to catch all
+ // the opportunities.
+ // Currently the NumberOfIgnoredLowBits argument helps to recover
+ // form these situations when matching bigger pattern (bitfield insert).
+
+ // For unsigned extracts, check for a shift right and mask
+ uint64_t AndImm = 0;
+ if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
+ return false;
+
+ const SDNode *Op0 = N->getOperand(0).getNode();
+
+ // Because of simplify-demanded-bits in DAGCombine, the mask may have been
+ // simplified. Try to undo that
+ AndImm |= (1 << NumberOfIgnoredLowBits) - 1;
+
+ // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+ if (AndImm & (AndImm + 1))
+ return false;
+
+ bool ClampMSB = false;
+ uint64_t SrlImm = 0;
+ // Handle the SRL + ANY_EXTEND case.
+ if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
+ isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
+ // Extend the incoming operand of the SRL to 64-bit.
+ Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
+ // Make sure to clamp the MSB so that we preserve the semantics of the
+ // original operations.
+ ClampMSB = true;
+ } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
+ isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
+ SrlImm)) {
+ // If the shift result was truncated, we can still combine them.
+ Opd0 = Op0->getOperand(0).getOperand(0);
+
+ // Use the type of SRL node.
+ VT = Opd0->getValueType(0);
+ } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
+ Opd0 = Op0->getOperand(0);
+ } else if (BiggerPattern) {
+ // Let's pretend a 0 shift right has been performed.
+ // The resulting code will be at least as good as the original one
+ // plus it may expose more opportunities for bitfield insert pattern.
+ // FIXME: Currently we limit this to the bigger pattern, because
+ // some optimizations expect AND and not UBFM.
+ Opd0 = N->getOperand(0);
+ } else
+ return false;
+
+ // Bail out on large immediates. This happens when no proper
+ // combining/constant folding was performed.
+ if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
+ DEBUG((dbgs() << N
+ << ": Found large shift immediate, this should not happen\n"));
+ return false;
+ }
+
+ LSB = SrlImm;
+ MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
+ : countTrailingOnes<uint64_t>(AndImm)) -
+ 1;
+ if (ClampMSB)
+ // Since we're moving the extend before the right shift operation, we need
+ // to clamp the MSB to make sure we don't shift in undefined bits instead of
+ // the zeros which would get shifted in with the original right shift
+ // operation.
+ MSB = MSB > 31 ? 31 : MSB;
+
+ Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+ return true;
+}
+
+static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
+ SDValue &Opd0, unsigned &Immr,
+ unsigned &Imms) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+ EVT VT = N->getValueType(0);
+ unsigned BitWidth = VT.getSizeInBits();
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Type checking must have been done before calling this function");
+
+ SDValue Op = N->getOperand(0);
+ if (Op->getOpcode() == ISD::TRUNCATE) {
+ Op = Op->getOperand(0);
+ VT = Op->getValueType(0);
+ BitWidth = VT.getSizeInBits();
+ }
+
+ uint64_t ShiftImm;
+ if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
+ !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
+ return false;
+
+ unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+ if (ShiftImm + Width > BitWidth)
+ return false;
+
+ Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
+ Opd0 = Op.getOperand(0);
+ Immr = ShiftImm;
+ Imms = ShiftImm + Width - 1;
+ return true;
+}
+
+static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
+ SDValue &Opd0, unsigned &LSB,
+ unsigned &MSB) {
+ // We are looking for the following pattern which basically extracts several
+ // continuous bits from the source value and places it from the LSB of the
+ // destination value, all other bits of the destination value or set to zero:
+ //
+ // Value2 = AND Value, MaskImm
+ // SRL Value2, ShiftImm
+ //
+ // with MaskImm >> ShiftImm to search for the bit width.
+ //
+ // This gets selected into a single UBFM:
+ //
+ // UBFM Value, ShiftImm, BitWide + SrlImm -1
+ //
+
+ if (N->getOpcode() != ISD::SRL)
+ return false;
+
+ uint64_t AndMask = 0;
+ if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
+ return false;
+
+ Opd0 = N->getOperand(0).getOperand(0);
+
+ uint64_t SrlImm = 0;
+ if (!isIntImmediate(N->getOperand(1), SrlImm))
+ return false;
+
+ // Check whether we really have several bits extract here.
+ unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
+ if (BitWide && isMask_64(AndMask >> SrlImm)) {
+ if (N->getValueType(0) == MVT::i32)
+ Opc = AArch64::UBFMWri;
+ else
+ Opc = AArch64::UBFMXri;
+
+ LSB = SrlImm;
+ MSB = BitWide + SrlImm - 1;
+ return true;
+ }
+
+ return false;
+}
+
+static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+ unsigned &Immr, unsigned &Imms,
+ bool BiggerPattern) {
+ assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
+ "N must be a SHR/SRA operation to call this function");
+
+ EVT VT = N->getValueType(0);
+
+ // Here we can test the type of VT and return false when the type does not
+ // match, but since it is done prior to that call in the current context
+ // we turned that into an assert to avoid redundant code.
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Type checking must have been done before calling this function");
+
+ // Check for AND + SRL doing several bits extract.
+ if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
+ return true;
+
+ // We're looking for a shift of a shift.
+ uint64_t ShlImm = 0;
+ uint64_t TruncBits = 0;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
+ Opd0 = N->getOperand(0).getOperand(0);
+ } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
+ // We are looking for a shift of truncate. Truncate from i64 to i32 could
+ // be considered as setting high 32 bits as zero. Our strategy here is to
+ // always generate 64bit UBFM. This consistency will help the CSE pass
+ // later find more redundancy.
+ Opd0 = N->getOperand(0).getOperand(0);
+ TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
+ VT = Opd0->getValueType(0);
+ assert(VT == MVT::i64 && "the promoted type should be i64");
+ } else if (BiggerPattern) {
+ // Let's pretend a 0 shift left has been performed.
+ // FIXME: Currently we limit this to the bigger pattern case,
+ // because some optimizations expect AND and not UBFM
+ Opd0 = N->getOperand(0);
+ } else
+ return false;
+
+ // Missing combines/constant folding may have left us with strange
+ // constants.
+ if (ShlImm >= VT.getSizeInBits()) {
+ DEBUG((dbgs() << N
+ << ": Found large shift immediate, this should not happen\n"));
+ return false;
+ }
+
+ uint64_t SrlImm = 0;
+ if (!isIntImmediate(N->getOperand(1), SrlImm))
+ return false;
+
+ assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
+ "bad amount in shift node!");
+ int immr = SrlImm - ShlImm;
+ Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
+ Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
+ // SRA requires a signed extraction
+ if (VT == MVT::i32)
+ Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
+ else
+ Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
+ return true;
+}
+
+bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND);
+
+ EVT VT = N->getValueType(0);
+ EVT NarrowVT = N->getOperand(0)->getValueType(0);
+ if (VT != MVT::i64 || NarrowVT != MVT::i32)
+ return false;
+
+ uint64_t ShiftImm;
+ SDValue Op = N->getOperand(0);
+ if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
+ return false;
+
+ SDLoc dl(N);
+ // Extend the incoming operand of the shift to 64-bits.
+ SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
+ unsigned Immr = ShiftImm;
+ unsigned Imms = NarrowVT.getSizeInBits() - 1;
+ SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
+ CurDAG->getTargetConstant(Imms, dl, VT)};
+ CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
+ return true;
+}
+
+static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
+ SDValue &Opd0, unsigned &Immr, unsigned &Imms,
+ unsigned NumberOfIgnoredLowBits = 0,
+ bool BiggerPattern = false) {
+ if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+ if (!N->isMachineOpcode())
+ return false;
+ break;
+ case ISD::AND:
+ return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
+ NumberOfIgnoredLowBits, BiggerPattern);
+ case ISD::SRL:
+ case ISD::SRA:
+ return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
+
+ case ISD::SIGN_EXTEND_INREG:
+ return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
+ }
+
+ unsigned NOpc = N->getMachineOpcode();
+ switch (NOpc) {
+ default:
+ return false;
+ case AArch64::SBFMWri:
+ case AArch64::UBFMWri:
+ case AArch64::SBFMXri:
+ case AArch64::UBFMXri:
+ Opc = NOpc;
+ Opd0 = N->getOperand(0);
+ Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+ Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+ return true;
+ }
+ // Unreachable
+ return false;
+}
+
+bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
+ unsigned Opc, Immr, Imms;
+ SDValue Opd0;
+ if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
+ return false;
+
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // If the bit extract operation is 64bit but the original type is 32bit, we
+ // need to add one EXTRACT_SUBREG.
+ if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
+ SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
+ CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
+
+ SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
+ SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
+ ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
+ MVT::i32, SDValue(BFM, 0), SubReg));
+ return true;
+ }
+
+ SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
+ CurDAG->getTargetConstant(Imms, dl, VT)};
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
+}
+
+/// Does DstMask form a complementary pair with the mask provided by
+/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
+/// this asks whether DstMask zeroes precisely those bits that will be set by
+/// the other half.
+static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
+ unsigned NumberOfIgnoredHighBits, EVT VT) {
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "i32 or i64 mask type expected!");
+ unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
+
+ APInt SignificantDstMask = APInt(BitWidth, DstMask);
+ APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
+
+ return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
+ (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
+}
+
+// Look for bits that will be useful for later uses.
+// A bit is consider useless as soon as it is dropped and never used
+// before it as been dropped.
+// E.g., looking for useful bit of x
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// After #1, x useful bits are 0x7, then the useful bits of x, live through
+// y.
+// After #2, the useful bits of x are 0x4.
+// However, if x is used on an unpredicatable instruction, then all its bits
+// are useful.
+// E.g.
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// 3. str x, [@x]
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
+
+static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t Imm =
+ cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+ Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
+ UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
+ getUsefulBits(Op, UsefulBits, Depth + 1);
+}
+
+static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
+ uint64_t Imm, uint64_t MSB,
+ unsigned Depth) {
+ // inherit the bitwidth value
+ APInt OpUsefulBits(UsefulBits);
+ OpUsefulBits = 1;
+
+ if (MSB >= Imm) {
+ OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+ --OpUsefulBits;
+ // The interesting part will be in the lower part of the result
+ getUsefulBits(Op, OpUsefulBits, Depth + 1);
+ // The interesting part was starting at Imm in the argument
+ OpUsefulBits = OpUsefulBits.shl(Imm);
+ } else {
+ OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+ --OpUsefulBits;
+ // The interesting part will be shifted in the result
+ OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
+ getUsefulBits(Op, OpUsefulBits, Depth + 1);
+ // The interesting part was at zero in the argument
+ OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
+ }
+
+ UsefulBits &= OpUsefulBits;
+}
+
+static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t Imm =
+ cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+ uint64_t MSB =
+ cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+
+ getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+}
+
+// Narrow UsefulBits for a value that is shifted by the ORR node Op. Operand 2
+// of Op carries the encoded shift type and amount. Only LSL and LSR are
+// handled; for any other shift type UsefulBits is left unchanged.
+static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
+                                              unsigned Depth) {
+  const uint64_t ShiftEnc =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+  const auto ShiftKind = AArch64_AM::getShiftType(ShiftEnc);
+
+  // We do not handle AArch64_AM::ASR, because the sign will change the
+  // number of useful bits.
+  const bool IsLeftShift = ShiftKind == AArch64_AM::LSL;
+  if (!IsLeftShift && ShiftKind != AArch64_AM::LSR)
+    return;
+
+  const uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftEnc);
+
+  // Start from an all-ones mask of the same width as UsefulBits.
+  APInt Mask(UsefulBits);
+  Mask.clearAllBits();
+  Mask.flipAllBits();
+
+  // Apply the shift to the mask, ask which of the resulting bits the users of
+  // Op care about, then undo the shift to express them in terms of the
+  // unshifted value.
+  Mask = IsLeftShift ? Mask.shl(ShiftAmt) : Mask.lshr(ShiftAmt);
+  getUsefulBits(Op, Mask, Depth + 1);
+  Mask = IsLeftShift ? Mask.lshr(ShiftAmt) : Mask.shl(ShiftAmt);
+
+  UsefulBits &= Mask;
+}
+
+// Narrow UsefulBits for the value Orig when it feeds the BFM node Op, either
+// as operand 0 (the value being inserted into) or as operand 1 (the source of
+// the inserted field). Operands 2 and 3 of Op are the two BFM immediates.
+static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
+                                 unsigned Depth) {
+  const unsigned RegSize = UsefulBits.getBitWidth();
+  const uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+  const uint64_t MSB =
+      cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
+
+  const bool IsInsertee = Op.getOperand(0) == Orig;
+  const bool IsFieldSource = Op.getOperand(1) == Orig;
+
+  // Which bits of the BFM result do its own users care about?
+  APInt ResultUsefulBits(RegSize, 0);
+  ResultUsefulBits.flipAllBits();
+  getUsefulBits(Op, ResultUsefulBits, Depth + 1);
+
+  // Copy-construct to inherit the bit width, then reset the value to 1.
+  APInt FieldBits(UsefulBits);
+  FieldBits = 1;
+
+  APInt Mask(RegSize, 0);
+
+  if (MSB >= Imm) {
+    // The instruction is a BFXIL.
+    const uint64_t Width = MSB - Imm + 1;
+    const uint64_t LSB = Imm;
+
+    FieldBits = FieldBits.shl(Width);
+    --FieldBits;
+
+    // Copy the low bits from the result to bits starting from LSB.
+    if (IsFieldSource)
+      Mask = (ResultUsefulBits & FieldBits).shl(LSB);
+  } else {
+    // The instruction is a BFI.
+    const uint64_t Width = MSB + 1;
+    const uint64_t LSB = RegSize - Imm;
+
+    FieldBits = FieldBits.shl(Width);
+    --FieldBits;
+    FieldBits = FieldBits.shl(LSB);
+
+    // Copy the bits from the result to the zero bits.
+    if (IsFieldSource)
+      Mask = (ResultUsefulBits & FieldBits).lshr(LSB);
+  }
+
+  // Result bits outside the inserted field come straight from operand 0.
+  if (IsInsertee)
+    Mask |= (ResultUsefulBits & ~FieldBits);
+
+  UsefulBits &= Mask;
+}
+
+// Narrow UsefulBits according to how the single user UserNode consumes the
+// value Orig. Opcodes that are not listed below leave UsefulBits unchanged.
+static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
+                                SDValue Orig, unsigned Depth) {
+  // Users of this node should have already been instruction selected
+  // FIXME: Can we turn that into an assert?
+  if (!UserNode->isMachineOpcode())
+    return;
+
+  const SDValue UserVal(UserNode, 0);
+  const unsigned RegSize = UsefulBits.getBitWidth();
+
+  // Depth is passed through unchanged here; it is only incremented by the
+  // helpers when they call getUsefulBits.
+  switch (UserNode->getMachineOpcode()) {
+  case AArch64::ANDSWri:
+  case AArch64::ANDSXri:
+  case AArch64::ANDWri:
+  case AArch64::ANDXri:
+    getUsefulBitsFromAndWithImmediate(UserVal, UsefulBits, Depth);
+    break;
+
+  case AArch64::UBFMWri:
+  case AArch64::UBFMXri:
+    getUsefulBitsFromUBFM(UserVal, UsefulBits, Depth);
+    break;
+
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+    // Only the shifted operand (operand 1) is analysed.
+    if (UserNode->getOperand(1) == Orig)
+      getUsefulBitsFromOrWithShiftedReg(UserVal, UsefulBits, Depth);
+    break;
+
+  case AArch64::BFMWri:
+  case AArch64::BFMXri:
+    getUsefulBitsFromBFM(UserVal, Orig, UsefulBits, Depth);
+    break;
+
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+    // Only applies when Orig is operand 0 of the store: keep the low 8 bits.
+    if (UserNode->getOperand(0) == Orig)
+      UsefulBits &= APInt(RegSize, 0xff);
+    break;
+
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+    // Only applies when Orig is operand 0 of the store: keep the low 16 bits.
+    if (UserNode->getOperand(0) == Orig)
+      UsefulBits &= APInt(RegSize, 0xffff);
+    break;
+
+  default:
+    break;
+  }
+}
+
+// Compute which bits of Op are consumed by its users. On the outermost call
+// (Depth == 0) UsefulBits is initialised to all ones at the scalar width of
+// Op; on nested calls it is only ever narrowed. The walk stops, leaving
+// UsefulBits untouched, once Depth reaches 6.
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
+  if (Depth >= 6)
+    return;
+
+  if (Depth == 0) {
+    // At the beginning, assume every produced bit is useful.
+    UsefulBits = APInt(Op.getScalarValueSizeInBits(), 0);
+    UsefulBits.flipAllBits();
+  }
+
+  // Accumulate the union of what each individual user needs.
+  APInt UnionOfUses(UsefulBits.getBitWidth(), 0);
+  for (SDNode *User : Op.getNode()->uses()) {
+    // Each user starts from the current mask: a use can only drop bits, it
+    // cannot produce useful ones.
+    APInt SeenByUser(UsefulBits);
+    getUsefulBitsForUse(User, SeenByUser, Op, Depth);
+    UnionOfUses |= SeenByUser;
+  }
+
+  // UsefulBits holds the produced bits that are meaningful for the current
+  // definition, so a user cannot make a bit meaningful at this point.
+  UsefulBits &= UnionOfUses;
+}
+
+/// Build a UBFM machine node that shifts Op left by ShlAmount bits. A
+/// negative ShlAmount produces a logical right shift by -ShlAmount instead,
+/// and a ShlAmount of zero returns Op itself without creating any node.
+static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
+  if (ShlAmount == 0)
+    return Op;
+
+  const EVT VT = Op.getValueType();
+  const unsigned BitWidth = VT.getSizeInBits();
+  SDLoc dl(Op);
+
+  // Work out the two UBFM immediates for the requested shift.
+  uint64_t FirstImm, SecondImm;
+  if (ShlAmount > 0) {
+    // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
+    FirstImm = BitWidth - ShlAmount;
+    SecondImm = BitWidth - 1 - ShlAmount;
+  } else {
+    // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
+    assert(ShlAmount < 0 && "expected right shift");
+    const int ShrAmount = -ShlAmount;
+    FirstImm = ShrAmount;
+    SecondImm = BitWidth - 1;
+  }
+
+  const unsigned UBFMOpc =
+      BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+  SDNode *ShiftNode = CurDAG->getMachineNode(
+      UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(FirstImm, dl, VT),
+      CurDAG->getTargetConstant(SecondImm, dl, VT));
+
+  return SDValue(ShiftNode, 0);
+}
+
+/// Check whether Op looks like an attempt to move a bitfield into position,
+/// essentially "(and (shl VAL, N), Mask)" where the AND is optional.
+///
+/// On success, Src receives the value to insert (possibly wrapped in an extra
+/// shift), ShiftAmount the position of the lowest possibly-set bit and
+/// MaskWidth the length of the run of possibly-set bits. ShiftAmount and
+/// MaskWidth may already have been written when false is returned.
+static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+                                    bool BiggerPattern,
+                                    SDValue &Src, int &ShiftAmount,
+                                    int &MaskWidth) {
+  const unsigned BitWidth = Op.getValueType().getSizeInBits();
+  (void)BitWidth;
+  assert(BitWidth == 32 || BitWidth == 64);
+
+  // Query the known bits of the whole expression first, i.e. before any
+  // outer AND is peeled off below.
+  APInt KnownZero, KnownOne;
+  CurDAG->computeKnownBits(Op, KnownZero, KnownOne);
+
+  // "Possibly set" in the sense that the bits are not provably zero, which is
+  // the key point if we want to use this value.
+  const uint64_t PossiblySetBits = (~KnownZero).getZExtValue();
+
+  // Discard a constant AND mask if present. It's safe because the node will
+  // already have been factored into the computeKnownBits calculation above.
+  uint64_t AndImm;
+  if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
+    assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
+    Op = Op.getOperand(0);
+  }
+
+  // Don't match if the SHL has more than one use, since then we'll end up
+  // generating SHL+UBFIZ instead of just keeping SHL+AND.
+  if (!(BiggerPattern || Op.hasOneUse()))
+    return false;
+
+  uint64_t ShlImm;
+  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
+    return false;
+
+  if (!isShiftedMask_64(PossiblySetBits))
+    return false;
+
+  ShiftAmount = countTrailingZeros(PossiblySetBits);
+  MaskWidth = countTrailingOnes(PossiblySetBits >> ShiftAmount);
+
+  // BFI encompasses sufficiently many nodes that it's worth inserting an extra
+  // LSL/LSR if the mask doesn't quite match up with the ISD::SHL amount.
+  // BiggerPattern is true when this pattern is being matched for BFI and
+  // false when it is being matched for UBFIZ, in which case it is not
+  // profitable to insert an extra shift.
+  const uint64_t ExtraShift = ShlImm - ShiftAmount;
+  if (ExtraShift != 0 && !BiggerPattern)
+    return false;
+
+  // Op is the SHL at this point; its operand 0 is the value being shifted.
+  Src = getLeftShift(CurDAG, Op.getOperand(0), ExtraShift);
+  return true;
+}
+
+// Width-aware wrapper: dispatch to isShiftedMask_32 for i32 and to
+// isShiftedMask_64 for i64. Any other type trips the assertion.
+static bool isShiftedMask(uint64_t Mask, EVT VT) {
+  assert(VT == MVT::i32 || VT == MVT::i64);
+  return VT == MVT::i32 ? isShiftedMask_32(Mask) : isShiftedMask_64(Mask);
+}
+
+// Turn 'or (and X, MaskImm), OrImm' into a BFI/BFXIL that inserts a
+// materialized constant into X, provided the value being inserted only sets
+// bits that are known to be zero in the AND. Returns true (after morphing N
+// in place) when the transformation was applied, false otherwise.
+static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
+  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
+
+  const EVT VT = N->getValueType(0);
+  const bool Is32Bit = VT == MVT::i32;
+  if (!Is32Bit && VT != MVT::i64)
+    return false;
+
+  const unsigned BitWidth = VT.getSizeInBits();
+
+  uint64_t OrImm;
+  if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
+    return false;
+
+  // Skip this transformation if the ORR immediate can be encoded in the ORR.
+  // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely
+  // performance neutral.
+  if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
+    return false;
+
+  // Operand 0 must be a single-use AND with an immediate operand.
+  SDValue And = N->getOperand(0);
+  uint64_t MaskImm;
+  const bool IsSingleUseAndImm =
+      And.hasOneUse() &&
+      isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm);
+  if (!IsSingleUseAndImm)
+    return false;
+
+  // Compute the Known Zero for the AND as this allows us to catch more general
+  // cases than just looking for AND with imm.
+  APInt KnownZero, KnownOne;
+  CurDAG->computeKnownBits(And, KnownZero, KnownOne);
+
+  // Non-zero in the sense that they're not provably zero, which is the key
+  // point if we want to use this value.
+  const uint64_t NotKnownZero = (~KnownZero).getZExtValue();
+
+  // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
+  if (!isShiftedMask(KnownZero.getZExtValue(), VT))
+    return false;
+
+  // The bits being inserted must only set those bits that are known to be
+  // zero.
+  // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
+  // currently handle this case.
+  if ((OrImm & NotKnownZero) != 0)
+    return false;
+
+  // BFI/BFXIL dst, src, #lsb, #width.
+  const int LSB = countTrailingOnes(NotKnownZero);
+  const int Width =
+      BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
+
+  // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
+  const unsigned ImmR = (BitWidth - LSB) % BitWidth;
+  const unsigned ImmS = Width - 1;
+
+  // If we're creating a BFI instruction avoid cases where we need more
+  // instructions to materialize the BFI constant as compared to the original
+  // ORR. A BFXIL will use the same constant as the original ORR, so the code
+  // should be no worse in this case.
+  const uint64_t BFIImm = OrImm >> LSB;
+  const bool IsBFI = LSB != 0;
+  if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
+    // We have a BFI instruction and we know the constant can't be materialized
+    // with a ORR-immediate with the zero register. Compare how many non-zero
+    // 16-bit chunks each constant has.
+    const auto CountNonZeroChunks = [BitWidth](uint64_t Value) {
+      unsigned Chunks = 0;
+      for (unsigned Shift = 0; Shift < BitWidth; Shift += 16)
+        if (((Value >> Shift) & 0xFFFF) != 0)
+          ++Chunks;
+      return Chunks;
+    };
+    if (CountNonZeroChunks(BFIImm) > CountNonZeroChunks(OrImm))
+      return false;
+  }
+
+  // Materialize the constant to be inserted.
+  SDLoc DL(N);
+  const unsigned MOVIOpc = Is32Bit ? AArch64::MOVi32imm : AArch64::MOVi64imm;
+  SDNode *MOVI = CurDAG->getMachineNode(
+      MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
+
+  // Create the BFI/BFXIL instruction.
+  SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
+                   CurDAG->getTargetConstant(ImmR, DL, VT),
+                   CurDAG->getTargetConstant(ImmS, DL, VT)};
+  const unsigned Opc = Is32Bit ? AArch64::BFMWri : AArch64::BFMXri;
+  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+  return true;
+}
+
+/// Try to select the ISD::OR node N as a single BFM (BFI/BFXIL) instruction.
+///
+/// UsefulBits describes which bits of N's result its users consume; its
+/// trailing and leading zero counts are passed to the matchers so that masks
+/// reshaped by simplify-demanded-bits are still recognised. Returns true
+/// after morphing N in place with SelectNodeTo, or false when no pattern
+/// matched.
+static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
+                                      SelectionDAG *CurDAG) {
+  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return false;
+
+  unsigned BitWidth = VT.getSizeInBits();
+
+  // Because of simplify-demanded-bits in DAGCombine, involved masks may not
+  // have the expected shape. Try to undo that.
+
+  unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
+  unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
+
+  // Given a OR operation, check if we have the following pattern
+  // ubfm c, b, imm, imm2 (or something that does the same jobs, see
+  //                       isBitfieldExtractOp)
+  // d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
+  //                 countTrailingZeros(mask2) == imm2 - imm + 1
+  // f = d | c
+  // if yes, replace the OR instruction with:
+  // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
+
+  // OR is commutative, check all combinations of operand order and values of
+  // BiggerPattern, i.e.
+  //     Opd0, Opd1, BiggerPattern=false
+  //     Opd1, Opd0, BiggerPattern=false
+  //     Opd0, Opd1, BiggerPattern=true
+  //     Opd1, Opd0, BiggerPattern=true
+  // Several of these combinations may match, so check with BiggerPattern=false
+  // first since that will produce better results by matching more instructions
+  // and/or inserting fewer extra instructions.
+  for (int I = 0; I < 4; ++I) {
+
+    SDValue Dst, Src;
+    unsigned ImmR, ImmS;
+    bool BiggerPattern = I / 2;
+    SDValue OrOpd0Val = N->getOperand(I % 2);
+    SDNode *OrOpd0 = OrOpd0Val.getNode();
+    SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
+    SDNode *OrOpd1 = OrOpd1Val.getNode();
+
+    unsigned BFXOpc;
+    int DstLSB, Width;
+    if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
+                            NumberOfIgnoredLowBits, BiggerPattern)) {
+      // Check that the returned opcode is compatible with the pattern,
+      // i.e., same type and zero extended (U and not S)
+      if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
+          (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
+        continue;
+
+      // Compute the width of the bitfield insertion
+      DstLSB = 0;
+      Width = ImmS - ImmR + 1;
+      // FIXME: This constraint is to catch bitfield insertion we may
+      // want to widen the pattern if we want to grab general bitfield
+      // move case
+      if (Width <= 0)
+        continue;
+
+      // If the mask on the insertee is correct, we have a BFXIL operation. We
+      // can share the ImmR and ImmS values from the already-computed UBFM.
+    } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
+                                       BiggerPattern,
+                                       Src, DstLSB, Width)) {
+      ImmR = (BitWidth - DstLSB) % BitWidth;
+      ImmS = Width - 1;
+    } else
+      continue;
+
+    // Check the second part of the pattern. Look at the type of the operand
+    // value itself rather than the type of result #0 of its defining node:
+    // the two differ when the operand is a non-zero result of that node. The
+    // operands of an OR have the same type as its result, so the
+    // function-level VT is used from here on instead of a shadowing local.
+    assert(OrOpd1Val.getValueType() == VT && "unexpected OR operand");
+
+    // Compute the Known Zero for the candidate of the first operand.
+    // This allows to catch more general case than just looking for
+    // AND with imm. Indeed, simplify-demanded-bits may have removed
+    // the AND instruction because it proves it was useless.
+    APInt KnownZero, KnownOne;
+    CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne);
+
+    // Check if there is enough room for the second operand to appear
+    // in the first one
+    APInt BitsToBeInserted =
+        APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
+
+    if ((BitsToBeInserted & ~KnownZero) != 0)
+      continue;
+
+    // Set the first operand
+    uint64_t Imm;
+    if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
+        isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
+      // In that case, we can eliminate the AND
+      Dst = OrOpd1->getOperand(0);
+    else
+      // Maybe the AND has been removed by simplify-demanded-bits
+      // or is useful because it discards more bits
+      Dst = OrOpd1Val;
+
+    // both parts match
+    SDLoc DL(N);
+    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
+                     CurDAG->getTargetConstant(ImmS, DL, VT)};
+    unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+    CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+    return true;
+  }
+
+  // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
+  // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
+  // mask (e.g., 0x000ffff0).
+  uint64_t Mask0Imm, Mask1Imm;
+  SDValue And0 = N->getOperand(0);
+  SDValue And1 = N->getOperand(1);
+  if (And0.hasOneUse() && And1.hasOneUse() &&
+      isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
+      isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
+      APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
+      (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
+
+    // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
+    // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
+    // bits to be inserted.
+    if (isShiftedMask(Mask0Imm, VT)) {
+      std::swap(And0, And1);
+      std::swap(Mask0Imm, Mask1Imm);
+    }
+
+    SDValue Src = And1->getOperand(0);
+    SDValue Dst = And0->getOperand(0);
+    unsigned LSB = countTrailingZeros(Mask1Imm);
+    int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
+
+    // The BFXIL inserts the low-order bits from a source register, so right
+    // shift the needed bits into place.
+    SDLoc DL(N);
+    unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+    SDNode *LSR = CurDAG->getMachineNode(
+        ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
+        CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
+
+    // BFXIL is an alias of BFM, so translate to BFM operands.
+    unsigned ImmR = (BitWidth - LSB) % BitWidth;
+    unsigned ImmS = Width - 1;
+
+    // Create the BFXIL instruction.
+    SDValue Ops[] = {Dst, SDValue(LSR, 0),
+                     CurDAG->getTargetConstant(ImmR, DL, VT),
+                     CurDAG->getTargetConstant(ImmS, DL, VT)};
+    unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+    CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+    return true;
+  }
+
+  return false;
+}
+
+/// Entry point for selecting an ISD::OR node as a bitfield insert. Returns
+/// true when N has been replaced, false when N is not an OR or no pattern
+/// applied.
+bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
+  if (N->getOpcode() != ISD::OR)
+    return false;
+
+  APInt NUsefulBits;
+  getUsefulBits(SDValue(N, 0), NUsefulBits);
+
+  // If none of the bits are useful, just return UNDEF.
+  if (!NUsefulBits) {
+    CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
+    return true;
+  }
+
+  // Prefer the general OR-based matcher; fall back to the OR-with-immediate
+  // form only when the former did not match.
+  return tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG) ||
+         tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
+}
+
+/// tryBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
+/// equivalent of a left shift by a constant amount followed by an AND masking
+/// out a contiguous set of bits. Returns true after morphing N into a UBFM,
+/// false if N is not a matching i32/i64 ISD::AND.
+bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
+  if (N->getOpcode() != ISD::AND)
+    return false;
+
+  const EVT VT = N->getValueType(0);
+  const bool Is32Bit = VT == MVT::i32;
+  if (!Is32Bit && VT != MVT::i64)
+    return false;
+
+  SDValue ShiftedSrc;
+  int FieldLSB, FieldWidth;
+  if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
+                               ShiftedSrc, FieldLSB, FieldWidth))
+    return false;
+
+  const unsigned RegSize = VT.getSizeInBits();
+  // ImmR is the rotate right amount.
+  const unsigned ImmR = (RegSize - FieldLSB) % RegSize;
+  // ImmS is the most significant bit of the source to be moved.
+  const unsigned ImmS = FieldWidth - 1;
+
+  SDLoc DL(N);
+  SDValue Ops[] = {ShiftedSrc, CurDAG->getTargetConstant(ImmR, DL, VT),
+                   CurDAG->getTargetConstant(ImmS, DL, VT)};
+  CurDAG->SelectNodeTo(N, Is32Bit ? AArch64::UBFMWri : AArch64::UBFMXri, VT,
+                       Ops);
+  return true;
+}
+
+/// Match N as the power-of-two multiplier 2^fbits of a fixed-point
+/// conversion. N must be either a floating-point constant or a load from a
+/// constant-pool entry addressed through AArch64ISD::ADDlow. On success
+/// FixedPos is set to an i32 target constant holding fbits (1..RegWidth) and
+/// true is returned; otherwise false is returned and FixedPos is untouched.
+bool
+AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
+                                              unsigned RegWidth) {
+  APFloat FVal(0.0);
+  if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
+    FVal = CN->getValueAPF();
+  else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
+    // Some otherwise illegal constants are allowed in this case.
+    if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow)
+      return false;
+
+    // A single checked dyn_cast replaces the previous isa<> test followed by
+    // an unchecked dyn_cast<> dereference.
+    const auto *CP =
+        dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
+    if (!CP || CP->isMachineConstantPoolEntry())
+      return false;
+
+    // Bail out instead of asserting if the pool entry is not a scalar FP
+    // constant.
+    const auto *CFP = dyn_cast<ConstantFP>(CP->getConstVal());
+    if (!CFP)
+      return false;
+    FVal = CFP->getValueAPF();
+  } else
+    return false;
+
+  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
+  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
+  // x-register.
+  //
+  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
+  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
+  // integers.
+  bool IsExact;
+
+  // fbits is between 1 and 64 in the worst-case, which means the fmul
+  // could have 2^64 as an actual operand. Need 65 bits of precision.
+  APSInt IntVal(65, true);
+  FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+
+  // N.b. isPowerOf2 also checks for > 0.
+  if (!IsExact || !IntVal.isPowerOf2()) return false;
+  unsigned FBits = IntVal.logBase2();
+
+  // Checks above should have guaranteed that we haven't lost information in
+  // finding FBits, but it must still be in range.
+  if (FBits == 0 || FBits > RegWidth) return false;
+
+  FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
+  return true;
+}
+
+// Inspects a register string of the form o0:op1:CRn:CRm:op2, extracts the
+// five decimal integer fields and combines them into the single value used
+// as the MRS/MSR instruction operand.
+//
+// Returns -1 when the string contains no ':' separator (so the caller can
+// try it as a register name instead). Malformed encodings (wrong number of
+// fields, or a non-integer field) trip an assertion in asserts-enabled
+// builds and also yield -1 otherwise, instead of reading uninitialized or
+// out-of-range data.
+static int getIntOperandFromRegisterString(StringRef RegString) {
+  SmallVector<StringRef, 5> Fields;
+  RegString.split(Fields, ':');
+
+  if (Fields.size() == 1)
+    return -1;
+
+  assert(Fields.size() == 5
+            && "Invalid number of fields in read register string");
+  // Ops[0..4] are indexed unconditionally below, so never continue with a
+  // different field count once the assertion is compiled out.
+  if (Fields.size() != 5)
+    return -1;
+
+  SmallVector<int, 5> Ops;
+  bool AllIntFields = true;
+
+  for (StringRef Field : Fields) {
+    // Zero-initialize so a failed parse never pushes an indeterminate value.
+    unsigned IntField = 0;
+    AllIntFields &= !Field.getAsInteger(10, IntField);
+    Ops.push_back(IntField);
+  }
+
+  assert(AllIntFields &&
+          "Unexpected non-integer value in special register string.");
+  if (!AllIntFields)
+    return -1;
+
+  // Need to combine the integer fields of the string into a single value
+  // based on the bit encoding of MRS/MSR instruction.
+  return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
+         (Ops[3] << 3) | (Ops[4]);
+}
+
+// Lower the read_register intrinsic to an MRS instruction node if the special
+// register string argument is either of the form detailed in the ACLE (the
+// form described in getIntOperandFromRegisterString) or is a named register
+// known by the MRS SysReg mapper. Returns true if N was replaced.
+bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
+  // Both casts are dereferenced unconditionally, so use cast<> (which asserts
+  // on a type mismatch) rather than dyn_cast<> (which would yield null).
+  const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
+  const auto *RegString = cast<MDString>(MD->getMD()->getOperand(0));
+  SDLoc DL(N);
+
+  int Reg = getIntOperandFromRegisterString(RegString->getString());
+  if (Reg == -1) {
+    // Use the sysreg mapper to map the remaining possible strings to the
+    // value for the register to be used for the instruction operand.
+    auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
+    if (TheReg && TheReg->Readable &&
+        TheReg->haveFeatures(Subtarget->getFeatureBits()))
+      Reg = TheReg->Encoding;
+    else
+      Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
+  }
+
+  if (Reg == -1)
+    return false;
+
+  ReplaceNode(N, CurDAG->getMachineNode(
+                     AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
+                     CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                     N->getOperand(0)));
+  return true;
+}
+
+// Lower the write_register intrinsic to an MSR instruction node if the special
+// register string argument is either of the form detailed in the ACLE (the
+// form described in getIntOperandFromRegisterString) or is a named register
+// known by the MSR SysReg mapper. Returns true if N was replaced.
+bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
+  // Both casts are dereferenced unconditionally, so use cast<> (which asserts
+  // on a type mismatch) rather than dyn_cast<> (which would yield null).
+  const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
+  const auto *RegString = cast<MDString>(MD->getMD()->getOperand(0));
+  SDLoc DL(N);
+
+  int Reg = getIntOperandFromRegisterString(RegString->getString());
+  if (Reg != -1) {
+    ReplaceNode(
+        N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(2), N->getOperand(0)));
+    return true;
+  }
+
+  // Check if the register was one of those allowed as the pstatefield value in
+  // the MSR (immediate) instruction. To accept the values allowed in the
+  // pstatefield for the MSR (immediate) instruction, we also require that an
+  // immediate value has been provided as an argument, we know that this is
+  // the case as it has been ensured by semantic checking.
+  auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
+  if (PMapper) {
+    assert (isa<ConstantSDNode>(N->getOperand(2))
+              && "Expected a constant integer expression.");
+    // Named distinctly so it does not shadow the function-level Reg.
+    unsigned PStateField = PMapper->Encoding;
+    uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned State;
+    if (PStateField == AArch64PState::PAN ||
+        PStateField == AArch64PState::UAO) {
+      assert(Immed < 2 && "Bad imm");
+      State = AArch64::MSRpstateImm1;
+    } else {
+      assert(Immed < 16 && "Bad imm");
+      State = AArch64::MSRpstateImm4;
+    }
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       State, DL, MVT::Other,
+                       CurDAG->getTargetConstant(PStateField, DL, MVT::i32),
+                       CurDAG->getTargetConstant(Immed, DL, MVT::i16),
+                       N->getOperand(0)));
+    return true;
+  }
+
+  // Use the sysreg mapper to attempt to map the remaining possible strings
+  // to the value for the register to be used for the MSR (register)
+  // instruction operand.
+  auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
+  if (TheReg && TheReg->Writeable &&
+      TheReg->haveFeatures(Subtarget->getFeatureBits()))
+    Reg = TheReg->Encoding;
+  else
+    Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
+  if (Reg != -1) {
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       AArch64::MSR, DL, MVT::Other,
+                       CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                       N->getOperand(2), N->getOperand(0)));
+    return true;
+  }
+
+  return false;
+}
+
+/// Select a compare-and-swap node as one of the CMP_SWAP_{8,16,32,64}
+/// pseudo-instructions, chosen by the node's memory type. Result 0 of N maps
+/// to result 0 of the pseudo and result 1 of N (the chain) maps to result 2;
+/// N is removed afterwards.
+void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+  auto *AtomicN = cast<MemSDNode>(N);
+  const EVT MemTy = AtomicN->getMemoryVT();
+
+  unsigned Opcode;
+  if (MemTy == MVT::i64)
+    Opcode = AArch64::CMP_SWAP_64;
+  else if (MemTy == MVT::i32)
+    Opcode = AArch64::CMP_SWAP_32;
+  else if (MemTy == MVT::i16)
+    Opcode = AArch64::CMP_SWAP_16;
+  else if (MemTy == MVT::i8)
+    Opcode = AArch64::CMP_SWAP_8;
+  else
+    llvm_unreachable("Unknown AtomicCmpSwap type");
+
+  // Only the 64-bit variant produces an i64 value; all narrower ones use i32.
+  const MVT RegTy = Opcode == AArch64::CMP_SWAP_64 ? MVT::i64 : MVT::i32;
+
+  // The chain (operand 0 of N) goes last on the machine node.
+  SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
+                   N->getOperand(0)};
+  SDNode *CmpSwap = CurDAG->getMachineNode(
+      Opcode, SDLoc(N), CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
+
+  // Carry the memory operand over to the new node.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = AtomicN->getMemOperand();
+  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
+  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::Select(SDNode *Node) {
+ // Dump information about the Node being selected
+ DEBUG(errs() << "Selecting: ");
+ DEBUG(Node->dump(CurDAG));
+ DEBUG(errs() << "\n");
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ Node->setNodeId(-1);
+ return;
+ }
+
+ // Few custom selection stuff.
+ EVT VT = Node->getValueType(0);
+
+ switch (Node->getOpcode()) {
+ default:
+ break;
+
+ case ISD::ATOMIC_CMP_SWAP:
+ SelectCMP_SWAP(Node);
+ return;
+
+ case ISD::READ_REGISTER:
+ if (tryReadRegister(Node))
+ return;
+ break;
+
+ case ISD::WRITE_REGISTER:
+ if (tryWriteRegister(Node))
+ return;
+ break;
+
+ case ISD::ADD:
+ if (tryMLAV64LaneV128(Node))
+ return;
+ break;
+
+ case ISD::LOAD: {
+ // Try to select as an indexed load. Fall through to normal processing
+ // if we can't.
+ if (tryIndexedLoad(Node))
+ return;
+ break;
+ }
+
+ case ISD::SRL:
+ case ISD::AND:
+ case ISD::SRA:
+ case ISD::SIGN_EXTEND_INREG:
+ if (tryBitfieldExtractOp(Node))
+ return;
+ if (tryBitfieldInsertInZeroOp(Node))
+ return;
+ break;
+
+ case ISD::SIGN_EXTEND:
+ if (tryBitfieldExtractOpFromSExt(Node))
+ return;
+ break;
+
+ case ISD::OR:
+ if (tryBitfieldInsertOp(Node))
+ return;
+ break;
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ // Extracting lane zero is a special case where we can just use a plain
+ // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
+ // the rest of the compiler, especially the register allocator and copy
+ // propagation, to reason about, so is preferred when it's possible to
+ // use it.
+ ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
+ // Bail and use the default Select() for non-zero lanes.
+ if (LaneNode->getZExtValue() != 0)
+ break;
+ // If the element type is not the same as the result type, likewise
+ // bail and use the default Select(), as there's more to do than just
+ // a cross-class COPY. This catches extracts of i8 and i16 elements
+ // since they will need an explicit zext.
+ if (VT != Node->getOperand(0).getValueType().getVectorElementType())
+ break;
+ unsigned SubReg;
+ switch (Node->getOperand(0)
+ .getValueType()
+ .getVectorElementType()
+ .getSizeInBits()) {
+ default:
+ llvm_unreachable("Unexpected vector element type!");
+ case 64:
+ SubReg = AArch64::dsub;
+ break;
+ case 32:
+ SubReg = AArch64::ssub;
+ break;
+ case 16:
+ SubReg = AArch64::hsub;
+ break;
+ case 8:
+ llvm_unreachable("unexpected zext-requiring extract element!");
+ }
+ SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
+ Node->getOperand(0));
+ DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
+ DEBUG(Extract->dumpr(CurDAG));
+ DEBUG(dbgs() << "\n");
+ ReplaceNode(Node, Extract.getNode());
+ return;
+ }
+ case ISD::Constant: {
+ // Materialize zero constants as copies from WZR/XZR. This allows
+ // the coalescer to propagate these into other instructions.
+ ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+ if (ConstNode->isNullValue()) {
+ if (VT == MVT::i32) {
+ SDValue New = CurDAG->getCopyFromReg(
+ CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
+ ReplaceNode(Node, New.getNode());
+ return;
+ } else if (VT == MVT::i64) {
+ SDValue New = CurDAG->getCopyFromReg(
+ CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
+ ReplaceNode(Node, New.getNode());
+ return;
+ }
+ }
+ break;
+ }
+
+ case ISD::FrameIndex: {
+ // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+ unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+ const TargetLowering *TLI = getTargetLowering();
+ SDValue TFI = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ SDLoc DL(Node);
+ SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
+ CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
+ CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
+ return;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::aarch64_ldaxp:
+ case Intrinsic::aarch64_ldxp: {
+ unsigned Op =
+ IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
+ SDValue MemAddr = Node->getOperand(2);
+ SDLoc DL(Node);
+ SDValue Chain = Node->getOperand(0);
+
+ SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
+ MVT::Other, MemAddr, Chain);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+ cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+ ReplaceNode(Node, Ld);
+ return;
+ }
+ case Intrinsic::aarch64_stlxp:
+ case Intrinsic::aarch64_stxp: {
+ unsigned Op =
+ IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
+ SDLoc DL(Node);
+ SDValue Chain = Node->getOperand(0);
+ SDValue ValLo = Node->getOperand(2);
+ SDValue ValHi = Node->getOperand(3);
+ SDValue MemAddr = Node->getOperand(4);
+
+ // Place arguments in the right order.
+ SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
+
+ SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceNode(Node, St);
+ return;
+ }
+ case Intrinsic::aarch64_neon_ld1x2:
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld1x3:
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld1x4:
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld2:
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld3:
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld4:
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld2r:
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld3r:
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld4r:
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld2lane:
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectLoadLane(Node, 2, AArch64::LD2i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectLoadLane(Node, 2, AArch64::LD2i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectLoadLane(Node, 2, AArch64::LD2i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectLoadLane(Node, 2, AArch64::LD2i64);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld3lane:
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectLoadLane(Node, 3, AArch64::LD3i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectLoadLane(Node, 3, AArch64::LD3i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectLoadLane(Node, 3, AArch64::LD3i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectLoadLane(Node, 3, AArch64::LD3i64);
+ return;
+ }
+ break;
+ case Intrinsic::aarch64_neon_ld4lane:
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectLoadLane(Node, 4, AArch64::LD4i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectLoadLane(Node, 4, AArch64::LD4i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectLoadLane(Node, 4, AArch64::LD4i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectLoadLane(Node, 4, AArch64::LD4i64);
+ return;
+ }
+ break;
+ }
+ } break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_tbl2:
+ SelectTable(Node, 2,
+ VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
+ false);
+ return;
+ case Intrinsic::aarch64_neon_tbl3:
+ SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
+ : AArch64::TBLv16i8Three,
+ false);
+ return;
+ case Intrinsic::aarch64_neon_tbl4:
+ SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
+ : AArch64::TBLv16i8Four,
+ false);
+ return;
+ case Intrinsic::aarch64_neon_tbx2:
+ SelectTable(Node, 2,
+ VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
+ true);
+ return;
+ case Intrinsic::aarch64_neon_tbx3:
+ SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
+ : AArch64::TBXv16i8Three,
+ true);
+ return;
+ case Intrinsic::aarch64_neon_tbx4:
+ SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
+ : AArch64::TBXv16i8Four,
+ true);
+ return;
+ case Intrinsic::aarch64_neon_smull:
+ case Intrinsic::aarch64_neon_umull:
+ if (tryMULLV64LaneV128(IntNo, Node))
+ return;
+ break;
+ }
+ break;
+ }
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ if (Node->getNumOperands() >= 3)
+ VT = Node->getOperand(2)->getValueType(0);
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_st1x2: {
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 2, AArch64::ST1Twov8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 2, AArch64::ST1Twov16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 2, AArch64::ST1Twov4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 2, AArch64::ST1Twov8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 2, AArch64::ST1Twov2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 2, AArch64::ST1Twov4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 2, AArch64::ST1Twov2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 2, AArch64::ST1Twov1d);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_neon_st1x3: {
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 3, AArch64::ST1Threev8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 3, AArch64::ST1Threev16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 3, AArch64::ST1Threev4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 3, AArch64::ST1Threev8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 3, AArch64::ST1Threev2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 3, AArch64::ST1Threev4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 3, AArch64::ST1Threev2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 3, AArch64::ST1Threev1d);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_neon_st1x4: {
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 4, AArch64::ST1Fourv8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 4, AArch64::ST1Fourv16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 4, AArch64::ST1Fourv4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 4, AArch64::ST1Fourv8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 4, AArch64::ST1Fourv2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 4, AArch64::ST1Fourv4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 4, AArch64::ST1Fourv2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_neon_st2: {
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 2, AArch64::ST2Twov8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 2, AArch64::ST2Twov16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 2, AArch64::ST2Twov4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 2, AArch64::ST2Twov8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 2, AArch64::ST2Twov2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 2, AArch64::ST2Twov4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 2, AArch64::ST2Twov2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 2, AArch64::ST1Twov1d);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_neon_st3: {
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 3, AArch64::ST3Threev8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 3, AArch64::ST3Threev16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 3, AArch64::ST3Threev4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 3, AArch64::ST3Threev8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 3, AArch64::ST3Threev2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 3, AArch64::ST3Threev4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 3, AArch64::ST3Threev2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 3, AArch64::ST1Threev1d);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_neon_st4: {
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 4, AArch64::ST4Fourv8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 4, AArch64::ST4Fourv16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 4, AArch64::ST4Fourv4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 4, AArch64::ST4Fourv8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 4, AArch64::ST4Fourv2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 4, AArch64::ST4Fourv4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 4, AArch64::ST4Fourv2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_neon_st2lane: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectStoreLane(Node, 2, AArch64::ST2i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectStoreLane(Node, 2, AArch64::ST2i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectStoreLane(Node, 2, AArch64::ST2i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectStoreLane(Node, 2, AArch64::ST2i64);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_neon_st3lane: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectStoreLane(Node, 3, AArch64::ST3i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectStoreLane(Node, 3, AArch64::ST3i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectStoreLane(Node, 3, AArch64::ST3i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectStoreLane(Node, 3, AArch64::ST3i64);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_neon_st4lane: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectStoreLane(Node, 4, AArch64::ST4i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectStoreLane(Node, 4, AArch64::ST4i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectStoreLane(Node, 4, AArch64::ST4i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectStoreLane(Node, 4, AArch64::ST4i64);
+ return;
+ }
+ break;
+ }
+ }
+ break;
+ }
+ case AArch64ISD::LD2post: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD3post: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD4post: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD1x2post: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD1x3post: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD1x4post: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD1DUPpost: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD2DUPpost: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD3DUPpost: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD4DUPpost: {
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD1LANEpost: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD2LANEpost: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD3LANEpost: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::LD4LANEpost: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::ST2post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::ST3post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::ST4post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::ST1x2post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::ST1x3post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::ST1x4post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::ST2LANEpost: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::ST3LANEpost: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::ST4LANEpost: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+ return;
+ }
+ break;
+ }
+ }
+
+ // Select the default instruction
+ SelectCode(Node);
+}
+
+/// createAArch64ISelDag - Create the AArch64DAGToDAGISel pass, which converts a
+/// legalized DAG to an AArch64-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new AArch64DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
new file mode 100644
index 000000000000..4c98253878e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -0,0 +1,10666 @@
+//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ISelLowering.h"
+#include "AArch64CallingConvention.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64PerfectShuffle.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "AArch64TargetObjectFile.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+
+static cl::opt<bool>
+EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
+ cl::desc("Allow AArch64 SLI/SRI formation"),
+ cl::init(false)); // Hidden, file-local flag; off by default.
+
+// FIXME: The necessary dtprel relocations don't seem to be supported
+// well in the GNU bfd and gold linkers at the moment. Therefore, by
+// default, for now, fall back to GeneralDynamic code generation.
+cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
+ "aarch64-elf-ldtls-generation", cl::Hidden,
+ cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
+ cl::init(false)); // Hidden flag with external linkage; off by default.
+
+/// Value type used for condition codes (i32).
+static const MVT MVT_CC = MVT::i32;
+
+AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
+ const AArch64Subtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
+
+ // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
+ // we have to make something up. Arbitrarily, choose ZeroOrOne.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ // When comparing vectors the result sets the different elements in the
+ // vector to all-one or all-zero.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
+ addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
+
+ if (Subtarget->hasFPARMv8()) {
+ addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+ addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
+ addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
+ addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+ }
+
+ if (Subtarget->hasNEON()) {
+ addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
+ addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
+ // Someone set us up the NEON.
+ addDRTypeForNEON(MVT::v2f32);
+ addDRTypeForNEON(MVT::v8i8);
+ addDRTypeForNEON(MVT::v4i16);
+ addDRTypeForNEON(MVT::v2i32);
+ addDRTypeForNEON(MVT::v1i64);
+ addDRTypeForNEON(MVT::v1f64);
+ addDRTypeForNEON(MVT::v4f16);
+
+ addQRTypeForNEON(MVT::v4f32);
+ addQRTypeForNEON(MVT::v2f64);
+ addQRTypeForNEON(MVT::v16i8);
+ addQRTypeForNEON(MVT::v8i16);
+ addQRTypeForNEON(MVT::v4i32);
+ addQRTypeForNEON(MVT::v2i64);
+ addQRTypeForNEON(MVT::v8f16);
+ }
+
+ // Compute derived properties from the register classes
+ computeRegisterProperties(Subtarget->getRegisterInfo());
+
+ // Provide all sorts of operation actions
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f80, Expand);
+
+ // Custom lowering hooks are needed for XOR
+ // to fold it into CSINC/CSINV.
+ setOperationAction(ISD::XOR, MVT::i32, Custom);
+ setOperationAction(ISD::XOR, MVT::i64, Custom);
+
+ // Virtually no operation on f128 is legal, but LLVM can't expand them when
+ // there's a valid register class, so we need custom operations in most cases.
+ setOperationAction(ISD::FABS, MVT::f128, Expand);
+ setOperationAction(ISD::FADD, MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FDIV, MVT::f128, Custom);
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+ setOperationAction(ISD::FMUL, MVT::f128, Custom);
+ setOperationAction(ISD::FNEG, MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
+ setOperationAction(ISD::FRINT, MVT::f128, Expand);
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+ setOperationAction(ISD::FSUB, MVT::f128, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
+ setOperationAction(ISD::SETCC, MVT::f128, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+ setOperationAction(ISD::SELECT, MVT::f128, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+
+ // Lowering for many of the conversions is actually specified by the non-f128
+ // type. The LowerXXX function will be trivial when f128 isn't involved.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+
+ // Variable arguments.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+ // Variable-sized objects.
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+ // Constant pool entries
+ setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+
+ // BlockAddress
+ setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
+
+ // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
+ setOperationAction(ISD::ADDC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBC, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ setOperationAction(ISD::ADDC, MVT::i64, Custom);
+ setOperationAction(ISD::ADDE, MVT::i64, Custom);
+ setOperationAction(ISD::SUBC, MVT::i64, Custom);
+ setOperationAction(ISD::SUBE, MVT::i64, Custom);
+
+ // AArch64 lacks both left-rotate and popcount instructions.
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ }
+
+ // AArch64 doesn't have {U|S}MUL_LOHI.
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+
+ setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ }
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+ // Custom lower Add/Sub/Mul with overflow.
+ setOperationAction(ISD::SADDO, MVT::i32, Custom);
+ setOperationAction(ISD::SADDO, MVT::i64, Custom);
+ setOperationAction(ISD::UADDO, MVT::i32, Custom);
+ setOperationAction(ISD::UADDO, MVT::i64, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+ setOperationAction(ISD::USUBO, MVT::i32, Custom);
+ setOperationAction(ISD::USUBO, MVT::i64, Custom);
+ setOperationAction(ISD::SMULO, MVT::i32, Custom);
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+ setOperationAction(ISD::UMULO, MVT::i32, Custom);
+ setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // f16 is a storage-only type, always promote it to f32.
+ setOperationAction(ISD::SETCC, MVT::f16, Promote);
+ setOperationAction(ISD::BR_CC, MVT::f16, Promote);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+ setOperationAction(ISD::SELECT, MVT::f16, Promote);
+ setOperationAction(ISD::FADD, MVT::f16, Promote);
+ setOperationAction(ISD::FSUB, MVT::f16, Promote);
+ setOperationAction(ISD::FMUL, MVT::f16, Promote);
+ setOperationAction(ISD::FDIV, MVT::f16, Promote);
+ setOperationAction(ISD::FREM, MVT::f16, Promote);
+ setOperationAction(ISD::FMA, MVT::f16, Promote);
+ setOperationAction(ISD::FNEG, MVT::f16, Promote);
+ setOperationAction(ISD::FABS, MVT::f16, Promote);
+ setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
+ setOperationAction(ISD::FCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+ setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+ setOperationAction(ISD::FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::FPOWI, MVT::f16, Promote);
+ setOperationAction(ISD::FRINT, MVT::f16, Promote);
+ setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+ setOperationAction(ISD::FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
+
+ // v4f16 is also a storage-only type, so promote it to v4f32 when that is
+ // known to be safe.
+ setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+ setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+ setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+ setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+
+ // Expand all other v4f16 operations.
+ // FIXME: We could generate better code by promoting some operations to
+ // a pair of v4f32s
+ setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+ setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+ setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
+ setOperationAction(ISD::FREM, MVT::v4f16, Expand);
+ setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+ setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+ setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+ setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
+ setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
+
+
+ // v8f16 is also a storage-only type, so expand it.
+ setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+ setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+ setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+ setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+ setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+ setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
+ // AArch64 has implementations of a lot of rounding-like FP operations.
+ for (MVT Ty : {MVT::f32, MVT::f64}) {
+ setOperationAction(ISD::FFLOOR, Ty, Legal);
+ setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+ setOperationAction(ISD::FCEIL, Ty, Legal);
+ setOperationAction(ISD::FRINT, Ty, Legal);
+ setOperationAction(ISD::FTRUNC, Ty, Legal);
+ setOperationAction(ISD::FROUND, Ty, Legal);
+ setOperationAction(ISD::FMINNUM, Ty, Legal);
+ setOperationAction(ISD::FMAXNUM, Ty, Legal);
+ setOperationAction(ISD::FMINNAN, Ty, Legal);
+ setOperationAction(ISD::FMAXNAN, Ty, Legal);
+ }
+
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+
+ // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
+ // This requires the Performance Monitors extension.
+ if (Subtarget->hasPerfMon())
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+
+ if (Subtarget->isTargetMachO()) {
+ // For iOS, we don't want the normal expansion of a libcall to
+ // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+ // traffic.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ } else {
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ }
+
+ // Make floating-point constants legal for the large code model, so they don't
+ // become loads from the constant pool.
+ if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ }
+
+ // AArch64 does not have floating-point extending loads, i1 sign-extending
+ // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+ }
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
+
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+
+ // Indexed loads and stores are supported.
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, MVT::i8, Legal);
+ setIndexedLoadAction(im, MVT::i16, Legal);
+ setIndexedLoadAction(im, MVT::i32, Legal);
+ setIndexedLoadAction(im, MVT::i64, Legal);
+ setIndexedLoadAction(im, MVT::f64, Legal);
+ setIndexedLoadAction(im, MVT::f32, Legal);
+ setIndexedLoadAction(im, MVT::f16, Legal);
+ setIndexedStoreAction(im, MVT::i8, Legal);
+ setIndexedStoreAction(im, MVT::i16, Legal);
+ setIndexedStoreAction(im, MVT::i32, Legal);
+ setIndexedStoreAction(im, MVT::i64, Legal);
+ setIndexedStoreAction(im, MVT::f64, Legal);
+ setIndexedStoreAction(im, MVT::f32, Legal);
+ setIndexedStoreAction(im, MVT::f16, Legal);
+ }
+
+ // Trap.
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // We combine OR nodes for bitfield operations.
+ setTargetDAGCombine(ISD::OR);
+
+ // Vector add and sub nodes may conceal a high-half opportunity.
+ // Also, try to fold ADD into CSINC/CSINV.
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+
+ setTargetDAGCombine(ISD::FP_TO_SINT);
+ setTargetDAGCombine(ISD::FP_TO_UINT);
+ setTargetDAGCombine(ISD::FDIV);
+
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::CONCAT_VECTORS);
+ setTargetDAGCombine(ISD::STORE);
+ if (Subtarget->supportsAddressTopByteIgnored())
+ setTargetDAGCombine(ISD::LOAD);
+
+ setTargetDAGCombine(ISD::MUL);
+
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::VSELECT);
+
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+
+ setStackPointerRegisterToSaveRestore(AArch64::SP);
+
+ setSchedulingPreference(Sched::Hybrid);
+
+ // Enable TBZ/TBNZ
+ MaskAndBranchFoldingIsLegal = true;
+ EnableExtLdPromotion = true;
+
+ // Set required alignment.
+ setMinFunctionAlignment(2);
+ // Set preferred alignments.
+ setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
+ setPrefLoopAlignment(STI.getPrefLoopAlignment());
+
+ // Only change the limit for entries in a jump table if specified by
+ // the subtarget, but not at the command line.
+ unsigned MaxJT = STI.getMaximumJumpTableSize();
+ if (MaxJT && getMaximumJumpTableSize() == 0)
+ setMaximumJumpTableSize(MaxJT);
+
+ setHasExtractBitsInsn(true);
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ if (Subtarget->hasNEON()) {
+ // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
+ // silliness like this:
+ setOperationAction(ISD::FABS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FADD, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
+ setOperationAction(ISD::FMA, MVT::v1f64, Expand);
+ setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
+ setOperationAction(ISD::FREM, MVT::v1f64, Expand);
+ setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
+ setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
+ setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
+
+ setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+
+ // AArch64 doesn't have a direct vector ->f32 conversion instructions for
+ // elements smaller than i32, so promote the input to i32 first.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
+ // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
+ // -> v8f16 conversions.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote);
+ // Similarly, there is no direct i32 -> f64 vector conversion instruction.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
+ // Or, direct i32 -> f16 vector conversion. Set it so custom, so the
+ // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);
+
+ // AArch64 doesn't have MUL.2d:
+ setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+ // Custom handling for some quad-vector types to detect MULL.
+ setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ // Likewise, narrowing and extending vector loads/stores aren't handled
+ // directly.
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+ setOperationAction(ISD::BSWAP, VT, Expand);
+
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(VT, InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ }
+ }
+
+ // AArch64 has implementations of a lot of rounding-like FP operations.
+ for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
+ setOperationAction(ISD::FFLOOR, Ty, Legal);
+ setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+ setOperationAction(ISD::FCEIL, Ty, Legal);
+ setOperationAction(ISD::FRINT, Ty, Legal);
+ setOperationAction(ISD::FTRUNC, Ty, Legal);
+ setOperationAction(ISD::FROUND, Ty, Legal);
+ }
+ }
+
+ PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
+}
+
+void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { // Sets the legalization actions for NEON vector type VT. NOTE(review): PromotedBitwiseVT is never referenced in this body.
+ if (VT == MVT::v2f32 || VT == MVT::v4f16) { // LOAD/STORE of these FP vector types are promoted to v2i32.
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
+ } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { // LOAD/STORE of these FP vector types are promoted to v2i64.
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
+
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
+ }
+
+ // Mark vector float intrinsics as expand (only for v2f32, v4f32 and v2f64).
+ if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+
+ // But we do support custom-lowering for FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ }
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); // The following actions apply to every VT passed in.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::AND, VT, Custom);
+ setOperationAction(ISD::OR, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ for (MVT InnerVT : MVT::all_valuetypes()) // Expand EXTLOAD for every (InnerVT, VT) pairing.
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+
+ // CNT supports only B element sizes, so CTPOP is expanded for everything but v8i8/v16i8.
+ if (VT != MVT::v8i8 && VT != MVT::v16i8)
+ setOperationAction(ISD::CTPOP, VT, Expand);
+
+ setOperationAction(ISD::UDIV, VT, Expand); // Division and remainder are always expanded here.
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+
+ // [SU][MIN|MAX] are available for all NEON types apart from i64.
+ if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
+ for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+ setOperationAction(Opcode, VT, Legal);
+
+ // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
+ if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
+ for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
+ ISD::FMINNUM, ISD::FMAXNUM})
+ setOperationAction(Opcode, VT, Legal);
+
+ if (Subtarget->isLittleEndian()) { // Indexed loads/stores are only made Legal on little-endian subtargets.
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { // Covers every mode from PRE_INC up to (not including) LAST_INDEXED_MODE.
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
+ }
+ }
+}
+
+void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { // Registers VT in the FPR64 register class, then applies the shared NEON setup.
+ addRegisterClass(VT, &AArch64::FPR64RegClass);
+ addTypeForNEON(VT, MVT::v2i32); // NOTE(review): addTypeForNEON, as written in this file, does not read its second argument.
+}
+
+void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { // Registers VT in the FPR128 register class, then applies the shared NEON setup.
+ addRegisterClass(VT, &AArch64::FPR128RegClass);
+ addTypeForNEON(VT, MVT::v4i32); // NOTE(review): addTypeForNEON, as written in this file, does not read its second argument.
+}
+
+EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+ EVT VT) const { // Result type of a SETCC on operands of type VT; the DataLayout and LLVMContext arguments are unused.
+ if (!VT.isVector())
+ return MVT::i32; // Non-vector comparisons always produce an i32.
+ return VT.changeVectorElementTypeToInteger(); // Vector comparisons produce VT with its element type changed to integer.
+}
+
+/// computeKnownBitsForTargetNode - Determine which bits of Op are known to be
+/// either zero or one and return them in the KnownZero/KnownOne bitsets.
+/// Only AArch64ISD::CSEL and a few AArch64 intrinsics are handled; for any other opcode the bitsets are left untouched.
+void AArch64TargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ switch (Op.getOpcode()) {
+ default:
+ break;
+ case AArch64ISD::CSEL: {
+ APInt KnownZero2, KnownOne2;
+ DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+ DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+ KnownZero &= KnownZero2; // A bit stays known only if operands 0 and 1 agree on it.
+ KnownOne &= KnownOne2;
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); // For chained intrinsics the intrinsic ID is read from operand 1.
+ Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+ switch (IntID) {
+ default: return;
+ case Intrinsic::aarch64_ldaxr:
+ case Intrinsic::aarch64_ldxr: {
+ unsigned BitWidth = KnownOne.getBitWidth();
+ EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+ unsigned MemBits = VT.getScalarSizeInBits();
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); // Every bit above the memory type's scalar width is marked known zero.
+ return;
+ }
+ }
+ break;
+ }
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); // Here the intrinsic ID is read from operand 0.
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_umaxv:
+ case Intrinsic::aarch64_neon_uminv: {
+ // Figure out the datatype of the vector operand. The UMINV instruction
+ // will zero extend the result, so we can mark as known zero all the
+ // bits larger than the element datatype. 32-bit or larger doesn't need
+ // this as those are legal types and will be handled by isel directly.
+ MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
+ unsigned BitWidth = KnownZero.getBitWidth();
+ if (VT == MVT::v8i8 || VT == MVT::v16i8) { // 8-bit elements: bits 8 and up are known zero.
+ assert(BitWidth >= 8 && "Unexpected width!");
+ APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
+ KnownZero |= Mask;
+ } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { // 16-bit elements: bits 16 and up are known zero.
+ assert(BitWidth >= 16 && "Unexpected width!");
+ APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+ KnownZero |= Mask;
+ }
+ break;
+ } break; // NOTE(review): this second break is unreachable; the case block above already ends in a break.
+ }
+ }
+ }
+}
+
+MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
+ EVT) const { // Type used for scalar shift amounts; neither argument is consulted.
+ return MVT::i64; // Always i64, whatever the type being shifted.
+}
+
+bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
+ unsigned Align,
+ bool *Fast) const { // Returns false only when the subtarget requires strict alignment; otherwise true, and *Fast (if non-null) says whether the access is considered fast. AddrSpace is not consulted.
+ if (Subtarget->requiresStrictAlign())
+ return false; // *Fast is left unwritten on this path.
+
+ if (Fast) {
+ // Some CPUs are fine with unaligned stores except for 128-bit ones.
+ *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || // Fast unless this is a 16-byte access on a subtarget where those are slow ...
+ // See comments in performSTORECombine() for more details about
+ // these conditions.
+
+ // Code that uses clang vector extensions can mark that it
+ // wants unaligned accesses to be treated as fast by
+ // underspecifying alignment to be 1 or 2.
+ Align <= 2 ||
+
+ // Disregard v2i64. Memcpy lowering produces those and splitting
+ // them regresses performance on micro-benchmarks and olden/bh.
+ VT == MVT::v2i64;
+ }
+ return true;
+}
+
+FastISel *
+AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const { // Thin wrapper: forwards both arguments to AArch64::createFastISel.
+ return AArch64::createFastISel(funcInfo, libInfo);
+}
+
+const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { // Maps an AArch64ISD opcode to its printable "AArch64ISD::..." name; returns nullptr for FIRST_NUMBER or any value without a case below.
+ switch ((AArch64ISD::NodeType)Opcode) { // No default label: unmatched values fall out of the switch to the final return.
+ case AArch64ISD::FIRST_NUMBER: break;
+ case AArch64ISD::CALL: return "AArch64ISD::CALL";
+ case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
+ case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
+ case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
+ case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
+ case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
+ case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
+ case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
+ case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
+ case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
+ case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
+ case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
+ case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
+ case AArch64ISD::ADC: return "AArch64ISD::ADC";
+ case AArch64ISD::SBC: return "AArch64ISD::SBC";
+ case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
+ case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
+ case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
+ case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
+ case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
+ case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
+ case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
+ case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
+ case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
+ case AArch64ISD::DUP: return "AArch64ISD::DUP";
+ case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
+ case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
+ case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
+ case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
+ case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
+ case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
+ case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
+ case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
+ case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
+ case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
+ case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
+ case AArch64ISD::BICi: return "AArch64ISD::BICi";
+ case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
+ case AArch64ISD::BSL: return "AArch64ISD::BSL";
+ case AArch64ISD::NEG: return "AArch64ISD::NEG";
+ case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
+ case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
+ case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
+ case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
+ case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
+ case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
+ case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
+ case AArch64ISD::REV16: return "AArch64ISD::REV16";
+ case AArch64ISD::REV32: return "AArch64ISD::REV32";
+ case AArch64ISD::REV64: return "AArch64ISD::REV64";
+ case AArch64ISD::EXT: return "AArch64ISD::EXT";
+ case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
+ case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
+ case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
+ case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
+ case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
+ case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
+ case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
+ case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
+ case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
+ case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
+ case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
+ case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
+ case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
+ case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
+ case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
+ case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
+ case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
+ case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
+ case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
+ case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
+ case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
+ case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
+ case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
+ case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
+ case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
+ case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
+ case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
+ case AArch64ISD::NOT: return "AArch64ISD::NOT";
+ case AArch64ISD::BIT: return "AArch64ISD::BIT";
+ case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
+ case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
+ case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
+ case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
+ case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
+ case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
+ case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
+ case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
+ case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
+ case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
+ case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
+ case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
+ case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
+ case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
+ case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
+ case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
+ case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
+ case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
+ case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
+ case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
+ case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
+ case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
+ case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
+ case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
+ case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
+ case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
+ case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
+ case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
+ case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
+ case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
+ case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
+ case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
+ case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
+ case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
+ case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
+ case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
+ case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
+ case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
+ case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
+ case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
+ case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
+ case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
+ case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
+ case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
+ }
+ return nullptr;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
+ MachineBasicBlock *MBB) const { // Replaces the F128CSEL pseudo MI in MBB with a branch diamond plus a PHI; returns the block that now holds the rest of MBB.
+ // We materialise the F128CSEL pseudo-instruction as some control flow and a
+ // phi node:
+
+ // OrigBB:
+ // [... previous instrs leading to comparison ...]
+ // b.ne TrueBB
+ // b EndBB
+ // TrueBB:
+ // ; Fallthrough
+ // EndBB:
+ // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
+
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction::iterator It = ++MBB->getIterator(); // Insertion point: the position right after MBB in the function.
+
+ unsigned DestReg = MI.getOperand(0).getReg(); // Operands read from MI: 0 = destination, 1 = if-true value, 2 = if-false value,
+ unsigned IfTrueReg = MI.getOperand(1).getReg();
+ unsigned IfFalseReg = MI.getOperand(2).getReg();
+ unsigned CondCode = MI.getOperand(3).getImm(); // 3 = condition-code immediate,
+ bool NZCVKilled = MI.getOperand(4).isKill(); // 4 = operand whose kill flag decides the NZCV live-ins below.
+
+ MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, TrueBB);
+ MF->insert(It, EndBB);
+
+ // Transfer rest of current basic-block to EndBB
+ EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
+ MBB->end());
+ EndBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); // Conditional branch to TrueBB ...
+ BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); // ... otherwise an unconditional branch to EndBB.
+ MBB->addSuccessor(TrueBB);
+ MBB->addSuccessor(EndBB);
+
+ // TrueBB falls through to the end.
+ TrueBB->addSuccessor(EndBB);
+
+ if (!NZCVKilled) { // When the kill flag is not set, NZCV is recorded as live-in to both new blocks.
+ TrueBB->addLiveIn(AArch64::NZCV);
+ EndBB->addLiveIn(AArch64::NZCV);
+ }
+
+ BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) // PHI at the top of EndBB picks IfTrueReg from TrueBB and IfFalseReg from MBB.
+ .addReg(IfTrueReg)
+ .addMBB(TrueBB)
+ .addReg(IfFalseReg)
+ .addMBB(MBB);
+
+ MI.eraseFromParent(); // The pseudo is removed once its replacement is in place.
+ return EndBB;
+}
+
+MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *BB) const { // Dispatches on MI's opcode to the matching custom expansion and returns that expansion's result.
+ switch (MI.getOpcode()) {
+ default:
+#ifndef NDEBUG
+ MI.dump();
+#endif
+ llvm_unreachable("Unexpected instruction for custom inserter!"); // Any opcode other than the cases below is treated as a programming error.
+
+ case AArch64::F128CSEL:
+ return EmitF128CSEL(MI, BB);
+
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, BB); // STACKMAP and PATCHPOINT share one handler.
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering private implementation.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
+/// CC. Any condition code without a case below reaches llvm_unreachable.
+static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown condition code!");
+ case ISD::SETNE:
+ return AArch64CC::NE;
+ case ISD::SETEQ:
+ return AArch64CC::EQ;
+ case ISD::SETGT: // Signed comparisons map to GT/GE/LT/LE.
+ return AArch64CC::GT;
+ case ISD::SETGE:
+ return AArch64CC::GE;
+ case ISD::SETLT:
+ return AArch64CC::LT;
+ case ISD::SETLE:
+ return AArch64CC::LE;
+ case ISD::SETUGT: // Unsigned comparisons map to HI/HS/LO/LS.
+ return AArch64CC::HI;
+ case ISD::SETUGE:
+ return AArch64CC::HS;
+ case ISD::SETULT:
+ return AArch64CC::LO;
+ case ISD::SETULE:
+ return AArch64CC::LS;
+ }
+}
+
+/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. CondCode2 stays AL unless a second condition is needed (SETONE and SETUEQ).
+static void changeFPCCToAArch64CC(ISD::CondCode CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2) {
+ CondCode2 = AArch64CC::AL; // AL means "no second condition"; only two cases below override it.
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown FP condition!");
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ CondCode = AArch64CC::EQ;
+ break;
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ CondCode = AArch64CC::GT;
+ break;
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ CondCode = AArch64CC::GE;
+ break;
+ case ISD::SETOLT:
+ CondCode = AArch64CC::MI;
+ break;
+ case ISD::SETOLE:
+ CondCode = AArch64CC::LS;
+ break;
+ case ISD::SETONE: // Needs two conditions: MI and GT.
+ CondCode = AArch64CC::MI;
+ CondCode2 = AArch64CC::GT;
+ break;
+ case ISD::SETO:
+ CondCode = AArch64CC::VC;
+ break;
+ case ISD::SETUO:
+ CondCode = AArch64CC::VS;
+ break;
+ case ISD::SETUEQ: // Needs two conditions: EQ and VS.
+ CondCode = AArch64CC::EQ;
+ CondCode2 = AArch64CC::VS;
+ break;
+ case ISD::SETUGT:
+ CondCode = AArch64CC::HI;
+ break;
+ case ISD::SETUGE:
+ CondCode = AArch64CC::PL;
+ break;
+ case ISD::SETLT:
+ case ISD::SETULT:
+ CondCode = AArch64CC::LT;
+ break;
+ case ISD::SETLE:
+ case ISD::SETULE:
+ CondCode = AArch64CC::LE;
+ break;
+ case ISD::SETNE:
+ case ISD::SETUNE:
+ CondCode = AArch64CC::NE;
+ break;
+ }
+}
+
+/// Convert a DAG fp condition code to an AArch64 CC.
+/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
+/// should be AND'ed instead of OR'ed. CondCode2 is AL when one condition suffices.
+static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2) {
+ CondCode2 = AArch64CC::AL;
+ switch (CC) {
+ default: // Everything except SETONE/SETUEQ reuses the plain mapping, which must yield a single condition.
+ changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+ assert(CondCode2 == AArch64CC::AL);
+ break;
+ case ISD::SETONE:
+ // (a one b)
+ // == ((a olt b) || (a ogt b))
+ // == ((a ord b) && (a une b))
+ CondCode = AArch64CC::VC;
+ CondCode2 = AArch64CC::NE;
+ break;
+ case ISD::SETUEQ:
+ // (a ueq b)
+ // == ((a uno b) || (a oeq b))
+ // == ((a ule b) && (a uge b))
+ CondCode = AArch64CC::PL;
+ CondCode2 = AArch64CC::LE;
+ break;
+ }
+}
+
+/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
+/// CC usable with the vector instructions. Fewer operations are available
+/// without a real NZCV register, so we have to use less efficient combinations
+/// to get the same effect. Invert is set when the caller must invert the result of the returned condition(s).
+static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2,
+ bool &Invert) {
+ Invert = false;
+ switch (CC) {
+ default:
+ // Mostly the scalar mappings work fine.
+ changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+ break;
+ case ISD::SETUO: // SETUO is handled as the inverse of SETO.
+ Invert = true;
+ LLVM_FALLTHROUGH;
+ case ISD::SETO:
+ CondCode = AArch64CC::MI;
+ CondCode2 = AArch64CC::GE;
+ break;
+ case ISD::SETUEQ:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ // All of the compare-mask comparisons are ordered, but we can switch
+ // between the two by a double inversion. E.g. ULE == !OGT.
+ Invert = true;
+ changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); // Map the inverse condition and let Invert undo it.
+ break;
+ }
+}
+
+static bool isLegalArithImmed(uint64_t C) { // True if C fits in 12 bits, or has its low 12 bits clear and nothing set at bit 24 or above.
+ // Matches AArch64DAGToDAGISel::SelectArithImmed().
+ return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
+}
+
+static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG) { // Builds the flag-producing compare node for LHS vs RHS: FCMP for floating point, otherwise SUBS/ADDS/ANDS.
+ EVT VT = LHS.getValueType();
+
+ if (VT.isFloatingPoint()) {
+ assert(VT != MVT::f128);
+ if (VT == MVT::f16) { // f16 operands are extended to f32 before comparing.
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+ VT = MVT::f32;
+ }
+ return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+ }
+
+ // The CMP instruction is just an alias for SUBS, and representing it as
+ // SUBS means that it's possible to get CSE with subtract operations.
+ // A later phase can perform the optimization of setting the destination
+ // register to WZR/XZR if it ends up being unused.
+ unsigned Opcode = AArch64ISD::SUBS;
+
+ if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
+ // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
+ // can be set differently by this operation. It comes down to whether
+ // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
+ // everything is fine. If not then the optimization is wrong. Thus general
+ // comparisons are only valid if op2 != 0.
+
+ // So, finally, the only LLVM-native comparisons that don't mention C and V
+ // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
+ // the absence of information about op2.
+ Opcode = AArch64ISD::ADDS;
+ RHS = RHS.getOperand(1);
+ } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
+ !isUnsignedIntSetCC(CC)) {
+ // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+ // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+ // of the signed comparisons.
+ Opcode = AArch64ISD::ANDS;
+ RHS = LHS.getOperand(1); // RHS must be rewritten first: it is read from the old LHS, which the next line replaces.
+ LHS = LHS.getOperand(0);
+ }
+
+ return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
+ .getValue(1); // Result 1 of the node is returned (the MVT_CC-typed value), not the arithmetic result.
+}
+
+/// \defgroup AArch64CCMP CMP;CCMP matching
+///
+/// These functions deal with the formation of CMP;CCMP;... sequences.
+/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
+/// a comparison. They set the NZCV flags to a predefined value if their
+/// predicate is false. This allows us to express arbitrary conjunctions, for
+/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
+/// expressed as:
+/// cmp A
+/// ccmp B, inv(CB), CA
+/// check for CB flags
+///
+/// In general we can create code for arbitrary "... (and (and A B) C)"
+/// sequences. We can also implement some "or" expressions, because "(or A B)"
+/// is equivalent to "not (and (not A) (not B))" and we can implement some
+/// negation operations:
+/// We can negate the results of a single comparison by inverting the flags
+/// used when the predicate fails and inverting the flags tested in the next
+/// instruction; We can also negate the results of the whole previous
+/// conditional compare sequence by inverting the flags tested in the next
+/// instruction. However there is no way to negate the result of a partial
+/// sequence.
+///
+/// Therefore on encountering an "or" expression we can negate the subtree on
+/// one side and have to be able to push the negate to the leafs of the subtree
+/// on the other side (see also the comments in code). As complete example:
+/// "or (or (setCA (cmp A)) (setCB (cmp B)))
+/// (and (setCC (cmp C)) (setCD (cmp D)))"
+/// is transformed to
+/// "not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
+/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))"
+/// and implemented as:
+/// cmp C
+/// ccmp D, inv(CD), CC
+/// ccmp A, CA, inv(CD)
+/// ccmp B, CB, inv(CA)
+/// check for CB flags
+/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
+/// by conditional compare sequences.
+/// @{
+
+/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. CCOp is the incoming flags value, Predicate the condition tested on it, and OutCC the condition the caller will test on the result.
+static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC, SDValue CCOp,
+ AArch64CC::CondCode Predicate,
+ AArch64CC::CondCode OutCC,
+ const SDLoc &DL, SelectionDAG &DAG) {
+ unsigned Opcode = 0; // 0 means "not chosen yet"; it falls back to CCMP below.
+ if (LHS.getValueType().isFloatingPoint()) {
+ assert(LHS.getValueType() != MVT::f128);
+ if (LHS.getValueType() == MVT::f16) { // f16 operands are extended to f32 before comparing.
+ LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
+ }
+ Opcode = AArch64ISD::FCCMP;
+ } else if (RHS.getOpcode() == ISD::SUB) {
+ SDValue SubOp0 = RHS.getOperand(0);
+ if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ // See emitComparison() on why we can only do this for SETEQ and SETNE.
+ Opcode = AArch64ISD::CCMN;
+ RHS = RHS.getOperand(1); // Compare against op2 itself instead of (0 - op2).
+ }
+ }
+ if (Opcode == 0)
+ Opcode = AArch64ISD::CCMP;
+
+ SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
+ AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); // Flags immediate chosen to satisfy the inverse of OutCC (presumably the value installed when Predicate fails; confirm against the CCMP semantics).
+ SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+ return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+}
+
+/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
+/// @p CanNegate is set to true if we can push a negate operation through
+/// the tree in a way that we are left with AND operations and negate operations
+/// at the leaves only, i.e. "not (or (or x y) z)" can be changed to
+/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
+/// brought into such a form. @p CanNegate is only written on paths that return true.
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
+ unsigned Depth = 0) {
+ if (!Val.hasOneUse()) // Every node in the tree must have exactly one use.
+ return false;
+ unsigned Opcode = Val->getOpcode();
+ if (Opcode == ISD::SETCC) {
+ if (Val->getOperand(0).getValueType() == MVT::f128) // f128 comparisons are rejected as leaves.
+ return false;
+ CanNegate = true; // A single comparison can always be negated.
+ return true;
+ }
+ // Protect against exponential runtime and stack overflow.
+ if (Depth > 6)
+ return false;
+ if (Opcode == ISD::AND || Opcode == ISD::OR) {
+ SDValue O0 = Val->getOperand(0);
+ SDValue O1 = Val->getOperand(1);
+ bool CanNegateL;
+ if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
+ return false;
+ bool CanNegateR;
+ if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
+ return false;
+
+ if (Opcode == ISD::OR) {
+ // For an OR expression we need to be able to negate at least one side or
+ // we cannot do the transformation at all.
+ if (!CanNegateL && !CanNegateR)
+ return false;
+ // We can however change a (not (or x y)) to (and (not x) (not y)) if we
+ // can negate the x and y subtrees.
+ CanNegate = CanNegateL && CanNegateR;
+ } else {
+ // If the operands are OR expressions then we finally need to negate their
+ // outputs, we can only do that for the operand emitted last by
+ // negating OutCC, not for both operands.
+ bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
+ bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
+ if (NeedsNegOutL && NeedsNegOutR)
+ return false;
+ // We cannot negate an AND operation (it would become an OR),
+ CanNegate = false;
+ }
+ return true;
+ }
+ return false; // Any opcode other than SETCC/AND/OR disqualifies the tree.
+}
+
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
+/// Tries to transform the given i1 producing node @p Val to a series of compare
+/// and conditional compare operations. @returns an NZCV flags producing node
+/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
+/// transformation was not possible.
+/// On recursive invocations @p Negate may be set to true to have negation
+/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
+/// for the comparisons in the current subtree; @p CCOp is the flags producing
+/// node emitted so far (an empty SDValue for the first comparison of the chain).
+static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
+ AArch64CC::CondCode Predicate) {
+ // We're at a tree leaf, produce a conditional comparison operation.
+ unsigned Opcode = Val->getOpcode();
+ if (Opcode == ISD::SETCC) {
+ SDValue LHS = Val->getOperand(0);
+ SDValue RHS = Val->getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
+ bool isInteger = LHS.getValueType().isInteger();
+ if (Negate) // Negation is applied at the leaf by inverting the condition.
+ CC = getSetCCInverse(CC, isInteger);
+ SDLoc DL(Val);
+ // Determine OutCC and handle FP special case.
+ if (isInteger) {
+ OutCC = changeIntCCToAArch64CC(CC);
+ } else {
+ assert(LHS.getValueType().isFloatingPoint());
+ AArch64CC::CondCode ExtraCC;
+ changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
+ // Some floating point conditions can't be tested with a single condition
+ // code. Construct an additional comparison in this case.
+ if (ExtraCC != AArch64CC::AL) {
+ SDValue ExtraCmp;
+ if (!CCOp.getNode())
+ ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
+ else
+ ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
+ ExtraCC, DL, DAG);
+ CCOp = ExtraCmp; // The extra comparison becomes the chain input below,
+ Predicate = ExtraCC; // predicated on ExtraCC.
+ }
+ }
+
+ // Produce a normal comparison if we are first in the chain
+ if (!CCOp)
+ return emitComparison(LHS, RHS, CC, DL, DAG);
+ // Otherwise produce a ccmp.
+ return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
+ DAG);
+ }
+ assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
+ "Valid conjunction/disjunction tree");
+
+ // Check if both sides can be transformed.
+ SDValue LHS = Val->getOperand(0);
+ SDValue RHS = Val->getOperand(1);
+
+ // In case of an OR we need to negate our operands and the result.
+ // (A or B) <=> not(not(A) and not(B))
+ bool NegateOpsAndResult = Opcode == ISD::OR;
+ // We can negate the results of all previous operations by inverting the
+ // predicate flags giving us a free negation for one side. The other side
+ // must be negatable by itself.
+ if (NegateOpsAndResult) {
+ // See which side we can negate.
+ bool CanNegateL;
+ bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
+ assert(isValidL && "Valid conjunction/disjunction tree");
+ (void)isValidL; // Silences the unused-variable warning when asserts are compiled out.
+
+#ifndef NDEBUG
+ bool CanNegateR;
+ bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
+ assert(isValidR && "Valid conjunction/disjunction tree");
+ assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
+#endif
+
+ // Order the side which we cannot negate to RHS so we can emit it first.
+ if (!CanNegateL)
+ std::swap(LHS, RHS);
+ } else {
+ bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
+ assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
+ "Valid conjunction/disjunction tree");
+ // Order the side where we need to negate the output flags to RHS so it
+ // gets emitted first.
+ if (NeedsNegOutL)
+ std::swap(LHS, RHS);
+ }
+
+ // Emit RHS. If we want to negate the tree we only need to push a negate
+ // through if we are already in a negate case (@p Negate set), otherwise we
+ // can negate the "flags to test" afterwards.
+ AArch64CC::CondCode RHSCC;
+ SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
+ CCOp, Predicate);
+ if (NegateOpsAndResult && !Negate)
+ RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
+ // Emit LHS. We may need to negate it.
+ SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
+ NegateOpsAndResult, CmpR,
+ RHSCC);
+ // If we transformed an OR to an AND then we have to negate the result
+ // (or absorb the Negate parameter).
+ if (NegateOpsAndResult && !Negate)
+ OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ return CmpL;
+}
+
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/CFCMP ops. See @ref AArch64CCMP. Returns SDValue() if @p Val is not a valid tree.
+/// \see emitConjunctionDisjunctionTreeRec().
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC) {
+ bool CanNegate; // Required by the helper's signature; its value is not used here.
+ if (!isConjunctionDisjunctionTree(Val, CanNegate))
+ return SDValue();
+
+ return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(), // no negation, no prior flags node
+ AArch64CC::AL);
+}
+
+/// @}
+
+static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // Builds the comparison node for "LHS CC RHS".
+ SDValue &AArch64cc, SelectionDAG &DAG, // AArch64cc receives the condition code to test, as an MVT_CC constant.
+ const SDLoc &dl) { // Returns the comparison node that was built.
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+ EVT VT = RHS.getValueType();
+ uint64_t C = RHSC->getZExtValue();
+ if (!isLegalArithImmed(C)) {
+ // Constant does not fit, try adjusting it by one?
+ switch (CC) {
+ default:
+ break;
+ case ISD::SETLT:
+ case ISD::SETGE:
+ if ((VT == MVT::i32 && C != 0x80000000 &&
+ isLegalArithImmed((uint32_t)(C - 1))) ||
+ (VT == MVT::i64 && C != 0x80000000ULL && // NOTE(review): i64 guard excludes 0x80000000, not the i64 minimum -- confirm intended.
+ isLegalArithImmed(C - 1ULL))) {
+ CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; // x < C becomes x <= C-1, x >= C becomes x > C-1.
+ C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+ RHS = DAG.getConstant(C, dl, VT);
+ }
+ break;
+ case ISD::SETULT:
+ case ISD::SETUGE:
+ if ((VT == MVT::i32 && C != 0 &&
+ isLegalArithImmed((uint32_t)(C - 1))) ||
+ (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
+ CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+ C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+ RHS = DAG.getConstant(C, dl, VT);
+ }
+ break;
+ case ISD::SETLE:
+ case ISD::SETGT:
+ if ((VT == MVT::i32 && C != INT32_MAX &&
+ isLegalArithImmed((uint32_t)(C + 1))) ||
+ (VT == MVT::i64 && C != INT64_MAX &&
+ isLegalArithImmed(C + 1ULL))) {
+ CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; // x <= C becomes x < C+1, x > C becomes x >= C+1.
+ C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+ RHS = DAG.getConstant(C, dl, VT);
+ }
+ break;
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ if ((VT == MVT::i32 && C != UINT32_MAX &&
+ isLegalArithImmed((uint32_t)(C + 1))) ||
+ (VT == MVT::i64 && C != UINT64_MAX &&
+ isLegalArithImmed(C + 1ULL))) {
+ CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+ C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+ RHS = DAG.getConstant(C, dl, VT);
+ }
+ break;
+ }
+ }
+ }
+ SDValue Cmp; // Stays empty until one of the special cases below (or the fallback) builds it.
+ AArch64CC::CondCode AArch64CC;
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+ const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
+
+ // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+ // For the i8 operand, the largest immediate is 255, so this can be easily
+ // encoded in the compare instruction. For the i16 operand, however, the
+ // largest immediate cannot be encoded in the compare.
+ // Therefore, use a sign extending load and cmn to avoid materializing the
+ // -1 constant. For example,
+ // movz w1, #65535
+ // ldrh w0, [x0, #0]
+ // cmp w0, w1
+ // >
+ // ldrsh w0, [x0, #0]
+ // cmn w0, #1
+ // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
+ // if and only if (sext LHS) == (sext RHS). The checks are in place to
+ // ensure both the LHS and RHS are truly zero extended and to make sure the
+ // transformation is profitable.
+ if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
+ cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+ cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+ LHS.getNode()->hasNUsesOfValue(1, 0)) {
+ int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); // Reinterpret the low 16 bits as signed.
+ if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+ SDValue SExt =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+ DAG.getValueType(MVT::i16));
+ Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
+ RHS.getValueType()),
+ CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
+ }
+ }
+
+ if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { // (tree ==/!= 0) or (tree ==/!= 1): try a compare chain for LHS.
+ if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
+ if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) // "== 0" and "!= 1" test the inverse of the tree's condition.
+ AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
+ }
+ }
+ }
+
+ if (!Cmp) { // Fallback: plain comparison of LHS and RHS.
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
+ }
+ AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
+ return Cmp;
+}
+
+static std::pair<SDValue, SDValue> // Returns (Value, Overflow) for an arithmetic-with-overflow node.
+getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { // CC receives the condition code chosen for Op's opcode.
+ assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
+ "Unsupported value type");
+ SDValue Value, Overflow;
+ SDLoc DL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ unsigned Opc = 0; // Stays 0 for the multiply cases, which build Value/Overflow themselves.
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown overflow instruction!");
+ case ISD::SADDO:
+ Opc = AArch64ISD::ADDS;
+ CC = AArch64CC::VS;
+ break;
+ case ISD::UADDO:
+ Opc = AArch64ISD::ADDS;
+ CC = AArch64CC::HS;
+ break;
+ case ISD::SSUBO:
+ Opc = AArch64ISD::SUBS;
+ CC = AArch64CC::VS;
+ break;
+ case ISD::USUBO:
+ Opc = AArch64ISD::SUBS;
+ CC = AArch64CC::LO;
+ break;
+ // Multiply needs a little bit extra work.
+ case ISD::SMULO:
+ case ISD::UMULO: {
+ CC = AArch64CC::NE; // Both multiply forms end in a SUBS whose flags are tested with NE.
+ bool IsSigned = Op.getOpcode() == ISD::SMULO;
+ if (Op.getValueType() == MVT::i32) {
+ unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ // For a 32 bit multiply with overflow check we want the instruction
+ // selector to generate a widening multiply (SMADDL/UMADDL). For that we
+ // need to generate the following pattern:
+ // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
+ LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
+ RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
+ DAG.getConstant(0, DL, MVT::i64));
+ // On AArch64 the upper 32 bits are always zero extended for a 32 bit
+ // operation. We need to clear out the upper 32 bits, because we used a
+ // widening multiply that wrote all 64 bits. In the end this should be a
+ // noop.
+ Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
+ if (IsSigned) {
+ // The signed overflow check requires more than just a simple check for
+ // any bit set in the upper 32 bits of the result. These bits could be
+ // just the sign bits of a negative number. To perform the overflow
+ // check we have to arithmetic shift right the 32nd bit of the result by
+ // 31 bits. Then we compare the result to the upper 32 bits.
+ SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
+ DAG.getConstant(32, DL, MVT::i64));
+ UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
+ SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
+ DAG.getConstant(31, DL, MVT::i64));
+ // It is important that LowerBits is last, otherwise the arithmetic
+ // shift will not be folded into the compare (SUBS).
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
+ Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+ .getValue(1);
+ } else {
+ // The overflow check for unsigned multiply is easy. We only need to
+ // check if any of the upper 32 bits are set. This can be done with a
+ // CMP (shifted register). For that we need to generate the following
+ // pattern:
+ // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
+ SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
+ DAG.getConstant(32, DL, MVT::i64));
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ Overflow =
+ DAG.getNode(AArch64ISD::SUBS, DL, VTs,
+ DAG.getConstant(0, DL, MVT::i64),
+ UpperBits).getValue(1);
+ }
+ break;
+ }
+ assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
+ // For the 64 bit multiply
+ Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+ if (IsSigned) {
+ SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
+ SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
+ DAG.getConstant(63, DL, MVT::i64));
+ // It is important that LowerBits is last, otherwise the arithmetic
+ // shift will not be folded into the compare (SUBS).
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+ .getValue(1);
+ } else {
+ SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); // Compared against 0 below.
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ Overflow =
+ DAG.getNode(AArch64ISD::SUBS, DL, VTs,
+ DAG.getConstant(0, DL, MVT::i64),
+ UpperBits).getValue(1);
+ }
+ break;
+ }
+ } // switch (...)
+
+ if (Opc) { // Add/sub cases: one flag-setting node yields both results.
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+
+ // Emit the AArch64 operation with overflow check.
+ Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
+ Overflow = Value.getValue(1); // Result 1 of the node is the flags output.
+ }
+ return std::make_pair(Value, Overflow);
+}
+
+SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, // Replaces Op with a call to the library routine Call.
+ RTLIB::Libcall Call) const {
+ SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); // All operands of Op become the call arguments.
+ return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; // f128 return type; only the first element of the pair is returned.
+}
+
+static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { // Folds xor(x, select_cc(..., 0, -1)) into a CSEL; returns Op unchanged otherwise.
+ SDValue Sel = Op.getOperand(0);
+ SDValue Other = Op.getOperand(1);
+
+ // If neither operand is a SELECT_CC, give up.
+ if (Sel.getOpcode() != ISD::SELECT_CC)
+ std::swap(Sel, Other); // Try the second operand as the SELECT_CC instead.
+ if (Sel.getOpcode() != ISD::SELECT_CC)
+ return Op;
+
+ // The folding we want to perform is:
+ // (xor x, (select_cc a, b, cc, 0, -1) )
+ // -->
+ // (csel x, (xor x, -1), cc ...)
+ //
+ // The latter will get matched to a CSINV instruction.
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
+ SDValue LHS = Sel.getOperand(0);
+ SDValue RHS = Sel.getOperand(1);
+ SDValue TVal = Sel.getOperand(2);
+ SDValue FVal = Sel.getOperand(3);
+ SDLoc dl(Sel);
+
+ // FIXME: This could be generalized to non-integer comparisons.
+ if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+ return Op;
+
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+ // The values aren't constants, this isn't the pattern we're looking for.
+ if (!CFVal || !CTVal)
+ return Op;
+
+ // We can commute the SELECT_CC by inverting the condition. This
+ // might be needed to make this fit into a CSINV pattern.
+ if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true); // true: integer comparison (checked above).
+ }
+
+ // If the constants line up, perform the transform!
+ if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
+ SDValue CCVal;
+ SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+ FVal = Other; // xor x, 0 == x
+ TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, // xor x, -1
+ DAG.getConstant(-1ULL, dl, Other.getValueType()));
+
+ return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
+ CCVal, Cmp);
+ }
+
+ return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { // Maps ADDC/SUBC/ADDE/SUBE onto ADDS/SUBS/ADCS/SBCS nodes.
+ EVT VT = Op.getValueType();
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32); // Results: the value plus an i32 second result.
+
+ unsigned Opc;
+ bool ExtraOp = false; // Set for ADDE/SUBE, which carry a third operand.
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Invalid code");
+ case ISD::ADDC:
+ Opc = AArch64ISD::ADDS;
+ break;
+ case ISD::SUBC:
+ Opc = AArch64ISD::SUBS;
+ break;
+ case ISD::ADDE:
+ Opc = AArch64ISD::ADCS;
+ ExtraOp = true;
+ break;
+ case ISD::SUBE:
+ Opc = AArch64ISD::SBCS;
+ ExtraOp = true;
+ break;
+ }
+
+ if (!ExtraOp)
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2)); // Third operand is forwarded unchanged.
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lowers an arithmetic-with-overflow node to (value, i32 overflow bit).
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+ return SDValue();
+
+ SDLoc dl(Op);
+ AArch64CC::CondCode CC;
+ // The actual operation that sets the overflow or carry flag.
+ SDValue Value, Overflow;
+ std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
+
+ // We use 0 and 1 as false and true values.
+ SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
+ SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
+
+ // We use an inverted condition, because the conditional select is inverted
+ // too. This will allow it to be selected to a single instruction:
+ // CSINC Wd, WZR, WZR, invert(cond).
+ SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
+ Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, // Note the operand order: FVal first, then TVal.
+ CCVal, Overflow);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
+}
+
+// Prefetch operands are:
+// 1: Address to prefetch
+// 2: bool isWrite
+// 3: int locality (0 = no locality ... 3 = extreme locality)
+// 4: bool isDataCache
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { // Packs operands 2-4 into one i32 immediate for AArch64ISD::PREFETCH.
+ SDLoc DL(Op);
+ unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+
+ bool IsStream = !Locality; // Locality 0 selects the stream bit.
+ // When the locality number is set
+ if (Locality) {
+ // The front-end should have filtered out the out-of-range values
+ assert(Locality <= 3 && "Prefetch locality out-of-range");
+ // The locality degree is the opposite of the cache speed.
+ // Put the number the other way around.
+ // The encoding starts at 0 for level 1
+ Locality = 3 - Locality;
+ }
+
+ // Build the mask value encoding the expected behavior.
+ unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
+ (!IsData << 3) | // IsDataCache bit
+ (Locality << 1) | // Cache level bits
+ (unsigned)IsStream; // Stream bit
+ return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), // operand 0 is presumably the chain -- confirm
+ DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, // Lowers an FP extension to f128 into a library call.
+ SelectionDAG &DAG) const {
+ assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
+
+ RTLIB::Libcall LC;
+ LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); // Routine chosen from (source type, result type).
+
+ return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // Rounding from f128 becomes a library call; other sources are returned as-is.
+ SelectionDAG &DAG) const {
+ if (Op.getOperand(0).getValueType() != MVT::f128) {
+ // It's legal except when f128 is involved
+ return Op;
+ }
+
+ RTLIB::Libcall LC;
+ LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ // FP_ROUND node has a second operand indicating whether it is known to be
+ // precise. That doesn't take part in the LibCall so we can't directly use
+ // LowerF128Call.
+ SDValue SrcVal = Op.getOperand(0); // Only the source value is passed to the call.
+ return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+ SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // Vector FP-to-int: promotes f16 input and splits size-changing conversions in two steps.
+ // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+ // Any additional optimization in this function should be recorded
+ // in the cost tables.
+ EVT InVT = Op.getOperand(0).getValueType();
+ EVT VT = Op.getValueType();
+ unsigned NumElts = InVT.getVectorNumElements();
+
+ // f16 vectors are promoted to f32 before a conversion.
+ if (InVT.getVectorElementType() == MVT::f16) {
+ MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
+ SDLoc dl(Op);
+ return DAG.getNode(
+ Op.getOpcode(), dl, Op.getValueType(),
+ DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
+ }
+
+ if (VT.getSizeInBits() < InVT.getSizeInBits()) { // Narrowing: convert at the input width, then truncate.
+ SDLoc dl(Op);
+ SDValue Cv =
+ DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
+ Op.getOperand(0));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
+ }
+
+ if (VT.getSizeInBits() > InVT.getSizeInBits()) { // Widening: FP-extend to the result's element width, then convert.
+ SDLoc dl(Op);
+ MVT ExtVT =
+ MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+ VT.getVectorNumElements());
+ SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
+ return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
+ }
+
+ // Same total size in and out: the node is returned unchanged.
+ return Op;
+}
+
+SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, // FP-to-int: vectors are delegated, f16 is promoted, f128 becomes a library call.
+ SelectionDAG &DAG) const {
+ if (Op.getOperand(0).getValueType().isVector())
+ return LowerVectorFP_TO_INT(Op, DAG);
+
+ // f16 conversions are promoted to f32.
+ if (Op.getOperand(0).getValueType() == MVT::f16) {
+ SDLoc dl(Op);
+ return DAG.getNode(
+ Op.getOpcode(), dl, Op.getValueType(),
+ DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
+ }
+
+ if (Op.getOperand(0).getValueType() != MVT::f128) {
+ // It's legal except when f128 is involved
+ return Op;
+ }
+
+ RTLIB::Libcall LC;
+ if (Op.getOpcode() == ISD::FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+ else // Any other opcode reaching here is treated as the unsigned conversion.
+ LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+ return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { // Vector int-to-FP: size-changing conversions are split into two steps.
+ // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+ // Any additional optimization in this function should be recorded
+ // in the cost tables.
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ SDValue In = Op.getOperand(0);
+ EVT InVT = In.getValueType();
+
+ if (VT.getSizeInBits() < InVT.getSizeInBits()) { // Narrowing: convert at the input element width, then FP_ROUND.
+ MVT CastVT =
+ MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
+ InVT.getVectorNumElements());
+ In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
+ return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
+ }
+
+ if (VT.getSizeInBits() > InVT.getSizeInBits()) { // Widening: extend the integers first, then convert.
+ unsigned CastOpc =
+ Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ EVT CastVT = VT.changeVectorElementTypeToInteger();
+ In = DAG.getNode(CastOpc, dl, CastVT, In);
+ return DAG.getNode(Op.getOpcode(), dl, VT, In);
+ }
+
+ return Op; // Same total size in and out: returned unchanged.
+}
+
+SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, // Int-to-FP: vectors are delegated, f16 goes via f32, f128 becomes a library call.
+ SelectionDAG &DAG) const {
+ if (Op.getValueType().isVector())
+ return LowerVectorINT_TO_FP(Op, DAG);
+
+ // f16 conversions are promoted to f32.
+ if (Op.getValueType() == MVT::f16) {
+ SDLoc dl(Op);
+ return DAG.getNode(
+ ISD::FP_ROUND, dl, MVT::f16,
+ DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // i128 conversions are libcalls.
+ if (Op.getOperand(0).getValueType() == MVT::i128)
+ return SDValue(); // Empty result: not handled by this custom lowering.
+
+ // Other conversions are legal, unless it's to the completely software-based
+ // fp128.
+ if (Op.getValueType() != MVT::f128)
+ return Op;
+
+ RTLIB::Libcall LC;
+ if (Op.getOpcode() == ISD::SINT_TO_FP)
+ LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+ else
+ LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, // Lowers FSINCOS to a call to __sincos_stret / __sincosf_stret.
+ SelectionDAG &DAG) const {
+ // For iOS, we want to call an alternative entry point: __sincos_stret,
+ // which returns the values in two S / D registers.
+ SDLoc dl(Op);
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+
+ Entry.Node = Arg; // Single argument, passed without sign or zero extension.
+ Entry.Ty = ArgTy;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ const char *LibcallName =
+ (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; // Any non-f64 argument uses the "f" variant.
+ SDValue Callee =
+ DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
+
+ StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); // Return type: struct of two values of the argument type.
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+ .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
+
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ return CallResult.first; // Only the first element of the pair is returned.
+}
+
+static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { // Custom-lowers a bitcast i16 -> f16; other result types return SDValue().
+ if (Op.getValueType() != MVT::f16)
+ return SDValue();
+
+ assert(Op.getOperand(0).getValueType() == MVT::i16);
+ SDLoc DL(Op);
+
+ Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); // Widen i16 to i32,
+ Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); // reinterpret the 32 bits as f32,
+ return SDValue( // then take the hsub subregister as the f16 result.
+ DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
+ DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
+ 0);
+}
+
+static EVT getExtensionTo64Bits(const EVT &OrigVT) { // Maps a vector type narrower than 64 bits to a 64-bit one with the same element count.
+ if (OrigVT.getSizeInBits() >= 64) // Already 64 bits or wider: returned unchanged.
+ return OrigVT;
+
+ assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+ MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+ switch (OrigSimpleTy) {
+ default: llvm_unreachable("Unexpected Vector Type"); // Only v2i8, v2i16 and v4i8 are handled.
+ case MVT::v2i8:
+ case MVT::v2i16:
+ return MVT::v2i32;
+ case MVT::v4i8:
+ return MVT::v4i16;
+ }
+}
+
+static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, // Returns N, extended with ExtOpcode if OrigTy is narrower than 64 bits.
+ const EVT &OrigTy,
+ const EVT &ExtTy, // Only checked by the assert below.
+ unsigned ExtOpcode) {
+ // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+ // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+ // 64-bits we need to insert a new extension so that it will be 64-bits.
+ assert(ExtTy.is128BitVector() && "Unexpected extension size");
+ if (OrigTy.getSizeInBits() >= 64)
+ return N;
+
+ // Must extend size to at least 64 bits to be used as an operand for VMULL.
+ EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+ return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, // True if N is a BUILD_VECTOR of constants that each fit in half the element width.
+ bool isSigned) { // isSigned selects the signed (isIntN) or unsigned (isUIntN) range check.
+ EVT VT = N->getValueType(0);
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (const SDValue &Elt : N->op_values()) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+ unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned HalfSize = EltSize / 2;
+ if (isSigned) {
+ if (!isIntN(HalfSize, C->getSExtValue()))
+ return false;
+ } else {
+ if (!isUIntN(HalfSize, C->getZExtValue()))
+ return false;
+ }
+ continue;
+ }
+ return false; // A non-constant element disqualifies the vector.
+ }
+
+ return true;
+}
+
+static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { // Returns the narrow operand behind an extend, or a BUILD_VECTOR rebuilt with half-width elements.
+ if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+ return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
+ N->getOperand(0)->getValueType(0),
+ N->getValueType(0),
+ N->getOpcode());
+
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ unsigned EltSize = VT.getScalarSizeInBits() / 2; // Half of the original element width.
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT TruncVT = MVT::getIntegerVT(EltSize);
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); // cast<> requires every operand to be a constant.
+ const APInt &CInt = C->getAPIntValue();
+ // Element types smaller than 32 bits are not legal, so use i32 elements.
+ // The values are implicitly truncated so sext vs. zext doesn't matter.
+ Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
+ }
+ return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
+}
+
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { // True for a SIGN_EXTEND node or a qualifying constant BUILD_VECTOR.
+ if (N->getOpcode() == ISD::SIGN_EXTEND)
+ return true;
+ if (isExtendedBUILD_VECTOR(N, DAG, true)) // true: signed range check.
+ return true;
+ return false;
+}
+
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { // True for a ZERO_EXTEND node or a qualifying constant BUILD_VECTOR.
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ return true;
+ if (isExtendedBUILD_VECTOR(N, DAG, false)) // false: unsigned range check.
+ return true;
+ return false;
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { // True if N is an ADD/SUB whose two operands are single-use and sign-extended.
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+ SDNode *N0 = N->getOperand(0).getNode();
+ SDNode *N1 = N->getOperand(1).getNode();
+ return N0->hasOneUse() && N1->hasOneUse() &&
+ isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+ }
+ return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { // True if N is an ADD/SUB whose two operands are single-use and zero-extended.
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+ SDNode *N0 = N->getOperand(0).getNode();
+ SDNode *N1 = N->getOperand(1).getNode();
+ return N0->hasOneUse() && N1->hasOneUse() &&
+ isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+ }
+ return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Turns a 128-bit vector MUL of extended operands into SMULL/UMULL where possible.
+ // Multiplications are only custom-lowered for 128-bit vectors so that
+ // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
+ EVT VT = Op.getValueType();
+ assert(VT.is128BitVector() && VT.isInteger() &&
+ "unexpected type for custom-lowering ISD::MUL");
+ SDNode *N0 = Op.getOperand(0).getNode();
+ SDNode *N1 = Op.getOperand(1).getNode();
+ unsigned NewOpc = 0; // Stays 0 when no SMULL/UMULL pattern matches.
+ bool isMLA = false; // Set when one operand is an add/sub of extended values.
+ bool isN0SExt = isSignExtended(N0, DAG);
+ bool isN1SExt = isSignExtended(N1, DAG);
+ if (isN0SExt && isN1SExt)
+ NewOpc = AArch64ISD::SMULL;
+ else {
+ bool isN0ZExt = isZeroExtended(N0, DAG);
+ bool isN1ZExt = isZeroExtended(N1, DAG);
+ if (isN0ZExt && isN1ZExt)
+ NewOpc = AArch64ISD::UMULL;
+ else if (isN1SExt || isN1ZExt) {
+ // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+ // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+ if (isN1SExt && isAddSubSExt(N0, DAG)) {
+ NewOpc = AArch64ISD::SMULL;
+ isMLA = true;
+ } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+ NewOpc = AArch64ISD::UMULL;
+ isMLA = true;
+ } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+ std::swap(N0, N1); // Keep the add/sub in N0 for the code below.
+ NewOpc = AArch64ISD::UMULL;
+ isMLA = true;
+ }
+ }
+
+ if (!NewOpc) {
+ if (VT == MVT::v2i64)
+ // Fall through to expand this. It is not legal.
+ return SDValue();
+ else
+ // Other vector multiplications are legal.
+ return Op;
+ }
+ }
+
+ // Legalize to a S/UMULL instruction
+ SDLoc DL(Op);
+ SDValue Op0;
+ SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
+ if (!isMLA) {
+ Op0 = skipExtensionForVectorMULL(N0, DAG);
+ assert(Op0.getValueType().is64BitVector() &&
+ Op1.getValueType().is64BitVector() &&
+ "unexpected types for extended operands to VMULL");
+ return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+ }
+ // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
+ // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
+ // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
+ SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
+ SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
+ EVT Op1VT = Op1.getValueType();
+ return DAG.getNode(N0->getOpcode(), DL, VT, // Re-applies the original ADD or SUB to the two products.
+ DAG.getNode(NewOpc, DL, VT,
+ DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+ DAG.getNode(NewOpc, DL, VT,
+ DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
+
+SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Rewrites a handful of intrinsics; all others return SDValue().
+ SelectionDAG &DAG) const {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); // Operand 0 holds the intrinsic ID.
+ SDLoc dl(Op);
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::thread_pointer: {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
+ }
+ case Intrinsic::aarch64_neon_smax: // The four NEON min/max intrinsics map onto the generic ISD nodes.
+ return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_umax:
+ return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_smin:
+ return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_umin:
+ return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+}
+
+/// Entry point for custom lowering: dispatch \p Op to the Lower* routine that
+/// handles its opcode and return that routine's result.
+///
+/// Being called with an opcode that has no case below is treated as a
+/// programming error and ends in llvm_unreachable.
+SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    // llvm_unreachable does not return, so no fallback value is needed here.
+    llvm_unreachable("unimplemented operand");
+  case ISD::BITCAST:
+    return LowerBITCAST(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:
+    return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG);
+  case ISD::BR_CC:
+    return LowerBR_CC(Op, DAG);
+  case ISD::SELECT:
+    return LowerSELECT(Op, DAG);
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+  case ISD::JumpTable:
+    return LowerJumpTable(Op, DAG);
+  case ISD::ConstantPool:
+    return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:
+    return LowerBlockAddress(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+  case ISD::VACOPY:
+    return LowerVACOPY(Op, DAG);
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG);
+  // Carry-producing / carry-consuming add and subtract share one routine.
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:
+    return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  // Arithmetic-with-overflow nodes share one routine.
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+    return LowerXALUO(Op, DAG);
+  // The FP arithmetic nodes routed here go through LowerF128Call with the
+  // matching *_F128 runtime-library call id.
+  case ISD::FADD:
+    return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
+  case ISD::FSUB:
+    return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
+  case ISD::FMUL:
+    return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+  case ISD::FDIV:
+    return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
+  case ISD::FP_ROUND:
+    return LowerFP_ROUND(Op, DAG);
+  case ISD::FP_EXTEND:
+    return LowerFP_EXTEND(Op, DAG);
+  case ISD::FRAMEADDR:
+    return LowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:
+    return LowerRETURNADDR(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:
+    return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:
+    return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:
+    return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:
+    return LowerVectorSRA_SRL_SHL(Op, DAG);
+  case ISD::SHL_PARTS:
+    return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRL_PARTS:
+  case ISD::SRA_PARTS:
+    return LowerShiftRightParts(Op, DAG);
+  case ISD::CTPOP:
+    return LowerCTPOP(Op, DAG);
+  case ISD::FCOPYSIGN:
+    return LowerFCOPYSIGN(Op, DAG);
+  case ISD::AND:
+    return LowerVectorAND(Op, DAG);
+  case ISD::OR:
+    return LowerVectorOR(Op, DAG);
+  case ISD::XOR:
+    return LowerXOR(Op, DAG);
+  case ISD::PREFETCH:
+    return LowerPREFETCH(Op, DAG);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return LowerINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FSINCOS:
+    return LowerFSINCOS(Op, DAG);
+  case ISD::MUL:
+    return LowerMUL(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "AArch64GenCallingConv.inc"
+
+/// Return the argument-assignment function to use for calls made with
+/// calling convention \p CC.
+///
+/// WebKit_JS and GHC each have a dedicated function. The remaining supported
+/// conventions use AAPCS on non-Darwin targets and, on Darwin, the DarwinPCS
+/// function selected by \p IsVarArg. Any other convention is unreachable.
+CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+                                                     bool IsVarArg) const {
+  switch (CC) {
+  case CallingConv::WebKit_JS:
+    return CC_AArch64_WebKit_JS;
+  case CallingConv::GHC:
+    return CC_AArch64_GHC;
+  case CallingConv::C:
+  case CallingConv::Fast:
+  case CallingConv::PreserveMost:
+  case CallingConv::CXX_FAST_TLS:
+  case CallingConv::Swift:
+    // Resolved below according to the target OS.
+    break;
+  default:
+    llvm_unreachable("Unsupported calling convention.");
+  }
+
+  if (Subtarget->isTargetDarwin())
+    return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
+  return CC_AArch64_AAPCS;
+}
+
+/// Return the assignment function used for return values under calling
+/// convention \p CC: the WebKit_JS variant for WebKit_JS, AAPCS otherwise.
+CCAssignFn *
+AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
+  if (CC == CallingConv::WebKit_JS)
+    return RetCC_AArch64_WebKit_JS;
+  return RetCC_AArch64_AAPCS;
+}
+
+/// Lower the incoming (formal) arguments of the current function.
+///
+/// Every entry of \p Ins is assigned a location using the assignment function
+/// for \p CallConv, and exactly one SDValue per entry is appended to
+/// \p InVals:
+///  - byval arguments yield the frame index of a fixed stack object,
+///  - register arguments yield a CopyFromReg of a new live-in virtual register
+///    (bitcast to the value type for BCvt locations),
+///  - stack arguments yield a (possibly extending) load from a fixed object.
+///
+/// For variadic functions the varargs stack index is recorded in
+/// AArch64FunctionInfo and, on non-Darwin targets, the unallocated argument
+/// registers are spilled via saveVarArgRegisters (which may update \p Chain).
+/// The size of the incoming stack-argument area is recorded as well.
+///
+/// \returns the chain to continue from.
+SDValue AArch64TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  // At this point, Ins[].VT may already be promoted to i32. To correctly
+  // handle passing i8 as i8 instead of i32 on stack, we do not use
+  // AnalyzeFormalArguments (which would feed Ins[].VT to the assignment
+  // function). Instead we look up the type of the original IR argument and,
+  // for i1/i8/i16, hand that narrow type to the assignment function as both
+  // ValVT and LocVT.
+  unsigned NumArgs = Ins.size();
+  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    MVT ValVT = Ins[i].VT;
+    if (Ins[i].isOrigArg()) {
+      // Move the IR argument iterator forward to the argument this input
+      // piece came from.
+      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[i].getOrigArgIndex();
+
+      // Get type of the original argument.
+      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
+                                  /*AllowUnknown*/ true);
+      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+      // If ActualMVT is i1/i8/i16, assign the argument as i8/i8/i16.
+      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+        ValVT = MVT::i8;
+      else if (ActualMVT == MVT::i16)
+        ValVT = MVT::i16;
+    }
+    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+    bool Res =
+        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
+    assert(!Res && "Call operand has unhandled type");
+    (void)Res;
+  }
+  assert(ArgLocs.size() == Ins.size());
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    if (Ins[i].Flags.isByVal()) {
+      // Byval is used for HFAs in the PCS, but the system should work in a
+      // non-compliant manner for larger structs.
+      EVT PtrVT = getPointerTy(DAG.getDataLayout());
+      int Size = Ins[i].Flags.getByValSize();
+      // Round the byval size up to a whole number of 8-byte slots.
+      unsigned NumRegs = (Size + 7) / 8;
+
+      // FIXME: This works on big-endian for composite byvals, which are the
+      // common case. It should also work for fundamental types too.
+      // Keep the index signed: it is passed straight to getFrameIndex, which
+      // takes an int, and the stack-argument path below holds its
+      // CreateFixedObject result in an int as well.
+      int FrameIdx =
+          MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
+      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
+      InVals.push_back(FrameIdxN);
+
+      continue;
+    }
+
+    if (VA.isRegLoc()) {
+      // Arguments stored in registers.
+      EVT RegVT = VA.getLocVT();
+
+      SDValue ArgValue;
+      const TargetRegisterClass *RC;
+
+      // Pick the register class matching the location type.
+      if (RegVT == MVT::i32)
+        RC = &AArch64::GPR32RegClass;
+      else if (RegVT == MVT::i64)
+        RC = &AArch64::GPR64RegClass;
+      else if (RegVT == MVT::f16)
+        RC = &AArch64::FPR16RegClass;
+      else if (RegVT == MVT::f32)
+        RC = &AArch64::FPR32RegClass;
+      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
+        RC = &AArch64::FPR64RegClass;
+      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
+        RC = &AArch64::FPR128RegClass;
+      else
+        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+      // Transform the arguments in physical registers into virtual ones.
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+      // Convert the register value to the value type where the location info
+      // asks for it. Only BCvt needs a node here; extended locations are left
+      // untouched (see the note in the extension cases).
+      switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::BCvt:
+        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::AExt:
+      case CCValAssign::SExt:
+      case CCValAssign::ZExt:
+        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
+        // nodes after our lowering.
+        assert(RegVT == Ins[i].VT && "incorrect register location selected");
+        break;
+      }
+
+      InVals.push_back(ArgValue);
+
+    } else { // !VA.isRegLoc(): the argument lives on the stack.
+      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
+      unsigned ArgOffset = VA.getLocMemOffset();
+      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
+
+      // On big-endian targets a value smaller than 8 bytes is read from the
+      // high-address end of its 8-byte slot, unless it is part of a
+      // consecutive-register block.
+      uint32_t BEAlign = 0;
+      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
+          !Ins[i].Flags.isInConsecutiveRegs())
+        BEAlign = 8 - ArgSize;
+
+      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
+
+      // Create load nodes to retrieve arguments from the stack.
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+      SDValue ArgValue;
+
+      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
+      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+      MVT MemVT = VA.getValVT();
+
+      switch (VA.getLocInfo()) {
+      default:
+        break;
+      case CCValAssign::BCvt:
+        MemVT = VA.getLocVT();
+        break;
+      case CCValAssign::SExt:
+        ExtType = ISD::SEXTLOAD;
+        break;
+      case CCValAssign::ZExt:
+        ExtType = ISD::ZEXTLOAD;
+        break;
+      case CCValAssign::AExt:
+        ExtType = ISD::EXTLOAD;
+        break;
+      }
+
+      ArgValue = DAG.getExtLoad(
+          ExtType, DL, VA.getLocVT(), Chain, FIN,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+          MemVT);
+
+      InVals.push_back(ArgValue);
+    }
+  }
+
+  // varargs
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  if (isVarArg) {
+    if (!Subtarget->isTargetDarwin()) {
+      // The AAPCS variadic function ABI is identical to the non-variadic
+      // one. As a result there may be more arguments in registers and we should
+      // save them for future reference.
+      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
+    }
+
+    // This will point to the next argument passed via stack.
+    unsigned StackOffset = CCInfo.getNextStackOffset();
+    // We currently pass all varargs at 8-byte alignment.
+    StackOffset = ((StackOffset + 7) & ~7);
+    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
+  }
+
+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
+    // This is a non-standard ABI so by fiat I say we're allowed to make full
+    // use of the stack area to be popped, which must be aligned to 16 bytes in
+    // any case:
+    StackArgSize = alignTo(StackArgSize, 16);
+
+    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
+    // a multiple of 16.
+    FuncInfo->setArgumentStackToRestore(StackArgSize);
+
+    // This realignment carries over to the available bytes below. Our own
+    // callers will guarantee the space is free by giving an aligned value to
+    // CALLSEQ_START.
+  }
+  // Even if we're not expected to free up the space, it's useful to know how
+  // much is there while considering tail calls (because we can reuse it).
+  FuncInfo->setBytesInStackArgArea(StackArgSize);
+
+  return Chain;
+}
+
+/// Spill the argument registers that \p CCInfo has not allocated so that a
+/// variadic function can read them later.
+///
+/// X0-X7 are stored 8 bytes apart into one stack object and, when the
+/// subtarget has FP/ARMv8 support, Q0-Q7 are stored 16 bytes apart into
+/// another. The frame indices and sizes of these areas are recorded in
+/// AArch64FunctionInfo. If any store was emitted, \p Chain is replaced by a
+/// TokenFactor over all of them.
+void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
+                                                SelectionDAG &DAG,
+                                                const SDLoc &DL,
+                                                SDValue &Chain) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Stores emitted for both register files; merged into Chain at the end.
+  SmallVector<SDValue, 8> SpillStores;
+
+  // ---- General-purpose argument registers --------------------------------
+  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
+                                          AArch64::X3, AArch64::X4, AArch64::X5,
+                                          AArch64::X6, AArch64::X7 };
+  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
+  unsigned FirstFreeGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
+  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstFreeGPR);
+
+  int GPRIdx = 0;
+  if (GPRSaveSize != 0) {
+    GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
+
+    // Address of the slot the next register is written to.
+    SDValue SlotAddr = DAG.getFrameIndex(GPRIdx, PtrVT);
+
+    for (unsigned RegNo = FirstFreeGPR; RegNo < NumGPRArgRegs; ++RegNo) {
+      unsigned LiveInReg =
+          MF.addLiveIn(GPRArgRegs[RegNo], &AArch64::GPR64RegClass);
+      SDValue RegVal = DAG.getCopyFromReg(Chain, DL, LiveInReg, MVT::i64);
+      SDValue Spill = DAG.getStore(
+          RegVal.getValue(1), DL, RegVal, SlotAddr,
+          MachinePointerInfo::getStack(DAG.getMachineFunction(), RegNo * 8));
+      SpillStores.push_back(Spill);
+      SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, SlotAddr,
+                             DAG.getConstant(8, DL, PtrVT));
+    }
+  }
+  FuncInfo->setVarArgsGPRIndex(GPRIdx);
+  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
+
+  // ---- FP/SIMD argument registers ----------------------------------------
+  if (Subtarget->hasFPARMv8()) {
+    static const MCPhysReg FPRArgRegs[] = {
+        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
+        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
+    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
+    unsigned FirstFreeFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
+    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstFreeFPR);
+
+    int FPRIdx = 0;
+    if (FPRSaveSize != 0) {
+      FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
+
+      SDValue SlotAddr = DAG.getFrameIndex(FPRIdx, PtrVT);
+
+      for (unsigned RegNo = FirstFreeFPR; RegNo < NumFPRArgRegs; ++RegNo) {
+        unsigned LiveInReg =
+            MF.addLiveIn(FPRArgRegs[RegNo], &AArch64::FPR128RegClass);
+        SDValue RegVal = DAG.getCopyFromReg(Chain, DL, LiveInReg, MVT::f128);
+
+        SDValue Spill = DAG.getStore(
+            RegVal.getValue(1), DL, RegVal, SlotAddr,
+            MachinePointerInfo::getStack(DAG.getMachineFunction(), RegNo * 16));
+        SpillStores.push_back(Spill);
+        SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, SlotAddr,
+                               DAG.getConstant(16, DL, PtrVT));
+      }
+    }
+    FuncInfo->setVarArgsFPRIndex(FPRIdx);
+    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
+  }
+
+  // Tie all spills together so later users of Chain are ordered after them.
+  if (SpillStores.empty())
+    return;
+  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SpillStores);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+/// One SDValue per entry of \p Ins is appended to \p InVals. Each value is
+/// read with CopyFromReg (glued through \p InFlag) and bitcast to its value
+/// type for BCvt locations. When \p isThisReturn is set, the first result is
+/// not copied from a register; \p ThisVal is forwarded instead.
+///
+/// \returns the chain after the last register copy.
+SDValue AArch64TargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+    SDValue ThisVal) const {
+  // Use the shared helper so the return convention is chosen in one place.
+  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    // RVLocs is not modified inside the loop, so a reference avoids copying
+    // the assignment record on every iteration.
+    const CCValAssign &VA = RVLocs[i];
+
+    // Pass 'this' value directly from the argument to return value, to avoid
+    // reg unit interference
+    if (i == 0 && isThisReturn) {
+      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
+             "unexpected return calling convention register assignment");
+      InVals.push_back(ThisVal);
+      continue;
+    }
+
+    SDValue Val =
+        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+    // Thread the chain and glue results into the next copy.
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+/// Tell whether tail-call optimisation can be guaranteed for calling
+/// convention \p CC. Only CallingConv::Fast qualifies.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::Fast:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Tell whether a call using calling convention \p CC may ever be turned into
+/// a tail call: true for C, PreserveMost and Swift, and for any convention
+/// accepted by canGuaranteeTCO.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  if (CC == CallingConv::C || CC == CallingConv::PreserveMost ||
+      CC == CallingConv::Swift)
+    return true;
+  return canGuaranteeTCO(CC);
+}
+
+/// Decide whether the call described by the parameters can be emitted as a
+/// tail call from the current function.
+///
+/// The checks are applied in order and the first failing one rejects the
+/// call: the callee's convention must allow tail calls, the caller must have
+/// no byval parameters, and (under GuaranteedTailCallOpt) the convention must
+/// be one with guaranteed TCO that matches the caller's. Otherwise the
+/// sibling-call rules below apply.
+bool AArch64TargetLowering::isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  if (!mayTailCallThisCC(CalleeCC))
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *Caller = MF.getFunction();
+  CallingConv::ID CallerCC = Caller->getCallingConv();
+  bool SameCC = CallerCC == CalleeCC;
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call. Working around this *is* possible (see
+  // X86) but less efficient and uglier in LowerCall.
+  for (Function::const_arg_iterator ArgIt = Caller->arg_begin(),
+                                    ArgEnd = Caller->arg_end();
+       ArgIt != ArgEnd; ++ArgIt) {
+    if (ArgIt->hasByValAttr())
+      return false;
+  }
+
+  if (getTargetMachine().Options.GuaranteedTailCallOpt)
+    return canGuaranteeTCO(CalleeCC) && SameCC;
+
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on AArch64 when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *Target = GA->getGlobal();
+    const Triple &TT = getTargetMachine().getTargetTriple();
+    const bool TripleForbidsWeakTailCall =
+        !TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO();
+    if (Target->hasExternalWeakLinkage() && TripleForbidsWeakTailCall)
+      return false;
+  }
+
+  // Now we search for cases where we can use a tail call without changing the
+  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
+  // concept.
+
+  // I want anyone implementing a new calling convention to think long and hard
+  // about this assert.
+  assert((!isVarArg || CalleeCC == CallingConv::C) &&
+         "Unexpected variadic calling convention");
+
+  LLVMContext &Ctx = *DAG.getContext();
+  if (isVarArg && !Outs.empty()) {
+    // At least two cases here: if caller is fastcc then we can't have any
+    // memory arguments (we'd be expected to clean up the stack afterwards). If
+    // caller is C then we could potentially use its argument area.
+
+    // FIXME: for now we take the most conservative of these in both cases:
+    // disallow all variadic memory operands.
+    SmallVector<CCValAssign, 16> VarArgLocs;
+    CCState VarArgCCInfo(CalleeCC, isVarArg, MF, VarArgLocs, Ctx);
+
+    VarArgCCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
+    for (const CCValAssign &Loc : VarArgLocs) {
+      if (!Loc.isRegLoc())
+        return false;
+    }
+  }
+
+  // Check that the call results are passed in the same way.
+  const bool ResultsMatch = CCState::resultsCompatible(
+      CalleeCC, CallerCC, MF, Ctx, Ins, CCAssignFnForCall(CalleeCC, isVarArg),
+      CCAssignFnForCall(CallerCC, isVarArg));
+  if (!ResultsMatch)
+    return false;
+
+  // The callee has to preserve all registers the caller needs to preserve.
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+  if (!SameCC) {
+    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+      return false;
+  }
+
+  // Nothing more to check if the callee is taking no arguments
+  if (Outs.empty())
+    return true;
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, Ctx);
+
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+
+  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+  // If the stack arguments for this call do not fit into our own save area then
+  // the call cannot be made tail.
+  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+    return false;
+
+  // The remaining check is delegated to parametersInCSRMatch, which is given
+  // the caller's preserved-register mask and the outgoing argument values.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
+}
+
+/// Build a TokenFactor that orders \p Chain after every load of an incoming
+/// stack argument overlapping the fixed object \p ClobberedFI.
+///
+/// The loads considered are the users of the DAG entry node whose base
+/// pointer is a frame index below zero; a load is included when the byte
+/// range of its object intersects the byte range of \p ClobberedFI.
+SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
+                                                   SelectionDAG &DAG,
+                                                   MachineFrameInfo &MFI,
+                                                   int ClobberedFI) const {
+  SmallVector<SDValue, 8> Chains;
+  // Inclusive byte range of the object that is about to be overwritten.
+  int64_t ClobberFirst = MFI.getObjectOffset(ClobberedFI);
+  int64_t ClobberLast = ClobberFirst + MFI.getObjectSize(ClobberedFI) - 1;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  Chains.push_back(Chain);
+
+  // Add the chain result of each overlapping stack-argument load.
+  SDNode *Entry = DAG.getEntryNode().getNode();
+  for (SDNode::use_iterator UI = Entry->use_begin(), UE = Entry->use_end();
+       UI != UE; ++UI) {
+    LoadSDNode *Load = dyn_cast<LoadSDNode>(*UI);
+    if (!Load)
+      continue;
+
+    FrameIndexSDNode *Slot = dyn_cast<FrameIndexSDNode>(Load->getBasePtr());
+    if (!Slot || Slot->getIndex() >= 0)
+      continue;
+
+    // Inclusive byte range of the object this load reads from.
+    int64_t LoadFirst = MFI.getObjectOffset(Slot->getIndex());
+    int64_t LoadLast = LoadFirst;
+    LoadLast += MFI.getObjectSize(Slot->getIndex()) - 1;
+
+    const bool ClobberStartsInsideLoad =
+        LoadFirst <= ClobberFirst && ClobberFirst <= LoadLast;
+    const bool LoadStartsInsideClobber =
+        ClobberFirst <= LoadFirst && LoadFirst <= ClobberLast;
+    if (ClobberStartsInsideLoad || LoadStartsInsideClobber)
+      Chains.push_back(SDValue(Load, 1));
+  }
+
+  // Build a tokenfactor for all the chains.
+  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, Chains);
+}
+
+/// Tell whether a callee using convention \p CallCC pops its own stack
+/// arguments: only for CallingConv::Fast, and only when \p TailCallOpt is set.
+bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
+                                                   bool TailCallOpt) const {
+  if (!TailCallOpt)
+    return false;
+  return CallCC == CallingConv::Fast;
+}
+
+/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
+/// and add input and output parameter nodes.
+SDValue
+AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool IsThisReturn = false;
+
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+ bool IsSibCall = false;
+
+ if (IsTailCall) {
+ // Check if it's really possible to do a tail call.
+ IsTailCall = isEligibleForTailCallOptimization(
+ Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
+ if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+
+ // A sibling call is one where we're under the usual C ABI and not planning
+ // to change that but can still do a tail call:
+ if (!TailCallOpt && IsTailCall)
+ IsSibCall = true;
+
+ if (IsTailCall)
+ ++NumTailCalls;
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ if (IsVarArg) {
+ // Handle fixed and variable vector arguments differently.
+ // Variable vector arguments always go into memory.
+ unsigned NumArgs = Outs.size();
+
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ArgVT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
+ /*IsVarArg=*/ !Outs[i].IsFixed);
+ bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+ } else {
+ // At this point, Outs[].VT may already be promoted to i32. To correctly
+ // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+ // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+ // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
+ // we use a special version of AnalyzeCallOperands to pass in ValVT and
+ // LocVT.
+ unsigned NumArgs = Outs.size();
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ValVT = Outs[i].VT;
+ // Get type of the original argument.
+ EVT ActualVT = getValueType(DAG.getDataLayout(),
+ CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
+ /*AllowUnknown*/ true);
+ MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+ ValVT = MVT::i8;
+ else if (ActualMVT == MVT::i16)
+ ValVT = MVT::i16;
+
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+ }
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ if (IsSibCall) {
+ // Since we're not changing the ABI to make this a tail call, the memory
+ // operands are already available in the caller's incoming argument space.
+ NumBytes = 0;
+ }
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int FPDiff = 0;
+
+ if (IsTailCall && !IsSibCall) {
+ unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+
+ // Since callee will pop argument stack as a tail call, we must keep the
+ // popped size 16-byte aligned.
+ NumBytes = alignTo(NumBytes, 16);
+
+ // FPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // can actually shrink the stack.
+ FPDiff = NumReusableBytes - NumBytes;
+
+ // The stack pointer must be 16-byte aligned at all times it's used for a
+ // memory operation, which in practice means at *all* times and in
+ // particular across call boundaries. Therefore our own arguments started at
+ // a 16-byte aligned SP and the delta applied for the tail call should
+ // satisfy the same constraint.
+ assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+ }
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!IsSibCall)
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL,
+ true),
+ DL);
+
+ SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
+ getPointerTy(DAG.getDataLayout()));
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ if (Outs[realArgIdx].ArgVT == MVT::i1) {
+ // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
+ }
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::FPExt:
+ Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
+ assert(VA.getLocVT() == MVT::i64 &&
+ "unexpected calling convention register assignment");
+ assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
+ "unexpected use of 'returned'");
+ IsThisReturn = true;
+ }
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+
+ // FIXME: This works on big-endian for composite byvals, which are the
+ // common case. It should also work for fundamental types too.
+ uint32_t BEAlign = 0;
+ unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+ : VA.getValVT().getSizeInBits();
+ OpSize = (OpSize + 7) / 8;
+ if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
+ !Flags.isInConsecutiveRegs()) {
+ if (OpSize < 8)
+ BEAlign = 8 - OpSize;
+ }
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ int32_t Offset = LocMemOffset + BEAlign;
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
+ PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+
+ if (IsTailCall) {
+ Offset = Offset + FPDiff;
+ int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+
+ DstAddr = DAG.getFrameIndex(FI, PtrVT);
+ DstInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+ // Make sure any stack arguments overlapping with where we're storing
+ // are loaded before this eventual operation. Otherwise they'll be
+ // clobbered.
+ Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+ } else {
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
+
+ DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+ DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
+ LocMemOffset);
+ }
+
+ if (Outs[i].Flags.isByVal()) {
+ SDValue SizeNode =
+ DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
+ SDValue Cpy = DAG.getMemcpy(
+ Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ /*isVol = */ false, /*AlwaysInline = */ false,
+ /*isTailCall = */ false,
+ DstInfo, MachinePointerInfo());
+
+ MemOpChains.push_back(Cpy);
+ } else {
+ // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
+ // promoted to a legal register type i32, we should truncate Arg back to
+ // i1/i8/i16.
+ if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
+ VA.getValVT() == MVT::i16)
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
+
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+ MemOpChains.push_back(Store);
+ }
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (auto &RegToPass : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
+ RegToPass.second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+ // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ Subtarget->isTargetMachO()) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ bool InternalLinkage = GV->hasInternalLinkage();
+ if (InternalLinkage)
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
+ else {
+ Callee =
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
+ }
+ } else if (ExternalSymbolSDNode *S =
+ dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const char *Sym = S->getSymbol();
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
+ }
+ } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const char *Sym = S->getSymbol();
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
+ }
+
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call, however in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (IsTailCall && !IsSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+ DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+ }
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (auto &RegToPass : RegsToPass)
+ Ops.push_back(DAG.getRegister(RegToPass.first,
+ RegToPass.second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const uint32_t *Mask;
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ if (IsThisReturn) {
+ // For 'this' returns, use the X0-preserving mask if applicable
+ Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
+ if (!Mask) {
+ IsThisReturn = false;
+ Mask = TRI->getCallPreservedMask(MF, CallConv);
+ }
+ } else
+ Mask = TRI->getCallPreservedMask(MF, CallConv);
+
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // If we're doing a tail call, use a TC_RETURN here rather than an
+ // actual call instruction.
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+ }
+
+ // Returns a chain and a flag for retval copy to use.
+ Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ uint64_t CalleePopBytes =
+ DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+ DAG.getIntPtrConstant(CalleePopBytes, DL, true),
+ InFlag, DL);
+ if (!Ins.empty())
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue());
+}
+
+/// Check whether the return values described by \p Outs can be assigned
+/// locations by the return convention selected for \p CallConv.
+///
+/// CallingConv::WebKit_JS selects RetCC_AArch64_WebKit_JS; every other calling
+/// convention selects RetCC_AArch64_AAPCS. The result is whatever
+/// CCState::CheckReturn reports for that assignment function.
+bool AArch64TargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
+  if (CallConv == CallingConv::WebKit_JS)
+    AssignFn = RetCC_AArch64_WebKit_JS;
+
+  SmallVector<CCValAssign, 16> Locs;
+  CCState State(CallConv, isVarArg, MF, Locs, Context);
+  return State.CheckReturn(Outs, AssignFn);
+}
+
+/// Lower a function return.
+///
+/// Each value in \p OutVals is copied into the register that the return
+/// convention (RetCC_AArch64_WebKit_JS for WebKit_JS, RetCC_AArch64_AAPCS
+/// otherwise) assigned to it. The copies are glued together and the function
+/// returns an AArch64ISD::RET_FLAG node whose operands are the updated chain,
+/// the registers that were written, any registers reported by
+/// getCalleeSavedRegsViaCopy(), and the final glue value if there is one.
+SDValue
+AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                   bool isVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   const SDLoc &DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
+  if (CallConv == CallingConv::WebKit_JS)
+    AssignFn = RetCC_AArch64_WebKit_JS;
+
+  SmallVector<CCValAssign, 16> Locs;
+  CCState State(CallConv, isVarArg, MF, Locs, *DAG.getContext());
+  State.AnalyzeReturn(Outs, AssignFn);
+
+  // Operand 0 is a placeholder for the chain; it is overwritten below once
+  // every register copy has been threaded onto the chain.
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  SDValue Glue;
+  for (unsigned Idx = 0, NumLocs = Locs.size(); Idx != NumLocs; ++Idx) {
+    CCValAssign &Loc = Locs[Idx];
+    assert(Loc.isRegLoc() && "Can only return in registers!");
+    SDValue Val = OutVals[Idx];
+
+    const auto Info = Loc.getLocInfo();
+    if (Info == CCValAssign::Full) {
+      if (Outs[Idx].ArgVT == MVT::i1) {
+        // The producer of an i1 must zero-extend it to i8 under AAPCS. On
+        // Darwin (which uses "zeroext i1") this is strictly redundant, but it
+        // will be optimised out before ISel.
+        Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Val);
+        Val = DAG.getNode(ISD::ZERO_EXTEND, DL, Loc.getLocVT(), Val);
+      }
+    } else if (Info == CCValAssign::BCvt) {
+      Val = DAG.getNode(ISD::BITCAST, DL, Loc.getLocVT(), Val);
+    } else {
+      llvm_unreachable("Unknown loc info!");
+    }
+
+    // Copy the value into its return register and keep the copies glued.
+    Chain = DAG.getCopyToReg(Chain, DL, Loc.getLocReg(), Val, Glue);
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
+  }
+
+  // The list returned here (when non-null) is terminated by a zero register.
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  if (const MCPhysReg *CSR = TRI->getCalleeSavedRegsViaCopy(&MF)) {
+    for (; *CSR; ++CSR) {
+      if (AArch64::GPR64RegClass.contains(*CSR))
+        RetOps.push_back(DAG.getRegister(*CSR, MVT::i64));
+      else if (AArch64::FPR64RegClass.contains(*CSR))
+        RetOps.push_back(DAG.getRegister(*CSR, MVT::getFloatingPointVT(64)));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
+
+  RetOps[0] = Chain; // Update chain.
+
+  // Add the glue if any copy produced it.
+  if (Glue.getNode())
+    RetOps.push_back(Glue);
+
+  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// Lower a GlobalAddress node (which must carry a zero offset) to a target
+/// address-materialisation sequence.
+///
+/// Based on the flags returned by ClassifyGlobalReference and on the code
+/// model, the result is one of:
+///  - a LOADgot of the global, when the flags include MO_GOT;
+///  - a WrapperLarge of the MO_G3/MO_G2/MO_G1/MO_G0 pieces, for the large code
+///    model;
+///  - an ADRP of the MO_PAGE piece combined via ADDlow with the MO_PAGEOFF
+///    piece, otherwise.
+SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GN->getGlobal();
+  unsigned char OpFlags =
+      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+
+  assert(GN->getOffset() == 0 && "unexpected offset in global node");
+
+  // Every piece built below is the same global with different target flags.
+  auto TargetAddr = [&](unsigned char Flags) {
+    return DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, Flags);
+  };
+
+  // This also catches the large code model case for Darwin.
+  if ((OpFlags & AArch64II::MO_GOT) != 0) {
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes instead of using a wrapper node.
+    return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TargetAddr(OpFlags));
+  }
+
+  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(AArch64ISD::WrapperLarge, DL, PtrVT,
+                       TargetAddr(AArch64II::MO_G3),
+                       TargetAddr(AArch64II::MO_G2 | MO_NC),
+                       TargetAddr(AArch64II::MO_G1 | MO_NC),
+                       TargetAddr(AArch64II::MO_G0 | MO_NC));
+  }
+
+  // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
+  // the only correct model on Darwin.
+  SDValue Hi = TargetAddr(OpFlags | AArch64II::MO_PAGE);
+  SDValue Lo =
+      TargetAddr(OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+}
+
+/// \brief Convert a TLS address reference into the correct sequence of loads
+/// and calls to compute the variable's address (for Darwin, currently) and
+/// return an SDValue containing the final node.
+///
+/// Darwin has a single TLS scheme, so it has to cope with the fully general
+/// situation in the worst case, namely:
+///    + an "extern __thread" declaration,
+///    + defined in a possibly unknown dynamic library.
+///
+/// Each __thread variable has a [3 x i64] descriptor holding the information
+/// the runtime uses to calculate the address. The compiler only needs to know
+/// about the first xword: a function pointer that must be called with the
+/// address of the entire descriptor in "x0".
+///
+/// Because the descriptor may live in a different unit, even the descriptor
+/// has to be reached through an indirect load in general. The "ideal" code
+/// sequence is:
+///     adrp x0, _var@TLVPPAGE
+///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
+///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
+///                                      ; the function pointer
+///     blr x1                           ; Uses descriptor address in x0
+///                                      ; Address of _var is now in x0.
+///
+/// If the address of _var's descriptor *is* known to the linker, then it can
+/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
+/// a slight efficiency gain.
+SDValue
+AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
+
+  SDLoc DL(Op);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MVT PtrVT = getPointerTy(DAG.getDataLayout());
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+  // Address of the variable's descriptor, obtained through a LOADgot of the
+  // MO_TLS-flagged global.
+  SDValue DescAddr = DAG.getNode(
+      AArch64ISD::LOADgot, DL, PtrVT,
+      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS));
+
+  // The descriptor's first entry is the function pointer we must call to
+  // obtain the address of the variable.
+  SDValue Resolver = DAG.getLoad(
+      MVT::i64, DL, DAG.getEntryNode(), DescAddr,
+      MachinePointerInfo::getGOT(MF),
+      /* Alignment = */ 8,
+      MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
+          MachineMemOperand::MODereferenceable);
+  SDValue Chain = Resolver.getValue(1);
+
+  MF.getFrameInfo().setAdjustsStack(true);
+
+  // TLS calls preserve all registers except those that absolutely must be
+  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+  // silly).
+  const uint32_t *Mask =
+      Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
+
+  // Finally, make the call. This is a degenerate version of a normal AArch64
+  // call node: x0 takes the address of the descriptor, and returns the address
+  // of the variable in this thread.
+  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
+  SDValue ArgGlue = Chain.getValue(1);
+  SDValue Call =
+      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+                  Chain, Resolver, DAG.getRegister(AArch64::X0, MVT::i64),
+                  DAG.getRegisterMask(Mask), ArgGlue);
+  return DAG.getCopyFromReg(Call, DL, AArch64::X0, PtrVT, Call.getValue(1));
+}
+
+/// Thread-local variables accessed under the general-dynamic or local-dynamic
+/// model are resolved with a "TLS-descriptor" call. The variable has a
+/// descriptor, reachable through a PC-relative ADRP, whose first entry is a
+/// function pointer that carries out the resolution.
+///
+/// The sequence is:
+///    adrp  x0, :tlsdesc:var
+///    ldr   x1, [x0, #:tlsdesc_lo12:var]
+///    add   x0, x0, #:tlsdesc_lo12:var
+///    .tlsdesccall var
+///    blr   x1
+///    (TPIDR_EL0 offset now in x0)
+///
+/// This sequence has to be emitted unscheduled so that the linker can
+/// optimize/relax it. For that reason it is represented by a single
+/// pseudo-instruction (TLSDESC_CALLSEQ) that is expanded really late in the
+/// compilation flow, which guarantees the sequence comes out exactly as above.
+///
+/// \returns the value copied out of X0 after the TLSDESC_CALLSEQ node.
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
+                                                      const SDLoc &DL,
+                                                      SelectionDAG &DAG) const {
+  // The pseudo yields a chain (result 0) and glue (result 1).
+  SDValue CallSeq = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL,
+                                DAG.getVTList(MVT::Other, MVT::Glue),
+                                {DAG.getEntryNode(), SymAddr});
+
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  return DAG.getCopyFromReg(CallSeq, DL, AArch64::X0, PtrVT,
+                            CallSeq.getValue(1));
+}
+
+/// Lower a thread-local GlobalAddress node for ELF targets.
+///
+/// With emulated TLS enabled the node is handed to LowerToTLSEmulatedModel.
+/// Otherwise the address is built from the THREAD_POINTER node according to
+/// the variable's TLS model:
+///  - local-exec: two ADDXri instructions add the MO_HI12 and MO_PAGEOFF
+///    pieces of the variable directly to the thread pointer;
+///  - initial-exec: the offset is a LOADgot of the MO_TLS-flagged global;
+///  - local-dynamic: a TLS-descriptor call on _TLS_MODULE_BASE_ followed by
+///    two ADDXri instructions for the variable's own pieces;
+///  - general-dynamic: a TLS-descriptor call on the variable itself.
+/// Local-dynamic is downgraded to general-dynamic unless
+/// EnableAArch64ELFLocalDynamicTLSGeneration is set.
+SDValue
+AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
+  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+         "ELF TLS only supported in small memory model");
+  // Different choices can be made for the maximum size of the TLS area for a
+  // module. For the small address model, the default TLS size is 16MiB and the
+  // maximum TLS size is 4GiB.
+  // FIXME: add -mtls-size command line option and make it control the 16MiB
+  // vs. 4GiB code sequence generation.
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GA->getGlobal();
+
+  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(GA, DAG);
+
+  if (Model == TLSModel::LocalDynamic &&
+      !EnableAArch64ELFLocalDynamicTLSGeneration)
+    Model = TLSModel::GeneralDynamic;
+
+  SDLoc DL(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
+
+  // Target flags for the two relocated pieces of a variable's offset.
+  const unsigned char HiFlags = AArch64II::MO_TLS | AArch64II::MO_HI12;
+  const unsigned char LoFlags =
+      AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+
+  // Emit "ADDXri Base, Part, 0" and return its result.
+  auto AddOffsetPart = [&](SDValue Base, SDValue Part) {
+    return SDValue(
+        DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Base, Part,
+                           DAG.getTargetConstant(0, DL, MVT::i32)),
+        0);
+  };
+
+  if (Model == TLSModel::LocalExec) {
+    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, HiFlags);
+    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
+    return AddOffsetPart(AddOffsetPart(ThreadBase, HiVar), LoVar);
+  }
+
+  SDValue TPOff;
+  if (Model == TLSModel::InitialExec) {
+    TPOff = DAG.getNode(
+        AArch64ISD::LOADgot, DL, PtrVT,
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS));
+  } else if (Model == TLSModel::LocalDynamic) {
+    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
+    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
+    // the beginning of the module's TLS region, followed by a DTPREL offset
+    // calculation.
+
+    // These accesses will need deduplicating if there's more than one.
+    DAG.getMachineFunction()
+        .getInfo<AArch64FunctionInfo>()
+        ->incNumLocalDynamicTLSAccesses();
+
+    // The call needs a relocation too for linker relaxation. It doesn't make
+    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+    // the address.
+    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
+                                                  AArch64II::MO_TLS);
+
+    // Now we can calculate the offset from TPIDR_EL0 to this module's
+    // thread-local area.
+    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
+
+    // Now use :dtprel_whatever: operations to calculate this variable's offset
+    // in its thread-storage area.
+    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, HiFlags);
+    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, LoFlags);
+    TPOff = AddOffsetPart(AddOffsetPart(TPOff, HiVar), LoVar);
+  } else if (Model == TLSModel::GeneralDynamic) {
+    // The call needs a relocation too for linker relaxation. It doesn't make
+    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+    // the address.
+    SDValue SymAddr =
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+
+    // Finally we can make a call to calculate the offset from tpidr_el0.
+    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
+  } else
+    llvm_unreachable("Unsupported ELF TLS access model");
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
+}
+
+/// Dispatch thread-local address lowering to the Darwin or the ELF
+/// implementation, depending on the subtarget. Any other platform is
+/// considered unreachable.
+SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  if (Subtarget->isTargetDarwin())
+    return LowerDarwinGlobalTLSAddress(Op, DAG);
+
+  if (!Subtarget->isTargetELF())
+    llvm_unreachable("Unexpected platform trying to use TLS");
+
+  return LowerELFGlobalTLSAddress(Op, DAG);
+}
+/// Lower a BR_CC node, whose operands are (chain, condition code, LHS, RHS,
+/// destination), to AArch64 branch nodes.
+///
+/// Cases handled, in order:
+///  - f128 operands are first softened via softenSetCCOperands;
+///  - a branch on the overflow result of an {s|u}{add|sub|mul}.with.overflow
+///    node becomes a BRCOND on the flags of the arithmetic itself (or an empty
+///    SDValue when the arithmetic type is not legal);
+///  - integer compares against 0 or -1 become CBZ/CBNZ/TBZ/TBNZ where possible,
+///    otherwise a compare plus BRCOND;
+///  - f32/f64 compares become one BRCOND, or two chained BRCONDs when the
+///    condition needs two AArch64 condition codes.
+SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+
+  // Handle f128 first, since lowering it will result in comparing the return
+  // value of a libcall against zero, which is just what the rest of LowerBR_CC
+  // is expecting to deal with.
+  if (LHS.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, we need to compare the result
+    // against zero to select between true and false values.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, dl, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
+  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+  // instruction.
+  const unsigned Opc = LHS.getOpcode();
+  const bool IsOverflowOp = Opc == ISD::SADDO || Opc == ISD::UADDO ||
+                            Opc == ISD::SSUBO || Opc == ISD::USUBO ||
+                            Opc == ISD::SMULO || Opc == ISD::UMULO;
+  if (LHS.getResNo() == 1 && isOneConstant(RHS) && IsOverflowOp) {
+    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+           "Unexpected condition code.");
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+      return SDValue();
+
+    // The actual operation with overflow check.
+    AArch64CC::CondCode OFCC;
+    SDValue Value, Overflow;
+    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
+
+    if (CC == ISD::SETNE)
+      OFCC = getInvertedCondCode(OFCC);
+    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
+
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+                       Overflow);
+  }
+
+  if (LHS.getValueType().isInteger()) {
+    assert((LHS.getValueType() == RHS.getValueType()) &&
+           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
+    const bool RHSIsZero = RHSC && RHSC->getZExtValue() == 0;
+    const bool RHSIsMinusOne = RHSC && RHSC->getSExtValue() == -1;
+    const bool LHSIsAnd = LHS.getOpcode() == ISD::AND;
+
+    // If the RHS of the comparison is zero, we can potentially fold this
+    // to a specialized branch.
+    if (RHSIsZero && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+      const bool BranchIfZero = CC == ISD::SETEQ;
+
+      // See if we can use a TBZ/TBNZ to fold in an AND as well.
+      // TBZ has a smaller branch displacement than CBZ.  If the offset is
+      // out of bounds, a late MI-layer pass rewrites branches.
+      // 403.gcc is an example that hits this case.
+      if (LHSIsAnd && isa<ConstantSDNode>(LHS.getOperand(1)) &&
+          isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+        SDValue Test = LHS.getOperand(0);
+        uint64_t Mask = LHS.getConstantOperandVal(1);
+        return DAG.getNode(BranchIfZero ? AArch64ISD::TBZ : AArch64ISD::TBNZ,
+                           dl, MVT::Other, Chain, Test,
+                           DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
+                           Dest);
+      }
+
+      return DAG.getNode(BranchIfZero ? AArch64ISD::CBZ : AArch64ISD::CBNZ, dl,
+                         MVT::Other, Chain, LHS, Dest);
+    }
+
+    // "x < 0" branches when the sign bit is set (TBNZ) and "x > -1" branches
+    // when it is clear (TBZ).
+    // Don't combine AND since emitComparison converts the AND to an ANDS
+    // (a.k.a. TST) and the test in the test bit and branch instruction
+    // becomes redundant.  This would also increase register pressure.
+    const bool BranchIfSignSet = RHSIsZero && CC == ISD::SETLT;
+    const bool BranchIfSignClear = RHSIsMinusOne && CC == ISD::SETGT;
+    if (!LHSIsAnd && (BranchIfSignSet || BranchIfSignClear)) {
+      uint64_t SignBit = LHS.getValueSizeInBits() - 1;
+      return DAG.getNode(BranchIfSignSet ? AArch64ISD::TBNZ : AArch64ISD::TBZ,
+                         dl, MVT::Other, Chain, LHS,
+                         DAG.getConstant(SignBit, dl, MVT::i64), Dest);
+    }
+
+    // General case: materialise the compare and branch on its flags.
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+                       Cmp);
+  }
+
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two branches to implement.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+
+  SDValue FirstCC = DAG.getConstant(CC1, dl, MVT::i32);
+  SDValue FirstBr = DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain,
+                                Dest, FirstCC, Cmp);
+  if (CC2 == AArch64CC::AL)
+    return FirstBr;
+
+  // The second branch is chained after the first and tests the same compare.
+  SDValue SecondCC = DAG.getConstant(CC2, dl, MVT::i32);
+  return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, FirstBr, Dest,
+                     SecondCC, Cmp);
+}
+
+/// Lower FCOPYSIGN for f32, v2f32, v4f32, f64 and v2f64 using the AdvSIMD
+/// AArch64ISD::BIT node with a sign-bit mask.
+///
+/// The sign operand is first extended or rounded to the result type. Scalar
+/// operands are placed in the ssub/dsub sub-register of an undefined vector,
+/// vector operands are bitcast to the matching integer vector type, and the
+/// result is extracted or bitcast back to the original type.
+SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue In1 = Op.getOperand(0);
+  SDValue In2 = Op.getOperand(1);
+
+  // Bring the sign operand to the same type as the result.
+  EVT SrcVT = In2.getValueType();
+  if (SrcVT.bitsLT(VT))
+    In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+  else if (SrcVT.bitsGT(VT))
+    In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
+
+  const bool Is32BitElt =
+      VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32;
+  const bool Is64BitElt = VT == MVT::f64 || VT == MVT::v2f64;
+  if (!Is32BitElt && !Is64BitElt)
+    llvm_unreachable("Invalid type for copysign!");
+
+  // Integer vector type the operation is carried out in, and the sub-register
+  // a scalar operand occupies within it.
+  EVT VecVT = MVT::v2i64;
+  if (Is32BitElt)
+    VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
+  const unsigned SubIdx = Is64BitElt ? AArch64::dsub : AArch64::ssub;
+
+  // We want to materialize a mask with the high bit set, but the AdvSIMD
+  // immediate moves cannot materialize that in a single instruction for
+  // 64-bit elements. Instead, materialize zero and then negate it.
+  const uint64_t EltMask = Is64BitElt ? 0 : 0x80000000ULL;
+
+  SDValue VecVal1, VecVal2;
+  if (VT.isVector()) {
+    VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+    VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+  } else {
+    VecVal1 = DAG.getTargetInsertSubreg(SubIdx, DL, VecVT,
+                                        DAG.getUNDEF(VecVT), In1);
+    VecVal2 = DAG.getTargetInsertSubreg(SubIdx, DL, VecVT,
+                                        DAG.getUNDEF(VecVT), In2);
+  }
+
+  SDValue SignMask = DAG.getConstant(EltMask, DL, VecVT);
+
+  // For 64-bit elements the mask built above is the zero vector; negating it
+  // as a floating-point vector produces the mask we actually want.
+  if (Is64BitElt) {
+    SignMask = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMask);
+    SignMask = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMask);
+    SignMask = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMask);
+  }
+
+  SDValue Sel =
+      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, SignMask);
+
+  // Convert back to the type the caller asked for.
+  if (VT.isVector())
+    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
+  return DAG.getTargetExtractSubreg(SubIdx, DL, VT, Sel);
+}
+
+/// Lower a scalar CTPOP through the AdvSIMD unit.
+///
+/// Returns an empty SDValue when the function carries the NoImplicitFloat
+/// attribute or the subtarget has no NEON. Otherwise the operand (zero-extended
+/// to i64 if it is i32) is bitcast to v8i8, a per-byte CTPOP is taken, and the
+/// byte counts are summed with the aarch64_neon_uaddlv intrinsic. The i32 sum
+/// is zero-extended when the result type is i64.
+SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+  const bool NoImplicitFloat =
+      DAG.getMachineFunction().getFunction()->hasFnAttribute(
+          Attribute::NoImplicitFloat);
+  if (NoImplicitFloat || !Subtarget->hasNEON())
+    return SDValue();
+
+  // While there is no integer popcount instruction, it can
+  // be more efficiently lowered to the following sequence that uses
+  // AdvSIMD registers/instructions as long as the copies to/from
+  // the AdvSIMD registers are cheap.
+  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
+  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
+  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
+  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue Src = Op.getOperand(0);
+  if (VT == MVT::i32)
+    Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Src);
+  SDValue Bytes = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Src);
+
+  SDValue ByteCounts = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Bytes);
+  SDValue Sum = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32),
+      ByteCounts);
+
+  if (VT != MVT::i64)
+    return Sum;
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Sum);
+}
+
+/// Lower a SETCC node to AArch64 compare and conditional-select nodes.
+///
+/// Vector results are forwarded to LowerVSETCC. f128 operands are softened
+/// first and, if that already yields a scalar, that scalar is the result.
+/// Integer compares emit the inverted condition with swapped 0/1 operands;
+/// f32/f64 compares do the same when one AArch64 condition code suffices and
+/// otherwise chain two CSELs on the same comparison.
+SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVSETCC(Op, DAG);
+
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  // We chose ZeroOrOneBooleanContents, so use zero and one.
+  SDValue One = DAG.getConstant(1, dl, VT);
+  SDValue Zero = DAG.getConstant(0, dl, VT);
+
+  // Handle f128 first, since one possible outcome is a normal integer
+  // comparison which gets picked up by the next if statement.
+  if (LHS.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, use it.
+    if (!RHS.getNode()) {
+      assert(LHS.getValueType() == VT && "Unexpected setcc expansion!");
+      return LHS;
+    }
+  }
+
+  if (LHS.getValueType().isInteger()) {
+    SDValue CCVal;
+    SDValue Cmp =
+        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
+
+    // The condition was inverted above, so the true and false operands are
+    // passed in reverse order here. This will allow the setcc to be matched
+    // to a single CSINC instruction.
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, Zero, One, CCVal, Cmp);
+  }
+
+  // Now we know we're dealing with FP values.
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
+  // and do the comparison.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+
+  if (CC2 != AArch64CC::AL) {
+    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
+    // totally clean.  Some of them require two CSELs to implement.  As is in
+    // this case, we emit the first CSEL and then emit a second using the output
+    // of the first as the RHS.  We're effectively OR'ing the two CC's together.
+
+    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
+    SDValue FirstCC = DAG.getConstant(CC1, dl, MVT::i32);
+    SDValue FirstSel =
+        DAG.getNode(AArch64ISD::CSEL, dl, VT, One, Zero, FirstCC, Cmp);
+
+    SDValue SecondCC = DAG.getConstant(CC2, dl, MVT::i32);
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, One, FirstSel, SecondCC, Cmp);
+  }
+
+  // A single condition code is enough: recompute it for the inverted
+  // condition and swap the true and false operands, which will allow the
+  // setcc to be matched to a single CSINC instruction.
+  changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
+  SDValue InvCC = DAG.getConstant(CC1, dl, MVT::i32);
+  return DAG.getNode(AArch64ISD::CSEL, dl, VT, Zero, One, InvCC, Cmp);
+}
+
+SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
+ SDValue RHS, SDValue TVal,
+ SDValue FVal, const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ // Handle f128 first, because it will result in a comparison of some RTLIB
+ // call result against zero.
+ if (LHS.getValueType() == MVT::f128) {
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!RHS.getNode()) {
+ RHS = DAG.getConstant(0, dl, LHS.getValueType());
+ CC = ISD::SETNE;
+ }
+ }
+
+ // Also handle f16, for which we need to do a f32 comparison.
+ if (LHS.getValueType() == MVT::f16) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+ }
+
+ // Next, handle integers.
+ if (LHS.getValueType().isInteger()) {
+ assert((LHS.getValueType() == RHS.getValueType()) &&
+ (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+ unsigned Opcode = AArch64ISD::CSEL;
+
+ // If both the TVal and the FVal are constants, see if we can swap them in
+ // order to form a CSINV or CSINC out of them.
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+ if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (TVal.getOpcode() == ISD::XOR) {
+ // If TVal is a NOT we want to swap TVal and FVal so that we can match
+ // with a CSINV rather than a CSEL.
+ if (isAllOnesConstant(TVal.getOperand(1))) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+ } else if (TVal.getOpcode() == ISD::SUB) {
+ // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
+ // that we can match with a CSNEG rather than a CSEL.
+ if (isNullConstant(TVal.getOperand(0))) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+ } else if (CTVal && CFVal) {
+ const int64_t TrueVal = CTVal->getSExtValue();
+ const int64_t FalseVal = CFVal->getSExtValue();
+ bool Swap = false;
+
+ // If both TVal and FVal are constants, see if FVal is the
+ // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
+ // instead of a CSEL in that case.
+ if (TrueVal == ~FalseVal) {
+ Opcode = AArch64ISD::CSINV;
+ } else if (TrueVal == -FalseVal) {
+ Opcode = AArch64ISD::CSNEG;
+ } else if (TVal.getValueType() == MVT::i32) {
+ // If our operands are only 32-bit wide, make sure we use 32-bit
+ // arithmetic for the check whether we can use CSINC. This ensures that
+ // the addition in the check will wrap around properly in case there is
+ // an overflow (which would not be the case if we do the check with
+ // 64-bit arithmetic).
+ const uint32_t TrueVal32 = CTVal->getZExtValue();
+ const uint32_t FalseVal32 = CFVal->getZExtValue();
+
+ if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
+ Opcode = AArch64ISD::CSINC;
+
+ if (TrueVal32 > FalseVal32) {
+ Swap = true;
+ }
+ }
+ // 64-bit check whether we can use CSINC.
+ } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
+ Opcode = AArch64ISD::CSINC;
+
+ if (TrueVal > FalseVal) {
+ Swap = true;
+ }
+ }
+
+ // Swap TVal and FVal if necessary.
+ if (Swap) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ if (Opcode != AArch64ISD::CSEL) {
+ // Drop FVal since we can get its value by simply inverting/negating
+ // TVal.
+ FVal = TVal;
+ }
+ }
+
+ // Avoid materializing a constant when possible by reusing a known value in
+ // a register. However, don't perform this optimization if the known value
+ // is one, zero or negative one in the case of a CSEL. We can always
+ // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
+ // FVal, respectively.
+ ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
+ if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
+ !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
+ // "a != C ? x : a" to avoid materializing C.
+ if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
+ TVal = LHS;
+ else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
+ FVal = LHS;
+ } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
+ assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
+ // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
+ // avoid materializing C.
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
+ Opcode = AArch64ISD::CSINV;
+ TVal = LHS;
+ FVal = DAG.getConstant(0, dl, FVal.getValueType());
+ }
+ }
+
+ SDValue CCVal;
+ SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+ EVT VT = TVal.getValueType();
+ return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
+ }
+
+ // Now we know we're dealing with FP values.
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+ assert(LHS.getValueType() == RHS.getValueType());
+ EVT VT = TVal.getValueType();
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+
+ // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+ // clean. Some of them require two CSELs to implement.
+ AArch64CC::CondCode CC1, CC2;
+ changeFPCCToAArch64CC(CC, CC1, CC2);
+
+ if (DAG.getTarget().Options.UnsafeFPMath) {
+ // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
+ // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
+ ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
+ if (RHSVal && RHSVal->isZero()) {
+ ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
+ ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
+
+ if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
+ CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
+ TVal = LHS;
+ else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
+ CFVal && CFVal->isZero() &&
+ FVal.getValueType() == LHS.getValueType())
+ FVal = LHS;
+ }
+ }
+
+ // Emit first, and possibly only, CSEL.
+ SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
+ SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+ // If we need a second CSEL, emit it, using the output of the first as the
+ // RHS. We're effectively OR'ing the two CC's together.
+ if (CC2 != AArch64CC::AL) {
+ SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
+ return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+ }
+
+ // Otherwise, return the output of the first CSEL.
+ return CS1;
+}
+
+/// Node-based entry point for select-with-condition-code lowering.
+///
+/// Unpacks the operands of \p Op (0 = LHS, 1 = RHS, 2 = value if true,
+/// 3 = value if false, 4 = condition code) and forwards them to the
+/// operand-wise LowerSELECT_CC overload.
+SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  const ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  return LowerSELECT_CC(Cond, Op.getOperand(0), Op.getOperand(1),
+                        Op.getOperand(2), Op.getOperand(3), Loc, DAG);
+}
+
+/// Lower a select whose operands are (condition, true value, false value).
+///
+/// If the condition is the overflow bit (result 1) of an
+/// {s|u}{add|sub|mul}.with.overflow node, the select becomes a CSEL on the
+/// flags returned by getAArch64XALUOOp; an empty SDValue is returned when the
+/// arithmetic result type of that node is not legal. Every other condition is
+/// expressed as a comparison and handed to LowerSELECT_CC.
+SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue Cond = Op->getOperand(0);
+  SDValue IfTrue = Op->getOperand(1);
+  SDValue IfFalse = Op->getOperand(2);
+  SDLoc DL(Op);
+
+  bool IsOverflowOp = false;
+  switch (Cond.getOpcode()) {
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+    IsOverflowOp = true;
+    break;
+  default:
+    break;
+  }
+
+  if (IsOverflowOp && Cond.getResNo() == 1) {
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+      return SDValue();
+
+    AArch64CC::CondCode OFCC;
+    SDValue Value, Overflow;
+    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, Cond.getValue(0), DAG);
+    SDValue OFCCVal = DAG.getConstant(OFCC, DL, MVT::i32);
+
+    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), IfTrue,
+                       IfFalse, OFCCVal, Overflow);
+  }
+
+  // A condition that is not a SETCC is treated as "Cond != 0".
+  if (Cond.getOpcode() != ISD::SETCC)
+    return LowerSELECT_CC(ISD::SETNE, Cond,
+                          DAG.getConstant(0, DL, Cond.getValueType()), IfTrue,
+                          IfFalse, DL, DAG);
+
+  // A SETCC condition already carries the comparison operands and the
+  // condition code, so reuse them directly.
+  return LowerSELECT_CC(cast<CondCodeSDNode>(Cond->getOperand(2))->get(),
+                        Cond.getOperand(0), Cond.getOperand(1), IfTrue, IfFalse,
+                        DL, DAG);
+}
+
+/// Materialise the address of a jump table.
+///
+/// Jump table entries are PC-relative offsets, so the table address needs no
+/// additional tweaking. Under the large code model on non-MachO targets the
+/// address is assembled from four relocated pieces (MO_G3 down to MO_G0)
+/// through a WrapperLarge node; otherwise an ADRP + ADDlow pair is emitted.
+SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  const int TableIdx = cast<JumpTableSDNode>(Op)->getIndex();
+
+  // Build a target jump-table operand carrying the given operand flags.
+  auto TableRef = [&](unsigned char Flags) {
+    return DAG.getTargetJumpTable(TableIdx, PtrVT, Flags);
+  };
+
+  const bool UseLargeSequence =
+      getTargetMachine().getCodeModel() == CodeModel::Large &&
+      !Subtarget->isTargetMachO();
+  if (UseLargeSequence) {
+    const unsigned char NoCheck = AArch64II::MO_NC;
+    return DAG.getNode(AArch64ISD::WrapperLarge, DL, PtrVT,
+                       TableRef(AArch64II::MO_G3),
+                       TableRef(AArch64II::MO_G2 | NoCheck),
+                       TableRef(AArch64II::MO_G1 | NoCheck),
+                       TableRef(AArch64II::MO_G0 | NoCheck));
+  }
+
+  SDValue Page = TableRef(AArch64II::MO_PAGE);
+  SDValue PageOff = TableRef(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+  SDValue PageAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Page);
+  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, PageAddr, PageOff);
+}
+
+/// Materialise the address of a constant pool entry.
+///
+/// Three sequences are possible:
+///  * any code model other than large: ADRP + ADDlow;
+///  * large code model on MachO: a load through the GOT (LOADgot);
+///  * large code model elsewhere: WrapperLarge over the MO_G3..MO_G0 pieces.
+SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  ConstantPoolSDNode *Entry = cast<ConstantPoolSDNode>(Op);
+
+  // Build a target constant-pool operand for this entry with the given
+  // operand flags.
+  auto PoolRef = [&](unsigned char Flags) {
+    return DAG.getTargetConstantPool(Entry->getConstVal(), PtrVT,
+                                     Entry->getAlignment(), Entry->getOffset(),
+                                     Flags);
+  };
+
+  if (getTargetMachine().getCodeModel() != CodeModel::Large) {
+    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
+    // ELF, the only valid one on Darwin.
+    SDValue Page = PoolRef(AArch64II::MO_PAGE);
+    SDValue PageOff = PoolRef(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    SDValue PageAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Page);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, PageAddr, PageOff);
+  }
+
+  // Use the GOT for the large code model on iOS.
+  if (Subtarget->isTargetMachO()) {
+    SDValue GotAddr = PoolRef(AArch64II::MO_GOT);
+    return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+  }
+
+  const unsigned char NoCheck = AArch64II::MO_NC;
+  return DAG.getNode(AArch64ISD::WrapperLarge, DL, PtrVT,
+                     PoolRef(AArch64II::MO_G3),
+                     PoolRef(AArch64II::MO_G2 | NoCheck),
+                     PoolRef(AArch64II::MO_G1 | NoCheck),
+                     PoolRef(AArch64II::MO_G0 | NoCheck));
+}
+
+/// Materialise the address of a basic block (blockaddress).
+///
+/// The large code model on non-MachO targets builds the address with a
+/// WrapperLarge node over the MO_G3..MO_G0 pieces; every other configuration
+/// uses an ADRP + ADDlow pair. The block address offset is always 0.
+SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  const BlockAddress *Block = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+
+  // Build a target block-address operand with the given operand flags.
+  auto BlockRef = [&](unsigned char Flags) {
+    return DAG.getTargetBlockAddress(Block, PtrVT, 0, Flags);
+  };
+
+  const bool UseLargeSequence =
+      getTargetMachine().getCodeModel() == CodeModel::Large &&
+      !Subtarget->isTargetMachO();
+  if (UseLargeSequence) {
+    const unsigned char NoCheck = AArch64II::MO_NC;
+    return DAG.getNode(AArch64ISD::WrapperLarge, DL, PtrVT,
+                       BlockRef(AArch64II::MO_G3),
+                       BlockRef(AArch64II::MO_G2 | NoCheck),
+                       BlockRef(AArch64II::MO_G1 | NoCheck),
+                       BlockRef(AArch64II::MO_G0 | NoCheck));
+  }
+
+  SDValue Page = BlockRef(AArch64II::MO_PAGE);
+  SDValue PageOff = BlockRef(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+  SDValue PageAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Page);
+  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, PageAddr, PageOff);
+}
+
+/// Lower VASTART for Darwin: store the frame index of the variadic stack
+/// area into the va_list object.
+///
+/// Operands of \p Op as used here: 0 = chain, 1 = address of the va_list,
+/// 2 = source value describing that address.
+SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  AArch64FunctionInfo *FnInfo =
+      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+
+  SDValue StackArea = DAG.getFrameIndex(FnInfo->getVarArgsStackIndex(),
+                                        getPointerTy(DAG.getDataLayout()));
+  const Value *ListSV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue ListAddr = Op.getOperand(1);
+  return DAG.getStore(Chain, DL, StackArea, ListAddr,
+                      MachinePointerInfo(ListSV));
+}
+
+/// Lower VASTART for AAPCS targets by initialising every field of the
+/// va_list struct (layout per the AArch64 Procedure Call Standard, section
+/// B.3). Fields written, by byte offset:
+///    0: void *__stack   - frame index of the stack varargs area
+///    8: void *__gr_top  - end of the GPR save area (only if it is non-empty)
+///   16: void *__vr_top  - end of the FPR save area (only if it is non-empty)
+///   24: int   __gr_offs - minus the size of the GPR save area
+///   28: int   __vr_offs - minus the size of the FPR save area
+/// Every store uses the incoming chain; the stores are joined by a
+/// TokenFactor, which is the returned value.
+SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64FunctionInfo *FnInfo = MF.getInfo<AArch64FunctionInfo>();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue ListBase = Op.getOperand(1);
+  const Value *ListSV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SmallVector<SDValue, 4> Stores;
+
+  // Address of the va_list field located FieldOffset bytes into the struct.
+  auto FieldAddr = [&](unsigned FieldOffset) {
+    return DAG.getNode(ISD::ADD, DL, PtrVT, ListBase,
+                       DAG.getConstant(FieldOffset, DL, PtrVT));
+  };
+
+  // Store "frame index + size" of a register save area into the pointer
+  // field at FieldOffset.
+  auto StoreSaveAreaTop = [&](int SaveAreaFI, int SaveAreaSize,
+                              unsigned FieldOffset) {
+    SDValue FieldPtr = FieldAddr(FieldOffset);
+    SDValue Top = DAG.getFrameIndex(SaveAreaFI, PtrVT);
+    Top = DAG.getNode(ISD::ADD, DL, PtrVT, Top,
+                      DAG.getConstant(SaveAreaSize, DL, PtrVT));
+    Stores.push_back(DAG.getStore(Chain, DL, Top, FieldPtr,
+                                  MachinePointerInfo(ListSV, FieldOffset),
+                                  /* Alignment = */ 8));
+  };
+
+  // Store minus the size of a register save area into the int field at
+  // FieldOffset.
+  auto StoreNegatedSize = [&](int SaveAreaSize, unsigned FieldOffset) {
+    SDValue FieldPtr = FieldAddr(FieldOffset);
+    Stores.push_back(DAG.getStore(
+        Chain, DL, DAG.getConstant(-SaveAreaSize, DL, MVT::i32), FieldPtr,
+        MachinePointerInfo(ListSV, FieldOffset), /* Alignment = */ 4));
+  };
+
+  // void *__stack at offset 0
+  SDValue StackArea = DAG.getFrameIndex(FnInfo->getVarArgsStackIndex(), PtrVT);
+  Stores.push_back(DAG.getStore(Chain, DL, StackArea, ListBase,
+                                MachinePointerInfo(ListSV),
+                                /* Alignment = */ 8));
+
+  // void *__gr_top at offset 8
+  const int GPRSize = FnInfo->getVarArgsGPRSize();
+  if (GPRSize > 0)
+    StoreSaveAreaTop(FnInfo->getVarArgsGPRIndex(), GPRSize, 8);
+
+  // void *__vr_top at offset 16
+  const int FPRSize = FnInfo->getVarArgsFPRSize();
+  if (FPRSize > 0)
+    StoreSaveAreaTop(FnInfo->getVarArgsFPRIndex(), FPRSize, 16);
+
+  // int __gr_offs at offset 24
+  StoreNegatedSize(GPRSize, 24);
+
+  // int __vr_offs at offset 28
+  StoreNegatedSize(FPRSize, 28);
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
+/// Lower VASTART by dispatching on the target: Darwin uses
+/// LowerDarwin_VASTART, everything else uses LowerAAPCS_VASTART.
+SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  if (Subtarget->isTargetDarwin())
+    return LowerDarwin_VASTART(Op, DAG);
+  return LowerAAPCS_VASTART(Op, DAG);
+}
+
+/// Lower VACOPY to a memcpy of the whole va_list object.
+///
+/// Operands of \p Op as used here: 0 = chain, 1 = destination address,
+/// 2 = source address, 3 / 4 = source values describing the destination and
+/// source addresses.
+SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
+  // pointer.
+  unsigned ListBytes = 32;
+  if (Subtarget->isTargetDarwin())
+    ListBytes = 8;
+
+  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue Dst = Op.getOperand(1);
+  SDValue Src = Op.getOperand(2);
+  return DAG.getMemcpy(Chain, DL, Dst, Src,
+                       DAG.getConstant(ListBytes, DL, MVT::i32),
+                       /*Align=*/8, /*isVol=*/false, /*AlwaysInline=*/false,
+                       /*isTailCall=*/false, MachinePointerInfo(DstSV),
+                       MachinePointerInfo(SrcSV));
+}
+
+/// Lower VAARG (Darwin only, enforced by an assertion).
+///
+/// Sequence produced: load the current argument pointer from the va_list,
+/// round it up when the requested alignment exceeds 8, store the pointer
+/// advanced past the argument back to the va_list, then load the argument
+/// itself. Operands of \p Op as used here: 0 = chain, 1 = va_list address,
+/// 2 = source value for that address, 3 = alignment constant.
+SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() &&
+         "automatic va_arg instruction only works on Darwin");
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  const Value *ListSV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SDValue ListAddr = Op.getOperand(1);
+  const unsigned Align = Op.getConstantOperandVal(3);
+
+  // Fetch the current argument pointer; everything after it is chained to
+  // this load.
+  SDValue ArgPtr = DAG.getLoad(PtrVT, DL, Op.getOperand(0), ListAddr,
+                               MachinePointerInfo(ListSV));
+  SDValue Chain = ArgPtr.getValue(1);
+
+  // Over-aligned arguments: round the pointer up to a multiple of Align.
+  if (Align > 8) {
+    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
+    ArgPtr = DAG.getNode(ISD::ADD, DL, PtrVT, ArgPtr,
+                         DAG.getConstant(Align - 1, DL, PtrVT));
+    ArgPtr = DAG.getNode(ISD::AND, DL, PtrVT, ArgPtr,
+                         DAG.getConstant(-(int64_t)Align, DL, PtrVT));
+  }
+
+  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+  uint64_t Stride = DAG.getDataLayout().getTypeAllocSize(ArgTy);
+
+  // Scalar integer and FP values smaller than 64 bits are implicitly extended
+  // up to 64 bits. At the very least, we have to increase the striding of the
+  // vaargs list to match this, and for FP values we need to introduce
+  // FP_ROUND nodes as well.
+  const bool IsScalar = !VT.isVector();
+  const bool NeedFPTrunc =
+      IsScalar && VT.isFloatingPoint() && VT != MVT::f64;
+  if ((IsScalar && VT.isInteger()) || NeedFPTrunc)
+    Stride = 8;
+
+  // Advance the pointer to the next vaarg and write it back to the va_list.
+  SDValue NextArgPtr = DAG.getNode(ISD::ADD, DL, PtrVT, ArgPtr,
+                                   DAG.getConstant(Stride, DL, PtrVT));
+  SDValue ListUpdate =
+      DAG.getStore(Chain, DL, NextArgPtr, ListAddr, MachinePointerInfo(ListSV));
+
+  // Ordinary case: load the argument straight out of the (pre-increment)
+  // pointer.
+  if (!NeedFPTrunc)
+    return DAG.getLoad(VT, DL, ListUpdate, ArgPtr, MachinePointerInfo());
+
+  // Narrow FP case: the slot holds an f64, so load that and round it to VT.
+  SDValue Wide =
+      DAG.getLoad(MVT::f64, DL, ListUpdate, ArgPtr, MachinePointerInfo());
+  SDValue Narrow = DAG.getNode(ISD::FP_ROUND, DL, VT, Wide.getValue(0),
+                               DAG.getIntPtrConstant(1, DL));
+  // Merge the rounded value with the chain output of the load.
+  SDValue Results[] = { Narrow, Wide.getValue(1) };
+  return DAG.getMergeValues(Results, DL);
+}
+
+/// Lower FRAMEADDR: read the frame pointer register, then follow the stored
+/// frame-pointer chain once per requested level (operand 0 is the depth).
+/// Also records in the frame info that the frame address is taken.
+SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  const unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
+  // Each load goes through the current frame address to reach the next one.
+  for (unsigned Level = 0; Level != Depth; ++Level)
+    Frame = DAG.getLoad(VT, DL, DAG.getEntryNode(), Frame,
+                        MachinePointerInfo());
+  return Frame;
+}
+
+/// Map a register name to its register number. Only "sp" is recognised; any
+/// other name is a fatal error.
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                                  SelectionDAG &DAG) const {
+  const unsigned Reg = StringSwitch<unsigned>(RegName)
+                           .Case("sp", AArch64::SP)
+                           .Default(0);
+  // 0 is the "no match" value produced by the switch above.
+  if (Reg == 0)
+    report_fatal_error(Twine("Invalid register name \""
+                             + StringRef(RegName) + "\"."));
+  return Reg;
+}
+
+/// Lower RETURNADDR (operand 0 is the depth).
+///
+/// Depth 0 reads LR, which is added as a function live-in. For a non-zero
+/// depth the return address is loaded from 8 bytes past the frame address
+/// computed by LowerFRAMEADDR for the same depth. Also records in the frame
+/// info that the return address is taken.
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getFrameInfo().setReturnAddressIsTaken(true);
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  const unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  if (Depth == 0) {
+    // Return LR, which contains the return address. Mark it an implicit
+    // live-in.
+    const unsigned LRReg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
+    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LRReg, VT);
+  }
+
+  SDValue Frame = LowerFRAMEADDR(Op, DAG);
+  SDValue SlotOffset =
+      DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
+  SDValue SlotAddr = DAG.getNode(ISD::ADD, DL, VT, Frame, SlotOffset);
+  return DAG.getLoad(VT, DL, DAG.getEntryNode(), SlotAddr,
+                     MachinePointerInfo());
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
+/// i64 values and take a 2 x i64 value (lo, hi) to shift plus a shift amount.
+///
+/// Two candidates are built for each result half -- one for a shift amount
+/// below the register width ("normal") and one for a shift amount of at
+/// least the register width ("big") -- and a CSEL on "ShAmt - width >= 0"
+/// picks between them.
+SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  const unsigned Width = VT.getSizeInBits();
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  // Arithmetic or logical shift, matching the *_PARTS node being lowered.
+  const unsigned ShiftOpc =
+      (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  // Bits of the high half that move into the low half: Hi << (Width - Amt).
+  SDValue InvAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                               DAG.getConstant(Width, dl, MVT::i64), Amt);
+  SDValue CarryIn = DAG.getNode(ISD::SHL, dl, VT, Hi, InvAmt);
+
+  // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
+  // is "undef". We wanted 0, so CSEL it directly.
+  SDValue IsZeroCmp = emitComparison(Amt, DAG.getConstant(0, dl, MVT::i64),
+                                     ISD::SETEQ, dl, DAG);
+  SDValue EQVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+  CarryIn =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+                  CarryIn, EQVal, IsZeroCmp);
+
+  // Amt - Width: non-negative exactly when this is a "big" shift.
+  SDValue Excess = DAG.getNode(ISD::SUB, dl, MVT::i64, Amt,
+                               DAG.getConstant(Width, dl, MVT::i64));
+
+  SDValue LoShifted = DAG.getNode(ISD::SRL, dl, VT, Lo, Amt);
+  SDValue LoNormal = DAG.getNode(ISD::OR, dl, VT, LoShifted, CarryIn);
+
+  SDValue IsBigCmp = emitComparison(Excess, DAG.getConstant(0, dl, MVT::i64),
+                                    ISD::SETGE, dl, DAG);
+  SDValue GEVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue LoBig = DAG.getNode(ShiftOpc, dl, VT, Hi, Excess);
+  SDValue LoResult =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, LoBig, LoNormal, GEVal, IsBigCmp);
+
+  // AArch64 shifts larger than the register width are wrapped rather than
+  // clamped, so we can't just emit "hi >> x".
+  SDValue HiNormal = DAG.getNode(ShiftOpc, dl, VT, Hi, Amt);
+  SDValue HiBig;
+  if (ShiftOpc == ISD::SRA)
+    HiBig = DAG.getNode(ShiftOpc, dl, VT, Hi,
+                        DAG.getConstant(Width - 1, dl, MVT::i64));
+  else
+    HiBig = DAG.getConstant(0, dl, VT);
+  SDValue HiResult =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, HiBig, HiNormal, GEVal, IsBigCmp);
+
+  SDValue Parts[2] = { LoResult, HiResult };
+  return DAG.getMergeValues(Parts, dl);
+}
+
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two i64 values and
+/// takes a 2 x i64 value (lo, hi) to shift plus a shift amount.
+///
+/// Two candidates are built for each result half -- one for a shift amount
+/// below the register width ("normal") and one for a shift amount of at
+/// least the register width ("big") -- and a CSEL on "ShAmt - width >= 0"
+/// picks between them.
+SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  const unsigned Width = VT.getSizeInBits();
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+
+  // Bits of the low half that move into the high half: Lo >> (Width - Amt).
+  SDValue InvAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                               DAG.getConstant(Width, dl, MVT::i64), Amt);
+  SDValue CarryOut = DAG.getNode(ISD::SRL, dl, VT, Lo, InvAmt);
+
+  // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
+  // is "undef". We wanted 0, so CSEL it directly.
+  SDValue IsZeroCmp = emitComparison(Amt, DAG.getConstant(0, dl, MVT::i64),
+                                     ISD::SETEQ, dl, DAG);
+  SDValue EQVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+  CarryOut =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+                  CarryOut, EQVal, IsZeroCmp);
+
+  // Amt - Width: non-negative exactly when this is a "big" shift.
+  SDValue Excess = DAG.getNode(ISD::SUB, dl, MVT::i64, Amt,
+                               DAG.getConstant(Width, dl, MVT::i64));
+  SDValue HiShifted = DAG.getNode(ISD::SHL, dl, VT, Hi, Amt);
+  SDValue HiNormal = DAG.getNode(ISD::OR, dl, VT, CarryOut, HiShifted);
+
+  SDValue HiBig = DAG.getNode(ISD::SHL, dl, VT, Lo, Excess);
+
+  SDValue IsBigCmp = emitComparison(Excess, DAG.getConstant(0, dl, MVT::i64),
+                                    ISD::SETGE, dl, DAG);
+  SDValue GEVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue HiResult =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, HiBig, HiNormal, GEVal, IsBigCmp);
+
+  // AArch64 shifts of larger than register sizes are wrapped rather than
+  // clamped, so we can't just emit "lo << a" if a is too big.
+  SDValue LoBig = DAG.getConstant(0, dl, VT);
+  SDValue LoNormal = DAG.getNode(ISD::SHL, dl, VT, Lo, Amt);
+  SDValue LoResult =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, LoBig, LoNormal, GEVal, IsBigCmp);
+
+  SDValue Parts[2] = { LoResult, HiResult };
+  return DAG.getMergeValues(Parts, dl);
+}
+
+/// Report whether an offset may be folded into the global address \p GA.
+/// The answer does not depend on \p GA: it is always "no".
+bool AArch64TargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode *GA) const {
+  // Folding offsets into global addresses is not supported by the AArch64
+  // target.
+  const bool CanFoldOffset = false;
+  return CanFoldOffset;
+}
+
+/// Report whether \p Imm is a legal floating-point immediate of type \p VT.
+///
+/// Only f32 and f64 are ever accepted. For those, +0.0 is always legal;
+/// any other value is legal when AArch64_AM::getFP32Imm / getFP64Imm returns
+/// something other than -1 for it.
+bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  const bool IsF64 = VT == MVT::f64;
+  const bool IsF32 = VT == MVT::f32;
+  if (!IsF64 && !IsF32)
+    return false;
+
+  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
+  // FIXME: We should be able to handle f128 as well with a clever lowering.
+  if (Imm.isPosZero())
+    return true;
+
+  const int Encoding =
+      IsF64 ? AArch64_AM::getFP64Imm(Imm) : AArch64_AM::getFP32Imm(Imm);
+  return Encoding != -1;
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+/// Build the initial estimate node \p Opcode (e.g. FRSQRTE / FRECPE) for
+/// \p Operand.
+///
+/// An estimate is produced only when the subtarget has NEON and the operand
+/// type is one of f32, v1f32, v2f32, v4f32, f64, v1f64 or v2f64; otherwise an
+/// empty SDValue is returned and \p ExtraSteps is left untouched.
+///
+/// \param ExtraSteps in/out: number of refinement iterations the caller
+///        should run. If it comes in as ReciprocalEstimate::Unspecified it is
+///        replaced by a default chosen from the element type (3 for f64
+///        elements, 2 for f32 elements).
+static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
+                           SDValue Operand, SelectionDAG &DAG,
+                           int &ExtraSteps) {
+  EVT VT = Operand.getValueType();
+  if (ST->hasNEON() &&
+      (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
+       VT == MVT::f32 || VT == MVT::v1f32 ||
+       VT == MVT::v2f32 || VT == MVT::v4f32)) {
+    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
+      // For the reciprocal estimates, convergence is quadratic, so the number
+      // of digits is doubled after each iteration. In ARMv8, the accuracy of
+      // the initial estimate is 2^-8. Thus the number of extra steps to refine
+      // the result for float (23 mantissa bits) is 2 and for double (52
+      // mantissa bits) is 3.
+      //
+      // The decision must look at the element type: comparing VT itself
+      // against f64 would give vectors of doubles (v1f64, v2f64) only the
+      // two steps that are sufficient for single precision.
+      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
+
+    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+  }
+
+  return SDValue();
+}
+
+/// Produce an estimate of sqrt(\p Operand), or of 1/sqrt(\p Operand) when
+/// \p Reciprocal is set, from FRSQRTE plus FRSQRTS-based refinement steps.
+///
+/// The estimate is attempted when \p Enabled is ReciprocalEstimate::Enabled,
+/// or when it is Unspecified and the subtarget reports useRSqrt(). An empty
+/// SDValue is returned when it is not attempted or getEstimate declines.
+/// On success all refinement steps have been emitted here, so \p ExtraSteps
+/// is set to 0. \p UseOneConst is not touched.
+SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
+                                               SelectionDAG &DAG, int Enabled,
+                                               int &ExtraSteps,
+                                               bool &UseOneConst,
+                                               bool Reciprocal) const {
+  const bool WantEstimate =
+      Enabled == ReciprocalEstimate::Enabled ||
+      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt());
+  if (!WantEstimate)
+    return SDValue();
+
+  SDValue Estimate =
+      getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, DAG, ExtraSteps);
+  if (!Estimate)
+    return SDValue();
+
+  SDLoc DL(Operand);
+  EVT VT = Operand.getValueType();
+
+  SDNodeFlags Flags;
+  Flags.setUnsafeAlgebra(true);
+
+  // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
+  // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
+  for (int Iter = 0; Iter < ExtraSteps; ++Iter) {
+    SDValue Refine = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
+                                 &Flags);
+    Refine = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Refine, &Flags);
+    Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Refine, &Flags);
+  }
+
+  if (!Reciprocal) {
+    EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                  VT);
+    SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+    SDValue IsZero = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
+
+    // sqrt(X) = X * (1 / sqrt(X)).
+    Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags);
+    // Correct the result if the operand is 0.0.
+    const unsigned SelectOpc = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+    Estimate = DAG.getNode(SelectOpc, DL, VT, IsZero, Operand, Estimate);
+  }
+
+  ExtraSteps = 0;
+  return Estimate;
+}
+
+/// Produce an estimate of 1 / \p Operand from FRECPE plus FRECPS-based
+/// refinement steps.
+///
+/// The estimate is attempted only when \p Enabled is
+/// ReciprocalEstimate::Enabled. An empty SDValue is returned when it is not
+/// attempted or getEstimate declines. On success all refinement steps have
+/// been emitted here, so \p ExtraSteps is set to 0.
+SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
+                                                SelectionDAG &DAG, int Enabled,
+                                                int &ExtraSteps) const {
+  if (Enabled != ReciprocalEstimate::Enabled)
+    return SDValue();
+
+  SDValue Estimate =
+      getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps);
+  if (!Estimate)
+    return SDValue();
+
+  SDLoc DL(Operand);
+  EVT VT = Operand.getValueType();
+
+  SDNodeFlags Flags;
+  Flags.setUnsafeAlgebra(true);
+
+  // Newton reciprocal iteration: E * (2 - X * E)
+  // AArch64 reciprocal iteration instruction: (2 - M * N)
+  for (int Iter = 0; Iter < ExtraSteps; ++Iter) {
+    SDValue Refine = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
+                                 Estimate, &Flags);
+    Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Refine, &Flags);
+  }
+
+  ExtraSteps = 0;
+  return Estimate;
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Table of Constraints
+// TODO: This is the current set of constraints supported by ARM for the
+// compiler, not all of them may make sense, e.g. S may be difficult to support.
+//
+// r - A general register
+// w - An FP/SIMD register of some size in the range v0-v31
+// x - An FP/SIMD register of some size in the range v0-v15
+// I - Constant that can be used with an ADD instruction
+// J - Constant that can be used with a SUB instruction
+// K - Constant that can be used with a 32-bit logical instruction
+// L - Constant that can be used with a 64-bit logical instruction
+// M - Constant that can be used as a 32-bit MOV immediate
+// N - Constant that can be used as a 64-bit MOV immediate
+// Q - A memory reference with base register and no offset
+// S - A symbolic address
+// Y - Floating point constant zero
+// Z - Integer constant zero
+//
+// Note that general register operands will be output using their 64-bit x
+// register name, whatever the size of the variable, unless the asm operand
+// is prefixed by the %w modifier. Floating-point and SIMD register operands
+// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
+// %q modifier.
+const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+  // The "X" constraint has to be lowered to something concrete here, and the
+  // choice is between "r" and "w". Either one forces the operand into a
+  // register even though X itself is much more permissive.
+  //
+  // That is still correct (anything may be emitted for X), but it can be
+  // less efficient than what some users of the constraint expect.
+  //
+  // "w" is picked only when the subtarget has FP (hasFPARMv8) and the type is
+  // floating point or a 64-/128-bit vector; everything else gets "r".
+  bool UseFPRegister = false;
+  if (Subtarget->hasFPARMv8()) {
+    if (ConstraintVT.isFloatingPoint()) {
+      UseFPRegister = true;
+    } else if (ConstraintVT.isVector()) {
+      const unsigned Bits = ConstraintVT.getSizeInBits();
+      UseFPRegister = Bits == 64 || Bits == 128;
+    }
+  }
+  return UseFPRegister ? "w" : "r";
+}
+
+/// getConstraintType - Classify an inline-asm constraint string for this
+/// target.
+///
+/// Single-letter constraints handled here: 'z' -> C_Other, 'x' and 'w' ->
+/// C_RegisterClass, 'Q' -> C_Memory. Everything else is classified by the
+/// generic TargetLowering implementation.
+AArch64TargetLowering::ConstraintType
+AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    const char Letter = Constraint[0];
+    if (Letter == 'z')
+      return C_Other;
+    if (Letter == 'x' || Letter == 'w')
+      return C_RegisterClass;
+    // An address with a single base register. Due to the way we
+    // currently handle addresses it is the same as 'r'.
+    if (Letter == 'Q')
+      return C_Memory;
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Compute how well the operand described by \p info matches the single
+/// constraint \p constraint.
+/// \p info must already have been set up with the operand type and the
+/// currently selected alternative constraint.
+///
+/// Results: CW_Default when there is no operand value to inspect;
+/// CW_Register for 'x' / 'w' with a floating-point or vector operand
+/// (CW_Invalid for any other operand type); CW_Constant for 'z'; otherwise
+/// whatever the generic TargetLowering implementation returns.
+TargetLowering::ConstraintWeight
+AArch64TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  Value *Operand = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!Operand)
+    return CW_Default;
+
+  Type *OperandTy = Operand->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  case 'x':
+  case 'w':
+    if (OperandTy->isFloatingPointTy() || OperandTy->isVectorTy())
+      return CW_Register;
+    return CW_Invalid;
+  case 'z':
+    return CW_Constant;
+  default:
+    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+  }
+}
+
+/// Resolve an inline-asm register constraint to a (register, register class)
+/// pair. A register number of 0 means "any register of the class".
+///
+/// Handled here, in order:
+///  * 'r': GPR64common for 64-bit types, GPR32common otherwise;
+///  * 'w': FPR16 / FPR32 / FPR64 / FPR128 according to the type's bit width;
+///  * 'x': FPR128_lo, for 128-bit types only;
+///  * "{cc}" (case-insensitive): the NZCV register;
+///  * anything else goes to the generic TargetLowering implementation, and if
+///    that finds no class, "{vN}" with N in 0..31 is mapped to the N-th
+///    register of FPR64 (64-bit types) or FPR128 (all other cases).
+std::pair<unsigned, const TargetRegisterClass *>
+AArch64TargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
+      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
+    case 'w':
+      switch (VT.getSizeInBits()) {
+      case 16:
+        return std::make_pair(0U, &AArch64::FPR16RegClass);
+      case 32:
+        return std::make_pair(0U, &AArch64::FPR32RegClass);
+      case 64:
+        return std::make_pair(0U, &AArch64::FPR64RegClass);
+      case 128:
+        return std::make_pair(0U, &AArch64::FPR128RegClass);
+      default:
+        break;
+      }
+      break;
+    // The instructions that this constraint is designed for can
+    // only take 128-bit registers so just use that regclass.
+    case 'x':
+      if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
+      break;
+    }
+  }
+  if (StringRef("{cc}").equals_lower(Constraint))
+    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
+
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass *> Res =
+      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+  // Found as a standard register: nothing more to do.
+  if (Res.second)
+    return Res;
+
+  // Otherwise accept the "{vN}" / "{VN}" spelling, i.e. 4 or 5 characters.
+  const unsigned Size = Constraint.size();
+  const bool IsBracedVReg = (Size == 4 || Size == 5) &&
+                            Constraint[0] == '{' &&
+                            tolower(Constraint[1]) == 'v' &&
+                            Constraint[Size - 1] == '}';
+  if (!IsBracedVReg)
+    return Res;
+
+  int RegNo;
+  const bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
+  if (Failed || RegNo < 0 || RegNo > 31)
+    return Res;
+
+  // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
+  // By default we'll emit v0-v31 for this unless there's a modifier where
+  // we'll emit the correct register as well.
+  if (VT != MVT::Other && VT.getSizeInBits() == 64) {
+    Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
+    Res.second = &AArch64::FPR64RegClass;
+  } else {
+    Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
+    Res.second = &AArch64::FPR128RegClass;
+  }
+
+  return Res;
+}
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops.
+ ///
+ /// Only single-letter constraints are handled here: 'z' (a zero constant,
+ /// lowered to the XZR/WZR register) and the immediate constraints 'I', 'J',
+ /// 'K', 'L', 'M' and 'N'. Any other single-letter constraint is forwarded to
+ /// the generic TargetLowering implementation.
+ void AArch64TargetLowering::LowerAsmOperandForConstraint(
+     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+     SelectionDAG &DAG) const {
+   // Currently only support length 1 constraints.
+   if (Constraint.length() != 1)
+     return;
+
+   // True when every set bit of V lies inside one of the lowest NumChunks
+   // 16-bit chunks, i.e. V is a 16-bit payload shifted left by a multiple of 16.
+   auto FitsInOneChunk = [](uint64_t V, unsigned NumChunks) {
+     for (unsigned Chunk = 0; Chunk != NumChunks; ++Chunk) {
+       const uint64_t ChunkMask = 0xFFFFULL << (16 * Chunk);
+       if ((V & ChunkMask) == V)
+         return true;
+     }
+     return false;
+   };
+
+   SDValue Result;
+   const char ConstraintLetter = Constraint[0];
+   switch (ConstraintLetter) {
+   default:
+     break;
+
+   case 'z':
+     // 'z' maps to xzr or wzr so it needs an input of 0.
+     if (!isNullConstant(Op))
+       return;
+     Result = Op.getValueType() == MVT::i64
+                  ? DAG.getRegister(AArch64::XZR, MVT::i64)
+                  : DAG.getRegister(AArch64::WZR, MVT::i32);
+     break;
+
+   // This set of constraints deal with valid constants for various
+   // instructions. Validate and return a target constant for them if we can.
+   case 'I':
+   case 'J':
+   case 'K':
+   case 'L':
+   case 'M':
+   case 'N': {
+     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+     if (!C)
+       return;
+
+     // Grab the value and do some validation.
+     uint64_t CVal = C->getZExtValue();
+     bool IsValid = false;
+     switch (ConstraintLetter) {
+     // The I constraint applies only to simple ADD or SUB immediate operands:
+     // i.e. 0 to 4095 with optional shift by 12
+     case 'I':
+       IsValid = isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal);
+       break;
+     // The J constraint applies only to ADD or SUB immediates that would be
+     // valid when negated, i.e. if [an add pattern] were to be output as a SUB
+     // instruction [or vice versa], in other words -1 to -4095 with optional
+     // left shift by 12.
+     case 'J': {
+       // Negate in unsigned arithmetic: negating INT64_MIN as a signed value
+       // would be undefined behaviour.
+       const uint64_t NVal = -static_cast<uint64_t>(C->getSExtValue());
+       IsValid = isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal);
+       if (IsValid)
+         CVal = C->getSExtValue();
+       break;
+     }
+     // The K and L constraints apply *only* to logical immediates, including
+     // what used to be the MOVI alias for ORR (though the MOVI alias has now
+     // been removed and MOV should be used). So these constraints have to
+     // distinguish between bit patterns that are valid 32-bit or 64-bit
+     // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
+     // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
+     // versa.
+     case 'K':
+       IsValid = AArch64_AM::isLogicalImmediate(CVal, 32);
+       break;
+     case 'L':
+       IsValid = AArch64_AM::isLogicalImmediate(CVal, 64);
+       break;
+     // The M and N constraints are a superset of K and L respectively, for use
+     // with the MOV (immediate) alias. As well as the logical immediates they
+     // also match 32 or 64-bit immediates that can be loaded either using a
+     // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
+     // (M) or 64-bit 0x1234000000000000 (N) etc.
+     // As a note some of this code is liberally stolen from the asm parser.
+     case 'M': {
+       if (!isUInt<32>(CVal))
+         break;
+       // Inverted within 32 bits only, then zero-extended.
+       const uint64_t NCVal = ~static_cast<uint32_t>(CVal);
+       IsValid = AArch64_AM::isLogicalImmediate(CVal, 32) ||
+                 FitsInOneChunk(CVal, 2) || FitsInOneChunk(NCVal, 2);
+       break;
+     }
+     case 'N': {
+       const uint64_t NCVal = ~CVal;
+       IsValid = AArch64_AM::isLogicalImmediate(CVal, 64) ||
+                 FitsInOneChunk(CVal, 4) || FitsInOneChunk(NCVal, 4);
+       break;
+     }
+     default:
+       break;
+     }
+
+     // An immediate that fails validation adds nothing to Ops.
+     if (!IsValid)
+       return;
+
+     // All assembler immediates are 64-bit integers.
+     Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
+     break;
+   }
+   }
+
+   // Nothing produced here: let the target-independent code have a go.
+   if (!Result.getNode())
+     return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops,
+                                                         DAG);
+
+   Ops.push_back(Result);
+ }
+
+//===----------------------------------------------------------------------===//
+// AArch64 Advanced SIMD Support
+//===----------------------------------------------------------------------===//
+
+ /// WidenVector - Given a value in the V64 register class, produce the
+ /// equivalent value in the V128 register class.
+ ///
+ /// The input is inserted at index 0 of an undef vector that has the same
+ /// element type and twice as many elements.
+ static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
+   SDLoc DL(V64Reg);
+   const EVT NarrowVT = V64Reg.getValueType();
+   const MVT EltTy = NarrowVT.getVectorElementType().getSimpleVT();
+   const MVT WideTy =
+       MVT::getVectorVT(EltTy, 2 * NarrowVT.getVectorNumElements());
+
+   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
+                      V64Reg, DAG.getConstant(0, DL, MVT::i32));
+ }
+
+ /// getExtFactor - Determine the adjustment factor for the position when
+ /// generating an "extract from vector registers" instruction.
+ ///
+ /// The factor is the size in bytes of one element of V's vector type.
+ static unsigned getExtFactor(SDValue &V) {
+   return V.getValueType().getVectorElementType().getSizeInBits() / 8;
+ }
+
+ /// NarrowVector - Given a value in the V128 register class, produce the
+ /// equivalent value in the V64 register class.
+ ///
+ /// The result is the dsub subregister of the input, typed as a vector with the
+ /// same element type and half as many elements.
+ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+   SDLoc DL(V128Reg);
+   const EVT WideVT = V128Reg.getValueType();
+   const MVT EltTy = WideVT.getVectorElementType().getSimpleVT();
+   const MVT NarrowTy =
+       MVT::getVectorVT(EltTy, WideVT.getVectorNumElements() / 2);
+
+   return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
+ }
+
+ // Gather data to see if the operation can be modelled as a
+ // shuffle in combination with VEXTs.
+ //
+ // Op must be a BUILD_VECTOR. On success the result is a vector shuffle of at
+ // most two source vectors, bitcast back to the BUILD_VECTOR type. An empty
+ // SDValue is returned when an operand is not a constant-index
+ // EXTRACT_VECTOR_ELT, when more than two source vectors are involved, when a
+ // double-width source is used over too wide a span, or when the final mask
+ // is not legal.
+ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+   SDLoc dl(Op);
+   EVT VT = Op.getValueType();
+   unsigned NumElts = VT.getVectorNumElements();
+
+   struct ShuffleSourceInfo {
+     // The vector the BUILD_VECTOR operands are extracted from, and the range
+     // of lanes of it that are actually used.
+     SDValue Vec;
+     unsigned MinElt;
+     unsigned MaxElt;
+
+     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+     // be compatible with the shuffle we intend to construct. As a result
+     // ShuffleVec will be some sliding window into the original Vec.
+     SDValue ShuffleVec;
+
+     // Code should guarantee that element i in Vec starts at element "WindowBase
+     // + i * WindowScale in ShuffleVec".
+     int WindowBase;
+     int WindowScale;
+
+     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+     ShuffleSourceInfo(SDValue Vec)
+         : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+           WindowScale(1) {}
+   };
+
+   // First gather all vectors used as an immediate source for this BUILD_VECTOR
+   // node.
+   SmallVector<ShuffleSourceInfo, 2> Sources;
+   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
+     SDValue Elt = Op.getOperand(OpIdx);
+     if (Elt.isUndef())
+       continue;
+
+     // A shuffle can only come from building a vector from various
+     // elements of other vectors, provided their indices are constant.
+     const bool IsConstLaneExtract =
+         Elt.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+         isa<ConstantSDNode>(Elt.getOperand(1));
+     if (!IsConstLaneExtract)
+       return SDValue();
+
+     // Add this element source to the list if it's not already there.
+     SDValue SourceVec = Elt.getOperand(0);
+     auto Source = find(Sources, SourceVec);
+     if (Source == Sources.end())
+       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
+
+     // Update the minimum and maximum lane number seen.
+     unsigned Lane = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+     if (Lane < Source->MinElt)
+       Source->MinElt = Lane;
+     if (Lane > Source->MaxElt)
+       Source->MaxElt = Lane;
+   }
+
+   // Currently only do something sane when at most two source vectors
+   // are involved.
+   if (Sources.size() > 2)
+     return SDValue();
+
+   // Find out the smallest element size among result and two sources, and use
+   // it as element size to build the shuffle_vector.
+   EVT SmallestEltTy = VT.getVectorElementType();
+   for (auto &Source : Sources) {
+     EVT CandidateTy = Source.Vec.getValueType().getVectorElementType();
+     if (CandidateTy.bitsLT(SmallestEltTy))
+       SmallestEltTy = CandidateTy;
+   }
+   unsigned ResMultiplier =
+       VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
+   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
+
+   // If the source vector is too wide or too narrow, we may nevertheless be able
+   // to construct a compatible shuffle either by concatenating it with UNDEF or
+   // extracting a suitable range of elements.
+   for (auto &Src : Sources) {
+     EVT SrcVT = Src.ShuffleVec.getValueType();
+
+     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
+       continue;
+
+     // This stage of the search produces a source with the same element type as
+     // the original, but with a total width matching the BUILD_VECTOR output.
+     EVT EltVT = SrcVT.getVectorElementType();
+     unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
+
+     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+       assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
+       // We can pad out the smaller vector for free, so if it's part of a
+       // shuffle...
+       Src.ShuffleVec =
+           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
+       continue;
+     }
+
+     assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
+
+     // Span too large for a VEXT to cope
+     if (Src.MaxElt - Src.MinElt >= NumSrcElts)
+       return SDValue();
+
+     // Extracts the DestVT-sized half of the current source that starts at
+     // element FirstElt.
+     auto ExtractHalf = [&](unsigned FirstElt) {
+       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                          DAG.getConstant(FirstElt, dl, MVT::i64));
+     };
+
+     if (Src.MinElt >= NumSrcElts) {
+       // The extraction can just take the second half
+       Src.ShuffleVec = ExtractHalf(NumSrcElts);
+       Src.WindowBase = -NumSrcElts;
+     } else if (Src.MaxElt < NumSrcElts) {
+       // The extraction can just take the first half
+       Src.ShuffleVec = ExtractHalf(0);
+     } else {
+       // An actual VEXT is needed
+       SDValue VEXTSrc1 = ExtractHalf(0);
+       SDValue VEXTSrc2 = ExtractHalf(NumSrcElts);
+       unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
+
+       Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
+                                    VEXTSrc2,
+                                    DAG.getConstant(Imm, dl, MVT::i32));
+       Src.WindowBase = -Src.MinElt;
+     }
+   }
+
+   // Another possible incompatibility occurs from the vector element types. We
+   // can fix this by bitcasting the source vectors to the same type we intend
+   // for the shuffle.
+   for (auto &Src : Sources) {
+     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+     if (SrcEltTy == SmallestEltTy)
+       continue;
+     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+     Src.WindowBase *= Src.WindowScale;
+   }
+
+   // Final sanity check before we try to actually produce a shuffle.
+   DEBUG(
+     for (auto Src : Sources)
+       assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+   );
+
+   // The stars all align, our next step is to produce the mask for the shuffle.
+   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
+   for (unsigned OpIdx = 0; OpIdx < VT.getVectorNumElements(); ++OpIdx) {
+     SDValue Entry = Op.getOperand(OpIdx);
+     if (Entry.isUndef())
+       continue;
+
+     auto Src = find(Sources, Entry.getOperand(0));
+     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
+     // segment.
+     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+     int BitsDefined =
+         std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
+     int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+     // This source is expected to fill ResMultiplier lanes of the final shuffle,
+     // starting at the appropriate offset.
+     int *LaneMask = &Mask[OpIdx * ResMultiplier];
+
+     // Lanes taken from the second source are numbered after all NumElts lanes
+     // of the first one.
+     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+     ExtractBase += NumElts * (Src - Sources.begin());
+     for (int Lane = 0; Lane < LanesDefined; ++Lane)
+       LaneMask[Lane] = ExtractBase + Lane;
+   }
+
+   // Final check before we try to produce nonsense...
+   if (!isShuffleMaskLegal(Mask, ShuffleVT))
+     return SDValue();
+
+   // Any operand slot without a real source stays undef.
+   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+   for (unsigned SrcIdx = 0; SrcIdx < Sources.size(); ++SrcIdx)
+     ShuffleOps[SrcIdx] = Sources[SrcIdx].ShuffleVec;
+
+   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+                                          ShuffleOps[1], Mask);
+   return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+ }
+
+ // Check if an EXT instruction can handle the shuffle mask when the
+ // vector sources of the shuffle are the same.
+ //
+ // Imm is set to the first mask index whenever that index is not UNDEF, even
+ // if the mask is later rejected.
+ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+   const unsigned NumElts = VT.getVectorNumElements();
+
+   // Assume that the first shuffle index is not UNDEF. Fail if it is.
+   if (M[0] < 0)
+     return false;
+
+   Imm = M[0];
+
+   // If this is a VEXT shuffle, the immediate value is the index of the first
+   // element. The other shuffle indices must be the successive elements after
+   // the first one.
+   unsigned Expected = Imm;
+   for (unsigned Lane = 1; Lane < NumElts; ++Lane) {
+     // Step to the next expected index. If it runs off the end of the vector,
+     // follow it back to index zero and keep going.
+     Expected = (Expected + 1 == NumElts) ? 0 : Expected + 1;
+
+     // UNDEF indices match anything.
+     const int Idx = M[Lane];
+     if (Idx >= 0 && Expected != static_cast<unsigned>(Idx))
+       return false;
+   }
+
+   return true;
+ }
+
+ // Check if an EXT instruction can handle the shuffle mask when the
+ // vector sources of the shuffle are different.
+ //
+ // On success Imm receives the EXT element index. ReverseEXT is set to true
+ // when the two input vectors must be swapped; it is never cleared here, so
+ // the caller is expected to initialise it to false.
+ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
+                       unsigned &Imm) {
+   // Look for the first non-undef element.
+   const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
+
+   // A mask made only of UNDEFs has no element to anchor the sequence on.
+   // Bail out instead of dereferencing the end iterator below.
+   if (FirstRealElt == M.end())
+     return false;
+
+   // Benefit from APInt to handle overflow when calculating expected element.
+   unsigned NumElts = VT.getVectorNumElements();
+   unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+   APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+   // The following shuffle indices must be the successive elements after the
+   // first real element.
+   const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
+       [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
+   if (FirstWrongElt != M.end())
+     return false;
+
+   // The index of an EXT is the first element if it is not UNDEF.
+   // Watch out for the beginning UNDEFs. The EXT index should be the expected
+   // value of the first element. E.g.
+   // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+   // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+   // ExpectedElt is the last mask index plus 1.
+   Imm = ExpectedElt.getZExtValue();
+
+   // There are two different cases requiring the input vectors to be reversed.
+   // For example, for vector <4 x i32> we have the following cases,
+   // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+   // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+   // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+   // to reverse two input vectors.
+   if (Imm < NumElts)
+     ReverseEXT = true;
+   else
+     Imm -= NumElts;
+
+   return true;
+ }
+
+ /// isREVMask - Check if a vector shuffle corresponds to a REV
+ /// instruction with the specified blocksize. (The order of the elements
+ /// within each block of the vector is reversed.)
+ ///
+ /// BlockSize is in bits and must be 16, 32 or 64. Vectors with 64-bit
+ /// elements never match.
+ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+          "Only possible block sizes for REV are: 16, 32, 64");
+
+   const unsigned EltSz = VT.getScalarSizeInBits();
+   if (EltSz == 64)
+     return false;
+
+   // The first index of a reversed block is its last element, which gives the
+   // block length. If the first shuffle index is UNDEF, be optimistic.
+   const unsigned BlockElts = M[0] < 0 ? BlockSize / EltSz : M[0] + 1;
+
+   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+     return false;
+
+   for (unsigned Lane = 0, E = VT.getVectorNumElements(); Lane < E; ++Lane) {
+     const int Idx = M[Lane];
+     if (Idx < 0)
+       continue; // ignore UNDEF indices
+
+     // Mirror the lane's position inside the block it belongs to.
+     const unsigned PosInBlock = Lane % BlockElts;
+     const unsigned Mirrored =
+         (Lane - PosInBlock) + (BlockElts - 1 - PosInBlock);
+     if ((unsigned)Idx != Mirrored)
+       return false;
+   }
+
+   return true;
+ }
+
+ /// isZIPMask - Check if a shuffle mask interleaves the low halves
+ /// (WhichResult == 0) or the high halves (WhichResult == 1) of the two
+ /// inputs, e.g. <0, 4, 1, 5> or <2, 6, 3, 7> for four elements. UNDEF
+ /// indices match anything. WhichResult is written even when the mask is
+ /// rejected, except for vectors with an odd number of elements.
+ static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+   unsigned NumElts = VT.getVectorNumElements();
+   // Elements are examined in pairs. An odd count can never match, and would
+   // make the loop below read past the end of the mask and never terminate.
+   if (NumElts % 2 != 0)
+     return false;
+   WhichResult = (M[0] == 0 ? 0 : 1);
+   unsigned Idx = WhichResult * NumElts / 2;
+   for (unsigned i = 0; i != NumElts; i += 2) {
+     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
+       return false;
+     Idx += 1;
+   }
+
+   return true;
+ }
+
+ /// isUZPMask - Check if a shuffle mask selects every other element of the
+ /// concatenated inputs, starting at element 0 (WhichResult == 0) or at
+ /// element 1 (WhichResult == 1). UNDEF indices match anything. WhichResult is
+ /// written even when the mask is rejected.
+ static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+   WhichResult = (M[0] == 0 ? 0 : 1);
+   for (unsigned Lane = 0, E = VT.getVectorNumElements(); Lane != E; ++Lane) {
+     const int Idx = M[Lane];
+     if (Idx >= 0 && (unsigned)Idx != 2 * Lane + WhichResult)
+       return false;
+   }
+
+   return true;
+ }
+
+ /// isTRNMask - Check if a shuffle mask pairs up the even elements
+ /// (WhichResult == 0) or the odd elements (WhichResult == 1) of the two
+ /// inputs, e.g. <0, 4, 2, 6> or <1, 5, 3, 7> for four elements. UNDEF
+ /// indices match anything. WhichResult is written even when the mask is
+ /// rejected, except for vectors with an odd number of elements.
+ static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+   unsigned NumElts = VT.getVectorNumElements();
+   // Elements are examined in pairs. An odd count can never match, and would
+   // make the loop below read one element past the end of the mask.
+   if (NumElts % 2 != 0)
+     return false;
+   WhichResult = (M[0] == 0 ? 0 : 1);
+   for (unsigned i = 0; i < NumElts; i += 2) {
+     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
+       return false;
+   }
+   return true;
+ }
+
+ /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
+ /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+ /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+ /// UNDEF indices match anything. WhichResult is written even when the mask is
+ /// rejected, except for vectors with an odd number of elements.
+ static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+   unsigned NumElts = VT.getVectorNumElements();
+   // Elements are examined in pairs. An odd count can never match, and would
+   // make the loop below read past the end of the mask and never terminate.
+   if (NumElts % 2 != 0)
+     return false;
+   WhichResult = (M[0] == 0 ? 0 : 1);
+   unsigned Idx = WhichResult * NumElts / 2;
+   for (unsigned i = 0; i != NumElts; i += 2) {
+     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
+       return false;
+     Idx += 1;
+   }
+
+   return true;
+ }
+
+ /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
+ /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+ /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+ /// UNDEF indices match anything. WhichResult is written even when the mask is
+ /// rejected.
+ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+   const unsigned Half = VT.getVectorNumElements() / 2;
+   WhichResult = (M[0] == 0 ? 0 : 1);
+
+   // Both halves of the mask must follow the same sequence
+   // WhichResult, WhichResult + 2, WhichResult + 4, ...
+   for (unsigned Lane = 0; Lane != 2 * Half; ++Lane) {
+     const int MIdx = M[Lane];
+     const unsigned Expected = WhichResult + 2 * (Lane % Half);
+     if (MIdx >= 0 && (unsigned)MIdx != Expected)
+       return false;
+   }
+
+   return true;
+ }
+
+ /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
+ /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+ /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+ /// UNDEF indices match anything. WhichResult is written even when the mask is
+ /// rejected, except for vectors with an odd number of elements.
+ static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+   unsigned NumElts = VT.getVectorNumElements();
+   // Elements are examined in pairs. An odd count can never match, and would
+   // make the loop below read one element past the end of the mask.
+   if (NumElts % 2 != 0)
+     return false;
+   WhichResult = (M[0] == 0 ? 0 : 1);
+   for (unsigned i = 0; i < NumElts; i += 2) {
+     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
+       return false;
+   }
+   return true;
+ }
+
+ /// isINSMask - Check if a shuffle mask is the identity on one of the two
+ /// inputs except for exactly one lane, i.e. it can be done by inserting a
+ /// single element.
+ ///
+ /// On success DstIsLeft says which input is the one left mostly untouched and
+ /// Anomaly is the lane that differs. Indices equal to -1 match either input.
+ /// The mask must have exactly NumInputElements entries.
+ static bool isINSMask(ArrayRef<int> M, int NumInputElements,
+                       bool &DstIsLeft, int &Anomaly) {
+   if (M.size() != static_cast<size_t>(NumInputElements))
+     return false;
+
+   // Count the lanes that disagree with the identity mask of each input and
+   // remember the last such lane.
+   int NumLHSMismatch = 0, NumRHSMismatch = 0;
+   int LastLHSMismatch = -1, LastRHSMismatch = -1;
+
+   for (int Lane = 0; Lane != NumInputElements; ++Lane) {
+     const int Idx = M[Lane];
+     if (Idx == -1)
+       continue;
+
+     if (Idx != Lane) {
+       ++NumLHSMismatch;
+       LastLHSMismatch = Lane;
+     }
+     if (Idx != Lane + NumInputElements) {
+       ++NumRHSMismatch;
+       LastRHSMismatch = Lane;
+     }
+   }
+
+   // The left input is preferred when both would qualify.
+   if (NumLHSMismatch == 1) {
+     DstIsLeft = true;
+     Anomaly = LastLHSMismatch;
+     return true;
+   }
+   if (NumRHSMismatch == 1) {
+     DstIsLeft = false;
+     Anomaly = LastRHSMismatch;
+     return true;
+   }
+
+   return false;
+ }
+
+ /// isConcatMask - Check if a shuffle mask on a 128-bit result describes the
+ /// concatenation of the low half of the first input with the low half of the
+ /// second input.
+ ///
+ /// SplitLHS says the first input is itself twice as long as one half of the
+ /// result, so indices referring to the second input start one half-length
+ /// later. UNDEF indices do not match.
+ static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
+   if (VT.getSizeInBits() != 128)
+     return false;
+
+   const int NumElts = VT.getVectorNumElements();
+   const int Half = NumElts / 2;
+
+   for (int I = 0; I != NumElts; ++I) {
+     // The low half is the identity; the high half is shifted by a further
+     // half-length when the first input gets split.
+     const int Expected = I < Half ? I : I + SplitLHS * Half;
+     if (Mask[I] != Expected)
+       return false;
+   }
+
+   return true;
+ }
+
+ /// tryFormConcatFromShuffle - Turn a vector shuffle that concatenates the low
+ /// halves of its inputs into a CONCAT_VECTORS node.
+ ///
+ /// Returns an empty SDValue when the element types of the inputs differ from
+ /// the result's or when the mask is not a concatenation mask.
+ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
+   SDLoc DL(Op);
+   EVT VT = Op.getValueType();
+   SDValue Lo = Op.getOperand(0);
+   SDValue Hi = Op.getOperand(1);
+   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+   const EVT EltVT = VT.getVectorElementType();
+   if (EltVT != Lo.getValueType().getVectorElementType() ||
+       EltVT != Hi.getValueType().getVectorElementType())
+     return SDValue();
+
+   const bool SplitLo = Lo.getValueSizeInBits() == 128;
+   if (!isConcatMask(Mask, VT, SplitLo))
+     return SDValue();
+
+   // 128-bit inputs contribute only their low half.
+   EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+                                 VT.getVectorNumElements() / 2);
+   auto TakeLowHalf = [&](SDValue V) {
+     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+                        DAG.getConstant(0, DL, MVT::i64));
+   };
+
+   if (SplitLo)
+     Lo = TakeLowHalf(Lo);
+   if (Hi.getValueSizeInBits() == 128)
+     Hi = TakeLowHalf(Hi);
+
+   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
+ /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+ /// the specified operations to build the shuffle.
+ ///
+ /// An entry holds an operation number in bits 26-29 and two 13-bit table
+ /// indices (bits 13-25 and bits 0-12) naming the shuffles that produce the
+ /// operation's left and right operands; those are generated recursively.
+ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                       SDValue RHS, SelectionDAG &DAG,
+                                       const SDLoc &dl) {
+   const unsigned IDMask = (1 << 13) - 1;
+   const unsigned OpNum = (PFEntry >> 26) & 0x0F;
+   const unsigned LHSID = (PFEntry >> 13) & IDMask;
+   const unsigned RHSID = PFEntry & IDMask;
+
+   enum {
+     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+     OP_VREV,
+     OP_VDUP0,
+     OP_VDUP1,
+     OP_VDUP2,
+     OP_VDUP3,
+     OP_VEXT1,
+     OP_VEXT2,
+     OP_VEXT3,
+     OP_VUZPL, // VUZP, left result
+     OP_VUZPR, // VUZP, right result
+     OP_VZIPL, // VZIP, left result
+     OP_VZIPR, // VZIP, right result
+     OP_VTRNL, // VTRN, left result
+     OP_VTRNR  // VTRN, right result
+   };
+
+   // A copy is a leaf of the recursion: it names one of the two inputs.
+   if (OpNum == OP_COPY) {
+     const bool IsLHS = LHSID == (1 * 9 + 2) * 9 + 3;
+     assert((IsLHS || LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7) &&
+            "Illegal OP_COPY!");
+     return IsLHS ? LHS : RHS;
+   }
+
+   SDValue OpLHS =
+       GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+   SDValue OpRHS =
+       GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+   EVT VT = OpLHS.getValueType();
+   const EVT EltTy = VT.getVectorElementType();
+
+   // The two-input permutes only differ in their opcode; it is picked in the
+   // switch and the node is built after it.
+   unsigned PermuteOpc;
+   switch (OpNum) {
+   default:
+     llvm_unreachable("Unknown shuffle opcode!");
+   case OP_VREV:
+     // VREV divides the vector in half and swaps within the half.
+     if (EltTy == MVT::i32 || EltTy == MVT::f32)
+       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
+     // vrev <4 x i16> -> REV32
+     if (EltTy == MVT::i16 || EltTy == MVT::f16)
+       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
+     // vrev <4 x i8> -> REV16
+     assert(VT.getVectorElementType() == MVT::i8);
+     return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
+   case OP_VDUP0:
+   case OP_VDUP1:
+   case OP_VDUP2:
+   case OP_VDUP3: {
+     unsigned Opcode;
+     if (EltTy == MVT::i8)
+       Opcode = AArch64ISD::DUPLANE8;
+     else if (EltTy == MVT::i16 || EltTy == MVT::f16)
+       Opcode = AArch64ISD::DUPLANE16;
+     else if (EltTy == MVT::i32 || EltTy == MVT::f32)
+       Opcode = AArch64ISD::DUPLANE32;
+     else if (EltTy == MVT::i64 || EltTy == MVT::f64)
+       Opcode = AArch64ISD::DUPLANE64;
+     else
+       llvm_unreachable("Invalid vector element type?");
+
+     // A 64-bit source is widened before the lane is duplicated.
+     if (VT.getSizeInBits() == 64)
+       OpLHS = WidenVector(OpLHS, DAG);
+     SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
+     return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
+   }
+   case OP_VEXT1:
+   case OP_VEXT2:
+   case OP_VEXT3: {
+     // The immediate is the element offset scaled by the extract factor.
+     unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
+     return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
+                        DAG.getConstant(Imm, dl, MVT::i32));
+   }
+   case OP_VUZPL:
+     PermuteOpc = AArch64ISD::UZP1;
+     break;
+   case OP_VUZPR:
+     PermuteOpc = AArch64ISD::UZP2;
+     break;
+   case OP_VZIPL:
+     PermuteOpc = AArch64ISD::ZIP1;
+     break;
+   case OP_VZIPR:
+     PermuteOpc = AArch64ISD::ZIP2;
+     break;
+   case OP_VTRNL:
+     PermuteOpc = AArch64ISD::TRN1;
+     break;
+   case OP_VTRNR:
+     PermuteOpc = AArch64ISD::TRN2;
+     break;
+   }
+
+   return DAG.getNode(PermuteOpc, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
+ }
+
+ /// GenerateTBL - Lower a vector shuffle to a table lookup through the
+ /// aarch64_neon_tbl1 / aarch64_neon_tbl2 intrinsics.
+ ///
+ /// The shuffle mask is expanded into a byte index for every byte of every
+ /// result element. The inputs are bitcast to byte vectors and the looked-up
+ /// bytes are bitcast back to the shuffle's type.
+ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
+                            SelectionDAG &DAG) {
+   // Check to see if we can use the TBL instruction.
+   SDValue V1 = Op.getOperand(0);
+   SDValue V2 = Op.getOperand(1);
+   SDLoc DL(Op);
+
+   EVT EltVT = Op.getValueType().getVectorElementType();
+   unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
+
+   // One index per byte: the bytes of element Val sit at Val * BytesPerElt.
+   SmallVector<SDValue, 8> TBLMask;
+   for (int Val : ShuffleMask) {
+     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
+       unsigned Offset = Byte + Val * BytesPerElt;
+       TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
+     }
+   }
+
+   const bool Is128Bit = Op.getValueSizeInBits() == 128;
+   const MVT IndexVT = Is128Bit ? MVT::v16i8 : MVT::v8i8;
+   const unsigned IndexLen = Is128Bit ? 16 : 8;
+
+   SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
+   SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
+
+   const bool V2IsUndef = V2.getNode()->isUndef();
+
+   // 64-bit inputs are packed into one 128-bit table. When the second input
+   // is undef the first one is simply repeated.
+   if (IndexLen == 8)
+     V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst,
+                         V2IsUndef ? V1Cst : V2Cst);
+
+   SDValue Shuffle;
+   if (V2IsUndef || IndexLen == 8) {
+     // Everything lives in a single table register.
+     Shuffle = DAG.getNode(
+         ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+         DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
+         DAG.getBuildVector(IndexVT, DL,
+                            makeArrayRef(TBLMask.data(), IndexLen)));
+   } else {
+     // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
+     // cannot currently represent the register constraints on the input
+     // table registers.
+     // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
+     //                       DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
+     //                                          IndexLen));
+     Shuffle = DAG.getNode(
+         ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+         DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
+         V2Cst, DAG.getBuildVector(IndexVT, DL,
+                                   makeArrayRef(TBLMask.data(), IndexLen)));
+   }
+   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
+ }
+
+ /// getDUPLANEOp - Return the AArch64ISD::DUPLANE* opcode that matches the
+ /// given vector element type. Any element type other than i8, i16/f16,
+ /// i32/f32 or i64/f64 is unreachable.
+ static unsigned getDUPLANEOp(EVT EltType) {
+   // Integer and floating-point element types of the same width share one
+   // opcode. i8 has no floating-point partner and is listed twice.
+   static const struct {
+     MVT::SimpleValueType IntTy;
+     MVT::SimpleValueType FPTy;
+     unsigned Opcode;
+   } LaneOps[] = {
+       {MVT::i8, MVT::i8, AArch64ISD::DUPLANE8},
+       {MVT::i16, MVT::f16, AArch64ISD::DUPLANE16},
+       {MVT::i32, MVT::f32, AArch64ISD::DUPLANE32},
+       {MVT::i64, MVT::f64, AArch64ISD::DUPLANE64},
+   };
+
+   for (const auto &Entry : LaneOps)
+     if (EltType == Entry.IntTy || EltType == Entry.FPTy)
+       return Entry.Opcode;
+
+   llvm_unreachable("Invalid vector element type?");
+ }
+
+ /// LowerVECTOR_SHUFFLE - Lower a vector shuffle to target-specific nodes.
+ ///
+ /// The mask is tried, in this order, against: splat (DUP / DUPLANE), REV64,
+ /// REV32, REV16, EXT, ZIP, UZP, TRN (two-input, then single-input forms),
+ /// concatenation, single-element insert, the perfect-shuffle table for
+ /// four-element vectors, and finally a generic table lookup.
+ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+   SDLoc dl(Op);
+   EVT VT = Op.getValueType();
+
+   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+   // Convert shuffles that are directly supported on NEON to target-specific
+   // DAG nodes, instead of keeping them as shuffles and matching them again
+   // during code selection. This is more efficient and avoids the possibility
+   // of inconsistencies between legalization and selection.
+   ArrayRef<int> ShuffleMask = SVN->getMask();
+
+   SDValue V1 = Op.getOperand(0);
+   SDValue V2 = Op.getOperand(1);
+
+   if (SVN->isSplat()) {
+     // If this is undef splat, generate it via "just" vdup, if possible.
+     const int SplatIdx = SVN->getSplatIndex();
+     int Lane = SplatIdx == -1 ? 0 : SplatIdx;
+
+     if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+       return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
+                          V1.getOperand(0));
+     // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
+     // constant. If so, we can just reference the lane's definition directly.
+     if (V1.getOpcode() == ISD::BUILD_VECTOR &&
+         !isa<ConstantSDNode>(V1.getOperand(Lane)))
+       return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
+
+     // Otherwise, duplicate from the lane of the input vector.
+     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
+
+     // SelectionDAGBuilder may have "helpfully" already extracted or
+     // concatenated to make a vector of the same size as this SHUFFLE. We can
+     // ignore the extract entirely, and canonicalise the concat using
+     // WidenVector.
+     if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+       Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+       V1 = V1.getOperand(0);
+     } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+       // Pick the concat operand that holds the lane and rebase the lane on it.
+       const int HalfElts = (int)VT.getVectorNumElements() / 2;
+       const bool InHighHalf = Lane >= HalfElts;
+       if (InHighHalf)
+         Lane -= HalfElts;
+       V1 = WidenVector(V1.getOperand(InHighHalf ? 1 : 0), DAG);
+     } else if (VT.getSizeInBits() == 64) {
+       V1 = WidenVector(V1, DAG);
+     }
+
+     return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
+   }
+
+   // Element reversal within blocks, widest block first.
+   static const struct {
+     unsigned BlockSize;
+     unsigned Opcode;
+   } RevForms[] = {
+       {64, AArch64ISD::REV64},
+       {32, AArch64ISD::REV32},
+       {16, AArch64ISD::REV16},
+   };
+   for (const auto &Rev : RevForms)
+     if (isREVMask(ShuffleMask, VT, Rev.BlockSize))
+       return DAG.getNode(Rev.Opcode, dl, V1.getValueType(), V1, V2);
+
+   bool ReverseEXT = false;
+   unsigned Imm;
+   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
+     if (ReverseEXT)
+       std::swap(V1, V2);
+     Imm *= getExtFactor(V1);
+     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
+                        DAG.getConstant(Imm, dl, MVT::i32));
+   }
+   if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+     Imm *= getExtFactor(V1);
+     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
+                        DAG.getConstant(Imm, dl, MVT::i32));
+   }
+
+   // ZIP / UZP / TRN: first the forms reading both inputs, then the canonical
+   // "shuffle v, undef" forms that read the first input twice.
+   typedef bool (*MaskMatcher)(ArrayRef<int>, EVT, unsigned &);
+   struct PermuteForm {
+     MaskMatcher Matches;
+     unsigned Opc1; // opcode when WhichResult == 0
+     unsigned Opc2; // opcode otherwise
+   };
+   static const PermuteForm TwoInputForms[] = {
+       {isZIPMask, AArch64ISD::ZIP1, AArch64ISD::ZIP2},
+       {isUZPMask, AArch64ISD::UZP1, AArch64ISD::UZP2},
+       {isTRNMask, AArch64ISD::TRN1, AArch64ISD::TRN2},
+   };
+   static const PermuteForm OneInputForms[] = {
+       {isZIP_v_undef_Mask, AArch64ISD::ZIP1, AArch64ISD::ZIP2},
+       {isUZP_v_undef_Mask, AArch64ISD::UZP1, AArch64ISD::UZP2},
+       {isTRN_v_undef_Mask, AArch64ISD::TRN1, AArch64ISD::TRN2},
+   };
+
+   unsigned WhichResult;
+   for (const PermuteForm &Form : TwoInputForms) {
+     if (Form.Matches(ShuffleMask, VT, WhichResult)) {
+       unsigned Opc = (WhichResult == 0) ? Form.Opc1 : Form.Opc2;
+       return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+     }
+   }
+   for (const PermuteForm &Form : OneInputForms) {
+     if (Form.Matches(ShuffleMask, VT, WhichResult)) {
+       unsigned Opc = (WhichResult == 0) ? Form.Opc1 : Form.Opc2;
+       return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+     }
+   }
+
+   if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
+     return Concat;
+
+   bool DstIsLeft;
+   int Anomaly;
+   int NumInputElements = V1.getValueType().getVectorNumElements();
+   if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
+     SDValue DstVec = DstIsLeft ? V1 : V2;
+     SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
+
+     // Find the vector and lane the odd element comes from.
+     SDValue SrcVec = V1;
+     int SrcLane = ShuffleMask[Anomaly];
+     if (SrcLane >= NumInputElements) {
+       SrcVec = V2;
+       SrcLane -= VT.getVectorNumElements();
+     }
+     SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
+
+     // Integer elements narrower than 32 bits are extracted as i32.
+     EVT ScalarVT = VT.getVectorElementType();
+     if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
+       ScalarVT = MVT::i32;
+
+     SDValue Extracted =
+         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV);
+     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Extracted,
+                        DstLaneV);
+   }
+
+   // If the shuffle is not directly supported and it has 4 elements, use
+   // the PerfectShuffle-generated table to synthesize it from other shuffles.
+   if (VT.getVectorNumElements() == 4) {
+     // Compute the index in the perfect shuffle table: the four mask entries
+     // are base-9 digits, with 8 standing for an undef entry.
+     unsigned PFTableIndex = 0;
+     for (unsigned Lane = 0; Lane != 4; ++Lane) {
+       const unsigned Digit = ShuffleMask[Lane] < 0 ? 8 : ShuffleMask[Lane];
+       PFTableIndex = PFTableIndex * 9 + Digit;
+     }
+
+     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+     unsigned Cost = (PFEntry >> 30);
+
+     if (Cost <= 4)
+       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+   }
+
+   return GenerateTBL(Op, ShuffleMask, DAG);
+ }
+
+/// Expand a constant-splat BUILD_VECTOR into full-width bit patterns.
+///
+/// If \p BVN is a constant splat, the splat value is replicated across the
+/// whole vector width and OR-ed into \p CnstBits (after shifting the existing
+/// contents left); \p UndefBits receives the same replication of
+/// (SplatBits ^ SplatUndef).
+///
+/// \returns true if \p BVN is a constant splat; false otherwise, in which case
+/// both outputs are left untouched.
+static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
+                               APInt &UndefBits) {
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs))
+    return false;
+
+  const unsigned VecBits = BVN->getValueType(0).getSizeInBits();
+
+  // The per-splat chunks do not change between iterations, so build them once.
+  const APInt SplatChunk = SplatBits.zextOrTrunc(VecBits);
+  const APInt ToggledChunk = (SplatBits ^ SplatUndef).zextOrTrunc(VecBits);
+
+  for (unsigned Remaining = VecBits / SplatBitSize; Remaining != 0;
+       --Remaining) {
+    CnstBits <<= SplatBitSize;
+    CnstBits |= SplatChunk;
+    UndefBits <<= SplatBitSize;
+    UndefBits |= ToggledChunk;
+  }
+
+  return true;
+}
+
+// Lower a vector AND whose second operand is a constant-splat BUILD_VECTOR to
+// an AArch64ISD::BICi node (wrapped in an NVCAST back to the result type) when
+// the inverted constant matches one of the AdvSIMD modified-immediate types
+// 1-6. Returns Op unchanged when operand 1 is not a BUILD_VECTOR or when no
+// immediate encoding matches.
+SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
+ SelectionDAG &DAG) const {
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ SDValue LHS = Op.getOperand(0);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ // Only operand 1 is inspected here; no operand swap is attempted.
+ if (!BVN)
+ return Op;
+
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+ // We only have BIC vector immediate instruction, which is and-not.
+ CnstBits = ~CnstBits;
+
+ // We make use of a little bit of goto ickiness in order to avoid having to
+ // duplicate the immediate matching logic for the undef toggled case.
+ bool SecondTry = false;
+ AttemptModImm:
+
+ // The immediate forms below are matched on a 64-bit value, so both 64-bit
+ // halves of the pattern must agree before it is truncated.
+ if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+ CnstBits = CnstBits.zextOrTrunc(64);
+ uint64_t CnstVal = CnstBits.getZExtValue();
+
+ // Types 1-4 are emitted on v4i32/v2i32 with a third operand of 0, 8, 16
+ // and 24 respectively (presumably the LSL amount of the immediate).
+ if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(8, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(16, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(24, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ // Types 5-6 are emitted on v8i16/v4i16 with a third operand of 0 and 8.
+ if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(8, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ // Nothing matched. Retry exactly once with the alternative pattern held in
+ // UndefBits, inverted for the same and-not reason as CnstBits above.
+ if (SecondTry)
+ goto FailedModImm;
+ SecondTry = true;
+ CnstBits = ~UndefBits;
+ goto AttemptModImm;
+ }
+
+// We can always fall back to a non-immediate AND.
+FailedModImm:
+ return Op;
+}
+
+// Fast check for a BUILD_VECTOR in which every lane is the same constant
+// integer node. On success the zero-extended value of that constant is stored
+// in ConstVal and true is returned; otherwise ConstVal is left untouched and
+// false is returned.
+static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
+                                     uint64_t &ConstVal) {
+  auto *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
+  if (!Bvec)
+    return false;
+
+  auto *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
+  if (!FirstElt)
+    return false;
+
+  // Every remaining lane is compared by node identity against lane 0. A lane
+  // that is not a ConstantSDNode casts to null and therefore never matches.
+  const unsigned LaneCount = Bvec->getValueType(0).getVectorNumElements();
+  unsigned Lane = 1;
+  while (Lane < LaneCount) {
+    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(Lane)) != FirstElt)
+      return false;
+    ++Lane;
+  }
+
+  ConstVal = FirstElt->getZExtValue();
+  return true;
+}
+
+/// Return the intrinsic ID carried by \p N when it is an INTRINSIC_WO_CHAIN
+/// node whose first operand is below Intrinsic::num_intrinsics; for every
+/// other node (or an out-of-range ID) return Intrinsic::not_intrinsic.
+static unsigned getIntrinsicID(const SDNode *N) {
+  if (N->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+    return Intrinsic::not_intrinsic;
+
+  // Operand 0 of an INTRINSIC_WO_CHAIN node holds the intrinsic ID constant.
+  const unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  if (IID >= Intrinsic::num_intrinsics)
+    return Intrinsic::not_intrinsic;
+  return IID;
+}
+
+// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
+// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
+// BUILD_VECTOR with constant element C1, and C2 is a constant shift amount.
+// The insert only overwrites the bits occupied by the shifted value, so C1
+// must keep exactly the remaining bits of X:
+//   SLI: C1 == ~(Ones(ElemSizeInBits) << C2), i.e. only the low C2 bits set;
+//   SRI: C1 == ~(Ones(ElemSizeInBits) >> C2), i.e. only the high C2 bits set.
+// Also, logical shift right -> sri, with the same structure.
+// Returns an empty SDValue when the pattern does not match.
+static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Is the first op an AND?
+  const SDValue And = N->getOperand(0);
+  if (And.getOpcode() != ISD::AND)
+    return SDValue();
+
+  // Is the second op an shl or lshr?
+  SDValue Shift = N->getOperand(1);
+  // This will have been turned into: AArch64ISD::VSHL vector, #shift
+  // or AArch64ISD::VLSHR vector, #shift
+  unsigned ShiftOpc = Shift.getOpcode();
+  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
+    return SDValue();
+  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+
+  // Is the shift amount constant?
+  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  if (!C2node)
+    return SDValue();
+
+  // Is the and mask vector all constant?
+  uint64_t C1;
+  if (!isAllConstantBuildVector(And.getOperand(1), C1))
+    return SDValue();
+
+  // Does C1 preserve exactly the bits of X that the insert leaves alone?
+  uint64_t C2 = C2node->getZExtValue();
+  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
+  if (C2 > ElemSizeInBits)
+    return SDValue();
+
+  // Compare at element width using APInt. The previous plain-integer mask
+  // `(1 << ElemSizeInBits) - 1` overflowed for 32- and 64-bit elements, and
+  // the comparison was made against the complement of the shift amount rather
+  // than the complement of the shifted all-ones value.
+  APInt C1AsAPInt = APInt(64, C1).zextOrTrunc(ElemSizeInBits);
+  unsigned ShiftAmt = static_cast<unsigned>(C2);
+  APInt RequiredC1 = IsShiftRight
+                         ? APInt::getHighBitsSet(ElemSizeInBits, ShiftAmt)
+                         : APInt::getLowBitsSet(ElemSizeInBits, ShiftAmt);
+  if (C1AsAPInt != RequiredC1)
+    return SDValue();
+
+  SDValue X = And.getOperand(0);
+  SDValue Y = Shift.getOperand(0);
+
+  unsigned Intrin =
+      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
+  SDValue ResultSLI =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                  DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
+                  Shift.getOperand(1));
+
+  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
+  DEBUG(N->dump(&DAG));
+  DEBUG(dbgs() << "into: \n");
+  DEBUG(ResultSLI->dump(&DAG));
+
+  ++NumShiftInserts;
+  return ResultSLI;
+}
+
+// Lower a vector OR. When EnableAArch64SlrGeneration is set, first try to fold
+// the node into an SLI/SRI via tryLowerToSLI. Otherwise, if either operand is
+// a constant-splat BUILD_VECTOR whose value matches one of the AdvSIMD
+// modified-immediate types 1-6, emit an AArch64ISD::ORRi on the other operand
+// (wrapped in an NVCAST back to the result type). Returns Op unchanged when
+// nothing matches.
+SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
+ if (EnableAArch64SlrGeneration) {
+ if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
+ return Res;
+ }
+
+ // First guess: operand 0 is the constant and operand 1 the variable input.
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
+ SDValue LHS = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ // OR commutes, so try swapping the operands.
+ if (!BVN) {
+ LHS = Op.getOperand(0);
+ BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ }
+ if (!BVN)
+ return Op;
+
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+ // We make use of a little bit of goto ickiness in order to avoid having to
+ // duplicate the immediate matching logic for the undef toggled case.
+ bool SecondTry = false;
+ AttemptModImm:
+
+ // The immediate forms below are matched on a 64-bit value, so both 64-bit
+ // halves of the pattern must agree before it is truncated.
+ if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+ CnstBits = CnstBits.zextOrTrunc(64);
+ uint64_t CnstVal = CnstBits.getZExtValue();
+
+ // Types 1-4 are emitted on v4i32/v2i32 with a third operand of 0, 8, 16
+ // and 24 respectively (presumably the LSL amount of the immediate).
+ if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(8, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(16, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(24, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ // Types 5-6 are emitted on v8i16/v4i16 with a third operand of 0 and 8.
+ if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(8, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ // Nothing matched. Retry exactly once with the alternative pattern that
+ // resolveBuildVector accumulated in UndefBits.
+ if (SecondTry)
+ goto FailedModImm;
+ SecondTry = true;
+ CnstBits = UndefBits;
+ goto AttemptModImm;
+ }
+
+// We can always fall back to a non-immediate OR.
+FailedModImm:
+ return Op;
+}
+
+// Normalize the operands of a BUILD_VECTOR node. For vectors whose element
+// type is an integer of at most 16 bits, every constant lane is rebuilt as an
+// i32 constant holding only the low element-width bits of its value; all other
+// lanes are passed through. Floating-point vectors and vectors with wider
+// elements are returned unchanged.
+static SDValue NormalizeBuildVector(SDValue Op,
+                                    SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  EVT VecTy = Op.getValueType();
+  EVT LaneTy = VecTy.getVectorElementType();
+
+  const bool IsNarrowInteger =
+      !LaneTy.isFloatingPoint() && LaneTy.getSizeInBits() <= 16;
+  if (!IsNarrowInteger)
+    return Op;
+
+  SDLoc Loc(Op);
+  const unsigned LaneBits = LaneTy.getSizeInBits();
+
+  SmallVector<SDValue, 16> NewLanes;
+  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
+    SDValue Lane = Op.getOperand(I);
+    auto *CstLane = dyn_cast<ConstantSDNode>(Lane);
+    if (!CstLane) {
+      NewLanes.push_back(Lane);
+      continue;
+    }
+    // Constructing the APInt at element width drops the excess high bits.
+    APInt LowBits(LaneBits, CstLane->getZExtValue());
+    NewLanes.push_back(DAG.getConstant(LowBits.getZExtValue(), Loc, MVT::i32));
+  }
+  return DAG.getBuildVector(VecTy, Loc, NewLanes);
+}
+
+// Custom lowering for BUILD_VECTOR. In order, this tries:
+//  * constant splats that match an AdvSIMD modified-immediate encoding
+//    (MOVI / FMOV / MVNI node variants), retried once with the UndefBits
+//    pattern;
+//  * all-undef vectors, low-element-only vectors, and single-value splats
+//    (DUP / DUPLANE, or a recursive attempt on the integer-bitcast vector for
+//    floating-point constant splats);
+//  * a splat of the single constant value followed by inserts of the
+//    non-constant lanes;
+//  * ReconstructShuffle for vectors of at least 4 elements;
+//  * an explicit chain of INSERT_VECTOR_ELT nodes.
+// An empty SDValue is returned to request the default expansion.
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ // From here on Op refers to the normalized node (narrow integer constants
+ // rewritten as truncated i32 constants).
+ Op = NormalizeBuildVector(Op, DAG);
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+ // We make use of a little bit of goto ickiness in order to avoid having to
+ // duplicate the immediate matching logic for the undef toggled case.
+ bool SecondTry = false;
+ AttemptModImm:
+
+ // The immediate forms below are matched on a 64-bit value, so both 64-bit
+ // halves of the pattern must agree before it is truncated.
+ if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+ CnstBits = CnstBits.zextOrTrunc(64);
+ uint64_t CnstVal = CnstBits.getZExtValue();
+
+ // Certain magic vector constants (used to express things like NOT
+ // and NEG) are passed through unmodified. This allows codegen patterns
+ // for these operations to match. Special-purpose patterns will lower
+ // these immediates to MOVIs if it proves necessary.
+ if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
+ return Op;
+
+ // The many faces of MOVI...
+ if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
+ if (VT.getSizeInBits() == 128) {
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
+ DAG.getConstant(CnstVal, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ // Support the V64 version via subregister insertion.
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
+ DAG.getConstant(CnstVal, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ // Types 1-4 use v4i32/v2i32 with a second operand of 0, 8, 16 and 24
+ // (presumably the LSL amount); types 5-6 use v8i16/v4i16 with 0 and 8.
+ if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(8, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(16, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(24, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(8, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ // NOTE(review): the 264 and 272 operands used with MOVImsl/MVNImsl are
+ // presumably encoded shifter operands (MSL #8 / MSL #16) - confirm.
+ if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(264, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(272, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ // The few faces of FMOV...
+ if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
+ SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ // Type 12 is only emitted as v2f64, hence the 128-bit restriction.
+ if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
+ VT.getSizeInBits() == 128) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
+ SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
+ DAG.getConstant(CnstVal, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ // The many faces of MVNI...
+ // From here on CnstVal holds the bitwise complement of the constant, and
+ // the same type 1-8 checks are repeated against it.
+ CnstVal = ~CnstVal;
+ if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(8, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(16, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(24, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(8, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(264, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, dl, MVT::i32),
+ DAG.getConstant(272, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ // Nothing matched. Retry exactly once with the alternative pattern that
+ // resolveBuildVector accumulated in UndefBits.
+ if (SecondTry)
+ goto FailedModImm;
+ SecondTry = true;
+ CnstBits = UndefBits;
+ goto AttemptModImm;
+ }
+FailedModImm:
+
+ // Scan through the operands to find some interesting properties we can
+ // exploit:
+ // 1) If only one value is used, we can use a DUP, or
+ // 2) if only the low element is not undef, we can just insert that, or
+ // 3) if only one constant value is used (w/ some non-constant lanes),
+ // we can splat the constant value into the whole vector then fill
+ // in the non-constant lanes.
+ // 4) FIXME: If different constant values are used, but we can intelligently
+ // select the values we'll be overwriting for the non-constant
+ // lanes such that we can directly materialize the vector
+ // some other way (MOVI, e.g.), we can be sneaky.
+ unsigned NumElts = VT.getVectorNumElements();
+ bool isOnlyLowElement = true;
+ bool usesOnlyOneValue = true;
+ bool usesOnlyOneConstantValue = true;
+ bool isConstant = true;
+ unsigned NumConstantLanes = 0;
+ SDValue Value;
+ SDValue ConstantValue;
+ // Undef lanes are skipped entirely, so they never disturb any of the flags.
+ // Value ends up as the first non-undef lane, ConstantValue as the first
+ // constant lane.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.isUndef())
+ continue;
+ if (i > 0)
+ isOnlyLowElement = false;
+ if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+ isConstant = false;
+
+ if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
+ ++NumConstantLanes;
+ if (!ConstantValue.getNode())
+ ConstantValue = V;
+ else if (ConstantValue != V)
+ usesOnlyOneConstantValue = false;
+ }
+
+ if (!Value.getNode())
+ Value = V;
+ else if (V != Value)
+ usesOnlyOneValue = false;
+ }
+
+ // Value is still empty only when every lane was undef.
+ if (!Value.getNode())
+ return DAG.getUNDEF(VT);
+
+ if (isOnlyLowElement)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+ // Use DUP for non-constant splats. For f32 constant splats, reduce to
+ // i32 and try again.
+ if (usesOnlyOneValue) {
+ if (!isConstant) {
+ if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Value.getValueType() != VT)
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
+
+ // This is actually a DUPLANExx operation, which keeps everything vectory.
+
+ // DUPLANE works on 128-bit vectors, widen it if necessary.
+ SDValue Lane = Value.getOperand(1);
+ Value = Value.getOperand(0);
+ if (Value.getValueSizeInBits() == 64)
+ Value = WidenVector(Value, DAG);
+
+ unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+ return DAG.getNode(Opcode, dl, VT, Value, Lane);
+ }
+
+ // Constant floating-point splat: bitcast every lane to the same-width
+ // integer type, lower that integer BUILD_VECTOR recursively, and bitcast
+ // the result back if the recursive call produced a node.
+ if (VT.getVectorElementType().isFloatingPoint()) {
+ SmallVector<SDValue, 8> Ops;
+ EVT EltTy = VT.getVectorElementType();
+ assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
+ "Unsupported floating-point vector type");
+ MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
+ for (unsigned i = 0; i < NumElts; ++i)
+ Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
+ SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
+ Val = LowerBUILD_VECTOR(Val, DAG);
+ if (Val.getNode())
+ return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+ }
+ }
+
+ // If there was only one constant value used and for more than one lane,
+ // start by splatting that value, then replace the non-constant lanes. This
+ // is better than the default, which will perform a separate initialization
+ // for each lane.
+ // NOTE(review): the condition below also fires when exactly one lane is
+ // constant (NumConstantLanes > 0), which is broader than the "more than one
+ // lane" wording above - confirm which is intended.
+ if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
+ SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
+ // Now insert the non-constant lanes.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
+ if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
+ // Note that type legalization likely mucked about with the VT of the
+ // source operand, so we may have to convert it here before inserting.
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
+ }
+ }
+ return Val;
+ }
+
+ // If all elements are constants and the case above didn't get hit, fall back
+ // to the default expansion, which will generate a load from the constant
+ // pool.
+ if (isConstant)
+ return SDValue();
+
+ // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+ if (NumElts >= 4) {
+ if (SDValue shuffle = ReconstructShuffle(Op, DAG))
+ return shuffle;
+ }
+
+ // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+ // know the default expansion would otherwise fall back on something even
+ // worse. For a vector with one or two non-undef values, that's
+ // scalar_to_vector for the elements followed by a shuffle (provided the
+ // shuffle is valid for the target) and materialization element by element
+ // on the stack followed by a load for everything else.
+ if (!isConstant && !usesOnlyOneValue) {
+ SDValue Vec = DAG.getUNDEF(VT);
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ElemSize = VT.getScalarSizeInBits();
+ // i is the first lane still to be inserted by the loop below; it is bumped
+ // to 1 when lane 0 is handled through INSERT_SUBREG.
+ unsigned i = 0;
+ // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+ // a) Avoid a RMW dependency on the full vector register, and
+ // b) Allow the register coalescer to fold away the copy if the
+ // value is already in an S or D register.
+ // Do not do this for UNDEF/LOAD nodes because we have better patterns
+ // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
+ if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
+ (ElemSize == 32 || ElemSize == 64)) {
+ unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
+ MachineSDNode *N =
+ DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
+ DAG.getTargetConstant(SubIdx, dl, MVT::i32));
+ Vec = SDValue(N, 0);
+ ++i;
+ }
+ for (; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.isUndef())
+ continue;
+ SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+ }
+ return Vec;
+ }
+
+ // Just use the default expansion. We failed to find a better alternative.
+ return SDValue();
+}
+
+/// Custom lowering for INSERT_VECTOR_ELT with a constant, in-range lane index.
+///
+/// The listed 128-bit vector types are returned as-is. The listed 64-bit types
+/// are widened with WidenVector, the insert is redone on the wide type, and
+/// the result is narrowed again with NarrowVector. A non-constant or
+/// out-of-range lane, or any other vector type, yields an empty SDValue.
+SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+
+  SDValue Vec = Op.getOperand(0);
+  SDValue Elt = Op.getOperand(1);
+  SDValue LaneOp = Op.getOperand(2);
+  EVT VT = Vec.getValueType();
+
+  // Reject a variable lane index, or a constant one past the last element.
+  auto *LaneC = dyn_cast<ConstantSDNode>(LaneOp);
+  if (!LaneC)
+    return SDValue();
+  if (LaneC->getZExtValue() >= VT.getVectorNumElements())
+    return SDValue();
+
+  // Only simple value types can equal any of the MVTs handled below.
+  if (!VT.isSimple())
+    return SDValue();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  // Insertion/extraction are legal for V128 types.
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v4f32:
+  case MVT::v2f64:
+  case MVT::v8f16:
+    return Op;
+  // V64 types take the widen / insert / narrow path below.
+  case MVT::v8i8:
+  case MVT::v4i16:
+  case MVT::v2i32:
+  case MVT::v1i64:
+  case MVT::v2f32:
+  case MVT::v4f16:
+    break;
+  default:
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  SDValue WideVec = WidenVector(Vec, DAG);
+  SDValue Inserted = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+                                 WideVec.getValueType(), WideVec, Elt, LaneOp);
+  // Re-narrow the resultant vector.
+  return NarrowVector(Inserted, DAG);
+}
+
+/// Custom lowering for EXTRACT_VECTOR_ELT with a constant, in-range lane
+/// index.
+///
+/// The listed 128-bit vector types are returned as-is. The listed 64-bit types
+/// are widened with WidenVector and the extract is redone on the wide vector;
+/// i8 and i16 elements are extracted as i32. A non-constant or out-of-range
+/// lane, or any other vector type, yields an empty SDValue.
+SDValue
+AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+
+  SDValue Vec = Op.getOperand(0);
+  SDValue LaneOp = Op.getOperand(1);
+  EVT VT = Vec.getValueType();
+
+  // Reject a variable lane index, or a constant one past the last element.
+  auto *LaneC = dyn_cast<ConstantSDNode>(LaneOp);
+  if (!LaneC)
+    return SDValue();
+  if (LaneC->getZExtValue() >= VT.getVectorNumElements())
+    return SDValue();
+
+  // Only simple value types can equal any of the MVTs handled below.
+  if (!VT.isSimple())
+    return SDValue();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  // Insertion/extraction are legal for V128 types.
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v4f32:
+  case MVT::v2f64:
+  case MVT::v8f16:
+    return Op;
+  // V64 types are widened to V128 and extracted from there.
+  case MVT::v8i8:
+  case MVT::v4i16:
+  case MVT::v2i32:
+  case MVT::v1i64:
+  case MVT::v2f32:
+  case MVT::v4f16:
+    break;
+  default:
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  SDValue WideVec = WidenVector(Vec, DAG);
+
+  // Sub-32-bit integer elements are produced as i32 results.
+  EVT ExtrTy = WideVec.getValueType().getVectorElementType();
+  const bool IsNarrowInt = ExtrTy == MVT::i16 || ExtrTy == MVT::i8;
+  if (IsNarrowInt)
+    ExtrTy = MVT::i32;
+
+  // For extractions, the result is returned directly (no re-narrowing).
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, LaneOp);
+}
+
+/// Custom lowering for EXTRACT_SUBVECTOR.
+///
+/// The node is kept as-is in two cases: the start index is 0, or the result
+/// is 64 bits wide and the start index times the source element size equals
+/// 64 (the upper half of a 128-bit vector). A non-vector source, a
+/// non-constant index, or any other index yields an empty SDValue.
+SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  EVT SrcVT = Op.getOperand(0).getValueType();
+  // Just in case...
+  if (!SrcVT.isVector())
+    return SDValue();
+
+  auto *IdxC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!IdxC)
+    return SDValue();
+  unsigned Idx = IdxC->getZExtValue();
+
+  // Index 0 will get lowered to an appropriate EXTRACT_SUBREG in ISel.
+  const bool IsLowPart = Idx == 0;
+  // Extracting the upper 64 bits of a 128-bit vector is matched directly.
+  const bool IsUpperHalf = Op.getValueSizeInBits() == 64 &&
+                           Idx * SrcVT.getScalarSizeInBits() == 64;
+
+  if (IsLowPart || IsUpperHalf)
+    return Op;
+  return SDValue();
+}
+
+/// Report whether shuffle mask \p M on type \p VT can be handled by this
+/// target's shuffle lowering.
+///
+/// Four-element 64- or 128-bit vectors are first looked up in
+/// PerfectShuffleTable and accepted when the entry's cost (its top two bits)
+/// is at most 4. Otherwise the mask is accepted if it is a splat or matches
+/// one of the REV, EXT, TRN, UZP, ZIP (including the single-input "v_undef"
+/// forms), INS or concat mask predicates.
+bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                               EVT VT) const {
+  const unsigned NumLanes = VT.getVectorNumElements();
+
+  if (NumLanes == 4 && (VT.is128BitVector() || VT.is64BitVector())) {
+    // The table index is a 4-digit base-9 number, one digit per mask lane,
+    // with digit 8 standing for an undefined (negative) lane.
+    unsigned PFTableIndex = 0;
+    for (unsigned Lane = 0; Lane != 4; ++Lane) {
+      unsigned Digit = 8;
+      if (M[Lane] >= 0)
+        Digit = M[Lane];
+      PFTableIndex = PFTableIndex * 9 + Digit;
+    }
+
+    const unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    if ((PFEntry >> 30) <= 4)
+      return true;
+  }
+
+  // Out-parameters required by the predicates; their values are not used.
+  bool DummyBool;
+  int DummyInt;
+  unsigned DummyUnsigned;
+
+  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT))
+    return true;
+  if (isREVMask(M, VT, 64) || isREVMask(M, VT, 32) || isREVMask(M, VT, 16))
+    return true;
+  if (isEXTMask(M, VT, DummyBool, DummyUnsigned))
+    return true;
+  // isTBLMask(M, VT) // FIXME: Port TBL support from ARM.
+  if (isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
+      isZIPMask(M, VT, DummyUnsigned))
+    return true;
+  if (isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
+      isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
+      isZIP_v_undef_Mask(M, VT, DummyUnsigned))
+    return true;
+  if (isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt))
+    return true;
+  return isConcatMask(M, VT, VT.getSizeInBits() == 128);
+}
+
+/// getVShiftImm - Look through any BITCAST nodes and check whether \p Op is a
+/// BUILD_VECTOR that is a constant splat no wider than \p ElementBits. On
+/// success the sign-extended splat value is stored in \p Cnt and true is
+/// returned; otherwise \p Cnt is left untouched and false is returned.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  for (; Op.getOpcode() == ISD::BITCAST; Op = Op.getOperand(0))
+    ;
+
+  auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  if (!BVN)
+    return false;
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  const bool IsSplat = BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                            HasAnyUndefs, ElementBits);
+  if (!IsSplat)
+    return false;
+  if (SplatBitSize > ElementBits)
+    return false;
+
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check whether \p Op is a constant-splat build_vector usable
+/// as the immediate of a vector shift left on type \p VT. The splat value is
+/// returned in \p Cnt and must satisfy:
+///   0 <= Cnt <  ElementBits  for a regular left shift; or
+///   0 <= Cnt <= ElementBits  for a long left shift (\p isLong).
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  const int64_t ElementBits = VT.getScalarSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+
+  if (Cnt < 0)
+    return false;
+  // A long shift may shift by the full element width; a regular one may not.
+  return isLong ? Cnt <= ElementBits : Cnt < ElementBits;
+}
+
+ /// Check whether \p Op is a usable immediate for a vector shift right and, if
+ /// so, store the amount in \p Cnt. Accepted ranges:
+ ///   1 <= Cnt <= ElementBits       for a regular right shift;
+ ///   1 <= Cnt <= ElementBits / 2   when \p isNarrow is set.
+ static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
+   assert(VT.isVector() && "vector shift count is not a vector type");
+   const int64_t EltBits = VT.getScalarSizeInBits();
+   if (!getVShiftImm(Op, EltBits, Cnt))
+     return false;
+   const int64_t UpperBound = isNarrow ? EltBits / 2 : EltBits;
+   return Cnt >= 1 && Cnt <= UpperBound;
+ }
+
+ /// Custom-lower a vector SHL / SRA / SRL node.
+ ///
+ /// A node whose shift-amount operand is not a vector is returned unchanged.
+ /// When the amount is a constant splat that is in range and smaller than the
+ /// element size, the immediate nodes AArch64ISD::VSHL / VASHR / VLSHR are
+ /// used. Otherwise the shift is expressed through the aarch64_neon_ushl /
+ /// aarch64_neon_sshl intrinsics; for right shifts the amount is negated first.
+ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
+                                                       SelectionDAG &DAG) const {
+   SDValue Amount = Op.getOperand(1);
+   if (!Amount.getValueType().isVector())
+     return Op;
+
+   EVT VT = Op.getValueType();
+   SDLoc DL(Op);
+   SDValue Value = Op.getOperand(0);
+   const unsigned Opcode = Op.getOpcode();
+   const unsigned EltSize = VT.getScalarSizeInBits();
+   int64_t Cnt;
+
+   if (Opcode == ISD::SHL) {
+     // Immediate form.
+     if (isVShiftLImm(Amount, VT, false, Cnt) && Cnt < EltSize)
+       return DAG.getNode(AArch64ISD::VSHL, DL, VT, Value,
+                          DAG.getConstant(Cnt, DL, MVT::i32));
+     // Register form.
+     SDValue IntrinsicID =
+         DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32);
+     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, IntrinsicID, Value,
+                        Amount);
+   }
+
+   if (Opcode != ISD::SRA && Opcode != ISD::SRL)
+     llvm_unreachable("unexpected shift opcode");
+
+   const bool IsArithmetic = Opcode == ISD::SRA;
+
+   // Right shift immediate.
+   if (isVShiftRImm(Amount, VT, false, Cnt) && Cnt < EltSize) {
+     const unsigned ImmOpc =
+         IsArithmetic ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
+     return DAG.getNode(ImmOpc, DL, VT, Value,
+                        DAG.getConstant(Cnt, DL, MVT::i32));
+   }
+
+   // Right shift register. There is no shift-right-by-register instruction,
+   // but the shift-left-by-register instruction takes a signed amount, where a
+   // negative number specifies a right shift. So negate the amount and shift
+   // left.
+   const unsigned IntrinsicOpc =
+       IsArithmetic ? Intrinsic::aarch64_neon_sshl : Intrinsic::aarch64_neon_ushl;
+   SDValue Negated = DAG.getNode(AArch64ISD::NEG, DL, VT, Amount);
+   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                      DAG.getConstant(IntrinsicOpc, DL, MVT::i32), Value,
+                      Negated);
+ }
+
+ /// Build the AArch64 vector-compare node for "LHS <CC> RHS".
+ ///
+ /// \p VT is the result type and must have the same total size as the operand
+ /// type. When \p RHS is a build_vector that resolves to an all-zero constant,
+ /// the compare-against-zero node variants are used where one exists. Condition
+ /// codes without an entry in the tables below yield an empty SDValue; for
+ /// floating-point operands AArch64CC::LT is only handled when \p NoNans is set.
+ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
+                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
+                                     const SDLoc &dl, SelectionDAG &DAG) {
+   EVT SrcVT = LHS.getValueType();
+   assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+          "function only supposed to emit natural comparisons");
+
+   // Detect a comparison against a constant zero vector.
+   APInt CnstBits(VT.getSizeInBits(), 0);
+   APInt UndefBits(VT.getSizeInBits(), 0);
+   bool IsZero = false;
+   if (auto *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()))
+     IsZero = resolveBuildVector(BVN, CnstBits, UndefBits) && (CnstBits == 0);
+
+   // Small emit helpers: compare with zero, compare in source order, compare
+   // with the operands swapped.
+   auto AgainstZero = [&](unsigned Opc) {
+     return DAG.getNode(Opc, dl, VT, LHS);
+   };
+   auto InOrder = [&](unsigned Opc) {
+     return DAG.getNode(Opc, dl, VT, LHS, RHS);
+   };
+   auto Swapped = [&](unsigned Opc) {
+     return DAG.getNode(Opc, dl, VT, RHS, LHS);
+   };
+
+   if (SrcVT.getVectorElementType().isFloatingPoint()) {
+     switch (CC) {
+     default:
+       return SDValue();
+     case AArch64CC::NE: {
+       // NE is emitted as NOT(EQ).
+       SDValue Fcmeq = IsZero ? AgainstZero(AArch64ISD::FCMEQz)
+                              : InOrder(AArch64ISD::FCMEQ);
+       return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
+     }
+     case AArch64CC::EQ:
+       return IsZero ? AgainstZero(AArch64ISD::FCMEQz)
+                     : InOrder(AArch64ISD::FCMEQ);
+     case AArch64CC::GE:
+       return IsZero ? AgainstZero(AArch64ISD::FCMGEz)
+                     : InOrder(AArch64ISD::FCMGE);
+     case AArch64CC::GT:
+       return IsZero ? AgainstZero(AArch64ISD::FCMGTz)
+                     : InOrder(AArch64ISD::FCMGT);
+     case AArch64CC::LS:
+       return IsZero ? AgainstZero(AArch64ISD::FCMLEz)
+                     : Swapped(AArch64ISD::FCMGE);
+     case AArch64CC::LT:
+       if (!NoNans)
+         return SDValue();
+       // If we ignore NaNs then we can use the MI implementation.
+       LLVM_FALLTHROUGH;
+     case AArch64CC::MI:
+       return IsZero ? AgainstZero(AArch64ISD::FCMLTz)
+                     : Swapped(AArch64ISD::FCMGT);
+     }
+   }
+
+   // Integer element types.
+   switch (CC) {
+   default:
+     return SDValue();
+   case AArch64CC::NE: {
+     // NE is emitted as NOT(EQ).
+     SDValue Cmeq =
+         IsZero ? AgainstZero(AArch64ISD::CMEQz) : InOrder(AArch64ISD::CMEQ);
+     return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
+   }
+   case AArch64CC::EQ:
+     return IsZero ? AgainstZero(AArch64ISD::CMEQz) : InOrder(AArch64ISD::CMEQ);
+   case AArch64CC::GE:
+     return IsZero ? AgainstZero(AArch64ISD::CMGEz) : InOrder(AArch64ISD::CMGE);
+   case AArch64CC::GT:
+     return IsZero ? AgainstZero(AArch64ISD::CMGTz) : InOrder(AArch64ISD::CMGT);
+   case AArch64CC::LE:
+     return IsZero ? AgainstZero(AArch64ISD::CMLEz) : Swapped(AArch64ISD::CMGE);
+   case AArch64CC::LS:
+     return Swapped(AArch64ISD::CMHS);
+   case AArch64CC::LO:
+     return Swapped(AArch64ISD::CMHI);
+   case AArch64CC::LT:
+     return IsZero ? AgainstZero(AArch64ISD::CMLTz) : Swapped(AArch64ISD::CMGT);
+   case AArch64CC::HI:
+     return InOrder(AArch64ISD::CMHI);
+   case AArch64CC::HS:
+     return InOrder(AArch64ISD::CMHS);
+   }
+ }
+
+ /// Custom-lower a vector SETCC node.
+ ///
+ /// Integer element types map to a single vector comparison. f16 element
+ /// types are not handled here (an empty SDValue is returned). f32/f64 element
+ /// types may need up to two comparisons OR'ed together, optionally followed
+ /// by an inversion. The comparison result is sign-extended or truncated to
+ /// the node's result type.
+ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
+                                            SelectionDAG &DAG) const {
+   SDValue LHS = Op.getOperand(0);
+   SDValue RHS = Op.getOperand(1);
+   const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+   EVT OperandVT = LHS.getValueType();
+   EVT EltVT = OperandVT.getVectorElementType();
+   EVT CmpVT = OperandVT.changeVectorElementTypeToInteger();
+   EVT ResultVT = Op.getValueType();
+   SDLoc dl(Op);
+
+   if (EltVT.isInteger()) {
+     assert(LHS.getValueType() == RHS.getValueType());
+     AArch64CC::CondCode IntCC = changeIntCCToAArch64CC(CC);
+     SDValue IntCmp =
+         EmitVectorComparison(LHS, RHS, IntCC, false, CmpVT, dl, DAG);
+     return DAG.getSExtOrTrunc(IntCmp, dl, ResultVT);
+   }
+
+   if (EltVT == MVT::f16)
+     return SDValue();
+
+   assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+
+   // The mapping of LLVM FP condition codes onto AArch64 condition codes is
+   // not one-to-one: some need two comparisons, some need the result inverted.
+   AArch64CC::CondCode First, Second;
+   bool Invert;
+   changeVectorFPCCToAArch64CC(CC, First, Second, Invert);
+
+   const bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
+   SDValue Result =
+       EmitVectorComparison(LHS, RHS, First, NoNaNs, CmpVT, dl, DAG);
+   if (!Result.getNode())
+     return SDValue();
+
+   if (Second != AArch64CC::AL) {
+     SDValue Extra =
+         EmitVectorComparison(LHS, RHS, Second, NoNaNs, CmpVT, dl, DAG);
+     if (!Extra.getNode())
+       return SDValue();
+     Result = DAG.getNode(ISD::OR, dl, CmpVT, Result, Extra);
+   }
+
+   Result = DAG.getSExtOrTrunc(Result, dl, ResultVT);
+   if (!Invert)
+     return Result;
+   return DAG.getNOT(dl, Result, Result.getValueType());
+ }
+
+ /// Describe the memory behaviour of the AArch64 intrinsics that touch memory
+ /// so they can be represented as MemIntrinsicNodes.
+ ///
+ /// Fills \p Info and returns true for the NEON structured load/store
+ /// intrinsics and for the exclusive load/store intrinsics listed below;
+ /// returns false (leaving \p Info untouched) for every other intrinsic.
+ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                                const CallInst &I,
+                                                unsigned Intrinsic) const {
+   auto &DL = I.getModule()->getDataLayout();
+
+   // Every handled intrinsic either only reads or only writes memory, and
+   // always uses a zero offset, so one helper can populate all the fields.
+   auto Describe = [&Info](unsigned Opc, EVT MemVT, Value *Ptr, unsigned Align,
+                           bool IsVolatile, bool IsLoad) {
+     Info.opc = Opc;
+     Info.memVT = MemVT;
+     Info.ptrVal = Ptr;
+     Info.offset = 0;
+     Info.align = Align;
+     Info.vol = IsVolatile;
+     Info.readMem = IsLoad;
+     Info.writeMem = !IsLoad;
+     return true;
+   };
+
+   switch (Intrinsic) {
+   default:
+     return false;
+
+   case Intrinsic::aarch64_neon_ld2:
+   case Intrinsic::aarch64_neon_ld3:
+   case Intrinsic::aarch64_neon_ld4:
+   case Intrinsic::aarch64_neon_ld1x2:
+   case Intrinsic::aarch64_neon_ld1x3:
+   case Intrinsic::aarch64_neon_ld1x4:
+   case Intrinsic::aarch64_neon_ld2lane:
+   case Intrinsic::aarch64_neon_ld3lane:
+   case Intrinsic::aarch64_neon_ld4lane:
+   case Intrinsic::aarch64_neon_ld2r:
+   case Intrinsic::aarch64_neon_ld3r:
+   case Intrinsic::aarch64_neon_ld4r: {
+     // Conservatively set memVT to the entire set of vectors loaded, counted
+     // in 64-bit units. The pointer is the last call argument.
+     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+     EVT MemVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+     // Volatile loads with NEON intrinsics are not supported.
+     return Describe(ISD::INTRINSIC_W_CHAIN, MemVT,
+                     I.getArgOperand(I.getNumArgOperands() - 1), 0,
+                     /*IsVolatile=*/false, /*IsLoad=*/true);
+   }
+
+   case Intrinsic::aarch64_neon_st2:
+   case Intrinsic::aarch64_neon_st3:
+   case Intrinsic::aarch64_neon_st4:
+   case Intrinsic::aarch64_neon_st1x2:
+   case Intrinsic::aarch64_neon_st1x3:
+   case Intrinsic::aarch64_neon_st1x4:
+   case Intrinsic::aarch64_neon_st2lane:
+   case Intrinsic::aarch64_neon_st3lane:
+   case Intrinsic::aarch64_neon_st4lane: {
+     // Conservatively set memVT to the entire set of vectors stored: sum the
+     // sizes of the vector arguments from index 1 up to the first non-vector
+     // argument.
+     unsigned NumElts = 0;
+     for (unsigned Idx = 1, End = I.getNumArgOperands(); Idx < End; ++Idx) {
+       Type *ArgTy = I.getArgOperand(Idx)->getType();
+       if (!ArgTy->isVectorTy())
+         break;
+       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+     }
+     EVT MemVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+     // Volatile stores with NEON intrinsics are not supported.
+     return Describe(ISD::INTRINSIC_VOID, MemVT,
+                     I.getArgOperand(I.getNumArgOperands() - 1), 0,
+                     /*IsVolatile=*/false, /*IsLoad=*/false);
+   }
+
+   case Intrinsic::aarch64_ldaxr:
+   case Intrinsic::aarch64_ldxr: {
+     // The accessed type is the pointee type of argument 0.
+     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+     return Describe(ISD::INTRINSIC_W_CHAIN, MVT::getVT(PtrTy->getElementType()),
+                     I.getArgOperand(0),
+                     DL.getABITypeAlignment(PtrTy->getElementType()),
+                     /*IsVolatile=*/true, /*IsLoad=*/true);
+   }
+
+   case Intrinsic::aarch64_stlxr:
+   case Intrinsic::aarch64_stxr: {
+     // The accessed type is the pointee type of argument 1.
+     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+     return Describe(ISD::INTRINSIC_W_CHAIN, MVT::getVT(PtrTy->getElementType()),
+                     I.getArgOperand(1),
+                     DL.getABITypeAlignment(PtrTy->getElementType()),
+                     /*IsVolatile=*/true, /*IsLoad=*/false);
+   }
+
+   case Intrinsic::aarch64_ldaxp:
+   case Intrinsic::aarch64_ldxp:
+     return Describe(ISD::INTRINSIC_W_CHAIN, MVT::i128, I.getArgOperand(0), 16,
+                     /*IsVolatile=*/true, /*IsLoad=*/true);
+
+   case Intrinsic::aarch64_stlxp:
+   case Intrinsic::aarch64_stxp:
+     return Describe(ISD::INTRINSIC_W_CHAIN, MVT::i128, I.getArgOperand(2), 16,
+                     /*IsVolatile=*/true, /*IsLoad=*/false);
+   }
+ }
+
+ // Truncating an integer type to a strictly narrower integer type is treated
+ // as free (e.g. 64-bit GPR to 32-bit GPR). Non-integer types never qualify.
+ bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+   const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
+   if (!BothIntegers)
+     return false;
+   const unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
+   const unsigned DstBits = Ty2->getPrimitiveSizeInBits();
+   return DstBits < SrcBits;
+ }
+ // EVT flavour of the check above: both types must be scalar integers and the
+ // destination must be strictly narrower than the source.
+ bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+   if (VT1.isVector() || VT2.isVector())
+     return false;
+   if (!VT1.isInteger() || !VT2.isInteger())
+     return false;
+   const unsigned SrcBits = VT1.getSizeInBits();
+   const unsigned DstBits = VT2.getSizeInBits();
+   return DstBits < SrcBits;
+ }
+
+ /// Check if it is profitable to hoist an instruction in then/else to if.
+ /// Hoisting is not profitable if \p I is an FMul whose single user is an
+ /// FAdd/FSub and the pair can be fused into an FMA, because FMSUB/FMADD are
+ /// preferred.
+ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
+   // Only a single-use FMul can be half of a fusable pair.
+   if (I->getOpcode() != Instruction::FMul || I->getNumUses() != 1)
+     return true;
+
+   Instruction *User = I->user_back();
+   if (User) {
+     const unsigned UserOpc = User->getOpcode();
+     if (UserOpc != Instruction::FSub && UserOpc != Instruction::FAdd)
+       return true;
+   }
+
+   const TargetOptions &Options = getTargetMachine().Options;
+   const DataLayout &DL = I->getModule()->getDataLayout();
+   EVT VT = getValueType(DL, User->getOperand(0)->getType());
+
+   // Fusion needs a fast and legal (or custom) FMA for this type ...
+   if (!isFMAFasterThanFMulAndFAdd(VT))
+     return true;
+   if (!isOperationLegalOrCustom(ISD::FMA, VT))
+     return true;
+   // ... and options that allow the contraction.
+   const bool FusionAllowed =
+       Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath;
+   return !FusionAllowed;
+ }
+
+ // All 32-bit GPR operations implicitly zero the high half of the
+ // corresponding 64-bit GPR, so only an i32 -> i64 zero extension is free.
+ bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+   const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
+   if (!BothIntegers)
+     return false;
+   const unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
+   const unsigned DstBits = Ty2->getPrimitiveSizeInBits();
+   return SrcBits == 32 && DstBits == 64;
+ }
+ // EVT flavour of the check above: a scalar 32-bit integer zero-extended to a
+ // scalar 64-bit integer is free; nothing else is.
+ bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+   if (VT1.isVector() || VT2.isVector())
+     return false;
+   if (!VT1.isInteger() || !VT2.isInteger())
+     return false;
+   const unsigned SrcBits = VT1.getSizeInBits();
+   const unsigned DstBits = VT2.getSizeInBits();
+   return SrcBits == 32 && DstBits == 64;
+ }
+
+ // Value-based flavour: besides the type-only rule, a zero extension of a
+ // load result is free when both types are simple scalar integers and the
+ // loaded value is at most 32 bits wide.
+ bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+   EVT VT1 = Val.getValueType();
+   if (isZExtFree(VT1, VT2))
+     return true;
+
+   if (Val.getOpcode() != ISD::LOAD)
+     return false;
+
+   auto IsSimpleScalarInt = [](EVT VT) {
+     return VT.isSimple() && !VT.isVector() && VT.isInteger();
+   };
+
+   // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
+   if (!IsSimpleScalarInt(VT1) || !IsSimpleScalarInt(VT2))
+     return false;
+   return VT1.getSizeInBits() <= 32;
+ }
+
+ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { // Returns true only if every use of Ext passes the per-use checks below.
+ if (isa<FPExtInst>(Ext)) // Floating-point extensions are never reported as free.
+ return false;
+
+ // Vector types are not free.
+ if (Ext->getType()->isVectorTy())
+ return false;
+
+ for (const Use &U : Ext->uses()) {
+ // The extension is free if we can fold it with a left shift in an
+ // addressing mode or an arithmetic operation: add, sub, and cmp.
+
+ // Look at the instruction that consumes this use.
+ const Instruction *Instr = cast<Instruction>(U.getUser());
+
+ // Accept only: shl by a constant, a GEP index with a foldable scale, or a
+ // trunc back to the pre-extension type. Any other user makes Ext not free.
+ switch (Instr->getOpcode()) {
+ case Instruction::Shl:
+ if (!isa<ConstantInt>(Instr->getOperand(1))) // Shift amount must be a constant.
+ return false;
+ break;
+ case Instruction::GetElementPtr: {
+ gep_type_iterator GTI = gep_type_begin(Instr);
+ auto &DL = Ext->getModule()->getDataLayout();
+ std::advance(GTI, U.getOperandNo()-1); // Step to the indexed type for the GEP operand that uses Ext.
+ Type *IdxTy = GTI.getIndexedType();
+ // This extension will end up with a shift because of the scaling factor.
+ // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
+ // Get the shift amount based on the scaling factor:
+ // log2(store size of IdxTy in bits) - log2(8).
+ uint64_t ShiftAmt =
+ countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
+ // Is the constant foldable in the shift of the addressing mode?
+ // I.e., shift amount is between 1 and 4 inclusive.
+ if (ShiftAmt == 0 || ShiftAmt > 4)
+ return false;
+ break;
+ }
+ case Instruction::Trunc:
+ // Check if this is a noop.
+ // trunc(sext ty1 to ty2) to ty1.
+ if (Instr->getType() == Ext->getOperand(0)->getType())
+ continue; // This use is fine; move on to the next one.
+ LLVM_FALLTHROUGH; // Any other trunc is handled like an unsupported user.
+ default:
+ return false;
+ }
+
+ // At this point we can use the bfm family, so this extension is free
+ // for that use.
+ }
+ return true; // No use disqualified the extension (also reached when there are no uses).
+ }
+
+ // Report whether loads of LoadedType can be paired. Only simple integer or
+ // floating-point types of exactly 32 or 64 bits qualify. RequiredAligment is
+ // set to 0 once the type-kind check has passed, and is left untouched when
+ // that check fails.
+ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
+                                           unsigned &RequiredAligment) const {
+   if (!LoadedType.isSimple())
+     return false;
+   if (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())
+     return false;
+
+   // Cyclone supports unaligned accesses.
+   RequiredAligment = 0;
+
+   switch (LoadedType.getSizeInBits()) {
+   case 32:
+   case 64:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// \brief Lower an interleaved load into a ldN intrinsic.
+ /// Returns false, without changing the IR, when NEON is unavailable or the shuffle result type is not 64 or 128 bits wide; returns true after the rewrite.
+ /// E.g. Lower an interleaved load (Factor = 2):
+ /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+ /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
+ /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
+ ///
+ /// Into:
+ /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
+ /// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
+ /// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
+ bool AArch64TargetLowering::lowerInterleavedLoad(
+ LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices, unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+ assert(!Shuffles.empty() && "Empty shufflevector input");
+ assert(Shuffles.size() == Indices.size() &&
+ "Unmatched number of shufflevectors and indices");
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
+ VectorType *VecTy = Shuffles[0]->getType(); // Type of one de-interleaved vector, taken from the first shuffle.
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+
+ // Skip if we do not have NEON and skip illegal vector types.
+ if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
+ return false;
+
+ // A pointer vector can not be the return type of the ldN intrinsics. Need to
+ // load integer vectors first and then convert to pointer vectors.
+ Type *EltTy = VecTy->getVectorElementType();
+ if (EltTy->isPointerTy())
+ VecTy =
+ VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+
+ Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *Tys[2] = {VecTy, PtrTy};
+ static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
+ Intrinsic::aarch64_neon_ld3,
+ Intrinsic::aarch64_neon_ld4};
+ Function *LdNFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); // Factor 2/3/4 selects ld2/ld3/ld4.
+
+ IRBuilder<> Builder(LI); // New instructions are built at the position of the original load.
+ Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+
+ CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+
+ // Replace uses of each shufflevector with the corresponding vector loaded
+ // by ldN. The shufflevectors and the original load are not erased here.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SVI = Shuffles[i];
+ unsigned Index = Indices[i]; // Which member of the ldN result this shuffle corresponds to.
+
+ Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+
+ SVI->replaceAllUsesWith(SubVec);
+ }
+
+ return true;
+ }
+
+ /// \brief Build a constant vector of consecutive i32 values beginning at
+ /// \p Start.
+ ///
+ /// The result is <Start, Start + 1, ..., Start + NumElts - 1>.
+ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+                                    unsigned NumElts) {
+   SmallVector<Constant *, 16> Elements;
+   unsigned Next = Start;
+   for (unsigned Remaining = NumElts; Remaining != 0; --Remaining)
+     Elements.push_back(Builder.getInt32(Next++));
+
+   return ConstantVector::get(Elements);
+ }
+
+ /// \brief Lower an interleaved store into a stN intrinsic.
+ ///
+ /// Returns false, without changing the IR, when NEON is unavailable or the
+ /// per-lane sub-vector is not 64 or 128 bits wide; returns true after the stN
+ /// call has been emitted in front of \p SI. The original store and shuffle
+ /// are not erased here.
+ ///
+ /// E.g. Lower an interleaved store (Factor = 3):
+ /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+ /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+ /// store <12 x i32> %i.vec, <12 x i32>* %ptr
+ ///
+ /// Into:
+ /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
+ /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
+ /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
+ /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
+ ///
+ /// Note that the new shufflevectors will be removed and we'll only generate one
+ /// st3 instruction in CodeGen.
+ ///
+ /// Example for a more general valid mask (Factor 3). Lower:
+ /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+ /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+ /// store <12 x i32> %i.vec, <12 x i32>* %ptr
+ ///
+ /// Into:
+ /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
+ /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
+ /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
+ /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
+ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+                                                   ShuffleVectorInst *SVI,
+                                                   unsigned Factor) const {
+   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+          "Invalid interleave factor");
+
+   VectorType *VecTy = SVI->getType();
+   assert(VecTy->getVectorNumElements() % Factor == 0 &&
+          "Invalid interleaved store");
+
+   // Each of the Factor lanes contributes LaneLen elements to the wide vector.
+   unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
+   Type *EltTy = VecTy->getVectorElementType();
+   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+
+   const DataLayout &DL = SI->getModule()->getDataLayout();
+   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+
+   // Skip if we do not have NEON and skip illegal vector types.
+   if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
+     return false;
+
+   Value *Op0 = SVI->getOperand(0);
+   Value *Op1 = SVI->getOperand(1);
+   IRBuilder<> Builder(SI);
+
+   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+   // vectors to integer vectors.
+   if (EltTy->isPointerTy()) {
+     Type *IntTy = DL.getIntPtrType(EltTy);
+     // The shuffle operands are always vectors, so use cast<> (which checks
+     // the type) rather than dereferencing an unchecked dyn_cast<> result.
+     unsigned NumOpElts =
+         cast<VectorType>(Op0->getType())->getVectorNumElements();
+
+     // Convert to the corresponding integer vector.
+     Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+     SubVecTy = VectorType::get(IntTy, LaneLen);
+   }
+
+   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
+   Type *Tys[2] = {SubVecTy, PtrTy};
+   // Factor 2/3/4 selects st2/st3/st4.
+   static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
+                                              Intrinsic::aarch64_neon_st3,
+                                              Intrinsic::aarch64_neon_st4};
+   Function *StNFunc =
+       Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+   SmallVector<Value *, 5> Ops;
+
+   // Split the shufflevector operands into sub vectors for the new stN call.
+   auto Mask = SVI->getShuffleMask();
+   for (unsigned i = 0; i < Factor; i++) {
+     if (Mask[i] >= 0) {
+       Ops.push_back(Builder.CreateShuffleVector(
+           Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+     } else {
+       // The first element of this lane is undef: derive the lane's start
+       // index from its first defined element, if there is one. Element 0 is
+       // already known to be undef, so the search starts at 1.
+       unsigned StartMask = 0;
+       for (unsigned j = 1; j < LaneLen; j++) {
+         if (Mask[j*Factor + i] >= 0) {
+           StartMask = Mask[j*Factor + i] - j;
+           break;
+         }
+       }
+       // Note: If all elements in a chunk are undefs, StartMask=0!
+       // Note: Filling undef gaps with random elements is ok, since
+       // those elements were being written anyway (with undefs).
+       // In the case of all undefs we're defaulting to using elems from 0
+       // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
+       Ops.push_back(Builder.CreateShuffleVector(
+           Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+     }
+   }
+
+   // The destination pointer is the last stN argument.
+   Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
+   Builder.CreateCall(StNFunc, Ops);
+   return true;
+ }
+
+ // Return true when both the source and the destination alignment are either
+ // unknown (0) or a multiple of AlignCheck.
+ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+                        unsigned AlignCheck) {
+   auto Satisfies = [AlignCheck](unsigned Align) {
+     return Align == 0 || Align % AlignCheck == 0;
+   };
+   return Satisfies(SrcAlign) && Satisfies(DstAlign);
+ }
+
+ // Pick the widest type to use when expanding a memory operation of Size
+ // bytes: f128, then i64, then i32, or MVT::Other when none applies. A type is
+ // usable when the known alignments are sufficient for it or when misaligned
+ // accesses of that type are allowed and fast.
+ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                                unsigned SrcAlign, bool IsMemset,
+                                                bool ZeroMemset,
+                                                bool MemcpyStrSrc,
+                                                MachineFunction &MF) const {
+   const Function *F = MF.getFunction();
+
+   auto Usable = [&](EVT Ty, unsigned AlignCheck) {
+     bool Fast;
+     return memOpAlign(SrcAlign, DstAlign, AlignCheck) ||
+            (allowsMisalignedMemoryAccesses(Ty, 0, 1, &Fast) && Fast);
+   };
+
+   // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
+   // instruction to materialize the v2i64 zero and one store (with restrictive
+   // addressing mode). Just do two i64 store of zero-registers.
+   const bool MayUseFP128 = Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
+                            !F->hasFnAttribute(Attribute::NoImplicitFloat);
+   if (MayUseFP128 && Usable(MVT::f128, 16))
+     return MVT::f128;
+
+   if (Size >= 8 && Usable(MVT::i64, 8))
+     return MVT::i64;
+
+   if (Size >= 4 && Usable(MVT::i32, 4))
+     return MVT::i32;
+
+   return MVT::Other;
+ }
+
+ // An add immediate is legal when its magnitude fits in 12 bits, or fits in
+ // 24 bits with the low 12 bits clear (i.e. a 12-bit value shifted left by 12).
+ bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
+   // INT64_MIN has no positive counterpart; reject it up front to avoid UB.
+   if (Immed == std::numeric_limits<int64_t>::min())
+     return false;
+
+   // add and sub share the encoding, so only the magnitude matters.
+   const int64_t Magnitude = Immed < 0 ? -Immed : Immed;
+
+   if ((Magnitude >> 12) == 0)
+     return true;
+   const bool LowBitsClear = (Magnitude & 0xfff) == 0;
+   return LowBitsClear && (Magnitude >> 24) == 0;
+ }
+
+ // Integer comparisons are implemented with ADDS/SUBS, so a compare immediate
+ // is legal exactly when it is a legal add immediate.
+ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
+   const bool LegalForAdd = isLegalAddImmediate(Immed);
+   return LegalForAdd;
+ }
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ ///
+ /// AArch64 has five basic addressing modes:
+ ///   reg
+ ///   reg + 9-bit signed offset
+ ///   reg + SIZE_IN_BYTES * 12-bit unsigned offset
+ ///   reg1 + reg2
+ ///   reg + SIZE_IN_BYTES * reg
+ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                   const AddrMode &AM, Type *Ty,
+                                                   unsigned AS) const {
+   // No global is ever allowed as a base.
+   if (AM.BaseGV)
+     return false;
+
+   // No reg+reg+imm addressing.
+   if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
+     return false;
+
+   // Access size in bytes. It stays 0 for unsized types and for types whose
+   // bit width is not a power of two (or is narrower than one byte).
+   uint64_t NumBytes = 0;
+   if (Ty->isSized()) {
+     const uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+     if (isPowerOf2_64(NumBits))
+       NumBytes = NumBits / 8;
+   }
+
+   // Scaled forms: reg1 + reg2 and reg1 + SIZE_IN_BYTES * reg2.
+   if (AM.Scale)
+     return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
+
+   // Unscaled forms: reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12.
+   const int64_t Offset = AM.BaseOffs;
+
+   // 9-bit signed offset.
+   if (isInt<9>(Offset))
+     return true;
+
+   // 12-bit unsigned offset, counted in units of the access size.
+   if (NumBytes == 0 || Offset <= 0)
+     return false;
+   const uint64_t UOffset = Offset;
+   if (UOffset / NumBytes > (1ULL << 12) - 1)
+     return false;
+   // Must be a multiple of NumBytes (a power of two whenever it is non-zero).
+   return UOffset % NumBytes == 0;
+ }
+
+ // Cost of the scaling factor in addressing mode AM for an access of type Ty:
+ // -1 if the mode is not legal, 1 if it is legal and uses a scale other than
+ // 0 or 1, and 0 otherwise.
+ int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
+                                                 const AddrMode &AM, Type *Ty,
+                                                 unsigned AS) const {
+   // Scaling factors are not free at all.
+   // Operands                     | Rt Latency
+   // -------------------------------------------
+   // Rt, [Xn, Xm]                 | 4
+   // -------------------------------------------
+   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
+   // Rt, [Xn, Wm, <extend> #imm]  |
+   if (!isLegalAddressingMode(DL, AM, Ty, AS))
+     return -1;
+
+   // Scale represents reg2 * scale, thus account for 1 if
+   // it is not equal to 0 or 1.
+   const bool UsesRealScale = AM.Scale != 0 && AM.Scale != 1;
+   return UsesRealScale ? 1 : 0;
+ }
+
+ // A fused multiply-add is reported as faster than separate multiply and add
+ // for types whose scalar element type is f32 or f64, and for nothing else.
+ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+   EVT ScalarVT = VT.getScalarType();
+   if (!ScalarVT.isSimple())
+     return false;
+
+   const auto Ty = ScalarVT.getSimpleVT().SimpleTy;
+   return Ty == MVT::f32 || Ty == MVT::f64;
+ }
+
+ // Return the list of scratch registers: X16, X17 and LR, followed by a 0
+ // entry. The calling convention argument is ignored.
+ const MCPhysReg *
+ AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
+   // LR is a callee-save register, but it has to be treated as clobbered by
+   // any call site. It is therefore listed with the scratch registers, which
+   // in turn are added as implicit-defs for stackmaps and patchpoints.
+   static const MCPhysReg ClobberedAtCallSites[] = {AArch64::X16, AArch64::X17,
+                                                    AArch64::LR, 0};
+   return ClobberedAtCallSites;
+ }
+
+ // Return false only when N is an unsigned bit extraction of the form
+ // ((x >> C) & mask) on i32/i64, so that it is not combined with a shift and
+ // can still be lowered to UBFX. Everything else may be commuted.
+ bool
+ AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
+   EVT VT = N->getValueType(0);
+
+   const bool IsAndWithConstant = N->getOpcode() == ISD::AND &&
+                                  (VT == MVT::i32 || VT == MVT::i64) &&
+                                  isa<ConstantSDNode>(N->getOperand(1));
+   if (!IsAndWithConstant)
+     return true;
+
+   const uint64_t TruncMask = N->getConstantOperandVal(1);
+   if (!isMask_64(TruncMask))
+     return true;
+
+   // The masked value must itself be a logical right shift by a constant.
+   SDValue Shifted = N->getOperand(0);
+   const bool IsConstantSrl = Shifted.getOpcode() == ISD::SRL &&
+                              isa<ConstantSDNode>(Shifted->getOperand(1));
+   return !IsConstantSrl;
+ }
+
+ // Decide whether an integer constant should be materialized as an immediate
+ // rather than loaded. Zero and logical immediates always qualify. Otherwise
+ // the value (bitwise-inverted if negative, truncated to 32 bits for i32)
+ // qualifies when its highest set bit lies within the low three 16-bit chunks.
+ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                               Type *Ty) const {
+   assert(Ty->isIntegerTy());
+
+   const unsigned BitSize = Ty->getPrimitiveSizeInBits();
+   if (BitSize == 0)
+     return false;
+
+   const int64_t Val = Imm.getSExtValue();
+   if (Val == 0)
+     return true;
+   if (AArch64_AM::isLogicalImmediate(Val, BitSize))
+     return true;
+
+   // Negative values are examined through their bitwise complement.
+   uint64_t Bits = Val < 0 ? ~Val : Val;
+   if (BitSize == 32)
+     Bits &= (1LL << 32) - 1;
+
+   const unsigned LZ = countLeadingZeros(Bits);
+   // Index of the highest 16-bit chunk that holds a set bit.
+   const unsigned Shift = (63 - LZ) / 16;
+   // MOVZ is free so return true for one or fewer MOVK.
+   return Shift < 3;
+ }
+
+ /// Turn vector tests of the signbit in the form of:
+ ///   xor (sra X, elt_size(X)-1), -1
+ /// into:
+ ///   cmge X, X, #0
+ ///
+ /// Returns an empty SDValue when NEON is unavailable, the type is not a
+ /// vector, or the pattern does not match.
+ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+                                          const AArch64Subtarget *Subtarget) {
+   EVT VT = N->getValueType(0);
+   if (!Subtarget->hasNEON())
+     return SDValue();
+   if (!VT.isVector())
+     return SDValue();
+
+   // The first operand must be a single-use arithmetic shift right ...
+   SDValue Shift = N->getOperand(0);
+   if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse())
+     return SDValue();
+   // ... and the second an all-ones vector, making the xor a 'not'.
+   SDValue Ones = N->getOperand(1);
+   if (!ISD::isBuildVectorAllOnes(Ones.getNode()))
+     return SDValue();
+
+   // The shift should be smearing the sign bit across each vector element.
+   auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+   if (!ShiftAmt)
+     return SDValue();
+   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
+   if (ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+     return SDValue();
+
+   return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
+ }
+
+ // Generate SUBS and CSEL for integer abs.
+ //
+ // Matches XOR(ADD(X, Y), Y) where Y is SRA(X, size(X)-1) and replaces it with
+ // CSEL(X, 0 - X, PL, flags of SUBS(X, 0)). Returns an empty SDValue when the
+ // pattern does not match.
+ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+   EVT VT = N->getValueType(0);
+
+   SDValue N0 = N->getOperand(0);
+   SDValue N1 = N->getOperand(1);
+   SDLoc DL(N);
+
+   const bool MatchesShape =
+       VT.isInteger() && N->getOpcode() == ISD::XOR &&
+       N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+       N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0);
+   if (!MatchesShape)
+     return SDValue();
+
+   // The shift amount must be the constant size(X) - 1.
+   ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+   if (!ShiftAmt)
+     return SDValue();
+   if (ShiftAmt->getAPIntValue() == VT.getSizeInBits() - 1) {
+     SDValue X = N0.getOperand(0);
+     SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), X);
+     // Generate SUBS & CSEL.
+     SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+                               X, DAG.getConstant(0, DL, VT));
+     SDValue Flags(Cmp.getNode(), 1);
+     return DAG.getNode(AArch64ISD::CSEL, DL, VT, X, Neg,
+                        DAG.getConstant(AArch64CC::PL, DL, MVT::i32), Flags);
+   }
+   return SDValue();
+ }
+
+ // XOR combine entry point. Does nothing before operation legalization;
+ // afterwards it tries the vector sign-bit-test fold first and falls back to
+ // the integer abs combine.
+ static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const AArch64Subtarget *Subtarget) {
+   if (DCI.isBeforeLegalizeOps())
+     return SDValue();
+
+   SDValue SignTest = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
+   if (SignTest)
+     return SignTest;
+
+   return performIntegerAbsCombine(N, DAG);
+ }
+
+ // Expand a signed division by a power of two (or a negated power of two)
+ // into compare + add + csel + sra, negating the result for negative divisors.
+ //
+ // Returns N itself when integer division is cheap for the type, and an empty
+ // SDValue when the type is not i32/i64 or the divisor is not (+/-) a power of
+ // two. Intermediate nodes are appended to Created when it is non-null.
+ SDValue
+ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                      SelectionDAG &DAG,
+                                      std::vector<SDNode *> *Created) const {
+   AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+   if (isIntDivCheap(N->getValueType(0), Attr))
+     return SDValue(N,0); // Lower SDIV as SDIV
+
+   // fold (sdiv X, pow2)
+   EVT VT = N->getValueType(0);
+   const bool IsGPRType = VT == MVT::i32 || VT == MVT::i64;
+   if (!IsGPRType)
+     return SDValue();
+   if (!Divisor.isPowerOf2() && !(-Divisor).isPowerOf2())
+     return SDValue();
+
+   SDLoc DL(N);
+   SDValue Dividend = N->getOperand(0);
+   const unsigned Lg2 = Divisor.countTrailingZeros();
+   SDValue Zero = DAG.getConstant(0, DL, VT);
+   SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
+
+   // Bias negative dividends: add (Dividend < 0) ? Pow2 - 1 : 0.
+   SDValue CCVal;
+   SDValue Cmp = getAArch64Cmp(Dividend, Zero, ISD::SETLT, CCVal, DAG, DL);
+   SDValue Biased = DAG.getNode(ISD::ADD, DL, VT, Dividend, Pow2MinusOne);
+   SDValue Selected =
+       DAG.getNode(AArch64ISD::CSEL, DL, VT, Biased, Dividend, CCVal, Cmp);
+
+   if (Created) {
+     Created->push_back(Cmp.getNode());
+     Created->push_back(Biased.getNode());
+     Created->push_back(Selected.getNode());
+   }
+
+   // Divide by pow2.
+   SDValue Quotient = DAG.getNode(ISD::SRA, DL, VT, Selected,
+                                  DAG.getConstant(Lg2, DL, MVT::i64));
+
+   // If we're dividing by a positive value, we're done. Otherwise, we must
+   // negate the result.
+   if (Divisor.isNonNegative())
+     return Quotient;
+
+   if (Created)
+     Created->push_back(Quotient.getNode());
+   return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Quotient);
+ }
+
+/// Rewrite a multiply by certain constants into shift/add/sub sequences once
+/// operations have been legalized.
+///
+/// Handled multipliers C (M is the number of trailing zero bits of C):
+///   C ==  2^N + 1          => (add (shl x, N), x)
+///   C ==  2^N - 1          => (sub (shl x, N), x)
+///   C == (2^N + 1) * 2^M   => (shl (add (shl x, N), x), M)
+///   C == -(2^N - 1)        => (sub x, (shl x, N))
+///   C == -(2^N + 1)        => (sub 0, (add (shl x, N), x))
+/// Any other multiplier, or a non-constant right-hand side, yields an empty
+/// SDValue.
+static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Every rewrite below needs a constant right-hand side.
+  ConstantSDNode *MulC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!MulC)
+    return SDValue();
+  const APInt &Factor = MulC->getAPIntValue();
+
+  // Multiplying by a power of two plus/minus one is cheaper as shift+add/sub.
+  // This is applied unconditionally for now; if future CPUs get a cheaper
+  // MADD it may need to be gated on a subtarget feature. On Cyclone a 32-bit
+  // MADD takes 4 cycles and a 64-bit one 5 cycles, so this is always a win.
+  // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
+  // which equals (1+2)*16-(1+2).
+  SDValue X = N->getOperand(0);
+
+  // Trailing zero bits of the multiplier select the shift+add+shift form.
+  unsigned LowZeros = Factor.countTrailingZeros();
+  if (LowZeros) {
+    // Stay conservative if the mul might still be folded into smul or umul.
+    if (X->hasOneUse() &&
+        (isSignExtended(X.getNode(), DAG) || isZeroExtended(X.getNode(), DAG)))
+      return SDValue();
+    // Stay conservative if the mul might still be folded into madd or msub.
+    if (N->hasOneUse()) {
+      unsigned UserOpc = N->use_begin()->getOpcode();
+      if (UserOpc == ISD::ADD || UserOpc == ISD::SUB)
+        return SDValue();
+    }
+  }
+
+  // Working on the multiplier with its trailing zeros shifted out lets one
+  // code path serve both shift+add/sub and shift+add+shift.
+  APInt OddPart = Factor.ashr(LowZeros);
+
+  unsigned ShlAmount, CombineOpc;
+  // True when the shifted value is the left operand of the add/sub.
+  bool ShiftedIsLHS = true;
+  // True when the add/sub result still has to be negated.
+  bool Negate = false;
+
+  if (Factor.isNonNegative()) {
+    APInt OddMinus1 = OddPart - 1;
+    APInt FactorPlus1 = Factor + 1;
+    if (OddMinus1.isPowerOf2()) {
+      ShlAmount = OddMinus1.logBase2();
+      CombineOpc = ISD::ADD;
+    } else if (FactorPlus1.isPowerOf2()) {
+      ShlAmount = FactorPlus1.logBase2();
+      CombineOpc = ISD::SUB;
+    } else {
+      return SDValue();
+    }
+  } else {
+    APInt NegPlus1 = -Factor + 1;
+    APInt NegMinus1 = -Factor - 1;
+    if (NegPlus1.isPowerOf2()) {
+      ShlAmount = NegPlus1.logBase2();
+      CombineOpc = ISD::SUB;
+      ShiftedIsLHS = false;
+    } else if (NegMinus1.isPowerOf2()) {
+      ShlAmount = NegMinus1.logBase2();
+      CombineOpc = ISD::ADD;
+      Negate = true;
+    } else {
+      return SDValue();
+    }
+  }
+
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Shifted = DAG.getNode(ISD::SHL, DL, VT, X,
+                                DAG.getConstant(ShlAmount, DL, MVT::i64));
+
+  SDValue OpA = X;
+  SDValue OpB = Shifted;
+  if (ShiftedIsLHS)
+    std::swap(OpA, OpB);
+  SDValue Res = DAG.getNode(CombineOpc, DL, VT, OpA, OpB);
+
+  assert(!(NegateResult && TrailingZeroes) &&
+         "NegateResult and TrailingZeroes cannot both be true for now.");
+  if (Negate)
+    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
+  if (LowZeros)
+    return DAG.getNode(ISD::SHL, DL, VT, Res,
+                       DAG.getConstant(LowZeros, DL, MVT::i64));
+  return Res;
+}
+
+/// Fold a unary operation applied to a masked vector comparison into the mask
+/// constant, relying on vector comparisons producing 0 or -1 in each lane:
+///
+///   UNARYOP(AND(VECTOR_CMP(x, y), constant))
+///     --> AND(VECTOR_CMP(x, y), UNARYOP(constant))
+///
+/// Returns an empty SDValue when \p N does not have this shape.
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  // The operand has to be (and (setcc ...), ...) with the same total width as
+  // the result.
+  SDValue Masked = N->getOperand(0);
+  if (Masked->getOpcode() != ISD::AND)
+    return SDValue();
+  SDValue Compare = Masked->getOperand(0);
+  if (Compare->getOpcode() != ISD::SETCC)
+    return SDValue();
+  if (VT.getSizeInBits() != Masked->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // The mask must be a constant build_vector. Non-constant splats could be
+  // transformed too, but that would not remove any operation; it would only
+  // add a scalar step before moving to the vector unit.
+  BuildVectorSDNode *MaskBV =
+      dyn_cast<BuildVectorSDNode>(Masked->getOperand(1));
+  if (!MaskBV || !MaskBV->isConstant())
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT IntVT = MaskBV->getValueType(0);
+  // Apply the unary operation to the constant, then bitcast around the AND so
+  // that the AND itself stays on the integer vector type.
+  SDValue Converted = DAG.getNode(N->getOpcode(), DL, VT, SDValue(MaskBV, 0));
+  SDValue NewMask = DAG.getNode(ISD::BITCAST, DL, IntVT, Converted);
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Compare, NewMask);
+  return DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+}
+
+/// Combine for integer-to-floating-point conversions.
+///
+/// Vector conversions are first offered to
+/// performVectorCompareAndMaskUnaryOpCombine. For scalar f32/f64 results whose
+/// integer source has the same width, a single-use, non-volatile normal load
+/// feeding the conversion is replaced by a floating-point load followed by
+/// AArch64ISD::SITOF / AArch64ISD::UITOF, which avoids an
+/// integer-to-vector-register move.
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  if (SDValue Folded = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
+    return Folded;
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return SDValue();
+
+  // Source and destination must have the same width.
+  if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
+    return SDValue();
+
+  SDValue Src = N->getOperand(0);
+  if (!Subtarget->hasNEON() || !ISD::isNormalLoad(Src.getNode()) ||
+      !Src.hasOneUse())
+    return SDValue();
+
+  // Do not change the width of a volatile load.
+  LoadSDNode *IntLoad = cast<LoadSDNode>(Src);
+  if (IntLoad->isVolatile())
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue FPLoad =
+      DAG.getLoad(VT, DL, IntLoad->getChain(), IntLoad->getBasePtr(),
+                  IntLoad->getPointerInfo(), IntLoad->getAlignment(),
+                  IntLoad->getMemOperand()->getFlags());
+
+  // Redirect users of the old load's chain to the new load so that they stay
+  // ordered after it.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(IntLoad, 1), FPLoad.getValue(1));
+
+  unsigned ConvOpc = AArch64ISD::UITOF;
+  if (N->getOpcode() == ISD::SINT_TO_FP)
+    ConvOpc = AArch64ISD::SITOF;
+  return DAG.getNode(ConvOpc, DL, VT, FPLoad);
+}
+
+/// Fold a vector floating-point multiply by a splat power of two into the
+/// following floating-point to integer conversion, producing a
+/// floating-point to fixed-point conversion intrinsic
+/// (aarch64_neon_vcvtfp2fxs / aarch64_neon_vcvtfp2fxu). A TRUNCATE is added
+/// when the integer element type is narrower than the float element type.
+static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  // The converted value must be a simple-typed vector FMUL.
+  SDValue Mul = N->getOperand(0);
+  EVT MulVT = Mul.getValueType();
+  if (!MulVT.isVector() || !MulVT.isSimple() || Mul.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  // ... whose right-hand side is a build_vector.
+  BuildVectorSDNode *Splat = dyn_cast<BuildVectorSDNode>(Mul->getOperand(1));
+  if (!Splat)
+    return SDValue();
+
+  MVT FloatTy = Mul.getSimpleValueType().getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
+  if (FloatBits != 32 && FloatBits != 64)
+    return SDValue();
+
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
+  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+    return SDValue();
+
+  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
+  if (IntBits > FloatBits)
+    return SDValue();
+
+  // The multiplier must be a splat of 2^C with 1 <= C <= Bits.
+  BitVector UndefElements;
+  int32_t Bits = IntBits == 64 ? 64 : 32;
+  int32_t C = Splat->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
+  if (C == -1 || C == 0 || C > Bits)
+    return SDValue();
+
+  // Pick the integer vector type matching the float vector's shape.
+  MVT ResTy;
+  unsigned NumLanes = Mul.getValueType().getVectorNumElements();
+  if (NumLanes == 2)
+    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+  else if (NumLanes == 4)
+    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
+  else
+    return SDValue();
+
+  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
+         "Illegal vector type after legalization");
+
+  SDLoc DL(N);
+  unsigned IID = Intrinsic::aarch64_neon_vcvtfp2fxu;
+  if (N->getOpcode() == ISD::FP_TO_SINT)
+    IID = Intrinsic::aarch64_neon_vcvtfp2fxs;
+  SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
+                                DAG.getConstant(IID, DL, MVT::i32),
+                                Mul->getOperand(0),
+                                DAG.getConstant(C, DL, MVT::i32));
+
+  // Narrower integer results are produced with an extra truncate.
+  if (IntBits < FloatBits)
+    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
+
+  return FixConv;
+}
+
+/// Fold a vector floating-point divide by a splat power of two into the
+/// preceding integer to floating-point conversion, producing a fixed-point to
+/// floating-point conversion intrinsic (aarch64_neon_vcvtfxs2fp /
+/// aarch64_neon_vcvtfxu2fp). Integer inputs narrower than the float element
+/// type are sign- or zero-extended first.
+///
+/// Returns an empty SDValue when the pattern does not match.
+static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  unsigned Opc = Op->getOpcode();
+  // Confirm that the dividend is an int-to-fp conversion *before* looking at
+  // its operand: other node kinds need not have an operand 0 at all.
+  if (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)
+    return SDValue();
+  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+      !Op.getOperand(0).getValueType().isSimple())
+    return SDValue();
+
+  SDValue ConstVec = N->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+  int32_t IntBits = IntTy.getSizeInBits();
+  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+    return SDValue();
+
+  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+  int32_t FloatBits = FloatTy.getSizeInBits();
+  if (FloatBits != 32 && FloatBits != 64)
+    return SDValue();
+
+  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
+  if (IntBits > FloatBits)
+    return SDValue();
+
+  // The divisor must be a splat of 2^C with 1 <= C <= FloatBits.
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
+  if (C == -1 || C == 0 || C > FloatBits)
+    return SDValue();
+
+  // Integer vector type with the same shape as the float vector.
+  MVT ResTy;
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  switch (NumLanes) {
+  default:
+    return SDValue();
+  case 2:
+    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+    break;
+  case 4:
+    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
+    break;
+  }
+
+  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue ConvInput = Op.getOperand(0);
+  bool IsSigned = Opc == ISD::SINT_TO_FP;
+  // Widen narrower integer lanes so they match the float lane width.
+  if (IntBits < FloatBits)
+    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+                            ResTy, ConvInput);
+
+  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
+                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
+                     DAG.getConstant(C, DL, MVT::i32));
+}
+
+/// An EXTR is built from two constant shifts ORed together. Classify \p N as
+/// one of those halves: on success \p Src receives the shifted value,
+/// \p ShiftAmount the constant shift amount and \p FromHi is true for SRL and
+/// false for SHL. Returns false if \p N is not a SHL/SRL by a constant.
+static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
+                         bool &FromHi) {
+  switch (N.getOpcode()) {
+  case ISD::SHL:
+    FromHi = false;
+    break;
+  case ISD::SRL:
+    FromHi = true;
+    break;
+  default:
+    return false;
+  }
+
+  // Only constant shift amounts can be encoded.
+  if (!isa<ConstantSDNode>(N.getOperand(1)))
+    return false;
+
+  Src = N->getOperand(0);
+  ShiftAmount = N->getConstantOperandVal(1);
+  return true;
+}
+
+/// EXTR extracts a contiguous chunk of bits from two registers viewed as a
+/// high/low pair. Match
+///   (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
+/// in either operand order and replace it with an AArch64ISD::EXTR node. This
+/// cannot quite be done in TableGen because the two immediates are not
+/// independent.
+static SDValue tryCombineToEXTR(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N->getOpcode() == ISD::OR && "Unexpected root");
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  // Classify both OR operands as constant shifts.
+  SDValue Src0, Src1;
+  uint32_t Amt0 = 0, Amt1 = 0;
+  bool Src0IsSrl = false, Src1IsSrl = false;
+  if (!findEXTRHalf(N->getOperand(0), Src0, Amt0, Src0IsSrl))
+    return SDValue();
+  if (!findEXTRHalf(N->getOperand(1), Src1, Amt1, Src1IsSrl))
+    return SDValue();
+
+  // One half must be a left shift and the other a right shift.
+  if (Src0IsSrl == Src1IsSrl)
+    return SDValue();
+
+  // Together the shifts must cover exactly one register width.
+  if (Amt0 + Amt1 != VT.getSizeInBits())
+    return SDValue();
+
+  // Normalise so that operand 0 is the SHL half and operand 1 the SRL half.
+  if (Src0IsSrl) {
+    std::swap(Src0, Src1);
+    std::swap(Amt0, Amt1);
+  }
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  return DAG.getNode(AArch64ISD::EXTR, DL, VT, Src0, Src1,
+                     DAG.getConstant(Amt1, DL, MVT::i64));
+}
+
+/// Try to turn (or (and A, MaskA), (and B, MaskB)) on vectors into an
+/// AArch64ISD::BSL node when MaskA and MaskB are constant build_vectors that
+/// are lane-wise bitwise complements of each other. Only the constant-mask
+/// form is handled here; the general, variable case can be handled in
+/// TableGen.
+static SDValue tryCombineToBSL(SDNode *N,
+                               TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue And0 = N->getOperand(0);
+  if (And0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  SDValue And1 = N->getOperand(1);
+  if (And1.getOpcode() != ISD::AND)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  // Mask covering one lane, used to complement a lane constant.
+  unsigned LaneBits = VT.getScalarSizeInBits();
+  uint64_t LaneMask = LaneBits == 64 ? -1ULL : ((1ULL << LaneBits) - 1);
+
+  // Try every pairing of operands of the two ANDs as the two masks.
+  for (int i = 1; i >= 0; --i) {
+    BuildVectorSDNode *MaskA = dyn_cast<BuildVectorSDNode>(And0->getOperand(i));
+    if (!MaskA)
+      continue;
+
+    for (int j = 1; j >= 0; --j) {
+      BuildVectorSDNode *MaskB =
+          dyn_cast<BuildVectorSDNode>(And1->getOperand(j));
+      if (!MaskB)
+        continue;
+
+      // Every lane of MaskA must be the complement of the same lane of MaskB.
+      unsigned NumLanes = VT.getVectorNumElements();
+      unsigned Lane = 0;
+      for (; Lane < NumLanes; ++Lane) {
+        ConstantSDNode *CA = dyn_cast<ConstantSDNode>(MaskA->getOperand(Lane));
+        ConstantSDNode *CB = dyn_cast<ConstantSDNode>(MaskB->getOperand(Lane));
+        if (!CA || !CB)
+          break;
+        if (CA->getZExtValue() != (LaneMask & ~CB->getZExtValue()))
+          break;
+      }
+
+      if (Lane == NumLanes)
+        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(MaskA, 0),
+                           And0->getOperand(1 - i), And1->getOperand(1 - j));
+    }
+  }
+
+  return SDValue();
+}
+
+/// OR combine entry point: for legal result types, try to form an EXTR from
+/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and, failing that, a BSL.
+/// \p Subtarget is not consulted.
+static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                                const AArch64Subtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  SDValue Combined = tryCombineToEXTR(N, DCI);
+  if (!Combined)
+    Combined = tryCombineToBSL(N, DCI);
+  if (Combined)
+    return Combined;
+
+  return SDValue();
+}
+
+/// Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16) when the
+/// high 16 bits of x are known zero, and likewise
+/// (srl (bswap i64 x), 32) to (rotr (bswap i64 x), 32) when the high 32 bits
+/// of x are known zero.
+static SDValue performSRLCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  const bool Is32 = VT == MVT::i32;
+  if (!Is32 && VT != MVT::i64)
+    return SDValue();
+
+  SDValue Swapped = N->getOperand(0);
+  if (Swapped.getOpcode() != ISD::BSWAP)
+    return SDValue();
+
+  SDValue ShAmt = N->getOperand(1);
+  ConstantSDNode *ShAmtC = dyn_cast<ConstantSDNode>(ShAmt);
+  if (!ShAmtC)
+    return SDValue();
+
+  // The shift must be by exactly half the register width ...
+  const unsigned Width = Is32 ? 32 : 64;
+  const unsigned Half = Width / 2;
+  if (ShAmtC->getZExtValue() != Half)
+    return SDValue();
+
+  // ... and the upper half of the value being byte-swapped must be zero.
+  SelectionDAG &DAG = DCI.DAG;
+  if (!DAG.MaskedValueIsZero(Swapped.getOperand(0),
+                             APInt::getHighBitsSet(Width, Half)))
+    return SDValue();
+
+  return DAG.getNode(ISD::ROTR, SDLoc(N), VT, Swapped, ShAmt);
+}
+
+/// Remove extraneous bitcasts around an extract of half a vector. For example
+///   (v4i16 (bitconvert
+///      (extract_subvector (v2i64 (bitconvert (v8i16 ...))), (i64 1))))
+/// becomes
+///   (extract_subvector (v8i16 ...), (i64 4))
+///
+/// Only runs after operation legalization and only for 64-bit vector results.
+/// A low-half extract may also appear as an EXTRACT_SUBREG of dsub. Returns
+/// an empty SDValue when the pattern does not match.
+static SDValue performBitcastCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Only interested in 64-bit vectors as the ultimate result.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+  if (VT.getSimpleVT().getSizeInBits() != 64)
+    return SDValue();
+
+  // Is the operand an extract_subvector starting at the beginning or halfway
+  // point of the vector? A low half may also come through as an
+  // EXTRACT_SUBREG, so look for that, too.
+  SDValue Op0 = N->getOperand(0);
+  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
+      !(Op0->isMachineOpcode() &&
+        Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
+    return SDValue();
+  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
+  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
+      return SDValue();
+  } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
+    if (idx != AArch64::dsub)
+      return SDValue();
+    // The dsub reference is equivalent to a lane zero subvector reference.
+    idx = 0;
+  }
+
+  // Look through the bitcast of the input to the extract.
+  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
+    return SDValue();
+  SDValue Source = Op0->getOperand(0)->getOperand(0);
+
+  // The bitcast source may be a scalar; asking a non-vector type for its
+  // element count is invalid, so reject that case explicitly.
+  EVT SVT = Source->getValueType(0);
+  if (!SVT.isVector())
+    return SDValue();
+  // If the source type has twice the number of elements as our destination
+  // type, we know this is an extract of the high or low half of the vector.
+  if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
+    return SDValue();
+
+  DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
+
+  // Create the simplified form to just extract the low or high half of the
+  // vector directly rather than bothering with the bitcasts.
+  SDLoc dl(N);
+  unsigned NumElements = VT.getVectorNumElements();
+  if (idx) {
+    SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
+  }
+
+  SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
+  return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
+                                    Source, SubReg),
+                 0);
+}
+
+/// Combines for ISD::CONCAT_VECTORS, tried in this order:
+///  1. A concat of two truncates through an illegal intermediate type is
+///     rewritten as a truncate of a shuffle of the bitcast sources.
+///  2. After legalization, (concat_vectors A, A) of two 64-bit values is
+///     canonicalised to DUPLANE64.
+///  3. After legalization, a bitcast on the right-hand operand is hoisted
+///     above the concat so the right-hand vector's real operation is visible.
+static SDValue performConcatVectorsCombine(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           SelectionDAG &DAG) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Lo = N->getOperand(0);
+  SDValue Hi = N->getOperand(1);
+
+  // Optimize concat_vectors of truncated vectors, where the intermediate
+  // type is illegal, to avoid said illegality, e.g.,
+  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
+  //                          (v2i16 (truncate (v2i64)))))
+  // ->
+  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
+  //                                    (v4i32 (bitcast (v2i64))),
+  //                                    <0, 2, 4, 6>)))
+  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
+  // on both input and result type, so we might generate worse code.
+  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
+  const bool BothTruncates = N->getNumOperands() == 2 &&
+                             Lo->getOpcode() == ISD::TRUNCATE &&
+                             Hi->getOpcode() == ISD::TRUNCATE;
+  if (BothTruncates) {
+    SDValue LoSrc = Lo->getOperand(0);
+    SDValue HiSrc = Hi->getOperand(0);
+    EVT SrcVT = LoSrc.getValueType();
+
+    if (SrcVT == HiSrc.getValueType() &&
+        (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i32) &&
+        SrcVT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
+      MVT MidVT = (SrcVT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
+
+      // Select the even-numbered lanes of the two bitcast sources.
+      SmallVector<int, 8> Mask;
+      for (unsigned i = 0, e = MidVT.getVectorNumElements(); i != e; ++i)
+        Mask.push_back(i * 2);
+
+      return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                         DAG.getVectorShuffle(
+                             MidVT, dl,
+                             DAG.getNode(ISD::BITCAST, dl, MidVT, LoSrc),
+                             DAG.getNode(ISD::BITCAST, dl, MidVT, HiSrc),
+                             Mask));
+    }
+  }
+
+  // The remaining combines wait until everything is legalized so that only
+  // legal vector types are seen.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // (concat_vectors (v1x64 A), (v1x64 A)) is really a vector splat. The
+  // indexed instructions expect a DUPLANE64, so canonicalise to that.
+  if (Lo == Hi && VT.getVectorNumElements() == 2) {
+    assert(VT.getScalarSizeInBits() == 64);
+    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(Lo, DAG),
+                       DAG.getConstant(0, dl, MVT::i64));
+  }
+
+  // Canonicalise concat_vectors so that the right-hand vector has as few
+  // bit-casts as possible before its real operation. The primary matching
+  // destination for these operations will be the narrowing "2" instructions,
+  // which depend on the operation being performed on this right-hand vector.
+  // For example,
+  //    (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
+  // becomes
+  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
+  if (Hi->getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  SDValue HiInner = Hi->getOperand(0);
+  MVT InnerTy = HiInner.getValueType().getSimpleVT();
+  // A non-vector bitcast source is not the pattern being looked for.
+  if (!InnerTy.isVector())
+    return SDValue();
+
+  DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
+
+  MVT WideTy = MVT::getVectorVT(InnerTy.getVectorElementType(),
+                                InnerTy.getVectorNumElements() * 2);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, dl, WideTy,
+                                 DAG.getNode(ISD::BITCAST, dl, InnerTy, Lo),
+                                 HiInner));
+}
+
+/// Transform a scalar fixed-point conversion of a lane extract into a lane
+/// extract of the vector conversion. E.g., from foo1 to foo2:
+///   double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
+///   double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
+///
+/// The second form interacts better with instruction selection and the
+/// register allocator to avoid cross-class register copies that aren't
+/// coalescable due to a lane reference.
+///
+/// \p N is expected to be an intrinsic node with operands
+/// (intrinsic-id, value, shift). Returns an empty SDValue when the value is
+/// not a lane extract from a 128-bit v4i32 / v2i64 vector.
+static SDValue tryCombineFixedPointConvert(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Check the operand and see if it originates from a lane extract.
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  SDValue IID = N->getOperand(0);
+  SDValue Shift = N->getOperand(2);
+  SDValue Vec = Op1.getOperand(0);
+  SDValue Lane = Op1.getOperand(1);
+  EVT ResTy = N->getValueType(0);
+  SDLoc DL(N);
+
+  // The vector width should be 128 bits by the time we get here, even if it
+  // started as 64 bits (the extract_vector handling will have done so). If it
+  // is not, decline the combine rather than aborting compilation.
+  if (Vec.getValueSizeInBits() != 128)
+    return SDValue();
+
+  // Only the two integer vector shapes with a matching FP vector type are
+  // handled; anything else is left untouched.
+  EVT VecResTy;
+  if (Vec.getValueType() == MVT::v4i32)
+    VecResTy = MVT::v4f32;
+  else if (Vec.getValueType() == MVT::v2i64)
+    VecResTy = MVT::v2f64;
+  else
+    return SDValue();
+
+  SDValue Convert =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
+}
+
+// AArch64 high-vector "long" operations are formed by applying the non-high
+// version to an extract_subvector of each operand that selects the high half:
+//
+//   (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
+//
+// Some operands have no explicit extract_high but can be given one for free,
+// for example:
+//
+//   (dupv64 scalar) --> (extract_high (dup128 scalar))
+//
+// This routine performs that conversion for a 64-bit DUP-like node \p N (DUP,
+// DUPLANE*, and the immediate forms MOVI*/MVNI*), once outer routines have
+// determined that everything else is in order. Any other node, or a node that
+// is not a 64-bit vector, yields an empty SDValue.
+static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+  switch (N.getOpcode()) {
+  default:
+    // FMOV could be supported, but isn't very useful, as it would only occur
+    // if you passed a bitcast floating point immediate to an eligible long
+    // integer op (addl, smull, ...).
+    return SDValue();
+  case AArch64ISD::DUP:
+  case AArch64ISD::DUPLANE8:
+  case AArch64ISD::DUPLANE16:
+  case AArch64ISD::DUPLANE32:
+  case AArch64ISD::DUPLANE64:
+  case AArch64ISD::MOVI:
+  case AArch64ISD::MOVIshift:
+  case AArch64ISD::MOVIedit:
+  case AArch64ISD::MOVImsl:
+  case AArch64ISD::MVNIshift:
+  case AArch64ISD::MVNImsl:
+    break;
+  }
+
+  MVT HalfTy = N.getSimpleValueType();
+  if (!HalfTy.is64BitVector())
+    return SDValue();
+
+  // Same element type, twice as many lanes.
+  unsigned HalfLanes = HalfTy.getVectorNumElements();
+  MVT FullTy = MVT::getVectorVT(HalfTy.getVectorElementType(), HalfLanes * 2);
+
+  // Re-create the node on the wide type and take its upper half.
+  SDLoc dl(N);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfTy,
+                     DAG.getNode(N->getOpcode(), dl, FullTy, N->ops()),
+                     DAG.getConstant(HalfLanes, dl, MVT::i64));
+}
+
+/// Return true if \p N is an EXTRACT_SUBVECTOR, or a BITCAST whose operand is
+/// an EXTRACT_SUBVECTOR.
+static bool isEssentiallyExtractSubvector(SDValue N) {
+  // Look through at most one bitcast.
+  SDValue Inner = N.getOpcode() == ISD::BITCAST ? N.getOperand(0) : N;
+  return Inner.getOpcode() == ISD::EXTRACT_SUBVECTOR;
+}
+
+/// \brief Helper structure to keep track of the operands of a generic ISD::SETCC node.
+struct GenericSetCCInfo {
+ const SDValue *Opnd0; // Points at operand 0 of the SETCC (left-hand side).
+ const SDValue *Opnd1; // Points at operand 1 of the SETCC (right-hand side).
+ ISD::CondCode CC; // Condition code taken from operand 2 of the SETCC.
+};
+
+/// \brief Helper structure to keep track of a SETCC that was already lowered into an AArch64 CSEL.
+struct AArch64SetCCInfo {
+ const SDValue *Cmp; // Points at operand 3 of the CSEL, i.e. the flag-producing compare.
+ AArch64CC::CondCode CC; // Condition under which the CSEL yields 1 (inverted by isSetCC if needed).
+};
+
+/// \brief Storage for SetCC information in either its generic or its AArch64-lowered form.
+union SetCCInfo {
+ GenericSetCCInfo Generic; // Valid when the SetCC is a generic ISD::SETCC.
+ AArch64SetCCInfo AArch64; // Valid when the SetCC is an AArch64ISD::CSEL.
+};
+
+/// \brief Helper structure that tags SetCC information with its kind so it can
+/// be read back safely. When IsAArch64 is true, Info holds an AArch64SetCCInfo;
+/// otherwise Info holds a GenericSetCCInfo.
+struct SetCCInfoAndKind {
+ SetCCInfo Info; // The payload; which member is active is given by IsAArch64.
+ bool IsAArch64; // Discriminator for Info.
+};
+
+/// \brief Check whether \p Op is a SETCC operation, either a generic
+/// ISD::SETCC or an AArch64-lowered one, i.e. a CSEL selecting between the
+/// constants 1 and 0:
+///   - csel 1, 0, cc
+///   - csel 0, 1, !cc
+/// \p SetCCInfo is filled accordingly.
+/// \post SetCCInfo is meaningful only when this function returns true.
+/// \return True when Op is a kind of SETCC operation.
+static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
+  const unsigned Opc = Op.getOpcode();
+
+  // A generic setcc is straightforward: record its operands and condition.
+  if (Opc == ISD::SETCC) {
+    GenericSetCCInfo &Generic = SetCCInfo.Info.Generic;
+    Generic.Opnd0 = &Op.getOperand(0);
+    Generic.Opnd1 = &Op.getOperand(1);
+    Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    SetCCInfo.IsAArch64 = false;
+    return true;
+  }
+
+  // Otherwise only a matching csel qualifies.
+  if (Opc != AArch64ISD::CSEL)
+    return false;
+
+  // Record the compare and the condition code.
+  // TODO: we want the operands of the Cmp not the csel
+  AArch64SetCCInfo &Lowered = SetCCInfo.Info.AArch64;
+  Lowered.Cmp = &Op.getOperand(3);
+  SetCCInfo.IsAArch64 = true;
+  Lowered.CC = static_cast<AArch64CC::CondCode>(
+      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // Both selected values must be constants ...
+  ConstantSDNode *OnTrue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+  ConstantSDNode *OnFalse = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!OnTrue || !OnFalse)
+    return false;
+
+  // ... and one must be 1 while the other is 0. If the 1 is not on the true
+  // side, look at the csel the other way round under the inverted condition.
+  if (!OnTrue->isOne()) {
+    std::swap(OnTrue, OnFalse);
+    Lowered.CC = AArch64CC::getInvertedCondCode(Lowered.CC);
+  }
+  return OnTrue->isOne() && OnFalse->isNullValue();
+}
+
+// Returns true if Op is a setcc (as recognised by isSetCC) or a zero-extend
+// of one; Info is filled by isSetCC.
+static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
+  if (isSetCC(Op, Info))
+    return true;
+  if (Op.getOpcode() != ISD::ZERO_EXTEND)
+    return false;
+  return isSetCC(Op->getOperand(0), Info);
+}
+
+// Fold an add of a (possibly zero-extended) setcc into a conditional select:
+//   (add x, [zext] (setcc cc ...) )
+//     -->
+//   (csel x, (add x, 1), !cc ...)
+//
+// The resulting CSEL will get matched to a CSINC instruction. Only integer
+// (i32/i64) comparisons are handled.
+static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
+  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
+
+  // Find which operand is the setcc; the other one is the value added to.
+  SDValue CondSide = Op->getOperand(0);
+  SDValue Addend = Op->getOperand(1);
+  SetCCInfoAndKind Info;
+  if (!isSetCCOrZExtSetCC(CondSide, Info)) {
+    std::swap(CondSide, Addend);
+    if (!isSetCCOrZExtSetCC(CondSide, Info))
+      return SDValue();
+  }
+
+  // FIXME: This could be generalized to work for FP comparisons.
+  EVT CmpVT;
+  if (Info.IsAArch64)
+    CmpVT = Info.Info.AArch64.Cmp->getOperand(0).getValueType();
+  else
+    CmpVT = Info.Info.Generic.Opnd0->getValueType();
+  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
+    return SDValue();
+
+  // Obtain the flag-setting compare together with the *inverted* condition.
+  SDLoc dl(Op);
+  SDValue CCVal;
+  SDValue Cmp;
+  if (Info.IsAArch64) {
+    AArch64CC::CondCode Inverted =
+        AArch64CC::getInvertedCondCode(Info.Info.AArch64.CC);
+    CCVal = DAG.getConstant(Inverted, dl, MVT::i32);
+    Cmp = *Info.Info.AArch64.Cmp;
+  } else {
+    ISD::CondCode Inverted =
+        ISD::getSetCCInverse(Info.Info.Generic.CC, true);
+    Cmp = getAArch64Cmp(*Info.Info.Generic.Opnd0, *Info.Info.Generic.Opnd1,
+                        Inverted, CCVal, DAG, dl);
+  }
+
+  // csel picks x when !cc holds and x + 1 otherwise.
+  EVT VT = Op->getValueType(0);
+  SDValue PlusOne =
+      DAG.getNode(ISD::ADD, dl, VT, Addend, DAG.getConstant(1, dl, VT));
+  return DAG.getNode(AArch64ISD::CSEL, dl, VT, Addend, PlusOne, CCVal, Cmp);
+}
+
+// The basic add/sub long vector instructions have variants with "2" on the end
+// which act on the high-half of their inputs. They are normally matched by
+// patterns like:
+//
+//   (add (zeroext (extract_high LHS)),
+//        (zeroext (extract_high RHS)))
+//   -> uaddl2 vD, vN, vM
+//
+// However, if one of the extracts is something like a duplicate, this
+// instruction can still be used profitably. This function puts the DAG into a
+// more appropriate form for those patterns to trigger.
+//
+// For non-128-bit results an ADD is instead offered to
+// performSetccAddFolding. An empty SDValue is returned whenever nothing was
+// changed.
+static SDValue performAddSubLongCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  if (!VT.is128BitVector()) {
+    if (N->getOpcode() == ISD::ADD)
+      return performSetccAddFolding(N, DAG);
+    return SDValue();
+  }
+
+  // Make sure both branches are extended in the same way.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
+       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
+      LHS.getOpcode() != RHS.getOpcode())
+    return SDValue();
+
+  unsigned ExtType = LHS.getOpcode();
+
+  // It's not worth doing if at least one of the inputs isn't already an
+  // extract, but we don't know which it'll be so we have to try both.
+  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
+    if (!RHS.getNode())
+      return SDValue();
+
+    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
+  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
+    if (!LHS.getNode())
+      return SDValue();
+
+    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
+  } else {
+    // Neither input is an extract: nothing was rewritten, so report "no
+    // change" instead of rebuilding a node identical to N.
+    return SDValue();
+  }
+
+  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
+}
+
+// Massage DAGs which we can use the high-half "long" operations on into
+// something isel will recognize better. E.g.
+//
+// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
+// (aarch64_neon_umull (extract_high (v2i64 vec)))
+// (extract_high (v2i64 (dup128 scalar)))))
+//
+static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) { // NOTE(review): IID is not read anywhere in this body; the rebuilt node reuses N's operand 0 instead.
+ if (DCI.isBeforeLegalizeOps()) // Nothing is attempted before operation legalization.
+ return SDValue();
+
+ SDValue LHS = N->getOperand(1); // Operands 1 and 2 are the two vector inputs; operand 0 is passed through unchanged below.
+ SDValue RHS = N->getOperand(2);
+ assert(LHS.getValueType().is64BitVector() &&
+ RHS.getValueType().is64BitVector() &&
+ "unexpected shape for long operation");
+
+ // Either node could be a DUP, but it's not worth doing both of them (you'd
+ // just as well use the non-high version) so look for a corresponding extract
+ // operation on the other "wing".
+ if (isEssentiallyExtractSubvector(LHS)) {
+ RHS = tryExtendDUPToExtractHigh(RHS, DAG);
+ if (!RHS.getNode()) // Helper declined: no combine.
+ return SDValue();
+ } else if (isEssentiallyExtractSubvector(RHS)) {
+ LHS = tryExtendDUPToExtractHigh(LHS, DAG);
+ if (!LHS.getNode())
+ return SDValue();
+ } // NOTE(review): when neither operand is an extract, the node is rebuilt below with the original operands - confirm this is intended.
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), LHS, RHS);
+}
+
+static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { // Turn a shift intrinsic whose amount (operand 2) is a constant or constant splat into the matching *_I node; returns an empty SDValue otherwise.
+ MVT ElemTy = N->getSimpleValueType(0).getScalarType();
+ unsigned ElemBits = ElemTy.getSizeInBits();
+
+ int64_t ShiftAmount;
+ if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { // Vector amount: must be a constant splat exactly one element wide.
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs, ElemBits) ||
+ SplatBitSize != ElemBits)
+ return SDValue();
+
+ ShiftAmount = SplatValue.getSExtValue();
+ } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { // Scalar constant amount.
+ ShiftAmount = CVN->getSExtValue();
+ } else
+ return SDValue();
+
+ unsigned Opcode;
+ bool IsRightShift;
+ switch (IID) { // Map each intrinsic to its immediate-form opcode and record the shift direction of that opcode.
+ default:
+ llvm_unreachable("Unknown shift intrinsic");
+ case Intrinsic::aarch64_neon_sqshl:
+ Opcode = AArch64ISD::SQSHL_I;
+ IsRightShift = false;
+ break;
+ case Intrinsic::aarch64_neon_uqshl:
+ Opcode = AArch64ISD::UQSHL_I;
+ IsRightShift = false;
+ break;
+ case Intrinsic::aarch64_neon_srshl: // The *rshl intrinsics are mapped to right-shift (SRSHR/URSHR) immediate nodes.
+ Opcode = AArch64ISD::SRSHR_I;
+ IsRightShift = true;
+ break;
+ case Intrinsic::aarch64_neon_urshl:
+ Opcode = AArch64ISD::URSHR_I;
+ IsRightShift = true;
+ break;
+ case Intrinsic::aarch64_neon_sqshlu:
+ Opcode = AArch64ISD::SQSHLU_I;
+ IsRightShift = false;
+ break;
+ }
+
+ if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { // Right-shift forms accept amounts in [-ElemBits, -1]; the immediate is the negated amount.
+ SDLoc dl(N);
+ return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
+ DAG.getConstant(-ShiftAmount, dl, MVT::i32));
+ } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { // Left-shift forms accept amounts in [0, ElemBits).
+ SDLoc dl(N);
+ return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
+ DAG.getConstant(ShiftAmount, dl, MVT::i32));
+ }
+
+ return SDValue(); // Amount out of range for the chosen direction: leave the intrinsic alone.
+}
+
+// The CRC32[BH] instructions ignore the high bits of their data operand. Since
+// the intrinsics must be legal and take an i32, this means there's almost
+// certainly going to be a zext in the DAG which we can eliminate.
+static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { // Mask is the constant the AND must use (callers in this file pass 0xff or 0xffff).
+ SDValue AndN = N->getOperand(2); // Operand 2 is inspected for an (and X, Mask) pattern.
+ if (AndN.getOpcode() != ISD::AND)
+ return SDValue();
+
+ ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); // Only a constant in the AND's second operand is recognized.
+ if (!CMask || CMask->getZExtValue() != Mask)
+ return SDValue();
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
+ N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); // Same intrinsic, with the AND's first operand used directly.
+}
+
+static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
+ SelectionDAG &DAG) { // Replace an across-lanes intrinsic by node Opc applied to operand 1, followed by an extract of element 0.
+ SDLoc dl(N);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
+ DAG.getNode(Opc, dl,
+ N->getOperand(1).getSimpleValueType(), // The Opc node keeps the input vector's type.
+ N->getOperand(1)),
+ DAG.getConstant(0, dl, MVT::i64));
+}
+
+static SDValue performIntrinsicCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) { // Dispatch on N's intrinsic ID to the matching combine helper. NOTE(review): Subtarget is not used in this body.
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned IID = getIntrinsicID(N);
+ switch (IID) {
+ default: // Any other intrinsic is left untouched (falls out to the empty SDValue below).
+ break;
+ case Intrinsic::aarch64_neon_vcvtfxs2fp:
+ case Intrinsic::aarch64_neon_vcvtfxu2fp:
+ return tryCombineFixedPointConvert(N, DCI, DAG);
+ case Intrinsic::aarch64_neon_saddv: // Across-lane reductions become the AArch64ISD node plus an extract of element 0.
+ return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
+ case Intrinsic::aarch64_neon_uaddv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
+ case Intrinsic::aarch64_neon_sminv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
+ case Intrinsic::aarch64_neon_uminv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
+ case Intrinsic::aarch64_neon_smaxv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
+ case Intrinsic::aarch64_neon_umaxv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
+ case Intrinsic::aarch64_neon_fmax: // FP min/max intrinsics are rewritten as generic ISD nodes on operands 1 and 2.
+ return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fmin:
+ return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fmaxnm:
+ return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fminnm:
+ return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_smull:
+ case Intrinsic::aarch64_neon_umull:
+ case Intrinsic::aarch64_neon_pmull:
+ case Intrinsic::aarch64_neon_sqdmull:
+ return tryCombineLongOpWithDup(IID, N, DCI, DAG);
+ case Intrinsic::aarch64_neon_sqshl:
+ case Intrinsic::aarch64_neon_uqshl:
+ case Intrinsic::aarch64_neon_sqshlu:
+ case Intrinsic::aarch64_neon_srshl:
+ case Intrinsic::aarch64_neon_urshl:
+ return tryCombineShiftImm(IID, N, DAG);
+ case Intrinsic::aarch64_crc32b: // Byte variants: an AND with 0xff on the data operand can be dropped.
+ case Intrinsic::aarch64_crc32cb:
+ return tryCombineCRC32(0xff, N, DAG);
+ case Intrinsic::aarch64_crc32h: // Halfword variants: same with 0xffff.
+ case Intrinsic::aarch64_crc32ch:
+ return tryCombineCRC32(0xffff, N, DAG);
+ }
+ return SDValue();
+}
+
+static SDValue performExtendCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) { // Two independent combines on an extend node: an abd+DUP rewrite (not before legalize-ops) and a two-step widening of 64-bit vector sources (before legalize-ops).
+ // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
+ // we can convert that DUP into another extract_high (of a bigger DUP), which
+ // helps the backend to decide that an sabdl2 would be useful, saving a real
+ // extract_high operation.
+ if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
+ N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+ SDNode *ABDNode = N->getOperand(0).getNode();
+ unsigned IID = getIntrinsicID(ABDNode);
+ if (IID == Intrinsic::aarch64_neon_sabd ||
+ IID == Intrinsic::aarch64_neon_uabd) {
+ SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
+ if (!NewABD.getNode())
+ return SDValue();
+
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
+ NewABD); // Re-wrap the rewritten abd in the original zero-extend.
+ }
+ }
+
+ // This is effectively a custom type legalization for AArch64.
+ //
+ // Type legalization will split an extend of a small, legal, type to a larger
+ // illegal type by first splitting the destination type, often creating
+ // illegal source types, which then get legalized in isel-confusing ways,
+ // leading to really terrible codegen. E.g.,
+ // %result = v8i32 sext v8i8 %value
+ // becomes
+ // %losrc = extract_subreg %value, ...
+ // %hisrc = extract_subreg %value, ...
+ // %lo = v4i32 sext v4i8 %losrc
+ // %hi = v4i32 sext v4i8 %hisrc
+ // Things go rapidly downhill from there.
+ //
+ // For AArch64, the [sz]ext vector instructions can only go up one element
+ // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
+ // takes two instructions.
+ //
+ // This implies that the most efficient way to do the extend from v8i8
+ // to two v4i32 values is to first extend the v8i8 to v8i16, then let
+ // the normal splitting happen for the v8i16->v8i32.
+
+ // This is pre-legalization to catch some cases where the default
+ // type legalization will create ill-tempered code.
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // We're only interested in cleaning things up for non-legal vector types
+ // here. If both the source and destination are legal, things will just
+ // work naturally without any fiddling.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT ResVT = N->getValueType(0);
+ if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
+ return SDValue();
+ // If the vector type isn't a simple VT, it's beyond the scope of what
+ // we're worried about here. Let legalization do its thing and hope for
+ // the best.
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src->getValueType(0);
+ if (!ResVT.isSimple() || !SrcVT.isSimple())
+ return SDValue();
+
+ // If the source VT is a 64-bit vector, we can play games and get the
+ // better results we want.
+ if (SrcVT.getSizeInBits() != 64)
+ return SDValue();
+
+ unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
+ unsigned ElementCount = SrcVT.getVectorNumElements();
+ SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); // Same element count, elements twice as wide.
+ SDLoc DL(N);
+ Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); // First extension step, using N's own extend opcode.
+
+ // Now split the rest of the operation into two halves, each with a 64
+ // bit source.
+ EVT LoVT, HiVT;
+ SDValue Lo, Hi;
+ unsigned NumElements = ResVT.getVectorNumElements();
+ assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+ LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
+ ResVT.getVectorElementType(), NumElements / 2);
+
+ EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
+ LoVT.getVectorNumElements()); // Type of each half of the once-extended source.
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+ DAG.getConstant(0, DL, MVT::i64));
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+ DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); // The high half starts at element index = half the element count.
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); // Extend each half to the final element type.
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
+
+ // Now combine the parts back together so we still have a single result
+ // like the combiner expects.
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+}
+
+static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
+ SDValue SplatVal, unsigned NumVecElts) { // Emit NumVecElts chained scalar stores of SplatVal starting at St's base pointer; returns the last store created.
+ unsigned OrigAlignment = St.getAlignment();
+ unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; // Byte stride between consecutive scalar stores.
+
+ // Create scalar stores. This is at least as good as the code sequence for a
+ // split unaligned store which is a dup.s, ext.b, and two stores.
+ // Most of the time the three stores should be replaced by store pair
+ // instructions (stp).
+ SDLoc DL(&St);
+ SDValue BasePtr = St.getBasePtr();
+ SDValue NewST1 =
+ DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, St.getPointerInfo(),
+ OrigAlignment, St.getMemOperand()->getFlags()); // First store: offset 0, original alignment and flags.
+
+ unsigned Offset = EltOffset;
+ while (--NumVecElts) { // Remaining NumVecElts-1 stores. Assumes NumVecElts >= 1: the parameter is unsigned, so a pre-decrement of 0 would wrap.
+ unsigned Alignment = MinAlign(OrigAlignment, Offset);
+ SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(Offset, DL, MVT::i64));
+ NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, // Chained on the previous store.
+ St.getPointerInfo(), Alignment, // NOTE(review): the pointer info is passed unchanged although the address advances by Offset - confirm whether it should carry the offset.
+ St.getMemOperand()->getFlags());
+ Offset += EltOffset;
+ }
+ return NewST1;
+}
+
+/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
+/// load store optimizer pass will merge them to store pair stores. This should
+/// be better than a movi to create the vector zero followed by a vector store
+/// if the zero constant is not re-used, since one instruction and one register
+/// live range will be removed.
+///
+/// For example, the final generated code should be:
+///
+/// stp xzr, xzr, [x0]
+///
+/// instead of:
+///
+/// movi v0.2d, #0
+/// str q0, [x0]
+///
+static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { // Returns the replacement store chain, or an empty SDValue when the store does not qualify.
+ SDValue StVal = St.getValue();
+ EVT VT = StVal.getValueType();
+
+ // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
+ // 2, 3 or 4 i32 elements.
+ int NumVecElts = VT.getVectorNumElements();
+ if (!(((NumVecElts == 2 || NumVecElts == 3) &&
+ VT.getVectorElementType().getSizeInBits() == 64) ||
+ ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
+ VT.getVectorElementType().getSizeInBits() == 32)))
+ return SDValue();
+
+ if (StVal.getOpcode() != ISD::BUILD_VECTOR) // Only an explicit BUILD_VECTOR is inspected for zeros.
+ return SDValue();
+
+ // If the zero constant has more than one use then the vector store could be
+ // better since the constant mov will be amortized and stp q instructions
+ // should be able to be formed.
+ if (!StVal.hasOneUse())
+ return SDValue();
+
+ // If the immediate offset of the address operand is too large for the stp
+ // instruction, then bail out.
+ if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
+ int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
+ if (Offset < -512 || Offset > 504) // Accepted byte-offset window is [-512, 504].
+ return SDValue();
+ }
+
+ for (int I = 0; I < NumVecElts; ++I) { // Every element must be an integer or FP zero constant.
+ SDValue EltVal = StVal.getOperand(I);
+ if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
+ return SDValue();
+ }
+
+ // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
+ // undoing this transformation.
+ SDValue SplatVal = VT.getVectorElementType().getSizeInBits() == 32
+ ? DAG.getRegister(AArch64::WZR, MVT::i32)
+ : DAG.getRegister(AArch64::XZR, MVT::i64);
+ return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
+}
+
+/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
+/// value. The load store optimizer pass will merge them to store pair stores.
+/// This has better performance than a splat of the scalar followed by a split
+/// vector store. Even if the stores are not merged it is four stores vs a dup,
+/// followed by an ext.b and two stores.
+static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { // Returns the replacement store chain, or an empty SDValue when the stored value is not a recognized splat.
+ SDValue StVal = St.getValue();
+ EVT VT = StVal.getValueType();
+
+ // Don't replace floating point stores, they possibly won't be transformed to
+ // stp because of the store pair suppress pass.
+ if (VT.isFloatingPoint())
+ return SDValue();
+
+ // We can express a splat as store pair(s) for 2 or 4 elements.
+ unsigned NumVecElts = VT.getVectorNumElements();
+ if (NumVecElts != 4 && NumVecElts != 2)
+ return SDValue();
+
+ // Check that this is a splat.
+ // Make sure that each of the relevant vector element locations are inserted
+ // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
+ std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); // One bit per element index; a bit is cleared once that index is seen.
+ SDValue SplatVal;
+ for (unsigned I = 0; I < NumVecElts; ++I) { // Walk a chain of exactly NumVecElts INSERT_VECTOR_ELT nodes, outermost first.
+ // Check for insert vector elements.
+ if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ return SDValue();
+
+ // Check that same value is inserted at each vector element.
+ if (I == 0)
+ SplatVal = StVal.getOperand(1);
+ else if (StVal.getOperand(1) != SplatVal)
+ return SDValue();
+
+ // Check insert element index.
+ ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
+ if (!CIndex)
+ return SDValue();
+ uint64_t IndexVal = CIndex->getZExtValue();
+ if (IndexVal >= NumVecElts) // Also keeps the bitset access below in range.
+ return SDValue();
+ IndexNotInserted.reset(IndexVal);
+
+ StVal = StVal.getOperand(0); // Step to the vector this insert was applied to.
+ }
+ // Check that all vector element locations were inserted to.
+ if (IndexNotInserted.any())
+ return SDValue();
+
+ return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
+}
+
+static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) { // Pre-legalization store combine: scalarize zero/splat vector stores, or split a misaligned 128-bit vector store in two. Returns an empty SDValue when nothing applies.
+ if (!DCI.isBeforeLegalize()) // Only attempted before legalization.
+ return SDValue();
+
+ StoreSDNode *S = cast<StoreSDNode>(N);
+ if (S->isVolatile()) // Volatile stores are never rewritten.
+ return SDValue();
+
+ SDValue StVal = S->getValue();
+ EVT VT = StVal.getValueType();
+ if (!VT.isVector())
+ return SDValue();
+
+ // If we get a splat of zeros, convert this vector store to a store of
+ // scalars. They will be merged into store pairs of xzr thereby removing one
+ // instruction and one register.
+ if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
+ return ReplacedZeroSplat;
+
+ // FIXME: The logic for deciding if an unaligned store should be split should
+ // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
+ // a call to that function here.
+
+ if (!Subtarget->isMisaligned128StoreSlow()) // Everything below applies only to subtargets that report slow misaligned 128-bit stores.
+ return SDValue();
+
+ // Don't split at -Oz.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
+ // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
+ // those up regresses performance on micro-benchmarks and olden/bh.
+ if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+ return SDValue();
+
+ // Split unaligned 16B stores. They are terrible for performance.
+ // Don't split stores with alignment of 1 or 2. Code that uses clang vector
+ // extensions can use this to mark that it does not want splitting to happen
+ // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
+ // eliminating alignment hazards is only 1 in 8 for alignment of 2.
+ if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
+ S->getAlignment() <= 2)
+ return SDValue();
+
+ // If we get a splat of a scalar convert this vector store to a store of
+ // scalars. They will be merged into store pairs thereby removing two
+ // instructions.
+ if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
+ return ReplacedSplat;
+
+ SDLoc DL(S);
+ unsigned NumElts = VT.getVectorNumElements() / 2;
+ // Split VT into two.
+ EVT HalfVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
+ SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+ DAG.getConstant(0, DL, MVT::i64));
+ SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+ DAG.getConstant(NumElts, DL, MVT::i64));
+ SDValue BasePtr = S->getBasePtr();
+ SDValue NewST1 =
+ DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
+ S->getAlignment(), S->getMemOperand()->getFlags());
+ SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(8, DL, MVT::i64)); // 8 bytes = half of the 128-bit value checked above.
+ return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, // Second half is chained on the first store.
+ S->getPointerInfo(), S->getAlignment(), // NOTE(review): pointer info is reused without the 8-byte offset - confirm.
+ S->getMemOperand()->getFlags());
+}
+
+/// Target-specific DAG combine function for post-increment LD1 (lane) and
+/// post-increment LD1R.
+static SDValue performPostLD1Combine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ bool IsLaneOp) { // Always returns an empty SDValue; when the pattern matches, the replacements are made through DCI.CombineTo.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ unsigned LoadIdx = IsLaneOp ? 1 : 0; // The candidate load is operand 1 for the lane form and operand 0 otherwise.
+ SDNode *LD = N->getOperand(LoadIdx).getNode();
+ // If it is not LOAD, can not do such combine.
+ if (LD->getOpcode() != ISD::LOAD)
+ return SDValue();
+
+ LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
+ EVT MemVT = LoadSDN->getMemoryVT();
+ // Check if memory operand is the same type as the vector element.
+ if (MemVT != VT.getVectorElementType())
+ return SDValue();
+
+ // Check if there are other uses. If so, do not combine as it will introduce
+ // an extra load.
+ for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
+ ++UI) {
+ if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
+ continue;
+ if (*UI != N)
+ return SDValue();
+ }
+
+ SDValue Addr = LD->getOperand(1);
+ SDValue Vector = N->getOperand(0);
+ // Search for a use of the address operand that is an increment.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
+ Addr.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::ADD
+ || UI.getUse().getResNo() != Addr.getResNo())
+ continue;
+
+ // Check that the add is independent of the load. Otherwise, folding it
+ // would create a cycle.
+ if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
+ continue;
+ // Also check that add is not used in the vector operand. This would also
+ // create a cycle.
+ if (User->isPredecessorOf(Vector.getNode()))
+ continue;
+
+ // If the increment is a constant, it must match the memory ref size.
+ SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); // The ADD operand that is not the address.
+ if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+ uint32_t IncVal = CInc->getZExtValue(); // NOTE(review): the 64-bit constant is narrowed to 32 bits before the comparison - confirm large increments cannot alias NumBytes.
+ unsigned NumBytes = VT.getScalarSizeInBits() / 8;
+ if (IncVal != NumBytes)
+ continue;
+ Inc = DAG.getRegister(AArch64::XZR, MVT::i64); // A matching constant is replaced by XZR; presumably this marks the immediate post-index form - TODO confirm against isel.
+ }
+
+ // Finally, check that the vector doesn't depend on the load.
+ // Again, this would create a cycle.
+ // The load depending on the vector is fine, as that's the case for the
+ // LD1*post we'll eventually generate anyway.
+ if (LoadSDN->isPredecessorOf(Vector.getNode()))
+ continue;
+
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(LD->getOperand(0)); // Chain
+ if (IsLaneOp) {
+ Ops.push_back(Vector); // The vector to be inserted
+ Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
+ }
+ Ops.push_back(Addr);
+ Ops.push_back(Inc);
+
+ EVT Tys[3] = { VT, MVT::i64, MVT::Other }; // Results: vector value, updated address, chain.
+ SDVTList SDTys = DAG.getVTList(Tys);
+ unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
+ MemVT,
+ LoadSDN->getMemOperand());
+
+ // Update the uses.
+ SDValue NewResults[] = {
+ SDValue(LD, 0), // The result of load
+ SDValue(UpdN.getNode(), 2) // Chain
+ };
+ DCI.CombineTo(LD, NewResults);
+ DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
+
+ break; // Only the first suitable increment is folded.
+ }
+ return SDValue();
+}
+
+/// Simplify \p Addr given that the top byte of it is ignored by HW during
+/// address translation. Returns true if a simplification was committed.
+static bool performTBISimplification(SDValue Addr,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ APInt DemandedMask = APInt::getLowBitsSet(64, 56); // Only the low 56 of 64 address bits are demanded.
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+ DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO); // Apply the change recorded in TLO.
+ return true;
+ }
+ return false;
+}
+
+static SDValue performSTORECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) { // Store combine entry point: try store splitting first, then top-byte-ignore simplification of operand 2.
+ if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
+ return Split;
+
+ if (Subtarget->supportsAddressTopByteIgnored() &&
+ performTBISimplification(N->getOperand(2), DCI, DAG)) // Operand 2 is presumably the store's address - confirm against StoreSDNode operand order.
+ return SDValue(N, 0); // Returns N itself after the simplification was committed.
+
+ return SDValue();
+}
+
+/// This function handles the log2-shuffle pattern produced by the
+/// LoopVectorizer for the across vector reduction. It consists of
+/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
+/// are reduced, where s is an induction variable from 0 to
+/// log2(NumVectorElements).
+static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
+ unsigned Op,
+ SelectionDAG &DAG) { // OpV is the final step of the candidate chain, Op the reduction operator; returns the replacement for N or an empty SDValue on mismatch.
+ EVT VTy = OpV->getOperand(0).getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ int NumVecElts = VTy.getVectorNumElements();
+ if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { // FP min/max: only 4-element vectors are accepted.
+ if (NumVecElts != 4)
+ return SDValue();
+ } else { // Other operators: 4, 8 or 16 elements.
+ if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
+ return SDValue();
+ }
+
+ int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
+ SDValue PreOp = OpV;
+ // Iterate over each step of the across vector reduction.
+ for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { // Walks backwards from the final step towards the reduction's input vector.
+ SDValue CurOp = PreOp.getOperand(0);
+ SDValue Shuffle = PreOp.getOperand(1);
+ if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
+ // Try to swap the 1st and 2nd operand as add and min/max instructions
+ // are commutative.
+ CurOp = PreOp.getOperand(1);
+ Shuffle = PreOp.getOperand(0);
+ if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+ }
+
+ // Check if the input vector is fed by the operator we want to handle,
+ // except the last step; the very first input vector is not necessarily
+ // the same operator we are handling.
+ if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
+ return SDValue();
+
+ // Check if it forms one step of the across vector reduction.
+ // E.g.,
+ // %cur = add %1, %0
+ // %shuffle = vector_shuffle %cur, <2, 3, u, u>
+ // %pre = add %cur, %shuffle
+ if (Shuffle.getOperand(0) != CurOp)
+ return SDValue();
+
+ int NumMaskElts = 1 << CurStep;
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
+ // Check mask values in each step.
+ // We expect the shuffle mask in each step follows a specific pattern
+ // denoted here by the <M, U> form, where M is a sequence of integers
+ // starting from NumMaskElts, increasing by 1, and the number integers
+ // in M should be NumMaskElts. U is a sequence of UNDEFs and the number
+ // of undef in U should be NumVecElts - NumMaskElts.
+ // E.g., for <8 x i16>, mask values in each step should be :
+ // step 0 : <1,u,u,u,u,u,u,u>
+ // step 1 : <2,3,u,u,u,u,u,u>
+ // step 2 : <4,5,6,7,u,u,u,u>
+ for (int i = 0; i < NumVecElts; ++i)
+ if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
+ (i >= NumMaskElts && !(Mask[i] < 0))) // A negative mask entry denotes undef.
+ return SDValue();
+
+ PreOp = CurOp;
+ }
+ unsigned Opcode; // After the loop PreOp is the vector feeding the first reduction step.
+ bool IsIntrinsic = false;
+
+ switch (Op) {
+ default:
+ llvm_unreachable("Unexpected operator for across vector reduction");
+ case ISD::ADD:
+ Opcode = AArch64ISD::UADDV;
+ break;
+ case ISD::SMAX:
+ Opcode = AArch64ISD::SMAXV;
+ break;
+ case ISD::UMAX:
+ Opcode = AArch64ISD::UMAXV;
+ break;
+ case ISD::SMIN:
+ Opcode = AArch64ISD::SMINV;
+ break;
+ case ISD::UMIN:
+ Opcode = AArch64ISD::UMINV;
+ break;
+ case ISD::FMAXNUM: // The FP reductions are emitted as intrinsic calls rather than AArch64ISD nodes.
+ Opcode = Intrinsic::aarch64_neon_fmaxnmv;
+ IsIntrinsic = true;
+ break;
+ case ISD::FMINNUM:
+ Opcode = Intrinsic::aarch64_neon_fminnmv;
+ IsIntrinsic = true;
+ break;
+ }
+ SDLoc DL(N);
+
+ return IsIntrinsic
+ ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
+ DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
+ : DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+ DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
+ DAG.getConstant(0, DL, MVT::i64)); // The non-intrinsic form takes the result from lane 0 of the reduction node.
+}
+
+/// Target-specific DAG combine for the across vector min/max reductions.
+/// This function specifically handles the final clean-up step of the vector
+/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which narrows down and finds the final min/max value from all
+/// elements of the vector.
+/// For example, for a <16 x i8> vector :
+/// %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
+/// %smax0 = smax %0, %svn0
+/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %smax1 = smax %smax0, %svn1
+/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %smax2 = smax %smax1, %svn2
+/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %sc = setcc %smax2, %svn3, gt
+/// %n0 = extract_vector_elt %sc, #0
+/// %n1 = extract_vector_elt %smax2, #0
+/// %n2 = extract_vector_elt %smax2, #1
+/// %result = select %n0, %n1, %n2
+/// becomes :
+/// %1 = smaxv %0
+/// %result = extract_vector_elt %1, 0
+static SDValue
+performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) { // N is read as a select: operand 0 is the condition, operands 1/2 the true/false values.
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue IfTrue = N->getOperand(1);
+ SDValue IfFalse = N->getOperand(2);
+
+ // Check if the SELECT merges up the final result of the min/max
+ // from a vector.
+ if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // Expect N0 is fed by SETCC.
+ SDValue SetCC = N0.getOperand(0);
+ EVT SetCCVT = SetCC.getValueType();
+ if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
+ SetCCVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ SDValue VectorOp = SetCC.getOperand(0);
+ unsigned Op = VectorOp->getOpcode();
+ // Check if the input vector is fed by the operator we want to handle.
+ if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
+ Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
+ return SDValue();
+
+ EVT VTy = VectorOp.getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ if (VTy.getSizeInBits() < 64) // Vectors narrower than 64 bits are not handled.
+ return SDValue();
+
+ EVT EltTy = VTy.getVectorElementType();
+ if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { // FP reductions: f32 elements only.
+ if (EltTy != MVT::f32)
+ return SDValue();
+ } else { // Integer reductions: i8, i16 or i32 elements.
+ if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+ return SDValue();
+ }
+
+ // Check if extracting from the same vector.
+ // For example,
+ // %sc = setcc %vector, %svn1, gt
+ // %n0 = extract_vector_elt %sc, #0
+ // %n1 = extract_vector_elt %vector, #0
+ // %n2 = extract_vector_elt %vector, #1
+ if (!(VectorOp == IfTrue->getOperand(0) &&
+ VectorOp == IfFalse->getOperand(0)))
+ return SDValue();
+
+ // Check if the condition code is matched with the operator type.
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
+ (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
+ (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
+ (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
+ (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
+ CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
+ CC != ISD::SETGE) ||
+ (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
+ CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
+ CC != ISD::SETLE))
+ return SDValue();
+
+ // Expect to check only lane 0 from the vector SETCC.
+ if (!isNullConstant(N0.getOperand(1)))
+ return SDValue();
+
+ // Expect to extract the true value from lane 0.
+ if (!isNullConstant(IfTrue.getOperand(1)))
+ return SDValue();
+
+ // Expect to extract the false value from lane 1.
+ if (!isOneConstant(IfFalse.getOperand(1)))
+ return SDValue();
+
+ return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); // The SETCC itself is handed over as the last step of the shuffle chain.
+}
+
+/// Target-specific DAG combine for the across vector add reduction.
+/// This function specifically handles the final clean-up step of the vector
+/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which adds all elements of a vector together.
+/// For example, for a <4 x i32> vector :
+/// %1 = vector_shuffle %0, <2,3,u,u>
+/// %2 = add %0, %1
+/// %3 = vector_shuffle %2, <1,u,u,u>
+/// %4 = add %2, %3
+/// %result = extract_vector_elt %4, 0
+/// becomes :
+/// %5 = uaddv %0
+/// %result = extract_vector_elt %5, 0
+static SDValue
+performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) { // N is read as an element extract: operand 0 is the vector, operand 1 the lane index.
+ if (!Subtarget->hasNEON())
+ return SDValue();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Check if the input vector is fed by the ADD.
+ if (N0->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // The vector extract idx must constant zero because we only expect the final
+ // result of the reduction is placed in lane 0.
+ if (!isNullConstant(N1))
+ return SDValue();
+
+ EVT VTy = N0.getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ EVT EltTy = VTy.getVectorElementType();
+ if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) // Only i8, i16 and i32 elements are handled.
+ return SDValue();
+
+ if (VTy.getSizeInBits() < 64) // Vectors narrower than 64 bits are not handled.
+ return SDValue();
+
+ return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
+}
+
+/// Target-specific DAG combine for NEON structured load/store intrinsics that
+/// merges a separate update of the base address into the memory operation,
+/// producing the matching "post" (write-back) node.
+///
+/// The last operand of \p N is taken as the address. The function searches the
+/// users of that address for an ADD that is independent of \p N. A constant
+/// increment is accepted only when it equals the byte count computed from the
+/// intrinsic (NumVecs * vector size, or NumVecs * element size for the lane
+/// and dup forms); a non-constant increment is passed through unchanged. On
+/// success both \p N and the ADD are replaced through \p DCI.
+///
+/// \returns always an empty SDValue; replacements are made via DCI.CombineTo.
+static SDValue performNEONPostLDSTCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  unsigned AddrOpIdx = N->getNumOperands() - 1;
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool IsStore = false;
+    bool IsLaneOp = false;
+    bool IsDupOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: llvm_unreachable("unexpected intrinsic for Neon base update");
+    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
+      NumVecs = 2; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
+      NumVecs = 3; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
+      NumVecs = 4; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
+      NumVecs = 2; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
+      NumVecs = 3; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
+      NumVecs = 4; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
+      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
+      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
+      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
+    }
+
+    // Stores carry the vector type on their first data operand, loads on
+    // their first result.
+    EVT VecTy;
+    if (IsStore)
+      VecTy = N->getOperand(2).getValueType();
+    else
+      VecTy = N->getValueType(0);
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      // Compare at the full 64-bit width. Narrowing the constant to 32 bits
+      // first would let an increment such as 0x100000010 pass for a 16-byte
+      // access.
+      uint64_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+      // Lane and dup forms touch one element per vector, not a whole vector.
+      if (IsLaneOp || IsDupOp)
+        NumBytes /= VecTy.getVectorNumElements();
+      if (IncVal != NumBytes)
+        continue;
+      // A matching constant increment is represented by the XZR register.
+      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
+    }
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // Incoming chain
+    // Load lane and store have vector list as input.
+    if (IsLaneOp || IsStore)
+      for (unsigned i = 2; i < AddrOpIdx; ++i)
+        Ops.push_back(N->getOperand(i));
+    Ops.push_back(Addr); // Base register
+    Ops.push_back(Inc);
+
+    // Return Types: up to four result vectors, the write-back register and
+    // the chain, hence at most six entries.
+    EVT Tys[6];
+    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i64;  // Type of write back register
+    Tys[n] = MVT::Other;  // Type of the chain
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses: N's results map to the new vector results followed by
+    // the new chain (index NumResultVecs + 1); the ADD is replaced by the
+    // write-back value (index NumResultVecs).
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
+// Checks whether the value V is known to fit the prescribed width and reports
+// how it is extended.
+//
+//  - LOAD: accepted when the memory type is i8 (width 8) or i16 (width 16);
+//    ExtType is set to the load's extension type.
+//  - AssertSext / AssertZext: accepted when the asserted type is i8 / i16 for
+//    the matching width; ExtType becomes SEXTLOAD / ZEXTLOAD respectively.
+//  - Constant / TargetConstant: accepted when |value| < 2^(width-1); ExtType
+//    stays NON_EXTLOAD.
+//  - Anything else is rejected.
+//
+// ExtType is always reset to NON_EXTLOAD on entry, including when the function
+// returns false.
+static
+bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+  ExtType = ISD::NON_EXTLOAD;
+
+  // True when VT is the integer type that corresponds to the requested width.
+  auto HasRequestedWidth = [width](EVT VT) {
+    return (VT == MVT::i8 && width == 8) || (VT == MVT::i16 && width == 16);
+  };
+
+  switch(V.getNode()->getOpcode()) {
+  default:
+    return false;
+  case ISD::LOAD: {
+    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
+    if (!HasRequestedWidth(LoadNode->getMemoryVT()))
+      return false;
+    ExtType = LoadNode->getExtensionType();
+    return true;
+  }
+  case ISD::AssertSext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if (!HasRequestedWidth(TypeNode->getVT()))
+      return false;
+    ExtType = ISD::SEXTLOAD;
+    return true;
+  }
+  case ISD::AssertZext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if (!HasRequestedWidth(TypeNode->getVT()))
+      return false;
+    ExtType = ISD::ZEXTLOAD;
+    return true;
+  }
+  case ISD::Constant:
+  case ISD::TargetConstant: {
+    // Equivalent to |C| < 2^(width-1), written as a two-sided range check.
+    // Taking std::abs of INT64_MIN is undefined and typically yields a
+    // negative number, which would wrongly classify that constant as narrow.
+    const int64_t Limit = 1LL << (width - 1);
+    const int64_t C = cast<ConstantSDNode>(V.getNode())->getSExtValue();
+    return C > -Limit && C < Limit;
+  }
+  }
+}
+
+// This function does a whole lot of voodoo to determine if the tests are
+// equivalent without and with a mask. Essentially what happens is that given a
+// DAG resembling:
+//
+// +-------------+ +-------------+ +-------------+ +-------------+
+// | Input | | AddConstant | | CompConstant| | CC |
+// +-------------+ +-------------+ +-------------+ +-------------+
+// | | | |
+// V V | +----------+
+// +-------------+ +----+ | |
+// | ADD | |0xff| | |
+// +-------------+ +----+ | |
+// | | | |
+// V V | |
+// +-------------+ | |
+// | AND | | |
+// +-------------+ | |
+// | | |
+// +-----+ | |
+// | | |
+// V V V
+// +-------------+
+// | CMP |
+// +-------------+
+//
+// The AND node may be safely removed for some combinations of inputs. In
+// particular we need to take into account the extension type of the Input,
+// the exact values of AddConstant, CompConstant, and CC, along with the nominal
+// width of the input (this can work for inputs of any width; the graph above
+// is specific to 8 bits).
+//
+// The specific equations were worked out by generating output tables for each
+// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
+// problem was simplified by working with 4 bit inputs, which means we only
+// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
+// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
+// patterns present in both extensions (0,7). For every distinct set of
+// AddConstant and CompConstants bit patterns we can consider the masked and
+// unmasked versions to be equivalent if the result of this function is true for
+// all 16 distinct bit patterns of the current extension type of Input (w0).
+//
+// sub w8, w0, w1
+// and w10, w8, #0x0f
+// cmp w8, w2
+// cset w9, AArch64CC
+// cmp w10, w2
+// cset w11, AArch64CC
+// cmp w9, w11
+// cset w0, eq
+// ret
+//
+// Since the above function shows when the outputs are equivalent it defines
+// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
+// would be expensive to run during compiles. The equations below were written
+// in a test harness that confirmed they gave equivalent outputs to the above
+// function for all inputs, so they can be used to determine if the removal is
+// legal instead.
+//
+// isEquivalentMaskless() is the code for testing if the AND can be removed
+// factored out of the DAG recognition as the DAG can take several forms.
+
+// Returns true when, for condition code CC, comparing the unmasked ADD result
+// against CompConstant is judged equivalent to comparing the masked result.
+//
+//  CC           - an AArch64CC condition code value.
+//  width        - width of the mask in bits; used here only as a shift amount.
+//  ExtType      - extension type of the ADD input; only SEXTLOAD is treated
+//                 specially.
+//  AddConstant  - the constant operand of the ADD.
+//  CompConstant - the constant the result is compared against.
+static bool isEquivalentMaskless(unsigned CC, unsigned width,
+ ISD::LoadExtType ExtType, int AddConstant,
+ int CompConstant) {
+ // By being careful about our equations and only writing them in terms of
+ // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
+ // make them generally applicable to all bit widths.
+ // Note: despite its name, MaxUInt holds 2^width, i.e. one more than the
+ // largest unsigned value representable in width bits.
+ int MaxUInt = (1 << width);
+
+ // For the purposes of these comparisons sign extending the type is
+ // equivalent to zero extending the add and displacing it by half the integer
+ // width. Provided we are careful and make sure our equations are valid over
+ // the whole range we can just adjust the input and avoid writing equations
+ // for sign extended inputs.
+ if (ExtType == ISD::SEXTLOAD)
+ AddConstant -= (1 << (width-1));
+
+ // Each pair of cases below shares one set of equations. When none of the
+ // alternatives holds, control leaves the switch and the function returns
+ // false.
+ switch(CC) {
+ case AArch64CC::LE:
+ case AArch64CC::GT: {
+ if ((AddConstant == 0) ||
+ (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
+ (AddConstant >= 0 && CompConstant < 0) ||
+ (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
+ return true;
+ } break;
+ case AArch64CC::LT:
+ case AArch64CC::GE: {
+ if ((AddConstant == 0) ||
+ (AddConstant >= 0 && CompConstant <= 0) ||
+ (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
+ return true;
+ } break;
+ case AArch64CC::HI:
+ case AArch64CC::LS: {
+ if ((AddConstant >= 0 && CompConstant < 0) ||
+ (AddConstant <= 0 && CompConstant >= -1 &&
+ CompConstant < AddConstant + MaxUInt))
+ return true;
+ } break;
+ case AArch64CC::PL:
+ case AArch64CC::MI: {
+ if ((AddConstant == 0) ||
+ (AddConstant > 0 && CompConstant <= 0) ||
+ (AddConstant < 0 && CompConstant <= AddConstant))
+ return true;
+ } break;
+ case AArch64CC::LO:
+ case AArch64CC::HS: {
+ if ((AddConstant >= 0 && CompConstant <= 0) ||
+ (AddConstant <= 0 && CompConstant >= 0 &&
+ CompConstant <= AddConstant + MaxUInt))
+ return true;
+ } break;
+ case AArch64CC::EQ:
+ case AArch64CC::NE: {
+ if ((AddConstant > 0 && CompConstant < 0) ||
+ (AddConstant < 0 && CompConstant >= 0 &&
+ CompConstant < AddConstant + MaxUInt) ||
+ (AddConstant >= 0 && CompConstant >= 0 &&
+ CompConstant >= AddConstant) ||
+ (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
+
+ return true;
+ } break;
+ // These four condition codes are accepted unconditionally.
+ case AArch64CC::VS:
+ case AArch64CC::VC:
+ case AArch64CC::AL:
+ case AArch64CC::NV:
+ return true;
+ // An invalid condition code is never considered equivalent.
+ case AArch64CC::Invalid:
+ break;
+ }
+
+ return false;
+}
+
+// Tries to remove a superfluous AND-with-0xff / AND-with-0xffff that sits
+// between an ADD-with-constant and the SUBS feeding a conditional node.
+//
+//  N        - the node consuming the condition.
+//  CCIndex  - index of N's operand holding the AArch64CC condition code.
+//  CmpIndex - index of N's operand produced by the compare.
+//
+// The expected shape is N(..., CC, SUBS(AND(ADD(Input, C1), Mask), C2)). The
+// mask is dropped only if checkValueWidth accepts all three inputs for the
+// mask width and isEquivalentMaskless accepts (CC, C1, C2).
+//
+// Returns SDValue(N, 0) after replacing the SUBS, or an empty SDValue when
+// nothing was changed.
+//
+// NOTE(review): the replacement applies to every user of the SUBS flags, but
+// equivalence is only established for the condition code carried by N.
+// Confirm that no other flag user with a different condition code can exist
+// here.
+static
+SDValue performCONDCombine(SDNode *N,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           SelectionDAG &DAG, unsigned CCIndex,
+                           unsigned CmpIndex) {
+  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
+  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
+  unsigned CondOpcode = SubsNode->getOpcode();
+
+  if (CondOpcode != AArch64ISD::SUBS)
+    return SDValue();
+
+  // Removing the mask changes the arithmetic result of the SUBS, so the
+  // rewrite is only sound when nothing reads that result (value 0); only the
+  // flags may be live.
+  if (!SubsNode->hasNUsesOfValue(0, 0))
+    return SDValue();
+
+  // There is a SUBS feeding this condition.  Is it fed by a mask we can
+  // use?
+
+  SDNode *AndNode = SubsNode->getOperand(0).getNode();
+  unsigned MaskBits = 0;
+
+  if (AndNode->getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
+    // Keep all 64 bits of the mask: truncating to 32 bits would make a wide
+    // mask such as 0x1000000ff look like a plain 0xff.
+    uint64_t CNV = CN->getZExtValue();
+    if (CNV == 255)
+      MaskBits = 8;
+    else if (CNV == 65535)
+      MaskBits = 16;
+  }
+
+  if (!MaskBits)
+    return SDValue();
+
+  SDValue AddValue = AndNode->getOperand(0);
+
+  if (AddValue.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The basic dag structure is correct, grab the inputs and validate them.
+
+  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
+  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
+  SDValue SubsInputValue = SubsNode->getOperand(1);
+
+  // The mask is present and the provenance of all the values is a smaller type,
+  // lets see if the mask is superfluous.
+
+  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
+      !isa<ConstantSDNode>(SubsInputValue.getNode()))
+    return SDValue();
+
+  // checkValueWidth overwrites ExtType on every call, so the non-constant ADD
+  // input is checked last: its extension type is the one that must survive.
+  ISD::LoadExtType ExtType;
+
+  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+    return SDValue();
+
+  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
+                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+    return SDValue();
+
+  // The AND is not necessary, remove it.
+
+  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
+                               SubsNode->getValueType(1));
+  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+
+  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
+  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
+
+  return SDValue(N, 0);
+}
+
+// Fold an EQ/NE compare against zero into the conditional branch, producing
+// CBZ / CBNZ. The new branch is installed through DCI.CombineTo, so the
+// function itself always returns an empty SDValue.
+static SDValue performBRCONDCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG) {
+  // Give the mask-removal combine a chance first and carry on with whichever
+  // node it hands back.
+  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
+    N = NV.getNode();
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Dest = N->getOperand(1);
+  SDValue CCVal = N->getOperand(2);
+  SDValue Cmp = N->getOperand(3);
+
+  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
+  const unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
+  const bool IsEQ = (CC == AArch64CC::EQ);
+  if (!IsEQ && CC != AArch64CC::NE)
+    return SDValue();
+
+  const unsigned CmpOpc = Cmp.getOpcode();
+  const bool IsFlagSettingAddSub =
+      CmpOpc == AArch64ISD::ADDS || CmpOpc == AArch64ISD::SUBS;
+  if (!IsFlagSettingAddSub)
+    return SDValue();
+
+  // The arithmetic result must be unused and the flags must have exactly one
+  // user.
+  if (!(Cmp->hasNUsesOfValue(0, 0) && Cmp->hasNUsesOfValue(1, 1)))
+    return SDValue();
+
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+
+  assert(LHS.getValueType() == RHS.getValueType() &&
+         "Expected the value type to be the same for both operands!");
+  EVT OperandVT = LHS.getValueType();
+  if (OperandVT != MVT::i32 && OperandVT != MVT::i64)
+    return SDValue();
+
+  // Put the zero, if there is one, on the right-hand side.
+  if (isNullConstant(LHS))
+    std::swap(LHS, RHS);
+
+  if (!isNullConstant(RHS))
+    return SDValue();
+
+  // Shifted values are left alone.
+  switch (LHS.getOpcode()) {
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    return SDValue();
+  default:
+    break;
+  }
+
+  // Fold the compare into the branch instruction.
+  const unsigned BranchOpc = IsEQ ? AArch64ISD::CBZ : AArch64ISD::CBNZ;
+  SDValue BR = DAG.getNode(BranchOpc, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+
+  // Do not add new nodes to DAG combiner worklist.
+  DCI.CombineTo(N, BR, false);
+
+  return SDValue();
+}
+
+// Looks through simple single-use operations feeding a tbz/tbnz and returns
+// the operand that should be tested instead. Bit is updated to the bit to test
+// in the returned operand and Invert is toggled whenever the sense of the test
+// flips. This has to be done here (rather than by the standard dag combines)
+// because AArch64ISD::TBZ is matched during legalization.
+//
+// Undef and constant-folding cases are not handled; they should already have
+// been taken care of (e.g. and of 0, test of undefined shifted bits, etc.).
+static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
+                                 SelectionDAG &DAG) {
+  // Only nodes with a single use are looked through.
+  if (!Op->hasOneUse())
+    return Op;
+
+  const unsigned Opc = Op->getOpcode();
+  const unsigned OpBits = Op->getValueType(0).getSizeInBits();
+
+  // Continue the search in the first operand of Op.
+  auto LookThrough = [&]() {
+    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+  };
+
+  // (tbz (trunc x), b) -> (tbz x, b)
+  // This case is just here to enable more of the below cases to be caught.
+  if (Opc == ISD::TRUNCATE && Bit < OpBits)
+    return LookThrough();
+
+  // Everything below needs a binary node with a constant right-hand side.
+  if (Op->getNumOperands() != 2)
+    return Op;
+
+  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+  if (!C)
+    return Op;
+
+  switch (Opc) {
+  // (tbz (and x, m), b) -> (tbz x, b)   when bit b of m is set
+  case ISD::AND: {
+    const uint64_t Mask = C->getZExtValue();
+    if (((Mask >> Bit) & 1) == 0)
+      return Op;
+    return LookThrough();
+  }
+
+  // (tbz (shl x, c), b) -> (tbz x, b-c)
+  case ISD::SHL: {
+    const uint64_t ShAmt = C->getZExtValue();
+    if (ShAmt > Bit || (Bit - ShAmt) >= OpBits)
+      return Op;
+    Bit = Bit - ShAmt;
+    return LookThrough();
+  }
+
+  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
+  case ISD::SRA: {
+    Bit = Bit + C->getZExtValue();
+    if (Bit >= OpBits)
+      Bit = OpBits - 1;
+    return LookThrough();
+  }
+
+  // (tbz (srl x, c), b) -> (tbz x, b+c)
+  case ISD::SRL: {
+    const uint64_t ShAmt = C->getZExtValue();
+    if ((Bit + ShAmt) >= OpBits)
+      return Op;
+    Bit = Bit + ShAmt;
+    return LookThrough();
+  }
+
+  // (tbz (xor x, -1), b) -> (tbnz x, b)
+  // More generally, the test flips whenever bit b of the xor constant is set.
+  case ISD::XOR: {
+    if ((C->getZExtValue() >> Bit) & 1)
+      Invert = !Invert;
+    return LookThrough();
+  }
+
+  default:
+    return Op;
+  }
+}
+
+// Simplify the tested operand of a TBZ / TBNZ. When getTestBitOperand finds a
+// different operand to test, a new branch is built on that operand with the
+// adjusted bit number, and with the opposite opcode if the test was inverted.
+// Returns an empty SDValue when the operand is unchanged.
+static SDValue performTBZCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 SelectionDAG &DAG) {
+  SDValue OldSrc = N->getOperand(1);
+  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  bool Invert = false;
+
+  SDValue NewSrc = getTestBitOperand(OldSrc, Bit, Invert, DAG);
+  if (NewSrc == OldSrc)
+    return SDValue();
+
+  // Swap TBZ <-> TBNZ when the looked-through operations inverted the bit.
+  unsigned NewOpc = N->getOpcode();
+  if (Invert) {
+    if (NewOpc == AArch64ISD::TBZ)
+      NewOpc = AArch64ISD::TBNZ;
+    else {
+      assert(NewOpc == AArch64ISD::TBNZ);
+      NewOpc = AArch64ISD::TBZ;
+    }
+  }
+
+  SDLoc DL(N);
+  SDValue BitIdx = DAG.getConstant(Bit, DL, MVT::i64);
+  // Operands 0 and 3 of the original node are carried over unchanged.
+  return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewSrc, BitIdx,
+                     N->getOperand(3));
+}
+
+// Rewrites
+//     vselect (v1i1 setcc), a, b
+// into
+//     vselect (v1iXX setcc), a, b
+// where XX is the size of the compared operand type.
+// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
+// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
+// such VSELECT.
+static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Cond = N->getOperand(0);
+  EVT CondVT = Cond.getValueType();
+
+  const bool IsV1I1SetCC = Cond.getOpcode() == ISD::SETCC &&
+                           CondVT.getVectorNumElements() == 1 &&
+                           CondVT.getVectorElementType() == MVT::i1;
+  if (!IsV1I1SetCC)
+    return SDValue();
+
+  SDValue CmpLHS = Cond.getOperand(0);
+  SDValue CmpRHS = Cond.getOperand(1);
+
+  // The rewrite is only done when the select result is exactly as wide as the
+  // values being compared.
+  EVT ResVT = N->getValueType(0);
+  EVT CmpVT = CmpLHS.getValueType();
+  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
+    return SDValue();
+
+  SDLoc DL(N);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+  SDValue WideSetCC = DAG.getSetCC(
+      DL, CmpVT.changeVectorElementTypeToInteger(), CmpLHS, CmpRHS, CC);
+  return DAG.getNode(ISD::VSELECT, DL, ResVT, WideSetCC, N->getOperand(1),
+                     N->getOperand(2));
+}
+
+/// A select between vectors that is controlled by a scalar setcc, i.e.
+/// "(select (setcc LHS, RHS), vL, vR)", is best performed with the
+/// compare-mask instructions rather than going via NZCV. This replaces the
+/// scalar setcc with a vector setcc whose lane 0 holds the comparison,
+/// followed by a DUP shuffle that broadcasts that lane to build the mask.
+static SDValue performSelectCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  EVT ResVT = N->getValueType(0);
+
+  if (N0.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // The SETCC result is expected to be i1 (initial DAG) or i32 (the lowered
+  // scalar SetCCResultType). Vector results are not expected because selects
+  // fed by vector SETCCs are assumed to be canonicalized to VSELECT.
+  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
+         "Scalar-SETCC feeding SELECT has unexpected result type!");
+
+  // A setcc on i1 operands is not worth converting: there are no legal
+  // vectors of i1.
+  EVT ScalarVT = N0.getOperand(0).getValueType();
+  if (ScalarVT == MVT::i1)
+    return SDValue();
+
+  // Zero mask elements means the comparison is wider than the select result.
+  // The largest real NEON comparison is 64 bits per lane, so the result would
+  // be at most 32 bits and an illegal vector. Just bail out for now.
+  int NumMaskElts = ResVT.getSizeInBits() / ScalarVT.getSizeInBits();
+  if (!ResVT.isVector() || NumMaskElts == 0)
+    return SDValue();
+
+  EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumMaskElts);
+  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
+
+  // The mask vector must be exactly as wide as the result. That fails when
+  // the SETCC operand size does not divide the result size (e.g., f64 vs
+  // v3f32).
+  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
+    return SDValue();
+
+  // Make sure we didn't create illegal types, if we're not supposed to.
+  assert(DCI.isBeforeLegalize() ||
+         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
+
+  SDLoc DL(N0);
+
+  // Places a scalar into lane 0 of a SrcVT vector.
+  auto ToVector = [&](SDValue Scalar) {
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, Scalar);
+  };
+
+  // Vector comparison; only lane 0 is meaningful.
+  SDValue LHS = ToVector(N0.getOperand(0));
+  SDValue RHS = ToVector(N0.getOperand(1));
+  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
+
+  // Broadcast lane 0 of the comparison to every lane, then reinterpret the
+  // mask with the integer form of the result type.
+  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
+  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
+  Mask = DAG.getNode(ISD::BITCAST, DL,
+                     ResVT.changeVectorElementTypeToInteger(), Mask);
+
+  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
+}
+
+/// Fold away an NVCAST whose operand already has the result type; such a cast
+/// changes nothing. Returns the operand in that case and an empty SDValue
+/// otherwise.
+static SDValue performNVCASTCombine(SDNode *N) {
+  SDValue Src = N->getOperand(0);
+  const bool IsNoOpCast = N->getValueType(0) == Src.getValueType();
+  return IsNoOpCast ? Src : SDValue();
+}
+
+// Dispatches N to the AArch64-specific DAG combine registered for its opcode.
+// Opcodes without a case below, and chained intrinsics other than the NEON
+// structured load/store ones listed, produce an empty SDValue.
+SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ return performAddSubLongCombine(N, DCI, DAG);
+ case ISD::XOR:
+ return performXorCombine(N, DAG, DCI, Subtarget);
+ case ISD::MUL:
+ return performMulCombine(N, DAG, DCI, Subtarget);
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ return performIntToFpCombine(N, DAG, Subtarget);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return performFpToIntCombine(N, DAG, DCI, Subtarget);
+ case ISD::FDIV:
+ return performFDivCombine(N, DAG, DCI, Subtarget);
+ case ISD::OR:
+ return performORCombine(N, DCI, Subtarget);
+ case ISD::SRL:
+ return performSRLCombine(N, DCI);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return performIntrinsicCombine(N, DCI, Subtarget);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ return performExtendCombine(N, DCI, DAG);
+ case ISD::BITCAST:
+ return performBitcastCombine(N, DCI, DAG);
+ case ISD::CONCAT_VECTORS:
+ return performConcatVectorsCombine(N, DCI, DAG);
+ case ISD::SELECT: {
+ // Try the scalar-setcc select combine first; only if it produces nothing is
+ // the across-lane min/max reduction combine attempted.
+ SDValue RV = performSelectCombine(N, DCI);
+ if (!RV.getNode())
+ RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
+ return RV;
+ }
+ case ISD::VSELECT:
+ return performVSelectCombine(N, DCI.DAG);
+ case ISD::LOAD:
+ // Operand 1 is handed to the TBI simplification; when it reports a change
+ // the load itself is returned, otherwise fall out of the switch.
+ if (performTBISimplification(N->getOperand(1), DCI, DAG))
+ return SDValue(N, 0);
+ break;
+ case ISD::STORE:
+ return performSTORECombine(N, DCI, DAG, Subtarget);
+ case AArch64ISD::BRCOND:
+ return performBRCONDCombine(N, DCI, DAG);
+ case AArch64ISD::TBNZ:
+ case AArch64ISD::TBZ:
+ return performTBZCombine(N, DCI, DAG);
+ case AArch64ISD::CSEL:
+ return performCONDCombine(N, DCI, DAG, 2, 3);
+ case AArch64ISD::DUP:
+ return performPostLD1Combine(N, DCI, false);
+ case AArch64ISD::NVCAST:
+ return performNVCASTCombine(N);
+ case ISD::INSERT_VECTOR_ELT:
+ return performPostLD1Combine(N, DCI, true);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN:
+ // Operand 1 holds the intrinsic ID. Only the NEON structured load/store
+ // intrinsics below are candidates for folding in a base-address update.
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::aarch64_neon_ld2:
+ case Intrinsic::aarch64_neon_ld3:
+ case Intrinsic::aarch64_neon_ld4:
+ case Intrinsic::aarch64_neon_ld1x2:
+ case Intrinsic::aarch64_neon_ld1x3:
+ case Intrinsic::aarch64_neon_ld1x4:
+ case Intrinsic::aarch64_neon_ld2lane:
+ case Intrinsic::aarch64_neon_ld3lane:
+ case Intrinsic::aarch64_neon_ld4lane:
+ case Intrinsic::aarch64_neon_ld2r:
+ case Intrinsic::aarch64_neon_ld3r:
+ case Intrinsic::aarch64_neon_ld4r:
+ case Intrinsic::aarch64_neon_st2:
+ case Intrinsic::aarch64_neon_st3:
+ case Intrinsic::aarch64_neon_st4:
+ case Intrinsic::aarch64_neon_st1x2:
+ case Intrinsic::aarch64_neon_st1x3:
+ case Intrinsic::aarch64_neon_st1x4:
+ case Intrinsic::aarch64_neon_st2lane:
+ case Intrinsic::aarch64_neon_st3lane:
+ case Intrinsic::aarch64_neon_st4lane:
+ return performNEONPostLDSTCombine(N, DCI, DAG);
+ default:
+ break;
+ }
+ }
+ // No combine applied.
+ return SDValue();
+}
+
+// Determines whether the single result of N is consumed only by a return,
+// since otherwise a tail call cannot be performed. Target ISD return nodes and
+// other "odd" constructs that the generic analysis code won't necessarily
+// catch are checked here. On success Chain is updated to the chain the tail
+// call should use; on failure Chain is left untouched.
+bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
+                                               SDValue &Chain) const {
+  // N must define exactly one value and that value must have exactly one use.
+  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDNode *Copy = *N->use_begin();
+  SDValue TCChain = Chain;
+
+  switch (Copy->getOpcode()) {
+  case ISD::CopyToReg: {
+    // If the copy has a glue operand, we conservatively assume it isn't safe
+    // to perform a tail call.
+    SDValue LastOperand = Copy->getOperand(Copy->getNumOperands() - 1);
+    if (LastOperand.getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+    break;
+  }
+  case ISD::FP_EXTEND:
+    break;
+  default:
+    return false;
+  }
+
+  // Every user of the copy has to be a return, and there has to be at least
+  // one such user.
+  bool SawReturn = false;
+  for (SDNode *User : Copy->uses()) {
+    if (User->getOpcode() != AArch64ISD::RET_FLAG)
+      return false;
+    SawReturn = true;
+  }
+  if (!SawReturn)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+// Return whether an instruction can potentially be optimized to a tail
+// call. This will cause the optimizers to attempt to move, or duplicate,
+// return instructions to help enable tail call optimizations for this
+// instruction. The answer is simply whether the call is marked as a tail call.
+bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+ return CI->isTailCall();
+}
+
+// Splits an ADD/SUB of a constant into the base and offset of an indexed
+// address. Succeeds only when the (sign-adjusted) constant fits the signed
+// 9-bit immediate that all indexed addressing mode instructions take.
+//
+// Base is set to operand 0 as soon as the opcode is accepted, even if the
+// function later fails. On success Offset is operand 1 as written (it is not
+// negated for SUB) and IsInc tells whether the node is an ADD. AM is not
+// written here.
+bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   bool &IsInc,
+                                                   SelectionDAG &DAG) const {
+  const unsigned Opc = Op->getOpcode();
+  if (Opc != ISD::ADD && Opc != ISD::SUB)
+    return false;
+
+  Base = Op->getOperand(0);
+
+  auto *ImmNode = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+  if (!ImmNode)
+    return false;
+
+  int64_t Imm = ImmNode->getSExtValue();
+  // Negate in unsigned arithmetic so the most negative value cannot overflow.
+  if (Opc == ISD::SUB)
+    Imm = -static_cast<uint64_t>(Imm);
+  if (!isInt<9>(Imm))
+    return false;
+
+  IsInc = (Opc == ISD::ADD);
+  Offset = Op->getOperand(1);
+  return true;
+}
+
+// Decides whether the load or store N can use pre-indexed addressing. The
+// base pointer of N is decomposed by getIndexedAddressParts; on success Base
+// and Offset are filled in and AM is set to PRE_INC or PRE_DEC. Returns false
+// for nodes that are neither loads nor stores, or whose base pointer cannot be
+// decomposed.
+bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+  // Only the base pointer is needed; the memory type does not influence the
+  // decision.
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    Ptr = LD->getBasePtr();
+  else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    Ptr = ST->getBasePtr();
+  else
+    return false;
+
+  bool IsInc;
+  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
+    return false;
+  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
+  return true;
+}
+
+// Decides whether the load or store N together with the address update Op can
+// use post-indexed addressing. Op is decomposed by getIndexedAddressParts and
+// its base must be the very pointer N accesses. On success Base and Offset are
+// filled in and AM is set to POST_INC or POST_DEC. Note that Base and Offset
+// may already have been written when the function returns false.
+bool AArch64TargetLowering::getPostIndexedAddressParts(
+    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
+    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
+  // Only the base pointer is needed; the memory type does not influence the
+  // decision.
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    Ptr = LD->getBasePtr();
+  else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    Ptr = ST->getBasePtr();
+  else
+    return false;
+
+  bool IsInc;
+  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+    return false;
+  // Post-indexing updates the base, so it's not a valid transform
+  // if that's not the same as the load's pointer.
+  if (Ptr != Base)
+    return false;
+  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+  return true;
+}
+
+// Custom result replacement for BITCAST. Only the f16 -> i16 case is handled:
+// the half value is inserted into the hsub subregister of an otherwise
+// undefined f32, that f32 is bitcast to i32, and the i32 is truncated to i16.
+// For any other type combination nothing is added to Results.
+static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                  SelectionDAG &DAG) {
+  SDValue Src = N->getOperand(0);
+
+  const bool IsF16ToI16 =
+      N->getValueType(0) == MVT::i16 && Src.getValueType() == MVT::f16;
+  if (!IsF16ToI16)
+    return;
+
+  SDLoc DL(N);
+  SDValue SubRegIdx = DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32);
+  SDNode *Insert =
+      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
+                         DAG.getUNDEF(MVT::i32), Src, SubRegIdx);
+  SDValue AsI32 = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SDValue(Insert, 0));
+  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, AsI32));
+}
+
+// Custom result replacement for an across-lanes reduction. Operand 0 of N is
+// split into two halves, the halves are combined lane-wise with InterOp, and
+// the AcrossOp reduction is then applied to that half-width vector. The single
+// resulting value is appended to Results.
+static void ReplaceReductionResults(SDNode *N,
+                                    SmallVectorImpl<SDValue> &Results,
+                                    SelectionDAG &DAG, unsigned InterOp,
+                                    unsigned AcrossOp) {
+  SDLoc dl(N);
+
+  // Both nodes below use the low-half type of the split result type.
+  EVT HalfVT = DAG.GetSplitDestVTs(N->getValueType(0)).first;
+  std::pair<SDValue, SDValue> Halves = DAG.SplitVectorOperand(N, 0);
+
+  SDValue Combined =
+      DAG.getNode(InterOp, dl, HalfVT, Halves.first, Halves.second);
+  Results.push_back(DAG.getNode(AcrossOp, dl, HalfVT, Combined));
+}
+
+// Splits the i128 value N into two i64 values. The first element of the
+// returned pair is N truncated to 64 bits (the low half); the second is N
+// shifted right by 64 and then truncated (the high half).
+static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
+
+  SDValue ShiftAmt = DAG.getConstant(64, DL, MVT::i64);
+  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i128, N, ShiftAmt);
+  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, UpperBits);
+
+  return {Lo, Hi};
+}
+
+// Custom result replacement for a 128-bit ATOMIC_CMP_SWAP. Operands 2 and 3 of
+// N are each split into two i64 halves and passed, together with operand 1 and
+// the chain (operand 0), to a CMP_SWAP_128 machine node that inherits N's
+// memory operand. Results receives the node's two i64 values followed by its
+// chain; the i32 value (result 2) is not forwarded.
+static void ReplaceCMP_SWAP_128Results(SDNode *N,
+                                       SmallVectorImpl<SDValue> & Results,
+                                       SelectionDAG &DAG) {
+  assert(N->getValueType(0) == MVT::i128 &&
+         "AtomicCmpSwap on types less than 128 should be legal");
+
+  std::pair<SDValue, SDValue> Desired = splitInt128(N->getOperand(2), DAG);
+  std::pair<SDValue, SDValue> New = splitInt128(N->getOperand(3), DAG);
+
+  SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
+                   New.first,        New.second,    N->getOperand(0)};
+  SDVTList ResultTys =
+      DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other);
+  SDNode *CmpSwap =
+      DAG.getMachineNode(AArch64::CMP_SWAP_128, SDLoc(N), ResultTys, Ops);
+
+  // Attach the memory operand of the original atomic to the machine node.
+  MachineSDNode::mmo_iterator MemOp =
+      DAG.getMachineFunction().allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+  // Forward the two i64 values and the chain, in that order.
+  for (unsigned ResNo : {0u, 1u, 3u})
+    Results.push_back(SDValue(CmpSwap, ResNo));
+}
+
+// Custom expansion of nodes with illegal result types. Dispatches on the
+// opcode of N; replacement values, if any, are appended to Results. Reaching
+// an opcode that is not listed is a programming error.
+void AArch64TargetLowering::ReplaceNodeResults(
+    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+  const unsigned Opc = N->getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this");
+
+  case ISD::BITCAST:
+    ReplaceBITCASTResults(N, Results, DAG);
+    return;
+
+  // Across-lanes reductions: combine the two halves with the matching
+  // lane-wise operation, then apply the same reduction opcode again.
+  case AArch64ISD::SADDV:
+  case AArch64ISD::UADDV:
+    ReplaceReductionResults(N, Results, DAG, ISD::ADD, Opc);
+    return;
+  case AArch64ISD::SMINV:
+    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, Opc);
+    return;
+  case AArch64ISD::UMINV:
+    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, Opc);
+    return;
+  case AArch64ISD::SMAXV:
+    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, Opc);
+    return;
+  case AArch64ISD::UMAXV:
+    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, Opc);
+    return;
+
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:
+    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
+    // Let normal code take care of it by not adding anything to Results.
+    return;
+
+  case ISD::ATOMIC_CMP_SWAP:
+    ReplaceCMP_SWAP_128Results(N, Results, DAG);
+    return;
+  }
+}
+
+ /// Always true on non-Android targets; Android defers to the generic
+ /// TargetLowering answer.
+ bool AArch64TargetLowering::useLoadStackGuardNode() const {
+   if (Subtarget->isTargetAndroid())
+     return TargetLowering::useLoadStackGuardNode();
+   return true;
+ }
+
+ /// Minimum number of FDIVs sharing one divisor before they are rewritten as
+ /// FMULs by the reciprocal.
+ unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
+   // Three or more FDIVs by the same divisor are worth turning into one
+   // reciprocal plus that many FMULs.
+   const unsigned MinRepeatedDivisors = 3;
+   return MinRepeatedDivisors;
+ }
+
+ /// Pick the type-legalization action for the vector type \p VT.
+ ///
+ /// The single-element types v1i8, v1i16, v1i32 and v1f32 are widened rather
+ /// than promoted; every other type gets the TargetLoweringBase default.
+ TargetLoweringBase::LegalizeTypeAction
+ AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
+   switch (VT.getSimpleVT().SimpleTy) {
+   case MVT::v1i8:
+   case MVT::v1i16:
+   case MVT::v1i32:
+   case MVT::v1f32:
+     return TypeWidenVector;
+   default:
+     return TargetLoweringBase::getPreferredVectorAction(VT);
+   }
+ }
+
+ // Only 128-bit atomic stores are expanded in IR. Narrower stores are already
+ // atomic, and wider ones are left to the default libcall handling.
+ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+   Type *StoredTy = SI->getValueOperand()->getType();
+   const unsigned SizeInBits = StoredTy->getPrimitiveSizeInBits();
+   return SizeInBits == 128;
+ }
+
+ // Only 128-bit atomic loads are expanded (to a load-linked/store-conditional
+ // sequence). Narrower loads are already atomic, and wider ones are left to the
+ // default libcall handling.
+ TargetLowering::AtomicExpansionKind
+ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+   const unsigned SizeInBits = LI->getType()->getPrimitiveSizeInBits();
+   if (SizeInBits == 128)
+     return AtomicExpansionKind::LLSC;
+   return AtomicExpansionKind::None;
+ }
+
+ // For the real atomic operations we have ldxr/stxr up to 128 bits, so any
+ // read-modify-write of at most that size is expanded to an LL/SC loop; larger
+ // operations are not expanded here.
+ TargetLowering::AtomicExpansionKind
+ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+   const unsigned SizeInBits = AI->getType()->getPrimitiveSizeInBits();
+   if (SizeInBits > 128)
+     return AtomicExpansionKind::None;
+   return AtomicExpansionKind::LLSC;
+ }
+
+ /// Expand cmpxchg in IR at every optimization level except -O0.
+ bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
+     AtomicCmpXchgInst *AI) const {
+   // At -O0 the fast register allocator cannot keep the vregs needed by an
+   // expanded cmpxchg live without spilling. When the exchanged address is
+   // itself on the stack and close enough to the spill slot, the spill can keep
+   // clearing the exclusive monitor so the operation never succeeds. -O0
+   // therefore relies on a late-expanded pseudo-instruction instead.
+   const bool IsOptimizing =
+       getTargetMachine().getOptLevel() != CodeGenOpt::None;
+   return IsOptimizing;
+ }
+
+ /// Emit the load-exclusive intrinsic call that reads the value at \p Addr.
+ ///
+ /// The acquiring intrinsic variants (ldaxr/ldaxp) are used when \p Ord is
+ /// acquire or stronger, the plain ones (ldxr/ldxp) otherwise. The returned
+ /// value has the pointee type of \p Addr.
+ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                              AtomicOrdering Ord) const {
+   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+   const bool IsAcquire = isAcquireOrStronger(Ord);
+
+   if (ValTy->getPrimitiveSizeInBits() != 128) {
+     // The single-register intrinsics are overloaded on the pointer type; the
+     // call result is truncated (or bitcast) back to the pointee type.
+     Type *Tys[] = {Addr->getType()};
+     Function *Ldxr = Intrinsic::getDeclaration(
+         M, IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr, Tys);
+     return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldxr, Addr), ValTy);
+   }
+
+   // Since i128 isn't legal and intrinsics don't get type-lowered, the pair
+   // intrinsic returns {i64, i64} and the halves have to be recombined into a
+   // single i128 here.
+   Function *Ldxp = Intrinsic::getDeclaration(
+       M, IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp);
+
+   Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+   Value *LoHi = Builder.CreateCall(Ldxp, Addr, "lohi");
+
+   Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+   Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+   Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+   Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+
+   // val = lo | (hi << 64)
+   Value *HiShifted = Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64));
+   return Builder.CreateOr(Lo, HiShifted, "val64");
+ }
+
+ /// Emit a call to the aarch64_clrex intrinsic at the builder's insertion point.
+ void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
+     IRBuilder<> &Builder) const {
+   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+   Function *ClrEx = Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex);
+   Builder.CreateCall(ClrEx);
+ }
+
+ /// Emit the store-exclusive intrinsic call that writes \p Val to \p Addr.
+ ///
+ /// The releasing intrinsic variants (stlxr/stlxp) are used when \p Ord is
+ /// release or stronger, the plain ones (stxr/stxp) otherwise.
+ /// \returns the emitted intrinsic call.
+ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+                                                    Value *Val, Value *Addr,
+                                                    AtomicOrdering Ord) const {
+   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+   const bool IsRelease = isReleaseOrStronger(Ord);
+
+   if (Val->getType()->getPrimitiveSizeInBits() != 128) {
+     // The single-register intrinsics are overloaded on the pointer type.
+     Type *Tys[] = {Addr->getType()};
+     Function *Stxr = Intrinsic::getDeclaration(
+         M, IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr, Tys);
+
+     // Extend (or bitcast) the value to the intrinsic's first parameter type.
+     Value *Widened = Builder.CreateZExtOrBitCast(
+         Val, Stxr->getFunctionType()->getParamType(0));
+     return Builder.CreateCall(Stxr, {Widened, Addr});
+   }
+
+   // Since the intrinsics must have legal type, the i128 intrinsics take two
+   // parameters: "i64, i64". Val is marshalled into that form before the call.
+   Function *Stxp = Intrinsic::getDeclaration(
+       M, IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp);
+   Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+   Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
+   Value *UpperBits = Builder.CreateLShr(Val, 64);
+   Value *Hi = Builder.CreateTrunc(UpperBits, Int64Ty, "hi");
+   Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+   return Builder.CreateCall(Stxp, {Lo, Hi, Addr});
+ }
+
+ /// Only array-typed arguments are reported as needing consecutive registers;
+ /// the calling convention and variadic-ness are not consulted.
+ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
+     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+   const bool IsArray = Ty->isArrayTy();
+   return IsArray;
+ }
+
+ /// Never request normalization to a select sequence, whatever the context or
+ /// value type.
+ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
+                                                             EVT) const {
+   const bool Normalize = false;
+   return Normalize;
+ }
+
+ /// Return the address of the stack-protector cookie.
+ ///
+ /// On Android this is a fixed offset from the thread pointer; every other
+ /// target uses the generic TargetLowering implementation.
+ Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+   if (Subtarget->isTargetAndroid()) {
+     // Android provides a fixed TLS slot for the stack cookie. See the
+     // definition of TLS_SLOT_STACK_GUARD in
+     // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+     const unsigned TlsOffset = 0x28;
+     Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+     Function *ThreadPointerFunc =
+         Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+
+     Value *ThreadPtr = IRB.CreateCall(ThreadPointerFunc);
+     Value *Slot = IRB.CreateConstGEP1_32(ThreadPtr, TlsOffset);
+     Type *SlotPtrTy = Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0);
+     return IRB.CreatePointerCast(Slot, SlotPtrTy);
+   }
+
+   return TargetLowering::getIRStackGuard(IRB);
+ }
+
+ /// Return the address of the SafeStack (unsafe stack) pointer.
+ ///
+ /// On Android this is a fixed offset from the thread pointer; every other
+ /// target uses the generic TargetLowering implementation.
+ Value *
+ AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+   if (Subtarget->isTargetAndroid()) {
+     // Android provides a fixed TLS slot for the SafeStack pointer. See the
+     // definition of TLS_SLOT_SAFESTACK in
+     // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+     const unsigned TlsOffset = 0x48;
+     Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+     Function *ThreadPointerFunc =
+         Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+
+     Value *ThreadPtr = IRB.CreateCall(ThreadPointerFunc);
+     Value *Slot = IRB.CreateConstGEP1_32(ThreadPtr, TlsOffset);
+     Type *SlotPtrTy = Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0);
+     return IRB.CreatePointerCast(Slot, SlotPtrTy);
+   }
+
+   return TargetLowering::getSafeStackPointerLocation(IRB);
+ }
+
+ /// Mark the function containing \p Entry as using split callee-saved-register
+ /// handling.
+ void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+   // Update IsSplitCSR in AArch64FunctionInfo.
+   MachineFunction *MF = Entry->getParent();
+   MF->getInfo<AArch64FunctionInfo>()->setIsSplitCSR(true);
+ }
+
+ /// Save and restore the "via copy" callee-saved registers with explicit COPYs.
+ ///
+ /// For every register in the zero-terminated list returned by
+ /// getCalleeSavedRegsViaCopy, a COPY into a fresh virtual register is inserted
+ /// at the start of \p Entry, and a COPY back into the physical register is
+ /// inserted before the first terminator of each block in \p Exits. Does
+ /// nothing when the target reports no such list.
+ void AArch64TargetLowering::insertCopiesSplitCSR(
+     MachineBasicBlock *Entry,
+     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+   MachineFunction *MF = Entry->getParent();
+   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+   const MCPhysReg *CSRList = TRI->getCalleeSavedRegsViaCopy(MF);
+   if (CSRList == nullptr)
+     return;
+
+   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+   MachineRegisterInfo *MRI = &MF->getRegInfo();
+   MachineBasicBlock::iterator InsertPt = Entry->begin();
+
+   for (const MCPhysReg *RegIt = CSRList; *RegIt; ++RegIt) {
+     const MCPhysReg PhysReg = *RegIt;
+
+     // Only 64-bit general-purpose and 64-bit FP registers are expected here.
+     const TargetRegisterClass *RC = nullptr;
+     if (AArch64::GPR64RegClass.contains(PhysReg))
+       RC = &AArch64::GPR64RegClass;
+     else if (AArch64::FPR64RegClass.contains(PhysReg))
+       RC = &AArch64::FPR64RegClass;
+     else
+       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+     unsigned SavedVReg = MRI->createVirtualRegister(RC);
+
+     // Create copy from CSR to a virtual register.
+     // FIXME: this currently does not emit CFI pseudo-instructions, it works
+     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+     // nounwind. If we want to generalize this later, we may need to emit
+     // CFI pseudo-instructions.
+     assert(MF->getFunction()->hasFnAttribute(Attribute::NoUnwind) &&
+            "Function should be nounwind in insertCopiesSplitCSR!");
+     Entry->addLiveIn(PhysReg);
+     BuildMI(*Entry, InsertPt, DebugLoc(), TII->get(TargetOpcode::COPY),
+             SavedVReg)
+         .addReg(PhysReg);
+
+     // Insert the copy-back instructions right before the terminator.
+     for (MachineBasicBlock *ExitMBB : Exits)
+       BuildMI(*ExitMBB, ExitMBB->getFirstTerminator(), DebugLoc(),
+               TII->get(TargetOpcode::COPY), PhysReg)
+           .addReg(SavedVReg);
+   }
+ }
+
+ /// Report whether an integer division of type \p VT should be kept as a real
+ /// division.
+ ///
+ /// True only for scalar types in functions carrying the MinSize attribute.
+ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+   // Vector division is never kept: AArch64 has no vector integer divide, so
+   // the operation would be scalarized, whereas the replacement sequence can
+   // stay in vector form. That is a loss even in terms of size.
+   if (VT.isVector())
+     return false;
+
+   // Integer division is expensive on AArch64, but when aggressively
+   // optimizing for code size a div instruction is usually smaller than the
+   // alternative sequence.
+   return Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
new file mode 100644
index 000000000000..054ccc31674f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -0,0 +1,607 @@
+//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AArch64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
+
+#include "AArch64.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+ namespace AArch64ISD {
+
+ /// AArch64-specific SelectionDAG node opcodes. Numbering starts at
+ /// ISD::BUILTIN_OP_END, and restarts at ISD::FIRST_TARGET_MEMORY_OPCODE for
+ /// the post-increment load/store group at the end of the list.
+ enum NodeType : unsigned {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
+ CALL, // Function call.
+
+ // Produces the full sequence of instructions for getting the thread pointer
+ // offset of a variable into X0, using the TLSDesc model.
+ TLSDESC_CALLSEQ,
+ ADRP, // Page address of a TargetGlobalAddress operand.
+ ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand.
+ LOADgot, // Load from automatically generated descriptor (e.g. Global
+ // Offset Table, TLS record).
+ RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
+ BRCOND, // Conditional branch instruction; "b.cond".
+ CSEL,
+ FCSEL, // Conditional move instruction.
+ CSINV, // Conditional select invert.
+ CSNEG, // Conditional select negate.
+ CSINC, // Conditional select increment.
+
+ // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
+ // ELF.
+ THREAD_POINTER,
+ ADC,
+ SBC, // adc, sbc instructions
+
+ // Arithmetic instructions which write flags.
+ ADDS,
+ SUBS,
+ ADCS,
+ SBCS,
+ ANDS,
+
+ // Conditional compares. Operands: left,right,falsecc,cc,flags
+ CCMP,
+ CCMN,
+ FCCMP,
+
+ // Floating point comparison
+ FCMP,
+
+ // Scalar extract
+ EXTR,
+
+ // Scalar-to-vector duplication
+ DUP,
+ DUPLANE8,
+ DUPLANE16,
+ DUPLANE32,
+ DUPLANE64,
+
+ // Vector immediate moves
+ MOVI,
+ MOVIshift,
+ MOVIedit,
+ MOVImsl,
+ FMOV,
+ MVNIshift,
+ MVNImsl,
+
+ // Vector immediate ops
+ BICi,
+ ORRi,
+
+ // Vector bit select: similar to ISD::VSELECT but not all bits within an
+ // element must be identical.
+ BSL,
+
+ // Vector arithmetic negation
+ NEG,
+
+ // Vector shuffles
+ ZIP1,
+ ZIP2,
+ UZP1,
+ UZP2,
+ TRN1,
+ TRN2,
+ REV16,
+ REV32,
+ REV64,
+ EXT,
+
+ // Vector shift by scalar
+ VSHL,
+ VLSHR,
+ VASHR,
+
+ // Vector shift by scalar (again)
+ SQSHL_I,
+ UQSHL_I,
+ SQSHLU_I,
+ SRSHR_I,
+ URSHR_I,
+
+ // Vector comparisons
+ CMEQ,
+ CMGE,
+ CMGT,
+ CMHI,
+ CMHS,
+ FCMEQ,
+ FCMGE,
+ FCMGT,
+
+ // Vector zero comparisons
+ CMEQz,
+ CMGEz,
+ CMGTz,
+ CMLEz,
+ CMLTz,
+ FCMEQz,
+ FCMGEz,
+ FCMGTz,
+ FCMLEz,
+ FCMLTz,
+
+ // Vector across-lanes addition
+ // Only the lower result lane is defined.
+ SADDV,
+ UADDV,
+
+ // Vector across-lanes min/max
+ // Only the lower result lane is defined.
+ SMINV,
+ UMINV,
+ SMAXV,
+ UMAXV,
+
+ // Vector bitwise negation
+ NOT,
+
+ // Vector bitwise selection
+ BIT,
+
+ // Compare-and-branch
+ CBZ,
+ CBNZ,
+ TBZ,
+ TBNZ,
+
+ // Tail calls
+ TC_RETURN,
+
+ // Custom prefetch handling
+ PREFETCH,
+
+ // {s|u}int to FP within a FP register.
+ SITOF,
+ UITOF,
+
+ /// Natural vector cast. ISD::BITCAST is not natural in the big-endian
+ /// world w.r.t vectors; which causes additional REV instructions to be
+ /// generated to compensate for the byte-swapping. But sometimes we do
+ /// need to re-interpret the data in SIMD vector registers in big-endian
+ /// mode without emitting such REV instructions.
+ NVCAST,
+
+ SMULL,
+ UMULL,
+
+ // Reciprocal estimates and steps.
+ FRECPE, FRECPS,
+ FRSQRTE, FRSQRTS,
+
+ // NEON Load/Store with post-increment base updates
+ // The explicit initializer below restarts the numbering at
+ // ISD::FIRST_TARGET_MEMORY_OPCODE, so every enumerator from here to the end
+ // of the list is numbered from that value upwards.
+ LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LD3post,
+ LD4post,
+ ST2post,
+ ST3post,
+ ST4post,
+ LD1x2post,
+ LD1x3post,
+ LD1x4post,
+ ST1x2post,
+ ST1x3post,
+ ST1x4post,
+ LD1DUPpost,
+ LD2DUPpost,
+ LD3DUPpost,
+ LD4DUPpost,
+ LD1LANEpost,
+ LD2LANEpost,
+ LD3LANEpost,
+ LD4LANEpost,
+ ST2LANEpost,
+ ST3LANEpost,
+ ST4LANEpost
+ };
+
+ } // end namespace AArch64ISD
+
+ namespace {
+
+ /// Report whether the node \p N is assumed to zero the upper half of its
+ /// 64-bit register when it defines a 32-bit result.
+ ///
+ /// Any instruction that defines a 32-bit result zeros out the high half of the
+ /// register, with three exceptions that make this return false: a truncate
+ /// (which can be lowered to EXTRACT_SUBREG), an EXTRACT_SUBREG itself, and a
+ /// CopyFromReg (which may be copying from a truncate).
+ // FIXME: X86 also checks for CMOV here. Do we need something similar?
+ static inline bool isDef32(const SDNode &N) {
+   const unsigned Opc = N.getOpcode();
+   const bool MayKeepHighBits = Opc == ISD::TRUNCATE ||
+                                Opc == TargetOpcode::EXTRACT_SUBREG ||
+                                Opc == ISD::CopyFromReg;
+   return !MayKeepHighBits;
+ }
+
+ } // end anonymous namespace
+
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+ /// AArch64 implementation of the TargetLowering interface. Most members are
+ /// only declared here; the few inline ones return fixed answers or simple
+ /// checks.
+ class AArch64TargetLowering : public TargetLowering {
+ public:
+ explicit AArch64TargetLowering(const TargetMachine &TM,
+ const AArch64Subtarget &STI);
+
+ /// Selects the correct CCAssignFn for a given CallingConvention value.
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+
+ /// Selects the correct CCAssignFn for a given CallingConvention value.
+ CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const;
+
+ /// Determine which of the bits specified in Mask are known to be either zero
+ /// or one and return them in the KnownZero/KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+ APInt &KnownOne, const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
+
+ /// Returns true if the target allows unaligned memory accesses of the
+ /// specified type.
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+ unsigned Align = 1,
+ bool *Fast = nullptr) const override;
+
+ /// Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ /// Returns true if a cast between SrcAS and DestAS is a noop.
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Addrspacecasts are always noops.
+ return true;
+ }
+
+ /// This method returns a target specific FastISel object, or null if the
+ /// target does not support "fast" ISel.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ /// Return true if the given shuffle mask can be codegen'd directly, or if it
+ /// should be stack expanded.
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
+
+ /// Return the ISD::SETCC ValueType.
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
+
+ MachineBasicBlock *EmitF128CSEL(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ unsigned Intrinsic) const override;
+
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool isProfitableToHoist(Instruction *I) const override;
+
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
+
+ /// The largest interleave factor reported as supported is 4.
+ unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+ bool lowerInterleavedLoad(LoadInst *LI,
+ ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices,
+ unsigned Factor) const override;
+ bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const override;
+
+ bool isLegalAddImmediate(int64_t) const override;
+ bool isLegalICmpImmediate(int64_t) const override;
+
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ /// Return true if the addressing mode represented by AM is legal for this
+ /// target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ /// \brief Return the cost of the scaling factor used in the addressing
+ /// mode represented by AM for this target, for a load/store
+ /// of the specified type.
+ /// If the AM is supported, the return value must be >= 0.
+ /// If the AM is not supported, it returns a negative value.
+ int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ /// Return true if an FMA operation is faster than a pair of fmul and fadd
+ /// instructions. fmuladd intrinsics will be expanded to FMAs when this method
+ /// returns true, otherwise fmuladd is expanded to fmul + fadd.
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+ /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask.
+ bool isDesirableToCommuteWithShift(const SDNode *N) const override;
+
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) const override;
+ Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+ Value *Addr, AtomicOrdering Ord) const override;
+
+ void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+ bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+ bool useLoadStackGuardNode() const override;
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const override;
+
+ /// If the target has a standard location for the stack protector cookie,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+
+ /// If the target has a standard location for the unsafe stack pointer,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ // FIXME: This is a guess. Has this been defined yet?
+ return AArch64::X0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ // FIXME: This is a guess. Has this been defined yet?
+ return AArch64::X1;
+ }
+
+ bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
+ /// Always reports cttz as cheap to speculate.
+ bool isCheapToSpeculateCttz() const override {
+ return true;
+ }
+
+ /// Always reports ctlz as cheap to speculate.
+ bool isCheapToSpeculateCtlz() const override {
+ return true;
+ }
+
+ /// Always true; the 'bics' instruction is cited as the reason.
+ bool hasAndNotCompare(SDValue) const override {
+ // 'bics'
+ return true;
+ }
+
+ /// True only for the scalar types f32 and f64.
+ bool hasBitPreservingFPLogic(EVT VT) const override {
+ // FIXME: Is this always true? It should be true for vectors at least.
+ return VT == MVT::f32 || VT == MVT::f64;
+ }
+
+ /// Split CSR handling is only offered for functions that use the
+ /// CXX_FAST_TLS calling convention and are marked nounwind.
+ bool supportSplitCSR(MachineFunction *MF) const override {
+ return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+ }
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
+ /// Always reports swifterror as supported.
+ bool supportSwiftError() const override {
+ return true;
+ }
+
+ private:
+ bool isExtFreeImpl(const Instruction *Ext) const override;
+
+ /// Keep a pointer to the AArch64Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const AArch64Subtarget *Subtarget;
+
+ void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
+ void addDRTypeForNEON(MVT VT);
+ void addQRTypeForNEON(MVT VT);
+
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerCall(CallLoweringInfo & /*CLI*/,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const;
+
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+
+ bool isEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+
+ /// Finds the incoming stack arguments which overlap the given fixed stack
+ /// object and incorporates their load into the current chain. This prevents
+ /// an upcoming store from clobbering the stack argument before it's used.
+ SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
+ MachineFrameInfo &MFI, int ClobberedFI) const;
+
+ bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
+
+ void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL,
+ SDValue &Chain) const;
+
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ // Per-operation lowering helpers. Only declared here; how they are
+ // dispatched is not visible from this header.
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
+ SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
+ SDValue TVal, SDValue FVal, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
+ RTLIB::Libcall Call) const;
+ SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ std::vector<SDNode *> *Created) const override;
+ SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &ExtraSteps, bool &UseOneConst,
+ bool Reciprocal) const override;
+ SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &ExtraSteps) const override;
+ unsigned combineRepeatedFPDivisors() const override;
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
+
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ /// Maps the "Q" memory constraint code to InlineAsm::Constraint_Q; every
+ /// other code is handed to the TargetLowering base implementation.
+ unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "Q")
+ return InlineAsm::Constraint_Q;
+ // FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are
+ // followed by llvm_unreachable so we'll leave them unimplemented in
+ // the backend for now.
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM, bool &IsInc,
+ SelectionDAG &DAG) const;
+ bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+ SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
+ CallingConv::ID CallConv,
+ bool isVarArg) const override;
+
+ bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
+ };
+
+ namespace AArch64 {
+ /// Free-function factory returning a FastISel object for the given function
+ /// lowering info and library info. Only declared here.
+ // NOTE(review): presumably what AArch64TargetLowering::createFastISel
+ // forwards to -- confirm against the definition.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+ } // end namespace AArch64
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
new file mode 100644
index 000000000000..867074c3c374
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -0,0 +1,404 @@
+//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Atomic operand code-gen constructs.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------
+// Atomic fences
+//===----------------------------------
+def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
+def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
+
+//===----------------------------------
+// Atomic loads
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// An atomic load operation that actually needs acquire semantics.
+class acquiring_load<PatFrag base>
+ : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return isAcquireOrStronger(Ordering);
+}]>;
+
+// An atomic load operation that does not need either acquire or release
+// semantics.
+class relaxed_load<PatFrag base>
+ : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return !isAcquireOrStronger(Ordering);
+}]>;
+
+// 8-bit loads: LDARB when acquire ordering is needed, else any LDRB/LDURB form.
+def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend8:$extend)),
+ (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend8:$extend)),
+ (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>;
+def : Pat<(relaxed_load<atomic_load_8> (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset)),
+ (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8>
+ (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit loads
+def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend)),
+ (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend)),
+ (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset)),
+ (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_load<atomic_load_16>
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+ (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit loads
+def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend)),
+ (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend)),
+ (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (am_indexed32 GPR64sp:$Rn,
+ uimm12s4:$offset)),
+ (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_load<atomic_load_32>
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+ (LDURWi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit loads
+def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend)),
+ (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend)),
+ (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (am_indexed64 GPR64sp:$Rn,
+ uimm12s8:$offset)),
+ (LDRXui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_64>
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (LDURXi GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Atomic stores
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A store operation that actually needs release semantics.
+class releasing_store<PatFrag base>
+ : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ assert(Ordering != AtomicOrdering::AcquireRelease &&
+ "unexpected store ordering");
+ return isReleaseOrStronger(Ordering);
+}]>;
+
+// An atomic store operation that doesn't actually need to be atomic on AArch64.
+class relaxed_store<PatFrag base>
+ : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return !isReleaseOrStronger(Ordering);
+}]>;
+
+// 8-bit stores
+def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val),
+ (STLRB GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_8>
+ (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+ GPR32:$val),
+ (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+ (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+ GPR32:$val),
+ (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+ (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val),
+ (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_store<atomic_store_8>
+ (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+ (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit stores
+def : Pat<(releasing_store<atomic_store_16> GPR64sp:$ptr, GPR32:$val),
+ (STLRH GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend),
+ GPR32:$val),
+ (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend),
+ GPR32:$val),
+ (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16>
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val),
+ (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_store<atomic_store_16>
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+ (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit stores
+def : Pat<(releasing_store<atomic_store_32> GPR64sp:$ptr, GPR32:$val),
+ (STLRW GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend),
+ GPR32:$val),
+ (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend),
+ GPR32:$val),
+ (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32>
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val),
+ (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_store<atomic_store_32>
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+ (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit stores: STLRX when release ordering is needed, else any STR/STUR form.
+def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
+ (STLRX GPR64:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend),
+ GPR64:$val),
+ (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend),
+ GPR64:$val),
+ (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64>
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val),
+ (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_store<atomic_store_64>
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val),
+ (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Low-level exclusive operations
+//===----------------------------------
+
+// Load-exclusives.
+
+def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldxr_1 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_2 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_4 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff),
+ (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff),
+ (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff),
+ (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+
+// Load-acquire-exclusives.
+
+def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldaxr_1 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_2 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_4 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff),
+ (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff),
+ (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff),
+ (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+
+// Store-exclusives.
+
+def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr),
+ (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr),
+ (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr),
+ (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr),
+ (STXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+ (STXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+ (STXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr),
+ (STXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+ (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+ (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+ (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+// Store-release-exclusives.
+
+def stlxr_1 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stlxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stlxr_2 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stlxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stlxr_4 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stlxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stlxr_8 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stlxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr),
+ (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr),
+ (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr),
+ (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr),
+ (STLXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+ (STLXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+ (STLXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr),
+ (STLXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+ (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+ (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+ (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+
+// And clear exclusive.
+
+def : Pat<(int_aarch64_clrex), (CLREX 0xf)>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// The fast register allocator used during -O0 inserts spills to cover any VRegs
+// live across basic block boundaries. When this happens between an LDXR and an
+// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to
+// fail.
+
+// Unfortunately, this means we have to have an alternative (expanded
+// post-regalloc) path for -O0 compilations. Fortunately this path can be
+// significantly more naive than the standard expansion: we conservatively
+// assume seq_cst, strong cmpxchg and omit clrex on failure.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $scratch",
+ mayLoad = 1, mayStore = 1 in {
+def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$scratch),
+ (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+ Sched<[WriteAtomic]>;
+
+def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$scratch),
+ (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+ Sched<[WriteAtomic]>;
+
+def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$scratch),
+ (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+ Sched<[WriteAtomic]>;
+
+def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch),
+ (ins GPR64:$addr, GPR64:$desired, GPR64:$new), []>,
+ Sched<[WriteAtomic]>;
+}
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch",
+ mayLoad = 1, mayStore = 1 in
+def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch),
+ (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
+ GPR64:$newLo, GPR64:$newHi), []>,
+ Sched<[WriteAtomic]>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
new file mode 100644
index 000000000000..cefdf51b50d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -0,0 +1,9516 @@
+//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe AArch64 instructions format here
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+ bits<2> Value = val;
+}
+
+def PseudoFrm : Format<0>;
+def NormalFrm : Format<1>; // Do we need any others?
+
+// AArch64 Instruction Format
+class AArch64Inst<Format f, string cstr> : Instruction {
+ field bits<32> Inst; // Instruction encoding.
+ // Mask of bits that cause an encoding to be UNPREDICTABLE.
+ // If a bit is set, then if the corresponding bit in the
+ // target encoding differs from its value in the "Inst" field,
+ // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
+ field bits<32> Unpredictable = 0;
+ // SoftFail is the generic name for this field, but we alias it so
+ // as to make it more obvious what it means in ARM-land.
+ field bits<32> SoftFail = Unpredictable;
+ let Namespace = "AArch64";
+ Format F = f;
+ bits<2> Form = F.Value;
+ let Pattern = [];
+ let Constraints = cstr;
+}
+
+// Pseudo instructions (don't have encoding information)
+class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
+ : AArch64Inst<PseudoFrm, cstr> {
+ dag OutOperandList = oops;
+ dag InOperandList = iops;
+ let Pattern = pattern;
+ let isCodeGenOnly = 1;
+}
+
+// Real instructions (have encoding information)
+class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> {
+ let Pattern = pattern;
+ let Size = 4;
+}
+
+// Normal instructions
+class I<dag oops, dag iops, string asm, string operands, string cstr,
+ list<dag> pattern>
+ : EncodedI<cstr, pattern> {
+ dag OutOperandList = oops;
+ dag InOperandList = iops;
+ let AsmString = !strconcat(asm, operands);
+}
+
+class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag<dag res> : PatFrag<(ops node:$LHS), res>;
+
+// Helper fragment for an extract of the high portion of a 128-bit vector.
+def extract_high_v16i8 :
+ UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
+def extract_high_v8i16 :
+ UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
+def extract_high_v4i32 :
+ UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
+def extract_high_v2i64 :
+ UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
+
+//===----------------------------------------------------------------------===//
+// Asm Operand Classes.
+//
+
+// Shifter operand for arithmetic shifted encodings.
+def ShifterOperand : AsmOperandClass {
+ let Name = "Shifter";
+}
+
+// Shifter operand for mov immediate encodings.
+def MovImm32ShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "MovImm32Shifter";
+ let RenderMethod = "addShifterOperands";
+ let DiagnosticType = "InvalidMovImm32Shift";
+}
+def MovImm64ShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "MovImm64Shifter";
+ let RenderMethod = "addShifterOperands";
+ let DiagnosticType = "InvalidMovImm64Shift";
+}
+
+// Shifter operand for arithmetic register shifted encodings.
+class ArithmeticShifterOperand<int width> : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "ArithmeticShifter" # width;
+ let PredicateMethod = "isArithmeticShifter<" # width # ">";
+ let RenderMethod = "addShifterOperands";
+ let DiagnosticType = "AddSubRegShift" # width;
+}
+
+def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>;
+def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>;
+
+// Shifter operand for logical register shifted encodings.
+class LogicalShifterOperand<int width> : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "LogicalShifter" # width;
+ let PredicateMethod = "isLogicalShifter<" # width # ">";
+ let RenderMethod = "addShifterOperands";
+ let DiagnosticType = "AddSubRegShift" # width;
+}
+
+def LogicalShifterOperand32 : LogicalShifterOperand<32>;
+def LogicalShifterOperand64 : LogicalShifterOperand<64>;
+
+// Shifter operand for logical vector 128/64-bit shifted encodings.
+def LogicalVecShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "LogicalVecShifter";
+ let RenderMethod = "addShifterOperands";
+}
+def LogicalVecHalfWordShifterOperand : AsmOperandClass {
+ let SuperClasses = [LogicalVecShifterOperand];
+ let Name = "LogicalVecHalfWordShifter";
+ let RenderMethod = "addShifterOperands";
+}
+
+// The "MSL" shifter on the vector MOVI instruction.
+def MoveVecShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "MoveVecShifter";
+ let RenderMethod = "addShifterOperands";
+}
+
+// Extend operand for arithmetic encodings.
+def ExtendOperand : AsmOperandClass {
+ let Name = "Extend";
+ let DiagnosticType = "AddSubRegExtendLarge";
+}
+def ExtendOperand64 : AsmOperandClass {
+ let SuperClasses = [ExtendOperand];
+ let Name = "Extend64";
+ let DiagnosticType = "AddSubRegExtendSmall";
+}
+// 'extend' that's a lsl of a 64-bit register.
+def ExtendOperandLSL64 : AsmOperandClass {
+ let SuperClasses = [ExtendOperand];
+ let Name = "ExtendLSL64";
+ let RenderMethod = "addExtend64Operands";
+ let DiagnosticType = "AddSubRegExtendLarge";
+}
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+ let Name = "FPImm";
+ let ParserMethod = "tryParseFPImm";
+ let DiagnosticType = "InvalidFPImm";
+}
+
+def CondCode : AsmOperandClass {
+ let Name = "CondCode";
+ let DiagnosticType = "InvalidCondCode";
+}
+
+// A 32-bit register parsed as 64-bit
+def GPR32as64Operand : AsmOperandClass {
+ let Name = "GPR32as64";
+}
+def GPR32as64 : RegisterOperand<GPR32> {
+ let ParserMatchClass = GPR32as64Operand;
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
+
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// ADR[P] instruction labels.
+def AdrpOperand : AsmOperandClass {
+ let Name = "AdrpLabel";
+ let ParserMethod = "tryParseAdrpLabel";
+ let DiagnosticType = "InvalidLabel";
+}
+def adrplabel : Operand<i64> {
+ let EncoderMethod = "getAdrLabelOpValue";
+ let PrintMethod = "printAdrpLabel";
+ let ParserMatchClass = AdrpOperand;
+}
+
+def AdrOperand : AsmOperandClass {
+ let Name = "AdrLabel";
+ let ParserMethod = "tryParseAdrLabel";
+ let DiagnosticType = "InvalidLabel";
+}
+def adrlabel : Operand<i64> {
+ let EncoderMethod = "getAdrLabelOpValue";
+ let ParserMatchClass = AdrOperand;
+}
+
+// simm9 predicate - True if the immediate is in the range [-256, 255].
+def SImm9Operand : AsmOperandClass {
+ let Name = "SImm9";
+ let DiagnosticType = "InvalidMemoryIndexedSImm9";
+}
+def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
+ let ParserMatchClass = SImm9Operand;
+}
+
+// simm7sN predicate - True if the immediate is a multiple of N in the range
+// [-64 * N, 63 * N].
+class SImm7Scaled<int Scale> : AsmOperandClass {
+ let Name = "SImm7s" # Scale;
+ let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7";
+}
+
+def SImm7s4Operand : SImm7Scaled<4>;
+def SImm7s8Operand : SImm7Scaled<8>;
+def SImm7s16Operand : SImm7Scaled<16>;
+
+def simm7s4 : Operand<i32> {
+ let ParserMatchClass = SImm7s4Operand;
+ let PrintMethod = "printImmScale<4>";
+}
+
+def simm7s8 : Operand<i32> {
+ let ParserMatchClass = SImm7s8Operand;
+ let PrintMethod = "printImmScale<8>";
+}
+
+def simm7s16 : Operand<i32> {
+ let ParserMatchClass = SImm7s16Operand;
+ let PrintMethod = "printImmScale<16>";
+}
+
+def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
+def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
+def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
+def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
+def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
+
+class AsmImmRange<int Low, int High> : AsmOperandClass {
+ let Name = "Imm" # Low # "_" # High;
+ let DiagnosticType = "InvalidImm" # Low # "_" # High;
+}
+
+def Imm1_8Operand : AsmImmRange<1, 8>;
+def Imm1_16Operand : AsmImmRange<1, 16>;
+def Imm1_32Operand : AsmImmRange<1, 32>;
+def Imm1_64Operand : AsmImmRange<1, 64>;
+
+def MovZSymbolG3AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG3";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g3 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG3AsmOperand;
+}
+
+def MovZSymbolG2AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG2";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g2 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG2AsmOperand;
+}
+
+def MovZSymbolG1AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG1";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g1 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG1AsmOperand;
+}
+
+def MovZSymbolG0AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG0";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g0 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG0AsmOperand;
+}
+
+def MovKSymbolG3AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG3";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g3 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG3AsmOperand;
+}
+
+def MovKSymbolG2AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG2";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g2 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG2AsmOperand;
+}
+
+def MovKSymbolG1AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG1";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g1 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG1AsmOperand;
+}
+
+def MovKSymbolG0AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG0";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g0 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG0AsmOperand;
+}
+
+class fixedpoint_i32<ValueType FloatVT>
+ : Operand<FloatVT>,
+ ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm, ld]> {
+ let EncoderMethod = "getFixedPointScaleOpValue";
+ let DecoderMethod = "DecodeFixedPointScaleImm32";
+ let ParserMatchClass = Imm1_32Operand;
+}
+
+class fixedpoint_i64<ValueType FloatVT>
+ : Operand<FloatVT>,
+ ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm, ld]> {
+ let EncoderMethod = "getFixedPointScaleOpValue";
+ let DecoderMethod = "DecodeFixedPointScaleImm64";
+ let ParserMatchClass = Imm1_64Operand;
+}
+
+def fixedpoint_f16_i32 : fixedpoint_i32<f16>;
+def fixedpoint_f32_i32 : fixedpoint_i32<f32>;
+def fixedpoint_f64_i32 : fixedpoint_i32<f64>;
+
+def fixedpoint_f16_i64 : fixedpoint_i64<f16>;
+def fixedpoint_f32_i64 : fixedpoint_i64<f32>;
+def fixedpoint_f64_i64 : fixedpoint_i64<f64>;
+
+def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+ let EncoderMethod = "getVecShiftR8OpValue";
+ let DecoderMethod = "DecodeVecShiftR8Imm";
+ let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+ let EncoderMethod = "getVecShiftR16OpValue";
+ let DecoderMethod = "DecodeVecShiftR16Imm";
+ let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+ let EncoderMethod = "getVecShiftR16OpValue";
+ let DecoderMethod = "DecodeVecShiftR16ImmNarrow";
+ let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+ let EncoderMethod = "getVecShiftR32OpValue";
+ let DecoderMethod = "DecodeVecShiftR32Imm";
+ let ParserMatchClass = Imm1_32Operand;
+}
+def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+ let EncoderMethod = "getVecShiftR32OpValue";
+ let DecoderMethod = "DecodeVecShiftR32ImmNarrow";
+ let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
+}]> {
+ let EncoderMethod = "getVecShiftR64OpValue";
+ let DecoderMethod = "DecodeVecShiftR64Imm";
+ let ParserMatchClass = Imm1_64Operand;
+}
+def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+ let EncoderMethod = "getVecShiftR64OpValue";
+ let DecoderMethod = "DecodeVecShiftR64ImmNarrow";
+ let ParserMatchClass = Imm1_32Operand;
+}
+
+def Imm0_1Operand : AsmImmRange<0, 1>;
+def Imm0_7Operand : AsmImmRange<0, 7>;
+def Imm0_15Operand : AsmImmRange<0, 15>;
+def Imm0_31Operand : AsmImmRange<0, 31>;
+def Imm0_63Operand : AsmImmRange<0, 63>;
+
+def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 8);
+}]> {
+ let EncoderMethod = "getVecShiftL8OpValue";
+ let DecoderMethod = "DecodeVecShiftL8Imm";
+ let ParserMatchClass = Imm0_7Operand;
+}
+def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 16);
+}]> {
+ let EncoderMethod = "getVecShiftL16OpValue";
+ let DecoderMethod = "DecodeVecShiftL16Imm";
+ let ParserMatchClass = Imm0_15Operand;
+}
+def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 32);
+}]> {
+ let EncoderMethod = "getVecShiftL32OpValue";
+ let DecoderMethod = "DecodeVecShiftL32Imm";
+ let ParserMatchClass = Imm0_31Operand;
+}
+def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 64);
+}]> {
+ let EncoderMethod = "getVecShiftL64OpValue";
+ let DecoderMethod = "DecodeVecShiftL64Imm";
+ let ParserMatchClass = Imm0_63Operand;
+}
+
+
+// Crazy immediate formats used by 32-bit and 64-bit logical immediate
+// instructions for splatting repeating bit patterns across the immediate.
+def logical_imm32_XFORM : SDNodeXForm<imm, [{
+ uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+}]>;
+def logical_imm64_XFORM : SDNodeXForm<imm, [{
+ uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+}]>;
+
+let DiagnosticType = "LogicalSecondSource" in { // one shared diagnostic for the four match classes below
+ def LogicalImm32Operand : AsmOperandClass { // ParserMatchClass of logical_imm32
+ let Name = "LogicalImm32";
+ }
+ def LogicalImm64Operand : AsmOperandClass { // ParserMatchClass of logical_imm64
+ let Name = "LogicalImm64";
+ }
+ def LogicalImm32NotOperand : AsmOperandClass { // ParserMatchClass of logical_imm32_not
+ let Name = "LogicalImm32Not";
+ }
+ def LogicalImm64NotOperand : AsmOperandClass { // ParserMatchClass of logical_imm64_not
+ let Name = "LogicalImm64Not";
+ }
+}
+// Logical immediate operands. The pattern leaf accepts a constant when
+// AArch64_AM::isLogicalImmediate holds for the given register size, and the
+// attached XFORM re-encodes it for the instruction.
+def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
+ const uint64_t Value = N->getZExtValue();
+ return AArch64_AM::isLogicalImmediate(Value, 32);
+}], logical_imm32_XFORM> {
+ let ParserMatchClass = LogicalImm32Operand;
+ let PrintMethod = "printLogicalImm32";
+}
+def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
+ const uint64_t Value = N->getZExtValue();
+ return AArch64_AM::isLogicalImmediate(Value, 64);
+}], logical_imm64_XFORM> {
+ let ParserMatchClass = LogicalImm64Operand;
+ let PrintMethod = "printLogicalImm64";
+}
+// Operands carrying only a parser match class: no pattern leaf and no
+// dedicated print method.
+def logical_imm32_not : Operand<i32> {
+ let ParserMatchClass = LogicalImm32NotOperand;
+}
+def logical_imm64_not : Operand<i64> {
+ let ParserMatchClass = LogicalImm64NotOperand;
+}
+
+// imm0_65535 - i32 immediate in the range [0,65535]; printed with printImmHex.
+def Imm0_65535Operand : AsmImmRange<0, 65535>;
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) <= 65535;
+}]> {
+ let PrintMethod = "printImmHex";
+ let ParserMatchClass = Imm0_65535Operand;
+}
+
+// imm0_255 - i32 immediate in the range [0,255]. Its match class is a plain
+// AsmOperandClass rather than an AsmImmRange instance.
+def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) <= 255;
+}]> {
+ let PrintMethod = "printImm";
+ let ParserMatchClass = Imm0_255Operand;
+}
+
+// imm0_127 - i32 immediate in the range [0,127].
+def Imm0_127Operand : AsmImmRange<0, 127>;
+def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) <= 127;
+}]> {
+ let PrintMethod = "printImm";
+ let ParserMatchClass = Imm0_127Operand;
+}
+
+// NOTE: The imm0_N operands below are typed i64 because i64 is the size used
+// for all shift-amounts. The imm32_0_N variants are the i32-typed
+// counterparts.
+
+// imm0_63 - i64 immediate in the range [0,63].
+def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 63;
+}]> {
+ let ParserMatchClass = Imm0_63Operand;
+}
+
+// imm0_31 - i64 immediate in the range [0,31].
+def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 31;
+}]> {
+ let ParserMatchClass = Imm0_31Operand;
+}
+
+// imm32_0_31 - i32 immediate in the range [0,31].
+def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint64_t)Imm) <= 31;
+}]> {
+ let ParserMatchClass = Imm0_31Operand;
+}
+
+// imm0_1 - i64 immediate in the range [0,1].
+def imm0_1 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 1;
+}]> {
+ let ParserMatchClass = Imm0_1Operand;
+}
+
+// imm0_15 - i64 immediate in the range [0,15].
+def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 15;
+}]> {
+ let ParserMatchClass = Imm0_15Operand;
+}
+
+// imm0_7 - i64 immediate in the range [0,7].
+def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 7;
+}]> {
+ let ParserMatchClass = Imm0_7Operand;
+}
+
+// imm32_0_15 - i32 immediate in the range [0,15].
+def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) <= 15;
+}]> {
+ let ParserMatchClass = Imm0_15Operand;
+}
+
+// An arithmetic shifter operand, packed into a single immediate:
+// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
+// {5-0} - imm6
+class arith_shift<ValueType Ty, int width> : Operand<Ty> { // width selects the match class by name
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = !cast<AsmOperandClass>( // "ArithmeticShifterOperand32" or "...64" for the defs below
+ "ArithmeticShifterOperand" # width);
+}
+
+def arith_shift32 : arith_shift<i32, 32>;
+def arith_shift64 : arith_shift<i64, 64>;
+
+class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width> // register + shifter pair, matched by SelectArithShiftedRegister
+ : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
+ let PrintMethod = "printShiftedRegister";
+ let MIOperandInfo = (ops regclass, !cast<Operand>("arith_shift" # width)); // two sub-operands: the register, then arith_shift<width>
+}
+
+def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
+def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
+
+// A logical shifter operand (unlike the arithmetic shifter, ror is allowed):
+// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
+// {5-0} - imm6
+class logical_shift<int width> : Operand<i32> { // always i32; only the match class depends on width
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = !cast<AsmOperandClass>( // "LogicalShifterOperand32" or "...64" for the defs below
+ "LogicalShifterOperand" # width);
+}
+
+def logical_shift32 : logical_shift<32>;
+def logical_shift64 : logical_shift<64>;
+
+class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop> // register + shifter pair, matched by SelectLogicalShiftedRegister
+ : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
+ let PrintMethod = "printShiftedRegister";
+ let MIOperandInfo = (ops regclass, shiftop); // two sub-operands: the register, then the shifter operand passed in
+}
+
+def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
+def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
+
+// A logical vector shifter operand:
+// {7-6} - shift type: 00 = lsl
+// {5-0} - imm6: #0, #8, #16, or #24
+def logical_vec_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let EncoderMethod = "getVecShifterOpValue"; // encoder shared with logical_vec_hw_shift
+ let ParserMatchClass = LogicalVecShifterOperand;
+}
+
+// A logical vector half-word shifter operand:
+// {7-6} - shift type: 00 = lsl
+// {5-0} - imm6: #0 or #8
+def logical_vec_hw_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let EncoderMethod = "getVecShifterOpValue"; // encoder shared with logical_vec_shift
+ let ParserMatchClass = LogicalVecHalfWordShifterOperand;
+}
+
+// A vector move shifter operand:
+// {0} - imm1: #8 or #16
+def move_vec_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let EncoderMethod = "getMoveVecShifterOpValue"; // dedicated encoder, unlike the two operands above
+ let ParserMatchClass = MoveVecShifterOperand;
+}
+
+let DiagnosticType = "AddSubSecondSource" in { // shared diagnostic for both match classes
+ def AddSubImmOperand : AsmOperandClass {
+ let Name = "AddSubImm";
+ let ParserMethod = "tryParseAddSubImm";
+ }
+ def AddSubImmNegOperand : AsmOperandClass { // same parser as AddSubImmOperand, different class name
+ let Name = "AddSubImmNeg";
+ let ParserMethod = "tryParseAddSubImm";
+ }
+}
+// An ADD/SUB immediate shifter operand:
+// second operand:
+// {7-6} - shift type: 00 = lsl
+// {5-0} - imm6: #0 or #12
+class addsub_shifted_imm<ValueType Ty>
+ : Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
+ let PrintMethod = "printAddSubImm";
+ let EncoderMethod = "getAddSubImmOpValue";
+ let ParserMatchClass = AddSubImmOperand;
+ let MIOperandInfo = (ops i32imm, i32imm); // two i32 sub-operands: the immediate and the shifter described above
+}
+
+class addsub_shifted_imm_neg<ValueType Ty> // no ComplexPattern or PrintMethod here; NOTE(review): presumably assembler-only - confirm at use sites
+ : Operand<Ty> {
+ let EncoderMethod = "getAddSubImmOpValue";
+ let ParserMatchClass = AddSubImmNegOperand;
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
+def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
+def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>;
+def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>;
+
+class neg_addsub_shifted_imm<ValueType Ty> // same as addsub_shifted_imm except that selection goes through SelectNegArithImmed
+ : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
+ let PrintMethod = "printAddSubImm";
+ let EncoderMethod = "getAddSubImmOpValue";
+ let ParserMatchClass = AddSubImmOperand;
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
+def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
+
+// An extend operand, packed into a single immediate:
+// {5-3} - extend type
+// {2-0} - imm3
+def arith_extend : Operand<i32> {
+ let PrintMethod = "printArithExtend";
+ let ParserMatchClass = ExtendOperand;
+}
+def arith_extend64 : Operand<i32> { // differs from arith_extend only in its match class
+ let PrintMethod = "printArithExtend";
+ let ParserMatchClass = ExtendOperand64;
+}
+
+// 'extend' that's a lsl of a 64-bit register.
+def arith_extendlsl64 : Operand<i32> {
+ let PrintMethod = "printArithExtend";
+ let ParserMatchClass = ExtendOperandLSL64;
+}
+
+class arith_extended_reg32<ValueType Ty> : Operand<Ty>, // GPR32 register followed by an arith_extend sub-operand
+ ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+ let PrintMethod = "printExtendedRegister";
+ let MIOperandInfo = (ops GPR32, arith_extend);
+}
+
+class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>, // GPR32 register followed by an arith_extend64 sub-operand
+ ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+ let PrintMethod = "printExtendedRegister";
+ let MIOperandInfo = (ops GPR32, arith_extend64);
+}
+
+// Floating-point immediates. For each width the pattern leaf accepts a
+// constant when AArch64_AM::getFP<N>Imm does not return -1, and the transform
+// emits that encoding as an i32 target constant.
+def fpimm16 : Operand<f16>,
+ PatLeaf<(f16 fpimm), [{
+ return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat Value = N->getValueAPF();
+ uint32_t Encoding = AArch64_AM::getFP16Imm(Value);
+ return CurDAG->getTargetConstant(Encoding, SDLoc(N), MVT::i32);
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
+}
+def fpimm32 : Operand<f32>,
+ PatLeaf<(f32 fpimm), [{
+ return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat Value = N->getValueAPF();
+ uint32_t Encoding = AArch64_AM::getFP32Imm(Value);
+ return CurDAG->getTargetConstant(Encoding, SDLoc(N), MVT::i32);
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
+}
+def fpimm64 : Operand<f64>,
+ PatLeaf<(f64 fpimm), [{
+ return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat Value = N->getValueAPF();
+ uint32_t Encoding = AArch64_AM::getFP64Imm(Value);
+ return CurDAG->getTargetConstant(Encoding, SDLoc(N), MVT::i32);
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
+}
+
+def fpimm8 : Operand<i32> { // i32 operand reusing FPImmOperand / printFPImmOperand; carries no selection predicate
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>; // pattern leaf only (not an Operand): accepts a constant for which isExactlyValue(+0.0) holds
+
+// Vector lane operands.
+// AsmVectorIndex derives both the match-class name ("VectorIndex" # Suffix)
+// and the diagnostic type ("InvalidIndex" # Suffix) from one suffix string.
+class AsmVectorIndex<string Suffix> : AsmOperandClass {
+ let DiagnosticType = "InvalidIndex" # Suffix;
+ let Name = "VectorIndex" # Suffix;
+}
+def VectorIndex1Operand : AsmVectorIndex<"1">;
+def VectorIndexBOperand : AsmVectorIndex<"B">;
+def VectorIndexHOperand : AsmVectorIndex<"H">;
+def VectorIndexSOperand : AsmVectorIndex<"S">;
+def VectorIndexDOperand : AsmVectorIndex<"D">;
+
+// Lane index operands (i64). VectorIndex1 accepts exactly 1; the B/H/S/D
+// variants accept [0,15], [0,7], [0,3] and [0,1] respectively.
+def VectorIndex1 : Operand<i64>, ImmLeaf<i64, [{
+ return 1 == ((uint64_t)Imm);
+}]> {
+ let MIOperandInfo = (ops i64imm);
+ let PrintMethod = "printVectorIndex";
+ let ParserMatchClass = VectorIndex1Operand;
+}
+def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 15;
+}]> {
+ let MIOperandInfo = (ops i64imm);
+ let PrintMethod = "printVectorIndex";
+ let ParserMatchClass = VectorIndexBOperand;
+}
+def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 7;
+}]> {
+ let MIOperandInfo = (ops i64imm);
+ let PrintMethod = "printVectorIndex";
+ let ParserMatchClass = VectorIndexHOperand;
+}
+def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 3;
+}]> {
+ let MIOperandInfo = (ops i64imm);
+ let PrintMethod = "printVectorIndex";
+ let ParserMatchClass = VectorIndexSOperand;
+}
+def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 1;
+}]> {
+ let MIOperandInfo = (ops i64imm);
+ let PrintMethod = "printVectorIndex";
+ let ParserMatchClass = VectorIndexDOperand;
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+//
+// The pattern leaf accepts an f64 constant whose raw bit pattern satisfies
+// AArch64_AM::isAdvSIMDModImmType10; the transform passes the same bit pattern
+// to encodeAdvSIMDModImmType10 and returns the result as an i32 target
+// constant. (The transform used to copy the APFloat into a local that was
+// never read; that dead copy has been dropped.)
+def simdimmtype10 : Operand<i32>,
+ PatLeaf<(f64 fpimm), [{
+ return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
+ }], SDNodeXForm<fpimm, [{
+ uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+ }]>> {
+ let ParserMatchClass = SIMDImmType10Operand;
+ let PrintMethod = "printSIMDType10Operand";
+}
+
+
+//---
+// System management
+//---
+
+// Base encoding for system instruction operands.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands,
+ list<dag> pattern = []>
+ : I<oops, iops, asm, operands, "", pattern> {
+ let Inst{31-22} = 0b1101010100; // fixed bits shared by every class derived from BaseSystemI
+ let Inst{21} = L; // L bit: e.g. MRSI passes 1 and MSRI passes 0
+}
+
+// System instructions which do not have an Rt register.
+class SimpleSystemI<bit L, dag iops, string asm, string operands,
+ list<dag> pattern = []>
+ : BaseSystemI<L, (outs), iops, asm, operands, pattern> {
+ let Inst{4-0} = 0b11111; // the field RtSystemI uses for Rt is fixed to all ones here
+}
+
+// System instructions which have an Rt register.
+class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
+ : BaseSystemI<L, oops, iops, asm, operands>,
+ Sched<[WriteSys]> {
+ bits<5> Rt;
+ let Inst{4-0} = Rt; // register number occupies the low five bits
+}
+
+// Hint instructions that take both a CRm and a 3-bit immediate, supplied together as the single 7-bit $imm operand.
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity
+let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
+ class HintI<string mnemonic>
+ : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "",
+ [(int_aarch64_hint imm0_127:$imm)]>, // selected from the int_aarch64_hint intrinsic
+ Sched<[WriteHint]> {
+ bits <7> imm;
+ let Inst{20-12} = 0b000110010;
+ let Inst{11-5} = imm; // spans the bits CRmSystemI splits into CRm {11-8} and opc {7-5}
+ }
+
+// System instructions taking a single literal operand which encodes into
+// CRm. op2 differentiates the opcodes.
+def BarrierAsmOperand : AsmOperandClass {
+ let Name = "Barrier";
+ let ParserMethod = "tryParseBarrierOperand";
+}
+def barrier_op : Operand<i32> {
+ let PrintMethod = "printBarrierOption";
+ let ParserMatchClass = BarrierAsmOperand;
+}
+class CRmSystemI<Operand crmtype, bits<3> opc, string asm, // crmtype is the operand type used for $CRm
+ list<dag> pattern = []>
+ : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>,
+ Sched<[WriteBarrier]> {
+ bits<4> CRm;
+ let Inst{20-12} = 0b000110011;
+ let Inst{11-8} = CRm;
+ let Inst{7-5} = opc; // the "op2" field mentioned in the header comment
+}
+
+// MRS/MSR system instructions. These have different operand classes because
+// a different subset of registers can be accessed through each instruction.
+def MRSSystemRegisterOperand : AsmOperandClass {
+ let Name = "MRSSystemRegister";
+ let ParserMethod = "tryParseSysReg"; // same parser method as the MSR class below
+ let DiagnosticType = "MRS";
+}
+// System register operand: concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate.
+def mrs_sysreg_op : Operand<i32> {
+ let ParserMatchClass = MRSSystemRegisterOperand;
+ let DecoderMethod = "DecodeMRSSystemRegister";
+ let PrintMethod = "printMRSSystemRegister";
+}
+
+def MSRSystemRegisterOperand : AsmOperandClass {
+ let Name = "MSRSystemRegister";
+ let ParserMethod = "tryParseSysReg"; // same parser method as the MRS class above
+ let DiagnosticType = "MSR";
+}
+def msr_sysreg_op : Operand<i32> { // MSR counterpart of mrs_sysreg_op with its own decoder and printer
+ let ParserMatchClass = MSRSystemRegisterOperand;
+ let DecoderMethod = "DecodeMSRSystemRegister";
+ let PrintMethod = "printMSRSystemRegister";
+}
+
+def PSBHintOperand : AsmOperandClass {
+ let Name = "PSBHint";
+ let ParserMethod = "tryParsePSBHint";
+}
+def psbhint_op : Operand<i32> { // operand of the "psb" alias; the predicate limits which immediates the alias applies to
+ let ParserMatchClass = PSBHintOperand;
+ let PrintMethod = "printPSBHintOp";
+ let MCOperandPredicate = [{
+ // Check, if operand is valid, to fix exhaustive aliasing in disassembly.
+ // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields.
+ if (!MCOp.isImm())
+ return false;
+ return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr;
+ }];
+}
+
+class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), // L = 1; Rt is the output
+ "mrs", "\t$Rt, $systemreg"> {
+ bits<16> systemreg;
+ let Inst{20-5} = systemreg; // the 16-bit system register immediate
+}
+
+// FIXME: Some of these def NZCV, others don't. Best way to model that?
+// Explicitly modeling each of the system register as a register class
+// would do it, but feels like overkill at this point.
+class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), // L = 0; Rt is an input
+ "msr", "\t$systemreg, $Rt"> {
+ bits<16> systemreg;
+ let Inst{20-5} = systemreg; // same field placement as MRSI
+}
+
+def SystemPStateFieldWithImm0_15Operand : AsmOperandClass {
+ let Name = "SystemPStateFieldWithImm0_15";
+ let ParserMethod = "tryParseSysReg";
+}
+def pstatefield4_op : Operand<i32> { // PSTATE field operand paired with a 4-bit immediate in MSRpstateImm0_15
+ let ParserMatchClass = SystemPStateFieldWithImm0_15Operand;
+ let PrintMethod = "printSystemPStateField";
+}
+
+let Defs = [NZCV] in
+class MSRpstateImm0_15
+ : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm),
+ "msr", "\t$pstatefield, $imm">,
+ Sched<[WriteSys]> {
+ bits<6> pstatefield;
+ bits<4> imm;
+ let Inst{20-19} = 0b00;
+ let Inst{18-16} = pstatefield{5-3}; // upper half of the 6-bit field selector
+ let Inst{15-12} = 0b0100;
+ let Inst{11-8} = imm;
+ let Inst{7-5} = pstatefield{2-0}; // lower half of the 6-bit field selector
+
+ let DecoderMethod = "DecodeSystemPStateInstruction";
+ // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
+ // Fail the decoder should attempt to decode the instruction as MSRI.
+ let hasCompleteDecoder = 0;
+}
+
+def SystemPStateFieldWithImm0_1Operand : AsmOperandClass {
+ let Name = "SystemPStateFieldWithImm0_1";
+ let ParserMethod = "tryParseSysReg";
+}
+def pstatefield1_op : Operand<i32> { // PSTATE field operand paired with a 1-bit immediate in MSRpstateImm0_1
+ let ParserMatchClass = SystemPStateFieldWithImm0_1Operand;
+ let PrintMethod = "printSystemPStateField";
+}
+
+let Defs = [NZCV] in
+class MSRpstateImm0_1
+ : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm),
+ "msr", "\t$pstatefield, $imm">,
+ Sched<[WriteSys]> {
+ bits<6> pstatefield;
+ bit imm;
+ let Inst{20-19} = 0b00;
+ let Inst{18-16} = pstatefield{5-3};
+ let Inst{15-9} = 0b0100000; // bits {11-9}, part of the immediate in MSRpstateImm0_15, are fixed to zero here
+ let Inst{8} = imm;
+ let Inst{7-5} = pstatefield{2-0};
+
+ let DecoderMethod = "DecodeSystemPStateInstruction";
+ // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
+ // Fail the decoder should attempt to decode the instruction as MSRI.
+ let hasCompleteDecoder = 0;
+}
+
+// SYS and SYSL generic system instructions.
+def SysCRAsmOperand : AsmOperandClass {
+ let Name = "SysCR";
+ let ParserMethod = "tryParseSysCROperand";
+}
+
+def sys_cr_op : Operand<i32> { // operand type of the $Cn and $Cm fields below
+ let PrintMethod = "printSysCROperand";
+ let ParserMatchClass = SysCRAsmOperand;
+}
+
+class SystemXtI<bit L, string asm> // operand order: op1, Cn, Cm, op2, Rt
+ : RtSystemI<L, (outs),
+ (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt),
+ asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> {
+ bits<3> op1;
+ bits<4> Cn;
+ bits<4> Cm;
+ bits<3> op2;
+ let Inst{20-19} = 0b01;
+ let Inst{18-16} = op1;
+ let Inst{15-12} = Cn;
+ let Inst{11-8} = Cm;
+ let Inst{7-5} = op2;
+}
+
+class SystemLXtI<bit L, string asm> // same encoding as SystemXtI, but Rt comes first; note Rt is still listed under ins
+ : RtSystemI<L, (outs),
+ (ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
+ asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> {
+ bits<3> op1;
+ bits<4> Cn;
+ bits<4> Cm;
+ bits<3> op2;
+ let Inst{20-19} = 0b01;
+ let Inst{18-16} = op1;
+ let Inst{15-12} = Cn;
+ let Inst{11-8} = Cm;
+ let Inst{7-5} = op2;
+}
+
+
+// Branch (register) instructions:
+//
+// case opc of
+// 0001 blr
+// 0000 br
+// 0101 dret
+// 0100 eret
+// 0010 ret
+// otherwise UNDEFINED
+class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm,
+ string operands, list<dag> pattern>
+ : I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> {
+ let Inst{31-25} = 0b1101011;
+ let Inst{24-21} = opc; // see the opc table above
+ let Inst{20-16} = 0b11111;
+ let Inst{15-10} = 0b000000;
+ let Inst{4-0} = 0b00000; // Inst{9-5} is left for the subclasses to fill in
+}
+
+class BranchReg<bits<4> opc, string asm, list<dag> pattern>
+ : BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> {
+ bits<5> Rn;
+ let Inst{9-5} = Rn; // branch target register
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in
+class SpecialReturn<bits<4> opc, string asm> // no operands and no selection pattern
+ : BaseBranchReg<opc, (outs), (ins), asm, "", []> {
+ let Inst{9-5} = 0b11111; // register field fixed to all ones
+}
+
+//---
+// Conditional branch instruction.
+//---
+
+// Condition code.
+// 4-bit immediate. Pretty-printed as <cc>
+def ccode : Operand<i32> {
+ let PrintMethod = "printCondCode";
+ let ParserMatchClass = CondCode;
+}
+def inv_ccode : Operand<i32> { // same match class as ccode, printed through printInverseCondCode
+ // AL and NV are invalid in the aliases which use inv_ccode
+ let PrintMethod = "printInverseCondCode";
+ let ParserMatchClass = CondCode;
+ let MCOperandPredicate = [{
+ return MCOp.isImm() &&
+ MCOp.getImm() != AArch64CC::AL &&
+ MCOp.getImm() != AArch64CC::NV;
+ }];
+}
+
+// Conditional branch target. 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def PCRelLabel19Operand : AsmOperandClass {
+ let Name = "PCRelLabel19";
+ let DiagnosticType = "InvalidLabel";
+}
+def am_brcond : Operand<OtherVT> { // also used as the target operand of BaseCmpBranch
+ let EncoderMethod = "getCondBranchTargetOpValue";
+ let DecoderMethod = "DecodePCRelLabel19";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = PCRelLabel19Operand;
+}
+
+class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), // printed as "b.<cond> <target>"
+ "b", ".$cond\t$target", "",
+ [(AArch64brcond bb:$target, imm:$cond, NZCV)]>,
+ Sched<[WriteBr]> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let Uses = [NZCV]; // the branch reads the flags
+
+ bits<4> cond;
+ bits<19> target;
+ let Inst{31-24} = 0b01010100;
+ let Inst{23-5} = target;
+ let Inst{4} = 0;
+ let Inst{3-0} = cond;
+}
+
+//---
+// Compare-and-branch instructions.
+//---
+class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node> // node is the SelectionDAG node matched as (node Rt, target)
+ : I<(outs), (ins regtype:$Rt, am_brcond:$target),
+ asm, "\t$Rt, $target", "",
+ [(node regtype:$Rt, bb:$target)]>,
+ Sched<[WriteBr]> {
+ let isBranch = 1;
+ let isTerminator = 1;
+
+ bits<5> Rt;
+ bits<19> target;
+ let Inst{30-25} = 0b011010; // Inst{31} is set by the CmpBranch multiclass
+ let Inst{24} = op;
+ let Inst{23-5} = target;
+ let Inst{4-0} = Rt;
+}
+
+multiclass CmpBranch<bit op, string asm, SDNode node> { // defines a W (GPR32) and an X (GPR64) variant
+ def W : BaseCmpBranch<GPR32, op, asm, node> {
+ let Inst{31} = 0;
+ }
+ def X : BaseCmpBranch<GPR64, op, asm, node> {
+ let Inst{31} = 1;
+ }
+}
+
+//---
+// Test-bit-and-branch instructions.
+//---
+// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
+// the target offset are implied zero and so are not part of the immediate.
+def BranchTarget14Operand : AsmOperandClass {
+ let Name = "BranchTarget14";
+}
+def am_tbrcond : Operand<OtherVT> {
+ let EncoderMethod = "getTestBranchTargetOpValue";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = BranchTarget14Operand;
+}
+
+// AsmOperand classes to emit (or not) special diagnostics
+def TBZImm0_31Operand : AsmOperandClass { // reuses the Imm0_31 predicate and render methods but sets no DiagnosticType
+ let Name = "TBZImm0_31";
+ let PredicateMethod = "isImm0_31";
+ let RenderMethod = "addImm0_31Operands";
+}
+def TBZImm32_63Operand : AsmOperandClass {
+ let Name = "Imm32_63";
+ let DiagnosticType = "InvalidImm0_63"; // the diagnostic is named for the 0-63 range, not 32-63
+}
+
+// Bit-number immediates for the test-bit-and-branch instructions.
+//
+// These are i64 pattern leaves, so the range checks compare the whole 64-bit
+// value. Truncating to 32 bits first (as an earlier version did) would also
+// accept out-of-range values such as 2^32 + 5, whose low 32 bits happen to
+// fall inside the range.
+//
+// tbz_imm0_31<matcher> accepts [0,31]; the assembler match class is a
+// parameter so that one instance produces a diagnostic and one does not.
+class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{
+ return (((uint64_t)Imm) < 32);
+}]> {
+ let ParserMatchClass = matcher;
+}
+
+def tbz_imm0_31_diag : tbz_imm0_31<Imm0_31Operand>;
+def tbz_imm0_31_nodiag : tbz_imm0_31<TBZImm0_31Operand>;
+
+// tbz_imm32_63 accepts [32,63].
+def tbz_imm32_63 : Operand<i64>, ImmLeaf<i64, [{
+ return (((uint64_t)Imm) > 31) && (((uint64_t)Imm) < 64);
+}]> {
+ let ParserMatchClass = TBZImm32_63Operand;
+}
+
+class BaseTestBranch<RegisterClass regtype, Operand immtype, // immtype is the bit-number operand type used for $bit_off
+ bit op, string asm, SDNode node>
+ : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target),
+ asm, "\t$Rt, $bit_off, $target", "",
+ [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>,
+ Sched<[WriteBr]> {
+ let isBranch = 1;
+ let isTerminator = 1;
+
+ bits<5> Rt;
+ bits<6> bit_off;
+ bits<14> target;
+
+ let Inst{30-25} = 0b011011;
+ let Inst{24} = op;
+ let Inst{23-19} = bit_off{4-0}; // only the low five bits; Inst{31} is set per variant in TestBranch
+ let Inst{18-5} = target;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeTestAndBranch";
+}
+
+multiclass TestBranch<bit op, string asm, SDNode node> {
+ def W : BaseTestBranch<GPR32, tbz_imm0_31_diag, op, asm, node> { // bit numbers 0-31, Inst{31} = 0
+ let Inst{31} = 0;
+ }
+
+ def X : BaseTestBranch<GPR64, tbz_imm32_63, op, asm, node> { // bit numbers 32-63, Inst{31} = 1
+ let Inst{31} = 1;
+ }
+
+ // Alias X-reg with 0-31 imm to W-Reg.
+ def : InstAlias<asm # "\t$Rd, $imm, $target",
+ (!cast<Instruction>(NAME#"W") GPR32as64:$Rd,
+ tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>;
+ def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), // a 64-bit register tested on bit 0-31 selects the W form on its sub_32 subregister
+ (!cast<Instruction>(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32),
+ tbz_imm0_31_diag:$imm, bb:$target)>;
+}
+
+//---
+// Unconditional branch (immediate) instructions.
+//---
+def BranchTarget26Operand : AsmOperandClass {
+ let Name = "BranchTarget26";
+ let DiagnosticType = "InvalidLabel";
+}
+def am_b_target : Operand<OtherVT> { // target operand used by BranchImm
+ let EncoderMethod = "getBranchTargetOpValue";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = BranchTarget26Operand;
+}
+def am_bl_target : Operand<i64> { // target operand used by CallImm; same settings as am_b_target but typed i64
+ let EncoderMethod = "getBranchTargetOpValue";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = BranchTarget26Operand;
+}
+
+class BImm<bit op, dag iops, string asm, list<dag> pattern> // iops must name its operand $addr
+ : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> {
+ bits<26> addr;
+ let Inst{31} = op;
+ let Inst{30-26} = 0b00101;
+ let Inst{25-0} = addr; // 26-bit target field
+
+ let DecoderMethod = "DecodeUnconditionalBranch";
+}
+
+class BranchImm<bit op, string asm, list<dag> pattern>
+ : BImm<op, (ins am_b_target:$addr), asm, pattern>;
+class CallImm<bit op, string asm, list<dag> pattern>
+ : BImm<op, (ins am_bl_target:$addr), asm, pattern>;
+
+//---
+// Basic one-operand data processing instructions.
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm,
+ SDPatternOperator node>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set regtype:$Rd, (node regtype:$Rn))]>,
+ Sched<[WriteI, ReadI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+
+ let Inst{30-13} = 0b101101011000000000; // Inst{31} is set by the users of this class
+ let Inst{12-10} = opc;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // the same flags are also set on BaseOneOperandData itself
+multiclass OneOperandData<bits<3> opc, string asm,
+ SDPatternOperator node = null_frag> {
+ def Wr : BaseOneOperandData<opc, GPR32, asm, node> {
+ let Inst{31} = 0;
+ }
+
+ def Xr : BaseOneOperandData<opc, GPR64, asm, node> {
+ let Inst{31} = 1;
+ }
+}
+
+class OneWRegData<bits<3> opc, string asm, SDPatternOperator node> // GPR32-only form, Inst{31} = 0
+ : BaseOneOperandData<opc, GPR32, asm, node> {
+ let Inst{31} = 0;
+}
+
+class OneXRegData<bits<3> opc, string asm, SDPatternOperator node> // GPR64-only form, Inst{31} = 1
+ : BaseOneOperandData<opc, GPR64, asm, node> {
+ let Inst{31} = 1;
+}
+
+//---
+// Basic two-operand data processing instructions.
+//---
+class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm, // common encoding; the pattern is supplied by the two subclasses
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV]; // the flags are an input
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{30} = isSub;
+ let Inst{28-21} = 0b11010000; // Inst{31} and Inst{29} are set in the AddSubCarry multiclass
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+ SDNode OpNode>
+ : BaseBaseAddSubCarry<isSub, regtype, asm,
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV))]>;
+
+class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm,
+ SDNode OpNode>
+ : BaseBaseAddSubCarry<isSub, regtype, asm,
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV)),
+ (implicit NZCV)]> {
+ let Defs = [NZCV]; // flag-setting form also writes NZCV
+}
+
+multiclass AddSubCarry<bit isSub, string asm, string asm_setflags, // four defs: Wr/Xr and the flag-setting SWr/SXr
+ SDNode OpNode, SDNode OpNode_setflags> {
+ def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> {
+ let Inst{31} = 0;
+ let Inst{29} = 0;
+ }
+ def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> {
+ let Inst{31} = 1;
+ let Inst{29} = 0;
+ }
+
+ // Sets flags.
+ def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags,
+ OpNode_setflags> {
+ let Inst{31} = 0;
+ let Inst{29} = 1; // Inst{29} distinguishes the flag-setting forms
+ }
+ def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags,
+ OpNode_setflags> {
+ let Inst{31} = 1;
+ let Inst{29} = 1;
+ }
+}
+
+class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm, // no Sched here; subclasses or defs attach it
+ SDPatternOperator OpNode>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{30-21} = 0b0011010110; // Inst{31} is set where the instruction is defined
+ let Inst{20-16} = Rm;
+ let Inst{15-14} = 0b00;
+ let Inst{13-10} = opc; // subclasses pass '?' bits and fill them in afterwards
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseDiv<bit isSigned, RegisterClass regtype, string asm,
+ SDPatternOperator OpNode>
+ : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> {
+ let Inst{10} = isSigned; // fills the undefined low bit of opc
+}
+
+multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> { // Wr and Xr use different write resources (WriteID32 / WriteID64)
+ def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
+ Sched<[WriteID32, ReadID, ReadID]> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
+ Sched<[WriteID64, ReadID, ReadID]> {
+ let Inst{31} = 1;
+ }
+}
+
+class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
+ SDPatternOperator OpNode = null_frag>
+ : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
+ Sched<[WriteIS, ReadI]> {
+ let Inst{11-10} = shift_type; // fills the two undefined low bits of opc
+}
+
+multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
+ def Wr : BaseShift<shift_type, GPR32, asm> { // no OpNode passed (null_frag): the i32 forms are selected by the Pats below
+ let Inst{31} = 0;
+ }
+
+ def Xr : BaseShift<shift_type, GPR64, asm, OpNode> {
+ let Inst{31} = 1;
+ }
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), // i64 shift amount: use its sub_32 subregister
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn,
+ (EXTRACT_SUBREG i64:$Rm, sub_32))>;
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), // amount extended from i32 (zext/anyext/sext): use the GPR32 directly
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+}
+
+class ShiftAlias<string asm, Instruction inst, RegisterClass regtype> // three-register alias "asm dst, src1, src2" for inst
+ : InstAlias<asm#"\t$dst, $src1, $src2",
+ (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>;
+
+class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype, // multype: class of Rn/Rm; addtype: class of Ra and Rd
+ RegisterClass addtype, string asm,
+ list<dag> pattern>
+ : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra),
+ asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<5> Ra;
+ let Inst{30-24} = 0b0011011; // Inst{31} is set by the users of this class
+ let Inst{23-21} = opc;
+ let Inst{20-16} = Rm;
+ let Inst{15} = isSub;
+ let Inst{14-10} = Ra;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass MulAccum<bit isSub, string asm, SDNode AccNode> { // AccNode only appears in the commented-out patterns
+ // MADD/MSUB generation is decided by MachineCombiner.cpp
+ def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
+ [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
+ Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
+ let Inst{31} = 0;
+ }
+
+ def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
+ [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
+ Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
+ let Inst{31} = 1;
+ }
+}
+
+class WideMulAccum<bit isSub, bits<3> opc, string asm, // GPR32 multiplicands extended by ExtNode, GPR64 accumulator and result
+ SDNode AccNode, SDNode ExtNode>
+ : BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
+ [(set GPR64:$Rd, (AccNode GPR64:$Ra,
+ (mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
+ Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
+ let Inst{31} = 1;
+}
+
+class MulHi<bits<3> opc, string asm, SDNode OpNode> // 64-bit only: all three registers are GPR64
+ : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
+ Sched<[WriteIM64, ReadIM, ReadIM]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-24} = 0b10011011;
+ let Inst{23-21} = opc;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0; // Inst{14-10} (the Ra field) is deliberately not set here; see below
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ // The Ra field of SMULH and UMULH is unused: it should be assembled as 31
+ // (i.e. all bits 1) but is ignored by the processor.
+ let PostEncoderMethod = "fixMulHigh";
+}
+
+class MulAccumWAlias<string asm, Instruction inst> // three-operand alias: the fourth operand of inst is WZR
+ : InstAlias<asm#"\t$dst, $src1, $src2",
+ (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
+class MulAccumXAlias<string asm, Instruction inst> // 64-bit counterpart: the fourth operand of inst is XZR
+ : InstAlias<asm#"\t$dst, $src1, $src2",
+ (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
+class WideMulAccumAlias<string asm, Instruction inst> // GPR64 destination, GPR32 sources, fourth operand XZR
+ : InstAlias<asm#"\t$dst, $src1, $src2",
+ (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
+
+class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg, // StreamReg is the register class of the $Rm input
+ SDPatternOperator OpNode, string asm>
+ : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
+ Sched<[WriteISReg, ReadI, ReadISReg]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+
+ let Inst{31} = sf;
+ let Inst{30-21} = 0b0011010110; // same fixed bits as BaseTwoOperand
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b010;
+ let Inst{12} = C;
+ let Inst{11-10} = sz;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+ let Predicates = [HasCRC]; // only available when the HasCRC predicate holds
+}
+
+//---
+// Address generation.
+//---
+
+// Address-generation instruction format: writes a GPR64 destination from a
+// 21-bit label operand. `page` is encoded in Inst{31}. The label is split
+// across the word: its low two bits go to Inst{30-29} and its upper 19 bits
+// to Inst{23-5}. Decoding is delegated to "DecodeAdrInstruction".
+class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
+ : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "",
+ pattern>,
+ Sched<[WriteI]> {
+ bits<5> Xd;
+ bits<21> label;
+ let Inst{31} = page;
+ let Inst{30-29} = label{1-0};
+ let Inst{28-24} = 0b10000;
+ let Inst{23-5} = label{20-2};
+ let Inst{4-0} = Xd;
+
+ let DecoderMethod = "DecodeAdrInstruction";
+}
+
+//---
+// Move immediate.
+//---
+
+// Immediate operand of the move-immediate formats. It is parsed via
+// Imm0_65535Operand, encoded by "getMoveWideImmOpValue" and printed by
+// "printImm".
+def movimm32_imm : Operand<i32> {
+ let ParserMatchClass = Imm0_65535Operand;
+ let EncoderMethod = "getMoveWideImmOpValue";
+ let PrintMethod = "printImm";
+}
+// Shifter operand for the 32-bit move-immediate forms (printed by
+// "printShifter", parsed via MovImm32ShifterOperand).
+def movimm32_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = MovImm32ShifterOperand;
+}
+// Shifter operand for the 64-bit move-immediate forms (printed by
+// "printShifter", parsed via MovImm64ShifterOperand).
+def movimm64_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = MovImm64ShifterOperand;
+}
+
+// Move-immediate instruction format with no selection pattern. Operands are a
+// 16-bit immediate (Inst{20-5}) and a shifter; only shift{5-4} is encoded, in
+// Inst{22-21}. `opc` goes in Inst{30-29}. Inst{31} is not assigned here and
+// must be set by the instantiating def. The format is flagged as neither
+// loading, storing, nor having side effects.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+ string asm>
+ : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift),
+ asm, "\t$Rd, $imm$shift", "", []>,
+ Sched<[WriteImm]> {
+ bits<5> Rd;
+ bits<16> imm;
+ bits<6> shift;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100101;
+ let Inst{22-21} = shift{5-4};
+ let Inst{20-5} = imm;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+// Instantiates the 32-bit (Wi) and 64-bit (Xi) move-immediate variants for a
+// given opcode and mnemonic.
+multiclass MoveImmediate<bits<2> opc, string asm> {
+ // 32-bit variant: GPR32 destination, Inst{31} = 0.
+ def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> {
+ let Inst{31} = 0;
+ }
+
+ // 64-bit variant: GPR64 destination, Inst{31} = 1.
+ def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+// Insert-immediate instruction format with no selection pattern. In addition
+// to the immediate and shifter it takes a register input $src that is tied to
+// the destination through the "$src = $Rd" constraint; $src does not appear
+// in the assembly string. Encoding: opc -> Inst{30-29}, shift{5-4} ->
+// Inst{22-21}, imm -> Inst{20-5}. Inst{31} is left to the instantiating def.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+ string asm>
+ : I<(outs regtype:$Rd),
+ (ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
+ asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
+ Sched<[WriteI, ReadI]> {
+ bits<5> Rd;
+ bits<16> imm;
+ bits<6> shift;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100101;
+ let Inst{22-21} = shift{5-4};
+ let Inst{20-5} = imm;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+// Instantiates the 32-bit (Wi) and 64-bit (Xi) insert-immediate variants for
+// a given opcode and mnemonic.
+multiclass InsertImmediate<bits<2> opc, string asm> {
+ // 32-bit variant: GPR32 registers, Inst{31} = 0.
+ def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> {
+ let Inst{31} = 0;
+ }
+
+ // 64-bit variant: GPR64 registers, Inst{31} = 1.
+ def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+//---
+// Add/Subtract
+//---
+
+// Add/subtract (immediate) instruction format, selected from
+// (OpNode $Rn, $imm). isSub -> Inst{30}, setFlags -> Inst{29}. The 14-bit
+// immediate operand carries a 2-bit shift selector in imm{13-12}
+// (-> Inst{23-22}) and the 12-bit value in imm{11-0} (-> Inst{21-10}).
+// Destination and source register classes are separate parameters.
+// Inst{31} is left to the instantiating def.
+class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype,
+ RegisterClass srcRegtype, addsub_shifted_imm immtype,
+ string asm, SDPatternOperator OpNode>
+ : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm),
+ asm, "\t$Rd, $Rn, $imm", "",
+ [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>,
+ Sched<[WriteI, ReadI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<14> imm;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b10001;
+ let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
+ let Inst{21-10} = imm{11-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+ let DecoderMethod = "DecodeBaseAddSubImm";
+}
+
+// Pseudo instruction for plain register+register add/subtract. It carries a
+// selection pattern (OpNode $Rn, $Rm) over a single register class and
+// defines no encoding bits of its own.
+class BaseAddSubRegPseudo<RegisterClass regtype,
+ SDPatternOperator OpNode>
+ : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+ Sched<[WriteI, ReadI, ReadI]>;
+
+// Add/subtract (shifted register) instruction format, selected from
+// (OpNode $Rn, $Rm) where $Rm is a shifted-register operand.
+// isSub -> Inst{30}, setFlags -> Inst{29}. The 8-bit shift field is split:
+// shift{7-6} -> Inst{23-22} and shift{5-0} -> Inst{15-10}.
+// Inst{31} is left to the instantiating def.
+class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
+ arith_shifted_reg shifted_regtype, string asm,
+ SDPatternOperator OpNode>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
+ Sched<[WriteISReg, ReadI, ReadISReg]> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<8> shift;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b01011;
+ let Inst{23-22} = shift{7-6};
+ let Inst{21} = 0;
+ let Inst{20-16} = src2;
+ let Inst{15-10} = shift{5-0};
+ let Inst{9-5} = src1;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+// Add/subtract (extended register) instruction format, selected from
+// (OpNode $R2, $R3) where the second source is an Operand that bundles the
+// register with its extend. isSub -> Inst{30}, setFlags -> Inst{29}; the
+// 6-bit ext field fills Inst{15-10}. Inst{31} is left to the instantiating
+// def.
+// NOTE(review): the dag operands are named $R1/$R2/$R3 while the encoding
+// fields are Rd/Rn/Rm/ext, so the names deliberately do not match --
+// presumably this relies on in-order operand-to-field matching; confirm
+// before renaming anything here.
+class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, Operand src2Regtype,
+ string asm, SDPatternOperator OpNode>
+ : I<(outs dstRegtype:$R1),
+ (ins src1Regtype:$R2, src2Regtype:$R3),
+ asm, "\t$R1, $R2, $R3", "",
+ [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
+ Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<6> ext;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b01011;
+ let Inst{23-21} = 0b001;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = ext{5-3};
+ let Inst{12-10} = ext{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+// Add/subtract (extended register) format in which the second source register
+// and its extend are separate operands ($Rm and $ext). It has no selection
+// pattern. Only ext{5} (-> Inst{15}) and ext{2-0} (-> Inst{12-10}) are
+// encoded here; Inst{14-13} and Inst{31} are not assigned in this class and
+// must be set by the instantiating def.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, RegisterClass src2Regtype,
+ Operand ext_op, string asm>
+ : I<(outs dstRegtype:$Rd),
+ (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
+ asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
+ Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<6> ext;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b01011;
+ let Inst{23-21} = 0b001;
+ let Inst{20-16} = Rm;
+ let Inst{15} = ext{5};
+ let Inst{12-10} = ext{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+// Aliases for register+register add/subtract.
+// Maps the plain "<asm> $dst, $src1, $src2" syntax onto `inst`, passing the
+// constant `shiftExt` as the instruction's trailing shift/extend operand.
+// The three register classes are parameters so that callers can restrict
+// which operand positions accept which registers.
+class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, RegisterClass src2Regtype,
+ int shiftExt>
+ : InstAlias<asm#"\t$dst, $src1, $src2",
+ (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
+ shiftExt)>;
+
+// Non-flag-setting add/subtract family (setFlags = 0 in every variant).
+// Defines, for both 32-bit (W, Inst{31} = 0) and 64-bit (X, Inst{31} = 1):
+//   *ri    immediate form
+//   *rr    register+register pseudo
+//   *rs    shifted-register form
+//   *rx    extended-register form
+//   Xrx64  64-bit extended-register form with a 64-bit second source
+// plus assembler aliases. `alias` is the mnemonic used for the
+// negated-immediate aliases; OpNode defaults to null_frag.
+multiclass AddSub<bit isSub, string mnemonic, string alias,
+ SDPatternOperator OpNode = null_frag> {
+ let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+ // Add/Subtract immediate
+ // Increase the weight of the immediate variant to try to match it before
+ // the extended register variant.
+ // We used to match the register variant before the immediate when the
+ // register argument could be implicitly zero-extended.
+ let AddedComplexity = 6 in
+ def Wri : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
+ mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ let AddedComplexity = 6 in
+ def Xri : BaseAddSubImm<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
+ mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+
+ // Add/Subtract register - Only used for CodeGen
+ def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+ // Add/Subtract shifted register
+ def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic,
+ OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic,
+ OpNode> {
+ let Inst{31} = 1;
+ }
+ }
+
+ // Add/Subtract extended register
+ let AddedComplexity = 1, hasSideEffects = 0 in {
+ def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
+ arith_extended_reg32<i32>, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
+ arith_extended_reg32to64<i64>, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+ }
+
+ def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64,
+ arith_extendlsl64, mnemonic> {
+ // UXTX and SXTX only.
+ let Inst{14-13} = 0b11;
+ let Inst{31} = 1;
+ }
+
+ // add Rd, Rn, -imm -> sub Rd, Rn, imm
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
+ addsub_shifted_imm32_neg:$imm), 0>;
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
+ addsub_shifted_imm64_neg:$imm), 0>;
+
+ // Register/register aliases with no shift when SP is not used.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+ GPR32, GPR32, GPR32, 0>;
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+ GPR64, GPR64, GPR64, 0>;
+
+ // Register/register aliases with no shift when either the destination or
+ // first source register is SP.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+ GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+ GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0
+ def : AddSubRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrx64"),
+ GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
+ def : AddSubRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrx64"),
+ GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
+}
+
+// Flag-setting add/subtract family (setFlags = 1 in every variant). All
+// instruction defs are marked isCompare and list NZCV in Defs. The set of
+// variants is Wri/Xri, Wrr/Xrr, Wrs/Xrs, Wrx/Xrx and Xrx64; W variants use
+// Inst{31} = 0 and X variants Inst{31} = 1. Destination register classes
+// here are GPR32/GPR64 while the first source of the immediate and
+// extended-register forms is the *sp class.
+// Mnemonic parameters used for aliases:
+//   cmp      - compare alias (destination hard-wired to WZR/XZR)
+//   alias    - mnemonic accepted with a negated immediate
+//   cmpAlias - compare mnemonic accepted with a negated immediate
+multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
+ string alias, string cmpAlias> {
+ let isCompare = 1, Defs = [NZCV] in {
+ // Add/Subtract immediate
+ def Wri : BaseAddSubImm<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32,
+ mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xri : BaseAddSubImm<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64,
+ mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+
+ // Add/Subtract register
+ def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+ // Add/Subtract shifted register
+ def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic,
+ OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic,
+ OpNode> {
+ let Inst{31} = 1;
+ }
+
+ // Add/Subtract extended register
+ let AddedComplexity = 1 in {
+ def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
+ arith_extended_reg32<i32>, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
+ arith_extended_reg32<i64>, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+ }
+
+ def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64,
+ arith_extendlsl64, mnemonic> {
+ // UXTX and SXTX only.
+ let Inst{14-13} = 0b11;
+ let Inst{31} = 1;
+ }
+ } // Defs = [NZCV]
+
+ // Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
+ addsub_shifted_imm32_neg:$imm), 0>;
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
+ addsub_shifted_imm64_neg:$imm), 0>;
+
+ // Compare aliases
+ def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
+ WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>;
+ def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
+ XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
+ WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
+ XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
+ XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
+ WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
+ XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
+
+ // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
+ def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
+ WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
+ def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
+ XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
+
+ // Compare shorthands
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs")
+ WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs")
+ XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx")
+ WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
+ XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
+
+ // Register/register aliases with no shift when SP is not used.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+ GPR32, GPR32, GPR32, 0>;
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+ GPR64, GPR64, GPR64, 0>;
+
+ // Register/register aliases with no shift when the first source register
+ // is SP.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+ GPR32, GPR32sponly, GPR32, 16>; // UXTW #0
+ def : AddSubRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrx64"),
+ GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
+}
+
+//---
+// Extract
+//---
+// Type profile for the EXTR node: one result and three operands. The result
+// and the first two operands share a type; the third operand has pointer
+// type.
+def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisPtrTy<3>]>;
+// Selection-DAG node bound to AArch64ISD::EXTR using the profile above.
+def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
+
+// Extract instruction format: two source registers and an immediate, all in
+// one register class. The 6-bit immediate occupies Inst{15-10}. Inst{31} and
+// Inst{22} are not assigned here and must be set by the instantiating def.
+// Selection patterns are supplied by the caller.
+class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm,
+ list<dag> patterns>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm),
+ asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>,
+ Sched<[WriteExtr, ReadExtrHi]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<6> imm;
+
+ let Inst{30-23} = 0b00100111;
+ let Inst{21} = 0;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = imm;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the 32-bit (Wrri) and 64-bit (Xrri) extract instructions, both
+// selected from AArch64Extr. The 32-bit form uses an imm0_31 operand and
+// clears Inst{31}/Inst{22}; the 64-bit form uses imm0_63 and sets both bits.
+multiclass ExtractImm<string asm> {
+ def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
+ [(set GPR32:$Rd,
+ (AArch64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
+ let Inst{31} = 0;
+ let Inst{22} = 0;
+ // imm<5> must be zero.
+ let imm{5} = 0;
+ }
+ def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
+ [(set GPR64:$Rd,
+ (AArch64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
+
+ let Inst{31} = 1;
+ let Inst{22} = 1;
+ }
+}
+
+//---
+// Bitfield
+//---
+
+// Bitfield instruction format with one source register and two immediates of
+// the same operand type; no selection pattern. opc -> Inst{30-29},
+// immr -> Inst{21-16}, imms -> Inst{15-10}. Inst{31} and Inst{22} are left to
+// the instantiating def.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImm<bits<2> opc,
+ RegisterClass regtype, Operand imm_type, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
+ asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
+ Sched<[WriteIS, ReadI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> immr;
+ bits<6> imms;
+
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100110;
+ let Inst{21-16} = immr;
+ let Inst{15-10} = imms;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the 32-bit (Wri, imm0_31) and 64-bit (Xri, imm0_63) bitfield
+// instructions. The 32-bit form clears Inst{31}/Inst{22} and additionally
+// forces the top bit of each immediate field to zero; the 64-bit form sets
+// Inst{31}/Inst{22}.
+multiclass BitfieldImm<bits<2> opc, string asm> {
+ def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> {
+ let Inst{31} = 0;
+ let Inst{22} = 0;
+ // imms<5> and immr<5> must be zero, else ReservedValue().
+ let Inst{21} = 0;
+ let Inst{15} = 0;
+ }
+ def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> {
+ let Inst{31} = 1;
+ let Inst{22} = 1;
+ }
+}
+
+// Bitfield instruction format that additionally reads the destination: the
+// extra input $src is tied to $Rd through the "$src = $Rd" constraint and is
+// not printed in the assembly string. No selection pattern.
+// opc -> Inst{30-29}, immr -> Inst{21-16}, imms -> Inst{15-10}; Inst{31} and
+// Inst{22} are left to the instantiating def.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImmWith2RegArgs<bits<2> opc,
+ RegisterClass regtype, Operand imm_type, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr,
+ imm_type:$imms),
+ asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
+ Sched<[WriteIS, ReadI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> immr;
+ bits<6> imms;
+
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100110;
+ let Inst{21-16} = immr;
+ let Inst{15-10} = imms;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the 32-bit (Wri, imm0_31) and 64-bit (Xri, imm0_63) variants
+// of the tied-destination bitfield format. The 32-bit form clears
+// Inst{31}/Inst{22} and forces the top bit of each immediate field to zero;
+// the 64-bit form sets Inst{31}/Inst{22}.
+multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> {
+ def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> {
+ let Inst{31} = 0;
+ let Inst{22} = 0;
+ // imms<5> and immr<5> must be zero, else ReservedValue().
+ let Inst{21} = 0;
+ let Inst{15} = 0;
+ }
+ def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> {
+ let Inst{31} = 1;
+ let Inst{22} = 1;
+ }
+}
+
+//---
+// Logical
+//---
+
+// Logical (immediate)
+// Destination and source register classes are separate parameters. The
+// 13-bit immediate field is spread over the word as imm{12} -> Inst{22},
+// imm{11-6} -> Inst{21-16}, imm{5-0} -> Inst{15-10}. opc -> Inst{30-29}.
+// Inst{31} is left to the instantiating def; the selection pattern is
+// supplied by the caller.
+class BaseLogicalImm<bits<2> opc, RegisterClass dregtype,
+ RegisterClass sregtype, Operand imm_type, string asm,
+ list<dag> pattern>
+ : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
+ asm, "\t$Rd, $Rn, $imm", "", pattern>,
+ Sched<[WriteI, ReadI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<13> imm;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100100;
+ let Inst{22} = imm{12};
+ let Inst{21-16} = imm{11-6};
+ let Inst{15-10} = imm{5-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeLogicalImmInstruction";
+}
+
+// Logical (shifted register)
+// opc -> Inst{30-29}, N -> Inst{21}. The 8-bit shift field is split:
+// shift{7-6} -> Inst{23-22} and shift{5-0} -> Inst{15-10}. Inst{31} is left
+// to the instantiating def; the selection pattern is supplied by the caller.
+class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
+ logical_shifted_reg shifted_regtype, string asm,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+ Sched<[WriteISReg, ReadI, ReadISReg]> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<8> shift;
+ let Inst{30-29} = opc;
+ let Inst{28-24} = 0b01010;
+ let Inst{23-22} = shift{7-6};
+ let Inst{21} = N;
+ let Inst{20-16} = src2;
+ let Inst{15-10} = shift{5-0};
+ let Inst{9-5} = src1;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+// Aliases for register+register logical instructions.
+// Maps "<asm> $dst, $src1, $src2" onto `inst` with a constant 0 passed as the
+// trailing shift operand; all three registers use the same class.
+class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
+ : InstAlias<asm#"\t$dst, $src1, $src2",
+ (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
+
+// Non-flag-setting logical (immediate) family. Defines Wri (Inst{31} = 0) and
+// Xri (Inst{31} = 1), each selected from (OpNode $Rn, imm) with
+// AddedComplexity = 6 and marked rematerializable / as cheap as a move. The
+// destination uses the *sp register class while the source uses the plain
+// class. `Alias` is a second mnemonic accepted with the logical_imm*_not
+// operand forms.
+multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
+ string Alias> {
+ let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+ def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
+ [(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
+ logical_imm32:$imm))]> {
+ let Inst{31} = 0;
+ let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+ }
+ let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+ def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
+ [(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
+ logical_imm64:$imm))]> {
+ let Inst{31} = 1;
+ }
+
+ // Alias mnemonic taking the logical_imm*_not immediate operand classes.
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
+ logical_imm32_not:$imm), 0>;
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
+ logical_imm64_not:$imm), 0>;
+}
+
+// Flag-setting logical (immediate) family. Wri and Xri are marked isCompare
+// and list NZCV in Defs; both destination and source use the plain
+// GPR32/GPR64 classes. `Alias` is a second mnemonic accepted with the
+// logical_imm*_not operand forms.
+multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
+ string Alias> {
+ let isCompare = 1, Defs = [NZCV] in {
+ def Wri : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
+ let Inst{31} = 0;
+ let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+ }
+ def Xri : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic,
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> {
+ let Inst{31} = 1;
+ }
+ } // end Defs = [NZCV]
+
+ // Alias mnemonic taking the logical_imm*_not immediate operand classes.
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
+ logical_imm32_not:$imm), 0>;
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
+ logical_imm64_not:$imm), 0>;
+}
+
+// Pseudo instruction for plain register+register logical operations. It
+// carries a selection pattern (OpNode $Rn, $Rm) over a single register class
+// and defines no encoding bits of its own.
+class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
+ : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+ Sched<[WriteI, ReadI, ReadI]>;
+
+// Split from LogicalImm as not all instructions have both.
+// Defines the register+register pseudos (Wrr/Xrr, marked rematerializable and
+// as cheap as a move), the shifted-register instructions (Wrs with
+// Inst{31} = 0, Xrs with Inst{31} = 1), and the no-shift aliases for both
+// widths.
+multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
+ SDPatternOperator OpNode> {
+ let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+ def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+ }
+
+ def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn,
+ logical_shifted_reg32:$Rm))]> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn,
+ logical_shifted_reg64:$Rm))]> {
+ let Inst{31} = 1;
+ }
+
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Wrs"), GPR32>;
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+// Split from LogicalReg to allow setting NZCV Defs
+// Every def inside the `let` lists NZCV in Defs and is flagged as not
+// loading, not storing and free of side effects. OpNode defaults to
+// null_frag. The no-shift aliases are defined outside the `let`.
+multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
+ SDPatternOperator OpNode = null_frag> {
+ let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+
+ def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_shifted_reg32:$Rm))]> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_shifted_reg64:$Rm))]> {
+ let Inst{31} = 1;
+ }
+ } // Defs = [NZCV]
+
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Wrs"), GPR32>;
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+//---
+// Conditionally set flags
+//---
+
+// Conditional compare (immediate) format. It has no register outputs; it
+// both reads and writes NZCV (Uses and Defs) and its pattern sets NZCV from
+// (OpNode $Rn, $imm, $nzcv, $cond, NZCV). Encoding: op -> Inst{30},
+// 5-bit imm -> Inst{20-16}, cond -> Inst{15-12}, nzcv -> Inst{3-0};
+// Inst{11-10} is fixed to 0b10. Inst{31} is left to the instantiating def.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype,
+ string mnemonic, SDNode OpNode>
+ : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond),
+ mnemonic, "\t$Rn, $imm, $nzcv, $cond", "",
+ [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]>,
+ Sched<[WriteI, ReadI]> {
+ let Uses = [NZCV];
+ let Defs = [NZCV];
+
+ bits<5> Rn;
+ bits<5> imm;
+ bits<4> nzcv;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b111010010;
+ let Inst{20-16} = imm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = nzcv;
+}
+
+// Conditional compare (register) format. It has no register outputs; it both
+// reads and writes NZCV (Uses and Defs) and its pattern sets NZCV from
+// (OpNode $Rn, $Rm, $nzcv, $cond, NZCV). Encoding: op -> Inst{30},
+// Rm -> Inst{20-16}, cond -> Inst{15-12}, nzcv -> Inst{3-0}; Inst{11-10} is
+// fixed to 0b00. Inst{31} is left to the instantiating def.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic,
+ SDNode OpNode>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+ mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "",
+ [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV];
+ let Defs = [NZCV];
+
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> nzcv;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b111010010;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = nzcv;
+}
+
+// Instantiates the four conditional-compare variants: immediate (Wi/Xi) and
+// register (Wr/Xr), with Inst{31} = 0 for the 32-bit and 1 for the 64-bit
+// forms. The 32-bit immediate form uses imm32_0_31 and the 64-bit one
+// imm0_31.
+multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> {
+ // immediate operand variants
+ def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+ // register operand variants
+ def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+}
+
+//---
+// Conditional select
+//---
+
+// Conditional select format, selected from
+// (AArch64csel $Rn, $Rm, $cond, NZCV). It reads NZCV (Uses) but does not
+// define it. Encoding: op -> Inst{30}, Rm -> Inst{20-16},
+// cond -> Inst{15-12}, op2 -> Inst{11-10}. Inst{31} is left to the
+// instantiating def.
+class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+ asm, "\t$Rd, $Rn, $Rm, $cond", "",
+ [(set regtype:$Rd,
+ (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV];
+
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b011010100;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = op2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the 32-bit (Wr, Inst{31} = 0) and 64-bit (Xr, Inst{31} = 1)
+// conditional-select instructions.
+multiclass CondSelect<bit op, bits<2> op2, string asm> {
+ def Wr : BaseCondSelect<op, op2, GPR32, asm> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondSelect<op, op2, GPR64, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+// Conditional select format in which the second csel operand is wrapped in a
+// pattern fragment: the pattern is
+// (AArch64csel $Rn, (frag $Rm), $cond, NZCV). It reads NZCV (Uses).
+// The bit layout is op -> Inst{30}, Rm -> Inst{20-16}, cond -> Inst{15-12},
+// op2 -> Inst{11-10}; Inst{31} is left to the instantiating def.
+class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
+ PatFrag frag>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+ asm, "\t$Rd, $Rn, $Rm, $cond", "",
+ [(set regtype:$Rd,
+ (AArch64csel regtype:$Rn, (frag regtype:$Rm),
+ (i32 imm:$cond), NZCV))]>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV];
+
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b011010100;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = op2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Node transform on an immediate condition code: reinterprets the
+// zero-extended immediate as an AArch64CC::CondCode and returns an i32 target
+// constant holding AArch64CC::getInvertedCondCode of that value.
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+ AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue());
+ return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), SDLoc(N),
+ MVT::i32);
+}]>;
+
+// Instantiates the 32-bit (Wr) and 64-bit (Xr) fragment-wrapped conditional
+// selects, plus two extra patterns for the case where the fragment appears on
+// the FIRST csel operand: those emit the same instruction with $Rn and $Rm in
+// swapped roles and the condition passed through inv_cond_XFORM.
+multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
+ def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
+ let Inst{31} = 1;
+ }
+
+ def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV),
+ (!cast<Instruction>(NAME # Wr) GPR32:$Rn, GPR32:$Rm,
+ (inv_cond_XFORM imm:$cond))>;
+
+ def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV),
+ (!cast<Instruction>(NAME # Xr) GPR64:$Rn, GPR64:$Rm,
+ (inv_cond_XFORM imm:$cond))>;
+}
+
+//---
+// Special Mask Value
+//---
+// i32 immediate operand that matches when the low 8 bits of the value are all
+// ones (higher bits are not inspected).
+def maski8_or_more : Operand<i32>,
+ ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
+}
+// i32 immediate operand that matches when the low 16 bits of the value are
+// all ones (higher bits are not inspected).
+def maski16_or_more : Operand<i32>,
+ ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
+}
+
+
+//---
+// Load/store
+//---
+
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,4095].
+// Each am_indexed<N> is an i64 complex pattern producing two operands and
+// matched by the C++ selector "SelectAddrModeIndexed<N>".
+def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>;
+def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>;
+def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>;
+def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
+def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;
+
+// Assembler operand class for a scaled unsigned 12-bit offset. All four
+// strings are derived from Scale: the class name, the render and predicate
+// method template instantiations, and the diagnostic type.
+class UImm12OffsetOperand<int Scale> : AsmOperandClass {
+ let Name = "UImm12Offset" # Scale;
+ let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">";
+ let PredicateMethod = "isUImm12Offset<" # Scale # ">";
+ let DiagnosticType = "InvalidMemoryIndexed" # Scale;
+}
+
+// One operand class per supported scale. The names must follow the
+// "UImm12OffsetScale<N>Operand" scheme because uimm12_scaled looks them up by
+// string via !cast.
+def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>;
+def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>;
+def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>;
+def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>;
+def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>;
+
+// i64 operand for a scaled unsigned 12-bit load/store offset. The parser
+// match class is found by name ("UImm12OffsetScale<Scale>Operand"); the
+// encoder and printer are template instantiations keyed on Scale, with the
+// encoder parameterised on the matching fixup_aarch64_ldst_imm12_scale<Scale>
+// fixup kind.
+class uimm12_scaled<int Scale> : Operand<i64> {
+ let ParserMatchClass
+ = !cast<AsmOperandClass>("UImm12OffsetScale" # Scale # "Operand");
+ let EncoderMethod
+ = "getLdStUImm12OpValue<AArch64::fixup_aarch64_ldst_imm12_scale" # Scale # ">";
+ let PrintMethod = "printUImm12Offset<" # Scale # ">";
+}
+
+// Scaled unsigned 12-bit offset operands for scales 1, 2, 4, 8 and 16.
+def uimm12s1 : uimm12_scaled<1>;
+def uimm12s2 : uimm12_scaled<2>;
+def uimm12s4 : uimm12_scaled<4>;
+def uimm12s8 : uimm12_scaled<8>;
+def uimm12s16 : uimm12_scaled<16>;
+
+// Load/store (unsigned immediate offset) instruction format with assembly
+// syntax "<asm> $Rt, [$Rn, $offset]". The operand dags and pattern are
+// supplied by the caller. Encoding: sz -> Inst{31-30}, V -> Inst{26},
+// opc -> Inst{23-22}, 12-bit offset -> Inst{21-10}, Rn -> Inst{9-5},
+// Rt -> Inst{4-0}. No scheduling info is attached here.
+class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, list<dag> pattern>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+ bits<5> Rt;
+
+ bits<5> Rn;
+ bits<12> offset;
+
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b01;
+ let Inst{23-22} = opc;
+ let Inst{21-10} = offset;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeUnsignedLdStInstruction";
+}
+
+// Load with unsigned immediate offset. Defines the "ui" instruction ($Rt is
+// an output; flagged mayLoad, AddedComplexity = 10, scheduled as WriteLD) and
+// an alias that accepts "<asm> $Rt, [$Rn]" by supplying an offset of 0.
+multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ Operand indextype, string asm, list<dag> pattern> {
+ let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+ def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, indextype:$offset),
+ asm, pattern>,
+ Sched<[WriteLD]>;
+
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+// Store with unsigned immediate offset. Defines the "ui" instruction ($Rt is
+// an input and there are no outputs; flagged mayStore, AddedComplexity = 10,
+// scheduled as WriteST) and an alias that accepts "<asm> $Rt, [$Rn]" by
+// supplying an offset of 0.
+multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ Operand indextype, string asm, list<dag> pattern> {
+ let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+ def ui : BaseLoadStoreUI<sz, V, opc, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset),
+ asm, pattern>,
+ Sched<[WriteST]>;
+
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+// Assembler operand class for prefetch operations, parsed by the custom
+// "tryParsePrefetch" method.
+def PrefetchOperand : AsmOperandClass {
+ let Name = "Prefetch";
+ let ParserMethod = "tryParsePrefetch";
+}
+// i32 prefetch-operation operand: matched via PrefetchOperand and printed by
+// "printPrefetchOp".
+def prfop : Operand<i32> {
+ let PrintMethod = "printPrefetchOp";
+ let ParserMatchClass = PrefetchOperand;
+}
+
+// Prefetch with unsigned immediate offset. Reuses the load/store UI encoding
+// with the prfop operand occupying the $Rt slot and a uimm12s8 offset. It is
+// flagged as neither a load nor a store but as having side effects, and is
+// scheduled as WriteLD.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
+ : BaseLoadStoreUI<sz, V, opc,
+ (outs), (ins prfop:$Rt, GPR64sp:$Rn, uimm12s8:$offset),
+ asm, pat>,
+ Sched<[WriteLD]>;
+
+//---
+// Load literal
+//---
+
+// Load literal address: 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+// Encoding, decoding and printing are each delegated to a dedicated C++
+// method; parsing uses PCRelLabel19Operand.
+def am_ldrlit : Operand<OtherVT> {
+ let EncoderMethod = "getLoadLiteralOpValue";
+ let DecoderMethod = "DecodePCRelLabel19";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = PCRelLabel19Operand;
+}
+
+// Load (literal) instruction format with no selection pattern; flagged
+// mayLoad. Encoding: opc -> Inst{31-30}, V -> Inst{26}, 19-bit label ->
+// Inst{23-5}, Rt -> Inst{4-0}.
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm>
+ : I<(outs regtype:$Rt), (ins am_ldrlit:$label),
+ asm, "\t$Rt, $label", "", []>,
+ Sched<[WriteLD]> {
+ bits<5> Rt;
+ bits<19> label;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b011;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-5} = label;
+ let Inst{4-0} = Rt;
+}
+
+// Prefetch (literal) instruction format. Uses the same bit layout as the
+// literal load (opc -> Inst{31-30}, V -> Inst{26}, label -> Inst{23-5}) with
+// the prfop operand encoded in the Rt field, Inst{4-0}. It has no outputs and
+// is flagged as neither a load nor a store but as having side effects.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat>
+ : I<(outs), (ins prfop:$Rt, am_ldrlit:$label),
+ asm, "\t$Rt, $label", "", pat>,
+ Sched<[WriteLD]> {
+ bits<5> Rt;
+ bits<19> label;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b011;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-5} = label;
+ let Inst{4-0} = Rt;
+}
+
+//---
+// Load/store register offset
+//---
+
+// Register-offset addressing complex patterns. Each is an i64 pattern
+// producing four operands, matched by a C++ selector templated on the access
+// width in bits: SelectAddrModeXRO<N> for the ro_Xindexed<N> patterns and
+// SelectAddrModeWRO<N> for the ro_Windexed<N> patterns.
+def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>;
+def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>;
+def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
+def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
+def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
+
+def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
+def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
+def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
+def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>;
+def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>;
+
+// Assembler operand class for the extend part of a register-offset address.
+// Name, predicate method and diagnostic type are built from Reg and Width;
+// the render method defaults to "addMemExtendOperands" and may be overridden
+// by individual defs.
+class MemExtendOperand<string Reg, int Width> : AsmOperandClass {
+ let Name = "Mem" # Reg # "Extend" # Width;
+ let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">";
+ let RenderMethod = "addMemExtendOperands";
+ let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width;
+}
+
+// "W" extend operand classes, one per access width. Only the 8-bit variant
+// overrides the render method.
+def MemWExtend8Operand : MemExtendOperand<"W", 8> {
+ // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+ // the trivial shift.
+ let RenderMethod = "addMemExtend8Operands";
+}
+def MemWExtend16Operand : MemExtendOperand<"W", 16>;
+def MemWExtend32Operand : MemExtendOperand<"W", 32>;
+def MemWExtend64Operand : MemExtendOperand<"W", 64>;
+def MemWExtend128Operand : MemExtendOperand<"W", 128>;
+
+// "X" extend operand classes, one per access width. Only the 8-bit variant
+// overrides the render method.
+def MemXExtend8Operand : MemExtendOperand<"X", 8> {
+ // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+ // the trivial shift.
+ let RenderMethod = "addMemExtend8Operands";
+}
+def MemXExtend16Operand : MemExtendOperand<"X", 16>;
+def MemXExtend32Operand : MemExtendOperand<"X", 32>;
+def MemXExtend64Operand : MemExtendOperand<"X", 64>;
+def MemXExtend128Operand : MemExtendOperand<"X", 128>;
+
+// Instruction operand for the extend part of a register-offset address. It is
+// made of two i32 immediates ($signed, $doshift) and parsed via ParserClass.
+// Reg and Width only parameterize the print method template arguments.
+class ro_extend<AsmOperandClass ParserClass, string Reg, int Width>
+ : Operand<i32> {
+ let ParserMatchClass = ParserClass;
+ let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">";
+ let DecoderMethod = "DecodeMemExtend";
+ let EncoderMethod = "getMemExtendOpValue";
+ let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift);
+}
+
+// Extend operands used with a W index register, one per access width.
+def ro_Wextend8 : ro_extend<MemWExtend8Operand, "w", 8>;
+def ro_Wextend16 : ro_extend<MemWExtend16Operand, "w", 16>;
+def ro_Wextend32 : ro_extend<MemWExtend32Operand, "w", 32>;
+def ro_Wextend64 : ro_extend<MemWExtend64Operand, "w", 64>;
+def ro_Wextend128 : ro_extend<MemWExtend128Operand, "w", 128>;
+
+// Extend operands used with an X index register, one per access width.
+def ro_Xextend8 : ro_extend<MemXExtend8Operand, "x", 8>;
+def ro_Xextend16 : ro_extend<MemXExtend16Operand, "x", 16>;
+def ro_Xextend32 : ro_extend<MemXExtend32Operand, "x", 32>;
+def ro_Xextend64 : ro_extend<MemXExtend64Operand, "x", 64>;
+def ro_Xextend128 : ro_extend<MemXExtend128Operand, "x", 128>;
+
+// Bundles everything describing one register-offset addressing mode: the W and
+// X selection patterns together with the matching W and X extend operands.
+class ROAddrMode<ComplexPattern windex, ComplexPattern xindex,
+ Operand wextend, Operand xextend> {
+ // CodeGen-level pattern covering the entire addressing mode.
+ ComplexPattern Wpat = windex;
+ ComplexPattern Xpat = xindex;
+
+ // Asm-level Operand covering the valid "uxtw #3" style syntax.
+ Operand Wext = wextend;
+ Operand Xext = xextend;
+}
+
+// One ROAddrMode bundle per access width, tying together the patterns and
+// extend operands defined above.
+def ro8 : ROAddrMode<ro_Windexed8, ro_Xindexed8, ro_Wextend8, ro_Xextend8>;
+def ro16 : ROAddrMode<ro_Windexed16, ro_Xindexed16, ro_Wextend16, ro_Xextend16>;
+def ro32 : ROAddrMode<ro_Windexed32, ro_Xindexed32, ro_Wextend32, ro_Xextend32>;
+def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
+def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
+ ro_Xextend128>;
+
+// Base encoding for the register-offset loads/stores instantiated by Load8RO
+// and Store8RO: "<asm>\t$Rt, [$Rn, $Rm, $extend]".
+//  oops / iops - output and input operand dags, forwarded to I<> in that
+//                order (callers pass the (outs ...) dag first).
+//  regtype     - accepted for signature symmetry; not referenced in the body.
+// Inst{13} is intentionally not assigned here; the roW / roX defs set it.
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag oops, dag iops, list<dag> pat>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+// Alias accepting the extend-less "<asm>\t$Rt, [$Rn, $Rm]" syntax. It maps to
+// INST with a GPR64 index register and both extend sub-operands set to 0.
+class ROInstAlias<string asm, RegisterClass regtype, Instruction INST>
+ : InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
+ (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+
+// Register-offset load using the 8-bit addressing patterns. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10 in {
+ def roW : LoadStore8RO<sz, V, opc, regtype, asm,
+ (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend8:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore8RO<sz, V, opc, regtype, asm,
+ (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend8:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Register-offset store using the 8-bit addressing patterns. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10 in {
+ def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend8:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend8:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Base encoding for the register-offset loads/stores instantiated by Load16RO
+// and Store16RO: "<asm>\t$Rt, [$Rn, $Rm, $extend]".
+//  oops / iops - output and input operand dags, forwarded to I<> in that
+//                order (callers pass the (outs ...) dag first).
+//  regtype     - accepted for signature symmetry; not referenced in the body.
+// Inst{13} is intentionally not assigned here; the roW / roX defs set it.
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag oops, dag iops, list<dag> pat>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+// Register-offset load using the 16-bit addressing patterns. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10 in {
+ def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Register-offset store using the 16-bit addressing patterns. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10 in {
+ def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Base encoding for the register-offset loads/stores instantiated by Load32RO
+// and Store32RO: "<asm>\t$Rt, [$Rn, $Rm, $extend]".
+//  oops / iops - output and input operand dags, forwarded to I<> in that
+//                order (callers pass the (outs ...) dag first).
+//  regtype     - accepted for signature symmetry; not referenced in the body.
+// Inst{13} is intentionally not assigned here; the roW / roX defs set it.
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag oops, dag iops, list<dag> pat>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+// Register-offset load using the 32-bit addressing patterns. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10 in {
+ def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Register-offset store using the 32-bit addressing patterns. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10 in {
+ def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Base encoding for the register-offset loads/stores instantiated by Load64RO
+// and Store64RO: "<asm>\t$Rt, [$Rn, $Rm, $extend]".
+//  oops / iops - output and input operand dags, forwarded to I<> in that
+//                order (callers pass the (outs ...) dag first).
+//  regtype     - accepted for signature symmetry; not referenced in the body.
+// Inst{13} is intentionally not assigned here; the roW / roX defs set it.
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag oops, dag iops, list<dag> pat>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+// Register-offset load using the 64-bit addressing patterns. Both forms are
+// explicitly flagged mayLoad with no store and no side effects. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
+ def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Register-offset store using the 64-bit addressing patterns. Both forms are
+// explicitly flagged mayStore with no load and no side effects. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in {
+ def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Base encoding for the register-offset loads/stores instantiated by Load128RO
+// and Store128RO: "<asm>\t$Rt, [$Rn, $Rm, $extend]".
+//  oops / iops - output and input operand dags, forwarded to I<> in that
+//                order (callers pass the (outs ...) dag first).
+//  regtype     - accepted for signature symmetry; not referenced in the body.
+// Inst{13} is intentionally not assigned here; the roW / roX defs set it.
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag oops, dag iops, list<dag> pat>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+// Register-offset load using the 128-bit addressing patterns. Both forms are
+// explicitly flagged mayLoad with no store and no side effects. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
+ def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend128:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend128:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Register-offset store using the 128-bit addressing patterns. Both forms are
+// explicitly flagged mayStore with no load and no side effects. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus a ROInstAlias mapping the extend-less syntax onto roX.
+multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in {
+ def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend128:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend128:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+ } // AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+// Base encoding for register-offset prefetch: "<asm>\t$Rt, [$Rn, $Rm, $extend]".
+// Flagged as neither load nor store but with side effects. The bit layout
+// mirrors the LoadStore*RO classes; Inst{13} is left for the roW / roX defs in
+// PrefetchRO to assign.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BasePrefetchRO<bits<2> sz, bit V, bits<2> opc, dag outs, dag ins,
+ string asm, list<dag> pat>
+ : I<outs, ins, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat>,
+ Sched<[WriteLD]> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+// Register-offset prefetch. Emits:
+//   roW - GPR32 index register, Inst{13} = 0
+//   roX - GPR64 index register, Inst{13} = 1
+// plus an alias for the extend-less "[Rn, Rm]" syntax that maps onto roX with
+// both extend sub-operands set to 0. Both forms reuse the 64-bit addressing
+// patterns and extend operands.
+multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> {
+ def roW : BasePrefetchRO<sz, V, opc, (outs),
+ (ins prfop:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+ asm, [(AArch64Prefetch imm:$Rt,
+ (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend))]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : BasePrefetchRO<sz, V, opc, (outs),
+ (ins prfop:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+ asm, [(AArch64Prefetch imm:$Rt,
+ (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend))]> {
+ let Inst{13} = 0b1;
+ }
+
+ // Build the alias from the mnemonic passed in rather than a fixed string, so
+ // the alias always names the same instruction as roW / roX. For asm = "prfm"
+ // this produces exactly "prfm $Rt, [$Rn, $Rm]".
+ def : InstAlias<asm # " $Rt, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME # "roX") prfop:$Rt,
+ GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+}
+
+//---
+// Load/store unscaled immediate
+//---
+
+// Selection patterns for the unscaled-immediate addressing mode, one per
+// access width. Each yields 2 operands and is matched by the named
+// SelectAddrModeUnscaled<N> selector.
+def am_unscaled8 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>;
+def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>;
+def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>;
+def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>;
+def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>;
+
+// Base encoding for unscaled-immediate loads/stores:
+// "<asm>\t$Rt, [$Rn, $offset]". The 9-bit offset occupies Inst{20-12} and
+// Inst{11-10} is fixed to 0b00. Decoding is delegated to
+// DecodeSignedLdStInstruction.
+class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, list<dag> pattern>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+// Unscaled-immediate load: defines the "i" instruction (simm9 offset) and an
+// alias for the offset-less "<asm>\t$Rt, [$Rn]" syntax that supplies offset 0.
+multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pattern> {
+ let AddedComplexity = 1 in // try this before LoadUI
+ def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>,
+ Sched<[WriteLD]>;
+
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+// Unscaled-immediate store: defines the "i" instruction (simm9 offset) and an
+// alias for the offset-less "<asm>\t$Rt, [$Rn]" syntax that supplies offset 0.
+multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pattern> {
+ let AddedComplexity = 1 in // try this before StoreUI
+ def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, pattern>,
+ Sched<[WriteST]>;
+
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+// Unscaled-immediate prefetch: the "i" instruction takes a prefetch-operation
+// operand in place of a data register and is flagged as side-effecting rather
+// than as a load or store. The alias supplies offset 0 for "<asm>\t$Rt, [$Rn]".
+multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm,
+ list<dag> pat> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+ def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+ (ins prfop:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, pat>,
+ Sched<[WriteLD]>;
+
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store unscaled immediate, unprivileged
+//---
+
+// Base encoding for unprivileged unscaled-immediate loads/stores. The layout
+// matches BaseLoadStoreUnscale except that Inst{11-10} is 0b10, and no
+// selection pattern is attached (the pattern list is empty).
+class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+ dag oops, dag iops, string asm>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", []> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+// Unprivileged load: defines the "i" instruction (flagged mayLoad only) and an
+// alias for the offset-less "<asm>\t$Rt, [$Rn]" syntax that supplies offset 0.
+multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc,
+ RegisterClass regtype, string asm> {
+ let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
+ def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset), asm>,
+ Sched<[WriteLD]>;
+
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+// Unprivileged store: defines the "i" instruction (flagged mayStore only) and
+// an alias for the offset-less "<asm>\t$Rt, [$Rn]" syntax that supplies offset 0.
+multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+ RegisterClass regtype, string asm> {
+ let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+ def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm>,
+ Sched<[WriteST]>;
+
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store pre-indexed
+//---
+
+// Base encoding for pre-indexed loads/stores: "<asm>\t$Rt, [$Rn, $offset]!".
+// The 9-bit offset occupies Inst{20-12} and Inst{11-10} is fixed to 0b11.
+// cstr carries the operand constraint string supplied by the derived class.
+class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, string cstr, list<dag> pat>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]!", cstr, pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b11;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+// Pre-indexed load. Besides the loaded register it defines $wback, which is
+// tied to $Rn and marked earlyclobber. No selection pattern is attached.
+let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
+class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStorePreIdx<sz, V, opc,
+ (outs GPR64sp:$wback, regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset), asm,
+ "$Rn = $wback,@earlyclobber $wback", []>,
+ Sched<[WriteLD, WriteAdr]>;
+
+// Pre-indexed store. Its only output is $wback (tied to $Rn, earlyclobber),
+// which the attached pattern sets from storeop applied to the value, base and
+// offset.
+let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, SDPatternOperator storeop, ValueType Ty>
+ : BaseLoadStorePreIdx<sz, V, opc,
+ (outs GPR64sp:$wback),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, "$Rn = $wback,@earlyclobber $wback",
+ [(set GPR64sp:$wback,
+ (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+ Sched<[WriteAdr, WriteST]>;
+
+//---
+// Load/store post-indexed
+//---
+
+// Base encoding for post-indexed loads/stores: "<asm>\t$Rt, [$Rn], $offset".
+// The 9-bit offset occupies Inst{20-12} and Inst{11-10} is fixed to 0b01.
+// cstr carries the operand constraint string supplied by the derived class.
+class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, string cstr, list<dag> pat>
+ : I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0b0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+// Post-indexed load. Besides the loaded register it defines $wback, which is
+// tied to $Rn and marked earlyclobber. No selection pattern is attached.
+let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
+class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStorePostIdx<sz, V, opc,
+ (outs GPR64sp:$wback, regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset),
+ asm, "$Rn = $wback,@earlyclobber $wback", []>,
+ Sched<[WriteLD, WriteI]>;
+
+// Post-indexed store. Its only output is $wback (tied to $Rn, earlyclobber),
+// which the attached pattern sets from storeop applied to the value, base and
+// offset.
+let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, SDPatternOperator storeop, ValueType Ty>
+ : BaseLoadStorePostIdx<sz, V, opc,
+ (outs GPR64sp:$wback),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, "$Rn = $wback,@earlyclobber $wback",
+ [(set GPR64sp:$wback,
+ (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+ Sched<[WriteAdr, WriteST, ReadAdrBase]>;
+
+
+//---
+// Load/store pair
+//---
+
+// (indexed, offset)
+
+// Base encoding for load/store pair with immediate offset:
+// "<asm>\t$Rt, $Rt2, [$Rn, $offset]". L is placed in Inst{22}; the multiclasses
+// below pass 1 for loads and 0 for stores. The 7-bit offset occupies
+// Inst{21-15} and Inst{25-23} is fixed to 0b010.
+class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b010;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+// Load pair with immediate offset: defines the "i" instruction (L = 1) and an
+// alias for the offset-less "<asm>\t$Rt, $Rt2, [$Rn]" syntax that supplies
+// offset 0.
+multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm> {
+ let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+ def i : BaseLoadStorePairOffset<opc, V, 1,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp:$Rn, indextype:$offset), asm>,
+ Sched<[WriteLD, WriteLDHi]>;
+
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, 0)>;
+}
+
+
+// Store pair with immediate offset: defines the "i" instruction (L = 0) and an
+// alias for the offset-less "<asm>\t$Rt, $Rt2, [$Rn]" syntax that supplies
+// offset 0.
+multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm> {
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+ def i : BaseLoadStorePairOffset<opc, V, 0, (outs),
+ (ins regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, indextype:$offset),
+ asm>,
+ Sched<[WriteSTP]>;
+
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, 0)>;
+}
+
+// (pre-indexed)
+// Base encoding for pre-indexed load/store pair:
+// "<asm>\t$Rt, $Rt2, [$Rn, $offset]!". $wback is tied to $Rn and marked
+// earlyclobber via the constraint string. Inst{25-23} is fixed to 0b011.
+class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b011;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+// Pre-indexed load pair (L = 1): defines $wback in addition to both loaded
+// registers.
+let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
+class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm>
+ : BaseLoadStorePairPreIdx<opc, V, 1,
+ (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp:$Rn, indextype:$offset), asm>,
+ Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+// Pre-indexed store pair (L = 0): $wback is the only output.
+let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm>
+ : BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback),
+ (ins regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, indextype:$offset),
+ asm>,
+ Sched<[WriteAdr, WriteSTP]>;
+
+// (post-indexed)
+
+// Base encoding for post-indexed load/store pair:
+// "<asm>\t$Rt, $Rt2, [$Rn], $offset". $wback is tied to $Rn and marked
+// earlyclobber via the constraint string. Inst{25-23} is fixed to 0b001.
+class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b001;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+// Post-indexed load pair (L = 1): defines $wback in addition to both loaded
+// registers.
+let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
+class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand idxtype, string asm>
+ : BaseLoadStorePairPostIdx<opc, V, 1,
+ (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp:$Rn, idxtype:$offset), asm>,
+ Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+// Post-indexed store pair (L = 0): $wback is the only output.
+let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand idxtype, string asm>
+ : BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback),
+ (ins regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, idxtype:$offset),
+ asm>,
+ Sched<[WriteAdr, WriteSTP]>;
+
+// (no-allocate)
+
+// Base encoding for the no-allocate load/store pair:
+// "<asm>\t$Rt, $Rt2, [$Rn, $offset]". The layout matches
+// BaseLoadStorePairOffset except that Inst{25-23} is 0b000.
+class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b000;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+// No-allocate load pair: defines the "i" instruction (L = 1) and an alias for
+// the offset-less "<asm>\t$Rt, $Rt2, [$Rn]" syntax that supplies offset 0.
+multiclass LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm> {
+ let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+ def i : BaseLoadStorePairNoAlloc<opc, V, 1,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp:$Rn, indextype:$offset), asm>,
+ Sched<[WriteLD, WriteLDHi]>;
+
+
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, 0)>;
+}
+
+// No-allocate store pair: defines the "i" instruction (L = 0) and an alias for
+// the offset-less "<asm>\t$Rt, $Rt2, [$Rn]" syntax that supplies offset 0.
+multiclass StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm> {
+ let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in
+ def i : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
+ (ins regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, indextype:$offset),
+ asm>,
+ Sched<[WriteSTP]>;
+
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store exclusive
+//---
+
+// True exclusive operations write to and/or read from the system's exclusive
+// monitors, which as far as a compiler is concerned can be modelled as a
+// random shared memory address. Hence LoadExclusive mayStore.
+//
+// Since these instructions have the undefined register bits set to 1 in
+// their canonical form, we need a post encoder method to set those bits
+// to 1 when encoding these instructions. We do this using the
+// fixLoadStoreExclusive function. This function has template parameters:
+//
+// fixLoadStoreExclusive<int hasRs, int hasRt2>
+//
+// hasRs indicates that the instruction uses the Rs field, so we won't set
+// it to 1 (and the same for Rt2). We don't need template parameters for
+// the other register fields since Rt and Rn are always used.
+//
+// Common base for the exclusive / acquire / release formats. It sets only the
+// size and opcode-selector bits; the register fields are assigned by derived
+// classes. It defaults to side-effecting, mayLoad and mayStore; some derived
+// classes below narrow mayLoad / mayStore with their own `let`.
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ dag oops, dag iops, string asm, string operands>
+ : I<oops, iops, asm, operands, "", []> {
+ let Inst{31-30} = sz;
+ let Inst{29-24} = 0b001000;
+ let Inst{23} = o2;
+ let Inst{22} = L;
+ let Inst{21} = o1;
+ let Inst{15} = o0;
+
+ let DecoderMethod = "DecodeExclusiveLdStInstruction";
+}
+
+// Neither Rs nor Rt2 operands.
+// Inst{20-16} and Inst{14-10} are therefore hard-wired to all ones and marked
+// Unpredictable; the post-encoder is instantiated with <0,0>.
+class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ dag oops, dag iops, string asm, string operands>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
+ bits<5> Rt;
+ bits<5> Rn;
+ let Inst{20-16} = 0b11111;
+ let Unpredictable{20-16} = 0b11111;
+ let Inst{14-10} = 0b11111;
+ let Unpredictable{14-10} = 0b11111;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
+}
+
+// Simple load acquires don't set the exclusive monitor
+// so, unlike the base class default, this is flagged mayLoad only.
+let mayLoad = 1, mayStore = 0 in
+class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+ (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+ Sched<[WriteLD]>;
+
+// Exclusive load of a single register. It has no `let` of its own, so it keeps
+// the mayLoad / mayStore / hasSideEffects settings applied to
+// BaseLoadStoreExclusive.
+class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+ (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+ Sched<[WriteLD]>;
+
+// Exclusive load of a register pair. Rt2 is encoded in Inst{14-10}; the
+// post-encoder is instantiated with <0,1> (no Rs, has Rt2).
+class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp0:$Rn), asm,
+ "\t$Rt, $Rt2, [$Rn]">,
+ Sched<[WriteLD, WriteLDHi]> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
+}
+
+// Simple store release operations do not check the exclusive monitor.
+// Unlike the base class default, this is flagged mayStore only.
+let mayLoad = 0, mayStore = 1 in
+class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs),
+ (ins regtype:$Rt, GPR64sp0:$Rn),
+ asm, "\t$Rt, [$Rn]">,
+ Sched<[WriteST]>;
+
+// Exclusive store of a single register. The GPR32 output $Ws is encoded in
+// Inst{20-16} and marked earlyclobber. The post-encoder is instantiated with
+// <1,0> (has Rs, no Rt2).
+let mayLoad = 1, mayStore = 1 in
+class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws),
+ (ins regtype:$Rt, GPR64sp0:$Rn),
+ asm, "\t$Ws, $Rt, [$Rn]">,
+ Sched<[WriteSTX]> {
+ bits<5> Ws;
+ bits<5> Rt;
+ bits<5> Rn;
+ let Inst{20-16} = Ws;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let Constraints = "@earlyclobber $Ws";
+ let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
+}
+
+// Exclusive store of a register pair. Both the $Ws output (Inst{20-16}) and
+// Rt2 (Inst{14-10}) are encoded, and no PostEncoderMethod is set here.
+class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+ (outs GPR32:$Ws),
+ (ins regtype:$Rt, regtype:$Rt2, GPR64sp0:$Rn),
+ asm, "\t$Ws, $Rt, $Rt2, [$Rn]">,
+ Sched<[WriteSTX]> {
+ bits<5> Ws;
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ let Inst{20-16} = Ws;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let Constraints = "@earlyclobber $Ws";
+}
+
+//---
+// Exception generation
+//---
+
+// Exception-generating format: a single 16-bit immediate operand
+// (imm0_65535) encoded in Inst{20-5}, with op1 and ll selecting the specific
+// instruction. Flagged as side-effecting with no memory access.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
+ : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
+ Sched<[WriteSys]> {
+ bits<16> imm;
+ let Inst{31-24} = 0b11010100;
+ let Inst{23-21} = op1;
+ let Inst{20-5} = imm;
+ let Inst{4-2} = 0b000;
+ let Inst{1-0} = ll;
+}
+
+let Predicates = [HasFPARMv8] in {
+
+//---
+// Floating point to integer conversion
+//---
+
+// Base encoding for FP-to-integer conversion without a scale operand:
+// "<asm>\t$Rd, $Rn". Inst{21} is 1 and Inst{15-10} is zero. Inst{31} is not
+// assigned here; instantiating defs set it.
+class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterClass dstType,
+ string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn),
+ asm, "\t$Rd, $Rn", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{30-29} = 0b00;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = type;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Base format for scaled (fixed-point) floating point to integer
+// conversions: source register $Rn plus an immediate $scale operand of type
+// immType, destination $Rd. Flagged as having no memory access and no side
+// effects. Scheduled as WriteFCvt.
+// Inst{31} is not assigned here; the defs in FPToIntegerScaled set it.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterClass dstType,
+ Operand immType, string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+ asm, "\t$Rd, $Rn, $scale", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> scale;
+ let Inst{30-29} = 0b00;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = type;
+ // Bit 21 is 0 here and 1 in the unscaled format (BaseFPToIntegerUnscaled).
+ let Inst{21} = 0;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ // The 6-bit scale operand occupies bits 15-10.
+ let Inst{15-10} = scale;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the six unscaled FP-to-integer variants for one rmode/opcode
+// pair: {half, single, double} source x {32-bit, 64-bit} GPR result.
+// The 'type' argument is 0b11 for half, 0b00 for single and 0b01 for double;
+// Inst{31} is 0 for a 32-bit GPR result and 1 for a 64-bit one.
+// Each def's pattern selects OpN applied to the source register. The two
+// half-precision defs are additionally predicated on HasFullFP16.
+multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
+ SDPatternOperator OpN> {
+ // Unscaled half-precision to 32-bit
+ def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
+ [(set GPR32:$Rd, (OpN FPR16:$Rn))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ // Unscaled half-precision to 64-bit
+ def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
+ [(set GPR64:$Rd, (OpN FPR16:$Rn))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ // Unscaled single-precision to 32-bit
+ def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
+ [(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ }
+
+ // Unscaled single-precision to 64-bit
+ def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm,
+ [(set GPR64:$Rd, (OpN FPR32:$Rn))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+
+ // Unscaled double-precision to 32-bit
+ def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm,
+ [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ }
+
+ // Unscaled double-precision to 64-bit
+ def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm,
+ [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+}
+
+// Instantiates the six scaled (fixed-point) FP-to-integer variants for one
+// rmode/opcode pair: {half, single, double} source x {32-bit, 64-bit} GPR
+// result. Each pattern selects OpN applied to (fmul $Rn, $scale), where
+// $scale uses the fixedpoint_* operand matching the source/result widths.
+// The 32-bit-result defs additionally pin scale{5} to 1; the 64-bit-result
+// defs leave all six scale bits to the operand. The half-precision defs are
+// predicated on HasFullFP16.
+multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
+ SDPatternOperator OpN> {
+ // Scaled half-precision to 32-bit
+ def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
+ fixedpoint_f16_i32, asm,
+ [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn,
+ fixedpoint_f16_i32:$scale)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let scale{5} = 1;
+ let Predicates = [HasFullFP16];
+ }
+
+ // Scaled half-precision to 64-bit
+ def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
+ fixedpoint_f16_i64, asm,
+ [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn,
+ fixedpoint_f16_i64:$scale)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ // Scaled single-precision to 32-bit
+ def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
+ fixedpoint_f32_i32, asm,
+ [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn,
+ fixedpoint_f32_i32:$scale)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let scale{5} = 1;
+ }
+
+ // Scaled single-precision to 64-bit
+ def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64,
+ fixedpoint_f32_i64, asm,
+ [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn,
+ fixedpoint_f32_i64:$scale)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+
+ // Scaled double-precision to 32-bit
+ def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32,
+ fixedpoint_f64_i32, asm,
+ [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn,
+ fixedpoint_f64_i32:$scale)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let scale{5} = 1;
+ }
+
+ // Scaled double-precision to 64-bit
+ def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64,
+ fixedpoint_f64_i64, asm,
+ [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn,
+ fixedpoint_f64_i64:$scale)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+}
+
+//---
+// Integer to floating point conversion
+//---
+
+// Base format for scaled (fixed-point) integer to floating point
+// conversions: integer source $Rn plus an immediate $scale operand of type
+// immType, FP destination $Rd. Flagged as having no memory access and no
+// side effects. Scheduled as WriteFCvt.
+// Inst{31} and Inst{23-22} are not assigned here; the defs in IntegerToFP
+// set them.
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseIntegerToFP<bit isUnsigned,
+ RegisterClass srcType, RegisterClass dstType,
+ Operand immType, string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+ asm, "\t$Rd, $Rn, $scale", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> scale;
+ let Inst{30-24} = 0b0011110;
+ // Bit 21 (top bit of this field) is 0 here and 1 in the unscaled format.
+ let Inst{21-17} = 0b00001;
+ let Inst{16} = isUnsigned;
+ // The 6-bit scale operand occupies bits 15-10.
+ let Inst{15-10} = scale;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Base format for unscaled integer to floating point conversions: integer
+// source $Rn, FP destination $Rd, no immediate. The selection pattern is
+// built here: 'node' applied to the source register, producing a value of
+// type 'dvt' in $Rd. Scheduled as WriteFCvt.
+// Inst{31} and Inst{23-22} are not assigned here; the defs in IntegerToFP
+// set them.
+// NOTE(review): unlike BaseIntegerToFP, no mayLoad/mayStore/hasSideEffects
+// 'let' wraps this class in the visible source -- confirm this is intended.
+class BaseIntegerToFPUnscaled<bit isUnsigned,
+ RegisterClass srcType, RegisterClass dstType,
+ ValueType dvt, string asm, SDNode node>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn),
+ asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ // NOTE(review): 'scale' is declared but not referenced in this class (there
+ // is no $scale operand and bits 15-10 are hard-wired to zero below).
+ bits<6> scale;
+ let Inst{30-24} = 0b0011110;
+ // Bit 21 (top bit of this field) is 1 here and 0 in the scaled format.
+ let Inst{21-17} = 0b10001;
+ let Inst{16} = isUnsigned;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the integer-to-FP conversions for one signedness:
+// six unscaled defs (U*ri) and six scaled defs (S*ri), covering
+// {32-bit, 64-bit} GPR source x {half, single, double} FP result.
+// Inst{31} is 0 for a 32-bit GPR source and 1 for a 64-bit one;
+// Inst{23-22} is 0b11 for half, 0b00 for single and 0b01 for double.
+// Scaled patterns select (fdiv (node $Rn), $scale); the scaled defs with a
+// 32-bit GPR source additionally pin scale{5} to 1. All half-precision defs
+// are predicated on HasFullFP16.
+multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
+ // Unscaled
+ def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
+ }
+
+ def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ }
+
+ def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
+ }
+
+ def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ }
+
+ // Scaled
+ def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
+ [(set FPR16:$Rd,
+ (fdiv (node GPR32:$Rn),
+ fixedpoint_f16_i32:$scale))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let scale{5} = 1;
+ let Predicates = [HasFullFP16];
+ }
+
+ def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm,
+ [(set FPR32:$Rd,
+ (fdiv (node GPR32:$Rn),
+ fixedpoint_f32_i32:$scale))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
+ let scale{5} = 1;
+ }
+
+ def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint_f64_i32, asm,
+ [(set FPR64:$Rd,
+ (fdiv (node GPR32:$Rn),
+ fixedpoint_f64_i32:$scale))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ let scale{5} = 1;
+ }
+
+ def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
+ [(set FPR16:$Rd,
+ (fdiv (node GPR64:$Rn),
+ fixedpoint_f16_i64:$scale))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm,
+ [(set FPR32:$Rd,
+ (fdiv (node GPR64:$Rn),
+ fixedpoint_f32_i64:$scale))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
+ }
+
+ def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm,
+ [(set FPR64:$Rd,
+ (fdiv (node GPR64:$Rn),
+ fixedpoint_f64_i64:$scale))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ }
+}
+
+//---
+// Unscaled integer <-> floating point conversion (i.e. FMOV)
+//---
+
+// Base format for the register-to-register FMOV forms between a GPR and an
+// FPR: one source $Rn, one destination $Rd, and an intentionally empty
+// pattern list (see the comment on the pattern below). Flagged as having no
+// memory access and no side effects. Scheduled as WriteFCopy.
+// Inst{31} and Inst{23-22} are not assigned here; the defs in
+// UnscaledConversion set them.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterClass dstType,
+ string asm>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "",
+ // We use COPY_TO_REGCLASS for these bitconvert operations.
+ // copyPhysReg() expands the resultant COPY instructions after
+ // regalloc is done. This gives greater freedom for the allocator
+ // and related passes (coalescing, copy propagation, et al.) to
+ // be more effective.
+ [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>,
+ Sched<[WriteFCopy]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{30-24} = 0b0011110;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Base format for an FMOV that writes an indexed vector lane: the destination
+// $Rd is a RegisterOperand printed with the 'kind' suffix and the $idx
+// operand (type VectorIndex1); the source $Rn is a plain register. The asm
+// string supplies two '|'-separated syntax variants that differ in where the
+// 'kind' suffix is printed. No selection pattern. Scheduled as WriteFCopy.
+// Inst{31} and Inst{22} are not assigned here; the instantiating def sets
+// them.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterOperand dstType, string asm,
+ string kind>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+ "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>,
+ Sched<[WriteFCopy]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ // Eight fixed bits covering 30-23 (so bit 23 is 1 in this format).
+ let Inst{30-23} = 0b00111101;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ // $idx has no encoding field above; decoding is delegated to a custom
+ // decoder method.
+ let DecoderMethod = "DecodeFMOVLaneInstruction";
+}
+
+// Base format for an FMOV that reads an indexed vector lane: the source $Rn
+// is a RegisterOperand printed with the 'kind' suffix and the $idx operand
+// (type VectorIndex1); the destination $Rd is a plain register. The asm
+// string supplies two '|'-separated syntax variants that differ in where the
+// 'kind' suffix is printed. No selection pattern. Scheduled as WriteFCopy.
+// Inst{31} and Inst{22} are not assigned here; the instantiating def sets
+// them.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
+ RegisterOperand srcType, RegisterClass dstType, string asm,
+ string kind>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+ "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>,
+ Sched<[WriteFCopy]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ // Eight fixed bits covering 30-23 (so bit 23 is 1 in this format).
+ let Inst{30-23} = 0b00111101;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ // $idx has no encoding field above; decoding is delegated to a custom
+ // decoder method.
+ let DecoderMethod = "DecodeFMOVLaneInstruction";
+}
+
+
+// Instantiates the FMOV register forms. In each def name the first letter is
+// the source register kind and the second the destination (W/X = 32/64-bit
+// GPR, H/S/D = FPR16/FPR32/FPR64), matching the srcType/dstType arguments.
+// GPR-to-FPR defs use opcode 0b111 and FPR-to-GPR defs use opcode 0b110, all
+// with rmode 0b00. The two *Highr defs move between a 64-bit GPR and an
+// indexed ".d" lane of a V128 register, using rmode 0b01.
+// All defs involving FPR16 are predicated on HasFullFP16.
+multiclass UnscaledConversion<string asm> {
+ def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
+ }
+
+ def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ }
+
+ def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
+ }
+
+ def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ }
+
+ // 64-bit GPR to an indexed ".d" lane of a 128-bit vector register.
+ def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
+ asm, ".d"> {
+ let Inst{31} = 1;
+ let Inst{22} = 0;
+ }
+
+ // Indexed ".d" lane of a 128-bit vector register to a 64-bit GPR.
+ def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64,
+ asm, ".d"> {
+ let Inst{31} = 1;
+ let Inst{22} = 0;
+ }
+}
+
+//---
+// Floating point conversion
+//---
+
+// Base format for conversions between floating point precisions: one source
+// register $Rn of class srcType, one destination $Rd of class dstType.
+// Note the parameter order: dstType precedes srcType. All 32 encoding bits
+// are assigned in this class. Scheduled as WriteFCvt.
+class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
+ RegisterClass srcType, string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-24} = 0b00011110;
+ let Inst{23-22} = type;
+ let Inst{21-17} = 0b10001;
+ let Inst{16-15} = opcode;
+ let Inst{14-10} = 0b10000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the six conversions between half, single and double
+// precision. In each def name the first letter is the destination precision
+// and the second the source (H/S/D). As used below, 'type' is 0b11 for a
+// half source, 0b00 for a single source and 0b01 for a double source, and
+// 'opcode' is 0b11 for a half destination, 0b00 for single and 0b01 for
+// double. Narrowing defs select fpround; widening defs select fpextend.
+// No per-def predicates are set in this multiclass.
+multiclass FPConversion<string asm> {
+ // Double-precision to Half-precision
+ def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
+ [(set FPR16:$Rd, (fpround FPR64:$Rn))]>;
+
+ // Double-precision to Single-precision
+ def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
+ [(set FPR32:$Rd, (fpround FPR64:$Rn))]>;
+
+ // Half-precision to Double-precision
+ def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
+ [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>;
+
+ // Half-precision to Single-precision
+ def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
+ [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>;
+
+ // Single-precision to Double-precision
+ def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
+ [(set FPR64:$Rd, (fpextend FPR32:$Rn))]>;
+
+ // Single-precision to Half-precision
+ def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
+ [(set FPR16:$Rd, (fpround FPR32:$Rn))]>;
+}
+
+//---
+// Single operand floating point data processing
+//---
+
+// Base format for one-source floating point data processing: source $Rn and
+// destination $Rd share the same register class. The selection pattern is
+// built here: 'node' applied to $Rn, with both registers typed as 'vt'.
+// Flagged as having no memory access and no side effects. Scheduled as
+// WriteF.
+// Inst{23-22} is not assigned here; the defs in SingleOperandFPData set it.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
+ ValueType vt, string asm, SDPatternOperator node>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>,
+ Sched<[WriteF]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-24} = 0b00011110;
+ let Inst{21-19} = 0b100;
+ let Inst{18-15} = opcode;
+ let Inst{14-10} = 0b10000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the half (Hr), single (Sr) and double (Dr) variants of a
+// one-source FP operation. 'node' defaults to null_frag. Inst{23-22} carries
+// the size: 0b11 half, 0b00 single, 0b01 double. The half-precision def is
+// predicated on HasFullFP16.
+multiclass SingleOperandFPData<bits<4> opcode, string asm,
+ SDPatternOperator node = null_frag> {
+ def Hr : BaseSingleOperandFPData<opcode, FPR16, f16, asm, node> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
+ let Inst{23-22} = 0b00; // 32-bit size flag
+ }
+
+ def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
+ let Inst{23-22} = 0b01; // 64-bit size flag
+ }
+}
+
+//---
+// Two operand floating point data processing
+//---
+
+// Base format for two-source floating point data processing: sources $Rn and
+// $Rm and destination $Rd all share one register class. The selection
+// pattern is supplied by the instantiating def. Flagged as having no memory
+// access and no side effects. Scheduled as WriteF.
+// Inst{23-22} is not assigned here; the instantiating defs set it.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "", pat>,
+ Sched<[WriteF]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-24} = 0b00011110;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the half (Hrr), single (Srr) and double (Drr) variants of a
+// two-source FP operation whose pattern is 'node' applied to $Rn and $Rm.
+// 'node' defaults to null_frag. Inst{23-22} carries the size: 0b11 half,
+// 0b00 single, 0b01 double. The half-precision def is predicated on
+// HasFullFP16.
+multiclass TwoOperandFPData<bits<4> opcode, string asm,
+ SDPatternOperator node = null_frag> {
+ def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
+ [(set (f16 FPR16:$Rd),
+ (node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+ [(set (f32 FPR32:$Rd),
+ (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
+ let Inst{23-22} = 0b00; // 32-bit size flag
+ }
+
+ def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+ [(set (f64 FPR64:$Rd),
+ (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
+ let Inst{23-22} = 0b01; // 64-bit size flag
+ }
+}
+
+// Like TwoOperandFPData, but each pattern selects the negation of the
+// two-source operation: (fneg (node $Rn, $Rm)). 'node' is a required SDNode
+// here (no null_frag default). Inst{23-22} carries the size: 0b11 half,
+// 0b00 single, 0b01 double. The half-precision def is predicated on
+// HasFullFP16.
+multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
+ def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
+ [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+ [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
+ let Inst{23-22} = 0b00; // 32-bit size flag
+ }
+
+ def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+ [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
+ let Inst{23-22} = 0b01; // 64-bit size flag
+ }
+}
+
+
+//---
+// Three operand floating point data processing
+//---
+
+// Base format for three-source floating point data processing: sources $Rn,
+// $Rm and $Ra and destination $Rd all share one register class. The
+// selection pattern is supplied by the instantiating def. Scheduled as
+// WriteFMul.
+// Inst{23-22} is not assigned here; the defs in ThreeOperandFPData set it.
+// NOTE(review): no mayLoad/mayStore/hasSideEffects 'let' wraps this class in
+// the visible source, unlike the one- and two-operand base classes --
+// confirm this is intended.
+class BaseThreeOperandFPData<bit isNegated, bit isSub,
+ RegisterClass regtype, string asm, list<dag> pat>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
+ asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>,
+ Sched<[WriteFMul]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<5> Ra;
+ let Inst{31-24} = 0b00011111;
+ // The two variant-selecting bits: isNegated in bit 21, isSub in bit 15.
+ let Inst{21} = isNegated;
+ let Inst{20-16} = Rm;
+ let Inst{15} = isSub;
+ let Inst{14-10} = Ra;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the half (Hrrr), single (Srrr) and double (Drrr) variants of
+// a three-source FP operation whose pattern is 'node' applied to $Rn, $Rm
+// and $Ra in that order. Inst{23-22} carries the size: 0b11 half, 0b00
+// single, 0b01 double. The half-precision def is predicated on HasFullFP16.
+multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
+ SDPatternOperator node> {
+ def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
+ [(set FPR16:$Rd,
+ (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
+ [(set FPR32:$Rd,
+ (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
+ let Inst{23-22} = 0b00; // 32-bit size flag
+ }
+
+ def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
+ [(set FPR64:$Rd,
+ (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
+ let Inst{23-22} = 0b01; // 64-bit size flag
+ }
+}
+
+//---
+// Floating point data comparisons
+//---
+
+// Base format for comparing one FP register against zero: no outputs, a
+// single source $Rn, printed with a literal "#0.0" second operand. Flagged
+// as having no memory access and no side effects. Scheduled as WriteFCmp.
+// Inst{23-22} is not assigned here; the defs in FPComparison set it.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandFPComparison<bit signalAllNans,
+ RegisterClass regtype, string asm,
+ list<dag> pat>
+ : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
+ Sched<[WriteFCmp]> {
+ bits<5> Rn;
+ let Inst{31-24} = 0b00011110;
+ let Inst{21} = 1;
+
+ // Bits 20-16 (Rm in the two-operand form) are deliberately left
+ // unassigned; see the post-encoder note below.
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Rn;
+ let Inst{4} = signalAllNans;
+ // 0b1000 here versus 0b0000 in the two-operand comparison format.
+ let Inst{3-0} = 0b1000;
+
+ // Rm should be 0b00000 canonically, but we need to accept any value.
+ let PostEncoderMethod = "fixOneOperandFPComparison";
+}
+
+// Base format for comparing two FP registers: no outputs, sources $Rn and
+// $Rm of the same register class. Flagged as having no memory access and no
+// side effects. Scheduled as WriteFCmp.
+// Inst{23-22} is not assigned here; the defs in FPComparison set it.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
+ Sched<[WriteFCmp]> {
+ bits<5> Rm;
+ bits<5> Rn;
+ let Inst{31-24} = 0b00011110;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Rn;
+ let Inst{4} = signalAllNans;
+ // 0b0000 here versus 0b1000 in the compare-with-zero format.
+ let Inst{3-0} = 0b0000;
+}
+
+// Instantiates register-register (*rr) and register-versus-zero (*ri)
+// comparisons for half, single and double precision. Every def is declared
+// inside 'let Defs = [NZCV]' and lists NZCV as an implicit result of its
+// pattern. 'OpNode' defaults to null_frag. The *ri patterns match OpNode
+// against the fpimm0 operand. Inst{23-22} carries the size: 0b11 half, 0b00
+// single, 0b01 double. The half-precision defs are predicated on
+// HasFullFP16.
+multiclass FPComparison<bit signalAllNans, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ let Defs = [NZCV] in {
+ def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
+ [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
+ def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm,
+ [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
+ def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
+ [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> {
+ let Inst{23-22} = 0b00;
+ }
+
+ def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
+ [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> {
+ let Inst{23-22} = 0b00;
+ }
+
+ def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
+ [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> {
+ let Inst{23-22} = 0b01;
+ }
+
+ def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
+ [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> {
+ let Inst{23-22} = 0b01;
+ }
+ } // Defs = [NZCV]
+}
+
+//---
+// Floating point conditional comparisons
+//---
+
+// Base format for FP conditional comparisons: no register outputs; inputs
+// are two FP registers ($Rn, $Rm), a 4-bit $nzcv immediate and a condition
+// code $cond. The instruction both reads and writes NZCV (Uses and Defs).
+// Flagged as having no memory access and no side effects. Scheduled as
+// WriteFCmp.
+// Inst{23-22} is not assigned here; the defs in FPCondComparison set it.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype,
+ string mnemonic, list<dag> pat>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+ mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>,
+ Sched<[WriteFCmp]> {
+ let Uses = [NZCV];
+ let Defs = [NZCV];
+
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> nzcv;
+ bits<4> cond;
+
+ let Inst{31-24} = 0b00011110;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ // Condition code in bits 15-12; the nzcv immediate in bits 3-0.
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = Rn;
+ let Inst{4} = signalAllNans;
+ let Inst{3-0} = nzcv;
+}
+
+// Instantiates the half (Hrr), single (Srr) and double (Drr) FP conditional
+// comparisons. 'OpNode' defaults to null_frag. The single and double defs
+// select OpNode over ($Rn, $Rm, $nzcv, $cond, NZCV) and set NZCV; the half
+// def has an empty pattern list (so OpNode is unused for it) and is
+// predicated on HasFullFP16. Inst{23-22} carries the size: 0b11 half, 0b00
+// single, 0b01 double.
+multiclass FPCondComparison<bit signalAllNans, string mnemonic,
+ SDPatternOperator OpNode = null_frag> {
+ def Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic, []> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
+ def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic,
+ [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]> {
+ let Inst{23-22} = 0b00;
+ }
+
+ def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic,
+ [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]> {
+ let Inst{23-22} = 0b01;
+ }
+}
+
+//---
+// Floating point conditional select
+//---
+
+// Base format for FP conditional select: destination $Rd, two FP sources
+// ($Rn, $Rm) and a condition code $cond, all registers in one class. The
+// selection pattern is built here: AArch64csel over ($Rn, $Rm, $cond, NZCV)
+// with $Rn typed as 'vt'. Scheduled as WriteF.
+// Inst{23-22} and the NZCV use are not set here; the defs in FPCondSelect
+// provide them.
+// NOTE(review): no mayLoad/mayStore/hasSideEffects 'let' wraps this class in
+// the visible source -- confirm this is intended.
+class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+ asm, "\t$Rd, $Rn, $Rm, $cond", "",
+ [(set regtype:$Rd,
+ (AArch64csel (vt regtype:$Rn), regtype:$Rm,
+ (i32 imm:$cond), NZCV))]>,
+ Sched<[WriteF]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> cond;
+
+ let Inst{31-24} = 0b00011110;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b11;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the half (Hrrr), single (Srrr) and double (Drrr) FP
+// conditional selects. Every def is declared inside 'let Uses = [NZCV]'.
+// Inst{23-22} carries the size: 0b11 half, 0b00 single, 0b01 double. The
+// half-precision def is predicated on HasFullFP16.
+multiclass FPCondSelect<string asm> {
+ let Uses = [NZCV] in {
+ def Hrrr : BaseFPCondSelect<FPR16, f16, asm> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
+ def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
+ let Inst{23-22} = 0b00;
+ }
+
+ def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
+ let Inst{23-22} = 0b01;
+ }
+ } // Uses = [NZCV]
+}
+
+//---
+// Floating point move immediate
+//---
+
+// Base format for moving an FP immediate into a register: destination $Rd
+// and a single immediate operand $imm of type fpimmtype, encoded in 8 bits.
+// The selection pattern sets $Rd directly from the immediate operand.
+// Scheduled as WriteFImm.
+// Inst{23-22} is not assigned here; the defs in FPMoveImmediate set it.
+class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
+ : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "",
+ [(set regtype:$Rd, fpimmtype:$imm)]>,
+ Sched<[WriteFImm]> {
+ bits<5> Rd;
+ bits<8> imm;
+ let Inst{31-24} = 0b00011110;
+ let Inst{21} = 1;
+ // The 8-bit immediate occupies bits 20-13; bits 12-5 are fixed.
+ let Inst{20-13} = imm;
+ let Inst{12-5} = 0b10000000;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates the half (Hi), single (Si) and double (Di) FP move-immediate
+// variants, pairing each register class with its fpimm16/fpimm32/fpimm64
+// operand. Inst{23-22} carries the size: 0b11 half, 0b00 single, 0b01
+// double. The half-precision def is predicated on HasFullFP16.
+multiclass FPMoveImmediate<string asm> {
+ def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
+ def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
+ let Inst{23-22} = 0b00;
+ }
+
+ def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
+ let Inst{23-22} = 0b01;
+ }
+}
+} // end of 'let Predicates = [HasFPARMv8]'
+
+//----------------------------------------------------------------------------
+// AdvSIMD
+//----------------------------------------------------------------------------
+
+let Predicates = [HasNEON] in {
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register vector instructions
+//----------------------------------------------------------------------------
+
+// Base format for AdvSIMD operations on three vector registers of the same
+// type: destination $Rd, sources $Rn and $Rm, all of class regtype. 'kind'
+// is the arrangement suffix (e.g. ".8b") spliced into the asm string.
+// Flagged as having no memory access and no side effects. Scheduled as
+// WriteV. All 32 encoding bits are assigned in this class.
+// NOTE(review): the asm string ends with "|}", i.e. an extra empty variant
+// after the second one, whereas BaseSIMDThreeSameVectorTied ends with "}" --
+// confirm whether the trailing '|' is intentional.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string kind,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+ "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ // 'size' is a 3-bit field spanning bits 23-21.
+ let Inst{23-21} = size;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Tied variant of BaseSIMDThreeSameVector: the output is named $dst and the
+// input list gains $Rd, with the constraint "$Rd = $dst" tying the two to
+// the same register. The encoding is otherwise identical and still uses the
+// Rd field for bits 4-0. Flagged as having no memory access and no side
+// effects. Scheduled as WriteV.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string kind,
+ list<dag> pattern>
+ : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+ "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ // 'size' is a 3-bit field spanning bits 23-21.
+ let Inst{23-21} = size;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// All operand sizes distinguished in the encoding.
+//
+// Instantiates seven integer arrangements: 8b/16b, 4h/8h, 2s/4s and 2d.
+// Q is 0 for the V64 forms and 1 for the V128 forms; the 3-bit size is
+// 0b001 (B), 0b011 (H), 0b101 (S) or 0b111 (D). There is no 64-bit-vector
+// D-element form. Each pattern is OpNode applied to $Rn and $Rm with all
+// three registers typed to the arrangement's vector type.
+multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+ def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128,
+ asm, ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
+// As above, but D sized elements unsupported.
+//
+// Instantiates the six B/H/S arrangements (8b/16b, 4h/8h, 2s/4s) with the
+// same Q and size values as SIMDThreeSameVector. In these patterns the
+// result type annotation wraps the OpNode expression rather than $Rd.
+multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
+ asm, ".8b",
+ [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
+ asm, ".16b",
+ [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
+ asm, ".4h",
+ [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
+ asm, ".8h",
+ [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
+ asm, ".2s",
+ [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
+ asm, ".4s",
+ [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
+}
+
+// Tied counterpart of SIMDThreeSameVectorBHS: the same six B/H/S
+// arrangements, built on BaseSIMDThreeSameVectorTied. Each pattern is a
+// three-input OpNode over ($Rd, $Rn, $Rm) whose result is written to $dst.
+multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+ def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// As above, but only B sized elements supported.
+//
+// Instantiates just the 8b (V64, Q = 0) and 16b (V128, Q = 1) arrangements
+// on the untied base class, both with size 0b001.
+multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$Rd),
+ (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+}
+
+// As above, but only floating point elements supported.
+//
+// Instantiates 4h/8h, 2s/4s and 2d floating point arrangements. The 3-bit
+// size is the S bit followed by two fixed bits (0b10 for half, 0b01 for
+// single, 0b11 for double), and the 5-bit opcode is a two-bit prefix
+// (0b00 for half, 0b11 for single/double) followed by the 3-bit 'opc'.
+// The half-precision defs require both HasNEON and HasFullFP16.
+multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
+ string asm, SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+ asm, ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+ def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+ asm, ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
+ asm, ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
+ asm, ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
+ asm, ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+// Floating point vector comparisons: same arrangements and encodings as
+// SIMDThreeSameVectorFP, but each pattern's result is typed as the integer
+// vector with matching lane count and width (v4i16, v8i16, v2i32, v4i32,
+// v2i64) while the sources stay floating point.
+// The half-precision defs require both HasNEON and HasFullFP16.
+multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
+ string asm,
+ SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+ def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
+ asm, ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+// Tied counterpart of SIMDThreeSameVectorFP: the same floating point
+// arrangements and encodings, built on BaseSIMDThreeSameVectorTied. Each
+// pattern is a three-input OpNode over ($Rd, $Rn, $Rm) whose result is
+// written to $dst.
+// The half-precision defs require both HasNEON and HasFullFP16.
+multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
+ string asm, SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64,
+ asm, ".4h",
+ [(set (v4f16 V64:$dst),
+ (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+ def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128,
+ asm, ".8h",
+ [(set (v8f16 V128:$dst),
+ (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64,
+ asm, ".2s",
+ [(set (v2f32 V64:$dst),
+ (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+ def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128,
+ asm, ".4s",
+ [(set (v4f32 V128:$dst),
+ (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+ def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128,
+ asm, ".2d",
+ [(set (v2f64 V128:$dst),
+ (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+// As above, but D and B sized elements unsupported.
+//
+// Instantiates the four integer H/S arrangements (4h/8h, 2s/4s) on the
+// untied base class, with size 0b011 for H and 0b101 for S.
+multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// Logical three vector ops share opcode bits, and only use B sized elements.
+// Only the .8b (V64) and .16b (V128) instructions are defined, with the fixed
+// opcode 0b00011 and the 2-bit 'size' argument placed above a constant 1 bit.
+// OpNode defaults to null_frag.  The anonymous Pat records below additionally
+// select the same two instructions for the other integer vector types of the
+// same register width.
+multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
+
+ // 64-bit results of other element types reuse the .8b instruction.
+ def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+ def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+ def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+
+ // 128-bit results of other element types reuse the .16b instruction.
+ def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+ def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+ def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+}
+
+// Tied (destination-is-also-source) form of the logical three vector ops.
+// Only .8b and .16b instructions are defined; their patterns set $dst to
+// OpNode applied to ($Rd, $Rn, $Rm).  The anonymous Pat records map the
+// remaining integer vector types of each register width onto those two
+// instructions, passing the operands through in ($LHS, $MHS, $RHS) order.
+multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
+ string asm, SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]>;
+
+ // 64-bit vectors of H, S and D elements select the .8b instruction.
+ def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+ (v4i16 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+ (v2i32 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+ (v1i64 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+ // 128-bit vectors of H, S and D elements select the .16b instruction.
+ def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+ (v8i16 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+ (v4i32 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+ (v2i64 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register vector instructions.
+//----------------------------------------------------------------------------
+
+// Base class for two-register vector instructions where source and
+// destination use the same register operand class.
+//   Q, U, size, opcode, size2 - encoding fields (see the Inst assignments).
+//   regtype                   - register operand class for both $Rd and $Rn.
+//   asm, dstkind, srckind     - mnemonic and arrangement suffixes; the asm
+//                               string offers two syntaxes separated by '|'.
+//   pattern                   - selection pattern list (may be empty).
+// Marked as neither loading, storing, nor having side effects.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ bits<2> size2, RegisterOperand regtype, string asm,
+ string dstkind, string srckind, list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+ "{\t$Rd" # dstkind # ", $Rn" # srckind #
+ "|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ // Fixed bits identify the instruction group; Q/U/size/size2/opcode vary.
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Tied variant of BaseSIMDTwoSameVector: $Rd is an extra input operand that
+// is constrained to the output via "$Rd = $dst".  The encoding is identical
+// to the untied class; only the operand lists and constraint differ.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ bits<2> size2, RegisterOperand regtype,
+ string asm, string dstkind, string srckind,
+ list<dag> pattern>
+ : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
+ "{\t$Rd" # dstkind # ", $Rn" # srckind #
+ "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ // The register number encoded here is the tied $Rd/$dst operand.
+ let Inst{4-0} = Rd;
+}
+
+// Supports B, H, and S element sizes.
+// Two-register integer vector instructions for the .8b/.16b/.4h/.8h/.2s/.4s
+// arrangements.  Source and result have the same vector type; the size field
+// is 0b00/0b01/0b10 for B/H/S and size2 is always 0b00.
+multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+// Base class for the lengthening shift-left whose shift amount is a fixed
+// string ('amount') printed as an immediate in the assembly text rather than
+// an encoded operand.  The result is always a V128 register; the source
+// register class is 'regtype'.  No selection pattern is attached here.
+class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
+ RegisterOperand regtype, string asm, string dstkind,
+ string srckind, string amount>
+ : I<(outs V128:$Rd), (ins regtype:$Rn), asm,
+ "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
+ "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ // Only Q, size and the two register numbers vary; all other bits are fixed.
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-24} = 0b101110;
+ let Inst{23-22} = size;
+ let Inst{21-10} = 0b100001001110;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Instantiates "shll" (V64 source, Q = 0) and "shll2" (V128 source, Q = 1)
+// for B, H and S source elements.  The printed shift amount equals the source
+// element width in bits (8, 16 or 32) and the destination arrangement has
+// elements twice as wide.  All definitions are marked side-effect free.
+multiclass SIMDVectorLShiftLongBySizeBHS {
+ let hasSideEffects = 0 in {
+ def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
+ "shll", ".8h", ".8b", "8">;
+ def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
+ "shll2", ".8h", ".16b", "8">;
+ def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64,
+ "shll", ".4s", ".4h", "16">;
+ def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128,
+ "shll2", ".4s", ".8h", "16">;
+ def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64,
+ "shll", ".2d", ".2s", "32">;
+ def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128,
+ "shll2", ".2d", ".4s", "32">;
+ }
+}
+
+// Supports all element sizes.
+// Two-register instructions whose result has half as many elements, each
+// twice as wide, in a register of the same width as the source (for example
+// .8b -> .4h, .4s -> .2d).  Definition names are <source type>_<result type>.
+multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
+ asm, ".4h", ".8b",
+ [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
+ asm, ".8h", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+ def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
+ asm, ".2s", ".4h",
+ [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
+ asm, ".4s", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
+ asm, ".1d", ".2s",
+ [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
+ asm, ".2d", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+// Tied counterpart of SIMDLongTwoVector: the wide destination is also read.
+// Each pattern sets $dst to OpNode applied to the wide $Rd value and the
+// narrow-element $Rn source.  Encodings and arrangement suffixes match the
+// untied multiclass.
+multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
+ asm, ".4h", ".8b",
+ [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
+ (v8i8 V64:$Rn)))]>;
+ def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
+ asm, ".8h", ".16b",
+ [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
+ (v16i8 V128:$Rn)))]>;
+ def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
+ asm, ".2s", ".4h",
+ [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
+ (v4i16 V64:$Rn)))]>;
+ def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
+ asm, ".4s", ".8h",
+ [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+ (v8i16 V128:$Rn)))]>;
+ def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
+ asm, ".1d", ".2s",
+ [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
+ (v2i32 V64:$Rn)))]>;
+ def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
+ asm, ".2d", ".4s",
+ [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
+ (v4i32 V128:$Rn)))]>;
+}
+
+// Supports all element sizes, except 1xD.
+// Tied two-register integer vector instructions: every pattern sets $dst to
+// OpNode applied to ($Rd, $Rn), all of the same vector type.  The only
+// D-element arrangement provided is the 128-bit .2d form.
+multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+ def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
+ def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
+ def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+ def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
+}
+
+// Untied two-register integer vector instructions for B, H, S and the
+// 128-bit D (.2d) arrangement; there is no 1xD variant.  Source and result
+// share the same vector type.  OpNode defaults to null_frag.
+multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+ def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+// Supports only B element sizes.
+// Unlike the other two-vector multiclasses the 2-bit size field is supplied
+// by the instantiation through the 'size' parameter instead of being derived
+// from the element type.
+multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+}
+
+// Supports only B and H element sizes.
+// Note that the source operand in these patterns carries no explicit vector
+// type (only the register class); just the result type is spelled out.
+multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
+}
+
+// Supports S and D element sizes, plus H element sizes when HasFullFP16 is
+// available; uses high bit of the size field as an extra opcode bit.
+// The low bit of the size field is 0 for the S variants and 1 for the D
+// variant.  The H variants use size {S,1} together with size2 = 0b11, whereas
+// the S and D variants use size2 = 0b00.
+multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
+ def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+// Supports only S element size.
+// Integer v2i32/v4i32 two-register instructions; the S parameter supplies the
+// high bit of the size field and the low bit is fixed at 0.
+multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+
+// Two-register instructions taking a floating-point vector source and
+// producing an integer vector with the same element count and width
+// (v4f16 -> v4i16, v2f64 -> v2i64, ...).  Definitions are named after the
+// source FP type.  The H variants require HasNEON and HasFullFP16 and use
+// size2 = 0b11; the S and D variants use size2 = 0b00.
+multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
+ def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+// Two-register instructions taking an integer vector source and producing a
+// floating-point vector with the same element count and width
+// (v4i16 -> v4f16, v2i64 -> v2f64, ...).  Definitions are named after the
+// result FP type.  The H variants require HasNEON and HasFullFP16 and use
+// size2 = 0b11; the S and D variants use size2 = 0b00.
+multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+// Base class for two-register vector instructions whose input and output
+// register operand classes may differ ('inreg' vs. 'outreg').  Bits 21-17
+// are fixed at 0b10000; Q, U, size and opcode come from the parameters.
+// NOTE(review): unlike BaseSIMDTwoSameVector this class is not wrapped in a
+// "let mayLoad = 0, mayStore = 0, hasSideEffects = 0" block - confirm that
+// this is intended.
+class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand inreg, RegisterOperand outreg,
+ string asm, string outkind, string inkind,
+ list<dag> pattern>
+ : I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind #
+ "|" # outkind # "\t$Rd, $Rn}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Tied variant of BaseSIMDMixedTwoVector: $Rd (of class 'outreg') is an
+// additional input constrained to the output via "$Rd = $dst".  The encoding
+// is the same as the untied class.
+class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand inreg, RegisterOperand outreg,
+ string asm, string outkind, string inkind,
+ list<dag> pattern>
+ : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind #
+ "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Narrowing two-register instructions: a V128 source produces elements half
+// as wide.  The Q = 0 variants write a V64 result and carry a direct OpNode
+// pattern.  The Q = 1 variants (mnemonic asm#"2") use the tied base class
+// with an empty pattern list; they are selected by the separate Pat records
+// at the end, which match a concat_vectors of an existing 64-bit value with
+// the narrowed result.
+multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64,
+ asm, ".8b", ".8h",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128,
+ asm#"2", ".16b", ".8h", []>;
+ def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64,
+ asm, ".4h", ".4s",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+ def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128,
+ asm#"2", ".8h", ".4s", []>;
+ def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64,
+ asm, ".2s", ".2d",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+ def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128,
+ asm#"2", ".4s", ".2d", []>;
+
+ // The existing 64-bit value is placed in the dsub subregister of an
+ // otherwise undefined 128-bit register and passed as the tied operand.
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v16i8")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+ def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v8i16")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+ def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v4i32")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+// Base class for vector compare-against-zero instructions.  The assembly
+// text prints a literal '#' followed by the 'zero' string as the final
+// operand; it is not an encoded operand.  The pattern is built in the class:
+// $Rd (of type dty) is set to OpNode applied to $Rn (of type sty).
+//   size, size2 - encoding fields at bits 23-22 and 20-19 respectively.
+//   kind        - arrangement suffix shared by source and destination.
+class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
+ bits<5> opcode, RegisterOperand regtype, string asm,
+ string kind, string zero, ValueType dty,
+ ValueType sty, SDNode OpNode>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
+ "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
+ [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Comparisons support all element sizes, except 1xD.
+// Integer compare-against-zero instructions: source and result share the
+// same integer vector type and the zero operand is printed as "#0".
+// Definition names carry an "rz" suffix.
+multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
+ SDNode OpNode> {
+ def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64,
+ asm, ".8b", "0",
+ v8i8, v8i8, OpNode>;
+ def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128,
+ asm, ".16b", "0",
+ v16i8, v16i8, OpNode>;
+ def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64,
+ asm, ".4h", "0",
+ v4i16, v4i16, OpNode>;
+ def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128,
+ asm, ".8h", "0",
+ v8i16, v8i16, OpNode>;
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64,
+ asm, ".2s", "0",
+ v2i32, v2i32, OpNode>;
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128,
+ asm, ".4s", "0",
+ v4i32, v4i32, OpNode>;
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128,
+ asm, ".2d", "0",
+ v2i64, v2i64, OpNode>;
+}
+
+// FP Comparisons support only S and D element sizes (and H for v8.2a).
+// The source is a floating-point vector and the result is the integer vector
+// with the same element count and width; definitions are named after the
+// integer result type with an "rz" suffix.  The zero operand is printed as
+// "#0.0".  The InstAlias records additionally accept "#0" in both assembly
+// syntaxes (arrangement on the registers, or on the mnemonic).
+// NOTE(review): the trailing 0 passed to each InstAlias presumably marks the
+// alias as parse-only - confirm against the InstAlias definition.
+multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
+ string asm, SDNode OpNode> {
+
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64,
+ asm, ".4h", "0.0",
+ v4i16, v4f16, OpNode>;
+ def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128,
+ asm, ".8h", "0.0",
+ v8i16, v8f16, OpNode>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64,
+ asm, ".2s", "0.0",
+ v2i32, v2f32, OpNode>;
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128,
+ asm, ".4s", "0.0",
+ v4i32, v4f32, OpNode>;
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128,
+ asm, ".2d", "0.0",
+ v2i64, v2f64, OpNode>;
+
+ // "#0" aliases with the arrangement suffix attached to each register.
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0",
+ (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0",
+ (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
+ }
+ def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0",
+ (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0",
+ (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+ def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0",
+ (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+ // "#0" aliases with the arrangement suffix attached to the mnemonic.
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : InstAlias<asm # ".4h\t$Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # ".8h\t$Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
+ }
+ def : InstAlias<asm # ".2s\t$Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # ".4s\t$Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+ def : InstAlias<asm # ".2d\t$Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+}
+
+// Base class for two-register FP conversion instructions whose input and
+// output register classes may differ.  Unlike the other base classes in this
+// section the asm string is a single syntax built with !strconcat, with the
+// arrangement suffixes VdTy/VnTy attached to the registers.
+// Marked as neither loading, storing, nor having side effects.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand outtype, RegisterOperand intype,
+ string asm, string VdTy, string VnTy,
+ list<dag> pattern>
+ : I<(outs outtype:$Rd), (ins intype:$Rn), asm,
+ !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Tied variant of BaseSIMDFPCvtTwoVector: $Rd (of class 'outtype') is an
+// additional input constrained to the output via "$Rd = $dst".  The encoding
+// is the same as the untied class.
+// NOTE(review): this class is not covered by the "let mayLoad = 0, ..." that
+// precedes BaseSIMDFPCvtTwoVector - confirm that this is intended.
+class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand outtype, RegisterOperand intype,
+ string asm, string VdTy, string VnTy,
+ list<dag> pattern>
+ : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
+ !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// FP widening conversions (.4h/.8h -> .4s and .2s/.4s -> .2d).  The result is
+// always a V128 register.  The Q = 0 variants read a V64 source; the Q = 1
+// variants use mnemonic asm#"2" and read a V128 source.  No selection
+// patterns are attached here.
+multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> {
+ def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64,
+ asm, ".4s", ".4h", []>;
+ def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128,
+ asm#"2", ".4s", ".8h", []>;
+ def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64,
+ asm, ".2d", ".2s", []>;
+ def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128,
+ asm#"2", ".2d", ".4s", []>;
+}
+
+// FP narrowing conversions (.4s -> .4h/.8h and .2d -> .2s/.4s).  The source
+// is always a V128 register.  The Q = 0 variants write a V64 result; the
+// Q = 1 variants use mnemonic asm#"2" and the tied base class, so the
+// destination register is also an input.  No selection patterns are attached
+// here.
+multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
+ def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128,
+ asm, ".4h", ".4s", []>;
+ def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128,
+ asm#"2", ".8h", ".4s", []>;
+ def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+ asm, ".2s", ".2d", []>;
+ def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+ asm#"2", ".4s", ".2d", []>;
+}
+
+// FP narrowing conversion from .2d to .2s driven by an intrinsic.  The v2f32
+// variant carries a direct pattern.  The v4f32 variant (mnemonic asm#"2") is
+// tied and has no attached pattern; it is selected by the Pat record below,
+// which matches a concat_vectors of an existing v2f32 value with the
+// converted result.
+multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
+ Intrinsic OpNode> {
+ def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+ asm, ".2s", ".2d",
+ [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+ def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+ asm#"2", ".4s", ".2d", []>;
+
+ // The existing 64-bit value is inserted into dsub of an undefined 128-bit
+ // register and passed as the tied operand.
+ def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v4f32")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register different-size vector instructions.
+//----------------------------------------------------------------------------
+
+// Base class for three-register vector instructions whose output and two
+// inputs may each use a different register operand class.
+//   size - 3-bit value split across the encoding: bit 0 goes to Inst{30} and
+//          bits 2-1 go to Inst{23-22}.
+//   outkind/inkind1/inkind2 - arrangement suffixes for $Rd, $Rn and $Rm.
+// Marked as neither loading, storing, nor having side effects.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
+ RegisterOperand outtype, RegisterOperand intype1,
+ RegisterOperand intype2, string asm,
+ string outkind, string inkind1, string inkind2,
+ list<dag> pattern>
+ : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+ "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = size{0};
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size{2-1};
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Tied variant of BaseSIMDDifferentThreeVector: $Rd (of class 'outtype') is
+// an additional input constrained to the output via "$Rd = $dst".  The
+// encoding, including the split of 'size' across Inst{30} and Inst{23-22},
+// is the same as the untied class.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
+ RegisterOperand outtype, RegisterOperand intype1,
+ RegisterOperand intype2, string asm,
+ string outkind, string inkind1, string inkind2,
+ list<dag> pattern>
+ : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+ "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = size{0};
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size{2-1};
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// FIXME: TableGen doesn't know how to deal with expanded types that also
+// change the element count (in this case, placing the results in
+// the high elements of the result register rather than the low
+// elements). Until that's fixed, we can't code-gen those.
+//
+// Narrowing three-register instructions: two V128 sources produce elements
+// half as wide.  Variants with an even 'size' value write a V64 result and
+// carry a direct IntOp pattern.  Variants with an odd 'size' value use
+// mnemonic asm#"2" and the tied base class with an empty pattern list; they
+// are selected through the Pat records at the end.  Definition names are
+// <source type>_<result type>.
+multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm,
+ Intrinsic IntOp> {
+ def v8i16_v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V64, V128, V128,
+ asm, ".8b", ".8h", ".8h",
+ [(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".16b", ".8h", ".8h",
+ []>;
+ def v4i32_v4i16 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V64, V128, V128,
+ asm, ".4h", ".4s", ".4s",
+ [(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+ def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".4s", ".4s",
+ []>;
+ def v2i64_v2i32 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V64, V128, V128,
+ asm, ".2s", ".2d", ".2d",
+ [(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+ def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".2d", ".2d",
+ []>;
+
+
+ // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in
+ // a version attached to an instruction.
+ // Each matches concat_vectors(existing 64-bit value, narrowed result) and
+ // passes the existing value in dsub of an undefined 128-bit register.
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
+ (v8i16 V128:$Rm))),
+ (!cast<Instruction>(NAME # "v8i16_v16i8")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+ def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm))),
+ (!cast<Instruction>(NAME # "v4i32_v8i16")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+ def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
+ (v2i64 V128:$Rm))),
+ (!cast<Instruction>(NAME # "v2i64_v4i32")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+}
+
+// Lengthening three-register instructions defined only for B and D source
+// elements.  The B variants produce a .8h result; the D variants produce a
+// .1q result and are only available with HasCrypto.  Only the v8i8 variant
+// has an attached pattern; the v16i8 ("2") variant is selected by the Pat
+// record at the end, which matches IntOp applied to the high halves of two
+// V128 operands.  The D variants have no patterns here.
+multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
+ Intrinsic IntOp> {
+ def v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b", []>;
+ let Predicates = [HasCrypto] in {
+ def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
+ V128, V64, V64,
+ asm, ".1q", ".1d", ".1d", []>;
+ def v2i64 : BaseSIMDDifferentThreeVector<U, 0b111, opc,
+ V128, V128, V128,
+ asm#"2", ".1q", ".2d", ".2d", []>;
+ }
+
+ def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
+ (v8i8 (extract_high_v16i8 V128:$Rm)))),
+ (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
+}
+
+// Lengthening three-register instructions for H and S source elements only.
+// Variants with an even 'size' value take two V64 sources.  Variants with an
+// odd 'size' value use mnemonic asm#"2", take two V128 sources, and their
+// patterns apply OpNode to the extract_high_* of each source.  The result is
+// always a V128 register with elements twice as wide as the source elements.
+multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+// Lengthening three-register instructions for B, H and S source elements in
+// which OpNode is applied at the narrow element width and its result is then
+// zero-extended (zext) to the wide result type.  The asm#"2" variants apply
+// OpNode to the extract_high_* of each V128 source.  OpNode defaults to
+// null_frag.
+multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+// Tied, accumulating counterpart of SIMDLongThreeVectorBHSabdl.  Each pattern
+// sets $dst to the wide $Rd value plus (add) the zero-extended result of
+// OpNode applied at the narrow element width.  The asm#"2" variants apply
+// OpNode to the extract_high_* of each V128 source.
+multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
+ string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$dst),
+ (add (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$dst),
+ (add (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm))))))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$dst),
+ (add (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$dst),
+ (add (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm))))))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$dst),
+ (add (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$dst),
+ (add (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm))))))]>;
+}
+
+multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, // Six BaseSIMDDifferentThreeVector defs where OpNode itself maps two narrow vectors to the wider result type (no explicit zext).
+ SDPatternOperator OpNode = null_frag> { // OpNode falls back to null_frag when the caller passes no operator.
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc, // asm#"2" variants feed extract_high_* of the V128 sources to OpNode.
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc, // Six BaseSIMDDifferentThreeVectorTied defs: OpNode takes the wide $Rd accumulator plus two narrow sources and produces $dst.
+ string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc, // asm#"2" variants pass extract_high_* of the V128 sources as the narrow operands.
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd),
+ (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd),
+ (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd),
+ (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm, // Four tied defs (no byte-element forms): Accum combines $Rd with the int_aarch64_neon_sqdmull result.
+ SDPatternOperator Accum> { // Accum is the operator applied to (accumulator, sqdmull product).
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc, // Size codes start at 0b010; 0b000/0b001 are not instantiated here.
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+ (v4i16 V64:$Rm)))))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc, // asm#"2" variants multiply extract_high_* of the V128 sources.
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn),
+ (v2i32 V64:$Rm)))))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm, // Six BaseSIMDDifferentThreeVector defs where $Rn already has the wide result type and only $Rm is narrow.
+ SDPatternOperator OpNode> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V128, V64,
+ asm, ".8h", ".8h", ".8b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc, // asm#"2" variants use extract_high_* of $Rm only; $Rn is used whole.
+ V128, V128, V128,
+ asm#"2", ".8h", ".8h", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V128, V64,
+ asm, ".4s", ".4s", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".4s", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V128, V64,
+ asm, ".2d", ".2d", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".2d", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty, // Instruction class taking two regtype sources and an i32imm, selected from AArch64ext.
+ string asm, string kind>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # // Two asm-string variants separated by "|": kind suffix on each operand vs. on the mnemonic.
+ "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
+ [(set (vty regtype:$Rd),
+ (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> imm; // Only four immediate bits are encoded (Inst{14-11}).
+ let Inst{31} = 0;
+ let Inst{30} = size; // The single `size` template bit lands in bit 30.
+ let Inst{29-21} = 0b101110000;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{14-11} = imm;
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+
+multiclass SIMDBitwiseExtract<string asm> { // Instantiates BaseSIMDBitwiseExtract for the V64 (.8b) and V128 (.16b) forms.
+ def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> {
+ let imm{3} = 0; // Top immediate bit is forced to 0 for the V64 form; presumably because an 8-byte vector only has indices 0-7 (confirm against the ISA).
+ }
+ def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype, // Two-source, one-result vector instruction class whose pattern applies OpNode to $Rn and $Rm.
+ string asm, string kind, SDNode OpNode, ValueType valty>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+ "|" # kind # "\t$Rd, $Rn, $Rm}", "",
+ [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = size{0}; // The 3-bit size is split: its low bit goes to Inst{30} ...
+ let Inst{29-24} = 0b001110;
+ let Inst{23-22} = size{2-1}; // ... and its upper two bits go to Inst{23-22}.
+ let Inst{21} = 0;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{14-12} = opc;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDZipVector<bits<3>opc, string asm, // Seven integer-typed BaseSIMDZipVector defs plus Pat records mapping FP vector types onto them.
+ SDNode OpNode> {
+ def v8i8 : BaseSIMDZipVector<0b000, opc, V64,
+ asm, ".8b", OpNode, v8i8>;
+ def v16i8 : BaseSIMDZipVector<0b001, opc, V128,
+ asm, ".16b", OpNode, v16i8>;
+ def v4i16 : BaseSIMDZipVector<0b010, opc, V64,
+ asm, ".4h", OpNode, v4i16>;
+ def v8i16 : BaseSIMDZipVector<0b011, opc, V128,
+ asm, ".8h", OpNode, v8i16>;
+ def v2i32 : BaseSIMDZipVector<0b100, opc, V64,
+ asm, ".2s", OpNode, v2i32>;
+ def v4i32 : BaseSIMDZipVector<0b101, opc, V128,
+ asm, ".4s", OpNode, v4i32>;
+ def v2i64 : BaseSIMDZipVector<0b111, opc, V128, // Size code 0b110 is skipped: no V64 form with 64-bit elements is defined.
+ asm, ".2d", OpNode, v2i64>;
+
+ def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)), // FP vector types reuse the integer-typed def with the same lane layout.
+ (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
+ def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
+ (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
+ def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
+ (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
+ def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
+ (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
+ def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
+ (!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in // Applies to the single class that follows.
+class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode, // Scalar instruction class with two sources and one result, all in the same register class.
+ RegisterClass regtype, string asm,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+ "\t$Rd, $Rn, $Rm", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-21} = size; // size is 3 bits wide here and covers Inst{21} as well.
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in // Applies to the single class that follows.
+class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode, // Three-operand scalar class whose operand lists are supplied by the caller and whose $Rd input is tied to $dst.
+ dag oops, dag iops, string asm,
+ list<dag> pattern>
+ : I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>, // "$Rd = $dst" is the tie constraint; oops/iops must therefore name $dst and $Rd.
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size; // size is 2 bits here; Inst{21} is the separate R bit.
+ let Inst{21} = R;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm, // Single FPR64 def (v1i64) with a v1i64 OpNode pattern.
+ SDPatternOperator OpNode> {
+ def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+}
+
+multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, // Four scalar defs (FPR64/32/16/8); only v1i64 carries an inline pattern.
+ SDPatternOperator OpNode> {
+ def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+ def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>; // Empty pattern lists: these three are not selected from inline patterns.
+ def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
+ def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>;
+
+ def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), // Extra Pat records select the i64 and i32 typed forms of OpNode.
+ (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
+ def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
+ (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm, // Two scalar defs: FPR32 with an OpNode pattern, FPR16 without one.
+ SDPatternOperator OpNode> {
+ def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm,
+ [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+ def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; // No selection pattern for the 16-bit form.
+}
+
+multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, // Two tied scalar defs (FPR32, FPR16), both with empty pattern lists.
+ SDPatternOperator OpNode = null_frag> { // NOTE(review): OpNode is accepted but not referenced anywhere in this body.
+ def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
+ (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
+ asm, []>;
+ def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
+ (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
+ asm, []>;
+}
+
+multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, // Scalar FP three-operand defs named NAME64/NAME32/NAME16, plus a v1f64 Pat onto the 64-bit def.
+ SDPatternOperator OpNode = null_frag> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, // {S,0b11} builds the 3-bit size; {0b11,opc} builds the 5-bit opcode.
+ [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
+ [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, // The FP16 form uses opcode prefix 0b00 rather than 0b11 and is gated on HasFullFP16.
+ [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ }
+
+ def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), // v1f64 operands are selected onto the same 64-bit def.
+ (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm, // Scalar FP compare defs: FP-typed sources, integer-typed result in the same register class.
+ SDPatternOperator OpNode = null_frag> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
+ [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; // Result typed i64, operands f64.
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, // FP16 form has no selection pattern.
+ []>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ }
+
+ def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), // v1f64 -> v1i64 compare also maps to the 64-bit def.
+ (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode, // Three-operand scalar class with caller-supplied operand lists and constraint string, so register classes may differ per operand.
+ dag oops, dag iops, string asm, string cstr, list<dag> pat>
+ : I<oops, iops, asm,
+ "\t$Rd, $Rn, $Rm", cstr, pat>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 0; // Differs from BaseSIMDThreeScalar, which sets Inst{10} = 1.
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // Applies to every def produced by the multiclass below.
+multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm, // Two widening scalar defs: FPR16 sources -> FPR32 result, FPR32 sources -> FPR64 result.
+ SDPatternOperator OpNode = null_frag> {
+ def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+ (outs FPR32:$Rd),
+ (ins FPR16:$Rn, FPR16:$Rm), asm, "", []>; // No selection pattern for the 16-bit source form.
+ def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+ (outs FPR64:$Rd),
+ (ins FPR32:$Rn, FPR32:$Rm), asm, "",
+ [(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // Applies to every def produced by the multiclass below.
+multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm, // Tied (accumulating) counterpart of the widening scalar forms: the wide $Rd input is tied to $dst.
+ SDPatternOperator OpNode = null_frag> {
+ def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+ (outs FPR32:$dst),
+ (ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm),
+ asm, "$Rd = $dst", []>; // No selection pattern for the 16-bit source form.
+ def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+ (outs FPR64:$dst),
+ (ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm),
+ asm, "$Rd = $dst",
+ [(set (i64 FPR64:$dst),
+ (OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // Applies to the single class that follows.
+class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, // One-source scalar class; result (regtype) and source (regtype2) register classes may differ.
+ RegisterClass regtype, RegisterClass regtype2,
+ string asm, list<dag> pat>
+ : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
+ "\t$Rd, $Rn", "", pat>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2; // Second 2-bit selector, encoded separately from size.
+ let Inst{18-17} = 0b00;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // Applies to the single class that follows.
+class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode, // One-source scalar class whose $Rd input is tied to the $dst result.
+ RegisterClass regtype, RegisterClass regtype2,
+ string asm, list<dag> pat>
+ : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
+ "\t$Rd, $Rn", "$Rd = $dst", pat>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000; // Fixed field; unlike BaseSIMDTwoScalar there is no size2 parameter.
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // Applies to the single class that follows.
+class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, // One-source scalar class whose asm operand string ends with a literal "#<zero>" text.
+ RegisterClass regtype, string asm, string zero>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+ "\t$Rd, $Rn, #" # zero, "", []>, // `zero` is pasted as text; it is not an instruction operand. The pattern list is empty.
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm> // FPR64 -> FPR32 scalar instruction selected from the int_aarch64_sisd_fcvtxn intrinsic.
+ : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-17} = 0b011111100110000; // All bits above the opcode are fixed; only opcode, Rn and Rd vary.
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm, // Single FPR64 compare def (v1i64rz) whose asm operand string ends in "#0", plus a v1i64 Pat.
+ SDPatternOperator OpNode> {
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">;
+
+ def : Pat<(v1i64 (OpNode FPR64:$Rn)),
+ (!cast<Instruction>(NAME # "v1i64rz") FPR64:$Rn)>; // Record-name suffix is a quoted string, matching the NAME # "v1i64" form used by SIMDTwoScalarD.
+}
+
+multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm, // FP compare defs for FPR64/FPR32/FPR16 printed with "#0.0", aliases accepting "#0", and a v1f64 Pat.
+ SDPatternOperator OpNode> {
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">;
+ def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">; // FP16 form: same size as the 64-bit def but size2 = 0b11.
+ }
+
+ def : InstAlias<asm # "\t$Rd, $Rn, #0", // Aliases accept the "#0" spelling; the trailing 0 argument is the alias's final (emit) parameter.
+ (!cast<Instruction>(NAME # "v1i64rz") FPR64:$Rd, FPR64:$Rn), 0>;
+ def : InstAlias<asm # "\t$Rd, $Rn, #0",
+ (!cast<Instruction>(NAME # "v1i32rz") FPR32:$Rd, FPR32:$Rn), 0>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : InstAlias<asm # "\t$Rd, $Rn, #0",
+ (!cast<Instruction>(NAME # "v1i16rz") FPR16:$Rd, FPR16:$Rn), 0>;
+ }
+
+ def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # "v1i64rz") FPR64:$Rn)>; // Record-name suffixes are quoted strings throughout, matching the NAME # "..." form used elsewhere in this file.
+}
+
+multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm, // Single FPR64 one-source def with a v1i64 pattern, plus an i64 Pat onto the same def.
+ SDPatternOperator OpNode = null_frag> {
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
+
+ def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), // Scalar i64 form of OpNode selects the same instruction.
+ (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
+}
+
+multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> { // Three one-source scalar FP defs (FPR64/FPR32/FPR16), all with empty pattern lists.
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>; // Named v1f16 (not v1i16) and selected by size2 = 0b11.
+ }
+}
+
+multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm, // Same three encodings as the pattern-less FP two-scalar forms, but each def selects OpNode on an FP-typed source.
+ SDPatternOperator OpNode> {
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
+ [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>; // Result type is left untyped in the pattern; only the source is typed.
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
+ [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
+ [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>;
+ }
+}
+
+multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm, // Four one-source scalar defs (FPR64/32/16/8); i64 and i32 forms carry patterns, plus a v1i64 Pat.
+ SDPatternOperator OpNode = null_frag> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
+ [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>; // No selection patterns for the 16- and 8-bit forms.
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>;
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>; // Record-name suffix is a quoted string, matching the NAME # "v1i64" form used by SIMDTwoScalarD.
+}
+
+multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm, // Tied counterpart of the B/H/S/D two-scalar forms: OpNode takes the tied $Rd input and $Rn.
+ Intrinsic OpNode> { // OpNode must be an Intrinsic here, not a general SDPatternOperator.
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v1i64 : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm,
+ [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm,
+ [(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>;
+ def v1i16 : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>; // No selection patterns for the 16- and 8-bit forms.
+ def v1i8 : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # "v1i64") FPR64:$Rd, FPR64:$Rn)>; // Record-name suffix is a quoted string, matching the NAME # "v1i64" form used by SIMDTwoScalarD.
+}
+
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // Applies to every def produced by the multiclass below.
+multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm, // Narrowing one-source scalar defs: each result register class is one step narrower than its source.
+ SDPatternOperator OpNode = null_frag> {
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>; // Only the FPR64 -> FPR32 form has a selection pattern.
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>;
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // Applies to the single class that follows.
+class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode, // Vector-source, scalar-result instruction class with no selection pattern.
+ RegisterOperand regtype, RegisterOperand vectype,
+ string asm, string kind>
+ : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+ "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, // kind is attached to the vector source only (or to the mnemonic in the second asm variant).
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b11000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> { // Single pairwise def: V128 ".2d" source reduced to an FPR64Op result.
+ def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128,
+ asm, ".2d">;
+}
+
+multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> { // FP pairwise defs for .2h, .2s and .2d sources; S supplies the high size bit.
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, // The FP16 form passes U = 0 while the .2s/.2d forms pass U = 1.
+ asm, ".2h">;
+ }
+ def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64,
+ asm, ".2s">;
+ def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128,
+ asm, ".2d">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // Applies to the single class that follows.
+class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode, // Vector-source, scalar-result instruction class with a caller-supplied pattern.
+ RegisterClass regtype, RegisterOperand vectype,
+ string asm, string kind, list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+ "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q; // Q is a template bit here; bit 31 is fixed to 0.
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b11000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode, // Five across-lanes defs whose result register class matches the source element width (FPR8/FPR16/FPR32).
+ string asm> {
+ def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, // All defs have empty pattern lists.
+ asm, ".8b", []>;
+ def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128,
+ asm, ".16b", []>;
+ def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64,
+ asm, ".4h", []>;
+ def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128,
+ asm, ".8h", []>;
+ def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, // Only the V128 (Q = 1) form exists for 32-bit elements; no ".2s" def.
+ asm, ".4s", []>;
+}
+
+multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> { // Five across-lanes defs whose result register class is one step wider than the source element (FPR16/FPR32/FPR64).
+ def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, // All defs have empty pattern lists.
+ asm, ".8b", []>;
+ def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128,
+ asm, ".16b", []>;
+ def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64,
+ asm, ".4h", []>;
+ def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128,
+ asm, ".8h", []>;
+ def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128, // Only the V128 (Q = 1) form exists for 32-bit elements; no ".2s" def.
+ asm, ".4s", []>;
+}
+
+multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm, // FP across-lanes defs for .4h, .8h and .4s sources, each selected from intOp on the matching FP vector type.
+ Intrinsic intOp> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, // FP16 forms pass U = 0; sz1 supplies the high size bit in all three defs.
+ asm, ".4h",
+ [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>;
+ def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
+ asm, ".8h",
+ [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, // The single-precision form passes U = 1.
+ asm, ".4s",
+ [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+// FIXME: There has got to be a better way to factor these. ugh.
+
+class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm, // Common encoding skeleton for the INS/DUP/MOV-style classes; operand lists, asm text and patterns come from the subclass.
+ string operands, string constraints, list<dag> pattern>
+ : I<outs, ins, asm, operands, constraints, pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = op;
+ let Inst{28-21} = 0b01110000;
+ let Inst{15} = 0; // Inst{20-16} and Inst{14-11} are deliberately left unset here; subclasses and defs assign them.
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype, // "dup" from a scalar register class (regtype) into a vector, selected from AArch64dup.
+ RegisterOperand vecreg, RegisterClass regtype>
+ : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup",
+ "{\t$Rd" # size # ", $Rn" #
+ "|" # size # "\t$Rd, $Rn}", "",
+ [(set (vectype vecreg:$Rd), (AArch64dup regtype:$Rn))]> {
+ let Inst{20-16} = imm5; // imm5 is a fixed template value here, not an instruction operand.
+ let Inst{14-11} = 0b0001;
+}
+
+class SIMDDupFromElement<bit Q, string dstkind, string srckind, // "dup" of one indexed V128 lane into every lane of the result; OpNode receives the source vector and the index.
+ ValueType vectype, ValueType insreg,
+ RegisterOperand vecreg, Operand idxtype,
+ ValueType elttype, SDNode OpNode> // NOTE(review): elttype is not referenced in this class body.
+ : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
+ "{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
+ "|" # dstkind # "\t$Rd, $Rn$idx}", "",
+ [(set (vectype vecreg:$Rd),
+ (OpNode (insreg V128:$Rn), idxtype:$idx))]> {
+ let Inst{14-11} = 0b0000; // Inst{20-16} (lane index plus size marker) is filled in by the per-width subclasses.
+}
+
+class SIMDDup64FromElement // 64-bit-lane specialization: fixed Q = 1, ".2d" result, VectorIndexD index, AArch64duplane64.
+ : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
+ VectorIndexD, i64, AArch64duplane64> {
+ bits<1> idx;
+ let Inst{20} = idx; // 1-bit lane index above the fixed 0b1000 marker in Inst{19-16}.
+ let Inst{19-16} = 0b1000;
+}
+
+class SIMDDup32FromElement<bit Q, string size, ValueType vectype, // 32-bit-lane specialization: v4i32 source, VectorIndexS index, AArch64duplane32.
+ RegisterOperand vecreg>
+ : SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
+ VectorIndexS, i64, AArch64duplane32> {
+ bits<2> idx;
+ let Inst{20-19} = idx; // 2-bit lane index above the fixed 0b100 marker in Inst{18-16}.
+ let Inst{18-16} = 0b100;
+}
+
+class SIMDDup16FromElement<bit Q, string size, ValueType vectype, // 16-bit-lane specialization: v8i16 source, VectorIndexH index, AArch64duplane16.
+ RegisterOperand vecreg>
+ : SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
+ VectorIndexH, i64, AArch64duplane16> {
+ bits<3> idx;
+ let Inst{20-18} = idx; // 3-bit lane index above the fixed 0b10 marker in Inst{17-16}.
+ let Inst{17-16} = 0b10;
+}
+
+class SIMDDup8FromElement<bit Q, string size, ValueType vectype, // 8-bit-lane specialization: v16i8 source, VectorIndexB index, AArch64duplane8.
+ RegisterOperand vecreg>
+ : SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
+ VectorIndexB, i64, AArch64duplane8> {
+ bits<4> idx;
+ let Inst{20-17} = idx; // 4-bit lane index above the fixed 1 marker in Inst{16}.
+ let Inst{16} = 1;
+}
+
+class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype, // Moves one indexed V128 lane into a register of class regtype; imm4 selects the operation via Inst{14-11}.
+ Operand idxtype, string asm, list<dag> pattern>
+ : BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm,
+ "{\t$Rd, $Rn" # size # "$idx" #
+ "|" # size # "\t$Rd, $Rn$idx}", "", pattern> {
+ let Inst{14-11} = imm4; // Inst{20-16} (lane index plus size marker) is assigned by each def that uses this class.
+}
+
+class SIMDSMov<bit Q, string size, RegisterClass regtype, // "smov": BaseSIMDMov with imm4 = 0b0101 and no selection pattern.
+ Operand idxtype>
+ : BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>;
+class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype, // "umov": BaseSIMDMov with imm4 = 0b0111, selected from vector_extract on vectype.
+ Operand idxtype>
+ : BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov",
+ [(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>;
+
+class SIMDMovAlias<string asm, string size, Instruction inst, // Assembler alias spelling `asm` for an existing lane-to-register instruction `inst`.
+ RegisterClass regtype, Operand idxtype>
+ : InstAlias<asm#"{\t$dst, $src"#size#"$idx" #
+ "|" # size # "\t$dst, $src$idx}",
+ (inst regtype:$dst, V128:$src, idxtype:$idx)>; // Operand order mirrors BaseSIMDMov: result register, V128 source, lane index.
+
+multiclass SMov { // Five SIMDSMov defs: .b/.h lanes to GPR32 or GPR64, and .s lanes to GPR64 only.
+ def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { // Q = 0 for GPR32 results, Q = 1 for GPR64 results.
+ bits<4> idx;
+ let Inst{20-17} = idx; // Each def places its lane index above a fixed size marker in Inst{20-16}.
+ let Inst{16} = 1;
+ }
+ def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+}
+
+multiclass UMov { // Four SIMDUMov defs (.b/.h/.s to GPR32, .d to GPR64) plus "mov" aliases for the .s and .d forms.
+ def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx; // Each def places its lane index above a fixed size marker in Inst{20-16}.
+ let Inst{16} = 1;
+ }
+ def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+ def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { // Only the 64-bit form passes Q = 1.
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ }
+ def : SIMDMovAlias<"mov", ".s", // "mov" aliases are defined only for vi32 and vi64, not for the .b/.h forms.
+ !cast<Instruction>(NAME#"vi32"),
+ GPR32, VectorIndexS>;
+ def : SIMDMovAlias<"mov", ".d",
+ !cast<Instruction>(NAME#"vi64"),
+ GPR64, VectorIndexD>;
+}
+
+class SIMDInsFromMain<string size, ValueType vectype, // "ins" of a scalar register (regtype) into one indexed lane of a V128, selected from vector_insert.
+ RegisterClass regtype, Operand idxtype>
+ : BaseSIMDInsDup<1, 0, (outs V128:$dst),
+ (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins",
+ "{\t$Rd" # size # "$idx, $Rn" #
+ "|" # size # "\t$Rd$idx, $Rn}",
+ "$Rd = $dst", // The incoming vector $Rd is tied to the result $dst.
+ [(set V128:$dst,
+ (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
+ let Inst{14-11} = 0b0011; // Inst{20-16} (lane index plus size marker) is assigned by each def that uses this class.
+}
+
+class SIMDInsFromElement<string size, ValueType vectype, // "ins" of lane $idx2 of $Rn into lane $idx of $Rd: vector_insert of a vector_extract.
+ ValueType elttype, Operand idxtype>
+ : BaseSIMDInsDup<1, 1, (outs V128:$dst), // op = 1 here, unlike the other BaseSIMDInsDup users in this section, which pass 0.
+ (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins",
+ "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" #
+ "|" # size # "\t$Rd$idx, $Rn$idx2}",
+ "$Rd = $dst", // The incoming vector $Rd is tied to the result $dst.
+ [(set V128:$dst,
+ (vector_insert
+ (vectype V128:$Rd),
+ (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
+ idxtype:$idx))]>; // No body: neither Inst{20-16} nor Inst{14-11} is assigned in this class.
+
+class SIMDInsMainMovAlias<string size, Instruction inst, // "mov" assembler alias for a register-to-lane insert instruction `inst`.
+ RegisterClass regtype, Operand idxtype>
+ : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" #
+ "|" # size #"\t$dst$idx, $src}",
+ (inst V128:$dst, idxtype:$idx, regtype:$src)>; // Operands: destination vector, destination lane index, source register.
+class SIMDInsElementMovAlias<string size, Instruction inst, // "mov" assembler alias for a lane-to-lane insert instruction `inst`.
+ Operand idxtype>
+ : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
+ "|" # size #"\t$dst$idx, $src$idx2}", // One paste operator joins the two lines; the stray doubled '#' was dropped and the resulting string is unchanged.
+ (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; // Operands: destination vector and lane, then source vector and lane.
+
+
+// Instantiates every "ins" variant plus the matching "mov" aliases.
+//   vi*gpr  - insert from a scalar register (SIMDInsFromMain).
+//   vi*lane - insert from another vector's lane (SIMDInsFromElement).
+// In each def, Inst{20-16} holds the destination lane index in its upper bits
+// followed by a single 1 bit and then zeros, so the position of that 1 bit
+// differs per element size (bit 16 for .b, 17 for .h, 18 for .s, 19 for .d).
+multiclass SIMDIns {
+ def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+ def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ }
+
+ // Lane-to-lane forms: the source lane index idx2 is placed in the upper
+ // bits of Inst{14-11}; the remaining low bits of that field are left
+ // unset (?) for element sizes wider than a byte.
+ def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> {
+ bits<4> idx;
+ bits<4> idx2;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ let Inst{14-11} = idx2;
+ }
+ def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> {
+ bits<3> idx;
+ bits<3> idx2;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ let Inst{14-12} = idx2;
+ let Inst{11} = {?};
+ }
+ def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
+ bits<2> idx;
+ bits<2> idx2;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ let Inst{14-13} = idx2;
+ let Inst{12-11} = {?,?};
+ }
+ def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
+ bits<1> idx;
+ bits<1> idx2;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ let Inst{14} = idx2;
+ let Inst{13-11} = {?,?,?};
+ }
+
+ // For all forms of the INS instruction, the "mov" mnemonic is the
+ // preferred alias. Why they didn't just call the instruction "mov" in
+ // the first place is a very good question indeed...
+ def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
+ GPR32, VectorIndexB>;
+ def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
+ GPR32, VectorIndexH>;
+ def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
+ GPR32, VectorIndexS>;
+ def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
+ GPR64, VectorIndexD>;
+
+ def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
+ VectorIndexB>;
+ def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
+ VectorIndexH>;
+ def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
+ VectorIndexS>;
+ def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
+ VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX
+//----------------------------------------------------------------------------
+
+// Encoding for a table-lookup instruction with a fresh (untied) destination.
+//   Q        - goes to Inst{30}.
+//   len      - goes to Inst{14-13}.
+//   op       - goes to Inst{12}.
+//   vectype  - register operand class of the destination $Vd and of the
+//              index vector $Vm.
+//   listtype - register-list operand class of the table $Vn.
+//   kind     - arrangement suffix appended to $Vd and $Vm in the assembly
+//              string.
+// No selection pattern is attached here.
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+ RegisterOperand listtype, string asm, string kind>
+ : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
+ "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
+ Sched<[WriteV]> {
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-21} = 0b001110000;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0;
+ let Inst{14-13} = len;
+ let Inst{12} = op;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+// Same encoding as BaseSIMDTableLookup, but the destination is also an input:
+// $Vd appears in the ins list and is tied to the output $dst
+// ("$Vd = $dst"). The bit layout is identical to the untied class.
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+ RegisterOperand listtype, string asm, string kind>
+ : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
+ "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
+ Sched<[WriteV]> {
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-21} = 0b001110000;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0;
+ let Inst{14-13} = len;
+ let Inst{12} = op;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+// Assembler alias for a table-lookup instruction `inst`: the alias mnemonic
+// `asm` is followed by destination, register list and index vector with no
+// per-operand suffix.
+class SIMDTableLookupAlias<string asm, Instruction inst,
+ RegisterOperand vectype, RegisterOperand listtype>
+ : InstAlias<asm # "\t$dst, $lst, $index",
+ (inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
+
+// Instantiates the untied table-lookup instruction for both destination
+// widths (V64 with ".8b", V128 with ".16b") and for register lists of one to
+// four registers; the `len` argument (0b00..0b11) tracks the list length
+// One..Four. `op` and `asm` are passed through unchanged.
+// Each instruction also gets an alias whose mnemonic carries the arrangement
+// suffix (asm # ".8b" / asm # ".16b") and whose list operand uses the
+// VecList*128 operand classes instead of VecList*16b.
+multiclass SIMDTableLookup<bit op, string asm> {
+ def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
+ asm, ".8b">;
+ def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
+ asm, ".8b">;
+ def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
+ asm, ".8b">;
+ def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
+ asm, ".8b">;
+ def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
+ asm, ".16b">;
+ def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
+ asm, ".16b">;
+ def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
+ asm, ".16b">;
+ def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
+ asm, ".16b">;
+
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8One"),
+ V64, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Two"),
+ V64, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Three"),
+ V64, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Four"),
+ V64, VecListFour128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8One"),
+ V128, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Two"),
+ V128, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Three"),
+ V128, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Four"),
+ V128, VecListFour128>;
+}
+
+// Tied-destination counterpart of SIMDTableLookup: the same eight defs (two
+// destination widths times list lengths One..Four) built from
+// BaseSIMDTableLookupTied, followed by the same eight suffix-on-mnemonic
+// aliases using the VecList*128 operand classes.
+multiclass SIMDTableLookupTied<bit op, string asm> {
+ def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
+ asm, ".8b">;
+ def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
+ asm, ".8b">;
+ def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
+ asm, ".8b">;
+ def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
+ asm, ".8b">;
+ def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
+ asm, ".16b">;
+ def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
+ asm, ".16b">;
+ def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
+ asm, ".16b">;
+ def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
+ asm, ".16b">;
+
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8One"),
+ V64, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Two"),
+ V64, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Three"),
+ V64, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Four"),
+ V64, VecListFour128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8One"),
+ V128, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Two"),
+ V128, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Three"),
+ V128, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Four"),
+ V128, VecListFour128>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY
+//----------------------------------------------------------------------------
+// Encoding for the scalar copy instruction, printed as "mov": reads lane $idx
+// of vector $src into the scalar register $dst.
+//   regtype - register class of the scalar destination.
+//   vectype - register operand class of the source vector.
+//   kind    - element-size suffix; present on $src in the first assembly
+//             variant and omitted in the second.
+//   idxtype - operand class of the lane index.
+// Inst{20-16} (lane index and element-size marker) is not assigned in this
+// class. No selection pattern is attached here.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
+ string kind, Operand idxtype>
+ : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
+ "{\t$dst, $src" # kind # "$idx" #
+ "|\t$dst, $src$idx}", "", []>,
+ Sched<[WriteV]> {
+ bits<5> dst;
+ bits<5> src;
+ let Inst{31-21} = 0b01011110000;
+ let Inst{15-10} = 0b000001;
+ let Inst{9-5} = src;
+ let Inst{4-0} = dst;
+}
+
+// Assembler alias with mnemonic `asm` for a scalar-copy instruction `inst`.
+// `size` is the element-size suffix; it is attached to $src in the first
+// syntax variant and omitted in the second, matching the instruction's own
+// assembly string.
+class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
+ RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
+ : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
+ "|\t$dst, $src$index}",
+ (inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
+
+
+// Instantiates the scalar copy instruction for byte, half, word and
+// doubleword elements, one selection pattern for the 64-bit case, and the
+// "dup" aliases.
+// NOTE(review): the `asm` parameter is not referenced anywhere in this body;
+// the mnemonic is fixed by BaseSIMDScalarCPY and the alias mnemonic is spelled
+// out literally below.
+// In each def, Inst{20-16} holds the lane index in its upper bits followed by
+// a single 1 bit and then zeros, so the position of that 1 bit differs per
+// element size.
+multiclass SIMDScalarCPY<string asm> {
+ def i8 : BaseSIMDScalarCPY<FPR8, V128, ".b", VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+ def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ }
+
+ // Select the i64 form for a v1i64 built from one extracted v2i64 lane.
+ // The instruction suffix is pasted as a quoted string, like the aliases
+ // below, so it cannot be confused with the record named i64.
+ def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src),
+ VectorIndexD:$idx)))),
+ (!cast<Instruction>(NAME # "i64") V128:$src, VectorIndexD:$idx)>;
+
+ // 'DUP' mnemonic aliases.
+ def : SIMDScalarCPYAlias<"dup", ".b",
+ !cast<Instruction>(NAME#"i8"),
+ FPR8, V128, VectorIndexB>;
+ def : SIMDScalarCPYAlias<"dup", ".h",
+ !cast<Instruction>(NAME#"i16"),
+ FPR16, V128, VectorIndexH>;
+ def : SIMDScalarCPYAlias<"dup", ".s",
+ !cast<Instruction>(NAME#"i32"),
+ FPR32, V128, VectorIndexS>;
+ def : SIMDScalarCPYAlias<"dup", ".d",
+ !cast<Instruction>(NAME#"i64"),
+ FPR64, V128, VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//----------------------------------------------------------------------------
+
+// Common encoding for the modified-immediate instructions.
+//   Q, op, op2 - go to Inst{30}, Inst{29} and Inst{11} respectively.
+//   oops/iops/asm/op_string/cstr/pattern - forwarded unchanged to I<>.
+// The 8-bit immediate is split: imm8{7-5} goes to Inst{18-16} and imm8{4-0}
+// goes to Inst{9-5}. Inst{15-12} is not assigned in this class.
+class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops,
+ string asm, string op_string,
+ string cstr, list<dag> pattern>
+ : I<oops, iops, asm, op_string, cstr, pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<8> imm8;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = op;
+ let Inst{28-19} = 0b0111100000;
+ let Inst{18-16} = imm8{7-5};
+ let Inst{11} = op2;
+ let Inst{10} = 1;
+ let Inst{9-5} = imm8{4-0};
+ let Inst{4-0} = Rd;
+}
+
+// Modified-immediate instruction writing a vector register $Rd (untied).
+//   immtype       - operand class of $imm8.
+//   opt_shift_iop - extra input operand dag appended to (ins immtype:$imm8)
+//                   with !con; pass (ins) for none.
+//   opt_shift     - text appended after $imm8 in both assembly variants.
+//   kind          - arrangement suffix; on $Rd in the first variant, on the
+//                   mnemonic in the second.
+// Decoding is delegated to the custom method named in DecoderMethod.
+class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype,
+ Operand immtype, dag opt_shift_iop,
+ string opt_shift, string asm, string kind,
+ list<dag> pattern>
+ : BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd),
+ !con((ins immtype:$imm8), opt_shift_iop), asm,
+ "{\t$Rd" # kind # ", $imm8" # opt_shift #
+ "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+ "", pattern> {
+ let DecoderMethod = "DecodeModImmInstruction";
+}
+
+// Tied variant of BaseSIMDModifiedImmVector: $Rd is an input as well and is
+// tied to the output $dst ("$Rd = $dst"). There is no op2 parameter; 0 is
+// passed to the base class. Uses a separate custom decoder method.
+class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
+ Operand immtype, dag opt_shift_iop,
+ string opt_shift, string asm, string kind,
+ list<dag> pattern>
+ : BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst),
+ !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
+ asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
+ "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+ "$Rd = $dst", pattern> {
+ let DecoderMethod = "DecodeModImmTiedInstruction";
+}
+
+// Untied modified-immediate form with a logical_vec_shift operand.
+// Inst{15-12} is built from the two-bit b15_b12 parameter wrapped around the
+// two-bit shift operand: b15_b12{1} -> bit 15, shift -> bits 14-13,
+// b15_b12{0} -> bit 12.
+class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
+ (ins logical_vec_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14-13} = shift;
+ let Inst{12} = b15_b12{0};
+}
+
+// Tied-destination counterpart of BaseSIMDModifiedImmVectorShift; the
+// Inst{15-12} layout (b15_b12{1}, shift, b15_b12{0}) is the same.
+class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+ (ins logical_vec_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14-13} = shift;
+ let Inst{12} = b15_b12{0};
+}
+
+
+// Untied modified-immediate form with a logical_vec_hw_shift operand.
+// Although `shift` is declared two bits wide, only shift{0} is encoded (in
+// bit 13); bit 14 is fixed to 0. Bits 15 and 12 come from b15_b12.
+class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
+ (ins logical_vec_hw_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14} = 0;
+ let Inst{13} = shift{0};
+ let Inst{12} = b15_b12{0};
+}
+
+// Tied-destination counterpart of BaseSIMDModifiedImmVectorShiftHalf: only
+// shift{0} is encoded (bit 13), bit 14 is fixed to 0, and bits 15 and 12 come
+// from b15_b12.
+class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+ (ins logical_vec_hw_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14} = 0;
+ let Inst{13} = shift{0};
+ let Inst{12} = b15_b12{0};
+}
+
+// Instantiates the untied shifted modified-immediate instruction for the
+// halfword arrangements (.4h/.8h, using hw_cmode) and the word arrangements
+// (.2s/.4s, using w_cmode). hw_cmode and w_cmode are forwarded as the
+// b15_b12 argument of the underlying classes. No selection patterns are
+// attached.
+multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode,
+ string asm> {
+ def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64,
+ asm, ".4h", []>;
+ def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128,
+ asm, ".8h", []>;
+
+ def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64,
+ asm, ".2s", []>;
+ def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128,
+ asm, ".4s", []>;
+}
+
+// Tied-destination counterpart of SIMDModifiedImmVectorShift. Each def
+// carries a selection pattern: the result $dst is OpNode applied to the
+// incoming destination value $Rd, the immediate $imm8 and the shift amount
+// $shift (matched as an i32 immediate).
+multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
+ bits<2> w_cmode, string asm,
+ SDNode OpNode> {
+ def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$dst), (OpNode V64:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+ def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$dst), (OpNode V128:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+
+ def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$dst), (OpNode V64:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+ def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$dst), (OpNode V128:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+}
+
+// Untied modified-immediate form with a move_vec_shift operand.
+// Only the top three bits of `cmode` are encoded (cmode{3-1} -> Inst{15-13});
+// Inst{12} is taken from the one-bit shift operand, so cmode{0} is unused.
+class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
+ (ins move_vec_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<1> shift;
+ let Inst{15-13} = cmode{3-1};
+ let Inst{12} = shift;
+}
+
+// Untied vector modified-immediate form with no shift operand: the extra
+// operand dag is empty, nothing is appended after $imm8 in the assembly
+// string, and all four bits of `cmode` go to Inst{15-12}.
+class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode,
+ RegisterOperand vectype,
+ Operand imm_type, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "",
+ asm, kind, pattern> {
+ let Inst{15-12} = cmode;
+}
+
+// Modified-immediate form writing a scalar FPR64 destination from a
+// simdimmtype10 immediate operand, with no shift operand. All four bits of
+// `cmode` go to Inst{15-12}; op2 is passed as 0. Uses the same custom decoder
+// method as the vector forms.
+class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
+ list<dag> pattern>
+ : BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
+ "\t$Rd, $imm8", "", pattern> {
+ let Inst{15-12} = cmode;
+ let DecoderMethod = "DecodeModImmInstruction";
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+// Common encoding for by-element ("indexed") instructions with an untied
+// destination: $Rd = op($Rn, $Rm[$idx]).
+//   Q, U, Scalar, size, opc - go to Inst{30}, {29}, {28}, {23-22}, {15-12}.
+//   dst_reg/lhs_reg/rhs_reg - operand classes of $Rd, $Rn and $Rm.
+//   vec_idx                 - operand class of the lane index $idx.
+//   dst_kind/lhs_kind/rhs_kind - per-operand suffixes used in the first
+//                                assembly variant.
+//   apple_kind              - suffix attached to the mnemonic in the second
+//                             assembly variant.
+// The lane index is not encoded here; see the notes on bits 21 and 11 below.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+ RegisterOperand dst_reg, RegisterOperand lhs_reg,
+ RegisterOperand rhs_reg, Operand vec_idx, string asm,
+ string apple_kind, string dst_kind, string lhs_kind,
+ string rhs_kind, list<dag> pattern>
+ : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx),
+ asm,
+ "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+ "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28} = Scalar;
+ let Inst{27-24} = 0b1111;
+ let Inst{23-22} = size;
+ // Bit 21 must be set by the derived class.
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opc;
+ // Bit 11 must be set by the derived class.
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Tied-destination counterpart of BaseSIMDIndexed, for by-element
+// instructions that also read their destination: $Rd is listed as an input
+// (class dst_reg) and tied to the output $dst ("$Rd = $dst"). Parameters and
+// bit layout are otherwise the same; bits 21 and 11 are again left to the
+// derived class.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+ RegisterOperand dst_reg, RegisterOperand lhs_reg,
+ RegisterOperand rhs_reg, Operand vec_idx, string asm,
+ string apple_kind, string dst_kind, string lhs_kind,
+ string rhs_kind, list<dag> pattern>
+ : I<(outs dst_reg:$dst),
+ (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
+ "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+ "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28} = Scalar;
+ let Inst{27-24} = 0b1111;
+ let Inst{23-22} = size;
+ // Bit 21 must be set by the derived class.
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opc;
+ // Bit 11 must be set by the derived class.
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Floating-point by-element instructions with an untied destination, each
+// with a selection pattern that applies OpNode to $Rn and a duplicated
+// (vector forms) or extracted (scalar forms) lane of $Rm.
+// Lane-index encoding used by every def below:
+//   .h lanes: idx{2} -> bit 11, idx{1} -> bit 21, idx{0} -> bit 20
+//   .s lanes: idx{1} -> bit 11, idx{0} -> bit 21
+//   .d lanes: idx{0} -> bit 11, bit 21 fixed to 0
+// For .h lanes, bit 20 overrides the top bit of the Rm field assigned in the
+// base class; those defs use V128_lo for $Rm.
+// NOTE(review): presumably V128_lo is limited to registers whose encoding fits
+// in the remaining four bits -- confirm against the register class definition.
+// The half-precision defs are guarded by [HasNEON, HasFullFP16].
+multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
+ V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4f16 V64:$Rd),
+ (OpNode (v4f16 V64:$Rn),
+ (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8f16 V128:$Rd),
+ (OpNode (v8f16 V128:$Rn),
+ (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2f32 V64:$Rd),
+ (OpNode (v2f32 V64:$Rn),
+ (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4f32 V128:$Rd),
+ (OpNode (v4f32 V128:$Rn),
+ (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc,
+ V128, V128,
+ V128, VectorIndexD,
+ asm, ".2d", ".2d", ".2d", ".d",
+ [(set (v2f64 V128:$Rd),
+ (OpNode (v2f64 V128:$Rn),
+ (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+
+ // Scalar forms (Scalar bit = 1): the second operand is a single lane
+ // extracted from $Rm.
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc,
+ FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h",
+ [(set (f16 FPR16Op:$Rd),
+ (OpNode (f16 FPR16Op:$Rn),
+ (f16 (vector_extract (v8f16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
+ def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (f32 FPR32Op:$Rd),
+ (OpNode (f32 FPR32Op:$Rn),
+ (f32 (vector_extract (v4f32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc,
+ FPR64Op, FPR64Op, V128, VectorIndexD,
+ asm, ".d", "", "", ".d",
+ [(set (f64 FPR64Op:$Rd),
+ (OpNode (f64 FPR64Op:$Rn),
+ (f64 (vector_extract (v2f64 V128:$Rm),
+ VectorIndexD:$idx))))]> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+}
+
+// Selection patterns for the tied floating-point by-element instructions
+// whose records are named INST # "<variant>_indexed".
+//   INST   - name prefix of the instruction records to select.
+//   OpNode - three-operand node: accumulator $Rd, multiplicand $Rn, and a
+//            value derived from $Rm.
+// The third operand is matched either as a lane duplicated or extracted from
+// a vector register (selected with the matching lane index) or as a scalar
+// splat (the scalar is placed into a vector register with SUBREG_TO_REG and
+// lane 0 is selected).
+multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
+ // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (AArch64duplane32 (v4f32 V128:$Rm),
+ VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v2i32_indexed")
+ V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (AArch64dup (f32 FPR32Op:$Rm)))),
+ (!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+
+ // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar.
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (AArch64duplane32 (v4f32 V128:$Rm),
+ VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v4i32_indexed")
+ V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (AArch64dup (f32 FPR32Op:$Rm)))),
+ (!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+ // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar.
+ // The lane index is a doubleword index in both the matched node and the
+ // selected instruction, so VectorIndexD is used on both sides.
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (AArch64duplane64 (v2f64 V128:$Rm),
+ VectorIndexD:$idx))),
+ (!cast<Instruction>(INST # "v2i64_indexed")
+ V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexD:$idx)>;
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (AArch64dup (f64 FPR64Op:$Rm)))),
+ (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+ // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+ V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+ // 1 variant for 64-bit scalar version: extract from .2d
+ def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+ (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
+ (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
+ V128:$Rm, VectorIndexD:$idx)>;
+}
+
+// Floating-point by-element instructions with a tied destination. The defs
+// have the same names, operand classes and lane-index encoding as the untied
+// floating-point multiclass, but are built from BaseSIMDIndexedTied and carry
+// no inline selection patterns (every pattern list is empty).
+// Lane-index encoding:
+//   .h lanes: idx{2} -> bit 11, idx{1} -> bit 21, idx{0} -> bit 20
+//   .s lanes: idx{1} -> bit 11, idx{0} -> bit 21
+//   .d lanes: idx{0} -> bit 11, bit 21 fixed to 0
+multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
+ V128, V128,
+ V128, VectorIndexD,
+ asm, ".2d", ".2d", ".2d", ".d", []> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+
+ // Scalar forms (Scalar bit = 1).
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc,
+ FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
+ def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
+ FPR64Op, FPR64Op, V128, VectorIndexD,
+ asm, ".d", "", "", ".d", []> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+}
+
+// Integer by-element instructions (untied) for halfword and word elements:
+// four vector forms (.4h, .8h, .2s, .4s) plus scalar .h and .s forms.
+// Vector patterns apply OpNode to $Rn and a duplicated lane of $Rm. The
+// scalar .s form has a pattern on an extracted i32 lane; the scalar .h form
+// has no pattern.
+// Lane-index encoding:
+//   .h lanes: idx{2} -> bit 11, idx{1} -> bit 21, idx{0} -> bit 20
+//   .s lanes: idx{1} -> bit 11, idx{0} -> bit 21
+multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$Rd),
+ (OpNode (v8i16 V128:$Rn),
+ (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ // Scalar forms (Scalar bit = 1).
+ def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+ FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (i32 FPR32Op:$Rd),
+ (OpNode FPR32Op:$Rn,
+ (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+// Integer by-element instructions (untied), vector forms only: .4h, .8h, .2s
+// and .4s. Each pattern applies OpNode to $Rn and a duplicated lane of $Rm.
+// Lane-index encoding:
+//   .h lanes: idx{2} -> bit 11, idx{1} -> bit 21, idx{0} -> bit 20
+//   .s lanes: idx{1} -> bit 11, idx{0} -> bit 21
+multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+ V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$Rd),
+ (OpNode (v8i16 V128:$Rn),
+ (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+// Integer by-element instructions with a tied destination, vector forms only
+// (.4h, .8h, .2s, .4s). Each pattern sets $dst to OpNode applied to the
+// incoming destination value $Rd, $Rn and a duplicated lane of $Rm.
+// Lane-index encoding:
+//   .h lanes: idx{2} -> bit 11, idx{1} -> bit 21, idx{0} -> bit 20
+//   .s lanes: idx{1} -> bit 11, idx{0} -> bit 21
+multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+// Widening ("long") integer by-element instructions with an untied
+// destination: the result elements are twice as wide as the sources
+// (.4h/.8h -> .4s, .2s/.4s -> .2d). The Q=1 forms use the mnemonic asm#"2"
+// and their patterns operate on the high halves of $Rn and of the duplicated
+// lane (extract_high_*). Two scalar forms (h -> s and s -> d) are defined
+// without patterns.
+// Lane-index encoding:
+//   .h lanes: idx{2} -> bit 11, idx{1} -> bit 21, idx{0} -> bit 20
+//   .s lanes: idx{1} -> bit 11, idx{0} -> bit 21
+multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ // Scalar forms (Scalar bit = 1); the def name reflects the wider
+ // destination register class.
+ def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+ FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+ FPR64Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+// Widening by-element instructions with a tied accumulator whose patterns
+// combine the accumulator $Rd with an int_aarch64_neon_sqdmull product via
+// the `Accum` operator.
+//   U, opc - encoding bits forwarded to BaseSIMDIndexedTied.
+//   asm    - mnemonic; the Q=1 vector forms use asm#"2" and operate on the
+//            high halves of $Rn and of the duplicated lane.
+//   Accum  - operator combining the accumulator with the product.
+// Lane-index encoding:
+//   .h lanes: idx{2} -> bit 11, idx{1} -> bit 21, idx{0} -> bit 20
+//   .s lanes: idx{1} -> bit 11, idx{0} -> bit 21
+multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
+ SDPatternOperator Accum> {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull
+ (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
+ // intermediate EXTRACT_SUBREG would be untyped.
+ // Instead, an i32 accumulate of lane 0 of the vector product is selected as
+ // the .4h vector instruction: the scalar accumulator is moved into a vector
+ // register with SUBREG_TO_REG and the result is read back from ssub.
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+ (i32 (vector_extract (v4i32
+ (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx)))),
+ (i64 0))))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME # "v4i16_indexed")
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
+ V128_lo:$Rm, VectorIndexH:$idx),
+ ssub)>;
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull
+ (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16
+ (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull
+ (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull
+ (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32
+ (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ // Scalar forms (Scalar bit = 1). The h -> s form has no pattern; the
+ // s -> d form is matched through int_aarch64_neon_sqdmulls_scalar.
+ def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
+ FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+
+ def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+ FPR64Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (i64 FPR64Op:$dst),
+ (Accum (i64 FPR64Op:$Rd),
+ (i64 (int_aarch64_neon_sqdmulls_scalar
+ (i32 FPR32Op:$Rn),
+ (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,  // By-element long forms: $Rd = OpNode($Rn, lane $idx of $Rm duplicated).
+ SDPatternOperator OpNode> {  // OpNode takes two narrow vectors and yields the wide result type.
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;  // H-lane index: three bits placed at Inst{11}, Inst{21}, Inst{20}.
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,  // "2" form: operands go through extract_high_v8i16.
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;  // S-lane index: two bits placed at Inst{11} and Inst{21}.
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,  // "2" form: operands go through extract_high_v4i32.
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+ }
+}
+
+multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,  // Accumulating by-element long forms: $dst = OpNode($Rd, $Rn, lane $idx of $Rm).
+ SDPatternOperator OpNode> {  // OpNode is three-operand here: the wide $Rd input comes first.
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;  // H-lane index: three bits placed at Inst{11}, Inst{21}, Inst{20}.
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,  // "2" form: narrow operands go through extract_high_v8i16.
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd),
+ (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;  // S-lane index: two bits placed at Inst{11} and Inst{21}.
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,  // "2" form: narrow operands go through extract_high_v4i32.
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd),
+ (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+ }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift by immediate
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,  // Encoding skeleton for a scalar shift-by-immediate: $Rd <- op($Rn, $imm).
+ RegisterClass regtype1, RegisterClass regtype2,  // regtype1 is the destination class, regtype2 the source class.
+ Operand immtype, string asm, list<dag> pattern>
+ : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
+ asm, "\t$Rd, $Rn, $imm", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<7> imm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b111110;
+ let Inst{22-16} = fixed_imm;  // Users pass '?' for the low bits and then assign imm bits over them.
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,  // Same encoding as BaseSIMDScalarShift, but $Rd is also an input.
+ RegisterClass regtype1, RegisterClass regtype2,
+ Operand immtype, string asm, list<dag> pattern>
+ : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
+ asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,  // Constraint ties the $Rd input to the $dst output; assembly prints $Rd.
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<7> imm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b111110;
+ let Inst{22-16} = fixed_imm;  // Users pass '?' for the low bits and then assign imm bits over them.
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;  // Only Rd is encoded; there is no separate field for $dst.
+}
+
+
+multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> {  // Scalar right-shift-immediate defs on FPR16/FPR32/FPR64; none carries a pattern.
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},  // Fixed prefix 001, then 4 immediate bits.
+ FPR16, FPR16, vecshiftR16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},  // Fixed prefix 01, then 5 immediate bits.
+ FPR32, FPR32, vecshiftR32, asm, []> {
+ let Inst{20-16} = imm{4-0};
+ }
+
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},  // Fixed prefix 1, then 6 immediate bits.
+ FPR64, FPR64, vecshiftR64, asm, []> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm,  // 64-bit scalar right shift by immediate, selectable for i64 and v1i64.
+ SDPatternOperator OpNode> {
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm,
+ [(set (i64 FPR64:$Rd),
+ (OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};  // Six immediate bits fill the '?' positions of fixed_imm.
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))),  // Same node on v1i64 maps to the "d" instruction too.
+ (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm,  // 64-bit scalar right shift with $Rd as an extra input: $dst = OpNode($Rd, $Rn, $imm).
+ SDPatternOperator OpNode = null_frag> {  // OpNode defaults to null_frag.
+ def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm,
+ [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn),
+ (i32 vecshiftR64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};  // Six immediate bits fill the '?' positions of fixed_imm.
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),  // Same node on v1i64 maps to the "d" instruction too.
+ (i32 vecshiftR64:$imm))),
+ (!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn,
+ vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm,  // 64-bit scalar left shift by immediate.
+ SDPatternOperator OpNode> {
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftL64, asm,
+ [(set (v1i64 FPR64:$Rd),  // Pattern is typed v1i64 only; this multiclass adds no i64 pattern.
+ (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};  // Six immediate bits fill the '?' positions of fixed_imm.
+ }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in  // Same three flags that BaseSIMDScalarShiftTied is already declared under.
+multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> {  // 64-bit scalar left shift with $Rd as an extra input.
+ def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftL64, asm, []> {  // Empty pattern list: nothing is selected from this def directly.
+ let Inst{21-16} = imm{5-0};  // Six immediate bits fill the '?' positions of fixed_imm.
+ }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm,  // Scalar right shifts whose destination class is half the width of the source class.
+ SDPatternOperator OpNode = null_frag> {  // OpNode is used only by the "s" def.
+ def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},  // FPR8 <- FPR16, 3 immediate bits.
+ FPR8, FPR16, vecshiftR8, asm, []> {
+ let Inst{18-16} = imm{2-0};
+ }
+
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},  // FPR16 <- FPR32, 4 immediate bits.
+ FPR16, FPR32, vecshiftR16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},  // FPR32 <- FPR64, 5 immediate bits; the only def with a pattern.
+ FPR32, FPR64, vecshiftR32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> {
+ let Inst{20-16} = imm{4-0};
+ }
+}
+
+multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,  // Scalar left shift by immediate for 8/16/32/64-bit registers.
+ SDPatternOperator OpNode> {  // Patterns exist for the "s" (i32) and "d" (i64, v1i64) forms only.
+ def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},  // 3 immediate bits; no pattern.
+ FPR8, FPR8, vecshiftL8, asm, []> {
+ let Inst{18-16} = imm{2-0};
+ }
+
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},  // 4 immediate bits; no pattern.
+ FPR16, FPR16, vecshiftL16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},  // 5 immediate bits.
+ FPR32, FPR32, vecshiftL32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> {
+ let Inst{20-16} = imm{4-0};
+ }
+
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},  // 6 immediate bits.
+ FPR64, FPR64, vecshiftL64, asm,
+ [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))),  // Same node on v1i64 maps to the "d" instruction too.
+ (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {  // Scalar right shift by immediate for 8/16/32/64-bit registers; no def has a pattern.
+ def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},  // 3 immediate bits.
+ FPR8, FPR8, vecshiftR8, asm, []> {
+ let Inst{18-16} = imm{2-0};
+ }
+
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},  // 4 immediate bits.
+ FPR16, FPR16, vecshiftR16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},  // 5 immediate bits.
+ FPR32, FPR32, vecshiftR32, asm, []> {
+ let Inst{20-16} = imm{4-0};
+ }
+
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},  // 6 immediate bits.
+ FPR64, FPR64, vecshiftR64, asm, []> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector shift by immediate
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,  // Encoding skeleton for a vector shift-by-immediate: $Rd <- op($Rn, $imm).
+ RegisterOperand dst_reg, RegisterOperand src_reg,
+ Operand immtype,
+ string asm, string dst_kind, string src_kind,  // Kind strings are layout suffixes such as ".8b" supplied by the users of this class.
+ list<dag> pattern>
+ : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
+ asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #  // First syntax variant: each register carries its own kind suffix.
+ "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,  // Second variant: dst_kind is placed before the operand list.
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b011110;
+ let Inst{22-16} = fixed_imm;  // No imm field is declared here; each def declares its own and assigns it over the '?' bits.
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,  // Same encoding as BaseSIMDVectorShift, but $Rd is also an input.
+ RegisterOperand vectype1, RegisterOperand vectype2,  // vectype1 serves both $dst and the $Rd input; vectype2 is $Rn.
+ Operand immtype,
+ string asm, string dst_kind, string src_kind,
+ list<dag> pattern>
+ : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
+ asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #  // First syntax variant: each register carries its own kind suffix.
+ "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,  // Constraint ties the $Rd input to the $dst output.
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b011110;
+ let Inst{22-16} = fixed_imm;  // No imm field is declared here; each def declares its own and assigns it over the '?' bits.
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;  // Only Rd is encoded; there is no separate field for $dst.
+}
+
+multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,  // Right-shift-immediate forms whose patterns take an FP vector and produce an integer vector.
+ Intrinsic OpNode> {  // Patterns match the immediate with the generic `imm` leaf rather than the vecshiftR operand.
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},  // v4f16 -> v4i16.
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},  // v8f16 -> v8i16.
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},  // v2f32 -> v2i32.
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},  // v4f32 -> v4i32.
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},  // v2f64 -> v2i64.
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm,  // Right-shift-immediate forms whose patterns take an integer vector and produce an FP vector.
+ Intrinsic OpNode> {  // Patterns match the immediate with the generic `imm` leaf rather than the vecshiftR operand.
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},  // v4i16 -> v4f16.
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},  // v8i16 -> v8f16.
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},  // v2i32 -> v2f32.
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},  // v4i32 -> v4f32.
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},  // v2i64 -> v2f64.
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,  // Narrowing right shifts: a 128-bit source yields half-width elements.
+ SDPatternOperator OpNode> {  // Q=0 defs write a V64 result; Q=1 ("2") defs are tied and get their patterns from the Pats at the end.
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V128, vecshiftR16Narrow,
+ asm, ".8b", ".8h",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR16Narrow,
+ asm#"2", ".16b", ".8h", []> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ let hasSideEffects = 0;  // BaseSIMDVectorShiftTied is already declared under hasSideEffects = 0.
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V128, vecshiftR32Narrow,
+ asm, ".4h", ".4s",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR32Narrow,
+ asm#"2", ".8h", ".4s", []> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V128, vecshiftR64Narrow,
+ asm, ".2s", ".2d",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR64Narrow,
+ asm#"2", ".4s", ".2d", []> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ // TableGen doesn't like patterns w/ INSERT_SUBREG on the instructions
+ // themselves, so put them here instead.
+
+ // Patterns involving what's effectively an insert high and a normal
+ // intrinsic, represented by CONCAT_VECTORS.
+ def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn),
+ vecshiftR16Narrow:$imm)),
+ (!cast<Instruction>(NAME # "v16i8_shift")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),  // Existing low half becomes dsub of the tied 128-bit input.
+ V128:$Rn, vecshiftR16Narrow:$imm)>;
+ def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn),
+ vecshiftR32Narrow:$imm)),
+ (!cast<Instruction>(NAME # "v8i16_shift")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR32Narrow:$imm)>;
+ def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn),
+ vecshiftR64Narrow:$imm)),
+ (!cast<Instruction>(NAME # "v4i32_shift")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR64Narrow:$imm)>;
+}
+
+multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm,  // Vector left shift by immediate for all seven layouts; source and result share one type.
+ SDPatternOperator OpNode> {  // Each pattern is $Rd = OpNode($Rn, i32 $imm).
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},  // 8-bit elements: 3 immediate bits.
+ V64, V64, vecshiftL8,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftL8,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},  // 16-bit elements: 4 immediate bits.
+ V64, V64, vecshiftL16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftL16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},  // 32-bit elements: 5 immediate bits.
+ V64, V64, vecshiftL32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftL32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},  // 64-bit elements: 6 immediate bits; only a Q=1 form is defined.
+ V128, V128, vecshiftL64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+ (i32 vecshiftL64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm,  // Vector right shift by immediate for all seven layouts; source and result share one type.
+ SDPatternOperator OpNode> {  // Each pattern is $Rd = OpNode($Rn, i32 $imm).
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},  // 8-bit elements: 3 immediate bits.
+ V64, V64, vecshiftR8,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR8,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},  // 16-bit elements: 4 immediate bits.
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},  // 32-bit elements: 5 immediate bits.
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},  // 64-bit elements: 6 immediate bits; only a Q=1 form is defined.
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+ (i32 vecshiftR64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm,  // Vector right shift by immediate with $Rd as an extra input, all seven layouts.
+ SDPatternOperator OpNode = null_frag> {  // Each pattern is $dst = OpNode($Rd, $Rn, i32 $imm); OpNode defaults to null_frag.
+ def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},  // 8-bit elements: 3 immediate bits.
+ V64, V64, vecshiftR8, asm, ".8b", ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR8, asm, ".16b", ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},  // 16-bit elements: 4 immediate bits.
+ V64, V64, vecshiftR16, asm, ".4h", ".4h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16, asm, ".8h", ".8h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},  // 32-bit elements: 5 immediate bits.
+ V64, V64, vecshiftR32, asm, ".2s", ".2s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32, asm, ".4s", ".4s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},  // 64-bit elements: 6 immediate bits; only a Q=1 form is defined.
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d", [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+ (i32 vecshiftR64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm,  // Vector left shift by immediate with $Rd as an extra input, all seven layouts.
+ SDPatternOperator OpNode = null_frag> {  // Each pattern is $dst = OpNode($Rd, $Rn, i32 $imm); OpNode defaults to null_frag.
+ def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},  // 8-bit elements: 3 immediate bits.
+ V64, V64, vecshiftL8,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftL8,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},  // 16-bit elements: 4 immediate bits.
+ V64, V64, vecshiftL16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftL16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},  // 32-bit elements: 5 immediate bits.
+ V64, V64, vecshiftL32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftL32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},  // 64-bit elements: 6 immediate bits; only a Q=1 form is defined.
+ V128, V128, vecshiftL64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+ (i32 vecshiftL64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,  // Widening left shifts: the result elements are twice the width of the source elements.
+ SDPatternOperator OpNode> {  // Q=0 defs read a V64 source; Q=1 ("2") defs read the high half of a V128 via extract_high_*.
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},  // v8i8 -> v8i16.
+ V128, V64, vecshiftL8, asm, ".8h", ".8b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},  // high half of v16i8 -> v8i16.
+ V128, V128, vecshiftL8,
+ asm#"2", ".8h", ".16b",
+ [(set (v8i16 V128:$Rd),
+ (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},  // v4i16 -> v4i32.
+ V128, V64, vecshiftL16, asm, ".4s", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},  // high half of v8i16 -> v4i32.
+ V128, V128, vecshiftL16,
+ asm#"2", ".4s", ".8h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
+
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},  // v2i32 -> v2i64.
+ V128, V64, vecshiftL32, asm, ".2d", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},  // high half of v4i32 -> v2i64.
+ V128, V128, vecshiftL32,
+ asm#"2", ".2d", ".4s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+}
+
+
+//---
+// Vector load/store
+//---
+// SIMD ldX/stX no-index memory references don't allow the optional
+// ", #0" constant and handle post-indexing explicitly, so we use
+// a more specialized parse method for them. Otherwise, it's the same as
+// the general GPR64sp handling.
+
+class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size,  // Encoding skeleton for a multi-register vector load/store with plain [$Rn] addressing.
+ string asm, dag oops, dag iops, list<dag> pattern>  // The caller supplies the operand dags, so one class serves both loads and stores.
+ : I<oops, iops, asm, "\t$Vt, [$Rn]", "", pattern> {
+ bits<5> Vt;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-23} = 0b0011000;
+ let Inst{22} = L;  // Users in this file pass 1 for loads and 0 for stores.
+ let Inst{21-16} = 0b000000;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = size;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Vt;
+}
+
+class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,  // Post-indexed counterpart of BaseSIMDLdSt: "[$Rn], $Xm" addressing, no pattern.
+ string asm, dag oops, dag iops>
+ : I<oops, iops, asm, "\t$Vt, [$Rn], $Xm", "$Rn = $wback", []> {  // Constraint ties the base register input to the $wback output.
+ bits<5> Vt;
+ bits<5> Rn;
+ bits<5> Xm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-23} = 0b0011001;  // Differs from BaseSIMDLdSt (0b0011000) in the lowest bit of this field.
+ let Inst{22} = L;
+ let Inst{21} = 0;
+ let Inst{20-16} = Xm;  // Post-index register occupies the bits that BaseSIMDLdSt fixes to zero.
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = size;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Vt;
+}
+
+// The immediate form of AdvSIMD post-indexed addressing is encoded with
+// register post-index addressing from the zero register.
+multiclass SIMDLdStAliases<string asm, string layout, string Count,  // Assembly aliases for one layout of an ldN/stN family (instruction names are NAME # Count # "v" # layout).
+ int Offset, int Size> {  // Offset names the immediate and the GPR64pi<Offset> operand; Size picks the VecList<Count><Size> operand.
+ // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
+ // "ld1\t$Vt, [$Rn], #16"
+ // may get mapped to
+ // (LD1Twov8b_POST GPR64sp:$Rn, VecListTwo8b:$Vt, XZR)
+ def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+ XZR), 1>;  // Last argument is 1 here, 0 on the layout-suffixed aliases below (presumably the emit flag; confirm against InstAlias).
+
+ // E.g. "ld1.8b { v0, v1 }, [x1], #16"
+ // "ld1.8b\t$Vt, [$Rn], #16"
+ // may get mapped to
+ // (LD1Twov8b_POST GPR64sp:$Rn, VecListTwo64:$Vt, XZR)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ XZR), 0>;
+
+ // E.g. "ld1.8b { v0, v1 }, [x1]"
+ // "ld1.8b\t$Vt, [$Rn]"
+ // may get mapped to
+ // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+ (!cast<Instruction>(NAME # Count # "v" # layout)
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ GPR64sp:$Rn), 0>;
+
+ // E.g. "ld1.8b { v0, v1 }, [x1], x2"
+ // "ld1.8b\t$Vt, [$Rn], $Xm"
+ // may get mapped to
+ // (LD1Twov8b_POST GPR64sp:$Rn, VecListTwo64:$Vt, GPR64pi16:$Xm)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+ (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,  // Load family: plain and _POST defs for seven layouts, plus their assembly aliases.
+ int Offset64, bits<4> opcode> {  // Offset128 is used for the Q=1 layouts, Offset64 for the Q=0 layouts.
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,  // Q=1 layouts: size 0b00..0b11 selects 16b/8h/4s/2d.
+ (outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm,
+ (outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm,
+ (outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm,
+ (outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm,  // Q=0 layouts: size 0b00..0b10 selects 8b/4h/2s; no 0b11 form is defined here.
+ (outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm,
+ (outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm,
+ (outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+
+
+ def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm,  // Post-indexed forms: $wback is the first output, ahead of $Vt.
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "16b"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "8h"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "4s"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "2d"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm,  // Q=0 post-indexed forms use GPR64pi<Offset64>.
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "8b"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "4h"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "2s"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;  // One alias set per layout; Size is 128 for Q=1 layouts and 64 for Q=0 layouts.
+ defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+// Only ld1/st1 have a v1d version.
+multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
+ int Offset64, bits<4> opcode> { // Store-multiple family: seven layouts, each as a plain and a _POST def, plus asm aliases.
+ let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { // flags apply to every def up to the matching brace
+ def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), // 128-bit layouts (16b/8h/4s/2d) pass 1 as the first argument
+ (ins !cast<RegisterOperand>(veclist # "16b"):$Vt, // operand class is looked up by name: veclist # layout
+ GPR64sp:$Rn), []>;
+ def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), // 64-bit layouts (8b/4h/2s) pass 0 as the first argument
+ (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+ GPR64sp:$Rn), []>;
+
+ def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, // _POST defs add a $wback result and an $Xm operand
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>; // 128-bit layouts use the GPR64pi<Offset128> operand
+ def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; // 64-bit layouts use the GPR64pi<Offset64> operand
+ def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>; // one alias set per layout; last argument is 128 or 64 (presumably the list width in bits — SIMDLdStAliases is defined outside this chunk)
+ defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDLd1<string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode>
+ : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> { // inherits every def of BaseSIMDLdN with the same arguments
+
+ // LD1 instructions have extra "1d" variants.
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { // both defs are marked as loads only
+ def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, // same 0b11 argument as v2d, but with 0 as the first argument
+ (outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+
+ def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "1d"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; // 1d uses the 64-bit offset operand
+ }
+
+ defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>; // alias set for the extra layout
+}
+
+multiclass BaseSIMDSt1<string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode>
+ : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> { // inherits every def of BaseSIMDStN with the same arguments
+
+ // ST1 instructions have extra "1d" variants.
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { // both defs are marked as stores only
+ def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), // same 0b11 argument as v2d, but with 0 as the first argument
+ (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+ GPR64sp:$Rn), []>;
+
+ def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm,
+ (outs GPR64sp:$wback), // the only result of a post-indexed store is the written-back base
+ (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; // 1d uses the 64-bit offset operand
+ }
+
+ defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>; // alias set for the extra layout
+}
+
+multiclass SIMDLd1Multiple<string asm> { // LD1 with 1-4 register lists; each count has its own opcode value
+ defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; // Offset128 / Offset64 = 1*16 / 1*8
+ defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; // 2*16 / 2*8
+ defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; // 3*16 / 3*8
+ defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; // 4*16 / 4*8
+}
+
+multiclass SIMDSt1Multiple<string asm> { // ST1 with 1-4 register lists; same offsets and opcode values as SIMDLd1Multiple
+ defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; // Offset128 / Offset64 = 1*16 / 1*8
+ defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; // 2*16 / 2*8
+ defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; // 3*16 / 3*8
+ defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; // 4*16 / 4*8
+}
+
+multiclass SIMDLd2Multiple<string asm> { // two-register lists only; uses BaseSIMDLdN directly, so no 1d layout
+ defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; // Offset128 = 32, Offset64 = 16
+}
+
+multiclass SIMDSt2Multiple<string asm> { // two-register lists only; uses BaseSIMDStN directly, so no 1d layout
+ defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; // Offset128 = 32, Offset64 = 16
+}
+
+multiclass SIMDLd3Multiple<string asm> { // three-register lists only; uses BaseSIMDLdN directly, so no 1d layout
+ defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; // Offset128 = 48, Offset64 = 24
+}
+
+multiclass SIMDSt3Multiple<string asm> { // three-register lists only; uses BaseSIMDStN directly, so no 1d layout
+ defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; // Offset128 = 48, Offset64 = 24
+}
+
+multiclass SIMDLd4Multiple<string asm> { // four-register lists only; uses BaseSIMDLdN directly, so no 1d layout
+ defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; // Offset128 = 64, Offset64 = 32
+}
+
+multiclass SIMDSt4Multiple<string asm> { // four-register lists only; uses BaseSIMDStN directly, so no 1d layout
+ defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; // Offset128 = 64, Offset64 = 32
+}
+
+//---
+// AdvSIMD Load/store single-element
+//---
+
+class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode, // common encoding base for the single-element load/store classes below
+ string asm, string operands, string cst,
+ dag oops, dag iops, list<dag> pattern>
+ : I<oops, iops, asm, operands, cst, pattern> { // cst is passed through unchanged as the constraint string
+ bits<5> Vt; // placed in Inst{4-0}
+ bits<5> Rn; // placed in Inst{9-5}
+ let Inst{31} = 0;
+ let Inst{29-24} = 0b001101;
+ let Inst{22} = L; // subclasses in this file pass 1 for loads and 0 for stores
+ let Inst{21} = R;
+ let Inst{15-13} = opcode;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Vt;
+} // Inst{30}, Inst{23}, Inst{20-16} and Inst{12-10} are not assigned here; the derived classes set them
+
+class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode, // same encoding as BaseSIMDLdStSingle, plus a tied-operand constraint
+ string asm, string operands, string cst,
+ dag oops, dag iops, list<dag> pattern>
+ : I<oops, iops, asm, operands, "$Vt = $dst," # cst, pattern> { // always ties $Vt to $dst, then appends the caller's constraint string
+ bits<5> Vt; // placed in Inst{4-0}
+ bits<5> Rn; // placed in Inst{9-5}
+ let Inst{31} = 0;
+ let Inst{29-24} = 0b001101;
+ let Inst{22} = L;
+ let Inst{21} = R;
+ let Inst{15-13} = opcode;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Vt;
+} // Inst{30}, Inst{23}, Inst{20-16} and Inst{12-10} are not assigned here; the derived classes set them
+
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in // load-only flags for the class that follows
+class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
+ Operand listtype>
+ : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "", // L = 1; no operand constraints
+ (outs listtype:$Vt), (ins GPR64sp:$Rn),
+ []> { // no selection pattern attached
+ let Inst{30} = Q;
+ let Inst{23} = 0; // BaseSIMDLdRPost sets this bit to 1
+ let Inst{20-16} = 0b00000; // BaseSIMDLdRPost places Xm in this field
+ let Inst{12} = S;
+ let Inst{11-10} = size;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in // load-only flags for the class that follows
+class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
+ string asm, Operand listtype, Operand GPR64pi>
+ : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm", // L = 1; post-indexed operand string
+ "$Rn = $wback", // the base register input is tied to the $wback result
+ (outs GPR64sp:$wback, listtype:$Vt),
+ (ins GPR64sp:$Rn, GPR64pi:$Xm), []> {
+ bits<5> Xm; // placed in Inst{20-16}
+ let Inst{30} = Q;
+ let Inst{23} = 1; // BaseSIMDLdR sets this bit to 0
+ let Inst{20-16} = Xm;
+ let Inst{12} = S;
+ let Inst{11-10} = size;
+}
+
+multiclass SIMDLdrAliases<string asm, string layout, string Count,
+ int Offset, int Size> { // four InstAliases for one layout; Size selects the untyped VecList<Count><Size> operand
+ // E.g. "ld1r { v0.8b }, [x1], #1"
+ // "ld1r\t$Vt, [$Rn], #1"
+ // may get mapped to
+ // (LD1Rv8b_POST GPR64sp:$Rn, VecListOne8b:$Vt, XZR)
+ def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+ XZR), 1>; // the only alias here that passes 1 as the last InstAlias argument; XZR fills the $Xm slot
+
+ // E.g. "ld1r.8b { v0 }, [x1], #1"
+ // "ld1r.8b\t$Vt, [$Rn], #1"
+ // may get mapped to
+ // (LD1Rv8b_POST GPR64sp:$Rn, VecListOne64:$Vt, XZR)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ XZR), 0>;
+
+ // E.g. "ld1r.8b { v0 }, [x1]"
+ // "ld1r.8b\t$Vt, [$Rn]"
+ // may get mapped to
+ // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+ (!cast<Instruction>(NAME # "v" # layout)
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ GPR64sp:$Rn), 0>;
+
+ // E.g. "ld1r.8b { v0 }, [x1], x2"
+ // "ld1r.8b\t$Vt, [$Rn], $Xm"
+ // may get mapped to
+ // (LD1Rv8b_POST GPR64sp:$Rn, VecListOne64:$Vt, GPR64pi1:$Xm)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+ (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
+ int Offset1, int Offset2, int Offset4, int Offset8> { // Offset1/2/4/8 are used for the b/h/s/d layouts respectively
+ def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, // Q = 0 for the 8b/4h/2s/1d layouts; size 0b00 for b
+ !cast<Operand>("VecList" # Count # "8b")>;
+ def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, // Q = 1 for the 16b/8h/4s/2d layouts
+ !cast<Operand>("VecList" # Count #"16b")>;
+ def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, // size 0b01 for h
+ !cast<Operand>("VecList" # Count #"4h")>;
+ def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count #"8h")>;
+ def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, // size 0b10 for s
+ !cast<Operand>("VecList" # Count #"2s")>;
+ def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count #"4s")>;
+ def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, // size 0b11 for d
+ !cast<Operand>("VecList" # Count #"1d")>;
+ def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count #"2d")>;
+
+ def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, // post-indexed twins of the defs above, same Q/size values
+ !cast<Operand>("VecList" # Count # "8b"),
+ !cast<Operand>("GPR64pi" # Offset1)>; // increment operand chosen by element size
+ def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
+ !cast<Operand>("VecList" # Count # "16b"),
+ !cast<Operand>("GPR64pi" # Offset1)>;
+ def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count # "4h"),
+ !cast<Operand>("GPR64pi" # Offset2)>;
+ def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count # "8h"),
+ !cast<Operand>("GPR64pi" # Offset2)>;
+ def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count # "2s"),
+ !cast<Operand>("GPR64pi" # Offset4)>;
+ def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count # "4s"),
+ !cast<Operand>("GPR64pi" # Offset4)>;
+ def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count # "1d"),
+ !cast<Operand>("GPR64pi" # Offset8)>;
+ def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count # "2d"),
+ !cast<Operand>("GPR64pi" # Offset8)>;
+
+ defm : SIMDLdrAliases<asm, "8b", Count, Offset1, 64>; // Size argument is 64 for the Q = 0 layouts, 128 for the Q = 1 layouts
+ defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>;
+ defm : SIMDLdrAliases<asm, "4h", Count, Offset2, 64>;
+ defm : SIMDLdrAliases<asm, "8h", Count, Offset2, 128>;
+ defm : SIMDLdrAliases<asm, "2s", Count, Offset4, 64>;
+ defm : SIMDLdrAliases<asm, "4s", Count, Offset4, 128>;
+ defm : SIMDLdrAliases<asm, "1d", Count, Offset8, 64>;
+ defm : SIMDLdrAliases<asm, "2d", Count, Offset8, 128>;
+}
+
+class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+ pattern> { // indexed single-element form with a 4-bit lane index, no post-index, no constraints
+ // idx encoded in Q:S:size fields.
+ bits<4> idx; // spread over Inst{30}, Inst{12} and Inst{11-10}
+ let Inst{30} = idx{3};
+ let Inst{23} = 0; // SIMDLdStSingleBPost sets this bit to 1
+ let Inst{20-16} = 0b00000; // SIMDLdStSingleBPost places Xm here
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", // base class adds the "$Vt = $dst" tie
+ oops, iops, pattern> { // same bit layout as SIMDLdStSingleB
+ // idx encoded in Q:S:size fields.
+ bits<4> idx; // spread over Inst{30}, Inst{12} and Inst{11-10}
+ let Inst{30} = idx{3};
+ let Inst{23} = 0; // SIMDLdStSingleBTiedPost sets this bit to 1
+ let Inst{20-16} = 0b00000; // SIMDLdStSingleBTiedPost places Xm here
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops> // takes no pattern argument; an empty list is passed to the base
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> { // base register input tied to the $wback result
+ // idx encoded in Q:S:size fields.
+ bits<4> idx; // spread over Inst{30}, Inst{12} and Inst{11-10}
+ bits<5> Xm; // placed in Inst{20-16}
+ let Inst{30} = idx{3};
+ let Inst{23} = 1; // 0 in the non-post-indexed SIMDLdStSingleB
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops> // takes no pattern argument; an empty list is passed to the base
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> { // base prepends "$Vt = $dst," so both ties apply
+ // idx encoded in Q:S:size fields.
+ bits<4> idx; // spread over Inst{30}, Inst{12} and Inst{11-10}
+ bits<5> Xm; // placed in Inst{20-16}
+ let Inst{30} = idx{3};
+ let Inst{23} = 1; // 0 in the non-post-indexed SIMDLdStSingleBTied
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+
+class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+ pattern> { // indexed single-element form with a 3-bit lane index, no post-index, no constraints
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx; // spread over Inst{30}, Inst{12} and Inst{11}
+ let Inst{30} = idx{2};
+ let Inst{23} = 0; // SIMDLdStSingleHPost sets this bit to 1
+ let Inst{20-16} = 0b00000; // SIMDLdStSingleHPost places Xm here
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size; // the one bit not consumed by idx comes from the template argument
+}
+class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", // base class adds the "$Vt = $dst" tie
+ oops, iops, pattern> { // same bit layout as SIMDLdStSingleH
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx; // spread over Inst{30}, Inst{12} and Inst{11}
+ let Inst{30} = idx{2};
+ let Inst{23} = 0; // SIMDLdStSingleHTiedPost sets this bit to 1
+ let Inst{20-16} = 0b00000; // SIMDLdStSingleHTiedPost places Xm here
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size; // the one bit not consumed by idx comes from the template argument
+}
+
+class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops> // takes no pattern argument; an empty list is passed to the base
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> { // base register input tied to the $wback result
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx; // spread over Inst{30}, Inst{12} and Inst{11}
+ bits<5> Xm; // placed in Inst{20-16}
+ let Inst{30} = idx{2};
+ let Inst{23} = 1; // 0 in the non-post-indexed SIMDLdStSingleH
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size; // the one bit not consumed by idx comes from the template argument
+}
+class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops> // takes no pattern argument; an empty list is passed to the base
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> { // base prepends "$Vt = $dst," so both ties apply
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx; // spread over Inst{30}, Inst{12} and Inst{11}
+ bits<5> Xm; // placed in Inst{20-16}
+ let Inst{30} = idx{2};
+ let Inst{23} = 1; // 0 in the non-post-indexed SIMDLdStSingleHTied
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size; // the one bit not consumed by idx comes from the template argument
+}
+class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+ pattern> { // indexed single-element form with a 2-bit lane index, no post-index, no constraints
+ // idx encoded in Q:S fields.
+ bits<2> idx; // spread over Inst{30} and Inst{12}
+ let Inst{30} = idx{1};
+ let Inst{23} = 0; // SIMDLdStSingleSPost sets this bit to 1
+ let Inst{20-16} = 0b00000; // SIMDLdStSingleSPost places Xm here
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size; // both bits come from the template argument
+}
+class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", // base class adds the "$Vt = $dst" tie
+ oops, iops, pattern> { // same bit layout as SIMDLdStSingleS
+ // idx encoded in Q:S fields.
+ bits<2> idx; // spread over Inst{30} and Inst{12}
+ let Inst{30} = idx{1};
+ let Inst{23} = 0; // SIMDLdStSingleSTiedPost sets this bit to 1
+ let Inst{20-16} = 0b00000; // SIMDLdStSingleSTiedPost places Xm here
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size; // both bits come from the template argument
+}
+class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops> // takes no pattern argument; an empty list is passed to the base
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> { // base register input tied to the $wback result
+ // idx encoded in Q:S fields.
+ bits<2> idx; // spread over Inst{30} and Inst{12}
+ bits<5> Xm; // placed in Inst{20-16}
+ let Inst{30} = idx{1};
+ let Inst{23} = 1; // 0 in the non-post-indexed SIMDLdStSingleS
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size; // both bits come from the template argument
+}
+class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops> // takes no pattern argument; an empty list is passed to the base
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> { // base prepends "$Vt = $dst," so both ties apply
+ // idx encoded in Q:S fields.
+ bits<2> idx; // spread over Inst{30} and Inst{12}
+ bits<5> Xm; // placed in Inst{20-16}
+ let Inst{30} = idx{1};
+ let Inst{23} = 1; // 0 in the non-post-indexed SIMDLdStSingleSTied
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size; // both bits come from the template argument
+}
+class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+ pattern> { // indexed single-element form with a 1-bit lane index, no post-index, no constraints
+ // idx encoded in Q field.
+ bits<1> idx; // occupies Inst{30} only
+ let Inst{30} = idx;
+ let Inst{23} = 0; // SIMDLdStSingleDPost sets this bit to 1
+ let Inst{20-16} = 0b00000; // SIMDLdStSingleDPost places Xm here
+ let Inst{12} = 0; // fixed to 0 in this form; the B/H/S forms put an index bit here
+ let Inst{11-10} = size; // both bits come from the template argument
+}
+class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", // base class adds the "$Vt = $dst" tie
+ oops, iops, pattern> { // same bit layout as SIMDLdStSingleD
+ // idx encoded in Q field.
+ bits<1> idx; // occupies Inst{30} only
+ let Inst{30} = idx;
+ let Inst{23} = 0; // SIMDLdStSingleDTiedPost sets this bit to 1
+ let Inst{20-16} = 0b00000; // SIMDLdStSingleDTiedPost places Xm here
+ let Inst{12} = 0; // fixed to 0 in this form; the B/H/S forms put an index bit here
+ let Inst{11-10} = size; // both bits come from the template argument
+}
+class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops> // takes no pattern argument; an empty list is passed to the base
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> { // base register input tied to the $wback result
+ // idx encoded in Q field.
+ bits<1> idx; // occupies Inst{30} only
+ bits<5> Xm; // placed in Inst{20-16}
+ let Inst{30} = idx;
+ let Inst{23} = 1; // 0 in the non-post-indexed SIMDLdStSingleD
+ let Inst{20-16} = Xm;
+ let Inst{12} = 0; // fixed to 0 in this form; the B/H/S forms put an index bit here
+ let Inst{11-10} = size; // both bits come from the template argument
+}
+class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops> // takes no pattern argument; an empty list is passed to the base
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> { // base prepends "$Vt = $dst," so both ties apply
+ // idx encoded in Q field.
+ bits<1> idx; // occupies Inst{30} only
+ bits<5> Xm; // placed in Inst{20-16}
+ let Inst{30} = idx;
+ let Inst{23} = 1; // 0 in the non-post-indexed SIMDLdStSingleDTied
+ let Inst{20-16} = Xm;
+ let Inst{12} = 0; // fixed to 0 in this form; the B/H/S forms put an index bit here
+ let Inst{11-10} = size; // both bits come from the template argument
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in // load-only flags for both defs of the multiclass
+multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm,
+ RegisterOperand listtype,
+ RegisterOperand GPR64pi> { // defines <NAME>i8 and <NAME>i8_POST
+ def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, // L = 1
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexB:$idx, // $Vt is the incoming list value, tied to $dst by the Tied base class
+ GPR64sp:$Rn), []>;
+
+ def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm,
+ (outs GPR64sp:$wback, listtype:$dst),
+ (ins listtype:$Vt, VectorIndexB:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in // load-only flags for both defs of the multiclass
+multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm,
+ RegisterOperand listtype,
+ RegisterOperand GPR64pi> { // defines <NAME>i16 and <NAME>i16_POST
+ def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, // L = 1
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexH:$idx, // $Vt is the incoming list value, tied to $dst by the Tied base class
+ GPR64sp:$Rn), []>;
+
+ def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm,
+ (outs GPR64sp:$wback, listtype:$dst),
+ (ins listtype:$Vt, VectorIndexH:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in // load-only flags for both defs of the multiclass
+multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm,
+ RegisterOperand listtype,
+ RegisterOperand GPR64pi> { // defines <NAME>i32 and <NAME>i32_POST
+ def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, // L = 1
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexS:$idx, // $Vt is the incoming list value, tied to $dst by the Tied base class
+ GPR64sp:$Rn), []>;
+
+ def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm,
+ (outs GPR64sp:$wback, listtype:$dst),
+ (ins listtype:$Vt, VectorIndexS:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in // load-only flags for both defs of the multiclass
+multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> { // defines <NAME>i64 and <NAME>i64_POST
+ def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, // L = 1
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexD:$idx, // $Vt is the incoming list value, tied to $dst by the Tied base class
+ GPR64sp:$Rn), []>;
+
+ def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm,
+ (outs GPR64sp:$wback, listtype:$dst),
+ (ins listtype:$Vt, VectorIndexD:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in // store-only flags for both defs of the multiclass
+multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> { // defines <NAME>i8 and <NAME>i8_POST
+ def i8 : SIMDLdStSingleB<0, R, opcode, asm, // L = 0; untied class, the list is only read
+ (outs), (ins listtype:$Vt, VectorIndexB:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
+ (outs GPR64sp:$wback), // written-back base is the only result
+ (ins listtype:$Vt, VectorIndexB:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in // store-only flags for both defs of the multiclass
+multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> { // defines <NAME>i16 and <NAME>i16_POST
+ def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, // L = 0; untied class, the list is only read
+ (outs), (ins listtype:$Vt, VectorIndexH:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
+ (outs GPR64sp:$wback), // written-back base is the only result
+ (ins listtype:$Vt, VectorIndexH:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in // store-only flags for both defs of the multiclass
+multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> { // defines <NAME>i32 and <NAME>i32_POST
+ def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, // L = 0; untied class, the list is only read
+ (outs), (ins listtype:$Vt, VectorIndexS:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
+ (outs GPR64sp:$wback), // written-back base is the only result
+ (ins listtype:$Vt, VectorIndexS:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in // store-only flags for both defs of the multiclass
+multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> { // defines <NAME>i64 and <NAME>i64_POST
+ def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, // L = 0; untied class, the list is only read
+ (outs), (ins listtype:$Vt, VectorIndexD:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
+ (outs GPR64sp:$wback), // written-back base is the only result
+ (ins listtype:$Vt, VectorIndexD:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+
+multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
+ string Count, int Offset, Operand idxtype> { // four InstAliases targeting the <NAME><Type> and <NAME><Type>_POST records
+ // E.g. "ld1 { v0.b }[0], [x1], #1"
+ // "ld1\t$Vt$idx, [$Rn], #1"
+ // may get mapped to
+ // (LD1i8_POST GPR64sp:$Rn, VecListOneb:$Vt, VectorIndexB:$idx, XZR)
+ def : InstAlias<asm # "\t$Vt$idx, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # Type # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+ idxtype:$idx, XZR), 1>; // the only alias here that passes 1 as the last InstAlias argument; XZR fills the $Xm slot
+
+ // E.g. "ld1.b { v0 }[0], [x1], #1"
+ // "ld1.b\t$Vt$idx, [$Rn], #1"
+ // may get mapped to
+ // (LD1i8_POST GPR64sp:$Rn, VecListOne128:$Vt, VectorIndexB:$idx, XZR)
+ def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # Type # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # "128"):$Vt, // the untyped list operand is always the 128 variant here
+ idxtype:$idx, XZR), 0>;
+
+ // E.g. "ld1.b { v0 }[0], [x1]"
+ // "ld1.b\t$Vt$idx, [$Rn]"
+ // may get mapped to
+ // (LD1i8 VecListOne128:$Vt, VectorIndexB:$idx, GPR64sp:$Rn)
+ def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn]",
+ (!cast<Instruction>(NAME # Type)
+ !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+ idxtype:$idx, GPR64sp:$Rn), 0>;
+
+ // E.g. "ld1.b { v0 }[0], [x1], x2"
+ // "ld1.b\t$Vt$idx, [$Rn], $Xm"
+ // may get mapped to
+ // (LD1i8_POST GPR64sp:$Rn, VecListOne128:$Vt, VectorIndexB:$idx, GPR64pi1:$Xm)
+ def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], $Xm",
+ (!cast<Instruction>(NAME # Type # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+ idxtype:$idx,
+ !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass SIMDLdSt1SingleAliases<string asm> { // one-register lists: Offset is 1/2/4/8 for b/h/s/d
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
+}
+
+multiclass SIMDLdSt2SingleAliases<string asm> { // two-register lists: Offset is 2/4/8/16 for b/h/s/d (twice the one-register values)
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
+}
+
+multiclass SIMDLdSt3SingleAliases<string asm> { // three-register lists: Offset is 3/6/12/24 for b/h/s/d (three times the one-register values)
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
+}
+
+multiclass SIMDLdSt4SingleAliases<string asm> { // four-register lists: Offset is 4/8/16/32 for b/h/s/d (four times the one-register values)
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
+}
+} // end of 'let Predicates = [HasNEON]'
+
+//----------------------------------------------------------------------------
+// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
+//----------------------------------------------------------------------------
+
+let Predicates = [HasNEON, HasV8_1a] in {
+
+class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode, // thin wrapper: takes a 2-bit size and forwards everything else unchanged
+ RegisterOperand regtype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind, // {size,0} concatenates the two size bits with a trailing 0 bit
+ pattern> {
+}
+multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator Accum> { // four vector defs (.4h/.8h/.2s/.4s); each matches Accum($Rd, sqrdmulh($Rn, $Rm))
+ def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h", // Q = 0, size 0b01
+ [(set (v4i16 V64:$dst),
+ (Accum (v4i16 V64:$Rd),
+ (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
+ (v4i16 V64:$Rm)))))]>;
+ def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h", // Q = 1, size 0b01
+ [(set (v8i16 V128:$dst),
+ (Accum (v8i16 V128:$Rd),
+ (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
+ (v8i16 V128:$Rm)))))]>;
+ def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s", // Q = 0, size 0b10
+ [(set (v2i32 V64:$dst),
+ (Accum (v2i32 V64:$Rd),
+ (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
+ (v2i32 V64:$Rm)))))]>;
+ def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s", // Q = 1, size 0b10
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
+ SDPatternOperator Accum> { // by-element forms: Accum($Rd, sqrdmulh($Rn, lane $idx of $Rm)) for vector and scalar operands
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+ V64, V64, V128_lo, VectorIndexH, // the .h element forms take $Rm from V128_lo
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$dst),
+ (Accum (v4i16 V64:$Rd),
+ (v4i16 (int_aarch64_neon_sqrdmulh
+ (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))))]> {
+ bits<3> idx; // 3-bit h-lane index, split across Inst{11}, Inst{21}, Inst{20}
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+ V128, V128, V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$dst),
+ (Accum (v8i16 V128:$Rd),
+ (v8i16 (int_aarch64_neon_sqrdmulh
+ (v8i16 V128:$Rn),
+ (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))))]> {
+ bits<3> idx; // 3-bit h-lane index, split across Inst{11}, Inst{21}, Inst{20}
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V64, V64, V128, VectorIndexS, // the .s element forms take $Rm from the full V128 class
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$dst),
+ (Accum (v2i32 V64:$Rd),
+ (v2i32 (int_aarch64_neon_sqrdmulh
+ (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+ bits<2> idx; // 2-bit s-lane index, split across Inst{11} and Inst{21}
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
+ // an intermediate EXTRACT_SUBREG would be untyped.
+ // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
+ // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+ (i32 (vector_extract
+ (v4i32 (insert_subvector
+ (undef),
+ (v2i32 (int_aarch64_neon_sqrdmulh
+ (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32
+ (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))),
+ (i32 0))),
+ (i64 0))))),
+ (EXTRACT_SUBREG // result: run the v2i32 indexed instruction, then take the ssub subregister
+ (v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
+ (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), // place the scalar accumulator in ssub of an otherwise undefined vector
+ FPR32Op:$Rd,
+ ssub)),
+ V64:$Rn,
+ V128:$Rm,
+ VectorIndexS:$idx)),
+ ssub)>;
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128, V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqrdmulh
+ (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+ bits<2> idx; // 2-bit s-lane index, split across Inst{11} and Inst{21}
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
+ // an intermediate EXTRACT_SUBREG would be untyped.
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+ (i32 (vector_extract
+ (v4i32 (int_aarch64_neon_sqrdmulh
+ (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32
+ (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))),
+ (i64 0))))),
+ (EXTRACT_SUBREG // result: run the v4i32 indexed instruction, then take the ssub subregister
+ (v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
+ (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), // place the scalar accumulator in ssub of an otherwise undefined vector
+ FPR32Op:$Rd,
+ ssub)),
+ V128:$Rn,
+ V128:$Rm,
+ VectorIndexS:$idx)),
+ ssub)>;
+
+ def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, // third argument is 1 for the two scalar defs, 0 for the vector defs above
+ FPR16Op, FPR16Op, V128_lo,
+ VectorIndexH, asm, ".h", "", "", ".h",
+ []> { // no selection pattern for the 16-bit scalar form
+ bits<3> idx; // 3-bit h-lane index, split across Inst{11}, Inst{21}, Inst{20}
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (i32 FPR32Op:$dst),
+ (Accum (i32 FPR32Op:$Rd),
+ (i32 (int_aarch64_neon_sqrdmulh
+ (i32 FPR32Op:$Rn),
+ (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+ bits<2> idx; // 2-bit s-lane index, split across Inst{11} and Inst{21}
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+} // let Predicates = [HasNEON, HasV8_1a]
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+let Predicates = [HasCrypto] in {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // no memory access and no side effects
+class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
+ list<dag> pat>
+ : I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>, // {A|B}: two alternative spellings of the operand string
+ Sched<[WriteV]>{
+ bits<5> Rd; // placed in Inst{4-0}
+ bits<5> Rn; // placed in Inst{9-5}
+ let Inst{31-16} = 0b0100111000101000; // fixed upper half-word
+ let Inst{15-12} = opc; // the only field that varies between users of this class
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class AESInst<bits<4> opc, string asm, Intrinsic OpNode> // untied form: $Rd = OpNode($Rn), no constraint string
+ : AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode> // tied form: $dst = OpNode($Rd, $Rn)
+ : AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn),
+ "$Rd = $dst", // the $Rd input and the $dst result share a register
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in // no memory access and no side effects
+class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind, // three-operand tied form; dst_lhs_kind is the suffix printed on $Rd and $Rn
+ dag oops, dag iops, list<dag> pat>
+ : I<oops, iops, asm,
+ "{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" # // first spelling: suffix on $Rd/$Rn, ".4s" always on $Rm
+ "|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>, // second spelling puts ".4s" on the mnemonic; $Rd is tied to $dst
+ Sched<[WriteV]>{
+ bits<5> Rd; // placed in Inst{4-0}
+ bits<5> Rn; // placed in Inst{9-5}
+ bits<5> Rm; // placed in Inst{20-16}
+ let Inst{31-21} = 0b01011110000;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{14-12} = opc;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode> // operand classes: FPR128 $Rd, FPR32 $Rn, V128 $Rm
+ : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst), // empty suffix on $Rd/$Rn
+ (ins FPR128:$Rd, FPR32:$Rn, V128:$Rm),
+ [(set (v4i32 FPR128:$dst),
+ (OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn),
+ (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode> // operand classes: V128 for $Rd, $Rn and $Rm
+ : SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst), // ".4s" suffix on $Rd/$Rn
+ (ins V128:$Rd, V128:$Rn, V128:$Rm),
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode> // operand classes: FPR128 $Rd, FPR128 $Rn, V128 $Rm
+ : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst), // empty suffix on $Rd/$Rn
+ (ins FPR128:$Rd, FPR128:$Rn, V128:$Rm),
+ [(set (v4i32 FPR128:$dst),
+ (OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn),
+ (v4i32 V128:$Rm)))]>;
+
+// Shared encoding for the two-operand SHA instructions. `kind` is the
+// arrangement suffix appended to both registers in the assembly string,
+// `cstr` is the operand constraint supplied by the subclass, and `opc` is
+// placed in Inst{15-12}.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA2OpInst<bits<4> opc, string asm, string kind,
+ string cstr, dag oops, dag iops,
+ list<dag> pat>
+ : I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
+ "|" # kind # "\t$Rd, $Rn}", cstr, pat>,
+ Sched<[WriteV]>{
+ // Register numbers encoded in Inst{4-0} and Inst{9-5} respectively.
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-16} = 0b0101111000101000;
+ let Inst{15-12} = opc;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Two-operand SHA form on v4i32 vectors with the destination tied to the
+// $Rd input ("$Rd = $dst"); both registers are printed as ".4s".
+class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
+ : SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst),
+ (ins V128:$Rd, V128:$Rn),
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+
+// Two-operand SHA form on i32 scalars held in FPR32: $Rd = OpNode($Rn), with
+// no arrangement suffix and no operand constraint.
+class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
+ : SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
+ [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+} // end of 'let Predicates = [HasCrypto]'
+
+//----------------------------------------------------------------------------
+// v8.1 atomic instructions extension:
+// * CAS
+// * CASP
+// * SWP
+// * LDOPregister<OP>, and aliases STOPregister<OP>
+
+// Instruction encodings:
+//
+// 31 30|29 24|23|22|21|20 16|15|14 10|9 5|4 0
+// CAS SZ |001000|1 |A |1 |Rs |R |11111 |Rn |Rt
+// CASP 0|SZ|001000|0 |A |1 |Rs |R |11111 |Rn |Rt
+// SWP SZ |111000|A |R |1 |Rs |1 |OPC|00|Rn |Rt
+// LD SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |Rt
+// ST SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |11111
+
+// Instruction syntax:
+//
+// CAS{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// CAS{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// CASP{<order>} <Ws>, <W(s+1)>, <Wt>, <W(t+1)>, [<Xn|SP>]
+// CASP{<order>} <Xs>, <X(s+1)>, <Xt>, <X(t+1)>, [<Xn|SP>]
+// SWP{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// SWP{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// LD<OP>{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// LD<OP>{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// ST<OP>{<order>}[<size>] <Ws>, [<Xn|SP>]
+// ST<OP>{<order>} <Xs>, [<Xn|SP>]
+
+// Bit layout shared by CAS and CASP (see the encoding table above). The
+// instructions are marked as loading, storing and having side effects, and
+// are only available with the LSE feature. Sz, NP, Acq and Rel are left unset
+// here and must be provided by subclasses / enclosing `let` statements.
+let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseCASEncoding<dag oops, dag iops, string asm, string operands,
+ string cstr, list<dag> pattern>
+ : I<oops, iops, asm, operands, cstr, pattern> {
+ bits<2> Sz;
+ // Inst{23}: set to 1 by BaseCAS and 0 by BaseCASP.
+ bit NP;
+ bit Acq;
+ bit Rel;
+ bits<5> Rs;
+ bits<5> Rn;
+ bits<5> Rt;
+ let Inst{31-30} = Sz;
+ let Inst{29-24} = 0b001000;
+ let Inst{23} = NP;
+ let Inst{22} = Acq;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rs;
+ let Inst{15} = Rel;
+ let Inst{14-10} = 0b11111;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+ // Repeats the predicate already applied by the enclosing `let`.
+ let Predicates = [HasLSE];
+}
+
+// Single-register compare-and-swap: "cas" # order # size  $Rs, $Rt, [$Rn].
+// The result $out is tied to the $Rs input. No selection pattern is attached.
+class BaseCAS<string order, string size, RegisterClass RC>
+ : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
+ "cas" # order # size, "\t$Rs, $Rt, [$Rn]",
+ "$out = $Rs",[]>,
+ Sched<[WriteAtomic]> {
+ let NP = 1;
+}
+
+// Instantiates the four CAS size variants for one ordering: `b` (Sz=0b00) and
+// `h` (Sz=0b01) carry a "b"/"h" mnemonic suffix and use GPR32; `s` (Sz=0b10,
+// GPR32) and `d` (Sz=0b11, GPR64) have no size suffix. Acq/Rel are forwarded
+// to the encoding and `order` is spliced into the mnemonic.
+multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> {
+ let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseCAS<order, "b", GPR32>;
+ let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseCAS<order, "h", GPR32>;
+ let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseCAS<order, "", GPR32>;
+ let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseCAS<order, "", GPR64>;
+}
+
+// Register-pair compare-and-swap: "casp" # order # size  $Rs, $Rt, [$Rn],
+// where $Rs/$Rt/$out are register-pair operands of class RC. The result $out
+// is tied to the $Rs input. No selection pattern is attached.
+class BaseCASP<string order, string size, RegisterOperand RC>
+ : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
+ "casp" # order # size, "\t$Rs, $Rt, [$Rn]",
+ "$out = $Rs",[]>,
+ Sched<[WriteAtomic]> {
+ let NP = 0;
+}
+
+// Instantiates the two CASP variants for one ordering: `s` (Sz=0b00) on W
+// register pairs and `d` (Sz=0b01) on X register pairs. Neither has a size
+// suffix in the mnemonic.
+multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> {
+ let Sz = 0b00, Acq = Acq, Rel = Rel in
+ def s : BaseCASP<order, "", WSeqPairClassOperand>;
+ let Sz = 0b01, Acq = Acq, Rel = Rel in
+ def d : BaseCASP<order, "", XSeqPairClassOperand>;
+}
+
+// Encoding and assembly syntax for SWP: "swp" # order # size  $Rs, $Rt, [$Rn]
+// (see the encoding table above: Inst{15} is 1 and opc is fixed to 0b000).
+//
+// SWP atomically exchanges a register with memory, so it both loads and
+// stores. The instruction has no selection pattern from which TableGen could
+// infer that, so mayLoad/mayStore/hasSideEffects are stated explicitly, in
+// line with BaseCASEncoding and BaseLDOPregister.
+let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseSWP<string order, string size, RegisterClass RC>
+      : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size,
+          "\t$Rs, $Rt, [$Rn]","",[]>,
+        Sched<[WriteAtomic]> {
+  // Sz, Acq and Rel are supplied by the instantiating multiclass.
+  bits<2> Sz;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<3> opc = 0b000;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b111000;
+  let Inst{23} = Acq;
+  let Inst{22} = Rel;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = 0b1;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+  // Repeats the predicate already applied by the enclosing `let`.
+  let Predicates = [HasLSE];
+}
+
+// Instantiates the four SWP size variants for one ordering: `b` (Sz=0b00) and
+// `h` (Sz=0b01) carry a "b"/"h" mnemonic suffix and use GPR32; `s` (Sz=0b10,
+// GPR32) and `d` (Sz=0b11, GPR64) have no size suffix.
+multiclass Swap<bits<1> Acq, bits<1> Rel, string order> {
+ let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseSWP<order, "b", GPR32>;
+ let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseSWP<order, "h", GPR32>;
+ let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseSWP<order, "", GPR32>;
+ let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseSWP<order, "", GPR64>;
+}
+
+// Encoding and assembly syntax for the LD<OP> atomic instructions:
+// "ld" # op # order # size  $Rs, $Rt, [$Rn]. Differs from the SWP layout in
+// that Inst{15} is 0 and `opc` (Inst{14-12}) is supplied by the instantiating
+// multiclass. Marked as loading, storing and having side effects.
+let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
+ : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size,
+ "\t$Rs, $Rt, [$Rn]","",[]>,
+ Sched<[WriteAtomic]> {
+ bits<2> Sz;
+ bit Acq;
+ bit Rel;
+ bits<5> Rs;
+ bits<3> opc;
+ bits<5> Rn;
+ bits<5> Rt;
+ let Inst{31-30} = Sz;
+ let Inst{29-24} = 0b111000;
+ let Inst{23} = Acq;
+ let Inst{22} = Rel;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rs;
+ let Inst{15} = 0b0;
+ let Inst{14-12} = opc;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+ // Repeats the predicate already applied by the enclosing `let`.
+ let Predicates = [HasLSE];
+}
+
+// Instantiates the four LD<OP> size variants for one operation and ordering:
+// `b` (Sz=0b00) and `h` (Sz=0b01) carry a "b"/"h" mnemonic suffix and use
+// GPR32; `s` (Sz=0b10, GPR32) and `d` (Sz=0b11, GPR64) have no size suffix.
+// `opc`, Acq and Rel are forwarded to the encoding.
+multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
+ string order> {
+ let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in
+ def b : BaseLDOPregister<op, order, "b", GPR32>;
+ let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in
+ def h : BaseLDOPregister<op, order, "h", GPR32>;
+ let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in
+ def s : BaseLDOPregister<op, order, "", GPR32>;
+ let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in
+ def d : BaseLDOPregister<op, order, "", GPR64>;
+}
+
+// ST<OP> is expressed as an assembler alias of the LD<OP> instruction `inst`
+// with the fixed register `Reg` in the first result-operand position (the
+// multiclass below passes WZR or XZR), leaving only $Rs and [$Rn] in the
+// written syntax.
+let Predicates = [HasLSE] in
+class BaseSTOPregister<string asm, RegisterClass OP, Register Reg,
+ Instruction inst> :
+ InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>;
+
+// Emits the eight ST<OP> aliases for one operation. `asm` is the base
+// mnemonic and `instr` the name prefix of the LD<OP> records to alias.
+// Mnemonics with an "l" suffix map to the instr # "L{b,h,s,d}" records, the
+// plain ones to instr # "{b,h,s,d}". 32-bit and narrower forms use GPR32 with
+// WZR; 64-bit forms use GPR64 with XZR.
+// NOTE(review): the "L" records are presumably the release-ordered LD<OP>
+// variants -- confirm against the defm sites.
+multiclass STOPregister<string asm, string instr> {
+ def : BaseSTOPregister<asm # "lb", GPR32, WZR,
+ !cast<Instruction>(instr # "Lb")>;
+ def : BaseSTOPregister<asm # "lh", GPR32, WZR,
+ !cast<Instruction>(instr # "Lh")>;
+ def : BaseSTOPregister<asm # "l", GPR32, WZR,
+ !cast<Instruction>(instr # "Ls")>;
+ def : BaseSTOPregister<asm # "l", GPR64, XZR,
+ !cast<Instruction>(instr # "Ld")>;
+ def : BaseSTOPregister<asm # "b", GPR32, WZR,
+ !cast<Instruction>(instr # "b")>;
+ def : BaseSTOPregister<asm # "h", GPR32, WZR,
+ !cast<Instruction>(instr # "h")>;
+ def : BaseSTOPregister<asm, GPR32, WZR,
+ !cast<Instruction>(instr # "s")>;
+ def : BaseSTOPregister<asm, GPR64, XZR,
+ !cast<Instruction>(instr # "d")>;
+}
+
+//----------------------------------------------------------------------------
+// Allow the size specifier tokens to be upper case, not just lower.
+// Each TokenAlias maps an upper-case arrangement/element-size token to its
+// lower-case spelling: first the vector arrangements (.8B ... .2H), then the
+// bare element sizes (.B ... .Q).
+def : TokenAlias<".8B", ".8b">;
+def : TokenAlias<".4H", ".4h">;
+def : TokenAlias<".2S", ".2s">;
+def : TokenAlias<".1D", ".1d">;
+def : TokenAlias<".16B", ".16b">;
+def : TokenAlias<".8H", ".8h">;
+def : TokenAlias<".4S", ".4s">;
+def : TokenAlias<".2D", ".2d">;
+def : TokenAlias<".1Q", ".1q">;
+def : TokenAlias<".2H", ".2h">;
+def : TokenAlias<".B", ".b">;
+def : TokenAlias<".H", ".h">;
+def : TokenAlias<".S", ".s">;
+def : TokenAlias<".D", ".d">;
+def : TokenAlias<".Q", ".q">;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
new file mode 100644
index 000000000000..b50749a29b89
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -0,0 +1,4173 @@
+//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AArch64GenInstrInfo.inc"
+
+// Target-specific memory-operand flag, aliased to MOTargetFlag1.
+// NOTE(review): presumably marks accesses that must not be merged into
+// load/store pairs; its users are outside this excerpt.
+static const MachineMemOperand::Flags MOSuppressPair =
+ MachineMemOperand::MOTargetFlag1;
+
+// Hidden debugging knobs giving the number of displacement bits assumed for
+// each conditional-branch family. They are read by
+// getBranchDisplacementBits() and default to 14 (TB[N]Z), 19 (CB[N]Z) and
+// 19 (Bcc).
+static cl::opt<unsigned>
+TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
+ cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
+ cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
+ cl::desc("Restrict range of Bcc instructions (DEBUG)"));
+
+/// Construct the instruction info for \p STI. The generated base class is told
+/// which opcodes are the call-frame setup/destroy pseudos
+/// (ADJCALLSTACKDOWN / ADJCALLSTACKUP); the register info is built from the
+/// subtarget's target triple, and the subtarget reference is retained.
+AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
+ : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
+ RI(STI.getTargetTriple()), Subtarget(STI) {}
+
+/// Return an upper bound, in bytes, on the code emitted for \p MI.
+///
+/// Inline assembly is measured from its asm string. Meta-instructions that
+/// emit no code report 0, STACKMAP/PATCHPOINT report the number of patch
+/// bytes they request, TLSDESC_CALLSEQ reports its 16-byte expansion, and
+/// everything else is assumed to be a single 4-byte instruction.
+unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  const MachineBasicBlock &MBB = *MI.getParent();
+  const MachineFunction *MF = MBB.getParent();
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+  if (MI.getOpcode() == AArch64::INLINEASM)
+    return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+
+  // FIXME: We currently only handle pseudoinstructions that don't get expanded
+  //        before the assembly printer.
+  unsigned NumBytes = 0;
+  const MCInstrDesc &Desc = MI.getDesc();
+  switch (Desc.getOpcode()) {
+  default:
+    // Anything not explicitly designated otherwise is a normal 4-byte insn.
+    NumBytes = 4;
+    break;
+  case TargetOpcode::DBG_VALUE:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+    NumBytes = 0;
+    break;
+  case TargetOpcode::STACKMAP:
+    // The upper bound for a stackmap intrinsic is the full length of its shadow
+    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
+    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+    break;
+  case TargetOpcode::PATCHPOINT:
+    // The size of the patchpoint intrinsic is the number of bytes requested
+    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
+    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+    break;
+  case AArch64::TLSDESC_CALLSEQ:
+    // This gets lowered to an instruction sequence which takes 16 bytes.
+    // (Handled exactly once: a second label with the same value would be a
+    // duplicate-case compile error.)
+    NumBytes = 16;
+    break;
+  }
+
+  return NumBytes;
+}
+
+/// Decode the conditional branch \p LastInst into its destination block and a
+/// condition vector appended to \p Cond:
+///   - Bcc:      { cc }
+///   - CB(N)Z:   { -1, opcode, reg }
+///   - TB(N)Z:   { -1, opcode, reg, bit }
+/// The leading -1 distinguishes folded compare-and-branch forms from a plain
+/// condition code.
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+                            SmallVectorImpl<MachineOperand> &Cond) {
+  const unsigned BranchOpc = LastInst->getOpcode();
+  switch (BranchOpc) {
+  case AArch64::Bcc:
+    // Operands: condition code, target.
+    Target = LastInst->getOperand(1).getMBB();
+    Cond.push_back(LastInst->getOperand(0));
+    return;
+
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    // Operands: tested register, target.
+    Target = LastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(-1));
+    Cond.push_back(MachineOperand::CreateImm(BranchOpc));
+    Cond.push_back(LastInst->getOperand(0));
+    return;
+
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    // Operands: tested register, bit number, target.
+    Target = LastInst->getOperand(2).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(-1));
+    Cond.push_back(MachineOperand::CreateImm(BranchOpc));
+    Cond.push_back(LastInst->getOperand(0));
+    Cond.push_back(LastInst->getOperand(1));
+    return;
+
+  default:
+    llvm_unreachable("Unknown branch instruction?");
+  }
+}
+
+/// Number of displacement bits assumed for branch opcode \p Opc. The
+/// conditional forms take their width from the corresponding command-line
+/// option; an unconditional B reports 64.
+static unsigned getBranchDisplacementBits(unsigned Opc) {
+  switch (Opc) {
+  case AArch64::Bcc:
+    return BCCDisplacementBits;
+
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    return CBZDisplacementBits;
+
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    return TBZDisplacementBits;
+
+  case AArch64::B:
+    return 64;
+
+  default:
+    llvm_unreachable("unexpected opcode!");
+  }
+}
+
+/// Return true if a branch with opcode \p BranchOp can encode the byte offset
+/// \p BrOffset. The offset is divided by 4 and must fit in a signed field of
+/// getBranchDisplacementBits(BranchOp) bits.
+bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+                                             int64_t BrOffset) const {
+  unsigned Bits = getBranchDisplacementBits(BranchOp);
+  // Note the trailing space in the first literal: the two pieces are
+  // concatenated and would otherwise read "jumpover".
+  assert(Bits >= 3 && "max branch displacement must be enough to jump "
+                      "over conditional branch expansion");
+  return isIntN(Bits, BrOffset / 4);
+}
+
+/// Return the basic block targeted by the direct branch \p MI. The position of
+/// the block operand depends on the branch form: 0 for B, 1 for Bcc and
+/// CB(N)Z, 2 for TB(N)Z.
+MachineBasicBlock *AArch64InstrInfo::getBranchDestBlock(
+    const MachineInstr &MI) const {
+  unsigned TargetIdx;
+  switch (MI.getOpcode()) {
+  case AArch64::B:
+    TargetIdx = 0;
+    break;
+  case AArch64::Bcc:
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+    TargetIdx = 1;
+    break;
+  case AArch64::TBZW:
+  case AArch64::TBNZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZX:
+    TargetIdx = 2;
+    break;
+  default:
+    llvm_unreachable("unexpected opcode!");
+  }
+  return MI.getOperand(TargetIdx).getMBB();
+}
+
+// Branch analysis.
+//
+// Examines the terminators at the end of MBB. Returns false when the control
+// flow was understood and true when it was not. On a false return:
+//  - no (unpredicated) terminator: TBB, FBB and Cond are left untouched;
+//  - a single unconditional branch: TBB is its target;
+//  - a single conditional branch: TBB is its target and Cond is filled in by
+//    parseCondBranch();
+//  - a conditional branch followed by an unconditional one: TBB/Cond describe
+//    the conditional branch and FBB is the unconditional target.
+// When AllowModify is set, unconditional branches that follow another
+// unconditional (or indirect) branch are erased from the block.
+bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return false;
+
+ if (!isUnpredicatedTerminator(*I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = &*I;
+
+ // If there is only one terminator instruction, process it.
+ // Note: the condition below steps I back to the preceding instruction as a
+ // side effect when the block has more than one instruction.
+ unsigned LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ if (isUncondBranchOpcode(LastOpc)) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ if (isCondBranchOpcode(LastOpc)) {
+ // Block ends with fall-through condbranch.
+ parseCondBranch(LastInst, TBB, Cond);
+ return false;
+ }
+ return true; // Can't handle indirect branch.
+ }
+
+ // Get the instruction before it if it is a terminator.
+ MachineInstr *SecondLastInst = &*I;
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+ // If AllowModify is true and the block ends with two or more unconditional
+ // branches, delete all but the first unconditional branch.
+ if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+ while (isUncondBranchOpcode(SecondLastOpc)) {
+ LastInst->eraseFromParent();
+ LastInst = SecondLastInst;
+ LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ // Return now the only terminator is an unconditional branch.
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else {
+ SecondLastInst = &*I;
+ SecondLastOpc = SecondLastInst->getOpcode();
+ }
+ }
+ }
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
+ return true;
+
+ // If the block ends with a B and a Bcc, handle it.
+ if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ parseCondBranch(SecondLastInst, TBB, Cond);
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed, so remove it.
+ if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // ...likewise if it ends with an indirect branch followed by an unconditional
+ // branch.
+ // The dead trailing branch may be erased, but the indirect branch itself
+ // still cannot be analyzed, hence the true return.
+ if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+/// Invert the branch condition stored in \p Cond in place (see
+/// parseCondBranch() for the layout). A plain condition code is replaced by
+/// its inverse; a folded compare/test-and-branch has its opcode swapped with
+/// the opposite-sense opcode. Always returns false.
+bool AArch64InstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  // Regular Bcc: Cond[0] holds the condition code itself.
+  if (Cond[0].getImm() != -1) {
+    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
+    return false;
+  }
+
+  // Folded compare-and-branch: Cond[1] holds the branch opcode.
+  unsigned Inverted;
+  switch (Cond[1].getImm()) {
+  case AArch64::CBZW:
+    Inverted = AArch64::CBNZW;
+    break;
+  case AArch64::CBNZW:
+    Inverted = AArch64::CBZW;
+    break;
+  case AArch64::CBZX:
+    Inverted = AArch64::CBNZX;
+    break;
+  case AArch64::CBNZX:
+    Inverted = AArch64::CBZX;
+    break;
+  case AArch64::TBZW:
+    Inverted = AArch64::TBNZW;
+    break;
+  case AArch64::TBNZW:
+    Inverted = AArch64::TBZW;
+    break;
+  case AArch64::TBZX:
+    Inverted = AArch64::TBNZX;
+    break;
+  case AArch64::TBNZX:
+    Inverted = AArch64::TBZX;
+    break;
+  default:
+    llvm_unreachable("Unknown conditional branch!");
+  }
+  Cond[1].setImm(Inverted);
+  return false;
+}
+
+/// Erase the branch(es) terminating \p MBB and return how many were removed
+/// (0, 1 or 2). When something was removed and \p BytesRemoved is non-null it
+/// receives 4 bytes per erased branch; it is not written when nothing is
+/// removed.
+unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                        int *BytesRemoved) const {
+  MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr();
+  if (Last == MBB.end())
+    return 0;
+
+  const unsigned LastOpc = Last->getOpcode();
+  if (!(isUncondBranchOpcode(LastOpc) || isCondBranchOpcode(LastOpc)))
+    return 0;
+
+  // Drop the trailing branch.
+  Last->eraseFromParent();
+
+  // See whether what is now the final instruction is a conditional branch.
+  MachineBasicBlock::iterator Prev = MBB.end();
+  bool PrecededByCondBranch = false;
+  if (Prev != MBB.begin()) {
+    --Prev;
+    PrecededByCondBranch = isCondBranchOpcode(Prev->getOpcode());
+  }
+
+  if (!PrecededByCondBranch) {
+    if (BytesRemoved)
+      *BytesRemoved = 4;
+    return 1;
+  }
+
+  // Drop the conditional branch as well.
+  Prev->eraseFromParent();
+  if (BytesRemoved)
+    *BytesRemoved = 8;
+  return 2;
+}
+
+/// Append to \p MBB the conditional branch to \p TBB described by \p Cond
+/// (layout as produced by parseCondBranch()).
+void AArch64InstrInfo::instantiateCondBranch(
+    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
+    ArrayRef<MachineOperand> Cond) const {
+  // Regular Bcc: Cond[0] is the condition code.
+  if (Cond[0].getImm() != -1) {
+    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
+    return;
+  }
+
+  // Folded compare-and-branch: Cond[1] is the opcode, Cond[2] the register.
+  const MachineInstrBuilder Builder = BuildMI(&MBB, DL, get(Cond[1].getImm()));
+  // addOperand (rather than addReg) keeps the register operand's flags.
+  Builder.addOperand(Cond[2]);
+  // TB(N)Z additionally carries the bit number in Cond[3].
+  if (Cond.size() > 3)
+    Builder.addImm(Cond[3].getImm());
+  Builder.addMBB(TBB);
+}
+
+/// Append branch instructions to the end of \p MBB and return how many were
+/// added. With no \p FBB a single branch to \p TBB is emitted (unconditional
+/// if \p Cond is empty, otherwise conditional). With an \p FBB a conditional
+/// branch to \p TBB is followed by an unconditional branch to \p FBB. If
+/// \p BytesAdded is non-null it receives 4 bytes per inserted branch.
+unsigned AArch64InstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                        MachineBasicBlock *TBB,
+                                        MachineBasicBlock *FBB,
+                                        ArrayRef<MachineOperand> Cond,
+                                        const DebugLoc &DL,
+                                        int *BytesAdded) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+
+  if (FBB) {
+    // Two-way conditional branch.
+    instantiateCondBranch(MBB, DL, TBB, Cond);
+    BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
+    if (BytesAdded)
+      *BytesAdded = 8;
+    return 2;
+  }
+
+  // One-way branch: unconditional when there is no condition.
+  if (Cond.empty())
+    BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
+  else
+    instantiateCondBranch(MBB, DL, TBB, Cond);
+
+  if (BytesAdded)
+    *BytesAdded = 4;
+  return 1;
+}
+
+// Look through chains of full copies starting at VReg and return the register
+// at the head of the chain. The walk stops at the first non-virtual register
+// or at a virtual register whose definition is not a full copy.
+static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
+  unsigned Reg = VReg;
+  for (;;) {
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      return Reg;
+    const MachineInstr *Def = MRI.getVRegDef(Reg);
+    if (!Def->isFullCopy())
+      return Reg;
+    Reg = Def->getOperand(1).getReg();
+  }
+}
+
+// Determine if VReg is defined by an instruction that can be folded into a
+// csel instruction. If so, return the folded opcode, and the replacement
+// register.
+//
+// Returns 0 when no fold is possible. Recognised definitions (after looking
+// through full copies):
+//   add  x, #1 (unshifted)  -> CSINC, source operand 1
+//   orn  dst, zr, src       -> CSINV, source operand 2
+//   sub  dst, zr, src       -> CSNEG, source operand 2
+// The flag-setting ADDS/SUBS forms are accepted only when the lookup of a
+// dead NZCV def operand succeeds. If NewVReg is non-null it receives the
+// source register of the folded instruction.
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
+ unsigned *NewVReg = nullptr) {
+ VReg = removeCopies(MRI, VReg);
+ if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ return 0;
+
+ // Selects between the X and W forms of the folded opcode.
+ bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
+ const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ unsigned Opc = 0;
+ unsigned SrcOpNum = 0;
+ switch (DefMI->getOpcode()) {
+ case AArch64::ADDSXri:
+ case AArch64::ADDSWri:
+ // if NZCV is used, do not fold.
+ if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+ return 0;
+ // fall-through to ADDXri and ADDWri.
+ LLVM_FALLTHROUGH;
+ case AArch64::ADDXri:
+ case AArch64::ADDWri:
+ // add x, 1 -> csinc.
+ // Operand 2 is the immediate and operand 3 its shift; require exactly
+ // "#1" with no shift.
+ if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
+ DefMI->getOperand(3).getImm() != 0)
+ return 0;
+ SrcOpNum = 1;
+ Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
+ break;
+
+ case AArch64::ORNXrr:
+ case AArch64::ORNWrr: {
+ // not x -> csinv, represented as orn dst, xzr, src.
+ unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+ if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+ return 0;
+ SrcOpNum = 2;
+ Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
+ break;
+ }
+
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSWrr:
+ // if NZCV is used, do not fold.
+ if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+ return 0;
+ // fall-through to SUBXrr and SUBWrr.
+ LLVM_FALLTHROUGH;
+ case AArch64::SUBXrr:
+ case AArch64::SUBWrr: {
+ // neg x -> csneg, represented as sub dst, xzr, src.
+ unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+ if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+ return 0;
+ SrcOpNum = 2;
+ Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
+ break;
+ }
+ default:
+ return 0;
+ }
+ assert(Opc && SrcOpNum && "Missing parameters");
+
+ if (NewVReg)
+ *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
+ return Opc;
+}
+
+/// Report whether a select between \p TrueReg and \p FalseReg on condition
+/// \p Cond can be materialised, and if so estimate its latencies.
+///
+/// Returns false when the two registers share no common subclass or when that
+/// class is neither a 32/64-bit GPR class nor a 32/64-bit scalar FP class. On
+/// success \p CondCycles, \p TrueCycles and \p FalseCycles are filled in.
+bool AArch64InstrInfo::canInsertSelect(
+    const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond,
+    unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles,
+    int &FalseCycles) const {
+  // Both inputs must live in a common register class.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // Expanding cbz/tbz requires an extra cycle of latency on the condition;
+  // only a plain b.cc has a one-element Cond.
+  const unsigned ExtraCondLat = (Cond.size() == 1) ? 0 : 1;
+
+  // GPRs are handled by csel.
+  // FIXME: Fold in x+1, -x, and ~x when applicable.
+  const bool IsGPR = AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
+                     AArch64::GPR32allRegClass.hasSubClassEq(RC);
+  if (IsGPR) {
+    // Single-cycle csel, csinc, csinv, and csneg.
+    CondCycles = 1 + ExtraCondLat;
+    TrueCycles = 1;
+    FalseCycles = 1;
+    // An input that folds into the select costs nothing; at most one side is
+    // credited.
+    if (canFoldIntoCSel(MRI, TrueReg))
+      TrueCycles = 0;
+    else if (canFoldIntoCSel(MRI, FalseReg))
+      FalseCycles = 0;
+    return true;
+  }
+
+  // Scalar floating point is handled by fcsel.
+  // FIXME: Form fabs, fmin, and fmax when applicable.
+  const bool IsScalarFP = AArch64::FPR64RegClass.hasSubClassEq(RC) ||
+                          AArch64::FPR32RegClass.hasSubClassEq(RC);
+  if (!IsScalarFP)
+    return false; // Can't do vectors.
+
+  CondCycles = 5 + ExtraCondLat;
+  TrueCycles = 2;
+  FalseCycles = 2;
+  return true;
+}
+
+/// Insert before \p I a select that writes \p TrueReg to \p DstReg when
+/// \p Cond holds and \p FalseReg otherwise.
+///
+/// \p Cond uses the layout produced by parseCondBranch(). For cbz/cbnz and
+/// tbz/tbnz conditions a flag-setting compare (SUBS against #0) or test (ANDS
+/// with a single-bit mask) is emitted first. The select itself is a CSEL for
+/// GPRs or an FCSEL for scalar FP; for GPRs, an input defined by a foldable
+/// add/not/neg (see canFoldIntoCSel()) is folded into CSINC/CSINV/CSNEG.
+void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ // Parse the condition code, see parseCondBranch() above.
+ AArch64CC::CondCode CC;
+ switch (Cond.size()) {
+ default:
+ llvm_unreachable("Unknown condition opcode in Cond");
+ case 1: // b.cc
+ CC = AArch64CC::CondCode(Cond[0].getImm());
+ break;
+ case 3: { // cbz/cbnz
+ // We must insert a compare against 0.
+ bool Is64Bit;
+ switch (Cond[1].getImm()) {
+ default:
+ llvm_unreachable("Unknown branch opcode in Cond");
+ case AArch64::CBZW:
+ Is64Bit = 0;
+ CC = AArch64CC::EQ;
+ break;
+ case AArch64::CBZX:
+ Is64Bit = 1;
+ CC = AArch64CC::EQ;
+ break;
+ case AArch64::CBNZW:
+ Is64Bit = 0;
+ CC = AArch64CC::NE;
+ break;
+ case AArch64::CBNZX:
+ Is64Bit = 1;
+ CC = AArch64CC::NE;
+ break;
+ }
+ unsigned SrcReg = Cond[2].getReg();
+ if (Is64Bit) {
+ // cmp reg, #0 is actually subs xzr, reg, #0.
+ MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
+ BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(0);
+ } else {
+ MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
+ BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(0);
+ }
+ break;
+ }
+ case 4: { // tbz/tbnz
+ // We must insert a tst instruction.
+ switch (Cond[1].getImm()) {
+ default:
+ llvm_unreachable("Unknown branch opcode in Cond");
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ CC = AArch64CC::EQ;
+ break;
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ CC = AArch64CC::NE;
+ break;
+ }
+ // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
+ // Cond[3] is the bit number tested by the original tbz/tbnz.
+ if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
+ BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
+ .addReg(Cond[2].getReg())
+ .addImm(
+ AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
+ else
+ BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
+ .addReg(Cond[2].getReg())
+ .addImm(
+ AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
+ break;
+ }
+ }
+
+ // Choose the select opcode from the first register class DstReg can be
+ // constrained to; folding is only attempted for the GPR classes.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ bool TryFold = false;
+ if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
+ RC = &AArch64::GPR64RegClass;
+ Opc = AArch64::CSELXr;
+ TryFold = true;
+ } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
+ RC = &AArch64::GPR32RegClass;
+ Opc = AArch64::CSELWr;
+ TryFold = true;
+ } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FCSELDrrr;
+ } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
+ RC = &AArch64::FPR32RegClass;
+ Opc = AArch64::FCSELSrrr;
+ }
+ assert(RC && "Unsupported regclass");
+
+ // Try folding simple instructions into the csel.
+ if (TryFold) {
+ unsigned NewVReg = 0;
+ unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
+ if (FoldedOpc) {
+ // The folded opcodes csinc, csinv and csneg apply the operation to
+ // FalseReg, so we need to invert the condition.
+ CC = AArch64CC::getInvertedCondCode(CC);
+ TrueReg = FalseReg;
+ } else
+ FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
+
+ // Fold the operation. Leave any dead instructions for DCE to clean up.
+ if (FoldedOpc) {
+ FalseReg = NewVReg;
+ Opc = FoldedOpc;
+ // This extends the live range of NewVReg.
+ MRI.clearKillFlags(NewVReg);
+ }
+ }
+
+ // Pull all virtual register into the appropriate class.
+ MRI.constrainRegClass(TrueReg, RC);
+ MRI.constrainRegClass(FalseReg, RC);
+
+ // Insert the csel.
+ BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(TrueReg).addReg(FalseReg).addImm(
+ CC);
+}
+
+/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx, i.e.
+/// if the low \p BitSize bits of its immediate form a valid logical immediate.
+static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
+  // Clear everything above BitSize by shifting up and back down.
+  const unsigned Shift = 64 - BitSize;
+  uint64_t Imm = MI.getOperand(1).getImm();
+  uint64_t UImm = (Imm << Shift) >> Shift;
+
+  // Only encodability matters here; the encoding itself is discarded.
+  uint64_t Encoding;
+  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
+}
+
+// Return true if MI is considered as cheap as a register move.
+//
+// Subtargets without custom handling defer to the generic
+// MachineInstr::isAsCheapAsAMove(). Otherwise the decision is made per opcode
+// below; any opcode not listed is reported as not cheap.
+//
+// FIXME: this implementation should be micro-architecture dependent, so a
+// micro-architecture target hook should be introduced here in future.
+bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
+ if (!Subtarget.hasCustomCheapAsMoveHandling())
+ return MI.isAsCheapAsAMove();
+
+ unsigned Imm;
+
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+
+ // add/sub on register without shift
+ // Cheap on ExynosM1 unconditionally, elsewhere only when operand 3 is 0.
+ case AArch64::ADDWri:
+ case AArch64::ADDXri:
+ case AArch64::SUBWri:
+ case AArch64::SUBXri:
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
+ MI.getOperand(3).getImm() == 0);
+
+ // add/sub on register with shift
+ // Cheap only on ExynosM1 and only for shift amounts below 4.
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ Imm = MI.getOperand(3).getImm();
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
+ AArch64_AM::getArithShiftValue(Imm) < 4);
+
+ // logical ops on immediate
+ case AArch64::ANDWri:
+ case AArch64::ANDXri:
+ case AArch64::EORWri:
+ case AArch64::EORXri:
+ case AArch64::ORRWri:
+ case AArch64::ORRXri:
+ return true;
+
+ // logical ops on register without shift
+ case AArch64::ANDWrr:
+ case AArch64::ANDXrr:
+ case AArch64::BICWrr:
+ case AArch64::BICXrr:
+ case AArch64::EONWrr:
+ case AArch64::EONXrr:
+ case AArch64::EORWrr:
+ case AArch64::EORXrr:
+ case AArch64::ORNWrr:
+ case AArch64::ORNXrr:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXrr:
+ return true;
+
+ // logical ops on register with shift
+ // Cheap only on ExynosM1, for LSL shifts by less than 4.
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ case AArch64::EONWrs:
+ case AArch64::EONXrs:
+ case AArch64::EORWrs:
+ case AArch64::EORXrs:
+ case AArch64::ORNWrs:
+ case AArch64::ORNXrs:
+ case AArch64::ORRWrs:
+ case AArch64::ORRXrs:
+ Imm = MI.getOperand(3).getImm();
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
+ AArch64_AM::getShiftValue(Imm) < 4 &&
+ AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);
+
+ // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
+ // ORRXri, it is as cheap as MOV
+ case AArch64::MOVi32imm:
+ return canBeExpandedToORR(MI, 32);
+ case AArch64::MOVi64imm:
+ return canBeExpandedToORR(MI, 64);
+
+ // It is cheap to zero out registers if the subtarget has ZeroCycleZeroing
+ // feature.
+ case AArch64::FMOVS0:
+ case AArch64::FMOVD0:
+ return Subtarget.hasZeroCycleZeroing();
+ // A COPY counts only when it copies from WZR/XZR on such a subtarget.
+ case TargetOpcode::COPY:
+ return (Subtarget.hasZeroCycleZeroing() &&
+ (MI.getOperand(1).getReg() == AArch64::WZR ||
+ MI.getOperand(1).getReg() == AArch64::XZR));
+ }
+
+ // Every case above returns, so control never gets here.
+ llvm_unreachable("Unknown opcode to check as cheap as a move!");
+}
+
+/// Recognise a 32 -> 64 bit sign or zero extension. On a match, \p SrcReg and
+/// \p DstReg receive the extension's source and destination registers,
+/// \p SubIdx is set to sub_32, and true is returned. The outputs are left
+/// untouched otherwise.
+bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                             unsigned &SrcReg, unsigned &DstReg,
+                                             unsigned &SubIdx) const {
+  // Only SBFMXri (aka sxtw) and UBFMXri (aka uxtw) are candidates.
+  const unsigned Opc = MI.getOpcode();
+  if (Opc != AArch64::SBFMXri && Opc != AArch64::UBFMXri)
+    return false;
+
+  // These instructions can do much more than extend; insist on the exact
+  // immediates (0, 31) of the 32 -> 64 bit extension.
+  if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
+    return false;
+
+  // This is a signed or unsigned 32 -> 64 bit extension.
+  SrcReg = MI.getOperand(1).getReg();
+  DstReg = MI.getOperand(0).getReg();
+  SubIdx = AArch64::sub_32;
+  return true;
+}
+
+/// Return true if the memory accesses of \p MIa and \p MIb provably do not
+/// overlap: both must decompose into base register + immediate offset + width,
+/// share the same base register, and the lower access must end at or before
+/// the start of the higher one. Instructions with unmodeled side effects or
+/// ordered memory references are never considered disjoint.
+bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
+    MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned BaseRegA = 0, BaseRegB = 0;
+  int64_t OffsetA = 0, OffsetB = 0;
+  unsigned WidthA = 0, WidthB = 0;
+
+  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
+  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
+
+  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+    return false;
+
+  // Retrieve the base register, offset from the base register and width. Width
+  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).  If
+  // base registers are identical, and the offset of a lower memory access +
+  // the width doesn't overlap the offset of a higher memory access,
+  // then the memory accesses are different.
+  if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
+      getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
+    if (BaseRegA == BaseRegB) {
+      // Keep the arithmetic in 64 bits: the offsets are int64_t, and
+      // narrowing them to int would both lose range and make the
+      // "LowOffset == OffsetA" test compare a truncated value.
+      int64_t LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+      int64_t HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+      int64_t LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+      if (LowOffset + LowWidth <= HighOffset)
+        return true;
+    }
+  }
+  return false;
+}
+
+ /// analyzeCompare - Decompose a flag-setting compare-like instruction.
+ ///
+ /// For the ADDS/SUBS register forms (rr, rs, rx) both source registers are
+ /// reported in SrcReg and SrcReg2 and CmpValue is 0. For the ADDS/SUBS/ANDS
+ /// immediate forms SrcReg2 is 0 and CmpValue is 1 when the (decoded)
+ /// immediate is non-zero and 0 otherwise. CmpMask is always ~0.
+ /// Return true if the comparison instruction can be analyzed.
+ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                       unsigned &SrcReg2, int &CmpMask,
+                                       int &CmpValue) const {
+   const unsigned Opc = MI.getOpcode();
+   switch (Opc) {
+   case AArch64::ADDSWrr:
+   case AArch64::ADDSWrs:
+   case AArch64::ADDSWrx:
+   case AArch64::ADDSXrr:
+   case AArch64::ADDSXrs:
+   case AArch64::ADDSXrx:
+   case AArch64::SUBSWrr:
+   case AArch64::SUBSWrs:
+   case AArch64::SUBSWrx:
+   case AArch64::SUBSXrr:
+   case AArch64::SUBSXrs:
+   case AArch64::SUBSXrx:
+     // Two-register forms: report both sources, no immediate to compare with.
+     SrcReg = MI.getOperand(1).getReg();
+     SrcReg2 = MI.getOperand(2).getReg();
+     CmpMask = ~0;
+     CmpValue = 0;
+     return true;
+
+   case AArch64::ADDSWri:
+   case AArch64::ADDSXri:
+   case AArch64::SUBSWri:
+   case AArch64::SUBSXri:
+     SrcReg = MI.getOperand(1).getReg();
+     SrcReg2 = 0;
+     CmpMask = ~0;
+     // FIXME: In order to convert CmpValue to 0 or 1
+     CmpValue = MI.getOperand(2).getImm() != 0;
+     return true;
+
+   case AArch64::ANDSWri:
+   case AArch64::ANDSXri: {
+     // ANDS does not use the same encoding scheme as the other xxxS
+     // instructions.
+     SrcReg = MI.getOperand(1).getReg();
+     SrcReg2 = 0;
+     CmpMask = ~0;
+     // FIXME: The return type of decodeLogicalImmediate is uint64_t, while the
+     // type of CmpValue is int. When converting uint64_t to int, the high 32
+     // bits of uint64_t will be lost. In fact it causes a bug in
+     // spec2006-483.xalancbmk. CmpValue is only used to compare with zero in
+     // OptimizeCompareInstr, so it is reduced to 0 or 1 here.
+     const unsigned RegSize = (Opc == AArch64::ANDSWri) ? 32 : 64;
+     CmpValue = AArch64_AM::decodeLogicalImmediate(MI.getOperand(2).getImm(),
+                                                   RegSize) != 0;
+     return true;
+   }
+
+   default:
+     return false;
+   }
+ }
+
+ /// Make every register operand of \p Instr satisfy the register-class
+ /// constraint of its operand slot.
+ ///
+ /// Operands without a constraint and frame-index operands are skipped. A
+ /// physical register must already be a member of the required class; a
+ /// virtual register is constrained through MachineRegisterInfo if its current
+ /// class is not already a sub-class. Returns false as soon as one operand
+ /// cannot be made to fit, true otherwise.
+ static bool UpdateOperandRegClass(MachineInstr &Instr) {
+   MachineBasicBlock *MBB = Instr.getParent();
+   assert(MBB && "Can't get MachineBasicBlock here");
+   MachineFunction *MF = MBB->getParent();
+   assert(MF && "Can't get MachineFunction here");
+   MachineRegisterInfo *MRI = &MF->getRegInfo();
+   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+   const unsigned NumOps = Instr.getNumOperands();
+   for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
+     const TargetRegisterClass *Required =
+         Instr.getRegClassConstraint(Idx, TII, TRI);
+     // If there's no constraint, there's nothing to do.
+     if (!Required)
+       continue;
+
+     MachineOperand &MO = Instr.getOperand(Idx);
+     // A frame index operand will resolve correctly during PEI.
+     if (MO.isFI())
+       continue;
+
+     assert(MO.isReg() &&
+            "Operand has register constraints without being a register!");
+
+     const unsigned Reg = MO.getReg();
+     if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+       if (Required->contains(Reg))
+         continue;
+       return false;
+     }
+
+     // Virtual register: fine if already compatible, otherwise try to narrow.
+     if (Required->hasSubClassEq(MRI->getRegClass(Reg)))
+       continue;
+     if (!MRI->constrainRegClass(Reg, Required))
+       return false;
+   }
+
+   return true;
+ }
+
+ /// \brief Map a flag-setting ADDS/SUBS opcode to its non-flag-setting twin.
+ ///
+ /// The original opcode is returned when no conversion applies. The "ri" and
+ /// "rs" forms are deliberately left unchanged when \p MI defines WZR or XZR,
+ /// because for some instructions the zero register encoding becomes the sp
+ /// register. The caller is responsible for the actual substitution and for
+ /// legality checking.
+ static unsigned convertFlagSettingOpcode(const MachineInstr &MI) {
+   const bool DefinesZeroReg =
+       MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR);
+   const unsigned Opc = MI.getOpcode();
+
+   switch (Opc) {
+   // Forms that are always safe to convert.
+   case AArch64::ADDSWrr:
+     return AArch64::ADDWrr;
+   case AArch64::ADDSWrx:
+     return AArch64::ADDWrx;
+   case AArch64::ADDSXrr:
+     return AArch64::ADDXrr;
+   case AArch64::ADDSXrx:
+     return AArch64::ADDXrx;
+   case AArch64::SUBSWrr:
+     return AArch64::SUBWrr;
+   case AArch64::SUBSWrx:
+     return AArch64::SUBWrx;
+   case AArch64::SUBSXrr:
+     return AArch64::SUBXrr;
+   case AArch64::SUBSXrx:
+     return AArch64::SUBXrx;
+
+   // Forms that must stay flag-setting when the zero register is defined.
+   case AArch64::ADDSWri:
+     return DefinesZeroReg ? Opc : AArch64::ADDWri;
+   case AArch64::ADDSWrs:
+     return DefinesZeroReg ? Opc : AArch64::ADDWrs;
+   case AArch64::ADDSXri:
+     return DefinesZeroReg ? Opc : AArch64::ADDXri;
+   case AArch64::ADDSXrs:
+     return DefinesZeroReg ? Opc : AArch64::ADDXrs;
+   case AArch64::SUBSWri:
+     return DefinesZeroReg ? Opc : AArch64::SUBWri;
+   case AArch64::SUBSWrs:
+     return DefinesZeroReg ? Opc : AArch64::SUBWrs;
+   case AArch64::SUBSXri:
+     return DefinesZeroReg ? Opc : AArch64::SUBXri;
+   case AArch64::SUBSXrs:
+     return DefinesZeroReg ? Opc : AArch64::SUBXrs;
+
+   default:
+     return Opc;
+   }
+ }
+
+ /// Bit mask selecting which kinds of condition-flag access to look for.
+ enum AccessKind {
+   AK_Write = 0x01,             // the flags are written
+   AK_Read = 0x10,              // the flags are read
+   AK_All = AK_Write | AK_Read  // either kind of access (0x11)
+ };
+
+ /// True when condition flags are accessed (either by writing or reading)
+ /// on the instruction trace starting at From and ending at To.
+ ///
+ /// Only the instructions strictly between \p From and \p To are inspected;
+ /// neither endpoint itself is checked. \p AccessToCheck selects which kinds
+ /// of NZCV access count: AK_Write, AK_Read, or both (AK_All, the default).
+ ///
+ /// Note: If From and To are from different blocks it's assumed CC are accessed
+ /// on the path. The same conservative answer (true) is returned when To is
+ /// the first instruction of its block.
+ static bool areCFlagsAccessedBetweenInstrs(
+ MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
+ const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
+ // Early exit if To is at the beginning of the BB.
+ if (To == To->getParent()->begin())
+ return true;
+
+ // Check whether the instructions are in the same basic block
+ // If not, assume the condition flags might get modified somewhere.
+ if (To->getParent() != From->getParent())
+ return true;
+
+ // From must be above To.
+ // (Debug-only check: walk backwards from To and make sure From is reached
+ // before the start of the block.)
+ assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
+ [From](MachineInstr &MI) {
+ return MI.getIterator() == From;
+ }) != To->getParent()->rend());
+
+ // We iterate backward starting \p To until we hit \p From.
+ // The initial --To skips To itself; the loop stops before examining From.
+ for (--To; To != From; --To) {
+ const MachineInstr &Instr = *To;
+
+ // Report an access only for the kinds requested in AccessToCheck.
+ if ( ((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
+ ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
+ return true;
+ }
+ return false;
+ }
+
+ /// Try to optimize a compare instruction. A compare instruction is an
+ /// instruction which produces AArch64::NZCV. It is a true compare instruction
+ /// when there are no uses of its destination register.
+ ///
+ /// The following steps are tried in order:
+ /// 1. If the NZCV definition is dead, erase CmpInstr when it only defines the
+ /// zero register, otherwise convert it into the non-flag-setting version.
+ /// 2. Remove CmpInstr if above there is an instruction producing a needed
+ /// condition code or an instruction which can be converted into such an
+ /// instruction. Only comparison with zero is supported.
+ ///
+ /// Returns true if CmpInstr was erased or rewritten, false if nothing changed.
+ bool AArch64InstrInfo::optimizeCompareInstr(
+ MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ int CmpValue, const MachineRegisterInfo *MRI) const {
+ assert(CmpInstr.getParent());
+ assert(MRI);
+
+ // Replace SUBSWrr with SUBWrr if NZCV is not used.
+ // NOTE(review): the `true` argument presumably restricts the search to dead
+ // definitions (hence the name DeadNZCVIdx) - confirm against MachineInstr.
+ int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
+ if (DeadNZCVIdx != -1) {
+ // With the flags dead and the result going to WZR/XZR the instruction is
+ // simply erased.
+ if (CmpInstr.definesRegister(AArch64::WZR) ||
+ CmpInstr.definesRegister(AArch64::XZR)) {
+ CmpInstr.eraseFromParent();
+ return true;
+ }
+ unsigned Opc = CmpInstr.getOpcode();
+ unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
+ // No non-flag-setting equivalent is available: leave CmpInstr alone.
+ if (NewOpc == Opc)
+ return false;
+ // Switch to the new opcode, drop the dead NZCV operand and re-check the
+ // register classes against the new instruction description.
+ const MCInstrDesc &MCID = get(NewOpc);
+ CmpInstr.setDesc(MCID);
+ CmpInstr.RemoveOperand(DeadNZCVIdx);
+ bool succeeded = UpdateOperandRegClass(CmpInstr);
+ (void)succeeded;
+ assert(succeeded && "Some operands reg class are incompatible!");
+ return true;
+ }
+
+ // Continue only if we have a "ri" where immediate is zero.
+ // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare
+ // function.
+ assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
+ if (CmpValue != 0 || SrcReg2 != 0)
+ return false;
+
+ // CmpInstr is a Compare instruction if destination register is not used.
+ if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
+ return false;
+
+ return substituteCmpToZero(CmpInstr, SrcReg, MRI);
+ }
+
+ /// Return the flag-setting ("S") opcode corresponding to \p Instr.
+ ///
+ /// An instruction that already is one of the handled S forms maps to its own
+ /// opcode. AArch64::INSTRUCTION_LIST_END is returned if Instr does not have
+ /// an S version or we are not interested in it.
+ static unsigned sForm(MachineInstr &Instr) {
+   const unsigned Opc = Instr.getOpcode();
+   switch (Opc) {
+   // Already flag-setting: nothing to convert.
+   case AArch64::ADDSWrr:
+   case AArch64::ADDSWri:
+   case AArch64::ADDSXrr:
+   case AArch64::ADDSXri:
+   case AArch64::SUBSWrr:
+   case AArch64::SUBSWri:
+   case AArch64::SUBSXrr:
+   case AArch64::SUBSXri:
+     return Opc;
+
+   case AArch64::ADDWrr:
+     return AArch64::ADDSWrr;
+   case AArch64::ADDWri:
+     return AArch64::ADDSWri;
+   case AArch64::ADDXrr:
+     return AArch64::ADDSXrr;
+   case AArch64::ADDXri:
+     return AArch64::ADDSXri;
+   case AArch64::ADCWr:
+     return AArch64::ADCSWr;
+   case AArch64::ADCXr:
+     return AArch64::ADCSXr;
+   case AArch64::SUBWrr:
+     return AArch64::SUBSWrr;
+   case AArch64::SUBWri:
+     return AArch64::SUBSWri;
+   case AArch64::SUBXrr:
+     return AArch64::SUBSXrr;
+   case AArch64::SUBXri:
+     return AArch64::SUBSXri;
+   case AArch64::SBCWr:
+     return AArch64::SBCSWr;
+   case AArch64::SBCXr:
+     return AArch64::SBCSXr;
+   case AArch64::ANDWri:
+     return AArch64::ANDSWri;
+   case AArch64::ANDXri:
+     return AArch64::ANDSXri;
+
+   default:
+     return AArch64::INSTRUCTION_LIST_END;
+   }
+ }
+
+ /// Return true if any successor of \p MBB lists AArch64::NZCV as live-in.
+ static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
+   for (auto *Succ : MBB->successors()) {
+     if (!Succ->isLiveIn(AArch64::NZCV))
+       continue;
+     return true;
+   }
+   return false;
+ }
+
+ namespace {
+ /// Records which of the four condition flags (N, Z, C, V) are used.
+ /// All flags start out as unused.
+ struct UsedNZCV {
+   bool N = false;
+   bool Z = false;
+   bool C = false;
+   bool V = false;
+
+   UsedNZCV() = default;
+
+   /// Merge \p UsedFlags into this set (flag-wise OR) and return *this.
+   UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
+     N = N || UsedFlags.N;
+     Z = Z || UsedFlags.Z;
+     C = C || UsedFlags.C;
+     V = V || UsedFlags.V;
+     return *this;
+   }
+ };
+ } // end anonymous namespace
+
+ /// Extract the condition code that \p Instr evaluates.
+ ///
+ /// The condition code is read from the immediate operand located a fixed
+ /// distance before the NZCV use operand: two slots for Bcc, one slot for the
+ /// conditional-select family. Returns AArch64CC::Invalid if either the
+ /// instruction does not use condition codes or we don't optimize CmpInstr in
+ /// the presence of such instructions.
+ static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
+   switch (Instr.getOpcode()) {
+   case AArch64::Bcc: {
+     const int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+     assert(Idx >= 2);
+     const MachineOperand &CondOp = Instr.getOperand(Idx - 2);
+     return static_cast<AArch64CC::CondCode>(CondOp.getImm());
+   }
+
+   case AArch64::CSELWr:
+   case AArch64::CSELXr:
+   case AArch64::CSINCWr:
+   case AArch64::CSINCXr:
+   case AArch64::CSINVWr:
+   case AArch64::CSINVXr:
+   case AArch64::CSNEGWr:
+   case AArch64::CSNEGXr:
+   case AArch64::FCSELSrrr:
+   case AArch64::FCSELDrrr: {
+     const int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+     assert(Idx >= 1);
+     const MachineOperand &CondOp = Instr.getOperand(Idx - 1);
+     return static_cast<AArch64CC::CondCode>(CondOp.getImm());
+   }
+
+   default:
+     return AArch64CC::Invalid;
+   }
+ }
+
+ /// Return the set of NZCV flags that condition code \p CC depends on.
+ /// Condition codes not listed below yield an empty set. \p CC must not be
+ /// AArch64CC::Invalid.
+ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
+   assert(CC != AArch64CC::Invalid);
+   UsedNZCV Flags;
+   switch (CC) {
+   case AArch64CC::EQ: // Z set
+   case AArch64CC::NE: // Z clear
+     Flags.Z = true;
+     break;
+
+   case AArch64CC::HI: // Z clear and C set
+   case AArch64CC::LS: // Z set or C clear
+     Flags.Z = true;
+     Flags.C = true;
+     break;
+
+   case AArch64CC::HS: // C set
+   case AArch64CC::LO: // C clear
+     Flags.C = true;
+     break;
+
+   case AArch64CC::MI: // N set
+   case AArch64CC::PL: // N clear
+     Flags.N = true;
+     break;
+
+   case AArch64CC::VS: // V set
+   case AArch64CC::VC: // V clear
+     Flags.V = true;
+     break;
+
+   case AArch64CC::GT: // Z clear, N and V the same
+   case AArch64CC::LE: // Z set, N and V differ
+     Flags.Z = true;
+     Flags.N = true;
+     Flags.V = true;
+     break;
+
+   case AArch64CC::GE: // N and V the same
+   case AArch64CC::LT: // N and V differ
+     Flags.N = true;
+     Flags.V = true;
+     break;
+
+   default:
+     break;
+   }
+   return Flags;
+ }
+
+ /// True for the flag-setting add-immediate opcodes (ADDSWri / ADDSXri).
+ static bool isADDSRegImm(unsigned Opcode) {
+   switch (Opcode) {
+   case AArch64::ADDSWri:
+   case AArch64::ADDSXri:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// True for the flag-setting subtract-immediate opcodes (SUBSWri / SUBSXri).
+ static bool isSUBSRegImm(unsigned Opcode) {
+   switch (Opcode) {
+   case AArch64::SUBSWri:
+   case AArch64::SUBSXri:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// Check if CmpInstr can be substituted by MI.
+ ///
+ /// CmpInstr can be substituted:
+ /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
+ /// - and, MI and CmpInstr are from the same MachineBB
+ /// - and, condition flags are not alive in successors of the CmpInstr parent
+ /// - and, if MI opcode is the S form there must be no defs of flags between
+ /// MI and CmpInstr
+ /// or if MI opcode is not the S form there must be neither defs of flags
+ /// nor uses of flags between MI and CmpInstr.
+ /// - and C/V flags are not used after CmpInstr
+ ///
+ /// Note: this function only checks the opcode of CmpInstr; that the immediate
+ /// is zero is not verified here.
+ /// \p MI must have an S form (sForm(*MI) != INSTRUCTION_LIST_END).
+ static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
+ const TargetRegisterInfo *TRI) {
+ assert(MI);
+ assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
+ assert(CmpInstr);
+
+ const unsigned CmpOpcode = CmpInstr->getOpcode();
+ if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
+ return false;
+
+ if (MI->getParent() != CmpInstr->getParent())
+ return false;
+
+ if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
+ return false;
+
+ // If MI is already in S form, only intervening writes of the flags matter.
+ // Otherwise intervening reads must be rejected as well.
+ AccessKind AccessToCheck = AK_Write;
+ if (sForm(*MI) != MI->getOpcode())
+ AccessToCheck = AK_All;
+ if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
+ return false;
+
+ // Collect the flags read by the instructions that follow CmpInstr, up to and
+ // including the first instruction that modifies NZCV (or the block end).
+ UsedNZCV NZCVUsedAfterCmp;
+ for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end();
+ I != E; ++I) {
+ const MachineInstr &Instr = *I;
+ if (Instr.readsRegister(AArch64::NZCV, TRI)) {
+ AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
+ if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
+ return false;
+ NZCVUsedAfterCmp |= getUsedNZCV(CC);
+ }
+
+ // The read check above runs first, so an instruction that both reads and
+ // modifies NZCV still has its read accounted for.
+ if (Instr.modifiesRegister(AArch64::NZCV, TRI))
+ break;
+ }
+
+ // Substitution is only allowed when neither C nor V is consumed afterwards.
+ return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
+ }
+
+ /// Substitute an instruction comparing to zero with another instruction
+ /// which produces needed condition flags.
+ ///
+ /// The unique definition of \p SrcReg is switched to its flag-setting (S)
+ /// form and \p CmpInstr is erased, provided canInstrSubstituteCmpInstr
+ /// accepts the pair.
+ ///
+ /// Return true on success.
+ bool AArch64InstrInfo::substituteCmpToZero(
+ MachineInstr &CmpInstr, unsigned SrcReg,
+ const MachineRegisterInfo *MRI) const {
+ assert(MRI);
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI)
+ return false;
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // Give up if the defining instruction has no S form we care about.
+ unsigned NewOpc = sForm(*MI);
+ if (NewOpc == AArch64::INSTRUCTION_LIST_END)
+ return false;
+
+ if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
+ return false;
+
+ // Update the instruction to set NZCV.
+ MI->setDesc(get(NewOpc));
+ // CmpInstr must not be touched after this point.
+ CmpInstr.eraseFromParent();
+ bool succeeded = UpdateOperandRegClass(*MI);
+ (void)succeeded;
+ assert(succeeded && "Some operands reg class are incompatible!");
+ MI->addRegisterDefined(AArch64::NZCV, TRI);
+ return true;
+ }
+
+ /// Expand the LOAD_STACK_GUARD pseudo into real instructions.
+ ///
+ /// Any other opcode is left alone and false is returned. For
+ /// LOAD_STACK_GUARD the global taken from the first memory operand is
+ /// materialized into the destination register, using a sequence chosen from
+ /// the global's reference classification and the code model, and then loaded
+ /// through with LDRXui. The pseudo is erased and true is returned.
+ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
+ return false;
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Reg = MI.getOperand(0).getReg();
+ // The guard variable is taken from the pseudo's first memory operand.
+ const GlobalValue *GV =
+ cast<GlobalValue>((*MI.memoperands_begin())->getValue());
+ const TargetMachine &TM = MBB.getParent()->getTarget();
+ unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
+ const unsigned char MO_NC = AArch64II::MO_NC;
+
+ if ((OpFlags & AArch64II::MO_GOT) != 0) {
+ // GOT reference: LOADgot into Reg, then load through it.
+ BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin());
+ } else if (TM.getCodeModel() == CodeModel::Large) {
+ // Large code model: build the address 16 bits at a time with one MOVZ
+ // (G3, shift 48) followed by three MOVKs (G2/G1/G0), then load through it.
+ BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
+ BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
+ BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
+ BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin());
+ } else {
+ // Default: ADRP for the page, then LDRXui with the page-offset reference.
+ BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
+ .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
+ unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, LoFlags)
+ .addMemOperand(*MI.memoperands_begin());
+ }
+
+ // The replacement sequence was inserted before MI; now drop the pseudo.
+ MBB.erase(MI);
+
+ return true;
+ }
+
+ /// Return true if \p MI is one of the opcodes listed below and its operand 3
+ /// is a non-zero immediate.
+ ///
+ /// NOTE(review): for the shifted-register ("rs") forms operand 3 is presumably
+ /// the shift specifier - confirm against the instruction definitions. The
+ /// CRC32*rr opcodes are kept in the list for compatibility, but they may not
+ /// carry a fourth operand at all, so the operand count is checked before
+ /// operand 3 is accessed.
+ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
+   switch (MI.getOpcode()) {
+   default:
+     return false;
+   case AArch64::ADDSWrs:
+   case AArch64::ADDSXrs:
+   case AArch64::ADDWrs:
+   case AArch64::ADDXrs:
+   case AArch64::ANDSWrs:
+   case AArch64::ANDSXrs:
+   case AArch64::ANDWrs:
+   case AArch64::ANDXrs:
+   case AArch64::BICSWrs:
+   case AArch64::BICSXrs:
+   case AArch64::BICWrs:
+   case AArch64::BICXrs:
+   case AArch64::CRC32Brr:
+   case AArch64::CRC32CBrr:
+   case AArch64::CRC32CHrr:
+   case AArch64::CRC32CWrr:
+   case AArch64::CRC32CXrr:
+   case AArch64::CRC32Hrr:
+   case AArch64::CRC32Wrr:
+   case AArch64::CRC32Xrr:
+   case AArch64::EONWrs:
+   case AArch64::EONXrs:
+   case AArch64::EORWrs:
+   case AArch64::EORXrs:
+   case AArch64::ORNWrs:
+   case AArch64::ORNXrs:
+   case AArch64::ORRWrs:
+   case AArch64::ORRXrs:
+   case AArch64::SUBSWrs:
+   case AArch64::SUBSXrs:
+   case AArch64::SUBWrs:
+   case AArch64::SUBXrs:
+     break;
+   }
+
+   // Never index past the operand list.
+   if (MI.getNumOperands() <= 3)
+     return false;
+
+   // Test the immediate at full width; narrowing it to unsigned first could
+   // turn a non-zero value into zero.
+   const MachineOperand &ImmOp = MI.getOperand(3);
+   return ImmOp.isImm() && ImmOp.getImm() != 0;
+ }
+
+ /// Return true if \p MI is one of the extended-register ("rx"/"rx64")
+ /// ADD/SUB opcodes listed below and its operand 3 is a non-zero immediate.
+ ///
+ /// NOTE(review): operand 3 is presumably the extend specifier - confirm
+ /// against the instruction definitions.
+ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const {
+   switch (MI.getOpcode()) {
+   default:
+     return false;
+   case AArch64::ADDSWrx:
+   case AArch64::ADDSXrx:
+   case AArch64::ADDSXrx64:
+   case AArch64::ADDWrx:
+   case AArch64::ADDXrx:
+   case AArch64::ADDXrx64:
+   case AArch64::SUBSWrx:
+   case AArch64::SUBSXrx:
+   case AArch64::SUBSXrx64:
+   case AArch64::SUBWrx:
+   case AArch64::SUBXrx:
+   case AArch64::SUBXrx64:
+     break;
+   }
+
+   // Test the immediate at full width; narrowing it to unsigned first could
+   // turn a non-zero value into zero.
+   const MachineOperand &ImmOp = MI.getOperand(3);
+   return ImmOp.isImm() && ImmOp.getImm() != 0;
+ }
+
+ // Return true if this instruction simply sets its single destination register
+ // to zero. This is equivalent to a register rename of the zero-register.
+ //
+ // Recognised forms: MOVZ with a zero immediate, AND-immediate whose source is
+ // the matching zero register, and COPY whose source is WZR.
+ bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const {
+   switch (MI.getOpcode()) {
+   case AArch64::MOVZWi:
+   case AArch64::MOVZXi: { // movz Rd, #0 (LSL #0)
+     const MachineOperand &ImmOp = MI.getOperand(1);
+     if (!ImmOp.isImm() || ImmOp.getImm() != 0)
+       break;
+     assert(MI.getDesc().getNumOperands() == 3 &&
+            MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
+     return true;
+   }
+   case AArch64::ANDWri: // and Rd, Rzr, #imm
+     return MI.getOperand(1).getReg() == AArch64::WZR;
+   case AArch64::ANDXri:
+     return MI.getOperand(1).getReg() == AArch64::XZR;
+   case TargetOpcode::COPY:
+     return MI.getOperand(1).getReg() == AArch64::WZR;
+   default:
+     break;
+   }
+   return false;
+ }
+
+ // Return true if this instruction simply renames a general register without
+ // modifying bits.
+ //
+ // Recognised forms: COPY into a GPR32/GPR64 register, ORRXrs whose first
+ // source is XZR, and ADDXri with a zero immediate.
+ bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const {
+   switch (MI.getOpcode()) {
+   case TargetOpcode::COPY: {
+     // GPR32 copies will be lowered to ORRXrs
+     const unsigned DstReg = MI.getOperand(0).getReg();
+     if (AArch64::GPR32RegClass.contains(DstReg))
+       return true;
+     return AArch64::GPR64RegClass.contains(DstReg);
+   }
+   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
+     if (MI.getOperand(1).getReg() != AArch64::XZR)
+       break;
+     assert(MI.getDesc().getNumOperands() == 4 &&
+            MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
+     return true;
+   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
+     if (MI.getOperand(2).getImm() != 0)
+       break;
+     assert(MI.getDesc().getNumOperands() == 4 &&
+            MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
+     return true;
+   default:
+     break;
+   }
+   return false;
+ }
+
+ // Return true if this instruction simply renames a floating-point/vector
+ // register without modifying bits.
+ //
+ // Recognised forms: COPY into an FPR64/FPR128 register, and ORRv16i8 whose
+ // two source operands are the same register.
+ bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const {
+   switch (MI.getOpcode()) {
+   case TargetOpcode::COPY: {
+     // FPR64 copies will be lowered to ORR.16b
+     const unsigned DstReg = MI.getOperand(0).getReg();
+     if (AArch64::FPR64RegClass.contains(DstReg))
+       return true;
+     return AArch64::FPR128RegClass.contains(DstReg);
+   }
+   case AArch64::ORRv16i8:
+     if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg())
+       break;
+     assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
+            "invalid ORRv16i8 operands");
+     return true;
+   default:
+     break;
+   }
+   return false;
+ }
+
+ /// If \p MI is one of the LDR*ui loads listed below, with no sub-register
+ /// index on operand 0, a frame index as operand 1 and a zero immediate as
+ /// operand 2, store that frame index in \p FrameIndex and return the register
+ /// of operand 0. Otherwise return 0 and leave \p FrameIndex untouched.
+ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                                int &FrameIndex) const {
+   switch (MI.getOpcode()) {
+   case AArch64::LDRWui:
+   case AArch64::LDRXui:
+   case AArch64::LDRBui:
+   case AArch64::LDRHui:
+   case AArch64::LDRSui:
+   case AArch64::LDRDui:
+   case AArch64::LDRQui:
+     break;
+   default:
+     return 0;
+   }
+
+   if (MI.getOperand(0).getSubReg() != 0 || !MI.getOperand(1).isFI() ||
+       !MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0)
+     return 0;
+
+   FrameIndex = MI.getOperand(1).getIndex();
+   return MI.getOperand(0).getReg();
+ }
+
+ /// If \p MI is one of the STR*ui stores listed below, with no sub-register
+ /// index on operand 0, a frame index as operand 1 and a zero immediate as
+ /// operand 2, store that frame index in \p FrameIndex and return the register
+ /// of operand 0. Otherwise return 0 and leave \p FrameIndex untouched.
+ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                               int &FrameIndex) const {
+   switch (MI.getOpcode()) {
+   case AArch64::STRWui:
+   case AArch64::STRXui:
+   case AArch64::STRBui:
+   case AArch64::STRHui:
+   case AArch64::STRSui:
+   case AArch64::STRDui:
+   case AArch64::STRQui:
+     break;
+   default:
+     return 0;
+   }
+
+   if (MI.getOperand(0).getSubReg() != 0 || !MI.getOperand(1).isFI() ||
+       !MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0)
+     return 0;
+
+   FrameIndex = MI.getOperand(1).getIndex();
+   return MI.getOperand(0).getReg();
+ }
+
+ /// Return true if this load/store scales or extends its register offset.
+ /// This refers to scaling a dynamic index as opposed to scaled immediates.
+ /// MI should be a memory op that allows scaled addressing; any opcode outside
+ /// the register-offset (roW / roX) list below yields false.
+ bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const {
+   switch (MI.getOpcode()) {
+   // Register offset, W index register.
+   case AArch64::LDRBBroW:
+   case AArch64::LDRBroW:
+   case AArch64::LDRDroW:
+   case AArch64::LDRHHroW:
+   case AArch64::LDRHroW:
+   case AArch64::LDRQroW:
+   case AArch64::LDRSBWroW:
+   case AArch64::LDRSBXroW:
+   case AArch64::LDRSHWroW:
+   case AArch64::LDRSHXroW:
+   case AArch64::LDRSWroW:
+   case AArch64::LDRSroW:
+   case AArch64::LDRWroW:
+   case AArch64::LDRXroW:
+   case AArch64::STRBBroW:
+   case AArch64::STRBroW:
+   case AArch64::STRDroW:
+   case AArch64::STRHHroW:
+   case AArch64::STRHroW:
+   case AArch64::STRQroW:
+   case AArch64::STRSroW:
+   case AArch64::STRWroW:
+   case AArch64::STRXroW:
+   // Register offset, X index register.
+   case AArch64::LDRBBroX:
+   case AArch64::LDRBroX:
+   case AArch64::LDRDroX:
+   case AArch64::LDRHHroX:
+   case AArch64::LDRHroX:
+   case AArch64::LDRQroX:
+   case AArch64::LDRSBWroX:
+   case AArch64::LDRSBXroX:
+   case AArch64::LDRSHWroX:
+   case AArch64::LDRSHXroX:
+   case AArch64::LDRSWroX:
+   case AArch64::LDRSroX:
+   case AArch64::LDRWroX:
+   case AArch64::LDRXroX:
+   case AArch64::STRBBroX:
+   case AArch64::STRBroX:
+   case AArch64::STRDroX:
+   case AArch64::STRHHroX:
+   case AArch64::STRHroX:
+   case AArch64::STRQroX:
+   case AArch64::STRSroX:
+   case AArch64::STRWroX:
+   case AArch64::STRXroX:
+     break;
+   default:
+     return false;
+   }
+
+   // Operand 3 encodes the extend type and whether the index is shifted.
+   const unsigned Encoded = MI.getOperand(3).getImm();
+   if (AArch64_AM::getMemExtendType(Encoded) != AArch64_AM::UXTX)
+     return true;
+   return AArch64_AM::getMemDoShift(Encoded);
+ }
+
+ /// Return true if any MachineMemOperand of \p MI carries the MOSuppressPair
+ /// flag, i.e. the hint to suppress pairing.
+ bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const {
+   for (MachineMemOperand *MMO : MI.memoperands()) {
+     if (MMO->getFlags() & MOSuppressPair)
+       return true;
+   }
+   return false;
+ }
+
+ /// Mark \p MI as unsuitable for pairing by setting MOSuppressPair on its first
+ /// MachineMemOperand. Does nothing if \p MI has no memory operands.
+ void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const {
+   if (!MI.memoperands_empty())
+     (*MI.memoperands_begin())->setFlags(MOSuppressPair);
+ }
+
+ /// Return true if \p Opc is one of the unscaled-offset (LDUR*/STUR*) load or
+ /// store opcodes listed below.
+ bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
+   switch (Opc) {
+   // Unscaled stores.
+   case AArch64::STURSi:
+   case AArch64::STURDi:
+   case AArch64::STURQi:
+   case AArch64::STURBBi:
+   case AArch64::STURHHi:
+   case AArch64::STURWi:
+   case AArch64::STURXi:
+   // Unscaled loads.
+   case AArch64::LDURSi:
+   case AArch64::LDURDi:
+   case AArch64::LDURQi:
+   case AArch64::LDURWi:
+   case AArch64::LDURXi:
+   case AArch64::LDURSWi:
+   case AArch64::LDURHHi:
+   case AArch64::LDURBBi:
+   case AArch64::LDURSBWi:
+   case AArch64::LDURSHWi:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// Convenience overload: classify \p MI by its opcode.
+ bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   return isUnscaledLdSt(Opc);
+ }
+
+ // Is this a candidate for ld/st merging or pairing? For example, we don't
+ // touch volatiles or load/stores that have a hint to avoid pair formation.
+ //
+ // Rejected are: ordered memory references, operand 2 not being an immediate,
+ // instructions that modify their own base register, instructions flagged to
+ // suppress pairing, and quad-register accesses on subtargets that prefer to
+ // avoid quad load/store pairs.
+ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
+   // If this is a volatile load/store, don't mess with it.
+   if (MI.hasOrderedMemoryRef())
+     return false;
+
+   // Make sure this is a reg+imm (as opposed to an address reloc).
+   assert(MI.getOperand(1).isReg() && "Expected a reg operand.");
+   if (!MI.getOperand(2).isImm())
+     return false;
+
+   // Can't merge/pair if the instruction modifies the base register.
+   // e.g., ldr x0, [x0]
+   const TargetRegisterInfo *TRI = &getRegisterInfo();
+   const unsigned BaseReg = MI.getOperand(1).getReg();
+   if (MI.modifiesRegister(BaseReg, TRI))
+     return false;
+
+   // Check if this load/store has a hint to avoid pair formation.
+   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+   if (isLdStPairSuppressed(MI))
+     return false;
+
+   // On some CPUs quad load/store pairs are slower than two single load/stores.
+   if (!Subtarget.avoidQuadLdStPairs())
+     return true;
+
+   const unsigned Opc = MI.getOpcode();
+   const bool IsQuadAccess = Opc == AArch64::LDURQi || Opc == AArch64::STURQi ||
+                             Opc == AArch64::LDRQui || Opc == AArch64::STRQui;
+   return !IsQuadAccess;
+ }
+
+ /// Width-less variant of getMemOpBaseRegImmOfsWidth: forwards to it and
+ /// discards the access width it reports.
+ bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
+     MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset,
+     const TargetRegisterInfo *TRI) const {
+   unsigned IgnoredWidth;
+   const bool Decomposed =
+       getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, IgnoredWidth, TRI);
+   return Decomposed;
+ }
+
+ /// Decompose a load/store into base register, byte offset and access width.
+ ///
+ /// Only instructions of the form "base register + immediate" whose opcode is
+ /// listed in the switch below are handled; everything else returns false.
+ /// On success:
+ /// - BaseReg is operand 1 (non-paired, 3 explicit operands) or operand 2
+ /// (paired, 4 explicit operands),
+ /// - Offset is the immediate operand multiplied by the opcode's scale,
+ /// - Width is the number of bytes accessed, as assigned in the switch.
+ /// \p TRI is not used by this function.
+ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
+ MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
+ assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
+ // Handle only loads/stores with base register followed by immediate offset.
+ if (LdSt.getNumExplicitOperands() == 3) {
+ // Non-paired instruction (e.g., ldr x1, [x0, #8]).
+ if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm())
+ return false;
+ } else if (LdSt.getNumExplicitOperands() == 4) {
+ // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
+ if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() ||
+ !LdSt.getOperand(3).isImm())
+ return false;
+ } else
+ return false;
+
+ // Offset is calculated as the immediate operand multiplied by the scaling factor.
+ // Unscaled instructions have scaling factor set to 1.
+ unsigned Scale = 0;
+ switch (LdSt.getOpcode()) {
+ default:
+ return false;
+ // Unscaled (LDUR*/STUR*) forms: Scale is 1, Width is the access size.
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ Width = 16;
+ Scale = 1;
+ break;
+ case AArch64::LDURXi:
+ case AArch64::LDURDi:
+ case AArch64::STURXi:
+ case AArch64::STURDi:
+ Width = 8;
+ Scale = 1;
+ break;
+ case AArch64::LDURWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURSWi:
+ case AArch64::STURWi:
+ case AArch64::STURSi:
+ Width = 4;
+ Scale = 1;
+ break;
+ case AArch64::LDURHi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSHWi:
+ case AArch64::STURHi:
+ case AArch64::STURHHi:
+ Width = 2;
+ Scale = 1;
+ break;
+ case AArch64::LDURBi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSBWi:
+ case AArch64::STURBi:
+ case AArch64::STURBBi:
+ Width = 1;
+ Scale = 1;
+ break;
+ // Paired forms: Scale is the size of one element, Width covers both.
+ // Scaled single forms: Scale and Width are both the access size.
+ case AArch64::LDPQi:
+ case AArch64::LDNPQi:
+ case AArch64::STPQi:
+ case AArch64::STNPQi:
+ Scale = 16;
+ Width = 32;
+ break;
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ Scale = Width = 16;
+ break;
+ case AArch64::LDPXi:
+ case AArch64::LDPDi:
+ case AArch64::LDNPXi:
+ case AArch64::LDNPDi:
+ case AArch64::STPXi:
+ case AArch64::STPDi:
+ case AArch64::STNPXi:
+ case AArch64::STNPDi:
+ Scale = 8;
+ Width = 16;
+ break;
+ case AArch64::LDRXui:
+ case AArch64::LDRDui:
+ case AArch64::STRXui:
+ case AArch64::STRDui:
+ Scale = Width = 8;
+ break;
+ case AArch64::LDPWi:
+ case AArch64::LDPSi:
+ case AArch64::LDNPWi:
+ case AArch64::LDNPSi:
+ case AArch64::STPWi:
+ case AArch64::STPSi:
+ case AArch64::STNPWi:
+ case AArch64::STNPSi:
+ Scale = 4;
+ Width = 8;
+ break;
+ case AArch64::LDRWui:
+ case AArch64::LDRSui:
+ case AArch64::LDRSWui:
+ case AArch64::STRWui:
+ case AArch64::STRSui:
+ Scale = Width = 4;
+ break;
+ case AArch64::LDRHui:
+ case AArch64::LDRHHui:
+ case AArch64::STRHui:
+ case AArch64::STRHHui:
+ Scale = Width = 2;
+ break;
+ case AArch64::LDRBui:
+ case AArch64::LDRBBui:
+ case AArch64::STRBui:
+ case AArch64::STRBBui:
+ Scale = Width = 1;
+ break;
+ }
+
+ // Pick base register and immediate according to the operand layout that was
+ // validated at the top of the function.
+ if (LdSt.getNumExplicitOperands() == 3) {
+ BaseReg = LdSt.getOperand(1).getReg();
+ Offset = LdSt.getOperand(2).getImm() * Scale;
+ } else {
+ assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
+ BaseReg = LdSt.getOperand(2).getReg();
+ Offset = LdSt.getOperand(3).getImm() * Scale;
+ }
+ return true;
+ }
+
+ // Convert the byte offset of an unscaled load/store into the "element" offset
+ // used by the scaled pair load/store instructions. Returns false, leaving
+ // Offset untouched, when the opcode is not handled or the byte offset is not
+ // a multiple of the access size.
+ static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+   unsigned Stride;
+   switch (Opc) {
+   case AArch64::LDURQi:
+   case AArch64::STURQi:
+     Stride = 16;
+     break;
+   case AArch64::LDURXi:
+   case AArch64::LDURDi:
+   case AArch64::STURXi:
+   case AArch64::STURDi:
+     Stride = 8;
+     break;
+   case AArch64::LDURWi:
+   case AArch64::LDURSi:
+   case AArch64::LDURSWi:
+   case AArch64::STURWi:
+   case AArch64::STURSi:
+     Stride = 4;
+     break;
+   default:
+     return false;
+   }
+
+   // A byte offset that isn't a multiple of the stride can't be scaled.
+   const bool IsMultiple = (Offset % Stride == 0);
+   if (!IsMultiple)
+     return false;
+
+   Offset /= Stride;
+   return true;
+ }
+
+ /// Return true if the two opcodes may form a load/store pair: either they are
+ /// identical, or one is a 32-bit load (LDRWui / LDURWi) and the other its
+ /// sign-extending counterpart (LDRSWui / LDURSWi).
+ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
+   if (FirstOpc == SecondOpc)
+     return true;
+
+   // We can also pair sign-ext and zero-ext instructions.
+   const bool FirstIsZExt =
+       FirstOpc == AArch64::LDRWui || FirstOpc == AArch64::LDURWi;
+   if (FirstIsZExt)
+     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
+
+   const bool FirstIsSExt =
+       FirstOpc == AArch64::LDRSWui || FirstOpc == AArch64::LDURSWi;
+   if (FirstIsSExt)
+     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+
+   // These instructions can't be paired based on their opcodes.
+   return false;
+ }
+
+ /// Detect opportunities for ldp/stp formation.
+ ///
+ /// Returns true when the two memory operations are pairable by opcode, are
+ /// both merge/pair candidates, and their element offsets are adjacent with
+ /// the first one inside the [-64, 63] range of the pair instructions.
+ ///
+ /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
+ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+                                            MachineInstr &SecondLdSt,
+                                            unsigned NumLoads) const {
+   // Only cluster up to a single pair.
+   if (NumLoads > 1)
+     return false;
+
+   if (!(isPairableLdStInst(FirstLdSt) && isPairableLdStInst(SecondLdSt)))
+     return false;
+
+   // Can we pair these instructions based on their opcodes?
+   const unsigned OpcA = FirstLdSt.getOpcode();
+   const unsigned OpcB = SecondLdSt.getOpcode();
+   if (!canPairLdStOpc(OpcA, OpcB))
+     return false;
+
+   // Can't merge volatiles or load/stores that have a hint to avoid pair
+   // formation, for example.
+   if (!(isCandidateToMergeOrPair(FirstLdSt) &&
+         isCandidateToMergeOrPair(SecondLdSt)))
+     return false;
+
+   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
+   // Unscaled byte offsets are converted to element offsets first.
+   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
+   if (isUnscaledLdSt(OpcA) && !scaleOffset(OpcA, Offset1))
+     return false;
+
+   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
+   if (isUnscaledLdSt(OpcB) && !scaleOffset(OpcB, Offset2))
+     return false;
+
+   // Pairwise instructions have a 7-bit signed offset field.
+   if (Offset1 < -64 || Offset1 > 63)
+     return false;
+
+   // The caller should already have ordered First/SecondLdSt by offset.
+   assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+   return Offset2 == Offset1 + 1;
+ }
+
+ /// Report whether \p First and \p Second form a pair that the subtarget can
+ /// fuse, and should therefore be scheduled back to back.
+ ///
+ /// Two patterns are recognised, each gated on a subtarget feature:
+ ///  - a flag-setting ALU instruction followed by Bcc
+ ///    (hasArithmeticBccFusion), and
+ ///  - an ALU instruction followed by CBZ/CBNZ (hasArithmeticCbzFusion).
+ ///
+ /// For the shifted-register ("rs") ALU forms the pair is only reported when
+ /// the ALU instruction's shift amount is zero, i.e. when it behaves like the
+ /// plain register-register form.
+ bool AArch64InstrInfo::shouldScheduleAdjacent(
+     const MachineInstr &First, const MachineInstr &Second) const {
+   if (Subtarget.hasArithmeticBccFusion()) {
+     // Fuse CMN, CMP, TST followed by Bcc.
+     unsigned SecondOpcode = Second.getOpcode();
+     if (SecondOpcode == AArch64::Bcc) {
+       switch (First.getOpcode()) {
+       default:
+         return false;
+       case AArch64::ADDSWri:
+       case AArch64::ADDSWrr:
+       case AArch64::ADDSXri:
+       case AArch64::ADDSXrr:
+       case AArch64::ANDSWri:
+       case AArch64::ANDSWrr:
+       case AArch64::ANDSXri:
+       case AArch64::ANDSXrr:
+       case AArch64::SUBSWri:
+       case AArch64::SUBSWrr:
+       case AArch64::SUBSXri:
+       case AArch64::SUBSXrr:
+       case AArch64::BICSWrr:
+       case AArch64::BICSXrr:
+         return true;
+       case AArch64::ADDSWrs:
+       case AArch64::ADDSXrs:
+       case AArch64::ANDSWrs:
+       case AArch64::ANDSXrs:
+       case AArch64::SUBSWrs:
+       case AArch64::SUBSXrs:
+       case AArch64::BICSWrs:
+       case AArch64::BICSXrs:
+         // Shift value can be 0 making these behave like the "rr" variant...
+         // The shift belongs to the ALU instruction (First); Second is the
+         // branch and carries no shift operand to inspect.
+         return !hasShiftedReg(First);
+       }
+     }
+   }
+   if (Subtarget.hasArithmeticCbzFusion()) {
+     // Fuse ALU operations followed by CBZ/CBNZ.
+     unsigned SecondOpcode = Second.getOpcode();
+     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+         SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
+       switch (First.getOpcode()) {
+       default:
+         return false;
+       case AArch64::ADDWri:
+       case AArch64::ADDWrr:
+       case AArch64::ADDXri:
+       case AArch64::ADDXrr:
+       case AArch64::ANDWri:
+       case AArch64::ANDWrr:
+       case AArch64::ANDXri:
+       case AArch64::ANDXrr:
+       case AArch64::EORWri:
+       case AArch64::EORWrr:
+       case AArch64::EORXri:
+       case AArch64::EORXrr:
+       case AArch64::ORRWri:
+       case AArch64::ORRWrr:
+       case AArch64::ORRXri:
+       case AArch64::ORRXrr:
+       case AArch64::SUBWri:
+       case AArch64::SUBWrr:
+       case AArch64::SUBXri:
+       case AArch64::SUBXrr:
+         return true;
+       case AArch64::ADDWrs:
+       case AArch64::ADDXrs:
+       case AArch64::ANDWrs:
+       case AArch64::ANDXrs:
+       case AArch64::SUBWrs:
+       case AArch64::SUBXrs:
+       case AArch64::BICWrs:
+       case AArch64::BICXrs:
+         // Shift value can be 0 making these behave like the "rr" variant...
+         // As above, the shift to examine is the one on First.
+         return !hasShiftedReg(First);
+       }
+     }
+   }
+   return false;
+ }
+
+ /// Build a DBG_VALUE instruction describing a variable that lives in a stack
+ /// slot. The operands are, in order: the frame index, an immediate 0, the
+ /// offset, the variable metadata and the expression metadata.
+ MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
+     MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
+     const MDNode *Expr, const DebugLoc &DL) const {
+   MachineInstrBuilder Builder = BuildMI(MF, DL, get(AArch64::DBG_VALUE));
+   Builder.addFrameIndex(FrameIx);
+   Builder.addImm(0);
+   Builder.addImm(Offset);
+   Builder.addMetadata(Var);
+   Builder.addMetadata(Expr);
+   return &*Builder;
+ }
+
+ /// Append a register operand for sub-register \p SubIdx of \p Reg to \p MIB
+ /// with flags \p State, and return the builder.
+ static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
+                                             unsigned Reg, unsigned SubIdx,
+                                             unsigned State,
+                                             const TargetRegisterInfo *TRI) {
+   // No sub-register index: add the register itself.
+   if (SubIdx == 0)
+     return MIB.addReg(Reg, State);
+
+   // A physical register is replaced by the sub-register TRI reports for it;
+   // any other register keeps its number and carries the index on the
+   // operand.
+   const bool IsPhysical = TargetRegisterInfo::isPhysicalRegister(Reg);
+   return IsPhysical ? MIB.addReg(TRI->getSubReg(Reg, SubIdx), State)
+                     : MIB.addReg(Reg, State, SubIdx);
+ }
+
+ /// Return true when copying a tuple of \p NumRegs registers element by
+ /// element in ascending order would overwrite a source element before it
+ /// has been read.
+ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
+                                         unsigned NumRegs) {
+   // Distance from source to destination as a non-negative value modulo 32;
+   // masking with 0x1f yields the positive remainder directly.
+   const unsigned Distance = (DestReg - SrcReg) & 0x1f;
+   return Distance < NumRegs;
+ }
+
+ /// Copy a register tuple one sub-register at a time.
+ ///
+ /// One \p Opcode instruction is emitted before \p I for every entry of
+ /// \p Indices; each takes the destination sub-register as its definition and
+ /// the matching source sub-register twice as its inputs (the second use
+ /// carries the kill flag when \p KillSrc is set).
+ void AArch64InstrInfo::copyPhysRegTuple(
+     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
+     unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode,
+     llvm::ArrayRef<unsigned> Indices) const {
+   assert(Subtarget.hasNEON() &&
+          "Unexpected register copy without NEON");
+   const TargetRegisterInfo *TRI = &getRegisterInfo();
+   const uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
+   const uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
+   const unsigned NumRegs = Indices.size();
+
+   // If an ascending copy would overwrite source elements that are still
+   // needed, walk the tuple from the last element down to the first.
+   const bool Descending =
+       forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs);
+
+   for (unsigned Step = 0; Step != NumRegs; ++Step) {
+     const unsigned Elt = Descending ? NumRegs - 1 - Step : Step;
+     const unsigned SubIdx = Indices[Elt];
+     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
+     AddSubReg(MIB, DestReg, SubIdx, RegState::Define, TRI);
+     AddSubReg(MIB, SrcReg, SubIdx, 0, TRI);
+     AddSubReg(MIB, SrcReg, SubIdx, getKillRegState(KillSrc), TRI);
+   }
+ }
+
+ /// Emit, before \p I, the instruction(s) that copy physical register
+ /// \p SrcReg into \p DestReg.
+ ///
+ /// The sequence is chosen from the register classes of the two operands:
+ ///  - GPR32sp/GPR64sp: ADD #0 when a stack pointer is involved, MOVZ #0 for
+ ///    a zero-register source on subtargets with zero-cycle zeroing, and
+ ///    ORR with the zero register otherwise;
+ ///  - D/Q register tuples: one ORR per sub-register via copyPhysRegTuple;
+ ///  - FPR128/64/32/16/8: a vector ORR when NEON is available, otherwise
+ ///    FMOV (or a store/reload through the stack for FPR128);
+ ///  - GPR <-> FPR of equal width: FMOV;
+ ///  - NZCV: MSR/MRS with a GPR64 operand.
+ ///
+ /// \p KillSrc marks the final use of \p SrcReg as a kill. Any other
+ /// combination is unreachable.
+ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    const DebugLoc &DL, unsigned DestReg,
+                                    unsigned SrcReg, bool KillSrc) const {
+   if (AArch64::GPR32spRegClass.contains(DestReg) &&
+       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
+     const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
+       // If either operand is WSP, expand to ADD #0.
+       if (Subtarget.hasZeroCycleRegMove()) {
+         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
+         unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+                                                      &AArch64::GPR64spRegClass);
+         unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+                                                     &AArch64::GPR64spRegClass);
+         // This instruction is reading and writing X registers. This may upset
+         // the register scavenger and machine verifier, so we need to indicate
+         // that we are reading an undefined value from SrcRegX, but a proper
+         // value from SrcReg.
+         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
+             .addReg(SrcRegX, RegState::Undef)
+             .addImm(0)
+             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+       } else {
+         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
+             .addReg(SrcReg, getKillRegState(KillSrc))
+             .addImm(0)
+             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+       }
+     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) {
+       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm(
+           AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+     } else {
+       if (Subtarget.hasZeroCycleRegMove()) {
+         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
+         unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+                                                      &AArch64::GPR64spRegClass);
+         unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+                                                     &AArch64::GPR64spRegClass);
+         // This instruction is reading and writing X registers. This may upset
+         // the register scavenger and machine verifier, so we need to indicate
+         // that we are reading an undefined value from SrcRegX, but a proper
+         // value from SrcReg.
+         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
+             .addReg(AArch64::XZR)
+             .addReg(SrcRegX, RegState::Undef)
+             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+       } else {
+         // Otherwise, expand to ORR WZR.
+         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
+             .addReg(AArch64::WZR)
+             .addReg(SrcReg, getKillRegState(KillSrc));
+       }
+     }
+     return;
+   }
+
+   if (AArch64::GPR64spRegClass.contains(DestReg) &&
+       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
+     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
+       // If either operand is SP, expand to ADD #0.
+       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
+           .addReg(SrcReg, getKillRegState(KillSrc))
+           .addImm(0)
+           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) {
+       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm(
+           AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+     } else {
+       // Otherwise, expand to ORR XZR.
+       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
+           .addReg(AArch64::XZR)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     }
+     return;
+   }
+
+   // Copy a DDDD register quad by copying the individual sub-registers.
+   if (AArch64::DDDDRegClass.contains(DestReg) &&
+       AArch64::DDDDRegClass.contains(SrcReg)) {
+     static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+                                         AArch64::dsub2, AArch64::dsub3 };
+     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                      Indices);
+     return;
+   }
+
+   // Copy a DDD register triple by copying the individual sub-registers.
+   if (AArch64::DDDRegClass.contains(DestReg) &&
+       AArch64::DDDRegClass.contains(SrcReg)) {
+     static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+                                         AArch64::dsub2 };
+     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                      Indices);
+     return;
+   }
+
+   // Copy a DD register pair by copying the individual sub-registers.
+   if (AArch64::DDRegClass.contains(DestReg) &&
+       AArch64::DDRegClass.contains(SrcReg)) {
+     static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 };
+     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                      Indices);
+     return;
+   }
+
+   // Copy a QQQQ register quad by copying the individual sub-registers.
+   if (AArch64::QQQQRegClass.contains(DestReg) &&
+       AArch64::QQQQRegClass.contains(SrcReg)) {
+     static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+                                         AArch64::qsub2, AArch64::qsub3 };
+     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                      Indices);
+     return;
+   }
+
+   // Copy a QQQ register triple by copying the individual sub-registers.
+   if (AArch64::QQQRegClass.contains(DestReg) &&
+       AArch64::QQQRegClass.contains(SrcReg)) {
+     static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+                                         AArch64::qsub2 };
+     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                      Indices);
+     return;
+   }
+
+   // Copy a QQ register pair by copying the individual sub-registers.
+   if (AArch64::QQRegClass.contains(DestReg) &&
+       AArch64::QQRegClass.contains(SrcReg)) {
+     static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 };
+     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                      Indices);
+     return;
+   }
+
+   if (AArch64::FPR128RegClass.contains(DestReg) &&
+       AArch64::FPR128RegClass.contains(SrcReg)) {
+     if(Subtarget.hasNEON()) {
+       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+           .addReg(SrcReg)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     } else {
+       // Without NEON, bounce the value through the stack: push it with a
+       // pre-decrementing store, then pop it back into the destination.
+       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
+           .addReg(AArch64::SP, RegState::Define)
+           .addReg(SrcReg, getKillRegState(KillSrc))
+           .addReg(AArch64::SP)
+           .addImm(-16);
+       // The value now sits at the current SP, so the reload has to read
+       // from SP first and adjust it afterwards (post-index). A pre-indexed
+       // load with +16 would read the slot above the one just written.
+       BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
+           .addReg(AArch64::SP, RegState::Define)
+           .addReg(DestReg, RegState::Define)
+           .addReg(AArch64::SP)
+           .addImm(16);
+     }
+     return;
+   }
+
+   if (AArch64::FPR64RegClass.contains(DestReg) &&
+       AArch64::FPR64RegClass.contains(SrcReg)) {
+     if(Subtarget.hasNEON()) {
+       // With NEON, copy the enclosing FPR128 registers with a vector ORR.
+       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
+                                        &AArch64::FPR128RegClass);
+       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
+                                       &AArch64::FPR128RegClass);
+       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+           .addReg(SrcReg)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     } else {
+       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     }
+     return;
+   }
+
+   if (AArch64::FPR32RegClass.contains(DestReg) &&
+       AArch64::FPR32RegClass.contains(SrcReg)) {
+     if(Subtarget.hasNEON()) {
+       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
+                                        &AArch64::FPR128RegClass);
+       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
+                                       &AArch64::FPR128RegClass);
+       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+           .addReg(SrcReg)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     } else {
+       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     }
+     return;
+   }
+
+   if (AArch64::FPR16RegClass.contains(DestReg) &&
+       AArch64::FPR16RegClass.contains(SrcReg)) {
+     if(Subtarget.hasNEON()) {
+       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                        &AArch64::FPR128RegClass);
+       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                       &AArch64::FPR128RegClass);
+       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+           .addReg(SrcReg)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     } else {
+       // Without NEON, copy the enclosing FPR32 registers with FMOVSr.
+       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                        &AArch64::FPR32RegClass);
+       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                       &AArch64::FPR32RegClass);
+       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     }
+     return;
+   }
+
+   if (AArch64::FPR8RegClass.contains(DestReg) &&
+       AArch64::FPR8RegClass.contains(SrcReg)) {
+     if(Subtarget.hasNEON()) {
+       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                        &AArch64::FPR128RegClass);
+       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                       &AArch64::FPR128RegClass);
+       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+           .addReg(SrcReg)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     } else {
+       // Without NEON, copy the enclosing FPR32 registers with FMOVSr.
+       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                        &AArch64::FPR32RegClass);
+       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                       &AArch64::FPR32RegClass);
+       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+           .addReg(SrcReg, getKillRegState(KillSrc));
+     }
+     return;
+   }
+
+   // Copies between GPR64 and FPR64.
+   if (AArch64::FPR64RegClass.contains(DestReg) &&
+       AArch64::GPR64RegClass.contains(SrcReg)) {
+     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+         .addReg(SrcReg, getKillRegState(KillSrc));
+     return;
+   }
+   if (AArch64::GPR64RegClass.contains(DestReg) &&
+       AArch64::FPR64RegClass.contains(SrcReg)) {
+     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
+         .addReg(SrcReg, getKillRegState(KillSrc));
+     return;
+   }
+   // Copies between GPR32 and FPR32.
+   if (AArch64::FPR32RegClass.contains(DestReg) &&
+       AArch64::GPR32RegClass.contains(SrcReg)) {
+     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+         .addReg(SrcReg, getKillRegState(KillSrc));
+     return;
+   }
+   if (AArch64::GPR32RegClass.contains(DestReg) &&
+       AArch64::FPR32RegClass.contains(SrcReg)) {
+     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
+         .addReg(SrcReg, getKillRegState(KillSrc));
+     return;
+   }
+
+   // Writing the condition flags from a general-purpose register.
+   if (DestReg == AArch64::NZCV) {
+     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
+     BuildMI(MBB, I, DL, get(AArch64::MSR))
+         .addImm(AArch64SysReg::NZCV)
+         .addReg(SrcReg, getKillRegState(KillSrc))
+         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
+     return;
+   }
+
+   // Reading the condition flags into a general-purpose register.
+   if (SrcReg == AArch64::NZCV) {
+     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
+     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
+         .addImm(AArch64SysReg::NZCV)
+         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
+     return;
+   }
+
+   llvm_unreachable("unimplemented reg-to-reg copy");
+ }
+
+ // Emit, before MBBI, a store of SrcReg to the stack slot with frame index FI.
+ //
+ // The store opcode is picked from the size of RC and from which AArch64
+ // register class RC is a sub-class of. Scalar classes use the STR*ui forms,
+ // which get an immediate operand of 0 appended; D/Q register tuples use ST1
+ // forms, which get no immediate operand. isKill marks the use of SrcReg as a
+ // kill. The TRI parameter is not used in this body.
+ void AArch64InstrInfo::storeRegToStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ bool isKill, int FI, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ // Take the debug location from the instruction at the insertion point, if
+ // there is one.
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned Align = MFI.getObjectAlignment(FI);
+
+ // Memory operand describing a store covering the whole stack object.
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
+ // Opc stays 0 if no case below matches; Offset records whether the chosen
+ // opcode takes an immediate offset operand.
+ unsigned Opc = 0;
+ bool Offset = true;
+ switch (RC->getSize()) {
+ case 1:
+ if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRBui;
+ break;
+ case 2:
+ if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRHui;
+ break;
+ case 4:
+ if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::STRWui;
+ // A virtual source is constrained to GPR32; a physical one must not be
+ // WSP.
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+ else
+ assert(SrcReg != AArch64::WSP);
+ } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRSui;
+ break;
+ case 8:
+ if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::STRXui;
+ // A virtual source is constrained to GPR64; a physical one must not be
+ // SP.
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+ else
+ assert(SrcReg != AArch64::SP);
+ } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRDui;
+ break;
+ case 16:
+ if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRQui;
+ else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Twov1d;
+ Offset = false;
+ }
+ break;
+ case 24:
+ if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Threev1d;
+ Offset = false;
+ }
+ break;
+ case 32:
+ if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Fourv1d;
+ Offset = false;
+ } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Twov2d;
+ Offset = false;
+ }
+ break;
+ case 48:
+ if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Threev2d;
+ Offset = false;
+ }
+ break;
+ case 64:
+ if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Fourv2d;
+ Offset = false;
+ }
+ break;
+ }
+ assert(Opc && "Unknown register class");
+
+ // Operands: stored register, frame index, optional zero immediate, then the
+ // memory operand.
+ const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI);
+
+ if (Offset)
+ MI.addImm(0);
+ MI.addMemOperand(MMO);
+ }
+
+ // Emit, before MBBI, a load of DestReg from the stack slot with frame index
+ // FI.
+ //
+ // The load opcode is picked from the size of RC and from which AArch64
+ // register class RC is a sub-class of. Scalar classes use the LDR*ui forms,
+ // which get an immediate operand of 0 appended; D/Q register tuples use LD1
+ // forms, which get no immediate operand. The TRI parameter is not used in
+ // this body.
+ void AArch64InstrInfo::loadRegFromStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ int FI, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ // Take the debug location from the instruction at the insertion point, if
+ // there is one.
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned Align = MFI.getObjectAlignment(FI);
+ // Memory operand describing a load covering the whole stack object.
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
+
+ // Opc stays 0 if no case below matches; Offset records whether the chosen
+ // opcode takes an immediate offset operand.
+ unsigned Opc = 0;
+ bool Offset = true;
+ switch (RC->getSize()) {
+ case 1:
+ if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRBui;
+ break;
+ case 2:
+ if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRHui;
+ break;
+ case 4:
+ if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::LDRWui;
+ // A virtual destination is constrained to GPR32; a physical one must not
+ // be WSP.
+ if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
+ else
+ assert(DestReg != AArch64::WSP);
+ } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRSui;
+ break;
+ case 8:
+ if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::LDRXui;
+ // A virtual destination is constrained to GPR64; a physical one must not
+ // be SP.
+ if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
+ else
+ assert(DestReg != AArch64::SP);
+ } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRDui;
+ break;
+ case 16:
+ if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRQui;
+ else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Twov1d;
+ Offset = false;
+ }
+ break;
+ case 24:
+ if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Threev1d;
+ Offset = false;
+ }
+ break;
+ case 32:
+ if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Fourv1d;
+ Offset = false;
+ } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Twov2d;
+ Offset = false;
+ }
+ break;
+ case 48:
+ if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Threev2d;
+ Offset = false;
+ }
+ break;
+ case 64:
+ if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Fourv2d;
+ Offset = false;
+ }
+ break;
+ }
+ assert(Opc && "Unknown register class");
+
+ // Operands: defined register, frame index, optional zero immediate, then the
+ // memory operand.
+ const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc))
+ .addReg(DestReg, getDefRegState(true))
+ .addFrameIndex(FI);
+ if (Offset)
+ MI.addImm(0);
+ MI.addMemOperand(MMO);
+ }
+
+ /// Emit, before \p MBBI, the add/sub instruction(s) computing
+ /// DestReg = SrcReg + Offset.
+ ///
+ /// The magnitude of \p Offset is split into chunks that fit a 12-bit
+ /// immediate shifted left by 12, followed by one final instruction for what
+ /// is left below bit 12. Every emitted instruction is tagged with \p Flag;
+ /// \p SetNZCV selects the flag-setting opcodes. Nothing is emitted when the
+ /// registers are equal and the offset is zero.
+ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+                            unsigned DestReg, unsigned SrcReg, int Offset,
+                            const TargetInstrInfo *TII,
+                            MachineInstr::MIFlag Flag, bool SetNZCV) {
+   // A zero adjustment onto the same register is a no-op.
+   if (DestReg == SrcReg && Offset == 0)
+     return;
+
+   assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
+          "SP increment/decrement not 16-byte aligned");
+
+   // Work with the magnitude of the offset and remember its direction.
+   const bool Subtract = Offset < 0;
+   if (Subtract)
+     Offset = -Offset;
+
+   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
+   // scratch register. If DestReg is a virtual register, use it as the
+   // scratch register; otherwise, create a new virtual register (to be
+   // replaced by the scavenger at the end of PEI). That case can be optimized
+   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
+   // register can be loaded with offset%8 and the add/sub can use an extending
+   // instruction with LSL#3.
+   // Currently the function handles any offsets but generates a poor sequence
+   // of code.
+   // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
+
+   const unsigned Opc =
+       SetNZCV ? (Subtract ? AArch64::SUBSXri : AArch64::ADDSXri)
+               : (Subtract ? AArch64::SUBXri : AArch64::ADDXri);
+
+   const unsigned MaxEncoding = 0xfff;
+   const unsigned ShiftSize = 12;
+   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
+
+   // Peel off the part of the offset that needs the shifted immediate form.
+   while (((unsigned)Offset) >= (1 << ShiftSize)) {
+     // Take as much as one shifted immediate can hold: either the largest
+     // encodable value, or exactly the bits of Offset in that field.
+     const unsigned Chunk = (((unsigned)Offset) > MaxEncodableValue)
+                                ? MaxEncodableValue
+                                : (Offset & MaxEncodableValue);
+     assert((Chunk >> ShiftSize) <= MaxEncoding &&
+            "Encoding cannot handle value that big");
+     BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+         .addReg(SrcReg)
+         .addImm(Chunk >> ShiftSize)
+         .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
+         .setMIFlag(Flag);
+
+     // Later instructions continue from the partial result.
+     SrcReg = DestReg;
+     Offset -= Chunk;
+     if (Offset == 0)
+       return;
+   }
+
+   // Whatever remains fits an unshifted 12-bit immediate.
+   BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+       .addReg(SrcReg)
+       .addImm(Offset)
+       .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+       .setMIFlag(Flag);
+ }
+
+ // Try to fold a stack-slot access into MI for the operands listed in Ops.
+ //
+ // Only COPY instructions are handled here:
+ //  - a COPY to or from SP with a virtual register on the other side is never
+ //    folded; the virtual register is constrained to GPR64 instead;
+ //  - a full COPY whose source and destination register classes have the same
+ //    size is replaced by a direct store of the source (when operand 0 is
+ //    being folded) or a direct load of the destination (when operand 1 is).
+ //
+ // Returns the newly emitted spill/fill instruction, or nullptr when nothing
+ // was folded. The LIS parameter is not used in this body.
+ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS) const {
+ // This is a bit of a hack. Consider this instruction:
+ //
+ // %vreg0<def> = COPY %SP; GPR64all:%vreg0
+ //
+ // We explicitly chose GPR64all for the virtual register so such a copy might
+ // be eliminated by RegisterCoalescer. However, that may not be possible, and
+ // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all
+ // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
+ //
+ // To prevent that, we are going to constrain the %vreg0 register class here.
+ //
+ // <rdar://problem/11522048>
+ //
+ if (MI.isCopy()) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (SrcReg == AArch64::SP &&
+ TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
+ return nullptr;
+ }
+ if (DstReg == AArch64::SP &&
+ TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+ return nullptr;
+ }
+ }
+
+ // Handle the case where a copy is being spilled or refilled but the source
+ // and destination register class don't match. For example:
+ //
+ // %vreg0<def> = COPY %XZR; GPR64common:%vreg0
+ //
+ // In this case we can still safely fold away the COPY and generate the
+ // following spill code:
+ //
+ // STRXui %XZR, <fi#0>
+ //
+ // This also eliminates spilled cross register class COPYs (e.g. between x and
+ // d regs) of the same size. For example:
+ //
+ // %vreg0<def> = COPY %vreg1; GPR64:%vreg0, FPR64:%vreg1
+ //
+ // will be refilled as
+ //
+ // LDRDui %vreg0, fi<#0>
+ //
+ // instead of
+ //
+ // LDRXui %vregTemp, fi<#0>
+ // %vreg0 = FMOV %vregTemp
+ //
+ if (MI.isFullCopy() && Ops.size() == 1 &&
+ // Make sure we're only folding the explicit COPY defs/uses.
+ (Ops[0] == 0 || Ops[0] == 1)) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineBasicBlock &MBB = *MI.getParent();
+ const MachineOperand &DstMO = MI.getOperand(0);
+ const MachineOperand &SrcMO = MI.getOperand(1);
+ unsigned DstReg = DstMO.getReg();
+ unsigned SrcReg = SrcMO.getReg();
+ // Register class of Reg: the assigned class for a virtual register, the
+ // minimal physical register class otherwise.
+ auto getRegClass = [&](unsigned Reg) {
+ return TargetRegisterInfo::isVirtualRegister(Reg)
+ ? MRI.getRegClass(Reg)
+ : TRI.getMinimalPhysRegClass(Reg);
+ };
+ const TargetRegisterClass &DstRC = *getRegClass(DstReg);
+ const TargetRegisterClass &SrcRC = *getRegClass(SrcReg);
+ if (DstRC.getSize() == SrcRC.getSize()) {
+ // Folding operand 0 (the def) stores the source using the source's
+ // register class; folding operand 1 (the use) loads the destination using
+ // the destination's register class.
+ if (Ops[0] == 0)
+ storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
+ &SrcRC, &TRI);
+ else
+ loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, &DstRC, &TRI);
+ // The new instruction was inserted just before InsertPt.
+ return &*--InsertPt;
+ }
+ }
+
+ // Cannot fold.
+ return nullptr;
+ }
+
+ // Work out how much of a frame offset can be folded into the immediate
+ // operand of load/store instruction MI.
+ //
+ // Offset is in/out: on entry it is the byte offset to add to MI's existing
+ // immediate; on return it is the part, in bytes, that could not be encoded
+ // (0 when everything fits).
+ //
+ // Optional outputs (each may be null):
+ //  - OutUseUnscaledOp: true when the unscaled opcode has to be used;
+ //  - OutUnscaledOp: the unscaled counterpart of MI's opcode, or 0 if none;
+ //  - EmittableOffset: the immediate value to place in the instruction.
+ //
+ // Returns AArch64FrameOffsetCannotUpdate for the LD1/ST1 forms listed below.
+ // Otherwise returns AArch64FrameOffsetCanUpdate, OR'ed with
+ // AArch64FrameOffsetIsLegal when no residual offset remains.
+ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+ bool *OutUseUnscaledOp,
+ unsigned *OutUnscaledOp,
+ int *EmittableOffset) {
+ int Scale = 1;
+ bool IsSigned = false;
+ // The ImmIdx should be changed case by case if it is not 2.
+ unsigned ImmIdx = 2;
+ unsigned UnscaledOp = 0;
+ // Set output values in case of early exit.
+ if (EmittableOffset)
+ *EmittableOffset = 0;
+ if (OutUseUnscaledOp)
+ *OutUseUnscaledOp = false;
+ if (OutUnscaledOp)
+ *OutUnscaledOp = 0;
+ // Per opcode: the factor by which the immediate is scaled, the unscaled
+ // counterpart (if any) and, for pair instructions, the immediate's operand
+ // index and signedness.
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex");
+ // Vector spills/fills can't take an immediate offset.
+ case AArch64::LD1Twov2d:
+ case AArch64::LD1Threev2d:
+ case AArch64::LD1Fourv2d:
+ case AArch64::LD1Twov1d:
+ case AArch64::LD1Threev1d:
+ case AArch64::LD1Fourv1d:
+ case AArch64::ST1Twov2d:
+ case AArch64::ST1Threev2d:
+ case AArch64::ST1Fourv2d:
+ case AArch64::ST1Twov1d:
+ case AArch64::ST1Threev1d:
+ case AArch64::ST1Fourv1d:
+ return AArch64FrameOffsetCannotUpdate;
+ case AArch64::PRFMui:
+ Scale = 8;
+ UnscaledOp = AArch64::PRFUMi;
+ break;
+ case AArch64::LDRXui:
+ Scale = 8;
+ UnscaledOp = AArch64::LDURXi;
+ break;
+ case AArch64::LDRWui:
+ Scale = 4;
+ UnscaledOp = AArch64::LDURWi;
+ break;
+ case AArch64::LDRBui:
+ Scale = 1;
+ UnscaledOp = AArch64::LDURBi;
+ break;
+ case AArch64::LDRHui:
+ Scale = 2;
+ UnscaledOp = AArch64::LDURHi;
+ break;
+ case AArch64::LDRSui:
+ Scale = 4;
+ UnscaledOp = AArch64::LDURSi;
+ break;
+ case AArch64::LDRDui:
+ Scale = 8;
+ UnscaledOp = AArch64::LDURDi;
+ break;
+ case AArch64::LDRQui:
+ Scale = 16;
+ UnscaledOp = AArch64::LDURQi;
+ break;
+ case AArch64::LDRBBui:
+ Scale = 1;
+ UnscaledOp = AArch64::LDURBBi;
+ break;
+ case AArch64::LDRHHui:
+ Scale = 2;
+ UnscaledOp = AArch64::LDURHHi;
+ break;
+ case AArch64::LDRSBXui:
+ Scale = 1;
+ UnscaledOp = AArch64::LDURSBXi;
+ break;
+ case AArch64::LDRSBWui:
+ Scale = 1;
+ UnscaledOp = AArch64::LDURSBWi;
+ break;
+ case AArch64::LDRSHXui:
+ Scale = 2;
+ UnscaledOp = AArch64::LDURSHXi;
+ break;
+ case AArch64::LDRSHWui:
+ Scale = 2;
+ UnscaledOp = AArch64::LDURSHWi;
+ break;
+ case AArch64::LDRSWui:
+ Scale = 4;
+ UnscaledOp = AArch64::LDURSWi;
+ break;
+
+ case AArch64::STRXui:
+ Scale = 8;
+ UnscaledOp = AArch64::STURXi;
+ break;
+ case AArch64::STRWui:
+ Scale = 4;
+ UnscaledOp = AArch64::STURWi;
+ break;
+ case AArch64::STRBui:
+ Scale = 1;
+ UnscaledOp = AArch64::STURBi;
+ break;
+ case AArch64::STRHui:
+ Scale = 2;
+ UnscaledOp = AArch64::STURHi;
+ break;
+ case AArch64::STRSui:
+ Scale = 4;
+ UnscaledOp = AArch64::STURSi;
+ break;
+ case AArch64::STRDui:
+ Scale = 8;
+ UnscaledOp = AArch64::STURDi;
+ break;
+ case AArch64::STRQui:
+ Scale = 16;
+ UnscaledOp = AArch64::STURQi;
+ break;
+ case AArch64::STRBBui:
+ Scale = 1;
+ UnscaledOp = AArch64::STURBBi;
+ break;
+ case AArch64::STRHHui:
+ Scale = 2;
+ UnscaledOp = AArch64::STURHHi;
+ break;
+
+ // Pair instructions: the immediate is operand 3 and is signed; no unscaled
+ // counterpart is recorded for them.
+ case AArch64::LDPXi:
+ case AArch64::LDPDi:
+ case AArch64::STPXi:
+ case AArch64::STPDi:
+ case AArch64::LDNPXi:
+ case AArch64::LDNPDi:
+ case AArch64::STNPXi:
+ case AArch64::STNPDi:
+ ImmIdx = 3;
+ IsSigned = true;
+ Scale = 8;
+ break;
+ case AArch64::LDPQi:
+ case AArch64::STPQi:
+ case AArch64::LDNPQi:
+ case AArch64::STNPQi:
+ ImmIdx = 3;
+ IsSigned = true;
+ Scale = 16;
+ break;
+ case AArch64::LDPWi:
+ case AArch64::LDPSi:
+ case AArch64::STPWi:
+ case AArch64::STPSi:
+ case AArch64::LDNPWi:
+ case AArch64::LDNPSi:
+ case AArch64::STNPWi:
+ case AArch64::STNPSi:
+ ImmIdx = 3;
+ IsSigned = true;
+ Scale = 4;
+ break;
+
+ // Instructions that are already unscaled: byte offsets, UnscaledOp stays 0.
+ case AArch64::LDURXi:
+ case AArch64::LDURWi:
+ case AArch64::LDURBi:
+ case AArch64::LDURHi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURSWi:
+ case AArch64::STURXi:
+ case AArch64::STURWi:
+ case AArch64::STURBi:
+ case AArch64::STURHi:
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURBBi:
+ case AArch64::STURHHi:
+ Scale = 1;
+ break;
+ }
+
+ // Total byte offset: the requested offset plus the instruction's current
+ // immediate converted to bytes.
+ Offset += MI.getOperand(ImmIdx).getImm() * Scale;
+
+ bool useUnscaledOp = false;
+ // If the offset doesn't match the scale, we rewrite the instruction to
+ // use the unscaled instruction instead. Likewise, if we have a negative
+ // offset (and have an unscaled op to use).
+ if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
+ useUnscaledOp = true;
+
+ // Use an unscaled addressing mode if the instruction has a negative offset
+ // (or if the instruction is already using an unscaled addressing mode).
+ unsigned MaskBits;
+ if (IsSigned) {
+ // ldp/stp instructions.
+ MaskBits = 7;
+ Offset /= Scale;
+ } else if (UnscaledOp == 0 || useUnscaledOp) {
+ // Unscaled form: 9-bit signed byte offset.
+ MaskBits = 9;
+ IsSigned = true;
+ Scale = 1;
+ } else {
+ // Scaled form: 12-bit unsigned offset in units of Scale.
+ MaskBits = 12;
+ IsSigned = false;
+ Offset /= Scale;
+ }
+
+ // Attempt to fold address computation.
+ int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
+ int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
+ if (Offset >= MinOff && Offset <= MaxOff) {
+ // Everything fits in the immediate field.
+ if (EmittableOffset)
+ *EmittableOffset = Offset;
+ Offset = 0;
+ } else {
+ // Encode the nearest representable bound and hand the remainder, converted
+ // back to bytes, to the caller.
+ int NewOff = Offset < 0 ? MinOff : MaxOff;
+ if (EmittableOffset)
+ *EmittableOffset = NewOff;
+ Offset = (Offset - NewOff) * Scale;
+ }
+ if (OutUseUnscaledOp)
+ *OutUseUnscaledOp = useUnscaledOp;
+ if (OutUnscaledOp)
+ *OutUnscaledOp = UnscaledOp;
+ return AArch64FrameOffsetCanUpdate |
+ (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
+ }
+
+ /// Rewrite the frame-index operand of \p MI (at \p FrameRegIdx) in terms of
+ /// \p FrameReg plus \p Offset.
+ ///
+ /// ADDXri/ADDSXri are replaced wholesale by an emitFrameOffset sequence.
+ /// For other instructions as much of the offset as possible is folded into
+ /// the immediate operand that follows the frame index; \p Offset is updated
+ /// to the part that could not be folded. Returns true when nothing is left
+ /// over.
+ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                     unsigned FrameReg, int &Offset,
+                                     const AArch64InstrInfo *TII) {
+   const unsigned Opcode = MI.getOpcode();
+   const unsigned ImmIdx = FrameRegIdx + 1;
+
+   // Plain address computations: re-materialise the whole add and drop the
+   // original instruction.
+   const bool IsFlagSettingAdd = Opcode == AArch64::ADDSXri;
+   if (IsFlagSettingAdd || Opcode == AArch64::ADDXri) {
+     Offset += MI.getOperand(ImmIdx).getImm();
+     MachineBasicBlock &MBB = *MI.getParent();
+     emitFrameOffset(MBB, MI, MI.getDebugLoc(), MI.getOperand(0).getReg(),
+                     FrameReg, Offset, TII, MachineInstr::NoFlags,
+                     IsFlagSettingAdd);
+     MI.eraseFromParent();
+     Offset = 0;
+     return true;
+   }
+
+   int NewOffset;
+   unsigned UnscaledOp;
+   bool UseUnscaledOp;
+   const int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
+                                                &UnscaledOp, &NewOffset);
+   if (!(Status & AArch64FrameOffsetCanUpdate))
+     return false;
+
+   // Only when the complete offset was folded is the frame index replaced by
+   // the frame register.
+   if (Status & AArch64FrameOffsetIsLegal)
+     MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+   // Switch to the unscaled opcode when the legality check asked for it.
+   if (UseUnscaledOp)
+     MI.setDesc(TII->get(UnscaledOp));
+
+   MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
+   return Offset == 0;
+ }
+
+/// Fill \p NopInst with the no-op used for Mach-O targets: a HINT instruction
+/// carrying the immediate operand 0.
+void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  const MCOperand HintImm = MCOperand::createImm(0);
+  NopInst.setOpcode(AArch64::HINT);
+  NopInst.addOperand(HintImm);
+}
+
+/// AArch64 supports the MachineCombiner, so always answer yes.
+bool AArch64InstrInfo::useMachineCombiner() const { return true; }
+/// Return true when \p Opc is one of the flag-setting add/subtract opcodes
+/// (ADDS* / SUBS*, register-register or register-immediate, 32 or 64 bit).
+static bool isCombineInstrSettingFlag(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSXrr:
+  // Note: MSUB Wd,Wn,Wm,Wi computes Wd = Wi - Wn*Wm, not Wd = Wn*Wm - Wi.
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXri:
+    return true;
+  }
+}
+/// Return true when \p Opc is a 32-bit add/subtract opcode (plain or
+/// flag-setting) that can be combined with a MUL.
+static bool isCombineInstrCandidate32(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::ADDWrr:
+  case AArch64::ADDWri:
+  case AArch64::SUBWrr:
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  case AArch64::SUBSWrr:
+  // Note: MSUB Wd,Wn,Wm,Wi computes Wd = Wi - Wn*Wm, not Wd = Wn*Wm - Wi.
+  case AArch64::SUBWri:
+  case AArch64::SUBSWri:
+    return true;
+  }
+}
+/// Return true when \p Opc is a 64-bit add/subtract opcode (plain or
+/// flag-setting) that can be combined with a MUL.
+static bool isCombineInstrCandidate64(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::ADDXrr:
+  case AArch64::ADDXri:
+  case AArch64::SUBXrr:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  case AArch64::SUBSXrr:
+  // Note: MSUB Wd,Wn,Wm,Wi computes Wd = Wi - Wn*Wm, not Wd = Wn*Wm - Wi.
+  case AArch64::SUBXri:
+  case AArch64::SUBSXri:
+    return true;
+  }
+}
+/// Return true when \p Inst is a scalar or vector FADD/FSUB that may be
+/// combined with an FMUL.
+///
+/// The opcode alone is not sufficient: the combine is only allowed when the
+/// target options of the enclosing function enable UnsafeFPMath or set
+/// AllowFPOpFusion to FPOpFusion::Fast.
+static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+  case AArch64::FADDSrr:
+  case AArch64::FADDDrr:
+  case AArch64::FADDv2f32:
+  case AArch64::FADDv2f64:
+  case AArch64::FADDv4f32:
+  case AArch64::FSUBSrr:
+  case AArch64::FSUBDrr:
+  case AArch64::FSUBv2f32:
+  case AArch64::FSUBv2f64:
+  case AArch64::FSUBv4f32: {
+    // Bind by reference: Options is a member of the target machine, so there
+    // is no need to copy the whole TargetOptions object for two flag reads.
+    const TargetOptions &Options =
+        Inst.getParent()->getParent()->getTarget().Options;
+    return Options.UnsafeFPMath ||
+           Options.AllowFPOpFusion == FPOpFusion::Fast;
+  }
+  }
+  return false;
+}
+/// Return true when \p Opc is a 32-bit or 64-bit integer opcode that can be
+/// combined with a MUL.
+static bool isCombineInstrCandidate(unsigned Opc) {
+  if (isCombineInstrCandidate32(Opc))
+    return true;
+  return isCombineInstrCandidate64(Opc);
+}
+
+/// Check whether \p MO can be folded into its user.
+///
+/// This holds when \p MO is a virtual register whose unique definition is a
+/// \p CombineOpc instruction inside \p MBB, and the value it defines has
+/// exactly one non-debug use. With \p CheckZeroReg set, operand 3 of the
+/// defining instruction must additionally be \p ZeroReg.
+static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
+                       unsigned CombineOpc, unsigned ZeroReg = 0,
+                       bool CheckZeroReg = false) {
+  if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    return false;
+
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
+  if (!MI)
+    return false;
+  // The definition has to be in the trace (otherwise, it won't have a depth).
+  if (MI->getParent() != &MBB)
+    return false;
+  if (static_cast<unsigned>(MI->getOpcode()) != CombineOpc)
+    return false;
+  // The defined value must feed only the instruction we combine with.
+  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+    return false;
+
+  if (!CheckZeroReg)
+    return true;
+
+  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+         MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+         MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
+  // The third input register must be the zero register.
+  return MI->getOperand(3).getReg() == ZeroReg;
+}
+
+/// Is \p MO defined by a combinable integer multiply? The multiply is matched
+/// as \p MulOpc whose operand 3 is \p ZeroReg.
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+                              unsigned MulOpc, unsigned ZeroReg) {
+  const bool CheckZeroReg = true;
+  return canCombine(MBB, MO, MulOpc, ZeroReg, CheckZeroReg);
+}
+
+/// Is \p MO defined by a combinable floating-point multiply \p MulOpc? No
+/// zero-register check is performed for the FP case.
+static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+                               unsigned MulOpc) {
+  return canCombine(MBB, MO, MulOpc, /*ZeroReg=*/0, /*CheckZeroReg=*/false);
+}
+
+// TODO: There are many more machine instruction opcodes to match:
+//       1. Other data types (integer, vectors)
+//       2. Other math / logic operations (xor, or)
+//       3. Other forms of the same operation (intrinsics and other variants)
+
+/// Return true when \p Inst is one of the listed FADD / FMUL / FMULX opcodes
+/// and UnsafeFPMath is enabled in the target options of its function.
+bool AArch64InstrInfo::isAssociativeAndCommutative(
+    const MachineInstr &Inst) const {
+  switch (Inst.getOpcode()) {
+  default:
+    return false;
+  case AArch64::FADDDrr:
+  case AArch64::FADDSrr:
+  case AArch64::FADDv2f32:
+  case AArch64::FADDv2f64:
+  case AArch64::FADDv4f32:
+  case AArch64::FMULDrr:
+  case AArch64::FMULSrr:
+  case AArch64::FMULX32:
+  case AArch64::FMULX64:
+  case AArch64::FMULXv2f32:
+  case AArch64::FMULXv2f64:
+  case AArch64::FMULXv4f32:
+  case AArch64::FMULv2f32:
+  case AArch64::FMULv2f64:
+  case AArch64::FMULv4f32:
+    break;
+  }
+  // The opcode qualifies; the final answer depends on the unsafe-math option.
+  const MachineFunction *MF = Inst.getParent()->getParent();
+  return MF->getTarget().Options.UnsafeFPMath;
+}
+
+/// Collect the integer multiply-add/-sub combiner patterns rooted at \p Root.
+///
+/// \p Root must be an integer ADD/SUB candidate. A flag-setting variant is
+/// only considered when its NZCV definition is dead and the opcode can be
+/// converted to the non-flag-setting form. Each operand that is fed by a
+/// combinable MADD (with the zero register as addend) appends one pattern to
+/// \p Patterns.
+///
+/// \returns true when at least one pattern was appended.
+static bool getMaddPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  unsigned Opc = Root.getOpcode();
+  if (!isCombineInstrCandidate(Opc))
+    return false;
+
+  if (isCombineInstrSettingFlag(Opc)) {
+    // When NZCV is live bail out.
+    if (Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+      return false;
+    // When opcode can't change bail out.
+    // CHECKME: do we miss any cases for opcode conversion?
+    const unsigned PlainOpc = convertFlagSettingOpcode(Root);
+    if (PlainOpc == Opc)
+      return false;
+    Opc = PlainOpc;
+  }
+
+  MachineBasicBlock &MBB = *Root.getParent();
+  bool Found = false;
+
+  // Record Pattern when operand OpIdx of Root is produced by a combinable
+  // multiply MulOpc whose addend is ZeroReg.
+  auto TryOperand = [&](unsigned OpIdx, unsigned MulOpc, unsigned ZeroReg,
+                        MachineCombinerPattern Pattern) {
+    if (!canCombineWithMUL(MBB, Root.getOperand(OpIdx), MulOpc, ZeroReg))
+      return;
+    Patterns.push_back(Pattern);
+    Found = true;
+  };
+
+  switch (Opc) {
+  default:
+    break;
+  case AArch64::ADDWrr:
+    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+           "ADDWrr does not have register operands");
+    TryOperand(1, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MULADDW_OP1);
+    TryOperand(2, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MULADDW_OP2);
+    break;
+  case AArch64::ADDXrr:
+    TryOperand(1, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MULADDX_OP1);
+    TryOperand(2, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MULADDX_OP2);
+    break;
+  case AArch64::SUBWrr:
+    TryOperand(1, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MULSUBW_OP1);
+    TryOperand(2, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MULSUBW_OP2);
+    break;
+  case AArch64::SUBXrr:
+    TryOperand(1, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MULSUBX_OP1);
+    TryOperand(2, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MULSUBX_OP2);
+    break;
+  // Immediate forms: only operand 1 can be the multiply result.
+  case AArch64::ADDWri:
+    TryOperand(1, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MULADDWI_OP1);
+    break;
+  case AArch64::ADDXri:
+    TryOperand(1, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MULADDXI_OP1);
+    break;
+  case AArch64::SUBWri:
+    TryOperand(1, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MULSUBWI_OP1);
+    break;
+  case AArch64::SUBXri:
+    TryOperand(1, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MULSUBXI_OP1);
+    break;
+  }
+  return Found;
+}
+/// Floating-Point Support
+
+/// Collect the floating-point fused multiply-add/-sub combiner patterns
+/// rooted at \p Root.
+///
+/// \p Root must pass isCombineInstrCandidateFP(). For each inspected operand
+/// the candidate multiplies are tried in a fixed order and the first one that
+/// can be combined appends its pattern to \p Patterns; the alternative for
+/// the same operand is then not tried.
+///
+/// \returns true when at least one pattern was appended.
+static bool getFMAPatterns(MachineInstr &Root,
+                           SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  if (!isCombineInstrCandidateFP(Root))
+    return false;
+
+  MachineBasicBlock &MBB = *Root.getParent();
+  bool Found = false;
+
+  // Append Pattern when operand OpIdx of Root is produced by a combinable
+  // MulOpc. Returns whether the pattern matched so that callers can skip the
+  // alternative multiply form for the same operand.
+  auto Match = [&](unsigned MulOpc, unsigned OpIdx,
+                   MachineCombinerPattern Pattern) -> bool {
+    if (!canCombineWithFMUL(MBB, Root.getOperand(OpIdx), MulOpc))
+      return false;
+    Patterns.push_back(Pattern);
+    Found = true;
+    return true;
+  };
+
+  switch (Root.getOpcode()) {
+  default:
+    assert(false && "Unsupported FP instruction in combiner\n");
+    break;
+  case AArch64::FADDSrr:
+    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+           "FADDSrr does not have register operands");
+    if (!Match(AArch64::FMULSrr, 1, MachineCombinerPattern::FMULADDS_OP1))
+      Match(AArch64::FMULv1i32_indexed, 1,
+            MachineCombinerPattern::FMLAv1i32_indexed_OP1);
+    if (!Match(AArch64::FMULSrr, 2, MachineCombinerPattern::FMULADDS_OP2))
+      Match(AArch64::FMULv1i32_indexed, 2,
+            MachineCombinerPattern::FMLAv1i32_indexed_OP2);
+    break;
+  case AArch64::FADDDrr:
+    if (!Match(AArch64::FMULDrr, 1, MachineCombinerPattern::FMULADDD_OP1))
+      Match(AArch64::FMULv1i64_indexed, 1,
+            MachineCombinerPattern::FMLAv1i64_indexed_OP1);
+    if (!Match(AArch64::FMULDrr, 2, MachineCombinerPattern::FMULADDD_OP2))
+      Match(AArch64::FMULv1i64_indexed, 2,
+            MachineCombinerPattern::FMLAv1i64_indexed_OP2);
+    break;
+  case AArch64::FADDv2f32:
+    if (!Match(AArch64::FMULv2i32_indexed, 1,
+               MachineCombinerPattern::FMLAv2i32_indexed_OP1))
+      Match(AArch64::FMULv2f32, 1, MachineCombinerPattern::FMLAv2f32_OP1);
+    if (!Match(AArch64::FMULv2i32_indexed, 2,
+               MachineCombinerPattern::FMLAv2i32_indexed_OP2))
+      Match(AArch64::FMULv2f32, 2, MachineCombinerPattern::FMLAv2f32_OP2);
+    break;
+  case AArch64::FADDv2f64:
+    if (!Match(AArch64::FMULv2i64_indexed, 1,
+               MachineCombinerPattern::FMLAv2i64_indexed_OP1))
+      Match(AArch64::FMULv2f64, 1, MachineCombinerPattern::FMLAv2f64_OP1);
+    if (!Match(AArch64::FMULv2i64_indexed, 2,
+               MachineCombinerPattern::FMLAv2i64_indexed_OP2))
+      Match(AArch64::FMULv2f64, 2, MachineCombinerPattern::FMLAv2f64_OP2);
+    break;
+  case AArch64::FADDv4f32:
+    if (!Match(AArch64::FMULv4i32_indexed, 1,
+               MachineCombinerPattern::FMLAv4i32_indexed_OP1))
+      Match(AArch64::FMULv4f32, 1, MachineCombinerPattern::FMLAv4f32_OP1);
+    if (!Match(AArch64::FMULv4i32_indexed, 2,
+               MachineCombinerPattern::FMLAv4i32_indexed_OP2))
+      Match(AArch64::FMULv4f32, 2, MachineCombinerPattern::FMLAv4f32_OP2);
+    break;
+
+  case AArch64::FSUBSrr:
+    // Operand 1 has no indexed alternative for the subtract forms.
+    Match(AArch64::FMULSrr, 1, MachineCombinerPattern::FMULSUBS_OP1);
+    if (!Match(AArch64::FMULSrr, 2, MachineCombinerPattern::FMULSUBS_OP2))
+      Match(AArch64::FMULv1i32_indexed, 2,
+            MachineCombinerPattern::FMLSv1i32_indexed_OP2);
+    break;
+  case AArch64::FSUBDrr:
+    Match(AArch64::FMULDrr, 1, MachineCombinerPattern::FMULSUBD_OP1);
+    if (!Match(AArch64::FMULDrr, 2, MachineCombinerPattern::FMULSUBD_OP2))
+      Match(AArch64::FMULv1i64_indexed, 2,
+            MachineCombinerPattern::FMLSv1i64_indexed_OP2);
+    break;
+  // Vector subtracts: only operand 2 is inspected.
+  case AArch64::FSUBv2f32:
+    if (!Match(AArch64::FMULv2i32_indexed, 2,
+               MachineCombinerPattern::FMLSv2i32_indexed_OP2))
+      Match(AArch64::FMULv2f32, 2, MachineCombinerPattern::FMLSv2f32_OP2);
+    break;
+  case AArch64::FSUBv2f64:
+    if (!Match(AArch64::FMULv2i64_indexed, 2,
+               MachineCombinerPattern::FMLSv2i64_indexed_OP2))
+      Match(AArch64::FMULv2f64, 2, MachineCombinerPattern::FMLSv2f64_OP2);
+    break;
+  case AArch64::FSUBv4f32:
+    if (!Match(AArch64::FMULv4i32_indexed, 2,
+               MachineCombinerPattern::FMLSv4i32_indexed_OP2))
+      Match(AArch64::FMULv4f32, 2, MachineCombinerPattern::FMLSv4f32_OP2);
+    break;
+  }
+  return Found;
+}
+
+/// Return true when the code sequence generated for \p Pattern can improve
+/// throughput. It should be called only for instructions in loops.
+/// \param Pattern - combiner pattern
+bool
+AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+  switch (Pattern) {
+  default:
+    return false;
+  // Scalar fused multiply-add / multiply-subtract.
+  case MachineCombinerPattern::FMULADDS_OP1:
+  case MachineCombinerPattern::FMULADDS_OP2:
+  case MachineCombinerPattern::FMULSUBS_OP1:
+  case MachineCombinerPattern::FMULSUBS_OP2:
+  case MachineCombinerPattern::FMULADDD_OP1:
+  case MachineCombinerPattern::FMULADDD_OP2:
+  case MachineCombinerPattern::FMULSUBD_OP1:
+  case MachineCombinerPattern::FMULSUBD_OP2:
+  // FMLA patterns.
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP2:
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv4f32_OP1:
+  case MachineCombinerPattern::FMLAv4f32_OP2:
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+  // FMLS patterns.
+  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2f32_OP2:
+  case MachineCombinerPattern::FMLSv2f64_OP2:
+  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv4f32_OP2:
+    return true;
+  } // end switch (Pattern)
+}
+/// Return true when there is potentially a faster code sequence for an
+/// instruction chain ending in \p Root. All potential patterns are appended
+/// to \p Patterns, which should be sorted in priority order since the pattern
+/// evaluator stops checking as soon as it finds a faster sequence.
+///
+/// Integer multiply-add patterns are tried first, then the floating-point
+/// ones; if neither matches, the generic TargetInstrInfo implementation is
+/// consulted.
+bool AArch64InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  if (getMaddPatterns(Root, Patterns) || getFMAPatterns(Root, Patterns))
+    return true;
+
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
+
+/// Operand layout of the fused instruction built by genFusedMultiply().
+enum class FMAInstKind {
+  Default,    ///< Multiply sources first, then the addend.
+  Indexed,    ///< Addend first, then multiply sources and an immediate.
+  Accumulator ///< Addend first, then the multiply sources.
+};
+/// genFusedMultiply - Generate fused multiply instructions.
+/// This function supports both integer and floating point instructions.
+/// A typical example:
+///  F|MUL I=A,B,0
+///  F|ADD R,I,C
+///  ==> F|MADD R,A,B,C
+/// \param MF Containing MachineFunction
+/// \param MRI Register information
+/// \param TII Target information
+/// \param Root is the F|ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the F|MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the f|madd instruction
+/// \param RC Register class every involved virtual register is constrained to
+/// \param kind Operand layout of the generated instruction
+/// \returns the multiply instruction that feeds \p Root
+static MachineInstr *
+genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
+                 const TargetInstrInfo *TII, MachineInstr &Root,
+                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
+                 unsigned MaddOpc, const TargetRegisterClass *RC,
+                 FMAInstKind kind = FMAInstKind::Default) {
+  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+  // The operand of Root that is not the multiply result is the addend.
+  MachineOperand &AddendOp = Root.getOperand(IdxMulOpd == 1 ? 2 : 1);
+  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+  MachineOperand &MulLhsOp = MUL->getOperand(1);
+  MachineOperand &MulRhsOp = MUL->getOperand(2);
+
+  const unsigned DstReg = Root.getOperand(0).getReg();
+  const unsigned MulLhs = MulLhsOp.getReg();
+  const unsigned MulRhs = MulRhsOp.getReg();
+  const unsigned Addend = AddendOp.getReg();
+  const unsigned MulLhsKill = getKillRegState(MulLhsOp.isKill());
+  const unsigned MulRhsKill = getKillRegState(MulRhsOp.isKill());
+  const unsigned AddendKill = getKillRegState(AddendOp.isKill());
+
+  // Constrain every virtual register involved to the fused opcode's class.
+  const unsigned InvolvedRegs[] = {DstReg, MulLhs, MulRhs, Addend};
+  for (unsigned Reg : InvolvedRegs)
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      MRI.constrainRegClass(Reg, RC);
+
+  MachineInstrBuilder MIB;
+  if (kind == FMAInstKind::Default) {
+    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), DstReg);
+    MIB.addReg(MulLhs, MulLhsKill)
+        .addReg(MulRhs, MulRhsKill)
+        .addReg(Addend, AddendKill);
+  } else if (kind == FMAInstKind::Indexed ||
+             kind == FMAInstKind::Accumulator) {
+    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), DstReg);
+    MIB.addReg(Addend, AddendKill)
+        .addReg(MulLhs, MulLhsKill)
+        .addReg(MulRhs, MulRhsKill);
+    // The indexed form additionally carries over operand 3 of the multiply.
+    if (kind == FMAInstKind::Indexed)
+      MIB.addImm(MUL->getOperand(3).getImm());
+  } else {
+    assert(false && "Invalid FMA instruction kind \n");
+  }
+  // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
+  InsInstrs.push_back(MIB);
+  return MUL;
+}
+
+/// genMaddR - Generate madd instruction and combine mul and add using
+/// an extra virtual register
+/// Example - an ADD intermediate needs to be stored in a register:
+///   MUL I=A,B,0
+///   ADD R,I,Imm
+///   ==> ORR  V, ZR, Imm
+///   ==> MADD R,A,B,V
+/// \param MF Containing MachineFunction
+/// \param MRI Register information
+/// \param TII Target information
+/// \param Root is the ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the madd instruction
+/// \param VR is a virtual register that holds the value of an ADD operand
+/// (V in the example above).
+/// \param RC Register class every involved virtual register is constrained to
+/// \returns the multiply instruction that feeds \p Root
+static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
+                              const TargetInstrInfo *TII, MachineInstr &Root,
+                              SmallVectorImpl<MachineInstr *> &InsInstrs,
+                              unsigned IdxMulOpd, unsigned MaddOpc,
+                              unsigned VR, const TargetRegisterClass *RC) {
+  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+  MachineOperand &MulLhsOp = MUL->getOperand(1);
+  MachineOperand &MulRhsOp = MUL->getOperand(2);
+
+  const unsigned DstReg = Root.getOperand(0).getReg();
+  const unsigned MulLhs = MulLhsOp.getReg();
+  const unsigned MulRhs = MulRhsOp.getReg();
+  const unsigned MulLhsKill = getKillRegState(MulLhsOp.isKill());
+  const unsigned MulRhsKill = getKillRegState(MulRhsOp.isKill());
+
+  // Constrain every virtual register involved to the madd's register class.
+  const unsigned InvolvedRegs[] = {DstReg, MulLhs, MulRhs, VR};
+  for (unsigned Reg : InvolvedRegs)
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      MRI.constrainRegClass(Reg, RC);
+
+  MachineInstrBuilder MIB =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), DstReg);
+  MIB.addReg(MulLhs, MulLhsKill).addReg(MulRhs, MulRhsKill).addReg(VR);
+  // Insert the MADD
+  InsInstrs.push_back(MIB);
+  return MUL;
+}
+
+/// When getMachineCombinerPatterns() finds potential patterns,
+/// this function generates the instructions that could replace the
+/// original code sequence
+void AArch64InstrInfo::genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ MachineBasicBlock &MBB = *Root.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ MachineInstr *MUL;
+ const TargetRegisterClass *RC;
+ unsigned Opc;
+ switch (Pattern) {
+ default:
+ // Reassociate instructions.
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+ DelInstrs, InstrIdxForVirtReg);
+ return;
+ case MachineCombinerPattern::MULADDW_OP1:
+ case MachineCombinerPattern::MULADDX_OP1:
+ // MUL I=A,B,0
+ // ADD R,I,C
+ // ==> MADD R,A,B,C
+ // --- Create(MADD);
+ if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDW_OP2:
+ case MachineCombinerPattern::MULADDX_OP2:
+ // MUL I=A,B,0
+ // ADD R,C,I
+ // ==> MADD R,A,B,C
+ // --- Create(MADD);
+ if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDWI_OP1:
+ case MachineCombinerPattern::MULADDXI_OP1: {
+ // MUL I=A,B,0
+ // ADD R,I,Imm
+ // ==> ORR V, ZR, Imm
+ // ==> MADD R,A,B,V
+ // --- Create(MADD);
+ const TargetRegisterClass *OrrRC;
+ unsigned BitSize, OrrOpc, ZeroReg;
+ if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
+ OrrOpc = AArch64::ORRWri;
+ OrrRC = &AArch64::GPR32spRegClass;
+ BitSize = 32;
+ ZeroReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ OrrOpc = AArch64::ORRXri;
+ OrrRC = &AArch64::GPR64spRegClass;
+ BitSize = 64;
+ ZeroReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ uint64_t Imm = Root.getOperand(2).getImm();
+
+ if (Root.getOperand(3).isImm()) {
+ unsigned Val = Root.getOperand(3).getImm();
+ Imm = Imm << Val;
+ }
+ uint64_t UImm = SignExtend64(Imm, BitSize);
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+ .addReg(ZeroReg)
+ .addImm(Encoding);
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+ }
+ break;
+ }
+ case MachineCombinerPattern::MULSUBW_OP1:
+ case MachineCombinerPattern::MULSUBX_OP1: {
+ // MUL I=A,B,0
+ // SUB R,I, C
+ // ==> SUB V, 0, C
+ // ==> MADD R,A,B,V // = -C + A*B
+ // --- Create(MADD);
+ const TargetRegisterClass *SubRC;
+ unsigned SubOpc, ZeroReg;
+ if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
+ SubOpc = AArch64::SUBWrr;
+ SubRC = &AArch64::GPR32spRegClass;
+ ZeroReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ SubOpc = AArch64::SUBXrr;
+ SubRC = &AArch64::GPR64spRegClass;
+ ZeroReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ unsigned NewVR = MRI.createVirtualRegister(SubRC);
+ // SUB NewVR, 0, C
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
+ .addReg(ZeroReg)
+ .addOperand(Root.getOperand(2));
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+ break;
+ }
+ case MachineCombinerPattern::MULSUBW_OP2:
+ case MachineCombinerPattern::MULSUBX_OP2:
+ // MUL I=A,B,0
+ // SUB R,C,I
+ // ==> MSUB R,A,B,C (computes C - A*B)
+ // --- Create(MSUB);
+ if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
+ Opc = AArch64::MSUBWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ Opc = AArch64::MSUBXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULSUBWI_OP1:
+ case MachineCombinerPattern::MULSUBXI_OP1: {
+ // MUL I=A,B,0
+ // SUB R,I, Imm
+ // ==> ORR V, ZR, -Imm
+ // ==> MADD R,A,B,V // = -Imm + A*B
+ // --- Create(MADD);
+ const TargetRegisterClass *OrrRC;
+ unsigned BitSize, OrrOpc, ZeroReg;
+ if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
+ OrrOpc = AArch64::ORRWri;
+ OrrRC = &AArch64::GPR32spRegClass;
+ BitSize = 32;
+ ZeroReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ OrrOpc = AArch64::ORRXri;
+ OrrRC = &AArch64::GPR64spRegClass;
+ BitSize = 64;
+ ZeroReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ uint64_t Imm = Root.getOperand(2).getImm();
+ if (Root.getOperand(3).isImm()) {
+ unsigned Val = Root.getOperand(3).getImm();
+ Imm = Imm << Val;
+ }
+ uint64_t UImm = SignExtend64(-Imm, BitSize);
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+ .addReg(ZeroReg)
+ .addImm(Encoding);
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+ }
+ break;
+ }
+ // Floating Point Support
+ case MachineCombinerPattern::FMULADDS_OP1:
+ case MachineCombinerPattern::FMULADDD_OP1:
+ // MUL I=A,B,0
+ // ADD R,I,C
+ // ==> MADD R,A,B,C
+ // --- Create(MADD);
+ if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::FMULADDS_OP2:
+ case MachineCombinerPattern::FMULADDD_OP2:
+ // FMUL I=A,B,0
+ // FADD R,C,I
+ // ==> FMADD R,A,B,C
+ // --- Create(FMADD);
+ if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+ Opc = AArch64::FMLAv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+ Opc = AArch64::FMLAv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+ Opc = AArch64::FMLAv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+ Opc = AArch64::FMLAv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2f32_OP1:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
+ Opc = AArch64::FMLAv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP2:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
+ Opc = AArch64::FMLAv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP1:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
+ Opc = AArch64::FMLAv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f64_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
+ Opc = AArch64::FMLAv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv4f32_OP1:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
+ Opc = AArch64::FMLAv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv4f32_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
+ Opc = AArch64::FMLAv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMULSUBS_OP1:
+ case MachineCombinerPattern::FMULSUBD_OP1: {
+ // FMUL I=A,B,0
+ // FSUB R,I,C
+ // ==> FNMSUB R,A,B,C // = -C + A*B
+ // --- Create(FNMSUB);
+ if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
+ Opc = AArch64::FNMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FNMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ }
+ case MachineCombinerPattern::FMULSUBS_OP2:
+ case MachineCombinerPattern::FMULSUBD_OP2: {
+ // FMUL I=A,B,0
+ // FSUB R,C,I
+ // ==> FMSUB R,A,B,C (computes C - A*B)
+ // --- Create(FMSUB);
+ if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
+ Opc = AArch64::FMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+ Opc = AArch64::FMLSv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+ Opc = AArch64::FMLSv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLSv2f32_OP2:
+ case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
+ Opc = AArch64::FMLSv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLSv2f64_OP2:
+ case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
+ Opc = AArch64::FMLSv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLSv4f32_OP2:
+ case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
+ Opc = AArch64::FMLSv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ }
+ } // end switch (Pattern)
+ // Record MUL and ADD/SUB for deletion
+ DelInstrs.push_back(MUL);
+ DelInstrs.push_back(&Root);
+
+ return;
+}
+
+/// \brief Try to replace a compare-and-branch or test-and-branch by a simpler
+/// branch, based on the instruction that defines the tested register.
+///
+/// Two rewrites are attempted.
+///
+/// 1. A CSINC of the zero register followed by a branch on its result becomes
+///    a plain conditional branch:
+///      csinc w9, wzr, wzr, <condition code>
+///      tbnz w9, #0, 0x44
+///    becomes
+///      b.<inverted condition code>
+///    and
+///      csinc w9, wzr, wzr, <condition code>
+///      tbz w9, #0, 0x44
+///    becomes
+///      b.<condition code>
+///
+/// 2. An AND with a power-of-2 immediate followed by CBZ/CBNZ becomes a
+///    TBZ/TBNZ on the corresponding bit:
+///      and w8, w8, #0x400
+///      cbnz w8, L1
+///    becomes
+///      tbnz w8, #10, L1
+///
+/// \param MI Conditional branch to optimize.
+/// \return True when \p MI has been replaced by the simpler branch.
+///
+bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
+  // Classify the branch: whether it is one of the "NZ" forms, whether it is a
+  // test-bit form, and which operand of MI holds the destination block.
+  bool BranchOnNonZero = false;
+  bool IsBitTest = false;
+  unsigned DestOpIdx = 0;
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Unknown branch instruction?");
+  case AArch64::Bcc:
+    // Already a plain conditional branch.
+    return false;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+    DestOpIdx = 1;
+    break;
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    BranchOnNonZero = true;
+    DestOpIdx = 1;
+    break;
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+    IsBitTest = true;
+    DestOpIdx = 2;
+    break;
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    BranchOnNonZero = true;
+    IsBitTest = true;
+    DestOpIdx = 2;
+    break;
+  }
+
+  // A test-and-branch on any bit other than bit 0 is conservatively left
+  // alone.
+  if (IsBitTest && MI.getOperand(1).getImm() != 0)
+    return false;
+
+  // Locate the definition of the tested register.
+  assert(MI.getParent() && "Incomplete machine instruciton\n");
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
+  const unsigned CondReg = MI.getOperand(0).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(CondReg))
+    return false;
+
+  // Walk back through COPYs; every copied source must have exactly one def
+  // and exactly one non-debug use.
+  MachineInstr *DefMI = MRI->getVRegDef(CondReg);
+  while (DefMI->isCopy()) {
+    const unsigned CopySrc = DefMI->getOperand(1).getReg();
+    if (!MRI->hasOneNonDBGUse(CopySrc) || !MRI->hasOneDef(CopySrc))
+      return false;
+    DefMI = MRI->getVRegDef(CopySrc);
+  }
+
+  switch (DefMI->getOpcode()) {
+  default:
+    return false;
+
+  // AND with a single-bit mask feeding CBZ/CBNZ ==> TBZ/TBNZ on that bit.
+  case AArch64::ANDWri:
+  case AArch64::ANDXri: {
+    // Only for compare-and-branch, with the AND in the same block and the
+    // tested register having no other non-debug use.
+    if (IsBitTest || DefMI->getParent() != MBB ||
+        !MRI->hasOneNonDBGUse(CondReg))
+      return false;
+
+    const bool Is32Bit = DefMI->getOpcode() == AArch64::ANDWri;
+    const unsigned RegWidth = Is32Bit ? 32 : 64;
+    const uint64_t AndMask = AArch64_AM::decodeLogicalImmediate(
+        DefMI->getOperand(2).getImm(), RegWidth);
+    if (!isPowerOf2_64(AndMask))
+      return false;
+
+    MachineOperand &MO = DefMI->getOperand(1);
+    unsigned NewReg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(NewReg))
+      return false;
+
+    assert(!MRI->def_empty(NewReg) && "Register must be defined.");
+
+    // Bits below 32 use the W form of the test-and-branch, the others the X
+    // form.
+    const unsigned BitNo = Log2_64(AndMask);
+    unsigned Opc;
+    if (BitNo < 32)
+      Opc = BranchOnNonZero ? AArch64::TBNZW : AArch64::TBZW;
+    else
+      Opc = BranchOnNonZero ? AArch64::TBNZX : AArch64::TBZX;
+
+    // IsBitTest is false here, so DestOpIdx is the CBZ/CBNZ target operand.
+    MachineBasicBlock *DestMBB = MI.getOperand(DestOpIdx).getMBB();
+    MachineInstr *NewMI = BuildMI(*MBB, MI, MI.getDebugLoc(), get(Opc))
+                              .addReg(NewReg)
+                              .addImm(BitNo)
+                              .addMBB(DestMBB);
+    // The AND's source register is now also read by the new branch, so it
+    // must not be marked killed at the AND.
+    MO.setIsKill(false);
+
+    // For an immediate smaller than 32 the 32-bit variant (W) is used in all
+    // cases, so a 64-bit input register is accessed through its 32-bit
+    // sub-register.
+    if (!Is32Bit && BitNo < 32)
+      NewMI->getOperand(0).setSubReg(AArch64::sub_32);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // CSINC of the zero register feeding the branch ==> Bcc.
+  case AArch64::CSINCWr:
+  case AArch64::CSINCXr: {
+    const unsigned Src1 = DefMI->getOperand(1).getReg();
+    const unsigned Src2 = DefMI->getOperand(2).getReg();
+    const bool BothWZR = Src1 == AArch64::WZR && Src2 == AArch64::WZR;
+    const bool BothXZR = Src1 == AArch64::XZR && Src2 == AArch64::XZR;
+    if (!BothWZR && !BothXZR)
+      return false;
+
+    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+      return false;
+
+    AArch64CC::CondCode CC =
+        static_cast<AArch64CC::CondCode>(DefMI->getOperand(3).getImm());
+    // Convert only when the condition flags are not written between the
+    // CSINC and the branch; reads of the flags in between are fine.
+    if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
+      return false;
+
+    if (BranchOnNonZero)
+      CC = AArch64CC::getInvertedCondCode(CC);
+    MachineBasicBlock *DestMBB = MI.getOperand(DestOpIdx).getMBB();
+    BuildMI(*MBB, MI, MI.getDebugLoc(), get(AArch64::Bcc))
+        .addImm(CC)
+        .addMBB(DestMBB);
+    MI.eraseFromParent();
+    return true;
+  }
+  }
+}
+
+/// Split the operand target flags \p TF into two parts: the bits selected by
+/// AArch64II::MO_FRAGMENT (first) and all remaining bits (second).
+std::pair<unsigned, unsigned>
+AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+  const unsigned FragmentPart = TF & AArch64II::MO_FRAGMENT;
+  const unsigned RemainingPart = TF & ~AArch64II::MO_FRAGMENT;
+  return std::make_pair(FragmentPart, RemainingPart);
+}
+
+/// Return the table that pairs each direct operand target flag value with
+/// its textual name.
+ArrayRef<std::pair<unsigned, const char *>>
+AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+  typedef std::pair<unsigned, const char *> FlagAndName;
+  static const FlagAndName DirectFlags[] = {
+      {AArch64II::MO_PAGE, "aarch64-page"},
+      {AArch64II::MO_PAGEOFF, "aarch64-pageoff"},
+      {AArch64II::MO_G3, "aarch64-g3"},
+      {AArch64II::MO_G2, "aarch64-g2"},
+      {AArch64II::MO_G1, "aarch64-g1"},
+      {AArch64II::MO_G0, "aarch64-g0"},
+      {AArch64II::MO_HI12, "aarch64-hi12"}};
+  return makeArrayRef(DirectFlags);
+}
+
+/// Return the table that pairs each bitmask operand target flag with its
+/// textual name.
+ArrayRef<std::pair<unsigned, const char *>>
+AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+  typedef std::pair<unsigned, const char *> FlagAndName;
+  static const FlagAndName BitmaskFlags[] = {
+      {AArch64II::MO_GOT, "aarch64-got"},
+      {AArch64II::MO_NC, "aarch64-nc"},
+      {AArch64II::MO_TLS, "aarch64-tls"}};
+  return makeArrayRef(BitmaskFlags);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
new file mode 100644
index 000000000000..90b2c0896872
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -0,0 +1,318 @@
+//===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "AArch64GenInstrInfo.inc"
+
+namespace llvm {
+
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+/// AArch64 implementation of the TargetInstrInfo interface, layered on top of
+/// the generated AArch64GenInstrInfo base class.
+class AArch64InstrInfo final : public AArch64GenInstrInfo {
+ const AArch64RegisterInfo RI;
+ const AArch64Subtarget &Subtarget;
+
+public:
+ explicit AArch64InstrInfo(const AArch64Subtarget &STI);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ const AArch64RegisterInfo &getRegisterInfo() const { return RI; }
+
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ bool isAsCheapAsAMove(const MachineInstr &MI) const override;
+
+ bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &DstReg, unsigned &SubIdx) const override;
+
+ bool
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ AliasAnalysis *AA = nullptr) const override;
+
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ /// Returns true if there is a shiftable register and that the shift value
+ /// is non-zero.
+ bool hasShiftedReg(const MachineInstr &MI) const;
+
+ /// Returns true if there is an extendable register and that the extending
+ /// value is non-zero.
+ bool hasExtendedReg(const MachineInstr &MI) const;
+
+ /// \brief Does this instruction set its full destination register to zero?
+ bool isGPRZero(const MachineInstr &MI) const;
+
+ /// \brief Does this instruction rename a GPR without modifying bits?
+ bool isGPRCopy(const MachineInstr &MI) const;
+
+ /// \brief Does this instruction rename an FPR without modifying bits?
+ bool isFPRCopy(const MachineInstr &MI) const;
+
+ /// Return true if this load/store scales or extends its register offset.
+ /// This refers to scaling a dynamic index as opposed to scaled immediates.
+ /// MI should be a memory op that allows scaled addressing.
+ bool isScaledAddr(const MachineInstr &MI) const;
+
+ /// Return true if pairing the given load or store is hinted to be
+ /// unprofitable.
+ bool isLdStPairSuppressed(const MachineInstr &MI) const;
+
+ /// Return true if the opcode \p Opc is an unscaled load/store.
+ bool isUnscaledLdSt(unsigned Opc) const;
+
+ /// Return true if the instruction \p MI is an unscaled load/store.
+ bool isUnscaledLdSt(MachineInstr &MI) const;
+
+ /// Return true if the opcode of \p MI is one of the scaled or unscaled
+ /// load/store opcodes listed in the switch below.
+ static bool isPairableLdStInst(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ // Scaled instructions.
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ case AArch64::STRXui:
+ case AArch64::STRWui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ case AArch64::LDRXui:
+ case AArch64::LDRWui:
+ case AArch64::LDRSWui:
+ // Unscaled instructions.
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ case AArch64::LDURSWi:
+ return true;
+ }
+ }
+
+ /// Return true if this is a load/store that can be potentially paired/merged.
+ bool isCandidateToMergeOrPair(MachineInstr &MI) const;
+
+ /// Hint that pairing the given load or store is unprofitable.
+ void suppressLdStPair(MachineInstr &MI) const;
+
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// Non-virtual variant of getMemOpBaseRegImmOfs with an additional
+ /// \p Width output parameter.
+ /// NOTE(review): presumably \p Width is the size of the memory access;
+ /// confirm against the definition.
+ bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset, unsigned &Width,
+ const TargetRegisterInfo *TRI) const;
+
+ bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+ unsigned NumLoads) const override;
+
+ bool shouldScheduleAdjacent(const MachineInstr &First,
+ const MachineInstr &Second) const override;
+
+ MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+ uint64_t Offset, const MDNode *Var,
+ const MDNode *Expr,
+ const DebugLoc &DL) const;
+ void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc, unsigned Opcode,
+ llvm::ArrayRef<unsigned> Indices) const;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ // Keep the base-class overloads of foldMemoryOperandImpl visible alongside
+ // the override declared below.
+ using TargetInstrInfo::foldMemoryOperandImpl;
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS = nullptr) const override;
+
+ /// \returns true if a branch from an instruction with opcode \p BranchOpc
+ /// is capable of jumping to a position \p BrOffset bytes away.
+ bool isBranchOffsetInRange(unsigned BranchOpc,
+ int64_t BrOffset) const override;
+
+ MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
+ unsigned, unsigned, int &, int &, int &) const override;
+ void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const override;
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+ /// Return true if the comparison instruction can be analyzed.
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
+ /// optimizeCompareInstr - Convert the instruction supplying the argument to
+ /// the comparison into one that sets the zero bit in the flags register.
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+ /// Try to replace the conditional branch \p MI by a simpler branch; returns
+ /// true when the replacement was made.
+ bool optimizeCondBranch(MachineInstr &MI) const override;
+
+ /// Return true when a code sequence can improve throughput. It
+ /// should be called only for instructions in loops.
+ /// \param Pattern - combiner pattern
+ bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
+ /// Return true when there is potentially a faster code sequence
+ /// for an instruction chain ending in <Root>. All potential patterns are
+ /// listed in the <Patterns> array.
+ bool getMachineCombinerPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns)
+ const override;
+ /// Return true when Inst is associative and commutative so that it can be
+ /// reassociated.
+ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+ /// When getMachineCombinerPatterns() finds patterns, this function generates
+ /// the instructions that could replace the original code sequence
+ void genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+ /// AArch64 supports MachineCombiner.
+ bool useMachineCombiner() const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ /// Split the target flags \p TF into a pair of values (see the definition
+ /// for the mask that is used).
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+ /// Tables pairing operand target flags with their textual names.
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableBitmaskMachineOperandTargetFlags() const override;
+
+private:
+ // NOTE(review): these private helpers are declared without documentation;
+ // see their definitions for the exact contract.
+ void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL,
+ MachineBasicBlock *TBB,
+ ArrayRef<MachineOperand> Cond) const;
+ bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg,
+ const MachineRegisterInfo *MRI) const;
+};
+
+/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
+/// plus Offset. This is intended to be used from within the prolog/epilog
+/// insertion (PEI) pass, where a virtual scratch register may be allocated
+/// if necessary, to be replaced by the scavenger at the end of PEI.
+/// The unnamed MachineInstr::MIFlag parameter defaults to
+/// MachineInstr::NoFlags and \p SetNZCV defaults to false.
+/// NOTE(review): presumably the flag is attached to the emitted instructions
+/// and \p SetNZCV selects flag-setting forms; confirm against the definition.
+void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ int Offset, const TargetInstrInfo *TII,
+ MachineInstr::MIFlag = MachineInstr::NoFlags,
+ bool SetNZCV = false);
+
+/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the
+/// FP. Return false if the offset could not be handled directly in MI, and
+/// return the left-over portion by reference.
+/// NOTE(review): \p Offset is the only non-const reference argument besides
+/// \p MI, so it is presumably where the left-over portion is returned.
+bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const AArch64InstrInfo *TII);
+
+/// \brief Used to report the frame offset status in isAArch64FrameOffsetLegal.
+/// The non-zero enumerators are distinct bits (0x1 and 0x2).
+enum AArch64FrameOffsetStatus {
+ AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
+ AArch64FrameOffsetIsLegal = 0x1, ///< Offset is legal.
+ AArch64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly.
+};
+
+/// \brief Check if the @p Offset is a valid frame offset for @p MI.
+/// The returned value reports the validity of the frame offset for @p MI.
+/// It uses the values defined by AArch64FrameOffsetStatus for that.
+/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to
+/// use an offset.
+/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be
+/// rewritten in @p MI.
+/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the
+/// amount that is off the limit of the legal offset.
+/// If set, @p OutUseUnscaledOp will contain whether @p MI should be
+/// turned into an unscaled operator, whose opcode is in @p OutUnscaledOp.
+/// If set, @p EmittableOffset contains the amount that can be set in @p MI
+/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
+/// is a legal offset.
+int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+ bool *OutUseUnscaledOp = nullptr,
+ unsigned *OutUnscaledOp = nullptr,
+ int *EmittableOffset = nullptr);
+
+/// Return true if \p Opc is the opcode AArch64::B.
+static inline bool isUncondBranchOpcode(int Opc) {
+  return AArch64::B == Opc;
+}
+
+/// Return true if \p Opc is one of the conditional branch opcodes: Bcc, the
+/// CBZ/CBNZ forms or the TBZ/TBNZ forms (W and X variants).
+static inline bool isCondBranchOpcode(int Opc) {
+  bool IsCondBranch = false;
+  switch (Opc) {
+  default:
+    break;
+  // Bcc.
+  case AArch64::Bcc:
+  // CBZ/CBNZ.
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+  // TBZ/TBNZ.
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    IsCondBranch = true;
+    break;
+  }
+  return IsCondBranch;
+}
+
+/// Return true if \p Opc is the opcode AArch64::BR.
+static inline bool isIndirectBranchOpcode(int Opc) {
+  return AArch64::BR == Opc;
+}
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
new file mode 100644
index 000000000000..c5b95f282ea8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -0,0 +1,6132 @@
+//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Instruction definitions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+// Each predicate below pairs a C++ condition on the Subtarget (Predicate)
+// with, where present, an AssemblerPredicate naming a subtarget feature.
+
+// Architecture version predicates.
+def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
+ AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
+ AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+// Individual feature predicates.
+def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
+ AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
+def HasNEON : Predicate<"Subtarget->hasNEON()">,
+ AssemblerPredicate<"FeatureNEON", "neon">;
+def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
+ AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC : Predicate<"Subtarget->hasCRC()">,
+ AssemblerPredicate<"FeatureCRC", "crc">;
+def HasLSE : Predicate<"Subtarget->hasLSE()">,
+ AssemblerPredicate<"FeatureLSE", "lse">;
+def HasRAS : Predicate<"Subtarget->hasRAS()">,
+ AssemblerPredicate<"FeatureRAS", "ras">;
+// Note: HasPerfMon has no AssemblerPredicate attached.
+def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
+def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
+ AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
+def HasSPE : Predicate<"Subtarget->hasSPE()">,
+ AssemblerPredicate<"FeatureSPE", "spe">;
+
+// Endianness predicates; IsBE is the negation of IsLE.
+def IsLE : Predicate<"Subtarget->isLittleEndian()">;
+def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
+def UseAlternateSExtLoadCVTF32
+ : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
+
+//===----------------------------------------------------------------------===//
+// AArch64-specific DAG Nodes.
+//
+
+// In the type profiles below, SDTypeProfile<R, N, [...]> declares R results
+// followed by N operands; constraint indices count the results first.
+
+// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
+def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsIn - RES1 = op LHS, RHS, FLAGS
+// (a single result: the flags are an input only)
+def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<0>,
+ SDTCisVT<3, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<4, i32>]>;
+
+// Branch profiles: no results; the OtherVT operand is the branch target.
+def SDT_AArch64Brcond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
+def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisVT<2, OtherVT>]>;
+
+
+// Conditional select: the result has the type of the first two operands.
+def SDT_AArch64CSel : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<3>,
+ SDTCisVT<4, i32>]>;
+// Conditional compares: i32 result; SDT_AArch64CCMP compares two integers,
+// SDT_AArch64FCCMP two floating-point values.
+def SDT_AArch64CCMP : SDTypeProfile<1, 5,
+ [SDTCisVT<0, i32>,
+ SDTCisInt<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisInt<3>,
+ SDTCisInt<4>,
+ SDTCisVT<5, i32>]>;
+def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
+ [SDTCisVT<0, i32>,
+ SDTCisFP<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisInt<3>,
+ SDTCisInt<4>,
+ SDTCisVT<5, i32>]>;
+def SDT_AArch64FCmp : SDTypeProfile<0, 2,
+ [SDTCisFP<0>,
+ SDTCisSameAs<0, 1>]>;
+// Vector profiles.
+def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
+def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>]>;
+def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
+def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisInt<2>, SDTCisInt<3>]>;
+def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+
+def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+// Compare-against-zero profile: no type constraints are imposed.
+def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
+def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
+def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>;
+def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>;
+def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
+
+def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
+
+def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>]>;
+
+// Generates the general dynamic sequences, i.e.
+// adrp x0, :tlsdesc:var
+// ldr x1, [x0, #:tlsdesc_lo12:var]
+// add x0, x0, #:tlsdesc_lo12:var
+// .tlsdesccall var
+// blr x1
+
+// Number of results: 0 (the TPIDR_EL0 offset is put directly in X0, hence
+// no "result" here).
+// Number of operands: 1 (the variable).
+def SDT_AArch64TLSDescCallSeq : SDTypeProfile<0,1,
+ [SDTCisPtrTy<0>]>;
+
+// One i64 result built from four i32 operands of the same type.
+def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
+ SDTCisSameAs<1, 4>]>;
+
+
+// Node definitions.
+def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
+def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
+def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
+def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
+ SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+ [SDNPHasChain, SDNPOutGlue]>;
+def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
+ SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64call : SDNode<"AArch64ISD::CALL",
+ SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
+ [SDNPHasChain]>;
+def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
+ [SDNPHasChain]>;
+def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz,
+ [SDNPHasChain]>;
+def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz,
+ [SDNPHasChain]>;
+def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz,
+ [SDNPHasChain]>;
+
+
+def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>;
+def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>;
+def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>;
+def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>;
+def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >;
+def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>;
+def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut,
+ [SDNPCommutative]>;
+def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>;
+def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut,
+ [SDNPCommutative]>;
+def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>;
+def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>;
+
+def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>;
+def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>;
+def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;
+
+def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
+
+def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+
+def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
+def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
+def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
+def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
+def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
+
+def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>;
+def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>;
+def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>;
+def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>;
+def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>;
+def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>;
+
+def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>;
+def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>;
+def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>;
+def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>;
+def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
+def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
+def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
+
+def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
+def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
+def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
+def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
+
+def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
+def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
+def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
+def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
+def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
+def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
+def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
+def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
+
+def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
+def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
+def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+
+def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
+def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
+def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>;
+def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>;
+def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>;
+
+def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>;
+def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>;
+def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>;
+
+def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>;
+def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>;
+def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>;
+def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>;
+def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>;
+def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), // a pattern fragment, not an ISD node of its own
+ (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>; // expands to NOT(CMEQz(LHS AND RHS))
+
+def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>;
+def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>;
+def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>;
+def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
+def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;
+
+def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
+def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
+
+def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>;
+
+def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
+def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
+
+def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
+ SDT_AArch64TLSDescCallSeq,
+ [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
+ SDNPVariadic]>;
+
+
+def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
+ SDT_AArch64WrapperLarge>;
+
+def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;
+
+def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, // 1 result, 2 operands; values 0 and 1 are integer (results are numbered before operands)
+ SDTCisSameAs<1, 2>]>; // values 1 and 2 (the two operands) must have the same type
+def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; // SMULL and UMULL share the profile above
+def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
+
+def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
+def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>;
+def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
+def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;
+
+def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
+def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
+def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
+def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
+def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
+def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+// AArch64 Instruction Predicate Definitions.
+def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
+def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
+def ForCodeSize : Predicate<"ForCodeSize">;
+def NotForCodeSize : Predicate<"!ForCodeSize">;
+
+include "AArch64InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous instructions.
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+// We set Sched to empty list because we expect these instructions to simply get
+// removed in most cases.
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ [(AArch64callseq_start timm:$amt)]>, Sched<[]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
+} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
+
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions. When that changes, they can be
+// removed, along with the AArch64Wrapper node.
+
+let AddedComplexity = 10 in
+def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
+ [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
+ Sched<[WriteLDAdr]>;
+
+// The MOVaddr instruction should match only when the add is not folded
+// into a load or store address.
+def MOVaddr
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
+ tglobaladdr:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrJT
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
+ tjumptable:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrCP
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
+ tconstpool:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrBA
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
+ tblockaddress:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrTLS
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
+ tglobaltlsaddr:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrEXT
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
+ texternalsym:$low))]>,
+ Sched<[WriteAdrAdr]>;
+
+} // isReMaterializable, isCodeGenOnly
+
+def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr), // LOADgot's own pattern only names tglobaladdr; also select it for TLS globals
+ (LOADgot tglobaltlsaddr:$addr)>;
+
+def : Pat<(AArch64LOADgot texternalsym:$addr), // ... for external symbols
+ (LOADgot texternalsym:$addr)>;
+
+def : Pat<(AArch64LOADgot tconstpool:$addr), // ... and for constant-pool entries
+ (LOADgot tconstpool:$addr)>;
+
+//===----------------------------------------------------------------------===//
+// System instructions.
+//===----------------------------------------------------------------------===//
+
+def HINT : HintI<"hint">;
+def : InstAlias<"nop", (HINT 0b000)>;
+def : InstAlias<"yield",(HINT 0b001)>;
+def : InstAlias<"wfe", (HINT 0b010)>;
+def : InstAlias<"wfi", (HINT 0b011)>;
+def : InstAlias<"sev", (HINT 0b100)>;
+def : InstAlias<"sevl", (HINT 0b101)>;
+def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
+
+// v8.2a Statistical Profiling extension
+def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
+
+// As far as LLVM is concerned this writes to the system's exclusive monitors.
+let mayLoad = 1, mayStore = 1 in
+def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
+
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity.
+let mayLoad = ?, mayStore = ? in {
+def DMB : CRmSystemI<barrier_op, 0b101, "dmb",
+ [(int_aarch64_dmb (i32 imm32_0_15:$CRm))]>;
+
+def DSB : CRmSystemI<barrier_op, 0b100, "dsb",
+ [(int_aarch64_dsb (i32 imm32_0_15:$CRm))]>;
+
+def ISB : CRmSystemI<barrier_op, 0b110, "isb",
+ [(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
+}
+
+def : InstAlias<"clrex", (CLREX 0xf)>;
+def : InstAlias<"isb", (ISB 0xf)>;
+
+def MRS : MRSI;
+def MSR : MSRI;
+def MSRpstateImm1 : MSRpstateImm0_1;
+def MSRpstateImm4 : MSRpstateImm0_15;
+
+// The thread pointer (on Linux, at least, where this has been implemented) is
+// TPIDR_EL0.
+def : Pat<(AArch64threadpointer), (MRS 0xde82)>; // 0xde82 is the MRS system-register operand; NOTE(review): presumably the TPIDR_EL0 encoding - confirm
+
+// The cycle counter PMC register is PMCCNTR_EL0.
+let Predicates = [HasPerfMon] in // pattern is only available when HasPerfMon holds
+def : Pat<(readcyclecounter), (MRS 0xdce8)>; // 0xdce8 is the MRS system-register operand; NOTE(review): presumably the PMCCNTR_EL0 encoding - confirm
+
+// Generic system instructions
+def SYSxt : SystemXtI<0, "sys">;
+def SYSLxt : SystemLXtI<1, "sysl">;
+
+def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
+ (SYSxt imm0_7:$op1, sys_cr_op:$Cn,
+ sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
+
+//===----------------------------------------------------------------------===//
+// Move immediate instructions.
+//===----------------------------------------------------------------------===//
+
+defm MOVK : InsertImmediate<0b11, "movk">;
+defm MOVN : MoveImmediate<0b00, "movn">;
+
+let PostEncoderMethod = "fixMOVZ" in
+defm MOVZ : MoveImmediate<0b10, "movz">;
+
+// First group of aliases covers an implicit "lsl #0".
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
+
+// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;
+
+// Final group of aliases covers true "mov $Rd, $imm" cases (each instantiation defines an asm operand class, an operand using it, and the alias).
+multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
+ int width, int shift> {
+ def _asmoperand : AsmOperandClass {
+ let Name = basename # width # "_lsl" # shift # "MovAlias"; // e.g. "MOVZ32_lsl16MovAlias" for ("MOVZ", 32, 16)
+ let PredicateMethod = "is" # basename # "MovAlias<" # width # ", "
+ # shift # ">"; // e.g. "isMOVZMovAlias<32, 16>"
+ let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">"; // e.g. "addMOVZMovAliasOperands<16>"
+ }
+
+ def _movimm : Operand<i32> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand"); // parse with the operand class defined just above
+ }
+
+ def : InstAlias<"mov $Rd, $imm",
+ (INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>; // the shift is a fixed operand of INST, not written in the asm string
+}
+
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>; // MOVZ, 32-bit register: shifts 0 and 16
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>;
+
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>; // MOVZ, 64-bit register: shifts 0, 16, 32 and 48
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>;
+
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>; // MOVN, 32-bit register: shifts 0 and 16
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>;
+
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>; // MOVN, 64-bit register: shifts 0, 16, 32 and 48
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>;
+
+let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
+ isAsCheapAsAMove = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions. When that changes, we can select
+// directly to the real instructions and get rid of these pseudos.
+
+def MOVi32imm
+ : Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
+ [(set GPR32:$dst, imm:$src)]>,
+ Sched<[WriteImm]>;
+def MOVi64imm
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
+ [(set GPR64:$dst, imm:$src)]>,
+ Sched<[WriteImm]>;
+} // isReMaterializable, isCodeGenOnly
+
+// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the
+// eventual expansion code fewer bits to worry about getting right. Marshalling
+// the types is a little tricky though:
+def i64imm_32bit : ImmLeaf<i64, [{
+ return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
+}]>; // matches when masking to the low 32 bits leaves the value unchanged
+
+def s64imm_32bit : ImmLeaf<i64, [{
+ int64_t Imm64 = static_cast<int64_t>(Imm);
+ return Imm64 >= std::numeric_limits<int32_t>::min() &&
+ Imm64 <= std::numeric_limits<int32_t>::max();
+}]>; // matches when the value lies within the int32_t range
+
+def trunc_imm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>; // re-emits the immediate's zero-extended value as an i32 target constant
+
+def : Pat<(i64 i64imm_32bit:$src), // i64 constants accepted by i64imm_32bit
+ (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; // materialize with MOVi32imm and place the result in sub_32
+
+// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
+}]>; // i32 target constant carrying the FP immediate's bit pattern
+
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
+}]>; // i64 target constant carrying the FP immediate's bit pattern
+
+
+def : Pat<(f32 fpimm:$in), // f32 constant: integer move of its bit pattern, then copy into FPR32
+ (COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>;
+def : Pat<(f64 fpimm:$in), // f64 constant: same scheme using MOVi64imm and FPR64
+ (COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>;
+
+
+// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
+// sequences.
+def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, // global addresses
+ tglobaladdr:$g1, tglobaladdr:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), // innermost: MOVZXi takes $g3 at shift 48
+ tglobaladdr:$g2, 32), // then MOVKXi inserts $g2 at shift 32
+ tglobaladdr:$g1, 16), // $g1 at shift 16
+ tglobaladdr:$g0, 0)>; // and $g0 at shift 0
+
+def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, // block addresses, same MOVZ/MOVK nesting
+ tblockaddress:$g1, tblockaddress:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
+ tblockaddress:$g2, 32),
+ tblockaddress:$g1, 16),
+ tblockaddress:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, // constant-pool entries, same MOVZ/MOVK nesting
+ tconstpool:$g1, tconstpool:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
+ tconstpool:$g2, 32),
+ tconstpool:$g1, 16),
+ tconstpool:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, // jump tables, same MOVZ/MOVK nesting
+ tjumptable:$g1, tjumptable:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48),
+ tjumptable:$g2, 32),
+ tjumptable:$g1, 16),
+ tjumptable:$g0, 0)>;
+
+
+//===----------------------------------------------------------------------===//
+// Arithmetic instructions.
+//===----------------------------------------------------------------------===//
+
+// Add/subtract with carry.
+defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>;
+defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>;
+
+def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; // ngc Rd, Rm is SBC with WZR/XZR as the first source
+def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; // ngcs is the same alias over SBCS
+def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;
+
+// Add/subtract
+defm ADD : AddSub<0, "add", "sub", add>;
+defm SUB : AddSub<1, "sub", "add">;
+
+def : InstAlias<"mov $dst, $src", // "mov" written as ADD with immediate 0 and shift 0
+ (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; // destination restricted to the sponly class
+def : InstAlias<"mov $dst, $src",
+ (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; // source restricted to the sponly class
+def : InstAlias<"mov $dst, $src", // 64-bit forms of the same two aliases
+ (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+ (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;
+
+defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">;
+defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">;
+
+// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
+def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
+ (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
+ (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
+ (SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
+ (SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
+def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
+ (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
+ (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
+let AddedComplexity = 1 in {
+def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
+ (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
+def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
+ (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+}
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUBS{W,X}ri x, 1), and
+// likewise (sub x, -1) to (ADD{W,X}ri x, 1).
+// These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
+
+// The flag-setting nodes need the same transformation: AArch64add_flag with a
+// negated immediate is selected to SUBS{W,X}ri, and AArch64sub_flag with a
+// negated immediate to ADDS{W,X}ri.
+let AddedComplexity = 1 in {
+def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
+
+def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; // neg Rd, Rm is SUB (shifted register) with WZR/XZR as first source and shift 0
+def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src$shift", // shifted form; note the last InstAlias argument is 2 here versus 3 for the unshifted form
+ (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"neg $dst, $src$shift",
+ (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+
+def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; // negs mirrors the neg aliases over SUBS
+def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"negs $dst, $src$shift",
+ (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"negs $dst, $src$shift",
+ (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+
+
+// Unsigned/Signed divide
+defm UDIV : Div<0, "udiv", udiv>;
+defm SDIV : Div<1, "sdiv", sdiv>;
+
+def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>; // the int_aarch64_udiv intrinsic selects UDIV directly (32- and 64-bit)
+def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>;
+def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>; // likewise int_aarch64_sdiv selects SDIV
+def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>;
+
+// Variable shift
+defm ASRV : Shift<0b10, "asr", sra>;
+defm LSLV : Shift<0b00, "lsl", shl>;
+defm LSRV : Shift<0b01, "lsr", srl>;
+defm RORV : Shift<0b11, "ror", rotr>;
+
+def : ShiftAlias<"asrv", ASRVWr, GPR32>;
+def : ShiftAlias<"asrv", ASRVXr, GPR64>;
+def : ShiftAlias<"lslv", LSLVWr, GPR32>;
+def : ShiftAlias<"lslv", LSLVXr, GPR64>;
+def : ShiftAlias<"lsrv", LSRVWr, GPR32>;
+def : ShiftAlias<"lsrv", LSRVXr, GPR64>;
+def : ShiftAlias<"rorv", RORVWr, GPR32>;
+def : ShiftAlias<"rorv", RORVXr, GPR64>;
+
+// Multiply-add (plain mul and negated mul are selected as MADD/MSUB with the zero register as accumulator)
+let AddedComplexity = 7 in {
+defm MADD : MulAccum<0, "madd", add>;
+defm MSUB : MulAccum<1, "msub", sub>;
+
+def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), // mul -> MADD Rn, Rm, WZR/XZR
+ (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
+ (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+
+def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), // -(Rn * Rm) -> MSUB Rn, Rm, WZR/XZR
+ (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
+ (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)), // (-Rn) * Rm is selected to the same MSUB form
+ (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
+ (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+} // AddedComplexity = 7
+
+let AddedComplexity = 5 in {
+def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
+def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
+def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
+def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;
+
+def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
+ (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
+ (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
+ (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
+ (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
+def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))),
+ (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))),
+ (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))),
+ (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+ (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+ (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))),
+ (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)),
+ (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)),
+ (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)),
+ GPR64:$Ra)),
+ (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+ (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+ (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32),
+ (s64imm_32bit:$C)))),
+ (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+} // AddedComplexity = 5
+
+def : MulAccumWAlias<"mul", MADDWrrr>;
+def : MulAccumXAlias<"mul", MADDXrrr>;
+def : MulAccumWAlias<"mneg", MSUBWrrr>;
+def : MulAccumXAlias<"mneg", MSUBXrrr>;
+def : WideMulAccumAlias<"smull", SMADDLrrr>;
+def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
+def : WideMulAccumAlias<"umull", UMADDLrrr>;
+def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;
+
+// Multiply-high
+def SMULHrr : MulHi<0b010, "smulh", mulhs>;
+def UMULHrr : MulHi<0b110, "umulh", mulhu>;
+
+// CRC32
+def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">;
+def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">;
+def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">;
+def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">;
+
+def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">;
+def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">;
+def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">;
+def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;
+
+// v8.1 atomic CAS
+defm CAS : CompareAndSwap<0, 0, "">;
+defm CASA : CompareAndSwap<1, 0, "a">;
+defm CASL : CompareAndSwap<0, 1, "l">;
+defm CASAL : CompareAndSwap<1, 1, "al">;
+
+// v8.1 atomic CASP
+defm CASP : CompareAndSwapPair<0, 0, "">;
+defm CASPA : CompareAndSwapPair<1, 0, "a">;
+defm CASPL : CompareAndSwapPair<0, 1, "l">;
+defm CASPAL : CompareAndSwapPair<1, 1, "al">;
+
+// v8.1 atomic SWP
+defm SWP : Swap<0, 0, "">;
+defm SWPA : Swap<1, 0, "a">;
+defm SWPL : Swap<0, 1, "l">;
+defm SWPAL : Swap<1, 1, "al">;
+
+// v8.1 atomic LD<OP>(register). Performs load and then ST<OP>(register)
+defm LDADD : LDOPregister<0b000, "add", 0, 0, "">;
+defm LDADDA : LDOPregister<0b000, "add", 1, 0, "a">;
+defm LDADDL : LDOPregister<0b000, "add", 0, 1, "l">;
+defm LDADDAL : LDOPregister<0b000, "add", 1, 1, "al">;
+
+defm LDCLR : LDOPregister<0b001, "clr", 0, 0, "">;
+defm LDCLRA : LDOPregister<0b001, "clr", 1, 0, "a">;
+defm LDCLRL : LDOPregister<0b001, "clr", 0, 1, "l">;
+defm LDCLRAL : LDOPregister<0b001, "clr", 1, 1, "al">;
+
+defm LDEOR : LDOPregister<0b010, "eor", 0, 0, "">;
+defm LDEORA : LDOPregister<0b010, "eor", 1, 0, "a">;
+defm LDEORL : LDOPregister<0b010, "eor", 0, 1, "l">;
+defm LDEORAL : LDOPregister<0b010, "eor", 1, 1, "al">;
+
+defm LDSET : LDOPregister<0b011, "set", 0, 0, "">;
+defm LDSETA : LDOPregister<0b011, "set", 1, 0, "a">;
+defm LDSETL : LDOPregister<0b011, "set", 0, 1, "l">;
+defm LDSETAL : LDOPregister<0b011, "set", 1, 1, "al">;
+
+defm LDSMAX : LDOPregister<0b100, "smax", 0, 0, "">;
+defm LDSMAXA : LDOPregister<0b100, "smax", 1, 0, "a">;
+defm LDSMAXL : LDOPregister<0b100, "smax", 0, 1, "l">;
+defm LDSMAXAL : LDOPregister<0b100, "smax", 1, 1, "al">;
+
+defm LDSMIN : LDOPregister<0b101, "smin", 0, 0, "">;
+defm LDSMINA : LDOPregister<0b101, "smin", 1, 0, "a">;
+defm LDSMINL : LDOPregister<0b101, "smin", 0, 1, "l">;
+defm LDSMINAL : LDOPregister<0b101, "smin", 1, 1, "al">;
+
+defm LDUMAX : LDOPregister<0b110, "umax", 0, 0, "">;
+defm LDUMAXA : LDOPregister<0b110, "umax", 1, 0, "a">;
+defm LDUMAXL : LDOPregister<0b110, "umax", 0, 1, "l">;
+defm LDUMAXAL : LDOPregister<0b110, "umax", 1, 1, "al">;
+
+defm LDUMIN : LDOPregister<0b111, "umin", 0, 0, "">;
+defm LDUMINA : LDOPregister<0b111, "umin", 1, 0, "a">;
+defm LDUMINL : LDOPregister<0b111, "umin", 0, 1, "l">;
+defm LDUMINAL : LDOPregister<0b111, "umin", 1, 1, "al">;
+
+// v8.1 atomic ST<OP>(register) as aliases to "LD<OP>(register) when Rt=xZR"
+defm : STOPregister<"stadd","LDADD">; // STADDx
+defm : STOPregister<"stclr","LDCLR">; // STCLRx
+defm : STOPregister<"steor","LDEOR">; // STEORx
+defm : STOPregister<"stset","LDSET">; // STSETx
+defm : STOPregister<"stsmax","LDSMAX">;// STSMAXx
+defm : STOPregister<"stsmin","LDSMIN">;// STSMINx
+defm : STOPregister<"stumax","LDUMAX">;// STUMAXx
+defm : STOPregister<"stumin","LDUMIN">;// STUMINx
+
+//===----------------------------------------------------------------------===//
+// Logical instructions.
+//===----------------------------------------------------------------------===//
+
+// (immediate)
+defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">;
+defm AND : LogicalImm<0b00, "and", and, "bic">;
+defm EOR : LogicalImm<0b10, "eor", xor, "eon">;
+defm ORR : LogicalImm<0b01, "orr", or, "orn">;
+
+// FIXME: these aliases *are* canonical sometimes (when movz can't be
+// used). Actually, it seems to be working right now, but putting logical_immXX
+// here is a bit dodgy on the AsmParser side too.
+def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
+ logical_imm32:$imm), 0>;
+def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
+ logical_imm64:$imm), 0>;
+
+
+// (register)
+defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>;
+defm BICS : LogicalRegS<0b11, 1, "bics",
+ BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>;
+defm AND : LogicalReg<0b00, 0, "and", and>;
+defm BIC : LogicalReg<0b00, 1, "bic",
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+defm EON : LogicalReg<0b10, 1, "eon",
+ BinOpFrag<(not (xor node:$LHS, node:$RHS))>>;
+defm EOR : LogicalReg<0b10, 0, "eor", xor>;
+defm ORN : LogicalReg<0b01, 1, "orn",
+ BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
+defm ORR : LogicalReg<0b01, 0, "orr", or>;
+
+def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
+def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;
+
+def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
+def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;
+
+def : InstAlias<"mvn $Wd, $Wm$sh",
+ (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
+def : InstAlias<"mvn $Xd, $Xm$sh",
+ (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+ (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
+def : InstAlias<"tst $src1, $src2",
+ (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+ (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
+def : InstAlias<"tst $src1, $src2",
+ (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;
+
+def : InstAlias<"tst $src1, $src2$sh",
+ (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
+def : InstAlias<"tst $src1, $src2$sh",
+ (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;
+
+
+def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; // bitwise not is selected as ORN with WZR/XZR as the first source
+def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
+
+
+//===----------------------------------------------------------------------===//
+// One operand data processing instructions.
+//===----------------------------------------------------------------------===//
+
+defm CLS : OneOperandData<0b101, "cls">;
+defm CLZ : OneOperandData<0b100, "clz", ctlz>;
+defm RBIT : OneOperandData<0b000, "rbit">;
+
+def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>;
+def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>;
+
+def REV16Wr : OneWRegData<0b001, "rev16",
+ UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
+def REV16Xr : OneXRegData<0b001, "rev16", null_frag>;
+
+def : Pat<(cttz GPR32:$Rn),
+ (CLZWr (RBITWr GPR32:$Rn))>; // cttz(x) is selected as CLZ(RBIT(x))
+def : Pat<(cttz GPR64:$Rn),
+ (CLZXr (RBITXr GPR64:$Rn))>;
+def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
+ (i32 1))),
+ (CLSWr GPR32:$Rn)>; // ctlz((((x >>s 31) ^ x) << 1) | 1) is selected as a single CLS
+def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
+ (i64 1))),
+ (CLSXr GPR64:$Rn)>; // 64-bit form of the same idiom (arithmetic shift by 63)
+
+// Unlike the other one operand instructions, the instructions with the "rev"
+// mnemonic do *not* just differ in the size bit, but actually use different
+// opcode bits for the different sizes.
+def REVWr : OneWRegData<0b010, "rev", bswap>;
+def REVXr : OneXRegData<0b011, "rev", bswap>;
+def REV32Xr : OneXRegData<0b010, "rev32",
+ UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
+
+def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>;
+
+// The bswap commutes with the rotr so we want a pattern for both possible
+// orders.
+def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
+def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
+
+//===----------------------------------------------------------------------===//
+// Bitfield immediate extraction instruction.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+defm EXTR : ExtractImm<"extr">;
+// "ror" by immediate is EXTR with the same register used for both sources.
+def : InstAlias<"ror $dst, $src, $shift",
+ (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
+def : InstAlias<"ror $dst, $src, $shift",
+ (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;
+
+// Rotate right by a constant is selected the same way: EXTR Rn, Rn, #imm.
+def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
+ (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
+def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
+ (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Other bitfield immediate instructions.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in {
+defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">;
+defm SBFM : BitfieldImm<0b00, "sbfm">;
+defm UBFM : BitfieldImm<0b10, "ubfm">;
+}
+
+// The transforms below rewrite a constant shift amount into an i64 target
+// constant. The *_a/*_b pairs supply the two immediate operands of UBFM in
+// the shl patterns that follow these definitions.
+
+// (32 - shift_amt) mod 32
+def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// 31 - shift_amt
+def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 31 - N->getZExtValue();
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(7, 31 - shift_amt)
+def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 31 - N->getZExtValue();
+ enc = enc > 7 ? 7 : enc;
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(15, 31 - shift_amt)
+def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 31 - N->getZExtValue();
+ enc = enc > 15 ? 15 : enc;
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// (64 - shift_amt) mod 64
+def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// 63 - shift_amt
+def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(7, 63 - shift_amt)
+def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ enc = enc > 7 ? 7 : enc;
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(15, 63 - shift_amt)
+def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ enc = enc > 15 ? 15 : enc;
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(31, 63 - shift_amt)
+def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ enc = enc > 31 ? 31 : enc;
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// Left shift by a constant is UBFM with the immediates produced by the
+// i32shift_a/i32shift_b (resp. i64shift_a/i64shift_b) transforms.
+def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
+ (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+ (i64 (i32shift_b imm0_31:$imm)))>;
+def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
+ (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+ (i64 (i64shift_b imm0_63:$imm)))>;
+
+// Arithmetic shift right by a constant is SBFM Rn, #imm, #(width - 1).
+let AddedComplexity = 10 in {
+def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
+ (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
+ (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+}
+
+// Assembly aliases expressed in terms of SBFM: asr and the signed extends.
+def : InstAlias<"asr $dst, $src, $shift",
+ (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"asr $dst, $src, $shift",
+ (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+// Logical shift right by a constant is UBFM Rn, #imm, #(width - 1).
+def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
+ (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
+ (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+
+// Assembly aliases expressed in terms of UBFM: lsr and the unsigned extends.
+def : InstAlias<"lsr $dst, $src, $shift",
+ (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"lsr $dst, $src, $shift",
+ (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+//===----------------------------------------------------------------------===//
+// Conditional comparison instructions.
+//===----------------------------------------------------------------------===//
+defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>;
+defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>;
+
+//===----------------------------------------------------------------------===//
+// Conditional select instructions.
+//===----------------------------------------------------------------------===//
+defm CSEL : CondSelect<0, 0b00, "csel">;
+
+// "inc" is the operand-plus-one fragment used as the CSINC operation.
+def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
+defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
+defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
+defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
+
+// The target-specific csinv/csneg/csinc nodes map one-to-one onto the
+// corresponding instructions.
+def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+
+// csel with a constant 1 or -1 operand is selected as CSINC/CSINV against the
+// zero register. When the constant is the *true* value, the register operand
+// moves to the first slot and the condition goes through inv_cond_XFORM.
+def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
+ (CSINCWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
+ (CSINCXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR32:$tval, (i32 1), (i32 imm:$cc), NZCV),
+ (CSINCWr GPR32:$tval, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR64:$tval, (i64 1), (i32 imm:$cc), NZCV),
+ (CSINCXr GPR64:$tval, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 1), GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINCWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i64 1), GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINCXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
+ (CSINVWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
+ (CSINVXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+
+// The inverse of the condition code from the alias instruction is what is used
+// in the aliased instruction. The parser already inverts the condition code
+// for these aliases.
+def : InstAlias<"cset $dst, $cc",
+ (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"cset $dst, $cc",
+ (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"csetm $dst, $cc",
+ (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"csetm $dst, $cc",
+ (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"cinc $dst, $src, $cc",
+ (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinc $dst, $src, $cc",
+ (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cinv $dst, $src, $cc",
+ (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinv $dst, $src, $cc",
+ (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cneg $dst, $src, $cc",
+ (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cneg $dst, $src, $cc",
+ (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+//===----------------------------------------------------------------------===//
+// PC-relative instructions.
+//===----------------------------------------------------------------------===//
+// Both ADR and ADRP are marked rematerializable. ADR carries no selection
+// pattern; ADRP is selected for AArch64adrp of a global address.
+let isReMaterializable = 1 in {
+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
+def ADR : ADRI<0, "adr", adrlabel, []>;
+} // hasSideEffects = 0
+
+def ADRP : ADRI<1, "adrp", adrplabel,
+ [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
+} // isReMaterializable = 1
+
+// page address of a constant pool entry, block address
+def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
+def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (register) instructions.
+//===----------------------------------------------------------------------===//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+def RET : BranchReg<0b0010, "ret", []>;
+def DRPS : SpecialReturn<0b0101, "drps">;
+def ERET : SpecialReturn<0b0100, "eret">;
+} // isReturn = 1, isTerminator = 1, isBarrier = 1
+
+// Default to the LR register.
+def : InstAlias<"ret", (RET LR)>;
+
+// Indirect call through a register.
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
+} // isCall
+
+// Indirect branch through a register.
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
+} // isBranch, isTerminator, isBarrier, isIndirectBranch
+
+// Create a separate pseudo-instruction for codegen to use so that we don't
+// flag lr as used in every function. It'll be restored before the RET by the
+// epilogue if it's legitimately used.
+def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>,
+ Sched<[WriteBrReg]> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+
+// This is a directive-like pseudo-instruction. The purpose is to insert an
+// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
+// (which in the usual case is a BLR).
+let hasSideEffects = 1 in
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> {
+ let AsmString = ".tlsdesccall $sym";
+}
+
+// Pseudo selected for AArch64tlsdesc_callseq on a TLS global address (and, via
+// the extra pattern below, on an external symbol). It is modelled as a call
+// that defines LR, X0 and X1.
+// FIXME: maybe the scratch register used shouldn't be fixed to X1?
+// FIXME: can "hasSideEffects" be dropped?
+let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
+ isCodeGenOnly = 1 in
+def TLSDESC_CALLSEQ
+ : Pseudo<(outs), (ins i64imm:$sym),
+ [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
+ Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
+def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
+ (TLSDESC_CALLSEQ texternalsym:$sym)>;
+
+//===----------------------------------------------------------------------===//
+// Conditional branch (immediate) instruction.
+//===----------------------------------------------------------------------===//
+def Bcc : BranchCond;
+
+//===----------------------------------------------------------------------===//
+// Compare-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm CBZ : CmpBranch<0, "cbz", AArch64cbz>;
+defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;
+
+//===----------------------------------------------------------------------===//
+// Test-bit-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm TBZ : TestBranch<0, "tbz", AArch64tbz>;
+defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (immediate) instructions.
+//===----------------------------------------------------------------------===//
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def B : BranchImm<0, "b", [(br bb:$addr)]>;
+} // isBranch, isTerminator, isBarrier
+
+// Direct call. The instruction's own pattern covers global addresses; the
+// extra pattern below adds calls to external symbols.
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>;
+} // isCall
+def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;
+
+//===----------------------------------------------------------------------===//
+// Exception generation instructions.
+//===----------------------------------------------------------------------===//
+// Each definition passes two opcode bit-fields and the mnemonic to
+// ExceptionGeneration.
+def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
+def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
+def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
+def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
+def HLT : ExceptionGeneration<0b010, 0b00, "hlt">;
+def HVC : ExceptionGeneration<0b000, 0b10, "hvc">;
+def SMC : ExceptionGeneration<0b000, 0b11, "smc">;
+def SVC : ExceptionGeneration<0b000, 0b01, "svc">;
+
+// DCPSn defaults to an immediate operand of zero if unspecified.
+def : InstAlias<"dcps1", (DCPS1 0)>;
+def : InstAlias<"dcps2", (DCPS2 0)>;
+def : InstAlias<"dcps3", (DCPS3 0)>;
+
+//===----------------------------------------------------------------------===//
+// Load instructions.
+//===----------------------------------------------------------------------===//
+
+// In each pair-load family below the offset operand class follows the element
+// size: simm7s4 for 32-bit, simm7s8 for 64-bit and simm7s16 for 128-bit
+// registers. The "ldpsw" forms target GPR64 but keep the simm7s4 offset.
+
+// Pair (indexed, offset)
+defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">;
+defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">;
+defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">;
+defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">;
+defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">;
+
+defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (pre-indexed)
+def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (post-indexed)
+def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+
+// Pair (no allocate)
+defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">;
+defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">;
+defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">;
+defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">;
+defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">;
+
+//---
+// (register offset)
+//---
+
+// Integer
+// The narrow integer forms (ldrb/ldrh) are selected for zero-extending loads
+// into a 32-bit register.
+defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>;
+defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>;
+defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
+defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
+
+// Floating-point
+defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>;
+defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>;
+defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>;
+defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>;
+defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>;
+
+// Load sign-extended half-word
+defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
+defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>;
+
+// Load sign-extended byte
+defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>;
+defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>;
+
+// Load sign-extended word
+defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
+
+// Pre-fetch.
+defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+//
+// ScalToVecROLoadPat emits two patterns that fold a register-offset scalar
+// load feeding scalar_to_vector into a load placed in subregister `sub` of an
+// otherwise undefined (IMPLICIT_DEF) VecTy value:
+//   ro          - addressing-mode record supplying Wpat/Wext and Xpat/Xext.
+//   loadop      - the load operator to match.
+//   ScalTy      - type of the loaded scalar.
+//   VecTy       - resulting vector type.
+//   LOADW/LOADX - instructions used for the W- and X-register offset forms.
+//   sub         - subregister index the loaded value is inserted at.
+multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
+ ValueType ScalTy, ValueType VecTy,
+ Instruction LOADW, Instruction LOADX,
+ SubRegIndex sub> {
+ def : Pat<(VecTy (scalar_to_vector (ScalTy
+ (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
+ (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+ (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
+ sub)>;
+
+ def : Pat<(VecTy (scalar_to_vector (ScalTy
+ (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
+ (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+ (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
+ sub)>;
+}
+
+// Instantiate the scalar_to_vector load patterns for each element size; the
+// subregister index (bsub/hsub/ssub/dsub) follows the width of the load.
+let AddedComplexity = 10 in {
+defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>;
+defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;
+
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
+
+defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;
+
+defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;
+
+defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>;
+
+defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;
+
+defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;
+
+
+// v1i64 uses the 64-bit load result directly, with no INSERT_SUBREG.
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend))))),
+ (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend))))),
+ (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+}
+
+// Match a whole-register vector load of type VecTy through a register-offset
+// addressing mode. Instantiated below for the 64-bit (FPR64-compatible) and
+// 128-bit (FPR128-compatible) vector types.
+//   ro          - addressing-mode record supplying Wpat/Wext and Xpat/Xext.
+//   VecTy       - vector type being loaded.
+//   LOADW/LOADX - instructions used for the W- and X-register offset forms.
+multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
+ Instruction LOADW, Instruction LOADX> {
+
+ def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+ (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+ (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+// Multi-element vector types are restricted to little-endian; the
+// single-element v1i64/v1f64 types are matched unconditionally.
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+ // We must do vector loads with LD1 in big-endian.
+ defm : VecROLoadPat<ro64, v2i32, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
+}
+
+defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
+defm : VecROLoadPat<ro64, v1f64, LDRDroW, LDRDroX>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+ // We must do vector loads with LD1 in big-endian.
+ defm : VecROLoadPat<ro128, v2i64, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v2f64, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v4i32, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
+}
+} // AddedComplexity = 10
+
+// zextload -> i64
+// Select an extending register-offset load producing i64 as the given load
+// instruction wrapped in SUBREG_TO_REG at sub_32.
+//   ro          - addressing-mode record supplying Wpat/Wext and Xpat/Xext.
+//   loadop      - the extending-load operator to match.
+//   INSTW/INSTX - instructions used for the W- and X-register offset forms.
+multiclass ExtLoadTo64ROPat<ROAddrMode ro, SDPatternOperator loadop,
+ Instruction INSTW, Instruction INSTX> {
+ def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+ (SUBREG_TO_REG (i64 0),
+ (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+ sub_32)>;
+
+ def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+ (SUBREG_TO_REG (i64 0),
+ (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+ sub_32)>;
+}
+
+// i64 results: zero-extending and any-extending loads of 8/16/32 bits (and of
+// i1, handled as a byte load) all use the zero-extending load instructions.
+let AddedComplexity = 10 in {
+ defm : ExtLoadTo64ROPat<ro8, zextloadi8, LDRBBroW, LDRBBroX>;
+ defm : ExtLoadTo64ROPat<ro16, zextloadi16, LDRHHroW, LDRHHroX>;
+ defm : ExtLoadTo64ROPat<ro32, zextloadi32, LDRWroW, LDRWroX>;
+
+ // zextloadi1 -> zextloadi8
+ defm : ExtLoadTo64ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
+
+ // extload -> zextload
+ defm : ExtLoadTo64ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
+ defm : ExtLoadTo64ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
+ defm : ExtLoadTo64ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;
+
+ // extloadi1 -> zextloadi8
+ defm : ExtLoadTo64ROPat<ro8, extloadi1, LDRBBroW, LDRBBroX>;
+}
+
+
+// extload -> i32
+// Select an extending register-offset load producing i32 directly as the
+// given load instruction (no SUBREG_TO_REG is needed for a 32-bit result).
+//   ro          - addressing-mode record supplying Wpat/Wext and Xpat/Xext.
+//   loadop      - the extending-load operator to match.
+//   INSTW/INSTX - instructions used for the W- and X-register offset forms.
+multiclass ExtLoadTo32ROPat<ROAddrMode ro, SDPatternOperator loadop,
+ Instruction INSTW, Instruction INSTX> {
+ def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+ (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+ (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+
+}
+
+// i32 results: any-extending loads and zextloadi1 use the zero-extending
+// load instructions.
+let AddedComplexity = 10 in {
+ // extload -> zextload
+ defm : ExtLoadTo32ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
+ defm : ExtLoadTo32ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
+ defm : ExtLoadTo32ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;
+
+ // zextloadi1 -> zextloadi8
+ defm : ExtLoadTo32ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
+}
+
+//---
+// (unsigned immediate)
+//---
+// Each definition pairs the am_indexedN addressing mode with the uimm12sK
+// offset operand of the same access size (N bits, K bytes).
+defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr",
+ [(set GPR64:$Rt,
+ (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr",
+ [(set GPR32:$Rt,
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr",
+ [(set FPR8:$Rt,
+ (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
+defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr",
+ [(set (f16 FPR16:$Rt),
+ (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
+defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr",
+ [(set (f32 FPR32:$Rt),
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr",
+ [(set (f64 FPR64:$Rt),
+ (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr",
+ [(set (f128 FPR128:$Rt),
+ (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+//
+// Each pattern folds a scalar load feeding scalar_to_vector into an FP/SIMD
+// register load inserted at the matching subregister (bsub/hsub/ssub/dsub) of
+// an IMPLICIT_DEF vector. The v1i64 case uses the LDRDui result directly.
+def : Pat <(v8i8 (scalar_to_vector (i32
+ (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+ (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v16i8 (scalar_to_vector (i32
+ (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v4i16 (scalar_to_vector (i32
+ (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v8i16 (scalar_to_vector (i32
+ (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v2i32 (scalar_to_vector (i32
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v4i32 (scalar_to_vector (i32
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v2i64 (scalar_to_vector (i64
+ (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+// Multi-element vector types are restricted to little-endian; v1f64 and v1i64
+// are matched unconditionally.
+let Predicates = [IsLE] in {
+ // We must use LD1 to perform vector loads in big-endian.
+ def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+// The vector types are restricted to little-endian; f128 is matched
+// unconditionally.
+let Predicates = [IsLE] in {
+ // We must use LD1 to perform vector loads in big-endian.
+ def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+
+// Zero-extending half-word and byte loads into a 32-bit register.
+defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh",
+ [(set GPR32:$Rt,
+ (zextloadi16 (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset)))]>;
+defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb",
+ [(set GPR32:$Rt,
+ (zextloadi8 (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset)))]>;
+// zextload -> i64
+// The i64 forms wrap the 32-bit load in SUBREG_TO_REG at sub_32.
+def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+
+// zextloadi1 -> zextloadi8
+def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// extload -> zextload
+def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+ (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// load sign-extended half-word
+// The W and X variants match the same sextloadi16 into GPR32 and GPR64.
+defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh",
+ [(set GPR32:$Rt,
+ (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset)))]>;
+defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh",
+ [(set GPR64:$Rt,
+ (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset)))]>;
+
+// load sign-extended byte
+defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb",
+ [(set GPR32:$Rt,
+ (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset)))]>;
+defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb",
+ [(set GPR64:$Rt,
+ (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset)))]>;
+
+// load sign-extended word
+defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
+ [(set GPR64:$Rt,
+ (sextloadi32 (am_indexed32 GPR64sp:$Rn,
+ uimm12s4:$offset)))]>;
+
+// load zero-extended word
+// Selected as the 32-bit LDRWui wrapped in SUBREG_TO_REG at sub_32.
+def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+
+// Pre-fetch.
+def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
+ [(AArch64Prefetch imm:$Rt,
+ (am_indexed64 GPR64sp:$Rn,
+ uimm12s8:$offset))]>;
+
+// "prfm" with no offset written is PRFMui with an offset of 0.
+def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
+
+//---
+// (literal)
+// The literal loads are defined here without selection patterns.
+def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
+def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
+def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
+def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
+def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
+
+// load sign-extended word
+def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
+
+// prefetch
+// The pattern list is empty; a candidate pattern is left commented out below.
+def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
+// [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>;
+
+//---
+// (unscaled immediate)
+defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur",
+ [(set GPR64:$Rt,
+ (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur",
+ [(set GPR32:$Rt,
+ (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur",
+ [(set FPR8:$Rt,
+ (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur",
+ [(set FPR16:$Rt,
+ (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur",
+ [(set (f32 FPR32:$Rt),
+ (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur",
+ [(set (f64 FPR64:$Rt),
+ (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur",
+ [(set (f128 FPR128:$Rt),
+ (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// Zero-extending unscaled loads of a half-word ("ldurh") and a byte ("ldurb")
+// into a 32-bit GPR. Each pattern uses the am_unscaledN selector that matches
+// the width of the memory access, consistent with the other byte- and
+// half-word-sized unscaled patterns in this file.
+defm LDURHH
+ : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
+ [(set GPR32:$Rt,
+ (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURBB
+ : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb",
+ [(set GPR32:$Rt,
+ (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+// The multi-element vector types are only matched when the IsLE predicate
+// holds; the single-element v1f64/v1i64 patterns sit outside that predicate
+// and therefore apply regardless of it.
+let Predicates = [IsLE] in {
+ def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+// All of these vector patterns are guarded by the IsLE predicate and select
+// LDURQi.
+let Predicates = [IsLE] in {
+ def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// anyext -> zext
+// Any-extending loads are selected to the same zero-extending LDUR*
+// instructions as the explicit zext loads below. i1 loads use the byte
+// instruction (LDURBBi). i64 results wrap the 32-bit instruction result in
+// SUBREG_TO_REG (sub_32).
+def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+// unscaled zext
+def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+
+//---
+// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
+
+// Define new assembler match classes as we want to only match these when
+// they don't otherwise match the scaled addressing mode for LDR/STR. Don't
+// associate a DiagnosticType either, as we want the diagnostic for the
+// canonical form (the scaled operand) to take precedence.
+// Width is spliced into both the class name ("SImm9OffsetFB<Width>") and the
+// predicate method ("isSImm9OffsetFB<Width>"); operands are rendered with
+// addImmOperands.
+class SImm9OffsetOperand<int Width> : AsmOperandClass {
+ let Name = "SImm9OffsetFB" # Width;
+ let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
+ let RenderMethod = "addImmOperands";
+}
+
+// One match class per access width (8, 16, 32, 64, 128).
+def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
+def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
+def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
+def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
+def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;
+
+// i64 operands bound to the match classes above; these are the offset operand
+// types used by the LDR/STR -> LDUR/STUR fallback aliases.
+def simm9_offset_fb8 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB8Operand;
+}
+def simm9_offset_fb16 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB16Operand;
+}
+def simm9_offset_fb32 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB32Operand;
+}
+def simm9_offset_fb64 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB64Operand;
+}
+def simm9_offset_fb128 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB128Operand;
+}
+
+// "ldr" spelled with a fallback offset operand maps to the LDUR* instruction
+// for the destination register class.
+// NOTE(review): the trailing 0 is presumably InstAlias's emit flag, making
+// these parse-only aliases — confirm against InstAlias in Target.td.
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+// zextload -> i64
+// NOTE(review): these two patterns are textually identical to the i64
+// zextloadi8 / zextloadi16 patterns in the "unscaled zext" group earlier in
+// this file, so they look redundant — confirm before removing.
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+// load sign-extended half-word
+// The W variant targets GPR32 and the X variant GPR64; both match sextloadi16.
+defm LDURSHW
+ : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
+ [(set GPR32:$Rt,
+ (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSHX
+ : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
+ [(set GPR64:$Rt,
+ (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended byte
+// The W variant targets GPR32 and the X variant GPR64; both match sextloadi8.
+defm LDURSBW
+ : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
+ [(set GPR32:$Rt,
+ (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSBX
+ : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
+ [(set GPR64:$Rt,
+ (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended word
+// Only a GPR64 destination is defined for the sign-extending word load.
+defm LDURSW
+ : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
+ [(set GPR64:$Rt,
+ (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
+// The offset operand width follows the memory access size (fb8 for byte,
+// fb16 for half-word, fb32 for word), not the destination register width.
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]",
+ (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrh $Rt, [$Rn, $offset]",
+ (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+ (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+ (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+ (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+ (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
+ (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+
+// Pre-fetch.
+// Unscaled-offset prefetch ("prfum"): matches AArch64Prefetch with an
+// am_unscaled64 address and a simm9 offset.
+defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
+ [(AArch64Prefetch imm:$Rt,
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+
+//---
+// (unscaled immediate, unprivileged)
+// None of the LDTR* definitions below passes a selection pattern.
+defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
+defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;
+
+// half-word and byte loads into a 32-bit GPR
+defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
+defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;
+
+// load sign-extended half-word
+defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
+defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;
+
+// load sign-extended byte
+defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
+defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;
+
+// load sign-extended word
+defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
+
+//---
+// (immediate pre-indexed)
+// Pre-indexed loads for each register class; none of these definitions passes
+// a selection pattern.
+def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">;
+def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte and half-word
+def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+//---
+// (immediate post-indexed)
+// Post-indexed loads for each register class; the parameters mirror the
+// pre-indexed definitions, using LoadPostIdx instead of LoadPreIdx.
+def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">;
+def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte and half-word
+def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+//===----------------------------------------------------------------------===//
+// Store instructions.
+//===----------------------------------------------------------------------===//
+
+// Pair (indexed, offset)
+// FIXME: Use dedicated range-checked addressing mode operand here.
+// The offset operand type follows the register class: simm7s4 for the GPR32
+// and FPR32 forms, simm7s8 for GPR64 and FPR64, simm7s16 for FPR128. The same
+// pairing is used by every store-pair group below.
+defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">;
+defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">;
+defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">;
+defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">;
+defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (pre-indexed)
+def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (post-indexed)
+def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (no allocate)
+defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">;
+defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">;
+defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">;
+defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">;
+defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">;
+
+//---
+// (Register offset)
+
+// Integer
+// Byte and half-word stores from GPR32 use the truncating store operators;
+// word and double-word stores use plain store.
+defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
+defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
+defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>;
+defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;
+
+
+// Floating-point
+// The FPR8 form is given the value type "untyped"; the others use the
+// floating-point type of their register class.
+defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>;
+defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>;
+defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>;
+defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>;
+defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>;
+
+// Patterns for truncating stores of a 64-bit GPR value through a
+// register-offset address.
+//   ro      - addressing-mode bundle supplying Wpat/Wext and Xpat/Xext.
+//   storeop - the truncating store operator to match.
+//   STRW    - instruction used when the offset register is a GPR32 ($Rm).
+//   STRX    - instruction used when the offset register is a GPR64 ($Rm).
+// In both patterns the stored value is the sub_32 sub-register of $Rt, so the
+// STRW/STRX parameter names refer to the offset-register width, not the width
+// of the stored data.
+multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
+ Instruction STRW, Instruction STRX> {
+
+ def : Pat<(storeop GPR64:$Rt,
+ (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+ (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+ GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(storeop GPR64:$Rt,
+ (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+ (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+ GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+ // truncstore i64
+ // Instantiated for 8-, 16- and 32-bit truncating stores.
+ defm : TruncStoreFrom64ROPat<ro8, truncstorei8, STRBBroW, STRBBroX>;
+ defm : TruncStoreFrom64ROPat<ro16, truncstorei16, STRHHroW, STRHHroX>;
+ defm : TruncStoreFrom64ROPat<ro32, truncstorei32, STRWroW, STRWroX>;
+}
+
+// Patterns for storing a whole vector register through a register-offset
+// address.
+//   ro    - addressing-mode bundle supplying Wpat/Wext and Xpat/Xext.
+//   VecTy - vector value type being stored.
+//   FPR   - register class holding the value.
+//   STRW  - instruction used when the offset register is a GPR32.
+//   STRX  - instruction used when the offset register is a GPR64.
+multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR,
+ Instruction STRW, Instruction STRX> {
+ def : Pat<(store (VecTy FPR:$Rt),
+ (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+ (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(store (VecTy FPR:$Rt),
+ (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+ (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ defm : VecROStorePat<ro64, v2i32, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
+}
+
+// Single-element 64-bit vectors are not guarded by IsLE.
+defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
+defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
+}
+} // AddedComplexity = 10
+
+// Match stores from lane 0 to the appropriate subreg's store.
+//   ro        - addressing-mode bundle supplying Wpat/Wext and Xpat/Xext.
+//   storeop   - store operator to match (plain or truncating).
+//   VecTy     - type of the 128-bit source vector (VecListOne128).
+//   STy       - scalar type produced by the lane-0 vector_extract.
+//   SubRegIdx - sub-register of the vector that is actually stored.
+//   STRW/STRX - instructions for a GPR32 / GPR64 offset register.
+multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
+ ValueType VecTy, ValueType STy,
+ SubRegIndex SubRegIdx,
+ Instruction STRW, Instruction STRX> {
+
+ def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+ (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+ (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+ GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+ (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+ (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+ GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+// These use a higher AddedComplexity (19) than the 10 used by the other
+// register-offset store patterns in this section.
+let AddedComplexity = 19 in {
+ defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
+ defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>;
+ defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>;
+ defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>;
+}
+
+//---
+// (unsigned immediate)
+// Scaled unsigned-offset stores. Each pattern pairs the am_indexedN selector
+// with the uimm12sK operand for the same access size.
+defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
+ [(store GPR64:$Rt,
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str",
+ [(store GPR32:$Rt,
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
+ [(store FPR8:$Rt,
+ (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
+defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str",
+ [(store (f16 FPR16:$Rt),
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
+defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str",
+ [(store (f32 FPR32:$Rt),
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
+ [(store (f64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+// STRQ is defined with an empty pattern list; its selection patterns
+// (including f128) are supplied by separate Pat records.
+defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;
+
+// Truncating half-word and byte stores from a 32-bit GPR.
+defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh",
+ [(truncstorei16 GPR32:$Rt,
+ (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset))]>;
+defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb",
+ [(truncstorei8 GPR32:$Rt,
+ (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+// Everything up to the closing "AddedComplexity = 10" brace shares that
+// complexity bump: the FPR64 and FPR128 vector stores, the f128 store and the
+// i64 truncating stores.
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ def : Pat<(store (v2f32 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v8i8 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v4i16 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v2i32 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v4f16 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+// Single-element 64-bit vectors are not guarded by IsLE.
+def : Pat<(store (v1f64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ def : Pat<(store (v4f32 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v2f64 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v16i8 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v8i16 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v4i32 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v2i64 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v8f16 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+// The scalar f128 store is not guarded by IsLE.
+def : Pat<(store (f128 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+
+// truncstore i64
+// The value stored is the sub_32 sub-register of the 64-bit source.
+def : Pat<(truncstorei32 GPR64:$Rt,
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
+ (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt,
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+ (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+ (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>;
+
+} // AddedComplexity = 10
+
+//---
+// (unscaled immediate)
+// "stur" for each register class, plus the truncating "sturh"/"sturb" forms.
+// Each pattern uses the am_unscaledN selector for the access width together
+// with a simm9 offset.
+defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur",
+ [(store GPR64:$Rt,
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur",
+ [(store GPR32:$Rt,
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur",
+ [(store FPR8:$Rt,
+ (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur",
+ [(store (f16 FPR16:$Rt),
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur",
+ [(store (f32 FPR32:$Rt),
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur",
+ [(store (f64 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur",
+ [(store (f128 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh",
+ [(truncstorei16 GPR32:$Rt,
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb",
+ [(truncstorei8 GPR32:$Rt,
+ (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+// Multi-element vector types are guarded by IsLE; the single-element
+// v1f64/v1i64 patterns after the block are not.
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ def : Pat<(store (v2f32 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v8i8 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v4i16 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v2i32 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v4f16 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+// One pattern per 128-bit vector type (v4f32, v2f64, v16i8, v8i16, v4i32,
+// v2i64, v8f16), all guarded by IsLE and all selecting STURQi.
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ def : Pat<(store (v4f32 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v2f64 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v16i8 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v8i16 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v4i32 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v2i64 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v8f16 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// unscaled i64 truncating stores
+// The value stored is the sub_32 sub-register of the 64-bit source, passed to
+// the 32-bit STUR* instruction for the access width.
+def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+ (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+ (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+ (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+
+//---
+// STR mnemonics fall back to STUR for negative or unaligned offsets.
+// These reuse the simm9_offset_fb* operands defined for the LDR -> LDUR
+// fallback aliases.
+// NOTE(review): the trailing 0 is presumably InstAlias's emit flag, making
+// these parse-only aliases — confirm against InstAlias in Target.td.
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+// Byte and half-word store mnemonics fall back the same way.
+def : InstAlias<"strb $Rt, [$Rn, $offset]",
+ (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"strh $Rt, [$Rn, $offset]",
+ (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+
+//---
+// (unscaled immediate, unprivileged)
+// None of the STTR* definitions below passes a selection pattern.
+defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
+defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;
+
+// half-word and byte stores from a 32-bit GPR
+defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
+defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
+
+//---
+// (immediate pre-indexed)
+// Each definition passes the store operator (pre_store or a pre_truncst*
+// variant) and the value type it handles.
+def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>;
+def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>;
+def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>;
+def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>;
+def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>;
+def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>;
+
+def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>;
+def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>;
+
+// truncstore i64
+// The value stored is the sub_32 sub-register of the 64-bit source.
+def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+
+// 64-bit vector types held in FPR64 select STRDpre. These patterns carry no
+// IsLE predicate, unlike the scaled and unscaled vector store patterns.
+def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+// 128-bit vector types held in FPR128 select STRQpre.
+def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+//---
+// (immediate post-indexed)
+// Each definition passes the store operator (post_store or a post_truncst*
+// variant) and the value type it handles.
+def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>;
+def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>;
+def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>;
+def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>;
+def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>;
+def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>;
+
+def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>;
+def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>;
+
+// truncstore i64
+// The value stored is the sub_32 sub-register of the 64-bit source.
+def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+
+// 64-bit vector types held in FPR64 select STRDpost. These patterns carry no
+// IsLE predicate, unlike the scaled and unscaled vector store patterns.
+def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+// 128-bit vector types held in FPR128 select STRQpost.
+def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+//===----------------------------------------------------------------------===//
+// Load/store exclusive instructions.
+//===----------------------------------------------------------------------===//
+
+// Naming used throughout this table: the W/X suffixes take a GPR32/GPR64
+// register; the B/H suffixes take a GPR32 register and use the byte/halfword
+// mnemonic. The first (two-bit) argument and the four flag arguments that
+// follow are consumed by the instruction format classes, which are defined
+// outside this section.
+
+// Load-acquire ("ldar*").
+def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">;
+def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">;
+def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
+def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">;
+
+// Load-acquire exclusive ("ldaxr*").
+def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
+def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
+def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
+def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;
+
+// Load exclusive ("ldxr*"); differs from the ldaxr* rows only in the last flag.
+def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
+def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
+def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
+def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;
+
+// Store-release ("stlr*").
+def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">;
+def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">;
+def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
+def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">;
+
+// Store-release exclusive ("stlxr*").
+def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
+def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
+def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
+def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;
+
+// Store exclusive ("stxr*"); differs from the stlxr* rows only in the last flag.
+def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
+def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
+def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
+def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;
+
+// Pair forms ("ldaxp", "ldxp", "stlxp", "stxp"); only W and X variants exist.
+def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
+def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;
+
+def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
+def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;
+
+def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
+def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
+
+def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
+def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
+
+// Everything inside this block is guarded by the HasV8_1a predicate. The
+// definitions reuse the LoadAcquire / StoreRelease classes with the same
+// arguments as LDAR* / STLR* except that the last flag is 0 instead of 1.
+let Predicates = [HasV8_1a] in {
+ // v8.1a "Limited Order Region" extension load-acquire instructions
+ def LDLARW : LoadAcquire <0b10, 1, 1, 0, 0, GPR32, "ldlar">;
+ def LDLARX : LoadAcquire <0b11, 1, 1, 0, 0, GPR64, "ldlar">;
+ def LDLARB : LoadAcquire <0b00, 1, 1, 0, 0, GPR32, "ldlarb">;
+ def LDLARH : LoadAcquire <0b01, 1, 1, 0, 0, GPR32, "ldlarh">;
+
+ // v8.1a "Limited Order Region" extension store-release instructions
+ def STLLRW : StoreRelease <0b10, 1, 0, 0, 0, GPR32, "stllr">;
+ def STLLRX : StoreRelease <0b11, 1, 0, 0, 0, GPR64, "stllr">;
+ def STLLRB : StoreRelease <0b00, 1, 0, 0, 0, GPR32, "stllrb">;
+ def STLLRH : StoreRelease <0b01, 1, 0, 0, 0, GPR32, "stllrh">;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating point to integer conversion instructions (unscaled and scaled).
+//===----------------------------------------------------------------------===//
+
+// Scalar FP-to-integer conversions. The FCVT{A,M,N,P}{S,U} variants are
+// instantiated only through FPToIntegerUnscaled and are tied to the matching
+// int_aarch64_neon_fcvt* intrinsic. FCVTZS / FCVTZU are tied to fp_to_sint /
+// fp_to_uint and are instantiated twice under the same name: once through
+// FPToIntegerUnscaled and once through FPToIntegerScaled.
+defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>;
+defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>;
+defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>;
+defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>;
+defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>;
+defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
+defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
+defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
+defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+
+// Selection patterns for an intrinsic-based FP-to-integer conversion.
+//   cvt_op - the intrinsic being matched.
+//   Prefix - instruction name prefix; the suffix chosen per pattern
+//            (U/S, W/X, H/S/D, r/ri) completes the instruction name.
+// The first group matches the bare conversion; the second group matches the
+// conversion of an fmul by a fixed-point scale operand and forwards the scale
+// to the scaled ("ri") instruction.
+multiclass FPToIntegerIntPats<Intrinsic cvt_op, string Prefix> {
+  // Unscaled: f16 / f32 / f64 source, i32 / i64 result.
+  def : Pat<(i32 (cvt_op f16:$Rn)),
+            (!cast<Instruction>(Prefix # UWHr) $Rn)>;
+  def : Pat<(i64 (cvt_op f16:$Rn)),
+            (!cast<Instruction>(Prefix # UXHr) $Rn)>;
+  def : Pat<(i32 (cvt_op f32:$Rn)),
+            (!cast<Instruction>(Prefix # UWSr) $Rn)>;
+  def : Pat<(i64 (cvt_op f32:$Rn)),
+            (!cast<Instruction>(Prefix # UXSr) $Rn)>;
+  def : Pat<(i32 (cvt_op f64:$Rn)),
+            (!cast<Instruction>(Prefix # UWDr) $Rn)>;
+  def : Pat<(i64 (cvt_op f64:$Rn)),
+            (!cast<Instruction>(Prefix # UXDr) $Rn)>;
+
+  // Scaled: the multiply by the fixed-point scale is folded into the
+  // instruction's immediate operand.
+  def : Pat<(i32 (cvt_op (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
+            (!cast<Instruction>(Prefix # SWHri) $Rn, $scale)>;
+  def : Pat<(i64 (cvt_op (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
+            (!cast<Instruction>(Prefix # SXHri) $Rn, $scale)>;
+  def : Pat<(i32 (cvt_op (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
+            (!cast<Instruction>(Prefix # SWSri) $Rn, $scale)>;
+  def : Pat<(i64 (cvt_op (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
+            (!cast<Instruction>(Prefix # SXSri) $Rn, $scale)>;
+  def : Pat<(i32 (cvt_op (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
+            (!cast<Instruction>(Prefix # SWDri) $Rn, $scale)>;
+  def : Pat<(i64 (cvt_op (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
+            (!cast<Instruction>(Prefix # SXDri) $Rn, $scale)>;
+}
+
+// Route the fcvtzs / fcvtzu intrinsics to the FCVTZS / FCVTZU instructions.
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;
+
+// Folds "round, then convert to integer" into one conversion instruction.
+//   conv_op  - the FP-to-integer node (fp_to_sint or fp_to_uint below).
+//   round_op - the rounding node applied to the FP source first.
+//   Prefix   - instruction name prefix; U{W,X}{S,D}r is appended.
+// Only f32 and f64 sources are covered.
+multiclass FPToIntegerPats<SDNode conv_op, SDNode round_op, string Prefix> {
+  // f32 source.
+  def : Pat<(i32 (conv_op (round_op f32:$Rn))),
+            (!cast<Instruction>(Prefix # UWSr) f32:$Rn)>;
+  def : Pat<(i64 (conv_op (round_op f32:$Rn))),
+            (!cast<Instruction>(Prefix # UXSr) f32:$Rn)>;
+  // f64 source.
+  def : Pat<(i32 (conv_op (round_op f64:$Rn))),
+            (!cast<Instruction>(Prefix # UWDr) f64:$Rn)>;
+  def : Pat<(i64 (conv_op (round_op f64:$Rn))),
+            (!cast<Instruction>(Prefix # UXDr) f64:$Rn)>;
+}
+
+// fceil -> FCVTP*, ffloor -> FCVTM*, ftrunc -> FCVTZ*, fround -> FCVTA*;
+// the S/U instruction is picked by fp_to_sint / fp_to_uint.
+defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">;
+defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">;
+defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
+defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
+defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
+defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
+defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
+defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;
+
+//===----------------------------------------------------------------------===//
+// Scaled integer to floating point conversion instructions.
+//===----------------------------------------------------------------------===//
+
+// Integer to FP conversions: SCVTF ("scvtf") is tied to sint_to_fp and UCVTF
+// ("ucvtf") to uint_to_fp. The first argument (0 / 1) distinguishes the two
+// for the IntegerToFP multiclass, which is defined outside this section.
+defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
+defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
+
+//===----------------------------------------------------------------------===//
+// Unscaled integer to floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+// "fmov" variants produced by the UnscaledConversion multiclass (defined
+// outside this section).
+defm FMOV : UnscaledConversion<"fmov">;
+
+// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable
+// Both pseudos take no inputs and set an FPR32 / FPR64 destination to fpimm0.
+// They are additionally marked isCodeGenOnly and isAsCheapAsAMove, and are
+// scheduled as WriteF.
+let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
+def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
+ Sched<[WriteF]>;
+def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
+ Sched<[WriteF]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+// "fcvt" variants; the individual instructions are produced by the
+// FPConversion multiclass, which is defined outside this section.
+defm FCVT : FPConversion<"fcvt">;
+
+//===----------------------------------------------------------------------===//
+// Floating point single operand instructions.
+//===----------------------------------------------------------------------===//
+
+// Each defm supplies a four-bit opcode argument, the assembly mnemonic and,
+// where given, the SelectionDAG node or intrinsic the instruction is tied to.
+// FMOV passes no node.
+defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
+defm FMOV : SingleOperandFPData<0b0000, "fmov">;
+defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
+defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>;
+defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
+defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
+defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
+
+// The frintn intrinsic on a v1f64 value held in an FPR64 uses the scalar
+// double-precision instruction.
+def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
+ (FRINTNDr FPR64:$Rn)>;
+
+defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
+defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
+
+// FSQRT overrides the scheduling class with WriteFDiv.
+let SchedRW = [WriteFDiv] in {
+defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating point two operand instructions.
+//===----------------------------------------------------------------------===//
+
+// Each defm supplies a four-bit opcode argument, the mnemonic and the
+// SelectionDAG node it is tied to. FDIV is scheduled as WriteFDiv; FMUL and
+// FNMUL are scheduled as WriteFMul.
+defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
+let SchedRW = [WriteFDiv] in {
+defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
+}
+// fmaxnm/fminnm are tied to fmaxnum/fminnum; fmax/fmin to fmaxnan/fminnan.
+defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
+defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>;
+defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
+defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>;
+let SchedRW = [WriteFMul] in {
+defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
+// FNMUL is given the plain fmul node; any negation is supplied by the
+// TwoOperandFPDataNeg multiclass (defined outside this section).
+defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
+}
+defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;
+
+// min/max on v1f64 values held in an FPR64 reuse the scalar
+// double-precision instructions.
+def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point three operand instructions.
+//===----------------------------------------------------------------------===//
+
+// Fused multiply-add family. The two leading flag arguments select the
+// variant; the final argument is the DAG shape each one matches:
+//   FMADD  : fma(LHS, MHS, RHS)
+//   FMSUB  : fma(LHS, -MHS, RHS)
+//   FNMADD : -fma(LHS, MHS, RHS)
+//   FNMSUB : fma(LHS, MHS, -RHS)
+defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>;
+defm FMSUB : ThreeOperandFPData<0, 1, "fmsub",
+ TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
+defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
+ TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
+defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
+ TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+
+// N.b. FMSUB etc have the accumulator at the *end* of (ins), unlike
+// the NEON variant.
+// fma with only the first multiplicand negated -> FMSUB (f32, then f64).
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
+ (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
+ (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
+// "(-a) + b*(-c)".
+// In both forms one multiplicand and the addend are negated; the first pair
+// of patterns negates $Rn, the second pair negates $Rm.
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
+ (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
+ (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
+ (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
+ (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point comparison instructions.
+//===----------------------------------------------------------------------===//
+
+// FCMPE ("fcmpe") passes no SelectionDAG node; FCMP ("fcmp") is tied to
+// AArch64fcmp. The first argument (1 / 0) distinguishes the two for the
+// FPComparison multiclass, which is defined outside this section.
+defm FCMPE : FPComparison<1, "fcmpe">;
+defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional comparison instructions.
+//===----------------------------------------------------------------------===//
+
+// Same layout as above: FCCMPE passes no node, FCCMP is tied to AArch64fccmp.
+defm FCCMPE : FPCondComparison<1, "fccmpe">;
+defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional select instruction.
+//===----------------------------------------------------------------------===//
+
+// "fcsel" variants produced by the FPCondSelect multiclass (defined outside
+// this section).
+defm FCSEL : FPCondSelect<"fcsel">;
+
+// CSEL instructions providing f128 types need to be handled by a
+// pseudo-instruction since the eventual code will need to introduce basic
+// blocks and control flow.
+// F128CSEL matches AArch64csel on two FPR128 values with an immediate
+// condition code and the NZCV register.
+def F128CSEL : Pseudo<(outs FPR128:$Rd),
+ (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
+ [(set (f128 FPR128:$Rd),
+ (AArch64csel FPR128:$Rn, FPR128:$Rm,
+ (i32 imm:$cond), NZCV))]> {
+ // The pseudo reads NZCV.
+ let Uses = [NZCV];
+ // Expansion is done by a custom inserter (see the comment above the def).
+ let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Floating point immediate move.
+//===----------------------------------------------------------------------===//
+
+// "fmov" immediate forms from the FPMoveImmediate multiclass (defined
+// outside this section); all of them are marked rematerializable.
+let isReMaterializable = 1 in {
+defm FMOV : FPMoveImmediate<"fmov">;
+}
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two vector instructions.
+//===----------------------------------------------------------------------===//
+
+// UABDL ("uabdl") is tied to the int_aarch64_neon_uabd intrinsic.
+defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+ int_aarch64_neon_uabd>;
+// Match UABDL in log2-shuffle patterns.
+// Each pattern has the shape
+//   xor(S, add(sub(zext(a), zext(b)), S))
+// where S is AArch64vashr of $src by (element width - 1). There are two
+// patterns per result type: one taking the operands as V64 values and one
+// taking the extract_high half of V128 values.
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+ (v8i16 (add (sub (zext (v8i8 V64:$opA)),
+ (zext (v8i8 V64:$opB))),
+ (AArch64vashr v8i16:$src, (i32 15))))),
+ (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+ (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
+ (zext (extract_high_v16i8 V128:$opB))),
+ (AArch64vashr v8i16:$src, (i32 15))))),
+ (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+ (v4i32 (add (sub (zext (v4i16 V64:$opA)),
+ (zext (v4i16 V64:$opB))),
+ (AArch64vashr v4i32:$src, (i32 31))))),
+ (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+ (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)),
+ (zext (extract_high_v8i16 V128:$opB))),
+ (AArch64vashr v4i32:$src, (i32 31))))),
+ (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+ (v2i64 (add (sub (zext (v2i32 V64:$opA)),
+ (zext (v2i32 V64:$opB))),
+ (AArch64vashr v2i64:$src, (i32 63))))),
+ (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+ (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)),
+ (zext (extract_high_v4i32 V128:$opB))),
+ (AArch64vashr v2i64:$src, (i32 63))))),
+ (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
+
+// ABS ("abs") is tied to the int_aarch64_neon_abs intrinsic.
+defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
+// Also recognise the expanded form xor(S, add(x, S)), where S is
+// AArch64vashr of x by (element width - 1), for every vector type from v8i8
+// to v2i64, and select the matching ABS instruction.
+def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
+ (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
+ (ABSv8i8 V64:$src)>;
+def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))),
+ (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))),
+ (ABSv4i16 V64:$src)>;
+def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))),
+ (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))),
+ (ABSv2i32 V64:$src)>;
+def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))),
+ (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))),
+ (ABSv16i8 V128:$src)>;
+def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))),
+ (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))),
+ (ABSv8i16 V128:$src)>;
+def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))),
+ (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))),
+ (ABSv4i32 V128:$src)>;
+def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))),
+ (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))),
+ (ABSv2i64 V128:$src)>;
+
+// Two-vector integer operations and compares. The CM* rows are tied to the
+// AArch64cm*z nodes; the last argument of every row is the node or intrinsic
+// the instruction is selected from.
+defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
+defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
+defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>;
+defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
+defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+
+// Floating point two-vector compares, tied to the AArch64fcm*z nodes.
+defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+// Vector FCVTAS / FCVTAU, tied to the matching NEON intrinsics.
+defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>;
+defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>;
+// FCVTL passes no node to its multiclass; the patterns below attach it to
+// the vcvthf2fp intrinsic and to fpextend.
+defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
+// Half-precision data carried as v4i16 / v8i16. The second pattern matches
+// the subvector starting at element 4 of a v8i16 and passes the whole V128
+// register to FCVTLv8i16.
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
+ (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
+ (i64 4)))),
+ (FCVTLv8i16 V128:$Rn)>;
+// v2f32 -> v2f64, from a V64 value or from the subvector starting at
+// element 2 of a v4f32.
+def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
+ (i64 2))))),
+ (FCVTLv4i32 V128:$Rn)>;
+
+// v4f16 -> v4f32, from a V64 value or from the subvector starting at
+// element 4 of a v8f16.
+def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
+ (i64 4))))),
+ (FCVTLv8i16 V128:$Rn)>;
+
+// Vector FP-to-integer conversions, tied to the matching NEON intrinsics.
+defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
+defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
+defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
+defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>;
+// FCVTN passes no node to its multiclass; the patterns below attach it to
+// the vcvtfp2hf intrinsic and to fpround.
+defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
+def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
+ (FCVTNv4i16 V128:$Rn)>;
+// concat_vectors of an existing V64 value with a narrowed result: $Rd is
+// first placed in the dsub sub-register of an otherwise undefined 128-bit
+// value, which is then passed as the first operand of the v8i16 / v4i32
+// instruction form.
+def : Pat<(concat_vectors V64:$Rd,
+ (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
+ (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))),
+ (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
+defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
+defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
+ int_aarch64_neon_fcvtxn>;
+// FCVTZS / FCVTZU are tied to the generic fp_to_sint / fp_to_uint nodes.
+defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
+defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
+
+// The fcvtzs intrinsic on each vector FP type selects the same FCVTZS
+// instruction that fp_to_sint uses.
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>;
+
+// Likewise for the unsigned fcvtzu intrinsic and FCVTZU.
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>;
+
+// Vector FP unary operations. The FRINT* rows use the same node mapping as
+// the scalar FRINT* definitions (fround, fnearbyint, ffloor, frintn
+// intrinsic, fceil, frint, ftrunc).
+defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
+defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
+defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>;
+defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
+defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
+defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
+defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
+defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
+defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
+defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
+// NEG is matched as a subtraction from the all-zeros vector.
+defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg",
+ UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
+// Aliases for MVN -> NOT.
+def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}",
+ (NOTv8i8 V64:$Vd, V64:$Vn)>;
+def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}",
+ (NOTv16i8 V128:$Vd, V128:$Vn)>;
+
+// The AArch64neg node selects the NEG instruction of the same element type.
+def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>;
+def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
+def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>;
+def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>;
+def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
+
+// The AArch64not node always selects the byte-element NOT: every 64-bit
+// type uses NOTv8i8 and every 128-bit type uses NOTv16i8, regardless of the
+// element type.
+def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+// vnot on the integer vector types wider than a byte selects the
+// byte-element NOT of the matching register width (NOTv8i8 for V64,
+// NOTv16i8 for V128). The v8i8 / v16i8 cases are not listed here because the
+// NOT defm itself is tied to vnot.
+def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+// v1i64 is handled for the AArch64not node but had no vnot counterpart;
+// add it so both lists cover the same set of types.
+def : Pat<(vnot (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+// Remaining two-vector instructions. The last argument of each row is the
+// node, intrinsic or fragment the instruction is selected from; SHLL passes
+// no arguments to its multiclass.
+defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
+defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
+defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
+defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
+// SADALP matches an add of the accumulator and a saddlp of the source.
+defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
+ BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
+defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
+defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
+defm SHLL : SIMDVectorLShiftLongBySizeBHS;
+defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
+defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
+defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
+// UADALP is the unsigned counterpart of SADALP (add of uaddlp).
+defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
+ BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;
+defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
+ int_aarch64_neon_uaddlp>;
+defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
+defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
+defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
+defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
+defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
+// XTN is tied to the generic trunc node.
+defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
+
+// Element reversal of floating-point vectors reuses the integer REV32 /
+// REV64 instruction with the same element width (f16 -> i16, f32 -> i32).
+def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
+def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
+def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
+def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
+def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
+
+// Selection patterns for SHLL: a vector extended to twice the element width
+// and then shifted left by the original element width. The same shapes must
+// be matched for anyext, zext and sext, so the patterns are kept in this
+// multiclass (parameterised on the extension operator) instead of inside the
+// instruction definition.
+multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator extend_op> {
+  // 8-bit elements -> 16-bit, shift by 8 (low half, then high half).
+  def : Pat<(AArch64vshl (v8i16 (extend_op (v8i8 V64:$Rn))), (i32 8)),
+            (SHLLv8i8 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v8i16 (extend_op (extract_high_v16i8 V128:$Rn))),
+                         (i32 8)),
+            (SHLLv16i8 V128:$Rn)>;
+
+  // 16-bit elements -> 32-bit, shift by 16.
+  def : Pat<(AArch64vshl (v4i32 (extend_op (v4i16 V64:$Rn))), (i32 16)),
+            (SHLLv4i16 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (extend_op (extract_high_v8i16 V128:$Rn))),
+                         (i32 16)),
+            (SHLLv8i16 V128:$Rn)>;
+
+  // 32-bit elements -> 64-bit, shift by 32.
+  def : Pat<(AArch64vshl (v2i64 (extend_op (v2i32 V64:$Rn))), (i32 32)),
+            (SHLLv2i32 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (extend_op (extract_high_v4i32 V128:$Rn))),
+                         (i32 32)),
+            (SHLLv4i32 V128:$Rn)>;
+}
+
+// Instantiate once per extension kind.
+defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three vector instructions.
+//===----------------------------------------------------------------------===//
+
+// Three-same-vector integer arithmetic and compares. The last argument of
+// each row is the node or intrinsic the instruction is selected from.
+defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
+defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
+defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
+// Three-same-vector floating point operations. Note that FADDP is tied to
+// the same int_aarch64_neon_addp intrinsic as the integer ADDP above.
+defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
+defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
+defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
+defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
+defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
+defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
+defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
+// As in the scalar definitions, the *NM forms are tied to fmaxnum / fminnum
+// and the plain forms to fmaxnan / fminnan; the pairwise (*P) forms are tied
+// to NEON intrinsics.
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
+defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
+defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
+defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
+defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
+defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
+defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>;
+
+// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
+// instruction expects the addend first, while the fma intrinsic puts it last.
+// In both fragments $LHS (the instruction's first operand) is the fma
+// addend; FMLS additionally negates the second fma multiplicand.
+defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
+ TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls",
+ TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+// The addend $Rd is passed as the first instruction operand in each case.
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
+ (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+ (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+ (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+// Remaining three-same-vector instructions. The last argument of each row
+// is the node, intrinsic or fragment the instruction is selected from.
+defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>;
+defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
+defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
+defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
+// MLA / MLS match accumulator +/- (MHS * RHS); the accumulator is $LHS.
+defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
+ TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
+ TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
+// SABA matches accumulator + sabd(MHS, RHS).
+defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >;
+defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
+defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
+defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
+defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
+defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
+defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
+defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
+defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
+defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
+defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
+defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
+defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
+defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
+defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
+// UABA matches accumulator + uabd(MHS, RHS).
+defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
+defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
+defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
+defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
+defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
+defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
+defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
+defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
+defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
+defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
+defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
+defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
+defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
+// SQRDMLAH / SQRDMLSH are given the saturating add / subtract intrinsic as
+// the accumulate operation; how it is combined with the multiply is defined
+// by the SIMDThreeSameVectorSQRDMLxHTiedHS multiclass (outside this section).
+defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
+ int_aarch64_neon_sqadd>;
+defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
+ int_aarch64_neon_sqsub>;
+
+// Bitwise logical operations on whole vector registers.
+// BIC and ORN are matched as and/or with the second operand inverted (vnot).
+// BSL is matched as the select (LHS & MHS) | (~LHS & RHS).
+// BIF is instantiated without a pattern-operator argument.
+defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
+ BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
+defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
+ TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
+defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
+defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
+ BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
+defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
+
+
+// Selection of the AArch64bsl node. The element type does not influence the
+// instruction chosen: every 64-bit integer vector type maps to BSLv8i8 and
+// every 128-bit integer vector type maps to BSLv16i8.
+def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+// Vector register "mov" is an alias of ORR with the source register supplied
+// as both inputs. Every arrangement suffix maps onto the byte-arranged ORR
+// (ORRv16i8 for 128-bit, ORRv8i8 for 64-bit registers).
+// NOTE(review): only the .16b/.8b spellings pass 1 as the last InstAlias
+// argument, the others pass 0 - presumably this marks the spelling the
+// printer emits; confirm against the InstAlias class definition.
+def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
+def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+
+def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>;
+def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+
+// "cmls" is provided purely as an alias: each arrangement maps to the
+// matching CMHS instruction with $src1 and $src2 exchanged. Both assembly
+// syntax variants (suffix on the operands or on the mnemonic) are listed.
+def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmls.8b\t$dst, $src1, $src2}",
+ (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmls.16b\t$dst, $src1, $src2}",
+ (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmls.4h\t$dst, $src1, $src2}",
+ (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmls.8h\t$dst, $src1, $src2}",
+ (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmls.2s\t$dst, $src1, $src2}",
+ (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmls.4s\t$dst, $src1, $src2}",
+ (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmls.2d\t$dst, $src1, $src2}",
+ (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+// "cmlo" is an alias of CMHI with the two source operands exchanged, for
+// every vector arrangement.
+def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmlo.8b\t$dst, $src1, $src2}",
+ (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmlo.16b\t$dst, $src1, $src2}",
+ (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmlo.4h\t$dst, $src1, $src2}",
+ (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmlo.8h\t$dst, $src1, $src2}",
+ (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmlo.2s\t$dst, $src1, $src2}",
+ (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmlo.4s\t$dst, $src1, $src2}",
+ (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmlo.2d\t$dst, $src1, $src2}",
+ (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+// Three-register "cmle" is an alias of CMGE with the two source operands
+// exchanged, for every vector arrangement.
+def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmle.8b\t$dst, $src1, $src2}",
+ (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmle.16b\t$dst, $src1, $src2}",
+ (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmle.4h\t$dst, $src1, $src2}",
+ (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmle.8h\t$dst, $src1, $src2}",
+ (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmle.2s\t$dst, $src1, $src2}",
+ (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmle.4s\t$dst, $src1, $src2}",
+ (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmle.2d\t$dst, $src1, $src2}",
+ (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+// Three-register "cmlt" is an alias of CMGT with the two source operands
+// exchanged, for every vector arrangement.
+def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmlt.8b\t$dst, $src1, $src2}",
+ (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmlt.16b\t$dst, $src1, $src2}",
+ (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmlt.4h\t$dst, $src1, $src2}",
+ (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmlt.8h\t$dst, $src1, $src2}",
+ (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmlt.2s\t$dst, $src1, $src2}",
+ (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmlt.4s\t$dst, $src1, $src2}",
+ (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmlt.2d\t$dst, $src1, $src2}",
+ (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+// Three-register "fcmle" is an alias of FCMGE with the two source operands
+// exchanged. The half-precision (.4h/.8h) spellings are only available when
+// both HasNEON and HasFullFP16 hold; the .2s/.4s/.2d spellings are not
+// wrapped in that predicate block.
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" #
+ "|fcmle.4h\t$dst, $src1, $src2}",
+ (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" #
+ "|fcmle.8h\t$dst, $src1, $src2}",
+ (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
+def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
+ "|fcmle.2s\t$dst, $src1, $src2}",
+ (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
+ "|fcmle.4s\t$dst, $src1, $src2}",
+ (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
+ "|fcmle.2d\t$dst, $src1, $src2}",
+ (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+// Three-register "fcmlt" is an alias of FCMGT with the two source operands
+// exchanged. The half-precision (.4h/.8h) spellings require HasNEON and
+// HasFullFP16.
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" #
+ "|fcmlt.4h\t$dst, $src1, $src2}",
+ (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" #
+ "|fcmlt.8h\t$dst, $src1, $src2}",
+ (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
+def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
+ "|fcmlt.2s\t$dst, $src1, $src2}",
+ (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
+ "|fcmlt.4s\t$dst, $src1, $src2}",
+ (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
+ "|fcmlt.2d\t$dst, $src1, $src2}",
+ (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+// "facle" is an alias of FACGE with the two source operands exchanged. The
+// half-precision (.4h/.8h) spellings require HasNEON and HasFullFP16.
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" #
+ "|facle.4h\t$dst, $src1, $src2}",
+ (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" #
+ "|facle.8h\t$dst, $src1, $src2}",
+ (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
+def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
+ "|facle.2s\t$dst, $src1, $src2}",
+ (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
+ "|facle.4s\t$dst, $src1, $src2}",
+ (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
+ "|facle.2d\t$dst, $src1, $src2}",
+ (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+// "faclt" is an alias of FACGT with the two source operands exchanged. The
+// half-precision (.4h/.8h) spellings require HasNEON and HasFullFP16.
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" #
+ "|faclt.4h\t$dst, $src1, $src2}",
+ (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" #
+ "|faclt.8h\t$dst, $src1, $src2}",
+ (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
+def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
+ "|faclt.2s\t$dst, $src1, $src2}",
+ (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
+ "|faclt.4s\t$dst, $src1, $src2}",
+ (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
+ "|faclt.2d\t$dst, $src1, $src2}",
+ (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions.
+//===----------------------------------------------------------------------===//
+
+// Scalar three-operand instruction instantiations. The last argument of each
+// defm is the operator or intrinsic the instruction is selected from.
+defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>;
+defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
+// The v1f64 form of the neon fabd intrinsic is also selected to the scalar
+// 64-bit FABD.
+def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FABD64 FPR64:$Rn, FPR64:$Rm)>;
+defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
+ int_aarch64_neon_facge>;
+defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
+ int_aarch64_neon_facgt>;
+defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
+defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>;
+defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>;
+defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>;
+defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>;
+defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>;
+defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>;
+defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
+defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>;
+defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
+defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
+defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
+// Scalar SQRDMLAH/SQRDMLSH, available only under the HasV8_1a predicate.
+// The two patterns fold a saturating add (sqadd) or subtract (sqsub) of an
+// sqrdmulh result into the 32-bit scalar instruction; $Rd is the accumulator.
+let Predicates = [HasV8_1a] in {
+ defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
+ defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
+ def : Pat<(i32 (int_aarch64_neon_sqadd
+ (i32 FPR32:$Rd),
+ (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))))),
+ (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+ def : Pat<(i32 (int_aarch64_neon_sqsub
+ (i32 FPR32:$Rd),
+ (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))))),
+ (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+}
+
+// Scalar counterparts of the swapped-operand compare aliases: each
+// "less-than"-style mnemonic maps to the corresponding "greater-than"-style
+// instruction with $src1 and $src2 exchanged. The integer forms exist for
+// 64-bit registers only; the FP forms are given for both FPR32 and FPR64.
+def : InstAlias<"cmls $dst, $src1, $src2",
+ (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmle $dst, $src1, $src2",
+ (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlo $dst, $src1, $src2",
+ (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlt $dst, $src1, $src2",
+ (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+ (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+ (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+ (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+ (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+ (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+ (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+ (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+ (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions (mixed operands).
+//===----------------------------------------------------------------------===//
+defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
+ int_aarch64_neon_sqdmulls_scalar>;
+defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
+defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
+
+// SQDMLAL/SQDMLSL take no pattern operator above; the i32 -> i64 forms are
+// selected here by folding a saturating add/sub of a scalar sqdmull result
+// into the accumulating instruction ($Rd is the 64-bit accumulator).
+def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
+ (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))))),
+ (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
+ (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))))),
+ (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two scalar instructions.
+//===----------------------------------------------------------------------===//
+
+// Scalar two-operand instruction instantiations. The integer and FP
+// compares here are selected from the compare-with-zero nodes
+// (AArch64cm*z / AArch64fcm*z). The FCVT*, FRECPE, FRECPX and FRSQRTE defms
+// are instantiated without a pattern-operator argument.
+defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
+defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">;
+defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">;
+defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">;
+defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">;
+defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">;
+defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">;
+defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">;
+defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
+def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
+defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
+defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
+defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
+// NEG is matched as a subtraction of the operand from the all-zeros vector.
+defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
+ UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
+defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
+defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
+defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
+ int_aarch64_neon_suqadd>;
+defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>;
+defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
+defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
+ int_aarch64_neon_usqadd>;
+
+// The AArch64neg node on v1i64 selects the scalar 64-bit NEG.
+def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>;
+
+// The v1f64 -> v1i64 forms of the neon fcvt* rounding-conversion intrinsics
+// select the scalar 64-bit FCVT* instruction of the same name.
+def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))),
+ (FCVTASv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))),
+ (FCVTAUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))),
+ (FCVTMSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))),
+ (FCVTMUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))),
+ (FCVTNSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))),
+ (FCVTNUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
+ (FCVTPSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
+ (FCVTPUv1i64 FPR64:$Rn)>;
+
+// frecpe intrinsic: scalar f32/f64 and v1f64 forms map to the scalar FRECPE
+// (f64 and v1f64 share FRECPEv1i64).
+def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
+ (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+
+// AArch64frecpe node: scalar and vector forms.
+def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
+ (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
+ (FRECPEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
+ (FRECPEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
+ (FRECPEv2f64 FPR128:$Rn)>;
+
+// AArch64frecps node: scalar and vector forms (no v1f64 entry is listed).
+def : Pat<(f32 (AArch64frecps (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+ (FRECPS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frecps (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (FRECPSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frecps (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+ (FRECPSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+ (FRECPS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+ (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
+// frecpx intrinsic: scalar f32 and f64 only.
+def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
+ (FRECPXv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
+ (FRECPXv1i64 FPR64:$Rn)>;
+
+// frsqrte intrinsic: scalar f32/f64 and v1f64 forms map to the scalar
+// FRSQRTE (f64 and v1f64 share FRSQRTEv1i64).
+def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
+ (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+
+// AArch64frsqrte node: scalar and vector forms.
+def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
+ (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
+ (FRSQRTEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
+ (FRSQRTEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
+ (FRSQRTEv2f64 FPR128:$Rn)>;
+
+// AArch64frsqrts node: scalar and vector forms (no v1f64 entry is listed).
+def : Pat<(f32 (AArch64frsqrts (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+ (FRSQRTS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frsqrts (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (FRSQRTSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frsqrts (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+ (FRSQRTSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+ (FRSQRTS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+ (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// Here are the patterns for 8 and 16-bits to float.
+// 8-bits -> float.
+// Patterns for uint_to_fp of a register-offset load: the value is loaded by
+// LDRW/LDRX, inserted into an otherwise undefined DstTy register at
+// subregister index `sub`, and converted with UCVTF.
+//   DstTy  - floating-point result type.
+//   SrcTy  - integer type produced by the load.
+//   loadop - load operator to match.
+//   UCVTF  - conversion instruction applied to the loaded value.
+//   ro     - register-offset addressing mode supplying the W/X index
+//            patterns (Wpat/Xpat) and extend operands (Wext/Xext).
+//   LDRW   - load used with a 32-bit (GPR32) index register.
+//   LDRX   - load used with a 64-bit (GPR64) index register.
+//   sub    - subregister index the loaded value is inserted at.
+multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
+ SDPatternOperator loadop, Instruction UCVTF,
+ ROAddrMode ro, Instruction LDRW, Instruction LDRX,
+ SubRegIndex sub> {
+ // 32-bit index register form.
+ def : Pat<(DstTy (uint_to_fp (SrcTy
+ (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
+ ro.Wext:$extend))))),
+ (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+ (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+ sub))>;
+
+ // 64-bit index register form. The extend operand uses ro.Xext on both the
+ // matched and the emitted side, mirroring the W form above.
+ def : Pat<(DstTy (uint_to_fp (SrcTy
+ (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
+ ro.Xext:$extend))))),
+ (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+ (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+ sub))>;
+}
+
+// Besides the register-offset forms from UIntToFPROLoadPat, the scaled
+// unsigned-immediate (am_indexed*) and unscaled signed-immediate
+// (am_unscaled*) addressing forms are matched explicitly. In each case the
+// loaded value is inserted at bsub/hsub of an undefined f32 and converted
+// with UCVTFv1i32.
+defm : UIntToFPROLoadPat<f32, i32, zextloadi8,
+ UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f32 (uint_to_fp (i32
+ (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+ (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> float.
+defm : UIntToFPROLoadPat<f32, i32, zextloadi16,
+ UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f32 (uint_to_fp (i32
+ (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+ (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// UCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double.
+// Same structure as the float patterns, targeting f64 via UCVTFv1i64: the
+// loaded value is inserted at bsub/hsub/ssub of an undefined f64. For each
+// width the register-offset, scaled-immediate and unscaled-immediate
+// addressing forms are covered.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi8,
+ UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f64 (uint_to_fp (i32
+ (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+ (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi16,
+ UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f64 (uint_to_fp (i32
+ (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+ (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, load,
+ UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>;
+def : Pat <(f64 (uint_to_fp (i32
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>;
+def : Pat <(f64 (uint_to_fp (i32
+ (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three different-sized vector instructions.
+//===----------------------------------------------------------------------===//
+
+// Notes on the selection fragments used below:
+//  * The "L" add/sub forms (SADDL, SSUBL, UADDL, USUBL) match add/sub with
+//    both operands sign- or zero-extended; the "W" forms (SADDW, SSUBW,
+//    UADDW, USUBW) match add/sub with only the second operand extended.
+//  * SMLAL/UMLAL and SMLSL/UMLSL match an add or sub of the accumulator
+//    (LHS) with the smull/umull intrinsic of the other two operands.
+defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
+defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
+defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
+defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
+defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
+defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
+ int_aarch64_neon_sabd>;
+defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
+ int_aarch64_neon_sabd>;
+defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
+ BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
+defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
+ BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
+defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
+ TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
+ int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
+ int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
+ int_aarch64_neon_sqdmull>;
+defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
+ BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
+defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
+ BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
+defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
+ int_aarch64_neon_uabd>;
+defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
+ BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
+defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
+ BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
+defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
+ TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
+defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
+ BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
+defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
+ BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+
+// Additional patterns for SMULL and UMULL
+// Maps a widening multiply node onto the three instruction variants:
+//   opnode - node to match (result elements are twice as wide as inputs).
+//   INST8B - instruction for v8i8 x v8i8 -> v8i16.
+//   INST4H - instruction for v4i16 x v4i16 -> v4i32.
+//   INST2S - instruction for v2i32 x v2i32 -> v2i64.
+multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+ (INST8B V64:$Rn, V64:$Rm)>;
+ def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+ (INST4H V64:$Rn, V64:$Rm)>;
+ def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+ (INST2S V64:$Rn, V64:$Rm)>;
+}
+
+// Instantiated for the AArch64smull and AArch64umull nodes.
+defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
+ SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
+defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
+ UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+
+// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
+// Maps a three-operand widening multiply-accumulate fragment onto the three
+// instruction variants. $Rd is the 128-bit accumulator; $Rn and $Rm are the
+// 64-bit multiplicands.
+//   opnode - fragment to match (accumulator, multiplicand, multiplicand).
+//   INST8B - instruction for the v8i16 accumulator / v8i8 inputs.
+//   INST4H - instruction for the v4i32 accumulator / v4i16 inputs.
+//   INST2S - instruction for the v2i64 accumulator / v2i32 inputs.
+multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+ (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
+ def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+ (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
+ def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+ (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
+}
+
+// add of smull/umull selects SMLAL/UMLAL; sub of smull/umull selects
+// SMLSL/UMLSL.
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+ SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+ UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+ SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+ UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
+
+// Patterns for 64-bit pmull
+// Plain 64-bit operands select PMULLv1i64. When both operands are element 1
+// of v2i64 vectors, PMULLv2i64 is selected on the full 128-bit registers
+// instead of extracting the elements first.
+// NOTE(review): this relies on PMULLv2i64 reading the upper 64-bit lanes -
+// confirm against the instruction definition.
+def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
+ (PMULLv1i64 V64:$Rn, V64:$Rm)>;
+def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
+ (extractelt (v2i64 V128:$Rm), (i64 1))),
+ (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+
+// CodeGen patterns for addhn and subhn instructions, which can actually be
+// written in LLVM IR without too much difficulty.
+
+// ADDHN
+// First three patterns: a truncate of (Rn + Rm) logically shifted right by
+// half the source element width (8, 16 or 32) selects ADDHN.
+// Last three patterns: the same narrowed value concatenated after a 64-bit
+// vector $Rd selects the 128-bit-result ADDHN variant, with $Rd placed in
+// the dsub subregister of the tied input via SUBREG_TO_REG.
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
+ (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+ (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 8))))),
+ (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+ (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+ (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+
+// SUBHN
+// First three patterns: a truncate of (Rn - Rm) logically shifted right by
+// half the source element width (8, 16 or 32) selects SUBHN.
+// Last three patterns: the same narrowed value concatenated after a 64-bit
+// vector $Rd selects the 128-bit-result SUBHN variant, with $Rd placed in
+// the dsub subregister of the tied input via SUBREG_TO_REG.
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
+ (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+ (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 8))))),
+ (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+ (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+ (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector instruction.
+//----------------------------------------------------------------------------
+
+defm EXT : SIMDBitwiseExtract<"ext">;
+
+// AArch64ext on the non-byte element types reuses the byte-arranged EXT
+// instructions: 64-bit vector types select EXTv8i8 and 128-bit vector types
+// select EXTv16i8, passing the immediate through unchanged.
+def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+
+// We use EXT to handle extract_subvector to copy the upper 64-bits of a
+// 128-bit vector.
+// In every pattern the subvector index equals the element count of the 64-bit
+// result type. EXTv16i8 is given $Rn for both vector operands with immediate 8,
+// and the result is read back through the dsub sub-register.
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+// Each defm instantiates SIMDZipVector with a 3-bit value (presumably the
+// opcode field -- confirm against SIMDZipVector), the assembly mnemonic and the
+// SelectionDAG node the instructions are matched from.
+defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>;
+defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>;
+defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>;
+defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
+defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
+defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX instructions
+//----------------------------------------------------------------------------
+
+defm TBL : SIMDTableLookup< 0, "tbl">;
+defm TBX : SIMDTableLookupTied<1, "tbx">;
+
+// Select the single-table instruction forms for the tbl1 intrinsic. Operands
+// are forwarded to the instruction in the same order as in the intrinsic.
+// Note the operand names: the 64-bit pattern calls the first operand $Rn and
+// the second $Ri, while the 128-bit pattern uses the names the other way round.
+def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+ (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+ (TBLv16i8One V128:$Ri, V128:$Rn)>;
+
+// tbx1 takes an extra leading operand $Rd, which is passed through as the
+// first instruction operand; the remaining operands follow as for tbl1.
+def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd),
+ (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+ (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
+ (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+ (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY instruction
+//----------------------------------------------------------------------------
+
+defm CPY : SIMDScalarCPY<"cpy">;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">;
+defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">;
+defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">;
+defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
+defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
+defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
+// Integer add-across on v2i64: both the saddv and the uaddv node select
+// ADDPv2i64p, and the scalar result is placed in the dsub sub-register of an
+// otherwise undefined v2i64.
+def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
+def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
+// Floating-point across-vector intrinsics on two-element vectors map directly
+// onto the scalar pairwise instructions.
+def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))),
+ (FADDPv2i32p V64:$Rn)>;
+// v4f32 faddv takes two steps: a vector FADDPv4f32 of $Rn with itself, then the
+// scalar pairwise add on the low 64 bits (dsub) of that intermediate result.
+def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))),
+ (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
+def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))),
+ (FADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))),
+ (FMAXNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))),
+ (FMAXNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))),
+ (FMAXPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))),
+ (FMAXPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))),
+ (FMINNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))),
+ (FMINNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))),
+ (FMINPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))),
+ (FMINPv2i64p V128:$Rn)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+// DUP from a general-purpose register. Every form takes a GPR32 source except
+// the .2d form, which takes a GPR64. The 5-element bit list has its fixed bits
+// at a different position per element size ('?' entries are left unspecified
+// here).
+def DUPv8i8gpr : SIMDDupFromMain<0, {?,?,?,?,1}, ".8b", v8i8, V64, GPR32>;
+def DUPv16i8gpr : SIMDDupFromMain<1, {?,?,?,?,1}, ".16b", v16i8, V128, GPR32>;
+def DUPv4i16gpr : SIMDDupFromMain<0, {?,?,?,1,0}, ".4h", v4i16, V64, GPR32>;
+def DUPv8i16gpr : SIMDDupFromMain<1, {?,?,?,1,0}, ".8h", v8i16, V128, GPR32>;
+def DUPv2i32gpr : SIMDDupFromMain<0, {?,?,1,0,0}, ".2s", v2i32, V64, GPR32>;
+def DUPv4i32gpr : SIMDDupFromMain<1, {?,?,1,0,0}, ".4s", v4i32, V128, GPR32>;
+def DUPv2i64gpr : SIMDDupFromMain<1, {?,1,0,0,0}, ".2d", v2i64, V128, GPR64>;
+
+// DUP from a vector element (lane). The leading 0/1 argument pairs with the
+// 64-bit (V64) and 128-bit (V128) result register class respectively.
+def DUPv2i64lane : SIMDDup64FromElement;
+def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
+def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
+def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
+def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
+def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
+def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
+
+// Duplicating a floating-point scalar: the scalar register is inserted into an
+// undefined 128-bit vector at its sub-register (hsub/ssub/dsub), and the lane
+// form of DUP then broadcasts lane 0.
+def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
+ (v2f32 (DUPv2i32lane
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+ (i64 0)))>;
+def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))),
+ (v4f32 (DUPv4i32lane
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+ (i64 0)))>;
+// NOTE(review): the IMPLICIT_DEF below is typed v4i32 although a 64-bit scalar
+// is inserted at dsub; this looks cosmetic -- confirm.
+def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
+ (v2f64 (DUPv2i64lane
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
+ (i64 0)))>;
+def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
+ (v4f16 (DUPv4i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
+def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
+ (v8f16 (DUPv8i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
+
+// Lane duplication of floating-point vectors reuses the integer-named DUP lane
+// instructions of the same element width; source and index pass straight
+// through.
+def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
+
+def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+ (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+ (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
+ (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+
+// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
+// instruction even if the types don't match: we just have to remap the lane
+// carefully. N.b. this trick only applies to truncations.
+// Lane-index scalers: each transform returns the matched immediate multiplied
+// by 2, 4 or 8 as an i64 target constant, keeping the node's debug location.
+def VecIndex_x2 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(2 * N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
+def VecIndex_x4 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(4 * N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
+def VecIndex_x8 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
+
+// Patterns for an AArch64dup of a scalar extracted from a vector with wider
+// elements than the result.
+//   ResVT     - result vector type of the dup.
+//   Src64VT   - 64-bit source vector type the scalar may be extracted from.
+//   Src128VT  - 128-bit source vector type the scalar may be extracted from.
+//   ScalVT    - type of the extracted scalar.
+//   DUP       - lane-form DUP instruction to select.
+//   IdxXFORM  - transform applied to the extract index to get the DUP lane.
+multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
+ ValueType Src128VT, ValueType ScalVT,
+ Instruction DUP, SDNodeXForm IdxXFORM> {
+ // 128-bit source: use the register directly.
+ def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
+ imm:$idx)))),
+ (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+ // 64-bit source: widen to a 128-bit register via SUBREG_TO_REG on dsub first.
+ def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
+ imm:$idx)))),
+ (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+// 64-bit results. The index multiplier is the ratio of source element width to
+// result element width (16->8: x2, 32->8: x4, 32->16: x2).
+defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
+defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+
+// 128-bit results, same source types and multipliers.
+defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
+// As DUPWithTruncPats, but for scalars extracted from 64-bit-element vectors,
+// where the narrowing shows up as an explicit (trunc ...) to i32 around the
+// extractelt.
+//   ResVT     - result vector type of the dup.
+//   DUP       - lane-form DUP instruction to select.
+//   IdxXFORM  - transform applied to the extract index to get the DUP lane.
+multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
+ SDNodeXForm IdxXFORM> {
+ // v2i64 source: use the 128-bit register directly.
+ def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
+ imm:$idx))))),
+ (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+ // v1i64 source: widen to a 128-bit register via SUBREG_TO_REG on dsub first.
+ def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
+ imm:$idx))))),
+ (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+// 64-bit results. The index multiplier is 64 divided by the result element
+// width (8-bit: x8, 16-bit: x4, 32-bit: x2).
+defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
+
+// 128-bit results, same multipliers.
+defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+
+// SMOV and UMOV definitions, with some extra patterns for convenience
+defm SMOV : SMov;
+defm UMOV : UMov;
+
+// A sign-extend-in-register of an extracted i8 or i16 lane is selected as SMOV.
+// Each source pattern is listed once per result width (i32 and i64).
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+ (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+ (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+ (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+ (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+// A full sign extension of an extracted i32 lane to i64.
+def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
+ (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+
+// The same i64 results when the extracted i32 is first any-extended to i64 and
+// then sign-extended in register from i8/i16.
+def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn),
+ VectorIndexB:$idx)))), i8),
+ (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
+ VectorIndexH:$idx)))), i16),
+ (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+
+// Extracting i8 or i16 elements will have the zero-extend transformed to
+// an 'and' mask by type legalization since neither i8 nor i16 are legal types
+// for AArch64. Match these patterns here since UMOV already zeroes out the high
+// bits of the destination register.
+// The mask matched is exactly the element width: 0xff for byte lanes and
+// 0xffff for halfword lanes.
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
+ (i32 0xff)),
+ (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
+ (i32 0xffff)),
+ (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
+
+defm INS : SIMDIns;
+
+// scalar_to_vector from a GPR32 into vectors of 8-bit or 16-bit elements: the
+// value is copied into the FPR32 register class and then widened with
+// SUBREG_TO_REG on ssub.
+def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+ (SUBREG_TO_REG (i32 0),
+ (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+ (SUBREG_TO_REG (i32 0),
+ (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+ (SUBREG_TO_REG (i32 0),
+ (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+ (SUBREG_TO_REG (i32 0),
+ (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+// scalar_to_vector of an i32/i64 operand matched in FPR32/FPR64: insert it at
+// ssub/dsub of an undefined vector of the result type.
+def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+ (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+ (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+ (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ (i64 FPR64:$Rn), dsub))>;
+
+// Floating-point scalars: insert at the sub-register matching the scalar width
+// (hsub for f16, ssub for f32, dsub for f64) of an undefined vector.
+def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
+def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
+
+// Inserting a floating-point scalar into a vector lane. The scalar is placed at
+// lane 0 of an undefined 128-bit vector (INSERT_SUBREG) and the lane-to-lane
+// INS copies it from source lane 0 into destination lane $imm. For 64-bit
+// destinations the vector is first widened into an undefined 128-bit register
+// and the result is read back through dsub.
+// NOTE(review): the v4f16 form uses VectorIndexS rather than VectorIndexH for
+// the index -- presumably to restrict it to the four lanes of the 64-bit
+// vector; confirm.
+def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
+ (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi16lane
+ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexS:$imm,
+ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0)),
+ dsub)>;
+
+def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
+ (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
+ (INSvi16lane
+ V128:$Rn, VectorIndexH:$imm,
+ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0))>;
+
+def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
+ (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi32lane
+ (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexS:$imm,
+ (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+ (i64 0)),
+ dsub)>;
+def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
+ (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+ (INSvi32lane
+ V128:$Rn, VectorIndexS:$imm,
+ (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+ (i64 0))>;
+def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
+ (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
+ (INSvi64lane
+ V128:$Rn, VectorIndexD:$imm,
+ (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
+ (i64 0))>;
+
+// Copy an element at a constant index in one vector into a constant indexed
+// element of another.
+// FIXME: refactor to a shared class/def parameterized on vector type, vector
+// index type and INS extension.
+// One pattern per 128-bit integer vector type. The four intrinsic operands
+// ($Vd, $idx, $Vs, $idx2) are forwarded in the same order to the INS lane
+// instruction of the matching element width; the index operand class
+// (VectorIndexB/H/S/D) follows the element size.
+def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
+ (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
+ VectorIndexB:$idx2)),
+ (v16i8 (INSvi8lane
+ V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
+ )>;
+def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
+ (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
+ VectorIndexH:$idx2)),
+ (v8i16 (INSvi16lane
+ V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
+ )>;
+def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
+ (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
+ VectorIndexS:$idx2)),
+ (v4i32 (INSvi32lane
+ V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
+ )>;
+def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
+ (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
+ VectorIndexD:$idx2)),
+ (v2i64 (INSvi64lane
+ V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
+ )>;
+
+// Patterns for inserting an element extracted from one vector into a lane of
+// another, selected as a single lane-to-lane INS.
+//   VT128  - 128-bit vector type.
+//   VT64   - 64-bit vector type with the same element type.
+//   VTScal - scalar element type.
+//   INS    - lane-to-lane insert instruction to select.
+// The four patterns cover every combination of 128-bit/64-bit destination and
+// source. 64-bit operands are widened with SUBREG_TO_REG on dsub, and a 64-bit
+// result is read back with EXTRACT_SUBREG on dsub.
+multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
+ ValueType VTScal, Instruction INS> {
+ // 128-bit destination, 128-bit source.
+ def : Pat<(VT128 (vector_insert V128:$src,
+ (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+ imm:$Immd)),
+ (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;
+
+ // 128-bit destination, 64-bit source.
+ def : Pat<(VT128 (vector_insert V128:$src,
+ (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+ imm:$Immd)),
+ (INS V128:$src, imm:$Immd,
+ (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
+
+ // 64-bit destination, 128-bit source.
+ def : Pat<(VT64 (vector_insert V64:$src,
+ (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+ imm:$Immd)),
+ (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+ imm:$Immd, V128:$Rn, imm:$Immn),
+ dsub)>;
+
+ // 64-bit destination, 64-bit source.
+ def : Pat<(VT64 (vector_insert V64:$src,
+ (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+ imm:$Immd)),
+ (EXTRACT_SUBREG
+ (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
+ (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
+ dsub)>;
+}
+
+// Instantiated for the floating-point element types only.
+defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
+
+
+// Floating point vector extractions are codegen'd as either a sequence of
+// subregister extractions, or a MOV (aka CPY here, alias for DUP) if
+// the lane number is anything other than zero.
+// Lane 0: a plain sub-register read (dsub/ssub/hsub); no instruction is named
+// in the result.
+def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
+ (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
+ (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
+ (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
+
+// Constant lane given as a vector index operand: CPY from the indexed element.
+def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
+ (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
+ (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
+ (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
+
+// All concat_vectors operations are canonicalised to act on i64 vectors for
+// AArch64. In the general case we need an instruction, which might just as well
+// be INS.
+// Concatenation of two 64-bit vectors into a 128-bit vector of type DstTy.
+// Both halves are placed in dsub of undefined 128-bit registers; INSvi64lane
+// then copies lane 0 of the $Rn register into lane 1 of the $Rd register.
+//   DstTy - 128-bit result type.
+//   SrcTy - 64-bit type of the first (low) operand.
+class ConcatPat<ValueType DstTy, ValueType SrcTy>
+ : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
+ (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
+
+def : ConcatPat<v2i64, v1i64>;
+def : ConcatPat<v2f64, v1f64>;
+def : ConcatPat<v4i32, v2i32>;
+def : ConcatPat<v4f32, v2f32>;
+def : ConcatPat<v8i16, v4i16>;
+def : ConcatPat<v8f16, v4f16>;
+def : ConcatPat<v16i8, v8i8>;
+
+// If the high lanes are undef, though, we can just ignore them:
+// the 64-bit source is placed in dsub of an undefined 128-bit register and no
+// INS is named in the result.
+//   DstTy - 128-bit result type.
+//   SrcTy - 64-bit type of the low operand.
+class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
+ : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
+
+// One instance per type pair that ConcatPat is instantiated for, including the
+// half-precision pair.
+def : ConcatUndefPat<v2i64, v1i64>;
+def : ConcatUndefPat<v2f64, v1f64>;
+def : ConcatUndefPat<v4i32, v2i32>;
+def : ConcatUndefPat<v4f32, v2f32>;
+def : ConcatUndefPat<v8i16, v4i16>;
+def : ConcatUndefPat<v8f16, v4f16>;
+def : ConcatUndefPat<v16i8, v8i8>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+// Integer across-lanes instructions. The BHS multiclass is used for the
+// same-width reductions and the HSD multiclass for the long (saddlv/uaddlv)
+// forms. The first argument is 0 for the signed/add mnemonics and 1 for the
+// unsigned ones.
+defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
+defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
+defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
+defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
+defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
+defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
+defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
+// Floating-point across-lanes instructions; each is tied to its intrinsic by
+// the last argument.
+defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
+defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
+defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
+defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
+
+// Patterns for across-vector intrinsics, that have a node equivalent, that
+// returns a vector (with only the low lane defined) instead of a scalar.
+// In effect, opNode is the same as (scalar_to_vector (IntNode)).
+//   baseOpc - instruction name prefix; a type suffix such as "v8i8v" is
+//             appended with !strconcat and resolved with !cast<Instruction>.
+//   opNode  - the node to match.
+multiclass SIMDAcrossLanesIntrinsic<string baseOpc,
+ SDPatternOperator opNode> {
+// If a lane instruction caught the vector_extract around opNode, we can
+// directly match the latter to the instruction.
+// The scalar instruction result is inserted at bsub/hsub/ssub (by element
+// width) of an undefined vector of the result type.
+def : Pat<(v8i8 (opNode V64:$Rn)),
+ (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub)>;
+def : Pat<(v16i8 (opNode V128:$Rn)),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub)>;
+def : Pat<(v4i16 (opNode V64:$Rn)),
+ (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub)>;
+def : Pat<(v8i16 (opNode V128:$Rn)),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub)>;
+def : Pat<(v4i32 (opNode V128:$Rn)),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub)>;
+
+
+// If none did, fallback to the explicit patterns, consuming the vector_extract.
+// For the 64-bit types the node appears wrapped in
+// (insert_subvector undef, ..., 0) before the lane-0 extract. In all cases the
+// i32 result is read from ssub.
+def : Pat<(i32 (vector_extract (insert_subvector undef, (v8i8 (opNode V64:$Rn)),
+ (i32 0)), (i64 0))),
+ (EXTRACT_SUBREG (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn),
+ bsub), ssub)>;
+def : Pat<(i32 (vector_extract (v16i8 (opNode V128:$Rn)), (i64 0))),
+ (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn),
+ bsub), ssub)>;
+def : Pat<(i32 (vector_extract (insert_subvector undef,
+ (v4i16 (opNode V64:$Rn)), (i32 0)), (i64 0))),
+ (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn),
+ hsub), ssub)>;
+def : Pat<(i32 (vector_extract (v8i16 (opNode V128:$Rn)), (i64 0))),
+ (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn),
+ hsub), ssub)>;
+def : Pat<(i32 (vector_extract (v4i32 (opNode V128:$Rn)), (i64 0))),
+ (EXTRACT_SUBREG (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn),
+ ssub), ssub)>;
+
+}
+
+// Signed variant: inherits every SIMDAcrossLanesIntrinsic pattern and adds
+// patterns that fold a following sext_inreg from i8/i16.
+//   baseOpc - instruction name prefix (type suffix appended via !strconcat).
+//   opNode  - the node to match.
+multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc,
+ SDPatternOperator opNode>
+ : SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+// The reduction result is inserted at bsub/hsub of an undefined v16i8 and lane
+// 0 is then read with SMOVvi8to32 / SMOVvi16to32.
+def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
+ (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), i8)),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ (i64 0)))>;
+def : Pat<(i32 (sext_inreg (i32 (vector_extract
+ (opNode (v16i8 V128:$Rn)), (i64 0))), i8)),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ (i64 0)))>;
+def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
+ (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), i16)),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ (i64 0)))>;
+def : Pat<(i32 (sext_inreg (i32 (vector_extract
+ (opNode (v8i16 V128:$Rn)), (i64 0))), i16)),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ (i64 0)))>;
+}
+
+// Unsigned variant: inherits every SIMDAcrossLanesIntrinsic pattern and adds
+// patterns that fold a following 'and' mask.
+//   baseOpc - instruction name prefix (type suffix appended via !strconcat).
+//   opNode  - the node to match.
+multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc,
+ SDPatternOperator opNode>
+ : SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+// NOTE(review): maski8_or_more / maski16_or_more are defined outside this
+// section; presumably they match masks that keep at least the low 8 / 16
+// bits -- confirm. The i32 result is read directly from ssub with no extra
+// instruction.
+def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
+ (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), maski8_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ ssub))>;
+def : Pat<(i32 (and (i32 (vector_extract (opNode (v16i8 V128:$Rn)), (i64 0))),
+ maski8_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ ssub))>;
+def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
+ (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), maski16_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ ssub))>;
+def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))),
+ maski16_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ ssub))>;
+}
+
+// Instantiate the across-lanes patterns per reduction node. The multiclasses
+// reference no "v2i32v" instruction, so each reduction gets an explicit v2i32
+// pattern using the pairwise instruction with $Rn as both operands.
+defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", AArch64saddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(v2i32 (AArch64saddv (v2i32 V64:$Rn))),
+ (ADDPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", AArch64uaddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(v2i32 (AArch64uaddv (v2i32 V64:$Rn))),
+ (ADDPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", AArch64smaxv>;
+def : Pat<(v2i32 (AArch64smaxv (v2i32 V64:$Rn))),
+ (SMAXPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", AArch64sminv>;
+def : Pat<(v2i32 (AArch64sminv (v2i32 V64:$Rn))),
+ (SMINPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", AArch64umaxv>;
+def : Pat<(v2i32 (AArch64umaxv (v2i32 V64:$Rn))),
+ (UMAXPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", AArch64uminv>;
+def : Pat<(v2i32 (AArch64uminv (v2i32 V64:$Rn))),
+ (UMINPv2i32 V64:$Rn, V64:$Rn)>;
+
+// Patterns for a signed long across-lanes intrinsic that returns a scalar.
+//   baseOpc - instruction name prefix (type suffix appended via !strconcat).
+//   intOp   - the intrinsic to match.
+// The instruction result is inserted into an undefined v16i8 at the
+// sub-register matching the widened element (hsub for i8 sources, ssub for
+// i16 sources, dsub for i32 sources) and then moved out as a scalar.
+multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
+ // i8 sources: the result sits at hsub and the intrinsic returns i32, so read
+ // lane 0 with the sign-extending SMOVvi16to32.
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+ (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+ (i64 0)))>;
+
+// i16 sources: the result sits at ssub and is read back from ssub directly.
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+ ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+ ssub))>;
+
+// i32 source: the result sits at dsub and is returned as i64.
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+ dsub))>;
+}
+
+// Patterns for an unsigned long across-lanes intrinsic that returns a scalar.
+//   baseOpc - instruction name prefix (type suffix appended via !strconcat).
+//   intOp   - the intrinsic to match.
+// Unlike the signed variant, no SMOV is used: every result is read back with
+// a plain EXTRACT_SUBREG.
+multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
+ Intrinsic intOp> {
+ // i8 sources: result inserted at hsub, i32 read from ssub.
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+ ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+ ssub))>;
+
+// i16 sources: result inserted at ssub and read from ssub.
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+ ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+ ssub))>;
+
+// i32 source: result inserted at dsub and returned as i64.
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+ dsub))>;
+}
+
+// Instantiate the long across-lanes patterns for the saddlv/uaddlv intrinsics.
+defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
+defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
+
+// The v2i32 source is not covered by the multiclasses above; it is selected as
+// the pairwise long add instead, with the result moved through dsub.
+// The vaddlv_s32 intrinsic gets mapped to SADDLP.
+def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (SADDLPv2i32_v1i64 V64:$Rn), dsub),
+ dsub))>;
+// The vaddlv_u32 intrinsic gets mapped to UADDLP.
+def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (UADDLPv2i32_v1i64 V64:$Rn), dsub),
+ dsub))>;
+
+//------------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//------------------------------------------------------------------------------
+
+// AdvSIMD BIC
+defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>;
+// AdvSIMD ORR
+defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>;
+
+// Aliases that omit the final instruction operand, fixing it to 0
+// (presumably the shift amount -- confirm against
+// SIMDModifiedImmVectorShiftTied). The first group of four puts the arrangement
+// on the register ("$Vd.4h"); the second puts it on the mnemonic ("bic.4h") and
+// passes an extra trailing 0 to InstAlias.
+// NOTE(review): that trailing 0 presumably marks the alias as parse-only (not
+// used when printing) -- confirm against the InstAlias class.
+def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// The same two alias groups for ORR.
+def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// AdvSIMD FMOV
+// Vector FMOV (immediate). The operand class is fpimm8, while the selection
+// pattern matches an AArch64fmov node carrying a plain imm0_255 value.
+def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8,
+ "fmov", ".2d",
+ [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8,
+ "fmov", ".2s",
+ [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8,
+ "fmov", ".4s",
+ [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+// The half-precision forms additionally require the HasFullFP16 predicate.
+let Predicates = [HasNEON, HasFullFP16] in {
+def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8,
+ "fmov", ".4h",
+ [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8,
+ "fmov", ".8h",
+ [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+} // Predicates = [HasNEON, HasFullFP16]
+
+// AdvSIMD MOVI
+
+// EDIT byte mask: scalar
+// Marked rematerializable and as cheap as a move.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
+ [(set FPR64:$Rd, simdimmtype10:$imm8)]>;
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 here.
+def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
+ (MOVID imm0_255:$shift)>;
+
+// All-zeros 64-bit vectors of any integer element type: MOVID with immediate 0.
+def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>;
+
+// All-ones 64-bit vectors: MOVID with immediate 255.
+def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>;
+
+// EDIT byte mask: 2d
+
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 in the pattern
+// Marked rematerializable and as cheap as a move.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
+ simdimmtype10,
+ "movi", ".2d",
+ [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
+
+// All-zeros 128-bit vectors of any integer element type: immediate 0.
+def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+
+// All-ones 128-bit vectors: immediate 255.
+def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+
+// A dup of floating-point zero (fpimm0) is also selected as the all-zeros
+// MOVI.
+def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
+
+// Assembly forms that omit the shift; they map to a shift operand of 0.
+// Arrangement written on the register.
+def : InstAlias<"movi $Vd.4h, $imm",
+                (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.8h, $imm",
+                (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.2s, $imm",
+                (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.4s, $imm",
+                (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// Arrangement written on the mnemonic.
+def : InstAlias<"movi.4h $Vd, $imm",
+                (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.8h $Vd, $imm",
+                (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.2s $Vd, $imm",
+                (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.4s $Vd, $imm",
+                (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// Selection of AArch64movi_shift for each element arrangement.
+def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MOVIv2s_msl : SIMDModifiedImmMoveMSL<
+    0, 0, {1,1,0,?}, V64, "movi", ".2s",
+    [(set (v2i32 V64:$Rd),
+          (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MOVIv4s_msl : SIMDModifiedImmMoveMSL<
+    1, 0, {1,1,0,?}, V128, "movi", ".4s",
+    [(set (v4i32 V128:$Rd),
+          (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+// Per byte: 8b & 16b
+def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<
+    0, 0, 0, 0b1110, V64, imm0_255, "movi", ".8b",
+    [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
+def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<
+    1, 0, 0, 0b1110, V128, imm0_255, "movi", ".16b",
+    [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
+
+// AdvSIMD MVNI
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
+
+// Assembly forms that omit the shift; they map to a shift operand of 0.
+// Arrangement written on the register.
+def : InstAlias<"mvni $Vd.4h, $imm",
+                (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.8h, $imm",
+                (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.2s, $imm",
+                (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.4s, $imm",
+                (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// Arrangement written on the mnemonic.
+def : InstAlias<"mvni.4h $Vd, $imm",
+                (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.8h $Vd, $imm",
+                (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.2s $Vd, $imm",
+                (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.4s $Vd, $imm",
+                (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// Selection of AArch64mvni_shift for each element arrangement.
+def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MVNIv2s_msl : SIMDModifiedImmMoveMSL<
+    0, 1, {1,1,0,?}, V64, "mvni", ".2s",
+    [(set (v2i32 V64:$Rd),
+          (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MVNIv4s_msl : SIMDModifiedImmMoveMSL<
+    1, 1, {1,1,0,?}, V128, "mvni", ".4s",
+    [(set (v4i32 V128:$Rd),
+          (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let hasSideEffects = 0 in {
+  defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">;
+  defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">;
+}
+
+// NOTE: The FMLA/FMLS PatFrags below permute the fma operands: the
+// instruction takes the addend first, whereas the intrinsic takes it last.
+
+// Multiplication commutes and (-x) * y = x * (-y), so several operand
+// arrangements are all valid and each one is listed explicitly.
+
+// FMLA: both orders of the two multiplicands.
+defm : SIMDFPIndexedTiedPatterns<
+    "FMLA", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
+defm : SIMDFPIndexedTiedPatterns<
+    "FMLA", TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
+
+// FMLS: the fneg may sit on either multiplicand, in either operand order.
+defm : SIMDFPIndexedTiedPatterns<
+    "FMLS", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+defm : SIMDFPIndexedTiedPatterns<
+    "FMLS", TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
+defm : SIMDFPIndexedTiedPatterns<
+    "FMLS", TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
+defm : SIMDFPIndexedTiedPatterns<
+    "FMLS", TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
+
+// Patterns that select an indexed FMLS when the fneg has been applied to the
+// vector (or scalar) that is subsequently duplicated or lane-extracted,
+// rather than directly to an fma multiplicand.
+//
+// OpNode is the three-operand fragment to match; the patterns pass it the
+// accumulator ($Rd), the plain multiplicand ($Rn) and the negated
+// dup/extract, in that order.
+//
+// Lane indices use the operand class that matches the element size:
+// VectorIndexS for 32-bit lanes and VectorIndexD for 64-bit lanes, so that a
+// pattern can only match lane numbers its source vector actually has.
+multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
+  // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+  // and DUP scalar.
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+                                             VectorIndexS:$idx))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (v2f32 (AArch64duplane32
+                                      (v4f32 (insert_subvector undef,
+                                                 (v2f32 (fneg V64:$Rm)),
+                                                 (i32 0))),
+                                      VectorIndexS:$idx)))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                               VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+  // and DUP scalar.
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+                                             VectorIndexS:$idx))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
+                               VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (v4f32 (AArch64duplane32
+                                      (v4f32 (insert_subvector undef,
+                                                 (v2f32 (fneg V64:$Rm)),
+                                                 (i32 0))),
+                                      VectorIndexS:$idx)))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                               VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
+  // (DUPLANE from 64-bit would be trivial).
+  // The result names $idx with the same VectorIndexD class it was matched
+  // with (it was previously spelled VectorIndexS in the result only).
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64duplane64 (v2f64 (fneg V128:$Rm)),
+                                             VectorIndexD:$idx))),
+            (FMLSv2i64_indexed
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexD:$idx)>;
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64dup (f64 (fneg FPR64Op:$Rm))))),
+            (FMLSv2i64_indexed V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 (fneg V128:$Rm)),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 (insert_subvector undef,
+                                                    (v2f32 (fneg V64:$Rm)),
+                                                    (i32 0))),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+  // 1 variant for 64-bit scalar version: extract from .2d.
+  // A v2f64 source has two lanes, so the lane index is matched as
+  // VectorIndexD rather than the wider VectorIndexS.
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+                         (vector_extract (v2f64 (fneg V128:$Rm)),
+                                         VectorIndexD:$idx))),
+            (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
+                V128:$Rm, VectorIndexD:$idx)>;
+}
+
+// Instantiate FMLSIndexedAfterNegPatterns for both orders of the two fma
+// multiplicands.
+defm : FMLSIndexedAfterNegPatterns<
+    TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm : FMLSIndexedAfterNegPatterns<
+    TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
+
+defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
+defm FMUL  : SIMDFPIndexed<0, 0b1001, "fmul", fmul>;
+
+// fmul by a duplicated scalar: place the scalar in the low subregister of an
+// otherwise undefined vector register and use the indexed form with lane 0.
+def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+          (FMULv2i32_indexed
+              V64:$Rn,
+              (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+              (i64 0))>;
+def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+          (FMULv4i32_indexed
+              V128:$Rn,
+              (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+              (i64 0))>;
+def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
+          (FMULv2i64_indexed
+              V128:$Rn,
+              (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
+              (i64 0))>;
+
+// Integer by-element multiplies and multiply-accumulates. The accumulating
+// forms are described as add/sub/saturating-add/saturating-sub applied to the
+// accumulator and the corresponding product.
+defm SQDMULH  : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh",
+                              int_aarch64_neon_sqrdmulh>;
+defm MLA : SIMDVectorIndexedHSTied<
+    1, 0b0000, "mla",
+    TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MLS : SIMDVectorIndexedHSTied<
+    1, 0b0100, "mls",
+    TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
+defm SMLAL : SIMDVectorIndexedLongSDTied<
+    0, 0b0010, "smlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL : SIMDVectorIndexedLongSDTied<
+    0, 0b0110, "smlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
+                                     int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
+                                           int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
+                                           int_aarch64_neon_sqsub>;
+defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
+                                          int_aarch64_neon_sqadd>;
+defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
+                                          int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull",
+                                 int_aarch64_neon_sqdmull>;
+defm UMLAL : SIMDVectorIndexedLongSDTied<
+    1, 0b0010, "umlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL : SIMDVectorIndexedLongSDTied<
+    1, 0b0110, "umlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
+                                     int_aarch64_neon_umull>;
+
+// When the second operand of a scalar sqdmull is a lane extracted from a
+// vector, the indexed instruction form can consume the vector directly.
+def : Pat<(int_aarch64_neon_sqdmulls_scalar
+              (i32 FPR32:$Rn),
+              (vector_extract (v4i32 V128:$Vm), VectorIndexS:$idx)),
+          (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">;
+defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">;
+defm SCVTF  : SIMDFPScalarRShift<0, 0b11100, "scvtf">;
+defm UCVTF  : SIMDFPScalarRShift<1, 0b11100, "ucvtf">;
+
+// Selection patterns for the four instructions above. They are written out
+// separately instead of being attached to the instruction definitions
+// because TableGen's type inference cannot cope with one base pattern shared
+// between the fp -> int and int -> fp directions.
+
+// Floating point to fixed point.
+def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
+          (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
+          (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
+                                              vecshiftR64:$imm)),
+          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
+                                              vecshiftR64:$imm)),
+          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+
+// Fixed point to floating point.
+def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
+          (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
+          (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
+                                              vecshiftR64:$imm)),
+          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
+                                              vecshiftR64:$imm)),
+          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+
+// Scalar integer shifts by immediate. The accumulating forms (SRSRA, SSRA,
+// URSRA, USRA) are described as an add of the accumulator and the
+// corresponding shift node.
+defm SHL : SIMDScalarLShiftD<0, 0b01010, "shl", AArch64vshl>;
+defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
+defm SQRSHRN : SIMDScalarRShiftBHS<0, 0b10011, "sqrshrn",
+                                   int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN : SIMDScalarRShiftBHS<1, 0b10001, "sqrshrun",
+                                    int_aarch64_neon_sqrshrun>;
+defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL  : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN : SIMDScalarRShiftBHS<0, 0b10010, "sqshrn",
+                                  int_aarch64_neon_sqshrn>;
+defm SQSHRUN : SIMDScalarRShiftBHS<1, 0b10000, "sqshrun",
+                                   int_aarch64_neon_sqshrun>;
+defm SRI   : SIMDScalarRShiftDTied<1, 0b01000, "sri">;
+defm SRSHR : SIMDScalarRShiftD<0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA : SIMDScalarRShiftDTied<
+    0, 0b00110, "srsra",
+    TriOpFrag<(add node:$LHS, (AArch64srshri node:$MHS, node:$RHS))>>;
+defm SSHR : SIMDScalarRShiftD<0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA : SIMDScalarRShiftDTied<
+    0, 0b00010, "ssra",
+    TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UQRSHRN : SIMDScalarRShiftBHS<1, 0b10011, "uqrshrn",
+                                   int_aarch64_neon_uqrshrn>;
+defm UQSHL  : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN : SIMDScalarRShiftBHS<1, 0b10010, "uqshrn",
+                                  int_aarch64_neon_uqshrn>;
+defm URSHR : SIMDScalarRShiftD<1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA : SIMDScalarRShiftDTied<
+    1, 0b00110, "ursra",
+    TriOpFrag<(add node:$LHS, (AArch64urshri node:$MHS, node:$RHS))>>;
+defm USHR : SIMDScalarRShiftD<1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA : SIMDScalarRShiftDTied<
+    1, 0b00010, "usra",
+    TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))>>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS : SIMDVectorRShiftSD<0, 0b11111, "fcvtzs",
+                                 int_aarch64_neon_vcvtfp2fxs>;
+defm FCVTZU : SIMDVectorRShiftSD<1, 0b11111, "fcvtzu",
+                                 int_aarch64_neon_vcvtfp2fxu>;
+defm SCVTF : SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
+                                  int_aarch64_neon_vcvtfxs2fp>;
+defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
+                                       int_aarch64_neon_rshrn>;
+defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
+// SHRN is described as a truncate of an arithmetic right shift.
+defm SHRN : SIMDVectorRShiftNarrowBHS<
+    0, 0b10000, "shrn",
+    BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
+defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
+// v1i64 vsli is selected to the scalar SLId instruction.
+def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                                        (i32 vecshiftL64:$imm))),
+          (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
+defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
+                                         int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
+                                          int_aarch64_neon_sqrshrun>;
+defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL  : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
+                                        int_aarch64_neon_sqshrn>;
+defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
+                                         int_aarch64_neon_sqshrun>;
+defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
+// v1i64 vsri is selected to the scalar SRId instruction.
+def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                                        (i32 vecshiftR64:$imm))),
+          (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
+defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA : SIMDVectorRShiftBHSDTied<
+    0, 0b00110, "srsra",
+    TriOpFrag<(add node:$LHS, (AArch64srshri node:$MHS, node:$RHS))> >;
+// SSHLL is described as a left shift of the sign-extended source.
+defm SSHLL : SIMDVectorLShiftLongBHSD<
+    0, 0b10100, "sshll",
+    BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>;
+
+defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA : SIMDVectorRShiftBHSDTied<
+    0, 0b00010, "ssra",
+    TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
+                                  int_aarch64_neon_vcvtfxu2fp>;
+defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
+                                         int_aarch64_neon_uqrshrn>;
+defm UQSHL  : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
+                                        int_aarch64_neon_uqshrn>;
+defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA : SIMDVectorRShiftBHSDTied<
+    1, 0b00110, "ursra",
+    TriOpFrag<(add node:$LHS, (AArch64urshri node:$MHS, node:$RHS))> >;
+// USHLL is described as a left shift of the zero-extended source.
+defm USHLL : SIMDVectorLShiftLongBHSD<
+    1, 0b10100, "ushll",
+    BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
+defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA : SIMDVectorRShiftBHSDTied<
+    1, 0b00010, "usra",
+    TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+
+// SHRN patterns for when a logical right shift was used instead of arithmetic
+// (the immediate guarantees no sign bits actually end up in the result so it
+// doesn't matter).
+//
+// In every pattern the result names $imm with the same narrowing-shift
+// operand class that the source matched it with.
+
+// Narrow into a 64-bit result.
+def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn),
+                                     vecshiftR16Narrow:$imm))),
+          (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn),
+                                      vecshiftR32Narrow:$imm))),
+          (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn),
+                                      vecshiftR64Narrow:$imm))),
+          (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
+
+// Narrow and concatenate after an existing 64-bit value $Rd, which is
+// inserted into the dsub subregister of the tied 128-bit operand.
+def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
+                                 (trunc (AArch64vlshr (v8i16 V128:$Rn),
+                                                      vecshiftR16Narrow:$imm)))),
+          (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
+                                 (trunc (AArch64vlshr (v4i32 V128:$Rn),
+                                                      vecshiftR32Narrow:$imm)))),
+          (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR32Narrow:$imm)>;
+// The v2i64 source shift amount is a vecshiftR64Narrow; the result
+// previously spelled it vecshiftR32Narrow, disagreeing with the source.
+def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
+                                 (trunc (AArch64vlshr (v2i64 V128:$Rn),
+                                                      vecshiftR64Narrow:$imm)))),
+          (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR64Narrow:$imm)>;
+
+// Vector sign and zero extensions are implemented with SSHLL and USHLL
+// using a shift amount of 0. Anyexts are implemented as zexts.
+def : Pat<(v8i16 (sext   (v8i8 V64:$Rn))),
+          (SSHLLv8i8_shift V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext   (v8i8 V64:$Rn))),
+          (USHLLv8i8_shift V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))),
+          (USHLLv8i8_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext   (v4i16 V64:$Rn))),
+          (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext   (v4i16 V64:$Rn))),
+          (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))),
+          (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))),
+          (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))),
+          (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))),
+          (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+
+// Also match an extend from the upper half of a 128 bit source register;
+// these select the instruction forms that take the full V128 source.
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (sext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (SSHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (SSHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (SSHLLv4i32_shift V128:$Rn, (i32 0))>;
+
+// Vector shift sxtl aliases: SSHLL of a 64-bit source with shift 0.
+def : InstAlias<"sxtl.8h $dst, $src1",
+                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.8h, $src1.8b",
+                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.4s $dst, $src1",
+                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.4s, $src1.4h",
+                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.2d $dst, $src1",
+                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.2d, $src1.2s",
+                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift sxtl2 aliases: SSHLL of a 128-bit source with shift 0.
+def : InstAlias<"sxtl2.8h $dst, $src1",
+                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
+                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.4s $dst, $src1",
+                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
+                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.2d $dst, $src1",
+                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
+                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// Vector shift uxtl aliases: USHLL of a 64-bit source with shift 0.
+def : InstAlias<"uxtl.8h $dst, $src1",
+                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.8h, $src1.8b",
+                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.4s $dst, $src1",
+                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.4s, $src1.4h",
+                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.2d $dst, $src1",
+                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.2d, $src1.2s",
+                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift uxtl2 aliases: USHLL of a 128-bit source with shift 0.
+def : InstAlias<"uxtl2.8h $dst, $src1",
+                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
+                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.4s $dst, $src1",
+                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
+                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.2d $dst, $src1",
+                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
+                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// These patterns are more complex because floating point loads do not
+// support sign extension.
+// The sign extension has to be added explicitly, and each extension covers
+// only one step: byte-to-half, half-to-word, word-to-doubleword.
+//   SCVTF GPR -> FPR is 9 cycles.
+//   SCVTF FPR -> FPR is 4 cycles.
+//   (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still be faster.
+// However, this is not good for code size.
+
+// 8-bits -> float. 2 sizes step-up.
+// LoadInst is inserted at bsub of an undefined f64, lengthened twice
+// (SSHLLv8i8_shift, then SSHLLv4i16_shift, both with shift 0), and the ssub
+// part of the result is converted with SCVTFv1i32.
+class SExtLoadi8CVTf32Pat<dag AddrPat, dag LoadInst>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 AddrPat)))),
+        (SCVTFv1i32
+          (f32 (EXTRACT_SUBREG
+            (SSHLLv4i16_shift
+              (f64 (EXTRACT_SUBREG
+                (SSHLLv8i8_shift
+                  (INSERT_SUBREG (f64 (IMPLICIT_DEF)), LoadInst, bsub),
+                  0),
+                dsub)),
+              0),
+            ssub)))>,
+    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
+
+// One instantiation per addressing mode.
+def : SExtLoadi8CVTf32Pat<
+    (ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
+    (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
+def : SExtLoadi8CVTf32Pat<
+    (ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
+    (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
+def : SExtLoadi8CVTf32Pat<
+    (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
+    (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : SExtLoadi8CVTf32Pat<
+    (am_unscaled8 GPR64sp:$Rn, simm9:$offset),
+    (LDURBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bits -> float. 1 size step-up.
+// LoadInst is inserted at hsub of an undefined f64, lengthened once with
+// SSHLLv4i16_shift (shift 0), and the ssub part of the result is converted
+// with SCVTFv1i32.
+class SExtLoadi16CVTf32Pat<dag AddrPat, dag LoadInst>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 AddrPat)))),
+        (SCVTFv1i32
+          (f32 (EXTRACT_SUBREG
+            (SSHLLv4i16_shift
+              (INSERT_SUBREG (f64 (IMPLICIT_DEF)), LoadInst, hsub),
+              0),
+            ssub)))>,
+    Requires<[NotForCodeSize]>;
+
+// One instantiation per addressing mode.
+def : SExtLoadi16CVTf32Pat<
+    (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+    (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf32Pat<
+    (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+    (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf32Pat<
+    (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+    (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf32Pat<
+    (am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+    (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bits to 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// A 64-bit integer to 32-bit floating point conversion is not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+// LoadInst is inserted at hsub of an undefined f64, lengthened twice
+// (SSHLLv4i16_shift, then SSHLLv2i32_shift, both with shift 0), and the dsub
+// part of the result is converted with SCVTFv1i64.
+class SExtLoadi16CVTf64Pat<dag AddrPat, dag LoadInst>
+  : Pat<(f64 (sint_to_fp (i32 (sextloadi16 AddrPat)))),
+        (SCVTFv1i64
+          (f64 (EXTRACT_SUBREG
+            (SSHLLv2i32_shift
+              (f64 (EXTRACT_SUBREG
+                (SSHLLv4i16_shift
+                  (INSERT_SUBREG (f64 (IMPLICIT_DEF)), LoadInst, hsub),
+                  0),
+                dsub)),
+              0),
+            dsub)))>,
+    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
+
+// One instantiation per addressing mode.
+def : SExtLoadi16CVTf64Pat<
+    (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+    (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf64Pat<
+    (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+    (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf64Pat<
+    (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+    (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf64Pat<
+    (am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+    (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+// 32-bits -> double. 1 size step-up.
+// LoadInst is inserted at ssub of an undefined f64, lengthened once with
+// SSHLLv2i32_shift (shift 0), and the dsub part of the result is converted
+// with SCVTFv1i64.
+class SExtLoadi32CVTf64Pat<dag AddrPat, dag LoadInst>
+  : Pat<(f64 (sint_to_fp (i32 (load AddrPat)))),
+        (SCVTFv1i64
+          (f64 (EXTRACT_SUBREG
+            (SSHLLv2i32_shift
+              (INSERT_SUBREG (f64 (IMPLICIT_DEF)), LoadInst, ssub),
+              0),
+            dsub)))>,
+    Requires<[NotForCodeSize]>;
+
+// One instantiation per addressing mode.
+def : SExtLoadi32CVTf64Pat<
+    (ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
+    (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
+def : SExtLoadi32CVTf64Pat<
+    (ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext),
+    (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>;
+def : SExtLoadi32CVTf64Pat<
+    (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset),
+    (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : SExtLoadi32CVTf64Pat<
+    (am_unscaled32 GPR64sp:$Rn, simm9:$offset),
+    (LDURSi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD Load-Store Structure
+//----------------------------------------------------------------------------
+
+// Multiple-structure loads.
+defm LD1 : SIMDLd1Multiple<"ld1">;
+defm LD2 : SIMDLd2Multiple<"ld2">;
+defm LD3 : SIMDLd3Multiple<"ld3">;
+defm LD4 : SIMDLd4Multiple<"ld4">;
+
+// Multiple-structure stores.
+defm ST1 : SIMDSt1Multiple<"st1">;
+defm ST2 : SIMDSt2Multiple<"st2">;
+defm ST3 : SIMDSt3Multiple<"st3">;
+defm ST4 : SIMDSt4Multiple<"st4">;
+
+// Selects a plain load of vector type VecTy from a bare GPR64sp base
+// register to the given one-register LD1 instruction.
+class Ld1Pat<ValueType VecTy, Instruction LdInst>
+  : Pat<(VecTy (load GPR64sp:$Rn)),
+        (LdInst GPR64sp:$Rn)>;
+
+// 128-bit integer vector types.
+def : Ld1Pat<v16i8, LD1Onev16b>;
+def : Ld1Pat<v8i16, LD1Onev8h>;
+def : Ld1Pat<v4i32, LD1Onev4s>;
+def : Ld1Pat<v2i64, LD1Onev2d>;
+// 64-bit integer vector types.
+def : Ld1Pat<v8i8,  LD1Onev8b>;
+def : Ld1Pat<v4i16, LD1Onev4h>;
+def : Ld1Pat<v2i32, LD1Onev2s>;
+def : Ld1Pat<v1i64, LD1Onev1d>;
+
+// Selects a plain store of vector type VecTy to a bare GPR64sp base
+// register to the given one-register ST1 instruction.
+class St1Pat<ValueType VecTy, Instruction StInst>
+  : Pat<(store VecTy:$Vt, GPR64sp:$Rn),
+        (StInst VecTy:$Vt, GPR64sp:$Rn)>;
+
+// 128-bit integer vector types.
+def : St1Pat<v16i8, ST1Onev16b>;
+def : St1Pat<v8i16, ST1Onev8h>;
+def : St1Pat<v4i32, ST1Onev4s>;
+def : St1Pat<v2i64, ST1Onev2d>;
+// 64-bit integer vector types.
+def : St1Pat<v8i8,  ST1Onev8b>;
+def : St1Pat<v4i16, ST1Onev4h>;
+def : St1Pat<v2i32, ST1Onev2s>;
+def : St1Pat<v1i64, ST1Onev1d>;
+
+//---
+// Single-element
+//---
+
+// Load-and-replicate forms.
+defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One",   1, 2,  4,  8>;
+defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two",   2, 4,  8, 16>;
+defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
+defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four",  4, 8, 16, 32>;
+
+// Single-lane loads, one defm per structure count and element size.
+let mayLoad = 1, hasSideEffects = 0 in {
+  defm LD1 : SIMDLdSingleBTied<0, 0b000,       "ld1", VecListOneb,   GPR64pi1>;
+  defm LD1 : SIMDLdSingleHTied<0, 0b010, 0,    "ld1", VecListOneh,   GPR64pi2>;
+  defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes,   GPR64pi4>;
+  defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned,   GPR64pi8>;
+  defm LD2 : SIMDLdSingleBTied<1, 0b000,       "ld2", VecListTwob,   GPR64pi2>;
+  defm LD2 : SIMDLdSingleHTied<1, 0b010, 0,    "ld2", VecListTwoh,   GPR64pi4>;
+  defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos,   GPR64pi8>;
+  defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod,   GPR64pi16>;
+  defm LD3 : SIMDLdSingleBTied<0, 0b001,       "ld3", VecListThreeb, GPR64pi3>;
+  defm LD3 : SIMDLdSingleHTied<0, 0b011, 0,    "ld3", VecListThreeh, GPR64pi6>;
+  defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
+  defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
+  defm LD4 : SIMDLdSingleBTied<1, 0b001,       "ld4", VecListFourb,  GPR64pi4>;
+  defm LD4 : SIMDLdSingleHTied<1, 0b011, 0,    "ld4", VecListFourh,  GPR64pi8>;
+  defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours,  GPR64pi16>;
+  defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd,  GPR64pi32>;
+}
+
+// A duplicate of a freshly loaded scalar is selected to the matching LD1R
+// instruction. Integer element types first.
+def : Pat<(v8i8  (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+          (LD1Rv8b GPR64sp:$Rn)>;
+def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+          (LD1Rv16b GPR64sp:$Rn)>;
+def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+          (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+          (LD1Rv8h GPR64sp:$Rn)>;
+def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv1d GPR64sp:$Rn)>;
+// Grab the floating point version too
+def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+          (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+          (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+          (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+          (LD1Rv1d GPR64sp:$Rn)>;
+def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+          (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+          (LD1Rv8h GPR64sp:$Rn)>;
+
+// Selects "insert a loaded scalar into lane $idx of a 128-bit vector" as one
+// single-lane load instruction. The existing vector $Rd is passed through as
+// the instruction's first operand.
+//   LoadOp - the scalar load fragment to match (plain or extending)
+//   IdxTy  - lane-index operand class for the element width
+//   VecTy  - 128-bit vector type; EltTy - scalar type produced by LoadOp
+//   INST   - the single-lane load instruction to emit
+class Ld1Lane128Pat<SDPatternOperator LoadOp, Operand IdxTy,
+                    ValueType VecTy, ValueType EltTy, Instruction INST>
+  : Pat<(vector_insert (VecTy VecListOne128:$Rd),
+                       (EltTy (LoadOp GPR64sp:$Rn)), IdxTy:$idx),
+        (INST VecListOne128:$Rd, IdxTy:$idx, GPR64sp:$Rn)>;
+
+def : Ld1Lane128Pat<extloadi8,  VectorIndexB, v16i8, i32, LD1i8>;
+def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4i32, i32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4f32, f32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2i64, i64, LD1i64>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2f64, f64, LD1i64>;
+def : Ld1Lane128Pat<load,       VectorIndexH, v8f16, f16, LD1i16>;
+
+// 64-bit counterpart of the single-lane load-and-insert pattern. The 64-bit
+// vector is widened with SUBREG_TO_REG on dsub, the single-lane load is applied
+// to the widened value, and the dsub half is extracted again as the result.
+class Ld1Lane64Pat<SDPatternOperator LoadOp, Operand IdxTy,
+                   ValueType VecTy, ValueType EltTy, Instruction INST>
+  : Pat<(vector_insert (VecTy VecListOne64:$Rd),
+                       (EltTy (LoadOp GPR64sp:$Rn)), IdxTy:$idx),
+        (EXTRACT_SUBREG
+            (INST (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+                  IdxTy:$idx, GPR64sp:$Rn),
+            dsub)>;
+
+def : Ld1Lane64Pat<extloadi8,  VectorIndexB, v8i8,  i32, LD1i8>;
+def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2i32, i32, LD1i32>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2f32, f32, LD1i32>;
+def : Ld1Lane64Pat<load,       VectorIndexH, v4f16, f16, LD1i16>;
+
+
+// Alias multiclasses instantiated for the single-lane ld1..ld4 mnemonics.
+defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
+defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
+defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
+defm LD4 : SIMDLdSt4SingleAliases<"ld4">;
+
+// Stores: single-lane ST1 for byte, half, word and doubleword elements.
+defm ST1 : SIMDStSingleB<0, 0b000,       "st1", VecListOneb, GPR64pi1>;
+defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh, GPR64pi2>;
+defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
+defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
+
+// Selects "store lane $idx extracted from a 128-bit vector" as one single-lane
+// store instruction. Every pattern built from this class carries an
+// AddedComplexity of 19.
+let AddedComplexity = 19 in
+class St1Lane128Pat<SDPatternOperator StoreOp, Operand IdxTy,
+                    ValueType VecTy, ValueType EltTy, Instruction INST>
+  : Pat<(StoreOp
+            (EltTy (vector_extract (VecTy VecListOne128:$Vt), IdxTy:$idx)),
+            GPR64sp:$Rn),
+        (INST VecListOne128:$Vt, IdxTy:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane128Pat<truncstorei8,  VectorIndexB, v16i8, i32, ST1i8>;
+def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
+def : St1Lane128Pat<store,         VectorIndexS, v4i32, i32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexS, v4f32, f32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexD, v2i64, i64, ST1i64>;
+def : St1Lane128Pat<store,         VectorIndexD, v2f64, f64, ST1i64>;
+def : St1Lane128Pat<store,         VectorIndexH, v8f16, f16, ST1i16>;
+
+// 64-bit counterpart of the single-lane extract-and-store pattern: the 64-bit
+// vector is widened with SUBREG_TO_REG on dsub before being handed to the
+// single-lane store instruction. AddedComplexity is 19, as for the 128-bit
+// form.
+let AddedComplexity = 19 in
+class St1Lane64Pat<SDPatternOperator StoreOp, Operand IdxTy,
+                   ValueType VecTy, ValueType EltTy, Instruction INST>
+  : Pat<(StoreOp
+            (EltTy (vector_extract (VecTy VecListOne64:$Vt), IdxTy:$idx)),
+            GPR64sp:$Rn),
+        (INST (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+              IdxTy:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane64Pat<truncstorei8,  VectorIndexB, v8i8,  i32, ST1i8>;
+def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
+def : St1Lane64Pat<store,         VectorIndexS, v2i32, i32, ST1i32>;
+def : St1Lane64Pat<store,         VectorIndexS, v2f32, f32, ST1i32>;
+def : St1Lane64Pat<store,         VectorIndexH, v4f16, f16, ST1i16>;
+
+// Post-indexed single-lane stores from a 64-bit vector. Two patterns are
+// produced per instantiation: one for a constant increment equal to Step and
+// one for an increment held in a register.
+multiclass St1LanePost64Pat<SDPatternOperator StoreOp, Operand IdxTy,
+                            ValueType VecTy, ValueType EltTy, Instruction INST,
+                            int Step> {
+  // Constant increment: XZR is passed in the trailing register operand.
+  def : Pat<(StoreOp
+                (EltTy (vector_extract (VecTy VecListOne64:$Vt), IdxTy:$idx)),
+                GPR64sp:$Rn, Step),
+            (INST (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+                  IdxTy:$idx, GPR64sp:$Rn, XZR)>;
+
+  // Register increment: $Rm is passed in the trailing register operand.
+  def : Pat<(StoreOp
+                (EltTy (vector_extract (VecTy VecListOne64:$Vt), IdxTy:$idx)),
+                GPR64sp:$Rn, GPR64:$Rm),
+            (INST (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+                  IdxTy:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost64Pat<post_truncsti8,  VectorIndexB, v8i8,  i32, ST1i8_POST,
+                        1>;
+defm : St1LanePost64Pat<post_truncsti16, VectorIndexH, v4i16, i32, ST1i16_POST,
+                        2>;
+defm : St1LanePost64Pat<post_store,      VectorIndexS, v2i32, i32, ST1i32_POST,
+                        4>;
+defm : St1LanePost64Pat<post_store,      VectorIndexS, v2f32, f32, ST1i32_POST,
+                        4>;
+defm : St1LanePost64Pat<post_store,      VectorIndexD, v1i64, i64, ST1i64_POST,
+                        8>;
+defm : St1LanePost64Pat<post_store,      VectorIndexD, v1f64, f64, ST1i64_POST,
+                        8>;
+defm : St1LanePost64Pat<post_store,      VectorIndexH, v4f16, f16, ST1i16_POST,
+                        2>;
+
+// Post-indexed single-lane stores from a 128-bit vector. Two patterns are
+// produced per instantiation: one for a constant increment equal to Step and
+// one for an increment held in a register.
+multiclass St1LanePost128Pat<SDPatternOperator StoreOp, Operand IdxTy,
+                             ValueType VecTy, ValueType EltTy, Instruction INST,
+                             int Step> {
+  // Constant increment: XZR is passed in the trailing register operand.
+  def : Pat<(StoreOp
+                (EltTy (vector_extract (VecTy VecListOne128:$Vt), IdxTy:$idx)),
+                GPR64sp:$Rn, Step),
+            (INST VecListOne128:$Vt, IdxTy:$idx, GPR64sp:$Rn, XZR)>;
+
+  // Register increment: $Rm is passed in the trailing register operand.
+  def : Pat<(StoreOp
+                (EltTy (vector_extract (VecTy VecListOne128:$Vt), IdxTy:$idx)),
+                GPR64sp:$Rn, GPR64:$Rm),
+            (INST VecListOne128:$Vt, IdxTy:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost128Pat<post_truncsti8,  VectorIndexB, v16i8, i32, ST1i8_POST,
+                         1>;
+defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST,
+                         2>;
+defm : St1LanePost128Pat<post_store,      VectorIndexS, v4i32, i32, ST1i32_POST,
+                         4>;
+defm : St1LanePost128Pat<post_store,      VectorIndexS, v4f32, f32, ST1i32_POST,
+                         4>;
+defm : St1LanePost128Pat<post_store,      VectorIndexD, v2i64, i64, ST1i64_POST,
+                         8>;
+defm : St1LanePost128Pat<post_store,      VectorIndexD, v2f64, f64, ST1i64_POST,
+                         8>;
+defm : St1LanePost128Pat<post_store,      VectorIndexH, v8f16, f16, ST1i16_POST,
+                         2>;
+
+// Single-lane ST2/ST3/ST4 for byte, half, word and doubleword elements. These
+// have no selection patterns in this group, so mayStore is set and
+// hasSideEffects cleared explicitly.
+let mayStore = 1, hasSideEffects = 0 in {
+defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   GPR64pi2>;
+defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   GPR64pi4>;
+defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   GPR64pi8>;
+defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod,   GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb,  GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh,  GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours,  GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd,  GPR64pi32>;
+}
+
+// Alias multiclasses instantiated for the single-lane st1..st4 mnemonics.
+defm ST1 : SIMDLdSt1SingleAliases<"st1">;
+defm ST2 : SIMDLdSt2SingleAliases<"st2">;
+defm ST3 : SIMDLdSt3SingleAliases<"st3">;
+defm ST4 : SIMDLdSt4SingleAliases<"st4">;
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+// AES: each instruction is bound to the aarch64 crypto intrinsic of the same
+// name. AESE/AESD use the tied-operand class, AESMC/AESIMC the plain one.
+def AESErr   : AESTiedInst<0b0100, "aese",   int_aarch64_crypto_aese>;
+def AESDrr   : AESTiedInst<0b0101, "aesd",   int_aarch64_crypto_aesd>;
+def AESMCrr  : AESInst<    0b0110, "aesmc",  int_aarch64_crypto_aesmc>;
+def AESIMCrr : AESInst<    0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+
+// SHA1 / SHA256: three-register forms.
+def SHA1Crrr     : SHATiedInstQSV<0b000, "sha1c",     int_aarch64_crypto_sha1c>;
+def SHA1Prrr     : SHATiedInstQSV<0b001, "sha1p",     int_aarch64_crypto_sha1p>;
+def SHA1Mrrr     : SHATiedInstQSV<0b010, "sha1m",     int_aarch64_crypto_sha1m>;
+def SHA1SU0rrr   : SHATiedInstVVV<0b011, "sha1su0",   int_aarch64_crypto_sha1su0>;
+def SHA256Hrrr   : SHATiedInstQQV<0b100, "sha256h",   int_aarch64_crypto_sha256h>;
+def SHA256H2rrr  : SHATiedInstQQV<0b101, "sha256h2",  int_aarch64_crypto_sha256h2>;
+def SHA256SU1rrr : SHATiedInstVVV<0b110, "sha256su1", int_aarch64_crypto_sha256su1>;
+
+// SHA1 / SHA256: two-register forms.
+def SHA1Hrr     : SHAInstSS<    0b0000, "sha1h",     int_aarch64_crypto_sha1h>;
+def SHA1SU1rr   : SHATiedInstVV<0b0001, "sha1su1",   int_aarch64_crypto_sha1su1>;
+def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0", int_aarch64_crypto_sha256su0>;
+
+//----------------------------------------------------------------------------
+// Compiler-pseudos
+//----------------------------------------------------------------------------
+// FIXME: Like for X86, these should go in their own separate .td file.
+
+// Leaf that accepts a 32-bit GPR value only when isDef32() holds for the node.
+def def32 : PatLeaf<(i32 GPR32:$src), [{
+  return isDef32(*N);
+}]>;
+
+// zext of a def32 value: the producer is known to have zeroed the upper half
+// already, so a SUBREG_TO_REG is sufficient.
+def : Pat<(i64 (zext def32:$src)),
+          (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
+
+// anyext: the high bits are don't-care, so the value is placed into an
+// IMPLICIT_DEF with INSERT_SUBREG.
+def : Pat<(i64 (anyext GPR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+// General zext: perform a 32-bit register move (ORR with WZR), then assert
+// that the extension has happened via SUBREG_TO_REG.
+def : Pat<(i64 (zext GPR32:$src)),
+          (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
+
+// Sign extension uses a signed bitfield move (SBFM). For a 32-bit source the
+// move is performed on the containing 64-bit super-register.
+def : Pat<(i64 (sext GPR32:$src)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32),
+                   0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i8)),  (SBFMXri GPR64:$src, 0, 7)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i1)),  (SBFMXri GPR64:$src, 0, 0)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i8)),  (SBFMWri GPR32:$src, 0, 7)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i1)),  (SBFMWri GPR32:$src, 0, 0)>;
+
+// A left shift of a sign-extended value is folded into one SBFM. Both SBFM
+// immediates are derived from the shift amount by the i32shift_* / i64shift_*
+// transforms.
+
+// Source sign-extended from 8 bits.
+def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn,
+                   (i64 (i32shift_a imm0_31:$imm)),
+                   (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn,
+                   (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+
+// Source sign-extended from 16 bits.
+def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn,
+                   (i64 (i32shift_a imm0_31:$imm)),
+                   (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn,
+                   (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
+// Source is a 32-bit register sign-extended to 64 bits.
+def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
+
+// Plain sra patterns carry an AddedComplexity of 10. The combined sext + sra
+// patterns below must be tried before a lone sra is matched, so they are given
+// a higher AddedComplexity.
+let AddedComplexity = 20 in {
+// Every sext + sra combination that keeps at least one bit of the value being
+// sign-extended is supported, i.e. shift amounts up to bitwidth-1.
+
+// Source sign-extended from 8 bits: shift amounts 0..7.
+def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;
+
+// Source sign-extended from 16 bits: shift amounts 0..15.
+def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;
+
+// Source is a 32-bit register sign-extended to 64 bits: shift amounts 0..31.
+def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 imm0_31:$imm), 31)>;
+} // AddedComplexity = 20
+
+// Truncation from 64 to 32 bits is just a read of the sub_32 subregister.
+def : Pat<(i32 (trunc GPR64sp:$src)),
+          (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;
+
+// __builtin_trap() is lowered to BRK with immediate 1 on AArch64.
+def : Pat<(trap),
+          (BRK 1)>;
+
+// Conversions within AdvSIMD types in the same register size are free.
+// But because we need a consistent lane ordering, in big endian many
+// conversions require one or more REV instructions.
+//
+// Consider a simple memory load followed by a bitconvert then a store.
+// v0 = load v2i32
+// v1 = BITCAST v2i32 v0 to v4i16
+// store v4i16 v1
+//
+// In big endian mode every memory access has an implicit byte swap. LDR and
+// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that
+// is, they treat the vector as a sequence of elements to be byte-swapped.
+// The two pairs of instructions are fundamentally incompatible. We've decided
+// to use LD1/ST1 only to simplify compiler implementation.
+//
+// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes
+// the original code sequence:
+// v0 = load v2i32
+// v1 = REV v2i32 (implicit)
+// v2 = BITCAST v2i32 v1 to v4i16
+// v3 = REV v4i16 v2 (implicit)
+// store v4i16 v3
+//
+// But this is now broken - the value stored is different to the value loaded
+// due to lane reordering. To fix this, on every BITCAST we must perform two
+// other REVs:
+// v0 = load v2i32
+// v1 = REV v2i32 (implicit)
+// v2 = REV v2i32
+// v3 = BITCAST v2i32 v2 to v4i16
+// v4 = REV v4i16
+// v5 = REV v4i16 v4 (implicit)
+// store v4i16 v5
+//
+// This means an extra two instructions, but actually in most cases the two REV
+// instructions can be combined into one. For example:
+// (REV64_2s (REV64_4h X)) === (REV32_4h X)
+//
+// There is also no 128-bit REV instruction. This must be synthesized with an
+// EXT instruction.
+//
+// Most bitconverts require some sort of conversion. The only exceptions are:
+// a) Identity conversions - vNfX <-> vNiX
+// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
+//
+
+// Natural vector casts (64 bit).
+// Each AArch64NvCast below selects to its source register unchanged; only the
+// value type attached to the register differs.
+
+// From v2i32.
+def : Pat<(v8i8  (AArch64NvCast (v2i32 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+// From v4i16.
+def : Pat<(v8i8  (AArch64NvCast (v4i16 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+// From v8i8.
+def : Pat<(v8i8  (AArch64NvCast (v8i8  FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v8i8  FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4f16 (AArch64NvCast (v8i8  FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v8i8  FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v8i8  FPR64:$src))), (v1i64 FPR64:$src)>;
+
+// From f64.
+def : Pat<(v8i8  (AArch64NvCast (f64   FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4f16 (AArch64NvCast (f64   FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (f64   FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f64 (AArch64NvCast (f64   FPR64:$src))), (v1f64 FPR64:$src)>;
+
+// From v2f32.
+def : Pat<(v8i8  (AArch64NvCast (v2f32 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+// Natural vector casts (128 bit): each pattern selects to its source register unchanged. Group 1: from v4i32.
+def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
+// From v8i16.
+def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+// From v16i8.
+def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
+// From v2i64.
+def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+// From v4f32.
+def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
+// From v2f64.
+def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
+
+// Bitconverts between a 64-bit GPR and the 64-bit vector types.
+// Little-endian: a register-class copy is all that is needed.
+let Predicates = [IsLE] in {
+def : Pat<(v8i8  (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+
+def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+}
+// Big-endian: multi-lane types additionally need a REV64 of the lane width.
+// NOTE(review): in the i64-result patterns the REV64 wraps the copy to GPR64
+// rather than being applied to the vector before the copy - confirm that this
+// operand order is intended.
+let Predicates = [IsBE] in {
+def : Pat<(v8i8  (bitconvert GPR64:$Xn)),
+          (REV64v8i8  (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
+          (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
+          (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
+          (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
+          (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+
+def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
+          (REV64v8i8  (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+          (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+}
+// Single-lane 64-bit vectors are a plain copy in either endianness.
+def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
+
+// Scalar int <-> fp bitconverts are register-class copies, as is the read of a
+// single-lane v1f64 as i64.
+def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
+          (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
+def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
+          (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
+def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
+          (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+
+// Bitconverts producing v1i64.
+// Little-endian: the register is reused unchanged.
+let Predicates = [IsLE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+}
+// Big-endian: REV64 over the source lane width.
+let Predicates = [IsBE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))),
+          (v1i64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
+          (v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))),
+          (v1i64 (REV64v8i8  FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
+          (v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
+          (v1i64 (REV64v2i32 FPR64:$src))>;
+}
+// Single 64-bit element sources need no conversion in either endianness.
+def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (f64   FPR64:$src))), (v1i64 FPR64:$src)>;
+
+// Bitconverts producing v2i32.
+// Little-endian: the register is reused unchanged.
+let Predicates = [IsLE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
+}
+// Big-endian: reverse the smaller lanes within the larger container.
+let Predicates = [IsBE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
+          (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))),
+          (v2i32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))),
+          (v2i32 (REV32v8i8  FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))),
+          (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
+          (v2i32 (REV64v2i32 FPR64:$src))>;
+// v4f16 has the same lane layout as v4i16, so it takes the same REV32 over
+// 16-bit lanes (this previously used REV64v4i16, which also swaps the two
+// 32-bit result lanes).
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
+          (v2i32 (REV32v4i16 FPR64:$src))>;
+}
+// Same lane width and count: identity in either endianness.
+def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+
+// Bitconverts producing v4i16.
+// Little-endian: the register is reused unchanged.
+let Predicates = [IsLE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+}
+// Big-endian: reverse the smaller lanes within the larger container.
+let Predicates = [IsBE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))),
+          (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))),
+          (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))),
+          (v4i16 (REV16v8i8  FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))),
+          (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
+          (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
+          (v4i16 (REV64v4i16 FPR64:$src))>;
+}
+// v4f16 <-> v4i16 is an identity conversion (vNfX <-> vNiX): lane width and
+// count match, so no REV is needed in either endianness. The big-endian form
+// previously emitted a REV32v4i16, which swapped adjacent lanes.
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
+
+// Bitconverts producing v4f16.
+// Little-endian: the register is reused unchanged.
+let Predicates = [IsLE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v8i8  FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (f64   FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+}
+// Big-endian: v4f16 has the same lane layout as v4i16, so each source type
+// takes the same REV that the v4i16 result uses for that source.
+let Predicates = [IsBE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
+          (v4f16 (REV64v4i16 FPR64:$src))>;
+// 32-bit source lanes: only the two halves of each 32-bit lane swap. This
+// previously used REV64v4i16.
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
+          (v4f16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v8i8  FPR64:$src))),
+          (v4f16 (REV16v8i8  FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (f64   FPR64:$src))),
+          (v4f16 (REV64v4i16 FPR64:$src))>;
+// Same reasoning as for v2i32 above. This previously used REV64v4i16.
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
+          (v4f16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
+          (v4f16 (REV64v4i16 FPR64:$src))>;
+}
+// v4i16 <-> v4f16 is an identity conversion (vNfX <-> vNiX): lane width and
+// count match, so no REV is needed in either endianness. The big-endian form
+// previously emitted a REV64v4i16, which reversed the lane order.
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+
+
+
+// Bitconverts producing v8i8.
+// Little-endian: the register is reused unchanged.
+let Predicates = [IsLE] in {
+def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (f64   FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>;
+}
+// Big-endian: byte-reverse within each source lane (REV16 / REV32 / REV64
+// according to the source lane width).
+let Predicates = [IsBE] in {
+def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))),
+          (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))),
+          (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))),
+          (v8i8 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (f64   FPR64:$src))),
+          (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))),
+          (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))),
+          (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))),
+          (v8i8 (REV16v8i8 FPR64:$src))>;
+}
+
+// Bitconverts producing scalar f64.
+// Little-endian: the register is reused unchanged.
+let Predicates = [IsLE] in {
+def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v8i8  FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>;
+}
+// Big-endian: REV64 over the source lane width.
+let Predicates = [IsBE] in {
+def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))),
+          (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))),
+          (f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))),
+          (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v8i8  FPR64:$src))),
+          (f64 (REV64v8i8  FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))),
+          (f64 (REV64v4i16 FPR64:$src))>;
+}
+// Single 64-bit element sources need no conversion in either endianness.
+def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
+
+// Bitconverts producing v1f64.
+// Little-endian: the register is reused unchanged.
+let Predicates = [IsLE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
+}
+// Big-endian: REV64 over the source lane width.
+let Predicates = [IsBE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
+          (v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))),
+          (v1f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))),
+          (v1f64 (REV64v8i8  FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
+          (v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
+          (v1f64 (REV64v4i16 FPR64:$src))>;
+}
+// Single 64-bit element sources need no conversion in either endianness.
+def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (f64   FPR64:$src))), (v1f64 FPR64:$src)>;
+
+// Bitconverts producing v2f32.
+// Little-endian: the register is reused unchanged.
+let Predicates = [IsLE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
+}
+// Big-endian: reverse the smaller lanes within the larger container.
+let Predicates = [IsBE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
+          (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))),
+          (v2f32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))),
+          (v2f32 (REV32v8i8  FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
+          (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))),
+          (v2f32 (REV64v2i32 FPR64:$src))>;
+// v4f16 has the same lane layout as v4i16, so it takes the same REV32 over
+// 16-bit lanes (this previously used REV64v4i16, which also swaps the two
+// 32-bit result lanes).
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
+          (v2f32 (REV32v4i16 FPR64:$src))>;
+}
+// Same lane width and count: identity in either endianness.
+def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+
+// Bitconverts producing f128.
+// Little-endian: the register is reused unchanged.
+let Predicates = [IsLE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
+}
+// Big-endian: there is no 128-bit REV, so the reversal is built from a REV64
+// of the source lane width (omitted for 64-bit lanes) followed by an EXT by 8
+// bytes of the value with itself.
+let Predicates = [IsBE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))),
+          (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
+          (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                          (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
+          (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                          (REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
+          (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                          (REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
+          (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
+          (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                          (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))),
+          (f128 (EXTv16i8 (REV64v16i8 FPR128:$src),
+                          (REV64v16i8 FPR128:$src), (i32 8)))>;
+}
+
+let Predicates = [IsLE] in { // Little-endian: bitconvert to v2f64 is a plain retyping of the register.
+def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in { // Big-endian: REV64 at the source lane width; f128 instead uses EXTv16i8 #8.
+def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))),
+ (v2f64 (EXTv16i8 FPR128:$src,
+ FPR128:$src, (i32 8)))>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
+ (v2f64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
+ (v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
+ (v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
+ (v2f64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
+ (v2f64 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; // Not predicated: applies on both endiannesses.
+
+let Predicates = [IsLE] in { // Little-endian: bitconvert to v4f32 is a plain retyping of the register.
+def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in { // Big-endian: REV32 for 8/16-bit sources, REV64v4i32 for 64-bit sources, REV64+EXT for f128.
+def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))),
+ (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+ (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
+ (v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
+ (v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
+ (v4f32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
+ (v4f32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))),
+ (v4f32 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; // Not predicated: applies on both endiannesses.
+
+let Predicates = [IsLE] in { // Little-endian: bitconvert to v2i64 is a plain retyping of the register.
+def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in { // Big-endian: REV64 at the source lane width; f128 instead uses EXTv16i8 #8.
+def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
+ (v2i64 (EXTv16i8 FPR128:$src,
+ FPR128:$src, (i32 8)))>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))),
+ (v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))),
+ (v2i64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
+ (v2i64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
+ (v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
+ (v2i64 (REV64v8i16 FPR128:$src))>;
+}
+def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; // Not predicated: applies on both endiannesses.
+
+let Predicates = [IsLE] in { // Little-endian: bitconvert to v4i32 is a plain retyping of the register.
+def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in { // Big-endian: REV32 for 8/16-bit sources, REV64v4i32 for 64-bit sources, REV64+EXT for f128.
+def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
+ (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+ (REV64v4i32 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))),
+ (v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))),
+ (v4i32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
+ (v4i32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
+ (v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
+ (v4i32 (REV32v8i16 FPR128:$src))>;
+}
+def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; // Not predicated: applies on both endiannesses.
+
+// Little-endian: bitconvert to v8i16 is a plain retyping of the register.
+let Predicates = [IsLE] in {
+def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
+}
+// Big-endian: lanes are reordered with a REV sized by the wider lane type;
+// f128 additionally swaps the two 64-bit halves with EXTv16i8 #8.
+let Predicates = [IsBE] in {
+def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
+ (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))),
+ (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))),
+ (v8i16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))),
+ (v8i16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
+ (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
+ (v8i16 (REV32v8i16 FPR128:$src))>;
+// v8f16 and v8i16 both have 16-bit lanes, so no lane reordering is needed;
+// a REV here would permute the lanes and corrupt the value.
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))),
+ (v8i16 FPR128:$src)>;
+}
+
+// Little-endian: bitconvert to v8f16 is a plain retyping of the register.
+let Predicates = [IsLE] in {
+def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+}
+// Big-endian: lanes are reordered with a REV sized by the wider lane type;
+// f128 additionally swaps the two 64-bit halves with EXTv16i8 #8.
+let Predicates = [IsBE] in {
+def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))),
+ (v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
+ (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
+ (v8f16 (REV32v8i16 FPR128:$src))>;
+// v8i16 and v8f16 both have 16-bit lanes, so no lane reordering is needed;
+// a REV here would permute the lanes and corrupt the value.
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))),
+ (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
+ (v8f16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
+ (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
+ (v8f16 (REV32v8i16 FPR128:$src))>;
+}
+
+let Predicates = [IsLE] in { // Little-endian: bitconvert to v16i8 is a plain retyping of the register.
+def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
+}
+let Predicates = [IsBE] in { // Big-endian: byte-wise REV whose container matches the source lane width (REV16/REV32/REV64); f128 adds EXT #8.
+def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
+ (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src),
+ (REV64v16i8 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))),
+ (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))),
+ (v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))),
+ (v16i8 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
+ (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
+ (v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
+ (v16i8 (REV16v16i8 FPR128:$src))>;
+}
+
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))), // Index 0: the result is just the dsub subregister of the source.
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+
+def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), // Index 1: DUPv2i64lane with lane 1, then take dsub of the result.
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), // NOTE(review): no f16/f32/f64 variants are visible here - confirm whether that is intentional.
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+
+// Inserting a 64-bit subvector at index 0 of an undef 128-bit vector is an
+// INSERT_SUBREG into an IMPLICIT_DEF: a subregister copy, no instruction.
+def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
+// Use the scalar pairwise-add instructions when summing lanes 0 and 1 of a
+// v2i64, v2f64 or v2f32 vector.
+def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
+ (vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
+ (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
+def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
+ (vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
+ (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
+ // vector_extract on 64-bit vectors gets promoted to a 128-bit vector,
+ // so we match on v4f32 here, not v2f32. This will also catch adding
+ // the low two lanes of a true v4f32 vector.
+def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
+ (vector_extract (v4f32 FPR128:$Rn), (i64 1))),
+ (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; // Only the low 64 bits (dsub) are fed to FADDP.
+
+// Scalar i64 NEON shift intrinsics whose operands live in FPR64 registers map 1:1 onto the v1i64 shift instructions.
+def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+
+// Patterns for nontemporal/no-allocate stores.
+// There is no single-register nontemporal store, only the pair form STNP, so
+// each value is split into two halves that are stored together as a pair.
+let Predicates = [IsLE] in {
+let AddedComplexity = 15 in {
+class NTStore128Pat<ValueType VT> : // 128-bit value: low half via dsub, high half via CPYi64 lane 1, stored with STNPDi.
+ Pat<(nontemporalstore (VT FPR128:$Rt),
+ (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+ (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
+ (CPYi64 FPR128:$Rt, (i64 1)),
+ GPR64sp:$Rn, simm7s8:$offset)>;
+
+def : NTStore128Pat<v2i64>;
+def : NTStore128Pat<v4i32>;
+def : NTStore128Pat<v8i16>;
+def : NTStore128Pat<v16i8>;
+
+class NTStore64Pat<ValueType VT> : // 64-bit value: low half via ssub, high half via CPYi32 lane 1, stored with STNPSi.
+ Pat<(nontemporalstore (VT FPR64:$Rt),
+ (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+ (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
+ (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
+ GPR64sp:$Rn, simm7s4:$offset)>;
+
+// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
+def : NTStore64Pat<v1f64>;
+def : NTStore64Pat<v1i64>;
+def : NTStore64Pat<v2i32>;
+def : NTStore64Pat<v4i16>;
+def : NTStore64Pat<v8i8>;
+
+def : Pat<(nontemporalstore GPR64:$Rt, // 64-bit GPR: low word via sub_32, high word via UBFMXri 32..63, stored with STNPWi.
+ (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+ (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+ (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
+ GPR64sp:$Rn, simm7s4:$offset)>;
+} // AddedComplexity = 15
+} // Predicates = [IsLE]
+
+// Tail call return handling. These are compiler pseudo-instructions with an
+// empty pattern list, so they carry no encoding information.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>, // Destination given as an immediate/symbol operand.
+ Sched<[WriteBrReg]>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>, // Destination given in a tcGPR64 register.
+ Sched<[WriteBrReg]>;
+}
+
+def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), // NOTE(review): output names the operand texternalsym although the input is tglobaladdr - confirm intended.
+ (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+
+include "AArch64InstrAtomics.td"
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
new file mode 100644
index 000000000000..20de07424c53
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -0,0 +1,1161 @@
+//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstructionSelector.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "aarch64-isel"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+#include "AArch64GenGlobalISel.inc"
+
+AArch64InstructionSelector::AArch64InstructionSelector(
+ const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
+ const AArch64RegisterBankInfo &RBI)
+ : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), // TII and TRI are obtained from the subtarget.
+ TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+
+// FIXME: This should be target-independent, inferred from the types declared
+// for each class in the bank.
+/// Pick the register class used for a value of type \p Ty that lives on
+/// register bank \p RB.
+/// \returns nullptr when the bank is neither GPR nor FPR, or when the size of
+/// \p Ty has no matching class on that bank.
+static const TargetRegisterClass *
+getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
+                         const RegisterBankInfo &RBI) {
+  switch (RB.getID()) {
+  case AArch64::GPRRegBankID: {
+    const unsigned Bits = Ty.getSizeInBits();
+    // Everything up to 32 bits shares the 32-bit GPR class.
+    if (Bits <= 32)
+      return &AArch64::GPR32RegClass;
+    if (Bits == 64)
+      return &AArch64::GPR64RegClass;
+    return nullptr;
+  }
+
+  case AArch64::FPRRegBankID:
+    // Only exact 32/64/128-bit sizes are mapped on the FPR bank.
+    switch (Ty.getSizeInBits()) {
+    case 32:
+      return &AArch64::FPR32RegClass;
+    case 64:
+      return &AArch64::FPR64RegClass;
+    case 128:
+      return &AArch64::FPR128RegClass;
+    default:
+      return nullptr;
+    }
+
+  default:
+    return nullptr;
+  }
+}
+
+/// Report whether the binary operation \p I is something the selector cannot
+/// handle yet. That is the case when:
+/// - the type of its first operand is not valid,
+/// - any operand is not a virtual register, or has no register bank,
+/// - the operands do not all share one register bank.
+/// These checks should someday live in the verifier; for now they mostly
+/// reflect limitations of the aarch64 selector.
+/// \returns true if \p I is unsupported.
+static bool unsupportedBinOp(const MachineInstr &I,
+                             const AArch64RegisterBankInfo &RBI,
+                             const MachineRegisterInfo &MRI,
+                             const AArch64RegisterInfo &TRI) {
+  const LLT DefTy = MRI.getType(I.getOperand(0).getReg());
+  if (!DefTy.isValid()) {
+    DEBUG(dbgs() << "Generic binop register should be typed\n");
+    return true;
+  }
+
+  // Bank of the operands seen so far; null until the first one is visited.
+  const RegisterBank *CommonBank = nullptr;
+  for (const MachineOperand &Op : I.operands()) {
+    // FIXME: Support non-register operands.
+    if (!Op.isReg()) {
+      DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
+      return true;
+    }
+
+    const unsigned Reg = Op.getReg();
+
+    // FIXME: Can generic operations have physical registers operands? If
+    // so, this will need to be taught about that, and we'll need to get the
+    // bank out of the minimal class for the register.
+    // Either way, this needs to be documented (and possibly verified).
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+      DEBUG(dbgs() << "Generic inst has physical register operand\n");
+      return true;
+    }
+
+    const RegisterBank *Bank = RBI.getRegBank(Reg, MRI, TRI);
+    if (!Bank) {
+      DEBUG(dbgs() << "Generic register has no bank or class\n");
+      return true;
+    }
+
+    if (CommonBank && CommonBank != Bank) {
+      DEBUG(dbgs() << "Generic inst operands have different banks\n");
+      return true;
+    }
+    CommonBank = Bank;
+  }
+
+  return false;
+}
+
+/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
+/// (such as G_OR or G_ADD), appropriate for the register bank \p RegBankID
+/// and of size \p OpSize.
+///
+/// GPR operations of up to 32 bits use the W-register forms and 64-bit ones
+/// the X-register forms; FPR operations are handled for 32 and 64 bits.
+/// \returns \p GenericOpc if the combination is unsupported.
+static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
+                               unsigned OpSize) {
+  switch (RegBankID) {
+  case AArch64::GPRRegBankID:
+    if (OpSize <= 32) {
+      // Narrow divides and right shifts depend on the high bits of the
+      // register, so they must already have been widened to 32 bits.
+      assert((OpSize == 32 || (GenericOpc != TargetOpcode::G_SDIV &&
+                               GenericOpc != TargetOpcode::G_UDIV &&
+                               GenericOpc != TargetOpcode::G_LSHR &&
+                               GenericOpc != TargetOpcode::G_ASHR)) &&
+             "operation should have been legalized before now");
+
+      switch (GenericOpc) {
+      case TargetOpcode::G_OR:
+        return AArch64::ORRWrr;
+      case TargetOpcode::G_XOR:
+        return AArch64::EORWrr;
+      case TargetOpcode::G_AND:
+        return AArch64::ANDWrr;
+      case TargetOpcode::G_ADD:
+        assert(OpSize != 32 && "s32 G_ADD should have been selected");
+        return AArch64::ADDWrr;
+      case TargetOpcode::G_SUB:
+        return AArch64::SUBWrr;
+      case TargetOpcode::G_SHL:
+        return AArch64::LSLVWr;
+      case TargetOpcode::G_LSHR:
+        return AArch64::LSRVWr;
+      case TargetOpcode::G_ASHR:
+        return AArch64::ASRVWr;
+      case TargetOpcode::G_SDIV:
+        return AArch64::SDIVWr;
+      case TargetOpcode::G_UDIV:
+        return AArch64::UDIVWr;
+      default:
+        return GenericOpc;
+      }
+    } else if (OpSize == 64) {
+      switch (GenericOpc) {
+      case TargetOpcode::G_OR:
+        return AArch64::ORRXrr;
+      case TargetOpcode::G_XOR:
+        return AArch64::EORXrr;
+      case TargetOpcode::G_AND:
+        return AArch64::ANDXrr;
+      case TargetOpcode::G_GEP:
+        return AArch64::ADDXrr;
+      case TargetOpcode::G_SUB:
+        return AArch64::SUBXrr;
+      case TargetOpcode::G_SHL:
+        return AArch64::LSLVXr;
+      case TargetOpcode::G_LSHR:
+        return AArch64::LSRVXr;
+      case TargetOpcode::G_ASHR:
+        return AArch64::ASRVXr;
+      case TargetOpcode::G_SDIV:
+        return AArch64::SDIVXr;
+      case TargetOpcode::G_UDIV:
+        return AArch64::UDIVXr;
+      default:
+        return GenericOpc;
+      }
+    }
+    // Unsupported GPR size: report it as such instead of silently falling
+    // through into the FPR table below.
+    break;
+  case AArch64::FPRRegBankID:
+    switch (OpSize) {
+    case 32:
+      switch (GenericOpc) {
+      case TargetOpcode::G_FADD:
+        return AArch64::FADDSrr;
+      case TargetOpcode::G_FSUB:
+        return AArch64::FSUBSrr;
+      case TargetOpcode::G_FMUL:
+        return AArch64::FMULSrr;
+      case TargetOpcode::G_FDIV:
+        return AArch64::FDIVSrr;
+      default:
+        return GenericOpc;
+      }
+    case 64:
+      switch (GenericOpc) {
+      case TargetOpcode::G_FADD:
+        return AArch64::FADDDrr;
+      case TargetOpcode::G_FSUB:
+        return AArch64::FSUBDrr;
+      case TargetOpcode::G_FMUL:
+        return AArch64::FMULDrr;
+      case TargetOpcode::G_FDIV:
+        return AArch64::FDIVDrr;
+      case TargetOpcode::G_OR:
+        return AArch64::ORRv8i8;
+      default:
+        return GenericOpc;
+      }
+    default:
+      break;
+    }
+    break;
+  default:
+    break;
+  }
+  return GenericOpc;
+}
+
+/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
+/// appropriate for the (value) register bank \p RegBankID and of memory access
+/// size \p OpSize. This returns the variant with the base+unsigned-immediate
+/// addressing mode (e.g., LDRXui).
+///
+/// Any \p GenericOpc other than G_STORE is treated as a load.
+/// \returns \p GenericOpc if the combination is unsupported.
+static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
+                                    unsigned OpSize) {
+  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
+  switch (RegBankID) {
+  case AArch64::GPRRegBankID:
+    switch (OpSize) {
+    case 8:
+      return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
+    case 16:
+      return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
+    case 32:
+      return isStore ? AArch64::STRWui : AArch64::LDRWui;
+    case 64:
+      return isStore ? AArch64::STRXui : AArch64::LDRXui;
+    default:
+      break;
+    }
+    // Unsupported GPR size: do not fall through into the FPR table, which
+    // would start matching if it ever gained a size the GPR table lacks.
+    break;
+  case AArch64::FPRRegBankID:
+    switch (OpSize) {
+    case 8:
+      return isStore ? AArch64::STRBui : AArch64::LDRBui;
+    case 16:
+      return isStore ? AArch64::STRHui : AArch64::LDRHui;
+    case 32:
+      return isStore ? AArch64::STRSui : AArch64::LDRSui;
+    case 64:
+      return isStore ? AArch64::STRDui : AArch64::LDRDui;
+    default:
+      break;
+    }
+    break;
+  default:
+    break;
+  }
+  return GenericOpc;
+}
+
+/// Turn \p I into a target COPY whose destination is constrained to a
+/// register class chosen from the destination's bank and size.
+/// Instructions that define a physical register are left untouched.
+/// \returns false if the destination is an FPR value wider than 128 bits or
+/// if it cannot be constrained to the chosen class; true otherwise.
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+                       const RegisterBankInfo &RBI) {
+  const unsigned Dst = I.getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Dst)) {
+    assert(I.isCopy() && "Generic operators do not allow physical registers");
+    return true;
+  }
+
+  const RegisterBank &DstBank = *RBI.getRegBank(Dst, MRI, TRI);
+  const unsigned DstSize = MRI.getType(Dst).getSizeInBits();
+  const unsigned Src = I.getOperand(1).getReg();
+  const unsigned SrcSize = RBI.getSizeInBits(Src, MRI, TRI);
+  (void)SrcSize; // Only read by the assertions below.
+
+  const bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(Src);
+  (void)SrcIsPhys;
+  assert((!SrcIsPhys || I.isCopy()) && "No phys reg on generic operators");
+  assert(
+      (DstSize == SrcSize ||
+       // Copies are a means to set up initial types; the number of
+       // bits may not exactly match.
+       (SrcIsPhys && DstSize <= SrcSize) ||
+       // Copies are a means to copy bits around; as long as we stay
+       // on the same register class, that's fine. Otherwise, that
+       // means we need some SUBREG_TO_REG or AND & co.
+       (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
+      "Copy with different width?!");
+
+  const bool OnFPR = DstBank.getID() == AArch64::FPRRegBankID;
+  assert((DstSize <= 64 || OnFPR) &&
+         "GPRs cannot get more than 64-bit width values");
+
+  const TargetRegisterClass *DstRC = nullptr;
+  if (!OnFPR) {
+    assert(DstBank.getID() == AArch64::GPRRegBankID &&
+           "Bitcast for the flags?");
+    if (DstSize <= 32)
+      DstRC = &AArch64::GPR32allRegClass;
+    else
+      DstRC = &AArch64::GPR64allRegClass;
+  } else if (DstSize > 128) {
+    DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n');
+    return false;
+  } else if (DstSize > 64) {
+    DstRC = &AArch64::FPR128RegClass;
+  } else if (DstSize > 32) {
+    DstRC = &AArch64::FPR64RegClass;
+  } else {
+    DstRC = &AArch64::FPR32RegClass;
+  }
+
+  // The source register is deliberately left unconstrained: it gets
+  // constrained when another of its uses or its def is selected.
+  // Copies themselves impose no constraints.
+  if (!RBI.constrainGenericRegister(Dst, *DstRC, MRI)) {
+    DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+                 << " operand\n");
+    return false;
+  }
+
+  I.setDesc(TII.get(AArch64::COPY));
+  return true;
+}
+
+/// Select the AArch64 opcode for the int<->FP conversion \p GenericOpc
+/// (G_SITOFP, G_UITOFP, G_FPTOSI or G_FPTOUI) going from \p SrcTy to
+/// \p DstTy. Only scalar types of 32 or 64 bits on both sides are handled.
+/// \returns \p GenericOpc if the combination is unsupported.
+static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
+  if (!DstTy.isScalar() || !SrcTy.isScalar())
+    return GenericOpc;
+
+  const unsigned DstBits = DstTy.getSizeInBits();
+  const unsigned SrcBits = SrcTy.getSizeInBits();
+
+  const bool SizesOk = (DstBits == 32 || DstBits == 64) &&
+                       (SrcBits == 32 || SrcBits == 64);
+  if (!SizesOk)
+    return GenericOpc;
+
+  const bool Dst32 = DstBits == 32;
+  const bool Src32 = SrcBits == 32;
+
+  switch (GenericOpc) {
+  case TargetOpcode::G_SITOFP:
+    if (Dst32)
+      return Src32 ? AArch64::SCVTFUWSri : AArch64::SCVTFUXSri;
+    return Src32 ? AArch64::SCVTFUWDri : AArch64::SCVTFUXDri;
+
+  case TargetOpcode::G_UITOFP:
+    if (Dst32)
+      return Src32 ? AArch64::UCVTFUWSri : AArch64::UCVTFUXSri;
+    return Src32 ? AArch64::UCVTFUWDri : AArch64::UCVTFUXDri;
+
+  case TargetOpcode::G_FPTOSI:
+    if (Dst32)
+      return Src32 ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUWDr;
+    return Src32 ? AArch64::FCVTZSUXSr : AArch64::FCVTZSUXDr;
+
+  case TargetOpcode::G_FPTOUI:
+    if (Dst32)
+      return Src32 ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUWDr;
+    return Src32 ? AArch64::FCVTZUUXSr : AArch64::FCVTZUUXDr;
+
+  default:
+    return GenericOpc;
+  }
+}
+
+/// Translate the integer comparison predicate \p P into the AArch64
+/// condition code that tests it. Any predicate without an entry below
+/// (including every FP predicate) hits llvm_unreachable.
+static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
+  switch (P) {
+  // Equality.
+  case CmpInst::ICMP_EQ: return AArch64CC::EQ;
+  case CmpInst::ICMP_NE: return AArch64CC::NE;
+
+  // Signed orderings.
+  case CmpInst::ICMP_SGT: return AArch64CC::GT;
+  case CmpInst::ICMP_SGE: return AArch64CC::GE;
+  case CmpInst::ICMP_SLT: return AArch64CC::LT;
+  case CmpInst::ICMP_SLE: return AArch64CC::LE;
+
+  // Unsigned orderings.
+  case CmpInst::ICMP_UGT: return AArch64CC::HI;
+  case CmpInst::ICMP_UGE: return AArch64CC::HS;
+  case CmpInst::ICMP_ULT: return AArch64CC::LO;
+  case CmpInst::ICMP_ULE: return AArch64CC::LS;
+
+  default:
+    llvm_unreachable("Unknown condition code!");
+  }
+}
+
+/// Translate the floating-point comparison predicate \p P into AArch64
+/// condition codes. \p CondCode always receives a condition; \p CondCode2 is
+/// AArch64CC::AL unless the predicate needs a second condition (FCMP_ONE and
+/// FCMP_UEQ), in which case it receives that second condition.
+/// Predicates without an entry below hit llvm_unreachable.
+static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
+                                      AArch64CC::CondCode &CondCode,
+                                      AArch64CC::CondCode &CondCode2) {
+  // Default: a single condition is enough.
+  CondCode2 = AArch64CC::AL;
+
+  switch (P) {
+  // Ordered predicates.
+  case CmpInst::FCMP_OEQ: CondCode = AArch64CC::EQ; break;
+  case CmpInst::FCMP_OGT: CondCode = AArch64CC::GT; break;
+  case CmpInst::FCMP_OGE: CondCode = AArch64CC::GE; break;
+  case CmpInst::FCMP_OLT: CondCode = AArch64CC::MI; break;
+  case CmpInst::FCMP_OLE: CondCode = AArch64CC::LS; break;
+  case CmpInst::FCMP_ONE:
+    // Needs two conditions: MI, plus GT.
+    CondCode = AArch64CC::MI;
+    CondCode2 = AArch64CC::GT;
+    break;
+  case CmpInst::FCMP_ORD: CondCode = AArch64CC::VC; break;
+
+  // Unordered predicates.
+  case CmpInst::FCMP_UNO: CondCode = AArch64CC::VS; break;
+  case CmpInst::FCMP_UEQ:
+    // Needs two conditions: EQ, plus VS.
+    CondCode = AArch64CC::EQ;
+    CondCode2 = AArch64CC::VS;
+    break;
+  case CmpInst::FCMP_UGT: CondCode = AArch64CC::HI; break;
+  case CmpInst::FCMP_UGE: CondCode = AArch64CC::PL; break;
+  case CmpInst::FCMP_ULT: CondCode = AArch64CC::LT; break;
+  case CmpInst::FCMP_ULE: CondCode = AArch64CC::LE; break;
+  case CmpInst::FCMP_UNE: CondCode = AArch64CC::NE; break;
+
+  default:
+    llvm_unreachable("Unknown FP condition!");
+  }
+}
+
+bool AArch64InstructionSelector::select(MachineInstr &I) const {
+ assert(I.getParent() && "Instruction should be in a basic block!");
+ assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Opcode = I.getOpcode();
+ if (!isPreISelGenericOpcode(I.getOpcode())) {
+ // Certain non-generic instructions also need some special handling.
+
+ if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+
+ if (Opcode == TargetOpcode::PHI) {
+ const unsigned DefReg = I.getOperand(0).getReg();
+ const LLT DefTy = MRI.getType(DefReg);
+
+ const TargetRegisterClass *DefRC = nullptr;
+ if (TargetRegisterInfo::isPhysicalRegister(DefReg)) {
+ DefRC = TRI.getRegClass(DefReg);
+ } else {
+ const RegClassOrRegBank &RegClassOrBank =
+ MRI.getRegClassOrRegBank(DefReg);
+
+ DefRC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+ if (!DefRC) {
+ if (!DefTy.isValid()) {
+ DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
+ return false;
+ }
+ const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
+ DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
+ if (!DefRC) {
+ DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
+ return false;
+ }
+ }
+ }
+
+ return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
+ }
+
+ if (I.isCopy())
+ return selectCopy(I, TII, MRI, TRI, RBI);
+
+ return true;
+ }
+
+
+ if (I.getNumOperands() != I.getNumExplicitOperands()) {
+ DEBUG(dbgs() << "Generic instruction has unexpected implicit operands\n");
+ return false;
+ }
+
+ if (selectImpl(I))
+ return true;
+
+ LLT Ty =
+ I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
+
+ switch (Opcode) {
+ case TargetOpcode::G_BRCOND: {
+ if (Ty.getSizeInBits() > 32) {
+ // We shouldn't need this on AArch64, but it would be implemented as an
+ // EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the
+ // bit being tested is < 32.
+ DEBUG(dbgs() << "G_BRCOND has type: " << Ty
+ << ", expected at most 32-bits");
+ return false;
+ }
+
+ const unsigned CondReg = I.getOperand(0).getReg();
+ MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+
+ auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
+ .addUse(CondReg)
+ .addImm(/*bit offset=*/0)
+ .addMBB(DestMBB);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
+ }
+
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_CONSTANT: {
+ const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
+
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+ const LLT p0 = LLT::pointer(0, 64);
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ const LLT DefTy = MRI.getType(DefReg);
+ const unsigned DefSize = DefTy.getSizeInBits();
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ // FIXME: Redundant check, but even less readable when factored out.
+ if (isFP) {
+ if (Ty != s32 && Ty != s64) {
+ DEBUG(dbgs() << "Unable to materialize FP " << Ty
+ << " constant, expected: " << s32 << " or " << s64
+ << '\n');
+ return false;
+ }
+
+ if (RB.getID() != AArch64::FPRRegBankID) {
+ DEBUG(dbgs() << "Unable to materialize FP " << Ty
+ << " constant on bank: " << RB << ", expected: FPR\n");
+ return false;
+ }
+ } else {
+ if (Ty != s32 && Ty != s64 && Ty != p0) {
+ DEBUG(dbgs() << "Unable to materialize integer " << Ty
+ << " constant, expected: " << s32 << ", " << s64 << ", or "
+ << p0 << '\n');
+ return false;
+ }
+
+ if (RB.getID() != AArch64::GPRRegBankID) {
+ DEBUG(dbgs() << "Unable to materialize integer " << Ty
+ << " constant on bank: " << RB << ", expected: GPR\n");
+ return false;
+ }
+ }
+
+ const unsigned MovOpc =
+ DefSize == 32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
+
+ I.setDesc(TII.get(MovOpc));
+
+ if (isFP) {
+ const TargetRegisterClass &GPRRC =
+ DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
+ const TargetRegisterClass &FPRRC =
+ DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass;
+
+ const unsigned DefGPRReg = MRI.createVirtualRegister(&GPRRC);
+ MachineOperand &RegOp = I.getOperand(0);
+ RegOp.setReg(DefGPRReg);
+
+ BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(),
+ TII.get(AArch64::COPY))
+ .addDef(DefReg)
+ .addUse(DefGPRReg);
+
+ if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
+ return false;
+ }
+
+ MachineOperand &ImmOp = I.getOperand(1);
+ // FIXME: Is going through int64_t always correct?
+ ImmOp.ChangeToImmediate(
+ ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
+ } else {
+ uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
+ I.getOperand(1).ChangeToImmediate(Val);
+ }
+
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return true;
+ }
+
+ case TargetOpcode::G_FRAME_INDEX: {
+ // allocas and G_FRAME_INDEX are only supported in addrspace(0).
+ if (Ty != LLT::pointer(0, 64)) {
+ DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
+ << ", expected: " << LLT::pointer(0, 64) << '\n');
+ return false;
+ }
+
+ I.setDesc(TII.get(AArch64::ADDXri));
+
+ // MOs for a #0 shifted immediate.
+ I.addOperand(MachineOperand::CreateImm(0));
+ I.addOperand(MachineOperand::CreateImm(0));
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
+ case TargetOpcode::G_GLOBAL_VALUE: {
+ auto GV = I.getOperand(1).getGlobal();
+ if (GV->isThreadLocal()) {
+ // FIXME: we don't support TLS yet.
+ return false;
+ }
+ unsigned char OpFlags = STI.ClassifyGlobalReference(GV, TM);
+ if (OpFlags & AArch64II::MO_GOT) {
+ I.setDesc(TII.get(AArch64::LOADgot));
+ I.getOperand(1).setTargetFlags(OpFlags);
+ } else {
+ I.setDesc(TII.get(AArch64::MOVaddr));
+ I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
+ MachineInstrBuilder MIB(MF, I);
+ MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
+ OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ }
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE: {
+ LLT MemTy = Ty;
+ LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
+
+ if (PtrTy != LLT::pointer(0, 64)) {
+ DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
+ << ", expected: " << LLT::pointer(0, 64) << '\n');
+ return false;
+ }
+
+#ifndef NDEBUG
+ // Sanity-check the pointer register.
+ const unsigned PtrReg = I.getOperand(1).getReg();
+ const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
+ assert(PtrRB.getID() == AArch64::GPRRegBankID &&
+ "Load/Store pointer operand isn't a GPR");
+ assert(MRI.getType(PtrReg).isPointer() &&
+ "Load/Store pointer operand isn't a pointer");
+#endif
+
+ const unsigned ValReg = I.getOperand(0).getReg();
+ const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
+
+ const unsigned NewOpc =
+ selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemTy.getSizeInBits());
+ if (NewOpc == I.getOpcode())
+ return false;
+
+ I.setDesc(TII.get(NewOpc));
+
+ I.addOperand(MachineOperand::CreateImm(0));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
+ case TargetOpcode::G_MUL: {
+ // Reject the various things we don't support yet.
+ if (unsupportedBinOp(I, RBI, MRI, TRI))
+ return false;
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ if (RB.getID() != AArch64::GPRRegBankID) {
+ DEBUG(dbgs() << "G_MUL on bank: " << RB << ", expected: GPR\n");
+ return false;
+ }
+
+ unsigned ZeroReg;
+ unsigned NewOpc;
+ if (Ty.isScalar() && Ty.getSizeInBits() <= 32) {
+ NewOpc = AArch64::MADDWrrr;
+ ZeroReg = AArch64::WZR;
+ } else if (Ty == LLT::scalar(64)) {
+ NewOpc = AArch64::MADDXrrr;
+ ZeroReg = AArch64::XZR;
+ } else {
+ DEBUG(dbgs() << "G_MUL has type: " << Ty << ", expected: "
+ << LLT::scalar(32) << " or " << LLT::scalar(64) << '\n');
+ return false;
+ }
+
+ I.setDesc(TII.get(NewOpc));
+
+ I.addOperand(MachineOperand::CreateReg(ZeroReg, /*isDef=*/false));
+
+ // Now that we selected an opcode, we need to constrain the register
+ // operands to use appropriate classes.
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_GEP: {
+ // Reject the various things we don't support yet.
+ if (unsupportedBinOp(I, RBI, MRI, TRI))
+ return false;
+
+ const unsigned OpSize = Ty.getSizeInBits();
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
+ if (NewOpc == I.getOpcode())
+ return false;
+
+ I.setDesc(TII.get(NewOpc));
+ // FIXME: Should the type be always reset in setDesc?
+
+ // Now that we selected an opcode, we need to constrain the register
+ // operands to use appropriate classes.
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
+ case TargetOpcode::G_PTRTOINT:
+ case TargetOpcode::G_TRUNC: {
+ const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+
+ if (DstRB.getID() != SrcRB.getID()) {
+ DEBUG(dbgs() << "G_TRUNC input/output on different banks\n");
+ return false;
+ }
+
+ if (DstRB.getID() == AArch64::GPRRegBankID) {
+ const TargetRegisterClass *DstRC =
+ getRegClassForTypeOnBank(DstTy, DstRB, RBI);
+ if (!DstRC)
+ return false;
+
+ const TargetRegisterClass *SrcRC =
+ getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
+ if (!SrcRC)
+ return false;
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ if (DstRC == SrcRC) {
+ // Nothing to be done
+ } else if (DstRC == &AArch64::GPR32RegClass &&
+ SrcRC == &AArch64::GPR64RegClass) {
+ I.getOperand(1).setSubReg(AArch64::sub_32);
+ } else {
+ return false;
+ }
+
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ return true;
+ } else if (DstRB.getID() == AArch64::FPRRegBankID) {
+ if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) {
+ I.setDesc(TII.get(AArch64::XTNv4i16));
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ case TargetOpcode::G_ANYEXT: {
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
+ if (RBDst.getID() != AArch64::GPRRegBankID) {
+ DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst << ", expected: GPR\n");
+ return false;
+ }
+
+ const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
+ if (RBSrc.getID() != AArch64::GPRRegBankID) {
+ DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc << ", expected: GPR\n");
+ return false;
+ }
+
+ const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+
+ if (DstSize == 0) {
+ DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
+ return false;
+ }
+
+ if (DstSize != 64 && DstSize > 32) {
+ DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
+ << ", expected: 32 or 64\n");
+ return false;
+ }
+ // At this point G_ANYEXT is just like a plain COPY, but we need
+ // to explicitly form the 64-bit value if any.
+ if (DstSize > 32) {
+ unsigned ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
+ .addDef(ExtSrc)
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(AArch64::sub_32);
+ I.getOperand(1).setReg(ExtSrc);
+ }
+ return selectCopy(I, TII, MRI, TRI, RBI);
+ }
+
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_SEXT: {
+ unsigned Opcode = I.getOpcode();
+ const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
+ SrcTy = MRI.getType(I.getOperand(1).getReg());
+ const bool isSigned = Opcode == TargetOpcode::G_SEXT;
+ const unsigned DefReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ if (RB.getID() != AArch64::GPRRegBankID) {
+ DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB
+ << ", expected: GPR\n");
+ return false;
+ }
+
+ MachineInstr *ExtI;
+ if (DstTy == LLT::scalar(64)) {
+ // FIXME: Can we avoid manually doing this?
+ if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
+ << " operand\n");
+ return false;
+ }
+
+ const unsigned SrcXReg =
+ MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
+ .addDef(SrcXReg)
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(AArch64::sub_32);
+
+ const unsigned NewOpc = isSigned ? AArch64::SBFMXri : AArch64::UBFMXri;
+ ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
+ .addDef(DefReg)
+ .addUse(SrcXReg)
+ .addImm(0)
+ .addImm(SrcTy.getSizeInBits() - 1);
+ } else if (DstTy.isScalar() && DstTy.getSizeInBits() <= 32) {
+ const unsigned NewOpc = isSigned ? AArch64::SBFMWri : AArch64::UBFMWri;
+ ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
+ .addDef(DefReg)
+ .addUse(SrcReg)
+ .addImm(0)
+ .addImm(SrcTy.getSizeInBits() - 1);
+ } else {
+ return false;
+ }
+
+ constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP:
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI: {
+ const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
+ SrcTy = MRI.getType(I.getOperand(1).getReg());
+ const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
+ if (NewOpc == Opcode)
+ return false;
+
+ I.setDesc(TII.get(NewOpc));
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+
+ return true;
+ }
+
+
+ case TargetOpcode::G_INTTOPTR:
+ case TargetOpcode::G_BITCAST:
+ return selectCopy(I, TII, MRI, TRI, RBI);
+
+ case TargetOpcode::G_FPEXT: {
+ if (MRI.getType(I.getOperand(0).getReg()) != LLT::scalar(64)) {
+ DEBUG(dbgs() << "G_FPEXT to type " << Ty
+ << ", expected: " << LLT::scalar(64) << '\n');
+ return false;
+ }
+
+ if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(32)) {
+ DEBUG(dbgs() << "G_FPEXT from type " << Ty
+ << ", expected: " << LLT::scalar(32) << '\n');
+ return false;
+ }
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ if (RB.getID() != AArch64::FPRRegBankID) {
+ DEBUG(dbgs() << "G_FPEXT on bank: " << RB << ", expected: FPR\n");
+ return false;
+ }
+
+ I.setDesc(TII.get(AArch64::FCVTDSr));
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+
+ return true;
+ }
+
+ case TargetOpcode::G_FPTRUNC: {
+ if (MRI.getType(I.getOperand(0).getReg()) != LLT::scalar(32)) {
+ DEBUG(dbgs() << "G_FPTRUNC to type " << Ty
+ << ", expected: " << LLT::scalar(32) << '\n');
+ return false;
+ }
+
+ if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(64)) {
+ DEBUG(dbgs() << "G_FPTRUNC from type " << Ty
+ << ", expected: " << LLT::scalar(64) << '\n');
+ return false;
+ }
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ if (RB.getID() != AArch64::FPRRegBankID) {
+ DEBUG(dbgs() << "G_FPTRUNC on bank: " << RB << ", expected: FPR\n");
+ return false;
+ }
+
+ I.setDesc(TII.get(AArch64::FCVTSDr));
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+
+ return true;
+ }
+
+ case TargetOpcode::G_SELECT: {
+ if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
+ DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
+ << ", expected: " << LLT::scalar(1) << '\n');
+ return false;
+ }
+
+ const unsigned CondReg = I.getOperand(1).getReg();
+ const unsigned TReg = I.getOperand(2).getReg();
+ const unsigned FReg = I.getOperand(3).getReg();
+
+ unsigned CSelOpc = 0;
+
+ if (Ty == LLT::scalar(32)) {
+ CSelOpc = AArch64::CSELWr;
+ } else if (Ty == LLT::scalar(64)) {
+ CSelOpc = AArch64::CSELXr;
+ } else {
+ return false;
+ }
+
+ MachineInstr &TstMI =
+ *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
+ .addDef(AArch64::WZR)
+ .addUse(CondReg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+
+ MachineInstr &CSelMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CSelOpc))
+ .addDef(I.getOperand(0).getReg())
+ .addUse(TReg)
+ .addUse(FReg)
+ .addImm(AArch64CC::NE);
+
+ constrainSelectedInstRegOperands(TstMI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(CSelMI, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+ }
+ case TargetOpcode::G_ICMP: {
+ if (Ty != LLT::scalar(1)) {
+ DEBUG(dbgs() << "G_ICMP result has type: " << Ty
+ << ", expected: " << LLT::scalar(1) << '\n');
+ return false;
+ }
+
+ unsigned CmpOpc = 0;
+ unsigned ZReg = 0;
+
+ LLT CmpTy = MRI.getType(I.getOperand(2).getReg());
+ if (CmpTy == LLT::scalar(32)) {
+ CmpOpc = AArch64::SUBSWrr;
+ ZReg = AArch64::WZR;
+ } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) {
+ CmpOpc = AArch64::SUBSXrr;
+ ZReg = AArch64::XZR;
+ } else {
+ return false;
+ }
+
+ const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
+ (CmpInst::Predicate)I.getOperand(1).getPredicate());
+
+ MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
+ .addDef(ZReg)
+ .addUse(I.getOperand(2).getReg())
+ .addUse(I.getOperand(3).getReg());
+
+ MachineInstr &CSetMI =
+ *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
+ .addDef(I.getOperand(0).getReg())
+ .addUse(AArch64::WZR)
+ .addUse(AArch64::WZR)
+ .addImm(CC);
+
+ constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ case TargetOpcode::G_FCMP: {
+ if (Ty != LLT::scalar(1)) {
+ DEBUG(dbgs() << "G_FCMP result has type: " << Ty
+ << ", expected: " << LLT::scalar(1) << '\n');
+ return false;
+ }
+
+ unsigned CmpOpc = 0;
+ LLT CmpTy = MRI.getType(I.getOperand(2).getReg());
+ if (CmpTy == LLT::scalar(32)) {
+ CmpOpc = AArch64::FCMPSrr;
+ } else if (CmpTy == LLT::scalar(64)) {
+ CmpOpc = AArch64::FCMPDrr;
+ } else {
+ return false;
+ }
+
+ // FIXME: regbank
+
+ AArch64CC::CondCode CC1, CC2;
+ changeFCMPPredToAArch64CC(
+ (CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2);
+
+ MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
+ .addUse(I.getOperand(2).getReg())
+ .addUse(I.getOperand(3).getReg());
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ unsigned Def1Reg = DefReg;
+ if (CC2 != AArch64CC::AL)
+ Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+
+ MachineInstr &CSetMI =
+ *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
+ .addDef(Def1Reg)
+ .addUse(AArch64::WZR)
+ .addUse(AArch64::WZR)
+ .addImm(CC1);
+
+ if (CC2 != AArch64CC::AL) {
+ unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ MachineInstr &CSet2MI =
+ *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
+ .addDef(Def2Reg)
+ .addUse(AArch64::WZR)
+ .addUse(AArch64::WZR)
+ .addImm(CC2);
+ MachineInstr &OrMI =
+ *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr))
+ .addDef(DefReg)
+ .addUse(Def1Reg)
+ .addUse(Def2Reg);
+ constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI);
+ }
+
+ constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h
new file mode 100644
index 000000000000..0d44e696ac20
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h
@@ -0,0 +1,47 @@
+//===- AArch64InstructionSelector --------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the InstructionSelector class for
+/// AArch64.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
+
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+
+namespace llvm {
+class AArch64InstrInfo;
+class AArch64RegisterBankInfo;
+class AArch64RegisterInfo;
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+class AArch64InstructionSelector : public InstructionSelector {
+public:
+ AArch64InstructionSelector(const AArch64TargetMachine &TM,
+ const AArch64Subtarget &STI,
+ const AArch64RegisterBankInfo &RBI);
+ /// Select a target instruction for \p I; returns true on success.
+ bool select(MachineInstr &I) const override;
+
+private:
+ /// tblgen-erated 'select' implementation, used as the initial selector for
+ /// the patterns that don't require complex C++.
+ bool selectImpl(MachineInstr &I) const;
+ // References (not owned) to the target objects used during selection.
+ const AArch64TargetMachine &TM;
+ const AArch64Subtarget &STI;
+ const AArch64InstrInfo &TII;
+ const AArch64RegisterInfo &TRI;
+ const AArch64RegisterBankInfo &RBI;
+};
+
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
new file mode 100644
index 000000000000..83f276a8161b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -0,0 +1,204 @@
+//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the LegalizerInfo class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64LegalizerInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetOpcodes.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AArch64LegalizerInfo::AArch64LegalizerInfo() { // Registers one action per {opcode, [type index,] type} aspect, then calls computeTables().
+ using namespace TargetOpcode;
+ const LLT p0 = LLT::pointer(0, 64); // 64-bit pointer in address space 0
+ const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+ const LLT v2s32 = LLT::vector(2, 32); // vNsM: N elements of M bits each
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ for (auto BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
+ // These operations naturally get the right answer when used on
+ // GPR32, even if the actual type is narrower.
+ for (auto Ty : {s1, s8, s16, s32, s64, v2s32, v4s32, v2s64})
+ setAction({BinOp, Ty}, Legal);
+ }
+
+ setAction({G_GEP, p0}, Legal); // NOTE(review): the two-element form presumably means type index 0 -- confirm
+ setAction({G_GEP, 1, s64}, Legal); // Type index 1 is legal only as s64 ...
+
+ for (auto Ty : {s1, s8, s16, s32})
+ setAction({G_GEP, 1, Ty}, WidenScalar); // ... narrower scalars at index 1 are widened.
+
+ for (auto BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) { // Unlike the ops above, narrow types are not legal here.
+ for (auto Ty : {s32, s64})
+ setAction({BinOp, Ty}, Legal);
+
+ for (auto Ty : {s1, s8, s16})
+ setAction({BinOp, Ty}, WidenScalar);
+ }
+
+ for (auto BinOp : { G_SREM, G_UREM }) // Integer remainders get the Lower action for every scalar size.
+ for (auto Ty : { s1, s8, s16, s32, s64 })
+ setAction({BinOp, Ty}, Lower);
+
+ for (auto Op : { G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULO, G_UMULO }) { // Carry/overflow arithmetic.
+ for (auto Ty : { s32, s64 })
+ setAction({Op, Ty}, Legal);
+
+ setAction({Op, 1, s1}, Legal); // The type at index 1 is s1.
+ }
+
+ for (auto BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) // Floating-point arithmetic: s32 and s64 only.
+ for (auto Ty : {s32, s64})
+ setAction({BinOp, Ty}, Legal);
+
+ setAction({G_FREM, s32}, Libcall); // Floating-point remainder gets the Libcall action.
+ setAction({G_FREM, s64}, Libcall);
+
+ for (auto MemOp : {G_LOAD, G_STORE}) {
+ for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
+ setAction({MemOp, Ty}, Legal);
+
+ setAction({MemOp, s1}, WidenScalar);
+
+ // The pointer type (type index 1) is legal in addrspace 0.
+ setAction({MemOp, 1, p0}, Legal);
+ }
+
+ // Constants: s32/s64 are legal for both integer and FP constants.
+ for (auto Ty : {s32, s64}) {
+ setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+ setAction({TargetOpcode::G_FCONSTANT, Ty}, Legal);
+ }
+
+ setAction({G_CONSTANT, p0}, Legal);
+
+ for (auto Ty : {s1, s8, s16})
+ setAction({TargetOpcode::G_CONSTANT, Ty}, WidenScalar);
+
+ setAction({TargetOpcode::G_FCONSTANT, s16}, WidenScalar);
+
+ setAction({G_ICMP, s1}, Legal); // Integer compares: s1 plus an s32/s64/p0 type at index 1.
+ setAction({G_ICMP, 1, s32}, Legal);
+ setAction({G_ICMP, 1, s64}, Legal);
+ setAction({G_ICMP, 1, p0}, Legal);
+
+ for (auto Ty : {s1, s8, s16}) {
+ setAction({G_ICMP, 1, Ty}, WidenScalar);
+ }
+
+ setAction({G_FCMP, s1}, Legal); // FP compares: s1 plus an s32/s64 type at index 1.
+ setAction({G_FCMP, 1, s32}, Legal);
+ setAction({G_FCMP, 1, s64}, Legal);
+
+ // Extensions: any scalar up to s64 at index 0, any scalar up to s32 at index 1.
+ for (auto Ty : { s1, s8, s16, s32, s64 }) {
+ setAction({G_ZEXT, Ty}, Legal);
+ setAction({G_SEXT, Ty}, Legal);
+ setAction({G_ANYEXT, Ty}, Legal);
+ }
+
+ for (auto Ty : { s1, s8, s16, s32 }) {
+ setAction({G_ZEXT, 1, Ty}, Legal);
+ setAction({G_SEXT, 1, Ty}, Legal);
+ setAction({G_ANYEXT, 1, Ty}, Legal);
+ }
+
+ setAction({G_FPEXT, s64}, Legal);
+ setAction({G_FPEXT, 1, s32}, Legal);
+
+ // Truncations (floating-point first, then integer).
+ for (auto Ty : { s16, s32 })
+ setAction({G_FPTRUNC, Ty}, Legal);
+
+ for (auto Ty : { s32, s64 })
+ setAction({G_FPTRUNC, 1, Ty}, Legal);
+
+ for (auto Ty : { s1, s8, s16, s32 })
+ setAction({G_TRUNC, Ty}, Legal);
+
+ for (auto Ty : { s8, s16, s32, s64 })
+ setAction({G_TRUNC, 1, Ty}, Legal);
+
+ // Conversions between integer and floating point.
+ for (auto Ty : { s1, s8, s16, s32, s64 }) { // Presumably the integer side: index 0 of FPTOxI, index 1 of xITOFP.
+ setAction({G_FPTOSI, 0, Ty}, Legal);
+ setAction({G_FPTOUI, 0, Ty}, Legal);
+ setAction({G_SITOFP, 1, Ty}, Legal);
+ setAction({G_UITOFP, 1, Ty}, Legal);
+ }
+
+ for (auto Ty : { s32, s64 }) { // Presumably the floating-point side: the opposite indices.
+ setAction({G_FPTOSI, 1, Ty}, Legal);
+ setAction({G_FPTOUI, 1, Ty}, Legal);
+ setAction({G_SITOFP, 0, Ty}, Legal);
+ setAction({G_UITOFP, 0, Ty}, Legal);
+ }
+
+ // Control-flow: G_BRCOND is legal for scalars up to s32.
+ for (auto Ty : {s1, s8, s16, s32})
+ setAction({G_BRCOND, Ty}, Legal);
+
+ // Select: any scalar up to s64, with an s1 type at index 1.
+ for (auto Ty : {s1, s8, s16, s32, s64})
+ setAction({G_SELECT, Ty}, Legal);
+
+ setAction({G_SELECT, 1, s1}, Legal);
+
+ // Pointer-handling
+ setAction({G_FRAME_INDEX, p0}, Legal);
+ setAction({G_GLOBAL_VALUE, p0}, Legal);
+
+ for (auto Ty : {s1, s8, s16, s32, s64})
+ setAction({G_PTRTOINT, 0, Ty}, Legal);
+
+ setAction({G_PTRTOINT, 1, p0}, Legal);
+
+ setAction({G_INTTOPTR, 0, p0}, Legal);
+ setAction({G_INTTOPTR, 1, s64}, Legal);
+
+ // Scalar casts (every size from s1 through s64) are just copies.
+ for (auto Ty : {s1, s8, s16, s32, s64}) {
+ setAction({G_BITCAST, 0, Ty}, Legal);
+ setAction({G_BITCAST, 1, Ty}, Legal);
+ }
+
+ // For the sake of copying bits around, the type does not really
+ // matter as long as it fits a register.
+ for (int EltSize = 8; EltSize <= 64; EltSize *= 2) { // EltSize = 8, 16, 32, 64
+ setAction({G_BITCAST, 0, LLT::vector(128/EltSize, EltSize)}, Legal); // 128-bit vectors
+ setAction({G_BITCAST, 1, LLT::vector(128/EltSize, EltSize)}, Legal);
+ if (EltSize >= 64)
+ continue;
+
+ setAction({G_BITCAST, 0, LLT::vector(64/EltSize, EltSize)}, Legal); // 64-bit vectors
+ setAction({G_BITCAST, 1, LLT::vector(64/EltSize, EltSize)}, Legal);
+ if (EltSize >= 32)
+ continue;
+
+ setAction({G_BITCAST, 0, LLT::vector(32/EltSize, EltSize)}, Legal); // 32-bit vectors
+ setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal);
+ }
+
+ computeTables(); // Called once, after every action above has been registered.
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
new file mode 100644
index 000000000000..feacbef9f147
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -0,0 +1,30 @@
+//===- AArch64LegalizerInfo --------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the LegalizerInfo class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class LLVMContext;
+
+/// This class provides the legalization information for the AArch64 target.
+class AArch64LegalizerInfo : public LegalizerInfo {
+public:
+ AArch64LegalizerInfo();
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
new file mode 100644
index 000000000000..dcb05601e5f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -0,0 +1,1728 @@
+//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ldst-opt"
+
+// Counters declared through the STATISTIC macro; each one pairs a variable
+// with the description string shown next to it.
+STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
+STATISTIC(NumPostFolded, "Number of post-index updates folded");
+STATISTIC(NumPreFolded, "Number of pre-index updates folded");
+STATISTIC(NumUnscaledPairCreated,
+ "Number of load/store from unscaled generated");
+STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
+STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+
+// LdStLimit bounds how far we search for a load/store to merge or pair with.
+// Hidden command-line option; defaults to 20.
+static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
+ cl::init(20), cl::Hidden);
+
+// UpdateLimit bounds how far we search for a base-register update instruction
+// when forming pre-/post-index instructions. Hidden command-line option;
+// defaults to 100.
+static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
+ cl::Hidden);
+
+// Human-readable pass name, used both by INITIALIZE_PASS and by getPassName().
+#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
+
+namespace {
+
+/// Describes how a matched pair of load/store instructions is to be merged.
+struct LdStPairFlags {
+  // True when the merge removes the first instruction and replaces the second
+  // one with the combined instruction; false when it is the other way round.
+  bool MergeForward = false;
+
+  // Index of the load-pair result that has to be sign extended, assuming the
+  // pair produces its values in the order (I, returned iterator):
+  //   -1: nothing to extend, 0: the value of I, 1: the returned iterator's.
+  int SExtIdx = -1;
+
+  LdStPairFlags() = default;
+
+  bool getMergeForward() const { return MergeForward; }
+  void setMergeForward(bool V = true) { MergeForward = V; }
+
+  int getSExtIdx() const { return SExtIdx; }
+  void setSExtIdx(int V) { SExtIdx = V; }
+};
+
+// Machine function pass performing the load/store peephole optimizations
+// declared below. It requires the NoVRegs property (see
+// getRequiredProperties()).
+struct AArch64LoadStoreOpt : public MachineFunctionPass {
+ static char ID;
+ AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
+ initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Target hooks used by the member functions. The constructor does not
+ // initialize them (presumably runOnMachineFunction does -- not visible here).
+ const AArch64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const AArch64Subtarget *Subtarget;
+
+ // Track which registers have been modified and used.
+ BitVector ModifiedRegs, UsedRegs;
+
+ // Scan the instructions looking for a load/store that can be combined
+ // with the current instruction into a load/store pair.
+ // Return the matching instruction if one is found, else MBB->end().
+ MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
+ LdStPairFlags &Flags,
+ unsigned Limit,
+ bool FindNarrowMerge);
+
+ // Scan the instructions looking for a store that writes to the address from
+ // which the current load instruction reads. Return true if one is found, in
+ // which case StoreI is set to that store.
+ bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
+ MachineBasicBlock::iterator &StoreI);
+
+ // Merge the two zero-store instructions indicated into a single wider store
+ // instruction.
+ MachineBasicBlock::iterator
+ mergeNarrowZeroStores(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator MergeMI,
+ const LdStPairFlags &Flags);
+
+ // Merge the two instructions indicated into a single pair-wise instruction.
+ MachineBasicBlock::iterator
+ mergePairedInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ const LdStPairFlags &Flags);
+
+ // Promote the load that reads directly from the address stored to.
+ MachineBasicBlock::iterator
+ promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+ MachineBasicBlock::iterator StoreI);
+
+ // Scan the instruction list to find a base register update that can
+ // be combined with the current instruction (a load or store) using
+ // pre or post indexed addressing with writeback. Scan forwards.
+ MachineBasicBlock::iterator
+ findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
+ int UnscaledOffset, unsigned Limit);
+
+ // Scan the instruction list to find a base register update that can
+ // be combined with the current instruction (a load or store) using
+ // pre or post indexed addressing with writeback. Scan backwards.
+ MachineBasicBlock::iterator
+ findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
+
+ // Check whether MI is an instruction that updates the base register of the
+ // ld/st instruction MemMI.
+ bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
+ unsigned BaseReg, int Offset);
+
+ // Merge a pre- or post-index base register update into a ld/st instruction.
+ MachineBasicBlock::iterator
+ mergeUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update, bool IsPreIdx);
+
+ // Find and merge zero store instructions.
+ bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
+
+ // Find and pair ldr/str instructions.
+ bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
+
+ // Find and promote load instructions which read directly from store.
+ bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
+
+ // Per-block driver. NOTE(review): presumably returns true when the block was
+ // changed; the definition is not visible here.
+ bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ // This pass only runs on functions that no longer contain virtual registers.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
+};
+char AArch64LoadStoreOpt::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
+ AARCH64_LOAD_STORE_OPT_NAME, false, false)
+
+/// Returns true when \p Opc is one of the four store opcodes this pass treats
+/// as narrow: STRBBui, STURBBi, STRHHui or STURHHi.
+static bool isNarrowStore(unsigned Opc) {
+  const bool IsByteStore = Opc == AArch64::STRBBui || Opc == AArch64::STURBBi;
+  const bool IsHalfStore = Opc == AArch64::STRHHui || Opc == AArch64::STURHHi;
+  return IsByteStore || IsHalfStore;
+}
+
+// Returns the scaling factor associated with the opcode of MI. Callers in this
+// file use it both as the access size in bytes and as the stride between
+// consecutive unscaled offsets. Covers scaled, unscaled and paired opcodes;
+// any other opcode is a programming error.
+static int getMemScale(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  // Scale 1.
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+  case AArch64::LDRBBui:
+  case AArch64::LDURBBi:
+  case AArch64::LDRSBWui:
+  case AArch64::LDURSBWi:
+    return 1;
+
+  // Scale 2.
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+  case AArch64::LDRSHWui:
+  case AArch64::LDURSHWi:
+    return 2;
+
+  // Scale 4.
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+  case AArch64::STPSi:
+  case AArch64::STPWi:
+  case AArch64::LDPSi:
+  case AArch64::LDPWi:
+  case AArch64::LDPSWi:
+    return 4;
+
+  // Scale 8.
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::STPDi:
+  case AArch64::STPXi:
+  case AArch64::LDPDi:
+  case AArch64::LDPXi:
+    return 8;
+
+  // Scale 16.
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+  case AArch64::STPQi:
+  case AArch64::LDPQi:
+    return 16;
+
+  default:
+    llvm_unreachable("Opcode has unknown scale!");
+  }
+}
+
+/// Maps \p Opc to its non-sign-extending counterpart: LDRSWui/LDURSWi map to
+/// LDRWui/LDURWi, every other handled load/store opcode maps to itself, and
+/// any unhandled opcode yields UINT_MAX. When \p IsValidLdStrOpc is non-null
+/// it receives true for a handled opcode and false otherwise.
+static unsigned getMatchingNonSExtOpcode(unsigned Opc,
+                                         bool *IsValidLdStrOpc = nullptr) {
+  bool IsHandled = true;
+  unsigned NonSExtOpc = Opc;
+
+  switch (Opc) {
+  // The two sign-extending loads are the only opcodes that get remapped.
+  case AArch64::LDRSWui:
+    NonSExtOpc = AArch64::LDRWui;
+    break;
+  case AArch64::LDURSWi:
+    NonSExtOpc = AArch64::LDURWi;
+    break;
+
+  // Opcodes that are returned unchanged.
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    break;
+
+  default:
+    IsHandled = false;
+    NonSExtOpc = UINT_MAX;
+    break;
+  }
+
+  if (IsValidLdStrOpc)
+    *IsValidLdStrOpc = IsHandled;
+  return NonSExtOpc;
+}
+
+/// Returns the store opcode of the next wider width for \p Opc, keeping the
+/// scaled ("ui") or unscaled ("UR...i") form. Only the six opcodes listed
+/// here have a wide equivalent; anything else is a programming error.
+static unsigned getMatchingWideOpcode(unsigned Opc) {
+  switch (Opc) {
+  // Scaled forms.
+  case AArch64::STRBBui:
+    return AArch64::STRHHui;
+  case AArch64::STRHHui:
+    return AArch64::STRWui;
+  case AArch64::STRWui:
+    return AArch64::STRXui;
+  // Unscaled forms.
+  case AArch64::STURBBi:
+    return AArch64::STURHHi;
+  case AArch64::STURHHi:
+    return AArch64::STURWi;
+  case AArch64::STURWi:
+    return AArch64::STURXi;
+  default:
+    llvm_unreachable("Opcode has no wide equivalent!");
+  }
+}
+
+/// Returns the pair (LDP/STP) opcode that corresponds to the single load or
+/// store opcode \p Opc. The scaled and the unscaled form of an opcode map to
+/// the same pair opcode; an opcode without a pair form is a programming error.
+static unsigned getMatchingPairOpcode(unsigned Opc) {
+  switch (Opc) {
+  // Loads.
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return AArch64::LDPWi;
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+    return AArch64::LDPXi;
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+    return AArch64::LDPSWi;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return AArch64::LDPSi;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+    return AArch64::LDPDi;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return AArch64::LDPQi;
+
+  // Stores.
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    return AArch64::STPWi;
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+    return AArch64::STPXi;
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+    return AArch64::STPSi;
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+    return AArch64::STPDi;
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+    return AArch64::STPQi;
+
+  default:
+    llvm_unreachable("Opcode has no pairwise equivalent!");
+  }
+}
+
+/// Returns true if the opcode of \p StoreInst is one that the load
+/// \p LoadInst may be promoted from. According to the table below this means
+/// a store of the same addressing form (scaled with scaled, unscaled with
+/// unscaled) whose width is at least that of the load. Only the eight load
+/// opcodes listed are supported; any other load opcode is a programming
+/// error.
+///
+/// This is a yes/no predicate, so it returns bool rather than unsigned; the
+/// result is only ever used in boolean context.
+static bool isMatchingStore(MachineInstr &LoadInst, MachineInstr &StoreInst) {
+  const unsigned LdOpc = LoadInst.getOpcode();
+  const unsigned StOpc = StoreInst.getOpcode();
+  switch (LdOpc) {
+  default:
+    llvm_unreachable("Unsupported load instruction!");
+  case AArch64::LDRBBui:
+    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
+           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+  case AArch64::LDURBBi:
+    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
+           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+  case AArch64::LDRHHui:
+    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
+           StOpc == AArch64::STRXui;
+  case AArch64::LDURHHi:
+    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
+           StOpc == AArch64::STURXi;
+  case AArch64::LDRWui:
+    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+  case AArch64::LDURWi:
+    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+  case AArch64::LDRXui:
+    return StOpc == AArch64::STRXui;
+  case AArch64::LDURXi:
+    return StOpc == AArch64::STURXi;
+  }
+}
+
+/// Returns the pre-indexed ("...pre") opcode corresponding to the scaled
+/// single or paired load/store opcode \p Opc. An opcode without a pre-indexed
+/// form is a programming error.
+static unsigned getPreIndexedOpcode(unsigned Opc) {
+  switch (Opc) {
+  // Single loads.
+  case AArch64::LDRBBui:
+    return AArch64::LDRBBpre;
+  case AArch64::LDRHHui:
+    return AArch64::LDRHHpre;
+  case AArch64::LDRWui:
+    return AArch64::LDRWpre;
+  case AArch64::LDRXui:
+    return AArch64::LDRXpre;
+  case AArch64::LDRSWui:
+    return AArch64::LDRSWpre;
+  case AArch64::LDRSui:
+    return AArch64::LDRSpre;
+  case AArch64::LDRDui:
+    return AArch64::LDRDpre;
+  case AArch64::LDRQui:
+    return AArch64::LDRQpre;
+
+  // Single stores.
+  case AArch64::STRBBui:
+    return AArch64::STRBBpre;
+  case AArch64::STRHHui:
+    return AArch64::STRHHpre;
+  case AArch64::STRWui:
+    return AArch64::STRWpre;
+  case AArch64::STRXui:
+    return AArch64::STRXpre;
+  case AArch64::STRSui:
+    return AArch64::STRSpre;
+  case AArch64::STRDui:
+    return AArch64::STRDpre;
+  case AArch64::STRQui:
+    return AArch64::STRQpre;
+
+  // Load pairs.
+  case AArch64::LDPWi:
+    return AArch64::LDPWpre;
+  case AArch64::LDPXi:
+    return AArch64::LDPXpre;
+  case AArch64::LDPSWi:
+    return AArch64::LDPSWpre;
+  case AArch64::LDPSi:
+    return AArch64::LDPSpre;
+  case AArch64::LDPDi:
+    return AArch64::LDPDpre;
+  case AArch64::LDPQi:
+    return AArch64::LDPQpre;
+
+  // Store pairs.
+  case AArch64::STPWi:
+    return AArch64::STPWpre;
+  case AArch64::STPXi:
+    return AArch64::STPXpre;
+  case AArch64::STPSi:
+    return AArch64::STPSpre;
+  case AArch64::STPDi:
+    return AArch64::STPDpre;
+  case AArch64::STPQi:
+    return AArch64::STPQpre;
+
+  default:
+    llvm_unreachable("Opcode has no pre-indexed equivalent!");
+  }
+}
+
+/// Returns the post-indexed ("...post") opcode corresponding to the scaled
+/// single or paired load/store opcode \p Opc. An opcode without a
+/// post-indexed form is a programming error.
+static unsigned getPostIndexedOpcode(unsigned Opc) {
+  switch (Opc) {
+  // Single loads.
+  case AArch64::LDRBBui:
+    return AArch64::LDRBBpost;
+  case AArch64::LDRHHui:
+    return AArch64::LDRHHpost;
+  case AArch64::LDRWui:
+    return AArch64::LDRWpost;
+  case AArch64::LDRXui:
+    return AArch64::LDRXpost;
+  case AArch64::LDRSWui:
+    return AArch64::LDRSWpost;
+  case AArch64::LDRSui:
+    return AArch64::LDRSpost;
+  case AArch64::LDRDui:
+    return AArch64::LDRDpost;
+  case AArch64::LDRQui:
+    return AArch64::LDRQpost;
+
+  // Single stores.
+  case AArch64::STRBBui:
+    return AArch64::STRBBpost;
+  case AArch64::STRHHui:
+    return AArch64::STRHHpost;
+  case AArch64::STRWui:
+    return AArch64::STRWpost;
+  case AArch64::STRXui:
+    return AArch64::STRXpost;
+  case AArch64::STRSui:
+    return AArch64::STRSpost;
+  case AArch64::STRDui:
+    return AArch64::STRDpost;
+  case AArch64::STRQui:
+    return AArch64::STRQpost;
+
+  // Load pairs.
+  case AArch64::LDPWi:
+    return AArch64::LDPWpost;
+  case AArch64::LDPXi:
+    return AArch64::LDPXpost;
+  case AArch64::LDPSWi:
+    return AArch64::LDPSWpost;
+  case AArch64::LDPSi:
+    return AArch64::LDPSpost;
+  case AArch64::LDPDi:
+    return AArch64::LDPDpost;
+  case AArch64::LDPQi:
+    return AArch64::LDPQpost;
+
+  // Store pairs.
+  case AArch64::STPWi:
+    return AArch64::STPWpost;
+  case AArch64::STPXi:
+    return AArch64::STPXpost;
+  case AArch64::STPSi:
+    return AArch64::STPSpost;
+  case AArch64::STPDi:
+    return AArch64::STPDpost;
+  case AArch64::STPQi:
+    return AArch64::STPQpost;
+
+  default:
+    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
+  }
+}
+
+/// Returns true when \p MI is one of the LDP/STP opcodes listed below, i.e. a
+/// load or store with two register operands ahead of the base register.
+static bool isPairedLdSt(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  // Store pairs.
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  // Load pairs.
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPSi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Returns the register operand that \p MI loads into or stores from. For a
+/// paired instruction \p PairedRegOp (0 or 1) selects which of the two
+/// registers is meant; for any other instruction operand 0 is returned
+/// regardless of \p PairedRegOp.
+static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
+                                          unsigned PairedRegOp = 0) {
+  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
+  if (!isPairedLdSt(MI))
+    return MI.getOperand(0);
+  return MI.getOperand(PairedRegOp);
+}
+
+/// Returns the base-register operand of \p MI: operand 2 for a paired
+/// load/store, operand 1 otherwise.
+static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
+  return MI.getOperand(isPairedLdSt(MI) ? 2 : 1);
+}
+
+/// Returns the immediate-offset operand of \p MI: operand 3 for a paired
+/// load/store, operand 2 otherwise.
+static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
+  return MI.getOperand(isPairedLdSt(MI) ? 3 : 2);
+}
+
+/// Returns true if the bytes read by \p LoadInst lie entirely within the
+/// bytes written by \p StoreInst. Both offsets are first brought to a common
+/// unscaled form: the immediate of a scaled instruction is multiplied by that
+/// instruction's memory scale. Base registers are not compared here.
+static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
+                                  MachineInstr &StoreInst,
+                                  const AArch64InstrInfo *TII) {
+  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
+  const int LoadSize = getMemScale(LoadInst);
+  const int StoreSize = getMemScale(StoreInst);
+
+  const bool StIsUnscaled = TII->isUnscaledLdSt(StoreInst);
+  const auto StImm = getLdStOffsetOp(StoreInst).getImm();
+  const int StBegin = StIsUnscaled ? StImm : StImm * StoreSize;
+
+  const bool LdIsUnscaled = TII->isUnscaledLdSt(LoadInst);
+  const auto LdImm = getLdStOffsetOp(LoadInst).getImm();
+  const int LdBegin = LdIsUnscaled ? LdImm : LdImm * LoadSize;
+
+  // The load must not start before the store ...
+  if (LdBegin < StBegin)
+    return false;
+  // ... and must not end after it.
+  return LdBegin + LoadSize <= StBegin + StoreSize;
+}
+
+/// Returns true if \p MI is a STRWui, STURWi or narrow store whose stored
+/// register is WZR.
+static bool isPromotableZeroStoreInst(MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  const bool IsCandidateOpc =
+      Opc == AArch64::STRWui || Opc == AArch64::STURWi || isNarrowStore(Opc);
+  // Only inspect the register operand once the opcode is known to qualify.
+  if (!IsCandidateOpc)
+    return false;
+  return getLdStRegOp(MI).getReg() == AArch64::WZR;
+}
+
+// Replace the two promotable zero stores I and MergeMI with one store of the
+// next wider opcode (see getMatchingWideOpcode) that takes its base register
+// from one of them and its offset from the lower-addressed one. Both original
+// instructions are erased. Returns the iterator following I, skipping MergeMI
+// if it comes directly after I.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator MergeMI,
+ const LdStPairFlags &Flags) {
+ assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
+ "Expected promotable zero stores.");
+
+ MachineBasicBlock::iterator NextI = I;
+ ++NextI;
+ // If NextI is the second of the two instructions to be merged, we need
+ // to skip one further. Either way, the merge will invalidate the iterator,
+ // and we don't need to scan the new instruction, as it's a pairwise
+ // instruction, which we're not considering for further action anyway.
+ if (NextI == MergeMI)
+ ++NextI;
+
+ unsigned Opc = I->getOpcode();
+ bool IsScaled = !TII->isUnscaledLdSt(Opc);
+ // Distance between the two offsets: one element for scaled opcodes, the
+ // memory scale of I for unscaled ones.
+ int OffsetStride = IsScaled ? 1 : getMemScale(*I);
+
+ bool MergeForward = Flags.getMergeForward();
+ // Insert the new wide store at the position of whichever of the two
+ // instructions MergeForward indicates.
+ MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
+ // Also based on MergeForward is from where we copy the base register operand
+ // so we get the flags compatible with the input code.
+ const MachineOperand &BaseRegOp =
+ MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
+
+ // The instruction with the lower offset provides the offset of the merged
+ // store.
+ MachineInstr *RtMI;
+ if (getLdStOffsetOp(*I).getImm() ==
+ getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
+ RtMI = &*MergeMI;
+ else
+ RtMI = &*I;
+
+ int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+ // Change the scaled offset from small to large type.
+ if (IsScaled) {
+ assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
+ OffsetImm /= 2;
+ }
+
+ // Construct the new instruction. The stored register is WZR when the inputs
+ // are narrow stores and XZR otherwise.
+ DebugLoc DL = I->getDebugLoc();
+ MachineBasicBlock *MBB = I->getParent();
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
+ .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*MergeMI));
+ // MIB is otherwise only referenced inside DEBUG().
+ (void)MIB;
+
+ DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(MergeMI->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ I->eraseFromParent();
+ MergeMI->eraseFromParent();
+ return NextI;
+}
+
+// Replace the two loads/stores I and Paired with one pair-wise instruction
+// (see getMatchingPairOpcode). When Flags carries a sign-extension index, the
+// pair is built from the non-extending opcode and a KILL plus an SBFMXri are
+// emitted for the result that has to be extended. Both original instructions
+// are erased. Returns the iterator following I, skipping Paired if it comes
+// directly after I.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ const LdStPairFlags &Flags) {
+ MachineBasicBlock::iterator NextI = I;
+ ++NextI;
+ // If NextI is the second of the two instructions to be merged, we need
+ // to skip one further. Either way, the merge will invalidate the iterator,
+ // and we don't need to scan the new instruction, as it's a pairwise
+ // instruction, which we're not considering for further action anyway.
+ if (NextI == Paired)
+ ++NextI;
+
+ int SExtIdx = Flags.getSExtIdx();
+ unsigned Opc =
+ SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
+ bool IsUnscaled = TII->isUnscaledLdSt(Opc);
+ int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
+
+ bool MergeForward = Flags.getMergeForward();
+ // Insert the new paired instruction at the position of whichever of the
+ // two instructions MergeForward indicates.
+ MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+ // Also based on MergeForward is from where we copy the base register operand
+ // so we get the flags compatible with the input code.
+ const MachineOperand &BaseRegOp =
+ MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
+
+ int Offset = getLdStOffsetOp(*I).getImm();
+ int PairedOffset = getLdStOffsetOp(*Paired).getImm();
+ bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
+ if (IsUnscaled != PairedIsUnscaled) {
+ // We're trying to pair instructions that differ in how they are scaled. If
+ // I is scaled then scale the offset of Paired accordingly. Otherwise, do
+ // the opposite (i.e., make Paired's offset unscaled).
+ int MemSize = getMemScale(*Paired);
+ if (PairedIsUnscaled) {
+ // If the unscaled offset isn't a multiple of the MemSize, we can't
+ // pair the operations together.
+ assert(!(PairedOffset % getMemScale(*Paired)) &&
+ "Offset should be a multiple of the stride!");
+ PairedOffset /= MemSize;
+ } else {
+ PairedOffset *= MemSize;
+ }
+ }
+
+ // Which register is Rt and which is Rt2 depends on the offset order.
+ MachineInstr *RtMI, *Rt2MI;
+ if (Offset == PairedOffset + OffsetStride) {
+ RtMI = &*Paired;
+ Rt2MI = &*I;
+ // Here we swapped the assumption made for SExtIdx.
+ // I.e., we turn ldp I, Paired into ldp Paired, I.
+ // Update the index accordingly.
+ if (SExtIdx != -1)
+ SExtIdx = (SExtIdx + 1) % 2;
+ } else {
+ RtMI = &*I;
+ Rt2MI = &*Paired;
+ }
+ int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+ // Scale the immediate offset, if necessary.
+ if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
+ assert(!(OffsetImm % getMemScale(*RtMI)) &&
+ "Unscaled offset cannot be scaled.");
+ OffsetImm /= getMemScale(*RtMI);
+ }
+
+ // Construct the new instruction.
+ MachineInstrBuilder MIB;
+ DebugLoc DL = I->getDebugLoc();
+ MachineBasicBlock *MBB = I->getParent();
+ MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addOperand(getLdStRegOp(*Rt2MI))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*Paired));
+
+ (void)MIB;
+
+ DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Paired->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ if (SExtIdx != -1) {
+ // Generate the sign extension for the proper result of the ldp.
+ // I.e., with X1, that would be:
+ // %W1<def> = KILL %W1, %X1<imp-def>
+ // %X1<def> = SBFMXri %X1<kill>, 0, 31
+ MachineOperand &DstMO = MIB->getOperand(SExtIdx);
+ // Right now, DstMO has the extended register, since it comes from an
+ // extended opcode.
+ unsigned DstRegX = DstMO.getReg();
+ // Get the W variant of that register.
+ unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
+ // Update the result of LDP to use the W instead of the X variant.
+ DstMO.setReg(DstRegW);
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ // Make the machine verifier happy by providing a definition for
+ // the X register.
+ // Insert this definition right after the generated LDP, i.e., before
+ // InsertionPoint.
+ MachineInstrBuilder MIBKill =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
+ .addReg(DstRegW)
+ .addReg(DstRegX, RegState::Define);
+ MIBKill->getOperand(2).setImplicit();
+ // Create the sign extension.
+ MachineInstrBuilder MIBSXTW =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
+ .addReg(DstRegX)
+ .addImm(0)
+ .addImm(31);
+ (void)MIBSXTW;
+ DEBUG(dbgs() << " Extend operand:\n ");
+ DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
+ } else {
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ }
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ I->eraseFromParent();
+ Paired->eraseFromParent();
+
+ return NextI;
+}
+
+// Replace the load LoadI, which reads bytes written by the earlier store
+// StoreI, with an instruction that derives the loaded value from the stored
+// register:
+//  * same size (4 or 8 bytes): the load is dropped if it targets the stored
+//    register and is 8 bytes wide, otherwise it becomes an ORR-based move;
+//  * narrower load: an AND (load starts at the store's offset) or a UBFM
+//    (load starts inside the stored value) extracts the bits. This case is
+//    only done on little-endian targets.
+// Kill flags on the stored register are cleared on StoreI because the
+// register is now read again later. Returns the iterator following LoadI.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+                                          MachineBasicBlock::iterator StoreI) {
+  MachineBasicBlock::iterator NextI = LoadI;
+  ++NextI;
+
+  int LoadSize = getMemScale(*LoadI);
+  int StoreSize = getMemScale(*StoreI);
+  unsigned LdRt = getLdStRegOp(*LoadI).getReg();
+  unsigned StRt = getLdStRegOp(*StoreI).getReg();
+  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
+
+  assert((IsStoreXReg ||
+          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
+         "Unexpected RegClass");
+
+  MachineInstr *BitExtMI;
+  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
+    // Remove the load, if the destination register of the loads is the same
+    // register for stored value.
+    if (StRt == LdRt && LoadSize == 8) {
+      StoreI->clearRegisterKills(StRt, TRI);
+      DEBUG(dbgs() << "Remove load instruction:\n ");
+      DEBUG(LoadI->print(dbgs()));
+      DEBUG(dbgs() << "\n");
+      LoadI->eraseFromParent();
+      return NextI;
+    }
+    // Replace the load with a mov if the load and store are in the same size.
+    BitExtMI =
+        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
+            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
+            .addReg(StRt)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  } else {
+    // FIXME: Currently we disable this transformation in big-endian targets as
+    // performance and correctness are verified only in little-endian.
+    if (!Subtarget->isLittleEndian())
+      return NextI;
+    bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
+    assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
+           "Unsupported ld/st match");
+    assert(LoadSize <= StoreSize && "Invalid load size");
+    int UnscaledLdOffset = IsUnscaled
+                               ? getLdStOffsetOp(*LoadI).getImm()
+                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
+    int UnscaledStOffset = IsUnscaled
+                               ? getLdStOffsetOp(*StoreI).getImm()
+                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
+    int Width = LoadSize * 8;
+    unsigned DestReg = IsStoreXReg
+                           ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
+                                                      &AArch64::GPR64RegClass)
+                           : LdRt;
+
+    assert((UnscaledLdOffset >= UnscaledStOffset &&
+            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
+           "Invalid offset");
+
+    // Bit position of the loaded field inside the stored value, and the
+    // position of its last bit. Computed once, after the range check above.
+    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+    int Imms = Immr + Width - 1;
+    if (UnscaledLdOffset == UnscaledStOffset) {
+      // The load reads the low bits of the stored value: mask them out.
+      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
+                                | ((Immr) << 6)               // immr
+                                | ((Imms) << 0)               // imms
+          ;
+
+      BitExtMI =
+          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
+                  DestReg)
+              .addReg(StRt)
+              .addImm(AndMaskEncoded);
+    } else {
+      // The load reads a field that starts inside the stored value: extract
+      // it with an unsigned bitfield move.
+      BitExtMI =
+          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
+                  DestReg)
+              .addReg(StRt)
+              .addImm(Immr)
+              .addImm(Imms);
+    }
+  }
+  StoreI->clearRegisterKills(StRt, TRI);
+
+  // BitExtMI is otherwise only referenced inside DEBUG().
+  (void)BitExtMI;
+
+  DEBUG(dbgs() << "Promoting load by replacing :\n ");
+  DEBUG(StoreI->print(dbgs()));
+  DEBUG(dbgs() << " ");
+  DEBUG(LoadI->print(dbgs()));
+  DEBUG(dbgs() << " with instructions:\n ");
+  DEBUG(StoreI->print(dbgs()));
+  DEBUG(dbgs() << " ");
+  DEBUG((BitExtMI)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions.
+  LoadI->eraseFromParent();
+  return NextI;
+}
+
+/// trackRegDefsUses - Record in \p ModifiedRegs and \p UsedRegs the registers
+/// that \p MI defines and reads. A register is recorded together with all of
+/// its aliases; a register mask operand marks every register outside the mask
+/// as modified.
+static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
+                             BitVector &UsedRegs,
+                             const TargetRegisterInfo *TRI) {
+  // Set the bit of Reg and of each register aliasing it (Reg included).
+  auto MarkWithAliases = [TRI](unsigned Reg, BitVector &Regs) {
+    for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias)
+      Regs.set(*Alias);
+  };
+
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isRegMask())
+      ModifiedRegs.setBitsNotInMask(Op.getRegMask());
+
+    // Only non-null register operands are of interest from here on.
+    if (!Op.isReg() || !Op.getReg())
+      continue;
+    const unsigned Reg = Op.getReg();
+
+    if (!Op.isDef()) {
+      assert(Op.isUse() && "Reg operand not a def and not a use?!?");
+      MarkWithAliases(Reg, UsedRegs);
+      continue;
+    }
+
+    // WZR/XZR are not modified even when used as a destination register.
+    if (Reg == AArch64::WZR || Reg == AArch64::XZR)
+      continue;
+    MarkWithAliases(Reg, ModifiedRegs);
+  }
+}
+
+/// Returns true if \p Offset, expressed in elements, lies in [-64, 63]. For
+/// an unscaled instruction \p Offset is in bytes and is first divided by
+/// \p OffsetStride; a byte offset that is not a multiple of the stride is
+/// rejected.
+static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
+  int ElementOffset = Offset;
+  if (IsUnscaled) {
+    // A byte offset that isn't a whole number of elements can never match.
+    if (Offset % OffsetStride != 0)
+      return false;
+    ElementOffset = Offset / OffsetStride;
+  }
+  return ElementOffset >= -64 && ElementOffset <= 63;
+}
+
+// Round Num up to the next multiple of PowOf2, which must be a power of two.
+// This signed-int variant avoids the C-style cast from uint64_t to int that
+// using alignTo from include/llvm/Support/MathExtras.h would require.
+// FIXME: Move this function to include/MathExtras.h?
+static int alignTo(int Num, int PowOf2) {
+  const int LowBitsMask = PowOf2 - 1;
+  return (Num + LowBitsMask) & ~LowBitsMask;
+}
+
+/// Returns true unless \p MIa and \p MIb can be shown not to alias: neither
+/// of them may store, neither of them is a memory operation, or the target
+/// reports their accesses as trivially disjoint.
+static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
+                     const AArch64InstrInfo *TII) {
+  // Without a store on either side there is nothing to conflict.
+  const bool AnyStore = MIa.mayStore() || MIb.mayStore();
+  if (!AnyStore)
+    return false;
+
+  // NOTE(review): this rejects only when *neither* instruction is a memory
+  // operation, although the original comment read "both instructions must be
+  // memory operations". Behaviour is left unchanged; confirm the intent.
+  const bool AnyMemOp = MIa.mayLoadOrStore() || MIb.mayLoadOrStore();
+  if (!AnyMemOp)
+    return false;
+
+  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
+}
+
+/// Returns true if \p MIa may alias at least one instruction in \p MemInsns.
+static bool mayAlias(MachineInstr &MIa,
+                     SmallVectorImpl<MachineInstr *> &MemInsns,
+                     const AArch64InstrInfo *TII) {
+  for (MachineInstr *Other : MemInsns) {
+    if (!mayAlias(MIa, *Other, TII))
+      continue;
+    return true;
+  }
+  return false;
+}
+
+// Scan backwards from the load I, for at most Limit non-transient
+// instructions, looking for a store the load can be promoted from: a matching
+// store opcode on the same base register whose written bytes cover the bytes
+// read, and whose stored register has not been modified in between. On
+// success StoreI is set to that store and true is returned. The scan gives up
+// at a call, once the base register is modified, or at a store that may alias
+// the load.
+bool AArch64LoadStoreOpt::findMatchingStore(
+ MachineBasicBlock::iterator I, unsigned Limit,
+ MachineBasicBlock::iterator &StoreI) {
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ MachineBasicBlock::iterator MBBI = I;
+ MachineInstr &LoadMI = *I;
+ unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
+
+ // If the load is the first instruction in the block, there's obviously
+ // not any matching store.
+ if (MBBI == B)
+ return false;
+
+ // Track which registers have been modified and used between the first insn
+ // and the second insn.
+ ModifiedRegs.reset();
+ UsedRegs.reset();
+
+ unsigned Count = 0;
+ do {
+ --MBBI;
+ MachineInstr &MI = *MBBI;
+
+ // Don't count transient instructions towards the search limit since there
+ // may be different numbers of them if e.g. debug information is present.
+ if (!MI.isTransient())
+ ++Count;
+
+ // If the load instruction reads directly from the address to which the
+ // store instruction writes and the stored value is not modified, we can
+ // promote the load. Since we do not handle stores with pre-/post-index,
+ // it's unnecessary to check if BaseReg is modified by the store itself.
+ if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
+ BaseReg == getLdStBaseOp(MI).getReg() &&
+ isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
+ !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
+ StoreI = MBBI;
+ return true;
+ }
+
+ // Do not search across a call.
+ if (MI.isCall())
+ return false;
+
+ // Update modified / uses register lists.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg])
+ return false;
+
+ // If we encounter a store aliased with the load, return early.
+ if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
+ return false;
+ } while (MBBI != B && Count < Limit);
+ return false;
+}
+
+// Decides whether MI can be merged or paired with FirstMI. When the two only
+// differ in sign extension, Flags records which of them is the sign-extending
+// one. Returns true for a candidate pair and false otherwise.
+static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
+                                       LdStPairFlags &Flags,
+                                       const AArch64InstrInfo *TII) {
+  // Volatile accesses and instructions with pairing suppressed never qualify.
+  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+    return false;
+
+  // The caller is expected to have vetted FirstMI in the same way already.
+  assert(!FirstMI.hasOrderedMemoryRef() &&
+         !TII->isLdStPairSuppressed(FirstMI) &&
+         "FirstMI shouldn't get here if either of these checks are true.");
+
+  const unsigned FirstOpc = FirstMI.getOpcode();
+  const unsigned SecondOpc = MI.getOpcode();
+
+  // Identical opcodes need no further checks.
+  if (FirstOpc == SecondOpc)
+    return true;
+
+  // See whether the two differ only by sign extension, i.e. whether they map
+  // to the same non-extending opcode.
+  bool FirstIsLdSt = false, SecondIsLdSt = false;
+  const unsigned FirstNonSExt =
+      getMatchingNonSExtOpcode(FirstOpc, &FirstIsLdSt);
+  assert(FirstIsLdSt &&
+         "Given Opc should be a Load or Store with an immediate");
+  const unsigned SecondNonSExt =
+      getMatchingNonSExtOpcode(SecondOpc, &SecondIsLdSt);
+  if (FirstNonSExt == SecondNonSExt) {
+    // FirstMI comes first in the pair: if it is already the non-extending
+    // opcode, the value to extend is the second one (index 1), else the first.
+    Flags.setSExtIdx(FirstNonSExt == FirstOpc ? 1 : 0);
+    return true;
+  }
+
+  // The second instruction isn't even a mergeable/pairable load/store.
+  if (!SecondIsLdSt)
+    return false;
+
+  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
+  // offsets.
+  if (isNarrowStore(FirstOpc) || isNarrowStore(SecondOpc))
+    return false;
+
+  // Last chance: one scaled and one unscaled instruction that share the same
+  // pair opcode.
+  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
+  if (TII->isUnscaledLdSt(FirstOpc) == TII->isUnscaledLdSt(SecondOpc))
+    return false;
+  return getMatchingPairOpcode(FirstOpc) == getMatchingPairOpcode(SecondOpc);
+}
+
+/// Scan the instructions looking for a load/store that can be combined with the
+/// current instruction into a wider equivalent or a load/store pair.
+///
+/// \param I The first memory instruction. The scan starts at the instruction
+/// following it and runs towards the end of its basic block.
+/// \param Flags Receives the sign-extension index (reset to -1 for every
+/// instruction examined) and, when a match is returned, the merge direction.
+/// \param Limit Maximum number of non-transient instructions to examine.
+/// \param FindNarrowMerge Selects the narrow-merge legality checks instead of
+/// the load/store-pair checks.
+/// \returns An iterator to the matching instruction, or the end iterator of the
+/// block if no match was found. The ModifiedRegs and UsedRegs members are
+/// clobbered by the search.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
+ LdStPairFlags &Flags, unsigned Limit,
+ bool FindNarrowMerge) {
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator MBBI = I;
+ MachineInstr &FirstMI = *I;
+ // Start the search at the instruction following FirstMI.
+ ++MBBI;
+
+ bool MayLoad = FirstMI.mayLoad();
+ bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
+ unsigned Reg = getLdStRegOp(FirstMI).getReg();
+ unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+ int Offset = getLdStOffsetOp(FirstMI).getImm();
+ // Expected difference between the immediates of two adjacent accesses: the
+ // value returned by getMemScale for unscaled instructions, 1 for scaled ones.
+ int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
+ bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
+
+ // Track which registers have been modified and used between the first insn
+ // (inclusive) and the second insn.
+ ModifiedRegs.reset();
+ UsedRegs.reset();
+
+ // Remember any instructions that read/write memory between FirstMI and MI.
+ SmallVector<MachineInstr *, 4> MemInsns;
+
+ for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+
+ // Don't count transient instructions towards the search limit since there
+ // may be different numbers of them if e.g. debug information is present.
+ if (!MI.isTransient())
+ ++Count;
+
+ // Clear any sign-extension index left over from a previously examined
+ // candidate; areCandidatesToMergeOrPair sets it again when applicable.
+ Flags.setSExtIdx(-1);
+ if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
+ getLdStOffsetOp(MI).isImm()) {
+ assert(MI.mayLoadOrStore() && "Expected memory operation.");
+ // If we've found another instruction with the same opcode, check to see
+ // if the base and offset are compatible with our starting instruction.
+ // These instructions all have scaled immediate operands, so we just
+ // check for +1/-1. Make sure to check the new instruction offset is
+ // actually an immediate and not a symbolic reference destined for
+ // a relocation.
+ unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
+ int MIOffset = getLdStOffsetOp(MI).getImm();
+ bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
+ if (IsUnscaled != MIIsUnscaled) {
+ // We're trying to pair instructions that differ in how they are scaled.
+ // If FirstMI is scaled then scale the offset of MI accordingly.
+ // Otherwise, do the opposite (i.e., make MI's offset unscaled).
+ int MemSize = getMemScale(MI);
+ if (MIIsUnscaled) {
+ // If the unscaled offset isn't a multiple of the MemSize, we can't
+ // pair the operations together: bail and keep looking.
+ if (MIOffset % MemSize) {
+ // MI stays between FirstMI and any later candidate, so it still has to
+ // be recorded for the register and alias checks.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ MemInsns.push_back(&MI);
+ continue;
+ }
+ MIOffset /= MemSize;
+ } else {
+ MIOffset *= MemSize;
+ }
+ }
+
+ // The two accesses must use the same base register and be exactly one
+ // stride apart, in either order.
+ if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
+ (Offset + OffsetStride == MIOffset))) {
+ // The lower of the two offsets is the one the combined instruction
+ // would have to encode.
+ int MinOffset = Offset < MIOffset ? Offset : MIOffset;
+ if (FindNarrowMerge) {
+ // If the alignment requirements of the scaled wide load/store
+ // instruction can't express the offset of the scaled narrow input,
+ // bail and keep looking. For promotable zero stores, allow only when
+ // the stored value is the same (i.e., WZR).
+ if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
+ (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ MemInsns.push_back(&MI);
+ continue;
+ }
+ } else {
+ // Pairwise instructions have a 7-bit signed offset field. Single
+ // insns have a 12-bit unsigned offset field. If the resultant
+ // immediate offset of merging these instructions is out of range for
+ // a pairwise instruction, bail and keep looking.
+ if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ MemInsns.push_back(&MI);
+ continue;
+ }
+ // If the alignment requirements of the paired (scaled) instruction
+ // can't express the offset of the unscaled input, bail and keep
+ // looking.
+ if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ MemInsns.push_back(&MI);
+ continue;
+ }
+ }
+ // If the destination register of the loads is the same register, bail
+ // and keep looking. A load-pair instruction with both destination
+ // registers the same is UNPREDICTABLE and will result in an exception.
+ if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ MemInsns.push_back(&MI);
+ continue;
+ }
+
+ // If the Rt of the second instruction was not modified or used between
+ // the two instructions and none of the instructions between the second
+ // and first alias with the second, we can combine the second into the
+ // first.
+ if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
+ !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
+ !mayAlias(MI, MemInsns, TII)) {
+ Flags.setMergeForward(false);
+ return MBBI;
+ }
+
+ // Likewise, if the Rt of the first instruction is not modified or used
+ // between the two instructions and none of the instructions between the
+ // first and the second alias with the first, we can combine the first
+ // into the second.
+ if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
+ !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
+ !mayAlias(FirstMI, MemInsns, TII)) {
+ Flags.setMergeForward(true);
+ return MBBI;
+ }
+ // Unable to combine these instructions due to interference in between.
+ // Keep looking.
+ }
+ }
+
+ // If the instruction wasn't a matching load or store. Stop searching if we
+ // encounter a call instruction that might modify memory.
+ if (MI.isCall())
+ return E;
+
+ // Update modified / uses register lists.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg])
+ return E;
+
+ // Update list of instructions that read/write memory.
+ if (MI.mayLoadOrStore())
+ MemInsns.push_back(&MI);
+ }
+ // Reached the end of the block or the search limit without a match.
+ return E;
+}
+
+/// Fold the base register update instruction \p Update into the load/store
+/// \p I, replacing both with a single pre- or post-indexed instruction.
+///
+/// \param I The load/store (single or paired) to convert.
+/// \param Update The ADDXri/SUBXri instruction to fold in; its shift operand
+/// must be zero.
+/// \param IsPreIdx True to create the pre-indexed form, false to create the
+/// post-indexed form.
+/// \returns An iterator to the instruction that followed \p I, skipping
+/// \p Update if it came immediately after \p I. Both \p I and \p Update are
+/// erased from the block.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update,
+ bool IsPreIdx) {
+ assert((Update->getOpcode() == AArch64::ADDXri ||
+ Update->getOpcode() == AArch64::SUBXri) &&
+ "Unexpected base register update instruction to merge!");
+ MachineBasicBlock::iterator NextI = I;
+ // Return the instruction following the merged instruction, which is
+ // the instruction following our unmerged load. Unless that's the add/sub
+ // instruction we're merging, in which case it's the one after that.
+ if (++NextI == Update)
+ ++NextI;
+
+ // Signed amount applied to the base register: the immediate of the add, or
+ // its negation for a sub.
+ int Value = Update->getOperand(2).getImm();
+ assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+ "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
+ if (Update->getOpcode() == AArch64::SUBXri)
+ Value = -Value;
+
+ unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
+ : getPostIndexedOpcode(I->getOpcode());
+ // The new instruction is inserted before I. Its operands are, in order: the
+ // register operand taken from Update (presumably the written-back base
+ // register -- confirm against getLdStRegOp), the data register(s) of I, the
+ // base register of I and the update amount.
+ MachineInstrBuilder MIB;
+ if (!isPairedLdSt(*I)) {
+ // Non-paired instruction.
+ MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(getLdStRegOp(*Update))
+ .addOperand(getLdStRegOp(*I))
+ .addOperand(getLdStBaseOp(*I))
+ .addImm(Value)
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end());
+ } else {
+ // Paired instruction.
+ // The immediate of the paired form is expressed in units of the value
+ // returned by getMemScale, hence the division below.
+ int Scale = getMemScale(*I);
+ MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(getLdStRegOp(*Update))
+ .addOperand(getLdStRegOp(*I, 0))
+ .addOperand(getLdStRegOp(*I, 1))
+ .addOperand(getLdStBaseOp(*I))
+ .addImm(Value / Scale)
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end());
+ }
+ // MIB is otherwise only read by the DEBUG output below.
+ (void)MIB;
+
+ if (IsPreIdx)
+ DEBUG(dbgs() << "Creating pre-indexed load/store.");
+ else
+ DEBUG(dbgs() << "Creating post-indexed load/store.");
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Update->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions for the block.
+ I->eraseFromParent();
+ Update->eraseFromParent();
+
+ return NextI;
+}
+
+/// Check whether \p MI is a base register update that can be folded into the
+/// load/store \p MemMI.
+///
+/// \param MemMI The load/store; only used to tell paired from non-paired
+/// instructions and to obtain the scale of a paired instruction.
+/// \param MI The candidate update instruction.
+/// \param BaseReg The base register of \p MemMI.
+/// \param Offset If non-zero, the exact signed amount the update must add to
+/// the base register; zero accepts any encodable amount.
+/// \returns True if \p MI is an ADDXri/SUBXri of \p BaseReg onto itself with a
+/// plain, unshifted immediate that passes the range checks below and is
+/// compatible with \p Offset.
+bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
+ MachineInstr &MI,
+ unsigned BaseReg, int Offset) {
+ // Every 'break' below leaves the switch and falls through to the final
+ // 'return false'.
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AArch64::SUBXri:
+ case AArch64::ADDXri:
+ // Make sure it's a vanilla immediate operand, not a relocation or
+ // anything else we can't handle.
+ if (!MI.getOperand(2).isImm())
+ break;
+ // Watch out for 1 << 12 shifted value.
+ if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
+ break;
+
+ // The update instruction source and destination register must be the
+ // same as the load/store base register.
+ if (MI.getOperand(0).getReg() != BaseReg ||
+ MI.getOperand(1).getReg() != BaseReg)
+ break;
+
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ // Signed amount the update applies to the base register (negative for a
+ // subtraction).
+ int UpdateOffset = MI.getOperand(2).getImm();
+ if (MI.getOpcode() == AArch64::SUBXri)
+ UpdateOffset = -UpdateOffset;
+
+ // For non-paired load/store instructions, the immediate must fit in a
+ // signed 9-bit integer.
+ if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
+ break;
+
+ // For paired load/store instructions, the immediate must be a multiple of
+ // the scaling factor. The scaled offset must also fit into a signed 7-bit
+ // integer.
+ if (IsPairedInsn) {
+ int Scale = getMemScale(MemMI);
+ if (UpdateOffset % Scale != 0)
+ break;
+
+ int ScaledOffset = UpdateOffset / Scale;
+ if (ScaledOffset > 63 || ScaledOffset < -64)
+ break;
+ }
+
+ // If we have a non-zero Offset, we check that it matches the amount
+ // we're adding to the register.
+ if (!Offset || Offset == UpdateOffset)
+ return true;
+ break;
+ }
+ return false;
+}
+
+/// Scan forward from the load/store \p I for a base register update that can
+/// be merged into it.
+///
+/// \param I The load/store instruction.
+/// \param UnscaledOffset The offset \p I must have, computed as its immediate
+/// multiplied by getMemScale; also passed to isMatchingUpdateInsn as the amount
+/// the update has to add (zero accepts any encodable amount).
+/// \param Limit Maximum number of non-transient instructions to examine.
+/// \returns An iterator to the matching update instruction, or the end
+/// iterator of the block if there is none. The ModifiedRegs and UsedRegs
+/// members are clobbered by the search.
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
+ MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr &MemMI = *I;
+ MachineBasicBlock::iterator MBBI = I;
+
+ unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
+
+ // Scan forward looking for post-index opportunities. Updating instructions
+ // can't be formed if the memory instruction doesn't have the offset we're
+ // looking for.
+ if (MIUnscaledOffset != UnscaledOffset)
+ return E;
+
+ // If the base register overlaps a destination register, we can't
+ // merge the update.
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ // Paired instructions have two data registers to check, others have one.
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
+
+ // Track which registers have been modified and used between the first insn
+ // (inclusive) and the second insn.
+ ModifiedRegs.reset();
+ UsedRegs.reset();
+ // Start the search at the instruction following the load/store.
+ ++MBBI;
+ for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+
+ // Don't count transient instructions towards the search limit since there
+ // may be different numbers of them if e.g. debug information is present.
+ if (!MI.isTransient())
+ ++Count;
+
+ // If we found a match, return it.
+ if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
+ return MBBI;
+
+ // Update the status of what the instruction clobbered and used.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is used or modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ return E;
+ }
+ // Reached the end of the block or the search limit without a match.
+ return E;
+}
+
+/// Scan backward from the load/store \p I for a base register update that can
+/// be merged into it.
+///
+/// Only load/stores with a zero offset are considered.
+///
+/// \param I The load/store instruction.
+/// \param Limit Maximum number of non-transient instructions to examine.
+/// \returns An iterator to the matching update instruction, or the end
+/// iterator of the block if there is none. The ModifiedRegs and UsedRegs
+/// members are clobbered by the search.
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
+ MachineBasicBlock::iterator I, unsigned Limit) {
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr &MemMI = *I;
+ MachineBasicBlock::iterator MBBI = I;
+
+ unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ int Offset = getLdStOffsetOp(MemMI).getImm();
+
+ // If the load/store is the first instruction in the block, there's obviously
+ // not any matching update. Ditto if the memory offset isn't zero.
+ if (MBBI == B || Offset != 0)
+ return E;
+ // If the base register overlaps a destination register, we can't
+ // merge the update.
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ // Paired instructions have two data registers to check, others have one.
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
+
+ // Track which registers have been modified and used between the first insn
+ // (inclusive) and the second insn.
+ ModifiedRegs.reset();
+ UsedRegs.reset();
+ unsigned Count = 0;
+ // MBBI != B was established above, so the first decrement is safe.
+ do {
+ --MBBI;
+ MachineInstr &MI = *MBBI;
+
+ // Don't count transient instructions towards the search limit since there
+ // may be different numbers of them if e.g. debug information is present.
+ if (!MI.isTransient())
+ ++Count;
+
+ // If we found a match, return it.
+ // Offset is known to be zero here, so any encodable update amount matches.
+ if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
+ return MBBI;
+
+ // Update the status of what the instruction clobbered and used.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is used or modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ return E;
+ } while (MBBI != B && Count < Limit);
+ // Reached the start of the block or the search limit without a match.
+ return E;
+}
+
+// Tries to replace the load at MBBI with a value forwarded from an earlier
+// store to the same location. Returns true if the load was promoted, in which
+// case MBBI has been moved to the instruction the caller should continue with.
+bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &LoadMI = *MBBI;
+
+  // Leave ordered (e.g. volatile) loads alone, and only handle reg+imm
+  // addressing.
+  // FIXME: It is possible to extend it to handle reg+reg cases.
+  if (LoadMI.hasOrderedMemoryRef() || !getLdStOffsetOp(LoadMI).isImm())
+    return false;
+
+  // Search backward, over at most LdStLimit instructions, for a store that
+  // feeds this load.
+  MachineBasicBlock::iterator FeedingStore;
+  if (!findMatchingStore(MBBI, LdStLimit, FeedingStore))
+    return false;
+
+  ++NumLoadsFromStoresPromoted;
+  // Keeping the iterator straight is a pain, so the promotion routine reports
+  // which instruction comes next once it is done mucking about.
+  MBBI = promoteLoadFromStore(MBBI, FeedingStore);
+  return true;
+}
+
+// Tries to merge the promotable zero store at MBBI with an adjacent zero store
+// into a single wider store. Returns true on success, in which case MBBI has
+// been moved to the instruction the caller should continue with.
+bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
+    MachineBasicBlock::iterator &MBBI) {
+  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
+  MachineInstr &StoreMI = *MBBI;
+
+  if (!TII->isCandidateToMergeOrPair(StoreMI))
+    return false;
+
+  // Look ahead up to LdStLimit instructions for a mergeable instruction; the
+  // end iterator of the block signals that none was found.
+  MachineBasicBlock::iterator BlockEnd = StoreMI.getParent()->end();
+  LdStPairFlags Flags;
+  MachineBasicBlock::iterator Partner =
+      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
+  if (Partner == BlockEnd)
+    return false;
+
+  ++NumZeroStoresPromoted;
+  // Keeping the iterator straight is a pain, so the merge routine reports which
+  // instruction comes next once it is done mucking about.
+  MBBI = mergeNarrowZeroStores(MBBI, Partner, Flags);
+  return true;
+}
+
+// Tries to combine the load/store at MBBI with a later adjacent load/store
+// into a single load-pair or store-pair instruction. Returns true on success,
+// in which case MBBI has been moved to the instruction the caller should
+// continue with.
+bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &FirstMI = *MBBI;
+
+  if (!TII->isCandidateToMergeOrPair(FirstMI))
+    return false;
+
+  // Give up right away if the offset can never be matched. (6 bits of positive
+  // range, plus allow an extra one in case we find a later insn that matches
+  // with Offset-1)
+  const bool Unscaled = TII->isUnscaledLdSt(FirstMI);
+  const int Imm = getLdStOffsetOp(FirstMI).getImm();
+  const int Stride = Unscaled ? getMemScale(FirstMI) : 1;
+  if (!inBoundsForPair(Unscaled, Imm, Stride))
+    return false;
+
+  // Look ahead up to LdStLimit instructions for a pairable instruction; the end
+  // iterator of the block signals that none was found.
+  MachineBasicBlock::iterator BlockEnd = FirstMI.getParent()->end();
+  LdStPairFlags Flags;
+  MachineBasicBlock::iterator Partner =
+      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
+  if (Partner == BlockEnd)
+    return false;
+
+  ++NumPairCreated;
+  if (Unscaled)
+    ++NumUnscaledPairCreated;
+  // Keeping the iterator straight is a pain, so the merge routine reports which
+  // instruction comes next once it is done mucking about.
+  MBBI = mergePairedInsns(MBBI, Partner, Flags);
+  return true;
+}
+
+/// Run the load/store optimizations over a single basic block.
+///
+/// Each of the four transformations described below makes its own pass over
+/// the block, in the order listed.
+///
+/// \param MBB The block to optimize.
+/// \param EnableNarrowZeroStOpt When false, the zero-store merging step (2) is
+/// skipped entirely.
+/// \returns True if any of the transformations changed \p MBB.
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
+ bool EnableNarrowZeroStOpt) {
+ bool Modified = false;
+ // Four transformations to do here:
+ // 1) Find loads that directly read from stores and promote them by
+ // replacing with mov instructions. If the store is wider than the load,
+ // the load will be replaced with a bitfield extract.
+ // e.g.,
+ // str w1, [x0, #4]
+ // ldrh w2, [x0, #6]
+ // ; becomes
+ // str w1, [x0, #4]
+ // lsr w2, w1, #16
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr &MI = *MBBI;
+ switch (MI.getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ // Unscaled instructions.
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi: {
+ // On success tryToPromoteLoadFromStore has already repositioned MBBI, so
+ // it must not be advanced again.
+ if (tryToPromoteLoadFromStore(MBBI)) {
+ Modified = true;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ }
+ }
+ // 2) Merge adjacent zero stores into a wider store.
+ // e.g.,
+ // strh wzr, [x0]
+ // strh wzr, [x0, #2]
+ // ; becomes
+ // str wzr, [x0]
+ // e.g.,
+ // str wzr, [x0]
+ // str wzr, [x0, #4]
+ // ; becomes
+ // str xzr, [x0]
+ // The loop body never runs when EnableNarrowZeroStOpt is false.
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ EnableNarrowZeroStOpt && MBBI != E;) {
+ if (isPromotableZeroStoreInst(*MBBI)) {
+ if (tryToMergeZeroStInst(MBBI)) {
+ Modified = true;
+ } else
+ ++MBBI;
+ } else
+ ++MBBI;
+ }
+
+ // 3) Find loads and stores that can be merged into a single load or store
+ // pair instruction.
+ // e.g.,
+ // ldr x0, [x2]
+ // ldr x1, [x2, #8]
+ // ; becomes
+ // ldp x0, x1, [x2]
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
+ Modified = true;
+ else
+ ++MBBI;
+ }
+ // 4) Find base register updates that can be merged into the load or store
+ // as a base-reg writeback.
+ // e.g.,
+ // ldr x0, [x2]
+ // add x2, x2, #4
+ // ; becomes
+ // ldr x0, [x2], #4
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr &MI = *MBBI;
+ // Do update merging. It's simpler to keep this separate from the above
+ // switches, though not strictly necessary.
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ case AArch64::STRXui:
+ case AArch64::STRWui:
+ case AArch64::STRHHui:
+ case AArch64::STRBBui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ case AArch64::LDRXui:
+ case AArch64::LDRWui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRBBui:
+ // Unscaled instructions.
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ // Paired instructions.
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ case AArch64::STPSi:
+ case AArch64::STPDi:
+ case AArch64::STPQi:
+ case AArch64::STPWi:
+ case AArch64::STPXi: {
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ if (!getLdStOffsetOp(MI).isImm()) {
+ ++MBBI;
+ break;
+ }
+ // Look forward to try to form a post-index instruction. For example,
+ // ldr x0, [x20]
+ // add x20, x20, #32
+ // merged into:
+ // ldr x0, [x20], #32
+ MachineBasicBlock::iterator Update =
+ findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
+ Modified = true;
+ ++NumPostFolded;
+ break;
+ }
+ // The pre-index searches below are not attempted for unscaled
+ // instructions, so move to the next instruction.
+ if (TII->isUnscaledLdSt(Opc)) {
+ ++MBBI;
+ break;
+ }
+
+ // Look back to try to find a pre-index instruction. For example,
+ // add x0, x0, #8
+ // ldr x1, [x0]
+ // merged into:
+ // ldr x1, [x0, #8]!
+ Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+ Modified = true;
+ ++NumPreFolded;
+ break;
+ }
+ // The immediate in the load/store is scaled by the size of the memory
+ // operation. The immediate in the add we're looking for,
+ // however, is not, so adjust here.
+ int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
+
+ // Look forward to try to find a pre-index instruction. For example,
+ // ldr x1, [x0, #64]
+ // add x0, x0, #64
+ // merged into:
+ // ldr x1, [x0, #64]!
+ Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+ Modified = true;
+ ++NumPreFolded;
+ break;
+ }
+
+ // Nothing found. Just move to the next instruction.
+ ++MBBI;
+ break;
+ }
+ }
+ }
+
+ return Modified;
+}
+
+// Pass entry point: caches the subtarget hooks for Fn and optimizes each of
+// its basic blocks. Returns true if any block was changed.
+bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+  TRI = Subtarget->getRegisterInfo();
+
+  // Size the modified/used register trackers once per function; they are
+  // cleared again every time a load or store is optimized.
+  const unsigned NumRegs = TRI->getNumRegs();
+  ModifiedRegs.resize(NumRegs);
+  UsedRegs.resize(NumRegs);
+
+  // Zero-store merging is only enabled when the subtarget does not require
+  // strict alignment.
+  const bool AllowNarrowZeroStoreMerge = !Subtarget->requiresStrictAlign();
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : Fn) {
+    if (optimizeBlock(MBB, AllowNarrowZeroStoreMerge))
+      Changed = true;
+  }
+  return Changed;
+}
+
+// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
+// stores near one another? Note: The pre-RA instruction scheduler already has
+// hooks to try and schedule pairable loads/stores together to improve pairing
+// opportunities. Thus, pre-RA pairing pass may not be worth the effort.
+
+// FIXME: When pairing store instructions it's very possible for this pass to
+// hoist a store with a KILL marker above another use (without a KILL marker).
+// The resulting IR is invalid, but nothing uses the KILL markers after this
+// pass, so it's never caused a problem in practice.
+
+/// createAArch64LoadStoreOptimizationPass - returns an instance of the
+/// load / store optimization pass.
+///
+/// A fresh AArch64LoadStoreOpt is allocated with new on every call.
+/// NOTE(review): ownership presumably passes to the caller (typically the pass
+/// manager) -- confirm.
+FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
+ return new AArch64LoadStoreOpt();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
new file mode 100644
index 000000000000..45083df7ab45
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -0,0 +1,216 @@
+//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower AArch64 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCInstLower.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+extern cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration;
+
+/// Construct a lowering helper that stores references to \p ctx and
+/// \p printer. No other member is initialized explicitly here.
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer)
+ : Ctx(ctx), Printer(printer) {}
+
+/// Return the symbol the AsmPrinter provides for the global value referenced
+/// by \p MO.
+MCSymbol *
+AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+ return Printer.getSymbol(MO.getGlobal());
+}
+
+/// Return the symbol the AsmPrinter provides for the external symbol name
+/// carried by \p MO.
+MCSymbol *
+AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+ return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+/// Lower a symbolic operand for Darwin targets.
+///
+/// The MO_GOT / MO_TLS target flags together with the page / page-offset
+/// fragment select the MCSymbolRefExpr variant kind; a non-zero operand offset
+/// is added to the symbol reference (jump table operands carry no offset).
+MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
+                                                       MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const auto TargetFlags = MO.getTargetFlags();
+  const bool IsPage =
+      (TargetFlags & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE;
+  const bool IsPageOff =
+      (TargetFlags & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF;
+
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+  if ((TargetFlags & AArch64II::MO_GOT) != 0) {
+    // GOT references must name either the page or the page offset.
+    if (IsPage)
+      RefKind = MCSymbolRefExpr::VK_GOTPAGE;
+    else if (IsPageOff)
+      RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF;
+    else
+      llvm_unreachable("Unexpected target flags with MO_GOT on GV operand");
+  } else if ((TargetFlags & AArch64II::MO_TLS) != 0) {
+    // Likewise for TLS references.
+    if (IsPage)
+      RefKind = MCSymbolRefExpr::VK_TLVPPAGE;
+    else if (IsPageOff)
+      RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF;
+    else
+      llvm_unreachable("Unexpected target flags with MO_TLS on GV operand");
+  } else if (IsPage) {
+    RefKind = MCSymbolRefExpr::VK_PAGE;
+  } else if (IsPageOff) {
+    RefKind = MCSymbolRefExpr::VK_PAGEOFF;
+  }
+
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+  // The offset is only queried for operands that are not jump table indices.
+  if (!MO.isJTI()) {
+    if (const auto Offset = MO.getOffset())
+      Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx),
+                                     Ctx);
+  }
+  return MCOperand::createExpr(Expr);
+}
+
+/// Lower a symbolic operand for ELF targets.
+///
+/// The operand's target flags are translated into an AArch64MCExpr variant
+/// kind that wraps a plain symbol reference, plus the operand offset when it
+/// is non-zero (jump table operands carry no offset).
+MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
+                                                    MCSymbol *Sym) const {
+  const auto TargetFlags = MO.getTargetFlags();
+  uint32_t RefFlags = 0;
+
+  // First choose the reference class: GOT, one of the TLS models, or absolute.
+  if (TargetFlags & AArch64II::MO_GOT) {
+    RefFlags |= AArch64MCExpr::VK_GOT;
+  } else if (TargetFlags & AArch64II::MO_TLS) {
+    TLSModel::Model Model;
+    if (!MO.isGlobal()) {
+      assert(MO.isSymbol() &&
+             StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" &&
+             "unexpected external TLS symbol");
+      // The general dynamic access sequence is used to get the
+      // address of _TLS_MODULE_BASE_.
+      Model = TLSModel::GeneralDynamic;
+    } else {
+      Model = Printer.TM.getTLSModel(MO.getGlobal());
+      // Local-dynamic is downgraded to general-dynamic unless its generation
+      // has been enabled.
+      if (Model == TLSModel::LocalDynamic &&
+          !EnableAArch64ELFLocalDynamicTLSGeneration)
+        Model = TLSModel::GeneralDynamic;
+    }
+
+    switch (Model) {
+    case TLSModel::InitialExec:
+      RefFlags |= AArch64MCExpr::VK_GOTTPREL;
+      break;
+    case TLSModel::LocalExec:
+      RefFlags |= AArch64MCExpr::VK_TPREL;
+      break;
+    case TLSModel::LocalDynamic:
+      RefFlags |= AArch64MCExpr::VK_DTPREL;
+      break;
+    case TLSModel::GeneralDynamic:
+      RefFlags |= AArch64MCExpr::VK_TLSDESC;
+      break;
+    }
+  } else {
+    // No modifier means this is a generic reference, classified as absolute for
+    // the cases where it matters (:abs_g0: etc).
+    RefFlags |= AArch64MCExpr::VK_ABS;
+  }
+
+  // Next record which fragment of the address is being referenced, if any.
+  const auto Fragment = TargetFlags & AArch64II::MO_FRAGMENT;
+  if (Fragment == AArch64II::MO_PAGE)
+    RefFlags |= AArch64MCExpr::VK_PAGE;
+  else if (Fragment == AArch64II::MO_PAGEOFF)
+    RefFlags |= AArch64MCExpr::VK_PAGEOFF;
+  else if (Fragment == AArch64II::MO_G3)
+    RefFlags |= AArch64MCExpr::VK_G3;
+  else if (Fragment == AArch64II::MO_G2)
+    RefFlags |= AArch64MCExpr::VK_G2;
+  else if (Fragment == AArch64II::MO_G1)
+    RefFlags |= AArch64MCExpr::VK_G1;
+  else if (Fragment == AArch64II::MO_G0)
+    RefFlags |= AArch64MCExpr::VK_G0;
+  else if (Fragment == AArch64II::MO_HI12)
+    RefFlags |= AArch64MCExpr::VK_HI12;
+
+  if (TargetFlags & AArch64II::MO_NC)
+    RefFlags |= AArch64MCExpr::VK_NC;
+
+  // Build "Sym [+ Offset]" and wrap it in the target-specific expression.
+  const MCExpr *Expr =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+  // The offset is only queried for operands that are not jump table indices.
+  if (!MO.isJTI()) {
+    if (const auto Offset = MO.getOffset())
+      Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx),
+                                     Ctx);
+  }
+
+  const auto RefKind = static_cast<AArch64MCExpr::VariantKind>(RefFlags);
+  return MCOperand::createExpr(AArch64MCExpr::create(Expr, RefKind, Ctx));
+}
+
+/// Lower a symbolic operand by dispatching on the target's object format:
+/// Darwin targets use the Darwin lowering, everything else is expected to be
+/// ELF.
+MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                                 MCSymbol *Sym) const {
+  const Triple &TT = Printer.TM.getTargetTriple();
+  if (!TT.isOSDarwin()) {
+    assert(TT.isOSBinFormatELF() && "Expect Darwin or ELF target");
+    return lowerSymbolOperandELF(MO, Sym);
+  }
+  return lowerSymbolOperandDarwin(MO, Sym);
+}
+
+/// Lower a single machine operand into \p MCOp.
+///
+/// Returns false for operands that have no MCInst counterpart (implicit
+/// registers and register masks), in which case \p MCOp is left untouched;
+/// returns true otherwise.
+bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
+                                      MCOperand &MCOp) const {
+  // Non-symbolic operands are handled and returned directly from the switch.
+  // Symbolic operands only pick their symbol here and share the lowering call
+  // after the switch.
+  MCSymbol *Sym = nullptr;
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit())
+      return false;
+    MCOp = MCOperand::createReg(MO.getReg());
+    return true;
+  case MachineOperand::MO_RegisterMask:
+    // Regmasks are like implicit defs.
+    return false;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::createImm(MO.getImm());
+    return true;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::createExpr(
+        MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+    return true;
+  case MachineOperand::MO_GlobalAddress:
+    Sym = GetGlobalAddressSymbol(MO);
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    Sym = GetExternalSymbolSymbol(MO);
+    break;
+  case MachineOperand::MO_MCSymbol:
+    Sym = MO.getMCSymbol();
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    Sym = Printer.GetJTISymbol(MO.getIndex());
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    Sym = Printer.GetCPISymbol(MO.getIndex());
+    break;
+  case MachineOperand::MO_BlockAddress:
+    Sym = Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+  }
+
+  MCOp = LowerSymbolOperand(MO, Sym);
+  return true;
+}
+
+/// Translate \p MI into \p OutMI: the opcode is copied over and every operand
+/// that lowerOperand accepts is appended in order.
+void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (const MachineOperand &MO : MI->operands()) {
+    MCOperand Lowered;
+    // Operands without an MCInst counterpart are dropped.
+    if (!lowerOperand(MO, Lowered))
+      continue;
+    OutMI.addOperand(Lowered);
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
new file mode 100644
index 000000000000..1e29b80c2d62
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
@@ -0,0 +1,52 @@
+//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCAsmInfo;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+/// AArch64MCInstLower - Helper that lowers a MachineInstr, and each of its
+/// operands, into the corresponding MCInst / MCOperand.
+class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower {
+  MCContext &Ctx;
+  AsmPrinter &Printer;
+  Triple TargetTriple;
+
+public:
+  AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer);
+
+  /// Lower the whole instruction \p MI into \p OutMI.
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  /// Lower one operand into \p MCOp. The boolean result tells the caller
+  /// whether \p MCOp should be added to the instruction.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+  /// Symbol lookup for global-address and external-symbol operands.
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+
+  /// Wrap \p Sym in an MCOperand for \p MO, plus the per-object-format
+  /// variants of that lowering.
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+  MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO,
+                                     MCSymbol *Sym) const;
+  MCOperand lowerSymbolOperandELF(const MachineOperand &MO,
+                                  MCSymbol *Sym) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
new file mode 100644
index 000000000000..ca2860afe13d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -0,0 +1,198 @@
+//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AArch64-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+
+namespace llvm {
+
+/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private AArch64-specific information for each MachineFunction.
+///
+/// Every scalar field is zero/false-initialized on construction, so all
+/// getters return a well-defined value even before the matching setter runs.
+class AArch64FunctionInfo final : public MachineFunctionInfo {
+
+  /// Number of bytes of arguments this function has on the stack. If the callee
+  /// is expected to restore the argument stack this should be a multiple of 16,
+  /// all usable during a tail call.
+  ///
+  /// The alternative would forbid tail call optimisation in some cases: if we
+  /// want to transfer control from a function with 8-bytes of stack-argument
+  /// space to a function with 16-bytes then misalignment of this value would
+  /// make a stack adjustment necessary, which could not be undone by the
+  /// callee.
+  unsigned BytesInStackArgArea;
+
+  /// The number of bytes to restore to deallocate space for incoming
+  /// arguments. Canonically 0 in the C calling convention, but non-zero when
+  /// callee is expected to pop the args.
+  unsigned ArgumentStackToRestore;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// determineCalleeSaves().
+  bool HasStackFrame;
+
+  /// \brief Amount of stack frame size, not including callee-saved registers.
+  unsigned LocalStackSize;
+
+  /// \brief Amount of stack frame size used for saving callee-saved registers.
+  unsigned CalleeSavedStackSize;
+
+  /// \brief Number of TLS accesses using the special (combinable)
+  /// _TLS_MODULE_BASE_ symbol.
+  unsigned NumLocalDynamicTLSAccesses;
+
+  /// \brief FrameIndex for start of varargs area for arguments passed on the
+  /// stack.
+  int VarArgsStackIndex;
+
+  /// \brief FrameIndex for start of varargs area for arguments passed in
+  /// general purpose registers.
+  int VarArgsGPRIndex;
+
+  /// \brief Size of the varargs area for arguments passed in general purpose
+  /// registers.
+  unsigned VarArgsGPRSize;
+
+  /// \brief FrameIndex for start of varargs area for arguments passed in
+  /// floating-point registers.
+  int VarArgsFPRIndex;
+
+  /// \brief Size of the varargs area for arguments passed in floating-point
+  /// registers.
+  unsigned VarArgsFPRSize;
+
+  /// True if this function has a subset of CSRs that is handled explicitly via
+  /// copies.
+  bool IsSplitCSR;
+
+  /// True when the stack gets realigned dynamically because the size of stack
+  /// frame is unknown at compile time. e.g., in case of VLAs.
+  bool StackRealigned;
+
+  /// True when the callee-save stack area has unused gaps that may be used for
+  /// other stack allocations.
+  bool CalleeSaveStackHasFreeSpace;
+
+public:
+  /// Zero-initialize every field. LocalStackSize and CalleeSavedStackSize are
+  /// included so that reading them before the corresponding setter has been
+  /// called yields 0 rather than an indeterminate value.
+  AArch64FunctionInfo()
+      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+        LocalStackSize(0), CalleeSavedStackSize(0),
+        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
+        IsSplitCSR(false), StackRealigned(false),
+        CalleeSaveStackHasFreeSpace(false) {}
+
+  /// Same initial state as the default constructor; \p MF is not used.
+  /// Delegating keeps the two constructors from drifting apart.
+  explicit AArch64FunctionInfo(MachineFunction &MF) : AArch64FunctionInfo() {
+    (void)MF;
+  }
+
+  unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
+  void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
+
+  unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+  void setArgumentStackToRestore(unsigned bytes) {
+    ArgumentStackToRestore = bytes;
+  }
+
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+  bool isStackRealigned() const { return StackRealigned; }
+  void setStackRealigned(bool s) { StackRealigned = s; }
+
+  bool hasCalleeSaveStackFreeSpace() const {
+    return CalleeSaveStackHasFreeSpace;
+  }
+  void setCalleeSaveStackHasFreeSpace(bool s) {
+    CalleeSaveStackHasFreeSpace = s;
+  }
+
+  bool isSplitCSR() const { return IsSplitCSR; }
+  void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+  void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
+  unsigned getLocalStackSize() const { return LocalStackSize; }
+
+  void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
+  unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
+
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
+  unsigned getNumLocalDynamicTLSAccesses() const {
+    return NumLocalDynamicTLSAccesses;
+  }
+
+  int getVarArgsStackIndex() const { return VarArgsStackIndex; }
+  void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
+
+  int getVarArgsGPRIndex() const { return VarArgsGPRIndex; }
+  void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; }
+
+  unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; }
+  void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; }
+
+  int getVarArgsFPRIndex() const { return VarArgsFPRIndex; }
+  void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; }
+
+  unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
+  void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
+
+  typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions;
+
+  /// Every instruction that has been passed as an argument to
+  /// addLOHDirective().
+  const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
+
+  // Shortcuts for LOH related types.
+  class MILOHDirective {
+    MCLOHType Kind;
+
+    /// Arguments of this directive. Order matters.
+    SmallVector<const MachineInstr *, 3> Args;
+
+  public:
+    typedef ArrayRef<const MachineInstr *> LOHArgs;
+
+    /// Copies \p Args into the directive; asserts that \p Kind is valid.
+    MILOHDirective(MCLOHType Kind, LOHArgs Args)
+        : Kind(Kind), Args(Args.begin(), Args.end()) {
+      assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!");
+    }
+
+    MCLOHType getKind() const { return Kind; }
+    LOHArgs getArgs() const { return Args; }
+  };
+
+  typedef MILOHDirective::LOHArgs MILOHArgs;
+  typedef SmallVector<MILOHDirective, 32> MILOHContainer;
+
+  const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
+
+  /// Add a LOH directive of this @p Kind and this @p Args. The instructions in
+  /// @p Args are also recorded in the LOH-related set.
+  void addLOHDirective(MCLOHType Kind, MILOHArgs Args) {
+    LOHContainerSet.push_back(MILOHDirective(Kind, Args));
+    LOHRelated.insert(Args.begin(), Args.end());
+  }
+
+private:
+  // Hold the lists of LOHs.
+  MILOHContainer LOHContainerSet;
+  SetOfInstructions LOHRelated;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
new file mode 100644
index 000000000000..038162c6f54a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -0,0 +1,383 @@
+//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains the AArch64 / Cortex-A57 specific register allocation
+// constraints for use by the PBQP register allocator.
+//
+// It is essentially a transcription of what is contained in
+// AArch64A57FPLoadBalancing, which tries to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply-accumulates.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64-pbqp"
+
+#include "AArch64.h"
+#include "AArch64PBQPRegAlloc.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocPBQP.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+#ifndef NDEBUG
+/// True when \p reg is a member of the FPR32, FPR64 or FPR128 register class.
+bool isFPReg(unsigned reg) {
+  if (AArch64::FPR32RegClass.contains(reg))
+    return true;
+  if (AArch64::FPR64RegClass.contains(reg))
+    return true;
+  return AArch64::FPR128RegClass.contains(reg);
+}
+#endif
+
+/// Report whether \p reg is an odd-numbered S, D or Q register.
+/// Any register outside S0-S31 / D0-D31 / Q0-Q31 hits llvm_unreachable.
+bool isOdd(unsigned reg) {
+  switch (reg) {
+  // Odd-numbered registers.
+  case AArch64::S1:  case AArch64::S3:  case AArch64::S5:  case AArch64::S7:
+  case AArch64::S9:  case AArch64::S11: case AArch64::S13: case AArch64::S15:
+  case AArch64::S17: case AArch64::S19: case AArch64::S21: case AArch64::S23:
+  case AArch64::S25: case AArch64::S27: case AArch64::S29: case AArch64::S31:
+  case AArch64::D1:  case AArch64::D3:  case AArch64::D5:  case AArch64::D7:
+  case AArch64::D9:  case AArch64::D11: case AArch64::D13: case AArch64::D15:
+  case AArch64::D17: case AArch64::D19: case AArch64::D21: case AArch64::D23:
+  case AArch64::D25: case AArch64::D27: case AArch64::D29: case AArch64::D31:
+  case AArch64::Q1:  case AArch64::Q3:  case AArch64::Q5:  case AArch64::Q7:
+  case AArch64::Q9:  case AArch64::Q11: case AArch64::Q13: case AArch64::Q15:
+  case AArch64::Q17: case AArch64::Q19: case AArch64::Q21: case AArch64::Q23:
+  case AArch64::Q25: case AArch64::Q27: case AArch64::Q29: case AArch64::Q31:
+    return true;
+
+  // Even-numbered registers.
+  case AArch64::S0:  case AArch64::S2:  case AArch64::S4:  case AArch64::S6:
+  case AArch64::S8:  case AArch64::S10: case AArch64::S12: case AArch64::S14:
+  case AArch64::S16: case AArch64::S18: case AArch64::S20: case AArch64::S22:
+  case AArch64::S24: case AArch64::S26: case AArch64::S28: case AArch64::S30:
+  case AArch64::D0:  case AArch64::D2:  case AArch64::D4:  case AArch64::D6:
+  case AArch64::D8:  case AArch64::D10: case AArch64::D12: case AArch64::D14:
+  case AArch64::D16: case AArch64::D18: case AArch64::D20: case AArch64::D22:
+  case AArch64::D24: case AArch64::D26: case AArch64::D28: case AArch64::D30:
+  case AArch64::Q0:  case AArch64::Q2:  case AArch64::Q4:  case AArch64::Q6:
+  case AArch64::Q8:  case AArch64::Q10: case AArch64::Q12: case AArch64::Q14:
+  case AArch64::Q16: case AArch64::Q18: case AArch64::Q20: case AArch64::Q22:
+  case AArch64::Q24: case AArch64::Q26: case AArch64::Q28: case AArch64::Q30:
+    return false;
+
+  default:
+    llvm_unreachable("Register is not from the expected class !");
+  }
+}
+
+/// True when \p reg1 and \p reg2 are both odd-numbered or both even-numbered
+/// according to isOdd(). Both are asserted to be FP registers.
+bool haveSameParity(unsigned reg1, unsigned reg2) {
+  assert(isFPReg(reg1) && "Expecting an FP register for reg1");
+  assert(isFPReg(reg2) && "Expecting an FP register for reg2");
+
+  const bool firstIsOdd = isOdd(reg1);
+  const bool secondIsOdd = isOdd(reg2);
+  return firstIsOdd == secondIsOdd;
+}
+
+}
+
+// Bias the PBQP edge between the virtual registers Rd and Ra so that
+// assignments giving them the same odd/even parity are cheaper than
+// assignments giving them different parity.
+//
+// Returns false, without touching the graph, when Rd == Ra or when either
+// register is physical. Otherwise an edge is created or updated and true is
+// returned.
+bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
+ unsigned Ra) {
+ if (Rd == Ra)
+ return false;
+
+ LiveIntervals &LIs = G.getMetadata().LIS;
+
+ if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) {
+ DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
+ << '\n');
+ DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
+ << '\n');
+ return false;
+ }
+
+ PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd);
+ PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(Ra);
+
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+ &G.getNodeMetadata(node1).getAllowedRegs();
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRaAllowed =
+ &G.getNodeMetadata(node2).getAllowedRegs();
+
+ PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2);
+
+ // The edge does not exist. Create one with the appropriate interference
+ // costs: infinity where the live ranges overlap and the physical registers
+ // overlap, otherwise 0 for same parity and 1 for different parity.
+ if (edge == G.invalidEdgeId()) {
+ const LiveInterval &ld = LIs.getInterval(Rd);
+ const LiveInterval &la = LIs.getInterval(Ra);
+ bool livesOverlap = ld.overlaps(la);
+
+ // The matrix has one extra row and column; entry [i + 1][j + 1] pairs the
+ // i-th allowed register of Rd with the j-th allowed register of Ra. Row and
+ // column 0 are left at the initial 0 (presumably the spill option - TODO
+ // confirm against the PBQP allocator).
+ PBQPRAGraph::RawMatrix costs(vRdAllowed->size() + 1,
+ vRaAllowed->size() + 1, 0);
+ for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+ unsigned pRd = (*vRdAllowed)[i];
+ for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRaAllowed)[j];
+ if (livesOverlap && TRI->regsOverlap(pRd, pRa))
+ costs[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+ else
+ costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0;
+ }
+ }
+ G.addEdge(node1, node2, std::move(costs));
+ return true;
+ }
+
+ // The existing edge may store the two nodes in the opposite order; swap so
+ // that the "Rd" side indexes the rows of the edge's cost matrix.
+ if (G.getEdgeNode1Id(edge) == node2) {
+ std::swap(node1, node2);
+ std::swap(vRdAllowed, vRaAllowed);
+ }
+
+ // For every row register, make each different-parity column choice cost more
+ // than the most expensive finite same-parity choice in that row.
+ PBQPRAGraph::RawMatrix costs(G.getEdgeCosts(edge));
+ for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+ unsigned pRd = (*vRdAllowed)[i];
+
+ // Get the maximum cost (excluding unallocatable reg) for same parity
+ // registers
+ // NOTE(review): if PBQPNum is a floating-point type, numeric_limits::min()
+ // is the smallest positive value rather than the most negative one, so
+ // zero-cost different-parity entries are raised even when no same-parity
+ // entry exists - confirm this seed is intended.
+ PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+ for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRaAllowed)[j];
+ if (haveSameParity(pRd, pRa))
+ if (costs[i + 1][j + 1] !=
+ std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+ costs[i + 1][j + 1] > sameParityMax)
+ sameParityMax = costs[i + 1][j + 1];
+ }
+
+ // Ensure all registers with a different parity have a higher cost
+ // than sameParityMax. Entries already above sameParityMax (including
+ // infinity) are left alone.
+ for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRaAllowed)[j];
+ if (!haveSameParity(pRd, pRa))
+ if (sameParityMax > costs[i + 1][j + 1])
+ costs[i + 1][j + 1] = sameParityMax + 1.0;
+ }
+ }
+ G.updateEdgeCosts(edge, std::move(costs));
+
+ return true;
+}
+
+// Record Rd as the current head of an accumulator chain and, for every other
+// live chain whose interval overlaps Rd's, bias the existing PBQP edge so that
+// giving the two chains the same parity costs more than giving them
+// different parity.
+//
+// If Ra already heads a chain, that chain is moved to Rd; otherwise a new
+// chain is started at Rd. An edge between Rd and each overlapping chain
+// register is required to exist already (asserted).
+void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
+                                                    unsigned Ra) {
+  LiveIntervals &LIs = G.getMetadata().LIS;
+
+  // Do some Chain management
+  if (Chains.count(Ra)) {
+    if (Rd != Ra) {
+      DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to "
+                   << PrintReg(Rd, TRI) << '\n';);
+      Chains.remove(Ra);
+      Chains.insert(Rd);
+    }
+  } else {
+    DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI)
+                 << '\n';);
+    Chains.insert(Rd);
+  }
+
+  // Rd's node id must stay fixed for the whole loop: every iteration pairs Rd
+  // with a different chain register. Previously the id was swapped in place
+  // inside the loop, so later iterations used another register's node.
+  const PBQPRAGraph::NodeId nodeRd = G.getMetadata().getNodeIdForVReg(Rd);
+
+  const LiveInterval &ld = LIs.getInterval(Rd);
+  for (auto r : Chains) {
+    // Skip self
+    if (r == Rd)
+      continue;
+
+    // Only chains that are live at the same time as Rd compete with it.
+    const LiveInterval &lr = LIs.getInterval(r);
+    if (!ld.overlaps(lr))
+      continue;
+
+    const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+        &G.getNodeMetadata(nodeRd).getAllowedRegs();
+
+    const PBQPRAGraph::NodeId nodeR = G.getMetadata().getNodeIdForVReg(r);
+    const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRrAllowed =
+        &G.getNodeMetadata(nodeR).getAllowedRegs();
+
+    PBQPRAGraph::EdgeId edge = G.findEdge(nodeRd, nodeR);
+    assert(edge != G.invalidEdgeId() &&
+           "PBQP error ! The edge should exist !");
+
+    DEBUG(dbgs() << "Refining constraint !\n";);
+
+    // The edge may store its endpoints in the opposite order. Only the
+    // allowed-register views need swapping so that vRdAllowed indexes the rows
+    // of the cost matrix; the node ids themselves are not used again.
+    if (G.getEdgeNode1Id(edge) == nodeR)
+      std::swap(vRdAllowed, vRrAllowed);
+
+    // Enforce that cost is higher with all other Chains of the same parity
+    PBQP::Matrix costs(G.getEdgeCosts(edge));
+    for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+      unsigned pRd = (*vRdAllowed)[i];
+
+      // Get the maximum cost (excluding unallocatable reg) for all other
+      // parity registers
+      PBQP::PBQPNum otherParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+      for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+        unsigned pRa = (*vRrAllowed)[j];
+        if (!haveSameParity(pRd, pRa))
+          if (costs[i + 1][j + 1] !=
+                  std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+              costs[i + 1][j + 1] > otherParityMax)
+            otherParityMax = costs[i + 1][j + 1];
+      }
+
+      // Ensure all registers with same parity have a higher cost
+      // than otherParityMax
+      for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+        unsigned pRa = (*vRrAllowed)[j];
+        if (haveSameParity(pRd, pRa))
+          if (otherParityMax > costs[i + 1][j + 1])
+            costs[i + 1][j + 1] = otherParityMax + 1.0;
+      }
+    }
+    G.updateEdgeCosts(edge, std::move(costs));
+  }
+}
+
+/// True when the live interval of \p reg has expired at the slot index of
+/// \p MI.
+static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg,
+                                const MachineInstr &MI) {
+  const SlotIndex InstrSlot = LIs.getInstructionIndex(MI);
+  return LIs.getInterval(reg).expiredAt(InstrSlot);
+}
+
+// Walk every instruction of the function and add the A57 accumulator-chain
+// constraints to the PBQP graph G.
+//
+// Chains are tracked per basic block. Before each instruction, chains whose
+// register has expired are dropped. Scalar FP multiply-accumulates get an
+// intra-chain constraint between Rd and Ra and, when one was added, an
+// inter-chain constraint; FMLA/FMLS v2f32 get an inter-chain constraint only.
+void A57ChainingConstraint::apply(PBQPRAGraph &G) {
+  const MachineFunction &MF = G.getMetadata().MF;
+  LiveIntervals &LIs = G.getMetadata().LIS;
+
+  TRI = MF.getSubtarget().getRegisterInfo();
+  DEBUG(MF.dump());
+
+  for (const auto &MBB: MF) {
+    Chains.clear(); // FIXME: really needed ? Could not work at MF level ?
+
+    for (const auto &MI: MBB) {
+
+      // Forget Chains which have expired. The expired registers are collected
+      // first and removed once the iteration is over: removing from Chains
+      // while range-iterating it would invalidate the iterators.
+      SmallVector<unsigned, 8> toDel;
+      for (auto r : Chains) {
+        if (regJustKilledBefore(LIs, r, MI)) {
+          DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at ";
+                MI.print(dbgs()););
+          toDel.push_back(r);
+        }
+      }
+      for (unsigned r : toDel)
+        Chains.remove(r);
+
+      switch (MI.getOpcode()) {
+      case AArch64::FMSUBSrrr:
+      case AArch64::FMADDSrrr:
+      case AArch64::FNMSUBSrrr:
+      case AArch64::FNMADDSrrr:
+      case AArch64::FMSUBDrrr:
+      case AArch64::FMADDDrrr:
+      case AArch64::FNMSUBDrrr:
+      case AArch64::FNMADDDrrr: {
+        // Operand 0 is the destination, operand 3 the accumulator input.
+        unsigned Rd = MI.getOperand(0).getReg();
+        unsigned Ra = MI.getOperand(3).getReg();
+
+        if (addIntraChainConstraint(G, Rd, Ra))
+          addInterChainConstraint(G, Rd, Ra);
+        break;
+      }
+
+      case AArch64::FMLAv2f32:
+      case AArch64::FMLSv2f32: {
+        // Rd is passed as both arguments, so only the inter-chain constraint
+        // is added for it.
+        unsigned Rd = MI.getOperand(0).getReg();
+        addInterChainConstraint(G, Rd, Rd);
+        break;
+      }
+
+      default:
+        break;
+      }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
new file mode 100644
index 000000000000..4f656f94ea12
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
@@ -0,0 +1,38 @@
+//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
+
+namespace llvm {
+
+/// PBQP constraint that adds the accumulator chaining costs to a PBQP graph.
+class A57ChainingConstraint : public PBQPRAConstraint {
+public:
+  /// Add A57 specific constraints to the PBQP graph.
+  void apply(PBQPRAGraph &G) override;
+
+private:
+  /// Add the accumulator chaining constraint, inside the chain, i.e. so that
+  /// parity(Rd) == parity(Ra).
+  /// \return true if a constraint was added
+  bool addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+
+  /// Add constraints between existing chains.
+  void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+
+  /// Registers that currently head an accumulator chain.
+  SmallSetVector<unsigned, 32> Chains;
+
+  /// Target register info; assigned at the start of apply().
+  const TargetRegisterInfo *TRI;
+};
+}
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
new file mode 100644
index 000000000000..9e9eec48c555
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -0,0 +1,6591 @@
+//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using AdvSIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table has 6561 entries (6561*4 = 26244 bytes); the array is declared with one extra element.
+static const unsigned PerfectShuffleTable[6561+1] = {
+ 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+ 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+ 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+ 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+ 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+ 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+ 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+ 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+ 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+ 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+ 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+ 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+ 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+ 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+ 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+ 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+ 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+ 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+ 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+ 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+ 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+ 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+ 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+ 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+ 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+ 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+ 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+ 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+ 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+ 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+ 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+ 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+ 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+ 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+ 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+ 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+ 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+ 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+ 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+ 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+ 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+ 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+ 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+ 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+ 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+ 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+ 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+ 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+ 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+ 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+ 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+ 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+ 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+ 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+ 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+ 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+ 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+ 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+ 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+ 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+ 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+ 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+ 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+ 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+ 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+ 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+ 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+ 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+ 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+ 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+ 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+ 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+ 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+ 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+ 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+ 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+ 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+ 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+ 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+ 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+ 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+ 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+ 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+ 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+ 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+ 835584U, // <0,1,2,3>: Cost 0 copy LHS
+ 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+ 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+ 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+ 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+ 835584U, // <0,1,2,u>: Cost 0 copy LHS
+ 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+ 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+ 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+ 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+ 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+ 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+ 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+ 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+ 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+ 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+ 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+ 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+ 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+ 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+ 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+ 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+ 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+ 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+ 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+ 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+ 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+ 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+ 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+ 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+ 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+ 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+ 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+ 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+ 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+ 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+ 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+ 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+ 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+ 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+ 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+ 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+ 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+ 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+ 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+ 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+ 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+ 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+ 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+ 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+ 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+ 835584U, // <0,1,u,3>: Cost 0 copy LHS
+ 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+ 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+ 835584U, // <0,1,u,u>: Cost 0 copy LHS
+ 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+ 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+ 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+ 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+ 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+ 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+ 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+ 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+ 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+ 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+ 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+ 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+ 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+ 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+ 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+ 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+ 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+ 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+ 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+ 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+ 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+ 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+ 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+ 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+ 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+ 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+ 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+ 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+ 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+ 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+ 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+ 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+ 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+ 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+ 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+ 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+ 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+ 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+ 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+ 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+ 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+ 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+ 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+ 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+ 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+ 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+ 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+ 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+ 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+ 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+ 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+ 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+ 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+ 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+ 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+ 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+ 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+ 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+ 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+ 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+ 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+ 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+ 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+ 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+ 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+ 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+ 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+ 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+ 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+ 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+ 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+ 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+ 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+ 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+ 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+ 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+ 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+ 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+ 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+ 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+ 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+ 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+ 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+ 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+ 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+ 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+ 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+ 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+ 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+ 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+ 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+ 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+ 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+ 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+ 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+ 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+ 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+ 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+ 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+ 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+ 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+ 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+ 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+ 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+ 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+ 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+ 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+ 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+ 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+ 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+ 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+ 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+ 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+ 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+ 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+ 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+ 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+ 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+ 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+ 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+ 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+ 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+ 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+ 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+ 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+ 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+ 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+ 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+ 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+ 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+ 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+ 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+ 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+ 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+ 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+ 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+ 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+ 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+ 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+ 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+ 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+ 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+ 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+ 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+ 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+ 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+ 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+ 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+ 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+ 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+ 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+ 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+ 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+ 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+ 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+ 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+ 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+ 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+ 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+ 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+ 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+ 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+ 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+ 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+ 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+ 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+ 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+ 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+ 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+ 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+ 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+ 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+ 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+ 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+ 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+ 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+ 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+ 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+ 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+ 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+ 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+ 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+ 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+ 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+ 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+ 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+ 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+ 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+ 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+ 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+ 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+ 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+ 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+ 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+ 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+ 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+ 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+ 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+ 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+ 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+ 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+ 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+ 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+ 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+ 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+ 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+ 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+ 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+ 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+ 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+ 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+ 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+ 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+ 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+ 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+ 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+ 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+ 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+ 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+ 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+ 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+ 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+ 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+ 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+ 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+ 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+ 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+ 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+ 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+ 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+ 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+ 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+ 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+ 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+ 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+ 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+ 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+ 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+ 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+ 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+ 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+ 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+ 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+ 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+ 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+ 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+ 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+ 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+ 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+ 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+ 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+ 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+ 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+ 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+ 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+ 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+ 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+ 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+ 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+ 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+ 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+ 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+ 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+ 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+ 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+ 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+ 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+ 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+ 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+ 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+ 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+ 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+ 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+ 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+ 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+ 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+ 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+ 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+ 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+ 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+ 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+ 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+ 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+ 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+ 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+ 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+ 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+ 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+ 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+ 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+ 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+ 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+ 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+ 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+ 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+ 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+ 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+ 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+ 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+ 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+ 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+ 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+ 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+ 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+ 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+ 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+ 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+ 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+ 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+ 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+ 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+ 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+ 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+ 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+ 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+ 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+ 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+ 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+ 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+ 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+ 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+ 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+ 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+ 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+ 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+ 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+ 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+ 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+ 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+ 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+ 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+ 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+ 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+ 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+ 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+ 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+ 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+ 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+ 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+ 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+ 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+ 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+ 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+ 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+ 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+ 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+ 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+ 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+ 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+ 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+ 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+ 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+ 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+ 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+ 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+ 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+ 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+ 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+ 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+ 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+ 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+ 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+ 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+ 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+ 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+ 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+ 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+ 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+ 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+ 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+ 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+ 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+ 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+ 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+ 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+ 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+ 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+ 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+ 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+ 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+ 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+ 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+ 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+ 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+ 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+ 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+ 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+ 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+ 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+ 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+ 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+ 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+ 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+ 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+ 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+ 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+ 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+ 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+ 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+ 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+ 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+ 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+ 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+ 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+ 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+ 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+ 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+ 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+ 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+ 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+ 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+ 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+ 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+ 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+ 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+ 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+ 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+ 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+ 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+ 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+ 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+ 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+ 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+ 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+ 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+ 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+ 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+ 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+ 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+ 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+ 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+ 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+ 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+ 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+ 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+ 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+ 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+ 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+ 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+ 835584U, // <0,u,2,3>: Cost 0 copy LHS
+ 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+ 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+ 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+ 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+ 835584U, // <0,u,2,u>: Cost 0 copy LHS
+ 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+ 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+ 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+ 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+ 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+ 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+ 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+ 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+ 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+ 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+ 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+ 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+ 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+ 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+ 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+ 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+ 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+ 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+ 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+ 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+ 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+ 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+ 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+ 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+ 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+ 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+ 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+ 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+ 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+ 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+ 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+ 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+ 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+ 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+ 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+ 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+ 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+ 835584U, // <0,u,u,3>: Cost 0 copy LHS
+ 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+ 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+ 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+ 835584U, // <0,u,u,u>: Cost 0 copy LHS
+ 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+ 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+ 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+ 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+ 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+ 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+ 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+ 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+ 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+ 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+ 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+ 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+ 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+ 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+ 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+ 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+ 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+ 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+ 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+ 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+ 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+ 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+ 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+ 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+ 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+ 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+ 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+ 67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+ 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+ 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+ 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+ 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+ 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+ 68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+ 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+ 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+ 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+ 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+ 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+ 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+ 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+ 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+ 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+ 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+ 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+ 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+ 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+ 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+ 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+ 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+ 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+ 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+ 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+ 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+ 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+ 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+ 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+ 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+ 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+ 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+ 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+ 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+ 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+ 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+ 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+ 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+ 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+ 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+ 67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+ 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+ 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+ 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+ 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+ 68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+ 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+ 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+ 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+ 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+ 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+ 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+ 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+ 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+ 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+ 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+ 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+ 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+ 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+ 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+ 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+ 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+ 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+ 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+ 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+ 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+ 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+ 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+ 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+ 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+ 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+ 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+ 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+ 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+ 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+ 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+ 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+ 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+ 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+ 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+ 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+ 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+ 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+ 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+ 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+ 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+ 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+ 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+ 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+ 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+ 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+ 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+ 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+ 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+ 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+ 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+ 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+ 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+ 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+ 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+ 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+ 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+ 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+ 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+ 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+ 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+ 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+ 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+ 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+ 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+ 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+ 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+ 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+ 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+ 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+ 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+ 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+ 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+ 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+ 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+ 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+ 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+ 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+ 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+ 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+ 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+ 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+ 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+ 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+ 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+ 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+ 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+ 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+ 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+ 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+ 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+ 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+ 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+ 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+ 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+ 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+ 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+ 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+ 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+ 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+ 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+ 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+ 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+ 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+ 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+ 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+ 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+ 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+ 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+ 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+ 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+ 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+ 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+ 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+ 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+ 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+ 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+ 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+ 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+ 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+ 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+ 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+ 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+ 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+ 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+ 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+ 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+ 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+ 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+ 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+ 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+ 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+ 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+ 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+ 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+ 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+ 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+ 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+ 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+ 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+ 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+ 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+ 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+ 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+ 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+ 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+ 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+ 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+ 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+ 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+ 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+ 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+ 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+ 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+ 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+ 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+ 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+ 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+ 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+ 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+ 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+ 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+ 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+ 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+ 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+ 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+ 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+ 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+ 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+ 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+ 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+ 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+ 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+ 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+ 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+ 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+ 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+ 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+ 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+ 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+ 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+ 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+ 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+ 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+ 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+ 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+ 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+ 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+ 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+ 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+ 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+ 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+ 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+ 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+ 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+ 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+ 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+ 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+ 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+ 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+ 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+ 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+ 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+ 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+ 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+ 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+ 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+ 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+ 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+ 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+ 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+ 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+ 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+ 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+ 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+ 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+ 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+ 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+ 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+ 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+ 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+ 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+ 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+ 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+ 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+ 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+ 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+ 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+ 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+ 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+ 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+ 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+ 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+ 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+ 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+ 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+ 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+ 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+ 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+ 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+ 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+ 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+ 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+ 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+ 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+ 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+ 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+ 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+ 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+ 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+ 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+ 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+ 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+ 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+ 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+ 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+ 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+ 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+ 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+ 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+ 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+ 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+ 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+ 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+ 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+ 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+ 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+ 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+ 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+ 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+ 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+ 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+ 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+ 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+ 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+ 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+ 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+ 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+ 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+ 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+ 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+ 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+ 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+ 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+ 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+ 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+ 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+ 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+ 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+ 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+ 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+ 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+ 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+ 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+ 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+ 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+ 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+ 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+ 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+ 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+ 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+ 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+ 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+ 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+ 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+ 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+ 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+ 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+ 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+ 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+ 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+ 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+ 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+ 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+ 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+ 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+ 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+ 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+ 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+ 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+ 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+ 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+ 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+ 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+ 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+ 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+ 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+ 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+ 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+ 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+ 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+ 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+ 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+ 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+ 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+ 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+ 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+ 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+ 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+ 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+ 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+ 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+ 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+ 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+ 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+ 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+ 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+ 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+ 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+ 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+ 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+ 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+ 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+ 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+ 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+ 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+ 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+ 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+ 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+ 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+ 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+ 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+ 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+ 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+ 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+ 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+ 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+ 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+ 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+ 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+ 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+ 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+ 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+ 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+ 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+ 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+ 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+ 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+ 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+ 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+ 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+ 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+ 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+ 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+ 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+ 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+ 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+ 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+ 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+ 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+ 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+ 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+ 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+ 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+ 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+ 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+ 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+ 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+ 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+ 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+ 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+ 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+ 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+ 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+ 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+ 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+ 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+ 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+ 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+ 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+ 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+ 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+ 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+ 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+ 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+ 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+ 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+ 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+ 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+ 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+ 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+ 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+ 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+ 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+ 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+ 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+ 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+ 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+ 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+ 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+ 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+ 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+ 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+ 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+ 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+ 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+ 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+ 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+ 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+ 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+ 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+ 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+ 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+ 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+ 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+ 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+ 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+ 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+ 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+ 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+ 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+ 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+ 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+ 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+ 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+ 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+ 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+ 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+ 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+ 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+ 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+ 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+ 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+ 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+ 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+ 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+ 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+ 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+ 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+ 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+ 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+ 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+ 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+ 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+ 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+ 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+ 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+ 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+ 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+ 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+ 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+ 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+ 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+ 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+ 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+ 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+ 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+ 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+ 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+ 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+ 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+ 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+ 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+ 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+ 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+ 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+ 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+ 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+ 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+ 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+ 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+ 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+ 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+ 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+ 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+ 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+ 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+ 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+ 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+ 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+ 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+ 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+ 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+ 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+ 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+ 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+ 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+ 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+ 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+ 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+ 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+ 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+ 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+ 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+ 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+ 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+ 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+ 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+ 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+ 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+ 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+ 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+ 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+ 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+ 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+ 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+ 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+ 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+ 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+ 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+ 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+ 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+ 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+ 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+ 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+ 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+ 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+ 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+ 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+ 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+ 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+ 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+ 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+ 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+ 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+ 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+ 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+ 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+ 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+ 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+ 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+ 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+ 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+ 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+ 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+ 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+ 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+ 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+ 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+ 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+ 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+ 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+ 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+ 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+ 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+ 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+ 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+ 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+ 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+ 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+ 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+ 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+ 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+ 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+ 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+ 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+ 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+ 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+ 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+ 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+ 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+ 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+ 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+ 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+ 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+ 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+ 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+ 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+ 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+ 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+ 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+ 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+ 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+ 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+ 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+ 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+ 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+ 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+ 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+ 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+ 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+ 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+ 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+ 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+ 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+ 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+ 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+ 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+ 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+ 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+ 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+ 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+ 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+ 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+ 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+ 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+ 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+ 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+ 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+ 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+ 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+ 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+ 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+ 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+ 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+ 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+ 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+ 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+ 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+ 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+ 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+ 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+ 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+ 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+ 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+ 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+ 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+ 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+ 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+ 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+ 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+ 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+ 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+ 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+ 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+ 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+ 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+ 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+ 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+ 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+ 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+ 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+ 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+ 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+ 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+ 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+ 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+ 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+ 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+ 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+ 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+ 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+ 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+ 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+ 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+ 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+ 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+ 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+ 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+ 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+ 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+ 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+ 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+ 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+ 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+ 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+ 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+ 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+ 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+ 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+ 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+ 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+ 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+ 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+ 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+ 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+ 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+ 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+ 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+ 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+ 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+ 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+ 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+ 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+ 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+ 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+ 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+ 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+ 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+ 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+ 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+ 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+ 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+ 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+ 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+ 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+ 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+ 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+ 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+ 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+ 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+ 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+ 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+ 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+ 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+ 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+ 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+ 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+ 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+ 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+ 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+ 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+ 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+ 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+ 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+ 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+ 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+ 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+ 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+ 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+ 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+ 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+ 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+ 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+ 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+ 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+ 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+ 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+ 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+ 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+ 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+ 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+ 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+ 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+ 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+ 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+ 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+ 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+ 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+ 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+ 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+ 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+ 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+ 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+ 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+ 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+ 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+ 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+ 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+ 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+ 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+ 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+ 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+ 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+ 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+ 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+ 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+ 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+ 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+ 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+ 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+ 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+ 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+ 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+ 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+ 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+ 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+ 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+ 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+ 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+ 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+ 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+ 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+ 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+ 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+ 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+ 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+ 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+ 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+ 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+ 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+ 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+ 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+ 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+ 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+ 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+ 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+ 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+ 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+ 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+ 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+ 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+ 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+ 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+ 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+ 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+ 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+ 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+ 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+ 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+ 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+ 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+ 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+ 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+ 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+ 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+ 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+ 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+ 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+ 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+ 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+ 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+ 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+ 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+ 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+ 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+ 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+ 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+ 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+ 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+ 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+ 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+ 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+ 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+ 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+ 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+ 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+ 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+ 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+ 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+ 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+ 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+ 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+ 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+ 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+ 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+ 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+ 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+ 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+ 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+ 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+ 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+ 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+ 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+ 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+ 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+ 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+ 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+ 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+ 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+ 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+ 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+ 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+ 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+ 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+ 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+ 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+ 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+ 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+ 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+ 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+ 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+ 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+ 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+ 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+ 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+ 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+ 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+ 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+ 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+ 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+ 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+ 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+ 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+ 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+ 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+ 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+ 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+ 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+ 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+ 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+ 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+ 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+ 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+ 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+ 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+ 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+ 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+ 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+ 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+ 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+ 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+ 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+ 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+ 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+ 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+ 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+ 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+ 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+ 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+ 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+ 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+ 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+ 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+ 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+ 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+ 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+ 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+ 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+ 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+ 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+ 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+ 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+ 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+ 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+ 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+ 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+ 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+ 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+ 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+ 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+ 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+ 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+ 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+ 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+ 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+ 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+ 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+ 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+ 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+ 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+ 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+ 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+ 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+ 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+ 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+ 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+ 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+ 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+ 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+ 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+ 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+ 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+ 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+ 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+ 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+ 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+ 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+ 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+ 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+ 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+ 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+ 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+ 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+ 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+ 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+ 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+ 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+ 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+ 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+ 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+ 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+ 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+ 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+ 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+ 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+ 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+ 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+ 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+ 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+ 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+ 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+ 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+ 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+ 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+ 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+ 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+ 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+ 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+ 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+ 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+ 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+ 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+ 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+ 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+ 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+ 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+ 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+ 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+ 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+ 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+ 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+ 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+ 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+ 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+ 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+ 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+ 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+ 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+ 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+ 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+ 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+ 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+ 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+ 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+ 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+ 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+ 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+ 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+ 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+ 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+ 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+ 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+ 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+ 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+ 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+ 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+ 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+ 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+ 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+ 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+ 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+ 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+ 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+ 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+ 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+ 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+ 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+ 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+ 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+ 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+ 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+ 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+ 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+ 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+ 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+ 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+ 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+ 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+ 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+ 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+ 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+ 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+ 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+ 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+ 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+ 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+ 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+ 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+ 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+ 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+ 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+ 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+ 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+ 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+ 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+ 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+ 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+ 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+ 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+ 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+ 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+ 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+ 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+ 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+ 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+ 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+ 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+ 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+ 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+ 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+ 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+ 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+ 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+ 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+ 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+ 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+ 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+ 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+ 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+ 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+ 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+ 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+ 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+ 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+ 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+ 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+ 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+ 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+ 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+ 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+ 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+ 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+ 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+ 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+ 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+ 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+ 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+ 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+ 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+ 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+ 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+ 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+ 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+ 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+ 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+ 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+ 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+ 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+ 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+ 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+ 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+ 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+ 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+ 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+ 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+ 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+ 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+ 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+ 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+ 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+ 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+ 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+ 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+ 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+ 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+ 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+ 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+ 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+ 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+ 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+ 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+ 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+ 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+ 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+ 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+ 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+ 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+ 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+ 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+ 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+ 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+ 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+ 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+ 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+ 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+ 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+ 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+ 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+ 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+ 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+ 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+ 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+ 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+ 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+ 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+ 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+ 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+ 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+ 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+ 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+ 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+ 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+ 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+ 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+ 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+ 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+ 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+ 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+ 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+ 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+ 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+ 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+ 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+ 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+ 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+ 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+ 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+ 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+ 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+ 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+ 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+ 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+ 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+ 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+ 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+ 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+ 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+ 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+ 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+ 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+ 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+ 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+ 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+ 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+ 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+ 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+ 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+ 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+ 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+ 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+ 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+ 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+ 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+ 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+ 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+ 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+ 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+ 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+ 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+ 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+ 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+ 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+ 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+ 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+ 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+ 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+ 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+ 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+ 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+ 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+ 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+ 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+ 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+ 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+ 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+ 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+ 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+ 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+ 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+ 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+ 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+ 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+ 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+ 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+ 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+ 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+ 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+ 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+ 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+ 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+ 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+ 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+ 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+ 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+ 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+ 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+ 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+ 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+ 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+ 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+ 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+ 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+ 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+ 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+ 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+ 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+ 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+ 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+ 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+ 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+ 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+ 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+ 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+ 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+ 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+ 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+ 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+ 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+ 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+ 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+ 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+ 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+ 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+ 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+ 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+ 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+ 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+ 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+ 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+ 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+ 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+ 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+ 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+ 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+ 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+ 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+ 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+ 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+ 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+ 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+ 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+ 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+ 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+ 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+ 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+ 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+ 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+ 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+ 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+ 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+ 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+ 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+ 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+ 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+ 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+ 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+ 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+ 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+ 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+ 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+ 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+ 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+ 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+ 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+ 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+ 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+ 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+ 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+ 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+ 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+ 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+ 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+ 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+ 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+ 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+ 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+ 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+ 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+ 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+ 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+ 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+ 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+ 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+ 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+ 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+ 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+ 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+ 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+ 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+ 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+ 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+ 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+ 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+ 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+ 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+ 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+ 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+ 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+ 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+ 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+ 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+ 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+ 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+ 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+ 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+ 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+ 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+ 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+ 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+ 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+ 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+ 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+ 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+ 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+ 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+ 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+ 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+ 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+ 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+ 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+ 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+ 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+ 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+ 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+ 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+ 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+ 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+ 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+ 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+ 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+ 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+ 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+ 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+ 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+ 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+ 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+ 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+ 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+ 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+ 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+ 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+ 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+ 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+ 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+ 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+ 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+ 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+ 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+ 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+ 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+ 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+ 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+ 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+ 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+ 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+ 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+ 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+ 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+ 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+ 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+ 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+ 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+ 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+ 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+ 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+ 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+ 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+ 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+ 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+ 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+ 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+ 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+ 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+ 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+ 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+ 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+ 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+ 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+ 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+ 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+ 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+ 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+ 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+ 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+ 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+ 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+ 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+ 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+ 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+ 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+ 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+ 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+ 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+ 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+ 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+ 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+ 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+ 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+ 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+ 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+ 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+ 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+ 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+ 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+ 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+ 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+ 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+ 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+ 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+ 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+ 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+ 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+ 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+ 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+ 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+ 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+ 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+ 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+ 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+ 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+ 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+ 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+ 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+ 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+ 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+ 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+ 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+ 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+ 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+ 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+ 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+ 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+ 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+ 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+ 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+ 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+ 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+ 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+ 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+ 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+ 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+ 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+ 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+ 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+ 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+ 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+ 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+ 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+ 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+ 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+ 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+ 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+ 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+ 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+ 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+ 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+ 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+ 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+ 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+ 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+ 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+ 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+ 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+ 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+ 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+ 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+ 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+ 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+ 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+ 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+ 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+ 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+ 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+ 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+ 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+ 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+ 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+ 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+ 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+ 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+ 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+ 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+ 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+ 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+ 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+ 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+ 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+ 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+ 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+ 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+ 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+ 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+ 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+ 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+ 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+ 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+ 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+ 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+ 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+ 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+ 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+ 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+ 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+ 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+ 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+ 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+ 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+ 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+ 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+ 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+ 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+ 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+ 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+ 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+ 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+ 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+ 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+ 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+ 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+ 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+ 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+ 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+ 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+ 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+ 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+ 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+ 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+ 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+ 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+ 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+ 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+ 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+ 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+ 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+ 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+ 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+ 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+ 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+ 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+ 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+ 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+ 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+ 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+ 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+ 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+ 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+ 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+ 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+ 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+ 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+ 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+ 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+ 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+ 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+ 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+ 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+ 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+ 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+ 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+ 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+ 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+ 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+ 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+ 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+ 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+ 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+ 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+ 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+ 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+ 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+ 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+ 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+ 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+ 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+ 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+ 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+ 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+ 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+ 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+ 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+ 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+ 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+ 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+ 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+ 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+ 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+ 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+ 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+ 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+ 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+ 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+ 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+ 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+ 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+ 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+ 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+ 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+ 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+ 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+ 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+ 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+ 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+ 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+ 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+ 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+ 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+ 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+ 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+ 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+ 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+ 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+ 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+ 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+ 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+ 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+ 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+ 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+ 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+ 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+ 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+ 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+ 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+ 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+ 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+ 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+ 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+ 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+ 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+ 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+ 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+ 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+ 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+ 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+ 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+ 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+ 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+ 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+ 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+ 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+ 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+ 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+ 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+ 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+ 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+ 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+ 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+ 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+ 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+ 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+ 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+ 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+ 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+ 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+ 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+ 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+ 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+ 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+ 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+ 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+ 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+ 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+ 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+ 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+ 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+ 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+ 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+ 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+ 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+ 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+ 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+ 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+ 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+ 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+ 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+ 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+ 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+ 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+ 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+ 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+ 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+ 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+ 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+ 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+ 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+ 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+ 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+ 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+ 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+ 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+ 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+ 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+ 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+ 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+ 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+ 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+ 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+ 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+ 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+ 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+ 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+ 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+ 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+ 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+ 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+ 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+ 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+ 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+ 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+ 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+ 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+ 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+ 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+ 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+ 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+ 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+ 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+ 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+ 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+ 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+ 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+ 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+ 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+ 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+ 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+ 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+ 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+ 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+ 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+ 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+ 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+ 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+ 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+ 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+ 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+ 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+ 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+ 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+ 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+ 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+ 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+ 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+ 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+ 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+ 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+ 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+ 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+ 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+ 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+ 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+ 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+ 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+ 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+ 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+ 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+ 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+ 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+ 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+ 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+ 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+ 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+ 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+ 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+ 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+ 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+ 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+ 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+ 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+ 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+ 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+ 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+ 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+ 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+ 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+ 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+ 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+ 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+ 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+ 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+ 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+ 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+ 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+ 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+ 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+ 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+ 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+ 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+ 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+ 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+ 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+ 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+ 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+ 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+ 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+ 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+ 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+ 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+ 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+ 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+ 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+ 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+ 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+ 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+ 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+ 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+ 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+ 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+ 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+ 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+ 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+ 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+ 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+ 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+ 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+ 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+ 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+ 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+ 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+ 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+ 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+ 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+ 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+ 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+ 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+ 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+ 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+ 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+ 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+ 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+ 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+ 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+ 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+ 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+ 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+ 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+ 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+ 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+ 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+ 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+ 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+ 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+ 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+ 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+ 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+ 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+ 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+ 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+ 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+ 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+ 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+ 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+ 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+ 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+ 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+ 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+ 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+ 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+ 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+ 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+ 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+ 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+ 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+ 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+ 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+ 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+ 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+ 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+ 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+ 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+ 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+ 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+ 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+ 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+ 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+ 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+ 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+ 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+ 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+ 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+ 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+ 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+ 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+ 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+ 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+ 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+ 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+ 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+ 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+ 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+ 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+ 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+ 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+ 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+ 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+ 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+ 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+ 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+ 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+ 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+ 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+ 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+ 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+ 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+ 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+ 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+ 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+ 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+ 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+ 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+ 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+ 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+ 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+ 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+ 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+ 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+ 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+ 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+ 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+ 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+ 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+ 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+ 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+ 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+ 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+ 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+ 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+ 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+ 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+ 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+ 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+ 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+ 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+ 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+ 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+ 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+ 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+ 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+ 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+ 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+ 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+ 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+ 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+ 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+ 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+ 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+ 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+ 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+ 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+ 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+ 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+ 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+ 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+ 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+ 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+ 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+ 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+ 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+ 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+ 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+ 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+ 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+ 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+ 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+ 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+ 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+ 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+ 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+ 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+ 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+ 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+ 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+ 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+ 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+ 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+ 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+ 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+ 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+ 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+ 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+ 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+ 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+ 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+ 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+ 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+ 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+ 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+ 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+ 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+ 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+ 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+ 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+ 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+ 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+ 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+ 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+ 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+ 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+ 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+ 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+ 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+ 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+ 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+ 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+ 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+ 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+ 27705344U, // <4,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,6,u>: Cost 0 copy RHS
+ 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+ 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+ 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+ 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+ 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+ 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+ 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+ 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+ 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+ 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+ 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 27705344U, // <4,u,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,u,u>: Cost 0 copy RHS
+ 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+ 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+ 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+ 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+ 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+ 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+ 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+ 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+ 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+ 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+ 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+ 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+ 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+ 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+ 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+ 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+ 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+ 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+ 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+ 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+ 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+ 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+ 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+ 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+ 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+ 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+ 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+ 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+ 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+ 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+ 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+ 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+ 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+ 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+ 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+ 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+ 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+ 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+ 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+ 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+ 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+ 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+ 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+ 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+ 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+ 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+ 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+ 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+ 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+ 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+ 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+ 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+ 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+ 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+ 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+ 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+ 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+ 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+ 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+ 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+ 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+ 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+ 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+ 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+ 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+ 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+ 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+ 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+ 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+ 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+ 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+ 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+ 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+ 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+ 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+ 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+ 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+ 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+ 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+ 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+ 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+ 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+ 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+ 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+ 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+ 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+ 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+ 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+ 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+ 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+ 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+ 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+ 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+ 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+ 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+ 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+ 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+ 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+ 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+ 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+ 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+ 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+ 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+ 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+ 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+ 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+ 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+ 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+ 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+ 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+ 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+ 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+ 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+ 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+ 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+ 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+ 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+ 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+ 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+ 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+ 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+ 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+ 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+ 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+ 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+ 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+ 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+ 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+ 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+ 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+ 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+ 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+ 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+ 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+ 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+ 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+ 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+ 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+ 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+ 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+ 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+ 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+ 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+ 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+ 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+ 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+ 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+ 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+ 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+ 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+ 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+ 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+ 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+ 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+ 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+ 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+ 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+ 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+ 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+ 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+ 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+ 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+ 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+ 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+ 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+ 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+ 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+ 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+ 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+ 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+ 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+ 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+ 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+ 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+ 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+ 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+ 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+ 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+ 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+ 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+ 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+ 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+ 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+ 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+ 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+ 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+ 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+ 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+ 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+ 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+ 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+ 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+ 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+ 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+ 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+ 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+ 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+ 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+ 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+ 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+ 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+ 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+ 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+ 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+ 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+ 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+ 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+ 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+ 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+ 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+ 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+ 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+ 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+ 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+ 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+ 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+ 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+ 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+ 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+ 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+ 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+ 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+ 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+ 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+ 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+ 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+ 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+ 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+ 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+ 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+ 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+ 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+ 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+ 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+ 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+ 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+ 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+ 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+ 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+ 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+ 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+ 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+ 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+ 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+ 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+ 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+ 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+ 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+ 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+ 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+ 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+ 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+ 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+ 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+ 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+ 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+ 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+ 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+ 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+ 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+ 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+ 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+ 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+ 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+ 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+ 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+ 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+ 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+ 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+ 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+ 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+ 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+ 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+ 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+ 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+ 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+ 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+ 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+ 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+ 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+ 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+ 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+ 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+ 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+ 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+ 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+ 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+ 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+ 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+ 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+ 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+ 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+ 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+ 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+ 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+ 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+ 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+ 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+ 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+ 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+ 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+ 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+ 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+ 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+ 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+ 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+ 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+ 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+ 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+ 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+ 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+ 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+ 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+ 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+ 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+ 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+ 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+ 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+ 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+ 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+ 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+ 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+ 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+ 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+ 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+ 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+ 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+ 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+ 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+ 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+ 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+ 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+ 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+ 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+ 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+ 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+ 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+ 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+ 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+ 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+ 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+ 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+ 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+ 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+ 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+ 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+ 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+ 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+ 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+ 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+ 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+ 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+ 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+ 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+ 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+ 94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+ 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+ 94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+ 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+ 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+ 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+ 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+ 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+ 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+ 94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+ 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+ 94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+ 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+ 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+ 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+ 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+ 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+ 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+ 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+ 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+ 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+ 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+ 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+ 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+ 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+ 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+ 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+ 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+ 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+ 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+ 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+ 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+ 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+ 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+ 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+ 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+ 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+ 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+ 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+ 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+ 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+ 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+ 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+ 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+ 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+ 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+ 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+ 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+ 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+ 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+ 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+ 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+ 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+ 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+ 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+ 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+ 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+ 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+ 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+ 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+ 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+ 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+ 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+ 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+ 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+ 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+ 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+ 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+ 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+ 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+ 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+ 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+ 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+ 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+ 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+ 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+ 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+ 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+ 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+ 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+ 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+ 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+ 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+ 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+ 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+ 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+ 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+ 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+ 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+ 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+ 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+ 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+ 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+ 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+ 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+ 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+ 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+ 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+ 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+ 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+ 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+ 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+ 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+ 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+ 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+ 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+ 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+ 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+ 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+ 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+ 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+ 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+ 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+ 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+ 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+ 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+ 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+ 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+ 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+ 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+ 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+ 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+ 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+ 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+ 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+ 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+ 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+ 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+ 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+ 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+ 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+ 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+ 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+ 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+ 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+ 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+ 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+ 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+ 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+ 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+ 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+ 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+ 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+ 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+ 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+ 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+ 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+ 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+ 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+ 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+ 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+ 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+ 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+ 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+ 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+ 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+ 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+ 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+ 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+ 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+ 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+ 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+ 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+ 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+ 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+ 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+ 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+ 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+ 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+ 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+ 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+ 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+ 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+ 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+ 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+ 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+ 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+ 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+ 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+ 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+ 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+ 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+ 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+ 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+ 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+ 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+ 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+ 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+ 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+ 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+ 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+ 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+ 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+ 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+ 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+ 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+ 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+ 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+ 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+ 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+ 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+ 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+ 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+ 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+ 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+ 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+ 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+ 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+ 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+ 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+ 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+ 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+ 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+ 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+ 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+ 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+ 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+ 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+ 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+ 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+ 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+ 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+ 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+ 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+ 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+ 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+ 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+ 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+ 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+ 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+ 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+ 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+ 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+ 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+ 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+ 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+ 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+ 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+ 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+ 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+ 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+ 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+ 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+ 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+ 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+ 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+ 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+ 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+ 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+ 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+ 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+ 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+ 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+ 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+ 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+ 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+ 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+ 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+ 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+ 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+ 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+ 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+ 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+ 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+ 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+ 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+ 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+ 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+ 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+ 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+ 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+ 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+ 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+ 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+ 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+ 118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+ 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+ 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+ 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+ 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+ 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+ 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+ 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+ 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+ 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+ 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+ 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+ 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+ 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+ 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+ 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+ 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+ 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+ 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+ 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+ 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+ 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+ 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+ 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+ 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+ 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+ 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+ 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+ 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+ 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+ 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+ 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+ 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+ 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+ 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+ 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+ 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+ 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+ 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+ 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+ 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+ 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+ 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+ 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+ 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+ 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+ 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+ 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+ 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+ 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+ 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+ 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+ 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+ 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+ 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+ 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+ 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+ 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+ 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+ 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+ 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+ 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+ 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+ 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+ 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+ 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+ 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+ 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+ 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+ 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+ 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+ 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+ 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+ 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+ 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+ 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+ 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+ 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+ 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+ 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+ 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+ 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+ 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+ 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+ 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+ 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+ 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+ 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+ 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+ 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+ 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+ 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+ 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+ 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+ 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+ 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+ 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+ 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+ 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+ 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+ 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+ 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+ 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+ 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+ 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+ 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+ 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+ 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+ 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+ 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+ 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+ 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+ 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+ 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+ 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+ 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+ 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+ 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+ 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+ 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+ 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+ 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+ 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+ 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+ 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+ 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+ 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+ 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+ 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+ 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+ 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+ 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+ 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+ 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+ 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+ 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+ 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+ 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+ 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+ 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+ 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+ 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+ 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+ 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+ 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+ 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+ 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+ 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+ 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+ 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+ 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+ 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+ 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+ 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+ 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+ 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+ 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+ 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+ 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+ 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+ 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+ 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+ 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+ 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+ 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+ 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+ 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+ 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+ 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+ 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+ 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+ 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+ 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+ 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+ 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+ 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+ 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+ 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+ 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+ 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+ 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+ 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+ 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+ 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+ 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+ 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+ 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+ 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+ 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+ 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+ 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+ 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+ 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+ 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+ 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+ 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+ 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+ 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+ 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+ 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+ 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+ 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+ 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+ 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+ 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+ 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+ 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+ 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+ 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+ 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+ 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+ 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+ 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+ 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+ 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+ 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+ 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+ 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+ 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+ 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+ 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+ 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+ 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+ 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+ 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+ 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+ 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+ 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+ 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+ 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+ 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+ 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+ 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+ 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+ 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+ 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+ 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+ 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+ 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+ 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+ 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+ 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+ 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+ 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+ 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+ 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+ 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+ 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+ 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+ 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+ 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+ 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+ 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+ 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+ 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+ 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+ 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+ 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+ 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+ 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+ 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+ 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+ 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+ 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+ 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+ 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+ 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+ 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+ 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+ 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+ 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+ 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+ 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+ 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+ 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+ 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+ 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+ 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+ 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+ 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+ 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+ 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+ 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+ 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+ 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+ 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+ 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+ 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+ 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+ 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+ 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+ 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+ 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+ 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+ 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+ 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+ 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+ 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+ 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+ 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+ 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+ 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+ 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+ 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+ 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+ 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+ 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+ 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+ 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+ 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+ 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+ 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+ 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+ 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+ 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+ 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+ 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+ 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+ 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+ 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+ 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+ 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+ 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+ 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+ 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+ 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+ 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+ 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+ 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+ 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+ 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+ 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+ 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+ 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+ 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+ 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+ 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+ 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+ 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+ 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+ 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+ 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+ 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+ 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+ 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+ 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+ 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+ 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+ 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+ 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+ 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+ 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+ 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+ 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+ 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+ 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+ 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+ 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+ 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+ 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+ 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+ 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+ 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+ 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+ 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+ 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+ 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+ 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+ 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+ 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+ 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+ 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+ 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+ 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+ 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+ 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+ 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+ 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+ 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+ 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+ 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+ 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+ 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+ 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+ 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+ 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+ 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+ 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+ 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+ 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+ 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+ 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+ 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+ 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+ 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+ 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+ 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+ 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+ 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+ 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+ 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+ 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+ 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+ 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+ 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+ 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+ 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+ 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+ 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+ 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+ 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+ 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+ 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+ 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+ 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+ 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+ 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+ 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+ 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+ 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+ 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+ 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+ 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+ 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+ 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+ 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+ 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+ 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+ 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+ 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+ 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+ 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+ 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+ 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+ 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+ 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+ 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+ 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+ 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+ 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+ 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+ 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+ 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+ 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+ 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+ 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+ 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+ 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+ 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+ 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+ 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+ 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+ 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+ 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+ 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+ 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+ 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+ 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+ 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+ 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+ 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+ 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+ 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+ 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+ 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+ 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+ 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+ 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+ 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+ 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+ 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+ 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+ 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+ 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+ 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+ 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+ 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+ 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+ 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+ 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+ 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+ 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+ 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+ 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+ 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+ 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+ 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+ 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+ 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+ 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+ 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+ 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+ 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+ 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+ 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+ 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+ 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+ 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+ 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+ 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+ 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+ 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+ 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+ 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+ 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+ 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+ 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+ 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+ 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+ 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+ 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+ 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+ 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+ 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+ 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+ 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+ 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+ 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+ 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+ 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+ 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+ 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+ 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+ 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+ 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+ 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+ 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+ 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+ 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+ 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+ 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+ 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+ 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+ 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+ 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+ 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+ 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+ 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+ 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+ 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+ 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+ 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+ 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+ 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+ 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+ 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+ 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+ 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+ 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+ 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+ 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+ 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+ 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+ 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+ 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+ 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+ 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+ 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+ 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+ 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+ 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+ 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+ 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+ 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+ 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+ 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+ 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+ 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+ 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+ 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+ 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+ 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+ 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+ 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+ 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+ 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+ 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+ 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+ 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+ 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+ 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+ 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+ 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+ 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+ 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+ 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+ 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+ 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+ 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+ 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+ 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+ 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+ 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+ 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+ 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+ 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+ 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+ 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+ 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+ 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+ 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+ 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+ 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+ 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+ 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+ 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+ 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+ 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+ 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+ 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+ 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+ 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+ 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+ 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+ 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+ 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+ 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+ 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+ 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+ 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+ 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+ 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+ 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+ 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+ 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+ 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+ 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+ 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+ 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+ 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+ 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+ 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+ 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+ 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+ 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+ 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+ 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+ 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+ 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+ 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+ 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+ 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+ 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+ 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+ 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+ 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+ 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+ 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+ 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+ 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+ 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+ 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+ 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+ 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+ 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+ 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+ 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+ 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+ 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+ 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+ 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+ 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+ 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+ 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+ 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+ 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+ 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+ 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+ 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+ 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+ 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+ 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+ 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+ 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+ 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+ 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+ 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+ 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+ 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+ 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+ 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+ 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+ 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+ 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+ 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+ 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+ 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+ 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+ 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+ 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+ 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+ 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+ 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+ 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+ 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+ 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+ 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+ 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+ 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+ 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+ 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+ 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+ 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+ 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+ 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+ 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+ 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+ 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+ 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+ 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+ 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+ 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+ 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+ 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+ 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+ 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+ 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+ 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+ 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+ 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+ 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+ 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+ 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+ 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+ 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+ 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+ 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+ 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+ 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+ 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+ 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+ 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+ 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+ 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+ 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+ 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+ 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+ 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+ 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+ 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+ 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+ 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+ 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+ 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+ 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+ 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+ 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+ 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+ 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+ 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+ 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+ 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+ 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+ 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+ 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+ 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+ 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+ 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+ 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+ 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+ 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+ 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+ 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+ 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+ 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+ 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+ 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+ 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+ 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+ 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+ 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+ 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+ 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+ 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+ 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+ 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+ 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+ 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+ 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+ 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+ 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+ 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+ 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+ 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+ 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+ 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+ 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+ 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+ 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+ 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+ 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+ 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+ 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+ 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+ 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+ 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+ 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+ 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+ 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+ 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+ 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+ 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+ 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+ 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+ 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+ 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+ 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+ 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+ 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+ 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+ 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+ 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+ 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+ 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+ 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+ 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+ 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+ 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+ 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+ 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+ 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+ 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+ 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+ 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+ 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+ 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+ 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+ 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+ 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+ 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+ 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+ 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+ 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+ 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+ 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+ 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+ 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+ 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+ 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+ 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+ 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+ 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+ 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+ 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+ 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+ 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+ 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+ 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+ 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+ 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+ 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+ 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+ 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+ 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+ 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+ 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+ 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+ 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+ 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+ 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+ 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+ 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+ 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+ 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+ 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+ 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+ 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+ 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+ 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+ 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+ 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+ 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+ 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+ 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+ 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+ 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+ 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+ 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+ 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+ 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+ 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+ 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+ 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+ 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+ 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+ 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+ 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+ 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+ 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+ 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+ 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+ 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+ 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+ 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+ 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+ 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+ 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+ 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+ 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+ 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+ 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+ 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+ 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+ 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+ 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+ 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+ 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+ 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+ 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+ 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+ 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+ 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+ 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+ 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+ 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+ 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+ 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+ 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+ 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+ 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+ 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+ 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+ 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+ 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+ 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+ 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+ 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+ 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+ 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+ 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+ 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+ 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+ 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+ 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+ 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+ 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+ 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+ 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+ 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+ 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+ 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+ 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+ 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+ 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+ 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+ 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+ 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+ 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+ 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+ 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+ 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+ 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+ 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+ 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+ 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+ 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+ 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+ 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+ 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+ 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+ 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+ 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+ 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+ 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+ 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+ 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+ 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+ 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+ 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+ 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+ 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+ 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+ 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+ 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+ 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+ 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+ 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+ 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+ 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+ 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+ 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+ 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+ 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+ 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+ 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+ 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+ 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+ 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+ 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+ 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+ 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+ 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+ 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+ 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+ 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+ 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+ 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+ 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+ 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+ 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+ 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+ 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+ 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+ 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+ 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+ 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+ 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+ 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+ 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+ 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+ 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+ 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+ 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+ 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+ 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+ 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+ 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+ 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+ 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+ 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+ 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+ 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+ 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+ 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+ 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+ 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+ 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+ 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+ 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+ 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+ 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+ 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+ 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+ 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+ 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+ 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+ 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+ 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+ 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+ 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+ 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+ 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+ 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+ 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+ 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+ 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+ 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+ 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+ 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+ 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+ 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+ 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+ 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+ 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+ 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+ 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+ 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+ 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+ 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+ 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+ 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+ 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+ 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+ 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+ 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+ 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+ 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+ 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+ 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+ 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+ 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+ 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+ 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+ 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+ 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+ 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+ 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+ 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+ 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+ 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+ 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+ 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+ 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+ 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+ 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+ 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+ 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+ 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+ 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+ 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+ 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+ 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+ 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+ 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+ 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+ 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+ 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+ 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+ 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+ 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+ 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+ 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+ 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+ 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+ 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+ 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+ 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+ 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+ 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+ 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+ 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+ 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+ 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+ 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+ 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+ 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+ 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+ 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+ 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+ 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+ 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+ 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+ 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+ 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+ 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+ 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+ 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+ 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+ 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+ 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+ 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+ 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+ 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+ 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+ 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+ 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+ 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+ 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+ 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+ 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+ 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+ 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+ 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+ 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+ 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+ 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+ 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+ 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+ 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+ 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+ 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+ 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+ 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+ 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+ 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+ 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+ 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+ 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+ 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+ 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+ 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+ 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+ 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+ 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+ 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+ 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+ 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+ 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+ 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+ 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+ 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+ 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+ 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+ 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+ 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+ 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+ 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+ 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+ 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+ 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+ 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+ 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+ 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+ 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+ 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+ 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+ 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+ 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+ 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+ 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+ 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+ 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+ 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+ 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+ 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+ 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+ 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+ 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+ 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+ 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+ 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+ 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+ 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+ 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+ 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+ 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+ 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+ 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+ 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+ 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+ 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+ 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+ 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+ 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+ 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+ 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+ 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+ 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+ 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+ 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+ 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+ 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+ 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+ 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+ 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+ 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+ 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+ 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+ 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+ 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+ 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+ 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+ 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+ 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+ 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+ 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+ 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+ 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+ 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+ 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+ 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+ 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+ 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+ 73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+ 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+ 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+ 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+ 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+ 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+ 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+ 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+ 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+ 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+ 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+ 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+ 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+ 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+ 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+ 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+ 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+ 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+ 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+ 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+ 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+ 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+ 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+ 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+ 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+ 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+ 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+ 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+ 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+ 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+ 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+ 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+ 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+ 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+ 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+ 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+ 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+ 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+ 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+ 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+ 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+ 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+ 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+ 835584U, // <u,1,2,3>: Cost 0 copy LHS
+ 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+ 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+ 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+ 835584U, // <u,1,2,u>: Cost 0 copy LHS
+ 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+ 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+ 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+ 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+ 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+ 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+ 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+ 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+ 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+ 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+ 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+ 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+ 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+ 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+ 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+ 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+ 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+ 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+ 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+ 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+ 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+ 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+ 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+ 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+ 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+ 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+ 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+ 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+ 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+ 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+ 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+ 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+ 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+ 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 835584U, // <u,1,u,3>: Cost 0 copy LHS
+ 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+ 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+ 835584U, // <u,1,u,u>: Cost 0 copy LHS
+ 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+ 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+ 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+ 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+ 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+ 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+ 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+ 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+ 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+ 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+ 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+ 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+ 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+ 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+ 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+ 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+ 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+ 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+ 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+ 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+ 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+ 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+ 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+ 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+ 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+ 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+ 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+ 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+ 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+ 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+ 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+ 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+ 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+ 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+ 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+ 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+ 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+ 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+ 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+ 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+ 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+ 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+ 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+ 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+ 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+ 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+ 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+ 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+ 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+ 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+ 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+ 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+ 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+ 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+ 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+ 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+ 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+ 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+ 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+ 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+ 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+ 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+ 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+ 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+ 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+ 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+ 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+ 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+ 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+ 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+ 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+ 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+ 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+ 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+ 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+ 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+ 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+ 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+ 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+ 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+ 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+ 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+ 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+ 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+ 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+ 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+ 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+ 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+ 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+ 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+ 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+ 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+ 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+ 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+ 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+ 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+ 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+ 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+ 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+ 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+ 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+ 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+ 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+ 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+ 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+ 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+ 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+ 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+ 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+ 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+ 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+ 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+ 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+ 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+ 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+ 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+ 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+ 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+ 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+ 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+ 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+ 96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+ 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+ 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+ 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+ 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+ 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+ 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+ 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+ 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+ 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+ 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+ 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+ 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+ 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+ 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+ 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+ 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+ 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+ 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+ 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+ 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+ 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+ 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+ 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+ 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+ 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+ 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+ 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+ 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+ 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+ 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+ 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+ 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+ 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+ 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+ 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+ 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+ 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+ 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+ 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+ 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+ 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+ 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+ 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+ 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+ 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+ 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+ 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+ 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+ 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+ 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+ 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+ 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+ 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 27705344U, // <u,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,6,u>: Cost 0 copy RHS
+ 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+ 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+ 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+ 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+ 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+ 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+ 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+ 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+ 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+ 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+ 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 27705344U, // <u,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,u,u>: Cost 0 copy RHS
+ 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+ 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+ 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+ 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+ 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+ 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+ 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+ 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+ 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+ 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+ 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+ 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+ 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+ 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+ 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+ 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+ 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+ 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+ 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+ 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+ 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+ 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+ 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+ 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+ 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+ 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+ 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+ 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+ 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+ 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+ 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+ 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+ 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+ 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+ 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+ 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+ 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+ 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+ 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+ 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+ 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+ 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+ 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+ 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+ 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+ 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+ 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+ 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+ 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+ 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+ 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+ 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+ 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+ 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+ 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+ 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+ 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+ 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+ 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+ 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+ 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+ 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+ 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+ 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+ 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+ 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+ 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+ 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+ 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+ 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+ 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+ 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+ 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+ 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+ 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+ 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+ 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+ 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+ 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+ 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+ 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+ 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+ 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+ 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+ 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+ 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+ 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+ 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+ 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+ 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+ 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+ 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+ 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+ 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+ 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+ 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+ 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+ 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+ 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+ 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+ 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+ 835584U, // <u,u,2,3>: Cost 0 copy LHS
+ 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+ 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 835584U, // <u,u,2,u>: Cost 0 copy LHS
+ 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+ 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+ 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+ 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+ 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+ 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+ 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+ 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+ 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+ 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+ 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+ 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+ 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+ 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+ 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+ 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+ 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+ 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+ 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+ 27705344U, // <u,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,u,6,u>: Cost 0 copy RHS
+ 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+ 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+ 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+ 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+ 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+ 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+ 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+ 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+ 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+ 835584U, // <u,u,u,3>: Cost 0 copy LHS
+ 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+ 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+ 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+ 27705344U, // <u,u,u,7>: Cost 0 copy RHS
+ 835584U, // <u,u,u,u>: Cost 0 copy LHS
+ 0
+};
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
new file mode 100644
index 000000000000..8693f76d7c32
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -0,0 +1,566 @@
+//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64PromoteConstant pass which promotes constants
+// to global variables when this is likely to be more efficient. Currently only
+// types related to constant vector (i.e., constant vector, array of constant
+// vectors, constant structure with a constant vector field, etc.) are promoted
+// to global variables. Constant vectors are likely to be lowered in target
+// constant pool during instruction selection already; therefore, the access
+// will remain the same (memory load), but the structure types are not split
+// into different constant pool accesses for each field. A bonus side effect is
+// that created globals may be merged by the global merge pass.
+//
+// FIXME: This pass may be useful for other targets too.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-promote-const"
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden,
+ cl::desc("Promote all vector constants"));
+
+STATISTIC(NumPromoted, "Number of promoted constants");
+STATISTIC(NumPromotedUses, "Number of promoted constants uses");
+
+//===----------------------------------------------------------------------===//
+// AArch64PromoteConstant
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Promotes interesting constants into global variables.
+/// The motivating example is:
+/// static const uint16_t TableA[32] = {
+/// 41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768,
+/// 31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215,
+/// 25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846,
+/// 21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725,
+/// };
+///
+/// uint8x16x4_t LoadStatic(void) {
+/// uint8x16x4_t ret;
+/// ret.val[0] = vld1q_u16(TableA + 0);
+/// ret.val[1] = vld1q_u16(TableA + 8);
+/// ret.val[2] = vld1q_u16(TableA + 16);
+/// ret.val[3] = vld1q_u16(TableA + 24);
+/// return ret;
+/// }
+///
+/// The constants in this example are folded into the uses. Thus, 4 different
+/// constants are created.
+///
+/// As their type is vector, the cheapest way to create them is to load them
+/// from memory.
+///
+/// Therefore the final assembly has 4 different loads. With this pass
+/// enabled, only one load is issued for the constants.
+class AArch64PromoteConstant : public ModulePass {
+
+public:
+ struct PromotedConstant {
+ bool ShouldConvert = false;
+ GlobalVariable *GV = nullptr;
+ };
+ typedef SmallDenseMap<Constant *, PromotedConstant, 16> PromotionCacheTy;
+
+ struct UpdateRecord {
+ Constant *C;
+ Instruction *User;
+ unsigned Op;
+
+ UpdateRecord(Constant *C, Instruction *User, unsigned Op)
+ : C(C), User(User), Op(Op) {}
+ };
+
+ static char ID;
+ AArch64PromoteConstant() : ModulePass(ID) {
+ initializeAArch64PromoteConstantPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "AArch64 Promote Constant"; }
+
+ /// Iterate over the functions and promote the interesting constants into
+ /// global variables with module scope.
+ bool runOnModule(Module &M) override {
+ DEBUG(dbgs() << getPassName() << '\n');
+ if (skipModule(M))
+ return false;
+ bool Changed = false;
+ PromotionCacheTy PromotionCache;
+ for (auto &MF : M) {
+ Changed |= runOnFunction(MF, PromotionCache);
+ }
+ return Changed;
+ }
+
+private:
+ /// Look for interesting constants used within the given function.
+ /// Promote them into global variables, load these global variables within
+ /// the related function, so that the number of inserted load is minimal.
+ bool runOnFunction(Function &F, PromotionCacheTy &PromotionCache);
+
+ // This transformation requires dominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ /// Type to store a list of Uses.
+ typedef SmallVector<std::pair<Instruction *, unsigned>, 4> Uses;
+ /// Map an insertion point to all the uses it dominates.
+ typedef DenseMap<Instruction *, Uses> InsertionPoints;
+
+ /// Find the closest point that dominates the given Use.
+ Instruction *findInsertionPoint(Instruction &User, unsigned OpNo);
+
+ /// Check if the given insertion point is dominated by an existing
+ /// insertion point.
+ /// If true, the given use is added to the list of dominated uses for
+ /// the related existing point.
+ /// \param NewPt the insertion point to be checked
+ /// \param User the user of the constant
+ /// \param OpNo the operand number of the use
+ /// \param InsertPts existing insertion points
+ /// \pre NewPt and all instructions in InsertPts belong to the same function
+ /// \return true if one of the insertion points in InsertPts dominates NewPt,
+ /// false otherwise
+ bool isDominated(Instruction *NewPt, Instruction *User, unsigned OpNo,
+ InsertionPoints &InsertPts);
+
+ /// Check if the given insertion point can be merged with an existing
+ /// insertion point in a common dominator.
+ /// If true, the given use is added to the list of the created insertion
+ /// point.
+ /// \param NewPt the insertion point to be checked
+ /// \param User the user of the constant
+ /// \param OpNo the operand number of the use
+ /// \param InsertPts existing insertion points
+ /// \pre NewPt and all instructions in InsertPts belong to the same function
+ /// \pre isDominated returns false for the exact same parameters.
+ /// \return true if there exists an insertion point in InsertPts that could
+ /// have been merged with NewPt in a common dominator,
+ /// false otherwise
+ bool tryAndMerge(Instruction *NewPt, Instruction *User, unsigned OpNo,
+ InsertionPoints &InsertPts);
+
+ /// Compute the minimal set of insertion points that dominate all the
+ /// interesting uses of the value.
+ /// Insertion points are grouped per function and each insertion point
+ /// contains a list of all the uses it dominates within the related function.
+ /// \param User the user of the constant
+ /// \param OpNo the operand number of the constant
+ /// \param[out] InsertPts output storage of the analysis
+ void computeInsertionPoint(Instruction *User, unsigned OpNo,
+ InsertionPoints &InsertPts);
+
+ /// Insert a load of the global variable GV at each point contained in
+ /// InsertPts and update the related uses (also contained in
+ /// InsertPts).
+ void insertDefinitions(Function &F, GlobalVariable &GV,
+ InsertionPoints &InsertPts);
+
+ /// Do the constant promotion indicated by the Updates records, keeping track
+ /// of globals in PromotionCache.
+ void promoteConstants(Function &F, SmallVectorImpl<UpdateRecord> &Updates,
+ PromotionCacheTy &PromotionCache);
+
+ /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
+ /// Append Use to this list and delete the entry of IPI in InsertPts.
+ static void appendAndTransferDominatedUses(Instruction *NewPt,
+ Instruction *User, unsigned OpNo,
+ InsertionPoints::iterator &IPI,
+ InsertionPoints &InsertPts) {
+ // Record the dominated use.
+ IPI->second.emplace_back(User, OpNo);
+ // Transfer the dominated uses of IPI to NewPt
+ // Inserting into the DenseMap may invalidate existing iterator.
+ // Keep a copy of the key to find the iterator to erase. Keep a copy of the
+ // value so that we don't have to dereference IPI->second.
+ Instruction *OldInstr = IPI->first;
+ Uses OldUses = std::move(IPI->second);
+ InsertPts[NewPt] = std::move(OldUses);
+ // Erase IPI.
+ InsertPts.erase(OldInstr);
+ }
+};
+} // end anonymous namespace
+
+char AArch64PromoteConstant::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const",
+ "AArch64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const",
+ "AArch64 Promote Constant Pass", false, false)
+
+ModulePass *llvm::createAArch64PromoteConstantPass() {
+ return new AArch64PromoteConstant();
+}
+
+/// Check if the given type uses a vector type.
+static bool isConstantUsingVectorTy(const Type *CstTy) {
+ if (CstTy->isVectorTy())
+ return true;
+ if (CstTy->isStructTy()) {
+ for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
+ EltIdx < EndEltIdx; ++EltIdx)
+ if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
+ return true;
+ } else if (CstTy->isArrayTy())
+ return isConstantUsingVectorTy(CstTy->getArrayElementType());
+ return false;
+}
+
+/// Check if the given use (Instruction + OpIdx) of Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A use should be converted if it is legal to do so.
+/// For instance, it is not legal to turn the mask operand of a shuffle vector
+/// into a load of a global variable.
+static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
+ unsigned OpIdx) {
+ // shufflevector instruction expects a const for the mask argument, i.e., the
+ // third argument. Do not promote this use in that case.
+ if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2)
+ return false;
+
+ // extractvalue instruction expects a const idx.
+ if (isa<const ExtractValueInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // insertvalue instruction expects a const idx.
+ if (isa<const InsertValueInst>(Instr) && OpIdx > 1)
+ return false;
+
+ if (isa<const AllocaInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // Alignment argument must be constant.
+ if (isa<const LoadInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // Alignment argument must be constant.
+ if (isa<const StoreInst>(Instr) && OpIdx > 1)
+ return false;
+
+ // Index must be constant.
+ if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // Personality function and filters must be constant.
+ // Give up on that instruction.
+ if (isa<const LandingPadInst>(Instr))
+ return false;
+
+ // Switch instruction expects constants to compare to.
+ if (isa<const SwitchInst>(Instr))
+ return false;
+
+ // Expected address must be a constant.
+ if (isa<const IndirectBrInst>(Instr))
+ return false;
+
+ // Do not mess with intrinsics.
+ if (isa<const IntrinsicInst>(Instr))
+ return false;
+
+ // Do not mess with inline asm.
+ const CallInst *CI = dyn_cast<const CallInst>(Instr);
+ return !(CI && isa<const InlineAsm>(CI->getCalledValue()));
+}
+
+/// Check if the given Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A constant should be converted if it is likely that the materialization of
+/// the constant will be tricky. Thus, we give up on zero or undef values.
+///
+/// \todo Currently, accept only vector related types.
+/// Also we give up on all simple vector types to keep the existing
+/// behavior. Otherwise, we should push here all the checks of the lowering of
+/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging
+/// constants via global merge and the fact that the same constant is stored
+/// only once with this method (versus once per function that uses the constant
+/// with the regular approach, even for float).
+/// Again, the simplest solution would be to promote every
+/// constant and rematerialize them when they are actually cheap to create.
+static bool shouldConvertImpl(const Constant *Cst) {
+ if (isa<const UndefValue>(Cst))
+ return false;
+
+ // FIXME: In some cases, it may be interesting to promote in memory
+ // a zero initialized constant.
+ // E.g., when the type of Cst requires more instructions than the
+ // adrp/add/load sequence or when this sequence can be shared by several
+ // instances of Cst.
+ // Ideally, we could promote this into a global and rematerialize the constant
+ // when it was a bad idea.
+ if (Cst->isZeroValue())
+ return false;
+
+ if (Stress)
+ return true;
+
+ // FIXME: see function \todo
+ if (Cst->getType()->isVectorTy())
+ return false;
+ return isConstantUsingVectorTy(Cst->getType());
+}
+
+static bool
+shouldConvert(Constant &C,
+ AArch64PromoteConstant::PromotionCacheTy &PromotionCache) {
+ auto Converted = PromotionCache.insert(
+ std::make_pair(&C, AArch64PromoteConstant::PromotedConstant()));
+ if (Converted.second)
+ Converted.first->second.ShouldConvert = shouldConvertImpl(&C);
+ return Converted.first->second.ShouldConvert;
+}
+
+Instruction *AArch64PromoteConstant::findInsertionPoint(Instruction &User,
+ unsigned OpNo) {
+ // If this user is a phi, the insertion point is in the related
+ // incoming basic block.
+ if (PHINode *PhiInst = dyn_cast<PHINode>(&User))
+ return PhiInst->getIncomingBlock(OpNo)->getTerminator();
+
+ return &User;
+}
+
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User,
+ unsigned OpNo,
+ InsertionPoints &InsertPts) {
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+ *NewPt->getParent()->getParent()).getDomTree();
+
+ // Traverse all the existing insertion points and check if one is dominating
+ // NewPt. If it is, remember that.
+ for (auto &IPI : InsertPts) {
+ if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) ||
+ // When IPI.first is a terminator instruction, DT may think that
+ // the result is defined on the edge.
+ // Here we are testing the insertion point, not the definition.
+ (IPI.first->getParent() != NewPt->getParent() &&
+ DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
+ // No need to insert this point. Just record the dominated use.
+ DEBUG(dbgs() << "Insertion point dominated by:\n");
+ DEBUG(IPI.first->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ IPI.second.emplace_back(User, OpNo);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
+ unsigned OpNo,
+ InsertionPoints &InsertPts) {
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+ *NewPt->getParent()->getParent()).getDomTree();
+ BasicBlock *NewBB = NewPt->getParent();
+
+ // Traverse all the existing insertion points and check if one is dominated by
+ // NewPt and thus useless or can be combined with NewPt into a common
+ // dominator.
+ for (InsertionPoints::iterator IPI = InsertPts.begin(),
+ EndIPI = InsertPts.end();
+ IPI != EndIPI; ++IPI) {
+ BasicBlock *CurBB = IPI->first->getParent();
+ if (NewBB == CurBB) {
+ // Instructions are in the same block.
+ // By construction, NewPt is dominating the other.
+ // Indeed, isDominated returned false with the exact same arguments.
+ DEBUG(dbgs() << "Merge insertion point with:\n");
+ DEBUG(IPI->first->print(dbgs()));
+ DEBUG(dbgs() << "\nat considered insertion point.\n");
+ appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
+ return true;
+ }
+
+ // Look for a common dominator
+ BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
+ // If none exists, we cannot merge these two points.
+ if (!CommonDominator)
+ continue;
+
+ if (CommonDominator != NewBB) {
+ // By construction, the CommonDominator cannot be CurBB.
+ assert(CommonDominator != CurBB &&
+ "Instruction has not been rejected during isDominated check!");
+ // Take the last instruction of the CommonDominator as insertion point
+ NewPt = CommonDominator->getTerminator();
+ }
+ // else, CommonDominator is NewBB, hence NewPt is kept as the insertion
+ // point in that block.
+ DEBUG(dbgs() << "Merge insertion point with:\n");
+ DEBUG(IPI->first->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ DEBUG(NewPt->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
+ return true;
+ }
+ return false;
+}
+
+void AArch64PromoteConstant::computeInsertionPoint(
+ Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) {
+ DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n");
+ DEBUG(User->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ Instruction *InsertionPoint = findInsertionPoint(*User, OpNo);
+
+ DEBUG(dbgs() << "Considered insertion point:\n");
+ DEBUG(InsertionPoint->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ if (isDominated(InsertionPoint, User, OpNo, InsertPts))
+ return;
+ // This insertion point is useful, check if we can merge it with an existing
+ // point in a common dominator or if it dominates an existing one.
+ if (tryAndMerge(InsertionPoint, User, OpNo, InsertPts))
+ return;
+
+ DEBUG(dbgs() << "Keep considered insertion point\n");
+
+ // It is definitely useful on its own
+ InsertPts[InsertionPoint].emplace_back(User, OpNo);
+}
+
+static void ensurePromotedGV(Function &F, Constant &C,
+ AArch64PromoteConstant::PromotedConstant &PC) {
+ assert(PC.ShouldConvert &&
+ "Expected that we should convert this to a global");
+ if (PC.GV)
+ return;
+ PC.GV = new GlobalVariable(
+ *F.getParent(), C.getType(), true, GlobalValue::InternalLinkage, nullptr,
+ "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
+ PC.GV->setInitializer(&C);
+ DEBUG(dbgs() << "Global replacement: ");
+ DEBUG(PC.GV->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ ++NumPromoted;
+}
+
+void AArch64PromoteConstant::insertDefinitions(Function &F,
+ GlobalVariable &PromotedGV,
+ InsertionPoints &InsertPts) {
+#ifndef NDEBUG
+ // Do more checking for debug purposes.
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+#endif
+ assert(!InsertPts.empty() && "Empty uses does not need a definition");
+
+ for (const auto &IPI : InsertPts) {
+ // Create the load of the global variable.
+ IRBuilder<> Builder(IPI.first);
+ LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV);
+ DEBUG(dbgs() << "**********\n");
+ DEBUG(dbgs() << "New def: ");
+ DEBUG(LoadedCst->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ // Update the dominated uses.
+ for (auto Use : IPI.second) {
+#ifndef NDEBUG
+ assert(DT.dominates(LoadedCst,
+ findInsertionPoint(*Use.first, Use.second)) &&
+ "Inserted definition does not dominate all its uses!");
+#endif
+ DEBUG({
+ dbgs() << "Use to update " << Use.second << ":";
+ Use.first->print(dbgs());
+ dbgs() << '\n';
+ });
+ Use.first->setOperand(Use.second, LoadedCst);
+ ++NumPromotedUses;
+ }
+ }
+}
+
+void AArch64PromoteConstant::promoteConstants(
+ Function &F, SmallVectorImpl<UpdateRecord> &Updates,
+ PromotionCacheTy &PromotionCache) {
+ // Promote the constants.
+ for (auto U = Updates.begin(), E = Updates.end(); U != E;) {
+ DEBUG(dbgs() << "** Compute insertion points **\n");
+ auto First = U;
+ Constant *C = First->C;
+ InsertionPoints InsertPts;
+ do {
+ computeInsertionPoint(U->User, U->Op, InsertPts);
+ } while (++U != E && U->C == C);
+
+ auto &Promotion = PromotionCache[C];
+ ensurePromotedGV(F, *C, Promotion);
+ insertDefinitions(F, *Promotion.GV, InsertPts);
+ }
+}
+
+bool AArch64PromoteConstant::runOnFunction(Function &F,
+ PromotionCacheTy &PromotionCache) {
+ // Look for instructions using constant vectors. Promote each such constant
+ // to a global variable. Create as few loads of this variable as possible and
+ // update the uses accordingly.
+ SmallVector<UpdateRecord, 64> Updates;
+ for (Instruction &I : instructions(&F)) {
+ // Traverse the operands, looking for constant vectors. Replace them by a
+ // load of a global variable of constant vector type.
+ for (Use &U : I.operands()) {
+ Constant *Cst = dyn_cast<Constant>(U);
+ // There is no point in promoting global values as they are already
+ // global. Do not promote constant expressions either, as they may
+ // require some code expansion.
+ if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst))
+ continue;
+
+ // Check if this constant is worth promoting.
+ if (!shouldConvert(*Cst, PromotionCache))
+ continue;
+
+ // Check if this use should be promoted.
+ unsigned OpNo = &U - I.op_begin();
+ if (!shouldConvertUse(Cst, &I, OpNo))
+ continue;
+
+ Updates.emplace_back(Cst, &I, OpNo);
+ }
+ }
+
+ if (Updates.empty())
+ return false;
+
+ promoteConstants(F, Updates, PromotionCache);
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
new file mode 100644
index 000000000000..8f45e6a80a36
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -0,0 +1,179 @@
+//=- AArch64RedundantCopyElimination.cpp - Remove useless copy for AArch64 -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This pass removes unnecessary zero copies in BBs that are targets of
+// cbz/cbnz instructions. For instance, the copy instruction in the code below
+// can be removed because the CBZW jumps to BB#2 when W0 is zero.
+// BB#1:
+// CBZW %W0, <BB#2>
+// BB#2:
+// %W0 = COPY %WZR
+// This pass should be run after register allocation.
+//
+// FIXME: This should be extended to handle any constant other than zero. E.g.,
+// cmp w0, #1
+// b.eq .BB1
+// BB1:
+// mov w0, #1
+//
+// FIXME: This could also be extended to check the whole dominance subtree below
+// the comparison if the compile time regression is acceptable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-copyelim"
+
+STATISTIC(NumCopiesRemoved, "Number of copies removed.");
+
+namespace {
+class AArch64RedundantCopyElimination : public MachineFunctionPass {
+ const MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+
+public:
+ static char ID;
+ AArch64RedundantCopyElimination() : MachineFunctionPass(ID) {
+ initializeAArch64RedundantCopyEliminationPass(
+ *PassRegistry::getPassRegistry());
+ }
+ bool optimizeCopy(MachineBasicBlock *MBB);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+ StringRef getPassName() const override {
+ return "AArch64 Redundant Copy Elimination";
+ }
+};
+char AArch64RedundantCopyElimination::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
+ "AArch64 redundant copy elimination pass", false, false)
+
+static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) {
+ unsigned Opc = MI.getOpcode();
+ // Check if the current basic block is the target block to which the
+ // CBZ/CBNZ instruction jumps when its Wt/Xt is zero.
+ if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
+ MBB == MI.getOperand(1).getMBB())
+ return true;
+ else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
+ MBB != MI.getOperand(1).getMBB())
+ return true;
+
+ return false;
+}
+
+bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
+ // Check if the current basic block has a single predecessor.
+ if (MBB->pred_size() != 1)
+ return false;
+
+ MachineBasicBlock *PredMBB = *MBB->pred_begin();
+ MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr();
+ if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2)
+ return false;
+
+ ++CompBr;
+ do {
+ --CompBr;
+ if (guaranteesZeroRegInBlock(*CompBr, MBB))
+ break;
+ } while (CompBr != PredMBB->begin() && CompBr->isTerminator());
+
+ // We've not found a CBZ/CBNZ, time to bail out.
+ if (!guaranteesZeroRegInBlock(*CompBr, MBB))
+ return false;
+
+ unsigned TargetReg = CompBr->getOperand(0).getReg();
+ if (!TargetReg)
+ return false;
+ assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) &&
+ "Expect physical register");
+
+ // Remember all registers aliasing with TargetReg.
+ SmallSetVector<unsigned, 8> TargetRegs;
+ for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI)
+ TargetRegs.insert(*AI);
+
+ bool Changed = false;
+ MachineBasicBlock::iterator LastChange = MBB->begin();
+ unsigned SmallestDef = TargetReg;
+ // Remove redundant Copy instructions unless TargetReg is modified.
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+ MachineInstr *MI = &*I;
+ ++I;
+ if (MI->isCopy() && MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg()) {
+
+ unsigned DefReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+
+ if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) &&
+ !MRI->isReserved(DefReg) &&
+ (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) {
+ DEBUG(dbgs() << "Remove redundant Copy : ");
+ DEBUG((MI)->print(dbgs()));
+
+ MI->eraseFromParent();
+ Changed = true;
+ LastChange = I;
+ NumCopiesRemoved++;
+ SmallestDef =
+ TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef;
+ continue;
+ }
+ }
+
+ if (MI->modifiesRegister(TargetReg, TRI))
+ break;
+ }
+
+ if (!Changed)
+ return false;
+
+ // Otherwise, we have to fixup the use-def chain, starting with the
+ // CBZ/CBNZ. Conservatively mark as much as we can live.
+ CompBr->clearRegisterKills(SmallestDef, TRI);
+
+ if (none_of(TargetRegs, [&](unsigned Reg) { return MBB->isLiveIn(Reg); }))
+ MBB->addLiveIn(TargetReg);
+
+ // Clear any kills of TargetReg between CompBr and the last removed COPY.
+ for (MachineInstr &MMI : make_range(MBB->begin(), LastChange))
+ MMI.clearRegisterKills(SmallestDef, TRI);
+
+ return true;
+}
+
+bool AArch64RedundantCopyElimination::runOnMachineFunction(
+ MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF)
+ Changed |= optimizeCopy(&MBB);
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64RedundantCopyEliminationPass() {
+ return new AArch64RedundantCopyElimination();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
new file mode 100644
index 000000000000..a5fd2fbdde19
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -0,0 +1,577 @@
+//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64InstrInfo.h" // For XXXRegClassID.
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+// This file will be TableGen'ed at some point.
+#include "AArch64GenRegisterBankInfo.def"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
+ : RegisterBankInfo(AArch64::RegBanks, AArch64::NumRegisterBanks) {
+ static bool AlreadyInit = false; // Set once the shared tables are built.
+ // We have only one set of register banks, whatever the subtarget
+ // is. Therefore, the initialization of the RegBanks table should be
+ // done only once. Indeed the table of all register banks
+ // (AArch64::RegBanks) is unique in the compiler. At some point, it
+ // will get tablegen'ed and the whole constructor becomes empty.
+ if (AlreadyInit)
+ return;
+ AlreadyInit = true;
+ // Initialize the GPR bank.
+ createRegisterBank(AArch64::GPRRegBankID, "GPR");
+ // The GPR register bank is fully defined by all the registers in
+ // GPR64all + its subclasses.
+ addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI);
+ const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
+ (void)RBGPR;
+ assert(&AArch64::GPRRegBank == &RBGPR &&
+ "The order in RegBanks is messed up");
+ assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+
+ // Initialize the FPR bank.
+ createRegisterBank(AArch64::FPRRegBankID, "FPR");
+ // The FPR register bank is fully defined by all the registers in
+ // QQQQ + its subclasses.
+ addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI);
+ const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
+ (void)RBFPR;
+ assert(&AArch64::FPRRegBank == &RBFPR &&
+ "The order in RegBanks is messed up");
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.getSize() == 512 &&
+ "FPRs should hold up to 512-bit via QQQQ sequence");
+
+ // Initialize the CCR bank.
+ createRegisterBank(AArch64::CCRRegBankID, "CCR");
+ addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI);
+ const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID);
+ (void)RBCCR;
+ assert(&AArch64::CCRRegBank == &RBCCR &&
+ "The order in RegBanks is messed up");
+ assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
+ "Class not added?");
+ assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
+
+ // Check that the TableGen'ed-like file is in sync with our expectations.
+ // First, the ordering of the partial mapping indices.
+ assert(AArch64::PartialMappingIdx::PMI_GPR32 ==
+ AArch64::PartialMappingIdx::PMI_FirstGPR &&
+ "GPR32 index not first in the GPR list");
+ assert(AArch64::PartialMappingIdx::PMI_GPR64 ==
+ AArch64::PartialMappingIdx::PMI_LastGPR &&
+ "GPR64 index not last in the GPR list");
+ assert(AArch64::PartialMappingIdx::PMI_FirstGPR <=
+ AArch64::PartialMappingIdx::PMI_LastGPR &&
+ "GPR list is backward");
+ assert(AArch64::PartialMappingIdx::PMI_FPR32 ==
+ AArch64::PartialMappingIdx::PMI_FirstFPR &&
+ "FPR32 index not first in the FPR list");
+ assert(AArch64::PartialMappingIdx::PMI_FPR512 ==
+ AArch64::PartialMappingIdx::PMI_LastFPR &&
+ "FPR512 index not last in the FPR list");
+ assert(AArch64::PartialMappingIdx::PMI_FirstFPR <=
+ AArch64::PartialMappingIdx::PMI_LastFPR &&
+ "FPR list is backward");
+ assert(AArch64::PartialMappingIdx::PMI_FPR32 + 1 ==
+ AArch64::PartialMappingIdx::PMI_FPR64 &&
+ AArch64::PartialMappingIdx::PMI_FPR64 + 1 ==
+ AArch64::PartialMappingIdx::PMI_FPR128 &&
+ AArch64::PartialMappingIdx::PMI_FPR128 + 1 ==
+ AArch64::PartialMappingIdx::PMI_FPR256 &&
+ AArch64::PartialMappingIdx::PMI_FPR256 + 1 ==
+ AArch64::PartialMappingIdx::PMI_FPR512 &&
+ "FPR indices not properly ordered");
+// Now, the content.
+// Check partial mapping: entry Idx (rebased on PMI_Min) must describe bits
+// [ValStartIdx, ValStartIdx + ValLength) of register bank RB.
+#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \
+ do { \
+ const PartialMapping &Map = \
+ AArch64::PartMappings[AArch64::PartialMappingIdx::Idx - \
+ AArch64::PartialMappingIdx::PMI_Min]; \
+ (void)Map; \
+ assert(Map.StartIdx == ValStartIdx && Map.Length == ValLength && \
+ Map.RegBank == &RB && #Idx " is incorrectly initialized"); \
+ } while (0)
+
+ CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
+ CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
+ CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR);
+
+// Check value mapping: element Offset of getValueMapping(First<RBName>, Size)
+// must be a single break-down pointing at the matching partial mapping.
+#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \
+ do { \
+ unsigned PartialMapBaseIdx = \
+ AArch64::PartialMappingIdx::PMI_##RBName##Size - \
+ AArch64::PartialMappingIdx::PMI_Min; \
+ (void)PartialMapBaseIdx; \
+ const ValueMapping &Map = AArch64::getValueMapping( \
+ AArch64::PartialMappingIdx::PMI_First##RBName, Size)[Offset]; \
+ (void)Map; \
+ assert(Map.BreakDown == &AArch64::PartMappings[PartialMapBaseIdx] && \
+ Map.NumBreakDowns == 1 && #RBName #Size \
+ " " #Offset " is incorrectly initialized"); \
+ } while (0)
+
+#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0)
+
+ CHECK_VALUEMAP(GPR, 32);
+ CHECK_VALUEMAP(GPR, 64);
+ CHECK_VALUEMAP(FPR, 32);
+ CHECK_VALUEMAP(FPR, 64);
+ CHECK_VALUEMAP(FPR, 128);
+ CHECK_VALUEMAP(FPR, 256);
+ CHECK_VALUEMAP(FPR, 512);
+
+// Check the value mapping for 3-operand instructions where all the operands
+// map to the same value mapping.
+#define CHECK_VALUEMAP_3OPS(RBName, Size) \
+ do { \
+ CHECK_VALUEMAP_IMPL(RBName, Size, 0); \
+ CHECK_VALUEMAP_IMPL(RBName, Size, 1); \
+ CHECK_VALUEMAP_IMPL(RBName, Size, 2); \
+ } while (0)
+
+ CHECK_VALUEMAP_3OPS(GPR, 32);
+ CHECK_VALUEMAP_3OPS(GPR, 64);
+ CHECK_VALUEMAP_3OPS(FPR, 32);
+ CHECK_VALUEMAP_3OPS(FPR, 64);
+ CHECK_VALUEMAP_3OPS(FPR, 128);
+ CHECK_VALUEMAP_3OPS(FPR, 256);
+ CHECK_VALUEMAP_3OPS(FPR, 512);
+
+#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \
+ do { \
+ unsigned PartialMapDstIdx = \
+ AArch64::PMI_##RBNameDst##Size - AArch64::PMI_Min; \
+ unsigned PartialMapSrcIdx = \
+ AArch64::PMI_##RBNameSrc##Size - AArch64::PMI_Min; \
+ (void) PartialMapDstIdx; \
+ (void) PartialMapSrcIdx; \
+ const ValueMapping *Map = AArch64::getCopyMapping( \
+ AArch64::PMI_First##RBNameDst == AArch64::PMI_FirstGPR, \
+ AArch64::PMI_First##RBNameSrc == AArch64::PMI_FirstGPR, Size); \
+ (void) Map; \
+ assert(Map[0].BreakDown == &AArch64::PartMappings[PartialMapDstIdx] && \
+ Map[0].NumBreakDowns == 1 && #RBNameDst #Size \
+ " Dst is incorrectly initialized"); \
+ assert(Map[1].BreakDown == &AArch64::PartMappings[PartialMapSrcIdx] && \
+ Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \
+ " Src is incorrectly initialized"); \
+ \
+ } while (0)
+
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32); // Arguments: (Dst bank, Src bank, Size).
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64);
+
+ assert(verify(TRI) && "Invalid register bank information");
+}
+
+/// Returns the cost of a copy between register banks \p A and \p B for a
+/// value of \p Size bits. GPR/FPR cross-bank pairs get hard-coded costs;
+/// every other pair is delegated to RegisterBankInfo::copyCost().
+unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
+                                           const RegisterBank &B,
+                                           unsigned Size) const {
+  // Open question: what to do with different sizes?
+  // Copies are always same-size; other hooks will be introduced for the
+  // different-size cases:
+  //   * extract cost.
+  //   * build_sequence cost.
+
+  const RegisterBank *const GPRBank = &AArch64::GPRRegBank;
+  const RegisterBank *const FPRBank = &AArch64::FPRRegBank;
+
+  // Copy from (resp. to) GPR to (resp. from) FPR involves FMOV.
+  // FIXME: This should be deduced from the scheduling model.
+  if (&A == GPRBank) {
+    // FMOVXDr or FMOVWSr.
+    if (&B == FPRBank)
+      return 5;
+  } else if (&A == FPRBank) {
+    // FMOVDXr or FMOVSWr.
+    if (&B == GPRBank)
+      return 4;
+  }
+
+  return RegisterBankInfo::copyCost(A, B, Size);
+}
+
+/// Maps the register class \p RC to the register bank that holds it, based
+/// on the class ID. Classes that are not listed hit llvm_unreachable.
+const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass(
+    const TargetRegisterClass &RC) const {
+  const unsigned ClassID = RC.getID();
+  switch (ClassID) {
+  // Condition flags.
+  case AArch64::CCRRegClassID:
+    return getRegBank(AArch64::CCRRegBankID);
+
+  // General purpose classes, 32-bit then 64-bit, then the pair classes.
+  case AArch64::GPR32commonRegClassID:
+  case AArch64::GPR32RegClassID:
+  case AArch64::GPR32spRegClassID:
+  case AArch64::GPR32sponlyRegClassID:
+  case AArch64::GPR32allRegClassID:
+  case AArch64::GPR64commonRegClassID:
+  case AArch64::GPR64RegClassID:
+  case AArch64::GPR64spRegClassID:
+  case AArch64::GPR64sponlyRegClassID:
+  case AArch64::GPR64allRegClassID:
+  case AArch64::tcGPR64RegClassID:
+  case AArch64::WSeqPairsClassRegClassID:
+  case AArch64::XSeqPairsClassRegClassID:
+    return getRegBank(AArch64::GPRRegBankID);
+
+  // Scalar FP classes and the D/Q register tuples.
+  case AArch64::FPR8RegClassID:
+  case AArch64::FPR16RegClassID:
+  case AArch64::FPR32RegClassID:
+  case AArch64::FPR64RegClassID:
+  case AArch64::FPR128RegClassID:
+  case AArch64::FPR128_loRegClassID:
+  case AArch64::DDRegClassID:
+  case AArch64::DDDRegClassID:
+  case AArch64::DDDDRegClassID:
+  case AArch64::QQRegClassID:
+  case AArch64::QQQRegClassID:
+  case AArch64::QQQQRegClassID:
+    return getRegBank(AArch64::FPRRegBankID);
+
+  default:
+    llvm_unreachable("Register class not supported");
+  }
+}
+
+/// Returns the target-specific alternative register-bank mappings for \p MI.
+///
+/// Alternatives are provided for:
+///  - G_OR, 32/64-bit, exactly 3 operands: all-GPR (ID 1) or all-FPR (ID 2).
+///  - G_BITCAST, 32/64-bit, exactly 2 operands: GPR->GPR (ID 1), FPR->FPR
+///    (ID 2), GPR->FPR (ID 3) and FPR->GPR (ID 4).
+///  - G_LOAD, 64-bit, exactly 2 operands: result in GPR (ID 1) or FPR (ID 2),
+///    with the address always in a 64-bit GPR.
+/// Everything else, including the cases above when the size or operand count
+/// does not match, defers to RegisterBankInfo::getInstrAlternativeMappings().
+///
+/// The IDs must stay within the range asserted by applyMappingImpl().
+RegisterBankInfo::InstructionMappings
+AArch64RegisterBankInfo::getInstrAlternativeMappings(
+    const MachineInstr &MI) const {
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  switch (MI.getOpcode()) {
+  case TargetOpcode::G_OR: {
+    // 32 and 64-bit or can be mapped on either FPR or
+    // GPR for the same cost.
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    if (Size != 32 && Size != 64)
+      break;
+
+    // If the instruction has any implicit-defs or uses,
+    // do not mess with it.
+    if (MI.getNumOperands() != 3)
+      break;
+    InstructionMappings AltMappings;
+    InstructionMapping GPRMapping(
+        /*ID*/ 1, /*Cost*/ 1,
+        AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
+        /*NumOperands*/ 3);
+    InstructionMapping FPRMapping(
+        /*ID*/ 2, /*Cost*/ 1,
+        AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
+        /*NumOperands*/ 3);
+
+    AltMappings.emplace_back(std::move(GPRMapping));
+    AltMappings.emplace_back(std::move(FPRMapping));
+    return AltMappings;
+  }
+  case TargetOpcode::G_BITCAST: {
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    if (Size != 32 && Size != 64)
+      break;
+
+    // If the instruction has any implicit-defs or uses,
+    // do not mess with it.
+    if (MI.getNumOperands() != 2)
+      break;
+
+    InstructionMappings AltMappings;
+    InstructionMapping GPRMapping(
+        /*ID*/ 1, /*Cost*/ 1,
+        AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ true, Size),
+        /*NumOperands*/ 2);
+    InstructionMapping FPRMapping(
+        /*ID*/ 2, /*Cost*/ 1,
+        AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ false, Size),
+        /*NumOperands*/ 2);
+    // Cross-bank alternatives. Each direction gets its own ID, and the cost
+    // is queried as copyCost(DstBank, SrcBank, Size), the same argument
+    // order getInstrMapping() uses for G_BITCAST.
+    InstructionMapping GPRToFPRMapping(
+        /*ID*/ 3,
+        /*Cost*/ copyCost(AArch64::FPRRegBank, AArch64::GPRRegBank, Size),
+        AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ true, Size),
+        /*NumOperands*/ 2);
+    InstructionMapping FPRToGPRMapping(
+        /*ID*/ 4,
+        /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
+        AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ false, Size),
+        /*NumOperands*/ 2);
+
+    AltMappings.emplace_back(std::move(GPRMapping));
+    AltMappings.emplace_back(std::move(FPRMapping));
+    AltMappings.emplace_back(std::move(GPRToFPRMapping));
+    AltMappings.emplace_back(std::move(FPRToGPRMapping));
+    return AltMappings;
+  }
+  case TargetOpcode::G_LOAD: {
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    if (Size != 64)
+      break;
+
+    // If the instruction has any implicit-defs or uses,
+    // do not mess with it.
+    if (MI.getNumOperands() != 2)
+      break;
+
+    InstructionMappings AltMappings;
+    InstructionMapping GPRMapping(
+        /*ID*/ 1, /*Cost*/ 1,
+        getOperandsMapping(
+            {AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
+             // Addresses are GPR 64-bit.
+             AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+        /*NumOperands*/ 2);
+    InstructionMapping FPRMapping(
+        /*ID*/ 2, /*Cost*/ 1,
+        getOperandsMapping(
+            {AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
+             // Addresses are GPR 64-bit.
+             AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+        /*NumOperands*/ 2);
+
+    AltMappings.emplace_back(std::move(GPRMapping));
+    AltMappings.emplace_back(std::move(FPRMapping));
+    return AltMappings;
+  }
+  default:
+    break;
+  }
+  return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
+
+/// Applies a non-default mapping chosen for the instruction wrapped by
+/// \p OpdMapper. Only G_OR, G_BITCAST and G_LOAD are supported, and all of
+/// them are realized through applyDefaultMapping().
+void AArch64RegisterBankInfo::applyMappingImpl(
+    const OperandsMapper &OpdMapper) const {
+  const unsigned Opcode = OpdMapper.getMI().getOpcode();
+  const bool IsSupported = Opcode == TargetOpcode::G_OR ||
+                           Opcode == TargetOpcode::G_BITCAST ||
+                           Opcode == TargetOpcode::G_LOAD;
+  if (!IsSupported)
+    llvm_unreachable("Don't know how to handle that operation");
+
+  // Those ID must match getInstrAlternativeMappings.
+  assert((OpdMapper.getInstrMapping().getID() >= 1 &&
+          OpdMapper.getInstrMapping().getID() <= 4) &&
+         "Don't know how to handle that ID");
+  applyDefaultMapping(OpdMapper);
+}
+
+/// Tells whether \p Opc is one of the pre-isel generic floating-point opcodes
+/// whose operands are all floating-point values.
+static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  // Arithmetic.
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FSUB:
+  case TargetOpcode::G_FMUL:
+  case TargetOpcode::G_FDIV:
+  // Constants and FP-to-FP conversions.
+  case TargetOpcode::G_FCONSTANT:
+  case TargetOpcode::G_FPEXT:
+  case TargetOpcode::G_FPTRUNC:
+    return true;
+  }
+}
+
+/// Builds the mapping for an instruction with at most three operands that
+/// all live in the same register bank. The bank is FPR when operand 0 has a
+/// vector type or the opcode is a generic floating-point opcode, and GPR
+/// otherwise; the size is taken from operand 0.
+RegisterBankInfo::InstructionMapping
+AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
+  unsigned NumOperands = MI.getNumOperands();
+  assert(NumOperands <= 3 &&
+         "This code is for instructions with 3 or less operands");
+
+  const unsigned Opc = MI.getOpcode();
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  // Operand 0 decides both the size and the bank.
+  const LLT DefTy = MRI.getType(MI.getOperand(0).getReg());
+  const unsigned Size = DefTy.getSizeInBits();
+  const bool IsFPR =
+      DefTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
+
+#ifndef NDEBUG
+  // Debug-only sanity check that the remaining operands agree with operand 0
+  // on size class and bank. Arguably the machine verifier's job. Differing
+  // lane counts are not caught; checking for identical types would be the
+  // way to get that level of detail. For now only the resulting register
+  // bank is compared.
+  unsigned Idx = 1;
+  while (Idx != NumOperands) {
+    LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg());
+    assert(AArch64::getRegBankBaseIdxOffset(OpTy.getSizeInBits()) ==
+               AArch64::getRegBankBaseIdxOffset(Size) &&
+           "Operand has incompatible size");
+    bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
+    (void)OpIsFPR;
+    assert(IsFPR == OpIsFPR && "Operand has incompatible type");
+    ++Idx;
+  }
+#endif // End NDEBUG.
+
+  AArch64::PartialMappingIdx RBIdx = AArch64::PMI_FirstGPR;
+  if (IsFPR)
+    RBIdx = AArch64::PMI_FirstFPR;
+
+  return InstructionMapping{DefaultMappingID, 1,
+                            AArch64::getValueMapping(RBIdx, Size), NumOperands};
+}
+
+/// Computes the default register-bank mapping for \p MI.
+///
+/// The logic runs in stages:
+///  1. Non-generic instructions first try getInstrMappingImpl(); a valid
+///     result is returned as is.
+///  2. Integer/FP arithmetic, bitwise and shift opcodes whose operands all
+///     share one bank go through getSameKindOfOperandsMapping(). G_BITCAST is
+///     mapped as a copy, costed with copyCost(DstBank, SrcBank, Size).
+///     G_SEQUENCE is not supported and yields an invalid (default
+///     constructed) mapping.
+///  3. Everything else gets a per-operand guess (vectors and floating-point
+///     opcodes in FPR, the rest in GPR) that is then fine-tuned for opcodes
+///     mixing GPR and FPR operands.
+RegisterBankInfo::InstructionMapping
+AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Try the default logic for non-generic instructions that are either copies
+  // or already have some operands assigned to banks.
+  if (!isPreISelGenericOpcode(Opc)) {
+    RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI);
+    if (Mapping.isValid())
+      return Mapping;
+  }
+
+  switch (Opc) {
+  // G_{F|S|U}REM are not listed because they are not legal.
+  // Arithmetic ops.
+  case TargetOpcode::G_ADD:
+  case TargetOpcode::G_SUB:
+  case TargetOpcode::G_GEP:
+  case TargetOpcode::G_MUL:
+  case TargetOpcode::G_SDIV:
+  case TargetOpcode::G_UDIV:
+  // Bitwise ops.
+  case TargetOpcode::G_AND:
+  case TargetOpcode::G_OR:
+  case TargetOpcode::G_XOR:
+  // Shifts.
+  case TargetOpcode::G_SHL:
+  case TargetOpcode::G_LSHR:
+  case TargetOpcode::G_ASHR:
+  // Floating point ops.
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FSUB:
+  case TargetOpcode::G_FMUL:
+  case TargetOpcode::G_FDIV:
+    return getSameKindOfOperandsMapping(MI);
+  case TargetOpcode::G_BITCAST: {
+    // Scalars go in GPR, vectors in FPR; a bank mismatch between source and
+    // destination makes this a cross-bank copy, reflected in the cost.
+    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+    LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+    unsigned Size = DstTy.getSizeInBits();
+    bool DstIsGPR = !DstTy.isVector();
+    bool SrcIsGPR = !SrcTy.isVector();
+    const RegisterBank &DstRB =
+        DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
+    const RegisterBank &SrcRB =
+        SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
+    return InstructionMapping{DefaultMappingID, copyCost(DstRB, SrcRB, Size),
+                              AArch64::getCopyMapping(DstIsGPR, SrcIsGPR, Size),
+                              /*NumOperands*/ 2};
+  }
+  case TargetOpcode::G_SEQUENCE:
+    // FIXME: support this, but the generic code is really not going to do
+    // anything sane.
+    return InstructionMapping();
+  default:
+    break;
+  }
+
+  unsigned NumOperands = MI.getNumOperands();
+
+  // Track the size and bank of each register. We don't do partial mappings.
+  // Entries of non-register operands keep their value-initialized content
+  // and are never read when the final mapping is built.
+  SmallVector<unsigned, 4> OpSize(NumOperands);
+  SmallVector<AArch64::PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+  for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+    auto &MO = MI.getOperand(Idx);
+    if (!MO.isReg())
+      continue;
+
+    LLT Ty = MRI.getType(MO.getReg());
+    OpSize[Idx] = Ty.getSizeInBits();
+
+    // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs.
+    // For floating-point instructions, scalars go in FPRs.
+    if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc))
+      OpRegBankIdx[Idx] = AArch64::PMI_FirstFPR;
+    else
+      OpRegBankIdx[Idx] = AArch64::PMI_FirstGPR;
+  }
+
+  unsigned Cost = 1;
+  // Some of the floating-point instructions have mixed GPR and FPR operands:
+  // fine-tune the computed mapping.
+  // G_BITCAST needs no entry here: the switch above already returned a
+  // complete mapping for it, including the cross-bank copy cost.
+  switch (Opc) {
+  case TargetOpcode::G_SITOFP:
+  case TargetOpcode::G_UITOFP: {
+    OpRegBankIdx = {AArch64::PMI_FirstFPR, AArch64::PMI_FirstGPR};
+    break;
+  }
+  case TargetOpcode::G_FPTOSI:
+  case TargetOpcode::G_FPTOUI: {
+    OpRegBankIdx = {AArch64::PMI_FirstGPR, AArch64::PMI_FirstFPR};
+    break;
+  }
+  case TargetOpcode::G_FCMP: {
+    OpRegBankIdx = {AArch64::PMI_FirstGPR,
+                    /* Predicate */ AArch64::PMI_None, AArch64::PMI_FirstFPR,
+                    AArch64::PMI_FirstFPR};
+    break;
+  }
+  case TargetOpcode::G_LOAD: {
+    // Loading in vector unit is slightly more expensive.
+    // This is actually only true for the LD1R and co instructions,
+    // but anyway for the fast mode this number does not matter and
+    // for the greedy mode the cost of the cross bank copy will
+    // offset this number.
+    // FIXME: Should be derived from the scheduling model.
+    if (OpRegBankIdx[0] >= AArch64::PMI_FirstFPR)
+      Cost = 2;
+    break;
+  }
+  default:
+    break;
+  }
+
+  // Finally construct the computed mapping.
+  RegisterBankInfo::InstructionMapping Mapping =
+      InstructionMapping{DefaultMappingID, Cost, nullptr, NumOperands};
+  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+  for (unsigned Idx = 0; Idx < NumOperands; ++Idx)
+    if (MI.getOperand(Idx).isReg())
+      OpdsMapping[Idx] =
+          AArch64::getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+
+  Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+  return Mapping;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
new file mode 100644
index 000000000000..f763235049d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -0,0 +1,66 @@
+//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+namespace AArch64 {
+enum {
+ GPRRegBankID = 0, ///< General Purpose Registers: W, X.
+ FPRRegBankID = 1, ///< Floating Point/Vector Registers: B, H, S, D, Q.
+ CCRRegBankID = 2, ///< Conditional register: NZCV.
+ NumRegisterBanks ///< Number of register banks declared above.
+};
+
+extern RegisterBank GPRRegBank; // Bank object for GPRRegBankID.
+extern RegisterBank FPRRegBank; // Bank object for FPRRegBankID.
+extern RegisterBank CCRRegBank; // Bank object for CCRRegBankID.
+} // End AArch64 namespace.
+
+/// This class provides the information for the target register banks.
+class AArch64RegisterBankInfo final : public RegisterBankInfo {
+ /// Realizes a non-default mapping. See RegisterBankInfo::applyMapping.
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+ /// Get an instruction mapping where all the operands map to
+ /// the same register bank and have similar size.
+ ///
+ /// \pre MI.getNumOperands() <= 3
+ ///
+ /// \return An InstructionMapping with a statically allocated
+ /// OperandsMapping.
+ static InstructionMapping
+ getSameKindOfOperandsMapping(const MachineInstr &MI);
+
+public:
+ AArch64RegisterBankInfo(const TargetRegisterInfo &TRI); // Sets up the GPR, FPR and CCR banks.
+
+ unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
+ unsigned Size) const override; // Cost of a Size-bit copy between banks A and B.
+
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override; // Bank holding register class RC.
+
+ InstructionMappings
+ getInstrAlternativeMappings(const MachineInstr &MI) const override; // Alternative mappings for MI.
+
+ InstructionMapping getInstrMapping(const MachineInstr &MI) const override; // Default mapping for MI.
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
new file mode 100644
index 000000000000..98fad71aa18a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -0,0 +1,450 @@
+//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64RegisterInfo.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "AArch64GenRegisterInfo.inc"
+
+AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
+ : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {} // Hands AArch64::LR to the generated base class and keeps the target triple.
+
+/// Selects the callee-saved register list for \p MF, based on the function's
+/// calling convention and on whether swifterror is supported and used.
+const MCPhysReg *
+AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  const auto *Fn = MF->getFunction();
+  const auto CC = Fn->getCallingConv();
+
+  switch (CC) {
+  case CallingConv::GHC:
+    // GHC set of callee saved regs is empty as all those regs are
+    // used for passing STG regs around
+    return CSR_AArch64_NoRegs_SaveList;
+  case CallingConv::AnyReg:
+    return CSR_AArch64_AllRegs_SaveList;
+  case CallingConv::CXX_FAST_TLS:
+    if (MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
+      return CSR_AArch64_CXX_TLS_Darwin_PE_SaveList;
+    return CSR_AArch64_CXX_TLS_Darwin_SaveList;
+  default:
+    break;
+  }
+
+  // swifterror is only looked at once the conventions above are ruled out,
+  // and it takes precedence over PreserveMost.
+  const bool UsesSwiftError =
+      MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
+          ->supportSwiftError() &&
+      Fn->getAttributes().hasAttrSomewhere(Attribute::SwiftError);
+  if (UsesSwiftError)
+    return CSR_AArch64_AAPCS_SwiftError_SaveList;
+
+  return CC == CallingConv::PreserveMost ? CSR_AArch64_RT_MostRegs_SaveList
+                                         : CSR_AArch64_AAPCS_SaveList;
+}
+
+/// Returns the list of callee-saved registers preserved via copies, which
+/// only exists for CXX_FAST_TLS functions using split CSR; nullptr otherwise.
+const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
+    const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
+    return nullptr;
+  if (!MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
+    return nullptr;
+  return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList;
+}
+
+/// Selects the call-preserved register mask for a call with calling
+/// convention \p CC made from \p MF.
+const uint32_t *
+AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                          CallingConv::ID CC) const {
+  switch (CC) {
+  case CallingConv::GHC:
+    // This is academic because all GHC calls are (supposed to be) tail calls
+    return CSR_AArch64_NoRegs_RegMask;
+  case CallingConv::AnyReg:
+    return CSR_AArch64_AllRegs_RegMask;
+  case CallingConv::CXX_FAST_TLS:
+    return CSR_AArch64_CXX_TLS_Darwin_RegMask;
+  default:
+    break;
+  }
+
+  // swifterror is checked after the conventions above and before
+  // PreserveMost.
+  const bool UsesSwiftError =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
+          ->supportSwiftError() &&
+      MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError);
+  if (UsesSwiftError)
+    return CSR_AArch64_AAPCS_SwiftError_RegMask;
+
+  return CC == CallingConv::PreserveMost ? CSR_AArch64_RT_MostRegs_RegMask
+                                         : CSR_AArch64_AAPCS_RegMask;
+}
+
+/// Returns the preserved-register mask for TLS calls: the Darwin mask on
+/// Darwin targets, the ELF mask otherwise (ELF is asserted in that case).
+const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
+  if (!TT.isOSDarwin()) {
+    assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS");
+    return CSR_AArch64_TLS_ELF_RegMask;
+  }
+  return CSR_AArch64_TLS_Darwin_RegMask;
+}
+
+const uint32_t *
+AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ // The returned mask should match the one from getCallPreservedMask, except
+ // that it additionally preserves the register used for the first i64
+ // argument (which must also be the register used to return a single i64
+ // return value).
+ //
+ // If a calling convention does not use the same register for both, this
+ // function should return NULL (no such convention is handled here today).
+ assert(CC != CallingConv::GHC && "should not be GHC calling convention.");
+ return CSR_AArch64_AAPCS_ThisReturn_RegMask; // MF is not consulted; CC is only checked by the assert.
+}
+
+/// Builds the set of registers the allocator must not use in \p MF: the
+/// stack pointer and zero registers always, plus the frame pointer, X18 and
+/// the base pointer X19 when the respective conditions hold.
+BitVector
+AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  const AArch64FrameLowering *FrameLowering = getFrameLowering(MF);
+
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+
+  // Unconditionally reserved registers, marked in this order.
+  const unsigned AlwaysReserved[] = {AArch64::SP, AArch64::XZR, AArch64::WSP,
+                                     AArch64::WZR};
+  for (unsigned Reg : AlwaysReserved)
+    markSuperRegs(Reserved, Reg);
+
+  const bool ReserveFP = FrameLowering->hasFP(MF) || TT.isOSDarwin();
+  if (ReserveFP) {
+    markSuperRegs(Reserved, AArch64::FP);
+    markSuperRegs(Reserved, AArch64::W29);
+  }
+
+  // X18 is the platform register.
+  if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) {
+    markSuperRegs(Reserved, AArch64::X18);
+    markSuperRegs(Reserved, AArch64::W18);
+  }
+
+  if (hasBasePointer(MF)) {
+    markSuperRegs(Reserved, AArch64::X19);
+    markSuperRegs(Reserved, AArch64::W19);
+  }
+
+  assert(checkAllSuperRegsMarked(Reserved));
+  return Reserved;
+}
+
+/// Answers whether the single register \p Reg is reserved in \p MF, using
+/// the same conditions as getReservedRegs() for SP/WSP/XZR/WZR, X18/W18,
+/// FP/W29 and X19/W19. Any other register is reported as not reserved.
+bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
+                                        unsigned Reg) const {
+  const bool IsStackOrZeroReg = Reg == AArch64::SP || Reg == AArch64::XZR ||
+                                Reg == AArch64::WSP || Reg == AArch64::WZR;
+  if (IsStackOrZeroReg)
+    return true;
+
+  if (Reg == AArch64::X18 || Reg == AArch64::W18)
+    return MF.getSubtarget<AArch64Subtarget>().isX18Reserved();
+
+  if (Reg == AArch64::FP || Reg == AArch64::W29)
+    return getFrameLowering(MF)->hasFP(MF) || TT.isOSDarwin();
+
+  if (Reg == AArch64::W19 || Reg == AArch64::X19)
+    return hasBasePointer(MF);
+
+  return false;
+}
+
+/// Only the zero registers WZR and XZR are reported as constant.
+bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
+  switch (PhysReg) {
+  case AArch64::WZR:
+  case AArch64::XZR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+const TargetRegisterClass *
+AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ return &AArch64::GPR64RegClass; // Always GPR64; neither MF nor Kind is consulted.
+}
+
+/// Returns the register class to use for cross-class copies of \p RC: GPR64
+/// for the CCR class, and \p RC itself for every other class.
+const TargetRegisterClass *
+AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  // Only MSR & MRS copy NZCV.
+  const bool IsFlagsClass = RC == &AArch64::CCRRegClass;
+  return IsFlagsClass ? &AArch64::GPR64RegClass : RC;
+}
+
+unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } // X19: the register getReservedRegs() reserves when hasBasePointer(MF) holds.
+
+/// Decides whether \p MF needs a base pointer. That can only be the case
+/// when the frame holds variable sized objects; then a base pointer is used
+/// if the stack must be realigned or the local frame is at least 256 bytes.
+bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+  // With variable sized objects and a fixed stack size large enough that
+  // FP-relative references would often be out of range, a base pointer
+  // allows access from the other direction, like the SP normally works.
+  // Without variable sized objects none of that applies.
+  if (!FrameInfo.hasVarSizedObjects())
+    return false;
+
+  // When variable sized objects are present and the stack also needs dynamic
+  // realignment, the base pointer is the only reliable way to reference the
+  // locals.
+  if (needsStackRealignment(MF))
+    return true;
+
+  // Conservatively estimate whether the negative offset from the frame
+  // pointer will be sufficient to reach. A function with a smallish frame is
+  // less likely to have lots of spills and callee saved space, so everything
+  // is more likely to be within range of the frame pointer. If the estimate
+  // is wrong, the constant gets materialized and the object is still
+  // reached; it's just suboptimal. Negative offsets use the unscaled
+  // load/store instructions, which have a 9-bit signed immediate.
+  const bool LocalFrameIsLarge = FrameInfo.getLocalFrameSize() >= 256;
+  return LocalFrameIsLarge;
+}
+
+/// Returns FP when the function has a frame pointer, SP otherwise.
+unsigned
+AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  if (getFrameLowering(MF)->hasFP(MF))
+    return AArch64::FP;
+  return AArch64::SP;
+}
+
+bool AArch64RegisterInfo::requiresRegisterScavenging(
+ const MachineFunction &MF) const {
+ return true; // Unconditionally true; MF is not inspected.
+}
+
+bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
+ const MachineFunction &MF) const {
+ return true; // Unconditionally true; MF is not inspected.
+}
+
+/// Tells whether the emergency spill slot should be placed next to the FP:
+/// true only for functions with variable sized objects and no base pointer.
+bool
+AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  // AArch64FrameLowering::resolveFrameIndexReference() can always fall back
+  // to the stack pointer, so only put the emergency spill slot next to the
+  // FP when there's no better way to access it (SP or base pointer).
+  if (!MF.getFrameInfo().hasVarSizedObjects())
+    return false;
+  return !hasBasePointer(MF);
+}
+
+/// Frame index scavenging is requested unconditionally; \p MF is not
+/// consulted.
+bool AArch64RegisterInfo::requiresFrameIndexScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+/// Return true when the frame of \p MF must be kept.
+///
+/// That is the case when frame pointer elimination is disabled and the
+/// function adjusts the stack, when it has variable sized objects, or when
+/// its frame address is taken.
+bool
+AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+  const bool FPElimDisabled =
+      MF.getTarget().Options.DisableFramePointerElim(MF);
+  if (FPElimDisabled && FrameInfo.adjustsStack())
+    return true;
+
+  if (FrameInfo.hasVarSizedObjects())
+    return true;
+  return FrameInfo.isFrameAddressTaken();
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+///
+/// \param MI     Instruction carrying a frame index operand.
+/// \param Offset Offset of the referenced object, based on the SP value at
+///               function entry (so it is expected to be negative).
+/// \returns true when neither the estimated FP-relative nor the estimated
+///          SP-relative offset is legal for \p MI; false for instructions
+///          that neither load nor store.
+bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
+                                            int64_t Offset) const {
+  // Sanity check: the instruction must carry a frame index operand. The bound
+  // is asserted before every operand access (same shape as the scan in
+  // resolveFrameIndex) so that the diagnostic below fires instead of an
+  // out-of-range getOperand() call.
+  assert(MI->getNumOperands() != 0 &&
+         "Instr doesn't have FrameIndex operand!");
+  unsigned OpIdx = 0;
+  while (!MI->getOperand(OpIdx).isFI()) {
+    ++OpIdx;
+    assert(OpIdx < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  if (!MI->mayLoad() && !MI->mayStore())
+    return false;
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer.
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const AArch64FrameLowering *TFI = getFrameLowering(MF);
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all callee-saved registers get pushed:
+  // FP, LR, X19-X28, D8-D15. 64-bits each.
+  // NOTE(review): that list is 20 registers of 8 bytes (160 bytes), while the
+  // estimate below subtracts 16 * 20 = 320 bytes - confirm the extra slack is
+  // deliberate.
+  int64_t FPOffset = Offset - 16 * 20;
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relative to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += MFI.getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  //        and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there is a frame pointer, try using it.
+  // NOTE(review): only hasFP() is consulted here; dynamic stack realignment
+  // is not checked at this point.
+  if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, AArch64::FP, FPOffset))
+    return false;
+
+  // If we can reference via the stack pointer or base pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (isFrameOffsetLegal(MI, AArch64::SP, Offset))
+    return false;
+
+  // The offset likely isn't legal; we want to allocate a virtual base register.
+  return true;
+}
+
+/// Check whether \p Offset can be encoded directly in \p MI.
+///
+/// \param MI      Instruction whose frame offset is being queried; must not
+///                be null.
+/// \param BaseReg Not used by this implementation.
+/// \param Offset  Candidate offset; must fit in an int because the helper
+///                isAArch64FrameOffsetLegal() works on int offsets.
+/// \returns true when isAArch64FrameOffsetLegal() reports the
+///          AArch64FrameOffsetIsLegal flag for the offset.
+bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                             unsigned BaseReg,
+                                             int64_t Offset) const {
+  assert(MI && "Unable to get the legal offset for nil instruction.");
+  // Frame offsets are frequently negative, so guard both ends of the int
+  // range before narrowing.
+  assert(Offset <= INT_MAX && "Offset too big to fit in int.");
+  assert(Offset >= INT_MIN && "Offset too small to fit in int.");
+  // The helper takes the offset by reference, so hand it a scratch copy.
+  int SaveOffset = static_cast<int>(Offset);
+  return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
+}
+
+/// Emit, at the very start of \p MBB, an ADDXri that defines \p BaseReg as
+/// the address of frame index \p FrameIdx plus \p Offset.
+///
+/// \p BaseReg is first constrained to the register class required by the
+/// destination operand of ADDXri.
+void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                                       unsigned BaseReg,
+                                                       int FrameIdx,
+                                                       int64_t Offset) const {
+  const MachineFunction &MF = *MBB->getParent();
+  const AArch64InstrInfo *InstrInfo =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  const MCInstrDesc &AddDesc = InstrInfo->get(AArch64::ADDXri);
+
+  // Borrow the debug location of the first instruction, if the block has
+  // one; otherwise the location stays "unknown".
+  MachineBasicBlock::iterator InsertPt = MBB->begin();
+  DebugLoc Loc;
+  if (InsertPt != MBB->end())
+    Loc = InsertPt->getDebugLoc();
+
+  // Make sure the virtual register is usable as operand 0 of ADDXri.
+  MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
+  RegInfo.constrainRegClass(BaseReg,
+                            InstrInfo->getRegClass(AddDesc, 0, this, MF));
+
+  // BaseReg = ADDXri <FrameIdx>, Offset, LSL #0
+  const unsigned NoShift = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+  BuildMI(*MBB, InsertPt, Loc, AddDesc, BaseReg)
+      .addFrameIndex(FrameIdx)
+      .addImm(Offset)
+      .addImm(NoShift);
+}
+
+/// Rewrite the frame index operand of \p MI to use \p BaseReg plus
+/// \p Offset. Asserts if the rewrite cannot be completed.
+void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                            int64_t Offset) const {
+  // Find the frame index operand.
+  unsigned FIOperandNum = 0;
+  for (;;) {
+    if (MI.getOperand(FIOperandNum).isFI())
+      break;
+    ++FIOperandNum;
+    assert(FIOperandNum < MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  const MachineFunction *MF = MI.getParent()->getParent();
+  const AArch64InstrInfo *InstrInfo =
+      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  // ARM doesn't need the general 64-bit offsets
+  int NarrowOffset = Offset;
+  const bool Resolved =
+      rewriteAArch64FrameIndex(MI, FIOperandNum, BaseReg, NarrowOffset,
+                               InstrInfo);
+  assert(Resolved && "Unable to resolve frame index!");
+  (void)Resolved;
+}
+
+/// Replace the frame index operand \p FIOperandNum of the instruction at
+/// \p II with a concrete register (and, where possible, an immediate).
+///
+/// \p SPAdj must be 0. When \p RS is provided it is only consulted to assert
+/// that the emergency spill slot itself is reachable.
+void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                              int SPAdj, unsigned FIOperandNum,
+                                              RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64FrameLowering *FrameLowering = getFrameLowering(MF);
+  const AArch64InstrInfo *InstrInfo =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  const int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned FrameReg;
+
+  // dbg_value, stackmap and patchpoint instructions keep a (register,
+  // immediate) operand pair: resolve the reference preferring the FP and fold
+  // the whole offset into the immediate that follows the frame index.
+  const bool IsDebugOrStackMapLike =
+      MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
+      MI.getOpcode() == TargetOpcode::PATCHPOINT;
+  if (IsDebugOrStackMapLike) {
+    int FoldedOffset = FrameLowering->resolveFrameIndexReference(
+        MF, FrameIndex, FrameReg, /*PreferFP=*/true);
+    FoldedOffset += MI.getOperand(FIOperandNum + 1).getImm();
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(FoldedOffset);
+    return;
+  }
+
+  // Fold as much of the offset as possible into MI itself; whatever cannot
+  // be folded is left in Offset.
+  int Offset =
+      FrameLowering->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
+  const bool FullyFolded =
+      rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, InstrInfo);
+  if (FullyFolded)
+    return;
+
+  assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
+         "Emergency spill slot is out of reach");
+
+  // The immediate doesn't fit into the instruction. Compute
+  // FrameReg + remaining offset into a fresh virtual register and use that
+  // register (marked killed) in place of the frame index.
+  const unsigned ScratchReg =
+      MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+  emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset,
+                  InstrInfo);
+  MI.getOperand(FIOperandNum)
+      .ChangeToRegister(ScratchReg, /*isDef=*/false, /*isImp=*/false,
+                        /*isKill=*/true);
+}
+
+/// Return the register pressure limit for register class \p RC in \p MF.
+///
+/// GPR classes start from 32 registers and lose one each for XZR/SP, the
+/// frame pointer (when in use, or always on Darwin), a reserved X18 and the
+/// base pointer. FP/vector classes and their tuple classes report 32,
+/// FPR128_lo reports 16, and every other class reports 0.
+unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                                  MachineFunction &MF) const {
+  const AArch64FrameLowering *FrameLowering = getFrameLowering(MF);
+
+  switch (RC->getID()) {
+  case AArch64::GPR32RegClassID:
+  case AArch64::GPR32spRegClassID:
+  case AArch64::GPR32allRegClassID:
+  case AArch64::GPR64spRegClassID:
+  case AArch64::GPR64allRegClassID:
+  case AArch64::GPR64RegClassID:
+  case AArch64::GPR32commonRegClassID:
+  case AArch64::GPR64commonRegClassID: {
+    int Limit = 32;
+    Limit -= 1; // XZR/SP
+    if (FrameLowering->hasFP(MF) || TT.isOSDarwin())
+      Limit -= 1; // FP
+    if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved())
+      Limit -= 1; // X18 reserved as platform register
+    if (hasBasePointer(MF))
+      Limit -= 1; // X19
+    return Limit;
+  }
+
+  // Scalar FP registers and the D/Q register tuples.
+  case AArch64::FPR8RegClassID:
+  case AArch64::FPR16RegClassID:
+  case AArch64::FPR32RegClassID:
+  case AArch64::FPR64RegClassID:
+  case AArch64::FPR128RegClassID:
+  case AArch64::DDRegClassID:
+  case AArch64::DDDRegClassID:
+  case AArch64::DDDDRegClassID:
+  case AArch64::QQRegClassID:
+  case AArch64::QQQRegClassID:
+  case AArch64::QQQQRegClassID:
+    return 32;
+
+  case AArch64::FPR128_loRegClassID:
+    return 16;
+
+  default:
+    return 0;
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
new file mode 100644
index 000000000000..8ce893516fe2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -0,0 +1,106 @@
+//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
+
+#define GET_REGINFO_HEADER
+#include "AArch64GenRegisterInfo.inc"
+
+namespace llvm {
+
+class MachineFunction;
+class RegScavenger;
+class TargetRegisterClass;
+class Triple;
+/// AArch64 register information, layered on the TableGen-generated AArch64GenRegisterInfo.
+class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
+ const Triple &TT; // Target triple; getRegPressureLimit() uses it for its Darwin check.
+
+public:
+ AArch64RegisterInfo(const Triple &TT);
+
+ bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+
+ /// Code Generation virtual methods...
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *
+ getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+
+ unsigned getCSRFirstUseCost() const override {
+ // The cost will be compared against BlockFrequency where entry has the
+ // value of 1 << 14. A value of 5 will choose to spill or split a really
+ // cold path instead of using a callee-saved register.
+ return 5;
+ }
+
+ // Calls involved in thread-local variable lookup save more registers than
+ // normal calls, so they need a different mask to represent this.
+ const uint32_t *getTLSCallPreservedMask() const;
+
+ /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+ /// case that 'returned' is on an i64 first argument if the calling convention
+ /// is one that can (partially) model this attribute with a preserved mask
+ /// (i.e. it is a calling convention that uses the same register for the first
+ /// i64 argument and an i64 return value)
+ ///
+ /// Should return NULL in the case that the calling convention does not have
+ /// this property
+ const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool isConstantPhysReg(unsigned PhysReg) const override;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+ // Register scavenging hooks.
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+ bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+ // Frame index rewriting and virtual frame base register hooks.
+ bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+ bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ int64_t Offset) const override;
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
+ int FrameIdx,
+ int64_t Offset) const override;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
+ void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+ bool cannotEliminateFrame(const MachineFunction &MF) const;
+ // Virtual base register and base pointer queries.
+ bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+ bool hasBasePointer(const MachineFunction &MF) const;
+ unsigned getBaseRegister() const;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
+ // Liveness is always tracked after register allocation.
+ bool trackLivenessAfterRegAlloc(const MachineFunction&) const override {
+ return true;
+ }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
new file mode 100644
index 000000000000..7e29ee5e9baf
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -0,0 +1,635 @@
+//=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Base class of every AArch64 register: records the hardware encoding, the "AArch64" namespace and the sub-register list.
+class AArch64Reg<bits<16> enc, string n, list<Register> subregs = [],
+ list<string> altNames = []>
+ : Register<n, altNames> {
+ let HWEncoding = enc;
+ let Namespace = "AArch64";
+ let SubRegs = subregs;
+}
+
+let Namespace = "AArch64" in {
+ def sub_32 : SubRegIndex<32>; // W register inside an X register (see the X register definitions)
+ // Sub-register indices; the template argument is the size in bits.
+ def bsub : SubRegIndex<8>;
+ def hsub : SubRegIndex<16>;
+ def ssub : SubRegIndex<32>;
+ def dsub : SubRegIndex<32>; // NOTE(review): used as the D sub-register of Q registers; D registers are 64-bit - confirm 32 is intended
+ def sube32 : SubRegIndex<32>;
+ def subo32 : SubRegIndex<32>;
+ def qhisub : SubRegIndex<64>;
+ def qsub : SubRegIndex<64>;
+ def sube64 : SubRegIndex<64>;
+ def subo64 : SubRegIndex<64>;
+ // Note: Code depends on these having consecutive numbers (dsub0..dsub3)
+ def dsub0 : SubRegIndex<64>;
+ def dsub1 : SubRegIndex<64>;
+ def dsub2 : SubRegIndex<64>;
+ def dsub3 : SubRegIndex<64>;
+ // Note: Code depends on these having consecutive numbers (qsub0..qsub3)
+ def qsub0 : SubRegIndex<128>;
+ def qsub1 : SubRegIndex<128>;
+ def qsub2 : SubRegIndex<128>;
+ def qsub3 : SubRegIndex<128>;
+}
+
+let Namespace = "AArch64" in {
+ def vreg : RegAltNameIndex; // first altNames entry of the D/Q registers: the "vN" spelling
+ def vlist1 : RegAltNameIndex; // second altNames entry of the D/Q registers: an empty string in this file
+}
+
+//===----------------------------------------------------------------------===//
+// Registers: W0-W30, WSP and WZR (members of the i32 GPR classes below)
+//===----------------------------------------------------------------------===//
+def W0 : AArch64Reg<0, "w0" >, DwarfRegNum<[0]>;
+def W1 : AArch64Reg<1, "w1" >, DwarfRegNum<[1]>;
+def W2 : AArch64Reg<2, "w2" >, DwarfRegNum<[2]>;
+def W3 : AArch64Reg<3, "w3" >, DwarfRegNum<[3]>;
+def W4 : AArch64Reg<4, "w4" >, DwarfRegNum<[4]>;
+def W5 : AArch64Reg<5, "w5" >, DwarfRegNum<[5]>;
+def W6 : AArch64Reg<6, "w6" >, DwarfRegNum<[6]>;
+def W7 : AArch64Reg<7, "w7" >, DwarfRegNum<[7]>;
+def W8 : AArch64Reg<8, "w8" >, DwarfRegNum<[8]>;
+def W9 : AArch64Reg<9, "w9" >, DwarfRegNum<[9]>;
+def W10 : AArch64Reg<10, "w10">, DwarfRegNum<[10]>;
+def W11 : AArch64Reg<11, "w11">, DwarfRegNum<[11]>;
+def W12 : AArch64Reg<12, "w12">, DwarfRegNum<[12]>;
+def W13 : AArch64Reg<13, "w13">, DwarfRegNum<[13]>;
+def W14 : AArch64Reg<14, "w14">, DwarfRegNum<[14]>;
+def W15 : AArch64Reg<15, "w15">, DwarfRegNum<[15]>;
+def W16 : AArch64Reg<16, "w16">, DwarfRegNum<[16]>;
+def W17 : AArch64Reg<17, "w17">, DwarfRegNum<[17]>;
+def W18 : AArch64Reg<18, "w18">, DwarfRegNum<[18]>;
+def W19 : AArch64Reg<19, "w19">, DwarfRegNum<[19]>;
+def W20 : AArch64Reg<20, "w20">, DwarfRegNum<[20]>;
+def W21 : AArch64Reg<21, "w21">, DwarfRegNum<[21]>;
+def W22 : AArch64Reg<22, "w22">, DwarfRegNum<[22]>;
+def W23 : AArch64Reg<23, "w23">, DwarfRegNum<[23]>;
+def W24 : AArch64Reg<24, "w24">, DwarfRegNum<[24]>;
+def W25 : AArch64Reg<25, "w25">, DwarfRegNum<[25]>;
+def W26 : AArch64Reg<26, "w26">, DwarfRegNum<[26]>;
+def W27 : AArch64Reg<27, "w27">, DwarfRegNum<[27]>;
+def W28 : AArch64Reg<28, "w28">, DwarfRegNum<[28]>;
+def W29 : AArch64Reg<29, "w29">, DwarfRegNum<[29]>;
+def W30 : AArch64Reg<30, "w30">, DwarfRegNum<[30]>;
+def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>;
+def WZR : AArch64Reg<31, "wzr">, DwarfRegAlias<WSP>; // same encoding (31) as WSP; DWARF number aliased to WSP
+
+let SubRegIndices = [sub_32] in { // X registers: each contains the matching W register as sub_32 and aliases its DWARF number
+def X0 : AArch64Reg<0, "x0", [W0]>, DwarfRegAlias<W0>;
+def X1 : AArch64Reg<1, "x1", [W1]>, DwarfRegAlias<W1>;
+def X2 : AArch64Reg<2, "x2", [W2]>, DwarfRegAlias<W2>;
+def X3 : AArch64Reg<3, "x3", [W3]>, DwarfRegAlias<W3>;
+def X4 : AArch64Reg<4, "x4", [W4]>, DwarfRegAlias<W4>;
+def X5 : AArch64Reg<5, "x5", [W5]>, DwarfRegAlias<W5>;
+def X6 : AArch64Reg<6, "x6", [W6]>, DwarfRegAlias<W6>;
+def X7 : AArch64Reg<7, "x7", [W7]>, DwarfRegAlias<W7>;
+def X8 : AArch64Reg<8, "x8", [W8]>, DwarfRegAlias<W8>;
+def X9 : AArch64Reg<9, "x9", [W9]>, DwarfRegAlias<W9>;
+def X10 : AArch64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>;
+def X11 : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>;
+def X12 : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>;
+def X13 : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>;
+def X14 : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>;
+def X15 : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>;
+def X16 : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>;
+def X17 : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>;
+def X18 : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>;
+def X19 : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>;
+def X20 : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>;
+def X21 : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>;
+def X22 : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>;
+def X23 : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>;
+def X24 : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>;
+def X25 : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>;
+def X26 : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>;
+def X27 : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>;
+def X28 : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>;
+def FP : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias<W29>; // x29 under the name FP
+def LR : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias<W30>; // x30 under the name LR
+def SP : AArch64Reg<31, "sp", [WSP]>, DwarfRegAlias<WSP>;
+def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>; // same encoding (31) as SP; DWARF number aliased to WSP
+}
+
+// Condition code register. No sub-registers and no DWARF number are given here.
+def NZCV : AArch64Reg<0, "nzcv">;
+
+// GPR register classes with the intersections of GPR32/GPR32sp and
+// GPR64/GPR64sp for use by the coalescer. They contain neither SP/WSP nor XZR/WZR.
+def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> {
+ let AltOrders = [(rotl GPR32common, 8)]; // alternative order: the member list rotated left by 8
+ let AltOrderSelect = [{ return 1; }]; // always selects the alternative order above
+}
+def GPR64common : RegisterClass<"AArch64", [i64], 64,
+ (add (sequence "X%u", 0, 28), FP, LR)> {
+ let AltOrders = [(rotl GPR64common, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+// GPR register classes which exclude SP/WSP (they add WZR/XZR to the common set).
+def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> {
+ let AltOrders = [(rotl GPR32, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> {
+ let AltOrders = [(rotl GPR64, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+// GPR register classes which include SP/WSP (and therefore exclude WZR/XZR).
+def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> {
+ let AltOrders = [(rotl GPR32sp, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> {
+ let AltOrders = [(rotl GPR64sp, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+ // Single-member classes holding only the stack pointer.
+def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>;
+def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>;
+// Assembler operand class and register operand for GPR64sp, parsed by tryParseGPR64sp0Operand.
+def GPR64spPlus0Operand : AsmOperandClass {
+ let Name = "GPR64sp0";
+ let RenderMethod = "addRegOperands";
+ let ParserMethod = "tryParseGPR64sp0Operand";
+}
+
+def GPR64sp0 : RegisterOperand<GPR64sp> {
+ let ParserMatchClass = GPR64spPlus0Operand;
+}
+
+// GPR register classes which include WZR/XZR AND SP/WSP. These are not
+// constraints used by any instruction; they only serve as common super-classes.
+def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>;
+def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// This class holds the destination address of indirect tail calls: GPR64common minus X19-X28, FP and LR.
+def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21,
+ X22, X23, X24, X25, X26,
+ X27, X28, FP, LR)>;
+
+// GPR register operands for the post-increment amount of vector load/store
+// that have alternate printing when Rm=31 and print a constant immediate value
+// equal to the total number of bytes transferred (the printPostIncOperand<N> argument).
+
+// FIXME: TableGen *should* be able to do these itself now. There appears to be
+// a bug in counting how many operands a Post-indexed MCInst should have which
+// means the aliases don't trigger.
+def GPR64pi1 : RegisterOperand<GPR64, "printPostIncOperand<1>">;
+def GPR64pi2 : RegisterOperand<GPR64, "printPostIncOperand<2>">;
+def GPR64pi3 : RegisterOperand<GPR64, "printPostIncOperand<3>">;
+def GPR64pi4 : RegisterOperand<GPR64, "printPostIncOperand<4>">;
+def GPR64pi6 : RegisterOperand<GPR64, "printPostIncOperand<6>">;
+def GPR64pi8 : RegisterOperand<GPR64, "printPostIncOperand<8>">;
+def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand<12>">;
+def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand<16>">;
+def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand<24>">;
+def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand<32>">;
+def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
+def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;
+
+// Condition code register class; its only member is NZCV.
+def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+
+ // CCR is not allocatable.
+ let isAllocatable = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Scalar Registers (B0-B31 carry DWARF numbers 64-95; H/S/D/Q alias them)
+//===----------------------------------------------------------------------===//
+
+def B0 : AArch64Reg<0, "b0">, DwarfRegNum<[64]>;
+def B1 : AArch64Reg<1, "b1">, DwarfRegNum<[65]>;
+def B2 : AArch64Reg<2, "b2">, DwarfRegNum<[66]>;
+def B3 : AArch64Reg<3, "b3">, DwarfRegNum<[67]>;
+def B4 : AArch64Reg<4, "b4">, DwarfRegNum<[68]>;
+def B5 : AArch64Reg<5, "b5">, DwarfRegNum<[69]>;
+def B6 : AArch64Reg<6, "b6">, DwarfRegNum<[70]>;
+def B7 : AArch64Reg<7, "b7">, DwarfRegNum<[71]>;
+def B8 : AArch64Reg<8, "b8">, DwarfRegNum<[72]>;
+def B9 : AArch64Reg<9, "b9">, DwarfRegNum<[73]>;
+def B10 : AArch64Reg<10, "b10">, DwarfRegNum<[74]>;
+def B11 : AArch64Reg<11, "b11">, DwarfRegNum<[75]>;
+def B12 : AArch64Reg<12, "b12">, DwarfRegNum<[76]>;
+def B13 : AArch64Reg<13, "b13">, DwarfRegNum<[77]>;
+def B14 : AArch64Reg<14, "b14">, DwarfRegNum<[78]>;
+def B15 : AArch64Reg<15, "b15">, DwarfRegNum<[79]>;
+def B16 : AArch64Reg<16, "b16">, DwarfRegNum<[80]>;
+def B17 : AArch64Reg<17, "b17">, DwarfRegNum<[81]>;
+def B18 : AArch64Reg<18, "b18">, DwarfRegNum<[82]>;
+def B19 : AArch64Reg<19, "b19">, DwarfRegNum<[83]>;
+def B20 : AArch64Reg<20, "b20">, DwarfRegNum<[84]>;
+def B21 : AArch64Reg<21, "b21">, DwarfRegNum<[85]>;
+def B22 : AArch64Reg<22, "b22">, DwarfRegNum<[86]>;
+def B23 : AArch64Reg<23, "b23">, DwarfRegNum<[87]>;
+def B24 : AArch64Reg<24, "b24">, DwarfRegNum<[88]>;
+def B25 : AArch64Reg<25, "b25">, DwarfRegNum<[89]>;
+def B26 : AArch64Reg<26, "b26">, DwarfRegNum<[90]>;
+def B27 : AArch64Reg<27, "b27">, DwarfRegNum<[91]>;
+def B28 : AArch64Reg<28, "b28">, DwarfRegNum<[92]>;
+def B29 : AArch64Reg<29, "b29">, DwarfRegNum<[93]>;
+def B30 : AArch64Reg<30, "b30">, DwarfRegNum<[94]>;
+def B31 : AArch64Reg<31, "b31">, DwarfRegNum<[95]>;
+
+let SubRegIndices = [bsub] in { // H registers: each contains the matching B register as bsub
+def H0 : AArch64Reg<0, "h0", [B0]>, DwarfRegAlias<B0>;
+def H1 : AArch64Reg<1, "h1", [B1]>, DwarfRegAlias<B1>;
+def H2 : AArch64Reg<2, "h2", [B2]>, DwarfRegAlias<B2>;
+def H3 : AArch64Reg<3, "h3", [B3]>, DwarfRegAlias<B3>;
+def H4 : AArch64Reg<4, "h4", [B4]>, DwarfRegAlias<B4>;
+def H5 : AArch64Reg<5, "h5", [B5]>, DwarfRegAlias<B5>;
+def H6 : AArch64Reg<6, "h6", [B6]>, DwarfRegAlias<B6>;
+def H7 : AArch64Reg<7, "h7", [B7]>, DwarfRegAlias<B7>;
+def H8 : AArch64Reg<8, "h8", [B8]>, DwarfRegAlias<B8>;
+def H9 : AArch64Reg<9, "h9", [B9]>, DwarfRegAlias<B9>;
+def H10 : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>;
+def H11 : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>;
+def H12 : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>;
+def H13 : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>;
+def H14 : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>;
+def H15 : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>;
+def H16 : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>;
+def H17 : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>;
+def H18 : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>;
+def H19 : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>;
+def H20 : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>;
+def H21 : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>;
+def H22 : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>;
+def H23 : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>;
+def H24 : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>;
+def H25 : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>;
+def H26 : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>;
+def H27 : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>;
+def H28 : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>;
+def H29 : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>;
+def H30 : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>;
+def H31 : AArch64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [hsub] in { // S registers: each contains the matching H register as hsub
+def S0 : AArch64Reg<0, "s0", [H0]>, DwarfRegAlias<B0>;
+def S1 : AArch64Reg<1, "s1", [H1]>, DwarfRegAlias<B1>;
+def S2 : AArch64Reg<2, "s2", [H2]>, DwarfRegAlias<B2>;
+def S3 : AArch64Reg<3, "s3", [H3]>, DwarfRegAlias<B3>;
+def S4 : AArch64Reg<4, "s4", [H4]>, DwarfRegAlias<B4>;
+def S5 : AArch64Reg<5, "s5", [H5]>, DwarfRegAlias<B5>;
+def S6 : AArch64Reg<6, "s6", [H6]>, DwarfRegAlias<B6>;
+def S7 : AArch64Reg<7, "s7", [H7]>, DwarfRegAlias<B7>;
+def S8 : AArch64Reg<8, "s8", [H8]>, DwarfRegAlias<B8>;
+def S9 : AArch64Reg<9, "s9", [H9]>, DwarfRegAlias<B9>;
+def S10 : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>;
+def S11 : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>;
+def S12 : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>;
+def S13 : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>;
+def S14 : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>;
+def S15 : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>;
+def S16 : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>;
+def S17 : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>;
+def S18 : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>;
+def S19 : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>;
+def S20 : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>;
+def S21 : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>;
+def S22 : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>;
+def S23 : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>;
+def S24 : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>;
+def S25 : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>;
+def S26 : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>;
+def S27 : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>;
+def S28 : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>;
+def S29 : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>;
+def S30 : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>;
+def S31 : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { // D registers: contain S as ssub; altNames are "vN" (vreg) and "" (vlist1)
+def D0 : AArch64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>;
+def D1 : AArch64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>;
+def D2 : AArch64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>;
+def D3 : AArch64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>;
+def D4 : AArch64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>;
+def D5 : AArch64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>;
+def D6 : AArch64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>;
+def D7 : AArch64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>;
+def D8 : AArch64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>;
+def D9 : AArch64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>;
+def D10 : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>;
+def D11 : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>;
+def D12 : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>;
+def D13 : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>;
+def D14 : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>;
+def D15 : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>;
+def D16 : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>;
+def D17 : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias<B17>;
+def D18 : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>;
+def D19 : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>;
+def D20 : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>;
+def D21 : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>;
+def D22 : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>;
+def D23 : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>;
+def D24 : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>;
+def D25 : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>;
+def D26 : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>;
+def D27 : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>;
+def D28 : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>;
+def D29 : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>;
+def D30 : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>;
+def D31 : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { // Q registers: contain D as dsub; same altNames scheme as the D registers
+def Q0 : AArch64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>;
+def Q1 : AArch64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>;
+def Q2 : AArch64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>;
+def Q3 : AArch64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>;
+def Q4 : AArch64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>;
+def Q5 : AArch64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>;
+def Q6 : AArch64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>;
+def Q7 : AArch64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>;
+def Q8 : AArch64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>;
+def Q9 : AArch64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>;
+def Q10 : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>;
+def Q11 : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>;
+def Q12 : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>;
+def Q13 : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>;
+def Q14 : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>;
+def Q15 : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>;
+def Q16 : AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>;
+def Q17 : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>;
+def Q18 : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>;
+def Q19 : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>;
+def Q20 : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>;
+def Q21 : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>;
+def Q22 : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>;
+def Q23 : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>;
+def Q24 : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>;
+def Q25 : AArch64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>;
+def Q26 : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>;
+def Q27 : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>;
+def Q28 : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>;
+def Q29 : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>;
+def Q30 : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>;
+def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
+ let Size = 8;
+}
+def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
+ let Size = 16;
+}
+def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
+def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
+ v1i64, v4f16],
+ 64, (sequence "D%u", 0, 31)>;
+// The original note here said there is no f128 legal type (yet), but f128 is
+// in the type list below -- NOTE(review): confirm which is intended. 128-bit
+// vectors are normalized to v2f64 for arg passing and such.
+def FPR128 : RegisterClass<"AArch64",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
+ v8f16],
+ 128, (sequence "Q%u", 0, 31)>;
+
+// The lower 16 vector registers. Some instructions can only take registers
+// in this range.
+def FPR128_lo : RegisterClass<"AArch64",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16],
+ 128, (trunc FPR128, 16)>;
+
+// Pairs, triples, and quads of 64-bit vector registers.
+def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>;
+def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2],
+ [(rotl FPR64, 0), (rotl FPR64, 1),
+ (rotl FPR64, 2)]>;
+def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3],
+ [(rotl FPR64, 0), (rotl FPR64, 1),
+ (rotl FPR64, 2), (rotl FPR64, 3)]>;
+def DD : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> {
+ let Size = 128;
+}
+def DDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> {
+ let Size = 192;
+}
+def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> {
+ let Size = 256;
+}
+
+// Pairs, triples, and quads of 128-bit vector registers.
+def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2],
+ [(rotl FPR128, 0), (rotl FPR128, 1),
+ (rotl FPR128, 2)]>;
+def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3],
+ [(rotl FPR128, 0), (rotl FPR128, 1),
+ (rotl FPR128, 2), (rotl FPR128, 3)]>;
+def QQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> {
+ let Size = 256;
+}
+def QQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> {
+ let Size = 384;
+}
+def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> {
+ let Size = 512;
+}
+
+
+// Vector operand versions of the FP registers. Alternate name printing and
+// assembler matching.
+def VectorReg64AsmOperand : AsmOperandClass {
+ let Name = "VectorReg64";
+ let PredicateMethod = "isVectorReg";
+}
+def VectorReg128AsmOperand : AsmOperandClass {
+ let Name = "VectorReg128";
+ let PredicateMethod = "isVectorReg";
+}
+
+def V64 : RegisterOperand<FPR64, "printVRegOperand"> {
+ let ParserMatchClass = VectorReg64AsmOperand;
+}
+
+def V128 : RegisterOperand<FPR128, "printVRegOperand"> {
+ let ParserMatchClass = VectorReg128AsmOperand;
+}
+
+def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; }
+def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
+ let ParserMatchClass = VectorRegLoAsmOperand;
+}
+
+class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
+ : AsmOperandClass {
+ let Name = "TypedVectorList" # count # "_" # lanes # kind;
+
+ let PredicateMethod
+ = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
+ let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
+}
+
+class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
+ : RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
+ # kind # "'>">;
+
+multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
+ // With implicit types (probably on instruction instead). E.g. { v0, v1 }
+ def _64AsmOperand : AsmOperandClass {
+ let Name = NAME # "64";
+ let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+ let RenderMethod = "addVectorList64Operands<" # count # ">";
+ }
+
+ def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand");
+ }
+
+ def _128AsmOperand : AsmOperandClass {
+ let Name = NAME # "128";
+ let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+ let RenderMethod = "addVectorList128Operands<" # count # ">";
+ }
+
+ def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand");
+ }
+
+ // 64-bit register lists with explicit type.
+
+ // { v0.8b, v1.8b }
+ def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
+ def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
+ }
+
+ // { v0.4h, v1.4h }
+ def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
+ def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
+ }
+
+ // { v0.2s, v1.2s }
+ def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
+ def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
+ }
+
+ // { v0.1d, v1.1d }
+ def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
+ def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
+ }
+
+ // 128-bit register lists with explicit type
+
+ // { v0.16b, v1.16b }
+ def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
+ def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
+ }
+
+ // { v0.8h, v1.8h }
+ def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
+ def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
+ }
+
+ // { v0.4s, v1.4s }
+ def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
+ def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
+ }
+
+ // { v0.2d, v1.2d }
+ def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
+ def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
+ }
+
+ // { v0.b, v1.b }
+ def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
+ def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
+ }
+
+ // { v0.h, v1.h }
+ def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
+ def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
+ }
+
+ // { v0.s, v1.s }
+ def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
+ def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
+ }
+
+ // { v0.d, v1.d }
+ def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
+ def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
+ }
+
+
+}
+
+defm VecListOne : VectorList<1, FPR64, FPR128>;
+defm VecListTwo : VectorList<2, DD, QQ>;
+defm VecListThree : VectorList<3, DDD, QQQ>;
+defm VecListFour : VectorList<4, DDDD, QQQQ>;
+
+
+// Register operand versions of the scalar FP registers.
+def FPR16Op : RegisterOperand<FPR16, "printOperand">;
+def FPR32Op : RegisterOperand<FPR32, "printOperand">;
+def FPR64Op : RegisterOperand<FPR64, "printOperand">;
+def FPR128Op : RegisterOperand<FPR128, "printOperand">;
+
+
+//===----------------------------------------------------------------------===//
+// ARMv8.1a atomic CASP register operands
+
+
+def WSeqPairs : RegisterTuples<[sube32, subo32],
+ [(rotl GPR32, 0), (rotl GPR32, 1)]>;
+def XSeqPairs : RegisterTuples<[sube64, subo64],
+ [(rotl GPR64, 0), (rotl GPR64, 1)]>;
+
+def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32,
+ (add WSeqPairs)>{
+ let Size = 64;
+}
+def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64,
+ (add XSeqPairs)>{
+ let Size = 128;
+}
+
+
+let RenderMethod = "addRegOperands", ParserMethod="tryParseGPRSeqPair" in {
+ def WSeqPairsAsmOperandClass : AsmOperandClass { let Name = "WSeqPair"; }
+ def XSeqPairsAsmOperandClass : AsmOperandClass { let Name = "XSeqPair"; }
+}
+
+def WSeqPairClassOperand :
+ RegisterOperand<WSeqPairsClass, "printGPRSeqPairsClassOperand<32>"> {
+ let ParserMatchClass = WSeqPairsAsmOperandClass;
+}
+def XSeqPairClassOperand :
+ RegisterOperand<XSeqPairsClass, "printGPRSeqPairsClassOperand<64>"> {
+ let ParserMatchClass = XSeqPairsAsmOperandClass;
+}
+
+
+//===----- END: v8.1a atomic CASP register operands -----------------------===//
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
new file mode 100644
index 000000000000..93ca079275c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -0,0 +1,293 @@
+//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-A53 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
+def CortexA53Model : SchedMachineModel {
+ let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let LoadLatency = 3; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
+ // Specification - Instruction Timings"
+ // v 1.0 Spreadsheet
+ let CompleteModel = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
+// Cortex-A53 is in-order.
+
+def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
+def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch
+def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which both map the ProcResources and
+// set the latency.
+
+let SchedModel = CortexA53Model in {
+
+// ALU - Despite having a full latency of 4, most of the ALU instructions can
+// forward a cycle earlier and then two cycles earlier in the case of a
+// shift-only instruction. These latencies will be incorrect when the
+// result cannot be forwarded, but modeling isn't rocket surgery.
+def : WriteRes<WriteImm, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteISReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIEReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIS, [A53UnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [A53UnitALU]> { let Latency = 3; }
+
+// MAC
+def : WriteRes<WriteIM32, [A53UnitMAC]> { let Latency = 4; }
+def : WriteRes<WriteIM64, [A53UnitMAC]> { let Latency = 4; }
+
+// Div
+def : WriteRes<WriteID32, [A53UnitDiv]> { let Latency = 4; }
+def : WriteRes<WriteID64, [A53UnitDiv]> { let Latency = 4; }
+
+// Load
+def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd
+// below, choosing the median of 3 which makes the latency 6.
+// May model this more carefully in the future. The remaining
+// A53WriteVLD# types represent the 1-5 cycle issues explicitly.
+def : WriteRes<WriteVLD, [A53UnitLdSt]> { let Latency = 6;
+ let ResourceCycles = [3]; }
+def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+ let ResourceCycles = [2]; }
+def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+ let ResourceCycles = [3]; }
+def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7;
+ let ResourceCycles = [4]; }
+def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8;
+ let ResourceCycles = [5]; }
+
+// Pre/Post Indexing - Performed as part of address generation which is already
+// accounted for in the WriteST* latencies below
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTP, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTX, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes<WriteVST, [A53UnitLdSt]> { let Latency = 5;
+ let ResourceCycles = [2];}
+def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+ let ResourceCycles = [2]; }
+def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+ let ResourceCycles = [3]; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [A53UnitB]>;
+def : WriteRes<WriteBrReg, [A53UnitB]>;
+def : WriteRes<WriteSys, [A53UnitB]>;
+def : WriteRes<WriteBarrier, [A53UnitB]>;
+def : WriteRes<WriteHint, [A53UnitB]>;
+
+// FP ALU
+def : WriteRes<WriteF, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCmp, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCvt, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCopy, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteV, [A53UnitFPALU]> { let Latency = 6; }
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFMul, [A53UnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFDiv, [A53UnitFPMDS]> { let Latency = 33;
+ let ResourceCycles = [29]; }
+def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; }
+def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18;
+ let ResourceCycles = [14]; }
+def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
+ let ResourceCycles = [29]; }
+def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17;
+ let ResourceCycles = [13]; }
+def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
+ let ResourceCycles = [28]; }
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// No forwarding for these reads.
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+// operands are needed one cycle later if and only if they are to be
+// shifted. Otherwise, they too are needed two cycles later. This same
+// ReadAdvance applies to Extended registers as well, even though there is
+// a separate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def A53ReadISReg : SchedReadVariant<[
+ SchedVar<RegShiftedPred, [A53ReadShifted]>,
+ SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, A53ReadISReg>;
+
+def A53ReadIEReg : SchedReadVariant<[
+ SchedVar<RegExtendedPred, [A53ReadShifted]>,
+ SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, A53ReadIEReg>;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+// Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[WriteI], (instrs COPY)>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
new file mode 100644
index 000000000000..99c48d0146e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -0,0 +1,664 @@
+//=- AArch64SchedA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ARM Cortex-A57 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The Cortex-A57 is a traditional superscalar microprocessor with a
+// conservative 3-wide in-order stage for decode and dispatch. Combined with the
+// much wider out-of-order issue stage, this produced a need to carefully
+// schedule micro-ops so that all three decoded each cycle are successfully
+// issued as the reservation station(s) simply don't stay occupied for long.
+// Therefore, IssueWidth is set to the narrower of the two at three, while still
+// modeling the machine as out-of-order.
+
+def CortexA57Model : SchedMachineModel {
+ let IssueWidth = 3; // 3-way decode and dispatch
+ let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling. The magic number is chosen based on
+ // experiments and benchmarking data.
+ let LoopMicroOpBufferSize = 16;
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cortex-A57.
+// Cortex-A57 has 8 pipelines, each with its own 8-entry queue where
+// micro-ops wait for their operands and then issue out-of-order.
+
+def A57UnitB : ProcResource<1>; // Type B micro-ops
+def A57UnitI : ProcResource<2>; // Type I micro-ops
+def A57UnitM : ProcResource<1>; // Type M micro-ops
+def A57UnitL : ProcResource<1>; // Type L micro-ops
+def A57UnitS : ProcResource<1>; // Type S micro-ops
+def A57UnitX : ProcResource<1>; // Type X micro-ops
+def A57UnitW : ProcResource<1>; // Type W micro-ops
+let SchedModel = CortexA57Model in {
+ def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops
+}
+
+let SchedModel = CortexA57Model in {
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Cortex-A57.
+
+include "AArch64SchedA57WriteRes.td"
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Cortex-A57. The Cortex-A57 types are directly associated with resources, so
+// defining the aliases precludes the need for mapping them using WriteRes. The
+// aliases are sufficient for creating a coarse, working model. As the model
+// evolves, InstRWs will be used to override some of these SchedAliases.
+//
+// WARNING: Using SchedAliases is convenient and works well for latency and
+// resource lookup for instructions. However, this creates an entry in
+// AArch64WriteLatencyTable with a WriteResourceID of 0, breaking
+// any SchedReadAdvance since the lookup will fail.
+
+def : SchedAlias<WriteImm, A57Write_1cyc_1I>;
+def : SchedAlias<WriteI, A57Write_1cyc_1I>;
+def : SchedAlias<WriteISReg, A57Write_2cyc_1M>;
+def : SchedAlias<WriteIEReg, A57Write_2cyc_1M>;
+def : SchedAlias<WriteExtr, A57Write_1cyc_1I>;
+def : SchedAlias<WriteIS, A57Write_1cyc_1I>;
+def : SchedAlias<WriteID32, A57Write_19cyc_1M>;
+def : SchedAlias<WriteID64, A57Write_35cyc_1M>;
+def : WriteRes<WriteIM32, [A57UnitM]> { let Latency = 3; }
+def : WriteRes<WriteIM64, [A57UnitM]> { let Latency = 5; }
+def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
+def : SchedAlias<WriteBrReg, A57Write_1cyc_1B>;
+def : SchedAlias<WriteLD, A57Write_4cyc_1L>;
+def : SchedAlias<WriteST, A57Write_1cyc_1S>;
+def : SchedAlias<WriteSTP, A57Write_1cyc_1S>;
+def : SchedAlias<WriteAdr, A57Write_1cyc_1I>;
+def : SchedAlias<WriteLDIdx, A57Write_4cyc_1I_1L>;
+def : SchedAlias<WriteSTIdx, A57Write_1cyc_1I_1S>;
+def : SchedAlias<WriteF, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>;
+def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>;
+def : SchedAlias<WriteFImm, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFMul, A57Write_5cyc_1V>;
+def : SchedAlias<WriteFDiv, A57Write_17cyc_1W>;
+def : SchedAlias<WriteV, A57Write_3cyc_1V>;
+def : SchedAlias<WriteVLD, A57Write_5cyc_1L>;
+def : SchedAlias<WriteVST, A57Write_1cyc_1S>;
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+// Forwarding logic is only modeled for multiply and accumulate
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the above SchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1B_1I], (instrs BL)>;
+def : InstRW<[A57Write_2cyc_1B_1I], (instrs BLR)>;
+
+
+// Shifted Register with Shift == 0
+// ----------------------------------------------------------------------------
+
+def A57WriteISReg : SchedWriteVariant<[
+ SchedVar<RegShiftedPred, [WriteISReg]>,
+ SchedVar<NoSchedPred, [WriteI]>]>;
+def : InstRW<[A57WriteISReg], (instregex ".*rs$")>;
+
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+
+// Multiply high
+def : InstRW<[A57Write_6cyc_1M], (instrs SMULHrr, UMULHrr)>;
+
+
+// Miscellaneous Data-Processing Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1I], (instrs EXTRWrri)>;
+def : InstRW<[A57Write_3cyc_1I_1M], (instrs EXTRXrri)>;
+def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>;
+
+
+// Cryptography Extensions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^CRC32")>;
+
+
+// Vector Load: LD1-LD4 opcode families; each *_POST entry repeats its base entry plus WriteAdr
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1i(8|16|32)$")>; // LD1i: 8/16/32 forms share one entry
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1i(64)$")>; // LD1i64: L micro-op only
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s)$")>; // LD1Rv forms
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Rv(1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Rv(1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(8b|4h|2s|1d)$")>; // LD1 One/Two/Three/Four register forms: L micro-ops only
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_7cyc_3L], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_7cyc_3L, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2i(8|16)$")>; // LD2i forms
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD2i(32)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2i(32)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2i(64)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Rv(8b|4h|2s)$")>; // LD2Rv forms
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD2Rv(1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD2Rv(1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Twov(8b|4h|2s)$")>; // LD2Twov forms; 2d has its own L-only entry
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD2Twov(2d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2Twov(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3i(8|16)$")>; // LD3i forms
+def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3i(8|16)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3i(32)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3i(32)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD3i(64)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3Rv(8b|4h|2s)$")>; // LD3Rv forms
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD3Rv(1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3Rv(1d)_POST$")>;
+def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3Rv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD3Rv(2d)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD3Rv(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD3Threev(8b|4h|2s)$")>; // LD3Threev forms; 2d has its own L-only entry
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_10cyc_3L_4V], (instregex "LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[A57Write_10cyc_3L_4V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(8|16)$")>; // LD4i forms
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(8|16)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4i(32)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4i(32)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(64)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4Rv(8b|4h|2s)$")>; // LD4Rv forms
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD4Rv(1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD4Rv(1d)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4Rv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_4V], (instregex "LD4Rv(2d)$")>;
+def : InstRW<[A57Write_9cyc_2L_4V, WriteAdr], (instregex "LD4Rv(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD4Fourv(8b|4h|2s)$")>; // LD4Fourv forms; 2d has its own L-only entry
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_11cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_11cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+// Vector Store: ST1-ST4 opcode families; each *_POST entry repeats its base entry plus WriteAdr
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1S], (instregex "ST1i(8|16|32)$")>; // ST1i: 8/16/32 forms share one entry
+def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST1i(64)$")>; // ST1i64: adds a V micro-op
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST1i(64)_POST$")>;
+
+def : InstRW<[A57Write_1cyc_1S], (instregex "ST1Onev(8b|4h|2s|1d)$")>; // ST1 One/Two/Three/Four register forms: S micro-ops only
+def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_3cyc_3S], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST2i(8|16|32)$")>; // ST2i forms
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST2i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S], (instregex "ST2i(64)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST2i(64)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST2Twov(8b|4h|2s)$")>; // ST2Twov forms; 2d has its own S-only entry
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST2Twov(16b|8h|4s)$")>;
+def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S], (instregex "ST2Twov(2d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST2Twov(2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST3i(8|16)$")>; // ST3i forms
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST3i(8|16)_POST$")>;
+def : InstRW<[A57Write_3cyc_3S], (instregex "ST3i(32)$")>;
+def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST3i(32)_POST$")>;
+def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST3i(64)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST3i(64)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_3S_2V], (instregex "ST3Threev(8b|4h|2s)$")>; // ST3Threev forms; 2d has its own S-only entry
+def : InstRW<[A57Write_3cyc_3S_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S_4V], (instregex "ST3Threev(16b|8h|4s)$")>;
+def : InstRW<[A57Write_6cyc_6S_4V, WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST4i(8|16)$")>; // ST4i forms
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST4i(8|16)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S], (instregex "ST4i(32)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST4i(32)_POST$")>;
+def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST4i(64)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST4i(64)_POST$")>;
+
+def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST4Fourv(8b|4h|2s)$")>; // ST4Fourv forms; 2d has its own S-only entry
+def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S_4V], (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+// Vector - Integer
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group (opcode-name suffixes used by the regexes below)
+// D form - v8i8, v4i16, v2i32
+// Q form - v16i8, v8i16, v4i32
+// D form - v1i8, v1i16, v1i32, v1i64
+// Q form - v16i8, v8i16, v4i32, v2i64
+// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
+// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
+
+// ASIMD absolute diff accum, D-form (SABA/UABA on v8i8, v4i16, v2i32)
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form (SABA/UABA on v16i8, v8i16, v4i32)
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long (any opcode name starting with SABAL or UABAL)
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>;
+
+// ASIMD arith, reduce, 4H/4S: v4i16/v4i32 forms, matching the [SU](MIN|MAX)V split
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v4i16|v4i32)v$")>;
+// ASIMD arith, reduce, 8B/8H: v8i8/v8i16 forms (latency 7; one V + one X micro-op)
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU]?ADDL?V(v8i8|v8i16)v$")>;
+// ASIMD arith, reduce, 16B: v16i8 form only (latency 8; two X micro-ops)
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU]?ADDL?Vv16i8v$")>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+
+// ASIMD multiply, D-form (MUL, PMUL, SQDMULH, SQRDMULH; optional _indexed suffix)
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate, D-form (MLA/MLS; optional _indexed suffix)
+def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; } // latency 5 on unit W
+def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>; // read advanced by 4 cycles, only for values written by A57WriteIVMA
+def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>;
+
+// ASIMD multiply long (PMULL is split by suffix: v8i8/v16i8 vs v1i64/v2i64)
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; } // latency 4 on unit X
+def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>; // read advanced by 3 cycles, only for values written by A57WriteIVA
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>;
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[A57Write_4cyc_1X], (instregex "^SQSHLU")>;
+
+
+// ASIMD shift by register, basic, Q-form (SSHL/USHL)
+def : InstRW<[A57Write_4cyc_2X], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, complex, D-form (one or two of Q/R between [SU] and SHL)
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Vector - Floating Point
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group (opcode-name suffixes used by the regexes below)
+// D form - v2f32
+// Q form - v4f32, v2f64
+// D form - 32, 64
+// D form - v1i32, v1i64
+// D form - v2i32
+// Q form - v4i32, v2i64
+
+// ASIMD FP arith, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FABD|FADD|FSUB)(v2f32|32|64|v2i32p)")>;
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FABD|FADD|FSUB)(v4f32|v2f64|v2i64p)")>;
+
+// ASIMD FP arith, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FADDP(v2f32|32|64|v2i32)")>;
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^FADDP(v4f32|v2f64|v2i64)")>;
+
+// ASIMD FP compare, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v2f32|32|64|v1i32|v2i32|v1i64)")>;
+// ASIMD FP compare, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP convert, long and narrow (FCVTL, FCVTN, FCVTXN vector forms)
+def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[A57Write_17cyc_1W], (instregex "FDIVv2f32")>;
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[A57Write_34cyc_2W], (instregex "FDIVv4f32")>;
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[A57Write_64cyc_2W], (instregex "FDIVv2f64")>;
+
+// Note: These were simply duplicated from ASIMD FDIV because of missing documentation
+// ASIMD FP square root, D-form, F32
+def : InstRW<[A57Write_17cyc_1W], (instregex "FSQRTv2f32")>;
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[A57Write_34cyc_2W], (instregex "FSQRTv4f32")>;
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[A57Write_64cyc_2W], (instregex "FSQRTv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?(v2f32)")>;
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FMAX|FMIN)(NM)?(v4f32|v2f64)")>;
+// ASIMD FP max/min, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?P(v2f32|v2i32)")>;
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i64)")>;
+// ASIMD FP max/min, reduce
+def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>;
+
+// ASIMD FP multiply, D-form, FZ
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD FP multiply, Q-form, FZ
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; } // D-form: latency 9, one V unit
+def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; } // Q-form: latency 10, two V units
+def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>; // read advanced by 5 cycles for values written by either class above
+def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+
+// Vector - Miscellaneous
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group (opcode-name suffixes used by the regexes below)
+// D form - v8i8, v4i16, v2i32
+// Q form - v16i8, v8i16, v4i32
+// D form - v1i8, v1i16, v1i32, v1i64
+// Q form - v16i8, v8i16, v4i32, v2i64
+
+// ASIMD bitwise insert, Q-form (BIF, BIT, BSL on v16i8)
+def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>;
+
+// ASIMD duplicate, gen reg, D-form and Q-form (opcodes starting with CPY, and DUPv...gpr)
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>;
+
+// ASIMD move, saturating (SQXTN, UQXTN, SQXTUN name prefixes)
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]QXTU?N")>;
+
+// ASIMD reciprocal estimate, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f64|v4f32|v4i32)")>;
+
+// ASIMD reciprocal step, D-form, FZ
+def : InstRW<[A57Write_9cyc_1V], (instregex "^F(RECP|RSQRT)S(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD reciprocal step, Q-form, FZ
+def : InstRW<[A57Write_9cyc_2V], (instregex "^F(RECP|RSQRT)S(v2f64|v4f32|v4i32)")>;
+
+// ASIMD table lookup, D-form (TBL/TBX; One..Four is part of the opcode name)
+def : InstRW<[A57Write_3cyc_1V], (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[A57Write_9cyc_3V], (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[A57Write_12cyc_4V], (instregex "^TB[LX]v8i8Four")>;
+// ASIMD table lookup, Q-form
+def : InstRW<[A57Write_6cyc_3V], (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[A57Write_9cyc_5V], (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[A57Write_12cyc_7V], (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[A57Write_15cyc_9V], (instregex "^TB[LX]v16i8Four")>;
+
+// ASIMD transfer, element to gen reg (SMOVv*/UMOVv*)
+def : InstRW<[A57Write_6cyc_1I_1L], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element (INSv*)
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^INSv")>;
+
+// ASIMD unzip/zip, Q-form (UZP1/UZP2/ZIP1/ZIP2)
+def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Remainder: FP opcode regexes followed by explicit per-opcode LD*/ST* entries
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>;
+
+def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; } // latency 9 on unit V
+def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>; // read advanced by 5 cycles, only for values written by A57WriteFPMA
+def A57ReadFPM : SchedReadAdvance<0>; // zero-cycle advance, no writer restriction
+def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; // the last listed read gets the 5-cycle advance (presumably the addend operand - confirm)
+
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[SU]CVTF")>;
+
+def : InstRW<[A57Write_32cyc_1W], (instrs FDIVDrr)>;
+def : InstRW<[A57Write_17cyc_1W], (instrs FDIVSrr)>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(MAX|MIN).+rr")>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT.+r")>;
+
+def : InstRW<[A57Write_32cyc_1W], (instrs FSQRTDr)>; // same write classes as FDIVDrr / FDIVSrr above
+def : InstRW<[A57Write_17cyc_1W], (instrs FSQRTSr)>;
+
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPDi)>; // pair loads list WriteLDHi as a second write
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpre)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpost)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpre)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpost)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRBpost)>; // NOTE(review): LDR*post entries use WriteI where LDR*pre use WriteAdr - confirm intended
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRBui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRHpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRHui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRQpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURBi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURDi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURHi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURQi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURSi)>;
+
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPDi)>; // stores: pre/post entries list WriteAdr first, then the store write
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STNPQi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPXi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPDi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpre)>;
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STPQi)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_1I_4S], (instrs STPQpost)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_2I_4S], (instrs STPQpre)>; // NOTE(review): STPQpost uses ..._1I_4S - confirm 2I is intended here
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPXi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRBpre)>; // NOTE(review): no ReadAdrBase here, unlike STRBpost/STRBBpre - confirm
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRDpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRDpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroX)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STRQpre)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroW)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroX)>;
+def : InstRW<[A57Write_2cyc_1I_2S], (instrs STRQui)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STURQi)>;
+
+} // SchedModel = CortexA57Model
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
new file mode 100644
index 000000000000..55005e1d9ed1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -0,0 +1,544 @@
+//=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: A57Write
+// Latency: #cyc
+// MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//
+// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
+// 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Define Generic 1 micro-op types
+
+def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; }
+def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
+def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
+def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
+def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17;
+ let ResourceCycles = [17]; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
+ let ResourceCycles = [19]; }
+def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; }
+def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; }
+def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; }
+def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; }
+def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 32;
+ let ResourceCycles = [32]; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35;
+ let ResourceCycles = [35]; }
+def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
+def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; }
+def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
+def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; }
+def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; }
+def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; }
+def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 2 micro-op types
+
+def A57Write_64cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
+ let Latency = 64;
+ let NumMicroOps = 2;
+ let ResourceCycles = [32, 32];
+}
+def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV,
+ A57UnitX]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_34cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
+ let Latency = 34;
+ let NumMicroOps = 2;
+ let ResourceCycles = [17, 17];
+}
+def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 3 micro-op types
+
+def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI,
+ A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1M_2S : SchedWriteRes<[A57UnitM,
+ A57UnitS, A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_3S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_2S_1V : SchedWriteRes<[A57UnitS, A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_5cyc_1I_2L : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+def A57Write_6cyc_1I_2L : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+def A57Write_6cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+def A57Write_7cyc_3L : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def A57Write_8cyc_1I_1L_1V : SchedWriteRes<[A57UnitI,
+ A57UnitL,
+ A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def A57Write_8cyc_1L_2V : SchedWriteRes<[A57UnitL,
+ A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def A57Write_8cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def A57Write_9cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 4 micro-op types
+
+def A57Write_2cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI,
+ A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+def A57Write_3cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI,
+ A57UnitS, A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def A57Write_3cyc_1I_3S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def A57Write_3cyc_1I_2S_1V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def A57Write_4cyc_4S : SchedWriteRes<[A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def A57Write_7cyc_1I_3L : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL, A57UnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+}
+def A57Write_5cyc_2I_2L : SchedWriteRes<[A57UnitI, A57UnitI,
+ A57UnitL, A57UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+def A57Write_8cyc_1I_1L_2V : SchedWriteRes<[A57UnitI,
+ A57UnitL,
+ A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+def A57Write_8cyc_4L : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitL, A57UnitL]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+def A57Write_9cyc_2L_2V : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+}
+def A57Write_9cyc_1L_3V : SchedWriteRes<[A57UnitL,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+}
+def A57Write_12cyc_4V : SchedWriteRes<[A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 5 micro-op types
+
+def A57Write_3cyc_3S_2V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 5;
+}
+def A57Write_8cyc_1I_4L : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitL, A57UnitL]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+def A57Write_4cyc_1I_4S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def A57Write_9cyc_1I_2L_2V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+def A57Write_9cyc_1I_1L_3V : SchedWriteRes<[A57UnitI,
+ A57UnitL,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+def A57Write_9cyc_2L_3V : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+def A57Write_9cyc_5V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 6 micro-op types
+
+def A57Write_3cyc_1I_3S_2V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 6;
+}
+def A57Write_4cyc_2I_4S : SchedWriteRes<[A57UnitI, A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+def A57Write_4cyc_4S_2V : SchedWriteRes<[A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+def A57Write_6cyc_6S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+}
+def A57Write_9cyc_1I_2L_3V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+def A57Write_9cyc_1I_1L_4V : SchedWriteRes<[A57UnitI,
+ A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+def A57Write_9cyc_2L_4V : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 7 micro-op types
+
+def A57Write_10cyc_3L_4V : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 7;
+}
+def A57Write_4cyc_1I_4S_2V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 7;
+}
+def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS]> {
+ let Latency = 6;
+ let NumMicroOps = 7;
+}
+def A57Write_9cyc_1I_2L_4V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 7;
+}
+def A57Write_12cyc_7V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 12;
+ let NumMicroOps = 7;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 8 micro-op types
+
+def A57Write_10cyc_1I_3L_4V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 8;
+}
+def A57Write_11cyc_4L_4V : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 11;
+ let NumMicroOps = 8;
+}
+def A57Write_8cyc_8S : SchedWriteRes<[A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 9 micro-op types
+
+def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 8;
+ let NumMicroOps = 9;
+}
+def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+}
+def A57Write_15cyc_9V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 15;
+ let NumMicroOps = 9;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 10 micro-op types
+
+def A57Write_6cyc_6S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 10;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 11 micro-op types
+
+def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 11;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 12 micro-op types
+
+def A57Write_8cyc_8S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 12;
+}
+
+//===----------------------------------------------------------------------===//
+// Define Generic 13 micro-op types
+
+def A57Write_8cyc_1I_8S_4V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 13;
+}
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
new file mode 100644
index 000000000000..9fd3ae6818e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -0,0 +1,869 @@
+//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AArch64 Cyclone to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def CycloneModel : SchedMachineModel {
+ let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 16; // 14-19 cycles are typical.
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cyclone.
+
+// 4 integer pipes
+def CyUnitI : ProcResource<4> {
+ let BufferSize = 48;
+}
+
+// 2 branch units: I[0..1]
+def CyUnitB : ProcResource<2> {
+ let Super = CyUnitI;
+ let BufferSize = 24;
+}
+
+// 1 indirect-branch unit: I[0]
+def CyUnitBR : ProcResource<1> {
+ let Super = CyUnitB;
+}
+
+// 2 shifter pipes: I[2..3]
+// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
+def CyUnitIS : ProcResource<2> {
+ let Super = CyUnitI;
+ let BufferSize = 24;
+}
+
+// 1 mul pipe: I[0]
+def CyUnitIM : ProcResource<1> {
+ let Super = CyUnitBR;
+ let BufferSize = 32;
+}
+
+// 1 div pipe: I[1]
+def CyUnitID : ProcResource<1> {
+ let Super = CyUnitB;
+ let BufferSize = 16;
+}
+
+// 1 integer division unit. This is driven by the ID pipe, but only
+// consumes the pipe for one cycle at issue and another cycle at writeback.
+def CyUnitIntDiv : ProcResource<1>;
+
+// 2 ld/st pipes.
+def CyUnitLS : ProcResource<2> {
+ let BufferSize = 28;
+}
+
+// 3 fp/vector pipes.
+def CyUnitV : ProcResource<3> {
+ let BufferSize = 48;
+}
+// 2 fp/vector arithmetic and multiply pipes: V[0-1]
+def CyUnitVM : ProcResource<2> {
+ let Super = CyUnitV;
+ let BufferSize = 32;
+}
+// 1 fp/vector division/sqrt pipe: V[2]
+def CyUnitVD : ProcResource<1> {
+ let Super = CyUnitV;
+ let BufferSize = 16;
+}
+// 1 fp compare pipe: V[0]
+def CyUnitVC : ProcResource<1> {
+ let Super = CyUnitVM;
+ let BufferSize = 16;
+}
+
+// 2 fp division/square-root units. These are driven by the VD pipe,
+// but only consume the pipe for one cycle at issue and a cycle at writeback.
+def CyUnitFloatDiv : ProcResource<2>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write resources and latency on Cyclone.
+// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
+
+let SchedModel = CycloneModel in {
+
+//---
+// 7.8.1. Moves
+//---
+
+// A single nop micro-op (uX).
+def WriteX : SchedWriteRes<[]> { let Latency = 0; }
+
+// Move zero is a register rename (to machine register zero).
+// The move is replaced by a single nop micro-op.
+// MOVZ Rd, #0
+// AND Rd, Rzr, #imm
+def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
+def WriteImmZ : SchedWriteVariant<[
+ SchedVar<WriteZPred, [WriteX]>,
+ SchedVar<NoSchedPred, [WriteImm]>]>;
+def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
+
+// Move GPR is a register rename and single nop micro-op.
+// ORR Xd, XZR, Xm
+// ADD Xd, Xn, #0
+def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
+def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
+def WriteMov : SchedWriteVariant<[
+ SchedVar<WriteIMovPred, [WriteX]>,
+ SchedVar<WriteVMovPred, [WriteX]>,
+ SchedVar<NoSchedPred, [WriteI]>]>;
+def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
+
+// Move non-zero immediate is an integer ALU op.
+// MOVN,MOVZ,MOVK
+def : WriteRes<WriteImm, [CyUnitI]>;
+
+//---
+// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
+// Shifts and Bitfield Operations
+//---
+
+// ADR,ADRP
+// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
+// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
+// ADC(S),SBC(S)
+// Aliases: CMN, CMP, TST
+//
+// Conditional operations.
+// CCMNi,CCMPi,CCMNr,CCMPr,
+// CSEL,CSINC,CSINV,CSNEG
+//
+// Bit counting and reversal operations.
+// CLS,CLZ,RBIT,REV,REV16,REV32
+def : WriteRes<WriteI, [CyUnitI]>;
+
+// ADD with shifted register operand is a single micro-op that
+// consumes a shift pipeline for two cycles.
+// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
+// EXAMPLE: ADDrs Xn, Xm LSL #imm
+def : WriteRes<WriteISReg, [CyUnitIS]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// ADD with extended register operand is the same as shifted reg operand.
+// ADD(S)re,SUB(S)re
+// EXAMPLE: ADDXre Xn, Xm, UXTB #1
+def : WriteRes<WriteIEReg, [CyUnitIS]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// Variable shift and bitfield operations.
+// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
+def : WriteRes<WriteIS, [CyUnitIS]>;
+
+// EXTR shifts a pair of registers and requires two micro-ops.
+// The second micro-op is delayed, as modeled by ReadExtrHi.
+// EXTR Xn, Xm, #imm
+def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// EXTR's first register read is delayed by one cycle, effectively
+// shortening its writer's latency.
+// EXTR Xn, Xm, #imm
+def : ReadAdvance<ReadExtrHi, 1>;
+
+//---
+// 7.8.6. Multiplies
+//---
+
+// MUL/MNEG are aliases for MADD/MSUB.
+// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
+def : WriteRes<WriteIM32, [CyUnitIM]> {
+ let Latency = 4;
+}
+// MADDX,MSUBX,SMULH,UMULH
+def : WriteRes<WriteIM64, [CyUnitIM]> {
+ let Latency = 5;
+}
+
+//---
+// 7.8.7. Divide
+//---
+
+// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVW,UDIVW
+def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 10];
+}
+// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVX,UDIVX
+def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
+ let Latency = 13;
+ let ResourceCycles = [2, 13];
+}
+
+//---
+// 7.8.8,7.8.10. Load/Store, single element
+//---
+
+// Integer loads take 4 cycles and use one LS unit for one cycle.
+def : WriteRes<WriteLD, [CyUnitLS]> {
+ let Latency = 4;
+}
+
+// Store-load forwarding is 4 cycles.
+//
+// Note: The store-exclusive sequence incorporates this
+// latency. However, general heuristics should not model the
+// dependence between a store and subsequent may-alias load because
+// hardware speculation works.
+def : WriteRes<WriteST, [CyUnitLS]> {
+ let Latency = 4;
+}
+
+// Load from base address plus an optionally scaled register offset.
+// Rt latency is the latency of WriteIS + WriteLD.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def CyWriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
+ SchedVar<NoSchedPred, [WriteLD]>]>; // Load from register offset.
+def : SchedAlias<WriteLDIdx, CyWriteLDIdx>; // Map AArch64->Cyclone type.
+
+// EXAMPLE: STR Xn, Xm [, lsl 3]
+def CyWriteSTIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
+ SchedVar<NoSchedPred, [WriteST]>]>; // Store to register offset.
+def : SchedAlias<WriteSTIdx, CyWriteSTIdx>; // Map AArch64->Cyclone type.
+
+// Read the (unshifted) base register Xn in the second micro-op one cycle later.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def ReadBaseRS : SchedReadAdvance<1>;
+def CyReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
+ SchedVar<NoSchedPred, [ReadDefault]>]>; // Read base reg with no shift.
+def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
+
+//---
+// 7.8.9,7.8.11. Load/Store, paired
+//---
+
+// Address pre/post increment is a simple ALU op with one cycle latency.
+def : WriteRes<WriteAdr, [CyUnitI]>;
+
+// LDP high register write is fused with the load, but a nop micro-op remains.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 4;
+}
+
+// STP is a vector op and store, except for QQ, which is just two stores.
+def : SchedAlias<WriteSTP, WriteVSTShuffle>;
+def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
+
+//---
+// 7.8.13. Branches
+//---
+
+// Branches take a single micro-op.
+// The misprediction penalty is defined as a SchedMachineModel property.
+def : WriteRes<WriteBr, [CyUnitB]> {let Latency = 0;}
+def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
+
+//---
+// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
+//---
+
+// NOP,SEV,SEVL,WFE,WFI,YIELD
+def : WriteRes<WriteHint, []> {let Latency = 0;}
+// ISB
+def : InstRW<[WriteI], (instrs ISB)>;
+// CLREX,DMB,DSB
+def : WriteRes<WriteBarrier, [CyUnitLS]>;
+
+// System instructions get an invalid latency because the latency of
+// other operations across them is meaningless.
+def : WriteRes<WriteSys, []> {let Latency = -1;}
+
+//===----------------------------------------------------------------------===//
+// 7.9 Vector Unit Instructions
+
+// Simple vector operations take 2 cycles.
+def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
+
+// Define some longer latency vector op types for Cyclone.
+def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
+def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
+def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
+
+// Simple floating-point operations take 2 cycles.
+def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
+
+//---
+// 7.9.1 Vector Moves
+//---
+
+// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
+// generates expensive int-float conversion instead:
+// FMOVDi Dd, #0.0
+// FMOVv2f64ns Vd.2d, #0.0
+
+// FMOVSi,FMOVDi
+def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
+
+// MOVI,MVNI are WriteV
+// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
+
+// Move FPR is a register rename and single nop micro-op.
+// ORR.16b Vd,Vn,Vn
+// COPY is handled above in the WriteMov Variant.
+def WriteVMov : SchedWriteVariant<[
+ SchedVar<WriteVMovPred, [WriteX]>,
+ SchedVar<NoSchedPred, [WriteV]>]>;
+def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
+
+// FMOVSr,FMOVDr are WriteF.
+
+// MOV V,V is a WriteV.
+
+// CPY D,V[x] is a WriteV
+
+// INS V[x],V[y] is a WriteV.
+
+// FMOVWSr,FMOVXDr,FMOVXDHighr
+def : WriteRes<WriteFCopy, [CyUnitLS]> {
+ let Latency = 5;
+}
+
+// FMOVSWr,FMOVDXr,FMOVDXHighr
+def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
+
+// INS V[x],R
+def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
+def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
+
+// SMOV,UMOV R,V[x]
+def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
+def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
+
+// DUP V,R
+def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
+
+// DUP V,V[x] is a WriteV.
+
+//---
+// 7.9.2 Integer Arithmetic, Logical, and Comparisons
+//---
+
+// BIC,ORR V,#imm are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "ABSv")>;
+
+// MVN,NEG,NOT are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
+
+// ADDP is a WriteV.
+def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
+
+def : InstRW<[CyWriteV3],
+ (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
+
+// ADD,SUB are WriteV
+
+// Forward declare.
+def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+
+// Add/Diff and accumulate uses the vector multiply unit.
+def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVAccum : SchedReadAdvance<1,
+ [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+ (instregex "SADALP","UADALP")>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+ (instregex "SABAv","UABAv","SABALv","UABALv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
+
+def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
+
+// WriteV includes:
+// AND,BIC,CMTST,EOR,ORN,ORR
+// ADDP
+// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
+// SADDL,SSUBL,UADDL,USUBL
+// SADDW,SSUBW,UADDW,USUBW
+
+def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
+ "CMLEv","CMLTv",
+ "CMHIv","CMHSv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
+ "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
+
+def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
+ "SABDLv","UABDLv")>;
+
+//---
+// 7.9.3 Floating Point Arithmetic and Comparisons
+//---
+
+// FABS,FNEG are WriteF
+
+def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
+def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
+
+def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
+ "FMINPv2i","FMINNMPv2i")>;
+
+def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
+
+def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
+ FSUBSrr,FSUBv2f32,FSUBv4f32,
+ FADDPv2f32,FADDPv4f32,
+ FABD32,FABDv2f32,FABDv4f32)>;
+def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
+ FSUBDrr,FSUBv2f64,
+ FADDPv2f64,
+ FABD64,FABDv2f64)>;
+
+def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
+
+def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
+ "FMAXS","FMAXD","FMAXv",
+ "FMINS","FMIND","FMINv",
+ "FMAXNMS","FMAXNMD","FMAXNMv",
+ "FMINNMS","FMINNMD","FMINNMv",
+ "FMAXPv2f","FMAXPv4f",
+ "FMINPv2f","FMINPv4f",
+ "FMAXNMPv2f","FMAXNMPv4f",
+ "FMINNMPv2f","FMINNMPv4f")>;
+
+// FCMP,FCMPE,FCCMP,FCCMPE
+def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
+
+// FCSEL is a WriteF.
+
+//---
+// 7.9.4 Shifts and Bitfield Operations
+//---
+
+// SHL is a WriteV
+
+def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
+
+def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
+
+// Shift and accumulate uses the vector multiply unit.
+def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVShiftAcc : SchedReadAdvance<1,
+ [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
+def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
+ (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// SSHL,USHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
+
+// SQSHL,SQSHLU,UQSHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
+
+// WriteV includes:
+// SHLL,SSHLL,USHLL
+// SLI,SRI
+// BIF,BIT,BSL
+// EXT
+// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
+// XTN2
+
+def : InstRW<[CyWriteV4],
+ (instregex "RSHRNv","SHRNv",
+ "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
+ "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+
+//---
+// 7.9.5 Multiplication
+//---
+
+def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
+def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
+ "SQDMULLv","SQDMULHv","SQRDMULHv")>;
+
+// FMUL,FMULX,FNMUL default to WriteFMul.
+def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
+
+def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
+def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
+ FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
+
+def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
+def : InstRW<[CyWriteVMul, CyReadVMulAcc],
+ (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
+ "SQDMLAL","SQDMLSL")>;
+
+def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
+def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
+def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
+def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
+
+def : InstRW<[CyWriteSMul, CyReadSMul],
+ (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
+ FMLAv2f32,FMLAv4f32,
+ FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
+def : InstRW<[CyWriteDMul, CyReadDMul],
+ (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
+ FMLAv2f64,FMLAv2i64_indexed,
+ FMLSv2f64,FMLSv2i64_indexed)>;
+
+def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
+def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
+
+//---
+// 7.9.6 Divide and Square Root
+//---
+
+// FDIV,FSQRT
+// TODO: Add 64-bit variant with 19 cycle latency.
+// TODO: Specialize FSQRT for longer latency.
+def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
+ let Latency = 17;
+ let ResourceCycles = [2, 17];
+}
+
+def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
+
+def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
+def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
+
+def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
+def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
+def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>;
+def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
+
+//---
+// 7.9.7 Integer-FP Conversions
+//---
+
+// FCVT lengthen f16/s32
+def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
+
+// FCVT,FCVTN,FCVTXN
+// SCVTF,UCVTF V,V
+// FRINT(AIMNPXZ) V,V
+def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
+
+// SCVTF/UCVTF S/D, Rd (GPR source, FPR result) = VLD5+V4: 9 cycles.
+def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
+def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
+
+// FCVT(AMNPZ)(SU) Rd, S/D (FPR source, GPR result) = V6+LD4: 10 cycles.
+def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
+def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
+
+// FCVTL is a WriteV
+
+//---
+// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
+//---
+
+def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
+def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
+ AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
+ SHA1SU0rrr)>;
+
+def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
+def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
+
+def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
+def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
+ SHA256Hrrr,SHA256H2rrr)>;
+
+// TRN,UZP,ZIP are WriteV.
+
+// TBL,TBX are WriteV.
+
+//---
+// 7.9.11-7.9.14 Load/Store, single element and paired
+//---
+
+// Loading into the vector unit takes 5 cycles vs 4 for integer loads.
+def : WriteRes<WriteVLD, [CyUnitLS]> {
+ let Latency = 5;
+}
+
+// Store-load forwarding is 4 cycles.
+def : WriteRes<WriteVST, [CyUnitLS]> {
+ let Latency = 4;
+}
+
+// WriteVLDPair/VSTPair sequences are expanded by the target description.
+
+//---
+// 7.9.15 Load, element operations
+//---
+
+// Only the first WriteVLD and WriteAdr for writeback matches def operands.
+// Subsequent WriteVLDs consume resources. Since all loaded values have the
+// same latency, this is acceptable.
+
+// Vd is read 5 cycles after issuing the vector load.
+def : ReadAdvance<ReadVLD, 5>;
+
+def : InstRW<[WriteVLD],
+ (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+ (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+// Register writes from the load's high half are fused micro-ops.
+def : InstRW<[WriteVLD],
+ (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+ (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD],
+ (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+ (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+ (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+ (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
+ (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
+ (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+ (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+ (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
+ (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
+ (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD],
+ (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
+ (instregex "LD1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle],
+ (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr],
+ (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+ (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+ (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
+ (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
+ (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+ (instregex "LD2i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+ (instregex "LD2i(8|16|32)_POST")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+ (instregex "LD2i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+ (instregex "LD2i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+ (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+ (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+ (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+ (instregex "LD3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
+ (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
+ (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
+ (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
+ (instregex "LD3i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
+ (instregex "LD3i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+ (instregex "LD3i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
+ (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
+ (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+ (instrs LD3Rv1d,LD3Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+ (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+ (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+ (instregex "LD4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
+ WriteVLDPairShuffle, WriteVLDPairShuffle],
+ (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
+ WriteVLDPairShuffle, WriteVLDPairShuffle],
+ (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
+ (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
+ (instregex "LD4i(8|16|32)_POST")>;
+
+// LD4 (single 64-bit lane): the _POST form is the LD4i64 list plus WriteAdr.
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
+ (instrs LD4i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
+ WriteV], (instrs LD4i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
+ (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
+ (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+ (instrs LD4Rv1d,LD4Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+ (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
+
+//---
+// 7.9.16 Store, element operations
+//---
+
+// Only the WriteAdr for writeback matches a def operand.
+// Subsequent WriteVSTs only consume resources.
+
+def : InstRW<[WriteVST],
+ (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST],
+ (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],
+ (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+ (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST],
+ (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST],
+ (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVST],
+ (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
+ (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle],
+ (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+ (instregex "ST2Twov(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
+def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
+
+def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>;
+def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
+
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
+ WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
+ WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
+
+// Atomic operations are not supported.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+//---
+// Unused SchedRead types
+//---
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+
+} // SchedModel = CycloneModel
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
new file mode 100644
index 000000000000..19a6d6f2a1ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -0,0 +1,26 @@
+//==- AArch64SchedFalkor.td - Falkor Scheduling Definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Qualcomm Falkor to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Define the SchedMachineModel and provide basic properties for coarse grained
+// instruction cost model.
+
+def FalkorModel : SchedMachineModel {
+ let IssueWidth = 4; // 4-wide issue for expanded uops.
+ let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer.
+ let LoopMicroOpBufferSize = 16;
+ let LoadLatency = 3; // Optimistic load latency.
+ let MispredictPenalty = 11; // Minimum branch misprediction penalty.
+ let CompleteModel = 0;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td
new file mode 100644
index 000000000000..4e491a04c78d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td
@@ -0,0 +1,133 @@
+//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Qualcomm Kryo to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The issue width is set to five, matching the five issue queues for expanded
+// uops. Now, the latency spreadsheet has information based on fragmented uops,
+// but these do not actually take up an issue queue.
+
+def KryoModel : SchedMachineModel {
+ let IssueWidth = 5; // 5-wide issue for expanded uops
+ let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling. The magic number is chosen based on
+ // experiments and benchmarking data.
+ let LoopMicroOpBufferSize = 16;
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Kryo.
+
+let SchedModel = KryoModel in {
+ def KryoUnitXA : ProcResource<1>; // Type X(A) micro-ops
+ def KryoUnitXB : ProcResource<1>; // Type X(B) micro-ops
+ def KryoUnitYA : ProcResource<1>; // Type Y(A) micro-ops
+ def KryoUnitYB : ProcResource<1>; // Type Y(B) micro-ops
+ def KryoUnitX : ProcResGroup<[KryoUnitXA, // Type X micro-ops
+ KryoUnitXB]>;
+ def KryoUnitY : ProcResGroup<[KryoUnitYA, // Type Y micro-ops
+ KryoUnitYB]>;
+ def KryoUnitXY : ProcResGroup<[KryoUnitXA, // Type XY micro-ops
+ KryoUnitXB,
+ KryoUnitYA,
+ KryoUnitYB]>;
+ def KryoUnitLSA : ProcResource<1>; // Type LS(A) micro-ops
+ def KryoUnitLSB : ProcResource<1>; // Type LS(B) micro-ops
+ def KryoUnitLS : ProcResGroup<[KryoUnitLSA, // Type LS micro-ops
+ KryoUnitLSB]>;
+}
+
+let SchedModel = KryoModel in {
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Kryo.
+
+def : WriteRes<WriteImm, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteI, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [KryoUnitXY, KryoUnitXY]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIEReg, [KryoUnitXY, KryoUnitXY]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteExtr, [KryoUnitXY, KryoUnitX]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIS, [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteID32, [KryoUnitXA, KryoUnitY]>
+ { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes<WriteID64, [KryoUnitXA, KryoUnitY]>
+ { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes<WriteIM32, [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteIM64, [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteBr, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteLD, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteST, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTP, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteAdr, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteLDIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteF, [KryoUnitXY, KryoUnitXY]>
+ { let Latency = 3; let NumMicroOps = 2; }
+def : WriteRes<WriteFCmp, [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteFCvt, [KryoUnitX]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFMul, [KryoUnitX, KryoUnitX]>
+ { let Latency = 6; let NumMicroOps = 2; }
+def : WriteRes<WriteFDiv, [KryoUnitXA, KryoUnitY]>
+ { let Latency = 12; let NumMicroOps = 2; } // Fragment -1 / NoRSV +1
+def : WriteRes<WriteV, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteVLD, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteVST, [KryoUnitLS]> { let Latency = 4; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// No forwarding logic is modelled yet.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the above SchedWriteRes and SchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Detailed Refinements
+// -----------------------------------------------------------------------------
+include "AArch64SchedKryoDetails.td"
+
+
+} // SchedModel = KryoModel
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
new file mode 100644
index 000000000000..426ae6103e4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -0,0 +1,2358 @@
+//=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the uop and latency details for the machine model for the
+// Qualcomm Kryo subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+def KryoWrite_3cyc_X_noRSV_138ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_138ln],
+ (instregex "(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)")>;
+
+def KryoWrite_3cyc_X_X_139ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_139ln],
+ (instregex "(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift")>;
+
+def KryoWrite_4cyc_XY_XY_noRSV_172ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_172ln],
+ (instregex "(S|U)ABA(v8i8|v4i16|v2i32)")>;
+def KryoWrite_4cyc_XY_XY_XY_XY_178ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_XY_XY_178ln],
+ (instregex "(S|U)ABA(v16i8|v8i16|v4i32)")>;
+def KryoWrite_3cyc_XY_XY_XY_XY_177ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_XY_XY_177ln],
+ (instregex "(S|U)ABALv.*")>;
+def KryoWrite_3cyc_XY_XY_166ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_166ln],
+ (instregex "(S|U)(ABD|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_noRSV_159ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_159ln],
+ (instregex "(S|U)(ABD|RHADD)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_165ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_165ln],
+ (instregex "(S|U)ABDLv.*")>;
+def KryoWrite_3cyc_X_noRSV_154ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_154ln],
+ (instregex "(S|U)ADALP(v8i8|v4i16|v2i32)_v.*")>;
+def KryoWrite_3cyc_X_X_155ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_155ln],
+ (instregex "(S|U)ADALP(v16i8|v8i16|v4i32)_v.*")>;
+def KryoWrite_2cyc_XY_XY_151ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_151ln],
+ (instregex "(S|U)(ADD|SUB)Lv.*")>;
+def KryoWrite_2cyc_XY_noRSV_148ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_148ln],
+ (instregex "((S|U)ADDLP|ABS)(v2i32|v4i16|v8i8)(_v.*)?")>;
+def KryoWrite_2cyc_XY_XY_150ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_150ln],
+ (instregex "((S|U)ADDLP|ABS)(v2i64|v4i32|v8i16|v16i8)(_v.*)?")>;
+def KryoWrite_3cyc_XY_XY_XY_noRSV_179ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_XY_noRSV_179ln],
+ (instrs SADDLVv4i32v, UADDLVv4i32v)>;
+def KryoWrite_5cyc_XY_XY_XY_noRSV_180ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 5; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_5cyc_XY_XY_XY_noRSV_180ln],
+ (instrs SADDLVv8i16v, UADDLVv8i16v)>;
+def KryoWrite_6cyc_XY_XY_X_noRSV_181ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_6cyc_XY_XY_X_noRSV_181ln],
+ (instrs SADDLVv16i8v, UADDLVv16i8v)>;
+def KryoWrite_3cyc_XY_noRSV_158ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_158ln],
+ (instrs SADDLVv4i16v, UADDLVv4i16v, ADDVv4i16v)>;
+def KryoWrite_4cyc_X_noRSV_169ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_169ln],
+ (instrs SADDLVv8i8v, UADDLVv8i8v, ADDVv8i8v)>;
+def KryoWrite_2cyc_XY_XY_XY_XY_176ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_XY_XY_176ln],
+ (instregex "(S|U)(ADDW|SUBW)v.*")>;
+def KryoWrite_4cyc_X_noRSV_40ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_40ln],
+ (instregex "(S|U)CVTFS(W|X)(D|S)ri")>;
+def KryoWrite_4cyc_X_noRSV_97ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_97ln],
+ (instregex "(S|U)CVTFU(W|X)(D|S)ri")>;
+def KryoWrite_4cyc_X_noRSV_110ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_110ln],
+ (instregex "(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>;
+def KryoWrite_4cyc_X_X_114ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_114ln],
+ (instregex "(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>;
+def KryoWrite_1cyc_XA_Y_98ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_98ln],
+ (instregex "(S|U)DIV(_Int)?(W|X)r")>;
+def KryoWrite_2cyc_XY_XY_152ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_152ln],
+ (instregex "(S|U)H(ADD|SUB)(v16i8|v8i16|v4i32)")>;
+def KryoWrite_2cyc_XY_noRSV_149ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_149ln],
+ (instregex "((S|U)H(ADD|SUB)|ADDP)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_4cyc_X_70ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_70ln],
+ (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+def KryoWrite_4cyc_X_X_191ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_191ln],
+ (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def KryoWrite_1cyc_XY_195ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_195ln],
+ (instregex "(S|U)MOVv.*")>;
+def KryoWrite_5cyc_X_71ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_5cyc_X_71ln],
+ (instrs SMULHrr, UMULHrr)>;
+def KryoWrite_3cyc_XY_noRSV_186ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_186ln],
+ (instregex "^(S|U)QADD(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_187ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_187ln],
+ (instregex "^(S|U)QADD(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_noRSV_69ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_69ln],
+ (instregex "(S|U|SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64)")>;
+def KryoWrite_3cyc_XY_noRSV_248ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_248ln],
+ (instregex "(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_3cyc_XY_XY_250ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_250ln],
+ (instregex "(S|U)(QSHLU?|RSHR)(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def KryoWrite_3cyc_XY_noRSV_246ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_246ln],
+ (instregex "(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32)$")>;
+def KryoWrite_3cyc_XY_XY_251ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_251ln],
+ (instregex "(S|U)(QSHL|RSHL|QRSHL)(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_6cyc_XY_X_238ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_238ln],
+ (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v16i8|v8i16|v4i32)_shift$")>;
+def KryoWrite_3cyc_XY_noRSV_249ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_249ln],
+ (instregex "((S|U)QR?SHRN|SQR?SHRUN)(s|h|b)?")>;
+def KryoWrite_6cyc_XY_X_noRSV_252ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_252ln],
+ (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v8i8|v4i16|v2i32)_shift?")>;
+def KryoWrite_3cyc_XY_noRSV_161ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_161ln],
+ (instregex "(S|U)QSUB(v8i8|v4i16|v2i32|v1i64|v1i32|v1i16|v1i8)")>;
+def KryoWrite_3cyc_XY_noRSV_163ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_163ln],
+ (instregex "(S|U)QXTU?N(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_noRSV_162ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_162ln],
+ (instregex "(S|U)QXTU?N(v1i8|v1i16|v1i32)")>;
+def KryoWrite_3cyc_XY_noRSV_247ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_247ln],
+ (instregex "(S|U)RSHR(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_2cyc_XY_noRSV_239ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_239ln],
+ (instregex "(S|U)SHL(d|v8i8|v4i16|v2i32|v1i64)$")>;
+def KryoWrite_2cyc_XY_XY_243ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_243ln],
+ (instregex "(S|U)SHL(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_2cyc_XY_XY_241ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_241ln],
+ (instregex "(S|U)?SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+def KryoWrite_2cyc_XY_noRSV_240ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_240ln],
+ (instregex "((S|U)SHR|SHL)(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_2cyc_XY_XY_242ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_242ln],
+ (instregex "((S|U)SHR|SHL)(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def KryoWrite_2cyc_XY_XY_183ln : // Integer max/min costs. This one: (S|U)(MAX|MIN) and the P variants, 128-bit-wide arrangements.
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_183ln],
+ (instregex "(S|U)(MAX|MIN)P?(v16i8|v8i16|v4i32)")>;
+def KryoWrite_2cyc_XY_noRSV_182ln : // same opcodes, 64-bit-wide arrangements: a single KryoUnitXY entry for two micro-ops
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_182ln],
+ (instregex "(S|U)(MAX|MIN)P?(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_noRSV_184ln : // (S|U)(MAX|MIN)V for the v4i16v / v8i8v / v4i32 name prefixes: Latency 3 on KryoUnitXY
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_184ln],
+ (instregex "(S|U)(MAX|MIN)V(v4i16v|v8i8v|v4i32)")>;
+def KryoWrite_4cyc_X_noRSV_185ln : // (S|U)(MAX|MIN)V for v16i8v / v8i16v: Latency 4, and KryoUnitX rather than KryoUnitXY
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_185ln],
+ (instregex "(S|U)(MAX|MIN)V(v16i8v|v8i16v)")>;
+def KryoWrite_2cyc_XY_noRSV_67ln : // Costs for ABS/ADC/ADR/ADD opcodes. This one: ABSv1i64 only.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_67ln],
+ (instrs ABSv1i64)>;
+def KryoWrite_1cyc_XY_63ln : // every opcode whose name starts with ADC: 1 cycle, 1 micro-op
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_63ln, ReadI, ReadI], // two ReadI entries follow the write; ReadI is declared outside this excerpt
+ (instregex "ADC.*")>;
+def KryoWrite_1cyc_XY_63_1ln : // every opcode whose name starts with ADR; same numbers as _63ln, but bound without read entries
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_63_1ln],
+ (instregex "ADR.*")>;
+def KryoWrite_1cyc_XY_62ln : // ADD/ADDS with the `ri` suffix, W and X widths
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_62ln, ReadI],
+ (instregex "ADDS?(W|X)ri")>;
+def KryoWrite_2cyc_XY_XY_64ln : // ADD/ADDS with the rr / rs / rx (optionally ...64) suffixes: 2 cycles, 2 micro-ops
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_64ln, ReadI, ReadI],
+ (instregex "ADDS?(W|X)r(r|s|x)(64)?")>;
+def KryoWrite_1cyc_XY_noRSV_65ln : // ADDv1i64 only
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_65ln],
+ (instrs ADDv1i64)>;
+def KryoWrite_1cyc_XY_noRSV_144ln : // Vector add/sub costs. This one: ADD/SUB, 64-bit-wide arrangements (the pattern also covers ADDv1i64).
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_144ln],
+ (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def KryoWrite_1cyc_XY_XY_146ln : // ADD/SUB, 128-bit-wide arrangements
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_146ln],
+ (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_4cyc_XY_X_noRSV_171ln : // ADDHN/SUBHN, any `v...` suffix: 3 micro-ops, units listed for two of them
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_X_noRSV_171ln],
+ (instregex "(ADD|SUB)HNv.*")>;
+def KryoWrite_1cyc_XY_noRSV_66ln : // ADDPv2i64p only
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_66ln],
+ (instrs ADDPv2i64p)>;
+def KryoWrite_2cyc_XY_XY_153ln : // ADDP, 128-bit-wide arrangements (the v2i64 alternative also prefixes ADDPv2i64p, bound above)
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_153ln],
+ (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_XY_noRSV_170ln : // ADDV: latency grows with lane count -- v4i32v is 3 cycles
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_noRSV_170ln],
+ (instrs ADDVv4i32v)>;
+def KryoWrite_4cyc_XY_XY_noRSV_173ln : // ADDVv8i16v: 4 cycles
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_173ln],
+ (instrs ADDVv8i16v)>;
+def KryoWrite_5cyc_XY_X_noRSV_174ln : // ADDVv16i8v: 5 cycles, and the second unit is KryoUnitX instead of KryoUnitXY
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_5cyc_XY_X_noRSV_174ln],
+ (instrs ADDVv16i8v)>;
+def KryoWrite_3cyc_XY_XY_X_X_27ln : // AES costs. AESDrr/AESErr: 4 micro-ops, two on KryoUnitXY and two on KryoUnitX.
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_27ln],
+ (instrs AESDrr, AESErr)>;
+def KryoWrite_2cyc_X_X_22ln : // AESIMCrr/AESMCrr: 2 micro-ops, both on KryoUnitX
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_22ln],
+ (instrs AESIMCrr, AESMCrr)>;
+def KryoWrite_1cyc_XY_noRSV_76ln : // Bitwise-op costs (all Latency 1). This one: W-register and 64-bit-wide vector logical ops.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_76ln],
+ (instregex "((AND|ORN|EOR|EON)S?(Wr[rsi]|v8i8|v4i16|v2i32)|(ORR|BIC)S?(Wr[rs]|v8i8|v4i16|v2i32))")>; // ORR/BIC omit the `ri` suffix here; _77ln below covers (BIC|ORR)S?Wri
+def KryoWrite_1cyc_XY_XY_79ln : // X-register and 128-bit-wide vector logical ops
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_79ln],
+ (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; // ORR/BIC omit `ri` here too; _78ln below covers (BIC|ORR)S?Xri
+def KryoWrite_1cyc_X_72ln : // BFM/SBFM/UBFM, any suffix: single micro-op on KryoUnitX
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_72ln],
+ (instregex "(S|U)?BFM.*")>;
+def KryoWrite_1cyc_XY_noRSV_77ln : // BIC/ORR (optionally ...S) with the Wri suffix
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_77ln],
+ (instregex "(BIC|ORR)S?Wri")>;
+def KryoWrite_1cyc_XY_XY_78ln : // BIC/ORR (optionally ...S) with the Xri suffix
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_78ln],
+ (instregex "(BIC|ORR)S?Xri")>;
+def KryoWrite_1cyc_X_noRSV_74ln : // BIF/BIT/BSL v8i8: one KryoUnitX entry for two micro-ops
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln],
+ (instrs BIFv8i8, BITv8i8, BSLv8i8)>;
+def KryoWrite_1cyc_X_X_75ln : // BIF/BIT/BSL v16i8: two KryoUnitX entries
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_75ln],
+ (instrs BIFv16i8, BITv16i8, BSLv16i8)>;
+def KryoWrite_0cyc_noRSV_11ln : // Write for the exception/system opcodes listed below: zero latency, one micro-op.
+ SchedWriteRes<[]> { // empty unit list: no processor resource is named for this write
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_noRSV_11ln],
+ (instrs BRK, DCPS1, DCPS2, DCPS3, HLT, HVC, ISB, HINT, SMC, SVC)>;
+def KryoWrite_0cyc_XY_16ln : // CCMN/CCMP costs: zero latency, one micro-op on KryoUnitXY. This one: the `i` suffix forms.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_16ln, ReadI], // one ReadI entry (ReadI is declared outside this excerpt)
+ (instregex "(CCMN|CCMP)(W|X)i")>;
+def KryoWrite_0cyc_XY_16_1ln : // the `r` suffix forms: identical numbers, bound with two ReadI entries
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_16_1ln, ReadI, ReadI],
+ (instregex "(CCMN|CCMP)(W|X)r")>;
+def KryoWrite_2cyc_XY_3ln : // Bit-count costs (all Latency 2). This one: CLS/CLZ on W/X registers, one micro-op.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_3ln, ReadI],
+ (instregex "(CLS|CLZ)(W|X)r")>;
+def KryoWrite_2cyc_XY_noRSV_7ln : // vector CLS/CLZ/CNT bound to the single-unit write
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_7ln],
+ (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; // NOTE(review): 128-bit-wide arrangements on the one-unit write; the ADD/SUB and CM* bindings in this file do the opposite -- confirm not swapped with _8ln
+def KryoWrite_2cyc_XY_XY_8ln : // vector CLS/CLZ/CNT bound to the two-unit write
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_8ln],
+ (instregex "(CLS|CLZ|CNT)(v2i32|v4i16|v8i8)")>; // NOTE(review): 64-bit-wide arrangements on the two-unit write -- see the note on _7ln
+def KryoWrite_2cyc_XY_noRSV_80ln : // Integer compare costs (all Latency 2, 2 micro-ops). This one: CM(EQ|GE|HS|GT|HI|TST), 64-bit-wide.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_80ln],
+ (instregex "CM(EQ|GE|HS|GT|HI|TST)(v8i8|v4i16|v2i32|v1i64)$")>;
+def KryoWrite_2cyc_XY_XY_83ln : // same compares, 128-bit-wide arrangements
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_83ln],
+ (instregex "CM(EQ|GE|HS|GT|HI|TST)(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_2cyc_XY_noRSV_81ln : // the `rz`-suffixed compares CM(EQ|LE|GE|GT|LT), 64-bit-wide
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_81ln],
+ (instregex "CM(EQ|LE|GE|GT|LT)(v8i8|v4i16|v2i32|v1i64)rz$")>;
+def KryoWrite_2cyc_XY_XY_82ln : // the `rz`-suffixed compares, 128-bit-wide
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_82ln],
+ (instregex "CM(EQ|LE|GE|GT|LT)(v16i8|v8i16|v4i32|v2i64)rz$")>;
+def KryoWrite_3cyc_XY_4ln : // CRC32 and conditional-select costs, one micro-op each. This one: every opcode starting with CRC32.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_XY_4ln, ReadI, ReadISReg], // ReadI and ReadISReg are declared outside this excerpt
+ (instregex "CRC32.*")>;
+def KryoWrite_1cyc_XY_20ln : // CSEL (W/X)
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_20ln, ReadI, ReadI],
+ (instregex "CSEL(W|X)r")>;
+def KryoWrite_1cyc_X_17ln : // CSINC/CSNEG: listed on KryoUnitX, unlike CSEL and CSINV which use KryoUnitXY
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_17ln, ReadI, ReadI],
+ (instregex "(CSINC|CSNEG)(W|X)r")>;
+def KryoWrite_1cyc_XY_18ln : // CSINV (W/X)
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_18ln, ReadI, ReadI],
+ (instregex "(CSINV)(W|X)r")>;
+def KryoWrite_3cyc_LS_X_13ln : // DRPS: two micro-ops, one on KryoUnitLS and one on KryoUnitX
+ SchedWriteRes<[KryoUnitLS, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_X_13ln],
+ (instrs DRPS)>;
+def KryoWrite_0cyc_LS_10ln : // DSB/DMB/CLREX: zero latency, one micro-op on KryoUnitLS
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_10ln],
+ (instrs DSB, DMB, CLREX)>;
+def KryoWrite_1cyc_X_noRSV_196ln : // DUP costs (Latency 1, 2 micro-ops on KryoUnitX). This one: 64-bit-wide, gpr and lane sources.
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_196ln],
+ (instregex "DUP(v8i8|v4i16|v2i32)(gpr|lane)")>;
+def KryoWrite_1cyc_X_X_197ln : // DUP, 128-bit-wide arrangements: both micro-ops have a KryoUnitX entry
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_197ln],
+ (instregex "DUP(v16i8|v8i16|v4i32|v2i64)(gpr|lane)")>;
+def KryoWrite_3cyc_LS_LS_X_15ln : // ERET: three micro-ops -- two on KryoUnitLS, one on KryoUnitX
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_X_15ln],
+ (instrs ERET)>;
+def KryoWrite_1cyc_X_noRSV_207ln : // Extract costs. EXTv8i8: Latency 1, one KryoUnitX entry for two micro-ops.
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_207ln],
+ (instrs EXTv8i8)>;
+def KryoWrite_1cyc_X_X_212ln : // EXTv16i8: Latency 1, two KryoUnitX entries
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_212ln],
+ (instrs EXTv16i8)>;
+def KryoWrite_2cyc_XY_X_136ln : // EXTRWrri/EXTRXrri: Latency 2, one micro-op on KryoUnitXY and one on KryoUnitX
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_X_136ln],
+ (instrs EXTRWrri, EXTRXrri)>;
+def KryoWrite_2cyc_XY_noRSV_35ln : // FP max/min and compare costs. This one: F(MAX|MIN)(NM)?P? with the Drr/Srr suffixes.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_35ln],
+ (instregex "F(MAX|MIN)(NM)?P?(D|S)rr")>;
+def KryoWrite_2cyc_XY_XY_106ln : // FMAX/FMIN family plus FACGE/FACGT and FCMEQ/FCMGE/FCMGT: v2i64p, v2f64, v4f32
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_106ln],
+ (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2i64p|v2f64|v4f32)")>;
+def KryoWrite_2cyc_XY_noRSV_104ln : // the same opcode set for v2f32 and v2i32p
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_104ln],
+ (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f32|v2i32p)")>;
+def KryoWrite_3cyc_XY_noRSV_107ln : // F(MAX|MIN)(NM)?V on v4i32v: Latency 3
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_107ln],
+ (instregex "F(MAX|MIN)(NM)?Vv4i32v")>;
+def KryoWrite_3cyc_XY_noRSV_101ln : // FP abs-diff / abs / neg / add / sub costs, all on KryoUnitXY. This one: FABD 32, 64, v2f32.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_101ln],
+ (instregex "FABD(32|64|v2f32)")>;
+def KryoWrite_3cyc_XY_XY_103ln : // FABD/FADD/FSUB/FADDP on v4f32 and v2f64
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_103ln],
+ (instregex "(FABD|FADD|FSUB|FADDP)(v4f32|v2f64)")>;
+def KryoWrite_1cyc_XY_noRSV_48ln : // FABS/FNEG with the Dr/Sr suffixes: Latency 1
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_48ln],
+ (instregex "F(ABS|NEG)(D|S)r")>;
+def KryoWrite_1cyc_XY_noRSV_124ln : // FABS/FNEG v2f32
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_124ln],
+ (instregex "F(ABS|NEG)v2f32")>;
+def KryoWrite_1cyc_XY_XY_125ln : // FABS/FNEG v2f64 and v4f32
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_125ln],
+ (instregex "F(ABS|NEG)(v2f64|v4f32)")>;
+def KryoWrite_2cyc_XY_noRSV_33ln : // FACGE/FACGT and FCMEQ/FCMGE/FCMGT with the 32/64 suffixes: Latency 2
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_33ln],
+ (instregex "(FAC(GE|GT)|FCM(EQ|GE|GT))(32|64)")>;
+def KryoWrite_3cyc_XY_noRSV_30ln : // FADD/FSUB with the Drr/Srr suffixes: Latency 3
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_30ln],
+ (instregex "(FADD|FSUB)(D|S)rr")>;
+def KryoWrite_3cyc_XY_noRSV_100ln : // FADD/FSUB/FADDP v2f32
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_100ln],
+ (instregex "(FADD|FSUB|FADDP)v2f32")>;
+def KryoWrite_3cyc_XY_noRSV_29ln : // FADDP v2i32p and v2i64p
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_29ln],
+ (instregex "FADDP(v2i32p|v2i64p)")>;
+def KryoWrite_0cyc_XY_31ln : // FP compare/select costs. FCCMP/FCCMPE (Drr/Srr): zero latency, one micro-op.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_31ln],
+ (instregex "FCCMPE?(D|S)rr")>;
+def KryoWrite_2cyc_XY_noRSV_34ln : // `rz`-suffixed FCM(EQ|LE|GE|GT|LT) on v1i32 / v1i64
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_34ln],
+ (instregex "FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64)rz")>;
+def KryoWrite_2cyc_XY_XY_36ln : // `rz`-suffixed FCM* on v2i64 / v4i32: two KryoUnitXY entries
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_36ln],
+ (instregex "FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz")>;
+def KryoWrite_2cyc_XY_noRSV_105ln : // `rz`-suffixed FCM* on v2i32
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_105ln],
+ (instregex "FCM(EQ|LE|GE|GT|LT)v2i32rz")>;
+def KryoWrite_0cyc_XY_32ln : // FCMP/FCMPE (D/S, rr and ri suffixes): zero latency, one micro-op
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_32ln],
+ (instregex "FCMPE?(D|S)r(r|i)")>;
+def KryoWrite_1cyc_XY_noRSV_49ln : // FCSELDrrr/FCSELSrrr: Latency 1
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_49ln],
+ (instrs FCSELDrrr, FCSELSrrr)>;
+def KryoWrite_4cyc_X_noRSV_41ln : // FCVT* costs: Latency 4 on KryoUnitX, except FCVTN/FCVTXN at 5. This one: the six FCVT<D|H|S><D|H|S>r opcodes.
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_41ln],
+ (instrs FCVTDHr, FCVTDSr, FCVTHDr, FCVTHSr, FCVTSDr, FCVTSHr)>;
+def KryoWrite_4cyc_X_38ln : // FCVT(A|N|M|P)... and FCVTZ..._Int... W/X-suffixed forms, plus FCVTZ(S|U)(d|s): single micro-op
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_38ln],
+ (instregex "FCVT(((A|N|M|P)(S|U)(S|U)|Z(S|U)_Int(S|U))(W|X)(D|S)ri?|Z(S|U)(d|s))$")>;
+def KryoWrite_4cyc_X_noRSV_113ln : // FCVT(A|N|M|P)(S|U) and FCVTZ(S|U)_Int on v1i32, v1i64, v2f32
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_113ln],
+ (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v1i32|v1i64|v2f32)$")>;
+def KryoWrite_4cyc_X_X_117ln : // the same conversions on v4f32 and v2f64
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_117ln],
+ (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v4f32|v2f64)$")>;
+def KryoWrite_5cyc_X_X_XY_noRSV_119ln : // FCVTN/FCVTXN vector forms: Latency 5, 4 micro-ops with units listed for three
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitXY]> {
+ let Latency = 5; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_XY_noRSV_119ln],
+ (instregex "FCVTX?N(v2f32|v4f32|v2i32|v4i16|v4i32|v8i16)$")>;
+def KryoWrite_4cyc_X_X_116ln : // FCVTL vector forms
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_116ln],
+ (instregex "FCVTL(v2i32|v4i16|v4i32|v8i16)$")>;
+def KryoWrite_4cyc_X_noRSV_112ln : // FCVTXNv1i64 only
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_112ln],
+ (instrs FCVTXNv1i64)>;
+def KryoWrite_4cyc_X_37ln : // FCVTZ(S|U)(S|U)(W|X)(D|S)r / ...ri: single micro-op
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_37ln],
+ (instregex "FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
+def KryoWrite_4cyc_X_noRSV_111ln : // FCVTZ(S|U) on v2f32, v1i32, v1i64, and v2i32 with optional `_shift`
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_111ln],
+ (instregex "FCVTZ(S|U)(v2f32|v1i32|v1i64|v2i32(_shift)?)$")>;
+def KryoWrite_4cyc_X_X_115ln : // FCVTZ(S|U) on v2f64, v4f32, and v2i64/v4i32 with optional `_shift`
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_115ln],
+ (instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>;
+def KryoWrite_1cyc_XA_Y_noRSV_43ln : // FDIV costs, on KryoUnitXA + KryoUnitY. This one: FDIVDrr/FDIVSrr.
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3; // NOTE(review): Latency 1 for a divide is far below the 5-6 cycle multiplies in this file -- confirm it is intended
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln],
+ (instrs FDIVDrr, FDIVSrr)>;
+def KryoWrite_1cyc_XA_Y_noRSV_121ln : // FDIVv2f32: same unit list and numbers as the Drr/Srr write
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln],
+ (instrs FDIVv2f32)>;
+def KryoWrite_1cyc_XA_Y_XA_Y_123ln : // FDIVv2f64/FDIVv4f32: the XA,Y pair is listed twice, 4 micro-ops
+ SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln],
+ (instrs FDIVv2f64, FDIVv4f32)>;
+def KryoWrite_5cyc_X_noRSV_55ln : // Multiply-accumulate costs on KryoUnitX: Latency 5 for the S/f32/i32 names, 6 for the D/f64/i64 names. This one: FN?M(ADD|SUB)Srrr.
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_55ln],
+ (instregex "FN?M(ADD|SUB)Srrr")>;
+def KryoWrite_6cyc_X_noRSV_57ln : // FN?M(ADD|SUB)Drrr
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_57ln],
+ (instregex "FN?M(ADD|SUB)Drrr")>;
+def KryoWrite_5cyc_X_noRSV_51ln : // FMLA/FMLS v2f32 and v1i32_indexed
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_51ln],
+ (instrs FMLAv2f32, FMLSv2f32, FMLAv1i32_indexed, FMLSv1i32_indexed)>;
+def KryoWrite_5cyc_X_X_56ln : // FMLA/FMLS v4f32
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_56ln],
+ (instrs FMLAv4f32, FMLSv4f32)>;
+def KryoWrite_6cyc_X_X_61ln : // FMLA/FMLS v2f64
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_61ln],
+ (instrs FMLAv2f64, FMLSv2f64)>;
+def KryoWrite_5cyc_X_noRSV_128ln : // FMLA/FMLS v2i32_indexed
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_128ln],
+ (instrs FMLAv2i32_indexed, FMLSv2i32_indexed)>;
+def KryoWrite_5cyc_X_X_131ln : // FMLA/FMLS v4i32_indexed
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_131ln],
+ (instrs FMLAv4i32_indexed, FMLSv4i32_indexed)>;
+def KryoWrite_6cyc_X_X_134ln : // FMLA/FMLS v2i64_indexed
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_134ln],
+ (instrs FMLAv2i64_indexed, FMLSv2i64_indexed)>;
+def KryoWrite_6cyc_X_noRSV_60ln : // the v1i64_indexed forms of FMLA, FMLS, FMUL and FMULX share this write
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_60ln],
+ (instrs FMLAv1i64_indexed, FMLSv1i64_indexed, FMULv1i64_indexed, FMULXv1i64_indexed)>;
+def KryoWrite_1cyc_XY_45ln : // FMOV costs (Latency 1 on KryoUnitXY). FMOV XDHigh/DXHigh/DX ...r: single micro-op.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_45ln],
+ (instregex "FMOV(XDHigh|DXHigh|DX)r")>;
+def KryoWrite_1cyc_XY_noRSV_47ln : // the remaining FMOV names (Di, Dr, Si, Sr, SWr, WSr, XDr, v..._ns): two micro-ops
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_47ln],
+ (instregex "FMOV(Di|Dr|Si|Sr|SWr|WSr|XDr|v.*_ns)")>;
+def KryoWrite_5cyc_X_noRSV_53ln : // FP multiply costs on KryoUnitX: Latency 5 for the S/f32/i32 names, 6 for the D/f64/i64 names. This one: FMUL/FMULX v1i32_indexed.
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_53ln],
+ (instrs FMULv1i32_indexed, FMULXv1i32_indexed)>;
+def KryoWrite_5cyc_X_noRSV_127ln : // FMUL/FMULX v2f32 and v2i32_indexed
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_127ln],
+ (instrs FMULv2f32, FMULXv2f32, FMULv2i32_indexed, FMULXv2i32_indexed)>;
+def KryoWrite_5cyc_X_X_130ln : // FMUL/FMULX v4f32 and v4i32_indexed
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_130ln],
+ (instrs FMULv4f32, FMULXv4f32, FMULv4i32_indexed, FMULXv4i32_indexed)>;
+def KryoWrite_6cyc_X_X_133ln : // FMUL/FMULX v2f64 and v2i64_indexed
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_133ln],
+ (instrs FMULv2f64, FMULXv2f64, FMULv2i64_indexed, FMULXv2i64_indexed)>;
+def KryoWrite_5cyc_X_noRSV_54ln : // FMULSrr, FNMULSrr, FMULX32
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_54ln],
+ (instrs FMULSrr, FNMULSrr, FMULX32)>;
+def KryoWrite_6cyc_X_noRSV_59ln : // FMULDrr, FNMULDrr, FMULX64
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_59ln],
+ (instrs FMULDrr, FNMULDrr, FMULX64)>;
+def KryoWrite_3cyc_XY_noRSV_28ln : // Reciprocal / reciprocal-sqrt costs. FRECPE/FRSQRTE v1i32, v1i64: Latency 3 on KryoUnitXY.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_28ln],
+ (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64 )>;
+def KryoWrite_3cyc_XY_noRSV_99ln : // FRECPE/FRSQRTE v2f32
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_99ln],
+ (instrs FRECPEv2f32, FRSQRTEv2f32)>;
+def KryoWrite_3cyc_XY_XY_102ln : // FRECPE/FRSQRTE v2f64 and v4f32
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_102ln],
+ (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>;
+def KryoWrite_5cyc_X_noRSV_52ln : // FRECPS32/FRSQRTS32: the ...S (as opposed to ...E) opcodes use KryoUnitX at Latency 5-6
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_52ln],
+ (instrs FRECPS32, FRSQRTS32)>;
+def KryoWrite_6cyc_X_noRSV_58ln : // FRECPS64/FRSQRTS64
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_58ln],
+ (instrs FRECPS64, FRSQRTS64)>;
+def KryoWrite_5cyc_X_noRSV_126ln : // FRECPS/FRSQRTS v2f32
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_126ln],
+ (instrs FRECPSv2f32, FRSQRTSv2f32)>;
+def KryoWrite_5cyc_X_X_129ln : // FRECPS/FRSQRTS v4f32
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_129ln],
+ (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+def KryoWrite_6cyc_X_X_132ln : // FRECPS/FRSQRTS v2f64
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_132ln],
+ (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+def KryoWrite_3cyc_XY_noRSV_50ln : // FRECPX v1i32, v1i64: Latency 3 on KryoUnitXY
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_50ln],
+ (instrs FRECPXv1i32, FRECPXv1i64)>;
+def KryoWrite_2cyc_XY_noRSV_39ln : // FRINT(A|I|M|N|P|X|Z) costs: Latency 2, 2 micro-ops on KryoUnitXY. This one: the Sr/Dr suffixes.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_39ln],
+ (instregex "FRINT(A|I|M|N|P|X|Z)(S|D)r")>;
+def KryoWrite_2cyc_XY_noRSV_108ln : // FRINT* v2f32
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_108ln],
+ (instregex "FRINT(A|I|M|N|P|X|Z)v2f32")>;
+def KryoWrite_2cyc_XY_XY_109ln : // FRINT* v2f64 and v4f32: two KryoUnitXY entries
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_109ln],
+ (instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>;
+def KryoWrite_1cyc_XA_Y_noRSV_42ln : // FSQRT costs, on KryoUnitXA + KryoUnitY. This one: FSQRT with the Sr/Dr suffixes.
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3; // NOTE(review): Latency 1 for a square root is far below the 5-6 cycle multiplies in this file -- confirm it is intended
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln],
+ (instregex "FSQRT(S|D)r")>;
+def KryoWrite_1cyc_XA_Y_noRSV_120ln : // FSQRTv2f32: same unit list and numbers as the Sr/Dr write
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln],
+ (instregex "FSQRTv2f32")>;
+def KryoWrite_1cyc_XA_Y_XA_Y_122ln : // FSQRT v2f64/v4f32: the XA,Y pair is listed twice, 4 micro-ops
+ SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln],
+ (instregex "FSQRT(v2f64|v4f32)")>;
+def KryoWrite_1cyc_X_201ln : // every opcode whose name starts with INSv: Latency 1, one micro-op on KryoUnitX
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_201ln],
+ (instregex "INSv.*")>;
+def KryoWrite_3cyc_LS_255ln : // LD1 costs (One/Three/Four and lane forms). Every `_POST` binding in this group adds WriteAdr (declared outside this excerpt). This one: LD1One 128-bit-wide + LD1i64.
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_255ln],
+ (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)$")>;
+def KryoWrite_4cyc_LS_X_270ln : // LD1 i8/i16/i32 lane forms: Latency 4, a KryoUnitX micro-op in addition to the LS one
+ SchedWriteRes<[KryoUnitLS, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_270ln],
+ (instregex "LD1(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_noRSV_285ln : // LD1One, 64-bit-wide arrangements
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_285ln],
+ (instregex "LD1One(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_289ln : // _POST form of the _255ln set: one extra KryoUnitXY micro-op
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_289ln, WriteAdr],
+ (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_298ln : // _POST form of the i8/i16/i32 lane loads
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_298ln, WriteAdr],
+ (instregex "LD1(i8|i16|i32)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_308ln : // LD1Three, 128-bit-wide arrangements: three LS micro-ops
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_308ln],
+ (instregex "LD1Three(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_317ln : // _POST form of LD1One, 64-bit-wide arrangements
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_317ln, WriteAdr],
+ (instregex "LD1One(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_328ln : // four LS micro-ops, no XY entry
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_328ln, WriteAdr], // NOTE(review): the _POST form gets the write WITHOUT an XY micro-op while plain LD1Four (_351ln) gets one -- opposite of LD1One/LD1Three; confirm not swapped
+ (instregex "LD1Four(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_332ln : // _POST form of LD1Three, 128-bit-wide: the _308ln list plus one XY micro-op
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_332ln, WriteAdr],
+ (instregex "LD1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln : // LD1Three, 64-bit-wide: 5 micro-ops, units listed for two
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln],
+ (instregex "LD1Three(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln : // four LS micro-ops plus one XY micro-op
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln], // NOTE(review): non-_POST LD1Four bound to the write that includes XY, and without WriteAdr -- see the note on _328ln
+ (instregex "LD1Four(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln : // LD1Four, 64-bit-wide: 6 micro-ops, units listed for two
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln],
+ (instregex "LD1Four(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln : // _POST form of LD1Three, 64-bit-wide: the _348ln shape plus one XY micro-op
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln, WriteAdr],
+ (instregex "LD1Three(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln : // _POST form of LD1Four, 64-bit-wide: the _358ln shape plus one XY micro-op
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln, WriteAdr],
+ (instregex "LD1Four(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_281ln : // LD1Two/LD2Two costs (Latency 3). This one: 128-bit-wide arrangements, two LS micro-ops.
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_281ln],
+ (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_311ln : // 64-bit-wide arrangements: 3 micro-ops, a unit listed for one
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_311ln],
+ (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_313ln : // _POST form, 128-bit-wide: one extra XY micro-op; bound together with WriteAdr
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_313ln, WriteAdr],
+ (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln : // _POST form, 64-bit-wide
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln, WriteAdr],
+ (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_256ln : // LD1R costs (Latency 3). This one: 128-bit-wide arrangements, one LS micro-op.
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_256ln],
+ (instregex "LD1R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_286ln : // 64-bit-wide arrangements: 2 micro-ops, a unit listed for one
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_286ln],
+ (instregex "LD1R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_290ln : // _POST form, 128-bit-wide: one extra XY micro-op; bound together with WriteAdr
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_290ln, WriteAdr],
+ (instregex "LD1R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_318ln : // _POST form, 64-bit-wide
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_318ln, WriteAdr],
+ (instregex "LD1R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_257ln : // LD2 lane-form costs. LD2i64: Latency 3, one LS micro-op.
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_257ln],
+ (instregex "LD2i64$")>;
+def KryoWrite_3cyc_LS_XY_291ln : // LD2i64_POST: one extra XY micro-op; bound together with WriteAdr
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_291ln, WriteAdr],
+ (instregex "LD2i64_POST$")>;
+def KryoWrite_4cyc_LS_X_X_296ln : // LD2 i8/i16/i32: Latency 4, one LS micro-op plus two on KryoUnitX
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_296ln],
+ (instregex "LD2(i8|i16|i32)$")>;
+def KryoWrite_4cyc_LS_XY_X_X_321ln : // _POST form of the i8/i16/i32 loads
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_321ln, WriteAdr],
+ (instregex "LD2(i8|i16|i32)_POST$")>;
+def KryoWrite_3cyc_LS_LS_282ln : // LD2R costs (Latency 3). This one: 128-bit-wide arrangements, two LS micro-ops.
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_282ln],
+ (instregex "LD2R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_312ln : // 64-bit-wide arrangements: 3 micro-ops, a unit listed for one
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_312ln],
+ (instregex "LD2R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_314ln : // _POST form, 128-bit-wide: one extra XY micro-op; bound together with WriteAdr
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_314ln, WriteAdr],
+ (instregex "LD2R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln : // _POST form, 64-bit-wide
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln, WriteAdr],
+ (instregex "LD2R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_283ln : // LD3 costs (lane forms and LD3Three). Every `_POST` binding adds WriteAdr (declared outside this excerpt). This one: LD3i64.
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_283ln],
+ (instregex "LD3i64$")>;
+def KryoWrite_3cyc_LS_LS_LS_309ln : // LD3Threev2d: three LS micro-ops, no KryoUnitX entries
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_309ln],
+ (instregex "LD3Threev2d$")>;
+def KryoWrite_3cyc_LS_XY_LS_315ln : // LD3i64_POST: the _283ln list plus one XY micro-op
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_315ln, WriteAdr],
+ (instregex "LD3i64_POST$")>;
+def KryoWrite_4cyc_LS_X_X_X_320ln : // LD3 i8/i16/i32: Latency 4, one LS micro-op plus three on KryoUnitX
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_X_320ln],
+ (instregex "LD3(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_331ln : // LD3Threev2d_POST: the _309ln list plus one XY micro-op
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_331ln, WriteAdr],
+ (instregex "LD3Threev2d_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_X_X_338ln : // _POST form of the i8/i16/i32 lane loads
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_338ln, WriteAdr],
+ (instregex "LD3(i8|i16|i32)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln : // LD3Three v8b/v4h/v2s: 8 micro-ops, units listed for five
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln],
+ (instregex "LD3Three(v8b|v4h|v2s)$")>;
+def KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln : // _POST form of the above: 9 micro-ops, one more (XY) than _373ln
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 9;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln, WriteAdr],
+ (instregex "LD3Three(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln : // LD3Three v16b/v8h/v4s: the LS,LS,X,X,X sequence listed twice, 10 micro-ops
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 10;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln],
+ (instregex "LD3Three(v16b|v8h|v4s)$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln : // _POST form of the above: 11 micro-ops, one XY entry added to the second half
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln, WriteAdr],
+ (instregex "LD3Three(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_310ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_310ln],
+ (instregex "LD3R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_333ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_333ln, WriteAdr],
+ (instregex "LD3R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln],
+ (instregex "LD3R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln, WriteAdr],
+ (instregex "LD3R(v8b|v4h|v2s|v1d)_POST$")>;
+// LD4 / LD4R scheduling entries. Each SchedWriteRes gives the unit list,
+// latency and micro-op count; the InstRW after it selects the opcodes by
+// name pattern. _POST patterns also carry WriteAdr.
+def KryoWrite_3cyc_LS_LS_284ln : SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_284ln], (instregex "LD4i64$")>;
+
+def KryoWrite_3cyc_LS_XY_LS_316ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_316ln, WriteAdr], (instregex "LD4i64_POST$")>;
+
+def KryoWrite_3cyc_LS_LS_LS_LS_329ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_329ln], (instregex "LD4Four(v2d)$")>;
+
+def KryoWrite_4cyc_LS_X_X_X_X_337ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_X_X_337ln], (instregex "LD4(i8|i16|i32)$")>;
+
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln, WriteAdr],
+  (instregex "LD4Four(v2d)_POST$")>;
+
+def KryoWrite_4cyc_LS_XY_X_X_X_X_355ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_X_355ln, WriteAdr],
+  (instregex "LD4(i8|i16|i32)_POST$")>;
+
+// For the noRSV writes, NumMicroOps counts the noRSV entries from the name in
+// addition to the listed units.
+def KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 10;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln],
+  (instregex "LD4Four(v8b|v4h|v2s)$")>;
+
+def KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln, WriteAdr],
+  (instregex "LD4Four(v8b|v4h|v2s)_POST$")>;
+
+def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 12;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln],
+  (instregex "LD4Four(v16b|v8h|v4s)$")>;
+
+def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX,
+                   KryoUnitX, KryoUnitX, KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 13;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln, WriteAdr],
+  (instregex "LD4Four(v16b|v8h|v4s)_POST$")>;
+
+def KryoWrite_3cyc_LS_LS_LS_LS_330ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_330ln], (instregex "LD4R(v16b|v8h|v4s|v2d)$")>;
+
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln, WriteAdr],
+  (instregex "LD4R(v16b|v8h|v4s|v2d)_POST$")>;
+
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln],
+  (instregex "LD4R(v8b|v4h|v2s|v1d)$")>;
+
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln, WriteAdr],
+  (instregex "LD4R(v8b|v4h|v2s|v1d)_POST$")>;
+// Load-acquire, LDNP and LDP scheduling entries. Pair loads list WriteLDHi
+// after the Kryo write; the post/pre-indexed pair forms add WriteAdr last.
+def KryoWrite_3cyc_LS_LS_400ln : SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_400ln],
+  (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>;
+
+def KryoWrite_3cyc_LS_LS_401ln : SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_401ln, WriteLDHi], (instrs LDNPQi)>;
+
+def KryoWrite_3cyc_LS_noRSV_noRSV_408ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_408ln, WriteLDHi], (instrs LDNPDi, LDNPSi)>;
+
+def KryoWrite_3cyc_LS_394ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_394ln, WriteLDHi], (instrs LDNPWi, LDNPXi)>;
+
+def KryoWrite_3cyc_LS_LS_402ln : SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_402ln, WriteLDHi], (instrs LDPQi)>;
+
+def KryoWrite_3cyc_LS_noRSV_noRSV_409ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_409ln, WriteLDHi], (instrs LDPDi, LDPSi)>;
+
+def KryoWrite_3cyc_LS_XY_LS_410ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_410ln, WriteLDHi, WriteAdr],
+  (instregex "LDPQ(post|pre)")>;
+
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln, WriteLDHi, WriteAdr],
+  (instregex "LDP(D|S)(post|pre)")>;
+
+def KryoWrite_3cyc_LS_393ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_393ln, WriteLDHi], (instrs LDPWi, LDPXi)>;
+
+def KryoWrite_3cyc_LS_XY_403ln : SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_403ln, WriteLDHi, WriteAdr],
+  (instregex "LDP(W|X)(post|pre)")>;
+
+def KryoWrite_4cyc_LS_395ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_395ln, WriteLDHi], (instrs LDPSWi)>;
+
+def KryoWrite_4cyc_LS_XY_405ln : SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_405ln, WriteLDHi, WriteAdr],
+  (instrs LDPSWpost, LDPSWpre)>;
+// LDR / LDRS / LDTR / LDUR / LDXP / LDXR scheduling entries. Each write
+// gives unit list, latency and micro-op count; the InstRW after it picks the
+// opcodes. The post/pre-indexed forms add WriteAdr.
+def KryoWrite_3cyc_LS_264ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_264ln], (instrs LDRQui, LDRQl)>;
+
+def KryoWrite_4cyc_X_LS_271ln : SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_271ln], (instrs LDRQroW, LDRQroX)>;
+
+def KryoWrite_3cyc_LS_noRSV_287ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_287ln], (instregex "LDR((D|S)l|(D|S|H|B)ui)")>;
+
+def KryoWrite_3cyc_LS_XY_293ln : SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_293ln, WriteAdr], (instrs LDRQpost, LDRQpre)>;
+
+def KryoWrite_4cyc_X_LS_noRSV_297ln : SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_noRSV_297ln], (instregex "LDR(D|S|H|B)ro(W|X)")>;
+
+def KryoWrite_3cyc_LS_XY_noRSV_319ln : SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_319ln, WriteAdr],
+  (instregex "LDR(D|S|H|B)(post|pre)")>;
+
+def KryoWrite_3cyc_LS_261ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_261ln], (instregex "LDR(BB|HH|W|X)ui")>;
+
+def KryoWrite_3cyc_LS_XY_292ln : SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_292ln, WriteAdr],
+  (instregex "LDR(BB|HH|W|X)(post|pre)")>;
+
+def KryoWrite_4cyc_X_LS_272ln : SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_272ln],
+  (instregex "(LDR(BB|HH|W|X)ro(W|X)|PRFMro(W|X))")>;
+
+def KryoWrite_3cyc_LS_262ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_262ln], (instrs LDRWl, LDRXl)>;
+
+def KryoWrite_4cyc_LS_268ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_268ln], (instregex "LDRS(BW|BX|HW|HX|W)ui")>;
+
+def KryoWrite_5cyc_X_LS_273ln : SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_LS_273ln], (instregex "LDRS(BW|BX|HW|HX|W)ro(W|X)")>;
+
+def KryoWrite_4cyc_LS_XY_294ln : SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_294ln, WriteAdr],
+  (instregex "LDRS(BW|BX|HW|HX|W)(post|pre)")>;
+
+def KryoWrite_4cyc_LS_269ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_269ln], (instrs LDRSWl)>;
+
+def KryoWrite_3cyc_LS_260ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_260ln], (instregex "LDTR(B|H|W|X)i")>;
+
+def KryoWrite_4cyc_LS_267ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_267ln], (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
+
+def KryoWrite_3cyc_LS_263ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_263ln], (instrs LDURQi)>;
+
+def KryoWrite_3cyc_LS_noRSV_288ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_288ln], (instregex "LDUR(D|S|H|B)i")>;
+
+def KryoWrite_3cyc_LS_259ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_259ln], (instregex "LDUR(BB|HH|W|X)i")>;
+
+def KryoWrite_4cyc_LS_266ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_266ln], (instregex "LDURS(B|H)?(W|X)i")>;
+
+def KryoWrite_3cyc_LS_258ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_258ln], (instregex "LDXP(W|X)")>;
+
+def KryoWrite_3cyc_LS_258_1ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_258_1ln], (instregex "LDXR(B|H|W|X)")>;
+// Scheduling entries for variable shifts, multiply-add, vector MLA/MLS/MUL,
+// MOVI/MVNI, MOVK/MOVN/MOVZ, MRS/MSR, NEG, NOT and PMUL/PMULL. Each write
+// gives unit list, latency and micro-op count; the InstRW after it selects
+// the opcodes.
+def KryoWrite_2cyc_XY_XY_137ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_137ln], (instrs LSLVWr, LSLVXr)>;
+
+def KryoWrite_1cyc_XY_135ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_135ln], (instregex "(LS|AS|RO)RV(W|X)r")>;
+
+def KryoWrite_4cyc_X_84ln : SchedWriteRes<[KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_84ln], (instrs MADDWrrr, MSUBWrrr)>;
+
+def KryoWrite_5cyc_X_85ln : SchedWriteRes<[KryoUnitX]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_5cyc_X_85ln], (instrs MADDXrrr, MSUBXrrr)>;
+
+def KryoWrite_4cyc_X_noRSV_188ln : SchedWriteRes<[KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_188ln],
+  (instregex "(MLA|MLS|MUL)(v8i8|v4i16|v2i32)(_indexed)?")>;
+
+def KryoWrite_4cyc_X_X_192ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_192ln],
+  (instregex "(MLA|MLS|MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?")>;
+
+def KryoWrite_1cyc_XY_noRSV_198ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_198ln],
+  (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)")>;
+
+def KryoWrite_1cyc_XY_XY_199ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_199ln],
+  (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)")>;
+
+def KryoWrite_1cyc_X_89ln : SchedWriteRes<[KryoUnitX]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_89ln], (instrs MOVKWi, MOVKXi)>;
+
+def KryoWrite_1cyc_XY_91ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_91ln], (instrs MOVNWi, MOVNXi)>;
+
+def KryoWrite_1cyc_XY_90ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_90ln], (instrs MOVZWi, MOVZXi)>;
+
+def KryoWrite_2cyc_XY_93ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 2;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_93ln], (instrs MRS)>;
+
+// One write shared by both MSRpstate immediate forms.
+def KryoWrite_0cyc_X_87ln : SchedWriteRes<[KryoUnitX]> {
+  let Latency = 0;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_X_87ln], (instrs MSRpstateImm4)>;
+def : InstRW<[KryoWrite_0cyc_X_87ln], (instrs MSRpstateImm1)>;
+
+def KryoWrite_0cyc_XY_88ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 0;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_88ln], (instrs MSR)>;
+
+def KryoWrite_1cyc_XY_noRSV_143ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_143ln], (instregex "NEG(v8i8|v4i16|v2i32|v1i64)")>;
+
+def KryoWrite_1cyc_XY_XY_145ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_145ln], (instregex "NEG(v16i8|v8i16|v4i32|v2i64)")>;
+
+def KryoWrite_1cyc_XY_noRSV_193ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_193ln], (instrs NOTv8i8)>;
+
+def KryoWrite_1cyc_XY_XY_194ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_194ln], (instrs NOTv16i8)>;
+
+def KryoWrite_2cyc_XY_noRSV_234ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_234ln], (instrs PMULv8i8)>;
+
+def KryoWrite_2cyc_XY_XY_236ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_236ln], (instrs PMULv16i8)>;
+
+def KryoWrite_2cyc_XY_XY_235ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_235ln], (instrs PMULLv8i8, PMULLv16i8)>;
+
+def KryoWrite_3cyc_XY_XY_237ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_237ln], (instrs PMULLv1i64, PMULLv2i64)>;
+// Scheduling entries for PRFM/PRFUM, RADDHN/RSUBHN, RBIT, REV and SLI/SRI.
+// Each write gives unit list, latency and micro-op count; the InstRW after
+// it selects the opcodes.
+def KryoWrite_0cyc_LS_254ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 0;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_254ln], (instrs PRFMl, PRFMui)>;
+
+def KryoWrite_0cyc_LS_253ln : SchedWriteRes<[KryoUnitLS]> {
+  let Latency = 0;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_253ln], (instrs PRFUMi)>;
+
+def KryoWrite_6cyc_XY_X_noRSV_175ln : SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_175ln], (instregex "R(ADD|SUB)HNv.*")>;
+
+def KryoWrite_2cyc_XY_204ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 2;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_204ln], (instrs RBITWr, RBITXr)>;
+
+def KryoWrite_2cyc_XY_noRSV_218ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_218ln], (instrs RBITv8i8)>;
+
+def KryoWrite_2cyc_XY_XY_219ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_219ln], (instrs RBITv16i8)>;
+
+def KryoWrite_1cyc_X_202ln : SchedWriteRes<[KryoUnitX]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_202ln], (instregex "REV(16|32)?(W|X)r")>;
+
+def KryoWrite_1cyc_XY_noRSV_214ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_214ln],
+  (instregex "REV(16|32|64)(v8i8|v4i16|v2i32)")>;
+
+def KryoWrite_1cyc_XY_XY_216ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_216ln],
+  (instregex "REV(16|32|64)(v16i8|v8i16|v4i32)")>;
+
+def KryoWrite_3cyc_X_noRSV_244ln : SchedWriteRes<[KryoUnitX]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_244ln],
+  (instregex "S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)")>;
+
+def KryoWrite_3cyc_X_X_245ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_245ln],
+  (instregex "S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift")>;
+// Scheduling entries for SBC/SBCS, SHA1/SHA256, SQDMULH/SQRDMULH,
+// SQABS/SQNEG and the scalar SQDMLAL/SQDMLSL/SQDMULL forms. Each write gives
+// unit list, latency and micro-op count; the InstRW after it selects the
+// opcodes.
+def KryoWrite_1cyc_XY_2ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+// The only entry in this group that also lists read resources (two ReadI).
+def : InstRW<[KryoWrite_1cyc_XY_2ln, ReadI, ReadI], (instregex "SBCS?(W|X)r")>;
+
+def KryoWrite_2cyc_XA_XA_XA_24ln
+  : SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_2cyc_XA_XA_XA_24ln], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr)>;
+
+def KryoWrite_1cyc_XY_noRSV_21ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_21ln], (instrs SHA1Hrr)>;
+
+def KryoWrite_2cyc_X_X_23ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_23ln], (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>;
+
+def KryoWrite_4cyc_XA_XA_XA_25ln
+  : SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XA_XA_XA_25ln], (instrs SHA256Hrrr, SHA256H2rrr)>;
+
+def KryoWrite_3cyc_XY_XY_X_X_26ln
+  : SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_26ln], (instrs SHA256SU1rrr)>;
+
+def KryoWrite_4cyc_X_noRSV_189ln : SchedWriteRes<[KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_189ln],
+  (instregex "SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?")>;
+
+def KryoWrite_3cyc_XY_noRSV_68ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_68ln],
+  (instregex "SQ(ABS|NEG)(v1i8|v1i16|v1i32|v1i64)")>;
+
+def KryoWrite_3cyc_XY_noRSV_157ln : SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_157ln],
+  (instregex "SQ(ABS|NEG)(v8i8|v4i16|v2i32)")>;
+
+def KryoWrite_3cyc_XY_XY_164ln : SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_164ln],
+  (instregex "SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)")>;
+
+def KryoWrite_4cyc_X_noRSV_190ln : SchedWriteRes<[KryoUnitX]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_190ln],
+  (instregex "SQD(MLAL|MLSL|MULL)(i16|i32)")>;
+// ST1 scheduling entries. Each SchedWriteRes gives the unit list, latency
+// and micro-op count; the InstRW after it selects the opcodes. Non-post
+// forms use a single write; _POST forms list WriteAdr first and use a write
+// that includes an extra X/XY unit, matching the ST2/ST3/ST4 entries.
+def KryoWrite_0cyc_LS_Y_274ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_274ln],
+ (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))$")>;
+def KryoWrite_1cyc_LS_Y_X_301ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+// Restricted to the 64-bit One and single-lane forms: the 128-bit One and
+// 64-bit Two _POST opcodes are claimed by the 305ln entry below, and an
+// opcode must not be matched by two InstRW records of the same model.
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_301ln],
+ (instregex "ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_305ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_305ln],
+ (instregex "ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_323ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+// Non-post form: no writeback, so no WriteAdr and no XY unit.
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_323ln],
+ (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+// _POST form: WriteAdr plus the write that includes the XY unit.
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln],
+ (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln],
+ (instregex "ST1Three(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln],
+ (instregex "ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln],
+ (instregex "ST1Four(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY,
+ KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln],
+ (instregex "ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
+// ST2 scheduling entries. Each SchedWriteRes gives the unit list, latency
+// and micro-op count; the InstRW after it selects the opcodes. _POST forms
+// list WriteAdr first and use a write that includes an extra XY unit.
+def KryoWrite_0cyc_LS_Y_275ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+// Restricted to the 64-bit Two and single-lane forms: the 128-bit Two
+// opcodes are claimed by the 322ln entry below, and an opcode must not be
+// matched by two InstRW records of the same model. This mirrors how the
+// _POST entries (306ln / 344ln) are split.
+def : InstRW<[KryoWrite_0cyc_LS_Y_275ln],
+ (instregex "ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_306ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_306ln],
+ (instregex "ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_322ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_322ln],
+ (instregex "ST2Two(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln],
+ (instregex "ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
+// ST3 scheduling entries. Each SchedWriteRes gives the unit list, latency
+// and micro-op count; the InstRW after it selects the opcodes. _POST forms
+// list WriteAdr first and use a write that includes an extra XY unit.
+def KryoWrite_0cyc_LS_Y_LS_Y_324ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_324ln],
+  (instregex "ST3(Threev1d|(i8|i16|i32|i64))$")>;
+
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln],
+  (instregex "ST3(Threev1d|(i8|i16|i32|i64))_POST$")>;
+
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln
+  : SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln],
+  (instregex "ST3Three(v8b|v4h|v2s)$")>;
+
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln], (instregex "ST3Threev2d$")>;
+
+def KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln
+  : SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY,
+                   KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln],
+  (instregex "ST3Three(v8b|v4h|v2s)_POST$")>;
+
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY,
+                   KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln],
+  (instregex "ST3Threev2d_POST$")>;
+
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln
+  : SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY,
+                   KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 12;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln],
+  (instregex "ST3Three(v16b|v8h|v4s)$")>;
+
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln
+  : SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY,
+                   KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 13;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln],
+  (instregex "ST3Three(v16b|v8h|v4s)_POST$")>;
+// ST4 scheduling entries. Each SchedWriteRes gives the unit list, latency
+// and micro-op count; the InstRW after it selects the opcodes. _POST forms
+// list WriteAdr first and use a write that includes an extra XY unit.
+def KryoWrite_0cyc_LS_Y_LS_Y_325ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_325ln],
+  (instregex "ST4(Fourv1d|(i8|i16|i32|i64))$")>;
+
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln],
+  (instregex "ST4(Fourv1d|(i8|i16|i32|i64))_POST$")>;
+
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln
+  : SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln],
+  (instregex "ST4Four(v8b|v4h|v2s)$")>;
+
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY, KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln], (instregex "ST4Fourv2d$")>;
+
+def KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln
+  : SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY,
+                   KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln],
+  (instregex "ST4Four(v8b|v4h|v2s)_POST$")>;
+
+def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY,
+                   KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln],
+  (instregex "ST4Fourv2d_POST$")>;
+
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln
+  : SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX,
+                   KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS,
+                   KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 16;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln],
+  (instregex "ST4Four(v16b|v8h|v4s)$")>;
+
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln
+  : SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX,
+                   KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitX, KryoUnitX,
+                   KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 17;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln],
+  (instregex "ST4Four(v16b|v8h|v4s)_POST$")>;
+// Scheduling entries for STLR, STLXP/STLXR, STNP and STP. Each write gives
+// unit list, latency and micro-op count; the InstRW after it selects the
+// opcodes. The post/pre-indexed STP forms list WriteAdr first.
+def KryoWrite_0cyc_LS_LS_Y_299ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_0cyc_LS_LS_Y_299ln], (instregex "STLR(B|H|W|X)")>;
+
+def KryoWrite_3cyc_LS_LS_Y_307ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_Y_307ln], (instregex "STLX(P(W|X)|R(B|H|W|X))")>;
+
+def KryoWrite_0cyc_LS_Y_276ln : SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_276ln], (instrs STNPDi, STNPSi)>;
+
+def KryoWrite_0cyc_LS_Y_LS_Y_326ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_326ln], (instrs STNPQi)>;
+
+def KryoWrite_0cyc_LS_Y_280ln : SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_280ln], (instrs STNPWi, STNPXi)>;
+
+def KryoWrite_0cyc_LS_Y_277ln : SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_277ln], (instregex "STP(D|S)i")>;
+
+def KryoWrite_1cyc_LS_Y_X_303ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_303ln], (instregex "STP(D|S)(post|pre)")>;
+
+def KryoWrite_0cyc_LS_Y_LS_Y_327ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_327ln], (instrs STPQi)>;
+
+def KryoWrite_1cyc_LS_Y_X_LS_Y_343ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitLS, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_LS_Y_343ln], (instrs STPQpost, STPQpre)>;
+
+def KryoWrite_0cyc_LS_Y_279ln : SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+  let Latency = 0;
+  let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_279ln], (instregex "STP(W|X)i")>;
+
+def KryoWrite_1cyc_LS_X_Y_300ln
+  : SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_300ln], (instregex "STP(W|X)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_278ln : // STR writes: unsigned-immediate forms use LS + Y with latency 0.
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_278ln],
+ (instregex "STR(Q|D|S|H|B)ui")>;
+def KryoWrite_1cyc_X_LS_Y_295ln : // Register-offset forms list KryoUnitX first and use latency 1.
+ SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_X_LS_Y_295ln], // The pattern omits Q; STRQro(W|X) has its own 6-micro-op write below.
+ (instregex "STR(D|S|H|B)ro(W|X)")>;
+def KryoWrite_1cyc_LS_Y_X_304ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_304ln], // Post/pre forms pair WriteAdr with the store write.
+ (instregex "STR(Q|D|S|H|B)(post|pre)")>;
+def KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln : // Six units, latency 2, 6 micro-ops.
+ SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 2; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln],
+ (instregex "STRQro(W|X)")>;
+def KryoWrite_0cyc_LS_Y_399ln : // The BB/HH/W/X STR forms follow with the same unit/latency shapes as above.
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_399ln],
+ (instregex "STR(BB|HH|W|X)ui")>;
+def KryoWrite_1cyc_X_LS_Y_406ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_X_LS_Y_406ln],
+ (instregex "STR(BB|HH|W|X)ro(W|X)")>;
+def KryoWrite_1cyc_LS_X_Y_407ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_407ln],
+ (instregex "STR(BB|HH|W|X)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_398ln : // STTR/STUR/STX writes: all four defs use the LS + Y units with 2 micro-ops.
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_398ln],
+ (instregex "STTR(B|H|W|X)i")>;
+def KryoWrite_0cyc_LS_Y_396ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_396ln],
+ (instregex "STUR(Q|D|S|H|B)i")>;
+def KryoWrite_0cyc_LS_Y_397ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_397ln],
+ (instregex "STUR(BB|HH|W|X)i")>;
+def KryoWrite_3cyc_LS_Y_404ln : // The only def in this group with a non-zero latency (3).
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_Y_404ln], // One pattern covers both the STXP(W|X) and STXR(B|H|W|X) names.
+ (instregex "STX(P(W|X)|R(B|H|W|X))")>;
+def KryoWrite_3cyc_XY_noRSV_160ln : // SUQADD/USQADD writes for the v8i8/v4i16/v2i32 variants.
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2; // One listed unit but 2 micro-ops; the noRSV name presumably marks a micro-op with no reserved unit (confirm).
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_160ln],
+ (instregex "^(SU|US)QADD(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_167ln : // The v16i8/v8i16/v4i32/v2i64 variants list KryoUnitXY twice.
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_167ln],
+ (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_1cyc_XY_1ln : // SUB/SUBS writes, one def per operand form (ri, rx, rs, rr).
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_1ln, ReadI], // "S?" lets the same pattern match both the SUB and SUBS names.
+ (instregex "SUBS?(W|X)ri")>;
+def KryoWrite_2cyc_XY_XY_5ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_5ln, ReadI, ReadIEReg],
+ (instregex "SUBS?(W|X)rx")>;
+def KryoWrite_2cyc_XY_XY_5_1ln : // Same units, latency and micro-ops as the 5ln def above; paired with ReadISReg instead of ReadIEReg.
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_5_1ln, ReadI, ReadISReg],
+ (instregex "SUBS?(W|X)rs")>;
+def KryoWrite_1cyc_XY_noRSV_6ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2; // 2 micro-ops against one listed unit, as in the other noRSV defs.
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_6ln, ReadI, ReadI],
+ (instregex "SUBS?(W|X)rr")>;
+def KryoWrite_0cyc_LS_9ln : // Write for the SYS/SYSL opcodes: the LS unit only, latency 0, 1 micro-op.
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_9ln],
+ (instregex "SYSL?xt")>;
+def KryoWrite_1cyc_X_noRSV_205ln : // TBL writes, one def per (vector width, table count) opcode.
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2; // In every noRSV def here NumMicroOps is one more than the number of listed units.
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_205ln],
+ (instrs TBLv8i8One)>;
+def KryoWrite_1cyc_X_X_208ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_208ln],
+ (instrs TBLv16i8One)>;
+def KryoWrite_2cyc_X_X_X_noRSV_222ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_noRSV_222ln],
+ (instrs TBLv8i8Two)>;
+def KryoWrite_2cyc_X_X_X_X_X_X_224ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_X_X_X_224ln],
+ (instrs TBLv16i8Two)>;
+def KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln],
+ (instrs TBLv8i8Three)>;
+def KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln],
+ (instrs TBLv8i8Four)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln : // The three- and four-table v16i8 forms are the only TBL defs that include a KryoUnitXY entry.
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln],
+ (instrs TBLv16i8Three)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 15;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln],
+ (instrs TBLv16i8Four)>;
+def KryoWrite_2cyc_X_X_noRSV_220ln : // TBX writes, one def per (vector width, table count) opcode.
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 3; // In every noRSV def here NumMicroOps is one more than the number of listed units.
+}
+def : InstRW<[KryoWrite_2cyc_X_X_noRSV_220ln],
+ (instrs TBXv8i8One)>;
+def KryoWrite_2cyc_X_X_X_X_221ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_X_221ln],
+ (instrs TBXv16i8One)>;
+def KryoWrite_3cyc_X_X_X_X_noRSV_223ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_noRSV_223ln],
+ (instrs TBXv8i8Two)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln],
+ (instrs TBXv8i8Three)>;
+def KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln : // Defs are not sorted by opcode here: the v16i8 Two form sits between the v8i8 Three and Four forms.
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln],
+ (instrs TBXv16i8Two)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 9;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln],
+ (instrs TBXv8i8Four)>;
+def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln : // The three- and four-table v16i8 forms are the only TBX defs that include a KryoUnitXY entry.
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 13;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln],
+ (instrs TBXv16i8Three)>;
+def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 17;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln],
+ (instrs TBXv16i8Four)>;
+def KryoWrite_1cyc_XY_XY_217ln : // Permute and related writes (TRN/UZP/ZIP, URECPE/URSQRTE, XTN); every def in this group has 2 micro-ops.
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_217ln], // Covers the v2i64 forms of TRN1/TRN2/ZIP1/UZP1/UZP2 plus the four listed ZIP2 forms.
+ (instregex "((TRN1|TRN2|ZIP1|UZP1|UZP2)v2i64|ZIP2(v2i64|v4i32|v8i16|v16i8))")>;
+def KryoWrite_1cyc_X_X_211ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_211ln],
+ (instregex "(TRN1|TRN2)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_1cyc_X_XY_213ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_XY_213ln],
+ (instregex "(TRN1|TRN2)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_3cyc_XY_noRSV_156ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2; // One listed unit, 2 micro-ops, as in the other noRSV defs.
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_156ln],
+ (instrs URECPEv2i32, URSQRTEv2i32)>;
+def KryoWrite_3cyc_XY_XY_168ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_168ln],
+ (instrs URECPEv4i32, URSQRTEv4i32)>;
+def KryoWrite_1cyc_X_X_210ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_210ln],
+ (instregex "(UZP1|UZP2)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_1cyc_X_noRSV_206ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_206ln],
+ (instregex "(UZP1|UZP2|ZIP1|ZIP2)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_1cyc_XY_noRSV_215ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_215ln], // The pattern takes any opcode name beginning with "XTNv".
+ (instregex "XTNv.*")>;
+def KryoWrite_1cyc_X_X_209ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_209ln],
+ (instregex "ZIP1(v4i32|v8i16|v16i8)")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
new file mode 100644
index 000000000000..14d6891253fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
@@ -0,0 +1,380 @@
+//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Samsung Exynos-M1 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The Exynos-M1 is a traditional superscalar microprocessor with a
+// 4-wide in-order stage for decode and dispatch and a wider issue stage.
+// The execution units and loads and stores are out-of-order.
+
+def ExynosM1Model : SchedMachineModel { // Top-level machine-model record; the `let SchedModel = ExynosM1Model` scopes below attach definitions to it.
+ let IssueWidth = 4; // Up to 4 uops per cycle.
+ let MicroOpBufferSize = 96; // ROB size.
+ let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
+ let LoadLatency = 4; // Optimistic load cases.
+ let MispredictPenalty = 14; // Minimum branch misprediction penalty.
+ let CompleteModel = 0; // Use the default model otherwise.
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on the Exynos-M1,
+// which has 9 pipelines, each with its own queue with out-of-order dispatch.
+
+def M1UnitA : ProcResource<2>; // Simple integer
+def M1UnitC : ProcResource<1>; // Simple and complex integer
+def M1UnitD : ProcResource<1>; // Integer division (inside C, serialized)
+def M1UnitB : ProcResource<2>; // Branch
+def M1UnitL : ProcResource<1>; // Load
+def M1UnitS : ProcResource<1>; // Store
+def M1PipeF0 : ProcResource<1>; // FP #0
+let Super = M1PipeF0 in { // The five units in this scope are declared with M1PipeF0 as their Super resource.
+ def M1UnitFMAC : ProcResource<1>; // FP multiplication
+ def M1UnitNAL0 : ProcResource<1>; // Simple vector
+ def M1UnitNMISC : ProcResource<1>; // Miscellanea
+ def M1UnitFCVT : ProcResource<1>; // FP conversion
+ def M1UnitNCRYPT : ProcResource<1>; // Cryptographic
+}
+def M1PipeF1 : ProcResource<1>; // FP #1
+let Super = M1PipeF1 in { // The four units in this scope are declared with M1PipeF1 as their Super resource.
+ def M1UnitFADD : ProcResource<1>; // Simple FP
+ def M1UnitNAL1 : ProcResource<1>; // Simple vector
+ def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized)
+ def M1UnitFST : ProcResource<1>; // FP store
+}
+
+let SchedModel = ExynosM1Model in { // Resource groups: each names a set of units that a write may list as a single resource.
+ def M1UnitALU : ProcResGroup<[M1UnitA,
+ M1UnitC]>; // All integer
+ def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
+ M1UnitNAL1]>; // All simple vector
+}
+
+let SchedModel = ExynosM1Model in {
+
+//===----------------------------------------------------------------------===//
+// Coarse scheduling model for the Exynos-M1.
+
+def M1WriteA1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } // Naming in this group: M1Write<unit letter><latency>.
+def M1WriteA2 : SchedWriteRes<[M1UnitALU]> { let Latency = 2; }
+def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; }
+def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; }
+
+def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
+
+def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; }
+def M1WriteLA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5, // Under ScaledIdxPred: M1WriteL5 plus M1WriteA1.
+ M1WriteA1]>,
+ SchedVar<NoSchedPred, [M1WriteL5]>]>; // Otherwise: M1WriteL5 alone.
+
+def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; }
+def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; }
+def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; }
+def M1WriteSA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2, // Under ScaledIdxPred: M1WriteS2 plus M1WriteA1.
+ M1WriteA1]>,
+ SchedVar<NoSchedPred, [M1WriteS1]>]>; // Otherwise: M1WriteS1 alone.
+
+def M1ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>, // Both variants currently resolve to ReadDefault.
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, M1ReadAdrBase>; // Makes ReadAdrBase resolve to the variant above for this model.
+
+// Branch instructions.
+// NOTE: Unconditional direct branches actually take neither cycles nor units.
+def : WriteRes<WriteBr, [M1UnitB]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [M1UnitC]> { let Latency = 1; }
+
+// Arithmetic and logical integer instructions.
+def : WriteRes<WriteI, [M1UnitALU]> { let Latency = 1; }
+// TODO: Shift over 3 and some extensions take 2 cycles.
+def : WriteRes<WriteISReg, [M1UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIEReg, [M1UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIS, [M1UnitALU]> { let Latency = 1; }
+
+// Move instructions.
+def : WriteRes<WriteImm, [M1UnitALU]> { let Latency = 1; }
+
+// Divide and multiply instructions.
+def : WriteRes<WriteID32, [M1UnitC,
+ M1UnitD]> { let Latency = 13;
+ let ResourceCycles = [1, 13]; } // Cycle counts follow the resource order: M1UnitC 1, M1UnitD 13.
+def : WriteRes<WriteID64, [M1UnitC,
+ M1UnitD]> { let Latency = 21;
+ let ResourceCycles = [1, 21]; } // Cycle counts follow the resource order: M1UnitC 1, M1UnitD 21.
+// TODO: Long multiplication takes 5 cycles and also the ALU.
+// TODO: Multiplication with accumulation can be advanced.
+def : WriteRes<WriteIM32, [M1UnitC]> { let Latency = 3; }
+// TODO: 64-bit multiplication has a throughput of 1/2.
+def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4; }
+
+// Miscellaneous instructions.
+def : WriteRes<WriteExtr, [M1UnitALU,
+ M1UnitALU]> { let Latency = 2; } // M1UnitALU is listed twice.
+
+// TODO: The latency for the post or pre register is 1 cycle.
+def : WriteRes<WriteAdr, []> { let Latency = 0; } // Modeled with no resources and zero latency; see the TODO above.
+
+// Load instructions.
+def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; }
+def : SchedAlias<WriteLDIdx, M1WriteLA>; // Indexed loads use the M1WriteLA variant.
+
+// Store instructions.
+def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; }
+def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; }
+def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; }
+def : SchedAlias<WriteSTIdx, M1WriteSA>; // Indexed stores use the M1WriteSA variant.
+
+// FP data instructions.
+def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; }
+// TODO: FCCMP is much different.
+def : WriteRes<WriteFCmp, [M1UnitNMISC]> { let Latency = 4; }
+def : WriteRes<WriteFDiv, [M1UnitFVAR]> { let Latency = 15;
+ let ResourceCycles = [15]; } // M1UnitFVAR is occupied for the same 15 cycles as the latency.
+def : WriteRes<WriteFMul, [M1UnitFMAC]> { let Latency = 4; }
+
+// FP miscellaneous instructions.
+// TODO: Conversion between register files is much different.
+def : WriteRes<WriteFCvt, [M1UnitFCVT]> { let Latency = 3; }
+def : WriteRes<WriteFImm, [M1UnitNALU]> { let Latency = 1; }
+def : WriteRes<WriteFCopy, [M1UnitS]> { let Latency = 4; }
+
+// FP load instructions.
+// TODO: ASIMD loads are much different.
+def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; }
+
+// FP store instructions.
+// TODO: ASIMD stores are much different.
+def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; }
+
+// ASIMD FP instructions.
+def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; }
+
+// Other miscellaneous instructions.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } // Marked unsupported rather than given a latency.
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Generic fast forwarding.
+
+// TODO: Add FP register forwarding rules.
+
+def : ReadAdvance<ReadI, 0>; // An advance of 0 is given to every read here except ReadIMA.
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+// Integer multiply-accumulate.
+// TODO: The forwarding for WriteIM64 actually saves 3 cycles.
+def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>; // Advance of 2, listed only for WriteIM32 and WriteIM64.
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+//===----------------------------------------------------------------------===//
+// Finer scheduling model for the Exynos-M1.
+
+def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, // M1WriteNEON<letter> defs: multi-unit writes; the letter is a plain label, not a unit name.
+ M1UnitNALU,
+ M1UnitFADD]> { let Latency = 9; }
+def M1WriteNEONB : SchedWriteRes<[M1UnitNALU,
+ M1UnitFST]> { let Latency = 5; }
+def M1WriteNEONC : SchedWriteRes<[M1UnitNALU,
+ M1UnitFST]> { let Latency = 6; }
+def M1WriteNEOND : SchedWriteRes<[M1UnitNALU,
+ M1UnitFST,
+ M1UnitL]> { let Latency = 10; }
+def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT,
+ M1UnitFST]> { let Latency = 8; }
+def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT,
+ M1UnitFST,
+ M1UnitL]> { let Latency = 13; }
+def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC,
+ M1UnitFST]> { let Latency = 6; }
+def M1WriteNEONH : SchedWriteRes<[M1UnitNALU,
+ M1UnitFST]> { let Latency = 3; }
+def M1WriteNEONI : SchedWriteRes<[M1UnitFST,
+ M1UnitL]> { let Latency = 9; }
+def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC,
+ M1UnitFMAC]> { let Latency = 6; }
+def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC,
+ M1UnitFMAC]> { let Latency = 7; }
+def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; } // Single-unit defs from here on: M1Write<unit><latency>.
+def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; }
+def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; }
+def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; }
+def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; }
+def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15;
+ let ResourceCycles = [15]; } // M1UnitFVAR is occupied for the same 15 cycles as the latency.
+def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23;
+ let ResourceCycles = [23]; } // M1UnitFVAR is occupied for the same 23 cycles as the latency.
+def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]> { let Latency = 1; }
+def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; }
+def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 1; } // Read as NAL1 + latency 1: the unit is M1UnitNAL1, not the NALU group.
+def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 2; }
+def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 3; }
+def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
+def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; }
+def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 1; }
+def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; }
+def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; }
+def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; }
+def M1WriteTB : SchedWriteRes<[M1UnitC, // No InstRW in the visible part of this file references M1WriteTB.
+ M1UnitALU]> { let Latency = 2; }
+
+// Branch instructions
+def : InstRW<[M1WriteB1], (instrs Bcc)>;
+// NOTE: Conditional branch and link adds a B uop.
+def : InstRW<[M1WriteA1], (instrs BL)>; // NOTE(review): the note above says "Conditional", yet it precedes BL; confirm the intended wording.
+// NOTE: Indirect branch and link with LR adds an ALU uop.
+def : InstRW<[M1WriteA1,
+ M1WriteC1], (instrs BLR)>;
+def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>; // "N?" lets one pattern match both the CBZ and CBNZ names.
+def : InstRW<[M1WriteC1,
+ M1WriteA2], (instregex "^TBN?Z[WX]")>;
+
+// Arithmetic and logical integer instructions.
+def : InstRW<[M1WriteA1], (instrs COPY)>;
+
+// Divide and multiply instructions.
+
+// Miscellaneous instructions.
+
+// Load instructions.
+
+// Store instructions.
+
+// FP data instructions.
+def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>; // Each line below overrides the write for the scalar FP opcodes its pattern matches.
+def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>;
+def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>;
+def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>; // The S-suffixed divide/sqrt opcodes get the 15-cycle write, the D-suffixed ones the 23-cycle write.
+def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>;
+def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>;
+def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>;
+def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>;
+def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>;
+def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>;
+def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>;
+
+// FP miscellaneous instructions.
+def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>;
+def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
+def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>;
+def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>;
+def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>; // The two cross-register-file FMOV patterns get different writes (M1WriteS4 vs M1WriteNEONI).
+def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>;
+
+// FP load instructions.
+
+// FP store instructions.
+
+// ASIMD instructions.
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>; // Patterns in this section end in "v", the marker the vector opcode names carry.
+def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>;
+def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>; // "[SU]+" accepts one or more S/U letters before Q.
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
+def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>;
+def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>; // The plain, P and V min/max forms get latencies 1, 2 and 3 respectively.
+def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>;
+def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>;
+def : InstRW<[M1WriteNALU1], (instregex "^[SU]?SH(L|LL|R)2?v")>;
+def : InstRW<[M1WriteNALU1], (instregex "^S[LR]Iv")>;
+def : InstRW<[M1WriteNAL13], (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>;
+def : InstRW<[M1WriteNAL13], (instregex "^[SU](Q|QR|R)SHLU?v")>;
+
+// ASIMD FP instructions.
+def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>;
+def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>; // "[^1]" rejects names whose first character after "v" is 1.
+def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>;
+def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>; // NOTE(review): these four FDIV/FSQRT patterns lack the leading "^" their neighbours have; confirm whether that is intentional.
+def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>;
+def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>;
+def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
+def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
+def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>; // The "v.i" names get the two-unit NEONJ/NEONK writes; the "v.f" names get the plain FMAC writes.
+def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>;
+def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>;
+def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>;
+def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>;
+
+// ASIMD miscellaneous instructions.
+def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>;
+def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M1WriteNALU1], (instregex "^CPY")>;
+def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>; // The "gpr" forms of DUP/INS get multi-unit NEON writes; the "lane" forms get M1WriteNALU1.
+def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>;
+def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>;
+def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>;
+def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>;
+def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)Sv")>;
+def : InstRW<[M1WriteNALU1], (instregex "^REV(16|32|64)v")>;
+def : InstRW<[M1WriteNAL11], (instregex "^TB[LX]v8i8One")>; // v8i8 table lookups: M1WriteNAL11 repeated once per table (One..Four).
+def : InstRW<[WriteSequence<[M1WriteNAL11], 2>],
+ (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 3>],
+ (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 4>],
+ (instregex "^TB[LX]v8i8Four")>;
+def : InstRW<[M1WriteNAL12], (instregex "^TB[LX]v16i8One")>; // v16i8 table lookups: the same scheme built on M1WriteNAL12.
+def : InstRW<[WriteSequence<[M1WriteNAL12], 2>],
+ (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[WriteSequence<[M1WriteNAL12], 3>],
+ (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[WriteSequence<[M1WriteNAL12], 4>],
+ (instregex "^TB[LX]v16i8Four")>;
+def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>;
+def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>;
+def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>; // TRN/UZP: latency 1 for the v8i8/v4i16/v2i32 forms, 2 for the wider forms on the next line.
+def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>;
+
+// ASIMD load instructions.
+
+// ASIMD store instructions.
+
+// Cryptography instructions.
+def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
+def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; // Read advance of 1, listed only for values written by M1WriteAES.
+def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>; // Every opcode whose name starts with AES gets this write/read pair.
+
+def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
+def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
+def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>;
+def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>;
+def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>;
+
+// CRC instructions.
+def : InstRW<[M1WriteC2], (instregex "^CRC32")>; // CRC32 opcodes use the M1UnitC write with latency 2.
+
+} // SchedModel = ExynosM1Model
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td
new file mode 100644
index 000000000000..35a40c314bf4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td
@@ -0,0 +1,852 @@
+//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// 1. Introduction
+//
+// This file defines the machine model for Broadcom Vulcan to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 2. Pipeline Description.
+
+def VulcanModel : SchedMachineModel {
+ let IssueWidth = 4; // 4 micro-ops dispatched at a time.
+ let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
+ // Determined via a mix of micro-arch details and experimentation.
+ let LoopMicroOpBufferSize = 32;
+ let PostRAScheduler = 1; // Using PostRA sched.
+ let CompleteModel = 1;
+}
+
+// Define the issue ports.
+
+// Port 0: ALU, FP/SIMD.
+def VulcanP0 : ProcResource<1>;
+
+// Port 1: ALU, FP/SIMD, integer mul/div.
+def VulcanP1 : ProcResource<1>;
+
+// Port 2: ALU, Branch.
+def VulcanP2 : ProcResource<1>;
+
+// Port 3: Store data.
+def VulcanP3 : ProcResource<1>;
+
+// Port 4: Load/store.
+def VulcanP4 : ProcResource<1>;
+
+// Port 5: Load/store.
+def VulcanP5 : ProcResource<1>;
+
+let SchedModel = VulcanModel in {
+
+// Define groups for the functional units on each issue port. Each group
+// created will be used by a WriteRes later on.
+//
+// NOTE: Some groups only contain one member. This is a way to create names for
+// the various functional units that share a single issue port. For example,
+// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for FP ops on port 1.
+
+// Integer divide and multiply micro-ops only on port 1.
+def VulcanI1 : ProcResGroup<[VulcanP1]>;
+
+// Branch micro-ops only on port 2.
+def VulcanI2 : ProcResGroup<[VulcanP2]>;
+
+// ALU micro-ops on ports 0, 1, and 2.
+def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
+
+// Crypto FP/SIMD micro-ops only on port 1.
+def VulcanF1 : ProcResGroup<[VulcanP1]>;
+
+// FP/SIMD micro-ops on ports 0 and 1.
+def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
+
+// Store data micro-ops only on port 3.
+def VulcanSD : ProcResGroup<[VulcanP3]>;
+
+// Load/store micro-ops on ports 4 and 5.
+def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
+
+// 60 entry unified scheduler.
+def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
+ VulcanP3, VulcanP4, VulcanP5]> {
+ let BufferSize=60;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
+
+// 4 cycles on I1.
+def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
+
+// 1 cycle on I0, I1, or I2.
+def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
+
+// 5 cycles on F1.
+def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
+
+// 7 cycles on F1.
+def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
+
+// 4 cycles on F0 or F1.
+def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
+
+// 5 cycles on F0 or F1.
+def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
+
+// 6 cycles on F0 or F1.
+def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
+
+// 7 cycles on F0 or F1.
+def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
+
+// 8 cycles on F0 or F1.
+def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
+
+// 16 cycles on F0 or F1.
+def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+}
+
+// 23 cycles on F0 or F1.
+def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+ let Latency = 23;
+ let ResourceCycles = [11];
+}
+
+// 1 cycle on LS0 or LS1.
+def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
+
+// 4 cycles on LS0 or LS1.
+def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
+
+// 5 cycles on LS0 or LS1.
+def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
+
+// 6 cycles on LS0 or LS1.
+def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
+
+// 5 cycles on LS0 or LS1 and I0, I1, or I2.
+def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def VulcanWrite_6Cyc_LS01_I012_I012 :
+ SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on LS0 or LS1 and F0 or F1.
+def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+// 7 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+// 8 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+let SchedModel = VulcanModel in {
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; }
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; }
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg, [VulcanI012]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+def : WriteRes<WriteIEReg, [VulcanI012]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// Move immed
+def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; }
+
+// Variable shift
+def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; }
+
+//---
+// 3.4 Divide and Multiply Instructions
+//---
+
+// Divide, W-form
+// Latency range of 13-23. Take the average.
+def : WriteRes<WriteID32, [VulcanI1]> {
+ let Latency = 18;
+ let ResourceCycles = [18];
+}
+
+// Divide, X-form
+// Latency range of 13-39. Take the average.
+def : WriteRes<WriteID64, [VulcanI1]> {
+ let Latency = 26;
+ let ResourceCycles = [26];
+}
+
+// Multiply accumulate, W-form
+def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; }
+
+// Multiply accumulate, X-form
+def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; }
+
+// Bitfield extract, two reg
+def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; }
+
+// Bitfield move, basic
+// Bitfield move, insert
+// NOTE: Handled by WriteIS.
+
+// Count leading
+def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
+ "^CLZ(W|X)r$")>;
+
+// Reverse bits/bytes
+// NOTE: Handled by WriteI.
+
+//---
+// 3.6 Load Instructions
+// 3.10 FP Load Instructions
+//---
+
+// Load register, literal
+// Load register, unscaled immed
+// Load register, immed unprivileged
+// Load register, unsigned immed
+def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; }
+
+// Load register, immed post-index
+// NOTE: Handled by WriteLD, WriteI.
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; }
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def VulcanWriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
+ SchedVar<NoSchedPred, [VulcanWrite_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
+
+def VulcanReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset, signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 5;
+}
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; }
+
+// FP arithmetic
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+
+// FP compare
+def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; }
+
+// FP divide, S-form
+// FP square root, S-form
+def : WriteRes<WriteFDiv, [VulcanF01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+}
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; }
+
+// FP round to integral
+def : InstRW<[VulcanWrite_7Cyc_F01],
+ (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; }
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; }
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; }
+def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; }
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+def : InstRW<[VulcanWrite_5Cyc_F01],
+ (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD logical (MOV, MVN, ORN, ORR)
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
+
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
+
+//---
+// 3.13 ASIMD Floating-point Instructions
+//---
+
+// ASIMD FP absolute value
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>;
+
+// ASIMD FP arith, normal, D-form
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// ASIMD FP arith, pairwise, D-form
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>;
+
+// ASIMD FP compare, D-form
+// ASIMD FP compare, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv")>;
+
+// ASIMD FP convert, long
+// ASIMD FP convert, narrow
+// ASIMD FP convert, other, D-form
+// ASIMD FP convert, other, Q-form
+// NOTE: Handled by WriteV.
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, normal, D-form
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
+ "^FMINv", "^FMINNMv")>;
+
+// ASIMD FP max/min, pairwise, D-form
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
+ "^FMINPv", "^FMINNMPv")>;
+
+// ASIMD FP max/min, reduce
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
+ "^FMINVv", "^FMINNMVv")>;
+
+// ASIMD FP multiply, D-form, FZ
+// ASIMD FP multiply, D-form, no FZ
+// ASIMD FP multiply, Q-form, FZ
+// ASIMD FP multiply, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, D-form, no FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+// ASIMD FP multiply accumulate, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP negate
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
+
+// ASIMD FP round, D-form
+// ASIMD FP round, Q-form
+// NOTE: Handled by WriteV.
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>;
+
+// ASIMD extract
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+// ASIMD extract narrow, saturating
+// NOTE: Handled by WriteV.
+
+// ASIMD insert, element to element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01],
+ (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+ "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[VulcanWrite_5Cyc_F01],
+ (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
+
+// ASIMD transfer, element to word or word
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
+
+// ASIMD transfer gen reg to element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
+ "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions
+//--
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[VulcanWrite_4Cyc_LS01],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[VulcanWrite_4Cyc_LS01],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[VulcanWrite_6Cyc_LS01],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//--
+// 3.16 ASIMD Store Instructions
+//--
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H
+// ASIMD store, 4 element, one lane, S
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4i(8|16|32|64)_POST$")>;
+
+//--
+// 3.17 Cryptography Extensions
+//--
+
+// Crypto AES ops
+def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 xor ops
+// Crypto SHA1 schedule acceleration ops
+// Crypto SHA256 schedule acceleration op (1 u-op)
+// Crypto SHA256 schedule acceleration op (2 u-ops)
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>;
+
+//--
+// 3.18 CRC
+//--
+
+// CRC checksum ops
+def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>;
+
+} // SchedModel = VulcanModel
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td
new file mode 100644
index 000000000000..ce81f48acf71
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td
@@ -0,0 +1,106 @@
+//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Define TII for use in SchedVariant Predicates.
+// const MachineInstr *MI and const TargetSchedModel *SchedModel
+// are defined by default.
+def : PredicateProlog<[{
+ const AArch64InstrInfo *TII =
+ static_cast<const AArch64InstrInfo*>(SchedModel->getInstrInfo());
+ (void)TII;
+}]>;
+
+// AArch64 Scheduler Definitions
+
+def WriteImm : SchedWrite; // MOVN, MOVZ
+// TODO: Provide variants for MOV32/64imm Pseudos that dynamically
+// select the correct sequence of WriteImms.
+
+def WriteI : SchedWrite; // ALU
+def WriteISReg : SchedWrite; // ALU of Shifted-Reg
+def WriteIEReg : SchedWrite; // ALU of Extended-Reg
+def ReadI : SchedRead; // ALU
+def ReadISReg : SchedRead; // ALU of Shifted-Reg
+def ReadIEReg : SchedRead; // ALU of Extended-Reg
+def WriteExtr : SchedWrite; // EXTR shifts a reg pair
+def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair
+def WriteIS : SchedWrite; // Shift/Scale
+def WriteID32 : SchedWrite; // 32-bit Divide
+def WriteID64 : SchedWrite; // 64-bit Divide
+def ReadID : SchedRead; // 32/64-bit Divide
+def WriteIM32 : SchedWrite; // 32-bit Multiply
+def WriteIM64 : SchedWrite; // 64-bit Multiply
+def ReadIM : SchedRead; // 32/64-bit Multiply
+def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate
+def WriteBr : SchedWrite; // Branch
+def WriteBrReg : SchedWrite; // Indirect Branch
+
+def WriteLD : SchedWrite; // Load from base addr plus immediate offset
+def WriteST : SchedWrite; // Store to base addr plus immediate offset
+def WriteSTP : SchedWrite; // Store a register pair.
+def WriteAdr : SchedWrite; // Address pre/post increment.
+
+def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
+def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
+def ReadAdrBase : SchedRead; // Read the base register of a reg-offset LD/ST.
+
+// Predicate for determining when a shiftable register is shifted.
+def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>;
+
+// Predicate for determining when an extendable register is extended.
+def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>;
+
+// ScaledIdxPred is true if a WriteLDIdx operand will be
+// scaled. Subtargets can use this to dynamically select resources and
+// latency for WriteLDIdx and ReadAdrBase.
+def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>;
+
+// Serialized two-level address load.
+// EXAMPLE: LOADGot
+def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
+
+// Serialized two-level address lookup.
+// EXAMPLE: MOVaddr...
+def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>;
+
+// The second register of a load-pair.
+// LDP,LDPSW,LDNP,LDXP,LDAXP
+def WriteLDHi : SchedWrite;
+
+// Store-exclusive is a store followed by a dependent load.
+def WriteSTX : WriteSequence<[WriteST, WriteLD]>;
+
+def WriteSys : SchedWrite; // Long, variable latency system ops.
+def WriteBarrier : SchedWrite; // Memory barrier.
+def WriteHint : SchedWrite; // Hint instruction.
+
+def WriteF : SchedWrite; // General floating-point ops.
+def WriteFCmp : SchedWrite; // Floating-point compare.
+def WriteFCvt : SchedWrite; // Float conversion.
+def WriteFCopy : SchedWrite; // Float-int register copy.
+def WriteFImm : SchedWrite; // Floating-point immediate.
+def WriteFMul : SchedWrite; // Floating-point multiply.
+def WriteFDiv : SchedWrite; // Floating-point division.
+
+def WriteV : SchedWrite; // Vector ops.
+def WriteVLD : SchedWrite; // Vector loads.
+def WriteVST : SchedWrite; // Vector stores.
+
+def WriteAtomic : SchedWrite; // Atomic memory operations (CAS, Swap, LDOP)
+
+// Read the unwritten lanes of the VLD's destination registers.
+def ReadVLD : SchedRead;
+
+// Sequential vector load and shuffle.
+def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>;
+def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
+
+// Store a shuffled vector.
+def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>;
+def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
new file mode 100644
index 000000000000..66a8f332513a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -0,0 +1,59 @@
+//===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-selectiondag-info"
+
+/// Lower a memset of zero to a call to the subtarget's bzero-style entry
+/// point when one exists and the length is either non-constant or a constant
+/// greater than 256. In every other case an empty SDValue is returned.
+/// \p Align, \p isVolatile and \p DstPtrInfo are not consulted.
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  ConstantSDNode *FillConst = dyn_cast<ConstantSDNode>(Src);
+  ConstantSDNode *LenConst = dyn_cast<ConstantSDNode>(Size);
+  const AArch64Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+
+  // The specialized entry point is only considered when the fill value is the
+  // constant zero.
+  const char *ZeroFn = nullptr;
+  if (FillConst && FillConst->isNullValue())
+    ZeroFn = Subtarget.getBZeroEntry();
+  if (!ZeroFn)
+    return SDValue();
+
+  // For a small constant size (<= 256) it is not beneficial to use bzero
+  // instead of memset.
+  if (LenConst && LenConst->getZExtValue() <= 256)
+    return SDValue();
+
+  const AArch64TargetLowering &Lowering = *Subtarget.getTargetLowering();
+  const EVT PtrVT = Lowering.getPointerTy(DAG.getDataLayout());
+
+  // Both arguments (destination, length) are passed with the integer pointer
+  // type, so one entry is reused and only its node is swapped.
+  TargetLowering::ArgListTy CallArgs;
+  TargetLowering::ArgListEntry Arg;
+  Arg.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+  Arg.Node = Dst;
+  CallArgs.push_back(Arg);
+  Arg.Node = Size;
+  CallArgs.push_back(Arg);
+
+  TargetLowering::CallLoweringInfo CallInfo(DAG);
+  CallInfo.setDebugLoc(dl);
+  CallInfo.setChain(Chain);
+  CallInfo.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                     DAG.getExternalSymbol(ZeroFn, PtrVT),
+                     std::move(CallArgs));
+  CallInfo.setDiscardResult();
+
+  // The call's result is discarded; only the second element of the pair that
+  // LowerCallTo returns is handed back.
+  return Lowering.LowerCallTo(CallInfo).second;
+}
+/// Returns true exactly when \p OptLevel is CodeGenOpt::Aggressive or higher.
+bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
+    CodeGenOpt::Level OptLevel) const {
+  return OptLevel >= CodeGenOpt::Aggressive;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
new file mode 100644
index 000000000000..7e4f11091226
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64 subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+/// AArch64 implementation of the SelectionDAGTargetInfo hooks.
+class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+ /// Target hook for lowering a memset. Returns the chain of a call to the
+ /// subtarget's bzero-style entry point when the implementation chooses to
+ /// emit one, and an empty SDValue otherwise.
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
+ /// Target hook queried with the code generation optimization level; the
+ /// answer depends only on \p OptLevel.
+ bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
new file mode 100644
index 000000000000..fe984ccbaf1d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -0,0 +1,171 @@
+//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies floating point stores that should not be combined into
+// store pairs. Later we may do the same for floating point loads.
+// ===---------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-stp-suppress"
+
+#define STPSUPPRESS_PASS_NAME "AArch64 Store Pair Suppression"
+
+namespace {
+/// Machine function pass that marks narrow floating-point stores so that they
+/// are not later combined into store pairs, in blocks where adding an STP
+/// would increase the resource length reported by the trace metrics.
+class AArch64StorePairSuppress : public MachineFunctionPass {
+  // Per-function state. All of it is (re)assigned at the top of
+  // runOnMachineFunction; the null defaults keep the object in a defined state
+  // between construction and the first run.
+  const AArch64InstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+  const MachineRegisterInfo *MRI = nullptr;
+  TargetSchedModel SchedModel;
+  MachineTraceMetrics *Traces = nullptr;
+  // Trace ensemble, fetched lazily by shouldAddSTPToBlock and reset to null
+  // for every function.
+  MachineTraceMetrics::Ensemble *MinInstr = nullptr;
+
+public:
+  static char ID;
+  AArch64StorePairSuppress() : MachineFunctionPass(ID) {
+    initializeAArch64StorePairSuppressPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return STPSUPPRESS_PASS_NAME; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+private:
+  /// Returns true if an STP can be added to \p BB without increasing the
+  /// block's resource length.
+  bool shouldAddSTPToBlock(const MachineBasicBlock *BB);
+
+  /// Returns true for the scalar floating-point store opcodes this pass
+  /// considers for suppression.
+  bool isNarrowFPStore(const MachineInstr &MI);
+
+  /// Requires MachineTraceMetrics and declares that both it and the CFG are
+  /// preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineTraceMetrics>();
+    AU.addPreserved<MachineTraceMetrics>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64StorePairSuppress::ID = 0;
+} // anonymous
+
+INITIALIZE_PASS(AArch64StorePairSuppress, "aarch64-stp-suppress",
+ STPSUPPRESS_PASS_NAME, false, false)
+
+/// Factory for the store pair suppression pass: returns a newly allocated
+/// AArch64StorePairSuppress as a raw FunctionPass pointer.
+FunctionPass *llvm::createAArch64StorePairSuppressPass() {
+ return new AArch64StorePairSuppress();
+}
+
+/// Return true if an STP can be added to this block without increasing the
+/// critical resource height. STP is good to form in Ld/St limited blocks and
+/// bad to form in floating-point limited blocks. This is true independent of
+/// the critical path. If the critical path is longer than the resource height,
+/// the extra vector ops can limit physreg renaming. Otherwise, it could simply
+/// oversaturate the vector units.
+bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
+  // The ensemble is fetched on first use and cached in MinInstr.
+  if (MinInstr == nullptr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  MachineTraceMetrics::Trace Trace = MinInstr->getTrace(BB);
+  const unsigned BaseLength = Trace.getResourceLength();
+
+  // Get the machine model's scheduling class for STPDi. TargetSchedule's
+  // SchedClass resolution is bypassed because only an opcode is available.
+  const unsigned STPClassIdx = TII->get(AArch64::STPDi).getSchedClass();
+  const MCSchedClassDesc *STPClass =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(STPClassIdx);
+
+  // If the subtarget has no valid, non-variant class for STPDi there is
+  // nothing to compare against, so the STP is allowed.
+  if (!STPClass->isValid() || STPClass->isVariant())
+    return true;
+
+  const unsigned LengthWithSTP = Trace.getResourceLength(None, STPClass);
+  if (LengthWithSTP <= BaseLength)
+    return true;
+
+  DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
+               << " resources " << BaseLength << " -> " << LengthWithSTP
+               << "\n");
+  return false;
+}
+
+/// Return true if this is a floating-point store smaller than the V reg. On
+/// cyclone, these require a vector shuffle before storing a pair.
+/// Ideally we would call getMatchingPairOpcode() and have the machine model
+/// tell us if it's profitable with no cpu knowledge here.
+///
+/// FIXME: We plan to develop a decent Target abstraction for simple loads and
+/// stores. Until then the opcodes are listed by hand, similar to
+/// AArch64LoadStoreOptimizer.
+bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  return Opc == AArch64::STRSui || Opc == AArch64::STRDui ||
+         Opc == AArch64::STURSi || Opc == AArch64::STURDi;
+}
+
+/// Pass entry point. Scans every block for narrow FP stores that share a base
+/// register with the previous such store and, when shouldAddSTPToBlock()
+/// rejects the block, calls suppressLdStPair() on them. Always returns false.
+bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ // Refresh all per-function state; MinInstr is cleared so that the trace
+ // ensemble is fetched again on first use.
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+ TRI = ST.getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ SchedModel.init(ST.getSchedModel(), &ST, TII);
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = nullptr;
+
+ DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n');
+
+ if (!SchedModel.hasInstrSchedModel()) {
+ DEBUG(dbgs() << " Skipping pass: no machine model present.\n");
+ return false;
+ }
+
+ // Check for a sequence of stores to the same base address. We don't need to
+ // precisely determine whether a store pair can be formed. But we do want to
+ // filter out most situations where we can't form store pairs to avoid
+ // computing trace metrics in those cases.
+ for (auto &MBB : MF) {
+ // Once set for a block, every later matching store in it is unpaired
+ // without asking shouldAddSTPToBlock() again.
+ bool SuppressSTP = false;
+ // Base register of the previous narrow FP store; 0 when its base/offset
+ // could not be determined.
+ unsigned PrevBaseReg = 0;
+ for (auto &MI : MBB) {
+ // Other instructions are skipped without resetting PrevBaseReg, so the
+ // two stores being compared need not be adjacent.
+ if (!isNarrowFPStore(MI))
+ continue;
+ unsigned BaseReg;
+ int64_t Offset;
+ if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) {
+ if (PrevBaseReg == BaseReg) {
+ // If this block can take STPs, skip ahead to the next block.
+ if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
+ break;
+ // Otherwise, continue unpairing the stores in this block.
+ DEBUG(dbgs() << "Unpairing store " << MI << "\n");
+ SuppressSTP = true;
+ TII->suppressLdStPair(MI);
+ }
+ PrevBaseReg = BaseReg;
+ } else
+ PrevBaseReg = 0;
+ }
+ }
+ // This pass just sets some internal MachineMemOperand flags. It can't really
+ // invalidate anything.
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
new file mode 100644
index 000000000000..f58bbbd26132
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -0,0 +1,188 @@
+//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64Subtarget.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64PBQPRegAlloc.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-subtarget"
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "AArch64GenSubtargetInfo.inc"
+
+static cl::opt<bool>
+EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
+ "converter pass"), cl::init(true), cl::Hidden);
+
+// If OS supports TBI, use this flag to enable it.
+static cl::opt<bool>
+UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
+ "an address is ignored"), cl::init(false), cl::Hidden);
+
+/// Parses the feature string for the selected CPU, then initializes the
+/// CPU-specific properties. Returns *this so the call can be used inside a
+/// constructor's member initializer list.
+AArch64Subtarget &
+AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
+                                                  StringRef CPUString) {
+  // Determine default and user-specified characteristics: an empty CPU name
+  // falls back to "generic".
+  const StringRef CPUName =
+      CPUString.empty() ? StringRef("generic") : CPUString;
+
+  ParseSubtargetFeatures(CPUName, FS);
+  initializeProperties();
+  return *this;
+}
+
+/// Overrides the default tuning members for the selected processor family.
+/// Families with identical settings share one case group; every enumerator of
+/// ARMProcFamilyEnum is listed explicitly.
+void AArch64Subtarget::initializeProperties() {
+  // Initialize CPU specific properties. We should add a tablegen feature for
+  // this in the future so we can specify it together with the subtarget
+  // features.
+  switch (ARMProcFamily) {
+  // Families that keep every default.
+  case Others:
+  case CortexA35:
+  case CortexA53:
+  case CortexA72:
+  case CortexA73:
+    break;
+
+  // Families that only raise the interleave factor.
+  case CortexA57:
+  case Falkor:
+  case Vulcan:
+    MaxInterleaveFactor = 4;
+    break;
+
+  case Cyclone:
+    CacheLineSize = 64;
+    PrefetchDistance = 280;
+    MinPrefetchStride = 2048;
+    MaxPrefetchIterationsAhead = 3;
+    break;
+
+  case ExynosM1:
+    MaxInterleaveFactor = 4;
+    MaxJumpTableSize = 8;
+    PrefFunctionAlignment = 4;
+    PrefLoopAlignment = 3;
+    break;
+
+  case Kryo:
+    MaxInterleaveFactor = 4;
+    VectorInsertExtractBaseCost = 2;
+    CacheLineSize = 128;
+    PrefetchDistance = 740;
+    MinPrefetchStride = 1024;
+    MaxPrefetchIterationsAhead = 11;
+    break;
+  }
+}
+
+/// Builds the subtarget for triple \p TT, CPU name \p CPU and feature string
+/// \p FS. X18 is reserved when the triple's OS is Darwin.
+///
+/// InstrInfo's initializer is the result of initializeSubtargetDependencies(),
+/// so the feature string is parsed and the CPU properties are initialized
+/// while InstrInfo is being constructed, i.e. before TSInfo and TLInfo (which
+/// receives *this) are constructed. GISel starts out empty.
+AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS,
+ const TargetMachine &TM, bool LittleEndian)
+ : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()),
+ IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(),
+ InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
+ TLInfo(TM, *this), GISel() {}
+
+/// Forwards to the installed GISelAccessor's getCallLowering(). Asserts that
+/// an accessor has been set.
+const CallLowering *AArch64Subtarget::getCallLowering() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getCallLowering();
+}
+
+/// Forwards to the installed GISelAccessor's getInstructionSelector(). Asserts
+/// that an accessor has been set.
+const InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getInstructionSelector();
+}
+
+/// Forwards to the installed GISelAccessor's getLegalizerInfo(). Asserts that
+/// an accessor has been set.
+const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getLegalizerInfo();
+}
+
+/// Forwards to the installed GISelAccessor's getRegBankInfo(). Asserts that an
+/// accessor has been set.
+const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getRegBankInfo();
+}
+
+/// Find the target operand flags that describe how a global value should be
+/// referenced for the current subtarget: AArch64II::MO_GOT when the reference
+/// has to go through the GOT, AArch64II::MO_NO_FLAG otherwise.
+unsigned char
+AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
+                                          const TargetMachine &TM) const {
+  // MachO large model always goes via a GOT, simply to get a single 8-byte
+  // absolute relocation on all global addresses.
+  const bool LargeMachO =
+      TM.getCodeModel() == CodeModel::Large && isTargetMachO();
+
+  // Globals that cannot be assumed DSO-local are referenced via the GOT too.
+  if (LargeMachO || !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+    return AArch64II::MO_GOT;
+
+  // The small code model's direct accesses use ADRP, which cannot necessarily
+  // produce the value 0 (if the code is above 4GB).
+  const bool SmallModelWeakRef =
+      TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage();
+  if (SmallModelWeakRef)
+    return AArch64II::MO_GOT;
+
+  return AArch64II::MO_NO_FLAG;
+}
+
+/// This function returns the name of a function which has an interface
+/// like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered preferable over
+/// memset with zero passed as the second argument. Otherwise it
+/// returns null.
+const char *AArch64Subtarget::getBZeroEntry() const {
+  // Prefer bzero on Darwin only.
+  return isTargetDarwin() ? "bzero" : nullptr;
+}
+
+/// Adjusts the machine scheduler policy: both single-direction restrictions
+/// are cleared and the latency heuristic follows the subtarget's
+/// DisableLatencySchedHeuristic setting. \p NumRegionInstrs is not used.
+void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const {
+ // LNT run (at least on Cyclone) showed reasonably significant gains for
+ // bi-directional scheduling. 253.perlbmk.
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+ // Enabling or disabling the latency heuristic is a close call: it seems to
+ // help nearly no benchmark on out-of-order architectures; on the other hand
+ // it regresses register pressure on a few benchmarks.
+ Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
+}
+
+/// Returns the value of the hidden -aarch64-early-ifcvt command line option
+/// (true by default).
+bool AArch64Subtarget::enableEarlyIfConversion() const {
+ return EnableEarlyIfConvert;
+}
+
+/// Returns true only when -aarch64-use-tbi is set, the target triple is iOS,
+/// and the iOS major version is 8 or later.
+bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
+  // The option must be requested explicitly, and only iOS is considered.
+  if (!UseAddressTopByteIgnored || !TargetTriple.isiOS())
+    return false;
+
+  unsigned Major, Minor, Micro;
+  TargetTriple.getiOSVersion(Major, Minor, Micro);
+  return Major >= 8;
+}
+
+/// Returns an A57ChainingConstraint when balanceFPOps() is set, and null
+/// otherwise.
+std::unique_ptr<PBQPRAConstraint>
+AArch64Subtarget::getCustomPBQPConstraints() const {
+  if (!balanceFPOps())
+    return nullptr;
+  return llvm::make_unique<A57ChainingConstraint>();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
new file mode 100644
index 000000000000..73f63b8b9f67
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -0,0 +1,265 @@
+//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AArch64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
+
+#include "AArch64FrameLowering.h"
+#include "AArch64ISelLowering.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64SelectionDAGInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AArch64GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class Triple;
+
+class AArch64Subtarget final : public AArch64GenSubtargetInfo {
+public:
+ enum ARMProcFamilyEnum : uint8_t {
+ Others,
+ CortexA35,
+ CortexA53,
+ CortexA57,
+ CortexA72,
+ CortexA73,
+ Cyclone,
+ ExynosM1,
+ Falkor,
+ Kryo,
+ Vulcan
+ };
+
+protected:
+ /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
+ ARMProcFamilyEnum ARMProcFamily = Others;
+
+ bool HasV8_1aOps = false;
+ bool HasV8_2aOps = false;
+
+ bool HasFPARMv8 = false;
+ bool HasNEON = false;
+ bool HasCrypto = false;
+ bool HasCRC = false;
+ bool HasLSE = false;
+ bool HasRAS = false;
+ bool HasPerfMon = false;
+ bool HasFullFP16 = false;
+ bool HasSPE = false;
+
+ // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
+ bool HasZeroCycleRegMove = false;
+
+ // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
+ bool HasZeroCycleZeroing = false;
+
+ // StrictAlign - Disallow unaligned memory accesses.
+ bool StrictAlign = false;
+ bool UseAA = false;
+ bool PredictableSelectIsExpensive = false;
+ bool BalanceFPOps = false;
+ bool CustomAsCheapAsMove = false;
+ bool UsePostRAScheduler = false;
+ bool Misaligned128StoreIsSlow = false;
+ bool AvoidQuadLdStPairs = false;
+ bool UseAlternateSExtLoadCVTF32Pattern = false;
+ bool HasArithmeticBccFusion = false;
+ bool HasArithmeticCbzFusion = false;
+ bool DisableLatencySchedHeuristic = false;
+ bool UseRSqrt = false;
+ uint8_t MaxInterleaveFactor = 2;
+ uint8_t VectorInsertExtractBaseCost = 3;
+ uint16_t CacheLineSize = 0;
+ uint16_t PrefetchDistance = 0;
+ uint16_t MinPrefetchStride = 1;
+ unsigned MaxPrefetchIterationsAhead = UINT_MAX;
+ unsigned PrefFunctionAlignment = 0;
+ unsigned PrefLoopAlignment = 0;
+ unsigned MaxJumpTableSize = 0;
+
+ // ReserveX18 - X18 is not available as a general purpose register.
+ bool ReserveX18;
+
+ bool IsLittle;
+
+ /// TargetTriple - What processor and OS we're targeting.
+ Triple TargetTriple;
+
+ AArch64FrameLowering FrameLowering;
+ AArch64InstrInfo InstrInfo;
+ AArch64SelectionDAGInfo TSInfo;
+ AArch64TargetLowering TLInfo;
+ /// Gather the accessor points to GlobalISel-related APIs.
+ /// This is used to avoid ifndefs spreading around while GISel is
+ /// an optional library.
+ std::unique_ptr<GISelAccessor> GISel;
+
+private:
+ /// initializeSubtargetDependencies - Initializes using CPUString and the
+ /// passed in feature string so that we can use initializer lists for
+ /// subtarget initialization.
+ AArch64Subtarget &initializeSubtargetDependencies(StringRef FS,
+ StringRef CPUString);
+
+ /// Initialize properties based on the selected processor family.
+ void initializeProperties();
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ AArch64Subtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM,
+ bool LittleEndian);
+
+ /// This object will take ownership of \p GISelAccessor.
+ /// The address of \p GISel is stored in a std::unique_ptr, replacing any
+ /// accessor set earlier.
+ /// NOTE(review): this presumably requires a heap-allocated accessor that
+ /// the caller does not free; confirm at the call sites.
+ void setGISelAccessor(GISelAccessor &GISel) {
+ this->GISel.reset(&GISel);
+ }
+
+ const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const AArch64FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const AArch64TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const AArch64RegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
+ }
+ const CallLowering *getCallLowering() const override;
+ const InstructionSelector *getInstructionSelector() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
+ const Triple &getTargetTriple() const { return TargetTriple; }
+ bool enableMachineScheduler() const override { return true; }
+ bool enablePostRAScheduler() const override {
+ return UsePostRAScheduler;
+ }
+
+ /// Returns ARM processor family.
+ /// Avoid this function! CPU specifics should be kept local to this class
+ /// and preferably modeled with SubtargetFeatures or properties in
+ /// initializeProperties().
+ ARMProcFamilyEnum getProcFamily() const {
+ return ARMProcFamily;
+ }
+
+ bool hasV8_1aOps() const { return HasV8_1aOps; }
+ bool hasV8_2aOps() const { return HasV8_2aOps; }
+
+ bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
+
+ bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+
+ bool requiresStrictAlign() const { return StrictAlign; }
+
+ bool isXRaySupported() const override { return true; }
+
+ bool isX18Reserved() const { return ReserveX18; }
+ bool hasFPARMv8() const { return HasFPARMv8; }
+ bool hasNEON() const { return HasNEON; }
+ bool hasCrypto() const { return HasCrypto; }
+ bool hasCRC() const { return HasCRC; }
+ bool hasLSE() const { return HasLSE; }
+ bool hasRAS() const { return HasRAS; }
+ bool balanceFPOps() const { return BalanceFPOps; }
+ bool predictableSelectIsExpensive() const {
+ return PredictableSelectIsExpensive;
+ }
+ bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+ bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+ bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+ bool useAlternateSExtLoadCVTF32Pattern() const {
+ return UseAlternateSExtLoadCVTF32Pattern;
+ }
+ bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
+ bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
+ bool useRSqrt() const { return UseRSqrt; }
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+ unsigned getVectorInsertExtractBaseCost() const {
+ return VectorInsertExtractBaseCost;
+ }
+ unsigned getCacheLineSize() const { return CacheLineSize; }
+ unsigned getPrefetchDistance() const { return PrefetchDistance; }
+ unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+ unsigned getMaxPrefetchIterationsAhead() const {
+ return MaxPrefetchIterationsAhead;
+ }
+ unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
+ unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; }
+
+ unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
+
+ /// CPU has TBI (top byte of addresses is ignored during HW address
+ /// translation) and OS enables it.
+ bool supportsAddressTopByteIgnored() const;
+
+ bool hasPerfMon() const { return HasPerfMon; }
+ bool hasFullFP16() const { return HasFullFP16; }
+ bool hasSPE() const { return HasSPE; }
+
+ bool isLittleEndian() const { return IsLittle; }
+
+ bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+ bool isTargetIOS() const { return TargetTriple.isiOS(); }
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+ bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+ bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+ bool useAA() const override { return UseAA; }
+
+ /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return 64; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ /// ClassifyGlobalReference - Find the target operand flags that describe
+ /// how a global value should be referenced for the current subtarget.
+ unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM) const;
+
+ /// This function returns the name of a function which has an interface
+ /// like the non-standard bzero function, if such a function exists on
+ /// the current subtarget and it is considered preferable over
+ /// memset with zero passed as the second argument. Otherwise it
+ /// returns null.
+ const char *getBZeroEntry() const;
+
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const override;
+
+ bool enableEarlyIfConversion() const override;
+
+ std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
new file mode 100644
index 000000000000..a3736c0868fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -0,0 +1,1018 @@
+//===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the symbolic operands permitted for various kinds of
+// AArch64 system instruction.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/TableGen/SearchableTable.td"
+
+//===----------------------------------------------------------------------===//
+// AT (address translate) instruction options.
+//===----------------------------------------------------------------------===//
+
+// One AT operand: a symbolic name plus a 16-bit packed encoding laid out as
+// op0:op1:CRn:CRm:op2 (bits 15-14, 13-11, 10-7, 6-3, 2-0).
+// Name and Encoding are both listed as searchable fields, and Encoding is
+// also named as the enum value field. NOTE(review): the exact lookups the
+// backend generates are defined by llvm/TableGen/SearchableTable.td -- confirm
+// there.
+class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+}
+
+def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>;
+def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>;
+def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>;
+def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>;
+def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>;
+def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>;
+def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>;
+def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>;
+
+
+//===----------------------------------------------------------------------===//
+// DMB/DSB (data barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+// One DMB/DSB barrier option: a symbolic name paired with its 4-bit encoding.
+// Name and Encoding are both searchable; Encoding is also the enum value
+// field.
+class DB<string name, bits<4> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding = encoding;
+}
+
+def : DB<"oshld", 0x1>;
+def : DB<"oshst", 0x2>;
+def : DB<"osh", 0x3>;
+def : DB<"nshld", 0x5>;
+def : DB<"nshst", 0x6>;
+def : DB<"nsh", 0x7>;
+def : DB<"ishld", 0x9>;
+def : DB<"ishst", 0xa>;
+def : DB<"ish", 0xb>;
+def : DB<"ld", 0xd>;
+def : DB<"st", 0xe>;
+def : DB<"sy", 0xf>;
+
+//===----------------------------------------------------------------------===//
+// DC (data cache maintenance) instruction options.
+//===----------------------------------------------------------------------===//
+
+// One DC operand: a symbolic name plus a 16-bit packed encoding laid out as
+// op0:op1:CRn:CRm:op2 (bits 15-14, 13-11, 10-7, 6-3, 2-0).
+// Name and Encoding are both searchable; Encoding is also the enum value
+// field.
+class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+}
+
+def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>;
+def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>;
+def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>;
+def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>;
+def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>;
+def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>;
+def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>;
+def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>;
+
+//===----------------------------------------------------------------------===//
+// IC (instruction cache maintenance) instruction options.
+//===----------------------------------------------------------------------===//
+
+// One IC operand: a symbolic name plus a 14-bit packed encoding laid out as
+// op1:CRn:CRm:op2 (bits 13-11, 10-7, 6-3, 2-0); there is no op0 field.
+// Name and Encoding are both searchable and Encoding is also the enum value
+// field, so every entry must have a distinct encoding.
+// NeedsReg records the needsreg argument; presumably it marks operations that
+// take an address register -- confirm against the users of this table.
+class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
+ bit needsreg> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<14> Encoding;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+ bit NeedsReg = needsreg;
+}
+
+// Op1 CRn CRm Op2 NeedsReg
+def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>;
+def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>;
+// IC IVAU is architecturally SYS #3, C7, C5, #1 (op1=3, CRn=7, CRm=5, op2=1).
+def : IC<"IVAU", 0b011, 0b0111, 0b0101, 0b001, 1>;
+
+//===----------------------------------------------------------------------===//
+// ISB (instruction-fetch barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+// One ISB barrier option: a symbolic name paired with its 4-bit encoding.
+// Name and Encoding are both searchable; Encoding is also the enum value
+// field.
+class ISB<string name, bits<4> encoding> : SearchableTable{
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding;
+ let Encoding = encoding;
+}
+
+def : ISB<"sy", 0xf>;
+
+//===----------------------------------------------------------------------===//
+// PRFM (prefetch) instruction options.
+//===----------------------------------------------------------------------===//
+
+// One PRFM prefetch operation: a symbolic name paired with its 5-bit
+// encoding. Name and Encoding are both searchable; Encoding is also the enum
+// value field.
+class PRFM<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+}
+
+def : PRFM<"pldl1keep", 0x00>;
+def : PRFM<"pldl1strm", 0x01>;
+def : PRFM<"pldl2keep", 0x02>;
+def : PRFM<"pldl2strm", 0x03>;
+def : PRFM<"pldl3keep", 0x04>;
+def : PRFM<"pldl3strm", 0x05>;
+def : PRFM<"plil1keep", 0x08>;
+def : PRFM<"plil1strm", 0x09>;
+def : PRFM<"plil2keep", 0x0a>;
+def : PRFM<"plil2strm", 0x0b>;
+def : PRFM<"plil3keep", 0x0c>;
+def : PRFM<"plil3strm", 0x0d>;
+def : PRFM<"pstl1keep", 0x10>;
+def : PRFM<"pstl1strm", 0x11>;
+def : PRFM<"pstl2keep", 0x12>;
+def : PRFM<"pstl2strm", 0x13>;
+def : PRFM<"pstl3keep", 0x14>;
+def : PRFM<"pstl3strm", 0x15>;
+
+//===----------------------------------------------------------------------===//
+// PState instruction options.
+//===----------------------------------------------------------------------===//
+
+// One PState field name paired with its 5-bit encoding. Name and Encoding are
+// both searchable; Encoding is also the enum value field.
+// Requires is a code fragment that defaults to an empty brace list; the
+// definitions below override it with a subtarget feature (for example
+// {AArch64::HasV8_1aOps}) for extension-specific entries.
+class PState<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+ code Requires = [{ {} }];
+}
+
+def : PState<"SPSel", 0b00101>;
+def : PState<"DAIFSet", 0b11110>;
+def : PState<"DAIFClr", 0b11111>;
+// v8.1a "Privileged Access Never" extension-specific PStates
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : PState<"PAN", 0b00100>;
+// v8.2a "User Access Override" extension-specific PStates
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : PState<"UAO", 0b00011>;
+
+
+//===----------------------------------------------------------------------===//
+// PSB instruction options.
+//===----------------------------------------------------------------------===//
+
+// One PSB option: a symbolic name paired with its 5-bit encoding. Name and
+// Encoding are both searchable; Encoding is also the enum value field.
+class PSB<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+}
+
+def : PSB<"csync", 0x11>;
+
+//===----------------------------------------------------------------------===//
+// TLBI (translation lookaside buffer invalidate) instruction options.
+//===----------------------------------------------------------------------===//
+
+// One TLBI operand: a symbolic name plus a 16-bit packed encoding laid out as
+// op0:op1:CRn:CRm:op2 (bits 15-14, 13-11, 10-7, 6-3, 2-0).
+// Name and Encoding are both searchable; Encoding is also the enum value
+// field.
+// NeedsReg records the needsreg argument, which defaults to 1; the
+// definitions below pass an explicit 0 for the "ALL"-style operations.
+// Presumably it marks operations that take a register operand -- confirm
+// against the users of this table.
+class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2, bit needsreg = 1> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+ bit NeedsReg = needsreg;
+}
+
+def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>;
+def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>;
+def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>;
+def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>;
+def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>;
+def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>;
+def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>;
+def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>;
+def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>;
+def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>;
+def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>;
+def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>;
+def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>;
+def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>;
+def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>;
+def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>;
+
+
+//===----------------------------------------------------------------------===//
+// MRS/MSR (system register read/write) instruction options.
+//===----------------------------------------------------------------------===//
+
+// Base record for an MRS/MSR system register: a symbolic name plus a 16-bit
+// packed encoding laid out as op0:op1:CRn:CRm:op2 (bits 15-14, 13-11, 10-7,
+// 6-3, 2-0). Name and Encoding are both searchable; Encoding is also the enum
+// value field.
+// Readable and Writeable are deliberately left unset (?) here; the
+// RWSysReg / ROSysReg / WOSysReg subclasses assign them.
+// Requires is a code fragment that defaults to an empty brace list and is
+// overridden per definition with a subtarget feature where needed.
+class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+ bit Readable = ?;
+ bit Writeable = ?;
+ code Requires = [{ {} }];
+}
+
+// A system register that is both readable and writeable: forwards all
+// arguments to SysReg and sets Readable = 1, Writeable = 1.
+class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2>
+ : SysReg<name, op0, op1, crn, crm, op2> {
+ let Readable = 1;
+ let Writeable = 1;
+}
+
+// A read-only system register: forwards all arguments to SysReg and sets
+// Readable = 1, Writeable = 0.
+class ROSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2>
+ : SysReg<name, op0, op1, crn, crm, op2> {
+ let Readable = 1;
+ let Writeable = 0;
+}
+
+// A write-only system register: forwards all arguments to SysReg and sets
+// Readable = 0, Writeable = 1.
+class WOSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2>
+ : SysReg<name, op0, op1, crn, crm, op2> {
+ let Readable = 0;
+ let Writeable = 1;
+}
+
+//===----------------------
+// Read-only regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"MDCCSR_EL0", 0b10, 0b011, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"DBGDTRRX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"MDRAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b000>;
+def : ROSysReg<"OSLSR_EL1", 0b10, 0b000, 0b0001, 0b0001, 0b100>;
+def : ROSysReg<"DBGAUTHSTATUS_EL1", 0b10, 0b000, 0b0111, 0b1110, 0b110>;
+def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>;
+def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>;
+def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"MPIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b101>;
+def : ROSysReg<"REVIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b110>;
+def : ROSysReg<"AIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"DCZID_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"ID_PFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"ID_PFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b001>;
+def : ROSysReg<"ID_DFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b010>;
+def : ROSysReg<"ID_AFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b011>;
+def : ROSysReg<"ID_MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b100>;
+def : ROSysReg<"ID_MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b101>;
+def : ROSysReg<"ID_MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b110>;
+def : ROSysReg<"ID_MMFR3_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b111>;
+def : ROSysReg<"ID_ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b000>;
+def : ROSysReg<"ID_ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b001>;
+def : ROSysReg<"ID_ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b010>;
+def : ROSysReg<"ID_ISAR3_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b011>;
+def : ROSysReg<"ID_ISAR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b100>;
+def : ROSysReg<"ID_ISAR5_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b101>;
+def : ROSysReg<"ID_AA64PFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b000>;
+def : ROSysReg<"ID_AA64PFR1_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b001>;
+def : ROSysReg<"ID_AA64DFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"ID_AA64DFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b001>;
+def : ROSysReg<"ID_AA64AFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b100>;
+def : ROSysReg<"ID_AA64AFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b101>;
+def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>;
+def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b001>;
+def : ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>;
+def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>;
+def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010> {
+ let Requires = [{ {AArch64::HasV8_2aOps} }];
+}
+def : ROSysReg<"MVFR0_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b000>;
+def : ROSysReg<"MVFR1_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b001>;
+def : ROSysReg<"MVFR2_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b010>;
+def : ROSysReg<"RVBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"RVBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"RVBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>;
+def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>;
+def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>;
+def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>;
+
+// Trace registers
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"TRCSTATR", 0b10, 0b001, 0b0000, 0b0011, 0b000>;
+def : ROSysReg<"TRCIDR8", 0b10, 0b001, 0b0000, 0b0000, 0b110>;
+def : ROSysReg<"TRCIDR9", 0b10, 0b001, 0b0000, 0b0001, 0b110>;
+def : ROSysReg<"TRCIDR10", 0b10, 0b001, 0b0000, 0b0010, 0b110>;
+def : ROSysReg<"TRCIDR11", 0b10, 0b001, 0b0000, 0b0011, 0b110>;
+def : ROSysReg<"TRCIDR12", 0b10, 0b001, 0b0000, 0b0100, 0b110>;
+def : ROSysReg<"TRCIDR13", 0b10, 0b001, 0b0000, 0b0101, 0b110>;
+def : ROSysReg<"TRCIDR0", 0b10, 0b001, 0b0000, 0b1000, 0b111>;
+def : ROSysReg<"TRCIDR1", 0b10, 0b001, 0b0000, 0b1001, 0b111>;
+def : ROSysReg<"TRCIDR2", 0b10, 0b001, 0b0000, 0b1010, 0b111>;
+def : ROSysReg<"TRCIDR3", 0b10, 0b001, 0b0000, 0b1011, 0b111>;
+def : ROSysReg<"TRCIDR4", 0b10, 0b001, 0b0000, 0b1100, 0b111>;
+def : ROSysReg<"TRCIDR5", 0b10, 0b001, 0b0000, 0b1101, 0b111>;
+def : ROSysReg<"TRCIDR6", 0b10, 0b001, 0b0000, 0b1110, 0b111>;
+def : ROSysReg<"TRCIDR7", 0b10, 0b001, 0b0000, 0b1111, 0b111>;
+def : ROSysReg<"TRCOSLSR", 0b10, 0b001, 0b0001, 0b0001, 0b100>;
+def : ROSysReg<"TRCPDSR", 0b10, 0b001, 0b0001, 0b0101, 0b100>;
+def : ROSysReg<"TRCDEVAFF0", 0b10, 0b001, 0b0111, 0b1010, 0b110>;
+def : ROSysReg<"TRCDEVAFF1", 0b10, 0b001, 0b0111, 0b1011, 0b110>;
+def : ROSysReg<"TRCLSR", 0b10, 0b001, 0b0111, 0b1101, 0b110>;
+def : ROSysReg<"TRCAUTHSTATUS", 0b10, 0b001, 0b0111, 0b1110, 0b110>;
+def : ROSysReg<"TRCDEVARCH", 0b10, 0b001, 0b0111, 0b1111, 0b110>;
+def : ROSysReg<"TRCDEVID", 0b10, 0b001, 0b0111, 0b0010, 0b111>;
+def : ROSysReg<"TRCDEVTYPE", 0b10, 0b001, 0b0111, 0b0011, 0b111>;
+def : ROSysReg<"TRCPIDR4", 0b10, 0b001, 0b0111, 0b0100, 0b111>;
+def : ROSysReg<"TRCPIDR5", 0b10, 0b001, 0b0111, 0b0101, 0b111>;
+def : ROSysReg<"TRCPIDR6", 0b10, 0b001, 0b0111, 0b0110, 0b111>;
+def : ROSysReg<"TRCPIDR7", 0b10, 0b001, 0b0111, 0b0111, 0b111>;
+def : ROSysReg<"TRCPIDR0", 0b10, 0b001, 0b0111, 0b1000, 0b111>;
+def : ROSysReg<"TRCPIDR1", 0b10, 0b001, 0b0111, 0b1001, 0b111>;
+def : ROSysReg<"TRCPIDR2", 0b10, 0b001, 0b0111, 0b1010, 0b111>;
+def : ROSysReg<"TRCPIDR3", 0b10, 0b001, 0b0111, 0b1011, 0b111>;
+def : ROSysReg<"TRCCIDR0", 0b10, 0b001, 0b0111, 0b1100, 0b111>;
+def : ROSysReg<"TRCCIDR1", 0b10, 0b001, 0b0111, 0b1101, 0b111>;
+def : ROSysReg<"TRCCIDR2", 0b10, 0b001, 0b0111, 0b1110, 0b111>;
+def : ROSysReg<"TRCCIDR3", 0b10, 0b001, 0b0111, 0b1111, 0b111>;
+
+// GICv3 registers
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"ICC_IAR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b000>;
+def : ROSysReg<"ICC_IAR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b000>;
+def : ROSysReg<"ICC_HPPIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b010>;
+def : ROSysReg<"ICC_HPPIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b010>;
+def : ROSysReg<"ICC_RPR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b011>;
+def : ROSysReg<"ICH_VTR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b001>;
+def : ROSysReg<"ICH_EISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b011>;
+def : ROSysReg<"ICH_ELSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>;
+
+// v8.1a "Limited Ordering Regions" extension-specific system register
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : ROSysReg<"LORID_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b111>;
+
+// v8.2a "RAS extension" registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureRAS} }] in {
+def : ROSysReg<"ERRIDR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b000>;
+def : ROSysReg<"ERXFR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b000>;
+}
+
+//===----------------------
+// Write-only regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : WOSysReg<"DBGDTRTX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>;
+def : WOSysReg<"OSLAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b100>;
+def : WOSysReg<"PMSWINC_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b100>;
+
+// Trace Registers
+// Op0 Op1 CRn CRm Op2
+def : WOSysReg<"TRCOSLAR", 0b10, 0b001, 0b0001, 0b0000, 0b100>;
+def : WOSysReg<"TRCLAR", 0b10, 0b001, 0b0111, 0b1100, 0b110>;
+
+// GICv3 registers
+// Op0 Op1 CRn CRm Op2
+def : WOSysReg<"ICC_EOIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b001>;
+def : WOSysReg<"ICC_EOIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b001>;
+def : WOSysReg<"ICC_DIR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b001>;
+def : WOSysReg<"ICC_SGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b101>;
+def : WOSysReg<"ICC_ASGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b110>;
+def : WOSysReg<"ICC_SGI0R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b111>;
+
+//===----------------------
+// Read-write regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"OSDTRRX_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b010>;
+def : RWSysReg<"OSDTRTX_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b010>;
+def : RWSysReg<"TEECR32_EL1", 0b10, 0b010, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"MDCCINT_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b000>;
+def : RWSysReg<"MDSCR_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b010>;
+def : RWSysReg<"DBGDTR_EL0", 0b10, 0b011, 0b0000, 0b0100, 0b000>;
+def : RWSysReg<"OSECCR_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b010>;
+def : RWSysReg<"DBGVCR32_EL2", 0b10, 0b100, 0b0000, 0b0111, 0b000>;
+def : RWSysReg<"DBGBVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b100>;
+def : RWSysReg<"DBGBVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b100>;
+def : RWSysReg<"DBGBVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b100>;
+def : RWSysReg<"DBGBVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b100>;
+def : RWSysReg<"DBGBVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b100>;
+def : RWSysReg<"DBGBVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b100>;
+def : RWSysReg<"DBGBVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b100>;
+def : RWSysReg<"DBGBVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b100>;
+def : RWSysReg<"DBGBVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"DBGBVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b100>;
+def : RWSysReg<"DBGBVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b100>;
+def : RWSysReg<"DBGBVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b100>;
+def : RWSysReg<"DBGBVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b100>;
+def : RWSysReg<"DBGBVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b100>;
+def : RWSysReg<"DBGBVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b100>;
+def : RWSysReg<"DBGBVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b100>;
+def : RWSysReg<"DBGBCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"DBGBCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b101>;
+def : RWSysReg<"DBGBCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b101>;
+def : RWSysReg<"DBGBCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b101>;
+def : RWSysReg<"DBGBCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b101>;
+def : RWSysReg<"DBGBCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b101>;
+def : RWSysReg<"DBGBCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b101>;
+def : RWSysReg<"DBGBCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b101>;
+def : RWSysReg<"DBGBCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b101>;
+def : RWSysReg<"DBGBCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b101>;
+def : RWSysReg<"DBGBCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b101>;
+def : RWSysReg<"DBGBCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b101>;
+def : RWSysReg<"DBGBCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b101>;
+def : RWSysReg<"DBGBCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b101>;
+def : RWSysReg<"DBGBCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b101>;
+def : RWSysReg<"DBGBCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b101>;
+def : RWSysReg<"DBGWVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b110>;
+def : RWSysReg<"DBGWVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b110>;
+def : RWSysReg<"DBGWVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b110>;
+def : RWSysReg<"DBGWVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b110>;
+def : RWSysReg<"DBGWVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b110>;
+def : RWSysReg<"DBGWVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b110>;
+def : RWSysReg<"DBGWVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b110>;
+def : RWSysReg<"DBGWVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b110>;
+def : RWSysReg<"DBGWVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b110>;
+def : RWSysReg<"DBGWVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b110>;
+def : RWSysReg<"DBGWVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b110>;
+def : RWSysReg<"DBGWVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b110>;
+def : RWSysReg<"DBGWVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b110>;
+def : RWSysReg<"DBGWVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b110>;
+def : RWSysReg<"DBGWVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b110>;
+def : RWSysReg<"DBGWVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b110>;
+def : RWSysReg<"DBGWCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b111>;
+def : RWSysReg<"DBGWCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b111>;
+def : RWSysReg<"DBGWCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b111>;
+def : RWSysReg<"DBGWCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b111>;
+def : RWSysReg<"DBGWCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b111>;
+def : RWSysReg<"DBGWCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b111>;
+def : RWSysReg<"DBGWCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b111>;
+def : RWSysReg<"DBGWCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b111>;
+def : RWSysReg<"DBGWCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b111>;
+def : RWSysReg<"DBGWCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b111>;
+def : RWSysReg<"DBGWCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b111>;
+def : RWSysReg<"DBGWCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b111>;
+def : RWSysReg<"DBGWCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b111>;
+def : RWSysReg<"DBGWCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b111>;
+def : RWSysReg<"DBGWCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b111>;
+def : RWSysReg<"DBGWCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b111>;
+def : RWSysReg<"TEEHBR32_EL1", 0b10, 0b010, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"OSDLR_EL1", 0b10, 0b000, 0b0001, 0b0011, 0b100>;
+def : RWSysReg<"DBGPRCR_EL1", 0b10, 0b000, 0b0001, 0b0100, 0b100>;
+def : RWSysReg<"DBGCLAIMSET_EL1", 0b10, 0b000, 0b0111, 0b1000, 0b110>;
+def : RWSysReg<"DBGCLAIMCLR_EL1", 0b10, 0b000, 0b0111, 0b1001, 0b110>;
+def : RWSysReg<"CSSELR_EL1", 0b11, 0b010, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"VPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"VMPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"CPACR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"SCTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"SCTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"SCTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b000>;
+def : RWSysReg<"SCR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b000>;
+def : RWSysReg<"MDCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"SDER32_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"CPTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"CPTR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"HSTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b011>;
+def : RWSysReg<"HACR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b111>;
+def : RWSysReg<"MDCR_EL3", 0b11, 0b110, 0b0001, 0b0011, 0b001>;
+def : RWSysReg<"TTBR0_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR0_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR1_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TCR_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TCR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TCR_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>;
+def : RWSysReg<"VTCR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b010>;
+def : RWSysReg<"DACR32_EL2", 0b11, 0b100, 0b0011, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"ELR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"ELR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"ELR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"SP_EL0", 0b11, 0b000, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SP_EL1", 0b11, 0b100, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>;
+def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>;
+def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>;
+def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>;
+def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>;
+def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>;
+def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>;
+def : RWSysReg<"SPSR_fiq", 0b11, 0b100, 0b0100, 0b0011, 0b011>;
+def : RWSysReg<"FPCR", 0b11, 0b011, 0b0100, 0b0100, 0b000>;
+def : RWSysReg<"FPSR", 0b11, 0b011, 0b0100, 0b0100, 0b001>;
+def : RWSysReg<"DSPSR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b000>;
+def : RWSysReg<"DLR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b001>;
+def : RWSysReg<"IFSR32_EL2", 0b11, 0b100, 0b0101, 0b0000, 0b001>;
+def : RWSysReg<"AFSR0_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR0_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR0_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR1_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"AFSR1_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"AFSR1_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"ESR_EL1", 0b11, 0b000, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"ESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"ESR_EL3", 0b11, 0b110, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"FPEXC32_EL2", 0b11, 0b100, 0b0101, 0b0011, 0b000>;
+def : RWSysReg<"FAR_EL1", 0b11, 0b000, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"FAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"FAR_EL3", 0b11, 0b110, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"HPFAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b100>;
+def : RWSysReg<"PAR_EL1", 0b11, 0b000, 0b0111, 0b0100, 0b000>;
+def : RWSysReg<"PMCR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b000>;
+def : RWSysReg<"PMCNTENSET_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b001>;
+def : RWSysReg<"PMCNTENCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b010>;
+def : RWSysReg<"PMOVSCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b011>;
+def : RWSysReg<"PMSELR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b101>;
+def : RWSysReg<"PMCCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b000>;
+def : RWSysReg<"PMXEVTYPER_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b001>;
+def : RWSysReg<"PMXEVCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b010>;
+def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>;
+def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>;
+def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>;
+def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>;
+def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"AMAIR_EL1", 0b11, 0b000, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"AMAIR_EL2", 0b11, 0b100, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"AMAIR_EL3", 0b11, 0b110, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"VBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"VBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"VBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"RMR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"RMR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"RMR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"CONTEXTIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"TPIDR_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDR_EL3", 0b11, 0b110, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDRRO_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b011>;
+def : RWSysReg<"TPIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b100>;
+def : RWSysReg<"CNTFRQ_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b000>;
+def : RWSysReg<"CNTVOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b011>;
+def : RWSysReg<"CNTKCTL_EL1", 0b11, 0b000, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTHCTL_EL2", 0b11, 0b100, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTP_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTHP_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTPS_TVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTP_CTL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTHP_CTL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTPS_CTL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTP_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTHP_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTPS_CVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTV_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTV_CTL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"CNTV_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"PMEVCNTR0_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b000>;
+def : RWSysReg<"PMEVCNTR1_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b001>;
+def : RWSysReg<"PMEVCNTR2_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b010>;
+def : RWSysReg<"PMEVCNTR3_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b011>;
+def : RWSysReg<"PMEVCNTR4_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b100>;
+def : RWSysReg<"PMEVCNTR5_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b101>;
+def : RWSysReg<"PMEVCNTR6_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b110>;
+def : RWSysReg<"PMEVCNTR7_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b111>;
+def : RWSysReg<"PMEVCNTR8_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b000>;
+def : RWSysReg<"PMEVCNTR9_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b001>;
+def : RWSysReg<"PMEVCNTR10_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b010>;
+def : RWSysReg<"PMEVCNTR11_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b011>;
+def : RWSysReg<"PMEVCNTR12_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b100>;
+def : RWSysReg<"PMEVCNTR13_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b101>;
+def : RWSysReg<"PMEVCNTR14_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b110>;
+def : RWSysReg<"PMEVCNTR15_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b111>;
+def : RWSysReg<"PMEVCNTR16_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b000>;
+def : RWSysReg<"PMEVCNTR17_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b001>;
+def : RWSysReg<"PMEVCNTR18_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b010>;
+def : RWSysReg<"PMEVCNTR19_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b011>;
+def : RWSysReg<"PMEVCNTR20_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b100>;
+def : RWSysReg<"PMEVCNTR21_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b101>;
+def : RWSysReg<"PMEVCNTR22_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b110>;
+def : RWSysReg<"PMEVCNTR23_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b111>;
+def : RWSysReg<"PMEVCNTR24_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b000>;
+def : RWSysReg<"PMEVCNTR25_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b001>;
+def : RWSysReg<"PMEVCNTR26_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b010>;
+def : RWSysReg<"PMEVCNTR27_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b011>;
+def : RWSysReg<"PMEVCNTR28_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b100>;
+def : RWSysReg<"PMEVCNTR29_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b101>;
+def : RWSysReg<"PMEVCNTR30_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b110>;
+def : RWSysReg<"PMCCFILTR_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b111>;
+def : RWSysReg<"PMEVTYPER0_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b000>;
+def : RWSysReg<"PMEVTYPER1_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b001>;
+def : RWSysReg<"PMEVTYPER2_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b010>;
+def : RWSysReg<"PMEVTYPER3_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b011>;
+def : RWSysReg<"PMEVTYPER4_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b100>;
+def : RWSysReg<"PMEVTYPER5_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b101>;
+def : RWSysReg<"PMEVTYPER6_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b110>;
+def : RWSysReg<"PMEVTYPER7_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b111>;
+def : RWSysReg<"PMEVTYPER8_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b000>;
+def : RWSysReg<"PMEVTYPER9_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b001>;
+def : RWSysReg<"PMEVTYPER10_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b010>;
+def : RWSysReg<"PMEVTYPER11_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b011>;
+def : RWSysReg<"PMEVTYPER12_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b100>;
+def : RWSysReg<"PMEVTYPER13_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b101>;
+def : RWSysReg<"PMEVTYPER14_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b110>;
+def : RWSysReg<"PMEVTYPER15_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b111>;
+def : RWSysReg<"PMEVTYPER16_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b000>;
+def : RWSysReg<"PMEVTYPER17_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b001>;
+def : RWSysReg<"PMEVTYPER18_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b010>;
+def : RWSysReg<"PMEVTYPER19_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b011>;
+def : RWSysReg<"PMEVTYPER20_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b100>;
+def : RWSysReg<"PMEVTYPER21_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b101>;
+def : RWSysReg<"PMEVTYPER22_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b110>;
+def : RWSysReg<"PMEVTYPER23_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b111>;
+def : RWSysReg<"PMEVTYPER24_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b000>;
+def : RWSysReg<"PMEVTYPER25_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b001>;
+def : RWSysReg<"PMEVTYPER26_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b010>;
+def : RWSysReg<"PMEVTYPER27_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b011>;
+def : RWSysReg<"PMEVTYPER28_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b100>;
+def : RWSysReg<"PMEVTYPER29_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b101>;
+def : RWSysReg<"PMEVTYPER30_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b110>;
+
+// Trace registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"TRCPRGCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b000>;
+def : RWSysReg<"TRCPROCSELR", 0b10, 0b001, 0b0000, 0b0010, 0b000>;
+def : RWSysReg<"TRCCONFIGR", 0b10, 0b001, 0b0000, 0b0100, 0b000>;
+def : RWSysReg<"TRCAUXCTLR", 0b10, 0b001, 0b0000, 0b0110, 0b000>;
+def : RWSysReg<"TRCEVENTCTL0R", 0b10, 0b001, 0b0000, 0b1000, 0b000>;
+def : RWSysReg<"TRCEVENTCTL1R", 0b10, 0b001, 0b0000, 0b1001, 0b000>;
+def : RWSysReg<"TRCSTALLCTLR", 0b10, 0b001, 0b0000, 0b1011, 0b000>;
+def : RWSysReg<"TRCTSCTLR", 0b10, 0b001, 0b0000, 0b1100, 0b000>;
+def : RWSysReg<"TRCSYNCPR", 0b10, 0b001, 0b0000, 0b1101, 0b000>;
+def : RWSysReg<"TRCCCCTLR", 0b10, 0b001, 0b0000, 0b1110, 0b000>;
+def : RWSysReg<"TRCBBCTLR", 0b10, 0b001, 0b0000, 0b1111, 0b000>;
+def : RWSysReg<"TRCTRACEIDR", 0b10, 0b001, 0b0000, 0b0000, 0b001>;
+def : RWSysReg<"TRCQCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b001>;
+def : RWSysReg<"TRCVICTLR", 0b10, 0b001, 0b0000, 0b0000, 0b010>;
+def : RWSysReg<"TRCVIIECTLR", 0b10, 0b001, 0b0000, 0b0001, 0b010>;
+def : RWSysReg<"TRCVISSCTLR", 0b10, 0b001, 0b0000, 0b0010, 0b010>;
+def : RWSysReg<"TRCVIPCSSCTLR", 0b10, 0b001, 0b0000, 0b0011, 0b010>;
+def : RWSysReg<"TRCVDCTLR", 0b10, 0b001, 0b0000, 0b1000, 0b010>;
+def : RWSysReg<"TRCVDSACCTLR", 0b10, 0b001, 0b0000, 0b1001, 0b010>;
+def : RWSysReg<"TRCVDARCCTLR", 0b10, 0b001, 0b0000, 0b1010, 0b010>;
+def : RWSysReg<"TRCSEQEVR0", 0b10, 0b001, 0b0000, 0b0000, 0b100>;
+def : RWSysReg<"TRCSEQEVR1", 0b10, 0b001, 0b0000, 0b0001, 0b100>;
+def : RWSysReg<"TRCSEQEVR2", 0b10, 0b001, 0b0000, 0b0010, 0b100>;
+def : RWSysReg<"TRCSEQRSTEVR", 0b10, 0b001, 0b0000, 0b0110, 0b100>;
+def : RWSysReg<"TRCSEQSTR", 0b10, 0b001, 0b0000, 0b0111, 0b100>;
+def : RWSysReg<"TRCEXTINSELR", 0b10, 0b001, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"TRCCNTRLDVR0", 0b10, 0b001, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR1", 0b10, 0b001, 0b0000, 0b0001, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR2", 0b10, 0b001, 0b0000, 0b0010, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR3", 0b10, 0b001, 0b0000, 0b0011, 0b101>;
+def : RWSysReg<"TRCCNTCTLR0", 0b10, 0b001, 0b0000, 0b0100, 0b101>;
+def : RWSysReg<"TRCCNTCTLR1", 0b10, 0b001, 0b0000, 0b0101, 0b101>;
+def : RWSysReg<"TRCCNTCTLR2", 0b10, 0b001, 0b0000, 0b0110, 0b101>;
+def : RWSysReg<"TRCCNTCTLR3", 0b10, 0b001, 0b0000, 0b0111, 0b101>;
+def : RWSysReg<"TRCCNTVR0", 0b10, 0b001, 0b0000, 0b1000, 0b101>;
+def : RWSysReg<"TRCCNTVR1", 0b10, 0b001, 0b0000, 0b1001, 0b101>;
+def : RWSysReg<"TRCCNTVR2", 0b10, 0b001, 0b0000, 0b1010, 0b101>;
+def : RWSysReg<"TRCCNTVR3", 0b10, 0b001, 0b0000, 0b1011, 0b101>;
+def : RWSysReg<"TRCIMSPEC0", 0b10, 0b001, 0b0000, 0b0000, 0b111>;
+def : RWSysReg<"TRCIMSPEC1", 0b10, 0b001, 0b0000, 0b0001, 0b111>;
+def : RWSysReg<"TRCIMSPEC2", 0b10, 0b001, 0b0000, 0b0010, 0b111>;
+def : RWSysReg<"TRCIMSPEC3", 0b10, 0b001, 0b0000, 0b0011, 0b111>;
+def : RWSysReg<"TRCIMSPEC4", 0b10, 0b001, 0b0000, 0b0100, 0b111>;
+def : RWSysReg<"TRCIMSPEC5", 0b10, 0b001, 0b0000, 0b0101, 0b111>;
+def : RWSysReg<"TRCIMSPEC6", 0b10, 0b001, 0b0000, 0b0110, 0b111>;
+def : RWSysReg<"TRCIMSPEC7", 0b10, 0b001, 0b0000, 0b0111, 0b111>;
+def : RWSysReg<"TRCRSCTLR2", 0b10, 0b001, 0b0001, 0b0010, 0b000>;
+def : RWSysReg<"TRCRSCTLR3", 0b10, 0b001, 0b0001, 0b0011, 0b000>;
+def : RWSysReg<"TRCRSCTLR4", 0b10, 0b001, 0b0001, 0b0100, 0b000>;
+def : RWSysReg<"TRCRSCTLR5", 0b10, 0b001, 0b0001, 0b0101, 0b000>;
+def : RWSysReg<"TRCRSCTLR6", 0b10, 0b001, 0b0001, 0b0110, 0b000>;
+def : RWSysReg<"TRCRSCTLR7", 0b10, 0b001, 0b0001, 0b0111, 0b000>;
+def : RWSysReg<"TRCRSCTLR8", 0b10, 0b001, 0b0001, 0b1000, 0b000>;
+def : RWSysReg<"TRCRSCTLR9", 0b10, 0b001, 0b0001, 0b1001, 0b000>;
+def : RWSysReg<"TRCRSCTLR10", 0b10, 0b001, 0b0001, 0b1010, 0b000>;
+def : RWSysReg<"TRCRSCTLR11", 0b10, 0b001, 0b0001, 0b1011, 0b000>;
+def : RWSysReg<"TRCRSCTLR12", 0b10, 0b001, 0b0001, 0b1100, 0b000>;
+def : RWSysReg<"TRCRSCTLR13", 0b10, 0b001, 0b0001, 0b1101, 0b000>;
+def : RWSysReg<"TRCRSCTLR14", 0b10, 0b001, 0b0001, 0b1110, 0b000>;
+def : RWSysReg<"TRCRSCTLR15", 0b10, 0b001, 0b0001, 0b1111, 0b000>;
+def : RWSysReg<"TRCRSCTLR16", 0b10, 0b001, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"TRCRSCTLR17", 0b10, 0b001, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"TRCRSCTLR18", 0b10, 0b001, 0b0001, 0b0010, 0b001>;
+def : RWSysReg<"TRCRSCTLR19", 0b10, 0b001, 0b0001, 0b0011, 0b001>;
+def : RWSysReg<"TRCRSCTLR20", 0b10, 0b001, 0b0001, 0b0100, 0b001>;
+def : RWSysReg<"TRCRSCTLR21", 0b10, 0b001, 0b0001, 0b0101, 0b001>;
+def : RWSysReg<"TRCRSCTLR22", 0b10, 0b001, 0b0001, 0b0110, 0b001>;
+def : RWSysReg<"TRCRSCTLR23", 0b10, 0b001, 0b0001, 0b0111, 0b001>;
+def : RWSysReg<"TRCRSCTLR24", 0b10, 0b001, 0b0001, 0b1000, 0b001>;
+def : RWSysReg<"TRCRSCTLR25", 0b10, 0b001, 0b0001, 0b1001, 0b001>;
+def : RWSysReg<"TRCRSCTLR26", 0b10, 0b001, 0b0001, 0b1010, 0b001>;
+def : RWSysReg<"TRCRSCTLR27", 0b10, 0b001, 0b0001, 0b1011, 0b001>;
+def : RWSysReg<"TRCRSCTLR28", 0b10, 0b001, 0b0001, 0b1100, 0b001>;
+def : RWSysReg<"TRCRSCTLR29", 0b10, 0b001, 0b0001, 0b1101, 0b001>;
+def : RWSysReg<"TRCRSCTLR30", 0b10, 0b001, 0b0001, 0b1110, 0b001>;
+def : RWSysReg<"TRCRSCTLR31", 0b10, 0b001, 0b0001, 0b1111, 0b001>;
+def : RWSysReg<"TRCSSCCR0", 0b10, 0b001, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"TRCSSCCR1", 0b10, 0b001, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"TRCSSCCR2", 0b10, 0b001, 0b0001, 0b0010, 0b010>;
+def : RWSysReg<"TRCSSCCR3", 0b10, 0b001, 0b0001, 0b0011, 0b010>;
+def : RWSysReg<"TRCSSCCR4", 0b10, 0b001, 0b0001, 0b0100, 0b010>;
+def : RWSysReg<"TRCSSCCR5", 0b10, 0b001, 0b0001, 0b0101, 0b010>;
+def : RWSysReg<"TRCSSCCR6", 0b10, 0b001, 0b0001, 0b0110, 0b010>;
+def : RWSysReg<"TRCSSCCR7", 0b10, 0b001, 0b0001, 0b0111, 0b010>;
+def : RWSysReg<"TRCSSCSR0", 0b10, 0b001, 0b0001, 0b1000, 0b010>;
+def : RWSysReg<"TRCSSCSR1", 0b10, 0b001, 0b0001, 0b1001, 0b010>;
+def : RWSysReg<"TRCSSCSR2", 0b10, 0b001, 0b0001, 0b1010, 0b010>;
+def : RWSysReg<"TRCSSCSR3", 0b10, 0b001, 0b0001, 0b1011, 0b010>;
+def : RWSysReg<"TRCSSCSR4", 0b10, 0b001, 0b0001, 0b1100, 0b010>;
+def : RWSysReg<"TRCSSCSR5", 0b10, 0b001, 0b0001, 0b1101, 0b010>;
+def : RWSysReg<"TRCSSCSR6", 0b10, 0b001, 0b0001, 0b1110, 0b010>;
+def : RWSysReg<"TRCSSCSR7", 0b10, 0b001, 0b0001, 0b1111, 0b010>;
+def : RWSysReg<"TRCSSPCICR0", 0b10, 0b001, 0b0001, 0b0000, 0b011>;
+def : RWSysReg<"TRCSSPCICR1", 0b10, 0b001, 0b0001, 0b0001, 0b011>;
+def : RWSysReg<"TRCSSPCICR2", 0b10, 0b001, 0b0001, 0b0010, 0b011>;
+def : RWSysReg<"TRCSSPCICR3", 0b10, 0b001, 0b0001, 0b0011, 0b011>;
+def : RWSysReg<"TRCSSPCICR4", 0b10, 0b001, 0b0001, 0b0100, 0b011>;
+def : RWSysReg<"TRCSSPCICR5", 0b10, 0b001, 0b0001, 0b0101, 0b011>;
+def : RWSysReg<"TRCSSPCICR6", 0b10, 0b001, 0b0001, 0b0110, 0b011>;
+def : RWSysReg<"TRCSSPCICR7", 0b10, 0b001, 0b0001, 0b0111, 0b011>;
+def : RWSysReg<"TRCPDCR", 0b10, 0b001, 0b0001, 0b0100, 0b100>;
+def : RWSysReg<"TRCACVR0", 0b10, 0b001, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TRCACVR1", 0b10, 0b001, 0b0010, 0b0010, 0b000>;
+def : RWSysReg<"TRCACVR2", 0b10, 0b001, 0b0010, 0b0100, 0b000>;
+def : RWSysReg<"TRCACVR3", 0b10, 0b001, 0b0010, 0b0110, 0b000>;
+def : RWSysReg<"TRCACVR4", 0b10, 0b001, 0b0010, 0b1000, 0b000>;
+def : RWSysReg<"TRCACVR5", 0b10, 0b001, 0b0010, 0b1010, 0b000>;
+def : RWSysReg<"TRCACVR6", 0b10, 0b001, 0b0010, 0b1100, 0b000>;
+def : RWSysReg<"TRCACVR7", 0b10, 0b001, 0b0010, 0b1110, 0b000>;
+def : RWSysReg<"TRCACVR8", 0b10, 0b001, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TRCACVR9", 0b10, 0b001, 0b0010, 0b0010, 0b001>;
+def : RWSysReg<"TRCACVR10", 0b10, 0b001, 0b0010, 0b0100, 0b001>;
+def : RWSysReg<"TRCACVR11", 0b10, 0b001, 0b0010, 0b0110, 0b001>;
+def : RWSysReg<"TRCACVR12", 0b10, 0b001, 0b0010, 0b1000, 0b001>;
+def : RWSysReg<"TRCACVR13", 0b10, 0b001, 0b0010, 0b1010, 0b001>;
+def : RWSysReg<"TRCACVR14", 0b10, 0b001, 0b0010, 0b1100, 0b001>;
+def : RWSysReg<"TRCACVR15", 0b10, 0b001, 0b0010, 0b1110, 0b001>;
+def : RWSysReg<"TRCACATR0", 0b10, 0b001, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TRCACATR1", 0b10, 0b001, 0b0010, 0b0010, 0b010>;
+def : RWSysReg<"TRCACATR2", 0b10, 0b001, 0b0010, 0b0100, 0b010>;
+def : RWSysReg<"TRCACATR3", 0b10, 0b001, 0b0010, 0b0110, 0b010>;
+def : RWSysReg<"TRCACATR4", 0b10, 0b001, 0b0010, 0b1000, 0b010>;
+def : RWSysReg<"TRCACATR5", 0b10, 0b001, 0b0010, 0b1010, 0b010>;
+def : RWSysReg<"TRCACATR6", 0b10, 0b001, 0b0010, 0b1100, 0b010>;
+def : RWSysReg<"TRCACATR7", 0b10, 0b001, 0b0010, 0b1110, 0b010>;
+def : RWSysReg<"TRCACATR8", 0b10, 0b001, 0b0010, 0b0000, 0b011>;
+def : RWSysReg<"TRCACATR9", 0b10, 0b001, 0b0010, 0b0010, 0b011>;
+def : RWSysReg<"TRCACATR10", 0b10, 0b001, 0b0010, 0b0100, 0b011>;
+def : RWSysReg<"TRCACATR11", 0b10, 0b001, 0b0010, 0b0110, 0b011>;
+def : RWSysReg<"TRCACATR12", 0b10, 0b001, 0b0010, 0b1000, 0b011>;
+def : RWSysReg<"TRCACATR13", 0b10, 0b001, 0b0010, 0b1010, 0b011>;
+def : RWSysReg<"TRCACATR14", 0b10, 0b001, 0b0010, 0b1100, 0b011>;
+def : RWSysReg<"TRCACATR15", 0b10, 0b001, 0b0010, 0b1110, 0b011>;
+def : RWSysReg<"TRCDVCVR0", 0b10, 0b001, 0b0010, 0b0000, 0b100>;
+def : RWSysReg<"TRCDVCVR1", 0b10, 0b001, 0b0010, 0b0100, 0b100>;
+def : RWSysReg<"TRCDVCVR2", 0b10, 0b001, 0b0010, 0b1000, 0b100>;
+def : RWSysReg<"TRCDVCVR3", 0b10, 0b001, 0b0010, 0b1100, 0b100>;
+def : RWSysReg<"TRCDVCVR4", 0b10, 0b001, 0b0010, 0b0000, 0b101>;
+def : RWSysReg<"TRCDVCVR5", 0b10, 0b001, 0b0010, 0b0100, 0b101>;
+def : RWSysReg<"TRCDVCVR6", 0b10, 0b001, 0b0010, 0b1000, 0b101>;
+def : RWSysReg<"TRCDVCVR7", 0b10, 0b001, 0b0010, 0b1100, 0b101>;
+def : RWSysReg<"TRCDVCMR0", 0b10, 0b001, 0b0010, 0b0000, 0b110>;
+def : RWSysReg<"TRCDVCMR1", 0b10, 0b001, 0b0010, 0b0100, 0b110>;
+def : RWSysReg<"TRCDVCMR2", 0b10, 0b001, 0b0010, 0b1000, 0b110>;
+def : RWSysReg<"TRCDVCMR3", 0b10, 0b001, 0b0010, 0b1100, 0b110>;
+def : RWSysReg<"TRCDVCMR4", 0b10, 0b001, 0b0010, 0b0000, 0b111>;
+def : RWSysReg<"TRCDVCMR5", 0b10, 0b001, 0b0010, 0b0100, 0b111>;
+def : RWSysReg<"TRCDVCMR6", 0b10, 0b001, 0b0010, 0b1000, 0b111>;
+def : RWSysReg<"TRCDVCMR7", 0b10, 0b001, 0b0010, 0b1100, 0b111>;
+def : RWSysReg<"TRCCIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b000>;
+def : RWSysReg<"TRCCIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b000>;
+def : RWSysReg<"TRCCIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b000>;
+def : RWSysReg<"TRCCIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b000>;
+def : RWSysReg<"TRCCIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b000>;
+def : RWSysReg<"TRCCIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b000>;
+def : RWSysReg<"TRCCIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b000>;
+def : RWSysReg<"TRCCIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b000>;
+def : RWSysReg<"TRCVMIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b001>;
+def : RWSysReg<"TRCVMIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b001>;
+def : RWSysReg<"TRCVMIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b001>;
+def : RWSysReg<"TRCVMIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b001>;
+def : RWSysReg<"TRCVMIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b001>;
+def : RWSysReg<"TRCVMIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b001>;
+def : RWSysReg<"TRCVMIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b001>;
+def : RWSysReg<"TRCVMIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b001>;
+def : RWSysReg<"TRCCIDCCTLR0", 0b10, 0b001, 0b0011, 0b0000, 0b010>;
+def : RWSysReg<"TRCCIDCCTLR1", 0b10, 0b001, 0b0011, 0b0001, 0b010>;
+def : RWSysReg<"TRCVMIDCCTLR0", 0b10, 0b001, 0b0011, 0b0010, 0b010>;
+def : RWSysReg<"TRCVMIDCCTLR1", 0b10, 0b001, 0b0011, 0b0011, 0b010>;
+def : RWSysReg<"TRCITCTRL", 0b10, 0b001, 0b0111, 0b0000, 0b100>;
+def : RWSysReg<"TRCCLAIMSET", 0b10, 0b001, 0b0111, 0b1000, 0b110>;
+def : RWSysReg<"TRCCLAIMCLR", 0b10, 0b001, 0b0111, 0b1001, 0b110>;
+
+// GICv3 registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"ICC_BPR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b011>;
+def : RWSysReg<"ICC_BPR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b011>;
+def : RWSysReg<"ICC_PMR_EL1", 0b11, 0b000, 0b0100, 0b0110, 0b000>;
+def : RWSysReg<"ICC_CTLR_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICC_CTLR_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICC_SRE_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICC_SRE_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b101>;
+def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>;
+def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>;
+def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>;
+def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>;
+def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>;
+def : RWSysReg<"ICC_AP0R3_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b111>;
+def : RWSysReg<"ICC_AP1R0_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b000>;
+def : RWSysReg<"ICC_AP1R1_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b001>;
+def : RWSysReg<"ICC_AP1R2_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICC_AP1R3_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_AP0R0_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b000>;
+def : RWSysReg<"ICH_AP0R1_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b001>;
+def : RWSysReg<"ICH_AP0R2_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b010>;
+def : RWSysReg<"ICH_AP0R3_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b011>;
+def : RWSysReg<"ICH_AP1R0_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b000>;
+def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>;
+def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>;
+def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>;
+def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>;
+def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>;
+def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>;
+def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>;
+def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>;
+def : RWSysReg<"ICH_LR3_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b011>;
+def : RWSysReg<"ICH_LR4_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICH_LR5_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICH_LR6_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b110>;
+def : RWSysReg<"ICH_LR7_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICH_LR8_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b000>;
+def : RWSysReg<"ICH_LR9_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b001>;
+def : RWSysReg<"ICH_LR10_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b010>;
+def : RWSysReg<"ICH_LR11_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b011>;
+def : RWSysReg<"ICH_LR12_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b100>;
+def : RWSysReg<"ICH_LR13_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b101>;
+def : RWSysReg<"ICH_LR14_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b110>;
+def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>;
+
+// v8.1a "Privileged Access Never" extension-specific system registers
+// Op0 Op1 CRn CRm Op2
+// The brace-less `let` below applies only to the single record that follows.
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>;
+
+// v8.1a "Limited Ordering Regions" extension-specific system registers
+// Op0 Op1 CRn CRm Op2
+// All four records share Op0/Op1/CRn/CRm and are distinguished by Op2 alone
+// (0b000 through 0b011). Every record inside the braces carries the same
+// Requires value.
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"LORSA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b000>;
+def : RWSysReg<"LOREA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b001>;
+def : RWSysReg<"LORN_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b010>;
+def : RWSysReg<"LORC_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b011>;
+}
+
+// v8.1a "Virtualization host extensions" system registers
+// Op0 Op1 CRn CRm Op2
+// The first five records are additional EL2 registers (Op1 = 0b100). The
+// remaining *_EL12 / *_EL02 records all use Op1 = 0b101.
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"TTBR1_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"CNTHV_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTHV_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"CNTHV_CTL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"SCTLR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"CPACR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"TTBR0_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR1_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TCR_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"AFSR0_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR1_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"ESR_EL12", 0b11, 0b101, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"FAR_EL12", 0b11, 0b101, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"MAIR_EL12", 0b11, 0b101, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"AMAIR_EL12", 0b11, 0b101, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"VBAR_EL12", 0b11, 0b101, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"CONTEXTIDR_EL12", 0b11, 0b101, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"CNTKCTL_EL12", 0b11, 0b101, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTP_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTP_CTL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTP_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTV_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTV_CTL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"CNTV_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"SPSR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>;
+}
+// v8.2a registers
+// Op0 Op1 CRn CRm Op2
+// The brace-less `let` below applies only to the single record that follows.
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : RWSysReg<"UAO", 0b11, 0b000, 0b0100, 0b0010, 0b100>;
+
+// v8.2a "Statistical Profiling extension" registers
+// Op0 Op1 CRn CRm Op2
+// These records are gated on the FeatureSPE feature rather than on an
+// architecture-version predicate. PMSCR appears under three names (_EL1, _EL2,
+// _EL12) that differ only in Op1 (0b000, 0b100, 0b101).
+// NOTE(review): PMBIDR_EL1 and PMSIDR_EL1 are declared here as read-write
+// records; the Arm architecture describes these ID registers as read-only --
+// confirm whether they belong in a read-only record class instead.
+let Requires = [{ {AArch64::FeatureSPE} }] in {
+def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>;
+def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>;
+def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>;
+def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>;
+def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSICR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b010>;
+def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>;
+def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>;
+def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>;
+def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>;
+def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>;
+}
+
+// v8.2a "RAS extension" registers
+// Op0 Op1 CRn CRm Op2
+// These records are gated on the FeatureRAS feature rather than on an
+// architecture-version predicate. The last two records are EL2 registers
+// (Op1 = 0b100); the rest are EL1 registers (Op1 = 0b000).
+let Requires = [{ {AArch64::FeatureRAS} }] in {
+def : RWSysReg<"ERRSELR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b001>;
+def : RWSysReg<"ERXCTLR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b001>;
+def : RWSysReg<"ERXSTATUS_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b010>;
+def : RWSysReg<"ERXADDR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b011>;
+def : RWSysReg<"ERXMISC0_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b000>;
+def : RWSysReg<"ERXMISC1_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b001>;
+def : RWSysReg<"DISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b001>;
+def : RWSysReg<"VDISR_EL2", 0b11, 0b100, 0b1100, 0b0001, 0b001>;
+def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>;
+}
+
+// Cyclone specific system registers
+// Op0 Op1 CRn CRm Op2
+// Gated on the ProcCyclone processor predicate; the brace-less `let` applies
+// only to the single record that follows.
+let Requires = [{ {AArch64::ProcCyclone} }] in
+def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
new file mode 100644
index 000000000000..e4ef0d4bb8db
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -0,0 +1,489 @@
+//===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64CallLowering.h"
+#include "AArch64InstructionSelector.h"
+#include "AArch64LegalizerInfo.h"
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64TargetMachine.h"
+#include "AArch64TargetObjectFile.h"
+#include "AArch64TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+// Hidden command-line switches for the AArch64 backend. Each flag is a
+// file-local cl::opt; unless noted otherwise the default is "true".
+// NOTE(review): presumably these are consulted by the pass-pipeline setup
+// further down in this file -- the consumers are outside this excerpt.
+static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp",
+ cl::desc("Enable the CCMP formation pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableMCR("aarch64-enable-mcr",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableStPairSuppress("aarch64-enable-stp-suppress",
+ cl::desc("Suppress STP for AArch64"),
+ cl::init(true), cl::Hidden);
+
+// Off by default.
+static cl::opt<bool> EnableAdvSIMDScalar(
+ "aarch64-enable-simd-scalar",
+ cl::desc("Enable use of AdvSIMD scalar integer instructions"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+ EnablePromoteConstant("aarch64-enable-promote-const",
+ cl::desc("Enable the promote constant pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableCollectLOH(
+ "aarch64-enable-collect-loh",
+ cl::desc("Enable the pass that emits the linker optimization hints (LOH)"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+ EnableDeadRegisterElimination("aarch64-enable-dead-defs", cl::Hidden,
+ cl::desc("Enable the pass that removes dead"
+ " definitons and replaces stores to"
+ " them with stores to the zero"
+ " register"),
+ cl::init(true));
+
+static cl::opt<bool> EnableRedundantCopyElimination(
+ "aarch64-enable-copyelim",
+ cl::desc("Enable the redundant copy elimination pass"), cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool> EnableLoadStoreOpt("aarch64-enable-ldst-opt",
+ cl::desc("Enable the load/store pair"
+ " optimization pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableAtomicTidy(
+ "aarch64-enable-atomic-cfg-tidy", cl::Hidden,
+ cl::desc("Run SimplifyCFG after expanding atomic operations"
+ " to make use of cmpxchg flow-based information"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
+ cl::desc("Run early if-conversion"),
+ cl::init(true));
+
+static cl::opt<bool>
+ EnableCondOpt("aarch64-enable-condopt",
+ cl::desc("Enable the condition optimizer pass"),
+ cl::init(true), cl::Hidden);
+
+// Off by default: the erratum workaround must be requested explicitly.
+static cl::opt<bool>
+EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden,
+ cl::desc("Work around Cortex-A53 erratum 835769"),
+ cl::init(false));
+
+static cl::opt<bool>
+ EnableAddressTypePromotion("aarch64-enable-type-promotion", cl::Hidden,
+ cl::desc("Enable the type promotion pass"),
+ cl::init(true));
+
+// Off by default.
+static cl::opt<bool>
+ EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
+ cl::desc("Enable optimizations on complex GEPs"),
+ cl::init(false));
+
+static cl::opt<bool>
+ BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
+ cl::desc("Relax out of range conditional branches"));
+
+// FIXME: Unify control over GlobalMerge.
+// Tri-state option (cl::boolOrDefault) with no cl::init, so "not specified on
+// the command line" can be told apart from an explicit true/false.
+static cl::opt<cl::boolOrDefault>
+ EnableGlobalMerge("aarch64-enable-global-merge", cl::Hidden,
+ cl::desc("Enable the global merge pass"));
+
+static cl::opt<bool>
+ EnableLoopDataPrefetch("aarch64-enable-loop-data-prefetch", cl::Hidden,
+ cl::desc("Enable the loop data prefetch pass"),
+ cl::init(true));
+
+/// Registers the AArch64 target machines and initializes the AArch64-specific
+/// passes (plus GlobalISel) with the process-wide PassRegistry.
+extern "C" void LLVMInitializeAArch64Target() {
+  // Register the target machines. The ARM64 target is served by the
+  // little-endian target machine.
+  RegisterTargetMachine<AArch64leTargetMachine> RegisterLE(
+      getTheAArch64leTarget());
+  RegisterTargetMachine<AArch64beTargetMachine> RegisterBE(
+      getTheAArch64beTarget());
+  RegisterTargetMachine<AArch64leTargetMachine> RegisterARM64(
+      getTheARM64Target());
+
+  // Initialize every pass against the same registry instance.
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+  initializeGlobalISel(Registry);
+  initializeAArch64A53Fix835769Pass(Registry);
+  initializeAArch64A57FPLoadBalancingPass(Registry);
+  initializeAArch64AddressTypePromotionPass(Registry);
+  initializeAArch64AdvSIMDScalarPass(Registry);
+  initializeAArch64CollectLOHPass(Registry);
+  initializeAArch64ConditionalComparesPass(Registry);
+  initializeAArch64ConditionOptimizerPass(Registry);
+  initializeAArch64DeadRegisterDefinitionsPass(Registry);
+  initializeAArch64ExpandPseudoPass(Registry);
+  initializeAArch64LoadStoreOptPass(Registry);
+  initializeAArch64VectorByElementOptPass(Registry);
+  initializeAArch64PromoteConstantPass(Registry);
+  initializeAArch64RedundantCopyEliminationPass(Registry);
+  initializeAArch64StorePairSuppressPass(Registry);
+  initializeLDTLSCleanupPass(Registry);
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+// Select the object-file lowering for the triple: ELF unless the triple's
+// binary format is Mach-O.
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  if (!TT.isOSBinFormatMachO())
+    return make_unique<AArch64_ELFTargetObjectFile>();
+
+  return make_unique<AArch64_MachoTargetObjectFile>();
+}
+
+// Build the DataLayout string for the target machine.
+//
+// The ILP32 ABI and Mach-O cases each return a fixed string before
+// LittleEndian is consulted; only the remaining case picks between the
+// "e-" and "E-" variants based on LittleEndian.
+static std::string computeDataLayout(const Triple &TT,
+                                     const MCTargetOptions &Options,
+                                     bool LittleEndian) {
+  const bool IsILP32 = Options.getABIName() == "ilp32";
+  if (IsILP32)
+    return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128";
+
+  if (TT.isOSBinFormatMachO())
+    return "e-m:o-i64:64-i128:128-n32:64-S128";
+
+  return LittleEndian
+             ? "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+             : "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
+}
+
+// Resolve the relocation model actually used, given the triple and the
+// (optional) model requested by the caller.
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  // AArch64 Darwin is always PIC, whatever was requested.
+  if (TT.isOSDarwin())
+    return Reloc::PIC_;
+
+  // An explicit request other than DynamicNoPIC is honoured unchanged.
+  if (RM.hasValue() && *RM != Reloc::DynamicNoPIC)
+    return *RM;
+
+  // No request, or DynamicNoPIC: on ELF platforms the default static
+  // relocation model has a smart enough linker to cope with referencing
+  // external symbols defined in a shared library, so DynamicNoPIC does not
+  // need to be promoted to PIC.
+  return Reloc::Static;
+}
+
+/// Create an AArch64 architecture model.
+///
+/// The base LLVMTargetMachine is constructed with the data layout string from
+/// computeDataLayout() and the relocation model from getEffectiveRelocModel();
+/// TLOF is created from the target triple and isLittle records the requested
+/// endianness. initAsmInfo() is called once the members are set up.
+AArch64TargetMachine::AArch64TargetMachine(
+ const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian)
+ // The data layout string is computed up front by computeDataLayout() so
+ // that the base class is constructed with a fully-formed layout.
+ : LLVMTargetMachine(T, computeDataLayout(TT, Options.MCOptions,
+ LittleEndian),
+ TT, CPU, FS, Options,
+ getEffectiveRelocModel(TT, RM), CM, OL),
+ TLOF(createTLOF(getTargetTriple())),
+ isLittle(LittleEndian) {
+ initAsmInfo();
+}
+
+// Out-of-line destructor; there is no cleanup beyond what the members do.
+AArch64TargetMachine::~AArch64TargetMachine() = default;
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+/// GISelAccessor implementation that owns the GlobalISel helper objects (call
+/// lowering, instruction selector, legalizer info and register bank info) and
+/// exposes them through the accessor's getters. Only compiled when
+/// LLVM_BUILD_GLOBAL_ISEL is defined.
+struct AArch64GISelActualAccessor : public GISelAccessor {
+ // Owned helper objects; each getter below returns the corresponding raw
+ // pointer, or null while the member has not been set.
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+} // End anonymous namespace.
+#endif
+
+// Returns the subtarget to use for function F, creating and caching it on
+// first use. The CPU and feature strings are taken from F's "target-cpu" and
+// "target-features" attributes, falling back to TargetCPU / TargetFS, and the
+// cache (SubtargetMap) is keyed by the concatenation CPU + FS.
+const AArch64Subtarget *
+AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ // Use the attribute's string value unless hasAttribute(Attribute::None)
+ // holds for it, in which case the target machine's default is used.
+ std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString().str()
+ : TargetCPU;
+ std::string FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString().str()
+ : TargetFS;
+
+ // operator[] inserts an empty unique_ptr for a new key; I refers to that
+ // slot and is filled in below when it is still null.
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
+ isLittle);
+ // Without GlobalISel a plain GISelAccessor is attached; with it, an
+ // AArch64GISelActualAccessor is created and populated first.
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ AArch64GISelActualAccessor *GISel =
+ new AArch64GISelActualAccessor();
+ GISel->CallLoweringInfo.reset(
+ new AArch64CallLowering(*I->getTargetLowering()));
+ GISel->Legalizer.reset(new AArch64LegalizerInfo());
+
+ auto *RBI = new AArch64RegisterBankInfo(*I->getRegisterInfo());
+
+ // FIXME: At this point, we can't rely on Subtarget having RBI.
+ // It's awkward to mix passing RBI and the Subtarget; should we pass
+ // TII/TRI as well?
+ GISel->InstSelector.reset(new AArch64InstructionSelector(*this, *I, *RBI));
+
+ // RBI is handed to the accessor only after the selector has been built
+ // with a reference to it.
+ GISel->RegBankInfo.reset(RBI);
+#endif
+ // NOTE(review): GISel is allocated with new and passed by reference;
+ // presumably setGISelAccessor takes ownership - confirm in AArch64Subtarget.
+ I->setGISelAccessor(*GISel);
+ }
+ return I.get();
+}
+
+// Out-of-line definition of the virtual anchor() method; the body is empty.
+void AArch64leTargetMachine::anchor() { }
+
+// Little-endian target machine: forwards every argument to the
+// AArch64TargetMachine constructor and passes true for its final
+// (LittleEndian) parameter.
+AArch64leTargetMachine::AArch64leTargetMachine(
+ const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
+ : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+// Out-of-line definition of the virtual anchor() method; the body is empty.
+void AArch64beTargetMachine::anchor() { }
+
+// Big-endian target machine: forwards every argument to the
+// AArch64TargetMachine constructor and passes false for its final
+// (LittleEndian) parameter.
+AArch64beTargetMachine::AArch64beTargetMachine(
+ const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
+ : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+namespace {
+/// AArch64 Code Generator Pass Configuration Options.
+///
+/// Overrides the TargetPassConfig hooks declared below; their definitions
+/// follow later in this file.
+class AArch64PassConfig : public TargetPassConfig {
+public:
+ // When the target machine is optimizing (opt level is not None), the
+ // PostRAScheduler pass is substituted by the PostMachineScheduler pass.
+ AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+ }
+
+ // Convenience accessor returning the target machine as its AArch64 type.
+ AArch64TargetMachine &getAArch64TargetMachine() const {
+ return getTM<AArch64TargetMachine>();
+ }
+
+ // Builds the generic live-interval scheduler DAG and adds the load
+ // clustering, store clustering and macro fusion mutations to it, in that
+ // order.
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+ return DAG;
+ }
+
+ void addIRPasses() override;
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ // GlobalISel pipeline hooks, only present in GlobalISel-enabled builds.
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
+#endif
+ bool addILPOpts() override;
+ void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+// Returns a TargetIRAnalysis whose callback wraps an AArch64TTIImpl, built
+// from this target machine and the queried function, in a
+// TargetTransformInfo.
+TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
+  auto MakeTTI = [this](const Function &F) {
+    return TargetTransformInfo(AArch64TTIImpl(this, F));
+  };
+  return TargetIRAnalysis(MakeTTI);
+}
+
+// Creates the AArch64 pass configuration bound to this target machine and the
+// given pass manager. The object is heap-allocated and returned as a raw
+// pointer.
+TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
+  AArch64PassConfig *Config = new AArch64PassConfig(this, PM);
+  return Config;
+}
+
+// Adds the AArch64-specific IR passes around the generic
+// TargetPassConfig::addIRPasses() sequence.
+void AArch64PassConfig::addIRPasses() {
+  const bool Optimizing = TM->getOptLevel() != CodeGenOpt::None;
+
+  // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
+  // ourselves.
+  addPass(createAtomicExpandPass(TM));
+
+  // Cmpxchg instructions are often used with a subsequent comparison to
+  // determine whether it succeeded. We can exploit existing control-flow in
+  // ldrex/strex loops to simplify this, but it needs tidying up.
+  if (Optimizing && EnableAtomicTidy)
+    addPass(createCFGSimplificationPass());
+
+  // Run LoopDataPrefetch before LSR to remove the multiplies involved in
+  // computing the pointer values N iterations ahead.
+  if (Optimizing && EnableLoopDataPrefetch)
+    addPass(createLoopDataPrefetchPass());
+
+  TargetPassConfig::addIRPasses();
+
+  // Match interleaved memory accesses to ldN/stN intrinsics.
+  if (Optimizing)
+    addPass(createInterleavedAccessPass(TM));
+
+  // The GEP lowering sequence below is only added at the Aggressive level and
+  // when EnableGEPOpt is set.
+  if (TM->getOptLevel() != CodeGenOpt::Aggressive || !EnableGEPOpt)
+    return;
+
+  // Call SeparateConstOffsetFromGEP pass to extract constants within indices
+  // and lower a GEP with multiple indices to either arithmetic operations or
+  // multiple GEPs with single index.
+  addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+  // Call EarlyCSE pass to find and remove subexpressions in the lowered
+  // result.
+  addPass(createEarlyCSEPass());
+  // Do loop invariant code motion in case part of the lowered result is
+  // invariant.
+  addPass(createLICMPass());
+}
+
+// Pass Pipeline Configuration
+//
+// Adds the passes that run just before instruction selection. Always returns
+// false.
+bool AArch64PassConfig::addPreISel() {
+  const bool Optimizing = TM->getOptLevel() != CodeGenOpt::None;
+
+  // Run promote constant before global merge, so that the promoted constants
+  // get a chance to be merged
+  if (Optimizing && EnablePromoteConstant)
+    addPass(createAArch64PromoteConstantPass());
+
+  // Global merge runs when explicitly requested, or by default (option left
+  // unset) whenever we are optimizing.
+  const bool MergeUnset = EnableGlobalMerge == cl::BOU_UNSET;
+  const bool MergeRequested = EnableGlobalMerge == cl::BOU_TRUE;
+  if ((Optimizing && MergeUnset) || MergeRequested) {
+    // FIXME: On AArch64, this depends on the type.
+    // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
+    // and the offset has to be a multiple of the related size in bytes.
+    const bool OnlyOptimizeForSize =
+        (TM->getOptLevel() < CodeGenOpt::Aggressive) && MergeUnset;
+    addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize));
+  }
+
+  if (Optimizing && EnableAddressTypePromotion)
+    addPass(createAArch64AddressTypePromotionPass());
+
+  return false;
+}
+
+// Adds the SelectionDAG instruction selector and, for optimized ELF builds,
+// the local-dynamic TLS cleanup pass. Always returns false.
+bool AArch64PassConfig::addInstSelector() {
+  addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel()));
+
+  // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many
+  // references to _TLS_MODULE_BASE_ as possible.
+  const bool IsELF = TM->getTargetTriple().isOSBinFormatELF();
+  const bool Optimizing = getOptLevel() != CodeGenOpt::None;
+  if (IsELF && Optimizing)
+    addPass(createAArch64CleanupLocalDynamicTLSPass());
+
+  return false;
+}
+
+// GlobalISel pipeline hooks. Each one adds a single freshly allocated pass and
+// returns false. Only compiled when LLVM_BUILD_GLOBAL_ISEL is defined.
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+// Adds the IRTranslator pass.
+bool AArch64PassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+// Adds the Legalizer pass.
+bool AArch64PassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
+ return false;
+}
+// Adds the RegBankSelect pass.
+bool AArch64PassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+// Adds the InstructionSelect pass.
+bool AArch64PassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
+ return false;
+}
+#endif
+
+// Adds the ILP optimization passes. Each pass except the last is gated by its
+// own Enable* option, and the passes are added in the order written here.
+// Always returns true.
+bool AArch64PassConfig::addILPOpts() {
+ if (EnableCondOpt)
+ addPass(createAArch64ConditionOptimizerPass());
+ if (EnableCCMP)
+ addPass(createAArch64ConditionalCompares());
+ if (EnableMCR)
+ addPass(&MachineCombinerID);
+ if (EnableEarlyIfConversion)
+ addPass(&EarlyIfConverterID);
+ if (EnableStPairSuppress)
+ addPass(createAArch64StorePairSuppressPass());
+ // The vector-by-element pass is added unconditionally.
+ addPass(createAArch64VectorByElementOptPass());
+ return true;
+}
+
+// Adds the passes that run before register allocation. Nothing is added when
+// the optimization level is None.
+void AArch64PassConfig::addPreRegAlloc() {
+  if (TM->getOptLevel() == CodeGenOpt::None)
+    return;
+
+  // Change dead register definitions to refer to the zero register.
+  if (EnableDeadRegisterElimination)
+    addPass(createAArch64DeadRegisterDefinitions());
+
+  // Use AdvSIMD scalar instructions whenever profitable.
+  if (EnableAdvSIMDScalar) {
+    addPass(createAArch64AdvSIMDScalar());
+    // The AdvSIMD pass may produce copies that can be rewritten to
+    // be register coalescer friendly.
+    addPass(&PeepholeOptimizerID);
+  }
+}
+
+// Adds the passes that run right after register allocation. Nothing is added
+// when the optimization level is None.
+void AArch64PassConfig::addPostRegAlloc() {
+  if (TM->getOptLevel() == CodeGenOpt::None)
+    return;
+
+  // Remove redundant copy instructions.
+  if (EnableRedundantCopyElimination)
+    addPass(createAArch64RedundantCopyEliminationPass());
+
+  // Improve performance for some FP/SIMD code for A57. Only added when the
+  // default register allocator is in use.
+  if (usingDefaultRegAlloc())
+    addPass(createAArch64A57FPLoadBalancing());
+}
+
+// Adds the passes that run before the second scheduling phase.
+void AArch64PassConfig::addPreSched2() {
+  // Expand some pseudo instructions to allow proper scheduling. This pass is
+  // added at every optimization level.
+  addPass(createAArch64ExpandPseudoPass());
+
+  // Use load/store pair instructions when possible.
+  const bool Optimizing = TM->getOptLevel() != CodeGenOpt::None;
+  if (Optimizing && EnableLoadStoreOpt)
+    addPass(createAArch64LoadStoreOptimizationPass());
+}
+
+// Adds the final passes that run just before code emission.
+void AArch64PassConfig::addPreEmitPass() {
+  if (EnableA53Fix835769)
+    addPass(createAArch64A53Fix835769());
+
+  // Relax conditional branch instructions if they're otherwise out of
+  // range of their destination.
+  if (BranchRelaxation)
+    addPass(&BranchRelaxationPassID);
+
+  // The LOH collection pass is only added for optimized Mach-O output.
+  const bool Optimizing = TM->getOptLevel() != CodeGenOpt::None;
+  const bool IsMachO = TM->getTargetTriple().isOSBinFormatMachO();
+  if (Optimizing && EnableCollectLOH && IsMachO)
+    addPass(createAArch64CollectLOHPass());
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
new file mode 100644
index 000000000000..6fa5e83957e1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -0,0 +1,76 @@
+//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AArch64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// Common base of the little- and big-endian AArch64 target machines. Owns the
+/// object-file lowering and a per-function cache of subtargets.
+class AArch64TargetMachine : public LLVMTargetMachine {
+protected:
+ // Object-file lowering owned by this target machine; exposed through
+ // getObjFileLowering().
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ // Cache of subtargets keyed by a string; mutable so that the const
+ // getSubtargetImpl() can populate it.
+ mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap;
+
+public:
+ AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool IsLittleEndian);
+
+ ~AArch64TargetMachine() override;
+ // Returns the subtarget to use for function F.
+ const AArch64Subtarget *getSubtargetImpl(const Function &F) const override;
+
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ /// \brief Get the TargetIRAnalysis for this target.
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+ // Returns a non-owning pointer to the object-file lowering held in TLOF.
+ TargetLoweringObjectFile* getObjFileLowering() const override {
+ return TLOF.get();
+ }
+
+private:
+ // Endianness flag recorded at construction time.
+ bool isLittle;
+};
+
+// AArch64 little endian target machine.
+//
+// Adds no state of its own; it declares a virtual anchor() and a constructor
+// that omits the endianness parameter of the base-class constructor.
+class AArch64leTargetMachine : public AArch64TargetMachine {
+ virtual void anchor();
+public:
+ AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+// AArch64 big endian target machine.
+//
+// Adds no state of its own; it declares a virtual anchor() and a constructor
+// that omits the endianness parameter of the base-class constructor.
+class AArch64beTargetMachine : public AArch64TargetMachine {
+ virtual void anchor();
+public:
+ AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
new file mode 100644
index 000000000000..8875f9b72647
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -0,0 +1,72 @@
+//===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetObjectFile.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Dwarf.h"
+using namespace llvm;
+using namespace dwarf;
+
+// Runs the generic ELF initialization first, then calls InitializeELF with the
+// UseInitArray setting taken from the target machine's options.
+void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+// Default-constructs the Mach-O base and clears SupportGOTPCRelWithOffset;
+// getIndirectSymViaGOTPCRel() in this file asserts that the extra offset is
+// zero.
+AArch64_MachoTargetObjectFile::AArch64_MachoTargetObjectFile()
+ : TargetLoweringObjectFileMachO() {
+ SupportGOTPCRelWithOffset = false;
+}
+
+// Builds the expression used to reference GV under the given DW_EH_PE_*
+// Encoding.
+const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  // Encodings with neither the indirect nor the pcrel bit set are left to the
+  // generic Mach-O implementation.
+  const unsigned GOTRelativeBits = DW_EH_PE_indirect | DW_EH_PE_pcrel;
+  if ((Encoding & GOTRelativeBits) == 0)
+    return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+        GV, Encoding, TM, MMI, Streamer);
+
+  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+  // is an indirect pc-relative reference. The default implementation
+  // won't reference using the GOT, so we need this target-specific
+  // version.
+  const MCSymbol *Target = TM.getSymbol(GV);
+  const MCExpr *GOTRef =
+      MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_GOT, getContext());
+
+  // Emit a temporary label at the current position to serve as the "." in
+  // foo@GOT-. and subtract it from the GOT reference.
+  MCSymbol *Here = getContext().createTempSymbol();
+  Streamer.EmitLabel(Here);
+  const MCExpr *HereRef = MCSymbolRefExpr::create(Here, getContext());
+  return MCBinaryExpr::createSub(GOTRef, HereRef, getContext());
+}
+
+// Returns the symbol the target machine assigns to GV; MMI is not used.
+MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol(
+    const GlobalValue *GV, const TargetMachine &TM,
+    MachineModuleInfo *MMI) const {
+  MCSymbol *Personality = TM.getSymbol(GV);
+  return Personality;
+}
+
+// Builds the expression "Sym@GOT - <label emitted here>". The combined offset
+// (Offset plus MV's constant) must be zero, which is asserted.
+const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
+    const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  assert((Offset+MV.getConstant() == 0) &&
+         "Arch64 does not support GOT PC rel with extra offset");
+
+  // On ARM64 Darwin, we can reference symbols with foo@GOT-., which
+  // is an indirect pc-relative reference.
+  MCContext &Ctx = getContext();
+  const MCExpr *GOTRef =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
+
+  // A temporary label marks the current position and is subtracted from the
+  // GOT reference.
+  MCSymbol *Here = Ctx.createTempSymbol();
+  Streamer.EmitLabel(Here);
+  const MCExpr *HereRef = MCSymbolRefExpr::create(Here, Ctx);
+  return MCBinaryExpr::createSub(GOTRef, HereRef, Ctx);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
new file mode 100644
index 000000000000..05e1dfa9e6c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -0,0 +1,47 @@
+//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+class AArch64TargetMachine;
+
+/// This implementation is used for AArch64 ELF targets (Linux in particular).
+/// It only overrides Initialize(); note that the override is declared in the
+/// class's default (private) section.
+class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
+/// It overrides three hooks of the generic Mach-O lowering; their definitions
+/// live in AArch64TargetObjectFile.cpp.
+class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+public:
+ AArch64_MachoTargetObjectFile();
+
+ // Expression used to reference GV under the given DW_EH_PE_* Encoding.
+ const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+ unsigned Encoding,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+
+ // Symbol used for the personality routine GV.
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI) const override;
+
+ // GOT-based, pc-relative reference to Sym.
+ const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+ const MCValue &MV, int64_t Offset,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
new file mode 100644
index 000000000000..88c98865bbc6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -0,0 +1,643 @@
+//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetTransformInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64tti"
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+int AArch64TTIImpl::getIntImmCost(int64_t Val) {
+  // Zero and logical immediates can be encoded within an instruction.
+  const bool IsEncodable = Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64);
+  if (IsEncodable)
+    return 0;
+
+  // Negative values are costed through their bitwise complement.
+  const int64_t Magnitude = Val < 0 ? ~Val : Val;
+
+  // One move is needed for every started group of 16 significant bits.
+  unsigned LZ = countLeadingZeros((uint64_t)Magnitude);
+  const unsigned SignificantBits = 64 - LZ;
+  return (SignificantBits + 15) / 16;
+}
+
+/// \brief Calculate the cost of materializing the given constant.
+///
+/// The constant is costed in 64-bit chunks; the result is the sum of the
+/// per-chunk costs, but never less than one.
+int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  const unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  // Sign-extend all constants to a multiple of 64-bit.
+  const bool NeedsWidening = (BitSize & 0x3f) != 0;
+  const APInt ImmVal =
+      NeedsWidening ? Imm.sext((BitSize + 63) & ~0x3fU) : Imm;
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  int Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    const int64_t Chunk =
+        ImmVal.ashr(ShiftVal).sextOrTrunc(64).getSExtValue();
+    Cost += getIntImmCost(Chunk);
+  }
+
+  // We need at least one instruction to materialize the constant.
+  return std::max(1, Cost);
+}
+
+/// Cost of the immediate Imm when it is operand number Idx of an instruction
+/// with the given Opcode. Opcodes not listed in the switch return TCC_Free.
+/// For the operand recorded in ImmIdx the materialization cost is reported as
+/// free when it does not exceed one basic instruction per 64-bit chunk; every
+/// other operand that reaches the end of the function gets the full
+/// materialization cost.
+int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TTI::TCC_Free;
+
+ // ImmIdx is the operand index that gets the "free if cheap" treatment
+ // below; ~0U means no operand does.
+ unsigned ImmIdx = ~0U;
+ switch (Opcode) {
+ default:
+ return TTI::TCC_Free;
+ case Instruction::GetElementPtr:
+ // Always hoist the base address of a GetElementPtr.
+ if (Idx == 0)
+ return 2 * TTI::TCC_Basic;
+ return TTI::TCC_Free;
+ case Instruction::Store:
+ ImmIdx = 0;
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ ImmIdx = 1;
+ break;
+ // Always return TCC_Free for the shift value of a shift instruction.
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Idx == 1)
+ return TTI::TCC_Free;
+ break;
+ // These opcodes leave ImmIdx unset, so any immediate operand is charged
+ // the full materialization cost at the end of the function.
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ case Instruction::BitCast:
+ case Instruction::PHI:
+ case Instruction::Call:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ break;
+ }
+
+ if (Idx == ImmIdx) {
+ // NumConstants is the number of 64-bit chunks needed to hold BitSize bits.
+ int NumConstants = (BitSize + 63) / 64;
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ return (Cost <= NumConstants * TTI::TCC_Basic)
+ ? static_cast<int>(TTI::TCC_Free)
+ : Cost;
+ }
+ return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+/// Cost of the immediate Imm when it is argument number Idx of a call to the
+/// intrinsic IID. Intrinsics not listed in the switch return TCC_Free; listed
+/// ones that do not return early are charged the full materialization cost.
+int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TTI::TCC_Free;
+
+ switch (IID) {
+ default:
+ return TTI::TCC_Free;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ // The second argument is free when its materialization cost does not
+ // exceed one basic instruction per 64-bit chunk.
+ if (Idx == 1) {
+ int NumConstants = (BitSize + 63) / 64;
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ return (Cost <= NumConstants * TTI::TCC_Basic)
+ ? static_cast<int>(TTI::TCC_Free)
+ : Cost;
+ }
+ break;
+ // NOTE(review): isInt<64> applied to the result of getSExtValue() looks like
+ // it is always true, which would leave only the bit-width test - confirm.
+ case Intrinsic::experimental_stackmap:
+ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TTI::TCC_Free;
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TTI::TCC_Free;
+ break;
+ }
+ return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+// Reports the kind of population-count support for integers of TyWidth bits:
+// fast hardware support for 32 and 64 bits, software otherwise.
+TargetTransformInfo::PopcntSupportKind
+AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+
+  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
+  const bool HasFastPopcnt = TyWidth == 32 || TyWidth == 64;
+  return HasFastPopcnt ? TTI::PSK_FastHardware : TTI::PSK_Software;
+}
+
+/// Cost of the cast Opcode from type Src to type Dst. When both types map to
+/// simple value types, the cost is looked up in the table below, keyed by
+/// (ISD opcode, destination type, source type); anything not found there
+/// falls back to the base implementation.
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
+
+ // The table is indexed by simple value types only.
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+ // Each entry is { ISD opcode, destination type, source type, cost }.
+ static const TypeConversionCostTblEntry
+ ConversionTbl[] = {
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+
+ // The number of shll instructions for the extension.
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+
+ // LowerVectorINT_TO_FP:
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+
+ // Complex: to v2f32
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
+
+ // Complex: to v4f32
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+
+ // Complex: to v8f32
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+
+ // Complex: to v16f32
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+
+ // Complex: to v2f64
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+
+
+ // LowerVectorFP_TO_INT
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+
+ // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
+
+ // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
+
+ // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
+ };
+
+ if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+
+ // No table entry: defer to the base implementation.
+ return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
+/// Cost of extracting element Index from a vector of type VecTy and extending
+/// it to Dst with Opcode, which must be SExt or ZExt. The result is the
+/// extract cost alone when the extension comes for free, and the extract cost
+/// plus the default cast cost otherwise.
+int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
+ VectorType *VecTy,
+ unsigned Index) {
+
+ // Make sure we were given a valid extend opcode.
+ assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
+ "Invalid opcode");
+
+ // We are extending an element we extract from a vector, so the source type
+ // of the extend is the element type of the vector.
+ auto *Src = VecTy->getElementType();
+
+ // Sign- and zero-extends are for integer types only.
+ assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
+
+ // Get the cost for the extract. We compute the cost (if any) for the extend
+ // below.
+ auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
+
+ // Legalize the types.
+ auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
+ auto DstVT = TLI->getValueType(DL, Dst);
+ auto SrcVT = TLI->getValueType(DL, Src);
+
+ // If the resulting type is still a vector and the destination type is legal,
+ // we may get the extension for free. If not, get the default cost for the
+ // extend.
+ if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
+ return Cost + getCastInstrCost(Opcode, Dst, Src);
+
+ // The destination type should be larger than the element type. If not, get
+ // the default cost for the extend.
+ if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
+ return Cost + getCastInstrCost(Opcode, Dst, Src);
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Opcode should be either SExt or ZExt");
+
+ // For sign-extends, we only need a smov, which performs the extension
+ // automatically.
+ case Instruction::SExt:
+ return Cost;
+
+ // For zero-extends, the extend is performed automatically by a umov unless
+ // the destination type is i64 and the element type is i8 or i16.
+ case Instruction::ZExt:
+ if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
+ return Cost;
+ // Otherwise control leaves the switch and the default cast cost is added
+ // below.
+ }
+
+ // If we are unable to perform the extend for free, get the default cost.
+ return Cost + getCastInstrCost(Opcode, Dst, Src);
+}
+
+// Cost of inserting into or extracting from lane Index of the vector type Val.
+// When Index is -1U the per-lane checks are skipped and the subtarget's base
+// insert/extract cost is returned directly.
+int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                       unsigned Index) {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  const bool HasLaneIndex = Index != -1U;
+  if (HasLaneIndex) {
+    // Legalize the type.
+    const std::pair<int, MVT> Legalized = TLI->getTypeLegalizationCost(DL, Val);
+    const MVT LegalTy = Legalized.second;
+
+    // This type is legalized to a scalar type.
+    if (!LegalTy.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    const unsigned Lane = Index % LegalTy.getVectorNumElements();
+
+    // The element at index zero is already inside the vector.
+    if (Lane == 0)
+      return 0;
+  }
+
+  // All other insert/extracts cost this much.
+  return ST->getVectorInsertExtractBaseCost();
+}
+
+/// Cost of the arithmetic instruction Opcode on type Ty. Signed division by a
+/// uniform power-of-two constant is costed as the sum of four simpler
+/// operations; ADD, MUL, XOR, OR and AND cost one instruction per legalized
+/// part; everything else is delegated to the base implementation.
+int AArch64TTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ if (ISD == ISD::SDIV &&
+ Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+ // On AArch64, scalar signed division by constants power-of-two are
+ // normally expanded to the sequence ADD + CMP + SELECT + SRA.
+ // The OperandValue properties may not be the same as that of the previous
+ // operation; conservatively assume OP_None.
+ // Note that the CMP step is costed as an Instruction::Sub below.
+ int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ return Cost;
+ }
+
+ switch (ISD) {
+ default:
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::XOR:
+ case ISD::OR:
+ case ISD::AND:
+ // These nodes are marked as 'custom' for combining purposes only.
+ // We know that they are legal. See LowerAdd in ISelLowering.
+ // LT.first is the first element of the type legalization cost pair.
+ return 1 * LT.first;
+ }
+}
+
+// Cost of computing an address for an access of type Ty: 10 for complex
+// vector address computations, 1 otherwise.
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  const unsigned NumVectorInstToHideOverhead = 10;
+
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode, hence the cost of 1 for everything else.
+  const bool IsComplexVectorAccess = Ty->isVectorTy() && IsComplex;
+  if (!IsComplexVectorAccess)
+    return 1;
+
+  return NumVectorInstToHideOverhead;
+}
+
+ /// Cost of a compare or select \p Opcode producing \p ValTy under condition
+ /// type \p CondTy. Only a fixed set of wide vector selects is priced here;
+ /// every other query is answered by BaseT.
+ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                        Type *CondTy) {
+   const int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
+   if (!ValTy->isVectorTy() || ISDOpcode != ISD::SELECT)
+     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+
+   // Some vector selects wider than the register width are not lowered well.
+   // For the 64-bit element cases we would need this many instructions to
+   // hide the scalarization happening.
+   const int AmortizationCost = 20;
+   static const TypeConversionCostTblEntry WideSelectCosts[] = {
+     { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
+     { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
+     { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
+     { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
+     { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
+     { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
+   };
+
+   EVT CondVT = TLI->getValueType(DL, CondTy);
+   EVT ValVT = TLI->getValueType(DL, ValTy);
+   if (CondVT.isSimple() && ValVT.isSimple()) {
+     const auto *Hit = ConvertCostTableLookup(
+         WideSelectCosts, ISDOpcode, CondVT.getSimpleVT(), ValVT.getSimpleVT());
+     if (Hit)
+       return Hit->Cost;
+   }
+
+   // Not one of the special cases above.
+   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ }
+
+ /// Cost of the load or store \p Opcode of type \p Src.
+ ///
+ /// \param Alignment alignment of the access, in bytes.
+ /// \param AddressSpace unused by this implementation.
+ /// \returns an inflated cost for under-aligned stores of vectors with 64-bit
+ /// integer elements on subtargets where misaligned 128-bit stores are slow,
+ /// an inflated cost for i8 vectors with fewer than 8 elements, and the
+ /// legalization cost of \p Src otherwise.
+ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                     unsigned Alignment, unsigned AddressSpace) {
+   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+
+   // Only alignments below 16 bytes make a 128-bit store misaligned. An
+   // over-aligned store (32, 64, ...) is still 16-byte aligned and must not be
+   // penalized, hence '<' rather than '!='.
+   if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
+       Src->isVectorTy() && Alignment < 16 &&
+       Src->getVectorElementType()->isIntegerTy(64)) {
+     // Unaligned stores are extremely inefficient. We don't split
+     // unaligned v2i64 stores because of the negative impact that has shown in
+     // practice on inlined memcpy code.
+     // We make v2i64 stores expensive so that we will only vectorize if there
+     // are 6 other instructions getting vectorized.
+     int AmortizationCost = 6;
+
+     return LT.first * 2 * AmortizationCost;
+   }
+
+   if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
+       Src->getVectorNumElements() < 8) {
+     // We scalarize the loads/stores because there is no v.4b register and we
+     // have to promote the elements to v.4h.
+     unsigned NumVecElts = Src->getVectorNumElements();
+     unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+     // We generate 2 instructions per vector element.
+     return NumVectorizableInstsToAmortize * NumVecElts * 2;
+   }
+
+   return LT.first;
+ }
+
+ /// Cost of an interleaved load/store of the wide vector \p VecTy with
+ /// interleave factor \p Factor.
+ ///
+ /// \returns \p Factor when \p Factor is supported by the target, divides the
+ /// element count of \p VecTy evenly, and each resulting sub-vector is 64 or
+ /// 128 bits wide; otherwise the cost computed by BaseT.
+ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                                unsigned Factor,
+                                                ArrayRef<unsigned> Indices,
+                                                unsigned Alignment,
+                                                unsigned AddressSpace) {
+   assert(Factor >= 2 && "Invalid interleave factor");
+   assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+   if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+     unsigned NumElts = VecTy->getVectorNumElements();
+
+     // Check divisibility before forming the sub-vector type: when NumElts is
+     // smaller than Factor the quotient is zero, and a zero-element vector
+     // type must not be requested.
+     if (NumElts % Factor == 0) {
+       Type *SubVecTy =
+           VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+       unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+
+       // ldN/stN only support legal vector types of size 64 or 128 in bits.
+       if (SubVecSize == 64 || SubVecSize == 128)
+         return Factor;
+     }
+   }
+
+   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                            Alignment, AddressSpace);
+ }
+
+ /// Sum, over every 128-bit vector type in \p Tys, the cost of one store plus
+ /// one load of that type. Non-vector types and vectors of any other total
+ /// width contribute nothing.
+ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
+   int Total = 0;
+   for (Type *Ty : Tys) {
+     const bool Is128BitVector =
+         Ty->isVectorTy() &&
+         Ty->getScalarSizeInBits() * Ty->getVectorNumElements() == 128;
+     if (!Is128BitVector)
+       continue;
+
+     // NOTE(review): 128 is passed as the Alignment argument of
+     // getMemoryOpCost; confirm whether bytes or bits were intended.
+     Total += getMemoryOpCost(Instruction::Store, Ty, 128, 0);
+     Total += getMemoryOpCost(Instruction::Load, Ty, 128, 0);
+   }
+   return Total;
+ }
+
+ /// Maximum interleave factor, as reported by the subtarget. \p VF is ignored.
+ unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+   const AArch64Subtarget *Subtarget = getST();
+   return Subtarget->getMaxInterleaveFactor();
+ }
+
+ /// Fill \p UP with the unrolling preferences for loop \p L: start from the
+ /// BaseT preferences, double the partial-unrolling threshold for nested
+ /// loops, and zero the partial threshold used when optimizing for size.
+ void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
+                                              TTI::UnrollingPreferences &UP) {
+   // Start from the preferences computed by the base implementation.
+   BaseT::getUnrollingPreferences(L, UP);
+
+   // An inner loop is more likely to be hot, and its runtime check can be
+   // hoisted by LICM so the overhead is smaller; use a larger threshold to
+   // unroll more of these loops.
+   const bool IsNestedLoop = L->getLoopDepth() > 1;
+   if (IsNestedLoop)
+     UP.PartialThreshold = UP.PartialThreshold * 2;
+
+   // Disable partial & runtime unrolling on -Os.
+   UP.PartialOptSizeThreshold = 0;
+ }
+
+ /// Produce a value of type \p ExpectedType describing the data moved by the
+ /// NEON structured load/store intrinsic \p Inst.
+ ///
+ /// For ld2/ld3/ld4 the intrinsic itself is returned when its type matches.
+ /// For st2/st3/st4 a struct is assembled, in front of \p Inst, from the stored
+ /// operands when they match the element types of \p ExpectedType.
+ /// \returns nullptr for any other intrinsic or on a type mismatch.
+ Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                          Type *ExpectedType) {
+   switch (Inst->getIntrinsicID()) {
+   case Intrinsic::aarch64_neon_ld2:
+   case Intrinsic::aarch64_neon_ld3:
+   case Intrinsic::aarch64_neon_ld4:
+     return Inst->getType() == ExpectedType ? Inst : nullptr;
+
+   case Intrinsic::aarch64_neon_st2:
+   case Intrinsic::aarch64_neon_st3:
+   case Intrinsic::aarch64_neon_st4: {
+     // The expected result must be a struct with one element per stored value.
+     auto *StructTy = dyn_cast<StructType>(ExpectedType);
+     if (!StructTy)
+       return nullptr;
+
+     // Every argument except the last one is a stored value.
+     const unsigned NumValues = Inst->getNumArgOperands() - 1;
+     if (StructTy->getNumElements() != NumValues)
+       return nullptr;
+
+     // Validate all element types before emitting any instruction.
+     for (unsigned Idx = 0; Idx < NumValues; ++Idx)
+       if (Inst->getArgOperand(Idx)->getType() != StructTy->getElementType(Idx))
+         return nullptr;
+
+     // Build the aggregate one insertvalue at a time, starting from undef.
+     IRBuilder<> Builder(Inst);
+     Value *Aggregate = UndefValue::get(ExpectedType);
+     for (unsigned Idx = 0; Idx < NumValues; ++Idx)
+       Aggregate =
+           Builder.CreateInsertValue(Aggregate, Inst->getArgOperand(Idx), Idx);
+     return Aggregate;
+   }
+
+   default:
+     return nullptr;
+   }
+ }
+
+ /// Describe the memory behaviour of the NEON structured load/store intrinsic
+ /// \p Inst in \p Info.
+ ///
+ /// \returns true and fills \p Info for ld2/ld3/ld4 and st2/st3/st4; returns
+ /// false and leaves \p Info untouched for every other intrinsic.
+ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                         MemIntrinsicInfo &Info) {
+   bool IsLoad = false;
+   switch (Inst->getIntrinsicID()) {
+   case Intrinsic::aarch64_neon_ld2:
+   case Intrinsic::aarch64_neon_ld3:
+   case Intrinsic::aarch64_neon_ld4:
+     IsLoad = true;
+     break;
+   case Intrinsic::aarch64_neon_st2:
+   case Intrinsic::aarch64_neon_st3:
+   case Intrinsic::aarch64_neon_st4:
+     IsLoad = false;
+     break;
+   default:
+     return false;
+   }
+
+   Info.ReadMem = IsLoad;
+   Info.WriteMem = !IsLoad;
+   Info.IsSimple = true;
+   Info.NumMemRefs = 1;
+   // Loads take the pointer as their first argument, stores as their last.
+   const unsigned PtrArgIdx = IsLoad ? 0 : Inst->getNumArgOperands() - 1;
+   Info.PtrVal = Inst->getArgOperand(PtrArgIdx);
+
+   // Loads and stores of the same element count share a matching id.
+   switch (Inst->getIntrinsicID()) {
+   case Intrinsic::aarch64_neon_ld2:
+   case Intrinsic::aarch64_neon_st2:
+     Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
+     break;
+   case Intrinsic::aarch64_neon_ld3:
+   case Intrinsic::aarch64_neon_st3:
+     Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
+     break;
+   case Intrinsic::aarch64_neon_ld4:
+   case Intrinsic::aarch64_neon_st4:
+     Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
+     break;
+   default:
+     break;
+   }
+   return true;
+ }
+
+ /// Cache line size, as reported by the subtarget.
+ unsigned AArch64TTIImpl::getCacheLineSize() {
+   const AArch64Subtarget *Subtarget = getST();
+   return Subtarget->getCacheLineSize();
+ }
+
+ /// Prefetch distance, as reported by the subtarget.
+ unsigned AArch64TTIImpl::getPrefetchDistance() {
+   const AArch64Subtarget *Subtarget = getST();
+   return Subtarget->getPrefetchDistance();
+ }
+
+ /// Minimum prefetch stride, as reported by the subtarget.
+ unsigned AArch64TTIImpl::getMinPrefetchStride() {
+   const AArch64Subtarget *Subtarget = getST();
+   return Subtarget->getMinPrefetchStride();
+ }
+
+ /// Maximum number of iterations to prefetch ahead, as reported by the
+ /// subtarget.
+ unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
+   const AArch64Subtarget *Subtarget = getST();
+   return Subtarget->getMaxPrefetchIterationsAhead();
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
new file mode 100644
index 000000000000..24642cb1698e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -0,0 +1,139 @@
+//===-- AArch64TargetTransformInfo.h - AArch64 specific TTI -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+ /// This file defines a TargetTransformInfo::Concept conforming object specific
+ /// to the AArch64 target machine. It uses the target's detailed information to
+ /// provide more precise answers to certain TTI queries, while letting the
+ /// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+
+namespace llvm {
+
+ /// AArch64 implementation of the TargetTransformInfo hooks. It derives from
+ /// BasicTTIImplBase parameterized on itself and answers queries using the
+ /// subtarget and target-lowering objects of the function it was created for.
+ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
+ typedef BasicTTIImplBase<AArch64TTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ // Subtarget of the function passed to the constructor.
+ const AArch64Subtarget *ST;
+ // Target lowering object obtained from ST.
+ const AArch64TargetLowering *TLI;
+
+ /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+ /// are set if the result needs to be inserted and/or extracted from vectors.
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+ // Accessors for the cached subtarget and target lowering objects.
+ const AArch64Subtarget *getST() const { return ST; }
+ const AArch64TargetLowering *getTLI() const { return TLI; }
+
+ // Values stored in MemIntrinsicInfo::MatchingId by getTgtMemIntrinsic; the
+ // ldN and stN intrinsics with the same N share one value.
+ enum MemIntrinsicType {
+ VECTOR_LDST_TWO_ELEMENTS,
+ VECTOR_LDST_THREE_ELEMENTS,
+ VECTOR_LDST_FOUR_ELEMENTS
+ };
+
+ public:
+ // Caches the subtarget of \p F and its target lowering; the data layout
+ // handed to the base class is the one of the module containing \p F.
+ explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ /// \name Scalar TTI Implementations
+ /// @{
+
+ // Keep the base-class overloads visible next to the ones declared here.
+ using BaseT::getIntImmCost;
+ int getIntImmCost(int64_t Val);
+ int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ // Interleaved access vectorization is always enabled for this target.
+ bool enableInterleavedAccessVectorization() { return true; }
+
+ // Reports 32 vector registers when NEON is available and none otherwise;
+ // 31 registers for the non-vector query.
+ unsigned getNumberOfRegisters(bool Vector) {
+ if (Vector) {
+ if (ST->hasNEON())
+ return 32;
+ return 0;
+ }
+ return 31;
+ }
+
+ // Reports 128-bit vector registers when NEON is available and 0 otherwise;
+ // 64 bits for the non-vector query.
+ unsigned getRegisterBitWidth(bool Vector) {
+ if (Vector) {
+ if (ST->hasNEON())
+ return 128;
+ return 0;
+ }
+ return 64;
+ }
+
+ unsigned getMaxInterleaveFactor(unsigned VF);
+
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+
+ int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
+ unsigned Index);
+
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+ // Operand kinds default to OK_AnyValue and operand properties to OP_None.
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+ int getAddressComputationCost(Type *Ty, bool IsComplex);
+
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+
+ int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
+
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+ Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+ Type *ExpectedType);
+
+ bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
+
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, unsigned Alignment,
+ unsigned AddressSpace);
+
+ // The four queries below forward to the subtarget.
+ unsigned getCacheLineSize();
+
+ unsigned getPrefetchDistance();
+
+ unsigned getMinPrefetchStride();
+
+ unsigned getMaxPrefetchIterationsAhead();
+ /// @}
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
new file mode 100644
index 000000000000..e3b1d7cea48d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -0,0 +1,371 @@
+//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs optimization for vector by element
+// SIMD instructions.
+//
+// Certain SIMD instructions with vector element operand are not efficient.
+// Rewrite them into SIMD instructions with vector operands. This rewrite
+// is driven by the latency of the instructions.
+//
+// Example:
+// fmla v0.4s, v1.4s, v2.s[1]
+// is rewritten into
+// dup v3.4s, v2.s[1]
+// fmla v0.4s, v1.4s, v3.4s
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-vectorbyelement-opt"
+
+STATISTIC(NumModifiedInstr,
+ "Number of vector by element instructions modified");
+
+#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
+ "AArch64 vector by element instruction optimization pass"
+
+namespace {
+
+ /// Machine function pass that rewrites indexed (vector by element) FMLA, FMLS,
+ /// FMUL and FMULX instructions into a DUP followed by the all-vector form when
+ /// the scheduling model says the replacement has lower latency.
+ struct AArch64VectorByElementOpt : public MachineFunctionPass {
+ static char ID;
+ AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
+ initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+ }
+
+ // The three members below are (re)initialized at the start of every
+ // runOnMachineFunction call.
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ TargetSchedModel SchedModel;
+
+ /// Based only on latency of instructions, determine if it is cost efficient
+ /// to replace the instruction InstDesc by the two instructions InstDescRep1
+ /// and InstDescRep2.
+ /// Return true if replacement is recommended.
+ bool
+ shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
+ const MCInstrDesc *InstDescRep1,
+ const MCInstrDesc *InstDescRep2,
+ std::map<unsigned, bool> &VecInstElemTable) const;
+
+ /// Determine if we need to exit the vector by element instruction
+ /// optimization pass early. This makes sure that Targets with no need
+ /// for this optimization do not spend any compile time on this pass.
+ /// This check is done by comparing the latency of an indexed FMLA
+ /// instruction to the latency of the DUP + the latency of a vector
+ /// FMLA instruction. We do not check on other related instructions such
+ /// as FMLS as we assume that if the situation shows up for one
+ /// instruction, then it is likely to show up for the related ones.
+ /// Return true if early exit of the pass is recommended.
+ bool earlyExitVectElement(MachineFunction *MF);
+
+ /// Check whether an equivalent DUP instruction has already been
+ /// created or not.
+ /// Return true when the dup instruction already exists. In this case,
+ /// DestReg will point to the destination of the already created DUP.
+ bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
+ unsigned LaneNumber, unsigned *DestReg) const;
+
+ /// Certain SIMD instructions with vector element operand are not efficient.
+ /// Rewrite them into SIMD instructions with vector operands. This rewrite
+ /// is driven by the latency of the instructions.
+ /// Return true if the SIMD instruction is modified.
+ bool optimizeVectElement(MachineInstr &MI,
+ std::map<unsigned, bool> *VecInstElemTable) const;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override {
+ return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
+ }
+ };
+ // Definition of the static pass identifier declared in the struct.
+ char AArch64VectorByElementOpt::ID = 0;
+} // namespace
+
+ // Register the pass under the argument string "aarch64-vectorbyelement-opt".
+ // NOTE(review): the two trailing 'false' arguments are presumably the
+ // CFG-only and is-analysis flags of INITIALIZE_PASS -- confirm.
+ INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
+ AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
+
+ /// Decide, from instruction latencies alone, whether replacing \p InstDesc by
+ /// the pair \p InstDescRep1 + \p InstDescRep2 is profitable.
+ ///
+ /// The verdict is cached in \p VecInstElemTable under the opcode of
+ /// \p InstDesc, so this function assumes that a given instruction is always
+ /// replaced by the same two instructions.
+ /// \returns true if the replacement is recommended.
+ bool AArch64VectorByElementOpt::shouldReplaceInstruction(
+     MachineFunction *MF, const MCInstrDesc *InstDesc,
+     const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
+     std::map<unsigned, bool> &VecInstElemTable) const {
+   const unsigned Opc = InstDesc->getOpcode();
+
+   // Reuse a previously computed decision for this opcode if there is one.
+   auto Cached = VecInstElemTable.find(Opc);
+   if (Cached != VecInstElemTable.end())
+     return Cached->second;
+
+   const auto *MCModel = SchedModel.getMCSchedModel();
+   const MCSchedClassDesc *SchedClasses[] = {
+       MCModel->getSchedClassDesc(InstDesc->getSchedClass()),
+       MCModel->getSchedClassDesc(InstDescRep1->getSchedClass()),
+       MCModel->getSchedClassDesc(InstDescRep2->getSchedClass())};
+
+   // If the subtarget does not define resources for any of the instructions
+   // of interest (invalid or variant scheduling class), do not replace.
+   bool Replace = true;
+   for (const MCSchedClassDesc *SC : SchedClasses)
+     if (!SC->isValid() || SC->isVariant())
+       Replace = false;
+
+   // Replace only when the original is strictly slower than both
+   // replacement instructions together.
+   if (Replace)
+     Replace = SchedModel.computeInstrLatency(Opc) >
+               SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
+                   SchedModel.computeInstrLatency(InstDescRep2->getOpcode());
+
+   VecInstElemTable[Opc] = Replace;
+   return Replace;
+ }
+
+ /// Determine if we need to exit the vector by element instruction
+ /// optimization pass early. This makes sure that targets with no need
+ /// for this optimization do not spend any compile time on this pass.
+ /// This check is done by comparing the latency of an indexed FMLA
+ /// instruction to the latency of the DUP + the latency of a vector
+ /// FMLA instruction. We do not check on other related instructions such
+ /// as FMLS as we assume that if the situation shows up for one
+ /// instruction, then it is likely to show up for the related ones.
+ /// Return true if early exit of the pass is recommended.
+ bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
+   // The table is local on purpose: the probe must not seed the cache that
+   // the real rewrite uses.
+   std::map<unsigned, bool> VecInstElemTable;
+   const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
+   const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
+   // Probe with the vector FMLA, i.e. the same replacement that
+   // optimizeVectElement emits for FMLAv4i32_indexed, not with FMUL.
+   const MCInstrDesc *MulMCID = &TII->get(AArch64::FMLAv4f32);
+
+   return !shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
+                                    VecInstElemTable);
+ }
+
+ /// Look backwards from \p MI to the start of its basic block for a
+ /// three-operand instruction with opcode \p DupOpcode that duplicates lane
+ /// \p LaneNumber of \p SrcReg.
+ /// \returns true and stores that instruction's destination register in
+ /// \p DestReg when one is found; returns false, leaving \p DestReg untouched,
+ /// otherwise.
+ bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
+                                          unsigned SrcReg, unsigned LaneNumber,
+                                          unsigned *DestReg) const {
+   MachineBasicBlock::iterator Cursor = MI;
+   const MachineBasicBlock::iterator BlockBegin = MI.getParent()->begin();
+
+   while (Cursor != BlockBegin) {
+     --Cursor;
+     MachineInstr &Candidate = *Cursor;
+
+     // Must be a DUP of the requested flavour with the expected operand count.
+     if (Candidate.getOpcode() != DupOpcode || Candidate.getNumOperands() != 3)
+       continue;
+     // Must read the same source register and the same lane.
+     if (Candidate.getOperand(1).getReg() != SrcReg ||
+         Candidate.getOperand(2).getImm() != LaneNumber)
+       continue;
+
+     *DestReg = Candidate.getOperand(0).getReg();
+     return true;
+   }
+
+   return false;
+ }
+
+ /// Certain SIMD instructions with vector element operand are not efficient.
+ /// Rewrite them into SIMD instructions with vector operands. This rewrite
+ /// is driven by the latency of the instructions.
+ /// The instructions of concern are, for the time being, fmla, fmls, fmul,
+ /// and fmulx and hence they are hardcoded.
+ ///
+ /// Example:
+ /// fmla v0.4s, v1.4s, v2.s[1]
+ /// is rewritten into
+ /// dup v3.4s, v2.s[1] // dup not necessary if redundant
+ /// fmla v0.4s, v1.4s, v3.4s
+ ///
+ /// The replacement instructions are inserted before \p MI; \p MI itself is
+ /// not erased here. \p VecInstElemTable caches the per-opcode replacement
+ /// decision made by shouldReplaceInstruction.
+ /// Return true if the SIMD instruction is modified.
+ bool AArch64VectorByElementOpt::optimizeVectElement(
+ MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
+ const MCInstrDesc *MulMCID, *DupMCID;
+ // Register class of the DUP result; the 2X32 cases below switch to FPR64.
+ const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
+
+ // Map the indexed opcode to its DUP and all-vector replacement opcodes.
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+
+ // 4X32 instructions
+ case AArch64::FMLAv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMLAv4f32);
+ break;
+ case AArch64::FMLSv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMLSv4f32);
+ break;
+ case AArch64::FMULXv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMULXv4f32);
+ break;
+ case AArch64::FMULv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMULv4f32);
+ break;
+
+ // 2X64 instructions
+ case AArch64::FMLAv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMLAv2f64);
+ break;
+ case AArch64::FMLSv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMLSv2f64);
+ break;
+ case AArch64::FMULXv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMULXv2f64);
+ break;
+ case AArch64::FMULv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMULv2f64);
+ break;
+
+ // 2X32 instructions
+ case AArch64::FMLAv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMLAv2f32);
+ break;
+ case AArch64::FMLSv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMLSv2f32);
+ break;
+ case AArch64::FMULXv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMULXv2f32);
+ break;
+ case AArch64::FMULv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMULv2f32);
+ break;
+ }
+
+ // Bail out unless the latency comparison favours the DUP + vector form.
+ if (!shouldReplaceInstruction(MI.getParent()->getParent(),
+ &TII->get(MI.getOpcode()), DupMCID, MulMCID,
+ *VecInstElemTable))
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock &MBB = *MI.getParent();
+ // NOTE(review): this local reference shadows the MRI member of the pass.
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ // get the operands of the current SIMD arithmetic instruction.
+ unsigned MulDest = MI.getOperand(0).getReg();
+ unsigned SrcReg0 = MI.getOperand(1).getReg();
+ unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
+ unsigned SrcReg1 = MI.getOperand(2).getReg();
+ unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
+ unsigned DupDest;
+
+ // Instructions of interest have either 4 or 5 operands.
+ if (MI.getNumOperands() == 5) {
+ // Five operands: operand 3 is the indexed register, operand 4 the lane.
+ unsigned SrcReg2 = MI.getOperand(3).getReg();
+ unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
+ unsigned LaneNumber = MI.getOperand(4).getImm();
+
+ // Create a new DUP instruction. Note that if an equivalent DUP instruction
+ // has already been created before, then use that one instead of creating
+ // a new one.
+ if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
+ DupDest = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+ .addReg(SrcReg2, Src2IsKill)
+ .addImm(LaneNumber);
+ }
+ // NOTE(review): the kill state of the indexed source is carried over to
+ // DupDest. When reuseDUP returned a DUP that already existed, confirm that
+ // DupDest has no later users before relying on that flag.
+ BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+ .addReg(SrcReg0, Src0IsKill)
+ .addReg(SrcReg1, Src1IsKill)
+ .addReg(DupDest, Src2IsKill);
+ } else if (MI.getNumOperands() == 4) {
+ // Four operands: operand 2 is the indexed register, operand 3 the lane.
+ unsigned LaneNumber = MI.getOperand(3).getImm();
+ if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
+ DupDest = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+ .addReg(SrcReg1, Src1IsKill)
+ .addImm(LaneNumber);
+ }
+ BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+ .addReg(SrcReg0, Src0IsKill)
+ .addReg(DupDest, Src1IsKill);
+ } else {
+ return false;
+ }
+
+ ++NumModifiedInstr;
+ return true;
+ }
+
+ /// Run the vector by element rewrite over every instruction of \p MF.
+ /// \returns true if at least one instruction was replaced.
+ bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
+   if (skipFunction(*MF.getFunction()))
+     return false;
+
+   const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
+   TII = Subtarget.getInstrInfo();
+   MRI = &MF.getRegInfo();
+
+   const auto *AArch64II =
+       static_cast<const AArch64InstrInfo *>(Subtarget.getInstrInfo());
+   if (!AArch64II)
+     return false;
+
+   // The latency comparison needs a per-instruction scheduling model.
+   SchedModel.init(Subtarget.getSchedModel(), &Subtarget, AArch64II);
+   if (!SchedModel.hasInstrSchedModel())
+     return false;
+
+   // A simple check to exit this pass early for targets that do not need it.
+   if (earlyExitVectElement(&MF))
+     return false;
+
+   std::map<unsigned, bool> ReplacementCache;
+   SmallVector<MachineInstr *, 8> Replaced;
+
+   // Replacements are inserted in front of the instruction being visited, so
+   // the originals are only collected here and erased after the walk.
+   for (MachineBasicBlock &MBB : MF)
+     for (MachineInstr &MI : MBB)
+       if (optimizeVectElement(MI, &ReplacementCache))
+         Replaced.push_back(&MI);
+
+   for (MachineInstr *Dead : Replaced)
+     Dead->eraseFromParent();
+
+   return !Replaced.empty();
+ }
+
+ /// Factory for the vector by element optimization pass: returns a newly
+ /// allocated AArch64VectorByElementOpt instance.
+ FunctionPass *llvm::createAArch64VectorByElementOptPass() {
+   FunctionPass *Pass = new AArch64VectorByElementOpt();
+   return Pass;
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
new file mode 100644
index 000000000000..db84afacf30e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -0,0 +1,4625 @@
+//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64TargetStreamer.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdio>
+using namespace llvm;
+
+namespace {
+
+class AArch64Operand;
+
+class AArch64AsmParser : public MCTargetAsmParser { // Target assembly parser for AArch64.
+private:
+ StringRef Mnemonic; ///< Instruction mnemonic.
+
+ // Map of register aliases created via the .req directive.
+ StringMap<std::pair<bool, unsigned> > RegisterReqs;
+
+ AArch64TargetStreamer &getTargetStreamer() { // Downcasts the streamer's target streamer.
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+ return static_cast<AArch64TargetStreamer &>(TS);
+ }
+
+ SMLoc getLoc() const { return getParser().getTok().getLoc(); } // Location of the current token.
+
+ bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+ AArch64CC::CondCode parseCondCodeString(StringRef Cond);
+ bool parseCondCode(OperandVector &Operands, bool invertCondCode);
+ unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
+ int tryParseRegister();
+ int tryMatchVectorRegister(StringRef &Kind, bool expected);
+ bool parseRegister(OperandVector &Operands);
+ bool parseSymbolicImmVal(const MCExpr *&ImmVal);
+ bool parseVectorList(OperandVector &Operands);
+ bool parseOperand(OperandVector &Operands, bool isCondCode,
+ bool invertCondCode);
+
+ bool showMatchError(SMLoc Loc, unsigned ErrCode);
+
+ bool parseDirectiveArch(SMLoc L);
+ bool parseDirectiveCPU(SMLoc L);
+ bool parseDirectiveWord(unsigned Size, SMLoc L);
+ bool parseDirectiveInst(SMLoc L);
+
+ bool parseDirectiveTLSDescCall(SMLoc L);
+
+ bool parseDirectiveLOH(StringRef LOH, SMLoc L);
+ bool parseDirectiveLtorg(SMLoc L);
+
+ bool parseDirectiveReq(StringRef Name, SMLoc L);
+ bool parseDirectiveUnreq(SMLoc L);
+
+ bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+/// @name Auto-generated Match Functions
+/// {
+
+#define GET_ASSEMBLER_HEADER
+#include "AArch64GenAsmMatcher.inc"
+
+ /// }
+
+ OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
+ OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
+ OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
+ OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
+ OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
+ OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
+ OperandMatchResultTy tryParsePSBHint(OperandVector &Operands);
+ OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
+ OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
+ OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
+ OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
+ OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
+ bool tryParseVectorRegister(OperandVector &Operands);
+ OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands);
+
+public:
+ enum AArch64MatchResultTy { // Target-specific match results, numbered from FIRST_TARGET_MATCH_RESULT_TY.
+ Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "AArch64GenAsmMatcher.inc"
+ };
+ bool IsILP32; // Set by the constructor: true when the ABI name option is "ilp32".
+ AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, STI) {
+ IsILP32 = Options.getABIName() == "ilp32";
+ MCAsmParserExtension::Initialize(Parser);
+ MCStreamer &S = getParser().getStreamer();
+ if (S.getTargetStreamer() == nullptr) // Make sure a target streamer exists.
+ new AArch64TargetStreamer(S); // NOTE(review): presumably registers itself with (and is owned by) S - confirm.
+
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ }
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool ParseDirective(AsmToken DirectiveID) override;
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
+
+ static bool classifySymbolRef(const MCExpr *Expr,
+ AArch64MCExpr::VariantKind &ELFRefKind,
+ MCSymbolRefExpr::VariantKind &DarwinRefKind,
+ int64_t &Addend);
+};
+} // end anonymous namespace
+
+namespace {
+
+/// AArch64Operand - Instances of this class represent a parsed AArch64 machine
+/// instruction operand.
+class AArch64Operand : public MCParsedAsmOperand {
+private:
+ enum KindTy { // Operand kind; selects which member of the payload union is in use.
+ k_Immediate,
+ k_ShiftedImm,
+ k_CondCode,
+ k_Register,
+ k_VectorList,
+ k_VectorIndex,
+ k_Token,
+ k_SysReg,
+ k_SysCR,
+ k_Prefetch,
+ k_ShiftExtend,
+ k_FPImm,
+ k_Barrier,
+ k_PSBHint,
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+
+ struct TokOp { // k_Token payload: token text as pointer + length.
+ const char *Data;
+ unsigned Length;
+ bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
+ };
+
+ struct RegOp { // k_Register payload.
+ unsigned RegNum;
+ bool isVector;
+ };
+
+ struct VectorListOp { // k_VectorList payload.
+ unsigned RegNum;
+ unsigned Count;
+ unsigned NumElements;
+ unsigned ElementKind;
+ };
+
+ struct VectorIndexOp { // k_VectorIndex payload.
+ unsigned Val;
+ };
+
+ struct ImmOp { // k_Immediate payload.
+ const MCExpr *Val;
+ };
+
+ struct ShiftedImmOp { // k_ShiftedImm payload: expression plus shift amount.
+ const MCExpr *Val;
+ unsigned ShiftAmount;
+ };
+
+ struct CondCodeOp { // k_CondCode payload.
+ AArch64CC::CondCode Code;
+ };
+
+ struct FPImmOp { // k_FPImm payload.
+ unsigned Val; // Encoded 8-bit representation.
+ };
+
+ struct BarrierOp { // k_Barrier payload: name (pointer + length) and value.
+ const char *Data;
+ unsigned Length;
+ unsigned Val; // Not the enum since not all values have names.
+ };
+
+ struct SysRegOp { // k_SysReg payload; the predicates treat -1U in a field as "not valid for that use".
+ const char *Data;
+ unsigned Length;
+ uint32_t MRSReg;
+ uint32_t MSRReg;
+ uint32_t PStateField;
+ };
+
+ struct SysCRImmOp { // k_SysCR payload.
+ unsigned Val;
+ };
+
+ struct PrefetchOp { // k_Prefetch payload: name (pointer + length) and value.
+ const char *Data;
+ unsigned Length;
+ unsigned Val;
+ };
+
+ struct PSBHintOp { // k_PSBHint payload: name (pointer + length) and value.
+ const char *Data;
+ unsigned Length;
+ unsigned Val;
+ };
+
+ struct ShiftExtendOp { // k_ShiftExtend payload.
+ AArch64_AM::ShiftExtendType Type;
+ unsigned Amount;
+ bool HasExplicitAmount;
+ };
+
+ struct ExtendOp { // NOTE(review): not a member of the union below; looks unused - confirm.
+ unsigned Val;
+ };
+
+ union { // Operand payload; the accessors assert on Kind before reading a member.
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct VectorListOp VectorList;
+ struct VectorIndexOp VectorIndex;
+ struct ImmOp Imm;
+ struct ShiftedImmOp ShiftedImm;
+ struct CondCodeOp CondCode;
+ struct FPImmOp FPImm;
+ struct BarrierOp Barrier;
+ struct SysRegOp SysReg;
+ struct SysCRImmOp SysCRImm;
+ struct PrefetchOp Prefetch;
+ struct PSBHintOp PSBHint;
+ struct ShiftExtendOp ShiftExtend;
+ };
+
+ // Keep the MCContext around as the MCExprs may need to be manipulated during
+ // the add<>Operands() calls.
+ MCContext &Ctx;
+
+public:
+ AArch64Operand(KindTy K, MCContext &Ctx) : Kind(K), Ctx(Ctx) {} // Sets only the kind and context; the payload is not initialized here.
+
+ /// Copy constructor: duplicates the kind, the source range and the union
+ /// member that corresponds to that kind.
+ AArch64Operand(const AArch64Operand &o)
+     : MCParsedAsmOperand(), Kind(o.Kind), StartLoc(o.StartLoc),
+       EndLoc(o.EndLoc), Ctx(o.Ctx) {
+   switch (Kind) {
+   case k_Immediate:
+     Imm = o.Imm;
+     break;
+   case k_ShiftedImm:
+     ShiftedImm = o.ShiftedImm;
+     break;
+   case k_CondCode:
+     CondCode = o.CondCode;
+     break;
+   case k_Register:
+     Reg = o.Reg;
+     break;
+   case k_VectorList:
+     VectorList = o.VectorList;
+     break;
+   case k_VectorIndex:
+     VectorIndex = o.VectorIndex;
+     break;
+   case k_Token:
+     Tok = o.Tok;
+     break;
+   case k_SysReg:
+     SysReg = o.SysReg;
+     break;
+   case k_SysCR:
+     SysCRImm = o.SysCRImm;
+     break;
+   case k_Prefetch:
+     Prefetch = o.Prefetch;
+     break;
+   case k_ShiftExtend:
+     ShiftExtend = o.ShiftExtend;
+     break;
+   case k_FPImm:
+     FPImm = o.FPImm;
+     break;
+   case k_Barrier:
+     Barrier = o.Barrier;
+     break;
+   case k_PSBHint:
+     PSBHint = o.PSBHint;
+     break;
+   }
+ }
+
+ /// getStartLoc - Get the source location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+ /// getEndLoc - Get the source location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ StringRef getToken() const { // Token text; asserts the operand is a k_Token.
+ assert(Kind == k_Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ bool isTokenSuffix() const { // Whether the token is a mnemonic suffix; asserts k_Token.
+ assert(Kind == k_Token && "Invalid access!");
+ return Tok.IsSuffix;
+ }
+
+ const MCExpr *getImm() const { // Immediate expression; asserts k_Immediate.
+ assert(Kind == k_Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getShiftedImmVal() const { // Expression of a shifted immediate; asserts k_ShiftedImm.
+ assert(Kind == k_ShiftedImm && "Invalid access!");
+ return ShiftedImm.Val;
+ }
+
+ unsigned getShiftedImmShift() const { // Shift amount of a shifted immediate; asserts k_ShiftedImm.
+ assert(Kind == k_ShiftedImm && "Invalid access!");
+ return ShiftedImm.ShiftAmount;
+ }
+
+ AArch64CC::CondCode getCondCode() const { // Condition code; asserts k_CondCode.
+ assert(Kind == k_CondCode && "Invalid access!");
+ return CondCode.Code;
+ }
+
+ unsigned getFPImm() const { // Encoded FP immediate value; asserts k_FPImm.
+ assert(Kind == k_FPImm && "Invalid access!");
+ return FPImm.Val;
+ }
+
+ unsigned getBarrier() const { // Barrier value; asserts k_Barrier.
+ assert(Kind == k_Barrier && "Invalid access!");
+ return Barrier.Val;
+ }
+
+ StringRef getBarrierName() const { // Barrier name text; asserts k_Barrier.
+ assert(Kind == k_Barrier && "Invalid access!");
+ return StringRef(Barrier.Data, Barrier.Length);
+ }
+
+ unsigned getReg() const override { // Register number; asserts k_Register.
+ assert(Kind == k_Register && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ unsigned getVectorListStart() const { // RegNum field of the vector list; asserts k_VectorList.
+ assert(Kind == k_VectorList && "Invalid access!");
+ return VectorList.RegNum;
+ }
+
+ unsigned getVectorListCount() const { // Count field of the vector list; asserts k_VectorList.
+ assert(Kind == k_VectorList && "Invalid access!");
+ return VectorList.Count;
+ }
+
+ unsigned getVectorIndex() const { // Vector index value; asserts k_VectorIndex.
+ assert(Kind == k_VectorIndex && "Invalid access!");
+ return VectorIndex.Val;
+ }
+
+ StringRef getSysReg() const { // System register text; asserts k_SysReg.
+ assert(Kind == k_SysReg && "Invalid access!");
+ return StringRef(SysReg.Data, SysReg.Length);
+ }
+
+ unsigned getSysCR() const { // SysCR immediate value; asserts k_SysCR.
+ assert(Kind == k_SysCR && "Invalid access!");
+ return SysCRImm.Val;
+ }
+
+ unsigned getPrefetch() const { // Prefetch value; asserts k_Prefetch.
+ assert(Kind == k_Prefetch && "Invalid access!");
+ return Prefetch.Val;
+ }
+
+ unsigned getPSBHint() const { // PSB hint value; asserts k_PSBHint.
+ assert(Kind == k_PSBHint && "Invalid access!");
+ return PSBHint.Val;
+ }
+
+ StringRef getPSBHintName() const { // PSB hint name text; asserts k_PSBHint.
+ assert(Kind == k_PSBHint && "Invalid access!");
+ return StringRef(PSBHint.Data, PSBHint.Length);
+ }
+
+ StringRef getPrefetchName() const { // Prefetch name text; asserts k_Prefetch.
+ assert(Kind == k_Prefetch && "Invalid access!");
+ return StringRef(Prefetch.Data, Prefetch.Length);
+ }
+
+ AArch64_AM::ShiftExtendType getShiftExtendType() const { // Shift/extend kind; asserts k_ShiftExtend.
+ assert(Kind == k_ShiftExtend && "Invalid access!");
+ return ShiftExtend.Type;
+ }
+
+ unsigned getShiftExtendAmount() const { // Shift/extend amount; asserts k_ShiftExtend.
+ assert(Kind == k_ShiftExtend && "Invalid access!");
+ return ShiftExtend.Amount;
+ }
+
+ bool hasShiftExtendAmount() const { // Returns the HasExplicitAmount flag; asserts k_ShiftExtend.
+ assert(Kind == k_ShiftExtend && "Invalid access!");
+ return ShiftExtend.HasExplicitAmount;
+ }
+
+ bool isImm() const override { return Kind == k_Immediate; } // Plain (unshifted) immediate operand.
+ bool isMem() const override { return false; } // This operand class never reports a memory operand.
+ /// True if this is a constant immediate in [-256, 255], i.e. one that fits
+ /// a signed 9-bit field.
+ bool isSImm9() const {
+   if (!isImm())
+     return false;
+   if (const auto *CE = dyn_cast<MCConstantExpr>(getImm())) {
+     const int64_t V = CE->getValue();
+     return -256 <= V && V <= 255;
+   }
+   return false;
+ }
+ /// True if this is a constant immediate that is a multiple of 4 in
+ /// [-256, 252].
+ bool isSImm7s4() const {
+   if (!isImm())
+     return false;
+   if (const auto *CE = dyn_cast<MCConstantExpr>(getImm())) {
+     const int64_t V = CE->getValue();
+     return (V & 3) == 0 && -256 <= V && V <= 252;
+   }
+   return false;
+ }
+ /// True if this is a constant immediate that is a multiple of 8 in
+ /// [-512, 504].
+ bool isSImm7s8() const {
+   if (!isImm())
+     return false;
+   if (const auto *CE = dyn_cast<MCConstantExpr>(getImm())) {
+     const int64_t V = CE->getValue();
+     return (V & 7) == 0 && -512 <= V && V <= 504;
+   }
+   return false;
+ }
+ /// True if this is a constant immediate that is a multiple of 16 in
+ /// [-1024, 1008].
+ bool isSImm7s16() const {
+   if (!isImm())
+     return false;
+   if (const auto *CE = dyn_cast<MCConstantExpr>(getImm())) {
+     const int64_t V = CE->getValue();
+     return (V & 15) == 0 && -1024 <= V && V <= 1008;
+   }
+   return false;
+ }
+
+ /// Decide whether the symbolic expression \p Expr is acceptable as an
+ /// unsigned 12-bit offset scaled by \p Scale.
+ bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const {
+   AArch64MCExpr::VariantKind ELFKind;
+   MCSymbolRefExpr::VariantKind DarwinKind;
+   int64_t Offset;
+   // If we don't understand the expression, assume the best and let the
+   // fixup and relocation code deal with it.
+   if (!AArch64AsmParser::classifySymbolRef(Expr, ELFKind, DarwinKind, Offset))
+     return true;
+
+   const bool IsLow12Like =
+       DarwinKind == MCSymbolRefExpr::VK_PAGEOFF ||
+       ELFKind == AArch64MCExpr::VK_LO12 ||
+       ELFKind == AArch64MCExpr::VK_GOT_LO12 ||
+       ELFKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+       ELFKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+       ELFKind == AArch64MCExpr::VK_TPREL_LO12 ||
+       ELFKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+       ELFKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
+       ELFKind == AArch64MCExpr::VK_TLSDESC_LO12;
+   // Note that we don't range-check the addend. It's adjusted modulo page
+   // size when converted, so there is no "out of range" condition when using
+   // @pageoff.
+   if (IsLow12Like)
+     return Offset >= 0 && (Offset % Scale) == 0;
+
+   // @gotpageoff/@tlvppageoff can only be used directly, not with an addend.
+   if (DarwinKind == MCSymbolRefExpr::VK_GOTPAGEOFF ||
+       DarwinKind == MCSymbolRefExpr::VK_TLVPPAGEOFF)
+     return Offset == 0;
+
+   return false;
+ }
+
+ /// True if this immediate is usable as an unsigned 12-bit offset scaled by
+ /// Scale. Constants must be non-negative multiples of Scale whose quotient
+ /// is below 0x1000; other expressions go to isSymbolicUImm12Offset().
+ template <int Scale> bool isUImm12Offset() const {
+   if (!isImm())
+     return false;
+
+   const MCExpr *Expr = getImm();
+   const auto *CE = dyn_cast<MCConstantExpr>(Expr);
+   if (!CE)
+     return isSymbolicUImm12Offset(Expr, Scale);
+
+   const int64_t V = CE->getValue();
+   if ((V % Scale) != 0 || V < 0)
+     return false;
+   return (V / Scale) < 0x1000;
+ }
+
+ /// Shared implementation of the isImm<Lo>_<Hi>() predicates below: true if
+ /// this operand is a plain immediate whose expression is a constant with
+ /// Lo <= value <= Hi. Non-constant expressions are rejected.
+ bool isConstantImmInRange(int64_t Lo, int64_t Hi) const {
+   if (!isImm())
+     return false;
+   const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+   if (!MCE)
+     return false;
+   const int64_t Val = MCE->getValue();
+   return Val >= Lo && Val <= Hi;
+ }
+
+ // Constant-immediate range predicates. Each isImm<Lo>_<Hi>() accepts
+ // exactly the constants in the inclusive range [Lo, Hi].
+ bool isImm0_1() const { return isConstantImmInRange(0, 1); }
+ bool isImm0_7() const { return isConstantImmInRange(0, 7); }
+ bool isImm1_8() const { return isConstantImmInRange(1, 8); }
+ bool isImm0_15() const { return isConstantImmInRange(0, 15); }
+ bool isImm1_16() const { return isConstantImmInRange(1, 16); }
+ bool isImm0_31() const { return isConstantImmInRange(0, 31); }
+ bool isImm1_31() const { return isConstantImmInRange(1, 31); }
+ bool isImm1_32() const { return isConstantImmInRange(1, 32); }
+ bool isImm0_63() const { return isConstantImmInRange(0, 63); }
+ bool isImm1_63() const { return isConstantImmInRange(1, 63); }
+ bool isImm1_64() const { return isConstantImmInRange(1, 64); }
+ bool isImm0_127() const { return isConstantImmInRange(0, 127); }
+ bool isImm0_255() const { return isConstantImmInRange(0, 255); }
+ bool isImm0_65535() const { return isConstantImmInRange(0, 65535); }
+ bool isImm32_63() const { return isConstantImmInRange(32, 63); }
+ /// True if this is a constant that is a valid 32-bit logical immediate.
+ /// The upper 32 bits of the 64-bit value must be all zeros or all ones.
+ bool isLogicalImm32() const {
+   if (!isImm())
+     return false;
+   const auto *CE = dyn_cast<MCConstantExpr>(getImm());
+   if (!CE)
+     return false;
+   int64_t Bits = CE->getValue();
+   const int64_t Upper = Bits >> 32;
+   if (Upper != 0 && Upper != ~0LL)
+     return false;
+   Bits &= 0xFFFFFFFF;
+   return AArch64_AM::isLogicalImmediate(Bits, 32);
+ }
+ /// True if this is a constant that is a valid 64-bit logical immediate.
+ bool isLogicalImm64() const {
+   if (!isImm())
+     return false;
+   if (const auto *CE = dyn_cast<MCConstantExpr>(getImm()))
+     return AArch64_AM::isLogicalImmediate(CE->getValue(), 64);
+   return false;
+ }
+ /// True if the bitwise complement of this constant, truncated to 32 bits,
+ /// is a valid 32-bit logical immediate.
+ bool isLogicalImm32Not() const {
+   if (!isImm())
+     return false;
+   const auto *CE = dyn_cast<MCConstantExpr>(getImm());
+   if (!CE)
+     return false;
+   const int64_t Inverted = ~CE->getValue() & 0xFFFFFFFF;
+   return AArch64_AM::isLogicalImmediate(Inverted, 32);
+ }
+ /// True if the bitwise complement of this constant is a valid 64-bit
+ /// logical immediate.
+ bool isLogicalImm64Not() const {
+   if (!isImm())
+     return false;
+   if (const auto *CE = dyn_cast<MCConstantExpr>(getImm()))
+     return AArch64_AM::isLogicalImmediate(~CE->getValue(), 64);
+   return false;
+ }
+ bool isShiftedImm() const { return Kind == k_ShiftedImm; } // Immediate carrying an explicit shift amount.
+ /// True if this operand can serve as an ADD/SUB immediate: a plain or
+ /// shifted immediate (shift of 0 or 12 only) that is either one of the
+ /// accepted symbol modifiers, a constant in [0, 0xfff], or some other
+ /// expression left for the fixup/relocation code to validate.
+ bool isAddSubImm() const {
+   const bool Shifted = isShiftedImm();
+   if (!Shifted && !isImm())
+     return false;
+
+   // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
+   if (Shifted && ShiftedImm.ShiftAmount != 0 && ShiftedImm.ShiftAmount != 12)
+     return false;
+   const MCExpr *Expr = Shifted ? ShiftedImm.Val : getImm();
+
+   AArch64MCExpr::VariantKind ELFKind;
+   MCSymbolRefExpr::VariantKind DarwinKind;
+   int64_t Offset;
+   if (AArch64AsmParser::classifySymbolRef(Expr, ELFKind, DarwinKind,
+                                           Offset)) {
+     const bool DarwinOK =
+         DarwinKind == MCSymbolRefExpr::VK_PAGEOFF ||
+         DarwinKind == MCSymbolRefExpr::VK_TLVPPAGEOFF ||
+         (DarwinKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Offset == 0);
+     const bool ELFOK = ELFKind == AArch64MCExpr::VK_LO12 ||
+                        ELFKind == AArch64MCExpr::VK_DTPREL_HI12 ||
+                        ELFKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+                        ELFKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+                        ELFKind == AArch64MCExpr::VK_TPREL_HI12 ||
+                        ELFKind == AArch64MCExpr::VK_TPREL_LO12 ||
+                        ELFKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+                        ELFKind == AArch64MCExpr::VK_TLSDESC_LO12;
+     return DarwinOK || ELFOK;
+   }
+
+   // If it's a constant, it should be a real immediate in range:
+   if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
+     const int64_t V = CE->getValue();
+     return V >= 0 && V <= 0xfff;
+   }
+
+   // If it's an expression, we hope for the best and let the fixup/relocation
+   // code deal with it.
+   return true;
+ }
+ /// True if this operand is a plain or shifted immediate (shift of 0 or 12
+ /// only) whose expression is a constant in [-0xfff, -1].
+ bool isAddSubImmNeg() const {
+   if (!isShiftedImm() && !isImm())
+     return false;
+
+   const MCExpr *Expr;
+
+   // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
+   if (isShiftedImm()) {
+     unsigned Shift = ShiftedImm.ShiftAmount;
+     Expr = ShiftedImm.Val;
+     if (Shift != 0 && Shift != 12)
+       return false;
+   } else
+     Expr = getImm();
+
+   // Otherwise it should be a real negative immediate in range. Compare
+   // against -0xfff directly rather than negating the value: negating
+   // INT64_MIN is signed overflow (undefined behaviour), and with wrapping
+   // arithmetic it would compare as "in range".
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+   if (CE == nullptr)
+     return false;
+   const int64_t Val = CE->getValue();
+   return Val < 0 && Val >= -0xfff;
+ }
+ bool isCondCode() const { return Kind == k_CondCode; } // Condition-code operand.
+ /// True if this is a constant accepted by
+ /// AArch64_AM::isAdvSIMDModImmType10().
+ bool isSIMDImmType10() const {
+   if (!isImm())
+     return false;
+   if (const auto *CE = dyn_cast<MCConstantExpr>(getImm()))
+     return AArch64_AM::isAdvSIMDModImmType10(CE->getValue());
+   return false;
+ }
+ /// True if this immediate can be a 26-bit branch target. Non-constant
+ /// expressions are accepted; constants must be multiples of 4 within
+ /// [-(0x2000000 << 2), 0x1ffffff << 2].
+ bool isBranchTarget26() const {
+   if (!isImm())
+     return false;
+   const auto *CE = dyn_cast<MCConstantExpr>(getImm());
+   if (!CE)
+     return true;
+   const int64_t V = CE->getValue();
+   const bool WordAligned = (V & 0x3) == 0;
+   return WordAligned && V >= -(0x2000000 << 2) && V <= (0x1ffffff << 2);
+ }
+ /// True if this immediate can be a 19-bit PC-relative label. Non-constant
+ /// expressions are accepted; constants must be multiples of 4 within
+ /// [-(0x40000 << 2), 0x3ffff << 2].
+ bool isPCRelLabel19() const {
+   if (!isImm())
+     return false;
+   const auto *CE = dyn_cast<MCConstantExpr>(getImm());
+   if (!CE)
+     return true;
+   const int64_t V = CE->getValue();
+   const bool WordAligned = (V & 0x3) == 0;
+   return WordAligned && V >= -(0x40000 << 2) && V <= (0x3ffff << 2);
+ }
+ /// True if this immediate can be a 14-bit branch target. Non-constant
+ /// expressions are accepted; constants must be multiples of 4 within
+ /// [-(0x2000 << 2), 0x1fff << 2].
+ bool isBranchTarget14() const {
+   if (!isImm())
+     return false;
+   const auto *CE = dyn_cast<MCConstantExpr>(getImm());
+   if (!CE)
+     return true;
+   const int64_t V = CE->getValue();
+   const bool WordAligned = (V & 0x3) == 0;
+   return WordAligned && V >= -(0x2000 << 2) && V <= (0x1fff << 2);
+ }
+
+ /// True if this immediate is a symbol reference with no Darwin modifier,
+ /// an ELF modifier listed in \p AllowedModifiers, and a zero addend.
+ bool
+ isMovWSymbol(ArrayRef<AArch64MCExpr::VariantKind> AllowedModifiers) const {
+   if (!isImm())
+     return false;
+
+   AArch64MCExpr::VariantKind ELFKind;
+   MCSymbolRefExpr::VariantKind DarwinKind;
+   int64_t Offset;
+   const bool Classified = AArch64AsmParser::classifySymbolRef(
+       getImm(), ELFKind, DarwinKind, Offset);
+   if (!Classified || DarwinKind != MCSymbolRefExpr::VK_None)
+     return false;
+
+   for (AArch64MCExpr::VariantKind Allowed : AllowedModifiers)
+     if (ELFKind == Allowed)
+       return Offset == 0;
+
+   return false;
+ }
+
+ bool isMovZSymbolG3() const { // Accepts only the VK_ABS_G3 modifier.
+ return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
+ }
+
+ bool isMovZSymbolG2() const { // Accepts the G2 modifiers listed below.
+ return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
+ AArch64MCExpr::VK_TPREL_G2,
+ AArch64MCExpr::VK_DTPREL_G2});
+ }
+
+ bool isMovZSymbolG1() const { // Accepts the G1 modifiers listed below.
+ return isMovWSymbol({
+ AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S,
+ AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1,
+ AArch64MCExpr::VK_DTPREL_G1,
+ });
+ }
+
+ bool isMovZSymbolG0() const { // Accepts the G0 modifiers listed below.
+ return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
+ AArch64MCExpr::VK_TPREL_G0,
+ AArch64MCExpr::VK_DTPREL_G0});
+ }
+
+ bool isMovKSymbolG3() const { // Accepts only the VK_ABS_G3 modifier.
+ return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
+ }
+
+ bool isMovKSymbolG2() const { // Accepts only the VK_ABS_G2_NC modifier.
+ return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC);
+ }
+
+ bool isMovKSymbolG1() const { // Accepts the G1 "_NC" modifiers listed below.
+ return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC,
+ AArch64MCExpr::VK_TPREL_G1_NC,
+ AArch64MCExpr::VK_DTPREL_G1_NC});
+ }
+
+ bool isMovKSymbolG0() const { // Accepts the G0 "_NC" modifiers listed below.
+ return isMovWSymbol(
+ {AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC,
+ AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC});
+ }
+
+ /// True if this is a constant for which AArch64_AM::isMOVZMovAlias()
+ /// holds with the given Shift and RegWidth.
+ template<int RegWidth, int Shift>
+ bool isMOVZMovAlias() const {
+   if (!isImm())
+     return false;
+
+   if (const auto *CE = dyn_cast<MCConstantExpr>(getImm())) {
+     const uint64_t Bits = CE->getValue();
+     return AArch64_AM::isMOVZMovAlias(Bits, Shift, RegWidth);
+   }
+   return false;
+ }
+
+ /// True if this is a constant for which AArch64_AM::isMOVNMovAlias()
+ /// holds with the given Shift and RegWidth.
+ template<int RegWidth, int Shift>
+ bool isMOVNMovAlias() const {
+   if (!isImm())
+     return false;
+
+   if (const auto *CE = dyn_cast<MCConstantExpr>(getImm())) {
+     const uint64_t Bits = CE->getValue();
+     return AArch64_AM::isMOVNMovAlias(Bits, Shift, RegWidth);
+   }
+   return false;
+ }
+
+ bool isFPImm() const { return Kind == k_FPImm; } // Floating-point immediate operand.
+ bool isBarrier() const { return Kind == k_Barrier; } // Barrier operand.
+ bool isSysReg() const { return Kind == k_SysReg; } // System register operand.
+ /// True for a system register operand whose MRSReg field is set (not -1U).
+ bool isMRSSystemRegister() const {
+   return isSysReg() && SysReg.MRSReg != -1U;
+ }
+ /// True for a system register operand whose MSRReg field is set (not -1U).
+ bool isMSRSystemRegister() const {
+   return isSysReg() && SysReg.MSRReg != -1U;
+ }
+ /// True for a system register operand naming the PAN or UAO PState field.
+ bool isSystemPStateFieldWithImm0_1() const {
+   if (!isSysReg())
+     return false;
+   const uint32_t Field = SysReg.PStateField;
+   return Field == AArch64PState::PAN || Field == AArch64PState::UAO;
+ }
+ /// True for a system register operand with a PStateField set (not -1U)
+ /// that is not accepted by isSystemPStateFieldWithImm0_1().
+ bool isSystemPStateFieldWithImm0_15() const {
+   return isSysReg() && !isSystemPStateFieldWithImm0_1() &&
+          SysReg.PStateField != -1U;
+ }
+ bool isReg() const override { return Kind == k_Register && !Reg.isVector; } // Register operand that is not a vector register.
+ bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } // Register operand flagged as a vector register.
+ /// True for a vector register operand contained in the FPR128_lo class.
+ bool isVectorRegLo() const {
+   if (Kind != k_Register || !Reg.isVector)
+     return false;
+   return AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+       Reg.RegNum);
+ }
+ /// True for a non-vector register operand contained in the GPR64 class.
+ bool isGPR32as64() const {
+   if (Kind != k_Register || Reg.isVector)
+     return false;
+   return AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(
+       Reg.RegNum);
+ }
+ /// True for a non-vector register operand contained in the WSeqPairsClass
+ /// register class.
+ bool isWSeqPair() const {
+   if (Kind != k_Register || Reg.isVector)
+     return false;
+   return AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains(
+       Reg.RegNum);
+ }
+ /// True for a non-vector register operand contained in the XSeqPairsClass
+ /// register class.
+ bool isXSeqPair() const {
+   if (Kind != k_Register || Reg.isVector)
+     return false;
+   return AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains(
+       Reg.RegNum);
+ }
+
+ /// True for a non-vector register operand contained in the GPR64sp class.
+ bool isGPR64sp0() const {
+   if (Kind != k_Register || Reg.isVector)
+     return false;
+   return AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(
+       Reg.RegNum);
+ }
+
+ /// Is this a vector list of NumRegs registers with no element kind recorded
+ /// on the list (the type is presumably attached to the instruction itself)?
+ template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const {
+   if (Kind != k_VectorList)
+     return false;
+   return VectorList.Count == NumRegs && VectorList.ElementKind == 0;
+ }
+
+ /// Is this a vector list whose register count, element count and element
+ /// kind all match the template arguments?
+ template <unsigned NumRegs, unsigned NumElements, char ElementKind>
+ bool isTypedVectorList() const {
+   return Kind == k_VectorList && VectorList.Count == NumRegs &&
+          VectorList.ElementKind == ElementKind &&
+          VectorList.NumElements == NumElements;
+ }
+
+ // Vector index predicates. isVectorIndex1() accepts exactly index 1; the
+ // B/H/S/D variants accept indices below 16, 8, 4 and 2 respectively.
+ bool isVectorIndex1() const {
+   if (Kind != k_VectorIndex)
+     return false;
+   return VectorIndex.Val == 1;
+ }
+ bool isVectorIndexB() const {
+   if (Kind != k_VectorIndex)
+     return false;
+   return VectorIndex.Val < 16;
+ }
+ bool isVectorIndexH() const {
+   if (Kind != k_VectorIndex)
+     return false;
+   return VectorIndex.Val < 8;
+ }
+ bool isVectorIndexS() const {
+   if (Kind != k_VectorIndex)
+     return false;
+   return VectorIndex.Val < 4;
+ }
+ bool isVectorIndexD() const {
+   if (Kind != k_VectorIndex)
+     return false;
+   return VectorIndex.Val < 2;
+ }
+ bool isToken() const override { return Kind == k_Token; } // Token operand.
+ bool isTokenEqual(StringRef Str) const { // Token operand whose text equals Str.
+ return Kind == k_Token && getToken() == Str;
+ }
+ bool isSysCR() const { return Kind == k_SysCR; } // SysCR operand.
+ bool isPrefetch() const { return Kind == k_Prefetch; } // Prefetch operand.
+ bool isPSBHint() const { return Kind == k_PSBHint; } // PSB hint operand.
+ bool isShiftExtend() const { return Kind == k_ShiftExtend; } // Shift or extend operand.
+ /// True for a shift/extend operand whose type is LSL, LSR, ASR, ROR or MSL.
+ bool isShifter() const {
+   if (!isShiftExtend())
+     return false;
+
+   switch (getShiftExtendType()) {
+   case AArch64_AM::LSL:
+   case AArch64_AM::LSR:
+   case AArch64_AM::ASR:
+   case AArch64_AM::ROR:
+   case AArch64_AM::MSL:
+     return true;
+   default:
+     return false;
+   }
+ }
+ /// True for a shift/extend operand that is one of the UXT*/SXT* extends
+ /// or LSL, with an amount of at most 4.
+ bool isExtend() const {
+   if (!isShiftExtend())
+     return false;
+
+   switch (getShiftExtendType()) {
+   case AArch64_AM::UXTB:
+   case AArch64_AM::SXTB:
+   case AArch64_AM::UXTH:
+   case AArch64_AM::SXTH:
+   case AArch64_AM::UXTW:
+   case AArch64_AM::SXTW:
+   case AArch64_AM::UXTX:
+   case AArch64_AM::SXTX:
+   case AArch64_AM::LSL:
+     return getShiftExtendAmount() <= 4;
+   default:
+     return false;
+   }
+ }
+
+ /// True for any extend accepted by isExtend() other than UXTX and SXTX.
+ bool isExtend64() const {
+   if (!isExtend())
+     return false;
+   // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class).
+   switch (getShiftExtendType()) {
+   case AArch64_AM::UXTX:
+   case AArch64_AM::SXTX:
+     return false;
+   default:
+     return true;
+   }
+ }
+ /// True for an extend accepted by isExtend() whose type is UXTX, SXTX or
+ /// LSL and whose amount is at most 4.
+ bool isExtendLSL64() const {
+   if (!isExtend())
+     return false;
+   const AArch64_AM::ShiftExtendType Ty = getShiftExtendType();
+   const bool Is64BitKind = Ty == AArch64_AM::UXTX ||
+                            Ty == AArch64_AM::SXTX || Ty == AArch64_AM::LSL;
+   return Is64BitKind && getShiftExtendAmount() <= 4;
+ }
+
+ /// True for an LSL or SXTX extend whose amount is either 0 or
+ /// log2(Width / 8).
+ template<int Width> bool isMemXExtend() const {
+   if (!isExtend())
+     return false;
+   const AArch64_AM::ShiftExtendType Ty = getShiftExtendType();
+   if (Ty != AArch64_AM::LSL && Ty != AArch64_AM::SXTX)
+     return false;
+   const unsigned Amount = getShiftExtendAmount();
+   return Amount == Log2_32(Width / 8) || Amount == 0;
+ }
+
+ /// True for a UXTW or SXTW extend whose amount is either 0 or
+ /// log2(Width / 8).
+ template<int Width> bool isMemWExtend() const {
+   if (!isExtend())
+     return false;
+   const AArch64_AM::ShiftExtendType Ty = getShiftExtendType();
+   if (Ty != AArch64_AM::UXTW && Ty != AArch64_AM::SXTW)
+     return false;
+   const unsigned Amount = getShiftExtendAmount();
+   return Amount == Log2_32(Width / 8) || Amount == 0;
+ }
+
+ /// An arithmetic shifter is LSL, LSR, or ASR with an amount below width.
+ template <unsigned width>
+ bool isArithmeticShifter() const {
+   if (!isShifter())
+     return false;
+
+   switch (getShiftExtendType()) {
+   case AArch64_AM::LSL:
+   case AArch64_AM::LSR:
+   case AArch64_AM::ASR:
+     return getShiftExtendAmount() < width;
+   default:
+     return false;
+   }
+ }
+
+ /// A logical shifter is LSL, LSR, ASR or ROR with an amount below width.
+ template <unsigned width>
+ bool isLogicalShifter() const {
+   if (!isShifter())
+     return false;
+
+   switch (getShiftExtendType()) {
+   case AArch64_AM::LSL:
+   case AArch64_AM::LSR:
+   case AArch64_AM::ASR:
+   case AArch64_AM::ROR:
+     return getShiftExtendAmount() < width;
+   default:
+     return false;
+   }
+ }
+
+ /// A 32-bit MOVi shifter is LSL of 0 or 16.
+ bool isMovImm32Shifter() const {
+   if (!isShifter() || getShiftExtendType() != AArch64_AM::LSL)
+     return false;
+   const uint64_t Amount = getShiftExtendAmount();
+   return Amount == 0 || Amount == 16;
+ }
+
+ /// A 64-bit MOVi shifter is LSL of 0, 16, 32, or 48.
+ bool isMovImm64Shifter() const {
+   if (!isShifter() || getShiftExtendType() != AArch64_AM::LSL)
+     return false;
+   const uint64_t Amount = getShiftExtendAmount();
+   return Amount == 0 || Amount == 16 || Amount == 32 || Amount == 48;
+ }
+
+ /// A logical vector shifter is a left shift (LSL) by 0, 8, 16, or 24.
+ bool isLogicalVecShifter() const {
+   if (!isShifter())
+     return false;
+
+   if (getShiftExtendType() != AArch64_AM::LSL)
+     return false;
+   const unsigned Amount = getShiftExtendAmount();
+   return Amount == 0 || Amount == 8 || Amount == 16 || Amount == 24;
+ }
+
+ /// A half-word logical vector shifter is a left shift (LSL) by 0 or 8.
+ bool isLogicalVecHalfWordShifter() const {
+   if (!isLogicalVecShifter())
+     return false;
+
+   if (getShiftExtendType() != AArch64_AM::LSL)
+     return false;
+   const unsigned Amount = getShiftExtendAmount();
+   return Amount == 0 || Amount == 8;
+ }
+
+ /// A move vector shifter is an MSL shift by 8 or 16.
+ bool isMoveVecShifter() const {
+   if (!isShiftExtend())
+     return false;
+
+   if (getShiftExtendType() != AArch64_AM::MSL)
+     return false;
+   const unsigned Amount = getShiftExtendAmount();
+   return Amount == 8 || Amount == 16;
+ }
+
+ // Fallback unscaled operands are for aliases of LDR/STR that fall back
+ // to LDUR/STUR when the offset is not legal for the former but is for
+ // the latter. So besides requiring a legal unscaled offset (isSImm9), the
+ // offset must NOT be a legal scaled offset for this width; that keeps the
+ // matcher free of ambiguity.
+ template<int Width>
+ bool isSImm9OffsetFB() const {
+   if (!isSImm9())
+     return false;
+   return !isUImm12Offset<Width / 8>();
+ }
+
+ /// Sanity check for an ADRP label (validation proper happens during
+ /// parsing). Non-constant immediates are accepted; constants must be
+ /// multiples of 4096 within the signed 21-bit page range.
+ bool isAdrpLabel() const {
+   if (!isImm())
+     return false;
+
+   const auto *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+   if (!CE)
+     return true;
+
+   const int64_t Lo = - (4096 * (1LL << (21 - 1)));
+   const int64_t Hi = 4096 * ((1LL << (21 - 1)) - 1);
+   const int64_t Offset = CE->getValue();
+   return (Offset % 4096) == 0 && Offset >= Lo && Offset <= Hi;
+ }
+
+ /// Sanity-checks an ADR label operand (the parser is said to have done the
+ /// real validation). Non-immediates are rejected, non-constant immediates
+ /// are accepted, and a constant must lie within [-2^20, 2^20 - 1].
+ bool isAdrLabel() const {
+   if (!isImm())
+     return false;
+
+   const auto *Constant = dyn_cast<MCConstantExpr>(Imm.Val);
+   if (!Constant)
+     return true;
+
+   const int64_t Lowest = - (1LL << (21 - 1));
+   const int64_t Highest = ((1LL << (21 - 1)) - 1);
+   const int64_t Offset = Constant->getValue();
+   return Lowest <= Offset && Offset <= Highest;
+ }
+
+ /// Appends Expr to Inst, preferring an immediate operand: a null expression
+ /// becomes the immediate 0, a constant expression becomes its value, and any
+ /// other expression is added as an expression operand.
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+   if (!Expr) {
+     Inst.addOperand(MCOperand::createImm(0));
+     return;
+   }
+   if (const auto *Constant = dyn_cast<MCConstantExpr>(Expr)) {
+     Inst.addOperand(MCOperand::createImm(Constant->getValue()));
+     return;
+   }
+   Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ /// Adds this operand's register to Inst as a single register operand.
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Reg = getReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Adds the GPR32-class register that has the same encoding value as this
+ /// operand's register, which is asserted to be in the GPR64 class.
+ void addGPR32as64Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   assert(
+       AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg()));
+
+   const MCRegisterInfo *RegInfo = Ctx.getRegisterInfo();
+   // Use the encoding of the parsed register as the index into the GPR32 class.
+   const auto Encoding = RegInfo->getEncodingValue(getReg());
+   const uint32_t Reg32 =
+       RegInfo->getRegClass(AArch64::GPR32RegClassID).getRegister(Encoding);
+
+   Inst.addOperand(MCOperand::createReg(Reg32));
+ }
+
+ /// Adds the D register at the same offset from D0 as this operand's
+ /// register is from Q0. The operand's register is asserted to be in the
+ /// FPR128 class.
+ void addVectorReg64Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   assert(
+       AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+   const unsigned DReg = AArch64::D0 + getReg() - AArch64::Q0;
+   Inst.addOperand(MCOperand::createReg(DReg));
+ }
+
+ /// Adds this operand's register unchanged; it is asserted to be in the
+ /// FPR128 class.
+ void addVectorReg128Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   assert(
+       AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+   const auto Reg = getReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Adds this operand's register unchanged as a single register operand.
+ void addVectorRegLoOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Reg = getReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Adds the D-register list operand for a vector list of NumRegs registers.
+ /// The list start is rebased against AArch64::Q0 and applied as an offset to
+ /// the first register (or register tuple) of the matching size.
+ /// NumRegs must be in [1, 4]; this is checked at compile time.
+ template <unsigned NumRegs>
+ void addVectorList64Operands(MCInst &Inst, unsigned N) const {
+   static_assert(NumRegs >= 1 && NumRegs <= 4,
+                 "vector lists hold between 1 and 4 registers");
+   assert(N == 1 && "Invalid number of operands!");
+   static const unsigned FirstRegs[] = { AArch64::D0,
+                                         AArch64::D0_D1,
+                                         AArch64::D0_D1_D2,
+                                         AArch64::D0_D1_D2_D3 };
+   unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+   Inst.addOperand(
+       MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0));
+ }
+
+ /// Adds the Q-register list operand for a vector list of NumRegs registers.
+ /// The list start is rebased against AArch64::Q0 and applied as an offset to
+ /// the first register (or register tuple) of the matching size.
+ /// NumRegs must be in [1, 4]; this is checked at compile time.
+ template <unsigned NumRegs>
+ void addVectorList128Operands(MCInst &Inst, unsigned N) const {
+   static_assert(NumRegs >= 1 && NumRegs <= 4,
+                 "vector lists hold between 1 and 4 registers");
+   assert(N == 1 && "Invalid number of operands!");
+   static const unsigned FirstRegs[] = { AArch64::Q0,
+                                         AArch64::Q0_Q1,
+                                         AArch64::Q0_Q1_Q2,
+                                         AArch64::Q0_Q1_Q2_Q3 };
+   unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+   Inst.addOperand(
+       MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0));
+ }
+
+ /// Adds this operand's vector index to Inst as an immediate.
+ void addVectorIndex1Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Index = getVectorIndex();
+   Inst.addOperand(MCOperand::createImm(Index));
+ }
+
+ /// Adds this operand's vector index to Inst as an immediate.
+ void addVectorIndexBOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Index = getVectorIndex();
+   Inst.addOperand(MCOperand::createImm(Index));
+ }
+
+ /// Adds this operand's vector index to Inst as an immediate.
+ void addVectorIndexHOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Index = getVectorIndex();
+   Inst.addOperand(MCOperand::createImm(Index));
+ }
+
+ /// Adds this operand's vector index to Inst as an immediate.
+ void addVectorIndexSOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Index = getVectorIndex();
+   Inst.addOperand(MCOperand::createImm(Index));
+ }
+
+ /// Adds this operand's vector index to Inst as an immediate.
+ void addVectorIndexDOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Index = getVectorIndex();
+   Inst.addOperand(MCOperand::createImm(Index));
+ }
+
+ /// Adds this operand's immediate expression to Inst via addExpr(); the
+ /// expression is passed through as-is, with no adjustment made here.
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCExpr *Expr = getImm();
+   addExpr(Inst, Expr);
+ }
+
+ /// Adds two operands: the immediate expression, then its shift amount. A
+ /// shifted immediate contributes its own value and shift; a plain immediate
+ /// is paired with a shift of 0.
+ void addAddSubImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const bool Shifted = isShiftedImm();
+   addExpr(Inst, Shifted ? getShiftedImmVal() : getImm());
+   const unsigned Shift = Shifted ? getShiftedImmShift() : 0;
+   Inst.addOperand(MCOperand::createImm(Shift));
+ }
+
+ /// Adds two operands: the negated constant immediate, then its shift amount
+ /// (0 when this is not a shifted immediate). The immediate expression is
+ /// required to be an MCConstantExpr.
+ void addAddSubImmNegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+
+   const bool Shifted = isShiftedImm();
+   const MCExpr *Expr = Shifted ? getShiftedImmVal() : getImm();
+   const int64_t Negated = -cast<MCConstantExpr>(Expr)->getValue();
+   const unsigned Shift = Shifted ? ShiftedImm.ShiftAmount : 0;
+
+   Inst.addOperand(MCOperand::createImm(Negated));
+   Inst.addOperand(MCOperand::createImm(Shift));
+ }
+
+ /// Adds this operand's condition code to Inst as an immediate.
+ void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Code = getCondCode();
+   Inst.addOperand(MCOperand::createImm(Code));
+ }
+
+ /// Adds the ADRP label operand. A constant immediate is added shifted right
+ /// by 12 bits; any other expression is forwarded to addExpr() unchanged.
+ void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   if (const auto *Constant = dyn_cast<MCConstantExpr>(getImm())) {
+     Inst.addOperand(MCOperand::createImm(Constant->getValue() >> 12));
+     return;
+   }
+   addExpr(Inst, getImm());
+ }
+
+ /// Adds the ADR label operand; this simply delegates to addImmOperands().
+ void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+   return addImmOperands(Inst, N);
+ }
+
+ /// Adds the UImm12 offset operand. A constant offset is divided by Scale
+ /// before being added as an immediate; any other expression is added
+ /// unchanged as an expression operand.
+ template<int Scale>
+ void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   if (const auto *Constant = dyn_cast<MCConstantExpr>(getImm()))
+     Inst.addOperand(MCOperand::createImm(Constant->getValue() / Scale));
+   else
+     Inst.addOperand(MCOperand::createExpr(getImm()));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addSImm9Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's constant immediate divided by 4. The immediate is
+ /// required to be an MCConstantExpr.
+ void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value / 4));
+ }
+
+ /// Adds this operand's constant immediate divided by 8. The immediate is
+ /// required to be an MCConstantExpr.
+ void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value / 8));
+ }
+
+ /// Adds this operand's constant immediate divided by 16. The immediate is
+ /// required to be an MCConstantExpr.
+ void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value / 16));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm0_1Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm0_7Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm1_8Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm0_15Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // cast<> already requires the expression to be an MCConstantExpr, so no
+   // separate null assertion is needed (matching the sibling addImm* methods).
+   const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm0_31Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm1_31Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm0_63Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm1_63Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm1_64Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm0_127Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm0_255Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's immediate, which is required to be an
+ /// MCConstantExpr, to Inst as a plain immediate value.
+ void addImm32_63Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds the logical-immediate encoding (register size 32) of the low 32
+ /// bits of this operand's constant immediate.
+ void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   const uint64_t Encoded =
+       AArch64_AM::encodeLogicalImmediate(Value & 0xFFFFFFFF, 32);
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Adds the logical-immediate encoding (register size 64) of this operand's
+ /// constant immediate.
+ void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   const uint64_t Encoded = AArch64_AM::encodeLogicalImmediate(Value, 64);
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Adds the logical-immediate encoding (register size 32) of the bitwise
+ /// complement of this operand's constant immediate, masked to 32 bits.
+ void addLogicalImm32NotOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   const int64_t Inverted = ~Value & 0xFFFFFFFF;
+   const uint64_t Encoded = AArch64_AM::encodeLogicalImmediate(Inverted, 32);
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Adds the logical-immediate encoding (register size 64) of the bitwise
+ /// complement of this operand's constant immediate.
+ void addLogicalImm64NotOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   const uint64_t Encoded = AArch64_AM::encodeLogicalImmediate(~Value, 64);
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Adds the AdvSIMD modified-immediate type 10 encoding of this operand's
+ /// constant immediate.
+ void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const int64_t Value = cast<MCConstantExpr>(getImm())->getValue();
+   const uint64_t Encoded = AArch64_AM::encodeAdvSIMDModImmType10(Value);
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Adds the branch target operand. A constant target is added shifted right
+ /// by two bits, because branch operands don't encode the low bits. A label
+ /// (any non-constant expression) is added as-is, since there is not enough
+ /// information at this point to do anything else with it.
+ void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+   if (!MCE) {
+     addExpr(Inst, getImm());
+     return;
+   }
+   // MCE is known to be non-null here, so no further assertion is needed.
+   Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
+ }
+
+ /// Adds the PC-relative label operand. A constant target is added shifted
+ /// right by two bits, because these operands don't encode the low bits. A
+ /// label (any non-constant expression) is added as-is, since there is not
+ /// enough information at this point to do anything else with it.
+ void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+   if (!MCE) {
+     addExpr(Inst, getImm());
+     return;
+   }
+   // MCE is known to be non-null here, so no further assertion is needed.
+   Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
+ }
+
+ /// Adds the branch target operand. A constant target is added shifted right
+ /// by two bits, because branch operands don't encode the low bits. A label
+ /// (any non-constant expression) is added as-is, since there is not enough
+ /// information at this point to do anything else with it.
+ void addBranchTarget14Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+   if (!MCE) {
+     addExpr(Inst, getImm());
+     return;
+   }
+   // MCE is known to be non-null here, so no further assertion is needed.
+   Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
+ }
+
+ /// Adds this operand's FP immediate value to Inst as an immediate.
+ void addFPImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Value = getFPImm();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's barrier value to Inst as an immediate.
+ void addBarrierOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Value = getBarrier();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds the MRSReg value recorded for this system-register operand as an
+ /// immediate.
+ void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Encoding = SysReg.MRSReg;
+   Inst.addOperand(MCOperand::createImm(Encoding));
+ }
+
+ /// Adds the MSRReg value recorded for this system-register operand as an
+ /// immediate.
+ void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Encoding = SysReg.MSRReg;
+   Inst.addOperand(MCOperand::createImm(Encoding));
+ }
+
+ /// Adds the PStateField value recorded for this system-register operand as
+ /// an immediate.
+ void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Field = SysReg.PStateField;
+   Inst.addOperand(MCOperand::createImm(Field));
+ }
+
+ /// Adds the PStateField value recorded for this system-register operand as
+ /// an immediate.
+ void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Field = SysReg.PStateField;
+   Inst.addOperand(MCOperand::createImm(Field));
+ }
+
+ /// Adds this operand's SysCR value to Inst as an immediate.
+ void addSysCROperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Value = getSysCR();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's prefetch value to Inst as an immediate.
+ void addPrefetchOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Value = getPrefetch();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds this operand's PSB hint value to Inst as an immediate.
+ void addPSBHintOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto Value = getPSBHint();
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Adds the shifter immediate that AArch64_AM::getShifterImm() builds from
+ /// this operand's shift/extend type and amount.
+ void addShifterOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const AArch64_AM::ShiftExtendType Type = getShiftExtendType();
+   const unsigned Encoded =
+       AArch64_AM::getShifterImm(Type, getShiftExtendAmount());
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Adds the arithmetic-extend immediate for this operand. An LSL type is
+ /// treated as UXTW before encoding.
+ void addExtendOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const AArch64_AM::ShiftExtendType Parsed = getShiftExtendType();
+   const AArch64_AM::ShiftExtendType Effective =
+       Parsed == AArch64_AM::LSL ? AArch64_AM::UXTW : Parsed;
+   const unsigned Encoded =
+       AArch64_AM::getArithExtendImm(Effective, getShiftExtendAmount());
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Adds the arithmetic-extend immediate for this operand. An LSL type is
+ /// treated as UXTX before encoding.
+ void addExtend64Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const AArch64_AM::ShiftExtendType Parsed = getShiftExtendType();
+   const AArch64_AM::ShiftExtendType Effective =
+       Parsed == AArch64_AM::LSL ? AArch64_AM::UXTX : Parsed;
+   const unsigned Encoded =
+       AArch64_AM::getArithExtendImm(Effective, getShiftExtendAmount());
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Adds two immediates describing a memory extend: whether the extend is
+ /// signed (SXTW or SXTX), then whether the shift amount is non-zero.
+ void addMemExtendOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const AArch64_AM::ShiftExtendType Type = getShiftExtendType();
+   const bool IsSigned =
+       Type == AArch64_AM::SXTW || Type == AArch64_AM::SXTX;
+   const bool DoShift = getShiftExtendAmount() != 0;
+   Inst.addOperand(MCOperand::createImm(IsSigned));
+   Inst.addOperand(MCOperand::createImm(DoShift));
+ }
+
+ // 8-bit load/store instructions with a register offset have a shift of 0 in
+ // both their "DoShift" and "NoShift" variants, so the shift size cannot
+ // tell them apart. The second operand therefore records whether a shift
+ // amount was written explicitly rather than whether it is non-zero.
+ void addMemExtend8Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const AArch64_AM::ShiftExtendType Type = getShiftExtendType();
+   const bool IsSigned =
+       Type == AArch64_AM::SXTW || Type == AArch64_AM::SXTX;
+   const bool ExplicitShift = hasShiftExtendAmount();
+   Inst.addOperand(MCOperand::createImm(IsSigned));
+   Inst.addOperand(MCOperand::createImm(ExplicitShift));
+ }
+
+ /// Adds the 16-bit field of this operand's constant immediate that starts
+ /// at bit Shift.
+ template<int Shift>
+ void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+
+   const uint64_t Bits = cast<MCConstantExpr>(getImm())->getValue();
+   const uint64_t Chunk = (Bits >> Shift) & 0xffff;
+   Inst.addOperand(MCOperand::createImm(Chunk));
+ }
+
+ /// Adds the 16-bit field, starting at bit Shift, of the bitwise complement
+ /// of this operand's constant immediate.
+ template<int Shift>
+ void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+
+   const uint64_t Bits = cast<MCConstantExpr>(getImm())->getValue();
+   const uint64_t Chunk = (~Bits >> Shift) & 0xffff;
+   Inst.addOperand(MCOperand::createImm(Chunk));
+ }
+
+ void print(raw_ostream &OS) const override;
+
+ /// Creates a k_Token operand that refers to the text of Str (the characters
+ /// are not copied). IsSuffix is recorded on the token, and both the start
+ /// and end locations are set to S.
+ static std::unique_ptr<AArch64Operand>
+ CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) {
+   auto Token = make_unique<AArch64Operand>(k_Token, Ctx);
+   Token->StartLoc = S;
+   Token->EndLoc = S;
+   Token->Tok.IsSuffix = IsSuffix;
+   Token->Tok.Length = Str.size();
+   Token->Tok.Data = Str.data();
+   return Token;
+ }
+
+ /// Creates a k_Register operand for RegNum spanning [S, E]; isVector is
+ /// stored alongside the register number.
+ static std::unique_ptr<AArch64Operand>
+ CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) {
+   auto Register = make_unique<AArch64Operand>(k_Register, Ctx);
+   Register->StartLoc = S;
+   Register->EndLoc = E;
+   Register->Reg.isVector = isVector;
+   Register->Reg.RegNum = RegNum;
+   return Register;
+ }
+
+ /// Creates a k_VectorList operand spanning [S, E] that records the first
+ /// register, the register count, the element count and the element kind.
+ static std::unique_ptr<AArch64Operand>
+ CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
+                  char ElementKind, SMLoc S, SMLoc E, MCContext &Ctx) {
+   auto List = make_unique<AArch64Operand>(k_VectorList, Ctx);
+   List->StartLoc = S;
+   List->EndLoc = E;
+   List->VectorList.ElementKind = ElementKind;
+   List->VectorList.NumElements = NumElements;
+   List->VectorList.Count = Count;
+   List->VectorList.RegNum = RegNum;
+   return List;
+ }
+
+ /// Creates a k_VectorIndex operand holding Idx and spanning [S, E].
+ static std::unique_ptr<AArch64Operand>
+ CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
+   auto Index = make_unique<AArch64Operand>(k_VectorIndex, Ctx);
+   Index->StartLoc = S;
+   Index->EndLoc = E;
+   Index->VectorIndex.Val = Idx;
+   return Index;
+ }
+
+ /// Creates a k_Immediate operand wrapping the expression Val and spanning
+ /// [S, E].
+ static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S,
+                                                  SMLoc E, MCContext &Ctx) {
+   auto Immediate = make_unique<AArch64Operand>(k_Immediate, Ctx);
+   Immediate->StartLoc = S;
+   Immediate->EndLoc = E;
+   Immediate->Imm.Val = Val;
+   return Immediate;
+ }
+
+ /// Creates a k_ShiftedImm operand spanning [S, E] that pairs the expression
+ /// Val with ShiftAmount.
+ static std::unique_ptr<AArch64Operand> CreateShiftedImm(const MCExpr *Val,
+                                                         unsigned ShiftAmount,
+                                                         SMLoc S, SMLoc E,
+                                                         MCContext &Ctx) {
+   auto Shifted = make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
+   Shifted->StartLoc = S;
+   Shifted->EndLoc = E;
+   Shifted->ShiftedImm.ShiftAmount = ShiftAmount;
+   Shifted->ShiftedImm.Val = Val;
+   return Shifted;
+ }
+
+ /// Creates a k_CondCode operand holding Code and spanning [S, E].
+ static std::unique_ptr<AArch64Operand>
+ CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) {
+   auto Cond = make_unique<AArch64Operand>(k_CondCode, Ctx);
+   Cond->StartLoc = S;
+   Cond->EndLoc = E;
+   Cond->CondCode.Code = Code;
+   return Cond;
+ }
+
+ /// Creates a k_FPImm operand holding Val; both the start and end locations
+ /// are set to S.
+ static std::unique_ptr<AArch64Operand> CreateFPImm(unsigned Val, SMLoc S,
+                                                    MCContext &Ctx) {
+   auto FP = make_unique<AArch64Operand>(k_FPImm, Ctx);
+   FP->StartLoc = S;
+   FP->EndLoc = S;
+   FP->FPImm.Val = Val;
+   return FP;
+ }
+
+ /// Creates a k_Barrier operand holding the value Val and a reference to the
+ /// name text Str (the characters are not copied). Both the start and end
+ /// locations are set to S.
+ static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val,
+                                                      StringRef Str,
+                                                      SMLoc S,
+                                                      MCContext &Ctx) {
+   auto Barrier = make_unique<AArch64Operand>(k_Barrier, Ctx);
+   Barrier->StartLoc = S;
+   Barrier->EndLoc = S;
+   Barrier->Barrier.Length = Str.size();
+   Barrier->Barrier.Data = Str.data();
+   Barrier->Barrier.Val = Val;
+   return Barrier;
+ }
+
+ /// Creates a k_SysReg operand that refers to the name text Str (the
+ /// characters are not copied) and records the MRSReg, MSRReg and PStateField
+ /// values. Both the start and end locations are set to S.
+ static std::unique_ptr<AArch64Operand> CreateSysReg(StringRef Str, SMLoc S,
+                                                     uint32_t MRSReg,
+                                                     uint32_t MSRReg,
+                                                     uint32_t PStateField,
+                                                     MCContext &Ctx) {
+   auto Sys = make_unique<AArch64Operand>(k_SysReg, Ctx);
+   Sys->StartLoc = S;
+   Sys->EndLoc = S;
+   Sys->SysReg.PStateField = PStateField;
+   Sys->SysReg.MSRReg = MSRReg;
+   Sys->SysReg.MRSReg = MRSReg;
+   Sys->SysReg.Length = Str.size();
+   Sys->SysReg.Data = Str.data();
+   return Sys;
+ }
+
+ /// Creates a k_SysCR operand holding Val and spanning [S, E].
+ static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S,
+                                                    SMLoc E, MCContext &Ctx) {
+   auto CR = make_unique<AArch64Operand>(k_SysCR, Ctx);
+   CR->StartLoc = S;
+   CR->EndLoc = E;
+   CR->SysCRImm.Val = Val;
+   return CR;
+ }
+
+ /// Creates a k_Prefetch operand holding the value Val and a reference to the
+ /// name text Str (the characters are not copied). Both the start and end
+ /// locations are set to S.
+ static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val,
+                                                       StringRef Str,
+                                                       SMLoc S,
+                                                       MCContext &Ctx) {
+   auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx);
+   Op->Prefetch.Val = Val;
+   // Store the name in the Prefetch member, the one that matches k_Prefetch,
+   // rather than in the unrelated Barrier member.
+   Op->Prefetch.Data = Str.data();
+   Op->Prefetch.Length = Str.size();
+   Op->StartLoc = S;
+   Op->EndLoc = S;
+   return Op;
+ }
+
+ /// Creates a k_PSBHint operand holding the value Val and a reference to the
+ /// name text Str (the characters are not copied). Both the start and end
+ /// locations are set to S.
+ static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val,
+                                                      StringRef Str,
+                                                      SMLoc S,
+                                                      MCContext &Ctx) {
+   auto Hint = make_unique<AArch64Operand>(k_PSBHint, Ctx);
+   Hint->StartLoc = S;
+   Hint->EndLoc = S;
+   Hint->PSBHint.Length = Str.size();
+   Hint->PSBHint.Data = Str.data();
+   Hint->PSBHint.Val = Val;
+   return Hint;
+ }
+
+ /// Creates a k_ShiftExtend operand spanning [S, E] with type ShOp and
+ /// amount Val. HasExplicitAmount is stored so that later code can tell
+ /// whether the amount was written explicitly.
+ static std::unique_ptr<AArch64Operand>
+ CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
+                   bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
+   auto Shift = make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
+   Shift->StartLoc = S;
+   Shift->EndLoc = E;
+   Shift->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
+   Shift->ShiftExtend.Amount = Val;
+   Shift->ShiftExtend.Type = ShOp;
+   return Shift;
+ }
+};
+
+} // end anonymous namespace.
+
+ /// Writes a textual description of this operand to OS. The format depends
+ /// on Kind; named kinds (barrier, prefetch) fall back to an "invalid #N"
+ /// form when no name was recorded.
+ void AArch64Operand::print(raw_ostream &OS) const {
+   switch (Kind) {
+   case k_Token:
+     OS << "'" << getToken() << "'";
+     break;
+   case k_Register:
+     OS << "<register " << getReg() << ">";
+     break;
+   case k_Immediate:
+     OS << *getImm();
+     break;
+   case k_ShiftedImm: {
+     const unsigned ShiftAmount = getShiftedImmShift();
+     OS << "<shiftedimm ";
+     OS << *getShiftedImmVal();
+     OS << ", lsl #" << AArch64_AM::getShiftValue(ShiftAmount) << ">";
+     break;
+   }
+   case k_FPImm:
+     OS << "<fpimm " << getFPImm() << "("
+        << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+     break;
+   case k_CondCode:
+     OS << "<condcode " << getCondCode() << ">";
+     break;
+   case k_VectorList: {
+     OS << "<vectorlist ";
+     // Print each register number in the list, starting from the first.
+     const unsigned First = getVectorListStart();
+     const unsigned Count = getVectorListCount();
+     for (unsigned Offset = 0; Offset != Count; ++Offset)
+       OS << First + Offset << " ";
+     OS << ">";
+     break;
+   }
+   case k_VectorIndex:
+     OS << "<vectorindex " << getVectorIndex() << ">";
+     break;
+   case k_Barrier: {
+     const StringRef BarrierName = getBarrierName();
+     if (BarrierName.empty())
+       OS << "<barrier invalid #" << getBarrier() << ">";
+     else
+       OS << "<barrier " << BarrierName << ">";
+     break;
+   }
+   case k_Prefetch: {
+     const StringRef HintName = getPrefetchName();
+     if (HintName.empty())
+       OS << "<prfop invalid #" << getPrefetch() << ">";
+     else
+       OS << "<prfop " << HintName << ">";
+     break;
+   }
+   case k_PSBHint:
+     OS << getPSBHintName();
+     break;
+   case k_SysReg:
+     OS << "<sysreg: " << getSysReg() << '>';
+     break;
+   case k_SysCR:
+     OS << "c" << getSysCR();
+     break;
+   case k_ShiftExtend: {
+     OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
+        << getShiftExtendAmount();
+     // Mark amounts that were not written explicitly.
+     if (!hasShiftExtendAmount())
+       OS << "<imp>";
+     OS << '>';
+     break;
+   }
+   }
+ }
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+ /// Maps a vector register name "v0".."v31" (compared case-insensitively) to
+ /// the corresponding AArch64::Q0..Q31 register. Returns 0 for any other
+ /// name.
+ static unsigned matchVectorRegName(StringRef Name) {
+   static const struct {
+     const char *Spelling;
+     unsigned Reg;
+   } VectorRegs[] = {
+       {"v0", AArch64::Q0},   {"v1", AArch64::Q1},
+       {"v2", AArch64::Q2},   {"v3", AArch64::Q3},
+       {"v4", AArch64::Q4},   {"v5", AArch64::Q5},
+       {"v6", AArch64::Q6},   {"v7", AArch64::Q7},
+       {"v8", AArch64::Q8},   {"v9", AArch64::Q9},
+       {"v10", AArch64::Q10}, {"v11", AArch64::Q11},
+       {"v12", AArch64::Q12}, {"v13", AArch64::Q13},
+       {"v14", AArch64::Q14}, {"v15", AArch64::Q15},
+       {"v16", AArch64::Q16}, {"v17", AArch64::Q17},
+       {"v18", AArch64::Q18}, {"v19", AArch64::Q19},
+       {"v20", AArch64::Q20}, {"v21", AArch64::Q21},
+       {"v22", AArch64::Q22}, {"v23", AArch64::Q23},
+       {"v24", AArch64::Q24}, {"v25", AArch64::Q25},
+       {"v26", AArch64::Q26}, {"v27", AArch64::Q27},
+       {"v28", AArch64::Q28}, {"v29", AArch64::Q29},
+       {"v30", AArch64::Q30}, {"v31", AArch64::Q31},
+   };
+
+   const std::string Lowered = Name.lower();
+   for (const auto &Entry : VectorRegs)
+     if (Lowered == Entry.Spelling)
+       return Entry.Reg;
+   return 0;
+ }
+
+ /// Returns true if Name (compared case-insensitively) is one of the
+ /// recognized vector kind qualifiers, e.g. ".8b" or ".2d".
+ static bool isValidVectorKind(StringRef Name) {
+   static const char *const Kinds[] = {
+       ".8b", ".16b", ".4h", ".8h", ".2s", ".4s", ".1d", ".2d", ".1q",
+       // The width-neutral kinds are accepted too, for verbose syntax. If
+       // they are used in the wrong place the token operand won't match, so
+       // all will work out.
+       ".b", ".h", ".s", ".d",
+       // Needed for fp16 scalar pairwise reductions.
+       ".2h",
+   };
+
+   const std::string Lowered = Name.lower();
+   for (const char *Kind : Kinds)
+     if (Lowered == Kind)
+       return true;
+   return false;
+ }
+
+ /// Splits an already-validated vector kind such as ".4s" into its parts.
+ /// ElementKind receives the final character in lower case. NumElements
+ /// receives the decimal lane count that follows the '.', or 0 when Name is
+ /// only two characters long (a width-neutral kind such as ".b").
+ static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
+                                  char &ElementKind) {
+   assert(isValidVectorKind(Name));
+
+   const size_t Length = Name.size();
+   ElementKind = Name.lower()[Length - 1];
+   NumElements = 0;
+
+   if (Length == 2)
+     return;
+
+   // Skip the leading '.', then accumulate the digits of the lane count.
+   for (StringRef Rest = Name.drop_front(); isdigit(Rest.front());
+        Rest = Rest.drop_front())
+     NumElements = 10 * NumElements + (Rest.front() - '0');
+ }
+
+ /// Attempts to parse a register at the current location. RegNo receives
+ /// the result of tryParseRegister(). StartLoc is the location before the
+ /// attempt and EndLoc is one character before the location after it.
+ /// Returns true (failure) when tryParseRegister() reported -1.
+ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                      SMLoc &EndLoc) {
+   StartLoc = getLoc();
+   RegNo = tryParseRegister();
+   const char *LastChar = getLoc().getPointer() - 1;
+   EndLoc = SMLoc::getFromPointer(LastChar);
+   const bool Failed = RegNo == (unsigned)-1;
+   return Failed;
+ }
+
+ // Resolves Name to a register number. It is matched first as a built-in
+ // vector or scalar register name (selected by isVector), then as an alias
+ // previously defined by '.req'. Returns 0 when nothing matches or when the
+ // alias refers to the other kind of register.
+ unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
+                                                   bool isVector) {
+   if (unsigned Direct = isVector ? matchVectorRegName(Name)
+                                  : MatchRegisterName(Name))
+     return Direct;
+
+   // Look up aliases registered via .req using the lower-cased name. That is
+   // more consistent since register names are case insensitive, and it is
+   // how the original entry was passed in from MC/MCParser/AsmParser.
+   auto Alias = RegisterReqs.find(Name.lower());
+   if (Alias == RegisterReqs.end())
+     return 0;
+
+   // Only accept the alias if it names the requested kind of register.
+   if (isVector != Alias->getValue().first)
+     return 0;
+   return Alias->getValue().second;
+ }
+
+ /// tryParseRegister - Try to parse a scalar register name at the current
+ /// token. If the token is an identifier naming a register (or one of the
+ /// aliases fp, lr, x31, w31) it is eaten and the register number returned;
+ /// otherwise nothing is consumed and -1 is returned.
+ int AArch64AsmParser::tryParseRegister() {
+   MCAsmParser &Parser = getParser();
+   const AsmToken &Tok = Parser.getTok();
+   if (Tok.isNot(AsmToken::Identifier))
+     return -1;
+
+   const std::string Lowered = Tok.getString().lower();
+   unsigned RegNum = matchRegisterNameAlias(Lowered, false);
+   if (!RegNum) {
+     // Fall back to a few extra aliases of registers.
+     RegNum = StringSwitch<unsigned>(Lowered)
+                  .Case("fp", AArch64::FP)
+                  .Case("lr", AArch64::LR)
+                  .Case("x31", AArch64::XZR)
+                  .Case("w31", AArch64::WZR)
+                  .Default(0);
+     if (!RegNum)
+       return -1;
+   }
+
+   Parser.Lex(); // Eat identifier token.
+   return RegNum;
+ }
+
+ /// tryMatchVectorRegister - Try to parse a vector register name with an
+ /// optional kind specifier. On success the token is eaten, Kind receives
+ /// the specifier (from the '.' onwards, if one is present) and the register
+ /// number is returned. On failure -1 is returned; an error is reported
+ /// unless the token was an identifier that simply isn't a vector register
+ /// and 'expected' is false.
+ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
+   MCAsmParser &Parser = getParser();
+   if (Parser.getTok().isNot(AsmToken::Identifier)) {
+     TokError("vector register expected");
+     return -1;
+   }
+
+   // A kind specifier, if there is one, is separated from the register name
+   // by a '.'.
+   const StringRef Spelling = Parser.getTok().getString();
+   const size_t Dot = Spelling.find('.');
+   const unsigned RegNum = matchRegisterNameAlias(Spelling.slice(0, Dot), true);
+
+   if (!RegNum) {
+     if (expected)
+       TokError("vector register expected");
+     return -1;
+   }
+
+   if (Dot != StringRef::npos) {
+     Kind = Spelling.slice(Dot, StringRef::npos);
+     if (!isValidVectorKind(Kind)) {
+       TokError("invalid vector kind qualifier");
+       return -1;
+     }
+   }
+
+   Parser.Lex(); // Eat the register token.
+   return RegNum;
+ }
+
+ /// tryParseSysCROperand - Try to parse a system instruction CR operand name
+ /// of the form cN (or CN) with 0 <= N <= 15. On success the token is eaten
+ /// and a SysCR operand is appended to Operands; otherwise an error is
+ /// reported at the operand's start location.
+ OperandMatchResultTy
+ AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   SMLoc S = getLoc();
+   const char *const ExpectedMsg = "Expected cN operand where 0 <= N <= 15";
+
+   if (Parser.getTok().isNot(AsmToken::Identifier)) {
+     Error(S, ExpectedMsg);
+     return MatchOperand_ParseFail;
+   }
+
+   const StringRef Ident = Parser.getTok().getIdentifier();
+   const bool HasPrefix = Ident[0] == 'c' || Ident[0] == 'C';
+   uint32_t CRNum = 0;
+   // getAsInteger() returns true on failure; it is only attempted once the
+   // prefix has been accepted.
+   if (!HasPrefix || Ident.drop_front().getAsInteger(10, CRNum) ||
+       CRNum > 15) {
+     Error(S, ExpectedMsg);
+     return MatchOperand_ParseFail;
+   }
+
+   Parser.Lex(); // Eat identifier token.
+   Operands.push_back(
+       AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext()));
+   return MatchOperand_Success;
+ }
+
+ /// tryParsePrefetch - Try to parse a prefetch operand: either a named hint
+ /// (an identifier) or an immediate in [0,31] with an optional leading '#'.
+ /// On success a Prefetch operand is appended to Operands; otherwise an error
+ /// is reported and MatchOperand_ParseFail is returned.
+ OperandMatchResultTy
+ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   SMLoc S = getLoc();
+   const AsmToken &Tok = Parser.getTok();
+   // Either an identifier for named values or a 5-bit immediate.
+   // Eat optional hash.
+   if (parseOptionalToken(AsmToken::Hash) ||
+       Tok.is(AsmToken::Integer)) {
+     const MCExpr *ImmVal;
+     if (getParser().parseExpression(ImmVal))
+       return MatchOperand_ParseFail;
+
+     const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+     if (!MCE) {
+       TokError("immediate value expected for prefetch operand");
+       return MatchOperand_ParseFail;
+     }
+     // Range-check the full 64-bit value. Narrowing to unsigned first would
+     // let values such as 2^32 + 1 wrap into the accepted range.
+     const int64_t Value = MCE->getValue();
+     if (Value < 0 || Value > 31) {
+       TokError("prefetch operand out of range, [0,31] expected");
+       return MatchOperand_ParseFail;
+     }
+     const unsigned prfop = static_cast<unsigned>(Value);
+
+     auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop);
+     Operands.push_back(AArch64Operand::CreatePrefetch(
+         prfop, PRFM ? PRFM->Name : "", S, getContext()));
+     return MatchOperand_Success;
+   }
+
+   if (Tok.isNot(AsmToken::Identifier)) {
+     TokError("pre-fetch hint expected");
+     return MatchOperand_ParseFail;
+   }
+
+   // Tok is a reference to the parser's current token, so take the hint name
+   // before Lex() advances to the next token.
+   const StringRef HintName = Tok.getString();
+   auto PRFM = AArch64PRFM::lookupPRFMByName(HintName);
+   if (!PRFM) {
+     TokError("pre-fetch hint expected");
+     return MatchOperand_ParseFail;
+   }
+
+   Parser.Lex(); // Eat identifier token.
+   Operands.push_back(AArch64Operand::CreatePrefetch(
+       PRFM->Encoding, HintName, S, getContext()));
+   return MatchOperand_Success;
+ }
+
+ /// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command. The
+ /// current token must be an identifier naming a known PSB hint. On success
+ /// the token is eaten and a PSBHint operand is appended to Operands;
+ /// otherwise an error is reported and MatchOperand_ParseFail is returned.
+ OperandMatchResultTy
+ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   SMLoc S = getLoc();
+   const AsmToken &Tok = Parser.getTok();
+   if (Tok.isNot(AsmToken::Identifier)) {
+     TokError("invalid operand for instruction");
+     return MatchOperand_ParseFail;
+   }
+
+   // Tok is a reference to the parser's current token, so take the hint name
+   // before Lex() advances to the next token.
+   const StringRef HintName = Tok.getString();
+   auto PSB = AArch64PSBHint::lookupPSBByName(HintName);
+   if (!PSB) {
+     TokError("invalid operand for instruction");
+     return MatchOperand_ParseFail;
+   }
+
+   Parser.Lex(); // Eat identifier token.
+   Operands.push_back(AArch64Operand::CreatePSBHint(
+       PSB->Encoding, HintName, S, getContext()));
+   return MatchOperand_Success;
+ }
+
+/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
+/// instruction. An optional leading '#' is accepted; on success an Imm operand is appended to Operands.
+OperandMatchResultTy
+AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc S = getLoc();
+ const MCExpr *Expr;
+
+ if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat hash token.
+ }
+
+ if (parseSymbolicImmVal(Expr))
+ return MatchOperand_ParseFail;
+
+ AArch64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ int64_t Addend;
+ if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { // Expressions it rejects skip validation entirely.
+ if (DarwinRefKind == MCSymbolRefExpr::VK_None &&
+ ELFRefKind == AArch64MCExpr::VK_INVALID) {
+ // No modifier was specified at all; this is the syntax for an ELF basic
+ // ADRP relocation (unfortunately).
+ Expr =
+ AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
+ } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
+ DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) &&
+ Addend != 0) { // GOTPAGE/TLVPPAGE references must not carry an addend.
+ Error(S, "gotpage label reference not allowed an addend");
+ return MatchOperand_ParseFail;
+ } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
+ DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE &&
+ DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
+ ELFRefKind != AArch64MCExpr::VK_GOT_PAGE &&
+ ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE &&
+ ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) {
+ // The operand must be a page-style reference: one of the Darwin or ELF kinds tested above.
+ Error(S, "page or gotpage label reference expected");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ // We have either a label reference possibly with addend or an immediate. The
+ // addend is a raw value here. The linker will adjust it to only reference the
+ // page.
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); // One character before the current location.
+ Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+
+ return MatchOperand_Success;
+}
+
+/// tryParseAdrLabel - Parse the source label of an ADR instruction.
+///
+/// The label is read as a general expression, optionally preceded by '#',
+/// and appended to \p Operands as an immediate operand.
+OperandMatchResultTy
+AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+  const SMLoc StartLoc = getLoc();
+
+  // A leading '#' is permitted but not required.
+  parseOptionalToken(AsmToken::Hash);
+
+  const MCExpr *LabelExpr;
+  const bool ParseFailed = getParser().parseExpression(LabelExpr);
+  if (ParseFailed)
+    return MatchOperand_ParseFail;
+
+  // The operand ends one character before the current location.
+  const SMLoc EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(
+      AArch64Operand::CreateImm(LabelExpr, StartLoc, EndLoc, getContext()));
+  return MatchOperand_Success;
+}
+
+/// tryParseFPImm - Parse a floating point immediate operand: optional '#', optional '-', then a Real or Integer token.
+OperandMatchResultTy
+AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc S = getLoc();
+
+ bool Hash = parseOptionalToken(AsmToken::Hash);
+
+ // Handle negation, as that still comes through as a separate token.
+ bool isNegative = parseOptionalToken(AsmToken::Minus);
+
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.is(AsmToken::Real)) {
+ APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
+ if (isNegative)
+ RealVal.changeSign();
+
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); // -1 is treated below as "not encodable".
+ Parser.Lex(); // Eat the token.
+ // Check for out of range values. As an exception, we let Zero through,
+ // as we handle that special case in post-processing before matching in
+ // order to use the zero register for it.
+ if (Val == -1 && !RealVal.isPosZero()) {
+ TokError("expected compatible register or floating-point constant");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+ return MatchOperand_Success;
+ }
+ if (Tok.is(AsmToken::Integer)) {
+ int64_t Val;
+ if (!isNegative && Tok.getString().startswith("0x")) { // Hex literal: used directly as the encoded value, must fit in [0,255].
+ Val = Tok.getIntVal();
+ if (Val > 255 || Val < 0) {
+ TokError("encoded floating point value out of range");
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ // If we had a '-' in front, toggle the sign bit.
+ IntVal ^= (uint64_t)isNegative << 63;
+ Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); // NOTE(review): unlike the Real path, a -1 result is not diagnosed here; presumably rejected later, confirm.
+ }
+ Parser.Lex(); // Eat the token.
+ Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+ return MatchOperand_Success;
+ }
+
+ if (!Hash) // NOTE(review): a leading '-' may already have been consumed when NoMatch is returned; confirm callers tolerate that.
+ return MatchOperand_NoMatch;
+
+ TokError("invalid floating point immediate");
+ return MatchOperand_ParseFail;
+}
+
+/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand: an immediate optionally followed by ", lsl #N".
+OperandMatchResultTy
+AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc S = getLoc();
+
+ if (Parser.getTok().is(AsmToken::Hash))
+ Parser.Lex(); // Eat '#'
+ else if (Parser.getTok().isNot(AsmToken::Integer))
+ // Operand should start with '#' or be an integer; otherwise report no match (no diagnostic).
+ return MatchOperand_NoMatch;
+
+ const MCExpr *Imm;
+ if (parseSymbolicImmVal(Imm))
+ return MatchOperand_ParseFail;
+ else if (Parser.getTok().isNot(AsmToken::Comma)) { // No explicit shift follows.
+ uint64_t ShiftAmount = 0;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm);
+ if (MCE) {
+ int64_t Val = MCE->getValue();
+ if (Val > 0xfff && (Val & 0xfff) == 0) { // Constant above 0xfff with low 12 bits clear: store Val >> 12 with shift 12.
+ Imm = MCConstantExpr::create(Val >> 12, getContext());
+ ShiftAmount = 12;
+ }
+ }
+ SMLoc E = Parser.getTok().getLoc();
+ Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E,
+ getContext()));
+ return MatchOperand_Success;
+ }
+
+ // Eat ','
+ Parser.Lex();
+
+ // The optional operand must be "lsl #N" where N is non-negative.
+ if (!Parser.getTok().is(AsmToken::Identifier) ||
+ !Parser.getTok().getIdentifier().equals_lower("lsl")) {
+ Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+ return MatchOperand_ParseFail;
+ }
+
+ // Eat 'lsl'
+ Parser.Lex();
+
+ parseOptionalToken(AsmToken::Hash); // The '#' before the shift amount is optional.
+
+ if (Parser.getTok().isNot(AsmToken::Integer)) {
+ Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+ return MatchOperand_ParseFail;
+ }
+
+ int64_t ShiftAmount = Parser.getTok().getIntVal();
+
+ if (ShiftAmount < 0) {
+ Error(Parser.getTok().getLoc(), "positive shift amount required");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat the number
+
+ SMLoc E = Parser.getTok().getLoc();
+ Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
+ S, E, getContext()));
+ return MatchOperand_Success;
+}
+
+/// parseCondCodeString - Translate a condition code mnemonic into its
+/// AArch64CC::CondCode value. Matching is done on the lower-cased string;
+/// unrecognised mnemonics yield AArch64CC::Invalid.
+AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
+  // "cs"/"hs" and "cc"/"lo" are alternative spellings of the same codes.
+  return StringSwitch<AArch64CC::CondCode>(Cond.lower())
+      .Case("eq", AArch64CC::EQ)
+      .Case("ne", AArch64CC::NE)
+      .Cases("cs", "hs", AArch64CC::HS)
+      .Cases("cc", "lo", AArch64CC::LO)
+      .Case("mi", AArch64CC::MI)
+      .Case("pl", AArch64CC::PL)
+      .Case("vs", AArch64CC::VS)
+      .Case("vc", AArch64CC::VC)
+      .Case("hi", AArch64CC::HI)
+      .Case("ls", AArch64CC::LS)
+      .Case("ge", AArch64CC::GE)
+      .Case("lt", AArch64CC::LT)
+      .Case("gt", AArch64CC::GT)
+      .Case("le", AArch64CC::LE)
+      .Case("al", AArch64CC::AL)
+      .Case("nv", AArch64CC::NV)
+      .Default(AArch64CC::Invalid);
+}
+
+/// parseCondCode - Parse a Condition Code operand.
+///
+/// The current token must be an identifier naming a condition code. When
+/// \p invertCondCode is set, the inverted code is stored instead, and AL/NV
+/// are rejected. Returns true on error, false on success.
+bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
+                                     bool invertCondCode) {
+  MCAsmParser &AsmP = getParser();
+  const SMLoc StartLoc = getLoc();
+  const AsmToken &CondTok = AsmP.getTok();
+  assert(CondTok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  AArch64CC::CondCode Code = parseCondCodeString(CondTok.getString());
+  if (Code == AArch64CC::Invalid)
+    return TokError("invalid condition code");
+  AsmP.Lex(); // Consume the condition code identifier.
+
+  if (invertCondCode) {
+    const bool IsAlwaysCode = Code == AArch64CC::AL || Code == AArch64CC::NV;
+    if (IsAlwaysCode)
+      return TokError("condition codes AL and NV are invalid for this instruction");
+    Code = AArch64CC::getInvertedCondCode(Code);
+  }
+
+  const SMLoc EndLoc = getLoc();
+  Operands.push_back(
+      AArch64Operand::CreateCondCode(Code, StartLoc, EndLoc, getContext()));
+  return false;
+}
+
+/// tryParseOptionalShiftExtend - Some operands take an optional shift or extend
+/// argument. Parse it if present; returns NoMatch when the token is not a shift/extend name.
+OperandMatchResultTy
+AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ std::string LowerID = Tok.getString().lower(); // Names are matched case-insensitively.
+ AArch64_AM::ShiftExtendType ShOp =
+ StringSwitch<AArch64_AM::ShiftExtendType>(LowerID)
+ .Case("lsl", AArch64_AM::LSL)
+ .Case("lsr", AArch64_AM::LSR)
+ .Case("asr", AArch64_AM::ASR)
+ .Case("ror", AArch64_AM::ROR)
+ .Case("msl", AArch64_AM::MSL)
+ .Case("uxtb", AArch64_AM::UXTB)
+ .Case("uxth", AArch64_AM::UXTH)
+ .Case("uxtw", AArch64_AM::UXTW)
+ .Case("uxtx", AArch64_AM::UXTX)
+ .Case("sxtb", AArch64_AM::SXTB)
+ .Case("sxth", AArch64_AM::SXTH)
+ .Case("sxtw", AArch64_AM::SXTW)
+ .Case("sxtx", AArch64_AM::SXTX)
+ .Default(AArch64_AM::InvalidShiftExtend);
+
+ if (ShOp == AArch64_AM::InvalidShiftExtend)
+ return MatchOperand_NoMatch;
+
+ SMLoc S = Tok.getLoc();
+ Parser.Lex(); // Eat the shift/extend name.
+
+ bool Hash = parseOptionalToken(AsmToken::Hash);
+
+ if (!Hash && getLexer().isNot(AsmToken::Integer)) { // No amount was written.
+ if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR ||
+ ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR ||
+ ShOp == AArch64_AM::MSL) {
+ // We expect a number here.
+ TokError("expected #imm after shift specifier");
+ return MatchOperand_ParseFail;
+ }
+
+ // "extend" type operations don't need an immediate, #0 is implicit.
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(
+ AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext()));
+ return MatchOperand_Success;
+ }
+
+ // Make sure we do actually have a number, identifier or a parenthesized
+ // expression.
+ SMLoc E = Parser.getTok().getLoc();
+ if (!Parser.getTok().is(AsmToken::Integer) &&
+ !Parser.getTok().is(AsmToken::LParen) &&
+ !Parser.getTok().is(AsmToken::Identifier)) {
+ Error(E, "expected integer shift amount");
+ return MatchOperand_ParseFail;
+ }
+
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); // The amount must fold to a constant.
+ if (!MCE) {
+ Error(E, "expected constant '#imm' after shift specifier");
+ return MatchOperand_ParseFail;
+ }
+
+ E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(AArch64Operand::CreateShiftExtend(
+ ShOp, MCE->getValue(), true, S, E, getContext()));
+ return MatchOperand_Success;
+}
+
+/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
+/// the SYS instruction. Parse them specially so that we create a SYS MCInst. Returns true on error.
+bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
+ if (Name.find('.') != StringRef::npos)
+ return TokError("invalid operand");
+
+ Mnemonic = Name;
+ Operands.push_back(
+ AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
+
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ StringRef Op = Tok.getString(); // Copied now; Tok itself tracks the current token.
+ SMLoc S = Tok.getLoc();
+
+ const MCExpr *Expr = nullptr;
+
+#define SYS_ALIAS(op1, Cn, Cm, op2) \
+ do { \
+ Expr = MCConstantExpr::create(op1, getContext()); \
+ Operands.push_back( \
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
+ Operands.push_back( \
+ AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \
+ Operands.push_back( \
+ AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \
+ Expr = MCConstantExpr::create(op2, getContext()); \
+ Operands.push_back( \
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
+ } while (0)
+
+ if (Mnemonic == "ic") {
+ if (!Op.compare_lower("ialluis")) {
+ // SYS #0, C7, C1, #0
+ SYS_ALIAS(0, 7, 1, 0);
+ } else if (!Op.compare_lower("iallu")) {
+ // SYS #0, C7, C5, #0
+ SYS_ALIAS(0, 7, 5, 0);
+ } else if (!Op.compare_lower("ivau")) {
+ // SYS #3, C7, C5, #1
+ SYS_ALIAS(3, 7, 5, 1);
+ } else {
+ return TokError("invalid operand for IC instruction");
+ }
+ } else if (Mnemonic == "dc") {
+ if (!Op.compare_lower("zva")) {
+ // SYS #3, C7, C4, #1
+ SYS_ALIAS(3, 7, 4, 1);
+ } else if (!Op.compare_lower("ivac")) {
+ // SYS #0, C7, C6, #1
+ SYS_ALIAS(0, 7, 6, 1);
+ } else if (!Op.compare_lower("isw")) {
+ // SYS #0, C7, C6, #2
+ SYS_ALIAS(0, 7, 6, 2);
+ } else if (!Op.compare_lower("cvac")) {
+ // SYS #3, C7, C10, #1
+ SYS_ALIAS(3, 7, 10, 1);
+ } else if (!Op.compare_lower("csw")) {
+ // SYS #0, C7, C10, #2
+ SYS_ALIAS(0, 7, 10, 2);
+ } else if (!Op.compare_lower("cvau")) {
+ // SYS #3, C7, C11, #1
+ SYS_ALIAS(3, 7, 11, 1);
+ } else if (!Op.compare_lower("civac")) {
+ // SYS #3, C7, C14, #1
+ SYS_ALIAS(3, 7, 14, 1);
+ } else if (!Op.compare_lower("cisw")) {
+ // SYS #0, C7, C14, #2
+ SYS_ALIAS(0, 7, 14, 2);
+ } else if (!Op.compare_lower("cvap")) {
+ if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+ // SYS #3, C7, C12, #1
+ SYS_ALIAS(3, 7, 12, 1);
+ } else {
+ return TokError("DC CVAP requires ARMv8.2a");
+ }
+ } else {
+ return TokError("invalid operand for DC instruction");
+ }
+ } else if (Mnemonic == "at") {
+ if (!Op.compare_lower("s1e1r")) {
+ // SYS #0, C7, C8, #0
+ SYS_ALIAS(0, 7, 8, 0);
+ } else if (!Op.compare_lower("s1e2r")) {
+ // SYS #4, C7, C8, #0
+ SYS_ALIAS(4, 7, 8, 0);
+ } else if (!Op.compare_lower("s1e3r")) {
+ // SYS #6, C7, C8, #0
+ SYS_ALIAS(6, 7, 8, 0);
+ } else if (!Op.compare_lower("s1e1w")) {
+ // SYS #0, C7, C8, #1
+ SYS_ALIAS(0, 7, 8, 1);
+ } else if (!Op.compare_lower("s1e2w")) {
+ // SYS #4, C7, C8, #1
+ SYS_ALIAS(4, 7, 8, 1);
+ } else if (!Op.compare_lower("s1e3w")) {
+ // SYS #6, C7, C8, #1
+ SYS_ALIAS(6, 7, 8, 1);
+ } else if (!Op.compare_lower("s1e0r")) {
+ // SYS #0, C7, C8, #2
+ SYS_ALIAS(0, 7, 8, 2);
+ } else if (!Op.compare_lower("s1e0w")) {
+ // SYS #0, C7, C8, #3
+ SYS_ALIAS(0, 7, 8, 3);
+ } else if (!Op.compare_lower("s12e1r")) {
+ // SYS #4, C7, C8, #4
+ SYS_ALIAS(4, 7, 8, 4);
+ } else if (!Op.compare_lower("s12e1w")) {
+ // SYS #4, C7, C8, #5
+ SYS_ALIAS(4, 7, 8, 5);
+ } else if (!Op.compare_lower("s12e0r")) {
+ // SYS #4, C7, C8, #6
+ SYS_ALIAS(4, 7, 8, 6);
+ } else if (!Op.compare_lower("s12e0w")) {
+ // SYS #4, C7, C8, #7
+ SYS_ALIAS(4, 7, 8, 7);
+ } else if (!Op.compare_lower("s1e1rp")) {
+ if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+ // SYS #0, C7, C9, #0
+ SYS_ALIAS(0, 7, 9, 0);
+ } else {
+ return TokError("AT S1E1RP requires ARMv8.2a");
+ }
+ } else if (!Op.compare_lower("s1e1wp")) {
+ if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+ // SYS #0, C7, C9, #1
+ SYS_ALIAS(0, 7, 9, 1);
+ } else {
+ return TokError("AT S1E1WP requires ARMv8.2a");
+ }
+ } else {
+ return TokError("invalid operand for AT instruction");
+ }
+ } else if (Mnemonic == "tlbi") {
+ if (!Op.compare_lower("vmalle1is")) {
+ // SYS #0, C8, C3, #0
+ SYS_ALIAS(0, 8, 3, 0);
+ } else if (!Op.compare_lower("alle2is")) {
+ // SYS #4, C8, C3, #0
+ SYS_ALIAS(4, 8, 3, 0);
+ } else if (!Op.compare_lower("alle3is")) {
+ // SYS #6, C8, C3, #0
+ SYS_ALIAS(6, 8, 3, 0);
+ } else if (!Op.compare_lower("vae1is")) {
+ // SYS #0, C8, C3, #1
+ SYS_ALIAS(0, 8, 3, 1);
+ } else if (!Op.compare_lower("vae2is")) {
+ // SYS #4, C8, C3, #1
+ SYS_ALIAS(4, 8, 3, 1);
+ } else if (!Op.compare_lower("vae3is")) {
+ // SYS #6, C8, C3, #1
+ SYS_ALIAS(6, 8, 3, 1);
+ } else if (!Op.compare_lower("aside1is")) {
+ // SYS #0, C8, C3, #2
+ SYS_ALIAS(0, 8, 3, 2);
+ } else if (!Op.compare_lower("vaae1is")) {
+ // SYS #0, C8, C3, #3
+ SYS_ALIAS(0, 8, 3, 3);
+ } else if (!Op.compare_lower("alle1is")) {
+ // SYS #4, C8, C3, #4
+ SYS_ALIAS(4, 8, 3, 4);
+ } else if (!Op.compare_lower("vale1is")) {
+ // SYS #0, C8, C3, #5
+ SYS_ALIAS(0, 8, 3, 5);
+ } else if (!Op.compare_lower("vaale1is")) {
+ // SYS #0, C8, C3, #7
+ SYS_ALIAS(0, 8, 3, 7);
+ } else if (!Op.compare_lower("vmalle1")) {
+ // SYS #0, C8, C7, #0
+ SYS_ALIAS(0, 8, 7, 0);
+ } else if (!Op.compare_lower("alle2")) {
+ // SYS #4, C8, C7, #0
+ SYS_ALIAS(4, 8, 7, 0);
+ } else if (!Op.compare_lower("vale2is")) {
+ // SYS #4, C8, C3, #5
+ SYS_ALIAS(4, 8, 3, 5);
+ } else if (!Op.compare_lower("vale3is")) {
+ // SYS #6, C8, C3, #5
+ SYS_ALIAS(6, 8, 3, 5);
+ } else if (!Op.compare_lower("alle3")) {
+ // SYS #6, C8, C7, #0
+ SYS_ALIAS(6, 8, 7, 0);
+ } else if (!Op.compare_lower("vae1")) {
+ // SYS #0, C8, C7, #1
+ SYS_ALIAS(0, 8, 7, 1);
+ } else if (!Op.compare_lower("vae2")) {
+ // SYS #4, C8, C7, #1
+ SYS_ALIAS(4, 8, 7, 1);
+ } else if (!Op.compare_lower("vae3")) {
+ // SYS #6, C8, C7, #1
+ SYS_ALIAS(6, 8, 7, 1);
+ } else if (!Op.compare_lower("aside1")) {
+ // SYS #0, C8, C7, #2
+ SYS_ALIAS(0, 8, 7, 2);
+ } else if (!Op.compare_lower("vaae1")) {
+ // SYS #0, C8, C7, #3
+ SYS_ALIAS(0, 8, 7, 3);
+ } else if (!Op.compare_lower("alle1")) {
+ // SYS #4, C8, C7, #4
+ SYS_ALIAS(4, 8, 7, 4);
+ } else if (!Op.compare_lower("vale1")) {
+ // SYS #0, C8, C7, #5
+ SYS_ALIAS(0, 8, 7, 5);
+ } else if (!Op.compare_lower("vale2")) {
+ // SYS #4, C8, C7, #5
+ SYS_ALIAS(4, 8, 7, 5);
+ } else if (!Op.compare_lower("vale3")) {
+ // SYS #6, C8, C7, #5
+ SYS_ALIAS(6, 8, 7, 5);
+ } else if (!Op.compare_lower("vaale1")) {
+ // SYS #0, C8, C7, #7
+ SYS_ALIAS(0, 8, 7, 7);
+ } else if (!Op.compare_lower("ipas2e1")) {
+ // SYS #4, C8, C4, #1
+ SYS_ALIAS(4, 8, 4, 1);
+ } else if (!Op.compare_lower("ipas2le1")) {
+ // SYS #4, C8, C4, #5
+ SYS_ALIAS(4, 8, 4, 5);
+ } else if (!Op.compare_lower("ipas2e1is")) {
+ // SYS #4, C8, C0, #1
+ SYS_ALIAS(4, 8, 0, 1);
+ } else if (!Op.compare_lower("ipas2le1is")) {
+ // SYS #4, C8, C0, #5
+ SYS_ALIAS(4, 8, 0, 5);
+ } else if (!Op.compare_lower("vmalls12e1")) {
+ // SYS #4, C8, C7, #6
+ SYS_ALIAS(4, 8, 7, 6);
+ } else if (!Op.compare_lower("vmalls12e1is")) {
+ // SYS #4, C8, C3, #6
+ SYS_ALIAS(4, 8, 3, 6);
+ } else {
+ return TokError("invalid operand for TLBI instruction");
+ }
+ }
+
+#undef SYS_ALIAS
+
+ Parser.Lex(); // Eat operand.
+
+ bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); // Ops whose name contains "all" take no register.
+ bool HasRegister = false;
+
+ // Check for the optional register operand.
+ if (parseOptionalToken(AsmToken::Comma)) {
+ if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands))
+ return TokError("expected register operand");
+ HasRegister = true;
+ }
+
+ if (ExpectRegister && !HasRegister) {
+ return TokError("specified " + Mnemonic + " op requires a register");
+ }
+ else if (!ExpectRegister && HasRegister) {
+ return TokError("specified " + Mnemonic + " op does not use a register");
+ }
+
+ if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
+ return true;
+
+ return false;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { // Parse a barrier operand: an immediate in [0,15] or a named option.
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+
+ // Can be either a #imm style literal or an option name
+ if (parseOptionalToken(AsmToken::Hash) ||
+ Tok.is(AsmToken::Integer)) {
+ // Immediate operand.
+ const MCExpr *ImmVal;
+ SMLoc ExprLoc = getLoc();
+ if (getParser().parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ Error(ExprLoc, "immediate value expected for barrier operand");
+ return MatchOperand_ParseFail;
+ }
+ if (MCE->getValue() < 0 || MCE->getValue() > 15) {
+ Error(ExprLoc, "barrier operand out of range");
+ return MatchOperand_ParseFail;
+ }
+ auto DB = AArch64DB::lookupDBByEncoding(MCE->getValue()); // Encodings without a name get an empty name string below.
+ Operands.push_back(AArch64Operand::CreateBarrier(
+ MCE->getValue(), DB ? DB->Name : "", ExprLoc, getContext()));
+ return MatchOperand_Success;
+ }
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ TokError("invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ auto DB = AArch64DB::lookupDBByName(Tok.getString());
+ if (!DB) {
+ TokError("invalid barrier option name");
+ return MatchOperand_ParseFail;
+ }
+
+ // The only valid named option for ISB is 'sy'
+ if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) {
+ TokError("'sy' or #imm operand expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AArch64Operand::CreateBarrier(
+ DB->Encoding, Tok.getString(), getLoc(), getContext())); // Built before Lex() so Tok still names the option.
+ Parser.Lex(); // Consume the option
+
+ return MatchOperand_Success;
+}
+
+/// tryParseSysReg - Parse an identifier as a system register operand.
+///
+/// The operand records the encoding to use for MRS (read), for MSR (write)
+/// and as a PState immediate; each is -1 when the name is not usable in that
+/// role. Returns NoMatch if the current token is not an identifier.
+OperandMatchResultTy
+AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
+  MCAsmParser &AsmP = getParser();
+  const AsmToken &NameTok = AsmP.getTok();
+
+  if (NameTok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int ReadEnc = -1;
+  int WriteEnc = -1;
+  auto NamedReg = AArch64SysReg::lookupSysRegByName(NameTok.getString());
+  if (NamedReg && NamedReg->haveFeatures(getSTI().getFeatureBits())) {
+    if (NamedReg->Readable)
+      ReadEnc = NamedReg->Encoding;
+    if (NamedReg->Writeable)
+      WriteEnc = NamedReg->Encoding;
+  } else {
+    // Unknown or unavailable name: fall back to the generic register parser.
+    ReadEnc = WriteEnc =
+        AArch64SysReg::parseGenericRegister(NameTok.getString());
+  }
+
+  unsigned PStateEnc = -1;
+  auto NamedPState = AArch64PState::lookupPStateByName(NameTok.getString());
+  if (NamedPState && NamedPState->haveFeatures(getSTI().getFeatureBits()))
+    PStateEnc = NamedPState->Encoding;
+
+  // Build the operand while NameTok still refers to the register name.
+  Operands.push_back(
+      AArch64Operand::CreateSysReg(NameTok.getString(), getLoc(), ReadEnc,
+                                   WriteEnc, PStateEnc, getContext()));
+  AsmP.Lex(); // Eat identifier
+
+  return MatchOperand_Success;
+}
+
+/// tryParseVectorRegister - Parse a vector register operand, with optional kind suffix and "[index]". Returns true if no vector register was matched.
+bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ if (Parser.getTok().isNot(AsmToken::Identifier))
+ return true;
+
+ SMLoc S = getLoc();
+ // Check for a vector register specifier first.
+ StringRef Kind;
+ int64_t Reg = tryMatchVectorRegister(Kind, false);
+ if (Reg == -1)
+ return true;
+ Operands.push_back(
+ AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
+ // If there was an explicit qualifier, that goes on as a literal text
+ // operand.
+ if (!Kind.empty())
+ Operands.push_back(
+ AArch64Operand::CreateToken(Kind, false, S, getContext()));
+
+ // If there is an index specifier following the register, parse that too.
+ SMLoc SIdx = getLoc();
+ if (parseOptionalToken(AsmToken::LBrac)) {
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return false; // NOTE(review): returns false although the index failed to parse; confirm this is intended.
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for vector index");
+ return false; // NOTE(review): diagnostic emitted but false is returned; confirm.
+ }
+
+ SMLoc E = getLoc();
+
+ if (parseToken(AsmToken::RBrac, "']' expected"))
+ return false; // NOTE(review): same pattern, the index operand is simply not added.
+
+ Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+ E, getContext()));
+ }
+
+ return false;
+}
+
+/// parseRegister - Parse a register operand: a vector register is tried first, then a scalar one. Returns true if neither matched.
+bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc S = getLoc();
+ // Try for a vector register.
+ if (!tryParseVectorRegister(Operands))
+ return false;
+
+ // Try for a scalar register.
+ int64_t Reg = tryParseRegister();
+ if (Reg == -1)
+ return true;
+ Operands.push_back(
+ AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
+
+ // A small number of instructions (FMOVXDhighr, for example) have "[1]"
+ // as a string token in the instruction itself.
+ SMLoc LBracS = getLoc();
+ const AsmToken &Tok = Parser.getTok(); // Reference: always reflects the parser's current token.
+ if (parseOptionalToken(AsmToken::LBrac)) { // NOTE(review): '[' stays consumed even if "1]" does not follow; confirm intended.
+ if (Tok.is(AsmToken::Integer)) {
+ SMLoc IntS = getLoc();
+ int64_t Val = Tok.getIntVal();
+ if (Val == 1) {
+ Parser.Lex(); // Eat the '1'.
+ SMLoc RBracS = getLoc();
+ if (parseOptionalToken(AsmToken::RBrac)) {
+ Operands.push_back(
+ AArch64Operand::CreateToken("[", false, LBracS, getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateToken("1", false, IntS, getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateToken("]", false, RBracS, getContext()));
+ return false;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { // Parse an expression with an optional ":specifier:" prefix into ImmVal; returns true on error.
+ MCAsmParser &Parser = getParser();
+ bool HasELFModifier = false;
+ AArch64MCExpr::VariantKind RefKind; // Only assigned and read when HasELFModifier is true.
+
+ if (parseOptionalToken(AsmToken::Colon)) {
+ HasELFModifier = true;
+
+ if (Parser.getTok().isNot(AsmToken::Identifier))
+ return TokError("expect relocation specifier in operand after ':'");
+
+ std::string LowerCase = Parser.getTok().getIdentifier().lower(); // Specifier names are matched case-insensitively.
+ RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
+ .Case("lo12", AArch64MCExpr::VK_LO12)
+ .Case("abs_g3", AArch64MCExpr::VK_ABS_G3)
+ .Case("abs_g2", AArch64MCExpr::VK_ABS_G2)
+ .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S)
+ .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC)
+ .Case("abs_g1", AArch64MCExpr::VK_ABS_G1)
+ .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S)
+ .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC)
+ .Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
+ .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
+ .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
+ .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
+ .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
+ .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
+ .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0)
+ .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC)
+ .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12)
+ .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12)
+ .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC)
+ .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2)
+ .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1)
+ .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC)
+ .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0)
+ .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC)
+ .Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12)
+ .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12)
+ .Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC)
+ .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12)
+ .Case("got", AArch64MCExpr::VK_GOT_PAGE)
+ .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
+ .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
+ .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
+ .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
+ .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
+ .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
+ .Default(AArch64MCExpr::VK_INVALID);
+
+ if (RefKind == AArch64MCExpr::VK_INVALID)
+ return TokError("expect relocation specifier in operand after ':'");
+
+ Parser.Lex(); // Eat identifier
+
+ if (parseToken(AsmToken::Colon, "expect ':' after relocation specifier"))
+ return true;
+ }
+
+ if (getParser().parseExpression(ImmVal))
+ return true;
+
+ if (HasELFModifier) // Wrap the parsed expression with the requested specifier.
+ ImmVal = AArch64MCExpr::create(ImmVal, RefKind, getContext());
+
+ return false;
+}
+
+/// parseVectorList - Parse a vector list operand for AdvSIMD instructions: "{ Vn - Vm }" or "{ Vn, Vn+1, ... }", optionally followed by "[index]".
+bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
+ SMLoc S = getLoc();
+ Parser.Lex(); // Eat left curly brace token.
+ StringRef Kind;
+ int64_t FirstReg = tryMatchVectorRegister(Kind, true);
+ if (FirstReg == -1)
+ return true;
+ int64_t PrevReg = FirstReg;
+ unsigned Count = 1;
+
+ if (parseOptionalToken(AsmToken::Minus)) { // Range form: "first - last".
+ SMLoc Loc = getLoc();
+ StringRef NextKind;
+ int64_t Reg = tryMatchVectorRegister(NextKind, true);
+ if (Reg == -1)
+ return true;
+ // Any Kind suffixes must match on all regs in the list.
+ if (Kind != NextKind)
+ return Error(Loc, "mismatched register size suffix");
+
+ unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg); // Distance from first to last, wrapping modulo 32.
+
+ if (Space == 0 || Space > 3) {
+ return Error(Loc, "invalid number of vectors");
+ }
+
+ Count += Space;
+ }
+ else {
+ while (parseOptionalToken(AsmToken::Comma)) { // Comma-separated form.
+ SMLoc Loc = getLoc();
+ StringRef NextKind;
+ int64_t Reg = tryMatchVectorRegister(NextKind, true);
+ if (Reg == -1)
+ return true;
+ // Any Kind suffixes must match on all regs in the list.
+ if (Kind != NextKind)
+ return Error(Loc, "mismatched register size suffix");
+
+ // Registers must be incremental (with wraparound at 31)
+ if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
+ (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
+ return Error(Loc, "registers must be sequential");
+
+ PrevReg = Reg;
+ ++Count;
+ }
+ }
+
+ if (parseToken(AsmToken::RCurly, "'}' expected"))
+ return true;
+
+ if (Count > 4)
+ return Error(S, "invalid number of vectors");
+
+ unsigned NumElements = 0;
+ char ElementKind = 0;
+ if (!Kind.empty())
+ parseValidVectorKind(Kind, NumElements, ElementKind); // Both stay 0 when no kind suffix was given.
+
+ Operands.push_back(AArch64Operand::CreateVectorList(
+ FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
+
+ // If there is an index specifier following the list, parse that too.
+ SMLoc SIdx = getLoc();
+ if (parseOptionalToken(AsmToken::LBrac)) { // Eat left bracket token.
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return false; // NOTE(review): returns false although the index failed to parse; confirm this is intended.
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for vector index");
+ return false; // NOTE(review): diagnostic emitted but false is returned; confirm.
+ }
+
+ SMLoc E = getLoc();
+ if (parseToken(AsmToken::RBrac, "']' expected"))
+ return false; // NOTE(review): same pattern, the index operand is simply not added.
+
+ Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+ E, getContext()));
+ }
+ return false;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { // Parse a GPR64sp register optionally followed by ", #0".
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ if (!Tok.is(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+
+ unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false);
+
+ MCContext &Ctx = getContext();
+ const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+ if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum)) // Only registers in the GPR64sp class are accepted.
+ return MatchOperand_NoMatch;
+
+ SMLoc S = getLoc();
+ Parser.Lex(); // Eat register
+
+ if (!parseOptionalToken(AsmToken::Comma)) { // No index given: plain register operand.
+ Operands.push_back(
+ AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+ return MatchOperand_Success;
+ }
+
+ parseOptionalToken(AsmToken::Hash); // The '#' before the index is optional.
+
+ if (Parser.getTok().isNot(AsmToken::Integer)) {
+ Error(getLoc(), "index must be absent or #0");
+ return MatchOperand_ParseFail;
+ }
+
+ const MCExpr *ImmVal;
+ if (Parser.parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
+ cast<MCConstantExpr>(ImmVal)->getValue() != 0) { // The only accepted index is the constant 0.
+ Error(getLoc(), "index must be absent or #0");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(
+ AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); // The index itself is not recorded in the operand.
+ return MatchOperand_Success;
+}
+
+ /// parseOperand - Parse an AArch64 instruction operand. For now this parses
+ /// the operand regardless of the mnemonic.
+ ///
+ /// \param Operands       list that the parsed operand(s) are appended to.
+ /// \param isCondCode     when true, an identifier token is parsed as a
+ ///                       condition code via parseCondCode.
+ /// \param invertCondCode forwarded unchanged to parseCondCode.
+ /// \returns true if an error occurred, false if an operand was parsed.
+ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
+                                     bool invertCondCode) {
+   MCAsmParser &Parser = getParser();
+   // Check if the current operand has a custom associated parser, if so, try to
+   // custom parse the operand, or fallback to the general approach.
+   OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+   if (ResTy == MatchOperand_Success)
+     return false;
+   // If there wasn't a custom match, try the generic matcher below. Otherwise,
+   // there was a match, but an error occurred, in which case, just return that
+   // the operand parsing failed.
+   if (ResTy == MatchOperand_ParseFail)
+     return true;
+
+   // Nothing custom, so do general case parsing.
+   SMLoc S, E;
+   switch (getLexer().getKind()) {
+   default: {
+     // Any other token: try to parse a (possibly relocation-qualified)
+     // immediate expression.
+     SMLoc S = getLoc();
+     const MCExpr *Expr;
+     if (parseSymbolicImmVal(Expr))
+       return Error(S, "invalid operand");
+
+     SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+     Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+     return false;
+   }
+   case AsmToken::LBrac: {
+     SMLoc Loc = Parser.getTok().getLoc();
+     Operands.push_back(AArch64Operand::CreateToken("[", false, Loc,
+                                                    getContext()));
+     Parser.Lex(); // Eat '['
+
+     // There's no comma after a '[', so we can parse the next operand
+     // immediately.
+     return parseOperand(Operands, false, false);
+   }
+   case AsmToken::LCurly:
+     return parseVectorList(Operands);
+   case AsmToken::Identifier: {
+     // If we're expecting a Condition Code operand, then just parse that.
+     if (isCondCode)
+       return parseCondCode(Operands, invertCondCode);
+
+     // If it's a register name, parse it.
+     if (!parseRegister(Operands))
+       return false;
+
+     // This could be an optional "shift" or "extend" operand.
+     OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
+     // We can only continue if no tokens were eaten. Convert the tri-state
+     // result to this function's bool convention explicitly rather than
+     // relying on the numeric value of the enumerators.
+     if (GotShift != MatchOperand_NoMatch)
+       return GotShift != MatchOperand_Success;
+
+     // This was not a register so parse other operands that start with an
+     // identifier (like labels) as expressions and create them as immediates.
+     const MCExpr *IdVal;
+     S = getLoc();
+     if (getParser().parseExpression(IdVal))
+       return true;
+     E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+     Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext()));
+     return false;
+   }
+   case AsmToken::Integer:
+   case AsmToken::Real:
+   case AsmToken::Hash: {
+     // #42 -> immediate.
+     S = getLoc();
+
+     parseOptionalToken(AsmToken::Hash);
+
+     // Parse a negative sign
+     bool isNegative = false;
+     if (Parser.getTok().is(AsmToken::Minus)) {
+       isNegative = true;
+       // We need to consume this token only when we have a Real, otherwise
+       // we let parseSymbolicImmVal take care of it
+       if (Parser.getLexer().peekTok().is(AsmToken::Real))
+         Parser.Lex();
+     }
+
+     // The only Real that should come through here is a literal #0.0 for
+     // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
+     // so convert the value.
+     const AsmToken &Tok = Parser.getTok();
+     if (Tok.is(AsmToken::Real)) {
+       APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
+       uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+       if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
+           Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
+           Mnemonic != "fcmlt")
+         return TokError("unexpected floating point literal");
+       else if (IntVal != 0 || isNegative)
+         return TokError("expected floating-point constant #0.0");
+       Parser.Lex(); // Eat the token.
+
+       // The literal is represented as the two raw tokens "#0" and ".0".
+       Operands.push_back(
+           AArch64Operand::CreateToken("#0", false, S, getContext()));
+       Operands.push_back(
+           AArch64Operand::CreateToken(".0", false, S, getContext()));
+       return false;
+     }
+
+     const MCExpr *ImmVal;
+     if (parseSymbolicImmVal(ImmVal))
+       return true;
+
+     E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+     Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
+     return false;
+   }
+   case AsmToken::Equal: {
+     SMLoc Loc = getLoc();
+     if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
+       return TokError("unexpected token in operand");
+     // The operand starts at the '='. Previously S was left
+     // default-constructed in this case, so the operands created below
+     // carried an invalid start location.
+     S = Loc;
+     Parser.Lex(); // Eat '='
+     const MCExpr *SubExprVal;
+     if (getParser().parseExpression(SubExprVal))
+       return true;
+     // End of the operand is the last character of the expression, computed
+     // the same way as in the other cases of this switch (it used to be
+     // derived from the '=' location, i.e. before the operand start).
+     E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+
+     if (Operands.size() < 2 ||
+         !static_cast<AArch64Operand &>(*Operands[1]).isReg())
+       return Error(Loc, "Only valid when first operand is register");
+
+     bool IsXReg =
+         AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+             Operands[1]->getReg());
+
+     MCContext& Ctx = getContext();
+     // If the op is an imm and can be fit into a mov, then replace ldr with mov.
+     if (isa<MCConstantExpr>(SubExprVal)) {
+       uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue();
+       uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 48 : 16;
+       // Strip whole 16-bit groups of trailing zeros, remembering how far the
+       // remaining value has to be shifted left again.
+       while(Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) {
+         ShiftAmt += 16;
+         Imm >>= 16;
+       }
+       if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) {
+         Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx);
+         Operands.push_back(AArch64Operand::CreateImm(
+             MCConstantExpr::create(Imm, Ctx), S, E, Ctx));
+         if (ShiftAmt)
+           Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL,
+               ShiftAmt, true, S, E, Ctx));
+         return false;
+       }
+       APInt Simm = APInt(64, Imm << ShiftAmt);
+       // check if the immediate is an unsigned or signed 32-bit int for W regs
+       if (!IsXReg && !(Simm.isIntN(32) || Simm.isSignedIntN(32)))
+         return Error(Loc, "Immediate too large for register");
+     }
+     // If it is a label or an imm that cannot fit in a movz, put it into CP.
+     const MCExpr *CPLoc =
+         getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc);
+     Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
+     return false;
+   }
+   }
+ }
+
+ /// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
+ /// operands.
+ ///
+ /// The mnemonic is split at '.' characters into tokens that are appended to
+ /// \p Operands, followed by the comma-separated operands of the statement.
+ /// \returns true on error (and always for a ".req" statement), false
+ /// otherwise.
+ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                         StringRef Name, SMLoc NameLoc,
+                                         OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+
+   // Rewrite the dot-less conditional branch spellings to their "b.<cc>" form.
+   Name = StringSwitch<StringRef>(Name.lower())
+              .Case("beq", "b.eq")
+              .Case("bne", "b.ne")
+              .Case("bhs", "b.hs")
+              .Case("bcs", "b.cs")
+              .Case("blo", "b.lo")
+              .Case("bcc", "b.cc")
+              .Case("bmi", "b.mi")
+              .Case("bpl", "b.pl")
+              .Case("bvs", "b.vs")
+              .Case("bvc", "b.vc")
+              .Case("bhi", "b.hi")
+              .Case("bls", "b.ls")
+              .Case("bge", "b.ge")
+              .Case("blt", "b.lt")
+              .Case("bgt", "b.gt")
+              .Case("ble", "b.le")
+              .Case("bal", "b.al")
+              .Case("bnv", "b.nv")
+              .Default(Name);
+
+   // "<name> .req <reg>" is the AArch64-specific .req directive, not an
+   // instruction.
+   const AsmToken &FirstTok = Parser.getTok();
+   if (FirstTok.is(AsmToken::Identifier) &&
+       FirstTok.getIdentifier() == ".req") {
+     parseDirectiveReq(Name, NameLoc);
+     // 'Error' is always returned here: the statement is finished and there
+     // is no 'instruction' left to match.
+     return true;
+   }
+
+   // The text before the first '.' is the mnemonic proper.
+   size_t DotPos = Name.find('.');
+   StringRef Head = Name.slice(0, DotPos);
+
+   // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction.
+   if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi")
+     return parseSysAlias(Head, NameLoc, Operands);
+
+   Operands.push_back(
+       AArch64Operand::CreateToken(Head, false, NameLoc, getContext()));
+   Mnemonic = Head;
+
+   // A branch mnemonic carries its condition code as the first suffix.
+   if (Head == "b" && DotPos != StringRef::npos) {
+     const size_t CondDot = DotPos;
+     DotPos = Name.find('.', CondDot + 1);
+     Head = Name.slice(CondDot + 1, DotPos);
+
+     const SMLoc CondLoc = SMLoc::getFromPointer(
+         NameLoc.getPointer() + (Head.data() - Name.data()));
+     const AArch64CC::CondCode CC = parseCondCodeString(Head);
+     if (CC == AArch64CC::Invalid)
+       return Error(CondLoc, "invalid condition code");
+     Operands.push_back(
+         AArch64Operand::CreateToken(".", true, CondLoc, getContext()));
+     Operands.push_back(
+         AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext()));
+   }
+
+   // Every remaining ".xyz" piece of the mnemonic becomes a suffix token
+   // (the token text keeps its leading '.').
+   while (DotPos != StringRef::npos) {
+     const size_t PieceDot = DotPos;
+     DotPos = Name.find('.', PieceDot + 1);
+     Head = Name.slice(PieceDot, DotPos);
+     const SMLoc PieceLoc = SMLoc::getFromPointer(
+         NameLoc.getPointer() + (Head.data() - Name.data()) + 1);
+     Operands.push_back(
+         AArch64Operand::CreateToken(Head, true, PieceLoc, getContext()));
+   }
+
+   // Work out which operand position (if any) holds a condition code, and
+   // whether that condition code has to be inverted.
+   //
+   // Conditional compare / select instructions take the condition code as
+   // their fourth operand. cset/csetm (second operand) and cinc/cinv/cneg
+   // (third operand) are aliases of conditional select instructions in which
+   // the condition code is inverted.
+   //
+   // FIXME: Is this the correct way to handle these? Or should the parser
+   // generate the aliased instructions directly?
+   unsigned CondCodeOperandNo = 0; // 0: no condition code operand.
+   if (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" ||
+       Head == "fccmpe" || Head == "fcsel" || Head == "csel" ||
+       Head == "csinc" || Head == "csinv" || Head == "csneg")
+     CondCodeOperandNo = 4;
+   else if (Head == "cinc" || Head == "cinv" || Head == "cneg")
+     CondCodeOperandNo = 3;
+   else if (Head == "cset" || Head == "csetm")
+     CondCodeOperandNo = 2;
+   const bool InvertCondCode =
+       CondCodeOperandNo == 2 || CondCodeOperandNo == 3;
+
+   // Read the remaining operands.
+   if (getLexer().isNot(AsmToken::EndOfStatement)) {
+     // The first operand is never a condition code.
+     if (parseOperand(Operands, false, false))
+       return true;
+
+     // N is the 1-based position of the operand about to be parsed.
+     for (unsigned N = 2; parseOptionalToken(AsmToken::Comma); ++N) {
+       if (parseOperand(Operands, N == CondCodeOperandNo, InvertCondCode))
+         return true;
+
+       // Two notional operands are not separated by commas, both due to
+       // memory specifiers:
+       //  + An RBrac will end an address for load/store/prefetch
+       //  + An '!' will indicate a pre-indexed operation.
+       //
+       // It's someone else's responsibility to make sure these tokens are
+       // sane in the given context!
+       const SMLoc BracketLoc = Parser.getTok().getLoc();
+       if (parseOptionalToken(AsmToken::RBrac))
+         Operands.push_back(
+             AArch64Operand::CreateToken("]", false, BracketLoc, getContext()));
+       const SMLoc BangLoc = Parser.getTok().getLoc();
+       if (parseOptionalToken(AsmToken::Exclaim))
+         Operands.push_back(
+             AArch64Operand::CreateToken("!", false, BangLoc, getContext()));
+     }
+   }
+
+   // Anything left on the line at this point is an error.
+   return parseToken(AsmToken::EndOfStatement,
+                     "unexpected token in argument list");
+ }
+
+ // FIXME: This entire function is a giant hack to provide us with decent
+ // operand range validation/diagnostics until TableGen/MC can be extended
+ // to support autogeneration of this kind of validation.
+ //
+ // Performs semantic checks on an already matched instruction.
+ //
+ // Inst: the matched instruction.
+ // Loc:  source locations used for diagnostics; the checks below index it by
+ //       parsed-operand position (Loc[0] is the first operand after the
+ //       mnemonic).
+ // Returns the result of Error(...) when the instruction is rejected, and
+ // false when it is acceptable.
+ bool AArch64AsmParser::validateInstruction(MCInst &Inst,
+                                            SmallVectorImpl<SMLoc> &Loc) {
+   const MCRegisterInfo *RI = getContext().getRegisterInfo();
+   // Check for indexed addressing modes w/ the base register being the
+   // same as a destination/source register or pair load where
+   // the Rt == Rt2. All of those are undefined behaviour.
+   switch (Inst.getOpcode()) {
+   // LDPSWpost is handled here together with the other GPR writeback forms so
+   // that it gets the same base-register overlap check as LDPSWpre.
+   case AArch64::LDPSWpost:
+   case AArch64::LDPSWpre:
+   case AArch64::LDPWpost:
+   case AArch64::LDPWpre:
+   case AArch64::LDPXpost:
+   case AArch64::LDPXpre: {
+     // Writeback forms: the transfer registers and base are read from
+     // operands 1, 2 and 3.
+     unsigned Rt = Inst.getOperand(1).getReg();
+     unsigned Rt2 = Inst.getOperand(2).getReg();
+     unsigned Rn = Inst.getOperand(3).getReg();
+     if (RI->isSubRegisterEq(Rn, Rt))
+       return Error(Loc[0], "unpredictable LDP instruction, writeback base "
+                            "is also a destination");
+     if (RI->isSubRegisterEq(Rn, Rt2))
+       return Error(Loc[1], "unpredictable LDP instruction, writeback base "
+                            "is also a destination");
+     // Do the Rt2==Rt check here with the writeback operand indices. Falling
+     // through to the non-writeback case below would compare operands 0 and 1,
+     // which are not Rt and Rt2 for these opcodes.
+     if (Rt == Rt2)
+       return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+     break;
+   }
+   case AArch64::LDPDi:
+   case AArch64::LDPQi:
+   case AArch64::LDPSi:
+   case AArch64::LDPSWi:
+   case AArch64::LDPWi:
+   case AArch64::LDPXi: {
+     // Non-writeback forms: the transfer registers are operands 0 and 1.
+     unsigned Rt = Inst.getOperand(0).getReg();
+     unsigned Rt2 = Inst.getOperand(1).getReg();
+     if (Rt == Rt2)
+       return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+     break;
+   }
+   case AArch64::LDPDpost:
+   case AArch64::LDPDpre:
+   case AArch64::LDPQpost:
+   case AArch64::LDPQpre:
+   case AArch64::LDPSpost:
+   case AArch64::LDPSpre: {
+     // FP/SIMD writeback forms: only the Rt2==Rt check is performed.
+     unsigned Rt = Inst.getOperand(1).getReg();
+     unsigned Rt2 = Inst.getOperand(2).getReg();
+     if (Rt == Rt2)
+       return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+     break;
+   }
+   case AArch64::STPDpost:
+   case AArch64::STPDpre:
+   case AArch64::STPQpost:
+   case AArch64::STPQpre:
+   case AArch64::STPSpost:
+   case AArch64::STPSpre:
+   case AArch64::STPWpost:
+   case AArch64::STPWpre:
+   case AArch64::STPXpost:
+   case AArch64::STPXpre: {
+     unsigned Rt = Inst.getOperand(1).getReg();
+     unsigned Rt2 = Inst.getOperand(2).getReg();
+     unsigned Rn = Inst.getOperand(3).getReg();
+     if (RI->isSubRegisterEq(Rn, Rt))
+       return Error(Loc[0], "unpredictable STP instruction, writeback base "
+                            "is also a source");
+     if (RI->isSubRegisterEq(Rn, Rt2))
+       return Error(Loc[1], "unpredictable STP instruction, writeback base "
+                            "is also a source");
+     break;
+   }
+   case AArch64::LDRBBpre:
+   case AArch64::LDRBpre:
+   case AArch64::LDRHHpre:
+   case AArch64::LDRHpre:
+   case AArch64::LDRSBWpre:
+   case AArch64::LDRSBXpre:
+   case AArch64::LDRSHWpre:
+   case AArch64::LDRSHXpre:
+   case AArch64::LDRSWpre:
+   case AArch64::LDRWpre:
+   case AArch64::LDRXpre:
+   case AArch64::LDRBBpost:
+   case AArch64::LDRBpost:
+   case AArch64::LDRHHpost:
+   case AArch64::LDRHpost:
+   case AArch64::LDRSBWpost:
+   case AArch64::LDRSBXpost:
+   case AArch64::LDRSHWpost:
+   case AArch64::LDRSHXpost:
+   case AArch64::LDRSWpost:
+   case AArch64::LDRWpost:
+   case AArch64::LDRXpost: {
+     unsigned Rt = Inst.getOperand(1).getReg();
+     unsigned Rn = Inst.getOperand(2).getReg();
+     if (RI->isSubRegisterEq(Rn, Rt))
+       return Error(Loc[0], "unpredictable LDR instruction, writeback base "
+                            "is also a source");
+     break;
+   }
+   case AArch64::STRBBpost:
+   case AArch64::STRBpost:
+   case AArch64::STRHHpost:
+   case AArch64::STRHpost:
+   case AArch64::STRWpost:
+   case AArch64::STRXpost:
+   case AArch64::STRBBpre:
+   case AArch64::STRBpre:
+   case AArch64::STRHHpre:
+   case AArch64::STRHpre:
+   case AArch64::STRWpre:
+   case AArch64::STRXpre: {
+     unsigned Rt = Inst.getOperand(1).getReg();
+     unsigned Rn = Inst.getOperand(2).getReg();
+     if (RI->isSubRegisterEq(Rn, Rt))
+       return Error(Loc[0], "unpredictable STR instruction, writeback base "
+                            "is also a source");
+     break;
+   }
+   }
+
+   // Now check immediate ranges. Separate from the above as there is overlap
+   // in the instructions being checked and this keeps the nested conditionals
+   // to a minimum.
+   switch (Inst.getOpcode()) {
+   case AArch64::ADDSWri:
+   case AArch64::ADDSXri:
+   case AArch64::ADDWri:
+   case AArch64::ADDXri:
+   case AArch64::SUBSWri:
+   case AArch64::SUBSXri:
+   case AArch64::SUBWri:
+   case AArch64::SUBXri: {
+     // Annoyingly we can't do this in the isAddSubImm predicate, so there is
+     // some slight duplication here.
+     if (Inst.getOperand(2).isExpr()) {
+       const MCExpr *Expr = Inst.getOperand(2).getExpr();
+       AArch64MCExpr::VariantKind ELFRefKind;
+       MCSymbolRefExpr::VariantKind DarwinRefKind;
+       int64_t Addend;
+       if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+
+         // Only allow these with ADDXri.
+         if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+              DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) &&
+             Inst.getOpcode() == AArch64::ADDXri)
+           return false;
+
+         // Only allow these with ADDXri/ADDWri
+         if ((ELFRefKind == AArch64MCExpr::VK_LO12 ||
+              ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
+              ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+              ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+              ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
+              ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+              ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+              ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) &&
+             (Inst.getOpcode() == AArch64::ADDXri ||
+              Inst.getOpcode() == AArch64::ADDWri))
+           return false;
+
+         // Don't allow symbol refs in the immediate field otherwise
+         // Note: Loc.back() may be Loc[1] or Loc[2] depending on the number of
+         // operands of the original instruction (i.e. 'add w0, w1, borked' vs
+         // 'cmp w0, 'borked')
+         return Error(Loc.back(), "invalid immediate expression");
+       }
+       // We don't validate more complex expressions here
+     }
+     return false;
+   }
+   default:
+     return false;
+   }
+ }
+
+ /// Report the diagnostic text associated with a matcher failure code.
+ ///
+ /// \param Loc     location the diagnostic is attached to.
+ /// \param ErrCode one of the Match_* codes handled below; any other value is
+ ///                treated as unreachable.
+ /// \returns the result of Error(Loc, <message>).
+ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
+   // Select the message first, then emit it from a single place.
+   const char *Msg = nullptr;
+   switch (ErrCode) {
+   case Match_MissingFeature:
+     Msg = "instruction requires a CPU feature not currently enabled";
+     break;
+   case Match_InvalidOperand:
+     Msg = "invalid operand for instruction";
+     break;
+   case Match_InvalidSuffix:
+     Msg = "invalid type suffix for instruction";
+     break;
+   case Match_InvalidCondCode:
+     Msg = "expected AArch64 condition code";
+     break;
+
+   // Register extend / shift operands.
+   case Match_AddSubRegExtendSmall:
+     Msg = "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]";
+     break;
+   case Match_AddSubRegExtendLarge:
+     Msg = "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]";
+     break;
+   case Match_AddSubSecondSource:
+     Msg = "expected compatible register, symbol or integer in range [0, 4095]";
+     break;
+   case Match_LogicalSecondSource:
+     Msg = "expected compatible register or logical immediate";
+     break;
+   case Match_InvalidMovImm32Shift:
+     Msg = "expected 'lsl' with optional integer 0 or 16";
+     break;
+   case Match_InvalidMovImm64Shift:
+     Msg = "expected 'lsl' with optional integer 0, 16, 32 or 48";
+     break;
+   case Match_AddSubRegShift32:
+     Msg = "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]";
+     break;
+   case Match_AddSubRegShift64:
+     Msg = "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]";
+     break;
+   case Match_InvalidFPImm:
+     Msg = "expected compatible register or floating-point constant";
+     break;
+
+   // Signed memory indices.
+   case Match_InvalidMemoryIndexedSImm9:
+     Msg = "index must be an integer in range [-256, 255].";
+     break;
+   case Match_InvalidMemoryIndexed4SImm7:
+     Msg = "index must be a multiple of 4 in range [-256, 252].";
+     break;
+   case Match_InvalidMemoryIndexed8SImm7:
+     Msg = "index must be a multiple of 8 in range [-512, 504].";
+     break;
+   case Match_InvalidMemoryIndexed16SImm7:
+     Msg = "index must be a multiple of 16 in range [-1024, 1008].";
+     break;
+
+   // Memory operands with a W-register extend.
+   case Match_InvalidMemoryWExtend8:
+     Msg = "expected 'uxtw' or 'sxtw' with optional shift of #0";
+     break;
+   case Match_InvalidMemoryWExtend16:
+     Msg = "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1";
+     break;
+   case Match_InvalidMemoryWExtend32:
+     Msg = "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2";
+     break;
+   case Match_InvalidMemoryWExtend64:
+     Msg = "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3";
+     break;
+   case Match_InvalidMemoryWExtend128:
+     Msg = "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4";
+     break;
+
+   // Memory operands with an X-register extend.
+   case Match_InvalidMemoryXExtend8:
+     Msg = "expected 'lsl' or 'sxtx' with optional shift of #0";
+     break;
+   case Match_InvalidMemoryXExtend16:
+     Msg = "expected 'lsl' or 'sxtx' with optional shift of #0 or #1";
+     break;
+   case Match_InvalidMemoryXExtend32:
+     Msg = "expected 'lsl' or 'sxtx' with optional shift of #0 or #2";
+     break;
+   case Match_InvalidMemoryXExtend64:
+     Msg = "expected 'lsl' or 'sxtx' with optional shift of #0 or #3";
+     break;
+   case Match_InvalidMemoryXExtend128:
+     Msg = "expected 'lsl' or 'sxtx' with optional shift of #0 or #4";
+     break;
+
+   // Unsigned scaled memory indices.
+   case Match_InvalidMemoryIndexed1:
+     Msg = "index must be an integer in range [0, 4095].";
+     break;
+   case Match_InvalidMemoryIndexed2:
+     Msg = "index must be a multiple of 2 in range [0, 8190].";
+     break;
+   case Match_InvalidMemoryIndexed4:
+     Msg = "index must be a multiple of 4 in range [0, 16380].";
+     break;
+   case Match_InvalidMemoryIndexed8:
+     Msg = "index must be a multiple of 8 in range [0, 32760].";
+     break;
+   case Match_InvalidMemoryIndexed16:
+     Msg = "index must be a multiple of 16 in range [0, 65520].";
+     break;
+
+   // Plain immediate ranges.
+   case Match_InvalidImm0_1:
+     Msg = "immediate must be an integer in range [0, 1].";
+     break;
+   case Match_InvalidImm0_7:
+     Msg = "immediate must be an integer in range [0, 7].";
+     break;
+   case Match_InvalidImm0_15:
+     Msg = "immediate must be an integer in range [0, 15].";
+     break;
+   case Match_InvalidImm0_31:
+     Msg = "immediate must be an integer in range [0, 31].";
+     break;
+   case Match_InvalidImm0_63:
+     Msg = "immediate must be an integer in range [0, 63].";
+     break;
+   case Match_InvalidImm0_127:
+     Msg = "immediate must be an integer in range [0, 127].";
+     break;
+   case Match_InvalidImm0_65535:
+     Msg = "immediate must be an integer in range [0, 65535].";
+     break;
+   case Match_InvalidImm1_8:
+     Msg = "immediate must be an integer in range [1, 8].";
+     break;
+   case Match_InvalidImm1_16:
+     Msg = "immediate must be an integer in range [1, 16].";
+     break;
+   case Match_InvalidImm1_32:
+     Msg = "immediate must be an integer in range [1, 32].";
+     break;
+   case Match_InvalidImm1_64:
+     Msg = "immediate must be an integer in range [1, 64].";
+     break;
+
+   // Vector lane specifiers.
+   case Match_InvalidIndex1:
+     Msg = "expected lane specifier '[1]'";
+     break;
+   case Match_InvalidIndexB:
+     Msg = "vector lane must be an integer in range [0, 15].";
+     break;
+   case Match_InvalidIndexH:
+     Msg = "vector lane must be an integer in range [0, 7].";
+     break;
+   case Match_InvalidIndexS:
+     Msg = "vector lane must be an integer in range [0, 3].";
+     break;
+   case Match_InvalidIndexD:
+     Msg = "vector lane must be an integer in range [0, 1].";
+     break;
+
+   case Match_InvalidLabel:
+     Msg = "expected label or encodable integer pc offset";
+     break;
+   case Match_MRS:
+     Msg = "expected readable system register";
+     break;
+   case Match_MSR:
+     Msg = "expected writable system register or pstate";
+     break;
+   case Match_MnemonicFail:
+     Msg = "unrecognized instruction mnemonic";
+     break;
+   default:
+     llvm_unreachable("unexpected error code!");
+   }
+   return Error(Loc, Msg);
+ }
+
+ // Forward declaration; the definition is not in this part of the file.
+ // MatchAndEmitInstruction below calls it with one feature bit at a time and
+ // appends the returned string to its "instruction requires:" diagnostic.
+ // NOTE(review): presumably defined by the generated matcher code - confirm.
+ static const char *getSubtargetFeatureName(uint64_t Val);
+
+bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ assert(!Operands.empty() && "Unexpect empty operand list!");
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
+ assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+
+ StringRef Tok = Op.getToken();
+ unsigned NumOperands = Operands.size();
+
+ if (NumOperands == 4 && Tok == "lsl") {
+ AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]);
+ AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+ if (Op2.isReg() && Op3.isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+ if (Op3CE) {
+ uint64_t Op3Val = Op3CE->getValue();
+ uint64_t NewOp3Val = 0;
+ uint64_t NewOp4Val = 0;
+ if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+ Op2.getReg())) {
+ NewOp3Val = (32 - Op3Val) & 0x1f;
+ NewOp4Val = 31 - Op3Val;
+ } else {
+ NewOp3Val = (64 - Op3Val) & 0x3f;
+ NewOp4Val = 63 - Op3Val;
+ }
+
+ const MCExpr *NewOp3 = MCConstantExpr::create(NewOp3Val, getContext());
+ const MCExpr *NewOp4 = MCConstantExpr::create(NewOp4Val, getContext());
+
+ Operands[0] = AArch64Operand::CreateToken(
+ "ubfm", false, Op.getStartLoc(), getContext());
+ Operands.push_back(AArch64Operand::CreateImm(
+ NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext()));
+ Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(),
+ Op3.getEndLoc(), getContext());
+ }
+ }
+ } else if (NumOperands == 4 && Tok == "bfc") {
+ // FIXME: Horrible hack to handle BFC->BFM alias.
+ AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+ AArch64Operand LSBOp = static_cast<AArch64Operand &>(*Operands[2]);
+ AArch64Operand WidthOp = static_cast<AArch64Operand &>(*Operands[3]);
+
+ if (Op1.isReg() && LSBOp.isImm() && WidthOp.isImm()) {
+ const MCConstantExpr *LSBCE = dyn_cast<MCConstantExpr>(LSBOp.getImm());
+ const MCConstantExpr *WidthCE = dyn_cast<MCConstantExpr>(WidthOp.getImm());
+
+ if (LSBCE && WidthCE) {
+ uint64_t LSB = LSBCE->getValue();
+ uint64_t Width = WidthCE->getValue();
+
+ uint64_t RegWidth = 0;
+ if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Op1.getReg()))
+ RegWidth = 64;
+ else
+ RegWidth = 32;
+
+ if (LSB >= RegWidth)
+ return Error(LSBOp.getStartLoc(),
+ "expected integer in range [0, 31]");
+ if (Width < 1 || Width > RegWidth)
+ return Error(WidthOp.getStartLoc(),
+ "expected integer in range [1, 32]");
+
+ uint64_t ImmR = 0;
+ if (RegWidth == 32)
+ ImmR = (32 - LSB) & 0x1f;
+ else
+ ImmR = (64 - LSB) & 0x3f;
+
+ uint64_t ImmS = Width - 1;
+
+ if (ImmR != 0 && ImmS >= ImmR)
+ return Error(WidthOp.getStartLoc(),
+ "requested insert overflows register");
+
+ const MCExpr *ImmRExpr = MCConstantExpr::create(ImmR, getContext());
+ const MCExpr *ImmSExpr = MCConstantExpr::create(ImmS, getContext());
+ Operands[0] = AArch64Operand::CreateToken(
+ "bfm", false, Op.getStartLoc(), getContext());
+ Operands[2] = AArch64Operand::CreateReg(
+ RegWidth == 32 ? AArch64::WZR : AArch64::XZR, false, SMLoc(),
+ SMLoc(), getContext());
+ Operands[3] = AArch64Operand::CreateImm(
+ ImmRExpr, LSBOp.getStartLoc(), LSBOp.getEndLoc(), getContext());
+ Operands.emplace_back(
+ AArch64Operand::CreateImm(ImmSExpr, WidthOp.getStartLoc(),
+ WidthOp.getEndLoc(), getContext()));
+ }
+ }
+ } else if (NumOperands == 5) {
+ // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
+ // UBFIZ -> UBFM aliases.
+ if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
+ AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+ AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+ AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
+
+ if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+ const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
+
+ if (Op3CE && Op4CE) {
+ uint64_t Op3Val = Op3CE->getValue();
+ uint64_t Op4Val = Op4CE->getValue();
+
+ uint64_t RegWidth = 0;
+ if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Op1.getReg()))
+ RegWidth = 64;
+ else
+ RegWidth = 32;
+
+ if (Op3Val >= RegWidth)
+ return Error(Op3.getStartLoc(),
+ "expected integer in range [0, 31]");
+ if (Op4Val < 1 || Op4Val > RegWidth)
+ return Error(Op4.getStartLoc(),
+ "expected integer in range [1, 32]");
+
+ uint64_t NewOp3Val = 0;
+ if (RegWidth == 32)
+ NewOp3Val = (32 - Op3Val) & 0x1f;
+ else
+ NewOp3Val = (64 - Op3Val) & 0x3f;
+
+ uint64_t NewOp4Val = Op4Val - 1;
+
+ if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
+ return Error(Op4.getStartLoc(),
+ "requested insert overflows register");
+
+ const MCExpr *NewOp3 =
+ MCConstantExpr::create(NewOp3Val, getContext());
+ const MCExpr *NewOp4 =
+ MCConstantExpr::create(NewOp4Val, getContext());
+ Operands[3] = AArch64Operand::CreateImm(
+ NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext());
+ Operands[4] = AArch64Operand::CreateImm(
+ NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
+ if (Tok == "bfi")
+ Operands[0] = AArch64Operand::CreateToken(
+ "bfm", false, Op.getStartLoc(), getContext());
+ else if (Tok == "sbfiz")
+ Operands[0] = AArch64Operand::CreateToken(
+ "sbfm", false, Op.getStartLoc(), getContext());
+ else if (Tok == "ubfiz")
+ Operands[0] = AArch64Operand::CreateToken(
+ "ubfm", false, Op.getStartLoc(), getContext());
+ else
+ llvm_unreachable("No valid mnemonic for alias?");
+ }
+ }
+
+ // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
+ // UBFX -> UBFM aliases.
+ } else if (NumOperands == 5 &&
+ (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
+ AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+ AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+ AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
+
+ if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+ const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
+
+ if (Op3CE && Op4CE) {
+ uint64_t Op3Val = Op3CE->getValue();
+ uint64_t Op4Val = Op4CE->getValue();
+
+ uint64_t RegWidth = 0;
+ if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Op1.getReg()))
+ RegWidth = 64;
+ else
+ RegWidth = 32;
+
+ if (Op3Val >= RegWidth)
+ return Error(Op3.getStartLoc(),
+ "expected integer in range [0, 31]");
+ if (Op4Val < 1 || Op4Val > RegWidth)
+ return Error(Op4.getStartLoc(),
+ "expected integer in range [1, 32]");
+
+ uint64_t NewOp4Val = Op3Val + Op4Val - 1;
+
+ if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
+ return Error(Op4.getStartLoc(),
+ "requested extract overflows register");
+
+ const MCExpr *NewOp4 =
+ MCConstantExpr::create(NewOp4Val, getContext());
+ Operands[4] = AArch64Operand::CreateImm(
+ NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
+ if (Tok == "bfxil")
+ Operands[0] = AArch64Operand::CreateToken(
+ "bfm", false, Op.getStartLoc(), getContext());
+ else if (Tok == "sbfx")
+ Operands[0] = AArch64Operand::CreateToken(
+ "sbfm", false, Op.getStartLoc(), getContext());
+ else if (Tok == "ubfx")
+ Operands[0] = AArch64Operand::CreateToken(
+ "ubfm", false, Op.getStartLoc(), getContext());
+ else
+ llvm_unreachable("No valid mnemonic for alias?");
+ }
+ }
+ }
+ }
+ // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
+ // InstAlias can't quite handle this since the reg classes aren't
+ // subclasses.
+ if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
+ // The source register can be Wn here, but the matcher expects a
+ // GPR64. Twiddle it here if necessary.
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
+ if (Op.isReg()) {
+ unsigned Reg = getXRegFromWReg(Op.getReg());
+ Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+ Op.getEndLoc(), getContext());
+ }
+ }
+ // FIXME: Likewise for sxt[bh] with a Xd dst operand
+ else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+ if (Op.isReg() &&
+ AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Op.getReg())) {
+ // The source register can be Wn here, but the matcher expects a
+ // GPR64. Twiddle it here if necessary.
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
+ if (Op.isReg()) {
+ unsigned Reg = getXRegFromWReg(Op.getReg());
+ Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+ Op.getEndLoc(), getContext());
+ }
+ }
+ }
+ // FIXME: Likewise for uxt[bh] with a Xd dst operand
+ else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+ if (Op.isReg() &&
+ AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Op.getReg())) {
+ // The source register can be Wn here, but the matcher expects a
+ // GPR32. Twiddle it here if necessary.
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+ if (Op.isReg()) {
+ unsigned Reg = getWRegFromXReg(Op.getReg());
+ Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+ Op.getEndLoc(), getContext());
+ }
+ }
+ }
+
+ // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
+ if (NumOperands == 3 && Tok == "fmov") {
+ AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]);
+ AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]);
+ if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) {
+ unsigned zreg =
+ !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains(
+ RegOp.getReg())
+ ? AArch64::WZR
+ : AArch64::XZR;
+ Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(),
+ Op.getEndLoc(), getContext());
+ }
+ }
+
+ MCInst Inst;
+ // First try to match against the secondary set of tables containing the
+ // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
+ unsigned MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1);
+
+ // If that fails, try against the alternate table containing long-form NEON:
+ // "fadd v0.2s, v1.2s, v2.2s"
+ if (MatchResult != Match_Success) {
+ // But first, save the short-form match result: we can use it in case the
+ // long-form match also fails.
+ auto ShortFormNEONErrorInfo = ErrorInfo;
+ auto ShortFormNEONMatchResult = MatchResult;
+
+ MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
+
+ // Now, both matches failed, and the long-form match failed on the mnemonic
+ // suffix token operand. The short-form match failure is probably more
+ // relevant: use it instead.
+ if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 &&
+ Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() &&
+ ((AArch64Operand &)*Operands[1]).isTokenSuffix()) {
+ MatchResult = ShortFormNEONMatchResult;
+ ErrorInfo = ShortFormNEONErrorInfo;
+ }
+ }
+
+
+ switch (MatchResult) {
+ case Match_Success: {
+ // Perform range checking and other semantic validations
+ SmallVector<SMLoc, 8> OperandLocs;
+ NumOperands = Operands.size();
+ for (unsigned i = 1; i < NumOperands; ++i)
+ OperandLocs.push_back(Operands[i]->getStartLoc());
+ if (validateInstruction(Inst, OperandLocs))
+ return true;
+
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst, getSTI());
+ return false;
+ }
+ case Match_MissingFeature: {
+ assert(ErrorInfo && "Unknown missing feature!");
+ // Special case the error message for the very common case where only
+ // a single subtarget feature is missing (neon, e.g.).
+ std::string Msg = "instruction requires:";
+ uint64_t Mask = 1;
+ for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+ if (ErrorInfo & Mask) {
+ Msg += " ";
+ Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+ }
+ Mask <<= 1;
+ }
+ return Error(IDLoc, Msg);
+ }
+ case Match_MnemonicFail:
+ return showMatchError(IDLoc, MatchResult);
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction",
+ SMRange(IDLoc, getTok().getLoc()));
+
+ ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+ // If the match failed on a suffix token operand, tweak the diagnostic
+ // accordingly.
+ if (((AArch64Operand &)*Operands[ErrorInfo]).isToken() &&
+ ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
+ MatchResult = Match_InvalidSuffix;
+
+ return showMatchError(ErrorLoc, MatchResult);
+ }
+ case Match_InvalidMemoryIndexed1:
+ case Match_InvalidMemoryIndexed2:
+ case Match_InvalidMemoryIndexed4:
+ case Match_InvalidMemoryIndexed8:
+ case Match_InvalidMemoryIndexed16:
+ case Match_InvalidCondCode:
+ case Match_AddSubRegExtendSmall:
+ case Match_AddSubRegExtendLarge:
+ case Match_AddSubSecondSource:
+ case Match_LogicalSecondSource:
+ case Match_AddSubRegShift32:
+ case Match_AddSubRegShift64:
+ case Match_InvalidMovImm32Shift:
+ case Match_InvalidMovImm64Shift:
+ case Match_InvalidFPImm:
+ case Match_InvalidMemoryWExtend8:
+ case Match_InvalidMemoryWExtend16:
+ case Match_InvalidMemoryWExtend32:
+ case Match_InvalidMemoryWExtend64:
+ case Match_InvalidMemoryWExtend128:
+ case Match_InvalidMemoryXExtend8:
+ case Match_InvalidMemoryXExtend16:
+ case Match_InvalidMemoryXExtend32:
+ case Match_InvalidMemoryXExtend64:
+ case Match_InvalidMemoryXExtend128:
+ case Match_InvalidMemoryIndexed4SImm7:
+ case Match_InvalidMemoryIndexed8SImm7:
+ case Match_InvalidMemoryIndexed16SImm7:
+ case Match_InvalidMemoryIndexedSImm9:
+ case Match_InvalidImm0_1:
+ case Match_InvalidImm0_7:
+ case Match_InvalidImm0_15:
+ case Match_InvalidImm0_31:
+ case Match_InvalidImm0_63:
+ case Match_InvalidImm0_127:
+ case Match_InvalidImm0_65535:
+ case Match_InvalidImm1_8:
+ case Match_InvalidImm1_16:
+ case Match_InvalidImm1_32:
+ case Match_InvalidImm1_64:
+ case Match_InvalidIndex1:
+ case Match_InvalidIndexB:
+ case Match_InvalidIndexH:
+ case Match_InvalidIndexS:
+ case Match_InvalidIndexD:
+ case Match_InvalidLabel:
+ case Match_MSR:
+ case Match_MRS: {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction", SMRange(IDLoc, (*Operands.back()).getEndLoc()));
+ // Any time we get here, there's nothing fancy to do. Just get the
+ // operand SMLoc and display the diagnostic.
+ SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return showMatchError(ErrorLoc, MatchResult);
+ }
+ }
+
+ llvm_unreachable("Implement any new match types added!");
+}
+
+ /// Dispatch an assembler directive to the matching AArch64 handler.
+ ///
+ /// \param DirectiveID token holding the directive name and its location.
+ /// \returns false when the directive name is one handled here (the handler's
+ /// own return value is not propagated), true when the name is not recognised.
+ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
+   const MCObjectFileInfo::Environment ObjFormat =
+       getContext().getObjectFileInfo()->getObjectFileType();
+   const bool IsMachOOrCOFF = ObjFormat == MCObjectFileInfo::IsMachO ||
+                              ObjFormat == MCObjectFileInfo::IsCOFF;
+
+   const StringRef Directive = DirectiveID.getIdentifier();
+   const SMLoc DirectiveLoc = DirectiveID.getLoc();
+
+   // Directives accepted regardless of the object file format.
+   if (Directive == ".arch") {
+     parseDirectiveArch(DirectiveLoc);
+     return false;
+   }
+   if (Directive == ".cpu") {
+     parseDirectiveCPU(DirectiveLoc);
+     return false;
+   }
+   if (Directive == ".hword") {
+     parseDirectiveWord(2, DirectiveLoc);
+     return false;
+   }
+   if (Directive == ".word") {
+     parseDirectiveWord(4, DirectiveLoc);
+     return false;
+   }
+   if (Directive == ".xword") {
+     parseDirectiveWord(8, DirectiveLoc);
+     return false;
+   }
+   if (Directive == ".tlsdesccall") {
+     parseDirectiveTLSDescCall(DirectiveLoc);
+     return false;
+   }
+   if (Directive == ".ltorg" || Directive == ".pool") {
+     parseDirectiveLtorg(DirectiveLoc);
+     return false;
+   }
+   if (Directive == ".unreq") {
+     parseDirectiveUnreq(DirectiveLoc);
+     return false;
+   }
+
+   // '.inst' is only accepted when the format is neither MachO nor COFF; the
+   // LOH directive only when the format is one of those two.
+   if (!IsMachOOrCOFF) {
+     if (Directive != ".inst")
+       return true;
+     parseDirectiveInst(DirectiveLoc);
+     return false;
+   }
+
+   if (Directive != MCLOHDirectiveName())
+     return true;
+   parseDirectiveLOH(Directive, DirectiveLoc);
+   return false;
+ }
+
+ static const struct { // One entry per extension name accepted after '+' in .arch/.cpu
+ const char *Name; // Extension name, compared after any leading "no" has been stripped
+ const FeatureBitset Features; // Subtarget feature bits toggled for this extension
+ } ExtensionMap[] = {
+ { "crc", {AArch64::FeatureCRC} },
+ { "crypto", {AArch64::FeatureCrypto} },
+ { "fp", {AArch64::FeatureFPARMv8} },
+ { "simd", {AArch64::FeatureNEON} },
+ { "ras", {AArch64::FeatureRAS} },
+ { "lse", {AArch64::FeatureLSE} },
+
+ // FIXME: Unsupported extensions (empty feature set; the .arch/.cpu handlers report a fatal error for these)
+ { "pan", {} },
+ { "lor", {} },
+ { "rdma", {} },
+ { "profile", {} },
+ };
+
+ /// parseDirectiveArch
+ ///   ::= .arch token
+ ///
+ /// Parses "<arch>[+ext|+noext]..." up to the end of the statement, resets the
+ /// subtarget features to those of the named architecture and then applies
+ /// the requested extensions in the order they were written.
+ ///
+ /// \param L location of the directive (unused here; errors are reported at
+ /// the location of the architecture name).
+ /// \returns true if an error was reported, false otherwise. Extension names
+ /// that do not appear in ExtensionMap are ignored.
+ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
+   SMLoc ArchLoc = getLoc();
+
+   StringRef Arch, ExtensionString;
+   std::tie(Arch, ExtensionString) =
+       getParser().parseStringToEndOfStatement().trim().split('+');
+
+   unsigned ID = AArch64::parseArch(Arch);
+   if (ID == static_cast<unsigned>(AArch64::ArchKind::AK_INVALID))
+     return Error(ArchLoc, "unknown arch name");
+
+   if (parseToken(AsmToken::EndOfStatement))
+     return true;
+
+   // Get the architecture and extension features.
+   std::vector<StringRef> AArch64Features;
+   AArch64::getArchFeatures(ID, AArch64Features);
+   AArch64::getExtensionFeatures(AArch64::getDefaultExtensions("generic", ID),
+                                 AArch64Features);
+
+   MCSubtargetInfo &STI = copySTI();
+   std::vector<std::string> ArchFeatures(AArch64Features.begin(),
+                                         AArch64Features.end());
+   STI.setDefaultFeatures("generic",
+                          join(ArchFeatures.begin(), ArchFeatures.end(), ","));
+
+   SmallVector<StringRef, 4> RequestedExtensions;
+   if (!ExtensionString.empty())
+     ExtensionString.split(RequestedExtensions, '+');
+
+   for (auto Name : RequestedExtensions) {
+     bool EnableFeature = true;
+
+     // A leading "no" (any case) requests that the extension be disabled.
+     if (Name.startswith_lower("no")) {
+       EnableFeature = false;
+       Name = Name.substr(2);
+     }
+
+     for (const auto &Extension : ExtensionMap) {
+       if (Extension.Name != Name)
+         continue;
+
+       if (Extension.Features.none())
+         report_fatal_error("unsupported architectural extension: " + Name);
+
+       // The bits are toggled rather than set, so the mask must be computed
+       // from the feature bits as they are right now. Using a snapshot taken
+       // before the loop would flip an extension back when it is named twice
+       // ("+crc+crc") and would miss a later "+no<ext>" for an extension
+       // enabled earlier on the same line.
+       const FeatureBitset CurrentFeatures = STI.getFeatureBits();
+       FeatureBitset ToggleFeatures =
+           EnableFeature ? (~CurrentFeatures & Extension.Features)
+                         : (CurrentFeatures & Extension.Features);
+       uint64_t AvailableFeatures =
+           ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
+       setAvailableFeatures(AvailableFeatures);
+       break;
+     }
+   }
+   return false;
+ }
+
+ /// parseDirectiveCPU
+ ///   ::= .cpu id
+ ///
+ /// Parses "<cpu>[+ext|+noext]..." up to the end of the statement, resets the
+ /// subtarget features to the defaults of the named CPU and then applies the
+ /// requested extensions in the order they were written.
+ ///
+ /// \param L location of the directive (unused here; errors are reported at
+ /// the location of the CPU name).
+ /// \returns true if the statement terminator could not be parsed; false
+ /// otherwise, including after an unknown CPU name has been diagnosed.
+ /// Extension names that do not appear in ExtensionMap are ignored.
+ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
+   SMLoc CPULoc = getLoc();
+
+   StringRef CPU, ExtensionString;
+   std::tie(CPU, ExtensionString) =
+       getParser().parseStringToEndOfStatement().trim().split('+');
+
+   if (parseToken(AsmToken::EndOfStatement))
+     return true;
+
+   SmallVector<StringRef, 4> RequestedExtensions;
+   if (!ExtensionString.empty())
+     ExtensionString.split(RequestedExtensions, '+');
+
+   // FIXME This is using tablegen data, but should be moved to ARMTargetParser
+   // once that is tablegen'ed
+   if (!getSTI().isCPUStringValid(CPU)) {
+     // The error is recorded, but the directive is still reported as handled.
+     Error(CPULoc, "unknown CPU name");
+     return false;
+   }
+
+   MCSubtargetInfo &STI = copySTI();
+   STI.setDefaultFeatures(CPU, "");
+
+   for (auto Name : RequestedExtensions) {
+     bool EnableFeature = true;
+
+     // A leading "no" (any case) requests that the extension be disabled.
+     if (Name.startswith_lower("no")) {
+       EnableFeature = false;
+       Name = Name.substr(2);
+     }
+
+     for (const auto &Extension : ExtensionMap) {
+       if (Extension.Name != Name)
+         continue;
+
+       if (Extension.Features.none())
+         report_fatal_error("unsupported architectural extension: " + Name);
+
+       // The bits are toggled rather than set, so the mask must be computed
+       // from the feature bits as they are right now. Using a snapshot taken
+       // before the loop would flip an extension back when it is named twice
+       // ("+crc+crc") and would miss a later "+no<ext>" for an extension
+       // enabled earlier on the same line.
+       const FeatureBitset CurrentFeatures = STI.getFeatureBits();
+       FeatureBitset ToggleFeatures =
+           EnableFeature ? (~CurrentFeatures & Extension.Features)
+                         : (CurrentFeatures & Extension.Features);
+       uint64_t AvailableFeatures =
+           ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
+       setAvailableFeatures(AvailableFeatures);
+
+       break;
+     }
+   }
+   return false;
+ }
+
+ /// parseDirectiveWord
+ ///   ::= .word [ expression (, expression)* ]
+ ///
+ /// Parses a list of expressions and emits each one through the streamer with
+ /// the given \p Size (ParseDirective passes 2 for .hword, 4 for .word and 8
+ /// for .xword), attributing it to location \p L.
+ /// \returns true on a parse error, false otherwise.
+ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+   auto EmitOneValue = [&]() -> bool {
+     const MCExpr *Expr;
+     const bool Failed = getParser().parseExpression(Expr);
+     if (!Failed)
+       getParser().getStreamer().EmitValue(Expr, Size, L);
+     return Failed;
+   };
+
+   return parseMany(EmitOneValue);
+ }
+
+ /// parseDirectiveInst
+ ///   ::= .inst opcode [, ...]
+ ///
+ /// Each operand must parse as a constant expression; its value is handed to
+ /// the target streamer's emitInst.
+ /// \returns true if an error was reported, false otherwise.
+ bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
+   // At least one operand is required.
+   if (getLexer().is(AsmToken::EndOfStatement))
+     return Error(Loc, "expected expression following '.inst' directive");
+
+   auto EmitOneOpcode = [&]() -> bool {
+     const SMLoc ExprLoc = getLoc();
+     const MCExpr *Parsed;
+     if (check(getParser().parseExpression(Parsed), ExprLoc,
+               "expected expression"))
+       return true;
+
+     const auto *Constant = dyn_cast_or_null<MCConstantExpr>(Parsed);
+     if (check(!Constant, ExprLoc, "expected constant expression"))
+       return true;
+
+     getTargetStreamer().emitInst(Constant->getValue());
+     return false;
+   };
+
+   if (!parseMany(EmitOneOpcode))
+     return false;
+   return addErrorSuffix(" in '.inst' directive");
+ }
+
+ // parseDirectiveTLSDescCall:
+ //   ::= .tlsdesccall symbol
+ //
+ // Emits a TLSDESCCALL instruction whose single operand is a VK_TLSDESC
+ // reference to the named symbol. Returns true if an error was reported.
+ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
+   StringRef SymName;
+   if (check(getParser().parseIdentifier(SymName), L,
+             "expected symbol after directive"))
+     return true;
+   if (parseToken(AsmToken::EndOfStatement))
+     return true;
+
+   auto &Ctx = getContext();
+   const MCExpr *SymRef =
+       MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(SymName), Ctx);
+   const MCExpr *TLSDescRef =
+       AArch64MCExpr::create(SymRef, AArch64MCExpr::VK_TLSDESC, Ctx);
+
+   MCInst Call;
+   Call.setOpcode(AArch64::TLSDESCCALL);
+   Call.addOperand(MCOperand::createExpr(TLSDescRef));
+
+   getParser().getStreamer().EmitInstruction(Call, getSTI());
+   return false;
+ }
+
+ /// ::= .loh <lohName | lohId> label1, ..., labelN
+ /// The number of arguments depends on the loh identifier.
+ ///
+ /// \param IDVal the directive name, used only in diagnostics.
+ /// \param Loc location of the directive (unused here).
+ /// \returns true if an error was reported, false once the LOH directive has
+ /// been emitted through the streamer.
+ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
+   MCLOHType Kind;
+   if (getParser().getTok().isNot(AsmToken::Identifier)) {
+     if (getParser().getTok().isNot(AsmToken::Integer))
+       return TokError("expected an identifier or a number in directive");
+     // We successfully get a numeric value for the identifier.
+     // Check if it is valid.
+     int64_t Id = getParser().getTok().getIntVal();
+     // Only values representable as 'unsigned' are handed to
+     // isValidMCLOHType; anything outside that range is rejected outright
+     // instead of bypassing validation and being cast to MCLOHType unchecked.
+     const bool FitsInUnsigned = Id >= 0 && Id <= static_cast<int64_t>(-1U);
+     if (!FitsInUnsigned || !isValidMCLOHType(static_cast<unsigned>(Id)))
+       return TokError("invalid numeric identifier in directive");
+     Kind = (MCLOHType)Id;
+   } else {
+     StringRef Name = getTok().getIdentifier();
+     // We successfully parse an identifier.
+     // Check if it is a recognized one.
+     int Id = MCLOHNameToId(Name);
+
+     if (Id == -1)
+       return TokError("invalid identifier in directive");
+     Kind = (MCLOHType)Id;
+   }
+   // Consume the identifier.
+   Lex();
+   // Get the number of arguments of this LOH.
+   int NbArgs = MCLOHIdToNbArgs(Kind);
+
+   assert(NbArgs != -1 && "Invalid number of arguments");
+
+   // Parse exactly NbArgs comma-separated label names.
+   SmallVector<MCSymbol *, 3> Args;
+   for (int Idx = 0; Idx < NbArgs; ++Idx) {
+     StringRef Name;
+     if (getParser().parseIdentifier(Name))
+       return TokError("expected identifier in directive");
+     Args.push_back(getContext().getOrCreateSymbol(Name));
+
+     // No separator is expected after the last argument.
+     if (Idx + 1 == NbArgs)
+       break;
+     if (parseToken(AsmToken::Comma,
+                    "unexpected token in '" + Twine(IDVal) + "' directive"))
+       return true;
+   }
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '" + Twine(IDVal) + "' directive"))
+     return true;
+
+   getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+   return false;
+ }
+
+ /// parseDirectiveLtorg
+ ///   ::= .ltorg | .pool
+ ///
+ /// Asks the target streamer to emit the current constant pool, provided
+ /// nothing else follows the directive on the line.
+ /// \returns true if an error was reported, false otherwise.
+ bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
+   const bool HasTrailingTokens =
+       parseToken(AsmToken::EndOfStatement, "unexpected token in directive");
+   if (!HasTrailingTokens)
+     getTargetStreamer().emitCurrentConstantPool();
+   return HasTrailingTokens;
+ }
+
+ /// parseDirectiveReq
+ ///   ::= name .req registername
+ ///
+ /// Records \p Name as an alias in RegisterReqs. An existing alias with a
+ /// different definition is kept and a warning is issued at \p L.
+ /// \returns true if an error was reported, false otherwise.
+ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+   getParser().Lex(); // Eat the '.req' token.
+
+   const SMLoc RegLoc = getLoc();
+   const unsigned NoRegister = static_cast<unsigned>(-1);
+
+   // Try tryParseRegister() first and fall back to a vector register, which
+   // must be written without a type specifier.
+   unsigned Reg = tryParseRegister();
+   const bool IsVector = Reg == NoRegister;
+   if (IsVector) {
+     StringRef VectorKind;
+     Reg = tryMatchVectorRegister(VectorKind, false);
+     if (!VectorKind.empty())
+       return Error(RegLoc, "vector register without type specifier expected");
+   }
+
+   if (Reg == NoRegister)
+     return Error(RegLoc, "register name or alias expected");
+
+   // Shouldn't be anything else.
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected input in .req directive"))
+     return true;
+
+   // insert() leaves an existing entry untouched, so a mismatch between the
+   // stored value and the requested one means this was a redefinition.
+   const auto Alias = std::make_pair(IsVector, Reg);
+   const auto Inserted = RegisterReqs.insert(std::make_pair(Name, Alias));
+   if (Inserted.first->second != Alias)
+     Warning(L, "ignoring redefinition of register alias '" + Name + "'");
+
+   return false;
+ }
+
+ /// parseDirectiveUnreq
+ ///   ::= .unreq registername
+ ///
+ /// Removes the alias named by the next identifier (lower-cased) from
+ /// RegisterReqs.
+ /// \param L location of the directive (unused here).
+ /// \returns true if an error was reported, false otherwise.
+ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   if (getTok().isNot(AsmToken::Identifier))
+     return TokError("unexpected input in .unreq directive.");
+   RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
+   Parser.Lex(); // Eat the identifier.
+   // The suffix is appended to the pending error message, so it needs its own
+   // leading space (as the '.inst' handler does) to avoid running the words
+   // together.
+   if (parseToken(AsmToken::EndOfStatement))
+     return addErrorSuffix(" in '.unreq' directive");
+   return false;
+ }
+
+ /// Break \p Expr down into "symbol [+/- constant]" form.
+ ///
+ /// On entry the outputs are reset to VK_INVALID / VK_None / 0. They are then
+ /// filled in as far as the expression could be analysed, even when the
+ /// function ends up returning false.
+ ///
+ /// \returns true for a bare symbol reference, or for a symbol plus/minus a
+ /// constant that does not combine an ELF modifier with a Darwin one; false
+ /// for anything else.
+ bool
+ AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
+                                     AArch64MCExpr::VariantKind &ELFRefKind,
+                                     MCSymbolRefExpr::VariantKind &DarwinRefKind,
+                                     int64_t &Addend) {
+   ELFRefKind = AArch64MCExpr::VK_INVALID;
+   DarwinRefKind = MCSymbolRefExpr::VK_None;
+   Addend = 0;
+
+   // Peel off an AArch64-specific wrapper, remembering its kind.
+   if (const auto *Wrapped = dyn_cast<AArch64MCExpr>(Expr)) {
+     ELFRefKind = Wrapped->getKind();
+     Expr = Wrapped->getSubExpr();
+   }
+
+   // It's a simple symbol reference with no addend.
+   if (const auto *PlainSym = dyn_cast<MCSymbolRefExpr>(Expr)) {
+     DarwinRefKind = PlainSym->getKind();
+     return true;
+   }
+
+   // Otherwise only "symbol <op> rhs" can be handled.
+   const auto *Binary = dyn_cast<MCBinaryExpr>(Expr);
+   if (!Binary)
+     return false;
+
+   const auto *BaseSym = dyn_cast<MCSymbolRefExpr>(Binary->getLHS());
+   if (!BaseSym)
+     return false;
+   DarwinRefKind = BaseSym->getKind();
+
+   const bool IsSub = Binary->getOpcode() == MCBinaryExpr::Sub;
+   const bool IsAdd = Binary->getOpcode() == MCBinaryExpr::Add;
+   if (!IsAdd && !IsSub)
+     return false;
+
+   // See if the addend is a constant, otherwise there's more going
+   // on here than we can deal with.
+   const auto *Constant = dyn_cast<MCConstantExpr>(Binary->getRHS());
+   if (!Constant)
+     return false;
+
+   Addend = Constant->getValue();
+   if (IsSub)
+     Addend = -Addend;
+
+   // It's some symbol reference + a constant addend, but really
+   // shouldn't use both Darwin and ELF syntax.
+   const bool HasELFKind = ELFRefKind != AArch64MCExpr::VK_INVALID;
+   const bool HasDarwinKind = DarwinRefKind != MCSymbolRefExpr::VK_None;
+   return !(HasELFKind && HasDarwinKind);
+ }
+
+ /// Force static initialization: register AArch64AsmParser for each of the
+ /// three AArch64 target objects.
+ extern "C" void LLVMInitializeAArch64AsmParser() {
+   RegisterMCAsmParser<AArch64AsmParser> RegisterLE(getTheAArch64leTarget());
+   RegisterMCAsmParser<AArch64AsmParser> RegisterBE(getTheAArch64beTarget());
+   RegisterMCAsmParser<AArch64AsmParser> RegisterARM64(getTheARM64Target());
+ }
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "AArch64GenAsmMatcher.inc"
+
+ // This matcher hook is defined after the auto-generated include because it
+ // needs the match class enum definitions from it.
+ //
+ // Accepts an operand for one of the literal-immediate token classes
+ // (MCK__35_<n>) when the operand is a constant immediate equal to <n>. This
+ // is for InstAliases which have a fixed-value immediate in the syntax. Every
+ // other case yields Match_InvalidOperand.
+ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                       unsigned Kind) {
+   // Map the token class to the immediate value it stands for.
+   int64_t Wanted;
+   switch (Kind) {
+   case MCK__35_0:
+     Wanted = 0;
+     break;
+   case MCK__35_1:
+     Wanted = 1;
+     break;
+   case MCK__35_2:
+     Wanted = 2;
+     break;
+   case MCK__35_3:
+     Wanted = 3;
+     break;
+   case MCK__35_4:
+     Wanted = 4;
+     break;
+   case MCK__35_6:
+     Wanted = 6;
+     break;
+   case MCK__35_8:
+     Wanted = 8;
+     break;
+   case MCK__35_12:
+     Wanted = 12;
+     break;
+   case MCK__35_16:
+     Wanted = 16;
+     break;
+   case MCK__35_24:
+     Wanted = 24;
+     break;
+   case MCK__35_32:
+     Wanted = 32;
+     break;
+   case MCK__35_48:
+     Wanted = 48;
+     break;
+   case MCK__35_64:
+     Wanted = 64;
+     break;
+   default:
+     return Match_InvalidOperand;
+   }
+
+   AArch64Operand &Operand = static_cast<AArch64Operand &>(AsmOp);
+   if (Operand.isImm())
+     if (const auto *Constant = dyn_cast<MCConstantExpr>(Operand.getImm()))
+       if (Constant->getValue() == Wanted)
+         return Match_Success;
+
+   return Match_InvalidOperand;
+ }
+
+
+ /// Parse "<first reg>, <second reg>" as a sequential register pair.
+ ///
+ /// The first register must be in GPR32 or GPR64 and have an even encoding.
+ /// The second must be in the same class with the next encoding. On success
+ /// the matching W/X sequential-pair super-register is pushed onto
+ /// \p Operands.
+ OperandMatchResultTy
+ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
+   const SMLoc StartLoc = getLoc();
+
+   if (getParser().getTok().isNot(AsmToken::Identifier)) {
+     Error(StartLoc, "expected register");
+     return MatchOperand_ParseFail;
+   }
+
+   const int FirstReg = tryParseRegister();
+   if (FirstReg == -1)
+     return MatchOperand_ParseFail;
+
+   const MCRegisterClass &GPR32Class =
+       AArch64MCRegisterClasses[AArch64::GPR32RegClassID];
+   const MCRegisterClass &GPR64Class =
+       AArch64MCRegisterClasses[AArch64::GPR64RegClassID];
+
+   const bool FirstIsX = GPR64Class.contains(FirstReg);
+   const bool FirstIsW = GPR32Class.contains(FirstReg);
+
+   // Both ways of rejecting the first register share one diagnostic.
+   auto RejectFirst = [&]() {
+     Error(StartLoc, "expected first even register of a "
+                     "consecutive same-size even/odd register pair");
+     return MatchOperand_ParseFail;
+   };
+
+   if (!(FirstIsX || FirstIsW))
+     return RejectFirst();
+
+   const MCRegisterInfo *RegInfo = getContext().getRegisterInfo();
+   const unsigned FirstEncoding = RegInfo->getEncodingValue(FirstReg);
+   if ((FirstEncoding & 0x1) != 0)
+     return RejectFirst();
+
+   const SMLoc CommaLoc = getLoc();
+   if (getParser().getTok().isNot(AsmToken::Comma)) {
+     Error(CommaLoc, "expected comma");
+     return MatchOperand_ParseFail;
+   }
+   getParser().Lex(); // Eat the comma.
+
+   const SMLoc SecondLoc = getLoc();
+   const int SecondReg = tryParseRegister();
+   if (SecondReg == -1)
+     return MatchOperand_ParseFail;
+
+   const bool EncodingFollows =
+       RegInfo->getEncodingValue(SecondReg) == FirstEncoding + 1;
+   const bool XClassMismatch = FirstIsX && !GPR64Class.contains(SecondReg);
+   const bool WClassMismatch = FirstIsW && !GPR32Class.contains(SecondReg);
+   if (!EncodingFollows || XClassMismatch || WClassMismatch) {
+     Error(SecondLoc, "expected second odd register of a "
+                      "consecutive same-size even/odd register pair");
+     return MatchOperand_ParseFail;
+   }
+
+   // Look up the pair super-register that has FirstReg as its even half.
+   unsigned PairReg = 0;
+   if (FirstIsX)
+     PairReg = RegInfo->getMatchingSuperReg(
+         FirstReg, AArch64::sube64,
+         &AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID]);
+   else
+     PairReg = RegInfo->getMatchingSuperReg(
+         FirstReg, AArch64::sube32,
+         &AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID]);
+
+   Operands.push_back(AArch64Operand::CreateReg(PairReg, false, StartLoc,
+                                                getLoc(), getContext()));
+
+   return MatchOperand_Success;
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
new file mode 100644
index 000000000000..0d860a7eef79
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -0,0 +1,1588 @@
+//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64Disassembler.h"
+#include "AArch64ExternalSymbolizer.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-disassembler"
+
+ // Make MCDisassembler::DecodeStatus usable unqualified in this file; shorthand macros for its Success/Fail/SoftFail values are defined after the generated includes.
+ typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+
+ // Forward declare the decoder callbacks because the autogenerated decoder tables,
+ // which are included before the definitions further down, reference them by name.
+ static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+ static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder);
+ static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder);
+ static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder);
+ static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder);
+ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address, const void *Decoder);
+ static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address, const void *Decoder);
+ static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address, const void *Decoder);
+
+ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+ static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+ static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder);
+ static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+ static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder);
+ static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+ static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder);
+ static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+ static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+ static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+ static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+ static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+ static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder);
+ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder);
+
+ /// Fold the status \p In of one decoding step into the running status \p Out.
+ ///
+ /// Success leaves \p Out untouched; SoftFail and Fail overwrite it.
+ /// \returns false only for Fail.
+ static bool Check(DecodeStatus &Out, DecodeStatus In) {
+   if (In == MCDisassembler::Success)
+     return true; // Out stays the same.
+
+   if (In == MCDisassembler::SoftFail || In == MCDisassembler::Fail) {
+     Out = In;
+     return In == MCDisassembler::SoftFail;
+   }
+
+   llvm_unreachable("Invalid DecodeStatus!");
+ }
+
+#include "AArch64GenDisassemblerTables.inc"
+#include "AArch64GenInstrInfo.inc"
+
+#define Success llvm::MCDisassembler::Success
+#define Fail llvm::MCDisassembler::Fail
+#define SoftFail llvm::MCDisassembler::SoftFail
+
+ /// Factory callback handed to the TargetRegistry: allocates a new
+ /// AArch64Disassembler for the given subtarget and context. \p T is unused.
+ static MCDisassembler *createAArch64Disassembler(const Target &T,
+                                                  const MCSubtargetInfo &STI,
+                                                  MCContext &Ctx) {
+   MCDisassembler *Disassembler = new AArch64Disassembler(STI, Ctx);
+   return Disassembler;
+ }
+
+ /// Decode one instruction from the start of \p Bytes into \p MI.
+ ///
+ /// \p Size is set to 4 when at least four bytes are available and to 0
+ /// otherwise, in which case Fail is returned. \p CS is remembered as the
+ /// comment stream; \p OS is not used here.
+ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                  ArrayRef<uint8_t> Bytes,
+                                                  uint64_t Address,
+                                                  raw_ostream &OS,
+                                                  raw_ostream &CS) const {
+   CommentStream = &CS;
+
+   // We want to read exactly 4 bytes of data.
+   if (Bytes.size() < 4) {
+     Size = 0;
+     return Fail;
+   }
+   Size = 4;
+
+   // Encoded as a little-endian 32-bit word in the stream: Bytes[0] ends up
+   // in the least significant byte.
+   uint32_t Insn = 0;
+   for (unsigned Idx = 4; Idx-- > 0;)
+     Insn = (Insn << 8) | Bytes[Idx];
+
+   // Calling the auto-generated decoder function.
+   return decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI);
+ }
+
+ /// Factory callback handed to the TargetRegistry: allocates a new
+ /// AArch64ExternalSymbolizer, which takes ownership of \p RelInfo. \p TT is
+ /// unused.
+ static MCSymbolizer *
+ createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
+                                 LLVMSymbolLookupCallback SymbolLookUp,
+                                 void *DisInfo, MCContext *Ctx,
+                                 std::unique_ptr<MCRelocationInfo> &&RelInfo) {
+   MCSymbolizer *Symbolizer = new llvm::AArch64ExternalSymbolizer(
+       *Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp, DisInfo);
+   return Symbolizer;
+ }
+
+ /// Register the AArch64 disassembler and external symbolizer factories with
+ /// each of the three AArch64 target objects.
+ extern "C" void LLVMInitializeAArch64Disassembler() {
+   auto RegisterFor = [](Target &T) {
+     TargetRegistry::RegisterMCDisassembler(T, createAArch64Disassembler);
+     TargetRegistry::RegisterMCSymbolizer(T, createAArch64ExternalSymbolizer);
+   };
+
+   RegisterFor(getTheAArch64leTarget());
+   RegisterFor(getTheAArch64beTarget());
+   RegisterFor(getTheARM64Target());
+ }
+
+ static const unsigned FPR128DecoderTable[] = { // Q0..Q31, indexed by RegNo in DecodeFPR128RegisterClass
+ AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
+ AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9,
+ AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+ AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+ AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+ AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+ AArch64::Q30, AArch64::Q31
+ };
+
+ /// Append the FPR128DecoderTable entry for \p RegNo to \p Inst as a register
+ /// operand. Returns Fail (adding nothing) when \p RegNo is above 31.
+ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+   const bool InRange = RegNo <= 31;
+   if (InRange)
+     Inst.addOperand(MCOperand::createReg(FPR128DecoderTable[RegNo]));
+   return InRange ? Success : Fail;
+ }
+
+ /// Same as DecodeFPR128RegisterClass, but only register numbers 0..15 are
+ /// accepted; anything higher yields Fail.
+ static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t Addr,
+                                                  const void *Decoder) {
+   if (RegNo <= 15)
+     return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
+   return Fail;
+ }
+
+ static const unsigned FPR64DecoderTable[] = { // D0..D31, indexed by RegNo in DecodeFPR64RegisterClass
+ AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4,
+ AArch64::D5, AArch64::D6, AArch64::D7, AArch64::D8, AArch64::D9,
+ AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14,
+ AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19,
+ AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24,
+ AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29,
+ AArch64::D30, AArch64::D31
+ };
+
+ /// Append the FPR64DecoderTable entry for \p RegNo to \p Inst as a register
+ /// operand. Returns Fail (adding nothing) when \p RegNo is above 31.
+ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+   const bool InRange = RegNo <= 31;
+   if (InRange)
+     Inst.addOperand(MCOperand::createReg(FPR64DecoderTable[RegNo]));
+   return InRange ? Success : Fail;
+ }
+
+ static const unsigned FPR32DecoderTable[] = { // S0..S31, indexed by RegNo in DecodeFPR32RegisterClass
+ AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4,
+ AArch64::S5, AArch64::S6, AArch64::S7, AArch64::S8, AArch64::S9,
+ AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14,
+ AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19,
+ AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24,
+ AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29,
+ AArch64::S30, AArch64::S31
+ };
+
+ /// Append the FPR32DecoderTable entry for \p RegNo to \p Inst as a register
+ /// operand. Returns Fail (adding nothing) when \p RegNo is above 31.
+ static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+   const bool InRange = RegNo <= 31;
+   if (InRange)
+     Inst.addOperand(MCOperand::createReg(FPR32DecoderTable[RegNo]));
+   return InRange ? Success : Fail;
+ }
+
+ static const unsigned FPR16DecoderTable[] = { // H0..H31, indexed by RegNo in DecodeFPR16RegisterClass
+ AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4,
+ AArch64::H5, AArch64::H6, AArch64::H7, AArch64::H8, AArch64::H9,
+ AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14,
+ AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19,
+ AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24,
+ AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29,
+ AArch64::H30, AArch64::H31
+ };
+
+ /// Append the FPR16DecoderTable entry for \p RegNo to \p Inst as a register
+ /// operand. Returns Fail (adding nothing) when \p RegNo is above 31.
+ static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+   const bool InRange = RegNo <= 31;
+   if (InRange)
+     Inst.addOperand(MCOperand::createReg(FPR16DecoderTable[RegNo]));
+   return InRange ? Success : Fail;
+ }
+
+// Register lookup table: entry N is the B register numbered N (B0..B31).
+// DecodeFPR8RegisterClass indexes it with the encoded register number.
+static const unsigned FPR8DecoderTable[] = {
+ AArch64::B0, AArch64::B1, AArch64::B2, AArch64::B3, AArch64::B4,
+ AArch64::B5, AArch64::B6, AArch64::B7, AArch64::B8, AArch64::B9,
+ AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14,
+ AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19,
+ AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24,
+ AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29,
+ AArch64::B30, AArch64::B31
+};
+
+/// Appends FPR8DecoderTable[RegNo] to Inst as a register operand.
+/// Returns Fail, leaving Inst untouched, when RegNo is greater than 31.
+/// Addr and Decoder are not used.
+static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  // Only indices 0..31 are looked up in the table.
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(FPR8DecoderTable[RegNo]));
+  return Success;
+}
+
+// Register lookup table for 64-bit general-purpose registers: entries 0..28
+// are X0..X28, entry 29 is FP, entry 30 is LR and entry 31 is XZR.
+// DecodeGPR64spRegisterClass reuses this table and substitutes SP for XZR.
+static const unsigned GPR64DecoderTable[] = {
+ AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4,
+ AArch64::X5, AArch64::X6, AArch64::X7, AArch64::X8, AArch64::X9,
+ AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14,
+ AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19,
+ AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24,
+ AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP,
+ AArch64::LR, AArch64::XZR
+};
+
+/// Appends GPR64DecoderTable[RegNo] to Inst as a register operand, so index
+/// 31 yields XZR. Returns Fail, leaving Inst untouched, when RegNo is greater
+/// than 31. Addr and Decoder are not used.
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  // Only indices 0..31 are looked up in the table.
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(GPR64DecoderTable[RegNo]));
+  return Success;
+}
+
+/// Same lookup as DecodeGPR64RegisterClass, except that a table result of XZR
+/// is replaced by SP before the operand is appended to Inst.
+/// Returns Fail, leaving Inst untouched, when RegNo is greater than 31.
+static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  if (RegNo >= 32)
+    return Fail;
+
+  const unsigned TableReg = GPR64DecoderTable[RegNo];
+  // In this register class the XZR slot denotes the stack pointer.
+  const unsigned Register = TableReg == AArch64::XZR ? AArch64::SP : TableReg;
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+// Register lookup table for 32-bit general-purpose registers: entries 0..30
+// are W0..W30 and entry 31 is WZR. DecodeGPR32spRegisterClass reuses this
+// table and substitutes WSP for WZR.
+static const unsigned GPR32DecoderTable[] = {
+ AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4,
+ AArch64::W5, AArch64::W6, AArch64::W7, AArch64::W8, AArch64::W9,
+ AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14,
+ AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19,
+ AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24,
+ AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29,
+ AArch64::W30, AArch64::WZR
+};
+
+/// Appends GPR32DecoderTable[RegNo] to Inst as a register operand, so index
+/// 31 yields WZR. Returns Fail, leaving Inst untouched, when RegNo is greater
+/// than 31. Addr and Decoder are not used.
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  // Only indices 0..31 are looked up in the table.
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(GPR32DecoderTable[RegNo]));
+  return Success;
+}
+
+/// Same lookup as DecodeGPR32RegisterClass, except that a table result of WZR
+/// is replaced by WSP before the operand is appended to Inst.
+/// Returns Fail, leaving Inst untouched, when RegNo is greater than 31.
+static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  if (RegNo >= 32)
+    return Fail;
+
+  const unsigned TableReg = GPR32DecoderTable[RegNo];
+  // In this register class the WZR slot denotes the 32-bit stack pointer.
+  const unsigned Register = TableReg == AArch64::WZR ? AArch64::WSP : TableReg;
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+// Register lookup table: entry N is the Q register numbered N (Q0..Q31).
+// DecodeVectorRegisterClass indexes it with the encoded register number.
+static const unsigned VectorDecoderTable[] = {
+ AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
+ AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9,
+ AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+ AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+ AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+ AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+ AArch64::Q30, AArch64::Q31
+};
+
+/// Appends VectorDecoderTable[RegNo] (a Q register) to Inst as a register
+/// operand. Returns Fail, leaving Inst untouched, when RegNo is greater than
+/// 31. Addr and Decoder are not used.
+static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  // Only indices 0..31 are looked up in the table.
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(VectorDecoderTable[RegNo]));
+  return Success;
+}
+
+// Lookup table of two-register Q tuples: entry N is the tuple that starts at
+// QN and continues with the next register, wrapping from Q31 back to Q0
+// (the last entry is Q31_Q0).
+static const unsigned QQDecoderTable[] = {
+ AArch64::Q0_Q1, AArch64::Q1_Q2, AArch64::Q2_Q3, AArch64::Q3_Q4,
+ AArch64::Q4_Q5, AArch64::Q5_Q6, AArch64::Q6_Q7, AArch64::Q7_Q8,
+ AArch64::Q8_Q9, AArch64::Q9_Q10, AArch64::Q10_Q11, AArch64::Q11_Q12,
+ AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16,
+ AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20,
+ AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24,
+ AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28,
+ AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0
+};
+
+/// Appends the two-register tuple QQDecoderTable[RegNo] to Inst as a register
+/// operand. Returns Fail, leaving Inst untouched, when RegNo is greater than
+/// 31. Addr and Decoder are not used.
+static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(QQDecoderTable[RegNo]));
+  return Success;
+}
+
+// Lookup table of three-register Q tuples: entry N is the tuple that starts
+// at QN and continues with the following registers, wrapping from Q31 back to
+// Q0 (the last entries are Q30_Q31_Q0 and Q31_Q0_Q1).
+static const unsigned QQQDecoderTable[] = {
+ AArch64::Q0_Q1_Q2, AArch64::Q1_Q2_Q3, AArch64::Q2_Q3_Q4,
+ AArch64::Q3_Q4_Q5, AArch64::Q4_Q5_Q6, AArch64::Q5_Q6_Q7,
+ AArch64::Q6_Q7_Q8, AArch64::Q7_Q8_Q9, AArch64::Q8_Q9_Q10,
+ AArch64::Q9_Q10_Q11, AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13,
+ AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16,
+ AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19,
+ AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22,
+ AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25,
+ AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28,
+ AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31,
+ AArch64::Q30_Q31_Q0, AArch64::Q31_Q0_Q1
+};
+
+/// Appends the three-register tuple QQQDecoderTable[RegNo] to Inst as a
+/// register operand. Returns Fail, leaving Inst untouched, when RegNo is
+/// greater than 31. Addr and Decoder are not used.
+static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(QQQDecoderTable[RegNo]));
+  return Success;
+}
+
+// Lookup table of four-register Q tuples: entry N is the tuple that starts at
+// QN and continues with the following registers, wrapping from Q31 back to Q0
+// (the last three entries wrap, ending with Q31_Q0_Q1_Q2).
+static const unsigned QQQQDecoderTable[] = {
+ AArch64::Q0_Q1_Q2_Q3, AArch64::Q1_Q2_Q3_Q4, AArch64::Q2_Q3_Q4_Q5,
+ AArch64::Q3_Q4_Q5_Q6, AArch64::Q4_Q5_Q6_Q7, AArch64::Q5_Q6_Q7_Q8,
+ AArch64::Q6_Q7_Q8_Q9, AArch64::Q7_Q8_Q9_Q10, AArch64::Q8_Q9_Q10_Q11,
+ AArch64::Q9_Q10_Q11_Q12, AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14,
+ AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17,
+ AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20,
+ AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23,
+ AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26,
+ AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29,
+ AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0,
+ AArch64::Q30_Q31_Q0_Q1, AArch64::Q31_Q0_Q1_Q2
+};
+
+/// Appends the four-register tuple QQQQDecoderTable[RegNo] to Inst as a
+/// register operand. Returns Fail, leaving Inst untouched, when RegNo is
+/// greater than 31. Addr and Decoder are not used.
+static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(QQQQDecoderTable[RegNo]));
+  return Success;
+}
+
+// Lookup table of two-register D tuples: entry N is the tuple that starts at
+// DN and continues with the next register, wrapping from D31 back to D0
+// (the last entry is D31_D0).
+static const unsigned DDDecoderTable[] = {
+ AArch64::D0_D1, AArch64::D1_D2, AArch64::D2_D3, AArch64::D3_D4,
+ AArch64::D4_D5, AArch64::D5_D6, AArch64::D6_D7, AArch64::D7_D8,
+ AArch64::D8_D9, AArch64::D9_D10, AArch64::D10_D11, AArch64::D11_D12,
+ AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16,
+ AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20,
+ AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24,
+ AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28,
+ AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0
+};
+
+/// Appends the two-register tuple DDDecoderTable[RegNo] to Inst as a register
+/// operand. Returns Fail, leaving Inst untouched, when RegNo is greater than
+/// 31. Addr and Decoder are not used.
+static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(DDDecoderTable[RegNo]));
+  return Success;
+}
+
+// Lookup table of three-register D tuples: entry N is the tuple that starts
+// at DN and continues with the following registers, wrapping from D31 back to
+// D0 (the last entries are D30_D31_D0 and D31_D0_D1).
+static const unsigned DDDDecoderTable[] = {
+ AArch64::D0_D1_D2, AArch64::D1_D2_D3, AArch64::D2_D3_D4,
+ AArch64::D3_D4_D5, AArch64::D4_D5_D6, AArch64::D5_D6_D7,
+ AArch64::D6_D7_D8, AArch64::D7_D8_D9, AArch64::D8_D9_D10,
+ AArch64::D9_D10_D11, AArch64::D10_D11_D12, AArch64::D11_D12_D13,
+ AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16,
+ AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19,
+ AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22,
+ AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25,
+ AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28,
+ AArch64::D27_D28_D29, AArch64::D28_D29_D30, AArch64::D29_D30_D31,
+ AArch64::D30_D31_D0, AArch64::D31_D0_D1
+};
+
+/// Appends the three-register tuple DDDDecoderTable[RegNo] to Inst as a
+/// register operand. Returns Fail, leaving Inst untouched, when RegNo is
+/// greater than 31. Addr and Decoder are not used.
+static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(DDDDecoderTable[RegNo]));
+  return Success;
+}
+
+// Lookup table of four-register D tuples: entry N is the tuple that starts at
+// DN and continues with the following registers, wrapping from D31 back to D0
+// (the last three entries wrap, ending with D31_D0_D1_D2).
+static const unsigned DDDDDecoderTable[] = {
+ AArch64::D0_D1_D2_D3, AArch64::D1_D2_D3_D4, AArch64::D2_D3_D4_D5,
+ AArch64::D3_D4_D5_D6, AArch64::D4_D5_D6_D7, AArch64::D5_D6_D7_D8,
+ AArch64::D6_D7_D8_D9, AArch64::D7_D8_D9_D10, AArch64::D8_D9_D10_D11,
+ AArch64::D9_D10_D11_D12, AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14,
+ AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17,
+ AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20,
+ AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23,
+ AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26,
+ AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29,
+ AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0,
+ AArch64::D30_D31_D0_D1, AArch64::D31_D0_D1_D2
+};
+
+/// Appends the four-register tuple DDDDDecoderTable[RegNo] to Inst as a
+/// register operand. Returns Fail, leaving Inst untouched, when RegNo is
+/// greater than 31. Addr and Decoder are not used.
+static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo >= 32)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createReg(DDDDDecoderTable[RegNo]));
+  return Success;
+}
+
+/// Appends the immediate 64 - (Imm | 0x20) to Inst and returns Success.
+/// Addr and Decoder are not used.
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  // scale{5} is asserted as 1 in tblgen, so put that bit back before
+  // converting the field.
+  const unsigned Scale = Imm | 0x20;
+  Inst.addOperand(MCOperand::createImm(64 - Scale));
+  return Success;
+}
+
+/// Appends the immediate 64 - Imm to Inst and returns Success.
+/// Addr and Decoder are not used.
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  const unsigned Scale = 64 - Imm;
+  Inst.addOperand(MCOperand::createImm(Scale));
+  return Success;
+}
+
+/// Decodes a 19-bit signed label field.
+///
+/// The field is sign-extended, then offered to the disassembler's symbolizer
+/// as a byte offset (field * 4). If no symbolic operand is added, the
+/// sign-extended field itself (not multiplied by 4) is appended as an
+/// immediate. Always returns Success.
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+                                       uint64_t Addr, const void *Decoder) {
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend the 19-bit immediate: if bit 18 is set, fill every bit above.
+  int64_t Offset = Imm;
+  const bool IsNegative = (Offset & (1 << 18)) != 0;
+  if (IsNegative)
+    Offset |= ~((1LL << 19) - 1);
+
+  // Every opcode routed here except LDRXl is reported to the symbolizer with
+  // the flag set.
+  const bool IsBranch = Inst.getOpcode() != AArch64::LDRXl;
+  const bool AddedSymbol =
+      Dis->tryAddingSymbolicOperand(Inst, Offset * 4, Addr, IsBranch, 0, 4);
+  if (!AddedSymbol)
+    Inst.addOperand(MCOperand::createImm(Offset));
+  return Success;
+}
+
+/// Splits the low two bits of Imm into two immediate operands: bit 1 is
+/// appended first, then bit 0. Always returns Success.
+/// Address and Decoder are not used.
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+                                    uint64_t Address, const void *Decoder) {
+  const unsigned Bit1 = (Imm >> 1) & 1;
+  const unsigned Bit0 = Imm & 1;
+  Inst.addOperand(MCOperand::createImm(Bit1));
+  Inst.addOperand(MCOperand::createImm(Bit0));
+  return Success;
+}
+
+/// Appends the raw system-register encoding Imm to Inst as an immediate.
+///
+/// Every system register in the encoding space is valid with the syntax
+/// S<op0>_<op1>_<Cn>_<Cm>_<op2>, so decoding system registers always succeeds.
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  const MCOperand SysReg = MCOperand::createImm(Imm);
+  Inst.addOperand(SysReg);
+  return Success;
+}
+
+/// Appends the raw system-register encoding Imm to Inst as an immediate and
+/// returns Success; no validation is performed on the value.
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  const MCOperand SysReg = MCOperand::createImm(Imm);
+  Inst.addOperand(SysReg);
+  return Success;
+}
+
+/// Decodes an FMOV between a 64-bit general-purpose register and a 128-bit
+/// FP register.
+///
+/// This decoder exists to add the dummy Lane operand to the MCInst, which must
+/// be 1 in assembly but has no other real manifestation. Operand order is
+/// destination (Rd, insn bits 4:0), source (Rn, insn bits 9:5), lane. Bit 16
+/// selects the direction: set means the destination is the FP register.
+/// Always returns Success.
+static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  const unsigned Rd = fieldFromInstruction(Insn, 0, 5);
+  const unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  const bool IsToVec = fieldFromInstruction(Insn, 16, 1) != 0;
+
+  if (!IsToVec) {
+    // FP register -> general-purpose register.
+    DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
+  } else {
+    // General-purpose register -> FP register.
+    DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
+  }
+
+  // The lane operand is always 1.
+  Inst.addOperand(MCOperand::createImm(1));
+  return Success;
+}
+
+/// Shared helper for the right-shift immediate decoders: appends Add - Imm to
+/// Inst as an immediate and returns Success.
+static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  const unsigned ShiftAmount = Add - Imm;
+  Inst.addOperand(MCOperand::createImm(ShiftAmount));
+  return Success;
+}
+
+/// Shared helper for the left-shift immediate decoders: appends
+/// (Imm + Add) & (Add - 1) to Inst as an immediate and returns Success.
+static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  const unsigned Mask = Add - 1;
+  const unsigned ShiftAmount = (Imm + Add) & Mask;
+  Inst.addOperand(MCOperand::createImm(ShiftAmount));
+  return Success;
+}
+
+/// Right-shift immediate decoder that forwards to DecodeVecShiftRImm with
+/// Add = 64, so the appended operand is 64 - Imm. Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  const unsigned Add = 64;
+  return DecodeVecShiftRImm(Inst, Imm, Add);
+}
+
+/// Right-shift immediate decoder that ORs 0x20 into Imm and forwards to
+/// DecodeVecShiftRImm with Add = 64, so the appended operand is
+/// 64 - (Imm | 0x20). Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  const unsigned Encoded = Imm | 0x20;
+  return DecodeVecShiftRImm(Inst, Encoded, 64);
+}
+
+/// Right-shift immediate decoder that forwards to DecodeVecShiftRImm with
+/// Add = 32, so the appended operand is 32 - Imm. Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  const unsigned Add = 32;
+  return DecodeVecShiftRImm(Inst, Imm, Add);
+}
+
+/// Right-shift immediate decoder that ORs 0x10 into Imm and forwards to
+/// DecodeVecShiftRImm with Add = 32, so the appended operand is
+/// 32 - (Imm | 0x10). Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  const unsigned Encoded = Imm | 0x10;
+  return DecodeVecShiftRImm(Inst, Encoded, 32);
+}
+
+/// Right-shift immediate decoder that forwards to DecodeVecShiftRImm with
+/// Add = 16, so the appended operand is 16 - Imm. Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  const unsigned Add = 16;
+  return DecodeVecShiftRImm(Inst, Imm, Add);
+}
+
+/// Right-shift immediate decoder that ORs 0x8 into Imm and forwards to
+/// DecodeVecShiftRImm with Add = 16, so the appended operand is
+/// 16 - (Imm | 0x8). Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  const unsigned Encoded = Imm | 0x8;
+  return DecodeVecShiftRImm(Inst, Encoded, 16);
+}
+
+/// Right-shift immediate decoder that forwards to DecodeVecShiftRImm with
+/// Add = 8, so the appended operand is 8 - Imm. Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  const unsigned Add = 8;
+  return DecodeVecShiftRImm(Inst, Imm, Add);
+}
+
+/// Left-shift immediate decoder that forwards to DecodeVecShiftLImm with
+/// Add = 64, so the appended operand is (Imm + 64) & 63.
+/// Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  const unsigned Add = 64;
+  return DecodeVecShiftLImm(Inst, Imm, Add);
+}
+
+/// Left-shift immediate decoder that forwards to DecodeVecShiftLImm with
+/// Add = 32, so the appended operand is (Imm + 32) & 31.
+/// Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  const unsigned Add = 32;
+  return DecodeVecShiftLImm(Inst, Imm, Add);
+}
+
+/// Left-shift immediate decoder that forwards to DecodeVecShiftLImm with
+/// Add = 16, so the appended operand is (Imm + 16) & 15.
+/// Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  const unsigned Add = 16;
+  return DecodeVecShiftLImm(Inst, Imm, Add);
+}
+
+/// Left-shift immediate decoder that forwards to DecodeVecShiftLImm with
+/// Add = 8, so the appended operand is (Imm + 8) & 7.
+/// Addr and Decoder are unused.
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  const unsigned Add = 8;
+  return DecodeVecShiftLImm(Inst, Imm, Add);
+}
+
+/// Decodes the shifted-register ("rs") ADD/SUB and logical instructions listed
+/// in the switch below.
+///
+/// Operands appended to Inst, in order: Rd (insn bits 4:0), Rn (bits 9:5) and
+/// Rm (bits 20:16) -- GPR32 for the W opcodes, GPR64 for the X opcodes --
+/// followed by one immediate that packs the 2-bit shift field (insn bits
+/// 23:22) into bits 7:6 and the 6-bit amount (insn bits 15:10) into bits 5:0.
+///
+/// Returns Fail, without adding operands, when the opcode is not handled
+/// here, when an ADD/SUB form has shift field 0b11, or when a W form has bit 5
+/// of the amount set. Otherwise returns Success.
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
+                                                   const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(insn, 16, 5);
+  unsigned shiftHi = fieldFromInstruction(insn, 22, 2);
+  unsigned shiftLo = fieldFromInstruction(insn, 10, 6);
+  unsigned shift = (shiftHi << 6) | shiftLo;
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::ADDWrs:
+  case AArch64::ADDSWrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBSWrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // The remaining checks are shared with the 32-bit logical forms; use the
+    // same fall-through annotation as the other decoders in this file.
+    LLVM_FALLTHROUGH;
+  case AArch64::ANDWrs:
+  case AArch64::ANDSWrs:
+  case AArch64::BICWrs:
+  case AArch64::BICSWrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORNWrs:
+  case AArch64::EORWrs:
+  case AArch64::EONWrs: {
+    // if sf == '0' and imm6<5> == '1' then ReservedValue()
+    if (shiftLo >> 5 == 1)
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  }
+  case AArch64::ADDXrs:
+  case AArch64::ADDSXrs:
+  case AArch64::SUBXrs:
+  case AArch64::SUBSXrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // Register decoding is shared with the 64-bit logical forms.
+    LLVM_FALLTHROUGH;
+  case AArch64::ANDXrs:
+  case AArch64::ANDSXrs:
+  case AArch64::BICXrs:
+  case AArch64::BICSXrs:
+  case AArch64::ORRXrs:
+  case AArch64::ORNXrs:
+  case AArch64::EORXrs:
+  case AArch64::EONXrs:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  }
+
+  // Combined shift-type / shift-amount immediate.
+  Inst.addOperand(MCOperand::createImm(shift));
+  return Success;
+}
+
+/// Decodes the MOVZ/MOVN/MOVK immediate instructions listed in the switch.
+///
+/// Operands appended to Inst, in order: Rd (insn bits 4:0; GPR32 for the W
+/// opcodes, GPR64 for the X opcodes); for MOVK only, a second copy of operand
+/// 0; the 16-bit immediate (insn bits 20:5); and the shift, which is the
+/// 2-bit field at insn bits 22:21 multiplied by 16.
+///
+/// Returns Fail, without adding operands, for an opcode not handled here or
+/// for a W opcode whose shift has bit 5 set (i.e. a shift of 32 or 48).
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  const unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  const unsigned Imm16 = fieldFromInstruction(insn, 5, 16);
+  const unsigned ShiftAmount = fieldFromInstruction(insn, 21, 2) << 4;
+  const unsigned Opcode = Inst.getOpcode();
+
+  switch (Opcode) {
+  case AArch64::MOVZXi:
+  case AArch64::MOVNXi:
+  case AArch64::MOVKXi:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  case AArch64::MOVZWi:
+  case AArch64::MOVNWi:
+  case AArch64::MOVKWi:
+    // The 32-bit forms reject any shift with bit 5 set.
+    if (ShiftAmount & (1U << 5))
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  default:
+    return Fail;
+  }
+
+  // MOVK carries the destination register a second time.
+  const bool IsMovK =
+      Opcode == AArch64::MOVKWi || Opcode == AArch64::MOVKXi;
+  if (IsMovK)
+    Inst.addOperand(Inst.getOperand(0));
+
+  Inst.addOperand(MCOperand::createImm(Imm16));
+  Inst.addOperand(MCOperand::createImm(ShiftAmount));
+  return Success;
+}
+
+/// Decodes the unsigned-offset ("ui") load/store and prefetch instructions
+/// listed in the switch below.
+///
+/// Operands appended to Inst, in order: the transfer operand taken from insn
+/// bits 4:0 (a register of the class selected by the opcode, or a plain
+/// immediate for PRFMui); the base register Rn (insn bits 9:5) decoded with
+/// the GPR64sp class; and the 12-bit offset (insn bits 21:10), added either
+/// symbolically by the disassembler or as a raw immediate.
+///
+/// Returns Fail, without adding operands, for an opcode not handled here;
+/// otherwise Success.
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn, uint64_t Addr,
+                                                  const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned offset = fieldFromInstruction(insn, 10, 12);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::PRFMui:
+    // Rt is an immediate in prefetch.
+    Inst.addOperand(MCOperand::createImm(Rt));
+    break;
+  case AArch64::STRBBui:
+  case AArch64::LDRBBui:
+  case AArch64::LDRSBWui:
+  case AArch64::STRHHui:
+  case AArch64::LDRHHui:
+  case AArch64::LDRSHWui:
+  case AArch64::STRWui:
+  case AArch64::LDRWui:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::STRXui:
+  case AArch64::LDRXui:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRQui:
+  case AArch64::STRQui:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRDui:
+  case AArch64::STRDui:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSui:
+  case AArch64::STRSui:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRHui:
+  case AArch64::STRHui:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRBui:
+  case AArch64::STRBui:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  // The fourth argument is a boolean flag, not a DecodeStatus. Spell it as
+  // false instead of passing the status constant Fail and relying on its
+  // implicit conversion to bool.
+  if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, /*IsBranch=*/false, 0,
+                                     4))
+    Inst.addOperand(MCOperand::createImm(offset));
+  return Success;
+}
+
+/// Decodes the 9-bit signed-offset load/store family: unscaled (LDUR/STUR),
+/// unprivileged (LDTR/STTR), pre-/post-indexed forms and PRFUMi.
+///
+/// Operands appended to Inst, in order: for the pre-/post-indexed opcodes
+/// only, the written-back base register Rn (GPR64sp); the transfer operand
+/// from insn bits 4:0 (a register of the class selected by the opcode, or a
+/// plain immediate for PRFUMi); the base register Rn (insn bits 9:5, GPR64sp);
+/// and the sign-extended offset from insn bits 20:12.
+///
+/// Returns Fail, without adding operands, for an opcode not handled here.
+/// Returns SoftFail (after adding all operands) for an indexed,
+/// non-FP load whose transfer register equals a base register other than 31.
+/// Otherwise returns Success.
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  const unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  const unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  const unsigned Opcode = Inst.getOpcode();
+
+  // Fields consulted by the writeback sanity check at the end.
+  const bool IsLoad = fieldFromInstruction(insn, 22, 1);
+  const bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0;
+  const bool IsFP = fieldFromInstruction(insn, 26, 1);
+
+  // The offset is a 9-bit signed immediate: when bit 8 is set, fill all the
+  // higher bits to sign-extend it.
+  int64_t SignedOffset = fieldFromInstruction(insn, 12, 9);
+  if (SignedOffset & (1 << 8))
+    SignedOffset |= ~((1LL << 9) - 1);
+
+  // Indexed forms list the written-back address register as their first
+  // operand; every other opcode skips this step.
+  switch (Opcode) {
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    break;
+  default:
+    break;
+  }
+
+  // Transfer operand, with the register class chosen by opcode.
+  switch (Opcode) {
+  default:
+    return Fail;
+  case AArch64::PRFUMi:
+    // Rt is an immediate in prefetch.
+    Inst.addOperand(MCOperand::createImm(Rt));
+    break;
+  case AArch64::STURBBi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBWi:
+  case AArch64::STURHHi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURSHWi:
+  case AArch64::STURWi:
+  case AArch64::LDURWi:
+  case AArch64::LDTRSBWi:
+  case AArch64::LDTRSHWi:
+  case AArch64::STTRWi:
+  case AArch64::LDTRWi:
+  case AArch64::STTRHi:
+  case AArch64::LDTRHi:
+  case AArch64::LDTRBi:
+  case AArch64::STTRBi:
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSWi:
+  case AArch64::STURXi:
+  case AArch64::LDURXi:
+  case AArch64::LDTRSBXi:
+  case AArch64::LDTRSHXi:
+  case AArch64::LDTRSWi:
+  case AArch64::STTRXi:
+  case AArch64::LDTRXi:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURQi:
+  case AArch64::STURQi:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURDi:
+  case AArch64::STURDi:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSi:
+  case AArch64::STURSi:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURHi:
+  case AArch64::STURHi:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURBi:
+  case AArch64::STURBi:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  Inst.addOperand(MCOperand::createImm(SignedOffset));
+
+  // Cannot write back to a transfer register (but xzr != sp).
+  const bool WritesBackToTransferReg =
+      IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn;
+  return WritesBackToTransferReg ? SoftFail : Success;
+}
+
+/// Decodes the exclusive / acquire-release load and store instructions listed
+/// in the switch below.
+///
+/// Operands appended to Inst, in order: for the store-exclusive opcodes, the
+/// register Rs (insn bits 20:16, always GPR32); the transfer register Rt (insn
+/// bits 4:0); for the pair opcodes, the second transfer register Rt2 (insn
+/// bits 14:10); and finally the base register Rn (insn bits 9:5, GPR64sp).
+///
+/// Returns Fail, without adding operands, for an opcode not handled here.
+/// Returns SoftFail (after adding all operands) for a load-pair opcode whose
+/// two transfer registers are the same. Otherwise returns Success.
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
+                                                   const void *Decoder) {
+  const unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  const unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  const unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+  const unsigned Rs = fieldFromInstruction(insn, 16, 5);
+
+  const unsigned Opcode = Inst.getOpcode();
+  const bool IsLoadPair =
+      Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW ||
+      Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX;
+
+  switch (Opcode) {
+  default:
+    return Fail;
+
+  // Single 32-bit (or narrower) transfer register.
+  case AArch64::STLXRW:
+  case AArch64::STLXRB:
+  case AArch64::STLXRH:
+  case AArch64::STXRW:
+  case AArch64::STXRB:
+  case AArch64::STXRH:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    LLVM_FALLTHROUGH;
+  case AArch64::LDARW:
+  case AArch64::LDARB:
+  case AArch64::LDARH:
+  case AArch64::LDAXRW:
+  case AArch64::LDAXRB:
+  case AArch64::LDAXRH:
+  case AArch64::LDXRW:
+  case AArch64::LDXRB:
+  case AArch64::LDXRH:
+  case AArch64::STLRW:
+  case AArch64::STLRB:
+  case AArch64::STLRH:
+  case AArch64::STLLRW:
+  case AArch64::STLLRB:
+  case AArch64::STLLRH:
+  case AArch64::LDLARW:
+  case AArch64::LDLARB:
+  case AArch64::LDLARH:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+
+  // Single 64-bit transfer register.
+  case AArch64::STLXRX:
+  case AArch64::STXRX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    LLVM_FALLTHROUGH;
+  case AArch64::LDARX:
+  case AArch64::LDAXRX:
+  case AArch64::LDXRX:
+  case AArch64::STLRX:
+  case AArch64::LDLARX:
+  case AArch64::STLLRX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+
+  // Pair of 32-bit transfer registers.
+  case AArch64::STLXPW:
+  case AArch64::STXPW:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    LLVM_FALLTHROUGH;
+  case AArch64::LDAXPW:
+  case AArch64::LDXPW:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+
+  // Pair of 64-bit transfer registers.
+  case AArch64::STLXPX:
+  case AArch64::STXPX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    LLVM_FALLTHROUGH;
+  case AArch64::LDAXPX:
+  case AArch64::LDXPX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+
+  // You shouldn't load to the same register twice in an instruction...
+  if (IsLoadPair && Rt == Rt2)
+    return SoftFail;
+
+  return Success;
+}
+
+/// Decodes the load/store pair instructions (LDP/STP/LDNP/STNP/LDPSW in their
+/// offset, pre-indexed and post-indexed forms) listed in the switches below.
+///
+/// Operands appended to Inst, in order: for the pre-/post-indexed opcodes
+/// only, the written-back base register Rn (GPR64sp); the transfer registers
+/// Rt (insn bits 4:0) and Rt2 (insn bits 14:10) in the class selected by the
+/// opcode; the base register Rn (insn bits 9:5, GPR64sp); and the
+/// sign-extended 7-bit offset from insn bits 21:15.
+///
+/// Returns Fail, without adding operands, for an opcode not handled here.
+/// Returns SoftFail (after adding all operands) when a load names the same
+/// transfer register twice, or when a general-purpose writeback form uses a
+/// base register other than 31 that is also a transfer register.
+/// Otherwise returns Success.
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  const unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  const unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  const unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+  const bool IsLoad = fieldFromInstruction(insn, 22, 1);
+
+  // The offset is a 7-bit signed immediate: when bit 6 is set, fill all the
+  // higher bits to sign-extend it.
+  int64_t SignedOffset = fieldFromInstruction(insn, 15, 7);
+  if (SignedOffset & (1 << 6))
+    SignedOffset |= ~((1LL << 7) - 1);
+
+  const unsigned Opcode = Inst.getOpcode();
+
+  // Writeback forms list the base register as their first operand; every
+  // other opcode skips this step.
+  switch (Opcode) {
+  case AArch64::LDPXpost:
+  case AArch64::STPXpost:
+  case AArch64::LDPSWpost:
+  case AArch64::LDPXpre:
+  case AArch64::STPXpre:
+  case AArch64::LDPSWpre:
+  case AArch64::LDPWpost:
+  case AArch64::STPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::STPWpre:
+  case AArch64::LDPQpost:
+  case AArch64::STPQpost:
+  case AArch64::LDPQpre:
+  case AArch64::STPQpre:
+  case AArch64::LDPDpost:
+  case AArch64::STPDpost:
+  case AArch64::LDPDpre:
+  case AArch64::STPDpre:
+  case AArch64::LDPSpost:
+  case AArch64::STPSpost:
+  case AArch64::LDPSpre:
+  case AArch64::STPSpre:
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    break;
+  default:
+    break;
+  }
+
+  // Set for the general-purpose writeback forms, whose base register must not
+  // also be one of the transfer registers.
+  bool NeedsDisjointWritebackTransfer = false;
+
+  switch (Opcode) {
+  default:
+    return Fail;
+  case AArch64::LDPXpost:
+  case AArch64::STPXpost:
+  case AArch64::LDPSWpost:
+  case AArch64::LDPXpre:
+  case AArch64::STPXpre:
+  case AArch64::LDPSWpre:
+    NeedsDisjointWritebackTransfer = true;
+    LLVM_FALLTHROUGH;
+  case AArch64::LDNPXi:
+  case AArch64::STNPXi:
+  case AArch64::LDPXi:
+  case AArch64::STPXi:
+  case AArch64::LDPSWi:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDPWpost:
+  case AArch64::STPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::STPWpre:
+    NeedsDisjointWritebackTransfer = true;
+    LLVM_FALLTHROUGH;
+  case AArch64::LDNPWi:
+  case AArch64::STNPWi:
+  case AArch64::LDPWi:
+  case AArch64::STPWi:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPQi:
+  case AArch64::STNPQi:
+  case AArch64::LDPQpost:
+  case AArch64::STPQpost:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+  case AArch64::LDPQpre:
+  case AArch64::STPQpre:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPDi:
+  case AArch64::STNPDi:
+  case AArch64::LDPDpost:
+  case AArch64::STPDpost:
+  case AArch64::LDPDi:
+  case AArch64::STPDi:
+  case AArch64::LDPDpre:
+  case AArch64::STPDpre:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPSi:
+  case AArch64::STNPSi:
+  case AArch64::LDPSpost:
+  case AArch64::STPSpost:
+  case AArch64::LDPSi:
+  case AArch64::STPSi:
+  case AArch64::LDPSpre:
+  case AArch64::STPSpre:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  Inst.addOperand(MCOperand::createImm(SignedOffset));
+
+  // You shouldn't load to the same register twice in an instruction...
+  const bool LoadsSameRegTwice = IsLoad && Rt == Rt2;
+
+  // ... or do any operation that writes-back to a transfer register. But note
+  // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different.
+  const bool WritesBackToTransferReg =
+      NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn);
+
+  if (LoadsSameRegTwice || WritesBackToTransferReg)
+    return SoftFail;
+
+  return Success;
+}
+
+/// Decodes the extended-register ("rx" / "rx64") ADD/SUB instructions listed
+/// in the switch below.
+///
+/// Operands appended to Inst, in order: Rd (insn bits 4:0), Rn (insn bits
+/// 9:5), Rm (insn bits 20:16) and the raw 6-bit extend field (insn bits
+/// 15:10). Rn always uses an "sp" register class; Rd uses one only for the
+/// non-flag-setting opcodes. Rm is GPR32 except for the rx64 opcodes, where
+/// it is GPR64.
+///
+/// Returns Fail, without adding operands, when the low three bits of the
+/// extend field exceed 4 or the opcode is not handled here.
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  const unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  const unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  const unsigned Rm = fieldFromInstruction(insn, 16, 5);
+  const unsigned ExtendField = fieldFromInstruction(insn, 10, 6);
+
+  // The low three bits hold the shift amount; values above 4 are rejected.
+  if ((ExtendField & 0x7) > 4)
+    return Fail;
+
+  const unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  case AArch64::ADDWrx:
+  case AArch64::SUBWrx:
+    DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDSWrx:
+  case AArch64::SUBSWrx:
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDXrx:
+  case AArch64::SUBXrx:
+    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDSXrx:
+  case AArch64::SUBSXrx:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDXrx64:
+  case AArch64::SUBXrx64:
+    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::SUBSXrx64:
+  case AArch64::ADDSXrx64:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  default:
+    return Fail;
+  }
+
+  Inst.addOperand(MCOperand::createImm(ExtendField));
+  return Success;
+}
+
+ /// Decode a logical (immediate) instruction: appends Rd, Rn and the encoded
+ /// immediate field to \p Inst. The ANDS opcodes use the plain GPR class for
+ /// Rd; every other opcode uses the "sp" class. Returns Fail when the encoded
+ /// immediate is rejected by isValidDecodeLogicalImmediate for the register
+ /// width.
+ static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+                                                 uint32_t insn, uint64_t Addr,
+                                                 const void *Decoder) {
+   const unsigned DstReg = fieldFromInstruction(insn, 0, 5);
+   const unsigned SrcReg = fieldFromInstruction(insn, 5, 5);
+   const bool Is64Bit = fieldFromInstruction(insn, 31, 1) != 0;
+   unsigned EncodedImm;
+
+   if (!Is64Bit) {
+     if (Inst.getOpcode() != AArch64::ANDSWri)
+       DecodeGPR32spRegisterClass(Inst, DstReg, Addr, Decoder);
+     else
+       DecodeGPR32RegisterClass(Inst, DstReg, Addr, Decoder);
+     DecodeGPR32RegisterClass(Inst, SrcReg, Addr, Decoder);
+     // The 32-bit form reads a 12-bit immediate field.
+     EncodedImm = fieldFromInstruction(insn, 10, 12);
+     if (!AArch64_AM::isValidDecodeLogicalImmediate(EncodedImm, 32))
+       return Fail;
+   } else {
+     if (Inst.getOpcode() != AArch64::ANDSXri)
+       DecodeGPR64spRegisterClass(Inst, DstReg, Addr, Decoder);
+     else
+       DecodeGPR64RegisterClass(Inst, DstReg, Addr, Decoder);
+     DecodeGPR64RegisterClass(Inst, SrcReg, Addr, Decoder);
+     // The 64-bit form reads a 13-bit immediate field.
+     EncodedImm = fieldFromInstruction(insn, 10, 13);
+     if (!AArch64_AM::isValidDecodeLogicalImmediate(EncodedImm, 64))
+       return Fail;
+   }
+
+   Inst.addOperand(MCOperand::createImm(EncodedImm));
+   return Success;
+ }
+
+ /// Decode a modified-immediate instruction: appends the destination register
+ /// (FPR64 for MOVID, a vector register otherwise), the 8-bit immediate
+ /// assembled from bits [18:16] and [9:5], and for the shifted MOVI/MVNI
+ /// opcodes a shift operand derived from cmode. Always returns Success.
+ static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+   const unsigned DstReg = fieldFromInstruction(insn, 0, 5);
+   const unsigned CMode = fieldFromInstruction(insn, 12, 4);
+   const unsigned Imm8 = (fieldFromInstruction(insn, 16, 3) << 5) |
+                         fieldFromInstruction(insn, 5, 5);
+   const unsigned Opc = Inst.getOpcode();
+
+   if (Opc != AArch64::MOVID)
+     DecodeVectorRegisterClass(Inst, DstReg, Addr, Decoder);
+   else
+     DecodeFPR64RegisterClass(Inst, DstReg, Addr, Decoder);
+
+   Inst.addOperand(MCOperand::createImm(Imm8));
+
+   switch (Opc) {
+   case AArch64::MOVIv4i16:
+   case AArch64::MOVIv8i16:
+   case AArch64::MVNIv4i16:
+   case AArch64::MVNIv8i16:
+   case AArch64::MOVIv2i32:
+   case AArch64::MOVIv4i32:
+   case AArch64::MVNIv2i32:
+   case AArch64::MVNIv4i32:
+     // Shift operand is cmode bits [2:1] scaled by 8, i.e. 0, 8, 16 or 24.
+     Inst.addOperand(MCOperand::createImm((CMode & 6) << 2));
+     break;
+   case AArch64::MOVIv2s_msl:
+   case AArch64::MOVIv4s_msl:
+   case AArch64::MVNIv2s_msl:
+   case AArch64::MVNIv4s_msl:
+     // cmode bit 0 selects between the two shifter encodings.
+     Inst.addOperand(MCOperand::createImm((CMode & 1) ? 0x110 : 0x108));
+     break;
+   default:
+     // Remaining opcodes carry no shift operand.
+     break;
+   }
+
+   return Success;
+ }
+
+ /// Decode a modified-immediate instruction whose destination is tied to a
+ /// source: the vector register is appended twice, followed by the 8-bit
+ /// immediate and the shift derived from cmode. Always returns Success.
+ static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+                                                 uint32_t insn, uint64_t Addr,
+                                                 const void *Decoder) {
+   const unsigned VecReg = fieldFromInstruction(insn, 0, 5);
+   const unsigned CMode = fieldFromInstruction(insn, 12, 4);
+   const unsigned Imm8 = (fieldFromInstruction(insn, 16, 3) << 5) |
+                         fieldFromInstruction(insn, 5, 5);
+
+   // The tied operand appears both as the def and as the use.
+   for (int Copy = 0; Copy < 2; ++Copy)
+     DecodeVectorRegisterClass(Inst, VecReg, Addr, Decoder);
+
+   Inst.addOperand(MCOperand::createImm(Imm8));
+   // cmode bits [2:1] scaled by 8 give the shift amount.
+   Inst.addOperand(MCOperand::createImm((CMode & 6) << 2));
+
+   return Success;
+ }
+
+ /// Decode a PC-relative address instruction: appends Rd and then either a
+ /// symbolic operand (if the disassembler's symbolizer supplies one) or the
+ /// sign-extended 21-bit immediate. Always returns Success.
+ static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                          uint64_t Addr, const void *Decoder) {
+   unsigned Rd = fieldFromInstruction(insn, 0, 5);
+   // Bits [23:5] form the upper 19 bits of the immediate, bits [30:29] the
+   // low two bits.
+   int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
+   imm |= fieldFromInstruction(insn, 29, 2);
+   const AArch64Disassembler *Dis =
+       static_cast<const AArch64Disassembler *>(Decoder);
+
+   // Sign-extend the 21-bit immediate.
+   if (imm & (1 << (21 - 1)))
+     imm |= ~((1LL << 21) - 1);
+
+   DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+   // The fourth argument is the boolean IsBranch flag. This is not a branch,
+   // so pass `false` explicitly rather than relying on the DecodeStatus
+   // enumerator `Fail` converting to false.
+   if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, /*IsBranch=*/false, 0,
+                                      4))
+     Inst.addOperand(MCOperand::createImm(imm));
+
+   return Success;
+ }
+
+ /// Decode an add/subtract (immediate) instruction: appends Rd and Rn with the
+ /// register classes selected by the size and flag-setting bits, then either
+ /// a symbolic operand or the 12-bit immediate, then the shift amount (0 or
+ /// 12). Returns Fail when the 2-bit shifter field is neither 0 nor 1.
+ static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+                                         uint64_t Addr, const void *Decoder) {
+   unsigned Rd = fieldFromInstruction(insn, 0, 5);
+   unsigned Rn = fieldFromInstruction(insn, 5, 5);
+   // 14-bit field: 12-bit immediate in the low bits, 2-bit shifter above it.
+   unsigned Imm = fieldFromInstruction(insn, 10, 14);
+   unsigned S = fieldFromInstruction(insn, 29, 1);
+   unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+
+   unsigned ShifterVal = (Imm >> 12) & 3;
+   unsigned ImmVal = Imm & 0xFFF;
+   const AArch64Disassembler *Dis =
+       static_cast<const AArch64Disassembler *>(Decoder);
+
+   if (ShifterVal != 0 && ShifterVal != 1)
+     return Fail;
+
+   if (Datasize) {
+     // Register 31 as destination of the non-flag-setting form is decoded
+     // with the "sp" register class.
+     if (Rd == 31 && !S)
+       DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+     else
+       DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+     DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+   } else {
+     if (Rd == 31 && !S)
+       DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+     else
+       DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+     DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+   }
+
+   // The symbolizer receives the full 14-bit field (immediate plus shifter).
+   // The fourth argument is the boolean IsBranch flag; pass `false`
+   // explicitly instead of the DecodeStatus enumerator `Fail`.
+   if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, /*IsBranch=*/false, 0,
+                                      4))
+     Inst.addOperand(MCOperand::createImm(ImmVal));
+   Inst.addOperand(MCOperand::createImm(12 * ShifterVal));
+   return Success;
+ }
+
+ /// Decode an unconditional branch: appends either a symbolic operand for the
+ /// byte offset (immediate * 4) or the sign-extended 26-bit immediate itself.
+ /// Always returns Success.
+ static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+   const AArch64Disassembler *Dis =
+       static_cast<const AArch64Disassembler *>(Decoder);
+
+   int64_t Offset = fieldFromInstruction(insn, 0, 26);
+
+   // Bit 25 is the sign bit of the 26-bit field; propagate it upwards.
+   const bool IsNegative = (Offset & (1 << (26 - 1))) != 0;
+   if (IsNegative)
+     Offset |= ~((1LL << 26) - 1);
+
+   const bool AddedSymbol =
+       Dis->tryAddingSymbolicOperand(Inst, Offset * 4, Addr, true, 0, 4);
+   if (!AddedSymbol)
+     Inst.addOperand(MCOperand::createImm(Offset));
+
+   return Success;
+ }
+
+ /// Decode a system PSTATE instruction: appends the combined (op1:op2) pstate
+ /// field and CRm as immediates. Returns Fail for PAN/UAO with CRm > 1 (before
+ /// any operand is added), and also when the pstate encoding is unknown or
+ /// not enabled by the subtarget's feature bits (after the operands were
+ /// added).
+ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
+                                                   const void *Decoder) {
+   const uint64_t Op1 = fieldFromInstruction(insn, 16, 3);
+   const uint64_t Op2 = fieldFromInstruction(insn, 5, 3);
+   const uint64_t CRm = fieldFromInstruction(insn, 8, 4);
+   const uint64_t PStateField = (Op1 << 3) | Op2;
+
+   const bool IsPanOrUao = PStateField == AArch64PState::PAN ||
+                           PStateField == AArch64PState::UAO;
+   if (IsPanOrUao && CRm > 1)
+     return Fail;
+
+   Inst.addOperand(MCOperand::createImm(PStateField));
+   Inst.addOperand(MCOperand::createImm(CRm));
+
+   const AArch64Disassembler *Dis =
+       static_cast<const AArch64Disassembler *>(Decoder);
+   auto PState = AArch64PState::lookupPStateByEncoding(PStateField);
+   if (!PState)
+     return Fail;
+   if (!PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits()))
+     return Fail;
+   return Success;
+ }
+
+ /// Decode a test-bit-and-branch instruction: appends Rt (32- or 64-bit
+ /// register class depending on bit 31), the 6-bit bit number, and either a
+ /// symbolic operand for the byte offset or the sign-extended 14-bit
+ /// immediate. Always returns Success.
+ static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+                                         uint64_t Addr, const void *Decoder) {
+   const AArch64Disassembler *Dis =
+       static_cast<const AArch64Disassembler *>(Decoder);
+
+   const uint64_t TestReg = fieldFromInstruction(insn, 0, 5);
+   const bool Uses64BitReg = fieldFromInstruction(insn, 31, 1) != 0;
+
+   // Bit 31 supplies the top bit of the bit number, bits [23:19] the rest.
+   uint64_t BitNo = fieldFromInstruction(insn, 31, 1) << 5;
+   BitNo |= fieldFromInstruction(insn, 19, 5);
+
+   // Sign-extend the 14-bit branch offset.
+   int64_t Target = fieldFromInstruction(insn, 5, 14);
+   if (Target & (1 << (14 - 1)))
+     Target |= ~((1LL << 14) - 1);
+
+   if (Uses64BitReg)
+     DecodeGPR64RegisterClass(Inst, TestReg, Addr, Decoder);
+   else
+     DecodeGPR32RegisterClass(Inst, TestReg, Addr, Decoder);
+
+   Inst.addOperand(MCOperand::createImm(BitNo));
+
+   const bool AddedSymbol =
+       Dis->tryAddingSymbolicOperand(Inst, Target * 4, Addr, true, 0, 4);
+   if (!AddedSymbol)
+     Inst.addOperand(MCOperand::createImm(Target));
+
+   return Success;
+ }
+
+ /// Append the register with index \p RegNo from register class
+ /// \p RegClassID to \p Inst. Odd register numbers are rejected with Fail.
+ static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst,
+                                                         unsigned RegClassID,
+                                                         unsigned RegNo,
+                                                         uint64_t Addr,
+                                                         const void *Decoder) {
+   // Register number must be even (see CASP instruction)
+   const bool IsOdd = (RegNo & 0x1) != 0;
+   if (IsOdd)
+     return Fail;
+
+   Inst.addOperand(MCOperand::createReg(
+       AArch64MCRegisterClasses[RegClassID].getRegister(RegNo)));
+   return Success;
+ }
+
+ /// Decode a register from the WSeqPairsClass register class by forwarding to
+ /// the shared sequential-pair decoder.
+ static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
+                                                       unsigned RegNo,
+                                                       uint64_t Addr,
+                                                       const void *Decoder) {
+   const unsigned ClassID = AArch64::WSeqPairsClassRegClassID;
+   return DecodeGPRSeqPairsClassRegisterClass(Inst, ClassID, RegNo, Addr,
+                                              Decoder);
+ }
+
+ /// Decode a register from the XSeqPairsClass register class by forwarding to
+ /// the shared sequential-pair decoder.
+ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
+                                                       unsigned RegNo,
+                                                       uint64_t Addr,
+                                                       const void *Decoder) {
+   const unsigned ClassID = AArch64::XSeqPairsClassRegClassID;
+   return DecodeGPRSeqPairsClassRegisterClass(Inst, ClassID, RegNo, Addr,
+                                              Decoder);
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
new file mode 100644
index 000000000000..24e353cf4b96
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -0,0 +1,38 @@
+//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class raw_ostream;
+
+ /// MCDisassembler implementation for AArch64. Adds no state of its own; it
+ /// forwards the subtarget info and context to the base class and overrides
+ /// getInstruction.
+ class AArch64Disassembler : public MCDisassembler {
+ public:
+   AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+       : MCDisassembler(STI, Ctx) {}
+
+   ~AArch64Disassembler() = default;
+
+   /// Decode one instruction from \p Bytes at \p Address into \p Instr,
+   /// reporting the number of bytes consumed through \p Size.
+   MCDisassembler::DecodeStatus
+   getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+                  uint64_t Address, raw_ostream &VStream,
+                  raw_ostream &CStream) const override;
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
new file mode 100644
index 000000000000..19d0ba2e1c41
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -0,0 +1,222 @@
+//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ExternalSymbolizer.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-disassembler"
+
+ /// Map an LLVMDisassembler_VariantKind_* value from the C disassembler API to
+ /// the corresponding MCSymbolRefExpr variant kind. Only None, PAGE, PAGEOFF,
+ /// GOTPAGE and GOTPAGEOFF are supported; anything else (including TLVP and
+ /// TLVOFF) is treated as unreachable.
+ static MCSymbolRefExpr::VariantKind
+ getVariant(uint64_t LLVMDisassembler_VariantKind) {
+   if (LLVMDisassembler_VariantKind == LLVMDisassembler_VariantKind_None)
+     return MCSymbolRefExpr::VK_None;
+   if (LLVMDisassembler_VariantKind == LLVMDisassembler_VariantKind_ARM64_PAGE)
+     return MCSymbolRefExpr::VK_PAGE;
+   if (LLVMDisassembler_VariantKind ==
+       LLVMDisassembler_VariantKind_ARM64_PAGEOFF)
+     return MCSymbolRefExpr::VK_PAGEOFF;
+   if (LLVMDisassembler_VariantKind ==
+       LLVMDisassembler_VariantKind_ARM64_GOTPAGE)
+     return MCSymbolRefExpr::VK_GOTPAGE;
+   if (LLVMDisassembler_VariantKind ==
+       LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF)
+     return MCSymbolRefExpr::VK_GOTPAGEOFF;
+
+   // LLVMDisassembler_VariantKind_ARM64_TLVP, _TLVOFF and any unknown value.
+   llvm_unreachable("bad LLVMDisassembler_VariantKind");
+ }
+
+ /// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+ /// immediate Value in the MCInst. The immediate Value has not had any PC
+ /// adjustment made by the caller. If the instruction is a branch that adds
+ /// the PC to the immediate Value then IsBranch is true, else false. If
+ /// GetOpInfo is non-null, then it is called to get any symbolic information
+ /// at the Address for this instruction. If that returns non-zero then the
+ /// symbolic information it returns is used to create an MCExpr and that is
+ /// added as an operand to the MCInst. If GetOpInfo() returns zero and
+ /// IsBranch is true then a symbol look up for Address + Value is done and if
+ /// a symbol is found an MCExpr is created with that, else an MCExpr with
+ /// Address + Value is created. If GetOpInfo() returns zero and IsBranch is
+ /// false then the Opcode of the MCInst is tested and for ADRP and other
+ /// instructions that help to load pointers a symbol look up is done to see
+ /// if it returns a specific reference type to add to the comment stream.
+ /// This function returns true if it adds an operand to the MCInst and false
+ /// otherwise.
+ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
+     MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address,
+     bool IsBranch, uint64_t Offset, uint64_t InstSize) {
+   // FIXME: This method shares a lot of code with
+   // MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible
+   // refactor the MCExternalSymbolizer interface to allow more of this
+   // implementation to be shared.
+   //
+   struct LLVMOpInfo1 SymbolicOp;
+   memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+   SymbolicOp.Value = Value;
+   uint64_t ReferenceType;
+   const char *ReferenceName;
+   if (!GetOpInfo ||
+       !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+     // Every fallback path below calls SymbolLookUp. Without a registered
+     // lookup callback there is nothing to symbolize, so report that no
+     // operand was added instead of calling through a null pointer.
+     if (!SymbolLookUp)
+       return false;
+
+     if (IsBranch) {
+       ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+       const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType,
+                                       Address, &ReferenceName);
+       if (Name) {
+         SymbolicOp.AddSymbol.Name = Name;
+         SymbolicOp.AddSymbol.Present = true;
+         SymbolicOp.Value = 0;
+       } else {
+         SymbolicOp.Value = Address + Value;
+       }
+       if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+         CommentStream << "symbol stub for: " << ReferenceName;
+       else if (ReferenceType ==
+                LLVMDisassembler_ReferenceType_Out_Objc_Message)
+         CommentStream << "Objc message: " << ReferenceName;
+     } else if (MI.getOpcode() == AArch64::ADRP) {
+       ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP;
+       // otool expects the fully encoded ADRP instruction to be passed in as
+       // the value here, so reconstruct it:
+       const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+       uint32_t EncodedInst = 0x90000000;
+       EncodedInst |= (Value & 0x3) << 29; // immlo
+       EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi
+       EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg
+       SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+                    &ReferenceName);
+       CommentStream << format("0x%llx",
+                               0xfffffffffffff000LL & (Address + Value));
+     } else if (MI.getOpcode() == AArch64::ADDXri ||
+                MI.getOpcode() == AArch64::LDRXui ||
+                MI.getOpcode() == AArch64::LDRXl ||
+                MI.getOpcode() == AArch64::ADR) {
+       if (MI.getOpcode() == AArch64::ADDXri)
+         ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri;
+       else if (MI.getOpcode() == AArch64::LDRXui)
+         ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui;
+       if (MI.getOpcode() == AArch64::LDRXl) {
+         ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl;
+         SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+                      &ReferenceName);
+       } else if (MI.getOpcode() == AArch64::ADR) {
+         ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR;
+         SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+                      &ReferenceName);
+       } else {
+         const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+         // otool expects the fully encoded ADD/LDR instruction to be passed in
+         // as the value here, so reconstruct it:
+         unsigned EncodedInst =
+             MI.getOpcode() == AArch64::ADDXri ? 0x91000000: 0xF9400000;
+         EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD]
+         EncodedInst |=
+             MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn
+         EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd
+
+         SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+                      &ReferenceName);
+       }
+       if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+         CommentStream << "literal pool symbol address: " << ReferenceName;
+       else if (ReferenceType ==
+                LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) {
+         CommentStream << "literal pool for: \"";
+         CommentStream.write_escaped(ReferenceName);
+         CommentStream << "\"";
+       } else if (ReferenceType ==
+                  LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+         CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+       else if (ReferenceType ==
+                LLVMDisassembler_ReferenceType_Out_Objc_Message)
+         CommentStream << "Objc message: " << ReferenceName;
+       else if (ReferenceType ==
+                LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+         CommentStream << "Objc message ref: " << ReferenceName;
+       else if (ReferenceType ==
+                LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+         CommentStream << "Objc selector ref: " << ReferenceName;
+       else if (ReferenceType ==
+                LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+         CommentStream << "Objc class ref: " << ReferenceName;
+       // For these instructions, the SymbolLookUp() above is just to get the
+       // ReferenceType and ReferenceName. We want to make sure not to
+       // fall through so we don't build an MCExpr to leave the disassembly
+       // of the immediate values of these instructions to the InstPrinter.
+       return false;
+     } else {
+       return false;
+     }
+   }
+
+   // Build the expression (Add - Sub + Off) from whichever pieces are present.
+   const MCExpr *Add = nullptr;
+   if (SymbolicOp.AddSymbol.Present) {
+     if (SymbolicOp.AddSymbol.Name) {
+       StringRef Name(SymbolicOp.AddSymbol.Name);
+       MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
+       MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind);
+       if (Variant != MCSymbolRefExpr::VK_None)
+         Add = MCSymbolRefExpr::create(Sym, Variant, Ctx);
+       else
+         Add = MCSymbolRefExpr::create(Sym, Ctx);
+     } else {
+       Add = MCConstantExpr::create(SymbolicOp.AddSymbol.Value, Ctx);
+     }
+   }
+
+   const MCExpr *Sub = nullptr;
+   if (SymbolicOp.SubtractSymbol.Present) {
+     if (SymbolicOp.SubtractSymbol.Name) {
+       StringRef Name(SymbolicOp.SubtractSymbol.Name);
+       MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
+       Sub = MCSymbolRefExpr::create(Sym, Ctx);
+     } else {
+       Sub = MCConstantExpr::create(SymbolicOp.SubtractSymbol.Value, Ctx);
+     }
+   }
+
+   const MCExpr *Off = nullptr;
+   if (SymbolicOp.Value != 0)
+     Off = MCConstantExpr::create(SymbolicOp.Value, Ctx);
+
+   const MCExpr *Expr;
+   if (Sub) {
+     const MCExpr *LHS;
+     if (Add)
+       LHS = MCBinaryExpr::createSub(Add, Sub, Ctx);
+     else
+       LHS = MCUnaryExpr::createMinus(Sub, Ctx);
+     if (Off)
+       Expr = MCBinaryExpr::createAdd(LHS, Off, Ctx);
+     else
+       Expr = LHS;
+   } else if (Add) {
+     if (Off)
+       Expr = MCBinaryExpr::createAdd(Add, Off, Ctx);
+     else
+       Expr = Add;
+   } else {
+     if (Off)
+       Expr = Off;
+     else
+       Expr = MCConstantExpr::create(0, Ctx);
+   }
+
+   MI.addOperand(MCOperand::createExpr(Expr));
+
+   return true;
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
new file mode 100644
index 000000000000..49e844963797
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -0,0 +1,38 @@
+//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Symbolize AArch64 assembly code during disassembly using callbacks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
+
+#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h"
+
+namespace llvm {
+
+ /// MCExternalSymbolizer specialisation for AArch64. The constructor forwards
+ /// all of its arguments to the base class; only tryAddingSymbolicOperand is
+ /// overridden.
+ class AArch64ExternalSymbolizer : public MCExternalSymbolizer {
+ public:
+   AArch64ExternalSymbolizer(MCContext &Ctx,
+                             std::unique_ptr<MCRelocationInfo> RelInfo,
+                             LLVMOpInfoCallback GetOpInfo,
+                             LLVMSymbolLookupCallback SymbolLookUp,
+                             void *DisInfo)
+       : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp,
+                              DisInfo) {}
+
+   /// Try to replace the immediate \p Value of \p MI with a symbolic operand.
+   /// Returns true if an operand was added to \p MI.
+   bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream,
+                                 int64_t Value, uint64_t Address, bool IsBranch,
+                                 uint64_t Offset, uint64_t InstSize) override;
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
new file mode 100644
index 000000000000..b4f85204714f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -0,0 +1,1475 @@
+//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AArch64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "AArch64GenAsmWriter.inc"
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "AArch64GenAsmWriter1.inc"
+
+ /// Construct the generic AArch64 instruction printer; all arguments are
+ /// forwarded to MCInstPrinter.
+ AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI,
+                                        const MCInstrInfo &MII,
+                                        const MCRegisterInfo &MRI)
+     : MCInstPrinter(MAI, MII, MRI) {
+ }
+
+ /// Construct the Apple-syntax instruction printer; all arguments are
+ /// forwarded to AArch64InstPrinter.
+ AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI,
+                                                  const MCInstrInfo &MII,
+                                                  const MCRegisterInfo &MRI)
+     : AArch64InstPrinter(MAI, MII, MRI) {
+ }
+
+ /// Write the name of register \p RegNo to \p OS.
+ void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+   // This is for .cfi directives.
+   const char *Name = getRegisterName(RegNo);
+   OS << Name;
+ }
+
+ /// Print \p MI to \p O, preferring the canonical alias for encodings that
+ /// have one (SYS aliases, bitfield-move aliases, MOV aliases), and falling
+ /// back to the generated alias/instruction printers otherwise. The
+ /// annotation \p Annot is emitted after the instruction on every path.
+ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                    StringRef Annot,
+                                    const MCSubtargetInfo &STI) {
+   // Check for special encodings and print the canonical alias instead.
+
+   unsigned Opcode = MI->getOpcode();
+
+   if (Opcode == AArch64::SYSxt)
+     if (printSysAlias(MI, STI, O)) {
+       printAnnotation(O, Annot);
+       return;
+     }
+
+   // SBFM/UBFM should print to a nicer aliased form if possible.
+   if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri ||
+       Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) {
+     const MCOperand &Op0 = MI->getOperand(0);
+     const MCOperand &Op1 = MI->getOperand(1);
+     const MCOperand &Op2 = MI->getOperand(2);
+     const MCOperand &Op3 = MI->getOperand(3);
+
+     bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri);
+     bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri);
+     if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) {
+       const char *AsmMnemonic = nullptr;
+
+       switch (Op3.getImm()) {
+       default:
+         break;
+       case 7:
+         if (IsSigned)
+           AsmMnemonic = "sxtb";
+         else if (!Is64Bit)
+           AsmMnemonic = "uxtb";
+         break;
+       case 15:
+         if (IsSigned)
+           AsmMnemonic = "sxth";
+         else if (!Is64Bit)
+           AsmMnemonic = "uxth";
+         break;
+       case 31:
+         // *xtw is only valid for signed 64-bit operations.
+         if (Is64Bit && IsSigned)
+           AsmMnemonic = "sxtw";
+         break;
+       }
+
+       if (AsmMnemonic) {
+         O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+           << ", " << getRegisterName(getWRegFromXReg(Op1.getReg()));
+         printAnnotation(O, Annot);
+         return;
+       }
+     }
+
+     // All immediate shifts are aliases, implemented using the Bitfield
+     // instruction. In all cases the immediate shift amount shift must be in
+     // the range 0 to (reg.size -1).
+     if (Op2.isImm() && Op3.isImm()) {
+       const char *AsmMnemonic = nullptr;
+       int shift = 0;
+       int64_t immr = Op2.getImm();
+       int64_t imms = Op3.getImm();
+       if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) {
+         AsmMnemonic = "lsl";
+         shift = 31 - imms;
+       } else if (Opcode == AArch64::UBFMXri && imms != 0x3f &&
+                  ((imms + 1 == immr))) {
+         AsmMnemonic = "lsl";
+         shift = 63 - imms;
+       } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) {
+         AsmMnemonic = "lsr";
+         shift = immr;
+       } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) {
+         AsmMnemonic = "lsr";
+         shift = immr;
+       } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) {
+         AsmMnemonic = "asr";
+         shift = immr;
+       } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) {
+         AsmMnemonic = "asr";
+         shift = immr;
+       }
+       if (AsmMnemonic) {
+         O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+           << ", " << getRegisterName(Op1.getReg()) << ", #" << shift;
+         printAnnotation(O, Annot);
+         return;
+       }
+     }
+
+     // SBFIZ/UBFIZ aliases
+     if (Op2.getImm() > Op3.getImm()) {
+       O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t'
+         << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+         << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1;
+       printAnnotation(O, Annot);
+       return;
+     }
+
+     // Otherwise SBFX/UBFX is the preferred form
+     O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t'
+       << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+       << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1;
+     printAnnotation(O, Annot);
+     return;
+   }
+
+   if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) {
+     const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0
+     const MCOperand &Op2 = MI->getOperand(2);
+     int ImmR = MI->getOperand(3).getImm();
+     int ImmS = MI->getOperand(4).getImm();
+
+     if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) &&
+         (ImmR == 0 || ImmS < ImmR)) {
+       // BFC takes precedence over its entire range, slightly differently to
+       // BFI.
+       int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
+       int LSB = (BitWidth - ImmR) % BitWidth;
+       int Width = ImmS + 1;
+
+       O << "\tbfc\t" << getRegisterName(Op0.getReg())
+         << ", #" << LSB << ", #" << Width;
+       printAnnotation(O, Annot);
+       return;
+     } else if (ImmS < ImmR) {
+       // BFI alias
+       int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
+       int LSB = (BitWidth - ImmR) % BitWidth;
+       int Width = ImmS + 1;
+
+       O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", "
+         << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width;
+       printAnnotation(O, Annot);
+       return;
+     }
+
+     int LSB = ImmR;
+     int Width = ImmS - ImmR + 1;
+     // Otherwise BFXIL the preferred form
+     O << "\tbfxil\t"
+       << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg())
+       << ", #" << LSB << ", #" << Width;
+     printAnnotation(O, Annot);
+     return;
+   }
+
+   // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift
+   // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be
+   // printed.
+   if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi ||
+        Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) &&
+       MI->getOperand(1).isExpr()) {
+     if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi)
+       O << "\tmovz\t";
+     else
+       O << "\tmovn\t";
+
+     O << getRegisterName(MI->getOperand(0).getReg()) << ", #";
+     MI->getOperand(1).getExpr()->print(O, &MAI);
+     // Emit the annotation here too, as every other exit of this function
+     // does.
+     printAnnotation(O, Annot);
+     return;
+   }
+
+   if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) &&
+       MI->getOperand(2).isExpr()) {
+     O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #";
+     MI->getOperand(2).getExpr()->print(O, &MAI);
+     printAnnotation(O, Annot);
+     return;
+   }
+
+   // MOVZ, MOVN and "ORR wzr, #imm" instructions are aliases for MOV, but their
+   // domains overlap so they need to be prioritized. The chain is "MOVZ lsl #0 >
+   // MOVZ lsl #N > MOVN lsl #0 > MOVN lsl #N > ORR". The highest instruction
+   // that can represent the move is the MOV alias, and the rest get printed
+   // normally.
+   if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) &&
+       MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) {
+     int RegWidth = Opcode == AArch64::MOVZXi ? 64 : 32;
+     int Shift = MI->getOperand(2).getImm();
+     uint64_t Value = (uint64_t)MI->getOperand(1).getImm() << Shift;
+
+     if (AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth)) {
+       O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+         << formatImm(SignExtend64(Value, RegWidth));
+       printAnnotation(O, Annot);
+       return;
+     }
+   }
+
+   if ((Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) &&
+       MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) {
+     int RegWidth = Opcode == AArch64::MOVNXi ? 64 : 32;
+     int Shift = MI->getOperand(2).getImm();
+     uint64_t Value = ~((uint64_t)MI->getOperand(1).getImm() << Shift);
+     // For the 32-bit form only the low 32 bits of the inverted value matter.
+     if (RegWidth == 32)
+       Value = Value & 0xffffffff;
+
+     if (AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth)) {
+       O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+         << formatImm(SignExtend64(Value, RegWidth));
+       printAnnotation(O, Annot);
+       return;
+     }
+   }
+
+   if ((Opcode == AArch64::ORRXri || Opcode == AArch64::ORRWri) &&
+       (MI->getOperand(1).getReg() == AArch64::XZR ||
+        MI->getOperand(1).getReg() == AArch64::WZR) &&
+       MI->getOperand(2).isImm()) {
+     int RegWidth = Opcode == AArch64::ORRXri ? 64 : 32;
+     uint64_t Value = AArch64_AM::decodeLogicalImmediate(
+         MI->getOperand(2).getImm(), RegWidth);
+     // ORR is last in the priority chain: it is only printed as MOV when no
+     // MOVZ/MOVN form could represent the value.
+     if (!AArch64_AM::isAnyMOVWMovAlias(Value, RegWidth)) {
+       O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+         << formatImm(SignExtend64(Value, RegWidth));
+       printAnnotation(O, Annot);
+       return;
+     }
+   }
+
+   if (!printAliasInstr(MI, STI, O))
+     printInstruction(MI, STI, O);
+
+   printAnnotation(O, Annot);
+ }
+
+ /// Classify \p Opcode as a TBL/TBX table-lookup instruction. On a match,
+ /// sets \p IsTbx (true for TBX, false for TBL) and \p Layout (".8b" or
+ /// ".16b") and returns true. Returns false, leaving both outputs untouched,
+ /// for any other opcode.
+ static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout,
+                                 bool &IsTbx) {
+   switch (Opcode) {
+   default:
+     return false;
+   case AArch64::TBXv8i8One:
+   case AArch64::TBXv8i8Two:
+   case AArch64::TBXv8i8Three:
+   case AArch64::TBXv8i8Four:
+     IsTbx = true;
+     Layout = ".8b";
+     break;
+   case AArch64::TBLv8i8One:
+   case AArch64::TBLv8i8Two:
+   case AArch64::TBLv8i8Three:
+   case AArch64::TBLv8i8Four:
+     IsTbx = false;
+     Layout = ".8b";
+     break;
+   case AArch64::TBXv16i8One:
+   case AArch64::TBXv16i8Two:
+   case AArch64::TBXv16i8Three:
+   case AArch64::TBXv16i8Four:
+     IsTbx = true;
+     Layout = ".16b";
+     break;
+   case AArch64::TBLv16i8One:
+   case AArch64::TBLv16i8Two:
+   case AArch64::TBLv16i8Three:
+   case AArch64::TBLv16i8Four:
+     IsTbx = false;
+     Layout = ".16b";
+     break;
+   }
+
+   // Every case that reaches here has filled in both outputs.
+   return true;
+ }
+
+ // One row of the LdStNInstInfo table describing a LD/ST-N style instruction.
+ struct LdStNInstrDesc {
+ unsigned Opcode; // AArch64 opcode this row describes.
+ const char *Mnemonic; // Mnemonic text, e.g. "ld1" or "ld1r".
+ const char *Layout; // Layout suffix, e.g. ".b" or ".16b".
+ int ListOperand; // presumably the index of the vector-list operand — TODO confirm against users.
+ bool HasLane; // true for the lane-indexed rows (LD1i*), false for the others shown.
+ int NaturalOffset; // 0 for non-_POST rows; for _POST rows looks like the byte count transferred — TODO confirm.
+ };
+
+// Table of the ldN/stN opcodes (lane, replicating "r", multi-register and
+// _POST variants) that AArch64AppleInstPrinter::printInst prints by hand; it
+// is searched linearly by getLdStNInstrDesc. Each row holds, in order:
+//   - the opcode,
+//   - the mnemonic,
+//   - the layout suffix printed immediately after the mnemonic,
+//   - the operand index of the vector list,
+//   - whether a lane-index immediate operand follows the vector list,
+//   - the immediate printed for the post-indexed form when its offset
+//     register is XZR; 0 means no post-index operand is printed at all.
+static const LdStNInstrDesc LdStNInstInfo[] = {
+ { AArch64::LD1i8, "ld1", ".b", 1, true, 0 },
+ { AArch64::LD1i16, "ld1", ".h", 1, true, 0 },
+ { AArch64::LD1i32, "ld1", ".s", 1, true, 0 },
+ { AArch64::LD1i64, "ld1", ".d", 1, true, 0 },
+ { AArch64::LD1i8_POST, "ld1", ".b", 2, true, 1 },
+ { AArch64::LD1i16_POST, "ld1", ".h", 2, true, 2 },
+ { AArch64::LD1i32_POST, "ld1", ".s", 2, true, 4 },
+ { AArch64::LD1i64_POST, "ld1", ".d", 2, true, 8 },
+ { AArch64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 },
+ { AArch64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 },
+ { AArch64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 },
+ { AArch64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 },
+ { AArch64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 },
+ { AArch64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 },
+ { AArch64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 },
+ { AArch64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 },
+ { AArch64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 },
+ { AArch64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 },
+ { AArch64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 },
+ { AArch64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 },
+ { AArch64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 },
+ { AArch64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 },
+ { AArch64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 },
+ { AArch64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 },
+ { AArch64::LD1Onev16b, "ld1", ".16b", 0, false, 0 },
+ { AArch64::LD1Onev8h, "ld1", ".8h", 0, false, 0 },
+ { AArch64::LD1Onev4s, "ld1", ".4s", 0, false, 0 },
+ { AArch64::LD1Onev2d, "ld1", ".2d", 0, false, 0 },
+ { AArch64::LD1Onev8b, "ld1", ".8b", 0, false, 0 },
+ { AArch64::LD1Onev4h, "ld1", ".4h", 0, false, 0 },
+ { AArch64::LD1Onev2s, "ld1", ".2s", 0, false, 0 },
+ { AArch64::LD1Onev1d, "ld1", ".1d", 0, false, 0 },
+ { AArch64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 },
+ { AArch64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 },
+ { AArch64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 },
+ { AArch64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 },
+ { AArch64::LD1Onev8b_POST, "ld1", ".8b", 1, false, 8 },
+ { AArch64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 },
+ { AArch64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 },
+ { AArch64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 },
+ { AArch64::LD1Twov16b, "ld1", ".16b", 0, false, 0 },
+ { AArch64::LD1Twov8h, "ld1", ".8h", 0, false, 0 },
+ { AArch64::LD1Twov4s, "ld1", ".4s", 0, false, 0 },
+ { AArch64::LD1Twov2d, "ld1", ".2d", 0, false, 0 },
+ { AArch64::LD1Twov8b, "ld1", ".8b", 0, false, 0 },
+ { AArch64::LD1Twov4h, "ld1", ".4h", 0, false, 0 },
+ { AArch64::LD1Twov2s, "ld1", ".2s", 0, false, 0 },
+ { AArch64::LD1Twov1d, "ld1", ".1d", 0, false, 0 },
+ { AArch64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 },
+ { AArch64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 },
+ { AArch64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 },
+ { AArch64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 },
+ { AArch64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 },
+ { AArch64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 },
+ { AArch64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 },
+ { AArch64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 },
+ { AArch64::LD1Threev16b, "ld1", ".16b", 0, false, 0 },
+ { AArch64::LD1Threev8h, "ld1", ".8h", 0, false, 0 },
+ { AArch64::LD1Threev4s, "ld1", ".4s", 0, false, 0 },
+ { AArch64::LD1Threev2d, "ld1", ".2d", 0, false, 0 },
+ { AArch64::LD1Threev8b, "ld1", ".8b", 0, false, 0 },
+ { AArch64::LD1Threev4h, "ld1", ".4h", 0, false, 0 },
+ { AArch64::LD1Threev2s, "ld1", ".2s", 0, false, 0 },
+ { AArch64::LD1Threev1d, "ld1", ".1d", 0, false, 0 },
+ { AArch64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 },
+ { AArch64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 },
+ { AArch64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 },
+ { AArch64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 },
+ { AArch64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 },
+ { AArch64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 },
+ { AArch64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 },
+ { AArch64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 },
+ { AArch64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 },
+ { AArch64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 },
+ { AArch64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 },
+ { AArch64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 },
+ { AArch64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 },
+ { AArch64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 },
+ { AArch64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 },
+ { AArch64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 },
+ { AArch64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 },
+ { AArch64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 },
+ { AArch64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 },
+ { AArch64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 },
+ { AArch64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 },
+ { AArch64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 },
+ { AArch64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 },
+ { AArch64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 },
+ { AArch64::LD2i8, "ld2", ".b", 1, true, 0 },
+ { AArch64::LD2i16, "ld2", ".h", 1, true, 0 },
+ { AArch64::LD2i32, "ld2", ".s", 1, true, 0 },
+ { AArch64::LD2i64, "ld2", ".d", 1, true, 0 },
+ { AArch64::LD2i8_POST, "ld2", ".b", 2, true, 2 },
+ { AArch64::LD2i16_POST, "ld2", ".h", 2, true, 4 },
+ { AArch64::LD2i32_POST, "ld2", ".s", 2, true, 8 },
+ { AArch64::LD2i64_POST, "ld2", ".d", 2, true, 16 },
+ { AArch64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 },
+ { AArch64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 },
+ { AArch64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 },
+ { AArch64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 },
+ { AArch64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 },
+ { AArch64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 },
+ { AArch64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 },
+ { AArch64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 },
+ { AArch64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 },
+ { AArch64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 },
+ { AArch64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 },
+ { AArch64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 },
+ { AArch64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 },
+ { AArch64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 },
+ { AArch64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 },
+ { AArch64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 },
+ { AArch64::LD2Twov16b, "ld2", ".16b", 0, false, 0 },
+ { AArch64::LD2Twov8h, "ld2", ".8h", 0, false, 0 },
+ { AArch64::LD2Twov4s, "ld2", ".4s", 0, false, 0 },
+ { AArch64::LD2Twov2d, "ld2", ".2d", 0, false, 0 },
+ { AArch64::LD2Twov8b, "ld2", ".8b", 0, false, 0 },
+ { AArch64::LD2Twov4h, "ld2", ".4h", 0, false, 0 },
+ { AArch64::LD2Twov2s, "ld2", ".2s", 0, false, 0 },
+ { AArch64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 },
+ { AArch64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 },
+ { AArch64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 },
+ { AArch64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 },
+ { AArch64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 },
+ { AArch64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 },
+ { AArch64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 },
+ { AArch64::LD3i8, "ld3", ".b", 1, true, 0 },
+ { AArch64::LD3i16, "ld3", ".h", 1, true, 0 },
+ { AArch64::LD3i32, "ld3", ".s", 1, true, 0 },
+ { AArch64::LD3i64, "ld3", ".d", 1, true, 0 },
+ { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 },
+ { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 },
+ { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 },
+ { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 },
+ { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 },
+ { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 },
+ { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 },
+ { AArch64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 },
+ { AArch64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 },
+ { AArch64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 },
+ { AArch64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 },
+ { AArch64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 },
+ { AArch64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 },
+ { AArch64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 },
+ { AArch64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 },
+ { AArch64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 },
+ { AArch64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 },
+ { AArch64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 },
+ { AArch64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 },
+ { AArch64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 },
+ { AArch64::LD3Threev16b, "ld3", ".16b", 0, false, 0 },
+ { AArch64::LD3Threev8h, "ld3", ".8h", 0, false, 0 },
+ { AArch64::LD3Threev4s, "ld3", ".4s", 0, false, 0 },
+ { AArch64::LD3Threev2d, "ld3", ".2d", 0, false, 0 },
+ { AArch64::LD3Threev8b, "ld3", ".8b", 0, false, 0 },
+ { AArch64::LD3Threev4h, "ld3", ".4h", 0, false, 0 },
+ { AArch64::LD3Threev2s, "ld3", ".2s", 0, false, 0 },
+ { AArch64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 },
+ { AArch64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 },
+ { AArch64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 },
+ { AArch64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 },
+ { AArch64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 },
+ { AArch64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 },
+ { AArch64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 },
+ { AArch64::LD4i8, "ld4", ".b", 1, true, 0 },
+ { AArch64::LD4i16, "ld4", ".h", 1, true, 0 },
+ { AArch64::LD4i32, "ld4", ".s", 1, true, 0 },
+ { AArch64::LD4i64, "ld4", ".d", 1, true, 0 },
+ { AArch64::LD4i8_POST, "ld4", ".b", 2, true, 4 },
+ { AArch64::LD4i16_POST, "ld4", ".h", 2, true, 8 },
+ { AArch64::LD4i32_POST, "ld4", ".s", 2, true, 16 },
+ { AArch64::LD4i64_POST, "ld4", ".d", 2, true, 32 },
+ { AArch64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 },
+ { AArch64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 },
+ { AArch64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 },
+ { AArch64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 },
+ { AArch64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 },
+ { AArch64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 },
+ { AArch64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 },
+ { AArch64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 },
+ { AArch64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 },
+ { AArch64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 },
+ { AArch64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 },
+ { AArch64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 },
+ { AArch64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 },
+ { AArch64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 },
+ { AArch64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 },
+ { AArch64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 },
+ { AArch64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 },
+ { AArch64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 },
+ { AArch64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 },
+ { AArch64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 },
+ { AArch64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 },
+ { AArch64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 },
+ { AArch64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 },
+ { AArch64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 },
+ { AArch64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 },
+ { AArch64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 },
+ { AArch64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 },
+ { AArch64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 },
+ { AArch64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 },
+ { AArch64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 },
+ { AArch64::ST1i8, "st1", ".b", 0, true, 0 },
+ { AArch64::ST1i16, "st1", ".h", 0, true, 0 },
+ { AArch64::ST1i32, "st1", ".s", 0, true, 0 },
+ { AArch64::ST1i64, "st1", ".d", 0, true, 0 },
+ { AArch64::ST1i8_POST, "st1", ".b", 1, true, 1 },
+ { AArch64::ST1i16_POST, "st1", ".h", 1, true, 2 },
+ { AArch64::ST1i32_POST, "st1", ".s", 1, true, 4 },
+ { AArch64::ST1i64_POST, "st1", ".d", 1, true, 8 },
+ { AArch64::ST1Onev16b, "st1", ".16b", 0, false, 0 },
+ { AArch64::ST1Onev8h, "st1", ".8h", 0, false, 0 },
+ { AArch64::ST1Onev4s, "st1", ".4s", 0, false, 0 },
+ { AArch64::ST1Onev2d, "st1", ".2d", 0, false, 0 },
+ { AArch64::ST1Onev8b, "st1", ".8b", 0, false, 0 },
+ { AArch64::ST1Onev4h, "st1", ".4h", 0, false, 0 },
+ { AArch64::ST1Onev2s, "st1", ".2s", 0, false, 0 },
+ { AArch64::ST1Onev1d, "st1", ".1d", 0, false, 0 },
+ { AArch64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 },
+ { AArch64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 },
+ { AArch64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 },
+ { AArch64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 },
+ { AArch64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 },
+ { AArch64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 },
+ { AArch64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 },
+ { AArch64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 },
+ { AArch64::ST1Twov16b, "st1", ".16b", 0, false, 0 },
+ { AArch64::ST1Twov8h, "st1", ".8h", 0, false, 0 },
+ { AArch64::ST1Twov4s, "st1", ".4s", 0, false, 0 },
+ { AArch64::ST1Twov2d, "st1", ".2d", 0, false, 0 },
+ { AArch64::ST1Twov8b, "st1", ".8b", 0, false, 0 },
+ { AArch64::ST1Twov4h, "st1", ".4h", 0, false, 0 },
+ { AArch64::ST1Twov2s, "st1", ".2s", 0, false, 0 },
+ { AArch64::ST1Twov1d, "st1", ".1d", 0, false, 0 },
+ { AArch64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 },
+ { AArch64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 },
+ { AArch64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 },
+ { AArch64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 },
+ { AArch64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 },
+ { AArch64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 },
+ { AArch64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 },
+ { AArch64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 },
+ { AArch64::ST1Threev16b, "st1", ".16b", 0, false, 0 },
+ { AArch64::ST1Threev8h, "st1", ".8h", 0, false, 0 },
+ { AArch64::ST1Threev4s, "st1", ".4s", 0, false, 0 },
+ { AArch64::ST1Threev2d, "st1", ".2d", 0, false, 0 },
+ { AArch64::ST1Threev8b, "st1", ".8b", 0, false, 0 },
+ { AArch64::ST1Threev4h, "st1", ".4h", 0, false, 0 },
+ { AArch64::ST1Threev2s, "st1", ".2s", 0, false, 0 },
+ { AArch64::ST1Threev1d, "st1", ".1d", 0, false, 0 },
+ { AArch64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 },
+ { AArch64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 },
+ { AArch64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 },
+ { AArch64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 },
+ { AArch64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 },
+ { AArch64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 },
+ { AArch64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 },
+ { AArch64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 },
+ { AArch64::ST1Fourv16b, "st1", ".16b", 0, false, 0 },
+ { AArch64::ST1Fourv8h, "st1", ".8h", 0, false, 0 },
+ { AArch64::ST1Fourv4s, "st1", ".4s", 0, false, 0 },
+ { AArch64::ST1Fourv2d, "st1", ".2d", 0, false, 0 },
+ { AArch64::ST1Fourv8b, "st1", ".8b", 0, false, 0 },
+ { AArch64::ST1Fourv4h, "st1", ".4h", 0, false, 0 },
+ { AArch64::ST1Fourv2s, "st1", ".2s", 0, false, 0 },
+ { AArch64::ST1Fourv1d, "st1", ".1d", 0, false, 0 },
+ { AArch64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 },
+ { AArch64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 },
+ { AArch64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 },
+ { AArch64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 },
+ { AArch64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 },
+ { AArch64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 },
+ { AArch64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 },
+ { AArch64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 },
+ { AArch64::ST2i8, "st2", ".b", 0, true, 0 },
+ { AArch64::ST2i16, "st2", ".h", 0, true, 0 },
+ { AArch64::ST2i32, "st2", ".s", 0, true, 0 },
+ { AArch64::ST2i64, "st2", ".d", 0, true, 0 },
+ { AArch64::ST2i8_POST, "st2", ".b", 1, true, 2 },
+ { AArch64::ST2i16_POST, "st2", ".h", 1, true, 4 },
+ { AArch64::ST2i32_POST, "st2", ".s", 1, true, 8 },
+ { AArch64::ST2i64_POST, "st2", ".d", 1, true, 16 },
+ { AArch64::ST2Twov16b, "st2", ".16b", 0, false, 0 },
+ { AArch64::ST2Twov8h, "st2", ".8h", 0, false, 0 },
+ { AArch64::ST2Twov4s, "st2", ".4s", 0, false, 0 },
+ { AArch64::ST2Twov2d, "st2", ".2d", 0, false, 0 },
+ { AArch64::ST2Twov8b, "st2", ".8b", 0, false, 0 },
+ { AArch64::ST2Twov4h, "st2", ".4h", 0, false, 0 },
+ { AArch64::ST2Twov2s, "st2", ".2s", 0, false, 0 },
+ { AArch64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 },
+ { AArch64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 },
+ { AArch64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 },
+ { AArch64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 },
+ { AArch64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 },
+ { AArch64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 },
+ { AArch64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 },
+ { AArch64::ST3i8, "st3", ".b", 0, true, 0 },
+ { AArch64::ST3i16, "st3", ".h", 0, true, 0 },
+ { AArch64::ST3i32, "st3", ".s", 0, true, 0 },
+ { AArch64::ST3i64, "st3", ".d", 0, true, 0 },
+ { AArch64::ST3i8_POST, "st3", ".b", 1, true, 3 },
+ { AArch64::ST3i16_POST, "st3", ".h", 1, true, 6 },
+ { AArch64::ST3i32_POST, "st3", ".s", 1, true, 12 },
+ { AArch64::ST3i64_POST, "st3", ".d", 1, true, 24 },
+ { AArch64::ST3Threev16b, "st3", ".16b", 0, false, 0 },
+ { AArch64::ST3Threev8h, "st3", ".8h", 0, false, 0 },
+ { AArch64::ST3Threev4s, "st3", ".4s", 0, false, 0 },
+ { AArch64::ST3Threev2d, "st3", ".2d", 0, false, 0 },
+ { AArch64::ST3Threev8b, "st3", ".8b", 0, false, 0 },
+ { AArch64::ST3Threev4h, "st3", ".4h", 0, false, 0 },
+ { AArch64::ST3Threev2s, "st3", ".2s", 0, false, 0 },
+ { AArch64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 },
+ { AArch64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 },
+ { AArch64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 },
+ { AArch64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 },
+ { AArch64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 },
+ { AArch64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 },
+ { AArch64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 },
+ { AArch64::ST4i8, "st4", ".b", 0, true, 0 },
+ { AArch64::ST4i16, "st4", ".h", 0, true, 0 },
+ { AArch64::ST4i32, "st4", ".s", 0, true, 0 },
+ { AArch64::ST4i64, "st4", ".d", 0, true, 0 },
+ { AArch64::ST4i8_POST, "st4", ".b", 1, true, 4 },
+ { AArch64::ST4i16_POST, "st4", ".h", 1, true, 8 },
+ { AArch64::ST4i32_POST, "st4", ".s", 1, true, 16 },
+ { AArch64::ST4i64_POST, "st4", ".d", 1, true, 32 },
+ { AArch64::ST4Fourv16b, "st4", ".16b", 0, false, 0 },
+ { AArch64::ST4Fourv8h, "st4", ".8h", 0, false, 0 },
+ { AArch64::ST4Fourv4s, "st4", ".4s", 0, false, 0 },
+ { AArch64::ST4Fourv2d, "st4", ".2d", 0, false, 0 },
+ { AArch64::ST4Fourv8b, "st4", ".8b", 0, false, 0 },
+ { AArch64::ST4Fourv4h, "st4", ".4h", 0, false, 0 },
+ { AArch64::ST4Fourv2s, "st4", ".2s", 0, false, 0 },
+ { AArch64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 },
+ { AArch64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 },
+ { AArch64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 },
+ { AArch64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 },
+ { AArch64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 },
+ { AArch64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 },
+ { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 },
+};
+
+// Find the LdStNInstInfo entry describing \p Opcode. Returns a pointer to the
+// first matching table row, or nullptr when the opcode is not in the table.
+static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
+  for (const LdStNInstrDesc &Entry : LdStNInstInfo) {
+    if (Entry.Opcode == Opcode)
+      return &Entry;
+  }
+  return nullptr;
+}
+
+// Print \p MI to \p O, followed by the annotation \p Annot.
+//
+// Two instruction families are printed by hand, with the layout suffix
+// attached directly to the mnemonic:
+//   - tbl/tbx, as recognised by isTblTbxInstruction;
+//   - the ldN/stN opcodes listed in LdStNInstInfo.
+// Every other instruction is forwarded to AArch64InstPrinter::printInst.
+void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                        StringRef Annot,
+                                        const MCSubtargetInfo &STI) {
+  unsigned Opcode = MI->getOpcode();
+  StringRef Layout;
+
+  bool IsTbx;
+  if (isTblTbxInstruction(Opcode, Layout, IsTbx)) {
+    O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t'
+      << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", ";
+
+    // The vector list is operand 2 for tbx and operand 1 for tbl.
+    unsigned ListOpNum = IsTbx ? 2 : 1;
+    printVectorList(MI, ListOpNum, STI, O, "");
+
+    O << ", "
+      << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
+    O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t';
+
+    // Now onto the operands: first a vector list with possible lane
+    // specifier. E.g. { v0 }[2]
+    int OpNum = LdStDesc->ListOperand;
+    printVectorList(MI, OpNum++, STI, O, "");
+
+    if (LdStDesc->HasLane)
+      O << '[' << MI->getOperand(OpNum++).getImm() << ']';
+
+    // Next the address: [xN]
+    unsigned AddrReg = MI->getOperand(OpNum++).getReg();
+    O << ", [" << getRegisterName(AddrReg) << ']';
+
+    // Finally, there might be a post-indexed offset. A non-zero NaturalOffset
+    // marks a post-indexed form; an XZR offset register is printed as that
+    // immediate instead of as a register.
+    if (LdStDesc->NaturalOffset != 0) {
+      unsigned Reg = MI->getOperand(OpNum).getReg();
+      if (Reg != AArch64::XZR)
+        O << ", " << getRegisterName(Reg);
+      else
+        O << ", #" << LdStDesc->NaturalOffset;
+    }
+
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  AArch64InstPrinter::printInst(MI, O, Annot, STI);
+}
+
+// Try to print the SYS instruction \p MI as one of its IC, DC, AT or TLBI
+// aliases. The alias is chosen from the four immediate operands 0-3 (op1, Cn,
+// Cm, op2); operand 4 is read as a register. Returns true if an alias was
+// written to \p O and false, with nothing written, if no alias matched.
+bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+#ifndef NDEBUG
+ unsigned Opcode = MI->getOpcode();
+ assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
+#endif
+
+ // Stays null unless one of the cases below recognises the encoding.
+ const char *Asm = nullptr;
+ const MCOperand &Op1 = MI->getOperand(0);
+ const MCOperand &Cn = MI->getOperand(1);
+ const MCOperand &Cm = MI->getOperand(2);
+ const MCOperand &Op2 = MI->getOperand(3);
+
+ unsigned Op1Val = Op1.getImm();
+ unsigned CnVal = Cn.getImm();
+ unsigned CmVal = Cm.getImm();
+ unsigned Op2Val = Op2.getImm();
+
+ // Cn == 7 holds the IC, DC and AT aliases; Cn == 8 holds the TLBI aliases.
+ if (CnVal == 7) {
+ switch (CmVal) {
+ default:
+ break;
+
+ // IC aliases
+ case 1:
+ if (Op1Val == 0 && Op2Val == 0)
+ Asm = "ic\tialluis";
+ break;
+ case 5:
+ if (Op1Val == 0 && Op2Val == 0)
+ Asm = "ic\tiallu";
+ else if (Op1Val == 3 && Op2Val == 1)
+ Asm = "ic\tivau";
+ break;
+
+ // DC aliases
+ case 4:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tzva";
+ break;
+ case 6:
+ // The two conditions are mutually exclusive, so at most one assigns Asm.
+ if (Op1Val == 0 && Op2Val == 1)
+ Asm = "dc\tivac";
+ if (Op1Val == 0 && Op2Val == 2)
+ Asm = "dc\tisw";
+ break;
+ case 10:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tcvac";
+ else if (Op1Val == 0 && Op2Val == 2)
+ Asm = "dc\tcsw";
+ break;
+ case 11:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tcvau";
+ break;
+ case 12:
+ // Only recognised when the subtarget has the HasV8_2aOps feature.
+ if (Op1Val == 3 && Op2Val == 1 &&
+ (STI.getFeatureBits()[AArch64::HasV8_2aOps]))
+ Asm = "dc\tcvap";
+ break;
+ case 14:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tcivac";
+ else if (Op1Val == 0 && Op2Val == 2)
+ Asm = "dc\tcisw";
+ break;
+
+ // AT aliases
+ case 8:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e1r"; break;
+ case 1: Asm = "at\ts1e1w"; break;
+ case 2: Asm = "at\ts1e0r"; break;
+ case 3: Asm = "at\ts1e0w"; break;
+ }
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e2r"; break;
+ case 1: Asm = "at\ts1e2w"; break;
+ case 4: Asm = "at\ts12e1r"; break;
+ case 5: Asm = "at\ts12e1w"; break;
+ case 6: Asm = "at\ts12e0r"; break;
+ case 7: Asm = "at\ts12e0w"; break;
+ }
+ break;
+ case 6:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e3r"; break;
+ case 1: Asm = "at\ts1e3w"; break;
+ }
+ break;
+ }
+ break;
+ case 9:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ // These AT aliases are only recognised with the HasV8_2aOps feature.
+ if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e1rp"; break;
+ case 1: Asm = "at\ts1e1wp"; break;
+ }
+ }
+ break;
+ }
+ }
+ } else if (CnVal == 8) {
+ // TLBI aliases
+ switch (CmVal) {
+ default:
+ break;
+ case 3:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\tvmalle1is"; break;
+ case 1: Asm = "tlbi\tvae1is"; break;
+ case 2: Asm = "tlbi\taside1is"; break;
+ case 3: Asm = "tlbi\tvaae1is"; break;
+ case 5: Asm = "tlbi\tvale1is"; break;
+ case 7: Asm = "tlbi\tvaale1is"; break;
+ }
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle2is"; break;
+ case 1: Asm = "tlbi\tvae2is"; break;
+ case 4: Asm = "tlbi\talle1is"; break;
+ case 5: Asm = "tlbi\tvale2is"; break;
+ case 6: Asm = "tlbi\tvmalls12e1is"; break;
+ }
+ break;
+ case 6:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle3is"; break;
+ case 1: Asm = "tlbi\tvae3is"; break;
+ case 5: Asm = "tlbi\tvale3is"; break;
+ }
+ break;
+ }
+ break;
+ case 0:
+ switch (Op1Val) {
+ default:
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 1: Asm = "tlbi\tipas2e1is"; break;
+ case 5: Asm = "tlbi\tipas2le1is"; break;
+ }
+ break;
+ }
+ break;
+ case 4:
+ switch (Op1Val) {
+ default:
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 1: Asm = "tlbi\tipas2e1"; break;
+ case 5: Asm = "tlbi\tipas2le1"; break;
+ }
+ break;
+ }
+ break;
+ case 7:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\tvmalle1"; break;
+ case 1: Asm = "tlbi\tvae1"; break;
+ case 2: Asm = "tlbi\taside1"; break;
+ case 3: Asm = "tlbi\tvaae1"; break;
+ case 5: Asm = "tlbi\tvale1"; break;
+ case 7: Asm = "tlbi\tvaale1"; break;
+ }
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle2"; break;
+ case 1: Asm = "tlbi\tvae2"; break;
+ case 4: Asm = "tlbi\talle1"; break;
+ case 5: Asm = "tlbi\tvale2"; break;
+ case 6: Asm = "tlbi\tvmalls12e1"; break;
+ }
+ break;
+ case 6:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle3"; break;
+ case 1: Asm = "tlbi\tvae3"; break;
+ case 5: Asm = "tlbi\tvale3"; break;
+ }
+ break;
+ }
+ break;
+ }
+ }
+
+ if (Asm) {
+ unsigned Reg = MI->getOperand(4).getReg();
+
+ O << '\t' << Asm;
+ // Aliases whose text contains "all" are printed without the register
+ // operand; every other alias gets ", <reg>" appended.
+ if (StringRef(Asm).lower().find("all") == StringRef::npos)
+ O << ", " << getRegisterName(Reg);
+ }
+
+ return Asm != nullptr;
+}
+
+// Print operand \p OpNo of \p MI: a register by its name, an immediate via
+// printImm, and anything else as an expression.
+void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNo);
+
+  if (MO.isReg()) {
+    O << getRegisterName(MO.getReg());
+    return;
+  }
+
+  if (MO.isImm()) {
+    printImm(MI, OpNo, STI, O);
+    return;
+  }
+
+  assert(MO.isExpr() && "unknown operand kind in printOperand");
+  MO.getExpr()->print(O, &MAI);
+}
+
+// Print the immediate operand \p OpNo as '#' followed by formatImm's output.
+void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  O << "#" << formatImm(MI->getOperand(OpNo).getImm());
+}
+
+// Print the immediate operand \p OpNo in hexadecimal, as "#0x...".
+void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  // %llx expects an unsigned long long argument. The 64-bit type returned by
+  // getImm() is not guaranteed to be `long long` on every platform, so convert
+  // explicitly to keep the argument and the format specifier in agreement.
+  O << format("#%#llx", static_cast<unsigned long long>(Op.getImm()));
+}
+
+// Print the post-increment operand \p OpNo, which must be a register. XZR is
+// printed as the immediate "#<Imm>"; any other register is printed by name.
+void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
+                                             unsigned Imm, raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNo);
+  if (!MO.isReg())
+    llvm_unreachable("unknown operand kind in printPostIncOperand64");
+
+  const unsigned OffsetReg = MO.getReg();
+  if (OffsetReg != AArch64::XZR) {
+    O << getRegisterName(OffsetReg);
+    return;
+  }
+  O << "#" << Imm;
+}
+
+// Print the register operand \p OpNo using its AArch64::vreg alternate name.
+void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNo);
+  assert(MO.isReg() && "Non-register vreg operand!");
+  O << getRegisterName(MO.getReg(), AArch64::vreg);
+}
+
+// Print the immediate operand \p OpNo as "c<N>".
+void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNo);
+  assert(MO.isImm() && "System instruction C[nm] operands must be immediates!");
+  O << "c";
+  O << MO.getImm();
+}
+
+// Print an add/sub immediate: operand \p OpNum is the value (immediate or
+// expression) and operand \p OpNum + 1 is its shifter. For an immediate, the
+// shifted value is also written to the comment stream when one is attached.
+void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  const unsigned ShiftOpNum = OpNum + 1;
+
+  // Symbolic case: print the expression, then let printShifter handle the
+  // shifter operand.
+  if (!MO.isImm()) {
+    assert(MO.isExpr() && "Unexpected operand type!");
+    MO.getExpr()->print(O, &MAI);
+    printShifter(MI, ShiftOpNum, STI, O);
+    return;
+  }
+
+  unsigned Val = (MO.getImm() & 0xfff);
+  assert(Val == MO.getImm() && "Add/sub immediate out of range!");
+  unsigned Shift =
+      AArch64_AM::getShiftValue(MI->getOperand(ShiftOpNum).getImm());
+
+  O << '#' << formatImm(Val);
+  if (Shift != 0)
+    printShifter(MI, ShiftOpNum, STI, O);
+
+  if (CommentStream)
+    *CommentStream << '=' << formatImm(Val << Shift) << '\n';
+}
+
+// Decode operand \p OpNum as a 32-bit logical immediate and print it in hex
+// with a "#0x" prefix.
+void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const uint64_t Encoded = MI->getOperand(OpNum).getImm();
+  O << "#0x";
+  const auto Decoded = AArch64_AM::decodeLogicalImmediate(Encoded, 32);
+  O.write_hex(Decoded);
+}
+
+// Decode operand \p OpNum as a 64-bit logical immediate and print it in hex
+// with a "#0x" prefix.
+void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const uint64_t Encoded = MI->getOperand(OpNum).getImm();
+  O << "#0x";
+  const auto Decoded = AArch64_AM::decodeLogicalImmediate(Encoded, 64);
+  O.write_hex(Decoded);
+}
+
+// Print the shifter encoded in operand \p OpNum as ", <shift> #<amount>".
+// Nothing is printed for LSL #0.
+void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const unsigned Encoded = MI->getOperand(OpNum).getImm();
+
+  // LSL #0 should not be printed.
+  const bool IsLSL = AArch64_AM::getShiftType(Encoded) == AArch64_AM::LSL;
+  if (IsLSL && AArch64_AM::getShiftValue(Encoded) == 0)
+    return;
+
+  O << ", "
+    << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Encoded))
+    << " #" << AArch64_AM::getShiftValue(Encoded);
+}
+
+// Print register operand \p OpNum followed by the shifter held in operand
+// \p OpNum + 1.
+void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  const unsigned Reg = MI->getOperand(OpNum).getReg();
+  O << getRegisterName(Reg);
+  printShifter(MI, OpNum + 1, STI, O);
+}
+
+// Print register operand \p OpNum followed by the arithmetic extend held in
+// operand \p OpNum + 1.
+void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const unsigned Reg = MI->getOperand(OpNum).getReg();
+  O << getRegisterName(Reg);
+  printArithExtend(MI, OpNum + 1, STI, O);
+}
+
+// Print the arithmetic extend encoded in operand \p OpNum as
+// ", <extend>[ #<shift>]". UXTX with SP, or UXTW with WSP, as operand 0 or 1
+// is printed as ", lsl #<shift>" instead, and omitted when the shift is zero.
+void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const unsigned Encoded = MI->getOperand(OpNum).getImm();
+  const AArch64_AM::ShiftExtendType ExtType =
+      AArch64_AM::getArithExtendType(Encoded);
+  const unsigned ShiftVal = AArch64_AM::getArithShiftValue(Encoded);
+
+  const bool IsUXTW = ExtType == AArch64_AM::UXTW;
+  const bool IsUXTX = ExtType == AArch64_AM::UXTX;
+
+  // If the destination or first source register operand is [W]SP, print
+  // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at
+  // all.
+  if (IsUXTW || IsUXTX) {
+    const unsigned Dest = MI->getOperand(0).getReg();
+    const unsigned Src1 = MI->getOperand(1).getReg();
+    const bool TouchesSP = Dest == AArch64::SP || Src1 == AArch64::SP;
+    const bool TouchesWSP = Dest == AArch64::WSP || Src1 == AArch64::WSP;
+
+    if ((TouchesSP && IsUXTX) || (TouchesWSP && IsUXTW)) {
+      if (ShiftVal != 0)
+        O << ", lsl #" << ShiftVal;
+      return;
+    }
+  }
+
+  O << ", " << AArch64_AM::getShiftExtendName(ExtType);
+  if (ShiftVal != 0)
+    O << " #" << ShiftVal;
+}
+
+// Print a memory-operand extend. Operand \p OpNum is the sign-extend flag and
+// operand \p OpNum + 1 the shift flag. An unsigned extend of an 'x' register
+// is printed as "lsl"; otherwise "[su]xt<SrcRegKind>" is printed. The shift
+// amount, log2(Width / 8), follows when shifting is requested or for "lsl".
+void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O, char SrcRegKind,
+                                        unsigned Width) {
+  const unsigned SignExtend = MI->getOperand(OpNum).getImm();
+  const unsigned DoShift = MI->getOperand(OpNum + 1).getImm();
+
+  // sxtw, sxtx, uxtw or lsl (== uxtx)
+  const bool IsLSL = SrcRegKind == 'x' && !SignExtend;
+  if (!IsLSL) {
+    const char Signedness = SignExtend ? 's' : 'u';
+    O << Signedness << "xt" << SrcRegKind;
+  } else {
+    O << "lsl";
+  }
+
+  if (IsLSL || DoShift)
+    O << " #" << Log2_32(Width / 8);
+}
+
+// Print the name of the condition code stored in immediate operand \p OpNum.
+void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  const auto CC =
+      static_cast<AArch64CC::CondCode>(MI->getOperand(OpNum).getImm());
+  O << AArch64CC::getCondCodeName(CC);
+}
+
+// Print the name of the inverse of the condition code stored in immediate
+// operand \p OpNum.
+void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  const auto CC =
+      static_cast<AArch64CC::CondCode>(MI->getOperand(OpNum).getImm());
+  const auto Inverted = AArch64CC::getInvertedCondCode(CC);
+  O << AArch64CC::getCondCodeName(Inverted);
+}
+
+// Print register operand \p OpNum as a bracketed base address: "[<reg>]".
+void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const unsigned BaseReg = MI->getOperand(OpNum).getReg();
+  O << '[' << getRegisterName(BaseReg) << ']';
+}
+
+// Print immediate operand \p OpNum multiplied by the template parameter
+// Scale, as "#<value>".
+template<int Scale>
+void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  const auto Scaled = Scale * MI->getOperand(OpNum).getImm();
+  O << '#' << formatImm(Scaled);
+}
+
+// Print operand \p OpNum as an offset: an immediate is multiplied by \p Scale
+// and printed as "#<value>"; otherwise the operand is printed as an
+// expression.
+void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
+                                           unsigned Scale, raw_ostream &O) {
+  // Bind by reference, like the other operand printers, instead of copying
+  // the operand.
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    O << "#" << formatImm(MO.getImm() * Scale);
+  } else {
+    assert(MO.isExpr() && "Unexpected operand type!");
+    MO.getExpr()->print(O, &MAI);
+  }
+}
+
+// Print a bracketed "[<base>, <offset>]" address. Operand \p OpNum is the
+// base register. Operand \p OpNum + 1 is the offset: an immediate is
+// multiplied by \p Scale and printed as "#<value>", anything else is printed
+// as an expression.
+void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
+                                          unsigned Scale, raw_ostream &O) {
+  // Bind by reference, like the other operand printers, instead of copying
+  // the operand.
+  const MCOperand &MO1 = MI->getOperand(OpNum + 1);
+  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg());
+  if (MO1.isImm()) {
+    O << ", #" << formatImm(MO1.getImm() * Scale);
+  } else {
+    assert(MO1.isExpr() && "Unexpected operand type!");
+    O << ", ";
+    MO1.getExpr()->print(O, &MAI);
+  }
+  O << ']';
+}
+
+// Print the prefetch operation in immediate operand \p OpNum: by name when
+// lookupPRFMByEncoding finds one, otherwise as a raw "#<imm>".
+void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  const unsigned Encoding = MI->getOperand(OpNum).getImm();
+  if (auto Named = AArch64PRFM::lookupPRFMByEncoding(Encoding)) {
+    O << Named->Name;
+    return;
+  }
+  O << '#' << formatImm(Encoding);
+}
+
+// Print the PSB hint in immediate operand \p OpNum: by name when
+// lookupPSBByEncoding finds one, otherwise as a raw "#<imm>".
+void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const unsigned Encoding = MI->getOperand(OpNum).getImm();
+  if (auto Named = AArch64PSBHint::lookupPSBByEncoding(Encoding)) {
+    O << Named->Name;
+    return;
+  }
+  O << '#' << formatImm(Encoding);
+}
+
+// Print operand \p OpNum as a floating-point immediate. An FP-immediate
+// operand is used directly; otherwise the integer immediate is converted
+// with AArch64_AM::getFPImmFloat.
+void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  float FPImm;
+  if (MO.isFPImm())
+    FPImm = MO.getFPImm();
+  else
+    FPImm = AArch64_AM::getFPImmFloat(MO.getImm());
+
+  // 8 decimal places are enough to perfectly represent permitted floats.
+  O << format("#%.8f", FPImm);
+}
+
+// Return the Q register reached by advancing \p Reg by \p Stride positions,
+// wrapping from Q31 back to Q0. \p Reg must be one of Q0-Q31.
+static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
+  for (unsigned Step = 0; Step != Stride; ++Step) {
+    switch (Reg) {
+    default:
+      llvm_unreachable("Vector register expected!");
+    case AArch64::Q0:  Reg = AArch64::Q1;  break;
+    case AArch64::Q1:  Reg = AArch64::Q2;  break;
+    case AArch64::Q2:  Reg = AArch64::Q3;  break;
+    case AArch64::Q3:  Reg = AArch64::Q4;  break;
+    case AArch64::Q4:  Reg = AArch64::Q5;  break;
+    case AArch64::Q5:  Reg = AArch64::Q6;  break;
+    case AArch64::Q6:  Reg = AArch64::Q7;  break;
+    case AArch64::Q7:  Reg = AArch64::Q8;  break;
+    case AArch64::Q8:  Reg = AArch64::Q9;  break;
+    case AArch64::Q9:  Reg = AArch64::Q10; break;
+    case AArch64::Q10: Reg = AArch64::Q11; break;
+    case AArch64::Q11: Reg = AArch64::Q12; break;
+    case AArch64::Q12: Reg = AArch64::Q13; break;
+    case AArch64::Q13: Reg = AArch64::Q14; break;
+    case AArch64::Q14: Reg = AArch64::Q15; break;
+    case AArch64::Q15: Reg = AArch64::Q16; break;
+    case AArch64::Q16: Reg = AArch64::Q17; break;
+    case AArch64::Q17: Reg = AArch64::Q18; break;
+    case AArch64::Q18: Reg = AArch64::Q19; break;
+    case AArch64::Q19: Reg = AArch64::Q20; break;
+    case AArch64::Q20: Reg = AArch64::Q21; break;
+    case AArch64::Q21: Reg = AArch64::Q22; break;
+    case AArch64::Q22: Reg = AArch64::Q23; break;
+    case AArch64::Q23: Reg = AArch64::Q24; break;
+    case AArch64::Q24: Reg = AArch64::Q25; break;
+    case AArch64::Q25: Reg = AArch64::Q26; break;
+    case AArch64::Q26: Reg = AArch64::Q27; break;
+    case AArch64::Q27: Reg = AArch64::Q28; break;
+    case AArch64::Q28: Reg = AArch64::Q29; break;
+    case AArch64::Q29: Reg = AArch64::Q30; break;
+    case AArch64::Q30: Reg = AArch64::Q31; break;
+    // Vector lists can wrap around.
+    case AArch64::Q31: Reg = AArch64::Q0;  break;
+    }
+  }
+  return Reg;
+}
+
+ /// Print a sequential register pair as "<even>, <odd>", where the two names
+ /// are the sube/subo sub-registers (32- or 64-bit flavour chosen by \p size)
+ /// of the pair register held in operand \p OpNum.
+ template <unsigned size>
+ void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI,
+                                                       unsigned OpNum,
+                                                       const MCSubtargetInfo &STI,
+                                                       raw_ostream &O) {
+   static_assert(size == 64 || size == 32,
+                 "Template parameter must be either 32 or 64");
+   const bool Is32Bit = size == 32;
+   const unsigned EvenIdx = Is32Bit ? AArch64::sube32 : AArch64::sube64;
+   const unsigned OddIdx = Is32Bit ? AArch64::subo32 : AArch64::subo64;
+
+   const unsigned PairReg = MI->getOperand(OpNum).getReg();
+   const unsigned EvenReg = MRI.getSubReg(PairReg, EvenIdx);
+   const unsigned OddReg = MRI.getSubReg(PairReg, OddIdx);
+
+   O << getRegisterName(EvenReg) << ", " << getRegisterName(OddReg);
+ }
+
+ /// Print a vector register list as "{ v0<suffix>, v1<suffix>, ... }".
+ ///
+ /// The operand at \p OpNum is either a single register or a DD/DDD/DDDD /
+ /// QQ/QQQ/QQQQ tuple; the tuple class decides how many consecutive registers
+ /// are printed. \p LayoutSuffix is appended verbatim after every name.
+ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O,
+                                          StringRef LayoutSuffix) {
+   unsigned Reg = MI->getOperand(OpNum).getReg();
+
+   O << "{ ";
+
+   // Tests the *current* value of Reg against a register class.
+   auto IsInClass = [&](unsigned ClassID) {
+     return MRI.getRegClass(ClassID).contains(Reg);
+   };
+
+   // The tuple class tells us the list length; a plain register is a list of 1.
+   unsigned NumRegs = 1;
+   if (IsInClass(AArch64::DDRegClassID) || IsInClass(AArch64::QQRegClassID))
+     NumRegs = 2;
+   else if (IsInClass(AArch64::DDDRegClassID) ||
+            IsInClass(AArch64::QQQRegClassID))
+     NumRegs = 3;
+   else if (IsInClass(AArch64::DDDDRegClassID) ||
+            IsInClass(AArch64::QQQQRegClassID))
+     NumRegs = 4;
+
+   // Reduce a tuple to its first element (dsub0 is tried before qsub0).
+   unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0);
+   if (!FirstReg)
+     FirstReg = MRI.getSubReg(Reg, AArch64::qsub0);
+   if (FirstReg)
+     Reg = FirstReg;
+
+   // If it's a D-reg, we need to promote it to the equivalent Q-reg before
+   // printing (otherwise getRegisterName fails).
+   if (IsInClass(AArch64::FPR64RegClassID)) {
+     const MCRegisterClass &FPR128RC =
+         MRI.getRegClass(AArch64::FPR128RegClassID);
+     Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC);
+   }
+
+   for (unsigned Idx = 0; Idx != NumRegs; ++Idx) {
+     if (Idx != 0)
+       O << ", ";
+     O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+     Reg = getNextVectorRegister(Reg);
+   }
+
+   O << " }";
+ }
+
+ /// Print a vector register list with no per-register layout suffix.
+ void AArch64InstPrinter::printImplicitlyTypedVectorList(
+     const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+     raw_ostream &O) {
+   printVectorList(MI, OpNum, STI, O, /*LayoutSuffix=*/"");
+ }
+
+ /// Print a vector register list whose registers carry an explicit layout
+ /// suffix: ".<NumLanes><LaneKind>", or just ".<LaneKind>" when NumLanes is 0.
+ template <unsigned NumLanes, char LaneKind>
+ void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+   std::string Suffix = ".";
+   if (NumLanes != 0)
+     Suffix += itostr(NumLanes);
+   Suffix += LaneKind;
+
+   printVectorList(MI, OpNum, STI, O, Suffix);
+ }
+
+ /// Print the immediate operand at \p OpNum as a bracketed lane index: "[n]".
+ void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+   const auto LaneIndex = MI->getOperand(OpNum).getImm();
+   O << "[" << LaneIndex << "]";
+ }
+
+ /// Print a label operand.
+ ///  - Immediate (e.g. when disassembling): "#<imm * 4>".
+ ///  - Constant expression that evaluates to an absolute address: hex address.
+ ///  - Anything else: the expression itself.
+ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+   const MCOperand &Target = MI->getOperand(OpNum);
+
+   // If the label has already been resolved to an immediate offset (say, when
+   // we're running the disassembler), just print the immediate.
+   if (Target.isImm()) {
+     O << "#" << formatImm(Target.getImm() * 4);
+     return;
+   }
+
+   const auto *Expr = Target.getExpr();
+   const MCConstantExpr *Constant = dyn_cast<MCConstantExpr>(Expr);
+   int64_t Address;
+   if (!Constant || !Constant->evaluateAsAbsolute(Address)) {
+     // Not a plain address: print the expression as written.
+     Expr->print(O, &MAI);
+     return;
+   }
+
+   // The branch target is simply an address, so print it in hex.
+   O << "0x";
+   O.write_hex(Address);
+ }
+
+ /// Print an ADRP label operand: an immediate is printed as "#<imm * 4096>",
+ /// anything else as its expression.
+ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+   const MCOperand &Target = MI->getOperand(OpNum);
+
+   if (!Target.isImm()) {
+     // Still symbolic: just print the expression.
+     Target.getExpr()->print(O, &MAI);
+     return;
+   }
+
+   // The label has already been resolved to an immediate offset (say, when
+   // we're running the disassembler); scale it by 1 << 12 and print it.
+   O << "#" << formatImm(Target.getImm() * (1 << 12));
+ }
+
+ /// Print a barrier option. The encoding is looked up in the ISB table for the
+ /// ISB opcode and in the DB table for every other opcode; when no (non-empty)
+ /// name is found the raw value is printed as "#<val>".
+ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
+                                             const MCSubtargetInfo &STI,
+                                             raw_ostream &O) {
+   const unsigned Val = MI->getOperand(OpNo).getImm();
+
+   StringRef Name; // stays empty when the lookup finds nothing
+   if (MI->getOpcode() == AArch64::ISB) {
+     if (auto Entry = AArch64ISB::lookupISBByEncoding(Val))
+       Name = Entry->Name;
+   } else {
+     if (auto Entry = AArch64DB::lookupDBByEncoding(Val))
+       Name = Entry->Name;
+   }
+
+   if (Name.empty()) {
+     O << "#" << Val;
+     return;
+   }
+   O << Name;
+ }
+
+ /// Print the system-register operand of an MRS (read) instruction. A register
+ /// found by encoding is printed by name only if it is readable and enabled by
+ /// the subtarget features; otherwise the generic register string is used.
+ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+   const unsigned Encoding = MI->getOperand(OpNo).getImm();
+
+   // One encoding has different names for MRS and MSR, so a table lookup would
+   // return the wrong entry for one of them. Special-case the read-side name.
+   if (Encoding == AArch64SysReg::DBGDTRRX_EL0) {
+     O << "DBGDTRRX_EL0";
+     return;
+   }
+
+   const AArch64SysReg::SysReg *Entry =
+       AArch64SysReg::lookupSysRegByEncoding(Encoding);
+   const bool HasUsableName = Entry && Entry->Readable &&
+                              Entry->haveFeatures(STI.getFeatureBits());
+   if (!HasUsableName) {
+     O << AArch64SysReg::genericRegisterString(Encoding);
+     return;
+   }
+   O << Entry->Name;
+ }
+
+ /// Print the system-register operand of an MSR (write) instruction. A register
+ /// found by encoding is printed by name only if it is writeable and enabled by
+ /// the subtarget features; otherwise the generic register string is used.
+ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+   const unsigned Encoding = MI->getOperand(OpNo).getImm();
+
+   // One encoding has different names for MRS and MSR, so a table lookup would
+   // return the wrong entry for one of them. Special-case the write-side name.
+   if (Encoding == AArch64SysReg::DBGDTRTX_EL0) {
+     O << "DBGDTRTX_EL0";
+     return;
+   }
+
+   const AArch64SysReg::SysReg *Entry =
+       AArch64SysReg::lookupSysRegByEncoding(Encoding);
+   const bool HasUsableName = Entry && Entry->Writeable &&
+                              Entry->haveFeatures(STI.getFeatureBits());
+   if (!HasUsableName) {
+     O << AArch64SysReg::genericRegisterString(Encoding);
+     return;
+   }
+   O << Entry->Name;
+ }
+
+ /// Print a PSTATE field operand by name when the encoding is known and enabled
+ /// by the subtarget features, otherwise as a '#'-immediate.
+ void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+   const unsigned Encoding = MI->getOperand(OpNo).getImm();
+
+   auto Field = AArch64PState::lookupPStateByEncoding(Encoding);
+   if (!Field || !Field->haveFeatures(STI.getFeatureBits())) {
+     O << "#" << formatImm(Encoding);
+     return;
+   }
+   O << Field->Name;
+ }
+
+ /// Print an AdvSIMD "type 10" modified immediate: the operand is expanded by
+ /// AArch64_AM::decodeAdvSIMDModImmType10 to a 64-bit value and printed as
+ /// "#0x" followed by zero-padded hex digits (total field width 16).
+ void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+   const unsigned RawVal = MI->getOperand(OpNo).getImm();
+   const uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal);
+   // "%llx" consumes an unsigned long long, but uint64_t is unsigned long on
+   // LP64 targets. Convert explicitly so the argument type always matches the
+   // conversion specifier; the printed text is unchanged.
+   O << format("#%#016llx", static_cast<unsigned long long>(Val));
+ }
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
new file mode 100644
index 000000000000..65dca99ed04e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -0,0 +1,188 @@
+//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AArch64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
+#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+ class AArch64InstPrinter : public MCInstPrinter { // prints an AArch64 MCInst as assembly text
+ public:
+ AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI);
+
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+ // Autogenerated by tblgen. These entry points are virtual; AArch64AppleInstPrinter overrides them.
+ virtual void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ virtual StringRef getRegName(unsigned RegNo) const {
+ return getRegisterName(RegNo); // virtual wrapper around this class's static getRegisterName
+ }
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = AArch64::NoRegAltName);
+
+ protected:
+ bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ // Operand printers. The template overloads below bind one extra compile-time argument and forward to a non-template helper of the same name.
+ void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
+ raw_ostream &O);
+ template <int Amount>
+ void printPostIncOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, Amount, O); // Amount becomes the runtime Imm; STI is unused
+ }
+
+ void printVRegOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSysCROperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAddSubImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printLogicalImm32(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printLogicalImm64(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printShifter(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printShiftedRegister(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExtendedRegister(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printArithExtend(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+ char SrcRegKind, unsigned Width);
+ template <char SrcRegKind, unsigned Width>
+ void printMemExtend(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printMemExtend(MI, OpNum, O, SrcRegKind, Width); // template arguments are passed through as runtime values
+ }
+
+ void printCondCode(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printInverseCondCode(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAlignedLabel(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
+ raw_ostream &O);
+ void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale,
+ raw_ostream &O);
+
+ template <int Scale>
+ void printUImm12Offset(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printUImm12Offset(MI, OpNum, Scale, O); // Scale is forwarded unchanged
+ }
+
+ template <int BitWidth>
+ void printAMIndexedWB(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printAMIndexedWB(MI, OpNum, BitWidth / 8, O); // the helper receives BitWidth / 8 as its Scale
+ }
+
+ void printAMNoIndex(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ template <int Scale>
+ void printImmScale(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printPrefetchOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printPSBHintOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printFPImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printVectorList(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O,
+ StringRef LayoutSuffix);
+
+ /// Print a list of vector registers where the type suffix is implicit
+ /// (i.e. attached to the instruction rather than the registers).
+ void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+
+ template <unsigned NumLanes, char LaneKind>
+ void printTypedVectorList(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printVectorIndex(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAdrpLabel(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printBarrierOption(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMSRSystemRegister(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMRSSystemRegister(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSystemPStateField(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSIMDType10Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template<unsigned size>
+ void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ };
+
+ class AArch64AppleInstPrinter : public AArch64InstPrinter { // variant printer: overrides printInst and the virtual entry points of the base class
+ public:
+ AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI);
+
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+
+ void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O) override;
+ bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O) override;
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) override;
+ StringRef getRegName(unsigned RegNo) const override {
+ return getRegisterName(RegNo); // resolves to this class's static getRegisterName declared below
+ }
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = AArch64::NoRegAltName); // static, so it hides (does not override) the base-class function of the same name
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
new file mode 100644
index 000000000000..3e5ef4df4706
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -0,0 +1,803 @@
+//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+
+ #include "llvm/ADT/APFloat.h"
+ #include "llvm/ADT/APInt.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/MathExtras.h"
+ #include <cassert>
+ #include <cstring>
+
+namespace llvm {
+
+/// AArch64_AM - AArch64 Addressing Mode Stuff
+namespace AArch64_AM {
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//
+
+ /// Shift and extend operations. The numeric values are consecutive: shifts
+ /// occupy 0-4, unsigned extends 5-8 and signed extends 9-12.
+ enum ShiftExtendType {
+   InvalidShiftExtend = -1,
+
+   // Shifts.
+   LSL = 0,
+   LSR = 1,
+   ASR = 2,
+   ROR = 3,
+   MSL = 4,
+
+   // Unsigned extends.
+   UXTB = 5,
+   UXTH = 6,
+   UXTW = 7,
+   UXTX = 8,
+
+   // Signed extends.
+   SXTB = 9,
+   SXTH = 10,
+   SXTW = 11,
+   SXTX = 12,
+ };
+
+ /// getShiftExtendName - Return the lower-case mnemonic ("lsl", "uxtw", ...)
+ /// for a shift or extend kind. Any value outside LSL..SXTX, including
+ /// InvalidShiftExtend, is a programming error.
+ static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
+   // Indexed by ShiftExtendType, whose enumerators LSL..SXTX are numbered
+   // consecutively from 0.
+   static const char *const Names[] = {
+       "lsl",  "lsr",  "asr",  "ror",  "msl",
+       "uxtb", "uxth", "uxtw", "uxtx",
+       "sxtb", "sxth", "sxtw", "sxtx",
+   };
+   if (ST < AArch64_AM::LSL || ST > AArch64_AM::SXTX)
+     llvm_unreachable("unhandled shift type!");
+   return Names[ST];
+ }
+
+ /// getShiftType - Extract the shift kind stored in bits {8-6} of \p Imm.
+ /// Encodings 0-4 map to LSL, LSR, ASR, ROR and MSL; 5-7 yield
+ /// InvalidShiftExtend.
+ static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) {
+   static const AArch64_AM::ShiftExtendType Shifts[] = {
+       AArch64_AM::LSL, AArch64_AM::LSR, AArch64_AM::ASR,
+       AArch64_AM::ROR, AArch64_AM::MSL,
+   };
+   const unsigned Enc = (Imm >> 6) & 0x7;
+   const unsigned NumShifts = sizeof(Shifts) / sizeof(Shifts[0]);
+   return Enc < NumShifts ? Shifts[Enc] : AArch64_AM::InvalidShiftExtend;
+ }
+
+ /// getShiftValue - Extract the shift amount, i.e. the low six bits of \p Imm.
+ static inline unsigned getShiftValue(unsigned Imm) {
+   const unsigned AmountMask = 0x3f;
+   return Imm & AmountMask;
+ }
+
+ /// getShifterImm - Pack a shift kind and a 6-bit shift amount into one value:
+ ///   {8-6} = shifter (000 lsl, 001 lsr, 010 asr, 011 ror, 100 msl)
+ ///   {5-0} = imm
+ /// \p Imm must fit in six bits and \p ST must be one of the five shift kinds.
+ static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST,
+                                      unsigned Imm) {
+   assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!");
+   const unsigned Amount = Imm & 0x3f;
+   switch (ST) {
+   case AArch64_AM::LSL: return (0u << 6) | Amount;
+   case AArch64_AM::LSR: return (1u << 6) | Amount;
+   case AArch64_AM::ASR: return (2u << 6) | Amount;
+   case AArch64_AM::ROR: return (3u << 6) | Amount;
+   case AArch64_AM::MSL: return (4u << 6) | Amount;
+   default: break;
+   }
+   llvm_unreachable("Invalid shift requested");
+ }
+
+//===----------------------------------------------------------------------===//
+// Extends
+//
+
+ /// getArithShiftValue - Extract the arithmetic shift amount, i.e. the low
+ /// three bits of \p Imm.
+ static inline unsigned getArithShiftValue(unsigned Imm) {
+   const unsigned AmountMask = 0x7;
+   return Imm & AmountMask;
+ }
+
+ /// getExtendType - Map a 3-bit extend encoding (0-7) to its extend kind, in
+ /// the order UXTB, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW, SXTX. \p Imm must not
+ /// have any bit above bit 2 set.
+ static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) {
+   assert((Imm & 0x7) == Imm && "invalid immediate!");
+   static const AArch64_AM::ShiftExtendType Extends[] = {
+       AArch64_AM::UXTB, AArch64_AM::UXTH, AArch64_AM::UXTW, AArch64_AM::UXTX,
+       AArch64_AM::SXTB, AArch64_AM::SXTH, AArch64_AM::SXTW, AArch64_AM::SXTX,
+   };
+   if (Imm >= sizeof(Extends) / sizeof(Extends[0]))
+     llvm_unreachable("Compiler bug!");
+   return Extends[Imm];
+ }
+
+ /// getArithExtendType - Extract the extend kind stored in bits {5-3} of an
+ /// arithmetic-extend immediate.
+ static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) {
+   const unsigned ExtendBits = (Imm >> 3) & 0x7;
+   return getExtendType(ExtendBits);
+ }
+
+ /// getExtendEncoding - Return the 3-bit encoding of an extend kind:
+ ///   000 uxtb, 001 uxth, 010 uxtw, 011 uxtx,
+ ///   100 sxtb, 101 sxth, 110 sxtw, 111 sxtx.
+ /// Passing anything that is not an extend kind is a programming error.
+ inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) {
+   switch (ET) {
+   case AArch64_AM::UXTB: return 0;
+   case AArch64_AM::UXTH: return 1;
+   case AArch64_AM::UXTW: return 2;
+   case AArch64_AM::UXTX: return 3;
+   case AArch64_AM::SXTB: return 4;
+   case AArch64_AM::SXTH: return 5;
+   case AArch64_AM::SXTW: return 6;
+   case AArch64_AM::SXTX: return 7;
+   default: break;
+   }
+   llvm_unreachable("Invalid extend type requested");
+ }
+
+ /// getArithExtendImm - Pack an extend kind and a 3-bit shift amount for an
+ /// arithmetic instruction:
+ ///   {5-3} = extend encoding
+ ///   {2-0} = imm3
+ /// \p Imm must fit in three bits.
+ static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET,
+                                          unsigned Imm) {
+   assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!");
+   const unsigned ExtendBits = getExtendEncoding(ET) << 3;
+   const unsigned AmountBits = Imm & 0x7;
+   return ExtendBits | AmountBits;
+ }
+
+ /// getMemDoShift - Return the "do shift" flag of a load/store extend
+ /// immediate, which is stored in bit 0.
+ static inline bool getMemDoShift(unsigned Imm) {
+   const unsigned DoShiftBit = Imm & 0x1;
+   return DoShiftBit == 1;
+ }
+
+ /// getMemExtendType - Extract the extend kind of a load/store offset operand,
+ /// which is stored in bits {3-1} of \p Imm.
+ static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) {
+   const unsigned ExtendBits = (Imm >> 1) & 0x7;
+   return getExtendType(ExtendBits);
+ }
+
+ /// getMemExtendImm - Pack an extend kind and the "do shift" flag for a
+ /// load/store instruction:
+ ///   {3-1} = extend encoding (000 uxtb ... 111 sxtx, see getExtendEncoding)
+ ///   {0}   = doshift: should the offset be scaled by the access size
+ static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET,
+                                        bool DoShift) {
+   const unsigned ExtendBits = getExtendEncoding(ET) << 1;
+   const unsigned DoShiftBit = DoShift ? 1u : 0u;
+   return ExtendBits | DoShiftBit;
+ }
+
+ /// ror - Rotate \p elt right by one bit position within a field of \p size
+ /// bits: bit 0 moves to bit size-1 and everything else shifts down by one.
+ static inline uint64_t ror(uint64_t elt, unsigned size) {
+   const uint64_t WrappedBit = (elt & 1) << (size - 1);
+   const uint64_t Shifted = elt >> 1;
+   return WrappedBit | Shifted;
+ }
+
+ /// processLogicalImmediate - Determine if an immediate value can be encoded
+ /// as the immediate operand of a logical instruction for the given register
+ /// size. If so, return true with Encoding set to the encoded value in
+ /// the form N:immr:imms; otherwise return false and leave Encoding untouched.
+ static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize,
+ uint64_t &Encoding) {
+ if (Imm == 0ULL || Imm == ~0ULL ||
+ (RegSize != 64 && (Imm >> RegSize != 0 || Imm == ~0U))) // for narrower registers: bits above RegSize, or the 32-bit all-ones value
+ return false; // all-zeros and all-ones are never encodable
+
+ // First, determine the element size: keep halving while the two lowest Size-bit chunks are equal.
+ unsigned Size = RegSize;
+
+ do {
+ Size /= 2;
+ uint64_t Mask = (1ULL << Size) - 1;
+
+ if ((Imm & Mask) != ((Imm >> Size) & Mask)) {
+ Size *= 2; // the chunks differ, so the previous (doubled) size is the element size
+ break;
+ }
+ } while (Size > 2);
+
+ // Second, determine the rotation to make the element be: 0^m 1^n.
+ uint32_t CTO, I; // CTO: length of the run of ones; I: rotation count (see below)
+ uint64_t Mask = ((uint64_t)-1LL) >> (64 - Size);
+ Imm &= Mask; // keep a single element
+
+ if (isShiftedMask_64(Imm)) {
+ I = countTrailingZeros(Imm);
+ assert(I < 64 && "undefined behavior");
+ CTO = countTrailingOnes(Imm >> I);
+ } else {
+ Imm |= ~Mask; // set every bit above the element, then test the complement instead
+ if (!isShiftedMask_64(~Imm))
+ return false;
+
+ unsigned CLO = countLeadingOnes(Imm);
+ I = 64 - CLO;
+ CTO = CLO + countTrailingOnes(Imm) - (64 - Size); // subtract the 64 - Size filler ones added above
+ }
+
+ // Encode in Immr the number of RORs it would take to get *from* 0^m 1^n
+ // to our target value, where I is the number of RORs to go the opposite
+ // direction.
+ assert(Size > I && "I should be smaller than element size");
+ unsigned Immr = (Size - I) & (Size - 1);
+
+ // If size has a 1 in the n'th bit, create a value that has zeroes in
+ // bits [0, n] and ones above that.
+ uint64_t NImms = ~(Size-1) << 1;
+
+ // Or the CTO value into the low bits, which must be below the Nth
+ // bit mentioned above.
+ NImms |= (CTO-1);
+
+ // Extract the seventh bit and toggle it to create the N field.
+ unsigned N = ((NImms >> 6) & 1) ^ 1;
+
+ Encoding = (N << 12) | (Immr << 6) | (NImms & 0x3f); // N in bit 12, immr in bits 11-6, imms in bits 5-0
+ return true;
+ }
+
+ /// isLogicalImmediate - Return true if \p imm can be encoded as the immediate
+ /// of a logical instruction operating on \p regSize-bit registers, and false
+ /// otherwise. The encoding itself is computed and discarded.
+ static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) {
+   uint64_t IgnoredEncoding;
+   const bool Encodable = processLogicalImmediate(imm, regSize, IgnoredEncoding);
+   return Encodable;
+ }
+
+ /// encodeLogicalImmediate - Return the N:immr:imms encoding of \p imm for a
+ /// logical instruction operating on \p regSize-bit registers. The value must
+ /// be encodable (asserted); when asserts are disabled and it is not, 0 is
+ /// returned.
+ static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) {
+   uint64_t Encoded = 0;
+   const bool Encodable = processLogicalImmediate(imm, regSize, Encoded);
+   assert(Encodable && "invalid logical immediate");
+   (void)Encodable; // only read by the assert
+   return Encoded;
+ }
+
+ /// decodeLogicalImmediate - Decode a logical immediate value in the form
+ /// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the
+ /// integer value it represents with regSize bits.
+ ///
+ /// \p val must be a valid encoding for \p regSize (see
+ /// isValidDecodeLogicalImmediate); this is asserted. The element is built as a
+ /// run of S+1 ones, rotated right R times within the element, and then
+ /// replicated until it fills regSize bits.
+ static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) {
+   // Extract the N, imms, and immr fields.
+   unsigned N = (val >> 12) & 1;
+   unsigned immr = (val >> 6) & 0x3f;
+   unsigned imms = val & 0x3f;
+
+   assert((regSize == 64 || N == 0) && "undefined logical immediate encoding");
+   // The position of the highest set bit of N:NOT(imms) selects the element
+   // size (1 << len).
+   int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+   assert(len >= 0 && "undefined logical immediate encoding");
+   unsigned size = (1 << len);
+   unsigned R = immr & (size - 1);
+   unsigned S = imms & (size - 1);
+   assert(S != size - 1 && "undefined logical immediate encoding");
+   uint64_t pattern = (1ULL << (S + 1)) - 1;
+   for (unsigned i = 0; i < R; ++i)
+     pattern = ror(pattern, size);
+
+   // Replicate the pattern to fill the regSize. The comparison is '<' rather
+   // than '!=' so that an invalid encoding whose element size exceeds regSize
+   // cannot loop forever (with out-of-range shifts) when asserts are disabled.
+   while (size < regSize) {
+     pattern |= (pattern << size);
+     size *= 2;
+   }
+   return pattern;
+ }
+
+ /// isValidDecodeLogicalImmediate - Return true if \p val, in the form
+ /// "N:immr:imms" (immr and imms are 6 bits each), is a defined encoding for an
+ /// integer value with \p regSize bits.
+ static inline bool isValidDecodeLogicalImmediate(uint64_t val,
+                                                  unsigned regSize) {
+   // Only N and imms take part in the validity check.
+   const unsigned N = (val >> 12) & 1;
+   const unsigned imms = val & 0x3f;
+
+   // N set is undefined for 32-bit registers.
+   if (regSize == 32 && N != 0)
+     return false;
+
+   // N:NOT(imms) must have at least one bit set.
+   const int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+   if (len < 0)
+     return false;
+
+   // Within the element, the S field must not be all ones.
+   const unsigned size = (1 << len);
+   const unsigned S = imms & (size - 1);
+   return S != size - 1;
+ }
+
+//===----------------------------------------------------------------------===//
+// Floating-point Immediates
+//
+ /// getFPImmFloat - Expand an 8-bit floating-point immediate to the float it
+ /// denotes.
+ ///
+ ///   8-bit FP    IEEE Float Encoding
+ ///   abcd efgh   aBbbbbbc defgh000 00000000 00000000
+ ///
+ /// where B = NOT(b). Only the low eight bits of \p Imm are used.
+ static inline float getFPImmFloat(unsigned Imm) {
+   // Keep every field unsigned so no shift ever touches a signed sign bit.
+   const uint32_t Sign = (Imm >> 7) & 0x1;
+   const uint32_t Exp = (Imm >> 4) & 0x7;
+   const uint32_t Mantissa = Imm & 0xf;
+   const bool BIsSet = (Exp & 0x4) != 0;
+
+   uint32_t Bits = 0;
+   Bits |= Sign << 31;
+   Bits |= (BIsSet ? 0u : 1u) << 30;    // B = NOT(b)
+   Bits |= (BIsSet ? 0x1fu : 0u) << 25; // b replicated five times
+   Bits |= (Exp & 0x3) << 23;           // c:d
+   Bits |= Mantissa << 19;              // e:f:g:h
+
+   // Reinterpret the bit pattern through memcpy: reading the inactive member
+   // of a union is undefined behaviour in ISO C++.
+   float Result;
+   static_assert(sizeof(Result) == sizeof(Bits), "float must be 32 bits wide");
+   std::memcpy(&Result, &Bits, sizeof(Result));
+   return Result;
+ }
+
+ /// getFP16Imm - Return the 8-bit floating-point immediate encoding of the
+ /// 16-bit floating-point bit pattern \p Imm, or -1 if the value cannot be
+ /// represented in that form.
+ static inline int getFP16Imm(const APInt &Imm) {
+   const uint32_t SignBit = Imm.lshr(15).getZExtValue() & 1;
+   const int32_t UnbiasedExp =
+       (Imm.lshr(10).getSExtValue() & 0x1f) - 15;       // -14 to 15
+   const int32_t Fraction = Imm.getZExtValue() & 0x3ff; // 10 bits
+
+   // Only 4 bits of mantissa are available, so the low six fraction bits must
+   // be zero: mantissa = (16+UInt(e:f:g:h))/16.
+   const bool FractionFits = (Fraction & 0x3f) == 0;
+   // Only 3 bits of exponent are available: exp == UInt(NOT(b):c:d)-3.
+   const bool ExponentFits = UnbiasedExp >= -3 && UnbiasedExp <= 4;
+   if (!FractionFits || !ExponentFits)
+     return -1;
+
+   const int32_t Frac4 = Fraction >> 6;
+   const int32_t Exp3 = ((UnbiasedExp + 3) & 0x7) ^ 4;
+   return ((int)SignBit << 7) | (Exp3 << 4) | Frac4;
+ }
+
+ /// getFP16Imm - APFloat convenience overload: encodes the bit pattern of
+ /// \p FPImm with the APInt overload.
+ static inline int getFP16Imm(const APFloat &FPImm) {
+   const APInt Bits = FPImm.bitcastToAPInt();
+   return getFP16Imm(Bits);
+ }
+
+ /// getFP32Imm - Return the 8-bit floating-point immediate encoding of the
+ /// 32-bit floating-point bit pattern \p Imm, or -1 if the value cannot be
+ /// represented in that form.
+ static inline int getFP32Imm(const APInt &Imm) {
+   const uint32_t SignBit = Imm.lshr(31).getZExtValue() & 1;
+   const int32_t UnbiasedExp =
+       (Imm.lshr(23).getSExtValue() & 0xff) - 127;         // -126 to 127
+   const int64_t Fraction = Imm.getZExtValue() & 0x7fffff; // 23 bits
+
+   // Only 4 bits of mantissa are available, so the low 19 fraction bits must
+   // be zero: mantissa = (16+UInt(e:f:g:h))/16.
+   if ((Fraction & 0x7ffff) != 0)
+     return -1;
+   const int64_t Frac4 = Fraction >> 19;
+   if ((Frac4 & 0xf) != Frac4)
+     return -1;
+
+   // Only 3 bits of exponent are available: exp == UInt(NOT(b):c:d)-3.
+   if (UnbiasedExp < -3 || UnbiasedExp > 4)
+     return -1;
+   const int32_t Exp3 = ((UnbiasedExp + 3) & 0x7) ^ 4;
+
+   return ((int)SignBit << 7) | (Exp3 << 4) | Frac4;
+ }
+
+ /// getFP32Imm - APFloat convenience overload: encodes the bit pattern of
+ /// \p FPImm with the APInt overload.
+ static inline int getFP32Imm(const APFloat &FPImm) {
+   const APInt Bits = FPImm.bitcastToAPInt();
+   return getFP32Imm(Bits);
+ }
+
+ /// getFP64Imm - Return the 8-bit floating-point immediate encoding of the
+ /// 64-bit floating-point bit pattern \p Imm, or -1 if the value cannot be
+ /// represented in that form.
+ static inline int getFP64Imm(const APInt &Imm) {
+   const uint64_t SignBit = Imm.lshr(63).getZExtValue() & 1;
+   const int64_t UnbiasedExp =
+       (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
+   const uint64_t Fraction = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+   // Only 4 bits of mantissa are available, so the low 48 fraction bits must
+   // be zero: mantissa = (16+UInt(e:f:g:h))/16.
+   if ((Fraction & 0xffffffffffffULL) != 0)
+     return -1;
+   const uint64_t Frac4 = Fraction >> 48;
+   if ((Frac4 & 0xf) != Frac4)
+     return -1;
+
+   // Only 3 bits of exponent are available: exp == UInt(NOT(b):c:d)-3.
+   if (UnbiasedExp < -3 || UnbiasedExp > 4)
+     return -1;
+   const int64_t Exp3 = ((UnbiasedExp + 3) & 0x7) ^ 4;
+
+   return ((int)SignBit << 7) | (Exp3 << 4) | Frac4;
+ }
+
+ /// getFP64Imm - APFloat convenience overload: encodes the bit pattern of
+ /// \p FPImm with the APInt overload.
+ static inline int getFP64Imm(const APFloat &FPImm) {
+   const APInt Bits = FPImm.bitcastToAPInt();
+   return getFP64Imm(Bits);
+ }
+
+//===--------------------------------------------------------------------===//
+// AdvSIMD Modified Immediates
+//===--------------------------------------------------------------------===//
+
+ // Type 1 layout: 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh
+
+ /// True when both 32-bit halves of \p Imm are equal and only byte 0 of each
+ /// half may be non-zero.
+ static inline bool isAdvSIMDModImmType1(uint64_t Imm) {
+   const uint64_t Hi = Imm >> 32;
+   const uint64_t Lo = Imm & 0xffffffffULL;
+   if (Hi != Lo)
+     return false;
+   return (Imm & 0xffffff00ffffff00ULL) == 0;
+ }
+
+ /// Extract the abcdefgh byte (bits 7-0) of a type 1 immediate.
+ static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) {
+   const uint64_t Byte = Imm & 0xffULL;
+   return Byte;
+ }
+
+ /// Rebuild the 64-bit type 1 immediate from its abcdefgh byte.
+ static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) {
+   const uint64_t Byte = Imm;
+   return Byte | (Byte << 32);
+ }
+
+ // Type 2 layout: 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00
+
+ /// True when both 32-bit halves of \p Imm are equal and only byte 1 of each
+ /// half may be non-zero.
+ static inline bool isAdvSIMDModImmType2(uint64_t Imm) {
+   const uint64_t Hi = Imm >> 32;
+   const uint64_t Lo = Imm & 0xffffffffULL;
+   if (Hi != Lo)
+     return false;
+   return (Imm & 0xffff00ffffff00ffULL) == 0;
+ }
+
+ /// Extract the abcdefgh byte (bits 15-8) of a type 2 immediate.
+ static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) {
+   const uint64_t Byte = (Imm & 0xff00ULL) >> 8;
+   return Byte;
+ }
+
+ /// Rebuild the 64-bit type 2 immediate from its abcdefgh byte.
+ static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) {
+   const uint64_t Byte = Imm;
+   return (Byte << 8) | (Byte << 40);
+ }
+
+ // Type 3 layout: 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00
+
+ /// True when both 32-bit halves of \p Imm are equal and only byte 2 of each
+ /// half may be non-zero.
+ static inline bool isAdvSIMDModImmType3(uint64_t Imm) {
+   const uint64_t Hi = Imm >> 32;
+   const uint64_t Lo = Imm & 0xffffffffULL;
+   if (Hi != Lo)
+     return false;
+   return (Imm & 0xff00ffffff00ffffULL) == 0;
+ }
+
+ /// Extract the abcdefgh byte (bits 23-16) of a type 3 immediate.
+ static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) {
+   const uint64_t Byte = (Imm & 0xff0000ULL) >> 16;
+   return Byte;
+ }
+
+ /// Rebuild the 64-bit type 3 immediate from its abcdefgh byte.
+ static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) {
+   const uint64_t Byte = Imm;
+   return (Byte << 16) | (Byte << 48);
+ }
+
+ // Type 4 layout: abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00
+
+ /// True when both 32-bit halves of \p Imm are equal and only byte 3 of each
+ /// half may be non-zero.
+ static inline bool isAdvSIMDModImmType4(uint64_t Imm) {
+   const uint64_t Hi = Imm >> 32;
+   const uint64_t Lo = Imm & 0xffffffffULL;
+   if (Hi != Lo)
+     return false;
+   return (Imm & 0x00ffffff00ffffffULL) == 0;
+ }
+
+ /// Extract the abcdefgh byte (bits 31-24) of a type 4 immediate.
+ static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) {
+   const uint64_t Byte = (Imm & 0xff000000ULL) >> 24;
+   return Byte;
+ }
+
+ /// Rebuild the 64-bit type 4 immediate from its abcdefgh byte.
+ static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) {
+   const uint64_t Byte = Imm;
+   return (Byte << 24) | (Byte << 56);
+ }
+
+// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh
+static inline bool isAdvSIMDModImmType5(uint64_t Imm) { // True iff Imm has the byte layout shown above.
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) && // Upper and lower 32-bit words are identical,
+ (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) && // bits 16-23 equal bits 0-7 within the low word,
+ ((Imm & 0xff00ff00ff00ff00ULL) == 0); // and the upper byte of every 16-bit lane is zero.
+}
+
+static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) { // Extract the 8-bit payload; Imm is not validated here.
+ return (Imm & 0xffULL); // Payload is bits 0-7.
+}
+
+static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) { // Rebuild the 64-bit value from its 8-bit payload.
+ uint64_t EncVal = Imm; // Widen first so the shifts below happen in 64 bits.
+ return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal; // Payload in the low byte of each of the four 16-bit lanes.
+}
+
+// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00
+static inline bool isAdvSIMDModImmType6(uint64_t Imm) { // True iff Imm has the byte layout shown above.
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) && // Upper and lower 32-bit words are identical,
+ (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) && // bits 24-31 equal bits 8-15 within the low word,
+ ((Imm & 0x00ff00ff00ff00ffULL) == 0); // and the lower byte of every 16-bit lane is zero.
+}
+
+static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) { // Extract the 8-bit payload; Imm is not validated here.
+ return (Imm & 0xff00ULL) >> 8; // Payload is bits 8-15.
+}
+
+static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) { // Rebuild the 64-bit value from its 8-bit payload.
+ uint64_t EncVal = Imm; // Widen first so the shifts below happen in 64 bits.
+ return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8); // Payload in the high byte of each of the four 16-bit lanes.
+}
+
+// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF
+static inline bool isAdvSIMDModImmType7(uint64_t Imm) { // True iff Imm has the byte layout shown above.
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) && // Upper and lower 32-bit words are identical...
+ ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL); // ...bits 0-7 of each word are all ones, bits 16-31 are zero, bits 8-15 are free.
+}
+
+static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) { // Extract the 8-bit payload; Imm is not validated here.
+ return (Imm & 0xff00ULL) >> 8; // Payload is bits 8-15.
+}
+
+static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) { // Rebuild the 64-bit value from its 8-bit payload.
+ uint64_t EncVal = Imm; // Widen first so the shifts below happen in 64 bits.
+ return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL; // Payload at bits 8-15 / 40-47, with the fixed 0xFF byte below it in each word.
+}
+
+// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF
+static inline bool isAdvSIMDModImmType8(uint64_t Imm) { // True iff Imm has the byte layout shown above.
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) && // Upper and lower 32-bit words are identical...
+ ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL); // ...bits 0-15 of each word are all ones, bits 24-31 are zero, bits 16-23 are free.
+}
+
+static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) { // Rebuild the 64-bit value from its 8-bit payload.
+ uint64_t EncVal = Imm; // Widen first so the shifts below happen in 64 bits.
+ return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL; // Payload at bits 16-23 / 48-55, with the two fixed 0xFF bytes below it in each word.
+}
+
+static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) { // Extract the 8-bit payload; Imm is not validated here.
+ return (Imm & 0x00ff0000ULL) >> 16; // Payload is bits 16-23.
+}
+
+// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh
+static inline bool isAdvSIMDModImmType9(uint64_t Imm) { // True iff all eight bytes of Imm are equal.
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) && // Top 32 bits match the bottom 32,
+ ((Imm >> 48) == (Imm & 0x0000ffffULL)) && // top 16 bits match the bottom 16,
+ ((Imm >> 56) == (Imm & 0x000000ffULL)); // top byte matches the bottom byte; together these force every byte to agree.
+}
+
+static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) { // Extract the 8-bit payload; Imm is not validated here.
+ return (Imm & 0xffULL); // Any byte would do for a valid value; the lowest one is used.
+}
+
+static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) { // Replicate the payload byte into all eight bytes.
+ uint64_t EncVal = Imm;
+ EncVal |= (EncVal << 8); // 1 byte -> 2 bytes.
+ EncVal |= (EncVal << 16); // 2 bytes -> 4 bytes.
+ EncVal |= (EncVal << 32); // 4 bytes -> 8 bytes.
+ return EncVal;
+}
+
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// cmode: 1110, op: 1
+static inline bool isAdvSIMDModImmType10(uint64_t Imm) { // True iff every byte of Imm is either 0x00 or 0xFF.
+ uint64_t ByteA = Imm & 0xff00000000000000ULL; // Most significant byte, kept in place (not shifted down).
+ uint64_t ByteB = Imm & 0x00ff000000000000ULL;
+ uint64_t ByteC = Imm & 0x0000ff0000000000ULL;
+ uint64_t ByteD = Imm & 0x000000ff00000000ULL;
+ uint64_t ByteE = Imm & 0x00000000ff000000ULL;
+ uint64_t ByteF = Imm & 0x0000000000ff0000ULL;
+ uint64_t ByteG = Imm & 0x000000000000ff00ULL;
+ uint64_t ByteH = Imm & 0x00000000000000ffULL; // Least significant byte.
+
+ return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) && // Each isolated byte must be all-zeros or all-ones.
+ (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) &&
+ (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) &&
+ (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) &&
+ (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) &&
+ (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) &&
+ (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) &&
+ (ByteH == 0ULL || ByteH == 0x00000000000000ffULL);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) { // Collapse each byte of Imm into one bit; Imm is not validated here.
+ uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0; // 1 if any bit of the most significant byte is set.
+ uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0;
+ uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0;
+ uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0;
+ uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0;
+ uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0;
+ uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0;
+ uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0; // Least significant byte.
+
+ uint8_t EncVal = BitA; // Assemble MSB-first: BitA ends up in bit 7, BitH in bit 0.
+ EncVal <<= 1;
+ EncVal |= BitB;
+ EncVal <<= 1;
+ EncVal |= BitC;
+ EncVal <<= 1;
+ EncVal |= BitD;
+ EncVal <<= 1;
+ EncVal |= BitE;
+ EncVal <<= 1;
+ EncVal |= BitF;
+ EncVal <<= 1;
+ EncVal |= BitG;
+ EncVal <<= 1;
+ EncVal |= BitH;
+ return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) { // Expand each bit of Imm into a full 0x00/0xFF byte.
+ uint64_t EncVal = 0;
+ if (Imm & 0x80) EncVal |= 0xff00000000000000ULL; // Bit 7 controls the most significant byte.
+ if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL;
+ if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL;
+ if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL;
+ if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL;
+ if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL;
+ if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL;
+ if (Imm & 0x01) EncVal |= 0x00000000000000ffULL; // Bit 0 controls the least significant byte.
+ return EncVal;
+}
+
+// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00
+static inline bool isAdvSIMDModImmType11(uint64_t Imm) { // True iff Imm has the bit layout shown above in both 32-bit words.
+ uint64_t BString = (Imm & 0x7E000000ULL) >> 25; // Bits 25-30 of the low word: the "Bbbbbb" run.
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) && // Upper and lower 32-bit words are identical,
+ (BString == 0x1f || BString == 0x20) && // the run is 011111 or 100000 (bit 30 is the inverse of bits 25-29),
+ ((Imm & 0x0007ffff0007ffffULL) == 0); // and the low 19 bits of each word are zero.
+}
+
+static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) { // Pack bits a..h of the low 32-bit word into one byte; Imm is not validated here.
+ uint8_t BitA = (Imm & 0x80000000ULL) != 0; // Bit 31.
+ uint8_t BitB = (Imm & 0x20000000ULL) != 0; // Bit 29: one representative of the replicated run (bit 30 is skipped).
+ uint8_t BitC = (Imm & 0x01000000ULL) != 0; // Bit 24.
+ uint8_t BitD = (Imm & 0x00800000ULL) != 0;
+ uint8_t BitE = (Imm & 0x00400000ULL) != 0;
+ uint8_t BitF = (Imm & 0x00200000ULL) != 0;
+ uint8_t BitG = (Imm & 0x00100000ULL) != 0;
+ uint8_t BitH = (Imm & 0x00080000ULL) != 0; // Bit 19.
+
+ uint8_t EncVal = BitA; // Assemble MSB-first: BitA ends up in bit 7, BitH in bit 0.
+ EncVal <<= 1;
+ EncVal |= BitB;
+ EncVal <<= 1;
+ EncVal |= BitC;
+ EncVal <<= 1;
+ EncVal |= BitD;
+ EncVal <<= 1;
+ EncVal |= BitE;
+ EncVal <<= 1;
+ EncVal |= BitF;
+ EncVal <<= 1;
+ EncVal |= BitG;
+ EncVal <<= 1;
+ EncVal |= BitH;
+ return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) { // Expand the 8-bit payload to the 32-bit pattern, replicated into both words.
+ uint64_t EncVal = 0;
+ if (Imm & 0x80) EncVal |= 0x80000000ULL; // a -> bit 31.
+ if (Imm & 0x40) EncVal |= 0x3e000000ULL; // b set: bits 25-29 all ones, bit 30 left clear.
+ else EncVal |= 0x40000000ULL; // b clear: only bit 30 (the inverted copy) is set.
+ if (Imm & 0x20) EncVal |= 0x01000000ULL; // c -> bit 24.
+ if (Imm & 0x10) EncVal |= 0x00800000ULL;
+ if (Imm & 0x08) EncVal |= 0x00400000ULL;
+ if (Imm & 0x04) EncVal |= 0x00200000ULL;
+ if (Imm & 0x02) EncVal |= 0x00100000ULL;
+ if (Imm & 0x01) EncVal |= 0x00080000ULL; // h -> bit 19.
+ return (EncVal << 32) | EncVal; // Same 32-bit pattern in the upper and lower word.
+}
+
+// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00
+static inline bool isAdvSIMDModImmType12(uint64_t Imm) { // True iff Imm has the 64-bit layout shown above.
+ uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54; // Bits 54-62: the "Bbbbbbbbb" run.
+ return ((BString == 0xff || BString == 0x100) && // The run is 011111111 or 100000000 (bit 62 is the inverse of bits 54-61),
+ ((Imm & 0x0000ffffffffffffULL) == 0)); // and the low 48 bits are zero.
+}
+
+static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) { // Pack bits a..h of the 64-bit pattern into one byte; Imm is not validated here.
+ uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0; // Bit 63.
+ uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0; // Bit 54: one representative of the replicated run.
+ uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0; // Bit 53.
+ uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0;
+ uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0;
+ uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0;
+ uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0;
+ uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0; // Bit 48.
+
+ uint8_t EncVal = BitA; // Assemble MSB-first: BitA ends up in bit 7, BitH in bit 0.
+ EncVal <<= 1;
+ EncVal |= BitB;
+ EncVal <<= 1;
+ EncVal |= BitC;
+ EncVal <<= 1;
+ EncVal |= BitD;
+ EncVal <<= 1;
+ EncVal |= BitE;
+ EncVal <<= 1;
+ EncVal |= BitF;
+ EncVal <<= 1;
+ EncVal |= BitG;
+ EncVal <<= 1;
+ EncVal |= BitH;
+ return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { // Expand the 8-bit payload to the 64-bit pattern.
+ uint64_t EncVal = 0;
+ if (Imm & 0x80) EncVal |= 0x8000000000000000ULL; // a -> bit 63.
+ if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL; // b set: bits 54-61 all ones, bit 62 left clear.
+ else EncVal |= 0x4000000000000000ULL; // b clear: only bit 62 (the inverted copy) is set.
+ if (Imm & 0x20) EncVal |= 0x0020000000000000ULL; // c -> bit 53.
+ if (Imm & 0x10) EncVal |= 0x0010000000000000ULL;
+ if (Imm & 0x08) EncVal |= 0x0008000000000000ULL;
+ if (Imm & 0x04) EncVal |= 0x0004000000000000ULL;
+ if (Imm & 0x02) EncVal |= 0x0002000000000000ULL;
+ if (Imm & 0x01) EncVal |= 0x0001000000000000ULL; // h -> bit 48.
+ return (EncVal << 32) | EncVal; // Every bit set above is at bit 48 or higher, so the shifted term is always zero and this equals EncVal.
+}
+
+inline static bool isAnyMOVZMovAlias(uint64_t Value, int RegWidth) { // True iff all set bits of Value lie in one aligned 16-bit chunk of the register.
+ for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16) // Try shifts 0, 16, ... up to RegWidth - 16.
+ if ((Value & ~(0xffffULL << Shift)) == 0) // Nothing set outside the chunk at Shift (Value == 0 matches at Shift 0).
+ return true;
+
+ return false;
+}
+
+inline static bool isMOVZMovAlias(uint64_t Value, int Shift, int RegWidth) { // True iff Value is exactly a 16-bit chunk placed at Shift.
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL; // Only the low 32 bits matter for a 32-bit register.
+
+ // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0".
+ if (Value == 0 && Shift != 0)
+ return false;
+
+ return (Value & ~(0xffffULL << Shift)) == 0; // No bits set outside the 16-bit chunk at Shift.
+}
+
+inline static bool isMOVNMovAlias(uint64_t Value, int Shift, int RegWidth) { // True iff ~Value is a 16-bit chunk at Shift and Value itself is not MOVZ-representable.
+ // MOVZ takes precedence over MOVN.
+ if (isAnyMOVZMovAlias(Value, RegWidth))
+ return false;
+
+ Value = ~Value; // Test the inverted value with the MOVZ rule.
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL; // Discard the inverted upper half for a 32-bit register.
+
+ return isMOVZMovAlias(Value, Shift, RegWidth);
+}
+
+inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) { // True iff Value or its bitwise inverse fits a single aligned 16-bit chunk.
+ if (isAnyMOVZMovAlias(Value, RegWidth))
+ return true;
+
+ // It's not a MOVZ, but it might be a MOVN.
+ Value = ~Value;
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL; // Discard the inverted upper half for a 32-bit register.
+
+ return isAnyMOVZMovAlias(Value, RegWidth);
+}
+
+} // end namespace AArch64_AM
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
new file mode 100644
index 000000000000..14c0327f5fa8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -0,0 +1,612 @@
+//===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+
+class AArch64AsmBackend : public MCAsmBackend { // Shared AArch64 assembler backend: fixup metadata, fixup application and NOP padding.
+ static const unsigned PCRelFlagVal = // Flag set used by every PC-relative entry in the fixup table below.
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
+public:
+ bool IsLittleEndian; // Target byte-order flag; read by getFixupKindContainereSizeInBytes().
+
+public:
+ AArch64AsmBackend(const Target &T, bool IsLittleEndian) // T is accepted but not used by this constructor.
+ : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
+
+ unsigned getNumFixupKinds() const override { // Number of target-specific fixup kinds.
+ return AArch64::NumTargetFixupKinds;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { // Name, bit offset, bit size and flags for Kind.
+ const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
+ // This table *must* be in the order that the fixup_* kinds are defined in
+ // AArch64FixupKinds.h.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
+ { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
+ { "fixup_aarch64_add_imm12", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 },
+ { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal },
+ { "fixup_aarch64_movw", 5, 16, 0 },
+ { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal },
+ { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal },
+ { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal },
+ { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal },
+ { "fixup_aarch64_tlsdesc_call", 0, 0, 0 }
+ };
+
+ if (Kind < FirstTargetFixupKind) // Generic kinds are described by the base class.
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind]; // Target kinds index the table relative to FirstTargetFixupKind.
+ }
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override;
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override;
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override;
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+ void HandleAssemblerFlag(MCAssemblerFlag Flag) {} // Intentionally a no-op; note it is not declared as an override.
+
+ unsigned getPointerSize() const { return 8; } // Always reports 8 bytes.
+
+ unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const; // Big-endian container size for data fixups, or 0 (see the definition).
+};
+
+} // end anonymous namespace
+
+/// \brief The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) { // How many bytes, starting at the fixup offset, applyFixup() ORs into for Kind.
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+
+ case AArch64::fixup_aarch64_tlsdesc_call:
+ return 0; // Touches no bytes.
+
+ case FK_Data_1:
+ return 1;
+
+ case FK_Data_2:
+ case AArch64::fixup_aarch64_movw: // NOTE(review): the fixup table lists movw as offset 5 / size 16 (bits 5-20), which reaches a third byte; confirm before resolved movw fixups are supported.
+ return 2;
+
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ case AArch64::fixup_aarch64_add_imm12:
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ return 3; // These fields start at bit 5 or 10 and end at or below bit 23.
+
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ case FK_Data_4:
+ return 4;
+
+ case FK_Data_8:
+ return 8;
+ }
+}
+
+static unsigned AdrImmBits(unsigned Value) { // Scatter a 21-bit immediate into the split field used by the adr/adrp fixups.
+ unsigned lo2 = Value & 0x3; // Low two bits of the immediate.
+ unsigned hi19 = (Value & 0x1ffffc) >> 2; // Remaining 19 bits (bits 2-20).
+ return (hi19 << 5) | (lo2 << 29); // hi19 lands in bits 5-23, lo2 in bits 29-30.
+}
+
+static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+ MCContext *Ctx) { // Convert a fixup Value into the field bits for its kind; diagnostics go to Ctx only when it is non-null.
+ unsigned Kind = Fixup.getKind();
+ int64_t SignedValue = static_cast<int64_t>(Value); // Signed view used by the range checks of the PC-relative kinds.
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152)) // Must fit a signed 21-bit immediate.
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ return AdrImmBits(Value & 0x1fffffULL);
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ return AdrImmBits((Value & 0x1fffff000ULL) >> 12); // Uses bits 12-32 of Value (the low 12 bits are dropped); no range check is made for this kind.
+ case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ // Signed 21-bit immediate
+ if (SignedValue > 2097151 || SignedValue < -2097152)
+ if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ // Low two bits are not encoded.
+ return (Value >> 2) & 0x7ffff; // Keep 19 bits.
+ case AArch64::fixup_aarch64_add_imm12:
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ // Unsigned 12-bit immediate
+ if (Ctx && Value >= 0x1000)
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ return Value; // Returned unmasked, even when out of range.
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ // Unsigned 12-bit immediate which gets multiplied by 2
+ if (Ctx && (Value >= 0x2000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x1))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
+ return Value >> 1;
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ // Unsigned 12-bit immediate which gets multiplied by 4
+ if (Ctx && (Value >= 0x4000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
+ return Value >> 2;
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ // Unsigned 12-bit immediate which gets multiplied by 8
+ if (Ctx && (Value >= 0x8000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x7))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
+ return Value >> 3;
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ // Unsigned 12-bit immediate which gets multiplied by 16
+ if (Ctx && (Value >= 0x10000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0xf))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
+ return Value >> 4;
+ case AArch64::fixup_aarch64_movw:
+ if (Ctx) // Whenever a context is supplied this kind is reported as unsupported.
+ Ctx->reportError(Fixup.getLoc(),
+ "no resolvable MOVZ/MOVK fixups supported yet");
+ return Value; // Passed through unchanged.
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ // Signed 16-bit immediate
+ if (Ctx && (SignedValue > 32767 || SignedValue < -32768))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ // Low two bits are not encoded (4-byte alignment assumed).
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ return (Value >> 2) & 0x3fff; // Keep 14 bits.
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ // Signed 28-bit immediate
+ if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ // Low two bits are not encoded (4-byte alignment assumed).
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ return (Value >> 2) & 0x3ffffff; // Keep 26 bits.
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ return Value; // Plain data fixups need no adjustment.
+ }
+}
+
+/// getFixupKindContainereSizeInBytes - The number of bytes of the
+/// container involved in big endian or 0 if the item is little endian
+unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const { // Non-zero only for data fixups on a big-endian backend.
+ if (IsLittleEndian)
+ return 0; // Little-endian backend: applyFixup() always takes its little-endian path.
+
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+
+ case FK_Data_1:
+ return 1;
+ case FK_Data_2:
+ return 2;
+ case FK_Data_4:
+ return 4;
+ case FK_Data_8:
+ return 8; // Data fixups: the container is as wide as the datum.
+
+ case AArch64::fixup_aarch64_tlsdesc_call:
+ case AArch64::fixup_aarch64_movw:
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ case AArch64::fixup_aarch64_add_imm12:
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ // Instructions are always little endian
+ return 0;
+ }
+}
+
+void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const { // OR the encoded fixup Value into Data at the fixup's offset; IsPCRel is not consulted.
+ unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+ if (!Value)
+ return; // Doesn't change encoding.
+ MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+ // Apply any target-specific value adjustments.
+ Value = adjustFixupValue(Fixup, Value, nullptr); // Null context: no diagnostics are emitted from this call.
+
+ // Shift the value into position.
+ Value <<= Info.TargetOffset;
+
+ unsigned Offset = Fixup.getOffset();
+ assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+ // Used to point to big endian bytes.
+ unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); // 0 selects the little-endian path below.
+
+ // For each byte of the fragment that the fixup touches, mask in the
+ // bits from the fixup value.
+ if (FulleSizeInBytes == 0) {
+ // Handle as little-endian
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); // Least significant byte goes to the lowest address.
+ }
+ } else {
+ // Handle as big-endian
+ assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!");
+ assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!");
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = FulleSizeInBytes - 1 - i; // Least significant byte goes to the highest address of the container.
+ Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ }
+ }
+}
+
+bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { // Inst is ignored.
+ return false; // This backend never asks for an instruction to be relaxed.
+}
+
+bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const { // Only Value is examined; Fixup, DF and Layout are unused.
+ // FIXME: This isn't correct for AArch64. Just moving the "generic" logic
+ // into the targets for now.
+ //
+ // Relax if the value is too big for a (signed) i8.
+ return int64_t(Value) != int64_t(int8_t(Value)); // True when truncating to int8_t and sign-extending back changes the value.
+}
+
+void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCInst &Res) const { // Not implemented; mayNeedRelaxation() in this class always returns false.
+ llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
+}
+
+bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { // Emit Count bytes of padding through OW; always returns true.
+ // If the count is not 4-byte aligned, we must be writing data into the text
+ // section (otherwise we have unaligned instructions, and thus have far
+ // bigger problems), so just write zeros instead.
+ OW->WriteZeros(Count % 4); // The 0-3 leftover bytes are written first, as zeros.
+
+ // We are properly aligned, so write NOPs as requested.
+ Count /= 4; // From here on Count is a number of 32-bit words.
+ for (uint64_t i = 0; i != Count; ++i)
+ OW->write32(0xd503201f); // The 32-bit word used as the NOP filler.
+ return true;
+}
+
+namespace {
+
+namespace CU {
+
+/// \brief Compact unwind encoding values.
+enum CompactUnwindEncodings { // Mode values plus one flag bit per callee-saved register pair.
+ /// \brief A "frameless" leaf function, where no non-volatile registers are
+ /// saved. The return remains in LR throughout the function.
+ UNWIND_ARM64_MODE_FRAMELESS = 0x02000000,
+
+ /// \brief No compact unwind encoding available. Instead the low 23-bits of
+ /// the compact unwind encoding is the offset of the DWARF FDE in the
+ /// __eh_frame section. This mode is never used in object files. It is only
+ /// generated by the linker in final linked images, which have only DWARF info
+ /// for a function.
+ UNWIND_ARM64_MODE_DWARF = 0x03000000,
+
+ /// \brief This is a standard arm64 prologue where FP/LR are immediately
+ /// pushed on the stack, then SP is copied to FP. If there are any
+ /// non-volatile register saved, they are copied into the stack frame in pairs
+ /// in a contiguous range right below the saved FP/LR pair. Any subset of the
+ /// five X pairs and four D pairs can be saved, but the memory layout must be
+ /// in register number order.
+ UNWIND_ARM64_MODE_FRAME = 0x04000000,
+
+ /// \brief Frame register pair encodings.
+ UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001, // X pairs occupy bits 0-4.
+ UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
+ UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
+ UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008,
+ UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010,
+ UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100, // D pairs occupy bits 8-11.
+ UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200,
+ UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400,
+ UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800
+};
+
+} // end CU namespace
+
+// FIXME: This should be in a separate file.
+class DarwinAArch64AsmBackend : public AArch64AsmBackend { // Mach-O flavour of the backend; adds compact-unwind encoding generation.
+ const MCRegisterInfo &MRI; // Used to map DWARF register numbers in CFI directives to LLVM registers.
+
+ /// \brief Encode compact unwind stack adjustment for frameless functions.
+ /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
+ /// The stack size always needs to be 16 byte aligned.
+ uint32_t encodeStackAdjustment(uint32_t StackSize) const {
+ return (StackSize / 16) << 12; // Size in 16-byte units, stored starting at bit 12.
+ }
+
+public:
+ DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
+ : AArch64AsmBackend(T, /*IsLittleEndian*/true), MRI(MRI) {} // This backend is always constructed little-endian.
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { // Returns a Mach-O object writer for OS.
+ return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
+ MachO::CPU_SUBTYPE_ARM64_ALL);
+ }
+
+ /// \brief Generate the compact unwind encoding from the CFI directives.
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override { // Returns UNWIND_ARM64_MODE_DWARF whenever the directives cannot be represented.
+ if (Instrs.empty())
+ return CU::UNWIND_ARM64_MODE_FRAMELESS; // No CFI at all: frameless with a zero stack size.
+
+ bool HasFP = false; // Set once a frame-pointer definition (OpDefCfa) has been seen.
+ unsigned StackSize = 0; // Taken from OpDefCfaOffset; only used when there is no frame pointer.
+
+ uint32_t CompactUnwindEncoding = 0;
+ for (size_t i = 0, e = Instrs.size(); i != e; ++i) { // Note: i is also advanced inside some cases to consume paired directives.
+ const MCCFIInstruction &Inst = Instrs[i];
+
+ switch (Inst.getOperation()) {
+ default:
+ // Cannot handle this directive: bail out.
+ return CU::UNWIND_ARM64_MODE_DWARF;
+ case MCCFIInstruction::OpDefCfa: {
+ // Defines a frame pointer.
+ assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
+ AArch64::FP &&
+ "Invalid frame pointer!");
+ assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
+
+ const MCCFIInstruction &LRPush = Instrs[++i]; // The next two directives are expected to be the LR and FP saves; this is checked by assert only.
+ assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
+ "Link register not pushed!");
+ const MCCFIInstruction &FPPush = Instrs[++i];
+ assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
+ "Frame pointer not pushed!");
+
+ unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
+ unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
+
+ LRReg = getXRegFromWReg(LRReg);
+ FPReg = getXRegFromWReg(FPReg);
+
+ assert(LRReg == AArch64::LR && FPReg == AArch64::FP &&
+ "Pushing invalid registers for frame!");
+
+ // Indicate that the function has a frame.
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME;
+ HasFP = true;
+ break;
+ }
+ case MCCFIInstruction::OpDefCfaOffset: {
+ assert(StackSize == 0 && "We already have the CFA offset!");
+ StackSize = std::abs(Inst.getOffset()); // The magnitude is recorded, whatever the sign of the offset.
+ break;
+ }
+ case MCCFIInstruction::OpOffset: {
+ // Registers are saved in pairs. We expect there to be two consecutive
+ // `.cfi_offset' instructions with the appropriate registers specified.
+ unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ if (i + 1 == e)
+ return CU::UNWIND_ARM64_MODE_DWARF; // A lone, unpaired save cannot be encoded.
+
+ const MCCFIInstruction &Inst2 = Instrs[++i];
+ if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
+ return CU::UNWIND_ARM64_MODE_DWARF;
+ unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
+
+ // N.B. The encodings must be in register number order, and the X
+ // registers before the D registers.
+
+ // X19/X20 pair = 0x00000001,
+ // X21/X22 pair = 0x00000002,
+ // X23/X24 pair = 0x00000004,
+ // X25/X26 pair = 0x00000008,
+ // X27/X28 pair = 0x00000010
+ Reg1 = getXRegFromWReg(Reg1);
+ Reg2 = getXRegFromWReg(Reg2);
+
+ if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 &&
+ (CompactUnwindEncoding & 0xF1E) == 0) // Each mask covers every later pair's bit: a pair is accepted only if no later pair was already recorded.
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR;
+ else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 &&
+ (CompactUnwindEncoding & 0xF1C) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR;
+ else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 &&
+ (CompactUnwindEncoding & 0xF18) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR;
+ else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 &&
+ (CompactUnwindEncoding & 0xF10) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR;
+ else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 &&
+ (CompactUnwindEncoding & 0xF00) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR;
+ else {
+ Reg1 = getDRegFromBReg(Reg1); // Not an accepted X pair: retry as a D-register pair.
+ Reg2 = getDRegFromBReg(Reg2);
+
+ // D8/D9 pair = 0x00000100,
+ // D10/D11 pair = 0x00000200,
+ // D12/D13 pair = 0x00000400,
+ // D14/D15 pair = 0x00000800
+ if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 &&
+ (CompactUnwindEncoding & 0xE00) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR;
+ else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 &&
+ (CompactUnwindEncoding & 0xC00) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR;
+ else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 &&
+ (CompactUnwindEncoding & 0x800) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR;
+ else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15) // Last pair in the order, so no mask check is needed.
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR;
+ else
+ // A pair was pushed which we cannot handle.
+ return CU::UNWIND_ARM64_MODE_DWARF;
+ }
+
+ break;
+ }
+ }
+ }
+
+ if (!HasFP) {
+ // With compact unwind info we can only represent stack adjustments of up
+ // to 65520 bytes.
+ if (StackSize > 65520)
+ return CU::UNWIND_ARM64_MODE_DWARF;
+
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS;
+ CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
+ }
+
+ return CompactUnwindEncoding;
+ }
+
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override { // Diagnostics only: neither Value nor IsResolved is modified here.
+ // Try to get the encoded value for the fixup as-if we're mapping it into
+ // the instruction. This allows adjustFixupValue() to issue a diagnostic
+ // if the value is invalid.
+ if (IsResolved)
+ (void)adjustFixupValue(Fixup, Value, &Asm.getContext()); // Result discarded; the call is made for its error reporting.
+ }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+class ELFAArch64AsmBackend : public AArch64AsmBackend { // ELF flavour of the backend; usable for either byte order.
+public:
+ uint8_t OSABI; // Forwarded to the ELF object writer.
+ bool IsILP32; // Forwarded to the ELF object writer.
+
+ ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian,
+ bool IsILP32)
+ : AArch64AsmBackend(T, IsLittleEndian), OSABI(OSABI), IsILP32(IsILP32) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { // Returns an ELF object writer for OS.
+ return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32);
+ }
+
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override; // Defined out of line, directly after this class.
+};
+
+void ELFAArch64AsmBackend::processFixupValue(
+ const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup,
+ const MCFragment *DF, const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) { // May clear IsResolved; Value is read but never modified.
+ // The ADRP instruction adds some multiple of 0x1000 to the current PC &
+ // ~0xfff. This means that the required offset to reach a symbol can vary by
+ // up to one step depending on where the ADRP is in memory. For example:
+ //
+ // ADRP x0, there
+ // there:
+ //
+ // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and
+ // we'll need that as an offset. At any other address "there" will be in the
+ // same page as the ADRP and the instruction should encode 0x0. Assuming the
+ // section isn't 0x1000-aligned, we therefore need to delegate this decision
+ // to the linker -- a relocation!
+ if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+ IsResolved = false; // ADRP fixups are always marked unresolved here.
+
+ // Try to get the encoded value for the fixup as-if we're mapping it into
+ // the instruction. This allows adjustFixupValue() to issue a diagnostic
+ // if the value is invalid.
+ if (IsResolved)
+ (void)adjustFixupValue(Fixup, Value, &Asm.getContext()); // Result discarded; the call is made for its error reporting.
+}
+
+}
+
+MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TheTriple,
+ StringRef CPU,
+ const MCTargetOptions &Options) { // Little-endian factory: Mach-O or ELF backend by triple; CPU is unused. Returns a new'd object.
+ if (TheTriple.isOSBinFormatMachO())
+ return new DarwinAArch64AsmBackend(T, MRI);
+
+ assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ bool IsILP32 = Options.getABIName() == "ilp32"; // ILP32 is selected purely by the ABI name option.
+ return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true, IsILP32);
+}
+
+MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TheTriple,
+ StringRef CPU,
+ const MCTargetOptions &Options) { // Big-endian factory: always an ELF backend; MRI and CPU are unused. Returns a new'd object.
+ assert(TheTriple.isOSBinFormatELF() &&
+ "Big endian is only supported for ELF targets!");
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ bool IsILP32 = Options.getABIName() == "ilp32"; // ILP32 is selected purely by the ABI name option.
+ return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/false, IsILP32);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
new file mode 100644
index 000000000000..a1edb3cef46a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -0,0 +1,346 @@
+//===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file handles ELF-specific object emission, converting LLVM's internal
+// fixups into the appropriate relocations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+/// ELF target writer for AArch64: translates MC fixups into ELF relocation
+/// types, choosing between the LP64 and ILP32 relocation families.
+class AArch64ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, bool IsILP32);
+
+ ~AArch64ELFObjectWriter() override;
+
+protected:
+ // Returns the ELF relocation type implementing Fixup for the given Target,
+ // or ELF::R_AARCH64_NONE after reporting an error through Ctx.
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+ // True when relocations must come from the R_AARCH64_P32_* (ILP32) family;
+ // read by the R_CLS macro inside getRelocType.
+ bool IsILP32;
+private:
+};
+}
+
+// Configures the base writer as a 64-bit EM_AARCH64 target that uses
+// relocations with addends, and records the ILP32 choice.
+// IsLittleEndian is accepted but not used by this constructor.
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
+ bool IsLittleEndian,
+ bool IsILP32)
+ : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
+ /*HasRelocationAddend*/ true),
+ IsILP32(IsILP32) {}
+
+// Nothing to release; defined out of line in this translation unit.
+AArch64ELFObjectWriter::~AArch64ELFObjectWriter() = default;
+
+// R_CLS(rtype): selects the ILP32 (R_AARCH64_P32_<rtype>) or LP64
+// (R_AARCH64_<rtype>) relocation depending on the IsILP32 flag that is in
+// scope where the macro is used.
+// NOTE(review): the expansion is an unparenthesised conditional expression,
+// so it is only safe as a complete expression (e.g. directly after `return`).
+#define R_CLS(rtype) \
+ IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype
+// BAD_ILP32_MOV(lp64rtype): diagnostic string literal naming the LP64
+// relocation that has no ILP32 equivalent.
+#define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\
+ "supported (LP64 eqv: " #lp64rtype ")"
+
+/// Diagnose MOVW-class relocations that have no ILP32 counterpart.
+///
+/// Callers must only invoke this when targeting ILP32 (IsILP32 is true).
+///
+/// \param Fixup   the fixup being lowered; only fixup_aarch64_movw is checked.
+/// \param RefKind the expression-level modifier attached to the fixup target.
+/// \param Ctx     context used to report the diagnostic at the fixup location.
+/// \returns true if an error was reported and no relocation may be emitted,
+///          false if the fixup can be mapped to a relocation as usual.
+static bool isNonILP32reloc(const MCFixup &Fixup,
+                            AArch64MCExpr::VariantKind RefKind,
+                            MCContext &Ctx) {
+  if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw)
+    return false;
+
+  // Pick the diagnostic first and report it in exactly one place, so every
+  // rejected kind returns true.  Returning ELF::R_AARCH64_NONE from a bool
+  // function (as most cases used to) converts the zero-valued constant to
+  // false, which let the caller go on to select an LP64-only relocation after
+  // the error had already been reported.
+  const char *Msg = nullptr;
+  switch (RefKind) {
+  case AArch64MCExpr::VK_ABS_G3:
+    Msg = BAD_ILP32_MOV(MOVW_UABS_G3);
+    break;
+  case AArch64MCExpr::VK_ABS_G2:
+    Msg = BAD_ILP32_MOV(MOVW_UABS_G2);
+    break;
+  case AArch64MCExpr::VK_ABS_G2_S:
+    Msg = BAD_ILP32_MOV(MOVW_SABS_G2);
+    break;
+  case AArch64MCExpr::VK_ABS_G2_NC:
+    Msg = BAD_ILP32_MOV(MOVW_UABS_G2_NC);
+    break;
+  case AArch64MCExpr::VK_ABS_G1_S:
+    Msg = BAD_ILP32_MOV(MOVW_SABS_G1);
+    break;
+  case AArch64MCExpr::VK_ABS_G1_NC:
+    Msg = BAD_ILP32_MOV(MOVW_UABS_G1_NC);
+    break;
+  case AArch64MCExpr::VK_DTPREL_G2:
+    Msg = BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2);
+    break;
+  case AArch64MCExpr::VK_DTPREL_G1_NC:
+    Msg = BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC);
+    break;
+  case AArch64MCExpr::VK_TPREL_G2:
+    Msg = BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2);
+    break;
+  case AArch64MCExpr::VK_TPREL_G1_NC:
+    Msg = BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC);
+    break;
+  case AArch64MCExpr::VK_GOTTPREL_G1:
+    Msg = BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1);
+    break;
+  case AArch64MCExpr::VK_GOTTPREL_G0_NC:
+    Msg = BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC);
+    break;
+  default:
+    // Every other modifier has an ILP32 relocation (or is diagnosed later).
+    return false;
+  }
+
+  Ctx.reportError(Fixup.getLoc(), Msg);
+  return true;
+}
+
+/// Map an MC fixup to the ELF relocation type that implements it.
+///
+/// The fixup kind selects the instruction or data field being patched; the
+/// expression-level modifier carried by Target (RefKind, and the SymLoc / IsNC
+/// values derived from it) selects the relocation flavour; IsILP32 selects
+/// between the R_AARCH64_P32_* and R_AARCH64_* families through R_CLS.
+/// Combinations with no matching relocation are reported through Ctx and
+/// yield ELF::R_AARCH64_NONE.
+unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ AArch64MCExpr::VariantKind RefKind =
+ static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
+ AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
+ bool IsNC = AArch64MCExpr::isNotChecked(RefKind);
+
+ // Modifiers are expected on the target expression (RefKind) only, never on
+ // the individual symbol references.
+ assert((!Target.getSymA() ||
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
+ "Should only be expression-level modifiers here");
+
+ assert((!Target.getSymB() ||
+ Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
+ "Should only be expression-level modifiers here");
+
+ if (IsPCRel) {
+ // PC-relative fixups: data words, ADR/ADRP, branches and literal loads.
+ switch ((unsigned)Fixup.getKind()) {
+ case FK_Data_1:
+ Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
+ return ELF::R_AARCH64_NONE;
+ case FK_Data_2:
+ return R_CLS(PREL16);
+ case FK_Data_4:
+ return R_CLS(PREL32);
+ case FK_Data_8:
+ if (IsILP32) {
+ Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte PC relative data "
+ "relocation not supported (LP64 eqv: PREL64)");
+ return ELF::R_AARCH64_NONE;
+ } else
+ return ELF::R_AARCH64_PREL64;
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation");
+ return R_CLS(ADR_PREL_LO21);
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ // ADRP accepts only the checked (non-NC) form of each symbol location.
+ if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC)
+ return R_CLS(ADR_PREL_PG_HI21);
+ if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC)
+ return R_CLS(ADR_GOT_PAGE);
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
+ return R_CLS(TLSIE_ADR_GOTTPREL_PAGE21);
+ if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
+ return R_CLS(TLSDESC_ADR_PAGE21);
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid symbol kind for ADRP relocation");
+ return ELF::R_AARCH64_NONE;
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ return R_CLS(JUMP26);
+ case AArch64::fixup_aarch64_pcrel_call26:
+ return R_CLS(CALL26);
+ case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL)
+ return R_CLS(TLSIE_LD_GOTTPREL_PREL19);
+ return R_CLS(LD_PREL_LO19);
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ return R_CLS(TSTBR14);
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ return R_CLS(CONDBR19);
+ default:
+ Ctx.reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind");
+ return ELF::R_AARCH64_NONE;
+ }
+ } else {
+ // Absolute fixups. For ILP32, first reject MOVW modifiers that
+ // isNonILP32reloc diagnoses as having no ILP32 relocation.
+ if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
+ return ELF::R_AARCH64_NONE;
+ switch ((unsigned)Fixup.getKind()) {
+ case FK_Data_1:
+ Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
+ return ELF::R_AARCH64_NONE;
+ case FK_Data_2:
+ return R_CLS(ABS16);
+ case FK_Data_4:
+ return R_CLS(ABS32);
+ case FK_Data_8:
+ // NOTE(review): this reuses the MOV diagnostic text for a data
+ // relocation, so the message mentions "MOV" although no MOV is involved.
+ if (IsILP32) {
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(ABS64));
+ return ELF::R_AARCH64_NONE;
+ } else
+ return ELF::R_AARCH64_ABS64;
+ case AArch64::fixup_aarch64_add_imm12:
+ if (RefKind == AArch64MCExpr::VK_DTPREL_HI12)
+ return R_CLS(TLSLD_ADD_DTPREL_HI12);
+ if (RefKind == AArch64MCExpr::VK_TPREL_HI12)
+ return R_CLS(TLSLE_ADD_TPREL_HI12);
+ if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC)
+ return R_CLS(TLSLD_ADD_DTPREL_LO12_NC);
+ if (RefKind == AArch64MCExpr::VK_DTPREL_LO12)
+ return R_CLS(TLSLD_ADD_DTPREL_LO12);
+ if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC)
+ return R_CLS(TLSLE_ADD_TPREL_LO12_NC);
+ if (RefKind == AArch64MCExpr::VK_TPREL_LO12)
+ return R_CLS(TLSLE_ADD_TPREL_LO12);
+ if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12)
+ return R_CLS(TLSDESC_ADD_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return R_CLS(ADD_ABS_LO12_NC);
+
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for add (uimm12) instruction");
+ return ELF::R_AARCH64_NONE;
+ // The ldst_imm12_scaleN cases below share one pattern: the scale picks
+ // the LDST<bits> relocation and SymLoc/IsNC pick its ABS/DTPREL/TPREL
+ // flavour.
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return R_CLS(LDST8_ABS_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return R_CLS(TLSLD_LDST8_DTPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return R_CLS(TLSLD_LDST8_DTPREL_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return R_CLS(TLSLE_LDST8_TPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return R_CLS(TLSLE_LDST8_TPREL_LO12_NC);
+
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 8-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return R_CLS(LDST16_ABS_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return R_CLS(TLSLD_LDST16_DTPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return R_CLS(TLSLD_LDST16_DTPREL_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return R_CLS(TLSLE_LDST16_TPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return R_CLS(TLSLE_LDST16_TPREL_LO12_NC);
+
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 16-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return R_CLS(LDST32_ABS_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return R_CLS(TLSLD_LDST32_DTPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return R_CLS(TLSLD_LDST32_DTPREL_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return R_CLS(TLSLE_LDST32_TPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return R_CLS(TLSLE_LDST32_TPREL_LO12_NC);
+
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 32-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return R_CLS(LDST64_ABS_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_GOT && IsNC)
+ return R_CLS(LD64_GOT_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return R_CLS(TLSLD_LDST64_DTPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return R_CLS(TLSLD_LDST64_DTPREL_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return R_CLS(TLSLE_LDST64_TPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return R_CLS(TLSLE_LDST64_TPREL_LO12_NC);
+ // These two cannot use R_CLS: the ILP32 names say LD32 where the LP64
+ // names say LD64.
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC)
+ return IsILP32 ? ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC
+ : ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC)
+ return IsILP32 ? ELF::R_AARCH64_P32_TLSDESC_LD32_LO12_NC
+ : ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
+
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 64-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return R_CLS(LDST128_ABS_LO12_NC);
+
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 128-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
+ // ILP32 case not reached here, tested with isNonILP32reloc
+ // (applies to the modifiers below that return a plain ELF::R_AARCH64_*
+ // value instead of going through R_CLS).
+ case AArch64::fixup_aarch64_movw:
+ if (RefKind == AArch64MCExpr::VK_ABS_G3)
+ return ELF::R_AARCH64_MOVW_UABS_G3;
+ if (RefKind == AArch64MCExpr::VK_ABS_G2)
+ return ELF::R_AARCH64_MOVW_UABS_G2;
+ if (RefKind == AArch64MCExpr::VK_ABS_G2_S)
+ return ELF::R_AARCH64_MOVW_SABS_G2;
+ if (RefKind == AArch64MCExpr::VK_ABS_G2_NC)
+ return ELF::R_AARCH64_MOVW_UABS_G2_NC;
+ if (RefKind == AArch64MCExpr::VK_ABS_G1)
+ return R_CLS(MOVW_UABS_G1);
+ if (RefKind == AArch64MCExpr::VK_ABS_G1_S)
+ return ELF::R_AARCH64_MOVW_SABS_G1;
+ if (RefKind == AArch64MCExpr::VK_ABS_G1_NC)
+ return ELF::R_AARCH64_MOVW_UABS_G1_NC;
+ if (RefKind == AArch64MCExpr::VK_ABS_G0)
+ return R_CLS(MOVW_UABS_G0);
+ if (RefKind == AArch64MCExpr::VK_ABS_G0_S)
+ return R_CLS(MOVW_SABS_G0);
+ if (RefKind == AArch64MCExpr::VK_ABS_G0_NC)
+ return R_CLS(MOVW_UABS_G0_NC);
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G2)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G1)
+ return R_CLS(TLSLD_MOVW_DTPREL_G1);
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G0)
+ return R_CLS(TLSLD_MOVW_DTPREL_G0);
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC)
+ return R_CLS(TLSLD_MOVW_DTPREL_G0_NC);
+ if (RefKind == AArch64MCExpr::VK_TPREL_G2)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
+ if (RefKind == AArch64MCExpr::VK_TPREL_G1)
+ return R_CLS(TLSLE_MOVW_TPREL_G1);
+ if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
+ if (RefKind == AArch64MCExpr::VK_TPREL_G0)
+ return R_CLS(TLSLE_MOVW_TPREL_G0);
+ if (RefKind == AArch64MCExpr::VK_TPREL_G0_NC)
+ return R_CLS(TLSLE_MOVW_TPREL_G0_NC);
+ if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1)
+ return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
+ if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC)
+ return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for movz/movk instruction");
+ return ELF::R_AARCH64_NONE;
+ case AArch64::fixup_aarch64_tlsdesc_call:
+ return R_CLS(TLSDESC_CALL);
+ default:
+ Ctx.reportError(Fixup.getLoc(), "Unknown ELF relocation type");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
+
+ // Both branches above return on every path.
+ llvm_unreachable("Unimplemented fixup -> relocation");
+}
+
+/// Create an ELF object writer for AArch64 that emits to \p OS.
+///
+/// A freshly allocated AArch64ELFObjectWriter supplies the target-specific
+/// relocation mapping and is handed to the generic ELF object writer.
+MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_pwrite_stream &OS,
+                                                   uint8_t OSABI,
+                                                   bool IsLittleEndian,
+                                                   bool IsILP32) {
+  return createELFObjectWriter(
+      new AArch64ELFObjectWriter(OSABI, IsLittleEndian, IsILP32), OS,
+      IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
new file mode 100644
index 000000000000..685907a2178e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -0,0 +1,217 @@
+//===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits AArch64 ELF .o object files. Different
+// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit
+// regions of data and code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetStreamer.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class AArch64ELFStreamer;
+
+/// Target streamer used for textual assembly output: renders target-specific
+/// directives to the formatted output stream it was constructed with.
+class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
+ // Destination for the printed directives.
+ formatted_raw_ostream &OS;
+
+ // Prints Inst as a ".inst" directive.
+ void emitInst(uint32_t Inst) override;
+
+public:
+ AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+};
+
+// Binds the target streamer to its owning MCStreamer and keeps a reference to
+// the output stream; OS must outlive this object.
+AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : AArch64TargetStreamer(S), OS(OS) {}
+
+// Writes a tab-indented ".inst" directive with Inst printed in hexadecimal
+// after a "0x" prefix, followed by a newline.
+void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) {
+ OS << "\t.inst\t0x" << Twine::utohexstr(Inst) << "\n";
+}
+
+/// Target streamer used for ELF object output: forwards target-specific
+/// directives to the AArch64ELFStreamer that owns it.
+class AArch64TargetELFStreamer : public AArch64TargetStreamer {
+private:
+ // Returns the owning streamer viewed as an AArch64ELFStreamer.
+ AArch64ELFStreamer &getStreamer();
+
+ // Emits Inst into the object file through the ELF streamer.
+ void emitInst(uint32_t Inst) override;
+
+public:
+ AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
+};
+
+/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
+/// the appropriate points in the object files. These symbols are defined in the
+/// AArch64 ELF ABI:
+/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf
+///
+/// In brief: $x or $d should be emitted at the start of each contiguous region
+/// of A64 code or data in a section. In practice, this emission does not rely
+/// on explicit assembler directives but on inherent properties of the
+/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an
+/// instruction).
+///
+/// As a result this system is orthogonal to the DataRegion infrastructure used
+/// by MachO. Beware!
+class AArch64ELFStreamer : public MCELFStreamer {
+public:
+ friend class AArch64TargetELFStreamer;
+
+ // Starts with no mapping symbol emitted and the symbol counter at zero.
+ AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *Emitter)
+ : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
+ LastEMS(EMS_None) {}
+
+ // Saves the mapping-symbol state of the section being left and restores
+ // the state recorded for the section being entered.
+ void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
+ // We have to keep track of the mapping symbol state of any sections we
+ // use. Each one should start off as EMS_None, which is provided as the
+ // default constructor by DenseMap::lookup.
+ LastMappingSymbols[getPreviousSection().first] = LastEMS;
+ LastEMS = LastMappingSymbols.lookup(Section);
+
+ MCELFStreamer::ChangeSection(Section, Subsection);
+ }
+
+ /// This function is the one used to emit instruction data into the ELF
+ /// streamer. We override it to add the appropriate mapping symbol if
+ /// necessary.
+ void EmitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) override {
+ EmitA64MappingSymbol();
+ MCELFStreamer::EmitInstruction(Inst, STI);
+ }
+
+ /// Emit a 32-bit value as an instruction. This is only used for the .inst
+ /// directive, EmitInstruction should be used in other cases.
+ void emitInst(uint32_t Inst) {
+ char Buffer[4];
+
+ // We can't just use EmitIntValue here, as that will emit a data mapping
+ // symbol, and swap the endianness on big-endian systems (instructions are
+ // always little-endian).
+ // Serialise the word least-significant byte first.
+ for (unsigned I = 0; I < 4; ++I) {
+ Buffer[I] = uint8_t(Inst);
+ Inst >>= 8;
+ }
+
+ // Call the base-class EmitBytes directly: the override in this class
+ // would emit a $d (data) mapping symbol.
+ EmitA64MappingSymbol();
+ MCELFStreamer::EmitBytes(StringRef(Buffer, 4));
+ }
+
+ /// This is one of the functions used to emit data into an ELF section, so the
+ /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
+ /// if necessary.
+ void EmitBytes(StringRef Data) override {
+ EmitDataMappingSymbol();
+ MCELFStreamer::EmitBytes(Data);
+ }
+
+ /// This is one of the functions used to emit data into an ELF section, so the
+ /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
+ /// if necessary.
+ void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+ EmitDataMappingSymbol();
+ MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+ }
+
+private:
+ // Kind of the most recently emitted mapping symbol in a section.
+ enum ElfMappingSymbol {
+ EMS_None, // nothing emitted yet
+ EMS_A64, // "$x": A64 code
+ EMS_Data // "$d": data
+ };
+
+ // Emits "$d" unless the current region is already marked as data.
+ void EmitDataMappingSymbol() {
+ if (LastEMS == EMS_Data)
+ return;
+ EmitMappingSymbol("$d");
+ LastEMS = EMS_Data;
+ }
+
+ // Emits "$x" unless the current region is already marked as A64 code.
+ void EmitA64MappingSymbol() {
+ if (LastEMS == EMS_A64)
+ return;
+ EmitMappingSymbol("$x");
+ LastEMS = EMS_A64;
+ }
+
+ // Defines a label named "<Name>.<counter>" at the current position and
+ // marks it as a local, untyped, non-external symbol. The counter is bumped
+ // on every call so each mapping symbol gets a distinct name.
+ void EmitMappingSymbol(StringRef Name) {
+ auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+ Name + "." + Twine(MappingSymbolCounter++)));
+ EmitLabel(Symbol);
+ Symbol->setType(ELF::STT_NOTYPE);
+ Symbol->setBinding(ELF::STB_LOCAL);
+ Symbol->setExternal(false);
+ }
+
+ // Suffix for the next mapping symbol name.
+ int64_t MappingSymbolCounter;
+
+ // Mapping-symbol state saved per section by ChangeSection.
+ DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
+ // Mapping-symbol state of the section currently being emitted into.
+ ElfMappingSymbol LastEMS;
+};
+} // end anonymous namespace
+
+// Returns the owning streamer as an AArch64ELFStreamer.
+// NOTE(review): the static_cast is unchecked; this assumes the target
+// streamer is only ever attached to an AArch64ELFStreamer -- confirm at the
+// creation sites.
+AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
+ return static_cast<AArch64ELFStreamer &>(Streamer);
+}
+
+// Forwards the raw 32-bit instruction word to the ELF streamer's emitInst.
+void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
+ getStreamer().emitInst(Inst);
+}
+
+namespace llvm {
+// Creates the target streamer used for textual assembly output.
+// InstPrint and isVerboseAsm are accepted but not used here.
+MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new AArch64TargetAsmStreamer(S, OS);
+}
+
+// Allocates an AArch64ELFStreamer and, when RelaxAll is requested, switches
+// its assembler into relax-all mode before handing the streamer back.
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                        raw_pwrite_stream &OS,
+                                        MCCodeEmitter *Emitter, bool RelaxAll) {
+  auto *Streamer = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
+  if (!RelaxAll)
+    return Streamer;
+
+  Streamer->getAssembler().setRelaxAll(true);
+  return Streamer;
+}
+
+// Creates the target streamer used for object-file output. Only ELF triples
+// get one; any other object format yields nullptr.
+MCTargetStreamer *
+createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  const bool IsELF = STI.getTargetTriple().isOSBinFormatELF();
+  if (!IsELF)
+    return nullptr;
+  return new AArch64TargetELFStreamer(S);
+}
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
new file mode 100644
index 000000000000..ef48203c8bc0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -0,0 +1,26 @@
+//===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF streamer information for the AArch64 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+/// Creates the AArch64-specific ELF streamer, which emits the $x / $d mapping
+/// symbols. When RelaxAll is true the streamer's assembler is put in relax-all
+/// mode. The returned object is heap-allocated.
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
new file mode 100644
index 000000000000..0f5b765c7697
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -0,0 +1,76 @@
+//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace AArch64 {
+
+// Target-specific fixup kinds for AArch64, numbered from FirstTargetFixupKind.
+enum Fixups {
+ // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
+ // an ADR instruction.
+ fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind,
+
+ // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
+ // an ADRP instruction.
+ fixup_aarch64_pcrel_adrp_imm21,
+
+ // fixup_aarch64_add_imm12 - 12-bit fixup for add/sub instructions.
+ // No alignment adjustment. All value bits are encoded.
+ fixup_aarch64_add_imm12,
+
+ // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and
+ // store instructions.
+ fixup_aarch64_ldst_imm12_scale1,
+ fixup_aarch64_ldst_imm12_scale2,
+ fixup_aarch64_ldst_imm12_scale4,
+ fixup_aarch64_ldst_imm12_scale8,
+ fixup_aarch64_ldst_imm12_scale16,
+
+ // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
+ // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
+ // pc-relative loads and generates relocations directly when necessary.
+ // NOTE(review): no fixup named fixup_aarch64_pcrel_adrhi exists in this enum;
+ // the reference looks stale.
+ fixup_aarch64_ldr_pcrel_imm19,
+
+ // fixup_aarch64_movw - Immediate of a movz/movk instruction. The ELF object
+ // writer maps it to a MOVW-class relocation chosen by the expression's
+ // modifier.
+ fixup_aarch64_movw,
+
+ // fixup_aarch64_pcrel_branch14 - The high 14 bits of a 21-bit pc-relative
+ // immediate.
+ fixup_aarch64_pcrel_branch14,
+
+ // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative
+ // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
+ // b.cc and generates relocations directly when necessary.
+ fixup_aarch64_pcrel_branch19,
+
+ // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
+ // immediate.
+ fixup_aarch64_pcrel_branch26,
+
+ // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
+ // immediate. Distinguished from branch26 only on ELF.
+ fixup_aarch64_pcrel_call26,
+
+ // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF
+ // R_AARCH64_TLSDESC_CALL relocation.
+ fixup_aarch64_tlsdesc_call,
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+
+} // end namespace AArch64
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
new file mode 100644
index 000000000000..8fc822329595
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -0,0 +1,100 @@
+//===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the AArch64MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+// NEON assembly syntax variants selectable with -aarch64-neon-syntax.
+// The non-negative values are used directly as the AssemblerDialect number.
+enum AsmWriterVariantTy {
+ Default = -1, // no explicit choice; each MCAsmInfo picks its own dialect
+ Generic = 0,
+ Apple = 1
+};
+
+// Command-line override for the NEON syntax; stays at Default unless the
+// user passes -aarch64-neon-syntax=generic|apple.
+static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
+ "aarch64-neon-syntax", cl::init(Default),
+ cl::desc("Choose style of NEON code to emit from AArch64 backend:"),
+ cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
+ clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
+
+// Sets the assembly-printing properties used for Darwin (MachO) targets.
+AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
+ // We prefer NEON instructions to be printed in the short form: default to
+ // dialect 1 (Apple) unless -aarch64-neon-syntax selects a variant.
+ AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+
+ PrivateGlobalPrefix = "L";
+ PrivateLabelPrefix = "L";
+ SeparatorString = "%%";
+ CommentString = ";";
+ PointerSize = CalleeSaveStackSlotSize = 8;
+
+ AlignmentIsInBytes = false;
+ UsesELFSectionDirectiveForBSS = true;
+ SupportsDebugInformation = true;
+ UseDataRegionDirectives = true;
+
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+// Builds the expression "Sym@GOT - <here>" for a personality symbol, emitting
+// a fresh temporary label at the current position to stand for <here>.
+// Encoding is not consulted.
+const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
+    const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const {
+  // Darwin lets dwarf symbols be referenced as foo@GOT-., an indirect
+  // pc-relative reference. The generic implementation does not go through
+  // the GOT, hence this target-specific override.
+  MCContext &Ctx = Streamer.getContext();
+  const MCExpr *GotRef =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
+
+  // Mark the current location so it can be subtracted from the GOT reference.
+  MCSymbol *Here = Ctx.createTempSymbol();
+  Streamer.EmitLabel(Here);
+  const MCExpr *HereRef = MCSymbolRefExpr::create(Here, Ctx);
+
+  return MCBinaryExpr::createSub(GotRef, HereRef, Ctx);
+}
+
+// Sets the assembly-printing properties used for ELF targets; T is consulted
+// only to detect the big-endian architecture variant.
+AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
+ if (T.getArch() == Triple::aarch64_be)
+ IsLittleEndian = false;
+
+ // Default to dialect 0 (Generic NEON syntax) unless -aarch64-neon-syntax
+ // selects a variant.
+ AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+
+ PointerSize = 8;
+
+ // ".comm align is in bytes but .align is pow-2."
+ AlignmentIsInBytes = false;
+
+ CommentString = "//";
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+ Code32Directive = ".code\t32";
+
+ Data16bitsDirective = "\t.hword\t";
+ Data32bitsDirective = "\t.word\t";
+ Data64bitsDirective = "\t.xword\t";
+
+ UseDataRegionDirectives = false;
+
+ WeakRefDirective = "\t.weak\t";
+
+ SupportsDebugInformation = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ UseIntegratedAssembler = true;
+
+ HasIdentDirective = true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
new file mode 100644
index 000000000000..253cd30f26ee
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -0,0 +1,38 @@
+//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AArch64MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class MCStreamer;
+class Target;
+class Triple;
+
+/// Assembly properties for AArch64 on Darwin (MachO) targets.
+struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ explicit AArch64MCAsmInfoDarwin();
+ /// Returns a GOT-based, pc-relative expression referring to the personality
+ /// symbol Sym; emits a temporary label into Streamer as a side effect.
+ const MCExpr *
+ getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+ MCStreamer &Streamer) const override;
+};
+
+/// Assembly properties for AArch64 on ELF targets; the triple selects
+/// little- or big-endian output.
+struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
+ explicit AArch64MCAsmInfoELF(const Triple &T);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
new file mode 100644
index 000000000000..f7058cdf2373
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -0,0 +1,603 @@
+//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumFixups, "Number of MC fixups created.");
+
+namespace {
+
+class AArch64MCCodeEmitter : public MCCodeEmitter {
+ MCContext &Ctx;
+ const MCInstrInfo &MCII;
+
+ AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) = delete;
+ void operator=(const AArch64MCCodeEmitter &) = delete;
+public:
+ AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : Ctx(ctx), MCII(mcii) {}
+
+ ~AArch64MCCodeEmitter() override = default;
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMachineOpValue - Return binary encoding of operand. If the machine
+ /// operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate
+ /// attached to a load, store or prfm instruction. If operand requires a
+ /// relocation, record it and return zero in that part of the encoding.
+ template <uint32_t FixupKind>
+ uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+ /// target.
+ uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+ /// the shift field (bit 12 of the result is set for a shift of LSL #12).
+ uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getCondBranchTargetOpValue - Return the encoded value for a conditional
+ /// branch target.
+ uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getLoadLiteralOpValue - Return the encoded value for a load-literal
+ /// pc-relative address.
+ uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store
+ /// instruction: bit 0 is whether a shift is present, bit 1 is whether the
+ /// operation is a sign extend (as opposed to a zero extend).
+ uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+ /// branch target.
+ uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getBranchTargetOpValue - Return the encoded value for an unconditional
+ /// branch target.
+ uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMoveWideImmOpValue - Return the encoded value for the immediate operand
+ /// of a MOVZ or MOVK instruction.
+ uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getVecShifterOpValue - Return the encoded value for the vector shifter.
+ uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMoveVecShifterOpValue - Return the encoded value for the vector move
+ /// shifter (MSL).
+ uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getFixedPointScaleOpValue - Return the encoded value for the
+ /// FP-to-fixed-point scale factor.
+ uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+ template<int hasRs, int hasRt2> unsigned
+ fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+private:
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new AArch64MCCodeEmitter(MCII, Ctx);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned
+AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg())
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+ assert(MO.isImm() && "did not expect relocated expression");
+ return static_cast<unsigned>(MO.getImm());
+}
+
+template <uint32_t FixupKind> uint32_t
+AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ uint32_t ImmVal = 0; // Stays zero when the value is supplied by a fixup.
+ // An immediate is encoded directly; an expression is recorded as a fixup.
+ if (MO.isImm())
+ ImmVal = static_cast<uint32_t>(MO.getImm());
+ else {
+ assert(MO.isExpr() && "unable to encode load/store imm operand");
+ MCFixupKind Kind = MCFixupKind(FixupKind);
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+ ++MCNumFixups;
+ }
+
+ return ImmVal;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+/// target.
+uint32_t
+AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected target type!");
+ const MCExpr *Expr = MO.getExpr();
+
+ MCFixupKind Kind = MI.getOpcode() == AArch64::ADR
+ ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21)
+ : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+ MCNumFixups += 1;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+/// the shift field. The immediate is ORed into the low bits of the return
+/// value; bit 12 is set when the immediate is shifted left by 12.
+uint32_t
+AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Suboperands are [imm, shifter].
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+ assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL &&
+ "unexpected shift type for add/sub immediate");
+ unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm());
+ assert((ShiftVal == 0 || ShiftVal == 12) &&
+ "unexpected shift value for add/sub immediate");
+ if (MO.isImm())
+ return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << ShiftVal));
+ assert(MO.isExpr() && "Unable to encode MCOperand!");
+ const MCExpr *Expr = MO.getExpr();
+
+ // Encode the 12 bits of the fixup.
+ MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // Set the shift bit of the add instruction for relocation types
+ // R_AARCH64_TLSLE_ADD_TPREL_HI12 and R_AARCH64_TLSLD_ADD_DTPREL_HI12.
+ if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+ AArch64MCExpr::VariantKind RefKind = A64E->getKind();
+ if (RefKind == AArch64MCExpr::VK_TPREL_HI12 ||
+ RefKind == AArch64MCExpr::VK_DTPREL_HI12)
+ ShiftVal = 12;
+ }
+ return ShiftVal == 0 ? 0 : (1 << ShiftVal);
+}
+
+/// getCondBranchTargetOpValue - Return the encoded value for a conditional
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected target type!");
+
+ MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19);
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getLoadLiteralOpValue - Return the encoded value for a load-literal
+/// pc-relative address.
+uint32_t
+AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected target type!");
+
+ MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19);
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned SignExtend = MI.getOperand(OpIdx).getImm();
+ unsigned DoShift = MI.getOperand(OpIdx + 1).getImm();
+ return (SignExtend << 1) | DoShift;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected movz/movk immediate");
+
+ Fixups.push_back(MCFixup::create(
+ 0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc()));
+
+ ++MCNumFixups;
+
+ return 0;
+}
+
+/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected target type!");
+
+ MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14);
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getBranchTargetOpValue - Return the encoded value for an unconditional
+/// branch target.
+uint32_t
+AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected target type!");
+
+ // BL gets the call fixup; every other opcode gets the plain branch fixup.
+ MCFixupKind Kind = MI.getOpcode() == AArch64::BL
+ ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26)
+ : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26);
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getVecShifterOpValue - Return the encoded value for the vector shifter:
+///
+/// 00 -> 0
+/// 01 -> 8
+/// 10 -> 16
+/// 11 -> 24
+uint32_t
+AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+
+ switch (MO.getImm()) {
+ default:
+ break;
+ case 0:
+ return 0;
+ case 8:
+ return 1;
+ case 16:
+ return 2;
+ case 24:
+ return 3;
+ }
+
+ llvm_unreachable("Invalid value for vector shift amount!");
+}
+
+/// getFixedPointScaleOpValue - Return the encoded value for the
+/// FP-to-fixed-point scale factor, computed as 64 minus the immediate operand.
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 64 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 64 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 32 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 16 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 8 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 64;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 32;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 16;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 8;
+}
+
+/// getMoveVecShifterOpValue - Return the encoded value for the vector move
+/// shifter (MSL).
+uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() &&
+ "Expected an immediate value for the move shift amount!");
+ unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm());
+ assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!");
+ return ShiftVal == 8 ? 0 : 1;
+}
+
+unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ // If one of the signed fixup kinds is applied to a MOVZ instruction, the
+ // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
+ // job to ensure that any bits possibly affected by this are 0. This means we
+ // must zero out bit 30 (essentially emitting a MOVN).
+ const MCOperand &UImm16MO = MI.getOperand(1);
+
+ // Nothing to do if there's no fixup.
+ if (UImm16MO.isImm())
+ return EncodedValue;
+
+ const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
+ switch (A64E->getKind()) {
+ case AArch64MCExpr::VK_DTPREL_G2:
+ case AArch64MCExpr::VK_DTPREL_G1:
+ case AArch64MCExpr::VK_DTPREL_G0:
+ case AArch64MCExpr::VK_GOTTPREL_G1:
+ case AArch64MCExpr::VK_TPREL_G2:
+ case AArch64MCExpr::VK_TPREL_G1:
+ case AArch64MCExpr::VK_TPREL_G0:
+ break;
+ default:
+ // Nothing to do for an unsigned fixup.
+ return EncodedValue;
+ }
+
+ // Signed fixup: clear bit 30 so the fixup can select MOVZ or MOVN later.
+ return EncodedValue & ~(1u << 30);
+}
+
+void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ verifyInstructionPredicates(MI,
+ computeAvailableFeatures(STI.getFeatureBits()));
+
+ if (MI.getOpcode() == AArch64::TLSDESCCALL) {
+ // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
+ // following (BLR) instruction. It doesn't emit any code itself so it
+ // doesn't go through the normal TableGenerated channels.
+ MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call);
+ Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup));
+ return;
+ }
+
+ uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+ support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
+ ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+unsigned
+AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI,
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ // The Ra field of SMULH and UMULH is unused: it should be assembled as 31
+ // (i.e. all bits 1) but is ignored by the processor.
+ EncodedValue |= 0x1f << 10;
+ return EncodedValue;
+}
+
+template<int hasRs, int hasRt2> unsigned
+AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ if (!hasRs) EncodedValue |= 0x001F0000;
+ if (!hasRt2) EncodedValue |= 0x00007C00;
+
+ return EncodedValue;
+}
+
+unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison(
+ const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const {
+ // The Rm field of FCMP and friends is unused - it should be assembled
+ // as 0, but is ignored by the processor.
+ EncodedValue &= ~(0x1f << 16);
+ return EncodedValue;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "AArch64GenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
new file mode 100644
index 000000000000..a540f49866a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -0,0 +1,145 @@
+//===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64symbolrefexpr"
+
+const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, VariantKind Kind,
+ MCContext &Ctx) {
+ return new (Ctx) AArch64MCExpr(Expr, Kind);
+}
+
+StringRef AArch64MCExpr::getVariantKindName() const {
+ switch (static_cast<uint32_t>(getKind())) {
+ case VK_CALL: return "";
+ case VK_LO12: return ":lo12:";
+ case VK_ABS_G3: return ":abs_g3:";
+ case VK_ABS_G2: return ":abs_g2:";
+ case VK_ABS_G2_S: return ":abs_g2_s:";
+ case VK_ABS_G2_NC: return ":abs_g2_nc:";
+ case VK_ABS_G1: return ":abs_g1:";
+ case VK_ABS_G1_S: return ":abs_g1_s:";
+ case VK_ABS_G1_NC: return ":abs_g1_nc:";
+ case VK_ABS_G0: return ":abs_g0:";
+ case VK_ABS_G0_S: return ":abs_g0_s:";
+ case VK_ABS_G0_NC: return ":abs_g0_nc:";
+ case VK_DTPREL_G2: return ":dtprel_g2:";
+ case VK_DTPREL_G1: return ":dtprel_g1:";
+ case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:";
+ case VK_DTPREL_G0: return ":dtprel_g0:";
+ case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:";
+ case VK_DTPREL_HI12: return ":dtprel_hi12:";
+ case VK_DTPREL_LO12: return ":dtprel_lo12:";
+ case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:";
+ case VK_TPREL_G2: return ":tprel_g2:";
+ case VK_TPREL_G1: return ":tprel_g1:";
+ case VK_TPREL_G1_NC: return ":tprel_g1_nc:";
+ case VK_TPREL_G0: return ":tprel_g0:";
+ case VK_TPREL_G0_NC: return ":tprel_g0_nc:";
+ case VK_TPREL_HI12: return ":tprel_hi12:";
+ case VK_TPREL_LO12: return ":tprel_lo12:";
+ case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:";
+ case VK_TLSDESC_LO12: return ":tlsdesc_lo12:";
+ case VK_ABS_PAGE: return "";
+ case VK_GOT_PAGE: return ":got:";
+ case VK_GOT_LO12: return ":got_lo12:";
+ case VK_GOTTPREL_PAGE: return ":gottprel:";
+ case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:";
+ case VK_GOTTPREL_G1: return ":gottprel_g1:";
+ case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:";
+ case VK_TLSDESC: return "";
+ case VK_TLSDESC_PAGE: return ":tlsdesc:";
+ default:
+ llvm_unreachable("Invalid ELF symbol kind");
+ }
+}
+
+void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+ if (getKind() != VK_NONE)
+ OS << getVariantKindName();
+ Expr->print(OS, MAI);
+}
+
+void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
+}
+
+MCFragment *AArch64MCExpr::findAssociatedFragment() const {
+ llvm_unreachable("FIXME: what goes here?");
+}
+
+bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+ return false;
+
+ Res =
+ MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+ return true;
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expression");
+ break;
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef: {
+ // We're known to be under a TLS fixup, so any symbol should be
+ // modified. There should be only one.
+ const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+ cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
+ break;
+ }
+
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ switch (getSymbolLoc(Kind)) {
+ default:
+ return;
+ case VK_DTPREL:
+ case VK_GOTTPREL:
+ case VK_TPREL:
+ case VK_TLSDESC:
+ break;
+ }
+
+ fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
new file mode 100644
index 000000000000..db36a65564ce
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -0,0 +1,167 @@
+//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes AArch64-specific MCExprs, used for modifiers like
+// ":lo12:" or ":gottprel_g1:".
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class AArch64MCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_NONE = 0x000,
+
+ // Symbol locations specifying (roughly speaking) what calculation should be
+ // performed to construct the final address for the relocated
+ // symbol. E.g. direct, via the GOT, ...
+ VK_ABS = 0x001,
+ VK_SABS = 0x002,
+ VK_GOT = 0x003,
+ VK_DTPREL = 0x004,
+ VK_GOTTPREL = 0x005,
+ VK_TPREL = 0x006,
+ VK_TLSDESC = 0x007,
+ VK_SymLocBits = 0x00f,
+
+ // Variants specifying which part of the final address calculation is
+ // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a
+ // MOVZ/MOVK.
+ VK_PAGE = 0x010,
+ VK_PAGEOFF = 0x020,
+ VK_HI12 = 0x030,
+ VK_G0 = 0x040,
+ VK_G1 = 0x050,
+ VK_G2 = 0x060,
+ VK_G3 = 0x070,
+ VK_AddressFragBits = 0x0f0,
+
+ // Whether the final relocation is a checked one (where a linker should
+ // perform a range-check on the final address) or not. Note that this field
+ // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12:
+ // on its own is a non-checked relocation. We side with ELF on being
+ // explicit about this!
+ VK_NC = 0x100,
+
+ // Convenience definitions for referring to specific textual representations
+ // of relocation specifiers. Note that this means the "_NC" is sometimes
+ // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC
+ // since a user would write ":lo12:").
+ VK_CALL = VK_ABS,
+ VK_ABS_PAGE = VK_ABS | VK_PAGE,
+ VK_ABS_G3 = VK_ABS | VK_G3,
+ VK_ABS_G2 = VK_ABS | VK_G2,
+ VK_ABS_G2_S = VK_SABS | VK_G2,
+ VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC,
+ VK_ABS_G1 = VK_ABS | VK_G1,
+ VK_ABS_G1_S = VK_SABS | VK_G1,
+ VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC,
+ VK_ABS_G0 = VK_ABS | VK_G0,
+ VK_ABS_G0_S = VK_SABS | VK_G0,
+ VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC,
+ VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC,
+ VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC,
+ VK_GOT_PAGE = VK_GOT | VK_PAGE,
+ VK_DTPREL_G2 = VK_DTPREL | VK_G2,
+ VK_DTPREL_G1 = VK_DTPREL | VK_G1,
+ VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC,
+ VK_DTPREL_G0 = VK_DTPREL | VK_G0,
+ VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC,
+ VK_DTPREL_HI12 = VK_DTPREL | VK_HI12,
+ VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF,
+ VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC,
+ VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE,
+ VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC,
+ VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1,
+ VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC,
+ VK_TPREL_G2 = VK_TPREL | VK_G2,
+ VK_TPREL_G1 = VK_TPREL | VK_G1,
+ VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC,
+ VK_TPREL_G0 = VK_TPREL | VK_G0,
+ VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC,
+ VK_TPREL_HI12 = VK_TPREL | VK_HI12,
+ VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF,
+ VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC,
+ VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC,
+ VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE,
+
+ VK_INVALID = 0xfff
+ };
+
+private:
+ const MCExpr *Expr;
+ const VariantKind Kind;
+
+ explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind)
+ : Expr(Expr), Kind(Kind) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const AArch64MCExpr *create(const MCExpr *Expr, VariantKind Kind,
+ MCContext &Ctx);
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// Get the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// Get the expression this modifier applies to.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ /// @}
+ /// @name VariantKind information extractors.
+ /// @{
+
+ static VariantKind getSymbolLoc(VariantKind Kind) {
+ return static_cast<VariantKind>(Kind & VK_SymLocBits);
+ }
+
+ static VariantKind getAddressFrag(VariantKind Kind) {
+ return static_cast<VariantKind>(Kind & VK_AddressFragBits);
+ }
+
+ static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; }
+
+ /// @}
+
+ /// Convert the variant kind into an ELF-appropriate modifier
+ /// (e.g. ":got:", ":lo12:").
+ StringRef getVariantKindName() const;
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+
+ MCFragment *findAssociatedFragment() const override;
+
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+
+ static bool classof(const AArch64MCExpr *) { return true; }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
new file mode 100644
index 000000000000..e9d38d3dcf10
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -0,0 +1,169 @@
+//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AArch64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCTargetDesc.h"
+#include "AArch64ELFStreamer.h"
+#include "AArch64MCAsmInfo.h"
+#include "InstPrinter/AArch64InstPrinter.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "AArch64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AArch64GenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AArch64GenRegisterInfo.inc"
+
+/// Allocate a fresh MCInstrInfo and fill it in via InitAArch64MCInstrInfo.
+/// The object is created with `new`; the caller receives the raw pointer.
+static MCInstrInfo *createAArch64MCInstrInfo() {
+  auto *InstrInfo = new MCInstrInfo();
+  InitAArch64MCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+/// Build the MC subtarget info for \p TT. An empty \p CPU is replaced by
+/// "generic" before delegating to createAArch64MCSubtargetInfoImpl.
+static MCSubtargetInfo *
+createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  const StringRef EffectiveCPU = CPU.empty() ? StringRef("generic") : CPU;
+  return createAArch64MCSubtargetInfoImpl(TT, EffectiveCPU, FS);
+}
+
+/// Allocate a fresh MCRegisterInfo and initialise it with
+/// InitAArch64MCRegisterInfo, passing AArch64::LR as the second argument.
+/// The \p Triple argument is not consulted.
+static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) {
+  auto *RegInfo = new MCRegisterInfo();
+  InitAArch64MCRegisterInfo(RegInfo, AArch64::LR);
+  return RegInfo;
+}
+
+/// Create the MCAsmInfo matching the object format of \p TheTriple (Darwin
+/// flavour for Mach-O, ELF flavour otherwise) and seed its initial CFI state.
+static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
+                                         const Triple &TheTriple) {
+  MCAsmInfo *AsmInfo = nullptr;
+  if (!TheTriple.isOSBinFormatMachO()) {
+    assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+    AsmInfo = new AArch64MCAsmInfoELF(TheTriple);
+  } else {
+    AsmInfo = new AArch64MCAsmInfoDarwin();
+  }
+
+  // Initial state of the frame pointer is SP.
+  const unsigned SPDwarfReg = MRI.getDwarfRegNum(AArch64::SP, true);
+  MCCFIInstruction InitialCFA =
+      MCCFIInstruction::createDefCfa(nullptr, SPDwarfReg, 0);
+  AsmInfo->addInitialFrameState(InitialCFA);
+
+  return AsmInfo;
+}
+
+/// Normalise the requested code model \p CM in place.
+///
+/// Default becomes Small, JITDefault becomes Large, Small and Large are kept,
+/// and every other model is rejected with a fatal error.
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
+         "Only expect Darwin and ELF targets");
+
+  switch (CM) {
+  case CodeModel::Default:
+    CM = CodeModel::Small;
+    break;
+  case CodeModel::JITDefault:
+    // The default MCJIT memory managers make no guarantees about where they
+    // can find an executable page; JITed code needs to be able to refer to
+    // globals no matter how far away they are.
+    CM = CodeModel::Large;
+    break;
+  case CodeModel::Small:
+  case CodeModel::Large:
+    break;
+  default:
+    report_fatal_error(
+        "Only small and large code models are allowed on AArch64");
+  }
+}
+
+/// Create the instruction printer for the requested assembly syntax.
+/// Variant 0 yields AArch64InstPrinter, variant 1 yields
+/// AArch64AppleInstPrinter; any other value yields nullptr.
+static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI) {
+  switch (SyntaxVariant) {
+  case 0:
+    return new AArch64InstPrinter(MAI, MII, MRI);
+  case 1:
+    return new AArch64AppleInstPrinter(MAI, MII, MRI);
+  default:
+    return nullptr;
+  }
+}
+
+/// Registry callback for ELF object streaming: forwards everything except the
+/// triple to createAArch64ELFStreamer.
+static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
+                                     MCAsmBackend &TAB, raw_pwrite_stream &OS,
+                                     MCCodeEmitter *Emitter, bool RelaxAll) {
+  MCStreamer *Streamer =
+      createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
+  return Streamer;
+}
+
+/// Registry callback for Mach-O object streaming. Forwards to the
+/// seven-argument createMachOStreamer overload, always requesting label
+/// sections.
+static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
+                                       raw_pwrite_stream &OS,
+                                       MCCodeEmitter *Emitter, bool RelaxAll,
+                                       bool DWARFMustBeAtTheEnd) {
+  const bool LabelSections = true;
+  return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+                             DWARFMustBeAtTheEnd, LabelSections);
+}
+
+/// Create a plain MCInstrAnalysis bound to \p Info.
+static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
+  auto *Analysis = new MCInstrAnalysis(Info);
+  return Analysis;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeAArch64TargetMC() {
+  // Every MC component except the asm backend is shared by all three targets.
+  Target *const AllTargets[] = {&getTheAArch64leTarget(),
+                                &getTheAArch64beTarget(),
+                                &getTheARM64Target()};
+
+  for (Target *TargetPtr : AllTargets) {
+    Target &TheTarget = *TargetPtr;
+
+    // MC asm info (registered by the helper object's constructor).
+    RegisterMCAsmInfoFn AsmInfoRegistration(TheTarget, createAArch64MCAsmInfo);
+
+    // MC codegen info.
+    TargetRegistry::registerMCAdjustCodeGenOpts(TheTarget, adjustCodeGenOpts);
+
+    // MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(TheTarget, createAArch64MCInstrInfo);
+
+    // MC register info.
+    TargetRegistry::RegisterMCRegInfo(TheTarget, createAArch64MCRegisterInfo);
+
+    // MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(TheTarget,
+                                            createAArch64MCSubtargetInfo);
+
+    // MC instruction analyzer.
+    TargetRegistry::RegisterMCInstrAnalysis(TheTarget,
+                                            createAArch64InstrAnalysis);
+
+    // MC code emitter.
+    TargetRegistry::RegisterMCCodeEmitter(TheTarget,
+                                          createAArch64MCCodeEmitter);
+
+    // Object streamers.
+    TargetRegistry::RegisterELFStreamer(TheTarget, createELFStreamer);
+    TargetRegistry::RegisterMachOStreamer(TheTarget, createMachOStreamer);
+
+    // Object target streamer.
+    TargetRegistry::RegisterObjectTargetStreamer(
+        TheTarget, createAArch64ObjectTargetStreamer);
+
+    // Asm target streamer.
+    TargetRegistry::RegisterAsmTargetStreamer(TheTarget,
+                                              createAArch64AsmTargetStreamer);
+
+    // MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(TheTarget,
+                                          createAArch64MCInstPrinter);
+  }
+
+  // Asm backends: the two little-endian targets share one factory, the
+  // big-endian target has its own.
+  TargetRegistry::RegisterMCAsmBackend(getTheAArch64leTarget(),
+                                       createAArch64leAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(getTheARM64Target(),
+                                       createAArch64leAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(getTheAArch64beTarget(),
+                                       createAArch64beAsmBackend);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
new file mode 100644
index 000000000000..615d7dab2c51
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -0,0 +1,87 @@
+//===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AArch64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCInstPrinter;
+class MCRegisterInfo;
+class MCObjectWriter;
+class MCStreamer;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class MCTargetStreamer;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheAArch64leTarget();
+Target &getTheAArch64beTarget();
+Target &getTheARM64Target();
+
+MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+MCAsmBackend *createAArch64leAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+MCAsmBackend *createAArch64beAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+MCObjectWriter *createAArch64ELFObjectWriter(raw_pwrite_stream &OS,
+ uint8_t OSABI,
+ bool IsLittleEndian,
+ bool IsILP32);
+
+MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
+
+MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm);
+
+MCTargetStreamer *createAArch64ObjectTargetStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI);
+
+} // End llvm namespace
+
+// Defines symbolic names for AArch64 registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "AArch64GenRegisterInfo.inc"
+
+// Defines symbolic names for the AArch64 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "AArch64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AArch64GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
new file mode 100644
index 000000000000..53a68527ee8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -0,0 +1,431 @@
+//===-- AArch64MachObjectWriter.cpp - AArch64 Mach Object Writer ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+/// Mach-O target writer for AArch64. Always constructed as a 64-bit writer.
+class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
+public:
+  AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(/*Is64Bit=*/true, CPUType, CPUSubtype) {}
+
+  /// Record the relocation entry (or entries) for \p Fixup and update
+  /// \p FixedValue accordingly.
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override;
+
+private:
+  /// Translate a fixup kind (plus the symbol's variant kind) into a Mach-O
+  /// relocation type and log2 size. Returns false for unsupported fixups.
+  bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
+                                    const MCSymbolRefExpr *Sym,
+                                    unsigned &Log2Size, const MCAssembler &Asm);
+};
+} // end anonymous namespace
+
+/// Map an MC fixup onto the Mach-O relocation type and log2 size used to
+/// encode it.
+///
+/// \param Fixup     The fixup being lowered; its kind selects the mapping and
+///                  its location is used for diagnostics.
+/// \param RelocType [out] Reset to ARM64_RELOC_UNSIGNED on entry, then set to
+///                  the MachO::ARM64_RELOC_* value chosen for the fixup.
+/// \param Sym       The "A" symbol reference of the fixup target. May be null:
+///                  the caller passes Target.getSymA() before it has checked
+///                  whether the target is absolute. A null \p Sym is treated
+///                  as an unmodified (VK_None) reference.
+/// \param Log2Size  [out] Reset to ~0U on entry, then set to log2 of the
+///                  relocated field's size in bytes.
+/// \param Asm       Used only to report errors.
+/// \returns true if the fixup kind and symbol modifier are supported.
+bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
+    const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
+    unsigned &Log2Size, const MCAssembler &Asm) {
+  RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
+  Log2Size = ~0U;
+
+  // Read the modifier once, guarding against a symbol-less target so that the
+  // cases below never dereference a null pointer.
+  const MCSymbolRefExpr::VariantKind SymKind =
+      Sym ? Sym->getKind() : MCSymbolRefExpr::VK_None;
+
+  switch (static_cast<unsigned>(Fixup.getKind())) {
+  default:
+    return false;
+
+  case FK_Data_1:
+    Log2Size = llvm::Log2_32(1);
+    return true;
+  case FK_Data_2:
+    Log2Size = llvm::Log2_32(2);
+    return true;
+  case FK_Data_4:
+    Log2Size = llvm::Log2_32(4);
+    if (SymKind == MCSymbolRefExpr::VK_GOT)
+      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+    return true;
+  case FK_Data_8:
+    Log2Size = llvm::Log2_32(8);
+    if (SymKind == MCSymbolRefExpr::VK_GOT)
+      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+    return true;
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+    Log2Size = llvm::Log2_32(4);
+    // Only page-offset style modifiers are accepted on 12-bit immediates.
+    switch (SymKind) {
+    default:
+      return false;
+    case MCSymbolRefExpr::VK_PAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12);
+      return true;
+    case MCSymbolRefExpr::VK_GOTPAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12);
+      return true;
+    case MCSymbolRefExpr::VK_TLVPPAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12);
+      return true;
+    }
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+    Log2Size = llvm::Log2_32(4);
+    // This encompasses the relocation for the whole 21-bit value.
+    switch (SymKind) {
+    default:
+      Asm.getContext().reportError(Fixup.getLoc(),
+                                   "ADR/ADRP relocations must be GOT relative");
+      return false;
+    case MCSymbolRefExpr::VK_PAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
+      return true;
+    case MCSymbolRefExpr::VK_GOTPAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21);
+      return true;
+    case MCSymbolRefExpr::VK_TLVPPAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21);
+      return true;
+    }
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+    Log2Size = llvm::Log2_32(4);
+    RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
+    return true;
+  }
+}
+
+/// Decide whether a relocation against \p Symbol, emitted into \p Section with
+/// the given \p Log2Size, may be expressed as a local (section-relative)
+/// relocation.
+static bool canUseLocalRelocation(const MCSectionMachO &Section,
+                                  const MCSymbol &Symbol, unsigned Log2Size) {
+  // Debug info sections can use local relocations.
+  const bool InDebugSection = Section.hasAttribute(MachO::S_ATTR_DEBUG);
+  if (InDebugSection)
+    return true;
+
+  // Otherwise, only pointer sized relocations are supported.
+  const bool IsPointerSized = (Log2Size == 3);
+  if (!IsPointerSized)
+    return false;
+
+  // But only if they don't point to a few forbidden sections.
+  if (!Symbol.isInSection())
+    return true;
+
+  const auto &TargetSec = cast<MCSectionMachO>(Symbol.getSection());
+  if (TargetSec.getType() == MachO::S_CSTRING_LITERALS)
+    return false;
+
+  const bool IsObjCClassRefs = TargetSec.getSegmentName() == "__DATA" &&
+                               TargetSec.getSectionName() == "__objc_classrefs";
+  if (IsObjCClassRefs)
+    return false;
+
+  // FIXME: ld64 currently handles internal pointer-sized relocations
+  // incorrectly (applying the addend twice). We should be able to return true
+  // unconditionally by this point when that's fixed.
+  return false;
+}
+
+void AArch64MachObjectWriter::recordRelocation(
+ MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) { // Emits the Mach-O relocation(s) for Fixup via Writer and updates FixedValue.
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+
+ // See <reloc.h>.
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment);
+ unsigned Log2Size = 0;
+ int64_t Value = 0;
+ unsigned Index = 0;
+ unsigned Type = 0;
+ unsigned Kind = Fixup.getKind();
+ const MCSymbol *RelSymbol = nullptr;
+
+ FixupOffset += Fixup.getOffset();
+
+ // AArch64 pcrel relocation addends do not include the section offset.
+ if (IsPCRel)
+ FixedValue += FixupOffset;
+
+ // ADRP fixups use relocations for the whole symbol value and only
+ // put the addend in the instruction itself. Clear out any value the
+ // generic code figured out from the symbol definition.
+ if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+ FixedValue = 0;
+
+ // imm19 relocations are for conditional branches, which require
+ // assembler local symbols. If we got here, that's not what we have,
+ // so complain loudly.
+ if (Kind == AArch64::fixup_aarch64_pcrel_branch19) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "conditional branch requires assembler-local"
+ " label. '" +
+ Target.getSymA()->getSymbol().getName() +
+ "' is external.");
+ return;
+ }
+
+ // 14-bit branch relocations should only target internal labels, and so
+ // should never get here.
+ if (Kind == AArch64::fixup_aarch64_pcrel_branch14) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "Invalid relocation on conditional branch!");
+ return;
+ }
+
+ if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
+ Asm)) {
+ Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!");
+ return;
+ }
+
+ Value = Target.getConstant();
+
+ if (Target.isAbsolute()) { // constant
+ // FIXME: Should this always be extern?
+ // SymbolNum of 0 indicates the absolute section.
+ Type = MachO::ARM64_RELOC_UNSIGNED;
+
+ if (IsPCRel) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "PC relative absolute relocation!");
+ return;
+
+ // FIXME: x86_64 sets the type to a branch reloc here. Should we do
+ // something similar?
+ }
+ } else if (Target.getSymB()) { // A - B + constant
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ const MCSymbol *A_Base = Asm.getAtom(*A);
+
+ const MCSymbol *B = &Target.getSymB()->getSymbol();
+ const MCSymbol *B_Base = Asm.getAtom(*B);
+
+ // Check for "_foo@got - .", which comes through here as:
+ // Ltmp0:
+ // ... _foo@got - Ltmp0
+ if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT &&
+ Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None &&
+ Layout.getSymbolOffset(*B) ==
+ Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
+ // SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
+ Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
+ IsPCRel = 1;
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); // pcrel at bit 24, log2 size from bit 25, type from bit 28
+ Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+ return;
+ } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
+ // Otherwise, neither symbol can be modified.
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of modified symbol");
+ return;
+ }
+
+ // We don't support PCrel relocations of differences.
+ if (IsPCRel) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported pc-relative relocation of "
+ "difference");
+ return;
+ }
+
+ // AArch64 always uses external relocations. If there is no symbol to use as
+ // a base address (a local symbol with no preceding non-local symbol),
+ // error out.
+ //
+ // FIXME: We should probably just synthesize an external symbol and use
+ // that.
+ if (!A_Base) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + A->getName() +
+ "'. Must have non-local symbol earlier in section.");
+ return;
+ }
+ if (!B_Base) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + B->getName() +
+ "'. Must have non-local symbol earlier in section.");
+ return;
+ }
+
+ if (A_Base == B_Base && A_Base) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation with identical base");
+ return;
+ }
+
+ Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) -
+ (!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress(
+ *A_Base, Layout));
+ Value -= (!B->getFragment() ? 0 : Writer->getSymbolAddress(*B, Layout)) -
+ (!B_Base || !B_Base->getFragment() ? 0 : Writer->getSymbolAddress(
+ *B_Base, Layout));
+
+ Type = MachO::ARM64_RELOC_UNSIGNED;
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(A_Base, Fragment->getParent(), MRE); // first entry of the pair: UNSIGNED against A's base
+
+ RelSymbol = B_Base; // the SUBTRACTOR entry against B's base is emitted by the common tail below
+ Type = MachO::ARM64_RELOC_SUBTRACTOR;
+ } else { // A + constant
+ const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+ const MCSectionMachO &Section =
+ static_cast<const MCSectionMachO &>(*Fragment->getParent());
+
+ bool CanUseLocalRelocation =
+ canUseLocalRelocation(Section, *Symbol, Log2Size);
+ if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) {
+ const MCSection &Sec = Symbol->getSection();
+ if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+ Symbol->setUsedInReloc();
+ }
+
+ const MCSymbol *Base = Asm.getAtom(*Symbol);
+
+ // If the symbol is a variable and we weren't able to get a Base for it
+ // (i.e., it's not in the symbol table associated with a section) resolve
+ // the relocation based on its expansion instead.
+ if (Symbol->isVariable() && !Base) {
+ // If the evaluation is an absolute value, just use that directly
+ // to keep things easy.
+ int64_t Res;
+ if (Symbol->getVariableValue()->evaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+
+ // FIXME: Will the Target we already have ever have any data in it
+ // we need to preserve and merge with the new Target? How about
+ // the FixedValue?
+ if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout,
+ &Fixup)) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unable to resolve variable '" +
+ Symbol->getName() + "'");
+ return;
+ }
+ return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ }
+
+ // Relocations inside debug sections always use local relocations when
+ // possible. This seems to be done because the debugger doesn't fully
+ // understand relocation entries and expects to find values that
+ // have already been fixed up.
+ if (Symbol->isInSection()) {
+ if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+ Base = nullptr;
+ }
+
+ // AArch64 uses external relocations as much as possible. For debug
+ // sections, and for pointer-sized relocations (.quad), we allow section
+ // relocations. It's code sections that run into trouble.
+ if (Base) {
+ RelSymbol = Base;
+
+ // Add the local offset, if needed.
+ if (Base != Symbol)
+ Value +=
+ Layout.getSymbolOffset(*Symbol) - Layout.getSymbolOffset(*Base);
+ } else if (Symbol->isInSection()) {
+ if (!CanUseLocalRelocation) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + Symbol->getName() +
+ "'. Must have non-local symbol earlier in section.");
+ return;
+ }
+ // Adjust the relocation to be section-relative.
+ // The index is the section ordinal (1-based).
+ const MCSection &Sec = Symbol->getSection();
+ Index = Sec.getOrdinal() + 1;
+ Value += Writer->getSymbolAddress(*Symbol, Layout);
+
+ if (IsPCRel)
+ Value -= Writer->getFragmentAddress(Fragment, Layout) +
+ Fixup.getOffset() + (1ULL << Log2Size);
+ } else {
+ // Resolve constant variables.
+ if (Symbol->isVariable()) {
+ int64_t Res;
+ if (Symbol->getVariableValue()->evaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+ }
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of variable '" +
+ Symbol->getName() + "'");
+ return;
+ }
+ }
+
+ // If the relocation kind is Branch26, Page21, or Pageoff12, any addend
+ // is represented via an Addend relocation, not encoded directly into
+ // the instruction.
+ if ((Type == MachO::ARM64_RELOC_BRANCH26 ||
+ Type == MachO::ARM64_RELOC_PAGE21 ||
+ Type == MachO::ARM64_RELOC_PAGEOFF12) &&
+ Value) {
+ assert((Value & 0xff000000) == 0 && "Added relocation out of range!");
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+
+ // Now set up the Addend relocation.
+ Type = MachO::ARM64_RELOC_ADDEND;
+ Index = Value; // the addend is carried in the low (index) bits of r_word1 below
+ RelSymbol = nullptr;
+ IsPCRel = 0;
+ Log2Size = 2;
+
+ // Put zero into the instruction itself. The addend is in the relocation.
+ Value = 0;
+ }
+
+ // If there's any addend left to handle, encode it in the instruction.
+ FixedValue = Value;
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+/// Create a little-endian Mach-O object writer backed by a new
+/// AArch64MachObjectWriter for the given CPU type and subtype.
+MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_pwrite_stream &OS,
+                                                    uint32_t CPUType,
+                                                    uint32_t CPUSubtype) {
+  auto *TargetWriter = new AArch64MachObjectWriter(CPUType, CPUSubtype);
+  return createMachObjectWriter(TargetWriter, OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
new file mode 100644
index 000000000000..3e86a42d5be6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -0,0 +1,41 @@
+//===- AArch64TargetStreamer.cpp - AArch64TargetStreamer class ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64TargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetStreamer.h"
+#include "llvm/MC/ConstantPools.h"
+using namespace llvm;
+
+//
+// AArch64TargetStreamer Implementation
+//
+/// Attach to streamer \p S and create an empty set of assembler constant
+/// pools.
+AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
+
+AArch64TargetStreamer::~AArch64TargetStreamer() = default;
+
+// The constant pool handling is shared by all AArch64TargetStreamer
+// implementations.
+
+/// Add \p Expr to the constant pools and return the expression handed back by
+/// AssemblerConstantPools::addEntry.
+const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr,
+                                                          unsigned Size,
+                                                          SMLoc Loc) {
+  const MCExpr *PoolRef = ConstantPools->addEntry(Streamer, Expr, Size, Loc);
+  return PoolRef;
+}
+
+/// Emit the constant pool belonging to the streamer's current section.
+void AArch64TargetStreamer::emitCurrentConstantPool() {
+  ConstantPools->emitForCurrentSection(Streamer);
+}
+
+/// finish() - write out any non-empty assembler constant pools.
+void AArch64TargetStreamer::finish() {
+  ConstantPools->emitAll(Streamer);
+}
+
+/// Default .inst handling: does nothing in this base class.
+void AArch64TargetStreamer::emitInst(uint32_t /*Inst*/) {}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
new file mode 100644
index 000000000000..51432830f795
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -0,0 +1,42 @@
+//===-- AArch64TargetStreamer.h - AArch64 Target Streamer ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64TARGETSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64TARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class AArch64TargetStreamer : public MCTargetStreamer {
+public:
+ AArch64TargetStreamer(MCStreamer &S);
+ ~AArch64TargetStreamer() override;
+
+ void finish() override;
+
+ /// Callback used to implement the ldr= pseudo.
+ /// Add a new entry to the constant pool for the current section and return an
+ /// MCExpr that can be used to refer to the constant pool location.
+ const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size, SMLoc Loc);
+
+ /// Callback used to implement the .ltorg directive.
+ /// Emit contents of constant pool for the current section.
+ void emitCurrentConstantPool();
+
+ /// Callback used to implement the .inst directive.
+ virtual void emitInst(uint32_t Inst);
+
+private:
+ std::unique_ptr<AssemblerConstantPools> ConstantPools;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
new file mode 100644
index 000000000000..7ac9a5a08484
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -0,0 +1,39 @@
+//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+namespace llvm {
+/// Accessor for the little-endian AArch64 Target object, which lives in a
+/// function-local static.
+Target &getTheAArch64leTarget() {
+  static Target LittleEndianTarget;
+  return LittleEndianTarget;
+}
+
+/// Accessor for the big-endian AArch64 Target object.
+Target &getTheAArch64beTarget() {
+  static Target BigEndianTarget;
+  return BigEndianTarget;
+}
+
+/// Accessor for the Target object registered under the "arm64" name.
+Target &getTheARM64Target() {
+  static Target ARM64AliasTarget;
+  return ARM64AliasTarget;
+}
+} // namespace llvm
+
+extern "C" void LLVMInitializeAArch64TargetInfo() {
+  // Now register the "arm64" name for use with "-march". We don't want it to
+  // take possession of the Triple::aarch64 tag though, so its architecture
+  // matcher rejects every arch type.
+  auto NeverMatches = [](Triple::ArchType) { return false; };
+  TargetRegistry::RegisterTarget(getTheARM64Target(), "arm64",
+                                 "ARM64 (little endian)", NeverMatches,
+                                 /*HasJIT=*/true);
+
+  RegisterTarget<Triple::aarch64, /*HasJIT=*/true> RegisterLE(
+      getTheAArch64leTarget(), "aarch64", "AArch64 (little endian)");
+  RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> RegisterBE(
+      getTheAArch64beTarget(), "aarch64_be", "AArch64 (big endian)");
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
new file mode 100644
index 000000000000..e65ba1f2401d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -0,0 +1,122 @@
+//===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides basic encoding and assembly information for AArch64.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64BaseInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Regex.h"
+
+using namespace llvm;
+
+namespace llvm {
+ namespace AArch64AT {
+#define GET_AT_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+
+namespace llvm {
+ namespace AArch64DB {
+#define GET_DB_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64DC {
+#define GET_DC_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64IC {
+#define GET_IC_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64ISB {
+#define GET_ISB_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+namespace llvm {
+ namespace AArch64PRFM {
+#define GET_PRFM_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64PState {
+#define GET_PSTATE_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64PSBHint {
+#define GET_PSB_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64SysReg {
+#define GET_SYSREG_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+/// Parse a generic system register name of the form
+/// S<op0>_<op1>_C<n>_C<m>_<op2> (case-insensitive) into its packed encoding.
+///
+/// \returns the encoding (op0 at bit 14, op1 at bit 11, CRn at bit 7, CRm at
+/// bit 3, op2 at bit 0), or -1 converted to uint32_t if \p Name does not
+/// match the pattern.
+uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) {
+  // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name
+  Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
+
+  std::string UpperName = Name.upper();
+  SmallVector<StringRef, 5> Ops;
+  const bool Matched = GenericRegPattern.match(UpperName, &Ops);
+  if (!Matched)
+    return -1;
+
+  // Ops[0] is the whole match; the five captured fields follow in order.
+  uint32_t Fields[5] = {0, 0, 0, 0, 0};
+  for (unsigned I = 0; I != 5; ++I)
+    Ops[I + 1].getAsInteger(10, Fields[I]);
+
+  return (Fields[0] << 14) | (Fields[1] << 11) | (Fields[2] << 7) |
+         (Fields[3] << 3) | Fields[4];
+}
+
+/// Render a packed system register encoding as the generic name
+/// "S<op0>_<op1>_C<n>_C<m>_<op2>". \p Bits must fit in 16 bits.
+std::string AArch64SysReg::genericRegisterString(uint32_t Bits) {
+  assert(Bits < 0x10000);
+
+  // Unpack the five fields from their bit positions.
+  const uint32_t Op0 = (Bits >> 14) & 0x3;
+  const uint32_t Op1 = (Bits >> 11) & 0x7;
+  const uint32_t CRn = (Bits >> 7) & 0xf;
+  const uint32_t CRm = (Bits >> 3) & 0xf;
+  const uint32_t Op2 = Bits & 0x7;
+
+  std::string Name = "S";
+  Name += utostr(Op0);
+  Name += "_";
+  Name += utostr(Op1);
+  Name += "_C";
+  Name += utostr(CRn);
+  Name += "_C";
+  Name += utostr(CRm);
+  Name += "_";
+  Name += utostr(Op2);
+  return Name;
+}
+
+namespace llvm {
+ namespace AArch64TLBI {
+#define GET_TLBI_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
new file mode 100644
index 000000000000..dcc39176031c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -0,0 +1,524 @@
+//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the AArch64 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
+#define LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
+
+// FIXME: Is it easiest to fix this layering violation by moving the .inc
+// #includes from AArch64MCTargetDesc.h to here?
+#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends.
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+inline static unsigned getWRegFromXReg(unsigned Reg) { // Map an X-register id (X0-X28, FP, LR, SP, XZR) to its W counterpart.
+ switch (Reg) {
+ case AArch64::X0: return AArch64::W0;
+ case AArch64::X1: return AArch64::W1;
+ case AArch64::X2: return AArch64::W2;
+ case AArch64::X3: return AArch64::W3;
+ case AArch64::X4: return AArch64::W4;
+ case AArch64::X5: return AArch64::W5;
+ case AArch64::X6: return AArch64::W6;
+ case AArch64::X7: return AArch64::W7;
+ case AArch64::X8: return AArch64::W8;
+ case AArch64::X9: return AArch64::W9;
+ case AArch64::X10: return AArch64::W10;
+ case AArch64::X11: return AArch64::W11;
+ case AArch64::X12: return AArch64::W12;
+ case AArch64::X13: return AArch64::W13;
+ case AArch64::X14: return AArch64::W14;
+ case AArch64::X15: return AArch64::W15;
+ case AArch64::X16: return AArch64::W16;
+ case AArch64::X17: return AArch64::W17;
+ case AArch64::X18: return AArch64::W18;
+ case AArch64::X19: return AArch64::W19;
+ case AArch64::X20: return AArch64::W20;
+ case AArch64::X21: return AArch64::W21;
+ case AArch64::X22: return AArch64::W22;
+ case AArch64::X23: return AArch64::W23;
+ case AArch64::X24: return AArch64::W24;
+ case AArch64::X25: return AArch64::W25;
+ case AArch64::X26: return AArch64::W26;
+ case AArch64::X27: return AArch64::W27;
+ case AArch64::X28: return AArch64::W28;
+ case AArch64::FP: return AArch64::W29; // FP is paired with W29.
+ case AArch64::LR: return AArch64::W30; // LR is paired with W30.
+ case AArch64::SP: return AArch64::WSP;
+ case AArch64::XZR: return AArch64::WZR;
+ }
+ // For anything else (any id not listed above), return it unchanged.
+ return Reg;
+}
+
+inline static unsigned getXRegFromWReg(unsigned Reg) { // Map a W-register id (W0-W30, WSP, WZR) to its X counterpart.
+ switch (Reg) {
+ case AArch64::W0: return AArch64::X0;
+ case AArch64::W1: return AArch64::X1;
+ case AArch64::W2: return AArch64::X2;
+ case AArch64::W3: return AArch64::X3;
+ case AArch64::W4: return AArch64::X4;
+ case AArch64::W5: return AArch64::X5;
+ case AArch64::W6: return AArch64::X6;
+ case AArch64::W7: return AArch64::X7;
+ case AArch64::W8: return AArch64::X8;
+ case AArch64::W9: return AArch64::X9;
+ case AArch64::W10: return AArch64::X10;
+ case AArch64::W11: return AArch64::X11;
+ case AArch64::W12: return AArch64::X12;
+ case AArch64::W13: return AArch64::X13;
+ case AArch64::W14: return AArch64::X14;
+ case AArch64::W15: return AArch64::X15;
+ case AArch64::W16: return AArch64::X16;
+ case AArch64::W17: return AArch64::X17;
+ case AArch64::W18: return AArch64::X18;
+ case AArch64::W19: return AArch64::X19;
+ case AArch64::W20: return AArch64::X20;
+ case AArch64::W21: return AArch64::X21;
+ case AArch64::W22: return AArch64::X22;
+ case AArch64::W23: return AArch64::X23;
+ case AArch64::W24: return AArch64::X24;
+ case AArch64::W25: return AArch64::X25;
+ case AArch64::W26: return AArch64::X26;
+ case AArch64::W27: return AArch64::X27;
+ case AArch64::W28: return AArch64::X28;
+ case AArch64::W29: return AArch64::FP; // W29 is paired with FP.
+ case AArch64::W30: return AArch64::LR; // W30 is paired with LR.
+ case AArch64::WSP: return AArch64::SP;
+ case AArch64::WZR: return AArch64::XZR;
+ }
+ // For anything else (any id not listed above), return it unchanged.
+ return Reg;
+}
+
+static inline unsigned getBRegFromDReg(unsigned Reg) { // Map a D-register id (D0-D31) to the same-numbered B register.
+ switch (Reg) {
+ case AArch64::D0: return AArch64::B0;
+ case AArch64::D1: return AArch64::B1;
+ case AArch64::D2: return AArch64::B2;
+ case AArch64::D3: return AArch64::B3;
+ case AArch64::D4: return AArch64::B4;
+ case AArch64::D5: return AArch64::B5;
+ case AArch64::D6: return AArch64::B6;
+ case AArch64::D7: return AArch64::B7;
+ case AArch64::D8: return AArch64::B8;
+ case AArch64::D9: return AArch64::B9;
+ case AArch64::D10: return AArch64::B10;
+ case AArch64::D11: return AArch64::B11;
+ case AArch64::D12: return AArch64::B12;
+ case AArch64::D13: return AArch64::B13;
+ case AArch64::D14: return AArch64::B14;
+ case AArch64::D15: return AArch64::B15;
+ case AArch64::D16: return AArch64::B16;
+ case AArch64::D17: return AArch64::B17;
+ case AArch64::D18: return AArch64::B18;
+ case AArch64::D19: return AArch64::B19;
+ case AArch64::D20: return AArch64::B20;
+ case AArch64::D21: return AArch64::B21;
+ case AArch64::D22: return AArch64::B22;
+ case AArch64::D23: return AArch64::B23;
+ case AArch64::D24: return AArch64::B24;
+ case AArch64::D25: return AArch64::B25;
+ case AArch64::D26: return AArch64::B26;
+ case AArch64::D27: return AArch64::B27;
+ case AArch64::D28: return AArch64::B28;
+ case AArch64::D29: return AArch64::B29;
+ case AArch64::D30: return AArch64::B30;
+ case AArch64::D31: return AArch64::B31;
+ }
+ // For anything else (any id not listed above), return it unchanged.
+ return Reg;
+}
+
+
+static inline unsigned getDRegFromBReg(unsigned Reg) { // Map a B-register id (B0-B31) to the same-numbered D register.
+ switch (Reg) {
+ case AArch64::B0: return AArch64::D0;
+ case AArch64::B1: return AArch64::D1;
+ case AArch64::B2: return AArch64::D2;
+ case AArch64::B3: return AArch64::D3;
+ case AArch64::B4: return AArch64::D4;
+ case AArch64::B5: return AArch64::D5;
+ case AArch64::B6: return AArch64::D6;
+ case AArch64::B7: return AArch64::D7;
+ case AArch64::B8: return AArch64::D8;
+ case AArch64::B9: return AArch64::D9;
+ case AArch64::B10: return AArch64::D10;
+ case AArch64::B11: return AArch64::D11;
+ case AArch64::B12: return AArch64::D12;
+ case AArch64::B13: return AArch64::D13;
+ case AArch64::B14: return AArch64::D14;
+ case AArch64::B15: return AArch64::D15;
+ case AArch64::B16: return AArch64::D16;
+ case AArch64::B17: return AArch64::D17;
+ case AArch64::B18: return AArch64::D18;
+ case AArch64::B19: return AArch64::D19;
+ case AArch64::B20: return AArch64::D20;
+ case AArch64::B21: return AArch64::D21;
+ case AArch64::B22: return AArch64::D22;
+ case AArch64::B23: return AArch64::D23;
+ case AArch64::B24: return AArch64::D24;
+ case AArch64::B25: return AArch64::D25;
+ case AArch64::B26: return AArch64::D26;
+ case AArch64::B27: return AArch64::D27;
+ case AArch64::B28: return AArch64::D28;
+ case AArch64::B29: return AArch64::D29;
+ case AArch64::B30: return AArch64::D30;
+ case AArch64::B31: return AArch64::D31;
+ }
+ // For anything else (any id not listed above), return it unchanged.
+ return Reg;
+}
+
+namespace AArch64CC {
+
+// The CondCodes constants map directly to the 4-bit encoding of the condition
+// field for predicated instructions.
+enum CondCode { // Meaning (integer) Meaning (floating-point)
+ EQ = 0x0, // Equal Equal
+ NE = 0x1, // Not equal Not equal, or unordered
+ HS = 0x2, // Unsigned higher or same >, ==, or unordered
+ LO = 0x3, // Unsigned lower Less than
+ MI = 0x4, // Minus, negative Less than
+ PL = 0x5, // Plus, positive or zero >, ==, or unordered
+ VS = 0x6, // Overflow Unordered
+ VC = 0x7, // No overflow Not unordered
+ HI = 0x8, // Unsigned higher Greater than, or unordered
+ LS = 0x9, // Unsigned lower or same Less than or equal
+ GE = 0xa, // Greater than or equal Greater than or equal
+ LT = 0xb, // Less than Less than, or unordered
+ GT = 0xc, // Greater than Greater than
+ LE = 0xd, // Less than or equal <, ==, or unordered
+ AL = 0xe, // Always (unconditional) Always (unconditional)
+ NV = 0xf, // Always (unconditional) Always (unconditional)
+ // Note the NV exists purely to disassemble 0b1111. Execution is "always".
+ Invalid // Sentinel one past NV (value 0x10); it does not fit the 4-bit condition field.
+};
+
+/// Return the lower-case assembly mnemonic for \p Code. Only the sixteen
+/// encodable codes (EQ through NV) are accepted; anything else is unreachable.
+inline static const char *getCondCodeName(CondCode Code) {
+ // Mnemonics indexed by the 4-bit encoding, EQ (0x0) through NV (0xf).
+ static const char *const Mnemonics[] = {
+ "eq", "ne", "hs", "lo", "mi", "pl", "vs", "vc",
+ "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"};
+
+ const unsigned Index = static_cast<unsigned>(Code);
+ if (Index > static_cast<unsigned>(NV))
+ llvm_unreachable("Unknown condition code");
+ return Mnemonics[Index];
+}
+
+/// Return the condition code whose encoding differs from \p Code only in the
+/// lowest bit.
+inline static CondCode getInvertedCondCode(CondCode Code) {
+ // Opposite conditions sit in adjacent even/odd encodings (EQ/NE, HS/LO, ...),
+ // so flipping bit 0 is all that is needed to reverse one.
+ const unsigned Flipped = static_cast<unsigned>(Code) ^ 0x1;
+ return static_cast<CondCode>(Flipped);
+}
+
+/// Produce one NZCV flag combination under which \p Code holds, laid out the
+/// way the ccmp instructions expect their immediate. Several combinations may
+/// satisfy a condition; only one representative is returned. AL and NV are not
+/// handled and fall into the unreachable default.
+inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
+ // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7.
+ enum { N = 8, Z = 4, C = 2, V = 1 };
+ switch (Code) {
+ // Satisfied by setting only Z: EQ (Z == 1), LE (Z == 1 || N != V).
+ case EQ:
+ case LE:
+ return Z;
+ // Satisfied by setting only C: HS (C == 1), HI (C == 1 && Z == 0).
+ case HS:
+ case HI:
+ return C;
+ // Satisfied by setting only N: MI (N == 1), LT (N != V).
+ case MI:
+ case LT:
+ return N;
+ // Satisfied by setting only V: VS (V == 1).
+ case VS:
+ return V;
+ // Satisfied with every flag clear: NE (Z == 0), LO (C == 0), PL (N == 0),
+ // VC (V == 0), LS (C == 0 || Z == 1), GE (N == V), GT (Z == 0 && N == V).
+ case NE:
+ case LO:
+ case PL:
+ case VC:
+ case LS:
+ case GE:
+ case GT:
+ return 0;
+ default:
+ llvm_unreachable("Unknown condition code");
+ }
+}
+} // end namespace AArch64CC
+
+namespace AArch64AT{
+ struct AT { // One table entry: a Name paired with its 16-bit Encoding.
+ const char *Name;
+ uint16_t Encoding;
+ };
+
+ #define GET_AT_DECL
+ #include "AArch64GenSystemOperands.inc"
+
+} // end namespace AArch64AT
+namespace AArch64DB {
+ struct DB { // One table entry: a Name paired with its 16-bit Encoding.
+ const char *Name;
+ uint16_t Encoding;
+ };
+
+ #define GET_DB_DECL
+ #include "AArch64GenSystemOperands.inc"
+} // end namespace AArch64DB
+
+namespace AArch64DC {
+ struct DC { // One table entry: a Name paired with its 16-bit Encoding.
+ const char *Name;
+ uint16_t Encoding;
+ };
+
+ #define GET_DC_DECL
+ #include "AArch64GenSystemOperands.inc"
+} // end namespace AArch64DC
+
+namespace AArch64IC {
+ struct IC { // One table entry: Name, 16-bit Encoding, and a NeedsReg flag.
+ const char *Name;
+ uint16_t Encoding;
+ bool NeedsReg; // NOTE(review): presumably whether the operation takes a register operand -- confirm.
+ };
+ #define GET_IC_DECL
+ #include "AArch64GenSystemOperands.inc"
+} // end namespace AArch64IC
+
+namespace AArch64ISB {
+ struct ISB { // One table entry: a Name paired with its 16-bit Encoding.
+ const char *Name;
+ uint16_t Encoding;
+ };
+ #define GET_ISB_DECL
+ #include "AArch64GenSystemOperands.inc"
+} // end namespace AArch64ISB
+
+namespace AArch64PRFM {
+ struct PRFM { // One table entry: a Name paired with its 16-bit Encoding.
+ const char *Name;
+ uint16_t Encoding;
+ };
+ #define GET_PRFM_DECL
+ #include "AArch64GenSystemOperands.inc"
+} // end namespace AArch64PRFM
+
+namespace AArch64PState {
+ struct PState { // One table entry: Name, 16-bit Encoding, and the features it requires.
+ const char *Name;
+ uint16_t Encoding;
+ FeatureBitset FeaturesRequired;
+
+ bool haveFeatures(FeatureBitset ActiveFeatures) const { // True when every required feature bit is also set in ActiveFeatures.
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
+ };
+ #define GET_PSTATE_DECL
+ #include "AArch64GenSystemOperands.inc"
+} // end namespace AArch64PState
+
+namespace AArch64PSBHint {
+ struct PSB { // One table entry: a Name paired with its 16-bit Encoding.
+ const char *Name;
+ uint16_t Encoding;
+ };
+ #define GET_PSB_DECL
+ #include "AArch64GenSystemOperands.inc"
+} // end namespace AArch64PSBHint
+
+namespace AArch64SE {
+ enum ShiftExtSpecifiers { // Shift specifiers first, then the unsigned and signed extend specifiers.
+ Invalid = -1, // Sentinel; the real specifiers therefore start at 0 with LSL.
+ LSL,
+ MSL,
+ LSR,
+ ASR,
+ ROR,
+
+ UXTB,
+ UXTH,
+ UXTW,
+ UXTX,
+
+ SXTB,
+ SXTH,
+ SXTW,
+ SXTX
+ };
+} // end namespace AArch64SE
+
+namespace AArch64Layout {
+ enum VectorLayout { // Each VL_* value has a textual suffix (".8b", ".4h", ...) in the conversion helpers in this header.
+ Invalid = -1, // Sentinel; also what AArch64StringToVectorLayout yields for unknown text.
+ VL_8B,
+ VL_4H,
+ VL_2S,
+ VL_1D,
+
+ VL_16B,
+ VL_8H,
+ VL_4S,
+ VL_2D,
+
+ // Bare layout for the 128-bit vector
+ // (only show ".b", ".h", ".s", ".d" without vector number)
+ VL_B,
+ VL_H,
+ VL_S,
+ VL_D
+ };
+} // end namespace AArch64Layout
+
+/// Return the assembly suffix (".8b", ".4h", ..., ".d") for \p Layout. Any
+/// value other than the twelve VL_* layouts is unreachable.
+inline static const char *
+AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) {
+ namespace VL = AArch64Layout;
+ switch (Layout) {
+ default:
+ llvm_unreachable("Unknown Vector Layout");
+ // Layouts that carry an element count.
+ case VL::VL_8B:
+ return ".8b";
+ case VL::VL_4H:
+ return ".4h";
+ case VL::VL_2S:
+ return ".2s";
+ case VL::VL_1D:
+ return ".1d";
+ case VL::VL_16B:
+ return ".16b";
+ case VL::VL_8H:
+ return ".8h";
+ case VL::VL_4S:
+ return ".4s";
+ case VL::VL_2D:
+ return ".2d";
+ // Bare layouts, element size only.
+ case VL::VL_B:
+ return ".b";
+ case VL::VL_H:
+ return ".h";
+ case VL::VL_S:
+ return ".s";
+ case VL::VL_D:
+ return ".d";
+ }
+}
+
+/// Translate a layout suffix such as ".8b" or ".d" into its VectorLayout
+/// value. The comparison is exact (case-sensitive); unrecognised text yields
+/// AArch64Layout::Invalid.
+inline static AArch64Layout::VectorLayout
+AArch64StringToVectorLayout(StringRef LayoutStr) {
+ namespace VL = AArch64Layout;
+ StringSwitch<VL::VectorLayout> Matcher(LayoutStr);
+ // Layouts that carry an element count.
+ Matcher.Case(".8b", VL::VL_8B);
+ Matcher.Case(".4h", VL::VL_4H);
+ Matcher.Case(".2s", VL::VL_2S);
+ Matcher.Case(".1d", VL::VL_1D);
+ Matcher.Case(".16b", VL::VL_16B);
+ Matcher.Case(".8h", VL::VL_8H);
+ Matcher.Case(".4s", VL::VL_4S);
+ Matcher.Case(".2d", VL::VL_2D);
+ // Bare layouts, element size only.
+ Matcher.Case(".b", VL::VL_B);
+ Matcher.Case(".h", VL::VL_H);
+ Matcher.Case(".s", VL::VL_S);
+ Matcher.Case(".d", VL::VL_D);
+ return Matcher.Default(VL::Invalid);
+}
+
+namespace AArch64SysReg {
+ struct SysReg { // One table entry: Name, Encoding, access flags, and the features it requires.
+ const char *Name;
+ unsigned Encoding;
+ bool Readable;
+ bool Writeable;
+ FeatureBitset FeaturesRequired;
+
+ bool haveFeatures(FeatureBitset ActiveFeatures) const { // True when every required feature bit is also set in ActiveFeatures.
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
+ };
+
+ #define GET_SYSREG_DECL
+ #include "AArch64GenSystemOperands.inc"
+
+ const SysReg *lookupSysRegByName(StringRef);
+ const SysReg *lookupSysRegByEncoding(uint16_t);
+
+ uint32_t parseGenericRegister(StringRef Name); // Parses S<op0>_<op1>_C<n>_C<m>_<op2>; yields uint32_t(-1) when Name does not match.
+ std::string genericRegisterString(uint32_t Bits); // Formats a 16-bit encoding back into the S<op0>_<op1>_C<n>_C<m>_<op2> spelling.
+} // end namespace AArch64SysReg
+
+namespace AArch64TLBI {
+ struct TLBI { // One table entry: Name, 16-bit Encoding, and a NeedsReg flag.
+ const char *Name;
+ uint16_t Encoding;
+ bool NeedsReg; // NOTE(review): presumably whether the operation takes a register operand -- confirm.
+ };
+ #define GET_TLBI_DECL
+ #include "AArch64GenSystemOperands.inc"
+} // end namespace AArch64TLBI
+
+namespace AArch64II {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // AArch64 Specific MachineOperand flags.
+
+ MO_NO_FLAG,
+
+ MO_FRAGMENT = 0xf, // NOTE(review): presumably the mask covering the MO_PAGE..MO_HI12 values (1-7) below -- confirm.
+
+ /// MO_PAGE - A symbol operand with this flag represents the pc-relative
+ /// offset of the 4K page containing the symbol. This is used with the
+ /// ADRP instruction.
+ MO_PAGE = 1,
+
+ /// MO_PAGEOFF - A symbol operand with this flag represents the offset of
+ /// that symbol within a 4K page. This offset is added to the page address
+ /// to produce the complete address.
+ MO_PAGEOFF = 2,
+
+ /// MO_G3 - A symbol operand with this flag (granule 3) represents the high
+ /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G3 = 3,
+
+ /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits
+ /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G2 = 4,
+
+ /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits
+ /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G1 = 5,
+
+ /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits
+ /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G0 = 6,
+
+ /// MO_HI12 - This flag indicates that a symbol operand represents the bits
+ /// 12-23 of a 64-bit address, used in an arithmetic immediate-shifted-left-
+ /// by-12-bits instruction.
+ MO_HI12 = 7,
+
+ /// MO_GOT - This flag indicates that a symbol operand represents the
+ /// address of the GOT entry for the symbol, rather than the address of
+ /// the symbol itself.
+ MO_GOT = 0x10,
+
+ /// MO_NC - Indicates whether the linker is expected to check the symbol
+ /// reference for overflow. For example in an ADRP/ADD pair of relocations
+ /// the ADRP usually does check, but not the ADD.
+ MO_NC = 0x20,
+
+ /// MO_TLS - Indicates that the operand being accessed is some kind of
+ /// thread-local symbol. On Darwin, only one type of thread-local access
+ /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
+ /// referee will affect interpretation.
+ MO_TLS = 0x40
+ };
+} // end namespace AArch64II
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
new file mode 100644
index 000000000000..7b0a7f4b6058
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -0,0 +1,175 @@
+//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AMDGPUTargetMachine;
+class FunctionPass;
+class GCNTargetMachine;
+class ModulePass;
+class Pass;
+class Target;
+class TargetMachine;
+class PassRegistry;
+
+// R600 Passes
+FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
+FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600EmitClauseMarkers();
+FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
+FunctionPass *createR600Packetizer(TargetMachine &tm);
+FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createAMDGPUCFGStructurizerPass();
+
+// SI Passes
+FunctionPass *createSITypeRewriter();
+FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIShrinkInstructionsPass();
+FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSIWholeQuadModePass();
+FunctionPass *createSIFixControlFlowLiveIntervalsPass();
+FunctionPass *createSIFixSGPRCopiesPass();
+FunctionPass *createSIDebuggerInsertNopsPass();
+FunctionPass *createSIInsertWaitsPass();
+FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
+extern char &AMDGPUAnnotateKernelFeaturesID;
+
+void initializeSIFoldOperandsPass(PassRegistry &);
+extern char &SIFoldOperandsID;
+
+void initializeSIShrinkInstructionsPass(PassRegistry&);
+extern char &SIShrinkInstructionsID;
+
+void initializeSIFixSGPRCopiesPass(PassRegistry &);
+extern char &SIFixSGPRCopiesID;
+
+void initializeSILowerI1CopiesPass(PassRegistry &);
+extern char &SILowerI1CopiesID;
+
+void initializeSILoadStoreOptimizerPass(PassRegistry &);
+extern char &SILoadStoreOptimizerID;
+
+void initializeSIWholeQuadModePass(PassRegistry &);
+extern char &SIWholeQuadModeID;
+
+void initializeSILowerControlFlowPass(PassRegistry &);
+extern char &SILowerControlFlowID;
+
+void initializeSIInsertSkipsPass(PassRegistry &);
+extern char &SIInsertSkipsPassID;
+
+void initializeSIOptimizeExecMaskingPass(PassRegistry &);
+extern char &SIOptimizeExecMaskingID;
+
+// Passes common to R600 and SI
+FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaID;
+
+Pass *createAMDGPUStructurizeCFGPass();
+FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+ModulePass *createAMDGPUAlwaysInlinePass();
+ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
+FunctionPass *createAMDGPUAnnotateUniformValues();
+
+FunctionPass* createAMDGPUUnifyMetadataPass();
+void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
+extern char &AMDGPUUnifyMetadataID;
+
+void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
+extern char &SIFixControlFlowLiveIntervalsID;
+
+void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
+extern char &AMDGPUAnnotateUniformValuesPassID;
+
+void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
+extern char &AMDGPUCodeGenPrepareID;
+
+void initializeSIAnnotateControlFlowPass(PassRegistry&);
+extern char &SIAnnotateControlFlowPassID;
+
+void initializeSIDebuggerInsertNopsPass(PassRegistry&);
+extern char &SIDebuggerInsertNopsID;
+
+void initializeSIInsertWaitsPass(PassRegistry&);
+extern char &SIInsertWaitsID;
+
+Target &getTheAMDGPUTarget();
+Target &getTheGCNTarget();
+
+namespace AMDGPU {
+enum TargetIndex { // Implicitly numbered from 0 (TI_CONSTDATA_START) to 4 (TI_SCRATCH_RSRC_DWORD3).
+ TI_CONSTDATA_START,
+ TI_SCRATCH_RSRC_DWORD0,
+ TI_SCRATCH_RSRC_DWORD1,
+ TI_SCRATCH_RSRC_DWORD2,
+ TI_SCRATCH_RSRC_DWORD3
+};
+} // End namespace AMDGPU
+
+} // End namespace llvm
+
+/// OpenCL uses address spaces to differentiate between
+/// various memory regions on the hardware. On the CPU
+/// all of the address spaces point to the same memory,
+/// however on the GPU, each address space points to
+/// a separate piece of memory that is unique from other
+/// memory locations.
+namespace AMDGPUAS {
+enum AddressSpaces : unsigned {
+ PRIVATE_ADDRESS = 0, ///< Address space for private memory.
+ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
+ CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
+ LOCAL_ADDRESS = 3, ///< Address space for local memory.
+ FLAT_ADDRESS = 4, ///< Address space for flat memory.
+ REGION_ADDRESS = 5, ///< Address space for region memory.
+ PARAM_D_ADDRESS = 6, ///< Address space for directly addressable parameter memory (CONST0)
+ PARAM_I_ADDRESS = 7, ///< Address space for indirectly addressable parameter memory (VTX1)
+
+ // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this
+ // order to be able to dynamically index a constant buffer, for example:
+ //
+ // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
+
+ CONSTANT_BUFFER_0 = 8,
+ CONSTANT_BUFFER_1 = 9,
+ CONSTANT_BUFFER_2 = 10,
+ CONSTANT_BUFFER_3 = 11,
+ CONSTANT_BUFFER_4 = 12,
+ CONSTANT_BUFFER_5 = 13,
+ CONSTANT_BUFFER_6 = 14,
+ CONSTANT_BUFFER_7 = 15,
+ CONSTANT_BUFFER_8 = 16,
+ CONSTANT_BUFFER_9 = 17,
+ CONSTANT_BUFFER_10 = 18,
+ CONSTANT_BUFFER_11 = 19,
+ CONSTANT_BUFFER_12 = 20,
+ CONSTANT_BUFFER_13 = 21,
+ CONSTANT_BUFFER_14 = 22,
+ CONSTANT_BUFFER_15 = 23,
+
+ // Some places use this if the address space can't be determined.
+ UNKNOWN_ADDRESS_SPACE = ~0u // All bits set.
+};
+
+} // namespace AMDGPUAS
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
new file mode 100644
index 000000000000..0b2badff7ccf
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -0,0 +1,530 @@
+//===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===------------------------------------------------------------===//
+// Subtarget Features (device properties)
+//===------------------------------------------------------------===//
+
+def FeatureFP64 : SubtargetFeature<"fp64",
+ "FP64",
+ "true",
+ "Enable double precision operations"
+>;
+
+def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
+ "FastFMAF32",
+ "true",
+ "Assuming f32 fma is at least as fast as mul + add"
+>;
+
+def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
+ "HalfRate64Ops",
+ "true",
+ "Most fp64 instructions are half rate instead of quarter"
+>;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+ "R600ALUInst",
+ "false",
+ "Older version of ALU instructions encoding"
+>;
+
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+ "HasVertexCache",
+ "true",
+ "Specify use of dedicated vertex cache"
+>;
+
+def FeatureCaymanISA : SubtargetFeature<"caymanISA",
+ "CaymanISA",
+ "true",
+ "Use Cayman ISA"
+>;
+
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+ "CFALUBug",
+ "true",
+ "GPU has CF_ALU bug"
+>;
+
+def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
+ "FlatAddressSpace",
+ "true",
+ "Support flat address space"
+>;
+
+def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
+ "UnalignedBufferAccess",
+ "true",
+ "Support unaligned global loads and stores"
+>;
+
+def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
+ "UnalignedScratchAccess",
+ "true",
+ "Support unaligned scratch loads and stores"
+>;
+
+// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
+// XNACK. The current default kernel driver setting is:
+// - graphics ring: XNACK disabled
+// - compute ring: XNACK enabled
+//
+// If XNACK is enabled, the VMEM latency can be worse.
+// If XNACK is disabled, the 2 SGPRs can be used for general purposes.
+def FeatureXNACK : SubtargetFeature<"xnack",
+ "EnableXNACK",
+ "true",
+ "Enable XNACK support"
+>;
+
+// Feature string "sgpr-init-bug": sets the subtarget's SGPRInitBug field to
+// true. The last string is the human-readable description of the feature.
+def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
+ "SGPRInitBug",
+ "true",
+ "VI SGPR initialization bug requiring a fixed SGPR allocation size"
+>;
+
+class SubtargetFeatureFetchLimit <string Value> :
+ SubtargetFeature <"fetch"#Value,
+ "TexVTXClauseSize",
+ Value,
+ "Limit the maximum number of fetches in a clause to "#Value
+>;
+
+def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
+def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+ "wavefrontsize"#Value,
+ "WavefrontSize",
+ !cast<string>(Value),
+ "The number of threads per wavefront"
+>;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
+class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
+ "ldsbankcount"#Value,
+ "LDSBankCount",
+ !cast<string>(Value),
+ "The number of LDS banks per compute unit."
+>;
+
+def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
+def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
+
+class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
+ "localmemorysize"#Value,
+ "LocalMemorySize",
+ !cast<string>(Value),
+ "The size of local memory in bytes"
+>;
+
+def FeatureGCN : SubtargetFeature<"gcn",
+ "IsGCN",
+ "true",
+ "GCN or newer GPU"
+>;
+
+def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding",
+ "GCN1Encoding",
+ "true",
+ "Encoding format for SI and CI"
+>;
+
+def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
+ "GCN3Encoding",
+ "true",
+ "Encoding format for VI"
+>;
+
+// Feature string "ci-insts": sets the subtarget's CIInsts field to true. The
+// last string is the human-readable description of the feature.
+def FeatureCIInsts : SubtargetFeature<"ci-insts",
+ "CIInsts",
+ "true",
+ "Additional instructions for CI+"
+>;
+
+def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
+ "HasSMemRealTime",
+ "true",
+ "Has s_memrealtime instruction"
+>;
+
+def FeatureInv2PiInlineImm : SubtargetFeature<"inv-2pi-inline-imm",
+ "HasInv2PiInlineImm",
+ "true",
+ "Has 1 / (2 * pi) as inline immediate"
+>;
+
+def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
+ "Has16BitInsts",
+ "true",
+ "Has i16/f16 instructions"
+>;
+
+def FeatureMovrel : SubtargetFeature<"movrel",
+ "HasMovrel",
+ "true",
+ "Has v_movrel*_b32 instructions"
+>;
+
+def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode",
+ "HasVGPRIndexMode",
+ "true",
+ "Has VGPR mode register indexing"
+>;
+
+def FeatureScalarStores : SubtargetFeature<"scalar-stores",
+ "HasScalarStores",
+ "true",
+ "Has store scalar memory instructions"
+>;
+
+//===------------------------------------------------------------===//
+// Subtarget Features (options and debugging)
+//===------------------------------------------------------------===//
+
+def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
+ "FP16Denormals",
+ "true",
+ "Enable half precision denormal handling"
+>;
+
+// Some instructions do not support denormals despite this flag. Using
+// fp32 denormals also causes instructions to run at the double
+// precision rate for the device.
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
+ "FP32Denormals",
+ "true",
+ "Enable single precision denormal handling"
+>;
+
+def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
+ "FP64Denormals",
+ "true",
+ "Enable double precision denormal handling",
+ [FeatureFP64]
+>;
+
+def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
+ "FPExceptions",
+ "true",
+ "Enable floating point exceptions"
+>;
+
+class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
+ "max-private-element-size-"#size,
+ "MaxPrivateElementSize",
+ !cast<string>(size),
+ "Maximum private access size may be "#size
+>;
+
+def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
+def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
+def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
+
+def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
+ "EnableVGPRSpilling",
+ "true",
+ "Enable spilling of VGPRs to scratch memory"
+>;
+
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+ "DumpCode",
+ "true",
+ "Dump MachineInstrs in the CodeEmitter"
+>;
+
+def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
+ "DumpCode",
+ "true",
+ "Dump MachineInstrs in the CodeEmitter"
+>;
+
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+ "EnablePromoteAlloca",
+ "true",
+ "Enable promote alloca pass"
+>;
+
+// XXX - This should probably be removed once enabled by default
+def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
+ "EnableLoadStoreOpt",
+ "true",
+ "Enable SI load/store optimizer pass"
+>;
+
+// Performance debugging feature. Allow using DS instruction immediate
+// offsets even if the base pointer can't be proven to be base. On SI,
+// base pointer values that won't give the same result as a 16-bit add
+// are not safe to fold, but this will override the conservative test
+// for the base pointer.
+def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <
+ "unsafe-ds-offset-folding",
+ "EnableUnsafeDSOffsetFolding",
+ "true",
+ "Force using DS instruction immediate offsets on SI"
+>;
+
+def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
+ "EnableSIScheduler",
+ "true",
+ "Enable SI Machine Scheduler"
+>;
+
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+ "FlatForGlobal",
+ "true",
+ "Force to generate flat instruction for global"
+>;
+
+// Dummy feature used to disable assembler instructions.
+def FeatureDisable : SubtargetFeature<"",
+ "FeatureDisable","true",
+ "Dummy feature to disable assembler instructions"
+>;
+
+class SubtargetFeatureGeneration <string Value,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
+ Value#" GPU generation", Implies>;
+
+def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
+def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
+def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
+
+def FeatureR600 : SubtargetFeatureGeneration<"R600",
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
+>;
+
+def FeatureR700 : SubtargetFeatureGeneration<"R700",
+ [FeatureFetchLimit16, FeatureLocalMemorySize0]
+>;
+
+def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
+ [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+>;
+
+def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+ [FeatureFetchLimit16, FeatureWavefrontSize64,
+ FeatureLocalMemorySize32768]
+>;
+
+def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize32768,
+ FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
+ FeatureLDSBankCount32, FeatureMovrel]
+>;
+
+def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
+ FeatureGCN1Encoding, FeatureCIInsts, FeatureMovrel]
+>;
+
+def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
+ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+ FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
+ FeatureScalarStores, FeatureInv2PiInlineImm
+ ]
+>;
+
+class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
+ list<SubtargetFeature> Implies>
+ : SubtargetFeature <
+ "isaver"#Major#"."#Minor#"."#Stepping,
+ "IsaVersion",
+ "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
+ "Instruction set version number",
+ Implies
+>;
+
+def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
+ [FeatureSeaIslands,
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
+ [FeatureSeaIslands,
+ HalfRate64Ops,
+ FeatureLDSBankCount32,
+ FeatureFastFMAF32]>;
+
+def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
+ [FeatureSeaIslands,
+ FeatureLDSBankCount16]>;
+
+def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32,
+ FeatureSGPRInitBug]>;
+
+def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32,
+ FeatureXNACK]>;
+
+def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32,
+ FeatureSGPRInitBug]>;
+
+def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion8_0_4 : SubtargetFeatureISAVersion <8,0,4,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount16,
+ FeatureXNACK]>;
+
+//===----------------------------------------------------------------------===//
+// Debugger related subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureDebuggerInsertNops : SubtargetFeature<
+ "amdgpu-debugger-insert-nops",
+ "DebuggerInsertNops",
+ "true",
+ "Insert one nop instruction for each high level source statement"
+>;
+
+def FeatureDebuggerReserveRegs : SubtargetFeature<
+ "amdgpu-debugger-reserve-regs",
+ "DebuggerReserveRegs",
+ "true",
+ "Reserve registers for debugger usage"
+>;
+
+def FeatureDebuggerEmitPrologue : SubtargetFeature<
+ "amdgpu-debugger-emit-prologue",
+ "DebuggerEmitPrologue",
+ "true",
+ "Emit debugger prologue"
+>;
+
+//===----------------------------------------------------------------------===//
+
+def AMDGPUInstrInfo : InstrInfo {
+ let guessInstructionProperties = 1;
+ let noNamedPositionallyEncodedOperands = 1;
+}
+
+def AMDGPUAsmParser : AsmParser {
+ // Some of the R600 registers have the same name, so this crashes.
+ // For example T0_XYZW and T0_XY both have the asm name T0.
+ let ShouldEmitMatchRegisterName = 0;
+}
+
+def AMDGPUAsmWriter : AsmWriter {
+ int PassSubtarget = 1;
+}
+
+def AMDGPUAsmVariants {
+ string Default = "Default";
+ int Default_ID = 0;
+ string VOP3 = "VOP3";
+ int VOP3_ID = 1;
+ string SDWA = "SDWA";
+ int SDWA_ID = 2;
+ string DPP = "DPP";
+ int DPP_ID = 3;
+ string Disable = "Disable";
+ int Disable_ID = 4;
+}
+
+def DefaultAMDGPUAsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.Default_ID;
+ let Name = AMDGPUAsmVariants.Default;
+}
+
+def VOP3AsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.VOP3_ID;
+ let Name = AMDGPUAsmVariants.VOP3;
+}
+
+def SDWAAsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.SDWA_ID;
+ let Name = AMDGPUAsmVariants.SDWA;
+}
+
+def DPPAsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.DPP_ID;
+ let Name = AMDGPUAsmVariants.DPP;
+}
+
+def AMDGPU : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = AMDGPUInstrInfo;
+ let AssemblyParsers = [AMDGPUAsmParser];
+ let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant,
+ VOP3AsmParserVariant,
+ SDWAAsmParserVariant,
+ DPPAsmParserVariant];
+ let AssemblyWriters = [AMDGPUAsmWriter];
+}
+
+// Dummy Instruction itineraries for pseudo instructions
+def ALU_NULL : FuncUnit;
+def NullALU : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Predicate helper class
+//===----------------------------------------------------------------------===//
+
+def TruePredicate : Predicate<"true">;
+
+def isSICI : Predicate<
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
+>, AssemblerPredicate<"FeatureGCN1Encoding">;
+
+def isVI : Predicate <
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+ AssemblerPredicate<"FeatureGCN3Encoding">;
+
+def isCIVI : Predicate <
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>, AssemblerPredicate<"FeatureCIInsts">;
+
+def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
+
+def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">;
+
+class PredicateControl {
+ Predicate SubtargetPredicate;
+ Predicate SIAssemblerPredicate = isSICI;
+ Predicate VIAssemblerPredicate = isVI;
+ list<Predicate> AssemblerPredicates = [];
+ Predicate AssemblerPredicate = TruePredicate;
+ list<Predicate> OtherPredicates = [];
+ list<Predicate> Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate],
+ AssemblerPredicates,
+ OtherPredicates);
+}
+
+// Include AMDGPU TD files
+include "R600Schedule.td"
+include "SISchedule.td"
+include "Processors.td"
+include "AMDGPUInstrInfo.td"
+include "AMDGPUIntrinsics.td"
+include "AMDGPURegisterInfo.td"
+include "AMDGPUInstructions.td"
+include "AMDGPUCallingConv.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
new file mode 100644
index 000000000000..067a16a2af7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPUAlwaysInlinePass.cpp - Mark functions always_inline ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass marks all internal functions as always_inline and creates
+/// duplicates of all other functions and marks the duplicates as always_inline.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAlwaysInline : public ModulePass {
+ static char ID;
+
+public:
+ AMDGPUAlwaysInline() : ModulePass(ID) { }
+ bool runOnModule(Module &M) override;
+ StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
+};
+
+} // End anonymous namespace
+
+char AMDGPUAlwaysInline::ID = 0;
+
+/// Rewrites \p M so that every function eligible for inlining is local and
+/// marked always_inline:
+///  1. uses of aliases whose aliasee is a Function are redirected to the
+///     aliasee and the alias is erased;
+///  2. each used, defined, non-local function without the noinline attribute
+///     is cloned with internal linkage and its uses are moved to the clone;
+///  3. every local function without noinline gets the always_inline attribute.
+///
+/// \returns true if the module was modified.
+bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+  std::vector<GlobalAlias *> AliasesToRemove;
+  std::vector<Function *> FuncsToClone;
+  bool Changed = false;
+
+  for (GlobalAlias &A : M.aliases()) {
+    if (Function *F = dyn_cast<Function>(A.getAliasee())) {
+      A.replaceAllUsesWith(F);
+      AliasesToRemove.push_back(&A);
+    }
+  }
+
+  // Erase in a second step so the alias list is not mutated while it is being
+  // iterated.
+  for (GlobalAlias *A : AliasesToRemove) {
+    A->eraseFromParent();
+    Changed = true;
+  }
+
+  // Candidates are collected before cloning; presumably because cloning adds
+  // functions to the module that is being iterated.
+  for (Function &F : M) {
+    if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
+        !F.hasFnAttribute(Attribute::NoInline))
+      FuncsToClone.push_back(&F);
+  }
+
+  for (Function *F : FuncsToClone) {
+    ValueToValueMapTy VMap;
+    Function *NewFunc = CloneFunction(F, VMap);
+    NewFunc->setLinkage(GlobalValue::InternalLinkage);
+    F->replaceAllUsesWith(NewFunc);
+    Changed = true;
+  }
+
+  for (Function &F : M) {
+    // Functions that already carry always_inline are skipped so that Changed
+    // is only set when an attribute is really added.
+    if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline) &&
+        !F.hasFnAttribute(Attribute::AlwaysInline)) {
+      F.addFnAttr(Attribute::AlwaysInline);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+ModulePass *llvm::createAMDGPUAlwaysInlinePass() {
+ return new AMDGPUAlwaysInline();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
new file mode 100644
index 000000000000..c98d25e20185
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -0,0 +1,222 @@
+//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass adds target attributes to functions which use intrinsics
+/// which will impact calling convention lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateKernelFeatures : public ModulePass {
+private:
+ static bool hasAddrSpaceCast(const Function &F);
+
+ void addAttrToCallers(Function *Intrin, StringRef AttrName);
+ bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+
+public:
+ static char ID;
+
+ AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
+ bool runOnModule(Module &M) override;
+ StringRef getPassName() const override {
+ return "AMDGPU Annotate Kernel Features";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ ModulePass::getAnalysisUsage(AU);
+ }
+
+ static bool visitConstantExpr(const ConstantExpr *CE);
+ static bool visitConstantExprsRecursively(
+ const Constant *EntryC,
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
+};
+
+}
+
+char AMDGPUAnnotateKernelFeatures::ID = 0;
+
+char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
+
+INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+ "Add AMDGPU function attributes", false, false)
+
+
+// Only a cast *to* flat needs the queue ptr, not a cast from it; this is
+// recognised by a source address space that is local or private.
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+  if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
+    return true;
+  return SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
+// Overload for a cast instruction; only its source address space matters.
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
+  const unsigned SrcAS = ASC->getSrcAddressSpace();
+  return castRequiresQueuePtr(SrcAS);
+}
+
+// Returns true if \p CE is an addrspacecast expression whose source address
+// space requires the queue ptr.
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
+  if (CE->getOpcode() != Instruction::AddrSpaceCast)
+    return false;
+
+  const Type *SrcTy = CE->getOperand(0)->getType();
+  return castRequiresQueuePtr(SrcTy->getPointerAddressSpace());
+}
+
+// Walks \p EntryC and, transitively, all of its constant operands, and returns
+// true as soon as a constant expression accepted by visitConstantExpr() is
+// found. \p ConstantExprVisited records every constant that has been queued so
+// that no constant is examined twice.
+bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
+    const Constant *EntryC,
+    SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
+
+  // The entry constant was already handled earlier.
+  if (!ConstantExprVisited.insert(EntryC).second)
+    return false;
+
+  SmallVector<const Constant *, 16> Worklist;
+  Worklist.push_back(EntryC);
+
+  while (!Worklist.empty()) {
+    const Constant *Cur = Worklist.pop_back_val();
+
+    // Test the constant itself if it is an expression.
+    const auto *CE = dyn_cast<ConstantExpr>(Cur);
+    if (CE && visitConstantExpr(CE))
+      return true;
+
+    // Queue every constant operand that has not been seen yet.
+    for (const Use &Op : Cur->operands()) {
+      const auto *OpC = dyn_cast<Constant>(Op);
+      if (OpC && ConstantExprVisited.insert(OpC).second)
+        Worklist.push_back(OpC);
+    }
+  }
+
+  return false;
+}
+
+// Returns true if \p F uses a cast that requires the queue ptr, either as an
+// addrspacecast instruction or as a constant expression reachable from an
+// instruction operand.
+bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+  // One set for the whole function, so a constant reached from several
+  // operands is only examined once.
+  SmallPtrSet<const Constant *, 8> Visited;
+
+  for (const BasicBlock &BB : F) {
+    for (const Instruction &Inst : BB) {
+      const auto *ASC = dyn_cast<AddrSpaceCastInst>(&Inst);
+      if (ASC && castRequiresQueuePtr(ASC))
+        return true;
+
+      for (const Use &Op : Inst.operands()) {
+        const auto *OpC = dyn_cast<Constant>(Op);
+        if (OpC && visitConstantExprsRecursively(OpC, Visited))
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// Adds the string attribute \p AttrName to every function that contains a
+// call to \p Intrin.
+void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
+                                                    StringRef AttrName) {
+  // Functions that were already annotated, so a function with several calls
+  // is only touched once.
+  SmallPtrSet<Function *, 4> Annotated;
+
+  for (User *IntrinUser : Intrin->users()) {
+    // CallInst is the only valid user for an intrinsic.
+    Function *Caller = cast<CallInst>(IntrinUser)->getParent()->getParent();
+    if (!Annotated.insert(Caller).second)
+      continue;
+    Caller->addFnAttr(AttrName);
+  }
+}
+
+// For each { intrinsic name, attribute name } pair whose intrinsic exists in
+// \p M, tags the callers of that intrinsic with the attribute. Returns true if
+// at least one of the listed intrinsics was found in the module.
+bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
+    Module &M,
+    ArrayRef<StringRef[2]> IntrinsicToAttr) {
+  bool FoundAny = false;
+
+  for (const StringRef *Entry : IntrinsicToAttr) {
+    Function *Intrin = M.getFunction(Entry[0]);
+    if (!Intrin)
+      continue;
+
+    addAttrToCallers(Intrin, Entry[1]);
+    FoundAny = true;
+  }
+
+  return FoundAny;
+}
+
+/// Adds the "amdgpu-*" function attributes that record which implicit inputs
+/// the functions of \p M use. Callers of the work-item / work-group id
+/// intrinsics (y and z only) are tagged for every OS. For AMDHSA and Mesa3D,
+/// callers of the dispatch-ptr, queue-ptr and dispatch-id intrinsics are tagged
+/// as well, and so is every function containing an addrspacecast that needs
+/// the queue ptr.
+///
+/// \returns true if a listed intrinsic was found in the module or a queue-ptr
+/// attribute was added because of an addrspacecast.
+bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+  Triple TT(M.getTargetTriple());
+
+  static const StringRef IntrinsicToAttr[][2] = {
+    // .x omitted
+    { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
+    { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
+
+    { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
+    { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
+
+    { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
+    { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
+
+    // .x omitted
+    { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
+    { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
+  };
+
+  static const StringRef HSAIntrinsicToAttr[][2] = {
+    { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
+    { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
+    { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }
+  };
+
+  // TODO: We should not add the attributes if the known compile time workgroup
+  // size is 1 for y/z.
+
+  // TODO: Intrinsics that require queue ptr.
+
+  // We do not need to note the x workitem or workgroup id because they are
+  // always initialized.
+
+  bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
+  if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) {
+    Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+
+    for (Function &F : M) {
+      if (F.hasFnAttribute("amdgpu-queue-ptr"))
+        continue;
+
+      if (hasAddrSpaceCast(F)) {
+        F.addFnAttr("amdgpu-queue-ptr");
+        // Adding the attribute modifies the IR, so it has to be reported.
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+ return new AMDGPUAnnotateKernelFeatures();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
new file mode 100644
index 000000000000..c011be6fa169
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -0,0 +1,189 @@
+//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass adds amdgpu.uniform metadata to IR values so this information
+/// can be used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-uniform"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateUniformValues : public FunctionPass,
+ public InstVisitor<AMDGPUAnnotateUniformValues> {
+ DivergenceAnalysis *DA;
+ MemoryDependenceResults *MDR;
+ LoopInfo *LI;
+ DenseMap<Value*, GetElementPtrInst*> noClobberClones;
+ bool isKernelFunc;
+
+public:
+ static char ID;
+ AMDGPUAnnotateUniformValues() :
+ FunctionPass(ID) { }
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ StringRef getPassName() const override {
+ return "AMDGPU Annotate Uniform Values";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DivergenceAnalysis>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesAll();
+ }
+
+ void visitBranchInst(BranchInst &I);
+ void visitLoadInst(LoadInst &I);
+ bool isClobberedInFunction(LoadInst * Load);
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+ "Add AMDGPU uniform metadata", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+ "Add AMDGPU uniform metadata", false, false)
+
+char AMDGPUAnnotateUniformValues::ID = 0;
+
+static void setUniformMetadata(Instruction *I) {
+ I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
+}
+static void setNoClobberMetadata(Instruction *I) {
+ I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
+}
+
+// Adds to \p Set every block from which \p Root can be reached by following
+// predecessor edges. A block that is already in the set is not descended into
+// again.
+static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
+  for (BasicBlock *Pred : predecessors(Root)) {
+    if (!Set.insert(Pred))
+      continue;
+    DFS(Pred, Set);
+  }
+}
+
+// Returns true if memory dependence analysis reports a clobber of the location
+// read by \p Load in any of the blocks collected below.
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
+  // The set of blocks to query is built as follows:
+  //  - begin with the block containing the load;
+  //  - if that block is inside a loop, add all blocks of the outermost
+  //    enclosing loop and continue from that loop's header instead;
+  //  - finally add everything found by walking predecessor edges backwards.
+  BasicBlock *const LoadBB = Load->getParent();
+  SetVector<BasicBlock *> Checklist;
+  Checklist.insert(LoadBB);
+
+  BasicBlock *Start = LoadBB;
+  if (const Loop *Outermost = LI->getLoopFor(LoadBB)) {
+    while (const Loop *Parent = Outermost->getParentLoop())
+      Outermost = Parent;
+    Checklist.insert(Outermost->block_begin(), Outermost->block_end());
+    Start = Outermost->getHeader();
+  }
+  DFS(Start, Checklist);
+
+  const Value *Ptr = Load->getPointerOperand();
+  for (BasicBlock *BB : Checklist) {
+    // In the load's own block the query starts at the load itself; in every
+    // other block it starts at the block's end.
+    BasicBlock::iterator ScanFrom = BB->end();
+    if (BB == LoadBB)
+      ScanFrom = BasicBlock::iterator(Load);
+    if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr), true, ScanFrom, BB,
+                                      Load).isClobber())
+      return true;
+  }
+  return false;
+}
+
+// Tags the terminator of the branch's block with amdgpu.uniform when the
+// branch is conditional and divergence analysis reports its condition as
+// uniform.
+void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
+  const bool HasUniformCond =
+      I.isConditional() && DA->isUniform(I.getCondition());
+  if (HasUniformCond)
+    setUniformMetadata(I.getParent()->getTerminator());
+}
+
+// Annotates the pointer of a load from a uniform address. If the pointer is an
+// instruction it receives amdgpu.uniform and, when the load sits in a kernel
+// and is not clobbered within the function, amdgpu.noclobber as well.
+//
+// The metadata is attached to an instruction. For a not-clobbered global load
+// whose pointer is an Argument or a GlobalValue, a GEP of that pointer with a
+// single zero index is therefore created at function entry, and the load is
+// rewritten to go through it. One such GEP is kept per pointer in
+// noClobberClones and reused by later loads of the same pointer.
+void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
+  Value *Ptr = I.getPointerOperand();
+  if (!DA->isUniform(Ptr))
+    return;
+  auto isGlobalLoad = [](LoadInst &Load)->bool {
+    return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+  };
+  // Clobbers are only tracked up to the function boundary, because a
+  // FunctionPass cannot look beyond it. Not-clobbered memory can therefore
+  // only be guaranteed for memory operations that live in a kernel.
+  bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I);
+  Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+  if (!PtrI && NotClobbered && isGlobalLoad(I)) {
+    if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
+      // Look up the existing GEP for this pointer; the map slot is created on
+      // first use and filled right below, so every later load reuses it.
+      GetElementPtrInst *&Clone = noClobberClones[Ptr];
+      if (!Clone) {
+        // Create GEP of the Value
+        Function *F = I.getParent()->getParent();
+        // NOTE(review): the type argument is i32 while the APInt is 64 bits
+        // wide - confirm which index width is intended.
+        Value *Idx = Constant::getIntegerValue(
+            Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
+        // Insert GEP at the entry to make it dominate all uses
+        Clone = GetElementPtrInst::Create(
+            Ptr->getType()->getPointerElementType(), Ptr,
+            ArrayRef<Value*>(Idx), Twine(""),
+            F->getEntryBlock().getFirstNonPHI());
+      }
+      PtrI = Clone;
+      I.replaceUsesOfWith(Ptr, PtrI);
+    }
+  }
+
+  if (PtrI) {
+    setUniformMetadata(PtrI);
+    if (NotClobbered)
+      setNoClobberMetadata(PtrI);
+  }
+}
+
+bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
+ return false;
+}
+
+bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ DA = &getAnalysis<DivergenceAnalysis>();
+ MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+
+ visit(F);
+ noClobberClones.clear();
+ return true;
+}
+
+FunctionPass *
+llvm::createAMDGPUAnnotateUniformValues() {
+ return new AMDGPUAnnotateUniformValues();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
new file mode 100644
index 000000000000..a8e6902c252b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -0,0 +1,826 @@
+//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
+/// code. When passed an MCAsmStreamer it prints assembly and when passed
+/// an MCObjectStreamer it outputs binary code.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUAsmPrinter.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "AMDGPU.h"
+#include "AMDKernelCodeT.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+// TODO: This should get the default rounding mode from the kernel. We just set
+// the default here, but this could change if the OpenCL rounding mode pragmas
+// are used.
+//
+// The denormal mode here should match what is reported by the OpenCL runtime
+// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
+// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
+//
+// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
+// precision, and leaves single precision to flush all and does not report
+// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
+// CL_FP_DENORM for both.
+//
+// FIXME: It seems some instructions do not support single precision denormals
+// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
+// and sin_f32, cos_f32 on most parts).
+
+// We want to use these instructions, and using fp32 denormals also causes
+// instructions to run at the double precision rate for the device so it's
+// probably best to just report no single precision denormals.
+static uint32_t getFPMode(const MachineFunction &F) {
+  const SISubtarget &ST = F.getSubtarget<SISubtarget>();
+  // TODO: Is there any real use for the flush in only / flush out only modes?
+
+  // Denormals are either kept entirely or flushed on both input and output,
+  // depending on what the subtarget reports for the precision.
+  auto denormMode = [](bool HasDenormals) -> uint32_t {
+    if (HasDenormals)
+      return FP_DENORM_FLUSH_NONE;
+    return FP_DENORM_FLUSH_IN_FLUSH_OUT;
+  };
+
+  const uint32_t SPDenormals = denormMode(ST.hasFP32Denormals());
+  const uint32_t DPDenormals = denormMode(ST.hasFP64Denormals());
+
+  // Both precisions always round to nearest.
+  uint32_t Mode = FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST);
+  Mode |= FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST);
+  Mode |= FP_DENORM_MODE_SP(SPDenormals);
+  Mode |= FP_DENORM_MODE_DP(DPDenormals);
+  return Mode;
+}
+
+static AsmPrinter *
+createAMDGPUAsmPrinterPass(TargetMachine &tm,
+ std::unique_ptr<MCStreamer> &&Streamer) {
+ return new AMDGPUAsmPrinter(tm, std::move(Streamer));
+}
+
+extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
+ TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
+ createAMDGPUAsmPrinterPass);
+ TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
+ createAMDGPUAsmPrinterPass);
+}
+
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+StringRef AMDGPUAsmPrinter::getPassName() const {
+ return "AMDGPU Assembly Printer";
+}
+
+void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ return;
+
+ // Need to construct an MCSubtargetInfo here in case we have no functions
+ // in the module.
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+ TM.getTargetTriple().str(), TM.getTargetCPU(),
+ TM.getTargetFeatureString()));
+
+ AMDGPUTargetStreamer *TS =
+ static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+ TS->EmitDirectiveHSACodeObjectVersion(2, 1);
+
+ AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
+ TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
+ "AMD", "AMDGPU");
+
+ // Emit runtime metadata.
+ TS->EmitRuntimeMetadata(M);
+}
+
+// Refines the generic AsmPrinter answer: a non-empty block that ends in
+// S_SETPC_B64 is never reported as reachable by fallthrough only.
+bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
+    const MachineBasicBlock *MBB) const {
+  const bool GenericAnswer =
+      AsmPrinter::isBlockOnlyReachableByFallthrough(MBB);
+  if (!GenericAnswer)
+    return false;
+
+  // If this is a block implementing a long branch, an expression relative to
+  // the start of the block is needed.
+  // XXX - Is there a smarter way to check this?
+  return MBB->empty() || MBB->back().getOpcode() != AMDGPU::S_SETPC_B64;
+}
+
+
+// For subtargets using AMD code object v2, gathers the SI program info of the
+// current function and hands it to EmitAmdKernelCodeT(). Does nothing
+// otherwise.
+void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
+  if (!MF->getSubtarget<AMDGPUSubtarget>().isAmdCodeObjectV2())
+    return;
+
+  SIProgramInfo ProgInfo;
+  getSIProgramInfo(ProgInfo, *MF);
+  EmitAmdKernelCodeT(*MF, ProgInfo);
+}
+
+/// Emits the entry label of the current function. For a kernel on a subtarget
+/// using AMD code object v2, the function symbol is first given the
+/// STT_AMDGPU_HSA_KERNEL symbol type through the target streamer.
+void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
+  if (MFI->isKernel() && STM.isAmdCodeObjectV2()) {
+    AMDGPUTargetStreamer *TS =
+        static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+    SmallString<128> SymbolName;
+    // Build the prefixed symbol name as a statement of its own, then tag the
+    // symbol.
+    getNameWithPrefix(SymbolName, MF->getFunction());
+    TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
+  }
+
+  AsmPrinter::EmitFunctionEntryLabel();
+}
+
+// Forwards \p GV to the generic AsmPrinter unless it is a group segment
+// variable.
+void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  // Group segment variables aren't emitted in HSA.
+  if (!AMDGPU::isGroupSegment(GV))
+    AsmPrinter::EmitGlobalVariable(GV);
+}
+
+bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+
+ // The starting address of all shader programs must be 256 bytes aligned.
+ MF.setAlignment(8);
+
+ SetupMachineFunction(MF);
+
+ MCContext &Context = getObjFileLowering().getContext();
+ MCSectionELF *ConfigSection =
+ Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(ConfigSection);
+
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ SIProgramInfo KernelInfo;
+ if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ getSIProgramInfo(KernelInfo, MF);
+ if (!STM.isAmdHsaOS()) {
+ EmitProgramInfoSI(MF, KernelInfo);
+ }
+ } else {
+ EmitProgramInfoR600(MF);
+ }
+
+ DisasmLines.clear();
+ HexLines.clear();
+ DisasmLineMaxLen = 0;
+
+ EmitFunctionBody();
+
+ if (isVerbose()) {
+ MCSectionELF *CommentSection =
+ Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(CommentSection);
+
+ if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ OutStreamer->emitRawComment(" Kernel info:", false);
+ OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
+ false);
+ OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
+ false);
+ OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
+ false);
+ OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
+ false);
+ OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
+ false);
+ OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
+ false);
+ OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
+ " bytes/workgroup (compile time only)", false);
+
+ OutStreamer->emitRawComment(" SGPRBlocks: " +
+ Twine(KernelInfo.SGPRBlocks), false);
+ OutStreamer->emitRawComment(" VGPRBlocks: " +
+ Twine(KernelInfo.VGPRBlocks), false);
+
+ OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
+ Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
+ OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
+ Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
+
+ OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
+ false);
+ OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
+ false);
+
+ if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
+ OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+ Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+ OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
+ Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+ }
+
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
+ Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+ Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+ Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+ Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+ Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
+ false);
+
+ } else {
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ OutStreamer->emitRawComment(
+ Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
+ }
+ }
+
+ if (STM.dumpCode()) {
+
+ OutStreamer->SwitchSection(
+ Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
+
+ for (size_t i = 0; i < DisasmLines.size(); ++i) {
+ std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+ Comment += " ; " + HexLines[i] + "\n";
+
+ OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
+ OutStreamer->EmitBytes(StringRef(Comment));
+ }
+ }
+
+ return false;
+}
+
+/// Emit the R600-family program info as 4-byte (register, value) pairs: the
+/// SQ_PGM_RESOURCES_* register selected by generation and calling convention,
+/// DB_SHADER_CONTROL, and -- for compute calling conventions only --
+/// SQ_LDS_ALLOC.
+void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
+  const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
+  const R600RegisterInfo *RI = STM.getRegisterInfo();
+  const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+  const CallingConv::ID CC = MF.getFunction()->getCallingConv();
+
+  // Walk every register operand to find the highest GPR encoding in use and
+  // to note whether the function contains a KILLGT instruction.
+  unsigned HighestGPR = 0;
+  bool HasKill = false;
+  for (const MachineBasicBlock &Block : MF) {
+    for (const MachineInstr &Inst : Block) {
+      if (Inst.getOpcode() == AMDGPU::KILLGT)
+        HasKill = true;
+
+      for (unsigned Idx = 0, End = Inst.getNumOperands(); Idx != End; ++Idx) {
+        const MachineOperand &Op = Inst.getOperand(Idx);
+        if (!Op.isReg())
+          continue;
+
+        // Encodings above 127 do not name GPRs, so they are not counted.
+        const unsigned Encoding = RI->getEncodingValue(Op.getReg()) & 0xff;
+        if (Encoding <= 127)
+          HighestGPR = std::max(HighestGPR, Encoding);
+      }
+    }
+  }
+
+  unsigned RsrcReg;
+  if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
+    // Evergreen / Northern Islands. AMDGPU_CS and every calling convention
+    // not listed explicitly use the LS resources register.
+    switch (CC) {
+    case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+    case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+    case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+    default:                     RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+    }
+  } else {
+    // R600 / R700. Only pixel shaders get their own register; everything else
+    // uses the VS resources register.
+    if (CC == CallingConv::AMDGPU_PS)
+      RsrcReg = R_028850_SQ_PGM_RESOURCES_PS;
+    else
+      RsrcReg = R_028868_SQ_PGM_RESOURCES_VS;
+  }
+
+  OutStreamer->EmitIntValue(RsrcReg, 4);
+  // Register indices start at 0, hence the +1 to obtain a count.
+  OutStreamer->EmitIntValue(S_NUM_GPRS(HighestGPR + 1) |
+                            S_STACK_SIZE(MFI->CFStackSize), 4);
+  OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+  OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(HasKill), 4);
+
+  if (!AMDGPU::isCompute(CC))
+    return;
+
+  // The LDS size is rounded up to a multiple of 4 and emitted divided by 4.
+  OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
+  OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
+}
+
+/// Fill \p ProgInfo with the program information for \p MF.
+///
+/// Scans every non-debug instruction to find the highest SGPR and VGPR in use
+/// and whether VCC or FLAT_SCRATCH are referenced, then derives the register
+/// counts, register/LDS/scratch block counts and the COMPUTE_PGM_RSRC1 /
+/// COMPUTE_PGM_RSRC2 values from that. Resource-limit diagnostics are reported
+/// through the function's LLVMContext when the SGPR, user SGPR or local memory
+/// limits are exceeded.
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+ const MachineFunction &MF) const {
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ uint64_t CodeSize = 0;
+ unsigned MaxSGPR = 0;
+ unsigned MaxVGPR = 0;
+ bool VCCUsed = false;
+ bool FlatUsed = false;
+ const SIRegisterInfo *RI = STM.getRegisterInfo();
+ const SIInstrInfo *TII = STM.getInstrInfo();
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: CodeSize should account for multiple functions.
+
+ // TODO: Should we count size of debug info?
+ if (MI.isDebugValue())
+ continue;
+
+ // The code size is only accumulated in verbose mode.
+ if (isVerbose())
+ CodeSize += TII->getInstSizeInBytes(MI);
+
+ unsigned numOperands = MI.getNumOperands();
+ for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+ const MachineOperand &MO = MI.getOperand(op_idx);
+ unsigned width = 0;
+ bool isSGPR = false;
+
+ if (!MO.isReg())
+ continue;
+
+ // Special registers are handled here and never contribute to
+ // MaxSGPR / MaxVGPR.
+ unsigned reg = MO.getReg();
+ switch (reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::SCC:
+ case AMDGPU::M0:
+ continue;
+
+ case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
+ VCCUsed = true;
+ continue;
+
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+ // instructions aren't used to access the scratch buffer.
+ if (MFI->hasFlatScratchInit())
+ FlatUsed = true;
+ continue;
+
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ llvm_unreachable("trap handler registers should not be used");
+
+ default:
+ break;
+ }
+
+ // Classify the register: scalar vs. vector, and how many 32-bit
+ // registers it spans (width).
+ if (AMDGPU::SReg_32RegClass.contains(reg)) {
+ assert(!AMDGPU::TTMP_32RegClass.contains(reg) &&
+ "trap handler registers should not be used");
+ isSGPR = true;
+ width = 1;
+ } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 1;
+ } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
+ assert(!AMDGPU::TTMP_64RegClass.contains(reg) &&
+ "trap handler registers should not be used");
+ isSGPR = true;
+ width = 2;
+ } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 2;
+ } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 3;
+ } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
+ isSGPR = true;
+ width = 4;
+ } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 4;
+ } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
+ isSGPR = true;
+ width = 8;
+ } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 8;
+ } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
+ isSGPR = true;
+ width = 16;
+ } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 16;
+ } else {
+ llvm_unreachable("Unknown register class");
+ }
+ // Highest 32-bit register index covered by this operand.
+ unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
+ unsigned maxUsed = hwReg + width - 1;
+ if (isSGPR) {
+ MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
+ } else {
+ MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
+ }
+ }
+ }
+ }
+
+ // Number of SGPRs to add on top of the highest one explicitly used. Later
+ // assignments below override earlier ones, so the largest applicable value
+ // wins.
+ unsigned ExtraSGPRs = 0;
+
+ if (VCCUsed)
+ ExtraSGPRs = 2;
+
+ if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
+ if (FlatUsed)
+ ExtraSGPRs = 4;
+ } else {
+ if (STM.isXNACKEnabled())
+ ExtraSGPRs = 4;
+
+ if (FlatUsed)
+ ExtraSGPRs = 6;
+ }
+
+ // Record first reserved register and reserved register count fields, and
+ // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
+ // requested.
+ ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
+ ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM);
+
+ // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
+ // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
+ // attribute was requested.
+ if (STM.debuggerEmitPrologue()) {
+ ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
+ ProgInfo.DebuggerPrivateSegmentBufferSGPR =
+ RI->getHWRegIndex(MFI->getScratchRSrcReg());
+ }
+
+ // Check the addressable register limit before we add ExtraSGPRs.
+ if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ !STM.hasSGPRInitBug()) {
+ unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs();
+ if (MaxSGPR + 1 > MaxAddressableNumSGPRs) {
+ // This can happen due to a compiler bug or when using inline asm.
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
+ "addressable scalar registers",
+ MaxSGPR + 1, DS_Error,
+ DK_ResourceLimit, MaxAddressableNumSGPRs);
+ Ctx.diagnose(Diag);
+ // Clamp to the limit after reporting the error.
+ MaxSGPR = MaxAddressableNumSGPRs - 1;
+ }
+ }
+
+ // Account for extra SGPRs and VGPRs reserved for debugger use.
+ MaxSGPR += ExtraSGPRs;
+ MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM);
+
+ // We found the maximum register index. They start at 0, so add one to get the
+ // number of registers.
+ ProgInfo.NumVGPR = MaxVGPR + 1;
+ ProgInfo.NumSGPR = MaxSGPR + 1;
+
+ // Adjust number of registers used to meet default/requested minimum/maximum
+ // number of waves per execution unit request.
+ ProgInfo.NumSGPRsForWavesPerEU = std::max(
+ ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU()));
+ ProgInfo.NumVGPRsForWavesPerEU = std::max(
+ ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+
+ // On these subtargets the limit is checked after ExtraSGPRs were added.
+ if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
+ STM.hasSGPRInitBug()) {
+ unsigned MaxNumSGPRs = STM.getMaxNumSGPRs();
+ if (ProgInfo.NumSGPR > MaxNumSGPRs) {
+ // This can happen due to a compiler bug or when using inline asm to use the
+ // registers which are usually reserved for vcc etc.
+
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
+ "scalar registers",
+ ProgInfo.NumSGPR, DS_Error,
+ DK_ResourceLimit, MaxNumSGPRs);
+ Ctx.diagnose(Diag);
+ ProgInfo.NumSGPR = MaxNumSGPRs;
+ ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs;
+ }
+ }
+
+ // Subtargets with the SGPR init bug always report a fixed SGPR count.
+ if (STM.hasSGPRInitBug()) {
+ ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ }
+
+ if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
+ MFI->NumUserSGPRs, DS_Error);
+ Ctx.diagnose(Diag);
+ }
+
+ if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
+ MFI->getLDSSize(), DS_Error);
+ Ctx.diagnose(Diag);
+ }
+
+ // SGPRBlocks is actual number of SGPR blocks minus 1.
+ ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
+ RI->getSGPRAllocGranule());
+ ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1;
+
+ // VGPRBlocks is actual number of VGPR blocks minus 1.
+ ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
+ RI->getVGPRAllocGranule());
+ ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1;
+
+ // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
+ // register.
+ ProgInfo.FloatMode = getFPMode(MF);
+
+ ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
+
+ // Make the clamp modifier return 0 on NaN input.
+ ProgInfo.DX10Clamp = 1;
+
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ ProgInfo.ScratchSize = FrameInfo.getStackSize();
+
+ ProgInfo.FlatUsed = FlatUsed;
+ ProgInfo.VCCUsed = VCCUsed;
+ ProgInfo.CodeLen = CodeSize;
+
+ unsigned LDSAlignShift;
+ if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
+ // LDS is allocated in 64 dword blocks.
+ LDSAlignShift = 8;
+ } else {
+ // LDS is allocated in 128 dword blocks.
+ LDSAlignShift = 9;
+ }
+
+ unsigned LDSSpillSize =
+ MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize();
+
+ // Round the total LDS size up to a whole number of allocation blocks.
+ ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
+ ProgInfo.LDSBlocks =
+ alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
+
+ // Scratch is allocated in 256 dword blocks.
+ unsigned ScratchAlignShift = 10;
+ // We need to program the hardware with the amount of scratch memory that
+ // is used by the entire wave. ProgInfo.ScratchSize is the amount of
+ // scratch memory used per thread.
+ ProgInfo.ScratchBlocks =
+ alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
+ 1ULL << ScratchAlignShift) >>
+ ScratchAlignShift;
+
+ ProgInfo.ComputePGMRSrc1 =
+ S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
+ S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
+ S_00B848_PRIORITY(ProgInfo.Priority) |
+ S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+ S_00B848_PRIV(ProgInfo.Priv) |
+ S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
+ S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+
+ // 0 = X, 1 = XY, 2 = XYZ
+ unsigned TIDIGCompCnt = 0;
+ if (MFI->hasWorkItemIDZ())
+ TIDIGCompCnt = 2;
+ else if (MFI->hasWorkItemIDY())
+ TIDIGCompCnt = 1;
+
+ ProgInfo.ComputePGMRSrc2 =
+ S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+ S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+ S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
+ S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
+ S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
+ S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
+ S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
+ S_00B84C_EXCP_EN_MSB(0) |
+ S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+ S_00B84C_EXCP_EN(0);
+}
+
+/// Return the PGM_RSRC1 register identifier that matches \p CallConv. Any
+/// calling convention other than the GS/PS/VS shader conventions (including
+/// AMDGPU_CS) maps to the compute register.
+static unsigned getRsrcReg(CallingConv::ID CallConv) {
+  switch (CallConv) {
+  case CallingConv::AMDGPU_GS:
+    return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+  case CallingConv::AMDGPU_PS:
+    return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
+  case CallingConv::AMDGPU_VS:
+    return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+  default:
+    return R_00B848_COMPUTE_PGM_RSRC1;
+  }
+}
+
+/// Emit the SI program info as a sequence of 4-byte (register, value) pairs.
+///
+/// Compute calling conventions get COMPUTE_PGM_RSRC1/RSRC2 and
+/// COMPUTE_TMPRING_SIZE; all others get the shader RSRC1 register for the
+/// calling convention and, when VGPR spilling is enabled, SPI_TMPRING_SIZE.
+/// Pixel shaders additionally get RSRC2_PS and the PS input registers. The
+/// spilled SGPR/VGPR counts are always emitted last.
+void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
+                                         const SIProgramInfo &KernelInfo) {
+  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const CallingConv::ID CC = MF.getFunction()->getCallingConv();
+
+  // Every entry is a 4-byte register identifier followed by a 4-byte value.
+  auto EmitPair = [this](uint64_t Reg, uint64_t Value) {
+    OutStreamer->EmitIntValue(Reg, 4);
+    OutStreamer->EmitIntValue(Value, 4);
+  };
+
+  if (AMDGPU::isCompute(CC)) {
+    EmitPair(R_00B848_COMPUTE_PGM_RSRC1, KernelInfo.ComputePGMRSrc1);
+    EmitPair(R_00B84C_COMPUTE_PGM_RSRC2, KernelInfo.ComputePGMRSrc2);
+    EmitPair(R_00B860_COMPUTE_TMPRING_SIZE,
+             S_00B860_WAVESIZE(KernelInfo.ScratchBlocks));
+
+    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
+    // 0" comment but I don't see a corresponding field in the register spec.
+  } else {
+    EmitPair(getRsrcReg(CC), S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
+                             S_00B028_SGPRS(KernelInfo.SGPRBlocks));
+    if (STM.isVGPRSpillingEnabled(*MF.getFunction()))
+      EmitPair(R_0286E8_SPI_TMPRING_SIZE,
+               S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks));
+  }
+
+  if (CC == CallingConv::AMDGPU_PS) {
+    EmitPair(R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
+             S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks));
+    EmitPair(R_0286CC_SPI_PS_INPUT_ENA, MFI->PSInputEna);
+    EmitPair(R_0286D0_SPI_PS_INPUT_ADDR, MFI->getPSInputAddr());
+  }
+
+  EmitPair(R_SPILLED_SGPRS, MFI->getNumSpilledSGPRs());
+  EmitPair(R_SPILLED_VGPRS, MFI->getNumSpilledVGPRs());
+}
+
+// Translate a private element size in bytes (4, 8 or 16) into the matching
+// amd_element_byte_size_t enumerator. The encoded value is supposed to be
+// log2(Size). Any other size is a programming error.
+static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
+  if (Size == 4)
+    return AMD_ELEMENT_4_BYTES;
+  if (Size == 8)
+    return AMD_ELEMENT_8_BYTES;
+  if (Size == 16)
+    return AMD_ELEMENT_16_BYTES;
+
+  llvm_unreachable("invalid private_element_size");
+}
+
+/// Build the amd_kernel_code_t header for \p MF from \p KernelInfo, the
+/// machine function info and the subtarget, then switch to the text section
+/// and emit the header through the AMDGPU target streamer.
+void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
+                                         const SIProgramInfo &KernelInfo) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  amd_kernel_code_t header;
+
+  AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
+
+  // RSRC1 occupies the low 32 bits and RSRC2 the high 32 bits.
+  header.compute_pgm_resource_registers =
+      KernelInfo.ComputePGMRSrc1 |
+      (KernelInfo.ComputePGMRSrc2 << 32);
+  header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+  AMD_HSA_BITS_SET(header.code_properties,
+                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
+                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));
+
+  // Set one code property bit for every SGPR input the function requests.
+  if (MFI->hasPrivateSegmentBuffer()) {
+    header.code_properties |=
+        AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+  }
+
+  if (MFI->hasDispatchPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+  if (MFI->hasQueuePtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+
+  if (MFI->hasKernargSegmentPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+
+  if (MFI->hasDispatchID())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+
+  if (MFI->hasFlatScratchInit())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+  // TODO: Private segment size
+
+  if (MFI->hasGridWorkgroupCountX()) {
+    header.code_properties |=
+        AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+  }
+
+  if (MFI->hasGridWorkgroupCountY()) {
+    header.code_properties |=
+        AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+  }
+
+  if (MFI->hasGridWorkgroupCountZ()) {
+    header.code_properties |=
+        AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+  }
+
+  // Note: the dispatch pointer property is already handled above; a second,
+  // identical hasDispatchPtr() check that used to follow here was redundant.
+
+  if (STM.debuggerSupported())
+    header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
+
+  if (STM.isXNACKEnabled())
+    header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
+
+  // FIXME: Should use getKernArgSize
+  header.kernarg_segment_byte_size =
+      STM.getKernArgSegmentSize(MFI->getABIArgOffset());
+  header.wavefront_sgpr_count = KernelInfo.NumSGPR;
+  header.workitem_vgpr_count = KernelInfo.NumVGPR;
+  header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+  header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+  header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
+  header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+
+  // These alignment values are specified in powers of two, so alignment =
+  // 2^n. The minimum alignment is 2^4 = 16.
+  header.kernarg_segment_alignment = std::max((size_t)4,
+      countTrailingZeros(MFI->getMaxKernArgAlign()));
+
+  if (STM.debuggerEmitPrologue()) {
+    header.debug_wavefront_private_segment_offset_sgpr =
+        KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+    header.debug_private_segment_buffer_sgpr =
+        KernelInfo.DebuggerPrivateSegmentBufferSGPR;
+  }
+
+  AMDGPUTargetStreamer *TS =
+      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+  TS->EmitAMDKernelCodeT(header);
+}
+
+/// Print inline-asm operand \p OpNo of \p MI to \p O.
+///
+/// Only the 'r' modifier is handled here. Any other single-character modifier
+/// is forwarded to AsmPrinter::PrintAsmOperand, and multi-character modifiers
+/// are rejected.
+///
+/// \returns true on error (unknown modifier, or an operand that is not a
+/// register), false if the operand was printed.
+bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+      break;
+    }
+  }
+
+  // Only register operands can be printed here. Calling getReg() on any other
+  // operand kind is invalid, so report an error to the caller instead.
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  if (!MO.isReg())
+    return true;
+
+  AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
+                   *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
new file mode 100644
index 000000000000..9a4bafef3a25
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -0,0 +1,160 @@
+//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
+
+#include "AMDGPUMCInstLower.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include <vector>
+
+namespace llvm {
+class MCOperand;
+
+/// Assembly printer for the AMDGPU targets. Besides the AsmPrinter overrides
+/// it gathers and emits per-function program information.
+class AMDGPUAsmPrinter final : public AsmPrinter {
+private:
+  /// Program information gathered for one SI function. Every field is given a
+  /// defined initial value by the constructor.
+  struct SIProgramInfo {
+    SIProgramInfo() :
+      VGPRBlocks(0),
+      SGPRBlocks(0),
+      Priority(0),
+      FloatMode(0),
+      Priv(0),
+      DX10Clamp(0),
+      DebugMode(0),
+      IEEEMode(0),
+      ScratchSize(0),
+      ComputePGMRSrc1(0),
+      LDSBlocks(0),
+      ScratchBlocks(0),
+      ComputePGMRSrc2(0),
+      NumVGPR(0),
+      NumSGPR(0),
+      // LDSSize used to be left uninitialized; initialize it like every other
+      // member (listed in declaration order).
+      LDSSize(0),
+      FlatUsed(false),
+      NumSGPRsForWavesPerEU(0),
+      NumVGPRsForWavesPerEU(0),
+      ReservedVGPRFirst(0),
+      ReservedVGPRCount(0),
+      DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
+      DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
+      VCCUsed(false),
+      CodeLen(0) {}
+
+    // Fields set in PGM_RSRC1 pm4 packet.
+    uint32_t VGPRBlocks;
+    uint32_t SGPRBlocks;
+    uint32_t Priority;
+    uint32_t FloatMode;
+    uint32_t Priv;
+    uint32_t DX10Clamp;
+    uint32_t DebugMode;
+    uint32_t IEEEMode;
+    uint32_t ScratchSize;
+
+    uint64_t ComputePGMRSrc1;
+
+    // Fields set in PGM_RSRC2 pm4 packet.
+    uint32_t LDSBlocks;
+    uint32_t ScratchBlocks;
+
+    uint64_t ComputePGMRSrc2;
+
+    uint32_t NumVGPR;
+    uint32_t NumSGPR;
+    uint32_t LDSSize;
+    bool FlatUsed;
+
+    // Number of SGPRs that meets number of waves per execution unit request.
+    uint32_t NumSGPRsForWavesPerEU;
+
+    // Number of VGPRs that meets number of waves per execution unit request.
+    uint32_t NumVGPRsForWavesPerEU;
+
+    // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first
+    // fixed VGPR number reserved.
+    uint16_t ReservedVGPRFirst;
+
+    // The number of consecutive VGPRs reserved.
+    uint16_t ReservedVGPRCount;
+
+    // Fixed SGPR number used to hold wave scratch offset for entire kernel
+    // execution, or uint16_t(-1) if the register is not used or not known.
+    uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
+
+    // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
+    // kernel execution, or uint16_t(-1) if the register is not used or not
+    // known.
+    uint16_t DebuggerPrivateSegmentBufferSGPR;
+
+    // Bonus information for debugging.
+    bool VCCUsed;
+    uint64_t CodeLen;
+  };
+
+  void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+  void findNumUsedRegistersSI(const MachineFunction &MF,
+                              unsigned &NumSGPR,
+                              unsigned &NumVGPR) const;
+
+  /// \brief Emit register usage information so that the GPU driver
+  /// can correctly setup the GPU state.
+  void EmitProgramInfoR600(const MachineFunction &MF);
+  void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+  void EmitAmdKernelCodeT(const MachineFunction &MF,
+                          const SIProgramInfo &KernelInfo) const;
+
+public:
+  explicit AMDGPUAsmPrinter(TargetMachine &TM,
+                            std::unique_ptr<MCStreamer> Streamer);
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override;
+
+  /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
+  /// pseudo lowering.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+  /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo
+  /// instructions.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  /// Implemented in AMDGPUMCInstLower.cpp
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  void EmitFunctionBodyStart() override;
+
+  void EmitFunctionEntryLabel() override;
+
+  void EmitGlobalVariable(const GlobalVariable *GV) override;
+
+  void EmitStartOfAsmFile(Module &M) override;
+
+  bool isBlockOnlyReachableByFallthrough(
+      const MachineBasicBlock *MBB) const override;
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+
+protected:
+  // Disassembly text and matching hex dump, one entry per line, plus the
+  // length used to pad the disassembly column when both are emitted.
+  std::vector<std::string> DisasmLines, HexLines;
+  size_t DisasmLineMaxLen;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
new file mode 100644
index 000000000000..d53cc153dc9a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -0,0 +1,42 @@
+//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUISelLowering.h"
+
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+/// Construct the AMDGPU call lowering, passing the address of \p TLI on to the
+/// CallLowering base class.
+AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
+ : CallLowering(&TLI) {
+}
+
+/// Placeholder implementation: emits nothing for the return of \p Val and
+/// unconditionally returns true.
+/// NOTE(review): presumably true signals success to the caller -- confirm
+/// against the CallLowering interface.
+bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg) const {
+ return true;
+}
+
+/// Placeholder implementation: does not lower any of the formal arguments of
+/// \p F into \p VRegs and unconditionally returns true.
+/// NOTE(review): presumably true signals success to the caller -- confirm
+/// against the CallLowering interface.
+bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<unsigned> VRegs) const {
+ // TODO: Implement once there are generic loads/stores.
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
new file mode 100644
index 000000000000..9ae87c9397ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -0,0 +1,34 @@
+//===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class AMDGPUTargetLowering;
+
+/// GlobalISel call lowering for the AMDGPU targets.
+class AMDGPUCallLowering: public CallLowering {
+ public:
+  /// \p TLI is forwarded to the CallLowering base class.
+  AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
+
+  /// Lower the return of \p Val, held in virtual register \p VReg.
+  bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+                   unsigned VReg) const override;
+
+  /// Lower the formal arguments of \p F into the virtual registers \p VRegs.
+  bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+                            ArrayRef<unsigned> VRegs) const override;
+};
+} // End of namespace llvm;
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
new file mode 100644
index 000000000000..47dfa4992068
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -0,0 +1,135 @@
+//===-- AMDGPUCallingConv.td - Calling Conventions for Radeon GPUs --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMD Radeon GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+// Inversion of CCIfInReg: applies action A only when the argument is not
+// flagged as "inreg" (the predicate is "!ArgFlags.isInReg()").
+class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+
+// Calling convention for SI
+//
+// Rules, in order:
+//  * inreg f32/i32     -> SGPR0..SGPR39
+//  * inreg i64         -> an even SGPR from the first list, with the
+//                         same-position odd SGPR from the second list as its
+//                         shadow
+//  * non-inreg f32/i32 -> VGPR0..VGPR135
+//  * byval i64         -> same even/odd SGPR lists as the inreg i64 rule
+def CC_SI : CallingConv<[
+
+ CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+ SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+ ]>>>,
+
+ CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
+ [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
+ SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
+ SGPR32, SGPR34, SGPR36, SGPR38 ],
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
+ SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
+ SGPR33, SGPR35, SGPR37, SGPR39 ]
+ >>>,
+
+ // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
+ CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+ VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+ VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+ VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+ VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+ VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+ VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+ VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+ VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+ VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+ VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+ VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+ VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+ VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
+ ]>>>,
+
+ CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
+ [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
+ SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
+ SGPR32, SGPR34, SGPR36, SGPR38 ],
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
+ SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
+ SGPR33, SGPR35, SGPR37, SGPR39 ]
+ >>>
+
+]>;
+
+// Return value convention for SI: i32 values are assigned to SGPR0..SGPR39
+// and f32 values to VGPR0..VGPR135.
+def RetCC_SI : CallingConv<[
+ CCIfType<[i32] , CCAssignToReg<[
+ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+ SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+ ]>>,
+
+ // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
+ CCIfType<[f32] , CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+ VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+ VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+ VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+ VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+ VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+ VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+ VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+ VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+ VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+ VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+ VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+ VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+ VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
+ ]>>
+]>;
+
+// Calling convention for R600: inreg v4f32/v4i32 arguments are assigned to
+// the T0_XYZW..T32_XYZW registers. No other argument kind has a rule here.
+def CC_R600 : CallingConv<[
+ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+ T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+ T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+ T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+ T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+ T30_XYZW, T31_XYZW, T32_XYZW
+ ]>>>
+]>;
+
+// Calling convention for compute kernels: every argument is handed to the
+// custom allocateKernArg routine.
+def CC_AMDGPU_Kernel : CallingConv<[
+ CCCustom<"allocateKernArg">
+]>;
+
+// Top-level dispatcher, evaluated in order:
+//  1. generation >= SOUTHERN_ISLANDS and not a shader -> CC_AMDGPU_Kernel
+//  2. generation <  SOUTHERN_ISLANDS and not a shader -> CC_AMDGPU_Kernel
+//  3. generation >= SOUTHERN_ISLANDS                  -> CC_SI
+//  4. generation <  SOUTHERN_ISLANDS                  -> CC_R600
+// Rules 1 and 2 together send every non-shader calling convention to
+// CC_AMDGPU_Kernel regardless of generation.
+def CC_AMDGPU : CallingConv<[
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() >="
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "!AMDGPU::isShader(State.getCallingConv())",
+ CCDelegateTo<CC_AMDGPU_Kernel>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "!AMDGPU::isShader(State.getCallingConv())",
+ CCDelegateTo<CC_AMDGPU_Kernel>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+ CCDelegateTo<CC_SI>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+ CCDelegateTo<CC_R600>>
+]>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
new file mode 100644
index 000000000000..e6230547a9b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -0,0 +1,480 @@
+//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass does misc. AMDGPU optimizations on IR before instruction
+/// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-codegenprepare"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUCodeGenPrepare : public FunctionPass,
+ public InstVisitor<AMDGPUCodeGenPrepare, bool> {
+ const GCNTargetMachine *TM; // Supplied to the constructor; runOnFunction does nothing when this is null.
+ const SISubtarget *ST; // Subtarget of the function being processed; assigned in runOnFunction.
+ DivergenceAnalysis *DA; // Divergence analysis result; assigned in runOnFunction.
+ Module *Mod; // Current module; assigned in doInitialization.
+ bool HasUnsafeFPMath; // True when the function's "unsafe-fp-math" attribute is "true"; assigned in runOnFunction.
+
+ /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
+ /// binary operation \p V.
+ ///
+ /// \returns Binary operation \p V.
+ Value *copyFlags(const BinaryOperator &I, Value *V) const;
+
+ /// \returns \p T's base element bit width.
+ unsigned getBaseElementBitWidth(const Type *T) const;
+
+ /// \returns Equivalent 32 bit integer type for given type \p T. For example,
+ /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
+ /// is returned.
+ Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
+
+ /// \returns True if binary operation \p I is a signed binary operation, false
+ /// otherwise.
+ bool isSigned(const BinaryOperator &I) const;
+
+ /// \returns True if the condition of 'select' operation \p I comes from a
+ /// signed 'icmp' operation, false otherwise.
+ bool isSigned(const SelectInst &I) const;
+
+ /// \returns True if type \p T needs to be promoted to 32 bit integer type,
+ /// false otherwise.
+ bool needsPromotionToI32(const Type *T) const;
+
+ /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
+ /// operation.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal 16. Promotion is done by sign or zero extending operands to
+ /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
+ /// truncating the result of 32 bit binary operation back to \p I's original
+ /// type. Division operation is not promoted.
+ ///
+ /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
+ /// false otherwise.
+ bool promoteUniformOpToI32(BinaryOperator &I) const;
+
+ /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal 16. Promotion is done by sign or zero extending operands to
+ /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
+ ///
+ /// \returns True.
+ bool promoteUniformOpToI32(ICmpInst &I) const;
+
+ /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
+ /// operation.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal 16. Promotion is done by sign or zero extending operands to
+ /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
+ /// result of 32 bit 'select' operation back to \p I's original type.
+ ///
+ /// \returns True.
+ bool promoteUniformOpToI32(SelectInst &I) const;
+
+ /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
+ /// intrinsic.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal 16. Promotion is done by zero extending the operand to 32
+ /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
+ /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
+ /// shift amount is 32 minus \p I's base element bit width), and truncating
+ /// the result of the shift operation back to \p I's original type.
+ ///
+ /// \returns True.
+ bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
+
+public:
+ static char ID;
+ AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
+ FunctionPass(ID),
+ TM(static_cast<const GCNTargetMachine *>(TM)), // Unchecked downcast of the caller's target machine.
+ ST(nullptr),
+ DA(nullptr),
+ Mod(nullptr),
+ HasUnsafeFPMath(false) { }
+
+ bool visitFDiv(BinaryOperator &I); // May replace an f32 fdiv carrying !fpmath metadata with the amdgcn_fdiv_fast intrinsic.
+
+ bool visitInstruction(Instruction &I) { return false; } // Fallback visitor: leaves the instruction untouched.
+ bool visitBinaryOperator(BinaryOperator &I);
+ bool visitICmpInst(ICmpInst &I);
+ bool visitSelectInst(SelectInst &I);
+
+ bool visitIntrinsicInst(IntrinsicInst &I); // Only dispatches Intrinsic::bitreverse; everything else returns false.
+ bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DivergenceAnalysis>();
+ AU.setPreservesAll(); // NOTE(review): the visitors replace and erase instructions, yet all analyses are declared preserved; confirm this is intended.
+ }
+};
+
+} // End anonymous namespace
+
+/// Transfers the nsw/nuw flags, or the exact flag, of \p I onto \p V when
+/// \p V is itself a binary operator. Any other value (for instance a folded
+/// constant expression) is passed through untouched.
+Value *AMDGPUCodeGenPrepare::copyFlags(
+    const BinaryOperator &I, Value *V) const {
+  if (BinaryOperator *NewOp = dyn_cast<BinaryOperator>(V)) {
+    if (isa<OverflowingBinaryOperator>(NewOp)) {
+      NewOp->setHasNoSignedWrap(I.hasNoSignedWrap());
+      NewOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+    } else if (isa<PossiblyExactOperator>(NewOp)) {
+      NewOp->setIsExact(I.isExact());
+    }
+  }
+
+  return V;
+}
+
+/// Bit width of \p T itself when it is an integer type, otherwise of its
+/// vector element type. \p T must satisfy needsPromotionToI32().
+unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
+  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+  const Type *BaseTy =
+      T->isIntegerTy() ? T : cast<VectorType>(T)->getElementType();
+  return BaseTy->getIntegerBitWidth();
+}
+
+/// Builds the i32 counterpart of \p T: plain i32 for an integer type, or a
+/// vector of i32 with the same element count for a vector type. \p T must
+/// satisfy needsPromotionToI32().
+Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
+  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+  if (!T->isIntegerTy())
+    return VectorType::get(B.getInt32Ty(),
+                           cast<VectorType>(T)->getNumElements());
+  return B.getInt32Ty();
+}
+
+/// True for the opcodes whose operands are interpreted as signed values:
+/// ashr, sdiv and srem.
+bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
+  switch (I.getOpcode()) {
+  case Instruction::AShr:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// True when the select's condition operand is an 'icmp' with a signed
+/// predicate; false for any other kind of condition.
+bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
+  if (const ICmpInst *Cond = dyn_cast<ICmpInst>(I.getOperand(0)))
+    return Cond->isSigned();
+  return false;
+}
+
+/// True for integer types wider than 1 bit and at most 16 bits wide, and for
+/// vectors whose element type meets that condition.
+bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
+  if (T->isIntegerTy()) {
+    const unsigned Width = T->getIntegerBitWidth();
+    if (Width > 1 && Width <= 16)
+      return true;
+  }
+
+  if (const VectorType *VecTy = dyn_cast<VectorType>(T))
+    return needsPromotionToI32(VecTy->getElementType());
+  return false;
+}
+
+/// Rewrites the narrow binary operation \p I as the same operation on
+/// operands extended to 32 bits, followed by a truncation back to \p I's
+/// type. Operands are sign extended for ashr/sdiv/srem and zero extended
+/// otherwise. sdiv and udiv are never promoted.
+///
+/// \returns True if \p I was replaced and erased, false if it was left alone.
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  if (I.getOpcode() == Instruction::SDiv ||
+      I.getOpcode() == Instruction::UDiv)
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+  }
+  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
+
+  // A mul works on zero extended operands, so its 32 bit product can be as
+  // large as 0xFFFF * 0xFFFF = 0xFFFE0001, which overflows as a signed i32
+  // even though the narrow multiply did not (e.g. i16 -1 * -1 == 1). Keeping
+  // the copied nsw would turn such a result into poison. nsw is only still
+  // valid when the narrow multiply was also nuw, because then the product
+  // fits in the narrow type.
+  if (I.getOpcode() == Instruction::Mul && !I.hasNoUnsignedWrap())
+    if (BinaryOperator *ExtMul = dyn_cast<BinaryOperator>(ExtRes))
+      ExtMul->setHasNoSignedWrap(false);
+
+  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
+/// Replaces the narrow 'icmp' \p I with the same comparison on operands
+/// extended to 32 bits. Signed predicates sign extend, all others zero
+/// extend. \p I is erased.
+///
+/// \returns True.
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
+  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Value *LHS = I.getOperand(0);
+  Value *RHS = I.getOperand(1);
+  Type *I32Ty = getI32Ty(Builder, LHS->getType());
+
+  const bool SignExtend = I.isSigned();
+  Value *WideLHS = SignExtend ? Builder.CreateSExt(LHS, I32Ty)
+                              : Builder.CreateZExt(LHS, I32Ty);
+  Value *WideRHS = SignExtend ? Builder.CreateSExt(RHS, I32Ty)
+                              : Builder.CreateZExt(RHS, I32Ty);
+
+  I.replaceAllUsesWith(Builder.CreateICmp(I.getPredicate(), WideLHS, WideRHS));
+  I.eraseFromParent();
+
+  return true;
+}
+
+/// Replaces the narrow 'select' \p I with a 32 bit select on extended value
+/// operands, truncated back to \p I's type. The value operands are sign
+/// extended when the condition is a signed 'icmp', zero extended otherwise.
+/// \p I is erased.
+///
+/// \returns True.
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Value *TrueVal = I.getOperand(1);
+  Value *FalseVal = I.getOperand(2);
+
+  const bool SignExtend = isSigned(I);
+  Value *WideTrue = SignExtend ? Builder.CreateSExt(TrueVal, I32Ty)
+                               : Builder.CreateZExt(TrueVal, I32Ty);
+  Value *WideFalse = SignExtend ? Builder.CreateSExt(FalseVal, I32Ty)
+                                : Builder.CreateZExt(FalseVal, I32Ty);
+
+  Value *WideSel = Builder.CreateSelect(I.getOperand(0), WideTrue, WideFalse);
+  Value *Narrowed = Builder.CreateTrunc(WideSel, I.getType());
+
+  I.replaceAllUsesWith(Narrowed);
+  I.eraseFromParent();
+
+  return true;
+}
+
+/// Replaces the narrow 'bitreverse' call \p I with a 32 bit 'bitreverse' of
+/// the zero extended operand. The reversed bits end up in the high part of
+/// the 32 bit result, so it is shifted right by (32 - original width) before
+/// being truncated back to \p I's type. \p I is erased.
+///
+/// \returns True.
+bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
+    IntrinsicInst &I) const {
+  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
+         "I must be bitreverse intrinsic");
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *NarrowTy = I.getType();
+  Type *I32Ty = getI32Ty(Builder, NarrowTy);
+  const unsigned ShiftAmt = 32 - getBaseElementBitWidth(NarrowTy);
+
+  Function *WideDecl =
+      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
+  Value *WideArg = Builder.CreateZExt(I.getOperand(0), I32Ty);
+  Value *WideRev = Builder.CreateCall(WideDecl, { WideArg });
+  Value *Shifted = Builder.CreateLShr(WideRev, ShiftAmt);
+  Value *Narrowed = Builder.CreateTrunc(Shifted, NarrowTy);
+
+  I.replaceAllUsesWith(Narrowed);
+  I.eraseFromParent();
+
+  return true;
+}
+
+/// Decides whether an f32 fdiv with numerator \p Num stays a plain fdiv.
+/// Only a constant numerator qualifies, and then only if \p UnsafeDiv is set
+/// or the constant is exactly +1.0.
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+  // Reciprocal f32 is handled separately without denormals.
+  if (const ConstantFP *ConstNum = dyn_cast<ConstantFP>(Num))
+    return UnsafeDiv || ConstNum->isExactlyValue(+1.0);
+
+  return false;
+}
+
+// Insert an intrinsic for fast fdiv for safe math situations where we can
+// reduce precision. Leave fdiv for situations where the generic node is
+// expected to be optimized.
+//
+// Only f32 (scalar or vector) fdivs carrying !fpmath metadata with an
+// accuracy of at least 2.5 ULP are considered. Vector fdivs are rewritten
+// element by element.
+//
+// Returns true if FDiv was replaced and erased, false if it was left
+// untouched.
+bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
+  Type *Ty = FDiv.getType();
+
+  if (!Ty->getScalarType()->isFloatTy())
+    return false;
+
+  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+  if (!FPMath)
+    return false;
+
+  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
+  float ULP = FPOp->getFPAccuracy();
+  if (ULP < 2.5f)
+    return false;
+
+  FastMathFlags FMF = FPOp->getFastMathFlags();
+  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+                   FMF.allowReciprocal();
+  if (ST->hasFP32Denormals() && !UnsafeDiv)
+    return false;
+
+  Value *Num = FDiv.getOperand(0);
+  Value *Den = FDiv.getOperand(1);
+
+  VectorType *VT = dyn_cast<VectorType>(Ty);
+
+  // A scalar fdiv that is to be kept needs no rewriting at all. Bail out
+  // before the intrinsic declaration is requested so that nothing is touched
+  // and "no change" is reported truthfully.
+  if (!VT && shouldKeepFDivF32(Num, UnsafeDiv))
+    return false;
+
+  // New instructions go right after the original fdiv.
+  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+  Builder.setFastMathFlags(FMF);
+  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
+
+  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
+  Function *Decl
+    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+
+  Value *NewFDiv = nullptr;
+
+  if (VT) {
+    NewFDiv = UndefValue::get(VT);
+
+    // FIXME: Doesn't do the right thing for cases where the vector is partially
+    // constant. This works when the scalarizer pass is run first.
+    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
+      Value *NumEltI = Builder.CreateExtractElement(Num, I);
+      Value *DenEltI = Builder.CreateExtractElement(Den, I);
+      Value *NewElt;
+
+      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
+      } else {
+        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+      }
+
+      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
+    }
+  } else {
+    NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+  }
+
+  FDiv.replaceAllUsesWith(NewFDiv);
+  NewFDiv->takeName(&FDiv);
+  FDiv.eraseFromParent();
+
+  return true;
+}
+
+/// True when \p F carries the string attribute "unsafe-fp-math" with the
+/// value "true".
+static bool hasUnsafeFPMath(const Function &F) {
+  return F.getFnAttribute("unsafe-fp-math").getValueAsString() == "true";
+}
+
+/// Promotes \p I to 32 bits when the subtarget has 16 bit instructions, the
+/// result type is a narrow integer (or vector of them) and \p I is uniform.
+///
+/// \returns True if \p I was promoted.
+bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  if (!ST->has16BitInsts() || !needsPromotionToI32(I.getType()) ||
+      !DA->isUniform(&I))
+    return false;
+
+  return promoteUniformOpToI32(I);
+}
+
+/// Promotes \p I to a 32 bit compare when the subtarget has 16 bit
+/// instructions, the compared operands are narrow integers (or vectors of
+/// them) and \p I is uniform.
+///
+/// \returns True if \p I was promoted.
+bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
+  if (!ST->has16BitInsts() ||
+      !needsPromotionToI32(I.getOperand(0)->getType()) ||
+      !DA->isUniform(&I))
+    return false;
+
+  return promoteUniformOpToI32(I);
+}
+
+/// Promotes \p I to a 32 bit select when the subtarget has 16 bit
+/// instructions, the result type is a narrow integer (or vector of them) and
+/// \p I is uniform.
+///
+/// \returns True if \p I was promoted.
+bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
+  if (!ST->has16BitInsts() || !needsPromotionToI32(I.getType()) ||
+      !DA->isUniform(&I))
+    return false;
+
+  return promoteUniformOpToI32(I);
+}
+
+/// Dispatches intrinsic calls: only 'bitreverse' is handled, every other
+/// intrinsic is left alone.
+bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+  if (I.getIntrinsicID() == Intrinsic::bitreverse)
+    return visitBitreverseIntrinsicInst(I);
+
+  return false;
+}
+
+/// Promotes the 'bitreverse' call \p I to 32 bits when the subtarget has
+/// 16 bit instructions, the result type is a narrow integer (or vector of
+/// them) and \p I is uniform.
+///
+/// \returns True if \p I was promoted.
+bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
+  if (!ST->has16BitInsts() || !needsPromotionToI32(I.getType()) ||
+      !DA->isUniform(&I))
+    return false;
+
+  return promoteUniformBitreverseToI32(I);
+}
+
+bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+ Mod = &M; // Remembered so the visitors can request intrinsic declarations from this module.
+ return false; // Nothing in the module is modified here.
+}
+
+/// Visits every instruction of \p F once and lets the visit* handlers
+/// rewrite it. Does nothing without a target machine or when the function is
+/// to be skipped.
+///
+/// \returns True if any visitor reported a change.
+bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
+  if (!TM || skipFunction(F))
+    return false;
+
+  ST = &TM->getSubtarget<SISubtarget>(F);
+  DA = &getAnalysis<DivergenceAnalysis>();
+  HasUnsafeFPMath = hasUnsafeFPMath(F);
+
+  bool MadeChange = false;
+
+  for (BasicBlock &BB : F) {
+    for (BasicBlock::iterator It = BB.begin(), End = BB.end(); It != End;) {
+      // Step past the instruction before visiting it: the visitor may erase
+      // it, which would invalidate an iterator still pointing at it.
+      Instruction &Inst = *It;
+      ++It;
+      MadeChange |= visit(Inst);
+    }
+  }
+
+  return MadeChange;
+}
+
+INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+
+char AMDGPUCodeGenPrepare::ID = 0; // Handed to FunctionPass(ID) by the constructor.
+
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
+ return new AMDGPUCodeGenPrepare(TM); // Heap allocated; NOTE(review): presumably owned by the pass manager it is added to — confirm.
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
new file mode 100644
index 000000000000..805fb7102a35
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -0,0 +1,102 @@
+//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on an AMDGPU target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
+ int LAO, unsigned TransAl)
+ : TargetFrameLowering(D, StackAl, LAO, TransAl) { } // All four arguments are forwarded unchanged to the base class.
+
+AMDGPUFrameLowering::~AMDGPUFrameLowering() = default; // Defined out of line, in this file rather than the header.
+
+unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
+ // XXX: Hardcoding to 1 for now; MF is currently unused.
+ //
+ // I think the StackWidth should be stored as metadata associated with the
+ // MachineFunction. This metadata can either be added by a frontend, or
+ // calculated by an R600-specific LLVM IR pass.
+ //
+ // The StackWidth determines how stack objects are laid out in memory.
+ // For a vector stack variable, like: int4 stack[2], the data will be stored
+ // in the following ways depending on the StackWidth.
+ //
+ // StackWidth = 1:
+ //
+ // T0.X = stack[0].x
+ // T1.X = stack[0].y
+ // T2.X = stack[0].z
+ // T3.X = stack[0].w
+ // T4.X = stack[1].x
+ // T5.X = stack[1].y
+ // T6.X = stack[1].z
+ // T7.X = stack[1].w
+ //
+ // StackWidth = 2:
+ //
+ // T0.X = stack[0].x
+ // T0.Y = stack[0].y
+ // T1.X = stack[0].z
+ // T1.Y = stack[0].w
+ // T2.X = stack[1].x
+ // T2.Y = stack[1].y
+ // T3.X = stack[1].z
+ // T3.Y = stack[1].w
+ //
+ // StackWidth = 4:
+ // T0.X = stack[0].x
+ // T0.Y = stack[0].y
+ // T0.Z = stack[0].z
+ // T0.W = stack[0].w
+ // T1.X = stack[1].x
+ // T1.Y = stack[1].y
+ // T1.Z = stack[1].z
+ // T1.W = stack[1].w
+ return 1;
+}
+
+/// Locates frame index \p FI and stores the frame register in \p FrameReg.
+///
+/// Objects are laid out in index order after a reserved area of two stack
+/// slots, each object starting at its own alignment and ending on a 4 byte
+/// boundary. Passing -1 for \p FI walks every object.
+///
+/// \returns The resulting byte offset divided by the size of one stack slot
+/// (stack width * 4 bytes).
+int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                int FI,
+                                                unsigned &FrameReg) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const AMDGPURegisterInfo *RegInfo
+    = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo();
+
+  // Fill in FrameReg output argument.
+  FrameReg = RegInfo->getFrameRegister(MF);
+
+  const unsigned SlotBytes = getStackWidth(MF) * 4;
+
+  // Start the offset at 2 so we don't overwrite work group information.
+  // XXX: We should only do this when the shader actually uses this
+  // information.
+  unsigned Offset = 2 * SlotBytes;
+  const int End = FI == -1 ? FrameInfo.getNumObjects() : FI;
+
+  for (int Idx = FrameInfo.getObjectIndexBegin(); Idx < End; ++Idx) {
+    Offset = alignTo(Offset, FrameInfo.getObjectAlignment(Idx));
+    Offset += FrameInfo.getObjectSize(Idx);
+    // Each register holds 4 bytes, so we must always align the offset to at
+    // least 4 bytes, so that 2 frame objects won't share the same register.
+    Offset = alignTo(Offset, 4);
+  }
+
+  if (FI != -1)
+    Offset = alignTo(Offset, FrameInfo.getObjectAlignment(FI));
+
+  return Offset / SlotBytes;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
new file mode 100644
index 000000000000..5d51351a00d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -0,0 +1,47 @@
+//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// \brief Information about the stack frame layout on the AMDGPU targets.
+///
+/// It holds the direction of the stack growth, the known stack alignment on
+/// entry to each function, and the offset to the locals area.
+/// See TargetFrameLowering for more comments.
+class AMDGPUFrameLowering : public TargetFrameLowering {
+public:
+ AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
+ unsigned TransAl = 1);
+ ~AMDGPUFrameLowering() override;
+
+ /// \returns The number of 32-bit sub-registers that are used when storing
+ /// values to the stack.
+ unsigned getStackWidth(const MachineFunction &MF) const;
+
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+
+ bool hasFP(const MachineFunction &MF) const override {
+ return false; // No function is reported as having a frame pointer.
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
new file mode 100644
index 000000000000..ef3b44f7c211
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -0,0 +1,1632 @@
+//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Defines an instruction selector for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUISelLowering.h" // For AMDGPUISD
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIISelLowering.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <new>
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+
+class R600InstrInfo; // Forward declaration; this file only names the type in pointer parameters of the Fold* helpers.
+
+} // end namespace llvm
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// AMDGPU specific code to select AMDGPU machine instructions for
+/// SelectionDAG operations.
+class AMDGPUDAGToDAGISel : public SelectionDAGISel {
+  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
+  // make the right decision when generating code for different targets.
+  // Assigned at the start of runOnMachineFunction; null until then so the
+  // member never holds an indeterminate value.
+  const AMDGPUSubtarget *Subtarget = nullptr;
+
+public:
+  explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel) {}
+  ~AMDGPUDAGToDAGISel() override = default;
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void Select(SDNode *N) override;
+  StringRef getPassName() const override;
+  void PostprocessISelDAG() override;
+
+private:
+  SDValue foldFrameIndex(SDValue N) const;
+  bool isInlineImmediate(const SDNode *N) const;
+  bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
+                   const R600InstrInfo *TII);
+  bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+  bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+
+  bool isConstantLoad(const MemSDNode *N, int cbID) const;
+  bool isUniformBr(const SDNode *N) const;
+
+  SDNode *glueCopyToM0(SDNode *N) const;
+
+  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
+
+  // Addressing-mode selectors.
+  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+                                       SDValue& Offset);
+  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+                       unsigned OffsetBits) const;
+  bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
+  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+                                 SDValue &Offset1) const;
+  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
+                   SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
+                   SDValue &TFE) const;
+  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+                         SDValue &SOffset, SDValue &Offset, SDValue &GLC,
+                         SDValue &SLC, SDValue &TFE) const;
+  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
+                         SDValue &SLC) const;
+  bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+                          SDValue &SOffset, SDValue &ImmOffset) const;
+  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
+                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
+                         SDValue &TFE) const;
+  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+                         SDValue &Offset, SDValue &SLC) const;
+  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+                         SDValue &Offset) const;
+  bool SelectMUBUFConstant(SDValue Constant,
+                           SDValue &SOffset,
+                           SDValue &ImmOffset) const;
+  bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset,
+                                  SDValue &ImmOffset) const;
+  bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
+                                   SDValue &ImmOffset, SDValue &VOffset) const;
+
+  bool SelectFlat(SDValue Addr, SDValue &VAddr,
+                  SDValue &SLC, SDValue &TFE) const;
+
+  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
+                        bool &Imm) const;
+  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
+                  bool &Imm) const;
+  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
+  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
+  bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
+  bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+
+  // VOP3 source-modifier selectors.
+  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                       SDValue &Clamp, SDValue &Omod) const;
+  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                         SDValue &Clamp, SDValue &Omod) const;
+
+  bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
+                            SDValue &Omod) const;
+  bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
+                                 SDValue &Clamp,
+                                 SDValue &Omod) const;
+
+  // Hand-written selection of individual node kinds.
+  void SelectADD_SUB_I64(SDNode *N);
+  void SelectDIV_SCALE(SDNode *N);
+  void SelectFMA_W_CHAIN(SDNode *N);
+  void SelectFMUL_W_CHAIN(SDNode *N);
+
+  SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
+                   uint32_t Offset, uint32_t Width);
+  void SelectS_BFEFromShifts(SDNode *N);
+  void SelectS_BFE(SDNode *N);
+  bool isCBranchSCC(const SDNode *N) const;
+  void SelectBRCOND(SDNode *N);
+  void SelectATOMIC_CMP_SWAP(SDNode *N);
+
+  // Include the pieces autogenerated from the target description.
+#include "AMDGPUGenDAGISel.inc"
+};
+
+} // end anonymous namespace
+
+/// \brief This pass converts a legalized DAG into an AMDGPU-specific
+/// DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new AMDGPUDAGToDAGISel(TM, OptLevel);
+}
+
+bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<AMDGPUSubtarget>(); // Must be set before the base class starts selecting; the helpers read it.
+ return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+/// True when \p N is an integer or floating point constant node whose bit
+/// pattern the SI instruction info accepts as an inline constant. Any other
+/// node yields false.
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
+  const SIInstrInfo *InstrInfo
+    = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo();
+
+  if (const ConstantSDNode *IntNode = dyn_cast<ConstantSDNode>(N))
+    return InstrInfo->isInlineConstant(IntNode->getAPIntValue());
+
+  // Floating point constants are tested through their raw bit pattern.
+  const ConstantFPSDNode *FPNode = dyn_cast<ConstantFPSDNode>(N);
+  return FPNode &&
+         InstrInfo->isInlineConstant(FPNode->getValueAPF().bitcastToAPInt());
+}
+
+/// \brief Determine the register class for operand \p OpNo of \p N.
+///
+/// Handles CopyToReg nodes, REG_SEQUENCE and any other machine node whose
+/// instruction description carries a register class for the operand.
+///
+/// \returns The register class of the virtual register that will be used for
+/// the given operand number \p OpNo, or nullptr if the register class cannot
+/// be determined.
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
+                                                          unsigned OpNo) const {
+  if (!N->isMachineOpcode()) {
+    // CopyToReg is the only unselected node that can be answered.
+    if (N->getOpcode() != ISD::CopyToReg)
+      return nullptr;
+
+    unsigned DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(DestReg)) {
+      const SIRegisterInfo *SIRI
+        = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+      return SIRI->getPhysRegClass(DestReg);
+    }
+
+    MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
+    return MRI.getRegClass(DestReg);
+  }
+
+  const unsigned MachineOpc = N->getMachineOpcode();
+
+  if (MachineOpc == AMDGPU::REG_SEQUENCE) {
+    // Operand 0 is the class of the whole sequence; the operand following
+    // the requested one is read as its sub-register index.
+    unsigned SuperRCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    const TargetRegisterClass *SuperRC =
+        Subtarget->getRegisterInfo()->getRegClass(SuperRCID);
+
+    SDValue SubRegOp = N->getOperand(OpNo + 1);
+    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
+    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
+                                                               SubRegIdx);
+  }
+
+  // Every other machine node: consult the instruction description, skipping
+  // over the defs to reach the requested use operand.
+  const MCInstrDesc &Desc = Subtarget->getInstrInfo()->get(MachineOpc);
+  unsigned DescIdx = Desc.getNumDefs() + OpNo;
+  if (DescIdx >= Desc.getNumOperands())
+    return nullptr;
+
+  int RCID = Desc.OpInfo[DescIdx].RegClass;
+  if (RCID == -1)
+    return nullptr;
+
+  return Subtarget->getRegisterInfo()->getRegClass(RCID);
+}
+
+/// Glues a copy of -1 into m0 onto the memory node \p N when the subtarget
+/// generation is at least SOUTHERN_ISLANDS and \p N accesses the local
+/// address space. Otherwise \p N is returned unchanged.
+///
+/// \returns \p N, morphed in place to carry the extra glue operand if needed.
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
+  const bool IsSIOrLater =
+      Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS;
+  if (!IsSIOrLater ||
+      cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+    return N;
+
+  const SITargetLowering &SILowering =
+      *static_cast<const SITargetLowering *>(getTargetLowering());
+
+  // Write max value to m0 before the memory operation.
+  SDValue M0Copy =
+      SILowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
+                          CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+
+  // Same operands as before, with the copy's glue result appended.
+  SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
+  NewOps.push_back(M0Copy.getValue(1));
+  CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
+
+  return N;
+}
+
+/// Maps a vector element count (1, 2, 4, 8 or 16) to the ID of the SGPR
+/// register class used for it. Any other count is unreachable.
+static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
+  switch (NumVectorElts) {
+  case 1:  return AMDGPU::SReg_32_XM0RegClassID;
+  case 2:  return AMDGPU::SReg_64RegClassID;
+  case 4:  return AMDGPU::SReg_128RegClassID;
+  case 8:  return AMDGPU::SReg_256RegClassID;
+  case 16: return AMDGPU::SReg_512RegClassID;
+  default: break;
+  }
+
+  llvm_unreachable("invalid vector size");
+}
+
+/// Select a machine instruction for the DAG node \p N.
+///
+/// Opcodes that need hand-written handling are dealt with in the switch below.
+/// A case that finishes selection returns directly; a case that only
+/// pre-processes the node (or declines to handle it) breaks out of the switch
+/// so that the node is passed on to SelectCode().
+void AMDGPUDAGToDAGISel::Select(SDNode *N) {
+ unsigned int Opc = N->getOpcode();
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
+ return; // Already selected.
+ }
+
+ // Atomic nodes get the M0 copy glued on before any further selection.
+ if (isa<AtomicSDNode>(N) ||
+ (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
+ N = glueCopyToM0(N);
+
+ switch (Opc) {
+ default: break;
+ // We are selecting i64 ADD here instead of custom lower it during
+ // DAG legalization, so we can fold some i64 ADDs used for address
+ // calculation into the LOAD and STORE instructions.
+ case ISD::ADD:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUB:
+ case ISD::SUBC:
+ case ISD::SUBE: {
+ if (N->getValueType(0) != MVT::i64 ||
+ Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+ SelectADD_SUB_I64(N);
+ return;
+ }
+ case AMDGPUISD::FMUL_W_CHAIN: {
+ SelectFMUL_W_CHAIN(N);
+ return;
+ }
+ case AMDGPUISD::FMA_W_CHAIN: {
+ SelectFMA_W_CHAIN(N);
+ return;
+ }
+
+ case ISD::SCALAR_TO_VECTOR:
+ case AMDGPUISD::BUILD_VERTICAL_VECTOR:
+ case ISD::BUILD_VECTOR: {
+ unsigned RegClassID;
+ const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ EVT VT = N->getValueType(0);
+ unsigned NumVectorElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ assert(EltVT.bitsEq(MVT::i32));
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
+ } else {
+ // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+ // that adds a 128 bits reg copy when going through TwoAddressInstructions
+ // pass. We want to avoid 128 bits copies as much as possible because they
+ // can't be bundled by our scheduler.
+ switch(NumVectorElts) {
+ case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+ case 4:
+ if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+ else
+ RegClassID = AMDGPU::R600_Reg128RegClassID;
+ break;
+ default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+ }
+ }
+
+ SDLoc DL(N);
+ SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+
+ // A single-element vector becomes a COPY_TO_REGCLASS of its only operand.
+ if (NumVectorElts == 1) {
+ CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
+ RegClass);
+ return;
+ }
+
+ assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+ "supported yet");
+ // 16 = Max Num Vector Elements
+ // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+ // 1 = Vector Register Class
+ SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+
+ RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+ bool IsRegSeq = true;
+ unsigned NOps = N->getNumOperands();
+ for (unsigned i = 0; i < NOps; i++) {
+ // XXX: Why is this here?
+ if (isa<RegisterSDNode>(N->getOperand(i))) {
+ IsRegSeq = false;
+ break;
+ }
+ RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
+ MVT::i32);
+ }
+
+ if (NOps != NumVectorElts) {
+ // Fill in the missing undef elements if this was a scalar_to_vector.
+ assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
+
+ MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ DL, EltVT);
+ for (unsigned i = NOps; i < NumVectorElts; ++i) {
+ RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
+ }
+ }
+
+ // A RegisterSDNode operand was seen: leave the node to SelectCode().
+ if (!IsRegSeq)
+ break;
+ CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
+ return;
+ }
+ // BUILD_PAIR of i64 / i128 becomes a two-input REG_SEQUENCE on targets newer
+ // than Northern Islands.
+ case ISD::BUILD_PAIR: {
+ SDValue RC, SubReg0, SubReg1;
+ if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ break;
+ }
+ SDLoc DL(N);
+ if (N->getValueType(0) == MVT::i128) {
+ RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
+ SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
+ SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
+ } else if (N->getValueType(0) == MVT::i64) {
+ RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
+ SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+ } else {
+ llvm_unreachable("Unhandled value type for BUILD_PAIR");
+ }
+ const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
+ N->getOperand(1), SubReg1 };
+ ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+ N->getValueType(0), Ops));
+ return;
+ }
+
+ // 64-bit constants that are not inline immediates are materialized as two
+ // S_MOV_B32 halves joined by a REG_SEQUENCE (SI and newer only).
+ case ISD::Constant:
+ case ISD::ConstantFP: {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+ N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+ break;
+
+ uint64_t Imm;
+ if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
+ Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
+ else {
+ ConstantSDNode *C = cast<ConstantSDNode>(N);
+ Imm = C->getZExtValue();
+ }
+
+ SDLoc DL(N);
+ SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
+ MVT::i32));
+ SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+ SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+ };
+
+ ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+ N->getValueType(0), Ops));
+ return;
+ }
+ // Loads and stores only get the M0 copy glued on here; the actual
+ // selection is left to SelectCode().
+ case ISD::LOAD:
+ case ISD::STORE: {
+ N = glueCopyToM0(N);
+ break;
+ }
+
+ case AMDGPUISD::BFE_I32:
+ case AMDGPUISD::BFE_U32: {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+ // There is a scalar version available, but unlike the vector version which
+ // has a separate operand for the offset and width, the scalar version packs
+ // the width and offset into a single operand. Try to move to the scalar
+ // version if the offsets are constant, so that we can try to keep extended
+ // loads of kernel arguments in SGPRs.
+
+ // TODO: Technically we could try to pattern match scalar bitshifts of
+ // dynamic values, but it's probably not useful.
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Offset)
+ break;
+
+ ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!Width)
+ break;
+
+ bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+ uint32_t OffsetVal = Offset->getZExtValue();
+ uint32_t WidthVal = Width->getZExtValue();
+
+ ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+ SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
+ return;
+ }
+ case AMDGPUISD::DIV_SCALE: {
+ SelectDIV_SCALE(N);
+ return;
+ }
+ // CopyToReg is first handed to the target lowering's
+ // legalizeTargetIndependentNode(), then selected by SelectCode().
+ case ISD::CopyToReg: {
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
+ Lowering.legalizeTargetIndependentNode(N, *CurDAG);
+ break;
+ }
+ // i32 and/shift/sign-extend nodes on SI and newer are offered to SelectS_BFE.
+ case ISD::AND:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::SIGN_EXTEND_INREG:
+ if (N->getValueType(0) != MVT::i32 ||
+ Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+ SelectS_BFE(N);
+ return;
+ case ISD::BRCOND:
+ SelectBRCOND(N);
+ return;
+
+ case AMDGPUISD::ATOMIC_CMP_SWAP:
+ SelectATOMIC_CMP_SWAP(N);
+ return;
+ }
+
+ // Everything not fully handled above goes through the generated matcher.
+ SelectCode(N);
+}
+
+/// Return true if \p N reads memory from a constant address space.
+///
+/// With \p CbId == -1 the node must use AMDGPUAS::CONSTANT_ADDRESS; otherwise
+/// it must use the address space AMDGPUAS::CONSTANT_BUFFER_0 + \p CbId.
+bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
+  // Nodes that do not read memory never qualify.
+  if (!N->readMem())
+    return false;
+
+  const bool AnyConstantSpace = (CbId == -1);
+  return AnyConstantSpace
+             ? N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS
+             : N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
+}
+
+/// Return true if the terminator of the IR basic block currently being
+/// selected carries "amdgpu.uniform" or "structurizecfg.uniform" metadata.
+/// The node argument \p N is not inspected.
+bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
+  const Instruction *Term = FuncInfo->MBB->getBasicBlock()->getTerminator();
+  if (Term->getMetadata("amdgpu.uniform"))
+    return true;
+  return Term->getMetadata("structurizecfg.uniform") != nullptr;
+}
+
+/// Human-readable name of this pass.
+StringRef AMDGPUDAGToDAGISel::getPassName() const {
+  static const char PassName[] =
+      "AMDGPU DAG->DAG Pattern Instruction Selection";
+  return PassName;
+}
+
+//===----------------------------------------------------------------------===//
+// Complex Patterns
+//===----------------------------------------------------------------------===//
+
+/// Match a constant address. On success \p IntPtr receives the constant's
+/// zero-extended value divided by 4 as a pointer-sized constant.
+/// \returns false if \p Addr is not a ConstantSDNode.
+bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+                                                         SDValue &IntPtr) {
+  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr);
+  if (!Cst)
+    return false;
+
+  const uint64_t Scaled = Cst->getZExtValue() / 4;
+  IntPtr = CurDAG->getIntPtrConstant(Scaled, SDLoc(Addr), true);
+  return true;
+}
+
+/// Match any address that is not a constant node: the address itself becomes
+/// \p BaseReg and \p Offset is set to a zero pointer-sized constant.
+/// \returns false if \p Addr is a ConstantSDNode.
+bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+                                                         SDValue &BaseReg,
+                                                         SDValue &Offset) {
+  // Constant addresses are rejected here.
+  if (isa<ConstantSDNode>(Addr))
+    return false;
+
+  BaseReg = Addr;
+  Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
+  return true;
+}
+
+/// Split \p Addr into \p Base plus an i32 target-constant \p Offset.
+///
+/// Always succeeds: an address that matches neither special form is used
+/// unchanged as the base with a zero offset.
+bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  // (add base, imm) where imm passes the isInt<16> check.
+  if (Addr.getOpcode() == ISD::ADD) {
+    auto *AddImm = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (AddImm && isInt<16>(AddImm->getZExtValue())) {
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(AddImm->getZExtValue(), SDLoc(Addr),
+                                         MVT::i32);
+      return true;
+    }
+  }
+
+  // If the pointer address is constant, we can move it to the offset field
+  // and read the base from the ZERO register.
+  auto *AddrImm = dyn_cast<ConstantSDNode>(Addr);
+  if (AddrImm && isInt<16>(AddrImm->getZExtValue())) {
+    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                  SDLoc(CurDAG->getEntryNode()),
+                                  AMDGPU::ZERO, MVT::i32);
+    Offset = CurDAG->getTargetConstant(AddrImm->getZExtValue(), SDLoc(Addr),
+                                       MVT::i32);
+    return true;
+  }
+
+  // Default case, no offset
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+  return true;
+}
+
+/// Split an indirect address into \p Base and an i32 target-constant
+/// \p Offset. Always returns true.
+///
+///  - constant address: base is the INDIRECT_BASE_ADDR register, offset is
+///    the constant;
+///  - (add x, C) or (or x, C): base is x, offset is C;
+///  - anything else: base is the address, offset is 0.
+bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  SDLoc DL(Addr);
+
+  if (auto *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+    Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+    Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
+    return true;
+  }
+
+  const unsigned AddrOpc = Addr.getOpcode();
+  if (AddrOpc == ISD::ADD || AddrOpc == ISD::OR) {
+    if (auto *COff = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(COff->getZExtValue(), DL, MVT::i32);
+      return true;
+    }
+  }
+
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  return true;
+}
+
+/// Select an ADD/ADDC/ADDE/SUB/SUBC/SUBE node by splitting both operands into
+/// sub0/sub1 halves, combining the low halves, combining the high halves with
+/// the carry glued from the low operation, and rejoining the two results with
+/// a REG_SEQUENCE. \p N is replaced and removed from the DAG.
+void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
+  SDLoc DL(N);
+  const SDValue LHS = N->getOperand(0);
+  const SDValue RHS = N->getOperand(1);
+
+  // Classify the opcode: add vs. sub, and whether a carry is read / written.
+  bool IsAdd = false;
+  bool ConsumeCarry = false;
+  bool ProduceCarry = false;
+  switch (N->getOpcode()) {
+  case ISD::ADD:
+    IsAdd = true;
+    break;
+  case ISD::ADDC:
+    IsAdd = true;
+    ProduceCarry = true;
+    break;
+  case ISD::ADDE:
+    IsAdd = true;
+    ConsumeCarry = true;
+    ProduceCarry = true;
+    break;
+  case ISD::SUBC:
+    ProduceCarry = true;
+    break;
+  case ISD::SUBE:
+    ConsumeCarry = true;
+    ProduceCarry = true;
+    break;
+  default:
+    break;
+  }
+
+  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+  // Extract one 32-bit half of a source operand.
+  auto ExtractHalf = [&](SDValue Val, SDValue SubIdx) -> SDNode * {
+    return CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i32,
+                                  Val, SubIdx);
+  };
+  SDNode *Lo0 = ExtractHalf(LHS, Sub0);
+  SDNode *Hi0 = ExtractHalf(LHS, Sub1);
+  SDNode *Lo1 = ExtractHalf(RHS, Sub0);
+  SDNode *Hi1 = ExtractHalf(RHS, Sub1);
+
+  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
+
+  const unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+  const unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+
+  // Low half: the carry-consuming opcode also takes the node's third operand.
+  SmallVector<SDValue, 3> LoArgs;
+  LoArgs.push_back(SDValue(Lo0, 0));
+  LoArgs.push_back(SDValue(Lo1, 0));
+  if (ConsumeCarry)
+    LoArgs.push_back(N->getOperand(2));
+  SDNode *AddLo =
+      CurDAG->getMachineNode(ConsumeCarry ? CarryOpc : Opc, DL, VTList, LoArgs);
+
+  // High half: always uses the carry opcode, glued to the low operation.
+  SDValue HiArgs[] = { SDValue(Hi0, 0), SDValue(Hi1, 0), SDValue(AddLo, 1) };
+  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, HiArgs);
+
+  SDValue RegSequenceArgs[] = {
+    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+    SDValue(AddLo, 0),
+    Sub0,
+    SDValue(AddHi, 0),
+    Sub1,
+  };
+  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                               MVT::i64, RegSequenceArgs);
+
+  // Replace the carry-use
+  if (ProduceCarry)
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1));
+
+  // Replace the remaining uses.
+  CurDAG->ReplaceAllUsesWith(N, RegSequence);
+  CurDAG->RemoveDeadNode(N);
+}
+
+/// Select FMA_W_CHAIN as V_FMA_F32.
+///
+/// Source operands 1..3 of \p N are run through the VOP3 modifier selectors;
+/// operands 0 and 4 of \p N are forwarded unchanged as the last two operands
+/// (presumably chain and glue, going by the node name -- TODO confirm).
+void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+  // omod, followed by the two operands forwarded from N.
+  SDValue Ops[10];
+
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
+  Ops[8] = N->getOperand(0);
+  Ops[9] = N->getOperand(4);
+
+  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+}
+
+/// Select FMUL_W_CHAIN as V_MUL_F32_e64.
+///
+/// Source operands 1 and 2 of \p N are run through the VOP3 modifier
+/// selectors; operands 0 and 3 of \p N are forwarded unchanged as the last
+/// two operands (presumably chain and glue, going by the node name -- TODO
+/// confirm).
+void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
+  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod, followed by the
+  // two operands forwarded from N.
+  SDValue Ops[8];
+
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  Ops[6] = N->getOperand(0);
+  Ops[7] = N->getOperand(3);
+
+  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
+}
+
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+//
+// Selects DIV_SCALE as V_DIV_SCALE_F32 or V_DIV_SCALE_F64 depending on the
+// node's first result type (which must be f32 or f64); the selected node
+// produces that type plus an i1.
+void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  assert(VT == MVT::f32 || VT == MVT::f64);
+
+  unsigned Opc
+    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
+
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+  // omod
+  SDValue Ops[8];
+
+  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
+  CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+}
+
+/// Decide whether \p Offset may be folded into a DS instruction whose offset
+/// field is \p OffsetBits wide, given the base address \p Base.
+///
+/// Range checks are only applied for 8- and 16-bit fields. On Sea Islands and
+/// newer, or when unsafe DS offset folding is enabled, any in-range offset is
+/// accepted; otherwise the sign bit of \p Base must be known to be zero.
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+                                         unsigned OffsetBits) const {
+  switch (OffsetBits) {
+  case 16:
+    if (!isUInt<16>(Offset))
+      return false;
+    break;
+  case 8:
+    if (!isUInt<8>(Offset))
+      return false;
+    break;
+  default:
+    break;
+  }
+
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+    return true;
+  if (Subtarget->unsafeDSOffsetFoldingEnabled())
+    return true;
+
+  // On Southern Islands instruction with a negative base value and an offset
+  // don't seem to work.
+  return CurDAG->SignBitIsZero(Base);
+}
+
+/// Select a DS address as \p Base plus a single i16 target-constant
+/// \p Offset.
+///
+/// Handles (base + constant), (constant - x) and plain constant addresses
+/// when the resulting offset is legal; otherwise the whole address is used
+/// as the base with a zero offset. Always returns true.
+bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
+                                              SDValue &Offset) const {
+  SDLoc DL(Addr);
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+    if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
+      // (add n0, c0)
+      Base = N0;
+      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+      return true;
+    }
+  } else if (Addr.getOpcode() == ISD::SUB) {
+    // sub C, x -> add (sub 0, x), C
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+      int64_t ByteOffset = C->getSExtValue();
+      if (isUInt<16>(ByteOffset)) {
+        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+        // XXX - This is kind of hacky. Create a dummy sub node so we can check
+        // the known bits in isDSOffsetLegal. We need to emit the selected node
+        // here, so this is thrown away.
+        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+                                      Zero, Addr.getOperand(1));
+
+        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+          MachineSDNode *MachineSub
+            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+                                     Zero, Addr.getOperand(1));
+
+          Base = SDValue(MachineSub, 0);
+          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
+          return true;
+        }
+      }
+    }
+  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+    // If we have a constant address, prefer to put the constant into the
+    // offset. This can save moves to load the constant address since multiple
+    // operations can share the zero base address register, and enables merging
+    // into read2 / write2 instructions.
+
+    if (isUInt<16>(CAddr->getZExtValue())) {
+      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+                                                      DL, MVT::i32, Zero);
+      Base = SDValue(MovZero, 0);
+      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
+      return true;
+    }
+  }
+
+  // default case
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  return true;
+}
+
+// TODO: If offset is too big, put low 16-bit into offset.
+//
+// Select a DS address as Base plus two consecutive i8 dword offsets
+// (Offset1 == Offset0 + 1). Constant byte offsets are divided by 4 to obtain
+// Offset0. Handles (base + constant), (constant - x) and plain constant
+// addresses; otherwise the whole address is the base with offsets 0 and 1.
+// Always returns true.
+bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
+                                                   SDValue &Offset0,
+                                                   SDValue &Offset1) const {
+  SDLoc DL(Addr);
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+    unsigned DWordOffset0 = C1->getZExtValue() / 4;
+    unsigned DWordOffset1 = DWordOffset0 + 1;
+    // (add n0, c0)
+    if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
+      Base = N0;
+      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+      return true;
+    }
+  } else if (Addr.getOpcode() == ISD::SUB) {
+    // sub C, x -> add (sub 0, x), C
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+      unsigned DWordOffset0 = C->getZExtValue() / 4;
+      unsigned DWordOffset1 = DWordOffset0 + 1;
+
+      if (isUInt<8>(DWordOffset0)) {
+        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+        // XXX - This is kind of hacky. Create a dummy sub node so we can check
+        // the known bits in isDSOffsetLegal. We need to emit the selected node
+        // here, so this is thrown away.
+        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+                                      Zero, Addr.getOperand(1));
+
+        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+          MachineSDNode *MachineSub
+            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+                                     Zero, Addr.getOperand(1));
+
+          Base = SDValue(MachineSub, 0);
+          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+          return true;
+        }
+      }
+    }
+  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
+    unsigned DWordOffset1 = DWordOffset0 + 1;
+    assert(4 * DWordOffset0 == CAddr->getZExtValue());
+
+    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
+      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+      MachineSDNode *MovZero
+        = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+                                 DL, MVT::i32, Zero);
+      Base = SDValue(MovZero, 0);
+      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+      return true;
+    }
+  }
+
+  // default case
+
+  // FIXME: This is broken on SI where we still need to check if the base
+  // pointer is positive here.
+  Base = Addr;
+  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
+  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
+  return true;
+}
+
+/// Return true if the zero-extended value of \p Imm fits in 12 unsigned bits.
+static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
+  const uint64_t Value = Imm->getZExtValue();
+  return isUInt<12>(Value);
+}
+
+/// Decompose \p Addr into the operands of a MUBUF instruction.
+///
+/// Returns false when the subtarget prefers flat instructions for global
+/// accesses, true otherwise. \p GLC and \p SLC are only set to 0 if the caller
+/// passed them in empty; \p TFE, \p Idxen and \p Offen are always set to 0.
+/// \p Addr64 is set to 1 when the address is matched as an addr64 form
+/// (\p Ptr + \p VAddr), and \p SOffset is 0 unless it has to carry an
+/// immediate that does not fit the instruction's offset field.
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+ SDValue &VAddr, SDValue &SOffset,
+ SDValue &Offset, SDValue &Offen,
+ SDValue &Idxen, SDValue &Addr64,
+ SDValue &GLC, SDValue &SLC,
+ SDValue &TFE) const {
+ // Subtarget prefers to use flat instruction
+ if (Subtarget->useFlatForGlobal())
+ return false;
+
+ SDLoc DL(Addr);
+
+ // Keep GLC / SLC values supplied by the caller; default them to 0 otherwise.
+ if (!GLC.getNode())
+ GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ if (!SLC.getNode())
+ SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+ Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+ if (N0.getOpcode() == ISD::ADD) {
+ // (add (add N2, N3), C1) -> addr64
+ SDValue N2 = N0.getOperand(0);
+ SDValue N3 = N0.getOperand(1);
+ Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+ Ptr = N2;
+ VAddr = N3;
+ } else {
+ // (add N0, C1) -> offset
+ VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Ptr = N0;
+ }
+
+ if (isLegalMUBUFImmOffset(C1)) {
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ return true;
+ }
+
+ if (isUInt<32>(C1->getZExtValue())) {
+ // Illegal offset, store it in soffset.
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
+ 0);
+ return true;
+ }
+ // NOTE(review): when C1 fits neither form, control falls through to the
+ // generic cases below with Ptr / VAddr / Addr64 already assigned above.
+ }
+
+ if (Addr.getOpcode() == ISD::ADD) {
+ // (add N0, N1) -> addr64
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+ Ptr = N0;
+ VAddr = N1;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ return true;
+ }
+
+ // default case -> offset
+ VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Ptr = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+
+ return true;
+}
+
+/// Match \p Addr as a MUBUF addr64 access.
+///
+/// Succeeds only before Volcanic Islands and only when SelectMUBUF classifies
+/// the address as an addr64 form; \p SRsrc is then built by wrapping the
+/// pointer part with wrapAddr64Rsrc().
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &VAddr, SDValue &SOffset,
+                                           SDValue &Offset, SDValue &GLC,
+                                           SDValue &SLC, SDValue &TFE) const {
+  SDValue Ptr, Offen, Idxen, Addr64;
+
+  // addr64 bit was removed for volcanic islands.
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return false;
+
+  const bool Matched = SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen,
+                                   Idxen, Addr64, GLC, SLC, TFE);
+  if (!Matched)
+    return false;
+
+  // Only the addr64 forms are accepted by this pattern.
+  if (!cast<ConstantSDNode>(Addr64)->getSExtValue())
+    return false;
+
+  SDLoc DL(Addr);
+  const SITargetLowering &Lowering =
+      *static_cast<const SITargetLowering *>(getTargetLowering());
+
+  SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
+  return true;
+}
+
+/// Convenience overload of SelectMUBUFAddr64 that exposes only \p SLC, which
+/// is preset to 0; GLC and TFE are local temporaries and are discarded.
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &VAddr, SDValue &SOffset,
+                                           SDValue &Offset,
+                                           SDValue &SLC) const {
+  SDValue UnusedGLC, UnusedTFE;
+  SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
+
+  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, UnusedGLC, SLC,
+                           UnusedTFE);
+}
+
+/// If \p N is a frame index node, return the equivalent target frame index;
+/// otherwise return \p N unchanged.
+SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  auto *FI = dyn_cast<FrameIndexSDNode>(N);
+  if (!FI)
+    return N;
+
+  return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+}
+
+/// Select the operands of a scratch MUBUF access. Always returns true.
+///
+/// \p Rsrc and \p SOffset are the function's scratch resource and scratch
+/// wave offset registers. A legal constant addend of \p Addr is moved to
+/// \p ImmOffset; the rest of the address (with a frame index folded to a
+/// target frame index) becomes \p VAddr.
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
+                                            SDValue &VAddr, SDValue &SOffset,
+                                            SDValue &ImmOffset) const {
+  SDLoc DL(Addr);
+  const SIMachineFunctionInfo *Info =
+      CurDAG->getMachineFunction().getInfo<SIMachineFunctionInfo>();
+
+  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+  SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
+
+  // Default, "(node)": whole address in vaddr, zero immediate.
+  SDValue VAddrSrc = Addr;
+  uint64_t ImmVal = 0;
+
+  // (add n0, c1)
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    // Offsets in vaddr must be positive.
+    ConstantSDNode *C1 = cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isLegalMUBUFImmOffset(C1)) {
+      VAddrSrc = Addr.getOperand(0);
+      ImmVal = C1->getZExtValue();
+    }
+  }
+
+  VAddr = foldFrameIndex(VAddrSrc);
+  ImmOffset = CurDAG->getTargetConstant(ImmVal, DL, MVT::i16);
+  return true;
+}
+
+/// Match \p Addr as a MUBUF access that needs no vaddr: SelectMUBUF must
+/// succeed with Offen, Idxen and Addr64 all zero. \p SRsrc is then built from
+/// the pointer part with the default data format and an all-ones 32-bit size.
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &SOffset, SDValue &Offset,
+                                           SDValue &GLC, SDValue &SLC,
+                                           SDValue &TFE) const {
+  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  const bool Matched = SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen,
+                                   Idxen, Addr64, GLC, SLC, TFE);
+  if (!Matched)
+    return false;
+
+  // Reject every form that has one of the addressing-mode bits set.
+  if (cast<ConstantSDNode>(Offen)->getSExtValue() ||
+      cast<ConstantSDNode>(Idxen)->getSExtValue() ||
+      cast<ConstantSDNode>(Addr64)->getSExtValue())
+    return false;
+
+  const uint64_t SizeField = APInt::getAllOnesValue(32).getZExtValue(); // Size
+  const uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | SizeField;
+  SDLoc DL(Addr);
+
+  const SITargetLowering &Lowering =
+      *static_cast<const SITargetLowering *>(getTargetLowering());
+
+  SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
+  return true;
+}
+
+/// Convenience overload of SelectMUBUFOffset for patterns that use none of
+/// GLC, SLC or TFE; the three values are local temporaries and are discarded.
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &Soffset,
+                                           SDValue &Offset) const {
+  SDValue UnusedGLC, UnusedSLC, UnusedTFE;
+
+  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, UnusedGLC, UnusedSLC,
+                           UnusedTFE);
+}
+/// Convenience overload of SelectMUBUFOffset that exposes only \p SLC; GLC
+/// and TFE are local temporaries and are discarded. \p SLC is passed through
+/// as given by the caller.
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &Soffset, SDValue &Offset,
+                                           SDValue &SLC) const {
+  SDValue UnusedGLC, UnusedTFE;
+
+  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, UnusedGLC, SLC,
+                           UnusedTFE);
+}
+
+/// Split the constant \p Constant into an immediate part (\p ImmOffset, at
+/// most 4095) and a remainder placed in \p SOffset.
+///
+/// \returns false when a non-zero remainder would be required on Sea Islands
+/// or older; in that case neither output is written.
+bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
+                                             SDValue &SOffset,
+                                             SDValue &ImmOffset) const {
+  SDLoc DL(Constant);
+  uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
+  uint32_t Overflow = 0;
+
+  if (Imm > 4095) {
+    if (Imm > 4095 + 64) {
+      // Try to keep the same value in SOffset for adjacent loads, so that
+      // the corresponding register contents can be re-used.
+      //
+      // Load values with all low-bits set into SOffset, so that a larger
+      // range of values can be covered using s_movk_i32
+      const uint32_t Biased = Imm + 1;
+      const uint32_t High = Biased & ~4095;
+      const uint32_t Low = Biased & 4095;
+      Imm = Low;
+      Overflow = High - 1;
+    } else {
+      // Use an SOffset inline constant for 1..64
+      Overflow = Imm - 4095;
+      Imm = 4095;
+    }
+  }
+
+  // There is a hardware bug in SI and CI which prevents address clamping in
+  // MUBUF instructions from working correctly with SOffsets. The immediate
+  // offset is unaffected.
+  const bool NeedsSOffset = Overflow > 0;
+  if (NeedsSOffset &&
+      Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16);
+
+  // Remainders above 64 are materialized with S_MOV_B32; smaller ones are
+  // emitted directly as a target constant.
+  if (Overflow > 64) {
+    SDValue OverflowImm = CurDAG->getTargetConstant(Overflow, DL, MVT::i32);
+    SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                                             OverflowImm),
+                      0);
+  } else {
+    SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32);
+  }
+
+  return true;
+}
+
+/// Match a constant intrinsic offset and split it into \p SOffset and
+/// \p ImmOffset via SelectMUBUFConstant.
+/// \returns false if \p Offset is not a constant or cannot be split.
+bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
+                                                    SDValue &SOffset,
+                                                    SDValue &ImmOffset) const {
+  if (!isa<ConstantSDNode>(Offset))
+    return false;
+
+  return SelectMUBUFConstant(Offset, SOffset, ImmOffset);
+}
+
+/// Match an intrinsic offset that needs a voffset operand.
+///
+/// Constant offsets are rejected unless the target is Sea Islands or older
+/// and SelectMUBUFConstant cannot encode them. For (base + non-negative
+/// constant) the constant is split into \p SOffset / \p ImmOffset and the
+/// base becomes \p VOffset; otherwise the whole offset is the voffset and
+/// both constants are 0.
+bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
+                                                     SDValue &SOffset,
+                                                     SDValue &ImmOffset,
+                                                     SDValue &VOffset) const {
+  SDLoc DL(Offset);
+
+  // Don't generate an unnecessary voffset for constant offsets.
+  if (isa<ConstantSDNode>(Offset)) {
+    if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS)
+      return false;
+
+    // When necessary, use a voffset in <= CI anyway to work around a hardware
+    // bug.
+    SDValue ScratchSOffset, ScratchImm;
+    if (SelectMUBUFConstant(Offset, ScratchSOffset, ScratchImm))
+      return false;
+  }
+
+  if (CurDAG->isBaseWithConstantOffset(Offset)) {
+    SDValue BasePart = Offset.getOperand(0);
+    SDValue ConstPart = Offset.getOperand(1);
+    const bool NonNegative =
+        cast<ConstantSDNode>(ConstPart)->getSExtValue() >= 0;
+    if (NonNegative && SelectMUBUFConstant(ConstPart, SOffset, ImmOffset)) {
+      VOffset = BasePart;
+      return true;
+    }
+  }
+
+  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  VOffset = Offset;
+
+  return true;
+}
+
+/// Select a flat address: the address is used unchanged as \p VAddr, and
+/// \p SLC and \p TFE share a single zero i1 target constant. Always returns
+/// true.
+bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
+                                    SDValue &VAddr,
+                                    SDValue &SLC,
+                                    SDValue &TFE) const {
+  VAddr = Addr;
+
+  SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+  SLC = Zero;
+  TFE = Zero;
+  return true;
+}
+
+/// Check whether \p EncodedOffset fits the SMRD immediate offset field:
+/// 8 unsigned bits before Volcanic Islands, 20 unsigned bits from Volcanic
+/// Islands on.
+///
+/// \param EncodedOffset This is the immediate value that will be encoded
+///        directly into the instruction. On SI/CI the \p EncodedOffset
+///        will be in units of dwords and on VI+ it will be units of bytes.
+static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST,
+                                 int64_t EncodedOffset) {
+  if (ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return isUInt<8>(EncodedOffset);
+  return isUInt<20>(EncodedOffset);
+}
+
+/// Select the offset operand of an SMRD instruction from a constant byte
+/// offset.
+///
+/// \p Imm is set to true when the offset fits the instruction's immediate
+/// field, and to false when it is emitted as a 32-bit target constant (Sea
+/// Islands) or moved into a register with S_MOV_B32 (other generations).
+/// \returns false for non-constant offsets and for offsets that do not fit
+/// in 32 unsigned bits; \p Imm is left unwritten in that case.
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+                                          SDValue &Offset, bool &Imm) const {
+  // FIXME: Handle non-constant offsets.
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
+  if (!C)
+    return false;
+
+  SDLoc SL(ByteOffsetNode);
+  const AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
+  const int64_t ByteOffset = C->getSExtValue();
+
+  // Before Volcanic Islands the encoded value is the byte offset shifted
+  // right by 2.
+  int64_t EncodedOffset = ByteOffset;
+  if (Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    EncodedOffset = ByteOffset >> 2;
+
+  if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
+    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+    Imm = true;
+    return true;
+  }
+
+  if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
+    return false;
+
+  // Both values are known to fit in 32 unsigned bits from here on.
+  if (Gen == AMDGPUSubtarget::SEA_ISLANDS) {
+    // 32-bit Immediates are supported on Sea Islands.
+    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+  } else {
+    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+    Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
+                                            C32Bit), 0);
+  }
+  Imm = false;
+  return true;
+}
+
+/// Split an SMRD address into \p SBase and \p Offset. Always returns true.
+///
+/// For (base + constant) the constant is handed to SelectSMRDOffset; if that
+/// fails, or the address has another shape, the whole address becomes the
+/// base with a zero immediate offset (\p Imm == true).
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+                                    SDValue &Offset, bool &Imm) const {
+  SDLoc SL(Addr);
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue BasePart = Addr.getOperand(0);
+    SDValue OffsetPart = Addr.getOperand(1);
+
+    const bool Split = SelectSMRDOffset(OffsetPart, Offset, Imm);
+    if (Split) {
+      SBase = BasePart;
+      return true;
+    }
+  }
+
+  // No usable constant offset: fall back to base-only addressing.
+  SBase = Addr;
+  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+  Imm = true;
+  return true;
+}
+
+/// Match an SMRD address whose offset was selected as an immediate.
+bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
+                                       SDValue &Offset) const {
+  bool IsImm;
+  if (!SelectSMRD(Addr, SBase, Offset, IsImm))
+    return false;
+  return IsImm;
+}
+
+/// Match an SMRD address whose offset is a constant node that was not
+/// selected as an immediate. Only ever succeeds on Sea Islands.
+bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
+                                         SDValue &Offset) const {
+  const bool IsSeaIslands =
+      Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS;
+  if (!IsSeaIslands)
+    return false;
+
+  bool IsImm;
+  if (!SelectSMRD(Addr, SBase, Offset, IsImm))
+    return false;
+
+  if (IsImm)
+    return false;
+  return isa<ConstantSDNode>(Offset);
+}
+
+/// Match an SMRD address whose offset was selected neither as an immediate
+/// nor as a constant node.
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
+                                        SDValue &Offset) const {
+  bool IsImm;
+  if (!SelectSMRD(Addr, SBase, Offset, IsImm))
+    return false;
+  if (IsImm)
+    return false;
+  return !isa<ConstantSDNode>(Offset);
+}
+
+/// Match an SMRD buffer offset that was selected as an immediate.
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
+                                             SDValue &Offset) const {
+  bool IsImm;
+  if (!SelectSMRDOffset(Addr, Offset, IsImm))
+    return false;
+  return IsImm;
+}
+
+/// Match an SMRD buffer offset that is a constant node but was not selected
+/// as an immediate. Only ever succeeds on Sea Islands.
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
+                                               SDValue &Offset) const {
+  const bool IsSeaIslands =
+      Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS;
+  if (!IsSeaIslands)
+    return false;
+
+  bool IsImm;
+  if (!SelectSMRDOffset(Addr, Offset, IsImm))
+    return false;
+
+  if (IsImm)
+    return false;
+  return isa<ConstantSDNode>(Offset);
+}
+
+/// Match an SMRD buffer offset that was selected neither as an immediate nor
+/// as a constant node.
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
+                                              SDValue &Offset) const {
+  bool IsImm;
+  if (!SelectSMRDOffset(Addr, Offset, IsImm))
+    return false;
+  if (IsImm)
+    return false;
+  return !isa<ConstantSDNode>(Offset);
+}
+
+/// Split a MOVREL index into \p Base and an i32 target-constant \p Offset.
+///
+/// (base + constant) yields the base and the constant; any other
+/// non-constant index is used as the base with a zero offset.
+/// \returns false if \p Index is itself a constant.
+bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
+                                            SDValue &Base,
+                                            SDValue &Offset) const {
+  SDLoc DL(Index);
+
+  // (add n0, c0)
+  if (CurDAG->isBaseWithConstantOffset(Index)) {
+    ConstantSDNode *ConstPart = cast<ConstantSDNode>(Index.getOperand(1));
+
+    Base = Index.getOperand(0);
+    Offset = CurDAG->getTargetConstant(ConstPart->getZExtValue(), DL,
+                                       MVT::i32);
+    return true;
+  }
+
+  const bool IsPureConstant = isa<ConstantSDNode>(Index);
+  if (IsPureConstant)
+    return false;
+
+  Base = Index;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  return true;
+}
+
+/// Build an S_BFE_I32 / S_BFE_U32 machine node (selected by \p Opcode) that
+/// extracts \p Width bits of \p Val starting at bit \p Offset.
+SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
+                                     SDValue Val, uint32_t Offset,
+                                     uint32_t Width) {
+  // Transformation function, pack the offset and width of a BFE into
+  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+  // source, bits [5:0] contain the offset and bits [22:16] the width.
+  const uint32_t WidthField = Width << 16;
+  const uint32_t Packed = Offset | WidthField;
+
+  return CurDAG->getMachineNode(
+      Opcode, DL, MVT::i32, Val,
+      CurDAG->getTargetConstant(Packed, DL, MVT::i32));
+}
+
+/// Try to select a right shift of a left shift as a scalar bitfield extract:
+///
+///   ((a << b) srl c)  --->  BFE_U32 a, (c - b), (32 - c)
+///   ((a << b) sra c)  --->  BFE_I32 a, (c - b), (32 - c)
+///
+/// Predicate: b and c are constants with 0 < b <= c < 32. If the predicate
+/// does not hold the node is passed to SelectCode().
+void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
+  const SDValue &Shl = N->getOperand(0);
+  ConstantSDNode *InnerAmt = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
+  ConstantSDNode *OuterAmt = dyn_cast<ConstantSDNode>(N->getOperand(1));
+
+  if (InnerAmt && OuterAmt) {
+    const uint32_t BVal = InnerAmt->getZExtValue();
+    const uint32_t CVal = OuterAmt->getZExtValue();
+
+    const bool InRange = BVal != 0 && BVal <= CVal && CVal < 32;
+    if (InRange) {
+      // An arithmetic shift gives the signed extract, a logical one the
+      // unsigned extract.
+      unsigned Opcode = AMDGPU::S_BFE_U32;
+      if (N->getOpcode() == ISD::SRA)
+        Opcode = AMDGPU::S_BFE_I32;
+
+      ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
+                              32 - CVal));
+      return;
+    }
+  }
+
+  SelectCode(N);
+}
+
+// Tries to select N as a scalar bitfield extract (S_BFE_U32 / S_BFE_I32).
+// Handles AND-of-SRL, SRL-of-AND, SRL/SRA-of-SHL and SIGN_EXTEND_INREG-of-SRL.
+// Any node that does not match one of these shapes is handed to SelectCode.
+void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
+ switch (N->getOpcode()) {
+ case ISD::AND:
+ if (N->getOperand(0).getOpcode() == ISD::SRL) {
+ // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
+ // Predicate: isMask(mask)
+ const SDValue &Srl = N->getOperand(0);
+ ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
+ ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+
+ if (Shift && Mask) {
+ uint32_t ShiftVal = Shift->getZExtValue();
+ uint32_t MaskVal = Mask->getZExtValue();
+
+ if (isMask_32(MaskVal)) {
+ uint32_t WidthVal = countPopulation(MaskVal);
+
+ ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+ Srl.getOperand(0), ShiftVal, WidthVal));
+ return;
+ }
+ }
+ }
+ break;
+ case ISD::SRL:
+ if (N->getOperand(0).getOpcode() == ISD::AND) {
+ // "((a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
+ // Predicate: isMask(mask >> b)
+ const SDValue &And = N->getOperand(0);
+ ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
+
+ if (Shift && Mask) {
+ uint32_t ShiftVal = Shift->getZExtValue();
+ uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
+
+ if (isMask_32(MaskVal)) {
+ uint32_t WidthVal = countPopulation(MaskVal);
+
+ ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+ And.getOperand(0), ShiftVal, WidthVal));
+ return;
+ }
+ }
+ } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
+ SelectS_BFEFromShifts(N);
+ return;
+ }
+ break;
+ case ISD::SRA:
+ if (N->getOperand(0).getOpcode() == ISD::SHL) {
+ SelectS_BFEFromShifts(N);
+ return;
+ }
+ break;
+
+ case ISD::SIGN_EXTEND_INREG: {
+ // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
+ SDValue Src = N->getOperand(0);
+ if (Src.getOpcode() != ISD::SRL)
+ break;
+
+ const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
+ if (!Amt)
+ break;
+
+ // The extract width is the bit size of the type being sign-extended from.
+ // NOTE(review): the shift amount is passed through without a range check;
+ // presumably callers only reach this with amounts below 32 - confirm.
+ unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+ ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
+ Amt->getZExtValue(), Width));
+ return;
+ }
+ }
+
+ SelectCode(N);
+}
+
+// Returns true when the BRCOND node N has a single use and its condition
+// (looked through a CopyToReg) is a single-use SETCC comparing i32 values, or
+// comparing i64 values for (in)equality on subtargets that report
+// hasScalarCompareEq64().
+bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
+  assert(N->getOpcode() == ISD::BRCOND);
+  if (!N->hasOneUse())
+    return false;
+
+  SDValue Cond = N->getOperand(1);
+  if (Cond.getOpcode() == ISD::CopyToReg)
+    Cond = Cond.getOperand(2);
+
+  const bool IsSingleUseSetCC =
+      Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse();
+  if (!IsSingleUseSetCC)
+    return false;
+
+  const MVT CmpVT = Cond.getOperand(0).getSimpleValueType();
+  if (CmpVT == MVT::i32)
+    return true;
+  if (CmpVT != MVT::i64)
+    return false;
+
+  // 64-bit compares qualify only for equality / inequality.
+  const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+  if (CC != ISD::SETEQ && CC != ISD::SETNE)
+    return false;
+
+  return static_cast<const SISubtarget *>(Subtarget)->hasScalarCompareEq64();
+}
+
+// Selects a BRCOND node. An undef condition becomes SI_BR_UNDEF; branches
+// accepted by isCBranchSCC are left to the generated matcher; everything else
+// copies the condition into VCC and branches with S_CBRANCH_VCCNZ.
+void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
+ SDValue Cond = N->getOperand(1);
+
+ if (Cond.isUndef()) {
+ CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
+ N->getOperand(2), N->getOperand(0));
+ return;
+ }
+
+ if (isCBranchSCC(N)) {
+ // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
+ SelectCode(N);
+ return;
+ }
+
+ SDLoc SL(N);
+
+ // The copy is built on N's incoming chain (operand 0); its result value 0 is
+ // then passed as the last operand of the branch.
+ SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, Cond);
+ CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
+ N->getOperand(2), // Basic Block
+ VCC.getValue(0));
+}
+
+// This is here because there isn't a way to use the generated sub0_sub1 as the
+// subreg index to EXTRACT_SUBREG in tablegen.
+//
+// Manually selects an atomic compare-and-swap node N. Flat-address accesses
+// are left to the generated matcher. Otherwise the ADDR64 MUBUF form is tried
+// first (when the subtarget has it), then the OFFSET form; if neither
+// addressing mode matches, the generated matcher is used. On success N's
+// value result is replaced by a subregister extract of the new node's result
+// 0 and N's chain result by the new node's result 1.
+void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
+  MemSDNode *Mem = cast<MemSDNode>(N);
+  unsigned AS = Mem->getAddressSpace();
+  if (AS == AMDGPUAS::FLAT_ADDRESS) {
+    SelectCode(N);
+    return;
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  bool Is32 = (VT == MVT::i32);
+  SDLoc SL(N);
+
+  MachineSDNode *CmpSwap = nullptr;
+  if (Subtarget->hasAddr64()) {
+    SDValue SRsrc, VAddr, SOffset, Offset, SLC;
+
+    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
+      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 :
+        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64;
+      SDValue CmpVal = Mem->getOperand(2);
+
+      // XXX - Do we care about glue operands?
+
+      SDValue Ops[] = {
+        CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+      };
+
+      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
+    }
+  }
+
+  // Fall back to the offset-only addressing form.
+  if (!CmpSwap) {
+    SDValue SRsrc, SOffset, Offset, SLC;
+    if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
+      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET :
+        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET;
+
+      SDValue CmpVal = Mem->getOperand(2);
+      SDValue Ops[] = {
+        CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+      };
+
+      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
+    }
+  }
+
+  if (!CmpSwap) {
+    SelectCode(N);
+    return;
+  }
+
+  // Carry the original memory operand over to the new machine node.
+  MachineSDNode::mmo_iterator MMOs = MF->allocateMemRefsArray(1);
+  *MMOs = Mem->getMemOperand();
+  CmpSwap->setMemRefs(MMOs, MMOs + 1);
+
+  // Pull the part of the machine node's result that has N's value type.
+  unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+  SDValue Extract
+    = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));
+
+  ReplaceUses(SDValue(N, 0), Extract);
+  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
+// Peels an optional FNEG and then an optional FABS off In, returning the
+// remaining value in Src and the collected SISrcMods bits as an i32 target
+// constant in SrcMods. Always succeeds.
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
+                                        SDValue &SrcMods) const {
+  SDValue Stripped = In;
+  unsigned ModBits = 0;
+
+  if (Stripped.getOpcode() == ISD::FNEG) {
+    ModBits |= SISrcMods::NEG;
+    Stripped = Stripped.getOperand(0);
+  }
+
+  if (Stripped.getOpcode() == ISD::FABS) {
+    ModBits |= SISrcMods::ABS;
+    Stripped = Stripped.getOperand(0);
+  }
+
+  Src = Stripped;
+  SrcMods = CurDAG->getTargetConstant(ModBits, SDLoc(In), MVT::i32);
+  return true;
+}
+
+// Like SelectVOP3Mods, but only matches when no source modifier bit was set.
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
+                                          SDValue &SrcMods) const {
+  if (!SelectVOP3Mods(In, Src, SrcMods))
+    return false;
+  return cast<ConstantSDNode>(SrcMods)->isNullValue();
+}
+
+// Matches source modifiers through SelectVOP3Mods and sets both Clamp and Omod
+// to zero i32 target constants.
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
+                                         SDValue &SrcMods, SDValue &Clamp,
+                                         SDValue &Omod) const {
+  SDLoc DL(In);
+  // FIXME: Handle Clamp and Omod
+  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  Omod = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+  const bool Matched = SelectVOP3Mods(In, Src, SrcMods);
+  return Matched;
+}
+
+// Like SelectVOP3Mods0, but only matches when the source modifiers, clamp and
+// output modifier constants are all zero.
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src,
+                                           SDValue &SrcMods, SDValue &Clamp,
+                                           SDValue &Omod) const {
+  if (!SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod))
+    return false;
+
+  if (!cast<ConstantSDNode>(SrcMods)->isNullValue())
+    return false;
+  if (!cast<ConstantSDNode>(Clamp)->isNullValue())
+    return false;
+  return cast<ConstantSDNode>(Omod)->isNullValue();
+}
+
+// Matches source modifiers through SelectVOP3Mods and sets Omod to a zero i32
+// target constant; there is no Clamp output in this variant.
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
+                                              SDValue &SrcMods,
+                                              SDValue &Omod) const {
+  // FIXME: Handle Omod
+  SDValue ZeroOmod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+  Omod = ZeroOmod;
+
+  return SelectVOP3Mods(In, Src, SrcMods);
+}
+
+// Matches source modifiers through SelectVOP3Mods and sets Clamp and Omod to
+// the same zero i32 target constant.
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
+                                                   SDValue &SrcMods,
+                                                   SDValue &Clamp,
+                                                   SDValue &Omod) const {
+  SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+  Omod = Zero;
+  Clamp = Omod;
+  return SelectVOP3Mods(In, Src, SrcMods);
+}
+
+// Runs the target's PostISelFolding hook over every machine node in the DAG,
+// repeating whole passes until a pass replaces nothing. Dead nodes are removed
+// after each pass.
+void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
+ const AMDGPUTargetLowering& Lowering =
+ *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
+ bool IsModified = false;
+ do {
+ IsModified = false;
+ // Go over all selected nodes and try to fold them a bit more
+ for (SDNode &Node : CurDAG->allnodes()) {
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
+ if (!MachineNode)
+ continue;
+
+ // A returned node different from the input means the hook produced a
+ // replacement; redirect all uses to it.
+ SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
+ if (ResNode != &Node) {
+ ReplaceUses(&Node, ResNode);
+ IsModified = true;
+ }
+ }
+ CurDAG->RemoveDeadNodes();
+ } while (IsModified);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
new file mode 100644
index 000000000000..a87204d46eae
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -0,0 +1,3176 @@
+//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This is the parent TargetLowering class for hardware code gen
+/// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUISelLowering.h"
+#include "AMDGPU.h"
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600MachineFunctionInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "SIInstrInfo.h"
+using namespace llvm;
+
+// Custom calling-convention handler: reserves space for one kernel argument
+// in the function's AMDGPUMachineFunction info and records the resulting
+// offset as a custom memory location. Always returns true.
+static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  AMDGPUMachineFunction *KernInfo =
+      State.getMachineFunction().getInfo<AMDGPUMachineFunction>();
+
+  const uint64_t ArgOffset =
+      KernInfo->allocateKernArg(LocVT.getStoreSize(), ArgFlags.getOrigAlign());
+
+  State.addLoc(
+      CCValAssign::getCustomMem(ValNo, ValVT, ArgOffset, LocVT, LocInfo));
+  return true;
+}
+
+#include "AMDGPUGenCallingConv.inc"
+
+// Find a larger type to do a load / store of a vector with: an integer of the
+// same store size when that is at most 32 bits, otherwise a vector of i32
+// covering the store size (which must then be a multiple of 32 bits).
+EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
+  const unsigned Bits = VT.getStoreSizeInBits();
+
+  if (Bits > 32) {
+    assert(Bits % 32 == 0 && "Store size not a multiple of 32");
+    return EVT::getVectorVT(Ctx, MVT::i32, Bits / 32);
+  }
+
+  return EVT::getIntegerVT(Ctx, Bits);
+}
+
+// Sets up the legalization actions, load/store type promotions, tuning knobs
+// and target DAG combines configured by this parent lowering class. The
+// subtarget is kept in Subtarget and queried for feature-dependent choices.
+AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
+ const AMDGPUSubtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
+ // Lower floating point store/load to integer store/load to reduce the number
+ // of patterns in tablegen.
+ setOperationAction(ISD::LOAD, MVT::f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
+
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
+ setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+
+ setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+
+ setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
+
+ setOperationAction(ISD::LOAD, MVT::i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
+
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::LOAD, MVT::f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
+
+ setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+
+ // There are no 64-bit extloads. These should be done as a 32-bit extload and
+ // an extension to 64-bit.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+ }
+
+ // For every other integer result type: i1 sources are promoted, i8 / i16
+ // sources are legal and i32 sources are expanded.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ if (VT == MVT::i64)
+ continue;
+
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+ }
+
+ // Extending loads from small integer vectors are expanded for every integer
+ // vector result type.
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+ }
+
+ // Floating point extending loads (f16->f32, f32->f64, f16->f64) are expanded.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
+
+ // Stores get the same type promotions as the loads above.
+ setOperationAction(ISD::STORE, MVT::f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
+
+ setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v4f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
+
+ setOperationAction(ISD::STORE, MVT::v8f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v16f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::STORE, MVT::f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v2f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
+
+ // Truncating stores: a few small-vector cases are custom lowered, the rest
+ // listed here are expanded.
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+ setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+
+ setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
+ setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
+
+ setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
+ setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
+
+
+ setOperationAction(ISD::Constant, MVT::i32, Legal);
+ setOperationAction(ISD::Constant, MVT::i64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+ // This is totally unsupported, just custom lower to produce an error.
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
+ // We need to custom lower some of the intrinsics
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+ // Library functions. These default to Expand, but we have instructions
+ // for them.
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::FEXP2, MVT::f32, Legal);
+ setOperationAction(ISD::FPOW, MVT::f32, Legal);
+ setOperationAction(ISD::FLOG2, MVT::f32, Legal);
+ setOperationAction(ISD::FABS, MVT::f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+
+ setOperationAction(ISD::FROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+
+ setOperationAction(ISD::FREM, MVT::f32, Custom);
+ setOperationAction(ISD::FREM, MVT::f64, Custom);
+
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ // Expand to fneg + fadd.
+ setOperationAction(ISD::FSUB, MVT::f64, Expand);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+
+ // Generations before SEA_ISLANDS custom lower the f64 rounding operations.
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
+ }
+
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
+
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
+
+ const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+ for (MVT VT : ScalarIntVTs) {
+ // These should use [SU]DIVREM, so set them to expand
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+
+ // GPU does not have divrem function for signed or unsigned.
+ setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Custom);
+
+ // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ // The hardware supports 32-bit ROTR, but not ROTL.
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+
+ setOperationAction(ISD::MUL, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::i64, Expand);
+ setOperationAction(ISD::MULHS, MVT::i64, Expand);
+ // NOTE(review): UDIV / UREM for i32 were already set to Expand by the
+ // ScalarIntVTs loop above, so the next two calls look redundant.
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+
+ setOperationAction(ISD::SMIN, MVT::i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::i32, Legal);
+ setOperationAction(ISD::SMAX, MVT::i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::i32, Legal);
+
+ // These override the Expand set for i32 / i64 CTLZ in the loop above where
+ // the same (operation, type) pair is named again.
+ if (Subtarget->hasFFBH())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
+
+ setOperationAction(ISD::CTLZ, MVT::i64, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+
+ // We only really have 32-bit BFE instructions (and 16-bit on VI).
+ //
+ // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
+ // effort to match them now. We want this to be false for i64 cases when the
+ // extraction isn't restricted to the upper or lower half. Ideally we would
+ // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
+ // span the midpoint are probably relatively rare, so don't worry about them
+ // for now.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
+ static const MVT::SimpleValueType VectorIntTypes[] = {
+ MVT::v2i32, MVT::v4i32
+ };
+
+ for (MVT VT : VectorIntTypes) {
+ // Expand the following operations for the current type by default.
+ setOperationAction(ISD::ADD, VT, Expand);
+ setOperationAction(ISD::AND, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::OR, VT, Expand);
+ setOperationAction(ISD::SHL, VT, Expand);
+ setOperationAction(ISD::SRA, VT, Expand);
+ setOperationAction(ISD::SRL, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::SUB, VT, Expand);
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ // NOTE(review): SDIVREM is Custom while UDIVREM is Expand for these vector
+ // types - confirm the asymmetry is intended.
+ setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::ADDC, VT, Expand);
+ setOperationAction(ISD::SUBC, VT, Expand);
+ setOperationAction(ISD::ADDE, VT, Expand);
+ setOperationAction(ISD::SUBE, VT, Expand);
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::XOR, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+ }
+
+ static const MVT::SimpleValueType FloatVectorTypes[] = {
+ MVT::v2f32, MVT::v4f32
+ };
+
+ // Floating point vector operations are expanded for these types as well.
+ for (MVT VT : FloatVectorTypes) {
+ setOperationAction(ISD::FABS, VT, Expand);
+ setOperationAction(ISD::FMINNUM, VT, Expand);
+ setOperationAction(ISD::FMAXNUM, VT, Expand);
+ setOperationAction(ISD::FADD, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
+ setOperationAction(ISD::FMUL, VT, Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSUB, VT, Expand);
+ setOperationAction(ISD::FNEG, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+ }
+
+ // This causes using an unrolled select operation rather than expansion with
+ // bit operations. This is in general better, but the alternative using BFI
+ // instructions may be better if the select sources are SGPRs.
+ setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
+
+ setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
+
+ // There are no libcalls of any kind.
+ for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
+ setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ setSchedulingPreference(Sched::RegPressure);
+ setJumpIsExpensive(true);
+ setHasMultipleConditionRegisters(true);
+
+ // SI at least has hardware support for floating point exceptions, but no way
+ // of using or handling them is implemented. They are also optional in OpenCL
+ // (Section 7.3)
+ setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
+
+ PredictableSelectIsExpensive = false;
+
+ // We want to find all load dependencies for long chains of stores to enable
+ // merging into very wide vectors. The problem is with vectors with > 4
+ // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
+ // vectors are a legal type, even though we have to split the loads
+ // usually. When we can more precisely specify load legality per address
+ // space, we should be able to make FindBetterChain/MergeConsecutiveStores
+ // smarter so that they can figure out what to do in 2 iterations without all
+ // N > 4 stores on the same chain.
+ GatherAllAliasesMaxDepth = 16;
+
+ // FIXME: Need to really handle these.
+ MaxStoresPerMemcpy = 4096;
+ MaxStoresPerMemmove = 4096;
+ MaxStoresPerMemset = 4096;
+
+ // Node kinds for which the target-specific DAG combine hook is requested.
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::MULHU);
+ setTargetDAGCombine(ISD::MULHS);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SELECT_CC);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Information
+//===----------------------------------------------------------------------===//
+
+// Vector element indices are always i32, independent of the data layout.
+MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
+ return MVT::i32;
+}
+
+// Reports every kind of select as supported; SelType is not inspected.
+bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
+ return true;
+}
+
+// The backend supports 32 and 64 bit floating point immediates, plus 16 bit
+// ones when the subtarget has 16-bit instructions. The value of Imm itself is
+// not examined; only the scalar type of VT matters.
+// FIXME: Why are we reporting vectors of FP immediates as legal?
+bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  const EVT EltVT = VT.getScalarType();
+  if (EltVT == MVT::f32 || EltVT == MVT::f64)
+    return true;
+  return EltVT == MVT::f16 && Subtarget->has16BitInsts();
+}
+
+// We don't want to shrink f64 / f32 constants; any other scalar type may be
+// shrunk.
+bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+  const EVT EltVT = VT.getScalarType();
+  return !(EltVT == MVT::f32 || EltVT == MVT::f64);
+}
+
+// Decides whether narrowing the load N to NewVT is worthwhile. The extension
+// kind is ignored.
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
+                                                 ISD::LoadExtType,
+                                                 EVT NewVT) const {
+  // If we are reducing to a 32-bit load, this is always better.
+  if (NewVT.getStoreSizeInBits() == 32)
+    return true;
+
+  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
+  // extloads, so doing one requires using a buffer_load. In cases where we
+  // still couldn't use a scalar load, using the wider load shouldn't really
+  // hurt anything.
+  //
+  // If the old size already had to be an extload, there's no harm in
+  // continuing to reduce the width.
+  const unsigned OldBits = N->getValueType(0).getStoreSizeInBits();
+  return OldBits < 32;
+}
+
+// Returns whether loading as CastTy instead of LoadTy is preferable. Loads
+// whose scalar type is already i32 are never recast; otherwise the cast is
+// beneficial when it widens the scalar or yields scalars of at least 32 bits.
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
+                                                   EVT CastTy) const {
+  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
+
+  if (LoadTy.getScalarType() == MVT::i32)
+    return false;
+
+  const unsigned LoadEltBits = LoadTy.getScalarSizeInBits();
+  const unsigned CastEltBits = CastTy.getScalarSizeInBits();
+
+  if (LoadEltBits < CastEltBits)
+    return true;
+  return CastEltBits >= 32;
+}
+
+// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
+// profitable with the expansion for 64-bit since it's generally good to
+// speculate things.
+// FIXME: These should really have the size as a parameter.
+//
+// Unconditionally reports cttz as cheap to speculate.
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
+ return true;
+}
+
+// Unconditionally reports ctlz as cheap to speculate.
+bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
+ return true;
+}
+
+//===---------------------------------------------------------------------===//
+// Target Properties
+//===---------------------------------------------------------------------===//
+
+// fabs is reported as free for f32 and f64, and for f16 when the subtarget has
+// 16-bit instructions. VT must be a floating point type.
+bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  if (VT == MVT::f32 || VT == MVT::f64)
+    return true;
+  return Subtarget->has16BitInsts() && VT == MVT::f16;
+}
+
+// fneg is free for exactly the types for which isFAbsFree reports fabs free.
+bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
+ return isFAbsFree(VT);
+}
+
+// Storing a vector constant is always reported as cheap; none of the
+// parameters are inspected.
+bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
+                                                        unsigned NumElem,
+                                                        unsigned AS) const {
+  return true;
+}
+
+// Always prefers build_vector sources; VecVT is not inspected.
+bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
+ // There are few operations which truly have vector input operands. Any vector
+ // operation is going to involve operations on each component, and a
+ // build_vector will be a copy per element, so it always makes sense to use a
+ // build_vector input in place of the extracted element to avoid a copy into a
+ // super register.
+ //
+ // We should probably only do this if all users are extracts only, but this
+ // should be the common case.
+ return true;
+}
+
+/// Return true if truncating \p Source to \p Dest is free. A truncate is just
+/// a subregister access, so it is free when the result is strictly narrower
+/// than the source and its size is a multiple of 32 bits.
+bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
+  const unsigned FromBits = Source.getSizeInBits();
+  const unsigned ToBits = Dest.getSizeInBits();
+
+  if (ToBits >= FromBits)
+    return false;
+  return ToBits % 32 == 0;
+}
+
+/// IR-type overload of isTruncateFree, working on scalar (element) sizes.
+/// A truncate is just a subregister access.
+bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
+  const unsigned FromBits = Source->getScalarSizeInBits();
+  const unsigned ToBits = Dest->getScalarSizeInBits();
+
+  // With 16-bit instructions, truncating to 16 bits is free from any source
+  // that is at least 32 bits wide.
+  const bool To16BitInsts = ToBits == 16 && Subtarget->has16BitInsts();
+  if (To16BitInsts)
+    return FromBits >= 32;
+
+  // Otherwise the result must be strictly narrower and a multiple of 32 bits.
+  if (ToBits >= FromBits)
+    return false;
+  return ToBits % 32 == 0;
+}
+
+/// IR-type overload: return true if zero-extending \p Src to \p Dest is free,
+/// judged on scalar (element) sizes.
+bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
+  const unsigned FromBits = Src->getScalarSizeInBits();
+  const unsigned ToBits = Dest->getScalarSizeInBits();
+
+  // With 16-bit instructions, extending a 16-bit value to 32 bits or more is
+  // free.
+  const bool From16BitInsts = FromBits == 16 && Subtarget->has16BitInsts();
+  if (From16BitInsts)
+    return ToBits >= 32;
+
+  // Otherwise only the 32-bit to 64-bit extension is free.
+  if (FromBits != 32)
+    return false;
+  return ToBits == 64;
+}
+
+/// Value-type overload: return true if zero-extending \p Src to \p Dest is
+/// free.
+///
+/// Any register load of a 64-bit value really requires two 32-bit moves, so
+/// for all practical purposes the extra "mov 0" needed to widen to 64 bits is
+/// free. As used, this enables reducing 64-bit operations to 32-bit ones,
+/// which is always good.
+bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
+  const bool DestIsI64 = Dest == MVT::i64;
+
+  // i16 extends for free to either i32 or i64.
+  if (Src == MVT::i16)
+    return DestIsI64 || Dest == MVT::i32;
+
+  // Apart from that, only i32 -> i64 is free.
+  return DestIsI64 && Src == MVT::i32;
+}
+
+bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { // SDValue overload: decided purely by Val's value type.
+ return isZExtFree(Val.getValueType(), VT2); // Forwards to the (EVT, EVT) overload.
+}
+
+/// Return true if narrowing an operation from \p SrcVT to \p DestVT pays off.
+///
+/// There aren't really 64-bit registers, only pairs of 32-bit ones and a
+/// limited number of native 64-bit operations, so shrinking an operation to
+/// fit in a single 32-bit register should always help. As currently used this
+/// hook is much less general than its name suggests: it is only consulted by
+/// code trying to reduce the size of loads. Shrinking loads below 32 bits is
+/// not profitable and may actually be harmful.
+bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  // Only a wider-than-32-bit source narrowed to exactly 32 bits qualifies.
+  if (SrcVT.getSizeInBits() <= 32)
+    return false;
+  return DestVT.getSizeInBits() == 32;
+}
+
+//===---------------------------------------------------------------------===//
+// TargetLowering Callbacks
+//===---------------------------------------------------------------------===//
+
+/// The SelectionDAGBuilder will automatically promote function arguments
+/// with illegal types. However, this does not work for the AMDGPU targets
+/// since the function arguments are stored in memory as these illegal types.
+/// In order to handle this properly we need to get the original type sizes
+/// from the LLVM IR Function and fix up the ISD::InputArg values before
+/// passing them to AnalyzeFormalArguments().
+///
+/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
+/// input values across multiple registers. Each item in the Ins array
+/// represents a single value that will be stored in registers. Ins[x].VT is
+/// the value type of the value that will be stored in the register, so
+/// whatever SDNode we lower the argument to needs to be this type.
+///
+/// In order to correctly lower the arguments we need to know the size of each
+/// argument. Since Ins[x].VT gives us the size of the register that will
+/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
+/// for the original function argument so that we can deduce the correct memory
+/// type to use for Ins[x]. In most cases the correct memory type will be
+/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
+/// we have a kernel argument of type v8i8, this argument will be split into
+/// 8 parts and each part will be represented by its own item in the Ins array.
+/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
+/// the argument before it was split. From this, we deduce that the memory type
+/// for each individual part is i8. We pass the memory type as LocVT to the
+/// calling convention analysis function and the register type (Ins[x].VT) as
+/// the ValVT.
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ const ISD::InputArg &In = Ins[i];
+ EVT MemVT; // Deduced in-memory type of this part; filled in by the chain below.
+
+ unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT); // Register count for the full, unsplit argument type.
+
+ if (!Subtarget->isAmdHsaOS() &&
+ (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
+ // The ABI says the caller will extend these values to 32-bits.
+ MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+ } else if (NumRegs == 1) {
+ // This argument is not split, so the IR type is the memory type.
+ assert(!In.Flags.isSplit());
+ if (In.ArgVT.isExtended()) {
+ // We have an extended type, like i24, so we should just use the register type.
+ MemVT = In.VT;
+ } else {
+ MemVT = In.ArgVT;
+ }
+ } else if (In.ArgVT.isVector() && In.VT.isVector() &&
+ In.ArgVT.getScalarType() == In.VT.getScalarType()) {
+ assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
+ // We have a vector value which has been split into a vector with
+ // the same scalar type, but fewer elements. This should handle
+ // all the floating-point vector types.
+ MemVT = In.VT;
+ } else if (In.ArgVT.isVector() &&
+ In.ArgVT.getVectorNumElements() == NumRegs) {
+ // This arg has been split so that each element is stored in a separate
+ // register.
+ MemVT = In.ArgVT.getScalarType();
+ } else if (In.ArgVT.isExtended()) {
+ // We have an extended type, like i65.
+ MemVT = In.VT;
+ } else {
+ unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs; // Store-size bits carried by each register part.
+ assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
+ if (In.VT.isInteger()) {
+ MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+ } else if (In.VT.isVector()) {
+ assert(!In.VT.getScalarType().isFloatingPoint());
+ unsigned NumElements = In.VT.getVectorNumElements();
+ assert(MemoryBits % NumElements == 0);
+ // This vector type has been split into another vector type with
+ // a different element size.
+ EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+ MemoryBits / NumElements);
+ MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+ } else {
+ llvm_unreachable("cannot deduce memory type.");
+ }
+ }
+
+ // Convert one element vectors to scalar.
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+ MemVT = MemVT.getScalarType();
+
+ if (MemVT.isExtended()) {
+ // This should really only happen if we have vec3 arguments; round the
+ // element count up to a power of two.
+ assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+ MemVT = MemVT.getPow2VectorType(State.getContext());
+ }
+
+ assert(MemVT.isSimple());
+ allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags, // Register type In.VT, then the deduced memory type.
+ State);
+ }
+}
+
+void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, // Assign locations to the incoming arguments in Ins.
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {
+ State.AnalyzeFormalArguments(Ins, CC_AMDGPU); // Delegates to CCState using the CC_AMDGPU assignment function.
+}
+
+void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, // Assign locations to the outgoing return values in Outs.
+ const SmallVectorImpl<ISD::OutputArg> &Outs) const {
+
+ State.AnalyzeReturn(Outs, RetCC_SI); // Delegates to CCState using the RetCC_SI assignment function.
+}
+
+SDValue
+AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Lower a return to an ENDPGM node.
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); // Only Chain is used; CallConv, isVarArg, Outs and OutVals are ignored.
+}
+
+//===---------------------------------------------------------------------===//
+// Target specific lowering
+//===---------------------------------------------------------------------===//
+
+/// Calls are not supported by this lowering: report the call through the
+/// context's diagnostic handler and hand back placeholder results.
+///
+/// \returns the DAG entry node as the chain. Unless this is a tail call, one
+/// undef value per expected call result is appended to \p InVals.
+SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                        SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDValue Target = CLI.Callee;
+
+  // Recover a printable callee name when the callee is an external symbol or
+  // a global address.
+  StringRef CalleeName("<unknown>");
+  if (const auto *Sym = dyn_cast<ExternalSymbolSDNode>(Target))
+    CalleeName = Sym->getSymbol();
+  else if (const auto *Global = dyn_cast<GlobalAddressSDNode>(Target))
+    CalleeName = Global->getGlobal()->getName();
+
+  const Function &Caller = *DAG.getMachineFunction().getFunction();
+  DiagnosticInfoUnsupported NoCalls(
+      Caller, "unsupported call to function " + CalleeName,
+      CLI.DL.getDebugLoc());
+  DAG.getContext()->diagnose(NoCalls);
+
+  // Give every expected result an undef placeholder; tail calls get none.
+  if (!CLI.IsTailCall)
+    for (const ISD::InputArg &In : CLI.Ins)
+      InVals.push_back(DAG.getUNDEF(In.VT));
+
+  return DAG.getEntryNode();
+}
+
+/// Dynamic allocas are not supported: emit a diagnostic and replace the node
+/// with a zero constant merged with the node's first operand.
+SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
+                                            SDLoc(Op).getDebugLoc());
+  DAG.getContext()->diagnose(NoDynamicAlloca);
+
+  // Placeholder results; both nodes are built with a default-constructed
+  // SDLoc rather than the location of Op.
+  SDValue ZeroAddr = DAG.getConstant(0, SDLoc(), Op.getValueType());
+  SDValue Replacement[] = {ZeroAddr, Op.getOperand(0)};
+  return DAG.getMergeValues(Replacement, SDLoc());
+}
+
+/// Dispatch custom lowering of \p Op to the matching Lower* helper based on
+/// its opcode.
+///
+/// An opcode without a case here is a compiler bug: the node is dumped and
+/// llvm_unreachable is hit.
+SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    Op->dump(&DAG);
+    // Adjacent string literals are concatenated verbatim, so the separating
+    // space must be part of one of them.
+    llvm_unreachable("Custom lowering code for this "
+                     "instruction is not implemented yet!");
+    break;
+  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
+  case ISD::FREM: return LowerFREM(Op, DAG);
+  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
+  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
+  case ISD::FRINT: return LowerFRINT(Op, DAG);
+  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+  case ISD::FROUND: return LowerFROUND(Op, DAG);
+  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
+  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
+  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+  // Both count-leading-zeros flavours share one lowering.
+  case ISD::CTLZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+    return LowerCTLZ(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  }
+  return Op;
+}
+
+/// Result-type legalization hook. No opcode is given a replacement here, so
+/// \p Results is always left untouched.
+void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
+                                              SmallVectorImpl<SDValue> &Results,
+                                              SelectionDAG &DAG) const {
+  if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    // Different parts of legalization disagree about which type of
+    // sign_extend_inreg should be checked for custom lowering. The
+    // extended-from type is what really matters, but some places check the
+    // result type instead. That leads to ReplaceNodeResults being asked to
+    // sext_in_reg to an illegal type, so do nothing here and let the illegal
+    // result integer be handled normally.
+    return;
+  }
+
+  // Every other opcode is likewise left alone.
+}
+
+/// Return true if \p GV is a global variable whose initializer is present
+/// and is not an undef value.
+static bool hasDefinedInitializer(const GlobalValue *GV) {
+  const auto *Var = dyn_cast<GlobalVariable>(GV);
+  if (Var == nullptr)
+    return false;
+
+  if (!Var->hasInitializer())
+    return false;
+
+  const auto *Init = Var->getInitializer();
+  return !isa<UndefValue>(Init);
+}
+
+/// Lower a global address node. A local-address-space global without a
+/// defined initializer becomes the constant offset handed out by
+/// \p MFI->allocateLDSGlobal(). Everything else is reported as unsupported
+/// and yields an empty SDValue.
+SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
+                                                 SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  const DataLayout &DL = DAG.getDataLayout();
+  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = G->getGlobal();
+
+  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    // XXX: What does the value of G->getOffset() mean?
+    assert(G->getOffset() == 0 &&
+           "Do not know what to do with an non-zero offset");
+
+    // TODO: We could emit code to handle the initialization somewhere.
+    if (!hasDefinedInitializer(GV)) {
+      unsigned LDSOffset = MFI->allocateLDSGlobal(DL, *GV);
+      return DAG.getConstant(LDSOffset, SDLoc(Op), Op.getValueType());
+    }
+    // A defined initializer falls through to the diagnostic below.
+  }
+
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+  DiagnosticInfoUnsupported BadInit(
+      Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
+  DAG.getContext()->diagnose(BadInit);
+  return SDValue();
+}
+
+/// Lower CONCAT_VECTORS by extracting every element of every operand, in
+/// operand order, and rebuilding the result as a single build_vector.
+SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SmallVector<SDValue, 8> Elts;
+
+  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
+    DAG.ExtractVectorElements(Op.getOperand(I), Elts);
+
+  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Elts);
+}
+
+/// Lower EXTRACT_SUBVECTOR by extracting the requested run of elements from
+/// the source vector and rebuilding them as a build_vector. Operand 1 must be
+/// a constant start index.
+SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  const EVT SubVT = Op.getValueType();
+  const auto *StartNode = cast<ConstantSDNode>(Op.getOperand(1));
+  const unsigned First = StartNode->getZExtValue();
+
+  // Pull out as many elements as the result type holds, starting at First.
+  SmallVector<SDValue, 8> Elts;
+  DAG.ExtractVectorElements(Op.getOperand(0), Elts, First,
+                            SubVT.getVectorNumElements());
+
+  return DAG.getBuildVector(SubVT, SDLoc(Op), Elts);
+}
+
+/// Lower the chainless intrinsics handled at this level to their AMDGPUISD
+/// node. Operand 0 holds the intrinsic ID; operands 1-3 are forwarded
+/// unchanged. Intrinsics not listed here are returned as-is.
+SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  const unsigned IntrinsicID =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  // Every handled intrinsic maps onto a three-operand target node; only the
+  // opcode differs.
+  unsigned NodeOpc;
+  switch (IntrinsicID) {
+  case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
+    NodeOpc = AMDGPUISD::CLAMP;
+    break;
+  case AMDGPUIntrinsic::AMDGPU_bfe_i32:
+    NodeOpc = AMDGPUISD::BFE_I32;
+    break;
+  case AMDGPUIntrinsic::AMDGPU_bfe_u32:
+    NodeOpc = AMDGPUISD::BFE_U32;
+    break;
+  default:
+    return Op;
+  }
+
+  SDLoc DL(Op);
+  return DAG.getNode(NodeOpc, DL, Op.getValueType(), Op.getOperand(1),
+                     Op.getOperand(2), Op.getOperand(3));
+}
+
+/// \brief Try to fold a select on a floating-point compare into an FMIN_LEGACY / FMAX_LEGACY node; returns an empty SDValue when no fold is made.
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
+ SDValue LHS, SDValue RHS,
+ SDValue True, SDValue False,
+ SDValue CC,
+ DAGCombinerInfo &DCI) const {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) // No fold is attempted on VOLCANIC_ISLANDS and newer.
+ return SDValue();
+
+ if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) // The select arms must be the compare operands, in either order.
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+ switch (CCOpcode) {
+ case ISD::SETOEQ:
+ case ISD::SETONE:
+ case ISD::SETUNE:
+ case ISD::SETNE:
+ case ISD::SETUEQ:
+ case ISD::SETEQ:
+ case ISD::SETFALSE:
+ case ISD::SETFALSE2:
+ case ISD::SETTRUE:
+ case ISD::SETTRUE2:
+ case ISD::SETUO:
+ case ISD::SETO:
+ break; // None of these predicates is folded; falls out to the empty result below.
+ case ISD::SETULE:
+ case ISD::SETULT: {
+ if (LHS == True) // Unordered less-than: operands are swapped relative to the ordered case below.
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ }
+ case ISD::SETOLE:
+ case ISD::SETOLT:
+ case ISD::SETLE:
+ case ISD::SETLT: {
+ // Ordered. Assume ordered for undefined.
+
+ // Only do this after legalization to avoid interfering with other combines
+ // which might occur.
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+ !DCI.isCalledByLegalizer())
+ return SDValue();
+
+ // We need to permute the operands to get the correct NaN behavior. The
+ // selected operand is the second one based on the failing compare with NaN,
+ // so permute it based on the compare type the hardware uses.
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+ }
+ case ISD::SETUGE:
+ case ISD::SETUGT: {
+ if (LHS == True) // Unordered greater-than: operands are swapped relative to the ordered case below.
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+ }
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ case ISD::SETOGT: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && // Same post-legalization restriction as the ordered less-than case.
+ !DCI.isCalledByLegalizer())
+ return SDValue();
+
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+ }
+ case ISD::SETCC_INVALID:
+ llvm_unreachable("Invalid setcc condcode!");
+ }
+ return SDValue(); // Reached only through the 'break' above.
+}
+
+/// Split \p Op into two i32 values by bitcasting it to v2i32 and extracting
+/// both elements.
+///
+/// \returns (Lo, Hi), where Lo is element 0 and Hi is element 1.
+std::pair<SDValue, SDValue>
+AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue Halves = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+
+  const SDValue LoIdx = DAG.getConstant(0, SL, MVT::i32);
+  const SDValue HiIdx = DAG.getConstant(1, SL, MVT::i32);
+
+  SDValue Lo =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Halves, LoIdx);
+  SDValue Hi =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Halves, HiIdx);
+
+  return std::pair<SDValue, SDValue>(Lo, Hi);
+}
+
+/// Return element 0 of \p Op viewed as v2i32, i.e. the i32 that
+/// split64BitValue() returns as the low half.
+SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue Halves = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+  const SDValue LoIdx = DAG.getConstant(0, SL, MVT::i32);
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Halves, LoIdx);
+}
+
+/// Return element 1 of \p Op viewed as v2i32, i.e. the i32 that
+/// split64BitValue() returns as the high half.
+SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue Halves = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+  const SDValue HiIdx = DAG.getConstant(1, SL, MVT::i32);
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Halves, HiIdx);
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, // Split a vector load into a low-half and a high-half load.
+ SelectionDAG &DAG) const {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ EVT VT = Op.getValueType();
+
+ // Result: merged values {concatenated vector, token factor of both chains}.
+ // If this is a 2 element vector, we really want to scalarize and not create
+ // weird 1 element vectors.
+ if (VT.getVectorNumElements() == 2)
+ return scalarizeVectorLoad(Load, DAG);
+
+ SDValue BasePtr = Load->getBasePtr();
+ EVT PtrVT = BasePtr.getValueType();
+ EVT MemVT = Load->getMemoryVT();
+ SDLoc SL(Op);
+
+ const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
+
+ EVT LoVT, HiVT;
+ EVT LoMemVT, HiMemVT;
+ SDValue Lo, Hi;
+
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); // Result types of the two halves.
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); // In-memory types of the two halves.
+ std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); // NOTE(review): Lo and Hi are not read again in this function.
+
+ unsigned Size = LoMemVT.getStoreSize(); // Bytes occupied by the low half; also the high half's offset.
+ unsigned BaseAlign = Load->getAlignment();
+ unsigned HiAlign = MinAlign(BaseAlign, Size); // Alignment that still holds at BasePtr + Size.
+
+ SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
+ Load->getChain(), BasePtr, SrcValue, LoMemVT,
+ BaseAlign, Load->getMemOperand()->getFlags());
+ SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+ DAG.getConstant(Size, SL, PtrVT));
+ SDValue HiLoad =
+ DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), // Both halves hang off the original load's chain.
+ HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+ HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), // Value result: the two halves glued back together.
+ DAG.getNode(ISD::TokenFactor, SL, MVT::Other, // Chain result: joins the chains of both new loads.
+ LoLoad.getValue(1), HiLoad.getValue(1))
+ };
+
+ return DAG.getMergeValues(Ops, SL);
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, // Split a vector store into a low-half and a high-half store.
+ SelectionDAG &DAG) const {
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
+ SDValue Val = Store->getValue();
+ EVT VT = Val.getValueType();
+
+ // If this is a 2 element vector, we really want to scalarize and not create
+ // weird 1 element vectors.
+ if (VT.getVectorNumElements() == 2)
+ return scalarizeVectorStore(Store, DAG);
+
+ EVT MemVT = Store->getMemoryVT();
+ SDValue Chain = Store->getChain();
+ SDValue BasePtr = Store->getBasePtr();
+ SDLoc SL(Op);
+
+ EVT LoVT, HiVT;
+ EVT LoMemVT, HiMemVT;
+ SDValue Lo, Hi;
+
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); // Value types of the two halves.
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); // In-memory types of the two halves.
+ std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); // The two halves of the stored value.
+
+ EVT PtrVT = BasePtr.getValueType();
+ SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, // Address of the high half: base + store size of the low half.
+ DAG.getConstant(LoMemVT.getStoreSize(), SL,
+ PtrVT));
+
+ const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
+ unsigned BaseAlign = Store->getAlignment();
+ unsigned Size = LoMemVT.getStoreSize();
+ unsigned HiAlign = MinAlign(BaseAlign, Size); // Alignment that still holds at BasePtr + Size.
+
+ SDValue LoStore =
+ DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
+ Store->getMemOperand()->getFlags());
+ SDValue HiStore =
+ DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), // Both stores use the original store's chain.
+ HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
+
+ return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); // Result chain joins both stores.
+}
+
+// This is a shortcut for integer division because we have fast i32<->f32
+// conversions, and fast f32 reciprocal instructions. The fractional part of a
+// float is enough to accurately represent up to a 24-bit signed integer.
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
+ bool Sign) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ MVT IntVT = MVT::i32;
+ MVT FltVT = MVT::f32;
+
+ unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
+ if (LHSSignBits < 9) // Bail out (empty SDValue) unless at least 9 sign bits are known.
+ return SDValue();
+
+ unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
+ if (RHSSignBits < 9) // Same requirement for the divisor.
+ return SDValue();
+
+ unsigned BitSize = VT.getSizeInBits();
+ unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+ unsigned DivBits = BitSize - SignBits; // Number of significant bits in the operands.
+ if (Sign)
+ ++DivBits; // Signed: count one more bit for the sign.
+
+ ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+ ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+
+ SDValue jq = DAG.getConstant(1, DL, IntVT); // Quotient correction; stays +1 in the unsigned case.
+
+ if (Sign) {
+ // char|short jq = ia ^ ib;
+ jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
+
+ // jq = jq >> (bitsize - 2)
+ jq = DAG.getNode(ISD::SRA, DL, VT, jq,
+ DAG.getConstant(BitSize - 2, DL, VT));
+
+ // jq = jq | 0x1
+ jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
+ }
+
+ // int ia = (int)LHS;
+ SDValue ia = LHS;
+
+ // int ib = (int)RHS;
+ SDValue ib = RHS;
+
+ // float fa = (float)ia;
+ SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
+
+ // float fb = (float)ib;
+ SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
+
+ SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, // float fq = fa * rcp(fb);
+ fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
+
+ // fq = trunc(fq);
+ fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
+
+ // float fqneg = -fq;
+ SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
+
+ // float fr = mad(fqneg, fb, fa);
+ SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
+
+ // int iq = (int)fq;
+ SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
+
+ // fr = fabs(fr);
+ fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
+
+ // fb = fabs(fb);
+ fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ // int cv = fr >= fb;
+ SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
+
+ // jq = (cv ? jq : 0);
+ jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
+
+ // dst = iq + jq;
+ SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
+
+ // Rem needs compensation, it's easier to recompute it
+ SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); // Rem = LHS - Div * RHS, finished on the next line.
+ Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
+
+ // Truncate to number of bits this divide really is.
+ if (Sign) {
+ SDValue InRegSize
+ = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
+ Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
+ Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
+ } else {
+ SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT); // Mask keeping the low DivBits bits.
+ Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
+ Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
+ }
+
+ return DAG.getMergeValues({ Div, Rem }, DL); // Result 0 is the quotient, result 1 the remainder.
+}
+
+void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, // Expand an i64 UDIVREM; pushes quotient then remainder onto Results.
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results) const {
+ assert(Op.getValueType() == MVT::i64);
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+ SDValue one = DAG.getConstant(1, DL, HalfVT);
+ SDValue zero = DAG.getConstant(0, DL, HalfVT);
+
+ // Split both operands into their low (element 0) and high (element 1) halves.
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
+ SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
+
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
+ SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+
+ if (VT == MVT::i64 && // Fast path: the high 32 bits of both operands are known to be zero.
+ DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
+ DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
+
+ SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), // Half-width divide of the low halves.
+ LHS_Lo, RHS_Lo);
+
+ SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero}); // Widen by pairing with a zero high half.
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
+
+ Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
+ Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
+ return;
+ }
+
+ // Get Speculative values
+ SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
+ SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
+
+ SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); // RHS_Hi == 0 ? REM_Part : LHS_Hi
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
+ REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
+
+ SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); // RHS_Hi == 0 ? DIV_Part : 0
+ SDValue DIV_Lo = zero;
+
+ const unsigned halfBitWidth = HalfVT.getSizeInBits();
+
+ for (unsigned i = 0; i < halfBitWidth; ++i) { // Emits nodes for one quotient bit per pass, highest bit first.
+ const unsigned bitPos = halfBitWidth - i - 1;
+ SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
+ // Get bit number bitPos of LHS_Lo, zero-extended to the full width.
+ SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+ HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+ HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
+
+ // Shift
+ REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
+ // Bring the extracted bit into the bottom of the running remainder.
+ REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
+
+ SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
+ SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); // REM >= RHS ? BIT : 0
+
+ DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
+
+ // Update REM
+ SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
+ REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); // REM >= RHS ? REM - RHS : REM
+ }
+
+ SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
+ DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
+ Results.push_back(DIV);
+ Results.push_back(REM);
+}
+
+SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, // Lower UDIVREM; returns merged {quotient, remainder}.
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::i64) { // 64-bit operands take the dedicated expansion.
+ SmallVector<SDValue, 2> Results;
+ LowerUDIVREM64(Op, DAG, Results);
+ return DAG.getMergeValues(Results, DL);
+ }
+
+ if (VT == MVT::i32) { // Try the float-based 24-bit shortcut first; it returns an empty value if not applicable.
+ if (SDValue Res = LowerDIVREM24(Op, DAG, false))
+ return Res;
+ }
+
+ SDValue Num = Op.getOperand(0);
+ SDValue Den = Op.getOperand(1);
+
+ // RCP = URECIP(Den) = 2^32 / Den + e
+ // e is rounding error.
+ SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
+
+ // RCP_LO = mul(RCP, Den)
+ SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
+
+ // RCP_HI = mulhu(RCP, Den)
+ SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
+
+ // NEG_RCP_LO = -RCP_LO
+ SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ RCP_LO);
+
+ // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+ SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
+ NEG_RCP_LO, RCP_LO,
+ ISD::SETEQ);
+ // Calculate the rounding error from the URECIP instruction
+ // E = mulhu(ABS_RCP_LO, RCP)
+ SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
+
+ // RCP_A_E = RCP + E
+ SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
+
+ // RCP_S_E = RCP - E
+ SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
+
+ // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
+ SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
+ RCP_A_E, RCP_S_E,
+ ISD::SETEQ);
+ // Quotient = mulhu(Tmp0, Num)
+ SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
+
+ // Num_S_Remainder = Quotient * Den
+ SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
+
+ // Remainder = Num - Num_S_Remainder
+ SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
+
+ // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+ SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
+ DAG.getConstant(-1, DL, VT),
+ DAG.getConstant(0, DL, VT),
+ ISD::SETUGE);
+ // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+ SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
+ Num_S_Remainder,
+ DAG.getConstant(-1, DL, VT),
+ DAG.getConstant(0, DL, VT),
+ ISD::SETUGE);
+ // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+ SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
+ Remainder_GE_Zero);
+
+ // Calculate Division result:
+
+ // Quotient_A_One = Quotient + 1
+ SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
+ DAG.getConstant(1, DL, VT));
+
+ // Quotient_S_One = Quotient - 1
+ SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
+ DAG.getConstant(1, DL, VT));
+
+ // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+ SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
+ Quotient, Quotient_A_One, ISD::SETEQ);
+
+ // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+ Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
+ Quotient_S_One, Div, ISD::SETEQ);
+
+ // Calculate Rem result:
+
+ // Remainder_S_Den = Remainder - Den
+ SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
+
+ // Remainder_A_Den = Remainder + Den
+ SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
+
+ // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+ SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
+ Remainder, Remainder_S_Den, ISD::SETEQ);
+
+ // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+ Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
+ Remainder_A_Den, Rem, ISD::SETEQ);
+ SDValue Ops[2] = {
+ Div,
+ Rem
+ };
+ return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, // Lower SDIVREM; returns merged {quotient, remainder}.
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue NegOne = DAG.getConstant(-1, DL, VT);
+
+ if (VT == MVT::i32) { // Try the float-based 24-bit shortcut first; it returns an empty value if not applicable.
+ if (SDValue Res = LowerDIVREM24(Op, DAG, true))
+ return Res;
+ }
+
+ if (VT == MVT::i64 && // Both operands have more than 32 sign bits, so a half-width divide suffices.
+ DAG.ComputeNumSignBits(LHS) > 32 &&
+ DAG.ComputeNumSignBits(RHS) > 32) {
+ EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+ // Take the low half of each operand.
+ SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
+ SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
+ SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+ LHS_Lo, RHS_Lo);
+ SDValue Res[2] = {
+ DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), // Sign-extend both results back to VT.
+ DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
+ };
+ return DAG.getMergeValues(Res, DL);
+ }
+
+ SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); // LHS < 0 ? -1 : 0
+ SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); // RHS < 0 ? -1 : 0
+ SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); // Quotient sign mask: -1 when the operand signs differ.
+ SDValue RSign = LHSign; // Remainder sign is the same as LHS
+
+ LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); // With the XOR below: (x + sign) ^ sign negates x when sign is -1.
+ RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
+
+ LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
+ RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
+
+ SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); // Unsigned divide of the adjusted operands.
+ SDValue Rem = Div.getValue(1);
+
+ Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); // With the SUB below: (x ^ sign) - sign negates x when sign is -1.
+ Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
+
+ Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
+ Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
+
+ SDValue Res[2] = {
+ Div,
+ Rem
+ };
+ return DAG.getMergeValues(Res, DL);
+}
+
+// Expand frem in terms of fdiv, ftrunc, fmul and fsub:
+//   (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
+SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  const EVT VT = Op.getValueType();
+  SDValue Num = Op.getOperand(0);
+  SDValue Den = Op.getOperand(1);
+
+  // TODO: Should this propagate fast-math-flags?
+
+  SDValue Quot = DAG.getNode(ISD::FDIV, SL, VT, Num, Den);
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Quot);
+  SDValue Prod = DAG.getNode(ISD::FMUL, SL, VT, Trunc, Den);
+
+  return DAG.getNode(ISD::FSUB, SL, VT, Num, Prod);
+}
+
+SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
+ // ceil(x) = trunc(x) + ((x > 0.0 && x != trunc(x)) ? 1.0 : 0.0)
+ const SDLoc DL(Op);
+ const SDValue X = Op.getOperand(0);
+
+ const SDValue Truncated = DAG.getNode(ISD::FTRUNC, DL, MVT::f64, X);
+
+ const SDValue FPZero = DAG.getConstantFP(0.0, DL, MVT::f64);
+ const SDValue FPOne = DAG.getConstantFP(1.0, DL, MVT::f64);
+
+ const EVT CCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
+
+ // Only strictly positive inputs with a fractional part get bumped up.
+ const SDValue IsPositive = DAG.getSetCC(DL, CCVT, X, FPZero, ISD::SETOGT);
+ const SDValue HasFraction =
+ DAG.getSetCC(DL, CCVT, X, Truncated, ISD::SETONE);
+ const SDValue NeedsBump =
+ DAG.getNode(ISD::AND, DL, CCVT, IsPositive, HasFraction);
+
+ const SDValue Bump =
+ DAG.getNode(ISD::SELECT, DL, MVT::f64, NeedsBump, FPOne, FPZero);
+ // TODO: Should this propagate fast-math-flags?
+ return DAG.getNode(ISD::FADD, DL, MVT::f64, Truncated, Bump);
+}
+
+// Returns, as an i32 node, the exponent field of an f64 with the bias of 1023
+// subtracted. Hi is the i32 holding the upper 32 bits of the f64 bit pattern.
+static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
+ SelectionDAG &DAG) {
+ const unsigned FractBits = 52;
+ const unsigned ExpBits = 11;
+
+ // The exponent field sits directly above the fraction bits; 32 of the 52
+ // fraction bits live in the low word, so the field starts at bit 20 of Hi.
+ const SDValue FieldOffset = DAG.getConstant(FractBits - 32, SL, MVT::i32);
+ const SDValue FieldWidth = DAG.getConstant(ExpBits, SL, MVT::i32);
+ const SDValue Biased = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+ Hi, FieldOffset, FieldWidth);
+
+ // Remove the exponent bias.
+ const SDValue Bias = DAG.getConstant(1023, SL, MVT::i32);
+ return DAG.getNode(ISD::SUB, SL, MVT::i32, Biased, Bias);
+}
+
+SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+ // Only f64 is handled; the result is built by masking the raw bits of Src.
+ assert(Op.getValueType() == MVT::f64);
+
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+ SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+ // Extract the upper half, since this is where we will find the sign and
+ // exponent.
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
+
+ SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+ // Number of explicit fraction bits in an f64.
+ const unsigned FractBits = 52;
+
+ // Extract the sign bit.
+ const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
+ SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
+
+ // Extend back to 64 bits: zero low word, sign bit in the high word.
+ SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
+ SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
+
+ SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
+ const SDValue FractMask
+ = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
+ // Shift the fraction mask right by Exp so it covers only the bits below the binary point, then clear those bits.
+ SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
+ SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
+ SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
+
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+
+ const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
+ // Exp < 0: the result is just the sign bit (a signed zero). Exp > 51: Src is returned unchanged.
+ SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+ SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+
+ SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
+ SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
+
+ return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
+}
+
+SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
+ // Rounds an f64 by adding and then subtracting 2^52 carrying the sign of the
+ // input. Inputs whose magnitude exceeds 0x1.fffffffffffffp+51 are returned
+ // unchanged.
+ const SDLoc DL(Op);
+ const SDValue X = Op.getOperand(0);
+
+ assert(Op.getValueType() == MVT::f64);
+
+ const APFloat TwoPow52(APFloat::IEEEdouble(), "0x1.0p+52");
+ const SDValue Magic = DAG.getConstantFP(TwoPow52, DL, MVT::f64);
+ const SDValue SignedMagic =
+ DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f64, Magic, X);
+
+ // TODO: Should this propagate fast-math-flags?
+
+ const SDValue Sum = DAG.getNode(ISD::FADD, DL, MVT::f64, X, SignedMagic);
+ const SDValue Rounded =
+ DAG.getNode(ISD::FSUB, DL, MVT::f64, Sum, SignedMagic);
+
+ const SDValue AbsX = DAG.getNode(ISD::FABS, DL, MVT::f64, X);
+
+ const APFloat LimitVal(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
+ const SDValue Limit = DAG.getConstantFP(LimitVal, DL, MVT::f64);
+
+ const EVT CCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
+ const SDValue TooBig = DAG.getSetCC(DL, CCVT, AbsX, Limit, ISD::SETOGT);
+
+ return DAG.getSelect(DL, MVT::f64, TooBig, X, Rounded);
+}
+
+SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
+ // The only difference between FNEARBYINT and FRINT is how FP exceptions are
+ // handled. Those aren't really meaningful for us, and OpenCL only has rint,
+ // so the node is simply re-emitted as FRINT.
+ const SDLoc DL(Op);
+ const EVT Ty = Op.getValueType();
+ const SDValue Src = Op.getOperand(0);
+ return DAG.getNode(ISD::FRINT, DL, Ty, Src);
+}
+
+// XXX - May require not supporting f32 denormals?
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+ // round(x) = trunc(x) + (|x - trunc(x)| >= 0.5 ? copysign(1.0, x) : 0.0)
+ const SDLoc DL(Op);
+ const SDValue Src = Op.getOperand(0);
+
+ const SDValue Whole = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Src);
+
+ // TODO: Should this propagate fast-math-flags?
+
+ const SDValue Frac = DAG.getNode(ISD::FSUB, DL, MVT::f32, Src, Whole);
+ const SDValue FracMag = DAG.getNode(ISD::FABS, DL, MVT::f32, Frac);
+
+ const SDValue FPZero = DAG.getConstantFP(0.0, DL, MVT::f32);
+ const SDValue FPOne = DAG.getConstantFP(1.0, DL, MVT::f32);
+ const SDValue FPHalf = DAG.getConstantFP(0.5, DL, MVT::f32);
+
+ // +1.0 or -1.0, matching the sign of the input.
+ const SDValue Step = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, FPOne, Src);
+
+ const EVT CCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+
+ const SDValue RoundsAway =
+ DAG.getSetCC(DL, CCVT, FracMag, FPHalf, ISD::SETOGE);
+ const SDValue Adjust =
+ DAG.getNode(ISD::SELECT, DL, MVT::f32, RoundsAway, Step, FPZero);
+
+ return DAG.getNode(ISD::FADD, DL, MVT::f32, Whole, Adjust);
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+ // L holds the raw 64-bit pattern of X; the rounding is done with integer ops on it.
+ SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
+
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+ const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
+ const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+ // Hi (vector element 1) is handed to extractF64Exponent to obtain the unbiased exponent.
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
+
+ SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+ // Mask has the low 52 bits set, i.e. the fraction field of an f64.
+ const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
+ MVT::i64);
+ // M: fraction bits below the binary point for this exponent; D: the bit worth one half at that position.
+ SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
+ SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
+ DAG.getConstant(INT64_C(0x0008000000000000), SL,
+ MVT::i64),
+ Exp);
+ // If any fractional bit is set, add D before the fractional bits are cleared below.
+ SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
+ SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
+ DAG.getConstant(0, SL, MVT::i64), Tmp0,
+ ISD::SETNE);
+
+ SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
+ D, DAG.getConstant(0, SL, MVT::i64));
+ SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
+
+ K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
+ K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
+
+ SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+ SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+ SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
+ // Exp == -1 (0.5 <= |X| < 1) gives magnitude 1.0; any other exponent selects 0.0 here.
+ SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
+ ExpEqNegOne,
+ DAG.getConstantFP(1.0, SL, MVT::f64),
+ DAG.getConstantFP(0.0, SL, MVT::f64));
+
+ SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
+ // Exp < 0 uses the signed 0.0/1.0 value S; Exp > 51 returns X unchanged; otherwise K is the result.
+ K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
+ K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
+
+ return K;
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
+ // Dispatches on the result type; only f32 and f64 are handled.
+ const EVT Ty = Op.getValueType();
+
+ if (Ty != MVT::f32 && Ty != MVT::f64)
+ llvm_unreachable("unhandled type");
+
+ return Ty == MVT::f32 ? LowerFROUND32(Op, DAG) : LowerFROUND64(Op, DAG);
+}
+
+SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
+ // floor(x) = trunc(x) + ((x < 0.0 && x != trunc(x)) ? -1.0 : 0.0)
+ const SDLoc DL(Op);
+ const SDValue X = Op.getOperand(0);
+
+ const SDValue Truncated = DAG.getNode(ISD::FTRUNC, DL, MVT::f64, X);
+
+ const SDValue FPZero = DAG.getConstantFP(0.0, DL, MVT::f64);
+ const SDValue FPMinusOne = DAG.getConstantFP(-1.0, DL, MVT::f64);
+
+ const EVT CCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
+
+ // Only strictly negative inputs with a fractional part get stepped down.
+ const SDValue IsNegative = DAG.getSetCC(DL, CCVT, X, FPZero, ISD::SETOLT);
+ const SDValue HasFraction =
+ DAG.getSetCC(DL, CCVT, X, Truncated, ISD::SETONE);
+ const SDValue NeedsStep =
+ DAG.getNode(ISD::AND, DL, CCVT, IsNegative, HasFraction);
+
+ const SDValue Step =
+ DAG.getNode(ISD::SELECT, DL, MVT::f64, NeedsStep, FPMinusOne, FPZero);
+ // TODO: Should this propagate fast-math-flags?
+ return DAG.getNode(ISD::FADD, DL, MVT::f64, Truncated, Step);
+}
+
+// Lowers CTLZ / CTLZ_ZERO_UNDEF. The i32 zero-undef form maps directly onto
+// FFBH_U32; otherwise the source is split into two 32-bit halves, each half
+// is counted separately, and the i32 count is zero-extended to i64.
+SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+ bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
+
+ if (ZeroUndef && Src.getValueType() == MVT::i32)
+ return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
+
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), MVT::i32);
+
+ SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
+
+ SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
+ SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
+
+ const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
+ SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
+
+ // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
+ SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
+
+ if (!ZeroUndef) {
+ // Test if the full 64-bit input is zero.
+
+ // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
+ // which we probably don't want.
+ SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
+ SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
+
+ // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
+ // with the same cycles, otherwise it is slower.
+ // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
+ // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
+
+ // Named for its value so it no longer shadows the outer Bits32 constant.
+ const SDValue Bits64 = DAG.getConstant(64, SL, MVT::i32);
+
+ // The instruction returns -1 for 0 input, but the defined intrinsic
+ // behavior is to return the number of bits.
+ NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
+ SrcIsZero, Bits64, NewCtlz);
+ }
+
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
+}
+
+SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
+ bool Signed) const {
+ // Unsigned
+ // cul2f(ulong u)
+ //{
+ // uint lz = clz(u);
+ // uint e = (u != 0) ? 127U + 63U - lz : 0;
+ // u = (u << lz) & 0x7fffffffffffffffUL;
+ // ulong t = u & 0xffffffffffUL;
+ // uint v = (e << 23) | (uint)(u >> 40);
+ // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
+ // return as_float(v + r);
+ //}
+ // Signed
+ // cl2f(long l)
+ //{
+ // long s = l >> 63;
+ // float r = cul2f((l + s) ^ s);
+ // return s ? -r : r;
+ //}
+ // The DAG built below follows the pseudocode above step by step.
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+ SDValue L = Src;
+ // For signed input, S is the sign mask (l >> 63) and L becomes (l + s) ^ s.
+ SDValue S;
+ if (Signed) {
+ const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
+ S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
+
+ SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
+ L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
+ }
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), MVT::f32);
+
+ // lz = clz(u), truncated to i32.
+ SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
+ SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
+ SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
+ LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
+ // e = (u != 0) ? 127 + 63 - lz : 0
+ SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
+ SDValue E = DAG.getSelect(SL, MVT::i32,
+ DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
+ DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
+ ZeroI32);
+ // u = (u << lz) & 0x7fffffffffffffff
+ SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
+ DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
+ DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
+ // t = u & 0xffffffffff
+ SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
+ DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
+ // NOTE: despite its name, UShl is the logical right shift u >> 40.
+ SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
+ U, DAG.getConstant(40, SL, MVT::i64));
+ // v = (e << 23) | (uint)(u >> 40)
+ SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
+ DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
+ DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
+ // r = t > 0x8000000000 ? 1 : (t == 0x8000000000 ? v & 1 : 0)
+ SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
+ SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
+ SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
+
+ SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+ SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
+
+ SDValue R = DAG.getSelect(SL, MVT::i32,
+ RCmp,
+ One,
+ DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
+ R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
+ R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
+ // Unsigned conversions are finished here; signed ones select the negated result when S is set.
+ if (!Signed)
+ return R;
+
+ SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
+ return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
+}
+
+SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
+ bool Signed) const {
+ // Converts the two 32-bit halves separately and recombines them:
+ // result = ldexp(cvt(hi), 32) + cvt_unsigned(lo)
+ const SDLoc DL(Op);
+ const SDValue Halves =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(0));
+
+ const SDValue LoIdx = DAG.getConstant(0, DL, MVT::i32);
+ const SDValue LoWord =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Halves, LoIdx);
+ const SDValue HiIdx = DAG.getConstant(1, DL, MVT::i32);
+ const SDValue HiWord =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Halves, HiIdx);
+
+ // Only the high half follows the requested signedness; the low half is
+ // always converted as unsigned.
+ const unsigned HiCvtOpc = Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+ const SDValue HiFP = DAG.getNode(HiCvtOpc, DL, MVT::f64, HiWord);
+ const SDValue LoFP = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f64, LoWord);
+
+ const SDValue ShiftBy = DAG.getConstant(32, DL, MVT::i32);
+ const SDValue HiScaled =
+ DAG.getNode(AMDGPUISD::LDEXP, DL, MVT::f64, HiFP, ShiftBy);
+ // TODO: Should this propagate fast-math-flags?
+ return DAG.getNode(ISD::FADD, DL, MVT::f64, HiScaled, LoFP);
+}
+
+SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOperand(0).getValueType() == MVT::i64 &&
+ "operation should be legal");
+
+ // TODO: Factor out code common with LowerSINT_TO_FP.
+
+ const EVT ResultVT = Op.getValueType();
+ const bool ViaF32 = Subtarget->has16BitInsts() && ResultVT == MVT::f16;
+
+ if (!ViaF32) {
+ if (ResultVT == MVT::f32)
+ return LowerINT_TO_FP32(Op, DAG, false);
+
+ assert(ResultVT == MVT::f64);
+ return LowerINT_TO_FP64(Op, DAG, false);
+ }
+
+ // f16 result with 16-bit instructions available: convert to f32 with the
+ // same opcode, then round the f32 down to f16.
+ SDLoc DL(Op);
+ SDValue Wide = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Op.getOperand(0));
+ SDValue RoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
+ return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Wide, RoundFlag);
+}
+
+SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOperand(0).getValueType() == MVT::i64 &&
+ "operation should be legal");
+
+ // TODO: Factor out code common with LowerUINT_TO_FP.
+
+ const EVT ResultVT = Op.getValueType();
+ const bool ViaF32 = Subtarget->has16BitInsts() && ResultVT == MVT::f16;
+
+ if (!ViaF32) {
+ if (ResultVT == MVT::f32)
+ return LowerINT_TO_FP32(Op, DAG, true);
+
+ assert(ResultVT == MVT::f64);
+ return LowerINT_TO_FP64(Op, DAG, true);
+ }
+
+ // f16 result with 16-bit instructions available: convert to f32 with the
+ // same opcode, then round the f32 down to f16.
+ SDLoc DL(Op);
+ SDValue Wide = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Op.getOperand(0));
+ SDValue RoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
+ return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Wide, RoundFlag);
+}
+
+SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
+ bool Signed) const {
+ // Produces the i64 result as two i32 halves:
+ // hi = floor(trunc(x) * 2^-32), lo = fma(hi, -2^32, trunc(x))
+ const SDLoc DL(Op);
+ const SDValue Whole =
+ DAG.getNode(ISD::FTRUNC, DL, MVT::f64, Op.getOperand(0));
+
+ // Bit patterns of 2^-32 and -2^32 respectively.
+ const SDValue TwoPowNeg32 = DAG.getConstantFP(
+ BitsToDouble(UINT64_C(0x3df0000000000000)), DL, MVT::f64);
+ const SDValue NegTwoPow32 = DAG.getConstantFP(
+ BitsToDouble(UINT64_C(0xc1f0000000000000)), DL, MVT::f64);
+ // TODO: Should this propagate fast-math-flags?
+ const SDValue Scaled =
+ DAG.getNode(ISD::FMUL, DL, MVT::f64, Whole, TwoPowNeg32);
+
+ const SDValue HiFP = DAG.getNode(ISD::FFLOOR, DL, MVT::f64, Scaled);
+
+ // What remains of the value once the high part has been removed.
+ const SDValue LoFP =
+ DAG.getNode(ISD::FMA, DL, MVT::f64, HiFP, NegTwoPow32, Whole);
+
+ // Only the high half follows the requested signedness.
+ const unsigned HiOpc = Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+ const SDValue HiInt = DAG.getNode(HiOpc, DL, MVT::i32, HiFP);
+ const SDValue LoInt = DAG.getNode(ISD::FP_TO_UINT, DL, MVT::i32, LoFP);
+
+ const SDValue Pair = DAG.getBuildVector(MVT::v2i32, DL, {LoInt, HiInt});
+
+ return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Pair);
+}
+
+// Custom f64 -> f16 conversion built from i32 integer operations on the raw
+// bits of the source. Returns an empty SDValue when unsafe FP math is enabled
+// so that the generic expansion is used instead.
+SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
+
+ if (getTargetMachine().Options.UnsafeFPMath) {
+ // There is a generic expand for FP_TO_FP16 with unsafe fast math.
+ return SDValue();
+ }
+
+ SDLoc DL(Op);
+ SDValue N0 = Op.getOperand(0);
+ assert (N0.getSimpleValueType() == MVT::f64);
+
+ // f64 -> f16 conversion using round-to-nearest-even rounding mode.
+ const unsigned ExpMask = 0x7ff;
+ const unsigned ExpBiasf64 = 1023;
+ const unsigned ExpBiasf16 = 15;
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+ SDValue One = DAG.getConstant(1, DL, MVT::i32);
+ // U becomes the low 32 bits and UH the high 32 bits of the source pattern.
+ SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
+ SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
+ DAG.getConstant(32, DL, MVT::i64));
+ UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
+ U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
+ // The shifted value is i32, so the shift amount is an i32 constant like
+ // every other shift amount in this function.
+ SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+ DAG.getConstant(20, DL, MVT::i32));
+ E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
+ DAG.getConstant(ExpMask, DL, MVT::i32));
+ // Subtract the fp64 exponent bias (1023) to get the real exponent and
+ // add the f16 bias (15) to get the biased exponent for the f16 format.
+ E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
+ DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
+
+ SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+ DAG.getConstant(8, DL, MVT::i32));
+ M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
+ DAG.getConstant(0xffe, DL, MVT::i32));
+
+ // Fold all remaining lower significand bits into bit 0 of M: it is set when
+ // any of the low 9 bits of UH or any bit of U is non-zero.
+ SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
+ DAG.getConstant(0x1ff, DL, MVT::i32));
+ MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
+
+ SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
+ M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
+
+ // (M != 0 ? 0x0200 : 0) | 0x7c00;
+ SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
+ DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
+ Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
+
+ // N = M | (E << 12);
+ SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+ DAG.getNode(ISD::SHL, DL, MVT::i32, E,
+ DAG.getConstant(12, DL, MVT::i32)));
+
+ // B = clamp(1-E, 0, 13);
+ SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
+ One, E);
+ SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
+ B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
+ DAG.getConstant(13, DL, MVT::i32));
+
+ SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+ DAG.getConstant(0x1000, DL, MVT::i32));
+
+ // D = SigSetHigh >> B, with bit 0 set when the shift dropped any set bit.
+ SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
+ SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
+ SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
+ D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
+
+ // V = (E < 1) ? D : N, then drop the two extra low bits and add the
+ // rounding increment derived from the low three bits.
+ SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
+ SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
+ DAG.getConstant(0x7, DL, MVT::i32));
+ V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
+ One, Zero, ISD::SETEQ);
+ SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
+ One, Zero, ISD::SETGT);
+ V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
+ V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
+
+ // E > 30 produces 0x7c00. E == 1039 (0x7ff - 1023 + 15, i.e. an all-ones
+ // source exponent field) produces I instead.
+ V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
+ DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
+ V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
+ I, V, ISD::SETEQ);
+
+ // Extract the sign bit.
+ SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+ DAG.getConstant(16, DL, MVT::i32));
+ Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
+ DAG.getConstant(0x8000, DL, MVT::i32));
+
+ V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
+ return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
+}
+
+SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ const SDValue Input = Op.getOperand(0);
+
+ // TODO: Factor out code common with LowerFP_TO_UINT.
+
+ const EVT InputVT = Input.getValueType();
+ const bool FromF16 = Subtarget->has16BitInsts() && InputVT == MVT::f16;
+
+ if (!FromF16) {
+ // Only the f64 -> i64 case has a custom expansion; everything else is
+ // reported as not handled with an empty SDValue.
+ const bool IsF64ToI64 =
+ Op.getValueType() == MVT::i64 && Input.getValueType() == MVT::f64;
+ return IsF64ToI64 ? LowerFP64_TO_INT(Op, DAG, true) : SDValue();
+ }
+
+ // f16 source: extend to f32 and redo the conversion from there to i64.
+ SDLoc DL(Op);
+ SDValue Wide = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Input);
+ SDValue FpToInt64 = DAG.getNode(Op.getOpcode(), DL, MVT::i64, Wide);
+ return FpToInt64;
+}
+
+SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ const SDValue Input = Op.getOperand(0);
+
+ // TODO: Factor out code common with LowerFP_TO_SINT.
+
+ const EVT InputVT = Input.getValueType();
+ const bool FromF16 = Subtarget->has16BitInsts() && InputVT == MVT::f16;
+
+ if (!FromF16) {
+ // Only the f64 -> i64 case has a custom expansion; everything else is
+ // reported as not handled with an empty SDValue.
+ const bool IsF64ToI64 =
+ Op.getValueType() == MVT::i64 && Input.getValueType() == MVT::f64;
+ return IsF64ToI64 ? LowerFP64_TO_INT(Op, DAG, false) : SDValue();
+ }
+
+ // f16 source: extend to f32 and redo the conversion from there to i64.
+ SDLoc DL(Op);
+ SDValue Wide = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Input);
+ SDValue FpToInt64 = DAG.getNode(Op.getOpcode(), DL, MVT::i64, Wide);
+ return FpToInt64;
+}
+
+SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Scalarizes a vector SIGN_EXTEND_INREG: every element is extended on its
+ // own and the results are packed back into a vector of the original type.
+ const EVT InnerVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ const MVT VecVT = Op.getSimpleValueType();
+ const MVT EltVT = VecVT.getScalarType();
+
+ assert(VecVT.isVector());
+
+ const SDValue Input = Op.getOperand(0);
+ const SDLoc DL(Op);
+
+ // TODO: Don't scalarize on Evergreen?
+ const unsigned Count = VecVT.getVectorNumElements();
+ SmallVector<SDValue, 8> Elts;
+ DAG.ExtractVectorElements(Input, Elts, 0, Count);
+
+ const SDValue InnerEltVT = DAG.getValueType(InnerVT.getScalarType());
+ for (unsigned Idx = 0; Idx != Count; ++Idx) {
+ const SDValue Extended =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, EltVT, Elts[Idx], InnerEltVT);
+ Elts[Idx] = Extended;
+ }
+
+ return DAG.getBuildVector(VecVT, DL, Elts);
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+// True when the known-zero leading bits of Op leave at most 24 bits that may
+// be non-zero.
+static bool isU24(SDValue Op, SelectionDAG &DAG) {
+ APInt Zeros, Ones;
+ const EVT Ty = Op.getValueType();
+ DAG.computeKnownBits(Op, Zeros, Ones);
+
+ const unsigned PossiblySetBits =
+ Ty.getSizeInBits() - Zeros.countLeadingOnes();
+ return PossiblySetBits <= 24;
+}
+
+// True when Op fits a signed 24-bit value, i.e. bit 23 is already a sign bit.
+static bool isI24(SDValue Op, SelectionDAG &DAG) {
+ const EVT Ty = Op.getValueType();
+
+ // Types less than 24-bit should be treated as unsigned 24-bit values.
+ if (Ty.getSizeInBits() < 24)
+ return false;
+
+ // Everything above the non-sign bits must be a copy of the sign.
+ const unsigned NonSignBits = Ty.getSizeInBits() - DAG.ComputeNumSignBits(Op);
+ return NonSignBits < 24;
+}
+
+// Asks SimplifyDemandedBits to simplify operand OpIdx of Node24 given that
+// only its low 24 bits are demanded. Returns whatever SimplifyDemandedBits
+// returns.
+static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
+ TargetLowering::DAGCombinerInfo &DCI) {
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Op = Node24->getOperand(OpIdx);
+ EVT VT = Op.getValueType();
+
+ // A 24-bit operation reads only the low 24 bits of its operand.
+ APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
+ TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
+ return TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI);
+}
+
+// Constant-folds a 32-bit bitfield extract of Width bits starting at bit
+// Offset of Src0. IntTy selects the extension: the final right shift is done
+// in IntTy, so a signed IntTy sign-extends the field and an unsigned one
+// zero-extends it.
+template <typename IntTy>
+static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
+ uint32_t Width, const SDLoc &DL) {
+ // An empty field extracts nothing. Handle it up front: the shifts below
+ // would otherwise shift a 32-bit value by 32, which is undefined.
+ if (Width == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
+ if (Width + Offset < 32) {
+ // Move the field to the top of the word, then shift it back down so the
+ // vacated upper bits are filled according to IntTy's signedness.
+ uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
+ IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
+ return DAG.getConstant(Result, DL, MVT::i32);
+ }
+
+ // The field reaches the top of the word, so a plain shift is enough.
+ return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
+}
+
+// True when at least one user of Val is a memory node marked volatile.
+static bool hasVolatileUser(SDNode *Val) {
+ for (SDNode *User : Val->uses()) {
+ const MemSDNode *Mem = dyn_cast<MemSDNode>(User);
+ if (Mem && Mem->isVolatile())
+ return true;
+ }
+
+ return false;
+}
+
+// Decides whether a load or store of memory type VT should be rewritten to
+// use an equivalent, friendlier memory type.
+bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
+ // i32 vectors are the canonical memory type.
+ const bool AlreadyGood = VT.getScalarType() == MVT::i32 || isTypeLegal(VT);
+ if (AlreadyGood || !VT.isByteSized())
+ return false;
+
+ const unsigned Bytes = VT.getStoreSize();
+
+ switch (Bytes) {
+ case 1:
+ case 2:
+ case 4:
+ // Scalars of these sizes are left alone; vectors of them are combined.
+ if (!VT.isVector())
+ return false;
+ break;
+ case 3:
+ return false;
+ default:
+ break;
+ }
+
+ // Anything wider than 4 bytes must be a whole number of dwords.
+ return Bytes <= 4 || Bytes % 4 == 0;
+}
+
+// Replace load of an illegal type with a load of a friendlier type followed
+// by a bitcast back to the original type.
+SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+ // Volatile loads, non-normal loads and loads with a volatile memory user are left alone.
+ LoadSDNode *LN = cast<LoadSDNode>(N);
+ if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+ return SDValue();
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = LN->getMemoryVT();
+ // Under-aligned loads of legal types are either expanded here or, when not fast, left untouched.
+ unsigned Size = VT.getStoreSize();
+ unsigned Align = LN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = LN->getAddressSpace();
+
+ // Expand unaligned loads earlier than legalization. Due to visitation order
+ // problems during legalization, the emitted instructions to pack and unpack
+ // the bytes again are not eliminated in the case of an unaligned copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (VT.isVector())
+ return scalarizeVectorLoad(LN, DAG);
+
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+ return DAG.getMergeValues(Ops, SDLoc(N));
+ }
+
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+ // Reload with the same chain, pointer and memory operand, but typed as NewVT.
+ SDValue NewLoad
+ = DAG.getLoad(NewVT, SL, LN->getChain(),
+ LN->getBasePtr(), LN->getMemOperand());
+ // Cast back to VT and replace both the value and the chain result of N.
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
+ DCI.CombineTo(N, BC, NewLoad.getValue(1));
+ return SDValue(N, 0);
+}
+
+// Replace store of an illegal type with a store of a bitcast to a friendlier
+// type.
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+ // Volatile and non-normal stores are left alone.
+ StoreSDNode *SN = cast<StoreSDNode>(N);
+ if (SN->isVolatile() || !ISD::isNormalStore(SN))
+ return SDValue();
+
+ EVT VT = SN->getMemoryVT();
+ unsigned Size = VT.getStoreSize();
+ // Under-aligned stores of legal types are either expanded here or, when not fast, left untouched.
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned Align = SN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = SN->getAddressSpace();
+
+ // Expand unaligned stores earlier than legalization. Due to visitation
+ // order problems during legalization, the emitted instructions to pack and
+ // unpack the bytes again are not eliminated in the case of an unaligned
+ // copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (VT.isVector())
+ return scalarizeVectorStore(SN, DAG);
+
+ return expandUnalignedStore(SN, DAG);
+ }
+
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+ SDValue Val = SN->getValue();
+
+ //DCI.AddToWorklist(Val.getNode());
+ // If the stored value has other users, they are redirected to a bitcast of the new value back to VT.
+ bool OtherUses = !Val.hasOneUse();
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
+ if (OtherUses) {
+ SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
+ DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
+ }
+ // Store the bitcast value using the original chain, pointer and memory operand.
+ return DAG.getStore(SN->getChain(), SL, CastVal,
+ SN->getBasePtr(), SN->getMemOperand());
+}
+
+/// Split the 64-bit value \p LHS into two 32-bit components, and apply the
+/// binary operation \p Opc to each half with the matching 32-bit constant
+/// (\p ValLo for the low half, \p ValHi for the high half). The two results
+/// are reassembled into an i64.
+SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
+ DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ uint32_t ValLo, uint32_t ValHi) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ SDValue LoHalf, HiHalf;
+ std::tie(LoHalf, HiHalf) = split64BitValue(LHS, DAG);
+
+ const SDValue LoConst = DAG.getConstant(ValLo, SL, MVT::i32);
+ const SDValue HiConst = DAG.getConstant(ValHi, SL, MVT::i32);
+
+ const SDValue LoResult = DAG.getNode(Opc, SL, MVT::i32, LoHalf, LoConst);
+ const SDValue HiResult = DAG.getNode(Opc, SL, MVT::i32, HiHalf, HiConst);
+
+ // Re-visit the split halves. It's possible one of the new operations was
+ // eliminated, which could simplify the vector.
+ DCI.AddToWorklist(LoHalf.getNode());
+ DCI.AddToWorklist(HiHalf.getNode());
+
+ const SDValue Pair =
+ DAG.getBuildVector(MVT::v2i32, SL, {LoResult, HiResult});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Pair);
+}
+
+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32)) for constant C >= 32
+ //
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+ // common case, splitting this into a move and a 32-bit shift is faster and
+ // the same code size.
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ const ConstantSDNode *AmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!AmtNode)
+ return SDValue();
+
+ const unsigned Amt = AmtNode->getZExtValue();
+ if (Amt < 32)
+ return SDValue();
+
+ const SDValue Wide = N->getOperand(0);
+
+ const SDLoc DL(N);
+ SelectionDAG &DAG = DCI.DAG;
+
+ const SDValue NarrowAmt = DAG.getConstant(Amt - 32, DL, MVT::i32);
+
+ // Only the low word of the source can contribute to the result.
+ const SDValue LoWord = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Wide);
+ const SDValue HiResult =
+ DAG.getNode(ISD::SHL, DL, MVT::i32, LoWord, NarrowAmt);
+
+ const SDValue LoResult = DAG.getConstant(0, DL, MVT::i32);
+
+ const SDValue Pair =
+ DAG.getBuildVector(MVT::v2i32, DL, {LoResult, HiResult});
+ return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Pair);
+}
+
+SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ // Splits an i64 arithmetic shift right by the constant 32 or 63 into
+ // 32-bit operations on the high half of the source:
+ // (sra i64:x, 32) -> build_pair hi_32(x), (sra hi_32(x), 31)
+ // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ const ConstantSDNode *AmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!AmtNode)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ const unsigned Amt = AmtNode->getZExtValue();
+
+ if (Amt != 32 && Amt != 63)
+ return SDValue();
+
+ // Both forms use the sign of x replicated across a whole 32-bit word as
+ // the high half of the result.
+ SDValue HiWord = getHiHalf64(N->getOperand(0), DAG);
+ SDValue SignWord = DAG.getNode(ISD::SRA, DL, MVT::i32, HiWord,
+ DAG.getConstant(31, DL, MVT::i32));
+
+ // A shift by 32 keeps the old high word as the new low word; a shift by 63
+ // leaves nothing but sign bits.
+ SDValue LoResult = (Amt == 32) ? HiWord : SignWord;
+
+ SDValue Pair = DAG.getBuildVector(MVT::v2i32, DL, {LoResult, SignWord});
+ return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Pair);
+}
+
+SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ // srl i64:x, C for C >= 32
+ // =>
+ // build_pair (srl hi_32(x), C - 32), 0
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ const ConstantSDNode *AmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!AmtNode)
+ return SDValue();
+
+ const unsigned Amt = AmtNode->getZExtValue();
+ if (Amt < 32)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ const SDLoc DL(N);
+
+ const SDValue HiIdx = DAG.getConstant(1, DL, MVT::i32);
+ const SDValue HiResult = DAG.getConstant(0, DL, MVT::i32);
+
+ // Only the high word of the source can contribute to the result.
+ const SDValue Halves =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, N->getOperand(0));
+ const SDValue HiWord =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Halves, HiIdx);
+
+ const SDValue NarrowAmt = DAG.getConstant(Amt - 32, DL, MVT::i32);
+ const SDValue LoResult =
+ DAG.getNode(ISD::SRL, DL, MVT::i32, HiWord, NarrowAmt);
+
+ const SDValue Pair =
+ DAG.getBuildVector(MVT::v2i32, DL, {LoResult, HiResult});
+
+ return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Pair);
+}
+
+// Build the 24-bit multiply node(s) for a multiply whose result is \p Size
+// bits wide.
+//
+// i64 multiplies are handled explicitly to avoid unnecessary conversion
+// instructions: if only the legalized i64 mul expansion were matched,
+// SimplifyDemandedBits could not remove them because the separate
+// mul + mulh[su] would give the operands multiple uses.
+static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
+                        SDValue N0, SDValue N1, unsigned Size, bool Signed) {
+  if (Size > 32) {
+    // To let extension instructions in front of the operation be eliminated,
+    // emit a single user here (i.e. not a separate mul_lo + mul_hi) so that
+    // SimplifyDemandedBits will deal with it.
+    unsigned LoHiOpc = AMDGPUISD::MUL_LOHI_U24;
+    if (Signed)
+      LoHiOpc = AMDGPUISD::MUL_LOHI_I24;
+
+    SDValue LoHi = DAG.getNode(LoHiOpc, SL,
+                               DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
+
+    return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
+                       LoHi.getValue(0), LoHi.getValue(1));
+  }
+
+  // Results of at most 32 bits need only the low-half multiply.
+  unsigned LoOpc = AMDGPUISD::MUL_U24;
+  if (Signed)
+    LoOpc = AMDGPUISD::MUL_I24;
+
+  return DAG.getNode(LoOpc, SL, MVT::i32, N0, N1);
+}
+
+/// Turn a scalar integer multiply of up to 64 bits into a 24-bit multiply
+/// when both operands are known to fit in 24 bits.
+///
+/// The unsigned form is tried first, then the signed form. Returns an empty
+/// SDValue for vectors, for types wider than 64 bits, for types of at most
+/// 16 bits on subtargets with 16-bit instructions, and when neither 24-bit
+/// form applies.
+SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+  if (VT.isVector())
+    return SDValue();
+
+  const unsigned Size = VT.getSizeInBits();
+  if (Size > 64)
+    return SDValue();
+
+  // There are i16 integer mul/mad.
+  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  const bool UseU24 =
+      Subtarget->hasMulU24() && isU24(LHS, DAG) && isU24(RHS, DAG);
+  const bool UseI24 = !UseU24 && Subtarget->hasMulI24() &&
+                      isI24(LHS, DAG) && isI24(RHS, DAG);
+  if (!UseU24 && !UseI24)
+    return SDValue();
+
+  SDValue Product;
+  if (UseU24) {
+    LHS = DAG.getZExtOrTrunc(LHS, DL, MVT::i32);
+    RHS = DAG.getZExtOrTrunc(RHS, DL, MVT::i32);
+    Product = getMul24(DAG, DL, LHS, RHS, Size, false);
+  } else {
+    LHS = DAG.getSExtOrTrunc(LHS, DL, MVT::i32);
+    RHS = DAG.getSExtOrTrunc(RHS, DL, MVT::i32);
+    Product = getMul24(DAG, DL, LHS, RHS, Size, true);
+  }
+
+  // We need to use sext even for MUL_U24, because MUL_U24 is used
+  // for signed multiply of 8 and 16-bit types.
+  return DAG.getSExtOrTrunc(Product, DL, VT);
+}
+
+/// Combine a scalar MULHS whose operands are both known 24-bit signed values
+/// into AMDGPUISD::MULHI_I24.
+///
+/// Returns an empty SDValue when the subtarget has no 24-bit signed multiply,
+/// for vector types, for result types wider than 32 bits, or when either
+/// operand is not known to fit in 24 signed bits.
+SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+
+  // MULHI_I24 is built as a single i32 below. Apply the same width guard as
+  // performMulhuCombine so a wider MULHS is never replaced by a sign-extended
+  // 32-bit result; skipping the combine is always safe.
+  if (!Subtarget->hasMulI24() || VT.isVector() || VT.getSizeInBits() > 32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (!isI24(N0, DAG) || !isI24(N1, DAG))
+    return SDValue();
+
+  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+
+  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
+  // Queue the new node so the combiner visits it.
+  DCI.AddToWorklist(Mulhi.getNode());
+  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
+}
+
+/// Combine a scalar MULHU of at most 32 bits whose operands are both known
+/// 24-bit unsigned values into AMDGPUISD::MULHI_U24.
+///
+/// Returns an empty SDValue when the subtarget has no 24-bit unsigned
+/// multiply, for vectors, for types wider than 32 bits, or when either
+/// operand is not known to fit in 24 unsigned bits.
+SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+
+  if (!Subtarget->hasMulU24())
+    return SDValue();
+  if (VT.isVector() || VT.getSizeInBits() > 32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  const bool BothU24 = isU24(LHS, DAG) && isU24(RHS, DAG);
+  if (!BothU24)
+    return SDValue();
+
+  LHS = DAG.getZExtOrTrunc(LHS, DL, MVT::i32);
+  RHS = DAG.getZExtOrTrunc(RHS, DL, MVT::i32);
+
+  SDValue HiProduct =
+      DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, LHS, RHS);
+  // Queue the new node so the combiner visits it.
+  DCI.AddToWorklist(HiProduct.getNode());
+  return DAG.getZExtOrTrunc(HiProduct, DL, VT);
+}
+
+/// Split a MUL_LOHI_I24 / MUL_LOHI_U24 node into the matching pair of
+/// MUL_*24 (low result) and MULHI_*24 (high result) nodes.
+///
+/// The operands are first offered to simplifyI24; if either call reports a
+/// change, nothing is split on this visit and an empty SDValue is returned.
+SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Simplify demanded bits before splitting into multiple users. The second
+  // operand is only tried when the first one did not change.
+  if (simplifyI24(N, 0, DCI))
+    return SDValue();
+  if (simplifyI24(N, 1, DCI))
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  unsigned LoOpc = AMDGPUISD::MUL_U24;
+  unsigned HiOpc = AMDGPUISD::MULHI_U24;
+  if (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24) {
+    LoOpc = AMDGPUISD::MUL_I24;
+    HiOpc = AMDGPUISD::MULHI_I24;
+  }
+
+  SDLoc SL(N);
+
+  SDValue LoPart = DAG.getNode(LoOpc, SL, MVT::i32, LHS, RHS);
+  SDValue HiPart = DAG.getNode(HiOpc, SL, MVT::i32, LHS, RHS);
+  return DAG.getMergeValues({ LoPart, HiPart }, SL);
+}
+
+/// True when \p Val is an integer constant node with all bits set.
+static bool isNegativeOne(SDValue Val) {
+  const ConstantSDNode *AsConst = dyn_cast<ConstantSDNode>(Val);
+  return AsConst && AsConst->isAllOnesValue();
+}
+
+/// True when \p Opc is ISD::CTLZ or ISD::CTLZ_ZERO_UNDEF.
+static bool isCtlzOpc(unsigned Opc) {
+  switch (Opc) {
+  case ISD::CTLZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Build an AMDGPUISD::FFBH_U32 of \p Op, zero-extending the input to i32 and
+/// truncating the result back when \p Op is not already i32.
+///
+/// Returns an empty SDValue when the legality check below rejects the type
+/// that \p Op's type is transformed to.
+SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
+                                          SDValue Op,
+                                          const SDLoc &DL) const {
+  EVT OrigVT = Op.getValueType();
+  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), OrigVT);
+
+  // NOTE(review): as written, a non-i32 legalized type is only rejected on
+  // subtargets with 16-bit instructions (and then only if it is not i16).
+  // Without 16-bit instructions nothing is rejected here -- confirm that this
+  // is the intended condition.
+  if (LegalVT != MVT::i32) {
+    if (Subtarget->has16BitInsts() && LegalVT != MVT::i16)
+      return SDValue();
+  }
+
+  const bool NeedsResize = OrigVT != MVT::i32;
+  if (NeedsResize)
+    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
+
+  SDValue Result = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
+  if (NeedsResize)
+    Result = DAG.getNode(ISD::TRUNCATE, DL, OrigVT, Result);
+
+  return Result;
+}
+
+// The native instructions return -1 on 0 input. Optimize out a select that
+// produces -1 on 0.
+//
+// \p Cond is the select's setcc condition; \p LHS / \p RHS are the select's
+// true / false values. Only comparisons against constant zero are matched.
+//
+// TODO: If zero is not undef, we could also do this if the output is compared
+// against the bitwidth.
+//
+// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
+SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
+                                                 SDValue LHS, SDValue RHS,
+                                                 DAGCombinerInfo &DCI) const {
+  ConstantSDNode *CmpConst = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+  if (!CmpConst || !CmpConst->isNullValue())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+  SDValue Tested = Cond.getOperand(0);
+
+  // Work out which select arm has to be the ctlz and which the -1.
+  SDValue CtlzArm;
+  SDValue MinusOneArm;
+  switch (CC) {
+  case ISD::SETEQ:
+    // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
+    MinusOneArm = LHS;
+    CtlzArm = RHS;
+    break;
+  case ISD::SETNE:
+    // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
+    CtlzArm = LHS;
+    MinusOneArm = RHS;
+    break;
+  default:
+    return SDValue();
+  }
+
+  if (isCtlzOpc(CtlzArm.getOpcode()) &&
+      CtlzArm.getOperand(0) == Tested &&
+      isNegativeOne(MinusOneArm))
+    return getFFBH_U32(DAG, Tested, SL);
+
+  return SDValue();
+}
+
+/// Combines for an ISD::SELECT whose condition is an ISD::SETCC.
+///
+/// Tried in order:
+///  1. If the setcc has a single use and only the select's true value is a
+///     constant, invert the comparison and swap the select operands so the
+///     constant ends up as the false value.
+///  2. For f32 selects with a single-use condition, return whatever
+///     CombineFMinMaxLegacy produces.
+///  3. Otherwise try performCtlzCombine.
+///
+/// Returns an empty SDValue when the condition is not a setcc or nothing
+/// matched.
+SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
+                                                   DAGCombinerInfo &DCI) const {
+  SDValue Cond = N->getOperand(0);
+  if (Cond.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue LHS = Cond.getOperand(0);
+  SDValue RHS = Cond.getOperand(1);
+  SDValue CC = Cond.getOperand(2);
+
+  SDValue True = N->getOperand(1);
+  SDValue False = N->getOperand(2);
+
+  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
+    SelectionDAG &DAG = DCI.DAG;
+    // Each operand only needs to be tested once (the previous form repeated
+    // the identical test on the same operand).
+    if (DAG.isConstantValueOfAnyType(True) &&
+        !DAG.isConstantValueOfAnyType(False)) {
+      // Swap cmp + select pair to move constant to false input.
+      // This will allow using VOPC cndmasks more often.
+      // select (setcc x, y), k, x -> select (setccinv x, y), x, k
+
+      SDLoc SL(N);
+      ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                                            LHS.getValueType().isInteger());
+
+      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
+      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
+    }
+  }
+
+  if (VT == MVT::f32 && Cond.hasOneUse()) {
+    SDValue MinMax
+      = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+    // Revisit this node so we can catch min3/max3/med3 patterns.
+    // (Currently disabled.)
+    //DCI.AddToWorklist(MinMax.getNode());
+    return MinMax;
+  }
+
+  // There's no reason to not do this if the condition has other uses.
+  return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
+}
+
+/// Target DAG combines, dispatched on the opcode of \p N.
+///
+/// Returns the replacement value when a combine produces one, and an empty
+/// SDValue otherwise. Note that some cases (the 24-bit multiplies and the
+/// demanded-bits part of the BFE case) simplify operands in place and still
+/// return an empty SDValue.
+SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ switch(N->getOpcode()) {
+ default:
+ break;
+ case ISD::BITCAST: {
+ EVT DestVT = N->getValueType(0);
+
+ // Push casts through vector builds. This helps avoid emitting a large
+ // number of copies when materializing floating point vector constants.
+ //
+ // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
+ // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
+ if (DestVT.isVector()) {
+ SDValue Src = N->getOperand(0);
+ if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+ EVT SrcVT = Src.getValueType();
+ unsigned NElts = DestVT.getVectorNumElements();
+
+ // Only handled when source and destination have the same element count,
+ // so every source element maps to exactly one destination element.
+ if (SrcVT.getVectorNumElements() == NElts) {
+ EVT DestEltVT = DestVT.getVectorElementType();
+
+ SmallVector<SDValue, 8> CastedElts;
+ SDLoc SL(N);
+ for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
+ SDValue Elt = Src.getOperand(I);
+ CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
+ }
+
+ return DAG.getBuildVector(DestVT, SL, CastedElts);
+ }
+ }
+ }
+
+ // Everything below is skipped for scalar destinations that are not 64 bits
+ // wide.
+ if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
+ break;
+
+ // Fold bitcasts of constants.
+ //
+ // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
+ // TODO: Generalize and move to DAGCombiner
+ SDValue Src = N->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
+ assert(Src.getValueType() == MVT::i64);
+ SDLoc SL(N);
+ uint64_t CVal = C->getZExtValue();
+ // NOTE(review): this builds the two-element BUILD_VECTOR directly with
+ // DestVT, whereas the ConstantFP path below builds v2i32 and then bitcasts
+ // to DestVT. Confirm DestVT can only be v2i32 when this path is reached.
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ }
+
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
+ // Same split for an FP constant, going through its integer bit pattern.
+ const APInt &Val = C->getValueAPF().bitcastToAPInt();
+ SDLoc SL(N);
+ uint64_t CVal = Val.getZExtValue();
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+
+ return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
+ }
+
+ break;
+ }
+ // The three shift combines only run once the combine level has reached
+ // AfterLegalizeDAG.
+ case ISD::SHL: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performShlCombine(N, DCI);
+ }
+ case ISD::SRL: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performSrlCombine(N, DCI);
+ }
+ case ISD::SRA: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performSraCombine(N, DCI);
+ }
+ case ISD::MUL:
+ return performMulCombine(N, DCI);
+ case ISD::MULHS:
+ return performMulhsCombine(N, DCI);
+ case ISD::MULHU:
+ return performMulhuCombine(N, DCI);
+ case AMDGPUISD::MUL_I24:
+ case AMDGPUISD::MUL_U24:
+ case AMDGPUISD::MULHI_I24:
+ case AMDGPUISD::MULHI_U24: {
+ // If the first call to simplify is successful, then N may end up being
+ // deleted, so we shouldn't call simplifyI24 again.
+ simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
+ return SDValue();
+ }
+ case AMDGPUISD::MUL_LOHI_I24:
+ case AMDGPUISD::MUL_LOHI_U24:
+ return performMulLoHi24Combine(N, DCI);
+ case ISD::SELECT:
+ return performSelectCombine(N, DCI);
+ case AMDGPUISD::BFE_I32:
+ case AMDGPUISD::BFE_U32: {
+ assert(!N->getValueType(0).isVector() &&
+ "Vector handling of BFE not implemented");
+ // Operand 0 is the source value, operand 1 the offset and operand 2 the
+ // width; only the low 5 bits of constant offset/width are used here.
+ ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!Width)
+ break;
+
+ uint32_t WidthVal = Width->getZExtValue() & 0x1f;
+ // A zero-width extract is folded to the constant 0.
+ if (WidthVal == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Offset)
+ break;
+
+ SDValue BitsFrom = N->getOperand(0);
+ uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
+
+ bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
+
+ if (OffsetVal == 0) {
+ // This is already sign / zero extended, so try to fold away extra BFEs.
+ unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
+
+ unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
+ if (OpSignBits >= SignBits)
+ return BitsFrom;
+
+ EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
+ if (Signed) {
+ // This is a sign_extend_inreg. Replace it to take advantage of existing
+ // DAG Combines. If not eliminated, we will match back to BFE during
+ // selection.
+
+ // TODO: The sext_inreg of extended types ends, although we could
+ // handle them in a single BFE.
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
+ DAG.getValueType(SmallVT));
+ }
+
+ return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
+ }
+
+ // Constant source: fold the whole extract at compile time.
+ if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
+ if (Signed) {
+ return constantFoldBFE<int32_t>(DAG,
+ CVal->getSExtValue(),
+ OffsetVal,
+ WidthVal,
+ DL);
+ }
+
+ return constantFoldBFE<uint32_t>(DAG,
+ CVal->getZExtValue(),
+ OffsetVal,
+ WidthVal,
+ DL);
+ }
+
+ // The field reaches bit 31 (or beyond), so a plain right shift by the
+ // offset is emitted instead: SRA for the signed form, SRL otherwise.
+ if ((OffsetVal + WidthVal) >= 32) {
+ SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
+ return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
+ BitsFrom, ShiftVal);
+ }
+
+ // With a single use of the source, only bits [OffsetVal, OffsetVal +
+ // WidthVal) are demanded; try to simplify the source accordingly.
+ if (BitsFrom.hasOneUse()) {
+ APInt Demanded = APInt::getBitsSet(32,
+ OffsetVal,
+ OffsetVal + WidthVal);
+
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+ TLI.SimplifyDemandedBits(BitsFrom, Demanded,
+ KnownZero, KnownOne, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ }
+
+ break;
+ }
+ case ISD::LOAD:
+ return performLoadCombine(N, DCI);
+ case ISD::STORE:
+ return performStoreCombine(N, DCI);
+ }
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+/// Return a register node of type \p VT for the virtual register associated
+/// with live-in \p Reg.
+///
+/// If \p Reg is not yet a live-in of the function, a new virtual register of
+/// class \p RC is created and registered as its live-in copy; otherwise the
+/// existing live-in virtual register is reused.
+SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+                                                   const TargetRegisterClass *RC,
+                                                   unsigned Reg, EVT VT) const {
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+
+  if (MRI.isLiveIn(Reg))
+    return DAG.getRegister(MRI.getLiveInVirtReg(Reg), VT);
+
+  const unsigned NewVReg = MRI.createVirtualRegister(RC);
+  MRI.addLiveIn(Reg, NewVReg);
+  return DAG.getRegister(NewVReg, VT);
+}
+
+/// Return the byte offset of implicit parameter \p Param.
+///
+/// The implicit parameters start at the ABI argument offset of \p MFI rounded
+/// up to the subtarget's implicit-arg-pointer alignment; GRID_DIM sits at
+/// that base and GRID_OFFSET 4 bytes after it.
+uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
+    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
+  const unsigned ImplicitArgAlign = Subtarget->getAlignmentForImplicitArgPtr();
+  const uint64_t ImplicitBase =
+      alignTo(MFI->getABIArgOffset(), ImplicitArgAlign);
+
+  switch (Param) {
+  case GRID_DIM:
+    return ImplicitBase;
+  case GRID_OFFSET:
+    return ImplicitBase + 4;
+  }
+
+  llvm_unreachable("unexpected implicit parameter type");
+}
+
+// Expands to a complete switch case that returns the enumerator's own name
+// as a string literal. No trailing semicolon is needed at the use site.
+#define NODE_NAME_CASE(node)                                                   \
+  case AMDGPUISD::node:                                                        \
+    return #node;
+
+/// Map an AMDGPUISD opcode to its printable name.
+///
+/// Returns nullptr for the three range-marker enumerators and for any value
+/// that is not an AMDGPUISD enumerator.
+const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (static_cast<AMDGPUISD::NodeType>(Opcode)) {
+  // Range markers carry no name of their own.
+  case AMDGPUISD::FIRST_NUMBER:
+  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER:
+  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER:
+    break;
+
+  // AMDIL DAG nodes
+  NODE_NAME_CASE(CALL)
+  NODE_NAME_CASE(UMUL)
+  NODE_NAME_CASE(BRANCH_COND)
+
+  // AMDGPU DAG nodes
+  NODE_NAME_CASE(ENDPGM)
+  NODE_NAME_CASE(RETURN)
+  NODE_NAME_CASE(DWORDADDR)
+  NODE_NAME_CASE(FRACT)
+  NODE_NAME_CASE(SETCC)
+  NODE_NAME_CASE(SETREG)
+  NODE_NAME_CASE(FMA_W_CHAIN)
+  NODE_NAME_CASE(FMUL_W_CHAIN)
+  NODE_NAME_CASE(CLAMP)
+  NODE_NAME_CASE(COS_HW)
+  NODE_NAME_CASE(SIN_HW)
+  NODE_NAME_CASE(FMAX_LEGACY)
+  NODE_NAME_CASE(FMIN_LEGACY)
+  NODE_NAME_CASE(FMAX3)
+  NODE_NAME_CASE(SMAX3)
+  NODE_NAME_CASE(UMAX3)
+  NODE_NAME_CASE(FMIN3)
+  NODE_NAME_CASE(SMIN3)
+  NODE_NAME_CASE(UMIN3)
+  NODE_NAME_CASE(FMED3)
+  NODE_NAME_CASE(SMED3)
+  NODE_NAME_CASE(UMED3)
+  NODE_NAME_CASE(URECIP)
+  NODE_NAME_CASE(DIV_SCALE)
+  NODE_NAME_CASE(DIV_FMAS)
+  NODE_NAME_CASE(DIV_FIXUP)
+  NODE_NAME_CASE(TRIG_PREOP)
+  NODE_NAME_CASE(RCP)
+  NODE_NAME_CASE(RSQ)
+  NODE_NAME_CASE(RCP_LEGACY)
+  NODE_NAME_CASE(RSQ_LEGACY)
+  NODE_NAME_CASE(FMUL_LEGACY)
+  NODE_NAME_CASE(RSQ_CLAMP)
+  NODE_NAME_CASE(LDEXP)
+  NODE_NAME_CASE(FP_CLASS)
+  NODE_NAME_CASE(DOT4)
+  NODE_NAME_CASE(CARRY)
+  NODE_NAME_CASE(BORROW)
+  NODE_NAME_CASE(BFE_U32)
+  NODE_NAME_CASE(BFE_I32)
+  NODE_NAME_CASE(BFI)
+  NODE_NAME_CASE(BFM)
+  NODE_NAME_CASE(FFBH_U32)
+  NODE_NAME_CASE(FFBH_I32)
+  NODE_NAME_CASE(MUL_U24)
+  NODE_NAME_CASE(MUL_I24)
+  NODE_NAME_CASE(MULHI_U24)
+  NODE_NAME_CASE(MULHI_I24)
+  NODE_NAME_CASE(MUL_LOHI_U24)
+  NODE_NAME_CASE(MUL_LOHI_I24)
+  NODE_NAME_CASE(MAD_U24)
+  NODE_NAME_CASE(MAD_I24)
+  NODE_NAME_CASE(TEXTURE_FETCH)
+  NODE_NAME_CASE(EXPORT)
+  NODE_NAME_CASE(EXPORT_DONE)
+  NODE_NAME_CASE(R600_EXPORT)
+  NODE_NAME_CASE(CONST_ADDRESS)
+  NODE_NAME_CASE(REGISTER_LOAD)
+  NODE_NAME_CASE(REGISTER_STORE)
+  NODE_NAME_CASE(LOAD_INPUT)
+  NODE_NAME_CASE(SAMPLE)
+  NODE_NAME_CASE(SAMPLEB)
+  NODE_NAME_CASE(SAMPLED)
+  NODE_NAME_CASE(SAMPLEL)
+  NODE_NAME_CASE(CVT_F32_UBYTE0)
+  NODE_NAME_CASE(CVT_F32_UBYTE1)
+  NODE_NAME_CASE(CVT_F32_UBYTE2)
+  NODE_NAME_CASE(CVT_F32_UBYTE3)
+  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
+  NODE_NAME_CASE(CONST_DATA_PTR)
+  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+  NODE_NAME_CASE(KILL)
+  NODE_NAME_CASE(SENDMSG)
+  NODE_NAME_CASE(INTERP_MOV)
+  NODE_NAME_CASE(INTERP_P1)
+  NODE_NAME_CASE(INTERP_P2)
+  NODE_NAME_CASE(STORE_MSKOR)
+  NODE_NAME_CASE(LOAD_CONSTANT)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
+  NODE_NAME_CASE(ATOMIC_INC)
+  NODE_NAME_CASE(ATOMIC_DEC)
+  NODE_NAME_CASE(BUFFER_LOAD)
+  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  }
+  return nullptr;
+}
+
+/// Provide a square-root estimate for f32 operands as an AMDGPUISD::RSQ node,
+/// requesting zero refinement steps.
+///
+/// Any other type yields an empty SDValue and leaves \p RefinementSteps
+/// untouched. \p Enabled, \p UseOneConstNR and \p Reciprocal are not consulted.
+SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
+                                              SelectionDAG &DAG, int Enabled,
+                                              int &RefinementSteps,
+                                              bool &UseOneConstNR,
+                                              bool Reciprocal) const {
+  EVT VT = Operand.getValueType();
+
+  // TODO: There is also f64 rsq instruction, but the documentation is less
+  // clear on its precision.
+  if (VT != MVT::f32)
+    return SDValue();
+
+  RefinementSteps = 0;
+  return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
+}
+
+/// Provide a reciprocal estimate for f32 operands as an AMDGPUISD::RCP node,
+/// requesting zero refinement steps.
+///
+/// Any other type yields an empty SDValue and leaves \p RefinementSteps
+/// untouched. \p Enabled is not consulted.
+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+                                               SelectionDAG &DAG, int Enabled,
+                                               int &RefinementSteps) const {
+  EVT VT = Operand.getValueType();
+
+  // TODO: There is also f64 rcp instruction, but the documentation is less
+  // clear on its precision.
+  if (VT != MVT::f32)
+    return SDValue();
+
+  // Reciprocal, < 1 ulp error.
+  //
+  // This reciprocal approximation converges to < 0.5 ulp error with one
+  // Newton-Raphson step performed with two fused multiply-adds (FMAs).
+  RefinementSteps = 0;
+  return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+}
+
+/// Report known-zero / known-one bits for AMDGPU target nodes.
+///
+/// Both outputs are first cleared ("nothing known"). Then:
+///  - CARRY / BORROW: the upper 31 bits are known zero.
+///  - BFE_U32 with a constant width: the bits above the low (width & 0x1f)
+///    bits are known zero.
+/// Nothing is reported for BFE_I32 or for any other opcode.
+void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op,
+    APInt &KnownZero,
+    APInt &KnownOne,
+    const SelectionDAG &DAG,
+    unsigned Depth) const {
+
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+
+  unsigned Opc = Op.getOpcode();
+
+  switch (Opc) {
+  default:
+    break;
+  case AMDGPUISD::CARRY:
+  case AMDGPUISD::BORROW: {
+    KnownZero = APInt::getHighBitsSet(32, 31);
+    break;
+  }
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    // Operand 2 is the field width; without a constant nothing can be said.
+    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!CWidth)
+      return;
+
+    unsigned BitWidth = 32;
+    uint32_t Width = CWidth->getZExtValue() & 0x1f;
+
+    if (Opc == AMDGPUISD::BFE_U32)
+      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+
+    break;
+  }
+  }
+}
+
+/// Return a lower bound on the number of sign bits of an AMDGPU target node.
+///
+///  - BFE_I32 with a constant width in [1, 32]: 32 - width + 1, improved by
+///    the source's own sign-bit count when the offset operand is constant 0.
+///  - BFE_U32 with a constant width: 32 - (width & 0x1f).
+///  - CARRY / BORROW: 31.
+///  - Anything else (including non-constant or out-of-range widths): 1.
+unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
+    SDValue Op,
+    const SelectionDAG &DAG,
+    unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  case AMDGPUISD::BFE_I32: {
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!Width)
+      return 1;
+
+    // Guard the subtraction below: a width of 0 would yield 33 and a width
+    // above 32 would yield 0 or wrap around, both outside the valid [1, 32]
+    // range. Answer conservatively in those cases.
+    const uint64_t WidthVal = Width->getZExtValue();
+    if (WidthVal == 0 || WidthVal > 32)
+      return 1;
+
+    unsigned SignBits = static_cast<unsigned>(32 - WidthVal + 1);
+    if (!isNullConstant(Op.getOperand(1)))
+      return SignBits;
+
+    // TODO: Could probably figure something out with non-0 offsets.
+    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+    return std::max(SignBits, Op0SignBits);
+  }
+
+  case AMDGPUISD::BFE_U32: {
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
+  }
+
+  case AMDGPUISD::CARRY:
+  case AMDGPUISD::BORROW:
+    return 31;
+
+  default:
+    return 1;
+  }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
new file mode 100644
index 000000000000..5cc5efb331e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -0,0 +1,338 @@
+//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition of the TargetLowering class that is common
+/// to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class AMDGPUMachineFunction;
+class AMDGPUSubtarget;
+class MachineRegisterInfo;
+
+/// TargetLowering implementation shared by the AMD GPU targets.
+///
+/// Declares the custom lowering hooks, the DAG-combine helpers and a few
+/// utilities (live-in registers, implicit parameter offsets). The class is
+/// abstract: PostISelFolding is pure virtual.
+class AMDGPUTargetLowering : public TargetLowering {
+private:
+ /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
+ /// legalized from a smaller type VT. Need to match pre-legalized type because
+ /// the generic legalization inserts the add/sub between the select and
+ /// compare.
+ SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
+
+protected:
+ /// Subtarget this lowering object was created for.
+ const AMDGPUSubtarget *Subtarget;
+
+ // Custom lowering helpers, one per operation.
+ SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ // NOTE(review): a doc comment reading "Split a vector store into multiple
+ // scalar stores. \returns The resulting chain." sat here with no declaration
+ // following it; it does not describe LowerFREM below and looks stale.
+
+ SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+
+protected:
+ // DAG-combine helpers. Each returns the replacement value, or an empty
+ // SDValue when it does not apply.
+ bool shouldCombineMemoryType(EVT VT) const;
+ SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ uint32_t ValLo, uint32_t ValHi) const;
+ SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
+ SDValue RHS, DAGCombinerInfo &DCI) const;
+ SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
+
+ virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+ SelectionDAG &DAG) const;
+
+ /// Return 64-bit value Op as two 32-bit integers.
+ std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
+ SelectionDAG &DAG) const;
+ // Presumably the low / high 32-bit half of a 64-bit value, judging by the
+ // names and by how performSraCombine uses getHiHalf64 -- TODO confirm.
+ SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
+
+ /// \brief Split a vector load into 2 loads of half the vector.
+ SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+
+ /// \brief Split a vector store into 2 stores of half the vector.
+ SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
+ void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results) const;
+ void analyzeFormalArgumentsCompute(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const;
+ void AnalyzeFormalArguments(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const;
+ void AnalyzeReturn(CCState &State,
+ const SmallVectorImpl<ISD::OutputArg> &Outs) const;
+
+public:
+ AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
+
+ // TargetLowering cost / legality queries.
+ bool isFAbsFree(EVT VT) const override;
+ bool isFNegFree(EVT VT) const override;
+ bool isTruncateFree(EVT Src, EVT Dest) const override;
+ bool isTruncateFree(Type *Src, Type *Dest) const override;
+
+ bool isZExtFree(Type *Src, Type *Dest) const override;
+ bool isZExtFree(EVT Src, EVT Dest) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+ MVT getVectorIdxTy(const DataLayout &) const override;
+ bool isSelectSupported(SelectSupportKind) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool ShouldShrinkFPConstant(EVT VT) const override;
+ bool shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtType,
+ EVT ExtVT) const override;
+
+ bool isLoadBitCastBeneficial(EVT, EVT) const final;
+
+ bool storeOfVectorConstantIsCheap(EVT MemVT,
+ unsigned NumElem,
+ unsigned AS) const override;
+ bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ void ReplaceNodeResults(SDNode * N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue RHS, SDValue True, SDValue False,
+ SDValue CC, DAGCombinerInfo &DCI) const;
+
+ /// \returns the printable name of AMDGPUISD opcode \p Opcode.
+ const char* getTargetNodeName(unsigned Opcode) const override;
+
+ /// Always reports fsqrt as cheap, regardless of the operand.
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+ return true;
+ }
+ SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps, bool &UseOneConstNR,
+ bool Reciprocal) const override;
+ SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps) const override;
+
+ /// Must be provided by the concrete subtarget lowering classes.
+ virtual SDNode *PostISelFolding(MachineSDNode *N,
+ SelectionDAG &DAG) const = 0;
+
+ /// \brief Determine which bits of \p Op are known to be either zero or one
+ /// and return them in the \p KnownZero and \p KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+ /// MachineFunction.
+ ///
+ /// \returns a RegisterSDNode representing Reg.
+ virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const;
+
+ /// Kinds of implicit parameter understood by getImplicitParameterOffset.
+ enum ImplicitParameter {
+ FIRST_IMPLICIT,
+ GRID_DIM = FIRST_IMPLICIT,
+ GRID_OFFSET,
+ };
+
+ /// \brief Helper function that returns the byte offset of the given
+ /// type of implicit parameter.
+ uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+ const ImplicitParameter Param) const;
+};
+
+namespace AMDGPUISD {
+
+/// Target-specific SelectionDAG opcodes for the AMD GPU targets.
+///
+/// FIRST_NUMBER, FIRST_MEM_OPCODE_NUMBER and LAST_AMDGPU_ISD_NUMBER are range
+/// markers rather than real nodes. Every other enumerator has an entry in
+/// AMDGPUTargetLowering::getTargetNodeName; add one there when adding a node.
+enum NodeType : unsigned {
+ // AMDIL ISD Opcodes
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ CALL, // Function call based on a single integer
+ UMUL, // 32bit unsigned multiplication
+ BRANCH_COND,
+ // End AMDIL ISD Opcodes
+ ENDPGM,
+ RETURN,
+ DWORDADDR,
+ FRACT,
+ CLAMP,
+ // This is SETCC with the full mask result which is used for a compare with a
+ // result bit per item in the wavefront.
+ SETCC,
+ SETREG,
+ // FP ops with input and output chain.
+ FMA_W_CHAIN,
+ FMUL_W_CHAIN,
+
+ // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
+ // Denormals handled on some parts.
+ COS_HW,
+ SIN_HW,
+ FMAX_LEGACY,
+ FMIN_LEGACY,
+ FMAX3,
+ SMAX3,
+ UMAX3,
+ FMIN3,
+ SMIN3,
+ UMIN3,
+ FMED3,
+ SMED3,
+ UMED3,
+ URECIP,
+ DIV_SCALE,
+ DIV_FMAS,
+ DIV_FIXUP,
+ TRIG_PREOP, // 1 ULP max error for f64
+
+ // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
+ // For f64, max error 2^29 ULP, handles denormals.
+ RCP,
+ RSQ,
+ RCP_LEGACY,
+ RSQ_LEGACY,
+ FMUL_LEGACY,
+ RSQ_CLAMP,
+ LDEXP,
+ FP_CLASS,
+ DOT4,
+ CARRY,
+ BORROW,
+ BFE_U32, // Extract range of bits with zero extension to 32-bits.
+ BFE_I32, // Extract range of bits with sign extension to 32-bits.
+ BFI, // (src0 & src1) | (~src0 & src2)
+ BFM, // Insert a range of bits into a 32-bit word.
+ FFBH_U32, // ctlz with -1 if input is zero.
+ FFBH_I32,
+ // 24-bit multiplies: low result, high result, multiply-add, and the
+ // combined lo/hi form that is split by performMulLoHi24Combine.
+ MUL_U24,
+ MUL_I24,
+ MULHI_U24,
+ MULHI_I24,
+ MAD_U24,
+ MAD_I24,
+ MUL_LOHI_I24,
+ MUL_LOHI_U24,
+ TEXTURE_FETCH,
+ EXPORT, // exp on SI+
+ EXPORT_DONE, // exp on SI+ with done bit set
+ R600_EXPORT,
+ CONST_ADDRESS,
+ REGISTER_LOAD,
+ REGISTER_STORE,
+ LOAD_INPUT,
+ SAMPLE,
+ SAMPLEB,
+ SAMPLED,
+ SAMPLEL,
+
+ // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
+ CVT_F32_UBYTE0,
+ CVT_F32_UBYTE1,
+ CVT_F32_UBYTE2,
+ CVT_F32_UBYTE3,
+ /// This node is for VLIW targets and it is used to represent a vector
+ /// that is stored in consecutive registers with the same channel.
+ /// For example:
+ /// |X |Y|Z|W|
+ /// T0|v.x| | | |
+ /// T1|v.y| | | |
+ /// T2|v.z| | | |
+ /// T3|v.w| | | |
+ BUILD_VERTICAL_VECTOR,
+ /// Pointer to the start of the shader's constant data.
+ CONST_DATA_PTR,
+ SENDMSG,
+ INTERP_MOV,
+ INTERP_P1,
+ INTERP_P2,
+ PC_ADD_REL_OFFSET,
+ KILL,
+ // Enumerators from here on are numbered starting at
+ // ISD::FIRST_TARGET_MEMORY_OPCODE.
+ FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ STORE_MSKOR,
+ LOAD_CONSTANT,
+ TBUFFER_STORE_FORMAT,
+ ATOMIC_CMP_SWAP,
+ ATOMIC_INC,
+ ATOMIC_DEC,
+ BUFFER_LOAD,
+ BUFFER_LOAD_FORMAT,
+ LAST_AMDGPU_ISD_NUMBER
+};
+
+
+} // End namespace AMDGPUISD
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
new file mode 100644
index 000000000000..e4dc6599e156
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -0,0 +1,115 @@
+//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Implementation of the TargetInstrInfo class that is common to all
+/// AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#include "AMDGPUGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void AMDGPUInstrInfo::anchor() {}
+
+AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
+ : AMDGPUGenInstrInfo(-1, -1), ST(ST) {}
+
+// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into 2 16 store batches.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to deal with saying different
+// address space loads should be clustered, and ones which might cause bank
+// conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+ int64_t Offset0, int64_t Offset1,
+ unsigned NumLoads) const {
+ assert(Offset1 > Offset0 &&
+ "Second offset should be larger than first offset!");
+ // If we have at most 16 loads in a row, and the offsets are less than 64
+ // bytes apart, then schedule together.
+
+ // A cacheline is 64 bytes (for global memory).
+ return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
+}
+
+int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
+ switch (Channels) {
+ default: return Opcode; // Any other channel count: Opcode is returned as-is.
+ case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
+ case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
+ case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
+ }
+}
+
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+enum SIEncodingFamily {
+ SI = 0,
+ VI = 1
+};
+
+// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
+// header files, so we need to wrap it in a function that takes unsigned
+// instead.
+namespace llvm {
+namespace AMDGPU {
+static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+ return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
+}
+}
+}
+
+static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
+ switch (ST.getGeneration()) {
+ case AMDGPUSubtarget::SOUTHERN_ISLANDS:
+ case AMDGPUSubtarget::SEA_ISLANDS:
+ return SIEncodingFamily::SI;
+ case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+ return SIEncodingFamily::VI;
+
+ // FIXME: This should never be called for r600 GPUs.
+ case AMDGPUSubtarget::R600:
+ case AMDGPUSubtarget::R700:
+ case AMDGPUSubtarget::EVERGREEN:
+ case AMDGPUSubtarget::NORTHERN_ISLANDS:
+ return SIEncodingFamily::SI;
+ }
+
+ llvm_unreachable("Unknown subtarget generation!");
+}
+
+int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
+ int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST));
+
+ // -1 means that Opcode is already a native instruction.
+ if (MCOp == -1)
+ return Opcode;
+
+ // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+ // no encoding in the given subtarget generation.
+ if (MCOp == (uint16_t)-1)
+ return -1;
+
+ return MCOp;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
new file mode 100644
index 000000000000..bd8e389639f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -0,0 +1,57 @@
+//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Contains the definition of a TargetInstrInfo class that is common
+/// to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+
+class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
+private:
+ const AMDGPUSubtarget &ST;
+
+ virtual void anchor();
+
+public:
+ explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
+
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const override;
+
+ /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
+ /// Return -1 if the target-specific opcode for the pseudo instruction does
+ /// not exist. If Opcode is not a pseudo instruction, this is identity.
+ int pseudoToMCOpcode(int Opcode) const;
+
+ /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
+ /// equivalent opcode that writes \p Channels Channels.
+ int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
new file mode 100644
index 000000000000..e7b40016e272
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -0,0 +1,330 @@
+//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains DAG node definitions for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Profiles
+//===----------------------------------------------------------------------===//
+
+def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
+]>;
+
+def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPULdExpOp : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPUFPClassOp : SDTypeProfile<1, 2,
+ [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
+>;
+
+def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
+ [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
+>;
+
+// float, float, float, vcc
+def AMDGPUFmasOp : SDTypeProfile<1, 4,
+ [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>]
+>;
+
+def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Nodes
+//
+
+def AMDGPUconstdata_ptr : SDNode<
+ "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
+ SDTCisVT<0, iPTR>]>
+>;
+
+// The argument to this node is a dword address.
+def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
+
+def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
+def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
+
+// out = a - floor(a)
+def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+
+// out = 1.0 / a
+def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+
+// Legacy variants of rcp (out = 1.0 / a) and rsq (out = 1.0 / sqrt(a)).
+def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
+def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a) result clamped to +/- max_float.
+def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
+
+def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+
+def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
+
+// out = max(a, b) a and b are floats, where a nan comparison fails.
+// This is not commutative because this gives the second operand:
+// x < nan ? x : nan -> nan
+// nan < x ? nan : x -> x
+def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
+ []
+>;
+
+def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]
+>;
+
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+
+// out = max(a, b) a and b are signed ints
+def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = max(a, b) a and b are unsigned ints
+def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a and b are floats, where a nan comparison fails.
+def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
+ []
+>;
+
+// FIXME: TableGen doesn't like commutative instructions with more
+// than 2 operands.
+// out = max(a, b, c) a, b and c are floats
+def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b, and c are signed ints
+def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b and c are unsigned ints
+def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are floats
+def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are signed ints
+def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are unsigned ints
+def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0
+def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
+
+// out = (src1 > src0) ? 1 : 0
+def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;
+
+def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
+ SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
+]>;
+
+def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
+
+def AMDGPUSetRegOp : SDTypeProfile<0, 2, [
+ SDTCisInt<0>, SDTCisInt<1>
+]>;
+
+def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
+ SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
+ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
+ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
+ SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
+ SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
+ SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
+ SDTIntToFPOp, []>;
+
+
+// urecip - This operation is a helper for integer division, it returns the
+// result of 1 / a as a fractional unsigned integer.
+// out = (2^32 / a) + e
+// e is rounding error
+def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
+
+// Special case divide preop and flags.
+def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
+
+// Special case divide FMA with scale and flags (src0 = Quotient,
+// src1 = Denominator, src2 = Numerator).
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
+
+// Single or double precision division fixup.
+// Special case divide fixup and flags(src0 = Quotient, src1 =
+// Denominator, src2 = Numerator).
+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+
+// Look Up 2.0 / pi src0 with segment select src1[4:0]
+def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
+
+def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
+ SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
+ [SDNPHasChain, SDNPMayLoad]>;
+
+def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
+ SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
+ [SDNPHasChain, SDNPMayStore]>;
+
+// MSKOR instructions are atomic memory instructions used mainly for storing
+// 8-bit and 16-bit values. The definition is:
+//
+// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src)
+//
+// src0: vec4(src, 0, 0, mask)
+// src1: dst - rat offset (aka pointer) in dwords
+def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
+ SDTypeProfile<0, 2, []>,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
+ SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+
+def AMDGPUround : SDNode<"ISD::FROUND",
+ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
+
+def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+
+def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
+def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
+
+// Signed and unsigned 24-bit multiply. The highest 8-bits are ignored
+// when performing the multiply. The result is a 32-bit value.
+def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]
+>;
+def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]
+>;
+
+def AMDGPUmulhi_u24 : SDNode<"AMDGPUISD::MULHI_U24", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]
+>;
+def AMDGPUmulhi_i24 : SDNode<"AMDGPUISD::MULHI_I24", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]
+>;
+
+def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
+ []
+>;
+def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
+ []
+>;
+
+def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp,
+ []
+>;
+
+def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
+ []
+>;
+
+def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+
+def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
+ SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPInGlue]>;
+
+def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV",
+ SDTypeProfile<1, 3, [SDTCisFP<0>]>,
+ [SDNPInGlue]>;
+
+def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1",
+ SDTypeProfile<1, 3, [SDTCisFP<0>]>,
+ [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2",
+ SDTypeProfile<1, 4, [SDTCisFP<0>]>,
+ [SDNPInGlue]>;
+
+
+def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+// SI+ export
+def AMDGPUExportOp : SDTypeProfile<0, 8, [
+ SDTCisInt<0>, // i8 en
+ SDTCisInt<1>, // i1 vm
+ // skip done
+ SDTCisInt<2>, // i8 tgt
+ SDTCisSameAs<3, 1>, // i1 compr
+ SDTCisFP<4>, // f32 src0
+ SDTCisSameAs<5, 4>, // f32 src1
+ SDTCisSameAs<6, 4>, // f32 src2
+ SDTCisSameAs<7, 4> // f32 src3
+]>;
+
+def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp,
+ [SDNPHasChain, SDNPMayStore]>;
+
+def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+
+
+def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Branch instruction where second and third are basic blocks
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+ SDTCisVT<0, OtherVT>
+ ]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Call/Return DAG Nodes
+//===----------------------------------------------------------------------===//
+def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
new file mode 100644
index 000000000000..513df3a9cdf3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -0,0 +1,677 @@
+//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction defs that are common to all hw codegen
+// targets.
+//
+//===----------------------------------------------------------------------===//
+
+class AMDGPUInst <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> : Instruction {
+ field bit isRegisterLoad = 0;
+ field bit isRegisterStore = 0;
+
+ let Namespace = "AMDGPU";
+ let OutOperandList = outs;
+ let InOperandList = ins;
+ let AsmString = asm;
+ let Pattern = pattern;
+ let Itinerary = NullALU;
+
+ // SoftFail is a field the disassembler can use to provide a way for
+ // instructions to not match without killing the whole decode process. It is
+ // mainly used for ARM, but Tablegen expects this field to exist or it fails
+ // to build the decode table.
+ field bits<64> SoftFail = 0;
+
+ let DecoderNamespace = Namespace;
+
+ let TSFlags{63} = isRegisterLoad;
+ let TSFlags{62} = isRegisterStore;
+}
+
+class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern> {
+
+ field bits<32> Inst = 0xffffffff;
+}
+
+def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">;
+def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
+def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">;
+def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
+
+def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+
+let OperandType = "OPERAND_IMMEDIATE" in {
+
+def u32imm : Operand<i32> {
+ let PrintMethod = "printU32ImmOperand";
+}
+
+def u16imm : Operand<i16> {
+ let PrintMethod = "printU16ImmOperand";
+}
+
+def u8imm : Operand<i8> {
+ let PrintMethod = "printU8ImmOperand";
+}
+
+} // End OperandType = "OPERAND_IMMEDIATE"
+
+//===--------------------------------------------------------------------===//
+// Custom Operands
+//===--------------------------------------------------------------------===//
+def brtarget : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for floating-point comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_OEQ : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
+>;
+
+def COND_ONE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}]
+>;
+
+def COND_OGT : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
+>;
+
+def COND_OGE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}]
+>;
+
+def COND_OLT : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}]
+>;
+
+def COND_OLE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
+>;
+
+
+def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
+def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for unsigned / unordered comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>;
+def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>;
+def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
+def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
+def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
+def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+
+// XXX - For some reason R600 version is preferring to use unordered
+// for setne?
+def COND_UNE_NE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
+>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for signed comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>;
+def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>;
+def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>;
+def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for integer equality
+//===----------------------------------------------------------------------===//
+
+def COND_EQ : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}]
+>;
+
+def COND_NE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}]
+>;
+
+def COND_NULL : PatLeaf <
+ (cond),
+ [{(void)N; return false;}]
+>;
+
+
+//===----------------------------------------------------------------------===//
+// Misc. PatFrags
+//===----------------------------------------------------------------------===//
+
+class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1),
+ (op $src0, $src1),
+ [{ return N->hasOneUse(); }]
+>;
+
+class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1, node:$src2),
+ (op $src0, $src1, $src2),
+ [{ return N->hasOneUse(); }]
+>;
+
+//===----------------------------------------------------------------------===//
+// Load/Store Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+}]>;
+
+class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+class PrivateStore <SDPatternOperator op> : PrivateMemOp <
+ (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def load_private : PrivateLoad <load>;
+
+def truncstorei8_private : PrivateStore <truncstorei8>;
+def truncstorei16_private : PrivateStore <truncstorei16>;
+def store_private : PrivateStore <store>;
+
+class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}]>;
+
+// Global address space loads
+class GlobalLoad <SDPatternOperator op> : GlobalMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+def global_load : GlobalLoad <load>;
+
+// Global address space stores
+class GlobalStore <SDPatternOperator op> : GlobalMemOp <
+ (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def global_store : GlobalStore <store>;
+def global_store_atomic : GlobalStore<atomic_store>;
+
+
+class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+}]>;
+
+// Constant address space loads
+class ConstantLoad <SDPatternOperator op> : ConstantMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+def constant_load : ConstantLoad<load>;
+
+class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+// Local address space loads
+class LocalLoad <SDPatternOperator op> : LocalMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+class LocalStore <SDPatternOperator op> : LocalMemOp <
+ (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
+}]>; // Matches only memory nodes whose address space is FLAT_ADDRESS.
+
+class FlatLoad <SDPatternOperator op> : FlatMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
+ (ld_node node:$ptr), [{
+ LoadSDNode *L = cast<LoadSDNode>(N);
+ return L->getExtensionType() == ISD::ZEXTLOAD ||
+ L->getExtensionType() == ISD::EXTLOAD;
+}]>;
+
+def az_extload : AZExtLoadBase <unindexedload>;
+
+def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def az_extloadi8_global : GlobalLoad <az_extloadi8>;
+def sextloadi8_global : GlobalLoad <sextloadi8>;
+
+def az_extloadi8_constant : ConstantLoad <az_extloadi8>;
+def sextloadi8_constant : ConstantLoad <sextloadi8>;
+
+def az_extloadi8_local : LocalLoad <az_extloadi8>;
+def sextloadi8_local : LocalLoad <sextloadi8>;
+
+def extloadi8_private : PrivateLoad <az_extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+
+def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def az_extloadi16_global : GlobalLoad <az_extloadi16>;
+def sextloadi16_global : GlobalLoad <sextloadi16>;
+
+def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
+def sextloadi16_constant : ConstantLoad <sextloadi16>;
+
+def az_extloadi16_local : LocalLoad <az_extloadi16>;
+def sextloadi16_local : LocalLoad <sextloadi16>;
+
+def extloadi16_private : PrivateLoad <az_extloadi16>;
+def sextloadi16_private : PrivateLoad <sextloadi16>;
+
+def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def az_extloadi32_global : GlobalLoad <az_extloadi32>;
+
+def az_extloadi32_flat : FlatLoad <az_extloadi32>;
+
+def az_extloadi32_constant : ConstantLoad <az_extloadi32>;
+
+def truncstorei8_global : GlobalStore <truncstorei8>;
+def truncstorei16_global : GlobalStore <truncstorei16>;
+
+def local_store : LocalStore <store>;
+def truncstorei8_local : LocalStore <truncstorei8>;
+def truncstorei16_local : LocalStore <truncstorei16>;
+
+def local_load : LocalLoad <load>;
+
+class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
+}]>;
+
+def local_load_aligned8bytes : Aligned8Bytes <
+ (ops node:$ptr), (local_load node:$ptr)
+>;
+
+def local_store_aligned8bytes : Aligned8Bytes <
+ (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr)
+>;
+
+class local_binary_atomic_op<SDNode atomic_op> :
+ PatFrag<(ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+
+def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
+def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
+def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
+def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>;
+def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>;
+def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>;
+def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>;
+def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>;
+def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>;
+def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>;
+def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
+
+def mskor_global : PatFrag<(ops node:$val, node:$ptr),
+ (AMDGPUstore_mskor node:$val, node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}]>;
+
+multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
+
+ def _32_local : PatFrag <
+ (ops node:$ptr, node:$cmp, node:$swap),
+ (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getMemoryVT() == MVT::i32 &&
+ AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ }]>;
+
+ def _64_local : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$swap),
+ (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getMemoryVT() == MVT::i64 &&
+ AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ }]>;
+}
+
+defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;
+
+multiclass global_binary_atomic_op<SDNode atomic_op> {
+ def "" : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+
+ def _noret : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+
+ def _ret : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+}
+
+defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
+defm atomic_add_global : global_binary_atomic_op<atomic_load_add>;
+defm atomic_and_global : global_binary_atomic_op<atomic_load_and>;
+defm atomic_max_global : global_binary_atomic_op<atomic_load_max>;
+defm atomic_min_global : global_binary_atomic_op<atomic_load_min>;
+defm atomic_or_global : global_binary_atomic_op<atomic_load_or>;
+defm atomic_sub_global : global_binary_atomic_op<atomic_load_sub>;
+defm atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
+defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
+defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
+
+//legacy
+def AMDGPUatomic_cmp_swap_global : PatFrag<
+ (ops node:$ptr, node:$value),
+ (AMDGPUatomic_cmp_swap node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+// atomic_cmp_swap restricted to AMDGPUAS::GLOBAL_ADDRESS, whatever the uses of its result.
+def atomic_cmp_swap_global : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$value),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+// Same address-space check; additionally result value 0 must have no uses.
+def atomic_cmp_swap_global_noret : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$value),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+// Same address-space check; additionally result value 0 must have at least one use.
+def atomic_cmp_swap_global_ret : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$value),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+
+//===----------------------------------------------------------------------===//
+// Misc Pattern Fragments
+//===----------------------------------------------------------------------===//
+// Raw bit patterns of common floating-point constants, reachable as CONST.<name>.
+class Constants {
+int TWO_PI = 0x40c90fdb; // 2*pi as an IEEE single
+int PI = 0x40490fdb; // pi as an IEEE single
+int TWO_PI_INV = 0x3e22f983; // 1/(2*pi) as an IEEE single
+int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
+int FP16_ONE = 0x3C00; // 1.0 as an IEEE half
+int FP32_ONE = 0x3f800000; // 1.0 as an IEEE single
+int FP32_NEG_ONE = 0xbf800000; // -1.0 as an IEEE single
+int FP64_ONE = 0x3ff0000000000000; // 1.0 as an IEEE double
+int FP64_NEG_ONE = 0xbff0000000000000; // -1.0 as an IEEE double
+}
+def CONST : Constants;
+// Floating-point immediate leaves: value isZero(), exactly 1.0, and exactly 0.5.
+def FP_ZERO : PatLeaf <
+ (fpimm),
+ [{return N->getValueAPF().isZero();}]
+>;
+
+def FP_ONE : PatLeaf <
+ (fpimm),
+ [{return N->isExactlyValue(1.0);}]
+>;
+
+def FP_HALF : PatLeaf <
+ (fpimm),
+ [{return N->isExactlyValue(0.5);}]
+>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in {
+// Everything in this scope is flagged as a codegen-only pseudo instruction.
+let usesCustomInserter = 1 in {
+// f32 AMDGPUclamp whose bounds are the immediates FP_ZERO and FP_ONE.
+class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
+ (outs rc:$dst),
+ (ins rc:$src0),
+ "CLAMP $dst, $src0",
+ [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+>;
+// f32 fabs over the given register class.
+class FABS <RegisterClass rc> : AMDGPUShaderInst <
+ (outs rc:$dst),
+ (ins rc:$src0),
+ "FABS $dst, $src0",
+ [(set f32:$dst, (fabs f32:$src0))]
+>;
+// f32 fneg over the given register class.
+class FNEG <RegisterClass rc> : AMDGPUShaderInst <
+ (outs rc:$dst),
+ (ins rc:$src0),
+ "FNEG $dst, $src0",
+ [(set f32:$dst, (fneg f32:$src0))]
+>;
+
+} // usesCustomInserter = 1
+// i32 register load/store pair; $chan is a target immediate that the asm strings do not print.
+multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
+ ComplexPattern addrPat> {
+let UseNamedOperandTable = 1 in {
+
+ def RegisterLoad : AMDGPUShaderInst <
+ (outs dstClass:$dst),
+ (ins addrClass:$addr, i32imm:$chan),
+ "RegisterLoad $dst, $addr",
+ [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))]
+ > {
+ let isRegisterLoad = 1;
+ }
+
+ def RegisterStore : AMDGPUShaderInst <
+ (outs),
+ (ins dstClass:$val, addrClass:$addr, i32imm:$chan),
+ "RegisterStore $val, $addr",
+ [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))]
+ > {
+ let isRegisterStore = 1;
+ }
+} // UseNamedOperandTable = 1
+} // multiclass RegisterLoadStore
+
+} // End isCodeGenOnly = 1, isPseudo = 1
+
+/* Generic helper patterns for intrinsics */
+/* -------------------------------------- */
+// f32 fpow(src0, src1) is selected as exp_ieee(mul(src1, log_ieee(src0))).
+class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul>
+ : Pat <
+ (fpow f32:$src0, f32:$src1),
+ (exp_ieee (mul f32:$src1, (log_ieee f32:$src0)))
+>;
+
+/* Other helper patterns */
+/* --------------------- */
+
+/* Extract element pattern: extractelt at index sub_idx becomes EXTRACT_SUBREG of sub_reg */
+class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
+ SubRegIndex sub_reg>
+ : Pat<
+ (sub_type (extractelt vec_type:$src, sub_idx)),
+ (EXTRACT_SUBREG $src, sub_reg)
+>;
+
+/* Insert element pattern: insertelt at index sub_idx becomes INSERT_SUBREG into sub_reg */
+class Insert_Element <ValueType elem_type, ValueType vec_type,
+ int sub_idx, SubRegIndex sub_reg>
+ : Pat <
+ (insertelt vec_type:$vec, elem_type:$elem, sub_idx),
+ (INSERT_SUBREG $vec, $elem, sub_reg)
+>;
+
+// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
+// can handle COPY instructions.
+// bitconvert pattern: a bitconvert from st to dt within rc yields the source operand unchanged
+class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
+ (dt (bitconvert (st rc:$src0))),
+ (dt rc:$src0)
+>;
+// AMDGPUdwordaddr applied to $addr selects to $addr itself, for the given type and class.
+// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
+// can handle COPY instructions.
+class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
+ (vt (AMDGPUdwordaddr (vt rc:$addr))),
+ (vt rc:$addr)
+>;
+
+// BFI_INT patterns
+// BFI_INT $x, $y, $z selects bitwise: bits of $y where $x is set, bits of $z elsewhere.
+multiclass BFIPatterns <Instruction BFI_INT,
+ Instruction LoadImm32,
+ RegisterClass RC64> {
+ // Definition from ISA doc:
+ // (y & x) | (z & ~x)
+ def : Pat <
+ (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+ (BFI_INT $x, $y, $z)
+ >;
+
+ // SHA-256 Ch function
+ // z ^ (x & (y ^ z))
+ def : Pat <
+ (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+ (BFI_INT $x, $y, $z)
+ >;
+ // f32 copysign: mask 0x7fffffff takes the non-sign bits from $src0, the sign bit from $src1.
+ def : Pat <
+ (fcopysign f32:$src0, f32:$src1),
+ (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
+ >;
+ // f64 copysign: sub0 is taken from $src0 unchanged; the same BFI is applied to the sub1 halves.
+ def : Pat <
+ (f64 (fcopysign f64:$src0, f64:$src1)),
+ (REG_SEQUENCE RC64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (BFI_INT (LoadImm32 (i32 0x7fffffff)),
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
+ >;
+ // f64 magnitude with an f32 sign source: $src1 is fed to the BFI directly.
+ def : Pat <
+ (f64 (fcopysign f64:$src0, f32:$src1)),
+ (REG_SEQUENCE RC64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (BFI_INT (LoadImm32 (i32 0x7fffffff)),
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ $src1), sub1)
+ >;
+}
+
+// SHA-256 Ma patterns
+// Bitwise majority of x, y, z: where x and y differ the result bit is z, otherwise it is y.
+// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
+class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
+ (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+ (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
+// Bitfield extract patterns
+// An immediate whose zero-extended value is accepted by isMask_32.
+def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{
+ return isMask_32(N->getZExtValue());
+}]>;
+// Rewrites an immediate into an i32 target constant holding its population count.
+def IMMPopCount : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N),
+ MVT::i32);
+}]>;
+// (src >> rshift) & mask becomes BFE src, rshift, MOV(popcount(mask)) for masks matching the leaf above.
+class BFEPattern <Instruction BFE, Instruction MOV> : Pat <
+ (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
+ (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
+>;
+
+// rotr pattern: an i32 rotate right is BIT_ALIGN with $src0 in both data operands
+class ROTRPattern <Instruction BIT_ALIGN> : Pat <
+ (rotr i32:$src0, i32:$src1),
+ (BIT_ALIGN $src0, $src0, $src1)
+>;
+// i32 median of three; the inner min/max use the *_oneuse operators supplied by the caller.
+// This matches 16 permutations of
+// max(min(x, y), min(max(x, y), z))
+class IntMed3Pat<Instruction med3Inst,
+ SDPatternOperator max,
+ SDPatternOperator max_oneuse,
+ SDPatternOperator min_oneuse> : Pat<
+ (max (min_oneuse i32:$src0, i32:$src1),
+ (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+ (med3Inst $src0, $src1, $src2)
+>;
+// HasOneUse* wrappers. Only the min/max nodes carry the commutative/associative properties.
+let Properties = [SDNPCommutative, SDNPAssociative] in {
+def smax_oneuse : HasOneUseBinOp<smax>;
+def smin_oneuse : HasOneUseBinOp<smin>;
+def umax_oneuse : HasOneUseBinOp<umax>;
+def umin_oneuse : HasOneUseBinOp<umin>;
+} // Properties = [SDNPCommutative, SDNPAssociative]
+
+def sub_oneuse : HasOneUseBinOp<sub>; // sub is neither commutative nor associative
+def select_oneuse : HasOneUseTernaryOp<select>;
+
+// Special conversion patterns
+// fp_to_sint(ffloor(src + 0.5)); the predicate admits it only when NoNaNsFPMath is set.
+def cvt_rpi_i32_f32 : PatFrag <
+ (ops node:$src),
+ (fp_to_sint (ffloor (fadd $src, FP_HALF))),
+ [{ (void) N; return TM.Options.NoNaNsFPMath; }]
+>;
+// fp_to_sint(ffloor(src)), gated on NoNaNsFPMath in the same way.
+def cvt_flr_i32_f32 : PatFrag <
+ (ops node:$src),
+ (fp_to_sint (ffloor $src)),
+ [{ (void)N; return TM.Options.NoNaNsFPMath; }]
+>;
+// add(AMDGPUmul_i24(src0, src1), src2) selects to Inst src0, src1, src2.
+class IMad24Pat<Instruction Inst> : Pat <
+ (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
+ (Inst $src0, $src1, $src2)
+>;
+// Same shape for the AMDGPUmul_u24 node.
+class UMad24Pat<Instruction Inst> : Pat <
+ (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2),
+ (Inst $src0, $src1, $src2)
+>;
+// fdiv with an FP_ONE numerator selects to RcpInst of the denominator.
+class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
+ (fdiv FP_ONE, vt:$src),
+ (RcpInst $src)
+>;
+// AMDGPUrcp(fsqrt(src)) selects to RsqInst src.
+class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
+ (AMDGPUrcp (fsqrt vt:$src)),
+ (RsqInst $src)
+>;
+
+include "R600Instructions.td"
+include "R700Instructions.td"
+include "EvergreenInstructions.td"
+include "CaymanInstructions.td"
+
+include "SIInstrInfo.td"
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
new file mode 100644
index 000000000000..8e3471bd2083
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -0,0 +1,110 @@
+//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Implementation of the IntrinsicInfo class.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+// Nothing to set up beyond default-constructing the TargetIntrinsicInfo base.
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() = default;
+// Generated intrinsic names; getName() indexes it with IntrID - Intrinsic::num_intrinsics.
+static const char *const IntrinsicNameTable[] = {
+#define GET_INTRINSIC_NAME_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_NAME_TABLE
+};
+// Generated attribute code, kept file-local; presumably the getAttributes() used by getDeclaration().
+namespace {
+#define GET_INTRINSIC_ATTRIBUTES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+} // end anonymous namespace
+
+StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
+                                       ArrayRef<Type *> Tys) const {
+  // IDs below Intrinsic::num_intrinsics lie outside the AMDGPU range: no name.
+  if (IntrID < Intrinsic::num_intrinsics)
+    return StringRef();
+  assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
+         "Invalid intrinsic ID");
+  const unsigned TableIdx = IntrID - Intrinsic::num_intrinsics;
+  return IntrinsicNameTable[TableIdx];
+}
+// Pointer-plus-count overload: wraps the types in an ArrayRef and forwards.
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+                                         unsigned NumTys) const {
+  return getName(IntrID, ArrayRef<Type *>(Tys, NumTys)).str();
+}
+
+FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
+                                           ArrayRef<Type*> Tys) const {
+  // FIXME: Re-use Intrinsic::getType machinery
+  // Only amdgcn_fdiv_fast is described so far; anything else is unreachable.
+  if (ID != AMDGPUIntrinsic::amdgcn_fdiv_fast)
+    llvm_unreachable("unhandled intrinsic");
+
+  // Signature: float (float, float), not variadic.
+  Type *const FloatTy = Type::getFloatTy(Context);
+  Type *const Params[] = {FloatTy, FloatTy};
+  return FunctionType::get(FloatTy, Params, /*isVarArg=*/false);
+}
+
+unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
+                                         unsigned Len) const {
+  const StringRef Name(NameData, Len);
+  if (!Name.startswith("llvm."))
+    return 0; // All intrinsics start with 'llvm.'
+
+  // Look for a name match in our table. If the intrinsic is not overloaded,
+  // require an exact match. If it is overloaded, require a prefix match. The
+  // AMDGPU enum starts at Intrinsic::num_intrinsics.
+  int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name);
+  if (Idx < 0)
+    return 0;
+
+  // A query longer than the table entry can only have matched as a prefix.
+  const bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]);
+  if (IsPrefixMatch != isOverloaded(Idx + 1))
+    return 0;
+  return Intrinsic::num_intrinsics + Idx;
+}
+
+bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { // lookupName() passes table index + 1
+// Overload Table: the generated include supplies the whole body (there is no hand-written return).
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+}
+
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                              ArrayRef<Type *> Tys) const {
+  LLVMContext &Ctx = M->getContext();
+  FunctionType *const FnTy = getType(Ctx, IntrID, Tys);
+  // Find or create the declaration, then attach what getAttributes() returns.
+  auto *Decl = cast<Function>(
+      M->getOrInsertFunction(getName(IntrID, Tys), FnTy));
+  Decl->setAttributes(
+      getAttributes(Ctx, static_cast<AMDGPUIntrinsic::ID>(IntrID)));
+  return Decl;
+}
+// Pointer-plus-count overload: wraps the types in an ArrayRef and forwards.
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                              Type **Tys,
+                                              unsigned NumTys) const {
+  return getDeclaration(M, IntrID, ArrayRef<Type *>(Tys, NumTys));
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
new file mode 100644
index 000000000000..6cb8b9644642
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -0,0 +1,58 @@
+//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
+//
+//===-----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
+
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+namespace llvm {
+class TargetMachine;
+// AMDGPU intrinsic IDs continue the numbering of the generic Intrinsic IDs.
+namespace AMDGPUIntrinsic {
+enum ID {
+ last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, // presumably the first generated ID follows on directly
+#define GET_INTRINSIC_ENUM_VALUES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ENUM_VALUES
+ , num_AMDGPU_intrinsics // one past the last generated enumerator
+};
+
+} // end namespace AMDGPUIntrinsic
+// TargetIntrinsicInfo implementation covering the AMDGPUIntrinsic::ID range.
+class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
+public:
+ AMDGPUIntrinsicInfo();
+ // Table name of an AMDGPU intrinsic; empty for IDs below Intrinsic::num_intrinsics. Tys is not consulted.
+ StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
+ // std::string flavour taking the types as pointer + count; forwards to the overload above.
+ std::string getName(unsigned IntrId, Type **Tys = nullptr,
+ unsigned NumTys = 0) const override;
+ // lookupName yields 0 when nothing matches, otherwise Intrinsic::num_intrinsics + table index.
+ unsigned lookupName(const char *Name, unsigned Len) const override;
+ bool isOverloaded(unsigned IID) const override;
+ Function *getDeclaration(Module *M, unsigned ID,
+ Type **Tys = nullptr,
+ unsigned NumTys = 0) const override;
+ // NOTE(review): both getDeclaration overloads default their type arguments, so a call passing only (M, ID) looks ambiguous; the same holds for getName(ID) — confirm.
+ Function *getDeclaration(Module *M, unsigned ID,
+ ArrayRef<Type *> = None) const;
+ // Function type for an intrinsic ID; the implementation currently handles amdgcn_fdiv_fast only.
+ FunctionType *getType(LLVMContext &Context, unsigned ID,
+ ArrayRef<Type*> Tys = None) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
new file mode 100644
index 000000000000..ceae0b575395
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -0,0 +1,36 @@
+//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines intrinsics that are used by all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+// Target intrinsics declared with TargetPrefix "AMDGPU"; several are marked deprecated below.
+let TargetPrefix = "AMDGPU", isTarget = 1 in {
+ def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ // kill takes one float, kilp takes nothing; both return nothing and list no properties.
+ def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
+ def int_AMDGPU_kilp : Intrinsic<[], [], []>;
+
+ // Deprecated in favor of llvm.amdgcn.sffbh
+ def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ // Deprecated in favor of separate int_amdgcn_cube* intrinsics.
+ def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+
+ // Deprecated in favor of expanded bit operations
+ def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+
+ // Deprecated in favor of llvm.amdgcn.rsq
+ def int_AMDGPU_rsq : Intrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
+ >;
+}
+
+include "SIIntrinsics.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
new file mode 100644
index 000000000000..7d56355074b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -0,0 +1,242 @@
+//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUMCInstLower.h"
+#include "AMDGPUAsmPrinter.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#include "AMDGPUGenMCPseudoLowering.inc"
+
+// Stores references to the MC context, subtarget and asm printer used below.
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st,
+                                     const AsmPrinter &ap)
+    : Ctx(ctx), ST(st), AP(ap) {}
+// Maps SIInstrInfo operand target flags onto MCSymbolRefExpr variant kinds.
+static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
+  switch (MOFlags) {
+  case SIInstrInfo::MO_GOTPCREL:
+    return MCSymbolRefExpr::VK_GOTPCREL;
+  case SIInstrInfo::MO_GOTPCREL32_LO:
+    return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO;
+  case SIInstrInfo::MO_GOTPCREL32_HI:
+    return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI;
+  case SIInstrInfo::MO_REL32_LO:
+    return MCSymbolRefExpr::VK_AMDGPU_REL32_LO;
+  case SIInstrInfo::MO_REL32_HI:
+    return MCSymbolRefExpr::VK_AMDGPU_REL32_HI;
+  default:
+    return MCSymbolRefExpr::VK_None;
+  }
+}
+
+const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
+    const MachineBasicBlock &SrcBB, const MachineOperand &MO) const {
+  const MCExpr *Dest =
+      MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
+  const MCExpr *Base = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);
+
+  assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 &&
+         ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
+
+  // s_getpc_b64 yields the address of the instruction that follows it, so
+  // offsets are measured from SrcBB's start plus that instruction's 4 bytes.
+  const MCConstantExpr *GetPCSize = MCConstantExpr::create(4, Ctx);
+  Base = MCBinaryExpr::createAdd(Base, GetPCSize, Ctx);
+
+  const unsigned Flags = MO.getTargetFlags();
+  if (Flags == AMDGPU::TF_LONG_BRANCH_FORWARD)
+    return MCBinaryExpr::createSub(Dest, Base, Ctx);
+  assert(Flags == AMDGPU::TF_LONG_BRANCH_BACKWARD);
+  return MCBinaryExpr::createSub(Base, Dest, Ctx);
+}
+
+bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
+                                     MCOperand &MCOp) const {
+  // Each handled operand kind fills MCOp and falls out to the shared
+  // "return true"; any other kind is treated as unreachable.
+  switch (MO.getType()) {
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::createImm(MO.getImm());
+    break;
+  case MachineOperand::MO_Register:
+    MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
+    break;
+  case MachineOperand::MO_MachineBasicBlock: {
+    // Non-zero target flags select the long-branch distance expression;
+    // otherwise the operand is a plain reference to the block's symbol.
+    const MCExpr *BlockExpr =
+        MO.getTargetFlags() != 0
+            ? getLongBranchBlockExpr(*MO.getParent()->getParent(), MO)
+            : MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
+    MCOp = MCOperand::createExpr(BlockExpr);
+    break;
+  }
+  case MachineOperand::MO_GlobalAddress: {
+    // Prefixed symbol (with its variant kind) plus the operand's offset.
+    SmallString<128> Name;
+    AP.getNameWithPrefix(Name, MO.getGlobal());
+    const MCExpr *Base = MCSymbolRefExpr::create(
+        Ctx.getOrCreateSymbol(Name), getVariantKind(MO.getTargetFlags()), Ctx);
+    const MCExpr *Offset = MCConstantExpr::create(MO.getOffset(), Ctx);
+    MCOp = MCOperand::createExpr(MCBinaryExpr::createAdd(Base, Offset, Ctx));
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    // Looked up by name and explicitly marked external.
+    MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
+    Sym->setExternal(true);
+    MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx));
+    break;
+  }
+  default:
+    llvm_unreachable("unknown operand type");
+  }
+  return true;
+}
+
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+  const unsigned PseudoOpc = MI->getOpcode();
+  const int RealOpc = ST.getInstrInfo()->pseudoToMCOpcode(PseudoOpc);
+  // -1 means no MC opcode exists: diagnose, then keep lowering regardless.
+  if (RealOpc == -1) {
+    const Function *Fn = MI->getParent()->getParent()->getFunction();
+    Fn->getContext().emitError(
+        "AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
+        "a target-specific version: " + Twine(PseudoOpc));
+  }
+  OutMI.setOpcode(RealOpc);
+
+  for (const MachineOperand &MO : MI->explicit_operands()) {
+    MCOperand Lowered;
+    lowerOperand(MO, Lowered);
+    OutMI.addOperand(Lowered);
+  }
+}
+
+bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                    MCOperand &MCOp) const {
+  // Delegate to a temporary AMDGPUMCInstLower for the current function.
+  const auto &ST = MF->getSubtarget<AMDGPUSubtarget>();
+  return AMDGPUMCInstLower(OutContext, ST, *this).lowerOperand(MO, MCOp);
+}
+/// Emits \p MI to the output streamer, optionally recording disassembly and hex dump lines.
+void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+    return;
+
+  const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+  AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
+  // A failed verification is reported and dumped; emission still continues.
+  StringRef Err;
+  if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
+    LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+    C.emitError("Illegal instruction detected: " + Err);
+    MI->dump();
+  }
+  // A bundle header emits nothing itself; each bundled instruction is emitted.
+  if (MI->isBundle()) {
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+    while (I != MBB->instr_end() && I->isInsideBundle()) {
+      EmitInstruction(&*I);
+      ++I;
+    }
+  } else {
+    // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
+    // terminator instructions and should only be printed as comments.
+    if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      if (isVerbose()) {
+        SmallVector<char, 16> BBStr;
+        raw_svector_ostream Str(BBStr);
+        const MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+        const MCSymbolRefExpr *Expr
+          = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+        Expr->print(Str, MAI);
+        OutStreamer->emitRawComment(" mask branch " + BBStr);
+      }
+      return;
+    }
+
+    if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+      if (isVerbose())
+        OutStreamer->emitRawComment(" return");
+      return;
+    }
+
+    if (MI->getOpcode() == AMDGPU::WAVE_BARRIER) {
+      if (isVerbose())
+        OutStreamer->emitRawComment(" wave barrier");
+      return;
+    }
+
+    MCInst TmpInst;
+    MCInstLowering.lower(MI, TmpInst);
+    EmitToStreamer(*OutStreamer, TmpInst);
+
+    if (STI.dumpCode()) {
+      // Disassemble instruction/operands to text.
+      DisasmLines.resize(DisasmLines.size() + 1);
+      std::string &DisasmLine = DisasmLines.back();
+      raw_string_ostream DisasmStream(DisasmLine);
+
+      AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
+                                    *STI.getInstrInfo(),
+                                    *STI.getRegisterInfo());
+      InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI);
+
+      // Disassemble instruction/operands to hex representation.
+      SmallVector<MCFixup, 4> Fixups;
+      SmallVector<char, 16> CodeBytes;
+      raw_svector_ostream CodeStream(CodeBytes);
+
+      auto &ObjStreamer = static_cast<MCObjectStreamer&>(*OutStreamer);
+      MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
+      InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups,
+                                    MF->getSubtarget<MCSubtargetInfo>());
+      HexLines.resize(HexLines.size() + 1);
+      std::string &HexLine = HexLines.back();
+      raw_string_ostream HexStream(HexLine);
+
+      for (size_t i = 0; i < CodeBytes.size(); i += 4) {
+        // Copy the bytes out: casting the char buffer to unsigned* is UB.
+        unsigned int CodeDWord = 0;
+        const size_t N = std::min<size_t>(sizeof(CodeDWord), CodeBytes.size() - i);
+        std::copy_n(&CodeBytes[i], N, reinterpret_cast<char *>(&CodeDWord));
+        HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord);
+      }
+      DisasmStream.flush();
+      DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size());
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
new file mode 100644
index 000000000000..57d2d85daecd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -0,0 +1,46 @@
+//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+class AsmPrinter;
+class MachineBasicBlock;
+class MachineInstr;
+class MachineOperand;
+class MCContext;
+class MCExpr;
+class MCInst;
+class MCOperand;
+/// Lowers AMDGPU MachineInstrs and their operands to MCInst / MCOperand form.
+class AMDGPUMCInstLower {
+ MCContext &Ctx;
+ const AMDGPUSubtarget &ST;
+ const AsmPrinter &AP;
+ /// Difference expression between MO's target block and SrcBB's start + 4; operand order follows MO's target flags.
+ const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
+ const MachineOperand &MO) const;
+
+public:
+ AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST,
+ const AsmPrinter &AP);
+ /// Translates one operand into \p MCOp; every handled operand kind returns true.
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+ /// \brief Lower a MachineInstr to an MCInst: the mapped opcode plus each explicit operand.
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
new file mode 100644
index 000000000000..40c3327a98db
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -0,0 +1,47 @@
+//===-- AMDGPUMachineFunction.cpp -------------------------------------------=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMachineFunction.h"
+#include "AMDGPUSubtarget.h"
+
+using namespace llvm;
+
+AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
+    : MachineFunctionInfo(), LocalMemoryObjects(), KernArgSize(0),
+      MaxKernArgAlign(0), LDSSize(0), ABIArgOffset(0), IsKernel(false) {
+  // A function counts as a kernel when it uses one of the two kernel
+  // calling conventions.
+  const auto CC = MF.getFunction()->getCallingConv();
+  IsKernel =
+      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
+
+  // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
+  // except reserved size is not correctly aligned.
+}
+
+unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
+                                                  const GlobalValue &GV) {
+  // Claim the map slot first; a failed insert means GV was placed earlier.
+  auto Inserted = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
+  unsigned &Slot = Inserted.first->second;
+  if (!Inserted.second)
+    return Slot;
+  // An explicit alignment wins; zero falls back to the type's ABI alignment.
+  const unsigned Requested = GV.getAlignment();
+  const unsigned Align =
+      Requested != 0 ? Requested : DL.getABITypeAlignment(GV.getValueType());
+
+  /// TODO: We should sort these to minimize wasted space due to alignment
+  /// padding. Currently the padding is decided by the first encountered use
+  /// during lowering.
+  LDSSize = alignTo(LDSSize, Align);
+  Slot = LDSSize;
+  LDSSize += DL.getTypeAllocSize(GV.getValueType());
+  return Slot;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
new file mode 100644
index 000000000000..5d0640b816f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -0,0 +1,77 @@
+//===-- AMDGPUMachineFunction.h -----------------------------------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+/// Per-function AMDGPU bookkeeping: kernel-argument area, LDS usage, kernel flag.
+class AMDGPUMachineFunction : public MachineFunctionInfo {
+  /// Offset handed out to each local-memory global by allocateLDSGlobal,
+  /// keyed by the global itself.
+  SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
+
+  /// Running size of the kernel-argument area.
+  uint64_t KernArgSize;
+  /// Largest alignment requested through allocateKernArg so far.
+  unsigned MaxKernArgAlign;
+
+  /// Number of bytes in the LDS that are being used.
+  unsigned LDSSize;
+
+  // FIXME: This should probably be removed.
+  /// Start of implicit kernel args
+  unsigned ABIArgOffset;
+
+  /// Set once by the constructor.
+  bool IsKernel;
+
+public:
+  /// Builds the per-function state for \p MF.
+  AMDGPUMachineFunction(const MachineFunction &MF);
+
+  /// Places \p Size at the next offset aligned to \p Align (asserted to be a
+  /// power of two) and returns that offset.
+  uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
+    assert(isPowerOf2_32(Align));
+    const uint64_t Start = alignTo(KernArgSize, Align);
+    KernArgSize = Start + Size;
+    if (MaxKernArgAlign < Align)
+      MaxKernArgAlign = Align;
+    return Start;
+  }
+
+  /// Size of the kernel-argument area allocated so far.
+  uint64_t getKernArgSize() const { return KernArgSize; }
+
+  /// Largest kernel-argument alignment seen so far.
+  unsigned getMaxKernArgAlign() const { return MaxKernArgAlign; }
+
+  /// Overwrites the stored implicit-argument offset.
+  void setABIArgOffset(unsigned NewOffset) { ABIArgOffset = NewOffset; }
+
+  /// Stored implicit-argument offset.
+  unsigned getABIArgOffset() const { return ABIArgOffset; }
+
+  /// LDS bytes allocated so far.
+  unsigned getLDSSize() const { return LDSSize; }
+
+  /// Value of the kernel flag.
+  bool isKernel() const { return IsKernel; }
+
+  /// Returns the LDS offset recorded for \p GV, assigning one on the first
+  /// request for that global.
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
+};
+
+}
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
new file mode 100644
index 000000000000..410bd52d9c21
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
@@ -0,0 +1,372 @@
+//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass resolves calls to OpenCL image attribute, image resource ID and
+/// sampler resource ID getter functions.
+///
+/// Image attributes (size and format) are expected to be passed to the kernel
+/// as kernel arguments immediately following the image argument itself,
+/// therefore this pass adds image size and format arguments to the kernel
+/// functions in the module. The kernel functions with image arguments are
+/// re-created using the new signature. The new arguments are added to the
+/// kernel metadata with kernel_arg_type set to "image_size" or "image_format".
+/// Note: this pass may invalidate pointers to functions.
+///
+/// Resource IDs of read-only images, write-only images and samplers are
+/// defined to be their index among the kernel arguments of the same
+/// type and access qualifier.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+// Names of the getter functions whose calls this pass resolves.
+StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size";
+StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format";
+StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id";
+StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id";
+
+// Type strings recorded in the metadata of the implicit image arguments.
+StringRef ImageSizeArgMDType = "__llvm_image_size";
+StringRef ImageFormatArgMDType = "__llvm_image_format";
+
+// Number of per-argument metadata nodes that follow the function operand in
+// each kernel metadata node.
+const unsigned NumKernelArgMDNodes = 5;
+
+StringRef KernelsMDNodeName = "opencl.kernels";
+
+// Expected leading name string of each per-argument metadata node, in order.
+StringRef KernelArgMDNodeNames[NumKernelArgMDNodes] = {
+    "kernel_arg_addr_space",
+    "kernel_arg_access_qual",
+    "kernel_arg_type",
+    "kernel_arg_base_type",
+    "kernel_arg_type_qual"};
+
+using MDVector = SmallVector<Metadata *, 8>;
+
+// One operand list per kernel argument metadata node.
+struct KernelArgMD {
+  MDVector ArgVector[NumKernelArgMDNodes];
+};
+
+} // end anonymous namespace
+
+/// True when \p TypeString names one of the image types handled by this pass
+/// ("image2d_t" or "image3d_t").
+static inline bool
+IsImageType(StringRef TypeString) {
+  if (TypeString == "image2d_t")
+    return true;
+  return TypeString == "image3d_t";
+}
+
+/// True when \p TypeString is exactly "sampler_t".
+static inline bool
+IsSamplerType(StringRef TypeString) {
+  const StringRef SamplerTypeName("sampler_t");
+  return TypeString == SamplerTypeName;
+}
+
+/// Return the function referenced by operand 0 of the kernel metadata node
+/// \p Node, or nullptr when the node does not have the expected layout.
+///
+/// The expected layout is: operand 0 is the function, followed by exactly
+/// NumKernelArgMDNodes metadata nodes. Each of those nodes must start with
+/// the matching name from KernelArgMDNodeNames and carry one further operand
+/// per function argument.
+static Function *
+GetFunctionFromMDNode(MDNode *Node) {
+  if (!Node)
+    return nullptr;
+
+  size_t NumOps = Node->getNumOperands();
+  if (NumOps != NumKernelArgMDNodes + 1)
+    return nullptr;
+
+  auto F = mdconst::dyn_extract<Function>(Node->getOperand(0));
+  if (!F)
+    return nullptr;
+
+  // Sanity checks.
+  size_t ExpectNumArgNodeOps = F->arg_size() + 1;
+  for (size_t i = 0; i < NumKernelArgMDNodes; ++i) {
+    // dyn_cast_or_null yields nullptr for a null operand or one that is not
+    // an MDNode; treat that as malformed metadata instead of dereferencing.
+    MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1));
+    if (!ArgNode || ArgNode->getNumOperands() != ExpectNumArgNodeOps)
+      return nullptr;
+    if (!ArgNode->getOperand(0))
+      return nullptr;
+
+    // FIXME: It should be possible to do image lowering when some metadata
+    // args missing or not in the expected order.
+    MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0));
+    if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i])
+      return nullptr;
+  }
+
+  return F;
+}
+
+/// Read the string stored for argument \p ArgIdx in operand 2 of
+/// \p KernelMDNode. Entry 0 of that node is skipped, hence the +1.
+static StringRef
+AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) {
+  auto *QualList = cast<MDNode>(KernelMDNode->getOperand(2));
+  auto *QualString = cast<MDString>(QualList->getOperand(ArgIdx + 1));
+  return QualString->getString();
+}
+
+/// Read the string stored for argument \p ArgIdx in operand 3 of
+/// \p KernelMDNode. Entry 0 of that node is skipped, hence the +1.
+static StringRef
+ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) {
+  auto *TypeList = cast<MDNode>(KernelMDNode->getOperand(3));
+  auto *TypeString = cast<MDString>(TypeList->getOperand(ArgIdx + 1));
+  return TypeString->getString();
+}
+
+/// Gather operand \p OpIdx from each of the NumKernelArgMDNodes argument
+/// nodes of \p KernelMDNode (its operands 1..NumKernelArgMDNodes), in order.
+static MDVector
+GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) {
+  MDVector Collected;
+  for (unsigned NodeIdx = 1; NodeIdx <= NumKernelArgMDNodes; ++NodeIdx) {
+    auto *ArgList = cast<MDNode>(KernelMDNode->getOperand(NodeIdx));
+    Collected.push_back(ArgList->getOperand(OpIdx));
+  }
+  return Collected;
+}
+
+/// Append element i of \p V to MD.ArgVector[i] for every argument node.
+/// \p V must hold exactly NumKernelArgMDNodes entries.
+static void
+PushArgMD(KernelArgMD &MD, const MDVector &V) {
+  assert(V.size() == NumKernelArgMDNodes);
+  unsigned Slot = 0;
+  while (Slot != NumKernelArgMDNodes) {
+    MD.ArgVector[Slot].push_back(V[Slot]);
+    ++Slot;
+  }
+}
+
+namespace {
+
+class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { // Rewrites the kernels listed in the KernelsMDNodeName named metadata; entry point is runOnModule.
+ static char ID;
+
+ LLVMContext *Context; // Context of the module being processed; set in runOnModule.
+ Type *Int32Type;
+ Type *ImageSizeType; // [3 x i32], built in runOnModule.
+ Type *ImageFormatType; // [2 x i32], built in runOnModule.
+ SmallVector<Instruction *, 4> InstsToErase; // Replaced getter calls, erased in replaceImageAndSamplerUses.
+
+ bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID,
+ Argument &ImageSizeArg,
+ Argument &ImageFormatArg) { // Replaces getter calls that use ImageArg; returns true if any call was replaced.
+ bool Modified = false;
+
+ for (auto &Use : ImageArg.uses()) {
+ auto Inst = dyn_cast<CallInst>(Use.getUser());
+ if (!Inst) {
+ continue;
+ }
+
+ Function *F = Inst->getCalledFunction();
+ if (!F)
+ continue;
+
+ Value *Replacement = nullptr;
+ StringRef Name = F->getName();
+ if (Name.startswith(GetImageResourceIDFunc)) { // Image getters are matched by name prefix.
+ Replacement = ConstantInt::get(Int32Type, ResourceID);
+ } else if (Name.startswith(GetImageSizeFunc)) {
+ Replacement = &ImageSizeArg;
+ } else if (Name.startswith(GetImageFormatFunc)) {
+ Replacement = &ImageFormatArg;
+ } else {
+ continue;
+ }
+
+ Inst->replaceAllUsesWith(Replacement);
+ InstsToErase.push_back(Inst); // Only queued here; the call is erased after all arguments are processed.
+ Modified = true;
+ }
+
+ return Modified;
+ }
+
+ bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) { // Replaces sampler resource ID getter calls with the constant ResourceID.
+ bool Modified = false;
+
+ for (const auto &Use : SamplerArg.uses()) {
+ auto Inst = dyn_cast<CallInst>(Use.getUser());
+ if (!Inst) {
+ continue;
+ }
+
+ Function *F = Inst->getCalledFunction();
+ if (!F)
+ continue;
+
+ Value *Replacement = nullptr;
+ StringRef Name = F->getName();
+ if (Name == GetSamplerResourceIDFunc) { // Exact name match, unlike the prefix match used for image getters.
+ Replacement = ConstantInt::get(Int32Type, ResourceID);
+ } else {
+ continue;
+ }
+
+ Inst->replaceAllUsesWith(Replacement);
+ InstsToErase.push_back(Inst);
+ Modified = true;
+ }
+
+ return Modified;
+ }
+
+ bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) { // Walks the arguments of F, assigns resource IDs and replaces getter calls.
+ uint32_t NumReadOnlyImageArgs = 0; // Resource IDs are counted separately per kind of argument.
+ uint32_t NumWriteOnlyImageArgs = 0;
+ uint32_t NumSamplerArgs = 0;
+
+ bool Modified = false;
+ InstsToErase.clear();
+ for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) {
+ Argument &Arg = *ArgI;
+ StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo());
+
+ // Handle image types.
+ if (IsImageType(Type)) {
+ StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo());
+ uint32_t ResourceID;
+ if (AccessQual == "read_only") {
+ ResourceID = NumReadOnlyImageArgs++;
+ } else if (AccessQual == "write_only") {
+ ResourceID = NumWriteOnlyImageArgs++;
+ } else {
+ llvm_unreachable("Wrong image access qualifier.");
+ }
+
+ Argument &SizeArg = *(++ArgI); // Expects the size and format arguments to directly follow the image (the layout addImplicitArgs creates).
+ Argument &FormatArg = *(++ArgI);
+ Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg);
+
+ // Handle sampler type.
+ } else if (IsSamplerType(Type)) {
+ uint32_t ResourceID = NumSamplerArgs++;
+ Modified |= replaceSamplerUses(Arg, ResourceID);
+ }
+ }
+ for (unsigned i = 0; i < InstsToErase.size(); ++i) {
+ InstsToErase[i]->eraseFromParent();
+ }
+
+ return Modified;
+ }
+
+ std::tuple<Function *, MDNode *>
+ addImplicitArgs(Function *F, MDNode *KernelMDNode) { // Returns (new function, new metadata node), or (nullptr, nullptr) when F has no image arguments.
+ bool Modified = false;
+
+ FunctionType *FT = F->getFunctionType();
+ SmallVector<Type *, 8> ArgTypes;
+
+ // Metadata operands for new MDNode.
+ KernelArgMD NewArgMDs;
+ PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0)); // Operand 0 of each argument node is carried over first.
+
+ // Add implicit arguments to the signature.
+ for (unsigned i = 0; i < FT->getNumParams(); ++i) {
+ ArgTypes.push_back(FT->getParamType(i));
+ MDVector ArgMD = GetArgMD(KernelMDNode, i + 1);
+ PushArgMD(NewArgMDs, ArgMD);
+
+ if (!IsImageType(ArgTypeFromMD(KernelMDNode, i)))
+ continue;
+
+ // Add size implicit argument.
+ ArgTypes.push_back(ImageSizeType);
+ ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType); // Slots 2 and 3 are kernel_arg_type and kernel_arg_base_type; other slots copy the image's metadata.
+ PushArgMD(NewArgMDs, ArgMD);
+
+ // Add format implicit argument.
+ ArgTypes.push_back(ImageFormatType);
+ ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType);
+ PushArgMD(NewArgMDs, ArgMD);
+
+ Modified = true;
+ }
+ if (!Modified) {
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ // Create function with new signature and clone the old body into it.
+ auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false);
+ auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName()); // Added to the module later by transformKernels.
+ ValueToValueMapTy VMap;
+ auto NewFArgIt = NewF->arg_begin();
+ for (auto &Arg: F->args()) {
+ auto ArgName = Arg.getName();
+ NewFArgIt->setName(ArgName);
+ VMap[&Arg] = &(*NewFArgIt++);
+ if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) {
+ (NewFArgIt++)->setName(Twine("__size_") + ArgName);
+ (NewFArgIt++)->setName(Twine("__format_") + ArgName);
+ }
+ }
+ SmallVector<ReturnInst*, 8> Returns;
+ CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns);
+
+ // Build new MDNode.
+ SmallVector<llvm::Metadata *, 6> KernelMDArgs;
+ KernelMDArgs.push_back(ConstantAsMetadata::get(NewF));
+ for (unsigned i = 0; i < NumKernelArgMDNodes; ++i)
+ KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i]));
+ MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs);
+
+ return std::make_tuple(NewF, NewMDNode);
+ }
+
+ bool transformKernels(Module &M) { // Processes every kernel in the named metadata; returns true if anything changed.
+ NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName);
+ if (!KernelsMDNode)
+ return false;
+
+ bool Modified = false;
+ for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) {
+ MDNode *KernelMDNode = KernelsMDNode->getOperand(i);
+ Function *F = GetFunctionFromMDNode(KernelMDNode);
+ if (!F) // Metadata did not have the expected layout; skip this entry.
+ continue;
+
+ Function *NewF;
+ MDNode *NewMDNode;
+ std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode);
+ if (NewF) {
+ // Replace old function and metadata with new ones.
+ F->eraseFromParent();
+ M.getFunctionList().push_back(NewF);
+ M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(),
+ NewF->getAttributes());
+ KernelsMDNode->setOperand(i, NewMDNode);
+
+ F = NewF;
+ KernelMDNode = NewMDNode;
+ Modified = true;
+ }
+
+ Modified |= replaceImageAndSamplerUses(F, KernelMDNode);
+ }
+
+ return Modified;
+ }
+
+ public:
+ AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override { // Caches the context and helper types, then transforms the kernels.
+ Context = &M.getContext();
+ Int32Type = Type::getInt32Ty(M.getContext());
+ ImageSizeType = ArrayType::get(Int32Type, 3);
+ ImageFormatType = ArrayType::get(Int32Type, 2);
+
+ return transformKernels(M);
+ }
+
+ StringRef getPassName() const override {
+ return "AMDGPU OpenCL Image Type Pass";
+ }
+};
+
+char AMDGPUOpenCLImageTypeLoweringPass::ID = 0;
+
+} // end anonymous namespace
+
+/// Factory for the OpenCL image type lowering pass; returns a newly allocated
+/// pass instance.
+ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() {
+  ModulePass *LoweringPass = new AMDGPUOpenCLImageTypeLoweringPass();
+  return LoweringPass;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
new file mode 100644
index 000000000000..947d45b66969
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -0,0 +1,42 @@
+//===-- AMDGPUPTNote.h - AMDGPU ELF PT_NOTE section info --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Enums and constants for AMDGPU PT_NOTE sections.
+///
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
+
+namespace AMDGPU {
+
+namespace PT_NOTE {
+
+const char SectionName[] = ".note"; // Name of the note section.
+
+const char NoteName[] = "AMD"; // Presumably the name field written into each note; confirm against the emitter.
+
+enum NoteType{ // Note type codes; the numeric values are explicit and must not be renumbered.
+ NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
+ NT_AMDGPU_HSA_HSAIL = 2,
+ NT_AMDGPU_HSA_ISA = 3,
+ NT_AMDGPU_HSA_PRODUCER = 4,
+ NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
+ NT_AMDGPU_HSA_EXTENSION = 6,
+ NT_AMDGPU_HSA_RUNTIME_METADATA = 7,
+ NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
+ NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
+};
+} // end namespace PT_NOTE
+} // end namespace AMDGPU
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
new file mode 100644
index 000000000000..baa28de7a770
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -0,0 +1,840 @@
+//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates allocas by either converting them into vectors or
+// by migrating them to local address space.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-promote-alloca"
+
+using namespace llvm;
+
+namespace {
+
+// FIXME: This can create globals so should be a module pass.
+class AMDGPUPromoteAlloca : public FunctionPass { // Function pass that tries to replace entry-block allocas with vectors or LDS globals.
+private:
+ const TargetMachine *TM; // May be null; the pass then does nothing.
+ Module *Mod; // Set in doInitialization.
+ const DataLayout *DL; // Data layout of Mod; set in doInitialization.
+ MDNode *MaxWorkGroupSizeRange; // Range metadata created from (0, 2048) in doInitialization.
+
+ // FIXME: This should be per-kernel.
+ uint32_t LocalMemLimit; // LDS byte budget for the current function; computed in runOnFunction.
+ uint32_t CurrentLocalMemUsage; // LDS bytes accounted for so far in the current function.
+
+ bool IsAMDGCN; // Target triple architecture is amdgcn.
+ bool IsAMDHSA; // Target triple OS is AMDHSA.
+
+ std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); // Emits IR yielding the local size in the Y and Z dimensions.
+ Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); // Emits a workitem ID intrinsic call for dimension N (0, 1 or 2).
+
+ /// BaseAlloca is the alloca root the search started from.
+ /// Val may be that alloca or a recursive user of it.
+ bool collectUsesWithPtrTypes(Value *BaseAlloca,
+ Value *Val,
+ std::vector<Value*> &WorkList) const;
+
+ /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
+ /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
+ /// Returns true if both operands are derived from the same alloca. Val should
+ /// be the same value as one of the input operands of UseInst.
+ bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
+ Instruction *UseInst,
+ int OpIdx0, int OpIdx1) const;
+
+public:
+ static char ID;
+
+ AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
+ FunctionPass(ID),
+ TM(TM_),
+ Mod(nullptr),
+ DL(nullptr),
+ MaxWorkGroupSizeRange(nullptr),
+ LocalMemLimit(0),
+ CurrentLocalMemUsage(0),
+ IsAMDGCN(false),
+ IsAMDHSA(false) { }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
+
+ void handleAlloca(AllocaInst &I); // Attempts promotion of a single alloca; called from runOnFunction.
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace
+
+char AMDGPUPromoteAlloca::ID = 0; // Passed to the FunctionPass constructor and exported as llvm::AMDGPUPromoteAllocaID below.
+
+INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
+
+char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; // Reference to the pass ID for code outside this file.
+
+
+/// Cache per-module state used by the pass: the module, its data layout, the
+/// workitem range metadata and the target triple flags. Always returns false.
+bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
+  // Without a target machine the pass is inert; leave the cached state alone.
+  if (TM == nullptr)
+    return false;
+
+  Mod = &M;
+  DL = &M.getDataLayout();
+
+  const Triple &TargetTriple = TM->getTargetTriple();
+  IsAMDHSA = TargetTriple.getOS() == Triple::AMDHSA;
+  IsAMDGCN = TargetTriple.getArch() == Triple::amdgcn;
+
+  // The maximum workitem id.
+  //
+  // FIXME: Should get as subtarget property. Usually runtime enforced max is
+  // 256.
+  MaxWorkGroupSizeRange =
+      MDBuilder(M.getContext()).createRange(APInt(32, 0), APInt(32, 2048));
+
+  return false;
+}
+
+bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // Computes the LDS budget for F, then tries to promote each alloca in the entry block.
+ if (!TM || skipFunction(F))
+ return false;
+
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ if (!ST.isPromoteAllocaEnabled())
+ return false;
+
+ FunctionType *FTy = F.getFunctionType();
+
+ // If the function has any arguments in the local address space, then it's
+ // possible these arguments require the entire local memory space, so
+ // we cannot use local memory in the pass.
+ for (Type *ParamTy : FTy->params()) {
+ PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+ if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ LocalMemLimit = 0;
+ DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ "local memory disabled.\n");
+ return false; // Returns before the alloca loop, so no promotion of any kind is attempted for F.
+ }
+ }
+
+ LocalMemLimit = ST.getLocalMemorySize();
+ if (LocalMemLimit == 0)
+ return false;
+
+ const DataLayout &DL = Mod->getDataLayout(); // Local reference; shadows the DL pointer member.
+
+ // Check how much local memory is being used by global objects
+ CurrentLocalMemUsage = 0;
+ for (GlobalVariable &GV : Mod->globals()) {
+ if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+
+ for (const User *U : GV.users()) {
+ const Instruction *Use = dyn_cast<Instruction>(U);
+ if (!Use) // Only direct instruction users are considered.
+ continue;
+
+ if (Use->getParent()->getParent() == &F) {
+ unsigned Align = GV.getAlignment();
+ if (Align == 0) // No explicit alignment; fall back to the ABI alignment of the value type.
+ Align = DL.getABITypeAlignment(GV.getValueType());
+
+ // FIXME: Try to account for padding here. The padding is currently
+ // determined from the inverse order of uses in the function. I'm not
+ // sure if the use list order is in any way connected to this, so the
+ // total reported size is likely incorrect.
+ uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+ CurrentLocalMemUsage += AllocSize;
+ break; // Count each global at most once for this function.
+ }
+ }
+ }
+
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+
+ // Restrict local memory usage so that we don't drastically reduce occupancy,
+ // unless it is already significantly reduced.
+
+ // TODO: Have some sort of hint or other heuristics to guess occupancy based
+ // on other factors..
+ unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+ if (OccupancyHint == 0) // Fallback value used when the reported maximum is 0.
+ OccupancyHint = 7;
+
+ // Clamp to max value.
+ OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+ // Check the hint but ignore it if it's obviously wrong from the existing LDS
+ // usage.
+ MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+ // Round up to the next tier of usage.
+ unsigned MaxSizeWithWaveCount
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+
+ // Program is possibly broken by using more local mem than available.
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+ return false;
+
+ LocalMemLimit = MaxSizeWithWaveCount; // Budget that handleAlloca checks new LDS allocations against.
+
+ DEBUG(
+ dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+ << " Rounding size to " << MaxSizeWithWaveCount
+ << " with a maximum occupancy of " << MaxOccupancy << '\n'
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+ << " available for promotion\n"
+ );
+
+ BasicBlock &EntryBB = *F.begin();
+ for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(I);
+
+ ++I; // Advance first; presumably so handleAlloca may change the code around AI without invalidating I (confirm).
+ if (AI)
+ handleAlloca(*AI);
+ }
+
+ return true; // Reported as modified even when no alloca was actually promoted.
+}
+
+std::pair<Value *, Value *>
+AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { // Emits IR at Builder's insertion point and returns the (Y, Z) local size values.
+ if (!IsAMDHSA) { // Non-HSA targets read the sizes with the r600 local size intrinsics.
+ Function *LocalSizeYFn
+ = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
+ Function *LocalSizeZFn
+ = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
+
+ CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
+ CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
+
+ LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+ return std::make_pair(LocalSizeY, LocalSizeZ);
+ }
+
+ // We must read the size out of the dispatch pointer.
+ assert(IsAMDGCN);
+
+ // We are indexing into this struct, and want to extract the workgroup_size_*
+ // fields.
+ //
+ // typedef struct hsa_kernel_dispatch_packet_s {
+ // uint16_t header;
+ // uint16_t setup;
+ // uint16_t workgroup_size_x ;
+ // uint16_t workgroup_size_y;
+ // uint16_t workgroup_size_z;
+ // uint16_t reserved0;
+ // uint32_t grid_size_x ;
+ // uint32_t grid_size_y ;
+ // uint32_t grid_size_z;
+ //
+ // uint32_t private_segment_size;
+ // uint32_t group_segment_size;
+ // uint64_t kernel_object;
+ //
+ // #ifdef HSA_LARGE_MODEL
+ // void *kernarg_address;
+ // #elif defined HSA_LITTLE_ENDIAN
+ // void *kernarg_address;
+ // uint32_t reserved1;
+ // #else
+ // uint32_t reserved1;
+ // void *kernarg_address;
+ // #endif
+ // uint64_t reserved2;
+ // hsa_signal_t completion_signal; // uint64_t wrapper
+ // } hsa_kernel_dispatch_packet_t
+ //
+ Function *DispatchPtrFn
+ = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
+
+ CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
+ DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
+ DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+
+ // Size of the dispatch packet struct.
+ DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
+
+ Type *I32Ty = Type::getInt32Ty(Mod->getContext());
+ Value *CastDispatchPtr = Builder.CreateBitCast(
+ DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); // View the packet as an array of i32 in the constant address space.
+
+ // We could do a single 64-bit load here, but it's likely that the basic
+ // 32-bit and extract sequence is already present, and it is probably easier
+ // to CSE this. The loads should be mergable later anyway.
+ Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1); // i32 at index 1 covers workgroup_size_x and workgroup_size_y in the struct above.
+ LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
+
+ Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2); // i32 at index 2 covers workgroup_size_z and reserved0.
+ LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
+
+ MDNode *MD = llvm::MDNode::get(Mod->getContext(), None); // Empty node used as the invariant.load marker.
+ LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
+ LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
+ LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); // Only LoadZU is returned unmodified, so only it gets the range.
+
+ // Extract y component. Upper half of LoadZU should be zero already.
+ Value *Y = Builder.CreateLShr(LoadXY, 16);
+
+ return std::make_pair(Y, LoadZU);
+}
+
+/// Emit a call to the workitem ID intrinsic for dimension \p N (0 = x, 1 = y,
+/// 2 = z) at \p Builder's insertion point. The amdgcn or r600 intrinsic is
+/// chosen according to IsAMDGCN, and the call is tagged with the workgroup
+/// size range metadata.
+Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+  Intrinsic::ID WorkitemIntr = Intrinsic::ID::not_intrinsic;
+
+  if (N == 0) {
+    WorkitemIntr = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
+                            : Intrinsic::r600_read_tidig_x;
+  } else if (N == 1) {
+    WorkitemIntr = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
+                            : Intrinsic::r600_read_tidig_y;
+  } else if (N == 2) {
+    WorkitemIntr = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
+                            : Intrinsic::r600_read_tidig_z;
+  } else {
+    llvm_unreachable("invalid dimension");
+  }
+
+  Function *IntrDecl = Intrinsic::getDeclaration(Mod, WorkitemIntr);
+  CallInst *IdCall = Builder.CreateCall(IntrDecl);
+  IdCall->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  return IdCall;
+}
+
+/// Build the vector type with the same element type and element count as the
+/// array type \p ArrayTy.
+static VectorType *arrayTypeToVecType(Type *ArrayTy) {
+  Type *ElemTy = ArrayTy->getArrayElementType();
+  const uint64_t NumElems = ArrayTy->getArrayNumElements();
+  return VectorType::get(ElemTy, NumElems);
+}
+
+/// Look up the vector index recorded for \p Ptr in \p GEPIdx. \p Ptr must be
+/// a GetElementPtrInst; returns nullptr when it has no entry in the map.
+static Value *
+calculateVectorIndex(Value *Ptr,
+                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
+  auto *Key = cast<GetElementPtrInst>(Ptr);
+
+  auto Found = GEPIdx.find(Key);
+  if (Found == GEPIdx.end())
+    return nullptr;
+  return Found->second;
+}
+
+/// Return the last index operand of \p GEP when the GEP has exactly three
+/// operands and its first index is the constant zero; otherwise nullptr.
+static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
+  // FIXME we only support simple cases
+  if (GEP->getNumOperands() == 3) {
+    if (auto *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1))) {
+      if (FirstIdx->isZero())
+        return GEP->getOperand(2);
+    }
+  }
+
+  return nullptr;
+}
+
+// Not an instruction handled below to turn into a vector.
+//
+// TODO: Check isTriviallyVectorizable for calls and handle other
+// instructions.
+/// Returns true if \p Inst, a user of the pointer \p User, is an instruction
+/// that tryPromoteAllocaToVector knows how to rewrite.
+///
+/// Loads and stores are only accepted when they access memory through a GEP.
+/// The rewrite looks the index up with calculateVectorIndex, which casts the
+/// pointer operand to GetElementPtrInst, so an access made directly on the
+/// alloca must be rejected here. Volatile accesses are rejected as well,
+/// because the rewrite replaces them with ordinary (non-volatile) vector
+/// loads and stores.
+static bool canVectorizeInst(Instruction *Inst, User *User) {
+  switch (Inst->getOpcode()) {
+  case Instruction::Load: {
+    LoadInst *LI = cast<LoadInst>(Inst);
+    return isa<GetElementPtrInst>(LI->getPointerOperand()) &&
+           !LI->isVolatile();
+  }
+  case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
+    return true;
+  case Instruction::Store: {
+    // Must be the stored pointer operand, not a stored value, and that
+    // pointer must be a GEP.
+    StoreInst *SI = cast<StoreInst>(Inst);
+    return SI->getPointerOperand() == User &&
+           isa<GetElementPtrInst>(User) &&
+           !SI->isVolatile();
+  }
+  default:
+    return false;
+  }
+}
+
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { // Rewrites accesses to a small array alloca as vector operations; returns true when the rewrite was performed.
+ ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
+
+ DEBUG(dbgs() << "Alloca candidate for vectorization\n");
+
+ // FIXME: There is no reason why we can't support larger arrays, we
+ // are just being conservative for now.
+ if (!AllocaTy ||
+ AllocaTy->getElementType()->isVectorTy() ||
+ AllocaTy->getNumElements() > 4 ||
+ AllocaTy->getNumElements() < 2) { // Only arrays of 2 to 4 non-vector elements are handled.
+ DEBUG(dbgs() << " Cannot convert type to vector\n");
+ return false;
+ }
+
+ std::map<GetElementPtrInst*, Value*> GEPVectorIdx; // Maps each GEP on the alloca to the value used as its vector index.
+ std::vector<Value*> WorkList; // Users to rewrite once every use has been checked.
+ for (User *AllocaUser : Alloca->users()) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
+ if (!GEP) {
+ if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
+ return false;
+
+ WorkList.push_back(AllocaUser);
+ continue;
+ }
+
+ Value *Index = GEPToVectorIndex(GEP);
+
+ // If we can't compute a vector index from this GEP, then we can't
+ // promote this alloca to vector.
+ if (!Index) {
+ DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
+ return false;
+ }
+
+ GEPVectorIdx[GEP] = Index;
+ for (User *GEPUser : AllocaUser->users()) {
+ if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
+ return false;
+
+ WorkList.push_back(GEPUser);
+ }
+ }
+
+ VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+
+ DEBUG(dbgs() << " Converting alloca to vector "
+ << *AllocaTy << " -> " << *VectorTy << '\n');
+
+ for (Value *V : WorkList) { // Nothing was modified before this point; all checks above passed.
+ Instruction *Inst = cast<Instruction>(V);
+ IRBuilder<> Builder(Inst);
+ switch (Inst->getOpcode()) {
+ case Instruction::Load: { // Load the whole vector, then extract the addressed element.
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+ Value *Ptr = Inst->getOperand(0);
+ Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); // calculateVectorIndex casts Ptr to GetElementPtrInst.
+
+ Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+ Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+ Inst->replaceAllUsesWith(ExtractElement);
+ Inst->eraseFromParent();
+ break;
+ }
+ case Instruction::Store: { // Load the vector, insert the stored value, and store the vector back.
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+
+ Value *Ptr = Inst->getOperand(1);
+ Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+ Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *NewVecValue = Builder.CreateInsertElement(VecValue,
+ Inst->getOperand(0),
+ Index);
+ Builder.CreateStore(NewVecValue, BitCast);
+ Inst->eraseFromParent();
+ break;
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast: // Casts are left in place unchanged.
+ break;
+
+ default:
+ llvm_unreachable("Inconsistency in instructions promotable to vector");
+ }
+ }
+ return true;
+}
+
+/// True when \p CI is a call to one of the intrinsics listed below; any other
+/// call, including every non-intrinsic call, yields false.
+static bool isCallPromotable(CallInst *CI) {
+  if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(CI)) {
+    switch (Intr->getIntrinsicID()) {
+    case Intrinsic::memcpy:
+    case Intrinsic::memmove:
+    case Intrinsic::memset:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+    case Intrinsic::invariant_group_barrier:
+    case Intrinsic::objectsize:
+      return true;
+    default:
+      break;
+    }
+  }
+
+  return false;
+}
+
+/// \p Val is one of the operands \p OpIdx0 / \p OpIdx1 of \p Inst. Returns
+/// true when the other operand is a null pointer constant or has
+/// \p BaseAlloca as its underlying object, and false otherwise.
+bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
+                                                          Value *Val,
+                                                          Instruction *Inst,
+                                                          int OpIdx0,
+                                                          int OpIdx1) const {
+  // Figure out which operand is the one we might not be promoting.
+  Value *FirstOp = Inst->getOperand(OpIdx0);
+  Value *OtherOp = (FirstOp == Val) ? Inst->getOperand(OpIdx1) : FirstOp;
+
+  if (isa<ConstantPointerNull>(OtherOp))
+    return true;
+
+  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
+  if (!isa<AllocaInst>(OtherObj))
+    return false;
+
+  // TODO: We should be able to replace undefs with the right pointer type.
+
+  // TODO: If we know the other base object is another promotable
+  // alloca, not necessarily this alloca, we can do this. The
+  // important part is both must have the same address space at
+  // the end.
+  if (OtherObj == BaseAlloca)
+    return true;
+
+  DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
+  return false;
+}
+
+bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( // Recursively collects users of Val into WorkList; returns false as soon as an unsupported use is found.
+ Value *BaseAlloca,
+ Value *Val,
+ std::vector<Value*> &WorkList) const {
+
+ for (User *User : Val->users()) {
+ if (is_contained(WorkList, User)) // Already collected.
+ continue;
+
+ if (CallInst *CI = dyn_cast<CallInst>(User)) {
+ if (!isCallPromotable(CI))
+ return false;
+
+ WorkList.push_back(User);
+ continue;
+ }
+
+ Instruction *UseInst = cast<Instruction>(User);
+ if (UseInst->getOpcode() == Instruction::PtrToInt)
+ return false;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
+ if (LI->isVolatile())
+ return false;
+
+ continue; // Non-volatile loads are accepted but not added to WorkList.
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
+ if (SI->isVolatile())
+ return false;
+
+ // Reject if Val is the value being stored rather than the address stored to.
+ if (SI->getPointerOperand() != Val)
+ return false;
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
+ if (RMW->isVolatile())
+ return false;
+ } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
+ if (CAS->isVolatile())
+ return false;
+ }
+
+ // Only accept an icmp if the other compared operand is a null pointer
+ // constant or is derived from the same alloca.
+ if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+ if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
+ return false;
+
+ // May need to rewrite constant operands.
+ WorkList.push_back(ICmp);
+ }
+
+ if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
+ // Don't collect the users of this.
+ WorkList.push_back(User);
+ continue;
+ }
+
+ if (!User->getType()->isPointerTy()) // Users that do not produce a pointer are not recursed into.
+ continue;
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
+ // Be conservative if an address could be computed outside the bounds of
+ // the alloca.
+ if (!GEP->isInBounds())
+ return false;
+ }
+
+ // Only promote a select if we know that the other select operand is from
+ // another pointer that will also be promoted.
+ if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+ if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) // Operands 1 and 2 are the two selected values.
+ return false;
+ }
+
+ // Repeat for phis.
+ if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
+ // TODO: Handle more complex cases. We should be able to replace loops
+ // over arrays.
+ switch (Phi->getNumIncomingValues()) {
+ case 1:
+ break;
+ case 2:
+ if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
+ return false;
+ break;
+ default: // Phis with more than two incoming values are not supported.
+ return false;
+ }
+ }
+
+ WorkList.push_back(User);
+ if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList)) // Recurse into the users of this derived pointer.
+ return false;
+ }
+
+ return true;
+}
+
+/// Promote the alloca \p I out of private memory. tryPromoteAllocaToVector()
+/// is tried first; if it does not succeed, the alloca is replaced by one
+/// element of a new [WorkGroupSize x T] global in AMDGPUAS::LOCAL_ADDRESS,
+/// selected by a linearized work-item ID, and the users gathered by
+/// collectUsesWithPtrTypes() are retyped to the local address space.
+///
+/// The IR is left unchanged for dynamic or array allocas, for shader calling
+/// conventions, when the new global would not fit under LocalMemLimit, and
+/// when some pointer use cannot be converted.
+//
+// FIXME: Should try to pick the most likely to be profitable allocas first.
+void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+  // Array allocations are probably not worth handling, since an allocation of
+  // the array type is the canonical form.
+  if (!I.isStaticAlloca() || I.isArrayAllocation())
+    return;
+
+  IRBuilder<> Builder(&I);
+
+  // First try to replace the alloca with a vector
+  Type *AllocaTy = I.getAllocatedType();
+
+  DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  // Stop here when the vector path reports success; the LDS path below is
+  // only the fallback.
+  if (tryPromoteAllocaToVector(&I)) {
+    DEBUG(dbgs() << " alloca was promoted to a vector.\n");
+    return;
+  }
+
+  const Function &ContainingFunction = *I.getParent()->getParent();
+
+  // Don't promote the alloca to LDS for shader calling conventions as the work
+  // item ID intrinsics are not supported for these calling conventions.
+  // Furthermore not all LDS is available for some of the stages.
+  if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
+    return;
+
+  const AMDGPUSubtarget &ST =
+    TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
+  // FIXME: We should also try to get this value from the reqd_work_group_size
+  // function attribute if it is available.
+  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
+
+  const DataLayout &DL = Mod->getDataLayout();
+
+  unsigned Align = I.getAlignment();
+  if (Align == 0)
+    Align = DL.getABITypeAlignment(I.getAllocatedType());
+
+  // FIXME: This computed padding is likely wrong since it depends on inverse
+  // usage order.
+  //
+  // FIXME: It is also possible that if we're allowed to use all of the memory
+  // we could end up using more than the maximum due to alignment padding.
+
+  // Do the accounting in 64 bits. The per-work-item size times the work group
+  // size can exceed 32 bits; truncating the product would let an oversized
+  // alloca wrap around and appear to fit under LocalMemLimit.
+  uint64_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+  uint64_t AllocSize =
+    static_cast<uint64_t>(WorkGroupSize) * DL.getTypeAllocSize(AllocaTy);
+  NewSize += AllocSize;
+
+  if (NewSize > LocalMemLimit) {
+    DEBUG(dbgs() << " " << AllocSize
+                 << " bytes of local memory not available to promote\n");
+    return;
+  }
+
+  // NewSize is bounded by LocalMemLimit at this point.
+  // NOTE: the budget stays charged even if the use check below fails.
+  CurrentLocalMemUsage = static_cast<uint32_t>(NewSize);
+
+  std::vector<Value*> WorkList;
+
+  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
+    DEBUG(dbgs() << " Do not know how to convert all uses\n");
+    return;
+  }
+
+  DEBUG(dbgs() << "Promoting alloca to local memory\n");
+
+  Function *F = I.getParent()->getParent();
+
+  // One element of the alloca'd type per work item of the work group.
+  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
+  GlobalVariable *GV = new GlobalVariable(
+      *Mod, GVTy, false, GlobalValue::InternalLinkage,
+      UndefValue::get(GVTy),
+      Twine(F->getName()) + Twine('.') + I.getName(),
+      nullptr,
+      GlobalVariable::NotThreadLocal,
+      AMDGPUAS::LOCAL_ADDRESS);
+  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  GV->setAlignment(I.getAlignment());
+
+  Value *TCntY, *TCntZ;
+
+  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
+  Value *TIdX = getWorkitemID(Builder, 0);
+  Value *TIdY = getWorkitemID(Builder, 1);
+  Value *TIdZ = getWorkitemID(Builder, 2);
+
+  // TID = (TIdX * TCntY + TIdY) * TCntZ + TIdZ
+  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
+  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
+  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+  TID = Builder.CreateAdd(TID, TIdZ);
+
+  Value *Indices[] = {
+    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
+    TID
+  };
+
+  // Swap the alloca for the address of this work item's array element.
+  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
+  I.mutateType(Offset->getType());
+  I.replaceAllUsesWith(Offset);
+  I.eraseFromParent();
+
+  // Fix up every collected user so its pointer types name the local address
+  // space.
+  for (Value *V : WorkList) {
+    CallInst *Call = dyn_cast<CallInst>(V);
+    if (!Call) {
+      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
+        Value *Src0 = CI->getOperand(0);
+        Type *EltTy = Src0->getType()->getPointerElementType();
+        PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
+        if (isa<ConstantPointerNull>(CI->getOperand(0)))
+          CI->setOperand(0, ConstantPointerNull::get(NewTy));
+
+        if (isa<ConstantPointerNull>(CI->getOperand(1)))
+          CI->setOperand(1, ConstantPointerNull::get(NewTy));
+
+        continue;
+      }
+
+      // The operand's value should be corrected on its own and we don't want to
+      // touch the users.
+      if (isa<AddrSpaceCastInst>(V))
+        continue;
+
+      Type *EltTy = V->getType()->getPointerElementType();
+      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
+      // FIXME: It doesn't really make sense to try to do this for all
+      // instructions.
+      V->mutateType(NewTy);
+
+      // Adjust the types of any constant operands.
+      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+        if (isa<ConstantPointerNull>(SI->getOperand(1)))
+          SI->setOperand(1, ConstantPointerNull::get(NewTy));
+
+        if (isa<ConstantPointerNull>(SI->getOperand(2)))
+          SI->setOperand(2, ConstantPointerNull::get(NewTy));
+      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
+        // Idx rather than I: the parameter I (the erased alloca) must not be
+        // shadowed.
+        for (unsigned Idx = 0, E = Phi->getNumIncomingValues(); Idx != E;
+             ++Idx) {
+          if (isa<ConstantPointerNull>(Phi->getIncomingValue(Idx)))
+            Phi->setIncomingValue(Idx, ConstantPointerNull::get(NewTy));
+        }
+      }
+
+      continue;
+    }
+
+    // Every call on the worklist is treated as an intrinsic; each handled one
+    // is dropped or re-created in front of the original call.
+    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
+    Builder.SetInsertPoint(Intr);
+    switch (Intr->getIntrinsicID()) {
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      // These intrinsics are for address space 0 only
+      Intr->eraseFromParent();
+      continue;
+    case Intrinsic::memcpy: {
+      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
+      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
+                           MemCpy->getLength(), MemCpy->getAlignment(),
+                           MemCpy->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    case Intrinsic::memmove: {
+      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
+      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
+                            MemMove->getLength(), MemMove->getAlignment(),
+                            MemMove->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    case Intrinsic::memset: {
+      MemSetInst *MemSet = cast<MemSetInst>(Intr);
+      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
+                           MemSet->getLength(), MemSet->getAlignment(),
+                           MemSet->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+    case Intrinsic::invariant_group_barrier:
+      Intr->eraseFromParent();
+      // FIXME: I think the invariant marker should still theoretically apply,
+      // but the intrinsics need to be changed to accept pointers with any
+      // address space.
+      continue;
+    case Intrinsic::objectsize: {
+      Value *Src = Intr->getOperand(0);
+      Type *SrcTy = Src->getType()->getPointerElementType();
+      Function *ObjectSize = Intrinsic::getDeclaration(Mod,
+        Intrinsic::objectsize,
+        { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
+      );
+
+      CallInst *NewCall
+        = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
+      Intr->replaceAllUsesWith(NewCall);
+      Intr->eraseFromParent();
+      continue;
+    }
+    default:
+      Intr->dump();
+      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
+    }
+  }
+}
+
+/// Factory for the pass: returns a newly allocated AMDGPUPromoteAlloca
+/// constructed from \p TM.
+FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
+  FunctionPass *Pass = new AMDGPUPromoteAlloca(TM);
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
new file mode 100644
index 000000000000..941f2d8a468a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -0,0 +1,52 @@
+//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+// Forwards 0 to the TableGen-generated base class constructor.
+// NOTE(review): presumably 0 is the return-address register argument - confirm
+// against the generated AMDGPUGenRegisterInfo constructor.
+AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
+
+//===----------------------------------------------------------------------===//
+// Function handling callbacks - Functions are a seldom used feature of GPUS, so
+// they are not supported at this time.
+//===----------------------------------------------------------------------===//
+
+// Placeholder list holding only AMDGPU::NoRegister; it exists so that
+// RegisterClassInfo does not crash.
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
+
+/// Returns the address of the placeholder entry above, for every function.
+const MCPhysReg *
+AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *) const {
+  const MCPhysReg *List = &CalleeSavedReg;
+  return List;
+}
+
+/// Always reports AMDGPU::NoRegister; the function argument is not consulted.
+unsigned
+AMDGPURegisterInfo::getFrameRegister(const MachineFunction & /*MF*/) const {
+  const unsigned FrameReg = AMDGPU::NoRegister;
+  return FrameReg;
+}
+
+/// Maps \p Channel (0..15) to the matching AMDGPU::subN index. An
+/// out-of-range channel trips the assert in asserts-enabled builds.
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+  static const unsigned ChannelToSubReg[] = {
+    AMDGPU::sub0,  AMDGPU::sub1,  AMDGPU::sub2,  AMDGPU::sub3,
+    AMDGPU::sub4,  AMDGPU::sub5,  AMDGPU::sub6,  AMDGPU::sub7,
+    AMDGPU::sub8,  AMDGPU::sub9,  AMDGPU::sub10, AMDGPU::sub11,
+    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15
+  };
+
+  assert(Channel < array_lengthof(ChannelToSubReg));
+  return ChannelToSubReg[Channel];
+}
+
+#define GET_REGINFO_TARGET_DESC
+#include "AMDGPUGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
new file mode 100644
index 000000000000..ef51aad95dce
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -0,0 +1,43 @@
+//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
+/// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+class TargetInstrInfo;
+
+/// Register-info base shared by the hw codegen targets; extends the
+/// TableGen-generated AMDGPUGenRegisterInfo.
+struct AMDGPURegisterInfo : AMDGPUGenRegisterInfo {
+  AMDGPURegisterInfo();
+
+  // Overrides of the inherited register-info interface.
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  /// \returns the sub reg enum value for the given \p Channel
+  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+  unsigned getSubRegFromChannel(unsigned Channel) const;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
new file mode 100644
index 000000000000..ba0490abee8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
@@ -0,0 +1,25 @@
+//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tablegen register definitions common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+// Everything defined in this block is placed in the "AMDGPU" namespace.
+let Namespace = "AMDGPU" in {
+
+// Defines sub0 .. sub15. Each record passes 32 and Index << 5 (Index * 32) to
+// SubRegIndex.
+// NOTE(review): presumably these are the size and bit offset - confirm against
+// the SubRegIndex class in Target.td.
+foreach Index = 0-15 in {
+ def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
+}
+
+def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
+
+}
+
+include "R600RegisterInfo.td"
+include "SIRegisterInfo.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
new file mode 100644
index 000000000000..ecd2ac72bf1b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
@@ -0,0 +1,193 @@
+//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Enums and structure types used by runtime metadata.
+///
+/// Runtime requests certain information (metadata) about kernels to be able
+/// to execute the kernels and answer the queries about the kernels.
+/// The metadata is represented as a note element in the .note ELF section of a
+/// binary (code object). The desc field of the note element is a YAML string
+/// consisting of key-value pairs. Each key is a string. Each value can be
+/// an integer, a string, or a YAML sequence. There are 3 levels of YAML maps.
+/// At the beginning of the YAML string is the module level YAML map. A
+/// kernel-level YAML map is in the amd.Kernels sequence. A
+/// kernel-argument-level map is in the amd.Args sequence.
+///
+/// The format should be kept backward compatible. New enum values and bit
+/// fields should be appended at the end. It is suggested to bump up the
+/// revision number whenever the format changes and document the change
+/// in the revision in this header.
+///
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+
+#include <cstdint>
+#include <vector>
+#include <string>
+
+namespace AMDGPU {
+
+namespace RuntimeMD {
+
+ // Version and revision of runtime metadata
+ const unsigned char MDVersion = 2;
+ const unsigned char MDRevision = 0;
+
+ // Name of keys for runtime metadata.
+ namespace KeyName {
+ const char MDVersion[] = "amd.MDVersion"; // Runtime metadata version
+ const char Language[] = "amd.Language"; // Language
+ const char LanguageVersion[] = "amd.LanguageVersion"; // Language version
+ const char Kernels[] = "amd.Kernels"; // Kernels
+ const char KernelName[] = "amd.KernelName"; // Kernel name
+ const char Args[] = "amd.Args"; // Kernel arguments
+ const char ArgSize[] = "amd.ArgSize"; // Kernel arg size
+ const char ArgAlign[] = "amd.ArgAlign"; // Kernel arg alignment
+ const char ArgTypeName[] = "amd.ArgTypeName"; // Kernel argument type name
+ const char ArgName[] = "amd.ArgName"; // Kernel argument name
+ const char ArgKind[] = "amd.ArgKind"; // Kernel argument kind
+ const char ArgValueType[] = "amd.ArgValueType"; // Kernel argument value type
+ const char ArgAddrQual[] = "amd.ArgAddrQual"; // Kernel argument address qualifier
+ const char ArgAccQual[] = "amd.ArgAccQual"; // Kernel argument access qualifier
+ const char ArgIsConst[] = "amd.ArgIsConst"; // Kernel argument is const qualified
+ const char ArgIsRestrict[] = "amd.ArgIsRestrict"; // Kernel argument is restrict qualified
+ const char ArgIsVolatile[] = "amd.ArgIsVolatile"; // Kernel argument is volatile qualified
+ const char ArgIsPipe[] = "amd.ArgIsPipe"; // Kernel argument is pipe qualified
+ const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; // Required work group size
+ const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; // Work group size hint
+ const char VecTypeHint[] = "amd.VecTypeHint"; // Vector type hint
+ const char KernelIndex[] = "amd.KernelIndex"; // Kernel index for device enqueue
+ const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; // No partial work groups
+ const char PrintfInfo[] = "amd.PrintfInfo"; // Printf function call information
+ const char ArgActualAcc[] = "amd.ArgActualAcc"; // The actual kernel argument access qualifier
+ const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; // Alignment of pointee type
+ }
+
+ namespace KernelArg {
+ // Kind of a kernel argument. The numeric values are spelled out explicitly;
+ // per the file header, new values should only be appended at the end.
+ enum Kind : uint8_t {
+ ByValue = 0,
+ GlobalBuffer = 1,
+ DynamicSharedPointer = 2,
+ Sampler = 3,
+ Image = 4,
+ Pipe = 5,
+ Queue = 6,
+ HiddenGlobalOffsetX = 7,
+ HiddenGlobalOffsetY = 8,
+ HiddenGlobalOffsetZ = 9,
+ HiddenNone = 10,
+ HiddenPrintfBuffer = 11,
+ HiddenDefaultQueue = 12,
+ HiddenCompletionAction = 13,
+ };
+
+ // Value type of a kernel argument.
+ enum ValueType : uint16_t {
+ Struct = 0,
+ I8 = 1,
+ U8 = 2,
+ I16 = 3,
+ U16 = 4,
+ F16 = 5,
+ I32 = 6,
+ U32 = 7,
+ F32 = 8,
+ I64 = 9,
+ U64 = 10,
+ F64 = 11,
+ };
+
+ // Avoid using 'None' since it conflicts with a macro in X11 header file.
+ enum AccessQualifer : uint8_t {
+ AccNone = 0,
+ ReadOnly = 1,
+ WriteOnly = 2,
+ ReadWrite = 3,
+ };
+
+ // Address space qualifier of a kernel argument.
+ enum AddressSpaceQualifer : uint8_t {
+ Private = 0,
+ Global = 1,
+ Constant = 2,
+ Local = 3,
+ Generic = 4,
+ Region = 5,
+ };
+ } // namespace KernelArg
+
+ // Invalid values are used to indicate an optional key should not be emitted.
+ const uint8_t INVALID_ADDR_QUAL = 0xff;
+ const uint8_t INVALID_ACC_QUAL = 0xff;
+ const uint32_t INVALID_KERNEL_INDEX = ~0U;
+
+ namespace KernelArg {
+ // In-memory representation of kernel argument information.
+ struct Metadata {
+ uint32_t Size;
+ uint32_t Align;
+ uint32_t PointeeAlign;
+ uint8_t Kind;
+ uint16_t ValueType;
+ std::string TypeName;
+ std::string Name;
+ uint8_t AddrQual;
+ uint8_t AccQual;
+ uint8_t IsVolatile;
+ uint8_t IsConst;
+ uint8_t IsRestrict;
+ uint8_t IsPipe;
+ // Zero-initializes every numeric field except the two qualifiers, which
+ // start out as INVALID_ADDR_QUAL / INVALID_ACC_QUAL.
+ Metadata() : Size(0), Align(0), PointeeAlign(0), Kind(0), ValueType(0),
+ AddrQual(INVALID_ADDR_QUAL), AccQual(INVALID_ACC_QUAL), IsVolatile(0),
+ IsConst(0), IsRestrict(0), IsPipe(0) {}
+ };
+ }
+
+ namespace Kernel {
+ // In-memory representation of kernel information.
+ struct Metadata {
+ std::string Name;
+ std::string Language;
+ std::vector<uint8_t> LanguageVersion;
+ std::vector<uint32_t> ReqdWorkGroupSize;
+ std::vector<uint32_t> WorkGroupSizeHint;
+ std::string VecTypeHint;
+ uint32_t KernelIndex;
+ uint8_t NoPartialWorkGroups;
+ std::vector<KernelArg::Metadata> Args;
+ // KernelIndex starts out as INVALID_KERNEL_INDEX.
+ Metadata() : KernelIndex(INVALID_KERNEL_INDEX), NoPartialWorkGroups(0) {}
+ };
+ }
+
+ namespace Program {
+ // In-memory representation of program information.
+ struct Metadata {
+ std::vector<uint8_t> MDVersionSeq;
+ std::vector<std::string> PrintfInfo;
+ std::vector<Kernel::Metadata> Kernels;
+
+ explicit Metadata(){}
+
+ // Construct from a YAML string.
+ explicit Metadata(const std::string &YAML);
+
+ // Convert to YAML string.
+ std::string toYAML();
+
+ // Convert from YAML string.
+ static Metadata fromYAML(const std::string &S);
+ };
+ }
+} // namespace RuntimeMD
+} // namespace AMDGPU
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
new file mode 100644
index 000000000000..74851aedbb21
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -0,0 +1,362 @@
+//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-subtarget"
+
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "AMDGPUGenSubtargetInfo.inc"
+
+AMDGPUSubtarget::~AMDGPUSubtarget() = default;
+
+/// Builds the full feature string (built-in defaults, then the HSA extras,
+/// then the caller's \p FS), parses it for \p GPU, and patches up the fields
+/// that still need a default afterwards. Returns *this.
+AMDGPUSubtarget &
+AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
+                                                 StringRef GPU, StringRef FS) {
+  // Determine default and user-specified characteristics
+  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
+  // enabled, but some instructions do not respect them and they run at the
+  // double precision rate, so don't enable by default.
+  //
+  // We want to be able to turn these off, but making this a subtarget feature
+  // for SI has the unhelpful behavior that it unsets everything else if you
+  // disable it.
+  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
+
+  // Turn on FlatForGlobal for HSA.
+  if (isAmdHsaOS())
+    FullFS += "+flat-for-global,+unaligned-buffer-access,";
+
+  // The caller-supplied features go last.
+  FullFS += FS;
+
+  ParseSubtargetFeatures(GPU, FullFS);
+
+  // FIXME: I don't think Evergreen has any useful support for
+  // denormals, but should be checked. Should we issue a warning somewhere
+  // if someone tries to enable these?
+  const bool IsPreSI = getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS;
+  if (IsPreSI)
+    FP16Denormals = FP32Denormals = FP64Denormals = false;
+
+  // Set defaults if needed.
+  if (MaxPrivateElementSize == 0)
+    MaxPrivateElementSize = 4;
+
+  return *this;
+}
+
+// Starts every field at a baseline value: flags false, sizes and counts 0,
+// WavefrontSize 64, IsaVersion ISAVersion0_0_0, and Gen chosen from the
+// triple's architecture (SOUTHERN_ISLANDS for amdgcn, R600 otherwise). It then
+// calls initializeSubtargetDependencies() to apply the feature string.
+AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM)
+ : AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ TargetTriple(TT),
+ Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
+ IsaVersion(ISAVersion0_0_0),
+ WavefrontSize(64),
+ LocalMemorySize(0),
+ LDSBankCount(0),
+ MaxPrivateElementSize(0),
+
+ FastFMAF32(false),
+ HalfRate64Ops(false),
+
+ FP16Denormals(false),
+ FP32Denormals(false),
+ FP64Denormals(false),
+ FPExceptions(false),
+ FlatForGlobal(false),
+ UnalignedScratchAccess(false),
+ UnalignedBufferAccess(false),
+
+ EnableXNACK(false),
+ DebuggerInsertNops(false),
+ DebuggerReserveRegs(false),
+ DebuggerEmitPrologue(false),
+
+ EnableVGPRSpilling(false),
+ EnablePromoteAlloca(false),
+ EnableLoadStoreOpt(false),
+ EnableUnsafeDSOffsetFolding(false),
+ EnableSIScheduler(false),
+ DumpCode(false),
+
+ FP64(false),
+ IsGCN(false),
+ GCN1Encoding(false),
+ GCN3Encoding(false),
+ CIInsts(false),
+ SGPRInitBug(false),
+ HasSMemRealTime(false),
+ Has16BitInsts(false),
+ HasMovrel(false),
+ HasVGPRIndexMode(false),
+ HasScalarStores(false),
+ HasInv2PiInlineImm(false),
+ FlatAddressSpace(false),
+
+ R600ALUInst(false),
+ CaymanISA(false),
+ CFALUBug(false),
+ HasVertexCache(false),
+ TexVTXClauseSize(0),
+ ScalarizeGlobal(false),
+
+ FeatureDisable(false),
+ InstrItins(getInstrItineraryForCPU(GPU)) {
+ // Apply the default and caller-supplied features on top of the baseline.
+ initializeSubtargetDependencies(TT, GPU, FS);
+}
+
+/// Returns the local memory limit associated with \p NWaves waves. Wave
+/// counts outside 2..10 yield getLocalMemorySize().
+//
+// FIXME: These limits are for SI. Did they change with the larger maximum LDS
+// size?
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
+  // Entry Idx holds the limit for Idx + 2 waves.
+  static const unsigned LimitForWaves[] = {
+    8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638
+  };
+
+  if (NWaves < 2 || NWaves > 10)
+    return getLocalMemorySize();
+
+  return LimitForWaves[NWaves - 2];
+}
+
+/// Returns the occupancy (10 down to 1) associated with a local memory
+/// footprint of \p Bytes.
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
+  // Largest footprint that still yields 10, 9, ..., 2, in that order.
+  static const uint32_t MaxBytes[] = {
+    1638, 1820, 2048, 2340, 2730, 3276, 4096, 5461, 8192
+  };
+
+  unsigned Waves = 10;
+  for (uint32_t Limit : MaxBytes) {
+    if (Bytes <= Limit)
+      return Waves;
+    --Waves;
+  }
+
+  // Larger than every table entry; Waves has counted down to 1 here.
+  return Waves;
+}
+
+/// Returns the (minimum, maximum) flat work group size for \p F: the
+/// "amdgpu-flat-work-group-size" request when it is well-formed and within the
+/// subtarget's limits, otherwise the default range.
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
+  const Function &F) const {
+  typedef std::pair<unsigned, unsigned> SizeRange;
+
+  // Default minimum/maximum flat work group sizes.
+  const unsigned WaveSize = getWavefrontSize();
+  SizeRange Default = AMDGPU::isCompute(F.getCallingConv())
+                          ? SizeRange(WaveSize * 2, WaveSize * 4)
+                          : SizeRange(1, WaveSize);
+
+  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
+  // starts using "amdgpu-flat-work-group-size" attribute.
+  Default.second = AMDGPU::getIntegerAttribute(
+    F, "amdgpu-max-work-group-size", Default.second);
+  Default.first = std::min(Default.first, Default.second);
+
+  // Requested minimum/maximum flat work group sizes.
+  SizeRange Requested = AMDGPU::getIntegerPairAttribute(
+    F, "amdgpu-flat-work-group-size", Default);
+
+  // The request is honoured only if its minimum does not exceed its maximum
+  // and both ends respect the subtarget's specifications.
+  const bool IsValid = Requested.first <= Requested.second &&
+                       Requested.first >= getMinFlatWorkGroupSize() &&
+                       Requested.second <= getMaxFlatWorkGroupSize();
+  if (!IsValid)
+    return Default;
+
+  return Requested;
+}
+
+/// Returns the (minimum, maximum) number of waves per execution unit for
+/// \p F: the "amdgpu-waves-per-eu" request when it passes every check below,
+/// otherwise the default pair.
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
+  const Function &F) const {
+  // Default minimum/maximum number of waves per execution unit. A maximum of
+  // 0 is exempt from the minimum <= maximum check further down.
+  std::pair<unsigned, unsigned> Default(1, 0);
+
+  // If minimum/maximum flat work group sizes were explicitly requested using
+  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
+  // number of waves per execution unit to values implied by requested
+  // minimum/maximum flat work group sizes.
+  const unsigned MinImpliedByFlatWorkGroupSize =
+    getMaxWavesPerEU(getFlatWorkGroupSizes(F).second);
+
+  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
+  // starts using "amdgpu-flat-work-group-size" attribute.
+  const bool RequestedFlatWorkGroupSize =
+    F.hasFnAttribute("amdgpu-max-work-group-size") ||
+    F.hasFnAttribute("amdgpu-flat-work-group-size");
+  if (RequestedFlatWorkGroupSize)
+    Default.first = MinImpliedByFlatWorkGroupSize;
+
+  // Requested minimum/maximum number of waves per execution unit.
+  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
+    F, "amdgpu-waves-per-eu", Default, true);
+
+  const bool IsRejected =
+    // Requested minimum must not exceed a non-zero requested maximum.
+    (Requested.second && Requested.first > Requested.second) ||
+    // Requested values must not violate the subtarget's specifications.
+    Requested.first < getMinWavesPerEU() ||
+    Requested.first > getMaxWavesPerEU() ||
+    Requested.second > getMaxWavesPerEU() ||
+    // Requested values must be compatible with the values implied by the
+    // requested minimum/maximum flat work group sizes.
+    (RequestedFlatWorkGroupSize &&
+     Requested.first > MinImpliedByFlatWorkGroupSize);
+
+  if (IsRejected)
+    return Default;
+
+  return Requested;
+}
+
+// Constructs the common AMDGPUSubtarget base and then the R600 members that
+// take this subtarget as an argument. The frame lowering is created with
+// StackGrowsUp, getStackAlignment() and 0.
+R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM) :
+ AMDGPUSubtarget(TT, GPU, FS, TM),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ TLInfo(TM, *this) {}
+
+// Constructs the common AMDGPUSubtarget base and then the SI members that
+// take this subtarget as an argument. The frame lowering is created with
+// StackGrowsUp, getStackAlignment() and 0.
+SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM) :
+ AMDGPUSubtarget(TT, GPU, FS, TM),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ TLInfo(TM, *this) {}
+
+/// Adjusts \p Policy for SI: bidirectional scheduling with register pressure
+/// tracking, plus lane mask tracking unless the SI scheduler is enabled.
+/// \p NumRegionInstrs is not consulted.
+void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                      unsigned NumRegionInstrs) const {
+  // Enabling both top down and bottom up scheduling seems to give us less
+  // register spills than just using one of these approaches on its own.
+  Policy.OnlyBottomUp = false;
+  Policy.OnlyTopDown = false;
+
+  // Track register pressure so the scheduler can try to decrease
+  // pressure once register usage is above the threshold defined by
+  // SIRegisterInfo::getRegPressureSetLimit()
+  Policy.ShouldTrackPressure = true;
+
+  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
+  if (enableSIScheduler())
+    return;
+
+  Policy.ShouldTrackLaneMasks = true;
+}
+
+/// VGPR spilling is enabled when EnableVGPRSpilling is set, and otherwise for
+/// every function whose calling convention is not a shader one.
+bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
+  if (EnableVGPRSpilling)
+    return true;
+
+  return !AMDGPU::isShader(F.getCallingConv());
+}
+
+/// Returns \p ExplicitArgBytes unchanged when there are no implicit argument
+/// bytes. Otherwise the explicit bytes are rounded up to the implicit
+/// argument pointer alignment and the implicit bytes are added.
+unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
+  const unsigned ImplicitBytes = getImplicitArgNumBytes();
+  if (ImplicitBytes != 0) {
+    const unsigned Alignment = getAlignmentForImplicitArgPtr();
+    return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+  }
+
+  return ExplicitArgBytes;
+}
+
+/// Returns the occupancy associated with \p SGPRs scalar registers, using the
+/// VOLCANIC_ISLANDS-and-later table or the older one depending on the
+/// generation.
+unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+  // Each table lists the largest SGPR count that still yields 10, 9, 8, ...
+  // in that order. The trailing ~0u sentinel stops the scan and stands for
+  // the final "everything larger" bucket.
+  static const unsigned VILimits[] = { 80, 88, 100, ~0u };
+  static const unsigned SILimits[] = { 48, 56, 64, 72, 80, ~0u };
+
+  const unsigned *Limit =
+    getGeneration() >= SISubtarget::VOLCANIC_ISLANDS ? VILimits : SILimits;
+
+  unsigned Occupancy = 10;
+  while (SGPRs > *Limit) {
+    ++Limit;
+    --Occupancy;
+  }
+
+  return Occupancy;
+}
+
+/// Returns the occupancy (10 down to 1) associated with \p VGPRs vector
+/// registers.
+unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
+  // Largest VGPR count that still yields 10, 9, ..., 2, in that order.
+  static const unsigned MaxVGPRs[] = {
+    24, 28, 32, 36, 40, 48, 64, 84, 128
+  };
+
+  unsigned Occupancy = 10;
+  for (unsigned Limit : MaxVGPRs) {
+    if (VGPRs <= Limit)
+      return Occupancy;
+    --Occupancy;
+  }
+
+  // More than every table entry; Occupancy has counted down to 1 here.
+  return Occupancy;
+}
+
+/// Returns the fixed SGPR count when the subtarget has the SGPR init bug;
+/// otherwise 102 for VOLCANIC_ISLANDS and later, 104 for earlier generations.
+unsigned SISubtarget::getMaxNumSGPRs() const {
+  if (hasSGPRInitBug())
+    return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+  return getGeneration() >= VOLCANIC_ISLANDS ? 102 : 104;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
new file mode 100644
index 000000000000..51ba501bddd1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -0,0 +1,607 @@
+//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
+
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+#include "R600ISelLowering.h"
+#include "R600FrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIISelLowering.h"
+#include "SIFrameLowering.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AMDGPUGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class StringRef;
+
+class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+public:
+ enum Generation {
+ R600 = 0,
+ R700,
+ EVERGREEN,
+ NORTHERN_ISLANDS,
+ SOUTHERN_ISLANDS,
+ SEA_ISLANDS,
+ VOLCANIC_ISLANDS,
+ };
+
+ enum {
+ ISAVersion0_0_0,
+ ISAVersion7_0_0,
+ ISAVersion7_0_1,
+ ISAVersion7_0_2,
+ ISAVersion8_0_0,
+ ISAVersion8_0_1,
+ ISAVersion8_0_2,
+ ISAVersion8_0_3,
+ ISAVersion8_0_4,
+ ISAVersion8_1_0,
+ };
+
+protected:
+ // Basic subtarget description.
+ Triple TargetTriple;
+ Generation Gen;
+ unsigned IsaVersion;
+ unsigned WavefrontSize;
+ int LocalMemorySize;
+ int LDSBankCount;
+ unsigned MaxPrivateElementSize;
+
+ // Possibly statically set by tablegen, but may want to be overridden.
+ bool FastFMAF32;
+ bool HalfRate64Ops;
+
+ // Dynamically set bits that enable features.
+ bool FP16Denormals;
+ bool FP32Denormals;
+ bool FP64Denormals;
+ bool FPExceptions;
+ bool FlatForGlobal;
+ bool UnalignedScratchAccess;
+ bool UnalignedBufferAccess;
+ bool EnableXNACK;
+ bool DebuggerInsertNops;
+ bool DebuggerReserveRegs;
+ bool DebuggerEmitPrologue;
+
+ // Used as options.
+ bool EnableVGPRSpilling;
+ bool EnablePromoteAlloca;
+ bool EnableLoadStoreOpt;
+ bool EnableUnsafeDSOffsetFolding;
+ bool EnableSIScheduler;
+ bool DumpCode;
+
+ // Static subtarget properties set by tablegen.
+ bool FP64;
+ bool IsGCN;
+ bool GCN1Encoding;
+ bool GCN3Encoding;
+ bool CIInsts;
+ bool SGPRInitBug;
+ bool HasSMemRealTime;
+ bool Has16BitInsts;
+ bool HasMovrel;
+ bool HasVGPRIndexMode;
+ bool HasScalarStores;
+ bool HasInv2PiInlineImm;
+ bool FlatAddressSpace;
+ bool R600ALUInst;
+ bool CaymanISA;
+ bool CFALUBug;
+ bool HasVertexCache;
+ short TexVTXClauseSize;
+ bool ScalarizeGlobal;
+
+ // Dummy feature to use for assembler in tablegen.
+ bool FeatureDisable;
+
+ InstrItineraryData InstrItins;
+ SelectionDAGTargetInfo TSInfo;
+
+public:
+ AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM);
+ ~AMDGPUSubtarget() override;
+
+ AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU, StringRef FS);
+
+ const AMDGPUInstrInfo *getInstrInfo() const override = 0;
+ const AMDGPUFrameLowering *getFrameLowering() const override = 0;
+ const AMDGPUTargetLowering *getTargetLowering() const override = 0;
+ const AMDGPURegisterInfo *getRegisterInfo() const override = 0;
+
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ bool isAmdHsaOS() const {
+ return TargetTriple.getOS() == Triple::AMDHSA;
+ }
+
+ bool isMesa3DOS() const {
+ return TargetTriple.getOS() == Triple::Mesa3D;
+ }
+
+ bool isOpenCLEnv() const {
+ return TargetTriple.getEnvironment() == Triple::OpenCL;
+ }
+
+ Generation getGeneration() const {
+ return Gen;
+ }
+
+ unsigned getWavefrontSize() const {
+ return WavefrontSize;
+ }
+
+ int getLocalMemorySize() const {
+ return LocalMemorySize;
+ }
+
+ int getLDSBankCount() const {
+ return LDSBankCount;
+ }
+
+ unsigned getMaxPrivateElementSize() const {
+ return MaxPrivateElementSize;
+ }
+
+ bool has16BitInsts() const {
+ return Has16BitInsts;
+ }
+
+ bool hasHWFP64() const {
+ return FP64;
+ }
+
+ bool hasFastFMAF32() const {
+ return FastFMAF32;
+ }
+
+ bool hasHalfRate64Ops() const {
+ return HalfRate64Ops;
+ }
+
+ bool hasAddr64() const {
+ return (getGeneration() < VOLCANIC_ISLANDS);
+ }
+
+ bool hasBFE() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBFI() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBFM() const {
+ return hasBFE();
+ }
+
+ bool hasBCNT(unsigned Size) const {
+ if (Size == 32)
+ return (getGeneration() >= EVERGREEN);
+
+ if (Size == 64)
+ return (getGeneration() >= SOUTHERN_ISLANDS);
+
+ return false;
+ }
+
+ bool hasMulU24() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasMulI24() const {
+ return (getGeneration() >= SOUTHERN_ISLANDS ||
+ hasCaymanISA());
+ }
+
+ bool hasFFBL() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasFFBH() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCARRY() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBORROW() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCaymanISA() const {
+ return CaymanISA;
+ }
+
+ bool isPromoteAllocaEnabled() const {
+ return EnablePromoteAlloca;
+ }
+
+ bool unsafeDSOffsetFoldingEnabled() const {
+ return EnableUnsafeDSOffsetFolding;
+ }
+
+ bool dumpCode() const {
+ return DumpCode;
+ }
+
+ bool enableIEEEBit(const MachineFunction &MF) const {
+ return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
+ }
+
+ /// Return the amount of LDS that can be used that will not restrict the
+ /// occupancy lower than WaveCount.
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+
+ /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wave count
+ /// if the given LDS memory size is the only constraint.
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+
+ bool hasFP16Denormals() const {
+ return FP16Denormals;
+ }
+
+ bool hasFP32Denormals() const {
+ return FP32Denormals;
+ }
+
+ bool hasFP64Denormals() const {
+ return FP64Denormals;
+ }
+
+ bool hasFPExceptions() const {
+ return FPExceptions;
+ }
+
+ bool useFlatForGlobal() const {
+ return FlatForGlobal;
+ }
+
+ bool hasUnalignedBufferAccess() const {
+ return UnalignedBufferAccess;
+ }
+
+ bool hasUnalignedScratchAccess() const {
+ return UnalignedScratchAccess;
+ }
+
+ bool isXNACKEnabled() const {
+ return EnableXNACK;
+ }
+
+ bool isAmdCodeObjectV2() const {
+ return isAmdHsaOS() || isMesa3DOS();
+ }
+
+ /// \brief Returns the offset in bytes from the start of the input buffer
+ /// of the first explicit kernel argument.
+ unsigned getExplicitKernelArgOffset() const {
+ return isAmdCodeObjectV2() ? 0 : 36;
+ }
+
+ unsigned getAlignmentForImplicitArgPtr() const {
+ return isAmdHsaOS() ? 8 : 4;
+ }
+
+ unsigned getImplicitArgNumBytes() const {
+ if (isMesa3DOS())
+ return 16;
+ if (isAmdHsaOS() && isOpenCLEnv())
+ return 32;
+ return 0;
+ }
+
+ unsigned getStackAlignment() const {
+ // Scratch is allocated in 256 dword per wave blocks.
+ return 4 * 256 / getWavefrontSize();
+ }
+
+ bool enableMachineScheduler() const override {
+ return true;
+ }
+
+ bool enableSubRegLiveness() const override {
+ return true;
+ }
+
+ /// \returns Number of execution units per compute unit supported by the
+ /// subtarget.
+ unsigned getEUsPerCU() const {
+ return 4;
+ }
+
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given flat work group size.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
+ if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return 8;
+ return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
+ }
+
+ /// \returns Maximum number of waves per compute unit supported by the
+ /// subtarget without any kind of limitation.
+ unsigned getMaxWavesPerCU() const {
+ return getMaxWavesPerEU() * getEUsPerCU();
+ }
+
+ /// \returns Maximum number of waves per compute unit supported by the
+ /// subtarget and limited by given flat work group size.
+ unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
+ return getWavesPerWorkGroup(FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const {
+ return 1;
+ }
+
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget without any kind of limitation.
+ unsigned getMaxWavesPerEU() const {
+ if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return 8;
+ // FIXME: Need to take scratch memory into account.
+ return 10;
+ }
+
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget and limited by given flat work group size.
+ unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
+ return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) /
+ getEUsPerCU();
+ }
+
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const {
+ return 1;
+ }
+
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const {
+ return 2048;
+ }
+
+ /// \returns Number of waves per work group given the flat work group size.
+ unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
+ return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
+ }
+
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+
+ /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
+ /// for function \p F, or minimum/maximum flat work group sizes explicitly
+ /// requested using "amdgpu-flat-work-group-size" attribute attached to
+ /// function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, or violate subtarget's specifications.
+ std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
+
+ /// \returns Subtarget's default pair of minimum/maximum number of waves per
+ /// execution unit for function \p F, or minimum/maximum number of waves per
+ /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
+ /// attached to function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, violate subtarget's specifications, or are not
+ /// compatible with minimum/maximum number of waves limited by flat work group
+ /// size, register usage, and/or lds usage.
+ std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+};
+
+class R600Subtarget final : public AMDGPUSubtarget {
+private:
+ R600InstrInfo InstrInfo;
+ R600FrameLowering FrameLowering;
+ R600TargetLowering TLInfo;
+
+public:
+ R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const R600InstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
+ }
+
+ const R600FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const R600TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const R600RegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ bool hasCFAluBug() const {
+ return CFALUBug;
+ }
+
+ bool hasVertexCache() const {
+ return HasVertexCache;
+ }
+
+ short getTexVTXClauseSize() const {
+ return TexVTXClauseSize;
+ }
+};
+
+class SISubtarget final : public AMDGPUSubtarget {
+public:
+ enum {
+ // The closed Vulkan driver sets 96, which limits the wave count to 8 but
+ // doesn't spill SGPRs as much as when 80 is set.
+ FIXED_SGPR_COUNT_FOR_INIT_BUG = 96
+ };
+
+private:
+ SIInstrInfo InstrInfo;
+ SIFrameLowering FrameLowering;
+ SITargetLowering TLInfo;
+ std::unique_ptr<GISelAccessor> GISel;
+
+public:
+ SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const SIInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
+ }
+
+ const SIFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const SITargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const CallLowering *getCallLowering() const override {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getCallLowering();
+ }
+
+ const SIRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ void setGISelAccessor(GISelAccessor &GISel) {
+ this->GISel.reset(&GISel);
+ }
+
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const override;
+
+ bool isVGPRSpillingEnabled(const Function& F) const;
+
+ unsigned getMaxNumUserSGPRs() const {
+ return 16;
+ }
+
+ bool hasFlatAddressSpace() const {
+ return FlatAddressSpace;
+ }
+
+ bool hasSMemRealTime() const {
+ return HasSMemRealTime;
+ }
+
+ bool hasMovrel() const {
+ return HasMovrel;
+ }
+
+ bool hasVGPRIndexMode() const {
+ return HasVGPRIndexMode;
+ }
+
+ bool hasScalarCompareEq64() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ bool hasScalarStores() const {
+ return HasScalarStores;
+ }
+
+ bool hasInv2PiInlineImm() const {
+ return HasInv2PiInlineImm;
+ }
+
+ bool enableSIScheduler() const {
+ return EnableSIScheduler;
+ }
+
+ bool debuggerSupported() const {
+ return debuggerInsertNops() && debuggerReserveRegs() &&
+ debuggerEmitPrologue();
+ }
+
+ bool debuggerInsertNops() const {
+ return DebuggerInsertNops;
+ }
+
+ bool debuggerReserveRegs() const {
+ return DebuggerReserveRegs;
+ }
+
+ bool debuggerEmitPrologue() const {
+ return DebuggerEmitPrologue;
+ }
+
+ bool loadStoreOptEnabled() const {
+ return EnableLoadStoreOpt;
+ }
+
+ bool hasSGPRInitBug() const {
+ return SGPRInitBug;
+ }
+
+ bool has12DWordStoreHazard() const {
+ return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+ }
+
+ unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
+
+ /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
+ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
+
+ /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
+ unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+
+ /// \returns True if waitcnt instruction is needed before barrier instruction,
+ /// false otherwise.
+ bool needWaitcntBeforeBarrier() const {
+ return true;
+ }
+
+ unsigned getMaxNumSGPRs() const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
new file mode 100644
index 000000000000..d8a0c716279c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -0,0 +1,645 @@
+//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The AMDGPU target machine contains all of the hardware specific
+/// information needed to emit code for R600 and SI GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUTargetObjectFile.h"
+#include "AMDGPUTargetTransformInfo.h"
+#include "GCNSchedStrategy.h"
+#include "R600MachineScheduler.h"
+#include "SIMachineScheduler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <memory>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableR600StructurizeCFG(
+ "r600-ir-structurize",
+ cl::desc("Use StructurizeCFG IR pass"),
+ cl::init(true));
+
+static cl::opt<bool> EnableSROA(
+ "amdgpu-sroa",
+ cl::desc("Run SROA after promote alloca pass"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+static cl::opt<bool> EnableR600IfConvert(
+ "r600-if-convert",
+ cl::desc("Use if conversion pass"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+// Option to disable vectorizer for tests.
+static cl::opt<bool> EnableLoadStoreVectorizer(
+ "amdgpu-load-store-vectorizer",
+ cl::desc("Enable load store vectorizer"),
+ cl::init(true),
+ cl::Hidden);
+
+// Option to control scalarization of global loads.
+static cl::opt<bool> ScalarizeGlobal(
+ "amdgpu-scalarize-global-loads",
+ cl::desc("Enable global load scalarization"),
+ cl::init(false),
+ cl::Hidden);
+
+extern "C" void LLVMInitializeAMDGPUTarget() {
+ // Register the target
+ RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
+ RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
+
+ PassRegistry *PR = PassRegistry::getPassRegistry();
+ initializeSILowerI1CopiesPass(*PR);
+ initializeSIFixSGPRCopiesPass(*PR);
+ initializeSIFoldOperandsPass(*PR);
+ initializeSIShrinkInstructionsPass(*PR);
+ initializeSIFixControlFlowLiveIntervalsPass(*PR);
+ initializeSILoadStoreOptimizerPass(*PR);
+ initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
+ initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPUPromoteAllocaPass(*PR);
+ initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeAMDGPUUnifyMetadataPass(*PR);
+ initializeSIAnnotateControlFlowPass(*PR);
+ initializeSIInsertWaitsPass(*PR);
+ initializeSIWholeQuadModePass(*PR);
+ initializeSILowerControlFlowPass(*PR);
+ initializeSIInsertSkipsPass(*PR);
+ initializeSIDebuggerInsertNopsPass(*PR);
+ initializeSIOptimizeExecMaskingPass(*PR);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ return llvm::make_unique<AMDGPUTargetObjectFile>();
+}
+
+static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
+ return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
+}
+
+static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
+ return new SIScheduleDAGMI(C);
+}
+
+static ScheduleDAGInstrs *
+createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ ScheduleDAGMILive *DAG =
+ new ScheduleDAGMILive(C,
+ llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+}
+
+static MachineSchedRegistry
+R600SchedRegistry("r600", "Run R600's custom scheduler",
+ createR600MachineScheduler);
+
+static MachineSchedRegistry
+SISchedRegistry("si", "Run SI's custom scheduler",
+ createSIMachineScheduler);
+
+static MachineSchedRegistry
+GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
+ "Run GCN scheduler to maximize occupancy",
+ createGCNMaxOccupancyMachineScheduler);
+
+static StringRef computeDataLayout(const Triple &TT) {
+ if (TT.getArch() == Triple::r600) {
+ // 32-bit pointers.
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ }
+
+ // 32-bit private, local, and region pointers. 64-bit global, constant and
+ // flat.
+ return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+}
+
+LLVM_READNONE
+static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
+ if (!GPU.empty())
+ return GPU;
+
+ // HSA only supports CI+, so change the default GPU to a CI for HSA.
+ if (TT.getArch() == Triple::amdgcn)
+ return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
+
+ return "r600";
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ // The AMDGPU toolchain only supports generating shared objects, so we
+ // must always use PIC.
+ return Reloc::PIC_;
+}
+
+AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OptLevel)
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
+ FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
+ TLOF(createTLOF(getTargetTriple())) {
+ initAsmInfo();
+}
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
+
+StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
+ Attribute GPUAttr = F.getFnAttribute("target-cpu");
+ return GPUAttr.hasAttribute(Attribute::None) ?
+ getTargetCPU() : GPUAttr.getValueAsString();
+}
+
+StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ return FSAttr.hasAttribute(Attribute::None) ?
+ getTargetFeatureString() :
+ FSAttr.getValueAsString();
+}
+
+void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
+ PM.add(createAMDGPUUnifyMetadataPass());
+}
+
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
+ setRequiresStructuredCFG(true);
+}
+
+const R600Subtarget *R600TargetMachine::getSubtargetImpl(
+ const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+ }
+
+ return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
+struct SIGISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ const AMDGPUCallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+};
+
+} // end anonymous namespace
+#endif
+
+GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(
+ new AMDGPUCallLowering(*I->getTargetLowering()));
+#endif
+
+ I->setGISelAccessor(*GISel);
+ }
+
+ I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+
+ return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Pass Setup
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AMDGPUPassConfig : public TargetPassConfig {
+public:
+ AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {
+ // Exceptions and StackMaps are not supported, so these passes will never do
+ // anything.
+ disablePass(&StackMapLivenessID);
+ disablePass(&FuncletLayoutID);
+ }
+
+ AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
+ return getTM<AMDGPUTargetMachine>();
+ }
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+ }
+
+ void addEarlyCSEOrGVNPass();
+ void addStraightLineScalarOptimizationPasses();
+ void addIRPasses() override;
+ void addCodeGenPrepare() override;
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ bool addGCPasses() override;
+};
+
+class R600PassConfig final : public AMDGPUPassConfig {
+public:
+ R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
+ : AMDGPUPassConfig(TM, PM) {}
+
+ ScheduleDAGInstrs *createMachineScheduler(
+ MachineSchedContext *C) const override {
+ return createR600MachineScheduler(C);
+ }
+
+ bool addPreISel() override;
+ void addPreRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+
+class GCNPassConfig final : public AMDGPUPassConfig {
+public:
+ GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
+ : AMDGPUPassConfig(TM, PM) {}
+
+ GCNTargetMachine &getGCNTargetMachine() const {
+ return getTM<GCNTargetMachine>();
+ }
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override;
+
+ void addIRPasses() override;
+ bool addPreISel() override;
+ void addMachineSSAOptimization() override;
+ bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
+#endif
+ void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+ void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+ void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+
+} // end anonymous namespace
+
+TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(AMDGPUTTIImpl(this, F));
+ });
+}
+
+void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
+ if (getOptLevel() == CodeGenOpt::Aggressive)
+ addPass(createGVNPass());
+ else
+ addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+ addPass(createSeparateConstOffsetFromGEPPass());
+ addPass(createSpeculativeExecutionPass());
+ // ReassociateGEPs exposes more opportunities for SLSR. See
+ // the example in reassociate-geps-and-slsr.ll.
+ addPass(createStraightLineStrengthReducePass());
+ // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
+ // EarlyCSE can reuse.
+ addEarlyCSEOrGVNPass();
+ // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+ addPass(createNaryReassociatePass());
+ // NaryReassociate on GEPs creates redundant common expressions, so run
+ // EarlyCSE after it.
+ addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addIRPasses() {
+ // There is no reason to run these.
+ disablePass(&StackMapLivenessID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&PatchableFunctionID);
+
+ // Function calls are not supported, so make sure we inline everything.
+ addPass(createAMDGPUAlwaysInlinePass());
+ addPass(createAlwaysInlinerLegacyPass());
+ // We need to add the barrier noop pass, otherwise adding the function
+ // inlining pass will cause all of the PassConfig's passes to be run
+ // one function at a time, which means if we have a module with two
+ // functions, then we will generate code for the first function
+ // without ever running any passes on the second.
+ addPass(createBarrierNoopPass());
+
+ // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
+ addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+ if (TM.getOptLevel() > CodeGenOpt::None) {
+ addPass(createAMDGPUPromoteAlloca(&TM));
+
+ if (EnableSROA)
+ addPass(createSROAPass());
+
+ addStraightLineScalarOptimizationPasses();
+ }
+
+ TargetPassConfig::addIRPasses();
+
+ // EarlyCSE is not always strong enough to clean up what LSR produces. For
+ // example, GVN can combine
+ //
+ // %0 = add %a, %b
+ // %1 = add %b, %a
+ //
+ // and
+ //
+ // %0 = shl nsw %a, 2
+ // %1 = shl %a, 2
+ //
+ // but EarlyCSE can do neither of them.
+ if (getOptLevel() != CodeGenOpt::None)
+ addEarlyCSEOrGVNPass();
+}
+
+void AMDGPUPassConfig::addCodeGenPrepare() {
+ TargetPassConfig::addCodeGenPrepare();
+
+ if (EnableLoadStoreVectorizer)
+ addPass(createLoadStoreVectorizerPass());
+}
+
+bool AMDGPUPassConfig::addPreISel() {
+ addPass(createFlattenCFGPass());
+ return false;
+}
+
+bool AMDGPUPassConfig::addInstSelector() {
+ addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
+ return false;
+}
+
+bool AMDGPUPassConfig::addGCPasses() {
+ // Do nothing. GC is not supported.
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// R600 Pass Setup
+//===----------------------------------------------------------------------===//
+
+bool R600PassConfig::addPreISel() {
+ AMDGPUPassConfig::addPreISel();
+
+ if (EnableR600StructurizeCFG)
+ addPass(createStructurizeCFGPass());
+ return false;
+}
+
+void R600PassConfig::addPreRegAlloc() {
+ addPass(createR600VectorRegMerger(*TM));
+}
+
+void R600PassConfig::addPreSched2() {
+ addPass(createR600EmitClauseMarkers(), false);
+ if (EnableR600IfConvert)
+ addPass(&IfConverterID, false);
+ addPass(createR600ClauseMergePass(*TM), false);
+}
+
+void R600PassConfig::addPreEmitPass() {
+ addPass(createAMDGPUCFGStructurizerPass(), false);
+ addPass(createR600ExpandSpecialInstrsPass(*TM), false);
+ addPass(&FinalizeMachineBundlesID, false);
+ addPass(createR600Packetizer(*TM), false);
+ addPass(createR600ControlFlowFinalizer(*TM), false);
+}
+
+TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new R600PassConfig(this, PM);
+}
+
+//===----------------------------------------------------------------------===//
+// GCN Pass Setup
+//===----------------------------------------------------------------------===//
+
+ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
+    MachineSchedContext *C) const {
+  // The max-occupancy scheduler is used unless the subtarget of the function
+  // being compiled asks for the SI scheduler.
+  const SISubtarget &Subtarget = C->MF->getSubtarget<SISubtarget>();
+  if (!Subtarget.enableSIScheduler())
+    return createGCNMaxOccupancyMachineScheduler(C);
+
+  return createSIMachineScheduler(C);
+}
+
+bool GCNPassConfig::addPreISel() { // GCN pre-ISel: common setup, then annotation/structurization passes in a fixed order.
+ AMDGPUPassConfig::addPreISel(); // Adds the CFG flattening pass.
+
+ // FIXME: We need to run a pass to propagate the attributes when calls are
+ // supported.
+ addPass(&AMDGPUAnnotateKernelFeaturesID);
+ addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ addPass(createSinkingPass());
+ addPass(createSITypeRewriter());
+ addPass(createAMDGPUAnnotateUniformValues());
+ addPass(createSIAnnotateControlFlowPass());
+
+ return false;
+}
+
+void GCNPassConfig::addMachineSSAOptimization() { // Generic machine-SSA optimizations followed by SI-specific cleanups.
+ TargetPassConfig::addMachineSSAOptimization(); // Base-class passes are added first.
+
+ // We want to fold operands after PeepholeOptimizer has run (or as part of
+ // it), because it will eliminate extra copies making it easier to fold the
+ // real source operand. We want to eliminate dead instructions after, so that
+ // we see fewer uses of the copies. We then need to clean up the dead
+ // instructions leftover after the operands are folded as well.
+ //
+ // XXX - Can we get away without running DeadMachineInstructionElim again?
+ addPass(&SIFoldOperandsID);
+ addPass(&DeadMachineInstructionElimID);
+ addPass(&SILoadStoreOptimizerID); // Added last, after folding and dead-instruction cleanup.
+}
+
+void GCNPassConfig::addIRPasses() { // Adds the AMDGPU CodeGenPrepare pass ahead of the common AMDGPU IR passes.
+ // TODO: May want to move later or split into an early and late one.
+ addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
+
+ AMDGPUPassConfig::addIRPasses(); // Common IR passes follow the GCN-specific one.
+}
+
+bool GCNPassConfig::addInstSelector() { // Common selector plus two SI fix-up passes right after selection.
+ AMDGPUPassConfig::addInstSelector(); // Adds the AMDGPU DAG instruction selector.
+ addPass(createSILowerI1CopiesPass());
+ addPass(&SIFixSGPRCopiesID);
+ return false;
+}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool GCNPassConfig::addIRTranslator() { // Only compiled when LLVM_BUILD_GLOBAL_ISEL is defined.
+ addPass(new IRTranslator()); // Uses the generic translator; no AMDGPU subclass here.
+ return false;
+}
+
+bool GCNPassConfig::addLegalizeMachineIR() { // Stub: no legalizer pass is added.
+ return false;
+}
+
+bool GCNPassConfig::addRegBankSelect() { // Stub: no register-bank selection pass is added.
+ return false;
+}
+
+bool GCNPassConfig::addGlobalInstructionSelect() { // Stub: no global instruction-select pass is added.
+ return false;
+}
+#endif
+
+void GCNPassConfig::addPreRegAlloc() { // Two SI passes that run just before register allocation.
+ addPass(createSIShrinkInstructionsPass()); // The same pass is added again in addPreEmitPass().
+ addPass(createSIWholeQuadModePass());
+}
+
+void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { // Fast-regalloc pipeline with SI control-flow lowering hooked after PHI elimination.
+ // FIXME: We have to disable the verifier here because of PHIElimination +
+ // TwoAddressInstructions disabling it.
+
+ // This must be run immediately after phi elimination and before
+ // TwoAddressInstructions, otherwise the processing of the tied operand of
+ // SI_ELSE will introduce a copy of the tied operand source after the else.
+ insertPass(&PHIEliminationID, &SILowerControlFlowID, false); // `false` is the verifier switch the FIXME above refers to.
+
+ TargetPassConfig::addFastRegAlloc(RegAllocPass); // The insertion must be registered before the base pipeline is built.
+}
+
+void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { // Optimized-regalloc pipeline with two SI passes hooked into the generic sequence.
+ // This needs to be run directly before register allocation because earlier
+ // passes might recompute live intervals.
+ insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); // Hooked right after the machine scheduler.
+
+ // This must be run immediately after phi elimination and before
+ // TwoAddressInstructions, otherwise the processing of the tied operand of
+ // SI_ELSE will introduce a copy of the tied operand source after the else.
+ insertPass(&PHIEliminationID, &SILowerControlFlowID, false); // Same insertion as in addFastRegAlloc().
+
+ TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
+}
+
+void GCNPassConfig::addPostRegAlloc() { // Exec-mask optimization runs before the generic post-RA passes.
+ addPass(&SIOptimizeExecMaskingID);
+ TargetPassConfig::addPostRegAlloc();
+}
+
+void GCNPassConfig::addPreSched2() { // Intentionally empty: GCN adds no passes at this point.
+}
+
+void GCNPassConfig::addPreEmitPass() { // Last GCN passes before emission; the sequence below is order-sensitive.
+ // The hazard recognizer that runs as part of the post-ra scheduler does not
+ // guarantee to be able to handle all hazards correctly. This is because if there
+ // are multiple scheduling regions in a basic block, the regions are scheduled
+ // bottom up, so when we begin to schedule a region we don't know what
+ // instructions were emitted directly before it.
+ //
+ // Here we add a stand-alone hazard recognizer pass which can handle all
+ // cases.
+ addPass(&PostRAHazardRecognizerID);
+
+ addPass(createSIInsertWaitsPass());
+ addPass(createSIShrinkInstructionsPass()); // Second run; the first is in addPreRegAlloc().
+ addPass(&SIInsertSkipsPassID);
+ addPass(createSIDebuggerInsertNopsPass());
+ addPass(&BranchRelaxationPassID); // Added last, after every pass above.
+}
+
+TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { // Factory for the GCN pass pipeline configuration.
+ return new GCNPassConfig(this, PM); // Heap-allocated; NOTE(review): caller presumably takes ownership - confirm.
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
new file mode 100644
index 000000000000..9496773a073f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -0,0 +1,103 @@
+//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The AMDGPU TargetMachine interface definition for hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
+
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+#include <memory>
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Target Machine (R600+)
+//===----------------------------------------------------------------------===//
+
+class AMDGPUTargetMachine : public LLVMTargetMachine { // Common base of R600TargetMachine and GCNTargetMachine.
+protected:
+ std::unique_ptr<TargetLoweringObjectFile> TLOF; // Owned here; handed out (non-owning) by getObjFileLowering().
+ AMDGPUIntrinsicInfo IntrinsicInfo; // Handed out by getIntrinsicInfo().
+
+ StringRef getGPUName(const Function &F) const; // Per-function lookups; defined in the .cpp.
+ StringRef getFeatureString(const Function &F) const;
+
+public:
+ AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+ ~AMDGPUTargetMachine() override;
+
+ const AMDGPUSubtarget *getSubtargetImpl() const;
+ const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0; // Pure virtual: each derived target machine supplies its own subtarget.
+
+ const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
+ return &IntrinsicInfo;
+ }
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get(); // Non-owning pointer; TLOF keeps ownership.
+ }
+ void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
+};
+
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+class R600TargetMachine final : public AMDGPUTargetMachine { // Target machine for the R600 family; supplies R600 subtargets and pass config.
+private:
+ mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap; // mutable so the const getSubtargetImpl() can modify it. NOTE(review): presumably a per-function subtarget cache - confirm in the .cpp.
+
+public:
+ R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ const R600Subtarget *getSubtargetImpl(const Function &) const override; // Covariant return: narrows the base AMDGPUSubtarget pointer.
+};
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+class GCNTargetMachine final : public AMDGPUTargetMachine { // Target machine for GCN (SI+); supplies SI subtargets and pass config.
+private:
+ mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap; // mutable so the const getSubtargetImpl() can modify it. NOTE(review): presumably a per-function subtarget cache - confirm in the .cpp.
+
+public:
+ GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ const SISubtarget *getSubtargetImpl(const Function &) const override; // Covariant return: narrows the base AMDGPUSubtarget pointer.
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
new file mode 100644
index 000000000000..1fddc88a705a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -0,0 +1,30 @@
+//===-- AMDGPUTargetObjectFile.cpp - AMDGPU Object Files ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetObjectFile.h"
+#include "AMDGPU.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Generic Object File
+//===----------------------------------------------------------------------===//
+
+MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  // Read-only data in a read-only segment is placed in the text section when
+  // the target triple requests that constants be emitted there. The three
+  // checks are evaluated left to right and short-circuit.
+  const bool EmitToText =
+      Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) &&
+      AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple());
+  if (!EmitToText)
+    return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+
+  return TextSection;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
new file mode 100644
index 000000000000..de327786dff6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -0,0 +1,32 @@
+//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the AMDGPU-specific subclass of
+/// TargetLoweringObjectFile.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { // ELF object-file lowering with an AMDGPU section-selection override.
+ public:
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, // Defined in AMDGPUTargetObjectFile.cpp.
+ const TargetMachine &TM) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
new file mode 100644
index 000000000000..a1a352642242
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -0,0 +1,340 @@
+//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file implements a TargetTransformInfo analysis pass specific to the
+// AMDGPU target machine. It uses the target's detailed information to provide
+// more precise answers to certain TTI queries, while letting the target
+// independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetTransformInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "AMDGPUtti"
+
+
+/// Set the loop-unrolling preferences for loop \p L.
+///
+/// Every loop gets a raised threshold, an unbounded unroll count and partial
+/// unrolling. If any GEP in the loop addresses private memory and is based on
+/// an alloca, the threshold is raised further.
+void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
+                                            TTI::UnrollingPreferences &UP) {
+  UP.Threshold = 300; // Twice the default.
+  UP.MaxCount = UINT_MAX;
+  UP.Partial = true;
+
+  // TODO: Do we want runtime unrolling?
+
+  for (const BasicBlock *BB : L->getBlocks()) {
+    const DataLayout &DL = BB->getModule()->getDataLayout();
+    for (const Instruction &I : *BB) {
+      const auto *GEP = dyn_cast<GetElementPtrInst>(&I);
+      if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+        continue;
+
+      const Value *Ptr = GEP->getPointerOperand();
+      if (!isa<AllocaInst>(GetUnderlyingObject(Ptr, DL)))
+        continue;
+
+      // We want to do whatever we can to limit the number of alloca
+      // instructions that make it through to the code generator. allocas
+      // require us to use indirect addressing, which is slow and prone to
+      // compiler bugs. If this loop does an address calculation on an
+      // alloca ptr, then we want to use a higher than normal loop unroll
+      // threshold. This will give SROA a better chance to eliminate these
+      // allocas.
+      //
+      // Don't use the maximum allowed value here as it will make some
+      // programs way too big.
+      UP.Threshold = 800;
+
+      // The threshold cannot be raised any further, so scanning the rest of
+      // the loop would only repeat the same assignment.
+      return;
+    }
+  }
+}
+
+/// Report the register count: 0 for vector queries, otherwise a count that
+/// depends on the subtarget generation.
+unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
+  if (Vec)
+    return 0;
+
+  const bool IsSIOrLater =
+      ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS;
+
+  // SI and later: number of VGPRs. Older parts: 4 channels of 128 registers
+  // (XXX - should these count as vector instead?).
+  return IsSIOrLater ? 256 : 4 * 128;
+}
+
+/// Report the register width in bits: 0 for vector queries, 32 for scalar.
+unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
+  if (Vector)
+    return 0;
+  return 32;
+}
+
+unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { // Widest load/store in bits, chosen per address space.
+ switch (AddrSpace) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::FLAT_ADDRESS:
+ return 128;
+ case AMDGPUAS::LOCAL_ADDRESS:
+ case AMDGPUAS::REGION_ADDRESS:
+ return 64;
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ return 8 * ST->getMaxPrivateElementSize(); // Multiplied by 8; NOTE(review): presumably converting bytes to bits - confirm.
+ default:
+ if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && // Pre-SI parts accept the param and constant-buffer spaces below.
+ (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
+ AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
+ (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
+ AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space"); // Any other address space is treated as a programming error.
+ }
+}
+
+unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // VF is ignored; the answer is constant.
+ // Semi-arbitrary large amount.
+ return 64;
+}
+
+int AMDGPUTTIImpl::getArithmeticInstrCost( // Cost = per-element rate cost x legalization factor x element count; unhandled cases defer to BaseT.
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) {
+
+ EVT OrigTy = TLI->getValueType(DL, Ty);
+ if (!OrigTy.isSimple()) { // Non-simple value types are left to the generic implementation.
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ }
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); // LT.first scales every cost below; LT.second is the legalized type.
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ // Because we don't have any legal vector operations, but the legal types, we
+ // need to account for split vectors.
+ unsigned NElts = LT.second.isVector() ?
+ LT.second.getVectorNumElements() : 1;
+
+ MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; // Scalar element type drives the per-element cost.
+
+ switch (ISD) {
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA: {
+ if (SLT == MVT::i64)
+ return get64BitInstrCost() * LT.first * NElts;
+
+ // i32
+ return getFullRateInstrCost() * LT.first * NElts;
+ }
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ if (SLT == MVT::i64){
+ // and, or and xor are typically split into 2 VALU instructions.
+ return 2 * getFullRateInstrCost() * LT.first * NElts; // The same 2x cost is applied to 64-bit add and sub.
+ }
+
+ return LT.first * NElts * getFullRateInstrCost();
+ }
+ case ISD::MUL: {
+ const int QuarterRateCost = getQuarterRateInstrCost();
+ if (SLT == MVT::i64) {
+ const int FullRateCost = getFullRateInstrCost();
+ return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts; // 64-bit multiply: four quarter-rate plus four full-rate instruction costs.
+ }
+
+ // i32
+ return QuarterRateCost * NElts * LT.first;
+ }
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ if (SLT == MVT::f64)
+ return LT.first * NElts * get64BitInstrCost();
+
+ if (SLT == MVT::f32 || SLT == MVT::f16)
+ return LT.first * NElts * getFullRateInstrCost();
+ break; // Other FP element types use the generic cost at the end.
+
+ case ISD::FDIV:
+ case ISD::FREM:
+ // FIXME: frem should be handled separately. The fdiv in it is most of it,
+ // but the current lowering is also not entirely correct.
+ if (SLT == MVT::f64) {
+ int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+
+ // Add cost of workaround.
+ if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ Cost += 3 * getFullRateInstrCost();
+
+ return LT.first * Cost * NElts;
+ }
+
+ // Assuming no fp32 denormals lowering.
+ if (SLT == MVT::f32 || SLT == MVT::f16) {
+ assert(!ST->hasFP32Denormals() && "will change when supported");
+ int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+ return LT.first * NElts * Cost;
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, // Fallback for every case that broke out of the switch.
+ Opd1PropInfo, Opd2PropInfo);
+}
+
+/// Cost of a control-flow instruction: branches and returns get a fixed cost
+/// of 10, everything else uses the generic estimate.
+unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
+  // XXX - For some reason this isn't called for switch.
+  const bool IsBranchOrReturn =
+      Opcode == Instruction::Br || Opcode == Instruction::Ret;
+  if (!IsBranchOrReturn)
+    return BaseT::getCFInstrCost(Opcode);
+
+  return 10;
+}
+
+/// Cost of a vector element instruction. Element extracts and inserts with a
+/// known index are free; with an unknown index (~0u) they cost 2. Every other
+/// opcode uses the generic estimate.
+int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                      unsigned Index) {
+  if (Opcode != Instruction::ExtractElement &&
+      Opcode != Instruction::InsertElement)
+    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+
+  // Extracts are just reads of a subregister, so are free. Inserts are
+  // considered free because we don't want to have any cost for scalarizing
+  // operations, and we don't have to copy into a different register class.
+  //
+  // Dynamic indexing isn't free and is best avoided.
+  const bool IsDynamicIndex = (Index == ~0u);
+  if (IsDynamicIndex)
+    return 2;
+  return 0;
+}
+
+/// Decide whether the intrinsic call \p I is treated as a source of
+/// divergence.
+///
+/// Intrinsics with a known ID are checked against the list below. Calls whose
+/// ID is Intrinsic::not_intrinsic are looked up by name through \p TII.
+///
+/// \returns true for the listed intrinsics, false for everything else.
+static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
+                                          const IntrinsicInst *I) {
+  switch (I->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::not_intrinsic:
+    // This means we have an intrinsic that isn't defined in
+    // IntrinsicsAMDGPU.td
+    break;
+
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::amdgcn_interp_mov:
+  case Intrinsic::amdgcn_interp_p1:
+  case Intrinsic::amdgcn_interp_p2:
+  case Intrinsic::amdgcn_mbcnt_hi:
+  case Intrinsic::amdgcn_mbcnt_lo:
+  case Intrinsic::r600_read_tidig_x:
+  case Intrinsic::r600_read_tidig_y:
+  case Intrinsic::r600_read_tidig_z:
+  case Intrinsic::amdgcn_image_atomic_swap:
+  case Intrinsic::amdgcn_image_atomic_add:
+  case Intrinsic::amdgcn_image_atomic_sub:
+  case Intrinsic::amdgcn_image_atomic_smin:
+  case Intrinsic::amdgcn_image_atomic_umin:
+  case Intrinsic::amdgcn_image_atomic_smax:
+  case Intrinsic::amdgcn_image_atomic_umax:
+  case Intrinsic::amdgcn_image_atomic_and:
+  case Intrinsic::amdgcn_image_atomic_or:
+  case Intrinsic::amdgcn_image_atomic_xor:
+  case Intrinsic::amdgcn_image_atomic_inc:
+  case Intrinsic::amdgcn_image_atomic_dec:
+  case Intrinsic::amdgcn_image_atomic_cmpswap:
+  case Intrinsic::amdgcn_buffer_atomic_swap:
+  case Intrinsic::amdgcn_buffer_atomic_add:
+  case Intrinsic::amdgcn_buffer_atomic_sub:
+  case Intrinsic::amdgcn_buffer_atomic_smin:
+  case Intrinsic::amdgcn_buffer_atomic_umin:
+  case Intrinsic::amdgcn_buffer_atomic_smax:
+  case Intrinsic::amdgcn_buffer_atomic_umax:
+  case Intrinsic::amdgcn_buffer_atomic_and:
+  case Intrinsic::amdgcn_buffer_atomic_or:
+  case Intrinsic::amdgcn_buffer_atomic_xor:
+  case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+  case Intrinsic::amdgcn_ps_live:
+    return true;
+  }
+
+  // Only the not_intrinsic case reaches this point. StringRef::data() already
+  // yields a const char *, so no cast from the byte pointer is needed.
+  StringRef Name = I->getCalledFunction()->getName();
+  switch (TII->lookupName(Name.data(), Name.size())) {
+  default:
+    return false;
+  case AMDGPUIntrinsic::SI_fs_interp:
+  case AMDGPUIntrinsic::SI_fs_constant:
+    return true;
+  }
+}
+
+/// \returns true if argument \p A is treated as passed in SGPRs: always for
+/// functions whose calling convention is not a shader one, and for shader
+/// arguments marked inreg or byval.
+static bool isArgPassedInSGPR(const Argument *A) {
+  const Function *Parent = A->getParent();
+
+  // Arguments to compute shaders are never a source of divergence.
+  const bool IsShader = AMDGPU::isShader(Parent->getCallingConv());
+  if (IsShader) {
+    // For non-compute shaders, SGPR inputs are marked with either inreg or
+    // byval; everything else is in VGPRs. Attribute slots are offset by one
+    // from the argument number.
+    const unsigned AttrIdx = A->getArgNo() + 1;
+    return Parent->getAttributes().hasAttribute(AttrIdx, Attribute::InReg) ||
+           Parent->getAttributes().hasAttribute(AttrIdx, Attribute::ByVal);
+  }
+
+  return true;
+}
+
+/// Classify \p V by kind: argument, load, atomic, intrinsic call, other call.
+/// \returns true if the result of the value could potentially be
+/// different across workitems in a wavefront.
+bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
+
+ if (const Argument *A = dyn_cast<Argument>(V))
+ return !isArgPassedInSGPR(A); // Arguments not passed in SGPRs are divergent.
+
+ // Loads from the private address space are divergent, because threads
+ // can execute the load instruction with the same inputs and get different
+ // results.
+ //
+ // All other loads are not divergent, because if threads issue loads with the
+ // same arguments, they will always get the same result.
+ if (const LoadInst *Load = dyn_cast<LoadInst>(V))
+ return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+
+ // Atomics are divergent because they are executed sequentially: when an
+ // atomic operation refers to the same address in each thread, then each
+ // thread after the first sees the value written by the previous thread as
+ // original value.
+ if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
+ return true;
+
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { // Checked before the generic call case below, so intrinsics never reach it.
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+ return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
+ }
+
+ // Assume all function calls are a source of divergence.
+ if (isa<CallInst>(V) || isa<InvokeInst>(V))
+ return true;
+
+ return false; // Every other kind of value is not itself a source of divergence.
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
new file mode 100644
index 000000000000..1177007644ff
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -0,0 +1,98 @@
+//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfo::Concept conforming object specific
+/// to the AMDGPU target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+class AMDGPUTargetLowering;
+
+class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { // AMDGPU cost model; queries not declared here fall back to BasicTTIImplBase.
+ typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT; // Lets the CRTP base call the private getST()/getTLI() accessors.
+
+ const AMDGPUSubtarget *ST; // Subtarget of the function passed to the constructor.
+ const AMDGPUTargetLowering *TLI; // Taken from ST in the constructor.
+
+ const AMDGPUSubtarget *getST() const { return ST; }
+ const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+
+ static inline int getFullRateInstrCost() { // Baseline cost unit: 1x TCC_Basic.
+ return TargetTransformInfo::TCC_Basic;
+ }
+
+ static inline int getHalfRateInstrCost() { // 2x TCC_Basic.
+ return 2 * TargetTransformInfo::TCC_Basic;
+ }
+
+ // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
+ // should be 2 or 4.
+ static inline int getQuarterRateInstrCost() { // Currently 3x TCC_Basic; see the TODO above.
+ return 3 * TargetTransformInfo::TCC_Basic;
+ }
+
+ // On some parts, normal fp64 operations are half rate, and others
+ // quarter. This also applies to some integer operations.
+ inline int get64BitInstrCost() const {
+ return ST->hasHalfRate64Ops() ?
+ getHalfRateInstrCost() : getQuarterRateInstrCost();
+ }
+
+public:
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(TM->getSubtargetImpl(F)), // Per-function subtarget lookup.
+ TLI(ST->getTargetLowering()) {} // Reads ST, so ST must be declared before TLI.
+
+ bool hasBranchDivergence() { return true; }
+
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { // Same answer for every (power-of-two) width.
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ return TTI::PSK_FastHardware;
+ }
+
+ unsigned getNumberOfRegisters(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector);
+ unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+ unsigned getMaxInterleaveFactor(unsigned VF);
+
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+ unsigned getCFInstrCost(unsigned Opcode);
+
+ int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+ bool isSourceOfDivergence(const Value *V) const;
+
+ unsigned getVectorSplitCost() { return 0; } // Splitting vectors is modelled as free.
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
new file mode 100644
index 000000000000..bf501a1e8405
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -0,0 +1,149 @@
+//===-- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief This pass unifies multiple OpenCL metadata due to linking.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+ namespace kOCLMD { // Names of the module-level named metadata nodes this pass unifies.
+ const char SpirVer[] = "opencl.spir.version"; // Unified as a version pair.
+ const char OCLVer[] = "opencl.ocl.version"; // Unified as a version pair.
+ const char UsedExt[] = "opencl.used.extensions"; // This and the rest are unified as de-duplicated lists.
+ const char UsedOptCoreFeat[] = "opencl.used.optional.core.features";
+ const char CompilerOptions[] = "opencl.compiler.options";
+ const char LLVMIdent[] = "llvm.ident";
+ }
+
+  /// \brief Unify multiple OpenCL metadata due to linking.
+  ///
+  /// Each handled named metadata node is replaced by a node with the same
+  /// name holding the unified operands.
+  class AMDGPUUnifyMetadata : public FunctionPass {
+  public:
+    static char ID;
+    explicit AMDGPUUnifyMetadata() : FunctionPass(ID) {}
+
+  private:
+    // This should really be a module pass but we have to run it as early
+    // as possible, so given function passes are executed first and
+    // TargetMachine::addEarlyAsPossiblePasses() expects only function passes
+    // it has to be a function pass.
+    virtual bool runOnModule(Module &M);
+
+    // \todo: Convert to a module pass.
+    bool runOnFunction(Function &F) override;
+
+    /// \brief Unify version metadata.
+    /// \return true if changes are made.
+    /// Assume the named metadata has operands each of which is a pair of
+    /// integer constant, e.g.
+    /// !Name = {!n1, !n2}
+    /// !n1 = {i32 1, i32 2}
+    /// !n2 = {i32 2, i32 0}
+    /// Keep the largest version as the sole operand if PickFirst is false.
+    /// Otherwise pick it from the first value, representing kernel module.
+    bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) {
+      auto NamedMD = M.getNamedMetadata(Name);
+      if (!NamedMD || NamedMD->getNumOperands() <= 1)
+        return false;
+      MDNode *MaxMD = nullptr;
+      // 64-bit to match getZExtValue(), so large values are not truncated.
+      uint64_t MaxVer = 0;
+      for (const auto &VersionMD : NamedMD->operands()) {
+        assert(VersionMD->getNumOperands() == 2);
+        auto CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0));
+        auto VersionMajor = CMajor->getZExtValue();
+        auto CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1));
+        auto VersionMinor = CMinor->getZExtValue();
+        auto Ver = (VersionMajor * 100) + (VersionMinor * 10);
+        // Always accept the first operand: a version of 0.0 never compares
+        // greater than the initial MaxVer, which would leave MaxMD null and
+        // add a null operand below.
+        if (!MaxMD || Ver > MaxVer) {
+          MaxVer = Ver;
+          MaxMD = VersionMD;
+        }
+        if (PickFirst)
+          break;
+      }
+      NamedMD->eraseFromParent();
+      NamedMD = M.getOrInsertNamedMetadata(Name);
+      NamedMD->addOperand(MaxMD);
+      return true;
+    }
+
+    /// \brief Unify extension metadata.
+    /// \return true if changes are made.
+    /// Assume the named metadata has operands each of which is a list e.g.
+    /// !Name = {!n1, !n2}
+    /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}}
+    /// !n2 = !{!"cl_khr_image"}
+    /// Combine it into a single list with unique operands.
+    bool unifyExtensionMD(Module &M, StringRef Name) {
+      auto NamedMD = M.getNamedMetadata(Name);
+      // Nothing to merge with zero or one operand; this matches the check in
+      // unifyVersionMD and avoids rebuilding an empty node and reporting it
+      // as a change.
+      if (!NamedMD || NamedMD->getNumOperands() <= 1)
+        return false;
+
+      // Collect every distinct operand, preserving first-seen order.
+      SmallVector<Metadata *, 4> All;
+      for (const auto &MD : NamedMD->operands())
+        for (const auto &Op : MD->operands())
+          if (std::find(All.begin(), All.end(), Op.get()) == All.end())
+            All.push_back(Op.get());
+
+      NamedMD->eraseFromParent();
+      NamedMD = M.getOrInsertNamedMetadata(Name);
+      for (const auto &MD : All)
+        NamedMD->addOperand(MDNode::get(M.getContext(), MD));
+
+      return true;
+    }
+};
+
+} // end anonymous namespace
+
+char AMDGPUUnifyMetadata::ID = 0; // Definition of the static pass ID passed to FunctionPass in the constructor.
+
+char &llvm::AMDGPUUnifyMetadataID = AMDGPUUnifyMetadata::ID; // Exposes the ID outside the anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata", // Registers the pass under this command-line name.
+ "Unify multiple OpenCL metadata due to linking",
+ false, false)
+
+FunctionPass* llvm::createAMDGPUUnifyMetadataPass() { // Public factory; the pass class itself is file-local.
+ return new AMDGPUUnifyMetadata(); // Heap-allocated; NOTE(review): caller (pass manager) presumably takes ownership - confirm.
+}
+
+/// Unify every handled named metadata node of \p M.
+/// \return true if any node was rewritten.
+bool AMDGPUUnifyMetadata::runOnModule(Module &M) {
+  // Version-style nodes: the first operand is always the one kept.
+  const char *VersionNames[] = {kOCLMD::SpirVer, kOCLMD::OCLVer};
+
+  // List-style nodes: merged through unifyExtensionMD.
+  const char *ListNames[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat,
+                             kOCLMD::CompilerOptions, kOCLMD::LLVMIdent};
+
+  bool Modified = false;
+
+  // The unify call is on the left of || so it runs for every name.
+  for (const char *Name : VersionNames)
+    Modified = unifyVersionMD(M, Name, true) || Modified;
+
+  for (const char *Name : ListNames)
+    Modified = unifyExtensionMD(M, Name) || Modified;
+
+  return Modified;
+}
+
+/// Function-pass entry point: forwards to the module-wide unification using
+/// the module that owns \p F.
+bool AMDGPUUnifyMetadata::runOnFunction(Function &F) {
+  Module &Parent = *F.getParent();
+  return runOnModule(Parent);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
new file mode 100644
index 000000000000..7faeccdc5df3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -0,0 +1,1745 @@
+//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <deque>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "structcfg" // NOTE(review): presumably the tag that selects this file's DEBUG()/STATISTIC() output - confirm.
+
+#define DEFAULT_VEC_SLOTS 8 // Inline element count used for the SmallVectors declared in this file.
+
+// TODO: move-begin.
+
+//===----------------------------------------------------------------------===//
+//
+// Statistics for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+
+STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern "
+ "matched"); // Incremented by serialPatternMatch after each merge.
+STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern "
+ "matched"); // Incremented by ifPatternMatch after each merged if-region.
+STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); // ifPatternMatch adds its per-call clone count here.
+STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); // Updated outside the portion of the file shown here.
+
+namespace llvm {
+ void initializeAMDGPUCFGStructurizerPass(PassRegistry&); // Forward declaration; called from the AMDGPUCFGStructurizer constructor.
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Miscellaneous utility for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+namespace {
+// Debug tracing helpers. Each one expands to a complete DEBUG(...) statement
+// (including its trailing semicolon). Macro arguments are parenthesized so
+// that arbitrary pointer-valued expressions can be passed safely.
+
+// Print a newly created instruction; `i` is a pointer to the instruction.
+#define SHOWNEWINSTR(i) \
+  DEBUG(dbgs() << "New instr: " << *(i) << "\n");
+
+// Print `msg` followed by the number and size of block `b`.
+#define SHOWNEWBLK(b, msg) \
+DEBUG( \
+  dbgs() << (msg) << "BB" << (b)->getNumber() << "size " << (b)->size(); \
+  dbgs() << "\n"; \
+);
+
+// Like SHOWNEWBLK, but also dumps the block contents; a null `b` prints
+// nothing.
+#define SHOWBLK_DETAIL(b, msg) \
+DEBUG( \
+  if (b) { \
+  dbgs() << (msg) << "BB" << (b)->getNumber() << "size " << (b)->size(); \
+  (b)->print(dbgs()); \
+  dbgs() << "\n"; \
+  } \
+);
+
+// SCC number reported for blocks that have no recorded SCC.
+#define INVALIDSCCNUM (-1)
+
+/// Reverse the order of the pointers stored in \p Src, in place.
+template<class NodeT>
+void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
+  // Walk two cursors towards each other, exchanging the elements they
+  // designate; a middle element (odd size) stays where it is.
+  size_t Lo = 0;
+  size_t Hi = Src.size();
+  while (Lo + 1 < Hi) {
+    --Hi;
+    NodeT *Saved = Src[Lo];
+    Src[Lo] = Src[Hi];
+    Src[Hi] = Saved;
+    ++Lo;
+  }
+}
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//
+// supporting data structure for CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+
+namespace {
+
+/// Per-block bookkeeping record kept by the structurizer.
+class BlockInformation {
+public:
+  bool IsRetired = false;     // A new record starts out as not retired.
+  int SccNum = INVALIDSCCNUM; // No SCC number has been recorded yet.
+
+  BlockInformation() = default;
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Machine function pass that runs prepare() followed by run() on a function
+/// of an R600 subtarget, using dominator, post-dominator and loop analyses.
+class AMDGPUCFGStructurizer : public MachineFunctionPass {
+public:
+  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
+  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
+  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;
+
+  /// Result of singlePathTo().
+  enum PathToKind {
+    Not_SinglePath = 0,
+    SinglePath_InPath = 1,
+    SinglePath_NotInPath = 2
+  };
+
+  static char ID;
+
+  // Every raw pointer member starts out null; the real values are assigned
+  // at the top of runOnMachineFunction().
+  AMDGPUCFGStructurizer()
+      : MachineFunctionPass(ID), MDT(nullptr), PDT(nullptr), MLI(nullptr),
+        TII(nullptr), TRI(nullptr), FuncRep(nullptr) {
+    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Control Flow Graph structurizer Pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// Perform the CFG structurization
+  bool run();
+
+  /// Perform the CFG preparation
+  /// This step will remove every unconditional/dead jump instructions and make
+  /// sure all loops have an exit block
+  bool prepare();
+
+  /// Cache the target info and analyses for \p MF, reset the per-function
+  /// block ordering and loop-visited state, then run prepare() and run().
+  /// Always reports the function as modified.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
+    TRI = &TII->getRegisterInfo();
+    DEBUG(MF.dump(););
+    OrderedBlks.clear();
+    Visited.clear();
+    FuncRep = &MF;
+    MLI = &getAnalysis<MachineLoopInfo>();
+    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+    MDT = &getAnalysis<MachineDominatorTree>();
+    DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
+    PDT = &getAnalysis<MachinePostDominatorTree>();
+    DEBUG(PDT->print(dbgs()););
+    prepare();
+    run();
+    DEBUG(MF.dump(););
+    return true;
+  }
+
+protected:
+  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *PDT;
+  MachineLoopInfo *MLI;
+  const R600InstrInfo *TII;
+  const R600RegisterInfo *TRI;
+
+  // PRINT FUNCTIONS
+  /// Print the ordered Blocks.
+  /// Each entry is printed as "BB<number>(<scc>,<size>)"; a newline replaces
+  /// the separating space after every entry whose index is a non-zero
+  /// multiple of 10.
+  void printOrderedBlocks() const {
+    size_t i = 0;
+    for (MachineBasicBlock *MBB : OrderedBlks) {
+      dbgs() << "BB" << MBB->getNumber();
+      dbgs() << "(" << getSCCNum(MBB) << "," << MBB->size() << ")";
+      if (i != 0 && i % 10 == 0) {
+        dbgs() << "\n";
+      } else {
+        dbgs() << " ";
+      }
+      ++i;
+    }
+  }
+  /// Print every loop that \p LoopInfo iterates over to dbgs().
+  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
+    for (MachineLoop *Loop : LoopInfo)
+      Loop->print(dbgs(), 0);
+  }
+
+  // UTILITY FUNCTIONS
+  int getSCCNum(MachineBasicBlock *MBB) const;
+  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
+  bool hasBackEdge(MachineBasicBlock *MBB) const;
+  bool isRetiredBlock(MachineBasicBlock *MBB) const;
+  bool isActiveLoophead(MachineBasicBlock *MBB) const;
+  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+                          bool AllowSideEntry = true) const;
+  int countActiveBlock(MBBVector::const_iterator It,
+                       MBBVector::const_iterator E) const;
+  bool needMigrateBlock(MachineBasicBlock *MBB) const;
+
+  // Utility Functions
+  void reversePredicateSetter(MachineBasicBlock::iterator I,
+                              MachineBasicBlock &MBB);
+  /// Compute the reversed DFS post order of Blocks
+  void orderBlocks(MachineFunction *MF);
+
+  // Function originally from CFGStructTraits
+  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
+                      const DebugLoc &DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
+                                  const DebugLoc &DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
+  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
+                              const DebugLoc &DL);
+  void insertCondBranchBefore(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator I, int NewOpcode,
+                              int RegNum, const DebugLoc &DL);
+  static int getBranchNzeroOpcode(int OldOpcode);
+  static int getBranchZeroOpcode(int OldOpcode);
+  static int getContinueNzeroOpcode(int OldOpcode);
+  static int getContinueZeroOpcode(int OldOpcode);
+  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
+  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
+  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
+                                           MachineInstr *MI);
+  static bool isCondBranch(MachineInstr *MI);
+  static bool isUncondBranch(MachineInstr *MI);
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
+  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
+  ///
+  /// BB with backward-edge could have move instructions after the branch
+  /// instruction. Such move instruction "belong to" the loop backward-edge.
+  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
+  static bool isReturnBlock(MachineBasicBlock *MBB);
+  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
+                                 MachineBasicBlock *SrcMBB) ;
+  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
+  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
+  /// because the AMDGPU instruction is not recognized as terminator fix this
+  /// and retire this routine
+  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
+  static void wrapup(MachineBasicBlock *MBB);
+
+
+  int patternMatch(MachineBasicBlock *MBB);
+  int patternMatchGroup(MachineBasicBlock *MBB);
+  int serialPatternMatch(MachineBasicBlock *MBB);
+  int ifPatternMatch(MachineBasicBlock *MBB);
+  int loopendPatternMatch();
+  int mergeLoop(MachineLoop *LoopRep);
+
+  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
+  /// the same loop with LoopLandInfo without explicitly keeping track of
+  /// loopContBlks and loopBreakBlks, this is a method to get the information.
+  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
+                                   MachineBasicBlock *Src2MBB);
+  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock **LandMBBPtr);
+  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock *LandMBB, bool Detail = false);
+  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
+  void mergeSerialBlock(MachineBasicBlock *DstMBB,
+                        MachineBasicBlock *SrcMBB);
+
+  void mergeIfthenelseBlock(MachineInstr *BranchMI,
+      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
+  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
+                          MachineBasicBlock *LandMBB);
+  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+                           MachineBasicBlock *LandMBB);
+  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+                           MachineBasicBlock *ContMBB);
+  /// normalizeInfiniteLoopExit change
+  ///   B1:
+  ///        uncond_br LoopHeader
+  ///
+  /// to
+  ///   B1:
+  ///        cond_br 1 LoopHeader dummyExit
+  /// and return the newly added dummy exit block
+  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
+  void removeUnconditionalBranch(MachineBasicBlock *MBB);
+  /// Remove duplicate branches instructions in a block.
+  /// For instance
+  /// B0:
+  ///    cond_br X B1 B2
+  ///    cond_br X B1 B2
+  /// is transformed to
+  /// B0:
+  ///    cond_br X B1 B2
+  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
+  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
+  void removeSuccessor(MachineBasicBlock *MBB);
+  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
+                                              MachineBasicBlock *PredMBB);
+  void migrateInstruction(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
+  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
+  void retireBlock(MachineBasicBlock *MBB);
+
+
+private:
+  MBBInfoMap BlockInfoMap;                // Per-block records; freed in run().
+  LoopLandInfoMap LLInfoMap;              // Queried by getLoopLandInfo().
+  std::map<MachineLoop *, bool> Visited;  // Consulted by loopendPatternMatch().
+  MachineFunction *FuncRep;               // Function currently being processed.
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
+};
+
+/// Return the SCC number recorded for \p MBB, or INVALIDSCCNUM when the block
+/// has no entry in BlockInfoMap.
+int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator Pos = BlockInfoMap.find(MBB);
+  return Pos != BlockInfoMap.end() ? Pos->second->SccNum : INVALIDSCCNUM;
+}
+
+/// Return the block stored in LLInfoMap for \p LoopRep, or nullptr when the
+/// loop has no entry.
+MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
+    const {
+  LoopLandInfoMap::const_iterator Pos = LLInfoMap.find(LoopRep);
+  return Pos != LLInfoMap.end() ? Pos->second : nullptr;
+}
+
+/// Return true when \p MBB belongs to a loop (per MLI) and the header of that
+/// loop is one of its successors.
+bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
+  if (MachineLoop *Loop = MLI->getLoopFor(MBB))
+    return MBB->isSuccessor(Loop->getHeader());
+  return false;
+}
+
+/// Return the IsRetired flag recorded for \p MBB; blocks without an entry in
+/// BlockInfoMap count as not retired.
+bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator Pos = BlockInfoMap.find(MBB);
+  return Pos != BlockInfoMap.end() ? Pos->second->IsRetired : false;
+}
+
+/// Return true when \p MBB is the header of some loop whose landing block is
+/// either not recorded or not retired. The walk starts at the loop containing
+/// \p MBB and continues through parent loops for as long as \p MBB remains
+/// their header.
+bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
+  for (MachineLoop *Loop = MLI->getLoopFor(MBB);
+       Loop && Loop->getHeader() == MBB; Loop = Loop->getParentLoop()) {
+    MachineBasicBlock *Land = getLoopLandInfo(Loop);
+    if (!Land || !isRetiredBlock(Land))
+      return true;
+  }
+  return false;
+}
+/// Follow the chain of single-successor blocks starting at \p SrcMBB.
+///
+/// \returns SinglePath_InPath when the chain reaches \p DstMBB (or the two
+/// blocks are the same), SinglePath_NotInPath when the chain ends in a block
+/// without successors, and Not_SinglePath otherwise. With \p AllowSideEntry
+/// false, a chain block having more than one predecessor also yields
+/// Not_SinglePath.
+AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+    bool AllowSideEntry) const {
+  assert(DstMBB);
+  MachineBasicBlock *Cur = SrcMBB;
+  if (Cur == DstMBB)
+    return SinglePath_InPath;
+
+  for (; Cur && Cur->succ_size() == 1;) {
+    Cur = *Cur->succ_begin();
+    if (Cur == DstMBB)
+      return SinglePath_InPath;
+    if (!AllowSideEntry && Cur->pred_size() > 1)
+      return Not_SinglePath;
+  }
+
+  const bool EndsWithoutSuccessor = Cur && Cur->succ_size() == 0;
+  return EndsWithoutSuccessor ? SinglePath_NotInPath : Not_SinglePath;
+}
+
+/// Count the blocks in the range [It, E) that are not retired.
+int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
+    MBBVector::const_iterator E) const {
+  int Active = 0;
+  for (; It != E; ++It)
+    Active += isRetiredBlock(*It) ? 0 : 1;
+  return Active;
+}
+
+/// Return true when \p MBB has several predecessors, holds more than 30
+/// instructions, and its size multiplied by (predecessor count - 1) exceeds
+/// 100. A null \p MBB yields false.
+bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
+  const unsigned BlockSizeThreshold = 30;
+  const unsigned CloneInstrThreshold = 100;
+
+  if (!MBB || MBB->pred_size() <= 1)
+    return false;
+
+  unsigned BlkSize = MBB->size();
+  if (BlkSize <= BlockSizeThreshold)
+    return false;
+  return BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold;
+}
+
+/// Scan backwards from \p I (which may be MBB.end()) for the nearest PRED_X
+/// instruction and flip the comparison stored in its operand 2:
+/// SETE_INT <-> SETNE_INT and SETE <-> SETNE.
+///
+/// The scan stops at the first instruction of \p MBB; reaching it without
+/// finding a PRED_X is reported as an internal error instead of stepping the
+/// iterator in front of MBB.begin().
+void AMDGPUCFGStructurizer::reversePredicateSetter(
+    MachineBasicBlock::iterator I, MachineBasicBlock &MBB) {
+  assert(I.isValid() && "Expected valid iterator");
+  for (;; --I) {
+    // The end iterator designates no instruction; only inspect real ones.
+    if (I != MBB.end() && I->getOpcode() == AMDGPU::PRED_X) {
+      switch (I->getOperand(2).getImm()) {
+      case AMDGPU::PRED_SETE_INT:
+        I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT);
+        return;
+      case AMDGPU::PRED_SETNE_INT:
+        I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT);
+        return;
+      case AMDGPU::PRED_SETE:
+        I->getOperand(2).setImm(AMDGPU::PRED_SETNE);
+        return;
+      case AMDGPU::PRED_SETNE:
+        I->getOperand(2).setImm(AMDGPU::PRED_SETE);
+        return;
+      default:
+        llvm_unreachable("PRED_X Opcode invalid!");
+      }
+    }
+    // Nothing precedes the first instruction, so the search has failed.
+    if (I == MBB.begin())
+      llvm_unreachable("No PRED_X instruction found to reverse!");
+  }
+}
+
+/// Create an operand-less instruction with opcode \p NewOpcode and debug
+/// location \p DL and append it to \p MBB.
+void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
+                                           int NewOpcode, const DebugLoc &DL) {
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->push_back(NewMI);
+  // No register operand is added to the new instruction.
+  SHOWNEWINSTR(NewMI);
+}
+
+/// Create an operand-less instruction with opcode \p NewOpcode and debug
+/// location \p DL, place it at the front of \p MBB and return it.
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
+                                                       int NewOpcode,
+                                                       const DebugLoc &DL) {
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  if (MBB->begin() == MBB->end())
+    MBB->push_back(NewMI);
+  else
+    MBB->insert(MBB->begin(), NewMI);
+  SHOWNEWINSTR(NewMI);
+  return NewMI;
+}
+
+/// Create an operand-less instruction with opcode \p NewOpcode (and an empty
+/// debug location), insert it immediately before \p I in the block that
+/// contains \p I, and return it.
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
+    MachineBasicBlock::iterator I, int NewOpcode) {
+  MachineBasicBlock *Parent = I->getParent();
+  MachineFunction *MF = Parent->getParent();
+  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  Parent->insert(I, NewMI);
+  // No register operand is added to the new instruction.
+  SHOWNEWINSTR(NewMI);
+  return NewMI;
+}
+
+/// Insert a new instruction with opcode \p NewOpcode before \p I. Its single
+/// register use is copied from operand 1 of the instruction at \p I, which is
+/// left in place for the caller to erase.
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+    MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) {
+  MachineInstr *Existing = &(*I);
+  MachineBasicBlock *Parent = Existing->getParent();
+  MachineFunction *MF = Parent->getParent();
+
+  MachineInstr *Branch = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  Parent->insert(I, Branch);
+
+  MachineInstrBuilder Builder(*MF, Branch);
+  Builder.addReg(Existing->getOperand(1).getReg(), false);
+  SHOWNEWINSTR(Branch);
+}
+
+/// Insert a new instruction with opcode \p NewOpcode before \p I in \p blk,
+/// giving it \p RegNum as its single register use.
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+    MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode,
+    int RegNum, const DebugLoc &DL) {
+  MachineFunction *MF = blk->getParent();
+  MachineInstr *Branch = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  blk->insert(I, Branch);
+  MachineInstrBuilder Builder(*MF, Branch);
+  Builder.addReg(RegNum, false);
+  SHOWNEWINSTR(Branch);
+}
+
+/// Map a branch opcode to the IF opcode used for its non-zero form:
+/// JUMP/JUMP_COND give IF_PREDICATE_SET and BRANCH_COND_i32/f32 give
+/// IF_LOGICALNZ_f32. Any other opcode is an internal error.
+int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
+  if (OldOpcode == AMDGPU::JUMP_COND || OldOpcode == AMDGPU::JUMP)
+    return AMDGPU::IF_PREDICATE_SET;
+  if (OldOpcode == AMDGPU::BRANCH_COND_i32 ||
+      OldOpcode == AMDGPU::BRANCH_COND_f32)
+    return AMDGPU::IF_LOGICALNZ_f32;
+  llvm_unreachable("internal error");
+  return -1;
+}
+
+/// Map a branch opcode to the IF opcode used for its zero form:
+/// JUMP/JUMP_COND give IF_PREDICATE_SET and BRANCH_COND_i32/f32 give
+/// IF_LOGICALZ_f32. Any other opcode is an internal error.
+int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
+  if (OldOpcode == AMDGPU::JUMP_COND || OldOpcode == AMDGPU::JUMP)
+    return AMDGPU::IF_PREDICATE_SET;
+  if (OldOpcode == AMDGPU::BRANCH_COND_i32 ||
+      OldOpcode == AMDGPU::BRANCH_COND_f32)
+    return AMDGPU::IF_LOGICALZ_f32;
+  llvm_unreachable("internal error");
+  return -1;
+}
+
+/// Map JUMP/JUMP_COND to CONTINUE_LOGICALNZ_i32. Any other opcode is an
+/// internal error.
+int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
+  if (OldOpcode == AMDGPU::JUMP_COND || OldOpcode == AMDGPU::JUMP)
+    return AMDGPU::CONTINUE_LOGICALNZ_i32;
+  llvm_unreachable("internal error");
+  return -1;
+}
+
+/// Map JUMP/JUMP_COND to CONTINUE_LOGICALZ_i32. Any other opcode is an
+/// internal error.
+int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
+  if (OldOpcode == AMDGPU::JUMP_COND || OldOpcode == AMDGPU::JUMP)
+    return AMDGPU::CONTINUE_LOGICALZ_i32;
+  llvm_unreachable("internal error");
+  return -1;
+}
+
+/// Return the block stored in operand 0 of \p MI.
+MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
+  MachineOperand &Target = MI->getOperand(0);
+  return Target.getMBB();
+}
+
+/// Store \p MBB as the block in operand 0 of \p MI.
+void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
+                                          MachineBasicBlock *MBB) {
+  MachineOperand &Target = MI->getOperand(0);
+  Target.setMBB(MBB);
+}
+
+/// For a block with exactly two successors, return the successor that is not
+/// the true-branch target of \p MI. When the first successor is not the true
+/// target, the first successor is returned.
+MachineBasicBlock *
+AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
+                                      MachineInstr *MI) {
+  assert(MBB->succ_size() == 2);
+  MachineBasicBlock *Taken = getTrueBranch(MI);
+  MachineBasicBlock::succ_iterator First = MBB->succ_begin();
+  MachineBasicBlock::succ_iterator Second = First;
+  ++Second;
+  if (*First == Taken)
+    return *Second;
+  return *First;
+}
+
+/// Return true when \p MI is JUMP_COND, BRANCH_COND_i32 or BRANCH_COND_f32.
+bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
+  const unsigned Opc = MI->getOpcode();
+  return Opc == AMDGPU::JUMP_COND || Opc == AMDGPU::BRANCH_COND_i32 ||
+         Opc == AMDGPU::BRANCH_COND_f32;
+}
+
+/// Return true when \p MI is JUMP or BRANCH.
+bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
+  const unsigned Opc = MI->getOpcode();
+  return Opc == AMDGPU::JUMP || Opc == AMDGPU::BRANCH;
+}
+
+/// Return the debug location of the last instruction in \p MBB that carries
+/// one, or a default-constructed DebugLoc when no instruction does.
+DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
+  DebugLoc Result;
+  // Scan front to back; each later location overwrites the earlier one.
+  for (MachineInstr &MI : *MBB) {
+    if (MI.getDebugLoc())
+      Result = MI.getDebugLoc();
+  }
+  return Result;
+}
+
+/// Return the last instruction of \p MBB when it is a conditional or
+/// unconditional branch, nullptr otherwise. An empty block yields nullptr
+/// instead of dereferencing rbegin() of an empty instruction list (same guard
+/// as getReturnInstr).
+MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  if (It == MBB->rend())
+    return nullptr;
+  MachineInstr *MI = &*It;
+  if (isCondBranch(MI) || isUncondBranch(MI))
+    return MI;
+  return nullptr;
+}
+
+/// Walk \p MBB from its last instruction backwards, skipping instructions for
+/// which TII->isMov() holds, and return the first branch encountered. Return
+/// nullptr when a non-mov, non-branch instruction is met first or the block
+/// is exhausted.
+MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator Cur = MBB->rbegin();
+  const MachineBasicBlock::reverse_iterator End = MBB->rend();
+  for (; Cur != End; ++Cur) {
+    MachineInstr *MI = &*Cur;
+    if (isCondBranch(MI) || isUncondBranch(MI))
+      return MI;
+    if (!TII->isMov(MI->getOpcode()))
+      return nullptr;
+  }
+  return nullptr;
+}
+
+/// Return the last instruction of \p MBB when its opcode is RETURN; return
+/// nullptr otherwise, including for an empty block.
+MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator Last = MBB->rbegin();
+  if (Last == MBB->rend())
+    return nullptr;
+  MachineInstr *Candidate = &(*Last);
+  return Candidate->getOpcode() == AMDGPU::RETURN ? Candidate : nullptr;
+}
+
+/// Return true when \p MBB has no successors. A block ending in RETURN is
+/// asserted to have none; a successor-less block without RETURN is only
+/// mentioned in the debug output.
+bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
+  MachineInstr *RetMI = getReturnInstr(MBB);
+  bool IsReturn = (MBB->succ_size() == 0);
+  if (RetMI) {
+    assert(IsReturn);
+  } else if (IsReturn) {
+    DEBUG(
+      dbgs() << "BB" << MBB->getNumber()
+             <<" is return block without RETURN instr\n";);
+  }
+  return IsReturn;
+}
+
+/// Add every successor of \p SrcMBB as a successor of \p DstMBB, in order.
+void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  MachineBasicBlock::succ_iterator Cur = SrcMBB->succ_begin();
+  MachineBasicBlock::succ_iterator End = SrcMBB->succ_end();
+  while (Cur != End) {
+    DstMBB->addSuccessor(*Cur);
+    ++Cur;
+  }
+}
+
+/// Create a new block at the end of the function that owns \p MBB and fill it
+/// with clones of all instructions of \p MBB, in order. Successor edges are
+/// not copied here.
+MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *Copy = MF->CreateMachineBasicBlock();
+  MF->push_back(Copy);
+  for (MachineBasicBlock::iterator Cur = MBB->begin(), End = MBB->end();
+       Cur != End; ++Cur) {
+    const MachineInstr &Orig = *Cur;
+    Copy->push_back(MF->CloneMachineInstr(&Orig));
+  }
+  return Copy;
+}
+
+/// If the branch found by getLoopendBlockBranchInstr() in \p SrcMBB is a
+/// conditional branch whose true target is \p OldMBB, retarget it to
+/// \p NewBlk. Otherwise nothing is changed.
+void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
+    MachineBasicBlock *NewBlk) {
+  MachineInstr *Branch = getLoopendBlockBranchInstr(SrcMBB);
+  if (!Branch || !isCondBranch(Branch))
+    return;
+  if (getTrueBranch(Branch) != OldMBB)
+    return;
+  setTrueBranch(Branch, NewBlk);
+}
+
+/// Final clean-up of \p MBB: erase every CONTINUE instruction that is directly
+/// followed by an ENDLOOP. Asserts that the function has no (non-empty) jump
+/// table.
+void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
+  assert((!MBB->getParent()->getJumpTableInfo()
+          || MBB->getParent()->getJumpTableInfo()->isEmpty())
+         && "found a jump table");
+
+  // First pass: remember each CONTINUE sitting right in front of an ENDLOOP.
+  // Erasure is deferred so the iterators stay valid during the scan.
+  SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> DeadContinues;
+  MachineBasicBlock::iterator Prev = MBB->begin();
+  for (MachineBasicBlock::iterator Cur = MBB->begin(), End = MBB->end();
+       Cur != End; Prev = Cur, ++Cur) {
+    const bool PrevIsContinue = Prev->getOpcode() == AMDGPU::CONTINUE;
+    if (PrevIsContinue && Cur->getOpcode() == AMDGPU::ENDLOOP)
+      DeadContinues.push_back(&*Prev);
+  }
+
+  // Second pass: drop the collected instructions.
+  for (MachineInstr *Dead : DeadContinues)
+    Dead->eraseFromParent();
+
+  // TODO to fix up jump table so later phase won't be confused.  if
+  // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
+  // there isn't such an interface yet.  alternatively, replace all the other
+  // blocks in the jump table with the entryBlk //}
+
+}
+
+
+/// Prepare the function for structurization: order the blocks, give loops
+/// without exiting blocks a dummy exit, strip unconditional and redundant
+/// conditional branches from every ordered block, and funnel multiple return
+/// blocks into one dummy exit block.
+///
+/// \returns true only when a dummy exit block was added for the returns.
+bool AMDGPUCFGStructurizer::prepare() {
+  //FIXME: if not reducible flow graph, make it so ???
+
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
+
+  orderBlocks(FuncRep);
+
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ReturnBlocks;
+
+  // Loops that report no exiting block get one via
+  // normalizeInfiniteLoopExit(); the block it returns is treated like a
+  // return block.
+  for (MachineLoop *Loop : *MLI) {
+    MBBVector Exiting;
+    Loop->getExitingBlocks(Exiting);
+    if (Exiting.size() != 0)
+      continue;
+    if (MachineBasicBlock *DummyExit = normalizeInfiniteLoopExit(Loop))
+      ReturnBlocks.push_back(DummyExit);
+  }
+
+  // Clean up the branches of each ordered block and collect return blocks.
+  for (MachineBasicBlock *MBB : OrderedBlks) {
+    removeUnconditionalBranch(MBB);
+    removeRedundantConditionalBranch(MBB);
+    if (isReturnBlock(MBB))
+      ReturnBlocks.push_back(MBB);
+    assert(MBB->succ_size() <= 2);
+  }
+
+  // A dummy exit block is only needed when there are several returns.
+  if (ReturnBlocks.size() < 2)
+    return false;
+  addDummyExitBlock(ReturnBlocks);
+  return true;
+}
+
+/// Drive the structurization of FuncRep.
+///
+/// OrderedBlks is swept repeatedly. Within a sweep, blocks are handled one SCC
+/// at a time (consecutive blocks sharing an SCC number): patternMatch() runs
+/// on every non-retired block, and an SCC is re-processed for as long as its
+/// count of active blocks keeps shrinking and is not 1. Sweeps stop when the
+/// entry block has no successors left (success) or a sweep fails to reduce the
+/// number of active blocks.
+///
+/// Afterwards wrapup() runs on the entry block, retired blocks are erased
+/// from the function and all BlockInformation records are freed. If the
+/// function was not reduced to a single block, a fatal error
+/// "IRREDUCIBLE_CFG" is reported.
+///
+/// \returns true (when it returns at all).
+bool AMDGPUCFGStructurizer::run() {
+
+  //Assume reducible CFG...
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
+
+#ifdef STRESSTEST
+  //Use the worse block ordering to test the algorithm.
+  ReverseVector(OrderedBlks);
+#endif
+
+  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+  int NumIter = 0;
+  bool Finish = false;
+  MachineBasicBlock *MBB;
+  bool MakeProgress = false;
+  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
+                                        OrderedBlks.end());
+
+  do {
+    ++NumIter;
+    DEBUG(
+      dbgs() << "numIter = " << NumIter
+             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
+    );
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
+        OrderedBlks.begin();
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
+        OrderedBlks.end();
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
+        It;
+    MachineBasicBlock *SccBeginMBB = nullptr;
+    // Active-block count of the current SCC after its previous pass; it is
+    // reset to NumRemainedBlk whenever a new SCC is started.
+    int SccNumBlk = 0;
+    // Number of passes made over the current SCC; reset when an SCC starts.
+    int SccNumIter = 0;
+
+    while (It != E) {
+      MBB = *It;
+
+      if (!SccBeginMBB) {
+        // First block of a new SCC.
+        SccBeginIter = It;
+        SccBeginMBB = MBB;
+        SccNumIter = 0;
+        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
+        DEBUG(
+              dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
+              dbgs() << "\n";
+        );
+      }
+
+      if (!isRetiredBlock(MBB))
+        patternMatch(MBB);
+
+      ++It;
+
+      bool ContNextScc = true;
+      if (It == E
+          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
+        // Just finish one scc.
+        ++SccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
+          // The last pass did not shrink the SCC: give up on it.
+          DEBUG(
+            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
+                   << ", sccNumIter = " << SccNumIter;
+            dbgs() << "doesn't make any progress\n";
+          );
+          ContNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
+          // The SCC shrank but is not a single block yet: go over it again.
+          SccNumBlk = sccRemainedNumBlk;
+          It = SccBeginIter;
+          ContNextScc = false;
+          DEBUG(
+            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
+                   << "sccNumIter = " << SccNumIter << '\n';
+          );
+        } else {
+          // Finish the current scc.
+          ContNextScc = true;
+        }
+      } else {
+        // Continue on next component in the current scc.
+        ContNextScc = false;
+      }
+
+      if (ContNextScc)
+        SccBeginMBB = nullptr;
+    } //while, "one iteration" over the function.
+
+    MachineBasicBlock *EntryMBB =
+        *GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
+    if (EntryMBB->succ_size() == 0) {
+      Finish = true;
+      DEBUG(
+        dbgs() << "Reduce to one block\n";
+      );
+    } else {
+      int NewnumRemainedBlk
+        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
+      // consider cloned blocks ??
+      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
+        MakeProgress = true;
+        NumRemainedBlk = NewnumRemainedBlk;
+      } else {
+        MakeProgress = false;
+        DEBUG(
+          dbgs() << "No progress\n";
+        );
+      }
+    }
+  } while (!Finish && MakeProgress);
+
+  // Misc wrap up to maintain the consistency of the Function representation.
+  wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
+
+  // Detach retired Block, release memory.
+  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
+       It != E; ++It) {
+    if ((*It).second && (*It).second->IsRetired) {
+      assert(((*It).first)->getNumber() != -1);
+      DEBUG(
+        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
+      );
+      (*It).first->eraseFromParent();  //Remove from the parent Function.
+    }
+    delete (*It).second;
+  }
+  BlockInfoMap.clear();
+  LLInfoMap.clear();
+
+  if (!Finish) {
+    DEBUG(FuncRep->viewCFG());
+    report_fatal_error("IRREDUCIBLE_CFG");
+  }
+
+  return true;
+}
+
+
+
+/// Append the blocks of \p MF to OrderedBlks in the order produced by the SCC
+/// iterator and record, for every block, the index of the SCC it belongs to.
+/// Blocks that end up without an SCC number are reported on dbgs() as
+/// unreachable.
+void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
+  int SccIndex = 0;
+  for (scc_iterator<MachineFunction *> SccIt = scc_begin(MF);
+       !SccIt.isAtEnd(); ++SccIt, ++SccIndex) {
+    const std::vector<MachineBasicBlock *> &Members = *SccIt;
+    for (MachineBasicBlock *Member : Members) {
+      OrderedBlks.push_back(Member);
+      recordSccnum(Member, SccIndex);
+    }
+  }
+
+  // Any block of the function that received no SCC number was never visited
+  // by the SCC walk above.
+  typedef GraphTraits<MachineFunction *> GTM;
+  for (auto NodeIt = GTM::nodes_begin(MF), NodeEnd = GTM::nodes_end(MF);
+       NodeIt != NodeEnd; ++NodeIt) {
+    MachineBasicBlock *Block = *NodeIt;
+    if (getSCCNum(Block) == INVALIDSCCNUM)
+      dbgs() << "unreachable block BB" << Block->getNumber() << "\n";
+  }
+}
+
+/// Call patternMatchGroup() on \p MBB until it reports no match and return
+/// the sum of all reported match counts.
+int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
+  int Total = 0;
+
+  DEBUG(
+        dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
+  );
+
+  for (int Round = patternMatchGroup(MBB); Round > 0;
+       Round = patternMatchGroup(MBB))
+    Total += Round;
+
+  DEBUG(
+        dbgs() << "End patternMatch BB" << MBB->getNumber()
+      << ", numMatch = " << Total << "\n";
+  );
+
+  return Total;
+}
+
+/// Run the loop-end, serial and if matchers once, in that order, and return
+/// the sum of their match counts.
+int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
+  const int LoopMatches = loopendPatternMatch();
+  const int SerialMatches = serialPatternMatch(MBB);
+  const int IfMatches = ifPatternMatch(MBB);
+  return LoopMatches + SerialMatches + IfMatches;
+}
+
+
+/// Merge \p MBB with its only successor when that successor has \p MBB as its
+/// only predecessor and is not an active loop header.
+///
+/// \returns 1 when a merge happened, 0 otherwise.
+int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 1)
+    return 0;
+
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  const bool Mergeable = Succ->pred_size() == 1 && !isActiveLoophead(Succ);
+  if (!Mergeable)
+    return 0;
+
+  mergeSerialBlock(MBB, Succ);
+  ++numSerialPatternMatch;
+  return 1;
+}
+
+int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { // Try to collapse an if-region headed by MBB; returns the number of matches made (nested matches and cloned blocks included), 0 when MBB does not qualify.
+ // Only a block with exactly two outgoing edges can head an if-region.
+ if (MBB->succ_size() != 2)
+ return 0;
+ if (hasBackEdge(MBB)) // Blocks branching back to their loop header are not handled here.
+ return 0;
+ MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+ if (!BranchMI)
+ return 0;
+
+ assert(isCondBranch(BranchMI));
+ int NumMatch = 0;
+
+ MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
+ NumMatch += serialPatternMatch(TrueMBB); // Reduce the true side first; these calls may modify the CFG.
+ NumMatch += ifPatternMatch(TrueMBB);
+ MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
+ NumMatch += serialPatternMatch(FalseMBB); // Then reduce the false side the same way.
+ NumMatch += ifPatternMatch(FalseMBB);
+ MachineBasicBlock *LandBlk; // Assigned by every branch of the chain below that does not return.
+ int Cloned = 0;
+
+ assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
+ // TODO: Simplify. The chain below classifies the region shape and picks the landing block.
+ if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
+ && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
+ // Diamond pattern: both sides have the same single successor.
+ LandBlk = *TrueMBB->succ_begin();
+ } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
+ // Triangle pattern, false is empty
+ LandBlk = FalseMBB;
+ FalseMBB = nullptr;
+ } else if (FalseMBB->succ_size() == 1
+ && *FalseMBB->succ_begin() == TrueMBB) {
+ // Triangle pattern, true is empty
+ // We reverse the predicate to make a triangle, empty false pattern;
+ std::swap(TrueMBB, FalseMBB);
+ reversePredicateSetter(MBB->end(), *MBB);
+ LandBlk = FalseMBB;
+ FalseMBB = nullptr;
+ } else if (FalseMBB->succ_size() == 1
+ && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
+ LandBlk = *FalseMBB->succ_begin();
+ } else if (TrueMBB->succ_size() == 1
+ && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
+ LandBlk = *TrueMBB->succ_begin();
+ } else {
+ return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); // No simple shape matched: delegate to handleJumpintoIf.
+ }
+
+ // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
+ // new BB created for landBlk==NULL may introduce new challenge to the
+ // reduction process.
+ if (LandBlk &&
+ ((TrueMBB && TrueMBB->pred_size() > 1)
+ || (FalseMBB && FalseMBB->pred_size() > 1))) {
+ Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
+ }
+
+ if (TrueMBB && TrueMBB->pred_size() > 1) { // A side with other predecessors is replaced by the block cloneBlockForPredecessor returns.
+ TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
+ ++Cloned;
+ }
+
+ if (FalseMBB && FalseMBB->pred_size() > 1) {
+ FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
+ ++Cloned;
+ }
+
+ mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
+
+ ++numIfPatternMatch;
+
+ numClonedBlock += Cloned;
+
+ return 1 + Cloned + NumMatch; // This match, plus clones, plus the nested matches counted above.
+}
+
+/// Run mergeLoop() on every loop that has blocks and is not marked in
+/// Visited, and return the sum of the values it reports. Processing stops
+/// early as soon as mergeLoop() returns -1.
+int AMDGPUCFGStructurizer::loopendPatternMatch() {
+  // Each top-level loop is walked depth-first and every visited loop is
+  // pushed to the FRONT of the queue, so loops are examined in the reverse of
+  // their visitation order.
+  std::deque<MachineLoop *> Worklist;
+  for (auto &TopLoop : *MLI)
+    for (MachineLoop *Nested : depth_first(TopLoop))
+      Worklist.push_front(Nested);
+
+  if (Worklist.empty())
+    return 0;
+
+  int Total = 0;
+  for (MachineLoop *Candidate : Worklist) {
+    const bool Skip = Candidate->getNumBlocks() == 0 || Visited[Candidate];
+    if (Skip)
+      continue;
+    DEBUG(dbgs() << "Processing:\n"; Candidate->dump(););
+    const int NumBreak = mergeLoop(Candidate);
+    if (NumBreak == -1)
+      break;
+    Total += NumBreak;
+  }
+  return Total;
+}
+
+/// Structurizes one loop: rewrites the branches of its exiting and latch
+/// blocks, reduces the header with serial/if pattern matching until a round
+/// produces no match, calls mergeLooplandBlock() on the header and finally
+/// updates MLI and the Visited map.
+///
+/// The loop is asserted to have at least one exiting block and exactly one
+/// distinct exit block.
+///
+/// \returns always 1.
+int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+
+  MBBVector ExitingMBBs;
+  LoopRep->getExitingBlocks(ExitingMBBs);
+  assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
+  DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
+
+  // We assume a single ExitBlk; the reported exit blocks are deduplicated
+  // before that assumption is checked.
+  MBBVector ExitBlks;
+  LoopRep->getExitBlocks(ExitBlks);
+  SmallPtrSet<MachineBasicBlock *, 2> UniqueExits;
+  for (MachineBasicBlock *Exit : ExitBlks)
+    UniqueExits.insert(Exit);
+  assert(UniqueExits.size() == 1);
+  MachineBasicBlock *ExitBlk = *ExitBlks.begin();
+  assert(ExitBlk && "Loop has several exit block");
+
+  // Latch blocks: predecessors of the header that belong to the loop.
+  MBBVector LatchBlks;
+  typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
+  for (InvMBBTraits::ChildIteratorType
+           PI = InvMBBTraits::child_begin(LoopHeader),
+           PE = InvMBBTraits::child_end(LoopHeader);
+       PI != PE; PI++) {
+    if (LoopRep->contains(*PI))
+      LatchBlks.push_back(*PI);
+  }
+
+  for (MachineBasicBlock *Exiting : ExitingMBBs)
+    mergeLoopbreakBlock(Exiting, ExitBlk);
+  for (MachineBasicBlock *Latch : LatchBlks)
+    settleLoopcontBlock(Latch, LoopHeader);
+
+  // Keep reducing the header until one full round matches nothing.
+  for (;;) {
+    int Match = serialPatternMatch(LoopHeader);
+    Match += ifPatternMatch(LoopHeader);
+    if (Match <= 0)
+      break;
+  }
+
+  mergeLooplandBlock(LoopHeader, ExitBlk);
+
+  // The header now stands for the whole loop: hand it to the parent loop,
+  // or drop it from the loop info when there is no parent.
+  if (MachineLoop *ParentLoop = LoopRep->getParentLoop())
+    MLI->changeLoopFor(LoopHeader, ParentLoop);
+  else
+    MLI->removeBlock(LoopHeader);
+  Visited[LoopRep] = true;
+  return 1;
+}
+
+/// \returns true when Src1MBB has no successors, MLI reports the same
+/// non-null loop for Src1MBB and Src2MBB, and LLInfoMap holds a non-null
+/// entry for that loop; false otherwise.
+bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
+    MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
+  if (Src1MBB->succ_size() != 0)
+    return false;
+
+  MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
+  if (!LoopRep || LoopRep != MLI->getLoopFor(Src2MBB))
+    return false;
+
+  MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
+  if (!TheEntry)
+    return false;
+
+  DEBUG(
+    dbgs() << "isLoopContBreakBlock yes src1 = BB"
+           << Src1MBB->getNumber()
+           << " src2 = BB" << Src2MBB->getNumber() << "\n";
+  );
+  return true;
+}
+
+/// Runs handleJumpintoIfImp() with the branches as given and, if that
+/// yields 0, once more with the true and false blocks swapped.
+///
+/// \returns the result of the last attempt that was made.
+int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  const int Direct = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
+  if (Direct != 0)
+    return Direct;
+
+  DEBUG(
+    dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
+  );
+  return handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
+}
+
+/// Walks down from TrueMBB along single-successor edges, looking for the
+/// first block DownBlk for which singlePathTo(FalseMBB, DownBlk) reports
+/// SinglePath_InPath. When one is found, side entries on both paths from
+/// HeadMBB to DownBlk are cloned away and the region under HeadMBB is
+/// matched again.
+///
+/// \returns the number of clones plus pattern matches performed, or 0 when
+/// the walk ends without finding such a block.
+int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  DEBUG(
+    dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
+           << " true = BB" << TrueMBB->getNumber()
+           << ", numSucc=" << TrueMBB->succ_size()
+           << " false = BB" << FalseMBB->getNumber() << "\n";
+  );
+
+  int Num = 0;
+
+  // trueBlk could be the common post dominator, so the walk starts there.
+  // The walk stops as soon as a block does not have exactly one successor.
+  for (MachineBasicBlock *DownBlk = TrueMBB; DownBlk;
+       DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin())
+                                             : nullptr) {
+    DEBUG(
+      dbgs() << "check down = BB" << DownBlk->getNumber();
+    );
+
+    if (singlePathTo(FalseMBB, DownBlk) != SinglePath_InPath) {
+      DEBUG(
+        dbgs() << " not working\n";
+      );
+      continue;
+    }
+
+    DEBUG(
+      dbgs() << " working\n";
+    );
+
+    Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
+    Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
+
+    // At this point Num counts cloned blocks only.
+    numClonedBlock += Num;
+    Num += serialPatternMatch(*HeadMBB->succ_begin());
+    Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
+    Num += ifPatternMatch(HeadMBB);
+    assert(Num > 0);
+    break;
+  }
+
+  return Num;
+}
+
+/// Debug helper: writes a description of the head, true, false and land
+/// blocks to dbgs(). Null true/false/land blocks are skipped. When Detail is
+/// set, each described block is also printed in full.
+void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
+    MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
+  // Prints the full block between two newlines, but only in Detail mode.
+  auto PrintBody = [Detail](MachineBasicBlock *Blk) {
+    if (!Detail)
+      return;
+    dbgs() << "\n";
+    Blk->print(dbgs());
+    dbgs() << "\n";
+  };
+
+  // Prints "<Prefix><number> size = N numPred = P" for an optional block.
+  auto PrintArm = [&PrintBody](const char *Prefix, MachineBasicBlock *Blk) {
+    if (!Blk)
+      return;
+    dbgs() << Prefix << Blk->getNumber() << " size = "
+           << Blk->size() << " numPred = " << Blk->pred_size();
+    PrintBody(Blk);
+  };
+
+  dbgs() << "head = BB" << HeadMBB->getNumber()
+         << " size = " << HeadMBB->size();
+  PrintBody(HeadMBB);
+
+  PrintArm(", true = BB", TrueMBB);
+  PrintArm(", false = BB", FalseMBB);
+  PrintArm(", land = BB", LandMBB);
+
+  dbgs() << "\n";
+}
+
+/// Intended to avoid cloning the true/false blocks of an if-region by
+/// migrating their instructions into the landing block, guarded by a
+/// register set in the head block (see the "org:/new:" sketch below).
+///
+/// As written, the function returns 0 without changing anything when
+/// TrueMBB == FalseMBB, when needMigrateBlock() rejects both blocks, or when
+/// only one of the two blocks ends up marked for migration. On the remaining
+/// path (both blocks marked) a report_fatal_error() call is always reached
+/// before the transformation is complete; report_fatal_error() is not
+/// expected to return, so the statements following those calls document the
+/// intended algorithm rather than executing.
+///
+/// \param LandMBBPtr in/out pointer to the landing block. It is dereferenced
+/// unconditionally on entry and written back at the end.
+/// \returns the number of new blocks created; NumNewBlk is never
+/// incremented, so this is currently always 0.
+int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+ MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+ MachineBasicBlock **LandMBBPtr) {
+ bool MigrateTrue = false;
+ bool MigrateFalse = false;
+
+ MachineBasicBlock *LandBlk = *LandMBBPtr;
+
+ assert((!TrueMBB || TrueMBB->succ_size() <= 1)
+ && (!FalseMBB || FalseMBB->succ_size() <= 1));
+
+ if (TrueMBB == FalseMBB)
+ return 0;
+
+ MigrateTrue = needMigrateBlock(TrueMBB);
+ MigrateFalse = needMigrateBlock(FalseMBB);
+
+ if (!MigrateTrue && !MigrateFalse)
+ return 0;
+
+ // If we need to migrate either trueBlk or falseBlk, also migrate the other
+ // one if it has more than one predecessor. Without doing this, its
+ // predecessor other than headBlk will have an undefined value in initReg.
+ if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
+ MigrateTrue = true;
+ if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
+ MigrateFalse = true;
+
+ DEBUG(
+ dbgs() << "before improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+ );
+
+ // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+ //
+ // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+ // {initReg = 0; org falseBlk branch }
+ // => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+ // => org landBlk
+ // if landBlk->pred_size() > 2, put the above if-else inside
+ // if (initReg !=2) {...}
+ //
+ // add initReg = initVal to headBlk
+
+ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+ if (!MigrateTrue || !MigrateFalse) {
+ // XXX: We have an opportunity here to optimize the "branch into if" case
+ // here. Branch into if looks like this:
+ // entry
+ // / |
+ // diamond_head branch_from
+ // / \ |
+ // diamond_false diamond_true
+ // \ /
+ // done
+ //
+ // The diamond_head block begins the "if" and the diamond_true block
+ // is the block being "branched into".
+ //
+ // If MigrateTrue is true, then TrueBB is the block being "branched into"
+ // and if MigrateFalse is true, then FalseBB is the block being
+ // "branched into"
+ //
+ // Here is the pseudo code for how I think the optimization should work:
+ // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+ // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+ // 3. Move the branch instruction from diamond_head into its own basic
+ // block (new_block).
+ // 4. Add an unconditional branch from diamond_head to new_block
+ // 5. Replace the branch instruction in branch_from with an unconditional
+ // branch to new_block. If branch_from has multiple predecessors, then
+ // we need to replace the True/False block in the branch
+ // instruction instead of replacing it.
+ // 6. Change the condition of the branch instruction in new_block from
+ // COND to (COND || GPR0)
+ //
+ // In order to insert these MOV instructions, we will need to use the
+ // RegisterScavenger. Usually liveness stops being tracked during
+ // the late machine optimization passes, however if we implement
+ // bool TargetRegisterInfo::requiresRegisterScavenging(
+ // const MachineFunction &MF)
+ // and have it return true, liveness will be tracked correctly
+ // by generic optimization passes. We will also need to make sure that
+ // all of our target-specific passes that run after regalloc and before
+ // the CFGStructurizer track liveness and we will need to modify this pass
+ // to correctly track liveness.
+ //
+ // After the above changes, the new CFG should look like this:
+ // entry
+ // / |
+ // diamond_head branch_from
+ // \ /
+ // new_block
+ // / |
+ // diamond_false diamond_true
+ // \ /
+ // done
+ //
+ // Without this optimization, we are forced to duplicate the diamond_true
+ // block and we will end up with a CFG like this:
+ //
+ // entry
+ // / |
+ // diamond_head branch_from
+ // / \ |
+ // diamond_false diamond_true diamond_true (duplicate)
+ // \ / |
+ // done --------------------|
+ //
+ // Duplicating diamond_true can be very costly especially if it has a
+ // lot of instructions.
+ return 0;
+ }
+
+ // From here on both MigrateTrue and MigrateFalse are true.
+ int NumNewBlk = 0;
+
+ bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
+
+ //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
+ MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
+
+ if (LandBlkHasOtherPred) {
+ report_fatal_error("Extra register needed to handle CFG");
+ unsigned CmpResReg =
+ HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+ report_fatal_error("Extra compare instruction needed to handle CFG");
+ insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
+ CmpResReg, DebugLoc());
+ }
+
+ // XXX: We are running this after RA, so creating virtual registers will
+ // cause an assertion failure in the PostRA scheduling pass.
+ unsigned InitReg =
+ HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+ insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
+ DebugLoc());
+
+ if (MigrateTrue) {
+ migrateInstruction(TrueMBB, LandBlk, I);
+ // need to unconditionally insert the assignment to ensure a path from its
+ // predecessor rather than headBlk has valid value in initReg if
+ // (initVal != 1).
+ report_fatal_error("Extra register needed to handle CFG");
+ }
+ insertInstrBefore(I, AMDGPU::ELSE);
+
+ if (MigrateFalse) {
+ migrateInstruction(FalseMBB, LandBlk, I);
+ // need to unconditionally insert the assignment to ensure a path from its
+ // predecessor rather than headBlk has valid value in initReg if
+ // (initVal != 0)
+ report_fatal_error("Extra register needed to handle CFG");
+ }
+
+ if (LandBlkHasOtherPred) {
+ // add endif
+ insertInstrBefore(I, AMDGPU::ENDIF);
+
+ // put initReg = 2 to other predecessors of landBlk
+ for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
+ PE = LandBlk->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock *MBB = *PI;
+ if (MBB != TrueMBB && MBB != FalseMBB)
+ report_fatal_error("Extra register needed to handle CFG");
+ }
+ }
+ DEBUG(
+ dbgs() << "result from improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+ );
+
+ // update landBlk
+ *LandMBBPtr = LandBlk;
+
+ return NumNewBlk;
+}
+
+/// Folds SrcMBB into DstMBB: all of SrcMBB's instructions are appended to
+/// DstMBB, DstMBB takes over SrcMBB's successor list, and SrcMBB is
+/// disconnected, removed from the loop info and retired.
+void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
+ MachineBasicBlock *SrcMBB) {
+ DEBUG(
+ dbgs() << "serialPattern BB" << DstMBB->getNumber()
+ << " <= BB" << SrcMBB->getNumber() << "\n";
+ );
+ // Move every instruction of SrcMBB to the end of DstMBB.
+ DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
+
+ // Replace the Dst -> Src edge by edges to Src's successors.
+ DstMBB->removeSuccessor(SrcMBB, true);
+ cloneSuccessorList(DstMBB, SrcMBB);
+
+ // SrcMBB must have no successors left before retireBlock() is called,
+ // which asserts that the block is fully disconnected.
+ removeSuccessor(SrcMBB);
+ MLI->removeBlock(SrcMBB);
+ retireBlock(SrcMBB);
+}
+
+/// Collapses an if/then/else region into MBB.
+///
+/// The conditional branch BranchMI is replaced by a structured sequence
+///   if cond / trueBlk / else / falseBlk / endif
+/// built in place inside MBB; the arm blocks are spliced in, disconnected
+/// and retired. The ELSE part is emitted only when FalseMBB is non-null.
+/// When both arms and LandMBB are non-null, LandMBB is added as a successor
+/// of MBB.
+///
+/// \param BranchMI conditional branch to replace; it is erased.
+/// \param TrueMBB  block executed on the true path; asserted non-null.
+/// \param FalseMBB block executed on the false path, or null.
+/// \param LandMBB  block following the region, or null.
+void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
+    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
+  assert (TrueMBB);
+  DEBUG(
+    dbgs() << "ifPattern BB" << MBB->getNumber();
+    dbgs() << "{ ";
+    if (TrueMBB) {
+      dbgs() << "BB" << TrueMBB->getNumber();
+    }
+    dbgs() << " } else ";
+    dbgs() << "{ ";
+    if (FalseMBB) {
+      dbgs() << "BB" << FalseMBB->getNumber();
+    }
+    dbgs() << " }\n ";
+    dbgs() << "landBlock: ";
+    if (!LandMBB) {
+      dbgs() << "NULL";
+    } else {
+      dbgs() << "BB" << LandMBB->getNumber();
+    }
+    dbgs() << "\n";
+  );
+
+  int OldOpcode = BranchMI->getOpcode();
+  DebugLoc BranchDL = BranchMI->getDebugLoc();
+
+  // Everything below is inserted in front of the original branch.
+  MachineBasicBlock::iterator I = BranchMI;
+  insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
+                         BranchDL);
+
+  // Moves the instructions of one arm in front of the branch, then
+  // disconnects and retires the emptied arm block.
+  auto AbsorbArm = [&](MachineBasicBlock *ArmMBB) {
+    MBB->splice(I, ArmMBB, ArmMBB->begin(), ArmMBB->end());
+    MBB->removeSuccessor(ArmMBB, true);
+    if (LandMBB && ArmMBB->succ_size() != 0)
+      ArmMBB->removeSuccessor(LandMBB, true);
+    retireBlock(ArmMBB);
+    MLI->removeBlock(ArmMBB);
+  };
+
+  if (TrueMBB)
+    AbsorbArm(TrueMBB);
+
+  if (FalseMBB) {
+    insertInstrBefore(I, AMDGPU::ELSE);
+    AbsorbArm(FalseMBB);
+  }
+  insertInstrBefore(I, AMDGPU::ENDIF);
+
+  BranchMI->eraseFromParent();
+
+  const bool HasBothArms = TrueMBB && FalseMBB;
+  if (LandMBB && HasBothArms)
+    MBB->addSuccessor(LandMBB);
+
+}
+
+/// Wraps the contents of DstBlk in a WHILELOOP ... ENDLOOP pair and replaces
+/// DstBlk's successor edge to itself with an edge to LandMBB.
+void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
+ MachineBasicBlock *LandMBB) {
+ DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+ << " land = BB" << LandMBB->getNumber() << "\n";);
+
+ // WHILELOOP goes to the front of the block, ENDLOOP to its end.
+ insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
+ insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
+ // The self edge (back edge) now leads to the landing block instead.
+ DstBlk->replaceSuccessor(DstBlk, LandMBB);
+}
+
+
+/// Rewrites the conditional exit branch of ExitingMBB into the structured
+/// sequence IF_PREDICATE_SET / BREAK / ENDIF and removes the CFG edge from
+/// ExitingMBB to LandMBB.
+///
+/// ExitingMBB is asserted to end in a conditional branch (as found by
+/// getLoopendBlockBranchInstr()).
+void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+ MachineBasicBlock *LandMBB) {
+ DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
+ << " land = BB" << LandMBB->getNumber() << "\n";);
+ MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
+ assert(BranchMI && isCondBranch(BranchMI));
+ DebugLoc DL = BranchMI->getDebugLoc();
+ MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
+ MachineBasicBlock::iterator I = BranchMI;
+ // If the taken target is not the landing block, the predicate is reversed
+ // so that the "true" case becomes the one that leaves the loop.
+ if (TrueBranch != LandMBB)
+ reversePredicateSetter(I, *I->getParent());
+ insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
+ insertInstrBefore(I, AMDGPU::BREAK);
+ insertInstrBefore(I, AMDGPU::ENDIF);
+ // now the branch instruction can be erased safely
+ BranchMI->eraseFromParent();
+ // now take care of successors: drop the edge to the landing block
+ ExitingMBB->removeSuccessor(LandMBB, true);
+}
+
+/// Rewrites the end of ContingMBB, a block that continues to ContMBB.
+///
+/// If getLoopendBlockBranchInstr() finds a (conditional) branch, it is
+/// replaced either by a single conditional-continue instruction, when the
+/// branch is the last instruction of the block, or by a conditional branch
+/// with CONTINUE and ENDIF appended to the end of the block. If no branch is
+/// found, a plain CONTINUE is appended.
+void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineBasicBlock *ContMBB) {
+  DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+               << ContingMBB->getNumber()
+               << ", cont = BB" << ContMBB->getNumber() << "\n";);
+
+  MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
+  if (!MI) {
+    // No branch instruction is left in the block. Append CONTINUE at the
+    // very end, so that phi-moves, if they exist, go before it, and tag it
+    // with the last debug location found in the block.
+    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
+                   getLastDebugLocInBB(ContingMBB));
+    return;
+  }
+
+  assert(isCondBranch(MI));
+  MachineBasicBlock::iterator I = MI;
+  const bool TakenIsCont = getTrueBranch(MI) == ContMBB;
+  int OldOpcode = MI->getOpcode();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // A branch that is the last instruction can become one "continue if"
+  // instruction; otherwise an explicit if / continue / endif is built.
+  const bool BranchIsLastInstr = (&*ContingMBB->rbegin()) == MI;
+  if (BranchIsLastInstr) {
+    int ContinueOpcode = TakenIsCont ? getContinueNzeroOpcode(OldOpcode)
+                                     : getContinueZeroOpcode(OldOpcode);
+    insertCondBranchBefore(I, ContinueOpcode, DL);
+  } else {
+    int BranchOpcode = TakenIsCont ? getBranchNzeroOpcode(OldOpcode)
+                                   : getBranchZeroOpcode(OldOpcode);
+    insertCondBranchBefore(I, BranchOpcode, DL);
+    // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
+    insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
+  }
+
+  MI->eraseFromParent();
+}
+
+/// Follows the single-successor chain that starts at SrcMBB until DstMBB (or
+/// a null block) is reached. Every block on the way that has more than one
+/// predecessor is replaced, for the path being followed, by a clone made
+/// with cloneBlockForPredecessor().
+///
+/// \param PreMBB block whose successor SrcMBB is (asserted).
+/// \returns the number of blocks cloned.
+int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
+  assert(PreMBB->isSuccessor(SrcMBB));
+
+  int NumCloned = 0;
+  MachineBasicBlock *Pred = PreMBB;
+  MachineBasicBlock *Cur = SrcMBB;
+  while (Cur && Cur != DstMBB) {
+    assert(Cur->succ_size() == 1);
+    const bool HasSideEntry = Cur->pred_size() > 1;
+    if (HasSideEntry) {
+      // Continue the walk on the private copy.
+      Cur = cloneBlockForPredecessor(Cur, Pred);
+      ++NumCloned;
+    }
+
+    Pred = Cur;
+    Cur = *Cur->succ_begin();
+  }
+
+  return NumCloned;
+}
+
+/// Gives PredMBB its own copy of MBB.
+///
+/// MBB is cloned with clone(), replaceInstrUseOfBlockWith() is called for
+/// PredMBB with MBB as the old and the clone as the new block, PredMBB's
+/// successor edge to MBB is redirected to the clone, and the clone receives
+/// MBB's successor list. numClonedInstr is increased by MBB's size.
+///
+/// \param MBB     block to duplicate; asserted to be a successor of PredMBB.
+/// \param PredMBB predecessor that will use the clone instead of MBB.
+/// \returns the newly created clone.
+MachineBasicBlock *
+AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
+    MachineBasicBlock *PredMBB) {
+  // The previous message was misspelled and named the relationship the
+  // wrong way round; state what is actually checked.
+  assert(PredMBB->isSuccessor(MBB) &&
+         "MBB is not a successor of PredMBB");
+
+  MachineBasicBlock *CloneMBB = clone(MBB); // clone instructions
+  // Arguments: srcBlk, oldBlk, newBlk.
+  replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
+
+  PredMBB->replaceSuccessor(MBB, CloneMBB);
+
+  // add all successors to cloneBlk
+  cloneSuccessorList(CloneMBB, MBB);
+
+  numClonedInstr += MBB->size();
+
+  // A space now separates the block number from "size" in the output.
+  DEBUG(
+    dbgs() << "Cloned block: " << "BB"
+           << MBB->getNumber() << " size " << MBB->size() << "\n";
+  );
+
+  SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
+
+  return CloneMBB;
+}
+
+/// Moves the instructions of SrcMBB into DstMBB, inserting them before I.
+///
+/// If getNormalBlockBranchInstr() finds a branch in SrcMBB, only the
+/// instructions in front of that branch are moved; otherwise the whole block
+/// content is moved.
+void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
+    MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
+  // look for the input branch instr, not the AMDGPU branch instr
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
+
+  MachineBasicBlock::iterator SpliceEnd;
+  if (BranchMI) {
+    DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
+    SpliceEnd = BranchMI;
+  } else {
+    DEBUG(
+      dbgs() << "migrateInstruction don't see branch instr\n" ;
+    );
+    SpliceEnd = SrcMBB->end();
+  }
+
+  DEBUG(
+    dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
+           << "srcSize = " << SrcMBB->size() << "\n";
+  );
+
+  // splice inserts [begin, SpliceEnd) before the insert position I
+  DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
+
+  DEBUG(
+    dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
+           << "srcSize = " << SrcMBB->size() << '\n';
+  );
+}
+
+/// Handles a loop whose latch ends in an unconditional branch (an infinite
+/// loop).
+///
+/// Returns nullptr right away when the loop has no header or no single
+/// latch, or when the latch does not end in an unconditional branch.
+/// Otherwise a dummy exit block is created and appended to the function, and
+/// an error ("Extra register needed to handle CFG") is emitted on the
+/// LLVMContext.
+///
+/// \returns always nullptr.
+MachineBasicBlock *
+AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
+  if (!(LoopHeader && LoopLatch))
+    return nullptr;
+
+  // Is LoopRep an infinite loop ?
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
+  const bool EndsInUncondBranch = BranchMI && isUncondBranch(BranchMI);
+  if (!EndsInUncondBranch)
+    return nullptr;
+
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk); // insert into the function
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+  DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+
+  LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext();
+  Ctx.emitError("Extra register needed to handle CFG");
+  return nullptr;
+}
+
+/// Erases unconditional branches from MBB for as long as
+/// getLoopendBlockBranchInstr() keeps returning one.
+void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
+  // I saw two unconditional branches in one basic block in example
+  // test_fc_do_while_or.c; need to fix the upstream on this to remove the
+  // loop.
+  for (;;) {
+    MachineInstr *BranchMI = getLoopendBlockBranchInstr(MBB);
+    if (!BranchMI || !isUncondBranch(BranchMI))
+      return;
+    DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
+    BranchMI->eraseFromParent();
+  }
+}
+
+/// If MBB has exactly two successor entries and both name the same block,
+/// erases MBB's conditional branch and removes one of the two duplicate
+/// successor entries. Does nothing otherwise.
+void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
+    MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 2)
+    return;
+
+  auto FirstSucc = MBB->succ_begin();
+  MachineBasicBlock *MBB1 = *FirstSucc;
+  if (MBB1 != *std::next(FirstSucc))
+    return;
+
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
+  BranchMI->eraseFromParent();
+  SHOWNEWBLK(MBB1, "Removing redundant successor");
+  MBB->removeSuccessor(MBB1, true);
+}
+
+/// Creates one new block containing a RETURN, appends it to the function and
+/// makes it a successor of every block in RetMBB. The instruction found by
+/// getReturnInstr() in each of those blocks, if any, is erased.
+void AMDGPUCFGStructurizer::addDummyExitBlock(
+    SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk); // insert into the function
+  insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
+
+  for (MachineBasicBlock *MBB : RetMBB) {
+    if (MachineInstr *MI = getReturnInstr(MBB))
+      MI->eraseFromParent();
+    MBB->addSuccessor(DummyExitBlk);
+    DEBUG(
+      dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
+             << " successors\n";
+    );
+  }
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
+}
+
+/// Removes every successor edge of MBB, one at a time from the front.
+void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
+  while (!MBB->succ_empty()) {
+    MachineBasicBlock *FirstSucc = *MBB->succ_begin();
+    MBB->removeSuccessor(FirstSucc);
+  }
+}
+
+/// Stores SccNum in the BlockInformation record of MBB, allocating the
+/// record in BlockInfoMap first if the block does not have one yet.
+void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
+    int SccNum) {
+  BlockInformation *&Info = BlockInfoMap[MBB];
+  if (Info == nullptr)
+    Info = new BlockInformation();
+  Info->SccNum = SccNum;
+}
+
+/// Marks MBB as retired in its BlockInformation record, allocating the
+/// record in BlockInfoMap first if needed. The block is asserted to have
+/// neither successors nor predecessors.
+void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
+  DEBUG(
+    dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
+  );
+
+  BlockInformation *&Info = BlockInfoMap[MBB];
+  if (Info == nullptr)
+    Info = new BlockInformation();
+  Info->IsRetired = true;
+
+  assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
+         && "can't retire block yet");
+}
+
+char AMDGPUCFGStructurizer::ID = 0;
+
+} // end anonymous namespace
+
+
+// Pass registration under the name "amdgpustructurizer". The pass declares
+// dependencies on MachineDominatorTree, MachinePostDominatorTree and
+// MachineLoopInfo.
+INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
+ "AMDGPU CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
+ "AMDGPU CFG Structurizer", false, false)
+
+/// Factory: returns a newly allocated AMDGPUCFGStructurizer pass. The caller
+/// receives the raw pointer returned by new.
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
+ return new AMDGPUCFGStructurizer();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
new file mode 100644
index 000000000000..5d243e949fd3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -0,0 +1,658 @@
+//===-- AMDKernelCodeT.h - AMD kernel code definitions ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file AMDKernelCodeT.h
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDKERNELCODET_H
+#define AMDKERNELCODET_H
+
+#include "llvm/MC/SubtargetFeature.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include "llvm/Support/Debug.h"
+//---------------------------------------------------------------------------//
+// AMD Kernel Code, and its dependencies //
+//---------------------------------------------------------------------------//
+
+// Fixed-width integer aliases used by the declarations in this header. The
+// numeric suffix of each name matches the bit width of its underlying type.
+typedef uint8_t hsa_powertwo8_t;
+typedef uint32_t hsa_ext_code_kind_t;
+typedef uint8_t hsa_ext_brig_profile8_t;
+typedef uint8_t hsa_ext_brig_machine_model8_t;
+typedef uint64_t hsa_ext_control_directive_present64_t;
+typedef uint16_t hsa_ext_exception_kind16_t;
+typedef uint32_t hsa_ext_code_kind32_t;
+
+/// Triple of 32-bit unsigned values with components x, y and z.
+typedef struct hsa_dim3_s {
+ uint32_t x;
+ uint32_t y;
+ uint32_t z;
+} hsa_dim3_t;
+
+/// The version of the amd_*_code_t struct. Minor versions must be
+/// backward compatible. The version defined here is 0.1 (major 0, minor 1).
+typedef uint32_t amd_code_version32_t;
+enum amd_code_version_t {
+ AMD_CODE_VERSION_MAJOR = 0,
+ AMD_CODE_VERSION_MINOR = 1
+};
+
+// Sets val bits for specified mask in specified dst packed instance.
+//
+// `mask` must be the bare name of a mask constant that has a matching
+// `<mask>_SHIFT` constant, because the shift name is formed by token pasting.
+// The field is cleared in dst and then set to (val << shift), truncated to
+// the mask. dst is evaluated twice.
+//
+// The body is wrapped in do { } while (0) so that both assignments form one
+// statement, for example when the macro is the body of an unbraced if/else.
+#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
+  do {                                                                         \
+    (dst) &= (~(1 << mask ## _SHIFT) & ~(mask));                               \
+    (dst) |= (((val) << mask ## _SHIFT) & (mask));                             \
+  } while (0)
+
+// Gets bits for specified mask from specified src packed instance: the field
+// is masked out of src and shifted down by `<mask>_SHIFT`.
+#define AMD_HSA_BITS_GET(src, mask)                                            \
+  (((src) & (mask)) >> mask ## _SHIFT)
+
+/// The values used to define the number of bytes to use for the
+/// swizzle element size. The encoded values 0..3 stand for element sizes of
+/// 2, 4, 8 and 16 bytes respectively.
+enum amd_element_byte_size_t {
+ AMD_ELEMENT_2_BYTES = 0,
+ AMD_ELEMENT_4_BYTES = 1,
+ AMD_ELEMENT_8_BYTES = 2,
+ AMD_ELEMENT_16_BYTES = 3
+};
+
+/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+/// COMPUTE_PGM_RSRC2 registers.
+typedef uint64_t amd_compute_pgm_resource_register64_t;
+
+/// Every amd_*_code_t has the following properties, which are composed of
+/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
+/// bit width (AMD_CODE_PROPERTY_*_WIDTH), and bit shift amount
+/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
+///
+/// (Note that bit fields cannot be used as their layout is
+/// implementation defined in the C standard and so cannot be used to
+/// specify an ABI)
+typedef uint32_t amd_code_property32_t;
+enum amd_code_property_mask_t {
+
+ /// Enable the setup of the SGPR user data registers
+ /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
+ /// for initial register state.
+ ///
+ /// The total number of SGPR user data registers requested must not
+ /// exceed 16. Any requests beyond 16 will be ignored.
+ ///
+ /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
+ /// SGPR user data registers enabled up to 16).
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
+
+ AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
+ AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
+ AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT,
+
+ /// Control wave ID base counter for GDS ordered-append. Used to set
+ /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
+ /// ORDERED_APPEND_MODE also needs to be settable)
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
+
+ /// The interleave (swizzle) element size in bytes required by the
+ /// code for private memory. This must be 2, 4, 8 or 16. This value
+ /// is provided to the finalizer when it is invoked and is recorded
+ /// here. The hardware will interleave the memory requests of each
+ /// lane of a wavefront by this element size to ensure each
+ /// work-item gets a distinct memory location. Therefore, the
+ /// finalizer ensures that all load and store operations done to
+ /// private memory do not exceed this size. For example, if the
+ /// element size is 4 (32-bits or dword) and a 64-bit value must be
+ /// loaded, the finalizer will generate two 32-bit loads. This
+ /// ensures that the interleaving will get the work-item
+ /// specific dword for both halves of the 64-bit value. If it just
+ /// did a 64-bit load then it would get one dword which belonged to
+ /// its own work-item, but the second dword would belong to the
+ /// adjacent lane work-item since the interleaving is in dwords.
+ ///
+ /// The value used must match the value that the runtime configures
+ /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
+ /// is generally DWORD.
+ ///
+ /// Use values from the amd_element_byte_size_t enum.
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
+
+ /// Are global memory addresses 64 bits. Must match
+ /// amd_kernel_code_t.hsail_machine_model ==
+ /// HSA_MACHINE_LARGE. Must also match
+ /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
+ /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
+ AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
+ AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
+
+ /// Indicate if the generated ISA is using a dynamically sized call
+ /// stack. This can happen if calls are implemented using a call
+ /// stack and recursion, alloca or calls to indirect functions are
+ /// present. In these cases the Finalizer cannot compute the total
+ /// private segment size at compile time. In this case the
+ /// workitem_private_segment_byte_size only specifies the statically
+ /// known private segment size, and additional space must be added
+ /// for the call stack.
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
+
+ /// Indicate if code generated has support for debugging.
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,
+
+ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
+ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,
+
+ AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
+ AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
+ AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
+};
+
+/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
+/// control directives. These control how the finalizer generates code. This
+/// struct is used both as an argument to hsaFinalizeKernel to specify values for
+/// the control directives, and is used in HsaKernelCode to record the values of
+/// the control directives that the finalizer used when generating the code which
+/// either came from the finalizer argument or explicit HSAIL control
+/// directives. See the definition of the control directives in HSA Programmer's
+/// Reference Manual which also defines how the values specified as finalizer
+/// arguments have to agree with the control directives in the HSAIL code.
+typedef struct hsa_ext_control_directives_s {
+ /// This is a bit set indicating which control directives have been
+ /// specified. If the value is 0 then there are no control directives specified
+ /// and the rest of the fields can be ignored. The bits are accessed using the
+ /// hsa_ext_control_directives_present_mask_t. Any control directive that is not
+ /// enabled in this bit set must have the value of all 0s.
+ hsa_ext_control_directive_present64_t enabled_control_directives;
+
+ /// If enableBreakExceptions is not enabled then must be 0, otherwise must be
+ /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK
+ /// policy enabled. If this set is not empty then the generated code may have
+ /// lower performance than if the set is empty. If the kernel being finalized
+ /// has any enablebreakexceptions control directives, then the values specified
+ /// by this argument are unioned with the values in these control
+ /// directives. If any of the functions the kernel calls have an
+ /// enablebreakexceptions control directive, then they must be equal or a
+ /// subset of, this union.
+ hsa_ext_exception_kind16_t enable_break_exceptions;
+
+ /// If enableDetectExceptions is not enabled then must be 0, otherwise must be
+ /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT
+ /// policy enabled. If this set is not empty then the generated code may have
+ /// lower performance than if the set is empty. However, an implementation
+ /// should endeavour to make the performance impact small. If the kernel being
+ /// finalized has any enabledetectexceptions control directives, then the
+ /// values specified by this argument are unioned with the values in these
+ /// control directives. If any of the functions the kernel calls have an
+ /// enabledetectexceptions control directive, then they must be equal or a
+ /// subset of, this union.
+ hsa_ext_exception_kind16_t enable_detect_exceptions;
+
+ /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of
+ /// dynamic group segment can be allocated for a dispatch, otherwise the value
+ /// specifies the maximum number of bytes of dynamic group segment that can be
+ /// allocated for a dispatch. If the kernel being finalized has any
+ /// maxdynamicsize control directives, then the values must be the same, and
+ /// must be the same as this argument if it is enabled. This value can be used
+ /// by the finalizer to determine the maximum number of bytes of group memory
+ /// used by each work-group by adding this value to the group memory required
+ /// for all group segment variables used by the kernel and all functions it
+ /// calls, and group memory used to implement other HSAIL features such as
+ /// fbarriers and the detect exception operations. This can allow the finalizer
+ /// to determine the expected number of work-groups that can be executed by a
+ /// compute unit and allow more resources to be allocated to the work-items if
+ /// it is known that fewer work-groups can be executed due to group memory
+ /// limitations.
+ uint32_t max_dynamic_group_size;
+
+ /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater
+ /// than 0. See HSA Programmer's Reference Manual description of
+ /// maxflatgridsize control directive.
+ uint32_t max_flat_grid_size;
+
+ /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be
+ /// greater than 0. See HSA Programmer's Reference Manual description of
+ /// maxflatworkgroupsize control directive.
+ uint32_t max_flat_workgroup_size;
+
+ /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the
+ /// finalizer is free to generate ISA that may result in any number of
+ /// work-groups executing on a single compute unit. Otherwise, the finalizer
+ /// should attempt to generate ISA that will allow the specified number of
+ /// work-groups to execute on a single compute unit. This is only a hint and
+ /// can be ignored by the finalizer. If the kernel being finalized, or any of
+ /// the functions it calls, has a requested control directive, then the values
+ /// must be the same. This can be used to determine the number of resources
+ /// that should be allocated to a single work-group and work-item. For example,
+ /// a low value may allow more resources to be allocated, resulting in higher
+ /// per work-item performance, as it is known there will never be more than the
+ /// specified number of work-groups actually executing on the compute
+ /// unit. Conversely, a high value may allocate fewer resources, resulting in
+ /// lower per work-item performance, which is offset by the fact it allows more
+ /// work-groups to actually execute on the compute unit.
+ uint32_t requested_workgroups_per_cu;
+
+ /// If not enabled then all elements for Dim3 must be 0, otherwise every
+ /// element must be greater than 0. See HSA Programmer's Reference Manual
+ /// description of requiredgridsize control directive.
+ hsa_dim3_t required_grid_size;
+
+ /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be
+ /// 0, and the produced code can be dispatched with any legal work-group range
+ /// consistent with the dispatch dimensions. Otherwise, the code produced must
+ /// always be dispatched with the specified work-group range. No element of the
+ /// specified range must be 0. It must be consistent with required_dimensions
+ /// and max_flat_workgroup_size. If the kernel being finalized, or any of the
+ /// functions it calls, has a requiredworkgroupsize control directive, then the
+ /// values must be the same. Specifying a value can allow the finalizer to
+ /// optimize work-group id operations, and if the number of work-items in the
+ /// work-group is less than the WAVESIZE then barrier operations can be
+ /// optimized to just a memory fence.
+ hsa_dim3_t required_workgroup_size;
+
+ /// If requiredDim is not enabled then must be 0 and the produced kernel code
+ /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is
+ /// 1..3 and the code produced must only be dispatched with a dimension that
+ /// matches. Other values are illegal. If the kernel being finalized, or any of
+ /// the functions it calls, has a requireddimsize control directive, then the
+ /// values must be the same. This can be used to optimize the code generated to
+ /// compute the absolute and flat work-group and work-item id, and the dim
+ /// HSAIL operations.
+ uint8_t required_dim;
+
+ /// Reserved. Must be 0.
+ uint8_t reserved[75];
+} hsa_ext_control_directives_t;
+
+/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
+/// Code Object to set up the hardware to execute the kernel dispatch.
+///
+/// Initial Kernel Register State.
+///
+/// Initial kernel register state will be set up by CP/SPI prior to the start
+/// of execution of every wavefront. This is limited by the constraints of the
+/// current hardware.
+///
+/// The order of the SGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enable_sgpr_* bit fields. The register numbers used for enabled registers
+/// are dense starting at SGPR0: the first enabled register is SGPR0, the next
+/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR
+/// number.
+///
+/// The initial SGPRs comprise up to 16 User SGPRs that are set up by CP and
+/// apply to all waves of the grid. It is possible to specify more than 16 User
+/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16
+/// are actually initialized. These are then immediately followed by the System
+/// SGPRs that are set up by ADC/SPI and can have different values for each wave
+/// of the grid dispatch.
+///
+/// SGPR register initial state is defined as follows:
+///
+/// Private Segment Buffer (enable_sgpr_private_segment_buffer):
+/// Number of User SGPR registers: 4. V# that can be used, together with
+/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg
+/// segments using a segment address. It must be set as follows:
+/// - Base address: of the scratch memory area used by the dispatch. It
+/// does not include the scratch wave offset. It will be the per process
+/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for
+/// example there may be a per pipe offset, or per AQL Queue offset).
+/// - Stride + data_format: Element Size * Index Stride (???)
+/// - Cache swizzle: ???
+/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for
+/// scratch)
+/// - Num records: Flat Scratch Work Item Size / Element Size (???)
+/// - Dst_sel_*: ???
+/// - Num_format: ???
+/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must
+/// agree with amd_kernel_code_t.privateElementSize)
+/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must
+/// be number of wavefront lanes for scratch, must agree with
+/// amd_kernel_code_t.wavefrontSize)
+/// - Add tid enable: 1
+/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
+/// - Hash_enable: ???
+/// - Heap: ???
+/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
+/// - Type: 0 (a buffer) (???)
+///
+/// Dispatch Ptr (enable_sgpr_dispatch_ptr):
+/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet
+/// for kernel actually executing.
+///
+/// Queue Ptr (enable_sgpr_queue_ptr):
+/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for
+/// AQL queue on which the dispatch packet was queued.
+///
+/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr):
+/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This
+/// is directly copied from the kernargPtr in the dispatch packet. Having CP
+/// load it once avoids loading it at the beginning of every wavefront.
+///
+/// Dispatch Id (enable_sgpr_dispatch_id):
+/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
+/// packet being executed.
+///
+/// Flat Scratch Init (enable_sgpr_flat_scratch_init):
+/// Number of User SGPR registers: 2. This is 2 SGPRs.
+///
+/// For CI/VI:
+/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
+/// to base of memory for scratch for this dispatch. This is the same offset
+/// used in computing the Scratch Segment Buffer base address. The value of
+/// Scratch Wave Offset must be added by the kernel code and moved to
+/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
+///
+/// The second SGPR is 32 bit byte size of a single work-item's scratch
+/// memory usage. This is directly loaded from the dispatch packet Private
+/// Segment Byte Size and rounded up to a multiple of DWORD.
+///
+/// \todo [Does CP need to round this to >4 byte alignment?]
+///
+/// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in
+/// flat memory instructions. Having CP load it once avoids loading it at
+/// the beginning of every wavefront.
+///
+/// For PI:
+/// This is the 64 bit base address of the scratch backing memory
+/// allocated by CP for this dispatch.
+///
+/// Private Segment Size (enable_sgpr_private_segment_size):
+/// Number of User SGPR registers: 1. The 32 bit byte size of a single
+/// work-item's scratch memory allocation. This is the value from the dispatch
+/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.
+///
+/// \todo [Does CP need to round this to >4 byte alignment?]
+///
+/// Having CP load it once avoids loading it at the beginning of every
+/// wavefront.
+///
+/// \todo [This will not be used for CI/VI since it is the same value as
+/// the second SGPR of Flat Scratch Init. However, it is needed for PI which
+/// changes the meaning of Flat Scratch Init.]
+///
+/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
+/// Number of User SGPR registers: 1. 32 bit count of the number of
+/// work-groups in the X dimension for the grid being executed. Computed from
+/// the fields in the HsaDispatchPacket as
+/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
+///
+/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
+/// Number of User SGPR registers: 1. 32 bit count of the number of
+/// work-groups in the Y dimension for the grid being executed. Computed from
+/// the fields in the HsaDispatchPacket as
+/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
+///
+/// Only initialized if <16 previous SGPRs initialized.
+///
+/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
+/// Number of User SGPR registers: 1. 32 bit count of the number of
+/// work-groups in the Z dimension for the grid being executed. Computed
+/// from the fields in the HsaDispatchPacket as
+/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
+///
+/// Only initialized if <16 previous SGPRs initialized.
+///
+/// Work-Group Id X (enable_sgpr_workgroup_id_x):
+/// Number of System SGPR registers: 1. 32 bit work group id in X dimension
+/// of grid for wavefront. Always present.
+///
+/// Work-Group Id Y (enable_sgpr_workgroup_id_y):
+/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension
+/// of grid for wavefront.
+///
+/// Work-Group Id Z (enable_sgpr_workgroup_id_z):
+/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension
+/// of grid for wavefront. If present then Work-group Id Y will also be
+/// present
+///
+/// Work-Group Info (enable_sgpr_workgroup_info):
+/// Number of System SGPR registers: 1. {first_wave, 14'b0000,
+/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
+///
+/// Private Segment Wave Byte Offset
+/// (enable_sgpr_private_segment_wave_byte_offset):
+/// Number of System SGPR registers: 1. 32 bit byte offset from base of
+/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg
+/// segment address when using Scratch Segment Buffer. It must be added to
+/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
+///
+///
+/// The order of the VGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enableVgpr* bit fields. The register numbers used for enabled registers
+/// are dense starting at VGPR0: the first enabled register is VGPR0, the next
+/// enabled register is VGPR1 etc.; disabled registers do not have a VGPR
+/// number.
+///
+/// VGPR register initial state is defined as follows:
+///
+/// Work-Item Id X (always initialized):
+/// Number of registers: 1. 32 bit work item id in X dimension of work-group
+/// for wavefront lane.
+///
+/// Work-Item Id Y (enable_vgpr_workitem_id > 0):
+/// Number of registers: 1. 32 bit work item id in Y dimension of work-group
+/// for wavefront lane.
+///
+/// Work-Item Id Z (enable_vgpr_workitem_id > 1):
+/// Number of registers: 1. 32 bit work item id in Z dimension of work-group
+/// for wavefront lane.
+///
+///
+/// The setting of registers is being done by existing GPU hardware as follows:
+/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
+/// registers.
+/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any
+/// combination including none.
+/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot
+/// be added into the value Flat Scratch Offset which would avoid the
+/// Finalizer generated prolog having to do the add.
+/// 4) The VGPRs are set by SPI which only supports specifying either (X),
+/// (X, Y) or (X, Y, Z).
+///
+/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so
+/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and
+/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
+///
+/// The global segment can be accessed either using flat operations or buffer
+/// operations. If buffer operations are used then the Global Buffer used to
+/// access HSAIL Global/Readonly/Kernarg (which are combined) segments using a
+/// segment address is not passed into the kernel code by CP since its base
+/// address is always 0. Instead the Finalizer generates prolog code to
+/// initialize 4 SGPRs with a V# that has the following properties, and then
+/// uses that in the buffer instructions:
+/// - base address of 0
+/// - no swizzle
+/// - ATC=1
+/// - MTYPE set to support memory coherence specified in
+/// amd_kernel_code_t.globalMemoryCoherence
+///
+/// When the Global Buffer is used to access the Kernarg segment, must add the
+/// dispatch packet kernArgPtr to a kernarg segment address before using this V#.
+/// Alternatively scalar loads can be used if the kernarg offset is uniform, as
+/// the kernarg segment is constant for the duration of the kernel execution.
+///
+
+typedef struct amd_kernel_code_s {
+ uint32_t amd_kernel_code_version_major;
+ uint32_t amd_kernel_code_version_minor;
+ uint16_t amd_machine_kind;
+ uint16_t amd_machine_version_major;
+ uint16_t amd_machine_version_minor;
+ uint16_t amd_machine_version_stepping;
+
+ /// Byte offset (possibly negative) from start of amd_kernel_code_t
+ /// object to kernel's entry point instruction. The actual code for
+ /// the kernel is required to be 256 byte aligned to match hardware
+ /// requirements (SQ cache line is 16). The code must be position
+ /// independent code (PIC) for AMD devices to give runtime the
+ /// option of copying code to discrete GPU memory or APU L2
+ /// cache. The Finalizer should endeavour to allocate all kernel
+ /// machine code in contiguous memory pages so that a device
+ /// pre-fetcher will tend to only pre-fetch Kernel Code objects,
+ /// improving cache performance.
+ int64_t kernel_code_entry_byte_offset;
+
+ /// Range of bytes to consider prefetching expressed as an offset
+ /// and size. The offset is from the start (possibly negative) of
+ /// amd_kernel_code_t object. Set both to 0 if no prefetch
+ /// information is available.
+ int64_t kernel_code_prefetch_byte_offset;
+ uint64_t kernel_code_prefetch_byte_size;
+
+ /// Number of bytes of scratch backing memory required for full
+ /// occupancy of target chip. This takes into account the number of
+ /// bytes of scratch per work-item, the wavefront size, the maximum
+ /// number of wavefronts per CU, and the number of CUs. This is an
+ /// upper limit on scratch. If the grid being dispatched is small it
+ /// may only need less than this. If the kernel uses no scratch, or
+ /// the Finalizer has not computed this value, it must be 0.
+ uint64_t max_scratch_backing_memory_byte_size;
+
+ /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+ /// COMPUTE_PGM_RSRC2 registers.
+ uint64_t compute_pgm_resource_registers;
+
+ /// Code properties. See amd_code_property_mask_t for a full list of
+ /// properties.
+ uint32_t code_properties;
+
+ /// The amount of memory required for the combined private, spill
+ /// and arg segments for a work-item in bytes. If
+ /// is_dynamic_callstack is 1 then additional space must be added to
+ /// this value for the call stack.
+ uint32_t workitem_private_segment_byte_size;
+
+ /// The amount of group segment memory required by a work-group in
+ /// bytes. This does not include any dynamically allocated group
+ /// segment memory that may be added when the kernel is
+ /// dispatched.
+ uint32_t workgroup_group_segment_byte_size;
+
+ /// Number of bytes of GDS required by kernel dispatch. Must be 0 if
+ /// not using GDS.
+ uint32_t gds_segment_byte_size;
+
+ /// The size in bytes of the kernarg segment that holds the values
+ /// of the arguments to the kernel. This could be used by CP to
+ /// prefetch the kernarg segment pointed to by the dispatch packet.
+ uint64_t kernarg_segment_byte_size;
+
+ /// Number of fbarrier's used in the kernel and all functions it
+ /// calls. If the implementation uses group memory to allocate the
+ /// fbarriers then that amount must already be included in the
+ /// workgroup_group_segment_byte_size total.
+ uint32_t workgroup_fbarrier_count;
+
+ /// Number of scalar registers used by a wavefront. This includes
+ /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
+ /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a
+ /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+ uint16_t wavefront_sgpr_count;
+
+ /// Number of vector registers used by each work-item. Used to set
+ /// COMPUTE_PGM_RSRC1.VGPRS.
+ uint16_t workitem_vgpr_count;
+
+ /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
+ /// first fixed VGPR number reserved.
+ uint16_t reserved_vgpr_first;
+
+ /// The number of consecutive VGPRs reserved by the client. If
+ /// is_debug_supported then this count includes VGPRs reserved
+ /// for debugger use.
+ uint16_t reserved_vgpr_count;
+
+ /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
+ /// first fixed SGPR number reserved.
+ uint16_t reserved_sgpr_first;
+
+ /// The number of consecutive SGPRs reserved by the client. If
+ /// is_debug_supported then this count includes SGPRs reserved
+ /// for debugger use.
+ uint16_t reserved_sgpr_count;
+
+ /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+ /// fixed SGPR number used to hold the wave scratch offset for the
+ /// entire kernel execution, or uint16_t(-1) if the register is not
+ /// used or not known.
+ uint16_t debug_wavefront_private_segment_offset_sgpr;
+
+ /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+ /// fixed SGPR number of the first of 4 SGPRs used to hold the
+ /// scratch V# used for the entire kernel execution, or uint16_t(-1)
+ /// if the registers are not used or not known.
+ uint16_t debug_private_segment_buffer_sgpr;
+
+ /// The maximum byte alignment of variables used by the kernel in
+ /// the specified memory segment. Expressed as a power of two. Must
+ /// be at least HSA_POWERTWO_16.
+ uint8_t kernarg_segment_alignment;
+ uint8_t group_segment_alignment;
+ uint8_t private_segment_alignment;
+
+ /// Wavefront size expressed as a power of two. Must be a power of 2
+ /// in range 1..64 inclusive. Used to support runtime query that
+ /// obtains wavefront size, which may be used by application to
+ /// allocate dynamic group memory and set the dispatch work-group
+ /// size.
+ uint8_t wavefront_size;
+
+ int32_t call_convention;
+ uint8_t reserved3[12];
+ uint64_t runtime_loader_kernel_symbol;
+ uint64_t control_directives[16];
+} amd_kernel_code_t;
+
+#endif // AMDKERNELCODET_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
new file mode 100644
index 000000000000..a6c31629e7c4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -0,0 +1,3599 @@
+//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDKernelCodeT.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDKernelCodeTUtils.h"
+#include "Utils/AMDGPUAsmUtils.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+namespace {
+
+class AMDGPUAsmParser;
+
+enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL };
+
+//===----------------------------------------------------------------------===//
+// Operand
+//===----------------------------------------------------------------------===//
+
+class AMDGPUOperand : public MCParsedAsmOperand {
+ enum KindTy {
+ Token,
+ Immediate,
+ Register,
+ Expression
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+ const AMDGPUAsmParser *AsmParser;
+
+public:
+ AMDGPUOperand(enum KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
+ : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {}
+
+ typedef std::unique_ptr<AMDGPUOperand> Ptr;
+
+ struct Modifiers { // Per-operand source modifier flags; all default to off.
+ bool Abs = false;
+ bool Neg = false;
+ bool Sext = false;
+
+ bool hasFPModifiers() const { return Abs || Neg; } // Abs and Neg are the FP modifiers.
+ bool hasIntModifiers() const { return Sext; } // Sext is the only integer modifier.
+ bool hasModifiers() const { return hasFPModifiers() || hasIntModifiers(); }
+
+ int64_t getFPModifiersOperand() const { // ORs SISrcMods::ABS / SISrcMods::NEG for the set flags.
+ int64_t Operand = 0;
+ Operand |= Abs ? SISrcMods::ABS : 0;
+ Operand |= Neg ? SISrcMods::NEG : 0;
+ return Operand;
+ }
+
+ int64_t getIntModifiersOperand() const { // Yields SISrcMods::SEXT when Sext is set, else 0.
+ int64_t Operand = 0;
+ Operand |= Sext ? SISrcMods::SEXT : 0;
+ return Operand;
+ }
+
+ int64_t getModifiersOperand() const { // FP encoding, else integer encoding, else 0.
+ assert(!(hasFPModifiers() && hasIntModifiers())
+ && "fp and int modifiers should not be used simultaneously");
+ if (hasFPModifiers()) {
+ return getFPModifiersOperand();
+ } else if (hasIntModifiers()) {
+ return getIntModifiersOperand();
+ } else {
+ return 0;
+ }
+ }
+
+ friend raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods);
+ };
+
+ enum ImmTy {
+ ImmTyNone,
+ ImmTyGDS,
+ ImmTyOffen,
+ ImmTyIdxen,
+ ImmTyAddr64,
+ ImmTyOffset,
+ ImmTyOffset0,
+ ImmTyOffset1,
+ ImmTyGLC,
+ ImmTySLC,
+ ImmTyTFE,
+ ImmTyClampSI,
+ ImmTyOModSI,
+ ImmTyDppCtrl,
+ ImmTyDppRowMask,
+ ImmTyDppBankMask,
+ ImmTyDppBoundCtrl,
+ ImmTySdwaDstSel,
+ ImmTySdwaSrc0Sel,
+ ImmTySdwaSrc1Sel,
+ ImmTySdwaDstUnused,
+ ImmTyDMask,
+ ImmTyUNorm,
+ ImmTyDA,
+ ImmTyR128,
+ ImmTyLWE,
+ ImmTyExpTgt,
+ ImmTyExpCompr,
+ ImmTyExpVM,
+ ImmTyHwreg,
+ ImmTyOff,
+ ImmTySendMsg,
+ ImmTyInterpSlot,
+ ImmTyInterpAttr,
+ ImmTyAttrChan
+ };
+
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct ImmOp {
+ int64_t Val;
+ ImmTy Type;
+ bool IsFPImm;
+ Modifiers Mods;
+ };
+
+ struct RegOp {
+ unsigned RegNo;
+ bool IsForcedVOP3;
+ Modifiers Mods;
+ };
+
+ union {
+ TokOp Tok;
+ ImmOp Imm;
+ RegOp Reg;
+ const MCExpr *Expr;
+ };
+
+ bool isToken() const override {
+ if (Kind == Token)
+ return true;
+
+ if (Kind != Expression || !Expr)
+ return false;
+
+ // When parsing operands, we can't always tell if something was meant to be
+ // a token, like 'gds', or an expression that references a global variable.
+ // In this case, we assume the string is an expression, and if we need to
+ // interpret it as a token, then we treat the symbol name as the token.
+ return isa<MCSymbolRefExpr>(Expr);
+ }
+
+ bool isImm() const override {
+ return Kind == Immediate;
+ }
+
+ bool isInlinableImm(MVT type) const;
+ bool isLiteralImm(MVT type) const;
+
+ bool isRegKind() const {
+ return Kind == Register;
+ }
+
+ bool isReg() const override {
+ return isRegKind() && !Reg.Mods.hasModifiers();
+ }
+
+ bool isRegOrImmWithInputMods(MVT type) const {
+ return isRegKind() || isInlinableImm(type);
+ }
+
+ bool isRegOrImmWithInt16InputMods() const {
+ return isRegOrImmWithInputMods(MVT::i16);
+ }
+
+ bool isRegOrImmWithInt32InputMods() const {
+ return isRegOrImmWithInputMods(MVT::i32);
+ }
+
+ bool isRegOrImmWithInt64InputMods() const {
+ return isRegOrImmWithInputMods(MVT::i64);
+ }
+
+ bool isRegOrImmWithFP16InputMods() const {
+ return isRegOrImmWithInputMods(MVT::f16);
+ }
+
+ bool isRegOrImmWithFP32InputMods() const {
+ return isRegOrImmWithInputMods(MVT::f32);
+ }
+
+ bool isRegOrImmWithFP64InputMods() const {
+ return isRegOrImmWithInputMods(MVT::f64);
+ }
+
+ bool isVReg32OrOff() const {
+ return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
+ }
+
+ bool isImmTy(ImmTy ImmT) const {
+ return isImm() && Imm.Type == ImmT;
+ }
+
+ bool isImmModifier() const {
+ return isImm() && Imm.Type != ImmTyNone;
+ }
+
+ bool isClampSI() const { return isImmTy(ImmTyClampSI); }
+ bool isOModSI() const { return isImmTy(ImmTyOModSI); }
+ bool isDMask() const { return isImmTy(ImmTyDMask); }
+ bool isUNorm() const { return isImmTy(ImmTyUNorm); }
+ bool isDA() const { return isImmTy(ImmTyDA); }
+ bool isR128() const { return isImmTy(ImmTyR128); }
+ bool isLWE() const { return isImmTy(ImmTyLWE); }
+ bool isOff() const { return isImmTy(ImmTyOff); }
+ bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
+ bool isExpVM() const { return isImmTy(ImmTyExpVM); }
+ bool isExpCompr() const { return isImmTy(ImmTyExpCompr); }
+ bool isOffen() const { return isImmTy(ImmTyOffen); }
+ bool isIdxen() const { return isImmTy(ImmTyIdxen); }
+ bool isAddr64() const { return isImmTy(ImmTyAddr64); }
+ bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
+ bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); } // 8-bit range, same as isOffset1.
+ bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
+ bool isGDS() const { return isImmTy(ImmTyGDS); }
+ bool isGLC() const { return isImmTy(ImmTyGLC); }
+ bool isSLC() const { return isImmTy(ImmTySLC); }
+ bool isTFE() const { return isImmTy(ImmTyTFE); }
+ bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
+ bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
+ bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
+ bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); }
+ bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); }
+ bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); }
+ bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); }
+ bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); }
+ bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); }
+ bool isAttrChan() const { return isImmTy(ImmTyAttrChan); }
+
+ bool isMod() const {
+ return isClampSI() || isOModSI();
+ }
+
+ bool isRegOrImm() const {
+ return isReg() || isImm();
+ }
+
+ bool isRegClass(unsigned RCID) const;
+
+ bool isSCSrcB16() const {
+ return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16);
+ }
+
+ bool isSCSrcB32() const {
+ return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32);
+ }
+
+ bool isSCSrcB64() const {
+ return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64);
+ }
+
+ bool isSCSrcF16() const {
+ return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16);
+ }
+
+ bool isSCSrcF32() const {
+ return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32);
+ }
+
+ bool isSCSrcF64() const {
+ return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::f64);
+ }
+
+ bool isSSrcB32() const {
+ return isSCSrcB32() || isLiteralImm(MVT::i32) || isExpr();
+ }
+
+ bool isSSrcB16() const {
+ return isSCSrcB16() || isLiteralImm(MVT::i16);
+ }
+
+ bool isSSrcB64() const {
+ // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
+ // See isVSrc64().
+ return isSCSrcB64() || isLiteralImm(MVT::i64);
+ }
+
+ bool isSSrcF32() const {
+ return isSCSrcB32() || isLiteralImm(MVT::f32) || isExpr();
+ }
+
+ bool isSSrcF64() const {
+ return isSCSrcB64() || isLiteralImm(MVT::f64);
+ }
+
+ bool isSSrcF16() const {
+ return isSCSrcB16() || isLiteralImm(MVT::f16);
+ }
+
+ bool isVCSrcB32() const {
+ return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32);
+ }
+
+ bool isVCSrcB64() const {
+ return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64);
+ }
+
+ bool isVCSrcB16() const {
+ return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16);
+ }
+
+ bool isVCSrcF32() const {
+ return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32);
+ }
+
+ bool isVCSrcF64() const {
+ return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64);
+ }
+
+ bool isVCSrcF16() const {
+ return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16);
+ }
+
+ bool isVSrcB32() const {
+ return isVCSrcF32() || isLiteralImm(MVT::i32);
+ }
+
+ bool isVSrcB64() const {
+ return isVCSrcF64() || isLiteralImm(MVT::i64);
+ }
+
+ bool isVSrcB16() const {
+ return isVCSrcF16() || isLiteralImm(MVT::i16);
+ }
+
+ bool isVSrcF32() const {
+ return isVCSrcF32() || isLiteralImm(MVT::f32);
+ }
+
+ bool isVSrcF64() const {
+ return isVCSrcF64() || isLiteralImm(MVT::f64);
+ }
+
+ bool isVSrcF16() const {
+ return isVCSrcF16() || isLiteralImm(MVT::f16);
+ }
+
+ bool isKImmFP32() const {
+ return isLiteralImm(MVT::f32);
+ }
+
+ bool isKImmFP16() const {
+ return isLiteralImm(MVT::f16);
+ }
+
+ bool isMem() const override {
+ return false;
+ }
+
+ bool isExpr() const {
+ return Kind == Expression;
+ }
+
+ bool isSoppBrTarget() const {
+ return isExpr() || isImm();
+ }
+
+ bool isSWaitCnt() const;
+ bool isHwreg() const;
+ bool isSendMsg() const;
+ bool isSMRDOffset8() const;
+ bool isSMRDOffset20() const;
+ bool isSMRDLiteralOffset() const;
+ bool isDPPCtrl() const;
+ bool isGPRIdxMode() const;
+
+ // Returns the name of the symbol referenced by this expression operand.
+ // Asserts that the operand is an expression; cast<> additionally requires
+ // Expr to be an MCSymbolRefExpr.
+ StringRef getExpressionAsToken() const {
+   assert(isExpr());
+   const auto *SymRef = cast<MCSymbolRefExpr>(Expr);
+   const MCSymbol &Sym = SymRef->getSymbol();
+   return Sym.getName();
+ }
+
+ // Returns the text of this operand when it is used as a token. For an
+ // Expression operand the referenced symbol's name is returned; otherwise
+ // the stored token characters (Tok.Data / Tok.Length) are returned.
+ StringRef getToken() const {
+   assert(isToken());
+   return Kind == Expression ? getExpressionAsToken()
+                             : StringRef(Tok.Data, Tok.Length);
+ }
+
+ int64_t getImm() const {
+ assert(isImm());
+ return Imm.Val;
+ }
+
+ enum ImmTy getImmTy() const {
+ assert(isImm());
+ return Imm.Type;
+ }
+
+ unsigned getReg() const override {
+ return Reg.RegNo;
+ }
+
+ SMLoc getStartLoc() const override {
+ return StartLoc;
+ }
+
+ SMLoc getEndLoc() const override {
+ return EndLoc;
+ }
+
+ // Returns the modifiers stored with this operand. Only register-kind
+ // operands and plain (ImmTyNone) immediates carry modifiers; anything else
+ // trips the assertion.
+ Modifiers getModifiers() const {
+   assert(isRegKind() || isImmTy(ImmTyNone));
+   if (isRegKind())
+     return Reg.Mods;
+   return Imm.Mods;
+ }
+
+ // Stores Mods on this operand: in Reg.Mods for a register-kind operand,
+ // in Imm.Mods for a plain (ImmTyNone) immediate. Other operands assert.
+ void setModifiers(Modifiers Mods) {
+   assert(isRegKind() || isImmTy(ImmTyNone));
+   Modifiers &Target = isRegKind() ? Reg.Mods : Imm.Mods;
+   Target = Mods;
+ }
+
+ bool hasModifiers() const {
+ return getModifiers().hasModifiers();
+ }
+
+ bool hasFPModifiers() const {
+ return getModifiers().hasFPModifiers();
+ }
+
+ bool hasIntModifiers() const {
+ return getModifiers().hasIntModifiers();
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const;
+
+ void addLiteralImmOperand(MCInst &Inst, int64_t Val) const;
+
+ template <unsigned Bitwidth>
+ void addKImmFPOperands(MCInst &Inst, unsigned N) const;
+
+ void addKImmFP16Operands(MCInst &Inst, unsigned N) const {
+ addKImmFPOperands<16>(Inst, N);
+ }
+
+ void addKImmFP32Operands(MCInst &Inst, unsigned N) const {
+ addKImmFPOperands<32>(Inst, N);
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const;
+
+ // Appends this operand to Inst as a register, an expression, or an
+ // immediate, checked in that order.
+ void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
+   if (isRegKind()) {
+     addRegOperands(Inst, N);
+     return;
+   }
+   if (isExpr()) {
+     Inst.addOperand(MCOperand::createExpr(Expr));
+     return;
+   }
+   addImmOperands(Inst, N);
+ }
+
+ // Appends two MC operands: first the encoded modifiers as an immediate,
+ // then the register or immediate itself. The immediate is added with
+ // ApplyModifiers=false because the modifiers are emitted as their own
+ // operand.
+ void addRegOrImmWithInputModsOperands(MCInst &Inst, unsigned N) const {
+   Modifiers SrcMods = getModifiers();
+   Inst.addOperand(MCOperand::createImm(SrcMods.getModifiersOperand()));
+   if (!isRegKind()) {
+     addImmOperands(Inst, N, /*ApplyModifiers=*/false);
+     return;
+   }
+   addRegOperands(Inst, N);
+ }
+
+ void addRegOrImmWithFPInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasIntModifiers());
+ addRegOrImmWithInputModsOperands(Inst, N);
+ }
+
+ void addRegOrImmWithIntInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasFPModifiers());
+ addRegOrImmWithInputModsOperands(Inst, N);
+ }
+
+ // Appends a SOPP branch target to Inst: an immediate goes through
+ // addImmOperands(); anything else must be an expression and is added as
+ // an expression operand.
+ void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
+   if (!isImm()) {
+     assert(isExpr());
+     Inst.addOperand(MCOperand::createExpr(Expr));
+     return;
+   }
+   addImmOperands(Inst, N);
+ }
+
+ // Writes the short name of an immediate type (the enumerator name without
+ // its "ImmTy" prefix) to OS. Nothing is written for a value that is not
+ // one of the listed enumerators.
+ static void printImmTy(raw_ostream& OS, ImmTy Type) {
+   const char *Name = nullptr;
+   switch (Type) {
+   case ImmTyNone:          Name = "None"; break;
+   case ImmTyGDS:           Name = "GDS"; break;
+   case ImmTyOffen:         Name = "Offen"; break;
+   case ImmTyIdxen:         Name = "Idxen"; break;
+   case ImmTyAddr64:        Name = "Addr64"; break;
+   case ImmTyOffset:        Name = "Offset"; break;
+   case ImmTyOffset0:       Name = "Offset0"; break;
+   case ImmTyOffset1:       Name = "Offset1"; break;
+   case ImmTyGLC:           Name = "GLC"; break;
+   case ImmTySLC:           Name = "SLC"; break;
+   case ImmTyTFE:           Name = "TFE"; break;
+   case ImmTyClampSI:       Name = "ClampSI"; break;
+   case ImmTyOModSI:        Name = "OModSI"; break;
+   case ImmTyDppCtrl:       Name = "DppCtrl"; break;
+   case ImmTyDppRowMask:    Name = "DppRowMask"; break;
+   case ImmTyDppBankMask:   Name = "DppBankMask"; break;
+   case ImmTyDppBoundCtrl:  Name = "DppBoundCtrl"; break;
+   case ImmTySdwaDstSel:    Name = "SdwaDstSel"; break;
+   case ImmTySdwaSrc0Sel:   Name = "SdwaSrc0Sel"; break;
+   case ImmTySdwaSrc1Sel:   Name = "SdwaSrc1Sel"; break;
+   case ImmTySdwaDstUnused: Name = "SdwaDstUnused"; break;
+   case ImmTyDMask:         Name = "DMask"; break;
+   case ImmTyUNorm:         Name = "UNorm"; break;
+   case ImmTyDA:            Name = "DA"; break;
+   case ImmTyR128:          Name = "R128"; break;
+   case ImmTyLWE:           Name = "LWE"; break;
+   case ImmTyOff:           Name = "Off"; break;
+   case ImmTyExpTgt:        Name = "ExpTgt"; break;
+   case ImmTyExpCompr:      Name = "ExpCompr"; break;
+   case ImmTyExpVM:         Name = "ExpVM"; break;
+   case ImmTyHwreg:         Name = "Hwreg"; break;
+   case ImmTySendMsg:       Name = "SendMsg"; break;
+   case ImmTyInterpSlot:    Name = "InterpSlot"; break;
+   case ImmTyInterpAttr:    Name = "InterpAttr"; break;
+   case ImmTyAttrChan:      Name = "AttrChan"; break;
+   }
+   if (Name)
+     OS << Name;
+ }
+
+ // Debug dump of the operand. The format depends on Kind:
+ //   Register:   <register N mods: ...>
+ //   Immediate:  <V [type: T] mods: ...>  (type shown unless ImmTyNone)
+ //   Token:      'text'
+ //   Expression: <expr ...>
+ void print(raw_ostream &OS) const override {
+   switch (Kind) {
+   case Token:
+     OS << '\'' << getToken() << '\'';
+     return;
+   case Expression:
+     OS << "<expr " << *Expr << '>';
+     return;
+   case Register:
+     OS << "<register " << getReg();
+     OS << " mods: " << Reg.Mods << '>';
+     return;
+   case Immediate: {
+     OS << '<' << getImm();
+     const ImmTy Ty = getImmTy();
+     if (Ty != ImmTyNone) {
+       OS << " type: ";
+       printImmTy(OS, Ty);
+     }
+     OS << " mods: " << Imm.Mods << '>';
+     return;
+   }
+   }
+ }
+
+ // Builds an Immediate operand holding Val, tagged with Type and IsFPImm,
+ // with default-constructed modifiers and both source locations set to Loc.
+ static AMDGPUOperand::Ptr CreateImm(const AMDGPUAsmParser *AsmParser,
+                                     int64_t Val, SMLoc Loc,
+                                     enum ImmTy Type = ImmTyNone,
+                                     bool IsFPImm = false) {
+   auto Result = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser);
+   Result->StartLoc = Loc;
+   Result->EndLoc = Loc;
+   Result->Imm.Type = Type;
+   Result->Imm.IsFPImm = IsFPImm;
+   Result->Imm.Val = Val;
+   Result->Imm.Mods = Modifiers();
+   return Result;
+ }
+
+ // Builds a Token operand that refers to (does not copy) the characters of
+ // Str, with both source locations set to Loc. HasExplicitEncodingSize is
+ // accepted but not used by this function.
+ static AMDGPUOperand::Ptr CreateToken(const AMDGPUAsmParser *AsmParser,
+                                       StringRef Str, SMLoc Loc,
+                                       bool HasExplicitEncodingSize = true) {
+   auto Result = llvm::make_unique<AMDGPUOperand>(Token, AsmParser);
+   Result->StartLoc = Loc;
+   Result->EndLoc = Loc;
+   Result->Tok.Length = Str.size();
+   Result->Tok.Data = Str.data();
+   return Result;
+ }
+
+ // Builds a Register operand for RegNo spanning source range [S, E], with
+ // default-constructed modifiers and the given forced-VOP3 flag.
+ static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser,
+                                     unsigned RegNo, SMLoc S,
+                                     SMLoc E,
+                                     bool ForceVOP3) {
+   auto Result = llvm::make_unique<AMDGPUOperand>(Register, AsmParser);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->Reg.IsForcedVOP3 = ForceVOP3;
+   Result->Reg.Mods = Modifiers();
+   Result->Reg.RegNo = RegNo;
+   return Result;
+ }
+
+ // Builds an Expression operand wrapping Expr, with both source locations
+ // set to S.
+ static AMDGPUOperand::Ptr CreateExpr(const AMDGPUAsmParser *AsmParser,
+                                      const class MCExpr *Expr, SMLoc S) {
+   auto Result = llvm::make_unique<AMDGPUOperand>(Expression, AsmParser);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->Expr = Expr;
+   return Result;
+ }
+};
+
+// Streams the three modifier flags of Mods (abs, neg, sext) to OS.
+raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) {
+  OS << "abs:" << Mods.Abs;
+  OS << " neg: " << Mods.Neg;
+  return OS << " sext:" << Mods.Sext;
+}
+
+//===----------------------------------------------------------------------===//
+// AsmParser
+//===----------------------------------------------------------------------===//
+
+// Tracks per-kernel register usage, e.g. the count of SGPRs used.
+// A kernel scope begins at an .amdgpu_hsa_kernel directive and ends at the
+// next .amdgpu_hsa_kernel or at EOF.
+//
+// For each of SGPRs and VGPRs the class keeps the smallest register index
+// not yet seen in use, and mirrors it into the assembler symbols
+// ".kernel.sgpr_count" / ".kernel.vgpr_count" whenever it grows.
+class KernelScopeInfo {
+  int SgprIndexUnusedMin;
+  int VgprIndexUnusedMin;
+  MCContext *Ctx;
+
+  // Sets the variable value of symbol SymName to Count. Does nothing until
+  // initialize() has supplied a context.
+  void setCountSymbol(const char *SymName, int Count) {
+    if (!Ctx)
+      return;
+    MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(SymName));
+    Sym->setVariableValue(MCConstantExpr::create(Count, *Ctx));
+  }
+
+  // Records that SGPR index i is in use.
+  void usesSgprAt(int i) {
+    if (i < SgprIndexUnusedMin)
+      return;
+    SgprIndexUnusedMin = i + 1;
+    setCountSymbol(".kernel.sgpr_count", SgprIndexUnusedMin);
+  }
+
+  // Records that VGPR index i is in use.
+  void usesVgprAt(int i) {
+    if (i < VgprIndexUnusedMin)
+      return;
+    VgprIndexUnusedMin = i + 1;
+    setCountSymbol(".kernel.vgpr_count", VgprIndexUnusedMin);
+  }
+
+public:
+  KernelScopeInfo()
+      : SgprIndexUnusedMin(-1), VgprIndexUnusedMin(-1), Ctx(nullptr) {}
+
+  // Binds the MC context and restarts counting. Both counters are reset to
+  // -1 and then bumped through the uses*At() helpers, which leaves them at 0
+  // and publishes 0 to both count symbols.
+  void initialize(MCContext &Context) {
+    Ctx = &Context;
+    SgprIndexUnusedMin = -1;
+    usesSgprAt(-1);
+    VgprIndexUnusedMin = -1;
+    usesVgprAt(-1);
+  }
+
+  // Records use of RegWidth consecutive dwords starting at DwordRegIndex.
+  // Only SGPR and VGPR kinds are tracked; other kinds are ignored.
+  void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
+    const int LastDword = DwordRegIndex + RegWidth - 1;
+    if (RegKind == IS_SGPR)
+      usesSgprAt(LastDword);
+    else if (RegKind == IS_VGPR)
+      usesVgprAt(LastDword);
+  }
+};
+
+class AMDGPUAsmParser : public MCTargetAsmParser {
+ const MCInstrInfo &MII;
+ MCAsmParser &Parser;
+
+ unsigned ForcedEncodingSize;
+ bool ForcedDPP;
+ bool ForcedSDWA;
+ KernelScopeInfo KernelScope;
+
+ /// @name Auto-generated Match Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "AMDGPUGenAsmMatcher.inc"
+
+ /// }
+
+private:
+ bool ParseAsAbsoluteExpression(uint32_t &Ret);
+ bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
+ bool ParseDirectiveHSACodeObjectVersion();
+ bool ParseDirectiveHSACodeObjectISA();
+ bool ParseDirectiveRuntimeMetadata();
+ bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
+ bool ParseDirectiveAMDKernelCodeT();
+ bool ParseSectionDirectiveHSAText();
+ bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
+ bool ParseDirectiveAMDGPUHsaKernel();
+ bool ParseDirectiveAMDGPUHsaModuleGlobal();
+ bool ParseDirectiveAMDGPUHsaProgramGlobal();
+ bool ParseSectionDirectiveHSADataGlobalAgent();
+ bool ParseSectionDirectiveHSADataGlobalProgram();
+ bool ParseSectionDirectiveHSARodataReadonlyAgent();
+ bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum);
+ bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex);
+ void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn);
+
+public:
+ enum AMDGPUMatchResultTy {
+ Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
+ };
+
+ AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser,
+ const MCInstrInfo &MII,
+ const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser),
+ ForcedEncodingSize(0),
+ ForcedDPP(false),
+ ForcedSDWA(false) {
+ MCAsmParserExtension::Initialize(Parser);
+
+ if (getSTI().getFeatureBits().none()) {
+ // Set default features.
+ copySTI().ToggleFeature("SOUTHERN_ISLANDS");
+ }
+
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
+ {
+ // TODO: make these pre-defined variables read-only.
+ // Currently there is no suitable machinery in the core llvm-mc for this.
+ // MCSymbol::isRedefinable is intended for another purpose, and
+ // AsmParser::parseDirectiveSet() cannot be specialized for a specific target.
+ AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
+ MCContext &Ctx = getContext();
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+ Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
+ Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
+ Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx));
+ }
+ KernelScope.initialize(getContext());
+ }
+
+ bool isSI() const {
+ return AMDGPU::isSI(getSTI());
+ }
+
+ bool isCI() const {
+ return AMDGPU::isCI(getSTI());
+ }
+
+ bool isVI() const {
+ return AMDGPU::isVI(getSTI());
+ }
+
+ bool hasInv2PiInlineImm() const {
+ return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
+ }
+
+ bool hasSGPR102_SGPR103() const {
+ return !isVI();
+ }
+
+ AMDGPUTargetStreamer &getTargetStreamer() {
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+ return static_cast<AMDGPUTargetStreamer &>(TS);
+ }
+
+ const MCRegisterInfo *getMRI() const {
+ // We need this const_cast because for some reason getContext() is not const
+ // in MCAsmParser.
+ return const_cast<AMDGPUAsmParser*>(this)->getContext().getRegisterInfo();
+ }
+
+ const MCInstrInfo *getMII() const {
+ return &MII;
+ }
+
+ void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; }
+ void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; }
+ void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; }
+
+ unsigned getForcedEncodingSize() const { return ForcedEncodingSize; }
+ bool isForcedVOP3() const { return ForcedEncodingSize == 64; }
+ bool isForcedDPP() const { return ForcedDPP; }
+ bool isForcedSDWA() const { return ForcedSDWA; }
+
+ std::unique_ptr<AMDGPUOperand> parseRegister();
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+ bool ParseDirective(AsmToken DirectiveID) override;
+ OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
+ StringRef parseMnemonicSuffix(StringRef Name);
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+ //bool ProcessInstruction(MCInst &Inst);
+
+ OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int);
+ OperandMatchResultTy
+ parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
+ enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool (*ConvertResult)(int64_t &) = nullptr);
+ OperandMatchResultTy
+ parseNamedBit(const char *Name, OperandVector &Operands,
+ enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+ OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
+ StringRef &Value);
+
+ OperandMatchResultTy parseImm(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
+
+ void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
+ void cvtDS(MCInst &Inst, const OperandVector &Operands);
+ void cvtExp(MCInst &Inst, const OperandVector &Operands);
+
+ bool parseCnt(int64_t &IntVal);
+ OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
+ OperandMatchResultTy parseHwreg(OperandVector &Operands);
+
+private:
+ struct OperandInfoTy {
+ int64_t Id;
+ bool IsSymbolic;
+ OperandInfoTy(int64_t Id_) : Id(Id_), IsSymbolic(false) { }
+ };
+
+ bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId);
+ bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+
+ void errorExpTgt();
+ OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
+
+public:
+ OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
+
+ OperandMatchResultTy parseExpTgt(OperandVector &Operands);
+ OperandMatchResultTy parseSendMsgOp(OperandVector &Operands);
+ OperandMatchResultTy parseInterpSlot(OperandVector &Operands);
+ OperandMatchResultTy parseInterpAttr(OperandVector &Operands);
+ OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+
+ void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
+ void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
+ void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+ AMDGPUOperand::Ptr defaultGLC() const;
+ AMDGPUOperand::Ptr defaultSLC() const;
+ AMDGPUOperand::Ptr defaultTFE() const;
+
+ AMDGPUOperand::Ptr defaultDMask() const;
+ AMDGPUOperand::Ptr defaultUNorm() const;
+ AMDGPUOperand::Ptr defaultDA() const;
+ AMDGPUOperand::Ptr defaultR128() const;
+ AMDGPUOperand::Ptr defaultLWE() const;
+ AMDGPUOperand::Ptr defaultSMRDOffset8() const;
+ AMDGPUOperand::Ptr defaultSMRDOffset20() const;
+ AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
+
+ OperandMatchResultTy parseOModOperand(OperandVector &Operands);
+
+ void cvtId(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
+
+ void cvtMIMG(MCInst &Inst, const OperandVector &Operands);
+ void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
+
+ OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
+ AMDGPUOperand::Ptr defaultRowMask() const;
+ AMDGPUOperand::Ptr defaultBankMask() const;
+ AMDGPUOperand::Ptr defaultBoundCtrl() const;
+ void cvtDPP(MCInst &Inst, const OperandVector &Operands);
+
+ OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
+ AMDGPUOperand::ImmTy Type);
+ OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands);
+ void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
+ void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
+ uint64_t BasicInstType);
+};
+
+struct OptionalOperand {
+ const char *Name;
+ AMDGPUOperand::ImmTy Type;
+ bool IsBit;
+ bool (*ConvertResult)(int64_t&);
+};
+
+} // end anonymous namespace
+
+// Maps a size in bytes (2, 4 or 8) to the IEEE floating-point semantics of
+// that width. May also be called with the size of an integer type of
+// equivalent bitwidth. Any other size is unreachable.
+static const fltSemantics *getFltSemantics(unsigned Size) {
+  switch (Size) {
+  case 2:
+    return &APFloat::IEEEhalf();
+  case 4:
+    return &APFloat::IEEEsingle();
+  case 8:
+    return &APFloat::IEEEdouble();
+  default:
+    llvm_unreachable("unsupported fp type");
+  }
+}
+
+// Overload taking a value type: selects the semantics by VT's size in bytes.
+static const fltSemantics *getFltSemantics(MVT VT) {
+  const unsigned SizeInBytes = VT.getSizeInBits() / 8;
+  return getFltSemantics(SizeInBytes);
+}
+
+//===----------------------------------------------------------------------===//
+// Operand
+//===----------------------------------------------------------------------===//
+
+// Converts FPLiteral IN PLACE to the floating-point semantics whose size
+// matches VT (round to nearest, ties to even) and reports whether the result
+// is acceptable. Loss of precision alone is tolerated; the conversion is
+// rejected only when it lost information and also overflowed or underflowed.
+// Callers rely on FPLiteral holding the converted value afterwards.
+static bool canLosslesslyConvertToFPType(APFloat &FPLiteral, MVT VT) {
+  bool LosesInfo;
+  const APFloat::opStatus Status = FPLiteral.convert(
+      *getFltSemantics(VT), APFloat::rmNearestTiesToEven, &LosesInfo);
+
+  const bool OutOfRange = (Status & APFloat::opOverflow) != 0 ||
+                          (Status & APFloat::opUnderflow) != 0;
+  const bool Rejected = Status != APFloat::opOK && LosesInfo && OutOfRange;
+  return !Rejected;
+}
+
+// Returns true when this operand is a plain immediate whose value can be
+// encoded as an inline constant for an operand of value type `type`.
+//
+// - 64-bit operand types (f64/i64): the raw 64-bit value is checked.
+// - fp literal token on a narrower operand: the double is first converted to
+//   the operand's fp width; the converted bit pattern is then checked with
+//   the 16-bit or 32-bit inline-constant test matching that width.
+// - int literal token on a narrower operand: the low 16 or 32 bits are
+//   checked.
+bool AMDGPUOperand::isInlinableImm(MVT type) const {
+  if (!isImmTy(ImmTyNone)) {
+    // Only plain immediates are inlinable (e.g. "clamp" attribute is not)
+    return false;
+  }
+  // TODO: We should avoid using host float here. It would be better to
+  // check the float bit values which is what a few other places do.
+  // We've had bot failures before due to weird NaN support on mips hosts.
+
+  const bool HasInv2Pi = AsmParser->hasInv2PiInlineImm();
+  const bool Is16BitOperand = type.getScalarSizeInBits() == 16;
+
+  // 64-bit operands are handled identically for fp and int literal tokens.
+  if (type == MVT::f64 || type == MVT::i64)
+    return AMDGPU::isInlinableLiteral64(Imm.Val, HasInv2Pi);
+
+  if (Imm.IsFPImm) { // We got fp literal token
+    APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
+    if (!canLosslesslyConvertToFPType(FPLiteral, type))
+      return false;
+
+    // FPLiteral now holds the value in the operand's own fp format, so the
+    // bit pattern must be tested at that width. Testing a half-precision
+    // pattern with the 32-bit check would reject every fp inline constant
+    // for 16-bit operands.
+    const uint64_t Bits = FPLiteral.bitcastToAPInt().getZExtValue();
+    if (Is16BitOperand)
+      return AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Bits),
+                                          HasInv2Pi);
+    return AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Bits),
+                                        HasInv2Pi);
+  }
+
+  // We got int literal token.
+  APInt Literal(64, Imm.Val);
+
+  if (Is16BitOperand) {
+    return AMDGPU::isInlinableLiteral16(
+      static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
+      HasInv2Pi);
+  }
+
+  return AMDGPU::isInlinableLiteral32(
+    static_cast<int32_t>(Literal.getLoBits(32).getZExtValue()),
+    HasInv2Pi);
+}
+
+// Returns true when this operand is a plain immediate that may be encoded as
+// a literal for an operand of value type `type`.
+bool AMDGPUOperand::isLiteralImm(MVT type) const {
+  // Only plain immediates can be added as a literal.
+  if (!isImmTy(ImmTyNone))
+    return false;
+
+  if (Imm.IsFPImm) {
+    // We got fp literal token.
+    if (type == MVT::f64) {
+      // Expected 64-bit fp operand. Only the high 32 bits can be encoded (the
+      // low 32 bits end up as zeroes), but such literals are accepted anyway.
+      return true;
+    }
+
+    if (type == MVT::i64) {
+      // Expected 64-bit int operand. We don't allow fp literals in 64-bit
+      // integer instructions. It is unclear how we should encode them.
+      return false;
+    }
+
+    APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
+    return canLosslesslyConvertToFPType(FPLiteral, type);
+  }
+
+  // We got int literal token. A 64-bit operand still takes a 32-bit literal.
+  // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP
+  // types.
+  const unsigned TypeBits = type.getSizeInBits();
+  const unsigned LiteralBits = TypeBits == 64 ? 32 : TypeBits;
+  return isUIntN(LiteralBits, Imm.Val) || isIntN(LiteralBits, Imm.Val);
+}
+
+// True when this operand is a register contained in register class RCID.
+bool AMDGPUOperand::isRegClass(unsigned RCID) const {
+  if (!isReg())
+    return false;
+  const auto &RC = AsmParser->getMRI()->getRegClass(RCID);
+  return RC.contains(getReg());
+}
+
+// Appends this immediate to Inst. When ApplyModifiers is set and a plain
+// immediate carries an fp "neg" modifier, the value is negated first (sign
+// flip of the double for fp literals, arithmetic negation for int literals).
+// If the next operand slot of the instruction is an SI source operand, the
+// value is encoded through addLiteralImmOperand(); otherwise it is added
+// unchanged.
+void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
+  int64_t Val = Imm.Val;
+
+  // Only negate can get here.
+  const bool NegateLiteral = isImmTy(ImmTyNone) && ApplyModifiers &&
+                             Imm.Mods.hasFPModifiers() && Imm.Mods.Neg;
+  if (NegateLiteral) {
+    if (!Imm.IsFPImm) {
+      Val = -Val;
+    } else {
+      APFloat F(BitsToDouble(Val));
+      F.changeSign();
+      Val = F.bitcastToAPInt().getZExtValue();
+    }
+  }
+
+  const auto &Desc = AsmParser->getMII()->get(Inst.getOpcode());
+  if (!AMDGPU::isSISrcOperand(Desc, Inst.getNumOperands())) {
+    Inst.addOperand(MCOperand::createImm(Val));
+    return;
+  }
+  addLiteralImmOperand(Inst, Val);
+}
+
+// Appends Val to Inst for an operand slot that accepts literals (asserted
+// via isSISrcOperand). The encoding depends on the slot's expected size in
+// bytes and on whether the source token was an fp or an int literal.
+void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
+  const auto &Desc = AsmParser->getMII()->get(Inst.getOpcode());
+  const auto OpIdx = Inst.getNumOperands();
+  // Check that this operand accepts literals
+  assert(AMDGPU::isSISrcOperand(Desc, OpIdx));
+
+  // Expected operand size in bytes.
+  const auto OpSize = AMDGPU::getOperandSize(Desc, OpIdx);
+
+  if (!Imm.IsFPImm) {
+    // We got int literal token.
+    // Only sign extend inline immediates.
+    // FIXME: No errors on truncation
+    switch (OpSize) {
+    case 2: {
+      const bool Inline =
+          isInt<16>(Val) &&
+          AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
+                                       AsmParser->hasInv2PiInlineImm());
+      if (Inline)
+        Inst.addOperand(MCOperand::createImm(Val));
+      else
+        Inst.addOperand(MCOperand::createImm(Val & 0xffff));
+      return;
+    }
+    case 4: {
+      const bool Inline =
+          isInt<32>(Val) &&
+          AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
+                                       AsmParser->hasInv2PiInlineImm());
+      if (Inline)
+        Inst.addOperand(MCOperand::createImm(Val));
+      else
+        Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
+      return;
+    }
+    case 8: {
+      if (AMDGPU::isInlinableLiteral64(Val,
+                                       AsmParser->hasInv2PiInlineImm()))
+        Inst.addOperand(MCOperand::createImm(Val));
+      else
+        Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
+      return;
+    }
+    default:
+      llvm_unreachable("invalid operand size");
+    }
+  }
+
+  // We got fp literal token; Val holds the bits of a double.
+  APInt Literal(64, Val);
+
+  switch (OpSize) {
+  case 8: {
+    const uint64_t Bits = Literal.getZExtValue();
+    if (AMDGPU::isInlinableLiteral64(Bits,
+                                     AsmParser->hasInv2PiInlineImm())) {
+      Inst.addOperand(MCOperand::createImm(Bits));
+      return;
+    }
+
+    // Non-inlineable
+    if (AMDGPU::isSISrcFPOperand(Desc, OpIdx)) { // Expected 64-bit fp operand
+      // Only the high 32 bits are emitted, so warn when the low 32 bits
+      // are not zero.
+      if (Literal.getLoBits(32) != 0) {
+        const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
+        "Can't encode literal as exact 64-bit floating-point operand. "
+        "Low 32-bits will be set to zero");
+      }
+
+      Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
+      return;
+    }
+
+    // We don't allow fp literals in 64-bit integer instructions. It is
+    // unclear how we should encode them. This case should be checked earlier
+    // in predicate methods (isLiteralImm())
+    llvm_unreachable("fp literal in 64-bit integer instruction.");
+  }
+  case 2:
+  case 4: {
+    bool LosesInfo;
+    APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
+    // Convert the double to the operand's own fp width (half or single).
+    FPLiteral.convert(*getFltSemantics(OpSize),
+                      APFloat::rmNearestTiesToEven, &LosesInfo);
+    // We allow precision lost but not overflow or underflow. This should be
+    // checked earlier in isLiteralImm()
+    Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+    return;
+  }
+  default:
+    llvm_unreachable("invalid operand size");
+  }
+}
+
+// Appends this immediate to Inst as a Bitwidth-wide fp constant. An fp
+// literal token is converted from double to the fp format of that width and
+// its bit pattern is added; an int literal token contributes its low
+// Bitwidth bits unchanged.
+template <unsigned Bitwidth>
+void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const {
+  APInt Literal(64, Imm.Val);
+
+  if (Imm.IsFPImm) {
+    bool LosesInfo;
+    APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
+    FPLiteral.convert(*getFltSemantics(Bitwidth / 8),
+                      APFloat::rmNearestTiesToEven, &LosesInfo);
+    Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+    return;
+  }
+
+  // We got int literal token.
+  Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue()));
+}
+
+// Appends this register to Inst after mapping it through AMDGPU::getMCReg()
+// for the parser's subtarget.
+void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const {
+  const auto MCReg = AMDGPU::getMCReg(getReg(), AsmParser->getSTI());
+  Inst.addOperand(MCOperand::createReg(MCReg));
+}
+
+//===----------------------------------------------------------------------===//
+// AsmParser
+//===----------------------------------------------------------------------===//
+
+// Returns the register class ID for a register of kind Is that is RegWidth
+// dwords wide, or -1 when that kind/width combination has no class here.
+static int getRegClass(RegisterKind Is, unsigned RegWidth) {
+  switch (Is) {
+  case IS_VGPR:
+    switch (RegWidth) {
+    case 1: return AMDGPU::VGPR_32RegClassID;
+    case 2: return AMDGPU::VReg_64RegClassID;
+    case 3: return AMDGPU::VReg_96RegClassID;
+    case 4: return AMDGPU::VReg_128RegClassID;
+    case 8: return AMDGPU::VReg_256RegClassID;
+    case 16: return AMDGPU::VReg_512RegClassID;
+    default: return -1;
+    }
+  case IS_TTMP:
+    switch (RegWidth) {
+    case 1: return AMDGPU::TTMP_32RegClassID;
+    case 2: return AMDGPU::TTMP_64RegClassID;
+    case 4: return AMDGPU::TTMP_128RegClassID;
+    default: return -1;
+    }
+  case IS_SGPR:
+    switch (RegWidth) {
+    case 1: return AMDGPU::SGPR_32RegClassID;
+    case 2: return AMDGPU::SGPR_64RegClassID;
+    case 4: return AMDGPU::SGPR_128RegClassID;
+    case 8: return AMDGPU::SReg_256RegClassID;
+    case 16: return AMDGPU::SReg_512RegClassID;
+    default: return -1;
+    }
+  default:
+    return -1;
+  }
+}
+
+// Maps the name of a special register to its register number. Returns 0 for
+// any name that is not listed. All names are distinct exact matches, so the
+// order of the cases does not matter.
+static unsigned getSpecialRegForName(StringRef RegName) {
+  const unsigned Reg = StringSwitch<unsigned>(RegName)
+    .Case("exec", AMDGPU::EXEC)
+    .Case("exec_lo", AMDGPU::EXEC_LO)
+    .Case("exec_hi", AMDGPU::EXEC_HI)
+    .Case("vcc", AMDGPU::VCC)
+    .Case("vcc_lo", AMDGPU::VCC_LO)
+    .Case("vcc_hi", AMDGPU::VCC_HI)
+    .Case("flat_scratch", AMDGPU::FLAT_SCR)
+    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+    .Case("tba", AMDGPU::TBA)
+    .Case("tba_lo", AMDGPU::TBA_LO)
+    .Case("tba_hi", AMDGPU::TBA_HI)
+    .Case("tma", AMDGPU::TMA)
+    .Case("tma_lo", AMDGPU::TMA_LO)
+    .Case("tma_hi", AMDGPU::TMA_HI)
+    .Case("m0", AMDGPU::M0)
+    .Case("scc", AMDGPU::SCC)
+    .Default(0);
+  return Reg;
+}
+
+// MCTargetAsmParser hook: parses a register and reports its number and
+// source range through the out-parameters. Returns true on failure (nothing
+// is written), false on success.
+bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  std::unique_ptr<AMDGPUOperand> Parsed = parseRegister();
+  if (!Parsed)
+    return true;
+
+  assert(Parsed->isReg());
+  RegNo = Parsed->getReg();
+  StartLoc = Parsed->getStartLoc();
+  EndLoc = Parsed->getEndLoc();
+  return false;
+}
+
+// Tries to extend the register (list) described by Reg/RegWidth with the
+// next register Reg1. Returns true and updates Reg/RegWidth on success.
+//
+// - IS_SPECIAL: only a matching *_LO followed by its *_HI is accepted; the
+//   pair is replaced by the combined register with RegWidth = 2.
+// - IS_VGPR / IS_SGPR / IS_TTMP: Reg1 must equal Reg + RegWidth, in which
+//   case RegWidth grows by one.
+// RegNum is not used by this function.
+bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum)
+{
+  switch (RegKind) {
+  case IS_SPECIAL: {
+    static const struct {
+      unsigned Lo;
+      unsigned Hi;
+      unsigned Whole;
+    } SpecialPairs[] = {
+      {AMDGPU::EXEC_LO, AMDGPU::EXEC_HI, AMDGPU::EXEC},
+      {AMDGPU::FLAT_SCR_LO, AMDGPU::FLAT_SCR_HI, AMDGPU::FLAT_SCR},
+      {AMDGPU::VCC_LO, AMDGPU::VCC_HI, AMDGPU::VCC},
+      {AMDGPU::TBA_LO, AMDGPU::TBA_HI, AMDGPU::TBA},
+      {AMDGPU::TMA_LO, AMDGPU::TMA_HI, AMDGPU::TMA},
+    };
+    for (const auto &Pair : SpecialPairs) {
+      if (Reg == Pair.Lo && Reg1 == Pair.Hi) {
+        Reg = Pair.Whole;
+        RegWidth = 2;
+        return true;
+      }
+    }
+    return false;
+  }
+  case IS_VGPR:
+  case IS_SGPR:
+  case IS_TTMP:
+    if (Reg1 == Reg + RegWidth) {
+      RegWidth++;
+      return true;
+    }
+    return false;
+  default:
+    llvm_unreachable("unexpected register kind");
+  }
+}
+
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex)
+{ /* Parses one register reference starting at the current token.
+   *
+   * Accepted spellings, as handled below:
+   *   - a special register name recognised by getSpecialRegForName();
+   *   - a single register: v<N>, s<N> or ttmp<N>;
+   *   - a register range: v[<lo>:<hi>] (likewise for s and ttmp), where
+   *     ":<hi>" may be omitted;
+   *   - a bracketed, comma-separated list of single registers that
+   *     AddNextRegisterToList() accepts, e.g. [s0,s1,s2,s3].
+   *
+   * On success returns true with RegKind, Reg and RegWidth describing the
+   * register. For VGPR/SGPR/TTMP registers RegNum ends up as the index inside
+   * the selected register class and DwordRegIndex (when non-null) receives the
+   * first dword number; for special registers RegNum is 0, RegWidth is 1 and
+   * the DwordRegIndex output is 0.
+   *
+   * Returns false on any failure. Tokens lexed before the failure was detected
+   * stay consumed.
+   */
+ if (DwordRegIndex) { *DwordRegIndex = 0; } // default; only overwritten for VGPR/SGPR/TTMP
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (getLexer().is(AsmToken::Identifier)) {
+ StringRef RegName = Parser.getTok().getString();
+ if ((Reg = getSpecialRegForName(RegName))) { // non-zero result: a named special register
+ Parser.Lex();
+ RegKind = IS_SPECIAL;
+ } else {
+ unsigned RegNumIndex = 0; // offset of the first character after the register prefix
+ if (RegName[0] == 'v') {
+ RegNumIndex = 1;
+ RegKind = IS_VGPR;
+ } else if (RegName[0] == 's') {
+ RegNumIndex = 1;
+ RegKind = IS_SGPR;
+ } else if (RegName.startswith("ttmp")) {
+ RegNumIndex = strlen("ttmp");
+ RegKind = IS_TTMP;
+ } else {
+ return false;
+ }
+ if (RegName.size() > RegNumIndex) {
+ // Single 32-bit register: vXX.
+ if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum)) // true here means the suffix is not a decimal number
+ return false;
+ Parser.Lex();
+ RegWidth = 1;
+ } else {
+ // Range of registers: v[XX:YY]. ":YY" is optional.
+ Parser.Lex(); // consume the bare prefix identifier
+ int64_t RegLo, RegHi;
+ if (getLexer().isNot(AsmToken::LBrac))
+ return false;
+ Parser.Lex();
+
+ if (getParser().parseAbsoluteExpression(RegLo))
+ return false;
+
+ const bool isRBrace = getLexer().is(AsmToken::RBrac);
+ if (!isRBrace && getLexer().isNot(AsmToken::Colon))
+ return false;
+ Parser.Lex(); // consume either ']' or ':'
+
+ if (isRBrace) {
+ RegHi = RegLo; // v[N] denotes a single register
+ } else {
+ if (getParser().parseAbsoluteExpression(RegHi))
+ return false;
+
+ if (getLexer().isNot(AsmToken::RBrac))
+ return false;
+ Parser.Lex();
+ }
+ RegNum = (unsigned) RegLo;
+ RegWidth = (RegHi - RegLo) + 1; // bounds are inclusive; not range-checked here
+ }
+ }
+ } else if (getLexer().is(AsmToken::LBrac)) {
+ // List of consecutive registers: [s0,s1,s2,s3]
+ Parser.Lex();
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, nullptr)) // first element fixes kind and start
+ return false;
+ if (RegWidth != 1) // the first element must be one register wide
+ return false;
+ RegisterKind RegKind1;
+ unsigned Reg1, RegNum1, RegWidth1;
+ do {
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex();
+ } else if (getLexer().is(AsmToken::RBrac)) {
+ Parser.Lex();
+ break; // end of list
+ } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1, nullptr)) {
+ if (RegWidth1 != 1) {
+ return false;
+ }
+ if (RegKind1 != RegKind) { // all elements must share the kind of the first one
+ return false;
+ }
+ if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } while (true);
+ } else {
+ return false;
+ }
+ switch (RegKind) {
+ case IS_SPECIAL:
+ RegNum = 0;
+ RegWidth = 1; // reported as 1 even when a lo/hi pair was merged above
+ break;
+ case IS_VGPR:
+ case IS_SGPR:
+ case IS_TTMP:
+ {
+ unsigned Size = 1; // required alignment of the first register, in dwords
+ if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
+ // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords.
+ Size = std::min(RegWidth, 4u);
+ }
+ if (RegNum % Size != 0)
+ return false;
+ if (DwordRegIndex) { *DwordRegIndex = RegNum; } // report the unscaled dword number
+ RegNum = RegNum / Size; // index of the tuple inside its register class
+ int RCID = getRegClass(RegKind, RegWidth);
+ if (RCID == -1) // -1: no register class for this kind/width
+ return false;
+ const MCRegisterClass RC = TRI->getRegClass(RCID);
+ if (RegNum >= RC.getNumRegs())
+ return false;
+ Reg = RC.getRegister(RegNum);
+ break;
+ }
+
+ default:
+ llvm_unreachable("unexpected register kind");
+ }
+
+ if (!subtargetHasRegister(*TRI, Reg)) // reject registers the current subtarget lacks
+ return false;
+ return true;
+}
+
+/// Parses a register at the current token, records its use in KernelScope and
+/// wraps it in a register operand.
+///
+/// The operand's start and end locations are taken from the first token of the
+/// register, captured before parsing begins.
+///
+/// \returns the new operand, or null if no valid register could be parsed.
+std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
+  const auto &FirstTok = Parser.getTok();
+  const SMLoc Begin = FirstTok.getLoc();
+  const SMLoc End = FirstTok.getEndLoc();
+
+  RegisterKind Kind;
+  unsigned RegId, Index, Width, FirstDword;
+  const bool Parsed =
+      ParseAMDGPURegister(Kind, RegId, Index, Width, &FirstDword);
+  if (!Parsed)
+    return nullptr;
+
+  KernelScope.usesRegister(Kind, FirstDword, Width);
+  return AMDGPUOperand::CreateReg(this, RegId, Begin, End, false);
+}
+
+/// Parses an optionally negated integer or floating-point literal and appends
+/// it to \p Operands as an immediate operand.
+///
+/// \returns MatchOperand_Success when a literal was parsed;
+/// MatchOperand_ParseFail when the literal could not be evaluated or a minus
+/// sign was consumed without a literal following it; MatchOperand_NoMatch when
+/// the current token starts neither a literal nor a minus sign.
+OperandMatchResultTy
+AMDGPUAsmParser::parseImm(OperandVector &Operands) {
+  // TODO: add syntactic sugar for 1/(2*PI)
+  const bool HasMinus = getLexer().getKind() == AsmToken::Minus;
+  if (HasMinus)
+    Parser.Lex();
+
+  // The operand location is that of the literal, not of the sign.
+  const SMLoc LitLoc = Parser.getTok().getLoc();
+  const auto Kind = getLexer().getKind();
+
+  if (Kind == AsmToken::Integer) {
+    int64_t Value;
+    if (getParser().parseAbsoluteExpression(Value))
+      return MatchOperand_ParseFail;
+    if (HasMinus)
+      Value = -Value;
+    Operands.push_back(AMDGPUOperand::CreateImm(this, Value, LitLoc));
+    return MatchOperand_Success;
+  }
+
+  if (Kind == AsmToken::Real) {
+    // The evaluated value is reinterpreted as the bit pattern of a double.
+    int64_t Bits;
+    if (getParser().parseAbsoluteExpression(Bits))
+      return MatchOperand_ParseFail;
+
+    APFloat FPValue(BitsToDouble(Bits));
+    if (HasMinus)
+      FPValue.changeSign();
+    Operands.push_back(
+        AMDGPUOperand::CreateImm(this, FPValue.bitcastToAPInt().getZExtValue(),
+                                 LitLoc, AMDGPUOperand::ImmTyNone, true));
+    return MatchOperand_Success;
+  }
+
+  // A consumed minus sign with nothing usable after it is a hard error.
+  return HasMinus ? MatchOperand_ParseFail : MatchOperand_NoMatch;
+}
+
+/// Parses either an immediate or a register and appends it to \p Operands.
+///
+/// An immediate is tried first; a register is attempted only if parseImm()
+/// reports MatchOperand_NoMatch. A parsed register is tagged with the current
+/// forced-VOP3 state.
+///
+/// \returns the parseImm() result if it was not NoMatch; otherwise
+/// MatchOperand_Success for a register and MatchOperand_ParseFail if no
+/// register could be parsed.
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
+  const OperandMatchResultTy ImmRes = parseImm(Operands);
+  if (ImmRes != MatchOperand_NoMatch)
+    return ImmRes;
+
+  std::unique_ptr<AMDGPUOperand> RegOp = parseRegister();
+  if (!RegOp)
+    return MatchOperand_ParseFail;
+
+  assert(RegOp->isReg());
+  RegOp->Reg.IsForcedVOP3 = isForcedVOP3();
+  Operands.push_back(std::move(RegOp));
+  return MatchOperand_Success;
+}
+
+/// Parses a register or immediate that may carry floating-point input
+/// modifiers: a leading '-' (neg), and an absolute value written either as
+/// |x| or as abs(x).
+///
+/// \returns MatchOperand_Success with the operand (and its modifiers, if any)
+/// appended to \p Operands; MatchOperand_ParseFail after reporting an error for
+/// malformed modifier syntax; otherwise the result of parseRegOrImm().
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
+  // XXX: While parsing we cannot tell whether a leading minus is the negate
+  // modifier or belongs to a negative immediate. It is treated as the modifier.
+  bool HasNeg = false, HasBars = false, HasAbsCall = false;
+
+  // Reports Msg at the current token unless it is of kind Expected; consumes
+  // the token otherwise. Returns true when the expected token was present.
+  auto expectAndEat = [this](AsmToken::TokenKind Expected, const char *Msg) {
+    if (getLexer().isNot(Expected)) {
+      Error(Parser.getTok().getLoc(), Msg);
+      return false;
+    }
+    Parser.Lex();
+    return true;
+  };
+
+  if (getLexer().getKind() == AsmToken::Minus) {
+    Parser.Lex();
+    HasNeg = true;
+  }
+
+  const bool StartsAbsCall = getLexer().getKind() == AsmToken::Identifier &&
+                             Parser.getTok().getString() == "abs";
+  if (StartsAbsCall) {
+    Parser.Lex();
+    HasAbsCall = true;
+    if (!expectAndEat(AsmToken::LParen, "expected left paren after abs"))
+      return MatchOperand_ParseFail;
+  }
+
+  if (getLexer().getKind() == AsmToken::Pipe) {
+    // abs(|x|) is rejected: the two spellings cannot be combined.
+    if (HasAbsCall) {
+      Error(Parser.getTok().getLoc(), "expected register or immediate");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+    HasBars = true;
+  }
+
+  const OperandMatchResultTy Res = parseRegOrImm(Operands);
+  if (Res != MatchOperand_Success)
+    return Res;
+
+  AMDGPUOperand::Modifiers Mods;
+  if (HasNeg)
+    Mods.Neg = true;
+
+  if (HasBars) {
+    if (!expectAndEat(AsmToken::Pipe, "expected vertical bar"))
+      return MatchOperand_ParseFail;
+    Mods.Abs = true;
+  }
+
+  if (HasAbsCall) {
+    if (!expectAndEat(AsmToken::RParen, "expected closing parentheses"))
+      return MatchOperand_ParseFail;
+    Mods.Abs = true;
+  }
+
+  if (Mods.hasFPModifiers()) {
+    auto &Parsed = static_cast<AMDGPUOperand &>(*Operands.back());
+    Parsed.setModifiers(Mods);
+  }
+  return MatchOperand_Success;
+}
+
+/// Parses a register or immediate that may be wrapped in the integer input
+/// modifier sext(...).
+///
+/// \returns MatchOperand_Success with the operand (and the sext modifier, if
+/// present) appended to \p Operands; MatchOperand_ParseFail after reporting an
+/// error for a missing parenthesis; otherwise the result of parseRegOrImm().
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
+  const bool HasSext = getLexer().getKind() == AsmToken::Identifier &&
+                       Parser.getTok().getString() == "sext";
+  if (HasSext) {
+    Parser.Lex();
+    if (getLexer().isNot(AsmToken::LParen)) {
+      Error(Parser.getTok().getLoc(), "expected left paren after sext");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+  }
+
+  const OperandMatchResultTy Res = parseRegOrImm(Operands);
+  if (Res != MatchOperand_Success)
+    return Res;
+
+  AMDGPUOperand::Modifiers Mods;
+  if (HasSext) {
+    if (getLexer().isNot(AsmToken::RParen)) {
+      Error(Parser.getTok().getLoc(), "expected closing parentheses");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+    Mods.Sext = true;
+  }
+
+  if (Mods.hasIntModifiers()) {
+    auto &Parsed = static_cast<AMDGPUOperand &>(*Operands.back());
+    Parsed.setModifiers(Mods);
+  }
+
+  return MatchOperand_Success;
+}
+
+/// Parses either a register or the keyword "off".
+///
+/// "off" is represented as an immediate 0 of type ImmTyOff.
+///
+/// \returns MatchOperand_Success if either form was parsed and appended to
+/// \p Operands, MatchOperand_NoMatch otherwise.
+OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
+  if (std::unique_ptr<AMDGPUOperand> RegOp = parseRegister()) {
+    Operands.push_back(std::move(RegOp));
+    return MatchOperand_Success;
+  }
+
+  const AsmToken &Cur = Parser.getTok();
+  if (Cur.getString() != "off")
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Cur.getLoc(),
+                                              AMDGPUOperand::ImmTyOff, false));
+  Parser.Lex();
+  return MatchOperand_Success;
+}
+
+/// Target-specific check applied to a candidate instruction match.
+///
+/// Rejects candidates whose encoding contradicts a forced encoding suffix,
+/// asks for the 32-bit encoding where the instruction prefers it, and enforces
+/// dst_sel == DWORD for the SDWA forms of v_mac_f32/f16.
+///
+/// \returns Match_Success, Match_InvalidOperand or Match_PreferE32.
+unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+  const unsigned Opc = Inst.getOpcode();
+  const uint64_t Flags = MII.get(Opc).TSFlags;
+  const bool IsVOP3 = (Flags & SIInstrFlags::VOP3) != 0;
+  const auto ForcedSize = getForcedEncodingSize();
+
+  // A forced encoding must agree with the encoding of the candidate.
+  if (ForcedSize == 32 && IsVOP3)
+    return Match_InvalidOperand;
+  if (ForcedSize == 64 && !IsVOP3)
+    return Match_InvalidOperand;
+  if (isForcedDPP() && !(Flags & SIInstrFlags::DPP))
+    return Match_InvalidOperand;
+  if (isForcedSDWA() && !(Flags & SIInstrFlags::SDWA))
+    return Match_InvalidOperand;
+
+  const bool Prefers32Bit = (Flags & SIInstrFlags::VOPAsmPrefer32Bit) != 0;
+  if (IsVOP3 && Prefers32Bit && ForcedSize != 64)
+    return Match_PreferE32;
+
+  if (Opc == AMDGPU::V_MAC_F32_sdwa_vi || Opc == AMDGPU::V_MAC_F16_sdwa_vi) {
+    // v_mac_f32/16 allow only dst_sel == DWORD;
+    const auto DstSelIdx =
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dst_sel);
+    const auto &DstSel = Inst.getOperand(DstSelIdx);
+    const bool IsDword =
+        DstSel.isImm() && DstSel.getImm() == AMDGPU::SDWA::SdwaSel::DWORD;
+    if (!IsDword)
+      return Match_InvalidOperand;
+  }
+
+  return Match_Success;
+}
+
+bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) { /* Matches the parsed Operands against each
+   * candidate assembler variant and emits the first successful match to Out.
+   *
+   * The candidate variants depend on the forced encoding currently set
+   * (32-bit, VOP3, SDWA or DPP); with nothing forced all four variants are
+   * tried in the order DEFAULT, VOP3, SDWA, DPP.
+   *
+   * Returns false after emitting the instruction. On failure a diagnostic is
+   * reported through Error() and its result is returned. ErrorInfo receives
+   * the error info of the attempt whose status was retained. The Opcode
+   * parameter is not written here.
+   */
+ // What asm variants we should check
+ std::vector<unsigned> MatchedVariants;
+ if (getForcedEncodingSize() == 32) {
+ MatchedVariants = {AMDGPUAsmVariants::DEFAULT};
+ } else if (isForcedVOP3()) {
+ MatchedVariants = {AMDGPUAsmVariants::VOP3};
+ } else if (isForcedSDWA()) {
+ MatchedVariants = {AMDGPUAsmVariants::SDWA};
+ } else if (isForcedDPP()) {
+ MatchedVariants = {AMDGPUAsmVariants::DPP};
+ } else {
+ MatchedVariants = {AMDGPUAsmVariants::DEFAULT,
+ AMDGPUAsmVariants::VOP3,
+ AMDGPUAsmVariants::SDWA,
+ AMDGPUAsmVariants::DPP};
+ }
+
+ MCInst Inst;
+ unsigned Result = Match_Success;
+ for (auto Variant : MatchedVariants) {
+ uint64_t EI; // error info of this attempt only
+ auto R = MatchInstructionImpl(Operands, Inst, EI, MatchingInlineAsm,
+ Variant);
+ // We order match statuses from least to most specific. We use most specific
+ // status as resulting
+ // Match_MnemonicFail < Match_InvalidOperand < Match_MissingFeature < Match_PreferE32
+ if ((R == Match_Success) ||
+ (R == Match_PreferE32) ||
+ (R == Match_MissingFeature && Result != Match_PreferE32) ||
+ (R == Match_InvalidOperand && Result != Match_MissingFeature
+ && Result != Match_PreferE32) ||
+ (R == Match_MnemonicFail && Result != Match_InvalidOperand
+ && Result != Match_MissingFeature
+ && Result != Match_PreferE32)) {
+ Result = R; // keep this status together with its error info
+ ErrorInfo = EI;
+ }
+ if (R == Match_Success) // the first matching variant wins
+ break;
+ }
+
+ switch (Result) {
+ default: break; // falls through to the llvm_unreachable below
+ case Match_Success:
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst, getSTI());
+ return false;
+
+ case Match_MissingFeature:
+ return Error(IDLoc, "instruction not supported on this GPU");
+
+ case Match_MnemonicFail:
+ return Error(IDLoc, "unrecognized instruction mnemonic");
+
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) { // NOTE(review): ~0ULL presumably means no operand index was reported - confirm
+ if (ErrorInfo >= Operands.size()) { // index lies past the operands that were parsed
+ return Error(IDLoc, "too few operands for instruction");
+ }
+ ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc()) // operand has no location: point at the mnemonic instead
+ ErrorLoc = IDLoc;
+ }
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+
+ case Match_PreferE32:
+ return Error(IDLoc, "internal error: instruction without _e64 suffix "
+ "should be encoded as e32");
+ }
+ llvm_unreachable("Implement any new match types added!");
+}
+
+/// Parses an absolute expression that starts with an integer or identifier
+/// token and stores it, truncated to 32 bits, in \p Ret.
+///
+/// \returns true on failure (in which case \p Ret is left untouched), false on
+/// success.
+bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) {
+  const bool StartsExpression = getLexer().is(AsmToken::Integer) ||
+                                getLexer().is(AsmToken::Identifier);
+  if (!StartsExpression)
+    return true;
+
+  int64_t Value = -1;
+  if (getParser().parseAbsoluteExpression(Value))
+    return true;
+
+  Ret = static_cast<uint32_t>(Value);
+  return false;
+}
+
+
+/// Parses "<major>, <minor>" into \p Major and \p Minor.
+///
+/// \returns false on success; otherwise the result of TokError() after
+/// reporting which part is missing or invalid.
+bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
+                                               uint32_t &Minor) {
+  const bool MajorFailed = ParseAsAbsoluteExpression(Major);
+  if (MajorFailed)
+    return TokError("invalid major version");
+
+  if (!getLexer().is(AsmToken::Comma))
+    return TokError("minor version number required, comma expected");
+  Lex();
+
+  const bool MinorFailed = ParseAsAbsoluteExpression(Minor);
+  if (MinorFailed)
+    return TokError("invalid minor version");
+
+  return false;
+}
+
+/// Handles .hsa_code_object_version: parses "<major>, <minor>" and forwards
+/// the pair to the target streamer.
+///
+/// \returns true if the version could not be parsed, false otherwise.
+bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() {
+  uint32_t VersionMajor;
+  uint32_t VersionMinor;
+  const bool Failed = ParseDirectiveMajorMinor(VersionMajor, VersionMinor);
+  if (Failed)
+    return true;
+
+  getTargetStreamer().EmitDirectiveHSACodeObjectVersion(VersionMajor,
+                                                        VersionMinor);
+  return false;
+}
+
+/// Handles .hsa_code_object_isa.
+///
+/// Without arguments the ISA version of the targeted GPU is emitted with vendor
+/// "AMD" and architecture "AMDGPU". Otherwise the directive expects
+/// <major>, <minor>, <stepping>, "<vendor>", "<arch>".
+///
+/// \returns false on success; true (via TokError) on malformed arguments.
+bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
+  // If this directive has no arguments, then use the ISA version for the
+  // targeted GPU.
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    const AMDGPU::IsaVersion Isa =
+        AMDGPU::getIsaVersion(getSTI().getFeatureBits());
+    getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor,
+                                                      Isa.Stepping,
+                                                      "AMD", "AMDGPU");
+    return false;
+  }
+
+  // Parses ", "<string>"" into Out. Returns true after reporting an error.
+  auto parseCommaString = [this](const char *CommaMsg, const char *StringMsg,
+                                 StringRef &Out) -> bool {
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError(CommaMsg);
+    Lex();
+
+    if (getLexer().isNot(AsmToken::String))
+      return TokError(StringMsg);
+
+    Out = getLexer().getTok().getStringContents();
+    Lex();
+    return false;
+  };
+
+  uint32_t Major;
+  uint32_t Minor;
+  if (ParseDirectiveMajorMinor(Major, Minor))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("stepping version number required, comma expected");
+  Lex();
+
+  uint32_t Stepping;
+  if (ParseAsAbsoluteExpression(Stepping))
+    return TokError("invalid stepping version");
+
+  StringRef VendorName;
+  if (parseCommaString("vendor name required, comma expected",
+                       "invalid vendor name", VendorName))
+    return true;
+
+  StringRef ArchName;
+  if (parseCommaString("arch name required, comma expected",
+                       "invalid arch name", ArchName))
+    return true;
+
+  getTargetStreamer().EmitDirectiveHSACodeObjectISA(Major, Minor, Stepping,
+                                                    VendorName, ArchName);
+  return false;
+}
+
+/// Handles .amdgpu_runtime_metadata: collects the raw text of every statement
+/// up to .end_amdgpu_runtime_metadata and hands it to the target streamer.
+///
+/// Whitespace skipping is disabled in the lexer while collecting, so that
+/// leading spaces are copied into the metadata, and re-enabled afterwards.
+///
+/// \returns false on success; true (via TokError) if the end directive is
+/// missing.
+bool AMDGPUAsmParser::ParseDirectiveRuntimeMetadata() {
+  std::string Text;
+  raw_string_ostream TextStream(Text);
+
+  getLexer().setSkipSpace(false);
+
+  bool SawEnd = false;
+  while (getLexer().isNot(AsmToken::Eof)) {
+    // Copy leading spaces one by one.
+    for (; getLexer().is(AsmToken::Space); Lex())
+      TextStream << ' ';
+
+    const bool IsEndDirective =
+        getLexer().is(AsmToken::Identifier) &&
+        getLexer().getTok().getIdentifier() == ".end_amdgpu_runtime_metadata";
+    if (IsEndDirective) {
+      Lex();
+      SawEnd = true;
+      break;
+    }
+
+    TextStream << Parser.parseStringToEndOfStatement()
+               << getContext().getAsmInfo()->getSeparatorString();
+
+    Parser.eatToEndOfStatement();
+  }
+
+  getLexer().setSkipSpace(true);
+
+  if (!SawEnd && getLexer().is(AsmToken::Eof))
+    return TokError("expected directive .end_amdgpu_runtime_metadata not found");
+
+  TextStream.flush();
+
+  getTargetStreamer().EmitRuntimeMetadata(Text);
+
+  return false;
+}
+
+/// Parses the value of the amd_kernel_code_t field named \p ID into \p Header.
+///
+/// \returns false on success; on failure the message produced by
+/// parseAmdKernelCodeField() is reported via TokError() and its result is
+/// returned.
+bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
+                                               amd_kernel_code_t &Header) {
+  SmallString<40> Message;
+  raw_svector_ostream MessageStream(Message);
+
+  const bool Parsed =
+      parseAmdKernelCodeField(ID, getParser(), Header, MessageStream);
+  if (!Parsed)
+    return TokError(MessageStream.str());
+
+  Lex();
+  return false;
+}
+
+/// Handles .amd_kernel_code_t: starts from the default header for the current
+/// subtarget, applies every "<field> = <value>" statement up to
+/// .end_amd_kernel_code_t and emits the resulting header.
+///
+/// \returns false on success, true if a statement is malformed.
+bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
+  amd_kernel_code_t KernelCode;
+  AMDGPU::initDefaultAMDKernelCodeT(KernelCode, getSTI().getFeatureBits());
+
+  for (;;) {
+    // Skip statement ends. A loop is needed because lexing a comment also
+    // leaves the current token at EndOfStatement.
+    while (getLexer().is(AsmToken::EndOfStatement))
+      Lex();
+
+    if (!getLexer().is(AsmToken::Identifier))
+      return TokError("expected value identifier or .end_amd_kernel_code_t");
+
+    const StringRef FieldName = getLexer().getTok().getIdentifier();
+    Lex();
+
+    if (FieldName == ".end_amd_kernel_code_t")
+      break;
+
+    if (ParseAMDKernelCodeTValue(FieldName, KernelCode))
+      return true;
+  }
+
+  getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
+
+  return false;
+}
+
+/// Handles .hsatext: switches the streamer to the HSA text section.
+/// Always returns false.
+bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() {
+  auto *Section = AMDGPU::getHSATextSection(getContext());
+  getParser().getStreamer().SwitchSection(Section);
+  return false;
+}
+
+/// Handles .amdgpu_hsa_kernel <symbol>: marks the symbol as an HSA kernel and
+/// re-initializes KernelScope.
+///
+/// \returns false on success; true (via TokError) if no symbol name follows.
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
+  if (!getLexer().is(AsmToken::Identifier))
+    return TokError("expected symbol name");
+
+  const StringRef Symbol = Parser.getTok().getString();
+  getTargetStreamer().EmitAMDGPUSymbolType(Symbol,
+                                           ELF::STT_AMDGPU_HSA_KERNEL);
+  Lex();
+
+  KernelScope.initialize(getContext());
+  return false;
+}
+
+/// Handles .amdgpu_hsa_module_global <symbol>: forwards the symbol name to the
+/// target streamer as a module-scope global.
+///
+/// \returns false on success; true (via TokError) if no symbol name follows.
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() {
+  if (!getLexer().is(AsmToken::Identifier))
+    return TokError("expected symbol name");
+
+  const StringRef Symbol = Parser.getTok().getIdentifier();
+  getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(Symbol);
+  Lex();
+  return false;
+}
+
+/// Handles .amdgpu_hsa_program_global <symbol>: forwards the symbol name to
+/// the target streamer as a program-scope global.
+///
+/// \returns false on success; true (via TokError) if no symbol name follows.
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() {
+  if (!getLexer().is(AsmToken::Identifier))
+    return TokError("expected symbol name");
+
+  const StringRef Symbol = Parser.getTok().getIdentifier();
+  getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(Symbol);
+  Lex();
+  return false;
+}
+
+/// Handles .hsadata_global_agent: switches the streamer to the HSA
+/// global-agent data section. Always returns false.
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() {
+  auto *Section = AMDGPU::getHSADataGlobalAgentSection(getContext());
+  getParser().getStreamer().SwitchSection(Section);
+  return false;
+}
+
+/// Handles .hsadata_global_program: switches the streamer to the HSA
+/// global-program data section. Always returns false.
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() {
+  auto *Section = AMDGPU::getHSADataGlobalProgramSection(getContext());
+  getParser().getStreamer().SwitchSection(Section);
+  return false;
+}
+
+/// Handles .hsarodata_readonly_agent: switches the streamer to the HSA
+/// read-only agent rodata section. Always returns false.
+bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() {
+  auto *Section = AMDGPU::getHSARodataReadonlyAgentSection(getContext());
+  getParser().getStreamer().SwitchSection(Section);
+  return false;
+}
+
+/// Dispatches a target-specific assembler directive to its handler.
+///
+/// \returns the handler's result for a recognised directive, and true for a
+/// directive this parser does not handle.
+bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
+  typedef bool (AMDGPUAsmParser::*DirectiveHandler)();
+  struct DirectiveEntry {
+    const char *Name;
+    DirectiveHandler Handler;
+  };
+  static const DirectiveEntry Directives[] = {
+    {".hsa_code_object_version",
+     &AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion},
+    {".hsa_code_object_isa",
+     &AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA},
+    {".amdgpu_runtime_metadata",
+     &AMDGPUAsmParser::ParseDirectiveRuntimeMetadata},
+    {".amd_kernel_code_t",
+     &AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT},
+    {".hsatext",
+     &AMDGPUAsmParser::ParseSectionDirectiveHSAText},
+    {".amdgpu_hsa_kernel",
+     &AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel},
+    {".amdgpu_hsa_module_global",
+     &AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal},
+    {".amdgpu_hsa_program_global",
+     &AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal},
+    {".hsadata_global_agent",
+     &AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent},
+    {".hsadata_global_program",
+     &AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram},
+    {".hsarodata_readonly_agent",
+     &AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent},
+  };
+
+  const StringRef Directive = DirectiveID.getString();
+  for (const DirectiveEntry &Entry : Directives) {
+    if (Directive == Entry.Name)
+      return (this->*Entry.Handler)();
+  }
+
+  // Not one of ours.
+  return true;
+}
+
+/// Reports whether register \p RegNo is available on the current subtarget.
+///
+/// CI accepts every register. SI rejects only the flat_scr registers. For any
+/// other subtarget every register aliasing SGPR102_SGPR103 is rejected.
+bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
+                                           unsigned RegNo) const {
+  if (isCI())
+    return true;
+
+  if (isSI()) {
+    // No flat_scr
+    const bool IsFlatScratch = RegNo == AMDGPU::FLAT_SCR ||
+                               RegNo == AMDGPU::FLAT_SCR_LO ||
+                               RegNo == AMDGPU::FLAT_SCR_HI;
+    return !IsFlatScratch;
+  }
+
+  // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that
+  // SI/CI have.
+  MCRegAliasIterator Alias(AMDGPU::SGPR102_SGPR103, &MRI, true);
+  while (Alias.isValid()) {
+    if (*Alias == RegNo)
+      return false;
+    ++Alias;
+  }
+
+  return true;
+}
+
+/// Parses one operand of the instruction \p Mnemonic.
+///
+/// Order of attempts: the custom operand parsers (MatchOperandParserImpl),
+/// then a register or immediate, then - for an identifier - an expression,
+/// falling back to a plain token.
+///
+/// \returns MatchOperand_Success if an operand was appended to \p Operands,
+/// otherwise the custom parser's result or MatchOperand_NoMatch.
+OperandMatchResultTy
+AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+  // Try to parse with a custom parser
+  OperandMatchResultTy Res = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // If we successfully parsed the operand or if there was an error parsing,
+  // we are done.
+  //
+  // If we are parsing after we reach EndOfStatement then this means we
+  // are appending default values to the Operands list. This is only done
+  // by the custom parser, so we shouldn't continue on to the generic parsing.
+  const bool CustomParserDecided =
+      Res == MatchOperand_Success || Res == MatchOperand_ParseFail;
+  if (CustomParserDecided || getLexer().is(AsmToken::EndOfStatement))
+    return Res;
+
+  Res = parseRegOrImm(Operands);
+  if (Res == MatchOperand_Success)
+    return Res;
+
+  if (getLexer().getKind() != AsmToken::Identifier)
+    return MatchOperand_NoMatch;
+
+  // If this identifier is a symbol, we want to create an expression for it.
+  // It is a little difficult to distinguish between a symbol name and an
+  // instruction flag like 'gds'. To do this, we parse all tokens as
+  // expressions and then treat the symbol name as the token string when we
+  // want to interpret the operand as a token.
+  const auto &Tok = Parser.getTok();
+  const SMLoc ExprLoc = Tok.getLoc();
+  const MCExpr *Expr = nullptr;
+  const bool ExprFailed = Parser.parseExpression(Expr);
+  if (!ExprFailed) {
+    Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, ExprLoc));
+    return MatchOperand_Success;
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), Tok.getLoc()));
+  Parser.Lex();
+  return MatchOperand_Success;
+}
+
+/// Strips an encoding suffix (_e64, _e32, _dpp or _sdwa) from \p Name and
+/// records the encoding it forces.
+///
+/// All forced-encoding state is reset first, so a mnemonic without a suffix
+/// leaves nothing forced.
+///
+/// \returns the mnemonic without its suffix, or \p Name unchanged.
+StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) {
+  // Clear any forced encodings from the previous instruction.
+  setForcedEncodingSize(0);
+  setForcedDPP(false);
+  setForcedSDWA(false);
+
+  const size_t Length = Name.size();
+
+  if (Name.endswith("_e64")) {
+    setForcedEncodingSize(64);
+    return Name.substr(0, Length - 4);
+  }
+  if (Name.endswith("_e32")) {
+    setForcedEncodingSize(32);
+    return Name.substr(0, Length - 4);
+  }
+  if (Name.endswith("_dpp")) {
+    setForcedDPP(true);
+    return Name.substr(0, Length - 4);
+  }
+  if (Name.endswith("_sdwa")) {
+    setForcedSDWA(true);
+    return Name.substr(0, Length - 5);
+  }
+  return Name;
+}
+
+/// Parses one instruction statement: the mnemonic (with any encoding suffix
+/// removed) followed by its operands up to the end of the statement.
+///
+/// On an operand error a diagnostic is reported and the rest of the statement
+/// is skipped.
+///
+/// \returns false on success, true if an operand failed to parse.
+bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                       StringRef Name,
+                                       SMLoc NameLoc, OperandVector &Operands) {
+  // Add the instruction mnemonic
+  Name = parseMnemonicSuffix(Name);
+  Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc));
+
+  while (getLexer().isNot(AsmToken::EndOfStatement)) {
+    const OperandMatchResultTy Res = parseOperand(Operands, Name);
+
+    // Eat the comma or space if there is one.
+    if (getLexer().is(AsmToken::Comma))
+      Parser.Lex();
+
+    if (Res == MatchOperand_Success)
+      continue;
+
+    const char *Message = (Res == MatchOperand_ParseFail)
+                              ? "failed parsing operand."
+                              : "not a valid operand.";
+    Error(getLexer().getLoc(), Message);
+
+    // Discard the remainder of the broken statement.
+    while (getLexer().isNot(AsmToken::EndOfStatement))
+      Parser.Lex();
+    return true;
+  }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Utility functions
+//===----------------------------------------------------------------------===//
+
+/// Parses "<Prefix>:<integer>" and stores the integer in \p Int.
+///
+/// \returns MatchOperand_NoMatch if the current token is not the identifier
+/// \p Prefix; MatchOperand_ParseFail if the prefix is present but not followed
+/// by ':' and an integer expression; MatchOperand_Success otherwise.
+OperandMatchResultTy
+AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
+  if (getLexer().getKind() != AsmToken::Identifier)
+    return MatchOperand_NoMatch;
+
+  const StringRef Word = Parser.getTok().getString();
+  if (!Word.equals(Prefix))
+    return MatchOperand_NoMatch;
+
+  // From here on the prefix has been recognised, so failures are hard errors.
+  Parser.Lex();
+  if (!getLexer().is(AsmToken::Colon))
+    return MatchOperand_ParseFail;
+
+  Parser.Lex();
+  if (!getLexer().is(AsmToken::Integer))
+    return MatchOperand_ParseFail;
+
+  if (getParser().parseAbsoluteExpression(Int))
+    return MatchOperand_ParseFail;
+
+  return MatchOperand_Success;
+}
+
+/// Parses "<Prefix>:<integer>" and appends the value to \p Operands as an
+/// immediate of type \p ImmTy.
+///
+/// If \p ConvertResult is non-null it is applied to the parsed value first; a
+/// false result from it makes the parse fail.
+///
+/// \returns the result of the underlying prefix parse if that did not succeed,
+/// MatchOperand_ParseFail if the conversion rejects the value, and
+/// MatchOperand_Success otherwise.
+OperandMatchResultTy
+AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
+                                    enum AMDGPUOperand::ImmTy ImmTy,
+                                    bool (*ConvertResult)(int64_t&)) {
+  const SMLoc Loc = Parser.getTok().getLoc();
+
+  int64_t Parsed = 0;
+  const OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Parsed);
+  if (Res != MatchOperand_Success)
+    return Res;
+
+  const bool Rejected = ConvertResult && !ConvertResult(Parsed);
+  if (Rejected)
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Parsed, Loc, ImmTy));
+  return MatchOperand_Success;
+}
+
+/// Parses the optional single-bit flag \p Name and appends it to \p Operands
+/// as an immediate of type \p ImmTy.
+///
+/// The identifier \p Name yields 1. An identifier that starts with "no" and
+/// ends with \p Name yields 0. At the end of the statement nothing is consumed
+/// and the default 0 is used.
+///
+/// \returns MatchOperand_NoMatch if the current token is neither form,
+/// MatchOperand_Success otherwise.
+OperandMatchResultTy
+AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
+                               enum AMDGPUOperand::ImmTy ImmTy) {
+  int64_t Value = 0;
+  const SMLoc Loc = Parser.getTok().getLoc();
+
+  // At the end of the statement this is a default argument, so the default
+  // value is kept and nothing is lexed.
+  if (!getLexer().is(AsmToken::EndOfStatement)) {
+    if (getLexer().getKind() != AsmToken::Identifier)
+      return MatchOperand_NoMatch;
+
+    const StringRef Word = Parser.getTok().getString();
+    const bool IsSet = Word == Name;
+    const bool IsCleared =
+        !IsSet && Word.startswith("no") && Word.endswith(Name);
+    if (!IsSet && !IsCleared)
+      return MatchOperand_NoMatch;
+
+    Value = IsSet ? 1 : 0;
+    Parser.Lex();
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Value, Loc, ImmTy));
+  return MatchOperand_Success;
+}
+
+typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
+
+/// Appends the optional immediate of type \p ImmT to \p Inst.
+///
+/// If \p OptionalIdx records a parsed operand of that type, that operand is
+/// added; otherwise an immediate with the value \p Default is added.
+void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands,
+                           OptionalImmIndexMap& OptionalIdx,
+                           enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) {
+  const auto Found = OptionalIdx.find(ImmT);
+  if (Found == OptionalIdx.end()) {
+    Inst.addOperand(MCOperand::createImm(Default));
+    return;
+  }
+
+  const unsigned OperandIndex = Found->second;
+  auto &Parsed = static_cast<AMDGPUOperand &>(*Operands[OperandIndex]);
+  Parsed.addImmOperands(Inst, 1);
+}
+
+/// Parses "<Prefix>:<identifier>" and stores the identifier text in \p Value.
+///
+/// The value identifier itself is not consumed; it is still the current token
+/// when this function returns successfully.
+///
+/// \returns MatchOperand_NoMatch if the current token is not the identifier
+/// \p Prefix; MatchOperand_ParseFail if the prefix is not followed by ':' and
+/// an identifier; MatchOperand_Success otherwise.
+OperandMatchResultTy
+AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
+  const bool AtPrefix = getLexer().is(AsmToken::Identifier) &&
+                        Parser.getTok().getString() == Prefix;
+  if (!AtPrefix)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex();
+  if (!getLexer().is(AsmToken::Colon))
+    return MatchOperand_ParseFail;
+
+  Parser.Lex();
+  if (!getLexer().is(AsmToken::Identifier))
+    return MatchOperand_ParseFail;
+
+  Value = Parser.getTok().getString();
+  return MatchOperand_Success;
+}
+
+//===----------------------------------------------------------------------===//
+// ds
+//===----------------------------------------------------------------------===//
+
+/// Converts the parsed operands of a DS instruction with two offsets into
+/// MCInst operands.
+///
+/// Register operands are added in source order, followed by offset0, offset1
+/// and gds (each defaulting to 0 when absent) and finally the M0 register.
+void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
+                                    const OperandVector &Operands) {
+  OptionalImmIndexMap Optional;
+
+  // Operand 0 is the mnemonic token, so start at 1.
+  const unsigned Count = Operands.size();
+  for (unsigned Idx = 1; Idx != Count; ++Idx) {
+    auto &Op = static_cast<AMDGPUOperand &>(*Operands[Idx]);
+
+    if (Op.isReg()) {
+      // Add the register arguments
+      Op.addRegOperands(Inst, 1);
+    } else {
+      // Remember where each optional argument was written.
+      Optional[Op.getImmTy()] = Idx;
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, Optional, AMDGPUOperand::ImmTyOffset0);
+  addOptionalImmOperand(Inst, Operands, Optional, AMDGPUOperand::ImmTyOffset1);
+  addOptionalImmOperand(Inst, Operands, Optional, AMDGPUOperand::ImmTyGDS);
+
+  Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
+}
+
+/// Converts the parsed operands of a single-offset DS instruction into MCInst
+/// operands.
+///
+/// Register operands are added in source order, followed by the offset
+/// immediate (0 when absent), the gds immediate and finally the M0 register.
+///
+/// When "gds" was parsed as a plain token rather than as an optional
+/// immediate, no gds immediate is appended at all. Otherwise exactly one is
+/// appended (0 when absent). Appending it a second time, or appending it for
+/// the token form, would leave the instruction with an extra operand.
+void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap OptionalIdx;
+  bool GDSOnly = false;
+
+  // Operand 0 is the mnemonic token, so start at 1.
+  for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+    AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[i]);
+
+    // Add the register arguments
+    if (Op.isReg()) {
+      Op.addRegOperands(Inst, 1);
+      continue;
+    }
+
+    // "gds" written as a token: remember it, but it adds no operand itself.
+    if (Op.isToken() && Op.getToken() == "gds") {
+      GDSOnly = true;
+      continue;
+    }
+
+    // Handle optional arguments
+    OptionalIdx[Op.getImmTy()] = i;
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+
+  // The gds immediate is emitted once, and only for the non-token form.
+  if (!GDSOnly) {
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
+  }
+  Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
+}
+
+/// Converts the parsed operands of an export instruction into MCInst operands.
+///
+/// Each source slot is either a register, which sets the corresponding bit of
+/// the enable mask, or "off", which adds NoRegister. The export target
+/// immediate is added where it appears and a "done" token is skipped. After
+/// the loop the optional vm and compr immediates (0 when absent) and the
+/// enable mask are appended.
+void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap Optional;
+
+  unsigned EnabledMask = 0;
+  int SrcSlot = 0;
+
+  // Operand 0 is the mnemonic token, so start at 1.
+  const unsigned Count = Operands.size();
+  for (unsigned Idx = 1; Idx != Count; ++Idx) {
+    auto &Op = static_cast<AMDGPUOperand &>(*Operands[Idx]);
+
+    if (Op.isReg()) {
+      // A real source register enables its slot.
+      EnabledMask |= (1 << SrcSlot);
+      Op.addRegOperands(Inst, 1);
+      ++SrcSlot;
+    } else if (Op.isOff()) {
+      // A disabled slot still occupies an operand position.
+      ++SrcSlot;
+      Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister));
+    } else if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyExpTgt) {
+      Op.addImmOperands(Inst, 1);
+    } else if (Op.isToken() && Op.getToken() == "done") {
+      // Nothing to add for the "done" token.
+    } else {
+      // Handle optional arguments
+      Optional[Op.getImmTy()] = Idx;
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, Optional, AMDGPUOperand::ImmTyExpVM);
+  addOptionalImmOperand(Inst, Operands, Optional, AMDGPUOperand::ImmTyExpCompr);
+
+  Inst.addOperand(MCOperand::createImm(EnabledMask));
+}
+
+//===----------------------------------------------------------------------===//
+// s_waitcnt
+//===----------------------------------------------------------------------===//
+
+/// Parses one "<counter>(<integer>)" term of an s_waitcnt operand and encodes
+/// it into \p IntVal. The counter is vmcnt, expcnt or lgkmcnt.
+///
+/// A '&' or ',' directly after the closing parenthesis is consumed as well.
+///
+/// \returns true on failure (malformed term or unknown counter name), false on
+/// success.
+bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
+  const StringRef Counter = Parser.getTok().getString();
+
+  Parser.Lex();
+  if (!getLexer().is(AsmToken::LParen))
+    return true;
+
+  Parser.Lex();
+  if (!getLexer().is(AsmToken::Integer))
+    return true;
+
+  int64_t Count;
+  if (getParser().parseAbsoluteExpression(Count))
+    return true;
+
+  if (!getLexer().is(AsmToken::RParen))
+    return true;
+  Parser.Lex();
+
+  // Terms may be joined by '&' or ','.
+  const bool AtSeparator =
+      getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma);
+  if (AtSeparator)
+    Parser.Lex();
+
+  IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
+  if (Counter == "vmcnt") {
+    IntVal = encodeVmcnt(IV, IntVal, Count);
+    return false;
+  }
+  if (Counter == "expcnt") {
+    IntVal = encodeExpcnt(IV, IntVal, Count);
+    return false;
+  }
+  if (Counter == "lgkmcnt") {
+    IntVal = encodeLgkmcnt(IV, IntVal, Count);
+    return false;
+  }
+
+  // Unknown counter name.
+  return true;
+}
+
+/// Parses the operand of s_waitcnt: either a plain integer expression or a
+/// sequence of "<counter>(<n>)" terms running to the end of the statement.
+///
+/// The encoded value starts as getWaitcntBitMask() for the current ISA version
+/// and each term is encoded into it by parseCnt().
+///
+/// \returns MatchOperand_Success with the immediate appended to \p Operands,
+/// or MatchOperand_ParseFail.
+OperandMatchResultTy
+AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
+  IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
+  int64_t Encoded = getWaitcntBitMask(IV);
+  const SMLoc Loc = Parser.getTok().getLoc();
+
+  const auto Kind = getLexer().getKind();
+  if (Kind == AsmToken::Integer) {
+    // The operand can be an integer value.
+    if (getParser().parseAbsoluteExpression(Encoded))
+      return MatchOperand_ParseFail;
+  } else if (Kind == AsmToken::Identifier) {
+    do {
+      if (parseCnt(Encoded))
+        return MatchOperand_ParseFail;
+    } while (!getLexer().is(AsmToken::EndOfStatement));
+  } else {
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Encoded, Loc));
+  return MatchOperand_Success;
+}
+
+/// Parses "hwreg(<id>)" or "hwreg(<id>, <offset>, <width>)".
+///
+/// <id> is either a symbolic name looked up in IdSymbolic or an integer
+/// expression. An unrecognised symbolic name leaves HwReg.Id at ID_UNKNOWN_
+/// without failing. \p Offset and \p Width are written only when the
+/// three-argument form is used.
+///
+/// \returns true on a syntax error, false otherwise.
+bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) {
+  using namespace llvm::AMDGPU::Hwreg;
+
+  // Parses ", <integer expression>" into Out. Returns true on failure.
+  auto parseCommaThenInt = [this](int64_t &Out) -> bool {
+    if (getLexer().isNot(AsmToken::Comma))
+      return true;
+    Parser.Lex();
+
+    if (getLexer().isNot(AsmToken::Integer))
+      return true;
+    return getParser().parseAbsoluteExpression(Out);
+  };
+
+  if (Parser.getTok().getString() != "hwreg")
+    return true;
+  Parser.Lex();
+
+  if (!getLexer().is(AsmToken::LParen))
+    return true;
+  Parser.Lex();
+
+  HwReg.IsSymbolic = getLexer().is(AsmToken::Identifier);
+  if (HwReg.IsSymbolic) {
+    HwReg.Id = ID_UNKNOWN_;
+    const StringRef Symbol = Parser.getTok().getString();
+    for (int Id = ID_SYMBOLIC_FIRST_; Id < ID_SYMBOLIC_LAST_; ++Id) {
+      if (Symbol == IdSymbolic[Id]) {
+        HwReg.Id = Id;
+        break;
+      }
+    }
+    Parser.Lex();
+  } else {
+    if (!getLexer().is(AsmToken::Integer))
+      return true;
+    if (getParser().parseAbsoluteExpression(HwReg.Id))
+      return true;
+  }
+
+  // Short form: only the register id was given.
+  if (getLexer().is(AsmToken::RParen)) {
+    Parser.Lex();
+    return false;
+  }
+
+  // optional params
+  if (parseCommaThenInt(Offset))
+    return true;
+  if (parseCommaThenInt(Width))
+    return true;
+
+  if (!getLexer().is(AsmToken::RParen))
+    return true;
+  Parser.Lex();
+
+  return false;
+}
+
+/// Parses a hardware-register operand: either a raw integer or a hwreg(...)
+/// construct, and appends it to \p Operands as an ImmTyHwreg immediate.
+///
+/// Out-of-range values are reported with Error() but an operand is still
+/// created, so that parsing can continue with the next operand.
+///
+/// \returns MatchOperand_NoMatch if the token is neither an integer nor an
+/// identifier or the integer expression cannot be evaluated;
+/// MatchOperand_ParseFail if the hwreg(...) construct is malformed;
+/// MatchOperand_Success otherwise.
+OperandMatchResultTy
+AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+  using namespace llvm::AMDGPU::Hwreg;
+
+  int64_t Encoded = 0;
+  const SMLoc Loc = Parser.getTok().getLoc();
+
+  const auto Kind = getLexer().getKind();
+  if (Kind == AsmToken::Integer) {
+    // The operand can be an integer value.
+    if (getParser().parseAbsoluteExpression(Encoded))
+      return MatchOperand_NoMatch;
+    if (Encoded < 0 || !isUInt<16>(Encoded)) {
+      Error(Loc, "invalid immediate: only 16-bit values are legal");
+      // Do not return an error code; create an imm operand anyway and proceed
+      // to the next operand, if any. That avoids unnecessary error messages.
+    }
+  } else if (Kind == AsmToken::Identifier) {
+    OperandInfoTy HwReg(ID_UNKNOWN_);
+    int64_t Offset = OFFSET_DEFAULT_;
+    int64_t Width = WIDTH_M1_DEFAULT_ + 1;
+    if (parseHwregConstruct(HwReg, Offset, Width))
+      return MatchOperand_ParseFail;
+
+    const bool BadId = HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id);
+    if (BadId) {
+      if (HwReg.IsSymbolic)
+        Error(Loc, "invalid symbolic name of hardware register");
+      else
+        Error(Loc, "invalid code of hardware register: only 6-bit values are legal");
+    }
+
+    const bool BadOffset = Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset);
+    if (BadOffset)
+      Error(Loc, "invalid bit offset: only 5-bit values are legal");
+
+    // The width is encoded as width minus one.
+    const bool BadWidth = (Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1);
+    if (BadWidth)
+      Error(Loc, "invalid bitfield width: only values from 1 to 32 are legal");
+
+    Encoded = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_);
+  } else {
+    return MatchOperand_NoMatch;
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Encoded, Loc, AMDGPUOperand::ImmTyHwreg));
+  return MatchOperand_Success;
+}
+
+/// Operand-class predicate for s_waitcnt: any immediate operand qualifies.
+bool AMDGPUOperand::isSWaitCnt() const {
+ return isImm();
+}
+
+/// True when this operand is an immediate tagged ImmTyHwreg (the tag that
+/// parseHwreg() attaches to the operands it creates).
+bool AMDGPUOperand::isHwreg() const {
+ return isImmTy(ImmTyHwreg);
+}
+
+/// Parse the textual form sendmsg(<msg>[, <operation>[, <stream id>]]).
+///
+/// \p Msg and \p Operation receive the parsed ids together with a flag that
+/// records whether each was written symbolically or as a number; \p StreamId
+/// is written only when a stream id is present in the source.
+/// Returns true on a syntax error and false otherwise. Semantic validation of
+/// the ids is left to the caller. Note that false is also returned, without
+/// consuming the remaining tokens, when the message id could not be resolved.
+bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) {
+ using namespace llvm::AMDGPU::SendMsg;
+
+ if (Parser.getTok().getString() != "sendmsg")
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::LParen))
+ return true;
+ Parser.Lex();
+
+ // Message id: either a symbolic name looked up in IdSymbolic, or an integer.
+ if (getLexer().is(AsmToken::Identifier)) {
+ Msg.IsSymbolic = true;
+ Msg.Id = ID_UNKNOWN_;
+ const std::string tok = Parser.getTok().getString();
+ for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
+ switch(i) {
+ default: continue; // Omit gaps.
+ case ID_INTERRUPT: case ID_GS: case ID_GS_DONE: case ID_SYSMSG: break;
+ }
+ if (tok == IdSymbolic[i]) {
+ Msg.Id = i;
+ break;
+ }
+ }
+ Parser.Lex();
+ } else {
+ Msg.IsSymbolic = false;
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Msg.Id))
+ return true;
+ // NOTE(review): a second integer token directly after the first is parsed
+ // into Msg.Id again; the intent is not evident from this function alone.
+ if (getLexer().is(AsmToken::Integer))
+ if (getParser().parseAbsoluteExpression(Msg.Id))
+ Msg.Id = ID_UNKNOWN_;
+ }
+ if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest.
+ return false;
+
+ // Messages other than GS, GS_DONE and SYSMSG take no further arguments.
+ if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) {
+ if (getLexer().isNot(AsmToken::RParen))
+ return true;
+ Parser.Lex();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ // Operation id: the symbol table and id range depend on the message kind.
+ assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG);
+ Operation.Id = ID_UNKNOWN_;
+ if (getLexer().is(AsmToken::Identifier)) {
+ Operation.IsSymbolic = true;
+ const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
+ const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
+ const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
+ const StringRef Tok = Parser.getTok().getString();
+ for (int i = F; i < L; ++i) {
+ if (Tok == S[i]) {
+ Operation.Id = i;
+ break;
+ }
+ }
+ Parser.Lex();
+ } else {
+ Operation.IsSymbolic = false;
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Operation.Id))
+ return true;
+ }
+
+ // Only GS / GS_DONE with an operation other than NOP may carry a stream id.
+ if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
+ // Stream id is optional.
+ if (getLexer().is(AsmToken::RParen)) {
+ Parser.Lex();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(StreamId))
+ return true;
+ }
+
+ if (getLexer().isNot(AsmToken::RParen))
+ return true;
+ Parser.Lex();
+ return false;
+}
+
+/// Parse an interpolation slot name and append it to \p Operands as an
+/// ImmTyInterpSlot immediate: "p10" -> 0, "p20" -> 1, "p0" -> 2.
+/// A non-identifier token yields MatchOperand_NoMatch; an identifier that is
+/// not one of the three names yields MatchOperand_ParseFail and is not
+/// consumed.
+OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) {
+  if (getLexer().isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  // Capture name and location before the token is consumed.
+  const StringRef SlotName = Parser.getTok().getString();
+  const SMLoc SlotLoc = Parser.getTok().getLoc();
+
+  int SlotId;
+  if (SlotName == "p10")
+    SlotId = 0;
+  else if (SlotName == "p20")
+    SlotId = 1;
+  else if (SlotName == "p0")
+    SlotId = 2;
+  else
+    return MatchOperand_ParseFail;
+
+  Parser.Lex();
+  Operands.push_back(AMDGPUOperand::CreateImm(
+      this, SlotId, SlotLoc, AMDGPUOperand::ImmTyInterpSlot));
+  return MatchOperand_Success;
+}
+
+/// Parse an interpolation attribute written as attr<N>.<chan>, where <chan>
+/// is one of x, y, z, w, and append two immediates to \p Operands: the
+/// attribute number (ImmTyInterpAttr) and the channel index 0..3
+/// (ImmTyAttrChan).
+///
+/// Returns MatchOperand_NoMatch when the token is not an identifier starting
+/// with "attr", and MatchOperand_ParseFail when the channel suffix or the
+/// number between prefix and suffix is not recognized.
+OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
+ if (getLexer().getKind() != AsmToken::Identifier)
+ return MatchOperand_NoMatch;
+
+ StringRef Str = Parser.getTok().getString();
+ if (!Str.startswith("attr"))
+ return MatchOperand_NoMatch;
+
+ // The channel is the trailing two characters (".x" .. ".w").
+ StringRef Chan = Str.take_back(2);
+ int AttrChan = StringSwitch<int>(Chan)
+ .Case(".x", 0)
+ .Case(".y", 1)
+ .Case(".z", 2)
+ .Case(".w", 3)
+ .Default(-1);
+ if (AttrChan == -1)
+ return MatchOperand_ParseFail;
+
+ // What remains after stripping the channel suffix and the "attr" prefix
+ // must be the decimal attribute number.
+ Str = Str.drop_back(2).drop_front(4);
+
+ uint8_t Attr;
+ if (Str.getAsInteger(10, Attr))
+ return MatchOperand_ParseFail;
+
+ SMLoc S = Parser.getTok().getLoc();
+ Parser.Lex();
+ // NOTE(review): on an out-of-range attribute the error is reported and
+ // Success is returned without pushing any operand; confirm this is what the
+ // caller expects.
+ if (Attr > 63) {
+ Error(S, "out of bounds attr");
+ return MatchOperand_Success;
+ }
+
+ // Chan still points into the token text, so it gives the channel's location.
+ SMLoc SChan = SMLoc::getFromPointer(Chan.data());
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Attr, S,
+ AMDGPUOperand::ImmTyInterpAttr));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, AttrChan, SChan,
+ AMDGPUOperand::ImmTyAttrChan));
+ return MatchOperand_Success;
+}
+
+/// Report "invalid exp target" at the location of the current token.
+void AMDGPUAsmParser::errorExpTgt() {
+ Error(Parser.getTok().getLoc(), "invalid exp target");
+}
+
+/// Translate an export-target name into its numeric id, stored in \p Val.
+///
+/// Recognized spellings: "null" (9), "mrtz" (8), "mrt<N>" (N), "pos<N>"
+/// (N + 12), "param<N>" (N + 32) and "invalid_target_<N>" (N, always
+/// diagnosed). An out-of-range N is reported through errorExpTgt() but the
+/// result is still MatchOperand_Success. A numeric suffix that getAsInteger()
+/// rejects gives MatchOperand_ParseFail; any other name gives
+/// MatchOperand_NoMatch.
+OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str,
+                                                      uint8_t &Val) {
+  if (Str == "null") {
+    Val = 9;
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("mrt")) {
+    const StringRef Suffix = Str.drop_front(3);
+    // "mrtz" is the one non-numeric member of the mrt family.
+    if (Suffix == "z") {
+      Val = 8;
+      return MatchOperand_Success;
+    }
+    if (Suffix.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+    if (Val > 7)
+      errorExpTgt();
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("pos")) {
+    const StringRef Suffix = Str.drop_front(3);
+    if (Suffix.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+    if (Val > 3)
+      errorExpTgt();
+    Val += 12;
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("param")) {
+    const StringRef Suffix = Str.drop_front(5);
+    if (Suffix.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+    if (Val >= 32)
+      errorExpTgt();
+    Val += 32;
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("invalid_target_")) {
+    const StringRef Suffix = Str.drop_front(15);
+    if (Suffix.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+    // This spelling is accepted syntactically but is always an error.
+    errorExpTgt();
+    return MatchOperand_Success;
+  }
+
+  return MatchOperand_NoMatch;
+}
+
+/// Parse an export target from the current token and append it to
+/// \p Operands as an ImmTyExpTgt immediate. Any result other than
+/// MatchOperand_Success from parseExpTgtImpl() is passed through unchanged
+/// and the token is left unconsumed.
+OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
+  const SMLoc TgtLoc = Parser.getTok().getLoc();
+
+  uint8_t TgtId;
+  const OperandMatchResultTy Status =
+      parseExpTgtImpl(Parser.getTok().getString(), TgtId);
+  if (Status != MatchOperand_Success)
+    return Status;
+
+  Parser.Lex();
+  Operands.push_back(
+      AMDGPUOperand::CreateImm(this, TgtId, TgtLoc, AMDGPUOperand::ImmTyExpTgt));
+  return MatchOperand_Success;
+}
+
+/// Parse the s_sendmsg operand and append it to \p Operands as a single
+/// ImmTySendMsg immediate.
+///
+/// The operand is either a plain integer expression or a sendmsg(...)
+/// construct handled by parseSendMsgConstruct(). Returns MatchOperand_NoMatch
+/// for any other token kind (or an unparsable integer expression) and
+/// MatchOperand_ParseFail when the construct is syntactically invalid.
+/// Semantic errors are diagnosed with Error(), yet an operand holding the bits
+/// encoded up to that point is still created.
+OperandMatchResultTy
+AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::SendMsg;
+
+ int64_t Imm16Val = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ switch(getLexer().getKind()) {
+ default:
+ return MatchOperand_NoMatch;
+ case AsmToken::Integer:
+ // The operand can be an integer value.
+ if (getParser().parseAbsoluteExpression(Imm16Val))
+ return MatchOperand_NoMatch;
+ if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
+ Error(S, "invalid immediate: only 16-bit values are legal");
+ // Do not return error code, but create an imm operand anyway and proceed
+ // to the next operand, if any. That avoids unnecessary error messages.
+ }
+ break;
+ case AsmToken::Identifier: {
+ OperandInfoTy Msg(ID_UNKNOWN_);
+ OperandInfoTy Operation(OP_UNKNOWN_);
+ int64_t StreamId = STREAM_ID_DEFAULT_;
+ if (parseSendMsgConstruct(Msg, Operation, StreamId))
+ return MatchOperand_ParseFail;
+ // Single-pass do/while: each 'break' abandons validation after the first
+ // reported error and falls through to operand creation.
+ do {
+ // Validate and encode message ID.
+ if (! ((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE)
+ || Msg.Id == ID_SYSMSG)) {
+ if (Msg.IsSymbolic)
+ Error(S, "invalid/unsupported symbolic name of message");
+ else
+ Error(S, "invalid/unsupported code of message");
+ break;
+ }
+ Imm16Val = (Msg.Id << ID_SHIFT_);
+ // Validate and encode operation ID.
+ if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) {
+ if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) {
+ if (Operation.IsSymbolic)
+ Error(S, "invalid symbolic name of GS_OP");
+ else
+ Error(S, "invalid code of GS_OP: only 2-bit values are legal");
+ break;
+ }
+ if (Operation.Id == OP_GS_NOP
+ && Msg.Id != ID_GS_DONE) {
+ Error(S, "invalid GS_OP: NOP is for GS_DONE only");
+ break;
+ }
+ Imm16Val |= (Operation.Id << OP_SHIFT_);
+ }
+ if (Msg.Id == ID_SYSMSG) {
+ if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) {
+ if (Operation.IsSymbolic)
+ Error(S, "invalid/unsupported symbolic name of SYSMSG_OP");
+ else
+ Error(S, "invalid/unsupported code of SYSMSG_OP");
+ break;
+ }
+ Imm16Val |= (Operation.Id << OP_SHIFT_);
+ }
+ // Validate and encode stream ID.
+ if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
+ if (! (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) {
+ Error(S, "invalid stream id: only 2-bit values are legal");
+ break;
+ }
+ Imm16Val |= (StreamId << STREAM_ID_SHIFT_);
+ }
+ } while (false);
+ }
+ break;
+ }
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTySendMsg));
+ return MatchOperand_Success;
+}
+
+/// True when this operand is an immediate tagged ImmTySendMsg (the tag that
+/// parseSendMsgOp() attaches to the operands it creates).
+bool AMDGPUOperand::isSendMsg() const {
+ return isImmTy(ImmTySendMsg);
+}
+
+//===----------------------------------------------------------------------===//
+// sopp branch targets
+//===----------------------------------------------------------------------===//
+
+/// Parse a SOPP branch target and append it to \p Operands.
+///
+/// An integer token is parsed as an absolute expression and stored as an
+/// immediate; an identifier is turned into a symbol reference expression
+/// (creating the symbol if needed). Every other token kind, and an integer
+/// expression that fails to parse, yields MatchOperand_ParseFail.
+OperandMatchResultTy
+AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
+  const SMLoc TargetLoc = Parser.getTok().getLoc();
+
+  if (getLexer().is(AsmToken::Integer)) {
+    int64_t Offset;
+    if (getParser().parseAbsoluteExpression(Offset))
+      return MatchOperand_ParseFail;
+    Operands.push_back(AMDGPUOperand::CreateImm(this, Offset, TargetLoc));
+    return MatchOperand_Success;
+  }
+
+  if (getLexer().is(AsmToken::Identifier)) {
+    // Build the symbol reference from the label name, then consume the token.
+    auto *Label = getContext().getOrCreateSymbol(Parser.getTok().getString());
+    const auto *LabelRef = MCSymbolRefExpr::create(Label, getContext());
+    Operands.push_back(AMDGPUOperand::CreateExpr(this, LabelRef, TargetLoc));
+    Parser.Lex();
+    return MatchOperand_Success;
+  }
+
+  return MatchOperand_ParseFail;
+}
+
+//===----------------------------------------------------------------------===//
+// mubuf
+//===----------------------------------------------------------------------===//
+
+/// Build an ImmTyGLC immediate with value 0 and an empty source location.
+/// Presumably the stand-in used when "glc" is not written; confirm against
+/// the generated matcher.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC);
+}
+
+/// Build an ImmTySLC immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC);
+}
+
+/// Build an ImmTyTFE immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyTFE);
+}
+
+/// Convert the parsed operands of a MUBUF instruction into \p Inst.
+///
+/// Registers and untyped immediates are appended in source order, tokens are
+/// dropped, and typed immediates are remembered so the optional modifiers can
+/// be appended afterwards in a fixed order (offset, glc, slc, tfe).
+/// \p IsAtomic suppresses the glc operand; \p IsAtomicReturn (which requires
+/// \p IsAtomic) additionally duplicates the first operand at the front.
+void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
+                                   const OperandVector &Operands,
+                                   bool IsAtomic, bool IsAtomicReturn) {
+  OptionalImmIndexMap OptionalIdx;
+  assert(IsAtomicReturn ? IsAtomic : true);
+
+  const unsigned NumParsed = Operands.size();
+  for (unsigned Idx = 1; Idx != NumParsed; ++Idx) {
+    AMDGPUOperand &Cur = ((AMDGPUOperand &)*Operands[Idx]);
+
+    if (Cur.isReg()) {
+      // Register arguments go straight into the instruction.
+      Cur.addRegOperands(Inst, 1);
+    } else if (Cur.isImm() && Cur.getImmTy() == AMDGPUOperand::ImmTyNone) {
+      // soffset written as an immediate.
+      Cur.addImmOperands(Inst, 1);
+    } else if (Cur.isToken()) {
+      // Tokens like 'offen' are sometimes hard-coded into the asm string and
+      // have no MCInst operand.
+    } else {
+      // Whatever is left must be an optional, typed immediate.
+      assert(Cur.isImm());
+      OptionalIdx[Cur.getImmTy()] = Idx;
+    }
+  }
+
+  // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns.
+  if (IsAtomicReturn) {
+    MCInst::iterator First = Inst.begin(); // $vdata_in is always first.
+    Inst.insert(First, *First);
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+  if (!IsAtomic) // glc is hard-coded for atomics.
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+}
+
+//===----------------------------------------------------------------------===//
+// mimg
+//===----------------------------------------------------------------------===//
+
+/// Convert the parsed operands of a MIMG instruction into \p Inst.
+///
+/// The instruction's defs are added first as registers, then every remaining
+/// register or plain immediate in source order. Immediate modifiers are
+/// collected and appended last in the fixed order dmask, unorm, glc, da,
+/// r128, tfe, lwe, slc.
+void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) {
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  const unsigned NumDefs = Desc.getNumDefs();
+
+  // Operands[0] is the mnemonic; the defs follow it.
+  unsigned Idx = 1;
+  for (; Idx <= NumDefs; ++Idx)
+    ((AMDGPUOperand &)*Operands[Idx]).addRegOperands(Inst, 1);
+
+  OptionalImmIndexMap OptionalIdx;
+  for (const unsigned End = Operands.size(); Idx != End; ++Idx) {
+    AMDGPUOperand &Cur = ((AMDGPUOperand &)*Operands[Idx]);
+
+    if (Cur.isRegOrImm()) {
+      Cur.addRegOrImmOperands(Inst, 1);
+      continue;
+    }
+    if (!Cur.isImmModifier())
+      llvm_unreachable("unexpected operand type");
+    OptionalIdx[Cur.getImmTy()] = Idx;
+  }
+
+  // Order in which the optional modifiers appear in the MCInst.
+  const AMDGPUOperand::ImmTy ModifierOrder[] = {
+      AMDGPUOperand::ImmTyDMask, AMDGPUOperand::ImmTyUNorm,
+      AMDGPUOperand::ImmTyGLC,   AMDGPUOperand::ImmTyDA,
+      AMDGPUOperand::ImmTyR128,  AMDGPUOperand::ImmTyTFE,
+      AMDGPUOperand::ImmTyLWE,   AMDGPUOperand::ImmTySLC};
+  for (const AMDGPUOperand::ImmTy Ty : ModifierOrder)
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, Ty);
+}
+
+/// Convert the parsed operands of a MIMG atomic instruction into \p Inst.
+///
+/// Works like cvtMIMG(), except that the destination register is added a
+/// second time right after the defs, as the source operand that is the same
+/// register as dst. Immediate modifiers are appended last in the fixed order
+/// dmask, unorm, glc, da, r128, tfe, lwe, slc.
+void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
+  unsigned I = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+    ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+  }
+
+  // Add src, same as dst. After the loop above I indexes the operand that
+  // FOLLOWS dst, so the register to repeat is Operands[I - 1]. Using
+  // Operands[I] here would add the next operand twice (once here and once in
+  // the loop below) and never add dst as the source.
+  assert(Desc.getNumDefs() == 1);
+  ((AMDGPUOperand &)*Operands[I - 1]).addRegOperands(Inst, 1);
+
+  OptionalImmIndexMap OptionalIdx;
+
+  for (unsigned E = Operands.size(); I != E; ++I) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+
+    // Add the register arguments
+    if (Op.isRegOrImm()) {
+      Op.addRegOrImmOperands(Inst, 1);
+      continue;
+    } else if (Op.isImmModifier()) {
+      // Remember where each optional modifier was written.
+      OptionalIdx[Op.getImmTy()] = I;
+    } else {
+      llvm_unreachable("unexpected operand type");
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+}
+
+/// Build an ImmTyDMask immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask);
+}
+
+/// Build an ImmTyUNorm immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyUNorm);
+}
+
+/// Build an ImmTyDA immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDA);
+}
+
+/// Build an ImmTyR128 immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyR128);
+}
+
+/// Build an ImmTyLWE immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE);
+}
+
+//===----------------------------------------------------------------------===//
+// smrd
+//===----------------------------------------------------------------------===//
+
+/// True for an immediate whose value fits in 8 unsigned bits.
+bool AMDGPUOperand::isSMRDOffset8() const {
+ return isImm() && isUInt<8>(getImm());
+}
+
+/// True for an immediate whose value fits in 20 unsigned bits.
+bool AMDGPUOperand::isSMRDOffset20() const {
+ return isImm() && isUInt<20>(getImm());
+}
+
+/// True for an immediate that needs more than 8 bits but fits in 32 unsigned
+/// bits, i.e. one that isSMRDOffset8() rejects.
+bool AMDGPUOperand::isSMRDLiteralOffset() const {
+ // 32-bit literals are only supported on CI and we only want to use them
+ // when the offset is > 8-bits.
+ return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
+}
+
+/// Build an ImmTyOffset immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset8() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+/// Build an ImmTyOffset immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset20() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+/// Build an ImmTyOffset immediate with value 0 and an empty source location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+//===----------------------------------------------------------------------===//
+// vop3
+//===----------------------------------------------------------------------===//
+
+/// Convert the value written after "mul:" into its encoded form, in place:
+/// 1 -> 0, 2 -> 1, 4 -> 2. Returns false and leaves \p Mul untouched for any
+/// other value.
+static bool ConvertOmodMul(int64_t &Mul) {
+  switch (Mul) {
+  case 1:
+  case 2:
+  case 4:
+    Mul >>= 1;
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Convert the value written after "div:" into its encoded form, in place:
+/// 1 -> 0, 2 -> 3. Returns false and leaves \p Div untouched for any other
+/// value.
+static bool ConvertOmodDiv(int64_t &Div) {
+  switch (Div) {
+  case 1:
+    Div = 0;
+    return true;
+  case 2:
+    Div = 3;
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Convert the value written after "bound_ctrl:" into its encoded form, in
+/// place: 0 -> 1, -1 -> 0. Returns false and leaves \p BoundCtrl untouched
+/// for any other value.
+static bool ConvertBoundCtrl(int64_t &BoundCtrl) {
+  switch (BoundCtrl) {
+  case 0:
+    BoundCtrl = 1;
+    return true;
+  case -1:
+    BoundCtrl = 0;
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Table of optional operands that parseOptionalOperand() tries one after
+// another. Each entry is { name, immediate type, is-bit flag, converter }:
+// entries with the flag set are parsed with parseNamedBit(), the others as a
+// prefixed value, optionally post-processed by the converter callback.
+// The "omod" and SDWA entries are dispatched by their type to dedicated
+// parsers, so their name/converter fields are not what drives parsing.
+// Note: the order in this table matches the order of operands in AsmString.
+static const OptionalOperand AMDGPUOptionalOperandTable[] = {
+ {"offen", AMDGPUOperand::ImmTyOffen, true, nullptr},
+ {"idxen", AMDGPUOperand::ImmTyIdxen, true, nullptr},
+ {"addr64", AMDGPUOperand::ImmTyAddr64, true, nullptr},
+ {"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr},
+ {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
+ {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
+ {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
+ {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
+ {"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
+ {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
+ {"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr},
+ {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
+ {"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr},
+ {"da", AMDGPUOperand::ImmTyDA, true, nullptr},
+ {"r128", AMDGPUOperand::ImmTyR128, true, nullptr},
+ {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
+ {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
+ {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
+ {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
+ {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
+ {"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr},
+ {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
+ {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
+ {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
+ {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
+};
+
+/// Try every entry of AMDGPUOptionalOperandTable, in table order, against the
+/// current token and return the first result that is not
+/// MatchOperand_NoMatch. Returns MatchOperand_NoMatch when no entry applies.
+OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
+  for (const OptionalOperand &Entry : AMDGPUOptionalOperandTable) {
+    OperandMatchResultTy Status;
+
+    if (Entry.IsBit) {
+      Status = parseNamedBit(Entry.Name, Operands, Entry.Type);
+    } else {
+      // Non-bit operands: a few types have dedicated parsers, the rest are
+      // "<name>:<int>" values.
+      switch (Entry.Type) {
+      case AMDGPUOperand::ImmTyOModSI:
+        Status = parseOModOperand(Operands);
+        break;
+      case AMDGPUOperand::ImmTySdwaDstSel:
+      case AMDGPUOperand::ImmTySdwaSrc0Sel:
+      case AMDGPUOperand::ImmTySdwaSrc1Sel:
+        Status = parseSDWASel(Operands, Entry.Name, Entry.Type);
+        break;
+      case AMDGPUOperand::ImmTySdwaDstUnused:
+        Status = parseSDWADstUnused(Operands);
+        break;
+      default:
+        Status = parseIntWithPrefix(Entry.Name, Operands, Entry.Type,
+                                    Entry.ConvertResult);
+        break;
+      }
+    }
+
+    if (Status != MatchOperand_NoMatch)
+      return Status;
+  }
+  return MatchOperand_NoMatch;
+}
+
+/// Parse an output modifier written with the prefix "mul" or "div" into an
+/// ImmTyOModSI immediate, using ConvertOmodMul or ConvertOmodDiv respectively
+/// to convert the value. Any other leading token yields MatchOperand_NoMatch.
+OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) {
+  const StringRef Prefix = Parser.getTok().getString();
+
+  const bool IsMul = (Prefix == "mul");
+  if (!IsMul && Prefix != "div")
+    return MatchOperand_NoMatch;
+
+  return parseIntWithPrefix(IsMul ? "mul" : "div", Operands,
+                            AMDGPUOperand::ImmTyOModSI,
+                            IsMul ? ConvertOmodMul : ConvertOmodDiv);
+}
+
+/// Straight one-to-one conversion: the instruction's defs are added as
+/// registers, every following parsed operand as a register or immediate.
+void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) {
+  const unsigned NumDefs = MII.get(Inst.getOpcode()).getNumDefs();
+  const unsigned NumParsed = Operands.size();
+
+  // Operands[0] is the mnemonic, so the defs occupy indices 1..NumDefs.
+  unsigned Idx = 1;
+  for (; Idx <= NumDefs; ++Idx)
+    ((AMDGPUOperand &)*Operands[Idx]).addRegOperands(Inst, 1);
+
+  for (; Idx != NumParsed; ++Idx)
+    ((AMDGPUOperand &)*Operands[Idx]).addRegOrImmOperands(Inst, 1);
+}
+
+/// Pick the converter from the matched opcode's flags: cvtVOP3() when the
+/// VOP3 flag is set, the plain cvtId() otherwise.
+void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) {
+  const uint64_t Flags = MII.get(Inst.getOpcode()).TSFlags;
+
+  if ((Flags & SIInstrFlags::VOP3) == 0) {
+    cvtId(Inst, Operands);
+    return;
+  }
+  cvtVOP3(Inst, Operands);
+}
+
+/// Tell whether operand \p OpNum of \p Desc is an input-modifiers slot that
+/// is immediately followed by its (untied) register-class source operand.
+///
+/// The bound check is performed before any OpInfo entry is read, so passing
+/// an \p OpNum at or past the end of the operand list simply yields false
+/// instead of indexing outside Desc.OpInfo.
+static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
+  // This must not be the last operand: both OpNum and OpNum + 1 are indexed
+  // below.
+  if (Desc.NumOperands <= (OpNum + 1))
+    return false;
+
+  // 1. This operand is input modifiers
+  return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS
+      // 2. Next operand is register class
+      && Desc.OpInfo[OpNum + 1].RegClass != -1
+      // 3. Next register is not tied to any other operand
+      && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1;
+}
+
+/// Convert the parsed operands of a VOP3 instruction into \p Inst.
+///
+/// Defs are added first. Each following parsed operand either fills a
+/// (modifiers, source) pair, when the next free MCInst slot is an
+/// input-modifiers operand, or is recorded as an optional immediate. clamp and
+/// omod are appended afterwards, and the v_mac opcodes get their tied src2
+/// (plus a zero src2_modifiers) inserted at the end.
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ // Inst.getNumOperands() is the index of the next MCInst operand to fill.
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ // Emits two MCInst operands: the modifiers and the source itself.
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isImm()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("unhandled operand type");
+ }
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+
+ // special case v_mac_{f16, f32}:
+ // it has src2 register operand that is tied to dst operand
+ // we don't allow modifiers for this operand in assembler so src2_modifiers
+ // should be 0
+ if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
+ Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
+ Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) {
+ auto it = Inst.begin();
+ // The operand index is looked up on V_MAC_F16_e64 / V_MAC_F32_e64 rather
+ // than on the _si/_vi opcode held by Inst.
+ std::advance(
+ it,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ?
+ AMDGPU::V_MAC_F16_e64 :
+ AMDGPU::V_MAC_F32_e64,
+ AMDGPU::OpName::src2_modifiers));
+ it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
+ ++it;
+ // NOTE(review): the value inserted is a reference to Inst's own operand 0;
+ // confirm that insert() tolerates an argument aliasing its own storage.
+ Inst.insert(it, Inst.getOperand(0)); // src2 = dst
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// dpp
+//===----------------------------------------------------------------------===//
+
+/// True when this operand is an ImmTyDppCtrl immediate whose 9-bit value is
+/// one of the accepted control codes: 0x000-0x0ff, 0x101-0x10f, 0x111-0x11f,
+/// 0x121-0x12f, or one of the single codes 0x130, 0x134, 0x138, 0x13c,
+/// 0x140-0x143.
+bool AMDGPUOperand::isDPPCtrl() const {
+  if (!isImm() || getImmTy() != ImmTyDppCtrl)
+    return false;
+
+  const int64_t Ctrl = getImm();
+  if (!isUInt<9>(Ctrl))
+    return false;
+
+  // Contiguous ranges first.
+  if ((Ctrl >= 0x000 && Ctrl <= 0x0ff) ||
+      (Ctrl >= 0x101 && Ctrl <= 0x10f) ||
+      (Ctrl >= 0x111 && Ctrl <= 0x11f) ||
+      (Ctrl >= 0x121 && Ctrl <= 0x12f))
+    return true;
+
+  // Remaining accepted values are isolated codes.
+  switch (Ctrl) {
+  case 0x130:
+  case 0x134:
+  case 0x138:
+  case 0x13c:
+  case 0x140:
+  case 0x141:
+  case 0x142:
+  case 0x143:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// True for an immediate whose value fits in 4 unsigned bits.
+bool AMDGPUOperand::isGPRIdxMode() const {
+ return isImm() && isUInt<4>(getImm());
+}
+
+/// Parse a DPP control specifier and append its encoded value to \p Operands
+/// as an ImmTyDppCtrl immediate.
+///
+/// Accepted forms and their encodings, as implemented below:
+///   row_mirror                 -> 0x140
+///   row_half_mirror            -> 0x141
+///   quad_perm:[a,b,c,d]        -> a | b<<2 | c<<4 | d<<6   (each 0..3)
+///   row_shl:n / row_shr:n / row_ror:n (n = 1..15) -> 0x100|n, 0x110|n, 0x120|n
+///   wave_shl:1 / wave_rol:1 / wave_shr:1 / wave_ror:1
+///                              -> 0x130, 0x134, 0x138, 0x13C
+///   row_bcast:15 / row_bcast:31 -> 0x142, 0x143
+/// Returns MatchOperand_NoMatch for a non-identifier or an unknown keyword,
+/// MatchOperand_ParseFail for a known keyword with a malformed or
+/// out-of-range argument.
+OperandMatchResultTy
+AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Prefix;
+ int64_t Int;
+
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ Prefix = Parser.getTok().getString();
+ } else {
+ return MatchOperand_NoMatch;
+ }
+
+ if (Prefix == "row_mirror") {
+ Int = 0x140;
+ Parser.Lex();
+ } else if (Prefix == "row_half_mirror") {
+ Int = 0x141;
+ Parser.Lex();
+ } else {
+ // Check to prevent parseDPPCtrlOps from eating invalid tokens
+ if (Prefix != "quad_perm"
+ && Prefix != "row_shl"
+ && Prefix != "row_shr"
+ && Prefix != "row_ror"
+ && Prefix != "wave_shl"
+ && Prefix != "wave_rol"
+ && Prefix != "wave_shr"
+ && Prefix != "wave_ror"
+ && Prefix != "row_bcast") {
+ return MatchOperand_NoMatch;
+ }
+
+ // From here on the keyword has been consumed, so failures are ParseFail.
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon))
+ return MatchOperand_ParseFail;
+
+ if (Prefix == "quad_perm") {
+ // quad_perm:[%d,%d,%d,%d]
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+
+ // First selector occupies bits [1:0].
+ if (getParser().parseAbsoluteExpression(Int) || !(0 <= Int && Int <=3))
+ return MatchOperand_ParseFail;
+
+ // The remaining three selectors are packed two bits apart.
+ for (int i = 0; i < 3; ++i) {
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+
+ int64_t Temp;
+ if (getParser().parseAbsoluteExpression(Temp) || !(0 <= Temp && Temp <=3))
+ return MatchOperand_ParseFail;
+ const int shift = i*2 + 2;
+ Int += (Temp << shift);
+ }
+
+ if (getLexer().isNot(AsmToken::RBrac))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+
+ } else {
+ // sel:%d
+ Parser.Lex();
+ if (getParser().parseAbsoluteExpression(Int))
+ return MatchOperand_ParseFail;
+
+ // Map keyword + argument onto the control code; any combination not
+ // listed falls through to ParseFail.
+ if (Prefix == "row_shl" && 1 <= Int && Int <= 15) {
+ Int |= 0x100;
+ } else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) {
+ Int |= 0x110;
+ } else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) {
+ Int |= 0x120;
+ } else if (Prefix == "wave_shl" && 1 == Int) {
+ Int = 0x130;
+ } else if (Prefix == "wave_rol" && 1 == Int) {
+ Int = 0x134;
+ } else if (Prefix == "wave_shr" && 1 == Int) {
+ Int = 0x138;
+ } else if (Prefix == "wave_ror" && 1 == Int) {
+ Int = 0x13C;
+ } else if (Prefix == "row_bcast") {
+ if (Int == 15) {
+ Int = 0x142;
+ } else if (Int == 31) {
+ Int = 0x143;
+ } else {
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ return MatchOperand_ParseFail;
+ }
+ }
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTyDppCtrl));
+ return MatchOperand_Success;
+}
+
+/// Build an ImmTyDppRowMask immediate with value 0xf and an empty source
+/// location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const {
+ return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask);
+}
+
+/// Build an ImmTyDppBankMask immediate with value 0xf and an empty source
+/// location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const {
+ return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask);
+}
+
+/// Build an ImmTyDppBoundCtrl immediate with value 0 and an empty source
+/// location.
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
+}
+
+/// Convert the parsed operands of a DPP instruction into \p Inst.
+///
+/// Defs are added first. Each following parsed operand is then either skipped
+/// (a written "vcc" register), emitted as a (modifiers, source) pair, emitted
+/// as the dpp_ctrl immediate, or recorded as an optional immediate. row_mask
+/// and bank_mask (both defaulting to 0xf) and bound_ctrl are appended last,
+/// and the v_mac opcodes get their tied src2 inserted.
+void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ // Add the register arguments
+ if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+ // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token.
+ // Skip it.
+ // NOTE(review): the line above says "sdwa" although this is the DPP
+ // converter; presumably it was copied from cvtSDWA.
+ continue;
+ // The 'if' below is not chained with 'else'; because of the 'continue'
+ // above the behavior is the same as an else-if chain.
+ } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isDPPCtrl()) {
+ Op.addImmOperands(Inst, 1);
+ } else if (Op.isImm()) {
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+
+ // special case v_mac_{f16, f32}:
+ // it has src2 register operand that is tied to dst operand
+ if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp ||
+ Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) {
+ auto it = Inst.begin();
+ std::advance(
+ it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+ // NOTE(review): the value inserted is a reference to Inst's own operand 0;
+ // confirm that insert() tolerates an argument aliasing its own storage.
+ Inst.insert(it, Inst.getOperand(0)); // src2 = dst
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// sdwa
+//===----------------------------------------------------------------------===//
+
+/// Parse an SDWA select operand of the form <Prefix>:<NAME>, where NAME is
+/// one of BYTE_0..BYTE_3, WORD_0, WORD_1 or DWORD, and append it to
+/// \p Operands as an immediate of kind \p Type.
+///
+/// A result other than MatchOperand_Success from parseStringWithPrefix() is
+/// passed through. An unknown NAME is consumed and yields
+/// MatchOperand_ParseFail.
+OperandMatchResultTy
+AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
+                              AMDGPUOperand::ImmTy Type) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  const SMLoc SelLoc = Parser.getTok().getLoc();
+
+  StringRef SelName;
+  const OperandMatchResultTy Status = parseStringWithPrefix(Prefix, SelName);
+  if (Status != MatchOperand_Success)
+    return Status;
+
+  // Sentinel returned by the switch for names that are not recognized.
+  const int64_t Unknown = 0xffffffff;
+  const int64_t SelValue = StringSwitch<int64_t>(SelName)
+                               .Case("BYTE_0", SdwaSel::BYTE_0)
+                               .Case("BYTE_1", SdwaSel::BYTE_1)
+                               .Case("BYTE_2", SdwaSel::BYTE_2)
+                               .Case("BYTE_3", SdwaSel::BYTE_3)
+                               .Case("WORD_0", SdwaSel::WORD_0)
+                               .Case("WORD_1", SdwaSel::WORD_1)
+                               .Case("DWORD", SdwaSel::DWORD)
+                               .Default(Unknown);
+  Parser.Lex(); // eat last token
+
+  if (SelValue == Unknown)
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, SelValue, SelLoc, Type));
+  return MatchOperand_Success;
+}
+
+/// Parse dst_unused:<NAME>, where NAME is UNUSED_PAD, UNUSED_SEXT or
+/// UNUSED_PRESERVE, and append it to \p Operands as an ImmTySdwaDstUnused
+/// immediate.
+///
+/// A result other than MatchOperand_Success from parseStringWithPrefix() is
+/// passed through. An unknown NAME is consumed and yields
+/// MatchOperand_ParseFail.
+OperandMatchResultTy
+AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  const SMLoc UnusedLoc = Parser.getTok().getLoc();
+
+  StringRef UnusedName;
+  const OperandMatchResultTy Status =
+      parseStringWithPrefix("dst_unused", UnusedName);
+  if (Status != MatchOperand_Success)
+    return Status;
+
+  // Sentinel returned by the switch for names that are not recognized.
+  const int64_t Unknown = 0xffffffff;
+  const int64_t UnusedValue =
+      StringSwitch<int64_t>(UnusedName)
+          .Case("UNUSED_PAD", DstUnused::UNUSED_PAD)
+          .Case("UNUSED_SEXT", DstUnused::UNUSED_SEXT)
+          .Case("UNUSED_PRESERVE", DstUnused::UNUSED_PRESERVE)
+          .Default(Unknown);
+  Parser.Lex(); // eat last token
+
+  if (UnusedValue == Unknown)
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(AMDGPUOperand::CreateImm(
+      this, UnusedValue, UnusedLoc, AMDGPUOperand::ImmTySdwaDstUnused));
+  return MatchOperand_Success;
+}
+
+/// SDWA conversion entry point for VOP1: forwards to cvtSDWA() with the VOP1
+/// flag as the basic instruction type.
+void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP1);
+}
+
+/// SDWA conversion entry point for VOP2: forwards to cvtSDWA() with the VOP2
+/// flag as the basic instruction type.
+void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP2);
+}
+
+/// SDWA conversion entry point for VOPC: forwards to cvtSDWA() with the VOPC
+/// flag as the basic instruction type.
+void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOPC);
+}
+
+/// Convert the parsed operands of an SDWA instruction into \p Inst.
+///
+/// \p BasicInstType must be SIInstrFlags::VOP1, VOP2 or VOPC; it decides
+/// whether a written "vcc" register is skipped and which optional select
+/// operands are appended after clamp. Defs are added first, then each parsed
+/// operand is emitted as a (modifiers, source) pair or recorded as an
+/// optional immediate. The v_mac opcodes get their tied src2 inserted last.
+void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
+ uint64_t BasicInstType) {
+ OptionalImmIndexMap OptionalIdx;
+
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ // Add the register arguments
+ if ((BasicInstType == SIInstrFlags::VOPC ||
+ BasicInstType == SIInstrFlags::VOP2)&&
+ Op.isReg() &&
+ Op.Reg.RegNo == AMDGPU::VCC) {
+ // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
+ // Skip it.
+ continue;
+ } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ // Unlike cvtVOP3/cvtDPP this uses the non-FP input-modifier variant.
+ Op.addRegOrImmWithInputModsOperands(Inst, 2);
+ } else if (Op.isImm()) {
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+
+ if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
+ // V_NOP_sdwa_vi has no optional sdwa arguments
+ // NOTE(review): the literal defaults 6 and 2 below presumably correspond
+ // to SdwaSel::DWORD and DstUnused::UNUSED_PRESERVE; confirm against the
+ // enum definitions.
+ switch (BasicInstType) {
+ case SIInstrFlags::VOP1:
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ break;
+
+ case SIInstrFlags::VOP2:
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ break;
+
+ case SIInstrFlags::VOPC:
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ break;
+
+ default:
+ llvm_unreachable("Invalid instruction type. Only VOP1, VOP2 and VOPC allowed");
+ }
+ }
+
+ // special case v_mac_{f16, f32}:
+ // it has src2 register operand that is tied to dst operand
+ if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa_vi ||
+ Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) {
+ auto it = Inst.begin();
+ std::advance(
+ it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+ // NOTE(review): the value inserted is a reference to Inst's own operand 0;
+ // confirm that insert() tolerates an argument aliasing its own storage.
+ Inst.insert(it, Inst.getOperand(0)); // src2 = dst
+ }
+
+}
+
+/// Force static initialization.
+/// Registers AMDGPUAsmParser as the assembly parser for both targets returned
+/// by getTheAMDGPUTarget() and getTheGCNTarget().
+extern "C" void LLVMInitializeAMDGPUAsmParser() {
+ RegisterMCAsmParser<AMDGPUAsmParser> A(getTheAMDGPUTarget());
+ RegisterMCAsmParser<AMDGPUAsmParser> B(getTheGCNTarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AMDGPUGenAsmMatcher.inc"
+
+// This function should be defined after auto-generated include so that we have
+// MatchClassKind enum defined
+//
+// Maps a matcher class kind to the corresponding AMDGPUOperand predicate and
+// returns Match_Success when the predicate holds, Match_InvalidOperand
+// otherwise (including for every kind not listed in the switch).
+unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) {
+ // Tokens like "glc" would be parsed as immediate operands in ParseOperand().
+ // But MatchInstructionImpl() expects to meet token and fails to validate
+ // operand. This method checks if we are given immediate operand but expect to
+ // get corresponding token.
+ AMDGPUOperand &Operand = (AMDGPUOperand&)Op;
+ switch (Kind) {
+ case MCK_addr64:
+ return Operand.isAddr64() ? Match_Success : Match_InvalidOperand;
+ case MCK_gds:
+ return Operand.isGDS() ? Match_Success : Match_InvalidOperand;
+ case MCK_glc:
+ return Operand.isGLC() ? Match_Success : Match_InvalidOperand;
+ case MCK_idxen:
+ return Operand.isIdxen() ? Match_Success : Match_InvalidOperand;
+ case MCK_offen:
+ return Operand.isOffen() ? Match_Success : Match_InvalidOperand;
+ case MCK_SSrcB32:
+ // When operands have expression values, they will return true for isToken,
+ // because it is not possible to distinguish between a token and an
+ // expression at parse time. MatchInstructionImpl() will always try to
+ // match an operand as a token, when isToken returns true, and when the
+ // name of the expression is not a valid token, the match will fail,
+ // so we need to handle it here.
+ return Operand.isSSrcB32() ? Match_Success : Match_InvalidOperand;
+ case MCK_SSrcF32:
+ return Operand.isSSrcF32() ? Match_Success : Match_InvalidOperand;
+ case MCK_SoppBrTarget:
+ return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand;
+ case MCK_VReg32OrOff:
+ return Operand.isVReg32OrOff() ? Match_Success : Match_InvalidOperand;
+ case MCK_InterpSlot:
+ return Operand.isInterpSlot() ? Match_Success : Match_InvalidOperand;
+ case MCK_Attr:
+ return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand;
+ case MCK_AttrChan:
+ return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand;
+ default:
+ return Match_InvalidOperand;
+ }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
new file mode 100644
index 000000000000..45a7fe6d3439
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -0,0 +1,1350 @@
+//===-- BUFInstructions.td - Buffer Instruction Definitions ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Addressing-mode ComplexPatterns used by the buffer instruction patterns.
+// The string names the selection routine. Several entries reuse one routine
+// name with a different declared operand count (SelectMUBUFAddr64: 7 and 5;
+// SelectMUBUFOffset: 6, 3 and 4).
+def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
+def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
+def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
+
+def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
+def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
+def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
+def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
+// The two intrinsic-offset patterns operate on an i32 value rather than i64.
+def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
+def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
+
+// PatFrag wrapper around a load-like operator `op`. The predicate accepts the
+// node only when its memory address space is GLOBAL_ADDRESS or
+// CONSTANT_ADDRESS.
+class MubufLoad <SDPatternOperator op> : PatFrag <
+ (ops node:$ptr), (op node:$ptr), [{
+ auto const AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
+}]>;
+
+// Address-space-restricted versions of the generic load fragments.
+def mubuf_load : MubufLoad <load>;
+def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>;
+def mubuf_sextloadi8 : MubufLoad <sextloadi8>;
+def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>;
+def mubuf_sextloadi16 : MubufLoad <sextloadi16>;
+def mubuf_load_atomic : MubufLoad <atomic_load>;
+
+// Integer tags naming the five MUBUF addressing variants. They are compared
+// with !eq in getAddrName, getMUBUFIns, getMUBUFAsmOps and MUBUF_SetupAddr.
+def BUFAddrKind {
+ int Offset = 0;
+ int OffEn = 1;
+ int IdxEn = 2;
+ int BothEn = 3;
+ int Addr64 = 4;
+}
+
+// Maps a BUFAddrKind value to its lower-case name ("offset", "offen", "idxen",
+// "bothen", "addr64"). Any other value yields the empty string.
+class getAddrName<int addrKind> {
+ string ret =
+ !if(!eq(addrKind, BUFAddrKind.Offset), "offset",
+ !if(!eq(addrKind, BUFAddrKind.OffEn), "offen",
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), "idxen",
+ !if(!eq(addrKind, BUFAddrKind.BothEn), "bothen",
+ !if(!eq(addrKind, BUFAddrKind.Addr64), "addr64",
+ "")))));
+}
+
+// Mix-in that tags a record with whether it is the addr64 form and with an
+// OpName built from the record's NAME plus an optional suffix.
+// NOTE(review): presumably consumed by an instruction-mapping table defined
+// elsewhere -- confirm.
+class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
+ bit IsAddr64 = is_addr64;
+ string OpName = NAME # suffix;
+}
+
+//===----------------------------------------------------------------------===//
+// MTBUF classes
+//===----------------------------------------------------------------------===//
+
+// Base class for MTBUF pseudo instructions. The InstSI asm string is left
+// empty; the mnemonic and operand string are kept in the Mnemonic and
+// AsmOperands fields, which MTBUF_Real concatenates.
+class MTBUF_Pseudo <string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let Size = 8;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MTBUF = 1;
+ let Uses = [EXEC];
+
+ let hasSideEffects = 0;
+ let SchedRW = [WriteVMEM];
+}
+
+// Encodable counterpart of an MTBUF_Pseudo: takes its operand lists, asm
+// string and selected flags from `ps` and lays the operand fields out in the
+// 64-bit instruction word.
+class MTBUF_Real <MTBUF_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ Enc64 {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+
+ bits<8> vdata;
+ bits<12> offset;
+ bits<1> offen;
+ bits<1> idxen;
+ bits<1> glc;
+ bits<1> addr64;
+ bits<4> dfmt;
+ bits<3> nfmt;
+ bits<8> vaddr;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+
+ // `addr64` is declared above but no Inst bit is assigned from it in this
+ // class; Inst{18-15} and Inst{53} are likewise not assigned here.
+ // NOTE(review): presumably left to encoding-specific subclasses -- confirm.
+ let Inst{11-0} = offset;
+ let Inst{12} = offen;
+ let Inst{13} = idxen;
+ let Inst{14} = glc;
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ // Only the upper five bits of the 7-bit srsrc field are encoded.
+ let Inst{52-48} = srsrc{6-2};
+ let Inst{54} = slc;
+ let Inst{55} = tfe;
+ let Inst{63-56} = soffset;
+}
+
+// MTBUF load pseudo: one result in `regClass` ($dst). Every addressing and
+// format control (offen, idxen, glc, addr64, dfmt, nfmt, slc, tfe) is an
+// explicit immediate operand, and the asm string lists all operands in order.
+class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
+ opName, (outs regClass:$dst),
+ (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+ i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
+ i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
+ " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
+ " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+// MTBUF store pseudo: no results; the stored value is the first input
+// ($vdata, in `regClass`), followed by the same operand list that
+// MTBUF_Load_Pseudo uses.
+class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
+ opName, (outs),
+ (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
+ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
+ SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
+ " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
+ " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+ let mayLoad = 0;
+ let mayStore = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// MUBUF classes
+//===----------------------------------------------------------------------===//
+
+// Base class for MUBUF pseudo instructions. As with MTBUF_Pseudo, the InstSI
+// asm string is empty and the mnemonic/operand text is stored in fields. The
+// trailing bit fields describe which operands a concrete instruction has;
+// subclasses override them with `let`.
+class MUBUF_Pseudo <string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let Size = 8;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MUBUF = 1;
+ let Uses = [EXEC];
+ let hasSideEffects = 0;
+ let SchedRW = [WriteVMEM];
+
+ // Default assembler operand converter; MUBUF_Invalidate and the atomic
+ // pseudos in this file replace it.
+ let AsmMatchConverter = "cvtMubuf";
+
+ // Addressing-mode bits default to 0; every has_* flag defaults to 1.
+ bits<1> offen = 0;
+ bits<1> idxen = 0;
+ bits<1> addr64 = 0;
+ bits<1> has_vdata = 1;
+ bits<1> has_vaddr = 1;
+ bits<1> has_glc = 1;
+ bits<1> glc_value = 0; // the value for glc if no such operand
+ bits<1> has_srsrc = 1;
+ bits<1> has_soffset = 1;
+ bits<1> has_offset = 1;
+ bits<1> has_slc = 1;
+ bits<1> has_tfe = 1;
+}
+
+// Encodable counterpart of a MUBUF_Pseudo. This class copies the operand
+// lists, asm string and flags from `ps` and declares the operand bit fields,
+// but assigns no Inst{...} bits, and the `op` parameter is not referenced in
+// this body.
+// NOTE(review): presumably subclasses supply the bit layout and use `op` --
+// confirm.
+class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+
+ bits<12> offset;
+ bits<1> glc;
+ bits<1> lds = 0;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+}
+
+
+// For cache invalidation instructions.
+// Operand-less MUBUF pseudo: empty outs/ins and asm operand string, selected
+// from the bare intrinsic `node`. It is marked as having side effects and as
+// possibly storing, and clears the default asm match converter.
+class MUBUF_Invalidate <string opName, SDPatternOperator node> :
+ MUBUF_Pseudo<opName, (outs), (ins), "", [(node)]> {
+
+ let AsmMatchConverter = "";
+
+ let hasSideEffects = 1;
+ let mayStore = 1;
+
+ // Set everything to 0.
+ let offen = 0;
+ let idxen = 0;
+ let addr64 = 0;
+ let has_vdata = 0;
+ let has_vaddr = 0;
+ let has_glc = 0;
+ let glc_value = 0;
+ let has_srsrc = 0;
+ let has_soffset = 0;
+ let has_offset = 0;
+ let has_slc = 0;
+ let has_tfe = 0;
+}
+
+// Builds the `ins` dag for a MUBUF load/store. vdataList and vaddrList are
+// used as optional arguments: an empty list means "no such operand", otherwise
+// the head of the list is the register class. The resulting operand order is
+// [vdata,] [vaddr,] srsrc, soffset, offset, glc, slc, tfe.
+class getMUBUFInsDA<list<RegisterClass> vdataList,
+ list<RegisterClass> vaddrList=[]> {
+ RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+ RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ // Variants without a data operand.
+ dag InsNoData = !if(!empty(vaddrList),
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ );
+ // Variants with a leading data operand.
+ dag InsData = !if(!empty(vaddrList),
+ (ins vdataClass:$vdata, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ );
+ dag ret = !if(!empty(vdataList), InsNoData, InsData);
+}
+
+// Selects the `ins` dag for a given addressing kind: Offset has no vaddr,
+// OffEn and IdxEn take a VGPR_32 vaddr, BothEn and Addr64 take a VReg_64
+// vaddr. An unrecognised kind yields an empty (ins).
+class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+ dag ret =
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
+ (ins))))));
+}
+
+// Builds the address part of the asm operand string for a given addressing
+// kind. The Offset kind prints the literal "off" where the other kinds print
+// $vaddr, and the non-Offset kinds append their mode keyword(s) after
+// $soffset. "$offset" is then appended with no separator.
+// NOTE(review): presumably the offset operand prints its own leading text --
+// confirm against its printer.
+class getMUBUFAsmOps<int addrKind> {
+ string Pfx =
+ !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $soffset",
+ !if(!eq(addrKind, BUFAddrKind.OffEn), "$vaddr, $srsrc, $soffset offen",
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), "$vaddr, $srsrc, $soffset idxen",
+ !if(!eq(addrKind, BUFAddrKind.BothEn), "$vaddr, $srsrc, $soffset idxen offen",
+ !if(!eq(addrKind, BUFAddrKind.Addr64), "$vaddr, $srsrc, $soffset addr64",
+ "")))));
+ string ret = Pfx # "$offset";
+}
+
+// Mix-in that derives the addressing-mode bits from a BUFAddrKind value.
+class MUBUF_SetupAddr<int addrKind> {
+ // offen is set for OffEn and BothEn.
+ bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ // idxen is set for IdxEn and BothEn.
+ bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0);
+
+ // Only the Offset kind has no vaddr operand.
+ bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1);
+}
+
+// MUBUF load pseudo for one addressing kind. The result is $vdata in
+// `vdataClass`; inputs and the asm operand text come from getMUBUFIns and
+// getMUBUFAsmOps, with "$glc$slc$tfe" appended. PseudoInstr is
+// "<opName>_<addr kind name>". The class body refers only to addrKindCopy,
+// never to addrKind directly (see the workaround note below).
+class MUBUF_Load_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind>
+ : MUBUF_Pseudo<opName,
+ (outs vdataClass:$vdata),
+ getMUBUFIns<addrKindCopy>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MUBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+// FIXME: tfe can't be an operand because it requires a separate
+// opcode because it needs an N+1 register class dest register.
+//
+// Defines the family of load pseudos for one opcode. Only _OFFSET and _ADDR64
+// carry a selection pattern (built from `ld` and `load_vt`) and are tagged with
+// MUBUFAddr64Table; _OFFEN, _IDXEN and _BOTHEN have no pattern. The _exact
+// variants repeat the non-addr64 kinds with DisableWQM = 1; no _ADDR64_exact is
+// defined.
+multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> {
+
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MUBUFAddr64Table<0>;
+
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MUBUFAddr64Table<1>;
+
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+// MUBUF store pseudo for one addressing kind. There are no results; $vdata is
+// the first input (its class is passed to getMUBUFIns as a one-element list).
+// The asm text and PseudoInstr are built the same way as in MUBUF_Load_Pseudo.
+// The body refers only to the *Copy parameters (see the workaround note).
+class MUBUF_Store_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind,
+ RegisterClass vdataClassCopy = vdataClass>
+ : MUBUF_Pseudo<opName,
+ (outs),
+ getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MUBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let mayLoad = 0;
+ let mayStore = 1;
+}
+
+// Defines the family of store pseudos for one opcode, mirroring
+// MUBUF_Pseudo_Loads: _OFFSET and _ADDR64 carry a selection pattern built from
+// `st` and `store_vt`; the remaining kinds and the DisableWQM _exact variants
+// have none.
+multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+ ValueType store_vt = i32,
+ SDPatternOperator st = null_frag> {
+
+ def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+ MUBUFAddr64Table<0>;
+
+ def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+ MUBUFAddr64Table<1>;
+
+ def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+
+// Builds the `ins` dag for a MUBUF atomic. The data operand is named $vdata_in
+// when `vdata_in` is set and $vdata otherwise; vaddrList is an optional
+// argument as in getMUBUFInsDA. The operand order is
+// data, [vaddr,] srsrc, soffset, offset, slc -- there is no glc or tfe operand.
+class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
+ list<RegisterClass> vaddrList=[]> {
+ RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ dag ret = !if(vdata_in,
+ !if(!empty(vaddrList),
+ (ins vdataClass:$vdata_in,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ (ins vdataClass:$vdata_in, vaddrClass:$vaddr,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ ),
+ !if(!empty(vaddrList),
+ (ins vdataClass:$vdata,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ (ins vdataClass:$vdata, vaddrClass:$vaddr,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ ));
+}
+
+// Selects the atomic `ins` dag for a given addressing kind, using the same
+// vaddr class choice as getMUBUFIns (none for Offset, VGPR_32 for OffEn and
+// IdxEn, VReg_64 for BothEn and Addr64). An unrecognised kind yields an empty
+// (ins).
+class getMUBUFAtomicIns<int addrKind,
+ RegisterClass vdataClass,
+ bit vdata_in,
+ // Workaround bug bz30254
+ RegisterClass vdataClassCopy=vdataClass> {
+ dag ret =
+ !if(!eq(addrKind, BUFAddrKind.Offset),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret,
+ (ins))))));
+}
+
+// Common base for MUBUF atomic pseudos. It marks the instruction as both
+// loading and storing and as having side effects, requests the post-ISel hook,
+// sets DisableWQM, and records that there is no glc or tfe operand.
+class MUBUF_Atomic_Pseudo<string opName,
+ int addrKind,
+ dag outs,
+ dag ins,
+ string asmOps,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind>
+ : MUBUF_Pseudo<opName, outs, ins, asmOps, pattern>,
+ MUBUF_SetupAddr<addrKindCopy> {
+ let mayStore = 1;
+ let mayLoad = 1;
+ let hasPostISelHook = 1;
+ let hasSideEffects = 1;
+ let DisableWQM = 1;
+ let has_glc = 0;
+ let has_tfe = 0;
+}
+
+// Atomic pseudo that produces no result. The data input is $vdata, the asm
+// text ends in "$slc", and glc_value is 0. It is also tagged with
+// AtomicNoRet<"<opName>_<addr kind name>", 0>.
+class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind,
+ RegisterClass vdataClassCopy = vdataClass>
+ : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+ (outs),
+ getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$slc",
+ pattern>,
+ AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let glc_value = 0;
+ let AsmMatchConverter = "cvtMubufAtomic";
+}
+
+// Atomic pseudo that returns a value. The result $vdata is tied to the input
+// $vdata_in, which is excluded from encoding. The asm text contains a literal
+// " glc" before "$slc", and glc_value is 1. It uses the same AtomicNoRet key
+// string as the no-return form but with second argument 1, while PseudoInstr
+// gains an "_rtn_" infix.
+class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind,
+ RegisterClass vdataClassCopy = vdataClass>
+ : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+ (outs vdataClassCopy:$vdata),
+ getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # " glc$slc",
+ pattern>,
+ AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> {
+ let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
+ let glc_value = 1;
+ let Constraints = "$vdata = $vdata_in";
+ let DisableEncoding = "$vdata_in";
+ let AsmMatchConverter = "cvtMubufAtomicReturn";
+}
+
+// Defines the no-return and returning (_RTN) atomic pseudos for one opcode in
+// all five addressing kinds. `vdataType` and `atomic` are used only by the
+// selection patterns on _RTN_OFFSET and _RTN_ADDR64; every other variant,
+// including all no-return ones, is defined without a pattern.
+multiclass MUBUF_Pseudo_Atomics <string opName,
+ RegisterClass vdataClass,
+ ValueType vdataType,
+ SDPatternOperator atomic> {
+
+ def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
+ MUBUFAddr64Table <0>;
+ def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
+ MUBUFAddr64Table <1>;
+ def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ def _RTN_OFFSET : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(set vdataType:$vdata,
+ (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
+ vdataType:$vdata_in))]>,
+ MUBUFAddr64Table <0, "_RTN">;
+
+ def _RTN_ADDR64 : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(set vdataType:$vdata,
+ (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
+ vdataType:$vdata_in))]>,
+ MUBUFAddr64Table <1, "_RTN">;
+
+ def _RTN_OFFEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _RTN_IDXEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _RTN_BOTHEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// MUBUF Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGCN in {
+
+// Format loads/stores. No value type or operator is passed, so the multiclass
+// defaults apply (i32 and null_frag); the intrinsic patterns later in this
+// file are what reference these by name.
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads <
+ "buffer_load_format_x", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads <
+ "buffer_load_format_xy", VReg_64
+>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Pseudo_Loads <
+ "buffer_load_format_xyz", VReg_96
+>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads <
+ "buffer_load_format_xyzw", VReg_128
+>;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores <
+ "buffer_store_format_x", VGPR_32
+>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Pseudo_Stores <
+ "buffer_store_format_xy", VReg_64
+>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores <
+ "buffer_store_format_xyz", VReg_96
+>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
+ "buffer_store_format_xyzw", VReg_128
+>;
+// Plain buffer loads. Sub-dword loads produce an i32 in a VGPR_32 through the
+// matching extending-load fragment; the x3 variant uses `untyped` as its
+// value type.
+defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads <
+ "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
+>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads <
+ "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
+>;
+defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads <
+ "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
+>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads <
+ "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
+>;
+defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads <
+ "buffer_load_dword", VGPR_32, i32, mubuf_load
+>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
+>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx3", VReg_96, untyped, mubuf_load
+>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
+>;
+// Plain buffer stores. Byte and short stores take an i32 source through the
+// truncating global store fragments; wider stores use global_store.
+defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
+ "buffer_store_byte", VGPR_32, i32, truncstorei8_global
+>;
+defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
+ "buffer_store_short", VGPR_32, i32, truncstorei16_global
+>;
+defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
+ "buffer_store_dword", VGPR_32, i32, global_store
+>;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
+ "buffer_store_dwordx2", VReg_64, v2i32, global_store
+>;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
+ "buffer_store_dwordx3", VReg_96, untyped, global_store
+>;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
+ "buffer_store_dwordx4", VReg_128, v4i32, global_store
+>;
+// 32-bit buffer atomics. The signed min/max opcodes map to the fragments named
+// atomic_min_global / atomic_max_global. CMPSWAP uses a VReg_64 data operand
+// with null_frag, so its multiclass patterns never match.
+defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
+>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
+>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_add", VGPR_32, i32, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
+>;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_and", VGPR_32, i32, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_or", VGPR_32, i32, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+>;
+defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
+>;
+// 64-bit (_X2) buffer atomics: the same fragments at i64 in VReg_64;
+// CMPSWAP_X2 uses VReg_128 with null_frag.
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global
+>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
+>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global
+>;
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global
+>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
+>;
+
+// Instructions predicated on isSI. The atomic defms below are commented out,
+// so BUFFER_WBINVL1_SC is the only record this block defines.
+let SubtargetPredicate = isSI in { // isn't on CI & VI
+/*
+defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
+defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin">;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax">;
+defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub_x2">;
+defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap_x2">;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin_x2">;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax_x2">;
+*/
+
+def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
+ int_amdgcn_buffer_wbinvl1_sc>;
+}
+
+// Operand-less invalidate selected from the int_amdgcn_buffer_wbinvl1
+// intrinsic.
+def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
+ int_amdgcn_buffer_wbinvl1>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Instructions
+//===----------------------------------------------------------------------===//
+
+// Only the XYZW load and the four stores are defined; the X/XY/XYZ loads are
+// commented out.
+// NOTE(review): TBUFFER_STORE_FORMAT_XYZ uses VReg_128 rather than a 96-bit
+// class, unlike BUFFER_STORE_FORMAT_XYZ above -- confirm this is intentional.
+//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>;
+//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>;
+//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>;
+def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>;
+def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>;
+def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>;
+def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>;
+def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>;
+
+} // End let SubtargetPredicate = isGCN
+
+// Records predicated on isCIVI. Only BUFFER_WBINVL1_VOL is defined here; the
+// "Remaining instructions" list below is a to-do note, not a definition.
+let SubtargetPredicate = isCIVI in {
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// BUFFER_LOAD_DWORDX3
+// BUFFER_STORE_DWORDX3
+
+def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
+ int_amdgcn_buffer_wbinvl1_vol>;
+
+} // End let SubtargetPredicate = isCIVI
+
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isGCN] in {
+
+// int_SI_vs_load_input
+// Selects SIload_input to the IDXEN form of BUFFER_LOAD_FORMAT_XYZW: the
+// buffer index is the vaddr, soffset is the constant 0, the attribute offset
+// is the immediate offset, and glc/slc/tfe are all 0.
+def : Pat<
+ (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
+ (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0)
+>;
+
+// Offset in a 32-bit VGPR
+// Selects SIload_constant to BUFFER_LOAD_DWORD_OFFEN with the VGPR as vaddr,
+// soffset 0, immediate offset 0 and glc/slc/tfe all 0.
+def : Pat <
+ (SIload_constant v4i32:$sbase, i32:$voff),
+ (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
+>;
+
+
+//===----------------------------------------------------------------------===//
+// buffer_load/store_format patterns
+//===----------------------------------------------------------------------===//
+
+// Selection patterns for a buffer-load intrinsic node `name` producing `vt`.
+// The instruction is looked up by name as `opcode` plus an addressing suffix:
+//   vindex == 0, offset matched by MUBUFIntrinsicOffset  -> _OFFSET
+//   vindex != 0, offset matched by MUBUFIntrinsicOffset  -> _IDXEN
+//   vindex == 0, offset matched by MUBUFIntrinsicVOffset -> _OFFEN
+//   vindex != 0, offset matched by MUBUFIntrinsicVOffset -> _BOTHEN
+// The final 0 in each result is the tfe operand.
+multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ // BOTHEN packs the index into sub0 and the offset into sub1 of a VReg_64.
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+// Instantiations for f32 / v2f32 / v4f32 results only.
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+
+// Selection patterns for a buffer-store intrinsic `name` storing a `vt` value.
+// The addressing-variant choice is the same as in MUBUF_LoadIntrinsicPat, but
+// the selected instructions are the _exact variants (defined with
+// DisableWQM = 1) and $vdata is passed as the first operand. The final 0 is
+// the tfe operand.
+multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ // BOTHEN packs the index into sub0 and the offset into sub1 of a VReg_64.
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
+ $vdata,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+// Instantiations for f32 / v2f32 / v4f32 sources only.
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+
+//===----------------------------------------------------------------------===//
+// buffer_atomic patterns
+//===----------------------------------------------------------------------===//
+
+// Selection patterns for a 32-bit buffer atomic intrinsic `name`. The
+// returning (_RTN) form of `opcode` is always selected, and the addressing
+// variant is chosen as in MUBUF_LoadIntrinsicPat. The data operand is fixed
+// at i32 and the result operand list ends with slc (no glc or tfe).
+multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ // BOTHEN packs the index into sub0 and the offset into sub1 of a VReg_64.
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _RTN_BOTHEN)
+ $vdata_in,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
+ >;
+}
+
+// cmpswap is not instantiated here; it has dedicated patterns after this list.
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
+
+// int_amdgcn_buffer_atomic_cmpswap patterns, one per addressing variant
+// (OFFSET, IDXEN, OFFEN, BOTHEN). Each packs $data into sub0 and $cmp into
+// sub1 of a VReg_64 data operand, selects the matching
+// BUFFER_ATOMIC_CMPSWAP_RTN_* instruction, and returns sub0 of its 64-bit
+// result.
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+// BOTHEN additionally packs $vindex (sub0) and $voffset (sub1) into the
+// 64-bit vaddr.
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+
+class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, // Selects Instr_ADDR64 for a `constant_ld` load of type vt whose address matches MUBUFAddr64.
+ PatFrag constant_ld> : Pat <
+ (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) // glc/slc/tfe are forwarded exactly as matched
+ >;
+
+multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, // Two patterns for an `atomic_ld` load of type vt: one via MUBUFAddr64, one via MUBUFOffsetNoGLC.
+ ValueType vt, PatFrag atomic_ld> {
+ def : Pat <
+ (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$slc))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) // literal 1 / 0 occupy the slots around $slc (presumably glc = 1, tfe = 0 -- confirm against the pseudo's operand list)
+ >;
+
+ def : Pat <
+ (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) // nothing matched for the last three operands here; they are the constants 1, 0, 0
+ >;
+}
+
+let Predicates = [isSICI] in { // Everything in this scope uses an *_ADDR64 pseudo and is guarded by the isSICI predicate.
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>; // 32-bit and 64-bit atomic loads share the mubuf_load_atomic fragment
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
+} // End Predicates = [isSICI]
+
+multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, // Selects Instr_OFFSET for an `ld` load of type vt whose address matches MUBUFOffset.
+ PatFrag ld> {
+
+ def : Pat <
+ (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
+ (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe) // all matched address components are forwarded unchanged
+ >;
+}
+
+let Predicates = [Has16BitInsts] in { // i16-result byte loads, guarded by the Has16BitInsts predicate.
+
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>; // sext fragments map to SBYTE, az_ext fragments to UBYTE
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
+
+} // End Predicates = [Has16BitInsts]
+
+class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat < // Selects Instr for an `ld` load of type vt whose address matches MUBUFScratch.
+ (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset))),
+ (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) // the last three instruction operands are hard-wired to 0
+>;
+
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; // All scratch loads use the *_OFFEN pseudos with *_private load fragments.
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>; // same byte loads again with an i16 result type
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
+
+// BUFFER_LOAD_DWORD*, addr64=0: four int_SI_buffer_load_dword patterns, one per addressing-form pseudo passed in.
+multiclass MUBUF_Load_Dword <ValueType vt,
+ MUBUF_Pseudo offset,
+ MUBUF_Pseudo offen,
+ MUBUF_Pseudo idxen,
+ MUBUF_Pseudo bothen> {
+
+ def : Pat < // literal pair 0, 0 after imm:$offset: selects `offset`; the (i32 imm) address operand is matched but not used
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
+ imm:$offset, 0, 0, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), (as_i1imm $tfe))
+ >;
+
+ def : Pat < // literal pair 1, 0: selects `offen` with an i32 $vaddr
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
+ imm:$offset, 1, 0, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $tfe))
+ >;
+
+ def : Pat < // literal pair 0, 1: selects `idxen` with an i32 $vaddr
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
+ imm:$offset, 0, 1, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), (as_i1imm $tfe))
+ >;
+
+ def : Pat < // literal pair 1, 1: selects `bothen`; here $vaddr is a v2i32
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
+ imm:$offset, 1, 1, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $tfe))
+ >;
+}
+
+defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN, // i32 / v2i32 / v4i32 results map to DWORD / DWORDX2 / DWORDX4; pseudos are passed in offset, offen, idxen, bothen order
+ BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
+defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
+ BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
+defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
+ BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
+
+multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, // Two patterns for an `atomic_st` store of type vt: one via MUBUFAddr64, one via MUBUFOffsetNoGLC.
+ ValueType vt, PatFrag atomic_st> {
+ // Store follows atomic op convention so address is first
+ def : Pat <
+ (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$slc), vt:$val),
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) // the instruction takes the value first; literal 1 / 0 surround the matched $slc
+ >;
+
+ def : Pat <
+ (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) // last three operands are the constants 1, 0, 0
+ >;
+}
+let Predicates = [isSICI] in { // Atomic global stores (i32 and i64), guarded by the isSICI predicate.
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>;
+} // End Predicates = [isSICI]
+
+
+multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, // Selects Instr_OFFSET for an `st` store of a vt value whose address matches MUBUFOffset.
+ PatFrag st> {
+
+ def : Pat <
+ (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe)),
+ (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe) // data operand first, then the matched address components unchanged
+ >;
+}
+
+defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>; // i16 global stores: truncating-to-i8 uses BYTE, plain store uses SHORT
+defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>;
+
+class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat < // Selects Instr for an `st` store of a vt value whose address matches MUBUFScratch.
+ (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
+ u16imm:$offset)),
+ (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) // the last three instruction operands are hard-wired to 0
+>;
+
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; // All scratch stores use the *_OFFEN pseudos with *_private store fragments.
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>; // an i16 value stored with SHORT is a plain (non-truncating) store
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Patterns
+//===----------------------------------------------------------------------===//
+
+// TBUFFER_STORE_FORMAT_*, addr64=0: maps an SItbuffer_store node with a fixed num_channels literal onto `opcode`.
+class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
+ (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
+ i32:$soffset, imm:$inst_offset, imm:$dfmt,
+ imm:$nfmt, imm:$offen, imm:$idxen,
+ imm:$glc, imm:$slc, imm:$tfe),
+ (opcode
+ $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
+ (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, // the literal 0 after glc has no matched source operand (presumably addr64 -- confirm against MTBUF_Pseudo)
+ (as_i1imm $slc), (as_i1imm $tfe), $soffset)
+>;
+
+def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>; // second argument is the num_channels literal the pattern must match
+def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
+def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; // 3-channel store still takes a v4i32 data value
+def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
+
+} // End let Predicates = [isGCN]
+
+//===----------------------------------------------------------------------===//
+// Target instructions, move to the appropriate target TD file
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> : // Encodable form of MUBUF pseudo `ps` in the SI encoding family, assembled under isSICI.
+ MUBUF_Real<op, ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
+ let AssemblerPredicate=isSICI;
+ let DecoderNamespace="SICI";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?); // fields the pseudo does not have (has_* == 0) are left unset (?)
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); // without a glc operand the pseudo's fixed glc_value is encoded
+ let Inst{15} = ps.addr64;
+ let Inst{16} = lds;
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x38; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); // only bits 6-2 of srsrc are encoded
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MUBUF_Real_AllAddr_si<bits<7> op> { // Emits one _si real instruction per addressing-form pseudo (looked up by NAME + suffix), all sharing opcode `op`.
+ def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
+multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> { // Adds the _RTN_* variants on top of the five non-returning forms; both sets use the same `op`.
+ def _RTN_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>;
+ def _RTN_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_ADDR64")>;
+ def _RTN_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>;
+ def _RTN_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>;
+ def _RTN_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>;
+}
+
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>; // SI-family opcode table: the hex value is the 7-bit op placed in Inst{24-18} by MUBUF_Real_si.
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_si <0x08>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_si <0x09>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_si <0x0a>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_si <0x0b>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_si <0x0c>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>; // note: in this table DWORDX4 (0x0e) precedes DWORDX3 (0x0f)
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>;
+defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_si <0x18>;
+defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_si <0x1a>;
+defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_si <0x1c>;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_si <0x1d>;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_si <0x1e>; // likewise DWORDX4 (0x1e) precedes DWORDX3 (0x1f) for stores
+defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_si <0x1f>;
+
+defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_si <0x30>; // 32-bit atomics; commented-out entries below are intentionally not defined
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_si <0x31>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_si <0x32>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_si <0x33>;
+//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomic_si <0x34>; // isn't on CI & VI
+defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_si <0x35>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_si <0x36>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_si <0x37>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_si <0x38>;
+defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_si <0x39>;
+defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_si <0x3a>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_si <0x3b>;
+defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_si <0x3c>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_si <0x3d>;
+
+//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_si <0x3e>; // isn't on VI
+//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_si <0x3f>; // isn't on VI
+//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_si <0x40>; // isn't on VI
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_si <0x50>; // _X2 atomics use the same low nibble as their 32-bit counterparts, offset by 0x20
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_si <0x51>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_si <0x52>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_si <0x53>;
+//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomic_si <0x54>; // isn't on CI & VI
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_si <0x55>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_si <0x56>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_si <0x57>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_si <0x58>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_si <0x59>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_si <0x5a>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>;
+// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
+//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e>; // isn't on VI
+//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
+//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
+
+def BUFFER_WBINVL1_SC_si : MUBUF_Real_si <0x70, BUFFER_WBINVL1_SC>; // single-form instructions: instantiated directly, not through the AllAddr multiclass
+def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>;
+
+class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> : // SI-family MTBUF real instruction: 3-bit op in Inst{18-16} plus an addr64 bit in Inst{15}.
+ MTBUF_Real<ps>,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
+ let AssemblerPredicate=isSICI;
+ let DecoderNamespace="SICI";
+
+ bits<1> addr64; // declared here as an encoding field of this class
+ let Inst{15} = addr64;
+ let Inst{18-16} = op;
+}
+
+def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>; // SI-family MTBUF opcodes; first argument is the 3-bit op value
+def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>;
+def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>;
+def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>;
+def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>;
+
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> : // Same bit layout as MUBUF_Real_si; only the assembler predicate and decoder namespace are overridden.
+ MUBUF_Real_si<op, ps> {
+ let AssemblerPredicate=isCIOnly;
+ let DecoderNamespace="CI";
+}
+
+def BUFFER_WBINVL1_VOL_ci : MUBUF_Real_ci <0x70, BUFFER_WBINVL1_VOL>; // uses op 0x70, the same value BUFFER_WBINVL1_SC_si has in the SICI namespace; this one decodes in the separate "CI" namespace
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> : // Encodable form of MUBUF pseudo `ps` in the VI encoding family, assembled under isVI.
+ MUBUF_Real<op, ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
+ let AssemblerPredicate=isVI;
+ let DecoderNamespace="VI";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?); // fields the pseudo does not have (has_* == 0) are left unset (?)
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{16} = lds; // unlike MUBUF_Real_si, bit 15 is not assigned here
+ let Inst{17} = !if(ps.has_slc, slc, ?); // slc lives in bit 17 in this layout (MUBUF_Real_si puts it in bit 54)
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x38; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MUBUF_Real_AllAddr_vi<bits<7> op> { // Emits one _vi real instruction per addressing-form pseudo; there is no _ADDR64 variant in this multiclass.
+ def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
+multiclass MUBUF_Real_Atomic_vi<bits<7> op> : // Adds the _RTN_* variants on top of the four non-returning forms; both sets use the same `op`.
+ MUBUF_Real_AllAddr_vi<op> {
+ def _RTN_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>;
+ def _RTN_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>;
+ def _RTN_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>;
+ def _RTN_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>;
+}
+
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>; // VI opcode table: the hex value is the 7-bit op placed in Inst{24-18} by MUBUF_Real_vi.
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x01>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x02>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x03>;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_vi <0x04>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>; // plain loads start at 0x10 here (0x08 in the _si table)
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_vi <0x13>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_vi <0x14>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>; // DWORDX3 precedes DWORDX4 here, the reverse of the _si table
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>;
+defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>;
+defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>;
+defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_vi <0x1c>;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_vi <0x1d>;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_vi <0x1e>;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_vi <0x1f>;
+
+defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>; // 32-bit atomics are numbered contiguously from 0x40
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_vi <0x43>;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_vi <0x44>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_vi <0x45>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_vi <0x46>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_vi <0x47>;
+defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_vi <0x48>;
+defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_vi <0x49>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_vi <0x4a>;
+defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_vi <0x4b>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_vi <0x4c>;
+
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_vi <0x60>; // _X2 atomics mirror the 32-bit list, offset by 0x20
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_vi <0x61>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_vi <0x62>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_vi <0x63>;
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_vi <0x64>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_vi <0x65>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_vi <0x66>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_vi <0x67>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_vi <0x68>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_vi <0x69>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>;
+
+def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; // single-form instructions: instantiated directly, not through the AllAddr multiclass
+def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
+
+class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : // VI MTBUF real instruction: a 4-bit op in Inst{18-15} (MTBUF_Real_si instead uses 3 bits plus addr64 in bit 15).
+ MTBUF_Real<ps>,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
+ let AssemblerPredicate=isVI;
+ let DecoderNamespace="VI";
+
+ let Inst{18-15} = op;
+}
+
+def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>; // VI MTBUF opcodes; first argument is the 4-bit op value
+def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>;
+def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>;
+def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>;
+def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
new file mode 100644
index 000000000000..26a483a8abf6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
@@ -0,0 +1,15 @@
+//===-- CIInstructions.td - CI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// S_CBRANCH_CDBGUSER
+// S_CBRANCH_CDBGSYS
+// S_CBRANCH_CDBGSYS_OR_USER
+// S_CBRANCH_CDBGSYS_AND_USER \ No newline at end of file
diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
new file mode 100644
index 000000000000..6b8e85a73c73
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -0,0 +1,222 @@
+//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are available only on Cayman
+// family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isCayman : Predicate<"Subtarget->hasCaymanISA()">; // Selection predicate: true when the subtarget's hasCaymanISA() returns true.
+
+//===----------------------------------------------------------------------===//
+// Cayman Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isCayman] in {
+
+def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", // three-operand op selected for AMDGPUmad_i24 on i32 values
+ [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU
+>;
+def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", // two-operand op selected for AMDGPUmul_i24 on i32 values
+ [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU
+>;
+
+def : IMad24Pat<MULADD_INT24_cm>; // additional IMad24Pat pattern targeting MULADD_INT24_cm
+
+let isVector = 1 in { // Every def in this scope gets isVector = 1; each is a *_Common class instantiated with an explicit opcode value.
+
+def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
+
+def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
+def MULHI_INT_cm : MULHI_INT_Common<0x90>;
+def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
+def MULHI_INT_cm24 : MULHI_INT24_Common<0x5c>;
+def MULHI_UINT_cm24 : MULHI_UINT24_Common<0xb2>;
+
+def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_cm : SIN_Common<0x8D>;
+def COS_cm : COS_Common<0x8E>;
+} // End isVector = 1
+
+def : RsqPat<RECIPSQRT_IEEE_cm, f32>; // f32 RsqPat pattern targeting the Cayman RECIPSQRT_IEEE
+
+def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; // POW_Common built from the Cayman LOG/EXP defs and MUL
+
+defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; // DIV_Common expansion parameterized with the Cayman RECIP_IEEE
+
+// RECIP_UINT emulation for Cayman
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+ (AMDGPUurecip i32:$src0),
+ (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), // uint -> float, reciprocal, scale, float -> uint
+ (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
+>;
+
+ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { // CF_CLAUSE_EG instance with no input operands; ADDR, POP_COUNT and COUNT are all fixed to 0
+ let ADDR = 0;
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+
+
+def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; // f32 sqrt is selected as src * RECIPSQRT_CLAMPED(src)
+
+class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> : // CF_MEM_RAT_CACHELESS store selected for global_store of a vt value; `mask` is forwarded to the base class.
+ CF_MEM_RAT_CACHELESS <0x14, 0, mask,
+ (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
+ "STORE_DWORD $rw_gpr, $index_gpr",
+ [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
+ let eop = 0; // This bit is not used on Cayman.
+}
+
+def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>; // mask 0x1 / 0x3 / 0xf for the i32 / v2i32 / v4i32 stores (one bit per 32-bit element, it appears)
+def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
+def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
+
+def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> { // CF_MEM_RAT_STORE_TYPED instantiated with argument 0, eop cleared as for the other Cayman RAT stores
+ let eop = 0; // This bit is not used on Cayman.
+}
+
+class VTX_READ_cm <string name, dag outs> // Common base for the Cayman VTX_READ_* defs: fixes the static word-0 fields and maps Word0 to Inst{31-0}.
+ : VTX_WORD0_cm, VTX_READ<name, outs, []> {
+
+ // Static fields
+ let VC_INST = 0;
+ let FETCH_TYPE = 2;
+ let FETCH_WHOLE_QUAD = 0;
+ let SRC_REL = 0;
+ // XXX: We can infer this field based on the SRC_GPR. This would allow us
+ // to store vertex addresses in any channel, not just X.
+ let SRC_SEL_X = 0;
+ let SRC_SEL_Y = 0;
+ let STRUCTURED_READ = 0;
+ let LDS_REQ = 0;
+ let COALESCED_READ = 0;
+
+ let Inst{31-0} = Word0; // the low 32 instruction bits come from the Word0 value
+}
+
+def VTX_READ_8_cm // Single-channel read into an R600_TReg32_X destination; Y/Z/W are masked.
+ : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
+
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 1; // FMT_8
+}
+
+def VTX_READ_16_cm // Single-channel read into an R600_TReg32_X destination; Y/Z/W are masked.
+ : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 5; // FMT_16
+
+}
+
+def VTX_READ_32_cm // Single-channel 32-bit read; additionally ties $src_gpr.ptr to $dst_gpr (see the constraint below).
+ : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
+
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 0xD; // COLOR_32
+
+ // This is not really necessary, but there were some GPU hangs that appeared
+ // to be caused by ALU instructions in the next instruction group that wrote
+ // to the $src_gpr registers of the VTX_READ.
+ // e.g.
+ // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+ // %T2_X<def> = MOV %ZERO
+ // Adding this constraint prevents this from happening.
+ let Constraints = "$src_gpr.ptr = $dst_gpr";
+}
+
+def VTX_READ_64_cm // Two-channel read into an R600_Reg64 destination: X and Y selected, Z and W set to 7 (the value the sibling defs label "Masked").
+ : VTX_READ_cm <"VTX_READ_64 $dst_gpr.XY, $src_gpr",
+ (outs R600_Reg64:$dst_gpr)> {
+
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 7;
+ let DST_SEL_W = 7;
+ let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
+def VTX_READ_128_cm // Four-channel read into an R600_Reg128 destination: X/Y/Z/W all selected; no register constraint is set here.
+ : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr",
+ (outs R600_Reg128:$dst_gpr)> {
+
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 2;
+ let DST_SEL_W = 3;
+ let DATA_FORMAT = 0x22; // COLOR_32_32_32_32
+
+ // XXX: Need to force VTX_READ_128 instructions to write to the same register
+ // that holds its buffer address to avoid potential hangs. We can't use
+ // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
+ // registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)), // vtx_id3_* fragments: the instruction's trailing literal operand is 3
+ (VTX_READ_8_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_cm MEMxi:$src_gpr, 3)>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from constant memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)), // vtx_id2_* fragments: trailing literal operand is 2
+ (VTX_READ_8_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_cm MEMxi:$src_gpr, 2)>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)), // vtx_id1_* fragments: trailing literal operand is 1
+ (VTX_READ_8_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_cm MEMxi:$src_gpr, 1)>;
+
+} // End isCayman
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
new file mode 100644
index 000000000000..a077001df6bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -0,0 +1,906 @@
+//===-- DSInstructions.td - DS Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> :
+ InstSI <outs, ins, "", pattern>, // asm string is left empty here; DS_Real builds it from Mnemonic and AsmOperands
+ SIMCInstr <opName, SIEncodingFamily.NONE> {
+
+ let SubtargetPredicate = isGCN;
+
+ let LGKM_CNT = 1;
+ let DS = 1;
+ let Size = 8;
+ let UseNamedOperandTable = 1;
+ let Uses = [M0, EXEC];
+
+ // Most instructions both load and store data, so set this as the default.
+ let mayLoad = 1;
+ let mayStore = 1;
+
+ let hasSideEffects = 0;
+ let SchedRW = [WriteLDS];
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+
+ let AsmMatchConverter = "cvtDS"; // default; the offset0/offset1 format classes override it with "cvtDSOffset01"
+
+ string Mnemonic = opName; // read back by DS_Real when it forms the asm string
+ string AsmOperands = asmOps; // read back by DS_Real when it forms the asm string
+
+ // These bits are something of a hack: it would be more natural to test the
+ // "outs" and "ins" dags for the presence of particular operands.
+ bits<1> has_vdst = 1; // each has_* bit selects, in DS_Real_si/DS_Real_vi, between the operand and a constant 0
+ bits<1> has_addr = 1;
+ bits<1> has_data0 = 1;
+ bits<1> has_data1 = 1;
+
+ bits<1> has_offset = 1; // has "offset" that should be split to offset0,1
+ bits<1> has_offset0 = 1;
+ bits<1> has_offset1 = 1;
+
+ bits<1> has_gds = 1;
+ bits<1> gdsValue = 0; // if has_gds == 0 set gds to this value
+}
+
+class DS_Real <DS_Pseudo ds> :
+ InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # " " # ds.AsmOperands, []>, // operands and asm text come from the pseudo; no selection pattern
+ Enc64 {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // Copy the relevant flags over from the pseudo instruction.
+ let SubtargetPredicate = ds.SubtargetPredicate;
+ let AsmMatchConverter = ds.AsmMatchConverter;
+
+ // Encoding fields; DS_Real_si and DS_Real_vi place them into Inst.
+ bits<8> vdst;
+ bits<1> gds;
+ bits<8> addr;
+ bits<8> data0;
+ bits<8> data1;
+ bits<8> offset0;
+ bits<8> offset1;
+
+ bits<16> offset; // combined 16-bit offset, split into the two 8-bit fields below
+ let offset0 = !if(ds.has_offset, offset{7-0}, ?); // low byte; left unset (?) when the pseudo has no combined offset
+ let offset1 = !if(ds.has_offset, offset{15-8}, ?); // high byte; left unset (?) when the pseudo has no combined offset
+}
+
+
+// DS Pseudo instructions
+
+class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs), // no result register
+ (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), // one address, one data operand of class rc
+ "$addr, $data0$offset$gds">,
+ AtomicNoRet<opName, 0> {
+
+ let has_data1 = 0; // only $data0 is present, so the data1 field is encoded as 0
+ let has_vdst = 0; // (outs) is empty, so the vdst field is encoded as 0
+}
+
+class DS_1A_Off8_NORET<string opName> : DS_Pseudo<opName,
+ (outs), // no result register
+ (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds), // address plus two separate 8-bit offsets
+ "$addr $offset0$offset1$gds"> {
+
+ let has_data0 = 0; // no data operands at all
+ let has_data1 = 0;
+ let has_vdst = 0; // (outs) is empty
+ let has_offset = 0; // no combined "offset"; offset0/offset1 are operands in their own right
+ let AsmMatchConverter = "cvtDSOffset01"; // replaces the DS_Pseudo default "cvtDS"
+}
+
+class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs), // no result register
+ (ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds), // one address, two data operands of class rc
+ "$addr, $data0, $data1$offset$gds">,
+ AtomicNoRet<opName, 0> {
+
+ let has_vdst = 0; // (outs) is empty, so the vdst field is encoded as 0
+}
+
+class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs), // no result register
+ (ins VGPR_32:$addr, rc:$data0, rc:$data1, // one address, two data operands of class rc
+ offset0:$offset0, offset1:$offset1, gds:$gds), // two separate 8-bit offsets instead of one "offset"
+ "$addr, $data0, $data1$offset0$offset1$gds"> {
+
+ let has_vdst = 0; // (outs) is empty
+ let has_offset = 0; // no combined "offset"; offset0/offset1 are operands in their own right
+ let AsmMatchConverter = "cvtDSOffset01"; // replaces the DS_Pseudo default "cvtDS"
+}
+
+class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs rc:$vdst), // result register of class rc
+ (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), // one address, one data operand of class rc
+ "$vdst, $addr, $data0$offset$gds"> {
+
+ let hasPostISelHook = 1; // NOTE(review): the hook itself is implemented outside this file
+ let has_data1 = 0; // only $data0 is present, so the data1 field is encoded as 0
+}
+
+class DS_1A2D_RET<string opName,
+ RegisterClass rc = VGPR_32, // class of the result register
+ RegisterClass src = rc> // class of both data operands; defaults to the result class
+: DS_Pseudo<opName,
+ (outs rc:$vdst),
+ (ins VGPR_32:$addr, src:$data0, src:$data1, offset:$offset, gds:$gds),
+ "$vdst, $addr, $data0, $data1$offset$gds"> {
+
+ let hasPostISelHook = 1; // NOTE(review): the hook itself is implemented outside this file
+}
+
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs rc:$vdst), // result register of class rc
+ (ins VGPR_32:$addr, offset:$offset, gds:$gds), // address only, no data operands
+ "$vdst, $addr$offset$gds"> {
+
+ let has_data0 = 0; // no data operands, so both data fields are encoded as 0
+ let has_data1 = 0;
+}
+
+class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs rc:$vdst), // result register of class rc
+ (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds), // address plus two separate 8-bit offsets
+ "$vdst, $addr$offset0$offset1$gds"> {
+
+ let has_offset = 0; // no combined "offset"; offset0/offset1 are operands in their own right
+ let has_data0 = 0; // no data operands
+ let has_data1 = 0;
+ let AsmMatchConverter = "cvtDSOffset01"; // replaces the DS_Pseudo default "cvtDS"
+}
+
+class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
+ (outs VGPR_32:$vdst),
+ (ins VGPR_32:$addr, offset:$offset), // note: no gds operand
+ "$vdst, $addr$offset gds"> { // "gds" is printed as fixed text rather than from an operand
+
+ let has_data0 = 0; // no data operands
+ let has_data1 = 0;
+ let has_gds = 0; // no gds operand, so the encoding uses gdsValue instead
+ let gdsValue = 1; // the gds bit is always encoded as 1
+}
+
+class DS_0A_RET <string opName> : DS_Pseudo<opName,
+ (outs VGPR_32:$vdst),
+ (ins offset:$offset, gds:$gds), // no address and no data operands
+ "$vdst$offset$gds"> {
+
+ let mayLoad = 1; // same value as the DS_Pseudo default, restated explicitly
+ let mayStore = 1; // same value as the DS_Pseudo default, restated explicitly
+
+ let has_addr = 0; // no $addr operand, so the addr field is encoded as 0
+ let has_data0 = 0;
+ let has_data1 = 0;
+}
+
+class DS_1A <string opName> : DS_Pseudo<opName,
+ (outs), // no result register
+ (ins VGPR_32:$addr, offset:$offset, gds:$gds), // address only, no data operands
+ "$addr$offset$gds"> {
+
+ let mayLoad = 1; // same value as the DS_Pseudo default, restated explicitly
+ let mayStore = 1; // same value as the DS_Pseudo default, restated explicitly
+
+ let has_vdst = 0; // (outs) is empty
+ let has_data0 = 0;
+ let has_data1 = 0;
+}
+
+class DS_1A_GDS <string opName> : DS_Pseudo<opName,
+ (outs), // no result register
+ (ins VGPR_32:$addr), // the address is the only operand
+ "$addr gds"> { // "gds" is printed as fixed text rather than from an operand
+
+ let has_vdst = 0;
+ let has_data0 = 0;
+ let has_data1 = 0;
+ let has_offset = 0; // no offset operand of any kind ...
+ let has_offset0 = 0; // ... so both offset fields are encoded as 0
+ let has_offset1 = 0;
+
+ let has_gds = 0; // no gds operand, so the encoding uses gdsValue instead
+ let gdsValue = 1; // the gds bit is always encoded as 1
+}
+
+class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
+: DS_Pseudo<opName,
+ (outs VGPR_32:$vdst),
+ (ins VGPR_32:$addr, VGPR_32:$data0, offset:$offset), // note: no gds operand
+ "$vdst, $addr, $data0$offset",
+ [(set i32:$vdst, // selection pattern: node applied to the (addr, offset) pair and the data value
+ (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
+
+ let mayLoad = 0; // overrides the DS_Pseudo default of 1
+ let mayStore = 0; // overrides the DS_Pseudo default of 1
+ let isConvergent = 1;
+
+ let has_data1 = 0; // only $data0 is present
+ let has_gds = 0; // no gds operand; the encoding uses gdsValue, which stays at its default of 0
+}
+
+def DS_ADD_U32 : DS_1A1D_NORET<"ds_add_u32">;
+def DS_SUB_U32 : DS_1A1D_NORET<"ds_sub_u32">;
+def DS_RSUB_U32 : DS_1A1D_NORET<"ds_rsub_u32">;
+def DS_INC_U32 : DS_1A1D_NORET<"ds_inc_u32">;
+def DS_DEC_U32 : DS_1A1D_NORET<"ds_dec_u32">;
+def DS_MIN_I32 : DS_1A1D_NORET<"ds_min_i32">;
+def DS_MAX_I32 : DS_1A1D_NORET<"ds_max_i32">;
+def DS_MIN_U32 : DS_1A1D_NORET<"ds_min_u32">;
+def DS_MAX_U32 : DS_1A1D_NORET<"ds_max_u32">;
+def DS_AND_B32 : DS_1A1D_NORET<"ds_and_b32">;
+def DS_OR_B32 : DS_1A1D_NORET<"ds_or_b32">;
+def DS_XOR_B32 : DS_1A1D_NORET<"ds_xor_b32">;
+def DS_ADD_F32 : DS_1A1D_NORET<"ds_add_f32">;
+def DS_MIN_F32 : DS_1A1D_NORET<"ds_min_f32">;
+def DS_MAX_F32 : DS_1A1D_NORET<"ds_max_f32">;
+
+let mayLoad = 0 in {
+def DS_WRITE_B8 : DS_1A1D_NORET<"ds_write_b8">;
+def DS_WRITE_B16 : DS_1A1D_NORET<"ds_write_b16">;
+def DS_WRITE_B32 : DS_1A1D_NORET<"ds_write_b32">;
+def DS_WRITE2_B32 : DS_1A2D_Off8_NORET<"ds_write2_b32">;
+def DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET<"ds_write2st64_b32">;
+}
+
+def DS_MSKOR_B32 : DS_1A2D_NORET<"ds_mskor_b32">;
+def DS_CMPST_B32 : DS_1A2D_NORET<"ds_cmpst_b32">;
+def DS_CMPST_F32 : DS_1A2D_NORET<"ds_cmpst_f32">;
+
+def DS_ADD_U64 : DS_1A1D_NORET<"ds_add_u64", VReg_64>;
+def DS_SUB_U64 : DS_1A1D_NORET<"ds_sub_u64", VReg_64>;
+def DS_RSUB_U64 : DS_1A1D_NORET<"ds_rsub_u64", VReg_64>;
+def DS_INC_U64 : DS_1A1D_NORET<"ds_inc_u64", VReg_64>;
+def DS_DEC_U64 : DS_1A1D_NORET<"ds_dec_u64", VReg_64>;
+def DS_MIN_I64 : DS_1A1D_NORET<"ds_min_i64", VReg_64>;
+def DS_MAX_I64 : DS_1A1D_NORET<"ds_max_i64", VReg_64>;
+def DS_MIN_U64 : DS_1A1D_NORET<"ds_min_u64", VReg_64>;
+def DS_MAX_U64 : DS_1A1D_NORET<"ds_max_u64", VReg_64>;
+def DS_AND_B64 : DS_1A1D_NORET<"ds_and_b64", VReg_64>;
+def DS_OR_B64 : DS_1A1D_NORET<"ds_or_b64", VReg_64>;
+def DS_XOR_B64 : DS_1A1D_NORET<"ds_xor_b64", VReg_64>;
+def DS_MSKOR_B64 : DS_1A2D_NORET<"ds_mskor_b64", VReg_64>;
+let mayLoad = 0 in {
+def DS_WRITE_B64 : DS_1A1D_NORET<"ds_write_b64", VReg_64>;
+def DS_WRITE2_B64 : DS_1A2D_Off8_NORET<"ds_write2_b64", VReg_64>;
+def DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET<"ds_write2st64_b64", VReg_64>;
+}
+def DS_CMPST_B64 : DS_1A2D_NORET<"ds_cmpst_b64", VReg_64>;
+def DS_CMPST_F64 : DS_1A2D_NORET<"ds_cmpst_f64", VReg_64>;
+def DS_MIN_F64 : DS_1A1D_NORET<"ds_min_f64", VReg_64>;
+def DS_MAX_F64 : DS_1A1D_NORET<"ds_max_f64", VReg_64>;
+
+def DS_ADD_RTN_U32 : DS_1A1D_RET<"ds_add_rtn_u32">,
+ AtomicNoRet<"ds_add_u32", 1>;
+def DS_ADD_RTN_F32 : DS_1A1D_RET<"ds_add_rtn_f32">,
+ AtomicNoRet<"ds_add_f32", 1>;
+def DS_SUB_RTN_U32 : DS_1A1D_RET<"ds_sub_rtn_u32">,
+ AtomicNoRet<"ds_sub_u32", 1>;
+def DS_RSUB_RTN_U32 : DS_1A1D_RET<"ds_rsub_rtn_u32">,
+ AtomicNoRet<"ds_rsub_u32", 1>;
+def DS_INC_RTN_U32 : DS_1A1D_RET<"ds_inc_rtn_u32">,
+ AtomicNoRet<"ds_inc_u32", 1>;
+def DS_DEC_RTN_U32 : DS_1A1D_RET<"ds_dec_rtn_u32">,
+ AtomicNoRet<"ds_dec_u32", 1>;
+def DS_MIN_RTN_I32 : DS_1A1D_RET<"ds_min_rtn_i32">,
+ AtomicNoRet<"ds_min_i32", 1>;
+def DS_MAX_RTN_I32 : DS_1A1D_RET<"ds_max_rtn_i32">,
+ AtomicNoRet<"ds_max_i32", 1>;
+def DS_MIN_RTN_U32 : DS_1A1D_RET<"ds_min_rtn_u32">,
+ AtomicNoRet<"ds_min_u32", 1>;
+def DS_MAX_RTN_U32 : DS_1A1D_RET<"ds_max_rtn_u32">,
+ AtomicNoRet<"ds_max_u32", 1>;
+def DS_AND_RTN_B32 : DS_1A1D_RET<"ds_and_rtn_b32">,
+ AtomicNoRet<"ds_and_b32", 1>;
+def DS_OR_RTN_B32 : DS_1A1D_RET<"ds_or_rtn_b32">,
+ AtomicNoRet<"ds_or_b32", 1>;
+def DS_XOR_RTN_B32 : DS_1A1D_RET<"ds_xor_rtn_b32">,
+ AtomicNoRet<"ds_xor_b32", 1>;
+def DS_MSKOR_RTN_B32 : DS_1A2D_RET<"ds_mskor_rtn_b32">,
+ AtomicNoRet<"ds_mskor_b32", 1>;
+def DS_CMPST_RTN_B32 : DS_1A2D_RET <"ds_cmpst_rtn_b32">,
+ AtomicNoRet<"ds_cmpst_b32", 1>;
+def DS_CMPST_RTN_F32 : DS_1A2D_RET <"ds_cmpst_rtn_f32">,
+ AtomicNoRet<"ds_cmpst_f32", 1>;
+def DS_MIN_RTN_F32 : DS_1A1D_RET <"ds_min_rtn_f32">,
+ AtomicNoRet<"ds_min_f32", 1>;
+def DS_MAX_RTN_F32 : DS_1A1D_RET <"ds_max_rtn_f32">,
+ AtomicNoRet<"ds_max_f32", 1>;
+
+def DS_WRXCHG_RTN_B32 : DS_1A1D_RET<"ds_wrxchg_rtn_b32">,
+ AtomicNoRet<"", 1>;
+def DS_WRXCHG2_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
+ AtomicNoRet<"", 1>;
+def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
+ AtomicNoRet<"", 1>;
+
+def DS_ADD_RTN_U64 : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_add_u64", 1>;
+def DS_SUB_RTN_U64 : DS_1A1D_RET<"ds_sub_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_sub_u64", 1>;
+def DS_RSUB_RTN_U64 : DS_1A1D_RET<"ds_rsub_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_rsub_u64", 1>;
+def DS_INC_RTN_U64 : DS_1A1D_RET<"ds_inc_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_inc_u64", 1>;
+def DS_DEC_RTN_U64 : DS_1A1D_RET<"ds_dec_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_dec_u64", 1>;
+def DS_MIN_RTN_I64 : DS_1A1D_RET<"ds_min_rtn_i64", VReg_64>,
+ AtomicNoRet<"ds_min_i64", 1>;
+def DS_MAX_RTN_I64 : DS_1A1D_RET<"ds_max_rtn_i64", VReg_64>,
+ AtomicNoRet<"ds_max_i64", 1>;
+def DS_MIN_RTN_U64 : DS_1A1D_RET<"ds_min_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_min_u64", 1>;
+def DS_MAX_RTN_U64 : DS_1A1D_RET<"ds_max_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_max_u64", 1>;
+def DS_AND_RTN_B64 : DS_1A1D_RET<"ds_and_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_and_b64", 1>;
+def DS_OR_RTN_B64 : DS_1A1D_RET<"ds_or_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_or_b64", 1>;
+def DS_XOR_RTN_B64 : DS_1A1D_RET<"ds_xor_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_xor_b64", 1>;
+def DS_MSKOR_RTN_B64 : DS_1A2D_RET<"ds_mskor_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_mskor_b64", 1>;
+def DS_CMPST_RTN_B64 : DS_1A2D_RET<"ds_cmpst_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_cmpst_b64", 1>;
+def DS_CMPST_RTN_F64 : DS_1A2D_RET<"ds_cmpst_rtn_f64", VReg_64>,
+ AtomicNoRet<"ds_cmpst_f64", 1>;
+def DS_MIN_RTN_F64 : DS_1A1D_RET<"ds_min_rtn_f64", VReg_64>,
+ AtomicNoRet<"ds_min_f64", 1>;
+def DS_MAX_RTN_F64 : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>,
+ AtomicNoRet<"ds_max_f64", 1>;
+
+def DS_WRXCHG_RTN_B64 : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_wrxchg_b64", 1>;
+def DS_WRXCHG2_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
+ AtomicNoRet<"ds_wrxchg2_b64", 1>;
+def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
+ AtomicNoRet<"ds_wrxchg2st64_b64", 1>;
+
+def DS_GWS_INIT : DS_1A_GDS<"ds_gws_init">;
+def DS_GWS_SEMA_V : DS_1A_GDS<"ds_gws_sema_v">;
+def DS_GWS_SEMA_BR : DS_1A_GDS<"ds_gws_sema_br">;
+def DS_GWS_SEMA_P : DS_1A_GDS<"ds_gws_sema_p">;
+def DS_GWS_BARRIER : DS_1A_GDS<"ds_gws_barrier">;
+
+def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
+def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
+def DS_RSUB_SRC2_U32 : DS_1A<"ds_rsub_src2_u32">;
+def DS_INC_SRC2_U32 : DS_1A<"ds_inc_src2_u32">;
+def DS_DEC_SRC2_U32 : DS_1A<"ds_dec_src2_u32">;
+def DS_MIN_SRC2_I32 : DS_1A<"ds_min_src2_i32">;
+def DS_MAX_SRC2_I32 : DS_1A<"ds_max_src2_i32">;
+def DS_MIN_SRC2_U32 : DS_1A<"ds_min_src2_u32">;
+def DS_MAX_SRC2_U32 : DS_1A<"ds_max_src2_u32">;
+def DS_AND_SRC2_B32 : DS_1A<"ds_and_src2_b32">;
+def DS_OR_SRC2_B32 : DS_1A<"ds_or_src2_b32">;
+def DS_XOR_SRC2_B32 : DS_1A<"ds_xor_src2_b32">;
+def DS_MIN_SRC2_F32 : DS_1A<"ds_min_src2_f32">;
+def DS_MAX_SRC2_F32 : DS_1A<"ds_max_src2_f32">;
+
+def DS_ADD_SRC2_U64 : DS_1A<"ds_add_src2_u64">;
+def DS_SUB_SRC2_U64 : DS_1A<"ds_sub_src2_u64">;
+def DS_RSUB_SRC2_U64 : DS_1A<"ds_rsub_src2_u64">;
+def DS_INC_SRC2_U64 : DS_1A<"ds_inc_src2_u64">;
+def DS_DEC_SRC2_U64 : DS_1A<"ds_dec_src2_u64">;
+def DS_MIN_SRC2_I64 : DS_1A<"ds_min_src2_i64">;
+def DS_MAX_SRC2_I64 : DS_1A<"ds_max_src2_i64">;
+def DS_MIN_SRC2_U64 : DS_1A<"ds_min_src2_u64">;
+def DS_MAX_SRC2_U64 : DS_1A<"ds_max_src2_u64">;
+def DS_AND_SRC2_B64 : DS_1A<"ds_and_src2_b64">;
+def DS_OR_SRC2_B64 : DS_1A<"ds_or_src2_b64">;
+def DS_XOR_SRC2_B64 : DS_1A<"ds_xor_src2_b64">;
+def DS_MIN_SRC2_F64 : DS_1A<"ds_min_src2_f64">;
+def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">;
+
+def DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET<"ds_write_src2_b32">;
+def DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET<"ds_write_src2_b64">;
+
+let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">;
+}
+
+let mayStore = 0 in {
+def DS_READ_I8 : DS_1A_RET<"ds_read_i8">;
+def DS_READ_U8 : DS_1A_RET<"ds_read_u8">;
+def DS_READ_I16 : DS_1A_RET<"ds_read_i16">;
+def DS_READ_U16 : DS_1A_RET<"ds_read_u16">;
+def DS_READ_B32 : DS_1A_RET<"ds_read_b32">;
+def DS_READ_B64 : DS_1A_RET<"ds_read_b64", VReg_64>;
+
+def DS_READ2_B32 : DS_1A_Off8_RET<"ds_read2_b32", VReg_64>;
+def DS_READ2ST64_B32 : DS_1A_Off8_RET<"ds_read2st64_b32", VReg_64>;
+
+def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>;
+def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
+}
+
+let SubtargetPredicate = isSICI in {
+def DS_CONSUME : DS_0A_RET<"ds_consume">;
+def DS_APPEND : DS_0A_RET<"ds_append">;
+def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// DS_NOP
+// DS_GWS_SEMA_RELEASE_ALL
+// DS_WRAP_RTN_B32
+// DS_CONDXCHG32_RTN_B64
+// DS_WRITE_B96
+// DS_WRITE_B128
+// DS_CONDXCHG32_RTN_B128
+// DS_READ_B96
+// DS_READ_B128
+
+let SubtargetPredicate = isCIVI in {
+
+def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">,
+ AtomicNoRet<"ds_wrap_f32", 1>;
+
+} // let SubtargetPredicate = isCIVI
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for VI and newer.
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isVI in {
+
+let Uses = [EXEC] in {
+def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32",
+ int_amdgcn_ds_permute>;
+def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
+ int_amdgcn_ds_bpermute>;
+}
+
+} // let SubtargetPredicate = isVI
+
+//===----------------------------------------------------------------------===//
+// DS Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isGCN] in {
+
+def : Pat <
+ (int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
+ (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
+>;
+
+class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+ (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), // source: load fragment frag yielding type vt from a ($ptr, $offset) address
+ (inst $ptr, (as_i16imm $offset), (i1 0)) // result: offset passed through as_i16imm; the trailing (i1 0) fills the gds operand of the DS_1A_RET instructions used with this class
+>;
+
+def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>;
+def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I8, i16, si_sextload_local_i8>;
+def : DSReadPat <DS_READ_U8, i16, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
+def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; // NOTE(review): exact duplicate of the previous line; looks redundant and could be dropped
+def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
+def : DSReadPat <DS_READ_U16, i16, si_load_local>;
+def : DSReadPat <DS_READ_B32, i32, si_load_local>;
+
+let AddedComplexity = 100 in {
+
+def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>;
+
+} // End AddedComplexity = 100
+
+def : Pat <
+ (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+ i8:$offset1))),
+ (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
+>;
+
+class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+ (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), // source: store fragment frag of a vt value to a ($ptr, $offset) address
+ (inst $ptr, $value, (as_i16imm $offset), (i1 0)) // result: address first, then data; the trailing (i1 0) fills the gds operand of the DS_1A1D_NORET instructions used with this class
+>;
+
+def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
+def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
+def : DSWritePat <DS_WRITE_B8, i16, si_truncstore_local_i8>;
+def : DSWritePat <DS_WRITE_B16, i16, si_store_local>;
+def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
+
+let AddedComplexity = 100 in {
+
+def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>;
+} // End AddedComplexity = 100
+
+def : Pat <
+ (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+ i8:$offset1)),
+ (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
+ (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
+ (i1 0))
+>;
+
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), // source: atomic fragment frag on a ($ptr, $offset) address with one vt operand
+ (inst $ptr, $value, (as_i16imm $offset), (i1 0)) // result: the trailing (i1 0) fills the gds operand of the DS_1A1D_RET instructions used with this class
+>;
+
+class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), // source: fragment frag on a ($ptr, $offset) address with compare and swap values
+ (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) // result: $cmp goes to data0 and $swap to data1 of the DS_1A2D_RET instructions used with this class; (i1 0) fills gds
+>;
+
+
+// 32-bit atomics.
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
+
+// 64-bit atomics.
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>;
+
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
+
+} // let Predicates = [isGCN]
+
+//===----------------------------------------------------------------------===//
+// Real instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SIInstructions.td
+//===----------------------------------------------------------------------===//
+
+class DS_Real_si <bits<8> op, DS_Pseudo ds> :
+ DS_Real <ds>,
+ SIMCInstr <ds.Mnemonic, SIEncodingFamily.SI> { // same key string as the pseudo's SIMCInstr, with the SI encoding family
+ let AssemblerPredicates=[isSICI];
+ let DecoderNamespace="SICI";
+
+ // Encoding: a field the pseudo does not have is written as constant 0.
+ let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
+ let Inst{15-8} = !if(ds.has_offset1, offset1, 0);
+ let Inst{17} = !if(ds.has_gds, gds, ds.gdsValue); // gds operand if present, otherwise the pseudo's fixed gdsValue
+ let Inst{25-18} = op; // 8-bit opcode given by each def below
+ let Inst{31-26} = 0x36; // ds prefix
+ let Inst{39-32} = !if(ds.has_addr, addr, 0);
+ let Inst{47-40} = !if(ds.has_data0, data0, 0);
+ let Inst{55-48} = !if(ds.has_data1, data1, 0);
+ let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+}
+
+def DS_ADD_U32_si : DS_Real_si<0x0, DS_ADD_U32>;
+def DS_SUB_U32_si : DS_Real_si<0x1, DS_SUB_U32>;
+def DS_RSUB_U32_si : DS_Real_si<0x2, DS_RSUB_U32>;
+def DS_INC_U32_si : DS_Real_si<0x3, DS_INC_U32>;
+def DS_DEC_U32_si : DS_Real_si<0x4, DS_DEC_U32>;
+def DS_MIN_I32_si : DS_Real_si<0x5, DS_MIN_I32>;
+def DS_MAX_I32_si : DS_Real_si<0x6, DS_MAX_I32>;
+def DS_MIN_U32_si : DS_Real_si<0x7, DS_MIN_U32>;
+def DS_MAX_U32_si : DS_Real_si<0x8, DS_MAX_U32>;
+def DS_AND_B32_si : DS_Real_si<0x9, DS_AND_B32>;
+def DS_OR_B32_si : DS_Real_si<0xa, DS_OR_B32>;
+def DS_XOR_B32_si : DS_Real_si<0xb, DS_XOR_B32>;
+def DS_MSKOR_B32_si : DS_Real_si<0xc, DS_MSKOR_B32>;
+def DS_WRITE_B32_si : DS_Real_si<0xd, DS_WRITE_B32>;
+def DS_WRITE2_B32_si : DS_Real_si<0xe, DS_WRITE2_B32>;
+def DS_WRITE2ST64_B32_si : DS_Real_si<0xf, DS_WRITE2ST64_B32>;
+def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>;
+def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>;
+def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>;
+def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>;
+def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>;
+def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>;
+def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>;
+def DS_GWS_SEMA_P_si : DS_Real_si<0x1c, DS_GWS_SEMA_P>;
+def DS_GWS_BARRIER_si : DS_Real_si<0x1d, DS_GWS_BARRIER>;
+def DS_WRITE_B8_si : DS_Real_si<0x1e, DS_WRITE_B8>;
+def DS_WRITE_B16_si : DS_Real_si<0x1f, DS_WRITE_B16>;
+def DS_ADD_RTN_U32_si : DS_Real_si<0x20, DS_ADD_RTN_U32>;
+def DS_SUB_RTN_U32_si : DS_Real_si<0x21, DS_SUB_RTN_U32>;
+def DS_RSUB_RTN_U32_si : DS_Real_si<0x22, DS_RSUB_RTN_U32>;
+def DS_INC_RTN_U32_si : DS_Real_si<0x23, DS_INC_RTN_U32>;
+def DS_DEC_RTN_U32_si : DS_Real_si<0x24, DS_DEC_RTN_U32>;
+def DS_MIN_RTN_I32_si : DS_Real_si<0x25, DS_MIN_RTN_I32>;
+def DS_MAX_RTN_I32_si : DS_Real_si<0x26, DS_MAX_RTN_I32>;
+def DS_MIN_RTN_U32_si : DS_Real_si<0x27, DS_MIN_RTN_U32>;
+def DS_MAX_RTN_U32_si : DS_Real_si<0x28, DS_MAX_RTN_U32>;
+def DS_AND_RTN_B32_si : DS_Real_si<0x29, DS_AND_RTN_B32>;
+def DS_OR_RTN_B32_si : DS_Real_si<0x2a, DS_OR_RTN_B32>;
+def DS_XOR_RTN_B32_si : DS_Real_si<0x2b, DS_XOR_RTN_B32>;
+def DS_MSKOR_RTN_B32_si : DS_Real_si<0x2c, DS_MSKOR_RTN_B32>;
+def DS_WRXCHG_RTN_B32_si : DS_Real_si<0x2d, DS_WRXCHG_RTN_B32>;
+def DS_WRXCHG2_RTN_B32_si : DS_Real_si<0x2e, DS_WRXCHG2_RTN_B32>;
+def DS_WRXCHG2ST64_RTN_B32_si : DS_Real_si<0x2f, DS_WRXCHG2ST64_RTN_B32>;
+def DS_CMPST_RTN_B32_si : DS_Real_si<0x30, DS_CMPST_RTN_B32>;
+def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>;
+def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>;
+def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>;
+
+// FIXME: this instruction is actually CI/VI
+def DS_WRAP_RTN_F32_si : DS_Real_si<0x34, DS_WRAP_RTN_F32>;
+
+def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>;
+def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>;
+def DS_READ2_B32_si : DS_Real_si<0x37, DS_READ2_B32>;
+def DS_READ2ST64_B32_si : DS_Real_si<0x38, DS_READ2ST64_B32>;
+def DS_READ_I8_si : DS_Real_si<0x39, DS_READ_I8>;
+def DS_READ_U8_si : DS_Real_si<0x3a, DS_READ_U8>;
+def DS_READ_I16_si : DS_Real_si<0x3b, DS_READ_I16>;
+def DS_READ_U16_si : DS_Real_si<0x3c, DS_READ_U16>;
+def DS_CONSUME_si : DS_Real_si<0x3d, DS_CONSUME>;
+def DS_APPEND_si : DS_Real_si<0x3e, DS_APPEND>;
+def DS_ORDERED_COUNT_si : DS_Real_si<0x3f, DS_ORDERED_COUNT>;
+def DS_ADD_U64_si : DS_Real_si<0x40, DS_ADD_U64>;
+def DS_SUB_U64_si : DS_Real_si<0x41, DS_SUB_U64>;
+def DS_RSUB_U64_si : DS_Real_si<0x42, DS_RSUB_U64>;
+def DS_INC_U64_si : DS_Real_si<0x43, DS_INC_U64>;
+def DS_DEC_U64_si : DS_Real_si<0x44, DS_DEC_U64>;
+def DS_MIN_I64_si : DS_Real_si<0x45, DS_MIN_I64>;
+def DS_MAX_I64_si : DS_Real_si<0x46, DS_MAX_I64>;
+def DS_MIN_U64_si : DS_Real_si<0x47, DS_MIN_U64>;
+def DS_MAX_U64_si : DS_Real_si<0x48, DS_MAX_U64>;
+def DS_AND_B64_si : DS_Real_si<0x49, DS_AND_B64>;
+def DS_OR_B64_si : DS_Real_si<0x4a, DS_OR_B64>;
+def DS_XOR_B64_si : DS_Real_si<0x4b, DS_XOR_B64>;
+def DS_MSKOR_B64_si : DS_Real_si<0x4c, DS_MSKOR_B64>;
+def DS_WRITE_B64_si : DS_Real_si<0x4d, DS_WRITE_B64>;
+def DS_WRITE2_B64_si : DS_Real_si<0x4E, DS_WRITE2_B64>;
+def DS_WRITE2ST64_B64_si : DS_Real_si<0x4f, DS_WRITE2ST64_B64>;
+def DS_CMPST_B64_si : DS_Real_si<0x50, DS_CMPST_B64>;
+def DS_CMPST_F64_si : DS_Real_si<0x51, DS_CMPST_F64>;
+def DS_MIN_F64_si : DS_Real_si<0x52, DS_MIN_F64>;
+def DS_MAX_F64_si : DS_Real_si<0x53, DS_MAX_F64>;
+
+def DS_ADD_RTN_U64_si : DS_Real_si<0x60, DS_ADD_RTN_U64>;
+def DS_SUB_RTN_U64_si : DS_Real_si<0x61, DS_SUB_RTN_U64>;
+def DS_RSUB_RTN_U64_si : DS_Real_si<0x62, DS_RSUB_RTN_U64>;
+def DS_INC_RTN_U64_si : DS_Real_si<0x63, DS_INC_RTN_U64>;
+def DS_DEC_RTN_U64_si : DS_Real_si<0x64, DS_DEC_RTN_U64>;
+def DS_MIN_RTN_I64_si : DS_Real_si<0x65, DS_MIN_RTN_I64>;
+def DS_MAX_RTN_I64_si : DS_Real_si<0x66, DS_MAX_RTN_I64>;
+def DS_MIN_RTN_U64_si : DS_Real_si<0x67, DS_MIN_RTN_U64>;
+def DS_MAX_RTN_U64_si : DS_Real_si<0x68, DS_MAX_RTN_U64>;
+def DS_AND_RTN_B64_si : DS_Real_si<0x69, DS_AND_RTN_B64>;
+def DS_OR_RTN_B64_si : DS_Real_si<0x6a, DS_OR_RTN_B64>;
+def DS_XOR_RTN_B64_si : DS_Real_si<0x6b, DS_XOR_RTN_B64>;
+def DS_MSKOR_RTN_B64_si : DS_Real_si<0x6c, DS_MSKOR_RTN_B64>;
+def DS_WRXCHG_RTN_B64_si : DS_Real_si<0x6d, DS_WRXCHG_RTN_B64>;
+def DS_WRXCHG2_RTN_B64_si : DS_Real_si<0x6e, DS_WRXCHG2_RTN_B64>;
+def DS_WRXCHG2ST64_RTN_B64_si : DS_Real_si<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+def DS_CMPST_RTN_B64_si : DS_Real_si<0x70, DS_CMPST_RTN_B64>;
+def DS_CMPST_RTN_F64_si : DS_Real_si<0x71, DS_CMPST_RTN_F64>;
+def DS_MIN_RTN_F64_si : DS_Real_si<0x72, DS_MIN_RTN_F64>;
+def DS_MAX_RTN_F64_si : DS_Real_si<0x73, DS_MAX_RTN_F64>;
+
+def DS_READ_B64_si : DS_Real_si<0x76, DS_READ_B64>;
+def DS_READ2_B64_si : DS_Real_si<0x77, DS_READ2_B64>;
+def DS_READ2ST64_B64_si : DS_Real_si<0x78, DS_READ2ST64_B64>;
+
+def DS_ADD_SRC2_U32_si : DS_Real_si<0x80, DS_ADD_SRC2_U32>;
+def DS_SUB_SRC2_U32_si : DS_Real_si<0x81, DS_SUB_SRC2_U32>;
+def DS_RSUB_SRC2_U32_si : DS_Real_si<0x82, DS_RSUB_SRC2_U32>;
+def DS_INC_SRC2_U32_si : DS_Real_si<0x83, DS_INC_SRC2_U32>;
+def DS_DEC_SRC2_U32_si : DS_Real_si<0x84, DS_DEC_SRC2_U32>;
+def DS_MIN_SRC2_I32_si : DS_Real_si<0x85, DS_MIN_SRC2_I32>;
+def DS_MAX_SRC2_I32_si : DS_Real_si<0x86, DS_MAX_SRC2_I32>;
+def DS_MIN_SRC2_U32_si : DS_Real_si<0x87, DS_MIN_SRC2_U32>;
+def DS_MAX_SRC2_U32_si : DS_Real_si<0x88, DS_MAX_SRC2_U32>;
+def DS_AND_SRC2_B32_si : DS_Real_si<0x89, DS_AND_SRC2_B32>;
+def DS_OR_SRC2_B32_si : DS_Real_si<0x8a, DS_OR_SRC2_B32>;
+def DS_XOR_SRC2_B32_si : DS_Real_si<0x8b, DS_XOR_SRC2_B32>;
+def DS_WRITE_SRC2_B32_si : DS_Real_si<0x8d, DS_WRITE_SRC2_B32>;
+
+def DS_MIN_SRC2_F32_si : DS_Real_si<0x92, DS_MIN_SRC2_F32>;
+def DS_MAX_SRC2_F32_si : DS_Real_si<0x93, DS_MAX_SRC2_F32>;
+
+def DS_ADD_SRC2_U64_si : DS_Real_si<0xc0, DS_ADD_SRC2_U64>;
+def DS_SUB_SRC2_U64_si : DS_Real_si<0xc1, DS_SUB_SRC2_U64>;
+def DS_RSUB_SRC2_U64_si : DS_Real_si<0xc2, DS_RSUB_SRC2_U64>;
+def DS_INC_SRC2_U64_si : DS_Real_si<0xc3, DS_INC_SRC2_U64>;
+def DS_DEC_SRC2_U64_si : DS_Real_si<0xc4, DS_DEC_SRC2_U64>;
+def DS_MIN_SRC2_I64_si : DS_Real_si<0xc5, DS_MIN_SRC2_I64>;
+def DS_MAX_SRC2_I64_si : DS_Real_si<0xc6, DS_MAX_SRC2_I64>;
+def DS_MIN_SRC2_U64_si : DS_Real_si<0xc7, DS_MIN_SRC2_U64>;
+def DS_MAX_SRC2_U64_si : DS_Real_si<0xc8, DS_MAX_SRC2_U64>;
+def DS_AND_SRC2_B64_si : DS_Real_si<0xc9, DS_AND_SRC2_B64>;
+def DS_OR_SRC2_B64_si : DS_Real_si<0xca, DS_OR_SRC2_B64>;
+def DS_XOR_SRC2_B64_si : DS_Real_si<0xcb, DS_XOR_SRC2_B64>;
+def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>;
+
+def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>;
+def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>;
+
+//===----------------------------------------------------------------------===//
+// VIInstructions.td
+//===----------------------------------------------------------------------===//
+
+class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
+ DS_Real <ds>,
+ SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> { // same key string as the pseudo's SIMCInstr, with the VI encoding family
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace="VI";
+
+ // Encoding: a field the pseudo does not have is written as constant 0.
+ let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
+ let Inst{15-8} = !if(ds.has_offset1, offset1, 0);
+ let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue); // gds operand if present, otherwise the pseudo's fixed gdsValue (bit 16 here, bit 17 in DS_Real_si)
+ let Inst{24-17} = op; // NOTE(review): Inst{25} is not assigned in this class; presumably a reserved bit, confirm against the VI ISA encoding
+ let Inst{31-26} = 0x36; // ds prefix
+ let Inst{39-32} = !if(ds.has_addr, addr, 0);
+ let Inst{47-40} = !if(ds.has_data0, data0, 0);
+ let Inst{55-48} = !if(ds.has_data1, data1, 0);
+ let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+}
+
+def DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>;
+def DS_SUB_U32_vi : DS_Real_vi<0x1, DS_SUB_U32>;
+def DS_RSUB_U32_vi : DS_Real_vi<0x2, DS_RSUB_U32>;
+def DS_INC_U32_vi : DS_Real_vi<0x3, DS_INC_U32>;
+def DS_DEC_U32_vi : DS_Real_vi<0x4, DS_DEC_U32>;
+def DS_MIN_I32_vi : DS_Real_vi<0x5, DS_MIN_I32>;
+def DS_MAX_I32_vi : DS_Real_vi<0x6, DS_MAX_I32>;
+def DS_MIN_U32_vi : DS_Real_vi<0x7, DS_MIN_U32>;
+def DS_MAX_U32_vi : DS_Real_vi<0x8, DS_MAX_U32>;
+def DS_AND_B32_vi : DS_Real_vi<0x9, DS_AND_B32>;
+def DS_OR_B32_vi : DS_Real_vi<0xa, DS_OR_B32>;
+def DS_XOR_B32_vi : DS_Real_vi<0xb, DS_XOR_B32>;
+def DS_MSKOR_B32_vi : DS_Real_vi<0xc, DS_MSKOR_B32>;
+def DS_WRITE_B32_vi : DS_Real_vi<0xd, DS_WRITE_B32>;
+def DS_WRITE2_B32_vi : DS_Real_vi<0xe, DS_WRITE2_B32>;
+def DS_WRITE2ST64_B32_vi : DS_Real_vi<0xf, DS_WRITE2ST64_B32>;
+def DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>;
+def DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>;
+def DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>;
+def DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>;
+def DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>;
+def DS_GWS_INIT_vi : DS_Real_vi<0x19, DS_GWS_INIT>;
+def DS_GWS_SEMA_V_vi : DS_Real_vi<0x1a, DS_GWS_SEMA_V>;
+def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x1b, DS_GWS_SEMA_BR>;
+def DS_GWS_SEMA_P_vi : DS_Real_vi<0x1c, DS_GWS_SEMA_P>;
+def DS_GWS_BARRIER_vi : DS_Real_vi<0x1d, DS_GWS_BARRIER>;
+def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>;
+def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>;
+def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
+def DS_SUB_RTN_U32_vi : DS_Real_vi<0x21, DS_SUB_RTN_U32>;
+def DS_RSUB_RTN_U32_vi : DS_Real_vi<0x22, DS_RSUB_RTN_U32>;
+def DS_INC_RTN_U32_vi : DS_Real_vi<0x23, DS_INC_RTN_U32>;
+def DS_DEC_RTN_U32_vi : DS_Real_vi<0x24, DS_DEC_RTN_U32>;
+def DS_MIN_RTN_I32_vi : DS_Real_vi<0x25, DS_MIN_RTN_I32>;
+def DS_MAX_RTN_I32_vi : DS_Real_vi<0x26, DS_MAX_RTN_I32>;
+def DS_MIN_RTN_U32_vi : DS_Real_vi<0x27, DS_MIN_RTN_U32>;
+def DS_MAX_RTN_U32_vi : DS_Real_vi<0x28, DS_MAX_RTN_U32>;
+def DS_AND_RTN_B32_vi : DS_Real_vi<0x29, DS_AND_RTN_B32>;
+def DS_OR_RTN_B32_vi : DS_Real_vi<0x2a, DS_OR_RTN_B32>;
+def DS_XOR_RTN_B32_vi : DS_Real_vi<0x2b, DS_XOR_RTN_B32>;
+def DS_MSKOR_RTN_B32_vi : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>;
+def DS_WRXCHG_RTN_B32_vi : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>;
+def DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>;
+def DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>;
+def DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
+def DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
+def DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
+def DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
+def DS_WRAP_RTN_F32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_F32>;
+def DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
+def DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>;
+def DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>;
+def DS_READ2ST64_B32_vi : DS_Real_vi<0x38, DS_READ2ST64_B32>;
+def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>;
+def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>;
+def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>;
+def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>;
+def DS_SWIZZLE_B32_vi : DS_Real_vi<0x3d, DS_SWIZZLE_B32>;
+def DS_PERMUTE_B32_vi : DS_Real_vi<0x3e, DS_PERMUTE_B32>;
+def DS_BPERMUTE_B32_vi : DS_Real_vi<0x3f, DS_BPERMUTE_B32>;
+
+// DS_Real_vi records for the 64-bit DS pseudos that carry no _RTN suffix.
+// The first template argument (presumably the VI opcode - confirm against the
+// DS_Real_vi class) runs from 0x40 to 0x53 in this group.
+def DS_ADD_U64_vi : DS_Real_vi<0x40, DS_ADD_U64>;
+def DS_SUB_U64_vi : DS_Real_vi<0x41, DS_SUB_U64>;
+def DS_RSUB_U64_vi : DS_Real_vi<0x42, DS_RSUB_U64>;
+def DS_INC_U64_vi : DS_Real_vi<0x43, DS_INC_U64>;
+def DS_DEC_U64_vi : DS_Real_vi<0x44, DS_DEC_U64>;
+def DS_MIN_I64_vi : DS_Real_vi<0x45, DS_MIN_I64>;
+def DS_MAX_I64_vi : DS_Real_vi<0x46, DS_MAX_I64>;
+def DS_MIN_U64_vi : DS_Real_vi<0x47, DS_MIN_U64>;
+def DS_MAX_U64_vi : DS_Real_vi<0x48, DS_MAX_U64>;
+def DS_AND_B64_vi : DS_Real_vi<0x49, DS_AND_B64>;
+def DS_OR_B64_vi : DS_Real_vi<0x4a, DS_OR_B64>;
+def DS_XOR_B64_vi : DS_Real_vi<0x4b, DS_XOR_B64>;
+def DS_MSKOR_B64_vi : DS_Real_vi<0x4c, DS_MSKOR_B64>;
+def DS_WRITE_B64_vi : DS_Real_vi<0x4d, DS_WRITE_B64>;
+def DS_WRITE2_B64_vi : DS_Real_vi<0x4E, DS_WRITE2_B64>;
+def DS_WRITE2ST64_B64_vi : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>;
+def DS_CMPST_B64_vi : DS_Real_vi<0x50, DS_CMPST_B64>;
+def DS_CMPST_F64_vi : DS_Real_vi<0x51, DS_CMPST_F64>;
+def DS_MIN_F64_vi : DS_Real_vi<0x52, DS_MIN_F64>;
+def DS_MAX_F64_vi : DS_Real_vi<0x53, DS_MAX_F64>;
+
+// DS_Real_vi records for the 64-bit DS pseudos with the _RTN suffix.
+// The first template argument runs from 0x60 to 0x73 in this group, i.e. each
+// value is 0x20 above the one used for the same-named record without _RTN.
+def DS_ADD_RTN_U64_vi : DS_Real_vi<0x60, DS_ADD_RTN_U64>;
+def DS_SUB_RTN_U64_vi : DS_Real_vi<0x61, DS_SUB_RTN_U64>;
+def DS_RSUB_RTN_U64_vi : DS_Real_vi<0x62, DS_RSUB_RTN_U64>;
+def DS_INC_RTN_U64_vi : DS_Real_vi<0x63, DS_INC_RTN_U64>;
+def DS_DEC_RTN_U64_vi : DS_Real_vi<0x64, DS_DEC_RTN_U64>;
+def DS_MIN_RTN_I64_vi : DS_Real_vi<0x65, DS_MIN_RTN_I64>;
+def DS_MAX_RTN_I64_vi : DS_Real_vi<0x66, DS_MAX_RTN_I64>;
+def DS_MIN_RTN_U64_vi : DS_Real_vi<0x67, DS_MIN_RTN_U64>;
+def DS_MAX_RTN_U64_vi : DS_Real_vi<0x68, DS_MAX_RTN_U64>;
+def DS_AND_RTN_B64_vi : DS_Real_vi<0x69, DS_AND_RTN_B64>;
+def DS_OR_RTN_B64_vi : DS_Real_vi<0x6a, DS_OR_RTN_B64>;
+def DS_XOR_RTN_B64_vi : DS_Real_vi<0x6b, DS_XOR_RTN_B64>;
+def DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
+def DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
+def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
+def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+def DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
+def DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
+def DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
+def DS_MAX_RTN_F64_vi : DS_Real_vi<0x73, DS_MAX_RTN_F64>;
+
+// DS_Real_vi records for the 64-bit DS read pseudos (first template argument
+// 0x76 to 0x78).
+def DS_READ_B64_vi : DS_Real_vi<0x76, DS_READ_B64>;
+def DS_READ2_B64_vi : DS_Real_vi<0x77, DS_READ2_B64>;
+def DS_READ2ST64_B64_vi : DS_Real_vi<0x78, DS_READ2ST64_B64>;
+
+// DS_Real_vi records for the _SRC2 DS pseudos.
+// 32-bit forms: first template argument 0x80 to 0x93 (with gaps).
+def DS_ADD_SRC2_U32_vi : DS_Real_vi<0x80, DS_ADD_SRC2_U32>;
+def DS_SUB_SRC2_U32_vi : DS_Real_vi<0x81, DS_SUB_SRC2_U32>;
+def DS_RSUB_SRC2_U32_vi : DS_Real_vi<0x82, DS_RSUB_SRC2_U32>;
+def DS_INC_SRC2_U32_vi : DS_Real_vi<0x83, DS_INC_SRC2_U32>;
+def DS_DEC_SRC2_U32_vi : DS_Real_vi<0x84, DS_DEC_SRC2_U32>;
+def DS_MIN_SRC2_I32_vi : DS_Real_vi<0x85, DS_MIN_SRC2_I32>;
+def DS_MAX_SRC2_I32_vi : DS_Real_vi<0x86, DS_MAX_SRC2_I32>;
+def DS_MIN_SRC2_U32_vi : DS_Real_vi<0x87, DS_MIN_SRC2_U32>;
+def DS_MAX_SRC2_U32_vi : DS_Real_vi<0x88, DS_MAX_SRC2_U32>;
+def DS_AND_SRC2_B32_vi : DS_Real_vi<0x89, DS_AND_SRC2_B32>;
+def DS_OR_SRC2_B32_vi : DS_Real_vi<0x8a, DS_OR_SRC2_B32>;
+def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>;
+def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>;
+def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>;
+def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>;
+// 64-bit forms: same ordering as the 32-bit forms above, with the first
+// template argument 0x40 higher (0xc0 to 0xd3).
+def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>;
+def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>;
+def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>;
+def DS_INC_SRC2_U64_vi : DS_Real_vi<0xc3, DS_INC_SRC2_U64>;
+def DS_DEC_SRC2_U64_vi : DS_Real_vi<0xc4, DS_DEC_SRC2_U64>;
+def DS_MIN_SRC2_I64_vi : DS_Real_vi<0xc5, DS_MIN_SRC2_I64>;
+def DS_MAX_SRC2_I64_vi : DS_Real_vi<0xc6, DS_MAX_SRC2_I64>;
+def DS_MIN_SRC2_U64_vi : DS_Real_vi<0xc7, DS_MIN_SRC2_U64>;
+def DS_MAX_SRC2_U64_vi : DS_Real_vi<0xc8, DS_MAX_SRC2_U64>;
+def DS_AND_SRC2_B64_vi : DS_Real_vi<0xc9, DS_AND_SRC2_B64>;
+def DS_OR_SRC2_B64_vi : DS_Real_vi<0xca, DS_OR_SRC2_B64>;
+def DS_XOR_SRC2_B64_vi : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>;
+def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>;
+def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>;
+def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
new file mode 100644
index 000000000000..2247cad7bb51
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -0,0 +1,609 @@
+//===-- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file contains the definition of the AMDGPU ISA disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?
+
+#include "AMDGPUDisassembler.h"
+#include "AMDGPU.h"
+#include "AMDGPURegisterInfo.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-disassembler"
+
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+
+
+/// Append \p Opnd to \p Inst and translate the operand's validity into a
+/// decode status: Success for a valid operand, SoftFail otherwise. The operand
+/// is appended in both cases.
+inline static MCDisassembler::DecodeStatus
+addOperand(MCInst &Inst, const MCOperand& Opnd) {
+  Inst.addOperand(Opnd);
+  if (!Opnd.isValid())
+    return MCDisassembler::SoftFail;
+  return MCDisassembler::Success;
+}
+
+/// Decoder callback for a branch-target immediate.
+///
+/// The target is computed as Addr + 4 + (Imm * 4 sign-extended from 18 bits).
+/// If the disassembler can express that address symbolically, the symbolic
+/// operand it added is kept; otherwise the raw \p Imm is added as an immediate.
+static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
+                                       uint64_t Addr, const void *Decoder) {
+  const auto *Disasm = static_cast<const AMDGPUDisassembler*>(Decoder);
+
+  const APInt ByteOffset(18, Imm * 4, true);
+  const int64_t Target = (ByteOffset.sext(64) + 4 + Addr).getSExtValue();
+
+  const bool Symbolized =
+      Disasm->tryAddingSymbolicOperand(Inst, Target, Addr, true, 2, 2);
+  if (!Symbolized)
+    return addOperand(Inst, MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
+// DECODE_OPERAND2(RegClass, DecName) defines the static callback
+// Decode<RegClass>RegisterClass(Inst, Imm, Addr, Decoder). The callback casts
+// Decoder to AMDGPUDisassembler, calls its decodeOperand_<DecName>(Imm) and
+// appends the result to Inst via addOperand. The callback names are presumably
+// the ones referenced by the generated decoder tables included below.
+#define DECODE_OPERAND2(RegClass, DecName) \
+static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \
+ unsigned Imm, \
+ uint64_t /*Addr*/, \
+ const void *Decoder) { \
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \
+ return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \
+}
+
+// Shorthand for the common case where the decode method is named after the
+// register class itself.
+#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass)
+
+// Vector and vector-or-scalar source operands.
+DECODE_OPERAND(VGPR_32)
+DECODE_OPERAND(VS_32)
+DECODE_OPERAND(VS_64)
+
+// Multi-dword vector register tuples.
+DECODE_OPERAND(VReg_64)
+DECODE_OPERAND(VReg_96)
+DECODE_OPERAND(VReg_128)
+
+// Scalar register classes.
+DECODE_OPERAND(SReg_32)
+DECODE_OPERAND(SReg_32_XM0_XEXEC)
+DECODE_OPERAND(SReg_64)
+DECODE_OPERAND(SReg_64_XEXEC)
+DECODE_OPERAND(SReg_128)
+DECODE_OPERAND(SReg_256)
+DECODE_OPERAND(SReg_512)
+
+
+/// Decoder callback for 16-bit VSrc operands: forwards \p Imm to
+/// AMDGPUDisassembler::decodeOperand_VSrc16 and appends the decoded operand
+/// to \p Inst. \p Addr is not used.
+static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
+                                         unsigned Imm,
+                                         uint64_t Addr,
+                                         const void *Decoder) {
+  const auto *Disasm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  const MCOperand Decoded = Disasm->decodeOperand_VSrc16(Imm);
+  return addOperand(Inst, Decoded);
+}
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
+
+#include "AMDGPUGenDisassemblerTables.inc"
+
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+/// Read a little-endian value of type T from the front of \p Bytes and advance
+/// \p Bytes past it. The caller must guarantee that at least sizeof(T) bytes
+/// are available (asserted).
+template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
+  assert(Bytes.size() >= sizeof(T));
+  const uint8_t *Front = Bytes.data();
+  // Dropping the prefix only narrows the view; Front stays readable.
+  Bytes = Bytes.slice(sizeof(T));
+  return support::endian::read<T, support::endianness::little>(Front);
+}
+
+/// Attempt to decode \p Inst with the decoder table \p Table.
+///
+/// Decoding happens into a scratch MCInst so that \p MI is only written when
+/// the attempt yields a non-zero status, in which case Success is returned.
+/// Otherwise the Bytes cursor is restored to its value on entry (operand
+/// decoding may have consumed literal bytes) and Fail is returned.
+DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
+                                               MCInst &MI,
+                                               uint64_t Inst,
+                                               uint64_t Address) const {
+  assert(MI.getOpcode() == 0);
+  assert(MI.getNumOperands() == 0);
+
+  const auto BytesOnEntry = Bytes;
+  MCInst Scratch;
+  if (!decodeInstruction(Table, Scratch, Inst, Address, this, STI)) {
+    Bytes = BytesOnEntry;
+    return MCDisassembler::Fail;
+  }
+
+  MI = Scratch;
+  return MCDisassembler::Success;
+}
+
+/// Decode one instruction from the start of \p Bytes_.
+///
+/// At most 8 bytes are examined. The decoder tables are tried in a fixed
+/// order: DPP64 and SDWA64 on a 64-bit word, then VI32 and AMDGPU32 on the
+/// first dword, then VI64 and AMDGPU64 on both dwords. The first table that
+/// yields a non-zero status wins.
+///
+/// \param MI      receives the decoded instruction.
+/// \param Size    set to the number of bytes consumed, or 0 when decoding
+///                yields a zero status.
+/// \param Address passed through to the table decoders.
+/// \param WS      unused.
+/// \param CS      stream that receives error/warning comments.
+DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes_,
+ uint64_t Address,
+ raw_ostream &WS,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+
+ // ToDo: AMDGPUDisassembler supports only VI ISA.
+ assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA.");
+
+ // Bytes is the member cursor that eatBytes() and the operand decoders
+ // advance; limit it to the longest encoding tried below.
+ const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size());
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+ DecodeStatus Res = MCDisassembler::Fail;
+ // Single-pass loop: 'break' is used to leave as soon as a table matches or
+ // the input runs out.
+ do {
+ // ToDo: better to switch encoding length using some bit predicate
+ // but it is unknown yet, so try all we can
+
+ // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
+ // encodings
+ if (Bytes.size() >= 8) {
+ const uint64_t QW = eatBytes<uint64_t>(Bytes);
+ Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
+ if (Res) break;
+ }
+
+ // Reinitialize Bytes as DPP64 could have eaten too much
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+ // Try to decode a 32-bit instruction
+ if (Bytes.size() < 4) break;
+ const uint32_t DW = eatBytes<uint32_t>(Bytes);
+ Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address);
+ if (Res) break;
+
+ // Try a 64-bit encoding: the second dword forms the high half and the
+ // dword read above stays in the low half.
+ if (Bytes.size() < 4) break;
+ const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
+ Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
+ } while (false);
+
+ // Whatever is left in Bytes was not consumed by the instruction.
+ Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
+ return Res;
+}
+
+/// Return the name that the context's register info reports for the register
+/// class with index \p RegClassID in AMDGPUMCRegisterClasses.
+const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
+  const auto *RegInfo = getContext().getRegisterInfo();
+  return RegInfo->getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
+}
+
+inline
+MCOperand AMDGPUDisassembler::errOperand(unsigned V,
+ const Twine& ErrMsg) const {
+ *CommentStream << "Error: " + ErrMsg;
+
+ // ToDo: add support for error operands to MCInst.h
+ // return MCOperand::createError(V);
+ return MCOperand();
+}
+
+/// Wrap the register id \p RegId in a register MCOperand.
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
+ return MCOperand::createReg(RegId);
+}
+
+/// Create a register operand for the \p Val-th register of the register class
+/// \p RegClassID. If \p Val is not a valid index into that class, an error
+/// naming the class and the index is reported through errOperand instead.
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
+                                               unsigned Val) const {
+  const auto& RegClass = AMDGPUMCRegisterClasses[RegClassID];
+  if (Val < RegClass.getNumRegs())
+    return createRegOperand(RegClass.getRegister(Val));
+
+  return errOperand(Val, Twine(getRegClassName(RegClassID)) +
+                         ": unknown register " + Twine(Val));
+}
+
+/// Create a register operand for a scalar register class from the encoded
+/// register number \p Val.
+///
+/// The class determines a shift: \p Val is expected to be a multiple of
+/// (1 << shift), and the index handed to createRegOperand is Val >> shift.
+/// A misaligned \p Val only produces a warning on the comment stream; decoding
+/// continues with the shifted index.
+inline
+MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
+ unsigned Val) const {
+ // ToDo: SI/CI have 104 SGPRs, VI - 102
+ // Valery: here we accept as much as we can and let the assembler sort it out
+ int shift = 0;
+ switch (SRegClassID) {
+ case AMDGPU::SGPR_32RegClassID:
+ case AMDGPU::TTMP_32RegClassID:
+ break;
+ case AMDGPU::SGPR_64RegClassID:
+ case AMDGPU::TTMP_64RegClassID:
+ shift = 1;
+ break;
+ // The 128-, 256- and 512-bit classes below deliberately share one body: all
+ // of them use shift = 2.
+ case AMDGPU::SGPR_128RegClassID:
+ case AMDGPU::TTMP_128RegClassID:
+ // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
+ // this bundle?
+ case AMDGPU::SReg_256RegClassID:
+ // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
+ // this bundle?
+ case AMDGPU::SReg_512RegClassID:
+ shift = 2;
+ break;
+ // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
+ // this bundle?
+ default:
+ llvm_unreachable("unhandled register class");
+ }
+
+ // Non-zero low bits mean the encoded number is not aligned for this class.
+ if (Val % (1 << shift)) {
+ *CommentStream << "Warning: " << getRegClassName(SRegClassID)
+ << ": scalar reg isn't aligned " << Val;
+ }
+
+ return createRegOperand(SRegClassID, Val >> shift);
+}
+
+// Per-register-class operand decoders. Each takes the encoded operand value
+// and returns the decoded MCOperand. Classes that can also encode inline
+// constants or literals go through decodeSrcOp with the matching width; the
+// rest map straight to a register of the class.
+
+/// 32-bit vector-or-scalar source: generic source decoding at 32-bit width.
+MCOperand AMDGPUDisassembler::decodeOperand_VS_32(unsigned Val) const {
+ return decodeSrcOp(OPW32, Val);
+}
+
+/// 64-bit vector-or-scalar source: generic source decoding at 64-bit width.
+MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
+ return decodeSrcOp(OPW64, Val);
+}
+
+/// 16-bit vector source: generic source decoding at 16-bit width.
+MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
+ return decodeSrcOp(OPW16, Val);
+}
+
+/// VGPR_32 operand: only the low 8 bits of \p Val select the register.
+MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
+ // Some instructions have operand restrictions beyond what the encoding
+ // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
+ // high bit.
+ Val &= 255;
+
+ return createRegOperand(AMDGPU::VGPR_32RegClassID, Val);
+}
+
+/// VReg_64 operand: \p Val indexes the VReg_64 register class directly.
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
+}
+
+/// VReg_96 operand: \p Val indexes the VReg_96 register class directly.
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_96(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_96RegClassID, Val);
+}
+
+/// VReg_128 operand: \p Val indexes the VReg_128 register class directly.
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_128RegClassID, Val);
+}
+
+/// SReg_32 operand: generic source decoding at 32-bit width.
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
+ // The TableGen-generated disassembler doesn't care about operand types and
+ // keeps only the register class, so an SSrc_32 operand turns into SReg_32;
+ // therefore we accept immediates and literals here as well.
+ return decodeSrcOp(OPW32, Val);
+}
+
+/// SReg_32_XM0_XEXEC operand: decoded exactly like SReg_32.
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0_XEXEC(
+ unsigned Val) const {
+ // SReg_32_XM0 is SReg_32 without M0 or EXEC_LO/EXEC_HI
+ return decodeOperand_SReg_32(Val);
+}
+
+/// SReg_64 operand: generic source decoding at 64-bit width.
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
+ return decodeSrcOp(OPW64, Val);
+}
+
+/// SReg_64_XEXEC operand: generic source decoding at 64-bit width.
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_64_XEXEC(unsigned Val) const {
+ return decodeSrcOp(OPW64, Val);
+}
+
+/// SReg_128 operand: generic source decoding at 128-bit width.
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const {
+ return decodeSrcOp(OPW128, Val);
+}
+
+/// SReg_256 operand: decoded as a scalar register of the SReg_256 class.
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
+ return createSRegOperand(AMDGPU::SReg_256RegClassID, Val);
+}
+
+/// SReg_512 operand: decoded as a scalar register of the SReg_512 class.
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
+ return createSRegOperand(AMDGPU::SReg_512RegClassID, Val);
+}
+
+
+/// Consume the next 4 bytes of the instruction stream as a little-endian
+/// 32-bit literal and return it as an immediate operand. If fewer than 4 bytes
+/// remain, nothing is consumed and an error operand is returned.
+MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
+  // For now all literal constants are supposed to be unsigned integer
+  // ToDo: deal with signed/unsigned 64-bit integer constants
+  // ToDo: deal with float/double constants
+  if (Bytes.size() >= 4)
+    return MCOperand::createImm(eatBytes<uint32_t>(Bytes));
+
+  return errOperand(0, "cannot read literal, inst bytes left " +
+                       Twine(Bytes.size()));
+}
+
+/// Decode an inline integer constant.
+///
+/// Encodings up to INLINE_INTEGER_C_POSITIVE_MAX decode to
+/// Imm - INLINE_INTEGER_C_MIN (non-negative); larger encodings decode to
+/// INLINE_INTEGER_C_POSITIVE_MAX - Imm (negative). \p Imm must lie within
+/// [INLINE_INTEGER_C_MIN, INLINE_INTEGER_C_MAX] (asserted).
+MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
+  using namespace AMDGPU::EncValues;
+  assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
+
+  // Do the arithmetic in a signed 64-bit type so that the negative branch
+  // does not wrap around.
+  const int64_t Enc = static_cast<int64_t>(Imm);
+  if (Imm <= INLINE_INTEGER_C_POSITIVE_MAX)
+    return MCOperand::createImm(Enc - INLINE_INTEGER_C_MIN);
+  return MCOperand::createImm(INLINE_INTEGER_C_POSITIVE_MAX - Enc);
+}
+
+/// Return the 32-bit float bit pattern for the inline floating-point constant
+/// encoded as \p Imm (240..248). Any other value is unreachable.
+static int64_t getInlineImmVal32(unsigned Imm) {
+  // Encodings 240..247 alternate +x / -x for x = 0.5, 1, 2, 4.
+  static const float Values[] = {0.5f, -0.5f, 1.0f, -1.0f,
+                                 2.0f, -2.0f, 4.0f, -4.0f};
+  if (Imm >= 240 && Imm <= 247)
+    return FloatToBits(Values[Imm - 240]);
+  if (Imm == 248) // 1 / (2 * PI)
+    return 0x3e22f983;
+  llvm_unreachable("invalid fp inline imm");
+}
+
+/// Return the 64-bit double bit pattern for the inline floating-point constant
+/// encoded as \p Imm (240..248). Any other value is unreachable.
+static int64_t getInlineImmVal64(unsigned Imm) {
+  // Encodings 240..247 alternate +x / -x for x = 0.5, 1, 2, 4.
+  static const double Values[] = {0.5, -0.5, 1.0, -1.0,
+                                  2.0, -2.0, 4.0, -4.0};
+  if (Imm >= 240 && Imm <= 247)
+    return DoubleToBits(Values[Imm - 240]);
+  if (Imm == 248) // 1 / (2 * PI)
+    return 0x3fc45f306dc9c882;
+  llvm_unreachable("invalid fp inline imm");
+}
+
+/// Return the 16-bit bit pattern for the inline floating-point constant
+/// encoded as \p Imm (240..248). Any other value is unreachable.
+static int64_t getInlineImmVal16(unsigned Imm) {
+  // Indexed by Imm - 240; the last entry (248) is 1 / (2 * PI).
+  static const int64_t Bits[] = {0x3800, 0xB800, 0x3C00, 0xBC00, 0x4000,
+                                 0xC000, 0x4400, 0xC400, 0x3118};
+  if (Imm >= 240 && Imm <= 248)
+    return Bits[Imm - 240];
+  llvm_unreachable("invalid fp inline imm");
+}
+
+/// Decode an inline floating-point constant for an operand of width \p Width.
+///
+/// \p Imm must lie within [INLINE_FLOATING_C_MIN, INLINE_FLOATING_C_MAX]
+/// (asserted). Only the 16-, 32- and 64-bit widths are implemented; any other
+/// width is unreachable.
+MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
+  assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
+      && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
+
+  // ToDo: case 248: 1/(2*PI) - is allowed only on VI
+  if (Width == OPW32)
+    return MCOperand::createImm(getInlineImmVal32(Imm));
+  if (Width == OPW64)
+    return MCOperand::createImm(getInlineImmVal64(Imm));
+  if (Width == OPW16)
+    return MCOperand::createImm(getInlineImmVal16(Imm));
+
+  llvm_unreachable("implement me");
+}
+
+/// Map an operand width to the VGPR register class id used to decode it.
+/// 64- and 128-bit widths select VReg_64 / VReg_128; every other width
+/// (including OPW32 and OPW16) selects VGPR_32.
+unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
+  using namespace AMDGPU;
+  assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+
+  if (Width == OPW64)
+    return VReg_64RegClassID;
+  if (Width == OPW128)
+    return VReg_128RegClassID;
+  return VGPR_32RegClassID;
+}
+
+/// Map an operand width to the SGPR register class id used to decode it.
+/// 64- and 128-bit widths select SGPR_64 / SGPR_128; every other width
+/// (including OPW32 and OPW16) selects SGPR_32.
+unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
+  using namespace AMDGPU;
+  assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+
+  if (Width == OPW64)
+    return SGPR_64RegClassID;
+  if (Width == OPW128)
+    return SGPR_128RegClassID;
+  return SGPR_32RegClassID;
+}
+
+/// Map an operand width to the TTMP register class id used to decode it.
+/// 64- and 128-bit widths select TTMP_64 / TTMP_128; every other width
+/// (including OPW32 and OPW16) selects TTMP_32.
+unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
+  using namespace AMDGPU;
+  assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+
+  if (Width == OPW64)
+    return TTMP_64RegClassID;
+  if (Width == OPW128)
+    return TTMP_128RegClassID;
+  return TTMP_32RegClassID;
+}
+
+/// Decode a generic source operand encoded in \p Val (must be below 512) for
+/// an operand of width \p Width.
+///
+/// The encoding ranges are tested in this order: VGPRs, SGPRs, TTMPs, inline
+/// integer constants, inline floating-point constants, the literal-constant
+/// marker, and finally the special registers. Registers are accepted for every
+/// width handled by the get*ClassId helpers; constants and special registers
+/// only for the 16-, 32- and 64-bit widths (asserted).
+MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const {
+ using namespace AMDGPU::EncValues;
+ assert(Val < 512); // enum9
+
+ // Register ranges: the register index is the offset from the range start.
+ if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
+ return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN);
+ }
+ if (Val <= SGPR_MAX) {
+ assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
+ return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
+ }
+ if (TTMP_MIN <= Val && Val <= TTMP_MAX) {
+ return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
+ }
+
+ // Everything below is a constant or a special register, which this function
+ // does not handle for 128-bit operands.
+ assert(Width == OPW16 || Width == OPW32 || Width == OPW64);
+
+ if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
+ return decodeIntImmed(Val);
+
+ if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
+ return decodeFPImmed(Width, Val);
+
+ // The literal value itself follows in the instruction stream.
+ if (Val == LITERAL_CONST)
+ return decodeLiteralConstant();
+
+ // Remaining encodings name special registers; 16-bit operands use the 32-bit
+ // register names.
+ switch (Width) {
+ case OPW32:
+ case OPW16:
+ return decodeSpecialReg32(Val);
+ case OPW64:
+ return decodeSpecialReg64(Val);
+ default:
+ llvm_unreachable("unexpected immediate type");
+ }
+}
+
+/// Decode the encoding \p Val of a 32-bit special register into a register
+/// operand. Encodings without a supported register yield an error operand.
+MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
+  using namespace AMDGPU;
+
+  unsigned Reg;
+  switch (Val) {
+  case 102: Reg = getMCReg(FLAT_SCR_LO, STI); break;
+  case 103: Reg = getMCReg(FLAT_SCR_HI, STI); break;
+  case 106: Reg = VCC_LO; break;
+  case 107: Reg = VCC_HI; break;
+  case 108: Reg = TBA_LO; break;
+  case 109: Reg = TBA_HI; break;
+  case 110: Reg = TMA_LO; break;
+  case 111: Reg = TMA_HI; break;
+  case 124: Reg = M0; break;
+  case 126: Reg = EXEC_LO; break;
+  case 127: Reg = EXEC_HI; break;
+  case 253: Reg = SCC; break;
+  default:
+    // Also reached for encodings that are known but not supported yet:
+    //   ToDo: 104, 105 - no support for xnack_mask_lo/_hi register
+    //   ToDo: 251 - no support for vccz register
+    //   ToDo: 252 - no support for execz register
+    return errOperand(Val, "unknown operand encoding " + Twine(Val));
+  }
+  return createRegOperand(Reg);
+}
+
+/// Decode the encoding \p Val of a 64-bit special register into a register
+/// operand. Encodings without a supported register yield an error operand.
+MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
+  using namespace AMDGPU;
+
+  unsigned Reg;
+  switch (Val) {
+  case 102: Reg = getMCReg(FLAT_SCR, STI); break;
+  case 106: Reg = VCC; break;
+  case 108: Reg = TBA; break;
+  case 110: Reg = TMA; break;
+  case 126: Reg = EXEC; break;
+  default:
+    return errOperand(Val, "unknown operand encoding " + Twine(Val));
+  }
+  return createRegOperand(Reg);
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPUSymbolizer
+//===----------------------------------------------------------------------===//
+
+/// Try to replace a branch target by a reference to a symbol.
+///
+/// DisInfo is interpreted as a pointer to a vector of
+/// (address, name, ELF symbol type) tuples. The first entry whose address
+/// equals \p Value and whose type is ELF::STT_NOTYPE is used.
+///
+/// \param Inst     instruction that receives the symbolic operand.
+/// \param Value    absolute address of the branch target.
+/// \param IsBranch only branch operands are symbolized.
+/// \returns true if an expression operand was appended to \p Inst; false if
+///          the operand is not a branch, no symbol table was supplied, or no
+///          matching symbol exists (nothing is appended in those cases).
+bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
+                                raw_ostream &/*cStream*/, int64_t Value,
+                                uint64_t /*Address*/, bool IsBranch,
+                                uint64_t /*Offset*/, uint64_t /*InstSize*/) {
+  typedef std::tuple<uint64_t, StringRef, uint8_t> SymbolInfoTy;
+  typedef std::vector<SymbolInfoTy> SectionSymbolsTy;
+
+  if (!IsBranch) {
+    return false;
+  }
+
+  auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
+  // A client may create the symbolizer without any symbol information; treat
+  // that as "no symbol found" instead of dereferencing a null pointer.
+  if (!Symbols)
+    return false;
+
+  auto Result = std::find_if(Symbols->begin(), Symbols->end(),
+                             [Value](const SymbolInfoTy& Val) {
+                                return std::get<0>(Val) == static_cast<uint64_t>(Value)
+                                    && std::get<2>(Val) == ELF::STT_NOTYPE;
+                             });
+  if (Result != Symbols->end()) {
+    auto *Sym = Ctx.getOrCreateSymbol(std::get<1>(*Result));
+    const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
+    Inst.addOperand(MCOperand::createExpr(Add));
+    return true;
+  }
+  return false;
+}
+
+/// Not implemented for AMDGPU: the body is an llvm_unreachable, so this hook
+/// must not be called.
+void AMDGPUSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+ int64_t Value,
+ uint64_t Address) {
+ llvm_unreachable("unimplemented");
+}
+
+//===----------------------------------------------------------------------===//
+// Initialization
+//===----------------------------------------------------------------------===//
+
+/// Factory registered with the TargetRegistry: creates an AMDGPUSymbolizer
+/// from the context, the relocation info and the opaque \p DisInfo pointer.
+/// The triple and the two callbacks are ignored. The caller owns the result.
+static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/,
+ LLVMOpInfoCallback /*GetOpInfo*/,
+ LLVMSymbolLookupCallback /*SymbolLookUp*/,
+ void *DisInfo,
+ MCContext *Ctx,
+ std::unique_ptr<MCRelocationInfo> &&RelInfo) {
+ return new AMDGPUSymbolizer(*Ctx, std::move(RelInfo), DisInfo);
+}
+
+/// Factory registered with the TargetRegistry: creates an AMDGPUDisassembler
+/// for the given subtarget and context. \p T is unused. The caller owns the
+/// result.
+static MCDisassembler *createAMDGPUDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new AMDGPUDisassembler(STI, Ctx);
+}
+
+/// Register the disassembler and symbolizer factories for the GCN target.
+extern "C" void LLVMInitializeAMDGPUDisassembler() {
+ TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
+ createAMDGPUDisassembler);
+ TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
+ createAMDGPUSymbolizer);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
new file mode 100644
index 000000000000..ee5883a984e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -0,0 +1,130 @@
+//===-- AMDGPUDisassembler.h - Disassembler for AMDGPU ISA -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file contains the declaration of the AMDGPU ISA disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
+#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
+#include <cstdint>
+#include <algorithm>
+#include <memory>
+
+namespace llvm {
+
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSubtargetInfo;
+class Twine;
+
+//===----------------------------------------------------------------------===//
+// AMDGPUDisassembler
+//===----------------------------------------------------------------------===//
+
+/// MCDisassembler implementation for the AMDGPU ISA.
+///
+/// getInstruction() is the entry point. The remaining public methods are
+/// helpers for operand decoding; they take the raw encoded operand value and
+/// return the MCOperand to append to the instruction.
+class AMDGPUDisassembler : public MCDisassembler {
+private:
+ // Bytes of the current instruction that have not been consumed yet. Mutable
+ // because the const decoding methods advance it while reading.
+ mutable ArrayRef<uint8_t> Bytes;
+
+public:
+ AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+ MCDisassembler(STI, Ctx) {}
+
+ ~AMDGPUDisassembler() override = default;
+
+ /// Decode one instruction from \p Bytes into \p MI and report the number of
+ /// bytes it occupies in \p Size.
+ DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &WS, raw_ostream &CS) const override;
+
+ /// Name of the register class with id \p RegClassID.
+ const char* getRegClassName(unsigned RegClassID) const;
+
+ /// Register operand for register id \p RegId.
+ MCOperand createRegOperand(unsigned int RegId) const;
+ /// Register operand for the \p Val-th register of class \p RegClassID.
+ MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const;
+ /// Register operand for a scalar register class from the encoded register
+ /// number \p Val.
+ MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const;
+
+ /// Report \p ErrMsg on the comment stream and return the operand that stands
+ /// for a decoding error.
+ MCOperand errOperand(unsigned V, const Twine& ErrMsg) const;
+
+ /// Try to decode \p Inst with the decoder table \p Table; \p MI is written
+ /// only if the attempt succeeds.
+ DecodeStatus tryDecodeInst(const uint8_t* Table,
+ MCInst &MI,
+ uint64_t Inst,
+ uint64_t Address) const;
+
+ // Per-register-class operand decoders.
+ MCOperand decodeOperand_VGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_VS_32(unsigned Val) const;
+ MCOperand decodeOperand_VS_64(unsigned Val) const;
+ MCOperand decodeOperand_VSrc16(unsigned Val) const;
+
+ MCOperand decodeOperand_VReg_64(unsigned Val) const;
+ MCOperand decodeOperand_VReg_96(unsigned Val) const;
+ MCOperand decodeOperand_VReg_128(unsigned Val) const;
+
+ MCOperand decodeOperand_SReg_32(unsigned Val) const;
+ MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
+ MCOperand decodeOperand_SReg_64(unsigned Val) const;
+ MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const;
+ MCOperand decodeOperand_SReg_128(unsigned Val) const;
+ MCOperand decodeOperand_SReg_256(unsigned Val) const;
+ MCOperand decodeOperand_SReg_512(unsigned Val) const;
+
+ /// Operand widths understood by the generic source-operand decoder.
+ /// OPW_FIRST_ and OPW_LAST_ bound the valid values (OPW_LAST_ is exclusive).
+ enum OpWidthTy {
+ OPW32,
+ OPW64,
+ OPW128,
+ OPW16,
+ OPW_LAST_,
+ OPW_FIRST_ = OPW32
+ };
+
+ // Register class ids to use for an operand of the given width.
+ unsigned getVgprClassId(const OpWidthTy Width) const;
+ unsigned getSgprClassId(const OpWidthTy Width) const;
+ unsigned getTtmpClassId(const OpWidthTy Width) const;
+
+ /// Decode an inline integer constant encoding.
+ static MCOperand decodeIntImmed(unsigned Imm);
+ /// Decode an inline floating-point constant encoding for the given width.
+ static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm);
+ /// Read a 32-bit literal constant from the remaining instruction bytes.
+ MCOperand decodeLiteralConstant() const;
+
+ /// Decode a generic source operand (register, inline constant, literal or
+ /// special register) of the given width.
+ MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
+ /// Decode a 32-bit special register encoding.
+ MCOperand decodeSpecialReg32(unsigned Val) const;
+ /// Decode a 64-bit special register encoding.
+ MCOperand decodeSpecialReg64(unsigned Val) const;
+};
+
+//===----------------------------------------------------------------------===//
+// AMDGPUSymbolizer
+//===----------------------------------------------------------------------===//
+
+/// MCSymbolizer implementation for AMDGPU.
+class AMDGPUSymbolizer : public MCSymbolizer {
+private:
+ // Opaque pointer supplied by the client at construction; not owned.
+ void *DisInfo;
+
+public:
+ AMDGPUSymbolizer(MCContext &Ctx, std::unique_ptr<MCRelocationInfo> &&RelInfo,
+ void *disInfo)
+ : MCSymbolizer(Ctx, std::move(RelInfo)), DisInfo(disInfo) {}
+
+ /// Try to append a symbolic operand for \p Value to \p Inst.
+ /// \returns true if an operand was added.
+ bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &cStream,
+ int64_t Value, uint64_t Address,
+ bool IsBranch, uint64_t Offset,
+ uint64_t InstSize) override;
+
+ /// PC-relative load reference comments hook required by MCSymbolizer.
+ void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+ int64_t Value,
+ uint64_t Address) override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
new file mode 100644
index 000000000000..4112ad100584
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -0,0 +1,663 @@
+//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are:
+// - Available to Evergreen and newer VLIW4/VLIW5 GPUs
+// - Available only on Evergreen family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isEG : Predicate< // EVERGREEN <= generation < SOUTHERN_ISLANDS, excluding subtargets with the Cayman ISA
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
+ "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "!Subtarget->hasCaymanISA()"
+>;
+
+def isEGorCayman : Predicate< // generation is exactly EVERGREEN or exactly NORTHERN_ISLANDS
+ "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||"
+ "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS"
+>;
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman store instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
+ string name, list<dag> pattern>
+ : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
+ "MEM_RAT_CACHELESS "#name, pattern>;
+
+class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
+ list<dag> pattern>
+ : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
+ "MEM_RAT "#name, pattern>;
+
+class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop>
+ : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, // rat_id template arg left unset (?); $rat_id is an operand instead
+ i32imm:$rat_id, InstFlag:$eop),
+ "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr"
+ #!if(has_eop, ", $eop", ""), // $eop is appended to the asm string only when has_eop is set
+ [(int_r600_rat_store_typed R600_Reg128:$rw_gpr,
+ R600_Reg128:$index_gpr,
+ (i32 imm:$rat_id))]>;
+
+def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+ "MSKOR $rw_gpr.XW, $index_gpr",
+ [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
+> {
+ let eop = 0;
+}
+
+} // End let Predicates = [isEGorCayman]
+
+//===----------------------------------------------------------------------===//
+// Evergreen Only instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEG] in {
+
+def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
+defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
+
+def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
+def MULHI_INT_eg : MULHI_INT_Common<0x90>;
+def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
+def MULHI_UINT24_eg : MULHI_UINT24_Common<0xb2>;
+
+def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
+def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
+def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
+def SIN_eg : SIN_Common<0x8D>;
+def COS_eg : COS_Common<0x8E>;
+
+def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
+def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
+
+//===----------------------------------------------------------------------===//
+// Memory read/write instructions
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1 in {
+
+// 32-bit store
+def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1,
+ (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+ "STORE_RAW $rw_gpr, $index_gpr, $eop",
+ [(global_store i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3,
+ (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+ "STORE_RAW $rw_gpr.XY, $index_gpr, $eop",
+ [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+// 128-bit store
+def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+ "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop",
+ [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>;
+
+} // End usesCustomInserter = 1
+
+class VTX_READ_eg <string name, dag outs>
+ : VTX_WORD0_eg, VTX_READ<name, outs, []> { // empty list as the third VTX_READ argument; the Pat<> defs below select these instructions
+
+ // Static fields
+ let VC_INST = 0;
+ let FETCH_TYPE = 2;
+ let FETCH_WHOLE_QUAD = 0;
+ let SRC_REL = 0;
+ // XXX: We can infer this field based on the SRC_GPR. This would allow us
+ // to store vertex addresses in any channel, not just X.
+ let SRC_SEL_X = 0;
+
+ let Inst{31-0} = Word0; // only the low 32 encoding bits are assigned in this class
+}
+
+def VTX_READ_8_eg
+ : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
+
+ let MEGA_FETCH_COUNT = 1;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 1; // FMT_8
+}
+
+def VTX_READ_16_eg
+ : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
+ let MEGA_FETCH_COUNT = 2;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 5; // FMT_16
+
+}
+
+def VTX_READ_32_eg
+ : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
+
+ let MEGA_FETCH_COUNT = 4;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 0xD; // COLOR_32
+
+ // This is not strictly necessary, but there were some GPU hangs that appeared
+ // to be caused by ALU instructions in the next instruction group that wrote
+ // to the $src_gpr registers of the VTX_READ.
+ // e.g.
+ // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+ // %T2_X<def> = MOV %ZERO
+ // Tying $src_gpr.ptr to $dst_gpr with this constraint prevents that.
+ let Constraints = "$src_gpr.ptr = $dst_gpr";
+}
+
+def VTX_READ_64_eg
+ : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr",
+ (outs R600_Reg64:$dst_gpr)> {
+
+ let MEGA_FETCH_COUNT = 8;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
+def VTX_READ_128_eg
+ : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr",
+ (outs R600_Reg128:$dst_gpr)> {
+
+ let MEGA_FETCH_COUNT = 16;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 2;
+ let DST_SEL_W = 3;
+ let DATA_FORMAT = 0x22; // COLOR_32_32_32_32
+
+ // XXX: Need to force VTX_READ_128 instructions to write to the same register
+ // that holds their buffer address to avoid potential hangs. We can't use
+ // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and
+ // $dst_gpr registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_8_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_eg MEMxi:$src_gpr, 3)>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from constant memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_8_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_eg MEMxi:$src_gpr, 2)>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_8_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_eg MEMxi:$src_gpr, 1)>;
+
+} // End Predicates = [isEG]
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+// Should be predicated on FeatureFP64
+// def FMA_64 : R600_3OP <
+// 0xA, "FMA_64",
+// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
+// >;
+
+// BFE_UINT - bit_extract, an optimization for mask and shift
+// Src0 = Input
+// Src1 = Offset
+// Src2 = Width
+//
+// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width)
+//
+// Example Usage:
+// (Offset, Width)
+//
+// (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0
+// (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8
+// (16, 8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16
+// (24, 8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24
+def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
+ [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))],
+ VecALU
+>;
+
+def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
+ [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))],
+ VecALU
+>;
+
+def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>;
+
+def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
+ [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
+ VecALU
+>;
+
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+ (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>;
+def : Pat<(i32 (sext_inreg i32:$src, i8)),
+ (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>;
+def : Pat<(i32 (sext_inreg i32:$src, i16)),
+ (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
+
+defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>;
+
+def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
+ [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))],
+ VecALU
+>;
+
+def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
+ [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU
+>;
+
+def : UMad24Pat<MULADD_UINT24_eg>;
+
+def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
+def : ROTRPattern <BIT_ALIGN_INT_eg>;
+def MULADD_eg : MULADD_Common<0x14>;
+def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
+def FMA_eg : FMA_Common<0x7>;
+def ASHR_eg : ASHR_Common<0x15>;
+def LSHR_eg : LSHR_Common<0x16>;
+def LSHL_eg : LSHL_Common<0x17>;
+def CNDE_eg : CNDE_Common<0x19>;
+def CNDGT_eg : CNDGT_Common<0x1A>;
+def CNDGE_eg : CNDGE_Common<0x1B>;
+def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
+def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
+def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
+ [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU
+>;
+def DOT4_eg : DOT4_Common<0xBE>;
+defm CUBE_eg : CUBE_Common<0xC0>;
+
+def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
+
+def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
+def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
+
+def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
+def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
+
+let hasSideEffects = 1 in {
+ def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
+}
+
+def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
+ let Pattern = [];
+ let Itinerary = AnyALU;
+}
+
+def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
+
+def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
+ let Pattern = [];
+}
+
+def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
+
+def GROUP_BARRIER : InstR600 <
+ (outs), (ins), " GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>,
+ R600ALU_Word0,
+ R600ALU_Word1_OP2 <0x54> {
+
+ let dst = 0;
+ let dst_rel = 0;
+ let src0 = 0;
+ let src0_rel = 0;
+ let src0_neg = 0;
+ let src0_abs = 0;
+ let src1 = 0;
+ let src1_rel = 0;
+ let src1_neg = 0;
+ let src1_abs = 0;
+ let write = 0;
+ let omod = 0;
+ let clamp = 0;
+ let last = 1;
+ let bank_swizzle = 0;
+ let pred_sel = 0;
+ let update_exec_mask = 0;
+ let update_pred = 0;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+
+ let ALUInst = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// LDS Instructions
+//===----------------------------------------------------------------------===//
+class R600_LDS <bits<6> op, dag outs, dag ins, string asm,
+ list<dag> pattern = []> :
+
+ InstR600 <outs, ins, asm, pattern, XALU>,
+ R600_ALU_LDS_Word0,
+ R600LDS_Word1 {
+
+ bits<6> offset = 0; // 6-bit field, default 0; its bits are scattered over Word1 and Word0 below
+ let lds_op = op;
+
+ let Word1{27} = offset{0};
+ let Word1{12} = offset{1};
+ let Word1{28} = offset{2};
+ let Word1{31} = offset{3};
+ let Word0{12} = offset{4};
+ let Word0{25} = offset{5};
+
+
+ let Inst{31-0} = Word0; // 64-bit encoding: Word0 is the low half, Word1 the high half
+ let Inst{63-32} = Word1;
+
+ let ALUInst = 1;
+ let HasNativeOperands = 1;
+ let UseNamedOperandTable = 1;
+}
+
+class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
+ lds_op,
+ (outs R600_Reg32:$dst),
+ (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+ LAST:$last, R600_Pred:$pred_sel,
+ BANK_SWIZZLE:$bank_swizzle),
+ " "#name#" $last OQAP, $src0$src0_rel $pred_sel",
+ pattern
+ > {
+
+ let src1 = 0;
+ let src1_rel = 0;
+ let src2 = 0;
+ let src2_rel = 0;
+
+ let usesCustomInserter = 1;
+ let LDS_1A = 1;
+ let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+ string dst =""> :
+ R600_LDS <
+ lds_op, outs,
+ (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+ R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+ LAST:$last, R600_Pred:$pred_sel,
+ BANK_SWIZZLE:$bank_swizzle),
+ " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel",
+ pattern
+ > {
+
+ field string BaseOp;
+
+ let src2 = 0;
+ let src2_rel = 0;
+ let LDS_1A1D = 1;
+}
+
+class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A1D <lds_op, (outs), name, pattern> {
+ let BaseOp = name;
+}
+
+class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+
+ let BaseOp = name;
+ let usesCustomInserter = 1;
+ let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+ string dst =""> :
+ R600_LDS <
+ lds_op, outs,
+ (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+ R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+ R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
+ LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
+ " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", // NOTE(review): name runs straight into $last here, while R600_LDS_1A1D puts a space between them - confirm intended
+ pattern> {
+
+ field string BaseOp;
+
+ let LDS_1A1D = 0;
+ let LDS_1A2D = 1;
+}
+
+class R600_LDS_1A2D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A2D <lds_op, (outs), name, pattern> {
+ let BaseOp = name;
+}
+
+class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A2D <lds_op, (outs R600_Reg32:$dst), name, pattern> { // NOTE(review): R600_LDS_1A1D_RET appends "_RET" to name and passes "OQAP, " as dst; this class does neither - confirm intended
+
+ let BaseOp = name;
+ let usesCustomInserter = 1;
+ let DisableEncoding = "$dst";
+}
+
+def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
+def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
+def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >;
+def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >;
+def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >;
+def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >;
+def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >;
+def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >;
+def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >;
+def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >;
+def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >;
+def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
+ [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+>;
+def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
+ [(truncstorei8_local i32:$src1, i32:$src0)]
+>;
+def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
+ [(truncstorei16_local i32:$src1, i32:$src0)]
+>;
+def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
+ [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+>;
+def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
+ [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+>;
+def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND",
+ [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))]
+>;
+def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR",
+ [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))]
+>;
+def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR",
+ [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))]
+>;
+def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT",
+ [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))]
+>;
+def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT",
+ [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))]
+>;
+def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT",
+ [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))]
+>;
+def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT",
+ [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))]
+>;
+def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
+ [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))]
+>;
+def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
+ [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))]
+>;
+def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
+ [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
+>;
+def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET",
+ [(set i32:$dst, (sextloadi8_local i32:$src0))]
+>;
+def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET",
+ [(set i32:$dst, (az_extloadi8_local i32:$src0))]
+>;
+def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET",
+ [(set i32:$dst, (sextloadi16_local i32:$src0))]
+>;
+def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
+ [(set i32:$dst, (az_extloadi16_local i32:$src0))]
+>;
+
+// TRUNC is used for the FLT_TO_INT instructions to work around a
+// perceived problem where the rounding modes are applied differently
+// depending on the instruction and the slot they are in.
+// See:
+// https://bugs.freedesktop.org/show_bug.cgi?id=50232
+// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c
+//
+// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
+// which do not need to be truncated since the fp values are 0.0f or 1.0f.
+// We should look into handling these cases separately.
+def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
+
+def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
+
+// SHA-256 Patterns
+def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+
+def EG_ExportSwz : ExportSwzInst {
+ let Word1{19-16} = 0; // BURST_COUNT
+ let Word1{20} = 0; // VALID_PIXEL_MODE
+ let Word1{21} = eop;
+ let Word1{29-22} = inst;
+ let Word1{30} = 0; // MARK
+ let Word1{31} = 1; // BARRIER
+}
+defm : ExportPattern<EG_ExportSwz, 83>;
+
+def EG_ExportBuf : ExportBufInst {
+ let Word1{19-16} = 0; // BURST_COUNT
+ let Word1{20} = 0; // VALID_PIXEL_MODE
+ let Word1{21} = eop;
+ let Word1{29-22} = inst;
+ let Word1{30} = 0; // MARK
+ let Word1{31} = 1; // BARRIER
+}
+defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
+
+def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT),
+ "TEX $COUNT @$ADDR"> {
+ let POP_COUNT = 0;
+}
+def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT),
+ "VTX $COUNT @$ADDR"> {
+ let POP_COUNT = 0;
+}
+def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR),
+ "LOOP_START_DX10 @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+}
+def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+}
+def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR),
+ "LOOP_BREAK @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+}
+def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR),
+ "CONTINUE @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+}
+def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "JUMP @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+}
+def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "PUSH @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+}
+def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "ELSE @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+}
+def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> {
+ let ADDR = 0;
+ let COUNT = 0;
+ let POP_COUNT = 0;
+}
+def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "POP @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+}
+def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> {
+ let COUNT = 0;
+ let POP_COUNT = 0;
+ let ADDR = 0;
+ let END_OF_PROGRAM = 1;
+}
+
+} // End Predicates = [isEGorCayman]
diff --git a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
new file mode 100644
index 000000000000..849fb8ad50f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -0,0 +1,530 @@
+//===-- FLATInstructions.td - FLAT Instruction Definitions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
+
+//===----------------------------------------------------------------------===//
+// FLAT classes
+//===----------------------------------------------------------------------===//
+
+class FLAT_Pseudo<string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>, // empty asm string; FLAT_Real assembles it from Mnemonic # AsmOperands
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+
+ let SubtargetPredicate = isCIVI;
+
+ let FLAT = 1;
+ // Internally, FLAT instructions are executed as both an LDS and a
+ // Buffer instruction, so they increment both VM_CNT and LGKM_CNT
+ // and are not considered done until both have been decremented.
+ let VM_CNT = 1;
+ let LGKM_CNT = 1;
+
+ let Uses = [EXEC, FLAT_SCR]; // M0
+
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0;
+ let SchedRW = [WriteVMEM];
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ bits<1> has_vdst = 1; // read by FLAT_Real: when 0 the vdst encoding bits are left unset
+ bits<1> has_data = 1; // read by FLAT_Real: when 0 the vdata encoding bits are left unset
+ bits<1> has_glc = 1; // read by FLAT_Real: when 0 the glc bit is fixed to glcValue
+ bits<1> glcValue = 0;
+}
+
+class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ Enc64 {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+
+ // encoding fields
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<8> vdst;
+ bits<1> slc;
+ bits<1> glc;
+ bits<1> tfe;
+
+ // Bits 15-0 are reserved.
+ let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); // glc operand if the pseudo has one, otherwise its fixed glcValue
+ let Inst{17} = slc;
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x37; // Encoding.
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = !if(ps.has_data, vdata, ?); // left unset (?) when the pseudo has no data operand
+ // Bits 54-48 are reserved.
+ let Inst{55} = tfe;
+ let Inst{63-56} = !if(ps.has_vdst, vdst, ?); // left unset (?) when the pseudo has no vdst
+}
+
+class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
+ opName,
+ (outs regClass:$vdst),
+ (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe),
+ " $vdst, $vaddr$glc$slc$tfe"> {
+ let has_data = 0;
+ let mayLoad = 1;
+}
+
+class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo<
+ opName,
+ (outs),
+ (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe),
+ " $vaddr, $vdata$glc$slc$tfe"> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let has_vdst = 0;
+}
+
+multiclass FLAT_Atomic_Pseudo<
+ string opName,
+ RegisterClass vdst_rc,
+ ValueType vt,
+ SDPatternOperator atomic = null_frag,
+ ValueType data_vt = vt,
+ RegisterClass data_rc = vdst_rc> {
+
+ def "" : FLAT_Pseudo <opName, // non-returning form: no outs, no selection pattern
+ (outs),
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
+ " $vaddr, $vdata$slc$tfe",
+ []>,
+ AtomicNoRet <NAME, 0> {
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_glc = 0; // no glc operand; the glc bit is fixed to glcValue (0)
+ let glcValue = 0;
+ let has_vdst = 0;
+ let PseudoInstr = NAME;
+ }
+
+ def _RTN : FLAT_Pseudo <opName, // returning form: writes $vdst and carries the selection pattern
+ (outs vdst_rc:$vdst),
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
+ " $vdst, $vaddr, $vdata glc$slc$tfe", // "glc" is literal text here, not an operand
+ [(set vt:$vdst,
+ (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>,
+ AtomicNoRet <NAME, 1> {
+ let mayLoad = 1;
+ let mayStore = 1;
+ let hasPostISelHook = 1;
+ let has_glc = 0; // no glc operand; the glc bit is fixed to glcValue (1)
+ let glcValue = 1;
+ let PseudoInstr = NAME # "_RTN";
+ }
+}
+
+class flat_binary_atomic_op<SDNode atomic_op> : PatFrag< // wraps atomic_op(ptr, value) with an address-space check
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}] // matches only nodes in the FLAT address space
+>;
+
+def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+def atomic_swap_flat : flat_binary_atomic_op<atomic_swap>;
+def atomic_add_flat : flat_binary_atomic_op<atomic_load_add>;
+def atomic_and_flat : flat_binary_atomic_op<atomic_load_and>;
+def atomic_max_flat : flat_binary_atomic_op<atomic_load_max>;
+def atomic_min_flat : flat_binary_atomic_op<atomic_load_min>;
+def atomic_or_flat : flat_binary_atomic_op<atomic_load_or>;
+def atomic_sub_flat : flat_binary_atomic_op<atomic_load_sub>;
+def atomic_umax_flat : flat_binary_atomic_op<atomic_load_umax>;
+def atomic_umin_flat : flat_binary_atomic_op<atomic_load_umin>;
+def atomic_xor_flat : flat_binary_atomic_op<atomic_load_xor>;
+def atomic_inc_flat : flat_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_flat : flat_binary_atomic_op<SIatomic_dec>;
+
+
+
+//===----------------------------------------------------------------------===//
+// Flat Instructions
+//===----------------------------------------------------------------------===//
+
+def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
+def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
+def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>;
+def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>;
+def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>;
+def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
+def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
+def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
+
+def FLAT_STORE_BYTE : FLAT_Store_Pseudo <"flat_store_byte", VGPR_32>;
+def FLAT_STORE_SHORT : FLAT_Store_Pseudo <"flat_store_short", VGPR_32>;
+def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>;
+def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
+def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
+def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
+
+defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
+ VGPR_32, i32, atomic_cmp_swap_flat,
+ v2i32, VReg_64>;
+
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap_x2",
+ VReg_64, i64, atomic_cmp_swap_flat,
+ v2i64, VReg_128>;
+
+defm FLAT_ATOMIC_SWAP : FLAT_Atomic_Pseudo <"flat_atomic_swap",
+ VGPR_32, i32, atomic_swap_flat>;
+
+defm FLAT_ATOMIC_SWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2",
+ VReg_64, i64, atomic_swap_flat>;
+
+defm FLAT_ATOMIC_ADD : FLAT_Atomic_Pseudo <"flat_atomic_add",
+ VGPR_32, i32, atomic_add_flat>;
+
+defm FLAT_ATOMIC_SUB : FLAT_Atomic_Pseudo <"flat_atomic_sub",
+ VGPR_32, i32, atomic_sub_flat>;
+
+defm FLAT_ATOMIC_SMIN : FLAT_Atomic_Pseudo <"flat_atomic_smin",
+ VGPR_32, i32, atomic_min_flat>;
+
+defm FLAT_ATOMIC_UMIN : FLAT_Atomic_Pseudo <"flat_atomic_umin",
+ VGPR_32, i32, atomic_umin_flat>;
+
+defm FLAT_ATOMIC_SMAX : FLAT_Atomic_Pseudo <"flat_atomic_smax",
+ VGPR_32, i32, atomic_max_flat>;
+
+defm FLAT_ATOMIC_UMAX : FLAT_Atomic_Pseudo <"flat_atomic_umax",
+ VGPR_32, i32, atomic_umax_flat>;
+
+defm FLAT_ATOMIC_AND : FLAT_Atomic_Pseudo <"flat_atomic_and",
+ VGPR_32, i32, atomic_and_flat>;
+
+defm FLAT_ATOMIC_OR : FLAT_Atomic_Pseudo <"flat_atomic_or",
+ VGPR_32, i32, atomic_or_flat>;
+
+defm FLAT_ATOMIC_XOR : FLAT_Atomic_Pseudo <"flat_atomic_xor",
+ VGPR_32, i32, atomic_xor_flat>;
+
+defm FLAT_ATOMIC_INC : FLAT_Atomic_Pseudo <"flat_atomic_inc",
+ VGPR_32, i32, atomic_inc_flat>;
+
+defm FLAT_ATOMIC_DEC : FLAT_Atomic_Pseudo <"flat_atomic_dec",
+ VGPR_32, i32, atomic_dec_flat>;
+
+defm FLAT_ATOMIC_ADD_X2 : FLAT_Atomic_Pseudo <"flat_atomic_add_x2",
+ VReg_64, i64, atomic_add_flat>;
+
+defm FLAT_ATOMIC_SUB_X2 : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2",
+ VReg_64, i64, atomic_sub_flat>;
+
+defm FLAT_ATOMIC_SMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2",
+ VReg_64, i64, atomic_min_flat>;
+
+defm FLAT_ATOMIC_UMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2",
+ VReg_64, i64, atomic_umin_flat>;
+
+defm FLAT_ATOMIC_SMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2",
+ VReg_64, i64, atomic_max_flat>;
+
+defm FLAT_ATOMIC_UMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2",
+ VReg_64, i64, atomic_umax_flat>;
+
+defm FLAT_ATOMIC_AND_X2 : FLAT_Atomic_Pseudo <"flat_atomic_and_x2",
+ VReg_64, i64, atomic_and_flat>;
+
+defm FLAT_ATOMIC_OR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_or_x2",
+ VReg_64, i64, atomic_or_flat>;
+
+defm FLAT_ATOMIC_XOR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2",
+ VReg_64, i64, atomic_xor_flat>;
+
+defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2",
+ VReg_64, i64, atomic_inc_flat>;
+
+defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
+ VReg_64, i64, atomic_dec_flat>;
+
+let SubtargetPredicate = isCI in { // CI Only flat instructions : FIXME Only?
+
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
+ VGPR_32, f32, null_frag, v2f32, VReg_64>;
+
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
+ VReg_64, f64, null_frag, v2f64, VReg_128>;
+
+defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin",
+ VGPR_32, f32>;
+
+defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
+ VGPR_32, f32>;
+
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2",
+ VReg_64, f64>;
+
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
+ VReg_64, f64>;
+
+} // End SubtargetPredicate = isCI
+
+//===----------------------------------------------------------------------===//
+// Flat Patterns
+//===----------------------------------------------------------------------===//
+
+class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), // matches ld only in the FLAT, GLOBAL or CONSTANT address space
+ (ld node:$ptr), [{
+ auto const AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
+}]>;
+
+class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), // matches st only in the FLAT or GLOBAL address space
+ (st node:$val, node:$ptr), [{
+ auto const AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS;
+}]>;
+
+// Load fragments restricted by flat_ld.
+def atomic_flat_load   : flat_ld <atomic_load>;
+def flat_load          : flat_ld <load>;
+def flat_az_extloadi8  : flat_ld <az_extloadi8>;
+def flat_sextloadi8    : flat_ld <sextloadi8>;
+def flat_az_extloadi16 : flat_ld <az_extloadi16>;
+def flat_sextloadi16   : flat_ld <sextloadi16>;
+
+// Store fragments restricted by flat_st.
+def atomic_flat_store  : flat_st <atomic_store>;
+def flat_store         : flat_st <store>;
+def flat_truncstorei8  : flat_st <truncstorei8>;
+def flat_truncstorei16 : flat_st <truncstorei16>;
+
+// Patterns for global loads with no offset.
+//
+// Selects `node` applied to a 64-bit address and producing `vt` to `inst`,
+// with all three trailing immediates set to 0 (presumably the glc/slc/tfe
+// modifiers -- confirm against the FLAT load pseudo's operand list).
+class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
+  : Pat <(vt (node i64:$addr)),
+         (inst $addr, 0, 0, 0)>;
+
+// Same shape as a plain flat load pattern, except that the first trailing
+// immediate is 1 instead of 0 (presumably the glc bit -- confirm against the
+// FLAT load pseudo's operand list).
+class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
+                         ValueType vt>
+  : Pat <(vt (node i64:$addr)),
+         (inst $addr, 1, 0, 0)>;
+
+// Selects a store of `vt` data to a 64-bit address. The DAG node takes the
+// value first and the address second, while the instruction takes the address
+// first; all three trailing immediates are 0.
+class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
+  : Pat <(node vt:$data, i64:$addr),
+         (inst $addr, $data, 0, 0, 0)>;
+
+// Atomic counterpart of the flat store pattern: the first trailing immediate
+// is 1 rather than 0.
+class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
+                          ValueType vt>
+  : Pat <
+  // atomic store follows atomic binop convention so the address comes
+  // first.
+  (node i64:$addr, vt:$data),
+  (inst $addr, $data, 1, 0, 0)
+>;
+
+// Selects a value-returning atomic. `vt` is the result type; `data_vt` is the
+// type of the data operand and defaults to `vt` (the compare-and-swap
+// patterns override it with a two-element vector type).
+class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
+                     ValueType data_vt = vt>
+  : Pat <(vt (node i64:$addr, data_vt:$data)),
+         (inst $addr, $data, 0, 0)>;
+
+let Predicates = [isCIVI] in {
+
+// Extending and full-width loads.
+def : FlatLoadPat <FLAT_LOAD_UBYTE,   flat_az_extloadi8,  i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE,   flat_sextloadi8,    i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE,   flat_az_extloadi8,  i16>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE,   flat_sextloadi8,    i16>;
+def : FlatLoadPat <FLAT_LOAD_USHORT,  flat_az_extloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT,  flat_sextloadi16,   i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD,   flat_load,          i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load,          v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load,          v4i32>;
+
+// Atomic loads.
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORD,   atomic_flat_load, i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>;
+
+// Truncating and full-width stores.
+def : FlatStorePat <FLAT_STORE_BYTE,    flat_truncstorei8,  i32>;
+def : FlatStorePat <FLAT_STORE_SHORT,   flat_truncstorei16, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD,   flat_store,         i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store,         v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store,         v4i32>;
+
+// Atomic stores.
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD,   atomic_flat_store, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>;
+
+// 32-bit value-returning atomics, matched from the *_global nodes.
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN,     atomic_add_global,  i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN,     atomic_sub_global,  i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN,     atomic_inc_global,  i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN,     atomic_dec_global,  i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN,     atomic_and_global,  i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN,    atomic_max_global,  i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN,    atomic_umax_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN,    atomic_min_global,  i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN,    atomic_umin_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN,      atomic_or_global,   i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN,    atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN,     atomic_xor_global,  i32>;
+
+// 64-bit value-returning atomics, matched from the *_global nodes.
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN,     atomic_add_global,  i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN,     atomic_sub_global,  i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN,     atomic_inc_global,  i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN,     atomic_dec_global,  i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN,     atomic_and_global,  i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN,    atomic_max_global,  i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN,    atomic_umax_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN,    atomic_min_global,  i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN,    atomic_umin_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN,      atomic_or_global,   i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN,    atomic_swap_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN,     atomic_xor_global,  i64>;
+
+} // End Predicates = [isCIVI]
+
+// Stores of i16 values; these patterns are only enabled under the isVI
+// predicate.
+let Predicates = [isVI] in {
+  def : FlatStorePat <FLAT_STORE_BYTE,  flat_truncstorei8, i16>;
+  def : FlatStorePat <FLAT_STORE_SHORT, flat_store,        i16>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+// Encoded ("real") form of the FLAT pseudo `ps` with opcode `op` for CI. It
+// registers the pseudo's name under SIEncodingFamily.SI, is assembled only
+// under isCIOnly, and decodes in the "CI" namespace.
+class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps>
+  : FLAT_Real <op, ps>,
+    SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> {
+  let AssemblerPredicate = isCIOnly;
+  let DecoderNamespace = "CI";
+}
+
+// CI opcodes for flat loads.
+def FLAT_LOAD_UBYTE_ci    : FLAT_Real_ci <0x8,  FLAT_LOAD_UBYTE>;
+def FLAT_LOAD_SBYTE_ci    : FLAT_Real_ci <0x9,  FLAT_LOAD_SBYTE>;
+def FLAT_LOAD_USHORT_ci   : FLAT_Real_ci <0xa,  FLAT_LOAD_USHORT>;
+def FLAT_LOAD_SSHORT_ci   : FLAT_Real_ci <0xb,  FLAT_LOAD_SSHORT>;
+def FLAT_LOAD_DWORD_ci    : FLAT_Real_ci <0xc,  FLAT_LOAD_DWORD>;
+def FLAT_LOAD_DWORDX2_ci  : FLAT_Real_ci <0xd,  FLAT_LOAD_DWORDX2>;
+def FLAT_LOAD_DWORDX4_ci  : FLAT_Real_ci <0xe,  FLAT_LOAD_DWORDX4>;
+def FLAT_LOAD_DWORDX3_ci  : FLAT_Real_ci <0xf,  FLAT_LOAD_DWORDX3>;
+
+// CI opcodes for flat stores.
+def FLAT_STORE_BYTE_ci    : FLAT_Real_ci <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_SHORT_ci   : FLAT_Real_ci <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_DWORD_ci   : FLAT_Real_ci <0x1c, FLAT_STORE_DWORD>;
+def FLAT_STORE_DWORDX2_ci : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>;
+def FLAT_STORE_DWORDX4_ci : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>;
+def FLAT_STORE_DWORDX3_ci : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>;
+
+// Emits two CI encodings that share opcode `op`: one for the pseudo named by
+// ps.PseudoInstr and one for the pseudo whose name has "_RTN" appended.
+multiclass FLAT_Real_Atomics_ci <bits<7> op, FLAT_Pseudo ps> {
+  def _ci     : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
+  def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+}
+
+// CI opcodes for the 32-bit flat atomics.
+defm FLAT_ATOMIC_SWAP        : FLAT_Real_Atomics_ci <0x30, FLAT_ATOMIC_SWAP>;
+defm FLAT_ATOMIC_CMPSWAP     : FLAT_Real_Atomics_ci <0x31, FLAT_ATOMIC_CMPSWAP>;
+defm FLAT_ATOMIC_ADD         : FLAT_Real_Atomics_ci <0x32, FLAT_ATOMIC_ADD>;
+defm FLAT_ATOMIC_SUB         : FLAT_Real_Atomics_ci <0x33, FLAT_ATOMIC_SUB>;
+defm FLAT_ATOMIC_SMIN        : FLAT_Real_Atomics_ci <0x35, FLAT_ATOMIC_SMIN>;
+defm FLAT_ATOMIC_UMIN        : FLAT_Real_Atomics_ci <0x36, FLAT_ATOMIC_UMIN>;
+defm FLAT_ATOMIC_SMAX        : FLAT_Real_Atomics_ci <0x37, FLAT_ATOMIC_SMAX>;
+defm FLAT_ATOMIC_UMAX        : FLAT_Real_Atomics_ci <0x38, FLAT_ATOMIC_UMAX>;
+defm FLAT_ATOMIC_AND         : FLAT_Real_Atomics_ci <0x39, FLAT_ATOMIC_AND>;
+defm FLAT_ATOMIC_OR          : FLAT_Real_Atomics_ci <0x3a, FLAT_ATOMIC_OR>;
+defm FLAT_ATOMIC_XOR         : FLAT_Real_Atomics_ci <0x3b, FLAT_ATOMIC_XOR>;
+defm FLAT_ATOMIC_INC         : FLAT_Real_Atomics_ci <0x3c, FLAT_ATOMIC_INC>;
+defm FLAT_ATOMIC_DEC         : FLAT_Real_Atomics_ci <0x3d, FLAT_ATOMIC_DEC>;
+// CI opcodes for the 64-bit ("_X2") flat atomics.
+defm FLAT_ATOMIC_SWAP_X2     : FLAT_Real_Atomics_ci <0x50, FLAT_ATOMIC_SWAP_X2>;
+defm FLAT_ATOMIC_CMPSWAP_X2  : FLAT_Real_Atomics_ci <0x51, FLAT_ATOMIC_CMPSWAP_X2>;
+defm FLAT_ATOMIC_ADD_X2      : FLAT_Real_Atomics_ci <0x52, FLAT_ATOMIC_ADD_X2>;
+defm FLAT_ATOMIC_SUB_X2      : FLAT_Real_Atomics_ci <0x53, FLAT_ATOMIC_SUB_X2>;
+defm FLAT_ATOMIC_SMIN_X2     : FLAT_Real_Atomics_ci <0x55, FLAT_ATOMIC_SMIN_X2>;
+defm FLAT_ATOMIC_UMIN_X2     : FLAT_Real_Atomics_ci <0x56, FLAT_ATOMIC_UMIN_X2>;
+defm FLAT_ATOMIC_SMAX_X2     : FLAT_Real_Atomics_ci <0x57, FLAT_ATOMIC_SMAX_X2>;
+defm FLAT_ATOMIC_UMAX_X2     : FLAT_Real_Atomics_ci <0x58, FLAT_ATOMIC_UMAX_X2>;
+defm FLAT_ATOMIC_AND_X2      : FLAT_Real_Atomics_ci <0x59, FLAT_ATOMIC_AND_X2>;
+defm FLAT_ATOMIC_OR_X2       : FLAT_Real_Atomics_ci <0x5a, FLAT_ATOMIC_OR_X2>;
+defm FLAT_ATOMIC_XOR_X2      : FLAT_Real_Atomics_ci <0x5b, FLAT_ATOMIC_XOR_X2>;
+defm FLAT_ATOMIC_INC_X2      : FLAT_Real_Atomics_ci <0x5c, FLAT_ATOMIC_INC_X2>;
+defm FLAT_ATOMIC_DEC_X2      : FLAT_Real_Atomics_ci <0x5d, FLAT_ATOMIC_DEC_X2>;
+
+// CI Only flat instructions
+defm FLAT_ATOMIC_FCMPSWAP    : FLAT_Real_Atomics_ci <0x3e, FLAT_ATOMIC_FCMPSWAP>;
+defm FLAT_ATOMIC_FMIN        : FLAT_Real_Atomics_ci <0x3f, FLAT_ATOMIC_FMIN>;
+defm FLAT_ATOMIC_FMAX        : FLAT_Real_Atomics_ci <0x40, FLAT_ATOMIC_FMAX>;
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_ci <0x5e, FLAT_ATOMIC_FCMPSWAP_X2>;
+defm FLAT_ATOMIC_FMIN_X2     : FLAT_Real_Atomics_ci <0x5f, FLAT_ATOMIC_FMIN_X2>;
+defm FLAT_ATOMIC_FMAX_X2     : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+// Encoded ("real") form of the FLAT pseudo `ps` with opcode `op` for VI. It
+// registers the pseudo's name under SIEncodingFamily.VI, is assembled only
+// under isVI, and decodes in the "VI" namespace.
+class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps>
+  : FLAT_Real <op, ps>,
+    SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+  let AssemblerPredicate = isVI;
+  let DecoderNamespace = "VI";
+}
+
+// VI opcodes for flat loads. Note that DWORDX4 (0x17) is listed before
+// DWORDX3 (0x16).
+def FLAT_LOAD_UBYTE_vi    : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
+def FLAT_LOAD_SBYTE_vi    : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
+def FLAT_LOAD_USHORT_vi   : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
+def FLAT_LOAD_SSHORT_vi   : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>;
+def FLAT_LOAD_DWORD_vi    : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>;
+def FLAT_LOAD_DWORDX2_vi  : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>;
+def FLAT_LOAD_DWORDX4_vi  : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
+def FLAT_LOAD_DWORDX3_vi  : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
+
+// VI opcodes for flat stores. DWORDX4 (0x1f) is listed before DWORDX3 (0x1e).
+def FLAT_STORE_BYTE_vi    : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_SHORT_vi   : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_DWORD_vi   : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
+def FLAT_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
+def FLAT_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
+def FLAT_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
+
+// Emits two VI encodings that share opcode `op`: one for the pseudo named by
+// ps.PseudoInstr and one for the pseudo whose name has "_RTN" appended.
+multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps> {
+  def _vi     : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
+  def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+}
+
+// VI opcodes for the 32-bit flat atomics.
+defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40, FLAT_ATOMIC_SWAP>;
+defm FLAT_ATOMIC_CMPSWAP    : FLAT_Real_Atomics_vi <0x41, FLAT_ATOMIC_CMPSWAP>;
+defm FLAT_ATOMIC_ADD        : FLAT_Real_Atomics_vi <0x42, FLAT_ATOMIC_ADD>;
+defm FLAT_ATOMIC_SUB        : FLAT_Real_Atomics_vi <0x43, FLAT_ATOMIC_SUB>;
+defm FLAT_ATOMIC_SMIN       : FLAT_Real_Atomics_vi <0x44, FLAT_ATOMIC_SMIN>;
+defm FLAT_ATOMIC_UMIN       : FLAT_Real_Atomics_vi <0x45, FLAT_ATOMIC_UMIN>;
+defm FLAT_ATOMIC_SMAX       : FLAT_Real_Atomics_vi <0x46, FLAT_ATOMIC_SMAX>;
+defm FLAT_ATOMIC_UMAX       : FLAT_Real_Atomics_vi <0x47, FLAT_ATOMIC_UMAX>;
+defm FLAT_ATOMIC_AND        : FLAT_Real_Atomics_vi <0x48, FLAT_ATOMIC_AND>;
+defm FLAT_ATOMIC_OR         : FLAT_Real_Atomics_vi <0x49, FLAT_ATOMIC_OR>;
+defm FLAT_ATOMIC_XOR        : FLAT_Real_Atomics_vi <0x4a, FLAT_ATOMIC_XOR>;
+defm FLAT_ATOMIC_INC        : FLAT_Real_Atomics_vi <0x4b, FLAT_ATOMIC_INC>;
+defm FLAT_ATOMIC_DEC        : FLAT_Real_Atomics_vi <0x4c, FLAT_ATOMIC_DEC>;
+// VI opcodes for the 64-bit ("_X2") flat atomics.
+defm FLAT_ATOMIC_SWAP_X2    : FLAT_Real_Atomics_vi <0x60, FLAT_ATOMIC_SWAP_X2>;
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_vi <0x61, FLAT_ATOMIC_CMPSWAP_X2>;
+defm FLAT_ATOMIC_ADD_X2     : FLAT_Real_Atomics_vi <0x62, FLAT_ATOMIC_ADD_X2>;
+defm FLAT_ATOMIC_SUB_X2     : FLAT_Real_Atomics_vi <0x63, FLAT_ATOMIC_SUB_X2>;
+defm FLAT_ATOMIC_SMIN_X2    : FLAT_Real_Atomics_vi <0x64, FLAT_ATOMIC_SMIN_X2>;
+defm FLAT_ATOMIC_UMIN_X2    : FLAT_Real_Atomics_vi <0x65, FLAT_ATOMIC_UMIN_X2>;
+defm FLAT_ATOMIC_SMAX_X2    : FLAT_Real_Atomics_vi <0x66, FLAT_ATOMIC_SMAX_X2>;
+defm FLAT_ATOMIC_UMAX_X2    : FLAT_Real_Atomics_vi <0x67, FLAT_ATOMIC_UMAX_X2>;
+defm FLAT_ATOMIC_AND_X2     : FLAT_Real_Atomics_vi <0x68, FLAT_ATOMIC_AND_X2>;
+defm FLAT_ATOMIC_OR_X2      : FLAT_Real_Atomics_vi <0x69, FLAT_ATOMIC_OR_X2>;
+defm FLAT_ATOMIC_XOR_X2     : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>;
+defm FLAT_ATOMIC_INC_X2     : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>;
+defm FLAT_ATOMIC_DEC_X2     : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
new file mode 100644
index 000000000000..dd3b46f13921
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -0,0 +1,502 @@
+//===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNHazardRecognizer.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Hazard Recognizer Implementation
+//===----------------------------------------------------------------------===//
+
+// Creates a recognizer for \p MF. No instruction is pending at construction
+// time, the subtarget is looked up once and cached, and the look-ahead window
+// (MaxLookAhead) is set to 5.
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
+    : CurrCycleInstr(nullptr), MF(MF), ST(MF.getSubtarget<SISubtarget>()) {
+  MaxLookAhead = 5;
+}
+
+// Scheduling-unit overload: unwraps the machine instruction and defers to the
+// MachineInstr overload.
+void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
+  MachineInstr *Instr = SU->getInstr();
+  EmitInstruction(Instr);
+}
+
+// Latches \p MI as the instruction emitted in the current cycle. It is only
+// moved into the EmittedInstrs history by the next AdvanceCycle() call.
+void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
+  this->CurrCycleInstr = MI;
+}
+
+// Returns true for the F32 and F64 forms of V_DIV_FMAS.
+static bool isDivFMas(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::V_DIV_FMAS_F32:
+  case AMDGPU::V_DIV_FMAS_F64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true only for S_GETREG_B32.
+static bool isSGetReg(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_GETREG_B32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true for both S_SETREG forms (register and 32-bit immediate source).
+static bool isSSetReg(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_SETREG_B32:
+  case AMDGPU::S_SETREG_IMM32_B32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true for V_READLANE_B32 and V_WRITELANE_B32.
+static bool isRWLane(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::V_READLANE_B32:
+  case AMDGPU::V_WRITELANE_B32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true only for S_RFE_B64.
+static bool isRFE(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_RFE_B64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Extracts the hardware-register id from \p RegInstr: reads the operand named
+// simm16 and keeps only the bits selected by Hwreg::ID_MASK_.
+// The operand pointer is dereferenced unconditionally, so \p RegInstr must
+// have a simm16 operand.
+static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
+  const MachineOperand *Simm16 =
+      TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16);
+  const auto Encoded = Simm16->getImm();
+  return Encoded & AMDGPU::Hwreg::ID_MASK_;
+}
+
+// Reports NoopHazard when the instruction of \p SU falls into one of the
+// checked categories and the corresponding check*Hazards() routine asks for
+// at least one wait state; reports NoHazard otherwise. \p Stalls is not
+// consulted.
+ScheduleHazardRecognizer::HazardType
+GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  MachineInstr *MI = SU->getInstr();
+
+  // Each clause pairs a category test with its hazard check. Evaluation is
+  // left to right and stops at the first clause that is true.
+  const bool NeedsNoop =
+      (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) ||
+      (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0) ||
+      (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) ||
+      (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) ||
+      (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) ||
+      (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) ||
+      (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) ||
+      (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) ||
+      (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0);
+
+  return NeedsNoop ? NoopHazard : NoHazard;
+}
+
+// Scheduling-unit overload: unwraps the machine instruction and defers to the
+// MachineInstr overload.
+unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
+  MachineInstr *Instr = SU->getInstr();
+  return PreEmitNoops(Instr);
+}
+
+// Returns the number of no-ops that must be emitted in front of \p MI.
+//
+// The individual check*Hazards() routines may return a negative value when
+// more wait states have elapsed than are required, so every result is clamped
+// to zero before it is returned.
+unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+  if (SIInstrInfo::isSMRD(*MI))
+    return std::max(0, checkSMRDHazards(MI));
+
+  const bool IsVALU = SIInstrInfo::isVALU(*MI);
+  const bool IsVMEM = SIInstrInfo::isVMEM(*MI);
+
+  if (IsVALU || IsVMEM) {
+    int WaitStates = 0;
+
+    // The VMEM check must not be nested under the VALU test. getHazardType()
+    // tests isVMEM independently of isVALU; when the check lived inside the
+    // VALU branch it was skipped for every VMEM instruction that is not also
+    // flagged VALU, so the two entry points could disagree.
+    if (IsVMEM)
+      WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+
+    if (IsVALU) {
+      WaitStates = std::max(WaitStates, checkVALUHazards(MI));
+
+      if (SIInstrInfo::isDPP(*MI))
+        WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+
+      if (isDivFMas(MI->getOpcode()))
+        WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+
+      if (isRWLane(MI->getOpcode()))
+        WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
+    }
+
+    return WaitStates;
+  }
+
+  if (isSGetReg(MI->getOpcode()))
+    return std::max(0, checkGetRegHazards(MI));
+
+  if (isSSetReg(MI->getOpcode()))
+    return std::max(0, checkSetRegHazards(MI));
+
+  if (isRFE(MI->getOpcode()))
+    return std::max(0, checkRFEHazards(MI));
+
+  return 0;
+}
+
+// Records one wait state that carries no instruction by pushing a null entry
+// onto the front of the EmittedInstrs history.
+void GCNHazardRecognizer::EmitNoop() {
+  MachineInstr *const NoInstr = nullptr;
+  EmittedInstrs.push_front(NoInstr);
+}
+
+// Moves the instruction latched by EmitInstruction() into the EmittedInstrs
+// history, followed by one null entry for each extra wait state it occupies,
+// and then resizes the history to the look-ahead window.
+void GCNHazardRecognizer::AdvanceCycle() {
+  // When the scheduler detects a stall, it calls AdvanceCycle() without
+  // having emitted an instruction; nothing is recorded in that case.
+  if (CurrCycleInstr == nullptr)
+    return;
+
+  const unsigned Window = getMaxLookAhead();
+  const unsigned NumWaitStates =
+      ST.getInstrInfo()->getNumWaitStates(*CurrCycleInstr);
+
+  // The newest instruction always sits at the front of the history.
+  EmittedInstrs.push_front(CurrCycleInstr);
+
+  // The instruction itself accounts for the first wait state; add a nullptr
+  // for each additional one. Never add more than getMaxLookAhead() entries in
+  // total, since the list is cut to that size immediately below.
+  unsigned Remaining = std::min(NumWaitStates, Window);
+  while (Remaining > 1) {
+    EmittedInstrs.push_front(nullptr);
+    --Remaining;
+  }
+
+  // getMaxLookAhead() is the largest number of wait states we will ever need
+  // to insert, so there is no point in tracking more than that many.
+  EmittedInstrs.resize(Window);
+
+  CurrCycleInstr = nullptr;
+}
+
+// Bottom-up scheduling is not implemented by this recognizer, so reaching
+// this function is treated as a programming error.
+void GCNHazardRecognizer::RecedeCycle() {
+  llvm_unreachable(
+      "hazard recognizer does not support bottom-up scheduling.");
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+// Walks the history from newest to oldest and returns the zero-based position
+// of the first entry that holds an instruction accepted by \p IsHazard. Null
+// entries are counted but never passed to the predicate. Returns INT_MAX when
+// no entry matches.
+int GCNHazardRecognizer::getWaitStatesSince(
+    function_ref<bool(MachineInstr *)> IsHazard) {
+  int Elapsed = 0;
+  for (MachineInstr *Emitted : EmittedInstrs) {
+    if (Emitted && IsHazard(Emitted))
+      return Elapsed;
+    ++Elapsed;
+  }
+  return std::numeric_limits<int>::max();
+}
+
+// Like getWaitStatesSince(), but an instruction only counts when it is
+// accepted by \p IsHazardDef and also modifies \p Reg.
+int GCNHazardRecognizer::getWaitStatesSinceDef(
+    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+  const SIRegisterInfo *RegInfo = ST.getRegisterInfo();
+
+  auto DefinesReg = [IsHazardDef, RegInfo, Reg](MachineInstr *Candidate) {
+    if (!IsHazardDef(Candidate))
+      return false;
+    return Candidate->modifiesRegister(Reg, RegInfo);
+  };
+
+  return getWaitStatesSince(DefinesReg);
+}
+
+// Like getWaitStatesSince(), but an instruction only counts when it is one of
+// the S_SETREG opcodes and is accepted by \p IsHazard.
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(
+    function_ref<bool(MachineInstr *)> IsHazard) {
+  auto IsMatchingSetReg = [IsHazard](MachineInstr *Candidate) {
+    if (!isSSetReg(Candidate->getOpcode()))
+      return false;
+    return IsHazard(Candidate);
+  };
+
+  return getWaitStatesSince(IsMatchingSetReg);
+}
+
+//===----------------------------------------------------------------------===//
+// No-op Hazard Detection
+//===----------------------------------------------------------------------===//
+
+// Inserts the register number of every register operand in \p Ops into
+// \p Set; operands that are not registers are skipped.
+static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops,
+                         std::set<unsigned> &Set) {
+  for (auto It = Ops.begin(), End = Ops.end(); It != End; ++It) {
+    if (!It->isReg())
+      continue;
+    Set.insert(It->getReg());
+  }
+}
+
+// Returns 1 when \p SMEM must not join the run of SMEM instructions that was
+// emitted immediately before it, and 0 otherwise.
+int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
+  // SMEM soft clauses are only present on VI+.
+  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  // A soft clause is any group of consecutive SMEM instructions. The
+  // instructions in this group may return out of order and/or may be
+  // replayed (i.e. the same instruction issued more than once).
+  //
+  // In order to handle these situations correctly we need to make sure
+  // that when a clause has more than one instruction, no instruction in the
+  // clause writes to a register that is read by another instruction in the
+  // clause (including itself). If we encounter this situation, we need to
+  // break the clause by inserting a non-SMEM instruction.
+
+  std::set<unsigned> ClauseDefs;
+  std::set<unsigned> ClauseUses;
+
+  // Gather the registers of the clause that is currently open. The first null
+  // entry or non-SMEM instruction marks the start of the clause.
+  for (MachineInstr *Prev : EmittedInstrs) {
+    if (!Prev || !SIInstrInfo::isSMRD(*Prev))
+      break;
+
+    addRegsToSet(Prev->defs(), ClauseDefs);
+    addRegsToSet(Prev->uses(), ClauseUses);
+  }
+
+  if (ClauseDefs.empty())
+    return 0;
+
+  // FIXME: When we support stores, we need to make sure not to put loads and
+  // stores in the same clause if they use the same address. For now, just
+  // start a new clause whenever we see a store.
+  if (SMEM->mayStore())
+    return 1;
+
+  addRegsToSet(SMEM->defs(), ClauseDefs);
+  addRegsToSet(SMEM->uses(), ClauseUses);
+
+  // If any register is both defined and used inside the extended clause then
+  // this instruction cannot be added to it, so we have a hazard.
+  for (unsigned DefReg : ClauseDefs) {
+    if (ClauseUses.count(DefReg))
+      return 1;
+  }
+
+  return 0;
+}
+
+// Returns the number of wait states \p SMRD still needs. The soft-clause
+// check always runs; the SGPR read-after-VALU-write check is applied on top
+// of it for SOUTHERN_ISLANDS only.
+int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
+  int WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
+
+  // This SMRD hazard only affects SI.
+  if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
+    return WaitStatesNeeded;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // A read of an SGPR by SMRD instruction requires 4 wait states when the
+  // SGPR was written by a VALU instruction.
+  const int SmrdSgprWaitStates = 4;
+  auto IsVALUDef = [TII](MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  for (const MachineOperand &Use : SMRD->uses()) {
+    if (Use.isReg()) {
+      const int Elapsed = getWaitStatesSinceDef(Use.getReg(), IsVALUDef);
+      WaitStatesNeeded =
+          std::max(WaitStatesNeeded, SmrdSgprWaitStates - Elapsed);
+    }
+  }
+
+  return WaitStatesNeeded;
+}
+
+// Returns the number of wait states \p VMEM still needs because one of its
+// non-VGPR register operands was recently written by a VALU instruction.
+// Always 0 on generations before VOLCANIC_ISLANDS.
+int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
+  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
+  // SGPR was written by a VALU Instruction.
+  const int VmemSgprWaitStates = 5;
+  auto IsVALUDef = [TII](MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  int WaitStatesNeeded = 0;
+  for (const MachineOperand &Use : VMEM->uses()) {
+    if (!Use.isReg())
+      continue;
+    if (TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+      continue;
+
+    const int Elapsed = getWaitStatesSinceDef(Use.getReg(), IsVALUDef);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, VmemSgprWaitStates - Elapsed);
+  }
+
+  return WaitStatesNeeded;
+}
+
+// Returns the number of wait states \p DPP still needs because one of its
+// VGPR operands was recently written. Any writing instruction counts: the
+// default (accept-all) predicate of getWaitStatesSinceDef() is used.
+int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  // Check for DPP VGPR read after VALU VGPR write.
+  const int DppVgprWaitStates = 2;
+
+  int WaitStatesNeeded = 0;
+  for (const MachineOperand &Use : DPP->uses()) {
+    const bool IsVGPRUse =
+        Use.isReg() && TRI->isVGPR(MF.getRegInfo(), Use.getReg());
+    if (!IsVGPRUse)
+      continue;
+
+    const int Elapsed = getWaitStatesSinceDef(Use.getReg());
+    WaitStatesNeeded = std::max(WaitStatesNeeded, DppVgprWaitStates - Elapsed);
+  }
+
+  return WaitStatesNeeded;
+}
+
+// Returns the number of wait states \p DivFMas still needs after the most
+// recent VALU write to VCC. The result may be negative when more than enough
+// wait states have already elapsed.
+int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
+  // instruction.
+  const int DivFMasWaitStates = 4;
+
+  auto IsVALUDef = [TII](MachineInstr *MI) { return TII->isVALU(*MI); };
+  const int Elapsed = getWaitStatesSinceDef(AMDGPU::VCC, IsVALUDef);
+
+  return DivFMasWaitStates - Elapsed;
+}
+
+// Returns the number of wait states \p GetRegInstr still needs after the most
+// recent S_SETREG that targeted the same hardware register (2 are required).
+// The result may be negative.
+int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const unsigned WantedHWReg = getHWReg(TII, *GetRegInstr);
+
+  auto TouchesSameHWReg = [TII, WantedHWReg](MachineInstr *MI) {
+    return getHWReg(TII, *MI) == WantedHWReg;
+  };
+
+  const int GetRegWaitStates = 2;
+  const int Elapsed = getWaitStatesSinceSetReg(TouchesSameHWReg);
+  return GetRegWaitStates - Elapsed;
+}
+
+// Returns the number of wait states \p SetRegInstr still needs after the most
+// recent S_SETREG that targeted the same hardware register. One wait state is
+// required up to and including SEA_ISLANDS, two on later generations. The
+// result may be negative.
+int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const unsigned WantedHWReg = getHWReg(TII, *SetRegInstr);
+
+  int SetRegWaitStates = 2;
+  if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+    SetRegWaitStates = 1;
+
+  auto TouchesSameHWReg = [TII, WantedHWReg](MachineInstr *MI) {
+    return getHWReg(TII, *MI) == WantedHWReg;
+  };
+
+  const int Elapsed = getWaitStatesSinceSetReg(TouchesSameHWReg);
+  return SetRegWaitStates - Elapsed;
+}
+
+// Decides whether the store \p MI can have its data register overwritten by
+// a following VALU instruction.
+//
+// Returns the index of the store-data operand when the hazard exists, and -1
+// otherwise (including for every instruction that does not store).
+int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
+  if (!MI.mayStore())
+    return -1;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  // Both the buffer and the flat cases below look at the operand named vdata,
+  // so look it up once. VDataRCID stays -1 when there is no such operand.
+  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
+  int VDataRCID = -1;
+  if (VDataIdx != -1)
+    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
+
+  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
+    // There is no hazard if the instruction does not use vector regs
+    // (like wbinvl1)
+    if (VDataIdx == -1)
+      return -1;
+    // For MUBUF/MTBUF instructions this hazard only exists if the
+    // instruction is not using a register in the soffset field.
+    const MachineOperand *SOffset =
+        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+    // If we have no soffset operand, then assume this field has been
+    // hardcoded to zero.
+    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
+        (!SOffset || !SOffset->isReg()))
+      return VDataIdx;
+  }
+
+  // MIMG instructions create a hazard if they don't use a 256-bit T# and
+  // the store size is greater than 8 bytes and they have more than two bits
+  // of their dmask set.
+  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
+  if (TII->isMIMG(MI)) {
+    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+    assert(SRsrcIdx != -1 &&
+           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
+    (void)SRsrcIdx;
+  }
+
+  if (TII->isFLAT(MI)) {
+    // getNamedOperandIdx() yields -1 when the opcode has no operand named
+    // vdata. Indexing Desc.OpInfo with -1 would read out of bounds, so such
+    // instructions are reported as hazard-free instead.
+    if (VDataIdx != -1 && AMDGPU::getRegBitWidth(VDataRCID) > 64)
+      return VDataIdx;
+  }
+
+  return -1;
+}
+
+// Returns the number of wait states \p VALU still needs before it may write
+// its VGPR results. Always 0 on subtargets without the 12-dword store hazard.
+int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+  // This checks for the hazard where VMEM instructions that store more than
+  // 8 bytes can have their store data overwritten by the next instruction.
+  if (!ST.has12DWordStoreHazard())
+    return 0;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo();
+
+  const int VALUWaitStates = 1;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Def : VALU->defs()) {
+    const unsigned DefReg = Def.getReg();
+    if (!TRI->isVGPR(MRI, DefReg))
+      continue;
+
+    // Accepts an earlier store whose data register overlaps DefReg.
+    auto ReadsDefAsStoreData = [this, DefReg, TRI](MachineInstr *MI) {
+      const int DataIdx = createsVALUHazard(*MI);
+      if (DataIdx < 0)
+        return false;
+      return TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), DefReg);
+    };
+
+    const int Elapsed = getWaitStatesSince(ReadsDefAsStoreData);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, VALUWaitStates - Elapsed);
+  }
+
+  return WaitStatesNeeded;
+}
+
+// Returns the number of wait states \p RWLane still needs after the most
+// recent VALU write to its lane-select register (4 are required). Returns 0
+// when the src1 operand is not an SGPR register. The result may be negative.
+int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI =
+      RWLane->getParent()->getParent()->getRegInfo();
+
+  // The lane select is the operand named src1.
+  const MachineOperand *LaneSelectOp =
+      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
+
+  if (!LaneSelectOp->isReg())
+    return 0;
+  if (!TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
+    return 0;
+
+  const unsigned LaneSelectReg = LaneSelectOp->getReg();
+  auto IsVALUDef = [TII](MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  const int RWLaneWaitStates = 4;
+  const int Elapsed = getWaitStatesSinceDef(LaneSelectReg, IsVALUDef);
+  return RWLaneWaitStates - Elapsed;
+}
+
+// Returns the number of wait states \p RFE still needs after the most recent
+// S_SETREG that targeted the TRAPSTS hardware register (1 is required).
+// Always 0 on generations before VOLCANIC_ISLANDS. The result may be
+// negative.
+int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
+  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  auto WritesTrapSts = [TII](MachineInstr *MI) {
+    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
+  };
+
+  const int RFEWaitStates = 1;
+  const int Elapsed = getWaitStatesSinceSetReg(WritesTrapSts);
+  return RFEWaitStates - Elapsed;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
new file mode 100644
index 000000000000..0ab82ff4635b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -0,0 +1,71 @@
+//===-- GCNHazardRecognizer.h - GCN Hazard Recognizers ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+#define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include <list>
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+class ScheduleDAG;
+class SIInstrInfo;
+class SISubtarget;
+
+class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+ // This variable stores the instruction that has been emitted this cycle. It
+ // will be added to EmittedInstrs when AdvanceCycle() is called.
+ // (RecedeCycle() is declared below but bottom-up scheduling is unsupported.)
+ MachineInstr *CurrCycleInstr;
+ std::list<MachineInstr*> EmittedInstrs; // Newest first; nullptr = wait state without an instruction.
+ const MachineFunction &MF;
+ const SISubtarget &ST; // Subtarget of MF, cached by the constructor.
+
+ int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard); // INT_MAX when nothing matches.
+ int getWaitStatesSinceDef(unsigned Reg,
+ function_ref<bool(MachineInstr *)> IsHazardDef =
+ [](MachineInstr *) { return true; }); // Default predicate accepts every instruction.
+ int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard);
+
+ int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
+ int checkSMRDHazards(MachineInstr *SMRD);
+ int checkVMEMHazards(MachineInstr* VMEM);
+ int checkDPPHazards(MachineInstr *DPP);
+ int checkDivFMasHazards(MachineInstr *DivFMas);
+ int checkGetRegHazards(MachineInstr *GetRegInstr);
+ int checkSetRegHazards(MachineInstr *SetRegInstr);
+ int createsVALUHazard(const MachineInstr &MI); // Store-data operand index, or -1 when there is no hazard.
+ int checkVALUHazards(MachineInstr *VALU);
+ int checkRWLaneHazards(MachineInstr *RWLane);
+ int checkRFEHazards(MachineInstr *RFE);
+public:
+ GCNHazardRecognizer(const MachineFunction &MF);
+ // We can only issue one instruction per cycle.
+ bool atIssueLimit() const override { return true; }
+ void EmitInstruction(SUnit *SU) override;
+ void EmitInstruction(MachineInstr *MI) override;
+ HazardType getHazardType(SUnit *SU, int Stalls) override;
+ void EmitNoop() override;
+ unsigned PreEmitNoops(SUnit *SU) override;
+ unsigned PreEmitNoops(MachineInstr *) override;
+ void AdvanceCycle() override;
+ void RecedeCycle() override; // Not supported.
+};
+
+} // end namespace llvm
+
+#endif //LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
new file mode 100644
index 000000000000..2f88033c807f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -0,0 +1,312 @@
+//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This contains a MachineSchedStrategy implementation for maximizing wave
+/// occupancy on GCN hardware.
+//===----------------------------------------------------------------------===//
+
+#include "GCNSchedStrategy.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+using namespace llvm;
+
+/// Constructs the strategy by forwarding the scheduling context \p C to the
+/// GenericScheduler base; no additional state is initialized here.
+GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
+    const MachineSchedContext *C)
+    : GenericScheduler(C) {}
+
+/// Returns the smallest of the three occupancies the subtarget reports for
+/// \p SGPRs SGPRs, \p VGPRs VGPRs and the LDS size of \p MF.
+static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
+                            const MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  const auto SGPROccupancy = ST.getOccupancyWithNumSGPRs(SGPRs);
+  const auto VGPROccupancy = ST.getOccupancyWithNumVGPRs(VGPRs);
+  unsigned RegOccupancy = std::min(SGPROccupancy, VGPROccupancy);
+
+  const auto LDSOccupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize());
+  return std::min(RegOccupancy, LDSOccupancy);
+}
+
+/// Initializes \p Cand for \p SU and records in Cand.RPDelta how scheduling
+/// \p SU would move SGPR/VGPR pressure relative to the excess and critical
+/// limits.
+///
+/// \param Cand       Candidate to fill in; SU, AtTop and RPDelta are written.
+/// \param SU         Scheduling unit being evaluated.
+/// \param AtTop      True to query downward pressure, false for upward
+///                   pressure.
+/// \param RPTracker  Pressure tracker of the zone. It is used through a
+///                   non-const alias because the pressure queries modify it
+///                   temporarily.
+/// \param SRI        Provides the SGPR and VGPR pressure-set indices.
+/// \param SGPRPressure,VGPRPressure            Pressure before scheduling SU.
+/// \param SGPRExcessLimit,VGPRExcessLimit      Excess thresholds.
+/// \param SGPRCriticalLimit,VGPRCriticalLimit  Critical thresholds.
+///
+/// The four limits are taken by value and lowered by ErrorMargin inside this
+/// function before they are compared against the new pressure.
+void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+                                     bool AtTop, const RegPressureTracker &RPTracker,
+                                     const SIRegisterInfo *SRI,
+                                     int SGPRPressure,
+                                     int VGPRPressure,
+                                     int SGPRExcessLimit,
+                                     int VGPRExcessLimit,
+                                     int SGPRCriticalLimit,
+                                     int VGPRCriticalLimit) {
+
+  Cand.SU = SU;
+  Cand.AtTop = AtTop;
+
+  // getDownwardPressure() and getUpwardPressure() make temporary changes to
+  // the tracker, so we need to pass those functions a non-const copy.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  std::vector<unsigned> Pressure;
+  std::vector<unsigned> MaxPressure;
+
+  if (AtTop)
+    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  else {
+    // FIXME: I think for bottom up scheduling, the register pressure is cached
+    // and can be retrieved by DAG->getPressureDif(SU).
+    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  }
+
+  // Pressure of each register file after SU has been scheduled.
+  int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+  int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+
+  // If two instructions increase the pressure of different register sets
+  // by the same amount, the generic scheduler will prefer to schedule the
+  // instruction that increases the set with the least amount of registers,
+  // which in our case would be SGPRs.  This is rarely what we want, so
+  // when we report excess/critical register pressure, we do it either
+  // only for VGPRs or only for SGPRs.
+
+  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
+  const int MaxVGPRPressureInc = 16;
+  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
+  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
+
+
+  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
+  // to increase the likelihood we don't go over the limits.  We should improve
+  // the analysis to look through dependencies to find the path with the least
+  // register pressure.
+  // FIXME: This is also necessary, because some passes that run after
+  // scheduling and before regalloc increase register pressure.
+  const int ErrorMargin = 3;
+  VGPRExcessLimit -= ErrorMargin;
+  SGPRExcessLimit -= ErrorMargin;
+
+  // We only need to update the RPDelta for instructions that increase
+  // register pressure.  Instructions that decrease or keep reg pressure
+  // the same will be marked as RegExcess in tryCandidate() when they
+  // are compared with instructions that increase the register pressure.
+  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
+    Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
+    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
+  }
+
+  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
+    Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
+    // The excess is the amount above the limit, mirroring the VGPR case.
+    // NewSGPRPressure must stay untouched because SGPRDelta below uses it.
+    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
+  }
+
+  // Register pressure is considered 'CRITICAL' if it is approaching a value
+  // that would reduce the wave occupancy for the execution unit.  When
+  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
+  // has the same cost, so we don't need to prefer one over the other.
+
+  VGPRCriticalLimit -= ErrorMargin;
+  SGPRCriticalLimit -= ErrorMargin;
+
+  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
+  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
+
+  // Report whichever register file exceeds its critical limit by more; ties
+  // go to VGPRs.
+  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+    if (SGPRDelta > VGPRDelta) {
+      Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet());
+      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
+    } else {
+      Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet());
+      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
+    }
+  }
+}
+
+// Mostly cut and pasted from GenericScheduler::pickNodeFromQueue().
+//
+/// Evaluates every unit in Zone.Available and leaves the preferred one in
+/// \p Cand. Pressure figures and limits are computed once, before the loop,
+/// from \p RPTracker, the register class info and the subtarget.
+void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+                                         const CandPolicy &ZonePolicy,
+                                         const RegPressureTracker &RPTracker,
+                                         SchedCandidate &Cand) {
+  const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+
+  // Pressure at the tracker's current position.
+  ArrayRef<unsigned> CurPressure = RPTracker.getRegSetPressureAtPos();
+  unsigned SGPRPressure = CurPressure[SRI->getSGPRPressureSet()];
+  unsigned VGPRPressure = CurPressure[SRI->getVGPRPressureSet()];
+
+  // Excess limits: number of allocatable 32-bit registers in each file.
+  unsigned SGPRExcessLimit =
+      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
+  unsigned VGPRExcessLimit =
+      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
+
+  // Critical limits: register budget at the wave count returned by
+  // getMaxWaves() for the current pressure.
+  unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
+  unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves, true);
+  unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves);
+
+  for (SUnit *SU : Zone.Available) {
+    SchedCandidate TryCand(ZonePolicy);
+    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
+                  SGPRPressure, VGPRPressure,
+                  SGPRExcessLimit, VGPRExcessLimit,
+                  SGPRCriticalLimit, VGPRCriticalLimit);
+
+    // Pass SchedBoundary only when comparing nodes from the same boundary.
+    const bool SameBoundary = Cand.AtTop == TryCand.AtTop;
+    SchedBoundary *ZoneArg = SameBoundary ? &Zone : nullptr;
+    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+
+    // tryCandidate() leaves Reason at NoCand when TryCand is not preferred.
+    if (TryCand.Reason == NoCand)
+      continue;
+
+    // Initialize resource delta if needed in case future heuristics query it.
+    if (TryCand.ResDelta == SchedResourceDelta())
+      TryCand.initResourceDelta(Zone.DAG, SchedModel);
+    Cand.setBest(TryCand);
+  }
+}
+
+/// Maps a candidate reason to the rank used when comparing the top and
+/// bottom candidates: RegCritical and RegExcess yield the negated enum value,
+/// every other reason yields the enum value itself.
+static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
+  const int Rank = static_cast<int>(Reason);
+  const bool IsPressureReason =
+      Reason == GenericSchedulerBase::RegCritical ||
+      Reason == GenericSchedulerBase::RegExcess;
+  return IsPressureReason ? -Rank : Rank;
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeBidirectional()
+//
+/// Chooses the next unit when both scheduling directions are allowed.
+///
+/// If either zone has only one choice, that unit is returned immediately.
+/// Otherwise the cached BotCand/TopCand are refreshed when stale and the two
+/// are compared to select the winner.
+///
+/// \param[out] IsTopNode  Set to true when the returned unit comes from the
+///                        top zone, false when it comes from the bottom zone.
+/// \returns the selected scheduling unit.
+SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+  if (SUnit *SU = Bot.pickOnlyChoice()) {
+    IsTopNode = false;
+    return SU;
+  }
+  if (SUnit *SU = Top.pickOnlyChoice()) {
+    IsTopNode = true;
+    return SU;
+  }
+  // Set the bottom-up policy based on the state of the current bottom zone and
+  // the instructions outside the zone, including the top zone.
+  CandPolicy BotPolicy;
+  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
+  // Set the top-down policy based on the state of the current top zone and
+  // the instructions outside the zone, including the bottom zone.
+  CandPolicy TopPolicy;
+  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+
+  // See if BotCand is still valid (because we previously scheduled from Top).
+  DEBUG(dbgs() << "Picking from Bot:\n");
+  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
+      BotCand.Policy != BotPolicy) {
+    BotCand.reset(CandPolicy());
+    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
+    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+  } else {
+    DEBUG(traceCandidate(BotCand));
+  }
+
+  // Check if the top Q has a better candidate.
+  DEBUG(dbgs() << "Picking from Top:\n");
+  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
+      TopCand.Policy != TopPolicy) {
+    TopCand.reset(CandPolicy());
+    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
+    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+  } else {
+    DEBUG(traceCandidate(TopCand));
+  }
+
+  // Pick best from BotCand and TopCand.
+  DEBUG(
+    dbgs() << "Top Cand: ";
+    traceCandidate(TopCand);
+    dbgs() << "Bot Cand: ";
+    traceCandidate(BotCand);
+  );
+  SchedCandidate Cand;
+  if (TopCand.Reason == BotCand.Reason) {
+    // Same reason on both sides: let the generic comparison decide, starting
+    // from BotCand. TopCand.Reason is cleared so tryCandidate() can signal a
+    // win by setting it, and restored if TopCand loses.
+    Cand = BotCand;
+    GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
+    TopCand.Reason = NoCand;
+    GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+    if (TopCand.Reason != NoCand) {
+      Cand.setBest(TopCand);
+    } else {
+      TopCand.Reason = TopReason;
+    }
+  } else {
+    // Different reasons: first take a pressure-driven candidate whose
+    // recorded unit increase is not positive (top checked before bottom,
+    // RegExcess before RegCritical); otherwise compare bidirectional ranks.
+    if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
+      Cand = TopCand;
+    } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
+      Cand = BotCand;
+    } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+      Cand = TopCand;
+    } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+      Cand = BotCand;
+    } else {
+      int TopRank = getBidirectionalReasonRank(TopCand.Reason);
+      int BotRank = getBidirectionalReasonRank(BotCand.Reason);
+      if (TopRank > BotRank) {
+        Cand = TopCand;
+      } else {
+        Cand = BotCand;
+      }
+    }
+  }
+  DEBUG(
+    dbgs() << "Picking: ";
+    traceCandidate(Cand);
+  );
+
+  IsTopNode = Cand.AtTop;
+  return Cand.SU;
+}
+
+// Mostly cut and pasted from GenericScheduler::pickNode().
+//
+/// Returns the next unit to schedule, or nullptr once the top and bottom of
+/// the region meet. \p IsTopNode is set to tell the caller which zone the
+/// unit was taken from. The chosen unit is removed from whichever ready
+/// queues still hold it.
+SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return nullptr;
+  }
+
+  SUnit *SU;
+  // Keep picking until we get a unit that has not been scheduled yet.
+  do {
+    if (RegionPolicy.OnlyTopDown) {
+      SU = Top.pickOnlyChoice();
+      if (SU == nullptr) {
+        CandPolicy EmptyPolicy;
+        TopCand.reset(EmptyPolicy);
+        pickNodeFromQueue(Top, EmptyPolicy, DAG->getTopRPTracker(), TopCand);
+        assert(TopCand.Reason != NoCand && "failed to find a candidate");
+        SU = TopCand.SU;
+      }
+      IsTopNode = true;
+    } else if (RegionPolicy.OnlyBottomUp) {
+      SU = Bot.pickOnlyChoice();
+      if (SU == nullptr) {
+        CandPolicy EmptyPolicy;
+        BotCand.reset(EmptyPolicy);
+        pickNodeFromQueue(Bot, EmptyPolicy, DAG->getBotRPTracker(), BotCand);
+        assert(BotCand.Reason != NoCand && "failed to find a candidate");
+        SU = BotCand.SU;
+      }
+      IsTopNode = false;
+    } else {
+      SU = pickNodeBidirectional(IsTopNode);
+    }
+  } while (SU->isScheduled);
+
+  // The unit may be ready in both zones; drop it from each queue it is in.
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+  return SU;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
new file mode 100644
index 000000000000..4cfc0cea81fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -0,0 +1,54 @@
+//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class SIRegisterInfo;
+
+/// This is a minimal scheduler strategy. The main difference between this
+/// and the GenericScheduler is that GCNSchedStrategy uses different
+/// heuristics to determine excess/critical pressure sets. Its goal is to
+/// maximize kernel occupancy (i.e. maximum number of waves per simd).
+class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+
+ // Picks a unit when both directions are allowed; IsTopNode receives the
+ // zone the unit was taken from.
+ SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+ // Evaluates the units in Zone.Available and leaves the preferred one in Cand.
+ void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand);
+
+ // Fills in Cand for SU, including the excess/critical pressure deltas
+ // computed from the given pressures and limits.
+ void initCandidate(SchedCandidate &Cand, SUnit *SU,
+ bool AtTop, const RegPressureTracker &RPTracker,
+ const SIRegisterInfo *SRI,
+ int SGPRPressure, int VGPRPressure,
+ int SGPRExcessLimit, int VGPRExcessLimit,
+ int SGPRCriticalLimit, int VGPRCriticalLimit);
+
+ // NOTE(review): GCNSchedStrategy.cpp, as added alongside this header, has
+ // no definition of this overload and only calls
+ // GenericScheduler::tryCandidate(); confirm whether this declaration is
+ // still needed.
+ void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone, const SIRegisterInfo *SRI,
+ unsigned SGPRPressure, unsigned VGPRPressure);
+
+public:
+ GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+
+ // MachineSchedStrategy entry point: returns the next unit to schedule.
+ SUnit *pickNode(bool &IsTopNode) override;
+};
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
new file mode 100644
index 000000000000..7172a0aa7167
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -0,0 +1,1108 @@
+//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstPrinter.h"
+#include "SIDefines.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUAsmUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+/// Prints \p MI to \p OS followed by the annotation \p Annot. The stream is
+/// flushed before anything is written.
+void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot,
+                                  const MCSubtargetInfo &STI) {
+  OS.flush();
+
+  // Instruction text first, then the annotation.
+  printInstruction(MI, STI, OS);
+  printAnnotation(OS, Annot);
+}
+
+/// Prints the low 4 bits of immediate operand \p OpNo in hexadecimal.
+void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const int64_t Nibble = MI->getOperand(OpNo).getImm() & 0xf;
+  O << formatHex(Nibble);
+}
+
+/// Prints the low 8 bits of immediate operand \p OpNo in hexadecimal.
+void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  const int64_t Byte = MI->getOperand(OpNo).getImm() & 0xff;
+  O << formatHex(Byte);
+}
+
+/// Prints immediate operand \p OpNo in hexadecimal: as its low 16 bits when
+/// the value fits in a signed or unsigned 16-bit integer, otherwise through
+/// printU32ImmOperand().
+void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const int64_t Value = MI->getOperand(OpNo).getImm();
+  const bool FitsIn16Bits = isInt<16>(Value) || isUInt<16>(Value);
+
+  // It's possible to end up with a 32-bit literal used with a 16-bit operand
+  // with ignored high bits. Print as 32-bit anyway in that case.
+  if (!FitsIn16Bits) {
+    printU32ImmOperand(MI, OpNo, STI, O);
+    return;
+  }
+
+  O << formatHex(static_cast<uint64_t>(Value & 0xffff));
+}
+
+/// Prints the low 4 bits of immediate operand \p OpNo in decimal.
+void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  const int64_t Nibble = MI->getOperand(OpNo).getImm() & 0xf;
+  O << formatDec(Nibble);
+}
+
+/// Prints the low 8 bits of immediate operand \p OpNo in decimal.
+void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  const int64_t Byte = MI->getOperand(OpNo).getImm() & 0xff;
+  O << formatDec(Byte);
+}
+
+/// Prints the low 16 bits of immediate operand \p OpNo in decimal.
+void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                              raw_ostream &O) {
+  const int64_t Half = MI->getOperand(OpNo).getImm() & 0xffff;
+  O << formatDec(Half);
+}
+
+/// Prints the low 32 bits of immediate operand \p OpNo in hexadecimal.
+void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const int64_t Word = MI->getOperand(OpNo).getImm() & 0xffffffff;
+  O << formatHex(Word);
+}
+
+/// Prints a space followed by \p BitName when immediate operand \p OpNo is
+/// non-zero; prints nothing otherwise.
+void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O, StringRef BitName) {
+  const bool IsSet = MI->getOperand(OpNo).getImm() != 0;
+  if (!IsSet)
+    return;
+
+  O << ' ' << BitName;
+}
+
+/// Prints " offen" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"offen");
+}
+
+/// Prints " idxen" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"idxen");
+}
+
+/// Prints " addr64" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"addr64");
+}
+
+/// Prints " offset:" and the 16-bit decimal value of operand \p OpNo when the
+/// immediate is non-zero; prints nothing for zero.
+void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  const bool HasOffset = MI->getOperand(OpNo).getImm() != 0;
+  if (!HasOffset)
+    return;
+
+  O << " offset:";
+  printU16ImmDecOperand(MI, OpNo, O);
+}
+
+/// Prints " offset:" and the 16-bit decimal value of operand \p OpNo. The
+/// zero test is performed on the immediate truncated to 16 bits.
+void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  const uint16_t Offset16 = MI->getOperand(OpNo).getImm();
+  if (Offset16 == 0)
+    return;
+
+  O << " offset:";
+  printU16ImmDecOperand(MI, OpNo, O);
+}
+
+/// Prints " offset0:" and the 8-bit decimal value of operand \p OpNo when the
+/// immediate is non-zero.
+void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const bool HasOffset = MI->getOperand(OpNo).getImm() != 0;
+  if (!HasOffset)
+    return;
+
+  O << " offset0:";
+  printU8ImmDecOperand(MI, OpNo, O);
+}
+
+/// Prints " offset1:" and the 8-bit decimal value of operand \p OpNo when the
+/// immediate is non-zero.
+void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const bool HasOffset = MI->getOperand(OpNo).getImm() != 0;
+  if (!HasOffset)
+    return;
+
+  O << " offset1:";
+  printU8ImmDecOperand(MI, OpNo, O);
+}
+
+/// Prints operand \p OpNo via printU32ImmOperand(), i.e. as a 32-bit
+/// hexadecimal value.
+void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  // No offset-specific formatting; delegate to the generic 32-bit printer.
+  printU32ImmOperand(MI, OpNo, STI, O);
+}
+
+/// Prints operand \p OpNo via printU32ImmOperand(), i.e. as a 32-bit
+/// hexadecimal value.
+void AMDGPUInstPrinter::printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  // No offset-specific formatting; delegate to the generic 32-bit printer.
+  printU32ImmOperand(MI, OpNo, STI, O);
+}
+
+/// Prints operand \p OpNo via printU32ImmOperand(), i.e. as a 32-bit
+/// hexadecimal value.
+void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI,
+                                               unsigned OpNo,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  // No offset-specific formatting; delegate to the generic 32-bit printer.
+  printU32ImmOperand(MI, OpNo, STI, O);
+}
+
+/// Prints " gds" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI,
+                                 raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"gds");
+}
+
+/// Prints " glc" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI,
+                                 raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"glc");
+}
+
+/// Prints " slc" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI,
+                                 raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"slc");
+}
+
+/// Prints " tfe" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI,
+                                 raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"tfe");
+}
+
+/// Prints " dmask:" followed by operand \p OpNo (formatted by
+/// printU16ImmOperand()) when the immediate is non-zero.
+void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  const bool HasMask = MI->getOperand(OpNo).getImm() != 0;
+  if (!HasMask)
+    return;
+
+  O << " dmask:";
+  printU16ImmOperand(MI, OpNo, STI, O);
+}
+
+/// Prints " unorm" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"unorm");
+}
+
+/// Prints " da" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo,
+                                const MCSubtargetInfo &STI,
+                                raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"da");
+}
+
+/// Prints " r128" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"r128");
+}
+
+/// Prints " lwe" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI,
+                                 raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, /*BitName=*/"lwe");
+}
+
+/// Prints " compr" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const bool IsSet = MI->getOperand(OpNo).getImm() != 0;
+  if (!IsSet)
+    return;
+
+  O << " compr";
+}
+
+/// Prints " vm" when immediate operand \p OpNo is non-zero.
+void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  const bool IsSet = MI->getOperand(OpNo).getImm() != 0;
+  if (!IsSet)
+    return;
+
+  O << " vm";
+}
+
+/// Prints the assembly name of register \p RegNo.
+///
+/// A fixed set of special registers is printed by name. Registers belonging
+/// to one of the listed VGPR/SGPR/TTMP classes are printed as a prefix plus
+/// index ("v3") or index range ("s[4:7]"). Anything else is printed with
+/// getRegisterName().
+void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
+                                        const MCRegisterInfo &MRI) {
+  // Special registers with fixed spellings.
+  switch (RegNo) {
+  case AMDGPU::VCC:         O << "vcc";             return;
+  case AMDGPU::SCC:         O << "scc";             return;
+  case AMDGPU::EXEC:        O << "exec";            return;
+  case AMDGPU::M0:          O << "m0";              return;
+  case AMDGPU::FLAT_SCR:    O << "flat_scratch";    return;
+  case AMDGPU::VCC_LO:      O << "vcc_lo";          return;
+  case AMDGPU::VCC_HI:      O << "vcc_hi";          return;
+  case AMDGPU::TBA_LO:      O << "tba_lo";          return;
+  case AMDGPU::TBA_HI:      O << "tba_hi";          return;
+  case AMDGPU::TMA_LO:      O << "tma_lo";          return;
+  case AMDGPU::TMA_HI:      O << "tma_hi";          return;
+  case AMDGPU::EXEC_LO:     O << "exec_lo";         return;
+  case AMDGPU::EXEC_HI:     O << "exec_hi";         return;
+  case AMDGPU::FLAT_SCR_LO: O << "flat_scratch_lo"; return;
+  case AMDGPU::FLAT_SCR_HI: O << "flat_scratch_hi"; return;
+  default:
+    break;
+  }
+
+  // The low 8 bits of the encoding value is the register index, for both VGPRs
+  // and SGPRs.
+  const unsigned EncodedIdx = MRI.getEncodingValue(RegNo) & ((1 << 8) - 1);
+
+  // How registers of each class are spelled. Entries are tested in order.
+  struct RegClassSyntax {
+    unsigned ClassID;   // Register class whose membership is tested.
+    const char *Prefix; // Text printed before the index.
+    unsigned NumRegs;   // Number of 32-bit registers covered.
+    unsigned IndexBase; // Subtracted from the encoded index before printing.
+  };
+  // Trap temps start at offset 112. TODO: Get this from tablegen.
+  static const RegClassSyntax Syntax[] = {
+    {AMDGPU::VGPR_32RegClassID,  "v",    1,  0},
+    {AMDGPU::SGPR_32RegClassID,  "s",    1,  0},
+    {AMDGPU::VReg_64RegClassID,  "v",    2,  0},
+    {AMDGPU::SGPR_64RegClassID,  "s",    2,  0},
+    {AMDGPU::VReg_128RegClassID, "v",    4,  0},
+    {AMDGPU::SGPR_128RegClassID, "s",    4,  0},
+    {AMDGPU::VReg_96RegClassID,  "v",    3,  0},
+    {AMDGPU::VReg_256RegClassID, "v",    8,  0},
+    {AMDGPU::SReg_256RegClassID, "s",    8,  0},
+    {AMDGPU::VReg_512RegClassID, "v",    16, 0},
+    {AMDGPU::SReg_512RegClassID, "s",    16, 0},
+    {AMDGPU::TTMP_64RegClassID,  "ttmp", 2,  112},
+    {AMDGPU::TTMP_128RegClassID, "ttmp", 4,  112},
+  };
+
+  const RegClassSyntax *Match = nullptr;
+  for (const RegClassSyntax &Entry : Syntax) {
+    if (MRI.getRegClass(Entry.ClassID).contains(RegNo)) {
+      Match = &Entry;
+      break;
+    }
+  }
+
+  // Not in any of the listed classes: fall back to the generated name.
+  if (!Match) {
+    O << getRegisterName(RegNo);
+    return;
+  }
+
+  O << Match->Prefix;
+  const unsigned FirstIdx = EncodedIdx - Match->IndexBase;
+
+  if (Match->NumRegs == 1) {
+    O << FirstIdx;
+    return;
+  }
+
+  O << '[' << FirstIdx << ':' << (FirstIdx + Match->NumRegs - 1) << ']';
+}
+
+/// Prints the encoding suffix selected by the instruction's TSFlags ("_e64 ",
+/// "_dpp ", "_sdwa " or, when none of those flags is set, "_e32 ") followed by
+/// operand \p OpNo.
+void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  const uint64_t Flags = MII.get(MI->getOpcode()).TSFlags;
+
+  // Flags are checked in priority order: VOP3, then DPP, then SDWA.
+  const char *Suffix = "_e32 ";
+  if (Flags & SIInstrFlags::VOP3)
+    Suffix = "_e64 ";
+  else if (Flags & SIInstrFlags::DPP)
+    Suffix = "_dpp ";
+  else if (Flags & SIInstrFlags::SDWA)
+    Suffix = "_sdwa ";
+
+  O << Suffix;
+  printOperand(MI, OpNo, STI, O);
+}
+
+/// Prints a 16-bit immediate.
+///
+/// Values whose low 16 bits, read as a signed integer, lie in [-16, 64] are
+/// printed as decimal integers. A fixed set of bit patterns is printed as
+/// floating-point literals; everything else is printed in hexadecimal.
+void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  const int16_t Signed = static_cast<int16_t>(Imm);
+  if (Signed >= -16 && Signed <= 64) {
+    O << Signed;
+    return;
+  }
+
+  switch (Imm) {
+  case 0x3C00: O<< "1.0";  return;
+  case 0xBC00: O<< "-1.0"; return;
+  case 0x3800: O<< "0.5";  return;
+  case 0xB800: O<< "-0.5"; return;
+  case 0x4000: O<< "2.0";  return;
+  case 0xC000: O<< "-2.0"; return;
+  case 0x4400: O<< "4.0";  return;
+  case 0xC400: O<< "-4.0"; return;
+  case 0x3118:
+    // This pattern is only expected when the subtarget has the inv2pi
+    // inline immediate feature.
+    assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]);
+    O << "0.15915494";
+    return;
+  default:
+    O << formatHex(static_cast<uint64_t>(Imm));
+    return;
+  }
+}
+
+/// Prints a 32-bit immediate.
+///
+/// Values in [-16, 64] (as a signed 32-bit integer) are printed as decimal
+/// integers. Bit patterns equal to one of the listed float constants are
+/// printed as that literal, 0x3e22f983 is printed as "0.15915494" when the
+/// subtarget has FeatureInv2PiInlineImm, and everything else is printed in
+/// hexadecimal.
+void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  const int32_t Signed = static_cast<int32_t>(Imm);
+  if (Signed >= -16 && Signed <= 64) {
+    O << Signed;
+    return;
+  }
+
+  // Float constants that have a literal spelling, compared by bit pattern.
+  static const struct {
+    float Value;
+    const char *Text;
+  } FloatLiterals[] = {
+    {0.0f, "0.0"},  {1.0f, "1.0"},   {-1.0f, "-1.0"},
+    {0.5f, "0.5"},  {-0.5f, "-0.5"}, {2.0f, "2.0"},
+    {-2.0f, "-2.0"}, {4.0f, "4.0"},  {-4.0f, "-4.0"},
+  };
+  for (const auto &Lit : FloatLiterals) {
+    if (Imm == FloatToBits(Lit.Value)) {
+      O << Lit.Text;
+      return;
+    }
+  }
+
+  if (Imm == 0x3e22f983 &&
+      STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) {
+    O << "0.15915494";
+    return;
+  }
+
+  O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+/// Prints a 64-bit immediate.
+///
+/// Values in [-16, 64] (as a signed 64-bit integer) are printed as decimal
+/// integers. Bit patterns equal to one of the listed double constants are
+/// printed as that literal, 0x3fc45f306dc9c882 is printed as "0.15915494"
+/// when the subtarget has FeatureInv2PiInlineImm, and everything else is
+/// printed in hexadecimal.
+void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  const int64_t Signed = static_cast<int64_t>(Imm);
+  if (Signed >= -16 && Signed <= 64) {
+    O << Signed;
+    return;
+  }
+
+  // Double constants that have a literal spelling, compared by bit pattern.
+  static const struct {
+    double Value;
+    const char *Text;
+  } DoubleLiterals[] = {
+    {0.0, "0.0"},  {1.0, "1.0"},   {-1.0, "-1.0"},
+    {0.5, "0.5"},  {-0.5, "-0.5"}, {2.0, "2.0"},
+    {-2.0, "-2.0"}, {4.0, "4.0"},  {-4.0, "-4.0"},
+  };
+  for (const auto &Lit : DoubleLiterals) {
+    if (Imm == DoubleToBits(Lit.Value)) {
+      O << Lit.Text;
+      return;
+    }
+  }
+
+  if (Imm == 0x3fc45f306dc9c882 &&
+      STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) {
+    O << "0.15915494";
+    return;
+  }
+
+  assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882);
+
+  // In rare situations, we will have a 32-bit literal in a 64-bit
+  // operand. This is technically allowed for the encoding of s_mov_b64.
+  O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+/// Prints operand \p OpNo of \p MI.
+///
+/// Registers go through printRegOperand(), immediates are formatted
+/// according to the operand type recorded in the instruction description,
+/// FP immediates according to the operand's register class width, and
+/// expressions via MCExpr::print(). An out-of-range \p OpNo or an operand of
+/// any other kind is printed as a comment marker.
+void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  if (OpNo >= MI->getNumOperands()) {
+    O << "/*Missing OP" << OpNo << "*/";
+    return;
+  }
+
+  const MCOperand &Op = MI->getOperand(OpNo);
+
+  if (Op.isReg()) {
+    // PRED_SEL_OFF is the default predicate state, so we don't need to print
+    // it.
+    if (Op.getReg() != AMDGPU::PRED_SEL_OFF)
+      printRegOperand(Op.getReg(), O, MRI);
+    return;
+  }
+
+  if (Op.isImm()) {
+    const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+    switch (Desc.OpInfo[OpNo].OperandType) {
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    case MCOI::OPERAND_IMMEDIATE:
+      printImmediate32(Op.getImm(), STI, O);
+      return;
+    case AMDGPU::OPERAND_REG_IMM_INT64:
+    case AMDGPU::OPERAND_REG_IMM_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+      printImmediate64(Op.getImm(), STI, O);
+      return;
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    case AMDGPU::OPERAND_REG_IMM_INT16:
+    case AMDGPU::OPERAND_REG_IMM_FP16:
+      printImmediate16(Op.getImm(), STI, O);
+      return;
+    case MCOI::OPERAND_UNKNOWN:
+    case MCOI::OPERAND_PCREL:
+      O << formatDec(Op.getImm());
+      return;
+    case MCOI::OPERAND_REGISTER:
+      // FIXME: This should be removed and handled somewhere else. Seems to come
+      // from a disassembler bug.
+      O << "/*invalid immediate*/";
+      return;
+    default:
+      // We hit this for the immediate instruction bits that don't yet have a
+      // custom printer.
+      llvm_unreachable("unexpected immediate operand type");
+    }
+  }
+
+  if (Op.isFPImm()) {
+    // We special case 0.0 because otherwise it will be printed as an integer.
+    if (Op.getFPImm() == 0.0) {
+      O << "0.0";
+      return;
+    }
+
+    // The width of the operand's register class picks the immediate format.
+    const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+    int RCID = Desc.OpInfo[OpNo].RegClass;
+    unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
+    switch (RCBits) {
+    case 32:
+      printImmediate32(FloatToBits(Op.getFPImm()), STI, O);
+      return;
+    case 64:
+      printImmediate64(DoubleToBits(Op.getFPImm()), STI, O);
+      return;
+    default:
+      llvm_unreachable("Invalid register class size");
+    }
+  }
+
+  if (Op.isExpr()) {
+    Op.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  O << "/*INV_OP*/";
+}
+
+/// Prints operand \p OpNo + 1 decorated by the modifier bits stored in
+/// operand \p OpNo: a leading '-' for SISrcMods::NEG and surrounding '|'
+/// characters for SISrcMods::ABS.
+void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
+                                                   unsigned OpNo,
+                                                   const MCSubtargetInfo &STI,
+                                                   raw_ostream &O) {
+  const unsigned Mods = MI->getOperand(OpNo).getImm();
+  const bool HasNeg = (Mods & SISrcMods::NEG) != 0;
+  const bool HasAbs = (Mods & SISrcMods::ABS) != 0;
+
+  if (HasNeg)
+    O << '-';
+  if (HasAbs)
+    O << '|';
+
+  // The value being modified is the operand right after the modifiers.
+  printOperand(MI, OpNo + 1, STI, O);
+
+  if (HasAbs)
+    O << '|';
+}
+
+/// Prints operand \p OpNo + 1, wrapped in "sext(" ... ")" when the modifier
+/// bits stored in operand \p OpNo contain SISrcMods::SEXT.
+void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
+                                                    unsigned OpNo,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O) {
+  const unsigned Mods = MI->getOperand(OpNo).getImm();
+  const bool HasSext = (Mods & SISrcMods::SEXT) != 0;
+
+  if (HasSext)
+    O << "sext(";
+
+  // The value being modified is the operand right after the modifiers.
+  printOperand(MI, OpNo + 1, STI, O);
+
+  if (HasSext)
+    O << ')';
+}
+
+/// Prints the textual form of the dpp_ctrl immediate in operand \p OpNo.
+///
+/// Values up to 0xff are printed as a four-entry quad_perm list (two bits per
+/// entry). 0x101-0x10f, 0x111-0x11f and 0x121-0x12f are printed as row_shl,
+/// row_shr and row_ror with the low four bits as the amount. A handful of
+/// single values select the wave and row controls. Any other value reaches
+/// llvm_unreachable.
+void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const unsigned Ctrl = MI->getOperand(OpNo).getImm();
+
+  if (Ctrl <= 0x0ff) {
+    // Four 2-bit selectors, lowest bits first.
+    O << " quad_perm:[";
+    for (unsigned Lane = 0; Lane < 4; ++Lane) {
+      O << formatDec((Ctrl >> (2 * Lane)) & 0x3);
+      O << (Lane == 3 ? ']' : ',');
+    }
+    return;
+  }
+
+  // Row shifts/rotates: the amount is the low nibble; an amount of zero is
+  // not part of these ranges.
+  if (Ctrl >= 0x101 && Ctrl <= 0x10f) {
+    O << " row_shl:";
+    printU4ImmDecOperand(MI, OpNo, O);
+    return;
+  }
+  if (Ctrl >= 0x111 && Ctrl <= 0x11f) {
+    O << " row_shr:";
+    printU4ImmDecOperand(MI, OpNo, O);
+    return;
+  }
+  if (Ctrl >= 0x121 && Ctrl <= 0x12f) {
+    O << " row_ror:";
+    printU4ImmDecOperand(MI, OpNo, O);
+    return;
+  }
+
+  switch (Ctrl) {
+  case 0x130: O << " wave_shl:1";      return;
+  case 0x134: O << " wave_rol:1";      return;
+  case 0x138: O << " wave_shr:1";      return;
+  case 0x13c: O << " wave_ror:1";      return;
+  case 0x140: O << " row_mirror";      return;
+  case 0x141: O << " row_half_mirror"; return;
+  case 0x142: O << " row_bcast:15";    return;
+  case 0x143: O << " row_bcast:31";    return;
+  default:
+    llvm_unreachable("Invalid dpp_ctrl value");
+  }
+}
+
+/// Prints " row_mask:" followed by the low 4 bits of operand \p OpNo in
+/// hexadecimal. The label is printed unconditionally.
+void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  static const char Label[] = " row_mask:";
+  O << Label;
+  printU4ImmOperand(MI, OpNo, STI, O);
+}
+
+/// Prints " bank_mask:" followed by the low 4 bits of operand \p OpNo in
+/// hexadecimal. The label is printed unconditionally.
+void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  static const char Label[] = " bank_mask:";
+  O << Label;
+  printU4ImmOperand(MI, OpNo, STI, O);
+}
+
+/// Prints " bound_ctrl:0" when immediate operand \p OpNo (read as unsigned)
+/// is non-zero; prints nothing otherwise.
+void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  const unsigned BoundCtrl = MI->getOperand(OpNo).getImm();
+  if (BoundCtrl == 0)
+    return;
+
+  O << " bound_ctrl:0"; // XXX - this syntax is used in sp3
+}
+
+/// Prints the name of the SDWA data-select value held in operand \p OpNo
+/// (BYTE_0..BYTE_3, WORD_0, WORD_1 or DWORD). Any other value reaches
+/// llvm_unreachable.
+void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  const unsigned Sel = MI->getOperand(OpNo).getImm();
+  const char *Name = nullptr;
+  switch (Sel) {
+  case SdwaSel::BYTE_0: Name = "BYTE_0"; break;
+  case SdwaSel::BYTE_1: Name = "BYTE_1"; break;
+  case SdwaSel::BYTE_2: Name = "BYTE_2"; break;
+  case SdwaSel::BYTE_3: Name = "BYTE_3"; break;
+  case SdwaSel::WORD_0: Name = "WORD_0"; break;
+  case SdwaSel::WORD_1: Name = "WORD_1"; break;
+  case SdwaSel::DWORD:  Name = "DWORD";  break;
+  default: llvm_unreachable("Invalid SDWA data select operand");
+  }
+  O << Name;
+}
+
+/// Prints "dst_sel:" followed by the SDWA select name of operand \p OpNo.
+void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  static const char Label[] = "dst_sel:";
+  O << Label;
+  printSDWASel(MI, OpNo, O);
+}
+
+/// Prints "src0_sel:" followed by the SDWA select name of operand \p OpNo.
+void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  static const char Label[] = "src0_sel:";
+  O << Label;
+  printSDWASel(MI, OpNo, O);
+}
+
+/// Prints "src1_sel:" followed by the SDWA select name of operand \p OpNo.
+void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  static const char Label[] = "src1_sel:";
+  O << Label;
+  printSDWASel(MI, OpNo, O);
+}
+
+/// Print "dst_unused:" followed by the symbolic name of the SDWA dst_unused
+/// immediate (UNUSED_PAD, UNUSED_SEXT or UNUSED_PRESERVE). The prefix is
+/// written before the value is validated; other values are unreachable.
+void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  O << "dst_unused:";
+
+  const char *UnusedName = nullptr;
+  const unsigned Unused = MI->getOperand(OpNo).getImm();
+  switch (Unused) {
+  case DstUnused::UNUSED_PAD:
+    UnusedName = "UNUSED_PAD";
+    break;
+  case DstUnused::UNUSED_SEXT:
+    UnusedName = "UNUSED_SEXT";
+    break;
+  case DstUnused::UNUSED_PRESERVE:
+    UnusedName = "UNUSED_PRESERVE";
+    break;
+  default:
+    llvm_unreachable("Invalid SDWA dest_unused operand");
+  }
+  O << UnusedName;
+}
+
+/// Print export source \p N: the register at \p OpNo when bit \p N of the
+/// instruction's "en" operand is set, otherwise the keyword "off".
+template <unsigned N>
+void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const int EnableIdx =
+      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en);
+  const unsigned EnableMask = MI->getOperand(EnableIdx).getImm();
+
+  // FIXME: What do we do with compr? The meaning of en changes depending on if
+  // compr is set.
+
+  const bool SrcEnabled = (EnableMask & (1 << N)) != 0;
+  if (!SrcEnabled) {
+    O << "off";
+    return;
+  }
+
+  printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
+}
+
+/// Print export source 0; forwards to printExpSrcN<0>.
+void AMDGPUInstPrinter::printExpSrc0(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printExpSrcN<0>(MI, OpNo, STI, O);
+}
+
+/// Print export source 1; forwards to printExpSrcN<1>.
+void AMDGPUInstPrinter::printExpSrc1(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printExpSrcN<1>(MI, OpNo, STI, O);
+}
+
+/// Print export source 2; forwards to printExpSrcN<2>.
+void AMDGPUInstPrinter::printExpSrc2(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printExpSrcN<2>(MI, OpNo, STI, O);
+}
+
+/// Print export source 3; forwards to printExpSrcN<3>.
+void AMDGPUInstPrinter::printExpSrc3(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printExpSrcN<3>(MI, OpNo, STI, O);
+}
+
+/// Print the export target encoded in the low 6 bits of operand \p OpNo.
+///
+///   0..7   -> " mrt<N>"
+///   8      -> " mrtz"
+///   9      -> " null"
+///   12..15 -> " pos<N-12>"
+///   32..63 -> " param<N-32>"
+///   other  -> " invalid_target_<N>"
+void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  // This is really a 6 bit field.
+  const uint32_t Tgt = MI->getOperand(OpNo).getImm() & ((1 << 6) - 1);
+
+  if (Tgt <= 7) {
+    O << " mrt" << Tgt;
+    return;
+  }
+  if (Tgt == 8) {
+    O << " mrtz";
+    return;
+  }
+  if (Tgt == 9) {
+    O << " null";
+    return;
+  }
+  if (Tgt >= 12 && Tgt <= 15) {
+    O << " pos" << Tgt - 12;
+    return;
+  }
+  if (Tgt >= 32 && Tgt <= 63) {
+    O << " param" << Tgt - 32;
+    return;
+  }
+
+  // Reserved values 10, 11
+  O << " invalid_target_" << Tgt;
+}
+
+/// Print the interpolation slot name for the immediate at \p OpNum:
+/// 0 -> "p10", 1 -> "p20", 2 -> "p0"; anything else is printed as
+/// "invalid_param_<value>".
+void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  static const char *const SlotNames[] = {"p10", "p20", "p0"};
+
+  const unsigned Slot = MI->getOperand(OpNum).getImm();
+  if (Slot < 3)
+    O << SlotNames[Slot];
+  else
+    O << "invalid_param_" << Slot;
+}
+
+/// Print "attr" immediately followed by the immediate at \p OpNum, taken as
+/// an unsigned value and written in decimal.
+void AMDGPUInstPrinter::printInterpAttr(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Attr = MI->getOperand(OpNum).getImm();
+ O << "attr" << Attr;
+}
+
+/// Print the attribute channel suffix: a '.' followed by x, y, z or w,
+/// selected by the low two bits of the immediate at \p OpNum.
+void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  static const char ChanNames[4] = {'x', 'y', 'z', 'w'};
+
+  const unsigned Chan = MI->getOperand(OpNum).getImm();
+  O << '.' << ChanNames[Chan % 4];
+}
+
+/// Print the VGPR index mode bitmask at \p OpNo.
+///
+/// A zero mask prints " 0". Otherwise each set enable bit contributes one
+/// space-prefixed keyword, always in the order dst, src0, src1, src2.
+void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  static const struct {
+    unsigned Bit;
+    const char *Keyword;
+  } Modes[] = {
+      {VGPRIndexMode::DST_ENABLE, " dst"},
+      {VGPRIndexMode::SRC0_ENABLE, " src0"},
+      {VGPRIndexMode::SRC1_ENABLE, " src1"},
+      {VGPRIndexMode::SRC2_ENABLE, " src2"},
+  };
+
+  const unsigned Mask = MI->getOperand(OpNo).getImm();
+  if (Mask == 0) {
+    O << " 0";
+    return;
+  }
+
+  for (const auto &Mode : Modes) {
+    if (Mask & Mode.Bit)
+      O << Mode.Keyword;
+  }
+}
+
+/// Print a two-part memory operand: operand \p OpNo and operand \p OpNo + 1,
+/// each rendered by printOperand and separated by ", ".
+void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printOperand(MI, OpNo, STI, O);
+ O << ", ";
+ printOperand(MI, OpNo + 1, STI, O);
+}
+
+/// Print \p Asm when the immediate operand at \p OpNo equals exactly 1, and
+/// \p Default for every other value. The operand must be an immediate.
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O, StringRef Asm,
+                                   StringRef Default) {
+  const MCOperand &Flag = MI->getOperand(OpNo);
+  assert(Flag.isImm());
+
+  const bool IsSet = Flag.getImm() == 1;
+  O << (IsSet ? Asm : Default);
+}
+
+/// Single-character variant: print \p Asm when the immediate operand at
+/// \p OpNo equals exactly 1, and nothing otherwise. The operand must be an
+/// immediate.
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O, char Asm) {
+  const MCOperand &Flag = MI->getOperand(OpNo);
+  assert(Flag.isImm());
+
+  if (Flag.getImm() != 1)
+    return;
+  O << Asm;
+}
+
+/// Print '|' when the immediate at \p OpNo is 1 (via printIfSet); prints
+/// nothing otherwise.
+void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printIfSet(MI, OpNo, O, '|');
+}
+
+/// Print "_SAT" when the immediate at \p OpNo is 1 (via the StringRef
+/// overload of printIfSet, whose Default argument is left at its default).
+void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printIfSet(MI, OpNo, O, "_SAT");
+}
+
+/// Print " clamp" when the immediate at \p OpNo is non-zero; a zero
+/// immediate prints nothing.
+void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const bool ClampEnabled = MI->getOperand(OpNo).getImm() != 0;
+  if (!ClampEnabled)
+    return;
+  O << " clamp";
+}
+
+/// Print the output modifier selected by the immediate at \p OpNo:
+/// MUL2 -> " mul:2", MUL4 -> " mul:4", DIV2 -> " div:2". Any other value
+/// prints nothing.
+void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  const int OMod = MI->getOperand(OpNo).getImm();
+  switch (OMod) {
+  case SIOutMods::MUL2:
+    O << " mul:2";
+    break;
+  case SIOutMods::MUL4:
+    O << " mul:4";
+    break;
+  case SIOutMods::DIV2:
+    O << " div:2";
+    break;
+  default:
+    break;
+  }
+}
+
+/// Print a literal operand, which must be an immediate or an expression.
+///
+/// An immediate is written as its integer value followed, in parentheses,
+/// by the result of BitsToFloat on that value. An expression is written as
+/// '@' followed by the expression itself.
+void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const MCOperand &Lit = MI->getOperand(OpNo);
+  assert(Lit.isImm() || Lit.isExpr());
+
+  if (Lit.isImm()) {
+    const int64_t Bits = Lit.getImm();
+    O << Bits << '(' << BitsToFloat(Bits) << ')';
+  }
+
+  if (Lit.isExpr()) {
+    O << '@';
+    Lit.getExpr()->print(O, &MAI);
+  }
+}
+
+/// Print "*" when the immediate at \p OpNo is 1, and a single space
+/// otherwise (via printIfSet).
+void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printIfSet(MI, OpNo, O, "*", " ");
+}
+
+/// Print '-' when the immediate at \p OpNo is 1 (via printIfSet); prints
+/// nothing otherwise.
+void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printIfSet(MI, OpNo, O, '-');
+}
+
+/// Print the output-modifier suffix for the immediate at \p OpNo:
+/// 1 -> " * 2.0", 2 -> " * 4.0", 3 -> " / 2.0". Other values print nothing.
+void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  // Index 0 is a placeholder so the immediate can index the table directly.
+  static const char *const Suffixes[] = {"", " * 2.0", " * 4.0", " / 2.0"};
+
+  const int64_t OMod = MI->getOperand(OpNo).getImm();
+  if (OMod >= 1 && OMod <= 3)
+    O << Suffixes[OMod];
+}
+
+/// Print '+' when the immediate at \p OpNo is 1 (via printIfSet); prints
+/// nothing otherwise.
+void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printIfSet(MI, OpNo, O, '+');
+}
+
+/// Print "ExecMask," when the immediate at \p OpNo is 1 (via printIfSet);
+/// prints nothing otherwise.
+void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printIfSet(MI, OpNo, O, "ExecMask,");
+}
+
+/// Print "Pred," when the immediate at \p OpNo is 1 (via printIfSet);
+/// prints nothing otherwise.
+void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printIfSet(MI, OpNo, O, "Pred,");
+}
+
+/// Print " (MASKED)" when the write immediate at \p OpNo is zero; a
+/// non-zero immediate prints nothing.
+void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &WriteFlag = MI->getOperand(OpNo);
+  if (WriteFlag.getImm() != 0)
+    return;
+  O << " (MASKED)";
+}
+
+/// Print a packed select operand.
+///
+/// The low two bits of the immediate pick a channel (X, Y, Z, W); the
+/// remaining bits, shifted down by two, form the selector:
+///   - selector >= 512: printed as "<cb>[<index>]", where after subtracting
+///     512 the bits above bit 11 are <cb> and the low 12 bits are <index>;
+///   - selector in [448, 512): printed as selector - 448;
+///   - selector in [0, 448): printed as-is.
+/// A '.' and the channel letter follow. A negative selector prints nothing.
+void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  static const char ChanNames[] = "XYZW";
+
+  const int Packed = MI->getOperand(OpNo).getImm();
+  const int Chan = Packed & 3;
+  int Sel = Packed >> 2;
+
+  // Neither the selector nor the channel is printed for negative selectors.
+  if (Sel < 0)
+    return;
+
+  if (Sel >= 512) {
+    Sel -= 512;
+    const int ConstBuf = Sel >> 12;
+    const int Index = Sel & 4095;
+    O << ConstBuf << '[' << Index << ']';
+  } else if (Sel >= 448) {
+    O << Sel - 448;
+  } else {
+    O << Sel;
+  }
+
+  O << '.' << ChanNames[Chan];
+}
+
+/// Print the bank swizzle mnemonic for immediates 1..5; any other value
+/// (including 0) prints nothing.
+void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  // Entry I holds the text for swizzle value I + 1.
+  static const char *const SwizzleNames[] = {
+      "BS:VEC_021/SCL_122",
+      "BS:VEC_120/SCL_212",
+      "BS:VEC_102/SCL_221",
+      "BS:VEC_201",
+      "BS:VEC_210",
+  };
+
+  const int BankSwizzle = MI->getOperand(OpNo).getImm();
+  if (BankSwizzle >= 1 && BankSwizzle <= 5)
+    O << SwizzleNames[BankSwizzle - 1];
+}
+
+/// Print the single-character form of a result select immediate:
+/// 0..3 -> X, Y, Z, W; 4 -> '0'; 5 -> '1'; 7 -> '_'. Any other value
+/// (including 6) prints nothing.
+void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  static const char SelChars[6] = {'X', 'Y', 'Z', 'W', '0', '1'};
+
+  const unsigned Sel = MI->getOperand(OpNo).getImm();
+  if (Sel <= 5)
+    O << SelChars[Sel];
+  else if (Sel == 7)
+    O << '_';
+}
+
+/// Print 'U' for an immediate of 0 and 'N' for an immediate of 1; every
+/// other value prints nothing.
+void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
+                                const MCSubtargetInfo &STI, raw_ostream &O) {
+  const unsigned CT = MI->getOperand(OpNo).getImm();
+  if (CT == 0)
+    O << 'U';
+  else if (CT == 1)
+    O << 'N';
+}
+
+/// Print a kcache range as "CB<bank>:<first>-<last>".
+///
+/// The mode is read from \p OpNo, the bank from \p OpNo - 2 and the address
+/// from \p OpNo + 2. <first> is address * 16 and <last> is <first> plus 16
+/// when the mode is 1, or plus 32 for any other positive mode. A mode that
+/// is zero or negative prints nothing.
+void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI, raw_ostream &O) {
+  const int Mode = MI->getOperand(OpNo).getImm();
+  if (Mode <= 0)
+    return;
+
+  const int Bank = MI->getOperand(OpNo - 2).getImm();
+  O << "CB" << Bank << ':';
+
+  const int Addr = MI->getOperand(OpNo + 2).getImm();
+  int LineSize = 32;
+  if (Mode == 1)
+    LineSize = 16;
+
+  O << Addr * 16 << '-' << Addr * 16 + LineSize;
+}
+
+/// Print the simm16 operand of a sendmsg instruction.
+///
+/// When the encoding is recognised and well formed it is printed
+/// symbolically as one of
+///   sendmsg(<id>)                      for ID_INTERRUPT,
+///   sendmsg(<id>, <gs-op>[, <stream>]) for ID_GS / ID_GS_DONE,
+///   sendmsg(<id>, <sys-op>)            for ID_SYSMSG.
+/// Every validation failure, and every other message id, falls through to
+/// printing the raw immediate as an unsigned decimal number.
+void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::SendMsg;
+
+ const unsigned SImm16 = MI->getOperand(OpNo).getImm();
+ const unsigned Id = SImm16 & ID_MASK_;
+ // Single-pass do/while so that 'break' jumps to the raw-number fallback
+ // below, while a successful symbolic print returns directly.
+ do {
+ if (Id == ID_INTERRUPT) {
+ if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0.
+ break;
+ O << "sendmsg(" << IdSymbolic[Id] << ')';
+ return;
+ }
+ if (Id == ID_GS || Id == ID_GS_DONE) {
+ if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0.
+ break;
+ const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_;
+ const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
+ if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only.
+ break;
+ if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits.
+ break;
+ O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs];
+ // The stream id is printed for every GS operation except NOP.
+ if (OpGs != OP_GS_NOP) { O << ", " << StreamId; }
+ O << ')';
+ return;
+ }
+ if (Id == ID_SYSMSG) {
+ if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0.
+ break;
+ const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_;
+ if (! (OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown.
+ break;
+ O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')';
+ return;
+ }
+ } while (false);
+ O << SImm16; // Unknown simm16 code.
+}
+
+/// Print the counters packed into an s_waitcnt-style immediate.
+///
+/// The immediate is split by decodeWaitcnt for the subtarget's ISA version.
+/// Each counter (vmcnt, expcnt, lgkmcnt) is printed as "<name>(<value>)"
+/// only when its value differs from that counter's bit mask; printed
+/// counters are separated by a single space.
+void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  IsaVersion IV = getIsaVersion(STI.getFeatureBits());
+
+  unsigned Encoded = MI->getOperand(OpNo).getImm();
+  unsigned Vmcnt, Expcnt, Lgkmcnt;
+  decodeWaitcnt(IV, Encoded, Vmcnt, Expcnt, Lgkmcnt);
+
+  // Emits the separator before every counter except the first printed one.
+  bool PrintedAny = false;
+  auto BeginCounter = [&O, &PrintedAny]() {
+    if (PrintedAny)
+      O << ' ';
+    PrintedAny = true;
+  };
+
+  if (Vmcnt != getVmcntBitMask(IV)) {
+    BeginCounter();
+    O << "vmcnt(" << Vmcnt << ')';
+  }
+
+  if (Expcnt != getExpcntBitMask(IV)) {
+    BeginCounter();
+    O << "expcnt(" << Expcnt << ')';
+  }
+
+  if (Lgkmcnt != getLgkmcntBitMask(IV)) {
+    BeginCounter();
+    O << "lgkmcnt(" << Lgkmcnt << ')';
+  }
+}
+
+/// Print a hardware register operand as "hwreg(<id>[, <offset>, <width>])".
+///
+/// <id> is the symbolic name when the id lies in the symbolic range and the
+/// number otherwise. Offset and width (the encoded width-minus-one field
+/// plus one) are printed together, and only when at least one of them
+/// differs from its default.
+void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  using namespace llvm::AMDGPU::Hwreg;
+
+  const unsigned Encoded = MI->getOperand(OpNo).getImm();
+  const unsigned Id = (Encoded & ID_MASK_) >> ID_SHIFT_;
+  const unsigned Offset = (Encoded & OFFSET_MASK_) >> OFFSET_SHIFT_;
+  const unsigned Width = ((Encoded & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
+
+  const bool HasSymbolicName =
+      ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_;
+  const bool IsDefaultField =
+      Width == WIDTH_M1_DEFAULT_ + 1 && Offset == OFFSET_DEFAULT_;
+
+  O << "hwreg(";
+  if (HasSymbolicName)
+    O << IdSymbolic[Id];
+  else
+    O << Id;
+
+  if (!IsDefaultField)
+    O << ", " << Offset << ", " << Width;
+
+  O << ')';
+}
+
+#include "AMDGPUGenAsmWriter.inc"
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
new file mode 100644
index 000000000000..a6d348ff0f12
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -0,0 +1,194 @@
+//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+/// MCInst-to-assembly printer for AMDGPU.
+///
+/// The public section exposes the tblgen-generated printInstruction and
+/// getRegisterName, the printInst override and a static register printer.
+/// The private section declares the operand printers; most take the
+/// instruction, an operand index, the subtarget info and the output stream.
+class AMDGPUInstPrinter : public MCInstPrinter {
+public:
+ AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ static void printRegOperand(unsigned RegNo, raw_ostream &O,
+ const MCRegisterInfo &MRI);
+
+private:
+ // Plain immediate printers, named by operand width.
+ void printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ StringRef BitName);
+ void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSMRDOffset8(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printR128(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printLWE(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpCompr(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpVM(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ // Non-static overload of the public printRegOperand; takes no
+ // MCRegisterInfo argument.
+ void printRegOperand(unsigned RegNo, raw_ostream &O);
+ void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printBankMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printBoundCtrl(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ // Shared helper for the three SDWA *Sel printers below; prints only the
+ // symbolic select name.
+ void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWADstSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printInterpSlot(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printInterpAttr(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printInterpAttrChan(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMemOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+
+ // printExpSrc0..3 are the per-source entry points; each is implemented in
+ // terms of the printExpSrcN template.
+ template <unsigned N>
+ void printExpSrcN(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpSrc0(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpSrc1(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpSrc2(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpSrc3(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpTgt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ // Print Asm when the immediate operand equals 1. The StringRef overload
+ // prints Default (empty unless given) otherwise; the char overload prints
+ // nothing otherwise.
+ static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ StringRef Asm, StringRef Default = "");
+ static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ char Asm);
+ void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOModSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printLiteral(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printLast(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printNeg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOMOD(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printRel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printUpdatePred(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printWrite(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBankSwizzle(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printRSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printCT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printKCache(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printWaitFlag(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
new file mode 100644
index 000000000000..ffb92aae599e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -0,0 +1,200 @@
+//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Assembler backend shared by the AMDGPU targets.
+///
+/// It resolves and applies fixups and writes nop padding. Instruction
+/// relaxation is not supported: mayNeedRelaxation and fixupNeedsRelaxation
+/// always answer false and relaxInstruction is unreachable.
+class AMDGPUAsmBackend : public MCAsmBackend {
+public:
+  /// \p T is accepted for interface symmetry and is not used.
+  explicit AMDGPUAsmBackend(const Target &T)
+    : MCAsmBackend() {}
+
+  /// Number of target-specific fixup kinds (see AMDGPU::Fixups).
+  unsigned getNumFixupKinds() const override {
+    return AMDGPU::NumTargetFixupKinds;
+  }
+
+  void processFixupValue(const MCAssembler &Asm,
+                         const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  /// No fixup ever requires relaxation on this target.
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+
+  /// Never expected to be called, since mayNeedRelaxation returns false.
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {
+    llvm_unreachable("Not implemented");
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+};
+
+} //End anonymous namespace
+
+/// Return how many bytes of the fragment a fixup of kind \p Kind patches.
+/// Kinds not listed here are unreachable.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  unsigned NumBytes = 0;
+
+  switch (Kind) {
+  case FK_SecRel_1:
+  case FK_Data_1:
+    NumBytes = 1;
+    break;
+  case AMDGPU::fixup_si_sopp_br:
+  case FK_SecRel_2:
+  case FK_Data_2:
+    NumBytes = 2;
+    break;
+  case FK_SecRel_4:
+  case FK_Data_4:
+  case FK_PCRel_4:
+    NumBytes = 4;
+    break;
+  case FK_SecRel_8:
+  case FK_Data_8:
+    NumBytes = 8;
+    break;
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  }
+
+  return NumBytes;
+}
+
+/// Convert a resolved fixup value into the bits to be encoded.
+///
+/// Generic data, PC-relative and section-relative kinds are passed through
+/// unchanged. For fixup_si_sopp_br the value is treated as signed and turned
+/// into (Value - 4) / 4; when \p Ctx is non-null an error is reported if
+/// that result does not fit in a signed 16-bit field (the result is still
+/// returned). Other kinds are unreachable.
+static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                                 MCContext *Ctx) {
+  switch (Fixup.getKind()) {
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+  case FK_PCRel_4:
+  case FK_SecRel_4:
+    return Value;
+  case AMDGPU::fixup_si_sopp_br:
+    break;
+  default:
+    llvm_unreachable("unhandled fixup kind");
+  }
+
+  // Only the SOPP branch fixup reaches this point.
+  const int64_t BrImm = (static_cast<int64_t>(Value) - 4) / 4;
+
+  if (Ctx && !isInt<16>(BrImm))
+    Ctx->reportError(Fixup.getLoc(), "branch size exceeds simm16");
+
+  return BrImm;
+}
+
+/// Resolve, where possible, and then adjust the value of \p Fixup.
+///
+/// If the fixup arrives unresolved but its expression evaluates to an
+/// absolute constant, \p Value is replaced by that constant and
+/// \p IsResolved is set. Resolved values are then passed through
+/// adjustFixupValue; unresolved ones are left untouched.
+void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                         const MCAsmLayout &Layout,
+                                         const MCFixup &Fixup, const MCFragment *DF,
+                                         const MCValue &Target, uint64_t &Value,
+                                         bool &IsResolved) {
+  // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are
+  // used for long branches, this function will be called with
+  // IsResolved = false and Value set to some pre-computed value.  In
+  // the example above, the value would be:
+  // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction.
+  // This is not what we want.  We just want the expression computation
+  // only.  The reason the MC layer subtracts the current offset from the
+  // expression is because the fixup is of kind FK_PCRel_4.
+  // For these scenarios, evaluateAsValue gives us the computation that we
+  // want.
+  if (!IsResolved) {
+    MCValue Evaluated;
+    const bool IsConstant =
+        Fixup.getValue()->evaluateAsValue(Evaluated, Layout) &&
+        Evaluated.isAbsolute();
+    if (IsConstant) {
+      Value = Evaluated.getConstant();
+      IsResolved = true;
+    }
+  }
+
+  if (!IsResolved)
+    return;
+
+  Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
+}
+
+/// Patch a fixup value into the fragment bytes at the fixup's offset.
+///
+/// The value is shifted left by the kind's TargetOffset and OR-ed into
+/// \p Data one byte at a time, least significant byte first, for as many
+/// bytes as getFixupKindNumBytes reports. A zero value leaves the data
+/// untouched.
+void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                  unsigned DataSize, uint64_t Value,
+                                  bool IsPCRel) const {
+  // Doesn't change encoding.
+  if (Value == 0)
+    return;
+
+  const MCFixupKindInfo &KindInfo = getFixupKindInfo(Fixup.getKind());
+
+  // Shift the value into position.
+  Value <<= KindInfo.TargetOffset;
+
+  const unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  const uint32_t Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the bits
+  // from the fixup value, consuming the value eight bits at a time.
+  for (unsigned Byte = 0; Byte < NumBytes; ++Byte) {
+    Data[Offset + Byte] |= static_cast<uint8_t>(Value & 0xff);
+    Value >>= 8;
+  }
+}
+
+/// Return the descriptor (name, bit offset, bit size, flags) for \p Kind.
+///
+/// Generic kinds are delegated to MCAsmBackend; target kinds are looked up
+/// in a local table indexed from FirstTargetFixupKind. A target kind beyond
+/// the table is a programming error and is caught by an assertion instead
+/// of reading past the end of the table.
+const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
+                                                       MCFixupKind Kind) const {
+  const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
+    // name                   offset bits  flags
+    { "fixup_si_sopp_br",     0,     16,   MCFixupKindInfo::FKF_IsPCRel },
+  };
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+         "Invalid kind!");
+  return Infos[Kind - FirstTargetFixupKind];
+}
+
+/// Write \p Count bytes of padding to \p OW as zeros and report success.
+bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ OW->WriteZeros(Count);
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// ELFAMDGPUAsmBackend class
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// AMDGPU assembler backend that emits ELF objects.
+///
+/// Both settings are derived from the target triple at construction and
+/// forwarded to createAMDGPUELFObjectWriter.
+class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
+ // True when the triple's architecture is amdgcn.
+ bool Is64Bit;
+ // True when the triple's OS is AMDHSA.
+ bool HasRelocationAddend;
+
+public:
+ ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) :
+ AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
+ HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { }
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createAMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend, OS);
+ }
+};
+
+} // end anonymous namespace
+
+/// Create the AMDGPU assembler backend for triple \p TT.
+///
+/// Returns a newly allocated ELFAMDGPUAsmBackend. \p MRI, \p CPU and
+/// \p Options are not used.
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options) {
+ // The ELF backend is always used; it selects 64-bit ELF itself when the
+ // triple's architecture is amdgcn.
+ return new ELFAMDGPUAsmBackend(T, TT);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
new file mode 100644
index 000000000000..1847d7a67328
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -0,0 +1,87 @@
+//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+
+using namespace llvm;
+
+namespace {
+
+/// ELF target writer for AMDGPU; its job here is to map fixups to ELF
+/// relocation types via getRelocType.
+class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend);
+protected:
+ // Chooses the R_AMDGPU_* relocation type for a fixup.
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+
+
+} // End anonymous namespace
+
+/// Configure the base target writer with the given bitness and addend
+/// setting. The OS ABI is always ELFOSABI_AMDGPU_HSA and the machine is
+/// always EM_AMDGPU, independent of the arguments.
+AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
+ bool HasRelocationAddend)
+ : MCELFObjectTargetWriter(Is64Bit,
+ ELF::ELFOSABI_AMDGPU_HSA,
+ ELF::EM_AMDGPU,
+ HasRelocationAddend) { }
+
+/// Choose the ELF relocation type for \p Fixup against \p Target.
+///
+/// The decision is made in three steps, the first match winning:
+///   1. the scratch resource symbols SCRATCH_RSRC_DWORD0 / DWORD1;
+///   2. the access variant of the target expression;
+///   3. the generic fixup kind.
+/// Reaching the end without a match is unreachable.
+unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
+                                             const MCValue &Target,
+                                             const MCFixup &Fixup,
+                                             bool IsPCRel) const {
+  if (const auto *SymA = Target.getSymA()) {
+    // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
+    // the scratch buffer.
+    const auto SymName = SymA->getSymbol().getName();
+    if (SymName == "SCRATCH_RSRC_DWORD0")
+      return ELF::R_AMDGPU_ABS32_LO;
+    if (SymName == "SCRATCH_RSRC_DWORD1")
+      return ELF::R_AMDGPU_ABS32_HI;
+  }
+
+  // Relocations selected by the symbol reference's variant kind.
+  switch (Target.getAccessVariant()) {
+  case MCSymbolRefExpr::VK_GOTPCREL:
+    return ELF::R_AMDGPU_GOTPCREL;
+  case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO:
+    return ELF::R_AMDGPU_GOTPCREL32_LO;
+  case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI:
+    return ELF::R_AMDGPU_GOTPCREL32_HI;
+  case MCSymbolRefExpr::VK_AMDGPU_REL32_LO:
+    return ELF::R_AMDGPU_REL32_LO;
+  case MCSymbolRefExpr::VK_AMDGPU_REL32_HI:
+    return ELF::R_AMDGPU_REL32_HI;
+  default:
+    break;
+  }
+
+  // Otherwise fall back to the generic fixup kind.
+  switch (Fixup.getKind()) {
+  case FK_PCRel_4:
+    return ELF::R_AMDGPU_REL32;
+  case FK_Data_4:
+  case FK_SecRel_4:
+    return ELF::R_AMDGPU_ABS32;
+  case FK_Data_8:
+    return ELF::R_AMDGPU_ABS64;
+  default:
+    break;
+  }
+
+  llvm_unreachable("unhandled relocation type");
+}
+
+
+/// Build an ELF object writer for AMDGPU that writes to \p OS.
+///
+/// A new AMDGPUELFObjectWriter is allocated with the given bitness and
+/// addend setting and handed to createELFObjectWriter.
+MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit,
+                                                  bool HasRelocationAddend,
+                                                  raw_pwrite_stream &OS) {
+  MCELFObjectTargetWriter *TargetWriter =
+      new AMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend);
+
+  // NOTE(review): the trailing 'true' is presumably the little-endian flag
+  // of createELFObjectWriter - confirm against its declaration.
+  return createELFObjectWriter(TargetWriter, OS, true);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
new file mode 100644
index 000000000000..43338a5bebd2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -0,0 +1,21 @@
+//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUELFStreamer.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+/// Create a newly allocated AMDGPUELFStreamer over the given context,
+/// backend, output stream and code emitter.
+///
+/// \p RelaxAll is accepted but not used by this function.
+MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context,
+ MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll) {
+ return new AMDGPUELFStreamer(Context, MAB, OS, Emitter);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
new file mode 100644
index 000000000000..5319b65d65f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -0,0 +1,39 @@
+//===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+/// MCELFStreamer subclass for AMDGPU. It currently adds no behaviour of its
+/// own: the constructor only forwards its arguments to MCELFStreamer.
+class AMDGPUELFStreamer : public MCELFStreamer {
+public:
+ AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter)
+ : MCELFStreamer(Context, MAB, OS, Emitter) { }
+
+};
+
+MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll);
+} // namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
new file mode 100644
index 000000000000..20c1adfbc6b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
@@ -0,0 +1,28 @@
+//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace AMDGPU {
+enum Fixups {
+ /// 16-bit PC relative fixup for SOPP branch instructions.
+ fixup_si_sopp_br = FirstTargetFixupKind,
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
new file mode 100644
index 000000000000..1655591abf39
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -0,0 +1,45 @@
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
+ HasSingleParameterDotFile = false;
+ //===------------------------------------------------------------------===//
+ MinInstAlignment = 4;
+ MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16;
+ SeparatorString = "\n";
+ CommentString = ";";
+ PrivateLabelPrefix = "";
+ InlineAsmStart = ";#ASMSTART";
+ InlineAsmEnd = ";#ASMEND";
+
+ //===--- Data Emission Directives -------------------------------------===//
+ SunStyleELFSectionSwitchSyntax = true;
+ UsesELFSectionDirectiveForBSS = true;
+
+ //===--- Global Variable Emission Directives --------------------------===//
+ HasAggressiveSymbolFolding = true;
+ COMMDirectiveAlignmentIsInBytes = false;
+ HasNoDeadStrip = true;
+ WeakRefDirective = ".weakref\t";
+ //===--- Dwarf Emission Directives -----------------------------------===//
+ SupportsDebugInformation = true;
+}
+
+bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
+ return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" ||
+ SectionName == ".hsadata_global_program" ||
+ SectionName == ".hsarodata_readonly_agent" ||
+ MCAsmInfo::shouldOmitSectionDirective(SectionName);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
new file mode 100644
index 000000000000..8cb33a3179cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -0,0 +1,33 @@
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+namespace llvm {
+
+class Triple;
+
+// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo,
+// you will need to make sure your new class sets PrivateGlobalPrefix to
+// a prefix that won't appear in a function name. The default value
+// for PrivateGlobalPrefix is 'L', so it will consider any function starting
+// with 'L' as a local symbol.
+class AMDGPUMCAsmInfo : public MCAsmInfoELF {
+public:
+ explicit AMDGPUMCAsmInfo(const Triple &TT);
+ bool shouldOmitSectionDirective(StringRef SectionName) const override;
+};
+} // namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
new file mode 100644
index 000000000000..521b3b39bba2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -0,0 +1,21 @@
+//===-- AMDGPUMCCodeEmitter.cpp - AMDGPU Code Emitter interface -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCCodeEmitter.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+void AMDGPUMCCodeEmitter::anchor() {}
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
new file mode 100644
index 000000000000..3d3858ab47ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -0,0 +1,63 @@
+//===-- AMDGPUMCCodeEmitter.h - AMDGPU Code Emitter interface ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCInst;
+class MCInstrInfo;
+class MCOperand;
+class MCSubtargetInfo;
+class FeatureBitset;
+
+class AMDGPUMCCodeEmitter : public MCCodeEmitter {
+ virtual void anchor();
+
+protected:
+ const MCInstrInfo &MCII;
+
+ AMDGPUMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+
+public:
+
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
+ virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
+protected:
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
new file mode 100644
index 000000000000..136e6ec4ceb5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -0,0 +1,112 @@
+//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This file provides AMDGPU specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "AMDGPUELFStreamer.h"
+#include "AMDGPUMCAsmInfo.h"
+#include "AMDGPUTargetStreamer.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "SIDefines.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "AMDGPUGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AMDGPUGenRegisterInfo.inc"
+
+static MCInstrInfo *createAMDGPUMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitAMDGPUMCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitAMDGPUMCRegisterInfo(X, 0);
+ return X;
+}
+
+static MCSubtargetInfo *
+createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+ return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ return new AMDGPUInstPrinter(MAI, MII, MRI);
+}
+
+static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new AMDGPUTargetAsmStreamer(S, OS);
+}
+
+static MCTargetStreamer * createAMDGPUObjectTargetStreamer(
+ MCStreamer &S,
+ const MCSubtargetInfo &STI) {
+ return new AMDGPUTargetELFStreamer(S);
+}
+
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+ MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll) {
+ if (T.getOS() == Triple::AMDHSA)
+ return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+
+ return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+}
+
+extern "C" void LLVMInitializeAMDGPUTargetMC() {
+ for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) {
+ RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
+
+ TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo);
+ TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
+ TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter);
+ TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend);
+ TargetRegistry::RegisterELFStreamer(*T, createMCStreamer);
+ }
+
+ // R600 specific registration
+ TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(),
+ createR600MCCodeEmitter);
+
+ // GCN specific registration
+ TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(),
+ createSIMCCodeEmitter);
+
+ TargetRegistry::RegisterAsmTargetStreamer(getTheGCNTarget(),
+ createAMDGPUAsmTargetStreamer);
+ TargetRegistry::RegisterObjectTargetStreamer(
+ getTheGCNTarget(), createAMDGPUObjectTargetStreamer);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
new file mode 100644
index 000000000000..548bad56e174
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -0,0 +1,64 @@
+//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Provides AMDGPU specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+
+Target &getTheAMDGPUTarget();
+Target &getTheGCNTarget();
+
+MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
+ bool HasRelocationAddend,
+ raw_pwrite_stream &OS);
+} // End llvm namespace
+
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
new file mode 100644
index 000000000000..95387ad1627c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
@@ -0,0 +1,408 @@
+//===-- AMDGPURuntimeMD.cpp - Generates runtime metadata ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Generates AMDGPU runtime metadata for YAML mapping.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPURuntimeMetadata.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <vector>
+#include "AMDGPURuntimeMD.h"
+
+using namespace llvm;
+using namespace ::AMDGPU::RuntimeMD;
+
+static cl::opt<bool>
+DumpRuntimeMD("amdgpu-dump-rtmd",
+ cl::desc("Dump AMDGPU runtime metadata"));
+
+static cl::opt<bool>
+CheckRuntimeMDParser("amdgpu-check-rtmd-parser", cl::Hidden,
+ cl::desc("Check AMDGPU runtime metadata YAML parser"));
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string)
+LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata)
+LLVM_YAML_IS_SEQUENCE_VECTOR(KernelArg::Metadata)
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<KernelArg::Metadata> {
+ static void mapping(IO &YamlIO, KernelArg::Metadata &A) {
+ YamlIO.mapRequired(KeyName::ArgSize, A.Size);
+ YamlIO.mapRequired(KeyName::ArgAlign, A.Align);
+ YamlIO.mapOptional(KeyName::ArgPointeeAlign, A.PointeeAlign, 0U);
+ YamlIO.mapRequired(KeyName::ArgKind, A.Kind);
+ YamlIO.mapRequired(KeyName::ArgValueType, A.ValueType);
+ YamlIO.mapOptional(KeyName::ArgTypeName, A.TypeName, std::string());
+ YamlIO.mapOptional(KeyName::ArgName, A.Name, std::string());
+ YamlIO.mapOptional(KeyName::ArgAddrQual, A.AddrQual, INVALID_ADDR_QUAL);
+ YamlIO.mapOptional(KeyName::ArgAccQual, A.AccQual, INVALID_ACC_QUAL);
+ YamlIO.mapOptional(KeyName::ArgIsVolatile, A.IsVolatile, uint8_t(0));
+ YamlIO.mapOptional(KeyName::ArgIsConst, A.IsConst, uint8_t(0));
+ YamlIO.mapOptional(KeyName::ArgIsRestrict, A.IsRestrict, uint8_t(0));
+ YamlIO.mapOptional(KeyName::ArgIsPipe, A.IsPipe, uint8_t(0));
+ }
+ static const bool flow = true;
+};
+
+template <> struct MappingTraits<Kernel::Metadata> {
+ static void mapping(IO &YamlIO, Kernel::Metadata &K) {
+ YamlIO.mapRequired(KeyName::KernelName, K.Name);
+ YamlIO.mapOptional(KeyName::Language, K.Language, std::string());
+ YamlIO.mapOptional(KeyName::LanguageVersion, K.LanguageVersion);
+ YamlIO.mapOptional(KeyName::ReqdWorkGroupSize, K.ReqdWorkGroupSize);
+ YamlIO.mapOptional(KeyName::WorkGroupSizeHint, K.WorkGroupSizeHint);
+ YamlIO.mapOptional(KeyName::VecTypeHint, K.VecTypeHint, std::string());
+ YamlIO.mapOptional(KeyName::KernelIndex, K.KernelIndex,
+ INVALID_KERNEL_INDEX);
+ YamlIO.mapOptional(KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups,
+ uint8_t(0));
+ YamlIO.mapRequired(KeyName::Args, K.Args);
+ }
+ static const bool flow = true;
+};
+
+template <> struct MappingTraits<Program::Metadata> {
+ static void mapping(IO &YamlIO, Program::Metadata &Prog) {
+ YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq);
+ YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo);
+ YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels);
+ }
+ static const bool flow = true;
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+// Get a vector of three integer values from MDNode \p Node.
+static std::vector<uint32_t> getThreeInt32(MDNode *Node) {
+ assert(Node->getNumOperands() == 3);
+ std::vector<uint32_t> V;
+ for (const MDOperand &Op : Node->operands()) {
+ const ConstantInt *CI = mdconst::extract<ConstantInt>(Op);
+ V.push_back(CI->getZExtValue());
+ }
+ return V;
+}
+
+static std::string getOCLTypeName(Type *Ty, bool Signed) {
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID:
+ return "half";
+ case Type::FloatTyID:
+ return "float";
+ case Type::DoubleTyID:
+ return "double";
+ case Type::IntegerTyID: {
+ if (!Signed)
+ return (Twine('u') + getOCLTypeName(Ty, true)).str();
+ unsigned BW = Ty->getIntegerBitWidth();
+ switch (BW) {
+ case 8:
+ return "char";
+ case 16:
+ return "short";
+ case 32:
+ return "int";
+ case 64:
+ return "long";
+ default:
+ return (Twine('i') + Twine(BW)).str();
+ }
+ }
+ case Type::VectorTyID: {
+ VectorType *VecTy = cast<VectorType>(Ty);
+ Type *EleTy = VecTy->getElementType();
+ unsigned Size = VecTy->getVectorNumElements();
+ return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str();
+ }
+ default:
+ return "unknown";
+ }
+}
+
+static KernelArg::ValueType getRuntimeMDValueType(
+ Type *Ty, StringRef TypeName) {
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID:
+ return KernelArg::F16;
+ case Type::FloatTyID:
+ return KernelArg::F32;
+ case Type::DoubleTyID:
+ return KernelArg::F64;
+ case Type::IntegerTyID: {
+ bool Signed = !TypeName.startswith("u");
+ switch (Ty->getIntegerBitWidth()) {
+ case 8:
+ return Signed ? KernelArg::I8 : KernelArg::U8;
+ case 16:
+ return Signed ? KernelArg::I16 : KernelArg::U16;
+ case 32:
+ return Signed ? KernelArg::I32 : KernelArg::U32;
+ case 64:
+ return Signed ? KernelArg::I64 : KernelArg::U64;
+ default:
+ // Runtime does not recognize other integer types. Report as struct type.
+ return KernelArg::Struct;
+ }
+ }
+ case Type::VectorTyID:
+ return getRuntimeMDValueType(Ty->getVectorElementType(), TypeName);
+ case Type::PointerTyID:
+ return getRuntimeMDValueType(Ty->getPointerElementType(), TypeName);
+ default:
+ return KernelArg::Struct;
+ }
+}
+
+static KernelArg::AddressSpaceQualifer getRuntimeAddrSpace(
+ AMDGPUAS::AddressSpaces A) {
+ switch (A) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ return KernelArg::Global;
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ return KernelArg::Constant;
+ case AMDGPUAS::LOCAL_ADDRESS:
+ return KernelArg::Local;
+ case AMDGPUAS::FLAT_ADDRESS:
+ return KernelArg::Generic;
+ case AMDGPUAS::REGION_ADDRESS:
+ return KernelArg::Region;
+ default:
+ return KernelArg::Private;
+ }
+}
+
+static KernelArg::Metadata getRuntimeMDForKernelArg(const DataLayout &DL,
+ Type *T, KernelArg::Kind Kind, StringRef BaseTypeName = "",
+ StringRef TypeName = "", StringRef ArgName = "", StringRef TypeQual = "",
+ StringRef AccQual = "") {
+
+ KernelArg::Metadata Arg;
+
+ // Set ArgSize and ArgAlign.
+ Arg.Size = DL.getTypeAllocSize(T);
+ Arg.Align = DL.getABITypeAlignment(T);
+ if (auto PT = dyn_cast<PointerType>(T)) {
+ auto ET = PT->getElementType();
+ if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && ET->isSized())
+ Arg.PointeeAlign = DL.getABITypeAlignment(ET);
+ }
+
+ // Set ArgTypeName.
+ Arg.TypeName = TypeName;
+
+ // Set ArgName.
+ Arg.Name = ArgName;
+
+ // Set ArgIsVolatile, ArgIsRestrict, ArgIsConst and ArgIsPipe.
+ SmallVector<StringRef, 1> SplitQ;
+ TypeQual.split(SplitQ, " ", -1, false /* Drop empty entry */);
+
+ for (StringRef KeyName : SplitQ) {
+ auto *P = StringSwitch<uint8_t *>(KeyName)
+ .Case("volatile", &Arg.IsVolatile)
+ .Case("restrict", &Arg.IsRestrict)
+ .Case("const", &Arg.IsConst)
+ .Case("pipe", &Arg.IsPipe)
+ .Default(nullptr);
+ if (P)
+ *P = 1;
+ }
+
+ // Set ArgKind.
+ Arg.Kind = Kind;
+
+ // Set ArgValueType.
+ Arg.ValueType = getRuntimeMDValueType(T, BaseTypeName);
+
+ // Set ArgAccQual.
+ if (!AccQual.empty()) {
+ Arg.AccQual = StringSwitch<KernelArg::AccessQualifer>(AccQual)
+ .Case("read_only", KernelArg::ReadOnly)
+ .Case("write_only", KernelArg::WriteOnly)
+ .Case("read_write", KernelArg::ReadWrite)
+ .Default(KernelArg::AccNone);
+ }
+
+ // Set ArgAddrQual.
+ if (auto *PT = dyn_cast<PointerType>(T)) {
+ Arg.AddrQual = getRuntimeAddrSpace(static_cast<AMDGPUAS::AddressSpaces>(
+ PT->getAddressSpace()));
+ }
+
+ return Arg;
+}
+
+/// Build the runtime metadata record for kernel \p F. Per-argument strings
+/// come from its "kernel_arg_*" metadata (one MDString operand per argument).
+static Kernel::Metadata getRuntimeMDForKernel(const Function &F) {
+  Kernel::Metadata Kernel;
+  Kernel.Name = F.getName();
+  auto &M = *F.getParent();
+
+  // Set Language and LanguageVersion.
+  if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
+    if (MD->getNumOperands() != 0) {
+      auto Node = MD->getOperand(0);
+      if (Node->getNumOperands() > 1) {
+        Kernel.Language = "OpenCL C";
+        uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
+                           ->getZExtValue();
+        uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
+                           ->getZExtValue();
+        Kernel.LanguageVersion.push_back(Major);
+        Kernel.LanguageVersion.push_back(Minor);
+      }
+    }
+  }
+
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  for (auto &Arg : F.args()) {
+    unsigned I = Arg.getArgNo();
+    Type *T = Arg.getType();
+    auto TypeName = cast<MDString>(F.getMetadata(
+        "kernel_arg_type")->getOperand(I))->getString();
+    auto BaseTypeName = cast<MDString>(F.getMetadata(
+        "kernel_arg_base_type")->getOperand(I))->getString();
+    StringRef ArgName;
+    if (auto ArgNameMD = F.getMetadata("kernel_arg_name"))
+      ArgName = cast<MDString>(ArgNameMD->getOperand(I))->getString();
+    auto TypeQual = cast<MDString>(F.getMetadata(
+        "kernel_arg_type_qual")->getOperand(I))->getString();
+    auto AccQual = cast<MDString>(F.getMetadata(
+        "kernel_arg_access_qual")->getOperand(I))->getString();
+    KernelArg::Kind Kind;
+    if (TypeQual.find("pipe") != StringRef::npos)
+      Kind = KernelArg::Pipe;
+    else Kind = StringSwitch<KernelArg::Kind>(BaseTypeName)
+      .Case("sampler_t", KernelArg::Sampler)
+      .Case("queue_t", KernelArg::Queue)
+      .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
+             "image2d_t" , "image2d_array_t", KernelArg::Image)
+      .Cases("image2d_depth_t", "image2d_array_depth_t",
+             "image2d_msaa_t", "image2d_array_msaa_t",
+             "image2d_msaa_depth_t", KernelArg::Image)
+      .Cases("image2d_array_msaa_depth_t", "image3d_t",
+             KernelArg::Image)
+      .Default(isa<PointerType>(T) ?
+                   (T->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ?
+                    KernelArg::DynamicSharedPointer :
+                    KernelArg::GlobalBuffer) :
+                   KernelArg::ByValue);
+    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, T, Kind,
+        BaseTypeName, TypeName, ArgName, TypeQual, AccQual));
+  }
+
+  // Emit hidden kernel arguments for OpenCL kernels.
+  if (F.getParent()->getNamedMetadata("opencl.ocl.version")) {
+    auto Int64T = Type::getInt64Ty(F.getContext());
+    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
+        KernelArg::HiddenGlobalOffsetX));
+    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
+        KernelArg::HiddenGlobalOffsetY));
+    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
+        KernelArg::HiddenGlobalOffsetZ));
+    if (F.getParent()->getNamedMetadata("llvm.printf.fmts")) {
+      auto Int8PtrT = Type::getInt8PtrTy(F.getContext(),
+          AMDGPUAS::GLOBAL_ADDRESS);
+      Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int8PtrT,
+          KernelArg::HiddenPrintfBuffer));
+    }
+  }
+
+  // Set ReqdWorkGroupSize, WorkGroupSizeHint, and VecTypeHint.
+  if (auto RWGS = F.getMetadata("reqd_work_group_size"))
+    Kernel.ReqdWorkGroupSize = getThreeInt32(RWGS);
+  if (auto WGSH = F.getMetadata("work_group_size_hint"))
+    Kernel.WorkGroupSizeHint = getThreeInt32(WGSH);
+  if (auto VTH = F.getMetadata("vec_type_hint"))
+    Kernel.VecTypeHint = getOCLTypeName(cast<ValueAsMetadata>(
+      VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
+      VTH->getOperand(1))->getZExtValue());
+
+  return Kernel;
+}
+
+Program::Metadata::Metadata(const std::string &YAML) {
+ yaml::Input Input(YAML);
+ Input >> *this;
+}
+
+std::string Program::Metadata::toYAML(void) {
+ std::string Text;
+ raw_string_ostream Stream(Text);
+ yaml::Output Output(Stream, nullptr, INT_MAX /* do not wrap line */);
+ Output << *this;
+ return Stream.str();
+}
+
+Program::Metadata Program::Metadata::fromYAML(const std::string &S) {
+ return Program::Metadata(S);
+}
+
+// Check if the YAML string can be parsed.
+static void checkRuntimeMDYAMLString(const std::string &YAML) {
+ auto P = Program::Metadata::fromYAML(YAML);
+ auto S = P.toYAML();
+ llvm::errs() << "AMDGPU runtime metadata parser test "
+ << (YAML == S ? "passes" : "fails") << ".\n";
+ if (YAML != S) {
+ llvm::errs() << "First output: " << YAML << '\n'
+ << "Second output: " << S << '\n';
+ }
+}
+
+std::string llvm::getRuntimeMDYAMLString(Module &M) {
+ Program::Metadata Prog;
+ Prog.MDVersionSeq.push_back(MDVersion);
+ Prog.MDVersionSeq.push_back(MDRevision);
+
+ // Set PrintfInfo.
+ if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) {
+ for (unsigned I = 0; I < MD->getNumOperands(); ++I) {
+ auto Node = MD->getOperand(I);
+ if (Node->getNumOperands() > 0)
+ Prog.PrintfInfo.push_back(cast<MDString>(Node->getOperand(0))
+ ->getString());
+ }
+ }
+
+ // Set Kernels.
+ for (auto &F: M.functions()) {
+ if (!F.getMetadata("kernel_arg_type"))
+ continue;
+ Prog.Kernels.emplace_back(getRuntimeMDForKernel(F));
+ }
+
+ auto YAML = Prog.toYAML();
+
+ if (DumpRuntimeMD)
+ llvm::errs() << "AMDGPU runtime metadata:\n" << YAML << '\n';
+
+ if (CheckRuntimeMDParser)
+ checkRuntimeMDYAMLString(YAML);
+
+ return YAML;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
new file mode 100644
index 000000000000..a92fdd4bebc2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
@@ -0,0 +1,26 @@
+//===- AMDGPURuntimeMD.h - Generate runtime metadata ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares functions for generating runtime metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
+
+#include <string>
+
+namespace llvm {
+class Module;
+
+// Get runtime metadata as YAML string.
+std::string getRuntimeMDYAMLString(Module &M);
+
+}
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
new file mode 100644
index 000000000000..3392183d33c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -0,0 +1,242 @@
+//===-- AMDGPUTargetStreamer.cpp - AMDGPU Target Streamer Methods ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AMDGPU specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetStreamer.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDKernelCodeTUtils.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
+#include "AMDGPURuntimeMD.h"
+
+namespace llvm {
+#include "AMDGPUPTNote.h"
+}
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S)
+ : MCTargetStreamer(S) {}
+
+//===----------------------------------------------------------------------===//
+// AMDGPUTargetAsmStreamer
+//===----------------------------------------------------------------------===//
+
+AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : AMDGPUTargetStreamer(S), OS(OS) { }
+
+void
+AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
+ uint32_t Minor) {
+ OS << "\t.hsa_code_object_version " <<
+ Twine(Major) << "," << Twine(Minor) << '\n';
+}
+
+void
+AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
+ uint32_t Minor,
+ uint32_t Stepping,
+ StringRef VendorName,
+ StringRef ArchName) {
+ OS << "\t.hsa_code_object_isa " <<
+ Twine(Major) << "," << Twine(Minor) << "," << Twine(Stepping) <<
+ ",\"" << VendorName << "\",\"" << ArchName << "\"\n";
+
+}
+
+void
+AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
+ OS << "\t.amd_kernel_code_t\n";
+ dumpAmdKernelCode(&Header, OS, "\t\t");
+ OS << "\t.end_amd_kernel_code_t\n";
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
+ unsigned Type) {
+ switch (Type) {
+ default: llvm_unreachable("Invalid AMDGPU symbol type");
+ case ELF::STT_AMDGPU_HSA_KERNEL:
+ OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ;
+ break;
+ }
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+ StringRef GlobalName) {
+ OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n';
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+ StringRef GlobalName) {
+ OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n';
+}
+
+void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) {
+ OS << "\t.amdgpu_runtime_metadata\n";
+ OS << getRuntimeMDYAMLString(M);
+ OS << "\n\t.end_amdgpu_runtime_metadata\n";
+}
+
+void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(StringRef Metadata) {
+ OS << "\t.amdgpu_runtime_metadata";
+ OS << Metadata;
+ OS << "\t.end_amdgpu_runtime_metadata\n";
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPUTargetELFStreamer
+//===----------------------------------------------------------------------===//
+
+AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S)
+ : AMDGPUTargetStreamer(S), Streamer(S) {}
+
+MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer);
+}
+
+void
+AMDGPUTargetELFStreamer::EmitAMDGPUNote(const MCExpr* DescSZ,
+ PT_NOTE::NoteType Type,
+ std::function<void(MCELFStreamer &)> EmitDesc) {
+ auto &S = getStreamer();
+ auto &Context = S.getContext();
+
+ auto NameSZ = sizeof(PT_NOTE::NoteName);
+
+ S.PushSection();
+ S.SwitchSection(Context.getELFSection(
+ PT_NOTE::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
+ S.EmitIntValue(NameSZ, 4); // namesz
+ S.EmitValue(DescSZ, 4); // descz
+ S.EmitIntValue(Type, 4); // type
+ S.EmitBytes(StringRef(PT_NOTE::NoteName, NameSZ)); // name
+ S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
+ EmitDesc(S); // desc
+ S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
+ S.PopSection();
+}
+
+void
+AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
+ uint32_t Minor) {
+
+ EmitAMDGPUNote(
+ MCConstantExpr::create(8, getContext()),
+ PT_NOTE::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
+ [&](MCELFStreamer &OS){
+ OS.EmitIntValue(Major, 4);
+ OS.EmitIntValue(Minor, 4);
+ }
+ );
+}
+
+void
+AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
+ uint32_t Minor,
+ uint32_t Stepping,
+ StringRef VendorName,
+ StringRef ArchName) {
+ uint16_t VendorNameSize = VendorName.size() + 1; // +1 for the NUL emitted below
+ uint16_t ArchNameSize = ArchName.size() + 1; // +1 for the NUL emitted below
+
+ unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) +
+ sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
+ VendorNameSize + ArchNameSize;
+
+ EmitAMDGPUNote(
+ MCConstantExpr::create(DescSZ, getContext()),
+ PT_NOTE::NT_AMDGPU_HSA_ISA,
+ [&](MCELFStreamer &OS) {
+ OS.EmitIntValue(VendorNameSize, 2);
+ OS.EmitIntValue(ArchNameSize, 2);
+ OS.EmitIntValue(Major, 4);
+ OS.EmitIntValue(Minor, 4);
+ OS.EmitIntValue(Stepping, 4);
+ OS.EmitBytes(VendorName);
+ OS.EmitIntValue(0, 1); // NULL terminate VendorName
+ OS.EmitBytes(ArchName);
+ OS.EmitIntValue(0, 1); // NULL terminate ArchName
+ }
+ );
+}
+
+void
+AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
+
+ MCStreamer &OS = getStreamer();
+ OS.PushSection();
+ OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header)));
+ OS.PopSection();
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
+ unsigned Type) {
+ MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ getStreamer().getContext().getOrCreateSymbol(SymbolName));
+ Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+ StringRef GlobalName) {
+
+ MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ getStreamer().getContext().getOrCreateSymbol(GlobalName));
+ Symbol->setType(ELF::STT_OBJECT);
+ Symbol->setBinding(ELF::STB_LOCAL);
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+ StringRef GlobalName) {
+
+ MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ getStreamer().getContext().getOrCreateSymbol(GlobalName));
+ Symbol->setType(ELF::STT_OBJECT);
+ Symbol->setBinding(ELF::STB_GLOBAL);
+}
+
+void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) {
+ // Create two labels to mark the beginning and end of the desc field
+ // and a MCExpr to calculate the size of the desc field.
+ auto &Context = getContext();
+ auto *DescBegin = Context.createTempSymbol();
+ auto *DescEnd = Context.createTempSymbol();
+ auto *DescSZ = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(DescEnd, Context),
+ MCSymbolRefExpr::create(DescBegin, Context), Context);
+
+ EmitAMDGPUNote(
+ DescSZ,
+ PT_NOTE::NT_AMDGPU_HSA_RUNTIME_METADATA,
+ [&](MCELFStreamer &OS) {
+ OS.EmitLabel(DescBegin);
+ OS.EmitBytes(Metadata);
+ OS.EmitLabel(DescEnd);
+ }
+ );
+}
+
+void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) {
+ EmitRuntimeMetadata(getRuntimeMDYAMLString(M));
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
new file mode 100644
index 000000000000..e2f20586903d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -0,0 +1,111 @@
+//===-- AMDGPUTargetStreamer.h - AMDGPU Target Streamer --------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+
+#include "AMDKernelCodeT.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+#include "AMDGPUPTNote.h"
+
+class DataLayout;
+class Function;
+class MCELFStreamer;
+class MCSymbol;
+class MDNode;
+class Module;
+class Type;
+
+class AMDGPUTargetStreamer : public MCTargetStreamer { // Abstract AMDGPU target streamer: every Emit* hook below is pure virtual.
+protected:
+ MCContext &getContext() const { return Streamer.getContext(); } // Forwards to the context of the inherited Streamer member.
+
+public:
+ AMDGPUTargetStreamer(MCStreamer &S);
+ virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, // Hook for the HSA code object version, given as Major/Minor.
+ uint32_t Minor) = 0;
+
+ virtual void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor, // Hook for the HSA code object ISA: version triple plus vendor and arch names.
+ uint32_t Stepping,
+ StringRef VendorName,
+ StringRef ArchName) = 0;
+
+ virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; // Hook for an amd_kernel_code_t header.
+
+ virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; // Hook that associates Type with the symbol named SymbolName.
+
+ virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; // Hook for a module-scope HSA global named GlobalName.
+
+ virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; // Hook for a program-scope HSA global named GlobalName.
+
+ virtual void EmitRuntimeMetadata(Module &M) = 0; // Runtime-metadata hook taking the IR module.
+
+ virtual void EmitRuntimeMetadata(StringRef Metadata) = 0; // Runtime-metadata hook taking an already-built string.
+};
+
+class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { // Streamer variant that holds a formatted_raw_ostream; presumably the textual-assembly backend (definitions are not in this header).
+ formatted_raw_ostream &OS; // Output stream supplied to the constructor.
+public:
+ AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+ void EmitDirectiveHSACodeObjectVersion(uint32_t Major, // Overrides the code object version hook.
+ uint32_t Minor) override;
+
+ void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor, // Overrides the code object ISA hook.
+ uint32_t Stepping, StringRef VendorName,
+ StringRef ArchName) override;
+
+ void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; // Overrides the amd_kernel_code_t hook.
+
+ void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; // Overrides the symbol type hook.
+
+ void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; // Overrides the module-scope global hook.
+
+ void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; // Overrides the program-scope global hook.
+
+ void EmitRuntimeMetadata(Module &M) override; // Overrides the module-based runtime-metadata hook.
+
+ void EmitRuntimeMetadata(StringRef Metadata) override; // Overrides the string-based runtime-metadata hook.
+};
+
+class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { // Streamer variant that emits through an MCELFStreamer (see getStreamer()).
+ MCStreamer &Streamer; // NOTE(review): has the same name as the inherited Streamer read by AMDGPUTargetStreamer::getContext(); confirm the shadowing is intended.
+
+ void EmitAMDGPUNote(const MCExpr* DescSize, // Private note helper: takes the desc size expression, the note type and a callback that emits the desc payload.
+ AMDGPU::PT_NOTE::NoteType Type,
+ std::function<void(MCELFStreamer &)> EmitDesc);
+
+public:
+ AMDGPUTargetELFStreamer(MCStreamer &S);
+
+ MCELFStreamer &getStreamer(); // Returns the streamer as an MCELFStreamer.
+
+ void EmitDirectiveHSACodeObjectVersion(uint32_t Major, // Overrides the code object version hook.
+ uint32_t Minor) override;
+
+ void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor, // Overrides the code object ISA hook.
+ uint32_t Stepping, StringRef VendorName,
+ StringRef ArchName) override;
+
+ void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; // Overrides the amd_kernel_code_t hook.
+
+ void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; // Overrides the symbol type hook.
+
+ void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; // Overrides the module-scope global hook.
+
+ void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; // Overrides the program-scope global hook.
+
+ void EmitRuntimeMetadata(Module &M) override; // Overrides the module-based runtime-metadata hook.
+
+ void EmitRuntimeMetadata(StringRef Metadata) override; // Overrides the string-based runtime-metadata hook.
+};
+
+}
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
new file mode 100644
index 000000000000..6015ec190fd4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -0,0 +1,189 @@
+//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// \brief The R600 code emitter produces machine code that can be executed
+/// directly on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600Defines.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+namespace {
+
+class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { // Code emitter for the R600 family; copy construction and assignment are deleted.
+ const MCRegisterInfo &MRI; // Register info used to look up register encoding values.
+
+public:
+ R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
+ : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+ R600MCCodeEmitter(const R600MCCodeEmitter &) = delete;
+ R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete;
+
+ /// \brief Encode the instruction and write it to the OS.
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ /// \returns the encoding for an MCOperand.
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+private:
+ void Emit(uint32_t value, raw_ostream &OS) const; // Writes a 32-bit value to OS in little-endian byte order.
+ void Emit(uint64_t value, raw_ostream &OS) const; // Writes a 64-bit value to OS in little-endian byte order.
+
+ unsigned getHWReg(unsigned regNo) const; // Returns the register's encoding value masked with HW_REG_MASK.
+};
+
+} // end anonymous namespace
+
+/// Indices of the four vector channels, used to index per-channel arrays.
+enum RegElement {
+  ELEMENT_X = 0,
+  ELEMENT_Y = 1,
+  ELEMENT_Z = 2,
+  ELEMENT_W = 3
+};
+
+/// Identifiers for the flow-control instruction kinds, numbered from zero.
+enum FCInstr {
+  FC_IF_PREDICATE = 0,
+  FC_ELSE = 1,
+  FC_ENDIF = 2,
+  FC_BGNLOOP = 3,
+  FC_ENDLOOP = 4,
+  FC_BREAK_PREDICATE = 5,
+  FC_CONTINUE = 6
+};
+
+/// Factory for the R600 code emitter. The caller owns the returned object.
+MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
+                                             const MCRegisterInfo &MRI,
+                                             MCContext &Ctx) {
+  // Ctx is part of the factory signature but is not needed to build the
+  // R600 emitter.
+  MCCodeEmitter *Emitter = new R600MCCodeEmitter(MCII, MRI);
+  return Emitter;
+}
+
+/// Encodes \p MI and writes the resulting words to \p OS.
+///
+/// RETURN, FETCH_CLAUSE, ALU_CLAUSE, BUNDLE and KILL produce no output.
+/// Vertex-fetch and texture instructions are written as a 64-bit word, a
+/// hand-assembled 32-bit word and a zero 32-bit word. Every other
+/// instruction is written as a single 64-bit word.
+void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  const unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MCII.get(Opcode);
+
+  // These opcodes emit nothing.
+  switch (Opcode) {
+  case AMDGPU::RETURN:
+  case AMDGPU::FETCH_CLAUSE:
+  case AMDGPU::ALU_CLAUSE:
+  case AMDGPU::BUNDLE:
+  case AMDGPU::KILL:
+    return;
+  default:
+    break;
+  }
+
+  if (IS_VTX(Desc)) {
+    const uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI);
+    uint32_t Word2 = MI.getOperand(2).getImm(); // Offset
+    const bool IsCayman = STI.getFeatureBits()[AMDGPU::FeatureCaymanISA];
+    if (!IsCayman)
+      Word2 |= 1 << 19; // Mega-Fetch bit
+
+    Emit(Word01, OS);
+    Emit(Word2, OS);
+    Emit(static_cast<uint32_t>(0), OS);
+    return;
+  }
+
+  if (IS_TEX(Desc)) {
+    const int64_t Sampler = MI.getOperand(14).getImm();
+
+    // Operands 2..5 hold the per-channel source selects.
+    int64_t SrcSelect[4];
+    for (unsigned Chan = 0; Chan != 4; ++Chan)
+      SrcSelect[Chan] = MI.getOperand(2 + Chan).getImm();
+
+    // Operands 6..8 hold the offsets; only their low five bits are kept.
+    int64_t Offsets[3];
+    for (unsigned Idx = 0; Idx != 3; ++Idx)
+      Offsets[Idx] = MI.getOperand(6 + Idx).getImm() & 0x1F;
+
+    const uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI);
+
+    // Assemble the third dword in 64-bit arithmetic and truncate at the end.
+    int64_t Packed = Sampler << 15;
+    Packed |= SrcSelect[ELEMENT_X] << 20;
+    Packed |= SrcSelect[ELEMENT_Y] << 23;
+    Packed |= SrcSelect[ELEMENT_Z] << 26;
+    Packed |= SrcSelect[ELEMENT_W] << 29;
+    Packed |= Offsets[0] << 0;
+    Packed |= Offsets[1] << 5;
+    Packed |= Offsets[2] << 10;
+    const uint32_t Word2 = Packed;
+
+    Emit(Word01, OS);
+    Emit(Word2, OS);
+    Emit(static_cast<uint32_t>(0), OS);
+    return;
+  }
+
+  uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI);
+  const bool HasR600ALU = STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst];
+  if (HasR600ALU &&
+      (Desc.TSFlags & (R600_InstFlag::OP1 | R600_InstFlag::OP2)) != 0) {
+    // Move the 10-bit field that starts at bit 39 up by one bit position.
+    const uint64_t OpcodeMask = 0x3FFULL << 39;
+    const uint64_t ISAOpCode = Inst & OpcodeMask;
+    Inst = (Inst & ~OpcodeMask) | (ISAOpCode << 1);
+  }
+  Emit(Inst, OS);
+}
+
+/// Writes the 32-bit \p Value to \p OS in little-endian byte order.
+void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
+  support::endian::Writer<support::little> LEWriter(OS);
+  LEWriter.write(Value);
+}
+
+/// Writes the 64-bit \p Value to \p OS in little-endian byte order.
+void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
+  support::endian::Writer<support::little> LEWriter(OS);
+  LEWriter.write(Value);
+}
+
+/// \returns the encoding value of \p RegNo masked with HW_REG_MASK.
+unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
+  const unsigned Encoding = MRI.getEncodingValue(RegNo);
+  return Encoding & HW_REG_MASK;
+}
+
+/// \returns the value to encode for operand \p MO of \p MI.
+///
+/// Registers yield their encoding value (masked with HW_REG_MASK unless the
+/// instruction has native operands), expressions yield 0 and record a
+/// section-relative fixup in \p Fixups, and immediates yield their value.
+uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                           const MCOperand &MO,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    const unsigned Reg = MO.getReg();
+    const bool Native = HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags);
+    return Native ? MRI.getEncodingValue(Reg) : getHWReg(Reg);
+  }
+
+  if (MO.isExpr()) {
+    // Rodata is placed at the end of the code section, and the entire code
+    // section is then mapped as a vtx buf, so the section-relative address
+    // is the correct one.
+    // Each R600 literal instruction has two operands. The position of the
+    // current one is not easy to obtain, so compare against the first
+    // operand and adjust the offset accordingly.
+    unsigned FixupOffset = 4;
+    if (&MO == &MI.getOperand(0))
+      FixupOffset = 0;
+    Fixups.push_back(
+        MCFixup::create(FixupOffset, MO.getExpr(), FK_SecRel_4, MI.getLoc()));
+    return 0;
+  }
+
+  assert(MO.isImm());
+  return MO.getImm();
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
new file mode 100644
index 000000000000..0c5bb0648a16
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -0,0 +1,335 @@
+//===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The SI code emitter produces machine code that can be executed
+/// directly on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+
+using namespace llvm;
+
+namespace {
+
+class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { // Code emitter for SI instructions; copy construction and assignment are deleted.
+ const MCRegisterInfo &MRI; // Register info used to look up register encoding values.
+
+ /// \brief Encode an fp or int literal
+ uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
+ const MCSubtargetInfo &STI) const;
+
+public:
+ SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
+ MCContext &ctx) // ctx is accepted but not used by this constructor.
+ : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+ SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
+ SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete;
+
+ /// \brief Encode the instruction and write it to the OS.
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ /// \returns the encoding for an MCOperand.
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ /// \brief Use a fixup to encode the simm16 field for SOPP branch
+ /// instructions.
+ unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+};
+
+} // end anonymous namespace
+
+/// Factory for the SI code emitter. The caller owns the returned object.
+MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
+                                           const MCRegisterInfo &MRI,
+                                           MCContext &Ctx) {
+  MCCodeEmitter *Emitter = new SIMCCodeEmitter(MCII, MRI, Ctx);
+  return Emitter;
+}
+
+// Maps an integer that qualifies as an inline immediate to its encoding:
+//   0 .. 64  -> 128 .. 192
+//  -1 .. -16 -> 193 .. 208
+// Any other value yields 0, meaning "not an integer inline immediate".
+template <typename IntTy>
+static uint32_t getIntInlineImmEncoding(IntTy Imm) {
+  const bool IsSmallNonNegative = Imm >= 0 && Imm <= 64;
+  if (IsSmallNonNegative)
+    return 128 + Imm;
+
+  const bool IsSmallNegative = Imm >= -16 && Imm <= -1;
+  if (IsSmallNegative)
+    return 192 - Imm; // Imm is negative here, so this adds its magnitude.
+
+  return 0;
+}
+
+/// \returns the operand encoding for the 16-bit value \p Val: an integer
+/// inline code if one applies, 240..248 for the listed half-precision bit
+/// patterns, and 255 otherwise.
+static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
+  if (uint32_t IntEnc = getIntInlineImmEncoding(static_cast<int16_t>(Val)))
+    return IntEnc;
+
+  switch (Val) {
+  case 0x3800: // 0.5
+    return 240;
+  case 0xB800: // -0.5
+    return 241;
+  case 0x3C00: // 1.0
+    return 242;
+  case 0xBC00: // -1.0
+    return 243;
+  case 0x4000: // 2.0
+    return 244;
+  case 0xC000: // -2.0
+    return 245;
+  case 0x4400: // 4.0
+    return 246;
+  case 0xC400: // -4.0
+    return 247;
+  case 0x3118: // 1.0 / (2.0 * pi)
+    // Only an inline constant when the subtarget has the feature.
+    if (STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+      return 248;
+    break;
+  default:
+    break;
+  }
+
+  return 255;
+}
+
+/// \returns the operand encoding for the 32-bit value \p Val: an integer
+/// inline code if one applies, 240..248 for the listed single-precision bit
+/// patterns, and 255 otherwise.
+static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) {
+  if (uint32_t IntEnc = getIntInlineImmEncoding(static_cast<int32_t>(Val)))
+    return IntEnc;
+
+  // Entry I of this table is encoded as 240 + I.
+  const float InlineFP[] = {0.5f, -0.5f, 1.0f, -1.0f,
+                            2.0f, -2.0f, 4.0f, -4.0f};
+  const unsigned NumInlineFP = sizeof(InlineFP) / sizeof(InlineFP[0]);
+  for (unsigned Idx = 0; Idx != NumInlineFP; ++Idx)
+    if (Val == FloatToBits(InlineFP[Idx]))
+      return 240 + Idx;
+
+  // 1.0 / (2.0 * pi) is inline only when the subtarget has the feature.
+  if (Val == 0x3e22f983 &&
+      STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+    return 248;
+
+  return 255;
+}
+
+/// \returns the operand encoding for the 64-bit value \p Val: an integer
+/// inline code if one applies, 240..248 for the listed double-precision bit
+/// patterns, and 255 otherwise.
+static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) {
+  if (uint32_t IntEnc = getIntInlineImmEncoding(static_cast<int64_t>(Val)))
+    return IntEnc;
+
+  // Entry I of this table is encoded as 240 + I.
+  const double InlineFP[] = {0.5, -0.5, 1.0, -1.0, 2.0, -2.0, 4.0, -4.0};
+  const unsigned NumInlineFP = sizeof(InlineFP) / sizeof(InlineFP[0]);
+  for (unsigned Idx = 0; Idx != NumInlineFP; ++Idx)
+    if (Val == DoubleToBits(InlineFP[Idx]))
+      return 240 + Idx;
+
+  // 1.0 / (2.0 * pi) is inline only when the subtarget has the feature.
+  if (Val == 0x3fc45f306dc9c882 &&
+      STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+    return 248;
+
+  return 255;
+}
+
+/// \returns the source-operand encoding for \p MO.
+///
+/// Non-constant expressions yield 255, operands that are neither
+/// expressions nor immediates yield ~0, and everything else is dispatched on
+/// the operand size reported for \p OpInfo.
+uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
+                                         const MCOperandInfo &OpInfo,
+                                         const MCSubtargetInfo &STI) const {
+  int64_t Imm;
+  if (!MO.isExpr()) {
+    assert(!MO.isFPImm());
+
+    if (!MO.isImm())
+      return ~0U;
+
+    Imm = MO.getImm();
+  } else if (const auto *C = dyn_cast<MCConstantExpr>(MO.getExpr())) {
+    Imm = C->getValue();
+  } else {
+    return 255;
+  }
+
+  const auto OpSize = AMDGPU::getOperandSize(OpInfo);
+  switch (OpSize) {
+  case 2:
+    return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+  case 4:
+    return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
+  case 8:
+    return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
+  default:
+    llvm_unreachable("invalid operand size");
+  }
+}
+
+/// Encodes \p MI and writes it to \p OS, least significant byte first.
+///
+/// For encodings of at most four bytes, the first source operand whose
+/// literal encoding is 255 is appended as a trailing 32-bit literal.
+void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  const uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  const unsigned NumBytes = Desc.getSize();
+
+  // Base encoding.
+  for (unsigned Shift = 0; Shift != 8 * NumBytes; Shift += 8)
+    OS.write(static_cast<uint8_t>((Encoding >> Shift) & 0xff));
+
+  if (NumBytes > 4)
+    return;
+
+  // Check for additional literals in SRC0/1/2 (Op 1/2/3)
+  const unsigned NumOps = MI.getNumOperands();
+  for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
+    // Skip operands that are not encoded as [SV]Src.
+    if (!AMDGPU::isSISrcOperand(Desc, OpIdx))
+      continue;
+
+    // Skip operands that are not literal immediates.
+    const MCOperand &Op = MI.getOperand(OpIdx);
+    if (getLitEncoding(Op, Desc.OpInfo[OpIdx], STI) != 255)
+      continue;
+
+    int64_t Literal = 0;
+    if (Op.isImm()) {
+      Literal = Op.getImm();
+    } else if (Op.isExpr()) {
+      // Non-constant exprs keep the zero placeholder; it will be replaced
+      // with a fixup value.
+      if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr()))
+        Literal = C->getValue();
+    } else {
+      llvm_unreachable("Must be immediate or expr");
+    }
+
+    for (unsigned Shift = 0; Shift != 32; Shift += 8)
+      OS.write(static_cast<uint8_t>((Literal >> Shift) & 0xff));
+
+    // Only one literal value allowed
+    return;
+  }
+}
+
+/// \returns the value to encode for operand \p OpNo of \p MI.
+///
+/// An expression operand yields 0 and records a fixup_si_sopp_br fixup at
+/// offset 0 in \p Fixups; any other operand is delegated to
+/// getMachineOpValue().
+unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (!MO.isExpr())
+    return getMachineOpValue(MI, MO, Fixups, STI);
+
+  const auto BranchFixup = static_cast<MCFixupKind>(AMDGPU::fixup_si_sopp_br);
+  Fixups.push_back(
+      MCFixup::create(0, MO.getExpr(), BranchFixup, MI.getLoc()));
+  return 0;
+}
+
+/// \returns the value to encode for operand \p MO of \p MI.
+///
+/// Registers yield their encoding value. A non-constant expression records a
+/// 4-byte fixup at offset 4 in \p Fixups (FK_Data_4 for an external symbol
+/// reference, FK_PCRel_4 otherwise) before the operand is encoded.
+uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                            const MCOperand &MO,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return MRI.getEncodingValue(MO.getReg());
+
+  const bool IsNonConstExpr =
+      MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant;
+  if (IsNonConstExpr) {
+    const auto *SymRef = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+    const bool IsExternal = SymRef && SymRef->getSymbol().isExternal();
+    const MCFixupKind Kind = IsExternal ? FK_Data_4 : FK_PCRel_4;
+    Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
+  }
+
+  // Figure out the operand number, needed for isSrcOperand check
+  const unsigned NumOps = MI.getNumOperands();
+  unsigned OpNo = 0;
+  while (OpNo < NumOps && &MO != &MI.getOperand(OpNo))
+    ++OpNo;
+
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  if (!AMDGPU::isSISrcOperand(Desc, OpNo)) {
+    if (MO.isImm())
+      return MO.getImm();
+  } else {
+    const uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
+    const bool NeedsLiteral = Enc == 255;
+    // A literal (255) is only accepted when the instruction size is 4.
+    if (Enc != ~0U && (!NeedsLiteral || Desc.getSize() == 4))
+      return Enc;
+  }
+
+  llvm_unreachable("Encoding of this operand type is not supported yet.");
+  return 0;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
new file mode 100644
index 000000000000..46803e555711
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -0,0 +1,763 @@
+//===-- MIMGInstructions.td - MIMG Instruction Definitions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class MIMG_Mask <string op, int channels> { // Mix-in that tags a MIMG def with an op name string and a channel count.
+ string Op = op; // Name string; users in this file pass asm # "_V1" / "_V2" / ...
+ int Channels = channels; // Channel count passed by the instantiating multiclass.
+}
+
+class mimg <bits<7> si, bits<7> vi = si> { // Pair of 7-bit opcodes, one per encoding family; vi defaults to the si value.
+ field bits<7> SI = si; // Opcode read as op.SI by MIMG_Atomic_Real_si.
+ field bits<7> VI = vi; // Opcode read as op.VI by MIMG_Atomic_Real_vi.
+}
+
+class MIMG_Helper <dag outs, dag ins, string asm, // Common base for the MIMG defs in this file; built on MIMG with an empty pattern list.
+ string dns=""> : MIMG<outs, ins, asm,[]> {
+ let mayLoad = 1; // Defaults describe a load: mayLoad set, mayStore clear.
+ let mayStore = 0;
+ let hasPostISelHook = 1;
+ let DecoderNamespace = dns;
+ let isAsmParserOnly = !if(!eq(dns,""), 1, 0); // Defs with an empty decoder namespace are marked asm-parser-only.
+ let AsmMatchConverter = "cvtMIMG";
+ let usesCustomInserter = 1;
+}
+
+class MIMG_NoSampler_Helper <bits<7> op, string asm, // MIMG def without a sampler operand: one $vdata result, $vaddr and $srsrc inputs plus modifier operands.
+ RegisterClass dst_rc,
+ RegisterClass addr_rc,
+ string dns=""> : MIMG_Helper <
+ (outs dst_rc:$vdata),
+ (ins addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ dns>, MIMGe<op> {
+ let ssamp = 0; // No $ssamp operand exists here, so the field is fixed to 0.
+}
+
+multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, // Emits one def per address register class: _V1 (VGPR_32), _V2 (VReg_64), _V4 (VReg_128).
+ RegisterClass dst_rc,
+ int channels> {
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(!eq(channels, 1), "AMDGPU", "")>, // Only the single-channel _V1 def gets the "AMDGPU" decoder namespace.
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm> { // Expands the no-sampler defs for 1 to 4 channels, pairing each channel count with a destination register class.
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Store_Helper <bits<7> op, string asm, // MIMG store def: no results, $vdata is an input alongside $vaddr and $srsrc.
+ RegisterClass data_rc,
+ RegisterClass addr_rc> : MIMG_Helper <
+ (outs),
+ (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ >, MIMGe<op> {
+ let ssamp = 0; // No $ssamp operand exists here, so the field is fixed to 0.
+ let mayLoad = 1; // TableGen requires this for matching with the intrinsics
+ let mayStore = 1;
+ let hasSideEffects = 1;
+ let hasPostISelHook = 0; // Overrides the MIMG_Helper default of 1.
+ let DisableWQM = 1;
+}
+
+multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, // Emits one store def per address register class: _V1 (VGPR_32), _V2 (VReg_64), _V4 (VReg_128).
+ RegisterClass data_rc,
+ int channels> {
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_Store <bits<7> op, string asm> { // Expands the store defs for 1 to 4 channels, pairing each channel count with a data register class.
+ defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+ defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, // Encoding-independent part of an image atomic: $vdst result tied to the $vdata input.
+ RegisterClass addr_rc> : MIMG_Helper <
+ (outs data_rc:$vdst),
+ (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ > {
+ let mayStore = 1; // mayLoad stays 1 from MIMG_Helper, so the def both loads and stores.
+ let hasSideEffects = 1;
+ let hasPostISelHook = 0; // Overrides the MIMG_Helper default of 1.
+ let DisableWQM = 1;
+ let Constraints = "$vdst = $vdata"; // Result and data operand share a register.
+ let AsmMatchConverter = "cvtMIMGAtomic"; // Overrides the "cvtMIMG" converter set by MIMG_Helper.
+}
+
+class MIMG_Atomic_Real_si<mimg op, string name, string asm, // Real image-atomic def for the SI encoding family, encoded with op.SI.
+ RegisterClass data_rc, RegisterClass addr_rc> :
+ MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.SI>,
+ MIMGe<op.SI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isSICI]; // Assembler accepts this def only under the isSICI predicate.
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class MIMG_Atomic_Real_vi<mimg op, string name, string asm, // Real image-atomic def for the VI encoding family, encoded with op.VI.
+ RegisterClass data_rc, RegisterClass addr_rc> :
+ MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.VI>,
+ MIMGe<op.VI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isVI]; // Assembler accepts this def only under the isVI predicate.
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
+
+multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm, // Emits three defs sharing one SIMCInstr name: an unsuffixed pseudo plus _si and _vi real defs.
+ RegisterClass data_rc, RegisterClass addr_rc> {
+ let isPseudo = 1, isCodeGenOnly = 1 in {
+ def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>, // Pseudo: no MIMGe encoding, SIEncodingFamily.NONE.
+ SIMCInstr<name, SIEncodingFamily.NONE>;
+ }
+
+ let ssamp = 0 in { // The real defs have no $ssamp operand, so the field is fixed to 0.
+ def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>;
+
+ def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>;
+ }
+}
+
+// Expands an image atomic into pseudo/_si/_vi defs for each address register
+// class: _V1 (VGPR_32), _V2 (VReg_64) and _V4 (VReg_128). data_rc is the
+// register class shared by the $vdst result and the $vdata input.
+multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> {
+  defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>;
+  defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>;
+  // The SIMCInstr name suffix matches the def suffix, as in the rows above.
+  defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V4", asm, data_rc, VReg_128>;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm, // MIMG def with a sampler: adds the SReg_128 $ssamp input after $srsrc.
+ RegisterClass dst_rc,
+ RegisterClass src_rc,
+ bit wqm,
+ string dns=""> : MIMG_Helper <
+ (outs dst_rc:$vdata),
+ (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ dns>, MIMGe<op> {
+ let WQM = wqm; // WQM flag is taken from the template argument.
+}
+
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, // Emits one sampler def per address register class: _V1, _V2, _V4, _V8 (VReg_256) and _V16 (VReg_512).
+ RegisterClass dst_rc,
+ int channels, bit wqm> {
+ def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm,
+ !if(!eq(channels, 1), "AMDGPU", "")>, // Only the single-channel _V1 def gets the "AMDGPU" decoder namespace.
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>,
+ MIMG_Mask<asm#"_V4", channels>;
+ def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>,
+ MIMG_Mask<asm#"_V8", channels>;
+ def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>,
+ MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Sampler <bits<7> op, string asm, bit wqm=0> { // Expands the sampler defs for 1 to 4 channels; wqm defaults to 0.
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>;
+}
+
+multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>; // Same as MIMG_Sampler with wqm fixed to 1.
+
+class MIMG_Gather_Helper <bits<7> op, string asm, // Gather4 def; derives from MIMG directly rather than MIMG_Helper, so it sets its own flags below.
+ RegisterClass dst_rc,
+ RegisterClass src_rc, bit wqm> : MIMG <
+ (outs dst_rc:$vdata),
+ (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ []>, MIMGe<op> {
+ let mayLoad = 1;
+ let mayStore = 0;
+
+ // DMASK was repurposed for GATHER4. 4 components are always
+ // returned and DMASK works like a swizzle - it selects
+ // the component to fetch. The only useful DMASK values are
+ // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+ // (red,red,red,red) etc.) The ISA document doesn't mention
+ // this.
+ // Therefore, disable all code which updates DMASK by setting this:
+ let Gather4 = 1;
+ let hasPostISelHook = 0;
+ let WQM = wqm; // WQM flag is taken from the template argument.
+
+ let isAsmParserOnly = 1; // TBD: fix it later
+}
+
+multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, // Emits one gather def per address register class: _V1, _V2, _V4, _V8 (VReg_256) and _V16 (VReg_512).
+ RegisterClass dst_rc,
+ int channels, bit wqm> {
+ def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
+ MIMG_Mask<asm#"_V4", channels>;
+ def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
+ MIMG_Mask<asm#"_V8", channels>;
+ def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
+ MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Gather <bits<7> op, string asm, bit wqm=0> { // Expands the gather defs for 1 to 4 channels; wqm defaults to 0.
+ defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+ defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>;
+ defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>;
+ defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>;
+}
+
+multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>; // Same as MIMG_Gather with wqm fixed to 1.
+
+//===----------------------------------------------------------------------===//
+// MIMG Instructions
+//===----------------------------------------------------------------------===//
+let SubtargetPredicate = isGCN in {
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
+//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>;
+//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
+//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
+//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
+defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
+defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
+//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
+defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
+defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
+defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
+defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">;
+defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">;
+defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">;
+defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">;
+defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">;
+defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
+defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
+defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
+defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather_WQM <0x00000056, "image_gather4_b_cl_o">; // _WQM like every other _B (bias) gather/sample variant in this list
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
+defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
+}
+
+/********** ======================= **********/
+/********** Image sampling patterns **********/
+/********** ======================= **********/
+
+// Image + sampler: selects MIMG `opcode` for intrinsic `name` taking an address of type vt, a v8i32 resource and a v4i32 sampler.
+class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
+ i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), // intrinsic flag order: unorm, r128, da, glc, slc, tfe, lwe
+ (opcode $addr, $rsrc, $sampler,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) // instruction flag order differs: unorm, glc, slc, r128, tfe, lwe, da
+>;
+
+multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { // one SampleRawPattern per address type; the trailing _V<n> names the address width (i32 = _V1 ... v16i32 = _V16)
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>;
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>;
+}
+
+// Image + sampler for amdgcn: typed-result (dt) variant whose intrinsic takes i1 flags and has no r128/tfe operands.
+// TODO:
+// 1. Handle half data type like v4f16, and add D16 bit support;
+// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128).
+// 3. Add A16 support when we pass address of half type.
+multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
+ def : Pat<
+ (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc,
+ i1:$slc, i1:$lwe, i1:$da)),
+ (opcode $addr, $rsrc, $sampler,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ 0, 0, (as_i1imm $lwe), (as_i1imm $da)) // the two literal 0s occupy the slots SampleRawPattern fills with r128 and tfe
+ >;
+}
+
+multiclass AMDGCNSampleDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { // fixes the result type dt and fans out over float address types f32 ... v16f32 (suffix _V1 ... _V16)
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V1), dt, f32>;
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V2), dt, v2f32>;
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V4), dt, v4f32>;
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V8), dt, v8f32>;
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V16), dt, v16f32>;
+}
+
+// TODO: support v3f32. Result types covered today: f32, v2f32, v4f32 (first opcode suffix _V1, _V2, _V4).
+multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> {
+ defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V1), f32>;
+ defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
+ defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+}
+
+// Image only: no $sampler operand; the intrinsic's flags are matched as immediates (imm) and re-emitted in the instruction's operand order.
+class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
+ imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), // intrinsic flag order: unorm, r128, da, glc, slc, tfe, lwe
+ (opcode $addr, $rsrc,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) // instruction flag order: unorm, glc, slc, r128, tfe, lwe, da
+>;
+
+multiclass ImagePatterns<SDPatternOperator name, string opcode> { // one ImagePattern per integer address type: i32, v2i32, v4i32 (suffix _V4_V1, _V4_V2, _V4_V4)
+ def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+ def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+ def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+}
+
+multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { // typed load: result type dt, address type vt
+ def : Pat <
+ (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe,
+ i1:$da)),
+ (opcode $addr, $rsrc,
+ (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), // literal 1 sits in the slot ImagePattern fills with unorm
+ 0, 0, (as_i1imm $lwe), (as_i1imm $da)) // literal 0s sit in the slots ImagePattern fills with r128 and tfe
+ >;
+}
+
+multiclass ImageLoadDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { // fixes result type dt and fans out over address types i32, v2i32, v4i32
+ defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
+ defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
+ defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
+}
+
+// TODO: support v3f32. Result types covered today: f32, v2f32, v4f32 (first opcode suffix _V1, _V2, _V4).
+multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
+ defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f32>;
+ defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
+ defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+}
+
+multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { // typed store: $data of type dt precedes the address (type vt); the pattern has no result type
+ def : Pat <
+ (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc,
+ i1:$lwe, i1:$da),
+ (opcode $data, $addr, $rsrc,
+ (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), // literal 1 sits in the slot ImagePattern fills with unorm
+ 0, 0, (as_i1imm $lwe), (as_i1imm $da)) // literal 0s sit in the slots ImagePattern fills with r128 and tfe
+ >;
+}
+
+multiclass ImageStoreDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { // fixes data type dt and fans out over address types i32, v2i32, v4i32
+ defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
+ defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
+ defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
+}
+
+// TODO: support v3f32. Data types covered today: f32, v2f32, v4f32 (first opcode suffix _V1, _V2, _V4).
+multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
+ defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f32>;
+ defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
+ defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+}
+
+class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < // i32 atomic on an image; only r128, da and slc come from the intrinsic
+ (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
+ (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) // literals by position (cf. ImagePattern): presumably dmask=1, unorm=1, glc=1, tfe=0, lwe=0 - confirm against the MIMG operand list
+>;
+
+multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> { // one ImageAtomicPattern per address type: i32, v2i32, v4i32 (suffix _V1, _V2, _V4)
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>;
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>;
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
+}
+
+class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat < // compare-and-swap: the intrinsic is fixed to int_amdgcn_image_atomic_cmpswap
+ (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
+ imm:$r128, imm:$da, imm:$slc),
+ (EXTRACT_SUBREG
+ (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1), // $vsrc and $vcmp are packed into one VReg_64 (sub0, sub1) data operand
+ $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)), // first literal is 3 here versus 1 in ImageAtomicPattern (presumably dmask covering both dwords - confirm)
+ sub0) // only sub0 of the instruction's result is used as the pattern result
+>;
+
+// ======= SI Image Intrinsics ================
+
+// Image load
+defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
+defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
+def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
+
+// Basic sample
+defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">;
+defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">;
+defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
+
+// Sample with comparison
+defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
+
+// Sample with offsets
+defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
+
+// Sample with comparison and offsets
+defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
+
+// Gather opcodes
+// Only the variants which make sense are defined.
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
+
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
+
+// ======= amdgcn Image Intrinsics ==============
+
+// Image load
+defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">;
+defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">;
+defm : ImageLoadPatterns<int_amdgcn_image_getresinfo, "IMAGE_GET_RESINFO">;
+
+// Image store
+defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
+defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
+
+// Basic sample
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample, "IMAGE_SAMPLE">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl, "IMAGE_SAMPLE_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d, "IMAGE_SAMPLE_D">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l, "IMAGE_SAMPLE_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b, "IMAGE_SAMPLE_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz, "IMAGE_SAMPLE_LZ">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd, "IMAGE_SAMPLE_CD">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
+
+// Sample with comparison
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c, "IMAGE_SAMPLE_C">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
+
+// Sample with offsets
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_o, "IMAGE_SAMPLE_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
+
+// Sample with comparison and offsets
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
+
+// Gather opcodes
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4, "IMAGE_GATHER4">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl, "IMAGE_GATHER4_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l, "IMAGE_GATHER4_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b, "IMAGE_GATHER4_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl, "IMAGE_GATHER4_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz, "IMAGE_GATHER4_LZ">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c, "IMAGE_GATHER4_C">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl, "IMAGE_GATHER4_C_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l, "IMAGE_GATHER4_C_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b, "IMAGE_GATHER4_C_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl, "IMAGE_GATHER4_C_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz, "IMAGE_GATHER4_C_LZ">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_o, "IMAGE_GATHER4_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl_o, "IMAGE_GATHER4_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l_o, "IMAGE_GATHER4_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_o, "IMAGE_GATHER4_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl_o, "IMAGE_GATHER4_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz_o, "IMAGE_GATHER4_LZ_O">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_o, "IMAGE_GATHER4_C_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl_o, "IMAGE_GATHER4_C_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l_o, "IMAGE_GATHER4_C_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_o, "IMAGE_GATHER4_C_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl_o, "IMAGE_GATHER4_C_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz_o, "IMAGE_GATHER4_C_LZ_O">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_getlod, "IMAGE_GET_LOD">;
+
+// Image atomics
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
+
+/* SIsample for simple 1D texture lookup: a single i32 address selects IMAGE_SAMPLE_V4_V1; the trailing imm operand is matched but not used in the output */
+def : Pat <
+ (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+ (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) // 0xf sits in the slot SampleRawPattern fills with dmask; the remaining flag slots are all 0
+>;
+
+class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat < // generic case: the last source operand is any immediate
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) // 0xf in the dmask slot (by SampleRawPattern's operand order), every flag slot 0
+>;
+
+class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < // matches only when the last source operand is TEX_RECT
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
+ (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) // differs from SamplePattern by the 1 right after 0xf (the unorm slot in SampleRawPattern's operand order)
+>;
+
+class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < // matches only when the last source operand is TEX_ARRAY
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) // differs from SamplePattern by the final 1 (the da slot in SampleRawPattern's operand order)
+>;
+
+class SampleShadowPattern<SDNode name, MIMG opcode, // matches only when the last source operand is TEX_SHADOW
+ ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) // same immediates as SamplePattern; the difference comes from the opcode the instantiator passes in
+>;
+
+class SampleShadowArrayPattern<SDNode name, MIMG opcode, // matches only when the last source operand is TEX_SHADOW_ARRAY
+ ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) // same immediates as SampleArrayPattern (final 1 in the da slot)
+>;
+
+/* SIsample* for texture lookups consuming more address parameters: for one address type, wires SIsample/SIsamplel/SIsampleb/SIsampled to the plain and _c opcodes passed in */
+multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
+ MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
+MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
+ def : SamplePattern <SIsample, sample, addr_type>;
+ def : SampleRectPattern <SIsample, sample, addr_type>; // the rect variant is instantiated for SIsample only
+ def : SampleArrayPattern <SIsample, sample, addr_type>;
+ def : SampleShadowPattern <SIsample, sample_c, addr_type>; // shadow variants select the _c opcode
+ def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
+
+ def : SamplePattern <SIsamplel, sample_l, addr_type>;
+ def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
+ def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
+ def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
+
+ def : SamplePattern <SIsampleb, sample_b, addr_type>;
+ def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
+ def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
+ def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
+
+ def : SamplePattern <SIsampled, sample_d, addr_type>;
+ def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
+ def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
+ def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
+}
+
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
+ IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
+ IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
+ IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
+ v2i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
+ IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
+ IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
+ IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
+ v4i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
+ IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
+ IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
+ IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
+ v8i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
+ IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
+ IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
+ IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
+ v16i32>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td
new file mode 100644
index 000000000000..3c07cc76b9a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td
@@ -0,0 +1,189 @@
+//===-- Processors.td - R600 Processor definitions ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> // thin alias: forwards all three arguments unchanged to Processor
+: Processor<Name, itin, Features>;
+
+//===----------------------------------------------------------------------===//
+// R600
+//===----------------------------------------------------------------------===//
+def : Proc<"r600", R600_VLIW5_Itin,
+ [FeatureR600, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+def : Proc<"r630", R600_VLIW5_Itin,
+ [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>;
+
+def : Proc<"rs880", R600_VLIW5_Itin,
+ [FeatureR600, FeatureWavefrontSize16]>;
+
+def : Proc<"rv670", R600_VLIW5_Itin,
+ [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// R700
+//===----------------------------------------------------------------------===//
+
+def : Proc<"rv710", R600_VLIW5_Itin,
+ [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
+
+def : Proc<"rv730", R600_VLIW5_Itin,
+ [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
+
+def : Proc<"rv770", R600_VLIW5_Itin,
+ [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// Evergreen
+//===----------------------------------------------------------------------===//
+
+def : Proc<"cedar", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32,
+ FeatureCFALUBug]>;
+
+def : Proc<"redwood", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64,
+ FeatureCFALUBug]>;
+
+def : Proc<"sumo", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>;
+
+def : Proc<"juniper", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+def : Proc<"cypress", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureFP64, FeatureVertexCache,
+ FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// Northern Islands
+//===----------------------------------------------------------------------===//
+
+def : Proc<"barts", R600_VLIW5_Itin,
+ [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
+
+def : Proc<"turks", R600_VLIW5_Itin,
+ [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
+
+def : Proc<"caicos", R600_VLIW5_Itin,
+ [FeatureNorthernIslands, FeatureCFALUBug]>;
+
+def : Proc<"cayman", R600_VLIW4_Itin,
+ [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>;
+
+//===----------------------------------------------------------------------===//
+// Southern Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"SI", SIFullSpeedModel,
+ [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
+>;
+
+def : ProcessorModel<"tahiti", SIFullSpeedModel,
+ [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
+>;
+
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+
+def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+
+def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+
+def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+
+//===----------------------------------------------------------------------===//
+// Sea Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"kabini", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_2]
+>;
+
+def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"hawaii", SIFullSpeedModel,
+ [FeatureISAVersion7_0_1]
+>;
+
+def : ProcessorModel<"mullins", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_2]>;
+
+def : ProcessorModel<"gfx700", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"gfx701", SIFullSpeedModel,
+ [FeatureISAVersion7_0_1]
+>;
+
+def : ProcessorModel<"gfx702", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_2]
+>;
+
+//===----------------------------------------------------------------------===//
+// Volcanic Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"tonga", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_2]
+>;
+
+def : ProcessorModel<"iceland", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_0]
+>;
+
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_1]
+>;
+
+def : ProcessorModel<"fiji", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"stoney", SIQuarterSpeedModel,
+ [FeatureISAVersion8_1_0]
+>;
+
+def : ProcessorModel<"polaris10", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"gfx800", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_0]
+>;
+
+def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_1]
+>;
+
+def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_2]
+>;
+
+def : ProcessorModel<"gfx803", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"gfx804", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_4]
+>;
+
+def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
+ [FeatureISAVersion8_1_0]
+>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
new file mode 100644
index 000000000000..d0aba38f786d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -0,0 +1,213 @@
+//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// The R600EmitClauseMarkers pass emits CF_ALU instructions in a conservative
+/// manner. This pass merges consecutive CF_ALU instructions where applicable.
+/// It needs to be run after IfCvt for best results.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600mergeclause"
+
+namespace {
+
+/// Returns true if \p MI is one of the two clause markers this pass handles:
+/// CF_ALU or CF_ALU_PUSH_BEFORE.
+static bool isCFAlu(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  return Opc == AMDGPU::CF_ALU || Opc == AMDGPU::CF_ALU_PUSH_BEFORE;
+}
+
+/// Machine-function pass that folds consecutive CF_ALU clause markers into a
+/// single marker whenever mergeIfPossible() accepts the pair.
+class R600ClauseMergePass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  /// Instruction info of the current function's subtarget. Null until
+  /// runOnMachineFunction() assigns it.
+  const R600InstrInfo *TII;
+
+  /// Returns the COUNT immediate of the clause marker \p MI.
+  unsigned getCFAluSize(const MachineInstr &MI) const;
+  /// Returns the Enabled immediate of the clause marker \p MI as a bool.
+  bool isCFAluEnabled(const MachineInstr &MI) const;
+
+  /// The IfCvt pass can generate "disabled" ALU clause markers that need to
+  /// be removed, with their content attributed to the previous ALU clause.
+  /// This function scans the instructions following \p CFAlu; every disabled
+  /// marker it meets is merged into \p CFAlu and erased. The scan stops at
+  /// the first enabled marker or at the end of the block.
+  void cleanPotentialDisabledCFAlu(MachineInstr &CFAlu) const;
+
+  /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if
+  /// it is the case.
+  bool mergeIfPossible(MachineInstr &RootCFAlu,
+                       const MachineInstr &LatrCFAlu) const;
+
+public:
+  // TII is initialized explicitly so the pointer never holds an indeterminate
+  // value before the first runOnMachineFunction() call.
+  R600ClauseMergePass(TargetMachine &tm)
+      : MachineFunctionPass(ID), TII(nullptr) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override;
+};
+
+char R600ClauseMergePass::ID = 0;
+
+/// Reads the COUNT immediate operand of the clause marker \p MI.
+unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const {
+  assert(isCFAlu(MI));
+  const int CountIdx =
+      TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT);
+  return MI.getOperand(CountIdx).getImm();
+}
+
+/// Reads the Enabled immediate operand of the clause marker \p MI; any
+/// non-zero value counts as enabled.
+bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const {
+  assert(isCFAlu(MI));
+  const int EnabledIdx =
+      TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled);
+  return MI.getOperand(EnabledIdx).getImm();
+}
+
+/// Walks the instructions that follow \p CFAlu in its block. Every disabled
+/// clause marker found has its COUNT added to \p CFAlu and is then erased.
+/// The walk ends at the first enabled marker or at the end of the block.
+void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
+    MachineInstr &CFAlu) const {
+  const int CountOpIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  MachineBasicBlock::iterator Cursor = CFAlu;
+  const MachineBasicBlock::iterator BlockEnd = CFAlu.getParent()->end();
+  ++Cursor;
+  while (Cursor != BlockEnd) {
+    // Advance before a possible erase so the cursor stays valid.
+    MachineInstr &Candidate = *Cursor++;
+    if (!isCFAlu(Candidate))
+      continue;
+    if (isCFAluEnabled(Candidate))
+      return;
+    const unsigned MergedSize = getCFAluSize(CFAlu) + getCFAluSize(Candidate);
+    CFAlu.getOperand(CountOpIdx).setImm(MergedSize);
+    Candidate.eraseFromParent();
+  }
+}
+
+/// Try to fold the clause opened by \p LatrCFAlu into the one opened by
+/// \p RootCFAlu.
+///
+/// The merge is refused when the combined COUNT would reach
+/// getMaxAlusPerClause(), when the root marker is CF_ALU_PUSH_BEFORE, or when
+/// both markers use the same KCache slot (non-zero mode) with a different bank
+/// or address. On success the root marker takes over the later marker's
+/// active KCache settings and its opcode, and receives the summed COUNT.
+/// \p LatrCFAlu itself is left untouched.
+///
+/// \returns true if \p RootCFAlu was updated to cover both clauses.
+bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu,
+                                          const MachineInstr &LatrCFAlu) const {
+  assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
+  int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  unsigned RootInstCount = getCFAluSize(RootCFAlu),
+           LaterInstCount = getCFAluSize(LatrCFAlu);
+  unsigned CumuledInsts = RootInstCount + LaterInstCount;
+  if (CumuledInsts >= TII->getMaxAlusPerClause()) {
+    DEBUG(dbgs() << "Excess inst counts\n");
+    return false;
+  }
+  if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+    return false;
+  // Is KCache Bank 0 compatible ?
+  int Mode0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0);
+  int KBank0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
+  int KBank0LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
+  if (LatrCFAlu.getOperand(Mode0Idx).getImm() &&
+      RootCFAlu.getOperand(Mode0Idx).getImm() &&
+      (LatrCFAlu.getOperand(KBank0Idx).getImm() !=
+           RootCFAlu.getOperand(KBank0Idx).getImm() ||
+       LatrCFAlu.getOperand(KBank0LineIdx).getImm() !=
+           RootCFAlu.getOperand(KBank0LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC0\n");
+    return false;
+  }
+  // Is KCache Bank 1 compatible ?
+  int Mode1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+  int KBank1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+  int KBank1LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+  if (LatrCFAlu.getOperand(Mode1Idx).getImm() &&
+      RootCFAlu.getOperand(Mode1Idx).getImm() &&
+      (LatrCFAlu.getOperand(KBank1Idx).getImm() !=
+           RootCFAlu.getOperand(KBank1Idx).getImm() ||
+       LatrCFAlu.getOperand(KBank1LineIdx).getImm() !=
+           RootCFAlu.getOperand(KBank1LineIdx).getImm())) {
+    // This is the bank 1 check, so report KC1 (the message used to say KC0).
+    DEBUG(dbgs() << "Wrong KC1\n");
+    return false;
+  }
+  // All checks passed: copy over whichever KCache slots the later clause uses.
+  if (LatrCFAlu.getOperand(Mode0Idx).getImm()) {
+    RootCFAlu.getOperand(Mode0Idx).setImm(
+        LatrCFAlu.getOperand(Mode0Idx).getImm());
+    RootCFAlu.getOperand(KBank0Idx).setImm(
+        LatrCFAlu.getOperand(KBank0Idx).getImm());
+    RootCFAlu.getOperand(KBank0LineIdx)
+        .setImm(LatrCFAlu.getOperand(KBank0LineIdx).getImm());
+  }
+  if (LatrCFAlu.getOperand(Mode1Idx).getImm()) {
+    RootCFAlu.getOperand(Mode1Idx).setImm(
+        LatrCFAlu.getOperand(Mode1Idx).getImm());
+    RootCFAlu.getOperand(KBank1Idx).setImm(
+        LatrCFAlu.getOperand(KBank1Idx).getImm());
+    RootCFAlu.getOperand(KBank1LineIdx)
+        .setImm(LatrCFAlu.getOperand(KBank1LineIdx).getImm());
+  }
+  RootCFAlu.getOperand(CntIdx).setImm(CumuledInsts);
+  RootCFAlu.setDesc(TII->get(LatrCFAlu.getOpcode()));
+  return true;
+}
+
+/// Scans every basic block of \p MF. For each clause marker it first absorbs
+/// any following disabled markers, then tries to merge the marker into the
+/// most recent marker that is still a valid merge target.
+///
+/// NOTE(review): this always returns false even though markers may have been
+/// erased or rewritten; confirm whether the pass should report modifications.
+bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+  TII = ST.getInstrInfo();
+
+  for (MachineBasicBlock &MBB : MF) {
+    MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+    // E doubles as the "no merge target available" sentinel.
+    MachineBasicBlock::iterator LatestCFAlu = E;
+    while (I != E) {
+      // Advance first: MI may be erased below.
+      MachineInstr &MI = *I++;
+      // An instruction that cannot live in an ALU clause, or that must end
+      // its clause, invalidates the current merge target.
+      if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
+          TII->mustBeLastInClause(MI.getOpcode()))
+        LatestCFAlu = E;
+      if (!isCFAlu(MI))
+        continue;
+      cleanPotentialDisabledCFAlu(MI);
+
+      if (LatestCFAlu != E && mergeIfPossible(*LatestCFAlu, MI)) {
+        MI.eraseFromParent();
+      } else {
+        // Use the named-operand helper rather than a hard-coded operand
+        // index to read the Enabled flag.
+        assert(isCFAluEnabled(MI) && "CF ALU instruction disabled");
+        LatestCFAlu = MI;
+      }
+    }
+  }
+  return false;
+}
+
+/// Returns the fixed, human-readable name of this pass.
+StringRef R600ClauseMergePass::getPassName() const {
+ return "R600 Merge Clause Markers Pass";
+}
+
+} // end anonymous namespace
+
+
+/// Factory for the clause-merge pass. Returns a heap-allocated pass object;
+/// \p TM is only forwarded to the pass constructor.
+llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
+ return new R600ClauseMergePass(TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
new file mode 100644
index 000000000000..45b36d3d3ebb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -0,0 +1,699 @@
+//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass turns all control flow pseudo instructions into native ones,
+/// computing their addresses on the fly; it also sets the STACK_SIZE info.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600cf"
+
+namespace {
+
+/// Bookkeeping for the control-flow stack usage of a function. Tracks the
+/// currently open branches and loops and records the largest stack size seen.
+struct CFStack {
+
+ /// Kind of stack item recorded for each open branch or loop.
+ /// getSubEntrySize() gives the number of sub-entries each kind accounts for.
+ enum StackItem {
+ ENTRY = 0,
+ SUB_ENTRY = 1,
+ FIRST_NON_WQM_PUSH = 2,
+ FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
+ };
+
+ const R600Subtarget *ST;
+ // One item per branch opened by pushBranch() and not yet popped.
+ std::vector<StackItem> BranchStack;
+ // One ENTRY item per loop opened by pushLoop() and not yet popped.
+ std::vector<StackItem> LoopStack;
+ // High-water mark, raised by updateMaxStackSize().
+ unsigned MaxStackSize;
+ // Number of full entries currently in use.
+ unsigned CurrentEntries;
+ // Number of sub-entries currently in use; updateMaxStackSize() counts four
+ // sub-entries (rounded up) as one entry.
+ unsigned CurrentSubEntries;
+
+ CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
+ // We need to reserve a stack entry for CALL_FS in vertex shaders.
+ MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0),
+ CurrentEntries(0), CurrentSubEntries(0) { }
+
+ unsigned getLoopDepth();
+ bool branchStackContains(CFStack::StackItem);
+ bool requiresWorkAroundForInst(unsigned Opcode);
+ unsigned getSubEntrySize(CFStack::StackItem Item);
+ void updateMaxStackSize();
+ void pushBranch(unsigned Opcode, bool isWQM = false);
+ void pushLoop();
+ void popBranch();
+ void popLoop();
+};
+
+/// Returns the number of loops currently open (the size of LoopStack).
+unsigned CFStack::getLoopDepth() {
+ return LoopStack.size();
+}
+
+/// Returns true if any currently open branch was recorded as \p Item.
+bool CFStack::branchStackContains(CFStack::StackItem Item) {
+  for (CFStack::StackItem Open : BranchStack)
+    if (Open == Item)
+      return true;
+  return false;
+}
+
+/// Decides whether the control-flow instruction \p Opcode needs a hardware
+/// bug work-around given the current stack state.
+bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
+  // Cayman: CF_ALU_PUSH_BEFORE inside more than one loop level.
+  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
+      getLoopDepth() > 1)
+    return true;
+
+  if (!ST->hasCFAluBug())
+    return false;
+
+  const bool IsAffectedOpcode = Opcode == AMDGPU::CF_ALU_PUSH_BEFORE ||
+                                Opcode == AMDGPU::CF_ALU_ELSE_AFTER ||
+                                Opcode == AMDGPU::CF_ALU_BREAK ||
+                                Opcode == AMDGPU::CF_ALU_CONTINUE;
+  if (!IsAffectedOpcode || CurrentSubEntries == 0)
+    return false;
+
+  if (ST->getWavefrontSize() == 64) {
+    // We are being conservative here. We only require this work-around if
+    // CurrentSubEntries > 3 &&
+    // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
+    //
+    // We have to be conservative, because we don't know for certain that
+    // our stack allocation algorithm for Evergreen/NI is correct. Applying
+    // this work-around when CurrentSubEntries > 3 allows us to over-allocate
+    // stack resources without any problems.
+    return CurrentSubEntries > 3;
+  }
+
+  assert(ST->getWavefrontSize() == 32);
+  // We are being conservative here. We only require the work-around if
+  // CurrentSubEntries > 7 &&
+  // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
+  // See the comment on the wavefront size == 64 case for why we are
+  // being conservative.
+  return CurrentSubEntries > 7;
+}
+
+/// Returns how many sub-entries a stack item of kind \p Item accounts for.
+/// Kinds not listed below (i.e. ENTRY) account for zero sub-entries.
+unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
+  switch (Item) {
+  case CFStack::FIRST_NON_WQM_PUSH:
+    assert(!ST->hasCaymanISA());
+    // R600/R700: +1 for the push operation, +2 extra space required.
+    // Later generations: some documentation says that the extra space is not
+    // necessary on Evergreen, but experimentation has shown that we need to
+    // allocate 1 extra sub-entry for the first non-WQM push, so +1 for the
+    // push operation and +1 extra space required.
+    return ST->getGeneration() <= R600Subtarget::R700 ? 3 : 2;
+  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
+    assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
+    // +1 For the push operation.
+    // +1 Extra space required.
+    return 2;
+  case CFStack::SUB_ENTRY:
+    return 1;
+  default:
+    return 0;
+  }
+}
+
+/// Recomputes the current stack size (full entries plus sub-entries rounded
+/// up to groups of four) and raises MaxStackSize if it was exceeded.
+void CFStack::updateMaxStackSize() {
+  const unsigned SubEntrySlots = alignTo(CurrentSubEntries, 4) / 4;
+  const unsigned CurrentStackSize = CurrentEntries + SubEntrySlots;
+  if (CurrentStackSize > MaxStackSize)
+    MaxStackSize = CurrentStackSize;
+}
+
+/// Records a newly opened branch. Only CF_PUSH_EG and CF_ALU_PUSH_BEFORE with
+/// \p isWQM false can produce a sub-entry item; every other case is recorded
+/// as a full ENTRY.
+void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
+  const bool IsPushOpcode =
+      Opcode == AMDGPU::CF_PUSH_EG || Opcode == AMDGPU::CF_ALU_PUSH_BEFORE;
+
+  CFStack::StackItem Item = CFStack::ENTRY;
+  if (IsPushOpcode && !isWQM) {
+    if (!ST->hasCaymanISA() &&
+        !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) {
+      // May not be required on Evergreen/NI.
+      // See comment in CFStack::getSubEntrySize()
+      Item = CFStack::FIRST_NON_WQM_PUSH;
+    } else if (CurrentEntries > 0 &&
+               ST->getGeneration() > R600Subtarget::EVERGREEN &&
+               !ST->hasCaymanISA() &&
+               !branchStackContains(
+                   CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) {
+      Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
+    } else {
+      Item = CFStack::SUB_ENTRY;
+    }
+  }
+
+  BranchStack.push_back(Item);
+  if (Item == CFStack::ENTRY)
+    ++CurrentEntries;
+  else
+    CurrentSubEntries += getSubEntrySize(Item);
+  updateMaxStackSize();
+}
+
+/// Records a newly opened loop: it always takes one full entry, and the
+/// stack high-water mark is refreshed afterwards.
+void CFStack::pushLoop() {
+ LoopStack.push_back(CFStack::ENTRY);
+ CurrentEntries++;
+ updateMaxStackSize();
+}
+
+/// Closes the most recently opened branch and gives back the entry or
+/// sub-entries it accounted for.
+void CFStack::popBranch() {
+  const CFStack::StackItem Popped = BranchStack.back();
+  BranchStack.pop_back();
+  if (Popped == CFStack::ENTRY) {
+    --CurrentEntries;
+    return;
+  }
+  CurrentSubEntries -= getSubEntrySize(Popped);
+}
+
+/// Closes the most recently opened loop and gives back its full entry.
+/// There is no check that a loop is actually open.
+void CFStack::popLoop() {
+ CurrentEntries--;
+ LoopStack.pop_back();
+}
+
+class R600ControlFlowFinalizer : public MachineFunctionPass {
+
+private:
+ typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
+
+ /// Generation-independent names for the native control-flow instructions.
+ /// getHWInstrDesc() maps each one to the opcode of the current subtarget.
+ enum ControlFlowInstruction {
+ CF_TC,
+ CF_VC,
+ CF_CALL_FS,
+ CF_WHILE_LOOP,
+ CF_END_LOOP,
+ CF_LOOP_BREAK,
+ CF_LOOP_CONTINUE,
+ CF_JUMP,
+ CF_ELSE,
+ CF_POP,
+ CF_END
+ };
+
+ static char ID;
+ const R600InstrInfo *TII;
+ const R600RegisterInfo *TRI;
+ unsigned MaxFetchInst;
+ const R600Subtarget *ST;
+
+ /// Returns true for KILL and RETURN, which the clause builders skip over.
+ bool IsTrivialInst(MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   return Opc == AMDGPU::KILL || Opc == AMDGPU::RETURN;
+ }
+
+ /// Maps the abstract control-flow instruction \p CFI to the descriptor of
+ /// the native opcode for the current subtarget: the _EG variant from
+ /// Evergreen onwards, the _R600 variant otherwise, and CF_END_CM for CF_END
+ /// on Cayman.
+ const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
+   const bool IsEg = ST->getGeneration() >= R600Subtarget::EVERGREEN;
+   auto Select = [IsEg](unsigned EgOpcode, unsigned R600Opcode) {
+     return IsEg ? EgOpcode : R600Opcode;
+   };
+
+   unsigned Opcode = 0;
+   switch (CFI) {
+   case CF_TC:
+     Opcode = Select(AMDGPU::CF_TC_EG, AMDGPU::CF_TC_R600);
+     break;
+   case CF_VC:
+     Opcode = Select(AMDGPU::CF_VC_EG, AMDGPU::CF_VC_R600);
+     break;
+   case CF_CALL_FS:
+     Opcode = Select(AMDGPU::CF_CALL_FS_EG, AMDGPU::CF_CALL_FS_R600);
+     break;
+   case CF_WHILE_LOOP:
+     Opcode = Select(AMDGPU::WHILE_LOOP_EG, AMDGPU::WHILE_LOOP_R600);
+     break;
+   case CF_END_LOOP:
+     Opcode = Select(AMDGPU::END_LOOP_EG, AMDGPU::END_LOOP_R600);
+     break;
+   case CF_LOOP_BREAK:
+     Opcode = Select(AMDGPU::LOOP_BREAK_EG, AMDGPU::LOOP_BREAK_R600);
+     break;
+   case CF_LOOP_CONTINUE:
+     Opcode = Select(AMDGPU::CF_CONTINUE_EG, AMDGPU::CF_CONTINUE_R600);
+     break;
+   case CF_JUMP:
+     Opcode = Select(AMDGPU::CF_JUMP_EG, AMDGPU::CF_JUMP_R600);
+     break;
+   case CF_ELSE:
+     Opcode = Select(AMDGPU::CF_ELSE_EG, AMDGPU::CF_ELSE_R600);
+     break;
+   case CF_POP:
+     Opcode = Select(AMDGPU::POP_EG, AMDGPU::POP_R600);
+     break;
+   case CF_END:
+     // Cayman has its own CF_END encoding.
+     if (ST->hasCaymanISA())
+       Opcode = AMDGPU::CF_END_CM;
+     else
+       Opcode = Select(AMDGPU::CF_END_EG, AMDGPU::CF_END_R600);
+     break;
+   }
+   assert (Opcode && "No opcode selected");
+   return TII->get(Opcode);
+ }
+
+ /// Checks whether \p MI may join the fetch clause being built.
+ ///
+ /// \p DstRegs holds the 128-bit registers already written by the clause.
+ /// \p MI is rejected when its (last) register use, widened to the enclosing
+ /// 128-bit register, is one of them. Otherwise its (last) register def is
+ /// added to \p DstRegs and true is returned.
+ ///
+ /// Earlier code read the source/destination locals even when \p MI had no
+ /// register use or def, i.e. it read indeterminate values. Presence is now
+ /// tracked explicitly: a missing use can never conflict and a missing def
+ /// adds nothing to \p DstRegs. Results are unchanged for instructions that
+ /// have both a def and a use.
+ bool isCompatibleWithClause(const MachineInstr &MI,
+                             std::set<unsigned> &DstRegs) const {
+   unsigned DstMI = 0, SrcMI = 0;
+   bool HasDst = false, HasSrc = false;
+
+   // Widen a register to the R600_Reg128 register containing its channel,
+   // unless it already is a 128-bit register.
+   auto ToReg128 = [this](unsigned Reg) -> unsigned {
+     if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+       return Reg;
+     return TRI->getMatchingSuperReg(
+         Reg, TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+         &AMDGPU::R600_Reg128RegClass);
+   };
+
+   for (const MachineOperand &MO : MI.operands()) {
+     if (!MO.isReg())
+       continue;
+     if (MO.isDef()) {
+       DstMI = ToReg128(MO.getReg());
+       HasDst = true;
+     }
+     if (MO.isUse()) {
+       SrcMI = ToReg128(MO.getReg());
+       HasSrc = true;
+     }
+   }
+
+   if (HasSrc && DstRegs.find(SrcMI) != DstRegs.end())
+     return false;
+   if (HasDst)
+     DstRegs.insert(DstMI);
+   return true;
+ }
+
+ /// Builds a fetch clause starting at \p I. Consecutive instructions are
+ /// collected while they use the same cache (texture or vertex) as the first
+ /// one, stay compatible with the registers already written, and the clause
+ /// holds fewer than MaxFetchInst instructions. Trivial instructions are
+ /// skipped without ending the clause. A CF_TC/CF_VC instruction is inserted
+ /// before the first fetch; \p I is left on the first instruction that is not
+ /// part of the clause.
+ ClauseFile
+ MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+     const {
+   const MachineBasicBlock::iterator ClauseHead = I;
+   const bool IsTex = TII->usesTextureCache(*ClauseHead);
+   std::vector<MachineInstr *> ClauseContent;
+   std::set<unsigned> DstRegs;
+   unsigned FetchCount = 0;
+
+   const MachineBasicBlock::iterator BlockEnd = MBB.end();
+   while (I != BlockEnd) {
+     MachineInstr &Candidate = *I;
+     if (!IsTrivialInst(Candidate)) {
+       if (FetchCount >= MaxFetchInst)
+         break;
+       const bool SameCache = IsTex ? TII->usesTextureCache(Candidate)
+                                    : TII->usesVertexCache(Candidate);
+       if (!SameCache)
+         break;
+       if (!isCompatibleWithClause(Candidate, DstRegs))
+         break;
+       ++FetchCount;
+       ClauseContent.push_back(&Candidate);
+     }
+     ++I;
+   }
+
+   MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
+                               getHWInstrDesc(IsTex ? CF_TC : CF_VC))
+                           .addImm(0)               // ADDR
+                           .addImm(FetchCount - 1); // COUNT
+   return ClauseFile(MIb, std::move(ClauseContent));
+ }
+
+ /// Assigns literal slots for the sources of \p MI that read a literal.
+ ///
+ /// Each source whose register is ALU_LITERAL_X is redirected to one of the
+ /// four literal registers (X, Y, Z, W). \p Lits accumulates the literal
+ /// operands already assigned; a source reuses an existing slot when \p Lits
+ /// holds an immediate with the same value, otherwise the instruction's
+ /// `literal` operand is appended to \p Lits and takes the next free slot.
+ void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
+ static const unsigned LiteralRegs[] = {
+ AMDGPU::ALU_LITERAL_X,
+ AMDGPU::ALU_LITERAL_Y,
+ AMDGPU::ALU_LITERAL_Z,
+ AMDGPU::ALU_LITERAL_W
+ };
+ const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
+ TII->getSrcs(MI);
+ for (const auto &Src:Srcs) {
+ if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
+ continue;
+ int64_t Imm = Src.second;
+ // Only immediate entries can match; non-immediate entries in Lits are
+ // never reused.
+ std::vector<MachineOperand *>::iterator It =
+ find_if(Lits, [&](MachineOperand *val) {
+ return val->isImm() && (val->getImm() == Imm);
+ });
+
+ // Get corresponding Operand
+ // (looked up only once a literal source has been found in MI)
+ MachineOperand &Operand = MI.getOperand(
+ TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+
+ if (It != Lits.end()) {
+ // Reuse existing literal reg
+ unsigned Index = It - Lits.begin();
+ Src.first->setReg(LiteralRegs[Index]);
+ } else {
+ // Allocate new literal reg
+ assert(Lits.size() < 4 && "Too many literals in Instruction Group");
+ Src.first->setReg(LiteralRegs[Lits.size()]);
+ Lits.push_back(&Operand);
+ }
+ }
+ }
+
+ /// Emits one LITERALS instruction per pair of values in \p Literals; an odd
+ /// trailing value is paired with 0. Returns the last instruction built, or
+ /// \p InsertPos unchanged when \p Literals is empty.
+ ///
+ /// NOTE(review): BuildMI is called with the block pointer rather than with
+ /// \p InsertPos, so the instructions are presumably appended at the end of
+ /// the block instead of at the requested position - confirm. No caller of
+ /// this helper is visible in this part of the file.
+ MachineBasicBlock::iterator insertLiterals(
+ MachineBasicBlock::iterator InsertPos,
+ const std::vector<unsigned> &Literals) const {
+ MachineBasicBlock *MBB = InsertPos->getParent();
+ for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
+ unsigned LiteralPair0 = Literals[i];
+ unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
+ InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
+ TII->get(AMDGPU::LITERALS))
+ .addImm(LiteralPair0)
+ .addImm(LiteralPair1);
+ }
+ return InsertPos;
+ }
+
+ /// Builds an ALU clause. \p I points at the clause-head instruction on
+ /// entry. The instructions after it are collected while they are ALU
+ /// instructions or bundles (trivial instructions are skipped). Bundles are
+ /// dissolved into their members, and a LITERALS instruction is emitted
+ /// after each instruction or bundle for every pair of literals it uses.
+ /// Operand 7 of the clause head is set to the clause size minus one. \p I
+ /// is left on the first instruction after the clause.
+ ClauseFile
+ MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+ const {
+ MachineInstr &ClauseHead = *I;
+ std::vector<MachineInstr *> ClauseContent;
+ I++;
+ for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
+ if (IsTrivialInst(*I)) {
+ ++I;
+ continue;
+ }
+ if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
+ break;
+ // Literals used by this single instruction or bundle.
+ std::vector<MachineOperand *>Literals;
+ if (I->isBundle()) {
+ // Unbundle every member, clear its internal-read flags, and add it to
+ // the clause; the bundle header itself is erased afterwards.
+ MachineInstr &DeleteMI = *I;
+ MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
+ while (++BI != E && BI->isBundledWithPred()) {
+ BI->unbundleFromPred();
+ for (MachineOperand &MO : BI->operands()) {
+ if (MO.isReg() && MO.isInternalRead())
+ MO.setIsInternalRead(false);
+ }
+ getLiteral(*BI, Literals);
+ ClauseContent.push_back(&*BI);
+ }
+ I = BI;
+ DeleteMI.eraseFromParent();
+ } else {
+ getLiteral(*I, Literals);
+ ClauseContent.push_back(&*I);
+ I++;
+ }
+ // Emit the collected literals two at a time, inserted before I. Each may
+ // be an immediate or a global address; an odd trailing slot gets 0.
+ for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
+ MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
+ TII->get(AMDGPU::LITERALS));
+ if (Literals[i]->isImm()) {
+ MILit.addImm(Literals[i]->getImm());
+ } else {
+ MILit.addGlobalAddress(Literals[i]->getGlobal(),
+ Literals[i]->getOffset());
+ }
+ if (i + 1 < e) {
+ if (Literals[i + 1]->isImm()) {
+ MILit.addImm(Literals[i + 1]->getImm());
+ } else {
+ MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
+ Literals[i + 1]->getOffset());
+ }
+ } else
+ MILit.addImm(0);
+ ClauseContent.push_back(MILit);
+ }
+ }
+ assert(ClauseContent.size() < 128 && "ALU clause is too big");
+ // NOTE(review): operand 7 is presumably the COUNT operand - confirm against
+ // the CF_ALU operand list.
+ ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
+ return ClauseFile(&ClauseHead, std::move(ClauseContent));
+ }
+
+ /// Emits the body of a fetch clause: offsets the clause head's address by
+ /// \p CfCount, builds a FETCH_CLAUSE marker carrying \p CfCount, moves every
+ /// instruction of the clause in front of \p InsertPos, and advances
+ /// \p CfCount by two per instruction.
+ void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
+                      const DebugLoc &DL, ClauseFile &Clause,
+                      unsigned &CfCount) {
+   MachineInstr *Head = Clause.first;
+   std::vector<MachineInstr *> &Body = Clause.second;
+
+   CounterPropagateAddr(*Head, CfCount);
+   MachineBasicBlock *BB = Head->getParent();
+   BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
+   for (MachineInstr *Fetch : Body)
+     BB->splice(InsertPos, BB, Fetch);
+   CfCount += 2 * Body.size();
+ }
+
+ /// Emits the body of an ALU clause: resets the clause head's address to
+ /// \p CfCount, builds an ALU_CLAUSE marker carrying \p CfCount, moves every
+ /// instruction of the clause in front of \p InsertPos, and advances
+ /// \p CfCount by one per instruction.
+ void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
+                    ClauseFile &Clause, unsigned &CfCount) {
+   MachineInstr *Head = Clause.first;
+   std::vector<MachineInstr *> &Body = Clause.second;
+
+   // Start from zero so the propagated address is exactly CfCount.
+   Head->getOperand(0).setImm(0);
+   CounterPropagateAddr(*Head, CfCount);
+   MachineBasicBlock *BB = Head->getParent();
+   BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
+   for (MachineInstr *Alu : Body)
+     BB->splice(InsertPos, BB, Alu);
+   CfCount += Body.size();
+ }
+
+ /// Adds \p Addr to the immediate held in operand 0 of \p MI.
+ void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
+   MachineOperand &AddrOp = MI.getOperand(0);
+   AddrOp.setImm(Addr + AddrOp.getImm());
+ }
+ /// Adds \p Addr to operand 0 of every instruction in \p MIs.
+ void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
+                           unsigned Addr) const {
+   for (std::set<MachineInstr *>::const_iterator It = MIs.begin(),
+                                                 End = MIs.end();
+        It != End; ++It)
+     CounterPropagateAddr(**It, Addr);
+ }
+
+public:
+ /// Creates the pass. All per-function state starts out null/zero (including
+ /// MaxFetchInst, which used to be left uninitialized) and is assigned at the
+ /// start of runOnMachineFunction().
+ R600ControlFlowFinalizer(TargetMachine &tm)
+     : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MaxFetchInst(0),
+       ST(nullptr) {}
+
+ /// Rewrites the control-flow pseudo instructions of \p MF into native
+ /// control-flow instructions, block by block.
+ ///
+ /// CfCount is the running control-flow instruction count used as the address
+ /// counter; it restarts at 0 for every basic block. Fetch and ALU clauses
+ /// are collected while scanning and their bodies are emitted when RETURN is
+ /// reached. The maximum control-flow stack size seen is stored in the
+ /// machine function info.
+ ///
+ /// NOTE(review): this always returns false although instructions are
+ /// inserted and erased; confirm whether modifications should be reported.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ ST = &MF.getSubtarget<R600Subtarget>();
+ MaxFetchInst = ST->getTexVTXClauseSize();
+ TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
+
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+ CFStack CFStack(ST, MF.getFunction()->getCallingConv());
+ for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
+ ++MB) {
+ MachineBasicBlock &MBB = *MB;
+ unsigned CfCount = 0;
+ // Open loops: (CfCount at the WHILELOOP, instructions whose address is
+ // patched when the matching ENDLOOP is reached).
+ std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
+ // Pending JUMP/ELSE instructions whose address is patched at ELSE/ENDIF.
+ std::vector<MachineInstr * > IfThenElseStack;
+ // Vertex shaders get a CALL_FS at the start of each block processed here.
+ if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
+ BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
+ getHWInstrDesc(CF_CALL_FS));
+ CfCount++;
+ }
+ std::vector<ClauseFile> FetchClauses, AluClauses;
+ // Per if-nesting level: the CF_ALU seen immediately before the current
+ // instruction, if any. Used to turn "CF_ALU; ENDIF" into CF_ALU_POP_AFTER.
+ std::vector<MachineInstr *> LastAlu(1);
+ std::vector<MachineInstr *> ToPopAfter;
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E;) {
+ if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
+ DEBUG(dbgs() << CfCount << ":"; I->dump(););
+ FetchClauses.push_back(MakeFetchClause(MBB, I));
+ CfCount++;
+ LastAlu.back() = nullptr;
+ continue;
+ }
+
+ MachineBasicBlock::iterator MI = I;
+ if (MI->getOpcode() != AMDGPU::ENDIF)
+ LastAlu.back() = nullptr;
+ if (MI->getOpcode() == AMDGPU::CF_ALU)
+ LastAlu.back() = &*MI;
+ I++;
+ bool RequiresWorkAround =
+ CFStack.requiresWorkAroundForInst(MI->getOpcode());
+ switch (MI->getOpcode()) {
+ case AMDGPU::CF_ALU_PUSH_BEFORE:
+ if (RequiresWorkAround) {
+ // Split into an explicit CF_PUSH_EG followed by a plain CF_ALU.
+ DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
+ .addImm(CfCount + 1)
+ .addImm(1);
+ MI->setDesc(TII->get(AMDGPU::CF_ALU));
+ CfCount++;
+ CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+ } else
+ CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
+
+ // Intentional fall through: the ALU clause is built the same way.
+ case AMDGPU::CF_ALU:
+ I = MI;
+ AluClauses.push_back(MakeALUClause(MBB, I));
+ DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+ CfCount++;
+ break;
+ case AMDGPU::WHILELOOP: {
+ CFStack.pushLoop();
+ MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+ getHWInstrDesc(CF_WHILE_LOOP))
+ .addImm(1);
+ std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
+ std::set<MachineInstr *>());
+ Pair.second.insert(MIb);
+ LoopStack.push_back(std::move(Pair));
+ MI->eraseFromParent();
+ CfCount++;
+ break;
+ }
+ case AMDGPU::ENDLOOP: {
+ CFStack.popLoop();
+ std::pair<unsigned, std::set<MachineInstr *> > Pair =
+ std::move(LoopStack.back());
+ LoopStack.pop_back();
+ // Patch the loop start and every BREAK/CONTINUE recorded for this loop.
+ CounterPropagateAddr(Pair.second, CfCount);
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
+ .addImm(Pair.first + 1);
+ MI->eraseFromParent();
+ CfCount++;
+ break;
+ }
+ case AMDGPU::IF_PREDICATE_SET: {
+ LastAlu.push_back(nullptr);
+ MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+ getHWInstrDesc(CF_JUMP))
+ .addImm(0)
+ .addImm(0);
+ IfThenElseStack.push_back(MIb);
+ DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ MI->eraseFromParent();
+ CfCount++;
+ break;
+ }
+ case AMDGPU::ELSE: {
+ // Patch the pending JUMP, then push the ELSE to be patched at ENDIF.
+ MachineInstr * JumpInst = IfThenElseStack.back();
+ IfThenElseStack.pop_back();
+ CounterPropagateAddr(*JumpInst, CfCount);
+ MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+ getHWInstrDesc(CF_ELSE))
+ .addImm(0)
+ .addImm(0);
+ DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ IfThenElseStack.push_back(MIb);
+ MI->eraseFromParent();
+ CfCount++;
+ break;
+ }
+ case AMDGPU::ENDIF: {
+ CFStack.popBranch();
+ // A CF_ALU directly before the ENDIF is converted to CF_ALU_POP_AFTER
+ // later; otherwise an explicit POP is emitted.
+ if (LastAlu.back()) {
+ ToPopAfter.push_back(LastAlu.back());
+ } else {
+ MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+ getHWInstrDesc(CF_POP))
+ .addImm(CfCount + 1)
+ .addImm(1);
+ (void)MIb;
+ DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ CfCount++;
+ }
+
+ MachineInstr *IfOrElseInst = IfThenElseStack.back();
+ IfThenElseStack.pop_back();
+ CounterPropagateAddr(*IfOrElseInst, CfCount);
+ IfOrElseInst->getOperand(1).setImm(1);
+ LastAlu.pop_back();
+ MI->eraseFromParent();
+ break;
+ }
+ case AMDGPU::BREAK: {
+ CfCount ++;
+ MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+ getHWInstrDesc(CF_LOOP_BREAK))
+ .addImm(0);
+ LoopStack.back().second.insert(MIb);
+ MI->eraseFromParent();
+ break;
+ }
+ case AMDGPU::CONTINUE: {
+ MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+ getHWInstrDesc(CF_LOOP_CONTINUE))
+ .addImm(0);
+ LoopStack.back().second.insert(MIb);
+ MI->eraseFromParent();
+ CfCount++;
+ break;
+ }
+ case AMDGPU::RETURN: {
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
+ CfCount++;
+ // Pad so that the control-flow instruction count is even.
+ if (CfCount % 2) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
+ CfCount++;
+ }
+ MI->eraseFromParent();
+ // Emit the bodies of all clauses collected so far in this block.
+ for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
+ EmitFetchClause(I, DL, FetchClauses[i], CfCount);
+ for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
+ EmitALUClause(I, DL, AluClauses[i], CfCount);
+ break;
+ }
+ default:
+ if (TII->isExport(MI->getOpcode())) {
+ DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+ CfCount++;
+ }
+ break;
+ }
+ }
+ // Replace each recorded CF_ALU by a CF_ALU_POP_AFTER carrying the same
+ // nine immediate operands.
+ for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
+ MachineInstr *Alu = ToPopAfter[i];
+ BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
+ TII->get(AMDGPU::CF_ALU_POP_AFTER))
+ .addImm(Alu->getOperand(0).getImm())
+ .addImm(Alu->getOperand(1).getImm())
+ .addImm(Alu->getOperand(2).getImm())
+ .addImm(Alu->getOperand(3).getImm())
+ .addImm(Alu->getOperand(4).getImm())
+ .addImm(Alu->getOperand(5).getImm())
+ .addImm(Alu->getOperand(6).getImm())
+ .addImm(Alu->getOperand(7).getImm())
+ .addImm(Alu->getOperand(8).getImm());
+ Alu->eraseFromParent();
+ }
+ // Written after every block; the value is the maximum seen so far.
+ MFI->CFStackSize = CFStack.MaxStackSize;
+ }
+
+ return false;
+ }
+
+ /// Returns the fixed, human-readable name of this pass.
+ StringRef getPassName() const override {
+ return "R600 Control Flow Finalizer Pass";
+ }
+};
+
+char R600ControlFlowFinalizer::ID = 0;
+
+} // end anonymous namespace
+
+
+/// Factory for the control-flow finalizer pass. Returns a heap-allocated pass
+/// object; \p TM is only forwarded to the pass constructor.
+llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
+ return new R600ControlFlowFinalizer(TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Defines.h b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h
new file mode 100644
index 000000000000..534461adc59f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h
@@ -0,0 +1,171 @@
+//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
+#define LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
+
+#include "llvm/MC/MCRegisterInfo.h"
+
+// Operand Flags
+#define MO_FLAG_CLAMP (1 << 0)
+#define MO_FLAG_NEG (1 << 1)
+#define MO_FLAG_ABS (1 << 2)
+#define MO_FLAG_MASK (1 << 3)
+#define MO_FLAG_PUSH (1 << 4)
+#define MO_FLAG_NOT_LAST (1 << 5)
+#define MO_FLAG_LAST (1 << 6)
+#define NUM_MO_FLAGS 7
+
+/// \brief Helper for getting the operand index for the instruction flags
+/// operand.
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
+
+/// Bit flags describing R600 instructions. IS_VTX, IS_TEX and
+/// HAS_NATIVE_OPERANDS test them against a TSFlags / Flags value.
+namespace R600_InstFlag {
+ enum TIF {
+ TRANS_ONLY = (1 << 0),
+ TEX = (1 << 1),
+ REDUCTION = (1 << 2),
+ FC = (1 << 3),
+ TRIG = (1 << 4),
+ OP3 = (1 << 5),
+ VECTOR = (1 << 6),
+ // Bits 7 and 8 are not flags: they hold the two-bit flag-operand index
+ // extracted by GET_FLAG_OPERAND_IDX.
+ //FlagOperand bits 7, 8
+ NATIVE_OPERANDS = (1 << 9),
+ OP1 = (1 << 10),
+ OP2 = (1 << 11),
+ VTX_INST = (1 << 12),
+ TEX_INST = (1 << 13),
+ ALU_INST = (1 << 14),
+ LDS_1A = (1 << 15),
+ LDS_1A1D = (1 << 16),
+ IS_EXPORT = (1 << 17),
+ LDS_1A2D = (1 << 18)
+ };
+}
+
+#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
+
+/// \brief Defines for extracting register information from register encoding
+#define HW_REG_MASK 0x1ff
+#define HW_CHAN_SHIFT 9
+
+#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT)
+#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK)
+
+#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST)
+#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST)
+
+namespace OpName {
+
+ /// Operand names for vector instructions: the same 17 names repeated for
+ /// each of the X, Y, Z and W channels, followed by IMM_0 and IMM_1.
+ /// VEC_COUNT is the total number of names (4 * 17 + 2 = 70).
+ enum VecOps {
+ // X channel
+ UPDATE_EXEC_MASK_X,
+ UPDATE_PREDICATE_X,
+ WRITE_X,
+ OMOD_X,
+ DST_REL_X,
+ CLAMP_X,
+ SRC0_X,
+ SRC0_NEG_X,
+ SRC0_REL_X,
+ SRC0_ABS_X,
+ SRC0_SEL_X,
+ SRC1_X,
+ SRC1_NEG_X,
+ SRC1_REL_X,
+ SRC1_ABS_X,
+ SRC1_SEL_X,
+ PRED_SEL_X,
+ // Y channel
+ UPDATE_EXEC_MASK_Y,
+ UPDATE_PREDICATE_Y,
+ WRITE_Y,
+ OMOD_Y,
+ DST_REL_Y,
+ CLAMP_Y,
+ SRC0_Y,
+ SRC0_NEG_Y,
+ SRC0_REL_Y,
+ SRC0_ABS_Y,
+ SRC0_SEL_Y,
+ SRC1_Y,
+ SRC1_NEG_Y,
+ SRC1_REL_Y,
+ SRC1_ABS_Y,
+ SRC1_SEL_Y,
+ PRED_SEL_Y,
+ // Z channel
+ UPDATE_EXEC_MASK_Z,
+ UPDATE_PREDICATE_Z,
+ WRITE_Z,
+ OMOD_Z,
+ DST_REL_Z,
+ CLAMP_Z,
+ SRC0_Z,
+ SRC0_NEG_Z,
+ SRC0_REL_Z,
+ SRC0_ABS_Z,
+ SRC0_SEL_Z,
+ SRC1_Z,
+ SRC1_NEG_Z,
+ SRC1_REL_Z,
+ SRC1_ABS_Z,
+ SRC1_SEL_Z,
+ PRED_SEL_Z,
+ // W channel
+ UPDATE_EXEC_MASK_W,
+ UPDATE_PREDICATE_W,
+ WRITE_W,
+ OMOD_W,
+ DST_REL_W,
+ CLAMP_W,
+ SRC0_W,
+ SRC0_NEG_W,
+ SRC0_REL_W,
+ SRC0_ABS_W,
+ SRC0_SEL_W,
+ SRC1_W,
+ SRC1_NEG_W,
+ SRC1_REL_W,
+ SRC1_ABS_W,
+ SRC1_SEL_W,
+ PRED_SEL_W,
+ // Not per-channel
+ IMM_0,
+ IMM_1,
+ VEC_COUNT
+ };
+
+}
+
+//===----------------------------------------------------------------------===//
+// Config register definitions
+//===----------------------------------------------------------------------===//
+
+#define R_02880C_DB_SHADER_CONTROL 0x02880C
+#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6)
+
+// These fields are the same for all shader types and families.
+#define S_NUM_GPRS(x) (((x) & 0xFF) << 0)
+#define S_STACK_SIZE(x) (((x) & 0xFF) << 8)
+//===----------------------------------------------------------------------===//
+// R600, R700 Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028850_SQ_PGM_RESOURCES_PS 0x028850
+#define R_028868_SQ_PGM_RESOURCES_VS 0x028868
+
+//===----------------------------------------------------------------------===//
+// Evergreen, Northern Islands Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028844_SQ_PGM_RESOURCES_PS 0x028844
+#define R_028860_SQ_PGM_RESOURCES_VS 0x028860
+#define R_028878_SQ_PGM_RESOURCES_GS 0x028878
+#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
+
+#define R_0288E8_SQ_LDS_ALLOC 0x0288E8
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
new file mode 100644
index 000000000000..9a5db6ccc672
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -0,0 +1,339 @@
+//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Add CF_ALU. R600 ALU instructions are grouped in clauses which can hold
+/// 128 ALU instructions; these instructions can access up to 4 prefetched
+/// lines of 16 registers from constant buffers. Such ALU clauses are
+/// initiated by CF_ALU instructions.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+// Forward declaration of the registration hook that the R600EmitClauseMarkers
+// constructor calls with the global PassRegistry.
+namespace llvm {
+ void initializeR600EmitClauseMarkersPass(PassRegistry&);
+}
+
+namespace {
+
+class R600EmitClauseMarkers : public MachineFunctionPass {
+
+private:
+ const R600InstrInfo *TII;
+ int Address;
+
+ /// Returns the size that \p MI is charged with when it is added to an ALU
+ /// clause (MakeALUClause accumulates this value into its instruction count).
+ ///
+ /// KILL costs nothing, LDS "ret" instructions cost 2, the interpolation /
+ /// DOT_4 / vector / cube / reduction instructions cost 4, and every other
+ /// instruction costs 1 plus one per ALU_LITERAL_X register operand.
+ unsigned OccupiedDwords(MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+
+   // KILL takes no room in the clause.
+   if (Opc == AMDGPU::KILL)
+     return 0;
+
+   // Opcodes that are always charged with four slots.
+   if (Opc == AMDGPU::INTERP_PAIR_XY || Opc == AMDGPU::INTERP_PAIR_ZW ||
+       Opc == AMDGPU::INTERP_VEC_LOAD || Opc == AMDGPU::DOT_4)
+     return 4;
+
+   // These will be expanded to two ALU instructions in the
+   // ExpandSpecialInstructions pass.
+   if (TII->isLDSRetInstr(Opc))
+     return 2;
+
+   if (TII->isVector(MI) || TII->isCubeOp(Opc) || TII->isReductionOp(Opc))
+     return 4;
+
+   // One slot for the instruction itself, plus one for each literal operand.
+   unsigned Dwords = 1;
+   for (MachineInstr::mop_iterator Op = MI.operands_begin(),
+                                   OpEnd = MI.operands_end();
+        Op != OpEnd; ++Op) {
+     if (Op->isReg() && Op->getReg() == AMDGPU::ALU_LITERAL_X)
+       ++Dwords;
+   }
+   return Dwords;
+ }
+
+ /// Returns true if \p MI may be placed inside an ALU clause: anything the
+ /// instruction info classifies as ALU, vector or cube, plus a fixed list of
+ /// pseudo opcodes (PRED_X, the INTERP_* instructions, COPY and DOT_4).
+ bool isALU(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   if (TII->isALUInstr(Opc) || TII->isVector(MI) || TII->isCubeOp(Opc))
+     return true;
+
+   return Opc == AMDGPU::PRED_X || Opc == AMDGPU::INTERP_PAIR_XY ||
+          Opc == AMDGPU::INTERP_PAIR_ZW || Opc == AMDGPU::INTERP_VEC_LOAD ||
+          Opc == AMDGPU::COPY || Opc == AMDGPU::DOT_4;
+ }
+
+ /// Returns true for KILL, RETURN and IMPLICIT_DEF. MakeALUClause steps over
+ /// these without ending the clause and without counting them.
+ bool IsTrivialInst(MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   return Opc == AMDGPU::KILL || Opc == AMDGPU::RETURN ||
+          Opc == AMDGPU::IMPLICIT_DEF;
+ }
+
+ /// Decodes a constant selector into the (KC_BANK, line) pair it touches.
+ ///
+ /// Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
+ /// (See also R600ISelLowering.cpp)
+ /// ConstIndex value is in [0, 4095];
+ ///
+ /// \returns first = KC_BANK, second = even line number containing ConstIndex.
+ std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
+   // Drop the two low bits and the 512 bias.
+   const unsigned Offset = (Sel >> 2) - 512;
+   const unsigned Bank = Offset >> 12;
+   const unsigned ConstIndex = Offset & 4095;
+   // A line contains 16 constant registers however KCX bank can lock
+   // two line at the same time ; thus we want to get an even line number.
+   // Line number can be retrieved with (>>4), using (>>5) <<1 generates
+   // an even number.
+   const unsigned EvenLine = (ConstIndex >> 5) << 1;
+   return std::pair<unsigned, unsigned>(Bank, EvenLine);
+ }
+
+ /// Assigns every ALU_CONST source of \p MI to one of the (at most two)
+ /// constant cache sets recorded in \p CachedConsts.
+ ///
+ /// \param MI            instruction to process; anything that is neither an
+ ///                      ALU instruction nor DOT_4 is accepted untouched.
+ /// \param CachedConsts  (bank, line) pairs locked so far. Missing pairs are
+ ///                      appended while there are fewer than two entries.
+ /// \param UpdateInstr   when true, each ALU_CONST register is replaced by the
+ ///                      matching register of R600_KC0 / R600_KC1.
+ /// \returns false when a constant needs a (bank, line) pair that matches
+ ///          neither set and no free set is left. Pairs appended before the
+ ///          failure remain in \p CachedConsts.
+ bool
+ SubstituteKCacheBank(MachineInstr &MI,
+                      std::vector<std::pair<unsigned, unsigned>> &CachedConsts,
+                      bool UpdateInstr = true) const {
+   if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
+     return true;
+
+   const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Srcs =
+       TII->getSrcs(MI);
+   assert(
+       (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) &&
+       "Can't assign Const");
+
+   // Pass 1: choose a cache set (0 or 1) for every constant source and
+   // remember the register index inside that set.
+   std::vector<std::pair<unsigned, unsigned> > Assigned;
+   for (unsigned Idx = 0, NumSrcs = Srcs.size(); Idx != NumSrcs; ++Idx) {
+     if (Srcs[Idx].first->getReg() != AMDGPU::ALU_CONST)
+       continue;
+
+     const unsigned Sel = Srcs[Idx].second;
+     const unsigned Chan = Sel & 3;
+     const unsigned Index = ((Sel >> 2) - 512) & 31;
+     const unsigned KCacheIndex = Index * 4 + Chan;
+     const std::pair<unsigned, unsigned> BankLine = getAccessedBankLine(Sel);
+
+     unsigned Set;
+     if (CachedConsts.empty()) {
+       CachedConsts.push_back(BankLine);
+       Set = 0;
+     } else if (CachedConsts[0] == BankLine) {
+       Set = 0;
+     } else if (CachedConsts.size() == 1) {
+       CachedConsts.push_back(BankLine);
+       Set = 1;
+     } else if (CachedConsts[1] == BankLine) {
+       Set = 1;
+     } else {
+       // Both sets are taken by other lines.
+       return false;
+     }
+     Assigned.push_back(std::pair<unsigned, unsigned>(Set, KCacheIndex));
+   }
+
+   if (!UpdateInstr)
+     return true;
+
+   // Pass 2: rewrite the constant operands, consuming Assigned in order.
+   unsigned Next = 0;
+   for (unsigned Idx = 0, NumSrcs = Srcs.size(); Idx != NumSrcs; ++Idx) {
+     MachineOperand *SrcOp = Srcs[Idx].first;
+     if (SrcOp->getReg() != AMDGPU::ALU_CONST)
+       continue;
+
+     const std::pair<unsigned, unsigned> &Slot = Assigned[Next];
+     switch (Slot.first) {
+     case 0:
+       SrcOp->setReg(AMDGPU::R600_KC0RegClass.getRegister(Slot.second));
+       break;
+     case 1:
+       SrcOp->setReg(AMDGPU::R600_KC1RegClass.getRegister(Slot.second));
+       break;
+     default:
+       llvm_unreachable("Wrong Cache Line");
+     }
+     ++Next;
+   }
+   return true;
+ }
+
+ /// Checks whether a clause-local register defined by \p Def can be consumed
+ /// before the clause currently being built has to end.
+ ///
+ /// Only the first register definition of \p Def for which
+ /// isPhysRegLiveAcrossClauses() is false is examined; the function returns
+ /// as soon as that operand has been processed.
+ ///
+ /// \param AluInstCount  size already accumulated for the clause. Passed by
+ ///                      value, so the projection made here is local.
+ /// \param KCacheBanks   constant cache lines locked so far. Passed by value,
+ ///                      so probing does not modify the caller's vector.
+ /// \param Def           instruction whose definitions are checked.
+ /// \param BBEnd         end iterator of the block containing \p Def.
+ /// \returns false if the KCache or size limit is hit while scanning forward
+ ///          from \p Def; true if no clause-local definition exists or its
+ ///          last use fits.
+ bool canClauseLocalKillFitInClause(
+     unsigned AluInstCount,
+     std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
+     MachineBasicBlock::iterator Def,
+     MachineBasicBlock::iterator BBEnd) {
+   const R600RegisterInfo &TRI = TII->getRegisterInfo();
+   for (MachineInstr::const_mop_iterator
+            MOI = Def->operands_begin(),
+            MOE = Def->operands_end(); MOI != MOE; ++MOI) {
+     if (!MOI->isReg() || !MOI->isDef() ||
+         TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
+       continue;
+
+     // Def defines a clause local register, so check that its use will fit
+     // in the clause.
+     unsigned LastUseCount = 0;
+     for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
+       AluInstCount += OccupiedDwords(*UseI);
+       // Make sure we won't need to end the clause due to KCache limitations.
+       if (!SubstituteKCacheBank(*UseI, KCacheBanks, false))
+         return false;
+
+       // We have reached the maximum instruction limit before finding the
+       // use that kills this register, so we cannot use this def in the
+       // current clause.
+       if (AluInstCount >= TII->getMaxAlusPerClause())
+         return false;
+
+       // Register kill flags have been cleared by the time we get to this
+       // pass, but it is safe to assume that all uses of this register
+       // occur in the same basic block as its definition, because
+       // it is illegal for the scheduler to schedule them in
+       // different blocks.
+       //
+       // findRegisterUseOperandIdx() reports "no use" as -1, so the result
+       // has to be compared against -1. A bare truth test would treat
+       // "not found" as a use and operand index 0 as no use.
+       if (UseI->findRegisterUseOperandIdx(MOI->getReg()) != -1)
+         LastUseCount = AluInstCount;
+
+       // Stop at the next instruction that redefines the register.
+       if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
+         break;
+     }
+     if (LastUseCount)
+       return LastUseCount <= TII->getMaxAlusPerClause();
+     llvm_unreachable("Clause local register live at end of clause.");
+   }
+   return true;
+ }
+
+ // Builds one ALU clause starting at I and inserts its CF_ALU (or
+ // CF_ALU_PUSH_BEFORE) marker in front of the first instruction.
+ //
+ // Instructions are consumed from I until the block ends, a non-ALU
+ // instruction is met, the size budget is exceeded, a PRED_X is found after
+ // other ALU instructions, an instruction must be last in its clause, or the
+ // constant cache cannot serve the next instruction.
+ //
+ // Returns the iterator at which scanning stopped, i.e. the first instruction
+ // that is not part of the clause.
+ MachineBasicBlock::iterator
+ MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator ClauseHead = I;
+ // (bank, line) pairs locked for this clause; at most two entries are used.
+ std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
+ bool PushBeforeModifier = false;
+ unsigned AluInstCount = 0;
+ for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
+ // KILL / RETURN / IMPLICIT_DEF neither end the clause nor count.
+ if (IsTrivialInst(*I))
+ continue;
+ if (!isALU(*I))
+ break;
+ if (AluInstCount > TII->getMaxAlusPerClause())
+ break;
+ if (I->getOpcode() == AMDGPU::PRED_X) {
+ // We put PRED_X in its own clause to ensure that ifcvt won't create
+ // clauses with more than 128 insts.
+ // IfCvt is indeed checking that "then" and "else" branches of an if
+ // statement have less than ~60 insts thus converted clauses can't be
+ // bigger than ~121 insts (predicate setter needs to be in the same
+ // clause as predicated alus).
+ if (AluInstCount > 0)
+ break;
+ if (TII->getFlagOp(*I).getImm() & MO_FLAG_PUSH)
+ PushBeforeModifier = true;
+ AluInstCount ++;
+ continue;
+ }
+ // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as:
+ //
+ // * KILL or INTERP instructions
+ // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits
+ // * Uses waterfalling (i.e. INDEX_MODE = AR.X)
+ //
+ // XXX: These checks have not been implemented yet.
+ // NOTE(review): on this path the instruction is stepped over without
+ // being added to AluInstCount or run through SubstituteKCacheBank --
+ // confirm that this is intended.
+ if (TII->mustBeLastInClause(I->getOpcode())) {
+ I++;
+ break;
+ }
+
+ // If this instruction defines a clause local register, make sure
+ // its use can fit in this clause.
+ if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
+ break;
+
+ // NOTE(review): SubstituteKCacheBank can append to KCacheBanks before it
+ // returns false, so a line requested only by the rejected instruction may
+ // still show up in KB/KLINE below -- confirm whether that matters.
+ if (!SubstituteKCacheBank(*I, KCacheBanks))
+ break;
+ AluInstCount += OccupiedDwords(*I);
+ }
+ unsigned Opcode = PushBeforeModifier ?
+ AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
+ BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
+ // We don't use the ADDR field until R600ControlFlowFinalizer pass, where
+ // it is safe to assume it is 0. However if we always put 0 here, the ifcvt
+ // pass may assume that identical ALU clause starter at the beginning of a
+ // true and false branch can be factorized which is not the case.
+ .addImm(Address++) // ADDR
+ .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
+ .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
+ .addImm(KCacheBanks.empty()?0:2) // KM0
+ .addImm((KCacheBanks.size() < 2)?0:2) // KM1
+ .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
+ .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
+ .addImm(AluInstCount) // COUNT
+ .addImm(1); // Enabled
+ return I;
+ }
+
+public:
+ static char ID;
+ // Starts with no instruction info (TII is set in runOnMachineFunction) and
+ // an ADDR counter of 0, then registers the pass with the global registry.
+ R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
+
+ initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// Walks every basic block of \p MF and starts an ALU clause (via
+ /// MakeALUClause) at each ALU instruction that is not yet inside one.
+ /// Blocks whose first instruction is already a CF_ALU are skipped.
+ ///
+ /// \returns true if at least one clause marker was inserted, false if the
+ ///          function was left untouched.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+   const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+   TII = ST.getInstrInfo();
+
+   // MakeALUClause always inserts a CF_ALU, so any call means the function
+   // changed and must be reported to the pass manager.
+   bool Changed = false;
+   for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+        BB != BB_E; ++BB) {
+     MachineBasicBlock &MBB = *BB;
+     MachineBasicBlock::iterator I = MBB.begin();
+     if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU)
+       continue; // BB was already parsed
+     for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
+       if (isALU(*I)) {
+         I = MakeALUClause(MBB, I);
+         Changed = true;
+       } else {
+         ++I;
+       }
+     }
+   }
+   return Changed;
+ }
+
+ // Human-readable name of this pass.
+ StringRef getPassName() const override {
+ return "R600 Emit Clause Markers Pass";
+ }
+};
+
+char R600EmitClauseMarkers::ID = 0;
+
+} // end anonymous namespace
+
+// Registers the pass under the command-line name "emitclausemarkers".
+// The description previously read "Markters"; it now matches the spelling
+// returned by getPassName().
+INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
+                      "R600 Emit Clause Markers", false, false)
+INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
+                    "R600 Emit Clause Markers", false, false)
+
+// Factory: returns a newly allocated R600EmitClauseMarkers pass; the caller
+// receives the raw pointer.
+llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
+ return new R600EmitClauseMarkers();
+}
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
new file mode 100644
index 000000000000..3e46e6387614
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -0,0 +1,270 @@
+//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Vector, Reduction, and Cube instructions need to fill the entire instruction
+/// group to work correctly. This pass expands these individual instructions
+/// into several instructions that will completely fill the instruction group.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+// Machine function pass that rewrites the R600 "special" instructions
+// (LDS ret, PRED_X, DOT_4, reduction, vector and cube instructions) into the
+// instruction sequences built in runOnMachineFunction.
+class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
+private:
+ static char ID;
+ // Instruction info of the current subtarget; set in runOnMachineFunction.
+ const R600InstrInfo *TII;
+
+ // Copies the immediate operand named Op from OldMI to NewMI, if OldMI has it.
+ void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI,
+ unsigned Op);
+
+public:
+ // The TargetMachine argument is not used by the constructor body.
+ R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
+ TII(nullptr) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "R600 Expand special instructions pass";
+ }
+};
+
+} // End anonymous namespace
+
+char R600ExpandSpecialInstrsPass::ID = 0;
+
+// Factory: returns a newly allocated R600ExpandSpecialInstrsPass built from TM.
+FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
+ return new R600ExpandSpecialInstrsPass(TM);
+}
+
+/// Propagates the immediate operand named \p Op from \p OldMI to \p NewMI.
+/// Nothing happens when \p OldMI has no such operand (index lookup yields a
+/// negative value).
+void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
+                                                 const MachineInstr *OldMI,
+                                                 unsigned Op) {
+  const int OpIdx = TII->getOperandIdx(*OldMI, Op);
+  if (OpIdx < 0)
+    return;
+
+  const uint64_t Val = OldMI->getOperand(OpIdx).getImm();
+  TII->setImmOperand(*NewMI, Op, Val);
+}
+
+// Expands the special instructions of MF in place:
+//  * LDS "ret" instructions get a MOV inserted after them and their dst is
+//    redirected to OQAP;
+//  * PRED_X is replaced by the native opcode stored in its operand 2;
+//  * DOT_4 is replaced by four bundled per-channel slot instructions;
+//  * reduction, vector and cube instructions are replaced by four bundled
+//    per-channel copies.
+// NOTE(review): the function always returns false although it inserts and
+// erases instructions -- confirm whether it should report a modification.
+bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ TII = ST.getInstrInfo();
+
+ const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+ BB != BB_E; ++BB) {
+ MachineBasicBlock &MBB = *BB;
+ MachineBasicBlock::iterator I = MBB.begin();
+ while (I != MBB.end()) {
+ MachineInstr &MI = *I;
+ // Advance first: MI may be erased below, and new instructions are
+ // inserted in front of I, i.e. right after MI.
+ I = std::next(I);
+
+ // Expand LDS_*_RET instructions
+ if (TII->isLDSRetInstr(MI.getOpcode())) {
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ assert(DstIdx != -1);
+ MachineOperand &DstOp = MI.getOperand(DstIdx);
+ // The MOV is built from the original dst register and OQAP (presumably
+ // dst <- OQAP; verify against buildMovInstr), then the LDS instruction
+ // itself is made to write OQAP.
+ MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
+ DstOp.getReg(), AMDGPU::OQAP);
+ DstOp.setReg(AMDGPU::OQAP);
+ int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::pred_sel);
+ int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
+ AMDGPU::OpName::pred_sel);
+ // Copy the pred_sel bit
+ Mov->getOperand(MovPredSelIdx).setReg(
+ MI.getOperand(LDSPredSelIdx).getReg());
+ }
+
+ switch (MI.getOpcode()) {
+ default: break;
+ // Expand PRED_X to one of the PRED_SET instructions.
+ case AMDGPU::PRED_X: {
+ uint64_t Flags = MI.getOperand(3).getImm();
+ // The native opcode used by PRED_X is stored as an immediate in the
+ // third operand.
+ MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
+ MI.getOperand(2).getImm(), // opcode
+ MI.getOperand(0).getReg(), // dst
+ MI.getOperand(1).getReg(), // src0
+ AMDGPU::ZERO); // src1
+ TII->addFlag(*PredSet, 0, MO_FLAG_MASK);
+ // MO_FLAG_PUSH selects update_exec_mask; otherwise update_pred is set.
+ if (Flags & MO_FLAG_PUSH) {
+ TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1);
+ } else {
+ TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1);
+ }
+ MI.eraseFromParent();
+ continue;
+ }
+ case AMDGPU::DOT_4: {
+
+ // NOTE(review): this TRI shadows the identically initialized TRI
+ // declared at the top of the function.
+ const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+
+ for (unsigned Chan = 0; Chan < 4; ++Chan) {
+ // Only the channel the original dst lives in keeps its write.
+ bool Mask = (Chan != TRI.getHWRegChan(DstReg));
+ unsigned SubDstReg =
+ AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ MachineInstr *BMI =
+ TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
+ if (Chan > 0) {
+ BMI->bundleWithPred();
+ }
+ if (Mask) {
+ TII->addFlag(*BMI, 0, MO_FLAG_MASK);
+ }
+ if (Chan != 3)
+ TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
+ unsigned Opcode = BMI->getOpcode();
+ // While not strictly necessary from hw point of view, we force
+ // all src operands of a dot4 inst to belong to the same slot.
+ unsigned Src0 = BMI->getOperand(
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
+ .getReg();
+ unsigned Src1 = BMI->getOperand(
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
+ .getReg();
+ // The casts keep Src0/Src1 "used" when the assert below compiles away.
+ (void) Src0;
+ (void) Src1;
+ if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
+ (TRI.getEncodingValue(Src1) & 0xff) < 127)
+ assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
+ }
+ MI.eraseFromParent();
+ continue;
+ }
+ }
+
+ bool IsReduction = TII->isReductionOp(MI.getOpcode());
+ bool IsVector = TII->isVector(MI);
+ bool IsCube = TII->isCubeOp(MI.getOpcode());
+ if (!IsReduction && !IsVector && !IsCube) {
+ continue;
+ }
+
+ // Expand the instruction
+ //
+ // Reduction instructions:
+ // T0_X = DP4 T1_XYZW, T2_XYZW
+ // becomes:
+ // T0_X = DP4 T1_X, T2_X
+ // T0_Y (write masked) = DP4 T1_Y, T2_Y
+ // T0_Z (write masked) = DP4 T1_Z, T2_Z
+ // T0_W (write masked) = DP4 T1_W, T2_W
+ //
+ // Vector instructions:
+ // T0_X = MULLO_INT T1_X, T2_X
+ // becomes:
+ // T0_X = MULLO_INT T1_X, T2_X
+ // T0_Y (write masked) = MULLO_INT T1_X, T2_X
+ // T0_Z (write masked) = MULLO_INT T1_X, T2_X
+ // T0_W (write masked) = MULLO_INT T1_X, T2_X
+ //
+ // Cube instructions:
+ // T0_XYZW = CUBE T1_XYZW
+ // becomes:
+ // T0_X = CUBE T1_Z, T1_Y
+ // T0_Y = CUBE T1_Z, T1_X
+ // T0_Z = CUBE T1_X, T1_Z
+ // T0_W = CUBE T1_Y, T1_Z
+ for (unsigned Chan = 0; Chan < 4; Chan++) {
+ unsigned DstReg = MI.getOperand(
+ TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
+ unsigned Src0 = MI.getOperand(
+ TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
+ unsigned Src1 = 0;
+
+ // Determine the correct source registers
+ if (!IsCube) {
+ int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
+ if (Src1Idx != -1) {
+ Src1 = MI.getOperand(Src1Idx).getReg();
+ }
+ }
+ if (IsReduction) {
+ unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+ Src0 = TRI.getSubReg(Src0, SubRegIndex);
+ Src1 = TRI.getSubReg(Src1, SubRegIndex);
+ } else if (IsCube) {
+ // Channel swizzle: src0 uses CubeSrcSwz[Chan], src1 uses the table
+ // read back to front. Src1 is derived first because it still needs
+ // the full-width Src0.
+ static const int CubeSrcSwz[] = {2, 2, 0, 1};
+ unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
+ unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
+ Src1 = TRI.getSubReg(Src0, SubRegIndex1);
+ Src0 = TRI.getSubReg(Src0, SubRegIndex0);
+ }
+
+ // Determine the correct destination registers;
+ bool Mask = false;
+ bool NotLast = true;
+ if (IsCube) {
+ unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+ DstReg = TRI.getSubReg(DstReg, SubRegIndex);
+ } else {
+ // Mask the write if the original instruction does not write to
+ // the current Channel.
+ Mask = (Chan != TRI.getHWRegChan(DstReg));
+ unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+ DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ }
+
+ // Set the IsLast bit
+ NotLast = (Chan != 3 );
+
+ // Add the new instruction
+ // The cube pseudo opcodes are swapped for their "real" counterparts.
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::CUBE_r600_pseudo:
+ Opcode = AMDGPU::CUBE_r600_real;
+ break;
+ case AMDGPU::CUBE_eg_pseudo:
+ Opcode = AMDGPU::CUBE_eg_real;
+ break;
+ default:
+ break;
+ }
+
+ MachineInstr *NewMI =
+ TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);
+
+ // Channels 1..3 are bundled with the instruction emitted just before.
+ if (Chan != 0)
+ NewMI->bundleWithPred();
+ if (Mask) {
+ TII->addFlag(*NewMI, 0, MO_FLAG_MASK);
+ }
+ if (NotLast) {
+ TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST);
+ }
+ // Carry the modifier operands of the original instruction over.
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
+ }
+ MI.eraseFromParent();
+ }
+ }
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
new file mode 100644
index 000000000000..5813786abe01
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -0,0 +1,14 @@
+//===----------------------- R600FrameLowering.cpp ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#include "R600FrameLowering.h"
+
+using namespace llvm;
+
+// Out-of-line defaulted destructor (declared in R600FrameLowering.h).
+R600FrameLowering::~R600FrameLowering() = default;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
new file mode 100644
index 000000000000..874435f35ce4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
@@ -0,0 +1,32 @@
+//===--------------------- R600FrameLowering.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
+
+#include "AMDGPUFrameLowering.h"
+
+namespace llvm {
+
+// Frame lowering for R600: forwards its constructor arguments to
+// AMDGPUFrameLowering and provides empty prologue/epilogue emission.
+class R600FrameLowering : public AMDGPUFrameLowering {
+public:
+ // All arguments are passed through unchanged to the base class; TransAl
+ // defaults to 1.
+ R600FrameLowering(StackDirection D, unsigned StackAl, int LAO,
+ unsigned TransAl = 1) :
+ AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ ~R600FrameLowering() override;
+
+ // Intentionally empty: no prologue code is emitted.
+ void emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override {}
+ // Intentionally empty: no epilogue code is emitted.
+ void emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override {}
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
new file mode 100644
index 000000000000..89c9266746ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -0,0 +1,2202 @@
+//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Custom DAG lowering for R600
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600ISelLowering.h"
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600FrameLowering.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+// Sets up the R600 lowering tables: register classes for the legal value
+// types, the legalization action of each (operation, type) pair, the
+// scheduling preference and the node kinds handed to the target DAG combiner.
+R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
+ const R600Subtarget &STI)
+ : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
+ // 32-bit scalars, 2-element and 4-element vectors of f32/i32 share the
+ // Reg32 / Reg64 / Reg128 register classes respectively.
+ addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
+ addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+ addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+ addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+ addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
+ addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
+
+ computeRegisterProperties(STI.getRegisterInfo());
+
+ // Legalize loads and stores to the private address space.
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+ // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
+ // spaces, so it is custom lowered to handle those where it isn't.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
+ }
+
+ // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+
+ setOperationAction(ISD::STORE, MVT::i8, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
+ setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+ setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+
+ // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
+ setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
+
+ // Set condition code actions
+ setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+
+ setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
+ setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
+
+ setOperationAction(ISD::FCOS, MVT::f32, Custom);
+ setOperationAction(ISD::FSIN, MVT::f32, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
+
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+
+ setOperationAction(ISD::FSUB, MVT::f32, Expand);
+
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+
+ // ADD, SUB overflow.
+ // TODO: turn these into Legal?
+ if (Subtarget->hasCARRY())
+ setOperationAction(ISD::UADDO, MVT::i32, Custom);
+
+ if (Subtarget->hasBORROW())
+ setOperationAction(ISD::USUBO, MVT::i32, Custom);
+
+ // SIGN_EXTEND_INREG: the vector forms are always expanded; the scalar
+ // i1/i8/i16 forms are expanded only when the subtarget has no BFE.
+ if (!Subtarget->hasBFE())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
+
+ if (!Subtarget->hasBFE())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
+
+ if (!Subtarget->hasBFE())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
+
+ setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
+ // to be Legal/Custom in order to avoid library calls.
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+
+ // Carry-using add/sub nodes are expanded for both i32 and i64.
+ const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+ for (MVT VT : ScalarIntVTs) {
+ setOperationAction(ISD::ADDC, VT, Expand);
+ setOperationAction(ISD::SUBC, VT, Expand);
+ setOperationAction(ISD::ADDE, VT, Expand);
+ setOperationAction(ISD::SUBE, VT, Expand);
+ }
+
+ setSchedulingPreference(Sched::Source);
+
+ // Node kinds for which the target-specific DAG combine hook is requested.
+ setTargetDAGCombine(ISD::FP_ROUND);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::SELECT_CC);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::LOAD);
+}
+
+// Returns the Subtarget member cast to const R600Subtarget *.
+const R600Subtarget *R600TargetLowering::getSubtarget() const {
+ return static_cast<const R600Subtarget *>(Subtarget);
+}
+
+/// Returns true when the instruction that follows \p I in its basic block is
+/// AMDGPU::RETURN; false when \p I is the last instruction of the block or is
+/// followed by anything else.
+static inline bool isEOP(MachineBasicBlock::iterator I) {
+  const MachineBasicBlock::iterator Next = std::next(I);
+  return Next != I->getParent()->end() &&
+         Next->getOpcode() == AMDGPU::RETURN;
+}
+/// Custom inserter for R600 pseudo instructions: builds the replacement instruction(s) in front of \p MI, erases \p MI and returns \p BB (a few paths return early and leave \p MI in place).
+MachineBasicBlock *
+R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineBasicBlock::iterator I = MI;
+ const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ // Cases that end in 'break' reach the eraseFromParent() call at the bottom; cases that 'return' skip it.
+ switch (MI.getOpcode()) {
+ default:
+ // Replace LDS_*_RET instructions that don't have any uses with the
+ // equivalent LDS_*_NORET instruction.
+ if (TII->isLDSRetInstr(MI.getOpcode())) {
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ assert(DstIdx != -1);
+ MachineInstrBuilder NewMI;
+ // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
+ // LDS_1A2D support and remove this special case.
+ if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
+ MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
+ return BB;
+ // Result is unused: emit the NORET opcode with operands 1..N-1 of MI (operand 0 is skipped).
+ NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+ TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
+ NewMI.addOperand(MI.getOperand(i));
+ }
+ } else {
+ return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+ }
+ break;
+ case AMDGPU::CLAMP_R600: { // MOV of operand 1 into operand 0, tagged with MO_FLAG_CLAMP.
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
+ break;
+ }
+ // FABS: same MOV expansion, tagged with MO_FLAG_ABS.
+ case AMDGPU::FABS_R600: {
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
+ break;
+ }
+ // FNEG: same MOV expansion, tagged with MO_FLAG_NEG.
+ case AMDGPU::FNEG_R600: {
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
+ break;
+ }
+ // MASK_WRITE builds nothing: it tags the instruction that defines the virtual register with MO_FLAG_MASK.
+ case AMDGPU::MASK_WRITE: {
+ unsigned maskedRegister = MI.getOperand(0).getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
+ MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
+ TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
+ break;
+ }
+ // Float immediate: its raw bit pattern (zero-extended) is handed to buildMovImm.
+ case AMDGPU::MOV_IMM_F32:
+ TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
+ .getFPImm()
+ ->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
+ break;
+ // Integer immediate: forwarded to buildMovImm unchanged.
+ case AMDGPU::MOV_IMM_I32:
+ TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getImm());
+ break;
+ // Global address: build a MOV from ALU_LITERAL_X, then overwrite its literal operand with MI's operand 1.
+ case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
+ //TODO: Perhaps combine this instruction with the next if possible
+ auto MIB = TII->buildDefaultInstruction(
+ *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
+ int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
+ //TODO: Ugh this is rather ugly
+ MIB->getOperand(Idx) = MI.getOperand(1);
+ break;
+ }
+ // CONST_COPY: MOV from ALU_CONST with src0_sel set to MI's immediate operand 1.
+ case AMDGPU::CONST_COPY: {
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
+ TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
+ MI.getOperand(1).getImm());
+ break;
+ }
+ // RAT writes: re-emit the same opcode with one extra immediate holding the isEOP() result.
+ case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+ case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
+ case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addImm(isEOP(I)); // Set End of program bit
+ break;
+ // Typed RAT store: same scheme, with three forwarded operands.
+ case AMDGPU::RAT_STORE_TYPED_eg:
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addImm(isEOP(I)); // Set End of program bit
+ break;
+ // BRANCH becomes a JUMP with the same operand 0.
+ case AMDGPU::BRANCH:
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+ .addOperand(MI.getOperand(0));
+ break;
+ // Conditional branch: PRED_X (flagged MO_FLAG_PUSH) defines PREDICATE_BIT from operand 1, then JUMP_COND reads and kills it.
+ case AMDGPU::BRANCH_COND_f32: {
+ MachineInstr *NewMI =
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+ AMDGPU::PREDICATE_BIT)
+ .addOperand(MI.getOperand(1))
+ .addImm(AMDGPU::PRED_SETNE)
+ .addImm(0); // Flags
+ TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+ .addOperand(MI.getOperand(0))
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ break;
+ }
+ // Integer variant: identical except that PRED_SETNE_INT is used.
+ case AMDGPU::BRANCH_COND_i32: {
+ MachineInstr *NewMI =
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+ AMDGPU::PREDICATE_BIT)
+ .addOperand(MI.getOperand(1))
+ .addImm(AMDGPU::PRED_SETNE_INT)
+ .addImm(0); // Flags
+ TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+ .addOperand(MI.getOperand(0))
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ break;
+ }
+ // Exports are re-emitted with CfInst and EOP immediates appended, but only when last of their type or followed by RETURN.
+ case AMDGPU::EG_ExportSwz:
+ case AMDGPU::R600_ExportSwz: {
+ // Instruction is left unmodified if it's not the last one of its type
+ bool isLastInstructionOfItsType = true;
+ unsigned InstExportType = MI.getOperand(1).getImm();
+ for (MachineBasicBlock::iterator NextExportInst = std::next(I),
+ EndBlock = BB->end(); NextExportInst != EndBlock;
+ NextExportInst = std::next(NextExportInst)) {
+ if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
+ NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
+ unsigned CurrentInstExportType = NextExportInst->getOperand(1)
+ .getImm();
+ if (CurrentInstExportType == InstExportType) {
+ isLastInstructionOfItsType = false;
+ break;
+ }
+ }
+ }
+ bool EOP = isEOP(I);
+ if (!EOP && !isLastInstructionOfItsType)
+ return BB;
+ unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addOperand(MI.getOperand(3))
+ .addOperand(MI.getOperand(4))
+ .addOperand(MI.getOperand(5))
+ .addOperand(MI.getOperand(6))
+ .addImm(CfInst)
+ .addImm(EOP);
+ break;
+ }
+ case AMDGPU::RETURN: {
+ return BB;
+ }
+ }
+ // Reached by every case that ended in 'break': drop the original pseudo instruction.
+ MI.eraseFromParent();
+ return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+/// Entry point for custom lowering: dispatches on the opcode of \p Op; opcodes not handled here are forwarded to AMDGPUTargetLowering::LowerOperation.
+SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ switch (Op.getOpcode()) {
+ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
+ case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
+ case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
+ case ISD::FCOS:
+ case ISD::FSIN: return LowerTrig(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
+ case ISD::LOAD: {
+ SDValue Result = LowerLOAD(Op, DAG);
+ assert((!Result.getNode() ||
+ Result.getNode()->getNumValues() == 2) &&
+ "Load should return a value and a chain");
+ return Result;
+ }
+ // (The assert above accepts a null Result from LowerLOAD.)
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
+ case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
+ case ISD::INTRINSIC_VOID: {
+ SDValue Chain = Op.getOperand(0);
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntrinsicID) {
+ case AMDGPUIntrinsic::r600_store_swizzle: {
+ SDLoc DL(Op);
+ const SDValue Args[8] = {
+ Chain,
+ Op.getOperand(2), // Export Value
+ Op.getOperand(3), // ArrayBase
+ Op.getOperand(4), // Type
+ DAG.getConstant(0, DL, MVT::i32), // SWZ_X
+ DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
+ DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
+ DAG.getConstant(3, DL, MVT::i32) // SWZ_W
+ };
+ return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
+ }
+
+ // default for switch(IntrinsicID)
+ default: break;
+ }
+ // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode());
+ // other void intrinsics end up at the 'return SDValue()' below.
+ break;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ switch(IntrinsicID) {
+ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ case AMDGPUIntrinsic::r600_tex:
+ case AMDGPUIntrinsic::r600_texc: {
+ unsigned TextureOp;
+ switch (IntrinsicID) {
+ case AMDGPUIntrinsic::r600_tex:
+ TextureOp = 0;
+ break;
+ case AMDGPUIntrinsic::r600_texc:
+ TextureOp = 1;
+ break;
+ default:
+ llvm_unreachable("unhandled texture operation");
+ }
+ // 19 operands for TEXTURE_FETCH; the two 0..3 constant runs are presumably swizzle selectors -- TODO confirm against the node's definition.
+ SDValue TexArgs[19] = {
+ DAG.getConstant(TextureOp, DL, MVT::i32),
+ Op.getOperand(1),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(1, DL, MVT::i32),
+ DAG.getConstant(2, DL, MVT::i32),
+ DAG.getConstant(3, DL, MVT::i32),
+ Op.getOperand(2),
+ Op.getOperand(3),
+ Op.getOperand(4),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(1, DL, MVT::i32),
+ DAG.getConstant(2, DL, MVT::i32),
+ DAG.getConstant(3, DL, MVT::i32),
+ Op.getOperand(5),
+ Op.getOperand(6),
+ Op.getOperand(7),
+ Op.getOperand(8),
+ Op.getOperand(9),
+ Op.getOperand(10)
+ };
+ return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
+ }
+ case AMDGPUIntrinsic::r600_dot4: {
+ SDValue Args[8] = {
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(0, DL, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(0, DL, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(1, DL, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(1, DL, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(2, DL, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(2, DL, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(3, DL, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(3, DL, MVT::i32))
+ };
+ return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
+ }
+ // Implicit-argument pointer: a constant byte offset typed as a PARAM_I_ADDRESS pointer; the r600_read_* sizes below use dword indices 0-8.
+ case Intrinsic::r600_implicitarg_ptr: {
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
+ uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ return DAG.getConstant(ByteOffset, DL, PtrVT);
+ }
+ case Intrinsic::r600_read_ngroups_x:
+ return LowerImplicitParameter(DAG, VT, DL, 0);
+ case Intrinsic::r600_read_ngroups_y:
+ return LowerImplicitParameter(DAG, VT, DL, 1);
+ case Intrinsic::r600_read_ngroups_z:
+ return LowerImplicitParameter(DAG, VT, DL, 2);
+ case Intrinsic::r600_read_global_size_x:
+ return LowerImplicitParameter(DAG, VT, DL, 3);
+ case Intrinsic::r600_read_global_size_y:
+ return LowerImplicitParameter(DAG, VT, DL, 4);
+ case Intrinsic::r600_read_global_size_z:
+ return LowerImplicitParameter(DAG, VT, DL, 5);
+ case Intrinsic::r600_read_local_size_x:
+ return LowerImplicitParameter(DAG, VT, DL, 6);
+ case Intrinsic::r600_read_local_size_y:
+ return LowerImplicitParameter(DAG, VT, DL, 7);
+ case Intrinsic::r600_read_local_size_z:
+ return LowerImplicitParameter(DAG, VT, DL, 8);
+ // tgid_* are read from live-in registers T1_{X,Y,Z}; tidig_* from live-in T0_{X,Y,Z}.
+ case Intrinsic::r600_read_tgid_x:
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_X, VT);
+ case Intrinsic::r600_read_tgid_y:
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_Y, VT);
+ case Intrinsic::r600_read_tgid_z:
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_Z, VT);
+ case Intrinsic::r600_read_tidig_x:
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_X, VT);
+ case Intrinsic::r600_read_tidig_y:
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_Y, VT);
+ case Intrinsic::r600_read_tidig_z:
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_Z, VT);
+
+ case Intrinsic::r600_recipsqrt_ieee:
+ return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::r600_recipsqrt_clamped:
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+ }
+
+ // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
+ break;
+ }
+ } // end switch(Op.getOpcode())
+ return SDValue();
+}
+/// Appends replacement values for the results of \p N to \p Results; opcodes not listed here are forwarded to AMDGPUTargetLowering::ReplaceNodeResults.
+void R600TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ default:
+ AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
+ return;
+ case ISD::FP_TO_UINT:
+ if (N->getValueType(0) == MVT::i1) {
+ Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
+ return;
+ }
+ // Since we don't care about out of bounds values we can use FP_TO_SINT for
+ // uints too. The DAGLegalizer code for uint considers some extra cases
+ // which are not necessary here.
+ LLVM_FALLTHROUGH;
+ case ISD::FP_TO_SINT: {
+ if (N->getValueType(0) == MVT::i1) {
+ Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
+ return;
+ }
+ // Otherwise expand; nothing is pushed when expandFP_TO_SINT returns false.
+ SDValue Result;
+ if (expandFP_TO_SINT(N, Result, DAG))
+ Results.push_back(Result);
+ return;
+ }
+ case ISD::SDIVREM: {
+ SDValue Op = SDValue(N, 1);
+ SDValue RES = LowerSDIVREM(Op, DAG);
+ Results.push_back(RES);
+ Results.push_back(RES.getValue(1));
+ break;
+ }
+ case ISD::UDIVREM: {
+ SDValue Op = SDValue(N, 0);
+ LowerUDIVREM64(Op, DAG, Results);
+ break;
+ }
+ }
+}
+
+/// Rebuilds \p Vector as an AMDGPUISD::BUILD_VERTICAL_VECTOR node of the same
+/// type by extracting every element with a constant index and passing the
+/// extracted scalars, in order, as the operands of the new node.
+SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
+                                                   SDValue Vector) const {
+  SDLoc DL(Vector);
+  const EVT VectorTy = Vector.getValueType();
+  const EVT ScalarTy = VectorTy.getVectorElementType();
+  const unsigned NumLanes = VectorTy.getVectorNumElements();
+  const auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
+
+  SmallVector<SDValue, 8> Lanes;
+  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+    SDValue LaneIdx = DAG.getConstant(Lane, DL, IdxTy);
+    SDValue Scalar =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Vector, LaneIdx);
+    Lanes.push_back(Scalar);
+  }
+
+  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VectorTy, Lanes);
+}
+
+/// Custom lowering of EXTRACT_VECTOR_ELT. The node is returned unchanged when
+/// the index is a constant or the source is already a BUILD_VERTICAL_VECTOR;
+/// otherwise the source is first converted with vectorToVerticalVector and
+/// the extract is re-issued on the converted vector.
+SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Src = Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+
+  const bool IdxIsConstant = isa<ConstantSDNode>(Idx);
+  const bool AlreadyVertical =
+      Src.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR;
+  if (!IdxIsConstant && !AlreadyVertical) {
+    SDValue Vertical = vectorToVerticalVector(DAG, Src);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
+                       Vertical, Idx);
+  }
+  return Op;
+}
+
+/// Custom lowering of INSERT_VECTOR_ELT. The node is returned unchanged when
+/// the index is a constant or the destination vector is already a
+/// BUILD_VERTICAL_VECTOR. Otherwise the destination is converted with
+/// vectorToVerticalVector, the insert is re-issued on it, and the result of
+/// that insert is converted once more.
+SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Dest = Op.getOperand(0);
+  SDValue Elt = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+
+  const bool IdxIsConstant = isa<ConstantSDNode>(Idx);
+  const bool AlreadyVertical =
+      Dest.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR;
+  if (IdxIsConstant || AlreadyVertical)
+    return Op;
+
+  SDValue VerticalIn = vectorToVerticalVector(DAG, Dest);
+  SDValue Inserted = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+                                 Op.getValueType(), VerticalIn, Elt, Idx);
+  return vectorToVerticalVector(DAG, Inserted);
+}
+
+/// Lowers a GlobalAddress node. Globals in AMDGPUAS::CONSTANT_ADDRESS become
+/// a CONST_DATA_PTR node wrapping the target global address; every other
+/// address space is handled by AMDGPUTargetLowering::LowerGlobalAddress.
+SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+                                               SDValue Op,
+                                               SelectionDAG &DAG) const {
+  auto *GlobalNode = cast<GlobalAddressSDNode>(Op);
+  const bool InConstantAS =
+      GlobalNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+  if (InConstantAS) {
+    SDLoc Loc(GlobalNode);
+    const MVT PtrTy =
+        getPointerTy(DAG.getDataLayout(), AMDGPUAS::CONSTANT_ADDRESS);
+    SDValue Target =
+        DAG.getTargetGlobalAddress(GlobalNode->getGlobal(), Loc, PtrTy);
+    return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, Loc, PtrTy, Target);
+  }
+
+  return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
+}
+/// Lowers FCOS/FSIN to AMDGPUISD::COS_HW/SIN_HW applied to a range-reduced argument.
+SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+ // On hw >= R700, COS/SIN input must be between -1. and 1.
+ // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
+ EVT VT = Op.getValueType();
+ SDValue Arg = Op.getOperand(0);
+ SDLoc DL(Op);
+ // The constant 0.15915494309 below approximates 1 / (2 * Pi).
+ // TODO: Should this propagate fast-math-flags?
+ SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
+ DAG.getNode(ISD::FADD, DL, VT,
+ DAG.getNode(ISD::FMUL, DL, VT, Arg,
+ DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
+ DAG.getConstantFP(0.5, DL, MVT::f32)));
+ unsigned TrigNode;
+ switch (Op.getOpcode()) {
+ case ISD::FCOS:
+ TrigNode = AMDGPUISD::COS_HW;
+ break;
+ case ISD::FSIN:
+ TrigNode = AMDGPUISD::SIN_HW;
+ break;
+ default:
+ llvm_unreachable("Wrong trig opcode");
+ }
+ SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
+ DAG.getNode(ISD::FADD, DL, VT, FractPart,
+ DAG.getConstantFP(-0.5, DL, MVT::f32)));
+ if (Gen >= R600Subtarget::R700)
+ return TrigVal;
+ // On R600 hw, COS/SIN input must be between -Pi and Pi. NOTE(review): the FMUL below scales the result of the HW node rather than its input -- confirm this is intended.
+ return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
+ DAG.getConstantFP(3.14159265359, DL, MVT::f32));
+}
+/// Lowers SHL_PARTS: shifts the two-part value (operand 0 = Lo, operand 1 = Hi) left by operand 2 and returns MERGE_VALUES(Lo, Hi).
+SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Shift = Op.getOperand(2);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ // Width is the bit size of one part; BigShift = Shift - Width; CompShift = (Width - 1) - Shift.
+ SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
+ SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
+ SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+ SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+ // The dance around Width1 is necessary for the 0 special case.
+ // Without it the CompShift might be 32, producing incorrect results in
+ // Overflow. So we do the shift in two steps, the alternative is to
+ // add a conditional to filter the special case.
+
+ SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
+ Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
+ // Shift < Width: Hi = (Hi << Shift) | Overflow, Lo = Lo << Shift.
+ SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
+ HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
+ SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
+ // Shift >= Width: Hi = Lo << (Shift - Width), Lo = 0.
+ SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
+ SDValue LoBig = Zero;
+ // SETULT on (Shift, Width) picks the "small" results when Shift < Width.
+ Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+ Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
+/// Lowers SRA_PARTS / SRL_PARTS: shifts the two-part value (operand 0 = Lo, operand 1 = Hi) right by operand 2 and returns MERGE_VALUES(Lo, Hi).
+SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Shift = Op.getOperand(2);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ // SRA selects the arithmetic flavour; otherwise the shifts of Hi are logical.
+ const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
+ // Width is the bit size of one part; BigShift = Shift - Width; CompShift = (Width - 1) - Shift.
+ SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
+ SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
+ SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+ SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+ // The dance around Width1 is necessary for the 0 special case.
+ // Without it the CompShift might be 32, producing incorrect results in
+ // Overflow. So we do the shift in two steps, the alternative is to
+ // add a conditional to filter the special case.
+
+ SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
+ Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
+ // Shift < Width: Hi = Hi >> Shift, Lo = (Lo >> Shift) | Overflow.
+ SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
+ SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
+ LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
+ // Shift >= Width: Lo = Hi >> (Shift - Width); Hi = Hi >> (Width - 1) for SRA, else 0.
+ SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
+ SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
+ // SETULT on (Shift, Width) picks the "small" results when Shift < Width.
+ Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+ Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
+
+/// Lowers UADDO / USUBO. \p mainop is the opcode producing the arithmetic
+/// result and \p ovf the opcode producing the overflow indication; both are
+/// applied to operands 0 and 1 of \p Op. The overflow value is sign-extended
+/// in-register from i1. Returns MERGE_VALUES(result, overflow).
+SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
+                                          unsigned mainop, unsigned ovf) const {
+  SDLoc DL(Op);
+  const EVT Ty = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  // Overflow indication first, then widen its single bit across the value.
+  SDValue Flag = DAG.getNode(ovf, DL, Ty, LHS, RHS);
+  SDValue OneBitTy = DAG.getValueType(MVT::i1);
+  SDValue ExtFlag =
+      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Ty, Flag, OneBitTy);
+
+  SDValue Arith = DAG.getNode(mainop, DL, Ty, LHS, RHS);
+
+  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(Ty, Ty), Arith,
+                     ExtFlag);
+}
+
+/// Builds the i1 replacement for FP_TO_UINT: a SETCC testing whether \p Op
+/// equals the f32 constant 1.0 (SETEQ).
+SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue One = DAG.getConstantFP(1.0f, DL, MVT::f32);
+  SDValue IsEqual = DAG.getCondCode(ISD::SETEQ);
+  return DAG.getNode(ISD::SETCC, DL, MVT::i1, Op, One, IsEqual);
+}
+
+/// Builds the i1 replacement for FP_TO_SINT: a SETCC testing whether \p Op
+/// equals the f32 constant -1.0 (SETEQ).
+SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue MinusOne = DAG.getConstantFP(-1.0f, DL, MVT::f32);
+  SDValue IsEqual = DAG.getCondCode(ISD::SETEQ);
+  return DAG.getNode(ISD::SETCC, DL, MVT::i1, Op, MinusOne, IsEqual);
+}
+/// Loads the implicit parameter at dword index \p DwordOffset: a \p VT load from the constant byte address DwordOffset * 4, chained to the entry node.
+SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
+ const SDLoc &DL,
+ unsigned DwordOffset) const {
+ unsigned ByteOffset = DwordOffset * 4;
+ PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+ AMDGPUAS::CONSTANT_BUFFER_0);
+ // PtrType (address space CONSTANT_BUFFER_0) is only used for the null pointer in the MachinePointerInfo below.
+ // We shouldn't be using an offset wider than 16-bits for implicit parameters.
+ assert(isInt<16>(ByteOffset));
+
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+ DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
+ MachinePointerInfo(ConstantPointerNull::get(PtrType)));
+}
+
+/// Returns true when \p Op is an integer constant with a null value or a
+/// floating-point constant for which isZero() holds; false for anything else.
+bool R600TargetLowering::isZero(SDValue Op) const {
+  if (ConstantSDNode *IntConst = dyn_cast<ConstantSDNode>(Op))
+    return IntConst->isNullValue();
+  if (ConstantFPSDNode *FPConst = dyn_cast<ConstantFPSDNode>(Op))
+    return FPConst->isZero();
+  return false;
+}
+
+/// Returns true when \p Op is the "true" constant checked for here: exactly
+/// 1.0 for a floating-point constant, otherwise an all-ones constant.
+bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
+  ConstantFPSDNode *FPConst = dyn_cast<ConstantFPSDNode>(Op);
+  return FPConst ? FPConst->isExactlyValue(1.0) : isAllOnesConstant(Op);
+}
+
+/// Returns true when \p Op is the "false" constant checked for here: a
+/// floating-point constant whose value is zero, otherwise a null constant.
+bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
+  ConstantFPSDNode *FPConst = dyn_cast<ConstantFPSDNode>(Op);
+  return FPConst ? FPConst->getValueAPF().isZero() : isNullConstant(Op);
+}
+/// Custom lowering of SELECT_CC: rewrites the node into a form the SET* or CND* patterns can match, or else splits it into two SELECT_CC nodes.
+SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue True = Op.getOperand(2);
+ SDValue False = Op.getOperand(3);
+ SDValue CC = Op.getOperand(4);
+ SDValue Temp;
+ // f32 results: first try the legacy fmin/fmax combine and use its result if it produced one.
+ if (VT == MVT::f32) {
+ DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+ SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+ if (MinMax)
+ return MinMax;
+ }
+
+ // LHS and RHS are guaranteed to be the same value type
+ EVT CompareVT = LHS.getValueType();
+
+ // Check if we can lower this to a native operation.
+
+ // Try to lower to a SET* instruction:
+ //
+ // SET* can match the following patterns:
+ //
+ // select_cc f32, f32, -1, 0, cc_supported
+ // select_cc f32, f32, 1.0f, 0.0f, cc_supported
+ // select_cc i32, i32, -1, 0, cc_supported
+ //
+
+ // Move hardware True/False values to the correct operand.
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+ ISD::CondCode InverseCC =
+ ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+ if (isHWTrueValue(False) && isHWFalseValue(True)) {
+ if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
+ std::swap(False, True);
+ CC = DAG.getCondCode(InverseCC);
+ } else {
+ ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
+ if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
+ std::swap(False, True);
+ std::swap(LHS, RHS);
+ CC = DAG.getCondCode(SwapInvCC);
+ }
+ }
+ }
+ // True/False already in hardware form (possibly after the swap above): re-emit the node as is.
+ if (isHWTrueValue(True) && isHWFalseValue(False) &&
+ (CompareVT == VT || VT == MVT::i32)) {
+ // This can be matched by a SET* instruction.
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+ }
+
+ // Try to lower to a CND* instruction:
+ //
+ // CND* can match the following patterns:
+ //
+ // select_cc f32, 0.0, f32, f32, cc_supported
+ // select_cc f32, 0.0, i32, i32, cc_supported
+ // select_cc i32, 0, f32, f32, cc_supported
+ // select_cc i32, 0, i32, i32, cc_supported
+ //
+
+ // Try to move the zero value to the RHS
+ if (isZero(LHS)) {
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+ // Try swapping the operands
+ ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
+ if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+ std::swap(LHS, RHS);
+ CC = DAG.getCondCode(CCSwapped);
+ } else {
+ // Try inverting the condition and then swapping the operands
+ ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
+ CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
+ if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+ std::swap(True, False);
+ std::swap(LHS, RHS);
+ CC = DAG.getCondCode(CCSwapped);
+ }
+ }
+ }
+ if (isZero(RHS)) {
+ SDValue Cond = LHS;
+ SDValue Zero = RHS;
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+ if (CompareVT != VT) {
+ // Bitcast True / False to the correct types. This will end up being
+ // a nop, but it allows us to define only a single pattern in the
+ // .TD files for each CND* instruction rather than having to have
+ // one pattern for integer True/False and one for fp True/False
+ True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
+ False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
+ }
+ // Not-equal compares are rewritten as their inverse with True and False exchanged.
+ switch (CCOpcode) {
+ case ISD::SETONE:
+ case ISD::SETUNE:
+ case ISD::SETNE:
+ CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+ Temp = True;
+ True = False;
+ False = Temp;
+ break;
+ default:
+ break;
+ }
+ SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
+ Cond, Zero,
+ True, False,
+ DAG.getCondCode(CCOpcode));
+ return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
+ }
+
+ // If we make it this far it means we have no native instructions to handle
+ // this SELECT_CC, so we must lower it.
+ SDValue HWTrue, HWFalse;
+ // Hardware true/false constants in the compare type: 1.0f / 0.0f for f32, -1 / 0 for i32.
+ if (CompareVT == MVT::f32) {
+ HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
+ HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
+ } else if (CompareVT == MVT::i32) {
+ HWTrue = DAG.getConstant(-1, DL, CompareVT);
+ HWFalse = DAG.getConstant(0, DL, CompareVT);
+ }
+ else {
+ llvm_unreachable("Unhandled value type in LowerSELECT_CC");
+ }
+
+ // Lower this unsupported SELECT_CC into a combination of two supported
+ // SELECT_CC operations.
+ SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
+ // Second node: select True/False on (Cond != HWFalse).
+ return DAG.getNode(ISD::SELECT_CC, DL, VT,
+ Cond, HWFalse,
+ True, False,
+ DAG.getCondCode(ISD::SETNE));
+}
+
+/// LLVM generates byte-addressed pointers. For indirect addressing these have
+/// to be converted to a register index. Each register holds 16 bytes
+/// (4 x 32-bit sub-registers), but \p StackWidth tells us how many of the 4
+/// sub-registers are used for indirect addressing, so \p Ptr is shifted right
+/// by 2, 3 or 4 bits for a stack width of 1, 2 or 4 respectively.
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
+                                               unsigned StackWidth,
+                                               SelectionDAG &DAG) const {
+  unsigned ShiftAmount = 0;
+  if (StackWidth == 1)
+    ShiftAmount = 2;
+  else if (StackWidth == 2)
+    ShiftAmount = 3;
+  else if (StackWidth == 4)
+    ShiftAmount = 4;
+  else
+    llvm_unreachable("Invalid stack width");
+
+  SDLoc DL(Ptr);
+  SDValue ShiftBy = DAG.getConstant(ShiftAmount, DL, MVT::i32);
+  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, ShiftBy);
+}
+
+/// Computes, for element \p ElemIdx of a value stored with the given
+/// \p StackWidth, the channel to use (\p Channel) and the amount to add to
+/// the register pointer (\p PtrIncr).
+///
+///  - width 4: channel is the element index, no pointer increment;
+///  - width 2: channel is ElemIdx % 2, pointer increment 1 only for element 2;
+///  - width 1 and any other width: channel 0, pointer increment 1 for every
+///    element but the first.
+void R600TargetLowering::getStackAddress(unsigned StackWidth,
+                                         unsigned ElemIdx,
+                                         unsigned &Channel,
+                                         unsigned &PtrIncr) const {
+  if (StackWidth == 4) {
+    Channel = ElemIdx;
+    PtrIncr = 0;
+    return;
+  }
+
+  if (StackWidth == 2) {
+    Channel = ElemIdx % 2;
+    PtrIncr = (ElemIdx == 2) ? 1 : 0;
+    return;
+  }
+
+  // StackWidth == 1, and the fallback for every other width.
+  Channel = 0;
+  PtrIncr = (ElemIdx > 0) ? 1 : 0;
+}
+/// Lowers a sub-dword private store as a read-modify-write of the containing 32-bit register: REGISTER_LOAD, clear the target field, OR in the shifted value, REGISTER_STORE.
+SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Store);
+ // Mask covers the stored field; it stays 0 for any memory type other than i8 / i16.
+ unsigned Mask = 0;
+ if (Store->getMemoryVT() == MVT::i8) {
+ Mask = 0xff;
+ } else if (Store->getMemoryVT() == MVT::i16) {
+ Mask = 0xffff;
+ }
+
+ SDValue Chain = Store->getChain();
+ SDValue BasePtr = Store->getBasePtr();
+ EVT MemVT = Store->getMemoryVT();
+ // Dword index = byte address >> 2; load the current contents of that dword.
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+ // Byte position inside the dword (address & 3), converted to a bit shift (* 8).
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+ DAG.getConstant(0x3, DL, MVT::i32));
+
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, DL, MVT::i32));
+ // Widen the value to i32, then zero everything above MemVT's width.
+ SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+ Store->getValue());
+
+ SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ MaskedValue, ShiftAmt);
+ // DstMask = ~(Mask << ShiftAmt): keeps every bit of the old dword except the target field.
+ SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getConstant(Mask, DL, MVT::i32),
+ ShiftAmt);
+ DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+ DAG.getConstant(0xffffffff, DL, MVT::i32));
+ Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+ // Merge the new field into the preserved bits and write the dword back.
+ SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+ return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+ Chain, Value, Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+}
+/// Custom lowering of STORE, handled per address space; returns an empty SDValue for non-private stores that none of the special cases below rewrites.
+SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+ unsigned AS = StoreNode->getAddressSpace();
+ SDValue Value = StoreNode->getValue();
+ EVT ValueVT = Value.getValueType();
+ EVT MemVT = StoreNode->getMemoryVT();
+ unsigned Align = StoreNode->getAlignment();
+ // Vector stores to local or private memory are split first.
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+ ValueVT.isVector()) {
+ return SplitVectorStore(Op, DAG);
+ }
+
+ // Private AS needs special fixes
+ if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+ return expandUnalignedStore(StoreNode, DAG);
+ }
+
+ SDLoc DL(Op);
+ SDValue Chain = StoreNode->getChain();
+ SDValue Ptr = StoreNode->getBasePtr();
+ // Global stores: truncating ones become STORE_MSKOR; others get a dword address when the value is at least 32 bits wide.
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
+ // It is beneficial to create MSKOR here instead of combiner to avoid
+ // artificial dependencies introduced by RMW
+ if (StoreNode->isTruncatingStore()) {
+ EVT VT = Value.getValueType();
+ assert(VT.bitsLE(MVT::i32));
+ SDValue MaskConstant;
+ if (MemVT == MVT::i8) {
+ MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
+ } else {
+ assert(MemVT == MVT::i16);
+ assert(StoreNode->getAlignment() >= 2);
+ MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
+ }
+ SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(0x00000003, DL, VT));
+ SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+ DAG.getConstant(3, DL, VT));
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+ // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
+ // vector instead.
+ SDValue Src[4] = {
+ ShiftedValue,
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ Mask
+ };
+ SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
+ SDValue Args[3] = { Chain, Input, DWordAddr };
+ return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
+ Op->getVTList(), Args, MemVT,
+ StoreNode->getMemOperand());
+ } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
+ ValueVT.bitsGE(MVT::i32)) {
+ // Convert pointer from byte address to dword address.
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
+ DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
+ Ptr, DAG.getConstant(2, DL, MVT::i32)));
+ // The truncating case was already taken by the branch above; indexed stores are rejected here.
+ if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
+ llvm_unreachable("Truncated and indexed stores not supported yet");
+ } else {
+ Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+ }
+ return Chain;
+ }
+ }
+ // Everything below handles the private address space only.
+ if (AS != AMDGPUAS::PRIVATE_ADDRESS)
+ return SDValue();
+ // Memory types narrower than 32 bits take the read-modify-write path.
+ if (MemVT.bitsLT(MVT::i32))
+ return lowerPrivateTruncStore(StoreNode, DAG);
+
+ // Lowering for indirect addressing
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+ unsigned StackWidth = TFL->getStackWidth(MF);
+ // Convert the byte pointer into a register index for the current stack width.
+ Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+ // Vectors: one REGISTER_STORE per element, joined by a TokenFactor; note that Ptr accumulates PtrIncr across iterations.
+ if (ValueVT.isVector()) {
+ unsigned NumElemVT = ValueVT.getVectorNumElements();
+ EVT ElemVT = ValueVT.getVectorElementType();
+ SmallVector<SDValue, 4> Stores(NumElemVT);
+ // NOTE(review): the assert message below says "load" although this is the store path.
+ assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+ "vector width in load");
+
+ for (unsigned i = 0; i < NumElemVT; ++i) {
+ unsigned Channel, PtrIncr;
+ getStackAddress(StackWidth, i, Channel, PtrIncr);
+ Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+ DAG.getConstant(PtrIncr, DL, MVT::i32));
+ SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
+ Value, DAG.getConstant(i, DL, MVT::i32));
+
+ Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+ Chain, Elem, Ptr,
+ DAG.getTargetConstant(Channel, DL, MVT::i32));
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+ } else {
+ if (ValueVT == MVT::i8) {
+ Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
+ }
+ Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
+ }
+
+ return Chain;
+}
+
+ /// Translate a constant-buffer address space into the base of its constant
+ /// block, i.e. 512 + (kc_bank << 12), where kc_bank is the number of the
+ /// constant buffer (0 through 15).
+ ///
+ /// \returns the block base, or -1 if \p AddressSpace is not one of the
+ /// CONSTANT_BUFFER_* address spaces.
+ static int
+ ConstantAddressBlock(unsigned AddressSpace) {
+   // Constant-buffer address spaces in bank order: the position of an entry in
+   // this table is its kc_bank number.
+   static const unsigned BankAddressSpaces[] = {
+     AMDGPUAS::CONSTANT_BUFFER_0,  AMDGPUAS::CONSTANT_BUFFER_1,
+     AMDGPUAS::CONSTANT_BUFFER_2,  AMDGPUAS::CONSTANT_BUFFER_3,
+     AMDGPUAS::CONSTANT_BUFFER_4,  AMDGPUAS::CONSTANT_BUFFER_5,
+     AMDGPUAS::CONSTANT_BUFFER_6,  AMDGPUAS::CONSTANT_BUFFER_7,
+     AMDGPUAS::CONSTANT_BUFFER_8,  AMDGPUAS::CONSTANT_BUFFER_9,
+     AMDGPUAS::CONSTANT_BUFFER_10, AMDGPUAS::CONSTANT_BUFFER_11,
+     AMDGPUAS::CONSTANT_BUFFER_12, AMDGPUAS::CONSTANT_BUFFER_13,
+     AMDGPUAS::CONSTANT_BUFFER_14, AMDGPUAS::CONSTANT_BUFFER_15
+   };
+   const int NumBanks =
+       sizeof(BankAddressSpaces) / sizeof(BankAddressSpaces[0]);
+
+   for (int Bank = 0; Bank < NumBanks; ++Bank) {
+     if (BankAddressSpaces[Bank] == AddressSpace)
+       return 512 + 4096 * Bank;
+   }
+
+   // Not a constant buffer.
+   return -1;
+ }
+
+ /// Lower an extending load of less than 32 bits from the private address
+ /// space (<SI) into a full register load followed by a (2-)byte extract.
+ ///
+ /// The result is a merge of the extended i32 value and the load's original
+ /// chain.
+ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+   SDLoc DL(Op);
+   LoadSDNode *Load = cast<LoadSDNode>(Op);
+   const bool IsSignExtending = Load->getExtensionType() == ISD::SEXTLOAD;
+   const EVT LoadedVT = Load->getMemoryVT();
+
+   // Index of the register that holds the addressed bytes (address / 4).
+   SDValue RegIndex = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+                                  DAG.getConstant(2, DL, MVT::i32));
+
+   // Read that whole register.
+   SDValue Word = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                              Load->getChain(),
+                              RegIndex,
+                              DAG.getTargetConstant(0, DL, MVT::i32),
+                              Op.getOperand(2));
+
+   // Byte position of the value inside the register (address & 3) ...
+   SDValue ByteInWord = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                    Load->getBasePtr(),
+                                    DAG.getConstant(0x3, DL, MVT::i32));
+
+   // ... converted to a bit position (byte * 8).
+   SDValue BitOffset = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteInWord,
+                                   DAG.getConstant(3, DL, MVT::i32));
+
+   // Move the requested bytes down to bit 0.
+   SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Word, BitOffset);
+
+   // Replace the bits above the loaded element with copies of its sign bit
+   // for a sign-extending load, and with zeros otherwise.
+   const EVT EltVT = LoadedVT.getScalarType();
+   SDValue Extended =
+       IsSignExtending
+           ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Shifted,
+                         DAG.getValueType(EltVT))
+           : DAG.getZeroExtendInReg(Shifted, DL, EltVT);
+
+   SDValue Results[] = {
+     Extended,
+     Load->getChain()
+   };
+   return DAG.getMergeValues(Results, DL);
+ }
+
+ /// Custom lowering for ISD::LOAD. Depending on the address space and the
+ /// extension type, the load is handled in one of these ways (checked in this
+ /// order):
+ ///  - private, extending, memory type narrower than i32: delegated to
+ ///    lowerPrivateExtLoad();
+ ///  - local, vector result: scalarized with scalarizeVectorLoad();
+ ///  - constant buffer, non-extending or zero-extending: rewritten to
+ ///    AMDGPUISD::CONST_ADDRESS nodes;
+ ///  - any remaining sign-extending load: expanded to an EXTLOAD followed by
+ ///    SIGN_EXTEND_INREG;
+ ///  - any remaining non-private load: an empty SDValue is returned;
+ ///  - private: rewritten to AMDGPUISD::REGISTER_LOAD nodes (one per vector
+ ///    element).
+ /// Every lowered form is returned as a merge of the loaded value and the
+ /// original chain.
+ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+ unsigned AS = LoadNode->getAddressSpace();
+ EVT MemVT = LoadNode->getMemoryVT();
+ ISD::LoadExtType ExtType = LoadNode->getExtensionType();
+
+ // Sub-dword extending loads from private memory need a register load plus a
+ // byte extract.
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
+ return lowerPrivateExtLoad(Op, DAG);
+ }
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue Chain = LoadNode->getChain();
+ SDValue Ptr = LoadNode->getBasePtr();
+
+ // Vector loads from local memory are split into scalar loads.
+ if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
+ SDValue MergedValues[2] = {
+ scalarizeVectorLoad(LoadNode, DAG),
+ Chain
+ };
+ return DAG.getMergeValues(MergedValues, DL);
+ }
+
+ // ConstantBlock is -1 unless the load reads one of the constant buffers.
+ int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+ if (ConstantBlock > -1 &&
+ ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
+ (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
+ SDValue Result;
+ // NOTE(review): ConstantExpr derives from Constant, so the first isa<> test
+ // looks redundant with the second - confirm before simplifying.
+ if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
+ isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
+ isa<ConstantSDNode>(Ptr)) {
+ SDValue Slots[4];
+ for (unsigned i = 0; i < 4; i++) {
+ // We want Const position encoded with the following formula :
+ // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+ // const_index is Ptr computed by llvm using an alignment of 16.
+ // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
+ // then div by 4 at the ISel step
+ SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
+ Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+ }
+ // Scalar loads build all four channels as a v4i32; vector loads only use
+ // as many slots as the result has elements.
+ EVT NewVT = MVT::v4i32;
+ unsigned NumElements = 4;
+ if (VT.isVector()) {
+ NewVT = VT;
+ NumElements = VT.getVectorNumElements();
+ }
+ Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+ } else {
+ // non-constant ptr can't be folded, keeps it as a v4i32 load
+ // The second operand is the constant buffer number, computed relative to
+ // CONSTANT_BUFFER_0.
+ Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+ DAG.getConstant(4, DL, MVT::i32)),
+ DAG.getConstant(LoadNode->getAddressSpace() -
+ AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
+ );
+ }
+
+ // A scalar load only wants element 0 of the vector built above.
+ if (!VT.isVector()) {
+ Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+ DAG.getConstant(0, DL, MVT::i32));
+ }
+
+ SDValue MergedValues[2] = {
+ Result,
+ Chain
+ };
+ return DAG.getMergeValues(MergedValues, DL);
+ }
+
+ SDValue LoweredLoad;
+
+ // For most operations returning SDValue() will result in the node being
+ // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
+ // need to manually expand loads that may be legal in some address spaces and
+ // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
+ // compute shaders, since the data is sign extended when it is uploaded to the
+ // buffer. However SEXT loads from other address spaces are not supported, so
+ // we need to expand them here.
+ if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
+ // Note: this MemVT shadows the function-level MemVT; both hold
+ // LoadNode->getMemoryVT().
+ EVT MemVT = LoadNode->getMemoryVT();
+ assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
+ SDValue NewLoad = DAG.getExtLoad(
+ ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
+ LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
+ SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
+ DAG.getValueType(MemVT));
+
+ // The incoming chain (not the new load's output chain) is returned.
+ SDValue MergedValues[2] = { Res, Chain };
+ return DAG.getMergeValues(MergedValues, DL);
+ }
+
+ if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+ return SDValue();
+ }
+
+ // Lowering for indirect addressing
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+ unsigned StackWidth = TFL->getStackWidth(MF);
+
+ Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+ if (VT.isVector()) {
+ unsigned NumElemVT = VT.getVectorNumElements();
+ EVT ElemVT = VT.getVectorElementType();
+ SDValue Loads[4];
+
+ assert(NumElemVT <= 4);
+ assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+ "vector width in load");
+
+ // One REGISTER_LOAD per element. Ptr is advanced cumulatively by the
+ // increment getStackAddress() reports for each element.
+ for (unsigned i = 0; i < NumElemVT; ++i) {
+ unsigned Channel, PtrIncr;
+ getStackAddress(StackWidth, i, Channel, PtrIncr);
+ Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+ DAG.getConstant(PtrIncr, DL, MVT::i32));
+ Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
+ Chain, Ptr,
+ DAG.getTargetConstant(Channel, DL, MVT::i32),
+ Op.getOperand(2));
+ }
+ EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
+ LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
+ } else {
+ LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
+ Chain, Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32), // Channel
+ Op.getOperand(2));
+ }
+
+ SDValue Ops[2] = {
+ LoweredLoad,
+ Chain
+ };
+
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ /// Lower ISD::BRCOND to AMDGPUISD::BRANCH_COND. The incoming operands are
+ /// (chain, condition, jump target); the target node takes the jump target
+ /// before the condition.
+ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+   const SDValue InChain = Op.getOperand(0);
+   const SDValue Condition = Op.getOperand(1);
+   const SDValue Target = Op.getOperand(2);
+
+   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
+                      InChain, Target, Condition);
+ }
+
+ /// Replace a frame index node with a constant: the offset reported by the
+ /// frame lowering for that index, multiplied by 4 and by the stack width.
+ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
+                                             SelectionDAG &DAG) const {
+   MachineFunction &MF = DAG.getMachineFunction();
+   const R600FrameLowering *FrameLowering = getSubtarget()->getFrameLowering();
+
+   unsigned FI = cast<FrameIndexSDNode>(Op)->getIndex();
+
+   // getFrameIndexReference() also reports a frame register, which is not
+   // needed here.
+   unsigned UnusedFrameReg;
+   unsigned SlotOffset =
+       FrameLowering->getFrameIndexReference(MF, FI, UnusedFrameReg);
+   unsigned ScaledOffset = SlotOffset * 4 * FrameLowering->getStackWidth(MF);
+
+   return DAG.getConstant(ScaledOffset, SDLoc(Op), Op.getValueType());
+ }
+
+ /// XXX Only kernel functions are supported, so we can assume for now that
+ /// every function is a kernel function, but in the future we should use
+ /// separate calling conventions for kernel and non-kernel functions.
+ ///
+ /// For shader calling conventions every argument is read from a live-in
+ /// register of the R600_Reg128 class. For every other calling convention each
+ /// argument is loaded from CONSTANT_BUFFER_0 at its assigned memory offset
+ /// plus the subtarget's explicit kernel argument offset. One value is appended
+ /// to \p InVals per entry of \p Ins, and the incoming \p Chain is returned
+ /// unchanged.
+ SDValue R600TargetLowering::LowerFormalArguments(
+     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+   SmallVector<CCValAssign, 16> ArgLocs;
+   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                  *DAG.getContext());
+   MachineFunction &MF = DAG.getMachineFunction();
+   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+   // The calling convention does not change while arguments are processed.
+   const bool IsShader = AMDGPU::isShader(CallConv);
+
+   if (IsShader) {
+     AnalyzeFormalArguments(CCInfo, Ins);
+   } else {
+     analyzeFormalArgumentsCompute(CCInfo, Ins);
+   }
+
+   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
+     CCValAssign &VA = ArgLocs[i];
+     const ISD::InputArg &In = Ins[i];
+     EVT VT = In.VT;
+
+     if (IsShader) {
+       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+       InVals.push_back(Register);
+       continue;
+     }
+
+     EVT MemVT = VA.getLocVT();
+     if (!VT.isVector() && MemVT.isVector()) {
+       // Get load source type if scalarized.
+       MemVT = MemVT.getVectorElementType();
+     }
+
+     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                           AMDGPUAS::CONSTANT_BUFFER_0);
+
+     // i64 isn't a legal type, so the register type used ends up as i32, which
+     // isn't expected here. It attempts to create this sextload, but it ends up
+     // being invalid. Somehow this seems to work with i64 arguments, but breaks
+     // for <1 x i64>.
+
+     // The first 36 bytes of the input buffer contains information about
+     // thread group and global sizes.
+     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
+     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
+       // FIXME: This should really check the extload type, but the handling of
+       // extload vector parameters seems to be broken.
+
+       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+       Ext = ISD::SEXTLOAD;
+     }
+
+     // Compute the offset from the value.
+     // XXX - I think PartOffset should give you this, but it seems to give the
+     // size of the register which isn't useful.
+
+     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
+     unsigned PartOffset = VA.getLocMemOffset();
+     unsigned Offset = Subtarget->getExplicitKernelArgOffset() + PartOffset;
+
+     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
+     // 4 is the preferred alignment for the CONSTANT memory space.
+     SDValue Arg = DAG.getLoad(
+         ISD::UNINDEXED, Ext, VT, DL, Chain,
+         DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
+         MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
+                                         MachineMemOperand::MODereferenceable |
+                                         MachineMemOperand::MOInvariant);
+
+     InVals.push_back(Arg);
+     // Record the offset just past this argument.
+     MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
+   }
+   return Chain;
+ }
+
+ /// Result type of a SETCC on \p VT: the same vector shape with integer
+ /// elements for vectors, i32 for everything else.
+ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                                            EVT VT) const {
+   if (VT.isVector())
+     return VT.changeVectorElementTypeToInteger();
+
+   return MVT::i32;
+ }
+
+ /// Report whether a misaligned access of type \p VT is allowed.
+ ///
+ /// Non-simple types, MVT::Other and types narrower than 32 bits are rejected
+ /// and reported as not fast. Every other type is reported as fast, but the
+ /// access is only allowed when the type is wider than 32 bits and \p Align is
+ /// a multiple of 4. \p IsFast may be null.
+ bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                         unsigned AddrSpace,
+                                                         unsigned Align,
+                                                         bool *IsFast) const {
+   const bool Rejected =
+       !VT.isSimple() || VT == MVT::Other || VT.bitsLT(MVT::i32);
+
+   // TODO: This is a rough estimate.
+   if (IsFast)
+     *IsFast = !Rejected;
+
+   if (Rejected)
+     return false;
+
+   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+ }
+
+ /// Rebuild the 4-element BUILD_VECTOR \p VectorEntry with as many undef
+ /// elements as possible, recording in \p RemapSwizzle (which must be empty on
+ /// entry) the swizzle selector to use for each element that was dropped:
+ ///  - undef elements map to 7 (SEL_MASK_WRITE);
+ ///  - floating-point constants 0.0 and 1.0 map to 4 (SEL_0) and 5 (SEL_1);
+ ///  - an element equal to an earlier element maps to that earlier index.
+ static SDValue CompactSwizzlableVector(
+     SelectionDAG &DAG, SDValue VectorEntry,
+     DenseMap<unsigned, unsigned> &RemapSwizzle) {
+   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+   assert(RemapSwizzle.empty());
+
+   SDValue Elts[4];
+   for (unsigned Lane = 0; Lane != 4; ++Lane)
+     Elts[Lane] = VectorEntry.getOperand(Lane);
+
+   for (unsigned Lane = 0; Lane != 4; ++Lane) {
+     SDValue &Elt = Elts[Lane];
+
+     if (Elt.isUndef()) {
+       // We mask write here to teach later passes that this element of the
+       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
+       // break false dependencies and additionally make assembly easier to
+       // read.
+       RemapSwizzle[Lane] = 7; // SEL_MASK_WRITE
+     } else if (ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Elt)) {
+       // 0.0 and 1.0 are available as swizzle selectors, so the element itself
+       // is no longer needed.
+       if (FPC->isZero()) {
+         RemapSwizzle[Lane] = 4; // SEL_0
+         Elt = DAG.getUNDEF(MVT::f32);
+       } else if (FPC->isExactlyValue(1.0)) {
+         RemapSwizzle[Lane] = 5; // SEL_1
+         Elt = DAG.getUNDEF(MVT::f32);
+       }
+     }
+
+     if (Elt.isUndef())
+       continue;
+
+     // Reuse the first earlier lane that already carries the same value.
+     for (unsigned Prev = 0; Prev != Lane; ++Prev) {
+       if (Elt != Elts[Prev])
+         continue;
+       Elt = DAG.getUNDEF(Elt.getValueType());
+       RemapSwizzle[Lane] = Prev;
+       break;
+     }
+   }
+
+   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
+                             Elts);
+ }
+
+ /// Rebuild the 4-element BUILD_VECTOR \p VectorEntry so that one element
+ /// produced by an EXTRACT_VECTOR_ELT sits in the lane it was extracted from.
+ ///
+ /// \p RemapSwizzle (which must be empty on entry) is filled with the identity
+ /// mapping for all four lanes, with the two swapped lanes exchanged. At most
+ /// one swap is performed. An element that already sits in its own lane is
+ /// never displaced.
+ ///
+ /// Extracts whose index is not a constant, or whose constant index is not a
+ /// valid lane of this vector (>= 4), are left where they are: the index is
+ /// used to subscript 4-element arrays and must not be read or trusted
+ /// unchecked.
+ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
+                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
+   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+   assert(RemapSwizzle.empty());
+   SDValue NewBldVec[4] = {
+     VectorEntry.getOperand(0),
+     VectorEntry.getOperand(1),
+     VectorEntry.getOperand(2),
+     VectorEntry.getOperand(3)
+   };
+
+   // Sentinel returned by ExtractedLane when an element cannot be moved.
+   const unsigned NoLane = 4;
+
+   // Lane an element was extracted from, or NoLane if the element is not an
+   // EXTRACT_VECTOR_ELT with a constant, in-range index.
+   auto ExtractedLane = [NoLane](SDValue Elt) -> unsigned {
+     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+       return NoLane;
+     ConstantSDNode *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1));
+     if (!IdxC || IdxC->getZExtValue() >= NoLane)
+       return NoLane;
+     return IdxC->getZExtValue();
+   };
+
+   // Start from the identity mapping and pin every element that is already in
+   // the lane it was extracted from.
+   bool isUnmovable[4] = { false, false, false, false };
+   for (unsigned i = 0; i < 4; i++) {
+     RemapSwizzle[i] = i;
+     if (ExtractedLane(NewBldVec[i]) == i)
+       isUnmovable[i] = true;
+   }
+
+   // Move the first misplaced extract into its source lane, unless that lane
+   // is pinned.
+   for (unsigned i = 0; i < 4; i++) {
+     unsigned Idx = ExtractedLane(NewBldVec[i]);
+     if (Idx == NoLane || isUnmovable[Idx])
+       continue;
+     // Swap i and Idx
+     std::swap(NewBldVec[Idx], NewBldVec[i]);
+     std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
+     break;
+   }
+
+   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
+                             NewBldVec);
+ }
+
+ /// Simplify \p BuildVector in two passes (CompactSwizzlableVector, then
+ /// ReorganizeVector) and rewrite the four constant swizzle selectors in
+ /// \p Swz after each pass according to the remapping that pass produced.
+ ///
+ /// \returns the rewritten BUILD_VECTOR.
+ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
+                                             SelectionDAG &DAG,
+                                             const SDLoc &DL) const {
+   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
+
+   // Old -> New swizzle values
+   DenseMap<unsigned, unsigned> SwizzleRemap;
+
+   // Replace every selector that has an entry in SwizzleRemap with its new
+   // value; selectors without an entry are left untouched.
+   auto ApplyRemap = [&]() {
+     for (unsigned Chan = 0; Chan != 4; ++Chan) {
+       unsigned OldSel = cast<ConstantSDNode>(Swz[Chan])->getZExtValue();
+       DenseMap<unsigned, unsigned>::iterator It = SwizzleRemap.find(OldSel);
+       if (It != SwizzleRemap.end())
+         Swz[Chan] = DAG.getConstant(It->second, DL, MVT::i32);
+     }
+   };
+
+   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
+   ApplyRemap();
+
+   // The second pass requires an empty map.
+   SwizzleRemap.clear();
+   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
+   ApplyRemap();
+
+   return BuildVector;
+ }
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Optimizations
+//===----------------------------------------------------------------------===//
+
+ /// R600-specific DAG combines, dispatched on the opcode of \p N:
+ ///  - FP_ROUND of an f64 UINT_TO_FP,
+ ///  - FP_TO_SINT of a negated 1.0/0.0 SELECT_CC,
+ ///  - INSERT_VECTOR_ELT into a BUILD_VECTOR or undef vector,
+ ///  - EXTRACT_VECTOR_ELT from a (possibly bitcast) BUILD_VECTOR,
+ ///  - SELECT_CC whose LHS is another SELECT_CC,
+ ///  - swizzle optimization for R600_EXPORT and TEXTURE_FETCH.
+ /// Cases that `break`, and opcodes not listed, fall through to
+ /// AMDGPUTargetLowering::PerformDAGCombine(). Cases that return an empty
+ /// SDValue do not.
+ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ switch (N->getOpcode()) {
+ // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
+ case ISD::FP_ROUND: {
+ SDValue Arg = N->getOperand(0);
+ if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
+ return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
+ Arg.getOperand(0));
+ }
+ break;
+ }
+
+ // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
+ // (i32 select_cc f32, f32, -1, 0 cc)
+ //
+ // Mesa's GLSL frontend generates the above pattern a lot and we can lower
+ // this to one of the SET*_DX10 instructions.
+ case ISD::FP_TO_SINT: {
+ SDValue FNeg = N->getOperand(0);
+ if (FNeg.getOpcode() != ISD::FNEG) {
+ return SDValue();
+ }
+ SDValue SelectCC = FNeg.getOperand(0);
+ if (SelectCC.getOpcode() != ISD::SELECT_CC ||
+ SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
+ SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
+ !isHWTrueValue(SelectCC.getOperand(2)) ||
+ !isHWFalseValue(SelectCC.getOperand(3))) {
+ return SDValue();
+ }
+
+ return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
+ SelectCC.getOperand(0), // LHS
+ SelectCC.getOperand(1), // RHS
+ DAG.getConstant(-1, DL, MVT::i32), // True
+ DAG.getConstant(0, DL, MVT::i32), // False
+ SelectCC.getOperand(4)); // CC
+
+ // Unreachable: every path above returns.
+ break;
+ }
+
+ // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
+ // => build_vector elt0, ... , NewEltIdx, ... , eltN
+ case ISD::INSERT_VECTOR_ELT: {
+ SDValue InVec = N->getOperand(0);
+ SDValue InVal = N->getOperand(1);
+ SDValue EltNo = N->getOperand(2);
+
+ // If the inserted element is an UNDEF, just use the input vector.
+ if (InVal.isUndef())
+ return InVec;
+
+ EVT VT = InVec.getValueType();
+
+ // If we can't generate a legal BUILD_VECTOR, exit
+ if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
+ return SDValue();
+
+ // Check that we know which element is being inserted
+ if (!isa<ConstantSDNode>(EltNo))
+ return SDValue();
+ unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+ // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
+ // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
+ // vector elements.
+ SmallVector<SDValue, 8> Ops;
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ Ops.append(InVec.getNode()->op_begin(),
+ InVec.getNode()->op_end());
+ } else if (InVec.isUndef()) {
+ unsigned NElts = VT.getVectorNumElements();
+ Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
+ } else {
+ return SDValue();
+ }
+
+ // Insert the element
+ // An out-of-range index leaves Ops unchanged.
+ if (Elt < Ops.size()) {
+ // All the operands of BUILD_VECTOR must have the same type;
+ // we enforce that here.
+ EVT OpVT = Ops[0].getValueType();
+ if (InVal.getValueType() != OpVT)
+ InVal = OpVT.bitsGT(InVal.getValueType()) ?
+ DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
+ DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
+ Ops[Elt] = InVal;
+ }
+
+ // Return the new vector
+ return DAG.getBuildVector(VT, DL, Ops);
+ }
+
+ // Extract_vec (Build_vector) generated by custom lowering
+ // also needs to be customly combined
+ case ISD::EXTRACT_VECTOR_ELT: {
+ SDValue Arg = N->getOperand(0);
+ // Extracting a constant index from a BUILD_VECTOR yields that operand.
+ // NOTE(review): Element is not checked against the operand count here.
+ if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
+ if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ unsigned Element = Const->getZExtValue();
+ return Arg->getOperand(Element);
+ }
+ }
+ // Same through a bitcast that keeps the element count: bitcast the selected
+ // BUILD_VECTOR operand instead.
+ if (Arg.getOpcode() == ISD::BITCAST &&
+ Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+ (Arg.getOperand(0).getValueType().getVectorNumElements() ==
+ Arg.getValueType().getVectorNumElements())) {
+ if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ unsigned Element = Const->getZExtValue();
+ return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
+ Arg->getOperand(0).getOperand(Element));
+ }
+ }
+ break;
+ }
+
+ case ISD::SELECT_CC: {
+ // Try common optimizations
+ if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
+ return Ret;
+
+ // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
+ // selectcc x, y, a, b, inv(cc)
+ //
+ // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
+ // selectcc x, y, a, b, cc
+ SDValue LHS = N->getOperand(0);
+ if (LHS.getOpcode() != ISD::SELECT_CC) {
+ return SDValue();
+ }
+
+ SDValue RHS = N->getOperand(1);
+ SDValue True = N->getOperand(2);
+ SDValue False = N->getOperand(3);
+ ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+ // The outer select must reuse the inner select's true/false values and
+ // compare against the false value.
+ if (LHS.getOperand(2).getNode() != True.getNode() ||
+ LHS.getOperand(3).getNode() != False.getNode() ||
+ RHS.getNode() != False.getNode()) {
+ return SDValue();
+ }
+
+ switch (NCC) {
+ default: return SDValue();
+ case ISD::SETNE: return LHS;
+ case ISD::SETEQ: {
+ ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
+ LHSCC = ISD::getSetCCInverse(LHSCC,
+ LHS.getOperand(0).getValueType().isInteger());
+ // Only use the inverted condition if it is legal, or if legalization has
+ // not run yet.
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
+ return DAG.getSelectCC(DL,
+ LHS.getOperand(0),
+ LHS.getOperand(1),
+ LHS.getOperand(2),
+ LHS.getOperand(3),
+ LHSCC);
+ break;
+ }
+ }
+ return SDValue();
+ }
+
+ case AMDGPUISD::R600_EXPORT: {
+ SDValue Arg = N->getOperand(1);
+ if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+ break;
+
+ // Slot 1 is filled in below with the optimized vector.
+ SDValue NewArgs[8] = {
+ N->getOperand(0), // Chain
+ SDValue(),
+ N->getOperand(2), // ArrayBase
+ N->getOperand(3), // Type
+ N->getOperand(4), // SWZ_X
+ N->getOperand(5), // SWZ_Y
+ N->getOperand(6), // SWZ_Z
+ N->getOperand(7) // SWZ_W
+ };
+ // OptimizeSwizzle rewrites the four swizzle operands (slots 4-7) in place.
+ NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
+ return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
+ }
+ case AMDGPUISD::TEXTURE_FETCH: {
+ SDValue Arg = N->getOperand(1);
+ if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+ break;
+
+ SDValue NewArgs[19] = {
+ N->getOperand(0),
+ N->getOperand(1),
+ N->getOperand(2),
+ N->getOperand(3),
+ N->getOperand(4),
+ N->getOperand(5),
+ N->getOperand(6),
+ N->getOperand(7),
+ N->getOperand(8),
+ N->getOperand(9),
+ N->getOperand(10),
+ N->getOperand(11),
+ N->getOperand(12),
+ N->getOperand(13),
+ N->getOperand(14),
+ N->getOperand(15),
+ N->getOperand(16),
+ N->getOperand(17),
+ N->getOperand(18),
+ };
+ // Slots 2-5 are passed as the four swizzle operands and rewritten in place.
+ NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
+ return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
+ }
+ default: break;
+ }
+
+ return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+ }
+
+ /// Try to fold the machine node that produces \p Src into the operand slots
+ /// of \p ParentNode.
+ ///
+ /// \p Src, \p Neg, \p Abs, \p Sel and \p Imm refer to the caller's copies of
+ /// the parent's operands and are updated in place on success. A caller that
+ /// has no slot for a given modifier passes an empty SDValue; the
+ /// corresponding fold is then refused.
+ ///
+ /// Handled producers:
+ ///  - FNEG_R600 / FABS_R600: folded into the neg / abs modifier;
+ ///  - CONST_COPY: folded into an ALU_CONST read through \p Sel, provided the
+ ///    parent's constant reads still satisfy fitsConstReadLimitations();
+ ///  - MOV_IMM_GLOBAL_ADDR, MOV_IMM_I32, MOV_IMM_F32: folded into the literal
+ ///    slot \p Imm, or into an inline constant register (ZERO, HALF, ONE,
+ ///    ONE_INT) when the value allows it.
+ ///
+ /// \returns true if an operand was changed.
+ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
+                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
+                                      SDValue &Sel, SDValue &Imm,
+                                      SelectionDAG &DAG) const {
+   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+   if (!Src.isMachineOpcode())
+     return false;
+
+   switch (Src.getMachineOpcode()) {
+   case AMDGPU::FNEG_R600:
+     if (!Neg.getNode())
+       return false;
+     Src = Src.getOperand(0);
+     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
+     return true;
+   case AMDGPU::FABS_R600:
+     if (!Abs.getNode())
+       return false;
+     Src = Src.getOperand(0);
+     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
+     return true;
+   case AMDGPU::CONST_COPY: {
+     unsigned Opcode = ParentNode->getMachineOpcode();
+     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+
+     if (!Sel.getNode())
+       return false;
+
+     SDValue CstOffset = Src.getOperand(0);
+     if (ParentNode->getValueType(0).isVector())
+       return false;
+
+     // Gather constants values
+     int SrcIndices[] = {
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+     };
+     std::vector<unsigned> Consts;
+     for (int OtherSrcIdx : SrcIndices) {
+       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
+       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
+         continue;
+       // The indices above count the dst operand when the instruction has
+       // one; the node's operand list is indexed without it.
+       if (HasDst) {
+         OtherSrcIdx--;
+         OtherSelIdx--;
+       }
+       if (RegisterSDNode *Reg =
+               dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
+         if (Reg->getReg() == AMDGPU::ALU_CONST) {
+           ConstantSDNode *Cst
+               = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
+           Consts.push_back(Cst->getZExtValue());
+         }
+       }
+     }
+
+     // Add the constant being folded and check the whole set.
+     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
+     Consts.push_back(Cst->getZExtValue());
+     if (!TII->fitsConstReadLimitations(Consts)) {
+       return false;
+     }
+
+     Sel = CstOffset;
+     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
+     return true;
+   }
+   case AMDGPU::MOV_IMM_GLOBAL_ADDR:
+     // Some callers have no literal slot and pass an empty SDValue; nothing
+     // can be folded then.
+     if (!Imm.getNode())
+       return false;
+     // Check if the Imm slot is used. Taken from below.
+     if (cast<ConstantSDNode>(Imm)->getZExtValue())
+       return false;
+     Imm = Src.getOperand(0);
+     Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
+     return true;
+   case AMDGPU::MOV_IMM_I32:
+   case AMDGPU::MOV_IMM_F32: {
+     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+     uint64_t ImmValue = 0;
+
+     // Prefer an inline constant register; fall back to a literal otherwise.
+     // The operand is dereferenced unconditionally, so use the asserting
+     // cast<> rather than dyn_cast<>.
+     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
+       ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
+       float FloatValue = FPC->getValueAPF().convertToFloat();
+       if (FloatValue == 0.0) {
+         ImmReg = AMDGPU::ZERO;
+       } else if (FloatValue == 0.5) {
+         ImmReg = AMDGPU::HALF;
+       } else if (FloatValue == 1.0) {
+         ImmReg = AMDGPU::ONE;
+       } else {
+         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+       }
+     } else {
+       ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
+       uint64_t Value = C->getZExtValue();
+       if (Value == 0) {
+         ImmReg = AMDGPU::ZERO;
+       } else if (Value == 1) {
+         ImmReg = AMDGPU::ONE_INT;
+       } else {
+         ImmValue = Value;
+       }
+     }
+
+     // Check that we aren't already using an immediate.
+     // XXX: It's possible for an instruction to have more than one
+     // immediate operand, but this is not supported yet.
+     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+       if (!Imm.getNode())
+         return false;
+       ConstantSDNode *C = cast<ConstantSDNode>(Imm);
+       if (C->getZExtValue())
+         return false;
+       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
+     }
+     Src = DAG.getRegister(ImmReg, MVT::i32);
+     return true;
+   }
+   default:
+     return false;
+   }
+ }
+
+ /// \brief Fold the instructions after selecting them
+ ///
+ /// Tries FoldOperand() on the source operands of \p Node and, on the first
+ /// successful fold, returns a new machine node built from the updated
+ /// operand list. CLAMP_R600 is handled separately: the clamp is folded into
+ /// the instruction that produces its source. If nothing can be folded,
+ /// \p Node itself is returned.
+ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
+ SelectionDAG &DAG) const {
+ const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (!Node->isMachineOpcode())
+ return Node;
+
+ unsigned Opcode = Node->getMachineOpcode();
+ // Empty SDValue passed to FoldOperand() for modifier slots the instruction
+ // does not have.
+ SDValue FakeOp;
+
+ // Mutable copy of the operands; FoldOperand() edits it through references.
+ std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
+
+ if (Opcode == AMDGPU::DOT_4) {
+ int OperandIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ };
+ int NegIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+ };
+ int AbsIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+ };
+ for (unsigned i = 0; i < 8; i++) {
+ if (OperandIdx[i] < 0)
+ return Node;
+ // Indices from getOperandIdx() are shifted down by one to index Ops -
+ // presumably because they count the dst operand, which the node's operand
+ // list does not contain (compare the HasDst adjustment below).
+ // NOTE(review): NegIdx/AbsIdx are not checked for -1 before being used.
+ SDValue &Src = Ops[OperandIdx[i] - 1];
+ SDValue &Neg = Ops[NegIdx[i] - 1];
+ SDValue &Abs = Ops[AbsIdx[i] - 1];
+ bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+ if (HasDst)
+ SelIdx--;
+ SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+ // No literal slot is offered here, so FakeOp is passed as Imm.
+ if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
+ } else if (Opcode == AMDGPU::REG_SEQUENCE) {
+ // Visit every second operand starting at 1; no modifier slots are offered.
+ for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
+ SDValue &Src = Ops[i];
+ if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
+ } else if (Opcode == AMDGPU::CLAMP_R600) {
+ // Fold the clamp into the producer of its source by setting that
+ // instruction's clamp operand to 1.
+ SDValue Src = Node->getOperand(0);
+ if (!Src.isMachineOpcode() ||
+ !TII->hasInstrModifiers(Src.getMachineOpcode()))
+ return Node;
+ int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
+ AMDGPU::OpName::clamp);
+ if (ClampIdx < 0)
+ return Node;
+ SDLoc DL(Node);
+ // Note: this Ops shadows the outer Ops and holds the operands of Src.
+ std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
+ Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
+ return DAG.getMachineNode(Src.getMachineOpcode(), DL,
+ Node->getVTList(), Ops);
+ } else {
+ if (!TII->hasInstrModifiers(Opcode))
+ return Node;
+ int OperandIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
+ };
+ int NegIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+ };
+ // No abs operand is looked up for src2.
+ int AbsIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
+ -1
+ };
+ for (unsigned i = 0; i < 3; i++) {
+ // Stop at the first source operand the instruction does not have.
+ if (OperandIdx[i] < 0)
+ return Node;
+ SDValue &Src = Ops[OperandIdx[i] - 1];
+ SDValue &Neg = Ops[NegIdx[i] - 1];
+ SDValue FakeAbs;
+ SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
+ bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+ int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
+ if (HasDst) {
+ SelIdx--;
+ ImmIdx--;
+ }
+ SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+ // NOTE(review): ImmIdx is not checked for being negative before indexing.
+ SDValue &Imm = Ops[ImmIdx];
+ if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
+ }
+
+ return Node;
+ }
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
new file mode 100644
index 000000000000..9700ce14c6f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -0,0 +1,104 @@
+//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+
+namespace llvm {
+
+class R600InstrInfo;
+class R600Subtarget;
+
+ class R600TargetLowering final : public AMDGPUTargetLowering { // R600 lowering hooks layered on the shared AMDGPUTargetLowering base.
+ public:
+ R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI);
+
+ const R600Subtarget *getSubtarget() const;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ void ReplaceNodeResults(SDNode * N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const override;
+
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+ unsigned Align,
+ bool *IsFast) const override;
+
+ private:
+ unsigned Gen; // NOTE(review): presumably the hardware generation; it is initialised outside this header - confirm.
+ /// Each OpenCL kernel has nine implicit parameters that are stored in the
+ /// first nine dwords of a Vertex Buffer. These implicit parameters are
+ /// lowered to load instructions which retrieve the values from the Vertex
+ /// Buffer.
+ SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, const SDLoc &DL,
+ unsigned DwordOffset) const;
+
+ void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
+ MachineRegisterInfo & MRI, unsigned dword_offset) const; // MachineInstr-level overload; differs from the SDValue one above only in the case of its first letter.
+ SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG,
+ const SDLoc &DL) const;
+ SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
+
+ SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+ SelectionDAG &DAG) const override;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
+ unsigned mainop, unsigned ovf) const;
+
+ SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
+ SelectionDAG &DAG) const;
+ void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
+ unsigned &Channel, unsigned &PtrIncr) const; // Channel and PtrIncr are output parameters (non-const references).
+ bool isZero(SDValue Op) const;
+ bool isHWTrueValue(SDValue Op) const;
+ bool isHWFalseValue(SDValue Op) const;
+
+ bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
+ SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
+ SelectionDAG &DAG) const; // Src/Neg/Abs/Sel/Imm are non-const references, so the callee can rewrite them in place.
+
+ SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; // Overrides the base-class post-instruction-selection folding hook.
+ };
+
+} // End namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
new file mode 100644
index 000000000000..68fcc545916a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
@@ -0,0 +1,495 @@
+//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin>
+ : AMDGPUInst <outs, ins, asm, pattern> { // Base record for R600 instructions: 64-bit encoding plus per-instruction flag bits.
+
+ field bits<64> Inst;
+ bit Trig = 0; // All flags below default to 0; each is copied into one TSFlags bit further down.
+ bit Op3 = 0;
+ bit isVector = 0;
+ bits<2> FlagOperandIdx = 0;
+ bit Op1 = 0;
+ bit Op2 = 0;
+ bit LDS_1A = 0;
+ bit LDS_1A1D = 0;
+ bit HasNativeOperands = 0;
+ bit VTXInst = 0;
+ bit TEXInst = 0;
+ bit ALUInst = 0;
+ bit IsExport = 0;
+ bit LDS_1A2D = 0;
+
+ let Namespace = "AMDGPU";
+ let OutOperandList = outs;
+ let InOperandList = ins;
+ let AsmString = asm;
+ let Pattern = pattern;
+ let Itinerary = itin;
+
+ // No AsmMatcher support.
+ let isCodeGenOnly = 1;
+
+ let TSFlags{4} = Trig; // TSFlags bits 3-0 are not assigned in this class.
+ let TSFlags{5} = Op3;
+
+ // Vector instructions are instructions that must fill all slots in an
+ // instruction group
+ let TSFlags{6} = isVector;
+ let TSFlags{8-7} = FlagOperandIdx;
+ let TSFlags{9} = HasNativeOperands;
+ let TSFlags{10} = Op1; // Note: bit positions do not follow the declaration order of the flags above.
+ let TSFlags{11} = Op2;
+ let TSFlags{12} = VTXInst;
+ let TSFlags{13} = TEXInst;
+ let TSFlags{14} = ALUInst;
+ let TSFlags{15} = LDS_1A;
+ let TSFlags{16} = LDS_1A1D;
+ let TSFlags{17} = IsExport;
+ let TSFlags{18} = LDS_1A2D;
+ }
+
+//===----------------------------------------------------------------------===//
+// ALU instructions
+//===----------------------------------------------------------------------===//
+
+ class R600_ALU_LDS_Word0 { // First 32-bit encoding word; R600ALU_Word0 derives from this class.
+ field bits<32> Word0;
+
+ bits<11> src0;
+ bits<1> src0_rel;
+ bits<11> src1;
+ bits<1> src1_rel;
+ bits<3> index_mode = 0;
+ bits<2> pred_sel;
+ bits<1> last;
+
+ bits<9> src0_sel = src0{8-0}; // Low 9 bits of the 11-bit src0 operand.
+ bits<2> src0_chan = src0{10-9}; // Top 2 bits of the 11-bit src0 operand.
+ bits<9> src1_sel = src1{8-0}; // Same split for src1.
+ bits<2> src1_chan = src1{10-9};
+
+ let Word0{8-0} = src0_sel;
+ let Word0{9} = src0_rel;
+ let Word0{11-10} = src0_chan;
+ let Word0{21-13} = src1_sel; // Bit 12 is not assigned in this class.
+ let Word0{22} = src1_rel;
+ let Word0{24-23} = src1_chan;
+ let Word0{28-26} = index_mode; // Bit 25 is not assigned in this class.
+ let Word0{30-29} = pred_sel;
+ let Word0{31} = last;
+ }
+
+ class R600ALU_Word0 : R600_ALU_LDS_Word0 { // Adds the two source-negate bits to the inherited Word0 layout.
+
+ bits<1> src0_neg;
+ bits<1> src1_neg;
+
+ let Word0{12} = src0_neg; // Fills bit 12, which the parent class leaves unassigned.
+ let Word0{25} = src1_neg; // Fills bit 25, which the parent class leaves unassigned.
+ }
+
+ class R600ALU_Word1 { // Upper part (bits 31-18) of the second ALU word; subclasses assign bits 17-0.
+ field bits<32> Word1;
+
+ bits<11> dst;
+ bits<3> bank_swizzle;
+ bits<1> dst_rel;
+ bits<1> clamp;
+
+ bits<7> dst_sel = dst{6-0}; // Low 7 bits of dst; dst bits 8-7 are not used by either derived field.
+ bits<2> dst_chan = dst{10-9}; // Top 2 bits of dst.
+
+ let Word1{20-18} = bank_swizzle;
+ let Word1{27-21} = dst_sel;
+ let Word1{28} = dst_rel;
+ let Word1{30-29} = dst_chan;
+ let Word1{31} = clamp;
+ }
+
+ class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{ // Two-operand form: fills Word1 bits 17-0 with modifiers and an 11-bit opcode.
+
+ bits<1> src0_abs;
+ bits<1> src1_abs;
+ bits<1> update_exec_mask;
+ bits<1> update_pred;
+ bits<1> write;
+ bits<2> omod;
+
+ let Word1{0} = src0_abs;
+ let Word1{1} = src1_abs;
+ let Word1{2} = update_exec_mask;
+ let Word1{3} = update_pred;
+ let Word1{4} = write;
+ let Word1{6-5} = omod;
+ let Word1{17-7} = alu_inst; // alu_inst is the class template argument.
+ }
+
+ class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{ // Three-operand form: Word1 bits 12-0 hold the third source, bits 17-13 a 5-bit opcode.
+
+ bits<11> src2;
+ bits<1> src2_rel;
+ bits<1> src2_neg;
+
+ bits<9> src2_sel = src2{8-0}; // Low 9 bits of the 11-bit src2 operand.
+ bits<2> src2_chan = src2{10-9}; // Top 2 bits of the 11-bit src2 operand.
+
+ let Word1{8-0} = src2_sel;
+ let Word1{9} = src2_rel;
+ let Word1{11-10} = src2_chan;
+ let Word1{12} = src2_neg;
+ let Word1{17-13} = alu_inst; // alu_inst is the class template argument.
+ }
+
+ class R600LDS_Word1 { // Second encoding word for LDS instructions; standalone (does not derive from R600ALU_Word1).
+ field bits<32> Word1;
+
+ bits<11> src2;
+ bits<9> src2_sel = src2{8-0};
+ bits<2> src2_chan = src2{10-9};
+ bits<1> src2_rel;
+ // NOTE(review): this class declares no field named 'offset'; the following
+ // two comment lines look stale or misplaced - confirm.
+ // offset specifies the stride offset to the second set of data to be read
+ // from. This is a dword offset.
+ bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP
+ bits<3> bank_swizzle;
+ bits<6> lds_op;
+ bits<2> dst_chan = 0;
+
+ let Word1{8-0} = src2_sel;
+ let Word1{9} = src2_rel;
+ let Word1{11-10} = src2_chan;
+ let Word1{17-13} = alu_inst; // Bit 12 is not assigned in this class.
+ let Word1{20-18} = bank_swizzle;
+ let Word1{26-21} = lds_op;
+ let Word1{30-29} = dst_chan; // Bits 28-27 and bit 31 are not assigned in this class.
+ }
+
+
+/*
+XXX: R600 subtarget uses a slightly different encoding than the other
+subtargets. We currently handle this in R600MCCodeEmitter, but we may
+want to use these instruction classes in the future.
+
+class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 {
+
+ bits<1> fog_merge;
+ bits<10> alu_inst;
+
+ let Inst{37} = fog_merge;
+ let Inst{39-38} = omod;
+ let Inst{49-40} = alu_inst;
+}
+
+class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 {
+
+ bits<11> alu_inst;
+
+ let Inst{38-37} = omod;
+ let Inst{49-39} = alu_inst;
+}
+*/
+
+//===----------------------------------------------------------------------===//
+// Vertex Fetch instructions
+//===----------------------------------------------------------------------===//
+
+ class VTX_WORD0 { // Common low 26 bits of the first vertex-fetch word; bits 31-26 are assigned by VTX_WORD0_eg / VTX_WORD0_cm.
+ field bits<32> Word0;
+ bits<7> src_gpr;
+ bits<5> VC_INST;
+ bits<2> FETCH_TYPE;
+ bits<1> FETCH_WHOLE_QUAD;
+ bits<8> buffer_id;
+ bits<1> SRC_REL;
+ bits<2> SRC_SEL_X;
+
+ let Word0{4-0} = VC_INST;
+ let Word0{6-5} = FETCH_TYPE;
+ let Word0{7} = FETCH_WHOLE_QUAD;
+ let Word0{15-8} = buffer_id;
+ let Word0{22-16} = src_gpr;
+ let Word0{23} = SRC_REL;
+ let Word0{25-24} = SRC_SEL_X;
+ }
+
+ class VTX_WORD0_eg : VTX_WORD0 { // Variant that uses the top six bits of Word0 for MEGA_FETCH_COUNT.
+
+ bits<6> MEGA_FETCH_COUNT;
+
+ let Word0{31-26} = MEGA_FETCH_COUNT;
+ }
+
+ class VTX_WORD0_cm : VTX_WORD0 { // Variant that splits the top six bits of Word0 into four fields.
+
+ bits<2> SRC_SEL_Y;
+ bits<2> STRUCTURED_READ;
+ bits<1> LDS_REQ;
+ bits<1> COALESCED_READ;
+
+ let Word0{27-26} = SRC_SEL_Y;
+ let Word0{29-28} = STRUCTURED_READ;
+ let Word0{30} = LDS_REQ;
+ let Word0{31} = COALESCED_READ;
+ }
+
+ class VTX_WORD1_GPR { // Second vertex-fetch word; all 32 bits are assigned here.
+ field bits<32> Word1;
+ bits<7> dst_gpr;
+ bits<1> DST_REL;
+ bits<3> DST_SEL_X;
+ bits<3> DST_SEL_Y;
+ bits<3> DST_SEL_Z;
+ bits<3> DST_SEL_W;
+ bits<1> USE_CONST_FIELDS;
+ bits<6> DATA_FORMAT;
+ bits<2> NUM_FORMAT_ALL;
+ bits<1> FORMAT_COMP_ALL;
+ bits<1> SRF_MODE_ALL;
+
+ let Word1{6-0} = dst_gpr;
+ let Word1{7} = DST_REL;
+ let Word1{8} = 0; // Reserved
+ let Word1{11-9} = DST_SEL_X;
+ let Word1{14-12} = DST_SEL_Y;
+ let Word1{17-15} = DST_SEL_Z;
+ let Word1{20-18} = DST_SEL_W;
+ let Word1{21} = USE_CONST_FIELDS;
+ let Word1{27-22} = DATA_FORMAT;
+ let Word1{29-28} = NUM_FORMAT_ALL;
+ let Word1{30} = FORMAT_COMP_ALL;
+ let Word1{31} = SRF_MODE_ALL;
+ }
+
+//===----------------------------------------------------------------------===//
+// Texture fetch instructions
+//===----------------------------------------------------------------------===//
+
+ class TEX_WORD0 { // First texture-fetch word; bits 31-29 are not assigned in this class.
+ field bits<32> Word0;
+
+ bits<5> TEX_INST;
+ bits<2> INST_MOD;
+ bits<1> FETCH_WHOLE_QUAD;
+ bits<8> RESOURCE_ID;
+ bits<7> SRC_GPR;
+ bits<1> SRC_REL;
+ bits<1> ALT_CONST;
+ bits<2> RESOURCE_INDEX_MODE;
+ bits<2> SAMPLER_INDEX_MODE;
+
+ let Word0{4-0} = TEX_INST;
+ let Word0{6-5} = INST_MOD;
+ let Word0{7} = FETCH_WHOLE_QUAD;
+ let Word0{15-8} = RESOURCE_ID;
+ let Word0{22-16} = SRC_GPR;
+ let Word0{23} = SRC_REL;
+ let Word0{24} = ALT_CONST;
+ let Word0{26-25} = RESOURCE_INDEX_MODE;
+ let Word0{28-27} = SAMPLER_INDEX_MODE;
+ }
+
+ class TEX_WORD1 { // Second texture-fetch word: destination register, swizzle, LOD bias and coordinate types.
+ field bits<32> Word1;
+
+ bits<7> DST_GPR;
+ bits<1> DST_REL;
+ bits<3> DST_SEL_X;
+ bits<3> DST_SEL_Y;
+ bits<3> DST_SEL_Z;
+ bits<3> DST_SEL_W;
+ bits<7> LOD_BIAS;
+ bits<1> COORD_TYPE_X;
+ bits<1> COORD_TYPE_Y;
+ bits<1> COORD_TYPE_Z;
+ bits<1> COORD_TYPE_W;
+
+ let Word1{6-0} = DST_GPR;
+ let Word1{7} = DST_REL;
+ let Word1{11-9} = DST_SEL_X; // Bit 8 is not assigned in this class (VTX_WORD1_GPR explicitly zeroes its bit 8 as reserved).
+ let Word1{14-12} = DST_SEL_Y;
+ let Word1{17-15} = DST_SEL_Z;
+ let Word1{20-18} = DST_SEL_W;
+ let Word1{27-21} = LOD_BIAS;
+ let Word1{28} = COORD_TYPE_X;
+ let Word1{29} = COORD_TYPE_Y;
+ let Word1{30} = COORD_TYPE_Z;
+ let Word1{31} = COORD_TYPE_W;
+ }
+
+ class TEX_WORD2 { // Third texture-fetch word: offsets, sampler id and source swizzle; all 32 bits are assigned.
+ field bits<32> Word2;
+
+ bits<5> OFFSET_X;
+ bits<5> OFFSET_Y;
+ bits<5> OFFSET_Z;
+ bits<5> SAMPLER_ID;
+ bits<3> SRC_SEL_X;
+ bits<3> SRC_SEL_Y;
+ bits<3> SRC_SEL_Z;
+ bits<3> SRC_SEL_W;
+
+ let Word2{4-0} = OFFSET_X;
+ let Word2{9-5} = OFFSET_Y;
+ let Word2{14-10} = OFFSET_Z;
+ let Word2{19-15} = SAMPLER_ID;
+ let Word2{22-20} = SRC_SEL_X;
+ let Word2{25-23} = SRC_SEL_Y;
+ let Word2{28-26} = SRC_SEL_Z;
+ let Word2{31-29} = SRC_SEL_W;
+ }
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions
+//===----------------------------------------------------------------------===//
+
+ class CF_WORD1_R600 { // Second control-flow word, R600 layout (7-bit CF_INST at bits 29-23).
+ field bits<32> Word1;
+
+ bits<3> POP_COUNT;
+ bits<5> CF_CONST;
+ bits<2> COND;
+ bits<3> COUNT;
+ bits<6> CALL_COUNT;
+ bits<1> COUNT_3;
+ bits<1> END_OF_PROGRAM;
+ bits<1> VALID_PIXEL_MODE;
+ bits<7> CF_INST;
+ bits<1> WHOLE_QUAD_MODE;
+ bits<1> BARRIER;
+
+ let Word1{2-0} = POP_COUNT;
+ let Word1{7-3} = CF_CONST;
+ let Word1{9-8} = COND;
+ let Word1{12-10} = COUNT;
+ let Word1{18-13} = CALL_COUNT;
+ let Word1{19} = COUNT_3;
+ let Word1{21} = END_OF_PROGRAM; // Bit 20 is not assigned in this class.
+ let Word1{22} = VALID_PIXEL_MODE;
+ let Word1{29-23} = CF_INST;
+ let Word1{30} = WHOLE_QUAD_MODE;
+ let Word1{31} = BARRIER;
+ }
+
+ class CF_WORD0_EG { // First control-flow word, EG layout; bits 31-27 are not assigned in this class.
+ field bits<32> Word0;
+
+ bits<24> ADDR;
+ bits<3> JUMPTABLE_SEL;
+
+ let Word0{23-0} = ADDR;
+ let Word0{26-24} = JUMPTABLE_SEL;
+ }
+
+ class CF_WORD1_EG { // Second control-flow word, EG layout (8-bit CF_INST at bits 29-22, 6-bit COUNT).
+ field bits<32> Word1;
+
+ bits<3> POP_COUNT;
+ bits<5> CF_CONST;
+ bits<2> COND;
+ bits<6> COUNT;
+ bits<1> VALID_PIXEL_MODE;
+ bits<1> END_OF_PROGRAM;
+ bits<8> CF_INST;
+ bits<1> BARRIER;
+
+ let Word1{2-0} = POP_COUNT;
+ let Word1{7-3} = CF_CONST;
+ let Word1{9-8} = COND;
+ let Word1{15-10} = COUNT;
+ let Word1{20} = VALID_PIXEL_MODE; // Bits 19-16 are not assigned in this class.
+ let Word1{21} = END_OF_PROGRAM;
+ let Word1{29-22} = CF_INST;
+ let Word1{31} = BARRIER; // Bit 30 is not assigned in this class.
+ }
+
+ class CF_ALU_WORD0 { // First word of an ALU control-flow instruction; all 32 bits are assigned.
+ field bits<32> Word0;
+
+ bits<22> ADDR;
+ bits<4> KCACHE_BANK0;
+ bits<4> KCACHE_BANK1;
+ bits<2> KCACHE_MODE0;
+
+ let Word0{21-0} = ADDR;
+ let Word0{25-22} = KCACHE_BANK0;
+ let Word0{29-26} = KCACHE_BANK1;
+ let Word0{31-30} = KCACHE_MODE0;
+ }
+
+ class CF_ALU_WORD1 { // Second word of an ALU control-flow instruction; all 32 bits are assigned.
+ field bits<32> Word1;
+
+ bits<2> KCACHE_MODE1;
+ bits<8> KCACHE_ADDR0;
+ bits<8> KCACHE_ADDR1;
+ bits<7> COUNT;
+ bits<1> ALT_CONST;
+ bits<4> CF_INST;
+ bits<1> WHOLE_QUAD_MODE;
+ bits<1> BARRIER;
+
+ let Word1{1-0} = KCACHE_MODE1;
+ let Word1{9-2} = KCACHE_ADDR0;
+ let Word1{17-10} = KCACHE_ADDR1;
+ let Word1{24-18} = COUNT;
+ let Word1{25} = ALT_CONST;
+ let Word1{29-26} = CF_INST;
+ let Word1{30} = WHOLE_QUAD_MODE;
+ let Word1{31} = BARRIER;
+ }
+
+ class CF_ALLOC_EXPORT_WORD0_RAT { // First word of a RAT alloc/export instruction; all 32 bits are assigned (bit 10 is zeroed).
+ field bits<32> Word0;
+
+ bits<4> rat_id;
+ bits<6> rat_inst;
+ bits<2> rim;
+ bits<2> type;
+ bits<7> rw_gpr;
+ bits<1> rw_rel;
+ bits<7> index_gpr;
+ bits<2> elem_size;
+
+ let Word0{3-0} = rat_id;
+ let Word0{9-4} = rat_inst;
+ let Word0{10} = 0; // Reserved
+ let Word0{12-11} = rim;
+ let Word0{14-13} = type;
+ let Word0{21-15} = rw_gpr;
+ let Word0{22} = rw_rel;
+ let Word0{29-23} = index_gpr;
+ let Word0{31-30} = elem_size;
+ }
+
+ class CF_ALLOC_EXPORT_WORD1_BUF { // Second word of a buffer alloc/export instruction; all 32 bits are assigned.
+ field bits<32> Word1;
+
+ bits<12> array_size;
+ bits<4> comp_mask;
+ bits<4> burst_count;
+ bits<1> vpm;
+ bits<1> eop;
+ bits<8> cf_inst;
+ bits<1> mark;
+ bits<1> barrier;
+
+ let Word1{11-0} = array_size;
+ let Word1{15-12} = comp_mask;
+ let Word1{19-16} = burst_count;
+ let Word1{20} = vpm;
+ let Word1{21} = eop;
+ let Word1{29-22} = cf_inst;
+ let Word1{30} = mark;
+ let Word1{31} = barrier;
+ }
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
new file mode 100644
index 000000000000..e88bd076718e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -0,0 +1,1483 @@
+//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Implementation of TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600InstrInfo.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AMDGPUGenDFAPacketizer.inc"
+
+ R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
+ : AMDGPUInstrInfo(ST), RI(), ST(ST) {} // Forwards ST to the base class, default-constructs RI and stores ST in the member of the same name.
+
+ /// \returns true if the VECTOR flag is set in the target-specific flags of
+ /// \p MI's opcode.
+ bool R600InstrInfo::isVector(const MachineInstr &MI) const {
+   const auto &Desc = get(MI.getOpcode());
+   return Desc.TSFlags & R600_InstFlag::VECTOR;
+ }
+
+ /// Insert MOV instruction(s) before \p MI in \p MBB copying \p SrcReg into
+ /// \p DestReg.
+ ///
+ /// When both registers are 128-bit (regular or vertical class) four
+ /// per-channel MOVs are emitted; when both are 64-bit, two. Each per-channel
+ /// MOV also carries an implicit def of the whole \p DestReg. Any other pair is
+ /// copied with a single MOV whose src0 operand gets the \p KillSrc flag.
+ /// \p KillSrc is not applied on the per-channel path and \p DL is not used.
+ void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const DebugLoc &DL, unsigned DestReg,
+                                 unsigned SrcReg, bool KillSrc) const {
+   const bool Dest128 =
+       AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
+       AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg);
+   const bool Src128 =
+       AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
+       AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg);
+   const bool Dest64 =
+       AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
+       AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg);
+   const bool Src64 =
+       AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
+       AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg);
+
+   unsigned NumChannels = 0;
+   if (Dest128 && Src128)
+     NumChannels = 4;
+   else if (Dest64 && Src64)
+     NumChannels = 2;
+
+   if (NumChannels == 0) {
+     // Scalar copy: one MOV, propagating the kill flag to its source.
+     MachineInstr *Mov = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+                                                 DestReg, SrcReg);
+     Mov->getOperand(getOperandIdx(*Mov, AMDGPU::OpName::src0))
+         .setIsKill(KillSrc);
+     return;
+   }
+
+   // Wide copy: one MOV per channel sub-register.
+   for (unsigned Chan = 0; Chan != NumChannels; ++Chan) {
+     const unsigned SubIdx = RI.getSubRegFromChannel(Chan);
+     buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+                             RI.getSubReg(DestReg, SubIdx),
+                             RI.getSubReg(SrcReg, SubIdx))
+         .addReg(DestReg, RegState::Define | RegState::Implicit);
+   }
+ }
+
+ /// \returns true if \p MBBI can be moved into a new basic block, i.e. none of
+ /// its operands is a use of a physical register that the register info
+ /// reports as live across clauses.
+ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MBBI) const {
+   MachineInstr::const_mop_iterator Op = MBBI->operands_begin();
+   const MachineInstr::const_mop_iterator OpEnd = MBBI->operands_end();
+   for (; Op != OpEnd; ++Op) {
+     if (!Op->isReg())
+       continue;
+     if (TargetRegisterInfo::isVirtualRegister(Op->getReg()))
+       continue;
+     if (Op->isUse() && RI.isPhysRegLiveAcrossClauses(Op->getReg()))
+       return false;
+   }
+   return true;
+ }
+
+ /// \returns true if \p Opcode is MOV, MOV_IMM_F32 or MOV_IMM_I32.
+ bool R600InstrInfo::isMov(unsigned Opcode) const {
+   return Opcode == AMDGPU::MOV ||
+          Opcode == AMDGPU::MOV_IMM_F32 ||
+          Opcode == AMDGPU::MOV_IMM_I32;
+ }
+
+ bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
+ return false; // Opcode is ignored: no opcode is classified as a reduction op here.
+ }
+
+ /// \returns true if \p Opcode is one of the four CUBE opcodes (pseudo or real,
+ /// r600 or eg).
+ bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
+   return Opcode == AMDGPU::CUBE_r600_pseudo ||
+          Opcode == AMDGPU::CUBE_r600_real ||
+          Opcode == AMDGPU::CUBE_eg_pseudo ||
+          Opcode == AMDGPU::CUBE_eg_real;
+ }
+
+ /// \returns true if the ALU_INST flag is set for \p Opcode.
+ bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
+   const unsigned Flags = get(Opcode).TSFlags;
+   return (Flags & R600_InstFlag::ALU_INST) != 0;
+ }
+
+ /// \returns true if any of the OP1, OP2 or OP3 flags is set for \p Opcode.
+ bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const {
+   const unsigned Flags = get(Opcode).TSFlags;
+   const unsigned OpMask =
+       R600_InstFlag::OP1 | R600_InstFlag::OP2 | R600_InstFlag::OP3;
+   return (Flags & OpMask) != 0;
+ }
+
+ /// \returns true if any of the LDS_1A, LDS_1A1D or LDS_1A2D flags is set for
+ /// \p Opcode.
+ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
+   const unsigned Flags = get(Opcode).TSFlags;
+   const unsigned LDSMask = R600_InstFlag::LDS_1A | R600_InstFlag::LDS_1A1D |
+                            R600_InstFlag::LDS_1A2D;
+   return (Flags & LDSMask) != 0;
+ }
+
+ /// \returns true if \p Opcode is an LDS instruction that has a dst operand.
+ bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
+   if (!isLDSInstr(Opcode))
+     return false;
+   const int DstIdx = getOperandIdx(Opcode, AMDGPU::OpName::dst);
+   return DstIdx != -1;
+ }
+
+ /// \returns true if \p MI is flagged as an ALU instruction, is a vector or
+ /// cube instruction, or is one of a fixed list of additional opcodes.
+ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   if (isALUInstr(Opc) || isVector(MI) || isCubeOp(Opc))
+     return true;
+
+   // Opcodes accepted even though none of the checks above matched.
+   switch (Opc) {
+   default:
+     return false;
+   case AMDGPU::PRED_X:
+   case AMDGPU::INTERP_PAIR_XY:
+   case AMDGPU::INTERP_PAIR_ZW:
+   case AMDGPU::INTERP_VEC_LOAD:
+   case AMDGPU::COPY:
+   case AMDGPU::DOT_4:
+     return true;
+   }
+ }
+
+ /// \returns true if \p Opcode has the TransALU scheduling class. Always false
+ /// when the subtarget reports the Cayman ISA.
+ bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
+   return !ST.hasCaymanISA() &&
+          get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU;
+ }
+
+ /// Convenience overload: applies the opcode-based check to \p MI's opcode.
+ bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   return isTransOnly(Opc);
+ }
+
+ /// \returns true if \p Opcode has the VecALU scheduling class.
+ bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
+   const unsigned SchedClass = get(Opcode).getSchedClass();
+   return SchedClass == AMDGPU::Sched::VecALU;
+ }
+
+ /// Convenience overload: applies the opcode-based check to \p MI's opcode.
+ bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   return isVectorOnly(Opc);
+ }
+
+ /// \returns true if the IS_EXPORT flag is set for \p Opcode.
+ bool R600InstrInfo::isExport(unsigned Opcode) const {
+   const auto &Desc = get(Opcode);
+   return (Desc.TSFlags & R600_InstFlag::IS_EXPORT) != 0;
+ }
+
+ /// \returns true if the subtarget has a vertex cache and \p Opcode satisfies
+ /// IS_VTX.
+ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
+   if (!ST.hasVertexCache())
+     return false;
+   return IS_VTX(get(Opcode));
+ }
+
+ /// \returns true if \p MI's opcode uses the vertex cache and the enclosing
+ /// function does not have a compute calling convention.
+ bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const {
+   const MachineFunction *MF = MI.getParent()->getParent();
+   if (AMDGPU::isCompute(MF->getFunction()->getCallingConv()))
+     return false;
+   return usesVertexCache(MI.getOpcode());
+ }
+
+ /// \returns true if \p Opcode satisfies IS_TEX, or satisfies IS_VTX on a
+ /// subtarget without a vertex cache.
+ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
+   if (!ST.hasVertexCache() && IS_VTX(get(Opcode)))
+     return true;
+   return IS_TEX(get(Opcode));
+ }
+
+ /// \returns true if \p MI's opcode uses the texture cache, or if it is a
+ /// vertex-cache opcode inside a function with a compute calling convention.
+ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
+   const MachineFunction *MF = MI.getParent()->getParent();
+   const unsigned Opc = MI.getOpcode();
+   if (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
+       usesVertexCache(Opc))
+     return true;
+   return usesTextureCache(Opc);
+ }
+
+ /// \returns true if \p Opcode is KILLGT or GROUP_BARRIER.
+ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
+   return Opcode == AMDGPU::KILLGT || Opcode == AMDGPU::GROUP_BARRIER;
+ }
+
+ /// \returns true if \p MI has a use operand for register AR_X.
+ bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
+   const int UseIdx = MI.findRegisterUseOperandIdx(AMDGPU::AR_X);
+   return UseIdx != -1;
+ }
+
+ /// \returns true if \p MI has a def operand for register AR_X.
+ bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
+   const int DefIdx = MI.findRegisterDefOperandIdx(AMDGPU::AR_X);
+   return DefIdx != -1;
+ }
+
+ /// \returns true if \p MI is an ALU instruction with at least one use of a
+ /// physical register belonging to the R600_LDS_SRC_REG register class.
+ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
+   if (!isALUInstr(MI.getOpcode()))
+     return false;
+
+   MachineInstr::const_mop_iterator Op = MI.operands_begin();
+   const MachineInstr::const_mop_iterator OpEnd = MI.operands_end();
+   for (; Op != OpEnd; ++Op) {
+     // Only physical-register uses are of interest.
+     const bool IsPhysRegUse =
+         Op->isReg() && Op->isUse() &&
+         !TargetRegisterInfo::isVirtualRegister(Op->getReg());
+     if (IsPhysRegUse &&
+         AMDGPU::R600_LDS_SRC_REGRegClass.contains(Op->getReg()))
+       return true;
+   }
+   return false;
+ }
+
+ /// Map the operand index of a source operand to the operand index of its
+ /// matching "sel" operand.
+ ///
+ /// \p SrcIdx is compared against the index of each known source operand name
+ /// of \p Opcode; on the first match the index of the paired *_sel operand is
+ /// returned. \returns -1 if \p SrcIdx matches none of them.
+ int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
+   // Each row pairs a source operand name with its selector operand name.
+   static const unsigned SrcToSel[][2] = {
+     {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
+     {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
+     {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+     {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
+     {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
+     {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
+     {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
+     {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
+     {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
+     {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
+     {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}
+   };
+   const unsigned NumRows = sizeof(SrcToSel) / sizeof(SrcToSel[0]);
+   const int WantedIdx = (int)SrcIdx;
+
+   for (unsigned Row = 0; Row != NumRows; ++Row)
+     if (getOperandIdx(Opcode, SrcToSel[Row][0]) == WantedIdx)
+       return getOperandIdx(Opcode, SrcToSel[Row][1]);
+   return -1;
+ }
+
+ /// Collect the source operands of \p MI together with an associated value.
+ ///
+ /// For DOT_4 only the operands whose register is ALU_CONST are returned,
+ /// each paired with the immediate of its *_sel operand. For every other
+ /// opcode, src0..src2 are visited in order (stopping at the first one the
+ /// opcode lacks) and each is paired with:
+ ///  - the immediate of its *_sel operand if the register is ALU_CONST,
+ ///  - the immediate of the 'literal' operand if the register is
+ ///    ALU_LITERAL_X and that operand is an immediate,
+ ///  - 0 otherwise (a non-immediate literal is asserted to be a global).
+ SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+ R600InstrInfo::getSrcs(MachineInstr &MI) const {
+   SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs;
+   const unsigned Opc = MI.getOpcode();
+
+   if (Opc == AMDGPU::DOT_4) {
+     static const unsigned DotOps[8][2] = {
+       {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
+       {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
+       {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
+       {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
+       {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
+       {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
+       {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
+       {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W},
+     };
+
+     for (unsigned Row = 0; Row != 8; ++Row) {
+       MachineOperand &SrcOp = MI.getOperand(getOperandIdx(Opc, DotOps[Row][0]));
+       if (SrcOp.getReg() != AMDGPU::ALU_CONST)
+         continue;
+       MachineOperand &SelOp = MI.getOperand(getOperandIdx(Opc, DotOps[Row][1]));
+       Srcs.push_back(std::make_pair(&SrcOp, SelOp.getImm()));
+     }
+     return Srcs;
+   }
+
+   static const unsigned SrcOps[3][2] = {
+     {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
+     {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
+     {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+   };
+
+   for (unsigned Row = 0; Row != 3; ++Row) {
+     const int SrcIdx = getOperandIdx(Opc, SrcOps[Row][0]);
+     if (SrcIdx < 0)
+       break;
+
+     MachineOperand &SrcOp = MI.getOperand(SrcIdx);
+     const unsigned Reg = SrcOp.getReg();
+     int64_t Value = 0;
+     if (Reg == AMDGPU::ALU_CONST) {
+       MachineOperand &SelOp = MI.getOperand(getOperandIdx(Opc, SrcOps[Row][1]));
+       Value = SelOp.getImm();
+     } else if (Reg == AMDGPU::ALU_LITERAL_X) {
+       MachineOperand &Literal =
+           MI.getOperand(getOperandIdx(Opc, AMDGPU::OpName::literal));
+       if (Literal.isImm()) {
+         Value = Literal.getImm();
+       } else {
+         assert(Literal.isGlobal());
+       }
+     }
+     Srcs.push_back(std::make_pair(&SrcOp, Value));
+   }
+   return Srcs;
+ }
+
+ /// Summarise the sources of \p MI as (register index, channel) pairs for the
+ /// read-port checks.
+ ///
+ /// One entry is produced per source returned by getSrcs():
+ ///  - OQAP:                     (encoding index, 0)
+ ///  - a register present in PV: (255, 0), 255 marking a PS/PV register
+ ///  - encoding index > 127:     (-1, 0), and \p ConstCount is incremented
+ ///  - anything else:            (encoding index, hardware channel)
+ /// The result is then padded with (-1, 0) entries up to three elements.
+ /// \p ConstCount is reset to 0 on entry.
+ ///
+ /// Fix: the OQAP case now moves on to the next source after recording its
+ /// entry. Previously it fell through, so the same source was recorded a
+ /// second time (as a dummy via the "> 127" path), which shifted the remaining
+ /// sources out of their slot and counted OQAP as a constant.
+ std::vector<std::pair<int, unsigned>>
+ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
+                            const DenseMap<unsigned, unsigned> &PV,
+                            unsigned &ConstCount) const {
+   ConstCount = 0;
+   const std::pair<int, unsigned> DummyPair(-1, 0);
+   std::vector<std::pair<int, unsigned> > Result;
+   unsigned i = 0;
+   for (const auto &Src : getSrcs(MI)) {
+     ++i;
+     unsigned Reg = Src.first->getReg();
+     int Index = RI.getEncodingValue(Reg) & 0xff;
+     if (Reg == AMDGPU::OQAP) {
+       Result.push_back(std::make_pair(Index, 0U));
+       continue;
+     }
+     if (PV.find(Reg) != PV.end()) {
+       // 255 is used to tell that it is a PS/PV reg
+       Result.push_back(std::make_pair(255, 0U));
+       continue;
+     }
+     if (Index > 127) {
+       ConstCount++;
+       Result.push_back(DummyPair);
+       continue;
+     }
+     unsigned Chan = RI.getHWRegChan(Reg);
+     Result.push_back(std::make_pair(Index, Chan));
+   }
+   // Pad so that callers can always index the first three entries.
+   for (; i < 3; ++i)
+     Result.push_back(DummyPair);
+   return Result;
+ }
+
+ /// Return a copy of \p Src with its first three entries permuted according to
+ /// the bank swizzle \p Swz. If the first two entries are equal, the second
+ /// one's register index is set to -1 before permuting.
+ static std::vector<std::pair<int, unsigned> >
+ Swizzle(std::vector<std::pair<int, unsigned> > Src,
+         R600InstrInfo::BankSwizzle Swz) {
+   if (Src[0] == Src[1])
+     Src[1].first = -1;
+
+   switch (Swz) {
+   case R600InstrInfo::ALU_VEC_012_SCL_210:
+     // Identity.
+     break;
+   case R600InstrInfo::ALU_VEC_021_SCL_122:
+     std::swap(Src[1], Src[2]);
+     break;
+   case R600InstrInfo::ALU_VEC_102_SCL_221:
+     std::swap(Src[0], Src[1]);
+     break;
+   case R600InstrInfo::ALU_VEC_120_SCL_212:
+     // (a, b, c) -> (c, a, b)
+     std::swap(Src[1], Src[2]);
+     std::swap(Src[0], Src[1]);
+     break;
+   case R600InstrInfo::ALU_VEC_201:
+     // (a, b, c) -> (b, c, a)
+     std::swap(Src[0], Src[1]);
+     std::swap(Src[1], Src[2]);
+     break;
+   case R600InstrInfo::ALU_VEC_210:
+     std::swap(Src[0], Src[2]);
+     break;
+   }
+   return Src;
+ }
+
+ /// \returns the cycle assigned to operand number \p Op (0..2) under the
+ /// swizzle \p Swz. Only the four swizzles handled below are accepted; any
+ /// other value is unreachable.
+ static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+   static const unsigned Cycles210[3] = { 2, 1, 0};
+   static const unsigned Cycles122[3] = { 1, 2, 2};
+   static const unsigned Cycles212[3] = { 2, 1, 2};
+   static const unsigned Cycles221[3] = { 2, 2, 1};
+
+   switch (Swz) {
+   case R600InstrInfo::ALU_VEC_012_SCL_210:
+     return Cycles210[Op];
+   case R600InstrInfo::ALU_VEC_021_SCL_122:
+     return Cycles122[Op];
+   case R600InstrInfo::ALU_VEC_120_SCL_212:
+     return Cycles212[Op];
+   case R600InstrInfo::ALU_VEC_102_SCL_221:
+     return Cycles221[Op];
+   default:
+     llvm_unreachable("Wrong Swizzle for Trans Slot");
+   }
+ }
+
+ /// Count how many leading instructions of a group (described by their sources
+ /// in \p IGSrcs) can be packed together without violating the read port
+ /// limitations, given one swizzle per instruction in \p Swz.
+ ///
+ /// \returns the index of the first conflicting instruction, 0 if an OQAP read
+ /// uses a swizzle other than the two accepted for it, IGSrcs.size() - 1 if
+ /// the only conflict comes from the trans sources (\p TransSrcs with swizzle
+ /// \p TransSwz), and IGSrcs.size() when there is no conflict.
+ unsigned R600InstrInfo::isLegalUpTo(
+     const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+     const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+     const std::vector<std::pair<int, unsigned> > &TransSrcs,
+     R600InstrInfo::BankSwizzle TransSwz) const {
+   // Vector[channel][cycle] holds the register index read through that port,
+   // or -1 while the port is still free.
+   int Vector[4][3];
+   memset(Vector, -1, sizeof(Vector));
+
+   for (unsigned Inst = 0, NumInsts = IGSrcs.size(); Inst < NumInsts; Inst++) {
+     const std::vector<std::pair<int, unsigned> > Swizzled =
+         Swizzle(IGSrcs[Inst], Swz[Inst]);
+     for (unsigned Cycle = 0; Cycle < 3; Cycle++) {
+       const int RegIdx = Swizzled[Cycle].first;
+       const unsigned Chan = Swizzled[Cycle].second;
+       if (RegIdx < 0 || RegIdx == 255)
+         continue;
+       if (RegIdx == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
+         // The value from output queue A (denoted by register OQAP) can
+         // only be fetched during the first cycle.
+         const bool ReadsInFirstCycle =
+             Swz[Inst] == R600InstrInfo::ALU_VEC_012_SCL_210 ||
+             Swz[Inst] == R600InstrInfo::ALU_VEC_021_SCL_122;
+         if (!ReadsInFirstCycle)
+           return 0;
+         // OQAP does not count towards the normal read port restrictions
+         continue;
+       }
+       int &Port = Vector[Chan][Cycle];
+       if (Port < 0)
+         Port = RegIdx;
+       else if (Port != RegIdx)
+         return Inst;
+     }
+   }
+
+   // Now check Trans Alu
+   for (unsigned Op = 0, NumOps = TransSrcs.size(); Op < NumOps; ++Op) {
+     const std::pair<int, unsigned> &Src = TransSrcs[Op];
+     const unsigned Cycle = getTransSwizzle(TransSwz, Op);
+     if (Src.first < 0 || Src.first == 255)
+       continue;
+     int &Port = Vector[Src.second][Cycle];
+     if (Port < 0)
+       Port = Src.first;
+     else if (Port != Src.first)
+       return IGSrcs.size() - 1;
+   }
+   return IGSrcs.size();
+ }
+
+ /// Advance \p SwzCandidate to the next swizzle sequence in lexicographic
+ /// order, treating every position after \p Idx as irrelevant.
+ ///
+ /// Starting at \p Idx and walking towards the front, the first entry that is
+ /// not ALU_VEC_210 is stepped to its successor, and every entry after it is
+ /// reset to ALU_VEC_012_SCL_210.
+ /// \returns false (with all entries reset) if no such entry exists.
+ static bool
+ NextPossibleSolution(
+     std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+     unsigned Idx) {
+   assert(Idx < SwzCandidate.size());
+
+   // Find the right-most position at or before Idx that can still advance.
+   int Pos = Idx;
+   for (; Pos >= 0; --Pos)
+     if (SwzCandidate[Pos] != R600InstrInfo::ALU_VEC_210)
+       break;
+
+   // Everything to its right restarts from the first swizzle.
+   for (unsigned K = Pos + 1, NumSwz = SwzCandidate.size(); K < NumSwz; ++K)
+     SwzCandidate[K] = R600InstrInfo::ALU_VEC_012_SCL_210;
+
+   if (Pos < 0)
+     return false;
+
+   const int Successor = SwzCandidate[Pos] + 1;
+   SwzCandidate[Pos] = (R600InstrInfo::BankSwizzle)Successor;
+   return true;
+ }
+
+ /// Search the swizzle sequences, starting from the current contents of
+ /// \p SwzCandidate, for one that satisfies every read port requirement.
+ /// \returns true and leaves the sequence in \p SwzCandidate on success;
+ /// false once the enumeration is exhausted.
+ bool R600InstrInfo::FindSwizzleForVectorSlot(
+     const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+     std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+     const std::vector<std::pair<int, unsigned> > &TransSrcs,
+     R600InstrInfo::BankSwizzle TransSwz) const {
+   for (;;) {
+     const unsigned LegalPrefix =
+         isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz);
+     if (LegalPrefix == IGSrcs.size())
+       return true;
+     // Skip every sequence sharing the prefix that is already known to fail.
+     if (!NextPossibleSolution(SwzCandidate, LegalPrefix))
+       return false;
+   }
+ }
+
+ /// Check the constant-read restrictions of the Trans slot: an instruction
+ /// there can't read a gpr at cycle 0 if it also reads a const, can't read a
+ /// gpr at cycle 1 if it reads two consts, and can't read three consts at all.
+ /// \returns true if \p TransOps under \p TransSwz respects these rules for
+ /// \p ConstCount constants.
+ static bool
+ isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
+                   const std::vector<std::pair<int, unsigned> > &TransOps,
+                   unsigned ConstCount) {
+   // TransALU can't read 3 constants
+   if (ConstCount > 2)
+     return false;
+
+   for (unsigned Op = 0, NumOps = TransOps.size(); Op != NumOps; ++Op) {
+     const unsigned Cycle = getTransSwizzle(TransSwz, Op);
+     // Entries with a negative register index do not occupy a gpr port.
+     if (TransOps[Op].first < 0)
+       continue;
+     const bool BlockedAtCycle0 = ConstCount > 0 && Cycle == 0;
+     const bool BlockedAtCycle1 = ConstCount > 1 && Cycle == 1;
+     if (BlockedAtCycle0 || BlockedAtCycle1)
+       return false;
+   }
+   return true;
+ }
+
+ /// Decide whether the instruction group \p IG can be issued together without
+ /// exceeding the register read port limitations.
+ ///
+ /// \param IG              instructions of the group, in slot order.
+ /// \param PV              registers for which ExtractSrcs() records a PS/PV
+ ///                        marker instead of a register index.
+ /// \param ValidSwizzle    cleared on entry, then seeded with each
+ ///                        instruction's current bank_swizzle operand; on
+ ///                        success it holds one legal swizzle per instruction.
+ /// \param isLastAluTrans  true if the last instruction of \p IG occupies the
+ ///                        Trans slot; \p IG must then be non-empty.
+ /// \returns true if a legal swizzle assignment was found.
+ ///
+ /// Fix: ConstCount is now initialised, so it is no longer read
+ /// uninitialised when \p IG is empty, and the non-empty precondition of the
+ /// Trans path is asserted before the last element is accessed.
+ bool
+ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
+                                        const DenseMap<unsigned, unsigned> &PV,
+                                        std::vector<BankSwizzle> &ValidSwizzle,
+                                        bool isLastAluTrans)
+     const {
+   // TODO: support shared src0 - src1 operand
+
+   std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
+   ValidSwizzle.clear();
+   // ExtractSrcs() resets this on every call, so after the loop it holds the
+   // constant count of the last instruction only.
+   unsigned ConstCount = 0;
+   BankSwizzle TransBS = ALU_VEC_012_SCL_210;
+   for (unsigned i = 0, e = IG.size(); i < e; ++i) {
+     IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
+     unsigned Op = getOperandIdx(IG[i]->getOpcode(),
+                                 AMDGPU::OpName::bank_swizzle);
+     ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
+         IG[i]->getOperand(Op).getImm());
+   }
+   std::vector<std::pair<int, unsigned> > TransOps;
+   if (!isLastAluTrans)
+     return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
+
+   // The last instruction goes to the Trans slot: split it off from the
+   // vector slots.
+   assert(!IGSrcs.empty() && "Trans slot requested for an empty group");
+   TransOps = std::move(IGSrcs.back());
+   IGSrcs.pop_back();
+   ValidSwizzle.pop_back();
+
+   static const R600InstrInfo::BankSwizzle TransSwz[] = {
+     ALU_VEC_012_SCL_210,
+     ALU_VEC_021_SCL_122,
+     ALU_VEC_120_SCL_212,
+     ALU_VEC_102_SCL_221
+   };
+   // Try each Trans swizzle in turn until the vector slots can be arranged
+   // around it.
+   for (unsigned i = 0; i < 4; i++) {
+     TransBS = TransSwz[i];
+     if (!isConstCompatible(TransBS, TransOps, ConstCount))
+       continue;
+     bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps,
+                                            TransBS);
+     if (Result) {
+       ValidSwizzle.push_back(TransBS);
+       return true;
+     }
+   }
+
+   return false;
+ }
+
+
+/// Check that the constants in \p Consts touch at most two distinct
+/// "half constants".
+///
+/// Each entry is reduced to a key by dropping bit 0, i.e. keeping the index
+/// bits (everything above bit 1) together with bit 1. The function returns
+/// false as soon as a third distinct key is seen and true otherwise.
+///
+/// The keys already seen are counted explicitly instead of using 0 as an
+/// "unused" marker, so a key that happens to be 0 occupies a slot like any
+/// other key.
+bool
+R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
+    const {
+  assert (Consts.size() <= 12 && "Too many operands in instructions group");
+  unsigned Pairs[2] = {0, 0};
+  unsigned NumPairs = 0;
+  for (unsigned Const : Consts) {
+    const unsigned ReadConstHalf = Const & 2;
+    const unsigned ReadConstIndex = Const & (~3);
+    const unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
+
+    bool AlreadyRead = false;
+    for (unsigned i = 0; i != NumPairs; ++i)
+      AlreadyRead |= (Pairs[i] == ReadHalfConst);
+    if (AlreadyRead)
+      continue;
+
+    // A third distinct key does not fit.
+    if (NumPairs == 2)
+      return false;
+    Pairs[NumPairs++] = ReadHalfConst;
+  }
+  return true;
+}
+
+/// Collect the literal and constant reads of every ALU instruction in \p MIs
+/// and check them against the read limits.
+///
+/// Returns false as soon as more than four distinct ALU_LITERAL_X values have
+/// been collected; otherwise the gathered constants (ALU_CONST selectors and
+/// KC0/KC1 registers encoded as (index << 2) | channel) are handed to the
+/// std::vector<unsigned> overload.
+bool
+R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
+    const {
+  std::vector<unsigned> Consts;
+  SmallSet<int64_t, 4> Literals;
+  for (MachineInstr *Entry : MIs) {
+    MachineInstr &MI = *Entry;
+    if (!isALUInstr(MI.getOpcode()))
+      continue;
+
+    for (const auto &Src : getSrcs(MI)) {
+      const unsigned Reg = Src.first->getReg();
+
+      if (Reg == AMDGPU::ALU_LITERAL_X)
+        Literals.insert(Src.second);
+      if (Literals.size() > 4)
+        return false;
+
+      if (Reg == AMDGPU::ALU_CONST)
+        Consts.push_back(Src.second);
+
+      const bool IsKCacheReg = AMDGPU::R600_KC0RegClass.contains(Reg) ||
+                               AMDGPU::R600_KC1RegClass.contains(Reg);
+      if (IsKCacheReg) {
+        const unsigned Index = RI.getEncodingValue(Reg) & 0xff;
+        const unsigned Chan = RI.getHWRegChan(Reg);
+        Consts.push_back((Index << 2) | Chan);
+      }
+    }
+  }
+  return fitsConstReadLimitations(Consts);
+}
+
+/// Create a DFA packetizer from the itinerary data of \p STI, which is
+/// treated as an R600Subtarget.
+DFAPacketizer *
+R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
+  const InstrItineraryData *Itineraries = STI.getInstrItineraryData();
+  const R600Subtarget &R600ST = static_cast<const R600Subtarget &>(STI);
+  return R600ST.createDFAPacketizer(Itineraries);
+}
+
+/// \returns true if \p Opcode is AMDGPU::PRED_X, the only opcode treated as a
+/// predicate setter here.
+static bool
+isPredicateSetter(unsigned Opcode) {
+  return Opcode == AMDGPU::PRED_X;
+}
+
+/// Walk backwards from \p I (exclusive) towards the start of \p MBB and return
+/// the first predicate setter met, or nullptr if there is none.
+static MachineInstr *
+findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I) {
+  for (; I != MBB.begin();) {
+    --I;
+    if (isPredicateSetter(I->getOpcode()))
+      return &*I;
+  }
+  return nullptr;
+}
+
+/// \returns true for the AMDGPU::JUMP and AMDGPU::JUMP_COND opcodes.
+static
+bool isJump(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::JUMP:
+  case AMDGPU::JUMP_COND:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// \returns true for the AMDGPU::BRANCH, BRANCH_COND_i32 and BRANCH_COND_f32
+/// opcodes.
+static bool isBranch(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::BRANCH:
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, // block whose trailing JUMP/JUMP_COND are analyzed
+ MachineBasicBlock *&TBB, // out: target of the (conditional) jump
+ MachineBasicBlock *&FBB, // out: target of the trailing JUMP in the JUMP_COND + JUMP case
+ SmallVectorImpl<MachineOperand> &Cond, // out: predicate setter operands 1 and 2, then PRED_SEL_ONE
+ bool AllowModify) const { // when set, redundant trailing JUMPs are unlinked from the block
+ // Most of the following comes from the ARM implementation of AnalyzeBranch. Returns false when the terminators were understood, true otherwise.
+
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return false;
+
+ // AMDGPU::BRANCH* instructions are only available after isel and are not
+ // handled
+ if (isBranch(I->getOpcode()))
+ return true;
+ if (!isJump(I->getOpcode())) {
+ return false; // last instruction is not a jump: reported as analyzable with nothing filled in
+ }
+
+ // Remove successive JUMP
+ while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) {
+ MachineBasicBlock::iterator PriorI = std::prev(I);
+ if (AllowModify)
+ I->removeFromParent(); // unlinks only; NOTE(review): the removed instruction is not deleted here
+ I = PriorI;
+ }
+ MachineInstr &LastInst = *I;
+
+ // If there is only one terminator instruction, process it.
+ unsigned LastOpc = LastInst.getOpcode();
+ if (I == MBB.begin() || !isJump((--I)->getOpcode())) { // side effect: unless at begin(), I now points before LastInst
+ if (LastOpc == AMDGPU::JUMP) {
+ TBB = LastInst.getOperand(0).getMBB();
+ return false;
+ } else if (LastOpc == AMDGPU::JUMP_COND) {
+ auto predSet = I;
+ while (!isPredicateSetter(predSet->getOpcode())) { // NOTE(review): no begin() check; assumes a predicate setter precedes the jump
+ predSet = --I;
+ }
+ TBB = LastInst.getOperand(0).getMBB();
+ Cond.push_back(predSet->getOperand(1));
+ Cond.push_back(predSet->getOperand(2));
+ Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+ return false;
+ }
+ return true; // Can't handle indirect branch.
+ }
+
+ // Get the instruction before it if it is a terminator.
+ MachineInstr &SecondLastInst = *I; // I was already stepped back by the condition above
+ unsigned SecondLastOpc = SecondLastInst.getOpcode();
+
+ // If the block ends with a B and a Bcc, handle it.
+ if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
+ auto predSet = --I;
+ while (!isPredicateSetter(predSet->getOpcode())) { // NOTE(review): same unchecked backwards walk as above
+ predSet = --I;
+ }
+ TBB = SecondLastInst.getOperand(0).getMBB();
+ FBB = LastInst.getOperand(0).getMBB();
+ Cond.push_back(predSet->getOperand(1));
+ Cond.push_back(predSet->getOperand(2));
+ Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+/// Scan \p MBB from its end and return an iterator to the last CF_ALU or
+/// CF_ALU_PUSH_BEFORE instruction, or MBB.end() if the block has none.
+static
+MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
+  MachineBasicBlock::reverse_iterator RIt = MBB.rbegin();
+  const MachineBasicBlock::reverse_iterator REnd = MBB.rend();
+  while (RIt != REnd) {
+    const unsigned Opc = RIt->getOpcode();
+    if (Opc == AMDGPU::CF_ALU || Opc == AMDGPU::CF_ALU_PUSH_BEFORE)
+      return RIt.getReverse();
+    ++RIt;
+  }
+  return MBB.end();
+}
+
+/// Append branch instructions to the end of \p MBB.
+///
+/// With no \p FBB and an empty \p Cond a single JUMP to \p TBB is emitted.
+/// Otherwise the closest preceding predicate setter gets MO_FLAG_PUSH and its
+/// operand 2 is overwritten with Cond[1]; a JUMP_COND to \p TBB follows, then
+/// a JUMP to \p FBB if one was given, and the last CF_ALU of the block (if
+/// any) is turned into CF_ALU_PUSH_BEFORE.
+///
+/// \returns the number of jump instructions inserted (1 or 2).
+unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *TBB,
+                                     MachineBasicBlock *FBB,
+                                     ArrayRef<MachineOperand> Cond,
+                                     const DebugLoc &DL,
+                                     int *BytesAdded) const {
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert(!BytesAdded && "code size not handled");
+
+  // Plain unconditional jump.
+  if (!FBB && Cond.empty()) {
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional jump: reprogram the predicate setter that feeds it.
+  MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
+  assert(PredSet && "No previous predicate !");
+  addFlag(*PredSet, 0, MO_FLAG_PUSH);
+  PredSet->getOperand(2).setImm(Cond[1].getImm());
+
+  BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+      .addMBB(TBB)
+      .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+
+  unsigned NumInserted = 1;
+  if (FBB) {
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
+    NumInserted = 2;
+  }
+
+  MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+  if (CfAlu != MBB.end()) {
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+  }
+  return NumInserted;
+}
+
+/// Remove up to two trailing JUMP / JUMP_COND instructions from \p MBB.
+///
+/// For a JUMP_COND, MO_FLAG_PUSH is cleared on the closest preceding predicate
+/// setter and the last ALU clause marker of the block (if any) is switched
+/// from CF_ALU_PUSH_BEFORE back to CF_ALU.
+///
+/// \returns the number of instructions removed (0, 1 or 2).
+unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                     int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  // Note : we leave PRED* instructions there.
+  // They may be needed when predicating instructions.
+
+  // Erase the last instruction of the block if it is a jump. Returns whether
+  // an instruction was erased.
+  auto EraseTrailingJump = [&]() -> bool {
+    MachineBasicBlock::iterator Last = MBB.end();
+    if (Last == MBB.begin())
+      return false;
+    --Last;
+
+    const unsigned Opc = Last->getOpcode();
+    if (Opc == AMDGPU::JUMP) {
+      Last->eraseFromParent();
+      return true;
+    }
+    if (Opc != AMDGPU::JUMP_COND)
+      return false;
+
+    MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, Last);
+    clearFlag(*PredSet, 0, MO_FLAG_PUSH);
+    Last->eraseFromParent();
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu != MBB.end()) {
+      assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
+      CfAlu->setDesc(get(AMDGPU::CF_ALU));
+    }
+    return true;
+  };
+
+  if (!EraseTrailingJump())
+    return 0;
+  // FIXME (carried over from the original): the second pass handles the same
+  // opcodes as the first one.
+  if (!EraseTrailingJump())
+    return 1;
+  return 2;
+}
+
+/// \returns true if \p MI has a predicate operand whose register is
+/// PRED_SEL_ONE, PRED_SEL_ZERO or PREDICATE_BIT.
+bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
+  const int PredIdx = MI.findFirstPredOperandIdx();
+  if (PredIdx < 0)
+    return false;
+
+  const unsigned PredReg = MI.getOperand(PredIdx).getReg();
+  return PredReg == AMDGPU::PRED_SEL_ONE ||
+         PredReg == AMDGPU::PRED_SEL_ZERO ||
+         PredReg == AMDGPU::PREDICATE_BIT;
+}
+
+/// Decide whether \p MI may be predicated.
+///
+/// KILLGT and vector instructions are never predicable. A CF_ALU is predicable
+/// only when it is the first instruction of its block and its operands 3 and
+/// 4 are both zero. Everything else defers to AMDGPUInstrInfo.
+bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+
+  // XXX: KILL* instructions can be predicated, but they must be the last
+  // instruction in a clause, so this means any instructions after them cannot
+  // be predicated. Until we have proper support for instruction clauses in the
+  // backend, we will mark KILL* instructions as unpredicable.
+  if (Opc == AMDGPU::KILLGT)
+    return false;
+
+  if (Opc == AMDGPU::CF_ALU) {
+    // A clause starting in the middle of the MBB means the MBB holds more
+    // than one clause; several clauses cannot be predicated.
+    if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI))
+      return false;
+    // TODO: We don't support KC merging atm
+    return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0;
+  }
+
+  if (isVector(MI))
+    return false;
+
+  return AMDGPUInstrInfo::isPredicable(MI);
+}
+
+
+/// If-conversion of a single block is always reported as profitable; none of
+/// the arguments are inspected.
+bool R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
+                                        unsigned NumCyles,
+                                        unsigned ExtraPredCycles,
+                                        BranchProbability Probability) const {
+  return true;
+}
+
+/// If-conversion of a true/false block pair is always reported as profitable;
+/// none of the arguments are inspected.
+bool R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                        unsigned NumTCycles,
+                                        unsigned ExtraTCycles,
+                                        MachineBasicBlock &FMBB,
+                                        unsigned NumFCycles,
+                                        unsigned ExtraFCycles,
+                                        BranchProbability Probability) const {
+  return true;
+}
+
+/// Duplicating a block for if-conversion is always reported as profitable;
+/// none of the arguments are inspected.
+bool R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+                                              unsigned NumCyles,
+                                              BranchProbability Probability)
+    const {
+  return true;
+}
+
+/// Unpredication is never reported as profitable; the blocks are not
+/// inspected.
+bool R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                              MachineBasicBlock &FMBB) const {
+  return false;
+}
+
+
+/// Invert the branch condition stored in \p Cond in place.
+///
+/// Cond[1] holds a predicate-set opcode (SETE/SETNE, integer or not) which is
+/// swapped for its opposite; Cond[2] holds PRED_SEL_ZERO or PRED_SEL_ONE which
+/// is swapped as well.
+///
+/// \returns false on success and true if either element holds a value that is
+/// not handled. Cond[1] has already been rewritten when Cond[2] turns out to
+/// be unhandled.
+bool
+R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  MachineOperand &PredOpcode = Cond[1];
+  int64_t Inverted;
+  switch (PredOpcode.getImm()) {
+  case AMDGPU::PRED_SETE_INT:
+    Inverted = AMDGPU::PRED_SETNE_INT;
+    break;
+  case AMDGPU::PRED_SETNE_INT:
+    Inverted = AMDGPU::PRED_SETE_INT;
+    break;
+  case AMDGPU::PRED_SETE:
+    Inverted = AMDGPU::PRED_SETNE;
+    break;
+  case AMDGPU::PRED_SETNE:
+    Inverted = AMDGPU::PRED_SETE;
+    break;
+  default:
+    return true;
+  }
+  PredOpcode.setImm(Inverted);
+
+  MachineOperand &PredSel = Cond[2];
+  const unsigned SelReg = PredSel.getReg();
+  if (SelReg == AMDGPU::PRED_SEL_ZERO)
+    PredSel.setReg(AMDGPU::PRED_SEL_ONE);
+  else if (SelReg == AMDGPU::PRED_SEL_ONE)
+    PredSel.setReg(AMDGPU::PRED_SEL_ZERO);
+  else
+    return true;
+
+  return false;
+}
+
+/// \returns true if the opcode of \p MI is a predicate setter. \p Pred is left
+/// untouched.
+bool R600InstrInfo::DefinesPredicate(MachineInstr &MI,
+                                     std::vector<MachineOperand> &Pred) const {
+  const unsigned Opc = MI.getOpcode();
+  return isPredicateSetter(Opc);
+}
+
+
+/// Predicate \p MI with the register found in Pred[2].
+///
+/// - CF_ALU: operand 8 is set to 0.
+/// - DOT_4: all four per-lane pred_sel operands are set and an implicit use of
+///   PREDICATE_BIT is added.
+/// - any other instruction with a predicate operand: that operand is set and
+///   an implicit use of PREDICATE_BIT is added.
+///
+/// \returns true if the instruction was modified, false if it has no
+/// predicate operand.
+bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
+                                         ArrayRef<MachineOperand> Pred) const {
+  const unsigned Opc = MI.getOpcode();
+
+  if (Opc == AMDGPU::CF_ALU) {
+    MI.getOperand(8).setImm(0);
+    return true;
+  }
+
+  // Appends the implicit PREDICATE_BIT use shared by the remaining cases.
+  auto AddImplicitPredicateUse = [&MI]() {
+    MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+  };
+
+  if (Opc == AMDGPU::DOT_4) {
+    static const unsigned PredSelNames[] = {
+      AMDGPU::OpName::pred_sel_X,
+      AMDGPU::OpName::pred_sel_Y,
+      AMDGPU::OpName::pred_sel_Z,
+      AMDGPU::OpName::pred_sel_W
+    };
+    for (unsigned Name : PredSelNames)
+      MI.getOperand(getOperandIdx(MI, Name)).setReg(Pred[2].getReg());
+    AddImplicitPredicateUse();
+    return true;
+  }
+
+  const int PIdx = MI.findFirstPredOperandIdx();
+  if (PIdx == -1)
+    return false;
+
+  MI.getOperand(PIdx).setReg(Pred[2].getReg());
+  AddImplicitPredicateUse();
+  return true;
+}
+
+/// The predication cost is a fixed 2 for every instruction.
+unsigned int R600InstrInfo::getPredicationCost(const MachineInstr &) const {
+  const unsigned int FixedCost = 2;
+  return FixedCost;
+}
+
+/// Report a fixed latency of 2 for every instruction. When \p PredCost is
+/// non-null it is set to 2 as well. The itinerary data is not consulted.
+unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                            const MachineInstr &,
+                                            unsigned *PredCost) const {
+  const unsigned int FixedLatency = 2;
+  if (PredCost != nullptr)
+    *PredCost = FixedLatency;
+  return FixedLatency;
+}
+
+/// Map a (register index, channel) pair to an indirect address. Only channel
+/// 0 is accepted (asserted); the address is the register index itself.
+unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
+                                                 unsigned Channel) const {
+  assert(Channel == 0);
+  const unsigned Address = RegIndex;
+  return Address;
+}
+
+/// Expand the pseudo instruction \p MI after register allocation.
+///
+/// - R600_EXTRACT_ELT_V2/V4 become an indirect read.
+/// - R600_INSERT_ELT_V2/V4 become an indirect write.
+/// - Any other opcode is expanded only if it is a register load or register
+///   store: a plain MOV when the offset register is INDIRECT_BASE_ADDR, an
+///   indirect read/write otherwise.
+///
+/// \returns true if \p MI was expanded (and erased), false if it is none of
+/// the above.
+bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+
+  if (Opc == AMDGPU::R600_EXTRACT_ELT_V2 ||
+      Opc == AMDGPU::R600_EXTRACT_ELT_V4) {
+    const unsigned VecReg = MI.getOperand(1).getReg();
+    buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(),
+                      RI.getHWRegIndex(VecReg),   // Address
+                      MI.getOperand(2).getReg(),  // Offset
+                      RI.getHWRegChan(VecReg));   // Channel
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (Opc == AMDGPU::R600_INSERT_ELT_V2 ||
+      Opc == AMDGPU::R600_INSERT_ELT_V4) {
+    const unsigned VecReg = MI.getOperand(1).getReg();
+    buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value
+                       RI.getHWRegIndex(VecReg),                      // Address
+                       MI.getOperand(3).getReg(),                     // Offset
+                       RI.getHWRegChan(VecReg));                      // Channel
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Everything else: only register loads and stores are expanded.
+  MachineBasicBlock *MBB = MI.getParent();
+  const bool IsLoad = isRegisterLoad(MI);
+  if (!IsLoad && !isRegisterStore(MI))
+    return false;
+
+  const int OffsetOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
+  // addr is a custom operand with multiple MI operands, and only the
+  // first MI operand is given a name.
+  const int RegOpIdx = OffsetOpIdx + 1;
+  const int ChanOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::chan);
+
+  const unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
+  const unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
+  const unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+  const unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+  const bool IsDirect = OffsetReg == AMDGPU::INDIRECT_BASE_ADDR;
+
+  if (IsLoad) {
+    const int DstOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dst);
+    const unsigned DstReg = MI.getOperand(DstOpIdx).getReg();
+    if (IsDirect)
+      buildMovInstr(MBB, MI, DstReg,
+                    getIndirectAddrRegClass()->getRegister(Address));
+    else
+      buildIndirectRead(MBB, MI, DstReg, Address, OffsetReg);
+  } else {
+    const int ValOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::val);
+    const unsigned ValReg = MI.getOperand(ValOpIdx).getReg();
+    if (IsDirect)
+      buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
+                    ValReg);
+    else
+      buildIndirectWrite(MBB, MI, ValReg, Address, OffsetReg);
+  }
+
+  MBB->erase(MI);
+  return true;
+}
+
+/// Mark the registers used for indirect addressing in \p MF as reserved.
+///
+/// For every index between getIndirectIndexBegin() and getIndirectIndexEnd()
+/// (inclusive) the 128-bit register with that index is reserved, together
+/// with its first StackWidth 32-bit channels. Nothing is reserved when
+/// getIndirectIndexEnd() returns -1.
+void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
+                                             const MachineFunction &MF) const {
+  const R600Subtarget &Subtarget = MF.getSubtarget<R600Subtarget>();
+  const R600FrameLowering *FrameLowering = Subtarget.getFrameLowering();
+
+  const unsigned StackWidth = FrameLowering->getStackWidth(MF);
+  const int LastIndex = getIndirectIndexEnd(MF);
+  if (LastIndex == -1)
+    return;
+
+  int Index = getIndirectIndexBegin(MF);
+  while (Index <= LastIndex) {
+    Reserved.set(AMDGPU::R600_Reg128RegClass.getRegister(Index));
+    for (unsigned Chan = 0; Chan != StackWidth; ++Chan)
+      Reserved.set(
+          AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan));
+    ++Index;
+  }
+}
+
+/// \returns the register class used for indirect addressing,
+/// R600_TReg32_X.
+const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
+  const TargetRegisterClass *IndirectRC = &AMDGPU::R600_TReg32_XRegClass;
+  return IndirectRC;
+}
+
+/// Convenience overload of buildIndirectWrite() that uses address channel 0.
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg) const {
+  const unsigned DefaultAddrChan = 0;
+  return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg,
+                            DefaultAddrChan);
+}
+
+/// Emit, before \p I, the two instructions of an indirect register write:
+/// a MOVA_INT_eg that loads AR_X from \p OffsetReg (with its write operand
+/// cleared) followed by a MOV of \p ValueReg into the address register
+/// selected by \p Address and \p AddrChan, with dst_rel set and an implicit
+/// killed use of AR_X.
+///
+/// \p AddrChan must be in [0, 3]. \returns the builder of the MOV.
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg,
+                                       unsigned AddrChan) const {
+  // Pick the address register class that matches the channel.
+  const TargetRegisterClass *AddrRC = nullptr;
+  switch (AddrChan) {
+  case 0: AddrRC = &AMDGPU::R600_AddrRegClass; break;
+  case 1: AddrRC = &AMDGPU::R600_Addr_YRegClass; break;
+  case 2: AddrRC = &AMDGPU::R600_Addr_ZRegClass; break;
+  case 3: AddrRC = &AMDGPU::R600_Addr_WRegClass; break;
+  default: llvm_unreachable("Invalid Channel");
+  }
+  const unsigned AddrReg = AddrRC->getRegister(Address);
+
+  MachineInstr *LoadAR = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
+                                                 AMDGPU::AR_X, OffsetReg);
+  setImmOperand(*LoadAR, AMDGPU::OpName::write, 0);
+
+  MachineInstrBuilder Store =
+      buildDefaultInstruction(*MBB, I, AMDGPU::MOV, AddrReg, ValueReg)
+          .addReg(AMDGPU::AR_X, RegState::Implicit | RegState::Kill);
+  setImmOperand(*Store, AMDGPU::OpName::dst_rel, 1);
+  return Store;
+}
+
+/// Convenience overload of buildIndirectRead() that uses address channel 0.
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg) const {
+  const unsigned DefaultAddrChan = 0;
+  return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg,
+                           DefaultAddrChan);
+}
+
+/// Emit, before \p I, the two instructions of an indirect register read:
+/// a MOVA_INT_eg that loads AR_X from \p OffsetReg (with its write operand
+/// cleared) followed by a MOV into \p ValueReg from the address register
+/// selected by \p Address and \p AddrChan, with src0_rel set and an implicit
+/// killed use of AR_X.
+///
+/// \p AddrChan must be in [0, 3]. \returns the builder of the MOV.
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg,
+                                       unsigned AddrChan) const {
+  // Pick the address register class that matches the channel.
+  const TargetRegisterClass *AddrRC = nullptr;
+  switch (AddrChan) {
+  case 0: AddrRC = &AMDGPU::R600_AddrRegClass; break;
+  case 1: AddrRC = &AMDGPU::R600_Addr_YRegClass; break;
+  case 2: AddrRC = &AMDGPU::R600_Addr_ZRegClass; break;
+  case 3: AddrRC = &AMDGPU::R600_Addr_WRegClass; break;
+  default: llvm_unreachable("Invalid Channel");
+  }
+  const unsigned AddrReg = AddrRC->getRegister(Address);
+
+  MachineInstr *LoadAR = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
+                                                 AMDGPU::AR_X, OffsetReg);
+  setImmOperand(*LoadAR, AMDGPU::OpName::write, 0);
+
+  MachineInstrBuilder Load =
+      buildDefaultInstruction(*MBB, I, AMDGPU::MOV, ValueReg, AddrReg)
+          .addReg(AMDGPU::AR_X, RegState::Implicit | RegState::Kill);
+  setImmOperand(*Load, AMDGPU::OpName::src0_rel, 1);
+
+  return Load;
+}
+
+/// Compute the first register index available for indirect addressing in
+/// \p MF.
+///
+/// \returns -1 if the function has no frame objects, 0 if it has no live-in
+/// registers, and otherwise one past the highest position, within the
+/// indirect address register class, of any physical live-in register that
+/// belongs to that class.
+int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (MFI.getNumObjects() == 0)
+    return -1;
+  if (MRI.livein_empty())
+    return 0;
+
+  int HighestLiveIn = -1;
+  const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
+  const MachineRegisterInfo::livein_iterator LiveInEnd = MRI.livein_end();
+  for (MachineRegisterInfo::livein_iterator LiveIn = MRI.livein_begin();
+       LiveIn != LiveInEnd; ++LiveIn) {
+    const unsigned Reg = LiveIn->first;
+    const bool IsCandidate = !TargetRegisterInfo::isVirtualRegister(Reg) &&
+                             IndirectRC->contains(Reg);
+    if (!IsCandidate)
+      continue;
+
+    // Linear search for the position of Reg inside the class.
+    const unsigned NumRegs = IndirectRC->getNumRegs();
+    unsigned Position = 0;
+    while (Position != NumRegs && IndirectRC->getRegister(Position) != Reg)
+      ++Position;
+
+    HighestLiveIn = std::max(HighestLiveIn, (int)Position);
+  }
+
+  return HighestLiveIn + 1;
+}
+
+/// Compute the last register index used for indirect addressing in \p MF.
+///
+/// \returns -1 if the function has variable sized objects (not supported) or
+/// no frame objects at all; otherwise getIndirectIndexBegin() plus the value
+/// the frame lowering reports for frame index -1.
+int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Variable sized objects are not supported, and without any frame object
+  // there is nothing to address.
+  if (MFI.hasVarSizedObjects() || MFI.getNumObjects() == 0)
+    return -1;
+
+  const R600FrameLowering *FrameLowering =
+      MF.getSubtarget<R600Subtarget>().getFrameLowering();
+
+  unsigned IgnoredFrameReg;
+  const int Offset =
+      FrameLowering->getFrameIndexReference(MF, -1, IgnoredFrameReg);
+
+  return getIndirectIndexBegin(MF) + Offset;
+}
+
+/// \returns the maximum number of ALU instructions per clause, fixed at 115.
+unsigned R600InstrInfo::getMaxAlusPerClause() const {
+  const unsigned MaxAlus = 115;
+  return MaxAlus;
+}
+
+/// Insert, before \p I, an instruction \p Opcode defining \p DstReg with every
+/// modifier operand at its default value.
+///
+/// A zero \p Src1Reg selects the one-source layout; otherwise the two-source
+/// layout is used, which additionally carries $update_exec_mask,
+/// $update_predicate and the $src1 operand group.
+/// \returns the builder of the new instruction.
+MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB,
+                                                  MachineBasicBlock::iterator I,
+                                                  unsigned Opcode,
+                                                  unsigned DstReg,
+                                                  unsigned Src0Reg,
+                                                  unsigned Src1Reg) const {
+  const bool HasSrc1 = Src1Reg != 0;
+
+  MachineInstrBuilder Builder =
+      BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), DstReg); // $dst
+
+  // Adds one source register followed by its neg/rel/abs/sel modifiers.
+  auto AddSource = [&Builder](unsigned Reg) {
+    Builder.addReg(Reg) // $srcN
+        .addImm(0)      // $srcN_neg
+        .addImm(0)      // $srcN_rel
+        .addImm(0)      // $srcN_abs
+        .addImm(-1);    // $srcN_sel
+  };
+
+  if (HasSrc1) {
+    Builder.addImm(0); // $update_exec_mask
+    Builder.addImm(0); // $update_predicate
+  }
+  Builder.addImm(1); // $write
+  Builder.addImm(0); // $omod
+  Builder.addImm(0); // $dst_rel
+  Builder.addImm(0); // $dst_clamp
+
+  AddSource(Src0Reg);
+  if (HasSrc1)
+    AddSource(Src1Reg);
+
+  //XXX: The r600g finalizer expects this to be 1, once we've moved the
+  //scheduling to the backend, we can change the default to 0.
+  Builder.addImm(1);                    // $last
+  Builder.addReg(AMDGPU::PRED_SEL_OFF); // $pred_sel
+  Builder.addImm(0);                    // $literal
+  Builder.addImm(0);                    // $bank_swizzle
+
+  return Builder;
+}
+
+// Expands to a switch case that maps the operand name Label to its per-lane
+// variant (Label_X, Label_Y, Label_Z or Label_W) chosen by the local variable
+// Slot.
+#define OPERAND_CASE(Label) \
+  case Label: { \
+    static const unsigned Ops[] = \
+    { \
+      Label##_X, \
+      Label##_Y, \
+      Label##_Z, \
+      Label##_W \
+    }; \
+    return Ops[Slot]; \
+  }
+
+/// Translate the operand name \p Op into the name of the same operand for
+/// lane \p Slot (0 = X, 1 = Y, 2 = Z, 3 = W).
+///
+/// \p Slot must be below 4; this is asserted because it indexes a four-entry
+/// table. An \p Op outside the list below is unreachable.
+static unsigned getSlotedOps(unsigned Op, unsigned Slot) {
+  assert(Slot < 4 && "Slot must select one of the X/Y/Z/W lanes");
+  switch (Op) {
+  OPERAND_CASE(AMDGPU::OpName::update_exec_mask)
+  OPERAND_CASE(AMDGPU::OpName::update_pred)
+  OPERAND_CASE(AMDGPU::OpName::write)
+  OPERAND_CASE(AMDGPU::OpName::omod)
+  OPERAND_CASE(AMDGPU::OpName::dst_rel)
+  OPERAND_CASE(AMDGPU::OpName::clamp)
+  OPERAND_CASE(AMDGPU::OpName::src0)
+  OPERAND_CASE(AMDGPU::OpName::src0_neg)
+  OPERAND_CASE(AMDGPU::OpName::src0_rel)
+  OPERAND_CASE(AMDGPU::OpName::src0_abs)
+  OPERAND_CASE(AMDGPU::OpName::src0_sel)
+  OPERAND_CASE(AMDGPU::OpName::src1)
+  OPERAND_CASE(AMDGPU::OpName::src1_neg)
+  OPERAND_CASE(AMDGPU::OpName::src1_rel)
+  OPERAND_CASE(AMDGPU::OpName::src1_abs)
+  OPERAND_CASE(AMDGPU::OpName::src1_sel)
+  OPERAND_CASE(AMDGPU::OpName::pred_sel)
+  default:
+    llvm_unreachable("Wrong Operand");
+  }
+}
+
+#undef OPERAND_CASE
+
+/// Build, before the DOT_4 instruction \p MI, the scalar DOT4 instruction for
+/// lane \p Slot writing \p DstReg.
+///
+/// The opcode is DOT4_r600 up to the R700 generation and DOT4_eg afterwards.
+/// The lane's source registers, pred_sel register and immediate modifiers are
+/// copied from \p MI; operand 20 of the new instruction is then set to 0.
+/// Only DOT_4 is supported (asserted).
+MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
+    MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
+    const {
+  assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
+  const unsigned SlotOpcode = ST.getGeneration() <= R600Subtarget::R700
+                                  ? AMDGPU::DOT4_r600
+                                  : AMDGPU::DOT4_eg;
+
+  // Fetches the operand of MI that corresponds to Name for this lane.
+  auto LaneOperand = [&](unsigned Name) -> MachineOperand & {
+    return MI->getOperand(
+        getOperandIdx(MI->getOpcode(), getSlotedOps(Name, Slot)));
+  };
+
+  MachineBasicBlock::iterator InsertPt = MI;
+  MachineOperand &LaneSrc0 = LaneOperand(AMDGPU::OpName::src0);
+  MachineOperand &LaneSrc1 = LaneOperand(AMDGPU::OpName::src1);
+  MachineInstr *SlotMI = buildDefaultInstruction(
+      MBB, InsertPt, SlotOpcode, DstReg, LaneSrc0.getReg(), LaneSrc1.getReg());
+
+  // Immediate operands that are copied lane by lane.
+  static const unsigned ImmOperandNames[] = {
+    AMDGPU::OpName::update_exec_mask,
+    AMDGPU::OpName::update_pred,
+    AMDGPU::OpName::write,
+    AMDGPU::OpName::omod,
+    AMDGPU::OpName::dst_rel,
+    AMDGPU::OpName::clamp,
+    AMDGPU::OpName::src0_neg,
+    AMDGPU::OpName::src0_rel,
+    AMDGPU::OpName::src0_abs,
+    AMDGPU::OpName::src0_sel,
+    AMDGPU::OpName::src1_neg,
+    AMDGPU::OpName::src1_rel,
+    AMDGPU::OpName::src1_abs,
+    AMDGPU::OpName::src1_sel,
+  };
+
+  MachineOperand &LanePredSel = LaneOperand(AMDGPU::OpName::pred_sel);
+  SlotMI->getOperand(getOperandIdx(SlotOpcode, AMDGPU::OpName::pred_sel))
+      .setReg(LanePredSel.getReg());
+
+  for (unsigned Name : ImmOperandNames) {
+    MachineOperand &LaneImm = LaneOperand(Name);
+    assert (LaneImm.isImm());
+    setImmOperand(*SlotMI, Name, LaneImm.getImm());
+  }
+  SlotMI->getOperand(20).setImm(0);
+  return SlotMI;
+}
+
+/// Insert, before \p I, a MOV of the literal \p Imm into \p DstReg. The MOV
+/// reads ALU_LITERAL_X and carries \p Imm in its literal operand.
+MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned DstReg,
+                                         uint64_t Imm) const {
+  MachineInstr *LiteralMov = buildDefaultInstruction(
+      BB, I, AMDGPU::MOV, DstReg, AMDGPU::ALU_LITERAL_X);
+  setImmOperand(*LiteralMov, AMDGPU::OpName::literal, Imm);
+  return LiteralMov;
+}
+
+/// Insert, before \p I in \p MBB, a default MOV from \p SrcReg to \p DstReg.
+MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+                                           MachineBasicBlock::iterator I,
+                                           unsigned DstReg, unsigned SrcReg) const {
+  MachineBasicBlock &Block = *MBB;
+  return buildDefaultInstruction(Block, I, AMDGPU::MOV, DstReg, SrcReg);
+}
+
+/// \returns the index of the named operand \p Op in \p MI, as reported for
+/// the instruction's opcode.
+int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
+  const unsigned Opcode = MI.getOpcode();
+  return getOperandIdx(Opcode, Op);
+}
+
+/// \returns the index of the named operand \p Op for \p Opcode; this forwards
+/// to AMDGPU::getNamedOperandIdx().
+int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const {
+  const int Idx = AMDGPU::getNamedOperandIdx(Opcode, Op);
+  return Idx;
+}
+
+/// Set the named immediate operand \p Op of \p MI to \p Imm. The operand must
+/// exist for the instruction and be an immediate (both asserted).
+void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op,
+                                  int64_t Imm) const {
+  const int Idx = getOperandIdx(MI, Op);
+  assert(Idx != -1 && "Operand not supported for this instruction.");
+  MachineOperand &Target = MI.getOperand(Idx);
+  assert(Target.isImm());
+  Target.setImm(Imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction flag getters/setters
+//===----------------------------------------------------------------------===//
+
+/// Return the immediate operand of \p MI that stores \p Flag.
+///
+/// With a non-zero \p Flag the instruction must use native operands
+/// (asserted) and the flag is looked up as a named operand; \p SrcIdx selects
+/// the source for the per-source flags MO_FLAG_NEG (sources 0-2) and
+/// MO_FLAG_ABS (sources 0-1). An unsupported flag or source index trips the
+/// "Flag not supported" assertion. With \p Flag == 0 the operand index
+/// encoded in the instruction's TSFlags is used instead.
+MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
+                                         unsigned Flag) const {
+  unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
+  int FlagIndex = 0;
+  if (Flag != 0) {
+    // If we pass something other than the default value of Flag to this
+    // function, it means we want to set a flag on an instruction
+    // that uses native encoding.
+    assert(HAS_NATIVE_OPERANDS(TargetFlags));
+    bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
+    switch (Flag) {
+    case MO_FLAG_CLAMP:
+      FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp);
+      break;
+    case MO_FLAG_MASK:
+      FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write);
+      break;
+    case MO_FLAG_NOT_LAST:
+    case MO_FLAG_LAST:
+      FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last);
+      break;
+    case MO_FLAG_NEG:
+      switch (SrcIdx) {
+      case 0:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg);
+        break;
+      case 1:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg);
+        break;
+      case 2:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg);
+        break;
+      default:
+        // No such source: make the assertion below fire instead of silently
+        // falling back to operand 0.
+        FlagIndex = -1;
+        break;
+      }
+      break;
+
+    case MO_FLAG_ABS:
+      assert(!IsOP3 && "Cannot set absolute value modifier for OP3 "
+                       "instructions.");
+      (void)IsOP3;
+      switch (SrcIdx) {
+      case 0:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs);
+        break;
+      case 1:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs);
+        break;
+      default:
+        // Same as above: an unsupported source must not map to operand 0.
+        FlagIndex = -1;
+        break;
+      }
+      break;
+
+    default:
+      FlagIndex = -1;
+      break;
+    }
+    assert(FlagIndex != -1 && "Flag not supported for this instruction");
+  } else {
+    FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags);
+    assert(FlagIndex != 0 &&
+           "Instruction flags not supported for this instruction");
+  }
+
+  MachineOperand &FlagOp = MI.getOperand(FlagIndex);
+  assert(FlagOp.isImm());
+  return FlagOp;
+}
+
+/// Set \p Flag on \p MI for source/operand slot \p Operand. A zero \p Flag is
+/// a no-op.
+///
+/// For instructions with native operands the flag's own immediate operand is
+/// written: MO_FLAG_NOT_LAST clears MO_FLAG_LAST, MO_FLAG_MASK clears its
+/// operand, every other flag sets its operand to 1. For the remaining
+/// instructions the flag is OR-ed into the packed flag operand, shifted by
+/// NUM_MO_FLAGS * Operand.
+void R600InstrInfo::addFlag(MachineInstr &MI, unsigned Operand,
+                            unsigned Flag) const {
+  if (Flag == 0)
+    return;
+
+  const unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
+  if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
+    MachineOperand &PackedFlags = getFlagOp(MI, Operand);
+    PackedFlags.setImm(PackedFlags.getImm() |
+                       (Flag << (NUM_MO_FLAGS * Operand)));
+    return;
+  }
+
+  MachineOperand &NativeFlagOp = getFlagOp(MI, Operand, Flag);
+  switch (Flag) {
+  case MO_FLAG_NOT_LAST:
+    clearFlag(MI, Operand, MO_FLAG_LAST);
+    break;
+  case MO_FLAG_MASK:
+    clearFlag(MI, Operand, Flag);
+    break;
+  default:
+    NativeFlagOp.setImm(1);
+    break;
+  }
+}
+
+/// Clear \p Flag on \p MI for source/operand slot \p Operand.
+///
+/// For instructions with native operands the flag's immediate operand is set
+/// to 0; otherwise the flag's bits (shifted by NUM_MO_FLAGS * Operand) are
+/// masked out of the packed flag operand.
+void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand,
+                              unsigned Flag) const {
+  const unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
+  if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
+    MachineOperand &PackedFlags = getFlagOp(MI);
+    unsigned Remaining = PackedFlags.getImm();
+    Remaining &= ~(Flag << (NUM_MO_FLAGS * Operand));
+    PackedFlags.setImm(Remaining);
+    return;
+  }
+
+  getFlagOp(MI, Operand, Flag).setImm(0);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
new file mode 100644
index 000000000000..a280052dbd4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -0,0 +1,331 @@
+//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600InstrInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "R600RegisterInfo.h"
+
+namespace llvm {
+
+namespace R600InstrFlags {
+// Bit masks tested against an instruction's TSFlags by
+// R600InstrInfo::isRegisterStore() and R600InstrInfo::isRegisterLoad().
+enum : uint64_t {
+ REGISTER_STORE = UINT64_C(1) << 62,
+ REGISTER_LOAD = UINT64_C(1) << 63
+};
+}
+
+class AMDGPUTargetMachine;
+class DFAPacketizer;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+class R600Subtarget;
+
+class R600InstrInfo final : public AMDGPUInstrInfo {
+private:
+ const R600RegisterInfo RI;
+ const R600Subtarget &ST;
+
+ std::vector<std::pair<int, unsigned>>
+ ExtractSrcs(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PV,
+ unsigned &ConstCount) const;
+
+ MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const;
+
+ MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const;
+public:
+ /// Bank swizzle selectors, numbered consecutively starting at 0. The
+ /// enumerator order is the order that the fitsReadPortLimitations()
+ /// documentation refers to (VEC_012 < VEC_021 < ... < VEC_210).
+ enum BankSwizzle {
+ ALU_VEC_012_SCL_210 = 0,
+ ALU_VEC_021_SCL_122,
+ ALU_VEC_120_SCL_212,
+ ALU_VEC_102_SCL_221,
+ ALU_VEC_201,
+ ALU_VEC_210
+ };
+
+ explicit R600InstrInfo(const R600Subtarget &);
+
+ /// \returns the R600RegisterInfo instance held as a member of this object.
+ const R600RegisterInfo &getRegisterInfo() const {
+ return RI;
+ }
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const override;
+
+ bool isReductionOp(unsigned opcode) const;
+ bool isCubeOp(unsigned opcode) const;
+
+ /// \returns true if this \p Opcode represents an ALU instruction.
+ bool isALUInstr(unsigned Opcode) const;
+ bool hasInstrModifiers(unsigned Opcode) const;
+ bool isLDSInstr(unsigned Opcode) const;
+ bool isLDSRetInstr(unsigned Opcode) const;
+
+ /// \returns true if this \p Opcode represents an ALU instruction or an
+ /// instruction that will be lowered in ExpandSpecialInstrs Pass.
+ bool canBeConsideredALU(const MachineInstr &MI) const;
+
+ bool isTransOnly(unsigned Opcode) const;
+ bool isTransOnly(const MachineInstr &MI) const;
+ bool isVectorOnly(unsigned Opcode) const;
+ bool isVectorOnly(const MachineInstr &MI) const;
+ bool isExport(unsigned Opcode) const;
+
+ bool usesVertexCache(unsigned Opcode) const;
+ bool usesVertexCache(const MachineInstr &MI) const;
+ bool usesTextureCache(unsigned Opcode) const;
+ bool usesTextureCache(const MachineInstr &MI) const;
+
+ bool mustBeLastInClause(unsigned Opcode) const;
+ bool usesAddressRegister(MachineInstr &MI) const;
+ bool definesAddressRegister(MachineInstr &MI) const;
+ bool readsLDSSrcReg(const MachineInstr &MI) const;
+
+ /// \returns The operand Index for the Sel operand given an index to one
+ /// of the instruction's src operands.
+ int getSelIdx(unsigned Opcode, unsigned SrcIdx) const;
+
+ /// \returns a pair for each src of an ALU instruction.
+ /// The first member of a pair is the register id.
+ /// If register is ALU_CONST, second member is SEL.
+ /// If register is ALU_LITERAL, second member is IMM.
+ /// Otherwise, second member value is undefined.
+ SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+ getSrcs(MachineInstr &MI) const;
+
+ unsigned isLegalUpTo(
+ const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+ const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ R600InstrInfo::BankSwizzle TransSwz) const;
+
+ bool FindSwizzleForVectorSlot(
+ const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+ const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ R600InstrInfo::BankSwizzle TransSwz) const;
+
+ /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
+ /// returns true and the first (in lexical order) BankSwizzle affectation
+ /// starting from the one already provided in the Instruction Group MIs that
+ /// fits Read Port limitations in BS if available. Otherwise returns false
+ /// and undefined content in BS.
+ /// isLastAluTrans should be set if the last Alu of MIs will be executed on
+ /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to
+ /// apply to the last instruction.
+ /// PV holds GPR to PV registers in the Instruction Group MIs.
+ bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs,
+ const DenseMap<unsigned, unsigned> &PV,
+ std::vector<BankSwizzle> &BS,
+ bool isLastAluTrans) const;
+
+ /// An instruction group can only access 2 channel pairs (either [XY] or [ZW])
+ /// from KCache bank on R700+. This function checks whether the MI set given
+ /// as input meets this limitation.
+ bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const;
+ /// Same but using const index set instead of MI set.
+ bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
+
+ /// \brief Vector instructions are instructions that must fill all
+ /// instruction slots within an instruction group.
+ bool isVector(const MachineInstr &MI) const;
+
+ bool isMov(unsigned Opcode) const;
+
+ DFAPacketizer *
+ CreateTargetScheduleState(const TargetSubtargetInfo &) const override;
+
+ bool reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+  /// Override of the inherited removeBranch() hook.
+  ///
+  /// \param BytesRemoved optional pointer, defaults to nullptr.
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+
+ bool isPredicated(const MachineInstr &MI) const override;
+
+ bool isPredicable(MachineInstr &MI) const override;
+
+  /// Override of the inherited isProfitableToDupForIfCvt() hook.
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                                 BranchProbability Probability) const override;
+
+  /// Override of the inherited single-block isProfitableToIfCvt() hook.
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           unsigned ExtraPredCycles,
+                           BranchProbability Probability) const override;
+
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumTCycles, unsigned ExtraTCycles,
+ MachineBasicBlock &FMBB,
+ unsigned NumFCycles, unsigned ExtraFCycles,
+ BranchProbability Probability) const override;
+
+ bool DefinesPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred) const override;
+
+ bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+ MachineBasicBlock &FMBB) const override;
+
+ bool PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const override;
+
+ unsigned int getPredicationCost(const MachineInstr &) const override;
+
+ unsigned int getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &MI,
+ unsigned *PredCost = nullptr) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ /// \brief Reserve the registers that may be accessed using indirect addressing.
+ void reserveIndirectRegisters(BitVector &Reserved,
+ const MachineFunction &MF) const;
+
+ /// Calculate the "Indirect Address" for the given \p RegIndex and
+ /// \p Channel
+ ///
+ /// We model indirect addressing using a virtual address space that can be
+ /// accessed with loads and stores. The "Indirect Address" is the memory
+ /// address in this virtual address space that maps to the given \p RegIndex
+ /// and \p Channel.
+ unsigned calculateIndirectAddress(unsigned RegIndex, unsigned Channel) const;
+
+
+ /// \returns The register class to be used for loading and storing values
+ /// from an "Indirect Address" .
+ const TargetRegisterClass *getIndirectAddrRegClass() const;
+
+ /// \returns the smallest register index that will be accessed by an indirect
+ /// read or write or -1 if indirect addressing is not used by this program.
+ int getIndirectIndexBegin(const MachineFunction &MF) const;
+
+ /// \returns the largest register index that will be accessed by an indirect
+ /// read or write or -1 if indirect addressing is not used by this program.
+ int getIndirectIndexEnd(const MachineFunction &MF) const;
+
+ /// \brief Build instruction(s) for an indirect register write.
+ ///
+ /// \returns The instruction that performs the indirect register write
+ MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg) const;
+
+ /// \brief Build instruction(s) for an indirect register read.
+ ///
+ /// \returns The instruction that performs the indirect register read
+ MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg) const;
+
+ unsigned getMaxAlusPerClause() const;
+
+ /// buildDefaultInstruction - This function returns a MachineInstr with all
+ /// the instruction modifiers initialized to their default values. You can
+ /// use this function to avoid manually specifying each instruction modifier
+ /// operand when building a new instruction.
+ ///
+ /// \returns a MachineInstr with all the instruction modifiers initialized
+ /// to their default values.
+ MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned Opcode,
+ unsigned DstReg,
+ unsigned Src0Reg,
+ unsigned Src1Reg = 0) const;
+
+ MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
+ MachineInstr *MI,
+ unsigned Slot,
+ unsigned DstReg) const;
+
+ MachineInstr *buildMovImm(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg,
+ uint64_t Imm) const;
+
+ MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg, unsigned SrcReg) const;
+
+ /// \brief Get the index of Op in the MachineInstr.
+ ///
+ /// \returns -1 if the Instruction does not contain the specified \p Op.
+ int getOperandIdx(const MachineInstr &MI, unsigned Op) const;
+
+ /// \brief Get the index of \p Op for the given Opcode.
+ ///
+ /// \returns -1 if the Instruction does not contain the specified \p Op.
+ int getOperandIdx(unsigned Opcode, unsigned Op) const;
+
+ /// \brief Helper function for setting instruction flag values.
+ void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const;
+
+ ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
+ void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+ ///\brief Determine if the specified \p Flag is set on this \p Operand.
+ bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+ /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2)
+ /// \param Flag The flag being set.
+ ///
+ /// \returns the operand containing the flags for this instruction.
+ MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0,
+ unsigned Flag = 0) const;
+
+ /// \brief Clear the specified flag on the instruction.
+ void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+ // Helper functions that check the opcode for status information
+  /// \returns true when the TSFlags of \p MI's opcode have the
+  /// R600InstrFlags::REGISTER_STORE bit set.
+  bool isRegisterStore(const MachineInstr &MI) const {
+    const auto OpcodeFlags = get(MI.getOpcode()).TSFlags;
+    return (OpcodeFlags & R600InstrFlags::REGISTER_STORE) != 0;
+  }
+
+  /// \returns true when the TSFlags of \p MI's opcode have the
+  /// R600InstrFlags::REGISTER_LOAD bit set.
+  bool isRegisterLoad(const MachineInstr &MI) const {
+    const auto OpcodeFlags = get(MI.getOpcode()).TSFlags;
+    return (OpcodeFlags & R600InstrFlags::REGISTER_LOAD) != 0;
+  }
+};
+
+namespace AMDGPU {
+
+int getLDSNoRetOp(uint16_t Opcode);
+
+} //End namespace AMDGPU
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
new file mode 100644
index 000000000000..3a72e0791fd6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -0,0 +1,1722 @@
+//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are available on R600 family
+// GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+include "R600Intrinsics.td"
+include "R600InstrFormats.td"
+
+// Convenience class: an InstR600 that always uses the NullALU itinerary and
+// lives in the "AMDGPU" namespace. \p pattern defaults to an empty list.
+class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> :
+ InstR600 <outs, ins, asm, pattern, NullALU> {
+
+ let Namespace = "AMDGPU";
+}
+
+def MEMxi : Operand<iPTR> {
+ let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index);
+ let PrintMethod = "printMemOperand";
+}
+
+def MEMrr : Operand<iPTR> {
+ let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index);
+}
+
+// Operands for non-registers
+
+// An i32 operand with a default value (\p Default, 0 unless overridden) that
+// is printed with the method named by \p PM ("printOperand" by default).
+class InstFlag<string PM = "printOperand", int Default = 0>
+ : OperandWithDefaultOps <i32, (ops (i32 Default))> {
+ let PrintMethod = PM;
+}
+
+// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers
+def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
+ let PrintMethod = "printSel";
+}
+def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> {
+ let PrintMethod = "printBankSwizzle";
+}
+
+def LITERAL : InstFlag<"printLiteral">;
+
+def WRITE : InstFlag <"printWrite", 1>;
+def OMOD : InstFlag <"printOMOD">;
+def REL : InstFlag <"printRel">;
+def CLAMP : InstFlag <"printClamp">;
+def NEG : InstFlag <"printNeg">;
+def ABS : InstFlag <"printAbs">;
+def UEM : InstFlag <"printUpdateExecMask">;
+def UP : InstFlag <"printUpdatePred">;
+
+// XXX: The r600g finalizer in Mesa expects last to be one in most cases.
+// Once we start using the packetizer in this backend we should have this
+// default to 0.
+def LAST : InstFlag<"printLast", 1>;
+def RSel : Operand<i32> {
+ let PrintMethod = "printRSel";
+}
+def CT: Operand<i32> {
+ let PrintMethod = "printCT";
+}
+
+def FRAMEri : Operand<iPTR> {
+ let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index);
+}
+
+def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
+def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
+def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
+def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
+def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
+
+
+def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
+ (ops PRED_SEL_OFF)>;
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+
+// Class for instructions with only one source register.
+// If you add new ins to this instruction, make sure they are listed before
+// $literal, because the backend currently assumes that the last operand is
+// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in
+// R600Defines.h, R600InstrInfo::buildDefaultInstruction(),
+// and R600InstrInfo::getOperandIdx().
+class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
+ InstrItinClass itin = AnyALU> :
+ InstR600 <(outs R600_Reg32:$dst),
+ (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+ R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+ BANK_SWIZZLE:$bank_swizzle),
+ !strconcat(" ", opName,
+ "$clamp $last $dst$write$dst_rel$omod, "
+ "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+ "$pred_sel $bank_swizzle"),
+ pattern,
+ itin>,
+ R600ALU_Word0,
+ R600ALU_Word1_OP2 <inst> {
+
+ let src1 = 0;
+ let src1_rel = 0;
+ let src1_neg = 0;
+ let src1_abs = 0;
+ let update_exec_mask = 0;
+ let update_pred = 0;
+ let HasNativeOperands = 1;
+ let Op1 = 1;
+ let ALUInst = 1;
+ let DisableEncoding = "$literal";
+ let UseNamedOperandTable = 1;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+}
+
+class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+ InstrItinClass itin = AnyALU> :
+ R600_1OP <inst, opName,
+ [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin
+>;
+
+// If you add or change the operands for R600_2OP instructions, you must
+// also update the R600Op2OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
+class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
+ InstrItinClass itin = AnyALU> :
+ InstR600 <(outs R600_Reg32:$dst),
+ (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
+ OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+ R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
+ R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel,
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+ BANK_SWIZZLE:$bank_swizzle),
+ !strconcat(" ", opName,
+ "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
+ "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+ "$src1_neg$src1_abs$src1$src1_abs$src1_rel, "
+ "$pred_sel $bank_swizzle"),
+ pattern,
+ itin>,
+ R600ALU_Word0,
+ R600ALU_Word1_OP2 <inst> {
+
+ let HasNativeOperands = 1;
+ let Op2 = 1;
+ let ALUInst = 1;
+ let DisableEncoding = "$literal";
+ let UseNamedOperandTable = 1;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+}
+
+class R600_2OP_Helper <bits<11> inst, string opName,
+ SDPatternOperator node = null_frag,
+ InstrItinClass itin = AnyALU> :
+ R600_2OP <inst, opName,
+ [(set R600_Reg32:$dst, (node R600_Reg32:$src0,
+ R600_Reg32:$src1))], itin
+>;
+
+// If you add or change the operands for R600_3OP instructions, you must
+// also update the R600Op3OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and
+// R600InstrInfo::getOperandIdx().
+class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
+ InstrItinClass itin = AnyALU> :
+ InstR600 <(outs R600_Reg32:$dst),
+ (ins REL:$dst_rel, CLAMP:$clamp,
+ R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
+ R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
+ R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+ BANK_SWIZZLE:$bank_swizzle),
+ !strconcat(" ", opName, "$clamp $last $dst$dst_rel, "
+ "$src0_neg$src0$src0_rel, "
+ "$src1_neg$src1$src1_rel, "
+ "$src2_neg$src2$src2_rel, "
+ "$pred_sel"
+ "$bank_swizzle"),
+ pattern,
+ itin>,
+ R600ALU_Word0,
+ R600ALU_Word1_OP3<inst>{
+
+ let HasNativeOperands = 1;
+ let DisableEncoding = "$literal";
+ let Op3 = 1;
+ let UseNamedOperandTable = 1;
+ let ALUInst = 1;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+}
+
+class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin = VecALU> :
+ InstR600 <(outs R600_Reg32:$dst),
+ ins,
+ asm,
+ pattern,
+ itin>;
+
+
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
+
+// Matches an immediate whose value, read as uint32_t, is 6, 7, 8 or 13.
+def TEX_SHADOW : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return (TType >= 6 && TType <= 8) || TType == 13;
+ }]
+>;
+
+// Matches an immediate whose value, read as uint32_t, is 5.
+def TEX_RECT : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 5;
+ }]
+>;
+
+// Matches an immediate whose value, read as uint32_t, is 9, 10 or 16.
+def TEX_ARRAY : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 9 || TType == 10 || TType == 16;
+ }]
+>;
+
+// Matches an immediate whose value, read as uint32_t, is 11, 12 or 17.
+def TEX_SHADOW_ARRAY : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 11 || TType == 12 || TType == 17;
+ }]
+>;
+
+class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
+ dag outs, dag ins, string asm, list<dag> pattern> :
+ InstR600ISA <outs, ins, asm, pattern>,
+ CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF {
+
+ let rat_id = ratid;
+ let rat_inst = ratinst;
+ let rim = 0;
+ // XXX: Have a separate instruction for non-indexed writes.
+ let type = 1;
+ let rw_rel = 0;
+ let elem_size = 0;
+
+ let array_size = 0;
+ let comp_mask = mask;
+ let burst_count = 0;
+ let vpm = 0;
+ let cf_inst = cfinst;
+ let mark = 0;
+ let barrier = 1;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+ let IsExport = 1;
+
+}
+
+class VTX_READ <string name, dag outs, list<dag> pattern>
+ : InstR600ISA <outs, (ins MEMxi:$src_gpr, i8imm:$buffer_id), !strconcat(" ", name, ", #$buffer_id"), pattern>,
+ VTX_WORD1_GPR {
+
+ // Static fields
+ let DST_REL = 0;
+ // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL,
+ // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored,
+ // however, based on my testing if USE_CONST_FIELDS is set, then all
+ // these fields need to be set to 0.
+ let USE_CONST_FIELDS = 0;
+ let NUM_FORMAT_ALL = 1;
+ let FORMAT_COMP_ALL = 0;
+ let SRF_MODE_ALL = 0;
+
+ let Inst{63-32} = Word1;
+ // LLVM can only encode 64-bit instructions, so these fields are manually
+ // encoded in R600CodeEmitter
+ //
+ // bits<16> OFFSET;
+ // bits<2> ENDIAN_SWAP = 0;
+ // bits<1> CONST_BUF_NO_STRIDE = 0;
+ // bits<1> MEGA_FETCH = 0;
+ // bits<1> ALT_CONST = 0;
+ // bits<2> BUFFER_INDEX_MODE = 0;
+
+ // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+ // is done in R600CodeEmitter)
+ //
+ // Inst{79-64} = OFFSET;
+ // Inst{81-80} = ENDIAN_SWAP;
+ // Inst{82} = CONST_BUF_NO_STRIDE;
+ // Inst{83} = MEGA_FETCH;
+ // Inst{84} = ALT_CONST;
+ // Inst{86-85} = BUFFER_INDEX_MODE;
+ // Inst{95-87} = 0; Reserved
+
+ // VTX_WORD3 (Padding)
+ //
+ // Inst{127-96} = 0;
+
+ let VTXInst = 1;
+}
+
+class LoadParamFrag <PatFrag load_type> : PatFrag <
+ (ops node:$ptr), (load_type node:$ptr),
+ [{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
+ (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
+>;
+
+def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>;
+def vtx_id3_az_extloadi16 : LoadParamFrag<az_extloadi16>;
+def vtx_id3_load : LoadParamFrag<load>;
+
+class LoadVtxId1 <PatFrag load> : PatFrag <
+ (ops node:$ptr), (load node:$ptr), [{
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !isa<GlobalValue>(GetUnderlyingObject(
+ LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
+}]>;
+
+def vtx_id1_az_extloadi8 : LoadVtxId1 <az_extloadi8>;
+def vtx_id1_az_extloadi16 : LoadVtxId1 <az_extloadi16>;
+def vtx_id1_load : LoadVtxId1 <load>;
+
+class LoadVtxId2 <PatFrag load> : PatFrag <
+ (ops node:$ptr), (load node:$ptr), [{
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ isa<GlobalValue>(GetUnderlyingObject(
+ LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
+}]>;
+
+def vtx_id2_az_extloadi8 : LoadVtxId2 <az_extloadi8>;
+def vtx_id2_az_extloadi16 : LoadVtxId2 <az_extloadi16>;
+def vtx_id2_load : LoadVtxId2 <load>;
+
+def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">;
+
+def isR600toCayman
+ : Predicate<
+ "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">;
+
+//===----------------------------------------------------------------------===//
+// R600 SDNodes
+//===----------------------------------------------------------------------===//
+
+// Shader instruction with two results: $dst0 (R600_TReg32_X) and $dst1
+// (R600_TReg32_Y). Both results are referenced as operands in the asm string
+// so that the second one is printed as a register, not as literal text.
+def INTERP_PAIR_XY : AMDGPUShaderInst <
+  (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
+  (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
+  "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 $dst1",
+  []>;
+
+// Shader instruction with two results: $dst0 (R600_TReg32_Z) and $dst1
+// (R600_TReg32_W). Both results are referenced as operands in the asm string
+// so that the second one is printed as a register, not as literal text.
+def INTERP_PAIR_ZW : AMDGPUShaderInst <
+  (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1),
+  (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
+  "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 $dst1",
+  []>;
+
+def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
+ SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
+ [SDNPVariadic]
+>;
+
+def DOT4 : SDNode<"AMDGPUISD::DOT4",
+ SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>,
+ SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>,
+ SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>,
+ []
+>;
+
+def COS_HW : SDNode<"AMDGPUISD::COS_HW",
+ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>
+>;
+
+def SIN_HW : SDNode<"AMDGPUISD::SIN_HW",
+ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>
+>;
+
+def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>;
+
+def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>;
+
+multiclass TexPattern<bits<32> TextureOp, Instruction inst, ValueType vt = v4f32> {
+def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
+ (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw),
+ (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz),
+ (i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z),
+ (i32 imm:$DST_SEL_W),
+ (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID),
+ (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z),
+ (i32 imm:$COORD_TYPE_W)),
+ (inst R600_Reg128:$SRC_GPR,
+ imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw,
+ imm:$offsetx, imm:$offsety, imm:$offsetz,
+ imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z,
+ imm:$DST_SEL_W,
+ imm:$RESOURCE_ID, imm:$SAMPLER_ID,
+ imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z,
+ imm:$COORD_TYPE_W)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Interpolation Instructions
+//===----------------------------------------------------------------------===//
+
+def INTERP_VEC_LOAD : AMDGPUShaderInst <
+ (outs R600_Reg128:$dst),
+ (ins i32imm:$src0),
+ "INTERP_LOAD $src0 : $dst">;
+
+def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
+ let bank_swizzle = 5;
+}
+
+def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> {
+ let bank_swizzle = 5;
+}
+
+def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
+
+//===----------------------------------------------------------------------===//
+// Export Instructions
+//===----------------------------------------------------------------------===//
+
+// First 32-bit word of an export instruction. Packs arraybase, type, gpr and
+// elem_size; the RW_REL and INDEX_GPR bit ranges are hard-wired to 0.
+class ExportWord0 {
+ field bits<32> Word0;
+
+ bits<13> arraybase;
+ bits<2> type;
+ bits<7> gpr;
+ bits<2> elem_size;
+
+ let Word0{12-0} = arraybase;
+ let Word0{14-13} = type;
+ let Word0{21-15} = gpr;
+ let Word0{22} = 0; // RW_REL
+ let Word0{29-23} = 0; // INDEX_GPR
+ let Word0{31-30} = elem_size;
+}
+
+// Second 32-bit word of a swizzled export. Only the four 3-bit swizzle
+// selectors are placed here (bits 11-0); eop and inst are declared but are
+// not assigned to any Word1 bits by this class.
+class ExportSwzWord1 {
+ field bits<32> Word1;
+
+ bits<3> sw_x;
+ bits<3> sw_y;
+ bits<3> sw_z;
+ bits<3> sw_w;
+ bits<1> eop;
+ bits<8> inst;
+
+ let Word1{2-0} = sw_x;
+ let Word1{5-3} = sw_y;
+ let Word1{8-6} = sw_z;
+ let Word1{11-9} = sw_w;
+}
+
+// Second 32-bit word of a buffer export. Only arraySize (bits 11-0) and
+// compMask (bits 15-12) are placed here; eop and inst are declared but are
+// not assigned to any Word1 bits by this class.
+class ExportBufWord1 {
+ field bits<32> Word1;
+
+ bits<12> arraySize;
+ bits<4> compMask;
+ bits<1> eop;
+ bits<8> inst;
+
+ let Word1{11-0} = arraySize;
+ let Word1{15-12} = compMask;
+}
+
+multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
+ def : Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
+ (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
+ (ExportInst R600_Reg128:$src, imm:$type, imm:$base,
+ imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0)
+ >;
+
+}
+
+multiclass SteamOutputExportPattern<Instruction ExportInst,
+ bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
+// Stream0
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+ (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
+ (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+ 4095, imm:$mask, buf0inst, 0)>;
+// Stream1
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+ (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
+ (ExportInst $src, 0, imm:$arraybase,
+ 4095, imm:$mask, buf1inst, 0)>;
+// Stream2
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+ (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
+ (ExportInst $src, 0, imm:$arraybase,
+ 4095, imm:$mask, buf2inst, 0)>;
+// Stream3
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+ (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
+ (ExportInst $src, 0, imm:$arraybase,
+ 4095, imm:$mask, buf3inst, 0)>;
+}
+
+// Export Instructions should not be duplicated by TailDuplication pass
+// (which assumes that duplicable instructions are affected by exec mask)
+let usesCustomInserter = 1, isNotDuplicable = 1 in {
+
+class ExportSwzInst : InstR600ISA<(
+ outs),
+ (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
+ RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst,
+ i32imm:$eop),
+ !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"),
+ []>, ExportWord0, ExportSwzWord1 {
+ let elem_size = 3;
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+ let IsExport = 1;
+}
+
+} // End usesCustomInserter = 1, isNotDuplicable = 1
+
+class ExportBufInst : InstR600ISA<(
+ outs),
+ (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
+ i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop),
+ !strconcat("EXPORT", " $gpr"),
+ []>, ExportWord0, ExportBufWord1 {
+ let elem_size = 0;
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+ let IsExport = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions
+//===----------------------------------------------------------------------===//
+
+
+def KCACHE : InstFlag<"printKCache">;
+
+class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs),
+(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
+KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
+i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
+i32imm:$COUNT, i32imm:$Enabled),
+!strconcat(OpName, " $COUNT, @$ADDR, "
+"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"),
+[] >, CF_ALU_WORD0, CF_ALU_WORD1 {
+ field bits<64> Inst;
+
+ let CF_INST = inst;
+ let ALT_CONST = 0;
+ let WHOLE_QUAD_MODE = 0;
+ let BARRIER = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+}
+
+// Word0 of an R600 CF instruction: the whole 32-bit word is the ADDR field.
+class CF_WORD0_R600 {
+ field bits<32> Word0;
+
+ bits<32> ADDR;
+
+ let Word0 = ADDR;
+}
+
+class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 {
+ field bits<64> Inst;
+ bits<4> CNT;
+
+ let CF_INST = inst;
+ let BARRIER = 1;
+ let CF_CONST = 0;
+ let VALID_PIXEL_MODE = 0;
+ let COND = 0;
+ let COUNT = CNT{2-0};
+ let CALL_COUNT = 0;
+ let COUNT_3 = CNT{3};
+ let END_OF_PROGRAM = 0;
+ let WHOLE_QUAD_MODE = 0;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+}
+
+class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
+ field bits<64> Inst;
+
+ let CF_INST = inst;
+ let BARRIER = 1;
+ let JUMPTABLE_SEL = 0;
+ let CF_CONST = 0;
+ let VALID_PIXEL_MODE = 0;
+ let COND = 0;
+ let END_OF_PROGRAM = 0;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+}
+
+def CF_ALU : ALU_CLAUSE<8, "ALU">;
+def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
+def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">;
+def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">;
+def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">;
+def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">;
+
+def FETCH_CLAUSE : AMDGPUInst <(outs),
+(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
+ field bits<8> Inst;
+ bits<8> num;
+ let Inst = num;
+ let isCodeGenOnly = 1;
+}
+
+def ALU_CLAUSE : AMDGPUInst <(outs),
+(ins i32imm:$addr), "ALU clause starting at $addr:", [] > {
+ field bits<8> Inst;
+ bits<8> num;
+ let Inst = num;
+ let isCodeGenOnly = 1;
+}
+
+// Codegen-only instruction carrying two 32-bit literal operands, printed as
+// "$literal1, $literal2". The 64-bit encoding packs $literal1 into bits 31-0
+// and $literal2 into bits 63-32.
+def LITERALS : AMDGPUInst <(outs),
+(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > {
+ let isCodeGenOnly = 1;
+
+ field bits<64> Inst;
+ bits<32> literal1;
+ bits<32> literal2;
+
+ let Inst{31-0} = literal1;
+ let Inst{63-32} = literal2;
+}
+
+// Operand-less instruction printed as "PAD". It declares a 64-bit Inst field
+// but assigns none of its bits here.
+def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > {
+ field bits<64> Inst;
+}
+
+let Predicates = [isR600toCayman] in {
+
+//===----------------------------------------------------------------------===//
+// Common Instructions R600, R700, Evergreen, Cayman
+//===----------------------------------------------------------------------===//
+
+def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
+// Non-IEEE MUL: 0 * anything = 0
+def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE">;
+def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
+// TODO: Do these actually match the regular fmin/fmax behavior?
+def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>;
+def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>;
+// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx
+// DX10 min/max returns the other operand if one is NaN,
+// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic
+def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>;
+def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>;
+
+// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
+// so some of the instruction names don't match the asm string.
+// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
+def SETE : R600_2OP <
+ 0x08, "SETE",
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))]
+>;
+
+def SGT : R600_2OP <
+ 0x09, "SETGT",
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))]
+>;
+
+def SGE : R600_2OP <
+ 0xA, "SETGE",
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))]
+>;
+
+def SNE : R600_2OP <
+ 0xB, "SETNE",
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))]
+>;
+
+def SETE_DX10 : R600_2OP <
+ 0xC, "SETE_DX10",
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))]
+>;
+
+def SETGT_DX10 : R600_2OP <
+ 0xD, "SETGT_DX10",
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))]
+>;
+
+def SETGE_DX10 : R600_2OP <
+ 0xE, "SETGE_DX10",
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
+>;
+
+// FIXME: This should probably be COND_ONE
+def SETNE_DX10 : R600_2OP <
+ 0xF, "SETNE_DX10",
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))]
+>;
+
+// FIXME: Need combine for AMDGPUfract
+def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
+def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>;
+def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
+def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
+def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
+
+def MOV : R600_1OP <0x19, "MOV", []>;
+
+let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
+
+// Pseudo instruction with a single immediate input of operand kind `immType`
+// and an R600_Reg32 result. It has an empty asm string and no selection
+// pattern of its own; the MOV_IMM_* defs that follow pair each instance with a
+// separate Pat. The `vt` parameter is not referenced in the class body.
+class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
+ (outs R600_Reg32:$dst),
+ (ins immType:$imm),
+ "",
+ []
+>;
+
+} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
+
+def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
+def : Pat <
+ (imm:$val),
+ (MOV_IMM_I32 imm:$val)
+>;
+
+def MOV_IMM_GLOBAL_ADDR : MOV_IMM<iPTR, i32imm>;
+def : Pat <
+ (AMDGPUconstdata_ptr tglobaladdr:$addr),
+ (MOV_IMM_GLOBAL_ADDR tglobaladdr:$addr)
+>;
+
+
+def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
+def : Pat <
+ (fpimm:$val),
+ (MOV_IMM_F32 fpimm:$val)
+>;
+
+def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>;
+def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>;
+def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>;
+def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>;
+
+let hasSideEffects = 1 in {
+
+def KILLGT : R600_2OP <0x2D, "KILLGT", []>;
+
+} // end hasSideEffects
+
+def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>;
+def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>;
+def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
+def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
+def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
+def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
+def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>;
+def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>;
+def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>;
+def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>;
+
+def SETE_INT : R600_2OP <
+ 0x3A, "SETE_INT",
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))]
+>;
+
+def SETGT_INT : R600_2OP <
+ 0x3B, "SETGT_INT",
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))]
+>;
+
+def SETGE_INT : R600_2OP <
+ 0x3C, "SETGE_INT",
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))]
+>;
+
+def SETNE_INT : R600_2OP <
+ 0x3D, "SETNE_INT",
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))]
+>;
+
+def SETGT_UINT : R600_2OP <
+ 0x3E, "SETGT_UINT",
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))]
+>;
+
+def SETGE_UINT : R600_2OP <
+ 0x3F, "SETGE_UINT",
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))]
+>;
+
+// Integer predicate-set instructions (opcodes 0x42-0x45). None of them has a
+// selection pattern attached here. Each asm string matches its def name so the
+// four opcodes print distinctly.
+def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
+def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGT_INT", []>;
+def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>;
+def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
+
+// Integer conditional-select instructions. Each pattern compares $src0 against
+// the constant 0 with the listed condition code and yields $src1 when the
+// comparison holds, otherwise $src2. Note that the defs are not listed in
+// opcode order (0x1C, 0x1E, 0x1D).
+
+// $dst = ($src0 == 0) ? $src1 : $src2
+def CNDE_INT : R600_3OP <
+ 0x1C, "CNDE_INT",
+ [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))]
+>;
+
+// $dst = ($src0 >= 0, signed) ? $src1 : $src2
+def CNDGE_INT : R600_3OP <
+ 0x1E, "CNDGE_INT",
+ [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))]
+>;
+
+// $dst = ($src0 > 0, signed) ? $src1 : $src2
+def CNDGT_INT : R600_3OP <
+ 0x1D, "CNDGT_INT",
+ [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))]
+>;
+
+//===----------------------------------------------------------------------===//
+// Texture instructions
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+
+// Base class for texture instructions.
+//
+// inst   - opcode; only its low five bits (inst{4-0}) are stored in TEX_INST.
+// opName - mnemonic used to build the asm string.
+//
+// The result and the source are both 128-bit registers. The asm string prints
+// the destination with its four DST_SEL_* selectors, the source with its four
+// src* selectors, the resource and sampler ids, and the four coordinate types;
+// the $offsetx/$offsety/$offsetz operands are not printed.
+//
+// Only Word0 and Word1 are placed into Inst here even though TEX_WORD2 is also
+// a parent class.
+// NOTE(review): Word2 is presumably encoded outside TableGen (e.g. by the code
+// emitter) - confirm.
+class R600_TEX <bits<11> inst, string opName> :
+ InstR600 <(outs R600_Reg128:$DST_GPR),
+ (ins R600_Reg128:$SRC_GPR,
+ RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw,
+ i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz,
+ RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W,
+ i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
+ CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z,
+ CT:$COORD_TYPE_W),
+ !strconcat(" ", opName,
+ " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, "
+ "$SRC_GPR.$srcx$srcy$srcz$srcw "
+ "RID:$RESOURCE_ID SID:$SAMPLER_ID "
+ "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"),
+ [],
+ NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 {
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+
+ // The upper bits of the 11-bit `inst` parameter are dropped.
+ let TEX_INST = inst{4-0};
+ let SRC_REL = 0;
+ let DST_REL = 0;
+ let LOD_BIAS = 0;
+
+ // Defaults; individual defs may override them (TEX_LDPTR sets INST_MOD = 1).
+ let INST_MOD = 0;
+ let FETCH_WHOLE_QUAD = 0;
+ let ALT_CONST = 0;
+ let SAMPLER_INDEX_MODE = 0;
+ let RESOURCE_INDEX_MODE = 0;
+
+ let TEXInst = 1;
+}
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
+
+
+
+// Concrete texture instructions. The first argument is the opcode passed to
+// R600_TEX and the second is the printed mnemonic. TEX_LD and TEX_LDPTR share
+// opcode 0x03; TEX_LDPTR differs only by overriding INST_MOD to 1.
+def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">;
+def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">;
+def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">;
+def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">;
+def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">;
+def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">;
+def TEX_LD : R600_TEX <0x03, "TEX_LD">;
+def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> {
+ let INST_MOD = 1;
+}
+def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">;
+def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">;
+def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">;
+def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">;
+def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">;
+def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">;
+def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">;
+
+defm : TexPattern<0, TEX_SAMPLE>;
+defm : TexPattern<1, TEX_SAMPLE_C>;
+defm : TexPattern<2, TEX_SAMPLE_L>;
+defm : TexPattern<3, TEX_SAMPLE_C_L>;
+defm : TexPattern<4, TEX_SAMPLE_LB>;
+defm : TexPattern<5, TEX_SAMPLE_C_LB>;
+defm : TexPattern<6, TEX_LD, v4i32>;
+defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>;
+defm : TexPattern<8, TEX_GET_GRADIENTS_H>;
+defm : TexPattern<9, TEX_GET_GRADIENTS_V>;
+defm : TexPattern<10, TEX_LDPTR, v4i32>;
+
+//===----------------------------------------------------------------------===//
+// Helper classes for common instructions
+//===----------------------------------------------------------------------===//
+
+class MUL_LIT_Common <bits<5> inst> : R600_3OP <
+ inst, "MUL_LIT",
+ []
+>;
+
+class MULADD_Common <bits<5> inst> : R600_3OP <
+ inst, "MULADD",
+ []
+>;
+
+class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
+ inst, "MULADD_IEEE",
+ [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))]
+>;
+
+class FMA_Common <bits<5> inst> : R600_3OP <
+ inst, "FMA",
+ [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU
+>;
+
+class CNDE_Common <bits<5> inst> : R600_3OP <
+ inst, "CNDE",
+ [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
+>;
+
+class CNDGT_Common <bits<5> inst> : R600_3OP <
+ inst, "CNDGT",
+ [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))]
+> {
+ let Itinerary = VecALU;
+}
+
+class CNDGE_Common <bits<5> inst> : R600_3OP <
+ inst, "CNDGE",
+ [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))]
+> {
+ let Itinerary = VecALU;
+}
+
+
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
+// Slot X
+ UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
+ OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
+ R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
+ R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X,
+ R600_Pred:$pred_sel_X,
+// Slot Y
+ UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y,
+ OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
+ R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
+ R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y,
+ R600_Pred:$pred_sel_Y,
+// Slot Z
+ UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z,
+ OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
+ R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
+ R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z,
+ R600_Pred:$pred_sel_Z,
+// Slot W
+ UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W,
+ OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
+ R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
+ R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W,
+ R600_Pred:$pred_sel_W,
+ LITERAL:$literal0, LITERAL:$literal1),
+ "",
+ pattern,
+ AnyALU> {
+
+ let UseNamedOperandTable = 1;
+
+}
+}
+
+def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
+ R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
+ R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
+ R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
+ R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
+
+
+class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>;
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+// Defines a pair of CUBE instructions for the opcode `inst`:
+//   <name>_pseudo - a pseudo instruction on 128-bit registers that carries the
+//                   selection pattern for the int_AMDGPU_cube intrinsic.
+//   <name>_real   - a two-operand instruction with the actual opcode and no
+//                   selection pattern.
+// NOTE(review): the pseudo is presumably expanded into _real instructions
+// elsewhere in the backend - confirm.
+multiclass CUBE_Common <bits<11> inst> {
+
+ def _pseudo : InstR600 <
+ (outs R600_Reg128:$dst),
+ (ins R600_Reg128:$src0),
+ "CUBE $dst $src0",
+ [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))],
+ VecALU
+ > {
+ let isPseudo = 1;
+ let UseNamedOperandTable = 1;
+ }
+
+ def _real : R600_2OP <inst, "CUBE", []>;
+}
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
+
+class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "EXP_IEEE", fexp2
+> {
+ let Itinerary = TransALU;
+}
+
+class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "FLT_TO_INT", fp_to_sint
+> {
+ let Itinerary = TransALU;
+}
+
+class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "INT_TO_FLT", sint_to_fp
+> {
+ let Itinerary = TransALU;
+}
+
+class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "FLT_TO_UINT", fp_to_uint
+> {
+ let Itinerary = TransALU;
+}
+
+class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "UINT_TO_FLT", uint_to_fp
+> {
+ let Itinerary = TransALU;
+}
+
+class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
+ inst, "LOG_CLAMPED", []
+>;
+
+class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "LOG_IEEE", flog2
+> {
+ let Itinerary = TransALU;
+}
+
+class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
+class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
+class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
+class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
+ inst, "MULHI_INT", mulhs> {
+ let Itinerary = TransALU;
+}
+
+class MULHI_INT24_Common <bits<11> inst> : R600_2OP_Helper <
+ inst, "MULHI_INT24", AMDGPUmulhi_i24> {
+ let Itinerary = VecALU;
+}
+
+// Unsigned multiply-high, selected for the mulhu node and scheduled on the
+// TransALU itinerary.
+// NOTE(review): the asm name is "MULHI", whereas the sibling classes print
+// their full name ("MULHI_INT", "MULHI_UINT24", "MULLO_UINT"). This looks like
+// it was meant to be "MULHI_UINT" - confirm against the ISA mnemonic and any
+// tests matching the printed name before changing it.
+class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
+ inst, "MULHI", mulhu> {
+ let Itinerary = TransALU;
+}
+
+class MULHI_UINT24_Common <bits<11> inst> : R600_2OP_Helper <
+ inst, "MULHI_UINT24", AMDGPUmulhi_u24> {
+ let Itinerary = VecALU;
+}
+
+class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
+ inst, "MULLO_INT", mul> {
+ let Itinerary = TransALU;
+}
+class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> {
+ let Itinerary = TransALU;
+}
+
+class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
+ inst, "RECIP_CLAMPED", []
+> {
+ let Itinerary = TransALU;
+}
+
+class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
+ inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))]
+> {
+ let Itinerary = TransALU;
+}
+
+class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "RECIP_UINT", AMDGPUurecip
+> {
+ let Itinerary = TransALU;
+}
+
+// Clamped to maximum.
+class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamp
+> {
+ let Itinerary = TransALU;
+}
+
+class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "RECIPSQRT_IEEE", AMDGPUrsq> {
+ let Itinerary = TransALU;
+}
+
+// TODO: There is also RECIPSQRT_FF which clamps to zero.
+
+class SIN_Common <bits<11> inst> : R600_1OP <
+ inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
+ let Trig = 1;
+ let Itinerary = TransALU;
+}
+
+class COS_Common <bits<11> inst> : R600_1OP <
+ inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> {
+ let Trig = 1;
+ let Itinerary = TransALU;
+}
+
+def CLAMP_R600 : CLAMP <R600_Reg32>;
+def FABS_R600 : FABS<R600_Reg32>;
+def FNEG_R600 : FNEG<R600_Reg32>;
+
+//===----------------------------------------------------------------------===//
+// Helper patterns for complex intrinsics
+//===----------------------------------------------------------------------===//
+
+// Division helper patterns for a given reciprocal instruction `recip_ieee`:
+//   * f32 fdiv is selected as MUL_IEEE($src0, recip_ieee($src1)), i.e. a
+//     multiplication by the reciprocal of the divisor.
+//   * RcpPat is instantiated for `recip_ieee` on f32.
+// FIXME: Should be predicated on unsafe fp math.
+multiclass DIV_Common <InstR600 recip_ieee> {
+def : Pat<
+ (fdiv f32:$src0, f32:$src1),
+ (MUL_IEEE $src0, (recip_ieee $src1))
+>;
+
+def : RcpPat<recip_ieee, f32>;
+}
+
+//===----------------------------------------------------------------------===//
+// R600 / R700 Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isR600] in {
+
+ def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
+ def MULADD_r600 : MULADD_Common<0x10>;
+ def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>;
+ def CNDE_r600 : CNDE_Common<0x18>;
+ def CNDGT_r600 : CNDGT_Common<0x19>;
+ def CNDGE_r600 : CNDGE_Common<0x1A>;
+ def DOT4_r600 : DOT4_Common<0x50>;
+ defm CUBE_r600 : CUBE_Common<0x52>;
+ def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
+ def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
+ def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
+ def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
+ def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
+ def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
+ def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
+ def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
+ def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
+ def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>;
+ def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>;
+ def SIN_r600 : SIN_Common<0x6E>;
+ def COS_r600 : COS_Common<0x6F>;
+ def ASHR_r600 : ASHR_Common<0x70>;
+ def LSHR_r600 : LSHR_Common<0x71>;
+ def LSHL_r600 : LSHL_Common<0x72>;
+ def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
+ def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
+ def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
+ def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
+ def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
+
+ defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
+ def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
+
+ def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
+ def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
+
+ def R600_ExportSwz : ExportSwzInst {
+ let Word1{20-17} = 0; // BURST_COUNT
+ let Word1{21} = eop;
+ let Word1{22} = 0; // VALID_PIXEL_MODE
+ let Word1{30-23} = inst;
+ let Word1{31} = 1; // BARRIER
+ }
+ defm : ExportPattern<R600_ExportSwz, 39>;
+
+ def R600_ExportBuf : ExportBufInst {
+ let Word1{20-17} = 0; // BURST_COUNT
+ let Word1{21} = eop;
+ let Word1{22} = 0; // VALID_PIXEL_MODE
+ let Word1{30-23} = inst;
+ let Word1{31} = 1; // BARRIER
+ }
+ defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
+
+ def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT),
+ "TEX $CNT @$ADDR"> {
+ let POP_COUNT = 0;
+ }
+ def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT),
+ "VTX $CNT @$ADDR"> {
+ let POP_COUNT = 0;
+ }
+ def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR),
+ "LOOP_START_DX10 @$ADDR"> {
+ let POP_COUNT = 0;
+ let CNT = 0;
+ }
+ def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+ let POP_COUNT = 0;
+ let CNT = 0;
+ }
+ def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR),
+ "LOOP_BREAK @$ADDR"> {
+ let POP_COUNT = 0;
+ let CNT = 0;
+ }
+ def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR),
+ "CONTINUE @$ADDR"> {
+ let POP_COUNT = 0;
+ let CNT = 0;
+ }
+ def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "JUMP @$ADDR POP:$POP_COUNT"> {
+ let CNT = 0;
+ }
+ def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR),
+ "PUSH_ELSE @$ADDR"> {
+ let CNT = 0;
+ let POP_COUNT = 0; // FIXME?
+ }
+ def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "ELSE @$ADDR POP:$POP_COUNT"> {
+ let CNT = 0;
+ }
+ def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> {
+ let ADDR = 0;
+ let CNT = 0;
+ let POP_COUNT = 0;
+ }
+ def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "POP @$ADDR POP:$POP_COUNT"> {
+ let CNT = 0;
+ }
+ def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> {
+ let CNT = 0;
+ let POP_COUNT = 0;
+ let ADDR = 0;
+ let END_OF_PROGRAM = 1;
+ }
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// Register loads and stores - for indirect addressing
+//===----------------------------------------------------------------------===//
+
+defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+// Pseudo instruction producing a predicate bit from a register, an immediate
+// and a flags immediate. It has no asm string and no selection pattern.
+// FlagOperandIdx = 3 presumably refers to $flags, counting the output $dst as
+// operand 0 - confirm against how FlagOperandIdx is consumed.
+def PRED_X : InstR600 <
+ (outs R600_Predicate_Bit:$dst),
+ (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
+ "", [], NullALU> {
+ let FlagOperandIdx = 3;
+}
+
+let isTerminator = 1, isBranch = 1 in {
+def JUMP_COND : InstR600 <
+ (outs),
+ (ins brtarget:$target, R600_Predicate_Bit:$p),
+ "JUMP $target ($p)",
+ [], AnyALU
+ >;
+
+def JUMP : InstR600 <
+ (outs),
+ (ins brtarget:$target),
+ "JUMP $target",
+ [], AnyALU
+ >
+{
+ let isPredicable = 1;
+ let isBarrier = 1;
+}
+
+} // End isTerminator = 1, isBranch = 1
+
+let usesCustomInserter = 1 in {
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
+
+def MASK_WRITE : AMDGPUShaderInst <
+ (outs),
+ (ins R600_Reg32:$src),
+ "MASK_WRITE $src",
+ []
+>;
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
+
+
+def TXD: InstR600 <
+ (outs R600_Reg128:$dst),
+ (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
+ i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+ "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", [],
+ NullALU > {
+ let TEXInst = 1;
+}
+
+def TXD_SHADOW: InstR600 <
+ (outs R600_Reg128:$dst),
+ (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
+ i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+ "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
+ [], NullALU> {
+ let TEXInst = 1;
+}
+} // End usesCustomInserter = 1
+} // End isPseudo = 1
+
+
+//===----------------------------------------------------------------------===//
+// Constant Buffer Addressing Support
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+def CONST_COPY : Instruction {
+ let OutOperandList = (outs R600_Reg32:$dst);
+ let InOperandList = (ins i32imm:$src);
+ let Pattern =
+ [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
+ let AsmString = "CONST_COPY";
+ let hasSideEffects = 0;
+ let isAsCheapAsAMove = 1;
+ let Itinerary = NullALU;
+}
+} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
+
+def TEX_VTX_CONSTBUF :
+ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$buffer_id), "VTX_READ_eg $dst, $ptr",
+ [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$buffer_id)))]>,
+ VTX_WORD1_GPR, VTX_WORD0_eg {
+
+ let VC_INST = 0;
+ let FETCH_TYPE = 2;
+ let FETCH_WHOLE_QUAD = 0;
+ let SRC_REL = 0;
+ let SRC_SEL_X = 0;
+ let DST_REL = 0;
+ let USE_CONST_FIELDS = 0;
+ let NUM_FORMAT_ALL = 2;
+ let FORMAT_COMP_ALL = 1;
+ let SRF_MODE_ALL = 1;
+ let MEGA_FETCH_COUNT = 16;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 2;
+ let DST_SEL_W = 3;
+ let DATA_FORMAT = 35;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+
+// LLVM can only encode 64-bit instructions, so these fields are manually
+// encoded in R600CodeEmitter
+//
+// bits<16> OFFSET;
+// bits<2> ENDIAN_SWAP = 0;
+// bits<1> CONST_BUF_NO_STRIDE = 0;
+// bits<1> MEGA_FETCH = 0;
+// bits<1> ALT_CONST = 0;
+// bits<2> BUFFER_INDEX_MODE = 0;
+
+
+
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+// is done in R600CodeEmitter
+//
+// Inst{79-64} = OFFSET;
+// Inst{81-80} = ENDIAN_SWAP;
+// Inst{82} = CONST_BUF_NO_STRIDE;
+// Inst{83} = MEGA_FETCH;
+// Inst{84} = ALT_CONST;
+// Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-87} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+ let VTXInst = 1;
+}
+
+def TEX_VTX_TEXBUF:
+ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$buffer_id), "TEX_VTX_EXPLICIT_READ $dst, $ptr">,
+VTX_WORD1_GPR, VTX_WORD0_eg {
+
+let VC_INST = 0;
+let FETCH_TYPE = 2;
+let FETCH_WHOLE_QUAD = 0;
+let SRC_REL = 0;
+let SRC_SEL_X = 0;
+let DST_REL = 0;
+let USE_CONST_FIELDS = 1;
+let NUM_FORMAT_ALL = 0;
+let FORMAT_COMP_ALL = 0;
+let SRF_MODE_ALL = 1;
+let MEGA_FETCH_COUNT = 16;
+let DST_SEL_X = 0;
+let DST_SEL_Y = 1;
+let DST_SEL_Z = 2;
+let DST_SEL_W = 3;
+let DATA_FORMAT = 0;
+
+let Inst{31-0} = Word0;
+let Inst{63-32} = Word1;
+
+// LLVM can only encode 64-bit instructions, so these fields are manually
+// encoded in R600CodeEmitter
+//
+// bits<16> OFFSET;
+// bits<2> ENDIAN_SWAP = 0;
+// bits<1> CONST_BUF_NO_STRIDE = 0;
+// bits<1> MEGA_FETCH = 0;
+// bits<1> ALT_CONST = 0;
+// bits<2> BUFFER_INDEX_MODE = 0;
+
+
+
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+// is done in R600CodeEmitter
+//
+// Inst{79-64} = OFFSET;
+// Inst{81-80} = ENDIAN_SWAP;
+// Inst{82} = CONST_BUF_NO_STRIDE;
+// Inst{83} = MEGA_FETCH;
+// Inst{84} = ALT_CONST;
+// Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-87} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+ let VTXInst = 1;
+}
+
+//===---------------------------------------------------------------------===//
+// Flow and Program control Instructions
+//===---------------------------------------------------------------------===//
+// Base class for the flow-control pseudo instructions defined below.
+//
+// outs/ins - operand lists of the concrete instruction.
+// asmstr   - assembly text; a newline is appended to it.
+// pattern  - selection pattern list (may be empty).
+//
+// Every instance is a codegen-only pseudo in the AMDGPU namespace with the
+// NullALU itinerary, and is marked as not loading, not storing and having no
+// side effects. The class also declares two extra bits, hasIEEEFlag and
+// hasZeroOpFlag, both defaulting to 0.
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+ let Namespace = "AMDGPU";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let Pattern = pattern;
+ let AsmString = !strconcat(asmstr, "\n");
+ let isPseudo = 1;
+ let Itinerary = NullALU;
+ bit hasIEEEFlag = 0;
+ bit hasZeroOpFlag = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let isCodeGenOnly = 1;
+}
+
+// Defines two conditional-branch pseudos, <name>_i32 and <name>_f32, that
+// select the node `Op` applied to a basic-block target and a condition value.
+//
+// Op  - SelectionDAG node matched by both patterns.
+// rci - register class of the i32 condition operand.
+// rcf - register class of the f32 condition operand.
+//
+// The asm strings are comments only ("; i32 Pseudo branch instruction").
+multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
+ def _i32 : ILFormat<(outs),
+ (ins brtarget:$target, rci:$src0),
+ "; i32 Pseudo branch instruction",
+ [(Op bb:$target, (i32 rci:$src0))]>;
+ def _f32 : ILFormat<(outs),
+ (ins brtarget:$target, rcf:$src0),
+ "; f32 Pseudo branch instruction",
+ [(Op bb:$target, (f32 rcf:$src0))]>;
+}
+
+// Only scalar types should generate flow control
+multiclass BranchInstr<string name> {
+ def _i32 : ILFormat<(outs), (ins R600_Reg32:$src),
+ !strconcat(name, " $src"), []>;
+ def _f32 : ILFormat<(outs), (ins R600_Reg32:$src),
+ !strconcat(name, " $src"), []>;
+}
+// Only scalar types should generate flow control
+multiclass BranchInstr2<string name> {
+ def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+ !strconcat(name, " $src0, $src1"), []>;
+ def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+ !strconcat(name, " $src0, $src1"), []>;
+}
+
+//===---------------------------------------------------------------------===//
+// Custom Inserter for Branches and returns, this eventually will be a
+// separate pass
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
+ def BRANCH : ILFormat<(outs), (ins brtarget:$target),
+ "; Pseudo unconditional branch instruction",
+ [(br bb:$target)]>;
+ defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
+}
+
+//===---------------------------------------------------------------------===//
+// Return instruction
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
+ usesCustomInserter = 1 in {
+ def RETURN : ILFormat<(outs), (ins variable_ops),
+ "RETURN", [(AMDGPUendpgm)]
+ >;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Instructions
+//===----------------------------------------------------------------------===//
+
+def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src),
+ "IF_PREDICATE_SET $src", []>;
+
+let isTerminator=1 in {
+ def BREAK : ILFormat< (outs), (ins),
+ "BREAK", []>;
+ def CONTINUE : ILFormat< (outs), (ins),
+ "CONTINUE", []>;
+ def DEFAULT : ILFormat< (outs), (ins),
+ "DEFAULT", []>;
+ def ELSE : ILFormat< (outs), (ins),
+ "ELSE", []>;
+ def ENDSWITCH : ILFormat< (outs), (ins),
+ "ENDSWITCH", []>;
+ def ENDMAIN : ILFormat< (outs), (ins),
+ "ENDMAIN", []>;
+ def END : ILFormat< (outs), (ins),
+ "END", []>;
+ def ENDFUNC : ILFormat< (outs), (ins),
+ "ENDFUNC", []>;
+ def ENDIF : ILFormat< (outs), (ins),
+ "ENDIF", []>;
+ def WHILELOOP : ILFormat< (outs), (ins),
+ "WHILE", []>;
+ def ENDLOOP : ILFormat< (outs), (ins),
+ "ENDLOOP", []>;
+ def FUNC : ILFormat< (outs), (ins),
+ "FUNC", []>;
+ def RETDYN : ILFormat< (outs), (ins),
+ "RET_DYN", []>;
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+ defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">;
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+ defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">;
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+ defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+ defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+ defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+ defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
+ defm IFC : BranchInstr2<"IFC">;
+ defm BREAKC : BranchInstr2<"BREAKC">;
+ defm CONTINUEC : BranchInstr2<"CONTINUEC">;
+}
+
+//===----------------------------------------------------------------------===//
+// Indirect addressing pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+class ExtractVertical <RegisterClass vec_rc> : InstR600 <
+ (outs R600_Reg32:$dst),
+ (ins vec_rc:$vec, R600_Reg32:$index), "",
+ [],
+ AnyALU
+>;
+
+let Constraints = "$dst = $vec" in {
+
+class InsertVertical <RegisterClass vec_rc> : InstR600 <
+ (outs vec_rc:$dst),
+ (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "",
+ [],
+ AnyALU
+>;
+
+} // End Constraints = "$dst = $vec"
+
+} // End isPseudo = 1
+
+def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>;
+def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>;
+
+def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
+def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
+
+class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
+ ValueType scalar_ty> : Pat <
+ (scalar_ty (extractelt vec_ty:$vec, i32:$index)),
+ (inst $vec, $index)
+>;
+
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
+
+class InsertVerticalPat <Instruction inst, ValueType vec_ty,
+ ValueType scalar_ty> : Pat <
+ (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
+ (inst $vec, $value, $index)
+>;
+
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
+
+//===----------------------------------------------------------------------===//
+// ISel Patterns
+//===----------------------------------------------------------------------===//
+
+// CND*_INT Patterns for f32 True / False values
+//
+// The CND*_INT instruction defs only carry patterns whose selected values are
+// i32. This class adds the variant where the comparison is still an i32
+// compare of $src0 against 0 but the two selected values are f32.
+//
+// cnd - CND*_INT instruction to emit.
+// cc  - condition code of the matched selectcc.
+
+class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
+ (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc),
+ (cnd $src0, $src1, $src2)
+>;
+
+def : CND_INT_f32 <CNDE_INT, SETEQ>;
+def : CND_INT_f32 <CNDGT_INT, SETGT>;
+def : CND_INT_f32 <CNDGE_INT, SETGE>;
+
+//CNDGE_INT extra pattern
+// A signed "$src0 > -1" test is the same as "$src0 >= 0" for i32, so this
+// form of selectcc is also mapped onto CNDGE_INT.
+def : Pat <
+ (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT),
+ (CNDGE_INT $src0, $src1, $src2)
+>;
+
+// KIL Patterns
+def KILP : Pat <
+ (int_AMDGPU_kilp),
+ (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
+>;
+
+def KIL : Pat <
+ (int_AMDGPU_kill f32:$src0),
+ (MASK_WRITE (KILLGT (f32 ZERO), $src0))
+>;
+
+def : Extract_Element <f32, v4f32, 0, sub0>;
+def : Extract_Element <f32, v4f32, 1, sub1>;
+def : Extract_Element <f32, v4f32, 2, sub2>;
+def : Extract_Element <f32, v4f32, 3, sub3>;
+
+def : Insert_Element <f32, v4f32, 0, sub0>;
+def : Insert_Element <f32, v4f32, 1, sub1>;
+def : Insert_Element <f32, v4f32, 2, sub2>;
+def : Insert_Element <f32, v4f32, 3, sub3>;
+
+def : Extract_Element <i32, v4i32, 0, sub0>;
+def : Extract_Element <i32, v4i32, 1, sub1>;
+def : Extract_Element <i32, v4i32, 2, sub2>;
+def : Extract_Element <i32, v4i32, 3, sub3>;
+
+def : Insert_Element <i32, v4i32, 0, sub0>;
+def : Insert_Element <i32, v4i32, 1, sub1>;
+def : Insert_Element <i32, v4i32, 2, sub2>;
+def : Insert_Element <i32, v4i32, 3, sub3>;
+
+def : Extract_Element <f32, v2f32, 0, sub0>;
+def : Extract_Element <f32, v2f32, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, 0, sub0>;
+def : Insert_Element <f32, v2f32, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, 0, sub0>;
+def : Extract_Element <i32, v2i32, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, 0, sub0>;
+def : Insert_Element <i32, v2i32, 1, sub1>;
+
+// bitconvert patterns
+
+def : BitConvert <i32, f32, R600_Reg32>;
+def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
+def : BitConvert <v4f32, v4i32, R600_Reg128>;
+def : BitConvert <v4i32, v4f32, R600_Reg128>;
+
+// DWORDADDR pattern
+def : DwordAddrPat <i32, R600_Reg32>;
+
+} // End isR600toCayman Predicate
+
+// Instruction mapping table over the instructions derived from R600_LDS_1A1D.
+// Instructions that share the same BaseOp form one row; the column is chosen
+// by the DisableEncoding field. The generated lookup goes from the instruction
+// whose DisableEncoding is "$dst" (the key column) to the instruction in the
+// same row identified by the value column below.
+// NOTE(review): `""""` is presumably two adjacent empty string literals, i.e.
+// an empty DisableEncoding - confirm.
+def getLDSNoRetOp : InstrMapping {
+ let FilterClass = "R600_LDS_1A1D";
+ let RowFields = ["BaseOp"];
+ let ColFields = ["DisableEncoding"];
+ let KeyCol = ["$dst"];
+ let ValueCols = [[""""]];
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td
new file mode 100644
index 000000000000..a5310e9fd6d0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td
@@ -0,0 +1,67 @@
+//===-- R600Intrinsics.td - R600 Intrinsic defs --------*- tablegen -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Intrinsic Definitions
+//
+//===----------------------------------------------------------------------===//
+
+// Signature shared by the texture intrinsics that take a float coordinate:
+// returns a v4f32 and takes a v4f32 coordinate followed by nine i32 operands.
+// Marked IntrNoMem.
+class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
+ llvm_v4f32_ty, // coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y
+ llvm_i32_ty, // offset_z
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // sampler_id
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty], // coord_type_w
+ [IntrNoMem]
+>;
+
+// Integer counterpart of the float texture signature: returns a v4i32 and
+// takes a v4i32 coordinate followed by nine i32 operands. Marked IntrNoMem.
+class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
+ llvm_v4i32_ty, // coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y
+ llvm_i32_ty, // offset_z
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // sampler_id
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty], // coord_type_w
+ [IntrNoMem]
+>;
+
+// Target intrinsics carrying the "r600" prefix.
+let TargetPrefix = "r600", isTarget = 1 in {
+
+// No result; takes a v4f32 and two i32 operands. No memory attribute is
+// given, so it is not declared IntrNoMem.
+def int_r600_store_swizzle :
+ Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+// No result; takes a v4f32 and three i32 operands. No memory attribute is
+// given, so it is not declared IntrNoMem.
+def int_r600_store_stream_output : Intrinsic<
+ [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+// Texture intrinsics. txf and txq use the v4i32 signature; every other one
+// uses the v4f32 signature.
+def int_r600_tex : TextureIntrinsicFloatInput;
+def int_r600_texc : TextureIntrinsicFloatInput;
+def int_r600_txl : TextureIntrinsicFloatInput;
+def int_r600_txlc : TextureIntrinsicFloatInput;
+def int_r600_txb : TextureIntrinsicFloatInput;
+def int_r600_txbc : TextureIntrinsicFloatInput;
+def int_r600_txf : TextureIntrinsicInt32Input;
+def int_r600_txq : TextureIntrinsicInt32Input;
+def int_r600_ddx : TextureIntrinsicFloatInput;
+def int_r600_ddy : TextureIntrinsicFloatInput;
+
+// Takes two v4f32 operands and returns a single float; IntrNoMem.
+def int_r600_dot4 : Intrinsic<[llvm_float_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]
+>;
+
+} // End TargetPrefix = "r600", isTarget = 1
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..3ca319c6c6c2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
@@ -0,0 +1,16 @@
+//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+/// Builds the R600 per-function info for \p MF. All work is delegated to the
+/// AMDGPUMachineFunction base-class constructor.
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
+    : AMDGPUMachineFunction(MF) {}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h
new file mode 100644
index 000000000000..29ac0920f997
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h
@@ -0,0 +1,28 @@
+//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H
+
+#include "AMDGPUMachineFunction.h"
+
+namespace llvm {
+
+/// R600-specific per-function state layered on top of AMDGPUMachineFunction.
+class R600MachineFunctionInfo final : public AMDGPUMachineFunction {
+public:
+  R600MachineFunctionInfo(const MachineFunction &MF);
+
+  /// Presumably the control-flow stack size computed for the function
+  /// (TODO confirm against the writer). Zero-initialized so that a read which
+  /// happens before any writer runs yields a defined value.
+  unsigned CFStackSize = 0;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
new file mode 100644
index 000000000000..db18e5bd1afa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -0,0 +1,467 @@
+//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600MachineScheduler.h"
+#include "R600InstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+/// Binds the strategy to \p dag and resets all per-region scheduling state.
+/// The DAG must track virtual register liveness.
+void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
+  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
+
+  // Cache the DAG and the target hooks it carries.
+  DAG = static_cast<ScheduleDAGMILive *>(dag);
+  MRI = &DAG->MRI;
+  TII = static_cast<const R600InstrInfo *>(DAG->TII);
+  TRI = static_cast<const R600RegisterInfo *>(DAG->TRI);
+
+  const R600Subtarget &Subtarget = DAG->MF.getSubtarget<R600Subtarget>();
+  VLIW5 = !Subtarget.hasCaymanISA();
+
+  // Per-kind limits on how many instructions one clause may hold.
+  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
+  InstKindLimit[IDFetch] = Subtarget.getTexVTXClauseSize();
+  InstKindLimit[IDOther] = 32;
+
+  // Start in an "other" clause with every slot flagged as occupied.
+  CurInstKind = IDOther;
+  OccupedSlotsMask = 31;
+  CurEmitted = 0;
+  AluInstCount = 0;
+  FetchInstCount = 0;
+}
+
+/// Appends every unit of \p QSrc to the end of \p QDst, preserving order, and
+/// leaves \p QSrc empty.
+void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
+                                  std::vector<SUnit *> &QDst) {
+  for (SUnit *Unit : QSrc)
+    QDst.push_back(Unit);
+  QSrc.clear();
+}
+
+/// Returns how many times \p GPRCount fits into a budget of 248 registers
+/// (integer division). \p GPRCount must be non-zero.
+static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+  assert(GPRCount != 0 && "GPRCount cannot be 0");
+  const unsigned GPRBudget = 248;
+  return GPRBudget / GPRCount;
+}
+
+/// Selects the next unit to schedule. \p IsTopNode is always set to false.
+/// An ALU instruction is tried first when the clause-switch conditions below
+/// allow it, then a fetch instruction, then anything else; the kind of the
+/// chosen unit is recorded in NextInstKind.
+/// \returns the picked unit, or nullptr when no queue yields one.
+SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
+ SUnit *SU = nullptr;
+ NextInstKind = IDOther;
+
+ IsTopNode = false;
+
+ // check if we might want to switch current clause type
+ // Switching to ALU is allowed once the current clause is full or has nothing
+ // left to offer; leaving ALU requires a full clause and a waiting non-ALU
+ // instruction.
+ bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
+ (Available[CurInstKind].empty());
+ bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
+ (!Available[IDFetch].empty() || !Available[IDOther].empty());
+
+ if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+ // We use the heuristic provided by AMD Accelerated Parallel Processing
+ // OpenCL Programming Guide :
+ // The approx. number of WF that allows TEX inst to hide ALU inst is :
+ // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+ // NOTE(review): both operands of the division below are integers, so the
+ // quotient is truncated before it is stored in the float; any ratio below 1
+ // becomes 0 and takes the "== 0" branch. Confirm the truncation is intended.
+ float ALUFetchRationEstimate =
+ (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+ (FetchInstCount + Available[IDFetch].size());
+ if (ALUFetchRationEstimate == 0) {
+ AllowSwitchFromAlu = true;
+ } else {
+ // 62.5 is 500 / 8 from the formula above.
+ unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+ DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ // We assume the local GPR requirements to be "dominated" by the requirement
+ // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+ // after TEX are indeed likely to consume or generate values from/for the
+ // TEX clause.
+ // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+ // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+ // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+ // (TODO : use RegisterPressure)
+ // If we are going to use too many GPRs, we flush Fetch instructions to
+ // lower register pressure on 128 bits regs.
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+ if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ AllowSwitchFromAlu = true;
+ }
+ }
+
+ if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
+ (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
+ // try to pick ALU
+ SU = pickAlu();
+ // With no ALU candidate, fall back to the oldest queued physical-register
+ // copy.
+ if (!SU && !PhysicalRegCopy.empty()) {
+ SU = PhysicalRegCopy.front();
+ PhysicalRegCopy.erase(PhysicalRegCopy.begin());
+ }
+ if (SU) {
+ // A full ALU clause restarts the emitted-instruction count.
+ if (CurEmitted >= InstKindLimit[IDAlu])
+ CurEmitted = 0;
+ NextInstKind = IDAlu;
+ }
+ }
+
+ if (!SU) {
+ // try to pick FETCH
+ SU = pickOther(IDFetch);
+ if (SU)
+ NextInstKind = IDFetch;
+ }
+
+ // try to pick other
+ if (!SU) {
+ SU = pickOther(IDOther);
+ if (SU)
+ NextInstKind = IDOther;
+ }
+
+ // Debug builds only: print the pick, or every still-unscheduled unit when
+ // nothing could be picked.
+ DEBUG(
+ if (SU) {
+ dbgs() << " ** Pick node **\n";
+ SU->dump(DAG);
+ } else {
+ dbgs() << "NO NODE \n";
+ for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
+ const SUnit &S = DAG->SUnits[i];
+ if (!S.isScheduled)
+ S.dump(DAG);
+ }
+ }
+ );
+
+ return SU;
+}
+
+/// Updates the clause bookkeeping after \p SU has been scheduled: switches
+/// the current instruction kind if needed, accounts for the slots \p SU
+/// consumes in the current clause, and refreshes the fetch queues.
+void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+  const bool KindChanged = NextInstKind != CurInstKind;
+  if (KindChanged) {
+    DEBUG(dbgs() << "Instruction Type Switch\n");
+    // Leaving ALU code: flag every slot as occupied.
+    if (NextInstKind != IDAlu)
+      OccupedSlotsMask |= 31;
+    CurInstKind = NextInstKind;
+    CurEmitted = 0;
+  }
+
+  if (CurInstKind != IDAlu) {
+    ++CurEmitted;
+  } else {
+    ++AluInstCount;
+    const AluKind Kind = getAluKind(SU);
+    if (Kind == AluT_XYZW) {
+      // Counted as four instructions.
+      CurEmitted += 4;
+    } else if (Kind != AluDiscarded) {
+      // One for the instruction itself, plus one per ALU_LITERAL_X operand.
+      ++CurEmitted;
+      MachineInstr *MI = SU->getInstr();
+      for (MachineInstr::mop_iterator Op = MI->operands_begin(),
+                                      OpEnd = MI->operands_end();
+           Op != OpEnd; ++Op) {
+        if (Op->isReg() && Op->getReg() == AMDGPU::ALU_LITERAL_X)
+          ++CurEmitted;
+      }
+    }
+  }
+
+  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
+
+  if (CurInstKind == IDFetch)
+    ++FetchInstCount;
+  else
+    MoveUnits(Pending[IDFetch], Available[IDFetch]);
+}
+
+/// Returns true when \p MI is a COPY whose source operand (operand 1) is not
+/// a virtual register.
+static bool isPhysicalRegCopy(MachineInstr *MI) {
+  // The short-circuit keeps operand 1 untouched for non-COPY instructions.
+  return MI->getOpcode() == AMDGPU::COPY &&
+         !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
+}
+
+/// Top-down release hook: only logs \p SU in debug builds and queues nothing.
+void R600SchedStrategy::releaseTopNode(SUnit *SU) {
+  DEBUG({
+    dbgs() << "Top Releasing ";
+    SU->dump(DAG);
+  });
+}
+
+/// Bottom-up release hook: files \p SU into the queue matching its kind.
+/// Copies from physical registers go to PhysicalRegCopy, "other" instructions
+/// become available immediately, everything else waits in Pending.
+void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
+  DEBUG({
+    dbgs() << "Bottom Releasing ";
+    SU->dump(DAG);
+  });
+
+  if (isPhysicalRegCopy(SU->getInstr())) {
+    PhysicalRegCopy.push_back(SU);
+    return;
+  }
+
+  const int Kind = getInstKind(SU);
+
+  // There is no export clause, so an "other" instruction can be scheduled as
+  // soon as it is ready.
+  std::vector<SUnit *> &Queue =
+      (Kind == IDOther) ? Available[IDOther] : Pending[Kind];
+  Queue.push_back(SU);
+}
+
+/// Tests whether \p Reg belongs to \p RC. A virtual register must have
+/// exactly \p RC as its class; any other register must be contained in it.
+bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
+                                          const TargetRegisterClass *RC) const {
+  const bool IsVirtual = TargetRegisterInfo::isVirtualRegister(Reg);
+  return IsVirtual ? MRI->getRegClass(Reg) == RC : RC->contains(Reg);
+}
+
+/// Classifies \p SU by the ALU slot(s) its instruction can or must occupy.
+///
+/// The checks run in a fixed order: trans-only instructions, opcodes with a
+/// fixed classification, instructions that take a whole instruction group,
+/// LDS instructions, the destination sub-register, the destination register
+/// class, and finally LDS source-register readers. Anything left is AluAny.
+R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
+  MachineInstr *MI = SU->getInstr();
+
+  if (TII->isTransOnly(*MI))
+    return AluTrans;
+
+  switch (MI->getOpcode()) {
+  case AMDGPU::PRED_X:
+    return AluPredX;
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::DOT_4:
+    return AluT_XYZW;
+  case AMDGPU::COPY:
+    if (MI->getOperand(1).isUndef()) {
+      // MI will become a KILL, don't consider it in scheduling.
+      return AluDiscarded;
+    }
+    // Any other COPY is classified by the generic checks below. The explicit
+    // break replaces an implicit fall-through into the default label.
+    break;
+  default:
+    break;
+  }
+
+  // Does the instruction take a whole IG ?
+  // XXX: Is it possible to add a helper function in R600InstrInfo that can
+  // be used here and in R600PacketizerList::isSoloInstruction() ?
+  if (TII->isVector(*MI) ||
+      TII->isCubeOp(MI->getOpcode()) ||
+      TII->isReductionOp(MI->getOpcode()) ||
+      MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
+    return AluT_XYZW;
+  }
+
+  if (TII->isLDSInstr(MI->getOpcode())) {
+    return AluT_X;
+  }
+
+  // Is the result already assigned to a channel ?
+  unsigned DestSubReg = MI->getOperand(0).getSubReg();
+  switch (DestSubReg) {
+  case AMDGPU::sub0:
+    return AluT_X;
+  case AMDGPU::sub1:
+    return AluT_Y;
+  case AMDGPU::sub2:
+    return AluT_Z;
+  case AMDGPU::sub3:
+    return AluT_W;
+  default:
+    break;
+  }
+
+  // Is the result already member of a X/Y/Z/W class ?
+  unsigned DestReg = MI->getOperand(0).getReg();
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
+      regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+    return AluT_X;
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+    return AluT_Y;
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+    return AluT_Z;
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+    return AluT_W;
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+    return AluT_XYZW;
+
+  // LDS src registers cannot be used in the Trans slot.
+  if (TII->readsLDSSrcReg(*MI))
+    return AluT_XYZW;
+
+  return AluAny;
+}
+
+/// Maps \p SU to the clause kind it belongs to: IDFetch for instructions that
+/// use the texture or vertex cache, IDAlu for ALU instructions and a handful
+/// of listed pseudo opcodes, IDOther for the rest.
+int R600SchedStrategy::getInstKind(SUnit* SU) {
+  const int Opcode = SU->getInstr()->getOpcode();
+
+  // The cache check takes precedence over the ALU check.
+  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
+    return IDFetch;
+
+  bool IsAlu = TII->isALUInstr(Opcode);
+  if (!IsAlu) {
+    // Opcodes that isALUInstr() does not report but that still count as ALU.
+    switch (Opcode) {
+    case AMDGPU::PRED_X:
+    case AMDGPU::COPY:
+    case AMDGPU::CONST_COPY:
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::DOT_4:
+      IsAlu = true;
+      break;
+    default:
+      break;
+    }
+  }
+  return IsAlu ? IDAlu : IDOther;
+}
+
+/// Scans \p Q from back to front and removes and returns the first unit that
+/// can join the instruction group currently being built. A unit qualifies
+/// when adding it keeps the group within the constant-read limitations and,
+/// if \p AnyALU is set, it is not a vector-only instruction.
+/// \returns the removed unit, or nullptr if none qualifies.
+SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
+  typedef std::vector<SUnit *>::size_type IndexTy;
+  for (IndexTy Remaining = Q.size(); Remaining != 0; --Remaining) {
+    const IndexTy Idx = Remaining - 1;
+    SUnit *Candidate = Q[Idx];
+    MachineInstr *MI = Candidate->getInstr();
+
+    // Tentatively add the instruction to the group for the check only.
+    InstructionsGroupCandidate.push_back(MI);
+    const bool Fits =
+        TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
+        (!AnyALU || !TII->isVectorOnly(*MI));
+    InstructionsGroupCandidate.pop_back();
+
+    if (Fits) {
+      Q.erase(Q.begin() + Idx);
+      return Candidate;
+    }
+  }
+  return nullptr;
+}
+
+/// Drains the pending ALU queue, sorting each unit into the AvailableAlus
+/// bucket that matches its ALU kind.
+void R600SchedStrategy::LoadAlu() {
+  std::vector<SUnit *> &PendingAlus = Pending[IDAlu];
+  for (SUnit *Unit : PendingAlus)
+    AvailableAlus[getAluKind(Unit)].push_back(Unit);
+  PendingAlus.clear();
+}
+
+/// Starts a fresh instruction group: frees every slot, forgets the group
+/// built so far and refills the ALU buckets from the pending queue.
+void R600SchedStrategy::PrepareNextSlot() {
+  DEBUG(dbgs() << "New Slot\n");
+  assert (OccupedSlotsMask && "Slot wasn't filled");
+  InstructionsGroupCandidate.clear();
+  OccupedSlotsMask = 0;
+// if (HwGen == R600Subtarget::NORTHERN_ISLANDS)
+// OccupedSlotsMask |= 16;
+  LoadAlu();
+}
+
+/// Pins the destination register of \p MI to channel \p Slot (0..3 map to the
+/// X, Y, Z and W register classes) by constraining its register class.
+/// Nothing happens when \p MI has no dst operand, when it also reads its own
+/// destination, or when \p Slot is outside 0..3.
+void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
+  const int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+  if (DstIndex == -1)
+    return;
+
+  const unsigned DestReg = MI->getOperand(DstIndex).getReg();
+
+  // PressureRegister crashes if an operand is def and used in the same inst
+  // and we try to constrain its regclass, so bail out in that case.
+  for (MachineInstr::mop_iterator Op = MI->operands_begin(),
+                                  OpEnd = MI->operands_end();
+       Op != OpEnd; ++Op) {
+    if (Op->isReg() && !Op->isDef() && Op->getReg() == DestReg)
+      return;
+  }
+
+  // Register class for each channel, indexed by slot number.
+  const TargetRegisterClass *const SlotClasses[] = {
+      &AMDGPU::R600_TReg32_XRegClass, &AMDGPU::R600_TReg32_YRegClass,
+      &AMDGPU::R600_TReg32_ZRegClass, &AMDGPU::R600_TReg32_WRegClass};
+  if (Slot < 4)
+    MRI->constrainRegClass(DestReg, SlotClasses[Slot]);
+}
+
+/// Tries to find an instruction for channel \p Slot. Units already tied to
+/// that channel are preferred; otherwise an unconstrained unit is taken and
+/// assigned to the slot.
+/// \returns the chosen unit, or nullptr if neither bucket yields one.
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
+  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
+
+  if (SUnit *Sloted = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu))
+    return Sloted;
+
+  SUnit *Unsloted = PopInst(AvailableAlus[AluAny], AnyAlu);
+  if (!Unsloted)
+    return nullptr;
+  AssignSlot(Unsloted->getInstr(), Slot);
+  return Unsloted;
+}
+
+/// Returns the total number of units waiting in the AvailableAlus buckets
+/// listed below.
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+  static const AluKind CountedKinds[] = {
+      AluAny,  AluT_XYZW, AluT_X,       AluT_Y,  AluT_Z,
+      AluT_W,  AluTrans,  AluDiscarded, AluPredX};
+
+  unsigned Total = 0;
+  for (AluKind Kind : CountedKinds)
+    Total += AvailableAlus[Kind].size();
+  return Total;
+}
+
+/// Picks the next ALU unit while building instruction groups slot by slot.
+/// OccupedSlotsMask tracks the slots used in the current group: bits 0-3 are
+/// the channels tested with (1 << Chan) below, and bit 4 (value 16) is the
+/// trans slot. When nothing fits the remaining slots, a new group is started
+/// and the search repeats.
+/// \returns the chosen unit, or nullptr once no ALU unit is available or
+/// pending.
+SUnit* R600SchedStrategy::pickAlu() {
+ while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
+ // The three picks below are only made at the start of an empty group.
+ if (!OccupedSlotsMask) {
+ // Bottom up scheduling : predX must come first
+ if (!AvailableAlus[AluPredX].empty()) {
+ OccupedSlotsMask |= 31;
+ return PopInst(AvailableAlus[AluPredX], false);
+ }
+ // Flush physical reg copies (RA will discard them)
+ if (!AvailableAlus[AluDiscarded].empty()) {
+ OccupedSlotsMask |= 31;
+ return PopInst(AvailableAlus[AluDiscarded], false);
+ }
+ // If there is a T_XYZW alu available, use it
+ // Mask 15 fills the four channels and leaves the trans bit clear.
+ if (!AvailableAlus[AluT_XYZW].empty()) {
+ OccupedSlotsMask |= 15;
+ return PopInst(AvailableAlus[AluT_XYZW], false);
+ }
+ }
+ bool TransSlotOccuped = OccupedSlotsMask & 16;
+ // The trans slot is only considered when VLIW5 is set.
+ if (!TransSlotOccuped && VLIW5) {
+ if (!AvailableAlus[AluTrans].empty()) {
+ OccupedSlotsMask |= 16;
+ return PopInst(AvailableAlus[AluTrans], false);
+ }
+ // No trans-only unit: try a W-channel or unconstrained unit, excluding
+ // vector-only instructions (AnyAlu == true).
+ SUnit *SU = AttemptFillSlot(3, true);
+ if (SU) {
+ OccupedSlotsMask |= 16;
+ return SU;
+ }
+ }
+ // Fill the free channels from W (3) down to X (0).
+ for (int Chan = 3; Chan > -1; --Chan) {
+ bool isOccupied = OccupedSlotsMask & (1 << Chan);
+ if (!isOccupied) {
+ SUnit *SU = AttemptFillSlot(Chan, false);
+ if (SU) {
+ OccupedSlotsMask |= (1 << Chan);
+ InstructionsGroupCandidate.push_back(SU->getInstr());
+ return SU;
+ }
+ }
+ }
+ // Nothing fits the current group: open a new one and retry.
+ PrepareNextSlot();
+ }
+ return nullptr;
+}
+
+/// Pops the most recently queued unit of kind \p QID. When the available
+/// queue is empty it is first refilled from the matching pending queue.
+/// \returns the unit, or nullptr if both queues are empty.
+SUnit* R600SchedStrategy::pickOther(int QID) {
+  std::vector<SUnit *> &Ready = Available[QID];
+
+  if (Ready.empty())
+    MoveUnits(Pending[QID], Ready);
+  if (Ready.empty())
+    return nullptr;
+
+  SUnit *Picked = Ready.back();
+  Ready.pop_back();
+  return Picked;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
new file mode 100644
index 000000000000..9a6770570477
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -0,0 +1,100 @@
+//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+
+class R600InstrInfo;
+struct R600RegisterInfo;
+
+/// Scheduling strategy for R600. It sorts released units into per-kind
+/// queues and picks them so that instructions of one kind are emitted
+/// together, up to a per-kind limit.
+class R600SchedStrategy final : public MachineSchedStrategy {
+ // Set by initialize(); null until then.
+ const ScheduleDAGMILive *DAG = nullptr;
+ const R600InstrInfo *TII = nullptr;
+ const R600RegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+ // Clause kinds; IDLast is the count and sizes the per-kind arrays below.
+ enum InstKind {
+ IDAlu,
+ IDFetch,
+ IDOther,
+ IDLast
+ };
+
+ // ALU slot classification; AluLast is the count and sizes AvailableAlus.
+ enum AluKind {
+ AluAny,
+ AluT_X,
+ AluT_Y,
+ AluT_Z,
+ AluT_W,
+ AluT_XYZW,
+ AluPredX,
+ AluTrans,
+ AluDiscarded, // LLVM Instructions that are going to be eliminated
+ AluLast
+ };
+
+ // Ready and not-yet-ready units, indexed by InstKind.
+ std::vector<SUnit *> Available[IDLast], Pending[IDLast];
+ // Ready ALU units, indexed by AluKind.
+ std::vector<SUnit *> AvailableAlus[AluLast];
+ // Units kept apart from the per-kind queues.
+ // NOTE(review): presumably COPYs from physical registers -- confirm.
+ std::vector<SUnit *> PhysicalRegCopy;
+
+ // Kind of the clause being built, the number of instructions counted in it,
+ // and the kind of the most recently picked unit.
+ InstKind CurInstKind;
+ int CurEmitted;
+ InstKind NextInstKind;
+
+ // Running counts of scheduled ALU and fetch instructions.
+ unsigned AluInstCount;
+ unsigned FetchInstCount;
+
+ // Maximum CurEmitted per clause, indexed by InstKind.
+ int InstKindLimit[IDLast];
+
+ // Bitmask of the slots used in the current instruction group.
+ int OccupedSlotsMask;
+
+public:
+ R600SchedStrategy() = default;
+ ~R600SchedStrategy() override = default;
+
+ // MachineSchedStrategy interface.
+ void initialize(ScheduleDAGMI *dag) override;
+ SUnit *pickNode(bool &IsTopNode) override;
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+ void releaseTopNode(SUnit *SU) override;
+ void releaseBottomNode(SUnit *SU) override;
+
+private:
+ // Instructions already chosen for the instruction group being built.
+ std::vector<MachineInstr *> InstructionsGroupCandidate;
+ // NOTE(review): presumably true on five-slot (non-Cayman) hardware --
+ // confirm against initialize().
+ bool VLIW5;
+
+ int getInstKind(SUnit *SU);
+ bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
+ AluKind getAluKind(SUnit *SU) const;
+ void LoadAlu();
+ unsigned AvailablesAluCount() const;
+ SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu);
+ void PrepareNextSlot();
+ SUnit *PopInst(std::vector<SUnit*> &Q, bool AnyALU);
+
+ void AssignSlot(MachineInstr *MI, unsigned Slot);
+ SUnit* pickAlu();
+ SUnit* pickOther(int QID);
+ void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
new file mode 100644
index 000000000000..d90008a550ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -0,0 +1,401 @@
+//===------------------- R600OptimizeVectorRegisters.cpp ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass merges the inputs of swizzlable instructions into vectors that
+/// share common data and/or have enough undef subregs, using swizzle abilities.
+///
+/// For instance let's consider the following pseudo code :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
+///
+/// is turned into :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = INSERT_SUBREG vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
+///
+/// This allows regalloc to reduce register pressure for vector registers and
+/// to reduce MOV count.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vec-merger"
+
+/// Reports whether the first defining instruction of \p Reg is an
+/// IMPLICIT_DEF. A register without any def must be reserved, in which case
+/// the answer is false; anything else is a programming error.
+static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
+  MachineRegisterInfo::def_instr_iterator FirstDef = MRI.def_instr_begin(Reg);
+  if (FirstDef != MRI.def_instr_end())
+    return (*FirstDef).isImplicitDef();
+
+  if (!MRI.isReserved(Reg))
+    llvm_unreachable("Reg without a def");
+  return false;
+}
+
+namespace {
+
+/// Summary of one REG_SEQUENCE instruction: which source register feeds which
+/// channel, and which channels are fed by an implicitly defined register.
+class RegSeqInfo {
+public:
+  MachineInstr *Instr;
+  /// Source register -> channel it is written to.
+  DenseMap<unsigned, unsigned> RegToChan;
+  /// Channels whose source register is implicitly defined.
+  std::vector<unsigned> UndefReg;
+
+  /// Collects the (register, channel) operand pairs of \p MI, which must be a
+  /// REG_SEQUENCE. Operand 0 is skipped; the pairs start at operand 1.
+  RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
+    assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+    const unsigned NumOps = Instr->getNumOperands();
+    for (unsigned Idx = 1; Idx < NumOps; Idx += 2) {
+      const unsigned SrcReg = Instr->getOperand(Idx).getReg();
+      const unsigned Chan = Instr->getOperand(Idx + 1).getImm();
+      if (isImplicitlyDef(MRI, SrcReg)) {
+        UndefReg.push_back(Chan);
+        continue;
+      }
+      RegToChan[SrcReg] = Chan;
+    }
+  }
+
+  RegSeqInfo() = default;
+
+  /// Two infos are equal when they describe the same instruction.
+  bool operator==(const RegSeqInfo &RSI) const {
+    return Instr == RSI.Instr;
+  }
+};
+
+/// Machine function pass that rewrites REG_SEQUENCE instructions on top of
+/// previously seen ones when every user of the result can be re-swizzled.
+class R600VectorRegMerger : public MachineFunctionPass {
+private:
+ // Both are (re)assigned at the start of runOnMachineFunction.
+ MachineRegisterInfo *MRI;
+ const R600InstrInfo *TII;
+
+ // True for texture instructions and the two export-swizzle opcodes.
+ bool canSwizzle(const MachineInstr &MI) const;
+ bool areAllUsesSwizzeable(unsigned Reg) const;
+ // Rewrites the four swizzle immediates of an instruction per RemapChan.
+ void SwizzleInput(MachineInstr &,
+ const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const;
+ // Appends (old channel, new channel) pairs to Remap; false if ToMerge does
+ // not fit into Untouched.
+ bool tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge,
+ std::vector<std::pair<unsigned, unsigned>> &Remap) const;
+ bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned>> &RemapChan);
+ bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned>> &RemapChan);
+ // Replaces MI's instruction with INSERT_SUBREGs on top of BaseVec.
+ MachineInstr *RebuildVector(RegSeqInfo *MI, const RegSeqInfo *BaseVec,
+ const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const;
+ void RemoveMI(MachineInstr *);
+ void trackRSI(const RegSeqInfo &RSI);
+
+ // Tracking state; cleared at the start of every basic block.
+ typedef DenseMap<unsigned, std::vector<MachineInstr *>> InstructionSetMap;
+ // Info recorded for each tracked instruction.
+ DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
+ // Tracked instructions keyed by each source register they use.
+ InstructionSetMap PreviousRegSeqByReg;
+ // Tracked instructions keyed by their number of undef channels.
+ InstructionSetMap PreviousRegSeqByUndefCount;
+
+public:
+ static char ID;
+
+ // The TargetMachine argument is not used by the constructor.
+ R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
+ TII(nullptr) { }
+
+ // Requires and preserves the dominator tree and loop info; the CFG is kept.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "R600 Vector Registers Merge Pass";
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+
+} // end anonymous namespace.
+
+char R600VectorRegMerger::ID = 0;
+
+/// Returns true when \p MI is a texture instruction or one of the two
+/// export-swizzle opcodes.
+bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) const {
+  const unsigned Opcode = MI.getOpcode();
+  if (TII->get(Opcode).TSFlags & R600_InstFlag::TEX_INST)
+    return true;
+  return Opcode == AMDGPU::R600_ExportSwz || Opcode == AMDGPU::EG_ExportSwz;
+}
+
+/// Computes how the channels of \p ToMerge map onto \p Untouched. A register
+/// present in both keeps the channel it has in \p Untouched; any other
+/// register takes the next unused undef channel of \p Untouched. One
+/// (old channel, new channel) pair is appended to \p Remap per register.
+/// \returns false as soon as the undef channels run out; \p Remap then holds
+/// the pairs appended up to that point.
+bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
+    RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap)
+    const {
+  unsigned NextFreeUndef = 0;
+  for (DenseMap<unsigned, unsigned>::iterator Src = ToMerge->RegToChan.begin(),
+                                              SrcEnd = ToMerge->RegToChan.end();
+       Src != SrcEnd; ++Src) {
+    const unsigned OldChan = (*Src).second;
+    DenseMap<unsigned, unsigned>::const_iterator Shared =
+        Untouched->RegToChan.find((*Src).first);
+
+    unsigned NewChan;
+    if (Shared != Untouched->RegToChan.end()) {
+      NewChan = (*Shared).second;
+    } else {
+      if (NextFreeUndef >= Untouched->UndefReg.size())
+        return false;
+      NewChan = Untouched->UndefReg[NextFreeUndef++];
+    }
+    Remap.push_back(std::pair<unsigned, unsigned>(OldChan, NewChan));
+  }
+
+  return true;
+}
+
+/// Looks up the channel that \p Chan was remapped to. The first pair whose
+/// source channel matches wins; a missing entry is a programming error.
+static unsigned getReassignedChan(
+    const std::vector<std::pair<unsigned, unsigned>> &RemapChan,
+    unsigned Chan) {
+  for (const std::pair<unsigned, unsigned> &Mapping : RemapChan) {
+    if (Mapping.first == Chan)
+      return Mapping.second;
+  }
+  llvm_unreachable("Chan wasn't reassigned");
+}
+
+/// Replaces the instruction of \p RSI with a chain of INSERT_SUBREGs built on
+/// top of the vector defined by \p BaseRSI, followed by a COPY into the
+/// original destination register. Every user of that register has its
+/// swizzle rewritten per \p RemapChan, the old instruction is erased, and
+/// \p RSI is updated to describe the new COPY.
+/// \returns the new COPY instruction.
+MachineInstr *R600VectorRegMerger::RebuildVector(
+ RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
+ const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const {
+ unsigned Reg = RSI->Instr->getOperand(0).getReg();
+ // New instructions are inserted right before the one being replaced.
+ MachineBasicBlock::iterator Pos = RSI->Instr;
+ MachineBasicBlock &MBB = *Pos->getParent();
+ DebugLoc DL = Pos->getDebugLoc();
+
+ // Start from the base vector and its channel bookkeeping.
+ unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
+ DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
+ std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
+ for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
+ E = RSI->RegToChan.end(); It != E; ++It) {
+ unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+ unsigned SubReg = (*It).first;
+ unsigned Swizzle = (*It).second;
+ unsigned Chan = getReassignedChan(RemapChan, Swizzle);
+
+ // Insert this source register at its reassigned channel.
+ MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
+ DstReg)
+ .addReg(SrcVec)
+ .addReg(SubReg)
+ .addImm(Chan);
+ UpdatedRegToChan[SubReg] = Chan;
+ // The channel now holds a value, so it is no longer undef.
+ std::vector<unsigned>::iterator ChanPos = llvm::find(UpdatedUndef, Chan);
+ if (ChanPos != UpdatedUndef.end())
+ UpdatedUndef.erase(ChanPos);
+ assert(!is_contained(UpdatedUndef, Chan) &&
+ "UpdatedUndef shouldn't contain Chan more than once!");
+ DEBUG(dbgs() << " ->"; Tmp->dump(););
+ // Tmp is only read inside DEBUG; silence the unused-variable warning.
+ (void)Tmp;
+ // Chain the next insertion onto the register just defined.
+ SrcVec = DstReg;
+ }
+ MachineInstr *NewMI =
+ BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec);
+ DEBUG(dbgs() << " ->"; NewMI->dump(););
+
+ DEBUG(dbgs() << " Updating Swizzle:\n");
+ for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
+ E = MRI->use_instr_end(); It != E; ++It) {
+ DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->");
+ SwizzleInput(*It, RemapChan);
+ DEBUG((*It).dump());
+ }
+ RSI->Instr->eraseFromParent();
+
+ // Update RSI
+ RSI->Instr = NewMI;
+ RSI->RegToChan = UpdatedRegToChan;
+ RSI->UndefReg = UpdatedUndef;
+
+ return NewMI;
+}
+
+/// Stops tracking \p MI as a merge candidate by removing it from every
+/// per-register and per-undef-count candidate list.
+///
+/// Only \p MI itself is removed. Erasing the range [find(MI), end()) would
+/// also drop every candidate tracked after \p MI in the same list.
+void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
+  for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
+      E = PreviousRegSeqByReg.end(); It != E; ++It) {
+    std::vector<MachineInstr *> &MIs = (*It).second;
+    std::vector<MachineInstr *>::iterator Pos = llvm::find(MIs, MI);
+    if (Pos != MIs.end())
+      MIs.erase(Pos);
+  }
+  for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
+      E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
+    std::vector<MachineInstr *> &MIs = (*It).second;
+    std::vector<MachineInstr *>::iterator Pos = llvm::find(MIs, MI);
+    if (Pos != MIs.end())
+      MIs.erase(Pos);
+  }
+}
+
+/// Rewrites the four swizzle immediates of \p MI according to \p RemapChan.
+/// The immediates start at operand 2 for texture instructions and at operand
+/// 3 otherwise. The stored value is compared and written with an offset of
+/// one relative to the channel numbers in \p RemapChan; a swizzle with no
+/// matching entry is left unchanged.
+void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
+    const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const {
+  const bool IsTex =
+      TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST;
+  const unsigned Offset = IsTex ? 2 : 3;
+
+  for (unsigned Lane = 0; Lane != 4; ++Lane) {
+    const unsigned OpIdx = Lane + Offset;
+    const unsigned Swizzle = MI.getOperand(OpIdx).getImm() + 1;
+    for (const std::pair<unsigned, unsigned> &Mapping : RemapChan) {
+      if (Mapping.first != Swizzle)
+        continue;
+      MI.getOperand(OpIdx).setImm(Mapping.second - 1);
+      break;
+    }
+  }
+}
+
+/// Returns true when every instruction using \p Reg can have its inputs
+/// swizzled (trivially true when there is no user).
+bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
+  MachineRegisterInfo::use_instr_iterator User = MRI->use_instr_begin(Reg);
+  MachineRegisterInfo::use_instr_iterator UserEnd = MRI->use_instr_end();
+  while (User != UserEnd) {
+    if (!canSwizzle(*User))
+      return false;
+    ++User;
+  }
+  return true;
+}
+
+/// Searches the previously tracked REG_SEQUENCEs that share at least one
+/// source register with \p RSI for one that \p RSI can be merged into.
+///
+/// On success \p CompatibleRSI holds the chosen candidate and \p RemapChan
+/// holds the channel mapping for that candidate only.
+/// \returns true if a compatible candidate was found.
+bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
+    RegSeqInfo &CompatibleRSI,
+    std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
+  for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(),
+      MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
+    if (!MOp->isReg())
+      continue;
+    // Look the register up without inserting an empty entry for it.
+    InstructionSetMap::iterator Tracked =
+        PreviousRegSeqByReg.find(MOp->getReg());
+    if (Tracked == PreviousRegSeqByReg.end())
+      continue;
+    for (MachineInstr *MI : Tracked->second) {
+      CompatibleRSI = PreviousRegSeq[MI];
+      if (RSI == CompatibleRSI)
+        continue;
+      // tryMergeVector appends to RemapChan even when it ends up failing, so
+      // drop whatever a previous, rejected candidate left behind. Otherwise a
+      // stale pair could shadow the mapping of the accepted candidate.
+      RemapChan.clear();
+      if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
+        return true;
+    }
+  }
+  return false;
+}
+
+/// Looks for a tracked REG_SEQUENCE whose number of undef channels equals
+/// four minus the number of undef channels of \p RSI, and maps \p RSI onto
+/// the most recently tracked one. The result of tryMergeVector is not
+/// checked here.
+/// \returns true if such a candidate exists.
+bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
+    RegSeqInfo &CompatibleRSI,
+    std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
+  const unsigned NeededUndefs = 4 - RSI.UndefReg.size();
+  std::vector<MachineInstr *> &Candidates =
+      PreviousRegSeqByUndefCount[NeededUndefs];
+  if (Candidates.empty())
+    return false;
+
+  CompatibleRSI = PreviousRegSeq[Candidates.back()];
+  tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
+  return true;
+}
+
+/// Records \p RSI as a merge candidate: indexes its instruction under every
+/// source register it uses and under its undef-channel count, and stores the
+/// info itself keyed by the instruction.
+void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
+  MachineInstr *Seq = RSI.Instr;
+  for (const auto &RegAndChan : RSI.RegToChan)
+    PreviousRegSeqByReg[RegAndChan.first].push_back(Seq);
+  PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(Seq);
+  PreviousRegSeq[Seq] = RSI;
+}
+
+/// Walks every basic block of \p Fn and tries to rebuild each REG_SEQUENCE
+/// whose users are all swizzlable on top of a previously seen one. Candidate
+/// tracking is reset at every block boundary.
+/// \returns true if at least one instruction was rewritten.
+bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>();
+  TII = ST.getInstrInfo();
+  MRI = &Fn.getRegInfo();
+
+  // Set whenever RebuildVector rewrites the function, so that the pass
+  // reports its modifications instead of always claiming "no change".
+  bool Changed = false;
+
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock *MB = &*MBB;
+    PreviousRegSeq.clear();
+    PreviousRegSeqByReg.clear();
+    PreviousRegSeqByUndefCount.clear();
+
+    for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
+         MII != MIIE; ++MII) {
+      MachineInstr &MI = *MII;
+      if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) {
+        // A texture instruction stops the defs of its operand 1 from being
+        // used as merge candidates.
+        if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
+          unsigned Reg = MI.getOperand(1).getReg();
+          for (MachineRegisterInfo::def_instr_iterator
+               It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end();
+               It != E; ++It) {
+            RemoveMI(&(*It));
+          }
+        }
+        continue;
+      }
+
+      RegSeqInfo RSI(*MRI, &MI);
+
+      // All uses of MI are swizzeable ?
+      unsigned Reg = MI.getOperand(0).getReg();
+      if (!areAllUsesSwizzeable(Reg))
+        continue;
+
+      DEBUG({
+        dbgs() << "Trying to optimize ";
+        MI.dump();
+      });
+
+      RegSeqInfo CandidateRSI;
+      std::vector<std::pair<unsigned, unsigned>> RemapChan;
+      DEBUG(dbgs() << "Using common slots...\n";);
+      if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
+        // Remove CandidateRSI mapping
+        RemoveMI(CandidateRSI.Instr);
+        // MI is erased by RebuildVector; continue from its replacement.
+        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+        trackRSI(RSI);
+        Changed = true;
+        continue;
+      }
+      DEBUG(dbgs() << "Using free slots...\n";);
+      // Discard whatever the failed attempt above left in the mapping.
+      RemapChan.clear();
+      if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
+        RemoveMI(CandidateRSI.Instr);
+        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+        trackRSI(RSI);
+        Changed = true;
+        continue;
+      }
+      // Failed to merge: keep MI as a candidate for later REG_SEQUENCEs.
+      trackRSI(RSI);
+    }
+  }
+  return Changed;
+}
+
+/// Factory for the R600 vector register merge pass; the result is allocated
+/// with new.
+llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
+  FunctionPass *MergerPass = new R600VectorRegMerger(tm);
+  return MergerPass;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
new file mode 100644
index 000000000000..5b6dd1ed128d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -0,0 +1,409 @@
+//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass implements instruction packetization for R600. It clears the
+/// isLast bit of instructions inside a bundle and substitutes source registers
+/// with PreviousVector registers when applicable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "packets"
+
+namespace {
+
+/// Machine-function pass that packetizes R600 instructions; the work itself
+/// is done in runOnMachineFunction().
+class R600Packetizer : public MachineFunctionPass {
+public:
+  static char ID;
+
+  /// \p TM is accepted so the pass can be built from its factory function;
+  /// the constructor itself does not use it.
+  R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "R600 Packetizer"; }
+
+  /// Declares the machine dominator tree and machine loop info as both
+  /// required and preserved, and marks the CFG as preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+char R600Packetizer::ID = 0;
+
+class R600PacketizerList : public VLIWPacketizerList {
+private:
+ const R600InstrInfo *TII;
+ const R600RegisterInfo &TRI;
+ bool VLIW5;
+ bool ConsideredInstUsesAlreadyWrittenVectorElement;
+
+ /// \returns the hardware channel of the register held in operand 0 of
+ /// \p MI, which this class uses as the instruction's slot number.
+ unsigned getSlot(const MachineInstr &MI) const {
+   const unsigned FirstOpReg = MI.getOperand(0).getReg();
+   return TRI.getHWRegChan(FirstOpReg);
+ }
+
+ /// Builds the register -> PV/PS mapping for the bundle or single instruction
+ /// that immediately precedes \p I.
+ ///
+ /// \returns an empty map when that preceding instruction is neither an ALU
+ /// instruction nor a bundle. Otherwise each recorded destination register
+ /// maps to AMDGPU::PS, or to the PV_* register matching its channel.
+ DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I)
+     const {
+   DenseMap<unsigned, unsigned> PVMap;
+   --I;
+   if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle())
+     return PVMap;
+
+   MachineBasicBlock::instr_iterator Inst = I.getInstrIterator();
+   // For a bundle, step past the bundle instruction itself.
+   if (I->isBundle())
+     ++Inst;
+
+   int PrevChan = -1;
+   do {
+     // An instruction whose slot does not increase relative to the previous
+     // one is treated as a Trans-slot instruction. This bookkeeping has to
+     // run for every instruction, even those skipped below.
+     const int Chan = getSlot(*Inst);
+     const bool InTransSlot = PrevChan >= Chan;
+     PrevChan = Chan;
+
+     // Predicated instructions are not recorded.
+     if (TII->isPredicated(*Inst))
+       continue;
+     // Neither are instructions whose "write" operand is present and zero.
+     const int WriteIdx =
+         TII->getOperandIdx(Inst->getOpcode(), AMDGPU::OpName::write);
+     if (WriteIdx > -1 && Inst->getOperand(WriteIdx).getImm() == 0)
+       continue;
+     // Nothing to record without a "dst" operand.
+     const int DstIdx =
+         TII->getOperandIdx(Inst->getOpcode(), AMDGPU::OpName::dst);
+     if (DstIdx == -1)
+       continue;
+
+     const unsigned Dst = Inst->getOperand(DstIdx).getReg();
+     const unsigned Opc = Inst->getOpcode();
+     if (InTransSlot || TII->isTransOnly(*Inst)) {
+       PVMap[Dst] = AMDGPU::PS;
+     } else if (Opc == AMDGPU::DOT4_r600 || Opc == AMDGPU::DOT4_eg) {
+       // DOT4 results are always mapped to PV.X.
+       PVMap[Dst] = AMDGPU::PV_X;
+     } else if (Dst != AMDGPU::OQAP) {
+       // Any other destination except OQAP maps to the PV register of its
+       // own hardware channel.
+       unsigned PVReg = 0;
+       switch (TRI.getHWRegChan(Dst)) {
+       case 0:
+         PVReg = AMDGPU::PV_X;
+         break;
+       case 1:
+         PVReg = AMDGPU::PV_Y;
+         break;
+       case 2:
+         PVReg = AMDGPU::PV_Z;
+         break;
+       case 3:
+         PVReg = AMDGPU::PV_W;
+         break;
+       default:
+         llvm_unreachable("Invalid Chan");
+       }
+       PVMap[Dst] = PVReg;
+     }
+   } while ((++Inst)->isBundledWithPred());
+   return PVMap;
+ }
+
+ /// Rewrites the src0/src1/src2 register operands of \p MI: any source
+ /// register that appears as a key in \p PVs is replaced by its mapped value.
+ void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs)
+     const {
+   const unsigned SrcNames[] = {
+     AMDGPU::OpName::src0,
+     AMDGPU::OpName::src1,
+     AMDGPU::OpName::src2
+   };
+   for (unsigned SrcName : SrcNames) {
+     const int SrcIdx = TII->getOperandIdx(MI.getOpcode(), SrcName);
+     // The opcode does not have this source operand.
+     if (SrcIdx < 0)
+       continue;
+     auto &SrcOp = MI.getOperand(SrcIdx);
+     const auto Found = PVs.find(SrcOp.getReg());
+     if (Found != PVs.end())
+       SrcOp.setReg(Found->second);
+   }
+ }
+public:
+ /// Sets up the packetizer for \p MF, caching the instruction info of \p ST
+ /// and its register info. nullptr is passed as the base class's third
+ /// argument.
+ ///
+ /// All data members are initialized here, in declaration order, so that
+ /// ConsideredInstUsesAlreadyWrittenVectorElement has a defined value even
+ /// if it is read before initPacketizerState() has run.
+ R600PacketizerList(MachineFunction &MF, const R600Subtarget &ST,
+                    MachineLoopInfo &MLI)
+     : VLIWPacketizerList(MF, MLI, nullptr),
+       TII(ST.getInstrInfo()),
+       TRI(TII->getRegisterInfo()),
+       // Every subtarget without the Cayman ISA is treated as VLIW5.
+       VLIW5(!ST.hasCaymanISA()),
+       ConsideredInstUsesAlreadyWrittenVectorElement(false) {}
+
+ // initPacketizerState - clear the flag set when two candidates share a slot.
+ void initPacketizerState() override {
+ ConsideredInstUsesAlreadyWrittenVectorElement = false;
+ }
+
+ // ignorePseudoInstruction - always false: no instruction is ignored here.
+ bool ignorePseudoInstruction(const MachineInstr &MI,
+ const MachineBasicBlock *MBB) override {
+ return false;
+ }
+
+ // isSoloInstruction - return true if instruction MI can not be packetized
+ // with any other instruction, which means that MI itself is a packet.
+ // That is the case for vector instructions, non-ALU instructions,
+ // GROUP_BARRIER and LDS instructions.
+ bool isSoloInstruction(const MachineInstr &MI) override {
+   const unsigned Opcode = MI.getOpcode();
+   // XXX: The LDS term can be removed once the packetizer properly handles
+   // all the LDS instruction group restrictions.
+   return TII->isVector(MI) ||
+          !TII->isALUInstr(Opcode) ||
+          Opcode == AMDGPU::GROUP_BARRIER ||
+          TII->isLDSInstr(Opcode);
+ }
+
+ // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+ // together.
+ //
+ // As a side effect, records in
+ // ConsideredInstUsesAlreadyWrittenVectorElement that the two instructions
+ // target the same slot.
+ bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override {
+   MachineInstr *InstI = SUI->getInstr();
+   MachineInstr *InstJ = SUJ->getInstr();
+
+   if (getSlot(*InstI) == getSlot(*InstJ))
+     ConsideredInstUsesAlreadyWrittenVectorElement = true;
+
+   // Both instructions must use the same pred_sel register; an instruction
+   // without a pred_sel operand counts as register 0.
+   const int PredIdxI =
+       TII->getOperandIdx(InstI->getOpcode(), AMDGPU::OpName::pred_sel);
+   const int PredIdxJ =
+       TII->getOperandIdx(InstJ->getOpcode(), AMDGPU::OpName::pred_sel);
+   unsigned PredRegI = 0;
+   if (PredIdxI > -1)
+     PredRegI = InstI->getOperand(PredIdxI).getReg();
+   unsigned PredRegJ = 0;
+   if (PredIdxJ > -1)
+     PredRegJ = InstJ->getOperand(PredIdxJ).getReg();
+   if (PredRegI != PredRegJ)
+     return false;
+
+   // Any dependence from SUJ to SUI forbids packetizing, except anti
+   // dependences and output dependences where operand 0 of the two
+   // instructions holds different registers.
+   if (SUJ->isSucc(SUI)) {
+     for (const SDep &Dep : SUJ->Succs) {
+       if (Dep.getSUnit() != SUI)
+         continue;
+       const SDep::Kind DepKind = Dep.getKind();
+       if (DepKind == SDep::Anti)
+         continue;
+       if (DepKind == SDep::Output &&
+           InstI->getOperand(0).getReg() != InstJ->getOperand(0).getReg())
+         continue;
+       return false;
+     }
+   }
+
+   // Reject the pair if the address register is both defined and used
+   // anywhere within it.
+   const bool DefinesAR =
+       TII->definesAddressRegister(*InstI) || TII->definesAddressRegister(*InstJ);
+   const bool UsesAR =
+       TII->usesAddressRegister(*InstI) || TII->usesAddressRegister(*InstJ);
+   return !(DefinesAR && UsesAR);
+ }
+
+ // isLegalToPruneDependencies - Is it legal to prune the dependence between
+ // SUI and SUJ. Always answers no.
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override {
+ return false;
+ }
+
+ /// Writes \p Bit as the immediate of the "last" operand of \p MI.
+ ///
+ /// \p MI must have a "last" operand.
+ void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
+   // getOperandIdx() yields a signed index; the rest of this class tests it
+   // against -1 for "operand not present". Keep it signed so that case is
+   // caught here instead of being converted into a huge operand index.
+   const int LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
+   assert(LastOp > -1 && "Instruction has no 'last' operand");
+   MI->getOperand(LastOp).setImm(Bit);
+ }
+
+ /// Checks whether \p MI can join the packet currently being built.
+ ///
+ /// \param MI          candidate instruction.
+ /// \param PV          register -> PV/PS mapping, forwarded to the read-port
+ ///                    check.
+ /// \param BS          bank swizzle vector handed to the read-port check; the
+ ///                    caller reads it after a true result.
+ /// \param isTransSlot set to true when \p MI is Trans-only or has to be
+ ///                    treated as a Trans-slot instruction.
+ /// \returns true if \p MI passes the slot-order, constant-read, read-port
+ /// and LDS checks.
+ ///
+ /// \p MI is pushed onto CurrentPacketMIs only temporarily for the limit
+ /// checks; it is popped again on every return path, so the packet list is
+ /// unchanged when this function returns.
+ bool isBundlableWithCurrentPMI(MachineInstr &MI,
+                                const DenseMap<unsigned, unsigned> &PV,
+                                std::vector<R600InstrInfo::BankSwizzle> &BS,
+                                bool &isTransSlot) {
+   isTransSlot = TII->isTransOnly(MI);
+   assert (!isTransSlot || VLIW5);
+
+   // Is the dst reg sequence legal ?
+   if (!isTransSlot && !CurrentPacketMIs.empty()) {
+     if (getSlot(MI) <= getSlot(*CurrentPacketMIs.back())) {
+       // The slot order is violated. On VLIW5 the instruction may still go
+       // into the Trans slot, provided a slot clash was recorded and MI is
+       // not vector-only.
+       if (ConsideredInstUsesAlreadyWrittenVectorElement &&
+           !TII->isVectorOnly(MI) && VLIW5) {
+         isTransSlot = true;
+         DEBUG({
+           dbgs() << "Considering as Trans Inst :";
+           MI.dump();
+         });
+       }
+       else
+         return false;
+     }
+   }
+
+   // Are the Constants limitations met ?
+   CurrentPacketMIs.push_back(&MI);
+   if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
+     DEBUG({
+       dbgs() << "Couldn't pack :\n";
+       MI.dump();
+       dbgs() << "with the following packets :\n";
+       for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
+         CurrentPacketMIs[i]->dump();
+         dbgs() << "\n";
+       }
+       dbgs() << "because of Consts read limitations\n";
+     });
+     CurrentPacketMIs.pop_back();
+     return false;
+   }
+
+   // Is there a BankSwizzle set that meet Read Port limitations ?
+   if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
+                                     PV, BS, isTransSlot)) {
+     DEBUG({
+       dbgs() << "Couldn't pack :\n";
+       MI.dump();
+       dbgs() << "with the following packets :\n";
+       for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
+         CurrentPacketMIs[i]->dump();
+         dbgs() << "\n";
+       }
+       dbgs() << "because of Read port limitations\n";
+     });
+     CurrentPacketMIs.pop_back();
+     return false;
+   }
+
+   // We cannot read LDS source registers from the Trans slot.
+   if (isTransSlot && TII->readsLDSSrcReg(MI)) {
+     // Undo the temporary push, as the other rejection paths do; otherwise
+     // the rejected MI would stay in CurrentPacketMIs.
+     CurrentPacketMIs.pop_back();
+     return false;
+   }
+
+   CurrentPacketMIs.pop_back();
+   return true;
+ }
+
+ /// Adds \p MI to the current packet if it is bundlable with it; otherwise
+ /// ends the current packet first.
+ ///
+ /// On the bundlable path the bank swizzles of all packet members and of
+ /// \p MI are updated, the "last" bit of the previous packet tail is cleared
+ /// and PV/PS registers are substituted into \p MI. A Trans-slot instruction
+ /// ends the packet right after being added.
+ MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override {
+   MachineBasicBlock::iterator FirstInBundle =
+       CurrentPacketMIs.empty() ? &MI : CurrentPacketMIs.front();
+   const DenseMap<unsigned, unsigned> &PV =
+       getPreviousVector(FirstInBundle);
+   std::vector<R600InstrInfo::BankSwizzle> BS;
+   bool isTransSlot;
+
+   if (!isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) {
+     // MI cannot join: close the current packet in front of it.
+     endPacket(MI.getParent(), MI);
+     // A Trans-only instruction is returned without being added.
+     if (TII->isTransOnly(MI))
+       return MI;
+     return VLIWPacketizerList::addToPacket(MI);
+   }
+
+   // Apply the bank swizzles to the instructions already in the packet ...
+   for (unsigned Idx = 0, NumMIs = CurrentPacketMIs.size(); Idx < NumMIs;
+        ++Idx) {
+     MachineInstr *PacketMI = CurrentPacketMIs[Idx];
+     const unsigned SwizzleOp = TII->getOperandIdx(PacketMI->getOpcode(),
+                                                   AMDGPU::OpName::bank_swizzle);
+     PacketMI->getOperand(SwizzleOp).setImm(BS[Idx]);
+   }
+   // ... and the last entry of BS to MI itself.
+   const unsigned NewSwizzleOp =
+       TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle);
+   MI.getOperand(NewSwizzleOp).setImm(BS.back());
+
+   // The previous tail is no longer the last instruction of the packet.
+   if (!CurrentPacketMIs.empty())
+     setIsLastBit(CurrentPacketMIs.back(), 0);
+   substitutePV(MI, PV);
+
+   MachineBasicBlock::iterator Added = VLIWPacketizerList::addToPacket(MI);
+   if (isTransSlot) {
+     MachineBasicBlock::iterator Following = std::next(Added);
+     endPacket(Following->getParent(), Following);
+   }
+   return Added;
+ }
+};
+
+/// Packetizes every basic block of \p Fn.
+///
+/// \returns false without touching the function when the instruction
+/// itineraries are empty, true otherwise.
+bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
+  const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>();
+  const R600InstrInfo *TII = ST.getInstrInfo();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+
+  // Instantiate the packetizer.
+  R600PacketizerList Packetizer(Fn, ST, MLI);
+
+  // DFA state table should not be empty.
+  assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+  if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty())
+    return false;
+
+  // First sweep: delete KILL pseudo-instructions, IMPLICIT_DEFs and CF_ALU
+  // instructions whose operand 8 is zero.
+  //
+  // KILLs confuse the dependence analysis. Consider:
+  //   D0 = ...            (Insn 0)
+  //   R0 = KILL R0, D0    (Insn 1)
+  //   R0 = ...            (Insn 2)
+  // Here, Insn 1 will result in the dependence graph not emitting an output
+  // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization.
+  for (MachineBasicBlock &Block : Fn) {
+    MachineBasicBlock::iterator Cur = Block.begin();
+    while (Cur != Block.end()) {
+      const bool Removable =
+          Cur->isKill() || Cur->getOpcode() == AMDGPU::IMPLICIT_DEF ||
+          (Cur->getOpcode() == AMDGPU::CF_ALU && !Cur->getOperand(8).getImm());
+      if (!Removable) {
+        ++Cur;
+        continue;
+      }
+      // Advance before erasing so the cursor stays valid.
+      MachineBasicBlock::iterator Doomed = Cur;
+      ++Cur;
+      Block.erase(Doomed);
+    }
+  }
+
+  // Second sweep: find scheduling regions in each block, bottom-up, and
+  // packetize each of them.
+  for (MachineBasicBlock &Block : Fn) {
+    unsigned RemainingCount = Block.size();
+    MachineBasicBlock::iterator RegionEnd = Block.end();
+    while (RegionEnd != Block.begin()) {
+      // The next region starts above the previous region. Look backward in
+      // the instruction stream until we find the nearest boundary.
+      MachineBasicBlock::iterator RegionBegin = RegionEnd;
+      for (; RegionBegin != Block.begin(); --RegionBegin, --RemainingCount) {
+        if (TII->isSchedulingBoundary(*std::prev(RegionBegin), &Block, Fn))
+          break;
+      }
+      // NOTE(review): the region start is reset to the top of the block
+      // here, so the boundary found by the scan above is not used as the
+      // start. Kept as is to preserve behavior.
+      RegionBegin = Block.begin();
+
+      // Skip empty scheduling regions.
+      if (RegionBegin == RegionEnd) {
+        RegionEnd = std::prev(RegionEnd);
+        --RemainingCount;
+        continue;
+      }
+      // Skip regions with one instruction.
+      if (RegionBegin == std::prev(RegionEnd)) {
+        RegionEnd = std::prev(RegionEnd);
+        continue;
+      }
+
+      Packetizer.PacketizeMIs(&Block, &*RegionBegin, RegionEnd);
+      RegionEnd = RegionBegin;
+    }
+  }
+
+  return true;
+}
+
+} // end anonymous namespace
+
+/// Factory entry point: allocates a new R600Packetizer for \p tm and hands
+/// it back as a generic FunctionPass pointer.
+llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
+  FunctionPass *Packetizer = new R600Packetizer(tm);
+  return Packetizer;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
new file mode 100644
index 000000000000..dfdc602b80cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -0,0 +1,98 @@
+//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600RegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+/// Zero-initializes both fields of the register-class weight member RCW.
+R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() {
+  RCW.WeightLimit = 0;
+  RCW.RegWeight = 0;
+}
+
+/// \returns the reserved-register set for \p MF: a fixed list of special
+/// registers, every register of R600_AddrRegClass, and whatever the
+/// instruction info reserves for indirect addressing.
+BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+
+  const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+  const R600InstrInfo *TII = ST.getInstrInfo();
+
+  // Special registers that are always reserved.
+  const unsigned AlwaysReserved[] = {
+    AMDGPU::ZERO,
+    AMDGPU::HALF,
+    AMDGPU::ONE,
+    AMDGPU::ONE_INT,
+    AMDGPU::NEG_HALF,
+    AMDGPU::NEG_ONE,
+    AMDGPU::PV_X,
+    AMDGPU::ALU_LITERAL_X,
+    AMDGPU::ALU_CONST,
+    AMDGPU::PREDICATE_BIT,
+    AMDGPU::PRED_SEL_OFF,
+    AMDGPU::PRED_SEL_ZERO,
+    AMDGPU::PRED_SEL_ONE,
+    AMDGPU::INDIRECT_BASE_ADDR
+  };
+  for (unsigned Reg : AlwaysReserved)
+    Reserved.set(Reg);
+
+  for (unsigned AddrReg : AMDGPU::R600_AddrRegClass)
+    Reserved.set(AddrReg);
+
+  TII->reserveIndirectRegisters(Reserved, MF);
+
+  return Reserved;
+}
+
+/// \returns the channel part of the hardware encoding of \p reg, i.e. the
+/// encoding value shifted right by HW_CHAN_SHIFT.
+unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
+  const unsigned Encoding = getEncodingValue(reg);
+  return Encoding >> HW_CHAN_SHIFT;
+}
+
+/// \returns the register-index part of the hardware encoding of \p Reg, as
+/// extracted by the GET_REG_INDEX macro.
+unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const {
+  const unsigned Encoding = getEncodingValue(Reg);
+  return GET_REG_INDEX(Encoding);
+}
+
+/// \returns the register class to use in the CFGStructurizer for \p VT.
+const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
+    MVT VT) const {
+  // Every value type, MVT::i32 included, maps to the same 32-bit temporary
+  // register class.
+  (void)VT;
+  return &AMDGPU::R600_TReg32RegClass;
+}
+
+/// \returns the member RCW for every register class; \p RC is not consulted.
+const RegClassWeight &R600RegisterInfo::getRegClassWeight(
+    const TargetRegisterClass *RC) const {
+  const RegClassWeight &SharedWeight = RCW;
+  return SharedWeight;
+}
+
+/// \returns false for OQAP, OQBP and AR_X, and true for every other
+/// physical register. \p Reg must not be a virtual register.
+bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+  const bool ClauseLocal = Reg == AMDGPU::OQAP ||
+                           Reg == AMDGPU::OQBP ||
+                           Reg == AMDGPU::AR_X;
+  return !ClauseLocal;
+}
+
+/// Not implemented for R600: reaching this function is treated as a
+/// programming error.
+void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                           int SPAdj, unsigned FIOperandNum,
+                                           RegScavenger *RS) const {
+  llvm_unreachable("Subroutines not supported yet");
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
new file mode 100644
index 000000000000..9dfb3106c6cc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -0,0 +1,54 @@
+//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600RegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
+
+#include "AMDGPURegisterInfo.h"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+
+struct R600RegisterInfo final : public AMDGPURegisterInfo {
+ RegClassWeight RCW; // Weight object returned by getRegClassWeight().
+
+ R600RegisterInfo();
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ /// \brief Get the HW encoding for a register's channel.
+ unsigned getHWRegChan(unsigned reg) const;
+
+ unsigned getHWRegIndex(unsigned Reg) const;
+
+ /// \brief Get the register class of the specified type to use in the
+ /// CFGStructurizer.
+ const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const;
+
+ const RegClassWeight &
+ getRegClassWeight(const TargetRegisterClass *RC) const override;
+
+ /// \returns true if \p Reg can be defined in one ALU clause and used in
+ /// another.
+ bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
new file mode 100644
index 000000000000..cc667d985a82
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -0,0 +1,252 @@
+
+// Register in the AMDGPU namespace whose full 16-bit hardware encoding is
+// supplied directly by the definition.
+class R600Reg <string name, bits<16> encoding> : Register<name> {
+  let HWEncoding = encoding;
+  let Namespace = "AMDGPU";
+}
+
+// Register described by a 9-bit selector and a channel letter. The selector
+// goes into bits 8-0 of HWEncoding. The channel ("X" = 0, "Y" = 1, "Z" = 2,
+// "W" = 3, any other string = 0) goes into bits 10-9.
+class R600RegWithChan <string name, bits<9> sel, string chan> :
+    Register <name> {
+  let Namespace = "AMDGPU";
+
+  field bits<2> chan_encoding = !if(!eq(chan, "X"), 0,
+                                !if(!eq(chan, "Y"), 1,
+                                !if(!eq(chan, "Z"), 2,
+                                !if(!eq(chan, "W"), 3, 0))));
+  let HWEncoding{10-9} = chan_encoding;
+  let HWEncoding{8-0} = sel;
+}
+
+// Register made of four sub-registers (sub0..sub3). Bits 8-0 of HWEncoding
+// come from the low nine bits of 'encoding'. Bits 10-9 come from
+// chan_encoding, which defaults to 0 and can be overridden by an enclosing
+// 'let'.
+class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  field bits<2> chan_encoding = 0;
+  let SubRegIndices = [sub0, sub1, sub2, sub3];
+  let Namespace = "AMDGPU";
+  let HWEncoding{10-9} = chan_encoding;
+  let HWEncoding{8-0} = encoding{8-0};
+}
+
+// Register made of two sub-registers (sub0, sub1). HWEncoding starts out as
+// the full 'encoding' value; bits 10-9 are then replaced by chan_encoding,
+// which defaults to 0 and can be overridden by an enclosing 'let'.
+class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  field bits<2> chan_encoding = 0;
+  let SubRegIndices = [sub0, sub1];
+  let Namespace = "AMDGPU";
+  // The three HWEncoding assignments are kept in their original order: the
+  // whole-value assignment has to come before the bit-range ones.
+  let HWEncoding = encoding;
+  let HWEncoding{8-0} = encoding{8-0};
+  let HWEncoding{10-9} = chan_encoding;
+}
+
+// "Vertical" 64-bit register V<lo><hi>_<chan>: pairs the same channel of the
+// two temporaries T<lo> and T<hi>, and uses 'lo' as its encoding value.
+class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 <
+    "V"#lo#hi#"_"#chan,
+    [!cast<Register>("T"#lo#"_"#chan),
+     !cast<Register>("T"#hi#"_"#chan)],
+    lo
+>;
+
+foreach Index = 0-127 in {
+ foreach Chan = [ "X", "Y", "Z", "W" ] in {
+ // 32-bit Temporary Registers
+ def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
+
+ // Indirect addressing offset registers
+ def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
+ Index, Chan>;
+ }
+ // 128-bit Temporary Registers
+ def T#Index#_XYZW : R600Reg_128 <"T"#Index#"",
+ [!cast<Register>("T"#Index#"_X"),
+ !cast<Register>("T"#Index#"_Y"),
+ !cast<Register>("T"#Index#"_Z"),
+ !cast<Register>("T"#Index#"_W")],
+ Index>;
+
+ def T#Index#_XY : R600Reg_64 <"T"#Index#"",
+ [!cast<Register>("T"#Index#"_X"),
+ !cast<Register>("T"#Index#"_Y")],
+ Index>;
+}
+
+foreach Chan = [ "X", "Y", "Z", "W"] in {
+
+ let chan_encoding = !if(!eq(Chan, "X"), 0,
+ !if(!eq(Chan, "Y"), 1,
+ !if(!eq(Chan, "Z"), 2,
+ !if(!eq(Chan, "W"), 3, 0)))) in {
+ def V0123_#Chan : R600Reg_128 <"V0123_"#Chan,
+ [!cast<Register>("T0_"#Chan),
+ !cast<Register>("T1_"#Chan),
+ !cast<Register>("T2_"#Chan),
+ !cast<Register>("T3_"#Chan)],
+ 0>;
+ def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>;
+ def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>;
+ }
+}
+
+
+// KCACHE_BANK0
+foreach Index = 159-128 in {
+ foreach Chan = [ "X", "Y", "Z", "W" ] in {
+ // 32-bit constant cache registers (bank 0)
+ def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>;
+ }
+ // 128-bit constant cache registers (bank 0)
+ def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW",
+ [!cast<Register>("KC0_"#Index#"_X"),
+ !cast<Register>("KC0_"#Index#"_Y"),
+ !cast<Register>("KC0_"#Index#"_Z"),
+ !cast<Register>("KC0_"#Index#"_W")],
+ Index>;
+}
+
+// KCACHE_BANK1
+foreach Index = 191-160 in {
+ foreach Chan = [ "X", "Y", "Z", "W" ] in {
+ // 32-bit constant cache registers (bank 1)
+ def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>;
+ }
+ // 128-bit constant cache registers (bank 1)
+ def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW",
+ [!cast<Register>("KC1_"#Index#"_X"),
+ !cast<Register>("KC1_"#Index#"_Y"),
+ !cast<Register>("KC1_"#Index#"_Z"),
+ !cast<Register>("KC1_"#Index#"_W")],
+ Index>;
+}
+
+
+// Array Base Register holding input in FS
+foreach Index = 448-480 in {
+ def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>;
+}
+
+
+// Special Registers
+
+// Output-queue and LDS-direct special registers, encodings 219-224. Each
+// register carries its own assembly name so that OQAP and OQBP can be told
+// apart in printed output.
+def OQA : R600Reg<"OQA", 219>;
+def OQB : R600Reg<"OQB", 220>;
+def OQAP : R600Reg<"OQAP", 221>;
+def OQBP : R600Reg<"OQBP", 222>;
+def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>;
+def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>;
+def ZERO : R600Reg<"0.0", 248>;
+def ONE : R600Reg<"1.0", 249>;
+def NEG_ONE : R600Reg<"-1.0", 249>;
+def ONE_INT : R600Reg<"1", 250>;
+def HALF : R600Reg<"0.5", 252>;
+def NEG_HALF : R600Reg<"-0.5", 252>;
+def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">;
+def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">;
+def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">;
+def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">;
+def PV_X : R600RegWithChan<"PV.X", 254, "X">;
+def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">;
+def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">;
+def PV_W : R600RegWithChan<"PV.W", 254, "W">;
+def PS: R600Reg<"PS", 255>;
+def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
+def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
+def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
+def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
+def AR_X : R600Reg<"AR.x", 0>;
+
+def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "ArrayBase%u", 448, 480))>;
+// special registers for ALU src operands
+// const buffer reference, SRCx_SEL contains index
+def ALU_CONST : R600Reg<"CBuf", 0>;
+// interpolation param reference, SRCx_SEL contains index
+def ALU_PARAM : R600Reg<"Param", 0>;
+
+let isAllocatable = 0 in {
+
+def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
+
+// We only use Addr_[YZW] for vertical vectors.
+// FIXME: if we add more vertical vector registers we will need to add more
+// registers to these classes.
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>;
+def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>;
+def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>;
+
+def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
+ (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
+
+def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "KC0_%u_X", 128, 159))>;
+
+def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "KC0_%u_Y", 128, 159))>;
+
+def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "KC0_%u_Z", 128, 159))>;
+
+def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "KC0_%u_W", 128, 159))>;
+
+def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (interleave R600_KC0_X, R600_KC0_Y,
+ R600_KC0_Z, R600_KC0_W)>;
+
+def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "KC1_%u_X", 160, 191))>;
+
+def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "KC1_%u_Y", 160, 191))>;
+
+def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "KC1_%u_Z", 160, 191))>;
+
+def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "KC1_%u_W", 160, 191))>;
+
+def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (interleave R600_KC1_X, R600_KC1_Y,
+ R600_KC1_Z, R600_KC1_W)>;
+
+} // End isAllocatable = 0
+
+def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "T%u_X", 0, 127), AR_X)>;
+
+def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "T%u_Y", 0, 127))>;
+
+def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "T%u_Z", 0, 127))>;
+
+def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (add (sequence "T%u_W", 0, 127))>;
+
+def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
+ (interleave R600_TReg32_X, R600_TReg32_Y,
+ R600_TReg32_Z, R600_TReg32_W)>;
+
+def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
+ R600_TReg32,
+ R600_ArrayBase,
+ R600_Addr,
+ R600_KC0, R600_KC1,
+ ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
+ ALU_CONST, ALU_PARAM, OQAP
+ )>;
+
+def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
+ PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;
+
+def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add
+ PREDICATE_BIT)>;
+
+def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
+ (add (sequence "T%u_XYZW", 0, 127))> {
+ let CopyCost = -1;
+}
+
+def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
+ (add V0123_W, V0123_Z, V0123_Y, V0123_X)
+>;
+
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+ (add (sequence "T%u_XY", 0, 63))>;
+
+def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+ (add V01_X, V01_Y, V01_Z, V01_W,
+ V23_X, V23_Y, V23_Z, V23_W)>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td b/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td
new file mode 100644
index 000000000000..70fb46c1a7d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td
@@ -0,0 +1,49 @@
+//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 has a VLIW architecture. On pre-cayman cards there are 5 instruction
+// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS
+// slot has been removed.
+//
+//===----------------------------------------------------------------------===//
+
+
+def ALU_X : FuncUnit;
+def ALU_Y : FuncUnit;
+def ALU_Z : FuncUnit;
+def ALU_W : FuncUnit;
+def TRANS : FuncUnit;
+
+def AnyALU : InstrItinClass;
+def VecALU : InstrItinClass;
+def TransALU : InstrItinClass;
+def XALU : InstrItinClass;
+
+def R600_VLIW5_Itin : ProcessorItineraries <
+ [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
+ [],
+ [
+ InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
+ InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+ InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
+ InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>,
+ InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
+ ]
+>;
+
+def R600_VLIW4_Itin : ProcessorItineraries <
+ [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL],
+ [],
+ [
+ InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+ InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+ InstrItinData<TransALU, [InstrStage<1, [ALU_NULL]>]>,
+ InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
+ ]
+>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td
new file mode 100644
index 000000000000..613a0d729bb3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td
@@ -0,0 +1,21 @@
+//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are:
+// - Available to R700 and newer VLIW4/VLIW5 GPUs
+// - Available only on R700 family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">;
+
+let Predicates = [isR700] in {
+ def SIN_r700 : SIN_Common<0x6E>;
+ def COS_r700 : COS_Common<0x6F>;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
new file mode 100644
index 000000000000..d70f52e0f295
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -0,0 +1,425 @@
+//===-- SIAnnotateControlFlow.cpp - ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Annotates the control flow with hardware specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-annotate-control-flow"
+
+namespace {
+
+// Types of the control flow stack: every entry pairs a basic block with the
+// value that was saved for it.
+using StackEntry = std::pair<BasicBlock *, Value *>;
+using StackVector = SmallVector<StackEntry, 16>;
+
+// Names of the intrinsics this pass inserts to annotate the control flow.
+static const char *const IfIntrinsic = "llvm.amdgcn.if";
+static const char *const ElseIntrinsic = "llvm.amdgcn.else";
+static const char *const BreakIntrinsic = "llvm.amdgcn.break";
+static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break";
+static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break";
+static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
+static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
+
+/// Function pass that walks the CFG depth-first and inserts calls to the
+/// llvm.amdgcn.* control flow intrinsics (if, else, break variants, loop,
+/// end.cf) at conditional branches whose condition is not uniform.
+class SIAnnotateControlFlow : public FunctionPass {
+ // Divergence information; consulted by isUniform().
+ DivergenceAnalysis *DA;
+
+ // Types cached by doInitialization().
+ Type *Boolean;
+ Type *Void;
+ Type *Int64;
+ Type *ReturnStruct;
+
+ // Constants cached by doInitialization().
+ ConstantInt *BoolTrue;
+ ConstantInt *BoolFalse;
+ UndefValue *BoolUndef;
+ Constant *Int64Zero;
+
+ // Declarations of the annotation intrinsics, set up by doInitialization().
+ Constant *If;
+ Constant *Else;
+ Constant *Break;
+ Constant *IfBreak;
+ Constant *ElseBreak;
+ Constant *Loop;
+ Constant *EndCf;
+
+ DominatorTree *DT;
+ // Stack of (block, saved value) pairs for the currently open control flow.
+ StackVector Stack;
+
+ LoopInfo *LI;
+
+ bool isUniform(BranchInst *T);
+
+ bool isTopOfStack(BasicBlock *BB);
+
+ Value *popSaved();
+
+ void push(BasicBlock *BB, Value *Saved);
+
+ bool isElse(PHINode *Phi);
+
+ void eraseIfUnused(PHINode *Phi);
+
+ void openIf(BranchInst *Term);
+
+ void insertElse(BranchInst *Term);
+
+ Value *handleLoopCondition(Value *Cond, PHINode *Broken,
+ llvm::Loop *L, BranchInst *Term);
+
+ void handleLoop(BranchInst *Term);
+
+ void closeControlFlow(BasicBlock *BB);
+
+public:
+ static char ID;
+
+ SIAnnotateControlFlow():
+ FunctionPass(ID) { }
+
+ bool doInitialization(Module &M) override;
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "SI annotate control flow"; }
+
+ // Requires loop info, the dominator tree and divergence analysis; only the
+ // dominator tree is declared as preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DivergenceAnalysis>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+};
+
+} // end anonymous namespace
+
+// Register the pass under DEBUG_TYPE and declare DivergenceAnalysis as a
+// dependency.
+INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
+ "Annotate SI Control Flow", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
+ "Annotate SI Control Flow", false, false)
+
+// Storage for the pass ID handed to the FunctionPass constructor.
+char SIAnnotateControlFlow::ID = 0;
+
+/// \brief Initialize all the types and constants used in the pass
+///
+/// Caches the void, i1 and i64 types, the {i1, i64} struct type, the i1
+/// true/false/undef and i64 zero constants, and gets or inserts a declaration
+/// for every control flow intrinsic in \p M. The three break intrinsics are
+/// additionally marked as not accessing memory.
+///
+/// \returns false.
+bool SIAnnotateControlFlow::doInitialization(Module &M) {
+  LLVMContext &Context = M.getContext();
+
+  Void = Type::getVoidTy(Context);
+  Boolean = Type::getInt1Ty(Context);
+  Int64 = Type::getInt64Ty(Context);
+  // The trailing null Type* terminates the variadic type lists below.
+  ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
+
+  BoolTrue = ConstantInt::getTrue(Context);
+  BoolFalse = ConstantInt::getFalse(Context);
+  BoolUndef = UndefValue::get(Boolean);
+  Int64Zero = ConstantInt::get(Int64, 0);
+
+  // {i1, i64} llvm.amdgcn.if(i1)
+  If = M.getOrInsertFunction(
+      IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr);
+
+  // {i1, i64} llvm.amdgcn.else(i64)
+  Else = M.getOrInsertFunction(
+      ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr);
+
+  // i64 llvm.amdgcn.break(i64)
+  Break = M.getOrInsertFunction(
+      BreakIntrinsic, Int64, Int64, (Type *)nullptr);
+  cast<Function>(Break)->setDoesNotAccessMemory();
+
+  // i64 llvm.amdgcn.if.break(i1, i64)
+  IfBreak = M.getOrInsertFunction(
+      IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
+  cast<Function>(IfBreak)->setDoesNotAccessMemory();
+
+  // i64 llvm.amdgcn.else.break(i64, i64)
+  ElseBreak = M.getOrInsertFunction(
+      ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
+  cast<Function>(ElseBreak)->setDoesNotAccessMemory();
+
+  // i1 llvm.amdgcn.loop(i64)
+  Loop = M.getOrInsertFunction(
+      LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
+
+  // void llvm.amdgcn.end.cf(i64)
+  EndCf = M.getOrInsertFunction(
+      EndCfIntrinsic, Void, Int64, (Type *)nullptr);
+
+  return false;
+}
+
+/// \brief Tell whether the condition of \p T is uniform, either according to
+/// the divergence analysis or because the branch carries the
+/// "structurizecfg.uniform" metadata.
+bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
+  if (DA->isUniform(T->getCondition()))
+    return true;
+
+  return T->getMetadata("structurizecfg.uniform") != nullptr;
+}
+
+/// \brief Check whether \p BB is the block of the most recently pushed stack
+/// entry. An empty stack never matches.
+bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
+  if (Stack.empty())
+    return false;
+
+  return Stack.back().first == BB;
+}
+
+/// \brief Remove the top entry of the control flow stack and return the
+/// value that was saved with it.
+Value *SIAnnotateControlFlow::popSaved() {
+  StackEntry Top = Stack.pop_back_val();
+  return Top.second;
+}
+
+/// \brief Append the pair (\p BB, \p Saved) to the control flow stack
+void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
+  Stack.push_back(StackEntry(BB, Saved));
+}
+
+/// \brief Can the condition represented by this PHI node be treated like
+/// an "Else" block?
+///
+/// That is the case when the incoming value is the constant true on the edge
+/// from the immediate dominator of the PHI's block and the constant false on
+/// every other edge.
+bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
+  BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
+
+  for (unsigned Idx = 0, End = Phi->getNumIncomingValues(); Idx != End; ++Idx) {
+    ConstantInt *Expected =
+        Phi->getIncomingBlock(Idx) == IDom ? BoolTrue : BoolFalse;
+    if (Phi->getIncomingValue(Idx) != Expected)
+      return false;
+  }
+
+  return true;
+}
+
+/// \brief Delete \p Phi from its parent block when nothing uses it any more
+void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
+  if (Phi->use_empty())
+    Phi->eraseFromParent();
+}
+
+/// \brief Open a new "If" block
+///
+/// Does nothing for uniform branches. Otherwise the branch condition is fed
+/// through the "if" intrinsic: element 0 of the result becomes the new
+/// condition and element 1 is pushed together with successor 1 of \p Term.
+void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+  if (isUniform(Term))
+    return;
+
+  Value *IfResult = CallInst::Create(If, Term->getCondition(), "", Term);
+
+  Value *NewCond = ExtractValueInst::Create(IfResult, 0, "", Term);
+  Term->setCondition(NewCond);
+
+  Value *Saved = ExtractValueInst::Create(IfResult, 1, "", Term);
+  push(Term->getSuccessor(1), Saved);
+}
+
+/// \brief Close the last "If" block and open a new "Else" block
+///
+/// Does nothing for uniform branches. Otherwise the value saved on top of the
+/// stack is popped and passed to the "else" intrinsic: element 0 of the result
+/// becomes the branch condition and element 1 is pushed together with
+/// successor 1 of \p Term.
+void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+  if (isUniform(Term))
+    return;
+
+  Value *Previous = popSaved();
+  Value *ElseResult = CallInst::Create(Else, Previous, "", Term);
+
+  Value *NewCond = ExtractValueInst::Create(ElseResult, 0, "", Term);
+  Term->setCondition(NewCond);
+
+  Value *Saved = ExtractValueInst::Create(ElseResult, 1, "", Term);
+  push(Term->getSuccessor(1), Saved);
+}
+
+/// \brief Recursively handle the condition leading to a loop
+///
+/// \param Cond   Condition (or incoming value of a condition PHI) to rewrite.
+/// \param Broken i64 PHI created by handleLoop() that is passed to the break
+///               intrinsics.
+/// \param L      Loop the back edge belongs to.
+/// \param Term   Branch forming the back edge; constant conditions get their
+///               if.break inserted in front of it.
+/// \returns the i64 value produced for \p Cond: a new PHI, an else.break call
+///          or an if.break call.
+Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
+ llvm::Loop *L, BranchInst *Term) {
+
+ // Only search through PHI nodes which are inside the loop. If we try this
+ // with PHI nodes that are outside of the loop, we end up inserting new PHI
+ // nodes outside of the loop which depend on values defined inside the loop.
+ // This will break the module with
+ // 'Instruction does not dominate all users!' errors.
+ PHINode *Phi = nullptr;
+ if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
+
+ BasicBlock *Parent = Phi->getParent();
+ // i64 PHI mirroring the boolean condition PHI, inserted at the block start.
+ PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
+ Value *Ret = NewPhi;
+
+ // Handle all non-constant incoming values first
+ for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = Phi->getIncomingValue(i);
+ BasicBlock *From = Phi->getIncomingBlock(i);
+ // Constant inputs just forward Broken for now; constant-true inputs are
+ // patched in the second loop below.
+ if (isa<ConstantInt>(Incoming)) {
+ NewPhi->addIncoming(Broken, From);
+ continue;
+ }
+
+ // Replace the non-constant input by false and recurse on the old value.
+ Phi->setIncomingValue(i, BoolFalse);
+ Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term);
+ NewPhi->addIncoming(PhiArg, From);
+ }
+
+ BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
+
+ // Second pass: only the incoming values that are the constant true.
+ for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+
+ Value *Incoming = Phi->getIncomingValue(i);
+ if (Incoming != BoolTrue)
+ continue;
+
+ BasicBlock *From = Phi->getIncomingBlock(i);
+ if (From == IDom) {
+ // We're in the following situation:
+ // IDom/From
+ // | \
+ // | If-block
+ // | /
+ // Parent
+ // where we want to break out of the loop if the If-block is not taken.
+ // Due to the depth-first traversal, there should be an end.cf
+ // intrinsic in Parent, and we insert an else.break before it.
+ //
+ // Note that the end.cf need not be the first non-phi instruction
+ // of parent, particularly when we're dealing with a multi-level
+ // break, but it should occur within a group of intrinsic calls
+ // at the beginning of the block.
+ CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
+ // Scan the leading run of calls for the end.cf; the scan stops at the
+ // first non-call instruction (OldEnd becomes null).
+ while (OldEnd && OldEnd->getCalledFunction() != EndCf)
+ OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
+ if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
+ Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
+ Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
+ continue;
+ }
+ }
+ // Otherwise (no end.cf found, or From is not the immediate dominator):
+ // insert a plain break at the end of the incoming block.
+ TerminatorInst *Insert = From->getTerminator();
+ Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
+ NewPhi->setIncomingValue(i, PhiArg);
+ }
+ eraseIfUnused(Phi);
+ return Ret;
+
+ } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
+ BasicBlock *Parent = Inst->getParent();
+ Instruction *Insert;
+ // Conditions defined inside the loop get their if.break at the end of the
+ // defining block; conditions defined outside get it in the loop header.
+ if (L->contains(Inst)) {
+ Insert = Parent->getTerminator();
+ } else {
+ Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
+ }
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Insert);
+
+ // Insert IfBreak before TERM for constant COND.
+ } else if (isa<ConstantInt>(Cond)) {
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Term);
+
+ } else {
+ llvm_unreachable("Unhandled loop condition!");
+ }
+ return nullptr;
+}
+
+/// \brief Handle a back edge (loop)
+///
+/// Does nothing for uniform branches or when the branch's block is not inside
+/// a loop. Otherwise the branch condition is rewritten into break intrinsics
+/// by handleLoopCondition() and replaced by a call to the "loop" intrinsic.
+void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+ if (isUniform(Term)) {
+ return;
+ }
+
+ BasicBlock *BB = Term->getParent();
+ llvm::Loop *L = LI->getLoopFor(BB);
+ if (!L)
+ return;
+ // Successor 1 is the already-visited block, i.e. the target of the back edge.
+ BasicBlock *Target = Term->getSuccessor(1);
+ // i64 PHI at the start of the target; its inputs are filled in below.
+ PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
+
+ Value *Cond = Term->getCondition();
+ // Drop the branch's use of the old condition before rewriting it
+ // (presumably so an otherwise unused condition PHI can be erased).
+ Term->setCondition(BoolTrue);
+ Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
+
+ // The back edge from BB carries the rewritten condition, every other
+ // predecessor of the target carries zero.
+ for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
+ PI != PE; ++PI) {
+
+ Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
+ }
+
+ Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
+ push(Term->getSuccessor(0), Arg);
+}
+
+/// \brief Close the last opened control flow
+///
+/// Pops the top entry of the control flow stack, which must belong to \p BB,
+/// and inserts an end.cf call for the saved value unless that value is undef.
+/// When \p BB is a loop header, its non-latch predecessors are split off first
+/// and the call goes into the block returned by SplitBlockPredecessors().
+void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
+ llvm::Loop *L = LI->getLoopFor(BB);
+
+ assert(Stack.back().first == BB);
+
+ if (L && L->getHeader() == BB) {
+ // We can't insert an EndCF call into a loop header, because it will
+ // get executed on every iteration of the loop, when it should be
+ // executed only once before the loop.
+ SmallVector <BasicBlock*, 8> Latches;
+ L->getLoopLatches(Latches);
+
+ // Collect every predecessor of the header that is not a latch.
+ std::vector<BasicBlock*> Preds;
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+ if (!is_contained(Latches, *PI))
+ Preds.push_back(*PI);
+ }
+ // From here on BB refers to the newly created "endcf.split" block.
+ BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
+ }
+
+ Value *Exec = popSaved();
+ if (!isa<UndefValue>(Exec))
+ CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
+}
+
+/// \brief Annotate the control flow with intrinsics so the backend can
+/// recognize if/then/else and loops.
+///
+/// Visits the blocks reachable from the entry block in depth-first order and,
+/// depending on each block's terminator, closes, continues or opens control
+/// flow. Always returns true.
+bool SIAnnotateControlFlow::runOnFunction(Function &F) {
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DA = &getAnalysis<DivergenceAnalysis>();
+
+ for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
+ E = df_end(&F.getEntryBlock()); I != E; ++I) {
+
+ BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
+
+ // No conditional branch here: at most close control flow ending in *I.
+ if (!Term || Term->isUnconditional()) {
+ if (isTopOfStack(*I))
+ closeControlFlow(*I);
+
+ continue;
+ }
+
+ // Successor 1 was already visited by the traversal: treat the branch as
+ // a loop back edge.
+ if (I.nodeVisited(Term->getSuccessor(1))) {
+ if (isTopOfStack(*I))
+ closeControlFlow(*I);
+
+ handleLoop(Term);
+ continue;
+ }
+
+ if (isTopOfStack(*I)) {
+ // A condition PHI local to this block that matches isElse() turns the
+ // open "If" into an "Else" instead of closing it.
+ PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
+ if (Phi && Phi->getParent() == *I && isElse(Phi)) {
+ insertElse(Term);
+ eraseIfUnused(Phi);
+ continue;
+ }
+ closeControlFlow(*I);
+ }
+ openIf(Term);
+ }
+
+ // Everything that was opened must have been closed again.
+ assert(Stack.empty());
+ return true;
+}
+
+/// \brief Allocate a new instance of the annotation pass; the caller takes
+/// ownership of the returned object.
+FunctionPass *llvm::createSIAnnotateControlFlowPass() {
+  FunctionPass *Pass = new SIAnnotateControlFlow();
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
new file mode 100644
index 000000000000..62ebef8e91af
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -0,0 +1,96 @@
+//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Inserts one nop instruction for each high level source statement for
+/// debugger usage.
+///
+/// Tools, such as a debugger, need to pause execution based on user input (i.e.
+/// breakpoint). In order to do this, one nop instruction is inserted before the
+/// first isa instruction of each high level source statement. Further, the
+/// debugger may replace nop instructions with trap instructions based on user
+/// input.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "si-debugger-insert-nops"
+#define PASS_NAME "SI Debugger Insert Nops"
+
+namespace {
+
+/// Machine function pass that inserts S_NOP instructions for debugger usage
+/// (see runOnMachineFunction()).
+class SIDebuggerInsertNops : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SIDebuggerInsertNops() : MachineFunctionPass(ID) { }
+ StringRef getPassName() const override { return PASS_NAME; }
+
+ // The pass only adds instructions inside blocks, so the CFG is preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // anonymous namespace
+
+// Register the pass under DEBUG_TYPE with the name PASS_NAME.
+INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false)
+
+// Pass ID storage, also exported as llvm::SIDebuggerInsertNopsID.
+char SIDebuggerInsertNops::ID = 0;
+char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID;
+
+/// Allocate a new SIDebuggerInsertNops pass; the caller takes ownership of
+/// the returned object.
+FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
+  FunctionPass *Pass = new SIDebuggerInsertNops();
+  return Pass;
+}
+
+/// Insert one "S_NOP 0" in front of the first instruction encountered for
+/// each distinct source line number of \p MF.
+///
+/// Nothing is done when the subtarget does not request debugger nops or when
+/// the module has no debug info. DBG_VALUE instructions and instructions
+/// without a debug location are ignored.
+///
+/// \returns true if at least one nop was inserted.
+bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
+  // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
+  // specified.
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  if (!ST.debuggerInsertNops())
+    return false;
+
+  // Skip machine functions without debug info.
+  if (!MF.getMMI().hasDebugInfo())
+    return false;
+
+  // Target instruction info.
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Set containing line numbers that have nop inserted.
+  DenseSet<unsigned> NopInserted;
+
+  bool Changed = false;
+
+  for (auto &MBB : MF) {
+    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+      // Skip DBG_VALUE instructions and instructions without location.
+      if (MI->isDebugValue() || !MI->getDebugLoc())
+        continue;
+
+      // insert() reports whether the line number was new, so one lookup both
+      // tests and records it.
+      const DebugLoc &DL = MI->getDebugLoc();
+      if (!NopInserted.insert(DL.getLine()).second)
+        continue;
+
+      // First instruction seen for this line: put the nop right before it.
+      BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP))
+          .addImm(0);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
new file mode 100644
index 000000000000..ff4e32147184
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -0,0 +1,393 @@
+//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCInstrDesc.h"
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
+#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
+
+namespace llvm {
+
+// Per-instruction flag bits and floating-point class test masks.
+namespace SIInstrFlags {
+// This needs to be kept in sync with the field bits in InstSI.
+// Each enumerator is a single bit of a 64-bit flag word.
+enum : uint64_t {
+ // Low bits - basic encoding information.
+ SALU = 1 << 0,
+ VALU = 1 << 1,
+
+ // SALU instruction formats.
+ SOP1 = 1 << 2,
+ SOP2 = 1 << 3,
+ SOPC = 1 << 4,
+ SOPK = 1 << 5,
+ SOPP = 1 << 6,
+
+ // VALU instruction formats.
+ VOP1 = 1 << 7,
+ VOP2 = 1 << 8,
+ VOPC = 1 << 9,
+
+ // TODO: Should this be split into VOP3 a and b?
+ VOP3 = 1 << 10,
+
+ // Note: bits 11 and 12 are not assigned here.
+ VINTRP = 1 << 13,
+ SDWA = 1 << 14,
+ DPP = 1 << 15,
+
+ // Memory instruction formats.
+ MUBUF = 1 << 16,
+ MTBUF = 1 << 17,
+ SMRD = 1 << 18,
+ MIMG = 1 << 19,
+ EXP = 1 << 20,
+ FLAT = 1 << 21,
+ DS = 1 << 22,
+
+ // Pseudo instruction formats.
+ VGPRSpill = 1 << 23,
+ SGPRSpill = 1 << 24,
+
+ // High bits - other information.
+ // These start at bit 32 and therefore need a 64-bit constant.
+ VM_CNT = UINT64_C(1) << 32,
+ EXP_CNT = UINT64_C(1) << 33,
+ LGKM_CNT = UINT64_C(1) << 34,
+
+ WQM = UINT64_C(1) << 35,
+ DisableWQM = UINT64_C(1) << 36,
+ Gather4 = UINT64_C(1) << 37,
+ SOPK_ZEXT = UINT64_C(1) << 38,
+ SCALAR_STORE = UINT64_C(1) << 39,
+ FIXED_SIZE = UINT64_C(1) << 40,
+ VOPAsmPrefer32Bit = UINT64_C(1) << 41
+
+};
+
+// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
+// The result is true if any of these tests are true.
+enum ClassFlags {
+ S_NAN = 1 << 0, // Signaling NaN
+ Q_NAN = 1 << 1, // Quiet NaN
+ N_INFINITY = 1 << 2, // Negative infinity
+ N_NORMAL = 1 << 3, // Negative normal
+ N_SUBNORMAL = 1 << 4, // Negative subnormal
+ N_ZERO = 1 << 5, // Negative zero
+ P_ZERO = 1 << 6, // Positive zero
+ P_SUBNORMAL = 1 << 7, // Positive subnormal
+ P_NORMAL = 1 << 8, // Positive normal
+ P_INFINITY = 1 << 9 // Positive infinity
+};
+}
+
+namespace AMDGPU {
+ // Target-specific operand types, numbered from MCOI::OPERAND_FIRST_TARGET.
+ // The *_FIRST / *_LAST aliases delimit contiguous ranges, so the order of
+ // the enumerators matters.
+ enum OperandType {
+ /// Operands with register or 32-bit immediate
+ OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_REG_IMM_INT64,
+ OPERAND_REG_IMM_INT16,
+ OPERAND_REG_IMM_FP32,
+ OPERAND_REG_IMM_FP64,
+ OPERAND_REG_IMM_FP16,
+
+ /// Operands with register or inline constant
+ OPERAND_REG_INLINE_C_INT16,
+ OPERAND_REG_INLINE_C_INT32,
+ OPERAND_REG_INLINE_C_INT64,
+ OPERAND_REG_INLINE_C_FP16,
+ OPERAND_REG_INLINE_C_FP32,
+ OPERAND_REG_INLINE_C_FP64,
+
+ // Range of the register-or-immediate operand types.
+ OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
+ OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
+
+ // Range of the register-or-inline-constant operand types.
+ OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
+ OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64,
+
+ // Range covering both groups above.
+ OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
+ OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
+
+ // Operand for source modifiers for VOP instructions
+ OPERAND_INPUT_MODS,
+
+ /// Operand with 32-bit immediate that uses the constant bus.
+ OPERAND_KIMM32,
+ OPERAND_KIMM16
+ };
+}
+
+// Input operand modifiers bit-masks
+// NEG and SEXT share same bit-mask because they can't be set simultaneously.
+namespace SISrcMods {
+ enum {
+ NEG = 1 << 0, // Floating-point negate modifier
+ ABS = 1 << 1, // Floating-point absolute modifier
+ SEXT = 1 << 0 // Integer sign-extend modifier (same bit as NEG)
+ };
+}
+
+// Output modifier values. These are consecutive values, not bit-masks.
+namespace SIOutMods {
+ enum {
+ NONE = 0,
+ MUL2 = 1,
+ MUL4 = 2,
+ DIV2 = 3
+ };
+}
+
+// VGPR index mode enable flags, one bit per source operand and one for the
+// destination.
+namespace VGPRIndexMode {
+ enum {
+ SRC0_ENABLE = 1 << 0,
+ SRC1_ENABLE = 1 << 1,
+ SRC2_ENABLE = 1 << 2,
+ DST_ENABLE = 1 << 3
+ };
+}
+
+// Identifiers of the assembler variants. These are consecutive values, not
+// bit-masks.
+namespace AMDGPUAsmVariants {
+ enum {
+ DEFAULT = 0,
+ VOP3 = 1,
+ SDWA = 2,
+ DPP = 3
+ };
+}
+
+namespace AMDGPU {
+namespace EncValues { // Encoding values of enum9/8/7 operands
+
+// Boundaries of the value ranges used when encoding an operand.
+enum {
+ SGPR_MIN = 0,
+ SGPR_MAX = 101,
+ TTMP_MIN = 112,
+ TTMP_MAX = 123,
+ INLINE_INTEGER_C_MIN = 128,
+ INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64
+ INLINE_INTEGER_C_MAX = 208,
+ INLINE_FLOATING_C_MIN = 240,
+ INLINE_FLOATING_C_MAX = 248,
+ LITERAL_CONST = 255,
+ VGPR_MIN = 256,
+ VGPR_MAX = 511
+};
+
+} // namespace EncValues
+} // namespace AMDGPU
+
+namespace AMDGPU {
+namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
+
+// Enumerators with a trailing underscore are bookkeeping values (sentinels,
+// shifts, widths, masks) rather than message IDs / operations themselves.
+enum Id { // Message ID, width(4) [3:0].
+ ID_UNKNOWN_ = -1,
+ ID_INTERRUPT = 1,
+ ID_GS,
+ ID_GS_DONE,
+ ID_SYSMSG = 15,
+ ID_GAPS_LAST_, // Indicate that sequence has gaps.
+ ID_GAPS_FIRST_ = ID_INTERRUPT,
+ ID_SHIFT_ = 0,
+ ID_WIDTH_ = 4,
+ ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
+};
+
+// The GS and SYS operations share the same starting bit (OP_SHIFT_) but have
+// different field widths.
+enum Op { // Both GS and SYS operation IDs.
+ OP_UNKNOWN_ = -1,
+ OP_SHIFT_ = 4,
+ // width(2) [5:4]
+ OP_GS_NOP = 0,
+ OP_GS_CUT,
+ OP_GS_EMIT,
+ OP_GS_EMIT_CUT,
+ OP_GS_LAST_,
+ OP_GS_FIRST_ = OP_GS_NOP,
+ OP_GS_WIDTH_ = 2,
+ OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_),
+ // width(3) [6:4]
+ OP_SYS_ECC_ERR_INTERRUPT = 1,
+ OP_SYS_REG_RD,
+ OP_SYS_HOST_TRAP_ACK,
+ OP_SYS_TTRACE_PC,
+ OP_SYS_LAST_,
+ OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
+ OP_SYS_WIDTH_ = 3,
+ OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_)
+};
+
+enum StreamId { // Stream ID, (2) [9:8].
+ STREAM_ID_DEFAULT_ = 0,
+ STREAM_ID_LAST_ = 4,
+ STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_,
+ STREAM_ID_SHIFT_ = 8,
+ STREAM_ID_WIDTH_= 2,
+ STREAM_ID_MASK_ = (((1 << STREAM_ID_WIDTH_) - 1) << STREAM_ID_SHIFT_)
+};
+
+} // namespace SendMsg
+
+namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns.
+
+// The three enums below describe the three bit fields of the immediate;
+// each provides the field's shift, width and mask.
+enum Id { // HwRegCode, (6) [5:0]
+ ID_UNKNOWN_ = -1,
+ ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined.
+ ID_MODE = 1,
+ ID_STATUS = 2,
+ ID_TRAPSTS = 3,
+ ID_HW_ID = 4,
+ ID_GPR_ALLOC = 5,
+ ID_LDS_ALLOC = 6,
+ ID_IB_STS = 7,
+ ID_SYMBOLIC_LAST_ = 8, // One past the last ID listed above.
+ ID_SHIFT_ = 0,
+ ID_WIDTH_ = 6,
+ ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
+};
+
+enum Offset { // Offset, (5) [10:6]
+ OFFSET_DEFAULT_ = 0,
+ OFFSET_SHIFT_ = 6,
+ OFFSET_WIDTH_ = 5,
+ OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_)
+};
+
+enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
+ WIDTH_M1_DEFAULT_ = 31,
+ WIDTH_M1_SHIFT_ = 11,
+ WIDTH_M1_WIDTH_ = 5,
+ WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_)
+};
+
+} // namespace Hwreg
+
+// Encodings used by SDWA operands.
+namespace SDWA {
+
+// Selects a byte, a word or the whole dword.
+enum SdwaSel {
+ BYTE_0 = 0,
+ BYTE_1 = 1,
+ BYTE_2 = 2,
+ BYTE_3 = 3,
+ WORD_0 = 4,
+ WORD_1 = 5,
+ DWORD = 6,
+};
+
+// Treatment of the unused destination bits (pad, sign-extend or preserve,
+// going by the enumerator names).
+enum DstUnused {
+ UNUSED_PAD = 0,
+ UNUSED_SEXT = 1,
+ UNUSED_PRESERVE = 2,
+};
+
+} // namespace SDWA
+} // namespace AMDGPU
+
+// Register offsets (R_*) and field setters (S_*) for the shader program
+// resource registers. S_* macros mask the value and shift it into place.
+#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
+#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C
+#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)
+#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128
+#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228
+#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
+#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
+#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
+
+// COMPUTE_PGM_RSRC2 register offset and field accessors. For every field:
+//   S_* masks a value and shifts it into the field position,
+//   G_* extracts the field from a register value,
+//   C_* is the 32-bit mask that clears the field (all other bits set).
+#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C
+#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0)
+#define G_00B84C_SCRATCH_EN(x) (((x) >> 0) & 0x1)
+#define C_00B84C_SCRATCH_EN 0xFFFFFFFE
+#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1)
+#define G_00B84C_USER_SGPR(x) (((x) >> 1) & 0x1F)
+#define C_00B84C_USER_SGPR 0xFFFFFFC1
+#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7)
+#define G_00B84C_TGID_X_EN(x) (((x) >> 7) & 0x1)
+#define C_00B84C_TGID_X_EN 0xFFFFFF7F
+#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8)
+#define G_00B84C_TGID_Y_EN(x) (((x) >> 8) & 0x1)
+#define C_00B84C_TGID_Y_EN 0xFFFFFEFF
+#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9)
+#define G_00B84C_TGID_Z_EN(x) (((x) >> 9) & 0x1)
+#define C_00B84C_TGID_Z_EN 0xFFFFFDFF
+#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10)
+#define G_00B84C_TG_SIZE_EN(x) (((x) >> 10) & 0x1)
+#define C_00B84C_TG_SIZE_EN 0xFFFFFBFF
+#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11)
+#define G_00B84C_TIDIG_COMP_CNT(x) (((x) >> 11) & 0x03)
+#define C_00B84C_TIDIG_COMP_CNT 0xFFFFE7FF
+/* CIK */
+#define S_00B84C_EXCP_EN_MSB(x) (((x) & 0x03) << 13)
+#define G_00B84C_EXCP_EN_MSB(x) (((x) >> 13) & 0x03)
+#define C_00B84C_EXCP_EN_MSB 0xFFFF9FFF
+/* */
+#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
+#define G_00B84C_LDS_SIZE(x) (((x) >> 15) & 0x1FF)
+#define C_00B84C_LDS_SIZE 0xFF007FFF
+#define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24)
+#define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F)
+// Clear mask for the 7-bit field at bits 30:24, i.e. ~(0x7F << 24).
+#define C_00B84C_EXCP_EN 0x80FFFFFF
+
+// Pixel shader input enable / address register offsets.
+#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
+#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0
+
+// COMPUTE_PGM_RSRC1 register offset and field accessors (S_* set, G_* get,
+// C_* clear mask). R_00B848_COMPUTE_PGM_RSRC1 is also defined earlier in this
+// header with the same value.
+#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
+#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0)
+#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F)
+#define C_00B848_VGPRS 0xFFFFFFC0
+#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6)
+#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F)
+#define C_00B848_SGPRS 0xFFFFFC3F
+#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10)
+#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03)
+#define C_00B848_PRIORITY 0xFFFFF3FF
+#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12)
+#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF)
+#define C_00B848_FLOAT_MODE 0xFFF00FFF
+#define S_00B848_PRIV(x) (((x) & 0x1) << 20)
+#define G_00B848_PRIV(x) (((x) >> 20) & 0x1)
+#define C_00B848_PRIV 0xFFEFFFFF
+#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21)
+#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1)
+#define C_00B848_DX10_CLAMP 0xFFDFFFFF
+#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22)
+#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1)
+#define C_00B848_DEBUG_MODE 0xFFBFFFFF
+#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23)
+#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1)
+#define C_00B848_IEEE_MODE 0xFF7FFFFF
+
+
+// Helpers for setting FLOAT_MODE
+// Rounding mode values, to be placed with FP_ROUND_MODE_SP / FP_ROUND_MODE_DP.
+#define FP_ROUND_ROUND_TO_NEAREST 0
+#define FP_ROUND_ROUND_TO_INF 1
+#define FP_ROUND_ROUND_TO_NEGINF 2
+#define FP_ROUND_ROUND_TO_ZERO 3
+
+// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double
+// precision.
+#define FP_ROUND_MODE_SP(x) ((x) & 0x3)
+#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2)
+
+// Denormal handling values, to be placed with FP_DENORM_MODE_SP /
+// FP_DENORM_MODE_DP.
+#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0
+#define FP_DENORM_FLUSH_OUT 1
+#define FP_DENORM_FLUSH_IN 2
+#define FP_DENORM_FLUSH_NONE 3
+
+
+// Bits 7:4 control denormal handling. 5:4 control single precision, 7:6 double
+// precision.
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
+#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
+
+// TMPRING_SIZE register offsets; WAVESIZE is a 13-bit field starting at
+// bit 12 in both registers.
+#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
+#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+
+#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
+#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+
+// Identifiers for the spilled SGPR / VGPR counts
+// (NOTE(review): presumably pseudo-register keys, not hardware offsets -
+// confirm against the users of these macros).
+#define R_SPILLED_SGPRS 0x4
+#define R_SPILLED_VGPRS 0x8
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
new file mode 100644
index 000000000000..d4d3959658e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
@@ -0,0 +1,88 @@
+//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Spilling of EXEC masks used for control flow messes up control flow
+/// lowering, so mark all live intervals associated with CF instructions as
+/// non-spillable.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-cf-live-intervals"
+
+namespace {
+
+/// Machine function pass that marks the live intervals defined by the SI
+/// control flow pseudo instructions as not spillable.
+class SIFixControlFlowLiveIntervals : public MachineFunctionPass {
+public:
+  static char ID;
+
+  /// Registers the pass with the global pass registry on construction.
+  SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) {
+    initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "SI Fix CF Live Intervals"; }
+
+  /// Needs live interval information and preserves every analysis.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // End anonymous namespace.
+
+// Register the pass under DEBUG_TYPE and declare LiveIntervals as a
+// dependency.
+INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
+ "SI Fix CF Live Intervals", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
+ "SI Fix CF Live Intervals", false, false)
+
+// Pass ID storage, also exported as llvm::SIFixControlFlowLiveIntervalsID.
+char SIFixControlFlowLiveIntervals::ID = 0;
+
+char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID;
+
+/// Allocate a new SIFixControlFlowLiveIntervals pass; the caller takes
+/// ownership of the returned object.
+FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() {
+  FunctionPass *Pass = new SIFixControlFlowLiveIntervals();
+  return Pass;
+}
+
+/// Mark the live interval of operand 0 of every SI control flow pseudo
+/// instruction (SI_IF, SI_ELSE, SI_BREAK, SI_IF_BREAK, SI_ELSE_BREAK,
+/// SI_END_CF) in \p MF as not spillable.
+///
+/// \returns false; no instruction is added, removed or changed.
+bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+  LiveIntervals &Intervals = getAnalysis<LiveIntervals>();
+
+  for (const MachineBasicBlock &Block : MF) {
+    for (const MachineInstr &Inst : Block) {
+      const unsigned Opcode = Inst.getOpcode();
+      const bool IsControlFlowPseudo =
+          Opcode == AMDGPU::SI_IF || Opcode == AMDGPU::SI_ELSE ||
+          Opcode == AMDGPU::SI_BREAK || Opcode == AMDGPU::SI_IF_BREAK ||
+          Opcode == AMDGPU::SI_ELSE_BREAK || Opcode == AMDGPU::SI_END_CF;
+      if (!IsControlFlowPseudo)
+        continue;
+
+      const unsigned Reg = Inst.getOperand(0).getReg();
+      Intervals.getInterval(Reg).markNotSpillable();
+    }
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
new file mode 100644
index 000000000000..6a422e70fe1f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -0,0 +1,462 @@
+//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Copies from VGPR to SGPR registers are illegal and the register coalescer
+/// will sometimes generate these illegal copies in situations like this:
+///
+/// Register Class <vsrc> is the union of <vgpr> and <sgpr>
+///
+/// BB0:
+/// %vreg0 <sgpr> = SCALAR_INST
+/// %vreg1 <vsrc> = COPY %vreg0 <sgpr>
+/// ...
+/// BRANCH %cond BB1, BB2
+/// BB1:
+/// %vreg2 <vgpr> = VECTOR_INST
+/// %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+/// BB2:
+/// %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vsrc>, <BB#1>
+/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
+///
+///
+/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
+/// code will look like this:
+///
+/// BB0:
+/// %vreg0 <sgpr> = SCALAR_INST
+/// ...
+/// BRANCH %cond BB1, BB2
+/// BB1:
+/// %vreg2 <vgpr> = VECTOR_INST
+/// %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+/// BB2:
+/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1>
+/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now that the result of the PHI instruction is an SGPR, the register
+/// allocator is now forced to constrain the register class of %vreg3 to
+/// <sgpr> so we end up with final code like this:
+///
+/// BB0:
+/// %vreg0 <sgpr> = SCALAR_INST
+/// ...
+/// BRANCH %cond BB1, BB2
+/// BB1:
+/// %vreg2 <vgpr> = VECTOR_INST
+/// %vreg3 <sgpr> = COPY %vreg2 <vgpr>
+/// BB2:
+/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
+/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now this code contains an illegal copy from a VGPR to an SGPR.
+///
+/// In order to avoid this problem, this pass searches for PHI instructions
+/// which define a <vsrc> register and constrains its definition class to
+/// <vgpr> if the user of the PHI's definition register is a vector instruction.
+/// If the PHI's definition class is constrained to <vgpr> then the coalescer
+/// will be unable to perform the COPY removal from the above example which
+/// ultimately led to the creation of an illegal COPY.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-sgpr-copies"
+
+namespace {
+
+/// Machine function pass whose runOnMachineFunction() is defined out of line.
+/// It requires and preserves MachineDominatorTree and preserves the CFG.
+class SIFixSGPRCopies : public MachineFunctionPass {
+
+  /// Dominator tree, assigned at the start of runOnMachineFunction(). It is
+  /// null-initialized so the pointer never holds an indeterminate value.
+  MachineDominatorTree *MDT = nullptr;
+
+public:
+  static char ID;
+
+  SIFixSGPRCopies() : MachineFunctionPass(ID) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Fix SGPR copies"; }
+
+  /// Requests MachineDominatorTree, and declares that it and the CFG are
+  /// preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+// Pass registration. The declared dependency has to name the analysis the pass
+// actually requests in getAnalysisUsage(), which is MachineDominatorTree (the
+// post-dominator tree is never used by this pass).
+INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
+                      "SI Fix SGPR copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
+                    "SI Fix SGPR copies", false, false)
+
+
+// Storage for the pass identifier declared in the class.
+char SIFixSGPRCopies::ID = 0;
+
+char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
+
+/// Allocates a new instance of the pass; the caller receives the raw pointer.
+FunctionPass *llvm::createSIFixSGPRCopiesPass() {
+  return new SIFixSGPRCopies();
+}
+
+/// Returns true if at least one virtual-register operand of \p MI has a
+/// register class for which TRI->hasVGPRs() is true. Operands that are not
+/// registers, or that are not virtual registers, are ignored.
+static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+  const MachineRegisterInfo &RegInfo =
+      MI.getParent()->getParent()->getRegInfo();
+
+  for (const MachineOperand &Op : MI.operands()) {
+    if (!Op.isReg())
+      continue;
+
+    const unsigned Reg = Op.getReg();
+    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+        TRI->hasVGPRs(RegInfo.getRegClass(Reg)))
+      return true;
+  }
+
+  return false;
+}
+
+/// Returns the register classes of the source (operand 1) and destination
+/// (operand 0) of \p Copy, in that order. Virtual registers are looked up in
+/// \p MRI, all others through TRI.getPhysRegClass(). Any subregister index on
+/// the source operand is deliberately ignored.
+static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
+getCopyRegClasses(const MachineInstr &Copy,
+                  const SIRegisterInfo &TRI,
+                  const MachineRegisterInfo &MRI) {
+  // Shared lookup for both operands.
+  auto ClassOf = [&](unsigned Reg) -> const TargetRegisterClass * {
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return MRI.getRegClass(Reg);
+    return TRI.getPhysRegClass(Reg);
+  };
+
+  const TargetRegisterClass *FromRC = ClassOf(Copy.getOperand(1).getReg());
+  const TargetRegisterClass *ToRC = ClassOf(Copy.getOperand(0).getReg());
+
+  return std::make_pair(FromRC, ToRC);
+}
+
+/// True when \p DstRC is an SGPR class and \p SrcRC contains VGPRs.
+static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
+                             const TargetRegisterClass *DstRC,
+                             const SIRegisterInfo &TRI) {
+  if (!TRI.isSGPRClass(DstRC))
+    return false;
+
+  return TRI.hasVGPRs(SrcRC);
+}
+
+/// True when \p SrcRC is an SGPR class and \p DstRC contains VGPRs.
+static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
+                             const TargetRegisterClass *DstRC,
+                             const SIRegisterInfo &TRI) {
+  if (!TRI.isSGPRClass(SrcRC))
+    return false;
+
+  return TRI.hasVGPRs(DstRC);
+}
+
+// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
+//
+// SGPRx = ...
+// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+// VGPRz = COPY SGPRy
+//
+// ==>
+//
+// VGPRx = COPY SGPRx
+// VGPRz = REG_SEQUENCE VGPRx, sub0
+//
+// This exposes immediate folding opportunities when materializing 64-bit
+// immediates. Returns true if \p MI was rewritten, false if it was left alone.
+static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
+ const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI) {
+ assert(MI.isRegSequence()); // Callers must pass a REG_SEQUENCE.
+ // Only REG_SEQUENCEs that define an SGPR-class register are candidates.
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
+ return false;
+ // The result must have exactly one use, and that use must be a COPY.
+ if (!MRI.hasOneUse(DstReg))
+ return false;
+
+ MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
+ if (!CopyUse.isCopy())
+ return false;
+ // The COPY must go from an SGPR class to a class containing VGPRs.
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
+
+ if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
+ return false;
+
+ // TODO: Could have multiple extracts?
+ unsigned SubReg = CopyUse.getOperand(1).getSubReg();
+ if (SubReg != AMDGPU::NoSubRegister)
+ return false;
+ // All checks passed; nothing has been modified before this point.
+ MRI.setRegClass(DstReg, DstRC);
+
+ // SGPRx = ...
+ // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+ // VGPRz = COPY SGPRy
+
+ // =>
+ // VGPRx = COPY SGPRx
+ // VGPRz = REG_SEQUENCE VGPRx, sub0
+ // Make the REG_SEQUENCE define the COPY's destination register directly.
+ MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+ // Operands come in pairs after the def; step over every other operand.
+ for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+ unsigned SrcReg = MI.getOperand(I).getReg();
+ unsigned SrcSubReg = MI.getOperand(I).getSubReg();
+
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); // Shadows the outer SrcRC.
+ assert(TRI->isSGPRClass(SrcRC) &&
+ "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
+
+ SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
+ const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
+
+ unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
+ // Insert "TmpReg = COPY <original operand>" immediately before MI.
+ BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
+ .addOperand(MI.getOperand(I));
+ // NOTE(review): only the register is replaced here; confirm the operand's subregister index is meant to stay.
+ MI.getOperand(I).setReg(TmpReg);
+ }
+ // The original SGPR->VGPR COPY is now redundant.
+ CopyUse.eraseFromParent();
+ return true;
+}
+
+/// Returns true if any of operands 1, 3, 5, ... of \p PHI is a register whose
+/// class contains VGPRs according to \p TRI. \p TII is not used.
+static bool phiHasVGPROperands(const MachineInstr &PHI,
+                               const MachineRegisterInfo &MRI,
+                               const SIRegisterInfo *TRI,
+                               const SIInstrInfo *TII) {
+  unsigned OpIdx = 1;
+  while (OpIdx < PHI.getNumOperands()) {
+    const unsigned IncomingReg = PHI.getOperand(OpIdx).getReg();
+    const TargetRegisterClass *IncomingRC = MRI.getRegClass(IncomingReg);
+    if (TRI->hasVGPRs(IncomingRC))
+      return true;
+
+    OpIdx += 2;
+  }
+
+  return false;
+}
+/// Returns true if any register at operands 1, 3, 5, ... of \p PHI is defined
+/// by SI_BREAK, SI_IF_BREAK or SI_ELSE_BREAK, looking through defining PHIs
+/// recursively. \p Visited records registers already examined so that each is
+/// inspected at most once.
+static bool phiHasBreakDef(const MachineInstr &PHI,
+                           const MachineRegisterInfo &MRI,
+                           SmallSet<unsigned, 8> &Visited) {
+
+  const unsigned NumOps = PHI.getNumOperands();
+  for (unsigned OpIdx = 1; OpIdx < NumOps; OpIdx += 2) {
+    const unsigned IncomingReg = PHI.getOperand(OpIdx).getReg();
+
+    // insert() reports whether the register was new to the set.
+    if (!Visited.insert(IncomingReg).second)
+      continue;
+
+    MachineInstr *Def = MRI.getUniqueVRegDef(IncomingReg);
+    assert(Def);
+
+    const unsigned DefOpc = Def->getOpcode();
+    if (DefOpc == AMDGPU::SI_BREAK || DefOpc == AMDGPU::SI_IF_BREAK ||
+        DefOpc == AMDGPU::SI_ELSE_BREAK)
+      return true;
+
+    if (DefOpc == AMDGPU::PHI && phiHasBreakDef(*Def, MRI, Visited))
+      return true;
+  }
+
+  return false;
+}
+
+/// Returns true if any instruction from the first terminator of \p MBB to the
+/// end of the block modifies the EXEC register.
+static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
+                                          const TargetRegisterInfo &TRI) {
+  for (const MachineInstr &Term : MBB.terminators()) {
+    if (Term.modifiesRegister(AMDGPU::EXEC, &TRI))
+      return true;
+  }
+
+  return false;
+}
+
+/// Decides whether \p Copy can be replaced by a scalar move of the immediate
+/// materialized by \p MoveImm.
+///
+/// On success returns true and sets \p SMovOp to S_MOV_B32 (for V_MOV_B32_e32)
+/// or S_MOV_B64 (for V_MOV_B64_PSEUDO) and \p Imm to the immediate value. On
+/// failure returns false and leaves both outputs untouched.
+static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
+                                    const MachineInstr *MoveImm,
+                                    const SIInstrInfo *TII,
+                                    unsigned &SMovOp,
+                                    int64_t &Imm) {
+
+  if (!MoveImm->isMoveImmediate())
+    return false;
+
+  const MachineOperand *Src0 =
+      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
+  if (!Src0->isImm())
+    return false;
+
+  // FIXME: Handle copies with sub-regs.
+  if (Copy->getOperand(0).getSubReg())
+    return false;
+
+  // Map the vector move onto its scalar counterpart.
+  const unsigned MoveOpc = MoveImm->getOpcode();
+  if (MoveOpc == AMDGPU::V_MOV_B32_e32)
+    SMovOp = AMDGPU::S_MOV_B32;
+  else if (MoveOpc == AMDGPU::V_MOV_B64_PSEUDO)
+    SMovOp = AMDGPU::S_MOV_B64;
+  else
+    return false;
+
+  Imm = Src0->getImm();
+  return true;
+}
+/* Walks every instruction of \p MF and rewrites selected COPY, PHI,
+   REG_SEQUENCE and INSERT_SUBREG instructions (see the per-case comments),
+   either by folding an immediate into a scalar move or by handing the
+   instruction to TII->moveToVALU(). Always returns true. */
+bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+
+ SmallVector<MachineInstr *, 16> Worklist; // NOTE(review): never used in this function.
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+
+ switch (MI.getOpcode()) {
+ default:
+ continue;
+ case AMDGPU::COPY: {
+ // If the destination register is a physical register there isn't really
+ // much we can do to fix this.
+ if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
+ continue;
+
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+ if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
+ MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg());
+ unsigned SMovOp;
+ int64_t Imm;
+ // If we are just copying an immediate, we can replace the copy with
+ // s_mov_b32 (or s_mov_b64, as chosen by isSafeToFoldImmIntoCopy).
+ if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
+ MI.getOperand(1).ChangeToImmediate(Imm);
+ MI.addImplicitDefUseOperands(MF);
+ MI.setDesc(TII->get(SMovOp));
+ break;
+ }
+ TII->moveToVALU(MI); // Not an immediate: fall back to moving the copy to the VALU.
+ }
+
+ break;
+ }
+ case AMDGPU::PHI: {
+ unsigned Reg = MI.getOperand(0).getReg();
+ if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
+ break;
+
+ // We don't need to fix the PHI if the common dominator of the
+ // two incoming blocks terminates with a uniform branch.
+ if (MI.getNumExplicitOperands() == 5) { // Def plus two (value, block) pairs.
+ MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+ MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+ MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
+ if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
+ DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
+ break;
+ }
+ }
+
+ // If a PHI node defines an SGPR and any of its operands are VGPRs,
+ // then we need to move it to the VALU.
+ //
+ // Also, if a PHI node defines an SGPR and has all SGPR operands
+ // we must move it to the VALU, because the SGPR operands will
+ // all end up being assigned the same register, which means
+ // there is a potential for a conflict if different threads take
+ // different control flow paths.
+ //
+ // For Example:
+ //
+ // sgpr0 = def;
+ // ...
+ // sgpr1 = def;
+ // ...
+ // sgpr2 = PHI sgpr0, sgpr1
+ // use sgpr2;
+ //
+ // Will Become:
+ //
+ // sgpr2 = def;
+ // ...
+ // sgpr2 = def;
+ // ...
+ // use sgpr2
+ //
+ // The one exception to this rule is when one of the operands
+ // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
+ // instruction. In this case, we know the program will
+ // never enter the second block (the loop) without entering
+ // the first block (where the condition is computed), so there
+ // is no chance for values to be over-written.
+
+ SmallSet<unsigned, 8> Visited;
+ if (phiHasVGPROperands(MI, MRI, TRI, TII) ||
+ !phiHasBreakDef(MI, MRI, Visited)) {
+ DEBUG(dbgs() << "Fixing PHI: " << MI);
+ TII->moveToVALU(MI);
+ }
+ break;
+ }
+ case AMDGPU::REG_SEQUENCE: {
+ if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
+ !hasVGPROperands(MI, TRI)) {
+ foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); // Result is ignored.
+ continue;
+ }
+ // Here the result class has no VGPRs but some input does.
+ DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
+
+ TII->moveToVALU(MI);
+ break;
+ }
+ case AMDGPU::INSERT_SUBREG: {
+ const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
+ DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
+ Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
+ Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+ if (TRI->isSGPRClass(DstRC) &&
+ (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+ DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
+ TII->moveToVALU(MI);
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ return true; // Reported unconditionally, even if no instruction was rewritten.
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
new file mode 100644
index 000000000000..831ac5948a68
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -0,0 +1,622 @@
+//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-fold-operands"
+using namespace llvm;
+
+namespace {
+
+/// Machine function pass whose runOnMachineFunction() is defined out of line.
+/// It requests no analyses and declares that the CFG is preserved.
+class SIFoldOperands : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SIFoldOperands() : MachineFunctionPass(ID) {
+    PassRegistry &Registry = *PassRegistry::getPassRegistry();
+    initializeSIFoldOperandsPass(Registry);
+  }
+
+  StringRef getPassName() const override { return "SI Fold Operands"; }
+
+  void getAnalysisUsage(AnalysisUsage &Usage) const override {
+    Usage.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(Usage);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+/// Describes one pending fold: operand number UseOpNo of UseMI is to be
+/// replaced by an immediate, a frame index, or a register operand. Kind
+/// records which member of the anonymous union is meaningful.
+struct FoldCandidate {
+  MachineInstr *UseMI;
+  union {
+    MachineOperand *OpToFold; // Valid when isReg().
+    uint64_t ImmToFold;       // Valid when isImm().
+    int FrameIndexToFold;     // Valid when isFI().
+  };
+  unsigned char UseOpNo; // Note: the operand number is stored in 8 bits.
+  MachineOperand::MachineOperandType Kind;
+
+  /// Captures the payload of \p FoldOp by value for immediates and frame
+  /// indexes; for registers the operand pointer itself is kept.
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp)
+      : UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
+    if (FoldOp->isImm()) {
+      ImmToFold = FoldOp->getImm();
+      return;
+    }
+
+    if (FoldOp->isFI()) {
+      FrameIndexToFold = FoldOp->getIndex();
+      return;
+    }
+
+    assert(FoldOp->isReg());
+    OpToFold = FoldOp;
+  }
+
+  bool isFI() const { return Kind == MachineOperand::MO_FrameIndex; }
+
+  bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
+
+  bool isReg() const { return Kind == MachineOperand::MO_Register; }
+};
+
+} // End anonymous namespace.
+
+// Storage for the pass identifier declared in the class.
+char SIFoldOperands::ID = 0;
+
+// Pass registration; no analysis dependencies are declared.
+INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
+                "SI Fold Operands", false, false)
+
+char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+
+/// Allocates a new instance of the pass; the caller receives the raw pointer.
+FunctionPass *llvm::createSIFoldOperandsPass() {
+  FunctionPass *NewPass = new SIFoldOperands();
+  return NewPass;
+}
+
+/// Returns true if the source of \p MI may be folded into its users.
+///
+/// S_MOV_B32, S_MOV_B64 and COPY always qualify. V_MOV_B32_e32, V_MOV_B32_e64
+/// and V_MOV_B64_PSEUDO qualify only when they carry no operands beyond those
+/// in their descriptor plus its implicit uses. Every other opcode is rejected.
+static bool isSafeToFold(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+
+  if (Opc == AMDGPU::S_MOV_B32 || Opc == AMDGPU::S_MOV_B64 ||
+      Opc == AMDGPU::COPY)
+    return true;
+
+  if (Opc != AMDGPU::V_MOV_B32_e32 && Opc != AMDGPU::V_MOV_B32_e64 &&
+      Opc != AMDGPU::V_MOV_B64_PSEUDO)
+    return false;
+
+  // If there are additional implicit register operands, this may be used for
+  // register indexing so the source register operand isn't simply copied.
+  const MCInstrDesc &Desc = MI.getDesc();
+  const unsigned ExpectedOps =
+      Desc.getNumOperands() + Desc.getNumImplicitUses();
+  return MI.getNumOperands() == ExpectedOps;
+}
+
+/// Applies \p Fold to its use operand. Immediates and frame indexes always
+/// succeed; a register fold succeeds only when both the existing and the new
+/// register are virtual. Returns true if the operand was rewritten.
+static bool updateOperand(FoldCandidate &Fold,
+                          const TargetRegisterInfo &TRI) {
+  MachineOperand &Target = Fold.UseMI->getOperand(Fold.UseOpNo);
+  assert(Target.isReg());
+
+  if (Fold.isImm()) {
+    Target.ChangeToImmediate(Fold.ImmToFold);
+    return true;
+  }
+
+  if (Fold.isFI()) {
+    Target.ChangeToFrameIndex(Fold.FrameIndexToFold);
+    return true;
+  }
+
+  MachineOperand *Replacement = Fold.OpToFold;
+  const bool BothVirtual =
+      TargetRegisterInfo::isVirtualRegister(Target.getReg()) &&
+      TargetRegisterInfo::isVirtualRegister(Replacement->getReg());
+
+  // FIXME: Handle physical registers.
+  if (!BothVirtual)
+    return false;
+
+  Target.substVirtReg(Replacement->getReg(), Replacement->getSubReg(), TRI);
+  return true;
+}
+
+/// Returns true if some candidate already in \p FoldList targets \p MI.
+static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
+                              const MachineInstr *MI) {
+  // Bind by const reference: only one field is read, so there is no reason to
+  // copy every FoldCandidate while scanning.
+  for (const FoldCandidate &Candidate : FoldList) {
+    if (Candidate.UseMI == MI)
+      return true;
+  }
+  return false;
+}
+/* Tries to record a fold of \p OpToFold into operand \p OpNo of \p MI.
+   If the operand is not legal as-is, the function may switch v_mac to v_mad,
+   switch s_setreg_b32 to its immediate form, or commute \p MI to make it
+   legal. Returns true (and appends to \p FoldList) on success. */
+static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
+ MachineInstr *MI, unsigned OpNo,
+ MachineOperand *OpToFold,
+ const SIInstrInfo *TII) {
+ if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+
+ // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
+ unsigned Opc = MI->getOpcode();
+ if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
+ (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+
+ // Check if changing this to a v_mad_{f16, f32} instruction will allow us
+ // to fold the operand.
+ MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
+ bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
+ if (FoldAsMAD) {
+ MI->untieRegOperand(OpNo);
+ return true;
+ }
+ MI->setDesc(TII->get(Opc)); // The MAD form did not help: restore the original opcode.
+ }
+
+ // Special case for s_setreg_b32
+ if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
+ MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
+ FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+ return true;
+ }
+
+ // If we are already folding into another operand of MI, then
+ // we can't commute the instruction, otherwise we risk making the
+ // other fold illegal.
+ if (isUseMIInFoldList(FoldList, MI))
+ return false;
+
+ // Operand is not legal, so try to commute the instruction to
+ // see if this makes it possible to fold.
+ unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
+ unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
+ bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
+ // Track where the operand we want to fold into will end up after commuting.
+ if (CanCommute) {
+ if (CommuteIdx0 == OpNo)
+ OpNo = CommuteIdx1;
+ else if (CommuteIdx1 == OpNo)
+ OpNo = CommuteIdx0;
+ }
+
+ // One of operands might be an Imm operand, and OpNo may refer to it after
+ // the call of commuteInstruction() below. Such situations are avoided
+ // here explicitly as OpNo must be a register operand to be a candidate
+ // for memory folding.
+ if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
+ !MI->getOperand(CommuteIdx1).isReg()))
+ return false;
+
+ if (!CanCommute ||
+ !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
+ return false;
+ // NOTE(review): if this check fails, MI is left commuted; nothing here swaps it back.
+ if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
+ return false;
+ }
+
+ FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+ return true;
+}
+
+// A use operand that is marked undef does not care about its value; it may
+// exist only for register indexing, so folding into it is unsafe.
+// \p MI is currently unused. (A previously considered alternative was to
+// reject registers for which MI.hasRegisterImplicitUseOperand() is true.)
+static bool isUseSafeToFold(const MachineInstr &MI,
+                            const MachineOperand &UseMO) {
+  if (UseMO.isUndef())
+    return false;
+
+  return true;
+}
+/* Tries to queue a fold of \p OpToFold into operand \p UseOpIdx of \p UseMI.
+   Successful candidates are appended to \p FoldList; COPY users that were
+   turned into moves are appended to \p CopiesToReplace. For a REG_SEQUENCE
+   user the function recurses into the users of the matching subregister. */
+static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ std::vector<FoldCandidate> &FoldList,
+ SmallVectorImpl<MachineInstr *> &CopiesToReplace,
+ const SIInstrInfo *TII, const SIRegisterInfo &TRI,
+ MachineRegisterInfo &MRI) {
+ const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+
+ if (!isUseSafeToFold(*UseMI, UseOp))
+ return;
+
+ // FIXME: Fold operands with subregs.
+ if (UseOp.isReg() && OpToFold.isReg()) {
+ if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
+ return;
+
+ // Don't fold subregister extracts into tied operands, only if it is a full
+ // copy since a subregister use tied to a full register def doesn't really
+ // make sense. e.g. don't fold:
+ //
+ // %vreg1 = COPY %vreg0:sub1
+ // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
+ //
+ // into
+ // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
+ if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
+ return;
+ }
+
+ // Special case for REG_SEQUENCE: We can't fold literals into
+ // REG_SEQUENCE instructions, so we have to fold them into the
+ // uses of REG_SEQUENCE.
+ if (UseMI->isRegSequence()) {
+ unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
+ unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); // Subreg index follows the value operand.
+
+ for (MachineRegisterInfo::use_iterator
+ RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end();
+ RSUse != RSE; ++RSUse) {
+
+ MachineInstr *RSUseMI = RSUse->getParent();
+ if (RSUse->getSubReg() != RegSeqDstSubReg)
+ continue;
+
+ foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
+ CopiesToReplace, TII, TRI, MRI);
+ }
+
+ return;
+ }
+
+
+ bool FoldingImm = OpToFold.isImm();
+
+ // In order to fold immediates into copies, we need to change the
+ // copy to a MOV.
+ if (FoldingImm && UseMI->isCopy()) {
+ unsigned DestReg = UseMI->getOperand(0).getReg();
+ const TargetRegisterClass *DestRC
+ = TargetRegisterInfo::isVirtualRegister(DestReg) ?
+ MRI.getRegClass(DestReg) :
+ TRI.getPhysRegClass(DestReg);
+
+ unsigned MovOp = TII->getMovOpcode(DestRC);
+ if (MovOp == AMDGPU::COPY) // No move opcode for this class: give up.
+ return;
+
+ UseMI->setDesc(TII->get(MovOp));
+ CopiesToReplace.push_back(UseMI); // The caller adds the implicit operands later.
+ } else {
+ const MCInstrDesc &UseDesc = UseMI->getDesc();
+
+ // Don't fold into target independent nodes. Target independent opcodes
+ // don't have defined register classes.
+ if (UseDesc.isVariadic() ||
+ UseDesc.OpInfo[UseOpIdx].RegClass == -1)
+ return;
+ }
+
+ if (!FoldingImm) {
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
+
+ // FIXME: We could try to change the instruction from 64-bit to 32-bit
+ // to enable more folding opportunities. The shrink operands pass
+ // already does this.
+ return;
+ }
+
+ // From here on OpToFold is an immediate; size it from the defining instruction.
+ const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
+ const TargetRegisterClass *FoldRC =
+ TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
+
+ APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
+ OpToFold.getImm());
+
+ // Split 64-bit constants into 32-bits for folding.
+ if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
+ unsigned UseReg = UseOp.getReg();
+ const TargetRegisterClass *UseRC
+ = TargetRegisterInfo::isVirtualRegister(UseReg) ?
+ MRI.getRegClass(UseReg) :
+ TRI.getPhysRegClass(UseReg);
+
+ assert(Imm.getBitWidth() == 64);
+
+ if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
+ return;
+
+ if (UseOp.getSubReg() == AMDGPU::sub0) {
+ Imm = Imm.getLoBits(32);
+ } else {
+ assert(UseOp.getSubReg() == AMDGPU::sub1);
+ Imm = Imm.getHiBits(32);
+ }
+ }
+ // ImmOp is a local temporary; FoldCandidate's constructor copies the immediate value out of it.
+ MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+}
+
+/// Evaluates a 32-bit bitwise AND, OR or XOR (vector e64 or scalar opcode) on
+/// two constants. Returns true and stores the value in \p Result for those
+/// opcodes; returns false and leaves \p Result untouched for any other opcode.
+static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
+                                  int32_t LHS, int32_t RHS) {
+  int32_t Folded;
+
+  switch (Opcode) {
+  case AMDGPU::V_AND_B32_e64:
+  case AMDGPU::S_AND_B32:
+    Folded = LHS & RHS;
+    break;
+  case AMDGPU::V_OR_B32_e64:
+  case AMDGPU::S_OR_B32:
+    Folded = LHS | RHS;
+    break;
+  case AMDGPU::V_XOR_B32_e64:
+  case AMDGPU::S_XOR_B32:
+    Folded = LHS ^ RHS;
+    break;
+  default:
+    return false;
+  }
+
+  Result = Folded;
+  return true;
+}
+
+/// Picks the 32-bit move opcode: S_MOV_B32 when \p IsScalar is set, otherwise
+/// V_MOV_B32_e32.
+static unsigned getMovOpc(bool IsScalar) {
+  if (IsScalar)
+    return AMDGPU::S_MOV_B32;
+
+  return AMDGPU::V_MOV_B32_e32;
+}
+
+/// Remove any leftover implicit operands from mutating the instruction. e.g.
+/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
+/// anymore.
+///
+/// Operands are removed from the back until only as many remain as the current
+/// descriptor accounts for (explicit operands plus implicit uses and defs).
+static void stripExtraCopyOperands(MachineInstr &MI) {
+  const MCInstrDesc &Desc = MI.getDesc();
+  const unsigned NumOps = Desc.getNumOperands() +
+                          Desc.getNumImplicitUses() +
+                          Desc.getNumImplicitDefs();
+
+  // I is one past the operand being removed, so the unsigned counter cannot
+  // wrap around even when NumOps is 0 or the instruction has no operands.
+  for (unsigned I = MI.getNumOperands(); I > NumOps; --I)
+    MI.RemoveOperand(I - 1);
+}
+
+/// Switches \p MI to the descriptor \p NewDesc and then drops any trailing
+/// operands the new descriptor does not account for.
+static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
+  // The descriptor must be updated first: the stripping step reads it.
+  MI.setDesc(NewDesc);
+
+  stripExtraCopyOperands(MI);
+}
+
+// Try to simplify operations with a constant that may appear after instruction
+// selection.
+//
+// Handles NOT of an immediate, AND/OR/XOR of two immediates, and AND/OR/XOR
+// where one source is the immediate 0 or -1. Returns true if \p MI was
+// rewritten in place into a move or a COPY, and false if it was left untouched.
+static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
+                              const SIInstrInfo *TII,
+                              MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
+      Opc == AMDGPU::S_NOT_B32) {
+    MachineOperand &Src0 = MI->getOperand(1);
+    if (Src0.isImm()) {
+      // not k => mov ~k
+      Src0.setImm(~Src0.getImm());
+      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
+      return true;
+    }
+
+    return false;
+  }
+
+  if (!MI->isCommutable())
+    return false;
+
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+
+  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
+  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
+  if (!Src0->isImm() && !Src1->isImm())
+    return false;
+
+  // and k0, k1 -> v_mov_b32 (k0 & k1)
+  // or k0, k1 -> v_mov_b32 (k0 | k1)
+  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
+  if (Src0->isImm() && Src1->isImm()) {
+    int32_t NewImm;
+    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
+      return false;
+
+    const SIRegisterInfo &TRI = TII->getRegisterInfo();
+    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
+
+    Src0->setImm(NewImm);
+    MI->RemoveOperand(Src1Idx);
+    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
+    return true;
+  }
+
+  // Exactly one source is an immediate. Normalize so that Src1/Src1Idx name the
+  // immediate and Src0/Src0Idx name the other (non-immediate) source.
+  if (Src0->isImm() && !Src1->isImm()) {
+    std::swap(Src0, Src1);
+    std::swap(Src0Idx, Src1Idx);
+  }
+
+  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
+  if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
+    if (Src1Val == 0) {
+      // y = or x, 0 => y = copy x
+      MI->RemoveOperand(Src1Idx);
+      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+    } else if (Src1Val == -1) {
+      // y = or x, -1 => y = v_mov_b32 -1
+      // Drop x and keep the immediate; removing the immediate instead would
+      // turn this into a move of x.
+      MI->RemoveOperand(Src0Idx);
+      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
+    } else
+      return false;
+
+    return true;
+  }
+
+  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::S_AND_B32) {
+    if (Src1Val == 0) {
+      // y = and x, 0 => y = v_mov_b32 0
+      MI->RemoveOperand(Src0Idx);
+      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
+    } else if (Src1Val == -1) {
+      // y = and x, -1 => y = copy x
+      // mutateCopyOp() already strips the leftover operands.
+      MI->RemoveOperand(Src1Idx);
+      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+    } else
+      return false;
+
+    return true;
+  }
+
+  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::S_XOR_B32) {
+    if (Src1Val == 0) {
+      // y = xor x, 0 => y = copy x
+      MI->RemoveOperand(Src1Idx);
+      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+      // The instruction was rewritten, so report the change.
+      return true;
+    }
+  }
+
+  return false;
+}
+/* For every instruction in \p MF accepted by isSafeToFold(), tries to fold its
+   source operand (operand 1) into the users of its result (operand 0), then
+   attempts constant folding on each user that was updated.
+   Returns false unconditionally. */
+bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I); // Successor is captured before the body runs.
+ MachineInstr &MI = *I;
+
+ if (!isSafeToFold(MI))
+ continue;
+
+ MachineOperand &OpToFold = MI.getOperand(1);
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+
+ // FIXME: We could also be folding things like FrameIndexes and
+ // TargetIndexes.
+ if (!FoldingImm && !OpToFold.isReg())
+ continue;
+
+ if (OpToFold.isReg() &&
+ !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
+ continue;
+
+ // Prevent folding operands backwards in the function. For example,
+ // the COPY opcode must not be replaced by 1 in this example:
+ //
+ // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
+ // ...
+ // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
+ MachineOperand &Dst = MI.getOperand(0);
+ if (Dst.isReg() &&
+ !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+ continue;
+
+ // We need to mutate the operands of new mov instructions to add implicit
+ // uses of EXEC, but adding them invalidates the use_iterator, so defer
+ // this.
+ SmallVector<MachineInstr *, 4> CopiesToReplace;
+
+ std::vector<FoldCandidate> FoldList;
+ if (FoldingImm) {
+ unsigned NumLiteralUses = 0;
+ MachineOperand *NonInlineUse = nullptr;
+ int NonInlineUseOpNo = -1;
+
+ // Try to fold any inline immediate uses, and then only fold other
+ // constants if they have one use.
+ //
+ // The legality of the inline immediate must be checked based on the use
+ // operand, not the defining instruction, because 32-bit instructions
+ // with 32-bit inline immediate sources may be used to materialize
+ // constants used in 16-bit operands.
+ //
+ // e.g. it is unsafe to fold:
+ // s_mov_b32 s0, 1.0 // materializes 0x3f800000
+ // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
+
+ // Folding immediates with more than one use will increase program size.
+ // FIXME: This will also reduce register usage, which may be better
+ // in some cases. A better heuristic is needed.
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
+ Use != E; ++Use) {
+ MachineInstr *UseMI = Use->getParent();
+ unsigned OpNo = Use.getOperandNo();
+
+ if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) {
+ foldOperand(OpToFold, UseMI, OpNo, FoldList,
+ CopiesToReplace, TII, TRI, MRI);
+ } else {
+ if (++NumLiteralUses == 1) { // Remember only the first non-inline use.
+ NonInlineUse = &*Use;
+ NonInlineUseOpNo = OpNo;
+ }
+ }
+ }
+
+ if (NumLiteralUses == 1) { // Exactly one literal use: fold it too.
+ MachineInstr *UseMI = NonInlineUse->getParent();
+ foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList,
+ CopiesToReplace, TII, TRI, MRI);
+ }
+ } else {
+ // Folding register.
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
+ Use != E; ++Use) {
+ MachineInstr *UseMI = Use->getParent();
+
+ foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
+ CopiesToReplace, TII, TRI, MRI);
+ }
+ }
+
+ // Make sure we add EXEC uses to any new v_mov instructions created.
+ for (MachineInstr *Copy : CopiesToReplace)
+ Copy->addImplicitDefUseOperands(MF);
+
+ for (FoldCandidate &Fold : FoldList) {
+ if (updateOperand(Fold, TRI)) {
+ // Clear kill flags.
+ if (Fold.isReg()) {
+ assert(Fold.OpToFold && Fold.OpToFold->isReg());
+ // FIXME: Probably shouldn't bother trying to fold if not an
+ // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
+ // copies.
+ MRI.clearKillFlags(Fold.OpToFold->getReg());
+ }
+ DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
+ static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+
+ // Folding the immediate may reveal operations that can be constant
+ // folded or replaced with a copy. This can happen for example after
+ // frame indices are lowered to constants or from splitting 64-bit
+ // constants.
+ tryConstantFoldOp(MRI, TII, Fold.UseMI); // Result is ignored.
+ }
+ }
+ }
+ }
+ return false; // NOTE(review): returned even when instructions were modified.
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
new file mode 100644
index 000000000000..d0a69eafc58e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -0,0 +1,400 @@
+//===----------------------- SIFrameLowering.cpp --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#include "SIFrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+
+using namespace llvm;
+
+
+ /// Returns a view over the leading entries of the SGPR_128 register class.
+ /// The number of entries is the SGPR limit that \p TRI reports for \p MF,
+ /// divided by four.
+ static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
+                                          const SIRegisterInfo *TRI) {
+   const auto NumTuples = TRI->getMaxNumSGPRs(MF) / 4;
+   const MCPhysReg *const FirstTuple = AMDGPU::SGPR_128RegClass.begin();
+   return makeArrayRef(FirstTuple, NumTuples);
+ }
+
+ /// Returns a view over the leading entries of the SGPR_32 register class,
+ /// with as many entries as the SGPR limit that \p TRI reports for \p MF.
+ static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
+                                        const SIRegisterInfo *TRI) {
+   const auto NumSGPRs = TRI->getMaxNumSGPRs(MF);
+   const MCPhysReg *const FirstSGPR = AMDGPU::SGPR_32RegClass.begin();
+   return makeArrayRef(FirstSGPR, NumSGPRs);
+ }
+
+ /// Inserts, at the beginning of \p MBB, the instructions that initialize
+ /// FLAT_SCR_LO and FLAT_SCR_HI from the preloaded FLAT_SCRATCH_INIT register
+ /// pair and the scratch wave offset register.
+ ///
+ /// The preloaded register is also re-added as a live-in of both the function
+ /// and \p MBB.
+ void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
+                                           const SIRegisterInfo* TRI,
+                                           MachineFunction &MF,
+                                           MachineBasicBlock &MBB) const {
+   // This setup is unnecessary when only spills exist, because then there is
+   // no user facing scratch.
+
+   // TODO: If it is known earlier that no flat instructions are present, this
+   // can be dropped from the input registers.
+   //
+   // TODO: Only accesses to scratch space through a flat pointer matter.
+   // Since detection is limited to "are flat instructions used at all", this
+   // runs more often than necessary on VI.
+
+   // Keep the debug location unknown: the first debug location is what
+   // determines the end of the prologue.
+   DebugLoc DL;
+   MachineBasicBlock::iterator InsertPt = MBB.begin();
+
+   const unsigned InitReg =
+       TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+
+   MachineRegisterInfo &MRI = MF.getRegInfo();
+   MRI.addLiveIn(InitReg);
+   MBB.addLiveIn(InitReg);
+
+   // Look up every register involved before emitting anything.
+   const unsigned InitHi = TRI->getSubReg(InitReg, AMDGPU::sub1);
+   const unsigned InitLo = TRI->getSubReg(InitReg, AMDGPU::sub0);
+   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+   const unsigned WaveOffsetReg = FuncInfo->getScratchWaveOffsetReg();
+
+   // Copy the size in bytes (the sub1 half) into FLAT_SCR_LO.
+   BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
+       .addReg(InitHi, RegState::Kill);
+
+   // Add the wave offset in bytes to the private base offset.
+   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
+   BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::S_ADD_U32), InitLo)
+       .addReg(InitLo)
+       .addReg(WaveOffsetReg);
+
+   // Shifting right by 8 converts the offset to 256-byte units.
+   BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
+       .addReg(InitLo, RegState::Kill)
+       .addImm(8);
+ }
+
+ /// Picks the register that will hold the scratch resource descriptor.
+ ///
+ /// The register currently recorded in \p MFI is returned unchanged when it is
+ /// NoRegister, when the subtarget has the SGPR init bug, or when it differs
+ /// from the register \p TRI reserves by default. Otherwise the first unused,
+ /// allocatable SGPR_128 tuple past the preloaded SGPRs replaces it: all uses
+ /// are rewritten, \p MFI is updated, and the new register is returned. If no
+ /// such tuple exists the current register is kept.
+ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
+   const SISubtarget &ST,
+   const SIInstrInfo *TII,
+   const SIRegisterInfo *TRI,
+   SIMachineFunctionInfo *MFI,
+   MachineFunction &MF) const {
+
+   // Initialization of the scratch resource descriptor has to be inserted.
+   const unsigned CurrentReg = MFI->getScratchRSrcReg();
+   if (CurrentReg == AMDGPU::NoRegister)
+     return AMDGPU::NoRegister;
+
+   if (ST.hasSGPRInitBug())
+     return CurrentReg;
+   if (CurrentReg != TRI->reservedPrivateSegmentBufferReg(MF))
+     return CurrentReg;
+
+   // The last registers were reserved for this. Move the choice down to the
+   // end of the registers that were actually used.
+   //
+   // FIXME: It might be safer to use a pseudoregister before replacement.
+
+   // FIXME: Unused input registers should be eliminable; that is only
+   // impossible for the resources required for scratch access. For now user
+   // SGPRs are skipped, which may leave unused holes.
+
+   // The resource is located first because it has an alignment requirement.
+
+   MachineRegisterInfo &MRI = MF.getRegInfo();
+
+   // Number of 4-register tuples touched by the preloaded SGPRs, rounded up
+   // and clamped to the length of the candidate list.
+   const unsigned PreloadedTuples = (MFI->getNumPreloadedSGPRs() + 3) / 4;
+   ArrayRef<MCPhysReg> Candidates = getAllSGPR128(MF, TRI);
+   const unsigned NumToSkip =
+       std::min(static_cast<unsigned>(Candidates.size()), PreloadedTuples);
+   Candidates = Candidates.slice(NumToSkip);
+
+   // NOTE(review): every remaining tuple is visited; nothing at the tail of
+   // the list is excluded here.
+   for (MCPhysReg Reg : Candidates) {
+     // Only an unallocated, allocatable tuple qualifies, so the other
+     // reserved input is not clobbered.
+     if (MRI.isPhysRegUsed(Reg) || !MRI.isAllocatable(Reg))
+       continue;
+
+     MRI.replaceRegWith(CurrentReg, Reg);
+     MFI->setScratchRSrcReg(Reg);
+     return Reg;
+   }
+
+   return CurrentReg;
+ }
+
+ /// Picks the register that will hold the scratch wave byte offset.
+ ///
+ /// The register currently recorded in \p MFI is returned unchanged when the
+ /// subtarget has the SGPR init bug, when it differs from the register \p TRI
+ /// reserves by default, or when too few SGPRs remain to choose from.
+ /// Otherwise the first unused, allocatable SGPR that does not overlap the
+ /// scratch resource register replaces it: all uses are rewritten, \p MFI is
+ /// updated, and the new register is returned.
+ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
+   const SISubtarget &ST,
+   const SIInstrInfo *TII,
+   const SIRegisterInfo *TRI,
+   SIMachineFunctionInfo *MFI,
+   MachineFunction &MF) const {
+   const unsigned CurrentReg = MFI->getScratchWaveOffsetReg();
+   if (ST.hasSGPRInitBug())
+     return CurrentReg;
+   if (CurrentReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
+     return CurrentReg;
+
+   const unsigned RsrcReg = MFI->getScratchRSrcReg();
+   MachineRegisterInfo &MRI = MF.getRegInfo();
+
+   const unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+
+   ArrayRef<MCPhysReg> Candidates = getAllSGPRs(MF, TRI);
+   if (NumPreloaded > Candidates.size())
+     return CurrentReg;
+
+   Candidates = Candidates.slice(NumPreloaded);
+
+   // Registers at the end of the list cannot be used for the scratch wave
+   // offset and are dropped:
+   //   + 2  s102 and s103 do not exist on VI.
+   //   + 2  for vcc
+   //   + 2  for xnack_mask
+   //   + 2  for flat_scratch
+   //   + 4  for registers reserved for the scratch resource register
+   //   + 1  for the register reserved for the scratch wave offset. (Excluding
+   //        this register from the candidates means that when it is in use
+   //        for the scratch wave offset and no other SGPR is free, the value
+   //        simply stays in this register.)
+   //   ----
+   //     13
+   const unsigned NumExcludedAtEnd = 13;
+   if (Candidates.size() < NumExcludedAtEnd)
+     return CurrentReg;
+
+   for (MCPhysReg Reg : Candidates.drop_back(NumExcludedAtEnd)) {
+     // Take the first unallocated SGPR.
+     if (MRI.isPhysRegUsed(Reg))
+       continue;
+
+     // Never pick an alias of the scratch descriptor: its uses have not been
+     // added yet.
+     if (!MRI.isAllocatable(Reg) || TRI->isSubRegisterEq(RsrcReg, Reg))
+       continue;
+
+     MRI.replaceRegWith(CurrentReg, Reg);
+     MFI->setScratchWaveOffsetReg(Reg);
+     return Reg;
+   }
+
+   return CurrentReg;
+ }
+
+ /// Emits the function prologue at the beginning of \p MBB.
+ ///
+ /// Finalizes the choice of scratch resource and scratch wave offset registers,
+ /// re-adds the preloaded inputs as live-ins where they are used, marks the
+ /// chosen registers live-in to every other block, and inserts the copies or
+ /// S_MOV_B32 instructions that initialize them. Optionally emits the debugger
+ /// prologue and the flat scratch initialization first.
+ void SIFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
+ // specified.
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ if (ST.debuggerEmitPrologue())
+ emitDebuggerPrologue(MF, MBB);
+
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // If we only have SGPR spills, we won't actually be using scratch memory
+ // since these spill to VGPRs.
+ //
+ // FIXME: We should be cleaning up these unused SGPR spill frame indices
+ // somewhere.
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Both helpers may rewrite register uses and update MFI as a side effect.
+ unsigned ScratchRsrcReg
+ = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
+ unsigned ScratchWaveOffsetReg
+ = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
+
+ // No scratch resource register means there is nothing to set up.
+ if (ScratchRsrcReg == AMDGPU::NoRegister) {
+ assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
+ return;
+ }
+
+ assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
+
+ // We need to do the replacement of the private segment buffer and wave offset
+ // register even if there are no stack objects. There could be stores to undef
+ // or a constant without an associated object.
+
+ // FIXME: We still have implicit uses on SGPR spill instructions in case they
+ // need to spill to vector memory. It's likely that will not happen, but at
+ // this point it appears we need the setup. This part of the prolog should be
+ // emitted after frame indices are eliminated.
+
+ if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
+ emitFlatScratchInit(TII, TRI, MF, MBB);
+
+ // We need to insert initialization of the scratch resource descriptor.
+ unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+
+ // The private segment buffer is only looked up as a preloaded input for AMD
+ // code object v2; otherwise it stays NoRegister and the descriptor is built
+ // with S_MOV_B32 at the end of this function.
+ unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
+ if (ST.isAmdCodeObjectV2()) {
+ PreloadedPrivateBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ }
+
+ bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
+ bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
+
+ // We added live-ins during argument lowering, but since they were not used
+ // they were deleted. We're adding the uses now, so add them back.
+ if (OffsetRegUsed) {
+ assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
+ "scratch wave offset input is required");
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+ }
+
+ if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
+ assert(ST.isAmdCodeObjectV2());
+ MRI.addLiveIn(PreloadedPrivateBufferReg);
+ MBB.addLiveIn(PreloadedPrivateBufferReg);
+ }
+
+ // Make the register selected live throughout the function.
+ for (MachineBasicBlock &OtherBB : MF) {
+ if (&OtherBB == &MBB)
+ continue;
+
+ if (OffsetRegUsed)
+ OtherBB.addLiveIn(ScratchWaveOffsetReg);
+
+ if (ResourceRegUsed)
+ OtherBB.addLiveIn(ScratchRsrcReg);
+ }
+
+ // Unknown debug location; all instructions below are inserted at the start
+ // of the block.
+ DebugLoc DL;
+ MachineBasicBlock::iterator I = MBB.begin();
+
+ // If we reserved the original input registers, we don't need to copy to the
+ // reserved registers.
+
+ bool CopyBuffer = ResourceRegUsed &&
+ PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+ ScratchRsrcReg != PreloadedPrivateBufferReg;
+
+ // This needs to be careful of the copying order to avoid overwriting one of
+ // the input registers before it's been copied to its final
+ // destination. Usually the offset should be copied first.
+ bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
+ ScratchWaveOffsetReg);
+ if (CopyBuffer && CopyBufferFirst) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ }
+
+ if (OffsetRegUsed &&
+ PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
+ .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+ }
+
+ if (CopyBuffer && !CopyBufferFirst) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ }
+
+ // No preloaded buffer: build the four dwords of the descriptor one
+ // S_MOV_B32 at a time, each also marked as an implicit def of the full
+ // register.
+ if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
+ assert(!ST.isAmdCodeObjectV2());
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+
+ unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+ // Use relocations to get the pointer, and setup the other bits manually.
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+ BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ // Low 32 bits of Rsrc23 go to dword 2, high 32 bits to dword 3.
+ BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+ .addImm(Rsrc23 & 0xffffffff)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+ .addImm(Rsrc23 >> 32)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ }
+ }
+
+ /// Epilogue hook. The body is intentionally empty: this implementation
+ /// inserts no instructions into \p MBB.
+ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+
+ }
+
+ /// Creates an emergency stack slot for the register scavenger whenever the
+ /// function has stack objects.
+ ///
+ /// \param MF Function whose frame is about to be finalized.
+ /// \param RS Register scavenger that receives the new frame index; it must be
+ ///           non-null when \p MF has stack objects.
+ void SIFrameLowering::processFunctionBeforeFrameFinalized(
+   MachineFunction &MF,
+   RegScavenger *RS) const {
+   MachineFrameInfo &MFI = MF.getFrameInfo();
+
+   // No stack objects: no emergency slot is created.
+   if (!MFI.hasStackObjects())
+     return;
+
+   // Stack objects are known to exist from here on, so the slot is always
+   // created and the scavenger is always required.
+   assert(RS && "RegScavenger required if spilling");
+
+   // The slot has the size and alignment of the SGPR_32 register class.
+   const int ScavengeFI = MFI.CreateStackObject(
+       AMDGPU::SGPR_32RegClass.getSize(),
+       AMDGPU::SGPR_32RegClass.getAlignment(), false);
+   RS->addScavengingFrameIndex(ScavengeFI);
+ }
+
+ /// Emits the debugger prologue at the beginning of \p MBB.
+ ///
+ /// For each of the three dimensions, the work group ID SGPR (via a fresh
+ /// VGPR) and the work item ID VGPR are stored to their debugger stack
+ /// objects. Both ID registers are re-added as live-ins of the function and of
+ /// \p MBB.
+ void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
+                                            MachineBasicBlock &MBB) const {
+   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+   const SIInstrInfo *TII = ST.getInstrInfo();
+   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+   MachineRegisterInfo &MRI = MF.getRegInfo();
+
+   MachineBasicBlock::iterator InsertPt = MBB.begin();
+   DebugLoc DL;
+
+   for (unsigned Dim = 0; Dim != 3; ++Dim) {
+     // Work group ID: fetch its SGPR and make it live-in again.
+     const unsigned GroupIDSGPR = MFI->getWorkGroupIDSGPR(Dim);
+     MRI.addLiveIn(GroupIDSGPR);
+     MBB.addLiveIn(GroupIDSGPR);
+
+     // SGPRs are spilled into VGPRs, so the ID is first moved to a VGPR in
+     // order to spill it to scratch.
+     const unsigned GroupIDVGPR =
+         MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+     BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), GroupIDVGPR)
+         .addReg(GroupIDSGPR);
+
+     // Store the work group ID to its stack object.
+     const int GroupIDSlot = MFI->getDebuggerWorkGroupIDStackObjectIndex(Dim);
+     TII->storeRegToStackSlot(MBB, InsertPt, GroupIDVGPR, false,
+                              GroupIDSlot, &AMDGPU::VGPR_32RegClass, TRI);
+
+     // Work item ID: fetch its VGPR and make it live-in again.
+     const unsigned ItemIDVGPR = MFI->getWorkItemIDVGPR(Dim);
+     MRI.addLiveIn(ItemIDVGPR);
+     MBB.addLiveIn(ItemIDVGPR);
+
+     // Store the work item ID to its stack object.
+     const int ItemIDSlot = MFI->getDebuggerWorkItemIDStackObjectIndex(Dim);
+     TII->storeRegToStackSlot(MBB, InsertPt, ItemIDVGPR, false,
+                              ItemIDSlot, &AMDGPU::VGPR_32RegClass, TRI);
+   }
+ }
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
new file mode 100644
index 000000000000..7657b4e03864
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -0,0 +1,64 @@
+//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
+
+#include "AMDGPUFrameLowering.h"
+
+namespace llvm {
+
+class SIInstrInfo;
+class SIMachineFunctionInfo;
+class SIRegisterInfo;
+class SISubtarget;
+
+ /// Frame lowering for SI subtargets. Declares the prologue/epilogue hooks and
+ /// the private helpers that select and initialize the registers used for
+ /// scratch access.
+ class SIFrameLowering final : public AMDGPUFrameLowering {
+ public:
+ /// Forwards every argument unchanged to the AMDGPUFrameLowering constructor.
+ SIFrameLowering(StackDirection D, unsigned StackAl, int LAO,
+ unsigned TransAl = 1) :
+ AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ ~SIFrameLowering() override = default;
+
+ /// Emits the function prologue into \p MBB.
+ void emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override;
+ /// Emits the function epilogue into \p MBB.
+ void emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override;
+
+ /// Hook invoked before the frame layout of \p MF is finalized; \p RS is the
+ /// register scavenger, if any.
+ void processFunctionBeforeFrameFinalized(
+ MachineFunction &MF,
+ RegScavenger *RS = nullptr) const override;
+
+ private:
+ /// Emits the flat scratch register initialization at the start of \p MBB.
+ void emitFlatScratchInit(const SIInstrInfo *TII,
+ const SIRegisterInfo* TRI,
+ MachineFunction &MF,
+ MachineBasicBlock &MBB) const;
+
+ /// Returns the register to use for the private segment buffer (scratch
+ /// resource descriptor). \p MFI is non-const, so the helper may update it.
+ unsigned getReservedPrivateSegmentBufferReg(
+ const SISubtarget &ST,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI,
+ MachineFunction &MF) const;
+
+ /// Returns the register to use for the private segment wave byte offset.
+ /// \p MFI is non-const, so the helper may update it.
+ unsigned getReservedPrivateSegmentWaveByteOffsetReg(
+ const SISubtarget &ST,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI,
+ MachineFunction &MF) const;
+
+ /// \brief Emits debugger prologue.
+ void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
new file mode 100644
index 000000000000..fa53831cbe16
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -0,0 +1,4519 @@
+//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Custom DAG lowering for SI
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef _MSC_VER
+// Provide M_PI.
+#define _USE_MATH_DEFINES
+#include <cmath>
+#endif
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableVGPRIndexMode(
+ "amdgpu-vgpr-index-mode",
+ cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
+ cl::init(false));
+
+
+ /// Returns the lowest-numbered 32-bit SGPR, counting up from SGPR0, that
+ /// \p CCInfo has not allocated. Reaching the end of the SGPR_32 class without
+ /// finding one is treated as unreachable.
+ static unsigned findFirstFreeSGPR(CCState &CCInfo) {
+   const unsigned SGPRCount = AMDGPU::SGPR_32RegClass.getNumRegs();
+   for (unsigned Idx = 0; Idx != SGPRCount; ++Idx) {
+     const unsigned Candidate = AMDGPU::SGPR0 + Idx;
+     if (CCInfo.isAllocated(Candidate))
+       continue;
+     return Candidate;
+   }
+   llvm_unreachable("Cannot allocate sgpr");
+ }
+
+ /// Configures DAG lowering for SI: registers a register class for each value
+ /// type, records the legalization action for each (opcode, type) pair that
+ /// needs one, lists the opcodes for which target DAG combines are requested,
+ /// and selects the scheduling preference.
+ SITargetLowering::SITargetLowering(const TargetMachine &TM,
+ const SISubtarget &STI)
+ : AMDGPUTargetLowering(TM, STI) {
+ // Register class used for each value type.
+ addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
+ addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
+
+ addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
+
+ addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
+ addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
+ addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+
+ addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+
+ addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
+ addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+
+ addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
+ addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+
+ // i16 / f16 only get a register class when the subtarget has 16-bit
+ // instructions.
+ if (Subtarget->has16BitInsts()) {
+ addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
+ // Called after all addRegisterClass calls above.
+ computeRegisterProperties(STI.getRegisterInfo());
+
+ // We need to custom lower vector loads and stores from local memory
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v16i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::SELECT, MVT::i1, Promote);
+ setOperationAction(ISD::SELECT, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::f64, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
+
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+
+ setOperationAction(ISD::SETCC, MVT::i1, Promote);
+ setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
+ setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
+ AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+
+ // We only support LOAD/STORE and vector manipulation ops for vectors
+ // with > 4 elements.
+ // NOTE(review): the type list below also contains the two-element 64-bit
+ // vectors v2i64 and v2f64.
+ for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+ switch (Op) {
+ // These opcodes are left at whatever action is already recorded.
+ case ISD::LOAD:
+ case ISD::STORE:
+ case ISD::BUILD_VECTOR:
+ case ISD::BITCAST:
+ case ISD::EXTRACT_VECTOR_ELT:
+ case ISD::INSERT_VECTOR_ELT:
+ case ISD::INSERT_SUBVECTOR:
+ case ISD::EXTRACT_SUBVECTOR:
+ case ISD::SCALAR_TO_VECTOR:
+ break;
+ case ISD::CONCAT_VECTORS:
+ setOperationAction(Op, VT, Custom);
+ break;
+ // Every other opcode is expanded for these types.
+ default:
+ setOperationAction(Op, VT, Expand);
+ break;
+ }
+ }
+ }
+
+ // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
+ // is expanded to avoid having two separate loops in case the index is a VGPR.
+
+ // Most operations are naturally 32-bit vector operations. We only support
+ // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
+ for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
+ }
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
+ // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
+ // and output demarshalling
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+ // We can't return success/failure, only the old value,
+ // let LLVM add the comparison
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+
+ // Address space casts are only custom lowered when the subtarget reports a
+ // flat address space.
+ if (getSubtarget()->hasFlatAddressSpace()) {
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
+ }
+
+ setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+
+ // On SI this is s_memtime and s_memrealtime on VI.
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+ setOperationAction(ISD::TRAP, MVT::Other, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+
+ // These f64 rounding operations are only marked Legal from the SEA_ISLANDS
+ // generation onwards.
+ if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
+ setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ }
+
+ setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+
+ setOperationAction(ISD::FSIN, MVT::f32, Custom);
+ setOperationAction(ISD::FCOS, MVT::f32, Custom);
+ setOperationAction(ISD::FDIV, MVT::f32, Custom);
+ setOperationAction(ISD::FDIV, MVT::f64, Custom);
+
+ // Actions for i16 and f16, recorded only when the subtarget has 16-bit
+ // instructions.
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::Constant, MVT::i16, Legal);
+
+ setOperationAction(ISD::SMIN, MVT::i16, Legal);
+ setOperationAction(ISD::SMAX, MVT::i16, Legal);
+
+ setOperationAction(ISD::UMIN, MVT::i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::i16, Legal);
+
+ setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
+ AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
+
+ setOperationAction(ISD::ROTR, MVT::i16, Promote);
+ setOperationAction(ISD::ROTL, MVT::i16, Promote);
+
+ setOperationAction(ISD::SDIV, MVT::i16, Promote);
+ setOperationAction(ISD::UDIV, MVT::i16, Promote);
+ setOperationAction(ISD::SREM, MVT::i16, Promote);
+ setOperationAction(ISD::UREM, MVT::i16, Promote);
+
+ setOperationAction(ISD::BSWAP, MVT::i16, Promote);
+ setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
+
+ setOperationAction(ISD::CTTZ, MVT::i16, Promote);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
+ setOperationAction(ISD::CTLZ, MVT::i16, Promote);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
+
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+
+ setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+
+ setOperationAction(ISD::LOAD, MVT::i16, Custom);
+
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+
+ setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
+ AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
+ setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
+ AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+
+ // F16 - Constant Actions.
+ setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
+
+ // F16 - Load/Store Actions.
+ setOperationAction(ISD::LOAD, MVT::f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
+ setOperationAction(ISD::STORE, MVT::f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
+
+ // F16 - VOP1 Actions.
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::FCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
+
+ // F16 - VOP2 Actions.
+ setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+ setOperationAction(ISD::FDIV, MVT::f16, Custom);
+
+ // F16 - VOP3 Actions.
+ setOperationAction(ISD::FMA, MVT::f16, Legal);
+ // FMAD on f16 is only marked Legal when FP16 denormals are not enabled.
+ if (!Subtarget->hasFP16Denormals())
+ setOperationAction(ISD::FMAD, MVT::f16, Legal);
+ }
+
+ // Opcodes for which target-specific DAG combines are requested.
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FMINNUM);
+ setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::SMIN);
+ setTargetDAGCombine(ISD::SMAX);
+ setTargetDAGCombine(ISD::UMIN);
+ setTargetDAGCombine(ISD::UMAX);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::FCANONICALIZE);
+
+ // All memory operations. Some folding on the pointer operand is done to help
+ // matching the constant offsets in the addressing modes.
+ setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD);
+ setTargetDAGCombine(ISD::ATOMIC_STORE);
+ setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
+ setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+ setTargetDAGCombine(ISD::ATOMIC_SWAP);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
+
+ setSchedulingPreference(Sched::RegPressure);
+ }
+
+ /// Returns the stored subtarget pointer, statically downcast to SISubtarget.
+ const SISubtarget *SITargetLowering::getSubtarget() const {
+   const auto *SIST = static_cast<const SISubtarget *>(Subtarget);
+   return SIST;
+ }
+
+//===----------------------------------------------------------------------===//
+// TargetLowering queries
+//===----------------------------------------------------------------------===//
+
+ /// Describes the memory access performed by a target intrinsic.
+ ///
+ /// For amdgcn_atomic_inc and amdgcn_atomic_dec this fills \p Info and returns
+ /// true: chained intrinsic opcode, memory type taken from the call's result
+ /// type, pointer taken from operand 0, alignment 0, non-volatile, reading and
+ /// writing memory. Any other \p IntrID returns false and leaves \p Info
+ /// untouched.
+ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &CI,
+                                           unsigned IntrID) const {
+   const bool IsAtomicIncDec = IntrID == Intrinsic::amdgcn_atomic_inc ||
+                               IntrID == Intrinsic::amdgcn_atomic_dec;
+   if (!IsAtomicIncDec)
+     return false;
+
+   Info.opc = ISD::INTRINSIC_W_CHAIN;
+   Info.memVT = MVT::getVT(CI.getType());
+   Info.ptrVal = CI.getOperand(0);
+   Info.align = 0;
+   Info.vol = false;
+   Info.readMem = true;
+   Info.writeMem = true;
+   return true;
+ }
+
+ /// Reports every shuffle mask as illegal; both parameters are ignored.
+ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
+ EVT) const {
+ // SI has some legal vector types, but no legal vector operations. Say no
+ // shuffles are legal in order to prefer scalarizing some vector operations.
+ return false;
+ }
+
+ /// Returns true when \p AM has a zero base offset and a scale of 0 or 1.
+ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+   // Flat instructions take only a register address; they have no offsets.
+   if (AM.BaseOffs != 0)
+     return false;
+
+   return AM.Scale == 0 || AM.Scale == 1;
+ }
+
+ /// Returns true when \p AM can be used with MUBUF addressing: the base offset
+ /// must fit in 12 unsigned bits, and the scale must be 0, 1, or 2 without a
+ /// base register.
+ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+   // MUBUF / MTBUF instructions carry a 12-bit unsigned byte offset and can
+   // additionally do r + r + i with addr64. 32-bit has more addressing mode
+   // options. Depending on the resource constant, (i64 r0) + (i32 r1) *
+   // (i14 i) is possible as well.
+   //
+   // Private arrays mostly end up in a scratch buffer, so they are assumed to
+   // use MUBUF instructions too. Scratch loads / stores are currently
+   // implemented as mubuf instructions with the offen bit set, which differs
+   // slightly from the normal addr64.
+   if (!isUInt<12>(AM.BaseOffs))
+     return false;
+
+   // FIXME: Since the immediate can be split into soffset and immediate
+   // offset, would it make sense to allow any immediate?
+
+   // Scale 0: r + i or just i, depending on HasBaseReg.
+   // Scale 1: r + r or r + i.
+   if (AM.Scale == 0 || AM.Scale == 1)
+     return true;
+
+   // Scale 2: 2 * r is allowed as r + r, and 2 * r + i as r + r + i, but
+   // 2 * r + r is rejected.
+   if (AM.Scale == 2)
+     return !AM.HasBaseReg;
+
+   // No other n * r form is allowed.
+   return false;
+ }
+
+/// Decide whether addressing mode \p AM is legal for an access of type \p Ty
+/// in address space \p AS.
+///
+/// A global-value base is never accepted. Otherwise the answer depends on the
+/// address space, each of which applies its own offset and scale rules.
+/// Address spaces not listed below are treated as unreachable.
+bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                             const AddrMode &AM, Type *Ty,
+                                             unsigned AS) const {
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // Shared tail of the constant and LDS/GDS cases: accept r + i or just i
+  // (no scaled register), or a unit-scaled register together with a base
+  // register.
+  auto IsPlainBasePlusOffset = [&AM]() {
+    return AM.Scale == 0 || (AM.Scale == 1 && AM.HasBaseReg);
+  };
+
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+    // Assume that we will use FLAT for all global memory accesses on VI.
+    // FIXME: This assumption is currently wrong. On VI we still use
+    // MUBUF instructions for the r + i addressing mode. As currently
+    // implemented, the MUBUF instructions only work on buffer < 4GB.
+    // It may be possible to support > 4GB buffers with MUBUF instructions,
+    // by setting the stride value in the resource descriptor which would
+    // increase the size limit to (stride * 4GB). However, this is risky,
+    // because it has never been validated.
+    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+      return isLegalFlatAddressingMode(AM);
+
+    return isLegalMUBUFAddressingMode(AM);
+
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    // If the offset isn't a multiple of 4, it probably isn't going to be
+    // correctly aligned.
+    // FIXME: Can we get the real alignment here?
+    if (AM.BaseOffs % 4 != 0)
+      return isLegalMUBUFAddressingMode(AM);
+
+    // There are no SMRD extloads, so if we have to do a small type access we
+    // will use a MUBUF load.
+    // FIXME?: We also need to do this if unaligned, but we don't know the
+    // alignment here.
+    if (DL.getTypeStoreSize(Ty) < 4)
+      return isLegalMUBUFAddressingMode(AM);
+
+    const auto Gen = Subtarget->getGeneration();
+    bool OffsetFits = false;
+    if (Gen == SISubtarget::SOUTHERN_ISLANDS) {
+      // SMRD instructions have an 8-bit, dword offset on SI.
+      OffsetFits = isUInt<8>(AM.BaseOffs / 4);
+    } else if (Gen == SISubtarget::SEA_ISLANDS) {
+      // On CI+, this can also be a 32-bit literal constant offset. If it fits
+      // in 8-bits, it can use a smaller encoding.
+      OffsetFits = isUInt<32>(AM.BaseOffs / 4);
+    } else if (Gen == SISubtarget::VOLCANIC_ISLANDS) {
+      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
+      OffsetFits = isUInt<20>(AM.BaseOffs);
+    } else {
+      llvm_unreachable("unhandled generation");
+    }
+
+    if (!OffsetFits)
+      return false;
+
+    return IsPlainBasePlusOffset();
+  }
+
+  case AMDGPUAS::PRIVATE_ADDRESS:
+    return isLegalMUBUFAddressingMode(AM);
+
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS:
+    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
+    // field.
+    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
+    // an 8-bit dword offset but we don't know the alignment here.
+    if (!isUInt<16>(AM.BaseOffs))
+      return false;
+
+    return IsPlainBasePlusOffset();
+
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+    // For an unknown address space, this usually means that this is for some
+    // reason being used for pure arithmetic, and not based on some addressing
+    // computation. We don't have instructions that compute pointers with any
+    // addressing modes, so treat them as having no offset like flat
+    // instructions.
+    return isLegalFlatAddressingMode(AM);
+
+  default:
+    llvm_unreachable("unhandled address space");
+  }
+}
+
+/// Return true if an access of type \p VT with alignment \p Align is allowed
+/// in address space \p AddrSpace. When \p IsFast is non-null it is always
+/// written: it starts out false and is only raised on the paths below that
+/// set it explicitly.
+bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                      unsigned AddrSpace,
+                                                      unsigned Align,
+                                                      bool *IsFast) const {
+  if (IsFast)
+    *IsFast = false;
+
+  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
+  // which isn't a simple VT.
+  // Until MVT is extended to handle this, reject MVT::Other outright, and
+  // reject types that are both wider than 1024 bits and larger than 16 bytes
+  // when stored.
+  if (VT == MVT::Other)
+    return false;
+  if (VT.getSizeInBits() > 1024 && VT.getStoreSize() > 16)
+    return false;
+
+  const bool IsLocalOrRegion = AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+                               AddrSpace == AMDGPUAS::REGION_ADDRESS;
+  if (IsLocalOrRegion) {
+    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
+    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
+    // with adjacent offsets.
+    const bool DwordAligned = Align % 4 == 0;
+    if (IsFast)
+      *IsFast = DwordAligned;
+
+    return DwordAligned;
+  }
+
+  // FIXME: We have to be conservative here and assume that flat operations
+  // will access scratch. If we had access to the IR function, then we
+  // could determine if any private memory was used in the function.
+  if (!Subtarget->hasUnalignedScratchAccess() &&
+      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+       AddrSpace == AMDGPUAS::FLAT_ADDRESS))
+    return false;
+
+  if (Subtarget->hasUnalignedBufferAccess()) {
+    // If we have an uniform constant load, it still requires using a slow
+    // buffer instruction if unaligned.
+    if (IsFast) {
+      if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
+        *IsFast = (Align % 4 == 0);
+      else
+        *IsFast = true;
+    }
+
+    return true;
+  }
+
+  // Smaller than dword value must be aligned.
+  if (VT.bitsLT(MVT::i32))
+    return false;
+
+  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
+  // byte-address are ignored, thus forcing Dword alignment.
+  // This applies to private, global, and constant memory.
+  if (IsFast)
+    *IsFast = true;
+
+  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+}
+
+/// Pick the value type used to expand memory intrinsics.
+///
+/// Only \p Size and \p DstAlign are consulted. With a destination alignment
+/// of at least 4, v4i32 is returned for sizes of 16 bytes or more and v2i32
+/// for sizes of 8 bytes or more; otherwise MVT::Other requests the default.
+EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                          unsigned SrcAlign, bool IsMemset,
+                                          bool ZeroMemset,
+                                          bool MemcpyStrSrc,
+                                          MachineFunction &MF) const {
+  // FIXME: Should account for address space here.
+
+  // The default fallback uses the private pointer size as a guess for a type to
+  // use. Make sure we switch these to 64-bit accesses.
+  if (DstAlign >= 4) {
+    if (Size >= 16) // XXX: Should only do for global
+      return MVT::v4i32;
+
+    if (Size >= 8)
+      return MVT::v2i32;
+  }
+
+  // Use the default.
+  return MVT::Other;
+}
+
+/// Return true if \p AS is the global, flat or constant address space.
+static bool isFlatGlobalAddrSpace(unsigned AS) {
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// An address space cast is a no-op when both the source and the destination
+/// are one of the global, flat or constant address spaces.
+bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                           unsigned DestAS) const {
+  if (!isFlatGlobalAddrSpace(SrcAS))
+    return false;
+
+  return isFlatGlobalAddrSpace(DestAS);
+}
+
+/// Return true if the IR pointer behind the memory operand of \p N is an
+/// instruction carrying "amdgpu.noclobber" metadata.
+///
+/// \p N must be a MemSDNode. Returns false when the memory operand has no IR
+/// value or when that value is not an instruction.
+bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+  const Value *Ptr = MemNode->getMemOperand()->getValue();
+
+  // Ptr is null when the mem operand holds a PseudoSourceValue (e.g. GOT)
+  // rather than an IR value, so the cast must tolerate a null pointer; a
+  // plain dyn_cast<> must not be handed one.
+  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.noclobber");
+}
+
+/// Return true if casting from \p SrcAS to \p DestAS is cheap.
+///
+/// Any cast out of the flat address space qualifies: flat -> private/local is
+/// a simple truncate and flat -> global is a no-op. Every other cast is cheap
+/// only if it is a no-op cast.
+bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  return SrcAS == AMDGPUAS::FLAT_ADDRESS ||
+         isNoopAddrSpaceCast(SrcAS, DestAS);
+}
+
+/// Return true if the address of the memory operation \p N is treated as
+/// uniform. \p N must be a MemSDNode.
+bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
+  const Value *Ptr = cast<MemSDNode>(N)->getMemOperand()->getValue();
+
+  // If Ptr is null, then that means this mem operand contains a
+  // PseudoSourceValue like GOT.
+  if (!Ptr)
+    return true;
+
+  // UndefValue means this is a load of a kernel input. These are uniform.
+  // Sometimes LDS instructions have constant pointers.
+  if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
+      isa<GlobalValue>(Ptr))
+    return true;
+
+  // Anything else is uniform only if it is an instruction annotated with
+  // "amdgpu.uniform" metadata.
+  if (const Instruction *I = dyn_cast<Instruction>(Ptr))
+    return I->getMetadata("amdgpu.uniform") != nullptr;
+
+  return false;
+}
+
+/// Prefer splitting vectors that do not have exactly one element and whose
+/// scalar type is no wider than i16; defer to the base class for all others.
+TargetLoweringBase::LegalizeTypeAction
+SITargetLowering::getPreferredVectorAction(EVT VT) const {
+  const bool NotSingleElement = VT.getVectorNumElements() != 1;
+  if (NotSingleElement && VT.getScalarType().bitsLE(MVT::i16))
+    return TypeSplitVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+/// Always answer yes: neither \p Imm nor \p Ty is inspected.
+bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                         Type *Ty) const {
+  // FIXME: Could be smarter if called for vector constants.
+  const bool AlwaysConvert = true;
+  return AlwaysConvert;
+}
+
+/// Return true if \p VT is a desirable type for operation \p Op.
+///
+/// Two cases are decided here; everything else goes to the TargetLowering
+/// default:
+///  - i1 SETCC is never desirable, because there are no instructions for it.
+///  - i16 on subtargets with 16-bit instructions is desirable only for
+///    loads, stores, and the bitwise/select operations listed below.
+bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+  // SimplifySetCC uses this function to determine whether or not it should
+  // create setcc with i1 operands. We don't have instructions for i1 setcc.
+  // (This cannot overlap with the i16 case below, so the order is free.)
+  if (VT == MVT::i1 && Op == ISD::SETCC)
+    return false;
+
+  if (!(Subtarget->has16BitInsts() && VT == MVT::i16))
+    return TargetLowering::isTypeDesirableForOp(Op, VT);
+
+  switch (Op) {
+  case ISD::LOAD:
+  case ISD::STORE:
+  // These operations are done with 32-bit instructions anyway.
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::SELECT:
+    // TODO: Extensions?
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Build a pointer to byte \p Offset of the kernel argument segment.
+///
+/// The base is copied from the live-in virtual register of the preloaded
+/// KERNARG_SEGMENT_PTR and the offset is added as a constant, both in the
+/// pointer type of the constant address space.
+SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
+                                            const SDLoc &SL, SDValue Chain,
+                                            unsigned Offset) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+  const unsigned KernArgPtrReg =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+  const MVT PtrVT =
+      getPointerTy(DAG.getDataLayout(), AMDGPUAS::CONSTANT_ADDRESS);
+
+  SDValue KernArgBase = DAG.getCopyFromReg(
+      Chain, SL, MRI.getLiveInVirtReg(KernArgPtrReg), PtrVT);
+  SDValue ByteOffset = DAG.getConstant(Offset, SL, PtrVT);
+  return DAG.getNode(ISD::ADD, SL, PtrVT, KernArgBase, ByteOffset);
+}
+
+/// Load one kernel argument of memory type \p MemVT from byte \p Offset of
+/// the kernel argument segment and convert it to \p VT.
+///
+/// Floating-point values are FP-extended or truncated; integers are sign- or
+/// zero-extended (or truncated) according to \p Signed. The result is a
+/// merge of the converted value and the chain produced by the load.
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+                                         const SDLoc &SL, SDValue Chain,
+                                         unsigned Offset, bool Signed) const {
+  const DataLayout &Layout = DAG.getDataLayout();
+  Type *ArgTy = MemVT.getTypeForEVT(*DAG.getContext());
+
+  // The memory operand is described by an undef pointer in the constant
+  // address space.
+  PointerType *ArgPtrTy = PointerType::get(ArgTy, AMDGPUAS::CONSTANT_ADDRESS);
+  MachinePointerInfo PtrInfo(UndefValue::get(ArgPtrTy));
+
+  const unsigned ABIAlign = Layout.getABITypeAlignment(ArgTy);
+
+  SDValue ArgPtr = LowerParameterPtr(DAG, SL, Chain, Offset);
+  const auto LoadFlags = MachineMemOperand::MONonTemporal |
+                         MachineMemOperand::MODereferenceable |
+                         MachineMemOperand::MOInvariant;
+  SDValue Load =
+      DAG.getLoad(MemVT, SL, Chain, ArgPtr, PtrInfo, ABIAlign, LoadFlags);
+
+  SDValue Converted;
+  if (MemVT.isFloatingPoint()) {
+    Converted = getFPExtOrFPTrunc(DAG, Load, SL, VT);
+  } else if (Signed) {
+    Converted = DAG.getSExtOrTrunc(Load, SL, VT);
+  } else {
+    Converted = DAG.getZExtOrTrunc(Load, SL, VT);
+  }
+
+  // Hand back both the value and the load's output chain.
+  SDValue Results[] = { Converted, Load.getValue(1) };
+  return DAG.getMergeValues(Results, SL);
+}
+
+/// Lower the incoming formal arguments of the current function into \p InVals.
+///
+/// In order, this:
+///  - rejects shader calling conventions on an AMD HSA OS with a diagnostic,
+///    returning the DAG entry node;
+///  - for AMDGPU_PS, skips unused and unallocated PS inputs (they become
+///    undef) and records the used ones in PSInputEna;
+///  - for shader calling conventions, splits vector arguments into their
+///    elements before running the calling-convention analysis;
+///  - registers the preloaded user SGPRs, system SGPRs and work-item ID VGPRs
+///    that SIMachineFunctionInfo reports as present;
+///  - loads memory-located arguments via LowerParameter and copies
+///    register-located ones out of their live-in registers;
+///  - selects the scratch resource and scratch wave offset registers.
+///
+/// \returns a TokenFactor of the chains of all argument loads, or \p Chain
+/// unchanged when no argument was loaded from memory.
+SDValue SITargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ FunctionType *FType = MF.getFunction()->getFunctionType();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
+ const Function *Fn = MF.getFunction();
+ DiagnosticInfoUnsupported NoGraphicsHSA(
+ *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
+ DAG.getContext()->diagnose(NoGraphicsHSA);
+ return DAG.getEntryNode();
+ }
+
+ // Create stack objects that are used for emitting debugger prologue if
+ // "amdgpu-debugger-emit-prologue" attribute was specified.
+ if (ST.debuggerEmitPrologue())
+ createDebuggerPrologueStackObjects(MF);
+
+ // Splits receives the per-element argument list analyzed for shader calling
+ // conventions; Skipped marks the entries of Ins that are dropped as unused
+ // PS inputs.
+ SmallVector<ISD::InputArg, 16> Splits;
+ BitVector Skipped(Ins.size());
+
+ for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
+ const ISD::InputArg &Arg = Ins[i];
+
+ // First check if it's a PS input addr
+ if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
+ !Arg.Flags.isByVal() && PSInputNum <= 15) {
+
+ if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
+ // We can safely skip PS inputs
+ Skipped.set(i);
+ ++PSInputNum;
+ continue;
+ }
+
+ Info->markPSInputAllocated(PSInputNum);
+ if (Arg.Used)
+ Info->PSInputEna |= 1 << PSInputNum;
+
+ ++PSInputNum;
+ }
+
+ if (AMDGPU::isShader(CallConv)) {
+ // Second split vertices into their elements
+ if (Arg.VT.isVector()) {
+ ISD::InputArg NewArg = Arg;
+ NewArg.Flags.setSplit();
+ NewArg.VT = Arg.VT.getVectorElementType();
+
+ // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+ // three or five element vertex only needs three or five registers,
+ // NOT four or eight.
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ for (unsigned j = 0; j != NumElements; ++j) {
+ Splits.push_back(NewArg);
+ NewArg.PartOffset += NewArg.VT.getStoreSize();
+ }
+ } else {
+ Splits.push_back(Arg);
+ }
+ }
+ }
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ // At least one interpolation mode must be enabled or else the GPU will hang.
+ //
+ // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
+ // PSInputAddr, the user wants to enable some bits after the compilation
+ // based on run-time states. Since we can't know what the final PSInputEna
+ // will look like, so we shouldn't do anything here and the user should take
+ // responsibility for the correct programming.
+ //
+ // Otherwise, the following restrictions apply:
+ // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+ // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+ // enabled too.
+ if (CallConv == CallingConv::AMDGPU_PS &&
+ ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
+ // Force PS input 0 on, reserving VGPR0 and VGPR1 for it.
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->PSInputEna |= 1;
+ }
+
+ // Sanity-check which preloaded inputs are expected: compute functions must
+ // have at least work-group ID X and work-item ID X, shaders none of these.
+ if (!AMDGPU::isShader(CallConv)) {
+ assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
+ } else {
+ assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+ !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+ !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+ !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+ !Info->hasWorkItemIDZ());
+ }
+
+ // Each present user SGPR input is added to the function info, registered as
+ // a live-in, and marked allocated so argument assignment skips it.
+ // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+ if (Info->hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ CCInfo.AllocateReg(PrivateSegmentBufferReg);
+ }
+
+ if (Info->hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(DispatchPtrReg);
+ }
+
+ if (Info->hasQueuePtr()) {
+ unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+ MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(QueuePtrReg);
+ }
+
+ if (Info->hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(InputPtrReg);
+ }
+
+ if (Info->hasDispatchID()) {
+ unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+ MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(DispatchIDReg);
+ }
+
+ if (Info->hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(FlatScratchInitReg);
+ }
+
+ // Compute functions analyze the original argument list; shaders analyze the
+ // split (per-element, skipped inputs removed) list built above.
+ if (!AMDGPU::isShader(CallConv))
+ analyzeFormalArgumentsCompute(CCInfo, Ins);
+ else
+ AnalyzeFormalArguments(CCInfo, Splits);
+
+ SmallVector<SDValue, 16> Chains;
+
+ // ArgIdx walks ArgLocs separately from i: skipped inputs have no location
+ // and a split vector argument consumes one location per element.
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+
+ const ISD::InputArg &Arg = Ins[i];
+ if (Skipped[i]) {
+ InVals.push_back(DAG.getUNDEF(Arg.VT));
+ continue;
+ }
+
+ CCValAssign &VA = ArgLocs[ArgIdx++];
+ MVT VT = VA.getLocVT();
+
+ if (VA.isMemLoc()) {
+ VT = Ins[i].VT;
+ EVT MemVT = VA.getLocVT();
+ const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
+ VA.getLocMemOffset();
+ // The first 36 bytes of the input buffer contains information about
+ // thread group and global sizes.
+ SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
+ Offset, Ins[i].Flags.isSExt());
+ Chains.push_back(Arg.getValue(1));
+
+ auto *ParamTy =
+ dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ // On SI local pointers are just offsets into LDS, so they are always
+ // less than 16-bits. On CI and newer they could potentially be
+ // real pointers, so we can't guarantee their size.
+ Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
+ DAG.getValueType(MVT::i16));
+ }
+
+ InVals.push_back(Arg);
+ // Record the end of this argument as the current ABI argument offset.
+ Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+ continue;
+ }
+ assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+ unsigned Reg = VA.getLocReg();
+
+ if (VT == MVT::i64) {
+ // For now assume it is a pointer
+ Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
+ &AMDGPU::SGPR_64RegClass);
+ Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass);
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ InVals.push_back(Copy);
+ continue;
+ }
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+ Reg = MF.addLiveIn(Reg, RC);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+ if (Arg.VT.isVector()) {
+
+ // Build a vector from the registers
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ SmallVector<SDValue, 4> Regs;
+ Regs.push_back(Val);
+ for (unsigned j = 1; j != NumElements; ++j) {
+ Reg = ArgLocs[ArgIdx++].getLocReg();
+ Reg = MF.addLiveIn(Reg, RC);
+
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ Regs.push_back(Copy);
+ }
+
+ // Fill up the missing vector elements
+ NumElements = Arg.VT.getVectorNumElements() - NumElements;
+ Regs.append(NumElements, DAG.getUNDEF(VT));
+
+ InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
+ continue;
+ }
+
+ InVals.push_back(Val);
+ }
+
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+
+ // Start adding system SGPRs.
+ if (Info->hasWorkGroupIDX()) {
+ unsigned Reg = Info->addWorkGroupIDX();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupIDY()) {
+ unsigned Reg = Info->addWorkGroupIDY();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupIDZ()) {
+ unsigned Reg = Info->addWorkGroupIDZ();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupInfo()) {
+ unsigned Reg = Info->addWorkGroupInfo();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasPrivateSegmentWaveByteOffset()) {
+ // Scratch wave offset passed in system SGPR.
+ unsigned PrivateSegmentWaveByteOffsetReg;
+
+ // Shaders take the first SGPR still free after argument assignment;
+ // compute functions let the function info pick the register.
+ if (AMDGPU::isShader(CallConv)) {
+ PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+ Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+ } else
+ PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
+
+ MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+ }
+
+ // Now that we've figured out where the scratch register inputs are, see if
+ // should reserve the arguments and use them directly.
+ bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
+ // Record that we know we have non-spill stack objects so we don't need to
+ // check all stack objects later.
+ if (HasStackObjects)
+ Info->setHasNonSpillStackObjects(true);
+
+ // Everything live out of a block is spilled with fast regalloc, so it's
+ // almost certain that spilling will be required.
+ if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ HasStackObjects = true;
+
+ if (ST.isAmdCodeObjectV2()) {
+ if (HasStackObjects) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the Code Object V2 ABI, this will be the first 4 user
+ // SGPR inputs. We can reserve those and use them directly.
+
+ unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ } else {
+ unsigned ReservedBufferReg
+ = TRI->reservedPrivateSegmentBufferReg(MF);
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+ // We tentatively reserve the last registers (skipping the last two
+ // which may contain VCC). After register allocation, we'll replace
+ // these with the ones immediately after those which were really
+ // allocated. In the prologue copies will be inserted from the argument
+ // to these reserved registers.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ } else {
+ unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+
+ // Without HSA, relocations are used for the scratch pointer and the
+ // buffer resource setup is always inserted in the prologue. Scratch wave
+ // offset is still in an input SGPR.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+
+ if (HasStackObjects) {
+ unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ } else {
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ }
+
+ // Work-item IDs arrive in preloaded VGPRs; register each one that is used.
+ if (Info->hasWorkItemIDX()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkItemIDY()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkItemIDZ()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ // No argument was loaded from memory: the incoming chain is unchanged.
+ if (Chains.empty())
+ return Chain;
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+}
+
+/// Lower a function return.
+///
+/// Non-shader calling conventions are delegated to AMDGPUTargetLowering. For
+/// shaders, vector return values are split into scalar elements, each value
+/// is copied into the register assigned by the return-value analysis, and an
+/// ENDPGM node (when there are no return values) or a RETURN node is built.
+SDValue
+SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::OutputArg> &Outs,
+                              const SmallVectorImpl<SDValue> &OutVals,
+                              const SDLoc &DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+  if (!AMDGPU::isShader(CallConv))
+    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
+                                             OutVals, DL, DAG);
+
+  FuncInfo->setIfReturnsVoid(Outs.empty());
+
+  SmallVector<ISD::OutputArg, 48> ScalarOuts;
+  SmallVector<SDValue, 48> ScalarVals;
+
+  // Split vectors into their elements; scalars pass through unchanged.
+  for (unsigned OutIdx = 0, NumOuts = Outs.size(); OutIdx != NumOuts;
+       ++OutIdx) {
+    const ISD::OutputArg &Out = Outs[OutIdx];
+
+    if (!Out.VT.isVector()) {
+      ScalarVals.push_back(OutVals[OutIdx]);
+      ScalarOuts.push_back(Out);
+      continue;
+    }
+
+    MVT EltVT = Out.VT.getVectorElementType();
+    ISD::OutputArg EltOut = Out;
+    EltOut.Flags.setSplit();
+    EltOut.VT = EltVT;
+
+    // We want the original number of vector elements here, e.g.
+    // three or five, not four or eight.
+    const unsigned NumElts = Out.ArgVT.getVectorNumElements();
+
+    for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
+      SDValue EltIdx = DAG.getConstant(Elt, DL, MVT::i32);
+      ScalarVals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+                                       OutVals[OutIdx], EltIdx));
+      ScalarOuts.push_back(EltOut);
+      EltOut.PartOffset += EltOut.VT.getStoreSize();
+    }
+  }
+
+  // RetLocs receives the location assigned to each scalar return value;
+  // CCInfo tracks the registers and stack slots.
+  SmallVector<CCValAssign, 48> RetLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RetLocs, *DAG.getContext());
+
+  // Analyze outgoing return values.
+  AnalyzeReturn(CCInfo, ScalarOuts);
+
+  SDValue Glue;
+  SmallVector<SDValue, 48> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+
+  // Copy the result values into the output registers. Locations and split
+  // values are visited in lock step.
+  for (unsigned LocIdx = 0; LocIdx != RetLocs.size(); ++LocIdx) {
+    CCValAssign &VA = RetLocs[LocIdx];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue RetVal = ScalarVals[LocIdx];
+
+    // Copied from other backends.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      RetVal = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), RetVal);
+      break;
+    default:
+      llvm_unreachable("Unknown loc info!");
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), RetVal, Glue);
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // Update chain and glue.
+  RetOps[0] = Chain;
+  if (Glue.getNode())
+    RetOps.push_back(Glue);
+
+  unsigned Opc;
+  if (FuncInfo->returnsVoid())
+    Opc = AMDGPUISD::ENDPGM;
+  else
+    Opc = AMDGPUISD::RETURN;
+
+  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
+}
+
+/// Map a register name to the corresponding physical register.
+///
+/// Recognised names are m0, exec, exec_lo, exec_hi, flat_scratch,
+/// flat_scratch_lo and flat_scratch_hi. A fatal error is reported for an
+/// unknown name, for a register overlapping FLAT_SCR on SOUTHERN_ISLANDS,
+/// and when the size of \p VT does not match the register (32 bits for m0
+/// and the _lo/_hi halves, 64 bits for exec and flat_scratch).
+unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                             SelectionDAG &DAG) const {
+  const unsigned Reg = StringSwitch<unsigned>(RegName)
+    .Case("m0", AMDGPU::M0)
+    .Case("exec", AMDGPU::EXEC)
+    .Case("exec_lo", AMDGPU::EXEC_LO)
+    .Case("exec_hi", AMDGPU::EXEC_HI)
+    .Case("flat_scratch", AMDGPU::FLAT_SCR)
+    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+    .Default(AMDGPU::NoRegister);
+
+  if (Reg == AMDGPU::NoRegister)
+    report_fatal_error(Twine("invalid register name \""
+                             + StringRef(RegName) + "\"."));
+
+  const bool IsSI =
+      Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS;
+  if (IsSI && Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR))
+    report_fatal_error(Twine("invalid register \""
+                             + StringRef(RegName) + "\" for subtarget."));
+
+  // Width, in bits, that the requested type must have for this register.
+  unsigned RequiredBits = 0;
+  switch (Reg) {
+  case AMDGPU::M0:
+  case AMDGPU::EXEC_LO:
+  case AMDGPU::EXEC_HI:
+  case AMDGPU::FLAT_SCR_LO:
+  case AMDGPU::FLAT_SCR_HI:
+    RequiredBits = 32;
+    break;
+  case AMDGPU::EXEC:
+  case AMDGPU::FLAT_SCR:
+    RequiredBits = 64;
+    break;
+  default:
+    llvm_unreachable("missing register type checking");
+  }
+
+  if (VT.getSizeInBits() == RequiredBits)
+    return Reg;
+
+  report_fatal_error(Twine("invalid type for register \""
+                           + StringRef(RegName) + "\"."));
+}
+
+// Turn the kill \p MI into SI_KILL_TERMINATOR. If instructions follow it in
+// \p BB, they are moved (together with BB's successors) into a new block
+// inserted right after BB, so the kill is always the last instruction of its
+// block. Returns the block holding the code that followed the kill, which is
+// BB itself when nothing followed it.
+MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
+                                                    MachineBasicBlock *BB) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  MachineBasicBlock::iterator AfterKill(&MI);
+  ++AfterKill;
+
+  const bool KillIsLast = (AfterKill == BB->end());
+  if (!KillIsLast) {
+    MachineFunction *MF = BB->getParent();
+    MachineBasicBlock *TailBB =
+        MF->CreateMachineBasicBlock(BB->getBasicBlock());
+
+    // Place the tail block directly after BB and move the trailing
+    // instructions into it.
+    MF->insert(++MachineFunction::iterator(BB), TailBB);
+    TailBB->splice(TailBB->begin(), BB, AfterKill, BB->end());
+
+    // The tail inherits BB's successors; BB now only flows into the tail.
+    TailBB->transferSuccessorsAndUpdatePHIs(BB);
+    BB->addSuccessor(TailBB);
+
+    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+    return TailBB;
+  }
+
+  // Don't bother with a new block.
+  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+  return BB;
+}
+
+// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
+// wavefront. If the value is uniform and just happens to be in a VGPR, this
+// will only do one iteration. In the worst case, this will loop 64 times.
+//
+// This function only emits the loop skeleton into \p LoopBB (PHIs, index
+// read-out, EXEC masking and the back-branch); the indexed move itself is not
+// emitted here. \p PhiReg is defined by a PHI of \p InitReg (incoming from
+// \p OrigBB) and \p ResultReg (incoming from \p LoopBB); \p InitSaveExecReg
+// is the \p OrigBB incoming value of the EXEC PHI. With \p UseGPRIdxMode the
+// index (plus \p Offset) is set with S_SET_GPR_IDX_IDX, otherwise it is
+// written to M0.
+//
+// Returns an iterator to the S_XOR_B64 that updates EXEC.
+//
+// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
+static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
+ const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI,
+ MachineBasicBlock &OrigBB,
+ MachineBasicBlock &LoopBB,
+ const DebugLoc &DL,
+ const MachineOperand &IdxReg,
+ unsigned InitReg,
+ unsigned ResultReg,
+ unsigned PhiReg,
+ unsigned InitSaveExecReg,
+ int Offset,
+ bool UseGPRIdxMode) {
+ MachineBasicBlock::iterator I = LoopBB.begin();
+
+ unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ // Loop-carried value: InitReg on entry, ResultReg on the back edge.
+ BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
+ .addReg(InitReg)
+ .addMBB(&OrigBB)
+ .addReg(ResultReg)
+ .addMBB(&LoopBB);
+
+ // Loop-carried exec value: InitSaveExecReg on entry, NewExec on the back
+ // edge.
+ BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
+ .addReg(InitSaveExecReg)
+ .addMBB(&OrigBB)
+ .addReg(NewExec)
+ .addMBB(&LoopBB);
+
+ // Read the next variant <- also loop target.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
+ .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+ // Compare the just read M0 value to all possible Idx values.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
+ .addReg(CurrentIdxReg)
+ .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
+
+ if (UseGPRIdxMode) {
+ // NOTE: this local IdxReg (a register number) shadows the IdxReg operand
+ // parameter for the rest of this scope.
+ unsigned IdxReg;
+ if (Offset == 0) {
+ IdxReg = CurrentIdxReg;
+ } else {
+ IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
+ .addReg(CurrentIdxReg, RegState::Kill)
+ .addImm(Offset);
+ }
+
+ MachineInstr *SetIdx =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
+ .addReg(IdxReg, RegState::Kill);
+ // NOTE(review): operand 2 is presumably an implicit operand added by the
+ // instruction description (M0?) -- confirm against S_SET_GPR_IDX_IDX.
+ SetIdx->getOperand(2).setIsUndef();
+ } else {
+ // Move the index just read into CurrentIdxReg (plus Offset, if any) into M0
+ if (Offset == 0) {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(CurrentIdxReg, RegState::Kill);
+ } else {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(CurrentIdxReg, RegState::Kill)
+ .addImm(Offset);
+ }
+ }
+
+ // Update EXEC, save the original EXEC value to NewExec.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+ .addReg(CondReg, RegState::Kill);
+
+ // Hint the register allocator to give NewExec the same register as CondReg.
+ MRI.setSimpleHint(NewExec, CondReg);
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ MachineInstr *InsertPt =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(NewExec);
+
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+ // s_cbranch_scc0?
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&LoopBB);
+
+ return InsertPt->getIterator();
+}
+
+// Split \p MBB at \p MI and insert a loop block between the two halves; the
+// loop's contents are emitted by emitLoadM0FromVGPRLoop using the "idx"
+// operand of \p MI. EXEC is saved before the loop and restored at the start
+// of the remainder block. Returns the insertion point handed back by
+// emitLoadM0FromVGPRLoop.
+//
+// This has slightly sub-optimal regalloc when the source vector is killed by
+// the read. The register allocator does not understand that the kill is
+// per-workitem, so is kept alive for the whole loop so we end up not re-using a
+// subregister from it, using 1 more VGPR than necessary. This was saved when
+// this was expanded after register allocation.
+static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
+ MachineBasicBlock &MBB,
+ MachineInstr &MI,
+ unsigned InitResultReg,
+ unsigned PhiReg,
+ int Offset,
+ bool UseGPRIdxMode) {
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ // TmpExec is only an IMPLICIT_DEF; it is passed below as the initial
+ // incoming value of the loop's exec PHI.
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
+
+ // Save the EXEC mask
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
+ .addReg(AMDGPU::EXEC);
+
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ // Both blocks are inserted before the block that followed MBB, giving the
+ // layout MBB, LoopBB, RemainderBB.
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(RemainderBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+
+ auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
+ InitResultReg, DstReg, PhiReg, TmpExec,
+ Offset, UseGPRIdxMode);
+
+ // Restore the EXEC mask saved above as the first instruction of the
+ // remainder block.
+ MachineBasicBlock::iterator First = RemainderBB->begin();
+ BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(SaveExec);
+
+ return InsPt;
+}
+
+/// Resolves a constant element \p Offset against the vector register class
+/// \p SuperRC.
+///
+/// \returns a (subregister index, remaining offset) pair. An in-bounds offset
+/// is folded into the subregister index and the remaining offset is 0. An
+/// out-of-bounds offset is handed back unchanged together with sub0, so that
+/// no undefined register ends up being used.
+static std::pair<unsigned, int>
+computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
+                            const TargetRegisterClass *SuperRC,
+                            unsigned VecReg,
+                            int Offset) {
+  // Element count, taking each element to be 4 units of getSize().
+  const int ElementCount = SuperRC->getSize() / 4;
+  const bool InBounds = Offset >= 0 && Offset < ElementCount;
+
+  if (InBounds)
+    return std::make_pair(AMDGPU::sub0 + Offset, 0);
+
+  return std::make_pair(AMDGPU::sub0, Offset);
+}
+
+/// Programs the indexing register from the `idx` operand of \p MI when that
+/// operand is an SGPR. All instructions are inserted immediately before \p MI.
+///
+/// With \p UseGPRIdxMode, S_SET_GPR_IDX_ON is emitted with SRC0_ENABLE or
+/// DST_ENABLE depending on \p IsIndirectSrc; otherwise M0 is written directly.
+/// A non-zero \p Offset is added to the index with S_ADD_I32 first.
+///
+/// \returns true if the index is an SGPR and was set; false, with nothing
+/// emitted, if the index register is not in an SGPR class.
+static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
+                                 MachineRegisterInfo &MRI,
+                                 MachineInstr &MI,
+                                 int Offset,
+                                 bool UseGPRIdxMode,
+                                 bool IsIndirectSrc) {
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
+
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+
+  // Check the register before looking up its class, so that a missing index
+  // is reported by this assertion rather than from inside getRegClass().
+  assert(Idx->getReg() != AMDGPU::NoRegister);
+  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
+
+  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
+    return false;
+
+  if (UseGPRIdxMode) {
+    unsigned IdxMode = IsIndirectSrc ?
+      VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+    if (Offset == 0) {
+      MachineInstr *SetOn =
+        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+        .addOperand(*Idx)
+        .addImm(IdxMode);
+
+      // NOTE(review): operand 3 is presumably an implicit operand of
+      // S_SET_GPR_IDX_ON -- confirm against the instruction definition.
+      SetOn->getOperand(3).setIsUndef();
+    } else {
+      // Fold the constant offset into a temporary index first.
+      unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
+        .addOperand(*Idx)
+        .addImm(Offset);
+      MachineInstr *SetOn =
+        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+        .addReg(Tmp, RegState::Kill)
+        .addImm(IdxMode);
+
+      SetOn->getOperand(3).setIsUndef();
+    }
+
+    return true;
+  }
+
+  // M0 mode: copy the index, or index + offset, straight into M0.
+  if (Offset == 0) {
+    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+      .addOperand(*Idx);
+  } else {
+    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+      .addOperand(*Idx)
+      .addImm(Offset);
+  }
+
+  return true;
+}
+
+// Expands an SI_INDIRECT_SRC_* pseudo. Control flow (a loop) needs to be inserted if indexing with a VGPR.
+static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ const SISubtarget &ST) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+
+ const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
+
+ unsigned SubReg;
+ std::tie(SubReg, Offset)
+ = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); // Offset becomes 0 when it was folded into SubReg.
+
+ bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+
+ if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) { // SGPR index: emitted in place, no loop.
+ MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (UseGPRIdxMode) {
+ // TODO: Look at the uses to avoid the copy. This may require rescheduling
+ // to avoid interfering with other uses, so probably requires a new
+ // optimization pass.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); // Turn index mode off again after the move.
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, RegState::Implicit);
+ }
+
+ MI.eraseFromParent();
+
+ return &MBB;
+ }
+
+
+ const DebugLoc &DL = MI.getDebugLoc(); // From here on the index is not an SGPR.
+ MachineBasicBlock::iterator I(&MI);
+
+ unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); // InitReg is the initial result value passed to loadM0FromVGPR.
+
+ if (UseGPRIdxMode) {
+ MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addImm(0) // Reset inside loop.
+ .addImm(VGPRIndexMode::SRC0_ENABLE);
+ SetOn->getOperand(3).setIsUndef();
+
+ // Disable again after the loop (inserted right after MI, before the block is split).
+ BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ }
+
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
+ MachineBasicBlock *LoopBB = InsPt->getParent();
+
+ if (UseGPRIdxMode) {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+ } else {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, RegState::Implicit);
+ }
+
+ MI.eraseFromParent();
+
+ return LoopBB; // The block that now holds the indexed move.
+}
+
+/// Maps the size of the vector register class \p VecRC to the
+/// V_MOVRELD_B32_V<N> pseudo opcode (N = 1, 2, 4, 8 or 16 for sizes 4, 8, 16,
+/// 32 and 64). Any other size is unreachable.
+static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) {
+  const auto Size = VecRC->getSize();
+
+  if (Size == 4)
+    return AMDGPU::V_MOVRELD_B32_V1;
+  if (Size == 8)
+    return AMDGPU::V_MOVRELD_B32_V2;
+  if (Size == 16)
+    return AMDGPU::V_MOVRELD_B32_V4;
+  if (Size == 32)
+    return AMDGPU::V_MOVRELD_B32_V8;
+  if (Size == 64)
+    return AMDGPU::V_MOVRELD_B32_V16;
+
+  llvm_unreachable("unsupported size for MOVRELD pseudos");
+}
+
+static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, // Expands an SI_INDIRECT_DST_* pseudo; returns the block holding the expansion.
+ MachineBasicBlock &MBB,
+ const SISubtarget &ST) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned Dst = MI.getOperand(0).getReg();
+ const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+ const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
+
+ // This can be an immediate, but will be folded later.
+ assert(Val->getReg());
+
+ unsigned SubReg;
+ std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
+ SrcVec->getReg(),
+ Offset); // Offset becomes 0 when it was folded into SubReg.
+ bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+
+ if (Idx->getReg() == AMDGPU::NoRegister) { // No index register: a plain INSERT_SUBREG is emitted.
+ MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ assert(Offset == 0);
+
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
+ .addOperand(*SrcVec)
+ .addOperand(*Val)
+ .addImm(SubReg);
+
+ MI.eraseFromParent();
+ return &MBB;
+ }
+
+ if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) { // SGPR index: emitted in place, no loop.
+ MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (UseGPRIdxMode) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
+ .addOperand(*Val)
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(SrcVec->getReg(), RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); // Turn index mode off again after the move.
+ } else {
+ const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+
+ BuildMI(MBB, I, DL, MovRelDesc)
+ .addReg(Dst, RegState::Define)
+ .addReg(SrcVec->getReg())
+ .addOperand(*Val)
+ .addImm(SubReg - AMDGPU::sub0); // Element number relative to sub0.
+ }
+
+ MI.eraseFromParent();
+ return &MBB;
+ }
+
+ if (Val->isReg())
+ MRI.clearKillFlags(Val->getReg()); // NOTE(review): presumably because Val is used inside the loop emitted below -- confirm.
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (UseGPRIdxMode) {
+ MachineBasicBlock::iterator I(&MI);
+
+ MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addImm(0) // Reset inside loop.
+ .addImm(VGPRIndexMode::DST_ENABLE);
+ SetOn->getOperand(3).setIsUndef();
+
+ // Disable again after the loop (inserted right after MI, before the block is split).
+ BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ }
+
+ unsigned PhiReg = MRI.createVirtualRegister(VecRC);
+
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
+ Offset, UseGPRIdxMode); // The source vector is the initial result value.
+ MachineBasicBlock *LoopBB = InsPt->getParent();
+
+ if (UseGPRIdxMode) {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
+ .addReg(PhiReg, RegState::Undef, SubReg) // vdst
+ .addOperand(*Val) // src0
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(PhiReg, RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+ } else {
+ const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+
+ BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
+ .addReg(Dst, RegState::Define)
+ .addReg(PhiReg)
+ .addOperand(*Val)
+ .addImm(SubReg - AMDGPU::sub0); // Element number relative to sub0.
+ }
+
+ MI.eraseFromParent();
+
+ return LoopBB; // The block that now holds the indexed write.
+}
+
+MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( // Expands SI pseudos; returns the block in which insertion continues.
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ MachineFunction *MF = BB->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+ if (TII->isMIMG(MI)) {
+ if (!MI.memoperands_empty()) // Already has a memoperand: nothing to add.
+ return BB;
+ // Add a memoperand for mimg instructions so that they aren't assumed to
+ // be ordered memory instructions.
+
+ MachinePointerInfo PtrInfo(MFI->getImagePSV());
+ MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
+ if (MI.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+
+ if (MI.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+
+ auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
+ MI.addMemOperand(*MF, MMO);
+ return BB;
+ }
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_INIT_M0: { // Becomes an S_MOV_B32 that writes M0 directly.
+ BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addOperand(MI.getOperand(0));
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::GET_GROUPSTATICSIZE: { // Becomes an S_MOV_B32 of the LDS size reported by MFI.
+ DebugLoc DL = MI.getDebugLoc();
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
+ .addOperand(MI.getOperand(0))
+ .addImm(MFI->getLDSSize());
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_INDIRECT_SRC_V1:
+ case AMDGPU::SI_INDIRECT_SRC_V2:
+ case AMDGPU::SI_INDIRECT_SRC_V4:
+ case AMDGPU::SI_INDIRECT_SRC_V8:
+ case AMDGPU::SI_INDIRECT_SRC_V16:
+ return emitIndirectSrc(MI, *BB, *getSubtarget());
+ case AMDGPU::SI_INDIRECT_DST_V1:
+ case AMDGPU::SI_INDIRECT_DST_V2:
+ case AMDGPU::SI_INDIRECT_DST_V4:
+ case AMDGPU::SI_INDIRECT_DST_V8:
+ case AMDGPU::SI_INDIRECT_DST_V16:
+ return emitIndirectDst(MI, *BB, *getSubtarget());
+ case AMDGPU::SI_KILL:
+ return splitKillBlock(MI, BB);
+ case AMDGPU::V_CNDMASK_B64_PSEUDO: { // Split into two 32-bit selects on sub0/sub1, then recombine.
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src0 = MI.getOperand(1).getReg();
+ unsigned Src1 = MI.getOperand(2).getReg();
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned SrcCond = MI.getOperand(3).getReg();
+
+ unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
+ .addReg(Src0, 0, AMDGPU::sub0)
+ .addReg(Src1, 0, AMDGPU::sub0)
+ .addReg(SrcCond);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
+ .addReg(Src0, 0, AMDGPU::sub1)
+ .addReg(Src1, 0, AMDGPU::sub1)
+ .addReg(SrcCond);
+
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(DstLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(DstHi)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_BR_UNDEF: { // Becomes an S_CBRANCH_SCC1 whose SCC read is marked undef.
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ .addOperand(MI.getOperand(0));
+ Br->getOperand(1).setIsUndef(true); // read undef SCC
+ MI.eraseFromParent();
+ return BB;
+ }
+ default: // Anything else is left to the common AMDGPU implementation.
+ return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+ }
+}
+
+bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+ // This currently forces unfolding various combinations of fsub into fma with
+ // free fneg'd operands. As long as we have fast FMA (controlled by
+ // isFMAFasterThanFMulAndFAdd), we should perform these.
+
+ // When fma is quarter rate, for f64 where add / sub are at best half rate,
+ // most of these combines appear to be cycle neutral but save on instruction
+ // count / code size.
+ return true; // Enabled unconditionally; VT is not consulted.
+}
+
+/// Result type of a SETCC on \p VT: i1 for a non-vector type, and a vector of
+/// i1 with the same number of elements for a vector type.
+EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+                                         EVT VT) const {
+  if (VT.isVector())
+    return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
+
+  return MVT::i1;
+}
+
+/// Shift amounts are i16 when the shifted type \p VT is i16, and i32 for every
+/// other type.
+MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
+  // TODO: Should i16 be used always if legal? For now it would force VALU
+  // shifts.
+  if (VT == MVT::i16)
+    return MVT::i16;
+
+  return MVT::i32;
+}
+
+// Whether fma beats a separate multiply and add depends on the device, since
+// devices differ in their rates for fma and for f64 operations in general.
+//
+// f64: v_fma_f64 and v_mul_f64 always take the same number of cycles as each
+// other on a given device (the count itself differs between devices), so fma
+// is always profitable.
+//
+// f32: v_fma_f32 takes 4 or 16 cycles depending on the device, so it is only
+// profitable on full rate devices. Normally v_mad_f32 is preferred: it can be
+// selected even without fused FP ops, because it returns the same result as
+// the separate operations, and it is always full rate. Therefore we lie and
+// report fma as not faster for f32. However, v_mad_f32 does not support
+// denormals, so fma is reported as faster when the device has fast fma and
+// denormals are required.
+//
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  const EVT ScalarVT = VT.getScalarType();
+  if (!ScalarVT.isSimple())
+    return false;
+
+  const auto Ty = ScalarVT.getSimpleVT().SimpleTy;
+
+  if (Ty == MVT::f64)
+    return true;
+
+  // Full rate f32 mad is always available and is preferred over fma, but it
+  // cannot be used when denormals must be supported, so only report fma as
+  // faster in that case (and only where fma itself is fast).
+  if (Ty == MVT::f32)
+    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
+
+  if (Ty == MVT::f16)
+    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+
+/// Dispatches custom lowering of \p Op to the SI-specific handler for its
+/// opcode. Opcodes without a handler here are forwarded to
+/// AMDGPUTargetLowering::LowerOperation.
+SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::BRCOND:
+    return LowerBRCOND(Op, DAG);
+  case ISD::LOAD: {
+    SDValue Lowered = LowerLOAD(Op, DAG);
+    // An empty result is allowed; a real one must carry value and chain.
+    assert((!Lowered.getNode() ||
+            Lowered.getNode()->getNumValues() == 2) &&
+           "Load should return a value and a chain");
+    return Lowered;
+  }
+  case ISD::FSIN:
+  case ISD::FCOS:
+    return LowerTrig(Op, DAG);
+  case ISD::SELECT:
+    return LowerSELECT(Op, DAG);
+  case ISD::FDIV:
+    return LowerFDIV(Op, DAG);
+  case ISD::ATOMIC_CMP_SWAP:
+    return LowerATOMIC_CMP_SWAP(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
+  case ISD::GlobalAddress: {
+    SIMachineFunctionInfo *FuncInfo =
+        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
+    return LowerGlobalAddress(FuncInfo, Op, DAG);
+  }
+  case ISD::INTRINSIC_WO_CHAIN:
+    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:
+    return LowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+    return LowerINTRINSIC_VOID(Op, DAG);
+  case ISD::ADDRSPACECAST:
+    return lowerADDRSPACECAST(Op, DAG);
+  case ISD::TRAP:
+    return lowerTRAP(Op, DAG);
+  case ISD::FP_ROUND:
+    return lowerFP_ROUND(Op, DAG);
+  default:
+    break;
+  }
+
+  // No SI-specific handler: defer to the common AMDGPU lowering.
+  return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+}
+
+/// \brief Helper function for LowerBRCOND
+///
+/// Walks the uses of the node that defines \p Value and returns the first user
+/// that both refers to \p Value itself and has opcode \p Opcode, or nullptr if
+/// no such user exists.
+static SDNode *findUser(SDValue Value, unsigned Opcode) {
+  SDNode *Def = Value.getNode();
+
+  for (SDNode::use_iterator UI = Def->use_begin(), UE = Def->use_end();
+       UI != UE; ++UI) {
+    // Skip uses that refer to a different value of the defining node.
+    const bool UsesValue = UI.getUse().get() == Value;
+    if (UsesValue && UI->getOpcode() == Opcode)
+      return *UI;
+  }
+
+  return nullptr;
+}
+
+/// Returns true if \p Intr is one of the control flow intrinsics: amdgcn_if,
+/// amdgcn_else, amdgcn_end_cf or amdgcn_loop as INTRINSIC_W_CHAIN, or
+/// amdgcn_break, amdgcn_if_break or amdgcn_else_break as INTRINSIC_WO_CHAIN.
+bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+  const unsigned Opc = Intr->getOpcode();
+
+  if (Opc == ISD::INTRINSIC_W_CHAIN) {
+    // With a chain, the intrinsic ID is operand 1.
+    const uint64_t ID =
+        cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue();
+    return ID == AMDGPUIntrinsic::amdgcn_if ||
+           ID == AMDGPUIntrinsic::amdgcn_else ||
+           ID == AMDGPUIntrinsic::amdgcn_end_cf ||
+           ID == AMDGPUIntrinsic::amdgcn_loop;
+  }
+
+  if (Opc == ISD::INTRINSIC_WO_CHAIN) {
+    // Without a chain, the intrinsic ID is operand 0.
+    const uint64_t ID =
+        cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue();
+    return ID == AMDGPUIntrinsic::amdgcn_break ||
+           ID == AMDGPUIntrinsic::amdgcn_if_break ||
+           ID == AMDGPUIntrinsic::amdgcn_else_break;
+  }
+
+  return false;
+}
+
+/// Creates the fixed stack objects used when emitting the debugger prologue
+/// and records their indices in the function info.
+///
+/// The debugger prologue writes work group IDs and work item IDs to scratch
+/// memory at fixed locations:
+///   offset  0: work group ID x
+///   offset  4: work group ID y
+///   offset  8: work group ID z
+///   offset 16: work item ID x
+///   offset 20: work item ID y
+///   offset 24: work item ID z
+void SITargetLowering::createDebuggerPrologueStackObjects(
+    MachineFunction &MF) const {
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  auto &FrameInfo = MF.getFrameInfo();
+
+  // One 4-byte slot per dimension for each kind of ID.
+  for (unsigned Dim = 0; Dim != 3; ++Dim) {
+    const int GroupIDIdx = FrameInfo.CreateFixedObject(4, Dim * 4, true);
+    Info->setDebuggerWorkGroupIDStackObjectIndex(Dim, GroupIDIdx);
+
+    const int ItemIDIdx = FrameInfo.CreateFixedObject(4, Dim * 4 + 16, true);
+    Info->setDebuggerWorkItemIDStackObjectIndex(Dim, ItemIDIdx);
+  }
+}
+
+/// Returns true if \p GV is in the constant address space and constants are
+/// emitted to the text section for the target triple.
+bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
+  if (GV->getType()->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return false;
+
+  return AMDGPU::shouldEmitConstantsToTextSection(
+      getTargetMachine().getTargetTriple());
+}
+
+/// Returns true if \p GV is a global or constant address space value that does
+/// not get a fixup and cannot be assumed to be DSO-local.
+bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
+  const unsigned AS = GV->getType()->getAddressSpace();
+  if (AS != AMDGPUAS::GLOBAL_ADDRESS && AS != AMDGPUAS::CONSTANT_ADDRESS)
+    return false;
+
+  if (shouldEmitFixup(GV))
+    return false;
+
+  return !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+}
+
+/// Returns true if \p GV needs neither a fixup nor a GOT relocation.
+bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
+  if (shouldEmitFixup(GV))
+    return false;
+
+  return !shouldEmitGOTReloc(GV);
+}
+
+/// This transforms the control flow intrinsics to get the branch destination as
+/// last parameter, also switches branch target with BR if the need arises.
+///
+/// \p BRCOND is returned unchanged when its condition does not come from a
+/// control flow intrinsic (see isCFIntrinsic). Otherwise the intrinsic is
+/// rebuilt with the branch target appended and the resulting chain is
+/// returned.
+SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
+                                      SelectionDAG &DAG) const {
+
+  SDLoc DL(BRCOND);
+
+  SDNode *Intr = BRCOND.getOperand(1).getNode();
+  SDValue Target = BRCOND.getOperand(2);
+  SDNode *BR = nullptr;
+  SDNode *SetCC = nullptr;
+
+  if (Intr->getOpcode() == ISD::SETCC) {
+    // As long as we negate the condition everything is fine
+    SetCC = Intr;
+    Intr = SetCC->getOperand(0).getNode();
+
+  } else {
+    // Get the target from BR if we don't negate the condition
+    BR = findUser(BRCOND, ISD::BR);
+    // findUser returns nullptr when there is no such user; make that case
+    // diagnosable instead of silently dereferencing a null pointer.
+    assert(BR && "brcond missing unconditional branch user");
+    Target = BR->getOperand(1);
+  }
+
+  // FIXME: This changes the types of the intrinsics instead of introducing new
+  // nodes with the correct types.
+  // e.g. llvm.amdgcn.loop
+
+  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
+  // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
+
+  if (!isCFIntrinsic(Intr)) {
+    // This is a uniform branch so we don't need to legalize.
+    return BRCOND;
+  }
+
+  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
+                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
+
+  // A SETCC wrapping the intrinsic must be a "!= 1" comparison.
+  assert(!SetCC ||
+        (SetCC->getConstantOperandVal(1) == 1 &&
+         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+                                                             ISD::SETNE));
+
+  // operands of the new intrinsic call
+  SmallVector<SDValue, 4> Ops;
+  if (HaveChain)
+    Ops.push_back(BRCOND.getOperand(0));
+
+  Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
+  Ops.push_back(Target);
+
+  // The new node keeps every result type of the intrinsic except the first.
+  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
+
+  // build the new intrinsic call
+  SDNode *Result = DAG.getNode(
+    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
+    DAG.getVTList(Res), Ops).getNode();
+
+  if (!HaveChain) {
+    SDValue Ops[] = {
+      SDValue(Result, 0),
+      BRCOND.getOperand(0)
+    };
+
+    Result = DAG.getMergeValues(Ops, DL).getNode();
+  }
+
+  if (BR) {
+    // Give the branch instruction our target
+    SDValue Ops[] = {
+      BR->getOperand(0),
+      BRCOND.getOperand(2)
+    };
+    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
+    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
+    BR = NewBR.getNode();
+  }
+
+  // The chain is the last value of the rebuilt node.
+  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
+
+  // Copy the intrinsic results to registers
+  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
+    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
+    if (!CopyToReg)
+      continue;
+
+    // Result i of the old intrinsic is result i - 1 of the new node.
+    Chain = DAG.getCopyToReg(
+      Chain, DL,
+      CopyToReg->getOperand(1),
+      SDValue(Result, i - 1),
+      SDValue());
+
+    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
+  }
+
+  // Remove the old intrinsic from the chain
+  DAG.ReplaceAllUsesOfValueWith(
+    SDValue(Intr, Intr->getNumValues() - 1),
+    Intr->getOperand(0));
+
+  return Chain;
+}
+
+/// Converts the floating-point value \p Op to the type \p VT.
+///
+/// A source that is no wider than \p VT is converted with FP_EXTEND; a wider
+/// source is narrowed with FP_ROUND. FTRUNC must not be used for narrowing: it
+/// rounds to an integral value and does not change the type.
+SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
+                                            SDValue Op,
+                                            const SDLoc &DL,
+                                            EVT VT) const {
+  if (Op.getValueType().bitsLE(VT))
+    return DAG.getNode(ISD::FP_EXTEND, DL, VT, Op);
+
+  // The second FP_ROUND operand is the truncation flag; 0 makes no promise
+  // that the value is preserved by the rounding.
+  return DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
+                     DAG.getTargetConstant(0, DL, MVT::i32));
+}
+
+/// Custom lowering of FP_ROUND to f16.
+///
+/// Only an f64 source is rewritten: it is converted with FP_TO_FP16 to an i32,
+/// truncated to i16 and bitcast to f16. For any other source type \p Op is
+/// returned unchanged.
+SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::f16 &&
+         "Do not know how to custom lower FP_ROUND for non-f16 type");
+
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  if (SrcVT != MVT::f64)
+    return Op;
+
+  SDLoc DL(Op);
+
+  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
+  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
+}
+
+/// Builds an i32 load of the aperture for address space \p AS from the queue
+/// pointer. The local address space reads offset 0x40, every other address
+/// space reads offset 0x44.
+SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+                                             SelectionDAG &DAG) const {
+  SDLoc SL;
+  SIMachineFunctionInfo *FuncInfo =
+      DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
+  unsigned QueuePtrReg = FuncInfo->getQueuePtrUserSGPR();
+  assert(QueuePtrReg != AMDGPU::NoRegister);
+
+  SDValue QueuePtr = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, QueuePtrReg, MVT::i64);
+
+  // Offset into amd_queue_t for group_segment_aperture_base_hi /
+  // private_segment_aperture_base_hi.
+  uint32_t StructOffset = 0x44;
+  if (AS == AMDGPUAS::LOCAL_ADDRESS)
+    StructOffset = 0x40;
+
+  SDValue OffsetVal = DAG.getConstant(StructOffset, SL, MVT::i64);
+  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, OffsetVal);
+
+  // TODO: Use custom target PseudoSourceValue.
+  // TODO: We should use the value from the IR intrinsic call, but it might not
+  // be available and how do we get it?
+  Type *Int8Ty = Type::getInt8Ty(*DAG.getContext());
+  Value *V =
+      UndefValue::get(PointerType::get(Int8Ty, AMDGPUAS::CONSTANT_ADDRESS));
+
+  MachinePointerInfo PtrInfo(V, StructOffset);
+  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
+                     MinAlign(64, StructOffset),
+                     MachineMemOperand::MODereferenceable |
+                         MachineMemOperand::MOInvariant);
+}
+
+/// Lowers an address space cast between the flat address space and the local
+/// or private address space. A null source pointer maps to the null value of
+/// the destination space. Any other combination is diagnosed as an invalid
+/// addrspacecast and lowered to undef.
+SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
+
+  SDValue Src = ASC->getOperand(0);
+
+  // FIXME: Really support non-0 null pointers.
+  SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
+  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
+  const unsigned SrcAS = ASC->getSrcAddressSpace();
+  const unsigned DestAS = ASC->getDestAddressSpace();
+  auto IsSegment = [](unsigned AS) {
+    return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS;
+  };
+
+  // flat -> local/private: keep the low 32 bits unless the pointer is null.
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS && IsSegment(DestAS)) {
+    SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
+    SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+    return DAG.getNode(ISD::SELECT, SL, MVT::i32,
+                       NonNull, Ptr, SegmentNullPtr);
+  }
+
+  // local/private -> flat: pair the pointer with the segment aperture.
+  if (DestAS == AMDGPUAS::FLAT_ADDRESS && IsSegment(SrcAS)) {
+    SDValue NonNull =
+        DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
+
+    SDValue Aperture = getSegmentAperture(SrcAS, DAG);
+    SDValue CvtPtr =
+        DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+    SDValue FlatPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+
+    return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, FlatPtr,
+                       FlatNullPtr);
+  }
+
+  // global <-> flat are no-ops and never emitted.
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+      *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
+  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
+
+  return DAG.getUNDEF(ASC->getValueType(0));
+}
+
+/// Offsets can be folded into global and constant address space globals, as
+/// long as the global does not require a GOT relocation.
+bool
+SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  const unsigned AS = GA->getAddressSpace();
+  if (AS != AMDGPUAS::GLOBAL_ADDRESS && AS != AMDGPUAS::CONSTANT_ADDRESS)
+    return false;
+
+  return !shouldEmitGOTReloc(GA->getGlobal());
+}
+
+/// Builds a PC_ADD_REL_OFFSET node addressing \p GV plus \p Offset.
+///
+/// \p GAFlags is the target flag for the low half of the address; unless it is
+/// MO_NONE, the high half uses \p GAFlags + 1.
+static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
+                                       SDLoc DL, unsigned Offset, EVT PtrVT,
+                                       unsigned GAFlags = SIInstrInfo::MO_NONE) {
+  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
+  // lowered to the following code sequence:
+  //
+  // For constant address space:
+  //   s_getpc_b64 s[0:1]
+  //   s_add_u32 s0, s0, $symbol
+  //   s_addc_u32 s1, s1, 0
+  //
+  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
+  //   a fixup or relocation is emitted to replace $symbol with a literal
+  //   constant, which is a pc-relative offset from the encoding of the $symbol
+  //   operand to the global variable.
+  //
+  // For global address space:
+  //   s_getpc_b64 s[0:1]
+  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
+  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
+  //
+  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
+  //   fixups or relocations are emitted to replace $symbol@*@lo and
+  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
+  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
+  //   operand to the global variable.
+  //
+  // The offset we want is measured from the value returned by s_getpc (the
+  // address of the s_add_u32 instruction) to the global variable. The encoding
+  // of $symbol starts 4 bytes after the start of the s_add_u32 instruction, so
+  // the computed offset is 4 bytes too small; adding 4 to the global variable
+  // offset yields the correct address.
+  const unsigned AdjustedOffset = Offset + 4;
+
+  unsigned HiFlags = GAFlags;
+  if (GAFlags != SIInstrInfo::MO_NONE)
+    HiFlags = GAFlags + 1;
+
+  SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, AdjustedOffset,
+                                          GAFlags);
+  SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, AdjustedOffset,
+                                          HiFlags);
+  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, Lo, Hi);
+}
+
+/// Lowers a global address in the constant or global address space; other
+/// address spaces are forwarded to AMDGPUTargetLowering::LowerGlobalAddress.
+///
+/// Depending on the global, the address is built pc-relative with a fixup,
+/// pc-relative with a REL32 relocation, or loaded from a GOT entry that is
+/// itself addressed pc-relative.
+SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+                                             SDValue Op,
+                                             SelectionDAG &DAG) const {
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+
+  const unsigned AS = GSD->getAddressSpace();
+  if (AS != AMDGPUAS::CONSTANT_ADDRESS && AS != AMDGPUAS::GLOBAL_ADDRESS)
+    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
+
+  SDLoc DL(GSD);
+  const GlobalValue *GV = GSD->getGlobal();
+  EVT PtrVT = Op.getValueType();
+
+  if (shouldEmitFixup(GV))
+    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
+
+  if (shouldEmitPCReloc(GV))
+    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
+                                   SIInstrInfo::MO_REL32);
+
+  // Remaining case: load the address from the GOT.
+  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
+                                            SIInstrInfo::MO_GOTPCREL32);
+
+  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
+  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+  unsigned Align = DAG.getDataLayout().getABITypeAlignment(PtrTy);
+  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
+  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+
+  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
+                     MachineMemOperand::MODereferenceable |
+                         MachineMemOperand::MOInvariant);
+}
+
+/// Lowers a trap: emits a "trap handler not supported" warning diagnostic and
+/// replaces the trap with an ENDPGM node on the same chain.
+SDValue SITargetLowering::lowerTRAP(SDValue Op,
+                                    SelectionDAG &DAG) const {
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+  DiagnosticInfoUnsupported Warning(Fn,
+                                    "trap handler not supported",
+                                    Op.getDebugLoc(),
+                                    DS_Warning);
+  DAG.getContext()->diagnose(Warning);
+
+  // Emit s_endpgm.
+
+  // FIXME: This should really be selected to s_trap, but that requires
+  // setting up the trap handler for it to do anything.
+  SDValue Chain = Op.getOperand(0);
+  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, Chain);
+}
+
+/// Creates an SI_INIT_M0 machine node that copies \p V to m0 on \p Chain and
+/// returns its first result.
+SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
+                                   const SDLoc &DL, SDValue V) const {
+  // Why a pseudo is used here:
+  //  - S_MOV_B32 cannot be used directly, because there is no way to specify
+  //    m0 as the destination register.
+  //  - CopyToReg cannot be used, because MachineCSE won't combine COPY
+  //    instructions, so we would end up with redundant moves to m0.
+  // The pseudo ensures we emit s_mov_b32 with m0 as the direct result.
+
+  // A Null SDValue creates a glue result.
+  SDNode *InitM0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other,
+                                      MVT::Glue, V, Chain);
+  return SDValue(InitM0, 0);
+}
+
+/// Lowers an implicit i32 parameter at \p Offset and wraps it in an AssertZext
+/// to \p VT.
+SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
+                                                 SDValue Op,
+                                                 MVT VT,
+                                                 unsigned Offset) const {
+  SDLoc SL(Op);
+  SDValue Loaded = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
+                                  DAG.getEntryNode(), Offset, false);
+
+  // The local size values will have the hi 16-bits as zero.
+  SDValue AssertedTy = DAG.getValueType(VT);
+  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Loaded, AssertedTy);
+}
+
+/// Reports a "non-hsa intrinsic with hsa target" diagnostic at \p DL and
+/// returns an undef value of type \p VT.
+static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+  DiagnosticInfoUnsupported Diag(Fn,
+                                 "non-hsa intrinsic with hsa target",
+                                 DL.getDebugLoc());
+  DAG.getContext()->diagnose(Diag);
+
+  return DAG.getUNDEF(VT);
+}
+
+/// Reports an "intrinsic not supported on subtarget" diagnostic at \p DL and
+/// returns an undef value of type \p VT.
+static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+  DiagnosticInfoUnsupported Diag(Fn,
+                                 "intrinsic not supported on subtarget",
+                                 DL.getDebugLoc());
+  DAG.getContext()->diagnose(Diag);
+
+  return DAG.getUNDEF(VT);
+}
+
+ // Custom lowering for intrinsic nodes that carry no chain.
+ //
+ // Operand 0 of \p Op is the intrinsic ID; the intrinsic's own arguments start
+ // at operand 1. Each handled intrinsic is rewritten to a target node, a
+ // live-in register read or a kernel-parameter load. Intrinsics that are not
+ // valid for the current subtarget/OS emit a diagnostic and lower to UNDEF.
+ // Anything not listed here is forwarded to
+ // AMDGPUTargetLowering::LowerOperation.
+ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+   MachineFunction &MF = DAG.getMachineFunction();
+   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
+   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+   EVT VT = Op.getValueType();
+   SDLoc DL(Op);
+   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+   // TODO: Should this propagate fast-math-flags?
+
+   switch (IntrinsicID) {
+   case Intrinsic::amdgcn_dispatch_ptr:
+   case Intrinsic::amdgcn_queue_ptr: {
+     // Both pointers are only lowered when the subtarget reports code object
+     // v2; otherwise diagnose and produce UNDEF.
+     if (!Subtarget->isAmdCodeObjectV2()) {
+       DiagnosticInfoUnsupported BadIntrin(
+           *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
+           DL.getDebugLoc());
+       DAG.getContext()->diagnose(BadIntrin);
+       return DAG.getUNDEF(VT);
+     }
+
+     auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+       SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
+     return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
+                                 TRI->getPreloadedValue(MF, Reg), VT);
+   }
+   case Intrinsic::amdgcn_implicitarg_ptr: {
+     unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+     return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
+   }
+   case Intrinsic::amdgcn_kernarg_segment_ptr: {
+     unsigned Reg
+       = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+     return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+   }
+   case Intrinsic::amdgcn_dispatch_id: {
+     unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
+     return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+   }
+   case Intrinsic::amdgcn_rcp:
+     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
+   case Intrinsic::amdgcn_rsq:
+   case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
+     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+   case Intrinsic::amdgcn_rsq_legacy: {
+     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+       return emitRemovedIntrinsicError(DAG, DL, VT);
+
+     return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+   }
+   case Intrinsic::amdgcn_rcp_legacy: {
+     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+       return emitRemovedIntrinsicError(DAG, DL, VT);
+     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
+   }
+   case Intrinsic::amdgcn_rsq_clamp: {
+     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+
+     // From VOLCANIC_ISLANDS on, expand to rsq followed by an explicit clamp
+     // to [-largest, +largest] of the value type.
+     Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+     APFloat Max = APFloat::getLargest(Ty->getFltSemantics());
+     APFloat Min = APFloat::getLargest(Ty->getFltSemantics(), true);
+
+     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+     SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
+                               DAG.getConstantFP(Max, DL, VT));
+     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
+                        DAG.getConstantFP(Min, DL, VT));
+   }
+   // The r600_read_* size queries below are rejected when targeting the HSA
+   // OS; otherwise they are loaded from fixed kernel input offsets.
+   case Intrinsic::r600_read_ngroups_x:
+     if (Subtarget->isAmdHsaOS())
+       return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                           SI::KernelInputOffsets::NGROUPS_X, false);
+   case Intrinsic::r600_read_ngroups_y:
+     if (Subtarget->isAmdHsaOS())
+       return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                           SI::KernelInputOffsets::NGROUPS_Y, false);
+   case Intrinsic::r600_read_ngroups_z:
+     if (Subtarget->isAmdHsaOS())
+       return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                           SI::KernelInputOffsets::NGROUPS_Z, false);
+   case Intrinsic::r600_read_global_size_x:
+     if (Subtarget->isAmdHsaOS())
+       return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                           SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+   case Intrinsic::r600_read_global_size_y:
+     if (Subtarget->isAmdHsaOS())
+       return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                           SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+   case Intrinsic::r600_read_global_size_z:
+     if (Subtarget->isAmdHsaOS())
+       return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                           SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+   case Intrinsic::r600_read_local_size_x:
+     if (Subtarget->isAmdHsaOS())
+       return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+     return lowerImplicitZextParam(DAG, Op, MVT::i16,
+                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
+   case Intrinsic::r600_read_local_size_y:
+     if (Subtarget->isAmdHsaOS())
+       return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+     return lowerImplicitZextParam(DAG, Op, MVT::i16,
+                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
+   case Intrinsic::r600_read_local_size_z:
+     if (Subtarget->isAmdHsaOS())
+       return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+     return lowerImplicitZextParam(DAG, Op, MVT::i16,
+                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
+   // Workgroup IDs are read from preloaded SGPRs, workitem IDs from preloaded
+   // VGPRs.
+   case Intrinsic::amdgcn_workgroup_id_x:
+   case Intrinsic::r600_read_tgid_x:
+     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
+       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+   case Intrinsic::amdgcn_workgroup_id_y:
+   case Intrinsic::r600_read_tgid_y:
+     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
+       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+   case Intrinsic::amdgcn_workgroup_id_z:
+   case Intrinsic::r600_read_tgid_z:
+     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
+       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
+   case Intrinsic::amdgcn_workitem_id_x:
+   case Intrinsic::r600_read_tidig_x:
+     return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+   case Intrinsic::amdgcn_workitem_id_y:
+   case Intrinsic::r600_read_tidig_y:
+     return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+   case Intrinsic::amdgcn_workitem_id_z:
+   case Intrinsic::r600_read_tidig_z:
+     return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
+   case AMDGPUIntrinsic::SI_load_const: {
+     SDValue Ops[] = {
+       Op.getOperand(1),
+       Op.getOperand(2)
+     };
+
+     // Dereferenceable, invariant load with no pointer info and alignment 4.
+     MachineMemOperand *MMO = MF.getMachineMemOperand(
+       MachinePointerInfo(),
+       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+       MachineMemOperand::MOInvariant,
+       VT.getStoreSize(), 4);
+     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+                                    Op->getVTList(), Ops, VT, MMO);
+   }
+   case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+     return lowerFDIV_FAST(Op, DAG);
+   }
+   case AMDGPUIntrinsic::SI_vs_load_input:
+     return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+                        Op.getOperand(1),
+                        Op.getOperand(2),
+                        Op.getOperand(3));
+
+   // The interpolation intrinsics below first write m0 (via copyToM0) and
+   // attach the resulting glue to the interpolation node.
+   case AMDGPUIntrinsic::SI_fs_constant: {
+     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
+     SDValue Glue = M0.getValue(1);
+     return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
+                        DAG.getConstant(2, DL, MVT::i32), // P0
+                        Op.getOperand(1), Op.getOperand(2), Glue);
+   }
+   case AMDGPUIntrinsic::SI_packf16:
+     if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
+       return DAG.getUNDEF(MVT::i32);
+     return Op;
+   case AMDGPUIntrinsic::SI_fs_interp: {
+     // Operand 4 is a two-element vector holding I and J as i32 bit patterns.
+     SDValue IJ = Op.getOperand(4);
+     SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
+                             DAG.getConstant(0, DL, MVT::i32));
+     SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
+                             DAG.getConstant(1, DL, MVT::i32));
+     I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I);
+     J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J);
+     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
+     SDValue Glue = M0.getValue(1);
+     SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
+                              DAG.getVTList(MVT::f32, MVT::Glue),
+                              I, Op.getOperand(1), Op.getOperand(2), Glue);
+     // Chain P2 to P1 through P1's glue result.
+     Glue = SDValue(P1.getNode(), 1);
+     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
+                        Op.getOperand(1), Op.getOperand(2), Glue);
+   }
+   case Intrinsic::amdgcn_interp_mov: {
+     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
+     SDValue Glue = M0.getValue(1);
+     return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
+                        Op.getOperand(2), Op.getOperand(3), Glue);
+   }
+   case Intrinsic::amdgcn_interp_p1: {
+     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
+     SDValue Glue = M0.getValue(1);
+     return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
+                        Op.getOperand(2), Op.getOperand(3), Glue);
+   }
+   case Intrinsic::amdgcn_interp_p2: {
+     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
+     SDValue Glue = SDValue(M0.getNode(), 1);
+     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
+                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
+                        Glue);
+   }
+   case Intrinsic::amdgcn_sin:
+     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
+
+   case Intrinsic::amdgcn_cos:
+     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
+
+   case Intrinsic::amdgcn_log_clamp: {
+     // Before VOLCANIC_ISLANDS no custom lowering is done here (an empty
+     // SDValue is returned); afterwards the intrinsic is diagnosed as
+     // unsupported.
+     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+       return SDValue();
+
+     return emitRemovedIntrinsicError(DAG, DL, VT);
+   }
+   case Intrinsic::amdgcn_ldexp:
+     return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
+                        Op.getOperand(1), Op.getOperand(2));
+
+   case Intrinsic::amdgcn_fract:
+     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+
+   case Intrinsic::amdgcn_class:
+     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
+                        Op.getOperand(1), Op.getOperand(2));
+   case Intrinsic::amdgcn_div_fmas:
+     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
+                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                        Op.getOperand(4));
+
+   case Intrinsic::amdgcn_div_fixup:
+     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
+                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+   case Intrinsic::amdgcn_trig_preop:
+     return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
+                        Op.getOperand(1), Op.getOperand(2));
+   case Intrinsic::amdgcn_div_scale: {
+     // 3rd parameter required to be a constant.
+     const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+     if (!Param)
+       return DAG.getUNDEF(VT);
+
+     // Translate to the operands expected by the machine instruction. The
+     // first parameter must be the same as the first instruction.
+     SDValue Numerator = Op.getOperand(1);
+     SDValue Denominator = Op.getOperand(2);
+
+     // Note this order is opposite of the machine instruction's operations,
+     // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
+     // intrinsic has the numerator as the first operand to match a normal
+     // division operation.
+
+     SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
+     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
+                        Denominator, Numerator);
+   }
+   case Intrinsic::amdgcn_icmp: {
+     // The predicate (3rd parameter) must be a constant; dyn_cast yields null
+     // otherwise, so bail out to UNDEF instead of dereferencing it.
+     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+     if (!CD)
+       return DAG.getUNDEF(VT);
+
+     int CondCode = CD->getSExtValue();
+
+     if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
+         CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE)
+       return DAG.getUNDEF(VT);
+
+     ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
+     ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
+     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
+                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
+   }
+   case Intrinsic::amdgcn_fcmp: {
+     // Same contract as amdgcn_icmp: a non-constant predicate lowers to UNDEF.
+     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+     if (!CD)
+       return DAG.getUNDEF(VT);
+
+     int CondCode = CD->getSExtValue();
+
+     // FCMP_FALSE and FCMP_TRUE themselves are rejected as well.
+     if (CondCode <= FCmpInst::Predicate::FCMP_FALSE ||
+         CondCode >= FCmpInst::Predicate::FCMP_TRUE)
+       return DAG.getUNDEF(VT);
+
+     FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
+     ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
+     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
+                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
+   }
+   case Intrinsic::amdgcn_fmul_legacy:
+     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
+                        Op.getOperand(1), Op.getOperand(2));
+   case Intrinsic::amdgcn_sffbh:
+   case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name.
+     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
+   default:
+     return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+   }
+ }
+
+ // Custom lowering for chained intrinsics that produce a value.
+ //
+ // Operand 0 is the chain, operand 1 the intrinsic ID and the intrinsic's own
+ // arguments follow. Returns an empty SDValue for intrinsics not handled here.
+ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+   const unsigned ID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+   SDLoc DL(Op);
+
+   const bool IsAtomicInc = ID == Intrinsic::amdgcn_atomic_inc;
+   if (IsAtomicInc || ID == Intrinsic::amdgcn_atomic_dec) {
+     // Re-emit as the target atomic node, reusing the memory VT and memory
+     // operand already attached to the intrinsic node.
+     MemSDNode *MemNode = cast<MemSDNode>(Op);
+     const unsigned TargetOpc =
+       IsAtomicInc ? AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+     SDValue NewOps[] = {
+       MemNode->getOperand(0), // Chain
+       MemNode->getOperand(2), // Ptr
+       MemNode->getOperand(3)  // Value
+     };
+
+     return DAG.getMemIntrinsicNode(TargetOpc, SDLoc(Op), MemNode->getVTList(),
+                                    NewOps, MemNode->getMemoryVT(),
+                                    MemNode->getMemOperand());
+   }
+
+   const bool IsPlainBufferLoad = ID == Intrinsic::amdgcn_buffer_load;
+   if (IsPlainBufferLoad || ID == Intrinsic::amdgcn_buffer_load_format) {
+     SDValue NewOps[] = {
+       Op.getOperand(0), // Chain
+       Op.getOperand(2), // rsrc
+       Op.getOperand(3), // vindex
+       Op.getOperand(4), // offset
+       Op.getOperand(5), // glc
+       Op.getOperand(6)  // slc
+     };
+     MachineFunction &MF = DAG.getMachineFunction();
+     SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+     const unsigned TargetOpc = IsPlainBufferLoad
+       ? AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+     EVT ResultVT = Op.getValueType();
+     EVT MemVT = ResultVT.changeTypeToInteger();
+
+     // The memory operand points at the buffer pseudo source value; the store
+     // size of the result type is used for both size and alignment.
+     MachineMemOperand *MMO = MF.getMachineMemOperand(
+       MachinePointerInfo(FuncInfo->getBufferPSV()),
+       MachineMemOperand::MOLoad,
+       ResultVT.getStoreSize(), ResultVT.getStoreSize());
+
+     return DAG.getMemIntrinsicNode(TargetOpc, DL, Op->getVTList(), NewOps,
+                                    MemVT, MMO);
+   }
+
+   return SDValue();
+ }
+
+ // Custom lowering for chained intrinsics that produce no value.
+ //
+ // Operand 0 is the chain, operand 1 the intrinsic ID and the intrinsic's own
+ // arguments follow. Returns an empty SDValue for intrinsics not handled here.
+ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDLoc DL(Op);
+ SDValue Chain = Op.getOperand(0);
+ unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+ switch (IntrinsicID) {
+ case AMDGPUIntrinsic::SI_sendmsg: {
+ // Write operand 3 to m0 first; the glue result of that write is attached
+ // to the SENDMSG node.
+ Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+ SDValue Glue = Chain.getValue(1);
+ return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
+ Op.getOperand(2), Glue);
+ }
+ case AMDGPUIntrinsic::SI_tbuffer_store: {
+ // Operands 2..14 of the intrinsic are forwarded unchanged after the chain.
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2),
+ Op.getOperand(3),
+ Op.getOperand(4),
+ Op.getOperand(5),
+ Op.getOperand(6),
+ Op.getOperand(7),
+ Op.getOperand(8),
+ Op.getOperand(9),
+ Op.getOperand(10),
+ Op.getOperand(11),
+ Op.getOperand(12),
+ Op.getOperand(13),
+ Op.getOperand(14)
+ };
+
+ // The memory VT is the type of operand 3.
+ EVT VT = Op.getOperand(3).getValueType();
+
+ // Store memory operand with no pointer info and alignment 4.
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VT.getStoreSize(), 4);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+ case AMDGPUIntrinsic::AMDGPU_kill: {
+ SDValue Src = Op.getOperand(2);
+ if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
+ // Constant source: a non-negative constant needs no KILL node at all, so
+ // just return the incoming chain.
+ if (!K->isNegative())
+ return Chain;
+
+ // Any negative constant is replaced by the bit pattern of -1.0f.
+ SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
+ return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
+ }
+
+ // Non-constant source: pass its bits to KILL as an i32.
+ SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
+ return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
+ }
+ case AMDGPUIntrinsic::SI_export: {
+ // Operands 2..6 must be constants (cast<> asserts otherwise).
+ const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2));
+ const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3));
+ const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4));
+ const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5));
+ const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6));
+
+ // Done is not passed as an operand; it selects the opcode below.
+ const SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),
+ DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1),
+ DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8),
+ DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1),
+ Op.getOperand(7), // src0
+ Op.getOperand(8), // src1
+ Op.getOperand(9), // src2
+ Op.getOperand(10) // src3
+ };
+
+ unsigned Opc = Done->isNullValue() ?
+ AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
+ return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
+ }
+ default:
+ return SDValue();
+ }
+ }
+
+ // Custom lowering for load nodes.
+ //
+ // Non-extending loads narrower than 32 bits are widened to an extending i32
+ // load plus truncate. Vector loads are split, scalarized or expanded
+ // depending on address space, alignment and element count. An empty SDValue
+ // is returned when no custom lowering is performed here.
+ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ ISD::LoadExtType ExtType = Load->getExtensionType();
+ EVT MemVT = Load->getMemoryVT();
+
+ if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
+ // FIXME: Copied from PPC
+ // First, load into 32 bits, then truncate back to the original memory type.
+
+ SDValue Chain = Load->getChain();
+ SDValue BasePtr = Load->getBasePtr();
+ MachineMemOperand *MMO = Load->getMemOperand();
+
+ // i1 is loaded as i8; every other type reaching here is loaded as i16.
+ EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
+
+ SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+ BasePtr, RealMemVT, MMO);
+
+ // Return both the truncated value and the new load's chain.
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ // Everything below only deals with vector loads.
+ if (!MemVT.isVector())
+ return SDValue();
+
+ assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
+ "Custom lowering for non-i32 vectors hasn't been implemented.");
+
+ unsigned AS = Load->getAddressSpace();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ AS, Load->getAlignment())) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ // If there is a possibility that a flat instruction accesses scratch memory
+ // then we need to use the same legalization rules we use for private.
+ // Note that after this remapping AS is never FLAT_ADDRESS in the switch.
+ if (AS == AMDGPUAS::FLAT_ADDRESS)
+ AS = MFI->hasFlatScratchInit() ?
+ AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+
+ unsigned NumElements = MemVT.getVectorNumElements();
+ switch (AS) {
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ if (isMemOpUniform(Load))
+ return SDValue();
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ //
+ LLVM_FALLTHROUGH;
+ case AMDGPUAS::GLOBAL_ADDRESS: {
+ // With scalarize-global enabled, uniform loads whose memory is not
+ // clobbered are left alone.
+ if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
+ isMemOpHasNoClobberedMemOperand(Load))
+ return SDValue();
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ //
+ }
+ LLVM_FALLTHROUGH;
+ case AMDGPUAS::FLAT_ADDRESS:
+ if (NumElements > 4)
+ return SplitVectorLoad(Op, DAG);
+ // v4 loads are supported for private and global memory.
+ return SDValue();
+ case AMDGPUAS::PRIVATE_ADDRESS: {
+ // Depending on the setting of the private_element_size field in the
+ // resource descriptor, we can only make private accesses up to a certain
+ // size.
+ switch (Subtarget->getMaxPrivateElementSize()) {
+ case 4:
+ return scalarizeVectorLoad(Load, DAG);
+ case 8:
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
+ return SDValue();
+ case 16:
+ // Same as global/flat
+ if (NumElements > 4)
+ return SplitVectorLoad(Op, DAG);
+ return SDValue();
+ default:
+ llvm_unreachable("unsupported private_element_size");
+ }
+ }
+ case AMDGPUAS::LOCAL_ADDRESS: {
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
+
+ // Two-element loads are left as they are.
+ if (NumElements == 2)
+ return SDValue();
+
+ // If properly aligned, if we split we might be able to use ds_read_b64.
+ return SplitVectorLoad(Op, DAG);
+ }
+ default:
+ return SDValue();
+ }
+ }
+
+ // Lower an i64 select into two i32 selects on the low and high halves.
+ // Selects of any other type are not handled here (empty SDValue).
+ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+   if (Op.getValueType() != MVT::i64)
+     return SDValue();
+
+   SDLoc DL(Op);
+   SDValue Pred = Op.getOperand(0);
+
+   SDValue LoIdx = DAG.getConstant(0, DL, MVT::i32);
+   SDValue HiIdx = DAG.getConstant(1, DL, MVT::i32);
+
+   // View both 64-bit inputs as pairs of 32-bit lanes.
+   SDValue TrueVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
+   SDValue FalseVec =
+     DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
+
+   // Low half.
+   SDValue TrueLo =
+     DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, TrueVec, LoIdx);
+   SDValue FalseLo =
+     DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, FalseVec, LoIdx);
+   SDValue SelLo = DAG.getSelect(DL, MVT::i32, Pred, TrueLo, FalseLo);
+
+   // High half.
+   SDValue TrueHi =
+     DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, TrueVec, HiIdx);
+   SDValue FalseHi =
+     DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, FalseVec, HiIdx);
+   SDValue SelHi = DAG.getSelect(DL, MVT::i32, Pred, TrueHi, FalseHi);
+
+   // Reassemble the two halves into the 64-bit result.
+   SDValue Packed = DAG.getBuildVector(MVT::v2i32, DL, {SelLo, SelHi});
+   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Packed);
+ }
+
+ // Try the cheap fdiv lowerings based on the rcp and rsq instructions.
+ // Returns an empty SDValue when none of the shortcuts is applicable.
+ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
+                                               SelectionDAG &DAG) const {
+   SDLoc SL(Op);
+   SDValue Num = Op.getOperand(0);
+   SDValue Den = Op.getOperand(1);
+   EVT VT = Op.getValueType();
+   const bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+
+   // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+   // the CI documentation has a worst case error of 1 ulp.
+   // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+   // use it as long as we aren't trying to use denormals.
+   //
+   // v_rcp_f16 and v_rsq_f16 DO support denormals.
+   const bool RcpIsGoodEnough =
+     Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+     VT == MVT::f16;
+
+   const ConstantFPSDNode *ConstNum = dyn_cast<ConstantFPSDNode>(Num);
+   if (ConstNum && RcpIsGoodEnough) {
+     if (ConstNum->isExactlyValue(1.0)) {
+       // 1.0 / sqrt(x) -> rsq(x)
+       //
+       // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+       // error seems really high at 2^29 ULP.
+       if (Den.getOpcode() == ISD::FSQRT)
+         return DAG.getNode(AMDGPUISD::RSQ, SL, VT, Den.getOperand(0));
+
+       // 1.0 / x -> rcp(x)
+       return DAG.getNode(AMDGPUISD::RCP, SL, VT, Den);
+     }
+
+     // Same as for 1.0, but expand the sign out of the constant.
+     if (ConstNum->isExactlyValue(-1.0)) {
+       // -1.0 / x -> rcp (fneg x)
+       SDValue NegDen = DAG.getNode(ISD::FNEG, SL, VT, Den);
+       return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegDen);
+     }
+   }
+
+   const SDNodeFlags *OpFlags = Op->getFlags();
+   if (!Unsafe && !OpFlags->hasAllowReciprocal())
+     return SDValue();
+
+   // Turn into multiply by the reciprocal.
+   // x / y -> x * (1.0 / y)
+   SDNodeFlags MulFlags;
+   MulFlags.setUnsafeAlgebra(true);
+   SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, Den);
+   return DAG.getNode(ISD::FMUL, SL, VT, Num, Recip, &MulFlags);
+ }
+
+ // Build the binary FP node \p Opcode on \p A and \p B.
+ //
+ // When \p GlueChain's node has more than one result it is expected to have
+ // exactly three (value, chain, glue); the chained variant of the opcode is
+ // then emitted, consuming that chain and glue. Only ISD::FMUL has a chained
+ // variant here.
+ static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+                           EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+   const bool HasChainAndGlue = GlueChain->getNumValues() > 1;
+   if (!HasChainAndGlue)
+     return DAG.getNode(Opcode, SL, VT, A, B);
+
+   assert(GlueChain->getNumValues() == 3);
+
+   SDVTList ResultVTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+   if (Opcode != ISD::FMUL)
+     llvm_unreachable("no chain equivalent for opcode");
+
+   return DAG.getNode(AMDGPUISD::FMUL_W_CHAIN, SL, ResultVTs,
+                      GlueChain.getValue(1), A, B, GlueChain.getValue(2));
+ }
+
+ // Build the ternary FP node \p Opcode on \p A, \p B and \p C.
+ //
+ // When \p GlueChain's node has more than one result it is expected to have
+ // exactly three (value, chain, glue); the chained variant of the opcode is
+ // then emitted, consuming that chain and glue. Only ISD::FMA has a chained
+ // variant here.
+ static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+                            EVT VT, SDValue A, SDValue B, SDValue C,
+                            SDValue GlueChain) {
+   const bool HasChainAndGlue = GlueChain->getNumValues() > 1;
+   if (!HasChainAndGlue)
+     return DAG.getNode(Opcode, SL, VT, A, B, C);
+
+   assert(GlueChain->getNumValues() == 3);
+
+   SDVTList ResultVTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+   if (Opcode != ISD::FMA)
+     llvm_unreachable("no chain equivalent for opcode");
+
+   return DAG.getNode(AMDGPUISD::FMA_W_CHAIN, SL, ResultVTs,
+                      GlueChain.getValue(1), A, B, C, GlueChain.getValue(2));
+ }
+
+ // Lower an f16 fdiv: try the fast rcp/rsq shortcuts first, otherwise compute
+ // the quotient in f32, round it to f16 and finish with DIV_FIXUP.
+ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
+   SDValue Fast = lowerFastUnsafeFDIV(Op, DAG);
+   if (Fast)
+     return Fast;
+
+   SDLoc SL(Op);
+   SDValue Num = Op.getOperand(0);
+   SDValue Den = Op.getOperand(1);
+
+   // Widen both inputs to f32.
+   SDValue NumF32 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Num);
+   SDValue DenF32 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Den);
+
+   // Num * (1 / Den) in f32.
+   SDValue RcpDen = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenF32);
+   SDValue QuotF32 = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumF32, RcpDen);
+
+   // Narrow back to f16 (FP_ROUND flag operand is 0).
+   SDValue RoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
+   SDValue QuotF16 =
+     DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, QuotF32, RoundFlag);
+
+   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, QuotF16, Den, Num);
+ }
+
+ // Faster 2.5 ULP f32 division that does not support denormals.
+ //
+ // Operand 1 of \p Op is the numerator and operand 2 the denominator.
+ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
+   SDLoc SL(Op);
+   SDValue Num = Op.getOperand(1);
+   SDValue Den = Op.getOperand(2);
+
+   SDValue AbsDen = DAG.getNode(ISD::FABS, SL, MVT::f32, Den);
+
+   // 0x6f800000 is 2^96: denominators above this magnitude get pre-scaled.
+   const APFloat ThresholdVal(BitsToFloat(0x6f800000));
+   const SDValue Threshold = DAG.getConstantFP(ThresholdVal, SL, MVT::f32);
+
+   // 0x2f800000 is 2^-32: the scale factor applied to such denominators.
+   const APFloat DownScaleVal(BitsToFloat(0x2f800000));
+   const SDValue DownScale = DAG.getConstantFP(DownScaleVal, SL, MVT::f32);
+
+   const SDValue NoScale = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+   EVT CmpVT =
+     getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+
+   SDValue IsHuge = DAG.getSetCC(SL, CmpVT, AbsDen, Threshold, ISD::SETOGT);
+
+   SDValue Scale =
+     DAG.getNode(ISD::SELECT, SL, MVT::f32, IsHuge, DownScale, NoScale);
+
+   // TODO: Should this propagate fast-math-flags?
+   SDValue ScaledDen = DAG.getNode(ISD::FMUL, SL, MVT::f32, Den, Scale);
+
+   // rcp does not support denormals.
+   SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, ScaledDen);
+
+   SDValue ScaledQuot = DAG.getNode(ISD::FMUL, SL, MVT::f32, Num, Recip);
+
+   // Multiply by the scale once more to compensate for scaling the
+   // denominator.
+   return DAG.getNode(ISD::FMUL, SL, MVT::f32, Scale, ScaledQuot);
+ }
+
+ // Lower an f32 fdiv.
+ //
+ // The fast rcp/rsq shortcuts are tried first. Otherwise the quotient is built
+ // from DIV_SCALE, RCP, a sequence of FMA/FMUL refinement steps, DIV_FMAS and
+ // DIV_FIXUP. When the subtarget has FP32 denormals disabled, the refinement
+ // steps are bracketed by SETREG nodes that change the denormal mode.
+ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+ if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+ return FastLowered;
+
+ SDLoc SL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+ // DIV_SCALE yields the scaled value plus an i1 flag.
+ SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
+
+ SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ RHS, RHS, LHS);
+ SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ LHS, RHS, LHS);
+
+ // Denominator is scaled to not be denormal, so using rcp is ok.
+ SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
+ DenominatorScaled);
+ SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
+ DenominatorScaled);
+
+ // Hardware register descriptor passed to SETREG: the MODE register with an
+ // offset of 4 and a WIDTH_M1 value of 1.
+ // NOTE(review): presumably this selects the 2-bit FP32 denormal field --
+ // confirm against the Hwreg encoding.
+ const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
+ (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+ (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
+ const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+
+ if (!Subtarget->hasFP32Denormals()) {
+ SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+ SL, MVT::i32);
+ SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+ DAG.getEntryNode(),
+ EnableDenormValue, BitField);
+ // Bundle the value with the SETREG's chain and glue. The merged node has
+ // three results, which makes getFPTernOp/getFPBinOp below emit the
+ // *_W_CHAIN opcodes and thread chain and glue through every step.
+ SDValue Ops[3] = {
+ NegDivScale0,
+ EnableDenorm.getValue(0),
+ EnableDenorm.getValue(1)
+ };
+
+ NegDivScale0 = DAG.getMergeValues(Ops, SL);
+ }
+
+ // In each call the last argument supplies the chain/glue (if any) that the
+ // new node is ordered after.
+ SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
+ ApproxRcp, One, NegDivScale0);
+
+ SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
+ ApproxRcp, Fma0);
+
+ SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
+ Fma1, Fma1);
+
+ SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
+ NumeratorScaled, Mul);
+
+ SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+
+ SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
+ NumeratorScaled, Fma3);
+
+ if (!Subtarget->hasFP32Denormals()) {
+ // Switch the denormal mode back after the last chained FMA, and join the
+ // resulting chain into the DAG root so the SETREG is not dropped.
+ const SDValue DisableDenormValue =
+ DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+ SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+ Fma4.getValue(1),
+ DisableDenormValue,
+ BitField,
+ Fma4.getValue(2));
+
+ SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+ DisableDenorm, DAG.getRoot());
+ DAG.setRoot(OutputChain);
+ }
+
+ // The i1 result of the numerator's DIV_SCALE feeds DIV_FMAS.
+ SDValue Scale = NumeratorScaled.getValue(1);
+ SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
+ Fma4, Fma1, Fma3, Scale);
+
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
+ }
+
+ // Lower an f64 fdiv.
+ //
+ // With UnsafeFPMath the result of lowerFastUnsafeFDIV is returned directly.
+ // Otherwise the quotient is built from DIV_SCALE, RCP, a sequence of FMA/FMUL
+ // refinement steps, DIV_FMAS and DIV_FIXUP.
+ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
+ if (DAG.getTarget().Options.UnsafeFPMath)
+ return lowerFastUnsafeFDIV(Op, DAG);
+
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+
+ const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
+
+ // DIV_SCALE yields the scaled value plus an i1 flag.
+ SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
+
+ // Scaled denominator.
+ SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
+
+ SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
+
+ SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
+
+ SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
+
+ SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
+
+ SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
+
+ // Scaled numerator.
+ SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
+
+ SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
+
+ SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
+ NegDivScale0, Mul, DivScale1);
+
+ SDValue Scale;
+
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+ // Workaround a hardware bug on SI where the condition output from div_scale
+ // is not usable.
+
+ // Index of the high 32-bit half in the v2i32 views below.
+ const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
+
+ // Figure out the scale to use for div_fmas by comparing the high halves
+ // of the inputs with those of their scaled counterparts.
+ SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+ SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
+ SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
+ SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
+
+ SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
+ SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
+
+ SDValue Scale0Hi
+ = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
+ SDValue Scale1Hi
+ = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
+
+ // The flag is set when exactly one of the two high halves is unchanged
+ // by its DIV_SCALE.
+ SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
+ SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
+ Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
+ } else {
+ // Otherwise use the i1 result of the numerator's DIV_SCALE directly.
+ Scale = DivScale1.getValue(1);
+ }
+
+ SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
+ Fma4, Fma3, Mul, Scale);
+
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
+ }
+
+ // Dispatch an fdiv to the lowering routine for its result type.
+ // Only f16, f32 and f64 are expected; anything else is unreachable.
+ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
+   const EVT ResultVT = Op.getValueType();
+
+   if (ResultVT == MVT::f32) {
+     return LowerFDIV32(Op, DAG);
+   } else if (ResultVT == MVT::f64) {
+     return LowerFDIV64(Op, DAG);
+   } else if (ResultVT == MVT::f16) {
+     return LowerFDIV16(Op, DAG);
+   }
+
+   llvm_unreachable("Unexpected type for fdiv");
+ }
+
+ // Custom lowering for store nodes.
+ //
+ // i1 stores become truncating stores of an i32 value. Vector stores are
+ // split, scalarized or expanded depending on address space, alignment and
+ // element count. An empty SDValue means no custom lowering is performed.
+ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+   SDLoc DL(Op);
+   StoreSDNode *St = cast<StoreSDNode>(Op);
+   EVT MemVT = St->getMemoryVT();
+
+   if (MemVT == MVT::i1) {
+     SDValue Wide = DAG.getSExtOrTrunc(St->getValue(), DL, MVT::i32);
+     return DAG.getTruncStore(St->getChain(), DL, Wide, St->getBasePtr(),
+                              MVT::i1, St->getMemOperand());
+   }
+
+   assert(MemVT.isVector() &&
+          St->getValue().getValueType().getScalarType() == MVT::i32);
+
+   unsigned AS = St->getAddressSpace();
+   const bool AccessAllowed =
+     allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, AS,
+                        St->getAlignment());
+   if (!AccessAllowed)
+     return expandUnalignedStore(St, DAG);
+
+   MachineFunction &MF = DAG.getMachineFunction();
+   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+   // If there is a possibility that a flat instruction accesses scratch memory
+   // then we need to use the same legalization rules we use for private.
+   if (AS == AMDGPUAS::FLAT_ADDRESS) {
+     AS = FuncInfo->hasFlatScratchInit() ? AMDGPUAS::PRIVATE_ADDRESS
+                                         : AMDGPUAS::GLOBAL_ADDRESS;
+   }
+
+   const unsigned NumElts = MemVT.getVectorNumElements();
+
+   if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
+     return NumElts > 4 ? SplitVectorStore(Op, DAG) : SDValue();
+
+   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
+     // The maximum private element size (in bytes) bounds how many i32
+     // elements a single private access may cover.
+     const auto MaxEltBytes = Subtarget->getMaxPrivateElementSize();
+     if (MaxEltBytes == 4)
+       return scalarizeVectorStore(St, DAG);
+     if (MaxEltBytes == 8)
+       return NumElts > 2 ? SplitVectorStore(Op, DAG) : SDValue();
+     if (MaxEltBytes == 16)
+       return NumElts > 4 ? SplitVectorStore(Op, DAG) : SDValue();
+     llvm_unreachable("unsupported private_element_size");
+   }
+
+   if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+     // Exactly two elements are kept as they are. Everything else is split;
+     // if properly aligned, splitting might allow the use of ds_write_b64.
+     if (NumElts == 2)
+       return Op;
+     return SplitVectorStore(Op, DAG);
+   }
+
+   llvm_unreachable("unhandled address space");
+ }
+
+/// Lower FSIN / FCOS to the hardware SIN_HW / COS_HW nodes. The argument is
+/// first multiplied by 0.5/pi and reduced with FRACT before being handed to
+/// the hardware node.
+SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const EVT VT = Op.getValueType();
+
+  // TODO: Should this propagate fast-math-flags?
+  SDValue InvTwoPi = DAG.getConstantFP(0.5/M_PI, DL, VT);
+  SDValue Scaled = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(0), InvTwoPi);
+  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, Scaled);
+
+  switch (Op.getOpcode()) {
+  case ISD::FSIN:
+    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, FractPart);
+  case ISD::FCOS:
+    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, FractPart);
+  default:
+    llvm_unreachable("Wrong trig opcode");
+  }
+}
+
+/// Custom lowering for atomic compare-and-swap. Nodes outside the flat/global
+/// address spaces are returned unchanged; otherwise the new and compare values
+/// are packed into a two element vector (new value first) and an
+/// AMDGPUISD::ATOMIC_CMP_SWAP memory intrinsic node is built.
+SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op);
+  assert(Atomic->isCompareAndSwap());
+
+  // No custom lowering required for local address space
+  if (!isFlatGlobalAddrSpace(Atomic->getAddressSpace()))
+    return Op;
+
+  // Non-local address space requires custom lowering for atomic compare
+  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
+  SDLoc DL(Op);
+  const EVT VT = Op.getValueType();
+  const MVT PairVT = MVT::getVectorVT(VT.getSimpleVT(), 2);
+
+  SDValue CmpVal = Op.getOperand(2);
+  SDValue SwapVal = Op.getOperand(3);
+  SDValue Packed = DAG.getBuildVector(PairVT, DL, {SwapVal, CmpVal});
+
+  // Chain, address, packed {new, old} pair.
+  SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1), Packed };
+  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
+                                 Op->getVTList(), Ops, VT,
+                                 Atomic->getMemOperand());
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+/// Turn an int-to-f32 conversion of an i32 source whose upper 24 bits are
+/// known to be zero into CVT_F32_UBYTE0. Only runs after vector ops have been
+/// legalized; returns a null SDValue when the pattern does not apply.
+SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
+                                                     DAGCombinerInfo &DCI) const {
+  const EVT VT = N->getValueType(0);
+  if (VT.getScalarType() != MVT::f32)
+    return SDValue();
+
+  SDValue Src = N->getOperand(0);
+
+  // TODO: We could try to match extracting the higher bytes, which would be
+  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+  // about in practice.
+  if (!DCI.isAfterLegalizeVectorOps() || Src.getValueType() != MVT::i32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  if (!DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24)))
+    return SDValue();
+
+  SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, SDLoc(N), VT, Src);
+  DCI.AddToWorklist(Cvt.getNode());
+  return Cvt;
+}
+
+/// \brief Return true if the byte offset \p OffsetSize can be folded into
+/// the immediate offset field of a memory instruction for address space
+/// \p AS on subtarget \p STI.
+///
+/// The offset is taken as a 64-bit value so that an offset computed for a
+/// 64-bit pointer is range checked in full instead of being silently
+/// truncated to 32 bits before the check.
+static bool canFoldOffset(uint64_t OffsetSize, unsigned AS,
+                          const SISubtarget &STI) {
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+    // MUBUF instructions have a 12-bit offset in bytes.
+    return isUInt<12>(OffsetSize);
+  case AMDGPUAS::CONSTANT_ADDRESS:
+    // SMRD instructions have an 8-bit offset in dwords on SI and
+    // a 20-bit offset in bytes on VI.
+    if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+      return isUInt<20>(OffsetSize);
+    return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS:
+    // The single offset versions have a 16-bit offset in bytes.
+    return isUInt<16>(OffsetSize);
+  case AMDGPUAS::PRIVATE_ADDRESS:
+    // Indirect register addressing does not use any offsets.
+  default:
+    return false;
+  }
+}
+
+// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
+//
+// This is a variant of
+// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
+//
+// The normal DAG combiner will do this, but only if the add has one use since
+// that would increase the number of instructions.
+//
+// This prevents us from seeing a constant offset that can be folded into a
+// memory instruction's addressing mode. If we know the resulting add offset of
+// a pointer can be folded into an addressing offset, we can replace the pointer
+// operand with the add of new constant offset. This eliminates one of the uses,
+// and may allow the remaining use to also be simplified.
+//
+// \p N is the SHL node used as a pointer and \p AddrSpace the address space of
+// the memory access using it. Returns the replacement pointer, or a null
+// SDValue when the pattern does not match or the offset cannot be folded.
+SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
+                                               unsigned AddrSpace,
+                                               DAGCombinerInfo &DCI) const {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (N0.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
+  if (!CN1)
+    return SDValue();
+
+  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!CAdd)
+    return SDValue();
+
+  // If the resulting offset is too large, we can't fold it into the addressing
+  // mode offset.
+  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
+  if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+
+  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
+  // The offset has the bit width of the add's constant operand, i.e. the
+  // pointer type VT. Build the constant with that type so both ADD operands
+  // agree, instead of hard-coding i32 (which is only right for 32-bit
+  // pointers).
+  SDValue COffset = DAG.getConstant(Offset, SL, VT);
+
+  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
+}
+
+/// For a memory node whose pointer is a SHL and whose address space is not
+/// private, try performSHLPtrCombine on the pointer and, on success, update
+/// the node in place to use the rewritten pointer.
+SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SDValue Ptr = N->getBasePtr();
+  const unsigned AS = N->getAddressSpace();
+
+  // TODO: We could also do this for multiplies.
+  if (Ptr.getOpcode() != ISD::SHL || AS == AMDGPUAS::PRIVATE_ADDRESS)
+    return SDValue();
+
+  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
+  if (!NewPtr)
+    return SDValue();
+
+  // The pointer is operand 2 of a store and operand 1 of everything else
+  // handled here.
+  const unsigned PtrOpIdx = N->getOpcode() == ISD::STORE ? 2 : 1;
+
+  SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
+  NewOps[PtrOpIdx] = NewPtr;
+  return SDValue(DCI.DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+/// Return true if applying the 32-bit bit operation \p Opc with constant
+/// \p Val is trivially reducible: AND/OR with all zeros or all ones, or XOR
+/// with zero.
+static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
+  switch (Opc) {
+  case ISD::AND:
+  case ISD::OR:
+    return Val == 0 || Val == 0xffffffff;
+  case ISD::XOR:
+    return Val == 0;
+  default:
+    return false;
+  }
+}
+
+// Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor
+// operations. This will typically happen anyway for a VALU 64-bit and. It
+// exposes other 32-bit integer combine opportunities since most 64-bit
+// operations are decomposed this way. TODO: We won't want this for SALU
+// especially if it is an inline immediate.
+//
+// Returns a null SDValue when the operation is left alone.
+SDValue SITargetLowering::splitBinaryBitConstantOp(
+  DAGCombinerInfo &DCI,
+  const SDLoc &SL,
+  unsigned Opc, SDValue LHS,
+  const ConstantSDNode *CRHS) const {
+  const uint64_t Val = CRHS->getZExtValue();
+  const uint32_t ValLo = Lo_32(Val);
+  const uint32_t ValHi = Hi_32(Val);
+
+  // Split when either half makes the operation trivially reducible...
+  bool ShouldSplit = bitOpWithConstantIsReducible(Opc, ValLo) ||
+                     bitOpWithConstantIsReducible(Opc, ValHi);
+
+  // ...or when the constant has a single use and is not an inline constant.
+  // If we need to materialize a 64-bit immediate, it will be split up later
+  // anyway. Avoid creating the harder to understand 64-bit immediate
+  // materialization.
+  if (!ShouldSplit) {
+    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+    ShouldSplit = CRHS->hasOneUse() &&
+                  !TII->isInlineConstant(CRHS->getAPIntValue());
+  }
+
+  if (!ShouldSplit)
+    return SDValue();
+
+  return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
+}
+
+/// Target combines for AND (not run before legalization):
+///  - a 64-bit AND with a constant may be split into 32-bit halves;
+///  - (and (fcmp ord x, x), (fcmp une (fabs x), inf)) becomes an FP_CLASS test.
+SDValue SITargetLowering::performAndCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  if (N->getValueType(0) == MVT::i64) {
+    if (const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS)) {
+      if (SDValue Split
+            = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+        return Split;
+    }
+  }
+
+  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
+  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+  if (LHS.getOpcode() != ISD::SETCC || RHS.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  const ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+  const ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
+
+  // The right compare must look at fabs of the value the left one tests.
+  SDValue X = LHS.getOperand(0);
+  SDValue AbsX = RHS.getOperand(0);
+  if (AbsX.getOpcode() != ISD::FABS || AbsX.getOperand(0) != X)
+    return SDValue();
+
+  // Left side: (fcmp ord x, x).
+  if (LCC != ISD::SETO || X != LHS.getOperand(1))
+    return SDValue();
+
+  // Right side: (fcmp une (fabs x), +inf).
+  if (RCC != ISD::SETUNE)
+    return SDValue();
+
+  const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+  if (!C1 || !C1->isInfinity() || C1->isNegative())
+    return SDValue();
+
+  const uint32_t Mask = SIInstrFlags::N_NORMAL |
+                        SIInstrFlags::N_SUBNORMAL |
+                        SIInstrFlags::N_ZERO |
+                        SIInstrFlags::P_ZERO |
+                        SIInstrFlags::P_SUBNORMAL |
+                        SIInstrFlags::P_NORMAL;
+
+  static_assert(((~(SIInstrFlags::S_NAN |
+                    SIInstrFlags::Q_NAN |
+                    SIInstrFlags::N_INFINITY |
+                    SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
+                "mask not equal");
+
+  SDLoc DL(N);
+  return DCI.DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
+                         X, DCI.DAG.getConstant(Mask, DL, MVT::i32));
+}
+
+/// Target combines for OR:
+///  - i1: or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2);
+///  - i64: (or x, (zero_extend i32:y)) is rebuilt from 32-bit halves so only
+///    the low half needs an OR;
+///  - i64: an OR with a constant may be split into 32-bit halves.
+/// Returns a null SDValue when nothing applies.
+SDValue SITargetLowering::performOrCombine(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::i1) {
+    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+      SDValue Src = LHS.getOperand(0);
+      if (Src != RHS.getOperand(0))
+        return SDValue();
+
+      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+      if (!CLHS || !CRHS)
+        return SDValue();
+
+      // Only 10 bits are used.
+      static const uint32_t MaxMask = 0x3ff;
+
+      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+      SDLoc DL(N);
+      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
+                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
+    }
+
+    return SDValue();
+  }
+
+  if (VT != MVT::i64)
+    return SDValue();
+
+  // TODO: This could be a generic combine with a predicate for extracting the
+  // high half of an integer being free.
+
+  // (or i64:x, (zero_extend i32:y)) ->
+  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
+  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
+      RHS.getOpcode() != ISD::ZERO_EXTEND)
+    std::swap(LHS, RHS);
+
+  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
+    SDValue ExtSrc = RHS.getOperand(0);
+    EVT SrcVT = ExtSrc.getValueType();
+    if (SrcVT == MVT::i32) {
+      SDLoc SL(N);
+      SDValue LowLHS, HiBits;
+      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
+      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
+
+      DCI.AddToWorklist(LowOr.getNode());
+      DCI.AddToWorklist(HiBits.getNode());
+
+      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                LowOr, HiBits);
+      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+    }
+  }
+
+  // Use the node's own operands here rather than LHS/RHS: the two locals may
+  // have been swapped above, in which case LHS would be the very constant
+  // found in operand 1 and the split would OR the constant with itself,
+  // dropping the other operand.
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (CRHS) {
+    if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
+                                                 N->getOperand(0), CRHS))
+      return Split;
+  }
+
+  return SDValue();
+}
+
+/// Target combine for XOR: a 64-bit XOR with a constant right operand may be
+/// split into two 32-bit operations. Returns a null SDValue otherwise.
+SDValue SITargetLowering::performXorCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!CRHS)
+    return SDValue();
+
+  // A null result from the helper means "no split", which is exactly what is
+  // reported to the caller.
+  return splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, N->getOperand(0),
+                                  CRHS);
+}
+
+/// Fold trivial FP_CLASS nodes: a zero class mask always yields false, and an
+/// undef source yields undef.
+SDValue SITargetLowering::performClassCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  // fp_class x, 0 -> false
+  const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (CMask && CMask->isNullValue())
+    return DAG.getConstant(0, SDLoc(N), MVT::i1);
+
+  // fp_class undef, m -> undef
+  if (N->getOperand(0).isUndef())
+    return DAG.getUNDEF(MVT::i1);
+
+  return SDValue();
+}
+
+// Constant fold canonicalize.
+//
+// Only handles a constant FP operand: denormals are replaced by 0.0 when the
+// subtarget does not have denormals enabled for the type, NaNs are replaced by
+// the canonical quiet NaN, and any other constant is returned unchanged.
+SDValue SITargetLowering::performFCanonicalizeCombine(
+  SDNode *N,
+  DAGCombinerInfo &DCI) const {
+  ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+  if (!CFP)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  const APFloat &C = CFP->getValueAPF();
+  const EVT VT = N->getValueType(0);
+
+  // Flush denormals to 0 if not enabled.
+  if (C.isDenormal()) {
+    const bool Flush =
+        (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+        (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) ||
+        (VT == MVT::f16 && !Subtarget->hasFP16Denormals());
+    if (Flush)
+      return DAG.getConstantFP(0.0, SDLoc(N), VT);
+  }
+
+  if (C.isNaN()) {
+    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
+
+    // Quiet a signaling NaN, and make sure a quiet NaN has the canonical NaN
+    // bitpattern.
+    //
+    // TODO: Can we use -1 as the canonical NaN value since it's an inline
+    // immediate?
+    if (C.isSignaling() ||
+        C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
+      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+  }
+
+  return SDValue(CFP, 0);
+}
+
+/// Map a two operand min/max opcode to the matching three operand
+/// AMDGPUISD min3/max3 opcode. Any other opcode is unreachable.
+static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
+  switch (Opc) {
+  // Minimums.
+  case ISD::FMINNUM: return AMDGPUISD::FMIN3;
+  case ISD::SMIN:    return AMDGPUISD::SMIN3;
+  case ISD::UMIN:    return AMDGPUISD::UMIN3;
+  // Maximums.
+  case ISD::FMAXNUM: return AMDGPUISD::FMAX3;
+  case ISD::SMAX:    return AMDGPUISD::SMAX3;
+  case ISD::UMAX:    return AMDGPUISD::UMAX3;
+  default:
+    llvm_unreachable("Not a min/max opcode");
+  }
+}
+
+/// Try to turn min(max(x, K0), K1) with constants K0 < K1 into med3(x, K0, K1).
+///
+/// \p Op0 is the inner max node (its operand 1 supplies K0), \p Op1 supplies
+/// K1 and \p Signed selects SMED3 / signed compare versus UMED3 / unsigned
+/// compare. For i16 the operands are extended to i32, the med3 is done in
+/// i32 and the result truncated back; for any other type the med3 is emitted
+/// directly in that type. Returns a null SDValue if either bound is not a
+/// constant or K0 >= K1.
+static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+                                        SDValue Op0, SDValue Op1, bool Signed) {
+  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
+  if (!K1)
+    return SDValue();
+
+  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+  if (!K0)
+    return SDValue();
+
+  if (Signed) {
+    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
+      return SDValue();
+  } else {
+    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
+      return SDValue();
+  }
+
+  const unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
+  EVT VT = K0->getValueType(0);
+
+  if (VT != MVT::i16)
+    return DAG.getNode(Med3Opc, SL, VT,
+                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+
+  // i16 path: widen to i32. The extension nodes are only built here, where
+  // they are used, so the direct path above neither creates dead nodes nor
+  // asks for an "extension" to a type narrower than the operand.
+  MVT NVT = MVT::i32;
+  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+
+  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+}
+
+/// Return true if \p Op can be treated as never being a signaling NaN: either
+/// the target reports no floating point exceptions, or the DAG can prove the
+/// value is never a NaN at all.
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+  const bool HasFPExceptions =
+      DAG.getTargetLoweringInfo().hasFloatingPointExceptions();
+  return !HasFPExceptions || DAG.isKnownNeverNaN(Op);
+}
+
+/// Try to turn fmin(fmax(x, K0), K1) with FP constants into fmed3(x, K0, K1).
+/// \p Op0 is the inner max node and \p Op1 the outer bound. Returns a null
+/// SDValue if a bound is not constant, K0 compares greater than K1, or x is
+/// not known to be free of signaling NaNs.
+static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+                                       SDValue Op0, SDValue Op1) {
+  ConstantFPSDNode *UpperK = dyn_cast<ConstantFPSDNode>(Op1);
+  if (!UpperK)
+    return SDValue();
+
+  ConstantFPSDNode *LowerK = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
+  if (!LowerK)
+    return SDValue();
+
+  // Reject K0 > K1 (NaN inputs should have folded away by now).
+  if (LowerK->getValueAPF().compare(UpperK->getValueAPF()) ==
+      APFloat::cmpGreaterThan)
+    return SDValue();
+
+  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
+  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
+  // give the other result, which is different from med3 with a NaN input.
+  SDValue Var = Op0.getOperand(0);
+  if (!isKnownNeverSNan(DAG, Var))
+    return SDValue();
+
+  return DAG.getNode(AMDGPUISD::FMED3, SL, LowerK->getValueType(0),
+                     Var, SDValue(LowerK, 0), SDValue(UpperK, 0));
+}
+
+/// Combine nested min/max nodes:
+///  - max(max(a, b), c) / min(min(a, b), c) (either operand order) into the
+///    three operand max3 / min3 form, except for the legacy FP opcodes;
+///  - min(max(x, K0), K1) into med3(x, K0, K1) for signed, unsigned and f32.
+/// The inner node must have a single use in every case.
+SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  const unsigned Opc = N->getOpcode();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Only do this if the inner op has one use since this will just increases
+  // register pressure for no benefit.
+
+  const bool IsLegacy =
+      Opc == AMDGPUISD::FMIN_LEGACY || Opc == AMDGPUISD::FMAX_LEGACY;
+  if (!IsLegacy) {
+    // max(max(a, b), c) -> max3(a, b, c)
+    // min(min(a, b), c) -> min3(a, b, c)
+    if (Op0.getOpcode() == Opc && Op0.hasOneUse())
+      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), SDLoc(N),
+                         N->getValueType(0),
+                         Op0.getOperand(0), Op0.getOperand(1), Op1);
+
+    // Try commuted.
+    // max(a, max(b, c)) -> max3(a, b, c)
+    // min(a, min(b, c)) -> min3(a, b, c)
+    if (Op1.getOpcode() == Opc && Op1.hasOneUse())
+      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), SDLoc(N),
+                         N->getValueType(0),
+                         Op0, Op1.getOperand(0), Op1.getOperand(1));
+  }
+
+  const unsigned InnerOpc = Op0.getOpcode();
+
+  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
+  if (Opc == ISD::SMIN && InnerOpc == ISD::SMAX && Op0.hasOneUse()) {
+    SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true);
+    if (Med3)
+      return Med3;
+  }
+
+  if (Opc == ISD::UMIN && InnerOpc == ISD::UMAX && Op0.hasOneUse()) {
+    SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false);
+    if (Med3)
+      return Med3;
+  }
+
+  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
+  const bool IsFPMinOfMax =
+      (Opc == ISD::FMINNUM && InnerOpc == ISD::FMAXNUM) ||
+      (Opc == AMDGPUISD::FMIN_LEGACY && InnerOpc == AMDGPUISD::FMAX_LEGACY);
+  if (IsFPMinOfMax && N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+    SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1);
+    if (Res)
+      return Res;
+  }
+
+  return SDValue();
+}
+
+/// Pick the fused multiply-add opcode to use when combining \p N0 and \p N1:
+/// ISD::FMAD when denormals are not supported for the type, ISD::FMA when
+/// fusion is permitted and FMA is faster than separate mul/add, or 0 when no
+/// fusion should be done.
+unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
+                                          const SDNode *N0,
+                                          const SDNode *N1) const {
+  const EVT VT = N0->getValueType(0);
+
+  // Only do this if we are not trying to support denormals. v_mad_f32 does not
+  // support denormals ever.
+  const bool F32NoDenormals = VT == MVT::f32 && !Subtarget->hasFP32Denormals();
+  const bool F16NoDenormals = VT == MVT::f16 && !Subtarget->hasFP16Denormals();
+  if (F32NoDenormals || F16NoDenormals)
+    return ISD::FMAD;
+
+  // Fusion is allowed globally, or locally when both nodes carry the unsafe
+  // algebra flag. The per-node flags are only inspected when the global
+  // options do not already allow it.
+  const TargetOptions &Options = DAG.getTarget().Options;
+  bool FusionAllowed = Options.AllowFPOpFusion == FPOpFusion::Fast ||
+                       Options.UnsafeFPMath;
+  if (!FusionAllowed)
+    FusionAllowed =
+        cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() &&
+        cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra();
+
+  if (FusionAllowed && isFMAFasterThanFMulAndFAdd(VT))
+    return ISD::FMA;
+
+  return 0;
+}
+
+/// After DAG legalization, fold an fadd that has (fadd a, a) as one operand
+/// into a fused multiply-add by 2.0, when getFusedOpcode permits fusing.
+SDValue SITargetLowering::performFAddCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  const EVT VT = N->getValueType(0);
+  assert(!VT.isVector());
+
+  SDLoc SL(N);
+
+  // These should really be instruction patterns, but writing patterns with
+  // source modifiers is a pain.
+
+  // Doubled is the candidate (fadd a, a) operand and Addend the other one.
+  auto TryFuse = [&](SDValue Doubled, SDValue Addend) -> SDValue {
+    if (Doubled.getOpcode() != ISD::FADD)
+      return SDValue();
+
+    SDValue A = Doubled.getOperand(0);
+    if (A != Doubled.getOperand(1))
+      return SDValue();
+
+    const unsigned FusedOp = getFusedOpcode(DAG, N, Doubled.getNode());
+    if (FusedOp == 0)
+      return SDValue();
+
+    const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+    return DAG.getNode(FusedOp, SL, VT, A, Two, Addend);
+  };
+
+  // fadd (fadd (a, a), b) -> mad 2.0, a, b
+  if (SDValue Fused = TryFuse(N->getOperand(0), N->getOperand(1)))
+    return Fused;
+
+  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
+  return TryFuse(N->getOperand(1), N->getOperand(0));
+}
+
+/// After DAG legalization, fold an fsub that has (fadd a, a) as one operand
+/// into a fused multiply-add by 2.0 or -2.0, when getFusedOpcode permits
+/// fusing.
+SDValue SITargetLowering::performFSubCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  const EVT VT = N->getValueType(0);
+  assert(!VT.isVector());
+
+  // Try to get the fneg to fold into the source modifier. This undoes generic
+  // DAG combines and folds them into the mad.
+  //
+  // Only do this if we are not trying to support denormals. v_mad_f32 does
+  // not support denormals ever.
+  SDValue Minuend = N->getOperand(0);
+  SDValue Subtrahend = N->getOperand(1);
+
+  // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
+  if (Minuend.getOpcode() == ISD::FADD &&
+      Minuend.getOperand(0) == Minuend.getOperand(1)) {
+    if (unsigned FusedOp = getFusedOpcode(DAG, N, Minuend.getNode())) {
+      const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+      SDValue NegC = DAG.getNode(ISD::FNEG, SL, VT, Subtrahend);
+      return DAG.getNode(FusedOp, SL, VT, Minuend.getOperand(0), Two, NegC);
+    }
+  }
+
+  // (fsub c, (fadd a, a)) -> mad -2.0, a, c
+  if (Subtrahend.getOpcode() == ISD::FADD &&
+      Subtrahend.getOperand(0) == Subtrahend.getOperand(1)) {
+    if (unsigned FusedOp = getFusedOpcode(DAG, N, Subtrahend.getNode())) {
+      const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
+      return DAG.getNode(FusedOp, SL, VT, Subtrahend.getOperand(0), NegTwo,
+                         Minuend);
+    }
+  }
+
+  return SDValue();
+}
+
+/// Target combine for SETCC: recognizes the isinf idiom
+/// (fcmp oeq (fabs x), +inf) on f32, f64 and (with 16-bit instructions) f16
+/// and replaces it with an FP_CLASS test for either infinity. Returns a null
+/// SDValue when the pattern does not match.
+SDValue SITargetLowering::performSetCCCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = LHS.getValueType();
+
+  // Only scalar f32 / f64 are handled, plus f16 when the subtarget has 16-bit
+  // instructions. Every other compared type (including everything else on
+  // subtargets without 16-bit instructions) must be rejected here, because
+  // the replacement below always produces a scalar i1 FP_CLASS.
+  if (VT != MVT::f32 && VT != MVT::f64 &&
+      (!Subtarget->has16BitInsts() || VT != MVT::f16))
+    return SDValue();
+
+  // Match isinf pattern
+  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
+    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+    if (!CRHS)
+      return SDValue();
+
+    const APFloat &APF = CRHS->getValueAPF();
+    if (APF.isInfinity() && !APF.isNegative()) {
+      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
+      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
+                         DAG.getConstant(Mask, SL, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
+/// Combine for CVT_F32_UBYTE{0..3}. A constant right shift feeding the
+/// conversion is folded into the byte index when the combined bit offset is
+/// a byte boundary below 32; otherwise the source is simplified using only
+/// the byte that the conversion actually reads.
+SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
+                                                     DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  const unsigned ByteIdx = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+  SDValue Src = N->getOperand(0);
+
+  // Look through a zero extend when searching for the shift.
+  SDValue Shift = Src;
+  if (Shift.getOpcode() == ISD::ZERO_EXTEND)
+    Shift = Shift.getOperand(0);
+
+  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
+  if (Shift.getOpcode() == ISD::SRL) {
+    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
+    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
+    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
+    const ConstantSDNode *ShAmt =
+        dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+    if (ShAmt) {
+      SDValue Shifted = Shift.getOperand(0);
+      SDValue Wide = DAG.getZExtOrTrunc(Shifted, SDLoc(Shifted),
+                                        EVT(MVT::i32));
+
+      const unsigned SrcOffset = ShAmt->getZExtValue() + 8 * ByteIdx;
+      if (SrcOffset < 32 && SrcOffset % 8 == 0)
+        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
+                           MVT::f32, Wide);
+    }
+  }
+
+  // Only the selected byte of the 32-bit source is demanded.
+  APInt Demanded = APInt::getBitsSet(32, 8 * ByteIdx, 8 * ByteIdx + 8);
+
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                        !DCI.isBeforeLegalizeOps());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const bool Changed =
+      TLO.ShrinkDemandedConstant(Src, Demanded) ||
+      TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO);
+  if (Changed)
+    DCI.CommitTargetLoweringOpt(TLO);
+
+  return SDValue();
+}
+
+/// Dispatch \p N to the matching SI specific combine. Opcodes without a
+/// dedicated combine, and cases that decline to act, fall back to
+/// AMDGPUTargetLowering::PerformDAGCombine.
+SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  case ISD::FADD:
+    return performFAddCombine(N, DCI);
+  case ISD::FSUB:
+    return performFSubCombine(N, DCI);
+  case ISD::SETCC:
+    return performSetCCCombine(N, DCI);
+
+  case ISD::FMAXNUM:
+  case ISD::FMINNUM:
+  case ISD::SMAX:
+  case ISD::SMIN:
+  case ISD::UMAX:
+  case ISD::UMIN:
+  case AMDGPUISD::FMIN_LEGACY:
+  case AMDGPUISD::FMAX_LEGACY:
+    // Only after DAG legalization, not for f64, and only when optimizing.
+    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+        N->getValueType(0) != MVT::f64 &&
+        getTargetMachine().getOptLevel() > CodeGenOpt::None)
+      return performMinMaxCombine(N, DCI);
+    break;
+
+  case ISD::LOAD:
+  case ISD::STORE:
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE:
+  case ISD::ATOMIC_CMP_SWAP:
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case AMDGPUISD::ATOMIC_INC:
+  case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
+    if (!DCI.isBeforeLegalize())
+      return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
+    break;
+
+  case ISD::AND:
+    return performAndCombine(N, DCI);
+  case ISD::OR:
+    return performOrCombine(N, DCI);
+  case ISD::XOR:
+    return performXorCombine(N, DCI);
+  case AMDGPUISD::FP_CLASS:
+    return performClassCombine(N, DCI);
+  case ISD::FCANONICALIZE:
+    return performFCanonicalizeCombine(N, DCI);
+
+  case AMDGPUISD::FRACT:
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::RSQ_LEGACY:
+  case AMDGPUISD::RSQ_CLAMP:
+  case AMDGPUISD::LDEXP: {
+    // An undef first operand makes the whole node undef.
+    SDValue Src = N->getOperand(0);
+    if (Src.isUndef())
+      return Src;
+    break;
+  }
+
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return performUCharToFloatCombine(N, DCI);
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3:
+    return performCvtF32UByteNCombine(N, DCI);
+
+  default:
+    break;
+  }
+
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+}
+
+/// \brief Helper function for adjustWritemask: translate a subregister index
+/// (sub0..sub3) to its lane number 0..3. Any other index maps to lane 0.
+static unsigned SubIdx2Lane(unsigned Idx) {
+  if (Idx == AMDGPU::sub1)
+    return 1;
+  if (Idx == AMDGPU::sub2)
+    return 2;
+  if (Idx == AMDGPU::sub3)
+    return 3;
+
+  // sub0 and every unrecognized index.
+  return 0;
+}
+
+/// \brief Adjust the writemask of MIMG instructions: shrink the dmask operand of \p Node to the components read through EXTRACT_SUBREG users and renumber those users' subregister indices (a single remaining component is replaced by a COPY_TO_REGCLASS instead). Gives up without changes if a user is not an EXTRACT_SUBREG or if two users read the same lane. \p Node is updated to the node returned by UpdateNodeOperands.
+void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+ SelectionDAG &DAG) const {
+ SDNode *Users[4] = { }; // the single user found for each packed result lane, or null
+ unsigned Lane = 0;
+ unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; // NOTE(review): dmask position is derived from the operand/value count; the layouts behind 9 vs. other are not visible here - confirm
+ unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
+ unsigned NewDmask = 0; // accumulates one bit per component that has a user
+
+ // Try to figure out the used register components
+ for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
+ I != E; ++I) {
+
+ // Abort if we can't understand the usage
+ if (!I->isMachineOpcode() ||
+ I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+ return;
+
+ // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+ // Note that subregs are packed, i.e. Lane==0 is the first bit set
+ // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
+ // set, etc.
+ Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+
+ // Set which texture component corresponds to the lane.
+ unsigned Comp; // always assigned: the loop below runs at least once (i <= Lane)
+ for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+ assert(Dmask);
+ Comp = countTrailingZeros(Dmask); // lowest component still set
+ Dmask &= ~(1 << Comp); // clear it so the next iteration finds the following one
+ }
+
+ // Abort if we have more than one user per component
+ if (Users[Lane])
+ return;
+
+ Users[Lane] = *I;
+ NewDmask |= 1 << Comp;
+ }
+
+ // Abort if there's no change
+ if (NewDmask == OldDmask)
+ return;
+
+ // Adjust the writemask in the node: copy all operands, replacing only the dmask
+ std::vector<SDValue> Ops;
+ Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
+ Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
+ Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
+ Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
+
+ // If we only got one lane, replace it with a copy
+ // (if NewDmask has only one bit set...)
+ if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
+ SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
+ MVT::i32);
+ SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ SDLoc(), Users[Lane]->getValueType(0),
+ SDValue(Node, 0), RC);
+ DAG.ReplaceAllUsesWith(Users[Lane], Copy); // Lane still holds the lane of the last user visited above
+ return;
+ }
+
+ // Update the users of the node with the new indices: the remaining users, in lane order, get sub0, sub1, ... in turn
+ for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+
+ SDNode *User = Users[i];
+ if (!User)
+ continue;
+
+ SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+
+ switch (Idx) { // advance to the next subregister index; sub3 stays put
+ default: break;
+ case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
+ case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
+ case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+ }
+ }
+}
+
+/// Return true if \p Op is a frame index node, looking through one
+/// AssertZext wrapper.
+static bool isFrameIndexOp(SDValue Op) {
+  const SDValue Inner =
+      Op.getOpcode() == ISD::AssertZext ? Op.getOperand(0) : Op;
+  return isa<FrameIndexSDNode>(Inner);
+}
+
+/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
+/// with frame index operands.
+/// LLVM assumes that the inputs to these instructions are registers, so each
+/// frame index operand is wrapped in an S_MOV_B32 and the node is updated in
+/// place with the new operand list.
+void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
+                                                     SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  SmallVector<SDValue, 8> NewOps;
+
+  for (unsigned OpIdx = 0; OpIdx < Node->getNumOperands(); ++OpIdx) {
+    SDValue Operand = Node->getOperand(OpIdx);
+
+    if (isFrameIndexOp(Operand)) {
+      SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
+                                       Operand.getValueType(), Operand);
+      NewOps.push_back(SDValue(Mov, 0));
+    } else {
+      NewOps.push_back(Operand);
+    }
+  }
+
+  DAG.UpdateNodeOperands(Node, NewOps);
+}
+
+/// \brief Fold the instructions after selecting them.
+///
+/// Non-storing, non-gather4 MIMG nodes get their writemask adjusted, and
+/// INSERT_SUBREG / REG_SEQUENCE nodes get frame index operands legalized.
+/// Returns the (possibly updated) node.
+SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
+                                          SelectionDAG &DAG) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  const unsigned Opcode = Node->getMachineOpcode();
+
+  const bool IsLoadLikeMIMG = TII->isMIMG(Opcode) &&
+                              !TII->get(Opcode).mayStore() &&
+                              !TII->isGather4(Opcode);
+  if (IsLoadLikeMIMG)
+    adjustWritemask(Node, DAG);
+
+  const bool NeedsRegisterInputs = Opcode == AMDGPU::INSERT_SUBREG ||
+                                   Opcode == AMDGPU::REG_SEQUENCE;
+  if (NeedsRegisterInputs)
+    legalizeTargetIndependentNode(Node, DAG);
+
+  return Node;
+}
+
+/// \brief Post-selection fixups on a machine instruction:
+///  - VOP3 instructions get their operands legalized;
+///  - MIMG instructions with a 128-bit result get the register class and
+///    opcode matching the number of bits set in the writemask;
+///  - atomics whose result is unused are switched to the no return version.
+void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                                     SDNode *Node) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  if (TII->isVOP3(MI.getOpcode())) {
+    // Make sure constant bus requirements are respected.
+    TII->legalizeOperandsVOP3(MRI, MI);
+    return;
+  }
+
+  if (TII->isMIMG(MI)) {
+    const unsigned DstReg = MI.getOperand(0).getReg();
+    // TODO: Need mapping tables to handle other cases (register classes).
+    if (MRI.getRegClass(DstReg) != &AMDGPU::VReg_128RegClass)
+      return;
+
+    const unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
+    const unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
+
+    // Count the enabled channels among the low four writemask bits.
+    unsigned NumChannels = 0;
+    for (unsigned Bit = 1; Bit != 16; Bit <<= 1) {
+      if (Writemask & Bit)
+        ++NumChannels;
+    }
+
+    const TargetRegisterClass *NewRC = nullptr;
+    switch (NumChannels) {
+    case 1: NewRC = &AMDGPU::VGPR_32RegClass; break;
+    case 2: NewRC = &AMDGPU::VReg_64RegClass; break;
+    case 3: NewRC = &AMDGPU::VReg_96RegClass; break;
+    default: return;
+    }
+
+    MI.setDesc(TII->get(TII->getMaskedMIMGOp(MI.getOpcode(), NumChannels)));
+    MRI.setRegClass(DstReg, NewRC);
+    return;
+  }
+
+  // Replace unused atomics with the no return version.
+  const int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
+  if (NoRetAtomicOp == -1)
+    return;
+
+  if (!Node->hasAnyUseOfValue(0)) {
+    MI.setDesc(TII->get(NoRetAtomicOp));
+    MI.RemoveOperand(0);
+    return;
+  }
+
+  // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
+  // instruction, because the return type of these instructions is a vec2 of
+  // the memory type, so it can be tied to the input operand.
+  // This means these instructions always have a use, so we need to add a
+  // special case to check if the atomic has only one extract_subreg use,
+  // which itself has no uses.
+  if (!Node->hasNUsesOfValue(1, 0))
+    return;
+
+  SDNode *FirstUser = *Node->use_begin();
+  if (!FirstUser->isMachineOpcode() ||
+      FirstUser->getMachineOpcode() != AMDGPU::EXTRACT_SUBREG ||
+      FirstUser->hasAnyUseOfValue(0))
+    return;
+
+  const unsigned Def = MI.getOperand(0).getReg();
+
+  // Change this into a noret atomic.
+  MI.setDesc(TII->get(NoRetAtomicOp));
+  MI.RemoveOperand(0);
+
+  // If we only remove the def operand from the atomic instruction, the
+  // extract_subreg will be left with a use of a vreg without a def.
+  // So we need to insert an implicit_def to avoid machine verifier
+  // errors.
+  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+          TII->get(AMDGPU::IMPLICIT_DEF), Def);
+}
+
+/// Emit an S_MOV_B32 of \p Val (as an i32 target constant) and return the
+/// value it produces.
+static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
+                              uint64_t Val) {
+  SDValue Imm = DAG.getTargetConstant(Val, DL, MVT::i32);
+  MachineSDNode *Mov =
+      DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Imm);
+  return SDValue(Mov, 0);
+}
+
+/// Build a 128-bit REG_SEQUENCE whose sub0_sub1 half is \p Ptr and whose
+/// sub2_sub3 half holds constants: zero in the low dword and the upper 32 bits
+/// of the default resource data format in the high dword.
+MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
+                                                const SDLoc &DL,
+                                                SDValue Ptr) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  // Assemble the constant half on its own before the full 128-bit register.
+  // When several resource descriptors are built, this lets the 2-component
+  // register be CSE'd.
+  SDValue HalfClassID =
+      DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32);
+  SDValue ZeroDword = buildSMovImm32(DAG, DL, 0);
+  SDValue Sub0Idx = DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+  SDValue FormatHiDword =
+      buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32);
+  SDValue Sub1Idx = DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+  const SDValue ConstHalfOps[] = {
+    HalfClassID, ZeroDword, Sub0Idx, FormatHiDword, Sub1Idx
+  };
+  MachineSDNode *ConstHalf =
+      DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, ConstHalfOps);
+
+  // Join the pointer with the constant half.
+  SDValue FullClassID =
+      DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
+  SDValue LoPairIdx = DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
+  SDValue HiPairIdx = DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
+
+  const SDValue RsrcOps[] = {
+    FullClassID, Ptr, LoPairIdx, SDValue(ConstHalf, 0), HiPairIdx
+  };
+  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, RsrcOps);
+}
+
+/// \brief Build a 128-bit resource descriptor from a pointer and constants.
+///
+/// sub0 and sub1 take the low and high dwords of \p Ptr (with \p RsrcDword1
+/// OR'd into the high dword when it is non-zero); sub2 and sub3 take the low
+/// and high 32 bits of \p RsrcDword2And3.
+///
+/// NOTE(review): this function does not set any particular bit itself. The
+/// earlier comment here described the 'Add TID' bit, where the TID (Thread ID)
+/// is multiplied by the stride value (bits [61:48] of the resource descriptor)
+/// to create an offset, which is added to the resource pointer. Presumably
+/// callers request that behavior through the Rsrc arguments - confirm.
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Ptr, uint32_t RsrcDword1,
+ uint64_t RsrcDword2And3) const {
+ SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
+ SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
+ // Merge the caller-supplied dword1 bits into the high half of the pointer.
+ if (RsrcDword1) {
+ PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
+ DAG.getConstant(RsrcDword1, DL, MVT::i32)),
+ 0);
+ }
+
+ SDValue DataLo = buildSMovImm32(DAG, DL,
+ RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
+ SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
+
+ const SDValue Ops[] = {
+ DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ PtrLo,
+ DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ PtrHi,
+ DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
+ DataLo,
+ DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
+ DataHi,
+ DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
+ };
+
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+}
+
+/// Create the live-in through the AMDGPUTargetLowering implementation, then
+/// return a CopyFromReg of the resulting register chained to the entry node.
+SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+                                               const TargetRegisterClass *RC,
+                                               unsigned Reg, EVT VT) const {
+  SDValue LiveIn = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
+  const unsigned LiveInReg = cast<RegisterSDNode>(LiveIn)->getReg();
+
+  SDValue Entry = DAG.getEntryNode();
+  return DAG.getCopyFromReg(Entry, SDLoc(Entry), LiveInReg, VT);
+}
+
+//===----------------------------------------------------------------------===//
+// SI Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// Map an inline asm constraint to a register / register class pair.
+///
+/// Illegal value types are deferred to the generic TargetLowering handling.
+/// Single-letter constraints 's'/'r' and 'v' select a register class by the
+/// bit width of \p VT. Longer constraints whose second character is 'v' or 's'
+/// try to name one specific 32-bit register by index. Anything left over is
+/// also handed to TargetLowering.
+std::pair<unsigned, const TargetRegisterClass *>
+SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ if (!isTypeLegal(VT))
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ // Every case of the inner switches returns, so 's'/'r' never falls through
+ // into 'v'.
+ case 's':
+ case 'r':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
+ case 16:
+ return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
+ case 64:
+ return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
+ case 128:
+ return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
+ case 256:
+ return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ }
+
+ case 'v':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
+ case 16:
+ return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
+ case 64:
+ return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
+ case 96:
+ return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
+ case 128:
+ return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
+ case 256:
+ return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
+ case 512:
+ return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
+ }
+ }
+ }
+
+ if (Constraint.size() > 1) {
+ // The second character picks the register file; the remainder is parsed as
+ // a decimal register index.
+ // NOTE(review): presumably the constraint arrives as "{vN}" / "{sN}"; if so
+ // substr(2) still contains the closing brace - confirm that getAsInteger
+ // accepts that, otherwise this path always defers to TargetLowering.
+ const TargetRegisterClass *RC = nullptr;
+ if (Constraint[1] == 'v') {
+ RC = &AMDGPU::VGPR_32RegClass;
+ } else if (Constraint[1] == 's') {
+ RC = &AMDGPU::SGPR_32RegClass;
+ }
+
+ if (RC) {
+ uint32_t Idx;
+ bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
+ if (!Failed && Idx < RC->getNumRegs())
+ return std::make_pair(RC->getRegister(Idx), RC);
+ }
+ }
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// Classify the single-letter constraints 's' and 'v' as register-class
+/// constraints; everything else is left to TargetLowering.
+SITargetLowering::ConstraintType
+SITargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    const char Letter = Constraint[0];
+    if (Letter == 's' || Letter == 'v')
+      return C_RegisterClass;
+  }
+
+  return TargetLowering::getConstraintType(Constraint);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
new file mode 100644
index 000000000000..9583f6db6faa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -0,0 +1,200 @@
+//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+#include "SIInstrInfo.h"
+
+namespace llvm {
+
+/// SI DAG lowering implementation, layered on top of AMDGPUTargetLowering.
+class SITargetLowering final : public AMDGPUTargetLowering {
+ // Private helpers for lowering individual operations.
+ SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
+ unsigned Offset) const;
+ SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
+ SDValue Chain, unsigned Offset, bool Signed) const;
+ SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+ SelectionDAG &DAG) const override;
+ SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
+ MVT VT, unsigned Offset) const;
+
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+
+ /// \brief Converts \p Op, which must be of floating point type, to the
+ /// floating point type \p VT, by either extending or truncating it.
+ SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
+ SDValue Op,
+ const SDLoc &DL,
+ EVT VT) const;
+
+ /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
+ SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
+ SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
+
+ /// Takes the node by reference, so the caller's pointer may be updated.
+ void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+
+ // DAG combine helpers.
+ SDValue performUCharToFloatCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const;
+ SDValue performSHLPtrCombine(SDNode *N,
+ unsigned AS,
+ DAGCombinerInfo &DCI) const;
+
+ SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue splitBinaryBitConstantOp(DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ const ConstantSDNode *CRHS) const;
+
+ SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ unsigned getFusedOpcode(const SelectionDAG &DAG,
+ const SDNode *N0, const SDNode *N1) const;
+ SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+ bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
+
+ bool isCFIntrinsic(const SDNode *Intr) const;
+
+ void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
+
+ /// \returns True if fixup needs to be emitted for given global value \p GV,
+ /// false otherwise.
+ bool shouldEmitFixup(const GlobalValue *GV) const;
+
+ /// \returns True if GOT relocation needs to be emitted for given global value
+ /// \p GV, false otherwise.
+ bool shouldEmitGOTReloc(const GlobalValue *GV) const;
+
+ /// \returns True if PC-relative relocation needs to be emitted for given
+ /// global value \p GV, false otherwise.
+ bool shouldEmitPCReloc(const GlobalValue *GV) const;
+
+public:
+ SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
+
+ const SISubtarget *getSubtarget() const;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
+ unsigned IntrinsicID) const override;
+
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
+ EVT /*VT*/) const override;
+
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+ unsigned Align,
+ bool *IsFast) const override;
+
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ bool isMemOpUniform(const SDNode *N) const;
+ bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+ bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const override;
+
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ bool isTypeDesirableForOp(unsigned Op, EVT VT) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ MachineBasicBlock *splitKillBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
+ bool enableAggressiveFMAFusion(EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const override;
+
+ SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const override;
+ void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
+
+ // Resource descriptor builders.
+ MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Ptr) const;
+ MachineSDNode *buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr,
+ uint32_t RsrcDword1, uint64_t RsrcDword2And3) const;
+ // Inline assembly support.
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+ SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
+ SDValue V) const;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
new file mode 100644
index 000000000000..91e4bf755c53
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -0,0 +1,329 @@
+//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass inserts branches on the 0 exec mask over divergent
+/// branches when it's expected that jumping over the untaken control flow will
+/// be cheaper than having every workitem no-op through it.
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-skips"
+
+namespace {
+
+// Hidden command line option; defaults to 12. Its value is copied into
+// SIInsertSkips::SkipThreshold at the start of every function.
+static cl::opt<unsigned> SkipThresholdFlag(
+ "amdgpu-skip-threshold",
+ cl::desc("Number of instructions before jumping over divergent control flow"),
+ cl::init(12), cl::Hidden);
+
+/// Machine function pass that inserts exec-mask based skip branches and
+/// expands the control-flow pseudos handled in runOnMachineFunction.
+class SIInsertSkips : public MachineFunctionPass {
+private:
+ // Set up per function in runOnMachineFunction.
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+ // Instruction count at which shouldSkip() reports that a skip is worthwhile.
+ unsigned SkipThreshold;
+
+ /// Decide whether the blocks laid out from \p From up to (not including)
+ /// \p To are worth branching over.
+ bool shouldSkip(const MachineBasicBlock &From,
+ const MachineBasicBlock &To) const;
+
+ /// For pixel shaders, optionally add a block that exports to the NULL target
+ /// and ends the program when exec is zero. Returns true if it was inserted.
+ bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+
+ /// Emit the exec mask update for a kill pseudo in front of \p MI.
+ void kill(MachineInstr &MI);
+
+ /// Create a new block directly after \p MBB and make it a successor.
+ MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ /// Returns true if a branch over the block was inserted.
+ bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
+
+public:
+ static char ID;
+
+ SIInsertSkips() :
+ MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI insert s_cbranch_execz instructions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace
+
+char SIInsertSkips::ID = 0;
+
+INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+
+char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
+
+/// Return true if \p Opc is one of the target-independent opcodes that
+/// shouldSkip() leaves out of its instruction count.
+static bool opcodeEmitsNoInsts(unsigned Opc) {
+  return Opc == TargetOpcode::IMPLICIT_DEF ||
+         Opc == TargetOpcode::KILL ||
+         Opc == TargetOpcode::BUNDLE ||
+         Opc == TargetOpcode::CFI_INSTRUCTION ||
+         Opc == TargetOpcode::EH_LABEL ||
+         Opc == TargetOpcode::GC_LABEL ||
+         Opc == TargetOpcode::DBG_VALUE;
+}
+
+/// Walk the blocks in layout order from \p From up to (but not including)
+/// \p To and report whether branching over them is worthwhile: either the
+/// instruction count reaches SkipThreshold, or a VCC-conditional branch is
+/// found before the threshold is reached.
+bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
+                               const MachineBasicBlock &To) const {
+  if (From.succ_empty())
+    return false;
+
+  const MachineFunction *MF = From.getParent();
+  unsigned InstCount = 0;
+
+  MachineFunction::const_iterator BlockIt(&From);
+  const MachineFunction::const_iterator StopIt(&To);
+  const MachineFunction::const_iterator FuncEnd = MF->end();
+
+  while (BlockIt != FuncEnd && BlockIt != StopIt) {
+    const MachineBasicBlock &MBB = *BlockIt;
+    ++BlockIt;
+
+    for (const MachineInstr &MI : MBB) {
+      // Stop scanning this block once the threshold has been reached.
+      if (InstCount >= SkipThreshold)
+        break;
+
+      const unsigned Opc = MI.getOpcode();
+      if (opcodeEmitsNoInsts(Opc))
+        continue;
+
+      // FIXME: Since this is required for correctness, this should be inserted
+      // during SILowerControlFlow.
+
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+      // when EXEC = 0. We should skip the loop lest it becomes infinite.
+      if (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ)
+        return true;
+
+      if (!MI.isInlineAsm()) {
+        ++InstCount;
+      } else {
+        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+        const char *AsmStr = MI.getOperand(0).getSymbolName();
+
+        // inlineasm length estimate is number of bytes assuming the longest
+        // instruction.
+        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+        InstCount += MaxAsmSize / MAI->getMaxInstLength();
+      }
+
+      if (InstCount >= SkipThreshold)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// For AMDGPU_PS functions where shouldSkip() says the code from the current
+/// block to the last block is worth skipping, append an S_CBRANCH_EXECNZ to
+/// \p NextBB at the end of MI's block and insert a new block after it that
+/// exports to the NULL target and ends the program.
+///
+/// \returns true if the skip block was inserted, false if nothing changed.
+bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction *MF = MBB.getParent();
+
+ if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
+ !shouldSkip(MBB, MBB.getParent()->back()))
+ return false;
+
+ MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // If the exec mask is non-zero, skip the next two instructions
+ BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&NextBB);
+
+ MachineBasicBlock::iterator Insert = SkipBB->begin();
+
+ // Exec mask is zero: Export to NULL target...
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
+
+ // ... and terminate wavefront.
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+
+ return true;
+}
+
+/// Emit the exec mask update for the kill pseudo \p MI directly in front of
+/// it. The pseudo itself is left in place for the caller to erase.
+void SIInsertSkips::kill(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  const MachineOperand &Op = MI.getOperand(0);
+
+#ifndef NDEBUG
+  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
+  // Kill is only allowed in pixel / geometry shaders.
+  assert(CallConv == CallingConv::AMDGPU_PS ||
+         CallConv == CallingConv::AMDGPU_GS);
+#endif
+
+  // Clear this thread from the exec mask if the operand is negative.
+  if (!Op.isImm()) {
+    // Non-constant operand: compare against zero and update exec.
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
+        .addImm(0)
+        .addOperand(Op);
+    return;
+  }
+
+  // Constant operand: set exec mask to 0 when the sign bit is set, otherwise
+  // do nothing.
+  if (Op.getImm() & 0x80000000)
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+        .addImm(0);
+}
+
+/// Create an empty block, place it immediately after \p MBB in layout order,
+/// and register it as a successor of \p MBB. The iterator \p I is not used.
+MachineBasicBlock *SIInsertSkips::insertSkipBlock(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock();
+
+  MachineFunction::iterator InsertPos = std::next(MachineFunction::iterator(MBB));
+  MF->insert(InsertPos, NewBB);
+  MBB.addSuccessor(NewBB);
+
+  return NewBB;
+}
+
+// Returns true if a branch over the block was inserted.
+// The candidate region runs from the first successor of SrcMBB up to the
+// destination block named by the mask branch MI.
+bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
+                                   MachineBasicBlock &SrcMBB) {
+  MachineBasicBlock *Target = MI.getOperand(0).getMBB();
+  const MachineBasicBlock &FirstSucc = **SrcMBB.succ_begin();
+
+  if (shouldSkip(FirstSucc, *Target)) {
+    // Place the S_CBRANCH_EXECZ right after the mask branch.
+    const DebugLoc &DL = MI.getDebugLoc();
+    MachineBasicBlock::iterator After = std::next(MI.getIterator());
+    BuildMI(SrcMBB, After, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+        .addMBB(Target);
+    return true;
+  }
+
+  return false;
+}
+
+/// Walk every block of \p MF, expanding the pseudos handled below and
+/// inserting skip branches where shouldSkip() asks for them.
+///
+/// \returns true if the function was modified in any way. This includes
+/// erased fall-through S_BRANCH instructions and rewritten SI_RETURNs, which
+/// previously went unreported.
+bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  SkipThreshold = SkipThresholdFlag;
+
+  bool HaveKill = false;
+  bool MadeChange = false;
+
+  // Track depth of exec mask, divergent branches.
+  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
+
+  MachineFunction::iterator NextBB;
+
+  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+
+  // NextBB is computed up front because blocks may be inserted while the
+  // current one is being processed.
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; BI = NextBB) {
+    NextBB = std::next(BI);
+    MachineBasicBlock &MBB = *BI;
+
+    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
+      // Reached convergence point for last divergent branch.
+      ExecBranchStack.pop_back();
+    }
+
+    if (HaveKill && ExecBranchStack.empty()) {
+      HaveKill = false;
+
+      // TODO: Insert skip if exec is 0?
+    }
+
+    // Next is computed up front because the current instruction may be erased.
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      case AMDGPU::SI_MASK_BRANCH: {
+        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
+        MadeChange |= skipMaskBranch(MI, MBB);
+        break;
+      }
+      case AMDGPU::S_BRANCH: {
+        // Optimize out branches to the next block.
+        // FIXME: Shouldn't this be handled by BranchFolding?
+        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+          MI.eraseFromParent();
+          MadeChange = true;
+        }
+        break;
+      }
+      case AMDGPU::SI_KILL_TERMINATOR: {
+        MadeChange = true;
+        kill(MI);
+
+        if (ExecBranchStack.empty()) {
+          if (skipIfDead(MI, *NextBB)) {
+            // A block was inserted after MBB: refresh the block iterators and
+            // stop scanning the current block.
+            NextBB = std::next(BI);
+            BE = MF.end();
+            Next = MBB.end();
+          }
+        } else {
+          HaveKill = true;
+        }
+
+        MI.eraseFromParent();
+        break;
+      }
+      case AMDGPU::SI_RETURN: {
+        // FIXME: Should move somewhere else
+        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+        // because external bytecode will be appended at the end.
+        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+          // SI_RETURN is not the last instruction. Add an empty block at
+          // the end and jump there.
+          if (!EmptyMBBAtEnd) {
+            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+            MF.insert(MF.end(), EmptyMBBAtEnd);
+          }
+
+          MBB.addSuccessor(EmptyMBBAtEnd);
+          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+            .addMBB(EmptyMBBAtEnd);
+          I->eraseFromParent();
+          MadeChange = true;
+        }
+        break;
+      }
+      default:
+        break;
+      }
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
new file mode 100644
index 000000000000..202a1e9ed8ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -0,0 +1,679 @@
+//===-- SIInsertWaits.cpp - Insert wait instructions ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define DEBUG_TYPE "si-insert-waits"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+namespace {
+
+/// \brief One variable for each of the hardware counters
+/// The same three values can be addressed by name (Named.VM / EXP / LGKM) or
+/// by index (Array[0..2]) when looping over all of them.
+typedef union {
+ struct {
+ unsigned VM;
+ unsigned EXP;
+ unsigned LGKM;
+ } Named;
+ unsigned Array[3];
+
+} Counters;
+
+/// Coarse classification of an instruction; used for LastOpcodeType.
+typedef enum {
+ OTHER,
+ SMEM,
+ VMEM
+} InstType;
+
+// One Counters entry per register slot.
+// NOTE(review): presumably indexed by hardware register encoding (see
+// getRegInterval) - confirm that 512 covers every encoding.
+typedef Counters RegCounters[512];
+// [first, second) range of register slots.
+typedef std::pair<unsigned, unsigned> RegInterval;
+
+/// Machine function pass that inserts wait instructions for asynchronous
+/// memory operations.
+/// NOTE(review): the constructor initializes only ST, TII, TRI,
+/// ExpInstrTypesSeen and VCCZCorrupt; the remaining members (including MRI)
+/// are presumably set in runOnMachineFunction - confirm.
+class SIInsertWaits : public MachineFunctionPass {
+
+private:
+ const SISubtarget *ST;
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ IsaVersion IV;
+
+ /// \brief Constant zero value
+ static const Counters ZeroCounts;
+
+ /// \brief Hardware limits
+ Counters HardwareLimits;
+
+ /// \brief Counter values we have already waited on.
+ Counters WaitedOn;
+
+ /// \brief Counter values that we must wait on before the next counter
+ /// increase.
+ Counters DelayedWaitOn;
+
+ /// \brief Counter values for last instruction issued.
+ Counters LastIssued;
+
+ /// \brief Registers used by async instructions.
+ RegCounters UsedRegs;
+
+ /// \brief Registers defined by async instructions.
+ RegCounters DefinedRegs;
+
+ /// \brief Different export instruction types seen since last wait.
+ unsigned ExpInstrTypesSeen;
+
+ /// \brief Type of the last opcode.
+ InstType LastOpcodeType;
+
+ bool LastInstWritesM0;
+
+ /// Whether or not we have flat operations outstanding.
+ bool IsFlatOutstanding;
+
+ /// \brief Whether the machine function returns void
+ bool ReturnsVoid;
+
+ /// Whether the VCCZ bit is possibly corrupt
+ bool VCCZCorrupt;
+
+ /// \brief Get increment/decrement amount for this instruction.
+ Counters getHwCounts(MachineInstr &MI);
+
+ /// \brief Is operand relevant for async execution?
+ bool isOpRelevant(MachineOperand &Op);
+
+ /// \brief Get register interval an operand affects.
+ RegInterval getRegInterval(const TargetRegisterClass *RC,
+ const MachineOperand &Reg) const;
+
+ /// \brief Handle instructions async components
+ void pushInstruction(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const Counters& Increment);
+
+ /// \brief Insert the actual wait instruction
+ bool insertWait(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const Counters &Counts);
+
+ /// \brief Handle existing wait instructions (from intrinsics)
+ void handleExistingWait(MachineBasicBlock::iterator I);
+
+ /// \brief Do we need def2def checks?
+ bool unorderedDefines(MachineInstr &MI);
+
+ /// \brief Resolve all operand dependencies to counter requirements
+ Counters handleOperands(MachineInstr &MI);
+
+ /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
+ void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
+ /// Return true if there are LGKM instructions that haven't been waited on
+ /// yet.
+ bool hasOutstandingLGKM() const;
+
+public:
+ static char ID;
+
+ SIInsertWaits() :
+ MachineFunctionPass(ID),
+ ST(nullptr),
+ TII(nullptr),
+ TRI(nullptr),
+ ExpInstrTypesSeen(0),
+ VCCZCorrupt(false) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI insert wait instructions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
+ "SI Insert Waits", false, false)
+INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
+ "SI Insert Waits", false, false)
+
+char SIInsertWaits::ID = 0;
+
+char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
+
+/// Factory for the pass; returns a newly allocated SIInsertWaits instance.
+FunctionPass *llvm::createSIInsertWaitsPass() {
+ return new SIInsertWaits();
+}
+
+const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
+
+/// Return true if \p MI is an S_CBRANCH_VCCNZ / S_CBRANCH_VCCZ whose operand 1
+/// is not marked undef.
+static bool readsVCCZ(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::S_CBRANCH_VCCNZ:
+  case AMDGPU::S_CBRANCH_VCCZ:
+    return !MI.getOperand(1).isUndef();
+  default:
+    return false;
+  }
+}
+
+/// LGKM operations are outstanding whenever the last issued LGKM count differs
+/// from the count already waited on.
+bool SIInsertWaits::hasOutstandingLGKM() const {
+  const unsigned Waited = WaitedOn.Named.LGKM;
+  const unsigned Issued = LastIssued.Named.LGKM;
+  return Waited != Issued;
+}
+
+/// Compute by how much \p MI bumps each hardware counter, based on the
+/// instruction's TSFlags. VM and EXP are 0 or 1; LGKM is 0, 1 or 2.
+Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  Counters Counts = { { 0, 0, 0 } };
+
+  Counts.Named.VM = (Flags & SIInstrFlags::VM_CNT) ? 1 : 0;
+
+  // Only consider stores or EXP for EXP_CNT
+  Counts.Named.EXP =
+      ((Flags & SIInstrFlags::EXP_CNT) && MI.mayStore()) ? 1 : 0;
+
+  if (!(Flags & SIInstrFlags::LGKM_CNT)) {
+    Counts.Named.LGKM = 0;
+    return Counts;
+  }
+
+  // LGKM may use larger values
+  if (!TII->isSMRD(MI)) {
+    // DS
+    Counts.Named.LGKM = 1;
+    return Counts;
+  }
+
+  if (MI.getNumOperands() == 0) {
+    // s_dcache_inv etc. do not have a destination register. Assume we
+    // want a wait on these.
+    // XXX - What is the right value?
+    Counts.Named.LGKM = 1;
+    return Counts;
+  }
+
+  assert(MI.getOperand(0).isReg() &&
+         "First LGKM operand must be a register!");
+
+  // XXX - What if this is a write into a super register?
+  const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
+  const unsigned SizeInBytes = RC->getSize();
+  Counts.Named.LGKM = SizeInBytes > 4 ? 2 : 1;
+  return Counts;
+}
+
+/// Decide whether \p Op has to be tracked for asynchronous execution.
+///
+/// Non-register operands and registers outside an allocatable class never
+/// are. Defs always are, as is every register operand of an export. For
+/// stores, only the operand holding the stored value is relevant; how that
+/// operand is found depends on the instruction kind (see below).
+bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
+ // Constants are always irrelevant
+ if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
+ return false;
+
+ // Defines are always relevant
+ if (Op.isDef())
+ return true;
+
+ // For exports all registers are relevant.
+ // TODO: Skip undef/disabled registers.
+ MachineInstr &MI = *Op.getParent();
+ if (TII->isEXP(MI))
+ return true;
+
+ // For stores the stored value is also relevant
+ if (!MI.getDesc().mayStore())
+ return false;
+
+ // Check if this operand is the value being stored.
+ // Special case for DS/FLAT instructions, since the address
+ // operand comes before the value operand and it may have
+ // multiple data operands.
+
+ if (TII->isDS(MI)) {
+ MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+ if (Data0 && Op.isIdenticalTo(*Data0))
+ return true;
+
+ MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
+ return Data1 && Op.isIdenticalTo(*Data1);
+ }
+
+ // A FLAT instruction whose vdata does not match falls through to the
+ // generic first-use check below.
+ if (TII->isFLAT(MI)) {
+ MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ if (Data && Op.isIdenticalTo(*Data))
+ return true;
+ }
+
+ // NOTE: This assumes that the value operand is before the
+ // address operand, and that there is only one value operand.
+ // Only the first register use operand is compared against Op.
+ for (MachineInstr::mop_iterator I = MI.operands_begin(),
+ E = MI.operands_end(); I != E; ++I) {
+
+ if (I->isReg() && I->isUse())
+ return Op.isIdenticalTo(*I);
+ }
+
+ return false;
+}
+
+RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
+ const MachineOperand &Reg) const {
+ unsigned Size = RC->getSize();
+ assert(Size >= 4);
+
+ RegInterval Result; // Half-open range [first, second) of tracking slots.
+ Result.first = TRI->getEncodingValue(Reg.getReg());
+ Result.second = Result.first + Size / 4; // One slot per 4 units of Size.
+
+ return Result;
+}
+
+void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const Counters &Increment) {
+
+ // Add Increment to LastIssued; Limit records each counter that was bumped
+ Counters Limit = ZeroCounts;
+ unsigned Sum = 0;
+
+ if (TII->mayAccessFlatAddressSpace(*I))
+ IsFlatOutstanding = true;
+
+ for (unsigned i = 0; i < 3; ++i) {
+ LastIssued.Array[i] += Increment.Array[i];
+ if (Increment.Array[i])
+ Limit.Array[i] = LastIssued.Array[i];
+ Sum += Increment.Array[i];
+ }
+
+ // If we don't increase anything then that's it
+ if (Sum == 0) {
+ LastOpcodeType = OTHER;
+ return;
+ }
+
+ if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
+ // or SMEM clause, respectively.
+ //
+ // The temporary workaround is to break the clauses with S_NOP.
+ //
+ // The proper solution would be to allocate registers such that all source
+ // and destination registers don't overlap, e.g. this is illegal:
+ // r0 = load r2
+ // r2 = load r0
+ if (LastOpcodeType == VMEM && Increment.Named.VM) {
+ // Insert a NOP to break the clause (only VMEM clauses are broken here).
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ LastInstWritesM0 = false;
+ }
+
+ if (TII->isSMRD(*I))
+ LastOpcodeType = SMEM;
+ else if (Increment.Named.VM)
+ LastOpcodeType = VMEM;
+ }
+
+ // Remember which kinds of EXP_CNT instruction we have seen (1 = EXP, 2 = other)
+ if (Increment.Named.EXP) {
+ ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
+ }
+
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = I->getOperand(i);
+ if (!isOpRelevant(Op))
+ continue;
+
+ const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
+ RegInterval Interval = getRegInterval(RC, Op);
+ for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+ // Remember which registers we define
+ if (Op.isDef())
+ DefinedRegs[j] = Limit;
+
+ // and which ones we use
+ if (Op.isUse())
+ UsedRegs[j] = Limit;
+ }
+ }
+}
+
+bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const Counters &Required) {
+
+ // End of program? No need to wait on anything
+ // A function not returning void needs to wait, because other bytecode will
+ // be appended after it and we don't know what it will be.
+ if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
+ return false;
+
+ // Figure out if the async instructions execute in order
+ bool Ordered[3];
+
+ // VM_CNT is always ordered except when there are flat instructions, which
+ // can return out of order.
+ Ordered[0] = !IsFlatOutstanding;
+
+ // EXP_CNT is unordered if we have both EXP & VM-writes
+ Ordered[1] = ExpInstrTypesSeen == 3; // NOTE(review): looks inverted vs. the comment above - confirm.
+
+ // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
+ Ordered[2] = false;
+
+ // The values we are going to put into the S_WAITCNT instruction
+ Counters Counts = HardwareLimits;
+
+ // Do we really need to wait?
+ bool NeedWait = false;
+
+ for (unsigned i = 0; i < 3; ++i) {
+
+ if (Required.Array[i] <= WaitedOn.Array[i])
+ continue;
+
+ NeedWait = true;
+
+ if (Ordered[i]) {
+ unsigned Value = LastIssued.Array[i] - Required.Array[i];
+
+ // Adjust the value to the real hardware possibilities.
+ Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
+
+ } else
+ Counts.Array[i] = 0; // Unordered: wait for everything on this counter.
+
+ // Remember what we have waited on.
+ WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+ }
+
+ if (!NeedWait)
+ return false;
+
+ // Reset EXP_CNT instruction types
+ if (Counts.Named.EXP == 0)
+ ExpInstrTypesSeen = 0;
+
+ // Build the wait instruction
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(encodeWaitcnt(IV,
+ Counts.Named.VM,
+ Counts.Named.EXP,
+ Counts.Named.LGKM));
+
+ LastOpcodeType = OTHER;
+ LastInstWritesM0 = false;
+ IsFlatOutstanding = false;
+ return true; // True only when an S_WAITCNT was inserted before I.
+}
+
+/// \brief Raise each counter in \p Dst to at least the value in \p Src.
+static void increaseCounters(Counters &Dst, const Counters &Src) {
+
+ for (unsigned i = 0; i < 3; ++i)
+ Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
+}
+
+/// \brief Return true if any of the three counters in \p Counter is non-zero.
+static bool countersNonZero(const Counters &Counter) {
+ for (unsigned i = 0; i < 3; ++i)
+ if (Counter.Array[i])
+ return true;
+ return false;
+}
+
+void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
+ assert(I->getOpcode() == AMDGPU::S_WAITCNT);
+
+ unsigned Imm = I->getOperand(0).getImm();
+ Counters Counts, WaitOn; // Decoded immediate, and the matching LastIssued-based targets.
+
+ Counts.Named.VM = decodeVmcnt(IV, Imm);
+ Counts.Named.EXP = decodeExpcnt(IV, Imm);
+ Counts.Named.LGKM = decodeLgkmcnt(IV, Imm);
+
+ for (unsigned i = 0; i < 3; ++i) {
+ if (Counts.Array[i] <= LastIssued.Array[i])
+ WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+ else
+ WaitOn.Array[i] = 0; // The wait allows more in flight than were issued.
+ }
+
+ increaseCounters(DelayedWaitOn, WaitOn); // Deferred; merged in by runOnMachineFunction.
+}
+
+Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
+
+ Counters Result = ZeroCounts;
+
+ // For each register touched by this instruction, raise the result to the
+ // counter values recorded for that register by earlier instructions.
+ //
+ // TODO: We could probably just look at explicit operands if we removed VCC /
+ // EXEC from SMRD dest reg classes.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
+ RegInterval Interval = getRegInterval(RC, Op);
+ for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+ if (Op.isDef()) { // A def must wait for pending uses and pending defs.
+ increaseCounters(Result, UsedRegs[j]);
+ increaseCounters(Result, DefinedRegs[j]);
+ }
+
+ if (Op.isUse()) // A use only has to wait for pending defs.
+ increaseCounters(Result, DefinedRegs[j]);
+ }
+ }
+
+ return Result;
+}
+
+void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return; // Nothing is done before VOLCANIC_ISLANDS.
+
+ // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
+ if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
+ LastInstWritesM0 = false;
+ return;
+ }
+
+ // Record whether this instruction writes M0
+ LastInstWritesM0 = false;
+
+ unsigned NumOperands = I->getNumOperands();
+ for (unsigned i = 0; i < NumOperands; i++) {
+ const MachineOperand &Op = I->getOperand(i);
+
+ if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
+ LastInstWritesM0 = true;
+ }
+}
+
+// Inserts S_WAITCNTs for MF. FIXME: Also insert the waits listed in Table 4.2
+// "Required User-Inserted Wait States" around other non-memory instructions.
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
+ bool Changes = false; // Returned: true when the function was modified.
+
+ ST = &MF.getSubtarget<SISubtarget>();
+ TII = ST->getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ IV = getIsaVersion(ST->getFeatureBits());
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ HardwareLimits.Named.VM = getVmcntBitMask(IV);
+ HardwareLimits.Named.EXP = getExpcntBitMask(IV);
+ HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV);
+
+ WaitedOn = ZeroCounts;
+ DelayedWaitOn = ZeroCounts;
+ LastIssued = ZeroCounts;
+ LastOpcodeType = OTHER;
+ LastInstWritesM0 = false;
+ IsFlatOutstanding = false; // NOTE(review): VCCZCorrupt / ExpInstrTypesSeen are not reset here - confirm.
+ ReturnsVoid = MFI->returnsVoid();
+
+ memset(&UsedRegs, 0, sizeof(UsedRegs));
+ memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+
+ SmallVector<MachineInstr *, 4> RemoveMI; // Pre-existing S_WAITCNTs, erased at the end.
+ SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+
+ bool HaveScalarStores = false;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+
+ if (!HaveScalarStores && TII->isScalarStore(*I))
+ HaveScalarStores = true;
+
+ if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+ // There is a hardware bug on CI/SI where SMRD instruction may corrupt
+ // vccz bit, so when we detect that an instruction may read from a
+ // corrupt vccz bit, we need to:
+ // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
+ // complete.
+ // 2. Restore the correct value of vccz by writing the current value
+ // of vcc back to vcc.
+
+ if (TII->isSMRD(I->getOpcode())) {
+ VCCZCorrupt = true;
+ } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
+ // FIXME: We only care about SMRD instructions here, not LDS or GDS.
+ // Whenever we store a value in vcc, the correct value of vccz is
+ // restored.
+ VCCZCorrupt = false;
+ }
+
+ // Check if we need to apply the bug work-around
+ if (VCCZCorrupt && readsVCCZ(*I)) {
+ DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
+
+ // Wait on everything, not just LGKM. vccz reads usually come from
+ // terminators, and we always wait on everything at the end of the
+ // block, so if we only wait on LGKM here, we might end up with
+ // another s_waitcnt inserted right after this if there are non-LGKM
+ // instructions still outstanding.
+ insertWait(MBB, I, LastIssued);
+ Changes = true; // The S_MOV_B64 below always modifies the block.
+ // Restore the vccz bit. Any time a value is written to vcc, the vcc
+ // bit is updated, so we can restore the bit by reading the value of
+ // vcc and then writing it back to the register.
+ BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+ }
+ }
+
+ // Record pre-existing, explicitly requested waits
+ if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+ handleExistingWait(*I);
+ RemoveMI.push_back(&*I);
+ continue;
+ }
+
+ Counters Required;
+
+ // Wait for everything before a barrier.
+ //
+ // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+ // but we also want to wait for any other outstanding transfers before
+ // signalling other hardware blocks
+ if ((I->getOpcode() == AMDGPU::S_BARRIER &&
+ ST->needWaitcntBeforeBarrier()) ||
+ I->getOpcode() == AMDGPU::S_SENDMSG)
+ Required = LastIssued;
+ else
+ Required = handleOperands(*I);
+
+ Counters Increment = getHwCounts(*I);
+
+ if (countersNonZero(Required) || countersNonZero(Increment))
+ increaseCounters(Required, DelayedWaitOn); // Fold in the recorded explicit waits.
+
+ Changes |= insertWait(MBB, I, Required);
+
+ pushInstruction(MBB, I, Increment);
+ handleSendMsg(MBB, I); // NOTE(review): this and pushInstruction may add S_NOP without setting Changes.
+
+ if (I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN)
+ EndPgmBlocks.push_back(&MBB);
+ }
+
+ // Wait for everything at the end of the MBB
+ Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+ }
+
+ if (HaveScalarStores) {
+ // If scalar writes are used, the cache must be flushed or else the next
+ // wave to reuse the same scratch memory can be clobbered.
+ //
+ // Insert s_dcache_wb at wave termination points if there were any scalar
+ // stores, and only if the cache hasn't already been flushed. This could be
+ // improved by looking across blocks for flushes in postdominating blocks
+ // from the stores but an explicitly requested flush is probably very rare.
+ for (MachineBasicBlock *MBB : EndPgmBlocks) {
+ bool SeenDCacheWB = false;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+
+ if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+ SeenDCacheWB = true;
+ else if (TII->isScalarStore(*I))
+ SeenDCacheWB = false;
+
+ // FIXME: It would be better to insert this before a waitcnt if any.
+ if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
+ Changes = true;
+ BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ }
+ }
+ }
+ }
+
+ for (MachineInstr *I : RemoveMI)
+ I->eraseFromParent();
+
+ return Changes || !RemoveMI.empty(); // Erasing existing waits is a change too.
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
new file mode 100644
index 000000000000..5523ec142ba7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -0,0 +1,285 @@
+//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SI Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class InstSI <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> :
+ AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+
+ // Low bits - basic encoding information.
+ field bit SALU = 0;
+ field bit VALU = 0;
+
+ // SALU instruction formats.
+ field bit SOP1 = 0;
+ field bit SOP2 = 0;
+ field bit SOPC = 0;
+ field bit SOPK = 0;
+ field bit SOPP = 0;
+
+ // VALU instruction formats.
+ field bit VOP1 = 0;
+ field bit VOP2 = 0;
+ field bit VOPC = 0;
+ field bit VOP3 = 0;
+ field bit VINTRP = 0;
+ field bit SDWA = 0;
+ field bit DPP = 0;
+
+ // Memory instruction formats.
+ field bit MUBUF = 0;
+ field bit MTBUF = 0;
+ field bit SMRD = 0;
+ field bit MIMG = 0;
+ field bit EXP = 0;
+ field bit FLAT = 0;
+ field bit DS = 0;
+
+ // Pseudo instruction formats.
+ field bit VGPRSpill = 0;
+ field bit SGPRSpill = 0;
+
+ // High bits - other information.
+ field bit VM_CNT = 0;
+ field bit EXP_CNT = 0;
+ field bit LGKM_CNT = 0;
+
+ // Whether WQM _must_ be enabled for this instruction.
+ field bit WQM = 0;
+
+ // Whether WQM _must_ be disabled for this instruction.
+ field bit DisableWQM = 0;
+
+ field bit Gather4 = 0;
+
+ // Most SOPK instructions treat the immediate as a signed 16-bit value;
+ // some use it as unsigned.
+ field bit SOPKZext = 0;
+
+ // This is an s_store_dword* instruction that requires a cache flush
+ // on wave termination. It is necessary to distinguish from mayStore
+ // SMEM instructions like the cache flush ones.
+ field bit ScalarStore = 0;
+
+ // Whether the operands can be ignored when computing the
+ // instruction size.
+ field bit FixedSize = 0;
+
+ // This bit tells the assembler to use the 32-bit encoding in case it
+ // is unable to infer the encoding from the operands.
+ field bit VOPAsmPrefer32Bit = 0;
+
+ // These need to be kept in sync with the enum in SIInstrFlags.
+ let TSFlags{0} = SALU;
+ let TSFlags{1} = VALU;
+
+ let TSFlags{2} = SOP1;
+ let TSFlags{3} = SOP2;
+ let TSFlags{4} = SOPC;
+ let TSFlags{5} = SOPK;
+ let TSFlags{6} = SOPP;
+
+ let TSFlags{7} = VOP1;
+ let TSFlags{8} = VOP2;
+ let TSFlags{9} = VOPC;
+ let TSFlags{10} = VOP3;
+
+ let TSFlags{13} = VINTRP; // Bits 11-12 are not assigned here.
+ let TSFlags{14} = SDWA;
+ let TSFlags{15} = DPP;
+
+ let TSFlags{16} = MUBUF;
+ let TSFlags{17} = MTBUF;
+ let TSFlags{18} = SMRD;
+ let TSFlags{19} = MIMG;
+ let TSFlags{20} = EXP;
+ let TSFlags{21} = FLAT;
+ let TSFlags{22} = DS;
+
+ let TSFlags{23} = VGPRSpill;
+ let TSFlags{24} = SGPRSpill;
+
+ let TSFlags{32} = VM_CNT; // Bits 25-31 are not assigned here.
+ let TSFlags{33} = EXP_CNT;
+ let TSFlags{34} = LGKM_CNT;
+
+ let TSFlags{35} = WQM;
+ let TSFlags{36} = DisableWQM;
+ let TSFlags{37} = Gather4;
+
+ let TSFlags{38} = SOPKZext;
+ let TSFlags{39} = ScalarStore;
+ let TSFlags{40} = FixedSize;
+ let TSFlags{41} = VOPAsmPrefer32Bit;
+
+ let SchedRW = [Write32Bit];
+
+ field bits<1> DisableSIDecoder = 0;
+ field bits<1> DisableVIDecoder = 0;
+ field bits<1> DisableDecoder = 0;
+
+ let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1); // 1 iff DisableDecoder is set.
+ let AsmVariantName = AMDGPUAsmVariants.Default;
+}
+
+class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+ : InstSI<outs, ins, "", pattern> {
+ let isPseudo = 1; // Pseudo instruction with an empty asm string.
+ let isCodeGenOnly = 1;
+}
+
+class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+ : PseudoInstSI<outs, ins, pattern> {
+ let SALU = 1; // Pseudo flagged as SALU.
+}
+
+class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+ : PseudoInstSI<outs, ins, pattern> {
+ let VALU = 1; // Pseudo flagged as VALU; it also reads EXEC.
+ let Uses = [EXEC];
+}
+
+class CFPseudoInstSI<dag outs, dag ins, list<dag> pattern = [],
+ bit UseExec = 0, bit DefExec = 0> :
+ SPseudoInstSI<outs, ins, pattern> {
+
+ let Uses = !if(UseExec, [EXEC], []);
+ let Defs = !if(DefExec, [EXEC, SCC], [SCC]); // SCC is always listed as a def.
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+}
+
+class Enc32 {
+ field bits<32> Inst;
+ int Size = 4; // 32-bit encoding = 4 bytes.
+}
+
+class Enc64 {
+ field bits<64> Inst;
+ int Size = 8; // 64-bit encoding = 8 bytes.
+}
+
+class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
+
+class VINTRPe <bits<2> op> : Enc32 {
+ bits<8> vdst;
+ bits<8> vsrc;
+ bits<2> attrchan;
+ bits<6> attr;
+
+ let Inst{7-0} = vsrc;
+ let Inst{9-8} = attrchan;
+ let Inst{15-10} = attr;
+ let Inst{17-16} = op;
+ let Inst{25-18} = vdst;
+ let Inst{31-26} = 0x32; // fixed encoding field
+}
+
+class MIMGe <bits<7> op> : Enc64 {
+ bits<8> vdata;
+ bits<4> dmask;
+ bits<1> unorm;
+ bits<1> glc;
+ bits<1> da;
+ bits<1> r128;
+ bits<1> tfe;
+ bits<1> lwe;
+ bits<1> slc;
+ bits<8> vaddr;
+ bits<7> srsrc;
+ bits<7> ssamp;
+
+ let Inst{11-8} = dmask; // Inst{7-0} is not assigned here.
+ let Inst{12} = unorm;
+ let Inst{13} = glc;
+ let Inst{14} = da;
+ let Inst{15} = r128;
+ let Inst{16} = tfe;
+ let Inst{17} = lwe;
+ let Inst{24-18} = op;
+ let Inst{25} = slc;
+ let Inst{31-26} = 0x3c; // fixed encoding field
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2}; // The low two bits of srsrc are not encoded.
+ let Inst{57-53} = ssamp{6-2}; // Likewise for ssamp; Inst{63-58} is not assigned here.
+}
+
+class EXPe : Enc64 {
+ bits<4> en;
+ bits<6> tgt;
+ bits<1> compr;
+ bits<1> done;
+ bits<1> vm;
+ bits<8> vsrc0;
+ bits<8> vsrc1;
+ bits<8> vsrc2;
+ bits<8> vsrc3;
+
+ let Inst{3-0} = en;
+ let Inst{9-4} = tgt;
+ let Inst{10} = compr;
+ let Inst{11} = done;
+ let Inst{12} = vm; // Inst{25-13} is not assigned here.
+ let Inst{31-26} = 0x3e; // fixed encoding field
+ let Inst{39-32} = vsrc0;
+ let Inst{47-40} = vsrc1;
+ let Inst{55-48} = vsrc2;
+ let Inst{63-56} = vsrc3;
+}
+
+let Uses = [EXEC] in {
+
+class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
+ let VINTRP = 1;
+ // VINTRP instructions read parameter values from LDS, but these parameter
+ // values are stored outside of the LDS memory that is allocated to the
+ // shader for general purpose use.
+ //
+ // While it may be possible for ds_read/ds_write instructions to access
+ // the parameter values in LDS, this would essentially be an out-of-bounds
+ // memory access which we consider to be undefined behavior.
+ //
+ // So even though these instructions read memory, this memory is outside the
+ // addressable memory space for the shader, and we consider these instructions
+ // to be readnone.
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+}
+
+class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern> {
+ let EXP = 1;
+ let EXP_CNT = 1;
+ let mayLoad = 0; // Set to 1 if done bit is set.
+ let mayStore = 1;
+ let UseNamedOperandTable = 1;
+ let Uses = [EXEC]; // Repeats the enclosing 'let Uses = [EXEC] in'.
+ let SchedRW = [WriteExport];
+}
+
+} // End Uses = [EXEC]
+
+class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
+
+ let VM_CNT = 1; // MIMG instructions are flagged for both VM_CNT and EXP_CNT.
+ let EXP_CNT = 1;
+ let MIMG = 1;
+ let Uses = [EXEC];
+
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0; // XXX - Is this correct?
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
new file mode 100644
index 000000000000..26a8d22062a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -0,0 +1,3642 @@
+//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Implementation of TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNHazardRecognizer.h"
+#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+// Must be at least 4 to be able to branch over minimum unconditional branch
+// code. This is only for making it possible to write reasonably small tests for
+// long branches.
+static cl::opt<unsigned>
+BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
+ cl::desc("Restrict range of branch instructions (DEBUG)"));
+
+SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
+ : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+
+//===----------------------------------------------------------------------===//
+// TargetInstrInfo callbacks
+//===----------------------------------------------------------------------===//
+
+static unsigned getNumOperandsNoGlue(SDNode *Node) {
+ unsigned N = Node->getNumOperands();
+ while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
+ --N;
+ return N; // Operand count with trailing glue operands dropped.
+}
+
+static SDValue findChainOperand(SDNode *Load) {
+ SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
+ assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
+ return LastOp; // The last non-glue operand, asserted above to be the chain.
+}
+
+/// \brief Returns true if both nodes have the same value for the given
+/// operand \p OpName, or if both nodes do not have this operand.
+static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
+ unsigned Opc0 = N0->getMachineOpcode();
+ unsigned Opc1 = N1->getMachineOpcode();
+
+ int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
+ int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
+
+ if (Op0Idx == -1 && Op1Idx == -1)
+ return true;
+
+
+ if ((Op0Idx == -1 && Op1Idx != -1) ||
+ (Op1Idx == -1 && Op0Idx != -1))
+ return false; // Only one of the two nodes has the operand.
+
+ // getNamedOperandIdx returns the index for the MachineInstr's operands,
+ // which includes the result as the first operand. We are indexing into the
+ // MachineSDNode's operands, so we need to skip the result operand to get
+ // the real index.
+ --Op0Idx;
+ --Op1Idx;
+
+ return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
+}
+
+bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const {
+ // TODO: The generic check fails for VALU instructions that should be
+ // rematerializable due to implicit reads of exec. We really want all of the
+ // generic logic except for that exec check. Until then, list the moves.
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::V_MOV_B32_e64:
+ case AMDGPU::V_MOV_B64_PSEUDO:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
+ int64_t &Offset0,
+ int64_t &Offset1) const {
+ if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
+ return false; // Offset0 / Offset1 are only written when true is returned.
+
+ unsigned Opc0 = Load0->getMachineOpcode();
+ unsigned Opc1 = Load1->getMachineOpcode();
+
+ // Make sure both are actually loads.
+ if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
+ return false;
+
+ if (isDS(Opc0) && isDS(Opc1)) {
+
+ // FIXME: Handle loads with differing operand counts.
+ if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
+ return false;
+
+ // Check base reg.
+ if (Load0->getOperand(1) != Load1->getOperand(1))
+ return false;
+
+ // Check chain.
+ if (findChainOperand(Load0) != findChainOperand(Load1))
+ return false;
+
+ // Skip read2 / write2 variants for simplicity.
+ // TODO: We should report true if the used offsets are adjacent (excluding
+ // st64 versions).
+ if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
+ AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
+ return false;
+
+ Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
+ Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
+ return true;
+ }
+
+ if (isSMRD(Opc0) && isSMRD(Opc1)) {
+ assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
+
+ // Check base reg.
+ if (Load0->getOperand(0) != Load1->getOperand(0))
+ return false;
+
+ const ConstantSDNode *Load0Offset =
+ dyn_cast<ConstantSDNode>(Load0->getOperand(1));
+ const ConstantSDNode *Load1Offset =
+ dyn_cast<ConstantSDNode>(Load1->getOperand(1));
+
+ if (!Load0Offset || !Load1Offset)
+ return false; // Non-constant offsets cannot be compared.
+
+ // Check chain.
+ if (findChainOperand(Load0) != findChainOperand(Load1))
+ return false;
+
+ Offset0 = Load0Offset->getZExtValue();
+ Offset1 = Load1Offset->getZExtValue();
+ return true;
+ }
+
+ // MUBUF and MTBUF can access the same addresses.
+ if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
+
+ // MUBUF and MTBUF have vaddr at different indices.
+ if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
+ findChainOperand(Load0) != findChainOperand(Load1) ||
+ !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
+ !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
+ return false;
+
+ int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
+ int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
+
+ if (OffIdx0 == -1 || OffIdx1 == -1)
+ return false;
+
+ // getNamedOperandIdx returns the index for MachineInstrs. Since they
+ // include the output in the operand list, but SDNodes don't, we need to
+ // subtract one from the index.
+ --OffIdx0;
+ --OffIdx1;
+
+ SDValue Off0 = Load0->getOperand(OffIdx0);
+ SDValue Off1 = Load1->getOperand(OffIdx1);
+
+ // The offset might be a FrameIndexSDNode.
+ if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
+ return false;
+
+ Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
+ Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
+ return true;
+ }
+
+ return false;
+}
+
+static bool isStride64(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::DS_READ2ST64_B32:
+ case AMDGPU::DS_READ2ST64_B64:
+ case AMDGPU::DS_WRITE2ST64_B32:
+ case AMDGPU::DS_WRITE2ST64_B64:
+ return true; // The ST64 read2 / write2 opcodes.
+ default:
+ return false;
+ }
+}
+
+bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
+ const TargetRegisterInfo *TRI) const {
+ unsigned Opc = LdSt.getOpcode(); // BaseReg / Offset are only written when true is returned.
+
+ if (isDS(LdSt)) {
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ if (OffsetImm) {
+ // Normal, single offset LDS instruction.
+ const MachineOperand *AddrReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::addr);
+
+ BaseReg = AddrReg->getReg();
+ Offset = OffsetImm->getImm();
+ return true;
+ }
+
+ // The 2 offset instructions use offset0 and offset1 instead. We can treat
+ // these as a load with a single offset if the 2 offsets are consecutive. We
+ // will use this for some partially aligned loads.
+ const MachineOperand *Offset0Imm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset0);
+ const MachineOperand *Offset1Imm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset1);
+
+ uint8_t Offset0 = Offset0Imm->getImm(); // Assumes both operands exist (no null check).
+ uint8_t Offset1 = Offset1Imm->getImm();
+
+ if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
+ // Each of these offsets is in element sized units, so we need to convert
+ // to bytes of the individual reads.
+
+ unsigned EltSize;
+ if (LdSt.mayLoad())
+ EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; // Operand 0 holds both elements.
+ else {
+ assert(LdSt.mayStore());
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ EltSize = getOpRegClass(LdSt, Data0Idx)->getSize();
+ }
+
+ if (isStride64(Opc))
+ EltSize *= 64;
+
+ const MachineOperand *AddrReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::addr);
+ BaseReg = AddrReg->getReg();
+ Offset = EltSize * Offset0;
+ return true;
+ }
+
+ return false;
+ }
+
+ if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
+ const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
+ if (SOffset && SOffset->isReg())
+ return false; // A register soffset cannot be folded into Offset.
+
+ const MachineOperand *AddrReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (!AddrReg)
+ return false;
+
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ BaseReg = AddrReg->getReg();
+ Offset = OffsetImm->getImm();
+
+ if (SOffset) // soffset can be an inline immediate.
+ Offset += SOffset->getImm();
+
+ return true;
+ }
+
+ if (isSMRD(LdSt)) {
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ if (!OffsetImm)
+ return false;
+
+ const MachineOperand *SBaseReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+ BaseReg = SBaseReg->getReg();
+ Offset = OffsetImm->getImm();
+ return true;
+ }
+
+ if (isFLAT(LdSt)) {
+ const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ BaseReg = AddrReg->getReg();
+ Offset = 0; // FLAT is always reported with a zero offset here.
+ return true;
+ }
+
+ return false;
+}
+
+bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+ MachineInstr &SecondLdSt,
+ unsigned NumLoads) const {
+ const MachineOperand *FirstDst = nullptr;
+ const MachineOperand *SecondDst = nullptr;
+
+ if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
+ (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
+ } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
+ } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
+ }
+
+ if (!FirstDst || !SecondDst)
+ return false; // Mixed kinds, or no destination operand: do not cluster.
+
+ // Try to limit clustering based on the total number of bytes loaded
+ // rather than the number of instructions. This is done to help reduce
+ // register pressure. The method used is somewhat inexact, though,
+ // because it assumes that all loads in the cluster will load the
+ // same number of bytes as FirstLdSt.
+
+ // The unit of this value is bytes.
+ // FIXME: This needs finer tuning.
+ unsigned LoadClusterThreshold = 16;
+
+ const MachineRegisterInfo &MRI =
+ FirstLdSt.getParent()->getParent()->getRegInfo();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
+
+ return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
+}
+
+void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); // The copy is chosen by DestReg's class.
+
+ if (RC == &AMDGPU::VGPR_32RegClass) {
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (RC == &AMDGPU::SReg_32_XM0RegClass ||
+ RC == &AMDGPU::SReg_32RegClass) {
+ if (SrcReg == AMDGPU::SCC) { // Copy from SCC via S_CSELECT_B32 -1, 0.
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
+ .addImm(-1)
+ .addImm(0);
+ return;
+ }
+
+ assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (RC == &AMDGPU::SReg_64RegClass) {
+ if (DestReg == AMDGPU::VCC) {
+ if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ // FIXME: Hack until VReg_1 removed.
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+
+ return;
+ }
+
+ assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (DestReg == AMDGPU::SCC) { // Copy into SCC via a compare against 0.
+ assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+ return;
+ }
+
+ unsigned EltSize = 4; // Remaining classes are copied one sub-register at a time.
+ unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ if (RI.isSGPRClass(RC)) {
+ if (RC->getSize() > 4) {
+ Opcode = AMDGPU::S_MOV_B64;
+ EltSize = 8;
+ } else {
+ Opcode = AMDGPU::S_MOV_B32;
+ EltSize = 4;
+ }
+ }
+
+ ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
+ bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); // Else walk the parts in reverse.
+
+ for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+ unsigned SubIdx;
+ if (Forward)
+ SubIdx = SubIndices[Idx];
+ else
+ SubIdx = SubIndices[SubIndices.size() - Idx - 1];
+
+ MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+ get(Opcode), RI.getSubReg(DestReg, SubIdx));
+
+ Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
+
+ if (Idx == SubIndices.size() - 1) // The last move carries the kill of SrcReg.
+ Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
+
+ if (Idx == 0) // The first move implicitly defines the whole DestReg.
+ Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
+
+ Builder.addReg(SrcReg, RegState::Implicit);
+ }
+}
+
+/// Return the opcode to use after commuting \p Opcode.
+///
+/// If the opcode has a commuted (REV) or original (non-REV) counterpart, that
+/// counterpart is returned when pseudoToMCOpcode() reports it as available,
+/// and -1 otherwise. Opcodes without any counterpart are returned unchanged.
+int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
+  // Look for a counterpart in either direction: original -> REV first, and
+  // only if that fails REV -> original.
+  int Counterpart = AMDGPU::getCommuteRev(Opcode);
+  if (Counterpart == -1)
+    Counterpart = AMDGPU::getCommuteOrig(Opcode);
+
+  // No mapping at all: the opcode stays as it is.
+  if (Counterpart == -1)
+    return Opcode;
+
+  // A mapping exists; check that the mapped opcode exists on the target.
+  if (pseudoToMCOpcode(Counterpart) == -1)
+    return -1;
+
+  return Counterpart;
+}
+
+/// Pick the move opcode for a destination of class \p DstRC.
+///
+/// 4-byte and 8-byte classes get a scalar or vector move depending on whether
+/// the class is an SGPR class; any other size falls back to a generic COPY.
+unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+  switch (DstRC->getSize()) {
+  case 4:
+    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+  case 8:
+    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B64
+                                 : AMDGPU::V_MOV_B64_PSEUDO;
+  default:
+    return AMDGPU::COPY;
+  }
+}
+
+/// Map an SGPR register size in bytes to the matching SI_SPILL_S*_SAVE pseudo.
+/// Sizes other than 4, 8, 16, 32 and 64 are not expected.
+static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:  return AMDGPU::SI_SPILL_S32_SAVE;
+  case 8:  return AMDGPU::SI_SPILL_S64_SAVE;
+  case 16: return AMDGPU::SI_SPILL_S128_SAVE;
+  case 32: return AMDGPU::SI_SPILL_S256_SAVE;
+  case 64: return AMDGPU::SI_SPILL_S512_SAVE;
+  default: break;
+  }
+  llvm_unreachable("unknown register size");
+}
+
+/// Map a VGPR register size in bytes to the matching SI_SPILL_V*_SAVE pseudo.
+/// Sizes other than 4, 8, 12, 16, 32 and 64 are not expected.
+static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:  return AMDGPU::SI_SPILL_V32_SAVE;
+  case 8:  return AMDGPU::SI_SPILL_V64_SAVE;
+  case 12: return AMDGPU::SI_SPILL_V96_SAVE;
+  case 16: return AMDGPU::SI_SPILL_V128_SAVE;
+  case 32: return AMDGPU::SI_SPILL_V256_SAVE;
+  case 64: return AMDGPU::SI_SPILL_V512_SAVE;
+  default: break;
+  }
+  llvm_unreachable("unknown register size");
+}
+/// Spill \p SrcReg (register class \p RC) to stack slot \p FrameIndex, inserting the spill before \p MI.
+void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const { // TRI is not used in this body.
+ MachineFunction *MF = MBB.getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ // Describe the access as a store to the fixed stack object, using the object's own size and alignment.
+ unsigned Size = FrameInfo.getObjectSize(FrameIndex);
+ unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ Size, Align);
+ // SGPR case: emit the SI_SPILL_S*_SAVE pseudo selected by the register class size.
+ if (RI.isSGPRClass(RC)) {
+ MFI->setHasSpilledSGPRs();
+
+ // We are only allowed to create one new instruction when spilling
+ // registers, so we need to use pseudo instruction for spilling SGPRs.
+ const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize()));
+
+ // The SGPR spill/restore instructions only work on number sgprs, so we need
+ // to make sure we are using the correct register class.
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
+ MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
+ .addReg(SrcReg, getKillRegState(isKill)) // data
+ .addFrameIndex(FrameIndex) // addr
+ .addMemOperand(MMO)
+ .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+ .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+ // Add the scratch resource registers as implicit uses because we may end up
+ // needing them, and need to ensure that the reserved registers are
+ // correctly handled.
+
+ if (ST.hasScalarStores()) {
+ // m0 is used for offset to scalar stores if used to spill.
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ }
+
+ return;
+ }
+ // Non-SGPR case: if VGPR spilling is disabled, report an error and emit a KILL of SrcReg instead of a store.
+ if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
+ " spill register");
+ BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
+ .addReg(SrcReg);
+
+ return;
+ }
+
+ assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
+ // VGPR case: emit the SI_SPILL_V*_SAVE pseudo with the scratch resource and wave-offset registers as operands.
+ unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
+ MFI->setHasSpilledVGPRs();
+ BuildMI(MBB, MI, DL, get(Opcode))
+ .addReg(SrcReg, getKillRegState(isKill)) // data
+ .addFrameIndex(FrameIndex) // addr
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
+ .addMemOperand(MMO);
+}
+
+/// Map an SGPR register size in bytes to the matching SI_SPILL_S*_RESTORE
+/// pseudo. Sizes other than 4, 8, 16, 32 and 64 are not expected.
+static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:  return AMDGPU::SI_SPILL_S32_RESTORE;
+  case 8:  return AMDGPU::SI_SPILL_S64_RESTORE;
+  case 16: return AMDGPU::SI_SPILL_S128_RESTORE;
+  case 32: return AMDGPU::SI_SPILL_S256_RESTORE;
+  case 64: return AMDGPU::SI_SPILL_S512_RESTORE;
+  default: break;
+  }
+  llvm_unreachable("unknown register size");
+}
+
+/// Map a VGPR register size in bytes to the matching SI_SPILL_V*_RESTORE
+/// pseudo. Sizes other than 4, 8, 12, 16, 32 and 64 are not expected.
+static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:  return AMDGPU::SI_SPILL_V32_RESTORE;
+  case 8:  return AMDGPU::SI_SPILL_V64_RESTORE;
+  case 12: return AMDGPU::SI_SPILL_V96_RESTORE;
+  case 16: return AMDGPU::SI_SPILL_V128_RESTORE;
+  case 32: return AMDGPU::SI_SPILL_V256_RESTORE;
+  case 64: return AMDGPU::SI_SPILL_V512_RESTORE;
+  default: break;
+  }
+  llvm_unreachable("unknown register size");
+}
+/// Reload \p DestReg (register class \p RC) from stack slot \p FrameIndex, inserting the restore before \p MI.
+void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const { // TRI is not used in this body.
+ MachineFunction *MF = MBB.getParent();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
+ unsigned Size = FrameInfo.getObjectSize(FrameIndex);
+ // Describe the access as a load from the fixed stack object, using the object's own size and alignment.
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+ // SGPR case: emit the SI_SPILL_S*_RESTORE pseudo selected by the register class size.
+ if (RI.isSGPRClass(RC)) {
+ // FIXME: Maybe this should not include a memoperand because it will be
+ // lowered to non-memory instructions.
+ const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize()));
+ if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
+ MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
+ .addFrameIndex(FrameIndex) // addr
+ .addMemOperand(MMO)
+ .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+ .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+
+ if (ST.hasScalarStores()) {
+ // m0 is used for offset to scalar stores if used to spill.
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ }
+
+ return;
+ }
+ // Non-SGPR case: if VGPR spilling is disabled, report an error and define DestReg with IMPLICIT_DEF instead of a load.
+ if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
+ " restore register");
+ BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
+
+ return;
+ }
+
+ assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
+ // VGPR case: emit the SI_SPILL_V*_RESTORE pseudo with the scratch resource and wave-offset registers as operands.
+ unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
+ BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+ .addFrameIndex(FrameIndex) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
+ .addMemOperand(MMO);
+}
+/// Emit code before \p MI that writes an LDS spill address into \p TmpReg; returns \p TmpReg, or AMDGPU::NoRegister if no unused VGPR is found for the thread-id value.
+/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
+unsigned SIInstrInfo::calculateLDSSpillAddress(
+ MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
+ unsigned FrameOffset, unsigned Size) const { // Size is not used in this body.
+ MachineFunction *MF = MBB.getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
+ unsigned WavefrontSize = ST.getWavefrontSize();
+ // If the thread-id register has not been calculated yet, compute it at the top of the entry block and record it with MFI->setTIDReg().
+ unsigned TIDReg = MFI->getTIDReg();
+ if (!MFI->hasCalculatedTID()) {
+ MachineBasicBlock &Entry = MBB.getParent()->front();
+ MachineBasicBlock::iterator Insert = Entry.front();
+ DebugLoc DL = Insert->getDebugLoc(); // Shadows the outer DL for the entry-block code.
+
+ TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
+ *MF);
+ if (TIDReg == AMDGPU::NoRegister)
+ return TIDReg;
+
+ if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
+ WorkGroupSize > WavefrontSize) {
+ // NOTE(review): these locals are named TIDIG* but are fetched as WORKGROUP_ID_* preloaded values - confirm.
+ unsigned TIDIGXReg
+ = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
+ unsigned TIDIGYReg
+ = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
+ unsigned TIDIGZReg
+ = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
+ unsigned InputPtrReg =
+ TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
+ if (!Entry.isLiveIn(Reg))
+ Entry.addLiveIn(Reg);
+ }
+
+ RS->enterBasicBlock(Entry);
+ // FIXME: Can we scavenge an SReg_64 and access the subregs?
+ unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
+ .addReg(InputPtrReg)
+ .addImm(SI::KernelInputOffsets::NGROUPS_Z);
+ BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
+ .addReg(InputPtrReg)
+ .addImm(SI::KernelInputOffsets::NGROUPS_Y);
+
+ // STmp1 = NGROUPS.Y * NGROUPS.Z
+ BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
+ .addReg(STmp1)
+ .addReg(STmp0);
+ // (NGROUPS.Y * NGROUPS.Z) * TIDIG.X
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
+ .addReg(STmp1)
+ .addReg(TIDIGXReg);
+ // NGROUPS.Z * TIDIG.Y + (NGROUPS.Y * NGROUPS.Z * TIDIG.X)
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
+ .addReg(STmp0)
+ .addReg(TIDIGYReg)
+ .addReg(TIDReg);
+ // (NGROUPS.Z * TIDIG.Y + (NGROUPS.Y * NGROUPS.Z * TIDIG.X)) + TIDIG.Z
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
+ .addReg(TIDReg)
+ .addReg(TIDIGZReg);
+ } else {
+ // Get the wave id
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+ TIDReg)
+ .addImm(-1)
+ .addImm(0);
+
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
+ TIDReg)
+ .addImm(-1)
+ .addReg(TIDReg);
+ }
+ // Shift the id left by 2 (multiply by 4) - presumably turning a dword index into a byte offset; TODO confirm.
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
+ TIDReg)
+ .addImm(2)
+ .addReg(TIDReg);
+ MFI->setTIDReg(TIDReg);
+ }
+
+ // TmpReg = TIDReg + (LDS size + FrameOffset * WorkGroupSize)
+ unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
+ .addImm(LDSOffset)
+ .addReg(TIDReg);
+
+ return TmpReg;
+}
+
+/// Insert S_NOP instructions before \p MI until \p Count wait states are
+/// covered. Each S_NOP covers at most 8 wait states; its immediate is the
+/// number of wait states it covers minus one (see getNumWaitStates()).
+/// Nothing is inserted when \p Count is not positive.
+void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   int Count) const {
+  DebugLoc Loc = MBB.findDebugLoc(MI);
+
+  for (int Remaining = Count; Remaining > 0; Remaining -= 8) {
+    // A full chunk uses the maximum immediate 7; the final partial chunk
+    // uses whatever is left, minus one.
+    const int NopImm = (Remaining >= 8) ? 7 : Remaining - 1;
+    BuildMI(MBB, MI, Loc, get(AMDGPU::S_NOP)).addImm(NopImm);
+  }
+}
+
+/// Insert a no-op before \p MI, expressed as exactly one wait state.
+void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MI) const {
+  const int SingleWaitState = 1;
+  insertWaitStates(MBB, MI, SingleWaitState);
+}
+
+/// Return the number of wait states attributed to \p MI: the immediate plus
+/// one for S_NOP, and 1 for every other instruction.
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::S_NOP)
+    return MI.getOperand(0).getImm() + 1;
+
+  // FIXME: Do wait states equal cycles?
+  return 1;
+}
+/// Expand the pseudo instructions handled below in place; any other opcode is forwarded to AMDGPUInstrInfo::expandPostRAPseudo().
+bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ switch (MI.getOpcode()) {
+ default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+ case AMDGPU::S_MOV_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
+ case AMDGPU::S_XOR_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_XOR_B64));
+ break;
+ }
+ case AMDGPU::S_ANDN2_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_ANDN2_B64));
+ break;
+ }
+ case AMDGPU::V_MOV_B64_PSEUDO: {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+ unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+ // The 64-bit move becomes two V_MOV_B32 on the sub0/sub1 halves, each implicitly defining the full Dst.
+ const MachineOperand &SrcOp = MI.getOperand(1);
+ // FIXME: Will this work for 64-bit floating point immediates?
+ assert(!SrcOp.isFPImm());
+ if (SrcOp.isImm()) {
+ APInt Imm(64, SrcOp.getImm());
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addImm(Imm.getLoBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addImm(Imm.getHiBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ } else {
+ assert(SrcOp.isReg());
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ }
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::V_MOVRELD_B32_V1:
+ case AMDGPU::V_MOVRELD_B32_V2:
+ case AMDGPU::V_MOVRELD_B32_V4:
+ case AMDGPU::V_MOVRELD_B32_V8:
+ case AMDGPU::V_MOVRELD_B32_V16: {
+ const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
+ unsigned VecReg = MI.getOperand(0).getReg();
+ bool IsUndef = MI.getOperand(1).isUndef();
+ unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
+ assert(VecReg == MI.getOperand(1).getReg());
+ // Emit V_MOVRELD_B32_e32 on the selected sub-register; the whole vector is appended as an implicit def plus implicit use.
+ MachineInstr *MovRel =
+ BuildMI(MBB, MI, DL, MovRelDesc)
+ .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+ .addOperand(MI.getOperand(2))
+ .addReg(VecReg, RegState::ImplicitDefine)
+ .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+ // The two appended VecReg operands sit after the descriptor's explicit operands and implicit uses; tie them together.
+ const int ImpDefIdx =
+ MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
+ const int ImpUseIdx = ImpDefIdx + 1;
+ MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
+
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::SI_PC_ADD_REL_OFFSET: {
+ MachineFunction &MF = *MBB.getParent();
+ unsigned Reg = MI.getOperand(0).getReg();
+ unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
+ unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
+
+ // Create a bundle so these instructions won't be re-ordered by the
+ // post-RA scheduler.
+ MIBundleBuilder Bundler(MBB, MI);
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
+
+ // Add 32-bit offset from this instruction to the start of the
+ // constant data.
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
+ .addReg(RegLo)
+ .addOperand(MI.getOperand(1)));
+ // High half uses S_ADDC_U32 with operand 2, or a literal 0 when operand 2 carries no target flags (MO_NONE).
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
+ .addReg(RegHi);
+ if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
+ MIB.addImm(0);
+ else
+ MIB.addOperand(MI.getOperand(2));
+
+ Bundler.append(MIB);
+ llvm::finalizeBundle(MBB, Bundler.begin());
+
+ MI.eraseFromParent();
+ break;
+ }
+ }
+ return true; // Reached by every case above that breaks out of the switch.
+}
+
+/// Exchange the immediate values of the two modifier operands named by
+/// \p Src0OpName and \p Src1OpName on \p MI.
+///
+/// \returns false (leaving \p MI untouched) if the instruction has no operand
+/// named \p Src0OpName, true after the values have been swapped.
+bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
+                                      MachineOperand &Src0,
+                                      unsigned Src0OpName,
+                                      MachineOperand &Src1,
+                                      unsigned Src1OpName) const {
+  MachineOperand *FirstMods = getNamedOperand(MI, Src0OpName);
+  if (FirstMods == nullptr)
+    return false;
+
+  MachineOperand *SecondMods = getNamedOperand(MI, Src1OpName);
+  assert(SecondMods &&
+         "All commutable instructions have both src0 and src1 modifiers");
+
+  // Read both values before writing either one back.
+  const int FirstVal = FirstMods->getImm();
+  const int SecondVal = SecondMods->getImm();
+  FirstMods->setImm(SecondVal);
+  SecondMods->setImm(FirstVal);
+  return true;
+}
+
+/// Swap a register operand with an immediate or frame-index operand of the
+/// same instruction.
+///
+/// \p RegOp takes over the value of \p NonRegOp, and \p NonRegOp becomes a
+/// register use carrying the register, sub-register and kill/dead/undef/debug
+/// flags that \p RegOp had.
+///
+/// \returns \p MI on success, nullptr (nothing modified) if \p NonRegOp is
+/// neither an immediate nor a frame index.
+static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
+                                             MachineOperand &RegOp,
+                                             MachineOperand &NonRegOp) {
+  // Snapshot the register operand's state before it is overwritten.
+  const unsigned SavedReg = RegOp.getReg();
+  const unsigned SavedSubReg = RegOp.getSubReg();
+  const bool WasKill = RegOp.isKill();
+  const bool WasDead = RegOp.isDead();
+  const bool WasUndef = RegOp.isUndef();
+  const bool WasDebug = RegOp.isDebug();
+
+  const bool IsImm = NonRegOp.isImm();
+  if (!IsImm && !NonRegOp.isFI())
+    return nullptr;
+
+  if (IsImm)
+    RegOp.ChangeToImmediate(NonRegOp.getImm());
+  else
+    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
+
+  NonRegOp.ChangeToRegister(SavedReg, /*isDef=*/false, /*isImp=*/false,
+                            WasKill, WasDead, WasUndef, WasDebug);
+  NonRegOp.setSubReg(SavedSubReg);
+
+  return &MI;
+}
+
+/// Commute the operands at \p Src0Idx and \p Src1Idx of \p MI in place.
+///
+/// The instruction is switched to the opcode reported by commuteOpcode(), and
+/// the src0/src1 modifier operands are swapped alongside the sources.
+///
+/// \param NewMI  Must be false; creating a new instruction is not supported.
+/// \returns the commuted instruction, or nullptr if there is no commuted
+/// opcode, both operands are non-registers, or the swap would be illegal.
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                                  unsigned Src0Idx,
+                                                  unsigned Src1Idx) const {
+  assert(!NewMI && "this should never be used");
+
+  const unsigned Opc = MI.getOpcode();
+  const int CommutedOpcode = commuteOpcode(Opc);
+  if (CommutedOpcode == -1)
+    return nullptr;
+
+  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
+           static_cast<int>(Src0Idx) &&
+         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
+           static_cast<int>(Src1Idx) &&
+         "inconsistency with findCommutedOpIndices");
+
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
+  MachineOperand &Src1 = MI.getOperand(Src1Idx);
+  const bool Src0IsReg = Src0.isReg();
+  const bool Src1IsReg = Src1.isReg();
+
+  // FIXME: Found two non registers to commute. This does happen.
+  if (!Src0IsReg && !Src1IsReg)
+    return nullptr;
+
+  MachineInstr *Result = nullptr;
+  if (Src0IsReg && Src1IsReg) {
+    // Register/register: defer to the generic implementation, provided the
+    // current src0 is a legal src1.
+    if (isOperandLegal(MI, Src1Idx, &Src0))
+      Result =
+        TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
+  } else if (Src0IsReg) {
+    // src0 should always be able to support any operand type, so no need to
+    // check operand legality.
+    Result = swapRegAndNonRegOperand(MI, Src0, Src1);
+  } else if (isOperandLegal(MI, Src1Idx, &Src0)) {
+    Result = swapRegAndNonRegOperand(MI, Src1, Src0);
+  }
+
+  if (!Result)
+    return nullptr;
+
+  // Be sure to move the source modifiers along with the operands.
+  swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
+                      Src1, AMDGPU::OpName::src1_modifiers);
+  Result->setDesc(get(CommutedOpcode));
+  return Result;
+}
+
+// The base TargetInstrInfo::commuteInstruction relies on this hook. It has to
+// be provided here because source modifier operands may be inserted between
+// the operands that are actually commutable, so the commutable pair is looked
+// up by operand name (src0 / src1).
+//
+// Returns false if MI is not commutable or lacks a src0 or src1 operand;
+// otherwise returns the result of fixCommutedOpIndices().
+bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
+                                        unsigned &SrcOpIdx1) const {
+  if (!MI.isCommutable())
+    return false;
+
+  const unsigned Opc = MI.getOpcode();
+
+  const int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  if (Src0Idx == -1)
+    return false;
+
+  const int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+  return Src1Idx != -1 &&
+         fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
+}
+
+/// Check whether the byte offset \p BrOffset can be encoded by a branch
+/// instruction, i.e. fits in a signed field of BranchOffsetBits bits once
+/// converted to the instruction's units.
+bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+                                        int64_t BrOffset) const {
+  // BranchRelaxation should never have to check s_setpc_b64 because its dest
+  // block is unanalyzable.
+  assert(BranchOp != AMDGPU::S_SETPC_B64);
+
+  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the encoded
+  // value counts dwords and is relative to the next instruction: convert the
+  // byte offset to dwords, then subtract one.
+  const int64_t DwordOffset = BrOffset / 4;
+  return isIntN(BranchOffsetBits, DwordOffset - 1);
+}
+
+/// Return the destination block of branch \p MI (its operand 0), or nullptr
+/// for S_SETPC_B64.
+MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
+  const MachineInstr &MI) const {
+  // This would be a difficult analysis to perform for s_setpc_b64, but it can
+  // always be legal so there's no need to analyze it.
+  const bool IsIndirect = MI.getOpcode() == AMDGPU::S_SETPC_B64;
+  return IsIndirect ? nullptr : MI.getOperand(0).getMBB();
+}
+/// Fill the empty block \p MBB with a PC-relative long branch to \p DestBB (s_getpc_b64, 64-bit add/sub, s_setpc_b64) and return the byte size reported for it.
+unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &DestBB,
+ const DebugLoc &DL,
+ int64_t BrOffset,
+ RegScavenger *RS) const {
+ assert(RS && "RegScavenger required for long branching");
+ assert(MBB.empty() &&
+ "new block should be inserted for expanding unconditional branch");
+ assert(MBB.pred_size() == 1);
+
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // FIXME: Virtual register workaround for RegScavenger not working with empty
+ // blocks.
+ unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ auto I = MBB.end();
+
+ // We need to compute the offset relative to the instruction immediately after
+ // s_getpc_b64. Insert pc arithmetic code before last terminator.
+ MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
+ // The sign of BrOffset selects between adding (forward) and subtracting (backward) the block operand on sub0/sub1.
+ // TODO: Handle > 32-bit block address.
+ if (BrOffset >= 0) {
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+ .addReg(PCReg, 0, AMDGPU::sub0)
+ .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+ .addReg(PCReg, 0, AMDGPU::sub1)
+ .addImm(0);
+ } else {
+ // Backwards branch.
+ BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+ .addReg(PCReg, 0, AMDGPU::sub0)
+ .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+ .addReg(PCReg, 0, AMDGPU::sub1)
+ .addImm(0);
+ }
+
+ // Insert the indirect branch after the other terminator.
+ BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
+ .addReg(PCReg);
+
+ // FIXME: If spilling is necessary, this will fail because this scavenger has
+ // no emergency stack slots. It is non-trivial to spill in this situation,
+ // because the restore code needs to be specially placed after the
+ // jump. BranchRelaxation then needs to be made aware of the newly inserted
+ // block.
+ //
+ // If a spill is needed for the pc register pair, we need to insert a spill
+ // restore block right before the destination block, and insert a short branch
+ // into the old destination block's fallthrough predecessor.
+ // e.g.:
+ //
+ // s_cbranch_scc0 skip_long_branch:
+ //
+ // long_branch_bb:
+ // spill s[8:9]
+ // s_getpc_b64 s[8:9]
+ // s_add_u32 s8, s8, restore_bb
+ // s_addc_u32 s9, s9, 0
+ // s_setpc_b64 s[8:9]
+ //
+ // skip_long_branch:
+ // foo;
+ //
+ // .....
+ //
+ // dest_bb_fallthrough_predecessor:
+ // bar;
+ // s_branch dest_bb
+ //
+ // restore_bb:
+ // restore s[8:9]
+ // fallthrough dest_bb
+ //
+ // dest_bb:
+ // buzz;
+ // Replace the temporary virtual PC register with a scavenged physical SReg_64 and drop all virtual registers.
+ RS->enterBasicBlockEnd(MBB);
+ unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
+ MachineBasicBlock::iterator(GetPC), 0);
+ MRI.replaceRegWith(PCReg, Scav);
+ MRI.clearVirtRegs();
+ RS->setRegUsed(Scav);
+
+ return 4 + 8 + 4 + 4; // 20 bytes - presumably the encoded sizes of the four instructions above; TODO confirm.
+}
+
+/// Map a branch predicate to the matching S_CBRANCH_* opcode. Any predicate
+/// not listed below is not expected.
+unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
+  switch (Cond) {
+  case SIInstrInfo::SCC_TRUE:  return AMDGPU::S_CBRANCH_SCC1;
+  case SIInstrInfo::SCC_FALSE: return AMDGPU::S_CBRANCH_SCC0;
+  case SIInstrInfo::VCCNZ:     return AMDGPU::S_CBRANCH_VCCNZ;
+  case SIInstrInfo::VCCZ:      return AMDGPU::S_CBRANCH_VCCZ;
+  case SIInstrInfo::EXECNZ:    return AMDGPU::S_CBRANCH_EXECNZ;
+  case SIInstrInfo::EXECZ:     return AMDGPU::S_CBRANCH_EXECZ;
+  default:
+    llvm_unreachable("invalid branch predicate");
+  }
+}
+
+/// Map an S_CBRANCH_* opcode back to its branch predicate. Every other opcode
+/// yields INVALID_BR.
+SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_CBRANCH_SCC0:   return SCC_FALSE;
+  case AMDGPU::S_CBRANCH_SCC1:   return SCC_TRUE;
+  case AMDGPU::S_CBRANCH_VCCNZ:  return VCCNZ;
+  case AMDGPU::S_CBRANCH_VCCZ:   return VCCZ;
+  case AMDGPU::S_CBRANCH_EXECNZ: return EXECNZ;
+  case AMDGPU::S_CBRANCH_EXECZ:  return EXECZ;
+  default:                       return INVALID_BR;
+  }
+}
+
+/// Analyze the branch sequence starting at \p I in \p MBB.
+///
+/// Recognized shapes are: a lone S_BRANCH, a conditional branch that ends the
+/// block, and a conditional branch followed directly by an S_BRANCH. For the
+/// conditional shapes \p Cond receives the predicate as an immediate followed
+/// by the branch's operand 1.
+///
+/// \returns false when the sequence was recognized (\p TBB and, where
+/// applicable, \p FBB are set), true otherwise. Note that \p Cond may already
+/// have been appended to when true is returned.
+bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
+  const unsigned FirstOpc = I->getOpcode();
+
+  // Unconditional branch.
+  if (FirstOpc == AMDGPU::S_BRANCH) {
+    TBB = I->getOperand(0).getMBB();
+    return false;
+  }
+
+  const BranchPredicate Pred = getBranchPredicate(FirstOpc);
+  if (Pred == INVALID_BR)
+    return true;
+
+  MachineBasicBlock *TakenBB = I->getOperand(0).getMBB();
+  Cond.push_back(MachineOperand::CreateImm(Pred));
+  Cond.push_back(I->getOperand(1)); // Save the branch register.
+
+  ++I;
+  const bool FallsThrough = (I == MBB.end());
+
+  // Anything other than fall-through or an unconditional branch after the
+  // conditional branch is not understood.
+  if (!FallsThrough && I->getOpcode() != AMDGPU::S_BRANCH)
+    return true;
+
+  TBB = TakenBB;
+  if (!FallsThrough)
+    FBB = I->getOperand(0).getMBB();
+  return false;
+}
+/// Analyze the terminators of \p MBB, stepping over a leading SI_MASK_BRANCH; returns false when there are no terminators or the branches were recognized, true otherwise.
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+ if (I == MBB.end())
+ return false;
+ // Without a leading SI_MASK_BRANCH the result of analyzeBranchImpl is returned directly.
+ if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
+ return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
+
+ ++I;
+
+ // TODO: Should be able to treat as fallthrough?
+ if (I == MBB.end())
+ return true;
+
+ if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
+ return true;
+ // NOTE(review): I was advanced past SI_MASK_BRANCH above, so this reads operand 0 of the following branch, not of the mask branch - confirm intended.
+ MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
+
+ // Specifically handle the case where the conditional branch is to the same
+ // destination as the mask branch. e.g.
+ //
+ // si_mask_branch BB8
+ // s_cbranch_execz BB8
+ // s_cbranch BB9
+ //
+ // This is required to understand divergent loops which may need the branches
+ // to be relaxed.
+ if (TBB != MaskBrDest || Cond.empty())
+ return true;
+ // With a mask branch present, only EXECZ / EXECNZ conditional branches are reported as analyzable.
+ auto Pred = Cond[0].getImm();
+ return (Pred != EXECZ && Pred != EXECNZ);
+}
+
+/// Erase every instruction from the first terminator of \p MBB to the end of
+/// the block, except SI_MASK_BRANCH instructions, which are left in place.
+///
+/// \param BytesRemoved  If non-null, receives the summed getInstSizeInBytes()
+///                      of the erased instructions.
+/// \returns the number of instructions erased.
+unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                   int *BytesRemoved) const {
+  unsigned NumErased = 0;
+  unsigned ErasedBytes = 0;
+
+  MachineBasicBlock::iterator It = MBB.getFirstTerminator();
+  while (It != MBB.end()) {
+    // Step past the current instruction first so that erasing it does not
+    // disturb the traversal.
+    MachineInstr &Term = *It;
+    ++It;
+
+    if (Term.getOpcode() == AMDGPU::SI_MASK_BRANCH)
+      continue;
+
+    ErasedBytes += getInstSizeInBytes(Term);
+    Term.eraseFromParent();
+    ++NumErased;
+  }
+
+  if (BytesRemoved)
+    *BytesRemoved = ErasedBytes;
+
+  return NumErased;
+}
+
+/// Append branch instructions to the end of \p MBB.
+///
+/// - No \p FBB and empty \p Cond: a single S_BRANCH to \p TBB.
+/// - Otherwise: a conditional branch to \p TBB whose opcode is derived from
+///   the predicate in Cond[0], followed by an S_BRANCH to \p FBB when \p FBB
+///   is given. The undef/kill flags of Cond[1] are copied onto operand 1 of
+///   the conditional branch.
+///
+/// \param BytesAdded  If non-null, receives 4 bytes per inserted instruction.
+/// \returns the number of instructions inserted (1 or 2).
+unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *TBB,
+                                   MachineBasicBlock *FBB,
+                                   ArrayRef<MachineOperand> Cond,
+                                   const DebugLoc &DL,
+                                   int *BytesAdded) const {
+
+  // Unconditional branch.
+  if (!FBB && Cond.empty()) {
+    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+      .addMBB(TBB);
+    if (BytesAdded)
+      *BytesAdded = 4;
+    return 1;
+  }
+
+  assert(TBB && Cond[0].isImm());
+
+  const unsigned Opcode
+    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
+
+  // The conditional branch is emitted the same way whether or not a false
+  // destination follows it.
+  MachineInstr *CondBr =
+    BuildMI(&MBB, DL, get(Opcode))
+      .addMBB(TBB);
+
+  if (FBB) {
+    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+      .addMBB(FBB);
+  }
+
+  // Copy the flags onto the implicit condition register operand.
+  MachineOperand &CondReg = CondBr->getOperand(1);
+  CondReg.setIsUndef(Cond[1].isUndef());
+  CondReg.setIsKill(Cond[1].isKill());
+
+  const unsigned NumInserted = FBB ? 2 : 1;
+  if (BytesAdded)
+    *BytesAdded = 4 * NumInserted;
+
+  return NumInserted;
+}
+
+/// Reverse the branch condition held in \p Cond by negating the predicate
+/// immediate stored in Cond[0] (this presumably relies on opposite
+/// predicates being encoded as negated values; verify against the
+/// BranchPredicate enum). Always returns false.
+bool SIInstrInfo::reverseBranchCondition(
+  SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2);
+
+  MachineOperand &PredOp = Cond[0];
+  PredOp.setImm(-PredOp.getImm());
+  return false;
+}
+
+/// Remove the src0/src1/src2 modifier operands from \p MI.
+///
+/// The operand indices are looked up from the opcode rather than from the
+/// live operand list, and removal runs src2, src1, src0 (presumably highest
+/// index first, so that earlier removals do not shift the indices still to
+/// be removed).
+static void removeModOperands(MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+
+  MI.RemoveOperand(
+    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
+  MI.RemoveOperand(
+    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers));
+  MI.RemoveOperand(
+    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers));
+}
+
+bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ unsigned Reg, MachineRegisterInfo *MRI) const {
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return false;
+
+ unsigned Opc = UseMI.getOpcode();
+ if (Opc == AMDGPU::COPY) {
+ bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
+ switch (DefMI.getOpcode()) {
+ default:
+ return false;
+ case AMDGPU::S_MOV_B64:
+ // TODO: We could fold 64-bit immediates, but this get compilicated
+ // when there are sub-registers.
+ return false;
+
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::S_MOV_B32:
+ break;
+ }
+ unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
+ assert(ImmOp);
+ // FIXME: We could handle FrameIndex values here.
+ if (!ImmOp->isImm()) {
+ return false;
+ }
+ UseMI.setDesc(get(NewOpc));
+ UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
+ UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
+ return true;
+ }
+
+ if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
+ bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
+
+ // Don't fold if we are using source modifiers. The new VOP2 instructions
+ // don't have them.
+ if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
+ hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
+ return false;
+ }
+
+ const MachineOperand &ImmOp = DefMI.getOperand(1);
+
+ // If this is a free constant, there's no reason to do this.
+ // TODO: We could fold this here instead of letting SIFoldOperands do it
+ // later.
+ MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
+
+ // Any src operand can be used for the legality check.
+ if (isInlineConstant(UseMI, *Src0, ImmOp))
+ return false;
+
+ MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
+ MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
+
+ // Multiplied part is the constant: Use v_madmk_{f16, f32}.
+ // We should only expect these to be on src0 due to canonicalizations.
+ if (Src0->isReg() && Src0->getReg() == Reg) {
+ if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
+ return false;
+
+ if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
+ return false;
+
+ // We need to swap operands 0 and 1 since madmk constant is at operand 1.
+
+ const int64_t Imm = DefMI.getOperand(1).getImm();
+
+ // FIXME: This would be a lot easier if we could return a new instruction
+ // instead of having to modify in place.
+
+ // Remove these first since they are at the end.
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
+
+ unsigned Src1Reg = Src1->getReg();
+ unsigned Src1SubReg = Src1->getSubReg();
+ Src0->setReg(Src1Reg);
+ Src0->setSubReg(Src1SubReg);
+ Src0->setIsKill(Src1->isKill());
+
+ if (Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_MAC_F16_e64)
+ UseMI.untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+
+ Src1->ChangeToImmediate(Imm);
+
+ removeModOperands(UseMI);
+ UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
+
+ bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+ if (DeleteDef)
+ DefMI.eraseFromParent();
+
+ return true;
+ }
+
+ // Added part is the constant: Use v_madak_{f16, f32}.
+ if (Src2->isReg() && Src2->getReg() == Reg) {
+ // Not allowed to use constant bus for another operand.
+ // We can however allow an inline immediate as src0.
+ if (!Src0->isImm() &&
+ (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
+ return false;
+
+ if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
+ return false;
+
+ const int64_t Imm = DefMI.getOperand(1).getImm();
+
+ // FIXME: This would be a lot easier if we could return a new instruction
+ // instead of having to modify in place.
+
+ // Remove these first since they are at the end.
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
+
+ if (Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_MAC_F16_e64)
+ UseMI.untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+
+ // ChangingToImmediate adds Src2 back to the instruction.
+ Src2->ChangeToImmediate(Imm);
+
+ // These come before src2.
+ removeModOperands(UseMI);
+ UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
+
+ bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+ if (DeleteDef)
+ DefMI.eraseFromParent();
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// Reports whether two (width, offset) ranges are guaranteed not to overlap.
+///
+/// The range with the lower offset must end at or before the start of the
+/// other range. When both offsets are equal, range A is treated as the lower
+/// one, so the result is true only if WidthA is not positive.
+static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
+                                int WidthB, int OffsetB) {
+  if (OffsetA <= OffsetB)
+    return OffsetA + WidthA <= OffsetB;
+
+  return OffsetB + WidthB <= OffsetA;
+}
+
+/// Returns true when \p MIa and \p MIb address memory through the same base
+/// register and their (offset, size) ranges do not overlap.
+///
+/// Returns false whenever that cannot be established: a base register or
+/// immediate offset is not available for either instruction, either one does
+/// not have exactly one memory operand, or the base registers differ.
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
+                                               MachineInstr &MIb) const {
+  unsigned BaseA, BaseB;
+  int64_t OffA, OffB;
+
+  if (!getMemOpBaseRegImmOfs(MIa, BaseA, OffA, &RI) ||
+      !getMemOpBaseRegImmOfs(MIb, BaseB, OffB, &RI))
+    return false;
+
+  // FIXME: Handle ds_read2 / ds_write2.
+  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand())
+    return false;
+
+  // The access widths come from the single memory operand of each instruction.
+  const unsigned WidthA = (*MIa.memoperands_begin())->getSize();
+  const unsigned WidthB = (*MIb.memoperands_begin())->getSize();
+
+  return BaseA == BaseB && offsetsDoNotOverlap(WidthA, OffA, WidthB, OffB);
+}
+
+/// Returns true when \p MIa and \p MIb can cheaply be shown to access
+/// non-overlapping memory; false means "unknown / may overlap".
+///
+/// Both instructions must load or store. Instructions with unmodeled side
+/// effects or ordered memory references are never reported as disjoint.
+/// \p AA is optional: when it is non-null and both instructions carry exactly
+/// one memory operand with an IR value, an alias query is tried first. After
+/// that the decision is based on the instruction encoding class (DS, MUBUF /
+/// MTBUF, SMRD, FLAT); two instructions of the same class are compared by
+/// base register and offset.
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
+                                                  MachineInstr &MIb,
+                                                  AliasAnalysis *AA) const {
+  assert((MIa.mayLoad() || MIa.mayStore()) &&
+         "MIa must load from or modify a memory location");
+  assert((MIb.mayLoad() || MIb.mayStore()) &&
+         "MIb must load from or modify a memory location");
+
+  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
+    return false;
+
+  // XXX - Can we relax this between address spaces?
+  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+    return false;
+
+  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
+    const MachineMemOperand *MMOa = *MIa.memoperands_begin();
+    const MachineMemOperand *MMOb = *MIb.memoperands_begin();
+    if (MMOa->getValue() && MMOb->getValue()) {
+      MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
+      MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
+      if (!AA->alias(LocA, LocB))
+        return true;
+    }
+  }
+
+  // TODO: Should we check the address space from the MachineMemOperand? That
+  // would allow us to distinguish objects we know don't alias based on the
+  // underlying address space, even if it was lowered to a different one,
+  // e.g. private accesses lowered to use MUBUF instructions on a scratch
+  // buffer.
+  if (isDS(MIa)) {
+    if (isDS(MIb))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(MIb);
+  }
+
+  if (isMUBUF(MIa) || isMTBUF(MIa)) {
+    if (isMUBUF(MIb) || isMTBUF(MIb))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(MIb) && !isSMRD(MIb);
+  }
+
+  if (isSMRD(MIa)) {
+    if (isSMRD(MIb))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    // MIa is known to be SMRD here, so the buffer checks must look at MIb.
+    // This keeps the answer symmetric with the MUBUF/MTBUF case above, which
+    // refuses to call a buffer access and an SMRD access disjoint.
+    return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
+  }
+
+  if (isFLAT(MIa)) {
+    if (isFLAT(MIb))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    // A FLAT access is never assumed disjoint from another instruction class.
+    return false;
+  }
+
+  return false;
+}
+
+/// Builds the three-address V_MAD_{F16,F32} equivalent of a V_MAC instruction.
+///
+/// The new instruction is inserted before \p MI in \p MBB with all source
+/// modifiers, clamp and omod set to 0. Returns nullptr when \p MI is not one
+/// of the handled V_MAC opcodes, or when an e32 form has an immediate src0
+/// that is not an inline constant. \p LV is not used by this implementation.
+MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
+                                                 MachineInstr &MI,
+                                                 LiveVariables *LV) const {
+  bool IsF16 = false;
+
+  switch (MI.getOpcode()) {
+  default:
+    return nullptr;
+  case AMDGPU::V_MAC_F16_e64:
+    IsF16 = true;
+    LLVM_FALLTHROUGH;
+  case AMDGPU::V_MAC_F32_e64:
+    break;
+  case AMDGPU::V_MAC_F16_e32:
+    IsF16 = true;
+    LLVM_FALLTHROUGH;
+  case AMDGPU::V_MAC_F32_e32: {
+    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                             AMDGPU::OpName::src0);
+    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
+    // An immediate src0 is only converted when it is an inline constant.
+    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
+      return nullptr;
+    break;
+  }
+  }
+
+  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+
+  return BuildMI(*MBB, MI, MI.getDebugLoc(),
+                 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
+      .addOperand(*Dst)
+      .addImm(0) // Src0 mods
+      .addOperand(*Src0)
+      .addImm(0) // Src1 mods
+      .addOperand(*Src1)
+      .addImm(0) // Src2 mods
+      .addOperand(*Src2)
+      .addImm(0) // clamp
+      .addImm(0); // omod
+}
+
+// Returns true for the S_SET_GPR_IDX_{ON,MODE,OFF} instructions.
+//
+// It's not generally safe to move VALU instructions across these since it will
+// start using the register as a base index rather than directly.
+// XXX - Why isn't hasSideEffects sufficient for these?
+static bool changesVGPRIndexingMode(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  return Opc == AMDGPU::S_SET_GPR_IDX_ON ||
+         Opc == AMDGPU::S_SET_GPR_IDX_MODE ||
+         Opc == AMDGPU::S_SET_GPR_IDX_OFF;
+}
+
+/// Returns true when the scheduler must not move instructions across \p MI.
+///
+/// On top of the target-independent boundaries this covers instructions that
+/// modify EXEC, the S_SETREG_* instructions, and instructions that change the
+/// VGPR indexing mode.
+bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                       const MachineBasicBlock *MBB,
+                                       const MachineFunction &MF) const {
+  // XXX - Do we want the SP check in the base implementation?
+  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
+    return true;
+
+  // Target-independent instructions do not have an implicit-use of EXEC, even
+  // when they operate on VGPRs. Treating EXEC modifications as scheduling
+  // boundaries prevents incorrect movements of such instructions.
+  if (MI.modifiesRegister(AMDGPU::EXEC, &RI))
+    return true;
+
+  const unsigned Opc = MI.getOpcode();
+  if (Opc == AMDGPU::S_SETREG_IMM32_B32 || Opc == AMDGPU::S_SETREG_B32)
+    return true;
+
+  return changesVGPRIndexingMode(MI);
+}
+
+/// Returns true when \p Imm, interpreted at its own bit width, is an inlinable
+/// literal for this subtarget. Only 16-, 32- and 64-bit values are accepted;
+/// any other width is a programming error.
+bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
+  const bool HasInv2Pi = ST.hasInv2PiInlineImm();
+
+  switch (Imm.getBitWidth()) {
+  case 16:
+    return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), HasInv2Pi);
+  case 32:
+    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), HasInv2Pi);
+  case 64:
+    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), HasInv2Pi);
+  default:
+    llvm_unreachable("invalid bitwidth");
+  }
+}
+
+/// Returns true when \p MO is an immediate that can be encoded as an inline
+/// constant for an operand of kind \p OperandType.
+///
+/// Non-immediate operands and operand types below
+/// MCOI::OPERAND_FIRST_TARGET are never inline constants.
+bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
+                                   uint8_t OperandType) const {
+  if (!MO.isImm())
+    return false;
+  if (OperandType < MCOI::OPERAND_FIRST_TARGET)
+    return false;
+
+  // MachineOperand provides no way to tell the true operand size, since it only
+  // records a 64-bit value. We need to know the size to determine if a 32-bit
+  // floating point immediate bit pattern is legal for an integer immediate. It
+  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
+  const int64_t Imm = MO.getImm();
+  const bool HasInv2Pi = ST.hasInv2PiInlineImm();
+
+  switch (operandBitWidth(OperandType)) {
+  case 16: {
+    // Accept values representable as either a signed or unsigned 16-bit
+    // quantity; anything wider cannot be a 16-bit inline constant.
+    if (!isInt<16>(Imm) && !isUInt<16>(Imm))
+      return false;
+
+    const int16_t Trunc = static_cast<int16_t>(Imm);
+    return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi);
+  }
+  case 32: {
+    // The value must survive truncation to 32 bits unchanged.
+    const int32_t Trunc = static_cast<int32_t>(Imm);
+    if (Trunc != Imm)
+      return false;
+
+    return AMDGPU::isInlinableLiteral32(Trunc, HasInv2Pi);
+  }
+  case 64:
+    return AMDGPU::isInlinableLiteral64(Imm, HasInv2Pi);
+  default:
+    llvm_unreachable("invalid bitwidth");
+  }
+}
+
+/// Returns true when \p MO has to be treated like a literal constant:
+/// an immediate that is not an inline constant for \p OpInfo, or a frame
+/// index, basic block, external symbol, global address or MC symbol operand.
+/// Registers are never literal-like; other operand kinds are not expected.
+bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
+                                        const MCOperandInfo &OpInfo) const {
+  const MachineOperand::MachineOperandType Kind = MO.getType();
+
+  switch (Kind) {
+  case MachineOperand::MO_Immediate:
+    return !isInlineConstant(MO, OpInfo);
+
+  case MachineOperand::MO_Register:
+    return false;
+
+  case MachineOperand::MO_FrameIndex:
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_MCSymbol:
+    return true;
+
+  default:
+    llvm_unreachable("unexpected operand type");
+  }
+}
+
+/// Returns true when \p Op0 and \p Op1 are the same kind of operand and name
+/// the same register or hold the same immediate. Only register and immediate
+/// operands are expected once the kinds match.
+static bool compareMachineOp(const MachineOperand &Op0,
+                             const MachineOperand &Op1) {
+  const MachineOperand::MachineOperandType Kind = Op0.getType();
+  if (Kind != Op1.getType())
+    return false;
+
+  if (Kind == MachineOperand::MO_Register)
+    return Op0.getReg() == Op1.getReg();
+
+  if (Kind == MachineOperand::MO_Immediate)
+    return Op0.getImm() == Op1.getImm();
+
+  llvm_unreachable("Didn't expect to be comparing these operand types");
+}
+
+/// Returns true when the immediate-like operand \p MO (immediate, target index
+/// or frame index) may be placed at operand position \p OpNo of \p MI.
+///
+/// Plain immediate operand slots accept anything. Slots without a register
+/// class accept nothing else. Otherwise the answer depends on whether the
+/// operand type can take an inline constant (when \p MO is one) or a literal.
+bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+                                    const MachineOperand &MO) const {
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+
+  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
+  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
+    return true;
+
+  if (OpInfo.RegClass < 0)
+    return false;
+
+  const bool IsInline = MO.isImm() && isInlineConstant(MO, OpInfo);
+  if (IsInline)
+    return RI.opCanUseInlineConstant(OpInfo.OperandType);
+
+  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
+}
+
+/// Returns true when \p Opcode has an e32 counterpart and that counterpart
+/// maps to a real MC opcode.
+bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
+  const int Op32 = AMDGPU::getVOPe32(Opcode);
+  return Op32 != -1 && pseudoToMCOpcode(Op32) != -1;
+}
+
+/// Returns true when \p Opcode has source-modifier operands.
+bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
+  // Every instruction that has modifiers carries a src0_modifiers operand, so
+  // looking for that one operand is sufficient.
+  const int ModIdx =
+      AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0_modifiers);
+  return ModIdx != -1;
+}
+
+/// Returns true when \p MI has the named modifier operand \p OpName and its
+/// immediate value is non-zero.
+bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
+                                  unsigned OpName) const {
+  const MachineOperand *ModOp = getNamedOperand(MI, OpName);
+  if (!ModOp)
+    return false;
+
+  return ModOp->getImm() != 0;
+}
+
+/// Returns true when operand \p MO occupies the constant bus.
+///
+/// That is the case for immediates that are not inline constants for
+/// \p OpInfo, for non-register operands such as frame indexes, and for
+/// register uses that name an SGPR (virtual SGPR-class registers, VCC, M0, and
+/// explicit uses of FLAT_SCR, EXEC or a physical 32/64-bit SGPR).
+bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
+                                  const MachineOperand &MO,
+                                  const MCOperandInfo &OpInfo) const {
+  // Literal constants use the constant bus; inline constants do not.
+  if (MO.isImm())
+    return !isInlineConstant(MO, OpInfo);
+
+  // Misc other operands like FrameIndex.
+  if (!MO.isReg())
+    return true;
+
+  // Register definitions never read through the constant bus.
+  if (!MO.isUse())
+    return false;
+
+  const unsigned Reg = MO.getReg();
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return RI.isSGPRClass(MRI.getRegClass(Reg));
+
+  // VCC and M0 count whether the use is explicit or implicit.
+  if (Reg == AMDGPU::VCC || Reg == AMDGPU::M0)
+    return true;
+
+  // All remaining physical registers only count for explicit uses.
+  if (MO.isImplicit())
+    return false;
+
+  // FLAT_SCR is just an SGPR pair, and EXEC uses the constant bus as well.
+  return Reg == AMDGPU::FLAT_SCR || Reg == AMDGPU::EXEC ||
+         AMDGPU::SGPR_32RegClass.contains(Reg) ||
+         AMDGPU::SGPR_64RegClass.contains(Reg);
+}
+
+/// Returns the first of VCC, M0 or FLAT_SCR that \p MI reads through an
+/// implicit operand, or AMDGPU::NoRegister when it reads none of them.
+static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
+  for (const MachineOperand &ImpOp : MI.implicit_operands()) {
+    // Implicit definitions are not reads.
+    if (ImpOp.isDef())
+      continue;
+
+    const unsigned Reg = ImpOp.getReg();
+    if (Reg == AMDGPU::VCC || Reg == AMDGPU::M0 || Reg == AMDGPU::FLAT_SCR)
+      return Reg;
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+/// Returns true when \p MI is expected to carry an implicit use of EXEC.
+///
+/// VALU instructions are expected to, except the V_READLANE / V_WRITELANE
+/// variants. Generic opcodes, SALU and SMRD instructions are not; every other
+/// instruction is.
+static bool shouldReadExec(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+
+  if (SIInstrInfo::isVALU(MI)) {
+    const bool IsLaneAccess = Opc == AMDGPU::V_READLANE_B32 ||
+                              Opc == AMDGPU::V_READLANE_B32_si ||
+                              Opc == AMDGPU::V_READLANE_B32_vi ||
+                              Opc == AMDGPU::V_WRITELANE_B32 ||
+                              Opc == AMDGPU::V_WRITELANE_B32_si ||
+                              Opc == AMDGPU::V_WRITELANE_B32_vi;
+    return !IsLaneAccess;
+  }
+
+  const bool IsExecIndependent = SIInstrInfo::isGenericOpcode(Opc) ||
+                                 SIInstrInfo::isSALU(MI) ||
+                                 SIInstrInfo::isSMRD(MI);
+  return !IsExecIndependent;
+}
+
+/// Returns true when \p SubReg refers to a part of the register named by
+/// \p SuperVec.
+///
+/// Physical registers are checked through \p TRI. For a virtual register the
+/// operand must carry a sub-register index and name the same register as
+/// \p SuperVec.
+static bool isSubRegOf(const SIRegisterInfo &TRI,
+                       const MachineOperand &SuperVec,
+                       const MachineOperand &SubReg) {
+  const unsigned Sub = SubReg.getReg();
+  const unsigned Super = SuperVec.getReg();
+
+  if (!TargetRegisterInfo::isPhysicalRegister(Sub)) {
+    const bool HasSubIdx = SubReg.getSubReg() != AMDGPU::NoSubRegister;
+    return HasSubIdx && Sub == Super;
+  }
+
+  return TRI.isSubRegister(Super, Sub);
+}
+
+// Checks target-specific invariants of MI.
+//
+// Returns true when every check passes. On the first failed check, ErrInfo is
+// set to a short description of the problem and false is returned. The checks
+// run in this order: explicit operand count, inline asm register classes
+// (inline asm returns right after that), per-operand type and register class,
+// VOP* constant bus usage, v_div_scale source constraints, SOPK immediate
+// range, movrel implicit operands, implicit EXEC reads, and the offset
+// register of scalar stores.
+bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
+ StringRef &ErrInfo) const {
+ uint16_t Opcode = MI.getOpcode();
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ // Make sure the number of operands is correct.
+ const MCInstrDesc &Desc = get(Opcode);
+ if (!Desc.isVariadic() &&
+ Desc.getNumOperands() != MI.getNumExplicitOperands()) {
+ ErrInfo = "Instruction has wrong number of operands.";
+ return false;
+ }
+
+ if (MI.isInlineAsm()) {
+ // Verify register classes for inlineasm constraints.
+ for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
+ I != E; ++I) {
+ const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
+ if (!RC)
+ continue;
+
+ const MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg())
+ continue;
+
+ unsigned Reg = Op.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+ ErrInfo = "inlineasm operand has incorrect register class.";
+ return false;
+ }
+ }
+
+ // None of the remaining checks are applied to inline asm.
+ return true;
+ }
+
+ // Make sure the register classes are correct.
+ for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
+ if (MI.getOperand(i).isFPImm()) {
+ ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
+ "all fp values to integers.";
+ return false;
+ }
+
+ int RegClass = Desc.OpInfo[i].RegClass;
+
+ switch (Desc.OpInfo[i].OperandType) {
+ case MCOI::OPERAND_REGISTER:
+ if (MI.getOperand(i).isImm()) {
+ ErrInfo = "Illegal immediate value for operand.";
+ return false;
+ }
+ break;
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ break;
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+ // These operands accept a register or an inline constant, nothing else.
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
+ ErrInfo = "Illegal immediate value for operand.";
+ return false;
+ }
+ break;
+ }
+ case MCOI::OPERAND_IMMEDIATE:
+ case AMDGPU::OPERAND_KIMM32:
+ // Check if this operand is an immediate.
+ // FrameIndex operands will be replaced by immediates, so they are
+ // allowed.
+ if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
+ ErrInfo = "Expected immediate, but got non-immediate";
+ return false;
+ }
+ LLVM_FALLTHROUGH;
+ default:
+ // Immediate-only and unlisted operand types skip the register class
+ // check below.
+ continue;
+ }
+
+ if (!MI.getOperand(i).isReg())
+ continue;
+
+ if (RegClass != -1) {
+ // Only physical registers are checked against the operand's class.
+ unsigned Reg = MI.getOperand(i).getReg();
+ if (Reg == AMDGPU::NoRegister ||
+ TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+
+ const TargetRegisterClass *RC = RI.getRegClass(RegClass);
+ if (!RC->contains(Reg)) {
+ ErrInfo = "Operand has incorrect register class.";
+ return false;
+ }
+ }
+ }
+
+ // Verify VOP*
+ if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
+ // Only look at the true operands. Only a real operand can use the constant
+ // bus, and we don't want to check pseudo-operands like the source modifier
+ // flags.
+ const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+ unsigned ConstantBusCount = 0;
+
+ // An operand named "imm" counts as one constant bus use.
+ if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
+ ++ConstantBusCount;
+
+ // So does an implicit read of VCC, M0 or FLAT_SCR.
+ unsigned SGPRUsed = findImplicitSGPRRead(MI);
+ if (SGPRUsed != AMDGPU::NoRegister)
+ ++ConstantBusCount;
+
+ for (int OpIdx : OpIndices) {
+ // -1 means the operand is absent; later source operands are not
+ // examined either.
+ if (OpIdx == -1)
+ break;
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
+ if (MO.isReg()) {
+ // A register equal to the most recently seen SGPR is not counted
+ // again.
+ if (MO.getReg() != SGPRUsed)
+ ++ConstantBusCount;
+ SGPRUsed = MO.getReg();
+ } else {
+ ++ConstantBusCount;
+ }
+ }
+ }
+ if (ConstantBusCount > 1) {
+ ErrInfo = "VOP* instruction uses the constant bus more than once";
+ return false;
+ }
+ }
+
+ // Verify misc. restrictions on specific instructions.
+ if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
+ Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
+ const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const MachineOperand &Src1 = MI.getOperand(Src1Idx);
+ const MachineOperand &Src2 = MI.getOperand(Src2Idx);
+ // The constraint is only checked when all three sources are registers.
+ if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
+ if (!compareMachineOp(Src0, Src1) &&
+ !compareMachineOp(Src0, Src2)) {
+ ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
+ return false;
+ }
+ }
+ }
+
+ if (isSOPK(MI)) {
+ // simm16 must fit in 16 bits, unsigned or signed depending on the
+ // instruction.
+ int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
+ if (sopkIsZext(MI)) {
+ if (!isUInt<16>(Imm)) {
+ ErrInfo = "invalid immediate for SOPK instruction";
+ return false;
+ }
+ } else {
+ if (!isInt<16>(Imm)) {
+ ErrInfo = "invalid immediate for SOPK instruction";
+ return false;
+ }
+ }
+ }
+
+ if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
+ Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
+ Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
+ Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
+ const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
+ Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
+
+ const unsigned StaticNumOps = Desc.getNumOperands() +
+ Desc.getNumImplicitUses();
+ // v_movreld needs two extra implicit operands, v_movrels one.
+ const unsigned NumImplicitOps = IsDst ? 2 : 1;
+
+ // Allow additional implicit operands. This allows a fixup done by the post
+ // RA scheduler where the main implicit operand is killed and implicit-defs
+ // are added for sub-registers that remain live after this instruction.
+ if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
+ ErrInfo = "missing implicit register operands";
+ return false;
+ }
+
+ const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (IsDst) {
+ if (!Dst->isUse()) {
+ ErrInfo = "v_movreld_b32 vdst should be a use operand";
+ return false;
+ }
+
+ // The first extra implicit operand must be tied to the one after it.
+ unsigned UseOpIdx;
+ if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
+ UseOpIdx != StaticNumOps + 1) {
+ ErrInfo = "movrel implicit operands should be tied";
+ return false;
+ }
+ }
+
+ // The last required implicit operand is the vector use; vdst (for
+ // v_movreld) or src0 (for v_movrels) must be a sub-register of it.
+ const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const MachineOperand &ImpUse
+ = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
+ if (!ImpUse.isReg() || !ImpUse.isUse() ||
+ !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
+ ErrInfo = "src0 should be subreg of implicit vector use";
+ return false;
+ }
+ }
+
+ // Make sure we aren't losing exec uses in the td files. This mostly requires
+ // being careful when using let Uses to try to add other use registers.
+ if (shouldReadExec(MI)) {
+ if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
+ ErrInfo = "VALU instruction does not implicitly read exec mask";
+ return false;
+ }
+ }
+
+ if (isSMRD(MI)) {
+ if (MI.mayStore()) {
+ // The register offset form of scalar stores may only use m0 as the
+ // soffset register.
+ const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
+ if (Soff && Soff->getReg() != AMDGPU::M0) {
+ ErrInfo = "scalar stores must use m0 as offset register";
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+// Returns the opcode this table associates with the opcode of MI, or
+// AMDGPU::INSTRUCTION_LIST_END when the opcode has no entry.
+//
+// REG_SEQUENCE, COPY, PHI and INSERT_SUBREG map to themselves. S_MOV_B32 maps
+// to COPY when its operand 1 is a register and to V_MOV_B32_e32 otherwise.
+// The remaining entries map S_* opcodes to V_* opcodes, except the two
+// S_CBRANCH_SCC* entries, which map to the S_CBRANCH_VCC* branches.
+// NOTE(review): S_NOT_B64 maps to the 32-bit V_NOT_B32_e32 -- presumably the
+// caller splits the 64-bit operation; confirm against the users of this table.
+unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default: return AMDGPU::INSTRUCTION_LIST_END;
+ case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
+ case AMDGPU::COPY: return AMDGPU::COPY;
+ case AMDGPU::PHI: return AMDGPU::PHI;
+ case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
+ case AMDGPU::S_MOV_B32:
+ return MI.getOperand(1).isReg() ?
+ AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
+ case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
+ case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
+ case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
+ case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
+ case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
+ case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+ case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
+ case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
+ case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
+ case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
+ case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
+ case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
+ case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
+ case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
+ case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
+ case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
+ case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+ case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
+ case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
+ case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
+ case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
+ case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
+ case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
+ case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
+ case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
+ case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
+ case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
+ case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
+ case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
+ case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
+ case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
+ case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
+ case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
+ case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
+ case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
+ case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
+ case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
+ case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
+ case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
+ case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
+ case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
+ }
+}
+
+/// Returns true when getVALUOp() has a replacement opcode for \p MI.
+bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
+  const unsigned VALUOpc = getVALUOp(MI);
+  return VALUOpc != AMDGPU::INSTRUCTION_LIST_END;
+}
+
+/// Returns the register class of operand \p OpNo of \p MI.
+///
+/// When the instruction description supplies a class for that operand it is
+/// used. Otherwise (variadic instruction, operand beyond the description, or
+/// no class recorded) the class is taken from the register itself.
+const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
+                                                      unsigned OpNo) const {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MCInstrDesc &Desc = get(MI.getOpcode());
+
+  const bool HasStaticClass = !MI.isVariadic() &&
+                              OpNo < Desc.getNumOperands() &&
+                              Desc.OpInfo[OpNo].RegClass != -1;
+  if (HasStaticClass) {
+    const unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+    return RI.getRegClass(RCID);
+  }
+
+  const unsigned Reg = MI.getOperand(OpNo).getReg();
+  return TargetRegisterInfo::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
+                                                    : RI.getPhysRegClass(Reg);
+}
+
+/// Returns true when operand \p OpNo of \p MI may be a VGPR.
+///
+/// For COPY, REG_SEQUENCE, PHI and INSERT_SUBREG the answer is derived from
+/// the register class of operand 0 instead of operand \p OpNo.
+bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
+  const unsigned Opc = MI.getOpcode();
+  const bool UsesResultClass = Opc == AMDGPU::COPY ||
+                               Opc == AMDGPU::REG_SEQUENCE ||
+                               Opc == AMDGPU::PHI ||
+                               Opc == AMDGPU::INSERT_SUBREG;
+
+  return RI.hasVGPRs(getOpRegClass(MI, UsesResultClass ? 0 : OpNo));
+}
+
+/// Legalizes operand \p OpIdx of \p MI by moving its current value into a
+/// fresh virtual VGPR right before \p MI and using that register instead.
+///
+/// Register operands are moved with COPY; other operands with S_MOV_B32 when
+/// the operand's class is an SGPR class and with V_MOV_B32_e32 otherwise.
+void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator InsertPt = MI;
+  MachineOperand &MO = MI.getOperand(OpIdx);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  const unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
+  const TargetRegisterClass *RC = RI.getRegClass(RCID);
+
+  unsigned MovOpc;
+  if (MO.isReg())
+    MovOpc = AMDGPU::COPY;
+  else if (RI.isSGPRClass(RC))
+    MovOpc = AMDGPU::S_MOV_B32;
+  else
+    MovOpc = AMDGPU::V_MOV_B32_e32;
+
+  // The new register is VReg_64 when the equivalent VGPR class shares a
+  // sub-class with it, and VGPR_32 in every other case.
+  const TargetRegisterClass *EquivVRC = RI.getEquivalentVGPRClass(RC);
+  const TargetRegisterClass *NewRC =
+      RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, EquivVRC)
+          ? &AMDGPU::VReg_64RegClass
+          : &AMDGPU::VGPR_32RegClass;
+
+  const unsigned NewReg = MRI.createVirtualRegister(NewRC);
+  DebugLoc DL = MBB->findDebugLoc(InsertPt);
+  BuildMI(*MBB, InsertPt, DL, get(MovOpc), NewReg).addOperand(MO);
+  MO.ChangeToRegister(NewReg, false);
+}
+
+/// Emits, before \p MI, the COPYs needed to extract sub-register \p SubIdx of
+/// \p SuperReg into a new virtual register of class \p SubRC and returns that
+/// register.
+unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineOperand &SuperReg,
+                                         const TargetRegisterClass *SuperRC,
+                                         unsigned SubIdx,
+                                         const TargetRegisterClass *SubRC)
+                                         const {
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  const unsigned ResultReg = MRI.createVirtualRegister(SubRC);
+
+  unsigned SrcReg = SuperReg.getReg();
+  if (SuperReg.getSubReg() != AMDGPU::NoSubRegister) {
+    // Just in case the super register is itself a sub-register, copy it to a
+    // new value so we don't need to worry about merging its subreg index with
+    // the SubIdx passed to this function. The register coalescer should be
+    // able to eliminate this extra copy.
+    const unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
+        .addReg(SrcReg, 0, SuperReg.getSubReg());
+    SrcReg = NewSuperReg;
+  }
+
+  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), ResultReg)
+      .addReg(SrcReg, 0, SubIdx);
+
+  return ResultReg;
+}
+
+/// Returns an operand holding part \p SubIdx of \p Op.
+///
+/// For an immediate, sub0 yields the low 32 bits and sub1 the high 32 bits;
+/// other indexes are not handled. For a register, the sub-register is
+/// extracted with buildExtractSubReg() before \p MII.
+MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
+    MachineBasicBlock::iterator MII,
+    MachineRegisterInfo &MRI,
+    MachineOperand &Op,
+    const TargetRegisterClass *SuperRC,
+    unsigned SubIdx,
+    const TargetRegisterClass *SubRC) const {
+  if (!Op.isImm()) {
+    const unsigned SubReg =
+        buildExtractSubReg(MII, MRI, Op, SuperRC, SubIdx, SubRC);
+    return MachineOperand::CreateReg(SubReg, false);
+  }
+
+  const int64_t Imm = Op.getImm();
+  if (SubIdx == AMDGPU::sub0)
+    return MachineOperand::CreateImm(static_cast<int32_t>(Imm));
+  if (SubIdx == AMDGPU::sub1)
+    return MachineOperand::CreateImm(static_cast<int32_t>(Imm >> 32));
+
+  llvm_unreachable("Unhandled register index for immediate");
+}
+
+// Reorders the three explicit operands of Inst from (0, 1, 2) to (0, 2, 1) by
+// removing operand 1 and appending a copy of it at the end.
+void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
+  assert(Inst.getNumExplicitOperands() == 3);
+
+  // Take a copy first: removing the operand invalidates the original.
+  const MachineOperand Moved = Inst.getOperand(1);
+  Inst.RemoveOperand(1);
+  Inst.addOperand(Moved);
+}
+
+/// Returns true when \p MO is a register whose class (after applying its
+/// sub-register index) is acceptable for an operand described by \p OpInfo.
+/// Non-register operands are never legal here.
+bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
+                                    const MCOperandInfo &OpInfo,
+                                    const MachineOperand &MO) const {
+  if (!MO.isReg())
+    return false;
+
+  const unsigned Reg = MO.getReg();
+  const TargetRegisterClass *RC;
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    RC = MRI.getRegClass(Reg);
+  else
+    RC = RI.getPhysRegClass(Reg);
+
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+  RC = TRI->getSubRegClass(RC, MO.getSubReg());
+
+  // In order to be legal, the common sub-class must be equal to the
+  // class of the current operand.  For example:
+  //
+  // v_mov_b32 s0 ; Operand defined as vsrc_b32
+  //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
+  //
+  // s_sendmsg 0, s0 ; Operand defined as m0reg
+  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+  const TargetRegisterClass *ExpectedRC = RI.getRegClass(OpInfo.RegClass);
+  return RI.getCommonSubClass(RC, ExpectedRC) == RC;
+}
+
+/// Returns true when \p MO is legal for the operand described by \p OpInfo.
+/// Registers are checked with isLegalRegOperand(); immediates, target indexes
+/// and frame indexes are always accepted.
+bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+                                     const MCOperandInfo &OpInfo,
+                                     const MachineOperand &MO) const {
+  if (!MO.isReg()) {
+    // Handle non-register types that are treated like immediates.
+    assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+    return true;
+  }
+
+  return isLegalRegOperand(MRI, OpInfo, MO);
+}
+
+/// Returns true when \p MO would be a legal operand at position \p OpIdx of
+/// \p MI. When \p MO is null, the operand currently at \p OpIdx is checked.
+///
+/// For VALU instructions an operand that uses the constant bus is rejected if
+/// any other operand already uses it through a different register or if the
+/// instruction has a KIMM32 operand. Registers must additionally match the
+/// operand's register class; immediate-like operands are accepted when the
+/// slot has no register class, and otherwise checked by isImmOperandLegal().
+bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
+                                 const MachineOperand *MO) const {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MCInstrDesc &InstDesc = MI.getDesc();
+  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
+  const TargetRegisterClass *DefinedRC =
+      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
+  if (!MO)
+    MO = &MI.getOperand(OpIdx);
+
+  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
+
+    RegSubRegPair SGPRUsed;
+    if (MO->isReg())
+      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
+
+    // MI may carry more operands (implicit ones) than the description has
+    // OpInfo entries, so InstDesc.OpInfo must not be indexed past this count.
+    const unsigned NumDescOps = InstDesc.getNumOperands();
+
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      if (i == OpIdx)
+        continue;
+      const bool HasOpInfo = i < NumDescOps;
+      const MachineOperand &Op = MI.getOperand(i);
+      if (Op.isReg()) {
+        // usesConstantBus() only consults the operand info for immediates, so
+        // for a register operand without its own entry any valid entry can be
+        // passed without changing the result.
+        const MCOperandInfo &Info = HasOpInfo ? InstDesc.OpInfo[i] : OpInfo;
+        if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
+            usesConstantBus(MRI, Op, Info)) {
+          return false;
+        }
+      } else if (HasOpInfo &&
+                 InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
+        return false;
+      }
+    }
+  }
+
+  if (MO->isReg()) {
+    assert(DefinedRC);
+    return isLegalRegOperand(MRI, OpInfo, *MO);
+  }
+
+  // Handle non-register types that are treated like immediates.
+  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+
+  if (!DefinedRC) {
+    // This operand expects an immediate.
+    return true;
+  }
+
+  return isImmOperandLegal(MI, OpIdx, *MO);
+}
+
+/// Makes the source operands of the VOP2 instruction \p MI legal.
+///
+/// src0 is moved to a VGPR when it is an SGPR and the instruction also reads
+/// an SGPR implicitly. An illegal src1 is fixed by commuting the instruction
+/// when that yields legal operands, and by inserting a move otherwise.
+void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
+                                       MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  const MCInstrDesc &InstrDesc = get(Opc);
+
+  const int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+  MachineOperand &Src1 = MI.getOperand(Src1Idx);
+  const MCOperandInfo &Src1Info = InstrDesc.OpInfo[Src1Idx];
+
+  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
+  // we need to only have one constant bus use.
+  //
+  // Note we do not need to worry about literal constants here. They are
+  // disabled for the operand type for instructions because they will always
+  // violate the one constant bus use rule.
+  const bool HasImplicitSGPR =
+      findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
+  if (HasImplicitSGPR) {
+    const int ImpSrc0Idx =
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+    MachineOperand &ImpSrc0 = MI.getOperand(ImpSrc0Idx);
+
+    if (ImpSrc0.isReg() && RI.isSGPRReg(MRI, ImpSrc0.getReg()))
+      legalizeOpWithMove(MI, ImpSrc0Idx);
+  }
+
+  // VOP2 src0 instructions support all operand types, so we don't need to check
+  // their legality. If src1 is already legal, we don't need to do anything.
+  if (isLegalRegOperand(MRI, Src1Info, Src1))
+    return;
+
+  // We do not use commuteInstruction here because it is too aggressive and will
+  // commute if it is possible. We only want to commute here if it improves
+  // legality. This can be called a fairly large number of times so don't waste
+  // compile time pointlessly swapping and checking legality again.
+  const bool MayCommute = !HasImplicitSGPR && MI.isCommutable();
+  if (!MayCommute) {
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
+  }
+
+  const int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
+
+  // If src0 can be used as src1, commuting will make the operands legal.
+  // Otherwise we have to give up and insert a move.
+  //
+  // TODO: Other immediate-like operand kinds could be commuted if there was a
+  // MachineOperand::ChangeTo* for them.
+  const bool SwapIsLegal = (Src1.isImm() || Src1.isReg()) &&
+                           isLegalRegOperand(MRI, Src1Info, Src0);
+  const int CommutedOpc = SwapIsLegal ? commuteOpcode(MI) : -1;
+  if (CommutedOpc == -1) {
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
+  }
+
+  MI.setDesc(get(CommutedOpc));
+
+  // Remember src0 before it is overwritten with the old src1.
+  const unsigned OldSrc0Reg = Src0.getReg();
+  const unsigned OldSrc0SubReg = Src0.getSubReg();
+  const bool OldSrc0Kill = Src0.isKill();
+
+  if (Src1.isImm()) {
+    Src0.ChangeToImmediate(Src1.getImm());
+  } else if (Src1.isReg()) {
+    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
+    Src0.setSubReg(Src1.getSubReg());
+  } else {
+    llvm_unreachable("Should only have register or immediate operands");
+  }
+
+  Src1.ChangeToRegister(OldSrc0Reg, false, false, OldSrc0Kill);
+  Src1.setSubReg(OldSrc0SubReg);
+}
+
+ // Legalize VOP3 operands. Every operand slot accepts every operand type, and
+ // literal constants are not allowed and should never be seen, so the only
+ // thing to fix is the use of more than one distinct SGPR: extra SGPR operands
+ // are replaced through a move.
+ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
+                                        MachineInstr &MI) const {
+   const unsigned Opcode = MI.getOpcode();
+
+   int SrcIdx[3] = {
+     AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0),
+     AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1),
+     AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)
+   };
+
+   // The single SGPR this instruction is allowed to keep.
+   unsigned AllowedSGPR = findUsedSGPR(MI, SrcIdx);
+
+   for (int Idx : SrcIdx) {
+     if (Idx == -1)
+       break;
+
+     MachineOperand &MO = MI.getOperand(Idx);
+
+     // Non-register operands are skipped (we should never see a VOP3
+     // instruction with an illegal immediate operand), and VGPRs are always
+     // legal.
+     const bool IsSGPR =
+         MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
+     if (!IsSGPR)
+       continue;
+
+     const unsigned Reg = MO.getReg();
+     if (AllowedSGPR == AMDGPU::NoRegister) {
+       // No SGPR chosen yet: this one becomes the allowed SGPR.
+       AllowedSGPR = Reg;
+       continue;
+     }
+
+     // A second, different SGPR is not legal and has to be legalized.
+     if (AllowedSGPR != Reg)
+       legalizeOpWithMove(MI, Idx);
+   }
+ }
+
+ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+                                          MachineRegisterInfo &MRI) const {
+   // Copy SrcReg into a new virtual register of the equivalent SGPR class by
+   // emitting one V_READFIRSTLANE_B32 per 32-bit channel in front of UseMI and
+   // recombining the pieces with a REG_SEQUENCE. Returns the new register.
+   const TargetRegisterClass *VectorRC = MRI.getRegClass(SrcReg);
+   const TargetRegisterClass *ScalarRC = RI.getEquivalentSGPRClass(VectorRC);
+   const unsigned Result = MRI.createVirtualRegister(ScalarRC);
+   const unsigned NumChannels = VectorRC->getSize() / 4;
+
+   MachineBasicBlock &MBB = *UseMI.getParent();
+   const DebugLoc &DL = UseMI.getDebugLoc();
+
+   SmallVector<unsigned, 8> Pieces;
+   for (unsigned Chan = 0; Chan != NumChannels; ++Chan) {
+     const unsigned Piece =
+         MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+     BuildMI(MBB, UseMI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Piece)
+         .addReg(SrcReg, 0, RI.getSubRegFromChannel(Chan));
+     Pieces.push_back(Piece);
+   }
+
+   MachineInstrBuilder Seq =
+       BuildMI(MBB, UseMI, DL, get(AMDGPU::REG_SEQUENCE), Result);
+   for (unsigned Chan = 0; Chan != NumChannels; ++Chan)
+     Seq.addReg(Pieces[Chan]).addImm(RI.getSubRegFromChannel(Chan));
+
+   return Result;
+ }
+
+ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+                                        MachineInstr &MI) const {
+   // If the pointer is stored in VGPRs, it has to be moved to SGPRs using
+   // v_readfirstlane. This is safe because we only select loads with uniform
+   // pointers to SMRD instructions, so the pointer value is known to be
+   // uniform.
+   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
+   if (!SBase)
+     return;
+
+   // Already in an SGPR class: nothing to do.
+   if (RI.isSGPRClass(MRI.getRegClass(SBase->getReg())))
+     return;
+
+   const unsigned ScalarBase = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+   SBase->setReg(ScalarBase);
+ }
+
+ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
+                                          MachineBasicBlock::iterator I,
+                                          const TargetRegisterClass *DstRC,
+                                          MachineOperand &Op,
+                                          MachineRegisterInfo &MRI,
+                                          const DebugLoc &DL) const {
+   // Rewrite Op so that it reads a register of class DstRC, inserting a COPY
+   // at position I of InsertMBB when the current class differs.
+   const unsigned SrcReg = Op.getReg();
+   const unsigned SrcSubReg = Op.getSubReg();
+
+   const TargetRegisterClass *SrcRC =
+       RI.getSubClassWithSubReg(RI.getRegClassForReg(MRI, SrcReg), SrcSubReg);
+
+   // The operand already has the requested register class.
+   if (SrcRC == DstRC)
+     return;
+
+   const unsigned NewReg = MRI.createVirtualRegister(DstRC);
+   MachineInstr *NewCopy =
+       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), NewReg).addOperand(Op);
+
+   Op.setReg(NewReg);
+   Op.setSubReg(0);
+
+   // Try to eliminate the copy if it is copying an immediate value.
+   MachineInstr *SrcDef = MRI.getVRegDef(SrcReg);
+   if (SrcDef && SrcDef->isMoveImmediate())
+     FoldImmediate(*NewCopy, *SrcDef, SrcReg, &MRI);
+ }
+
+ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Dispatch on the kind of MI; most cases below fix the operands and return.
+ // Legalize VOP2
+ if (isVOP2(MI) || isVOPC(MI)) {
+ legalizeOperandsVOP2(MRI, MI);
+ return;
+ }
+
+ // Legalize VOP3
+ if (isVOP3(MI)) {
+ legalizeOperandsVOP3(MRI, MI);
+ return;
+ }
+
+ // Legalize SMRD
+ if (isSMRD(MI)) {
+ legalizeOperandsSMRD(MRI, MI);
+ return;
+ }
+
+ // Legalize REG_SEQUENCE and PHI
+ // The register class of the operands must be the same type as the register
+ // class of the output.
+ if (MI.getOpcode() == AMDGPU::PHI) {
+ const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { // Operands are visited in steps of two, starting at 1.
+ if (!MI.getOperand(i).isReg() ||
+ !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+ continue;
+ const TargetRegisterClass *OpRC =
+ MRI.getRegClass(MI.getOperand(i).getReg());
+ if (RI.hasVGPRs(OpRC)) {
+ VRC = OpRC;
+ } else {
+ SRC = OpRC;
+ }
+ }
+
+ // If any of the operands are VGPR registers, then they all must be
+ // otherwise we will create illegal VGPR->SGPR copies when legalizing
+ // them.
+ if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
+ if (!VRC) {
+ assert(SRC);
+ VRC = RI.getEquivalentVGPRClass(SRC);
+ }
+ RC = VRC;
+ } else {
+ RC = SRC;
+ }
+
+ // Update all the operands so they have the same type.
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ continue;
+
+ // MI is a PHI instruction.
+ MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
+ MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
+
+ // Avoid creating no-op copies with the same src and dst reg class. These
+ // confuse some of the machine passes.
+ legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
+ }
+ } // The PHI case has no early return; control continues with the checks below.
+
+ // REG_SEQUENCE doesn't really require operand legalization, but if one has a
+ // VGPR dest type and SGPR sources, insert copies so all operands are
+ // VGPRs. This seems to help operand folding / the register coalescer.
+ if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
+ if (RI.hasVGPRs(DstRC)) {
+ // Update all the operands so they are VGPR register classes. These may
+ // not be the same register class because REG_SEQUENCE supports mixing
+ // subregister index types e.g. sub0_sub1 + sub2 + sub3
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ continue;
+
+ const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
+ const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
+ if (VRC == OpRC)
+ continue;
+
+ legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
+ Op.setIsKill();
+ }
+ }
+
+ return;
+ }
+
+ // Legalize INSERT_SUBREG
+ // src0 must have the same register class as dst
+ if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src0 = MI.getOperand(1).getReg();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
+ if (DstRC != Src0RC) {
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineOperand &Op = MI.getOperand(1);
+ legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
+ }
+ return;
+ }
+
+ // Legalize MIMG and MUBUF/MTBUF for shaders.
+ //
+ // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
+ // scratch memory access. In both cases, the legalization never involves
+ // conversion to the addr64 form.
+ if (isMIMG(MI) ||
+ (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
+ (isMUBUF(MI) || isMTBUF(MI)))) {
+ MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
+ if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
+ SRsrc->setReg(SGPR);
+ }
+
+ MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
+ if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
+ SSamp->setReg(SGPR);
+ }
+ return;
+ }
+
+ // Legalize MUBUF* instructions by converting to addr64 form.
+ // FIXME: If we start using the non-addr64 instructions for compute, we
+ // may need to legalize them as above. This especially applies to the
+ // buffer_load_format_* variants and variants with idxen (or bothen).
+ int SRsrcIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
+ if (SRsrcIdx != -1) {
+ // We have an MUBUF instruction
+ MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
+ unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
+ if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
+ RI.getRegClass(SRsrcRC))) {
+ // The operands are legal.
+ // FIXME: We may need to legalize operands besides srsrc.
+ return;
+ }
+
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ // Extract the ptr from the resource descriptor.
+ unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
+ &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
+
+ // Create an empty resource descriptor
+ unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
+
+ // Zero64 = 0
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
+ .addImm(0);
+
+ // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
+ .addImm(RsrcDataFormat & 0xFFFFFFFF);
+
+ // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
+ .addImm(RsrcDataFormat >> 32);
+
+ // NewSRsrc = {Zero64, SRsrcFormat}
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+ .addReg(Zero64)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SRsrcFormatLo)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcFormatHi)
+ .addImm(AMDGPU::sub3);
+
+ MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
+ unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ if (VAddr) {
+ // This is already an ADDR64 instruction so we need to add the pointer
+ // extracted from the resource descriptor to the current value of VAddr.
+ unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
+ DebugLoc DL = MI.getDebugLoc();
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+
+ // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+
+ // NewVaddr = {NewVaddrHi, NewVaddrLo}
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+ .addReg(NewVAddrLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(NewVAddrHi)
+ .addImm(AMDGPU::sub1);
+ } else {
+ // This instruction is the _OFFSET variant, so we need to convert it to
+ // ADDR64.
+ assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
+ < SISubtarget::VOLCANIC_ISLANDS &&
+ "FIXME: Need to emit flat atomics here");
+
+ MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
+ MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
+ MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+ unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
+
+ // Atomics with return have an additional tied operand and are
+ // missing some of the special bits.
+ MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
+ MachineInstr *Addr64;
+
+ if (!VDataIn) {
+ // Regular buffer load / store.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset);
+
+ // Atomics do not have this operand.
+ if (const MachineOperand *GLC =
+ getNamedOperand(MI, AMDGPU::OpName::glc)) {
+ MIB.addImm(GLC->getImm());
+ }
+
+ MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
+
+ if (const MachineOperand *TFE =
+ getNamedOperand(MI, AMDGPU::OpName::tfe)) {
+ MIB.addImm(TFE->getImm());
+ }
+
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ Addr64 = MIB;
+ } else {
+ // Atomics with return.
+ Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addOperand(*VDataIn)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset)
+ .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
+ .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ }
+
+ MI.removeFromParent(); // From here on only Addr64 is used in place of MI.
+
+ // NewVaddr = {SRsrcPtr:sub1, SRsrcPtr:sub0}
+ BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+ NewVAddr)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addImm(AMDGPU::sub1);
+
+ VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
+ SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
+ }
+
+ // Update the instruction to use NewVaddr
+ VAddr->setReg(NewVAddr);
+ // Update the instruction to use NewSRsrc
+ SRsrc->setReg(NewSRsrc);
+ }
+ }
+
+ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+ SmallVector<MachineInstr *, 128> Worklist;
+ Worklist.push_back(&TopInst);
+ // Process TopInst and every instruction that gets queued while handling it.
+ while (!Worklist.empty()) {
+ MachineInstr &Inst = *Worklist.pop_back_val();
+ MachineBasicBlock *MBB = Inst.getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ unsigned Opcode = Inst.getOpcode();
+ unsigned NewOpcode = getVALUOp(Inst); // May be overridden by the special cases below.
+
+ // Handle some special cases
+ switch (Opcode) {
+ default:
+ break;
+ case AMDGPU::S_AND_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_OR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_XOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NOT_B64:
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_BCNT1_I32_B64:
+ splitScalar64BitBCNT(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_BFE_I64: {
+ splitScalar64BitBFE(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
+ }
+ // The shift cases below switch to the *REV opcode and swap the operands on VOLCANIC_ISLANDS and newer.
+ case AMDGPU::S_LSHL_B32:
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_ASHR_I32:
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHR_B32:
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHL_B64:
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHLREV_B64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_ASHR_I64:
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_ASHRREV_I64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHR_B64:
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHRREV_B64;
+ swapOperands(Inst);
+ }
+ break;
+
+ case AMDGPU::S_ABS_I32:
+ lowerScalarAbs(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_CBRANCH_SCC0:
+ case AMDGPU::S_CBRANCH_SCC1:
+ // Clear unused bits of vcc
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
+ break;
+
+ case AMDGPU::S_BFE_U64:
+ case AMDGPU::S_BFM_B64:
+ llvm_unreachable("Moving this op to VALU not implemented");
+ }
+
+ if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
+ // We cannot move this instruction to the VALU, so we should try to
+ // legalize its operands instead.
+ legalizeOperands(Inst);
+ continue;
+ }
+
+ // Use the new VALU Opcode.
+ const MCInstrDesc &NewDesc = get(NewOpcode);
+ Inst.setDesc(NewDesc);
+
+ // Remove any references to SCC. Vector instructions can't read from it, and
+ // We're just about to add the implicit use / defs of VCC, and we don't want
+ // both.
+ for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { // Walk backwards so that removing operand i leaves lower indices intact.
+ MachineOperand &Op = Inst.getOperand(i);
+ if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
+ Inst.RemoveOperand(i);
+ addSCCDefUsersToVALUWorklist(Inst, Worklist);
+ }
+ }
+
+ if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+ // We are converting these to a BFE, so we need to add the missing
+ // operands for the size and offset.
+ unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+ Inst.addOperand(MachineOperand::CreateImm(0));
+ Inst.addOperand(MachineOperand::CreateImm(Size));
+
+ } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+ // The VALU version adds the second operand to the result, so insert an
+ // extra 0 operand.
+ Inst.addOperand(MachineOperand::CreateImm(0));
+ }
+
+ Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
+
+ if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+ const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
+ // If we need to move this to VGPRs, we need to unpack the second operand
+ // back into the 2 separate ones for bit offset and width.
+ assert(OffsetWidthOp.isImm() &&
+ "Scalar BFE is only implemented for constant width and offset");
+ uint32_t Imm = OffsetWidthOp.getImm();
+
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+ Inst.RemoveOperand(2); // Remove old immediate.
+ Inst.addOperand(MachineOperand::CreateImm(Offset));
+ Inst.addOperand(MachineOperand::CreateImm(BitWidth));
+ }
+
+ bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
+ unsigned NewDstReg = AMDGPU::NoRegister;
+ if (HasDst) {
+ // Update the destination register class.
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+ if (!NewDstRC)
+ continue;
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ if (Inst.isCopy() &&
+ TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
+ NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
+ // Instead of creating a copy where src and dst are the same register
+ // class, we just replace all uses of dst with src. These kinds of
+ // copies interfere with the heuristics MachineSink uses to decide
+ // whether or not to split a critical edge. Since the pass assumes
+ // that copies will end up as machine instructions and not be
+ // eliminated.
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
+ MRI.clearKillFlags(Inst.getOperand(1).getReg());
+ Inst.getOperand(0).setReg(DstReg); // Put the original DstReg back on the copy's own def operand.
+ continue;
+ }
+
+ NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ }
+
+ // Legalize the operands
+ legalizeOperands(Inst);
+
+ if (HasDst)
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ }
+ }
+
+ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+                                  MachineInstr &Inst) const {
+   // Emit V_SUB_I32 (0 - src) followed by V_MAX_I32 (src, 0 - src) in front of
+   // Inst, redirect all uses of Inst's destination to the result, and queue
+   // the users that need to be moved as well. Inst itself is left in place.
+   MachineBasicBlock &MBB = *Inst.getParent();
+   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+   MachineBasicBlock::iterator InsertPt = Inst;
+   DebugLoc DL = Inst.getDebugLoc();
+
+   const unsigned OldDstReg = Inst.getOperand(0).getReg();
+   const unsigned SrcReg = Inst.getOperand(1).getReg();
+
+   const unsigned NegReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+   const unsigned AbsReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+   // NegReg = 0 - Src
+   BuildMI(MBB, InsertPt, DL, get(AMDGPU::V_SUB_I32_e32), NegReg)
+       .addImm(0)
+       .addReg(SrcReg);
+
+   // AbsReg = max(Src, NegReg)
+   BuildMI(MBB, InsertPt, DL, get(AMDGPU::V_MAX_I32_e64), AbsReg)
+       .addReg(SrcReg)
+       .addReg(NegReg);
+
+   MRI.replaceRegWith(OldDstReg, AbsReg);
+   addUsersToMoveToVALUWorklist(AbsReg, MRI, Worklist);
+ }
+
+ void SIInstrInfo::splitScalar64BitUnaryOp(
+     SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+     unsigned Opcode) const {
+   // Replace a 64-bit scalar unary operation by two instructions with the
+   // given Opcode, one on sub0 and one on sub1 of the source, and join the two
+   // results with a REG_SEQUENCE. Inst itself is left in place.
+   MachineBasicBlock &MBB = *Inst.getParent();
+   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+   MachineBasicBlock::iterator InsertPt = Inst;
+   DebugLoc DL = Inst.getDebugLoc();
+
+   MachineOperand &Dest = Inst.getOperand(0);
+   MachineOperand &Src0 = Inst.getOperand(1);
+
+   const MCInstrDesc &HalfDesc = get(Opcode);
+
+   const TargetRegisterClass *Src0RC = &AMDGPU::SGPR_32RegClass;
+   if (Src0.isReg())
+     Src0RC = MRI.getRegClass(Src0.getReg());
+   const TargetRegisterClass *Src0SubRC =
+       RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+
+   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+   const TargetRegisterClass *NewDestSubRC =
+       RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
+
+   // Emits the operation for one half and returns the register it defines.
+   auto EmitHalf = [&](unsigned SubIdx) -> unsigned {
+     MachineOperand SrcHalf = buildExtractSubRegOrImm(InsertPt, MRI, Src0,
+                                                      Src0RC, SubIdx, Src0SubRC);
+     unsigned DstHalf = MRI.createVirtualRegister(NewDestSubRC);
+     BuildMI(MBB, InsertPt, DL, HalfDesc, DstHalf)
+         .addOperand(SrcHalf);
+     return DstHalf;
+   };
+
+   const unsigned LoReg = EmitHalf(AMDGPU::sub0);
+   const unsigned HiReg = EmitHalf(AMDGPU::sub1);
+
+   const unsigned JoinedReg = MRI.createVirtualRegister(NewDestRC);
+   BuildMI(MBB, InsertPt, DL, get(TargetOpcode::REG_SEQUENCE), JoinedReg)
+       .addReg(LoReg)
+       .addImm(AMDGPU::sub0)
+       .addReg(HiReg)
+       .addImm(AMDGPU::sub1);
+
+   MRI.replaceRegWith(Dest.getReg(), JoinedReg);
+
+   // No legalizeOperands call is needed: with a single operand, src0 supports
+   // any kind of input.
+
+   // Move all users of this moved value.
+   addUsersToMoveToVALUWorklist(JoinedReg, MRI, Worklist);
+ }
+
+ void SIInstrInfo::splitScalar64BitBinaryOp(
+     SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+     unsigned Opcode) const {
+   // Replace a 64-bit scalar binary operation by two instructions with the
+   // given Opcode, one on the sub0 halves and one on the sub1 halves of the
+   // sources, and join the results with a REG_SEQUENCE. Inst itself is left in
+   // place.
+   MachineBasicBlock &MBB = *Inst.getParent();
+   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+   MachineBasicBlock::iterator InsertPt = Inst;
+   DebugLoc DL = Inst.getDebugLoc();
+
+   MachineOperand &Dest = Inst.getOperand(0);
+   MachineOperand &Src0 = Inst.getOperand(1);
+   MachineOperand &Src1 = Inst.getOperand(2);
+
+   const MCInstrDesc &HalfDesc = get(Opcode);
+
+   const TargetRegisterClass *Src0RC = &AMDGPU::SGPR_32RegClass;
+   if (Src0.isReg())
+     Src0RC = MRI.getRegClass(Src0.getReg());
+   const TargetRegisterClass *Src0SubRC =
+       RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+
+   const TargetRegisterClass *Src1RC = &AMDGPU::SGPR_32RegClass;
+   if (Src1.isReg())
+     Src1RC = MRI.getRegClass(Src1.getReg());
+   const TargetRegisterClass *Src1SubRC =
+       RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+
+   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+   const TargetRegisterClass *NewDestSubRC =
+       RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
+
+   // Emits the operation for one half; the defined register is stored in
+   // DstHalf and the new instruction is returned.
+   auto EmitHalf = [&](unsigned SubIdx, unsigned &DstHalf) -> MachineInstr & {
+     MachineOperand Lhs = buildExtractSubRegOrImm(InsertPt, MRI, Src0, Src0RC,
+                                                  SubIdx, Src0SubRC);
+     MachineOperand Rhs = buildExtractSubRegOrImm(InsertPt, MRI, Src1, Src1RC,
+                                                  SubIdx, Src1SubRC);
+     DstHalf = MRI.createVirtualRegister(NewDestSubRC);
+     return *BuildMI(MBB, InsertPt, DL, HalfDesc, DstHalf)
+                 .addOperand(Lhs)
+                 .addOperand(Rhs);
+   };
+
+   unsigned LoReg = AMDGPU::NoRegister;
+   MachineInstr &LoInst = EmitHalf(AMDGPU::sub0, LoReg);
+
+   unsigned HiReg = AMDGPU::NoRegister;
+   MachineInstr &HiInst = EmitHalf(AMDGPU::sub1, HiReg);
+
+   const unsigned JoinedReg = MRI.createVirtualRegister(NewDestRC);
+   BuildMI(MBB, InsertPt, DL, get(TargetOpcode::REG_SEQUENCE), JoinedReg)
+       .addReg(LoReg)
+       .addImm(AMDGPU::sub0)
+       .addReg(HiReg)
+       .addImm(AMDGPU::sub1);
+
+   MRI.replaceRegWith(Dest.getReg(), JoinedReg);
+
+   // Try to legalize the operands in case the order has to be swapped to keep
+   // the new instructions valid.
+   legalizeOperands(LoInst);
+   legalizeOperands(HiInst);
+
+   // Move all users of this moved value.
+   addUsersToMoveToVALUWorklist(JoinedReg, MRI, Worklist);
+ }
+
+ void SIInstrInfo::splitScalar64BitBCNT(
+     SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
+   // Replace a 64-bit scalar bit count by two chained V_BCNT_U32_B32
+   // instructions: the first takes sub0 of the source and 0, the second takes
+   // sub1 of the source and the first result. Inst itself is left in place.
+   MachineBasicBlock &MBB = *Inst.getParent();
+   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+   MachineBasicBlock::iterator InsertPt = Inst;
+   DebugLoc DL = Inst.getDebugLoc();
+
+   MachineOperand &Dest = Inst.getOperand(0);
+   MachineOperand &Src = Inst.getOperand(1);
+
+   const MCInstrDesc &BcntDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
+
+   const TargetRegisterClass *SrcRC = &AMDGPU::SGPR_32RegClass;
+   if (Src.isReg())
+     SrcRC = MRI.getRegClass(Src.getReg());
+
+   const unsigned LoCountReg =
+       MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+   const unsigned TotalReg =
+       MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
+
+   MachineOperand SrcLo = buildExtractSubRegOrImm(InsertPt, MRI, Src, SrcRC,
+                                                  AMDGPU::sub0, SrcSubRC);
+   MachineOperand SrcHi = buildExtractSubRegOrImm(InsertPt, MRI, Src, SrcRC,
+                                                  AMDGPU::sub1, SrcSubRC);
+
+   BuildMI(MBB, InsertPt, DL, BcntDesc, LoCountReg)
+       .addOperand(SrcLo)
+       .addImm(0);
+
+   BuildMI(MBB, InsertPt, DL, BcntDesc, TotalReg)
+       .addOperand(SrcHi)
+       .addReg(LoCountReg);
+
+   MRI.replaceRegWith(Dest.getReg(), TotalReg);
+
+   // No operand legalization is needed here. src0 for either instruction can
+   // be an SGPR, and the second input is unused or determined here.
+   addUsersToMoveToVALUWorklist(TotalReg, MRI, Worklist);
+ }
+
+ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+                                       MachineInstr &Inst) const {
+   // Lower S_BFE_I64 with offset 0 and width <= 32 to 32-bit vector
+   // instructions joined by a REG_SEQUENCE. Inst itself is left in place.
+   MachineBasicBlock &MBB = *Inst.getParent();
+   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+   MachineBasicBlock::iterator InsertPt = Inst;
+   DebugLoc DL = Inst.getDebugLoc();
+
+   MachineOperand &Dest = Inst.getOperand(0);
+   MachineOperand &Src = Inst.getOperand(1);
+
+   const uint32_t Packed = Inst.getOperand(2).getImm();
+   const uint32_t Offset = Packed & 0x3f;                // Bits [5:0].
+   const uint32_t BitWidth = (Packed & 0x7f0000) >> 16;  // Bits [22:16].
+
+   (void) Offset;
+
+   // Only sext_inreg cases handled.
+   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
+          Offset == 0 && "Not implemented");
+
+   unsigned ResultReg = AMDGPU::NoRegister;
+   if (BitWidth < 32) {
+     const unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+     const unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+     ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+     // LoReg = V_BFE_I32 of the low source half with offset 0.
+     BuildMI(MBB, InsertPt, DL, get(AMDGPU::V_BFE_I32), LoReg)
+         .addReg(Src.getReg(), 0, AMDGPU::sub0)
+         .addImm(0)
+         .addImm(BitWidth);
+
+     // HiReg = LoReg arithmetically shifted right by 31.
+     BuildMI(MBB, InsertPt, DL, get(AMDGPU::V_ASHRREV_I32_e32), HiReg)
+         .addImm(31)
+         .addReg(LoReg);
+
+     BuildMI(MBB, InsertPt, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+         .addReg(LoReg)
+         .addImm(AMDGPU::sub0)
+         .addReg(HiReg)
+         .addImm(AMDGPU::sub1);
+   } else {
+     // Width of exactly 32: the low half is used unchanged and the high half
+     // is the low half arithmetically shifted right by 31.
+     const unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+     ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+     BuildMI(MBB, InsertPt, DL, get(AMDGPU::V_ASHRREV_I32_e64), HiReg)
+         .addImm(31)
+         .addReg(Src.getReg(), 0, AMDGPU::sub0);
+
+     BuildMI(MBB, InsertPt, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+         .addReg(Src.getReg(), 0, AMDGPU::sub0)
+         .addImm(AMDGPU::sub0)
+         .addReg(HiReg)
+         .addImm(AMDGPU::sub1);
+   }
+
+   MRI.replaceRegWith(Dest.getReg(), ResultReg);
+   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ }
+
+ void SIInstrInfo::addUsersToMoveToVALUWorklist(
+     unsigned DstReg,
+     MachineRegisterInfo &MRI,
+     SmallVectorImpl<MachineInstr *> &Worklist) const {
+   // Queue every instruction that uses DstReg in an operand slot that cannot
+   // read a VGPR.
+   MachineRegisterInfo::use_iterator Use = MRI.use_begin(DstReg);
+   const MachineRegisterInfo::use_iterator End = MRI.use_end();
+
+   while (Use != End) {
+     MachineInstr &User = *Use->getParent();
+
+     if (canReadVGPR(User, Use.getOperandNo())) {
+       ++Use;
+       continue;
+     }
+
+     Worklist.push_back(&User);
+
+     // Step past this use and any directly following uses that belong to the
+     // same instruction, so that run queues the instruction only once.
+     for (++Use; Use != End && Use->getParent() == &User; ++Use) {
+     }
+   }
+ }
+
+ void SIInstrInfo::addSCCDefUsersToVALUWorklist(
+     MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
+   // Scan from SCCDefInst to the end of its block and queue every instruction
+   // that uses SCC, stopping at the first instruction that defines SCC.
+   //
+   // This assumes that all the users of SCC are in the same block as the SCC
+   // def.
+   MachineBasicBlock::iterator It(SCCDefInst);
+   const MachineBasicBlock::iterator BlockEnd = SCCDefInst.getParent()->end();
+
+   for (; It != BlockEnd; ++It) {
+     MachineInstr &Candidate = *It;
+
+     // Exit if we find another SCC def.
+     const bool DefinesSCC =
+         Candidate.findRegisterDefOperandIdx(AMDGPU::SCC) != -1;
+     if (DefinesSCC)
+       return;
+
+     const bool UsesSCC = Candidate.findRegisterUseOperandIdx(AMDGPU::SCC) != -1;
+     if (UsesSCC)
+       Worklist.push_back(&Candidate);
+   }
+ }
+
+ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
+     const MachineInstr &Inst) const {
+   // Returns the register class the destination (operand 0) of Inst should
+   // have once the instruction is moved to the VALU, or nullptr for
+   // COPY/PHI/REG_SEQUENCE/INSERT_SUBREG when the destination class already
+   // has VGPRs or no equivalent VGPR class exists.
+   const TargetRegisterClass *DstRC = getOpRegClass(Inst, 0);
+
+   switch (Inst.getOpcode()) {
+   case AMDGPU::COPY:
+   case AMDGPU::PHI:
+   case AMDGPU::REG_SEQUENCE:
+   case AMDGPU::INSERT_SUBREG:
+     // For these instructions getOpRegClass just returns the virtual register
+     // class associated with the operand, so an equivalent VGPR register class
+     // has to be found below in order to move the instruction to the VALU.
+     break;
+   default:
+     return DstRC;
+   }
+
+   if (RI.hasVGPRs(DstRC))
+     return nullptr;
+
+   // A null result from the lookup is passed through unchanged.
+   return RI.getEquivalentVGPRClass(DstRC);
+ }
+
+ // Find the one SGPR operand we are allowed to use. Returns
+ // AMDGPU::NoRegister when no operand is singled out.
+ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
+                                    int OpIndices[3]) const {
+   const MCInstrDesc &Desc = MI.getDesc();
+
+   // The instruction's own operand requirements are considered first. Some
+   // operands are required to be SGPRs, such as implicit uses of VCC, but we
+   // are still bound by the constant bus requirement to only use one.
+   //
+   // If the operand's class is an SGPR, we can never move it.
+   const unsigned ImplicitSGPR = findImplicitSGPRRead(MI);
+   if (ImplicitSGPR != AMDGPU::NoRegister)
+     return ImplicitSGPR;
+
+   unsigned Candidates[3] = { AMDGPU::NoRegister };
+   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+   for (unsigned Slot = 0; Slot != 3; ++Slot) {
+     const int Idx = OpIndices[Slot];
+     if (Idx == -1)
+       break;
+
+     const MachineOperand &MO = MI.getOperand(Idx);
+     if (!MO.isReg())
+       continue;
+
+     // An operand that is statically required to be an SGPR by the operand
+     // constraints wins immediately.
+     const TargetRegisterClass *StaticRC =
+         RI.getRegClass(Desc.OpInfo[Idx].RegClass);
+     if (RI.isSGPRClass(StaticRC))
+       return MO.getReg();
+
+     // The slot could hold a VGPR or an SGPR: check the dynamic register
+     // class.
+     const unsigned Reg = MO.getReg();
+     if (RI.isSGPRClass(MRI.getRegClass(Reg)))
+       Candidates[Slot] = Reg;
+   }
+
+   // There is no required SGPR operand, so we have a bit more freedom in
+   // selecting operands to move.
+   //
+   // Try to select the most used SGPR: if an SGPR is equal to one of the
+   // others, we choose that.
+   //
+   // e.g.
+   // V_FMA_F32 v0, s0, s0, s0 -> No moves
+   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
+   //
+   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
+   // prefer those.
+   if (Candidates[0] != AMDGPU::NoRegister &&
+       (Candidates[0] == Candidates[1] || Candidates[0] == Candidates[2]))
+     return Candidates[0];
+
+   if (Candidates[1] != AMDGPU::NoRegister && Candidates[1] == Candidates[2])
+     return Candidates[1];
+
+   return AMDGPU::NoRegister;
+ }
+
+MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
+                                             unsigned OperandName) const {
+  // -1 means this opcode has no operand with the requested name.
+  const int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
+  if (OpIdx != -1)
+    return &MI.getOperand(OpIdx);
+  return nullptr;
+}
+
+uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
+  // Non-HSA targets use the plain data format with no extra bits.
+  if (!ST.isAmdHsaOS())
+    return AMDGPU::RSRC_DATA_FORMAT;
+
+  // HSA additionally sets bit 56.
+  uint64_t Format = AMDGPU::RSRC_DATA_FORMAT | (1ULL << 56);
+  // Set MTYPE = 2
+  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+    Format |= (2ULL << 59);
+  return Format;
+}
+
+uint64_t SIInstrInfo::getScratchRsrcWords23() const {
+  // Element size field, derived from the maximum private element size.
+  const uint64_t EltSize = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+
+  uint64_t Words = getDefaultRsrcDataFormat();
+  Words |= AMDGPU::RSRC_TID_ENABLE;
+  Words |= 0xffffffff; // Size
+  Words |= EltSize << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
+  // IndexStride = 64
+  Words |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
+
+  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
+  // Clear them unless we want a huge stride.
+  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+    Words &= ~AMDGPU::RSRC_DATA_FORMAT;
+
+  return Words;
+}
+
+bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
+  // Only SMRD opcodes are classified as low latency.
+  const bool IsSMRD = isSMRD(MI.getOpcode());
+  return IsSMRD;
+}
+
+bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
+  // MUBUF, MTBUF and MIMG opcodes (the isVMEM set) count as high latency.
+  const unsigned Opcode = MI.getOpcode();
+  return isVMEM(Opcode);
+}
+
+unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
+                                    int &FrameIndex) const {
+  // Only a frame-index vaddr operand makes this a stack access.
+  const MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  const bool HasFrameIndex = VAddr && VAddr->isFI();
+  if (!HasFrameIndex)
+    return AMDGPU::NoRegister;
+  assert(!MI.memoperands_empty() &&
+         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+  FrameIndex = VAddr->getIndex();
+  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
+}
+
+unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
+                                        int &FrameIndex) const {
+  const MachineOperand *AddrOp = getNamedOperand(MI, AMDGPU::OpName::addr);
+  assert(AddrOp && AddrOp->isFI()); // addr must be a frame index here.
+  FrameIndex = AddrOp->getIndex();
+  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
+}
+
+unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex) const {
+  // Loads only: anything that cannot load is never a stack-slot reload.
+  if (MI.mayLoad()) {
+    // Buffer accesses and VGPR spills are decoded through vaddr/vdata.
+    if (isMUBUF(MI) || isVGPRSpill(MI))
+      return isStackAccess(MI, FrameIndex);
+    // SGPR spills are decoded through addr/data.
+    if (isSGPRSpill(MI))
+      return isSGPRStackAccess(MI, FrameIndex);
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                         int &FrameIndex) const {
+  // Stores only: anything that cannot store is never a stack-slot spill.
+  if (MI.mayStore()) {
+    // Buffer accesses and VGPR spills are decoded through vaddr/vdata.
+    if (isMUBUF(MI) || isVGPRSpill(MI))
+      return isStackAccess(MI, FrameIndex);
+    if (isSGPRSpill(MI))
+      return isSGPRStackAccess(MI, FrameIndex);
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
+ const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
+ unsigned DescSize = Desc.getSize();
+
+ // If we have a definitive size, we can use it. Otherwise we need to inspect
+ // the operands to know the size.
+ //
+ // FIXME: Instructions that have a base 32-bit encoding report their size as
+ // 4, even though they are really 8 bytes if they have a literal operand.
+ if (DescSize != 0 && DescSize != 4)
+ return DescSize;
+
+ if (Opc == AMDGPU::WAVE_BARRIER)
+ return 0;
+
+ // 4-byte instructions may have a 32-bit literal encoded after them. Check
+ // operands that could ever be literals.
+ if (isVALU(MI) || isSALU(MI)) {
+ if (isFixedSize(MI)) {
+ assert(DescSize == 4);
+ return DescSize;
+ }
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ if (Src0Idx == -1)
+ return 4; // No operands.
+
+ if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
+ return 8;
+
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx == -1)
+ return 4; // No src1 to check for a literal.
+
+ if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
+ return 8;
+
+ return 4;
+ }
+
+ if (DescSize == 4)
+ return 4;
+
+ switch (Opc) {
+ case AMDGPU::SI_MASK_BRANCH:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::DBG_VALUE:
+ case TargetOpcode::BUNDLE:
+ case TargetOpcode::EH_LABEL:
+ return 0;
+ case TargetOpcode::INLINEASM: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName();
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+ }
+ default:
+ llvm_unreachable("unable to find instruction size");
+ }
+}
+
+bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
+  // Non-FLAT instructions never touch the flat address space.
+  if (!isFLAT(MI))
+    return false;
+
+  // Without memory operands the answer is a conservative "yes"; otherwise
+  // report whether any memory operand targets the flat address space.
+  bool MayBeFlat = MI.memoperands_empty();
+  for (const MachineMemOperand *MMO : MI.memoperands())
+    MayBeFlat = MayBeFlat || MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
+
+  return MayBeFlat;
+}
+
+ArrayRef<std::pair<int, const char *>>
+SIInstrInfo::getSerializableTargetIndices() const {
+ static const std::pair<int, const char *> TargetIndices[] = { // (target index, name) pairs
+ {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
+ return makeArrayRef(TargetIndices);
+}
+
+/// Used by the post-RA scheduler (SchedulePostRAList.cpp); the post-RA version
+/// of misched uses CreateTargetMIHazardRecognizer. \p II is not read here.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const {
+ return new GCNHazardRecognizer(DAG->MF);
+}
+
+/// Hazard recognizer used at -O0 by the PostRAHazardRecognizer pass; it is
+/// a new GCNHazardRecognizer built for \p MF.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
+ return new GCNHazardRecognizer(MF);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
new file mode 100644
index 000000000000..e68f6f92ba96
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -0,0 +1,794 @@
+//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "SIDefines.h"
+#include "SIRegisterInfo.h"
+
+namespace llvm {
+
+class SIInstrInfo final : public AMDGPUInstrInfo {
+private:
+ const SIRegisterInfo RI;
+ const SISubtarget &ST;
+
+ // The inverse predicate should have the negative value.
+ enum BranchPredicate {
+ INVALID_BR = 0,
+ SCC_TRUE = 1,
+ SCC_FALSE = -1,
+ VCCNZ = 2,
+ VCCZ = -2,
+ EXECNZ = -3,
+ EXECZ = 3
+ };
+
+ static unsigned getBranchOpcode(BranchPredicate Cond);
+ static BranchPredicate getBranchPredicate(unsigned Opcode);
+
+ unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
+ MachineRegisterInfo &MRI,
+ MachineOperand &SuperReg,
+ const TargetRegisterClass *SuperRC,
+ unsigned SubIdx,
+ const TargetRegisterClass *SubRC) const;
+ MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI,
+ MachineRegisterInfo &MRI,
+ MachineOperand &SuperReg,
+ const TargetRegisterClass *SuperRC,
+ unsigned SubIdx,
+ const TargetRegisterClass *SubRC) const;
+
+ void swapOperands(MachineInstr &Inst) const;
+
+ void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr &Inst) const;
+
+ void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr &Inst, unsigned Opcode) const;
+
+ void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr &Inst, unsigned Opcode) const;
+
+ void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr &Inst) const;
+ void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr &Inst) const;
+
+ void addUsersToMoveToVALUWorklist(
+ unsigned Reg, MachineRegisterInfo &MRI,
+ SmallVectorImpl<MachineInstr *> &Worklist) const;
+
+ void
+ addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
+ SmallVectorImpl<MachineInstr *> &Worklist) const;
+
+ const TargetRegisterClass *
+ getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
+
+ bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const;
+
+ unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
+
+protected:
+ bool swapSourceModifiers(MachineInstr &MI,
+ MachineOperand &Src0, unsigned Src0OpName,
+ MachineOperand &Src1, unsigned Src1OpName) const;
+
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx0,
+ unsigned OpIdx1) const override;
+
+public:
+
+ enum TargetOperandFlags {
+ MO_NONE = 0,
+ // MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL.
+ MO_GOTPCREL = 1,
+ // MO_GOTPCREL32_LO -> symbol@gotpcrel32@lo -> R_AMDGPU_GOTPCREL32_LO.
+ MO_GOTPCREL32 = 2,
+ MO_GOTPCREL32_LO = 2,
+ // MO_GOTPCREL32_HI -> symbol@gotpcrel32@hi -> R_AMDGPU_GOTPCREL32_HI.
+ MO_GOTPCREL32_HI = 3,
+ // MO_REL32_LO -> symbol@rel32@lo -> R_AMDGPU_REL32_LO.
+ MO_REL32 = 4,
+ MO_REL32_LO = 4,
+ // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI.
+ MO_REL32_HI = 5
+ };
+
+ explicit SIInstrInfo(const SISubtarget &);
+
+ const SIRegisterInfo &getRegisterInfo() const {
+ return RI;
+ }
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const override;
+
+ bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1,
+ int64_t &Offset2) const override;
+
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
+ const TargetRegisterInfo *TRI) const final;
+
+ bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+ unsigned NumLoads) const final;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, MachineInstr &MI,
+ RegScavenger *RS, unsigned TmpReg,
+ unsigned Offset, unsigned Size) const;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ /// \brief Returns an opcode that can be used to move a value to a \p DstRC
+ /// register. If there is no hardware instruction that can store to \p
+ /// DstRC, then AMDGPU::COPY is returned.
+ unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
+
+ LLVM_READONLY
+ int commuteOpcode(unsigned Opc) const;
+
+ LLVM_READONLY
+ inline int commuteOpcode(const MachineInstr &MI) const {
+ return commuteOpcode(MI.getOpcode());
+ }
+
+ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
+
+ bool isBranchOffsetInRange(unsigned BranchOpc,
+ int64_t BrOffset) const override;
+
+ MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+ unsigned insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &NewDestBB,
+ const DebugLoc &DL,
+ int64_t BrOffset,
+ RegScavenger *RS = nullptr) const override;
+
+ bool analyzeBranchImpl(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+ bool reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ bool
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ AliasAnalysis *AA = nullptr) const override;
+
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ MachineRegisterInfo *MRI) const final;
+
+ unsigned getMachineCSELookAheadLimit() const override { return 500; }
+
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
+ MachineInstr &MI,
+ LiveVariables *LV) const override;
+
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
+ static bool isSALU(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SALU;
+ }
+
+ bool isSALU(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SALU;
+ }
+
+ static bool isVALU(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VALU;
+ }
+
+ bool isVALU(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VALU;
+ }
+
+ static bool isVMEM(const MachineInstr &MI) {
+ return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI);
+ }
+
+ bool isVMEM(uint16_t Opcode) const {
+ return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode);
+ }
+
+ static bool isSOP1(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
+ }
+
+ bool isSOP1(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOP1;
+ }
+
+ static bool isSOP2(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOP2;
+ }
+
+ bool isSOP2(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOP2;
+ }
+
+ static bool isSOPC(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPC;
+ }
+
+ bool isSOPC(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPC;
+ }
+
+ static bool isSOPK(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPK;
+ }
+
+ bool isSOPK(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPK;
+ }
+
+ static bool isSOPP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPP;
+ }
+
+ bool isSOPP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPP;
+ }
+
+ static bool isVOP1(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP1;
+ }
+
+ bool isVOP1(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP1;
+ }
+
+ static bool isVOP2(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP2;
+ }
+
+ bool isVOP2(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP2;
+ }
+
+ static bool isVOP3(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP3;
+ }
+
+ bool isVOP3(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP3;
+ }
+
+ static bool isVOPC(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOPC;
+ }
+
+ bool isVOPC(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOPC;
+ }
+
+ static bool isMUBUF(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::MUBUF;
+ }
+
+ bool isMUBUF(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
+ }
+
+ static bool isMTBUF(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::MTBUF;
+ }
+
+ bool isMTBUF(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
+ }
+
+ static bool isSMRD(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
+ }
+
+ bool isSMRD(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SMRD;
+ }
+
+ static bool isDS(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DS;
+ }
+
+ bool isDS(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DS;
+ }
+
+ static bool isMIMG(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::MIMG;
+ }
+
+ bool isMIMG(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+ }
+
+ static bool isGather4(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::Gather4;
+ }
+
+ bool isGather4(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::Gather4;
+ }
+
+ static bool isFLAT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::FLAT;
+ }
+
+ bool isFLAT(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FLAT;
+ }
+
+ static bool isEXP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::EXP;
+ }
+
+ bool isEXP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::EXP;
+ }
+
+ static bool isWQM(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::WQM;
+ }
+
+ bool isWQM(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::WQM;
+ }
+
+ static bool isDisableWQM(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
+ }
+
+ bool isDisableWQM(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
+ }
+
+ static bool isVGPRSpill(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
+ }
+
+ bool isVGPRSpill(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
+ }
+
+ static bool isSGPRSpill(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SGPRSpill;
+ }
+
+ bool isSGPRSpill(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill;
+ }
+
+ static bool isDPP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DPP;
+ }
+
+ bool isDPP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DPP;
+ }
+
+ static bool isScalarUnit(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
+ }
+
+ static bool usesVM_CNT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT;
+ }
+
+ static bool sopkIsZext(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT;
+ }
+
+ bool sopkIsZext(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPK_ZEXT;
+ }
+
+ /// \returns true if this is an s_store_dword* instruction. This is more
+ /// specific than isSMEM && mayStore.
+ static bool isScalarStore(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SCALAR_STORE;
+ }
+
+ bool isScalarStore(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SCALAR_STORE;
+ }
+
+ static bool isFixedSize(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE;
+ }
+
+ bool isFixedSize(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
+ }
+
+  bool isVGPRCopy(const MachineInstr &MI) const {
+    assert(MI.isCopy());
+    // A copy counts as a VGPR copy when its destination is not an SGPR.
+    const MachineRegisterInfo &RegInfo =
+        MI.getParent()->getParent()->getRegInfo();
+    return !RI.isSGPRReg(RegInfo, MI.getOperand(0).getReg());
+  }
+
+  static int operandBitWidth(uint8_t OperandType) {
+    switch (OperandType) { // Cases are grouped by width in bits.
+    case AMDGPU::OPERAND_REG_IMM_INT16:
+    case AMDGPU::OPERAND_REG_IMM_FP16:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+      return 16;
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+      return 32;
+    case AMDGPU::OPERAND_REG_IMM_INT64:
+    case AMDGPU::OPERAND_REG_IMM_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+      return 64;
+    default: // Every other operand type is rejected.
+      llvm_unreachable("unexpected operand type");
+    }
+  }
+
+ bool isInlineConstant(const APInt &Imm) const;
+
+ bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
+
+ bool isInlineConstant(const MachineOperand &MO,
+ const MCOperandInfo &OpInfo) const {
+ return isInlineConstant(MO, OpInfo.OperandType);
+ }
+
+ /// \returns true if substituting \p DefMO for \p UseMO in \p MI would make
+ /// it an inline immediate.
+ bool isInlineConstant(const MachineInstr &MI,
+ const MachineOperand &UseMO,
+ const MachineOperand &DefMO) const {
+ assert(UseMO.getParent() == &MI);
+ int OpIdx = MI.getOperandNo(&UseMO);
+ if (!MI.getDesc().OpInfo || OpIdx >= MI.getDesc().NumOperands) {
+ return false;
+ }
+
+ return isInlineConstant(DefMO, MI.getDesc().OpInfo[OpIdx]);
+ }
+
+ /// \returns true if the operand \p OpIdx in \p MI is a valid inline
+ /// immediate.
+ bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx) const {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
+ }
+
+  bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx,
+                        const MachineOperand &MO) const {
+    const MCInstrDesc &InstDesc = MI.getDesc();
+    // Operands without a descriptor entry are never inline constants.
+    if (!InstDesc.OpInfo || OpIdx >= InstDesc.NumOperands)
+      return false;
+    // Non-copies take the operand type straight from the descriptor.
+    if (!MI.isCopy())
+      return isInlineConstant(MO, InstDesc.OpInfo[OpIdx].OperandType);
+    // Copies derive an integer operand type from the 4- or 8-byte size.
+    const unsigned Bytes = getOpSize(MI, OpIdx);
+    assert(Bytes == 8 || Bytes == 4);
+    const uint8_t OpType = (Bytes == 8) ? AMDGPU::OPERAND_REG_IMM_INT64
+                                        : AMDGPU::OPERAND_REG_IMM_INT32;
+    return isInlineConstant(MO, OpType);
+  }
+
+ bool isInlineConstant(const MachineOperand &MO) const {
+ const MachineInstr *Parent = MO.getParent();
+ return isInlineConstant(*Parent, Parent->getOperandNo(&MO));
+ }
+
+ bool isLiteralConstant(const MachineOperand &MO,
+ const MCOperandInfo &OpInfo) const {
+ return MO.isImm() && !isInlineConstant(MO, OpInfo.OperandType);
+ }
+
+ bool isLiteralConstant(const MachineInstr &MI, int OpIdx) const {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ return MO.isImm() && !isInlineConstant(MI, OpIdx);
+ }
+
+ // Returns true if this operand could potentially require a 32-bit literal
+ // operand, but not necessarily. A FrameIndex for example could resolve to an
+ // inline immediate value that will not require an additional 4-bytes; this
+ // assumes that it will.
+ bool isLiteralConstantLike(const MachineOperand &MO,
+ const MCOperandInfo &OpInfo) const;
+
+ bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+ const MachineOperand &MO) const;
+
+ /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
+ /// This function will return false if you pass it a 32-bit instruction.
+ bool hasVALU32BitEncoding(unsigned Opcode) const;
+
+ /// \brief Returns true if this operand uses the constant bus.
+ bool usesConstantBus(const MachineRegisterInfo &MRI,
+ const MachineOperand &MO,
+ const MCOperandInfo &OpInfo) const;
+
+ /// \brief Return true if this instruction has any modifiers.
+ /// e.g. src[012]_mod, omod, clamp.
+ bool hasModifiers(unsigned Opcode) const;
+
+ bool hasModifiersSet(const MachineInstr &MI,
+ unsigned OpName) const;
+
+ bool verifyInstruction(const MachineInstr &MI,
+ StringRef &ErrInfo) const override;
+
+ static unsigned getVALUOp(const MachineInstr &MI);
+
+ bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
+
+ /// \brief Return the correct register class for \p OpNo. For target-specific
+ /// instructions, this will return the register class that has been defined
+ /// in tablegen. For generic instructions, like REG_SEQUENCE it will return
+ /// the register class of its machine operand.
+ /// to infer the correct register class base on the other operands.
+ const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
+ unsigned OpNo) const;
+
+ /// \brief Return the size in bytes of the operand OpNo on the given
+ /// instruction opcode.
+  unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
+    const MCOperandInfo &Info = get(Opcode).OpInfo[OpNo];
+    const bool HasRegClass = Info.RegClass != -1;
+    // Register operands take the size of their declared register class.
+    if (HasRegClass)
+      return RI.getRegClass(Info.RegClass)->getSize();
+
+    // No register class: this must be an immediate, i.e. a 32-bit literal.
+    assert(Info.OperandType == MCOI::OPERAND_IMMEDIATE);
+    return 4;
+  }
+
+ /// \brief This form should usually be preferred since it handles operands
+ /// with unknown register classes.
+ unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
+ return getOpRegClass(MI, OpNo)->getSize();
+ }
+
+ /// \returns true if it is legal for the operand at index \p OpNo
+ /// to read a VGPR.
+ bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
+
+ /// \brief Legalize the \p OpIndex operand of this instruction by inserting
+ /// a MOV. For example:
+ /// ADD_I32_e32 VGPR0, 15
+ /// to
+ /// MOV VGPR1, 15
+ /// ADD_I32_e32 VGPR0, VGPR1
+ ///
+ /// If the operand being legalized is a register, then a COPY will be used
+ /// instead of MOV.
+ void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const;
+
+ /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
+ /// for \p MI.
+ bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
+ const MachineOperand *MO = nullptr) const;
+
+ /// \brief Check if \p MO would be a valid operand for the given operand
+ /// definition \p OpInfo. Note this does not attempt to validate constant bus
+ /// restrictions (e.g. literal constant usage).
+ bool isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const;
+
+ /// \brief Check if \p MO (a register operand) is a legal register for the
+ /// given operand description.
+ bool isLegalRegOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const;
+
+ /// \brief Legalize operands in \p MI by either commuting it or inserting a
+ /// copy of src1.
+ void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
+
+ /// \brief Fix operands in \p MI to satisfy constant bus requirements.
+ void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
+
+ /// Copy a value from a VGPR (\p SrcReg) to SGPR. This function can only
+ /// be used when it is known that the value in SrcReg is the same across all
+ /// threads in the wave.
+ /// \returns The SGPR register that \p SrcReg was copied to.
+ unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+ MachineRegisterInfo &MRI) const;
+
+ void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const;
+
+ void legalizeGenericOperand(MachineBasicBlock &InsertMBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *DstRC,
+ MachineOperand &Op, MachineRegisterInfo &MRI,
+ const DebugLoc &DL) const;
+
+ /// \brief Legalize all operands in this instruction. This function may
+ /// create new instruction and insert them before \p MI.
+ void legalizeOperands(MachineInstr &MI) const;
+
+ /// \brief Replace this instruction's opcode with the equivalent VALU
+ /// opcode. This function will also move the users of \p MI to the
+ /// VALU if necessary.
+ void moveToVALU(MachineInstr &MI) const;
+
+ void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI,
+ int Count) const;
+
+ void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+ /// \brief Return the number of wait states that result from executing this
+ /// instruction.
+ unsigned getNumWaitStates(const MachineInstr &MI) const;
+
+ /// \brief Returns the operand named \p OperandName. If \p MI does not have an
+ /// operand named \p OperandName, this function returns nullptr.
+ LLVM_READONLY
+ MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
+
+ LLVM_READONLY
+ const MachineOperand *getNamedOperand(const MachineInstr &MI,
+ unsigned OpName) const {
+ return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
+ }
+
+ /// Get required immediate operand
+ int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const {
+ int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
+ return MI.getOperand(Idx).getImm();
+ }
+
+ uint64_t getDefaultRsrcDataFormat() const;
+ uint64_t getScratchRsrcWords23() const;
+
+ bool isLowLatencyInstruction(const MachineInstr &MI) const;
+ bool isHighLatencyInstruction(const MachineInstr &MI) const;
+
+ /// \brief Return the descriptor of the target-specific machine instruction
+ /// that corresponds to the specified pseudo or native opcode.
+ const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
+ return get(pseudoToMCOpcode(Opcode));
+ }
+
+ unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+ unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ bool mayAccessFlatAddressSpace(const MachineInstr &MI) const;
+
+ ArrayRef<std::pair<int, const char *>>
+ getSerializableTargetIndices() const override;
+
+ ScheduleHazardRecognizer *
+ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const override;
+
+ ScheduleHazardRecognizer *
+ CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+};
+
+namespace AMDGPU {
+ LLVM_READONLY
+ int getVOPe64(uint16_t Opcode);
+
+ LLVM_READONLY
+ int getVOPe32(uint16_t Opcode);
+
+ LLVM_READONLY
+ int getCommuteRev(uint16_t Opcode);
+
+ LLVM_READONLY
+ int getCommuteOrig(uint16_t Opcode);
+
+ LLVM_READONLY
+ int getAddr64Inst(uint16_t Opcode);
+
+ LLVM_READONLY
+ int getAtomicRetOp(uint16_t Opcode);
+
+ LLVM_READONLY
+ int getAtomicNoRetOp(uint16_t Opcode);
+
+ LLVM_READONLY
+ int getSOPKOp(uint16_t Opcode);
+
+ const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
+ const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
+ const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
+ const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
+
+ // For MachineOperands.
+ enum TargetFlags {
+ TF_LONG_BRANCH_FORWARD = 1 << 0,
+ TF_LONG_BRANCH_BACKWARD = 1 << 1
+ };
+} // End namespace AMDGPU
+
+namespace SI {
+namespace KernelInputOffsets {
+
+/// Offsets in bytes from the start of the input buffer
+enum Offsets {
+ NGROUPS_X = 0,
+ NGROUPS_Y = 4,
+ NGROUPS_Z = 8,
+ GLOBAL_SIZE_X = 12,
+ GLOBAL_SIZE_Y = 16,
+ GLOBAL_SIZE_Z = 20,
+ LOCAL_SIZE_X = 24,
+ LOCAL_SIZE_Y = 28,
+ LOCAL_SIZE_Z = 32
+};
+
+} // End namespace KernelInputOffsets
+} // End namespace SI
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
new file mode 100644
index 000000000000..34096e158039
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -0,0 +1,1254 @@
+//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+def isCI : Predicate<"Subtarget->getGeneration() "
+ ">= SISubtarget::SEA_ISLANDS">;
+def isCIOnly : Predicate<"Subtarget->getGeneration() =="
+ "SISubtarget::SEA_ISLANDS">,
+ AssemblerPredicate <"FeatureSeaIslands">;
+
+def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
+
+// Except for the NONE field, this must be kept in sync with the
+// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
+def SIEncodingFamily {
+ int NONE = -1;
+ int SI = 0;
+ int VI = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// SI DAG Nodes
+//===----------------------------------------------------------------------===//
+
+def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
+ [SDNPMayLoad, SDNPMemOperand]
+>;
+
+def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
+ SDTypeProfile<0, 13,
+ [SDTCisVT<0, v4i32>, // rsrc(SGPR)
+ SDTCisVT<1, iAny>, // vdata(VGPR)
+ SDTCisVT<2, i32>, // num_channels(imm)
+ SDTCisVT<3, i32>, // vaddr(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // inst_offset(imm)
+ SDTCisVT<6, i32>, // dfmt(imm)
+ SDTCisVT<7, i32>, // nfmt(imm)
+ SDTCisVT<8, i32>, // offen(imm)
+ SDTCisVT<9, i32>, // idxen(imm)
+ SDTCisVT<10, i32>, // glc(imm)
+ SDTCisVT<11, i32>, // slc(imm)
+ SDTCisVT<12, i32> // tfe(imm)
+ ]>,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SDTBufferLoad : SDTypeProfile<1, 5,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex
+ SDTCisVT<3, i32>, // offset
+ SDTCisVT<4, i1>, // glc
+ SDTCisVT<5, i1>]>; // slc
+
+def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+
+def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
+ SDTCisVT<3, i32>]>
+>;
+
+class SDSample<string opcode> : SDNode <opcode,
+ SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
+ SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
+>;
+
+def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
+def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
+def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
+def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
+
+def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
+ SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
+>;
+
+//===----------------------------------------------------------------------===//
+// PatFrags for global memory operations
+//===----------------------------------------------------------------------===//
+
+defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
+defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
+
+//===----------------------------------------------------------------------===//
+// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
+// to be glued to the memory instructions.
+//===----------------------------------------------------------------------===//
+
+def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
+def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
+ cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def si_load_local_align8 : Aligned8Bytes <
+ (ops node:$ptr), (si_load_local node:$ptr)
+>;
+
+def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
+ return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+def si_az_extload_local : AZExtLoadBase <si_ld_local>;
+
+multiclass SIExtLoadLocal <PatFrag ld_node> {
+
+ def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
+ [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;}]
+ >;
+
+ def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
+ [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;}]
+ >;
+}
+
+defm si_sextload_local : SIExtLoadLocal <si_sextload_local>;
+defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>;
+
+def SIst_local : SDNode <"ISD::STORE", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
+>;
+
+def si_st_local : PatFrag <
+ (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def si_store_local : PatFrag <
+ (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
+ !cast<StoreSDNode>(N)->isTruncatingStore();
+}]>;
+
+def si_store_local_align8 : Aligned8Bytes <
+ (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr)
+>;
+
+def si_truncstore_local : PatFrag <
+ (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->isTruncatingStore();
+}]>;
+
+def si_truncstore_local_i8 : PatFrag <
+ (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def si_truncstore_local_i16 : PatFrag <
+ (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def si_setcc_uniform : PatFrag < // setcc that matches only if every use is a CopyToReg into SCC
+ (ops node:$lhs, node:$rhs, node:$cond),
+ (setcc node:$lhs, node:$rhs, node:$cond), [{
+ for (SDNode *Use : N->uses()) {
+ if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg) // reject already-selected users and anything that is not a CopyToReg
+ return false;
+
+ unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg(); // operand 1 of the CopyToReg is read as the destination register
+ if (Reg != AMDGPU::SCC)
+ return false;
+ }
+ return true; // also reached when the node has no uses at all
+}]>;
+
+def si_uniform_br : PatFrag <
+ (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{
+ return isUniformBr(N);
+}]>;
+
+def si_uniform_br_scc : PatFrag <
+ (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{
+ return isCBranchSCC(N);
+}]>;
+
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> { // is_amdgpu = 1 builds the opcode name in AMDGPUISD:: instead of ISD::
+
+ def _glue : SDNode < // <NAME>_glue: the ATOMIC_<op_name> node declared with SDNPInGlue
+ !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+ >;
+
+ def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; // <NAME>_local: local_binary_atomic_op wrapped around the _glue node above
+}
+
+defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
+defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm si_atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
+defm si_atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
+defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
+defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
+defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
+defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
+defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
+defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
+defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
+defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+
+def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
+defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>;
+
+def as_i1imm : SDNodeXForm<imm, [{ // imm -> i1 target constant, built from the zero-extended value
+ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
+}]>;
+
+def as_i8imm : SDNodeXForm<imm, [{ // imm -> i8 target constant, built from the zero-extended value
+ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8);
+}]>;
+
+def as_i16imm : SDNodeXForm<imm, [{ // imm -> i16 target constant; unlike i1/i8 this uses the sign-extended value
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16);
+}]>;
+
+def as_i32imm: SDNodeXForm<imm, [{ // imm -> i32 target constant, built from the sign-extended value
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def as_i64imm: SDNodeXForm<imm, [{ // imm -> i64 target constant, built from the sign-extended value
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
+ auto FI = cast<FrameIndexSDNode>(N);
+ return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
+
+def SIMM16bit : PatLeaf <(imm),
+ [{return isInt<16>(N->getSExtValue());}]
+>;
+
+def IMM20bit : PatLeaf <(imm),
+ [{return isUInt<20>(N->getZExtValue());}]
+>;
+
+class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
+ return isInlineImmediate(N);
+}]>;
+
+class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
+ return isInlineImmediate(N);
+}]>;
+
+class VGPRImm <dag frag> : PatLeaf<frag, [{ // Matches frag only when it has fewer than 10 uses and none of them needs an SGPR or an unknown register class
+ if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
+ return false;
+ }
+ const SIRegisterInfo *SIRI =
+ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ unsigned Limit = 0; // number of uses inspected so far
+ for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+ Limit < 10 && U != E; ++U, ++Limit) { // the scan is capped at 10 uses
+ const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+
+ // If the register class is unknown, it could be an unknown
+ // register class that needs to be an SGPR, e.g. an inline asm
+ // constraint
+ if (!RC || SIRI->isSGPRClass(RC))
+ return false;
+ }
+
+ return Limit < 10; // false when the cap was hit, i.e. the node has 10 or more uses
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Custom Operands
+//===----------------------------------------------------------------------===//
+
+def SoppBrTarget : AsmOperandClass {
+ let Name = "SoppBrTarget";
+ let ParserMethod = "parseSOppBrTarget";
+}
+
+def sopp_brtarget : Operand<OtherVT> {
+ let EncoderMethod = "getSOPPBrEncoding";
+ let DecoderMethod = "decodeSoppBrTarget";
+ let OperandType = "OPERAND_PCREL";
+ let ParserMatchClass = SoppBrTarget;
+}
+
+def si_ga : Operand<iPTR>;
+
+def InterpSlotMatchClass : AsmOperandClass {
+ let Name = "InterpSlot";
+ let PredicateMethod = "isInterpSlot";
+ let ParserMethod = "parseInterpSlot";
+ let RenderMethod = "addImmOperands";
+}
+
+def InterpSlot : Operand<i32> {
+ let PrintMethod = "printInterpSlot";
+ let ParserMatchClass = InterpSlotMatchClass;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AttrMatchClass : AsmOperandClass {
+ let Name = "Attr";
+ let PredicateMethod = "isInterpAttr";
+ let ParserMethod = "parseInterpAttr";
+ let RenderMethod = "addImmOperands";
+}
+
+// It appears to be necessary to create a separate operand for this to
+// be able to parse attr<num> with no space.
+def Attr : Operand<i32> {
+ let PrintMethod = "printInterpAttr";
+ let ParserMatchClass = AttrMatchClass;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AttrChanMatchClass : AsmOperandClass {
+ let Name = "AttrChan";
+ let PredicateMethod = "isAttrChan";
+ let RenderMethod = "addImmOperands";
+}
+
+def AttrChan : Operand<i32> {
+ let PrintMethod = "printInterpAttrChan";
+ let ParserMatchClass = AttrChanMatchClass;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def SendMsgMatchClass : AsmOperandClass {
+ let Name = "SendMsg";
+ let PredicateMethod = "isSendMsg";
+ let ParserMethod = "parseSendMsgOp";
+ let RenderMethod = "addImmOperands";
+}
+
+def ExpTgtMatchClass : AsmOperandClass {
+ let Name = "ExpTgt";
+ let PredicateMethod = "isExpTgt";
+ let ParserMethod = "parseExpTgt";
+ let RenderMethod = "printExpTgt";
+}
+
+def SendMsgImm : Operand<i32> {
+ let PrintMethod = "printSendMsg";
+ let ParserMatchClass = SendMsgMatchClass;
+}
+
+def SWaitMatchClass : AsmOperandClass {
+ let Name = "SWaitCnt";
+ let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseSWaitCntOps";
+}
+
+def VReg32OrOffClass : AsmOperandClass {
+ let Name = "VReg32OrOff";
+ let ParserMethod = "parseVReg32OrOff";
+}
+
+def WAIT_FLAG : Operand <i32> {
+ let ParserMatchClass = SWaitMatchClass;
+ let PrintMethod = "printWaitFlag";
+}
+
+include "SIInstrFormats.td"
+include "VIInstrFormats.td"
+
+// ===----------------------------------------------------------------------===//
+// ExpSrc* Special cases for exp src operands which are printed as
+// "off" depending on en operand.
+// ===----------------------------------------------------------------------===//
+
+def ExpSrc0 : RegisterOperand<VGPR_32> {
+ let PrintMethod = "printExpSrc0";
+ let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc1 : RegisterOperand<VGPR_32> {
+ let PrintMethod = "printExpSrc1";
+ let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc2 : RegisterOperand<VGPR_32> {
+ let PrintMethod = "printExpSrc2";
+ let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc3 : RegisterOperand<VGPR_32> {
+ let PrintMethod = "printExpSrc3";
+ let ParserMatchClass = VReg32OrOffClass;
+}
+
+class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { // Asm operand class named "Imm"#CName; operands are optional unless Optional = 0 is passed
+ let Name = "Imm"#CName;
+ let PredicateMethod = "is"#CName;
+ let ParserMethod = !if(Optional, "parseOptionalOperand", "parse"#CName); // optional operands share one parser; required ones use parse<CName>
+ let RenderMethod = "addImmOperands";
+ let IsOptional = Optional;
+ let DefaultMethod = !if(Optional, "default"#CName, ?); // left unset (?) for required operands
+}
+
+class NamedOperandBit<string Name, AsmOperandClass MatchClass> : Operand<i1> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
+class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
+class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
+class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
+let OperandType = "OPERAND_IMMEDIATE" in {
+
+def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
+def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
+def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
+
+def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
+def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
+def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
+
+def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>;
+
+def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>;
+def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
+
+def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
+def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
+def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
+def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
+def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
+def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
+def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
+def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
+
+def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
+
+def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
+def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
+def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>;
+def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>;
+
+def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>;
+def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
+def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
+def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>;
+
+def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+
+def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
+
+}
+
+} // End OperandType = "OPERAND_IMMEDIATE"
+
+class KImmMatchClass<int size> : AsmOperandClass {
+ let Name = "KImmFP"#size;
+ let PredicateMethod = "isKImmFP"#size;
+ let ParserMethod = "parseImm";
+ let RenderMethod = "addKImmFP"#size#"Operands";
+}
+
+class kimmOperand<ValueType vt> : Operand<vt> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_KIMM"#vt.Size;
+ let PrintMethod = "printU"#vt.Size#"ImmOperand";
+ let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass");
+}
+
+// 32-bit VALU immediate operand that uses the constant bus.
+def KImmFP32MatchClass : KImmMatchClass<32>;
+def f32kimm : kimmOperand<i32>;
+
+// 32-bit VALU immediate operand with a 16-bit value that uses the
+// constant bus.
+def KImmFP16MatchClass : KImmMatchClass<16>;
+def f16kimm : kimmOperand<i16>;
+
+
+def VOPDstS64 : VOPDstOperand <SReg_64>;
+
+class FPInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "RegOrImmWithFP"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImmWithFPInputMods";
+ let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
+}
+def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
+def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
+def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
+
+class InputMods <AsmOperandClass matchClass> : Operand <i32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_INPUT_MODS";
+ let ParserMatchClass = matchClass;
+}
+
+class FPInputMods <FPInputModsMatchClass matchClass> : InputMods <matchClass> {
+ let PrintMethod = "printOperandAndFPInputMods";
+}
+
+def FP16InputMods : FPInputMods<FP16InputModsMatchClass>;
+def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
+def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
+
+class IntInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "RegOrImmWithInt"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImmWithIntInputMods";
+ let PredicateMethod = "isRegOrImmWithInt"#opSize#"InputMods";
+}
+def Int32InputModsMatchClass : IntInputModsMatchClass<32>;
+def Int64InputModsMatchClass : IntInputModsMatchClass<64>;
+
+class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> {
+ let PrintMethod = "printOperandAndIntInputMods";
+}
+def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
+def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
+
+//===----------------------------------------------------------------------===//
+// Complex patterns
+//===----------------------------------------------------------------------===//
+
+def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
+def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
+
+def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
+
+def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
+def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
+def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
+def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
+def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
+def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;
+
+//===----------------------------------------------------------------------===//
+// SI assembler operands
+//===----------------------------------------------------------------------===//
+
+def SIOperand {
+ int ZERO = 0x80;
+ int VCC = 0x6A;
+ int FLAT_SCR = 0x68;
+}
+
+def SRCMODS {
+ int NONE = 0;
+ int NEG = 1;
+}
+
+def DSTCLAMP {
+ int NONE = 0;
+}
+
+def DSTOMOD {
+ int NONE = 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// SI Instruction multiclass helpers.
+//
+// Instructions with _32 take 32-bit operands.
+// Instructions with _64 take 64-bit operands.
+//
+// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit
+// encoding is the standard encoding, but instruction that make use of
+// any of the instruction modifiers must use the 64-bit encoding.
+//
+// Instructions with _e32 use the 32-bit encoding.
+// Instructions with _e64 use the 64-bit encoding.
+//
+//===----------------------------------------------------------------------===//
+
+class SIMCInstr <string pseudo, int subtarget> {
+ string PseudoInstr = pseudo;
+ int Subtarget = subtarget;
+}
+
+//===----------------------------------------------------------------------===//
+// EXP classes
+//===----------------------------------------------------------------------===//
+
+class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
+ (outs),
+ (ins exp_tgt:$tgt,
+ ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
+ exp_vm:$vm, exp_compr:$compr, i8imm:$en),
+ "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm",
+ [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr),
+ f32:$src0, f32:$src1, f32:$src2, f32:$src3)]> {
+ let AsmMatchConverter = "cvtExp";
+}
+
+// Split EXP instruction into EXP and EXP_DONE so we can set
+// mayLoad for done=1.
+multiclass EXP_m<bit done, SDPatternOperator node> {
+ let mayLoad = done in {
+ let isPseudo = 1, isCodeGenOnly = 1 in {
+ def "" : EXP_Helper<done, node>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
+ }
+
+ let done = done in {
+ def _si : EXP_Helper<done>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
+ EXPe {
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+ }
+
+ def _vi : EXP_Helper<done>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
+ EXPe_vi {
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+ }
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Vector ALU classes
+//===----------------------------------------------------------------------===//
+
+class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { // Counts source operands: the index of the first untyped source, or 3 if none is untyped
+ int ret =
+ !if (!eq(Src0.Value, untyped.Value), 0, // no source operands
+ !if (!eq(Src1.Value, untyped.Value), 1, // VOP1
+ !if (!eq(Src2.Value, untyped.Value), 2, // VOP2
+ 3))); // VOP3
+}
+
+// Returns the destination register operand (a VOPDstOperand) to use for
+// VOP[123C] instructions for the given VT, selected by VT.Size.
+class getVALUDstForVT<ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
+ !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
+ !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
+ !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, // 16-bit VTs also use VGPR_32
+ VOPDstOperand<SReg_64>)))); // else VT == i1
+}
+
+// Returns the register class to use for source 0 of VOP[12C]
+// instructions for the given VT.
+class getVOPSrc0ForVT<ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ RegisterOperand ret = !if(isFP,
+ !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)),
+ !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32)));
+}
+
+// Returns the vreg register class to use for source operand given VT
+class getVregSrcForVT<ValueType VT> {
+ RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
+ !if(!eq(VT.Size, 64), VReg_64, VGPR_32));
+}
+
+
+// Returns the register class to use for sources of VOP3 instructions for the
+// given VT.
+class getVOP3SrcForVT<ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ RegisterOperand ret =
+ !if(!eq(VT.Size, 128),
+ VSrc_128,
+ !if(!eq(VT.Size, 64),
+ !if(isFP,
+ VCSrc_f64,
+ VCSrc_b64),
+ !if(!eq(VT.Value, i1.Value),
+ SCSrc_b64,
+ !if(isFP,
+ !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32),
+ !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32)
+ )
+ )
+ )
+ );
+}
+
+// Returns 1 if SrcVT is one of the floating-point types f16, f32 or f64,
+// and 0 for every other value type.
+class isFloatType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, f16.Value), 1,
+ !if(!eq(SrcVT.Value, f32.Value), 1,
+ !if(!eq(SrcVT.Value, f64.Value), 1,
+ 0)));
+}
+
+class isIntType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, i16.Value), 1,
+ !if(!eq(SrcVT.Value, i32.Value), 1,
+ !if(!eq(SrcVT.Value, i64.Value), 1,
+ 0)));
+}
+
+
+// Return type of input modifiers operand for specified input operand
+class getSrcMod <ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ Operand ret = !if(!eq(VT.Size, 64),
+ !if(isFP, FP64InputMods, Int64InputMods),
+ !if(isFP,
+ !if(!eq(VT.Value, f16.Value),
+ FP16InputMods,
+ FP32InputMods
+ ),
+ Int32InputMods)
+ );
+}
+
+// Returns the input arguments for VOP[12C] instructions given the source register classes and the number of source operands.
+class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
+ dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1
+ !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2
+ (ins))); // any other count yields an empty operand list
+}
+
+// Returns the input arguments for VOP3 instructions given the source operand classes, the operand count and whether modifiers are present.
+class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs,
+ bit HasModifiers, Operand Src0Mod, Operand Src1Mod,
+ Operand Src2Mod> {
+
+ dag ret =
+ !if (!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP, V_CLREXCP)
+ (ins),
+ /* else */
+ !if (!eq(NumSrcArgs, 1),
+ !if (!eq(HasModifiers, 1),
+ // VOP1 with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod:$clamp, omod:$omod)
+ /* else */,
+ // VOP1 without modifiers
+ (ins Src0RC:$src0)
+ /* endif */ ),
+ !if (!eq(NumSrcArgs, 2),
+ !if (!eq(HasModifiers, 1),
+ // VOP2 with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, omod:$omod)
+ /* else */,
+ // VOP2 without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1)
+ /* endif */ )
+ /* NumSrcArgs == 3 (or any other count) */,
+ !if (!eq(HasModifiers, 1),
+ // VOP3 with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp, omod:$omod)
+ /* else */,
+ // VOP3 without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
+ /* endif */ ))));
+}
+
+class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
+ bit HasModifiers, Operand Src0Mod, Operand Src1Mod> {
+
+ dag ret = !if (!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP)
+ (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl),
+ !if (!eq(NumSrcArgs, 1),
+ !if (!eq(HasModifiers, 1),
+ // VOP1_DPP with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ /* else */,
+ // VOP1_DPP without modifiers
+ (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ /* endif */)
+ /* NumSrcArgs == 2 */,
+ !if (!eq(HasModifiers, 1),
+ // VOP2_DPP with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ /* else */,
+ // VOP2_DPP without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl,
+ row_mask:$row_mask, bank_mask:$bank_mask,
+ bound_ctrl:$bound_ctrl)
+ /* endif */)));
+}
+
+class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, // Builds the SDWA input operand list from the operand count and DstVT
+ bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod, // NOTE: HasFloatModifiers is not referenced in the body below
+ ValueType DstVT> {
+
+ dag ret = !if(!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP)
+ (ins),
+ !if(!eq(NumSrcArgs, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel),
+ !if(!eq(NumSrcArgs, 2),
+ !if(!eq(DstVT.Size, 1),
+ // VOPC_SDWA with modifiers (1-bit dst: no dst_sel / dst_unused operands)
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA with modifiers (the VOPC case is handled by the branch above)
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel)),
+ (ins)/* any other NumSrcArgs: empty operand list */)));
+}
+
+// Outs for DPP and SDWA
+class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> {
+ dag ret = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions
+ (outs DstRCDPP:$vdst)),
+ (outs)); // V_NOP
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP[12C]
+// instruction. This does not add the _e32 suffix, so it can be reused
+// by getAsm64.
+class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
+ string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
+ string src0 = ", $src0";
+ string src1 = ", $src1";
+ string src2 = ", $src2";
+ string ret = !if(HasDst, dst, "") #
+ !if(!eq(NumSrcArgs, 1), src0, "") #
+ !if(!eq(NumSrcArgs, 2), src0#src1, "") #
+ !if(!eq(NumSrcArgs, 3), src0#src1#src2, "");
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP3
+// instruction.
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string src1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
+ string ret =
+ !if(!eq(HasModifiers, 0),
+ getAsm32<HasDst, NumSrcArgs, DstVT>.ret,
+ dst#", "#src0#src1#src2#"$clamp"#"$omod");
+}
+
+class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ "$sdst",
+ "$vdst"),
+ ""); // use $sdst for VOPC
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string src1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string args = !if(!eq(HasModifiers, 0),
+ getAsm32<0, NumSrcArgs, DstVT>.ret,
+ ", "#src0#src1);
+ string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+}
+
+class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers, // Builds the SDWA asm operand string; HasFloatModifiers is not referenced in the body
+ ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ " vcc", // use vcc token as dst for VOPC instructions
+ "$vdst"),
+ "");
+ string src0 = "$src0_modifiers";
+ string src1 = "$src1_modifiers";
+ string args = !if(!eq(NumSrcArgs, 0),
+ "",
+ !if(!eq(NumSrcArgs, 1),
+ ", "#src0#"$clamp",
+ ", "#src0#", "#src1#"$clamp" // used for every NumSrcArgs other than 0 or 1
+ )
+ );
+ string sdwa = !if(!eq(NumSrcArgs, 0),
+ "",
+ !if(!eq(NumSrcArgs, 1),
+ " $dst_sel $dst_unused $src0_sel",
+ !if(!eq(DstVT.Size, 1),
+ " $src0_sel $src1_sel", // No dst_sel and dst_unused for VOPC
+ " $dst_sel $dst_unused $src0_sel $src1_sel"
+ )
+ )
+ );
+ string ret = dst#args#sdwa; // final string: destination, then sources with clamp, then SDWA selectors
+}
+
+// Returns 1 if an instruction with the given operand profile supports the
+// DPP and SDWA extensions, 0 otherwise. Support is denied when the
+// instruction has three sources (VOP3) or when the destination, src0 or src1
+// is 64 bits wide.
+class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !if(!eq(NumSrcArgs, 3),
+ 0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3
+ !if(!eq(DstVT.Size, 64),
+ 0, // 64-bit dst - No DPP or SDWA for 64-bit operands
+ !if(!eq(Src0VT.Size, 64),
+ 0, // 64-bit src0
+ !if(!eq(Src1VT.Size, 64), // was a second Src0VT test, which left Src1VT unchecked
+ 0, // 64-bit src1
+ 1
+ )
+ )
+ )
+ );
+}
+
+class BitOr<bit a, bit b> { // ret = a OR b, spelled with nested !if
+ bit ret = !if(a, 1, !if(b, 1, 0));
+}
+
+class BitAnd<bit a, bit b> { // ret = a AND b, spelled with nested !if
+ bit ret = !if(a, !if(b, 1, 0), 0);
+}
+
+// Describes the operand types of a VALU instruction and derives from them the
+// register classes, modifier flags, operand lists and asm strings for the
+// 32-bit, 64-bit, DPP and SDWA forms.
+// _ArgVT is read as [dst, src0, src1, src2]; `untyped` marks an absent operand.
+class VOPProfile <list<ValueType> _ArgVT> {
+
+  field list<ValueType> ArgVT = _ArgVT;
+
+  // Operand value types, taken positionally from ArgVT.
+  field ValueType DstVT = ArgVT[0];
+  field ValueType Src0VT = ArgVT[1];
+  field ValueType Src1VT = ArgVT[2];
+  field ValueType Src2VT = ArgVT[3];
+
+  // Register classes / operands per encoding form.
+  field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;
+  field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;
+  field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret;
+  field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
+  field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;
+  field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
+  field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
+  field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
+  field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret;
+  field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
+  field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;
+  field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret;
+  field Operand Src0Mod = getSrcMod<Src0VT>.ret;
+  field Operand Src1Mod = getSrcMod<Src1VT>.ret;
+  field Operand Src2Mod = getSrcMod<Src2VT>.ret;
+
+  // An operand exists when its type is anything other than `untyped`.
+  field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
+  field bit HasDst32 = HasDst;
+  field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
+  field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
+  field bit HasSrc0 = !if(!eq(Src0VT.Value, untyped.Value), 0, 1);
+  field bit HasSrc1 = !if(!eq(Src1VT.Value, untyped.Value), 0, 1);
+  field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1);
+
+  // TODO: Modifiers logic is somewhat adhoc here, to be refined later
+  field bit HasModifiers = isFloatType<Src0VT>.ret;
+
+  field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
+  field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret;
+  field bit HasSrc2FloatMods = isFloatType<Src2VT>.ret;
+
+  field bit HasSrc0IntMods = isIntType<Src0VT>.ret;
+  field bit HasSrc1IntMods = isIntType<Src1VT>.ret;
+  field bit HasSrc2IntMods = isIntType<Src2VT>.ret;
+
+  // src1/src2 carry modifiers only when the profile has modifiers at all and
+  // the operand is of a float or int type.
+  field bit HasSrc0Mods = HasModifiers;
+  field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
+  field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
+
+  field bit HasOMod = HasModifiers;
+  field bit HasClamp = HasModifiers;
+  field bit HasSDWAClamp = HasSrc0;
+
+  // Whether DPP and SDWA forms are available for this profile.
+  field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+
+  field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));
+
+  // VOP3b instructions are a special case with a second explicit
+  // output. This is manually overridden for them.
+  field dag Outs32 = Outs;
+  field dag Outs64 = Outs;
+  field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+  // Use the SDWA destination class here (this previously passed DstRCDPP), so
+  // that a profile overriding DstRCSDWA is reflected in its SDWA outs.
+  field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret;
+
+  field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
+  field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+                             HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+  field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
+                               HasModifiers, Src0Mod, Src1Mod>.ret;
+  field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
+                                 HasModifiers, Src0Mod, Src1Mod, DstVT>.ret;
+
+  field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
+  field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+  field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+  field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+}
+
+class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { // same argument types as profile p, with HasExt forced to 0
+ let HasExt = 0;
+}
+
+def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; // lists are [dst, src0, src1, src2]; untyped = operand absent
+def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
+def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
+
+def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
+def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
+def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
+def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; // NOTE(review): named I16 but declared with i32 types - confirm this is intentional
+
+def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; // NOTE(review): i32 under an I16 name; list has five entries but VOPProfile reads only ArgVT[0..3]
+def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; // NOTE(review): five entries; VOPProfile reads only ArgVT[0..3]
+
+def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; // no destination and no sources
+
+def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
+def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>;
+def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>;
+def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>;
+def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>;
+def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>;
+def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
+def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
+def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+
+def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
+def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
+def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>;
+def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>;
+def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
+def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
+def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
+def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+
+def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
+def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
+def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
+
+def VOP_F16_F32_F16_F32 : VOPProfile <[f16, f32, f16, f32]>; // three-source profiles
+def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>;
+def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
+def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
+def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
+def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
+def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
+def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
+def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
+
+class Commutable_REV <string revOp, bit isOrig> { // tag consumed by the getCommuteOrig / getCommuteRev mappings
+ string RevOp = revOp; // row key shared by an opcode and its commuted counterpart
+ bit IsOrig = isOrig; // 1 on the original opcode, 0 on the commuted one
+}
+
+class AtomicNoRet <string noRetOp, bit isRet> { // tag consumed by the getAtomicRetOp / getAtomicNoRetOp mappings
+ string NoRetOp = noRetOp; // row key shared by the returning and non-returning forms
+ bit IsRet = isRet; // 1 on the form with a return value, 0 on the returnless form
+}
+
+//===----------------------------------------------------------------------===//
+// Interpolation opcodes
+//===----------------------------------------------------------------------===//
+
+class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : // codegen-only pseudo: carries the pattern, has an empty asm string
+ VINTRPCommon <outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, // SI-family real instruction: has asm text and the VINTRPe encoding, no pattern
+ string asm> :
+ VINTRPCommon <outs, ins, asm, []>,
+ VINTRPe <op>,
+ SIMCInstr<opName, SIEncodingFamily.SI> {
+ let AssemblerPredicate = SIAssemblerPredicate;
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, // VI-family real instruction: has asm text and the VINTRPe_vi encoding, no pattern
+ string asm> :
+ VINTRPCommon <outs, ins, asm, []>,
+ VINTRPe_vi <op>,
+ SIMCInstr<opName, SIEncodingFamily.VI> {
+ let AssemblerPredicate = VIAssemblerPredicate;
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
+
+multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm, // emits a pseudo plus _si and _vi real instructions, all sharing NAME as opName
+ list<dag> pattern = []> {
+ def "" : VINTRP_Pseudo <NAME, outs, ins, pattern>; // only the pseudo receives the selection pattern
+
+ def _si : VINTRP_Real_si <op, NAME, outs, ins, asm>;
+
+ def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector instruction mappings
+//===----------------------------------------------------------------------===//
+
+// Maps an opcode in e32 form to its e64 equivalent
+def getVOPe64 : InstrMapping {
+ let FilterClass = "VOP";
+ let RowFields = ["OpName"]; // both forms share the same OpName
+ let ColFields = ["Size", "VOP3"];
+ let KeyCol = ["4", "0"]; // from: Size = 4, VOP3 = 0
+ let ValueCols = [["8", "1"]]; // to: Size = 8, VOP3 = 1
+}
+
+// Maps an opcode in e64 form to its e32 equivalent
+def getVOPe32 : InstrMapping {
+ let FilterClass = "VOP";
+ let RowFields = ["OpName"]; // both forms share the same OpName
+ let ColFields = ["Size", "VOP3"];
+ let KeyCol = ["8", "1"]; // from: Size = 8, VOP3 = 1
+ let ValueCols = [["4", "0"]]; // to: Size = 4, VOP3 = 0
+}
+
+def getMaskedMIMGOp : InstrMapping { // maps a 4-channel MIMG_Mask instruction to the variants of the same Op with fewer channels
+ let FilterClass = "MIMG_Mask";
+ let RowFields = ["Op"];
+ let ColFields = ["Channels"];
+ let KeyCol = ["4"]; // from: Channels = 4
+ let ValueCols = [["1"], ["2"], ["3"] ]; // to: Channels = 1, 2 or 3 (one value column each)
+}
+
+// Maps a commuted opcode to its original version
+def getCommuteOrig : InstrMapping {
+ let FilterClass = "Commutable_REV";
+ let RowFields = ["RevOp"];
+ let ColFields = ["IsOrig"];
+ let KeyCol = ["0"]; // from: IsOrig = 0 (commuted)
+ let ValueCols = [["1"]]; // to: IsOrig = 1 (original)
+}
+
+// Maps an original opcode to its commuted version
+def getCommuteRev : InstrMapping {
+ let FilterClass = "Commutable_REV";
+ let RowFields = ["RevOp"];
+ let ColFields = ["IsOrig"];
+ let KeyCol = ["1"]; // from: IsOrig = 1 (original)
+ let ValueCols = [["0"]]; // to: IsOrig = 0 (commuted)
+}
+
+def getMCOpcodeGen : InstrMapping { // maps a pseudo (encoding family NONE) to the SI or VI instruction with the same PseudoInstr
+ let FilterClass = "SIMCInstr";
+ let RowFields = ["PseudoInstr"];
+ let ColFields = ["Subtarget"];
+ let KeyCol = [!cast<string>(SIEncodingFamily.NONE)];
+ let ValueCols = [[!cast<string>(SIEncodingFamily.SI)], // one value column per encoding family: SI first, then VI
+ [!cast<string>(SIEncodingFamily.VI)]];
+}
+
+// Get equivalent SOPK instruction.
+def getSOPKOp : InstrMapping {
+ let FilterClass = "SOPKInstTable";
+ let RowFields = ["BaseCmpOp"];
+ let ColFields = ["IsSOPK"];
+ let KeyCol = ["0"]; // from: IsSOPK = 0
+ let ValueCols = [["1"]]; // to: IsSOPK = 1
+}
+
+def getAddr64Inst : InstrMapping { // maps a MUBUFAddr64Table entry to the entry with the same OpName that has IsAddr64 set
+ let FilterClass = "MUBUFAddr64Table";
+ let RowFields = ["OpName"];
+ let ColFields = ["IsAddr64"];
+ let KeyCol = ["0"]; // from: IsAddr64 = 0
+ let ValueCols = [["1"]]; // to: IsAddr64 = 1
+}
+
+// Maps an atomic opcode to its version with a return value.
+def getAtomicRetOp : InstrMapping {
+ let FilterClass = "AtomicNoRet";
+ let RowFields = ["NoRetOp"];
+ let ColFields = ["IsRet"];
+ let KeyCol = ["0"]; // from: IsRet = 0
+ let ValueCols = [["1"]]; // to: IsRet = 1
+}
+
+// Maps an atomic opcode to its returnless version.
+def getAtomicNoRetOp : InstrMapping {
+ let FilterClass = "AtomicNoRet";
+ let RowFields = ["NoRetOp"];
+ let ColFields = ["IsRet"];
+ let KeyCol = ["1"]; // from: IsRet = 1
+ let ValueCols = [["0"]]; // to: IsRet = 0
+}
+
+include "SIInstructions.td"
+include "CIInstructions.td"
+
+include "DSInstructions.td"
+include "MIMGInstructions.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
new file mode 100644
index 000000000000..bc35c2edc8d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -0,0 +1,1089 @@
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file was originally auto-generated from a GPU register header file and
+// all the instruction definitions were originally commented out. Instructions
+// that are not yet supported remain commented out.
+//===----------------------------------------------------------------------===//
+
+def isGCN : Predicate<"Subtarget->getGeneration() "
+ ">= SISubtarget::SOUTHERN_ISLANDS">,
+ AssemblerPredicate<"FeatureGCN">; // codegen: generation is SOUTHERN_ISLANDS or later; assembler: FeatureGCN
+def isSI : Predicate<"Subtarget->getGeneration() "
+ "== SISubtarget::SOUTHERN_ISLANDS">,
+ AssemblerPredicate<"FeatureSouthernIslands">; // codegen: generation is exactly SOUTHERN_ISLANDS
+
+def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; // codegen-only predicates (no AssemblerPredicate attached)
+def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
+def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
+ AssemblerPredicate<"FeatureVGPRIndexMode">;
+def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
+ AssemblerPredicate<"FeatureMovrel">;
+
+include "VOPInstructions.td"
+include "SOPInstructions.td"
+include "SMInstructions.td"
+include "FLATInstructions.td"
+include "BUFInstructions.td"
+
+let SubtargetPredicate = isGCN in {
+
+//===----------------------------------------------------------------------===//
+// EXP Instructions
+//===----------------------------------------------------------------------===//
+
+defm EXP : EXP_m<0, AMDGPUexport>; // NOTE(review): first argument is 0 here and 1 for EXP_DONE - presumably the "done" bit; confirm against EXP_m
+defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
+
+//===----------------------------------------------------------------------===//
+// VINTRP Instructions
+//===----------------------------------------------------------------------===//
+
+let Uses = [M0, EXEC] in {
+
+// FIXME: Specify SchedRW for VINTRP instructions.
+
+multiclass V_INTERP_P1_F32_m : VINTRP_m < // shared body for the two v_interp_p1_f32 variants instantiated from it
+ 0x00000000,
+ (outs VGPR_32:$vdst),
+ (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
+ "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
+ [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan), // the node takes attrchan before attr, the reverse of the ins order
+ (i32 imm:$attr)))]
+>;
+
+let OtherPredicates = [has32BankLDS] in {
+
+defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; // plain variant, selected when the LDS has 32 banks
+
+} // End OtherPredicates = [has32BankLDS]
+
+let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
+
+defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; // 16-bank variant: same definition, but $vdst is early-clobber
+
+} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
+
+let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { // $src0 is tied to $vdst and left out of the encoding
+
+defm V_INTERP_P2_F32 : VINTRP_m <
+ 0x00000001,
+ (outs VGPR_32:$vdst),
+ (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
+ "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan", // $src0 does not appear in the asm string
+ [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
+ (i32 imm:$attr)))]>;
+
+} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
+
+defm V_INTERP_MOV_F32 : VINTRP_m <
+ 0x00000002,
+ (outs VGPR_32:$vdst),
+ (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), // $vsrc is an InterpSlot immediate here, not a VGPR
+ "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
+ [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
+ (i32 imm:$attr)))]>;
+
+} // End Uses = [M0, EXEC]
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+
+// For use in patterns
+def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
+ (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> { // no asm string and no pattern of its own
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let usesCustomInserter = 1; // expanded by the target's custom inserter rather than emitted directly
+}
+
+// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
+// pass to enable folding of inline immediates.
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
+ (ins VSrc_b64:$src0)>;
+} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+
+let usesCustomInserter = 1, SALU = 1 in {
+def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), // selected from int_amdgcn_groupstaticsize; takes no inputs
+ [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
+} // End let usesCustomInserter = 1, SALU = 1
+
+def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst), // NOTE(review): presumably a terminator-flagged stand-in for s_mov_b64 - confirm against its expansion
+ (ins SSrc_b64:$src0)> {
+ let SALU = 1;
+ let isAsCheapAsAMove = 1;
+ let isTerminator = 1;
+}
+
+def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst), // same flags as S_MOV_B64_term, with two sources
+ (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
+ let SALU = 1;
+ let isAsCheapAsAMove = 1;
+ let isTerminator = 1;
+}
+
+def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst), // same flags as S_MOV_B64_term, with two sources
+ (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
+ let SALU = 1;
+ let isAsCheapAsAMove = 1;
+ let isTerminator = 1;
+}
+
+def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), // selected from int_amdgcn_wave_barrier; no operands
+ [(int_amdgcn_wave_barrier)]> {
+ let SchedRW = [];
+ let hasNoSchedulingInfo = 1;
+ let hasSideEffects = 1; // NOTE(review): side-effect/load/store flags presumably keep other operations from moving across it - confirm
+ let mayLoad = 1;
+ let mayStore = 1;
+ let isBarrier = 1;
+ let isConvergent = 1;
+}
+
+// SI pseudo instructions. These are used by the CFG structurizer pass
+// and should be lowered to ISA instructions prior to codegen.
+
+// Dummy terminator instruction to use after control flow instructions
+// replaced with exec mask operations.
+def SI_MASK_BRANCH : PseudoInstSI <
+ (outs), (ins brtarget:$target)> {
+ let isBranch = 0; // a terminator that is explicitly neither a branch nor a barrier
+ let isTerminator = 1;
+ let isBarrier = 0;
+ let Uses = [EXEC];
+ let SchedRW = [];
+ let hasNoSchedulingInfo = 1;
+}
+
+let isTerminator = 1 in {
+
+def SI_IF: CFPseudoInstSI < // selected from int_amdgcn_if; NOTE(review): meaning of the trailing "1, 1" arguments is defined by CFPseudoInstSI, not visible here
+ (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
+ [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
+ let Constraints = "";
+ let Size = 12;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let hasSideEffects = 1;
+}
+
+def SI_ELSE : CFPseudoInstSI < // no pattern of its own; a separate Pat maps int_amdgcn_else here with $execfix = 0
+ (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+ let Constraints = "$src = $dst";
+ let Size = 12;
+ let mayStore = 1;
+ let mayLoad = 1;
+ let hasSideEffects = 1;
+}
+
+def SI_LOOP : CFPseudoInstSI < // selected from int_amdgcn_loop; the only one of the three that sets isBranch here
+ (outs), (ins SReg_64:$saved, brtarget:$target),
+ [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
+ let Size = 8;
+ let isBranch = 1;
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+} // End isTerminator = 1
+
+def SI_END_CF : CFPseudoInstSI < // selected from int_amdgcn_end_cf; defined outside the isTerminator block above
+ (outs), (ins SReg_64:$saved),
+ [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
+ let Size = 4;
+ let isAsCheapAsAMove = 1;
+ let isReMaterializable = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let hasSideEffects = 1;
+}
+
+def SI_BREAK : CFPseudoInstSI < // selected from int_amdgcn_break
+ (outs SReg_64:$dst), (ins SReg_64:$src),
+ [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
+ let Size = 4;
+ let isAsCheapAsAMove = 1;
+ let isReMaterializable = 1;
+}
+
+def SI_IF_BREAK : CFPseudoInstSI < // selected from int_amdgcn_if_break; relies on CFPseudoInstSI's default trailing arguments
+ (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
+ [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
+ let Size = 4;
+ let isAsCheapAsAMove = 1;
+ let isReMaterializable = 1;
+}
+
+def SI_ELSE_BREAK : CFPseudoInstSI < // selected from int_amdgcn_else_break; relies on CFPseudoInstSI's default trailing arguments
+ (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
+ [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
+ let Size = 4;
+ let isAsCheapAsAMove = 1;
+ let isReMaterializable = 1;
+}
+
+let Uses = [EXEC], Defs = [EXEC,VCC] in {
+def SI_KILL : PseudoInstSI < // selected from AMDGPUkill; expanded through the custom inserter
+ (outs), (ins VSrc_b32:$src),
+ [(AMDGPUkill i32:$src)]> {
+ let isConvergent = 1;
+ let usesCustomInserter = 1;
+}
+
+def SI_KILL_TERMINATOR : SPseudoInstSI < // same operand as SI_KILL, no pattern, flagged as a terminator
+ (outs), (ins VSrc_b32:$src)> {
+ let isTerminator = 1;
+}
+
+} // End Uses = [EXEC], Defs = [EXEC,VCC]
+
+// Branch on undef scc. Used to avoid intermediate copy from
+// IMPLICIT_DEF to SCC.
+def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { // no pattern; expanded through the custom inserter
+ let isTerminator = 1;
+ let usesCustomInserter = 1;
+}
+
+def SI_PS_LIVE : PseudoInstSI <
+ (outs SReg_64:$dst), (ins),
+ [(set i1:$dst, (int_amdgcn_ps_live))]> { // the i1 result of int_amdgcn_ps_live is produced in an SReg_64
+ let SALU = 1;
+}
+
+// Used as an isel pseudo to directly emit initialization with an
+// s_mov_b32 rather than a copy of another initialized
+// register. MachineCSE skips copies, and we don't want to have to
+// fold operands before it runs.
+def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
+ let Defs = [M0]; // M0 is an implicit def, so there is no explicit output operand
+ let usesCustomInserter = 1;
+ let isAsCheapAsAMove = 1;
+ let isReMaterializable = 1;
+}
+
+def SI_RETURN : SPseudoInstSI <
+ (outs), (ins variable_ops), [(AMDGPUreturn)]> { // selected from AMDGPUreturn; accepts a variable operand list
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let hasSideEffects = 1;
+ let hasNoSchedulingInfo = 1;
+ let DisableWQM = 1;
+}
+
+let Defs = [M0, EXEC],
+ UseNamedOperandTable = 1 in {
+
+class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI < // indexed read: yields one VGPR_32 from vector register $src using $idx and $offset
+ (outs VGPR_32:$vdst),
+ (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
+ let usesCustomInserter = 1;
+}
+
+class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI < // indexed write: inserts $val into $src, whose register is tied to the result $vdst
+ (outs rc:$vdst),
+ (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
+ let Constraints = "$src = $vdst";
+ let usesCustomInserter = 1;
+}
+
+// TODO: We can support indirect SGPR access.
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; // one instantiation per vector register class
+def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
+def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
+def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
+def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
+
+def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
+def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
+def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
+def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
+def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+
+} // End Defs = [M0, EXEC], UseNamedOperandTable = 1
+
+multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { // emits a _SAVE / _RESTORE pseudo pair for one SGPR register class
+ let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
+ def _SAVE : PseudoInstSI < // store-only: $data is an input
+ (outs),
+ (ins sgpr_class:$data, i32imm:$addr)> {
+ let mayStore = 1;
+ let mayLoad = 0;
+ }
+
+ def _RESTORE : PseudoInstSI < // load-only: $data is the output
+ (outs sgpr_class:$data),
+ (ins i32imm:$addr)> {
+ let mayStore = 0;
+ let mayLoad = 1;
+ }
+ } // End UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC]
+}
+
+// You cannot use M0 as the output of v_readlane_b32 instructions or
+// use it in the sdata operand of SMEM instructions. We still need to
+// be able to spill the physical register m0, so allow it for
+// SI_SPILL_S32_* instructions.
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
+defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
+defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
+defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { // emits a _SAVE / _RESTORE pseudo pair for one VGPR register class
+ let UseNamedOperandTable = 1, VGPRSpill = 1,
+ SchedRW = [WriteVMEM] in {
+ def _SAVE : VPseudoInstSI < // store-only: $vdata is an input
+ (outs),
+ (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+ SReg_32:$soffset, i32imm:$offset)> {
+ let mayStore = 1;
+ let mayLoad = 0;
+ // (2 * 4) + (8 * num_subregs) bytes maximum
+ let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); // (Size / 32) * 8 + 8, i.e. 8 bytes per 32-bit subregister plus 8
+ }
+
+ def _RESTORE : VPseudoInstSI < // load-only: $vdata is the output
+ (outs vgpr_class:$vdata),
+ (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
+ i32imm:$offset)> {
+ let mayStore = 0;
+ let mayLoad = 1;
+
+ // (2 * 4) + (8 * num_subregs) bytes maximum
+ let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); // same formula as _SAVE
+ }
+ } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
+}
+
+defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; // one _SAVE/_RESTORE pair per VGPR register class
+defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
+defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+
+def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < // selected from SIpc_add_rel_offset on two target global addresses; yields a 64-bit SGPR pair
+ (outs SReg_64:$dst),
+ (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
+ [(set SReg_64:$dst,
+ (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
+ let Defs = [SCC]; // SCC is an implicit def of this pseudo
+}
+
+} // End SubtargetPredicate = isGCN
+
+let Predicates = [isGCN] in {
+
+def : Pat<
+ (int_amdgcn_else i64:$src, bb:$target),
+ (SI_ELSE $src, $target, 0) // third operand is SI_ELSE's $execfix, set to 0
+>;
+
+def : Pat <
+ (int_AMDGPU_kilp),
+ (SI_KILL (i32 0xbf800000)) // 0xbf800000 is the IEEE-754 single-precision bit pattern of -1.0
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [UnsafeFPMath] in {
+
+//def : RcpPat<V_RCP_F64_e32, f64>;
+//defm : RsqPat<V_RSQ_F64_e32, f64>;
+//defm : RsqPat<V_RSQ_F32_e32, f32>;
+
+def : RsqPat<V_RSQ_F32_e32, f32>;
+def : RsqPat<V_RSQ_F64_e32, f64>;
+
+// Convert (x - floor(x)) to fract(x)
+def : Pat <
+ (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
+ (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), // both uses must bind the same $x and $mods
+ (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Convert (x + (-floor(x))) to fract(x)
+def : Pat <
+ (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
+ (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), // f64 form is matched as fadd of a negated floor
+ (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+} // End Predicates = [UnsafeFPMath]
+
+def : Pat <
+ (f32 (fpextend f16:$src)),
+ (V_CVT_F32_F16_e32 $src) // f16 -> f32 is a single conversion
+>;
+
+def : Pat <
+ (f64 (fpextend f16:$src)),
+ (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) // f16 -> f64 goes through f32
+>;
+
+def : Pat <
+ (f16 (fpround f32:$src)),
+ (V_CVT_F16_F32_e32 $src)
+>;
+
+def : Pat <
+ (f16 (fpround f64:$src)),
+ (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src)) // f64 -> f16 goes through f32 (two conversion steps)
+>;
+
+def : Pat <
+ (i32 (fp_to_sint f16:$src)),
+ (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src)) // f16 is extended to f32 before the integer conversion
+>;
+
+def : Pat <
+ (i32 (fp_to_uint f16:$src)),
+ (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
+>;
+
+def : Pat <
+ (f16 (sint_to_fp i32:$src)),
+ (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src)) // i32 is converted to f32 first, then narrowed to f16
+>;
+
+def : Pat <
+ (f16 (uint_to_fp i32:$src)),
+ (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass FMADPat <ValueType vt, Instruction inst> { // selects fmad of type vt to inst, passing modifiers, clamp and omod through
+ def : Pat <
+ (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3NoMods vt:$src1, i32:$src1_modifiers),
+ (VOP3NoMods vt:$src2, i32:$src2_modifiers))),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1, // operand order: each source preceded by its modifiers, then clamp, omod
+ $src2_modifiers, $src2, $clamp, $omod)
+ >;
+}
+
+defm : FMADPat <f16, V_MAC_F16_e64>;
+defm : FMADPat <f32, V_MAC_F32_e64>;
+
+multiclass SelectPat <ValueType vt, Instruction inst> { // selects an i1-conditioned select of type vt to inst
+ def : Pat <
+ (vt (select i1:$src0, vt:$src1, vt:$src2)),
+ (inst $src2, $src1, $src0) // operands are reversed: false value, true value, then the condition
+ >;
+}
+
+defm : SelectPat <i16, V_CNDMASK_B32_e64>; // all four types use the same 32-bit conditional-mask instruction
+defm : SelectPat <i32, V_CNDMASK_B32_e64>;
+defm : SelectPat <f16, V_CNDMASK_B32_e64>;
+defm : SelectPat <f32, V_CNDMASK_B32_e64>;
+
+def : Pat <
+ (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), // ctpop(x) + y is folded into a single instruction
+ (V_BCNT_U32_B32_e64 $popcnt, $val)
+>;
+
+/********** ============================================ **********/
+/********** Extraction, Insertion, Building and Casting **********/
+/********** ============================================ **********/
+
+// Element extract/insert patterns for two-element 32-bit vectors. Element i
+// lives in sub-register sub<i>, so only indices 0 and 1 exist. The range was
+// previously 0-2, which also emitted patterns for a non-existent element 2
+// referring to sub2.
+foreach Index = 0-1 in {
+  def Extract_Element_v2i32_#Index : Extract_Element <
+    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2i32_#Index : Insert_Element <
+    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v2f32_#Index : Extract_Element <
+    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2f32_#Index : Insert_Element <
+    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-3 in { // four-element vectors: element Index maps to sub-register sub<Index>
+ def Extract_Element_v4i32_#Index : Extract_Element <
+ i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v4i32_#Index : Insert_Element <
+ i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v4f32_#Index : Extract_Element <
+ f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v4f32_#Index : Insert_Element <
+ f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-7 in { // eight-element vectors
+ def Extract_Element_v8i32_#Index : Extract_Element <
+ i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v8i32_#Index : Insert_Element <
+ i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v8f32_#Index : Extract_Element <
+ f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v8f32_#Index : Insert_Element <
+ f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-15 in { // sixteen-element vectors
+ def Extract_Element_v16i32_#Index : Extract_Element <
+ i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v16i32_#Index : Insert_Element <
+ i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v16f32_#Index : Extract_Element <
+ f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v16f32_#Index : Insert_Element <
+ f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+// FIXME: Why do only some of these type combinations for SReg and
+// VReg?
+// 16-bit bitcast
+def : BitConvert <i16, f16, VGPR_32>; // 16-bit values are held in 32-bit register classes
+def : BitConvert <f16, i16, VGPR_32>;
+def : BitConvert <i16, f16, SReg_32>;
+def : BitConvert <f16, i16, SReg_32>;
+
+// 32-bit bitcast
+def : BitConvert <i32, f32, VGPR_32>;
+def : BitConvert <f32, i32, VGPR_32>;
+def : BitConvert <i32, f32, SReg_32>;
+def : BitConvert <f32, i32, SReg_32>;
+
+// 64-bit bitcast
+def : BitConvert <i64, f64, VReg_64>;
+def : BitConvert <f64, i64, VReg_64>;
+def : BitConvert <v2i32, v2f32, VReg_64>;
+def : BitConvert <v2f32, v2i32, VReg_64>;
+def : BitConvert <i64, v2i32, VReg_64>;
+def : BitConvert <v2i32, i64, VReg_64>;
+def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2f32, i64, VReg_64>;
+def : BitConvert <f64, v2f32, VReg_64>;
+def : BitConvert <v2f32, f64, VReg_64>;
+def : BitConvert <f64, v2i32, VReg_64>;
+def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <v4i32, v4f32, VReg_128>; // these two are 128-bit (VReg_128) despite sitting under the 64-bit heading
+def : BitConvert <v4f32, v4i32, VReg_128>;
+
+// 128-bit bitcast
+def : BitConvert <v2i64, v4i32, SReg_128>;
+def : BitConvert <v4i32, v2i64, SReg_128>;
+def : BitConvert <v2f64, v4f32, VReg_128>;
+def : BitConvert <v2f64, v4i32, VReg_128>;
+def : BitConvert <v4f32, v2f64, VReg_128>;
+def : BitConvert <v4i32, v2f64, VReg_128>;
+def : BitConvert <v2i64, v2f64, VReg_128>;
+def : BitConvert <v2f64, v2i64, VReg_128>;
+
+// 256-bit bitcast
+def : BitConvert <v8i32, v8f32, SReg_256>;
+def : BitConvert <v8f32, v8i32, SReg_256>;
+def : BitConvert <v8i32, v8f32, VReg_256>;
+def : BitConvert <v8f32, v8i32, VReg_256>;
+
+// 512-bit bitcast
+def : BitConvert <v16i32, v16f32, VReg_512>;
+def : BitConvert <v16f32, v16i32, VReg_512>;
+
+/********** =================== **********/
+/********** Src & Dst modifiers **********/
+/********** =================== **********/
+
+def : Pat <
+ (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
+ (f32 FP_ZERO), (f32 FP_ONE)), // matches only a clamp to the range [0.0, 1.0]
+ (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod) // NOTE(review): reads as src0 + 0 with the clamp operand set to 1 - confirm operand order against V_ADD_F32_e64
+>;
+
+/********** ================================ **********/
+/********** Floating point absolute/negative **********/
+/********** ================================ **********/
+
+// Prevent expanding both fneg and fabs.
+
+def : Pat <
+ (fneg (fabs f32:$src)),
+ (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
+>;
+
+// FIXME: Should use S_OR_B32
+def : Pat <
+ (fneg (fabs f64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG f64:$src, sub0)), // low half is copied unchanged
+ sub0,
+ (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
+ sub1)
+>;
+
+def : Pat <
+ (fabs f32:$src),
+ (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff))) // Clear sign bit
+>;
+
+def : Pat <
+ (fneg f32:$src),
+ (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) // Flip sign bit
+>;
+
+def : Pat <
+ (fabs f64:$src),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+ sub0,
+ (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (V_MOV_B32_e32 (i32 0x7fffffff))), // Clear sign bit.
+ sub1)
+>;
+
+def : Pat <
+ (fneg f64:$src),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+ sub0,
+ (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (i32 (V_MOV_B32_e32 (i32 0x80000000)))), // Flip sign bit (in the high half).
+ sub1)
+>;
+
+def : Pat <
+ (fneg f16:$src),
+ (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000))) // Flip bit 15, the f16 sign bit
+>;
+
+def : Pat <
+ (fabs f16:$src),
+ (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff))) // Keep only the low 15 bits (clears the f16 sign bit)
+>;
+
+def : Pat <
+ (fneg (fabs f16:$src)),
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
+>;
+
+/********** ================== **********/
+/********** Immediate Patterns **********/
+/********** ================== **********/
+
+def : Pat <
+ (VGPRImm<(i32 imm)>:$imm), // immediates matched by VGPRImm are materialized with a vector move
+ (V_MOV_B32_e32 imm:$imm)
+>;
+
+def : Pat <
+ (VGPRImm<(f32 fpimm)>:$imm),
+ (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+ (i32 imm:$imm),
+ (S_MOV_B32 imm:$imm) // other i32 immediates use a scalar move
+>;
+
+// FIXME: Workaround for ordering issue with peephole optimizer where
+// a register class copy interferes with immediate folding. Should
+// use s_mov_b32, which can be shrunk to s_movk_i32
+def : Pat <
+ (VGPRImm<(f16 fpimm)>:$imm),
+ (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+ (f32 fpimm:$imm),
+ (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+ (f16 fpimm:$imm),
+ (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+ (i32 frameindex:$fi),
+ (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi))) // frame indices are materialized in a VGPR
+>;
+
+def : Pat <
+ (i64 InlineImm<i64>:$imm), // only i64 values accepted by InlineImm are handled here
+ (S_MOV_B64 InlineImm<i64>:$imm)
+>;
+
+// XXX - Should this use a s_cmp to set SCC?
+
+// Set to sign-extended 64-bit value (true = -1, false = 0)
+def : Pat <
+ (i1 imm:$imm),
+ (S_MOV_B64 (i64 (as_i64imm $imm)))
+>;
+
+def : Pat <
+ (f64 InlineFPImm<f64>:$imm), // only f64 values accepted by InlineFPImm are handled here
+ (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
+>;
+
+/********** ================== **********/
+/********** Intrinsic Patterns **********/
+/********** ================== **********/
+
+def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
+
+def : Pat <
+ (int_AMDGPU_cube v4f32:$src),
+ (REG_SEQUENCE VReg_128,
+ (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
+ 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
+ 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
+ 0 /* clamp */, 0 /* omod */), sub0,
+ (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
+ 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+ 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
+ 0 /* clamp */, 0 /* omod */), sub1,
+ (V_CUBEMA_F32 0 /* src0_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
+ 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+ 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
+ 0 /* clamp */, 0 /* omod */), sub2,
+ (V_CUBEID_F32 0 /* src0_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
+ 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+ 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
+ 0 /* clamp */, 0 /* omod */), sub3)
+>;
+
+def : Pat <
+ (i32 (sext i1:$src0)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
+>;
+
+class Ext32Pat <SDNode ext> : Pat <
+ (i32 (ext i1:$src0)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
+>;
+
+def : Ext32Pat <zext>;
+def : Ext32Pat <anyext>;
+
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+ (AMDGPUurecip i32:$src0),
+ (V_CVT_U32_F32_e32
+ (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
+ (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP3 Patterns
+//===----------------------------------------------------------------------===//
+
+def : IMad24Pat<V_MAD_I32_I24>;
+def : UMad24Pat<V_MAD_U32_U24>;
+
+defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
+def : ROTRPattern <V_ALIGNBIT_B32>;
+
+/********** ====================== **********/
+/********** Indirect addressing **********/
+/********** ====================== **********/
+
+multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
+ // Extract with offset
+ def : Pat<
+ (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
+ (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
+ >;
+
+ // Insert with offset
+ def : Pat<
+ (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
+ (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
+ >;
+}
+
+defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
+defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
+defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
+defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
+
+defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
+defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
+defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
+defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
+
+//===----------------------------------------------------------------------===//
+// SAD Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (add (sub_oneuse (umax i32:$src0, i32:$src1),
+ (umin i32:$src0, i32:$src1)),
+ i32:$src2),
+ (V_SAD_U32 $src0, $src1, $src2)
+>;
+
+def : Pat <
+ (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
+ (sub i32:$src0, i32:$src1),
+ (sub i32:$src1, i32:$src0)),
+ i32:$src2),
+ (V_SAD_U32 $src0, $src1, $src2)
+>;
+
+//===----------------------------------------------------------------------===//
+// Conversion Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+ (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
+
+// Handle sext_inreg in i64
+def : Pat <
+ (i64 (sext_inreg i64:$src, i1)),
+ (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
+>;
+
+def : Pat <
+ (i16 (sext_inreg i16:$src, i8)),
+ (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
+>;
+
+def : Pat <
+ (i64 (sext_inreg i64:$src, i8)),
+ (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
+>;
+
+def : Pat <
+ (i64 (sext_inreg i64:$src, i16)),
+ (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
+>;
+
+def : Pat <
+ (i64 (sext_inreg i64:$src, i32)),
+ (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
+>;
+
+def : Pat <
+ (i64 (zext i32:$src)),
+ (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : Pat <
+ (i64 (anyext i32:$src)),
+ (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+class ZExt_i64_i1_Pat <SDNode ext> : Pat <
+ (i64 (ext i1:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
+ (S_MOV_B32 (i32 0)), sub1)
+>;
+
+
+def : ZExt_i64_i1_Pat<zext>;
+def : ZExt_i64_i1_Pat<anyext>;
+
+// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
+// REG_SEQUENCE patterns don't support instructions with multiple outputs.
+def : Pat <
+ (i64 (sext i32:$src)),
+ (REG_SEQUENCE SReg_64, $src, sub0,
+ (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
+>;
+
+def : Pat <
+ (i64 (sext i1:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
+ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
+>;
+
+// Select a floating-point to i1 conversion (fp_to_int) as the compare
+// instruction Inst. The constant KOne, typed as kone_type, is passed as the
+// first value operand; the matched source and its VOP3 source modifiers are
+// passed as the second. No destination clamp or output modifier is applied.
+class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
+ (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
+ (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
+>;
+
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
+
+// If we need to perform a logical operation on i1 values, we need to
+// use vector comparisons since there is only one SCC register. Vector
+// comparisons still write to a pair of SGPRs, so treat these as
+// 64-bit comparisons. When legalizing SGPR copies, instructions
+// resulting in the copies from SCC to these instructions will be
+// moved to the VALU.
+def : Pat <
+ (i1 (and i1:$src0, i1:$src1)),
+ (S_AND_B64 $src0, $src1)
+>;
+
+def : Pat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B64 $src0, $src1)
+>;
+
+def : Pat <
+ (i1 (xor i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
+
+def : Pat <
+ (f32 (sint_to_fp i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
+>;
+
+def : Pat <
+ (f32 (uint_to_fp i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
+>;
+
+def : Pat <
+ (f64 (sint_to_fp i1:$src)),
+ (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+>;
+
+def : Pat <
+ (f64 (uint_to_fp i1:$src)),
+ (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
+>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (i32 (trunc i64:$a)),
+ (EXTRACT_SUBREG $a, sub0)
+>;
+
+def : Pat <
+ (i1 (trunc i32:$a)),
+ (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+def : Pat <
+ (i1 (trunc i64:$a)),
+ (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
+ (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
+>;
+
+def : Pat <
+ (i32 (bswap i32:$a)),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32 $a, $a, (i32 24)),
+ (V_ALIGNBIT_B32 $a, $a, (i32 8)))
+>;
+
+// Bitfield-mask patterns for value type vt.
+//   ((1 << a) - 1) << b  is selected to (BFM a, b).
+//   (1 << a) - 1         is selected to BFM with a zero second operand, which
+//                        is materialized with MOV.
+multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
+ def : Pat <
+ (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
+ (BFM $a, $b)
+ >;
+
+ def : Pat <
+ (vt (add (vt (shl 1, vt:$a)), -1)),
+ (BFM $a, (MOV (i32 0)))
+ >;
+}
+
+defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
+// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
+
+def : BFEPattern <V_BFE_U32, S_MOV_B32>;
+
+def : Pat<
+ (fcanonicalize f16:$src),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
+>;
+
+def : Pat<
+ (fcanonicalize f32:$src),
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
+>;
+
+def : Pat<
+ (fcanonicalize f64:$src),
+ (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
+>;
+
+//===----------------------------------------------------------------------===//
+// Fract Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isSI] in {
+
+// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
+// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
+// way to implement it is using V_FRACT_F64.
+// The workaround for the V_FRACT bug is:
+// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
+
+// Convert floor(x) to (x - fract(x))
+def : Pat <
+ (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
+ (V_ADD_F64
+ $mods,
+ $x,
+ SRCMODS.NEG,
+ (V_CNDMASK_B64_PSEUDO
+ (V_MIN_F64
+ SRCMODS.NONE,
+ (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
+ SRCMODS.NONE,
+ (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
+ DSTCLAMP.NONE, DSTOMOD.NONE),
+ $x,
+ (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
+ DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+} // End Predicates = [isSI]
+
+//============================================================================//
+// Miscellaneous Optimization Patterns
+//============================================================================//
+
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
+
+def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
+def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+
+//============================================================================//
+// Assembler aliases
+//============================================================================//
+
+def : MnemonicAlias<"v_add_u32", "v_add_i32">;
+def : MnemonicAlias<"v_sub_u32", "v_sub_i32">;
+def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">;
+
+} // End isGCN predicate
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
new file mode 100644
index 000000000000..5da375468713
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
@@ -0,0 +1,209 @@
+//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Backend internal SI Intrinsic Definitions. User code should not
+// directly use these.
+//
+//===----------------------------------------------------------------------===//
+
+
+let TargetPrefix = "SI", isTarget = 1 in {
+ def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+
+ def int_SI_export : Intrinsic <[],
+ [llvm_i32_ty, // en
+ llvm_i32_ty, // vm (FIXME: should be i1)
+ llvm_i32_ty, // done (FIXME: should be i1)
+ llvm_i32_ty, // tgt
+ llvm_i32_ty, // compr (FIXME: should be i1)
+ llvm_float_ty, // src0
+ llvm_float_ty, // src1
+ llvm_float_ty, // src2
+ llvm_float_ty], // src3
+ []
+ >;
+
+ def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
+
+ // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
+ def int_SI_tbuffer_store : Intrinsic <
+ [],
+ [llvm_anyint_ty, // rsrc(SGPR)
+ llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+ llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+ llvm_i32_ty, // vaddr(VGPR)
+ llvm_i32_ty, // soffset(SGPR)
+ llvm_i32_ty, // inst_offset(imm)
+ llvm_i32_ty, // dfmt(imm)
+ llvm_i32_ty, // nfmt(imm)
+ llvm_i32_ty, // offen(imm)
+ llvm_i32_ty, // idxen(imm)
+ llvm_i32_ty, // glc(imm)
+ llvm_i32_ty, // slc(imm)
+ llvm_i32_ty], // tfe(imm)
+ []>;
+
+ // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed
+ def int_SI_buffer_load_dword : Intrinsic <
+ [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+ [llvm_anyint_ty, // rsrc(SGPR)
+ llvm_anyint_ty, // vaddr(VGPR)
+ llvm_i32_ty, // soffset(SGPR)
+ llvm_i32_ty, // inst_offset(imm)
+ llvm_i32_ty, // offen(imm)
+ llvm_i32_ty, // idxen(imm)
+ llvm_i32_ty, // glc(imm)
+ llvm_i32_ty, // slc(imm)
+ llvm_i32_ty], // tfe(imm)
+ [IntrReadMem, IntrArgMemOnly]>;
+
+ def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
+
+ // Fully-flexible SAMPLE instruction.
+ class SampleRaw : Intrinsic <
+ [llvm_v4f32_ty], // vdata(VGPR)
+ [llvm_anyint_ty, // vaddr(VGPR)
+ llvm_v8i32_ty, // rsrc(SGPR)
+ llvm_v4i32_ty, // sampler(SGPR)
+ llvm_i32_ty, // dmask(imm)
+ llvm_i32_ty, // unorm(imm)
+ llvm_i32_ty, // r128(imm)
+ llvm_i32_ty, // da(imm)
+ llvm_i32_ty, // glc(imm)
+ llvm_i32_ty, // slc(imm)
+ llvm_i32_ty, // tfe(imm)
+ llvm_i32_ty], // lwe(imm)
+ [IntrNoMem]>;
+
+ // Image instruction without a sampler.
+ class Image : Intrinsic <
+ [llvm_v4f32_ty], // vdata(VGPR)
+ [llvm_anyint_ty, // vaddr(VGPR)
+ llvm_v8i32_ty, // rsrc(SGPR)
+ llvm_i32_ty, // dmask(imm)
+ llvm_i32_ty, // unorm(imm)
+ llvm_i32_ty, // r128(imm)
+ llvm_i32_ty, // da(imm)
+ llvm_i32_ty, // glc(imm)
+ llvm_i32_ty, // slc(imm)
+ llvm_i32_ty, // tfe(imm)
+ llvm_i32_ty], // lwe(imm)
+ [IntrNoMem]>;
+
+ // Basic sample
+ def int_SI_image_sample : SampleRaw;
+ def int_SI_image_sample_cl : SampleRaw;
+ def int_SI_image_sample_d : SampleRaw;
+ def int_SI_image_sample_d_cl : SampleRaw;
+ def int_SI_image_sample_l : SampleRaw;
+ def int_SI_image_sample_b : SampleRaw;
+ def int_SI_image_sample_b_cl : SampleRaw;
+ def int_SI_image_sample_lz : SampleRaw;
+ def int_SI_image_sample_cd : SampleRaw;
+ def int_SI_image_sample_cd_cl : SampleRaw;
+
+ // Sample with comparison
+ def int_SI_image_sample_c : SampleRaw;
+ def int_SI_image_sample_c_cl : SampleRaw;
+ def int_SI_image_sample_c_d : SampleRaw;
+ def int_SI_image_sample_c_d_cl : SampleRaw;
+ def int_SI_image_sample_c_l : SampleRaw;
+ def int_SI_image_sample_c_b : SampleRaw;
+ def int_SI_image_sample_c_b_cl : SampleRaw;
+ def int_SI_image_sample_c_lz : SampleRaw;
+ def int_SI_image_sample_c_cd : SampleRaw;
+ def int_SI_image_sample_c_cd_cl : SampleRaw;
+
+ // Sample with offsets
+ def int_SI_image_sample_o : SampleRaw;
+ def int_SI_image_sample_cl_o : SampleRaw;
+ def int_SI_image_sample_d_o : SampleRaw;
+ def int_SI_image_sample_d_cl_o : SampleRaw;
+ def int_SI_image_sample_l_o : SampleRaw;
+ def int_SI_image_sample_b_o : SampleRaw;
+ def int_SI_image_sample_b_cl_o : SampleRaw;
+ def int_SI_image_sample_lz_o : SampleRaw;
+ def int_SI_image_sample_cd_o : SampleRaw;
+ def int_SI_image_sample_cd_cl_o : SampleRaw;
+
+ // Sample with comparison and offsets
+ def int_SI_image_sample_c_o : SampleRaw;
+ def int_SI_image_sample_c_cl_o : SampleRaw;
+ def int_SI_image_sample_c_d_o : SampleRaw;
+ def int_SI_image_sample_c_d_cl_o : SampleRaw;
+ def int_SI_image_sample_c_l_o : SampleRaw;
+ def int_SI_image_sample_c_b_o : SampleRaw;
+ def int_SI_image_sample_c_b_cl_o : SampleRaw;
+ def int_SI_image_sample_c_lz_o : SampleRaw;
+ def int_SI_image_sample_c_cd_o : SampleRaw;
+ def int_SI_image_sample_c_cd_cl_o : SampleRaw;
+
+ // Basic gather4
+ def int_SI_gather4 : SampleRaw;
+ def int_SI_gather4_cl : SampleRaw;
+ def int_SI_gather4_l : SampleRaw;
+ def int_SI_gather4_b : SampleRaw;
+ def int_SI_gather4_b_cl : SampleRaw;
+ def int_SI_gather4_lz : SampleRaw;
+
+ // Gather4 with comparison
+ def int_SI_gather4_c : SampleRaw;
+ def int_SI_gather4_c_cl : SampleRaw;
+ def int_SI_gather4_c_l : SampleRaw;
+ def int_SI_gather4_c_b : SampleRaw;
+ def int_SI_gather4_c_b_cl : SampleRaw;
+ def int_SI_gather4_c_lz : SampleRaw;
+
+ // Gather4 with offsets
+ def int_SI_gather4_o : SampleRaw;
+ def int_SI_gather4_cl_o : SampleRaw;
+ def int_SI_gather4_l_o : SampleRaw;
+ def int_SI_gather4_b_o : SampleRaw;
+ def int_SI_gather4_b_cl_o : SampleRaw;
+ def int_SI_gather4_lz_o : SampleRaw;
+
+ // Gather4 with comparison and offsets
+ def int_SI_gather4_c_o : SampleRaw;
+ def int_SI_gather4_c_cl_o : SampleRaw;
+ def int_SI_gather4_c_l_o : SampleRaw;
+ def int_SI_gather4_c_b_o : SampleRaw;
+ def int_SI_gather4_c_b_cl_o : SampleRaw;
+ def int_SI_gather4_c_lz_o : SampleRaw;
+
+ def int_SI_getlod : SampleRaw;
+
+ // Image intrinsics.
+ def int_SI_image_load : Image;
+ def int_SI_image_load_mip : Image;
+ def int_SI_getresinfo : Image;
+
+ /* Interpolation Intrinsics */
+
+ def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+} // End TargetPrefix = "SI", isTarget = 1
+
+let TargetPrefix = "amdgcn", isTarget = 1 in {
+ // Emit 2.5 ulp, no denormal division. Should only be inserted by
+ // pass based on !fpmath metadata.
+ def int_amdgcn_fdiv_fast : Intrinsic<
+ [llvm_float_ty], [llvm_float_ty], [IntrNoMem]
+ >;
+
+ /* Control flow Intrinsics */
+
+ def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>;
+ def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
+ def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+ def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+ def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+ def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
+ def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
new file mode 100644
index 000000000000..99fe96c0be22
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -0,0 +1,531 @@
+//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with close by immediate offsets.
+// This will fuse operations such as
+// ds_read_b32 v0, v2 offset:16
+// ds_read_b32 v1, v2 offset:32
+// ==>
+// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
+//
+//
+// Future improvements:
+//
+// - This currently relies on the scheduler to place loads and stores next to
+// each other, and then only merges adjacent pairs of instructions. It would
+// be good to be more flexible with interleaved instructions, and possibly run
+// before scheduling. It is currently missing stores of constants because loading
+// the constant into the data register is placed between the stores, although
+// this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+// one pair, and recomputes live intervals and moves on to the next pair. It
+// would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+// cluster of loads have offsets that are too large to fit in the 8-bit
+// offsets, but are close enough to fit in the 8 bits, we can add to the base
+// pointer and use the new reduced offsets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-load-store-opt"
+
+namespace {
+
+/// Machine function pass that tries to fuse DS instructions with close-by
+/// immediate offsets into their two-element (read2 / write2) forms.
+class SILoadStoreOptimizer : public MachineFunctionPass {
+private:
+  // Target and analysis handles used by the merge helpers. Both constructors
+  // start them out as null so they never hold an indeterminate value.
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  AliasAnalysis *AA;
+
+  /// Returns true if two byte offsets can be encoded together in one
+  /// two-element DS instruction with elements of \p EltSize bytes.
+  static bool offsetsCanBeCombined(unsigned Offset0,
+                                   unsigned Offset1,
+                                   unsigned EltSize);
+
+  /// Looks for an instruction after \p I that \p I can be merged with.
+  /// Instructions that have to be moved below the merged instruction are
+  /// collected in \p InstsToMove.
+  MachineBasicBlock::iterator findMatchingDSInst(
+    MachineBasicBlock::iterator I,
+    unsigned EltSize,
+    SmallVectorImpl<MachineInstr*> &InstsToMove);
+
+  /// Merges the pair of reads \p I and \p Paired into a single read2.
+  MachineBasicBlock::iterator mergeRead2Pair(
+    MachineBasicBlock::iterator I,
+    MachineBasicBlock::iterator Paired,
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
+
+  /// Merges the pair of writes \p I and \p Paired into a single write2.
+  MachineBasicBlock::iterator mergeWrite2Pair(
+    MachineBasicBlock::iterator I,
+    MachineBasicBlock::iterator Paired,
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
+
+public:
+  static char ID;
+
+  SILoadStoreOptimizer()
+      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
+        AA(nullptr) {}
+
+  // The target machine argument is not used by the constructor itself. The
+  // pointer members are nulled here as well, matching the default
+  // constructor, instead of being left uninitialized.
+  SILoadStoreOptimizer(const TargetMachine &TM_)
+      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
+        AA(nullptr) {
+    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool optimizeBlock(MachineBasicBlock &MBB);
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
+
+  // The pass keeps the CFG intact and needs alias analysis results.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<AAResultsWrapperPass>();
+
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
+ "SI Load / Store Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
+ "SI Load / Store Optimizer", false, false)
+
+char SILoadStoreOptimizer::ID = 0;
+
+char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
+
+// Factory for the pass: returns a newly allocated SILoadStoreOptimizer
+// constructed from TM.
+FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
+ return new SILoadStoreOptimizer(TM);
+}
+
+/// Relocate every instruction in \p InstsToMove so that it ends up after
+/// \p I, keeping the relative order given by the list.
+static void moveInstsAfter(MachineBasicBlock::iterator I,
+                           ArrayRef<MachineInstr*> InstsToMove) {
+  MachineBasicBlock *Parent = I->getParent();
+  // Each instruction is re-inserted in front of whatever originally followed
+  // I, so the moved instructions keep their list order.
+  MachineBasicBlock::iterator InsertPt = std::next(I);
+  for (MachineInstr *Moved : InstsToMove) {
+    Moved->removeFromParent();
+    Parent->insert(InsertPt, Moved);
+  }
+}
+
+/// Append a pointer to each operand in MI.defs() to \p Defs.
+static void addDefsToList(const MachineInstr &MI,
+                          SmallVectorImpl<const MachineOperand *> &Defs) {
+  for (const MachineOperand &DefOp : MI.defs())
+    Defs.push_back(&DefOp);
+}
+
+/// Returns true if the memory accesses \p A and \p B may be swapped: either
+/// the target reports them as trivially disjoint, or neither of them may
+/// store.
+static bool memAccessesCanBeReordered(
+    MachineBasicBlock::iterator A,
+    MachineBasicBlock::iterator B,
+    const SIInstrInfo *TII,
+    llvm::AliasAnalysis * AA) {
+  if (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA))
+    return true;
+
+  // RAW, WAR and WAW pairs cannot be reordered; only RAR is safe, i.e. both
+  // accesses must be free of stores.
+  return !A->mayStore() && !B->mayStore();
+}
+
+// If MI reads the register of any operand already recorded in Defs, record MI
+// in Insts and its own defs in Defs, and return true. Otherwise leave both
+// lists untouched and return false.
+static bool
+addToListsIfDependent(MachineInstr &MI,
+                      SmallVectorImpl<const MachineOperand *> &Defs,
+                      SmallVectorImpl<MachineInstr*> &Insts) {
+  bool ReadsTrackedDef = false;
+  for (const MachineOperand *Tracked : Defs) {
+    if (MI.readsVirtualRegister(Tracked->getReg())) {
+      ReadsTrackedDef = true;
+      break;
+    }
+  }
+
+  if (!ReadsTrackedDef)
+    return false;
+
+  // MI uses a tracked def between I and the instruction that I will
+  // potentially be merged with, so MI has to be moved after the merged
+  // instructions as well.
+  Insts.push_back(&MI);
+  addDefsToList(MI, Defs);
+  return true;
+}
+
+/// Returns true if every memory-touching instruction in \p InstsToMove can be
+/// reordered with respect to \p MemOp. Instructions that neither load nor
+/// store are ignored.
+static bool
+canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+                        ArrayRef<MachineInstr*> InstsToMove,
+                        const SIInstrInfo *TII,
+                        AliasAnalysis *AA) {
+
+  assert(MemOp.mayLoadOrStore());
+
+  for (MachineInstr *Candidate : InstsToMove) {
+    const bool TouchesMemory = Candidate->mayLoadOrStore();
+    if (TouchesMemory &&
+        !memAccessesCanBeReordered(MemOp, *Candidate, TII, AA))
+      return false;
+  }
+  return true;
+}
+
+/// Decide whether two byte offsets can be encoded in a single two-element DS
+/// instruction whose elements are \p Size bytes wide.
+bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
+                                                unsigned Offset1,
+                                                unsigned Size) {
+  // XXX - Identical offsets are rejected; it is unclear whether that case can
+  // occur or would be useful.
+  if (Offset0 == Offset1)
+    return false;
+
+  // Both byte offsets must be a multiple of the element size.
+  const bool BothAligned = (Offset0 % Size == 0) && (Offset1 % Size == 0);
+  if (!BothAligned)
+    return false;
+
+  const unsigned Elt0 = Offset0 / Size;
+  const unsigned Elt1 = Offset1 / Size;
+
+  // Plain form: both element offsets fit in the 8-bit offset fields.
+  if (isUInt<8>(Elt0) && isUInt<8>(Elt1))
+    return true;
+
+  // Stride-64 form: both element offsets must be multiples of 64 and fit in
+  // 8 bits once divided by 64.
+  const bool BothStride64 = (Elt0 % 64 == 0) && (Elt1 % 64 == 0);
+  return BothStride64 && isUInt<8>(Elt0 / 64) && isUInt<8>(Elt1 / 64);
+}
+
+// Scan forward from I, within I's basic block, for an instruction with the
+// same opcode whose addr operand uses the same register and subregister and
+// whose offset can be combined with I's (see offsetsCanBeCombined). Returns
+// an iterator to that instruction, or the block's end() if none is found.
+// Instructions between the two that must be moved below the merged
+// instruction are appended to InstsToMove.
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
+ unsigned EltSize,
+ SmallVectorImpl<MachineInstr*> &InstsToMove) {
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator MBBI = I;
+ ++MBBI;
+
+ SmallVector<const MachineOperand *, 8> DefsToMove;
+ addDefsToList(*I, DefsToMove);
+
+ for ( ; MBBI != E; ++MBBI) {
+
+ if (MBBI->getOpcode() != I->getOpcode()) {
+
+ // This is not a matching DS instruction, but we can keep looking as
+ // long as one of these conditions is met:
+ // 1. It is safe to move I down past MBBI.
+ // 2. It is safe to move MBBI down past the instruction that I will
+ // be merged into.
+
+ if (MBBI->hasUnmodeledSideEffects())
+ // We can't re-order this instruction with respect to other memory
+ // operations, so we fail both conditions mentioned above.
+ return E;
+
+ if (MBBI->mayLoadOrStore() &&
+ !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
+ // We fail condition #1, but we may still be able to satisfy condition
+ // #2. Add this instruction to the move list and then we will check
+ // if condition #2 holds once we have selected the matching instruction.
+ InstsToMove.push_back(&*MBBI);
+ addDefsToList(*MBBI, DefsToMove);
+ continue;
+ }
+
+ // When we match I with another DS instruction we will be moving I down
+ // to the location of the matched instruction; any uses of I will need to
+ // be moved down as well.
+ addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
+ continue;
+ }
+
+ // Don't merge volatiles.
+ if (MBBI->hasOrderedMemoryRef())
+ return E;
+
+ // Handle a case like
+ // DS_WRITE_B32 addr, v, idx0
+ // w = DS_READ_B32 addr, idx0
+ // DS_WRITE_B32 addr, f(w), idx1
+ // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
+ // merging of the two writes.
+ if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
+ continue;
+
+ int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+ const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+ const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+ // Check same base pointer. Be careful of subregisters, which can occur with
+ // vectors of pointers.
+ if (AddrReg0.getReg() == AddrReg1.getReg() &&
+ AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+ AMDGPU::OpName::offset);
+ unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+ unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+ // Check both offsets fit in the reduced range.
+ // We also need to go through the list of instructions that we plan to
+ // move and make sure they are all safe to move down past the merged
+ // instruction.
+ if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
+ canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
+ return MBBI;
+ }
+
+ // We've found a load/store that we couldn't merge for some reason.
+ // We could potentially keep looking, but we'd need to make sure that
+ // it was safe to move I and also all the instructions in InstsToMove
+ // down past this instruction.
+ if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) || // check if we can move I across MBBI
+ !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+ )
+ break;
+ }
+ return E;
+}
+
+// Replace the instructions I and Paired with a single read2 (or read2st64)
+// inserted before Paired. The combined result is written to a new virtual
+// register and copied back into the two original destination operands; the
+// instructions in InstsToMove are then moved after the second copy. Both
+// original instructions are erased, and the iterator that followed I (taken
+// before the erase) is returned.
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove) {
+ MachineBasicBlock *MBB = I->getParent();
+
+ // Be careful, since the addresses could be subregisters themselves in weird
+ // cases, like vectors of pointers.
+ const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+
+ const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
+ const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);
+
+ unsigned Offset0
+ = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+ unsigned Offset1
+ = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
+
+ unsigned NewOffset0 = Offset0 / EltSize;
+ unsigned NewOffset1 = Offset1 / EltSize;
+ unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
+
+ // Prefer the st64 form if we can use it, even if we can fit the offset in the
+ // non st64 version. I'm not sure if there's any real reason to do this.
+ bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
+ if (UseST64) {
+ NewOffset0 /= 64;
+ NewOffset1 /= 64;
+ Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
+ }
+
+ unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+ unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+
+ if (NewOffset0 > NewOffset1) {
+ // Canonicalize the merged instruction so the smaller offset comes first.
+ std::swap(NewOffset0, NewOffset1);
+ std::swap(SubRegIdx0, SubRegIdx1);
+ }
+
+ assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
+ (NewOffset0 != NewOffset1) &&
+ "Computed offset doesn't fit");
+
+ const MCInstrDesc &Read2Desc = TII->get(Opc);
+
+ const TargetRegisterClass *SuperRC
+ = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+ unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+
+ DebugLoc DL = I->getDebugLoc();
+ MachineInstrBuilder Read2
+ = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
+ .addOperand(*AddrReg) // addr
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addImm(0) // gds
+ .addMemOperand(*I->memoperands_begin())
+ .addMemOperand(*Paired->memoperands_begin());
+ (void)Read2;
+
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+
+ // Copy to the old destination registers.
+ BuildMI(*MBB, Paired, DL, CopyDesc)
+ .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
+ .addOperand(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(I);
+ I->eraseFromParent();
+ Paired->eraseFromParent();
+
+ DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
+ return Next;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
+    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired,
+    unsigned EltSize, ArrayRef<MachineInstr *> InstsToMove) {
+  // Replaces the two DS writes at I and Paired with a single write2 (or
+  // write2st64) instruction inserted in front of Paired, and returns the
+  // iterator that followed I before both originals were erased.
+  const bool Is32Bit = (EltSize == 4);
+  MachineBasicBlock *Block = I->getParent();
+
+  // These operands are forwarded with .addOperand() instead of .addReg() so
+  // that the subregister index and any register flags are preserved.
+  const MachineOperand *BaseAddr =
+      TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+  const MachineOperand *FirstData =
+      TII->getNamedOperand(*I, AMDGPU::OpName::data0);
+  const MachineOperand *SecondData =
+      TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
+
+  // Only the low 16 bits of each offset immediate are used.
+  const unsigned ByteOffset0 =
+      TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+  const unsigned ByteOffset1 =
+      TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
+
+  // Scale the byte offsets down by the element size.
+  unsigned EltOffset0 = ByteOffset0 / EltSize;
+  unsigned EltOffset1 = ByteOffset1 / EltSize;
+
+  // Prefer the st64 form if we can use it, even if we can fit the offset in the
+  // non st64 version. I'm not sure if there's any real reason to do this.
+  unsigned MergedOpc;
+  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64 == 0)) {
+    EltOffset0 /= 64;
+    EltOffset1 /= 64;
+    MergedOpc = Is32Bit ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+  } else {
+    MergedOpc = Is32Bit ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
+  }
+
+  // Canonicalize the merged instruction so the smaller offset comes first;
+  // the data operands have to follow their offsets.
+  if (EltOffset0 > EltOffset1) {
+    std::swap(EltOffset0, EltOffset1);
+    std::swap(FirstData, SecondData);
+  }
+
+  assert((isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) &&
+         (EltOffset0 != EltOffset1) &&
+         "Computed offset doesn't fit");
+
+  DebugLoc DL = I->getDebugLoc();
+  const MCInstrDesc &MergedDesc = TII->get(MergedOpc);
+
+  MachineInstrBuilder Write2
+    = BuildMI(*Block, Paired, DL, MergedDesc)
+        .addOperand(*BaseAddr)   // addr
+        .addOperand(*FirstData)  // data0
+        .addOperand(*SecondData) // data1
+        .addImm(EltOffset0)      // offset0
+        .addImm(EltOffset1)      // offset1
+        .addImm(0)               // gds
+        .addMemOperand(*I->memoperands_begin())
+        .addMemOperand(*Paired->memoperands_begin());
+
+  moveInstsAfter(Write2, InstsToMove);
+
+  // Capture the successor of I before the two original writes are erased.
+  MachineBasicBlock::iterator Following = std::next(I);
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
+  return Following;
+}
+
+// Walk one basic block and merge pairs of single-element LDS reads or writes
+// that use constant offsets from the same base register. We rely on the
+// scheduler to do the hard work of clustering nearby loads, and assume these
+// are all adjacent. Returns true if any pair was merged.
+bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  MachineBasicBlock::iterator It = MBB.begin();
+  const MachineBasicBlock::iterator End = MBB.end();
+  while (It != End) {
+    MachineInstr &Cur = *It;
+
+    // Don't combine if volatile.
+    if (Cur.hasOrderedMemoryRef()) {
+      ++It;
+      continue;
+    }
+
+    SmallVector<MachineInstr*, 8> InstsToMove;
+    switch (Cur.getOpcode()) {
+    case AMDGPU::DS_READ_B32:
+    case AMDGPU::DS_READ_B64: {
+      const unsigned Size = (Cur.getOpcode() == AMDGPU::DS_READ_B64) ? 8 : 4;
+      MachineBasicBlock::iterator Match =
+          findMatchingDSInst(It, Size, InstsToMove);
+      if (Match == End) {
+        ++It;
+        break;
+      }
+      Changed = true;
+      // The merge helper hands back the position to resume scanning from.
+      It = mergeRead2Pair(It, Match, Size, InstsToMove);
+      break;
+    }
+    case AMDGPU::DS_WRITE_B32:
+    case AMDGPU::DS_WRITE_B64: {
+      const unsigned Size = (Cur.getOpcode() == AMDGPU::DS_WRITE_B64) ? 8 : 4;
+      MachineBasicBlock::iterator Match =
+          findMatchingDSInst(It, Size, InstsToMove);
+      if (Match == End) {
+        ++It;
+        break;
+      }
+      Changed = true;
+      It = mergeWrite2Pair(It, Match, Size, InstsToMove);
+      break;
+    }
+    default:
+      ++It;
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+// Pass entry point: caches the target/analysis handles used by the helpers
+// and runs optimizeBlock over every basic block of MF. Returns true if any
+// block was changed.
+bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const SISubtarget &Subtarget = MF.getSubtarget<SISubtarget>();
+  if (!Subtarget.loadStoreOptEnabled())
+    return false;
+
+  TII = Subtarget.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+
+  bool AnyChange = false;
+  for (MachineBasicBlock &Block : MF) {
+    // Every block is visited regardless of earlier results.
+    if (optimizeBlock(Block))
+      AnyChange = true;
+  }
+
+  return AnyChange;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
new file mode 100644
index 000000000000..7ed18f27e591
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -0,0 +1,468 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers the pseudo control flow instructions to real
+/// machine instructions.
+///
+/// All control flow is handled using predicated instructions and
+/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
+/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
+/// by writing to the 64-bit EXEC register (each bit corresponds to a
+/// single vector ALU). Typically, for predicates, a vector ALU will write
+/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
+/// Vector ALU) and then the ScalarALU will AND the VCC register with the
+/// EXEC to update the predicates.
+///
+/// For example:
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
+/// %SGPR0 = SI_IF %VCC
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
+/// %SGPR0 = SI_ELSE %SGPR0
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
+/// SI_END_CF %SGPR0
+///
+/// becomes:
+///
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label0 // This instruction is an optional
+/// // optimization which allows us to
+/// // branch if all the bits of
+/// // EXEC are zero.
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+///
+/// label0:
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label1 // Use our branch optimization
+/// // instruction again.
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the THEN block
+/// label1:
+/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-lower-control-flow"
+
+namespace {
+
+/// Machine function pass that rewrites the SI control flow pseudo
+/// instructions (SI_IF, SI_ELSE, SI_BREAK, SI_IF_BREAK, SI_ELSE_BREAK,
+/// SI_LOOP, SI_END_CF) and combines redundant S_AND_B64 / S_OR_B64 mask
+/// operations.
+class SILowerControlFlow : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SILowerControlFlow() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI Lower control flow pseudo instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Should preserve the same set that TwoAddressInstructions does.
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(LiveVariablesID);
+    AU.addPreservedID(MachineLoopInfoID);
+    AU.addPreservedID(MachineDominatorsID);
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  // Handles cached by runOnMachineFunction. LIS is looked up with
+  // getAnalysisIfAvailable and is therefore null-checked before every use.
+  const SIRegisterInfo *TRI = nullptr;
+  const SIInstrInfo *TII = nullptr;
+  LiveIntervals *LIS = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+
+  // One expansion routine per control flow pseudo.
+  void emitIf(MachineInstr &MI);
+  void emitElse(MachineInstr &MI);
+  void emitBreak(MachineInstr &MI);
+  void emitIfBreak(MachineInstr &MI);
+  void emitElseBreak(MachineInstr &MI);
+  void emitLoop(MachineInstr &MI);
+  void emitEndCf(MachineInstr &MI);
+
+  // Helpers for cleaning up chained mask operations.
+  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                        SmallVectorImpl<MachineOperand> &Src) const;
+
+  void combineMasks(MachineInstr &MI);
+};
+
+} // End anonymous namespace
+
+char SILowerControlFlow::ID = 0;
+
+INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
+ "SI lower control flow", false, false)
+
+// Sets the dead flag on operand 3 of MI, which is asserted to be a def of SCC.
+static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
+  MachineOperand &SCCDef = MI.getOperand(3);
+  assert(SCCDef.getReg() == AMDGPU::SCC && SCCDef.isDef());
+
+  SCCDef.setIsDead(IsDead);
+}
+
+char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
+
+void SILowerControlFlow::emitIf(MachineInstr &MI) { // Expands an SI_IF pseudo in place, then erases it.
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
+
+ MachineOperand &SaveExec = MI.getOperand(0); // Result register; written by the S_XOR_B64 below.
+ MachineOperand &Cond = MI.getOperand(1); // Condition mask that gets ANDed with the exec copy.
+ assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
+ Cond.getSubReg() == AMDGPU::NoSubRegister);
+
+ unsigned SaveExecReg = SaveExec.getReg();
+
+ MachineOperand &ImpDefSCC = MI.getOperand(4); // Its dead flag is carried over to the XOR.
+ assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
+
+ // Add an implicit def of exec to discourage scheduling VALU after this which
+ // will interfere with trying to form s_and_saveexec_b64 later.
+ unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstr *CopyExec =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
+
+ unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ MachineInstr *And = // Tmp = CopyReg & Cond
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
+ .addReg(CopyReg)
+ //.addReg(AMDGPU::EXEC)
+ .addReg(Cond.getReg());
+ setImpSCCDefDead(*And, true);
+
+ MachineInstr *Xor = // SaveExecReg = Tmp ^ CopyReg
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+ .addReg(Tmp)
+ .addReg(CopyReg);
+ setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+
+ // Use a copy that is a terminator to get correct spill code placement with
+ // fast regalloc.
+ MachineInstr *SetExec =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
+ .addReg(Tmp, RegState::Kill);
+
+ // Insert a pseudo terminator to help keep the verifier happy. This will also
+ // be used later when inserting skips.
+ MachineInstr *NewBr =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2));
+
+ if (!LIS) { // No live intervals to maintain: just drop the pseudo.
+ MI.eraseFromParent();
+ return;
+ }
+
+ LIS->InsertMachineInstrInMaps(*CopyExec);
+
+ // Replace the pseudo with the AND in the maps so we don't need to fix the
+ // live interval for the condition register.
+ LIS->ReplaceMachineInstrInMaps(MI, *And);
+
+ LIS->InsertMachineInstrInMaps(*Xor);
+ LIS->InsertMachineInstrInMaps(*SetExec);
+ LIS->InsertMachineInstrInMaps(*NewBr);
+
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); // Drop the cached EXEC reg unit liveness.
+ MI.eraseFromParent();
+
+ // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
+ // hard to add another def here but I'm not sure how to correctly update the
+ // valno.
+ LIS->removeInterval(SaveExecReg);
+ LIS->createAndComputeVirtRegInterval(SaveExecReg);
+ LIS->createAndComputeVirtRegInterval(Tmp);
+ LIS->createAndComputeVirtRegInterval(CopyReg);
+}
+
+void SILowerControlFlow::emitElse(MachineInstr &MI) { // Expands an SI_ELSE pseudo, then erases it.
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
+
+ bool ExecModified = MI.getOperand(3).getImm() != 0; // Non-zero immediate selects the extra S_AND_B64 below.
+ MachineBasicBlock::iterator Start = MBB.begin(); // The copy and the saveexec go at the top of the block.
+
+ // We are running before TwoAddressInstructions, and si_else's operands are
+ // tied. In order to correctly tie the registers, split this into a copy of
+ // the src like it does.
+ unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
+ .addOperand(MI.getOperand(1)); // Saved EXEC
+
+ // This must be inserted before phis and any spill code inserted before the
+ // else.
+ unsigned SaveReg = ExecModified ?
+ MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
+ MachineInstr *OrSaveExec =
+ BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
+ .addReg(CopyReg);
+
+ MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
+
+ MachineBasicBlock::iterator ElsePt(MI); // The remaining instructions go where the pseudo was.
+
+ if (ExecModified) {
+ MachineInstr *And = // DstReg = EXEC & SaveReg
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
+ .addReg(AMDGPU::EXEC)
+ .addReg(SaveReg);
+
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*And);
+ }
+
+ MachineInstr *Xor = // EXEC = EXEC ^ DstReg (terminator form)
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(DstReg);
+
+ MachineInstr *Branch =
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addMBB(DestBB);
+
+ if (!LIS) { // No live intervals to maintain: just drop the pseudo.
+ MI.eraseFromParent();
+ return;
+ }
+
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+
+ LIS->InsertMachineInstrInMaps(*OrSaveExec);
+
+ LIS->InsertMachineInstrInMaps(*Xor);
+ LIS->InsertMachineInstrInMaps(*Branch);
+
+ // The src reg is tied to the dst reg, so rebuild the dst interval from scratch.
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(CopyReg);
+ if (ExecModified)
+ LIS->createAndComputeVirtRegInterval(SaveReg);
+
+ // Let the EXEC reg unit liveness be recomputed.
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+}
+
+// Expands SI_BREAK into "dst = S_OR_B64 exec, src" and erases the pseudo.
+void SILowerControlFlow::emitBreak(MachineInstr &MI) {
+  MachineBasicBlock &Block = *MI.getParent();
+  const DebugLoc &Loc = MI.getDebugLoc();
+  const unsigned ResultReg = MI.getOperand(0).getReg();
+
+  MachineInstr *OrMI =
+    BuildMI(Block, &MI, Loc, TII->get(AMDGPU::S_OR_B64), ResultReg)
+      .addReg(AMDGPU::EXEC)
+      .addOperand(MI.getOperand(1));
+
+  // When live intervals are tracked, the OR takes over the pseudo's slot.
+  if (LIS != nullptr)
+    LIS->ReplaceMachineInstrInMaps(MI, *OrMI);
+
+  MI.eraseFromParent();
+}
+
+// SI_IF_BREAK is lowered by retargeting the instruction to S_OR_B64 in place;
+// its operands are left untouched.
+void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
+  const MCInstrDesc &OrDesc = TII->get(AMDGPU::S_OR_B64);
+  MI.setDesc(OrDesc);
+}
+
+// SI_ELSE_BREAK is lowered by retargeting the instruction to S_OR_B64 in
+// place; its operands are left untouched.
+void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
+  const MCInstrDesc &OrDesc = TII->get(AMDGPU::S_OR_B64);
+  MI.setDesc(OrDesc);
+}
+
+// Expands SI_LOOP into "exec = S_ANDN2_B64_term exec, op0" followed by an
+// S_CBRANCH_EXECNZ to op1, then erases the pseudo.
+void SILowerControlFlow::emitLoop(MachineInstr &MI) {
+  MachineBasicBlock &Block = *MI.getParent();
+  const DebugLoc &Loc = MI.getDebugLoc();
+
+  MachineInstr *MaskUpdate =
+    BuildMI(Block, &MI, Loc, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC)
+      .addOperand(MI.getOperand(0));
+
+  MachineInstr *BackBranch =
+    BuildMI(Block, &MI, Loc, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+      .addOperand(MI.getOperand(1));
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  // The mask update inherits the pseudo's slot; the branch gets a new one.
+  LIS->ReplaceMachineInstrInMaps(MI, *MaskUpdate);
+  LIS->InsertMachineInstrInMaps(*BackBranch);
+  MI.eraseFromParent();
+}
+
+// Expands SI_END_CF into "exec = S_OR_B64 exec, op0", inserted at the very
+// beginning of the block, and erases the pseudo.
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
+  MachineBasicBlock &Block = *MI.getParent();
+  const DebugLoc &Loc = MI.getDebugLoc();
+
+  MachineInstr *RestoreExec =
+    BuildMI(Block, Block.begin(), Loc, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC)
+      .addOperand(MI.getOperand(0));
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  // The new instruction first takes over the pseudo's slot, and is then
+  // reported as moved because it actually sits at the top of the block.
+  LIS->ReplaceMachineInstrInMaps(MI, *RestoreExec);
+  MI.eraseFromParent();
+  LIS->handleMove(*RestoreExec);
+}
+
+// Appends to Src the operands that can replace operand OpNo of the logical
+// operation MI:
+//  - the operand itself when it is not a virtual register (e.g. exec);
+//  - otherwise the source operands of its unique defining instruction,
+//    provided that definition is in the same block, is either a full copy or
+//    has the same opcode as MI, and exec is not modified in between.
+// Nothing is appended when the definition does not qualify.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+    SmallVectorImpl<MachineOperand> &Src) const {
+  MachineOperand &Op = MI.getOperand(OpNo);
+  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    Src.push_back(Op);
+    return;
+  }
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getParent() != MI.getParent() ||
+      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+    return;
+
+  // Make sure we do not modify exec between def and use.
+  // A copy with implicitly defined exec inserted earlier is an exclusion, it
+  // does not really modify exec.
+  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+        !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+      return;
+
+  for (const auto &SrcOp : Def->explicit_operands()) {
+    // isUse() and getReg() are register-only accessors, so non-register
+    // operands must be handled before they are queried.
+    if (!SrcOp.isReg()) {
+      Src.push_back(SrcOp);
+      continue;
+    }
+
+    if (SrcOp.isUse() &&
+        (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+         SrcOp.getReg() == AMDGPU::EXEC))
+      Src.push_back(SrcOp);
+  }
+}
+
+// Search and combine pairs of equivalent instructions, like
+// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+// S_OR_B64 x, (S_OR_B64 x, y) => S_OR_B64 x, y
+// One of the operands is exec mask.
+//
+// Ops is filled with three operands: the stand-in for the operand MI keeps,
+// plus the two sources of the nested operation being folded. If any two of
+// them are identical, the nested operand of MI is replaced by the source that
+// still has to be applied, and the nested instruction is erased once unused.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+  assert(MI.getNumExplicitOperands() == 3);
+  SmallVector<MachineOperand, 4> Ops;
+  unsigned OpToReplace = 1;
+  findMaskOperands(MI, 1, Ops);
+  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+  findMaskOperands(MI, 2, Ops);
+  if (Ops.size() != 3) return;
+
+  // Layout of Ops:
+  //   OpToReplace == 2: [kept, nested0, nested1]
+  //   OpToReplace == 1: [nested0, nested1, kept]
+  // The operand appended below must, combined with the kept operand, give
+  // the same value as the original two-level expression. When the nested
+  // operation is the first operand, the cases "nested0 == nested1" and
+  // "nested1 == kept" therefore have to pick nested0 (index 0); picking the
+  // kept operand again would turn "op (op a, y), y" into "op y, y".
+  const bool NestedIsFirst = (OpToReplace == 1);
+
+  unsigned UniqueOpndIdx;
+  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = NestedIsFirst ? 0 : 2;
+  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = NestedIsFirst ? 0 : 1;
+  else return;
+
+  unsigned Reg = MI.getOperand(OpToReplace).getReg();
+  MI.RemoveOperand(OpToReplace);
+  MI.addOperand(Ops[UniqueOpndIdx]);
+  if (MRI->use_empty(Reg))
+    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
+}
+
+bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { // Pass entry point; always reports the function as modified.
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+ MRI = &MF.getRegInfo();
+
+ MachineFunction::iterator NextBB;
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; BI = NextBB) {
+ NextBB = std::next(BI);
+ MachineBasicBlock &MBB = *BI;
+
+ MachineBasicBlock::iterator I, Next, Last; // Last: most recent instruction visited without being lowered.
+
+ for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+ Next = std::next(I); // Fetched up front because the emit* helpers erase MI.
+ MachineInstr &MI = *I;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_IF:
+ emitIf(MI);
+ break;
+
+ case AMDGPU::SI_ELSE:
+ emitElse(MI);
+ break;
+
+ case AMDGPU::SI_BREAK:
+ emitBreak(MI);
+ break;
+
+ case AMDGPU::SI_IF_BREAK:
+ emitIfBreak(MI);
+ break;
+
+ case AMDGPU::SI_ELSE_BREAK:
+ emitElseBreak(MI);
+ break;
+
+ case AMDGPU::SI_LOOP:
+ emitLoop(MI);
+ break;
+
+ case AMDGPU::SI_END_CF:
+ emitEndCf(MI);
+ break;
+
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64:
+ // Cleanup bit manipulations on exec mask
+ combineMasks(MI);
+ Last = I;
+ continue; // Skips the replay step below; the loop advances to Next.
+
+ default:
+ Last = I;
+ continue; // Skips the replay step below; the loop advances to Next.
+ }
+
+ // A pseudo was just lowered: rewind to Last (or to the block start if
+ // nothing was recorded yet) so the new code is visited by combineMasks.
+ Next = (Last == MBB.end()) ? MBB.begin() : Last;
+ }
+ }
+
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
new file mode 100644
index 000000000000..be2e14fd4623
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -0,0 +1,161 @@
+//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// i1 values are usually inserted by the CFG Structurize pass and they are
+/// unique in that they can be copied from VALU to SALU registers.
+/// This is not possible for any other value type. Since there are no
+/// MOV instructions for i1, we need to use V_CMP_* and V_CNDMASK to move the i1.
+///
+//===----------------------------------------------------------------------===//
+//
+
+#define DEBUG_TYPE "si-i1-copies"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Machine function pass that rewrites COPY instructions between
+/// VReg_1 virtual registers and 64-bit SGPR classes into real instructions.
+class SILowerI1Copies : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SILowerI1Copies() : MachineFunctionPass(ID) {
+    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI Lower i1 Copies";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Only instructions are rewritten; the block structure stays as it is.
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
+ "SI Lower i1 Copies", false, false)
+
+char SILowerI1Copies::ID = 0;
+
+char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;
+
+FunctionPass *llvm::createSILowerI1CopiesPass() { // Factory: heap-allocates a new SILowerI1Copies pass.
+ return new SILowerI1Copies();
+}
+
+// Rewrites copies that involve VReg_1 virtual registers:
+//  - IMPLICIT_DEF of a VReg_1 register: the register is moved to SReg_64.
+//  - COPY from a 64-bit SGPR class into VReg_1: becomes V_MOV_B32 of 0 / -1
+//    when the source is an S_MOV_B64 of an immediate, otherwise a
+//    V_CNDMASK_B32 selecting between 0 and -1.
+//  - COPY from VReg_1 into a 64-bit SGPR class: becomes S_AND_B64 with exec
+//    when the source is a matching V_CNDMASK_B32, otherwise V_CMP_NE_U32
+//    against 0.
+// Destination registers of the second kind are switched to VGPR_32 at the
+// end. Returns true if the function was changed.
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+
+  std::vector<unsigned> I1Defs;
+  bool Changed = false;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      // Fetch the successor first: MI may be erased below.
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
+        unsigned Reg = MI.getOperand(0).getReg();
+        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+        if (RC == &AMDGPU::VReg_1RegClass) {
+          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
+          Changed = true;
+        }
+        continue;
+      }
+
+      if (MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      const MachineOperand &Dst = MI.getOperand(0);
+      const MachineOperand &Src = MI.getOperand(1);
+
+      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
+          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+        continue;
+
+      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+
+      DebugLoc DL = MI.getDebugLoc();
+      // May be null when the source register has no unique definition, so
+      // every pattern match on it below is guarded.
+      MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
+      if (DstRC == &AMDGPU::VReg_1RegClass &&
+          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
+        // Recorded once; the register class is switched after the scan.
+        I1Defs.push_back(Dst.getReg());
+        Changed = true;
+
+        if (DefInst && DefInst->getOpcode() == AMDGPU::S_MOV_B64 &&
+            DefInst->getOperand(1).isImm()) {
+          int64_t Val = DefInst->getOperand(1).getImm();
+          assert(Val == 0 || Val == -1);
+
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
+            .addOperand(Dst)
+            .addImm(Val);
+          MI.eraseFromParent();
+          continue;
+        }
+
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+          .addOperand(Dst)
+          .addImm(0)
+          .addImm(-1)
+          .addOperand(Src);
+        MI.eraseFromParent();
+      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
+                 SrcRC == &AMDGPU::VReg_1RegClass) {
+        if (DefInst &&
+            DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
+            DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
+            DefInst->getOperand(1).getImm() == 0 &&
+            DefInst->getOperand(2).getImm() != 0 &&
+            DefInst->getOperand(3).isReg() &&
+            TargetRegisterInfo::isVirtualRegister(
+              DefInst->getOperand(3).getReg()) &&
+            TRI->getCommonSubClass(
+              MRI.getRegClass(DefInst->getOperand(3).getReg()),
+              &AMDGPU::SGPR_64RegClass)) {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
+            .addOperand(Dst)
+            .addReg(AMDGPU::EXEC)
+            .addOperand(DefInst->getOperand(3));
+        } else {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
+            .addOperand(Dst)
+            .addOperand(Src)
+            .addImm(0);
+        }
+        MI.eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+
+  for (unsigned Reg : I1Defs)
+    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..e911817c451d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -0,0 +1,226 @@
+//===-- SIMachineFunctionInfo.cpp -------- SI Machine Function Info -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+
+#define MAX_LANES 64
+
+using namespace llvm;
+
+// Hidden switch (default on). When it is off, getSpilledReg() returns a
+// default-constructed SpilledReg instead of assigning a VGPR lane.
+static cl::opt<bool> EnableSpillSGPRToVGPR(
+  "amdgpu-spill-sgpr-to-vgpr",
+  cl::desc("Enable spilling SGPRs to VGPRs"),
+  cl::ReallyHidden,
+  cl::init(true));
+
+SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // Resets all state, then enables inputs based on MF's function and subtarget.
+ : AMDGPUMachineFunction(MF),
+ TIDReg(AMDGPU::NoRegister),
+ ScratchRSrcReg(AMDGPU::NoRegister),
+ ScratchWaveOffsetReg(AMDGPU::NoRegister),
+ PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
+ DispatchPtrUserSGPR(AMDGPU::NoRegister),
+ QueuePtrUserSGPR(AMDGPU::NoRegister),
+ KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
+ DispatchIDUserSGPR(AMDGPU::NoRegister),
+ FlatScratchInitUserSGPR(AMDGPU::NoRegister),
+ PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
+ WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
+ PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
+ PSInputAddr(0),
+ ReturnsVoid(true),
+ FlatWorkGroupSizes(0, 0),
+ WavesPerEU(0, 0),
+ DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
+ DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
+ LDSWaveSpillSize(0),
+ PSInputEna(0),
+ NumUserSGPRs(0),
+ NumSystemSGPRs(0),
+ HasSpilledSGPRs(false),
+ HasSpilledVGPRs(false),
+ HasNonSpillStackObjects(false),
+ NumSpilledSGPRs(0),
+ NumSpilledVGPRs(0),
+ PrivateSegmentBuffer(false),
+ DispatchPtr(false),
+ QueuePtr(false),
+ KernargSegmentPtr(false),
+ DispatchID(false),
+ FlatScratchInit(false),
+ GridWorkgroupCountX(false),
+ GridWorkgroupCountY(false),
+ GridWorkgroupCountZ(false),
+ WorkGroupIDX(false),
+ WorkGroupIDY(false),
+ WorkGroupIDZ(false),
+ WorkGroupInfo(false),
+ PrivateSegmentWaveByteOffset(false),
+ WorkItemIDX(false),
+ WorkItemIDY(false),
+ WorkItemIDZ(false) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const Function *F = MF.getFunction();
+
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+ if (!AMDGPU::isShader(F->getCallingConv())) { // Non-shader calling conventions always get these three inputs.
+ KernargSegmentPtr = true;
+ WorkGroupIDX = true;
+ WorkItemIDX = true;
+ }
+
+ if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
+ WorkGroupIDY = true;
+
+ if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
+ WorkGroupIDZ = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
+ WorkItemIDY = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
+ WorkItemIDZ = true;
+
+ // For work item IDs, X, XY, and XYZ are the only supported combinations, so
+ // make sure Y is enabled if Z is.
+ if (WorkItemIDZ)
+ WorkItemIDY = true;
+
+ bool MaySpill = ST.isVGPRSpillingEnabled(*F);
+ bool HasStackObjects = FrameInfo.hasStackObjects();
+
+ if (HasStackObjects || MaySpill)
+ PrivateSegmentWaveByteOffset = true;
+
+ if (ST.isAmdCodeObjectV2()) { // These inputs are only enabled for code object v2.
+ if (HasStackObjects || MaySpill)
+ PrivateSegmentBuffer = true;
+
+ if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
+ DispatchPtr = true;
+
+ if (F->hasFnAttribute("amdgpu-queue-ptr"))
+ QueuePtr = true;
+
+ if (F->hasFnAttribute("amdgpu-dispatch-id"))
+ DispatchID = true;
+ }
+
+ // We don't need to worry about accessing spills with flat instructions.
+ // TODO: On VI where we must use flat for global, we should be able to omit
+ // this if it is never used for generic access.
+ if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&
+ ST.isAmdHsaOS())
+ FlatScratchInit = true;
+
+ FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
+ WavesPerEU = ST.getWavesPerEU(*F);
+}
+
+// Assigns the private segment buffer to the SReg_128 super-register whose
+// sub0 is the next free user SGPR, consumes four user SGPRs, and returns the
+// chosen register.
+unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+    const SIRegisterInfo &TRI) {
+  const unsigned FirstSGPR = getNextUserSGPR();
+  PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
+      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+  NumUserSGPRs += 4;
+  return PrivateSegmentBufferUserSGPR;
+}
+
+// Assigns the dispatch pointer to the SReg_64 super-register whose sub0 is the
+// next free user SGPR, consumes two user SGPRs, and returns the register.
+unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+  const unsigned FirstSGPR = getNextUserSGPR();
+  DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
+      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return DispatchPtrUserSGPR;
+}
+
+// Assigns the queue pointer to the SReg_64 super-register whose sub0 is the
+// next free user SGPR, consumes two user SGPRs, and returns the register.
+unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+  const unsigned FirstSGPR = getNextUserSGPR();
+  QueuePtrUserSGPR = TRI.getMatchingSuperReg(
+      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return QueuePtrUserSGPR;
+}
+
+// Assigns the kernarg segment pointer to the SReg_64 super-register whose sub0
+// is the next free user SGPR, consumes two user SGPRs, and returns the
+// register.
+unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+  const unsigned FirstSGPR = getNextUserSGPR();
+  KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
+      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return KernargSegmentPtrUserSGPR;
+}
+
+// Assigns the dispatch ID to the SReg_64 super-register whose sub0 is the next
+// free user SGPR, consumes two user SGPRs, and returns the register.
+unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
+  const unsigned FirstSGPR = getNextUserSGPR();
+  DispatchIDUserSGPR = TRI.getMatchingSuperReg(
+      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return DispatchIDUserSGPR;
+}
+
+// Assigns the flat scratch init value to the SReg_64 super-register whose sub0
+// is the next free user SGPR, consumes two user SGPRs, and returns the
+// register.
+unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
+  const unsigned FirstSGPR = getNextUserSGPR();
+  FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
+      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return FlatScratchInitUserSGPR;
+}
+
+// Maps the 4-byte slot (FrameIndex, SubIdx) onto a (VGPR, lane) pair. The
+// object's frame offset selects one VGPR per 64 slots and a lane within it.
+// A VGPR is picked the first time its index is needed and is then added as a
+// live-in to every block. Returns a default SpilledReg when the spilling
+// option is disabled; returns a SpilledReg with only the lane filled in when
+// no unused VGPR can be found.
+SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
+    MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx) {
+  if (!EnableSpillSGPRToVGPR)
+    return SpilledReg();
+
+  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Byte offset of the requested 4-byte slot.
+  int64_t Offset = FrameInfo.getObjectOffset(FrameIndex);
+  Offset += SubIdx * 4;
+
+  const unsigned VGPRSlot = Offset / (64 * 4);
+
+  struct SpilledReg Result;
+  Result.Lane = (Offset / 4) % 64;
+
+  if (LaneVGPRs.count(VGPRSlot) == 0) {
+    const unsigned NewVGPR =
+        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, *MF);
+
+    // We have no VGPRs left for spilling SGPRs.
+    if (NewVGPR == AMDGPU::NoRegister)
+      return Result;
+
+    LaneVGPRs[VGPRSlot] = NewVGPR;
+
+    // Add this register as live-in to all blocks to avoid machine verifier
+    // complaining about use of an undefined physical register.
+    for (MachineBasicBlock &Block : *MF)
+      Block.addLiveIn(NewVGPR);
+  }
+
+  Result.VGPR = LaneVGPRs[VGPRSlot];
+  return Result;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
new file mode 100644
index 000000000000..3b4e233cd787
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -0,0 +1,500 @@
+//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
+
+#include "AMDGPUMachineFunction.h"
+#include "SIRegisterInfo.h"
+#include <array>
+#include <map>
+
+namespace llvm {
+
+class MachineRegisterInfo;
+
+/// PseudoSourceValue of kind TargetCustom whose three alias queries all
+/// answer conservatively with \c false.
+/// NOTE(review): presumably attached to memory operands of image
+/// instructions -- confirm against the users of getImagePSV().
+class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
+public:
+ explicit AMDGPUImagePseudoSourceValue() :
+ PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+
+ /// Always reports the memory as non-constant.
+ bool isConstant(const MachineFrameInfo *) const override {
+ // This should probably be true for most images, but we will start by being
+ // conservative.
+ return false;
+ }
+
+ /// Always reports that the value is not aliased.
+ bool isAliased(const MachineFrameInfo *) const override {
+ // FIXME: If we ever change image intrinsics to accept fat pointers, then
+ // this could be true for some cases.
+ return false;
+ }
+
+ /// Always reports that the value cannot alias.
+ bool mayAlias(const MachineFrameInfo*) const override {
+ // FIXME: If we ever change image intrinsics to accept fat pointers, then
+ // this could be true for some cases.
+ return false;
+ }
+};
+
+/// PseudoSourceValue of kind TargetCustom whose three alias queries all
+/// answer conservatively with \c false.
+/// NOTE(review): presumably attached to memory operands of buffer
+/// instructions -- confirm against the users of getBufferPSV().
+/// NOTE(review): the comments inside the methods talk about images and image
+/// intrinsics; they look copied from AMDGPUImagePseudoSourceValue and
+/// presumably should refer to buffers -- confirm.
+class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue {
+public:
+ explicit AMDGPUBufferPseudoSourceValue() :
+ PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+
+ /// Always reports the memory as non-constant.
+ bool isConstant(const MachineFrameInfo *) const override {
+ // This should probably be true for most images, but we will start by being
+ // conservative.
+ return false;
+ }
+
+ /// Always reports that the value is not aliased.
+ bool isAliased(const MachineFrameInfo *) const override {
+ // FIXME: If we ever change image intrinsics to accept fat pointers, then
+ // this could be true for some cases.
+ return false;
+ }
+
+ /// Always reports that the value cannot alias.
+ bool mayAlias(const MachineFrameInfo*) const override {
+ // FIXME: If we ever change image intrinsics to accept fat pointers, then
+ // this could be true for some cases.
+ return false;
+ }
+};
+
+/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
+/// tells the hardware which interpolation parameters to load.
+///
+/// It also records which user and system SGPR inputs have been reserved so
+/// far, the registers reserved for scratch accesses, and spill bookkeeping
+/// for the function.
+class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
+  // FIXME: This should be removed and getPreloadedValue moved here.
+  friend class SIRegisterInfo;
+
+  unsigned TIDReg;
+
+  // Registers that may be reserved for spilling purposes. These may be the same
+  // as the input registers.
+  unsigned ScratchRSrcReg;
+  unsigned ScratchWaveOffsetReg;
+
+  // Input registers setup for the HSA ABI.
+  // User SGPRs in allocation order.
+  unsigned PrivateSegmentBufferUserSGPR;
+  unsigned DispatchPtrUserSGPR;
+  unsigned QueuePtrUserSGPR;
+  unsigned KernargSegmentPtrUserSGPR;
+  unsigned DispatchIDUserSGPR;
+  unsigned FlatScratchInitUserSGPR;
+  unsigned PrivateSegmentSizeUserSGPR;
+  unsigned GridWorkGroupCountXUserSGPR;
+  unsigned GridWorkGroupCountYUserSGPR;
+  unsigned GridWorkGroupCountZUserSGPR;
+
+  // System SGPRs in allocation order.
+  unsigned WorkGroupIDXSystemSGPR;
+  unsigned WorkGroupIDYSystemSGPR;
+  unsigned WorkGroupIDZSystemSGPR;
+  unsigned WorkGroupInfoSystemSGPR;
+  unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
+
+  // Graphics info.
+  unsigned PSInputAddr;
+  bool ReturnsVoid;
+
+  // A pair of default/requested minimum/maximum flat work group sizes.
+  // Minimum - first, maximum - second.
+  std::pair<unsigned, unsigned> FlatWorkGroupSizes;
+
+  // A pair of default/requested minimum/maximum number of waves per execution
+  // unit. Minimum - first, maximum - second.
+  std::pair<unsigned, unsigned> WavesPerEU;
+
+  // Stack object indices for work group IDs.
+  std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices;
+  // Stack object indices for work item IDs.
+  std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;
+
+  AMDGPUBufferPseudoSourceValue BufferPSV;
+  AMDGPUImagePseudoSourceValue ImagePSV;
+
+public:
+  // FIXME: Make private
+  unsigned LDSWaveSpillSize;
+  unsigned PSInputEna;
+  // VGPR used for SGPR spilling, keyed by the index computed in
+  // getSpilledReg().
+  std::map<unsigned, unsigned> LaneVGPRs;
+  unsigned ScratchOffsetReg;
+  unsigned NumUserSGPRs;
+  unsigned NumSystemSGPRs;
+
+private:
+  bool HasSpilledSGPRs;
+  bool HasSpilledVGPRs;
+  bool HasNonSpillStackObjects;
+
+  unsigned NumSpilledSGPRs;
+  unsigned NumSpilledVGPRs;
+
+  // Feature bits required for inputs passed in user SGPRs.
+  bool PrivateSegmentBuffer : 1;
+  bool DispatchPtr : 1;
+  bool QueuePtr : 1;
+  bool KernargSegmentPtr : 1;
+  bool DispatchID : 1;
+  bool FlatScratchInit : 1;
+  bool GridWorkgroupCountX : 1;
+  bool GridWorkgroupCountY : 1;
+  bool GridWorkgroupCountZ : 1;
+
+  // Feature bits required for inputs passed in system SGPRs.
+  bool WorkGroupIDX : 1; // Always initialized.
+  bool WorkGroupIDY : 1;
+  bool WorkGroupIDZ : 1;
+  bool WorkGroupInfo : 1;
+  bool PrivateSegmentWaveByteOffset : 1;
+
+  bool WorkItemIDX : 1; // Always initialized.
+  bool WorkItemIDY : 1;
+  bool WorkItemIDZ : 1;
+
+  /// \returns the first SGPR not yet handed out as a user SGPR. Asserts that
+  /// no system SGPR has been added yet.
+  MCPhysReg getNextUserSGPR() const {
+    assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+    return AMDGPU::SGPR0 + NumUserSGPRs;
+  }
+
+  /// \returns the first SGPR following all user and system SGPRs added so far.
+  MCPhysReg getNextSystemSGPR() const {
+    return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+  }
+
+public:
+  /// Result of getSpilledReg(): a VGPR plus the lane inside it.
+  /// Default-constructed values have neither a register nor a lane.
+  struct SpilledReg {
+    unsigned VGPR;
+    int Lane;
+    SpilledReg(unsigned R, int L) : VGPR(R), Lane(L) {}
+    SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) {}
+    // Pure observers, so they are const-qualified and usable on const values.
+    bool hasLane() const { return Lane != -1; }
+    bool hasReg() const { return VGPR != AMDGPU::NoRegister; }
+  };
+
+  // SIMachineFunctionInfo definition
+
+  SIMachineFunctionInfo(const MachineFunction &MF);
+  SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
+                           unsigned SubIdx);
+  bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }
+  unsigned getTIDReg() const { return TIDReg; }
+  void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+
+  // Add user SGPRs.
+  unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
+  unsigned addDispatchPtr(const SIRegisterInfo &TRI);
+  unsigned addQueuePtr(const SIRegisterInfo &TRI);
+  unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+  unsigned addDispatchID(const SIRegisterInfo &TRI);
+  unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
+
+  // Add system SGPRs. Each call records the next free system SGPR for the
+  // corresponding input, bumps NumSystemSGPRs by one and returns the register.
+  unsigned addWorkGroupIDX() {
+    WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDXSystemSGPR;
+  }
+
+  unsigned addWorkGroupIDY() {
+    WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDYSystemSGPR;
+  }
+
+  unsigned addWorkGroupIDZ() {
+    WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDZSystemSGPR;
+  }
+
+  unsigned addWorkGroupInfo() {
+    WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupInfoSystemSGPR;
+  }
+
+  unsigned addPrivateSegmentWaveByteOffset() {
+    PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return PrivateSegmentWaveByteOffsetSystemSGPR;
+  }
+
+  /// Overrides the recorded register without touching NumSystemSGPRs.
+  void setPrivateSegmentWaveByteOffset(unsigned Reg) {
+    PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+  }
+
+  // Queries for the user SGPR feature bits.
+  bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
+  bool hasDispatchPtr() const { return DispatchPtr; }
+  bool hasQueuePtr() const { return QueuePtr; }
+  bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
+  bool hasDispatchID() const { return DispatchID; }
+  bool hasFlatScratchInit() const { return FlatScratchInit; }
+  bool hasGridWorkgroupCountX() const { return GridWorkgroupCountX; }
+  bool hasGridWorkgroupCountY() const { return GridWorkgroupCountY; }
+  bool hasGridWorkgroupCountZ() const { return GridWorkgroupCountZ; }
+
+  // Queries for the system SGPR feature bits.
+  bool hasWorkGroupIDX() const { return WorkGroupIDX; }
+  bool hasWorkGroupIDY() const { return WorkGroupIDY; }
+  bool hasWorkGroupIDZ() const { return WorkGroupIDZ; }
+  bool hasWorkGroupInfo() const { return WorkGroupInfo; }
+  bool hasPrivateSegmentWaveByteOffset() const {
+    return PrivateSegmentWaveByteOffset;
+  }
+
+  // Queries for the work item ID feature bits.
+  bool hasWorkItemIDX() const { return WorkItemIDX; }
+  bool hasWorkItemIDY() const { return WorkItemIDY; }
+  bool hasWorkItemIDZ() const { return WorkItemIDZ; }
+
+  unsigned getNumUserSGPRs() const { return NumUserSGPRs; }
+
+  /// \returns the total number of user plus system SGPRs added so far.
+  unsigned getNumPreloadedSGPRs() const {
+    return NumUserSGPRs + NumSystemSGPRs;
+  }
+
+  unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
+    return PrivateSegmentWaveByteOffsetSystemSGPR;
+  }
+
+  /// \brief Returns the physical register reserved for use as the resource
+  /// descriptor for scratch accesses.
+  unsigned getScratchRSrcReg() const { return ScratchRSrcReg; }
+
+  void setScratchRSrcReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchRSrcReg = Reg;
+  }
+
+  unsigned getScratchWaveOffsetReg() const { return ScratchWaveOffsetReg; }
+
+  void setScratchWaveOffsetReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchWaveOffsetReg = Reg;
+  }
+
+  unsigned getQueuePtrUserSGPR() const { return QueuePtrUserSGPR; }
+
+  bool hasSpilledSGPRs() const { return HasSpilledSGPRs; }
+  void setHasSpilledSGPRs(bool Spill = true) { HasSpilledSGPRs = Spill; }
+
+  bool hasSpilledVGPRs() const { return HasSpilledVGPRs; }
+  void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; }
+
+  bool hasNonSpillStackObjects() const { return HasNonSpillStackObjects; }
+  void setHasNonSpillStackObjects(bool StackObject = true) {
+    HasNonSpillStackObjects = StackObject;
+  }
+
+  unsigned getNumSpilledSGPRs() const { return NumSpilledSGPRs; }
+  unsigned getNumSpilledVGPRs() const { return NumSpilledVGPRs; }
+
+  void addToSpilledSGPRs(unsigned num) { NumSpilledSGPRs += num; }
+  void addToSpilledVGPRs(unsigned num) { NumSpilledVGPRs += num; }
+
+  unsigned getPSInputAddr() const { return PSInputAddr; }
+
+  /// \returns true if bit \p Index of PSInputAddr is set.
+  bool isPSInputAllocated(unsigned Index) const {
+    // Shift an unsigned one so that Index == 31 does not shift into the sign
+    // bit of a signed int.
+    return PSInputAddr & (1u << Index);
+  }
+
+  /// Sets bit \p Index of PSInputAddr.
+  void markPSInputAllocated(unsigned Index) {
+    // Unsigned shift for the same reason as in isPSInputAllocated().
+    PSInputAddr |= 1u << Index;
+  }
+
+  bool returnsVoid() const { return ReturnsVoid; }
+  void setIfReturnsVoid(bool Value) { ReturnsVoid = Value; }
+
+  /// \returns A pair of default/requested minimum/maximum flat work group sizes
+  /// for this function.
+  std::pair<unsigned, unsigned> getFlatWorkGroupSizes() const {
+    return FlatWorkGroupSizes;
+  }
+
+  /// \returns Default/requested minimum flat work group size for this function.
+  unsigned getMinFlatWorkGroupSize() const { return FlatWorkGroupSizes.first; }
+
+  /// \returns Default/requested maximum flat work group size for this function.
+  unsigned getMaxFlatWorkGroupSize() const { return FlatWorkGroupSizes.second; }
+
+  /// \returns A pair of default/requested minimum/maximum number of waves per
+  /// execution unit.
+  std::pair<unsigned, unsigned> getWavesPerEU() const { return WavesPerEU; }
+
+  /// \returns Default/requested minimum number of waves per execution unit.
+  unsigned getMinWavesPerEU() const { return WavesPerEU.first; }
+
+  /// \returns Default/requested maximum number of waves per execution unit.
+  unsigned getMaxWavesPerEU() const { return WavesPerEU.second; }
+
+  /// \returns Stack object index for \p Dim's work group ID.
+  int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
+    assert(Dim < 3);
+    return DebuggerWorkGroupIDStackObjectIndices[Dim];
+  }
+
+  /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
+  void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+    assert(Dim < 3);
+    DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
+  }
+
+  /// \returns Stack object index for \p Dim's work item ID.
+  int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const {
+    assert(Dim < 3);
+    return DebuggerWorkItemIDStackObjectIndices[Dim];
+  }
+
+  /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
+  void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+    assert(Dim < 3);
+    DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
+  }
+
+  /// \returns SGPR used for \p Dim's work group ID.
+  unsigned getWorkGroupIDSGPR(unsigned Dim) const {
+    switch (Dim) {
+    case 0:
+      assert(hasWorkGroupIDX());
+      return WorkGroupIDXSystemSGPR;
+    case 1:
+      assert(hasWorkGroupIDY());
+      return WorkGroupIDYSystemSGPR;
+    case 2:
+      assert(hasWorkGroupIDZ());
+      return WorkGroupIDZSystemSGPR;
+    }
+    llvm_unreachable("unexpected dimension");
+  }
+
+  /// \returns VGPR used for \p Dim's work item ID.
+  unsigned getWorkItemIDVGPR(unsigned Dim) const {
+    switch (Dim) {
+    case 0:
+      assert(hasWorkItemIDX());
+      return AMDGPU::VGPR0;
+    case 1:
+      assert(hasWorkItemIDY());
+      return AMDGPU::VGPR1;
+    case 2:
+      assert(hasWorkItemIDZ());
+      return AMDGPU::VGPR2;
+    }
+    llvm_unreachable("unexpected dimension");
+  }
+
+  const AMDGPUBufferPseudoSourceValue *getBufferPSV() const {
+    return &BufferPSV;
+  }
+
+  const AMDGPUImagePseudoSourceValue *getImagePSV() const {
+    return &ImagePSV;
+  }
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
new file mode 100644
index 000000000000..da86bbf9dd2a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -0,0 +1,1898 @@
+//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineScheduler.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// This scheduler implements a different scheduling algorithm than
+// GenericScheduler.
+//
+// There are several specific architecture behaviours that can't be modelled
+// for GenericScheduler:
+// . When accessing the result of an SGPR load instruction, you have to wait
+// for all the SGPR load instructions before your current instruction to
+// have finished.
+// . When accessing the result of an VGPR load instruction, you have to wait
+// for all the VGPR load instructions previous to the VGPR load instruction
+// you are interested in to finish.
+// . The lower the register pressure, the better load latencies are hidden
+//
+// Moreover some specifics (like the fact a lot of instructions in the shader
+// have few dependencies) makes the generic scheduler have some unpredictable
+// behaviours. For example when register pressure becomes high, it can either
+// manage to prevent register pressure from going too high, or it can
+// increase register pressure even more than if it hadn't taken register
+// pressure into account.
+//
+// Also some other bad behaviours are generated, like loading at the beginning
+// of the shader a constant in VGPR you won't need until the end of the shader.
+//
+// The scheduling problem for SI can distinguish three main parts:
+// . Hiding high latencies (texture sampling, etc)
+// . Hiding low latencies (SGPR constant loading, etc)
+// . Keeping register usage low for better latency hiding and general
+// performance
+//
+// Some other things can also affect performance, but are hard to predict
+// (cache usage, the fact the HW can issue several instructions from different
+// wavefronts of different types, etc)
+//
+// This scheduler tries to solve the scheduling problem by dividing it into
+// simpler sub-problems. It divides the instructions into blocks, schedules
+// locally inside the blocks where it takes care of low latencies, and then
+// chooses the order of the blocks by taking care of high latencies.
+// Dividing the instructions into blocks helps control keeping register
+// usage low.
+//
+// First the instructions are put into blocks.
+// We want the blocks to help control register usage and hide high latencies
+// later. To help control register usage, we typically want all local
+// computations, when for example you create a result that can be consumed
+// right away, to be contained in a block. Block inputs and outputs would
+// typically be important results that are needed in several locations of
+// the shader. Since we do want blocks to help hide high latencies, we want
+// the instructions inside the block to have a minimal set of dependencies
+// on high latencies. It will make it easy to pick blocks to hide specific
+// high latencies.
+// The block creation algorithm is divided into several steps, and several
+// variants can be tried during the scheduling process.
+//
+// Second the order of the instructions inside the blocks is chosen.
+// At that step we do take into account only register usage and hiding
+// low latency instructions
+//
+// Third the block order is chosen, there we try to hide high latencies
+// and keep register usage low.
+//
+// After the third step, a pass is done to improve the hiding of low
+// latencies.
+//
+// Actually when talking about 'low latency' or 'high latency' it includes
+// both the latency to get the cache (or global mem) data go to the register,
+// and the bandwidth limitations.
+// Increasing the number of active wavefronts helps hide the former, but it
+// doesn't solve the latter, thus why even if wavefront count is high, we have
+// to try have as many instructions hiding high latencies as possible.
+// The OpenCL doc says for example latency of 400 cycles for a global mem access,
+// which is hidden by 10 instructions if the wavefront count is 10.
+
+// Some figures taken from AMD docs:
+// Both texture and constant L1 caches are 4-way associative with 64 bytes
+// lines.
+// Constant cache is shared with 4 CUs.
+// For texture sampling, the address generation unit receives 4 texture
+// addresses per cycle, thus we could expect texture sampling latency to be
+// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
+// instructions in a wavefront group are executed every 4 cycles),
+// or 16 instructions if the other wavefronts associated to the 3 other VALUs
+// of the CU do texture sampling too. (Don't take these figures too seriously,
+// as I'm not 100% sure of the computation)
+// Data exports should get similar latency.
+// For constant loading, the cache is shared with 4 CUs.
+// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
+// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
+// It means a simple s_buffer_load should take one instruction to hide, as
+// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
+// cache line.
+//
+// As of today the driver doesn't preload the constants in cache, thus the
+// first loads get extra latency. The doc says global memory access can be
+// 300-600 cycles. We do not specially take that into account when scheduling
+// As we expect the driver to be able to preload the constants soon.
+
+// common code //
+
+#ifndef NDEBUG
+
+/// Returns the upper-case label used for \p Reason in debug traces.
+static const char *getReasonStr(SIScheduleCandReason Reason) {
+  const char *Label = nullptr;
+  switch (Reason) {
+  case NoCand:
+    Label = "NOCAND";
+    break;
+  case RegUsage:
+    Label = "REGUSAGE";
+    break;
+  case Latency:
+    Label = "LATENCY";
+    break;
+  case Successor:
+    Label = "SUCCESSOR";
+    break;
+  case Depth:
+    Label = "DEPTH";
+    break;
+  case NodeOrder:
+    Label = "ORDER";
+    break;
+  }
+  // No case matched: the value is not one of the enumerators handled above.
+  if (!Label)
+    llvm_unreachable("Unknown reason!");
+  return Label;
+}
+
+#endif
+
+/// Compares two candidates on a metric where the smaller value wins.
+///
+/// If \p TryVal is smaller, \p TryCand.Reason is set to \p Reason. If it is
+/// larger, \p Cand.Reason is lowered to \p Reason when it currently compares
+/// greater. On a tie, \p Cand.setRepeat(Reason) is called.
+///
+/// \returns true when the values differ (the comparison was decisive), false
+/// on a tie.
+static bool tryLess(int TryVal, int CandVal,
+                    SISchedulerCandidate &TryCand,
+                    SISchedulerCandidate &Cand,
+                    SIScheduleCandReason Reason) {
+  if (TryVal == CandVal) {
+    Cand.setRepeat(Reason);
+    return false;
+  }
+
+  if (TryVal < CandVal)
+    TryCand.Reason = Reason;
+  else if (Cand.Reason > Reason)
+    Cand.Reason = Reason;
+  return true;
+}
+
+/// Compares two candidates on a metric where the larger value wins.
+///
+/// If \p TryVal is larger, \p TryCand.Reason is set to \p Reason. If it is
+/// smaller, \p Cand.Reason is lowered to \p Reason when it currently compares
+/// greater. On a tie, \p Cand.setRepeat(Reason) is called.
+///
+/// \returns true when the values differ (the comparison was decisive), false
+/// on a tie.
+static bool tryGreater(int TryVal, int CandVal,
+                       SISchedulerCandidate &TryCand,
+                       SISchedulerCandidate &Cand,
+                       SIScheduleCandReason Reason) {
+  if (TryVal == CandVal) {
+    Cand.setRepeat(Reason);
+    return false;
+  }
+
+  if (TryVal > CandVal)
+    TryCand.Reason = Reason;
+  else if (Cand.Reason > Reason)
+    Cand.Reason = Reason;
+  return true;
+}
+
+// SIScheduleBlock //
+
+/// Appends \p SU to the block and records its position in SUnits under its
+/// NodeNum.
+void SIScheduleBlock::addUnit(SUnit *SU) {
+  // The index is the size before the push, i.e. the slot SU is about to take.
+  const auto Position = SUnits.size();
+  NodeNum2Index[SU->NodeNum] = Position;
+  SUnits.push_back(SU);
+}
+
+#ifndef NDEBUG
+/// Prints the candidate's node number and selection reason to dbgs().
+void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
+  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason)
+         << '\n';
+}
+#endif
+
+/// Decides whether \p TryCand should be preferred over the current best
+/// candidate \p Cand, applying a fixed priority chain of heuristics.
+///
+/// The outcome is reported through the Reason fields: the first decisive
+/// comparison (tryLess/tryGreater returning true) ends the chain, and
+/// TryCand.Reason is set by that helper only when TryCand won. The caller
+/// (pickNode) treats TryCand.Reason != NoCand as "TryCand is better".
+void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
+ SISchedCandidate &TryCand) {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return;
+ }
+
+ // If the block contains a lot of constant loads, the SGPR usage could go
+ // quite high. Once the current best is above the arbitrary limit of 60,
+ // prefer the candidate with the lower SGPR usage, which encourages using
+ // the already loaded constants (in order to release some SGPRs) before
+ // loading more.
+ if (Cand.SGPRUsage > 60 &&
+ tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+ return;
+
+ // Schedule low latency instructions as top as possible.
+ // Order of priority is:
+ // . Low latency instructions which do not depend on other low latency
+ // instructions we haven't waited for
+ // . Other instructions which do not depend on low latency instructions
+ // we haven't waited for
+ // . Low latencies
+ // . All other instructions
+ // Goal is to get: low latency instructions - independent instructions
+ // - (eventually some more low latency instructions)
+ // - instructions that depend on the first low latency instructions.
+ if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
+ Cand.HasLowLatencyNonWaitedParent,
+ TryCand, Cand, SIScheduleCandReason::Depth))
+ return;
+
+ if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+ TryCand, Cand, SIScheduleCandReason::Depth))
+ return;
+
+ // Among low latency candidates, prefer the smaller LowLatencyOffset.
+ if (TryCand.IsLowLatency &&
+ tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+ TryCand, Cand, SIScheduleCandReason::Depth))
+ return;
+
+ // Otherwise prefer the candidate with the lower VGPR usage.
+ if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+ return;
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+ TryCand.Reason = NodeOrder;
+ }
+}
+
+/// Picks the best unit among TopReadySUs according to tryCandidateTopDown().
+///
+/// \returns the SU of the winning candidate (the SU member of a
+/// default-constructed candidate if TopReadySUs is empty).
+SUnit *SIScheduleBlock::pickNode() {
+  SISchedCandidate Best;
+
+  for (SUnit *Ready : TopReadySUs) {
+    SISchedCandidate Contender;
+    Contender.SU = Ready;
+
+    // Predict register usage after this instruction.
+    std::vector<unsigned> CurPressure;
+    std::vector<unsigned> PeakPressure;
+    TopRPTracker.getDownwardPressure(Ready->getInstr(), CurPressure,
+                                     PeakPressure);
+    Contender.SGPRUsage = CurPressure[DAG->getSGPRSetID()];
+    Contender.VGPRUsage = CurPressure[DAG->getVGPRSetID()];
+
+    // Latency information recorded by the DAG and by this block.
+    Contender.IsLowLatency = DAG->IsLowLatencySU[Ready->NodeNum];
+    Contender.LowLatencyOffset = DAG->LowLatencyOffset[Ready->NodeNum];
+    Contender.HasLowLatencyNonWaitedParent =
+        HasLowLatencyNonWaitedParent[NodeNum2Index[Ready->NodeNum]];
+
+    // A Reason other than NoCand means the contender beat the current best.
+    tryCandidateTopDown(Best, Contender);
+    if (Contender.Reason != NoCand)
+      Best.setBest(Contender);
+  }
+
+  return Best.SU;
+}
+
+
+// Schedule something valid.
+/// Produces some valid schedule quickly: units are emitted in the order they
+/// reach the front of the ready list, with no heuristics applied.
+void SIScheduleBlock::fastSchedule() {
+  TopReadySUs.clear();
+  if (Scheduled)
+    undoSchedule();
+
+  // Seed the ready list with the units that have no pending predecessors.
+  for (SUnit *Unit : SUnits)
+    if (Unit->NumPredsLeft == 0)
+      TopReadySUs.push_back(Unit);
+
+  // nodeScheduled() removes the unit from the ready list and may append
+  // newly released successors, so the loop ends once everything is emitted.
+  while (!TopReadySUs.empty()) {
+    SUnit *Next = TopReadySUs.front();
+    ScheduledSUnits.push_back(Next);
+    nodeScheduled(Next);
+  }
+
+  Scheduled = true;
+}
+
+// Returns if the register was set between first and last.
+/// Returns true if some non-debug instruction defining \p Reg has its
+/// register slot index inside the closed interval [\p First, \p Last].
+static bool isDefBetween(unsigned Reg,
+                         SlotIndex First, SlotIndex Last,
+                         const MachineRegisterInfo *MRI,
+                         const LiveIntervals *LIS) {
+  for (const MachineInstr &DefMI : MRI->def_instructions(Reg)) {
+    if (DefMI.isDebugValue())
+      continue;
+
+    const SlotIndex DefSlot = LIS->getInstructionIndex(DefMI).getRegSlot();
+    if (DefSlot >= First && DefSlot <= Last)
+      return true;
+  }
+  return false;
+}
+
+/// Computes the block's live-in/live-out virtual registers and pressures by
+/// replaying the current ScheduledSUnits order through a temporary register
+/// pressure tracker, then prepares TopRPTracker for top-down scheduling.
+///
+/// \p BeginBlock and \p EndBlock are dereferenced to obtain the slot index
+/// range used to decide whether a live-out register is defined in the block.
+void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock) {
+ // Pressure/BotPressure and RPTracker/BotRPTracker are locals of this
+ // function; TopRPTracker and TopPressure are not declared here.
+ IntervalPressure Pressure, BotPressure;
+ RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
+ LiveIntervals *LIS = DAG->getLIS();
+ MachineRegisterInfo *MRI = DAG->getMRI();
+ DAG->initRPTracker(TopRPTracker);
+ DAG->initRPTracker(BotRPTracker);
+ DAG->initRPTracker(RPTracker);
+
+ // Goes through all SUs. RPTracker captures what had to be alive for the SUs
+ // to execute, and what is still alive at the end.
+ for (SUnit* SU : ScheduledSUnits) {
+ RPTracker.setPos(SU->getInstr());
+ RPTracker.advance();
+ }
+
+ // Close the RPTracker to finalize live ins/outs.
+ RPTracker.closeRegion();
+
+ // Initialize the live ins and live outs.
+ TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+ BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+ // Do not track physical registers, because it messes up.
+ // NOTE(review): LiveInRegs is only inserted into here, whereas LiveOutRegs
+ // is cleared below before being refilled -- confirm this is intended.
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
+ LiveInRegs.insert(RegMaskPair.RegUnit);
+ }
+ LiveOutRegs.clear();
+ // There are several possibilities to distinguish:
+ // 1) Reg is not input to any instruction in the block, but is output of one
+ // 2) 1) + read in the block and not needed after it
+ // 3) 1) + read in the block but needed in another block
+ // 4) Reg is input of an instruction but another block will read it too
+ // 5) Reg is input of an instruction and then rewritten in the block.
+ // result is not read in the block (implies used in another block)
+ // 6) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block and not needed in another block
+ // 7) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block but also needed in another block
+ // LiveInRegs will contain all the regs in situations 4, 5, 6, 7
+ // We want LiveOutRegs to contain only Regs whose content will be read after
+ // in another block, and whose content was written in the current block,
+ // that is we want it to get 1, 3, 5, 7
+ // Since we made the MIs of a block to be packed all together before
+ // scheduling, then the LiveIntervals were correct, and the RPTracker was
+ // able to correctly handle 5 vs 6, 2 vs 3.
+ // (Note: This is not sufficient for RPTracker to not make mistakes for
+ // case 4)
+ // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
+ // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
+ // The use of isDefBetween removes the case 4.
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
+ unsigned Reg = RegMaskPair.RegUnit;
+ if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
+ LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
+ LIS)) {
+ LiveOutRegs.insert(Reg);
+ }
+ }
+
+ // Pressure = sum_alive_registers register size
+ // Internally llvm will represent some registers as big 128 bits registers
+ // for example, but they actually correspond to 4 actual 32 bits registers.
+ // Thus Pressure is not equal to num_alive_registers * constant.
+ LiveInPressure = TopPressure.MaxSetPressure;
+ LiveOutPressure = BotPressure.MaxSetPressure;
+
+ // Prepares TopRPTracker for top down scheduling.
+ TopRPTracker.closeTop();
+}
+
+/// Schedules the units of this block top-down using pickNode().
+///
+/// \p BeginBlock and \p EndBlock are forwarded to initRegPressure(), which
+/// computes the block's live-ins and live-outs from a preliminary schedule.
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+                               MachineBasicBlock::iterator EndBlock) {
+  // initRegPressure() replays ScheduledSUnits, so a schedule must exist first.
+  if (!Scheduled)
+    fastSchedule();
+
+  // PreScheduling phase to set LiveIn and LiveOut.
+  initRegPressure(BeginBlock, EndBlock);
+  undoSchedule();
+
+  // Schedule for real now: seed the ready list with units that have no
+  // pending predecessors.
+  TopReadySUs.clear();
+  for (SUnit *Unit : SUnits)
+    if (Unit->NumPredsLeft == 0)
+      TopReadySUs.push_back(Unit);
+
+  while (!TopReadySUs.empty()) {
+    SUnit *Picked = pickNode();
+    ScheduledSUnits.push_back(Picked);
+    TopRPTracker.setPos(Picked->getInstr());
+    TopRPTracker.advance();
+    nodeScheduled(Picked);
+  }
+
+  // TODO: compute InternalAdditionnalPressure.
+  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+  // Check everything is right.
+#ifndef NDEBUG
+  assert(SUnits.size() == ScheduledSUnits.size() &&
+         TopReadySUs.empty());
+  for (SUnit *Unit : SUnits)
+    assert(Unit->isScheduled &&
+           Unit->NumPredsLeft == 0);
+#endif
+
+  Scheduled = true;
+}
+
+/// Reverts the block to its unscheduled state: clears every unit's
+/// isScheduled flag, restores the predecessor counts of in-block successors,
+/// and resets the per-schedule bookkeeping.
+void SIScheduleBlock::undoSchedule() {
+  for (SUnit *Unit : SUnits) {
+    Unit->isScheduled = false;
+    for (SDep &Edge : Unit->Succs) {
+      // Only edges to successors inside this block are undone here.
+      if (!BC->isSUInBlock(Edge.getSUnit(), ID))
+        continue;
+      undoReleaseSucc(Unit, &Edge);
+    }
+  }
+
+  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+  ScheduledSUnits.clear();
+  Scheduled = false;
+}
+
+/// Inverse of releaseSucc(): increments the successor's pending-predecessor
+/// counter for \p SuccEdge (WeakPredsLeft for weak edges, NumPredsLeft
+/// otherwise). \p SU is not used.
+void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *Target = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak())
+    ++Target->WeakPredsLeft;
+  else
+    ++Target->NumPredsLeft;
+}
+
+/// Decrements the successor's pending-predecessor counter for \p SuccEdge
+/// (WeakPredsLeft for weak edges, NumPredsLeft otherwise). \p SU is not used.
+///
+/// In builds with assertions, releasing a strong edge whose counter is
+/// already zero is reported and treated as unreachable.
+void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *Target = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    --Target->WeakPredsLeft;
+    return;
+  }
+
+#ifndef NDEBUG
+  if (Target->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    Target->dump(DAG);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(nullptr);
+  }
+#endif
+
+  --Target->NumPredsLeft;
+}
+
+/// Releases the successors of \p SU that are inside this block
+/// (\p InOrOutBlock == true) or outside of it (\p InOrOutBlock == false).
+///
+/// In-block successors whose NumPredsLeft reaches zero are appended to
+/// TopReadySUs.
+void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
+  for (SDep &Edge : SU->Succs) {
+    SUnit *Target = Edge.getSUnit();
+
+    // Skip successors whose NodeNum is not below DAG->SUnits.size(), and
+    // successors on the side of the block boundary we were not asked about.
+    if (Target->NodeNum >= DAG->SUnits.size() ||
+        BC->isSUInBlock(Target, ID) != InOrOutBlock)
+      continue;
+
+    releaseSucc(SU, &Edge);
+    if (InOrOutBlock && Target->NumPredsLeft == 0)
+      TopReadySUs.push_back(Target);
+  }
+}
+
+/// Bookkeeping performed once \p SU has been chosen: removes it from the
+/// ready list, releases its in-block successors, updates the low latency
+/// "non-waited parent" flags and marks the unit as scheduled.
+void SIScheduleBlock::nodeScheduled(SUnit *SU) {
+  // Is in TopReadySUs
+  assert (!SU->NumPredsLeft);
+  std::vector<SUnit *>::iterator ReadyPos = llvm::find(TopReadySUs, SU);
+  if (ReadyPos == TopReadySUs.end()) {
+    dbgs() << "Data Structure Bug in SI Scheduler\n";
+    llvm_unreachable(nullptr);
+  }
+  TopReadySUs.erase(ReadyPos);
+
+  releaseSuccessors(SU, true);
+
+  // Scheduling this node will trigger a wait,
+  // thus propagate to other instructions that they do not need to wait either.
+  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
+    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+
+  // Flag the successors of a low latency unit that belong to this block.
+  if (DAG->IsLowLatencySU[SU->NodeNum]) {
+    for (SDep &Edge : SU->Succs) {
+      std::map<unsigned, unsigned>::iterator IndexPos =
+          NodeNum2Index.find(Edge.getSUnit()->NodeNum);
+      if (IndexPos == NodeNum2Index.end())
+        continue;
+      HasLowLatencyNonWaitedParent[IndexPos->second] = 1;
+    }
+  }
+
+  SU->isScheduled = true;
+}
+
+/// Finishes block construction: releases the edges from the block's units to
+/// units outside the block, records whether the block contains a high
+/// latency unit, and sizes HasLowLatencyNonWaitedParent to the unit count.
+void SIScheduleBlock::finalizeUnits() {
+  // We remove links from outside blocks to enable scheduling inside the block.
+  for (SUnit *Unit : SUnits) {
+    releaseSuccessors(Unit, false);
+    if (DAG->IsHighLatencySU[Unit->NodeNum])
+      HighLatencyBlock = true;
+  }
+
+  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
+}
+
+/// Register \p Pred as a predecessor block unless a block with the same ID
+/// is already recorded.
+// NOTE(review): the previous comment claimed ascending ID order is
+// maintained; nothing here sorts, so that presumably relies on the order of
+// calls - confirm.
+void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
+  const unsigned PredID = Pred->getID();
+
+  // Nothing to do when the block is already a known predecessor.
+  const bool IsNew = none_of(
+      Preds, [=](SIScheduleBlock *P) { return PredID == P->getID(); });
+  if (!IsNew)
+    return;
+  Preds.push_back(Pred);
+
+  // A block that is both predecessor and successor would form a cycle.
+  assert(none_of(Succs,
+                 [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
+         "Loop in the Block Graph!");
+}
+
+/// Register \p Succ as a successor block unless a block with the same ID is
+/// already recorded. Also counts successors that are high latency blocks.
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+  const unsigned SuccID = Succ->getID();
+
+  // Nothing to do when the block is already a known successor.
+  const bool IsNew = none_of(
+      Succs, [=](SIScheduleBlock *S) { return SuccID == S->getID(); });
+  if (!IsNew)
+    return;
+
+  if (Succ->isHighLatencyBlock())
+    ++NumHighLatencySuccessors;
+  Succs.push_back(Succ);
+
+  // A block that is both predecessor and successor would form a cycle.
+  assert(none_of(Preds,
+                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
+         "Loop in the Block Graph!");
+}
+
+#ifndef NDEBUG
+/// Dump the block to dbgs(). With \p full == false only the block ID line is
+/// printed; otherwise neighbours, register pressure (once the block has been
+/// scheduled) and the contained instructions are printed as well.
+void SIScheduleBlock::printDebug(bool full) {
+  dbgs() << "Block (" << ID << ")\n";
+  if (!full)
+    return;
+
+  dbgs() << "\nContains High Latency Instruction: "
+         << HighLatencyBlock << '\n';
+
+  dbgs() << "\nDepends On:\n";
+  for (SIScheduleBlock *PredBlock : Preds)
+    PredBlock->printDebug(false);
+
+  dbgs() << "\nSuccessors:\n";
+  for (SIScheduleBlock *SuccBlock : Succs)
+    SuccBlock->printDebug(false);
+
+  // Liveness information is only printed once the block has been scheduled.
+  if (Scheduled) {
+    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
+           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
+    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
+           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
+    dbgs() << "LiveIns:\n";
+    for (unsigned LiveIn : LiveInRegs)
+      dbgs() << PrintVRegOrUnit(LiveIn, DAG->getTRI()) << ' ';
+
+    dbgs() << "\nLiveOuts:\n";
+    for (unsigned LiveOut : LiveOutRegs)
+      dbgs() << PrintVRegOrUnit(LiveOut, DAG->getTRI()) << ' ';
+  }
+
+  // The same list (SUnits) is dumped whether or not the block is scheduled.
+  dbgs() << "\nInstructions:\n";
+  for (SUnit *Unit : SUnits)
+    Unit->dump(DAG);
+
+  dbgs() << "///////////////////////\n";
+}
+#endif
+
+// SIScheduleBlockCreator //
+
+/// Construct a block creator bound to \p DAG. Only the pointer is stored;
+/// no blocks are built here.
+SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
+DAG(DAG) {
+}
+
+/// Compiler-generated destructor.
+SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
+
+/// Return the blocks for \p BlockVariant. The first request for a variant
+/// builds the blocks, sorts them topologically, schedules inside them and
+/// fills their statistics; the result is cached in Blocks and later requests
+/// return the cached copy.
+SIScheduleBlocks
+SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
+  auto Cached = Blocks.find(BlockVariant);
+  if (Cached != Blocks.end())
+    return Cached->second;
+
+  createBlocksForVariant(BlockVariant);
+  topologicalSort();
+  scheduleInsideBlocks();
+  fillStats();
+
+  // Snapshot the current state so it can be served from the cache.
+  SIScheduleBlocks Res;
+  Res.Blocks = CurrentBlocks;
+  Res.TopDownIndex2Block = TopDownIndex2Block;
+  Res.TopDownBlock2Index = TopDownBlock2Index;
+  Blocks[BlockVariant] = Res;
+  return Res;
+}
+
+/// Return true when \p SU is a valid index into DAG->SUnits and the block it
+/// is currently assigned to has the given \p ID.
+bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
+  // The bounds check must come first: it guards the Node2CurrentBlock lookup.
+  return SU->NodeNum < DAG->SUnits.size() &&
+         CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
+}
+
+/// Give every high latency unit its own fresh reserved color.
+void SIScheduleBlockCreator::colorHighLatenciesAlone() {
+  for (SUnit &Unit : DAG->SUnits) {
+    if (!DAG->IsHighLatencySU[Unit.NodeNum])
+      continue;
+    CurrentColoring[Unit.NodeNum] = NextReservedID++;
+  }
+}
+
+/// Color high latency units in small groups that share a reserved color.
+///
+/// The target group size depends on how many high latency units the DAG
+/// contains (<= 6: 2, <= 12: 3, otherwise 4). Units are visited in SUnits
+/// order; a unit joins the group being formed only if an edge could be added
+/// in both directions between it and every current member.
+void SIScheduleBlockCreator::colorHighLatenciesGroups() {
+  unsigned DAGSize = DAG->SUnits.size();
+  unsigned NumHighLatencies = 0;
+  unsigned GroupSize;
+  unsigned Color = NextReservedID;
+  unsigned Count = 0;
+  std::set<unsigned> FormingGroup;
+
+  // First pass: count the high latency units to choose the group size.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum])
+      ++NumHighLatencies;
+  }
+
+  if (NumHighLatencies == 0)
+    return;
+
+  if (NumHighLatencies <= 6)
+    GroupSize = 2;
+  else if (NumHighLatencies <= 12)
+    GroupSize = 3;
+  else
+    GroupSize = 4;
+
+  // Second pass: assign the colors.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+      // This is a yes/no flag, so it is a bool (it used to be declared
+      // unsigned).
+      bool CompatibleGroup = true;
+      unsigned ProposedColor = Color;
+      for (unsigned j : FormingGroup) {
+        // TODO: Currently CompatibleGroup will always be false,
+        // because the graph enforces the load order. This
+        // can be fixed, but as keeping the load order is often
+        // good for performance that causes a performance hit (both
+        // the default scheduler and this scheduler).
+        // When this scheduler determines a good load order,
+        // this can be fixed.
+        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
+            !DAG->canAddEdge(&DAG->SUnits[j], SU))
+          CompatibleGroup = false;
+      }
+      if (!CompatibleGroup || ++Count == GroupSize) {
+        // Close the current group and move on to a new color.
+        FormingGroup.clear();
+        Color = ++NextReservedID;
+        if (!CompatibleGroup) {
+          // The incompatible unit becomes the first member of the new group.
+          ProposedColor = Color;
+          FormingGroup.insert(SU->NodeNum);
+        }
+        Count = 0;
+      } else {
+        FormingGroup.insert(SU->NodeNum);
+      }
+      CurrentColoring[SU->NodeNum] = ProposedColor;
+    }
+  }
+}
+
+/// Compute, for every SU, two auxiliary colors describing which already
+/// colored (reserved) SUs it is connected to: one propagated top-down through
+/// predecessors and one propagated bottom-up through successors. The results
+/// are stored in CurrentTopDownReservedDependencyColoring and
+/// CurrentBottomUpReservedDependencyColoring. SUs without any colored
+/// neighbour in the given direction keep color 0.
+void SIScheduleBlockCreator::colorComputeReservedDependencies() {
+ unsigned DAGSize = DAG->SUnits.size();
+ // Maps a set of neighbour colors to the color allocated for that set.
+ std::map<std::set<unsigned>, unsigned> ColorCombinations;
+
+ CurrentTopDownReservedDependencyColoring.clear();
+ CurrentBottomUpReservedDependencyColoring.clear();
+
+ CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
+ CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
+
+ // Traverse TopDown, and give different colors to SUs depending
+ // on which combination of High Latencies they depend on.
+
+ for (unsigned SUNum : DAG->TopDownIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ std::set<unsigned> SUColors;
+
+ // Already given.
+ if (CurrentColoring[SU->NodeNum]) {
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+ CurrentColoring[SU->NodeNum];
+ continue;
+ }
+
+ // Collect the non-zero top-down colors of all strong predecessors that
+ // are valid indices into the DAG.
+ for (SDep& PredDep : SU->Preds) {
+ SUnit *Pred = PredDep.getSUnit();
+ if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+ continue;
+ if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
+ SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
+ }
+ // Color 0 by default.
+ if (SUColors.empty())
+ continue;
+ // Same color than parents.
+ // Only a single color above DAGSize (a non-reserved ID) is inherited
+ // directly; everything else goes through ColorCombinations.
+ if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+ *SUColors.begin();
+ else {
+ std::map<std::set<unsigned>, unsigned>::iterator Pos =
+ ColorCombinations.find(SUColors);
+ if (Pos != ColorCombinations.end()) {
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
+ } else {
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+ NextNonReservedID;
+ ColorCombinations[SUColors] = NextNonReservedID++;
+ }
+ }
+ }
+
+ // The bottom-up pass allocates its own colors, so start from an empty map.
+ ColorCombinations.clear();
+
+ // Same as before, but BottomUp.
+
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ std::set<unsigned> SUColors;
+
+ // Already given.
+ if (CurrentColoring[SU->NodeNum]) {
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+ CurrentColoring[SU->NodeNum];
+ continue;
+ }
+
+ // Collect the non-zero bottom-up colors of all strong successors that
+ // are valid indices into the DAG.
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
+ SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
+ }
+ // Keep color 0.
+ if (SUColors.empty())
+ continue;
+ // Same color than parents.
+ if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+ *SUColors.begin();
+ else {
+ std::map<std::set<unsigned>, unsigned>::iterator Pos =
+ ColorCombinations.find(SUColors);
+ if (Pos != ColorCombinations.end()) {
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
+ } else {
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+ NextNonReservedID;
+ ColorCombinations[SUColors] = NextNonReservedID++;
+ }
+ }
+ }
+}
+
+/// Color every SU that has no color yet according to its pair of
+/// (top-down, bottom-up) reserved dependency colors: SUs with the same pair
+/// share one non-reserved color, and each new pair gets a fresh one.
+void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
+  std::map<std::pair<unsigned, unsigned>, unsigned> PairToColor;
+
+  for (SUnit &Unit : DAG->SUnits) {
+    const unsigned Num = Unit.NodeNum;
+
+    // High latency instructions: already given.
+    if (CurrentColoring[Num])
+      continue;
+
+    std::pair<unsigned, unsigned> Key;
+    Key.first = CurrentTopDownReservedDependencyColoring[Num];
+    Key.second = CurrentBottomUpReservedDependencyColoring[Num];
+
+    // insert() leaves an existing entry untouched, so the fresh ID is only
+    // consumed when the pair was not known yet.
+    auto Inserted = PairToColor.insert(std::make_pair(Key, NextNonReservedID));
+    if (Inserted.second)
+      ++NextNonReservedID;
+    CurrentColoring[Num] = Inserted.first->second;
+  }
+}
+
+/// Recolor SUs that have a non-reserved color and no reserved dependency in
+/// either direction. Such an SU takes the color of its successors when they
+/// all agree, and a fresh color otherwise. New colors are accumulated in a
+/// copy and committed at the end, so decisions about successors with a
+/// reserved dependency use the coloring as it was on entry.
+void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
+  const unsigned NumSUs = DAG->SUnits.size();
+  std::vector<int> Updated = CurrentColoring;
+
+  for (unsigned Idx : DAG->BottomUpIndex2SU) {
+    SUnit *Node = &DAG->SUnits[Idx];
+    const unsigned Num = Node->NodeNum;
+
+    if (CurrentColoring[Num] <= (int)NumSUs)
+      continue;
+
+    if (CurrentBottomUpReservedDependencyColoring[Num] > 0 ||
+        CurrentTopDownReservedDependencyColoring[Num] > 0)
+      continue;
+
+    // Colors of successors that have a reserved dependency (taken from the
+    // coloring on entry), and pending colors of all successors.
+    std::set<unsigned> ReservedSuccColors;
+    std::set<unsigned> PendingSuccColors;
+    for (SDep &Edge : Node->Succs) {
+      SUnit *Succ = Edge.getSUnit();
+      if (Edge.isWeak() || Succ->NodeNum >= NumSUs)
+        continue;
+      const unsigned SuccNum = Succ->NodeNum;
+      if (CurrentBottomUpReservedDependencyColoring[SuccNum] > 0 ||
+          CurrentTopDownReservedDependencyColoring[SuccNum] > 0)
+        ReservedSuccColors.insert(CurrentColoring[SuccNum]);
+      PendingSuccColors.insert(Updated[SuccNum]);
+    }
+
+    if (ReservedSuccColors.size() == 1 && PendingSuccColors.size() == 1) {
+      Updated[Num] = *ReservedSuccColors.begin();
+    } else {
+      // TODO: Attribute new colors depending on color
+      // combination of children.
+      Updated[Num] = NextNonReservedID++;
+    }
+  }
+  CurrentColoring = Updated;
+}
+
+
+/// Walk the SUs in index order and make sure a non-reserved color only
+/// covers consecutive SUs. When a non-reserved color reappears after a run
+/// of that color has already ended, the SU starting the new run gets a fresh
+/// color and the following SUs of that run copy the color of the SU just
+/// before them.
+void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
+ unsigned DAGSize = DAG->SUnits.size();
+ unsigned PreviousColor;
+ // Colors whose run has already ended.
+ std::set<unsigned> SeenColors;
+
+ if (DAGSize <= 1)
+ return;
+
+ PreviousColor = CurrentColoring[0];
+
+ for (unsigned i = 1, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ // Read before any modification below, so PreviousColor always tracks
+ // the colors as they were on entry.
+ unsigned CurrentColor = CurrentColoring[i];
+ unsigned PreviousColorSave = PreviousColor;
+ assert(i == SU->NodeNum);
+
+ if (CurrentColor != PreviousColor)
+ SeenColors.insert(PreviousColor);
+ PreviousColor = CurrentColor;
+
+ // Colors up to DAGSize are reserved ones and are left alone.
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ // First run of this color: nothing to fix.
+ if (SeenColors.find(CurrentColor) == SeenColors.end())
+ continue;
+
+ // Start of a repeated run gets a new ID; the rest of the run follows
+ // the (already rewritten) color of the previous SU.
+ if (PreviousColorSave != CurrentColor)
+ CurrentColoring[i] = NextNonReservedID++;
+ else
+ CurrentColoring[i] = CurrentColoring[i-1];
+ }
+}
+
+/// Merge SUs that either have no predecessor or are low latency into the
+/// group of their successors, provided all successors share one color.
+/// Only SUs with a non-reserved color are considered.
+void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
+  const unsigned NumSUs = DAG->SUnits.size();
+
+  for (unsigned Idx : DAG->BottomUpIndex2SU) {
+    SUnit *Node = &DAG->SUnits[Idx];
+
+    if (CurrentColoring[Node->NodeNum] <= (int)NumSUs)
+      continue;
+
+    // No predecessor: Vgpr constant loading.
+    // Low latency instructions usually have a predecessor (the address)
+    const bool HasPreds = Node->Preds.size() > 0;
+    if (HasPreds && !DAG->IsLowLatencySU[Node->NodeNum])
+      continue;
+
+    // Gather the colors of all strong successors inside the DAG.
+    std::set<unsigned> SuccColors;
+    for (SDep &Edge : Node->Succs) {
+      SUnit *Succ = Edge.getSUnit();
+      if (!Edge.isWeak() && Succ->NodeNum < NumSUs)
+        SuccColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SuccColors.size() == 1)
+      CurrentColoring[Node->NodeNum] = *SuccColors.begin();
+  }
+}
+
+/// Merge every SU with a non-reserved color into the group of its
+/// successors when all of its successors share one color.
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
+  const unsigned NumSUs = DAG->SUnits.size();
+
+  for (unsigned Idx : DAG->BottomUpIndex2SU) {
+    SUnit *Node = &DAG->SUnits[Idx];
+
+    if (CurrentColoring[Node->NodeNum] <= (int)NumSUs)
+      continue;
+
+    // Gather the colors of all strong successors inside the DAG.
+    std::set<unsigned> SuccColors;
+    for (SDep &Edge : Node->Succs) {
+      SUnit *Succ = Edge.getSUnit();
+      if (!Edge.isWeak() && Succ->NodeNum < NumSUs)
+        SuccColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SuccColors.size() == 1)
+      CurrentColoring[Node->NodeNum] = *SuccColors.begin();
+  }
+}
+
+/// Merge every SU with a non-reserved color into the group of its
+/// successors when all successors share one color and that color is a
+/// reserved one (not above the number of SUs).
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
+  const unsigned NumSUs = DAG->SUnits.size();
+
+  for (unsigned Idx : DAG->BottomUpIndex2SU) {
+    SUnit *Node = &DAG->SUnits[Idx];
+
+    if (CurrentColoring[Node->NodeNum] <= (int)NumSUs)
+      continue;
+
+    // Gather the colors of all strong successors inside the DAG.
+    std::set<unsigned> SuccColors;
+    for (SDep &Edge : Node->Succs) {
+      SUnit *Succ = Edge.getSUnit();
+      if (!Edge.isWeak() && Succ->NodeNum < NumSUs)
+        SuccColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SuccColors.size() == 1 && *SuccColors.begin() <= NumSUs)
+      CurrentColoring[Node->NodeNum] = *SuccColors.begin();
+  }
+}
+
+/// Merge SUs that are alone in their (non-reserved) color into the group of
+/// their successors when all successors share one different color. The
+/// per-color population is kept up to date while merging.
+void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
+  const unsigned NumSUs = DAG->SUnits.size();
+  std::map<unsigned, unsigned> Population;
+
+  // Count how many SUs carry each color. operator[] value-initializes a
+  // missing entry to 0, so a plain increment handles both cases.
+  for (unsigned Idx : DAG->BottomUpIndex2SU) {
+    SUnit *Node = &DAG->SUnits[Idx];
+    const unsigned NodeColor = CurrentColoring[Node->NodeNum];
+    ++Population[NodeColor];
+  }
+
+  for (unsigned Idx : DAG->BottomUpIndex2SU) {
+    SUnit *Node = &DAG->SUnits[Idx];
+    const unsigned NodeColor = CurrentColoring[Node->NodeNum];
+
+    if (CurrentColoring[Node->NodeNum] <= (int)NumSUs)
+      continue;
+
+    if (Population[NodeColor] > 1)
+      continue;
+
+    // Gather the colors of all strong successors inside the DAG.
+    std::set<unsigned> SuccColors;
+    for (SDep &Edge : Node->Succs) {
+      SUnit *Succ = Edge.getSUnit();
+      if (!Edge.isWeak() && Succ->NodeNum < NumSUs)
+        SuccColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+
+    if (SuccColors.size() != 1 || *SuccColors.begin() == NodeColor)
+      continue;
+
+    --Population[NodeColor];
+    CurrentColoring[Node->NodeNum] = *SuccColors.begin();
+    ++Population[*SuccColors.begin()];
+  }
+}
+
+/// Placeholder: currently does nothing.
+void SIScheduleBlockCreator::cutHugeBlocks() {
+ // TODO
+}
+
+/// Put all SUs that have a non-reserved color and no strong successor
+/// inside the DAG into one common, freshly allocated group.
+void SIScheduleBlockCreator::regroupNoUserInstructions() {
+  const unsigned NumSUs = DAG->SUnits.size();
+  const int SharedGroup = NextNonReservedID++;
+
+  for (unsigned Idx : DAG->BottomUpIndex2SU) {
+    SUnit *Node = &DAG->SUnits[Idx];
+
+    if (CurrentColoring[Node->NodeNum] <= (int)NumSUs)
+      continue;
+
+    // One qualifying successor is enough to keep the SU where it is.
+    bool HasUser = false;
+    for (SDep &Edge : Node->Succs) {
+      SUnit *Succ = Edge.getSUnit();
+      if (!Edge.isWeak() && Succ->NodeNum < NumSUs) {
+        HasUser = true;
+        break;
+      }
+    }
+    if (!HasUser)
+      CurrentColoring[Node->NodeNum] = SharedGroup;
+  }
+}
+
+/// Build CurrentBlocks for \p BlockVariant: reset the coloring state, run
+/// the coloring passes selected by the variant, create one block per color,
+/// connect the blocks according to the SU dependencies and finalize them.
+void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
+ unsigned DAGSize = DAG->SUnits.size();
+ // Maps a color to the index of its block in CurrentBlocks.
+ std::map<unsigned,unsigned> RealID;
+
+ CurrentBlocks.clear();
+ CurrentColoring.clear();
+ CurrentColoring.resize(DAGSize, 0);
+ Node2CurrentBlock.clear();
+
+ // Restore links previous scheduling variant has overridden.
+ DAG->restoreSULinksLeft();
+
+ // Reserved IDs are 1..DAGSize, non-reserved IDs start right after.
+ NextReservedID = 1;
+ NextNonReservedID = DAGSize + 1;
+
+ DEBUG(dbgs() << "Coloring the graph\n");
+
+ if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
+ colorHighLatenciesGroups();
+ else
+ colorHighLatenciesAlone();
+ colorComputeReservedDependencies();
+ colorAccordingToReservedDependencies();
+ colorEndsAccordingToDependencies();
+ if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
+ colorForceConsecutiveOrderInGroup();
+ regroupNoUserInstructions();
+ colorMergeConstantLoadsNextGroup();
+ colorMergeIfPossibleNextGroupOnlyForReserved();
+
+ // Put SUs of same color into same block
+ Node2CurrentBlock.resize(DAGSize, -1);
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ unsigned Color = CurrentColoring[SU->NodeNum];
+ // First SU of this color: create the block. BlockPtrs owns it,
+ // CurrentBlocks only holds the raw pointer.
+ if (RealID.find(Color) == RealID.end()) {
+ int ID = CurrentBlocks.size();
+ BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
+ CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
+ RealID[Color] = ID;
+ }
+ CurrentBlocks[RealID[Color]]->addUnit(SU);
+ Node2CurrentBlock[SU->NodeNum] = RealID[Color];
+ }
+
+ // Build dependencies between blocks.
+ // Weak edges and edges to nodes outside the DAG are ignored; an edge
+ // between SUs of two different blocks links those blocks.
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ int SUID = Node2CurrentBlock[i];
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ if (Node2CurrentBlock[Succ->NodeNum] != SUID)
+ CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+ }
+ for (SDep& PredDep : SU->Preds) {
+ SUnit *Pred = PredDep.getSUnit();
+ if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+ continue;
+ if (Node2CurrentBlock[Pred->NodeNum] != SUID)
+ CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
+ }
+ }
+
+ // Free root and leafs of all blocks to enable scheduling inside them.
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->finalizeUnits();
+ }
+ DEBUG(
+ dbgs() << "Blocks created:\n\n";
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ }
+ );
+}
+
+// Two functions taken from Codegen/MachineScheduler.cpp
+
+/// Non-const version.
+/// Advance \p I past consecutive debug-value instructions and return the
+/// first position that is either \p End or not a debug value.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+            MachineBasicBlock::const_iterator End) {
+  while (I != End && I->isDebugValue())
+    ++I;
+  return I;
+}
+
+/// Compute a topological order of CurrentBlocks. Fills TopDownBlock2Index
+/// (block -> position), TopDownIndex2Block (position -> block) and
+/// BottomUpIndex2Block (the reverse of TopDownIndex2Block).
+void SIScheduleBlockCreator::topologicalSort() {
+  const unsigned NumBlocks = CurrentBlocks.size();
+  std::vector<int> Pending;
+
+  DEBUG(dbgs() << "Topological Sort\n");
+
+  Pending.reserve(NumBlocks);
+  TopDownIndex2Block.resize(NumBlocks);
+  TopDownBlock2Index.resize(NumBlocks);
+  BottomUpIndex2Block.resize(NumBlocks);
+
+  // TopDownBlock2Index temporarily holds the number of not yet numbered
+  // successors; blocks without successors seed the work list.
+  for (unsigned B = 0; B != NumBlocks; ++B) {
+    const unsigned OutDegree = CurrentBlocks[B]->getSuccs().size();
+    TopDownBlock2Index[B] = OutDegree;
+    if (OutDegree == 0)
+      Pending.push_back(B);
+  }
+
+  // Hand out positions from the back; a predecessor becomes eligible once
+  // all of its successors have a position.
+  int NextIndex = NumBlocks;
+  while (!Pending.empty()) {
+    const int Cur = Pending.back();
+    Pending.pop_back();
+    --NextIndex;
+    TopDownBlock2Index[Cur] = NextIndex;
+    TopDownIndex2Block[NextIndex] = Cur;
+    for (SIScheduleBlock *Pred : CurrentBlocks[Cur]->getPreds()) {
+      if (--TopDownBlock2Index[Pred->getID()] == 0)
+        Pending.push_back(Pred->getID());
+    }
+  }
+
+#ifndef NDEBUG
+  // Check correctness of the ordering.
+  for (unsigned B = 0; B != NumBlocks; ++B) {
+    for (SIScheduleBlock *Pred : CurrentBlocks[B]->getPreds()) {
+      assert(TopDownBlock2Index[B] > TopDownBlock2Index[Pred->getID()] &&
+             "Wrong Top Down topological sorting");
+    }
+  }
+#endif
+
+  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
+                                         TopDownIndex2Block.rend());
+}
+
+/// Schedule the instructions inside every block of CurrentBlocks.
+///
+/// Three phases: (1) a fast schedule of each block, (2) the machine
+/// instructions are physically moved so that each block occupies a
+/// contiguous range, in top-down block order, and each block is given its
+/// final schedule, (3) the instructions are moved back to their previous
+/// positions. LiveIntervals is updated on every move.
+void SIScheduleBlockCreator::scheduleInsideBlocks() {
+ unsigned DAGSize = CurrentBlocks.size();
+
+ DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+
+ // We do schedule a valid scheduling such that a Block corresponds
+ // to a range of instructions.
+ DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->fastSchedule();
+ }
+
+ // Note: the following code, and the part restoring previous position
+ // is by far the most expensive operation of the Scheduler.
+
+ // Do not update CurrentTop.
+ MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
+ // PosOld[k] / PosNew[k]: the k-th processed instruction, and the position
+ // recorded for it after processing. Used below to undo the moves.
+ std::vector<MachineBasicBlock::iterator> PosOld;
+ std::vector<MachineBasicBlock::iterator> PosNew;
+ PosOld.reserve(DAG->SUnits.size());
+ PosNew.reserve(DAG->SUnits.size());
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ int BlockIndice = TopDownIndex2Block[i];
+ SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+ std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+ for (SUnit* SU : SUs) {
+ MachineInstr *MI = SU->getInstr();
+ MachineBasicBlock::iterator Pos = MI;
+ PosOld.push_back(Pos);
+ // Already in place: just step the insertion point past it (and past
+ // any debug values that follow).
+ if (&*CurrentTopFastSched == MI) {
+ PosNew.push_back(Pos);
+ CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
+ DAG->getCurrentBottom());
+ } else {
+ // Update the instruction stream.
+ DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
+
+ // Update LiveIntervals.
+ // Note: Moving all instructions and calling handleMove every time
+ // is the most cpu intensive operation of the scheduler.
+ // It would gain a lot if there was a way to recompute the
+ // LiveIntervals for the entire scheduling region.
+ DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
+ PosNew.push_back(CurrentTopFastSched);
+ }
+ }
+ }
+
+ // Now we have Block of SUs == Block of MI.
+ // We do the final schedule for the instructions inside the block.
+ // The property that all the SUs of the Block are grouped together as MI
+ // is used for correct reg usage tracking.
+ // NOTE(review): dereferencing SUs.begin()/rbegin() assumes no block is
+ // empty - confirm.
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ std::vector<SUnit*> SUs = Block->getScheduledUnits();
+ Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
+ }
+
+ DEBUG(dbgs() << "Restoring MI Pos\n");
+ // Restore old ordering (which prevents a LIS->handleMove bug).
+ // The moves are undone in reverse order of how they were made.
+ for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
+ MachineBasicBlock::iterator POld = PosOld[i-1];
+ MachineBasicBlock::iterator PNew = PosNew[i-1];
+ if (PNew != POld) {
+ // Update the instruction stream.
+ DAG->getBB()->splice(POld, DAG->getBB(), PNew);
+
+ // Update LiveIntervals.
+ DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
+ }
+ }
+
+ DEBUG(
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ }
+ );
+}
+
+/// Compute Depth and Height for every block. Depth is 0 for a block without
+/// predecessors, otherwise one more than the largest predecessor depth;
+/// Height is the same measure over successors. Blocks are visited in
+/// top-down order for Depth and bottom-up order for Height.
+void SIScheduleBlockCreator::fillStats() {
+  const unsigned NumBlocks = CurrentBlocks.size();
+
+  for (unsigned Pos = 0; Pos != NumBlocks; ++Pos) {
+    SIScheduleBlock *Block = CurrentBlocks[TopDownIndex2Block[Pos]];
+    // With no predecessors the loop is skipped and Depth stays 0.
+    unsigned Depth = 0;
+    for (SIScheduleBlock *Pred : Block->getPreds()) {
+      if (Depth < Pred->Depth + 1)
+        Depth = Pred->Depth + 1;
+    }
+    Block->Depth = Depth;
+  }
+
+  for (unsigned Pos = 0; Pos != NumBlocks; ++Pos) {
+    SIScheduleBlock *Block = CurrentBlocks[BottomUpIndex2Block[Pos]];
+    // With no successors the loop is skipped and Height stays 0.
+    unsigned Height = 0;
+    for (SIScheduleBlock *Succ : Block->getSuccs()) {
+      if (Height < Succ->Height + 1)
+        Height = Succ->Height + 1;
+    }
+    Block->Height = Height;
+  }
+}
+
+// SIScheduleBlockScheduler //
+
+/// Build the block-level scheduler state for \p BlocksStruct and immediately
+/// compute the block order: the constructor ends by repeatedly picking a
+/// ready block until none is left, recording the order in BlocksScheduled.
+///
+/// \param DAG           The scheduling DAG the blocks belong to.
+/// \param Variant       Heuristic variant used by pickBlock().
+/// \param BlocksStruct  Blocks plus their topological index tables.
+SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                                                   SISchedulerBlockSchedulerVariant Variant,
+                                                   SIScheduleBlocks BlocksStruct) :
+  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
+  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
+  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
+
+  // Fill the usage of every output
+  // Warning: while by construction we always have a link between two blocks
+  // when one needs a result from the other, the number of users of an output
+  // is not the sum of child blocks having as input the same virtual register.
+  // Here is an example. A produces x and y. B eats x and produces x'.
+  // C eats x' and y. The register coalescer may have attributed the same
+  // virtual register to x and x'.
+  // To count accurately, we do a topological sort. In case the register is
+  // found for several parents, we increment the usage of the one with the
+  // highest topological index.
+  LiveOutRegsNumUsages.resize(Blocks.size());
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    for (unsigned Reg : Block->getInRegs()) {
+      bool Found = false;
+      int topoInd = -1;
+      for (SIScheduleBlock *Pred : Block->getPreds()) {
+        // Bind by reference: copying the whole set for every
+        // (register, predecessor) pair is needless work.
+        const std::set<unsigned> &PredOutRegs = Pred->getOutRegs();
+        if (PredOutRegs.find(Reg) == PredOutRegs.end())
+          continue;
+
+        Found = true;
+        if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()])
+          topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
+      }
+
+      if (!Found)
+        continue;
+
+      // Credit the producer with the highest topological index.
+      // operator[] value-initializes a missing counter to 0.
+      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
+      ++LiveOutRegsNumUsages[PredID][Reg];
+    }
+  }
+
+  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
+  BlockNumPredsLeft.resize(Blocks.size());
+  BlockNumSuccsLeft.resize(Blocks.size());
+
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    BlockNumPredsLeft[i] = Block->getPreds().size();
+    BlockNumSuccsLeft[i] = Block->getSuccs().size();
+  }
+
+#ifndef NDEBUG
+  // The per-block vectors above are indexed by block ID, which therefore
+  // has to match the position in Blocks.
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    assert(Block->getID() == i);
+  }
+#endif
+
+  std::set<unsigned> InRegs = DAG->getInRegs();
+  addLiveRegs(InRegs);
+
+  // Fill LiveRegsConsumers for regs that were already
+  // defined before scheduling.
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    for (unsigned Reg : Block->getInRegs()) {
+      bool Found = false;
+      for (SIScheduleBlock *Pred : Block->getPreds()) {
+        const std::set<unsigned> &PredOutRegs = Pred->getOutRegs();
+        if (PredOutRegs.find(Reg) != PredOutRegs.end()) {
+          Found = true;
+          break;
+        }
+      }
+
+      // No predecessor block produces Reg: count this block as a consumer.
+      if (!Found) {
+        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
+          LiveRegsConsumers[Reg] = 1;
+        else
+          ++LiveRegsConsumers[Reg];
+      }
+    }
+  }
+
+  // Blocks without predecessors are ready from the start.
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    if (BlockNumPredsLeft[i] == 0) {
+      ReadyBlocks.push_back(Block);
+    }
+  }
+
+  while (SIScheduleBlock *Block = pickBlock()) {
+    BlocksScheduled.push_back(Block);
+    blockScheduled(Block);
+  }
+
+  DEBUG(
+    dbgs() << "Block Order:";
+    for (SIScheduleBlock* Block : BlocksScheduled) {
+      dbgs() << ' ' << Block->getID();
+    }
+  );
+}
+
+/// Compare \p TryCand against the current best \p Cand using the latency
+/// oriented criteria, in priority order. Returns true as soon as one
+/// criterion reports a decision (or when there is no valid \p Cand yet).
+bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
+                                                   SIBlockSchedCandidate &TryCand) {
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Short-circuit evaluation keeps the criteria in priority order and stops
+  // at the first one that decides.
+  return
+      // Try to hide high latencies.
+      tryLess(TryCand.LastPosHighLatParentScheduled,
+              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency) ||
+      // Schedule high latencies early so you can hide them better.
+      tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+                 TryCand, Cand, Latency) ||
+      (TryCand.IsHighLatency &&
+       tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) ||
+      tryGreater(TryCand.NumHighLatencySuccessors,
+                 Cand.NumHighLatencySuccessors,
+                 TryCand, Cand, Successor);
+}
+
+/// Compare \p TryCand against the current best \p Cand using the register
+/// usage oriented criteria, in priority order. Returns true as soon as one
+/// criterion reports a decision (or when there is no valid \p Cand yet).
+bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+                                                    SIBlockSchedCandidate &TryCand) {
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Short-circuit evaluation keeps the criteria in priority order and stops
+  // at the first one that decides.
+  return
+      // First only whether VGPR usage grows at all...
+      tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+              TryCand, Cand, RegUsage) ||
+      tryGreater(TryCand.NumSuccessors > 0,
+                 Cand.NumSuccessors > 0,
+                 TryCand, Cand, Successor) ||
+      tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth) ||
+      // ...and last by how much.
+      tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+              TryCand, Cand, RegUsage);
+}
+
+/// Choose the next block to schedule among ReadyBlocks and remove it from
+/// that list. Also refreshes the current VGPR/SGPR usage from LiveRegs and
+/// updates the recorded maxima.
+///
+/// \returns the chosen block, or nullptr when no block is ready.
+SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
+  SIBlockSchedCandidate Cand;
+  std::vector<SIScheduleBlock*>::iterator Best;
+  SIScheduleBlock *Block;
+  if (ReadyBlocks.empty())
+    return nullptr;
+
+  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
+                        VregCurrentUsage, SregCurrentUsage);
+  if (VregCurrentUsage > maxVregUsage)
+    maxVregUsage = VregCurrentUsage;
+  // The SGPR maximum must follow the SGPR usage; it used to be compared
+  // against and assigned from the VGPR usage.
+  if (SregCurrentUsage > maxSregUsage)
+    maxSregUsage = SregCurrentUsage;
+  DEBUG(
+    dbgs() << "Picking New Blocks\n";
+    dbgs() << "Available: ";
+    for (SIScheduleBlock* Block : ReadyBlocks)
+      dbgs() << Block->getID() << ' ';
+    dbgs() << "\nCurrent Live:\n";
+    for (unsigned Reg : LiveRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+    dbgs() << '\n';
+    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+  );
+
+  Cand.Block = nullptr;
+  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+       E = ReadyBlocks.end(); I != E; ++I) {
+    // Describe the block as a candidate.
+    SIBlockSchedCandidate TryCand;
+    TryCand.Block = *I;
+    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
+    TryCand.VGPRUsageDiff =
+      checkRegUsageImpact(TryCand.Block->getInRegs(),
+                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
+    TryCand.NumHighLatencySuccessors =
+      TryCand.Block->getNumHighLatencySuccessors();
+    // Clamped at 0 when the high latency parent was scheduled before the
+    // last position that was waited on.
+    TryCand.LastPosHighLatParentScheduled =
+      (unsigned int) std::max<int> (0,
+         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
+           LastPosWaitedHighLatency);
+    TryCand.Height = TryCand.Block->Height;
+    // Try not to increase VGPR usage too much, else we may spill.
+    if (VregCurrentUsage > 120 ||
+        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
+      // Register usage first; latency only as a tie-breaker, and not at
+      // all for the BlockRegUsage variant.
+      if (!tryCandidateRegUsage(Cand, TryCand) &&
+          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
+        tryCandidateLatency(Cand, TryCand);
+    } else {
+      if (!tryCandidateLatency(Cand, TryCand))
+        tryCandidateRegUsage(Cand, TryCand);
+    }
+    if (TryCand.Reason != NoCand) {
+      Cand.setBest(TryCand);
+      Best = I;
+      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+                   << getReasonStr(Cand.Reason) << '\n');
+    }
+  }
+
+  DEBUG(
+    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+    dbgs() << "Is a block with high latency instruction: "
+      << (Cand.IsHighLatency ? "yes\n" : "no\n");
+    dbgs() << "Position of last high latency dependency: "
+           << Cand.LastPosHighLatParentScheduled << '\n';
+    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
+    dbgs() << '\n';
+  );
+
+  Block = Cand.Block;
+  ReadyBlocks.erase(Best);
+  return Block;
+}
+
+// Tracking of currently alive registers to determine VGPR Usage.
+
+/// Add every virtual register of \p Regs to the live set. Registers that
+/// are not virtual are ignored.
+void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
+  for (unsigned Candidate : Regs) {
+    // For now only track virtual registers.
+    if (TargetRegisterInfo::isVirtualRegister(Candidate)) {
+      // insert() is a no-op when the register is already live.
+      (void) LiveRegs.insert(Candidate);
+    }
+  }
+}
+
+/// Consume one use of every register in \p Regs: its consumer count is
+/// decremented and the register leaves the live set when the count reaches
+/// zero. Each register is asserted to be live with at least one consumer
+/// left. \p Block is not read here.
+void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
+                                                std::set<unsigned> &Regs) {
+  for (unsigned Consumed : Regs) {
+    std::set<unsigned>::iterator LivePos = LiveRegs.find(Consumed);
+    assert (LivePos != LiveRegs.end() && // Reg must be live.
+            LiveRegsConsumers.find(Consumed) != LiveRegsConsumers.end() &&
+            LiveRegsConsumers[Consumed] >= 1);
+
+    auto &Remaining = LiveRegsConsumers[Consumed];
+    --Remaining;
+    if (Remaining == 0)
+      LiveRegs.erase(LivePos);
+  }
+}
+
+/// After \p Parent has been scheduled, decrement the pending predecessor
+/// count of each successor block, queue the ones that became ready, and
+/// record the current position for successors of a high latency block.
+void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
+  for (SIScheduleBlock *Succ : Parent->getSuccs()) {
+    const unsigned SuccID = Succ->getID();
+
+    if (--BlockNumPredsLeft[SuccID] == 0)
+      ReadyBlocks.push_back(Succ);
+
+    // TODO: Improve check. When the dependency between the high latency
+    // instructions and the instructions of the other blocks are WAR or WAW
+    // there will be no wait triggered. We would like these cases to not
+    // update LastPosHighLatencyParentScheduled.
+    if (Parent->isHighLatencyBlock())
+      LastPosHighLatencyParentScheduled[SuccID] = NumBlockScheduled;
+  }
+}
+
+/// Update the scheduler state after \p Block has been picked: consume its
+/// inputs, make its outputs live, release its successors, register how many
+/// consumers each output has, and advance the high latency wait position
+/// and the scheduled block counter.
+void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
+  decreaseLiveRegs(Block, Block->getInRegs());
+  addLiveRegs(Block->getOutRegs());
+  releaseBlockSuccs(Block);
+
+  for (const auto &Usage : LiveOutRegsNumUsages[Block->getID()]) {
+    const unsigned Reg = Usage.first;
+    const unsigned NumUsers = Usage.second;
+
+    auto Known = LiveRegsConsumers.find(Reg);
+    if (Known == LiveRegsConsumers.end()) {
+      LiveRegsConsumers[Reg] = NumUsers;
+    } else {
+      // An existing entry must have been fully consumed already.
+      assert(LiveRegsConsumers[Reg] == 0);
+      LiveRegsConsumers[Reg] += NumUsers;
+    }
+  }
+
+  if (LastPosHighLatencyParentScheduled[Block->getID()] >
+        (unsigned)LastPosWaitedHighLatency)
+    LastPosWaitedHighLatency =
+      LastPosHighLatencyParentScheduled[Block->getID()];
+  ++NumBlockScheduled;
+}
+
+// Compute, for every register pressure set, the pressure change that
+// scheduling a block with these InRegs and OutRegs would cause. The result is
+// indexed by pressure set ID.
+std::vector<int>
+SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
+                                              std::set<unsigned> &OutRegs) {
+  std::vector<int> DiffSetPressure(DAG->getTRI()->getNumRegPressureSets(), 0);
+
+  // Inputs lower the pressure, unless more than one consumer is still
+  // recorded for the register. Only virtual registers are tracked for now.
+  for (unsigned Reg : InRegs) {
+    if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+        LiveRegsConsumers[Reg] > 1)
+      continue;
+    for (PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+         PSetI.isValid(); ++PSetI)
+      DiffSetPressure[*PSetI] -= PSetI.getWeight();
+  }
+
+  // Outputs raise the pressure. Only virtual registers are tracked for now.
+  for (unsigned Reg : OutRegs) {
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    for (PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+         PSetI.isValid(); ++PSetI)
+      DiffSetPressure[*PSetI] += PSetI.getWeight();
+  }
+
+  return DiffSetPressure;
+}
+
+// SIScheduler //
+
+// Build the blocks for BlockVariant, schedule them with ScheduleVariant, and
+// return the resulting SU order (as node numbers) together with the maximum
+// SGPR/VGPR usage reported by the block scheduler.
+struct SIScheduleBlockResult
+SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
+  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
+  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
+  struct SIScheduleBlockResult Res;
+
+  // Flatten the scheduled blocks into a single list of node numbers.
+  for (SIScheduleBlock *Block : Scheduler.getBlocks())
+    for (SUnit *SU : Block->getScheduledUnits())
+      Res.SUs.push_back(SU->NodeNum);
+
+  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
+  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
+  return Res;
+}
+
+// SIScheduleDAGMI //
+
+// Sets up the base scheduler with a GenericScheduler strategy and caches the
+// SI-specific instruction/register info and the VGPR/SGPR pressure set IDs.
+SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
+  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)),
+  SITII(static_cast<const SIInstrInfo*>(TII)),
+  SITRI(static_cast<const SIRegisterInfo*>(TRI)),
+  VGPRSetID(SITRI->getVGPRPressureSet()),
+  SGPRSetID(SITRI->getSGPRPressureSet()) {}
+
+SIScheduleDAGMI::~SIScheduleDAGMI() = default;
+
+// Code adapted from scheduleDAG.cpp
+// Sorts the SUs topologically and records the resulting order in both
+// directions: TopDownIndex2SU and BottomUpIndex2SU.
+void SIScheduleDAGMI::topologicalSort() {
+  Topo.InitDAGTopologicalSorting();
+
+  TopDownIndex2SU.assign(Topo.begin(), Topo.end());
+  BottomUpIndex2SU.assign(Topo.rbegin(), Topo.rend());
+}
+
+// Move low latencies further from their user without
+// increasing SGPR usage (in general).
+// This is to be replaced by a better pass that would
+// take into account SGPR usage (based on VGPR usage
+// and the corresponding wavefront count), that would
+// try to merge groups of loads if it makes sense, etc.
+//
+// Walks ScheduledSUnits in order and hoists each low latency instruction (and
+// each COPY feeding one) as early as its predecessors allow, keeping
+// ScheduledSUnitsInv (node number -> position) in sync.
+void SIScheduleDAGMI::moveLowLatencies() {
+  unsigned DAGSize = SUnits.size();
+  int LastLowLatencyUser = -1;
+  int LastLowLatencyPos = -1;
+
+  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[ScheduledSUnits[i]];
+    bool IsLowLatencyUser = false;
+    unsigned MinPos = 0;
+
+    // MinPos ends up just after the latest scheduled predecessor.
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      // Nodes numbered outside SUnits are DAG boundary nodes: they have no
+      // slot in ScheduledSUnitsInv and may carry no MachineInstr, so skip
+      // them before looking at their instruction.
+      if (Pred->NodeNum >= DAGSize)
+        continue;
+      if (SITII->isLowLatencyInstruction(*Pred->getInstr()))
+        IsLowLatencyUser = true;
+      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
+      if (PredPos >= MinPos)
+        MinPos = PredPos + 1;
+    }
+
+    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
+      // Target the slot right after the last low latency user/instruction,
+      // but never before one of SU's own predecessors.
+      unsigned BestPos = LastLowLatencyUser + 1;
+      if ((int)BestPos <= LastLowLatencyPos)
+        BestPos = LastLowLatencyPos + 1;
+      if (BestPos < MinPos)
+        BestPos = MinPos;
+      if (BestPos < i) {
+        // Shift [BestPos, i) one slot later and drop SU into BestPos.
+        for (unsigned u = i; u > BestPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[BestPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
+      }
+      LastLowLatencyPos = BestPos;
+      if (IsLowLatencyUser)
+        LastLowLatencyUser = BestPos;
+    } else if (IsLowLatencyUser) {
+      LastLowLatencyUser = i;
+    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
+      // Also move the COPY instructions on which low latency instructions
+      // depend.
+      bool CopyForLowLat = false;
+      for (SDep& SuccDep : SU->Succs) {
+        SUnit *Succ = SuccDep.getSUnit();
+        // Same boundary-node guard as for predecessors: a successor outside
+        // SUnits may have no MachineInstr to inspect.
+        if (Succ->NodeNum >= DAGSize)
+          continue;
+        if (SITII->isLowLatencyInstruction(*Succ->getInstr()))
+          CopyForLowLat = true;
+      }
+      if (!CopyForLowLat)
+        continue;
+      if (MinPos < i) {
+        // Shift [MinPos, i) one slot later and drop the COPY into MinPos.
+        for (unsigned u = i; u > MinPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[MinPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
+      }
+    }
+  }
+}
+
+// Reset every SUnit to its unscheduled state: clear isScheduled and restore
+// the four "links left" counters from SUnitsLinksBackup.
+void SIScheduleDAGMI::restoreSULinksLeft() {
+  const unsigned NumSUs = SUnits.size();
+  for (unsigned Idx = 0; Idx != NumSUs; ++Idx) {
+    SUnit &SU = SUnits[Idx];
+    const SUnit &Saved = SUnitsLinksBackup[Idx];
+
+    SU.isScheduled = false;
+    SU.WeakPredsLeft = Saved.WeakPredsLeft;
+    SU.NumPredsLeft = Saved.NumPredsLeft;
+    SU.WeakSuccsLeft = Saved.WeakSuccsLeft;
+    SU.NumSuccsLeft = Saved.NumSuccsLeft;
+  }
+}
+
+// Compute the VGPR and SGPR usage of the registers in [First, End).
+// Both outputs are reset to zero first. For each virtual register, its weight
+// in the VGPR (resp. SGPR) pressure set is added to VgprUsage (resp.
+// SgprUsage). Non-virtual registers are ignored for now.
+template<typename IteratorT> void
+SIScheduleDAGMI::fillVgprSgprCost(IteratorT First, IteratorT End,
+                                  unsigned &VgprUsage, unsigned &SgprUsage) {
+  VgprUsage = 0;
+  SgprUsage = 0;
+  for (; First != End; ++First) {
+    const unsigned Reg = *First;
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    for (PSetIterator PSetI = MRI.getPressureSets(Reg); PSetI.isValid();
+         ++PSetI) {
+      const unsigned SetID = *PSetI;
+      if (SetID == VGPRSetID)
+        VgprUsage += PSetI.getWeight();
+      else if (SetID == SGPRSetID)
+        SgprUsage += PSetI.getWeight();
+    }
+  }
+}
+
+// Entry point of the SI scheduler for one region.
+// Builds the DAG with register pressure, collects per-SU latency stats, runs
+// one or more block creation/scheduling variants and keeps the one with the
+// lowest maximum VGPR usage, post-processes it with moveLowLatencies(), and
+// finally emits the chosen order through scheduleMI().
+void SIScheduleDAGMI::schedule()
+{
+ SmallVector<SUnit*, 8> TopRoots, BotRoots;
+ SIScheduleBlockResult Best, Temp;
+ DEBUG(dbgs() << "Preparing Scheduling\n");
+
+ buildDAGWithRegPressure();
+ DEBUG(
+ for(SUnit& SU : SUnits)
+ SU.dumpAll(this)
+ );
+
+ topologicalSort();
+ findRootsAndBiasEdges(TopRoots, BotRoots);
+ // We reuse several ScheduleDAGMI and ScheduleDAGMILive
+ // functions, but to make them happy we must initialize
+ // the default Scheduler implementation (even if we do not
+ // run it)
+ SchedImpl->initialize(this);
+ initQueues(TopRoots, BotRoots);
+
+ // Fill some stats to help scheduling.
+
+ // Snapshot of the SUnits, read back by restoreSULinksLeft().
+ SUnitsLinksBackup = SUnits;
+ IsLowLatencySU.clear();
+ LowLatencyOffset.clear();
+ IsHighLatencySU.clear();
+
+ IsLowLatencySU.resize(SUnits.size(), 0);
+ LowLatencyOffset.resize(SUnits.size(), 0);
+ IsHighLatencySU.resize(SUnits.size(), 0);
+
+ // Classify each SU as low latency (recording its memory offset when it can
+ // be determined) or high latency; an SU is never flagged as both.
+ for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ unsigned BaseLatReg;
+ int64_t OffLatReg;
+ if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
+ IsLowLatencySU[i] = 1;
+ if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
+ TRI))
+ LowLatencyOffset[i] = OffLatReg;
+ } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
+ IsHighLatencySU[i] = 1;
+ }
+
+ // The default variant is always tried first.
+ SIScheduler Scheduler(this);
+ Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
+ SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
+
+ // if VGPR usage is extremely high, try other good performing variants
+ // which could lead to lower VGPR usage
+ if (Best.MaxVGPRUsage > 180) {
+ std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+ { LatenciesAlone, BlockRegUsageLatency },
+// { LatenciesAlone, BlockRegUsage },
+ { LatenciesGrouped, BlockLatencyRegUsage },
+// { LatenciesGrouped, BlockRegUsageLatency },
+// { LatenciesGrouped, BlockRegUsage },
+ { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+// { LatenciesAlonePlusConsecutive, BlockRegUsage }
+ };
+ // Keep a variant only if it strictly lowers the maximum VGPR usage.
+ for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+ Temp = Scheduler.scheduleVariant(v.first, v.second);
+ if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+ Best = Temp;
+ }
+ }
+ // if VGPR usage is still extremely high, we may spill. Try other variants
+ // which are less performing, but that could lead to lower VGPR usage.
+ if (Best.MaxVGPRUsage > 200) {
+ std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+// { LatenciesAlone, BlockRegUsageLatency },
+ { LatenciesAlone, BlockRegUsage },
+// { LatenciesGrouped, BlockLatencyRegUsage },
+ { LatenciesGrouped, BlockRegUsageLatency },
+ { LatenciesGrouped, BlockRegUsage },
+// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+ { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+ { LatenciesAlonePlusConsecutive, BlockRegUsage }
+ };
+ for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+ Temp = Scheduler.scheduleVariant(v.first, v.second);
+ if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+ Best = Temp;
+ }
+ }
+
+ ScheduledSUnits = Best.SUs;
+ ScheduledSUnitsInv.resize(SUnits.size());
+
+ // Build the inverse mapping: node number -> position in the schedule.
+ // NOTE(review): indexes ScheduledSUnits up to SUnits.size(), so this assumes
+ // the chosen result lists every SU exactly once - confirm.
+ for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+ ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
+ }
+
+ moveLowLatencies();
+
+ // Tell the outside world about the result of the scheduling.
+
+ assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+ TopRPTracker.setPos(CurrentTop);
+
+ // Emit the instructions in the chosen order.
+ for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
+ E = ScheduledSUnits.end(); I != E; ++I) {
+ SUnit *SU = &SUnits[*I];
+
+ scheduleMI(SU, true);
+
+ DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
+ }
+
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ DEBUG({
+ unsigned BBNum = begin()->getParent()->getNumber();
+ dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
new file mode 100644
index 000000000000..77c07350d325
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -0,0 +1,493 @@
+//===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace llvm {
+
+// Reason recorded in a scheduling candidate (SISchedulerCandidate::Reason).
+// The enumerator values are also used as bit positions in
+// SISchedulerCandidate::RepeatReasonSet, so they must stay below 32.
+enum SIScheduleCandReason {
+ NoCand, // No candidate selected yet.
+ RegUsage,
+ Latency,
+ Successor,
+ Depth,
+ NodeOrder
+};
+
+// State shared by the SU-level and block-level scheduling candidates.
+struct SISchedulerCandidate {
+  // The reason for this candidate.
+  SIScheduleCandReason Reason = NoCand;
+
+  // Bit set, indexed by SIScheduleCandReason, of the reasons that apply to
+  // multiple candidates.
+  uint32_t RepeatReasonSet = 0;
+
+  SISchedulerCandidate() {}
+
+  // Test whether reason R is flagged in RepeatReasonSet.
+  bool isRepeat(SIScheduleCandReason R) {
+    return (RepeatReasonSet & (1 << R)) != 0;
+  }
+
+  // Flag reason R in RepeatReasonSet.
+  void setRepeat(SIScheduleCandReason R) {
+    RepeatReasonSet = RepeatReasonSet | (1 << R);
+  }
+};
+
+class SIScheduleDAGMI;
+class SIScheduleBlockCreator;
+
+// A group of SUnits that is scheduled internally and then handled as a single
+// unit by SIScheduleBlockScheduler. Tracks its predecessor/successor blocks
+// and its live-in/live-out registers.
+class SIScheduleBlock {
+ SIScheduleDAGMI *DAG;
+ SIScheduleBlockCreator *BC;
+
+ std::vector<SUnit*> SUnits;
+ std::map<unsigned, unsigned> NodeNum2Index;
+ std::vector<SUnit*> TopReadySUs;
+ std::vector<SUnit*> ScheduledSUnits;
+
+ /// The top of the unscheduled zone.
+ IntervalPressure TopPressure;
+ RegPressureTracker TopRPTracker;
+
+ // Pressure: number of said class of registers needed to
+ // store the live virtual and real registers.
+ // We only care about SGPR32 and VGPR32 and only track virtual registers.
+ // Pressure of additional registers required inside the block.
+ std::vector<unsigned> InternalAdditionnalPressure;
+ // Pressure of input and output registers
+ std::vector<unsigned> LiveInPressure;
+ std::vector<unsigned> LiveOutPressure;
+ // Registers required by the block, and outputs.
+ // We only track virtual registers.
+ // Note that some registers are not 32 bits,
+ // and thus the pressure is not equal
+ // to the number of live registers.
+ std::set<unsigned> LiveInRegs;
+ std::set<unsigned> LiveOutRegs;
+
+ bool Scheduled;
+ bool HighLatencyBlock;
+
+ std::vector<unsigned> HasLowLatencyNonWaitedParent;
+
+ // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
+ unsigned ID;
+
+ std::vector<SIScheduleBlock*> Preds; // All blocks predecessors.
+ std::vector<SIScheduleBlock*> Succs; // All blocks successors.
+ unsigned NumHighLatencySuccessors;
+
+public:
+ // Creates an empty, unscheduled, non high latency block.
+ // Note: Height and Depth are not initialized here.
+ SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
+ unsigned ID):
+ DAG(DAG), BC(BC), TopRPTracker(TopPressure), Scheduled(false),
+ HighLatencyBlock(false), ID(ID), NumHighLatencySuccessors(0) {}
+
+ ~SIScheduleBlock() = default;
+
+ unsigned getID() const { return ID; }
+
+ /// Functions for Block construction.
+ void addUnit(SUnit *SU);
+
+ // When all SUs have been added.
+ void finalizeUnits();
+
+ // Add block pred, which has instruction predecessor of SU.
+ void addPred(SIScheduleBlock *Pred);
+ void addSucc(SIScheduleBlock *Succ);
+
+ const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
+ const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
+
+ unsigned Height; // Maximum topdown path length to block without outputs
+ unsigned Depth; // Maximum bottomup path length to block without inputs
+
+ unsigned getNumHighLatencySuccessors() const {
+ return NumHighLatencySuccessors;
+ }
+
+ bool isHighLatencyBlock() { return HighLatencyBlock; }
+
+ // This is an approximation: the cost is the number of SUs in the block.
+ // Ideally it should take into account that some instructions (rcp, etc)
+ // are 4 times slower.
+ int getCost() { return SUnits.size(); }
+
+ // The block Predecessors and Successors must be all registered
+ // before fastSchedule().
+ // Fast schedule with no particular requirement.
+ void fastSchedule();
+
+ // Returns a copy of the scheduled SUs, in scheduling order.
+ std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
+
+ // Complete schedule that will try to minimize reg pressure and
+ // low latencies, and will fill liveins and liveouts.
+ // Needs all MIs to be grouped between BeginBlock and EndBlock.
+ // The MIs can be moved after the scheduling,
+ // it is just used to allow correct track of live registers.
+ void schedule(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock);
+
+ bool isScheduled() { return Scheduled; }
+
+ // Needs the block to be scheduled inside
+ // TODO: find a way to compute it.
+ std::vector<unsigned> &getInternalAdditionnalRegUsage() {
+ return InternalAdditionnalPressure;
+ }
+
+ std::set<unsigned> &getInRegs() { return LiveInRegs; }
+ std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
+
+ void printDebug(bool Full);
+
+private:
+ // Candidate SUnit considered while scheduling inside the block.
+ struct SISchedCandidate : SISchedulerCandidate {
+ // The best SUnit candidate.
+ SUnit *SU = nullptr;
+
+ // Note: the fields below have no default initializer.
+ unsigned SGPRUsage;
+ unsigned VGPRUsage;
+ bool IsLowLatency;
+ unsigned LowLatencyOffset;
+ bool HasLowLatencyNonWaitedParent;
+
+ SISchedCandidate() = default;
+
+ bool isValid() const { return SU; }
+
+ // Copy the status of another candidate without changing policy.
+ // RepeatReasonSet is not copied.
+ void setBest(SISchedCandidate &Best) {
+ assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+ SU = Best.SU;
+ Reason = Best.Reason;
+ SGPRUsage = Best.SGPRUsage;
+ VGPRUsage = Best.VGPRUsage;
+ IsLowLatency = Best.IsLowLatency;
+ LowLatencyOffset = Best.LowLatencyOffset;
+ HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
+ }
+ };
+
+ void undoSchedule();
+
+ void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
+ void releaseSucc(SUnit *SU, SDep *SuccEdge);
+ // InOrOutBlock: restrict to links pointing inside the block (true),
+ // or restrict to links pointing outside the block (false).
+ void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
+
+ void nodeScheduled(SUnit *SU);
+ void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+ void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+ SUnit* pickNode();
+ void traceCandidate(const SISchedCandidate &Cand);
+ void initRegPressure(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock);
+};
+
+// Result of block creation for one variant: the blocks (non-owning pointers)
+// and two index tables.
+// NOTE(review): presumably TopDownIndex2Block maps a topological index to a
+// block ID and TopDownBlock2Index is its inverse - confirm in the .cpp.
+struct SIScheduleBlocks {
+ std::vector<SIScheduleBlock*> Blocks;
+ std::vector<int> TopDownIndex2Block;
+ std::vector<int> TopDownBlock2Index;
+};
+
+// Strategy used by SIScheduleBlockCreator to group SUs into blocks.
+// SIScheduleDAGMI::schedule() tries LatenciesAlone first and the other
+// variants only when VGPR usage is too high.
+enum SISchedulerBlockCreatorVariant {
+ LatenciesAlone,
+ LatenciesGrouped,
+ LatenciesAlonePlusConsecutive
+};
+
+// Partitions the SUnits of a DAG into SIScheduleBlocks by "coloring" them,
+// and keeps the blocks built for each creation variant.
+class SIScheduleBlockCreator {
+ SIScheduleDAGMI *DAG;
+ // unique_ptr handles freeing memory for us.
+ std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
+ // Blocks built so far, keyed by the variant that produced them.
+ std::map<SISchedulerBlockCreatorVariant,
+ SIScheduleBlocks> Blocks;
+ std::vector<SIScheduleBlock*> CurrentBlocks;
+ std::vector<int> Node2CurrentBlock;
+
+ // Topological sort
+ // Maps topological index to the node number.
+ std::vector<int> TopDownIndex2Block;
+ std::vector<int> TopDownBlock2Index;
+ std::vector<int> BottomUpIndex2Block;
+
+ // 0 -> Color not given.
+ // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
+ // Above -> Other groups.
+ int NextReservedID;
+ int NextNonReservedID;
+ std::vector<int> CurrentColoring;
+ std::vector<int> CurrentTopDownReservedDependencyColoring;
+ std::vector<int> CurrentBottomUpReservedDependencyColoring;
+
+public:
+ SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
+ ~SIScheduleBlockCreator();
+
+ // Returns (by value) the blocks for the requested variant.
+ SIScheduleBlocks
+ getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
+
+ bool isSUInBlock(SUnit *SU, unsigned ID);
+
+private:
+ // Give a Reserved color to every high latency.
+ void colorHighLatenciesAlone();
+
+ // Create groups of high latencies with a Reserved color.
+ void colorHighLatenciesGroups();
+
+ // Compute coloring for topdown and bottom traversals with
+ // different colors depending on dependencies on Reserved colors.
+ void colorComputeReservedDependencies();
+
+ // Give color to all non-colored SUs according to Reserved groups dependencies.
+ void colorAccordingToReservedDependencies();
+
+ // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
+ // The new colors are computed according to the dependencies on the other blocks
+ // formed with colorAccordingToReservedDependencies.
+ void colorEndsAccordingToDependencies();
+
+ // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
+ void colorForceConsecutiveOrderInGroup();
+
+ // Merge Constant loads that have all their users into another group to the group.
+ // (TODO: else if all their users depend on the same group, put them there)
+ void colorMergeConstantLoadsNextGroup();
+
+ // Merge SUs that have all their users into another group to the group
+ void colorMergeIfPossibleNextGroup();
+
+ // Merge SUs that have all their users into another group to the group,
+ // but only for Reserved groups.
+ void colorMergeIfPossibleNextGroupOnlyForReserved();
+
+ // Merge SUs that have all their users into another group to the group,
+ // but only if the group is no more than a few SUs.
+ void colorMergeIfPossibleSmallGroupsToNextGroup();
+
+ // Divides Blocks with important size.
+ // Idea of implementation: attribute new colors depending on topdown and
+ // bottom up links to other blocks.
+ void cutHugeBlocks();
+
+ // Put in one group all instructions with no users in this scheduling region
+ // (we'd want these groups be at the end).
+ void regroupNoUserInstructions();
+
+ void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
+
+ void topologicalSort();
+
+ void scheduleInsideBlocks();
+
+ void fillStats();
+};
+
+// Strategy used by SIScheduleBlockScheduler to order the blocks.
+// NOTE(review): the names suggest the priority order of the latency and
+// register usage criteria - confirm against pickBlock().
+enum SISchedulerBlockSchedulerVariant {
+ BlockLatencyRegUsage,
+ BlockRegUsageLatency,
+ BlockRegUsage
+};
+
+// Orders a set of SIScheduleBlocks according to a scheduling variant, while
+// tracking live registers to estimate the maximum SGPR/VGPR usage.
+class SIScheduleBlockScheduler {
+ SIScheduleDAGMI *DAG;
+ SISchedulerBlockSchedulerVariant Variant;
+ std::vector<SIScheduleBlock*> Blocks;
+
+ // Indexed by block ID: register -> number of usages of that live-out.
+ std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
+ std::set<unsigned> LiveRegs;
+ // Num of schedulable unscheduled blocks reading the register.
+ std::map<unsigned, unsigned> LiveRegsConsumers;
+
+ // Indexed by block ID: value of NumBlockScheduled when a high latency
+ // parent of the block was last scheduled.
+ std::vector<unsigned> LastPosHighLatencyParentScheduled;
+ int LastPosWaitedHighLatency;
+
+ std::vector<SIScheduleBlock*> BlocksScheduled;
+ unsigned NumBlockScheduled;
+ std::vector<SIScheduleBlock*> ReadyBlocks;
+
+ unsigned VregCurrentUsage;
+ unsigned SregCurrentUsage;
+
+ // Currently is only approximation.
+ unsigned maxVregUsage;
+ unsigned maxSregUsage;
+
+ // Indexed by block ID.
+ std::vector<unsigned> BlockNumPredsLeft;
+ std::vector<unsigned> BlockNumSuccsLeft;
+
+public:
+ // Note: BlocksStruct is taken by value.
+ SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+ SISchedulerBlockSchedulerVariant Variant,
+ SIScheduleBlocks BlocksStruct);
+ ~SIScheduleBlockScheduler() = default;
+
+ // Returns a copy of the scheduled blocks.
+ std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
+
+ unsigned getVGPRUsage() { return maxVregUsage; }
+ unsigned getSGPRUsage() { return maxSregUsage; }
+
+private:
+ // Candidate block considered by pickBlock().
+ struct SIBlockSchedCandidate : SISchedulerCandidate {
+ // The best Block candidate.
+ SIScheduleBlock *Block = nullptr;
+
+ // Note: the fields below have no default initializer.
+ bool IsHighLatency;
+ int VGPRUsageDiff;
+ unsigned NumSuccessors;
+ unsigned NumHighLatencySuccessors;
+ unsigned LastPosHighLatParentScheduled;
+ unsigned Height;
+
+ SIBlockSchedCandidate() = default;
+
+ bool isValid() const { return Block; }
+
+ // Copy the status of another candidate without changing policy.
+ // RepeatReasonSet is not copied.
+ void setBest(SIBlockSchedCandidate &Best) {
+ assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+ Block = Best.Block;
+ Reason = Best.Reason;
+ IsHighLatency = Best.IsHighLatency;
+ VGPRUsageDiff = Best.VGPRUsageDiff;
+ NumSuccessors = Best.NumSuccessors;
+ NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
+ LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
+ Height = Best.Height;
+ }
+ };
+
+ bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
+ SIBlockSchedCandidate &TryCand);
+ bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+ SIBlockSchedCandidate &TryCand);
+ // Removes the chosen block from ReadyBlocks and returns it.
+ SIScheduleBlock *pickBlock();
+
+ // Adds the virtual registers of Regs to LiveRegs.
+ void addLiveRegs(std::set<unsigned> &Regs);
+ // Decrements the consumer count of each register in Regs; a register whose
+ // count reaches zero is removed from LiveRegs.
+ void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
+ // Decrements the pending predecessor count of Parent's successors and
+ // queues those that become ready.
+ void releaseBlockSuccs(SIScheduleBlock *Parent);
+ // Updates the scheduler state once Block has been picked.
+ void blockScheduled(SIScheduleBlock *Block);
+
+ // Check register pressure change
+ // by scheduling a block with these LiveIn and LiveOut.
+ // The result is indexed by register pressure set ID.
+ std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
+ std::set<unsigned> &OutRegs);
+
+ void schedule();
+};
+
+// Outcome of one scheduling variant, as returned by
+// SIScheduler::scheduleVariant().
+struct SIScheduleBlockResult {
+ std::vector<unsigned> SUs; // Node numbers of the SUs, in scheduled order.
+ unsigned MaxSGPRUsage;
+ unsigned MaxVGPRUsage;
+};
+
+// Front end that combines a block creation variant with a block scheduling
+// variant for one DAG. The same BlockCreator is used across calls.
+class SIScheduler {
+ SIScheduleDAGMI *DAG;
+ SIScheduleBlockCreator BlockCreator;
+
+public:
+ SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
+
+ ~SIScheduler() = default;
+
+ // Creates the blocks for BlockVariant, schedules them with ScheduleVariant,
+ // and returns the SU order with the maximum SGPR/VGPR usage.
+ struct SIScheduleBlockResult
+ scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+ SISchedulerBlockSchedulerVariant ScheduleVariant);
+};
+
+// ScheduleDAGMILive specialization for SI: schedule() groups the SUs into
+// blocks, tries several block creation/scheduling variants, and keeps the
+// one with the lowest maximum VGPR usage.
+class SIScheduleDAGMI final : public ScheduleDAGMILive {
+ const SIInstrInfo *SITII;
+ const SIRegisterInfo *SITRI;
+
+ // Copy of SUnits taken in schedule(); source for restoreSULinksLeft().
+ std::vector<SUnit> SUnitsLinksBackup;
+
+ // For moveLowLatencies. After all Scheduling variants are tested.
+ // ScheduledSUnits: position -> node number.
+ // ScheduledSUnitsInv: node number -> position.
+ std::vector<unsigned> ScheduledSUnits;
+ std::vector<unsigned> ScheduledSUnitsInv;
+
+ // Pressure set IDs obtained from SIRegisterInfo in the constructor.
+ unsigned VGPRSetID;
+ unsigned SGPRSetID;
+
+public:
+ SIScheduleDAGMI(MachineSchedContext *C);
+
+ ~SIScheduleDAGMI() override;
+
+ // Entry point for the schedule.
+ void schedule() override;
+
+ // To init Block's RPTracker.
+ void initRPTracker(RegPressureTracker &RPTracker) {
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
+ }
+
+ // Accessors exposing state of the base scheduler classes to the block
+ // classes.
+ MachineBasicBlock *getBB() { return BB; }
+ MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
+ MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
+ LiveIntervals *getLIS() { return LIS; }
+ MachineRegisterInfo *getMRI() { return &MRI; }
+ const TargetRegisterInfo *getTRI() { return TRI; }
+ SUnit& getEntrySU() { return EntrySU; }
+ SUnit& getExitSU() { return ExitSU; }
+
+ // Resets isScheduled and restores the preds/succs "left" counters of every
+ // SUnit from SUnitsLinksBackup.
+ void restoreSULinksLeft();
+
+ // Sums the VGPR/SGPR pressure set weights of the virtual registers in
+ // [First, End) into VgprUsage/SgprUsage (both reset to zero first).
+ template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
+ _Iterator End,
+ unsigned &VgprUsage,
+ unsigned &SgprUsage);
+
+ // Returns the RegUnit of every live-in entry reported by RPTracker.
+ std::set<unsigned> getInRegs() {
+ std::set<unsigned> InRegs;
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
+ InRegs.insert(RegMaskPair.RegUnit);
+ }
+ return InRegs;
+ }
+
+ unsigned getVGPRSetID() const { return VGPRSetID; }
+ unsigned getSGPRSetID() const { return SGPRSetID; }
+
+private:
+ void topologicalSort();
+ // After scheduling is done, improve low latency placements.
+ void moveLowLatencies();
+
+public:
+ // Some stats for scheduling inside blocks.
+ // All three are indexed by SU index and filled in schedule().
+ std::vector<unsigned> IsLowLatencySU;
+ std::vector<unsigned> LowLatencyOffset;
+ std::vector<unsigned> IsHighLatencySU;
+ // Topological sort
+ // Maps topological index to the node number.
+ std::vector<int> TopDownIndex2SU;
+ std::vector<int> BottomUpIndex2SU;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
new file mode 100644
index 000000000000..4d2f917278e9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -0,0 +1,304 @@
+//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-optimize-exec-masking"
+
+namespace {
+
+/// Machine function pass that rewrites the exec-mask copy sequences produced
+/// by control flow lowering; see runOnMachineFunction().
+class SIOptimizeExecMasking : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
+    PassRegistry &Registry = *PassRegistry::getPassRegistry();
+    initializeSIOptimizeExecMaskingPass(Registry);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI optimize exec mask operations";
+  }
+
+  // Declares that the pass preserves the CFG.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+// Pass registration under DEBUG_TYPE.
+// NOTE(review): LiveIntervals is declared as a dependency here, but
+// getAnalysisUsage() does not request it - confirm it is needed.
+INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
+ "SI optimize exec mask operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
+ "SI optimize exec mask operations", false, false)
+
+char SIOptimizeExecMasking::ID = 0;
+
+// Exported reference to the pass ID.
+char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
+
+/// If \p MI is a copy from exec (COPY, S_MOV_B64 or S_MOV_B64_term whose
+/// source operand is EXEC), return the register copied to; otherwise return
+/// AMDGPU::NoRegister.
+static unsigned isCopyFromExec(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  const bool IsCopyLike = Opc == AMDGPU::COPY ||
+                          Opc == AMDGPU::S_MOV_B64 ||
+                          Opc == AMDGPU::S_MOV_B64_term;
+  if (!IsCopyLike)
+    return AMDGPU::NoRegister;
+
+  const MachineOperand &Src = MI.getOperand(1);
+  if (!Src.isReg() || Src.getReg() != AMDGPU::EXEC)
+    return AMDGPU::NoRegister;
+
+  return MI.getOperand(0).getReg();
+}
+
+/// If \p MI is a copy to exec (COPY or S_MOV_B64 whose destination operand is
+/// EXEC), return the register copied from; otherwise return
+/// AMDGPU::NoRegister. S_MOV_B64_term must already have been replaced.
+static unsigned isCopyToExec(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+
+  if (Opc == AMDGPU::S_MOV_B64_term)
+    llvm_unreachable("should have been replaced");
+
+  if (Opc == AMDGPU::COPY || Opc == AMDGPU::S_MOV_B64) {
+    const MachineOperand &Dst = MI.getOperand(0);
+    if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC)
+      return MI.getOperand(1).getReg();
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+/// Map a 64-bit scalar logical opcode to its *_SAVEEXEC_B64 counterpart, or
+/// return AMDGPU::INSTRUCTION_LIST_END when \p Opc has none.
+static unsigned getSaveExecOp(unsigned Opc) {
+  // { plain opcode, saveexec opcode }
+  static const unsigned SaveExecPairs[][2] = {
+    { AMDGPU::S_AND_B64,   AMDGPU::S_AND_SAVEEXEC_B64 },
+    { AMDGPU::S_OR_B64,    AMDGPU::S_OR_SAVEEXEC_B64 },
+    { AMDGPU::S_XOR_B64,   AMDGPU::S_XOR_SAVEEXEC_B64 },
+    { AMDGPU::S_ANDN2_B64, AMDGPU::S_ANDN2_SAVEEXEC_B64 },
+    { AMDGPU::S_ORN2_B64,  AMDGPU::S_ORN2_SAVEEXEC_B64 },
+    { AMDGPU::S_NAND_B64,  AMDGPU::S_NAND_SAVEEXEC_B64 },
+    { AMDGPU::S_NOR_B64,   AMDGPU::S_NOR_SAVEEXEC_B64 },
+    { AMDGPU::S_XNOR_B64,  AMDGPU::S_XNOR_SAVEEXEC_B64 },
+  };
+
+  for (const auto &Pair : SaveExecPairs)
+    if (Pair[0] == Opc)
+      return Pair[1];
+
+  return AMDGPU::INSTRUCTION_LIST_END;
+}
+
+// The *_term pseudos are only terminators to get correct spill code placement
+// during register allocation, so turn them back into normal instructions.
+// Only one of these is expected per block.
+// Returns true if MI was one of them and its descriptor was replaced.
+static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
+  unsigned PlainOpc;
+  switch (MI.getOpcode()) {
+  case AMDGPU::S_MOV_B64_term:
+    PlainOpc = AMDGPU::COPY;
+    break;
+  case AMDGPU::S_XOR_B64_term:
+    PlainOpc = AMDGPU::S_XOR_B64;
+    break;
+  case AMDGPU::S_ANDN2_B64_term:
+    PlainOpc = AMDGPU::S_ANDN2_B64;
+    break;
+  default:
+    return false;
+  }
+
+  MI.setDesc(TII.get(PlainOpc));
+  return true;
+}
+
+// Walk MBB backwards from its end. Stop at the first instruction that is
+// either not a terminator or a *_term pseudo (which is converted in place by
+// removeTerminatorBit) and return its position; return rend() if neither is
+// found.
+static MachineBasicBlock::reverse_iterator fixTerminators(
+  const SIInstrInfo &TII,
+  MachineBasicBlock &MBB) {
+  const MachineBasicBlock::reverse_iterator End = MBB.rend();
+  MachineBasicBlock::reverse_iterator Cur = MBB.rbegin();
+
+  while (Cur != End) {
+    if (!Cur->isTerminator() || removeTerminatorBit(TII, *Cur))
+      return Cur;
+    ++Cur;
+  }
+
+  return End;
+}
+
+// Starting at I and moving towards the beginning of MBB, look for a copy from
+// exec within a bounded number of instructions. Returns its position, or
+// MBB.rend() if none is found. TII and CopyToExec are currently unused.
+static MachineBasicBlock::reverse_iterator findExecCopy(
+  const SIInstrInfo &TII,
+  MachineBasicBlock &MBB,
+  MachineBasicBlock::reverse_iterator I,
+  unsigned CopyToExec) {
+  const unsigned InstLimit = 25;
+  const auto End = MBB.rend();
+
+  unsigned Scanned = 0;
+  while (Scanned <= InstLimit && I != End) {
+    if (isCopyFromExec(*I) != AMDGPU::NoRegister)
+      return I;
+    ++I;
+    ++Scanned;
+  }
+
+  return End;
+}
+
+// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
+// report the register as unavailable because a super-register with a lane
+// mask is unavailable.
+//
+// Returns true if Reg is a live-in of at least one successor of MBB.
+static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
+  bool LiveInSomeSucc = false;
+  for (MachineBasicBlock *Succ : MBB.successors()) {
+    if (!Succ->isLiveIn(Reg))
+      continue;
+    LiveInSomeSucc = true;
+    break;
+  }
+
+  return LiveInSomeSucc;
+}
+
+/// For every basic block, turn the terminator-only pseudos back into ordinary
+/// instructions and, where possible, fold the exec-mask sequence emitted by
+/// control flow lowering into a single s_<op>_saveexec_b64.
+/// Always reports the function as modified.
+bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Optimize sequences emitted for control flow lowering. They are originally
+  // emitted as the separate operations because spill code may need to be
+  // inserted for the saved copy of exec.
+  //
+  //     x = copy exec
+  //     z = s_<op>_b64 x, y
+  //     exec = copy z
+  // =>
+  //     x = s_<op>_saveexec_b64 y
+  //
+
+  for (MachineBasicBlock &MBB : MF) {
+    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
+    MachineBasicBlock::reverse_iterator E = MBB.rend();
+    if (I == E)
+      continue;
+
+    unsigned CopyToExec = isCopyToExec(*I);
+    if (CopyToExec == AMDGPU::NoRegister)
+      continue;
+
+    // Scan backwards to find the def.
+    auto CopyToExecInst = &*I;
+    auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
+    if (CopyFromExecInst == E)
+      continue;
+
+    if (isLiveOut(MBB, CopyToExec)) {
+      // The copied register is live out and has a second use in another block.
+      DEBUG(dbgs() << "Exec copy source register is live out\n");
+      continue;
+    }
+
+    unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
+    MachineInstr *SaveExecInst = nullptr;
+    SmallVector<MachineInstr *, 4> OtherUseInsts;
+
+    // Walk forward from just after the copy from exec up to (excluding) the
+    // copy to exec, looking for the single foldable instruction that defines
+    // CopyToExec and collecting later readers of CopyToExec.
+    for (MachineBasicBlock::iterator J
+           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
+         J != JE; ++J) {
+      if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
+        DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
+        // Make sure this is inserted after any VALU ops that may have been
+        // scheduled in between.
+        SaveExecInst = nullptr;
+        break;
+      }
+
+      if (J->modifiesRegister(CopyToExec, TRI)) {
+        if (SaveExecInst) {
+          DEBUG(dbgs() << "Multiple instructions modify "
+                << PrintReg(CopyToExec, TRI) << '\n');
+          SaveExecInst = nullptr;
+          break;
+        }
+
+        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
+        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
+          break;
+
+        if (J->readsRegister(CopyFromExec, TRI)) {
+          SaveExecInst = &*J;
+          DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
+          continue;
+        } else {
+          DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
+          break;
+        }
+      }
+
+      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
+        assert(SaveExecInst != &*J);
+        OtherUseInsts.push_back(&*J);
+      }
+    }
+
+    if (!SaveExecInst)
+      continue;
+
+    DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
+
+    MachineOperand &Src0 = SaveExecInst->getOperand(1);
+    MachineOperand &Src1 = SaveExecInst->getOperand(2);
+
+    MachineOperand *OtherOp = nullptr;
+
+    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
+      OtherOp = &Src1;
+    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
+      // The exec copy is the second source, so the fold needs to swap the
+      // operands. Give up on this block only: nothing has been erased yet,
+      // and the remaining blocks must still be processed.
+      if (!SaveExecInst->isCommutable())
+        continue;
+
+      OtherOp = &Src0;
+    } else
+      llvm_unreachable("unexpected");
+
+    CopyFromExecInst->eraseFromParent();
+
+    auto InsPt = SaveExecInst->getIterator();
+    const DebugLoc &DL = SaveExecInst->getDebugLoc();
+
+    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
+            CopyFromExec)
+      .addReg(OtherOp->getReg());
+    SaveExecInst->eraseFromParent();
+
+    CopyToExecInst->eraseFromParent();
+
+    // The instructions that read the old result now read exec directly.
+    for (MachineInstr *OtherInst : OtherUseInsts) {
+      OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
+                                    AMDGPU::NoSubRegister, *TRI);
+    }
+  }
+
+  return true;
+
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
new file mode 100644
index 000000000000..8c4b24a4504d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -0,0 +1,1476 @@
+//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIRegisterInfo.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+
+using namespace llvm;
+
+ // Command-line switch, off by default. The SGPR spill/restore code in this
+ // file only takes its scalar-memory path when this flag is set and the
+ // subtarget reports scalar store support.
+ static cl::opt<bool> EnableSpillSGPRToSMEM(
+ "amdgpu-spill-sgpr-to-smem",
+ cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
+ cl::init(false));
+
+
+ /// Returns true if \p PSetID occurs in \p PSets, a list of pressure set IDs
+ /// terminated by -1.
+ static bool hasPressureSet(const int *PSets, unsigned PSetID) {
+   const int Wanted = (int)PSetID;
+   for (const int *Cur = PSets; *Cur != -1; ++Cur)
+     if (*Cur == Wanted)
+       return true;
+   return false;
+ }
+
+ /// Sets bit \p PSetID in \p PressureSets if any register unit of \p Reg
+ /// belongs to pressure set \p PSetID.
+ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
+                                          BitVector &PressureSets) const {
+   MCRegUnitIterator Unit(Reg, this);
+   while (Unit.isValid()) {
+     if (hasPressureSet(getRegUnitPressureSets(*Unit), PSetID)) {
+       // One matching unit is enough; no need to look at the rest.
+       PressureSets.set(PSetID);
+       return;
+     }
+     ++Unit;
+   }
+ }
+
+ /// Fills in the SGPR/VGPR pressure-set masks and selects, for each of the two
+ /// register kinds, the pressure set that contains the most register units.
+ SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
+     SGPRPressureSets(getNumRegPressureSets()),
+     VGPRPressureSets(getNumRegPressureSets()) {
+   const unsigned NumSets = getNumRegPressureSets();
+
+   // Start from an out-of-range ID so the assert at the end catches the case
+   // where no suitable set was found.
+   SGPRSetID = NumSets;
+   VGPRSetID = NumSets;
+
+   for (unsigned Set = 0; Set != NumSets; ++Set) {
+     classifyPressureSet(Set, AMDGPU::SGPR0, SGPRPressureSets);
+     classifyPressureSet(Set, AMDGPU::VGPR0, VGPRPressureSets);
+   }
+
+   // Count the register units that belong to each pressure set.
+   std::vector<unsigned> UnitsPerSet(NumSets, 0);
+   const unsigned NumUnits = getNumRegUnits();
+   for (unsigned Unit = 0; Unit != NumUnits; ++Unit)
+     for (const int *PSet = getRegUnitPressureSets(Unit); *PSet != -1; ++PSet)
+       ++UnitsPerSet[*PSet];
+
+   // Keep the largest set of each kind. A set that updates the VGPR choice is
+   // not considered for the SGPR choice.
+   unsigned VGPRMax = 0, SGPRMax = 0;
+   for (unsigned Set = 0; Set != NumSets; ++Set) {
+     const unsigned Units = UnitsPerSet[Set];
+     if (isVGPRPressureSet(Set) && Units > VGPRMax) {
+       VGPRSetID = Set;
+       VGPRMax = Units;
+     } else if (isSGPRPressureSet(Set) && Units > SGPRMax) {
+       SGPRSetID = Set;
+       SGPRMax = Units;
+     }
+   }
+
+   assert(SGPRSetID < NumSets &&
+          VGPRSetID < NumSets);
+ }
+
+ /// Marks \p Reg and every register that aliases it as reserved.
+ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
+   // The 'true' argument makes the iterator include Reg itself.
+   for (MCRegAliasIterator Alias(Reg, this, true); Alias.isValid(); ++Alias)
+     Reserved.set(*Alias);
+ }
+
+ /// Returns the SReg_128 tuple whose first 32-bit SGPR sits four registers below
+ /// the function's SGPR limit rounded down to a multiple of four.
+ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
+     const MachineFunction &MF) const {
+   const unsigned AlignedLimit = alignDown(getMaxNumSGPRs(MF), 4);
+   const unsigned FirstIdx = AlignedLimit - 4;
+   const unsigned FirstReg = AMDGPU::SGPR_32RegClass.getRegister(FirstIdx);
+   return getMatchingSuperReg(FirstReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ }
+
+ /// Picks the 32-bit SGPR used for the private segment wave byte offset,
+ /// placing it next to the private segment buffer register.
+ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+     const MachineFunction &MF) const {
+   const unsigned Limit = getMaxNumSGPRs(MF);
+
+   // When the limit is not a multiple of four, alignment constraints keep the
+   // segment buffer out of (Limit - 4) ... (Limit - 1), which leaves a hole at
+   // the top for the wave offset. Otherwise the buffer occupies exactly that
+   // range and the wave offset goes immediately below it.
+   const bool HasHoleAtTop = (Limit & 3) != 0;
+   const unsigned Idx = HasHoleAtTop ? Limit - 1 : Limit - 5;
+   return AMDGPU::SGPR_32RegClass.getRegister(Idx);
+ }
+
+ /// Computes the set of registers that must not be allocated in \p MF.
+ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+   BitVector Reserved(getNumRegs());
+   Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+
+   // Registers that are always reserved, together with everything aliasing
+   // them:
+   //  * EXEC, FLAT_SCR: EXEC_LO and EXEC_HI could be allocated and used as
+   //    regular registers, but that seems likely to result in bugs.
+   //  * TBA, TMA, TTMP*: trap handler registers; support is not implemented
+   //    in codegen.
+   static const unsigned AlwaysReserved[] = {
+     AMDGPU::EXEC,
+     AMDGPU::FLAT_SCR,
+     AMDGPU::TBA,
+     AMDGPU::TMA,
+     AMDGPU::TTMP0_TTMP1,
+     AMDGPU::TTMP2_TTMP3,
+     AMDGPU::TTMP4_TTMP5,
+     AMDGPU::TTMP6_TTMP7,
+     AMDGPU::TTMP8_TTMP9,
+     AMDGPU::TTMP10_TTMP11,
+   };
+   for (unsigned Reg : AlwaysReserved)
+     reserveRegisterTuples(Reserved, Reg);
+
+   // Everything at or above the per-function SGPR limit is off limits.
+   const unsigned SGPRLimit = getMaxNumSGPRs(MF);
+   const unsigned SGPRTotal = AMDGPU::SGPR_32RegClass.getNumRegs();
+   for (unsigned Idx = SGPRLimit; Idx < SGPRTotal; ++Idx)
+     reserveRegisterTuples(Reserved, AMDGPU::SGPR_32RegClass.getRegister(Idx));
+
+   // Same for the VGPRs.
+   const unsigned VGPRLimit = getMaxNumVGPRs(MF);
+   const unsigned VGPRTotal = AMDGPU::VGPR_32RegClass.getNumRegs();
+   for (unsigned Idx = VGPRLimit; Idx < VGPRTotal; ++Idx)
+     reserveRegisterTuples(Reserved, AMDGPU::VGPR_32RegClass.getRegister(Idx));
+
+   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+   // Reserve 1 SGPR for the scratch wave offset in case we need to spill.
+   const unsigned WaveOffsetReg = FuncInfo->getScratchWaveOffsetReg();
+   if (WaveOffsetReg != AMDGPU::NoRegister)
+     reserveRegisterTuples(Reserved, WaveOffsetReg);
+
+   const unsigned RSrcReg = FuncInfo->getScratchRSrcReg();
+   if (RSrcReg != AMDGPU::NoRegister) {
+     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
+     // need to spill.
+     // TODO: May need to reserve a VGPR if doing LDS spilling.
+     reserveRegisterTuples(Reserved, RSrcReg);
+     assert(!isSubRegister(RSrcReg, WaveOffsetReg));
+   }
+
+   return Reserved;
+ }
+
+ /// The register scavenger is requested whenever the function has stack
+ /// objects.
+ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
+   const MachineFrameInfo &FrameInfo = Fn.getFrameInfo();
+   return FrameInfo.hasStackObjects();
+ }
+
+ /// Frame index scavenging is requested whenever the function has stack
+ /// objects.
+ bool
+ SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+   return FrameInfo.hasStackObjects();
+ }
+
+ /// Returns true when the function has stack objects, the subtarget has scalar
+ /// stores, and SGPRs have been spilled.
+ bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
+     const MachineFunction &MF) const {
+   // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't
+   // create a virtual register for it during frame index elimination, so the
+   // scavenger is directly needed.
+   if (!MF.getFrameInfo().hasStackObjects())
+     return false;
+   if (!MF.getSubtarget<SISubtarget>().hasScalarStores())
+     return false;
+   return MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
+ }
+
+ /// Always returns true, regardless of the function passed in.
+ bool SIRegisterInfo::requiresVirtualBaseRegisters(
+ const MachineFunction &) const {
+ // There are no special dedicated stack or frame pointers.
+ return true;
+ }
+
+ /// Always returns true; \p MF is not inspected.
+ bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ // This helps catch bugs as verifier errors.
+ return true;
+ }
+
+ /// Returns the immediate held in the named 'offset' operand of \p MI, which
+ /// must be a MUBUF instruction.
+ int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
+   assert(SIInstrInfo::isMUBUF(*MI));
+
+   const int OffsetOperandNo =
+       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::offset);
+   const MachineOperand &OffsetOperand = MI->getOperand(OffsetOperandNo);
+   return OffsetOperand.getImm();
+ }
+
+ /// Returns the immediate offset of \p MI when it is a MUBUF instruction, and 0
+ /// for anything else. \p Idx is only checked (in asserts builds) to be the
+ /// 'vaddr' operand.
+ int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
+                                                  int Idx) const {
+   if (SIInstrInfo::isMUBUF(*MI)) {
+     assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                              AMDGPU::OpName::vaddr) &&
+            "Should never see frame index on non-address operand");
+     return getMUBUFInstrOffset(MI);
+   }
+
+   return 0;
+ }
+
+ /// A frame base register is needed for a load/store when \p Offset plus the
+ /// instruction's own immediate offset does not fit in an unsigned 12-bit
+ /// field. Instructions that neither load nor store never need one.
+ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+   if (MI->mayLoadOrStore()) {
+     const int64_t Combined = Offset + getMUBUFInstrOffset(MI);
+     return !isUInt<12>(Combined);
+   }
+
+   return false;
+ }
+
+ /// Emits, at the start of \p MBB, code that sets \p BaseReg to frame index
+ /// \p FrameIdx plus \p Offset.
+ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                                   unsigned BaseReg,
+                                                   int FrameIdx,
+                                                   int64_t Offset) const {
+   MachineBasicBlock::iterator InsertPt = MBB->begin();
+
+   // Use the debug location of the first instruction when the block is not
+   // empty; otherwise the location stays "unknown".
+   DebugLoc DL = InsertPt != MBB->end() ? InsertPt->getDebugLoc() : DebugLoc();
+
+   MachineFunction *MF = MBB->getParent();
+   const SIInstrInfo *TII = MF->getSubtarget<SISubtarget>().getInstrInfo();
+
+   if (Offset != 0) {
+     MachineRegisterInfo &MRI = MF->getRegInfo();
+     unsigned CarryOut = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+     unsigned OffsetSGPR =
+         MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+     unsigned FrameVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+     // BaseReg = OffsetSGPR + FrameVGPR; the carry-out is defined but dead.
+     BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_MOV_B32), OffsetSGPR)
+         .addImm(Offset);
+     BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), FrameVGPR)
+         .addFrameIndex(FrameIdx);
+     BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
+         .addReg(CarryOut, RegState::Define | RegState::Dead)
+         .addReg(OffsetSGPR, RegState::Kill)
+         .addReg(FrameVGPR);
+     return;
+   }
+
+   // No offset to add: a plain move of the frame index is enough.
+   BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
+       .addFrameIndex(FrameIdx);
+ }
+
+ /// Rewrites the frame-index 'vaddr' operand of the MUBUF instruction \p MI to
+ /// use \p BaseReg, and adds \p Offset to its immediate 'offset' operand.
+ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                        int64_t Offset) const {
+   const MachineFunction *MF = MI.getParent()->getParent();
+   const SIInstrInfo *TII = MF->getSubtarget<SISubtarget>().getInstrInfo();
+
+ #ifndef NDEBUG
+   // FIXME: Is it possible to be storing a frame index to itself?
+   bool FoundFrameIndex = false;
+   for (const MachineOperand &Op : MI.operands()) {
+     if (!Op.isFI())
+       continue;
+
+     if (FoundFrameIndex)
+       llvm_unreachable("should not see multiple frame indices");
+     FoundFrameIndex = true;
+   }
+ #endif
+
+   MachineOperand *AddrOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+   assert(AddrOp && AddrOp->isFI() && "frame index must be address operand");
+
+   assert(TII->isMUBUF(MI));
+
+   MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
+   const int64_t Combined = ImmOp->getImm() + Offset;
+   assert(isUInt<12>(Combined) && "offset should be legal");
+
+   AddrOp->ChangeToRegister(BaseReg, false);
+   ImmOp->setImm(Combined);
+ }
+
+ /// \p Offset is legal only for MUBUF instructions, and only when adding it to
+ /// the instruction's immediate offset still fits in an unsigned 12-bit field.
+ /// \p BaseReg is not consulted.
+ bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                         unsigned BaseReg,
+                                         int64_t Offset) const {
+   return SIInstrInfo::isMUBUF(*MI) &&
+          isUInt<12>(Offset + getMUBUFInstrOffset(MI));
+ }
+
+ /// Always returns VGPR_32; neither \p MF nor \p Kind is inspected.
+ const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
+ const MachineFunction &MF, unsigned Kind) const {
+ // This is inaccurate. It depends on the instruction and address space. The
+ // only place where we should hit this is for dealing with frame indexes /
+ // private accesses, so this is correct in that case.
+ return &AMDGPU::VGPR_32RegClass;
+ }
+
+ /// Returns how many 32-bit registers the spill pseudo \p Op covers, i.e. its
+ /// bit width divided by 32. Any other opcode is a programming error.
+ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+   switch (Op) {
+   // 32-bit
+   case AMDGPU::SI_SPILL_S32_SAVE:
+   case AMDGPU::SI_SPILL_S32_RESTORE:
+   case AMDGPU::SI_SPILL_V32_SAVE:
+   case AMDGPU::SI_SPILL_V32_RESTORE:
+     return 1;
+   // 64-bit
+   case AMDGPU::SI_SPILL_S64_SAVE:
+   case AMDGPU::SI_SPILL_S64_RESTORE:
+   case AMDGPU::SI_SPILL_V64_SAVE:
+   case AMDGPU::SI_SPILL_V64_RESTORE:
+     return 2;
+   // 96-bit (vector only)
+   case AMDGPU::SI_SPILL_V96_SAVE:
+   case AMDGPU::SI_SPILL_V96_RESTORE:
+     return 3;
+   // 128-bit
+   case AMDGPU::SI_SPILL_S128_SAVE:
+   case AMDGPU::SI_SPILL_S128_RESTORE:
+   case AMDGPU::SI_SPILL_V128_SAVE:
+   case AMDGPU::SI_SPILL_V128_RESTORE:
+     return 4;
+   // 256-bit
+   case AMDGPU::SI_SPILL_S256_SAVE:
+   case AMDGPU::SI_SPILL_S256_RESTORE:
+   case AMDGPU::SI_SPILL_V256_SAVE:
+   case AMDGPU::SI_SPILL_V256_RESTORE:
+     return 8;
+   // 512-bit
+   case AMDGPU::SI_SPILL_S512_SAVE:
+   case AMDGPU::SI_SPILL_S512_RESTORE:
+   case AMDGPU::SI_SPILL_V512_SAVE:
+   case AMDGPU::SI_SPILL_V512_RESTORE:
+     return 16;
+   default:
+     llvm_unreachable("Invalid spill opcode");
+   }
+ }
+
+ /// Maps an OFFEN buffer store opcode to the matching OFFSET opcode. Returns -1
+ /// for any opcode that is not in the table.
+ static int getOffsetMUBUFStore(unsigned Opc) {
+   static const struct {
+     unsigned OffEn;
+     int OffsetOnly;
+   } Variants[] = {
+     { AMDGPU::BUFFER_STORE_DWORD_OFFEN, AMDGPU::BUFFER_STORE_DWORD_OFFSET },
+     { AMDGPU::BUFFER_STORE_BYTE_OFFEN, AMDGPU::BUFFER_STORE_BYTE_OFFSET },
+     { AMDGPU::BUFFER_STORE_SHORT_OFFEN, AMDGPU::BUFFER_STORE_SHORT_OFFSET },
+     { AMDGPU::BUFFER_STORE_DWORDX2_OFFEN, AMDGPU::BUFFER_STORE_DWORDX2_OFFSET },
+     { AMDGPU::BUFFER_STORE_DWORDX4_OFFEN, AMDGPU::BUFFER_STORE_DWORDX4_OFFSET },
+   };
+
+   for (const auto &V : Variants)
+     if (V.OffEn == Opc)
+       return V.OffsetOnly;
+   return -1;
+ }
+
+ /// Maps an OFFEN buffer load opcode to the matching OFFSET opcode. Returns -1
+ /// for any opcode that is not in the table.
+ static int getOffsetMUBUFLoad(unsigned Opc) {
+   static const struct {
+     unsigned OffEn;
+     int OffsetOnly;
+   } Variants[] = {
+     { AMDGPU::BUFFER_LOAD_DWORD_OFFEN, AMDGPU::BUFFER_LOAD_DWORD_OFFSET },
+     { AMDGPU::BUFFER_LOAD_UBYTE_OFFEN, AMDGPU::BUFFER_LOAD_UBYTE_OFFSET },
+     { AMDGPU::BUFFER_LOAD_SBYTE_OFFEN, AMDGPU::BUFFER_LOAD_SBYTE_OFFSET },
+     { AMDGPU::BUFFER_LOAD_USHORT_OFFEN, AMDGPU::BUFFER_LOAD_USHORT_OFFSET },
+     { AMDGPU::BUFFER_LOAD_SSHORT_OFFEN, AMDGPU::BUFFER_LOAD_SSHORT_OFFSET },
+     { AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN, AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET },
+     { AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN, AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET },
+   };
+
+   for (const auto &V : Variants)
+     if (V.OffEn == Opc)
+       return V.OffsetOnly;
+   return -1;
+ }
+
+ // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
+ // need to handle the case where an SGPR may need to be spilled while spilling.
+ //
+ // Inserts, before MI, the OFFSET form of MI's OFFEN buffer load/store with
+ // Offset as the immediate, reusing MI's vdata, srsrc and soffset operands and
+ // its memory operands. Returns false, emitting nothing, when MI's opcode has
+ // no OFFSET counterpart. MI itself is left in place. MFI and Index are unused.
+ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
+                                       MachineFrameInfo &MFI,
+                                       MachineBasicBlock::iterator MI,
+                                       int Index,
+                                       int64_t Offset) {
+   const bool IsStore = MI->mayStore();
+   const unsigned OldOpc = MI->getOpcode();
+   const int NewOpc =
+       IsStore ? getOffsetMUBUFStore(OldOpc) : getOffsetMUBUFLoad(OldOpc);
+   if (NewOpc == -1)
+     return false;
+
+   const MachineOperand *VData =
+       TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
+   const MachineOperand *SRsrc =
+       TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc);
+   const MachineOperand *SOffset =
+       TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
+
+   // The data register is a def for loads and a plain use for stores.
+   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(NewOpc))
+       .addReg(VData->getReg(), getDefRegState(!IsStore))
+       .addOperand(*SRsrc)
+       .addOperand(*SOffset)
+       .addImm(Offset)
+       .addImm(0) // glc
+       .addImm(0) // slc
+       .addImm(0) // tfe
+       .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+   return true;
+ }
+
+ /// Inserts, before \p MI, one \p LoadStoreOp instruction per 32-bit
+ /// sub-register of \p ValueReg, accessing the stack object \p Index at
+ /// consecutive 4-byte offsets starting at \p InstOffset plus the object's frame
+ /// offset.
+ ///
+ /// If the starting offset plus the total size does not fit in an unsigned
+ /// 12-bit immediate, the starting offset is first added into an SGPR: one
+ /// found through \p RS when available, otherwise \p ScratchOffsetReg itself,
+ /// which is then restored by a subtraction after the last access.
+ ///
+ /// \p MMO supplies the pointer info and flags for the per-element memory
+ /// operands. \p MI is not erased here.
+ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp,
+ int Index,
+ unsigned ValueReg,
+ bool IsKill,
+ unsigned ScratchRsrcReg,
+ unsigned ScratchOffsetReg,
+ int64_t InstOffset,
+ MachineMemOperand *MMO,
+ RegScavenger *RS) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MI->getParent()->getParent();
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+
+ const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+ const DebugLoc &DL = MI->getDebugLoc();
+ bool IsStore = Desc.mayStore();
+
+ bool RanOutOfSGPRs = false;
+ bool Scavenged = false;
+ unsigned SOffset = ScratchOffsetReg;
+
+ const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
+ unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+ unsigned Size = NumSubRegs * 4;
+ int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+ // Remembered so the amount added to ScratchOffsetReg can be subtracted again.
+ const int64_t OriginalImmOffset = Offset;
+
+ unsigned Align = MFI.getObjectAlignment(Index);
+ const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
+
+ if (!isUInt<12>(Offset + Size)) {
+ SOffset = AMDGPU::NoRegister;
+
+ // We don't have access to the register scavenger if this function is called
+ // during PEI::scavengeFrameVirtualRegs().
+ if (RS)
+ SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);
+
+ if (SOffset == AMDGPU::NoRegister) {
+ // There are no free SGPRs, and we are in the process of spilling VGPRs
+ // too. Since we need a VGPR in order to spill SGPRs (this is true
+ // on SI/CI and on VI it is true until we implement spilling using scalar
+ // stores), we have no way to free up an SGPR. Our solution here is to
+ // add the offset directly to the ScratchOffset register, and then
+ // subtract the offset after the spill to return ScratchOffset to its
+ // original value.
+ RanOutOfSGPRs = true;
+ SOffset = ScratchOffsetReg;
+ } else {
+ Scavenged = true;
+ }
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+ .addReg(ScratchOffsetReg)
+ .addImm(Offset);
+
+ // The whole starting offset now lives in SOffset, so the per-element
+ // immediates start again from zero.
+ Offset = 0;
+ }
+
+ const unsigned EltSize = 4;
+
+ for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
+ unsigned SubReg = NumSubRegs == 1 ?
+ ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
+
+ unsigned SOffsetRegState = 0;
+ unsigned SrcDstRegState = getDefRegState(!IsStore);
+ if (i + 1 == e) {
+ // A scavenged offset register is killed by the last access.
+ SOffsetRegState |= getKillRegState(Scavenged);
+ // The last implicit use carries the "Kill" flag.
+ SrcDstRegState |= getKillRegState(IsKill);
+ }
+
+ MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+ MachineMemOperand *NewMMO
+ = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+ EltSize, MinAlign(Align, EltSize * i));
+
+ auto MIB = BuildMI(*MBB, MI, DL, Desc)
+ .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
+ .addReg(ScratchRsrcReg)
+ .addReg(SOffset, SOffsetRegState)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addMemOperand(NewMMO);
+
+ // For multi-register values, also reference the full register implicitly.
+ if (NumSubRegs > 1)
+ MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
+ }
+
+ if (RanOutOfSGPRs) {
+ // Subtract the offset we added to the ScratchOffset register.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
+ .addReg(ScratchOffsetReg)
+ .addImm(OriginalImmOffset);
+ }
+ }
+
+ /// Chooses the widest scalar buffer access (16, 8 or 4 bytes) whose size
+ /// divides \p SuperRegSize, and returns that element size together with the
+ /// matching S_BUFFER store (\p Store true) or load opcode.
+ static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
+                                                      bool Store) {
+   const unsigned EltSize =
+       SuperRegSize % 16 == 0 ? 16 : (SuperRegSize % 8 == 0 ? 8 : 4);
+
+   switch (EltSize) {
+   case 16:
+     return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
+                          AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
+   case 8:
+     return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
+                         AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
+   default:
+     return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
+                         AMDGPU::S_BUFFER_LOAD_DWORD_SGPR };
+   }
+ }
+
+ /// Expands the SGPR spill pseudo \p MI (operand 0 is the register being
+ /// spilled) for stack object \p Index, then erases \p MI.
+ ///
+ /// Each part of the register is saved in one of three ways:
+ ///  - with a scalar buffer store, when the subtarget has scalar stores and
+ ///    the amdgpu-spill-sgpr-to-smem option is set (m0 is used as the offset
+ ///    register and is copied aside first if \p RS reports it as used);
+ ///  - with V_WRITELANE_B32 into the VGPR lane returned by getSpilledReg();
+ ///  - otherwise by moving it to a temporary VGPR and emitting
+ ///    SI_SPILL_V32_SAVE to the stack object.
+ ///
+ /// NOTE(review): \p RS is dereferenced without a null check on the scalar
+ /// store path - confirm callers always pass a scavenger in that configuration.
+ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
+ int Index,
+ RegScavenger *RS) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ unsigned SuperReg = MI->getOperand(0).getReg();
+ bool IsKill = MI->getOperand(0).isKill();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+
+ bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+ unsigned OffsetReg = AMDGPU::M0;
+ unsigned M0CopyReg = AMDGPU::NoRegister;
+
+ // The scalar store path overwrites m0, so preserve its value if it is in use.
+ if (SpillToSMEM) {
+ if (RS->isRegUsed(AMDGPU::M0)) {
+ M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+ .addReg(AMDGPU::M0);
+ }
+ }
+
+ // ScalarStoreOp is only assigned (and only read) on the scalar store path.
+ unsigned ScalarStoreOp;
+ unsigned EltSize = 4;
+ const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+ if (SpillToSMEM && isSGPRClass(RC)) {
+ // XXX - if private_element_size is larger than 4 it might be useful to be
+ // able to spill wider vmem spills.
+ std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+ }
+
+ ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+ unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+ // SubReg carries the "Kill" flag when SubReg == SuperReg.
+ unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ unsigned SubReg = NumSubRegs == 1 ?
+ SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+ if (SpillToSMEM) {
+ int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+ // The allocated memory size is really the wavefront size * the frame
+ // index size. The widest register class is 64 bytes, so a 4-byte scratch
+ // allocation is enough to spill this in a single stack object.
+ //
+ // FIXME: Frame size/offsets are computed earlier than this, so the extra
+ // space is still unnecessarily allocated.
+
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ EltSize, MinAlign(Align, EltSize * i));
+
+ // SMEM instructions only support a single offset, so increment the wave
+ // offset.
+
+ int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
+ if (Offset != 0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
+ BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
+ .addReg(SubReg, getKillRegState(IsKill)) // sdata
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addMemOperand(MMO);
+
+ continue;
+ }
+
+ struct SIMachineFunctionInfo::SpilledReg Spill =
+ MFI->getSpilledReg(MF, Index, i);
+ if (Spill.hasReg()) {
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill.VGPR)
+ .addReg(SubReg, getKillRegState(IsKill))
+ .addImm(Spill.Lane);
+
+ // FIXME: Since this spills to another register instead of an actual
+ // frame index, we should delete the frame index when all references to
+ // it are fixed.
+ } else {
+ // Spill SGPR to a frame index.
+ // TODO: Should VI try to spill to VGPR and then spill to SMEM?
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ MachineInstrBuilder Mov
+ = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addReg(SubReg, SubKillState);
+
+
+ // There could be undef components of a spilled super register.
+ // TODO: Can we detect this and skip the spill?
+ if (NumSubRegs > 1) {
+ // The last implicit use of the SuperReg carries the "Kill" flag.
+ unsigned SuperKillState = 0;
+ if (i + 1 == e)
+ SuperKillState |= getKillRegState(IsKill);
+ Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ }
+
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ EltSize, MinAlign(Align, EltSize * i));
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
+ .addReg(TmpReg, RegState::Kill) // src
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srrsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // soffset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
+ }
+ }
+
+ // Put back the m0 value saved above, if any.
+ if (M0CopyReg != AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0CopyReg, RegState::Kill);
+ }
+
+ MI->eraseFromParent();
+ MFI->addToSpilledSGPRs(NumSubRegs);
+ }
+
+ /// Expands the SGPR restore pseudo \p MI (operand 0 is the register being
+ /// reloaded) for stack object \p Index, then erases \p MI.
+ ///
+ /// Each part of the register is reloaded in one of three ways:
+ ///  - with a scalar buffer load, when the subtarget has scalar stores and the
+ ///    amdgpu-spill-sgpr-to-smem option is set (m0 is used as the offset
+ ///    register and is copied aside first if \p RS reports it as used);
+ ///  - with V_READLANE_B32 from the VGPR lane returned by getSpilledReg();
+ ///  - otherwise through SI_SPILL_V32_RESTORE into a temporary VGPR followed
+ ///    by V_READFIRSTLANE_B32.
+ ///
+ /// NOTE(review): \p RS is dereferenced without a null check on the scalar
+ /// load path - confirm callers always pass a scavenger in that configuration.
+ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
+ int Index,
+ RegScavenger *RS) const {
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineBasicBlock *MBB = MI->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ unsigned SuperReg = MI->getOperand(0).getReg();
+ bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+ unsigned OffsetReg = AMDGPU::M0;
+ unsigned M0CopyReg = AMDGPU::NoRegister;
+
+ // The scalar load path overwrites m0, so preserve its value if it is in use.
+ if (SpillToSMEM) {
+ if (RS->isRegUsed(AMDGPU::M0)) {
+ M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+ .addReg(AMDGPU::M0);
+ }
+ }
+
+ unsigned EltSize = 4;
+ // ScalarLoadOp is only assigned (and only read) on the scalar load path.
+ unsigned ScalarLoadOp;
+
+ const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+ if (SpillToSMEM && isSGPRClass(RC)) {
+ // XXX - if private_element_size is larger than 4 it might be useful to be
+ // able to spill wider vmem spills.
+ std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+ }
+
+ ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+ unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+ // Frame offset of the stack object; used on the scalar load path below.
+ int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ unsigned SubReg = NumSubRegs == 1 ?
+ SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+ if (SpillToSMEM) {
+ // FIXME: Size may be > 4 but extra bytes wasted.
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ EltSize, MinAlign(Align, EltSize * i));
+
+ // Offset of this element: wavefront size * frame offset + EltSize * i.
+ int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
+ if (Offset != 0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addMemOperand(MMO);
+
+ // When reloading piecewise, also mark the full register as implicitly
+ // defined.
+ if (NumSubRegs > 1)
+ MIB.addReg(SuperReg, RegState::ImplicitDefine);
+
+ continue;
+ }
+
+ SIMachineFunctionInfo::SpilledReg Spill
+ = MFI->getSpilledReg(MF, Index, i);
+
+ if (Spill.hasReg()) {
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ } else {
+ // Restore SGPR from a stack slot.
+ // FIXME: We should use S_LOAD_DWORD here for VI.
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad, EltSize,
+ MinAlign(Align, EltSize * i));
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // soffset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
+
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+ .addReg(TmpReg, RegState::Kill);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ }
+ }
+
+ // Put back the m0 value saved above, if any.
+ if (M0CopyReg != AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0CopyReg, RegState::Kill);
+ }
+
+ MI->eraseFromParent();
+ }
+
+ /// Replaces the frame index in operand \p FIOperandNum of \p MI.
+ ///
+ /// SGPR spill/restore pseudos are handed to spillSGPR() / restoreSGPR().
+ /// VGPR spill/restore pseudos are expanded through buildSpillLoadStore() into
+ /// BUFFER_STORE_DWORD_OFFSET / BUFFER_LOAD_DWORD_OFFSET and then erased. For
+ /// any other instruction the frame index is replaced by the object's frame
+ /// offset, either folded into an OFFSET-form MUBUF instruction, used as an
+ /// immediate, or moved into a new VGPR when the immediate is not legal.
+ /// \p SPAdj is not used.
+ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineBasicBlock *MBB = MI->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+ int Index = MI->getOperand(FIOperandNum).getIndex();
+
+ switch (MI->getOpcode()) {
+ // SGPR register spill
+ case AMDGPU::SI_SPILL_S512_SAVE:
+ case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S64_SAVE:
+ case AMDGPU::SI_SPILL_S32_SAVE: {
+ spillSGPR(MI, Index, RS);
+ break;
+ }
+
+ // SGPR register restore
+ case AMDGPU::SI_SPILL_S512_RESTORE:
+ case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_S32_RESTORE: {
+ restoreSGPR(MI, Index, RS);
+ break;
+ }
+
+ // VGPR register spill
+ case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V96_SAVE:
+ case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V32_SAVE: {
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
+ buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+ Index,
+ VData->getReg(), VData->isKill(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
+ // Record how many 32-bit VGPRs this pseudo spilled.
+ MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
+ MI->eraseFromParent();
+ break;
+ }
+ // VGPR register restore
+ case AMDGPU::SI_SPILL_V32_RESTORE:
+ case AMDGPU::SI_SPILL_V64_RESTORE:
+ case AMDGPU::SI_SPILL_V96_RESTORE:
+ case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_V256_RESTORE:
+ case AMDGPU::SI_SPILL_V512_RESTORE: {
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
+
+ buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ Index,
+ VData->getReg(), VData->isKill(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
+ MI->eraseFromParent();
+ break;
+ }
+
+ default: {
+ if (TII->isMUBUF(*MI)) {
+ // Disable offen so we don't need a 0 vgpr base.
+ assert(static_cast<int>(FIOperandNum) ==
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::vaddr));
+
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ int64_t OldImm
+ = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
+ int64_t NewOffset = OldImm + Offset;
+
+ // If the combined offset fits in 12 bits and an OFFSET form of the
+ // opcode exists, the replacement instruction has been emitted and the
+ // original can go.
+ if (isUInt<12>(NewOffset) &&
+ buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
+ MI->eraseFromParent();
+ break;
+ }
+ }
+
+ // Otherwise try the frame offset as an immediate operand, and fall back to
+ // materializing it in a fresh VGPR if that is not legal for this operand.
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ FIOp.ChangeToImmediate(Offset);
+ if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addImm(Offset);
+ FIOp.ChangeToRegister(TmpReg, false, false, true);
+ }
+ }
+ }
+ }
+
+ // FIXME: This is very slow. It might be worth creating a map from physreg to
+ // register class.
+ //
+ // Returns the first class in the list below that contains the physical
+ // register Reg, or nullptr if none of them does.
+ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
+   assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+   // Searched in order; the first hit wins.
+   static const TargetRegisterClass *const Candidates[] = {
+     &AMDGPU::VGPR_32RegClass,
+     &AMDGPU::SReg_32RegClass,
+     &AMDGPU::VReg_64RegClass,
+     &AMDGPU::SReg_64RegClass,
+     &AMDGPU::VReg_96RegClass,
+     &AMDGPU::VReg_128RegClass,
+     &AMDGPU::SReg_128RegClass,
+     &AMDGPU::VReg_256RegClass,
+     &AMDGPU::SReg_256RegClass,
+     &AMDGPU::VReg_512RegClass,
+     &AMDGPU::SReg_512RegClass,
+     &AMDGPU::SCC_CLASSRegClass,
+   };
+
+   for (const TargetRegisterClass *RC : Candidates)
+     if (RC->contains(Reg))
+       return RC;
+
+   return nullptr;
+ }
+
+ // TODO: It might be helpful to have some target specific flags in
+ // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
+ //
+ // Returns true if RC shares a common subclass with the VGPR class of the same
+ // size. Classes of size 0 or 1 never do.
+ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
+   const TargetRegisterClass *VectorRC = nullptr;
+
+   switch (RC->getSize()) {
+   case 0:
+   case 1:
+     return false;
+   case 4:
+     VectorRC = &AMDGPU::VGPR_32RegClass;
+     break;
+   case 8:
+     VectorRC = &AMDGPU::VReg_64RegClass;
+     break;
+   case 12:
+     VectorRC = &AMDGPU::VReg_96RegClass;
+     break;
+   case 16:
+     VectorRC = &AMDGPU::VReg_128RegClass;
+     break;
+   case 32:
+     VectorRC = &AMDGPU::VReg_256RegClass;
+     break;
+   case 64:
+     VectorRC = &AMDGPU::VReg_512RegClass;
+     break;
+   default:
+     llvm_unreachable("Invalid register class size");
+   }
+
+   return getCommonSubClass(VectorRC, RC) != nullptr;
+ }
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
+    const TargetRegisterClass *SRC) const {
+  // Only the size reported by SRC is consulted. Each supported size selects
+  // the one VGPR class listed for it below; any other size is treated as
+  // unreachable.
+  const TargetRegisterClass *VRC = nullptr;
+  switch (SRC->getSize()) {
+  case 4:  VRC = &AMDGPU::VGPR_32RegClass;  break;
+  case 8:  VRC = &AMDGPU::VReg_64RegClass;  break;
+  case 12: VRC = &AMDGPU::VReg_96RegClass;  break;
+  case 16: VRC = &AMDGPU::VReg_128RegClass; break;
+  case 32: VRC = &AMDGPU::VReg_256RegClass; break;
+  case 64: VRC = &AMDGPU::VReg_512RegClass; break;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
+
+  return VRC;
+}
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
+    const TargetRegisterClass *VRC) const {
+  // Only the size reported by VRC is consulted. Note that size 12 has no
+  // entry here, so it ends up in the unreachable default like any other.
+  const TargetRegisterClass *SRC = nullptr;
+  switch (VRC->getSize()) {
+  case 4:  SRC = &AMDGPU::SGPR_32RegClass;  break;
+  case 8:  SRC = &AMDGPU::SReg_64RegClass;  break;
+  case 16: SRC = &AMDGPU::SReg_128RegClass; break;
+  case 32: SRC = &AMDGPU::SReg_256RegClass; break;
+  case 64: SRC = &AMDGPU::SReg_512RegClass; break;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
+
+  return SRC;
+}
+
+// Maps a sub-register index of RC to the register class of that sub-register.
+// The class is chosen purely from the number of lanes SubIdx covers and from
+// whether RC is an SGPR class. Lane counts without a matching class are
+// treated as unreachable, and NoSubRegister simply yields RC itself.
+const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
+    const TargetRegisterClass *RC, unsigned SubIdx) const {
+  // No sub-register index: the "sub-register" is the whole register.
+  if (SubIdx == AMDGPU::NoSubRegister)
+    return RC;
+
+  // We can assume that each lane corresponds to one 32-bit register, so the
+  // number of lanes set in the mask is the sub-register width in dwords.
+  const LaneBitmask::Type Lanes = getSubRegIndexLaneMask(SubIdx).getAsInteger();
+  const unsigned NumDwords = countPopulation(Lanes);
+  const bool IsScalar = isSGPRClass(RC);
+
+  // Scalar parents get scalar classes, everything else gets vector classes.
+  switch (NumDwords) {
+  case 1:
+    return IsScalar ? &AMDGPU::SGPR_32RegClass : &AMDGPU::VGPR_32RegClass;
+  case 2:
+    return IsScalar ? &AMDGPU::SReg_64RegClass : &AMDGPU::VReg_64RegClass;
+  case 3:
+    // A three-dword class is only provided for vector registers.
+    if (!IsScalar)
+      return &AMDGPU::VReg_96RegClass;
+    break;
+  case 4:
+    return IsScalar ? &AMDGPU::SReg_128RegClass : &AMDGPU::VReg_128RegClass;
+  case 8:
+    return IsScalar ? &AMDGPU::SReg_256RegClass : &AMDGPU::VReg_256RegClass;
+  case 16: // Listed explicitly, but handled like any unsupported count.
+  default:
+    break;
+  }
+
+  // Reached for every lane count that did not return above.
+  llvm_unreachable("Invalid sub-register class size");
+}
+
+bool SIRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                                          unsigned DefSubReg,
+                                          const TargetRegisterClass *SrcRC,
+                                          unsigned SrcSubReg) const {
+  // The smallest possible register class is preferred, so anything that looks
+  // like a sub-register extract should not stop the rewrite. Operations mostly
+  // don't care about the super register class; only the most basic copies
+  // between the same register class are worth stopping on.
+  //
+  // For example, given
+  //   vreg0 = ...
+  //   vreg1 = ...
+  //   vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+  //   vreg3 = COPY vreg2, sub0
+  // the COPY should be looked through to obtain
+  //   vreg3 = COPY vreg0
+  //
+  // The sub-register indices are therefore ignored: the answer depends only on
+  // whether the two classes have a common sub-class.
+  const TargetRegisterClass *CommonRC = getCommonSubClass(DefRC, SrcRC);
+  const bool ClassesOverlap = CommonRC != nullptr;
+  return ClassesOverlap;
+}
+
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
+unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
+                                           enum PreloadedValue Value) const {
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  (void)ST; // Only referenced from an assert below.
+
+  switch (Value) {
+  case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: // Held in user SGPRs.
+    assert(ST.isAmdCodeObjectV2() &&
+           "Non-CodeObjectV2 ABI currently uses relocations");
+    assert(FuncInfo->hasPrivateSegmentBuffer());
+    return FuncInfo->PrivateSegmentBufferUserSGPR;
+  case SIRegisterInfo::DISPATCH_PTR:
+    assert(FuncInfo->hasDispatchPtr());
+    return FuncInfo->DispatchPtrUserSGPR;
+  case SIRegisterInfo::QUEUE_PTR:
+    assert(FuncInfo->hasQueuePtr());
+    return FuncInfo->QueuePtrUserSGPR;
+  case SIRegisterInfo::KERNARG_SEGMENT_PTR:
+    assert(FuncInfo->hasKernargSegmentPtr());
+    return FuncInfo->KernargSegmentPtrUserSGPR;
+  case SIRegisterInfo::DISPATCH_ID:
+    assert(FuncInfo->hasDispatchID());
+    return FuncInfo->DispatchIDUserSGPR;
+  case SIRegisterInfo::FLAT_SCRATCH_INIT:
+    assert(FuncInfo->hasFlatScratchInit());
+    return FuncInfo->FlatScratchInitUserSGPR;
+  case SIRegisterInfo::WORKGROUP_ID_X: // Held in system SGPRs.
+    assert(FuncInfo->hasWorkGroupIDX());
+    return FuncInfo->WorkGroupIDXSystemSGPR;
+  case SIRegisterInfo::WORKGROUP_ID_Y:
+    assert(FuncInfo->hasWorkGroupIDY());
+    return FuncInfo->WorkGroupIDYSystemSGPR;
+  case SIRegisterInfo::WORKGROUP_ID_Z:
+    assert(FuncInfo->hasWorkGroupIDZ());
+    return FuncInfo->WorkGroupIDZSystemSGPR;
+  case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
+    return FuncInfo->PrivateSegmentWaveByteOffsetSystemSGPR;
+  case SIRegisterInfo::WORKITEM_ID_X: // Work item IDs use fixed VGPRs.
+    assert(FuncInfo->hasWorkItemIDX());
+    return AMDGPU::VGPR0;
+  case SIRegisterInfo::WORKITEM_ID_Y:
+    assert(FuncInfo->hasWorkItemIDY());
+    return AMDGPU::VGPR1;
+  case SIRegisterInfo::WORKITEM_ID_Z:
+    assert(FuncInfo->hasWorkItemIDZ());
+    return AMDGPU::VGPR2;
+  }
+  llvm_unreachable("unexpected preloaded value type");
+}
+
+/// \brief Returns the first register of \p RC that is allocatable and not used
+/// at any point in the function. If no register qualifies, this function
+/// returns AMDGPU::NoRegister. \p MF is not consulted.
+unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
+                                            const TargetRegisterClass *RC,
+                                            const MachineFunction &MF) const {
+  for (unsigned PhysReg : *RC) {
+    if (!MRI.isAllocatable(PhysReg) || MRI.isPhysRegUsed(PhysReg))
+      continue;
+    return PhysReg;
+  }
+  return AMDGPU::NoRegister;
+}
+
+unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const {
+  // 800 on VOLCANIC_ISLANDS and newer generations, 512 on older ones.
+  const bool IsVIPlus = ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  return IsVIPlus ? 800 : 512;
+}
+
+unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
+  // 102 on VOLCANIC_ISLANDS and newer generations, 104 on older ones.
+  const bool IsVIPlus = ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  return IsVIPlus ? 102 : 104;
+}
+
+unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST,
+    const SIMachineFunctionInfo &MFI) const {
+  // Flat scratch adds to the reservation on SEA_ISLANDS and on
+  // VOLCANIC_ISLANDS or newer; on other generations it changes nothing.
+  if (MFI.hasFlatScratchInit()) {
+    const auto Gen = ST.getGeneration();
+    if (Gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order)
+    if (Gen == AMDGPUSubtarget::SEA_ISLANDS)
+      return 4; // FLAT_SCRATCH, VCC (in that order)
+  }
+
+  // XNACK, VCC (in that order) when XNACK is enabled, otherwise just VCC.
+  return ST.isXNACKEnabled() ? 4 : 2;
+}
+
+unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST,
+                                        unsigned WavesPerEU) const {
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    switch (WavesPerEU) {
+    case 0:
+    case 9:
+    case 10: return 0;
+    case 8:  return 81;
+    default: return 97;
+    }
+  }
+  // Generations older than VOLCANIC_ISLANDS use a finer-grained table.
+  switch (WavesPerEU) {
+  case 0:
+  case 10: return 0;
+  case 9:  return 49;
+  case 8:  return 57;
+  case 7:  return 65;
+  case 6:  return 73;
+  case 5:  return 81;
+  default: return 97;
+  }
+}
+
+unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST,
+                                        unsigned WavesPerEU,
+                                        bool Addressable) const {
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    switch (WavesPerEU) {
+    case 0:
+    case 9:
+    case 10: return 80;
+    case 8:  return 96;
+    default: return Addressable ? getNumAddressableSGPRs(ST) : 112;
+    }
+  }
+  // Older generations: Addressable makes no difference to the result.
+  switch (WavesPerEU) {
+  case 0:
+  case 10: return 48;
+  case 9:  return 56;
+  case 8:  return 64;
+  case 7:  return 72;
+  case 6:  return 80;
+  case 5:  return 96;
+  default: return getNumAddressableSGPRs(ST);
+  }
+}
+
+unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
+ const Function &F = *MF.getFunction();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+ // Compute the maximum number of SGPRs the function can use from the
+ // default/requested minimum number of waves per execution unit.
+ std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+ unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false);
+ unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true);
+
+ // Check if maximum number of SGPRs was explicitly requested using
+ // "amdgpu-num-sgpr" attribute. Requested == 0 below means "ignore it".
+ if (F.hasFnAttribute("amdgpu-num-sgpr")) {
+ unsigned Requested = AMDGPU::getIntegerAttribute(
+ F, "amdgpu-num-sgpr", MaxNumSGPRs);
+
+ // A request that does not exceed the number of reserved SGPRs is dropped.
+ if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI)))
+ Requested = 0;
+
+ // If more SGPRs are required to support the input user/system SGPRs,
+ // increase to accommodate them.
+ //
+ // FIXME: This really ends up using the requested number of SGPRs + number
+ // of reserved special registers in total. Theoretically you could re-use
+ // the last input registers for these special registers, but this would
+ // require a lot of complexity to deal with the weird aliasing.
+ unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
+ if (Requested && Requested < NumInputSGPRs)
+ Requested = NumInputSGPRs;
+
+ // Make sure requested value is compatible with values implied by
+ // default/requested minimum/maximum number of waves per execution unit.
+ if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false))
+ Requested = 0; // Above the limit for the minimum number of waves.
+ if (WavesPerEU.second &&
+ Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second))
+ Requested = 0; // Below the minimum for the maximum number of waves.
+
+ if (Requested)
+ MaxNumSGPRs = Requested; // The request survived every check above.
+ }
+
+ if (ST.hasSGPRInitBug()) // Replaces whatever limit was computed so far.
+ MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+ return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI),
+ MaxNumAddressableSGPRs); // Reserved SGPRs are subtracted before capping.
+}
+
+unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
+    const SISubtarget &ST) const {
+  // Four VGPRs are set aside when the subtarget asks for debugger register
+  // reservation; otherwise nothing is reserved.
+  return ST.debuggerReserveRegs() ? 4 : 0;
+}
+
+unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const {
+  // Indexed by WavesPerEU (0..10): the minimum number of VGPRs that meets the
+  // given number of waves per execution unit.
+  static const unsigned MinVGPRs[] = {
+    /* 0*/ 0,  /* 1*/ 129, /* 2*/ 85, /* 3*/ 65,
+    /* 4*/ 49, /* 5*/ 41,  /* 6*/ 37, /* 7*/ 33,
+    /* 8*/ 29, /* 9*/ 25,  /*10*/ 0,
+  };
+  const unsigned NumEntries = sizeof(MinVGPRs) / sizeof(MinVGPRs[0]);
+
+  // Values above 10 get the same answer as a single wave.
+  if (WavesPerEU >= NumEntries)
+    return 129;
+  return MinVGPRs[WavesPerEU];
+}
+
+unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const {
+  // Indexed by WavesPerEU. Slot 1 is a placeholder that is never read: one
+  // wave, like any value above 10, may use every VGPR.
+  static const unsigned MaxVGPRs[] = {
+    /* 0*/ 24, /* 1*/ 0,  /* 2*/ 128, /* 3*/ 84,
+    /* 4*/ 64, /* 5*/ 48, /* 6*/ 40,  /* 7*/ 36,
+    /* 8*/ 32, /* 9*/ 28, /*10*/ 24,
+  };
+  const unsigned NumEntries = sizeof(MaxVGPRs) / sizeof(MaxVGPRs[0]);
+
+  if (WavesPerEU == 1 || WavesPerEU >= NumEntries)
+    return getTotalNumVGPRs();
+
+  return MaxVGPRs[WavesPerEU];
+}
+
+unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const {
+  const Function &Fn = *MF.getFunction();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  // Start from the limit implied by the default/requested minimum number of
+  // waves per execution unit.
+  const std::pair<unsigned, unsigned> Waves = MFI.getWavesPerEU();
+  const unsigned WaveLimit = getMaxNumVGPRs(Waves.first);
+  const unsigned DebuggerReserved = getNumDebuggerReservedVGPRs(ST);
+  unsigned Limit = WaveLimit;
+
+  // An explicit "amdgpu-num-vgpr" attribute replaces that limit, but only if
+  // the requested value passes every check below; otherwise it is ignored and
+  // the wave-derived limit stays in effect.
+  if (Fn.hasFnAttribute("amdgpu-num-vgpr")) {
+    const unsigned Requested =
+        AMDGPU::getIntegerAttribute(Fn, "amdgpu-num-vgpr", WaveLimit);
+
+    // Must leave room beyond the debugger reservation (this also rejects 0).
+    const bool ExceedsReserved = Requested > DebuggerReserved;
+    // Must not exceed what the minimum waves-per-EU setting allows.
+    const bool FitsWaveLimit = Requested <= WaveLimit;
+    // With a maximum waves-per-EU setting, must not drop below the minimum
+    // number of VGPRs associated with it.
+    const bool MeetsWaveMinimum =
+        !Waves.second || Requested >= getMinNumVGPRs(Waves.second);
+
+    if (ExceedsReserved && FitsWaveLimit && MeetsWaveMinimum)
+      Limit = Requested;
+  }
+
+  // The debugger-reserved VGPRs are not available to the function.
+  return Limit - DebuggerReserved;
+}
+
+ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
+ unsigned EltSize) const {
+ if (EltSize == 4) { // Split into single 32-bit sub-registers.
+ static const int16_t Sub0_15[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ };
+
+ static const int16_t Sub0_7[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ };
+
+ static const int16_t Sub0_3[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ };
+
+ static const int16_t Sub0_2[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
+ };
+
+ static const int16_t Sub0_1[] = {
+ AMDGPU::sub0, AMDGPU::sub1,
+ };
+
+ switch (AMDGPU::getRegBitWidth(*RC->MC)) { // Pick the table by register width.
+ case 32:
+ return {}; // Already a single element: empty list, nothing to split.
+ case 64:
+ return makeArrayRef(Sub0_1);
+ case 96:
+ return makeArrayRef(Sub0_2);
+ case 128:
+ return makeArrayRef(Sub0_3);
+ case 256:
+ return makeArrayRef(Sub0_7);
+ case 512:
+ return makeArrayRef(Sub0_15);
+ default:
+ llvm_unreachable("unhandled register size");
+ }
+ }
+
+ if (EltSize == 8) { // Split into 64-bit sub-register pairs.
+ static const int16_t Sub0_15_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+ AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+ AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
+ };
+
+ static const int16_t Sub0_7_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
+ };
+
+
+ static const int16_t Sub0_3_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
+ };
+
+ switch (AMDGPU::getRegBitWidth(*RC->MC)) { // No 32- or 96-bit case here.
+ case 64:
+ return {}; // Already a single element: empty list, nothing to split.
+ case 128:
+ return makeArrayRef(Sub0_3_64);
+ case 256:
+ return makeArrayRef(Sub0_7_64);
+ case 512:
+ return makeArrayRef(Sub0_15_64);
+ default:
+ llvm_unreachable("unhandled register size");
+ }
+ }
+
+ assert(EltSize == 16 && "unhandled register spill split size"); // Only 4, 8 and 16 are handled.
+
+ static const int16_t Sub0_15_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11,
+ AMDGPU::sub12_sub13_sub14_sub15
+ };
+
+ static const int16_t Sub0_7_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7
+ };
+
+ switch (AMDGPU::getRegBitWidth(*RC->MC)) { // Split into 128-bit quads.
+ case 128:
+ return {}; // Already a single element: empty list, nothing to split.
+ case 256:
+ return makeArrayRef(Sub0_7_128);
+ case 512:
+ return makeArrayRef(Sub0_15_128);
+ default:
+ llvm_unreachable("unhandled register size");
+ }
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
+                                  unsigned Reg) const {
+  // A virtual register's class is recorded in MRI; a physical register is
+  // mapped to its base class instead.
+  const bool IsVirtual = TargetRegisterInfo::isVirtualRegister(Reg);
+  return IsVirtual ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
+}
+
+bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
+ unsigned Reg) const {
+ return hasVGPRs(getRegClassForReg(MRI, Reg)); // Handles virtual and physical Reg.
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
new file mode 100644
index 000000000000..0bcae7d9840c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -0,0 +1,282 @@
+//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIRegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
+
+#include "AMDGPURegisterInfo.h"
+#include "SIDefines.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+class SISubtarget;
+class MachineRegisterInfo;
+class SIMachineFunctionInfo;
+
+class SIRegisterInfo final : public AMDGPURegisterInfo {
+private:
+ unsigned SGPRSetID;
+ unsigned VGPRSetID;
+ BitVector SGPRPressureSets;
+ BitVector VGPRPressureSets;
+
+ void reserveRegisterTuples(BitVector &, unsigned Reg) const;
+ void classifyPressureSet(unsigned PSetID, unsigned Reg,
+ BitVector &PressureSets) const;
+
+public:
+ SIRegisterInfo();
+
+ /// Return the end register initially reserved for the scratch buffer in case
+ /// spilling is needed.
+ unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
+
+ /// Return the end register initially reserved for the scratch wave offset in
+ /// case spilling is needed.
+ unsigned reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+ bool requiresFrameIndexReplacementScavenging(
+ const MachineFunction &MF) const override;
+ bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
+
+ int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
+ int Idx) const override;
+
+ bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ unsigned BaseReg, int FrameIdx,
+ int64_t Offset) const override;
+
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
+
+ bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ int64_t Offset) const override;
+
+ const TargetRegisterClass *getPointerRegClass(
+ const MachineFunction &MF, unsigned Kind = 0) const override;
+
+ void spillSGPR(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS) const;
+
+ void restoreSGPR(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS) const override;
+
+ unsigned getHWRegIndex(unsigned Reg) const {
+ return getEncodingValue(Reg) & 0xff;
+ }
+
+ /// \brief Return the 'base' register class for this register.
+ /// e.g. SGPR0 => SReg_32, VGPR0 => VGPR_32, SGPR0_SGPR1 => SReg_64, etc.
+ const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
+
+ /// \returns true if this class contains only SGPR registers
+ bool isSGPRClass(const TargetRegisterClass *RC) const {
+ return !hasVGPRs(RC);
+ }
+
+ /// \returns true if this class ID contains only SGPR registers
+ bool isSGPRClassID(unsigned RCID) const {
+ return isSGPRClass(getRegClass(RCID));
+ }
+
+ bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ const TargetRegisterClass *RC;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ RC = MRI.getRegClass(Reg);
+ else
+ RC = getPhysRegClass(Reg);
+ return isSGPRClass(RC);
+ }
+
+ /// \returns true if this class contains VGPR registers.
+ bool hasVGPRs(const TargetRegisterClass *RC) const;
+
+ /// \returns A VGPR reg class with the same width as \p SRC
+ const TargetRegisterClass *getEquivalentVGPRClass(
+ const TargetRegisterClass *SRC) const;
+
+ /// \returns A SGPR reg class with the same width as \p VRC
+ const TargetRegisterClass *getEquivalentSGPRClass(
+ const TargetRegisterClass *VRC) const;
+
+ /// \returns The register class that is used for a sub-register of \p RC for
+ /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
+ /// be returned.
+ const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
+ unsigned SubIdx) const;
+
+ bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const override;
+
+ /// \returns True if operands defined with this operand type can accept
+ /// a literal constant (i.e. any 32-bit immediate).
+ bool opCanUseLiteralConstant(unsigned OpType) const {
+ // TODO: 64-bit operands have extending behavior from 32-bit literal.
+ return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
+ OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
+ }
+
+ /// \returns True if operands defined with this operand type can accept
+ /// an inline constant. i.e. An integer value in the range (-16, 64) or
+ /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
+ bool opCanUseInlineConstant(unsigned OpType) const {
+ return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+ OpType <= AMDGPU::OPERAND_SRC_LAST;
+ }
+
+ enum PreloadedValue {
+ // SGPRS:
+ PRIVATE_SEGMENT_BUFFER = 0,
+ DISPATCH_PTR = 1,
+ QUEUE_PTR = 2,
+ KERNARG_SEGMENT_PTR = 3,
+ DISPATCH_ID = 4,
+ FLAT_SCRATCH_INIT = 5,
+ WORKGROUP_ID_X = 10,
+ WORKGROUP_ID_Y = 11,
+ WORKGROUP_ID_Z = 12,
+ PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+
+ // VGPRS:
+ FIRST_VGPR_VALUE = 15,
+ WORKITEM_ID_X = FIRST_VGPR_VALUE,
+ WORKITEM_ID_Y = 16,
+ WORKITEM_ID_Z = 17
+ };
+
+ /// \brief Returns the physical register that \p Value is stored in.
+ unsigned getPreloadedValue(const MachineFunction &MF,
+ enum PreloadedValue Value) const;
+
+ unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC,
+ const MachineFunction &MF) const;
+
+ unsigned getSGPRPressureSet() const { return SGPRSetID; };
+ unsigned getVGPRPressureSet() const { return VGPRSetID; };
+
+ const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI,
+ unsigned Reg) const;
+ bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+
+ bool isSGPRPressureSet(unsigned SetID) const {
+ return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID);
+ }
+ bool isVGPRPressureSet(unsigned SetID) const {
+ return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
+ }
+
+ /// \returns SGPR allocation granularity supported by the subtarget.
+ unsigned getSGPRAllocGranule() const {
+ return 8;
+ }
+
+ /// \returns Total number of SGPRs supported by the subtarget.
+ unsigned getTotalNumSGPRs(const SISubtarget &ST) const;
+
+ /// \returns Number of addressable SGPRs supported by the subtarget.
+ unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;
+
+ /// \returns Number of reserved SGPRs supported by the subtarget.
+ unsigned getNumReservedSGPRs(const SISubtarget &ST,
+ const SIMachineFunctionInfo &MFI) const;
+
+ /// \returns Minimum number of SGPRs that meets given number of waves per
+ /// execution unit requirement for given subtarget.
+ unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;
+
+ /// \returns Maximum number of SGPRs that meets given number of waves per
+ /// execution unit requirement for given subtarget.
+ unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU,
+ bool Addressable) const;
+
+ /// \returns Maximum number of SGPRs that meets number of waves per execution
+ /// unit requirement for function \p MF, or number of SGPRs explicitly
+ /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+
+ /// \returns VGPR allocation granularity supported by the subtarget.
+ unsigned getVGPRAllocGranule() const {
+ return 4;
+ }
+
+ /// \returns Total number of VGPRs supported by the subtarget.
+ unsigned getTotalNumVGPRs() const {
+ return 256;
+ }
+
+ /// \returns Number of reserved VGPRs for debugger use supported by the
+ /// subtarget.
+ unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const;
+
+ /// \returns Minimum number of VGPRs that meets given number of waves per
+ /// execution unit requirement.
+ unsigned getMinNumVGPRs(unsigned WavesPerEU) const;
+
+ /// \returns Maximum number of VGPRs that meets given number of waves per
+ /// execution unit requirement.
+ unsigned getMaxNumVGPRs(unsigned WavesPerEU) const;
+
+ /// \returns Maximum number of VGPRs that meets number of waves per execution
+ /// unit requirement for function \p MF, or number of VGPRs explicitly
+ /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+
+ ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
+ unsigned EltSize) const;
+
+private:
+ void buildSpillLoadStore(MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp,
+ int Index,
+ unsigned ValueReg,
+ bool ValueIsKill,
+ unsigned ScratchRsrcReg,
+ unsigned ScratchOffsetReg,
+ int64_t InstrOffset,
+ MachineMemOperand *MMO,
+ RegScavenger *RS) const;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
new file mode 100644
index 000000000000..31e714b9f6b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -0,0 +1,465 @@
+//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the SI registers
+//===----------------------------------------------------------------------===//
+class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
+ DwarfRegNum<[!cast<int>(HWEncoding)]> {
+ let Namespace = "AMDGPU";
+
+ // This is not yet the complete register encoding. An additional
+ // bit (HWEncoding{8}) is set for VGPRs.
+ let HWEncoding = regIdx;
+}
+
+// Special registers: the 32-bit halves of VCC.
+def VCC_LO : SIReg<"vcc_lo", 106>;
+def VCC_HI : SIReg<"vcc_hi", 107>;
+
+// VCC for 64-bit instructions: VCC_LO/VCC_HI as sub0/sub1, encoded as VCC_LO.
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
+ DwarfRegAlias<VCC_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 106;
+}
+
+def EXEC_LO : SIReg<"exec_lo", 126>;
+def EXEC_HI : SIReg<"exec_hi", 127>;
+
+def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, // 64-bit EXEC_LO/EXEC_HI pair.
+ DwarfRegAlias<EXEC_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 126;
+}
+
+def SCC : SIReg<"scc", 253>;
+def M0 : SIReg <"m0", 124>;
+
+// Trap handler registers: TBA and TMA as 64-bit pairs of 32-bit halves, plus TTMP0-TTMP11.
+def TBA_LO : SIReg<"tba_lo", 108>;
+def TBA_HI : SIReg<"tba_hi", 109>;
+
+def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
+ DwarfRegAlias<TBA_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 108;
+}
+
+def TMA_LO : SIReg<"tma_lo", 110>;
+def TMA_HI : SIReg<"tma_hi", 111>;
+
+def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
+ DwarfRegAlias<TMA_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 110;
+}
+
+def TTMP0 : SIReg <"ttmp0", 112>;
+def TTMP1 : SIReg <"ttmp1", 113>;
+def TTMP2 : SIReg <"ttmp2", 114>;
+def TTMP3 : SIReg <"ttmp3", 115>;
+def TTMP4 : SIReg <"ttmp4", 116>;
+def TTMP5 : SIReg <"ttmp5", 117>;
+def TTMP6 : SIReg <"ttmp6", 118>;
+def TTMP7 : SIReg <"ttmp7", 119>;
+def TTMP8 : SIReg <"ttmp8", 120>;
+def TTMP9 : SIReg <"ttmp9", 121>;
+def TTMP10 : SIReg <"ttmp10", 122>;
+def TTMP11 : SIReg <"ttmp11", 123>;
+
+multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
+ def _ci : SIReg<n, ci_e>; // Variant carrying the ci_e encoding.
+ def _vi : SIReg<n, vi_e>; // Variant carrying the vi_e encoding.
+ def "" : SIReg<"", 0>; // Unsuffixed variant: empty name, encoding 0.
+}
+
+class FlatReg <Register lo, Register hi, bits<16> encoding> : // 64-bit lo/hi pair.
+ RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
+ DwarfRegAlias<lo> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = encoding;
+}
+
+defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256-bytes.
+defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes.
+
+def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>;
+def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>;
+def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
+
+// SGPR registers: SGPR0-SGPR103, encoded by their index.
+foreach Index = 0-103 in {
+ def SGPR#Index : SIReg <"SGPR"#Index, Index>;
+}
+
+// VGPR registers: VGPR0-VGPR255, encoded by their index with bit 8 set.
+foreach Index = 0-255 in {
+ def VGPR#Index : SIReg <"VGPR"#Index, Index> {
+ let HWEncoding{8} = 1;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Groupings using register classes and tuples
+//===----------------------------------------------------------------------===//
+
+def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { // Contains only SCC.
+ let CopyCost = -1;
+ let isAllocatable = 0;
+}
+
+def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { // Contains only M0.
+ let CopyCost = 1;
+ let isAllocatable = 0;
+}
+
+// TODO: Do we need to set DwarfRegAlias on register tuples?
+
+// SGPR 32-bit registers (SGPR0-SGPR103)
+def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add (sequence "SGPR%u", 0, 103))> {
+ // Give all SGPR classes higher priority than VGPR classes, because
+ // we want to spill SGPRs to VGPRs.
+ let AllocationPriority = 7;
+}
+
+// SGPR 64-bit registers: sub0 is taken from every 2nd SGPR.
+def SGPR_64Regs : RegisterTuples<[sub0, sub1],
+ [(add (decimate SGPR_32, 2)),
+ (add (decimate (shl SGPR_32, 1), 2))]>;
+
+// SGPR 128-bit registers: sub0 is taken from every 4th SGPR.
+def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+ [(add (decimate SGPR_32, 4)),
+ (add (decimate (shl SGPR_32, 1), 4)),
+ (add (decimate (shl SGPR_32, 2), 4)),
+ (add (decimate (shl SGPR_32, 3), 4))]>;
+
+// SGPR 256-bit registers: sub0 is still taken from every 4th SGPR.
+def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+ [(add (decimate SGPR_32, 4)),
+ (add (decimate (shl SGPR_32, 1), 4)),
+ (add (decimate (shl SGPR_32, 2), 4)),
+ (add (decimate (shl SGPR_32, 3), 4)),
+ (add (decimate (shl SGPR_32, 4), 4)),
+ (add (decimate (shl SGPR_32, 5), 4)),
+ (add (decimate (shl SGPR_32, 6), 4)),
+ (add (decimate (shl SGPR_32, 7), 4))]>;
+
+// SGPR 512-bit registers: sub0 is still taken from every 4th SGPR.
+def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+ sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+ [(add (decimate SGPR_32, 4)),
+ (add (decimate (shl SGPR_32, 1), 4)),
+ (add (decimate (shl SGPR_32, 2), 4)),
+ (add (decimate (shl SGPR_32, 3), 4)),
+ (add (decimate (shl SGPR_32, 4), 4)),
+ (add (decimate (shl SGPR_32, 5), 4)),
+ (add (decimate (shl SGPR_32, 6), 4)),
+ (add (decimate (shl SGPR_32, 7), 4)),
+ (add (decimate (shl SGPR_32, 8), 4)),
+ (add (decimate (shl SGPR_32, 9), 4)),
+ (add (decimate (shl SGPR_32, 10), 4)),
+ (add (decimate (shl SGPR_32, 11), 4)),
+ (add (decimate (shl SGPR_32, 12), 4)),
+ (add (decimate (shl SGPR_32, 13), 4)),
+ (add (decimate (shl SGPR_32, 14), 4)),
+ (add (decimate (shl SGPR_32, 15), 4))]>;
+
+// Trap handler TMP 32-bit registers (TTMP0-TTMP11, not allocatable)
+def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+ (add (sequence "TTMP%u", 0, 11))> {
+ let isAllocatable = 0;
+}
+
+// Trap handler TMP 64-bit registers: sub0 is taken from every 2nd TTMP.
+def TTMP_64Regs : RegisterTuples<[sub0, sub1],
+ [(add (decimate TTMP_32, 2)),
+ (add (decimate (shl TTMP_32, 1), 2))]>;
+
+// Trap handler TMP 128-bit registers: sub0 is taken from every 4th TTMP.
+def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+ [(add (decimate TTMP_32, 4)),
+ (add (decimate (shl TTMP_32, 1), 4)),
+ (add (decimate (shl TTMP_32, 2), 4)),
+ (add (decimate (shl TTMP_32, 3), 4))]>;
+
+// VGPR 32-bit registers (VGPR0-VGPR255)
+def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add (sequence "VGPR%u", 0, 255))> {
+ let AllocationPriority = 1;
+ let Size = 32;
+}
+
+// VGPR 64-bit registers: sub0 may be any of the first 255 VGPRs (no decimate).
+def VGPR_64 : RegisterTuples<[sub0, sub1],
+ [(add (trunc VGPR_32, 255)),
+ (add (shl VGPR_32, 1))]>;
+
+// VGPR 96-bit registers: sub0 may be any of the first 254 VGPRs.
+def VGPR_96 : RegisterTuples<[sub0, sub1, sub2],
+ [(add (trunc VGPR_32, 254)),
+ (add (shl VGPR_32, 1)),
+ (add (shl VGPR_32, 2))]>;
+
+// VGPR 128-bit registers: sub0 may be any of the first 253 VGPRs.
+def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+ [(add (trunc VGPR_32, 253)),
+ (add (shl VGPR_32, 1)),
+ (add (shl VGPR_32, 2)),
+ (add (shl VGPR_32, 3))]>;
+
+// VGPR 256-bit registers: sub0 may be any of the first 249 VGPRs.
+def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+ [(add (trunc VGPR_32, 249)),
+ (add (shl VGPR_32, 1)),
+ (add (shl VGPR_32, 2)),
+ (add (shl VGPR_32, 3)),
+ (add (shl VGPR_32, 4)),
+ (add (shl VGPR_32, 5)),
+ (add (shl VGPR_32, 6)),
+ (add (shl VGPR_32, 7))]>;
+
+// VGPR 512-bit registers: sub0 may be any of the first 241 VGPRs.
+def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+ sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+ [(add (trunc VGPR_32, 241)),
+ (add (shl VGPR_32, 1)),
+ (add (shl VGPR_32, 2)),
+ (add (shl VGPR_32, 3)),
+ (add (shl VGPR_32, 4)),
+ (add (shl VGPR_32, 5)),
+ (add (shl VGPR_32, 6)),
+ (add (shl VGPR_32, 7)),
+ (add (shl VGPR_32, 8)),
+ (add (shl VGPR_32, 9)),
+ (add (shl VGPR_32, 10)),
+ (add (shl VGPR_32, 11)),
+ (add (shl VGPR_32, 12)),
+ (add (shl VGPR_32, 13)),
+ (add (shl VGPR_32, 14)),
+ (add (shl VGPR_32, 15))]>;
+
+//===----------------------------------------------------------------------===//
+// Register classes used as source and destination
+//===----------------------------------------------------------------------===//
+
+// Subset of SReg_32 without M0 and EXEC, for SMRD instructions and the like.
+// See comments in SIInstructions.td for more info.
+def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
+ TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
+ let AllocationPriority = 7;
+}
+
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
+ let AllocationPriority = 7;
+}
+
+// Register class for all scalar registers (SGPRs + Special Registers)
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> {
+ let AllocationPriority = 7;
+}
+
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+ let CopyCost = 1;
+ let AllocationPriority = 8;
+}
+
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
+ let isAllocatable = 0;
+}
+
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+ (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> {
+ let CopyCost = 1;
+ let AllocationPriority = 8;
+}
+
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+ (add SReg_64_XEXEC, EXEC)> {
+ let CopyCost = 1;
+ let AllocationPriority = 8;
+}
+
+// Requires 2 s_mov_b64 to copy
+let CopyCost = 2 in {
+
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
+ let AllocationPriority = 10;
+}
+
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
+ let isAllocatable = 0;
+}
+
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
+ let AllocationPriority = 10;
+}
+
+} // End CopyCost = 2
+
+def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
+ // Requires 4 s_mov_b64 to copy
+ let CopyCost = 4;
+ let AllocationPriority = 11;
+}
+
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
+ // Requires 8 s_mov_b64 to copy
+ let CopyCost = 8;
+ let AllocationPriority = 12;
+}
+
+// Register class for all vector registers (VGPRs + Interpolation Registers)
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+ let Size = 64;
+
+ // Requires 2 v_mov_b32 to copy
+ let CopyCost = 2;
+ let AllocationPriority = 2;
+}
+
+def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
+ let Size = 96;
+
+ // Requires 3 v_mov_b32 to copy
+ let CopyCost = 3;
+ let AllocationPriority = 3;
+}
+
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
+ let Size = 128;
+
+ // Requires 4 v_mov_b32 to copy
+ let CopyCost = 4;
+ let AllocationPriority = 4;
+}
+
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
+ let Size = 256;
+ let CopyCost = 8;
+ let AllocationPriority = 5;
+}
+
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
+ let Size = 512;
+ let CopyCost = 16;
+ let AllocationPriority = 6;
+}
+
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+ let Size = 32;
+}
+
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add VGPR_32, SReg_32)> {
+ let isAllocatable = 0;
+}
+
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+ let isAllocatable = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Register operands
+//===----------------------------------------------------------------------===//
+
+class RegImmMatcher<string name> : AsmOperandClass {
+ let Name = name;
+ let RenderMethod = "addRegOrImmOperands";
+}
+
+multiclass SIRegOperand <string rc, string MatchName, string opType> {
+ let OperandNamespace = "AMDGPU" in {
+ def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_INT16";
+ let ParserMatchClass = RegImmMatcher<MatchName#"B16">;
+ let DecoderMethod = "decodeOperand_VSrc16";
+ }
+
+ def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_FP16";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
+ let DecoderMethod = "decodeOperand_VSrc16";
+ }
+
+ def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_INT32";
+ let ParserMatchClass = RegImmMatcher<MatchName#"B32">;
+ }
+
+ def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_FP32";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
+ }
+
+ def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ let OperandType = opType#"_INT64";
+ let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
+ }
+
+ def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ let OperandType = opType#"_FP64";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
+ }
+ }
+}
+
+// FIXME: 64-bit sources can sometimes use 32-bit constants.
+multiclass RegImmOperand <string rc, string MatchName>
+ : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
+
+multiclass RegInlineOperand <string rc, string MatchName>
+ : SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">;
+
+//===----------------------------------------------------------------------===//
+// SSrc_* Operands with an SGPR or a 32-bit immediate
+//===----------------------------------------------------------------------===//
+
+defm SSrc : RegImmOperand<"SReg", "SSrc">;
+
+//===----------------------------------------------------------------------===//
+// SCSrc_* Operands with an SGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
+
+//===----------------------------------------------------------------------===//
+// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
+//===----------------------------------------------------------------------===//
+
+defm VSrc : RegImmOperand<"VS", "VSrc">;
+
+def VSrc_128 : RegisterOperand<VReg_128>;
+
+//===----------------------------------------------------------------------===//
+// VSrc_* Operands with a VGPR
+//===----------------------------------------------------------------------===//
+
+// This is for operands with the enum(9), VSrc encoding restriction,
+// but only allows VGPRs.
+def VRegSrc_32 : RegisterOperand<VGPR_32> {
+ //let ParserMatchClass = RegImmMatcher<"VRegSrc32">;
+ let DecoderMethod = "DecodeVS_32RegisterClass";
+}
+
+//===----------------------------------------------------------------------===//
+// VCSrc_* Operands with an SGPR, VGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+defm VCSrc : RegInlineOperand<"VS", "VCSrc">;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
new file mode 100644
index 000000000000..be27966fd5f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -0,0 +1,138 @@
+//===-- SISchedule.td - SI Scheduling definitions ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineModel definitions for Southern Islands (SI)
+//
+//===----------------------------------------------------------------------===//
+
+def : PredicateProlog<[{
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(SchedModel->getInstrInfo());
+ (void)TII;
+}]>;
+
+def WriteBranch : SchedWrite;
+def WriteExport : SchedWrite;
+def WriteLDS : SchedWrite;
+def WriteSALU : SchedWrite;
+def WriteSMEM : SchedWrite;
+def WriteVMEM : SchedWrite;
+def WriteBarrier : SchedWrite;
+
+// Vector ALU instructions
+def Write32Bit : SchedWrite;
+def WriteQuarterRate32 : SchedWrite;
+def WriteFullOrQuarterRate32 : SchedWrite;
+
+def WriteFloatFMA : SchedWrite;
+
+// Slow quarter rate f64 instruction.
+def WriteDouble : SchedWrite;
+
+// half rate f64 instruction (same as v_add_f64)
+def WriteDoubleAdd : SchedWrite;
+
+// Half rate 64-bit instructions.
+def Write64Bit : SchedWrite;
+
+// FIXME: Should there be a class for instructions which are VALU
+// instructions and have VALU rates, but write to the SALU (i.e. VOPC
+// instructions)?
+
+class SISchedMachineModel : SchedMachineModel {
+ let CompleteModel = 1;
+ // MicroOpBufferSize = 1 means that instructions will always be added to
+ // the ready queue when they become available. This exposes them
+ // to the register pressure analysis.
+ let MicroOpBufferSize = 1;
+ let IssueWidth = 1;
+ let PostRAScheduler = 1;
+}
+
+def SIFullSpeedModel : SISchedMachineModel;
+def SIQuarterSpeedModel : SISchedMachineModel;
+
+// XXX: Are the resource counts correct?
+def HWBranch : ProcResource<1> {
+ let BufferSize = 1;
+}
+def HWExport : ProcResource<1> {
+ let BufferSize = 7; // Taken from S_WAITCNT
+}
+def HWLGKM : ProcResource<1> {
+ let BufferSize = 31; // Taken from S_WAITCNT
+}
+def HWSALU : ProcResource<1> {
+ let BufferSize = 1;
+}
+def HWVMEM : ProcResource<1> {
+ let BufferSize = 15; // Taken from S_WAITCNT
+}
+def HWVALU : ProcResource<1> {
+ let BufferSize = 1;
+}
+
+class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
+ int latency> : WriteRes<write, resources> {
+ let Latency = latency;
+}
+
+class HWVALUWriteRes<SchedWrite write, int latency> :
+ HWWriteRes<write, [HWVALU], latency>;
+
+
+// The latency numbers are taken from AMD Accelerated Parallel Processing
+// guide. They may not be accurate.
+
+// The latency values are 1 / (operations / cycle) / 4.
+multiclass SICommonWriteRes {
+
+ def : HWWriteRes<WriteBranch, [HWBranch], 8>;
+ def : HWWriteRes<WriteExport, [HWExport], 4>;
+ def : HWWriteRes<WriteLDS, [HWLGKM], 5>; // Can be between 2 and 64
+ def : HWWriteRes<WriteSALU, [HWSALU], 1>;
+ def : HWWriteRes<WriteSMEM, [HWLGKM], 5>;
+ def : HWWriteRes<WriteVMEM, [HWVMEM], 80>;
+ def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
+
+ def : HWVALUWriteRes<Write32Bit, 1>;
+ def : HWVALUWriteRes<Write64Bit, 2>;
+ def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+}
+
+def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
+def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>;
+def WriteCopy : SchedWriteVariant<[
+ SchedVar<PredIsVGPR32Copy, [Write32Bit]>,
+ SchedVar<PredIsVGPR64Copy, [Write64Bit]>,
+ SchedVar<NoSchedPred, [WriteSALU]>]>;
+
+let SchedModel = SIFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 4>;
+def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+} // End SchedModel = SIFullSpeedModel
+
+let SchedModel = SIQuarterSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 16>;
+def : HWVALUWriteRes<WriteDouble, 16>;
+def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+} // End SchedModel = SIQuarterSpeedModel
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
new file mode 100644
index 000000000000..b27d7c691032
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -0,0 +1,512 @@
+//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// The pass tries to use the 32-bit encoding for instructions when possible.
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUMCInstLower.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-shrink-instructions"
+
+STATISTIC(NumInstructionsShrunk,
+ "Number of 64-bit instruction reduced to 32-bit.");
+STATISTIC(NumLiteralConstantsFolded,
+ "Number of literal constants folded into 32-bit instructions.");
+
+using namespace llvm;
+
+namespace {
+
+class SIShrinkInstructions : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIShrinkInstructions() : MachineFunctionPass(ID) {
+ }
+ // Does the actual shrinking; defined after the static helpers below.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "SI Shrink Instructions"; }
+ // The pass declares the CFG preserved and otherwise defers to the base class.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
+ "SI Shrink Instructions", false, false)
+
+char SIShrinkInstructions::ID = 0;
+
+FunctionPass *llvm::createSIShrinkInstructionsPass() {
+ return new SIShrinkInstructions();
+}
+
+static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI) { // True if MO is a register whose class has VGPRs.
+ if (!MO->isReg())
+ return false; // Non-register operands are never reported as VGPRs.
+ // Virtual registers take their class from MRI, physical registers from TRI.
+ if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
+ return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
+
+ return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
+}
+
+static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
+ const SIRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI) {
+ // Returns true if MI's operands and modifiers pass every check below.
+ const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ // Can't shrink instruction with three operands.
+ // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
+ // a special case for it. It can only be shrunk if the third operand
+ // is vcc. We should handle this the same way we handle vopc, by adding
+ // a register allocation hint pre-regalloc and then do the shrinking
+ // post-regalloc.
+ if (Src2) {
+ switch (MI.getOpcode()) {
+ default: return false;
+ // V_MAC: src2 must be a VGPR and must have no source modifiers set.
+ case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_MAC_F16_e64:
+ if (!isVGPR(Src2, TRI, MRI) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
+ return false;
+ break;
+ // Accepted here; runOnMachineFunction requires src2 to be VCC before shrinking.
+ case AMDGPU::V_CNDMASK_B32_e64: // NOTE(review): the FIXME above looks stale.
+ break;
+ }
+ }
+
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ const MachineOperand *Src1Mod =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+ // src1, when present, must be a VGPR with a zero modifier immediate.
+ if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
+ return false;
+
+ // We don't need to check src0, all input types are legal, so just make sure
+ // src0 isn't using any modifiers.
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
+ return false;
+
+ // Check output modifiers
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return false;
+
+ return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
+}
+
+/// \brief This function checks \p MI for operands defined by a move immediate
+/// instruction and then folds the literal constant into the instruction if it
+/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
+/// and will only fold literal constants if we are still in SSA.
+static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI, bool TryToCommute = true) {
+ // \p TryToCommute is passed as false on the recursive retry at the bottom.
+ if (!MRI.isSSA())
+ return;
+
+ assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+
+ // Only one literal constant is allowed per instruction, so if src0 is a
+ // literal constant then we can't do any folding.
+ if (TII->isLiteralConstant(MI, Src0Idx))
+ return;
+
+ // Try to fold Src0
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
+ unsigned Reg = Src0.getReg();
+ MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+ if (Def && Def->isMoveImmediate()) {
+ MachineOperand &MovSrc = Def->getOperand(1);
+ bool ConstantFolded = false;
+ // Only immediates that fit in 32 bits (signed or unsigned) are folded.
+ if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
+ isUInt<32>(MovSrc.getImm()))) {
+ Src0.ChangeToImmediate(MovSrc.getImm());
+ ConstantFolded = true;
+ }
+ if (ConstantFolded) {
+ if (MRI.use_empty(Reg))
+ Def->eraseFromParent(); // The defining mov has no remaining uses.
+ ++NumLiteralConstantsFolded;
+ return;
+ }
+ }
+ }
+
+ // We have failed to fold src0, so commute the instruction and try again.
+ if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
+ foldImmediates(MI, TII, MRI, false); // false: do not commute a second time.
+
+}
+
+// Copy the undef and kill flags of \p Orig onto \p MI's implicit VCC operand.
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+ const MachineOperand &Orig) {
+ // Only the first implicit operand that names VCC is updated.
+ for (MachineOperand &Use : MI.implicit_operands()) {
+ if (Use.getReg() == AMDGPU::VCC) {
+ Use.setIsUndef(Orig.isUndef());
+ Use.setIsKill(Orig.isKill());
+ return;
+ }
+ }
+}
+
+static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+ return isInt<16>(Src.getImm()) && // True if the value fits in signed 16 bits
+ !TII->isInlineConstant(*Src.getParent(), // and is not an inline constant.
+ Src.getParent()->getOperandNo(&Src)); // getImm() is called unconditionally.
+}
+
+static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+ return isUInt<16>(Src.getImm()) && // True if the value fits in unsigned 16 bits
+ !TII->isInlineConstant(*Src.getParent(), // and is not an inline constant.
+ Src.getParent()->getOperandNo(&Src)); // getImm() is called unconditionally.
+}
+
+static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
+ const MachineOperand &Src,
+ bool &IsUnsigned) {
+ if (isInt<16>(Src.getImm())) { // The signed 16-bit form is tried first.
+ IsUnsigned = false;
+ return !TII->isInlineConstant(Src);
+ }
+ // Otherwise accept values that fit only as an unsigned 16-bit immediate.
+ if (isUInt<16>(Src.getImm())) {
+ IsUnsigned = true;
+ return !TII->isInlineConstant(Src);
+ }
+ // \p IsUnsigned is left unmodified when the value fits neither form.
+ return false;
+}
+
+/// \returns true if the constant in \p Src should be replaced with a bitreverse
+/// of an inline immediate.
+static bool isReverseInlineImm(const SIInstrInfo *TII,
+ const MachineOperand &Src,
+ int32_t &ReverseImm) {
+ if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
+ return false;
+ // From here on \p ReverseImm is overwritten even if false is returned.
+ ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
+ return ReverseImm >= -16 && ReverseImm <= 64; // Range accepted as inline here.
+}
+
+/// Copy to \p NewMI the implicit register operands and register masks of
+/// \p MI that come after the operands counted by MI's instruction descriptor.
+static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
+ const MachineInstr &MI) {
+ for (unsigned i = MI.getDesc().getNumOperands() +
+ MI.getDesc().getNumImplicitUses() +
+ MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
+ i != e; ++i) { // Starts just past the descriptor's declared and implicit operands.
+ const MachineOperand &MO = MI.getOperand(i);
+ if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
+ NewMI.addOperand(MF, MO);
+ }
+}
+
+static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
+ // Try to switch MI to its cmpk form, which does scc = dst <cc op> imm16.
+ // Commute first, if needed, so that constants end up on the RHS.
+ if (!MI.getOperand(0).isReg())
+ TII->commuteInstruction(MI, false, 0, 1);
+
+ const MachineOperand &Src1 = MI.getOperand(1);
+ if (!Src1.isImm())
+ return;
+ // getSOPKOp returning -1 is treated as "no SOPK form"; MI is left alone.
+ int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
+ if (SOPKOpc == -1)
+ return;
+
+ // eq/ne is special because the imm16 can be treated as signed or unsigned,
+ // and is initially selected to the unsigned versions.
+ if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
+ bool HasUImm;
+ if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
+ if (!HasUImm) {
+ SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
+ AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
+ }
+ // Only the instruction descriptor is swapped; operands are not touched.
+ MI.setDesc(TII->get(SOPKOpc));
+ }
+
+ return;
+ }
+
+ const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
+ // Other compares: the immediate must fit the form sopkIsZext selects.
+ if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
+ (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
+ MI.setDesc(NewDesc);
+ }
+}
+
+bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+ // Walks every instruction once and applies the size reductions below.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ std::vector<unsigned> I1Defs; // NOTE(review): not referenced again in this function.
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I); // Taken up front because MI may be erased below.
+ MachineInstr &MI = *I;
+
+ if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
+ // If this has a literal constant source that is the same as the
+ // reversed bits of an inline immediate, replace with a bitreverse of
+ // that constant. This saves 4 bytes in the common case of materializing
+ // sign bits.
+
+ // Test if we are after regalloc. We only want to do this after any
+ // optimizations happen because this will confuse them.
+ // XXX - not exactly a check for post-regalloc run.
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.isImm() &&
+ TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+ int32_t ReverseImm;
+ if (isReverseInlineImm(TII, Src, ReverseImm)) {
+ MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
+ Src.setImm(ReverseImm);
+ continue;
+ }
+ }
+ }
+
+ // Combine adjacent s_nops to use the immediate operand encoding how long
+ // to wait.
+ //
+ // s_nop N
+ // s_nop M
+ // =>
+ // s_nop (N + M)
+ if (MI.getOpcode() == AMDGPU::S_NOP &&
+ Next != MBB.end() &&
+ (*Next).getOpcode() == AMDGPU::S_NOP) {
+
+ MachineInstr &NextMI = *Next;
+ // The instruction encodes the amount to wait with an offset of 1,
+ // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
+ // after adding.
+ uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
+ uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
+
+ // Make sure we don't overflow the bounds.
+ if (Nop0 + Nop1 <= 8) {
+ NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
+ MI.eraseFromParent();
+ }
+
+ continue;
+ }
+
+ // FIXME: We also need to consider movs of constant operands since
+ // immediate operands are not folded if they have more than one use, and
+ // the operand folding pass is unaware if the immediate will be free since
+ // it won't know if the src == dest constraint will end up being
+ // satisfied.
+ if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
+ MI.getOpcode() == AMDGPU::S_MUL_I32) {
+ const MachineOperand *Dest = &MI.getOperand(0);
+ MachineOperand *Src0 = &MI.getOperand(1);
+ MachineOperand *Src1 = &MI.getOperand(2);
+
+ if (!Src0->isReg() && Src1->isReg()) {
+ if (TII->commuteInstruction(MI, false, 1, 2))
+ std::swap(Src0, Src1);
+ }
+
+ // FIXME: This could work better if hints worked with subregisters. If
+ // we have a vector add of a constant, we usually don't get the correct
+ // allocation due to the subregister usage.
+ if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+ Src0->isReg()) {
+ MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
+ MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
+ continue;
+ }
+
+ if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
+ if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
+ unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
+ AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+
+ MI.setDesc(TII->get(Opc));
+ MI.tieOperands(0, 1);
+ }
+ }
+ }
+
+ // Try to use s_cmpk_*
+ if (MI.isCompare() && TII->isSOPC(MI)) {
+ shrinkScalarCompare(TII, MI);
+ continue;
+ }
+
+ // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
+ if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
+ const MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
+
+ if (Src.isImm() &&
+ TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
+ int32_t ReverseImm;
+ if (isKImmOperand(TII, Src))
+ MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
+ else if (isReverseInlineImm(TII, Src, ReverseImm)) {
+ MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
+ Src.setImm(ReverseImm);
+ }
+ }
+
+ continue;
+ }
+
+ if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
+ continue;
+
+ if (!canShrink(MI, TII, TRI, MRI)) {
+ // Try commuting the instruction and see if that enables us to shrink
+ // it.
+ if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
+ !canShrink(MI, TII, TRI, MRI))
+ continue;
+ }
+
+ // getVOPe32 could be -1 here if we started with an instruction that had
+ // a 32-bit encoding and then commuted it to an instruction that did not.
+ if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
+ continue;
+
+ int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
+ if (TII->isVOPC(Op32)) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ // VOPC instructions can only write to the VCC register. We can't
+ // force them to use VCC here, because this is only one register and
+ // cannot deal with sequences which would require multiple copies of
+ // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+ //
+ // So, instead of forcing the instruction to write to VCC, we provide
+ // a hint to the register allocator to use VCC and then we will run
+ // this pass again after RA and shrink it if it outputs to VCC.
+ MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
+ continue;
+ }
+ if (DstReg != AMDGPU::VCC)
+ continue;
+ }
+
+ if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
+ // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
+ // instructions.
+ const MachineOperand *Src2 =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (!Src2->isReg())
+ continue;
+ unsigned SReg = Src2->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+ MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
+ continue;
+ }
+ if (SReg != AMDGPU::VCC)
+ continue;
+ }
+
+ // We can shrink this instruction
+ DEBUG(dbgs() << "Shrinking " << MI);
+
+ MachineInstrBuilder Inst32 =
+ BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
+
+ // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
+ // For VOPC instructions, this is replaced by an implicit def of vcc.
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
+ if (Op32DstIdx != -1) {
+ // dst
+ Inst32.addOperand(MI.getOperand(0));
+ } else {
+ assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+ "Unexpected case");
+ }
+
+
+ Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+
+ const MachineOperand *Src1 =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Src1)
+ Inst32.addOperand(*Src1);
+
+ const MachineOperand *Src2 =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (Src2) {
+ int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
+ if (Op32Src2Idx != -1) {
+ Inst32.addOperand(*Src2);
+ } else {
+ // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
+ // replaced with an implicit read of vcc. This was already added
+ // during the initial BuildMI, so find it to preserve the flags.
+ copyFlagsToImplicitVCC(*Inst32, *Src2);
+ }
+ }
+
+ ++NumInstructionsShrunk;
+
+ // Copy extra operands not present in the instruction definition.
+ copyExtraImplicitOps(*Inst32, MF, MI);
+
+ MI.eraseFromParent();
+ foldImmediates(*Inst32, TII, MRI);
+
+ DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
+
+
+ }
+ }
+ return false; // NOTE(review): returned even when instructions were rewritten -- confirm intended.
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
new file mode 100644
index 000000000000..aad68537f779
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -0,0 +1,156 @@
+//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass performs the following type substitution on all
+/// non-compute shaders:
+///
+/// v16i8 => i128
+/// - v16i8 is used for constant memory resource descriptors. This type is
+/// legal for some compute APIs, and we don't want to declare it as legal
+/// in the backend, because we want the legalizer to expand all v16i8
+/// operations.
+/// v1* => *
+/// - Having v1* types complicates the legalizer and we can easily replace
+/// them with the element type.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+
+class SITypeRewriter : public FunctionPass,
+ public InstVisitor<SITypeRewriter> {
+ // Mod, v16i8 and v4i32 are assigned in doInitialization(), not the constructor.
+ static char ID;
+ Module *Mod;
+ Type *v16i8;
+ Type *v4i32;
+
+public:
+ SITypeRewriter() : FunctionPass(ID) { }
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ StringRef getPassName() const override { return "SI Type Rewriter"; }
+ void visitLoadInst(LoadInst &I); // v16i8 load -> v4i32 load + bitcast.
+ void visitCallInst(CallInst &I); // Retargets calls taking v16i8 / v1i32 args.
+ void visitBitCast(BitCastInst &I); // Folds redundant casts back to v4i32.
+};
+
+} // End anonymous namespace
+
+char SITypeRewriter::ID = 0;
+
+bool SITypeRewriter::doInitialization(Module &M) {
+ Mod = &M; // Kept for the function lookup/creation in visitCallInst.
+ v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
+ v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4);
+ return false; // Only the two vector types are cached; nothing in M is changed here.
+}
+
+bool SITypeRewriter::runOnFunction(Function &F) {
+ if (!AMDGPU::isShader(F.getCallingConv()))
+ return false;
+ // NOTE(review): two passes; presumably the second folds casts the first created.
+ visit(F);
+ visit(F);
+ // NOTE(review): returns false even though the visitors may have rewritten IR.
+ return false;
+}
+
+void SITypeRewriter::visitLoadInst(LoadInst &I) {
+ Value *Ptr = I.getPointerOperand();
+ Type *PtrTy = Ptr->getType();
+ Type *ElemTy = PtrTy->getPointerElementType();
+ IRBuilder<> Builder(&I);
+ if (ElemTy == v16i8) { // Replace a v16i8 load by a v4i32 load plus a bitcast back.
+ Value *BitCast = Builder.CreateBitCast(Ptr,
+ PointerType::get(v4i32,PtrTy->getPointerAddressSpace()));
+ LoadInst *Load = Builder.CreateLoad(BitCast); // NOTE(review): only metadata is copied from I below.
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ I.getAllMetadataOtherThanDebugLoc(MD); // Carry over all non-debug-location metadata.
+ for (unsigned i = 0, e = MD.size(); i != e; ++i) {
+ Load->setMetadata(MD[i].first, MD[i].second);
+ }
+ Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType());
+ I.replaceAllUsesWith(BitCastLoad);
+ I.eraseFromParent();
+ }
+}
+
+void SITypeRewriter::visitCallInst(CallInst &I) {
+ IRBuilder<> Builder(&I);
+ // Rebuilds the call with v16i8 args bitcast to v4i32 and v1i32 args unwrapped.
+ SmallVector <Value*, 8> Args;
+ SmallVector <Type*, 8> Types;
+ bool NeedToReplace = false;
+ Function *F = I.getCalledFunction();
+ if (!F)
+ return; // No directly called function: leave the call untouched.
+ // The callee name is rewritten alongside the argument types.
+ std::string Name = F->getName();
+ for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+ Value *Arg = I.getArgOperand(i);
+ if (Arg->getType() == v16i8) {
+ Args.push_back(Builder.CreateBitCast(Arg, v4i32));
+ Types.push_back(v4i32);
+ NeedToReplace = true;
+ Name = Name + ".v4i32";
+ } else if (Arg->getType()->isVectorTy() &&
+ Arg->getType()->getVectorNumElements() == 1 &&
+ Arg->getType()->getVectorElementType() ==
+ Type::getInt32Ty(I.getContext())){
+ Type *ElementTy = Arg->getType()->getVectorElementType();
+ std::string TypeName = "i32";
+ InsertElementInst *Def = cast<InsertElementInst>(Arg); // NOTE(review): assumes Arg is an insertelement.
+ Args.push_back(Def->getOperand(1));
+ Types.push_back(ElementTy);
+ std::string VecTypeName = "v1" + TypeName; // NOTE(review): find() below is npos if Name lacks "v1i32".
+ Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName);
+ NeedToReplace = true;
+ } else {
+ Args.push_back(Arg);
+ Types.push_back(Arg->getType());
+ }
+ }
+
+ if (!NeedToReplace) {
+ return;
+ }
+ Function *NewF = Mod->getFunction(Name); // Reuse an existing declaration with the new name, if any.
+ if (!NewF) {
+ NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod);
+ NewF->setAttributes(F->getAttributes());
+ }
+ I.replaceAllUsesWith(Builder.CreateCall(NewF, Args));
+ I.eraseFromParent();
+}
+
+void SITypeRewriter::visitBitCast(BitCastInst &I) {
+ IRBuilder<> Builder(&I); // NOTE(review): Builder is not used in this function.
+ if (I.getDestTy() != v4i32) {
+ return;
+ }
+ // Fold bitcast(bitcast(x : v4i32)) to v4i32 back to x itself.
+ if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
+ if (Op->getSrcTy() == v4i32) {
+ I.replaceAllUsesWith(Op->getOperand(0));
+ I.eraseFromParent();
+ }
+ }
+}
+
+FunctionPass *llvm::createSITypeRewriter() {
+ return new SITypeRewriter();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
new file mode 100644
index 000000000000..a613a220e29d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -0,0 +1,730 @@
+//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass adds instructions to enable whole quad mode for pixel
+/// shaders.
+///
+/// Whole quad mode is required for derivative computations, but it interferes
+/// with shader side effects (stores and atomics). This pass is run on the
+/// scheduled machine IR but before register coalescing, so that machine SSA is
+/// available for analysis. It ensures that WQM is enabled when necessary, but
+/// disabled around stores and atomics.
+///
+/// When necessary, this pass creates a function prolog
+///
+/// S_MOV_B64 LiveMask, EXEC
+/// S_WQM_B64 EXEC, EXEC
+///
+/// to enter WQM at the top of the function and surrounds blocks of Exact
+/// instructions by
+///
+/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
+/// In order to avoid excessive switching during sequences of Exact
+/// instructions, the pass first analyzes which instructions must be run in WQM
+/// (aka which instructions produce values that lead to derivative
+/// computations).
+///
+/// Basic blocks are always exited in WQM as long as some successor needs WQM.
+///
+/// There is room for improvement given better control flow analysis:
+///
+/// (1) at the top level (outside of control flow statements, and as long as
+/// kill hasn't been used), one SGPR can be saved by recovering WQM from
+/// the LiveMask (this is implemented for the entry block).
+///
+/// (2) when entire regions (e.g. if-else blocks or entire loops) only
+/// consist of exact and don't-care instructions, the switch only has to
+/// be done at the entry and exit points rather than potentially in each
+/// block of the region.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-wqm"
+
+namespace {
+
+enum { // Bit flags describing the EXEC state an instruction or block needs; combined with | and tested with &.
+ StateWQM = 0x1, // Whole quad mode is needed.
+ StateExact = 0x2, // Exact mode is needed.
+};
+
+struct PrintState { // Wrapper around a state bitmask so it can be streamed with operator<<.
+public:
+ int State; // Combination of StateWQM / StateExact bits.
+
+ explicit PrintState(int State) : State(State) {}
+};
+
+/// Stream the state bits of \p PS as "WQM", "Exact" or "WQM|Exact". Nothing
+/// is written when neither bit is set.
+static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
+  const bool HasWQM = (PS.State & StateWQM) != 0;
+  const bool HasExact = (PS.State & StateExact) != 0;
+
+  if (HasWQM)
+    OS << "WQM";
+  // The separator is only needed when both names are printed.
+  if (HasWQM && HasExact)
+    OS << '|';
+  if (HasExact)
+    OS << "Exact";
+
+  return OS;
+}
+
+struct InstrInfo { // Per-instruction analysis state (State* bit flags).
+ char Needs = 0; // State this instruction itself requires; set by markInstruction / propagateInstruction.
+ char OutNeeds = 0; // Needs accumulated from later instructions and the block exit, propagated backwards.
+};
+
+struct BlockInfo { // Per-basic-block analysis state (State* bit flags).
+ char Needs = 0; // OR of the Needs of the block's instructions (see propagateInstruction).
+ char InNeeds = 0; // Needs visible at block entry; pushed to predecessors' OutNeeds.
+ char OutNeeds = 0; // Needs at block exit; pushed to the last instruction and to successors' InNeeds.
+};
+
+struct WorkItem { // Worklist entry: refers to either a block or an instruction, never both.
+ MachineBasicBlock *MBB = nullptr; // Set when the item is a block.
+ MachineInstr *MI = nullptr; // Set when the item is an instruction; analyzeFunction checks this first.
+
+ WorkItem() = default;
+ WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} // Implicit on purpose: lets callers push_back a block pointer.
+ WorkItem(MachineInstr *MI) : MI(MI) {} // Implicit on purpose: lets callers push_back an instruction pointer.
+};
+
+class SIWholeQuadMode : public MachineFunctionPass { // Machine pass that inserts WQM/Exact EXEC switches (see file header).
+private:
+ const SIInstrInfo *TII; // Assigned at the start of runOnMachineFunction.
+ const SIRegisterInfo *TRI; // Assigned at the start of runOnMachineFunction.
+ MachineRegisterInfo *MRI; // Assigned at the start of runOnMachineFunction.
+ LiveIntervals *LIS; // Required analysis; kept in sync as instructions are inserted.
+
+ DenseMap<const MachineInstr *, InstrInfo> Instructions; // Per-instruction analysis state.
+ DenseMap<MachineBasicBlock *, BlockInfo> Blocks; // Per-block analysis state.
+ SmallVector<MachineInstr *, 1> LiveMaskQueries; // SI_PS_LIVE instructions collected by scanInstructions.
+
+ void printInfo(); // Debug dump of Blocks and Instructions.
+
+ void markInstruction(MachineInstr &MI, char Flag,
+ std::vector<WorkItem> &Worklist);
+ void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
+ char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); // Seeds the worklist; returns the OR of all seeded flags.
+ void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
+ void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
+ char analyzeFunction(MachineFunction &MF); // Runs the scan plus worklist propagation to a fixed point.
+
+ bool requiresCorrectState(const MachineInstr &MI) const;
+
+ MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before);
+ MachineBasicBlock::iterator
+ prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
+ MachineBasicBlock::iterator Last, bool PreferLast,
+ bool SaveSCC);
+ void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SaveWQM, unsigned LiveMaskReg);
+ void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SavedWQM);
+ void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+
+ void lowerLiveMaskQueries(unsigned LiveMaskReg); // Replaces each SI_PS_LIVE with a COPY from LiveMaskReg.
+
+public:
+ static char ID; // Pass identification.
+
+ SIWholeQuadMode() :
+ MachineFunctionPass(ID) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "SI Whole Quad Mode"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>(); // LIS is queried and updated throughout the pass.
+ AU.setPreservesCFG(); // Only instructions are inserted; no blocks or edges change.
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char SIWholeQuadMode::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+ false)
+
+char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
+
+FunctionPass *llvm::createSIWholeQuadModePass() {
+ return new SIWholeQuadMode; // Factory: returns a newly allocated pass instance.
+}
+
+/// Debug helper: write the computed needs of every analyzed block, followed by
+/// the needs of each of its instructions that has an entry in Instructions,
+/// to dbgs().
+void SIWholeQuadMode::printInfo() {
+  for (const auto &Entry : Blocks) {
+    MachineBasicBlock *MBB = Entry.first;
+    const BlockInfo &Info = Entry.second;
+
+    dbgs() << "\nBB#" << MBB->getNumber() << ":\n";
+    dbgs() << " InNeeds = " << PrintState(Info.InNeeds);
+    dbgs() << ", Needs = " << PrintState(Info.Needs);
+    dbgs() << ", OutNeeds = " << PrintState(Info.OutNeeds) << "\n\n";
+
+    for (const MachineInstr &MI : *MBB) {
+      auto Found = Instructions.find(&MI);
+      // Instructions the analysis never touched have nothing to report.
+      if (Found != Instructions.end()) {
+        const InstrInfo &Instr = Found->second;
+        dbgs() << " " << MI << " Needs = " << PrintState(Instr.Needs);
+        dbgs() << ", OutNeeds = " << PrintState(Instr.OutNeeds) << '\n';
+      }
+    }
+  }
+}
+
+/// Record that \p MI needs the state \p Flag (exactly StateWQM or StateExact)
+/// and queue it on \p Worklist for propagation. An instruction that already
+/// has a need recorded is left unchanged and is not queued again.
+void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
+                                      std::vector<WorkItem> &Worklist) {
+  assert(Flag == StateWQM || Flag == StateExact);
+
+  InstrInfo &Info = Instructions[&MI];
+
+  // The first mark wins. Usually this just filters repeated WQM marks, but an
+  // atomic may be asked for StateWQM after it was already marked StateExact;
+  // keeping it in StateExact is correct as per the relevant specs.
+  if (Info.Needs == 0) {
+    Info.Needs = Flag;
+    Worklist.push_back(&MI);
+  }
+}
+
+/// Mark all instructions defining the uses in \p MI as WQM, queueing newly marked ones on \p Worklist.
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+ std::vector<WorkItem> &Worklist) {
+ for (const MachineOperand &Use : MI.uses()) {
+ if (!Use.isReg() || !Use.isUse()) // Only register reads are of interest.
+ continue;
+
+ unsigned Reg = Use.getReg();
+
+ // Handle physical registers that we need to track; this is mostly relevant
+ // for VCC, which can appear as the (implicit) input of a uniform branch,
+ // e.g. when a loop counter is stored in a VGPR.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Reg == AMDGPU::EXEC) // Reads of EXEC are never traced back to a def.
+ continue;
+
+ for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+ LiveRange &LR = LIS->getRegUnit(*RegUnit);
+ const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); // Value live into MI, if any.
+ if (!Value)
+ continue;
+
+ // Since we're in machine SSA, we do not need to track physical
+ // registers across basic blocks.
+ if (Value->isPHIDef())
+ continue;
+
+ markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+ Worklist);
+ }
+
+ continue;
+ }
+
+ for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) // Virtual register: mark every defining instruction.
+ markInstruction(DefMI, StateWQM, Worklist);
+ }
+}
+
+// Scan instructions to determine which ones require an Exact execmask and
+// which ones seed WQM requirements. Returns the OR of all State* flags seeded.
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
+ std::vector<WorkItem> &Worklist) {
+ char GlobalFlags = 0;
+ bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
+
+ for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
+ MachineBasicBlock &MBB = *BI;
+
+ for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
+ MachineInstr &MI = *II;
+ unsigned Opcode = MI.getOpcode();
+ char Flags = 0;
+
+ if (TII->isDS(Opcode)) { // DS instructions are marked WQM themselves.
+ Flags = StateWQM;
+ } else if (TII->isWQM(Opcode)) {
+ // Sampling instructions don't need to produce results for all pixels
+ // in a quad, they just require all inputs of a quad to have been
+ // computed for derivatives.
+ markUsesWQM(MI, Worklist);
+ GlobalFlags |= StateWQM;
+ continue; // MI itself is not marked, only the definitions of its inputs.
+ } else if (TII->isDisableWQM(MI)) {
+ Flags = StateExact;
+ } else {
+ if (Opcode == AMDGPU::SI_PS_LIVE) {
+ LiveMaskQueries.push_back(&MI); // Lowered later by lowerLiveMaskQueries.
+ } else if (WQMOutputs) {
+ // The function is in machine SSA form, which means that physical
+ // VGPRs correspond to shader inputs and outputs. Inputs are
+ // only used, outputs are only defined.
+ for (const MachineOperand &MO : MI.defs()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+ if (!TRI->isVirtualRegister(Reg) &&
+ TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
+ Flags = StateWQM; // A definition of a physical VGPR is treated as a shader output.
+ break;
+ }
+ }
+ }
+
+ if (!Flags) // Nothing to seed for this instruction.
+ continue;
+ }
+
+ markInstruction(MI, Flags, Worklist);
+ GlobalFlags |= Flags;
+ }
+ }
+
+ return GlobalFlags;
+}
+
+void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
+ std::vector<WorkItem>& Worklist) {
+ MachineBasicBlock *MBB = MI.getParent();
+ InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
+ BlockInfo &BI = Blocks[MBB];
+
+ // Control flow-type instructions and stores to temporary memory that are
+ // followed by WQM computations must themselves be in WQM.
+ if ((II.OutNeeds & StateWQM) && !II.Needs &&
+ (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
+ Instructions[&MI].Needs = StateWQM; // Update the stored entry ...
+ II.Needs = StateWQM; // ... and the local copy used below.
+ }
+
+ // Propagate to block level
+ BI.Needs |= II.Needs;
+ if ((BI.InNeeds | II.Needs) != BI.InNeeds) { // Requeue the block only if new bits were added.
+ BI.InNeeds |= II.Needs;
+ Worklist.push_back(MBB);
+ }
+
+ // Propagate backwards within block
+ if (MachineInstr *PrevMI = MI.getPrevNode()) {
+ char InNeeds = II.Needs | II.OutNeeds;
+ if (!PrevMI->isPHI()) { // PHIs do not receive OutNeeds.
+ InstrInfo &PrevII = Instructions[PrevMI];
+ if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { // Requeue only on change, so the worklist terminates.
+ PrevII.OutNeeds |= InNeeds;
+ Worklist.push_back(PrevMI);
+ }
+ }
+ }
+
+ // Propagate WQM flag to instruction inputs
+ assert(II.Needs != (StateWQM | StateExact));
+
+ if (II.Needs == StateWQM)
+ markUsesWQM(MI, Worklist);
+}
+
+void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
+ std::vector<WorkItem>& Worklist) {
+ BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
+
+ // Propagate through instructions: the block's OutNeeds become OutNeeds of its last instruction.
+ if (!MBB.empty()) {
+ MachineInstr *LastMI = &*MBB.rbegin();
+ InstrInfo &LastII = Instructions[LastMI];
+ if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { // Requeue only when new bits are added.
+ LastII.OutNeeds |= BI.OutNeeds;
+ Worklist.push_back(LastMI);
+ }
+ }
+
+ // Predecessor blocks must provide for our WQM/Exact needs.
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ BlockInfo &PredBI = Blocks[Pred];
+ if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds) // Already covered; nothing to do.
+ continue;
+
+ PredBI.OutNeeds |= BI.InNeeds;
+ PredBI.InNeeds |= BI.InNeeds; // The predecessor's entry needs are widened as well.
+ Worklist.push_back(Pred);
+ }
+
+ // All successors must be prepared to accept the same set of WQM/Exact data.
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ BlockInfo &SuccBI = Blocks[Succ];
+ if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds) // Already covered; nothing to do.
+ continue;
+
+ SuccBI.InNeeds |= BI.OutNeeds;
+ Worklist.push_back(Succ);
+ }
+}
+
+/// Seed the analysis with scanInstructions() and then drain the worklist,
+/// propagating needs through instructions and blocks until nothing is left
+/// to process. Returns the flags reported by scanInstructions().
+char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
+  std::vector<WorkItem> Pending;
+  const char SeededFlags = scanInstructions(MF, Pending);
+
+  while (!Pending.empty()) {
+    // Take the most recently queued item (LIFO order).
+    const WorkItem Item = Pending.back();
+    Pending.pop_back();
+
+    if (!Item.MI) {
+      propagateBlock(*Item.MBB, Pending);
+      continue;
+    }
+
+    propagateInstruction(*Item.MI, Pending);
+  }
+
+  return SeededFlags;
+}
+
+/// Decide whether \p MI actually has to execute in the exec state computed
+/// by the analysis.
+///
+/// Scalar instructions sometimes have to be marked WQM so that propagation
+/// works (e.g. thread masks leading up to branches), yet at execution time
+/// they do not care about EXEC.
+bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
+  // Terminators always require the computed state.
+  if (MI.isTerminator())
+    return true;
+
+  // Instructions on the scalar unit are not affected by EXEC.
+  if (TII->isScalarUnit(MI))
+    return false;
+
+  // Generic instructions such as COPY will either disappear by register
+  // coalescing or be lowered to SALU or VALU instructions. Only those need a
+  // closer look at their first explicit operand.
+  if (!MI.isTransient() || MI.getNumExplicitOperands() < 1)
+    return true;
+
+  // SGPR instructions are not affected by EXEC.
+  const MachineOperand &FirstOp = MI.getOperand(0);
+  return !(FirstOp.isReg() && TRI->isSGPRReg(*MRI, FirstOp.getReg()));
+}
+
+/// Insert a save/restore pair for SCC immediately before \p Before: SCC is
+/// copied into a fresh SReg_32_XM0 virtual register and then copied back.
+/// Both copies are registered with LiveIntervals and the live interval of the
+/// temporary is computed. The returned iterator points at the restoring copy,
+/// so anything inserted there lands between the save and the restore.
+MachineBasicBlock::iterator
+SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator Before) {
+  const unsigned TmpReg =
+      MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  // TmpReg = COPY SCC
+  MachineInstr *SaveMI =
+      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
+          .addReg(AMDGPU::SCC);
+  // SCC = COPY TmpReg
+  MachineInstr *RestoreMI =
+      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
+          .addReg(TmpReg);
+
+  LIS->InsertMachineInstrInMaps(*SaveMI);
+  LIS->InsertMachineInstrInMaps(*RestoreMI);
+  LIS->createAndComputeVirtRegInterval(TmpReg);
+
+  return RestoreMI;
+}
+
+// Return an iterator in the (inclusive) range [First, Last] at which
+// instructions can be safely inserted, keeping in mind that some of the
+// instructions we want to add necessarily clobber SCC.
+MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
+ MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
+ if (!SaveSCC) // SCC does not matter: just pick the preferred end of the range.
+ return PreferLast ? Last : First;
+
+ LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI)); // Live range of SCC's register unit.
+ auto MBBE = MBB.end();
+ SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
+ : LIS->getMBBEndIdx(&MBB);
+ SlotIndex LastIdx =
+ Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
+ SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
+ const LiveRange::Segment *S;
+
+ for (;;) { // Step away from the preferred end until SCC is not live or the range is exhausted.
+ S = LR.getSegmentContaining(Idx);
+ if (!S) // SCC is not live at Idx: safe to insert here.
+ break;
+
+ if (PreferLast) {
+ SlotIndex Next = S->start.getBaseIndex(); // Try the start of the live segment.
+ if (Next < FirstIdx)
+ break;
+ Idx = Next;
+ } else {
+ SlotIndex Next = S->end.getNextIndex().getBaseIndex(); // Try just past the end of the live segment.
+ if (Next > LastIdx)
+ break;
+ Idx = Next;
+ }
+ }
+
+ MachineBasicBlock::iterator MBBI;
+
+ if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
+ MBBI = MI;
+ else {
+ assert(Idx == LIS->getMBBEndIdx(&MBB));
+ MBBI = MBB.end();
+ }
+
+ if (S) // SCC is still live at the chosen point: wrap the insertion in a save/restore.
+ MBBI = saveSCC(MBB, MBBI);
+
+ return MBBI;
+}
+
+/// Insert, before \p Before, the instruction that restricts EXEC to
+/// \p LiveMaskReg. When \p SaveWQM is non-zero an S_AND_SAVEEXEC_B64 with
+/// \p SaveWQM as its destination is emitted; otherwise a plain S_AND_B64 of
+/// EXEC with \p LiveMaskReg is emitted. The new instruction is registered
+/// with LiveIntervals.
+void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator Before,
+                              unsigned SaveWQM, unsigned LiveMaskReg) {
+  if (SaveWQM) {
+    MachineInstr *AndSaveExec =
+        BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+                SaveWQM)
+            .addReg(LiveMaskReg);
+    LIS->InsertMachineInstrInMaps(*AndSaveExec);
+    return;
+  }
+
+  MachineInstr *AndExec =
+      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+              AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(LiveMaskReg);
+  LIS->InsertMachineInstrInMaps(*AndExec);
+}
+
+/// Insert, before \p Before, the instruction that switches EXEC back to WQM.
+/// When \p SavedWQM is non-zero EXEC is copied from that register; otherwise
+/// WQM is recomputed from the current EXEC with S_WQM_B64. The new
+/// instruction is registered with LiveIntervals.
+void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator Before,
+                            unsigned SavedWQM) {
+  if (SavedWQM) {
+    MachineInstr *RestoreExec =
+        BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+            .addReg(SavedWQM);
+    LIS->InsertMachineInstrInMaps(*RestoreExec);
+    return;
+  }
+
+  MachineInstr *RecomputeWQM =
+      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+              AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC);
+  LIS->InsertMachineInstrInMaps(*RecomputeWQM);
+}
+
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
+ bool isEntry) {
+ auto BII = Blocks.find(&MBB);
+ if (BII == Blocks.end()) // Block was never touched by the analysis.
+ return;
+
+ const BlockInfo &BI = BII->second;
+
+ if (!(BI.InNeeds & StateWQM)) // Blocks without WQM in their InNeeds are left untouched.
+ return;
+
+ // This is a non-entry block that is WQM throughout, so no need to do
+ // anything.
+ if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+ return;
+
+ DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
+ unsigned SavedWQMReg = 0; // Non-zero while a saved WQM exec mask is outstanding.
+ bool WQMFromExec = isEntry; // When true, WQM is recomputed from EXEC instead of being saved/restored.
+ char State = isEntry ? StateExact : StateWQM; // Current state while walking the block.
+
+ auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+ if (isEntry)
+ ++II; // Skip the instruction that saves LiveMask
+
+ MachineBasicBlock::iterator First = IE; // Start of the current run of don't-care instructions (IE = unset).
+ for (;;) {
+ MachineBasicBlock::iterator Next = II;
+ char Needs = 0;
+ char OutNeeds = 0;
+
+ if (First == IE)
+ First = II;
+
+ if (II != IE) {
+ MachineInstr &MI = *II;
+
+ if (requiresCorrectState(MI)) {
+ auto III = Instructions.find(&MI);
+ if (III != Instructions.end()) {
+ Needs = III->second.Needs;
+ OutNeeds = III->second.OutNeeds;
+ }
+ }
+
+ if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
+ Needs = StateExact;
+
+ if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
+ MI.getOperand(3).setImm(1); // NOTE(review): meaning of SI_ELSE operand 3 is defined outside this file - confirm.
+
+ ++Next;
+ } else {
+ // End of basic block
+ if (BI.OutNeeds & StateWQM)
+ Needs = StateWQM;
+ else if (BI.OutNeeds == StateExact)
+ Needs = StateExact;
+ }
+
+ if (Needs) {
+ if (Needs != State) { // A state switch is required somewhere in [First, II].
+ MachineBasicBlock::iterator Before =
+ prepareInsertion(MBB, First, II, Needs == StateWQM,
+ Needs == StateExact || WQMFromExec);
+
+ if (Needs == StateExact) {
+ if (!WQMFromExec && (OutNeeds & StateWQM)) // WQM is needed again later: save the current mask.
+ SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+ } else {
+ assert(WQMFromExec == (SavedWQMReg == 0));
+
+ toWQM(MBB, Before, SavedWQMReg);
+
+ if (SavedWQMReg) {
+ LIS->createAndComputeVirtRegInterval(SavedWQMReg); // Def and use now both exist.
+ SavedWQMReg = 0;
+ }
+ }
+
+ State = Needs;
+ }
+
+ First = IE; // Reset: the next don't-care run starts after this instruction.
+ }
+
+ if (II == IE)
+ break;
+ II = Next;
+ }
+}
+
+/// Replace every collected SI_PS_LIVE instruction with a COPY of
+/// \p LiveMaskReg into the query's destination register. The copy takes over
+/// the query's debug location and its slot in the LiveIntervals maps, and the
+/// query itself is erased.
+void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+  for (MachineInstr *Query : LiveMaskQueries) {
+    MachineBasicBlock &Parent = *Query->getParent();
+    const unsigned DestReg = Query->getOperand(0).getReg();
+    const DebugLoc &Loc = Query->getDebugLoc();
+
+    MachineInstr *NewCopy =
+        BuildMI(Parent, Query, Loc, TII->get(AMDGPU::COPY), DestReg)
+            .addReg(LiveMaskReg);
+
+    LIS->ReplaceMachineInstrInMaps(*Query, *NewCopy);
+    Query->eraseFromParent();
+  }
+}
+
+/// Pass entry point. Only functions with the AMDGPU_PS calling convention are
+/// processed; returns true when the function was modified.
+///
+/// After the analysis there are three cases:
+///  - nothing needs WQM: SI_PS_LIVE queries are lowered to copies of EXEC and
+///    nothing else changes;
+///  - only WQM is needed: a single S_WQM_B64 is emitted in the entry block;
+///  - both states are needed: every analyzed block is rewritten by
+///    processBlock().
+bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
+    return false;
+
+  // Drop state left over from a previously processed function.
+  Instructions.clear();
+  Blocks.clear();
+  LiveMaskQueries.clear();
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  LIS = &getAnalysis<LiveIntervals>();
+
+  char GlobalFlags = analyzeFunction(MF);
+  if (!(GlobalFlags & StateWQM)) {
+    lowerLiveMaskQueries(AMDGPU::EXEC);
+    return !LiveMaskQueries.empty();
+  }
+
+  // Store a copy of the original live mask when required
+  unsigned LiveMaskReg = 0;
+  {
+    MachineBasicBlock &Entry = MF.front();
+    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+
+    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
+      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+                                 TII->get(AMDGPU::COPY), LiveMaskReg)
+                             .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
+    }
+
+    if (GlobalFlags == StateWQM) {
+      // For a shader that needs only WQM, we can just set it once.
+      MachineInstr *WQMMI =
+          BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+                  AMDGPU::EXEC)
+              .addReg(AMDGPU::EXEC);
+      // Register the new instruction with LiveIntervals, as is done for
+      // every other instruction this pass inserts.
+      LIS->InsertMachineInstrInMaps(*WQMMI);
+
+      lowerLiveMaskQueries(LiveMaskReg);
+      // EntryMI may become invalid here
+      return true;
+    }
+  }
+
+  DEBUG(printInfo());
+
+  lowerLiveMaskQueries(LiveMaskReg);
+
+  // Handle the general case. Walk the function in layout order instead of
+  // iterating the pointer-keyed Blocks map: processBlock() creates virtual
+  // registers, so the visiting order must not depend on block addresses.
+  // Blocks without an entry in Blocks are skipped by processBlock() itself.
+  for (MachineBasicBlock &MBB : MF)
+    processBlock(MBB, LiveMaskReg, &MBB == &*MF.begin());
+
+  // Physical registers like SCC aren't tracked by default anyway, so just
+  // removing the ranges we computed is the simplest option for maintaining
+  // the analysis results.
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
new file mode 100644
index 000000000000..02656483cd74
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -0,0 +1,535 @@
+//===---- SMInstructions.td - Scalar Memory Instruction Definitions -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def smrd_offset_8 : NamedOperandU32<"SMRDOffset8",
+ NamedMatchClass<"SMRDOffset8">> {
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def smrd_offset_20 : NamedOperandU32<"SMRDOffset20",
+ NamedMatchClass<"SMRDOffset20">> {
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar Memory classes
+//===----------------------------------------------------------------------===//
+
+class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : // Base class of the scalar-memory pseudo instructions in this file.
+ InstSI <outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+
+ let LGKM_CNT = 1;
+ let SMRD = 1;
+ let mayStore = 0; // Defaults describe a load; store and invalidate subclasses override these.
+ let mayLoad = 1;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteSMEM];
+ let SubtargetPredicate = isGCN;
+
+ string Mnemonic = opName; // Read by SM_Real to build the assembly string.
+ string AsmOperands = asmOps; // Read by SM_Real to build the assembly string.
+
+ bits<1> has_sbase = 1; // Whether an sbase operand exists; tested by the real encodings.
+ bits<1> has_sdst = 1; // Whether an sdst operand exists; tested by the real encodings.
+ bit has_glc = 0; // Whether a glc operand exists; tested by the VI encoding.
+ bits<1> has_offset = 1; // Whether an offset operand exists; tested by the real encodings.
+ bits<1> offset_is_imm = 0; // Set by the _IMM variants; feeds the imm encoding bit in SM_Real.
+}
+
+class SM_Real <SM_Pseudo ps> // Common part of the encoded instruction derived from pseudo `ps`.
+ : InstSI<ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+
+ // encoding fields shared by the SI and VI formats
+ bits<7> sbase;
+ bits<7> sdst;
+ bits<32> offset;
+ bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0); // 0 when the pseudo has no offset at all.
+}
+
+class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> // Scalar-memory load pseudo.
+ : SM_Pseudo<opName, outs, ins, asmOps, pattern> {
+ RegisterClass BaseClass; // Register class of the sbase operand; read by the SM_Real_Loads_* multiclasses.
+ let mayLoad = 1;
+ let mayStore = 0;
+ let has_glc = 1;
+}
+
+class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern = []> // Scalar-memory store pseudo; has no outputs.
+ : SM_Pseudo<opName, (outs), ins, asmOps, pattern> {
+ RegisterClass BaseClass; // Register class of the sbase operand.
+ RegisterClass SrcClass; // Register class of the sdata operand.
+ let mayLoad = 0;
+ let mayStore = 1;
+ let has_glc = 1;
+ let ScalarStore = 1;
+}
+
+multiclass SM_Pseudo_Loads<string opName,
+ RegisterClass baseClass,
+ RegisterClass dstClass> {
+ def _IMM : SM_Load_Pseudo <opName,
+ (outs dstClass:$sdst),
+ (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc),
+ " $sdst, $sbase, $offset$glc", []> {
+ let offset_is_imm = 1;
+ let BaseClass = baseClass;
+ let PseudoInstr = opName # "_IMM";
+ let has_glc = 1;
+ }
+
+ def _SGPR : SM_Load_Pseudo <opName,
+ (outs dstClass:$sdst),
+ (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
+ " $sdst, $sbase, $offset$glc", []> {
+ let BaseClass = baseClass;
+ let PseudoInstr = opName # "_SGPR";
+ let has_glc = 1;
+ }
+}
+
+multiclass SM_Pseudo_Stores<string opName,
+ RegisterClass baseClass,
+ RegisterClass srcClass> {
+ def _IMM : SM_Store_Pseudo <opName,
+ (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc),
+ " $sdata, $sbase, $offset$glc", []> {
+ let offset_is_imm = 1;
+ let BaseClass = baseClass;
+ let SrcClass = srcClass;
+ let PseudoInstr = opName # "_IMM";
+ }
+
+ def _SGPR : SM_Store_Pseudo <opName,
+ (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
+ " $sdata, $sbase, $offset$glc", []> {
+ let BaseClass = baseClass;
+ let SrcClass = srcClass;
+ let PseudoInstr = opName # "_SGPR";
+ }
+}
+
+class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
+ opName, (outs SReg_64_XEXEC:$sdst), (ins),
+ " $sdst", [(set i64:$sdst, (node))]> {
+ let hasSideEffects = 1;
+ // FIXME: mayStore = ? is a workaround for tablegen bug for different
+ // inferred mayStore flags for the instruction pattern vs. standalone
+ // Pat. Each considers the other contradictory.
+ let mayStore = ?;
+ let mayLoad = ?;
+ let has_sbase = 0;
+ let has_offset = 0;
+}
+
+class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo< // Operand-less pseudo selected directly from intrinsic `node`.
+ opName, (outs), (ins), "", [(node)]> {
+ let hasSideEffects = 1;
+ let mayStore = 1;
+ let has_sdst = 0; // No operands at all: the sdst, sbase and offset encoding fields are left unset.
+ let has_sbase = 0;
+ let has_offset = 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Scalar Memory Instructions
+//===----------------------------------------------------------------------===//
+
+// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SReg_32_XM0 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+
+// XXX - SMEM instructions do not allow exec for data operand, but
+// does sdst for SMRD on SI/CI?
+defm S_LOAD_DWORD : SM_Pseudo_Loads <"s_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_load_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>;
+
+defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <
+ "s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC
+>;
+
+// FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on
+// SI/CI, but disallowed for SMEM on VI.
+defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <
+ "s_buffer_load_dwordx2", SReg_128, SReg_64_XEXEC
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <
+ "s_buffer_load_dwordx4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <
+ "s_buffer_load_dwordx8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <
+ "s_buffer_load_dwordx16", SReg_128, SReg_512
+>;
+
+defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>;
+
+defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <
+ "s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC
+>;
+
+defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <
+ "s_buffer_store_dwordx2", SReg_128, SReg_64_XEXEC
+>;
+
+defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
+ "s_buffer_store_dwordx4", SReg_128, SReg_128
+>;
+
+
+def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>;
+def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>;
+
+let SubtargetPredicate = isCIVI in {
+def S_DCACHE_INV_VOL : SM_Inval_Pseudo <"s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>;
+} // let SubtargetPredicate = isCIVI
+
+let SubtargetPredicate = isVI in {
+def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
+def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
+} // SubtargetPredicate = isVI
+
+
+
+//===----------------------------------------------------------------------===//
+// Scalar Memory Patterns
+//===----------------------------------------------------------------------===//
+
+
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+ auto Ld = cast<LoadSDNode>(N);
+ return Ld->getAlignment() >= 4 &&
+ ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+ (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
+}]>;
+
+def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
+def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
+def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
+def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
+def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
+def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
+
+let Predicates = [isGCN] in {
+
+multiclass SMRD_Pattern <string Instr, ValueType vt> {
+
+ // 1. IMM offset
+ def : Pat <
+ (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
+ >;
+
+ // 2. SGPR offset
+ def : Pat <
+ (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
+ >;
+}
+
+let Predicates = [isSICI] in {
+def : Pat <
+ (i64 (readcyclecounter)),
+ (S_MEMTIME)
+>;
+}
+
+// Global and constant loads can be selected to either MUBUF or SMRD
+// instructions, but SMRD instructions are faster so we want the instruction
+// selector to prefer those.
+let AddedComplexity = 100 in {
+
+defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
+
+// 1. Offset as an immediate
+def SM_LOAD_PATTERN : Pat < // name this pattern to reuse AddedComplexity on CI
+ (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
+>;
+
+// 2. Offset loaded in a 32-bit SGPR
+def : Pat <
+ (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
+>;
+
+} // End let AddedComplexity = 100
+
+} // let Predicates = [isGCN]
+
+let Predicates = [isVI] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+ (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset), 0)
+>;
+
+def : Pat <
+ (i64 (readcyclecounter)),
+ (S_MEMREALTIME)
+>;
+
+} // let Predicates = [isVI]
+
+
+//===----------------------------------------------------------------------===//
+// Targets
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+class SMRD_Real_si <bits<5> op, SM_Pseudo ps> // 32-bit SMRD encoding of pseudo `ps` for SI/CI.
+ : SM_Real<ps>
+ , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI>
+ , Enc32 {
+
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+
+ let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); // Only the low 8 bits of offset are encoded.
+ let Inst{8} = imm;
+ let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?); // Bit 0 of sbase is not encoded.
+ let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?);
+ let Inst{26-22} = op;
+ let Inst{31-27} = 0x18; //encoding
+}
+
+// FIXME: Assembler should reject trying to use glc on SMRD
+// instructions on SI.
+multiclass SM_Real_Loads_si<bits<5> op, string ps, // defines <name>_IMM_si and <name>_SGPR_si sharing opcode `op`
+ SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), // default: the pseudo named ps#_IMM
+ SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { // default: the pseudo named ps#_SGPR
+
+ def _IMM_si : SMRD_Real_si <op, immPs> {
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc); // offset operand type is smrd_offset_8
+ }
+
+ // FIXME: The operand name $offset is inconsistent with $soff used
+ // in the pseudo
+ def _SGPR_si : SMRD_Real_si <op, sgprPs> {
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); // offset comes from an SReg_32 register
+ }
+
+}
+
+defm S_LOAD_DWORD : SM_Real_Loads_si <0x00, "S_LOAD_DWORD">; // args: 5-bit opcode, base name of the pseudo pair
+defm S_LOAD_DWORDX2 : SM_Real_Loads_si <0x01, "S_LOAD_DWORDX2">;
+defm S_LOAD_DWORDX4 : SM_Real_Loads_si <0x02, "S_LOAD_DWORDX4">;
+defm S_LOAD_DWORDX8 : SM_Real_Loads_si <0x03, "S_LOAD_DWORDX8">;
+defm S_LOAD_DWORDX16 : SM_Real_Loads_si <0x04, "S_LOAD_DWORDX16">;
+defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_si <0x08, "S_BUFFER_LOAD_DWORD">; // buffer loads occupy opcodes 0x08-0x0c
+defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_si <0x09, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_si <0x0a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_si <0x0b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_si <0x0c, "S_BUFFER_LOAD_DWORDX16">;
+
+def S_MEMTIME_si : SMRD_Real_si <0x1e, S_MEMTIME>; // single records built directly from the class (no _IMM/_SGPR pair)
+def S_DCACHE_INV_si : SMRD_Real_si <0x1f, S_DCACHE_INV>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class SMEM_Real_vi <bits<8> op, SM_Pseudo ps> // 64-bit real encoding for pseudo `ps` with 8-bit opcode `op`
+ : SM_Real<ps>
+ , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI>
+ , Enc64 {
+ bit glc;
+
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+
+ let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); // bit 0 of sbase is not encoded
+ let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
+
+ let Inst{16} = !if(ps.has_glc, glc, ?); // bits 15-13 are not assigned in this class
+ let Inst{17} = imm; // `imm` is not declared in this class; presumably inherited from SM_Real - confirm
+ let Inst{25-18} = op;
+ let Inst{31-26} = 0x30; // encoding: fixed constant in bits 31-26
+ let Inst{51-32} = !if(ps.has_offset, offset{19-0}, ?); // 20-bit offset in the second dword; bits 63-52 not assigned here
+}
+
+multiclass SM_Real_Loads_vi<bits<8> op, string ps, // defines <name>_IMM_vi and <name>_SGPR_vi sharing opcode `op`
+ SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), // default: the pseudo named ps#_IMM
+ SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { // default: the pseudo named ps#_SGPR
+ def _IMM_vi : SMEM_Real_vi <op, immPs> {
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc); // offset operand type is smrd_offset_20
+ }
+ def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); // offset comes from an SReg_32 register
+ }
+}
+
+class SMEM_Real_Store_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> { // store variant: Inst{12-6} carries sdata instead of sdst
+ // encoding
+ bits<7> sdata;
+
+ let sdst = ?; // sdst is left unset for stores
+ let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); // NOTE(review): still gated on ps.has_sdst although the field holds sdata
+}
+
+multiclass SM_Real_Stores_vi<bits<8> op, string ps, // defines <name>_IMM_vi and <name>_SGPR_vi store records sharing opcode `op`
+ SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM), // default: the pseudo named ps#_IMM
+ SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> { // default: the pseudo named ps#_SGPR
+ // FIXME: The operand name $offset is inconsistent with $soff used
+ // in the pseudo
+ def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc); // $sdata is listed first
+ }
+
+ def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); // offset comes from an SReg_32 register
+ }
+}
+
+defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">; // args: 8-bit opcode, base name of the pseudo pair
+defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">;
+defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">;
+defm S_LOAD_DWORDX8 : SM_Real_Loads_vi <0x03, "S_LOAD_DWORDX8">;
+defm S_LOAD_DWORDX16 : SM_Real_Loads_vi <0x04, "S_LOAD_DWORDX16">;
+defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_vi <0x08, "S_BUFFER_LOAD_DWORD">; // buffer loads occupy opcodes 0x08-0x0c
+defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_vi <0x09, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_vi <0x0a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_vi <0x0b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_vi <0x0c, "S_BUFFER_LOAD_DWORDX16">;
+
+defm S_STORE_DWORD : SM_Real_Stores_vi <0x10, "S_STORE_DWORD">; // stores occupy opcodes 0x10-0x12
+defm S_STORE_DWORDX2 : SM_Real_Stores_vi <0x11, "S_STORE_DWORDX2">;
+defm S_STORE_DWORDX4 : SM_Real_Stores_vi <0x12, "S_STORE_DWORDX4">;
+
+defm S_BUFFER_STORE_DWORD : SM_Real_Stores_vi <0x18, "S_BUFFER_STORE_DWORD">; // buffer stores occupy opcodes 0x18-0x1a
+defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_vi <0x19, "S_BUFFER_STORE_DWORDX2">;
+defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_vi <0x1a, "S_BUFFER_STORE_DWORDX4">;
+
+// These instructions use the same encoding class (SMEM_Real_vi) directly
+def S_DCACHE_INV_vi : SMEM_Real_vi <0x20, S_DCACHE_INV>;
+def S_DCACHE_WB_vi : SMEM_Real_vi <0x21, S_DCACHE_WB>;
+def S_DCACHE_INV_VOL_vi : SMEM_Real_vi <0x22, S_DCACHE_INV_VOL>;
+def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>;
+def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>;
+def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>;
+
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset", // operand used for the 32-bit literal offset of the CI-only loads
+ NamedMatchClass<"SMRDLiteralOffset">> {
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> : // CI-only 64-bit load form carrying a 32-bit literal offset
+ SM_Real<ps>,
+ Enc64 {
+
+ let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
+ let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc);
+
+ let LGKM_CNT = ps.LGKM_CNT; // the following flags are copied one by one from the pseudo
+ let SMRD = ps.SMRD;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let hasSideEffects = ps.hasSideEffects;
+ let SchedRW = ps.SchedRW;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+
+ let Inst{7-0} = 0xff; // the 8-bit offset field is fixed to 0xff in this form
+ let Inst{8} = 0; // bit 8 is fixed to 0 in this form
+ let Inst{14-9} = sbase{6-1}; // bit 0 of sbase is not encoded
+ let Inst{21-15} = sdst{6-0};
+ let Inst{26-22} = op;
+ let Inst{31-27} = 0x18; // encoding: same fixed constant as the 32-bit SMRD form
+ let Inst{63-32} = offset{31-0}; // full 32-bit offset occupies the second dword
+}
+
+def S_LOAD_DWORD_IMM_ci : SMRD_Real_Load_IMM_ci <0x00, S_LOAD_DWORD_IMM>; // args: 5-bit opcode, the _IMM pseudo being encoded
+def S_LOAD_DWORDX2_IMM_ci : SMRD_Real_Load_IMM_ci <0x01, S_LOAD_DWORDX2_IMM>;
+def S_LOAD_DWORDX4_IMM_ci : SMRD_Real_Load_IMM_ci <0x02, S_LOAD_DWORDX4_IMM>;
+def S_LOAD_DWORDX8_IMM_ci : SMRD_Real_Load_IMM_ci <0x03, S_LOAD_DWORDX8_IMM>;
+def S_LOAD_DWORDX16_IMM_ci : SMRD_Real_Load_IMM_ci <0x04, S_LOAD_DWORDX16_IMM>;
+def S_BUFFER_LOAD_DWORD_IMM_ci : SMRD_Real_Load_IMM_ci <0x08, S_BUFFER_LOAD_DWORD_IMM>; // opcodes match the SI defm list
+def S_BUFFER_LOAD_DWORDX2_IMM_ci : SMRD_Real_Load_IMM_ci <0x09, S_BUFFER_LOAD_DWORDX2_IMM>;
+def S_BUFFER_LOAD_DWORDX4_IMM_ci : SMRD_Real_Load_IMM_ci <0x0a, S_BUFFER_LOAD_DWORDX4_IMM>;
+def S_BUFFER_LOAD_DWORDX8_IMM_ci : SMRD_Real_Load_IMM_ci <0x0b, S_BUFFER_LOAD_DWORDX8_IMM>;
+def S_BUFFER_LOAD_DWORDX16_IMM_ci : SMRD_Real_Load_IMM_ci <0x0c, S_BUFFER_LOAD_DWORDX16_IMM>;
+
+class SMRD_Real_ci <bits<5> op, SM_Pseudo ps> // 32-bit real encoding restricted to isCIOnly; same bit layout as SMRD_Real_si
+ : SM_Real<ps>
+ , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> // NOTE(review): encoding family is SI although the predicate is isCIOnly - confirm intended
+ , Enc32 {
+
+ let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
+
+ let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); // low 8 bits of offset; left unset (?) when the pseudo has no offset
+ let Inst{8} = imm; // `imm` is not declared in this class; presumably inherited from SM_Real - confirm
+ let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?); // bit 0 of sbase is not encoded
+ let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?);
+ let Inst{26-22} = op;
+ let Inst{31-27} = 0x18; // encoding: fixed constant in the top five bits
+}
+
+def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>; // the only record in view built from SMRD_Real_ci
+
+let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in { // reuse the complexity value recorded on the named pattern
+
+class SMRD_Pattern_ci <string Instr, ValueType vt> : Pat < // smrd_load with an SMRDImm32 address selects the <Instr>_IMM_ci record
+ (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> { // trailing 0: presumably the glc operand - confirm
+ let Predicates = [isCIOnly];
+}
+
+def : SMRD_Pattern_ci <"S_LOAD_DWORD", i32>; // one pattern per load width, same name/type pairing as SMRD_Pattern
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX2", v2i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX4", v4i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
+
+def : Pat < // buffer-load counterpart using the SMRDBufferImm32 offset matcher
+ (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
+ let Predicates = [isCI]; // should this be isCIOnly? (the target record's AssemblerPredicates is isCIOnly)
+}
+
+} // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
new file mode 100644
index 000000000000..0aeb1297d3a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -0,0 +1,1229 @@
+//===-- SOPInstructions.td - SOP Instruction Definitions ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def GPRIdxModeMatchClass : AsmOperandClass { // assembler operand class referenced by the GPRIdxMode operand
+ let Name = "GPRIdxMode";
+ let PredicateMethod = "isGPRIdxMode"; // names of the parser-side methods; their bodies are outside this file
+ let RenderMethod = "addImmOperands";
+}
+
+def GPRIdxMode : Operand<i32> { // i32 immediate operand used by the s_set_gpr_idx_* instructions in this file
+ let PrintMethod = "printVGPRIndexMode";
+ let ParserMatchClass = GPRIdxModeMatchClass;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+//===----------------------------------------------------------------------===//
+// SOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+class SOP1_Pseudo <string opName, dag outs, dag ins, // encoding-independent SOP1 record; SOP1_Real reads its fields
+ string asmOps, list<dag> pattern=[]> :
+ InstSI <outs, ins, "", pattern>, // asm string is empty here; mnemonic/operands are kept in the fields below
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let SubtargetPredicate = isGCN;
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOP1 = 1;
+ let SchedRW = [WriteSALU];
+ let Size = 4;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName; // joined as Mnemonic # " " # AsmOperands by SOP1_Real
+ string AsmOperands = asmOps;
+
+ bits<1> has_src0 = 1; // subclasses clear these to leave the matching encoding field unset
+ bits<1> has_sdst = 1;
+}
+
+class SOP1_Real<bits<8> op, SOP1_Pseudo ps> : // 32-bit SOP1 encoding for pseudo `ps` with 8-bit opcode `op`
+ InstSI <ps.OutOperandList, ps.InOperandList,
+ ps.Mnemonic # " " # ps.AsmOperands, []>, // asm string rebuilt from the pseudo; no selection pattern on the real
+ Enc32 {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+ let Size = 4;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+
+ // encoding
+ bits<7> sdst;
+ bits<8> src0;
+
+ let Inst{7-0} = !if(ps.has_src0, src0, ?); // left unset (?) when the pseudo has no src0
+ let Inst{15-8} = op;
+ let Inst{22-16} = !if(ps.has_sdst, sdst, ?); // left unset (?) when the pseudo has no sdst
+ let Inst{31-23} = 0x17d; // encoding: fixed constant in the top nine bits
+}
+
+class SOP1_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < // 32-bit input, 32-bit output.
+ opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0),
+ "$sdst, $src0", pattern
+>;
+
+// 32-bit input, no output (the sdst encoding field is left unset).
+class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
+ opName, (outs), (ins SSrc_b32:$src0),
+ "$src0", pattern> {
+ let has_sdst = 0;
+}
+
+class SOP1_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < // 64-bit input, 64-bit output.
+ opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0),
+ "$sdst, $src0", pattern
+>;
+
+// 64-bit input (SSrc_b64), 32-bit output (SReg_32).
+class SOP1_32_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs SReg_32:$sdst), (ins SSrc_b64:$src0),
+ "$sdst, $src0", pattern
+>;
+
+// 32-bit input (SSrc_b32), 64-bit output (SReg_64).
+class SOP1_64_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0),
+ "$sdst, $src0", pattern
+>;
+
+// No input, 64-bit output (the src0 encoding field is left unset).
+class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs SReg_64:$sdst), (ins), "$sdst", pattern> {
+ let has_src0 = 0;
+}
+
+// 64-bit input, no output; the input is SReg_64 here, not SSrc_b64 as in the other SOP1 classes.
+class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins SReg_64:$src0), "$src0", pattern> {
+ let has_sdst = 0;
+}
+
+
+let isMoveImm = 1 in { // all four move records below carry isMoveImm
+ let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+ def S_MOV_B32 : SOP1_32 <"s_mov_b32">;
+ def S_MOV_B64 : SOP1_64 <"s_mov_b64">;
+ } // End isReMaterializable = 1, isAsCheapAsAMove = 1
+
+ let Uses = [SCC] in { // the conditional moves read SCC
+ def S_CMOV_B32 : SOP1_32 <"s_cmov_b32">;
+ def S_CMOV_B64 : SOP1_64 <"s_cmov_b64">;
+ } // End Uses = [SCC]
+} // End isMoveImm = 1
+
+let Defs = [SCC] in { // every record in this group is marked as writing SCC
+ def S_NOT_B32 : SOP1_32 <"s_not_b32",
+ [(set i32:$sdst, (not i32:$src0))] // selected for the 32-bit `not` node
+ >;
+
+ def S_NOT_B64 : SOP1_64 <"s_not_b64",
+ [(set i64:$sdst, (not i64:$src0))] // selected for the 64-bit `not` node
+ >;
+ def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; // no selection pattern attached
+ def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
+} // End Defs = [SCC]
+
+
+def S_BREV_B32 : SOP1_32 <"s_brev_b32",
+ [(set i32:$sdst, (bitreverse i32:$src0))] // selected for the 32-bit bitreverse node
+>;
+def S_BREV_B64 : SOP1_64 <"s_brev_b64">; // 64-bit form has no selection pattern here
+
+let Defs = [SCC] in { // the bit-count records are marked as writing SCC
+def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
+def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
+ [(set i32:$sdst, (ctpop i32:$src0))] // only this one has a pattern: the 32-bit ctpop node
+>;
+def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">;
+} // End Defs = [SCC]
+
+def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; // records without a pattern list are not selected from this file's patterns
+def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
+def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
+ [(set i32:$sdst, (cttz_zero_undef i32:$src0))] // selected for 32-bit cttz_zero_undef
+>;
+def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">;
+
+def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
+ [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] // selected for the AMDGPUffbh_u32 node
+>;
+
+def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64">;
+def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
+ [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))] // selected for the AMDGPUffbh_i32 node
+>;
+def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
+def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
+ [(set i32:$sdst, (sext_inreg i32:$src0, i8))] // selected for sext_inreg from i8
+>;
+def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
+ [(set i32:$sdst, (sext_inreg i32:$src0, i16))] // selected for sext_inreg from i16
+>;
+
+def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">;
+def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">; // 64-bit destination, 32-bit source
+def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
+def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
+def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; // no source operand, 64-bit destination
+
+let isTerminator = 1, isBarrier = 1,
+ isBranch = 1, isIndirectBranch = 1 in { // only S_SETPC_B64 receives these branch/terminator flags
+def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
+}
+def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">; // defined outside the `let`, so none of the flags above apply
+def S_RFE_B64 : SOP1_1 <"s_rfe_b64">;
+
+let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { // each *_saveexec record reads EXEC and writes EXEC and SCC
+
+def S_AND_SAVEEXEC_B64 : SOP1_64 <"s_and_saveexec_b64">; // all eight are 64-bit in / 64-bit out with no pattern
+def S_OR_SAVEEXEC_B64 : SOP1_64 <"s_or_saveexec_b64">;
+def S_XOR_SAVEEXEC_B64 : SOP1_64 <"s_xor_saveexec_b64">;
+def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <"s_andn2_saveexec_b64">;
+def S_ORN2_SAVEEXEC_B64 : SOP1_64 <"s_orn2_saveexec_b64">;
+def S_NAND_SAVEEXEC_B64 : SOP1_64 <"s_nand_saveexec_b64">;
+def S_NOR_SAVEEXEC_B64 : SOP1_64 <"s_nor_saveexec_b64">;
+def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">;
+
+} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
+
+def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32">; // no pattern and no extra Defs/Uses set at this site
+def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">;
+
+let Uses = [M0] in { // the relative moves are marked as reading M0
+def S_MOVRELS_B32 : SOP1_32 <"s_movrels_b32">;
+def S_MOVRELS_B64 : SOP1_64 <"s_movrels_b64">;
+def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
+def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
+} // End Uses = [M0]
+
+def S_CBRANCH_JOIN : SOP1_1 <"s_cbranch_join">; // 64-bit source, no destination; no branch flags are set at this site
+def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">;
+let Defs = [SCC] in { // only S_ABS_I32 is marked as writing SCC
+def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
+} // End Defs = [SCC]
+def S_MOV_FED_B32 : SOP1_32 <"s_mov_fed_b32">;
+
+let SubtargetPredicate = HasVGPRIndexMode in { // replaces the isGCN predicate set by SOP1_Pseudo
+def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
+ let Uses = [M0]; // M0 is listed as both read and written
+ let Defs = [M0];
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+class SOP2_Pseudo<string opName, dag outs, dag ins, // encoding-independent SOP2 record; SOP2_Real reads its fields
+ string asmOps, list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>, // asm string is empty here; mnemonic/operands are kept in the fields below
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let SubtargetPredicate = isGCN;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOP2 = 1;
+ let SchedRW = [WriteSALU];
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName; // joined as Mnemonic # " " # AsmOperands by SOP2_Real
+ string AsmOperands = asmOps;
+
+ bits<1> has_sdst = 1; // cleared by records without a destination (e.g. S_CBRANCH_G_FORK)
+
+ // Pseudo instructions have no encodings, but adding this field here allows
+ // us to do:
+ // let sdst = xxx in {
+ // for multiclasses that include both real and pseudo instructions.
+ // field bits<7> sdst = 0;
+ // let Size = 4; // Do we need size here?
+}
+
+class SOP2_Real<bits<7> op, SOP2_Pseudo ps> : // 32-bit SOP2 encoding for pseudo `ps` with 7-bit opcode `op`
+ InstSI <ps.OutOperandList, ps.InOperandList,
+ ps.Mnemonic # " " # ps.AsmOperands, []>, // asm string rebuilt from the pseudo; no selection pattern on the real
+ Enc32 {
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+
+ // encoding
+ bits<7> sdst;
+ bits<8> src0;
+ bits<8> src1;
+
+ let Inst{7-0} = src0; // both sources are always encoded (no has_src* switches, unlike SOP1)
+ let Inst{15-8} = src1;
+ let Inst{22-16} = !if(ps.has_sdst, sdst, ?); // left unset (?) when the pseudo has no sdst
+ let Inst{29-23} = op;
+ let Inst{31-30} = 0x2; // encoding: fixed constant in the top two bits
+}
+
+
+class SOP2_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < // two 32-bit inputs, 32-bit output
+ opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
+
+class SOP2_64 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < // two 64-bit inputs, 64-bit output
+ opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
+
+class SOP2_64_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < // 64-bit src0, 32-bit src1, 64-bit output
+ opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b32:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
+
+class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < // two 32-bit inputs, 64-bit output
+ opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
+
+let Defs = [SCC] in { // Carry out goes to SCC
+let isCommutable = 1 in {
+def S_ADD_U32 : SOP2_32 <"s_add_u32">; // unsigned form has no pattern; `add` is attached to the _I32 form below
+def S_ADD_I32 : SOP2_32 <"s_add_i32",
+ [(set i32:$sdst, (add SSrc_b32:$src0, SSrc_b32:$src1))]
+>;
+} // End isCommutable = 1
+
+def S_SUB_U32 : SOP2_32 <"s_sub_u32">; // subtraction is outside the isCommutable group
+def S_SUB_I32 : SOP2_32 <"s_sub_i32",
+ [(set i32:$sdst, (sub SSrc_b32:$src0, SSrc_b32:$src1))]
+>;
+
+let Uses = [SCC] in { // Carry in comes from SCC
+let isCommutable = 1 in {
+def S_ADDC_U32 : SOP2_32 <"s_addc_u32",
+ [(set i32:$sdst, (adde (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; // selected for the `adde` node
+} // End isCommutable = 1
+
+def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
+ [(set i32:$sdst, (sube (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; // selected for the `sube` node
+} // End Uses = [SCC]
+
+
+let isCommutable = 1 in { // min/max are commutable and, via the enclosing let, marked as writing SCC
+def S_MIN_I32 : SOP2_32 <"s_min_i32",
+ [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
+>;
+def S_MIN_U32 : SOP2_32 <"s_min_u32",
+ [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
+>;
+def S_MAX_I32 : SOP2_32 <"s_max_i32",
+ [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
+>;
+def S_MAX_U32 : SOP2_32 <"s_max_u32",
+ [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
+>;
+} // End isCommutable = 1
+} // End Defs = [SCC]
+
+
+let Uses = [SCC] in { // the conditional selects read SCC; no pattern is attached here
+ def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
+ def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
+} // End Uses = [SCC]
+
+let Defs = [SCC] in { // every logical op in this group is marked as writing SCC
+let isCommutable = 1 in { // and/or/xor carry selection patterns and are commutable
+def S_AND_B32 : SOP2_32 <"s_and_b32",
+ [(set i32:$sdst, (and i32:$src0, i32:$src1))]
+>;
+
+def S_AND_B64 : SOP2_64 <"s_and_b64",
+ [(set i64:$sdst, (and i64:$src0, i64:$src1))]
+>;
+
+def S_OR_B32 : SOP2_32 <"s_or_b32",
+ [(set i32:$sdst, (or i32:$src0, i32:$src1))]
+>;
+
+def S_OR_B64 : SOP2_64 <"s_or_b64",
+ [(set i64:$sdst, (or i64:$src0, i64:$src1))]
+>;
+
+def S_XOR_B32 : SOP2_32 <"s_xor_b32",
+ [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
+>;
+
+def S_XOR_B64 : SOP2_64 <"s_xor_b64",
+ [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
+>;
+} // End isCommutable = 1
+
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">; // the remaining ops have no pattern and are not marked commutable
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
+def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
+def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
+def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
+def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_XNOR_B32 : SOP2_32 <"s_xnor_b32">;
+def S_XNOR_B64 : SOP2_64 <"s_xnor_b64">;
+} // End Defs = [SCC]
+
+// Use added complexity so these patterns are preferred to the VALU patterns.
+let AddedComplexity = 1 in {
+
+let Defs = [SCC] in { // the shifts are marked as writing SCC
+def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
+ [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
+>;
+def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
+ [(set i64:$sdst, (shl i64:$src0, i32:$src1))] // 64-bit value, 32-bit shift amount
+>;
+def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
+ [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
+>;
+def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
+ [(set i64:$sdst, (srl i64:$src0, i32:$src1))]
+>;
+def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
+ [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
+>;
+def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
+ [(set i64:$sdst, (sra i64:$src0, i32:$src1))]
+>;
+} // End Defs = [SCC]
+
+def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
+ [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>; // outside the Defs = [SCC] group
+def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">; // two 32-bit sources, 64-bit result; no pattern
+def S_MUL_I32 : SOP2_32 <"s_mul_i32",
+ [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
+ let isCommutable = 1;
+}
+
+} // End AddedComplexity = 1
+
+let Defs = [SCC] in { // the bit-field extracts are marked as writing SCC; none has a pattern here
+def S_BFE_U32 : SOP2_32 <"s_bfe_u32">;
+def S_BFE_I32 : SOP2_32 <"s_bfe_i32">;
+def S_BFE_U64 : SOP2_64_32 <"s_bfe_u64">; // 64-bit src0 with a 32-bit src1
+def S_BFE_I64 : SOP2_64_32 <"s_bfe_i64">;
+} // End Defs = [SCC]
+
+def S_CBRANCH_G_FORK : SOP2_Pseudo < // built directly on SOP2_Pseudo: two SReg_64 sources and no destination
+ "s_cbranch_g_fork", (outs),
+ (ins SReg_64:$src0, SReg_64:$src1),
+ "$src0, $src1"
+> {
+ let has_sdst = 0; // leaves the sdst encoding field unset in SOP2_Real
+}
+
+let Defs = [SCC] in { // marked as writing SCC; no selection pattern attached
+def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
+} // End Defs = [SCC]
+
+
+//===----------------------------------------------------------------------===//
+// SOPK Instructions
+//===----------------------------------------------------------------------===//
+
+class SOPK_Pseudo <string opName, dag outs, dag ins, // encoding-independent SOPK record; SOPK_Real reads its fields
+ string asmOps, list<dag> pattern=[]> :
+ InstSI <outs, ins, "", pattern>, // asm string is empty here; mnemonic/operands are kept in the fields below
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let SubtargetPredicate = isGCN;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOPK = 1;
+ let SchedRW = [WriteSALU];
+ let UseNamedOperandTable = 1;
+ string Mnemonic = opName; // joined as Mnemonic # " " # AsmOperands by SOPK_Real
+ string AsmOperands = asmOps;
+
+ bits<1> has_sdst = 1; // cleared by records whose sdst encoding field must stay unset
+}
+
+class SOPK_Real<bits<5> op, SOPK_Pseudo ps> : // shared base of SOPK_Real32/64; `op` is not referenced in this class body
+ InstSI <ps.OutOperandList, ps.InOperandList,
+ ps.Mnemonic # " " # ps.AsmOperands, []> {
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let DisableEncoding = ps.DisableEncoding; // copied so tied-operand records keep their settings
+ let Constraints = ps.Constraints;
+
+ // encoding
+ bits<7> sdst;
+ bits<16> simm16;
+ bits<32> imm; // only placed into Inst by SOPK_Real64
+}
+
+class SOPK_Real32<bits<5> op, SOPK_Pseudo ps> : // 32-bit SOPK encoding
+ SOPK_Real <op, ps>,
+ Enc32 {
+ let Inst{15-0} = simm16;
+ let Inst{22-16} = !if(ps.has_sdst, sdst, ?); // left unset (?) when the pseudo has no sdst
+ let Inst{27-23} = op;
+ let Inst{31-28} = 0xb; // encoding: fixed constant in the top four bits
+}
+
+class SOPK_Real64<bits<5> op, SOPK_Pseudo ps> : // 64-bit SOPK encoding: SOPK_Real32 layout plus a 32-bit immediate
+ SOPK_Real<op, ps>,
+ Enc64 {
+ let Inst{15-0} = simm16;
+ let Inst{22-16} = !if(ps.has_sdst, sdst, ?); // left unset (?) when the pseudo has no sdst
+ let Inst{27-23} = op;
+ let Inst{31-28} = 0xb; // encoding: fixed constant in bits 31-28
+ let Inst{63-32} = imm; // second dword carries the 32-bit immediate
+}
+
+class SOPKInstTable <bit is_sopk, string cmpOp = ""> { // tags a record with a flag and a base compare-op name
+ bit IsSOPK = is_sopk; // 1 on SOPK_SCC records, 0 on SOPC_CMP_32 records
+ string BaseCmpOp = cmpOp; // presumably the key that pairs s_cmpk_* with s_cmp_* records - confirm against the table definition
+}
+
+class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo < // 16-bit immediate input, 32-bit register output
+ opName,
+ (outs SReg_32:$sdst),
+ (ins u16imm:$simm16),
+ "$sdst, $simm16",
+ pattern>;
+
+class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo < // register-vs-immediate form whose only listed def is SCC
+ opName,
+ (outs),
+ (ins SReg_32:$sdst, u16imm:$simm16), // $sdst is an input here even though it is named like a destination
+ "$sdst, $simm16", []>,
+ SOPKInstTable<1, base_op>{
+ let Defs = [SCC];
+}
+
+class SOPK_32TIE <string opName, list<dag> pattern=[]> : SOPK_Pseudo < // like SOPK_32 plus a $src0 input meant to be tied to $sdst by the user
+ opName,
+ (outs SReg_32:$sdst),
+ (ins SReg_32:$src0, u16imm:$simm16),
+ "$sdst, $simm16", // $src0 does not appear in the asm string
+ pattern
+>;
+
+let isReMaterializable = 1, isMoveImm = 1 in {
+def S_MOVK_I32 : SOPK_32 <"s_movk_i32">;
+} // End isReMaterializable = 1, isMoveImm = 1
+let Uses = [SCC] in { // the conditional form reads SCC and gets neither flag above
+def S_CMOVK_I32 : SOPK_32 <"s_cmovk_i32">;
+}
+
+let isCompare = 1 in {
+
+// The selection pattern for this instruction is disabled for now until we
+// can figure out how to teach the instruction selector to correctly use the
+// S_CMP* vs V_CMP* instructions.
+//
+// When the pattern is enabled the code generator sometimes produces this
+// invalid sequence:
+//
+// SCC = S_CMPK_EQ_I32 SGPR0, imm
+// VCC = COPY SCC
+// VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
+//
+// def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32",
+// [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
+// >;
+
+def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">; // second argument is the BaseCmpOp name of the matching s_cmp_* record
+def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">;
+def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">;
+def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">;
+def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">;
+def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">;
+
+let SOPKZext = 1 in { // only the unsigned compares set SOPKZext
+def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">;
+def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">;
+def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">;
+def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">;
+def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">;
+def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">;
+} // End SOPKZext = 1
+} // End isCompare = 1
+
+let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0",
+ Constraints = "$sdst = $src0" in { // ties the SOPK_32TIE input to the destination and keeps $src0 out of the encoding
+ def S_ADDK_I32 : SOPK_32TIE <"s_addk_i32">;
+ def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">;
+}
+
+def S_CBRANCH_I_FORK : SOPK_Pseudo < // built directly on SOPK_Pseudo: no outputs, 64-bit register plus 16-bit immediate
+ "s_cbranch_i_fork",
+ (outs), (ins SReg_64:$sdst, u16imm:$simm16), // $sdst is an input operand here
+ "$sdst, $simm16"
+>;
+
+let mayLoad = 1 in { // overrides the mayLoad = 0 default from SOPK_Pseudo
+def S_GETREG_B32 : SOPK_Pseudo <
+ "s_getreg_b32",
+ (outs SReg_32:$sdst), (ins hwreg:$simm16), // the immediate uses the hwreg operand type
+ "$sdst, $simm16"
+>;
+}
+
+let hasSideEffects = 1 in { // both setreg forms override the hasSideEffects = 0 default
+
+def S_SETREG_B32 : SOPK_Pseudo <
+ "s_setreg_b32",
+ (outs), (ins SReg_32:$sdst, hwreg:$simm16), // $sdst is the source register here; there are no outputs
+ "$simm16, $sdst", // asm order is immediate first, then register
+ [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
+>;
+
+// FIXME: Not on SI?
+//def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">;
+
+def S_SETREG_IMM32_B32 : SOPK_Pseudo <
+ "s_setreg_imm32_b32",
+ (outs), (ins i32imm:$imm, hwreg:$simm16), // value comes from a 32-bit immediate instead of a register
+ "$simm16, $imm"> {
+ let Size = 8; // Unlike every other SOPK instruction.
+ let has_sdst = 0; // no register operand, so the sdst encoding field stays unset
+}
+
+} // End hasSideEffects = 1
+
+//===----------------------------------------------------------------------===//
+// SOPC Instructions
+//===----------------------------------------------------------------------===//
+
+class SOPCe <bits<7> op> : Enc32 { // 32-bit SOPC encoding: two 8-bit sources, 7-bit opcode, no destination field
+ bits<8> src0;
+ bits<8> src1;
+
+ let Inst{7-0} = src0;
+ let Inst{15-8} = src1;
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x17e; // fixed constant in the top nine bits
+}
+
+class SOPC <bits<7> op, dag outs, dag ins, string asm, // instruction class combining InstSI with the SOPCe encoding (no pseudo/real split)
+ list<dag> pattern = []> :
+ InstSI<outs, ins, asm, pattern>, SOPCe <op> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOPC = 1;
+ let isCodeGenOnly = 0;
+ let Defs = [SCC]; // default: SCC is written; a record may override this (S_SET_GPR_IDX_ON does)
+ let SchedRW = [WriteSALU];
+ let UseNamedOperandTable = 1;
+ let SubtargetPredicate = isGCN;
+}
+
+class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1, // two-source SOPC with configurable operand classes
+ string opName, list<dag> pattern = []> : SOPC <
+ op, (outs), (ins rc0:$src0, rc1:$src1),
+ opName#" $src0, $src1", pattern > {
+ let Defs = [SCC]; // repeats the value already set by SOPC
+}
+class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, // same operand class for both sources, plus a setcc pattern
+ string opName, PatLeaf cond> : SOPC_Base <
+ op, rc, rc, opName,
+ [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > { // pattern: SCC <- si_setcc_uniform(src0, src1, cond)
+}
+
+class SOPC_CMP_32<bits<7> op, string opName, // 32-bit compare; revOp defaults to the op's own name
+ PatLeaf cond = COND_NULL, string revOp = opName>
+ : SOPC_Helper<op, SSrc_b32, i32, opName, cond>,
+ Commutable_REV<revOp, !eq(revOp, opName)>, // second argument is true when the op is its own reverse
+ SOPKInstTable<0, opName> { // tagged IsSOPK = 0 with BaseCmpOp = opName
+ let isCompare = 1;
+ let isCommutable = 1;
+}
+
+class SOPC_CMP_64<bits<7> op, string opName, // 64-bit compare; unlike SOPC_CMP_32 it does not inherit SOPKInstTable
+ PatLeaf cond = COND_NULL, string revOp = opName>
+ : SOPC_Helper<op, SSrc_b64, i64, opName, cond>,
+ Commutable_REV<revOp, !eq(revOp, opName)> { // second argument is true when the op is its own reverse
+ let isCompare = 1;
+ let isCommutable = 1;
+}
+
+class SOPC_32<bits<7> op, string opName, list<dag> pattern = []> // two 32-bit sources, caller-supplied pattern
+ : SOPC_Base<op, SSrc_b32, SSrc_b32, opName, pattern>;
+
+class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []> // 64-bit src0, 32-bit src1, caller-supplied pattern
+ : SOPC_Base<op, SSrc_b64, SSrc_b32, opName, pattern>;
+
+def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00, "s_cmp_eq_i32">; // no cond given, so the pattern uses the COND_NULL default
+def S_CMP_LG_I32 : SOPC_CMP_32 <0x01, "s_cmp_lg_i32">; // eq/ne conditions are attached to the _U32 forms below
+def S_CMP_GT_I32 : SOPC_CMP_32 <0x02, "s_cmp_gt_i32", COND_SGT>;
+def S_CMP_GE_I32 : SOPC_CMP_32 <0x03, "s_cmp_ge_i32", COND_SGE>;
+def S_CMP_LT_I32 : SOPC_CMP_32 <0x04, "s_cmp_lt_i32", COND_SLT, "s_cmp_gt_i32">; // last argument names the reverse op
+def S_CMP_LE_I32 : SOPC_CMP_32 <0x05, "s_cmp_le_i32", COND_SLE, "s_cmp_ge_i32">;
+def S_CMP_EQ_U32 : SOPC_CMP_32 <0x06, "s_cmp_eq_u32", COND_EQ>;
+def S_CMP_LG_U32 : SOPC_CMP_32 <0x07, "s_cmp_lg_u32", COND_NE>;
+def S_CMP_GT_U32 : SOPC_CMP_32 <0x08, "s_cmp_gt_u32", COND_UGT>;
+def S_CMP_GE_U32 : SOPC_CMP_32 <0x09, "s_cmp_ge_u32", COND_UGE>;
+def S_CMP_LT_U32 : SOPC_CMP_32 <0x0a, "s_cmp_lt_u32", COND_ULT, "s_cmp_gt_u32">;
+def S_CMP_LE_U32 : SOPC_CMP_32 <0x0b, "s_cmp_le_u32", COND_ULE, "s_cmp_ge_u32">;
+
+def S_BITCMP0_B32 : SOPC_32 <0x0c, "s_bitcmp0_b32">; // the remaining records have no selection pattern
+def S_BITCMP1_B32 : SOPC_32 <0x0d, "s_bitcmp1_b32">;
+def S_BITCMP0_B64 : SOPC_64_32 <0x0e, "s_bitcmp0_b64">;
+def S_BITCMP1_B64 : SOPC_64_32 <0x0f, "s_bitcmp1_b64">;
+def S_SETVSKIP : SOPC_32 <0x10, "s_setvskip">;
+
+let SubtargetPredicate = isVI in { // the 64-bit compares replace the isGCN predicate from SOPC with isVI
+def S_CMP_EQ_U64 : SOPC_CMP_64 <0x12, "s_cmp_eq_u64", COND_EQ>;
+def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>;
+}
+
+let SubtargetPredicate = HasVGPRIndexMode in { // replaces the isGCN predicate set by SOPC
+def S_SET_GPR_IDX_ON : SOPC <0x11,
+ (outs),
+ (ins SSrc_b32:$src0, GPRIdxMode:$src1),
+ "s_set_gpr_idx_on $src0,$src1"> { // no space after the comma; presumably the GPRIdxMode print method emits it - confirm
+ let Defs = [M0]; // No scc def
+ let Uses = [M0]; // Other bits of m0 unmodified.
+ let hasSideEffects = 1; // Sets mode.gpr_idx_en
+ let FixedSize = 1;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SOPP Instructions
+//===----------------------------------------------------------------------===//
+
+class SOPPe <bits<7> op> : Enc32 { // 32-bit SOPP encoding: 16-bit immediate plus 7-bit opcode
+ bits <16> simm16;
+
+ let Inst{15-0} = simm16;
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x17f; // encoding: fixed constant in the top nine bits
+}
+
+class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : // instruction class with no outputs, combined with the SOPPe encoding
+ InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0; // defaults; several records below override these three flags
+ let SALU = 1;
+ let SOPP = 1;
+ let Size = 4;
+ let SchedRW = [WriteSALU];
+
+ let UseNamedOperandTable = 1;
+ let SubtargetPredicate = isGCN;
+}
+
+
+def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; // opcode 0; no pattern and no flag overrides
+
+let isTerminator = 1 in { // s_endpgm and all branches are terminators
+
+def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
+ [(AMDGPUendpgm)]> {
+ let simm16 = 0; // no operand, so the immediate field is fixed to 0
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+
+let isBranch = 1, SchedRW = [WriteBranch] in { // branches use WriteBranch instead of the WriteSALU default
+def S_BRANCH : SOPP <
+ 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
+ [(br bb:$simm16)]> {
+ let isBarrier = 1; // only the unconditional branch is a barrier
+}
+
+let Uses = [SCC] in {
+def S_CBRANCH_SCC0 : SOPP <
+ 0x00000004, (ins sopp_brtarget:$simm16),
+ "s_cbranch_scc0 $simm16" // no pattern; only the scc1 form below is selected from a node
+>;
+def S_CBRANCH_SCC1 : SOPP <
+ 0x00000005, (ins sopp_brtarget:$simm16),
+ "s_cbranch_scc1 $simm16",
+ [(si_uniform_br_scc SCC, bb:$simm16)]
+>;
+} // End Uses = [SCC]
+
+let Uses = [VCC] in { // VCC-conditional branches; no patterns
+def S_CBRANCH_VCCZ : SOPP <
+ 0x00000006, (ins sopp_brtarget:$simm16),
+ "s_cbranch_vccz $simm16"
+>;
+def S_CBRANCH_VCCNZ : SOPP <
+ 0x00000007, (ins sopp_brtarget:$simm16),
+ "s_cbranch_vccnz $simm16"
+>;
+} // End Uses = [VCC]
+
+let Uses = [EXEC] in { // EXEC-conditional branches; no patterns
+def S_CBRANCH_EXECZ : SOPP <
+ 0x00000008, (ins sopp_brtarget:$simm16),
+ "s_cbranch_execz $simm16"
+>;
+def S_CBRANCH_EXECNZ : SOPP <
+ 0x00000009, (ins sopp_brtarget:$simm16),
+ "s_cbranch_execnz $simm16"
+>;
+} // End Uses = [EXEC]
+
+
+} // End isBranch = 1
+} // End isTerminator = 1
+
+let hasSideEffects = 1 in { // everything down to "End hasSideEffects" overrides the SOPP default of 0
+def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
+ [(int_amdgcn_s_barrier)]> {
+ let SchedRW = [WriteBarrier];
+ let simm16 = 0; // no operand, so the immediate field is fixed to 0
+ let mayLoad = 1;
+ let mayStore = 1;
+ let isConvergent = 1;
+}
+
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in // brace-less let: applies only to S_WAITCNT; hasSideEffects repeats the enclosing let
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
+def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
+
+// On SI the documentation says sleep for approximately 64 * low 2 bits, with a
+// reported maximum of 448 (NOTE(review): 448 / 64 = 7 needs 3 bits, not 2 -
+// confirm). On VI the maximum reported is 960 cycles, so 960 / 64 = 15 max;
+// is the maximum really 15 on VI?
+def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
+ "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
+ let hasSideEffects = 1; // repeats the enclosing let
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
+
+let Uses = [EXEC, M0] in {
+// FIXME: Should this be mayLoad+mayStore?
+def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
+ [(AMDGPUsendmsg (i32 imm:$simm16))]
+>;
+} // End Uses = [EXEC, M0]
+
+def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">; // outside the Uses = [EXEC, M0] let, unlike S_SENDMSG
+def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
+def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
+ let simm16 = 0; // no operand, so the immediate field is fixed to 0
+}
+def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16",
+ [(int_amdgcn_s_incperflevel SIMM16bit:$simm16)]> {
+ let hasSideEffects = 1; // repeats the enclosing let
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16",
+ [(int_amdgcn_s_decperflevel SIMM16bit:$simm16)]> {
+ let hasSideEffects = 1; // repeats the enclosing let
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
+ let simm16 = 0; // no operand, so the immediate field is fixed to 0
+}
+
+let SubtargetPredicate = HasVGPRIndexMode in { // replaces the isGCN predicate set by SOPP
+def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> {
+ let simm16 = 0;
+}
+}
+} // End hasSideEffects
+
+let SubtargetPredicate = HasVGPRIndexMode in {
+def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
+ "s_set_gpr_idx_mode$simm16"> { // NOTE(review): no space before $simm16 - presumably the GPRIdxMode printer supplies it; confirm
+ let Defs = [M0]; // the instruction is modeled as writing M0
+}
+}
+
+let Predicates = [isGCN] in {
+
+//===----------------------------------------------------------------------===//
+// S_GETREG_B32 Intrinsic Pattern.
+//===----------------------------------------------------------------------===//
+def : Pat <
+ (int_amdgcn_s_getreg imm:$simm16),
+ (S_GETREG_B32 (as_i16imm $simm16))
+>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (i64 (ctpop i64:$src)), // 64-bit population count
+ (i64 (REG_SEQUENCE SReg_64,
+ (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, // sub0 <- 32-bit result of S_BCNT1_I32_B64
+ (S_MOV_B32 (i32 0)), sub1)) // sub1 <- constant 0
+>;
+
+def : Pat <
+ (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+ (S_ABS_I32 $x)
+>;
+
+def : Pat <
+ (i16 imm:$imm),
+ (S_MOV_B32 imm:$imm)
+>;
+
+// Same as a 32-bit inreg
+def : Pat<
+ (i32 (sext i16:$src)),
+ (S_SEXT_I32_I16 $src)
+>;
+
+
+//===----------------------------------------------------------------------===//
+// SOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
+// case, the sgpr-copies pass will fix this to use the vector version.
+def : Pat <
+ (i32 (addc i32:$src0, i32:$src1)),
+ (S_ADD_U32 $src0, $src1)
+>;
+
+// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
+// REG_SEQUENCE patterns don't support instructions with multiple
+// outputs.
+def : Pat<
+ (i64 (zext i16:$src)),
+ (REG_SEQUENCE SReg_64,
+ (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
+ (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : Pat <
+ (i64 (sext i16:$src)),
+ (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
+ (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
+>;
+
+def : Pat<
+ (i32 (zext i16:$src)),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
+>;
+
+
+
+//===----------------------------------------------------------------------===//
+// SOPP Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (int_amdgcn_s_waitcnt i32:$simm16),
+ (S_WAITCNT (as_i16imm $simm16))
+>;
+
+} // End isGCN predicate
+
+
+//===----------------------------------------------------------------------===//
+// Real target instructions, move this to the appropriate subtarget TD file
+//===----------------------------------------------------------------------===//
+
+class Select_si<string opName> : // mix-in used by the *_Real_si classes below
+ SIMCInstr<opName, SIEncodingFamily.SI> { // ties opName to the SI encoding family
+ list<Predicate> AssemblerPredicates = [isSICI]; // assembler predicate: isSICI
+ string DecoderNamespace = "SICI";
+}
+
+class SOP1_Real_si<bits<8> op, SOP1_Pseudo ps> :
+ SOP1_Real<op, ps>,
+ Select_si<ps.Mnemonic>;
+
+class SOP2_Real_si<bits<7> op, SOP2_Pseudo ps> :
+ SOP2_Real<op, ps>,
+ Select_si<ps.Mnemonic>;
+
+class SOPK_Real_si<bits<5> op, SOPK_Pseudo ps> :
+ SOPK_Real32<op, ps>,
+ Select_si<ps.Mnemonic>;
+
+def S_MOV_B32_si : SOP1_Real_si <0x03, S_MOV_B32>;
+def S_MOV_B64_si : SOP1_Real_si <0x04, S_MOV_B64>;
+def S_CMOV_B32_si : SOP1_Real_si <0x05, S_CMOV_B32>;
+def S_CMOV_B64_si : SOP1_Real_si <0x06, S_CMOV_B64>;
+def S_NOT_B32_si : SOP1_Real_si <0x07, S_NOT_B32>;
+def S_NOT_B64_si : SOP1_Real_si <0x08, S_NOT_B64>;
+def S_WQM_B32_si : SOP1_Real_si <0x09, S_WQM_B32>;
+def S_WQM_B64_si : SOP1_Real_si <0x0a, S_WQM_B64>;
+def S_BREV_B32_si : SOP1_Real_si <0x0b, S_BREV_B32>;
+def S_BREV_B64_si : SOP1_Real_si <0x0c, S_BREV_B64>;
+def S_BCNT0_I32_B32_si : SOP1_Real_si <0x0d, S_BCNT0_I32_B32>;
+def S_BCNT0_I32_B64_si : SOP1_Real_si <0x0e, S_BCNT0_I32_B64>;
+def S_BCNT1_I32_B32_si : SOP1_Real_si <0x0f, S_BCNT1_I32_B32>;
+def S_BCNT1_I32_B64_si : SOP1_Real_si <0x10, S_BCNT1_I32_B64>;
+def S_FF0_I32_B32_si : SOP1_Real_si <0x11, S_FF0_I32_B32>;
+def S_FF0_I32_B64_si : SOP1_Real_si <0x12, S_FF0_I32_B64>;
+def S_FF1_I32_B32_si : SOP1_Real_si <0x13, S_FF1_I32_B32>;
+def S_FF1_I32_B64_si : SOP1_Real_si <0x14, S_FF1_I32_B64>;
+def S_FLBIT_I32_B32_si : SOP1_Real_si <0x15, S_FLBIT_I32_B32>;
+def S_FLBIT_I32_B64_si : SOP1_Real_si <0x16, S_FLBIT_I32_B64>;
+def S_FLBIT_I32_si : SOP1_Real_si <0x17, S_FLBIT_I32>;
+def S_FLBIT_I32_I64_si : SOP1_Real_si <0x18, S_FLBIT_I32_I64>;
+def S_SEXT_I32_I8_si : SOP1_Real_si <0x19, S_SEXT_I32_I8>;
+def S_SEXT_I32_I16_si : SOP1_Real_si <0x1a, S_SEXT_I32_I16>;
+def S_BITSET0_B32_si : SOP1_Real_si <0x1b, S_BITSET0_B32>;
+def S_BITSET0_B64_si : SOP1_Real_si <0x1c, S_BITSET0_B64>;
+def S_BITSET1_B32_si : SOP1_Real_si <0x1d, S_BITSET1_B32>;
+def S_BITSET1_B64_si : SOP1_Real_si <0x1e, S_BITSET1_B64>;
+def S_GETPC_B64_si : SOP1_Real_si <0x1f, S_GETPC_B64>;
+def S_SETPC_B64_si : SOP1_Real_si <0x20, S_SETPC_B64>;
+def S_SWAPPC_B64_si : SOP1_Real_si <0x21, S_SWAPPC_B64>;
+def S_RFE_B64_si : SOP1_Real_si <0x22, S_RFE_B64>;
+def S_AND_SAVEEXEC_B64_si : SOP1_Real_si <0x24, S_AND_SAVEEXEC_B64>;
+def S_OR_SAVEEXEC_B64_si : SOP1_Real_si <0x25, S_OR_SAVEEXEC_B64>;
+def S_XOR_SAVEEXEC_B64_si : SOP1_Real_si <0x26, S_XOR_SAVEEXEC_B64>;
+def S_ANDN2_SAVEEXEC_B64_si: SOP1_Real_si <0x27, S_ANDN2_SAVEEXEC_B64>;
+def S_ORN2_SAVEEXEC_B64_si : SOP1_Real_si <0x28, S_ORN2_SAVEEXEC_B64>;
+def S_NAND_SAVEEXEC_B64_si : SOP1_Real_si <0x29, S_NAND_SAVEEXEC_B64>;
+def S_NOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2a, S_NOR_SAVEEXEC_B64>;
+def S_XNOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2b, S_XNOR_SAVEEXEC_B64>;
+def S_QUADMASK_B32_si : SOP1_Real_si <0x2c, S_QUADMASK_B32>;
+def S_QUADMASK_B64_si : SOP1_Real_si <0x2d, S_QUADMASK_B64>;
+def S_MOVRELS_B32_si : SOP1_Real_si <0x2e, S_MOVRELS_B32>;
+def S_MOVRELS_B64_si : SOP1_Real_si <0x2f, S_MOVRELS_B64>;
+def S_MOVRELD_B32_si : SOP1_Real_si <0x30, S_MOVRELD_B32>;
+def S_MOVRELD_B64_si : SOP1_Real_si <0x31, S_MOVRELD_B64>;
+def S_CBRANCH_JOIN_si : SOP1_Real_si <0x32, S_CBRANCH_JOIN>;
+def S_MOV_REGRD_B32_si : SOP1_Real_si <0x33, S_MOV_REGRD_B32>;
+def S_ABS_I32_si : SOP1_Real_si <0x34, S_ABS_I32>;
+def S_MOV_FED_B32_si : SOP1_Real_si <0x35, S_MOV_FED_B32>;
+
+def S_ADD_U32_si : SOP2_Real_si <0x00, S_ADD_U32>;
+def S_ADD_I32_si : SOP2_Real_si <0x02, S_ADD_I32>;
+def S_SUB_U32_si : SOP2_Real_si <0x01, S_SUB_U32>;
+def S_SUB_I32_si : SOP2_Real_si <0x03, S_SUB_I32>;
+def S_ADDC_U32_si : SOP2_Real_si <0x04, S_ADDC_U32>;
+def S_SUBB_U32_si : SOP2_Real_si <0x05, S_SUBB_U32>;
+def S_MIN_I32_si : SOP2_Real_si <0x06, S_MIN_I32>;
+def S_MIN_U32_si : SOP2_Real_si <0x07, S_MIN_U32>;
+def S_MAX_I32_si : SOP2_Real_si <0x08, S_MAX_I32>;
+def S_MAX_U32_si : SOP2_Real_si <0x09, S_MAX_U32>;
+def S_CSELECT_B32_si : SOP2_Real_si <0x0a, S_CSELECT_B32>;
+def S_CSELECT_B64_si : SOP2_Real_si <0x0b, S_CSELECT_B64>;
+def S_AND_B32_si : SOP2_Real_si <0x0e, S_AND_B32>;
+def S_AND_B64_si : SOP2_Real_si <0x0f, S_AND_B64>;
+def S_OR_B32_si : SOP2_Real_si <0x10, S_OR_B32>;
+def S_OR_B64_si : SOP2_Real_si <0x11, S_OR_B64>;
+def S_XOR_B32_si : SOP2_Real_si <0x12, S_XOR_B32>;
+def S_XOR_B64_si : SOP2_Real_si <0x13, S_XOR_B64>;
+def S_ANDN2_B32_si : SOP2_Real_si <0x14, S_ANDN2_B32>;
+def S_ANDN2_B64_si : SOP2_Real_si <0x15, S_ANDN2_B64>;
+def S_ORN2_B32_si : SOP2_Real_si <0x16, S_ORN2_B32>;
+def S_ORN2_B64_si : SOP2_Real_si <0x17, S_ORN2_B64>;
+def S_NAND_B32_si : SOP2_Real_si <0x18, S_NAND_B32>;
+def S_NAND_B64_si : SOP2_Real_si <0x19, S_NAND_B64>;
+def S_NOR_B32_si : SOP2_Real_si <0x1a, S_NOR_B32>;
+def S_NOR_B64_si : SOP2_Real_si <0x1b, S_NOR_B64>;
+def S_XNOR_B32_si : SOP2_Real_si <0x1c, S_XNOR_B32>;
+def S_XNOR_B64_si : SOP2_Real_si <0x1d, S_XNOR_B64>;
+def S_LSHL_B32_si : SOP2_Real_si <0x1e, S_LSHL_B32>;
+def S_LSHL_B64_si : SOP2_Real_si <0x1f, S_LSHL_B64>;
+def S_LSHR_B32_si : SOP2_Real_si <0x20, S_LSHR_B32>;
+def S_LSHR_B64_si : SOP2_Real_si <0x21, S_LSHR_B64>;
+def S_ASHR_I32_si : SOP2_Real_si <0x22, S_ASHR_I32>;
+def S_ASHR_I64_si : SOP2_Real_si <0x23, S_ASHR_I64>;
+def S_BFM_B32_si : SOP2_Real_si <0x24, S_BFM_B32>;
+def S_BFM_B64_si : SOP2_Real_si <0x25, S_BFM_B64>;
+def S_MUL_I32_si : SOP2_Real_si <0x26, S_MUL_I32>;
+def S_BFE_U32_si : SOP2_Real_si <0x27, S_BFE_U32>;
+def S_BFE_I32_si : SOP2_Real_si <0x28, S_BFE_I32>;
+def S_BFE_U64_si : SOP2_Real_si <0x29, S_BFE_U64>;
+def S_BFE_I64_si : SOP2_Real_si <0x2a, S_BFE_I64>;
+def S_CBRANCH_G_FORK_si : SOP2_Real_si <0x2b, S_CBRANCH_G_FORK>;
+def S_ABSDIFF_I32_si : SOP2_Real_si <0x2c, S_ABSDIFF_I32>;
+
+def S_MOVK_I32_si : SOPK_Real_si <0x00, S_MOVK_I32>;
+def S_CMOVK_I32_si : SOPK_Real_si <0x02, S_CMOVK_I32>;
+def S_CMPK_EQ_I32_si : SOPK_Real_si <0x03, S_CMPK_EQ_I32>;
+def S_CMPK_LG_I32_si : SOPK_Real_si <0x04, S_CMPK_LG_I32>;
+def S_CMPK_GT_I32_si : SOPK_Real_si <0x05, S_CMPK_GT_I32>;
+def S_CMPK_GE_I32_si : SOPK_Real_si <0x06, S_CMPK_GE_I32>;
+def S_CMPK_LT_I32_si : SOPK_Real_si <0x07, S_CMPK_LT_I32>;
+def S_CMPK_LE_I32_si : SOPK_Real_si <0x08, S_CMPK_LE_I32>;
+def S_CMPK_EQ_U32_si : SOPK_Real_si <0x09, S_CMPK_EQ_U32>;
+def S_CMPK_LG_U32_si : SOPK_Real_si <0x0a, S_CMPK_LG_U32>;
+def S_CMPK_GT_U32_si : SOPK_Real_si <0x0b, S_CMPK_GT_U32>;
+def S_CMPK_GE_U32_si : SOPK_Real_si <0x0c, S_CMPK_GE_U32>;
+def S_CMPK_LT_U32_si : SOPK_Real_si <0x0d, S_CMPK_LT_U32>;
+def S_CMPK_LE_U32_si : SOPK_Real_si <0x0e, S_CMPK_LE_U32>;
+def S_ADDK_I32_si : SOPK_Real_si <0x0f, S_ADDK_I32>;
+def S_MULK_I32_si : SOPK_Real_si <0x10, S_MULK_I32>;
+def S_CBRANCH_I_FORK_si : SOPK_Real_si <0x11, S_CBRANCH_I_FORK>;
+def S_GETREG_B32_si : SOPK_Real_si <0x12, S_GETREG_B32>;
+def S_SETREG_B32_si : SOPK_Real_si <0x13, S_SETREG_B32>;
+//def S_GETREG_REGRD_B32_si : SOPK_Real_si <0x14, S_GETREG_REGRD_B32>; // see pseudo for comments
+def S_SETREG_IMM32_B32_si : SOPK_Real64<0x15, S_SETREG_IMM32_B32>,
+ Select_si<S_SETREG_IMM32_B32.Mnemonic>;
+
+
+class Select_vi<string opName> : // mix-in used by the *_Real_vi classes below
+ SIMCInstr<opName, SIEncodingFamily.VI> { // ties opName to the VI encoding family
+ list<Predicate> AssemblerPredicates = [isVI]; // assembler predicate: isVI
+ string DecoderNamespace = "VI";
+}
+
+class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
+ SOP1_Real<op, ps>,
+ Select_vi<ps.Mnemonic>;
+
+
+class SOP2_Real_vi<bits<7> op, SOP2_Pseudo ps> :
+ SOP2_Real<op, ps>,
+ Select_vi<ps.Mnemonic>;
+
+class SOPK_Real_vi<bits<5> op, SOPK_Pseudo ps> :
+ SOPK_Real32<op, ps>,
+ Select_vi<ps.Mnemonic>;
+
+def S_MOV_B32_vi : SOP1_Real_vi <0x00, S_MOV_B32>;
+def S_MOV_B64_vi : SOP1_Real_vi <0x01, S_MOV_B64>;
+def S_CMOV_B32_vi : SOP1_Real_vi <0x02, S_CMOV_B32>;
+def S_CMOV_B64_vi : SOP1_Real_vi <0x03, S_CMOV_B64>;
+def S_NOT_B32_vi : SOP1_Real_vi <0x04, S_NOT_B32>;
+def S_NOT_B64_vi : SOP1_Real_vi <0x05, S_NOT_B64>;
+def S_WQM_B32_vi : SOP1_Real_vi <0x06, S_WQM_B32>;
+def S_WQM_B64_vi : SOP1_Real_vi <0x07, S_WQM_B64>;
+def S_BREV_B32_vi : SOP1_Real_vi <0x08, S_BREV_B32>;
+def S_BREV_B64_vi : SOP1_Real_vi <0x09, S_BREV_B64>;
+def S_BCNT0_I32_B32_vi : SOP1_Real_vi <0x0a, S_BCNT0_I32_B32>;
+def S_BCNT0_I32_B64_vi : SOP1_Real_vi <0x0b, S_BCNT0_I32_B64>;
+def S_BCNT1_I32_B32_vi : SOP1_Real_vi <0x0c, S_BCNT1_I32_B32>;
+def S_BCNT1_I32_B64_vi : SOP1_Real_vi <0x0d, S_BCNT1_I32_B64>;
+def S_FF0_I32_B32_vi : SOP1_Real_vi <0x0e, S_FF0_I32_B32>;
+def S_FF0_I32_B64_vi : SOP1_Real_vi <0x0f, S_FF0_I32_B64>;
+def S_FF1_I32_B32_vi : SOP1_Real_vi <0x10, S_FF1_I32_B32>;
+def S_FF1_I32_B64_vi : SOP1_Real_vi <0x11, S_FF1_I32_B64>;
+def S_FLBIT_I32_B32_vi : SOP1_Real_vi <0x12, S_FLBIT_I32_B32>;
+def S_FLBIT_I32_B64_vi : SOP1_Real_vi <0x13, S_FLBIT_I32_B64>;
+def S_FLBIT_I32_vi : SOP1_Real_vi <0x14, S_FLBIT_I32>;
+def S_FLBIT_I32_I64_vi : SOP1_Real_vi <0x15, S_FLBIT_I32_I64>;
+def S_SEXT_I32_I8_vi : SOP1_Real_vi <0x16, S_SEXT_I32_I8>;
+def S_SEXT_I32_I16_vi : SOP1_Real_vi <0x17, S_SEXT_I32_I16>;
+def S_BITSET0_B32_vi : SOP1_Real_vi <0x18, S_BITSET0_B32>;
+def S_BITSET0_B64_vi : SOP1_Real_vi <0x19, S_BITSET0_B64>;
+def S_BITSET1_B32_vi : SOP1_Real_vi <0x1a, S_BITSET1_B32>;
+def S_BITSET1_B64_vi : SOP1_Real_vi <0x1b, S_BITSET1_B64>;
+def S_GETPC_B64_vi : SOP1_Real_vi <0x1c, S_GETPC_B64>;
+def S_SETPC_B64_vi : SOP1_Real_vi <0x1d, S_SETPC_B64>;
+def S_SWAPPC_B64_vi : SOP1_Real_vi <0x1e, S_SWAPPC_B64>;
+def S_RFE_B64_vi : SOP1_Real_vi <0x1f, S_RFE_B64>;
+def S_AND_SAVEEXEC_B64_vi : SOP1_Real_vi <0x20, S_AND_SAVEEXEC_B64>;
+def S_OR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x21, S_OR_SAVEEXEC_B64>;
+def S_XOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x22, S_XOR_SAVEEXEC_B64>;
+def S_ANDN2_SAVEEXEC_B64_vi: SOP1_Real_vi <0x23, S_ANDN2_SAVEEXEC_B64>;
+def S_ORN2_SAVEEXEC_B64_vi : SOP1_Real_vi <0x24, S_ORN2_SAVEEXEC_B64>;
+def S_NAND_SAVEEXEC_B64_vi : SOP1_Real_vi <0x25, S_NAND_SAVEEXEC_B64>;
+def S_NOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x26, S_NOR_SAVEEXEC_B64>;
+def S_XNOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x27, S_XNOR_SAVEEXEC_B64>;
+def S_QUADMASK_B32_vi : SOP1_Real_vi <0x28, S_QUADMASK_B32>;
+def S_QUADMASK_B64_vi : SOP1_Real_vi <0x29, S_QUADMASK_B64>;
+def S_MOVRELS_B32_vi : SOP1_Real_vi <0x2a, S_MOVRELS_B32>;
+def S_MOVRELS_B64_vi : SOP1_Real_vi <0x2b, S_MOVRELS_B64>;
+def S_MOVRELD_B32_vi : SOP1_Real_vi <0x2c, S_MOVRELD_B32>;
+def S_MOVRELD_B64_vi : SOP1_Real_vi <0x2d, S_MOVRELD_B64>;
+def S_CBRANCH_JOIN_vi : SOP1_Real_vi <0x2e, S_CBRANCH_JOIN>;
+def S_MOV_REGRD_B32_vi : SOP1_Real_vi <0x2f, S_MOV_REGRD_B32>;
+def S_ABS_I32_vi : SOP1_Real_vi <0x30, S_ABS_I32>;
+def S_MOV_FED_B32_vi : SOP1_Real_vi <0x31, S_MOV_FED_B32>;
+def S_SET_GPR_IDX_IDX_vi : SOP1_Real_vi <0x32, S_SET_GPR_IDX_IDX>;
+
+def S_ADD_U32_vi : SOP2_Real_vi <0x00, S_ADD_U32>;
+def S_ADD_I32_vi : SOP2_Real_vi <0x02, S_ADD_I32>;
+def S_SUB_U32_vi : SOP2_Real_vi <0x01, S_SUB_U32>;
+def S_SUB_I32_vi : SOP2_Real_vi <0x03, S_SUB_I32>;
+def S_ADDC_U32_vi : SOP2_Real_vi <0x04, S_ADDC_U32>;
+def S_SUBB_U32_vi : SOP2_Real_vi <0x05, S_SUBB_U32>;
+def S_MIN_I32_vi : SOP2_Real_vi <0x06, S_MIN_I32>;
+def S_MIN_U32_vi : SOP2_Real_vi <0x07, S_MIN_U32>;
+def S_MAX_I32_vi : SOP2_Real_vi <0x08, S_MAX_I32>;
+def S_MAX_U32_vi : SOP2_Real_vi <0x09, S_MAX_U32>;
+def S_CSELECT_B32_vi : SOP2_Real_vi <0x0a, S_CSELECT_B32>;
+def S_CSELECT_B64_vi : SOP2_Real_vi <0x0b, S_CSELECT_B64>;
+def S_AND_B32_vi : SOP2_Real_vi <0x0c, S_AND_B32>;
+def S_AND_B64_vi : SOP2_Real_vi <0x0d, S_AND_B64>;
+def S_OR_B32_vi : SOP2_Real_vi <0x0e, S_OR_B32>;
+def S_OR_B64_vi : SOP2_Real_vi <0x0f, S_OR_B64>;
+def S_XOR_B32_vi : SOP2_Real_vi <0x10, S_XOR_B32>;
+def S_XOR_B64_vi : SOP2_Real_vi <0x11, S_XOR_B64>;
+def S_ANDN2_B32_vi : SOP2_Real_vi <0x12, S_ANDN2_B32>;
+def S_ANDN2_B64_vi : SOP2_Real_vi <0x13, S_ANDN2_B64>;
+def S_ORN2_B32_vi : SOP2_Real_vi <0x14, S_ORN2_B32>;
+def S_ORN2_B64_vi : SOP2_Real_vi <0x15, S_ORN2_B64>;
+def S_NAND_B32_vi : SOP2_Real_vi <0x16, S_NAND_B32>;
+def S_NAND_B64_vi : SOP2_Real_vi <0x17, S_NAND_B64>;
+def S_NOR_B32_vi : SOP2_Real_vi <0x18, S_NOR_B32>;
+def S_NOR_B64_vi : SOP2_Real_vi <0x19, S_NOR_B64>;
+def S_XNOR_B32_vi : SOP2_Real_vi <0x1a, S_XNOR_B32>;
+def S_XNOR_B64_vi : SOP2_Real_vi <0x1b, S_XNOR_B64>;
+def S_LSHL_B32_vi : SOP2_Real_vi <0x1c, S_LSHL_B32>;
+def S_LSHL_B64_vi : SOP2_Real_vi <0x1d, S_LSHL_B64>;
+def S_LSHR_B32_vi : SOP2_Real_vi <0x1e, S_LSHR_B32>;
+def S_LSHR_B64_vi : SOP2_Real_vi <0x1f, S_LSHR_B64>;
+def S_ASHR_I32_vi : SOP2_Real_vi <0x20, S_ASHR_I32>;
+def S_ASHR_I64_vi : SOP2_Real_vi <0x21, S_ASHR_I64>;
+def S_BFM_B32_vi : SOP2_Real_vi <0x22, S_BFM_B32>;
+def S_BFM_B64_vi : SOP2_Real_vi <0x23, S_BFM_B64>;
+def S_MUL_I32_vi : SOP2_Real_vi <0x24, S_MUL_I32>;
+def S_BFE_U32_vi : SOP2_Real_vi <0x25, S_BFE_U32>;
+def S_BFE_I32_vi : SOP2_Real_vi <0x26, S_BFE_I32>;
+def S_BFE_U64_vi : SOP2_Real_vi <0x27, S_BFE_U64>;
+def S_BFE_I64_vi : SOP2_Real_vi <0x28, S_BFE_I64>;
+def S_CBRANCH_G_FORK_vi : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>;
+def S_ABSDIFF_I32_vi : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>;
+
+def S_MOVK_I32_vi : SOPK_Real_vi <0x00, S_MOVK_I32>;
+def S_CMOVK_I32_vi : SOPK_Real_vi <0x01, S_CMOVK_I32>;
+def S_CMPK_EQ_I32_vi : SOPK_Real_vi <0x02, S_CMPK_EQ_I32>;
+def S_CMPK_LG_I32_vi : SOPK_Real_vi <0x03, S_CMPK_LG_I32>;
+def S_CMPK_GT_I32_vi : SOPK_Real_vi <0x04, S_CMPK_GT_I32>;
+def S_CMPK_GE_I32_vi : SOPK_Real_vi <0x05, S_CMPK_GE_I32>;
+def S_CMPK_LT_I32_vi : SOPK_Real_vi <0x06, S_CMPK_LT_I32>;
+def S_CMPK_LE_I32_vi : SOPK_Real_vi <0x07, S_CMPK_LE_I32>;
+def S_CMPK_EQ_U32_vi : SOPK_Real_vi <0x08, S_CMPK_EQ_U32>;
+def S_CMPK_LG_U32_vi : SOPK_Real_vi <0x09, S_CMPK_LG_U32>;
+def S_CMPK_GT_U32_vi : SOPK_Real_vi <0x0A, S_CMPK_GT_U32>;
+def S_CMPK_GE_U32_vi : SOPK_Real_vi <0x0B, S_CMPK_GE_U32>;
+def S_CMPK_LT_U32_vi : SOPK_Real_vi <0x0C, S_CMPK_LT_U32>;
+def S_CMPK_LE_U32_vi : SOPK_Real_vi <0x0D, S_CMPK_LE_U32>;
+def S_ADDK_I32_vi : SOPK_Real_vi <0x0E, S_ADDK_I32>;
+def S_MULK_I32_vi : SOPK_Real_vi <0x0F, S_MULK_I32>;
+def S_CBRANCH_I_FORK_vi : SOPK_Real_vi <0x10, S_CBRANCH_I_FORK>;
+def S_GETREG_B32_vi : SOPK_Real_vi <0x11, S_GETREG_B32>;
+def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>;
+//def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments
+def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>,
+ Select_vi<S_SETREG_IMM32_B32.Mnemonic>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
new file mode 100644
index 000000000000..9908fc003ce7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -0,0 +1,37 @@
+//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+/// \brief The target which supports all AMD GPUs. This will eventually
+/// be deprecated and there will be an R600 target and a GCN target.
+Target &llvm::getTheAMDGPUTarget() {
+ static Target TheAMDGPUTarget; // function-local static: every call returns the same object
+ return TheAMDGPUTarget;
+}
+/// \brief The target for GCN GPUs. \returns the single function-local Target instance.
+Target &llvm::getTheGCNTarget() {
+ static Target TheGCNTarget; // function-local static: every call returns the same object
+ return TheGCNTarget;
+}
+
+/// \brief C-linkage entry point that registers the "r600" and "amdgcn" targets.
+extern "C" void LLVMInitializeAMDGPUTargetInfo() {
+ RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600",
+ "AMD GPUs HD2XXX-HD6XXX"); // Triple::r600 is bound to getTheAMDGPUTarget()
+ RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn",
+ "AMD GCN GPUs"); // Triple::amdgcn is bound to getTheGCNTarget()
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
new file mode 100644
index 000000000000..b6868de6a74e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -0,0 +1,69 @@
+//===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUAsmUtils.h"
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg {
+
+// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
+const char* const IdSymbolic[] = { // position in the array is the message id; nullptr = no symbolic name
+ nullptr, // 0
+ "MSG_INTERRUPT", // 1
+ "MSG_GS", // 2
+ "MSG_GS_DONE", // 3
+ nullptr, // 4 (entries 4 through 14 have no symbolic name)
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr, // 14
+ "MSG_SYSMSG" // 15
+};
+
+// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
+const char* const OpSysSymbolic[] = { // position in the array is the operation value
+ nullptr, // 0: no symbolic name
+ "SYSMSG_OP_ECC_ERR_INTERRUPT", // 1
+ "SYSMSG_OP_REG_RD", // 2
+ "SYSMSG_OP_HOST_TRAP_ACK", // 3
+ "SYSMSG_OP_TTRACE_PC" // 4
+};
+
+const char* const OpGsSymbolic[] = { // position in the array is the operation value; unlike OpSysSymbolic, entry 0 is named
+ "GS_OP_NOP", // 0
+ "GS_OP_CUT", // 1
+ "GS_OP_EMIT", // 2
+ "GS_OP_EMIT_CUT" // 3
+};
+
+} // namespace SendMsg
+
+namespace Hwreg {
+
+// This must be in sync with llvm::AMDGPU::Hwreg::ID_SYMBOLIC_FIRST_/LAST_, see SIDefines.h.
+const char* const IdSymbolic[] = { // position in the array is the hardware register id
+ nullptr, // 0: no symbolic name
+ "HW_REG_MODE", // 1
+ "HW_REG_STATUS", // 2
+ "HW_REG_TRAPSTS", // 3
+ "HW_REG_HW_ID", // 4
+ "HW_REG_GPR_ALLOC", // 5
+ "HW_REG_LDS_ALLOC", // 6
+ "HW_REG_IB_STS" // 7
+};
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
new file mode 100644
index 000000000000..b2dc2c0e364c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -0,0 +1,31 @@
+//===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
+
+extern const char* const IdSymbolic[]; // declared without a bound; the definition in AMDGPUAsmUtils.cpp fixes the length
+extern const char* const OpSysSymbolic[]; // entries may be nullptr where no symbolic name exists
+extern const char* const OpGsSymbolic[];
+
+} // namespace SendMsg
+
+namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
+
+extern const char* const IdSymbolic[]; // declared without a bound; the definition in AMDGPUAsmUtils.cpp fixes the length
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
new file mode 100644
index 000000000000..85cbadf0a570
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -0,0 +1,461 @@
+//===-- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information--------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUBaseInfo.h"
+#include "AMDGPU.h"
+#include "SIDefines.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
+
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
+
+#define GET_INSTRINFO_NAMED_OPS
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_NAMED_OPS
+#undef GET_INSTRINFO_ENUM
+
+namespace {
+
+/// \returns Mask of \p Width consecutive one bits whose lowest bit is bit \p Shift.
+unsigned getBitMask(unsigned Shift, unsigned Width) {
+ return ((1 << Width) - 1) << Shift; // the literal 1 is an int, so Width must stay below the bit width of int
+}
+
+/// \brief Overwrites the \p Width -bit field at bit \p Shift of \p Dst with \p Src.
+/// Bits of \p Src that do not fit into \p Width bits are masked off.
+/// \returns Packed \p Dst.
+unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
+ Dst &= ~(1 << Shift) & ~getBitMask(Shift, Width); // clear the field; the ~(1 << Shift) term only matters when Width is 0
+ Dst |= (Src << Shift) & getBitMask(Shift, Width); // insert Src, limited to the field
+ return Dst;
+}
+
+/// \brief Extracts the \p Width -bit field at bit \p Shift of \p Src.
+///
+/// \returns The field value, shifted down so that its lowest bit is bit 0.
+unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
+ return (Src & getBitMask(Shift, Width)) >> Shift;
+}
+
+/// \returns Position of the lowest Vmcnt bit in a packed Waitcnt value.
+unsigned getVmcntBitShift() { return 0; }
+
+/// \returns Number of Vmcnt bits (with the shift above: bits 3:0).
+unsigned getVmcntBitWidth() { return 4; }
+
+/// \returns Position of the lowest Expcnt bit in a packed Waitcnt value.
+unsigned getExpcntBitShift() { return 4; }
+
+/// \returns Number of Expcnt bits (with the shift above: bits 6:4).
+unsigned getExpcntBitWidth() { return 3; }
+
+/// \returns Position of the lowest Lgkmcnt bit in a packed Waitcnt value.
+unsigned getLgkmcntBitShift() { return 8; }
+
+/// \returns Number of Lgkmcnt bits (with the shift above: bits 11:8; bit 7 belongs to no field).
+unsigned getLgkmcntBitWidth() { return 4; }
+
+} // anonymous namespace
+
+namespace llvm {
+namespace AMDGPU {
+
+IsaVersion getIsaVersion(const FeatureBitset &Features) { // maps the first set FeatureISAVersion* bit to a version triple
+
+ if (Features.test(FeatureISAVersion7_0_0)) // checks run in ascending version order; the first hit wins
+ return {7, 0, 0};
+
+ if (Features.test(FeatureISAVersion7_0_1))
+ return {7, 0, 1};
+
+ if (Features.test(FeatureISAVersion7_0_2))
+ return {7, 0, 2};
+
+ if (Features.test(FeatureISAVersion8_0_0))
+ return {8, 0, 0};
+
+ if (Features.test(FeatureISAVersion8_0_1))
+ return {8, 0, 1};
+
+ if (Features.test(FeatureISAVersion8_0_2))
+ return {8, 0, 2};
+
+ if (Features.test(FeatureISAVersion8_0_3))
+ return {8, 0, 3};
+
+ if (Features.test(FeatureISAVersion8_0_4))
+ return {8, 0, 4};
+
+ if (Features.test(FeatureISAVersion8_1_0))
+ return {8, 1, 0};
+
+ return {0, 0, 0}; // none of the tested ISA version features is set
+}
+
+void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, // fills Header with defaults derived from Features
+ const FeatureBitset &Features) {
+
+ IsaVersion ISA = getIsaVersion(Features); // {0, 0, 0} when no ISA version feature is set
+
+ memset(&Header, 0, sizeof(Header)); // zero everything; only the fields assigned below are non-zero
+
+ Header.amd_kernel_code_version_major = 1;
+ Header.amd_kernel_code_version_minor = 0;
+ Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
+ Header.amd_machine_version_major = ISA.Major;
+ Header.amd_machine_version_minor = ISA.Minor;
+ Header.amd_machine_version_stepping = ISA.Stepping;
+ Header.kernel_code_entry_byte_offset = sizeof(Header); // entry offset equals the size of this header struct
+ // wavefront_size is specified as a power of 2: 2^6 = 64 threads.
+ Header.wavefront_size = 6;
+ // These alignment values are specified in powers of two, so alignment =
+ // 2^n. The minimum alignment is 2^4 = 16.
+ Header.kernarg_segment_alignment = 4;
+ Header.group_segment_alignment = 4;
+ Header.private_segment_alignment = 4;
+}
+
+MCSection *getHSATextSection(MCContext &Ctx) { // ".hsatext": PROGBITS with ALLOC, WRITE, EXECINSTR, HSA_AGENT and HSA_CODE flags
+ return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::SHF_EXECINSTR |
+ ELF::SHF_AMDGPU_HSA_AGENT |
+ ELF::SHF_AMDGPU_HSA_CODE);
+}
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) { // ".hsadata_global_agent": ALLOC, WRITE, HSA_GLOBAL and HSA_AGENT flags
+ return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::SHF_AMDGPU_HSA_GLOBAL |
+ ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) { // same as the agent variant but without HSA_AGENT
+ return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::SHF_AMDGPU_HSA_GLOBAL);
+}
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) { // ".hsarodata_readonly_agent": no WRITE flag
+ return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY |
+ ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+bool isGroupSegment(const GlobalValue *GV) { // true when GV's pointer type is in AMDGPUAS::LOCAL_ADDRESS
+ return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}
+
+bool isGlobalSegment(const GlobalValue *GV) { // true when GV's pointer type is in AMDGPUAS::GLOBAL_ADDRESS
+ return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}
+
+bool isReadOnlySegment(const GlobalValue *GV) { // true when GV's pointer type is in AMDGPUAS::CONSTANT_ADDRESS
+ return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+bool shouldEmitConstantsToTextSection(const Triple &TT) { // true for every OS except AMDHSA
+ return TT.getOS() != Triple::AMDHSA;
+}
+
+int getIntegerAttribute(const Function &F, StringRef Name, int Default) { // reads function attribute Name as an int
+ Attribute A = F.getFnAttribute(Name);
+ int Result = Default; // returned unchanged when the attribute is not a string attribute
+
+ if (A.isStringAttribute()) {
+ StringRef Str = A.getValueAsString();
+ if (Str.getAsInteger(0, Result)) { // a true return is treated as a parse failure; radix argument is 0
+ LLVMContext &Ctx = F.getContext();
+ Ctx.emitError("can't parse integer attribute " + Name);
+ }
+ }
+
+ return Result; // NOTE(review): after a parse failure this is presumably still Default - confirm getAsInteger leaves it untouched
+}
+
+std::pair<int, int> getIntegerPairAttribute(const Function &F, // reads attribute Name as two comma-separated ints
+ StringRef Name,
+ std::pair<int, int> Default,
+ bool OnlyFirstRequired) {
+ Attribute A = F.getFnAttribute(Name);
+ if (!A.isStringAttribute())
+ return Default; // not a string attribute: no error is emitted
+
+ LLVMContext &Ctx = F.getContext();
+ std::pair<int, int> Ints = Default;
+ std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(','); // text before / after the comma
+ if (Strs.first.trim().getAsInteger(0, Ints.first)) {
+ Ctx.emitError("can't parse first integer attribute " + Name);
+ return Default;
+ }
+ if (Strs.second.trim().getAsInteger(0, Ints.second)) {
+ if (!OnlyFirstRequired || Strs.second.trim().size()) { // an empty second part is tolerated only when OnlyFirstRequired is set
+ Ctx.emitError("can't parse second integer attribute " + Name);
+ return Default;
+ }
+ }
+
+ return Ints; // NOTE(review): with a tolerated empty second part, Ints.second is presumably still Default.second - confirm
+}
+
+unsigned getWaitcntBitMask(IsaVersion Version) { // Version is not used by the body
+ unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth()); // each mask is already at its field position
+ unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
+ unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
+ return Vmcnt | Expcnt | Lgkmcnt; // union of the three counter fields
+}
+
+unsigned getVmcntBitMask(IsaVersion Version) { // unshifted mask, i.e. the largest value the field can hold; Version is unused
+ return (1 << getVmcntBitWidth()) - 1;
+}
+
+unsigned getExpcntBitMask(IsaVersion Version) { // unshifted mask, i.e. the largest value the field can hold; Version is unused
+ return (1 << getExpcntBitWidth()) - 1;
+}
+
+unsigned getLgkmcntBitMask(IsaVersion Version) { // unshifted mask, i.e. the largest value the field can hold; Version is unused
+ return (1 << getLgkmcntBitWidth()) - 1;
+}
+
+unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) { // extracts the Vmcnt field; Version is unused
+ return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+}
+
+unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) { // extracts the Expcnt field; Version is unused
+ return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+}
+
+unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) { // extracts the Lgkmcnt field; Version is unused
+ return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+}
+
+void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt, // splits Waitcnt into all three counters via the helpers above
+ unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
+ Vmcnt = decodeVmcnt(Version, Waitcnt);
+ Expcnt = decodeExpcnt(Version, Waitcnt);
+ Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
+}
+
+unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) { // returns Waitcnt with its Vmcnt field replaced
+ return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+}
+
+unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) { // returns Waitcnt with its Expcnt field replaced
+ return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+}
+
+unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) { // returns Waitcnt with its Lgkmcnt field replaced
+ return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+}
+
+unsigned encodeWaitcnt(IsaVersion Version, // builds a packed Waitcnt value from the three counters
+ unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
+ unsigned Waitcnt = getWaitcntBitMask(Version); // start with all three counter fields set to all-ones
+ Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt); // then overwrite each field with the requested value
+ Waitcnt = encodeExpcnt(Version, Waitcnt, Expcnt);
+ Waitcnt = encodeLgkmcnt(Version, Waitcnt, Lgkmcnt);
+ return Waitcnt; // bits outside the three fields stay zero
+}
+
+unsigned getInitialPSInputAddr(const Function &F) { // value of the InitialPSInputAddr function attribute, default 0
+ return getIntegerAttribute(F, "InitialPSInputAddr", 0); // the int result is converted to unsigned on return
+}
+
+bool isShader(CallingConv::ID cc) { // true for the AMDGPU_VS, _GS, _PS and _CS calling conventions
+ switch(cc) {
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ return true;
+ default:
+ return false; // every other calling convention
+ }
+}
+
+/// \returns True for AMDGPU_CS and for every calling convention that
+/// isShader() rejects.
+bool isCompute(CallingConv::ID cc) {
+  if (cc == CallingConv::AMDGPU_CS)
+    return true;
+  return !isShader(cc);
+}
+
+/// \returns True if \p STI has the FeatureSouthernIslands feature bit set.
+bool isSI(const MCSubtargetInfo &STI) {
+  const auto &Features = STI.getFeatureBits();
+  return Features[AMDGPU::FeatureSouthernIslands];
+}
+
+/// \returns True if \p STI has the FeatureSeaIslands feature bit set.
+bool isCI(const MCSubtargetInfo &STI) {
+  const auto &Features = STI.getFeatureBits();
+  return Features[AMDGPU::FeatureSeaIslands];
+}
+
+/// \returns True if \p STI has the FeatureVolcanicIslands feature bit set.
+bool isVI(const MCSubtargetInfo &STI) {
+  const auto &Features = STI.getFeatureBits();
+  return Features[AMDGPU::FeatureVolcanicIslands];
+}
+
+/// Maps the FLAT_SCR, FLAT_SCR_LO and FLAT_SCR_HI registers to their _ci or
+/// _vi variants depending on isCI(STI); every other register is returned
+/// unchanged. The FLAT_SCR registers assert that \p STI is not SI.
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+  const bool IsFlatScr = Reg == AMDGPU::FLAT_SCR ||
+                         Reg == AMDGPU::FLAT_SCR_LO ||
+                         Reg == AMDGPU::FLAT_SCR_HI;
+  if (!IsFlatScr)
+    return Reg;
+
+  assert(!isSI(STI));
+  const bool IsCI = isCI(STI);
+
+  if (Reg == AMDGPU::FLAT_SCR)
+    return IsCI ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi;
+  if (Reg == AMDGPU::FLAT_SCR_LO)
+    return IsCI ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi;
+  return IsCI ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi;
+}
+
+/// \returns True if the OperandType of operand \p OpNo of \p Desc lies in the
+/// inclusive range [OPERAND_SRC_FIRST, OPERAND_SRC_LAST].
+bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+  const unsigned Ty = Desc.OpInfo[OpNo].OperandType;
+  if (Ty < AMDGPU::OPERAND_SRC_FIRST)
+    return false;
+  return Ty <= AMDGPU::OPERAND_SRC_LAST;
+}
+
+/// \returns True if the OperandType of operand \p OpNo of \p Desc is one of
+/// the FP32/FP64/FP16 REG_IMM or REG_INLINE_C operand types.
+bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+  const unsigned Ty = Desc.OpInfo[OpNo].OperandType;
+  const bool IsImmFP = Ty == AMDGPU::OPERAND_REG_IMM_FP32 ||
+                       Ty == AMDGPU::OPERAND_REG_IMM_FP64 ||
+                       Ty == AMDGPU::OPERAND_REG_IMM_FP16;
+  const bool IsInlineFP = Ty == AMDGPU::OPERAND_REG_INLINE_C_FP32 ||
+                          Ty == AMDGPU::OPERAND_REG_INLINE_C_FP64 ||
+                          Ty == AMDGPU::OPERAND_REG_INLINE_C_FP16;
+  return IsImmFP || IsInlineFP;
+}
+
+/// \returns True if the OperandType of operand \p OpNo of \p Desc lies in the
+/// inclusive range [OPERAND_REG_INLINE_C_FIRST, OPERAND_REG_INLINE_C_LAST].
+bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+  const unsigned Ty = Desc.OpInfo[OpNo].OperandType;
+  if (Ty < AMDGPU::OPERAND_REG_INLINE_C_FIRST)
+    return false;
+  return Ty <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
+}
+
+// Avoid using MCRegisterClass::getSize, since that function will go away
+// (move from MC* level to Target* level). Return size in bits.
+//
+// Maps the register class ID \p RCID to a fixed width (32, 64, 96, 128, 256 or
+// 512). A class ID that is not listed below reaches llvm_unreachable.
+unsigned getRegBitWidth(unsigned RCID) {
+ switch (RCID) {
+ case AMDGPU::SGPR_32RegClassID:
+ case AMDGPU::VGPR_32RegClassID:
+ case AMDGPU::VS_32RegClassID:
+ case AMDGPU::SReg_32RegClassID:
+ case AMDGPU::SReg_32_XM0RegClassID:
+ return 32;
+ case AMDGPU::SGPR_64RegClassID:
+ case AMDGPU::VS_64RegClassID:
+ case AMDGPU::SReg_64RegClassID:
+ case AMDGPU::VReg_64RegClassID:
+ return 64;
+ case AMDGPU::VReg_96RegClassID:
+ return 96;
+ case AMDGPU::SGPR_128RegClassID:
+ case AMDGPU::SReg_128RegClassID:
+ case AMDGPU::VReg_128RegClassID:
+ return 128;
+ case AMDGPU::SReg_256RegClassID:
+ case AMDGPU::VReg_256RegClassID:
+ return 256;
+ case AMDGPU::SReg_512RegClassID:
+ case AMDGPU::VReg_512RegClassID:
+ return 512;
+ default:
+ llvm_unreachable("Unexpected register class");
+ }
+}
+
+/// Convenience overload: looks up the width by the ID of \p RC.
+unsigned getRegBitWidth(const MCRegisterClass &RC) {
+  const unsigned ClassID = RC.getID();
+  return getRegBitWidth(ClassID);
+}
+
+/// \returns The bit width of the register class of operand \p OpNo of \p Desc,
+/// divided by 8.
+unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
+                           unsigned OpNo) {
+  const unsigned ClassID = Desc.OpInfo[OpNo].RegClass;
+  const unsigned WidthInBits = getRegBitWidth(MRI->getRegClass(ClassID));
+  return WidthInBits / 8;
+}
+
+/// \returns True if \p Literal is an integer in [-16, 64], or if its bit
+/// pattern equals that of one of the doubles 0.0, +-1.0, +-0.5, +-2.0, +-4.0,
+/// or equals 0x3fc45f306dc9c882 while \p HasInv2Pi is set.
+bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
+  if (-16 <= Literal && Literal <= 64)
+    return true;
+
+  const uint64_t Bits = static_cast<uint64_t>(Literal);
+  const double InlineValues[] = {0.0, 1.0, -1.0, 0.5, -0.5,
+                                 2.0, -2.0, 4.0, -4.0};
+  for (double FP : InlineValues) {
+    if (Bits == DoubleToBits(FP))
+      return true;
+  }
+
+  return Bits == 0x3fc45f306dc9c882 && HasInv2Pi;
+}
+
+/// \returns True if \p Literal is an integer in [-16, 64], or if its bit
+/// pattern equals that of one of the floats 0.0, +-1.0, +-0.5, +-2.0, +-4.0,
+/// or equals 0x3e22f983 while \p HasInv2Pi is set.
+bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
+  if (-16 <= Literal && Literal <= 64)
+    return true;
+
+  // The actual type of the operand does not seem to matter as long
+  // as the bits match one of the inline immediate values. For example:
+  //
+  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
+  // so it is a legal inline immediate.
+  //
+  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
+  // floating-point, so it is a legal inline immediate.
+
+  const uint32_t Bits = static_cast<uint32_t>(Literal);
+  const float InlineValues[] = {0.0f, 1.0f, -1.0f, 0.5f, -0.5f,
+                                2.0f, -2.0f, 4.0f, -4.0f};
+  for (float FP : InlineValues) {
+    if (Bits == FloatToBits(FP))
+      return true;
+  }
+
+  return Bits == 0x3e22f983 && HasInv2Pi;
+}
+
+/// \returns True if \p Literal is an integer in [-16, 64] or its 16-bit
+/// pattern is one of the encodings listed below. Asserts \p HasInv2Pi.
+bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
+  assert(HasInv2Pi);
+
+  if (-16 <= Literal && Literal <= 64)
+    return true;
+
+  switch (static_cast<uint16_t>(Literal)) {
+  case 0x3C00: // 1.0
+  case 0xBC00: // -1.0
+  case 0x3800: // 0.5
+  case 0xB800: // -0.5
+  case 0x4000: // 2.0
+  case 0xC000: // -2.0
+  case 0x4400: // 4.0
+  case 0xC400: // -4.0
+  case 0x3118: // 1/2pi
+    return true;
+  default:
+    return false;
+  }
+}
+
+} // End namespace AMDGPU
+} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
new file mode 100644
index 000000000000..ea5fc366d205
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -0,0 +1,216 @@
+//===-- AMDGPUBaseInfo.h - Top level definitions for AMDGPU -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
+
+#include "AMDKernelCodeT.h"
+#include "llvm/IR/CallingConv.h"
+
+#include "SIDefines.h"
+
+#define GET_INSTRINFO_OPERAND_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_OPERAND_ENUM
+
+namespace llvm {
+
+class FeatureBitset;
+class Function;
+class GlobalValue;
+class MCContext;
+class MCInstrDesc;
+class MCRegisterClass;
+class MCRegisterInfo;
+class MCSection;
+class MCSubtargetInfo;
+
+namespace AMDGPU {
+
+LLVM_READONLY
+int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+
+/// Three-component ISA version (major, minor, stepping). This is the type
+/// passed as the \c Version argument to the waitcnt helpers declared below.
+struct IsaVersion {
+ unsigned Major;
+ unsigned Minor;
+ unsigned Stepping;
+};
+
+IsaVersion getIsaVersion(const FeatureBitset &Features);
+void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
+ const FeatureBitset &Features);
+MCSection *getHSATextSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
+
+bool isGroupSegment(const GlobalValue *GV);
+bool isGlobalSegment(const GlobalValue *GV);
+bool isReadOnlySegment(const GlobalValue *GV);
+
+/// \returns True if constants should be emitted to .text section for given
+/// target triple \p TT, false otherwise.
+bool shouldEmitConstantsToTextSection(const Triple &TT);
+
+/// \returns Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+int getIntegerAttribute(const Function &F, StringRef Name, int Default);
+
+/// \returns A pair of integer values requested using \p F's \p Name attribute
+/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
+/// is false).
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if one of the requested values cannot be
+/// converted to integer, or \p OnlyFirstRequired is false and "second" value is
+/// not present.
+std::pair<int, int> getIntegerPairAttribute(const Function &F,
+ StringRef Name,
+ std::pair<int, int> Default,
+ bool OnlyFirstRequired = false);
+
+/// \returns Waitcnt bit mask for given isa \p Version.
+unsigned getWaitcntBitMask(IsaVersion Version);
+
+/// \returns Vmcnt bit mask for given isa \p Version.
+unsigned getVmcntBitMask(IsaVersion Version);
+
+/// \returns Expcnt bit mask for given isa \p Version.
+unsigned getExpcntBitMask(IsaVersion Version);
+
+/// \returns Lgkmcnt bit mask for given isa \p Version.
+unsigned getLgkmcntBitMask(IsaVersion Version);
+
+/// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version.
+unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt);
+
+/// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version.
+unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt);
+
+/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
+unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt);
+
+/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
+/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
+/// \p Lgkmcnt respectively.
+///
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
+/// \p Vmcnt = \p Waitcnt[3:0]
+/// \p Expcnt = \p Waitcnt[6:4]
+/// \p Lgkmcnt = \p Waitcnt[11:8]
+void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+ unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
+
+/// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
+unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt);
+
+/// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version.
+unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt);
+
+/// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version.
+unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt);
+
+/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
+/// \p Version.
+///
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
+/// Waitcnt[3:0] = \p Vmcnt
+/// Waitcnt[6:4] = \p Expcnt
+/// Waitcnt[11:8] = \p Lgkmcnt
+///
+/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
+/// isa \p Version.
+unsigned encodeWaitcnt(IsaVersion Version,
+ unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
+
+unsigned getInitialPSInputAddr(const Function &F);
+
+bool isShader(CallingConv::ID cc);
+bool isCompute(CallingConv::ID cc);
+
+bool isSI(const MCSubtargetInfo &STI);
+bool isCI(const MCSubtargetInfo &STI);
+bool isVI(const MCSubtargetInfo &STI);
+
+/// If \p Reg is a pseudo reg, return the correct hardware register given
+/// \p STI otherwise return \p Reg.
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
+
+/// \brief Can this operand also contain immediate values?
+bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
+
+/// \brief Is this a floating-point operand?
+bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);
+
+/// \brief Does this operand support only inlinable literals?
+bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
+
+/// \brief Get the size in bits of a register from register class ID \p RCID.
+unsigned getRegBitWidth(unsigned RCID);
+
+/// \brief Get the size in bits of a register from the register class \p RC.
+unsigned getRegBitWidth(const MCRegisterClass &RC);
+
+/// \brief Get size of register operand
+unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
+ unsigned OpNo);
+
+/// \returns 2, 4 or 8 depending on whether \p OpInfo's OperandType is one of
+/// the 16-, 32- or 64-bit REG_IMM / REG_INLINE_C operand types (presumably the
+/// operand size in bytes). Any other operand type is unreachable.
+LLVM_READNONE
+inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
+  switch (OpInfo.OperandType) {
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    return 2;
+
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    return 4;
+
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+    return 8;
+
+  default:
+    llvm_unreachable("unhandled operand type");
+  }
+}
+
+/// Convenience overload: size of operand number \p OpNo of \p Desc.
+LLVM_READNONE
+inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
+  const MCOperandInfo &Info = Desc.OpInfo[OpNo];
+  return getOperandSize(Info);
+}
+
+/// \brief Is this literal inlinable
+LLVM_READNONE
+bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
new file mode 100644
index 000000000000..c55eaab077d1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -0,0 +1,152 @@
+//===--------------------- AMDKernelCodeTInfo.h ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file - specifies tables for amd_kernel_code_t structure parsing/printing
+//
+//===----------------------------------------------------------------------===//
+
+// QNAME(name): qualified name of an amd_kernel_code_t member.
+// FLD_T(name): the two template arguments "member type, pointer to member"
+// consumed by printField/parseField and printBitField/parseBitField.
+#define QNAME(name) amd_kernel_code_t::name
+#define FLD_T(name) decltype(QNAME(name)), &QNAME(name)
+
+// FIELD2(sname, aname, name): RECORD for struct member `name`, passing `sname`
+// and `aname` as the first two RECORD arguments, with the generic
+// printField/parseField handlers.
+#define FIELD2(sname, aname, name) \
+ RECORD(sname, aname, printField<FLD_T(name)>, parseField<FLD_T(name)>)
+
+// FIELD(name): FIELD2 with the same identifier used for all three arguments.
+#define FIELD(name) FIELD2(name, name, name)
+
+
+// PRINTCODEPROP(name): printBitField instantiation for the code_properties
+// member, using AMD_CODE_PROPERTY_<name>_SHIFT and _WIDTH.
+#define PRINTCODEPROP(name) \
+ printBitField<FLD_T(code_properties),\
+ AMD_CODE_PROPERTY_##name##_SHIFT,\
+ AMD_CODE_PROPERTY_##name##_WIDTH>
+
+// PARSECODEPROP(name): parseBitField counterpart of PRINTCODEPROP.
+#define PARSECODEPROP(name) \
+ parseBitField<FLD_T(code_properties),\
+ AMD_CODE_PROPERTY_##name##_SHIFT,\
+ AMD_CODE_PROPERTY_##name##_WIDTH>
+
+// CODEPROP(name, shift): RECORD for a code_properties bit field. Note that the
+// second parameter, although called `shift`, is the AMD_CODE_PROPERTY_* name
+// infix that gets forwarded to PRINTCODEPROP/PARSECODEPROP.
+#define CODEPROP(name, shift) \
+ RECORD(name, name, PRINTCODEPROP(shift), PARSECODEPROP(shift))
+
+// have to define these lambdas because of Set/GetMacro
+//
+// PRINTCOMP(GetMacro, Shift): lambda that prints "<Name> = <value>", where the
+// value is GetMacro applied to compute_pgm_resource_registers shifted right by
+// Shift, cast to int.
+#define PRINTCOMP(GetMacro, Shift) \
+[](StringRef Name, const amd_kernel_code_t &C, raw_ostream &OS) { \
+ printName(OS, Name) << \
+ (int)GetMacro(C.compute_pgm_resource_registers >> Shift); \
+}
+// PARSECOMP(SetMacro, Shift): lambda that parses "= <abs expr>" and stores the
+// value into the compute_pgm_resource_registers field selected by SetMacro and
+// Shift. The field is cleared first so that a previously set value (a default
+// or an earlier occurrence of the same directive) is replaced rather than
+// OR-ed with the new one. Returns false if the expression cannot be parsed.
+#define PARSECOMP(SetMacro, Shift) \
+[](amd_kernel_code_t &C, MCAsmParser &MCParser, raw_ostream &Err) { \
+  int64_t Value = 0; \
+  if (!expectAbsExpression(MCParser, Value, Err)) \
+    return false; \
+  C.compute_pgm_resource_registers &= ~(SetMacro(0xFFFFFFFFFFFFFFFFu) << Shift); \
+  C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \
+  return true; \
+}
+
+// COMPPGM: RECORD for a compute_pgm_resource_registers field, built from the
+// PRINTCOMP/PARSECOMP lambdas for the given accessor macros and shift.
+#define COMPPGM(name, aname, GetMacro, SetMacro, Shift) \
+ RECORD(name, aname, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift))
+
+// COMPPGM1: field accessed through the G_/S_00B848_* macros, at shift 0.
+#define COMPPGM1(name, aname, AccMacro) \
+ COMPPGM(name, aname, G_00B848_##AccMacro, S_00B848_##AccMacro, 0)
+
+// COMPPGM2: field accessed through the G_/S_00B84C_* macros, at shift 32.
+#define COMPPGM2(name, aname, AccMacro) \
+ COMPPGM(name, aname, G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
+
+///////////////////////////////////////////////////////////////////////////////
+// Beginning of the table.
+// Define RECORD(name, altName, print, parse) in your code to get field
+// definitions, then include this file.
+
+// Plain struct members (name, alternative name, amd_kernel_code_t member).
+FIELD2(amd_code_version_major, kernel_code_version_major, amd_kernel_code_version_major),
+FIELD2(amd_code_version_minor, kernel_code_version_minor, amd_kernel_code_version_minor),
+FIELD2(amd_machine_kind, machine_kind, amd_machine_kind),
+FIELD2(amd_machine_version_major, machine_version_major, amd_machine_version_major),
+FIELD2(amd_machine_version_minor, machine_version_minor, amd_machine_version_minor),
+FIELD2(amd_machine_version_stepping, machine_version_stepping, amd_machine_version_stepping),
+
+FIELD(kernel_code_entry_byte_offset),
+FIELD(kernel_code_prefetch_byte_size),
+FIELD(max_scratch_backing_memory_byte_size),
+
+// Fields packed into compute_pgm_resource_registers (COMPPGM1: shift 0).
+COMPPGM1(granulated_workitem_vgpr_count, compute_pgm_rsrc1_vgprs, VGPRS),
+COMPPGM1(granulated_wavefront_sgpr_count, compute_pgm_rsrc1_sgprs, SGPRS),
+COMPPGM1(priority, compute_pgm_rsrc1_priority, PRIORITY),
+COMPPGM1(float_mode, compute_pgm_rsrc1_float_mode, FLOAT_MODE), // TODO: split float_mode
+COMPPGM1(priv, compute_pgm_rsrc1_priv, PRIV),
+COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP),
+COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE),
+COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE),
+// TODO: bulky
+// TODO: cdbg_user
+// Fields packed into compute_pgm_resource_registers (COMPPGM2: shift 32).
+COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),
+COMPPGM2(user_sgpr_count, compute_pgm_rsrc2_user_sgpr, USER_SGPR),
+// TODO: enable_trap_handler
+COMPPGM2(enable_sgpr_workgroup_id_x, compute_pgm_rsrc2_tgid_x_en, TGID_X_EN),
+COMPPGM2(enable_sgpr_workgroup_id_y, compute_pgm_rsrc2_tgid_y_en, TGID_Y_EN),
+COMPPGM2(enable_sgpr_workgroup_id_z, compute_pgm_rsrc2_tgid_z_en, TGID_Z_EN),
+COMPPGM2(enable_sgpr_workgroup_info, compute_pgm_rsrc2_tg_size_en, TG_SIZE_EN),
+COMPPGM2(enable_vgpr_workitem_id, compute_pgm_rsrc2_tidig_comp_cnt, TIDIG_COMP_CNT),
+COMPPGM2(enable_exception_msb, compute_pgm_rsrc2_excp_en_msb, EXCP_EN_MSB), // TODO: split enable_exception_msb
+COMPPGM2(granulated_lds_size, compute_pgm_rsrc2_lds_size, LDS_SIZE),
+COMPPGM2(enable_exception, compute_pgm_rsrc2_excp_en, EXCP_EN), // TODO: split enable_exception
+
+// Bit fields of the code_properties member.
+CODEPROP(enable_sgpr_private_segment_buffer, ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER),
+CODEPROP(enable_sgpr_dispatch_ptr, ENABLE_SGPR_DISPATCH_PTR),
+CODEPROP(enable_sgpr_queue_ptr, ENABLE_SGPR_QUEUE_PTR),
+CODEPROP(enable_sgpr_kernarg_segment_ptr, ENABLE_SGPR_KERNARG_SEGMENT_PTR),
+CODEPROP(enable_sgpr_dispatch_id, ENABLE_SGPR_DISPATCH_ID),
+CODEPROP(enable_sgpr_flat_scratch_init, ENABLE_SGPR_FLAT_SCRATCH_INIT),
+CODEPROP(enable_sgpr_private_segment_size, ENABLE_SGPR_PRIVATE_SEGMENT_SIZE),
+CODEPROP(enable_sgpr_grid_workgroup_count_x, ENABLE_SGPR_GRID_WORKGROUP_COUNT_X),
+CODEPROP(enable_sgpr_grid_workgroup_count_y, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y),
+CODEPROP(enable_sgpr_grid_workgroup_count_z, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z),
+CODEPROP(enable_ordered_append_gds, ENABLE_ORDERED_APPEND_GDS),
+CODEPROP(private_element_size, PRIVATE_ELEMENT_SIZE),
+CODEPROP(is_ptr64, IS_PTR64),
+CODEPROP(is_dynamic_callstack, IS_DYNAMIC_CALLSTACK),
+// NOTE(review): the two names below say "enabled" while the property macros
+// say "SUPPORTED" - presumably intentional; confirm before renaming either.
+CODEPROP(is_debug_enabled, IS_DEBUG_SUPPORTED),
+CODEPROP(is_xnack_enabled, IS_XNACK_SUPPORTED),
+
+// Remaining plain struct members.
+FIELD(workitem_private_segment_byte_size),
+FIELD(workgroup_group_segment_byte_size),
+FIELD(gds_segment_byte_size),
+FIELD(kernarg_segment_byte_size),
+FIELD(workgroup_fbarrier_count),
+FIELD(wavefront_sgpr_count),
+FIELD(workitem_vgpr_count),
+FIELD(reserved_vgpr_first),
+FIELD(reserved_vgpr_count),
+FIELD(reserved_sgpr_first),
+FIELD(reserved_sgpr_count),
+FIELD(debug_wavefront_private_segment_offset_sgpr),
+FIELD(debug_private_segment_buffer_sgpr),
+FIELD(kernarg_segment_alignment),
+FIELD(group_segment_alignment),
+FIELD(private_segment_alignment),
+FIELD(wavefront_size),
+FIELD(call_convention),
+FIELD(runtime_loader_kernel_symbol)
+// TODO: control_directive
+
+// end of the table
+///////////////////////////////////////////////////////////////////////////////
+
+// Remove every helper macro defined at the top of this file so that nothing
+// leaks into the including translation unit between repeated inclusions.
+#undef QNAME
+#undef FLD_T
+#undef FIELD2
+#undef FIELD
+#undef PRINTCODEPROP
+#undef PARSECODEPROP
+#undef CODEPROP
+#undef PRINTCOMP
+#undef PARSECOMP
+#undef COMPPGM
+#undef COMPPGM1
+#undef COMPPGM2
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
new file mode 100644
index 000000000000..0333b0a14d29
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -0,0 +1,181 @@
+//===--------------------AMDKernelCodeTUtils.cpp --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file - utility functions to parse/print amd_kernel_code_t structure
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDKernelCodeTUtils.h"
+#include "SIDefines.h"
+#include <llvm/MC/MCParser/MCAsmLexer.h>
+#include <llvm/MC/MCParser/MCAsmParser.h>
+#include <llvm/Support/raw_ostream.h>
+
+using namespace llvm;
+
+// Returns the table of field names, held in a function-local static. Entry 0
+// is an empty placeholder; the rest are the stringized `name` arguments of
+// every RECORD produced by AMDKernelCodeTInfo.h, in table order.
+static ArrayRef<StringRef> get_amd_kernel_code_t_FldNames() {
+ static StringRef const Table[] = {
+ "", // not found placeholder
+#define RECORD(name, altName, print, parse) #name
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+// Returns the table of alternative field names, held in a function-local
+// static. Entry 0 is an empty placeholder; the rest are the stringized
+// `altName` arguments of every RECORD produced by AMDKernelCodeTInfo.h.
+static ArrayRef<StringRef> get_amd_kernel_code_t_FldAltNames() {
+ static StringRef const Table[] = {
+ "", // not found placeholder
+#define RECORD(name, altName, print, parse) #altName
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+// Builds a map from every entry of \p names and \p altNames to its position in
+// those (equally sized) arrays, so a field can be looked up by either name.
+static StringMap<int> createIndexMap(const ArrayRef<StringRef> &names,
+                                     const ArrayRef<StringRef> &altNames) {
+  StringMap<int> IndexByName;
+  assert(names.size() == altNames.size());
+  const unsigned Count = names.size();
+  for (unsigned Idx = 0; Idx != Count; ++Idx) {
+    IndexByName.insert(std::make_pair(names[Idx], Idx));
+    IndexByName.insert(std::make_pair(altNames[Idx], Idx));
+  }
+  return IndexByName;
+}
+
+// Translates a field name (primary or alternative) into its zero-based table
+// index. The lookup map is built once on first use.
+static int get_amd_kernel_code_t_FieldIndex(StringRef name) {
+  static const auto IndexMap = createIndexMap(
+      get_amd_kernel_code_t_FldNames(), get_amd_kernel_code_t_FldAltNames());
+  const int Slot = IndexMap.lookup(name);
+  return Slot - 1; // returns -1 if not found
+}
+
+// Returns the primary name of the field with zero-based index \p index; the
+// +1 skips the placeholder stored at the front of the name table.
+static StringRef get_amd_kernel_code_t_FieldName(int index) {
+  const ArrayRef<StringRef> Names = get_amd_kernel_code_t_FldNames();
+  return Names[index + 1];
+}
+
+
+// Field printing
+
+// Writes "<Name> = " to \p OS and returns the stream so the caller can append
+// the value.
+static raw_ostream &printName(raw_ostream &OS, StringRef Name) {
+  OS << Name;
+  OS << " = ";
+  return OS;
+}
+
+// Prints "<Name> = <value>" for the amd_kernel_code_t member selected by
+// \p ptr. The value is widened to int64_t before printing: this keeps narrow
+// integer members from being streamed as characters and, unlike a cast to
+// int, does not truncate the 64-bit members of the structure.
+template <typename T, T amd_kernel_code_t::*ptr>
+static void printField(StringRef Name, const amd_kernel_code_t &C,
+                       raw_ostream &OS) {
+  printName(OS, Name) << static_cast<int64_t>(C.*ptr);
+}
+
+// Prints "<Name> = <value>" for the \p width bits starting at bit \p shift of
+// the amd_kernel_code_t member selected by \p ptr.
+template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
+static void printBitField(StringRef Name, const amd_kernel_code_t &c,
+                          raw_ostream &OS) {
+  const auto FieldMask = (static_cast<T>(1) << width) - 1;
+  const auto FieldValue = (c.*ptr >> shift) & FieldMask;
+  printName(OS, Name) << (int)FieldValue;
+}
+
+// Signature shared by all field printers: (field name, structure, stream).
+typedef void(*PrintFx)(StringRef,
+ const amd_kernel_code_t &,
+ raw_ostream &);
+
+// Returns the table of printer functions, one per RECORD of
+// AMDKernelCodeTInfo.h in table order. Unlike the name tables there is no
+// placeholder entry at index 0.
+static ArrayRef<PrintFx> getPrinterTable() {
+ static const PrintFx Table[] = {
+#define RECORD(name, altName, print, parse) print
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+// Prints the field with zero-based index \p FldIndex of \p C to \p OS using
+// the matching entry of the printer table; a null entry prints nothing.
+void llvm::printAmdKernelCodeField(const amd_kernel_code_t &C,
+                                   int FldIndex,
+                                   raw_ostream &OS) {
+  const PrintFx Printer = getPrinterTable()[FldIndex];
+  if (!Printer)
+    return;
+  Printer(get_amd_kernel_code_t_FieldName(FldIndex), C, OS);
+}
+
+// Prints every field of \p C to \p OS, one per line, each line prefixed with
+// \p tab.
+void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C,
+                             raw_ostream &OS,
+                             const char *tab) {
+  for (int Idx = 0, End = getPrinterTable().size(); Idx < End; ++Idx) {
+    OS << tab;
+    printAmdKernelCodeField(*C, Idx, OS);
+    OS << '\n';
+  }
+}
+
+
+// Field parsing
+
+// Consumes "= <absolute expression>" from \p MCParser and stores the result
+// in \p Value. On failure a message is written to \p Err and false is
+// returned.
+static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostream& Err) {
+  MCAsmLexer &Lexer = MCParser.getLexer();
+
+  if (Lexer.isNot(AsmToken::Equal)) {
+    Err << "expected '='";
+    return false;
+  }
+  Lexer.Lex();
+
+  const bool Failed = MCParser.parseAbsoluteExpression(Value);
+  if (Failed) {
+    Err << "integer absolute expression expected";
+    return false;
+  }
+  return true;
+}
+
+// Parses "= <absolute expression>" and stores the value, converted to T, into
+// the amd_kernel_code_t member selected by \p ptr. Returns false (leaving the
+// member untouched) when the expression cannot be parsed.
+template <typename T, T amd_kernel_code_t::*ptr>
+static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+                       raw_ostream &Err) {
+  int64_t Parsed = 0;
+  const bool Ok = expectAbsExpression(MCParser, Parsed, Err);
+  if (Ok)
+    C.*ptr = static_cast<T>(Parsed);
+  return Ok;
+}
+
+// Parses "= <absolute expression>" and stores its low \p width bits into the
+// bit field starting at bit \p shift of the amd_kernel_code_t member selected
+// by \p ptr. The field is cleared before the new value is OR-ed in. Returns
+// false (leaving the member untouched) when the expression cannot be parsed.
+template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
+static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+                          raw_ostream &Err) {
+  int64_t Value = 0;
+  if (!expectAbsExpression(MCParser, Value, Err))
+    return false;
+  const uint64_t Mask = ((UINT64_C(1) << width) - 1) << shift;
+  // Shift as unsigned: left-shifting a negative int64_t is undefined behavior.
+  const uint64_t Bits = (static_cast<uint64_t>(Value) << shift) & Mask;
+  C.*ptr &= (T)~Mask;
+  C.*ptr |= (T)Bits;
+  return true;
+}
+
+// Signature shared by all field parsers: (structure to update, parser, error
+// stream); the bool result reports success.
+typedef bool(*ParseFx)(amd_kernel_code_t &,
+ MCAsmParser &MCParser,
+ raw_ostream &Err);
+
+// Returns the table of parser functions, one per RECORD of
+// AMDKernelCodeTInfo.h in table order. Unlike the name tables there is no
+// placeholder entry at index 0.
+static ArrayRef<ParseFx> getParserTable() {
+ static const ParseFx Table[] = {
+#define RECORD(name, altName, print, parse) parse
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+// Parses the value of the amd_kernel_code_t field named \p ID (primary or
+// alternative name) from \p MCParser into \p C. Returns false and writes a
+// message to \p Err for an unknown field name; otherwise returns the result of
+// the field's parser, or false when the table entry is null.
+bool llvm::parseAmdKernelCodeField(StringRef ID,
+                                   MCAsmParser &MCParser,
+                                   amd_kernel_code_t &C,
+                                   raw_ostream &Err) {
+  const int Idx = get_amd_kernel_code_t_FieldIndex(ID);
+  if (Idx < 0) {
+    Err << "unexpected amd_kernel_code_t field name " << ID;
+    return false;
+  }
+  if (const ParseFx Parser = getParserTable()[Idx])
+    return Parser(C, MCParser, Err);
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
new file mode 100644
index 000000000000..d9edca7a82ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -0,0 +1,39 @@
+//===- AMDKernelCodeTUtils.h - helpers for amd_kernel_code_t ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file AMDKernelCodeTUtils.h
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDKERNELCODETUTILS_H
+#define AMDKERNELCODETUTILS_H
+
+#include "AMDKernelCodeT.h"
+
+namespace llvm {
+
+class MCAsmLexer;
+class MCAsmParser;
+class raw_ostream;
+class StringRef;
+
+void printAmdKernelCodeField(const amd_kernel_code_t &C,
+ int FldIndex,
+ raw_ostream &OS);
+
+void dumpAmdKernelCode(const amd_kernel_code_t *C,
+ raw_ostream &OS,
+ const char *tab);
+
+bool parseAmdKernelCodeField(StringRef ID,
+ MCAsmParser &Parser,
+ amd_kernel_code_t &C,
+ raw_ostream &Err);
+
+}
+
+#endif // AMDKERNELCODETUTILS_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td
new file mode 100644
index 000000000000..1fd1c1e21527
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td
@@ -0,0 +1,20 @@
+//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// VI Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+// EXPe variant for VI: sets bits 31-26 of the instruction word to 0x31.
+class EXPe_vi : EXPe {
+ let Inst{31-26} = 0x31; //encoding
+}
+
+// VINTRPe variant for VI: forwards the 2-bit \p op to VINTRPe and sets bits
+// 31-26 of the instruction word to 0x35.
+class VINTRPe_vi <bits<2> op> : VINTRPe <op> {
+ let Inst{31-26} = 0x35; // encoding
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
new file mode 100644
index 000000000000..b45c8fc9c7d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
@@ -0,0 +1,14 @@
+//===-- VIInstructions.td - VI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for VI and newer.
+//===----------------------------------------------------------------------===//
+
+FIXME: Deleting this file broke buildbots that don't do full rebuilds. This
+file is no longer used by the backend, so it can be deleted once all
+the buildbots update their dependencies.
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
new file mode 100644
index 000000000000..bff706cdc1dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -0,0 +1,621 @@
+//===-- VOP1Instructions.td - Vector Instruction Definitions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP1 Classes
+//===----------------------------------------------------------------------===//
+
+class VOP1e <bits<8> op, VOPProfile P> : Enc32 { // 32-bit VOP1 encoding
+ bits<8> vdst;
+ bits<9> src0;
+
+ let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0); // src0, or 0 when the profile has no src0
+ let Inst{16-9} = op; // 8-bit VOP1 opcode
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); // vdst, or 0 when P.EmitDst is unset
+ let Inst{31-25} = 0x3f; // encoding: fixed value of bits 31-25
+}
+
+class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { // VOP1 fields layered on VOP_SDWAe
+ bits<8> vdst;
+
+ let Inst{8-0} = 0xf9; // sdwa: fixed marker in the bits where VOP1e places src0
+ let Inst{16-9} = op; // 8-bit VOP1 opcode
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); // vdst, or 0 when P.EmitDst is unset
+ let Inst{31-25} = 0x3f; // encoding: same fixed value as in VOP1e
+}
+
+class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : // pseudo for the 32-bit VOP1 form
+ InstSI <P.Outs32, P.Ins32, "", pattern>, // asm string left empty; text is kept in Mnemonic/AsmOperands
+ VOP <opName>,
+ SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>, // registered as "<opName>_e32" under family NONE
+ MnemonicAlias<opName#"_e32", opName> { // "<opName>_e32" is aliased to the plain mnemonic
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName; // read back by VOP1_Real when it builds its asm string
+ string AsmOperands = P.Asm32; // read back by VOP1_Real when it builds its asm string
+
+ let Size = 4; // NOTE(review): presumably bytes, matching the Enc32 encodings -- confirm
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SubtargetPredicate = isGCN;
+
+ let VOP1 = 1;
+ let VALU = 1;
+ let Uses = [EXEC]; // default implicit use; enclosing let-scopes may replace it
+
+ let AsmVariantName = AMDGPUAsmVariants.Default;
+
+ VOPProfile Pfl = P; // kept so the VOP1_Real_* multiclasses can recover the profile via .Pfl
+}
+
+// Real (encodable) counterpart of a VOP1_Pseudo: the operand lists, the asm
+// string and the flags listed below are copied from `ps`; EncodingFamily is
+// forwarded to SIMCInstr together with ps.PseudoInstr.
+class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
+class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : // SDWA pseudo for a VOP1 op
+ VOP_SDWA_Pseudo <OpName, P, pattern> {
+ let AsmMatchConverter = "cvtSdwaVOP1"; // VOP1-specific converter name; VOP2 uses "cvtSdwaVOP2"
+}
+
+class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { // pattern list used by the _e64 pseudo
+ list<dag> ret = !if(P.HasModifiers, // with modifiers: src0 is matched through VOP3Mods0
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]); // without modifiers: plain one-operand match
+}
+
+multiclass VOP1Inst <string opName, VOPProfile P, // defines the _e32, _e64 and _sdwa pseudos of one VOP1 op
+ SDPatternOperator node = null_frag> { // node defaults to null_frag when no pattern operator is given
+ def _e32 : VOP1_Pseudo <opName, P>; // 32-bit form; no pattern attached
+ def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; // VOP3 form; the only one given a pattern
+ def _sdwa : VOP1_SDWA_Pseudo <opName, P>; // SDWA form; no pattern attached
+}
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+let VOPAsmPrefer32Bit = 1 in {
+defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
+} // End VOPAsmPrefer32Bit = 1
+
+let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
+} // End isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1
+
+// FIXME: Specify SchedRW for READFIRSTLANE_B32
+// TODO: Make a profile for this; there is also a VOP3 encoding.
+def V_READFIRSTLANE_B32 : // defined directly rather than through VOP1Inst (see TODO above)
+ InstSI <(outs SReg_32:$vdst), // result is written to an SReg_32 register
+ (ins VGPR_32:$src0), // source is restricted to VGPR_32
+ "v_readfirstlane_b32 $vdst, $src0",
+ [(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>,
+ Enc32 {
+
+ let isCodeGenOnly = 0;
+ let UseNamedOperandTable = 1;
+
+ let Size = 4;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SubtargetPredicate = isGCN;
+
+ let VOP1 = 1;
+ let VALU = 1;
+ let Uses = [EXEC];
+ let isConvergent = 1;
+
+ bits<8> vdst;
+ bits<9> src0;
+
+ let Inst{8-0} = src0; // same field positions as VOP1e, written out by hand
+ let Inst{16-9} = 0x2; // opcode; 0x2 is the value skipped by the VOP1_Real_si/_vi opcode lists
+ let Inst{24-17} = vdst;
+ let Inst{31-25} = 0x3f; // encoding: same fixed value as in VOP1e
+}
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
+defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>;
+defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>;
+defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
+defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
+defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
+defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
+defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
+defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
+defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
+defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>;
+defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
+defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
+defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>;
+defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>;
+defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>;
+defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>;
+defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
+defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>;
+} // End SchedRW = [WriteQuarterRate32]
+
+defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
+defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
+defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
+defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
+defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
+defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
+defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
+defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>;
+defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
+} // End SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
+defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
+} // End SchedRW = [WriteDouble]
+
+defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
+
+let SchedRW = [WriteDouble] in {
+defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
+} // End SchedRW = [WriteDouble]
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
+defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
+} // End SchedRW = [WriteQuarterRate32]
+
+defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
+defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
+defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
+
+let SchedRW = [WriteDoubleAdd] in {
+defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
+defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
+} // End SchedRW = [WriteDoubleAdd]
+
+defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
+defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
+
+let VOPAsmPrefer32Bit = 1 in {
+defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
+} // End VOPAsmPrefer32Bit = 1
+
+// i32 -> i32 profile whose src0 is restricted to VRegSrc_32 in both the 32-bit and 64-bit forms.
+def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
+ let Src0RC32 = VRegSrc_32; // src0 operand class for the 32-bit form
+ let Src0RC64 = VRegSrc_32; // src0 operand class for the 64-bit form
+
+ let HasExt = 0; // NOTE(review): presumably turns off the SDWA/DPP variants -- confirm in VOPProfile
+}
+
+// Special case because there are no true output operands. Hack vdst
+// to be a src operand. The custom inserter must add a tied implicit
+// def and use of the super register since there seems to be no way to
+// add an implicit def of a virtual register in tablegen.
+def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
+ let Src0RC32 = VOPDstOperand<VGPR_32>; // reused below as the class of the $vdst input
+ let Src0RC64 = VOPDstOperand<VGPR_32>; // reused below as the class of the $vdst input
+
+ let Outs = (outs); // no output operands; $vdst appears in every ins list instead
+ let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0);
+ let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
+ let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsSDWA = (ins Src0RC32:$vdst, Int32InputMods:$src0_modifiers, VCSrc_b32:$src0,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel);
+
+ let Asm32 = getAsm32<1, 1>.ret; // NOTE(review): arguments look like (has dst, source count) -- confirm against getAsm32
+ let Asm64 = getAsm64<1, 1, 0>.ret;
+ let AsmDPP = getAsmDPP<1, 1, 0>.ret;
+ let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
+
+ let HasExt = 0;
+ let HasDst = 0; // the profile reports no destination ...
+ let EmitDst = 1; // ... but force vdst emission: VOP1e encodes vdst whenever P.EmitDst is set
+}
+
+let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in {
+// v_movreld_b32 is a special case because the destination output
+// register is really a source. It isn't actually read (but may be
+// written), and is only to provide the base register to start
+// indexing from. Tablegen seems to not let you define an implicit
+// virtual register output for the super register being written into,
+// so this must have an implicit def of the register added to it.
+defm V_MOVRELD_B32 : VOP1Inst <"v_movreld_b32", VOP_MOVRELD>;
+defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
+defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
+} // End SubtargetPredicate = HasMovrel, Uses = [M0, EXEC]
+
+// These instructions only exist on SI and CI.
+let SubtargetPredicate = isSICI in {
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
+defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
+defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
+defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
+defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
+defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
+} // End SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>;
+defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
+} // End SchedRW = [WriteDouble]
+
+} // End SubtargetPredicate = isSICI
+
+
+let SubtargetPredicate = isCIVI in {
+
+let SchedRW = [WriteDoubleAdd] in {
+defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>;
+defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>;
+defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>;
+defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>;
+} // End SchedRW = [WriteDoubleAdd]
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>;
+defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
+} // End SchedRW = [WriteQuarterRate32]
+
+} // End SubtargetPredicate = isCIVI
+
+
+let SubtargetPredicate = isVI in {
+
+defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
+defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
+defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
+defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
+defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
+defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
+defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
+defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
+defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
+defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
+defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
+defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
+defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
+defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
+defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
+defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
+defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
+defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+
+} // End SubtargetPredicate = isVI
+
+let Predicates = [isVI] in {
+
+def : Pat< // f16_to_fp from an i16 source selects the _e32 form of V_CVT_F32_F16
+ (f32 (f16_to_fp i16:$src)),
+ (V_CVT_F32_F16_e32 $src)
+>;
+
+def : Pat< // fp_to_f16 producing an i16 selects the _e32 form of V_CVT_F16_F32
+ (i16 (fp_to_f16 f32:$src)),
+ (V_CVT_F16_F32_e32 $src)
+>;
+
+} // End Predicates = [isVI]
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+multiclass VOP1_Real_si <bits<9> op> { // real _e32/_e64 encodings for the pseudos NAME_e32 / NAME_e64
+ let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+ def _e32_si :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; // VOP1e takes an 8-bit opcode: low 8 bits of op
+ def _e64_si :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; // VOP3 opcode: bits {1, 1} followed by op{6-0}
+ }
+}
+
+defm V_NOP : VOP1_Real_si <0x0>;
+defm V_MOV_B32 : VOP1_Real_si <0x1>;
+defm V_CVT_I32_F64 : VOP1_Real_si <0x3>;
+defm V_CVT_F64_I32 : VOP1_Real_si <0x4>;
+defm V_CVT_F32_I32 : VOP1_Real_si <0x5>;
+defm V_CVT_F32_U32 : VOP1_Real_si <0x6>;
+defm V_CVT_U32_F32 : VOP1_Real_si <0x7>;
+defm V_CVT_I32_F32 : VOP1_Real_si <0x8>;
+defm V_MOV_FED_B32 : VOP1_Real_si <0x9>;
+defm V_CVT_F16_F32 : VOP1_Real_si <0xa>;
+defm V_CVT_F32_F16 : VOP1_Real_si <0xb>;
+defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>;
+defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>;
+defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>;
+defm V_CVT_F32_F64 : VOP1_Real_si <0xf>;
+defm V_CVT_F64_F32 : VOP1_Real_si <0x10>;
+defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>;
+defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>;
+defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>;
+defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>;
+defm V_CVT_U32_F64 : VOP1_Real_si <0x15>;
+defm V_CVT_F64_U32 : VOP1_Real_si <0x16>;
+defm V_FRACT_F32 : VOP1_Real_si <0x20>;
+defm V_TRUNC_F32 : VOP1_Real_si <0x21>;
+defm V_CEIL_F32 : VOP1_Real_si <0x22>;
+defm V_RNDNE_F32 : VOP1_Real_si <0x23>;
+defm V_FLOOR_F32 : VOP1_Real_si <0x24>;
+defm V_EXP_F32 : VOP1_Real_si <0x25>;
+defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>;
+defm V_LOG_F32 : VOP1_Real_si <0x27>;
+defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>;
+defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>;
+defm V_RCP_F32 : VOP1_Real_si <0x2a>;
+defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>;
+defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>;
+defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>;
+defm V_RSQ_F32 : VOP1_Real_si <0x2e>;
+defm V_RCP_F64 : VOP1_Real_si <0x2f>;
+defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>;
+defm V_RSQ_F64 : VOP1_Real_si <0x31>;
+defm V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>;
+defm V_SQRT_F32 : VOP1_Real_si <0x33>;
+defm V_SQRT_F64 : VOP1_Real_si <0x34>;
+defm V_SIN_F32 : VOP1_Real_si <0x35>;
+defm V_COS_F32 : VOP1_Real_si <0x36>;
+defm V_NOT_B32 : VOP1_Real_si <0x37>;
+defm V_BFREV_B32 : VOP1_Real_si <0x38>;
+defm V_FFBH_U32 : VOP1_Real_si <0x39>;
+defm V_FFBL_B32 : VOP1_Real_si <0x3a>;
+defm V_FFBH_I32 : VOP1_Real_si <0x3b>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>;
+defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>;
+defm V_FRACT_F64 : VOP1_Real_si <0x3e>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>;
+defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>;
+defm V_CLREXCP : VOP1_Real_si <0x41>;
+defm V_MOVRELD_B32 : VOP1_Real_si <0x42>;
+defm V_MOVRELS_B32 : VOP1_Real_si <0x43>;
+defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>;
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+multiclass VOP1_Real_ci <bits<9> op> { // like VOP1_Real_si, but predicated on isCIOnly with "_ci" suffixes
+ let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in {
+ def _e32_ci :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, // note: still uses the SI encoding family
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
+ def _e64_ci :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; // same VOP3e_si layout as the SI multiclass
+ }
+}
+
+defm V_TRUNC_F64 : VOP1_Real_ci <0x17>;
+defm V_CEIL_F64 : VOP1_Real_ci <0x18>;
+defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>;
+defm V_RNDNE_F64 : VOP1_Real_ci <0x19>;
+defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>;
+defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> : // DPP form derived from a VOP1_Pseudo
+ VOP_DPP <ps.OpName, P> {
+ let Defs = ps.Defs; // this group of fields is copied from the pseudo
+ let Uses = ps.Uses;
+ let SchedRW = ps.SchedRW;
+ let hasSideEffects = ps.hasSideEffects;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ bits<8> vdst;
+ let Inst{8-0} = 0xfa; // dpp: fixed marker in the bits where VOP1e places src0
+ let Inst{16-9} = op; // 8-bit VOP1 opcode
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); // vdst, or 0 when P.EmitDst is unset
+ let Inst{31-25} = 0x3f; // encoding: same fixed value as in VOP1e
+}
+
+multiclass VOP1_Real_vi <bits<10> op> { // real _e32/_e64/_sdwa/_dpp encodings of one VOP1 op for VI
+ let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ def _e32_vi :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; // VOP1e takes an 8-bit opcode: low 8 bits of op
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; // VOP3 opcode is 0x140 + op
+ }
+
+ def _sdwa_vi : // defined outside the let-scope above, so those two overrides are not applied here
+ VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ // For now the DPP form is provided for asm/disasm only.
+ // TODO: add a corresponding pseudo
+ def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; // built from the _e32 pseudo
+}
+
+defm V_NOP : VOP1_Real_vi <0x0>;
+defm V_MOV_B32 : VOP1_Real_vi <0x1>;
+defm V_CVT_I32_F64 : VOP1_Real_vi <0x3>;
+defm V_CVT_F64_I32 : VOP1_Real_vi <0x4>;
+defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>;
+defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>;
+defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>;
+defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>;
+defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>;
+defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>;
+defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>;
+defm V_CVT_FLR_I32_F32 : VOP1_Real_vi <0xd>;
+defm V_CVT_OFF_F32_I4 : VOP1_Real_vi <0xe>;
+defm V_CVT_F32_F64 : VOP1_Real_vi <0xf>;
+defm V_CVT_F64_F32 : VOP1_Real_vi <0x10>;
+defm V_CVT_F32_UBYTE0 : VOP1_Real_vi <0x11>;
+defm V_CVT_F32_UBYTE1 : VOP1_Real_vi <0x12>;
+defm V_CVT_F32_UBYTE2 : VOP1_Real_vi <0x13>;
+defm V_CVT_F32_UBYTE3 : VOP1_Real_vi <0x14>;
+defm V_CVT_U32_F64 : VOP1_Real_vi <0x15>;
+defm V_CVT_F64_U32 : VOP1_Real_vi <0x16>;
+defm V_FRACT_F32 : VOP1_Real_vi <0x1b>;
+defm V_TRUNC_F32 : VOP1_Real_vi <0x1c>;
+defm V_CEIL_F32 : VOP1_Real_vi <0x1d>;
+defm V_RNDNE_F32 : VOP1_Real_vi <0x1e>;
+defm V_FLOOR_F32 : VOP1_Real_vi <0x1f>;
+defm V_EXP_F32 : VOP1_Real_vi <0x20>;
+defm V_LOG_F32 : VOP1_Real_vi <0x21>;
+defm V_RCP_F32 : VOP1_Real_vi <0x22>;
+defm V_RCP_IFLAG_F32 : VOP1_Real_vi <0x23>;
+defm V_RSQ_F32 : VOP1_Real_vi <0x24>;
+defm V_RCP_F64 : VOP1_Real_vi <0x25>;
+defm V_RSQ_F64 : VOP1_Real_vi <0x26>;
+defm V_SQRT_F32 : VOP1_Real_vi <0x27>;
+defm V_SQRT_F64 : VOP1_Real_vi <0x28>;
+defm V_SIN_F32 : VOP1_Real_vi <0x29>;
+defm V_COS_F32 : VOP1_Real_vi <0x2a>;
+defm V_NOT_B32 : VOP1_Real_vi <0x2b>;
+defm V_BFREV_B32 : VOP1_Real_vi <0x2c>;
+defm V_FFBH_U32 : VOP1_Real_vi <0x2d>;
+defm V_FFBL_B32 : VOP1_Real_vi <0x2e>;
+defm V_FFBH_I32 : VOP1_Real_vi <0x2f>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_vi <0x30>;
+defm V_FREXP_MANT_F64 : VOP1_Real_vi <0x31>;
+defm V_FRACT_F64 : VOP1_Real_vi <0x32>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>;
+defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>;
+defm V_CLREXCP : VOP1_Real_vi <0x35>;
+defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>;
+defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>;
+defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>;
+defm V_TRUNC_F64 : VOP1_Real_vi <0x17>; // same opcode as the VOP1_Real_ci entry
+defm V_CEIL_F64 : VOP1_Real_vi <0x18>; // same opcode as the VOP1_Real_ci entry
+defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>; // same opcode as the VOP1_Real_ci entry
+defm V_RNDNE_F64 : VOP1_Real_vi <0x19>; // same opcode as the VOP1_Real_ci entry
+defm V_LOG_LEGACY_F32 : VOP1_Real_vi <0x4c>; // differs from the CI opcode (0x45)
+defm V_EXP_LEGACY_F32 : VOP1_Real_vi <0x4b>; // differs from the CI opcode (0x46)
+defm V_CVT_F16_U16 : VOP1_Real_vi <0x39>;
+defm V_CVT_F16_I16 : VOP1_Real_vi <0x3a>;
+defm V_CVT_U16_F16 : VOP1_Real_vi <0x3b>;
+defm V_CVT_I16_F16 : VOP1_Real_vi <0x3c>;
+defm V_RCP_F16 : VOP1_Real_vi <0x3d>;
+defm V_SQRT_F16 : VOP1_Real_vi <0x3e>;
+defm V_RSQ_F16 : VOP1_Real_vi <0x3f>;
+defm V_LOG_F16 : VOP1_Real_vi <0x40>;
+defm V_EXP_F16 : VOP1_Real_vi <0x41>;
+defm V_FREXP_MANT_F16 : VOP1_Real_vi <0x42>;
+defm V_FREXP_EXP_I16_F16 : VOP1_Real_vi <0x43>;
+defm V_FLOOR_F16 : VOP1_Real_vi <0x44>;
+defm V_CEIL_F16 : VOP1_Real_vi <0x45>;
+defm V_TRUNC_F16 : VOP1_Real_vi <0x46>;
+defm V_RNDNE_F16 : VOP1_Real_vi <0x47>;
+defm V_FRACT_F16 : VOP1_Real_vi <0x48>;
+defm V_SIN_F16 : VOP1_Real_vi <0x49>;
+defm V_COS_F16 : VOP1_Real_vi <0x4a>;
+
+
+// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
+// indexing mode. vdst can't be treated as a def for codegen purposes,
+// and an implicit use and def of the super register should be added.
+def V_MOV_B32_indirect : VPseudoInstSI<(outs), // no outs: $vdst is listed among the ins
+ (ins getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)>,
+ PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst, // expands to the VI _e32 v_mov_b32
+ getVOPSrc0ForVT<i32>.ret:$src0)> {
+ let VOP1 = 1;
+ let SubtargetPredicate = isVI;
+}
+
+// This is a pseudo variant of the v_movreld_b32 instruction in which the
+// vector operand appears only twice, once as def and once as use. Using this
+// pseudo avoids problems with the Two Address instructions pass.
+class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI < // rc is the register class of the whole vector
+ (outs rc:$vdst),
+ (ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> {
+ let VOP1 = 1;
+
+ let Constraints = "$vsrc = $vdst"; // ties the input vector to the result
+ let Uses = [M0, EXEC];
+
+ let SubtargetPredicate = HasMovrel;
+}
+
+def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>;
+def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>;
+def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
+def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
+def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
+
+let Predicates = [isVI] in {
+
+def : Pat < // int_amdgcn_mov_dpp selects V_MOV_B32_dpp with its immediates converted to target immediates
+ (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
+ imm:$bound_ctrl)),
+ (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
+ (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+>;
+
+
+def : Pat< // i16 -> i32 anyext is a plain COPY
+ (i32 (anyext i16:$src)),
+ (COPY $src)
+>;
+
+def : Pat< // i16 -> i64 anyext: COPY of the source in sub0, a zero moved into sub1
+ (i64 (anyext i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (COPY $src)), sub0,
+ (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+def : Pat< // i32 -> i16 trunc is a plain COPY
+ (i16 (trunc i32:$src)),
+ (COPY $src)
+>;
+
+def : Pat< // i16 -> i1 trunc is a plain COPY
+ (i1 (trunc i16:$src)),
+ (COPY $src)
+>;
+
+
+def : Pat < // i64 -> i16 trunc extracts the sub0 subregister
+ (i16 (trunc i64:$src)),
+ (EXTRACT_SUBREG $src, sub0)
+>;
+
+} // End Predicates = [isVI]
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
new file mode 100644
index 000000000000..20fb7f7bcab7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -0,0 +1,757 @@
+//===-- VOP2Instructions.td - Vector Instruction Definitions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP2 Classes
+//===----------------------------------------------------------------------===//
+
+class VOP2e <bits<6> op, VOPProfile P> : Enc32 { // 32-bit VOP2 encoding
+ bits<8> vdst;
+ bits<9> src0;
+ bits<8> src1;
+
+ let Inst{8-0} = !if(P.HasSrc0, src0, 0); // src0, or 0 when the profile has no src0
+ let Inst{16-9} = !if(P.HasSrc1, src1, 0); // src1, or 0 when the profile has no src1
+ let Inst{24-17} = !if(P.EmitDst, vdst, 0); // vdst, or 0 when P.EmitDst is unset
+ let Inst{30-25} = op; // 6-bit VOP2 opcode
+ let Inst{31} = 0x0; // encoding: bit 31 is fixed to 0
+}
+
+class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 { // VOP2 layout plus a trailing 32-bit immediate
+ bits<8> vdst;
+ bits<9> src0;
+ bits<8> src1;
+ bits<32> imm;
+
+ let Inst{8-0} = !if(P.HasSrc0, src0, 0); // low 32 bits use the same field positions as VOP2e
+ let Inst{16-9} = !if(P.HasSrc1, src1, 0);
+ let Inst{24-17} = !if(P.EmitDst, vdst, 0);
+ let Inst{30-25} = op; // 6-bit VOP2 opcode
+ let Inst{31} = 0x0; // encoding: bit 31 is fixed to 0
+ let Inst{63-32} = imm; // the immediate fills the upper 32 bits
+}
+
+class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> { // VOP2 fields layered on VOP_SDWAe
+ bits<8> vdst;
+ bits<8> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa: fixed marker in the bits where VOP2e places src0
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); // src1, or 0 when the profile has no src1
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); // vdst, or 0 when P.EmitDst is unset
+ let Inst{30-25} = op; // 6-bit VOP2 opcode
+ let Inst{31} = 0x0; // encoding: bit 31 is fixed to 0
+}
+
+class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> : // VOP2 pseudo
+ InstSI <P.Outs32, P.Ins32, "", pattern>, // asm string left empty; text is kept in Mnemonic/AsmOperands
+ VOP <opName>,
+ SIMCInstr <opName#suffix, SIEncodingFamily.NONE>, // registered as opName#suffix under family NONE
+ MnemonicAlias<opName#suffix, opName> { // opName#suffix is aliased to the plain mnemonic
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName; // read back by VOP2_Real when it builds its asm string
+ string AsmOperands = P.Asm32; // read back by VOP2_Real when it builds its asm string
+
+ let Size = 4; // NOTE(review): presumably bytes -- confirm, since VOP2_MADKe is an Enc64 encoding
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SubtargetPredicate = isGCN;
+
+ let VOP2 = 1;
+ let VALU = 1;
+ let Uses = [EXEC]; // default implicit use; VOP2bInst/VOP2eInst replace it in their let-scopes
+
+ let AsmVariantName = AMDGPUAsmVariants.Default;
+
+ VOPProfile Pfl = P; // the profile, stored on the record as Pfl
+}
+
+// Real (encodable) counterpart of a VOP2_Pseudo: the operand lists, the asm
+// string and the flags listed below are copied from `ps`; EncodingFamily is
+// forwarded to SIMCInstr together with ps.PseudoInstr.
+class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
+class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : // SDWA pseudo for a VOP2 op
+ VOP_SDWA_Pseudo <OpName, P, pattern> {
+ let AsmMatchConverter = "cvtSdwaVOP2"; // VOP2-specific converter name
+}
+
+class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { // pattern list used by the _e64 pseudo
+ list<dag> ret = !if(P.HasModifiers, // with modifiers: src0 via VOP3Mods0, src1 via VOP3Mods
+ [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]); // without modifiers: plain two-operand match
+}
+
+multiclass VOP2Inst <string opName, // defines the _e32, _e64 and _sdwa pseudos of one VOP2 op
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName> { // revOp defaults to opName itself
+
+ def _e32 : VOP2_Pseudo <opName, P>, // 32-bit form; no pattern attached
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; // second argument is true when revOp equals opName
+
+ def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, // VOP3 form; the only one given a pattern
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>, // SDWA form; no pattern attached
+ Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+}
+
+// TODO: add SDWA pseudo instructions for VOP2eInst (VOP2bInst below already defines _sdwa).
+multiclass VOP2bInst <string opName,
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName,
+ bit useSGPRInput = !eq(P.NumSrcArgs, 3)> { // defaults to true for three-source profiles
+
+ let SchedRW = [Write32Bit, WriteSALU] in {
+ let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { // _e32/_sdwa: implicit VCC def, VCC use if useSGPRInput
+ def _e32 : VOP2_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+ }
+ def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, // outside the Uses/Defs scope: only SchedRW applies
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ }
+}
+
+multiclass VOP2eInst <string opName, // defines _e32 and _e64 only; no _sdwa pseudo here
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName,
+ bit useSGPRInput = !eq(P.NumSrcArgs, 3)> { // defaults to true for three-source profiles
+
+ let SchedRW = [Write32Bit] in {
+ let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in { // _e32: implicit VCC use if useSGPRInput; no Defs override
+ def _e32 : VOP2_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+ }
+ def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, // outside the Uses scope: only SchedRW applies
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ }
+}
+
+class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { // profile whose immediate is the last operand
+ field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); // f32kimm for 32-bit vt, else f16kimm
+ field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); // operand order: src0, src1, imm
+ field string Asm32 = "$vdst, $src0, $src1, $imm";
+ field bit HasExt = 0;
+}
+
+def VOP_MADAK_F16 : VOP_MADAK <f16>;
+def VOP_MADAK_F32 : VOP_MADAK <f32>;
+
+class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { // profile whose immediate is the middle operand
+ field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); // f32kimm for 32-bit vt, else f16kimm
+ field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1); // operand order: src0, imm, src1
+ field string Asm32 = "$vdst, $src0, $imm, $src1";
+ field bit HasExt = 0;
+}
+
+def VOP_MADMK_F16 : VOP_MADMK <f16>;
+def VOP_MADMK_F32 : VOP_MADMK <f32>;
+
+class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { // three-source profile; src2 is always a VGPR_32
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
+ let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+ let InsDPP = (ins FP32InputMods:$src0_modifiers, Src0DPP:$src0,
+ FP32InputMods:$src1_modifiers, Src1DPP:$src1,
+ VGPR_32:$src2, // stub argument
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsSDWA = (ins FP32InputMods:$src0_modifiers, Src0SDWA:$src0,
+ FP32InputMods:$src1_modifiers, Src1SDWA:$src1,
+ VGPR_32:$src2, // stub argument
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let Asm32 = getAsm32<1, 2, vt>.ret; // NOTE(review): the 2 presumably means two printed sources -- confirm
+ let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
+ let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
+ let HasSrc2 = 0; // src2 stays in the operand lists above but is flagged as absent
+ let HasSrc2Mods = 0;
+ let HasExt = 1;
+}
+
+def VOP_MAC_F16 : VOP_MAC <f16> {
+ // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
+ // 'not a string initializer' error.
+ let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret;
+}
+
+def VOP_MAC_F32 : VOP_MAC <f32> {
+ // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
+ // 'not a string initializer' error.
+ let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
+}
+
+// Write out to vcc (32-bit, SDWA and DPP forms) or to an arbitrary SGPR ($sdst, 64-bit form).
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+ let Asm32 = "$vdst, vcc, $src0, $src1"; // vcc is printed literally
+ let Asm64 = "$vdst, $sdst, $src0, $src1"; // the second result is an explicit operand
+ let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let Outs32 = (outs DstRC:$vdst); // only $vdst is an explicit output in the 32-bit form
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); // the 64-bit form adds an SReg_64 output
+}
+
+// Write out to vcc or arbitrary SGPR and read in from vcc or
+// arbitrary SGPR.
+def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+ // We use VCSrc_b32 to exclude literal constants, even though the
+ // encoding normally allows them since the implicit VCC use means
+ // using one would always violate the constant bus
+ // restriction. SGPRs are still allowed because it should
+ // technically be possible to use VCC again as src0.
+ let Src0RC32 = VCSrc_b32;
+ let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; // both vcc occurrences are printed literally
+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; // the 64-bit form names $sdst and $src2 explicitly
+ let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+
+ // Suppress src2 implied by type since the 32-bit encoding uses an
+ // implicit VCC use.
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+
+ let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0, // no src2 operand in the SDWA list either
+ Src1Mod:$src1_modifiers, Src1SDWA:$src1,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+
+ let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, // no src2 operand in the DPP list either
+ Src1Mod:$src1_modifiers, Src1DPP:$src1,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let HasExt = 1;
+}
+
+// Read in from vcc or arbitrary SGPR
+def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+ let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
+ let Asm32 = "$vdst, $src0, $src1, vcc";
+ let Asm64 = "$vdst, $src0, $src1, $src2";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst);
+
+ // Suppress src2 implied by type since the 32-bit encoding uses an
+ // implicit VCC use.
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+}
+
+def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
+ let Outs32 = (outs SReg_32:$vdst);
+ let Outs64 = Outs32;
+ let Ins32 = (ins VGPR_32:$src0, SCSrc_b32:$src1);
+ let Ins64 = Ins32;
+ let Asm32 = " $vdst, $src0, $src1";
+ let Asm64 = Asm32;
+}
+
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+ let Outs32 = (outs VGPR_32:$vdst);
+ let Outs64 = Outs32;
+ let Ins32 = (ins SReg_32:$src0, SCSrc_b32:$src1);
+ let Ins64 = Ins32;
+ let Asm32 = " $vdst, $src0, $src1";
+ let Asm64 = Asm32;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGCN in {
+
+defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>;
+
+let isCommutable = 1 in {
+defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
+defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>;
+defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
+defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
+defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
+defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
+defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
+defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
+defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
+defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
+defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>;
+defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>;
+defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>;
+defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>;
+defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
+defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
+defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
+defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>;
+defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>;
+defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>;
+
+let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
+}
+
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>;
+
+// No patterns so that the scalar instructions are always selected.
+// The scalar versions will be replaced with vector when needed later.
+
+// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI,
+// but the VI instructions behave the same as the SI versions.
+defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32>;
+defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32>;
+defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32">;
+defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1>;
+defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1>;
+defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32">;
+} // End isCommutable = 1
+
+// These are special and do not read the exec mask (hence the empty Uses list).
+let isConvergent = 1, Uses = []<Register> in {
+def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
+ [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
+
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+} // End isConvergent = 1, Uses = []<Register>
+
+defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>;
+defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>;
+defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>;
+defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;
+
+} // End SubtargetPredicate = isGCN
+
+
+// These instructions only exist on SI and CI.
+let SubtargetPredicate = isSICI in {
+
+defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
+defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
+
+let isCommutable = 1 in {
+defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = isVI in {
+
+def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>;
+defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
+defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
+defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
+defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+
+let isCommutable = 1 in {
+defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
+defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
+defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
+defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
+def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
+defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
+defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
+defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
+defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
+defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
+defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
+defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
+defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;
+
+let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
+}
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isVI
+
+// Note: 16-bit instructions produce a 0 result in the high 16 bits.
+multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
+
+def : Pat< // Plain i16 operation.
+ (op i16:$src0, i16:$src1),
+ (inst $src0, $src1)
+>;
+
+def : Pat< // zext to i32: mapped to the same single instruction (see note above).
+ (i32 (zext (op i16:$src0, i16:$src1))),
+ (inst $src0, $src1)
+>;
+
+def : Pat< // zext to i64: result goes in sub0, a zero is moved into sub1.
+ (i64 (zext (op i16:$src0, i16:$src1))),
+ (REG_SEQUENCE VReg_64,
+ (inst $src0, $src1), sub0,
+ (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+}
+
+multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> { // Like Arithmetic_i16_Pats, but 'inst' receives its operands swapped.
+
+def : Pat< // Plain i16 operation: (op a, b) -> (inst b, a).
+ (op i16:$src0, i16:$src1),
+ (inst $src1, $src0)
+>;
+
+def : Pat< // zext to i32: mapped to the same single instruction.
+ (i32 (zext (op i16:$src0, i16:$src1))),
+ (inst $src1, $src0)
+>;
+
+
+def : Pat< // zext to i64: result goes in sub0, a zero is moved into sub1.
+ (i64 (zext (op i16:$src0, i16:$src1))),
+ (REG_SEQUENCE VReg_64,
+ (inst $src1, $src0), sub0,
+ (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+}
+
+class ZExt_i16_i1_Pat <SDNode ext> : Pat < // Extends an i1 to i16 using extension node 'ext'.
+ (i16 (ext i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) // Selects between the constants 0 and 1 based on $src.
+>;
+
+let Predicates = [isVI] in {
+
+defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
+defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
+defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
+defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
+defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
+defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
+defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
+
+def : Pat <
+ (and i16:$src0, i16:$src1),
+ (V_AND_B32_e64 $src0, $src1)
+>;
+
+def : Pat <
+ (or i16:$src0, i16:$src1),
+ (V_OR_B32_e64 $src0, $src1)
+>;
+
+def : Pat <
+ (xor i16:$src0, i16:$src1),
+ (V_XOR_B32_e64 $src0, $src1)
+>;
+
+defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
+defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
+defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
+
+def : ZExt_i16_i1_Pat<zext>;
+def : ZExt_i16_i1_Pat<anyext>;
+
+def : Pat <
+ (i16 (sext i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
+>;
+
+} // End Predicates = [isVI]
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+
+multiclass VOP2_Real_si <bits<6> op> { // Defines NAME_si from the VOP2 pseudo called NAME.
+ def _si :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_MADK_si <bits<6> op> { // Same as VOP2_Real_si, but uses the VOP2_MADKe encoding class.
+ def _si : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_e32_si <bits<6> op> { // Defines NAME_e32_si from the NAME_e32 pseudo.
+ def _e32_si :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+}
+
+multiclass VOP2_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> { // Adds NAME_e64_si on top of NAME_e32_si.
+ def _e64_si :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; // 9-bit VOP3 opcode = 0x100 | op.
+}
+
+multiclass VOP2be_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> { // As VOP2_Real_e32e64_si, but the e64 form uses VOP3be_si.
+ def _e64_si :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3be_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; // 9-bit VOP3 opcode = 0x100 | op.
+}
+
+} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
+
+defm V_CNDMASK_B32 : VOP2_Real_e32e64_si <0x0>;
+defm V_ADD_F32 : VOP2_Real_e32e64_si <0x3>;
+defm V_SUB_F32 : VOP2_Real_e32e64_si <0x4>;
+defm V_SUBREV_F32 : VOP2_Real_e32e64_si <0x5>;
+defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_si <0x7>;
+defm V_MUL_F32 : VOP2_Real_e32e64_si <0x8>;
+defm V_MUL_I32_I24 : VOP2_Real_e32e64_si <0x9>;
+defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_si <0xa>;
+defm V_MUL_U32_U24 : VOP2_Real_e32e64_si <0xb>;
+defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_si <0xc>;
+defm V_MIN_F32 : VOP2_Real_e32e64_si <0xf>;
+defm V_MAX_F32 : VOP2_Real_e32e64_si <0x10>;
+defm V_MIN_I32 : VOP2_Real_e32e64_si <0x11>;
+defm V_MAX_I32 : VOP2_Real_e32e64_si <0x12>;
+defm V_MIN_U32 : VOP2_Real_e32e64_si <0x13>;
+defm V_MAX_U32 : VOP2_Real_e32e64_si <0x14>;
+defm V_LSHRREV_B32 : VOP2_Real_e32e64_si <0x16>;
+defm V_ASHRREV_I32 : VOP2_Real_e32e64_si <0x18>;
+defm V_LSHLREV_B32 : VOP2_Real_e32e64_si <0x1a>;
+defm V_AND_B32 : VOP2_Real_e32e64_si <0x1b>;
+defm V_OR_B32 : VOP2_Real_e32e64_si <0x1c>;
+defm V_XOR_B32 : VOP2_Real_e32e64_si <0x1d>;
+defm V_MAC_F32 : VOP2_Real_e32e64_si <0x1f>;
+defm V_MADMK_F32 : VOP2_Real_MADK_si <0x20>;
+defm V_MADAK_F32 : VOP2_Real_MADK_si <0x21>;
+defm V_ADD_I32 : VOP2be_Real_e32e64_si <0x25>;
+defm V_SUB_I32 : VOP2be_Real_e32e64_si <0x26>;
+defm V_SUBREV_I32 : VOP2be_Real_e32e64_si <0x27>;
+defm V_ADDC_U32 : VOP2be_Real_e32e64_si <0x28>;
+defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>;
+defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
+
+defm V_READLANE_B32 : VOP2_Real_si <0x01>;
+defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
+
+defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>;
+defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>;
+defm V_MAX_LEGACY_F32 : VOP2_Real_e32e64_si <0xe>;
+defm V_LSHR_B32 : VOP2_Real_e32e64_si <0x15>;
+defm V_ASHR_I32 : VOP2_Real_e32e64_si <0x17>;
+defm V_LSHL_B32 : VOP2_Real_e32e64_si <0x19>;
+
+defm V_BFM_B32 : VOP2_Real_e32e64_si <0x1e>;
+defm V_BCNT_U32_B32 : VOP2_Real_e32e64_si <0x22>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_e32e64_si <0x23>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_e32e64_si <0x24>;
+defm V_LDEXP_F32 : VOP2_Real_e32e64_si <0x2b>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e32e64_si <0x2f>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_e32e64_si <0x30>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, VOPProfile P = ps.Pfl> :
+ VOP_DPP <ps.OpName, P> { // The fields below are copied from the pseudo 'ps'.
+ let Defs = ps.Defs;
+ let Uses = ps.Uses;
+ let SchedRW = ps.SchedRW;
+ let hasSideEffects = ps.hasSideEffects;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ bits<8> vdst;
+ bits<8> src1;
+ let Inst{8-0} = 0xfa; // Fixed value 0xfa; marks the DPP form.
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); // src1, or 0 when the profile has no src1.
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); // vdst, or 0 when no destination is emitted.
+ let Inst{30-25} = op; // 6-bit opcode.
+ let Inst{31} = 0x0; // Encoding bit.
+}
+
+let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+
+multiclass VOP32_Real_vi <bits<10> op> { // Defines NAME_vi: a VOP2 pseudo encoded with the VOP3e_vi class.
+ def _vi :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3e_vi<op, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_MADK_vi <bits<6> op> { // Defines NAME_vi using the VOP2_MADKe encoding class.
+ def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_e32_vi <bits<6> op> { // Defines NAME_e32_vi from the NAME_e32 pseudo.
+ def _e32_vi :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+}
+
+multiclass VOP2_Real_e64_vi <bits<10> op> { // Defines NAME_e64_vi; 'op' is the full 10-bit VOP3 opcode.
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> { // e32 form plus an e64 form using VOP3be_vi.
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; // 10-bit VOP3 opcode = 0x100 | op.
+}
+
+multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> : // e32 form plus the matching e64 form.
+ VOP2_Real_e32_vi<op>,
+ VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>; // 10-bit VOP3 opcode = 0x100 | op.
+
+} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+
+multiclass VOP2_SDWA_Real <bits<6> op> {
+ def _sdwa_vi :
+ VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+}
+
+multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
+ Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+ // For now, the DPP form exists only for the assembler/disassembler.
+ // TODO: Add a corresponding DPP pseudo instruction.
+ def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+}
+
+multiclass VOP2_Real_e32e64_vi <bits<6> op> :
+ Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+ // For now, the DPP form exists only for the assembler/disassembler.
+ // TODO: Add a corresponding DPP pseudo instruction.
+ def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+}
+
+defm V_CNDMASK_B32 : Base_VOP2_Real_e32e64_vi <0x0>;
+defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>;
+defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>;
+defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>;
+defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>;
+defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>;
+defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>;
+defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_vi <0x7>;
+defm V_MUL_U32_U24 : VOP2_Real_e32e64_vi <0x8>;
+defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_vi <0x9>;
+defm V_MIN_F32 : VOP2_Real_e32e64_vi <0xa>;
+defm V_MAX_F32 : VOP2_Real_e32e64_vi <0xb>;
+defm V_MIN_I32 : VOP2_Real_e32e64_vi <0xc>;
+defm V_MAX_I32 : VOP2_Real_e32e64_vi <0xd>;
+defm V_MIN_U32 : VOP2_Real_e32e64_vi <0xe>;
+defm V_MAX_U32 : VOP2_Real_e32e64_vi <0xf>;
+defm V_LSHRREV_B32 : VOP2_Real_e32e64_vi <0x10>;
+defm V_ASHRREV_I32 : VOP2_Real_e32e64_vi <0x11>;
+defm V_LSHLREV_B32 : VOP2_Real_e32e64_vi <0x12>;
+defm V_AND_B32 : VOP2_Real_e32e64_vi <0x13>;
+defm V_OR_B32 : VOP2_Real_e32e64_vi <0x14>;
+defm V_XOR_B32 : VOP2_Real_e32e64_vi <0x15>;
+defm V_MAC_F32 : VOP2_Real_e32e64_vi <0x16>;
+defm V_MADMK_F32 : VOP2_Real_MADK_vi <0x17>;
+defm V_MADAK_F32 : VOP2_Real_MADK_vi <0x18>;
+defm V_ADD_I32 : VOP2be_Real_e32e64_vi <0x19>;
+defm V_SUB_I32 : VOP2be_Real_e32e64_vi <0x1a>;
+defm V_SUBREV_I32 : VOP2be_Real_e32e64_vi <0x1b>;
+defm V_ADDC_U32 : VOP2be_Real_e32e64_vi <0x1c>;
+defm V_SUBB_U32 : VOP2be_Real_e32e64_vi <0x1d>;
+defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>;
+
+defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
+defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;
+
+defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>;
+defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>;
+defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>;
+
+defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>;
+defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>;
+defm V_SUBREV_F16 : VOP2_Real_e32e64_vi <0x21>;
+defm V_MUL_F16 : VOP2_Real_e32e64_vi <0x22>;
+defm V_MAC_F16 : VOP2_Real_e32e64_vi <0x23>;
+defm V_MADMK_F16 : VOP2_Real_MADK_vi <0x24>;
+defm V_MADAK_F16 : VOP2_Real_MADK_vi <0x25>;
+defm V_ADD_U16 : VOP2_Real_e32e64_vi <0x26>;
+defm V_SUB_U16 : VOP2_Real_e32e64_vi <0x27>;
+defm V_SUBREV_U16 : VOP2_Real_e32e64_vi <0x28>;
+defm V_MUL_LO_U16 : VOP2_Real_e32e64_vi <0x29>;
+defm V_LSHLREV_B16 : VOP2_Real_e32e64_vi <0x2a>;
+defm V_LSHRREV_B16 : VOP2_Real_e32e64_vi <0x2b>;
+defm V_ASHRREV_I16 : VOP2_Real_e32e64_vi <0x2c>;
+defm V_MAX_F16 : VOP2_Real_e32e64_vi <0x2d>;
+defm V_MIN_F16 : VOP2_Real_e32e64_vi <0x2e>;
+defm V_MAX_U16 : VOP2_Real_e32e64_vi <0x2f>;
+defm V_MAX_I16 : VOP2_Real_e32e64_vi <0x30>;
+defm V_MIN_U16 : VOP2_Real_e32e64_vi <0x31>;
+defm V_MIN_I16 : VOP2_Real_e32e64_vi <0x32>;
+defm V_LDEXP_F16 : VOP2_Real_e32e64_vi <0x33>;
+
+let SubtargetPredicate = isVI in {
+
+// Aliases to simplify matching of floating-point instructions that
+// are VOP2 on SI and VOP3 on VI.
+class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
+ name#" $dst, $src0, $src1",
+ (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0)
+>, PredicateControl {
+ let UseInstAsmMatchConverter = 0;
+ let AsmVariantName = AMDGPUAsmVariants.VOP3;
+}
+
+def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
+
+} // End SubtargetPredicate = isVI
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
new file mode 100644
index 000000000000..5efa64d25ce1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -0,0 +1,447 @@
+//===-- VOP3Instructions.td - Vector Instruction Definitions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP3 Classes
+//===----------------------------------------------------------------------===//
+
+class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { // Builds the selection pattern for 'node' with source modifiers.
+ list<dag> ret3 = [(set P.DstVT:$vdst, // Three-source variant.
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst, // Two-source variant.
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst, // One-source variant.
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, // Result: chosen by P.NumSrcArgs; anything other than 3 or 2 gets ret1.
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
+ list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
+ list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0))];
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
+ VOP3_Pseudo<OpName, P,
+ !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret), // Modifier-aware pattern only when the profile has modifiers.
+ VOP3Only>;
+
+// Special case for v_div_fmas_{f32|f64}, since it seems to be the
+// only VOP instruction that implicitly reads VCC.
+let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
+def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> {
+ let Outs64 = (outs DstRC.RegClass:$vdst);
+}
+def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
+ let Outs64 = (outs DstRC.RegClass:$vdst);
+}
+}
+
+class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret =
+ [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)),
+ (i1 VCC)))];
+}
+
+class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> {
+ // FIXME: Hack to stop printing _e64
+ let Outs64 = (outs DstRC.RegClass:$vdst);
+ let Asm64 = " " # P.Asm64;
+}
+
+class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Asm64 = " $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
+}
+
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+}
+
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VReg_64>;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1 in {
+
+def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
+def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_i24>;
+def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_u24>;
+def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
+def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
+def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+
+let SchedRW = [WriteDoubleAdd] in {
+def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
+def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
+def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
+def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
+} // End SchedRW = [WriteDoubleAdd]
+
+let SchedRW = [WriteQuarterRate32] in {
+def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
+def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
+} // End SchedRW = [WriteQuarterRate32]
+
+let Uses = [VCC, EXEC] in {
+// v_div_fmas_f32:
+// result = src0 * src1 + src2
+// if (vcc)
+// result *= 2^32
+//
+def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
+ getVOP3VCC<VOP_F32_F32_F32_F32_VCC, AMDGPUdiv_fmas>.ret> {
+ let SchedRW = [WriteFloatFMA];
+}
+// v_div_fmas_f64:
+// result = src0 * src1 + src2
+// if (vcc)
+// result *= 2^64
+//
+def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
+ getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
+ let SchedRW = [WriteDouble];
+}
+} // End Uses = [VCC, EXEC]
+
+} // End isCommutable = 1
+
+def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
+def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
+def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
+def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
+def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
+def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
+def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
+def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
+def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
+def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
+def V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
+def V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
+def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
+def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
+def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
+def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u8>;
+def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_hi_u8>;
+def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u16>;
+def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
+def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
+
+let SchedRW = [WriteDoubleAdd] in {
+def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
+def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
+} // End SchedRW = [WriteDoubleAdd]
+
+def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
+ let SchedRW = [WriteFloatFMA, WriteSALU];
+ let hasExtraSrcRegAllocReq = 1;
+}
+
+// Double precision division pre-scale.
+def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
+ let SchedRW = [WriteDouble, WriteSALU];
+ let hasExtraSrcRegAllocReq = 1;
+}
+
+def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
+def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
+
+def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
+ let SchedRW = [WriteDouble];
+}
+
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>;
+def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>>;
+def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>>;
+def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = isVI in {
+def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
+def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
+def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
+} // End SubtargetPredicate = isVI
+
+
+let SubtargetPredicate = isCIVI in {
+
+def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
+def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
+
+let isCommutable = 1 in {
+def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
+
+// XXX - Does this set VCC?
+def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isCIVI
+
+
+let SubtargetPredicate = isVI in {
+
+let isCommutable = 1 in {
+
+def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
+def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>;
+def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>;
+def V_INTERP_P2_F16 : VOP3Inst <"v_interp_p2_f16", VOP3_Profile<VOP_F16_F32_F16_F32>>;
+def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+
+def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
+def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
+
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isVI
+
+let Predicates = [isVI] in {
+
+multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, // NOTE(review): "Tenary" is presumably a typo for "Ternary".
+ Instruction inst, SDPatternOperator op3> {
+def : Pat< // Plain i16 form: (op2 (op1 a, b), c); does not depend on op3.
+ (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
+ (inst i16:$src0, i16:$src1, i16:$src2)
+>;
+
+def : Pat< // Result extended to i32 by op3: mapped to the same single instruction.
+ (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
+ (inst i16:$src0, i16:$src1, i16:$src2)
+>;
+
+def : Pat< // Result extended to i64 by op3: result goes in sub0, a zero is moved into sub1.
+ (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
+ (REG_SEQUENCE VReg_64,
+ (inst i16:$src0, i16:$src1, i16:$src2), sub0,
+ (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+}
+
+defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
+defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>; // NOTE(review): this repeats the plain (add (mul ...)) pattern already emitted for V_MAD_U16 above.
+
+} // End Predicates = [isVI]
+
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+// Real (encoded) VOP3 instructions for the SI encoding family. Each multiclass
+// looks up the VOP3_Pseudo named NAME and emits a "_si" record that reuses the
+// pseudo's profile (Pfl). The records use the isSICI assembler predicate and
+// the "SICI" decoder namespace.
+let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+
+// VOP3e_si encoding with the 9-bit opcode `op`.
+multiclass VOP3_Real_si<bits<9> op> {
+ def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+// Same as VOP3_Real_si, but with the VOP3be_si encoding class.
+multiclass VOP3be_Real_si<bits<9> op> {
+ def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
+
+defm V_MAD_LEGACY_F32 : VOP3_Real_si <0x140>;
+defm V_MAD_F32 : VOP3_Real_si <0x141>;
+defm V_MAD_I32_I24 : VOP3_Real_si <0x142>;
+defm V_MAD_U32_U24 : VOP3_Real_si <0x143>;
+defm V_CUBEID_F32 : VOP3_Real_si <0x144>;
+defm V_CUBESC_F32 : VOP3_Real_si <0x145>;
+defm V_CUBETC_F32 : VOP3_Real_si <0x146>;
+defm V_CUBEMA_F32 : VOP3_Real_si <0x147>;
+defm V_BFE_U32 : VOP3_Real_si <0x148>;
+defm V_BFE_I32 : VOP3_Real_si <0x149>;
+defm V_BFI_B32 : VOP3_Real_si <0x14a>;
+defm V_FMA_F32 : VOP3_Real_si <0x14b>;
+defm V_FMA_F64 : VOP3_Real_si <0x14c>;
+defm V_LERP_U8 : VOP3_Real_si <0x14d>;
+defm V_ALIGNBIT_B32 : VOP3_Real_si <0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_si <0x14f>;
+defm V_MULLIT_F32 : VOP3_Real_si <0x150>;
+defm V_MIN3_F32 : VOP3_Real_si <0x151>;
+defm V_MIN3_I32 : VOP3_Real_si <0x152>;
+defm V_MIN3_U32 : VOP3_Real_si <0x153>;
+defm V_MAX3_F32 : VOP3_Real_si <0x154>;
+defm V_MAX3_I32 : VOP3_Real_si <0x155>;
+defm V_MAX3_U32 : VOP3_Real_si <0x156>;
+defm V_MED3_F32 : VOP3_Real_si <0x157>;
+defm V_MED3_I32 : VOP3_Real_si <0x158>;
+defm V_MED3_U32 : VOP3_Real_si <0x159>;
+defm V_SAD_U8 : VOP3_Real_si <0x15a>;
+defm V_SAD_HI_U8 : VOP3_Real_si <0x15b>;
+defm V_SAD_U16 : VOP3_Real_si <0x15c>;
+defm V_SAD_U32 : VOP3_Real_si <0x15d>;
+defm V_CVT_PK_U8_F32 : VOP3_Real_si <0x15e>;
+defm V_DIV_FIXUP_F32 : VOP3_Real_si <0x15f>;
+defm V_DIV_FIXUP_F64 : VOP3_Real_si <0x160>;
+defm V_LSHL_B64 : VOP3_Real_si <0x161>;
+defm V_LSHR_B64 : VOP3_Real_si <0x162>;
+defm V_ASHR_I64 : VOP3_Real_si <0x163>;
+defm V_ADD_F64 : VOP3_Real_si <0x164>;
+defm V_MUL_F64 : VOP3_Real_si <0x165>;
+defm V_MIN_F64 : VOP3_Real_si <0x166>;
+defm V_MAX_F64 : VOP3_Real_si <0x167>;
+defm V_LDEXP_F64 : VOP3_Real_si <0x168>;
+defm V_MUL_LO_U32 : VOP3_Real_si <0x169>;
+defm V_MUL_HI_U32 : VOP3_Real_si <0x16a>;
+defm V_MUL_LO_I32 : VOP3_Real_si <0x16b>;
+defm V_MUL_HI_I32 : VOP3_Real_si <0x16c>;
+defm V_DIV_SCALE_F32 : VOP3be_Real_si <0x16d>;
+defm V_DIV_SCALE_F64 : VOP3be_Real_si <0x16e>;
+defm V_DIV_FMAS_F32 : VOP3_Real_si <0x16f>;
+defm V_DIV_FMAS_F64 : VOP3_Real_si <0x170>;
+defm V_MSAD_U8 : VOP3_Real_si <0x171>;
+defm V_MQSAD_PK_U16_U8 : VOP3_Real_si <0x173>;
+defm V_TRIG_PREOP_F64 : VOP3_Real_si <0x174>;
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+// Real VOP3 instruction for CI only. It reuses the SI encoding family and the
+// VOP3e_si encoding of the pseudo named NAME, but emits a "_ci" record that
+// uses the isCIOnly assembler predicate and the "CI" decoder namespace.
+multiclass VOP3_Real_ci<bits<9> op> {
+ def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+ let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
+ }
+}
+
+// CI-only opcodes in the SI encoding family.
+// NOTE(review): V_MQSAD_U16_U8 and V_QSAD_PK_U16_U8 share opcode 0x172 -
+// confirm this is intended.
+defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>;
+defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
+// 0x174 is already V_TRIG_PREOP_F64 in the SI encoding (see the SI list), so
+// V_MQSAD_U32_U8 uses 0x175.
+defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>;
+defm V_MAD_U64_U32 : VOP3_Real_ci <0x176>;
+defm V_MAD_I64_I32 : VOP3_Real_ci <0x177>;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+// Real (encoded) VOP3 instructions for the VI encoding family. Each multiclass
+// looks up the VOP3_Pseudo named NAME and emits a "_vi" record that reuses the
+// pseudo's profile (Pfl). The records use the isVI assembler predicate and the
+// "VI" decoder namespace. Note the opcode is 10 bits wide here (9 for SI).
+let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+
+// VOP3e_vi encoding with the 10-bit opcode `op`.
+multiclass VOP3_Real_vi<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+// Same as VOP3_Real_vi, but with the VOP3be_vi encoding class.
+multiclass VOP3be_Real_vi<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3be_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+
+defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>;
+defm V_MAD_U64_U32 : VOP3_Real_vi <0x176>;
+defm V_MAD_I64_I32 : VOP3_Real_vi <0x177>;
+
+defm V_MAD_LEGACY_F32 : VOP3_Real_vi <0x1c0>;
+defm V_MAD_F32 : VOP3_Real_vi <0x1c1>;
+defm V_MAD_I32_I24 : VOP3_Real_vi <0x1c2>;
+defm V_MAD_U32_U24 : VOP3_Real_vi <0x1c3>;
+defm V_CUBEID_F32 : VOP3_Real_vi <0x1c4>;
+defm V_CUBESC_F32 : VOP3_Real_vi <0x1c5>;
+defm V_CUBETC_F32 : VOP3_Real_vi <0x1c6>;
+defm V_CUBEMA_F32 : VOP3_Real_vi <0x1c7>;
+defm V_BFE_U32 : VOP3_Real_vi <0x1c8>;
+defm V_BFE_I32 : VOP3_Real_vi <0x1c9>;
+defm V_BFI_B32 : VOP3_Real_vi <0x1ca>;
+defm V_FMA_F32 : VOP3_Real_vi <0x1cb>;
+defm V_FMA_F64 : VOP3_Real_vi <0x1cc>;
+defm V_LERP_U8 : VOP3_Real_vi <0x1cd>;
+defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>;
+defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>;
+defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>;
+defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>;
+defm V_MAX3_F32 : VOP3_Real_vi <0x1d3>;
+defm V_MAX3_I32 : VOP3_Real_vi <0x1d4>;
+defm V_MAX3_U32 : VOP3_Real_vi <0x1d5>;
+defm V_MED3_F32 : VOP3_Real_vi <0x1d6>;
+defm V_MED3_I32 : VOP3_Real_vi <0x1d7>;
+defm V_MED3_U32 : VOP3_Real_vi <0x1d8>;
+defm V_SAD_U8 : VOP3_Real_vi <0x1d9>;
+defm V_SAD_HI_U8 : VOP3_Real_vi <0x1da>;
+defm V_SAD_U16 : VOP3_Real_vi <0x1db>;
+defm V_SAD_U32 : VOP3_Real_vi <0x1dc>;
+defm V_CVT_PK_U8_F32 : VOP3_Real_vi <0x1dd>;
+defm V_DIV_FIXUP_F32 : VOP3_Real_vi <0x1de>;
+defm V_DIV_FIXUP_F64 : VOP3_Real_vi <0x1df>;
+defm V_DIV_SCALE_F32 : VOP3be_Real_vi <0x1e0>;
+defm V_DIV_SCALE_F64 : VOP3be_Real_vi <0x1e1>;
+defm V_DIV_FMAS_F32 : VOP3_Real_vi <0x1e2>;
+defm V_DIV_FMAS_F64 : VOP3_Real_vi <0x1e3>;
+defm V_MSAD_U8 : VOP3_Real_vi <0x1e4>;
+defm V_QSAD_PK_U16_U8 : VOP3_Real_vi <0x1e5>;
+defm V_MQSAD_PK_U16_U8 : VOP3_Real_vi <0x1e6>;
+defm V_MQSAD_U32_U8 : VOP3_Real_vi <0x1e7>;
+
+defm V_MAD_F16 : VOP3_Real_vi <0x1ea>;
+defm V_MAD_U16 : VOP3_Real_vi <0x1eb>;
+defm V_MAD_I16 : VOP3_Real_vi <0x1ec>;
+
+defm V_FMA_F16 : VOP3_Real_vi <0x1ee>;
+defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>;
+
+defm V_INTERP_P1LL_F16 : VOP3_Real_vi <0x274>;
+defm V_INTERP_P1LV_F16 : VOP3_Real_vi <0x275>;
+defm V_INTERP_P2_F16 : VOP3_Real_vi <0x276>;
+defm V_ADD_F64 : VOP3_Real_vi <0x280>;
+defm V_MUL_F64 : VOP3_Real_vi <0x281>;
+defm V_MIN_F64 : VOP3_Real_vi <0x282>;
+defm V_MAX_F64 : VOP3_Real_vi <0x283>;
+defm V_LDEXP_F64 : VOP3_Real_vi <0x284>;
+defm V_MUL_LO_U32 : VOP3_Real_vi <0x285>;
+
+// V_MUL_LO_I32 was removed from VI because it is identical to V_MUL_LO_U32.
+// It is kept here as an assembler-parser-only definition that reuses the
+// V_MUL_LO_U32 opcode (0x285).
+let isAsmParserOnly = 1 in {
+defm V_MUL_LO_I32 : VOP3_Real_vi <0x285>;
+}
+
+defm V_MUL_HI_U32 : VOP3_Real_vi <0x286>;
+defm V_MUL_HI_I32 : VOP3_Real_vi <0x287>;
+
+defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>;
+defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>;
+defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>;
+defm V_TRIG_PREOP_F64 : VOP3_Real_vi <0x292>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
new file mode 100644
index 000000000000..c431d9db801e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -0,0 +1,1144 @@
+//===-- VOPCInstructions.td - Vector Instruction Definitions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Encodings
+//===----------------------------------------------------------------------===//
+
+// 32-bit VOPC encoding. From low to high bits: a 9-bit src0, an 8-bit src1,
+// the 8-bit opcode `op`, and a fixed 7-bit value (0x3e).
+class VOPCe <bits<8> op> : Enc32 {
+ bits<9> src0;
+ bits<8> src1;
+
+ let Inst{8-0} = src0;
+ let Inst{16-9} = src1;
+ let Inst{24-17} = op;
+ let Inst{31-25} = 0x3e;
+}
+
+// SDWA variant of the VOPC encoding. The low 32 bits follow the VOPCe layout,
+// except that the src0 field holds the constant 0xf9 (the sdwa marker) and the
+// src1 field is 0 when the profile has no src1. The remaining fields come from
+// VOP_SDWAe; the two destination-related fields are fixed below.
+class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
+ bits<8> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = op;
+ let Inst{31-25} = 0x3e; // encoding
+
+ // VOPC disallows dst_sel and dst_unused as they have no effect on destination
+ let Inst{42-40} = SDWA.DWORD;
+ let Inst{44-43} = SDWA.UNUSED_PRESERVE;
+}
+
+//===----------------------------------------------------------------------===//
+// VOPC classes
+//===----------------------------------------------------------------------===//
+
+// VOPC instructions are a special case because for the 32-bit
+// encoding, we want to display the implicit vcc write as if it were
+// an explicit $dst.
+//
+// Parameters:
+//   sched - scheduling classes, stored in Schedule for the pseudos to use.
+//   vt0   - value type of src0.
+//   vt1   - value type of src1; defaults to vt0.
+// The result type is always i1 and there is no third source (untyped).
+class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> :
+ VOPProfile <[i1, vt0, vt1, untyped]> {
+ // "vcc" is printed literally in the 32-bit asm string; it is not an operand.
+ let Asm32 = "vcc, $src0, $src1";
+ // The destination for 32-bit encoding is implicit.
+ let HasDst32 = 0;
+ // The 64-bit form has an explicit $sdst output operand.
+ let Outs64 = (outs VOPDstS64:$sdst);
+ list<SchedReadWrite> Schedule = sched;
+}
+
+// Pseudo instruction for the 32-bit (e32) form of a VOPC compare. It has no
+// explicit outputs and an empty asm string; the mnemonic and operand string
+// are stored in fields so that VOPC_Real can build the real asm string.
+//   opName  - mnemonic, e.g. "v_cmp_lt_f32".
+//   P       - VOPC profile supplying the 32-bit inputs and asm operands.
+//   pattern - optional selection pattern (empty by default).
+class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> :
+ InstSI<(outs), P.Ins32, "", pattern>,
+ VOP <opName>,
+ SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = P.Asm32;
+
+ let Size = 4;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+
+ let VALU = 1;
+ let VOPC = 1;
+ // Implicit register operands: reads EXEC, writes VCC (there is no explicit
+ // destination operand in this form).
+ let Uses = [EXEC];
+ let Defs = [VCC];
+
+ let SubtargetPredicate = isGCN;
+
+ // Keep the profile reachable from the record.
+ VOPProfile Pfl = P;
+}
+
+// Real (encodable) counterpart of a VOPC_Pseudo for one encoding family. The
+// operand lists are taken from the pseudo and the asm string is built from the
+// pseudo's PseudoInstr and AsmOperands fields.
+//   ps             - the pseudo this real instruction is derived from.
+//   EncodingFamily - SIEncodingFamily value passed on to SIMCInstr.
+class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.PseudoInstr # " " # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
+// SDWA pseudo for VOPC instructions: a VOP_SDWA_Pseudo whose assembler match
+// converter is set to "cvtSdwaVOPC".
+class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ VOP_SDWA_Pseudo <OpName, P, pattern> {
+ let AsmMatchConverter = "cvtSdwaVOPC";
+}
+
+// This class is used only with VOPC instructions. Use $sdst for out operand
+//
+// Defines an assembler alias whose asm string is the pseudo's OpName followed
+// by the profile's 32-bit operand string (Asm32), and which resolves to `inst`.
+//   ps   - VOP3 pseudo supplying the name (and, by default, the profile).
+//   inst - instruction the alias resolves to.
+//   p    - profile used to build the operand list; defaults to ps.Pfl.
+// The result dag is chosen from whether the profile has a 32-bit destination
+// (HasDst32) and from its number of source arguments (NumSrcArgs).
+class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
+ InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
+
+ field bit isCompare;
+ field bit isCommutable;
+
+ let ResultInst =
+ !if (p.HasDst32,
+ !if (!eq(p.NumSrcArgs, 0),
+ // 1 dst, 0 src
+ (inst p.DstRC:$sdst),
+ !if (!eq(p.NumSrcArgs, 1),
+ // 1 dst, 1 src
+ (inst p.DstRC:$sdst, p.Src0RC32:$src0),
+ !if (!eq(p.NumSrcArgs, 2),
+ // 1 dst, 2 src
+ (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1),
+ // else - unreachable
+ (inst)))),
+ // else
+ !if (!eq(p.NumSrcArgs, 2),
+ // 0 dst, 2 src
+ (inst p.Src0RC32:$src0, p.Src1RC32:$src1),
+ !if (!eq(p.NumSrcArgs, 1),
+ // 0 dst, 1 src
+ // The only source is source 0, so it is named $src0 like in every
+ // other branch.
+ (inst p.Src0RC32:$src0),
+ // else
+ // 0 dst, 0 src
+ (inst))));
+
+ let AsmVariantName = AMDGPUAsmVariants.Default;
+ let SubtargetPredicate = AssemblerPredicate;
+}
+
+// Defines the three pseudos of one VOPC compare: NAME_e32, NAME_e64 and
+// NAME_sdwa.
+//   opName  - mnemonic.
+//   P       - VOPC profile (operand types and scheduling info).
+//   cond    - condition code matched by the _e64 setcc pattern; COND_NULL by
+//             default.
+//   revOp   - mnemonic passed to Commutable_REV (with the variant suffix
+//             appended); when it equals opName the second Commutable_REV
+//             argument is true.
+//   DefExec - when set, EXEC is added to the implicit defs (used by the
+//             VOPCX_* wrappers in this file).
+multiclass VOPC_Pseudos <string opName,
+ VOPC_Profile P,
+ PatLeaf cond = COND_NULL,
+ string revOp = opName,
+ bit DefExec = 0> {
+
+ // 32-bit form: the result goes to VCC implicitly.
+ def _e32 : VOPC_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = P.Schedule;
+ let isConvergent = DefExec;
+ let isCompare = 1;
+ let isCommutable = 1;
+ }
+
+ // 64-bit form: explicit $sdst result, so VCC is not in Defs. This is the
+ // only variant with a selection pattern; the pattern with source modifiers
+ // is used when the profile has modifiers.
+ def _e64 : VOP3_Pseudo<opName, P,
+ !if(P.HasModifiers,
+ [(set i1:$sdst,
+ (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ cond))],
+ [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))])>,
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
+ let Defs = !if(DefExec, [EXEC], []);
+ let SchedRW = P.Schedule;
+ let isCompare = 1;
+ let isCommutable = 1;
+ }
+
+ // SDWA form: same implicit defs and flags as the 32-bit form.
+ def _sdwa : VOPC_SDWA_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = P.Schedule;
+ let isConvergent = DefExec;
+ let isCompare = 1;
+ let isCommutable = 1;
+ }
+}
+
+// Compare profiles: i1 result, both sources of the given type. The first
+// argument is the scheduling class list stored in the profile's Schedule.
+def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
+def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>;
+def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>;
+def VOPC_I1_I16_I16 : VOPC_Profile<[Write32Bit], i16>;
+def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>;
+def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>;
+
+// Per-type shorthands for VOPC_Pseudos.
+// VOPC_<type>: takes an optional condition code and reverse-op name, and
+// passes DefExec = 0.
+multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
+
+multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>;
+
+multiclass VOPC_F64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
+
+multiclass VOPC_I16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>;
+
+multiclass VOPC_I32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
+
+multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+
+// VOPCX_<type>: no condition parameter (always COND_NULL, so the _e64 pattern
+// is never given a real condition) and DefExec = 1, i.e. EXEC is added to the
+// implicit defs.
+multiclass VOPCX_F16 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F16_F16, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_F32 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F32_F32, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_F64 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F64_F64, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_I16 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I16_I16, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_I32 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I32_I32, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_I64 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I64_I64, COND_NULL, revOp, 1>;
+
+
+//===----------------------------------------------------------------------===//
+// Compare instructions
+//===----------------------------------------------------------------------===//
+
+defm V_CMP_F_F32 : VOPC_F32 <"v_cmp_f_f32">;
+defm V_CMP_LT_F32 : VOPC_F32 <"v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">;
+defm V_CMP_EQ_F32 : VOPC_F32 <"v_cmp_eq_f32", COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_F32 <"v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">;
+defm V_CMP_GT_F32 : VOPC_F32 <"v_cmp_gt_f32", COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_F32 <"v_cmp_lg_f32", COND_ONE>;
+defm V_CMP_GE_F32 : VOPC_F32 <"v_cmp_ge_f32", COND_OGE>;
+defm V_CMP_O_F32 : VOPC_F32 <"v_cmp_o_f32", COND_O>;
+defm V_CMP_U_F32 : VOPC_F32 <"v_cmp_u_f32", COND_UO>;
+defm V_CMP_NGE_F32 : VOPC_F32 <"v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">;
+defm V_CMP_NLG_F32 : VOPC_F32 <"v_cmp_nlg_f32", COND_UEQ>;
+defm V_CMP_NGT_F32 : VOPC_F32 <"v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">;
+defm V_CMP_NLE_F32 : VOPC_F32 <"v_cmp_nle_f32", COND_UGT>;
+defm V_CMP_NEQ_F32 : VOPC_F32 <"v_cmp_neq_f32", COND_UNE>;
+defm V_CMP_NLT_F32 : VOPC_F32 <"v_cmp_nlt_f32", COND_UGE>;
+defm V_CMP_TRU_F32 : VOPC_F32 <"v_cmp_tru_f32">;
+
+defm V_CMPX_F_F32 : VOPCX_F32 <"v_cmpx_f_f32">;
+defm V_CMPX_LT_F32 : VOPCX_F32 <"v_cmpx_lt_f32", "v_cmpx_gt_f32">;
+defm V_CMPX_EQ_F32 : VOPCX_F32 <"v_cmpx_eq_f32">;
+defm V_CMPX_LE_F32 : VOPCX_F32 <"v_cmpx_le_f32", "v_cmpx_ge_f32">;
+defm V_CMPX_GT_F32 : VOPCX_F32 <"v_cmpx_gt_f32">;
+defm V_CMPX_LG_F32 : VOPCX_F32 <"v_cmpx_lg_f32">;
+defm V_CMPX_GE_F32 : VOPCX_F32 <"v_cmpx_ge_f32">;
+defm V_CMPX_O_F32 : VOPCX_F32 <"v_cmpx_o_f32">;
+defm V_CMPX_U_F32 : VOPCX_F32 <"v_cmpx_u_f32">;
+defm V_CMPX_NGE_F32 : VOPCX_F32 <"v_cmpx_nge_f32", "v_cmpx_nle_f32">;
+defm V_CMPX_NLG_F32 : VOPCX_F32 <"v_cmpx_nlg_f32">;
+defm V_CMPX_NGT_F32 : VOPCX_F32 <"v_cmpx_ngt_f32", "v_cmpx_nlt_f32">;
+defm V_CMPX_NLE_F32 : VOPCX_F32 <"v_cmpx_nle_f32">;
+defm V_CMPX_NEQ_F32 : VOPCX_F32 <"v_cmpx_neq_f32">;
+defm V_CMPX_NLT_F32 : VOPCX_F32 <"v_cmpx_nlt_f32">;
+defm V_CMPX_TRU_F32 : VOPCX_F32 <"v_cmpx_tru_f32">;
+
+defm V_CMP_F_F64 : VOPC_F64 <"v_cmp_f_f64">;
+defm V_CMP_LT_F64 : VOPC_F64 <"v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">;
+defm V_CMP_EQ_F64 : VOPC_F64 <"v_cmp_eq_f64", COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_F64 <"v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">;
+defm V_CMP_GT_F64 : VOPC_F64 <"v_cmp_gt_f64", COND_OGT>;
+defm V_CMP_LG_F64 : VOPC_F64 <"v_cmp_lg_f64", COND_ONE>;
+defm V_CMP_GE_F64 : VOPC_F64 <"v_cmp_ge_f64", COND_OGE>;
+defm V_CMP_O_F64 : VOPC_F64 <"v_cmp_o_f64", COND_O>;
+defm V_CMP_U_F64 : VOPC_F64 <"v_cmp_u_f64", COND_UO>;
+defm V_CMP_NGE_F64 : VOPC_F64 <"v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">;
+defm V_CMP_NLG_F64 : VOPC_F64 <"v_cmp_nlg_f64", COND_UEQ>;
+defm V_CMP_NGT_F64 : VOPC_F64 <"v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">;
+defm V_CMP_NLE_F64 : VOPC_F64 <"v_cmp_nle_f64", COND_UGT>;
+defm V_CMP_NEQ_F64 : VOPC_F64 <"v_cmp_neq_f64", COND_UNE>;
+defm V_CMP_NLT_F64 : VOPC_F64 <"v_cmp_nlt_f64", COND_UGE>;
+defm V_CMP_TRU_F64 : VOPC_F64 <"v_cmp_tru_f64">;
+
+defm V_CMPX_F_F64 : VOPCX_F64 <"v_cmpx_f_f64">;
+defm V_CMPX_LT_F64 : VOPCX_F64 <"v_cmpx_lt_f64", "v_cmpx_gt_f64">;
+defm V_CMPX_EQ_F64 : VOPCX_F64 <"v_cmpx_eq_f64">;
+defm V_CMPX_LE_F64 : VOPCX_F64 <"v_cmpx_le_f64", "v_cmpx_ge_f64">;
+defm V_CMPX_GT_F64 : VOPCX_F64 <"v_cmpx_gt_f64">;
+defm V_CMPX_LG_F64 : VOPCX_F64 <"v_cmpx_lg_f64">;
+defm V_CMPX_GE_F64 : VOPCX_F64 <"v_cmpx_ge_f64">;
+defm V_CMPX_O_F64 : VOPCX_F64 <"v_cmpx_o_f64">;
+defm V_CMPX_U_F64 : VOPCX_F64 <"v_cmpx_u_f64">;
+defm V_CMPX_NGE_F64 : VOPCX_F64 <"v_cmpx_nge_f64", "v_cmpx_nle_f64">;
+defm V_CMPX_NLG_F64 : VOPCX_F64 <"v_cmpx_nlg_f64">;
+defm V_CMPX_NGT_F64 : VOPCX_F64 <"v_cmpx_ngt_f64", "v_cmpx_nlt_f64">;
+defm V_CMPX_NLE_F64 : VOPCX_F64 <"v_cmpx_nle_f64">;
+defm V_CMPX_NEQ_F64 : VOPCX_F64 <"v_cmpx_neq_f64">;
+defm V_CMPX_NLT_F64 : VOPCX_F64 <"v_cmpx_nlt_f64">;
+defm V_CMPX_TRU_F64 : VOPCX_F64 <"v_cmpx_tru_f64">;
+
+let SubtargetPredicate = isSICI in {
+
+defm V_CMPS_F_F32 : VOPC_F32 <"v_cmps_f_f32">;
+defm V_CMPS_LT_F32 : VOPC_F32 <"v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">;
+defm V_CMPS_EQ_F32 : VOPC_F32 <"v_cmps_eq_f32">;
+defm V_CMPS_LE_F32 : VOPC_F32 <"v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">;
+defm V_CMPS_GT_F32 : VOPC_F32 <"v_cmps_gt_f32">;
+defm V_CMPS_LG_F32 : VOPC_F32 <"v_cmps_lg_f32">;
+defm V_CMPS_GE_F32 : VOPC_F32 <"v_cmps_ge_f32">;
+defm V_CMPS_O_F32 : VOPC_F32 <"v_cmps_o_f32">;
+defm V_CMPS_U_F32 : VOPC_F32 <"v_cmps_u_f32">;
+defm V_CMPS_NGE_F32 : VOPC_F32 <"v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">;
+defm V_CMPS_NLG_F32 : VOPC_F32 <"v_cmps_nlg_f32">;
+defm V_CMPS_NGT_F32 : VOPC_F32 <"v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">;
+defm V_CMPS_NLE_F32 : VOPC_F32 <"v_cmps_nle_f32">;
+defm V_CMPS_NEQ_F32 : VOPC_F32 <"v_cmps_neq_f32">;
+defm V_CMPS_NLT_F32 : VOPC_F32 <"v_cmps_nlt_f32">;
+defm V_CMPS_TRU_F32 : VOPC_F32 <"v_cmps_tru_f32">;
+
+defm V_CMPSX_F_F32 : VOPCX_F32 <"v_cmpsx_f_f32">;
+defm V_CMPSX_LT_F32 : VOPCX_F32 <"v_cmpsx_lt_f32", "v_cmpsx_gt_f32">;
+defm V_CMPSX_EQ_F32 : VOPCX_F32 <"v_cmpsx_eq_f32">;
+defm V_CMPSX_LE_F32 : VOPCX_F32 <"v_cmpsx_le_f32", "v_cmpsx_ge_f32">;
+defm V_CMPSX_GT_F32 : VOPCX_F32 <"v_cmpsx_gt_f32">;
+defm V_CMPSX_LG_F32 : VOPCX_F32 <"v_cmpsx_lg_f32">;
+defm V_CMPSX_GE_F32 : VOPCX_F32 <"v_cmpsx_ge_f32">;
+defm V_CMPSX_O_F32 : VOPCX_F32 <"v_cmpsx_o_f32">;
+defm V_CMPSX_U_F32 : VOPCX_F32 <"v_cmpsx_u_f32">;
+defm V_CMPSX_NGE_F32 : VOPCX_F32 <"v_cmpsx_nge_f32", "v_cmpsx_nle_f32">;
+defm V_CMPSX_NLG_F32 : VOPCX_F32 <"v_cmpsx_nlg_f32">;
+defm V_CMPSX_NGT_F32 : VOPCX_F32 <"v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">;
+defm V_CMPSX_NLE_F32 : VOPCX_F32 <"v_cmpsx_nle_f32">;
+defm V_CMPSX_NEQ_F32 : VOPCX_F32 <"v_cmpsx_neq_f32">;
+defm V_CMPSX_NLT_F32 : VOPCX_F32 <"v_cmpsx_nlt_f32">;
+defm V_CMPSX_TRU_F32 : VOPCX_F32 <"v_cmpsx_tru_f32">;
+
+defm V_CMPS_F_F64 : VOPC_F64 <"v_cmps_f_f64">;
+defm V_CMPS_LT_F64 : VOPC_F64 <"v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">;
+defm V_CMPS_EQ_F64 : VOPC_F64 <"v_cmps_eq_f64">;
+defm V_CMPS_LE_F64 : VOPC_F64 <"v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">;
+defm V_CMPS_GT_F64 : VOPC_F64 <"v_cmps_gt_f64">;
+defm V_CMPS_LG_F64 : VOPC_F64 <"v_cmps_lg_f64">;
+defm V_CMPS_GE_F64 : VOPC_F64 <"v_cmps_ge_f64">;
+defm V_CMPS_O_F64 : VOPC_F64 <"v_cmps_o_f64">;
+defm V_CMPS_U_F64 : VOPC_F64 <"v_cmps_u_f64">;
+defm V_CMPS_NGE_F64 : VOPC_F64 <"v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">;
+defm V_CMPS_NLG_F64 : VOPC_F64 <"v_cmps_nlg_f64">;
+defm V_CMPS_NGT_F64 : VOPC_F64 <"v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">;
+defm V_CMPS_NLE_F64 : VOPC_F64 <"v_cmps_nle_f64">;
+defm V_CMPS_NEQ_F64 : VOPC_F64 <"v_cmps_neq_f64">;
+defm V_CMPS_NLT_F64 : VOPC_F64 <"v_cmps_nlt_f64">;
+defm V_CMPS_TRU_F64 : VOPC_F64 <"v_cmps_tru_f64">;
+
+defm V_CMPSX_F_F64 : VOPCX_F64 <"v_cmpsx_f_f64">;
+defm V_CMPSX_LT_F64 : VOPCX_F64 <"v_cmpsx_lt_f64", "v_cmpsx_gt_f64">;
+defm V_CMPSX_EQ_F64 : VOPCX_F64 <"v_cmpsx_eq_f64">;
+defm V_CMPSX_LE_F64 : VOPCX_F64 <"v_cmpsx_le_f64", "v_cmpsx_ge_f64">;
+defm V_CMPSX_GT_F64 : VOPCX_F64 <"v_cmpsx_gt_f64">;
+defm V_CMPSX_LG_F64 : VOPCX_F64 <"v_cmpsx_lg_f64">;
+defm V_CMPSX_GE_F64 : VOPCX_F64 <"v_cmpsx_ge_f64">;
+defm V_CMPSX_O_F64 : VOPCX_F64 <"v_cmpsx_o_f64">;
+defm V_CMPSX_U_F64 : VOPCX_F64 <"v_cmpsx_u_f64">;
+defm V_CMPSX_NGE_F64 : VOPCX_F64 <"v_cmpsx_nge_f64", "v_cmpsx_nle_f64">;
+defm V_CMPSX_NLG_F64 : VOPCX_F64 <"v_cmpsx_nlg_f64">;
+defm V_CMPSX_NGT_F64 : VOPCX_F64 <"v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">;
+defm V_CMPSX_NLE_F64 : VOPCX_F64 <"v_cmpsx_nle_f64">;
+defm V_CMPSX_NEQ_F64 : VOPCX_F64 <"v_cmpsx_neq_f64">;
+defm V_CMPSX_NLT_F64 : VOPCX_F64 <"v_cmpsx_nlt_f64">;
+defm V_CMPSX_TRU_F64 : VOPCX_F64 <"v_cmpsx_tru_f64">;
+
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = Has16BitInsts in {
+
+defm V_CMP_F_F16 : VOPC_F16 <"v_cmp_f_f16">;
+defm V_CMP_LT_F16 : VOPC_F16 <"v_cmp_lt_f16", COND_OLT, "v_cmp_gt_f16">;
+defm V_CMP_EQ_F16 : VOPC_F16 <"v_cmp_eq_f16", COND_OEQ>;
+defm V_CMP_LE_F16 : VOPC_F16 <"v_cmp_le_f16", COND_OLE, "v_cmp_ge_f16">;
+defm V_CMP_GT_F16 : VOPC_F16 <"v_cmp_gt_f16", COND_OGT>;
+defm V_CMP_LG_F16 : VOPC_F16 <"v_cmp_lg_f16", COND_ONE>;
+defm V_CMP_GE_F16 : VOPC_F16 <"v_cmp_ge_f16", COND_OGE>;
+defm V_CMP_O_F16 : VOPC_F16 <"v_cmp_o_f16", COND_O>;
+defm V_CMP_U_F16 : VOPC_F16 <"v_cmp_u_f16", COND_UO>;
+defm V_CMP_NGE_F16 : VOPC_F16 <"v_cmp_nge_f16", COND_ULT, "v_cmp_nle_f16">;
+defm V_CMP_NLG_F16 : VOPC_F16 <"v_cmp_nlg_f16", COND_UEQ>;
+defm V_CMP_NGT_F16 : VOPC_F16 <"v_cmp_ngt_f16", COND_ULE, "v_cmp_nlt_f16">;
+defm V_CMP_NLE_F16 : VOPC_F16 <"v_cmp_nle_f16", COND_UGT>;
+defm V_CMP_NEQ_F16 : VOPC_F16 <"v_cmp_neq_f16", COND_UNE>;
+defm V_CMP_NLT_F16 : VOPC_F16 <"v_cmp_nlt_f16", COND_UGE>;
+defm V_CMP_TRU_F16 : VOPC_F16 <"v_cmp_tru_f16">;
+
+defm V_CMPX_F_F16 : VOPCX_F16 <"v_cmpx_f_f16">;
+defm V_CMPX_LT_F16 : VOPCX_F16 <"v_cmpx_lt_f16", "v_cmpx_gt_f16">;
+defm V_CMPX_EQ_F16 : VOPCX_F16 <"v_cmpx_eq_f16">;
+defm V_CMPX_LE_F16 : VOPCX_F16 <"v_cmpx_le_f16", "v_cmpx_ge_f16">;
+defm V_CMPX_GT_F16 : VOPCX_F16 <"v_cmpx_gt_f16">;
+defm V_CMPX_LG_F16 : VOPCX_F16 <"v_cmpx_lg_f16">;
+defm V_CMPX_GE_F16 : VOPCX_F16 <"v_cmpx_ge_f16">;
+defm V_CMPX_O_F16 : VOPCX_F16 <"v_cmpx_o_f16">;
+defm V_CMPX_U_F16 : VOPCX_F16 <"v_cmpx_u_f16">;
+defm V_CMPX_NGE_F16 : VOPCX_F16 <"v_cmpx_nge_f16", "v_cmpx_nle_f16">;
+defm V_CMPX_NLG_F16 : VOPCX_F16 <"v_cmpx_nlg_f16">;
+defm V_CMPX_NGT_F16 : VOPCX_F16 <"v_cmpx_ngt_f16", "v_cmpx_nlt_f16">;
+defm V_CMPX_NLE_F16 : VOPCX_F16 <"v_cmpx_nle_f16">;
+defm V_CMPX_NEQ_F16 : VOPCX_F16 <"v_cmpx_neq_f16">;
+defm V_CMPX_NLT_F16 : VOPCX_F16 <"v_cmpx_nlt_f16">;
+defm V_CMPX_TRU_F16 : VOPCX_F16 <"v_cmpx_tru_f16">;
+
+defm V_CMP_F_I16 : VOPC_I16 <"v_cmp_f_i16">;
+defm V_CMP_LT_I16 : VOPC_I16 <"v_cmp_lt_i16", COND_SLT, "v_cmp_gt_i16">;
+defm V_CMP_EQ_I16 : VOPC_I16 <"v_cmp_eq_i16">;
+defm V_CMP_LE_I16 : VOPC_I16 <"v_cmp_le_i16", COND_SLE, "v_cmp_ge_i16">;
+defm V_CMP_GT_I16 : VOPC_I16 <"v_cmp_gt_i16", COND_SGT>;
+defm V_CMP_NE_I16 : VOPC_I16 <"v_cmp_ne_i16">;
+defm V_CMP_GE_I16 : VOPC_I16 <"v_cmp_ge_i16", COND_SGE>;
+defm V_CMP_T_I16 : VOPC_I16 <"v_cmp_t_i16">;
+
+defm V_CMP_F_U16 : VOPC_I16 <"v_cmp_f_u16">;
+defm V_CMP_LT_U16 : VOPC_I16 <"v_cmp_lt_u16", COND_ULT, "v_cmp_gt_u16">;
+defm V_CMP_EQ_U16 : VOPC_I16 <"v_cmp_eq_u16", COND_EQ>;
+defm V_CMP_LE_U16 : VOPC_I16 <"v_cmp_le_u16", COND_ULE, "v_cmp_ge_u16">;
+defm V_CMP_GT_U16 : VOPC_I16 <"v_cmp_gt_u16", COND_UGT>;
+defm V_CMP_NE_U16 : VOPC_I16 <"v_cmp_ne_u16", COND_NE>;
+defm V_CMP_GE_U16 : VOPC_I16 <"v_cmp_ge_u16", COND_UGE>;
+defm V_CMP_T_U16 : VOPC_I16 <"v_cmp_t_u16">;
+
+defm V_CMPX_F_I16 : VOPCX_I16 <"v_cmpx_f_i16">;
+defm V_CMPX_LT_I16 : VOPCX_I16 <"v_cmpx_lt_i16", "v_cmpx_gt_i16">;
+defm V_CMPX_EQ_I16 : VOPCX_I16 <"v_cmpx_eq_i16">;
+defm V_CMPX_LE_I16 : VOPCX_I16 <"v_cmpx_le_i16", "v_cmpx_ge_i16">;
+defm V_CMPX_GT_I16 : VOPCX_I16 <"v_cmpx_gt_i16">;
+defm V_CMPX_NE_I16 : VOPCX_I16 <"v_cmpx_ne_i16">;
+defm V_CMPX_GE_I16 : VOPCX_I16 <"v_cmpx_ge_i16">;
+defm V_CMPX_T_I16 : VOPCX_I16 <"v_cmpx_t_i16">;
+defm V_CMPX_F_U16 : VOPCX_I16 <"v_cmpx_f_u16">;
+
+defm V_CMPX_LT_U16 : VOPCX_I16 <"v_cmpx_lt_u16", "v_cmpx_gt_u16">;
+defm V_CMPX_EQ_U16 : VOPCX_I16 <"v_cmpx_eq_u16">;
+defm V_CMPX_LE_U16 : VOPCX_I16 <"v_cmpx_le_u16", "v_cmpx_ge_u16">;
+defm V_CMPX_GT_U16 : VOPCX_I16 <"v_cmpx_gt_u16">;
+defm V_CMPX_NE_U16 : VOPCX_I16 <"v_cmpx_ne_u16">;
+defm V_CMPX_GE_U16 : VOPCX_I16 <"v_cmpx_ge_u16">;
+defm V_CMPX_T_U16 : VOPCX_I16 <"v_cmpx_t_u16">;
+
+} // End SubtargetPredicate = Has16BitInsts
+
+defm V_CMP_F_I32 : VOPC_I32 <"v_cmp_f_i32">;
+defm V_CMP_LT_I32 : VOPC_I32 <"v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">;
+defm V_CMP_EQ_I32 : VOPC_I32 <"v_cmp_eq_i32">;
+defm V_CMP_LE_I32 : VOPC_I32 <"v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">;
+defm V_CMP_GT_I32 : VOPC_I32 <"v_cmp_gt_i32", COND_SGT>;
+defm V_CMP_NE_I32 : VOPC_I32 <"v_cmp_ne_i32">;
+defm V_CMP_GE_I32 : VOPC_I32 <"v_cmp_ge_i32", COND_SGE>;
+defm V_CMP_T_I32 : VOPC_I32 <"v_cmp_t_i32">;
+
+defm V_CMPX_F_I32 : VOPCX_I32 <"v_cmpx_f_i32">;
+defm V_CMPX_LT_I32 : VOPCX_I32 <"v_cmpx_lt_i32", "v_cmpx_gt_i32">;
+defm V_CMPX_EQ_I32 : VOPCX_I32 <"v_cmpx_eq_i32">;
+defm V_CMPX_LE_I32 : VOPCX_I32 <"v_cmpx_le_i32", "v_cmpx_ge_i32">;
+defm V_CMPX_GT_I32 : VOPCX_I32 <"v_cmpx_gt_i32">;
+defm V_CMPX_NE_I32 : VOPCX_I32 <"v_cmpx_ne_i32">;
+defm V_CMPX_GE_I32 : VOPCX_I32 <"v_cmpx_ge_i32">;
+defm V_CMPX_T_I32 : VOPCX_I32 <"v_cmpx_t_i32">;
+
+defm V_CMP_F_I64 : VOPC_I64 <"v_cmp_f_i64">;
+defm V_CMP_LT_I64 : VOPC_I64 <"v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">;
+defm V_CMP_EQ_I64 : VOPC_I64 <"v_cmp_eq_i64">;
+defm V_CMP_LE_I64 : VOPC_I64 <"v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">;
+defm V_CMP_GT_I64 : VOPC_I64 <"v_cmp_gt_i64", COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_I64 <"v_cmp_ne_i64">;
+defm V_CMP_GE_I64 : VOPC_I64 <"v_cmp_ge_i64", COND_SGE>;
+defm V_CMP_T_I64 : VOPC_I64 <"v_cmp_t_i64">;
+
+defm V_CMPX_F_I64 : VOPCX_I64 <"v_cmpx_f_i64">;
+defm V_CMPX_LT_I64 : VOPCX_I64 <"v_cmpx_lt_i64", "v_cmpx_gt_i64">;
+defm V_CMPX_EQ_I64 : VOPCX_I64 <"v_cmpx_eq_i64">;
+defm V_CMPX_LE_I64 : VOPCX_I64 <"v_cmpx_le_i64", "v_cmpx_ge_i64">;
+defm V_CMPX_GT_I64 : VOPCX_I64 <"v_cmpx_gt_i64">;
+defm V_CMPX_NE_I64 : VOPCX_I64 <"v_cmpx_ne_i64">;
+defm V_CMPX_GE_I64 : VOPCX_I64 <"v_cmpx_ge_i64">;
+defm V_CMPX_T_I64 : VOPCX_I64 <"v_cmpx_t_i64">;
+
+defm V_CMP_F_U32 : VOPC_I32 <"v_cmp_f_u32">;
+defm V_CMP_LT_U32 : VOPC_I32 <"v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">;
+defm V_CMP_EQ_U32 : VOPC_I32 <"v_cmp_eq_u32", COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_I32 <"v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">;
+defm V_CMP_GT_U32 : VOPC_I32 <"v_cmp_gt_u32", COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_I32 <"v_cmp_ne_u32", COND_NE>;
+defm V_CMP_GE_U32 : VOPC_I32 <"v_cmp_ge_u32", COND_UGE>;
+defm V_CMP_T_U32 : VOPC_I32 <"v_cmp_t_u32">;
+
+defm V_CMPX_F_U32 : VOPCX_I32 <"v_cmpx_f_u32">;
+defm V_CMPX_LT_U32 : VOPCX_I32 <"v_cmpx_lt_u32", "v_cmpx_gt_u32">;
+defm V_CMPX_EQ_U32 : VOPCX_I32 <"v_cmpx_eq_u32">;
+defm V_CMPX_LE_U32 : VOPCX_I32 <"v_cmpx_le_u32", "v_cmpx_ge_u32">;
+defm V_CMPX_GT_U32 : VOPCX_I32 <"v_cmpx_gt_u32">;
+defm V_CMPX_NE_U32 : VOPCX_I32 <"v_cmpx_ne_u32">;
+defm V_CMPX_GE_U32 : VOPCX_I32 <"v_cmpx_ge_u32">;
+defm V_CMPX_T_U32 : VOPCX_I32 <"v_cmpx_t_u32">;
+
+defm V_CMP_F_U64 : VOPC_I64 <"v_cmp_f_u64">;
+defm V_CMP_LT_U64 : VOPC_I64 <"v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">;
+defm V_CMP_EQ_U64 : VOPC_I64 <"v_cmp_eq_u64", COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_I64 <"v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">;
+defm V_CMP_GT_U64 : VOPC_I64 <"v_cmp_gt_u64", COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_I64 <"v_cmp_ne_u64", COND_NE>;
+defm V_CMP_GE_U64 : VOPC_I64 <"v_cmp_ge_u64", COND_UGE>;
+defm V_CMP_T_U64 : VOPC_I64 <"v_cmp_t_u64">;
+
+defm V_CMPX_F_U64 : VOPCX_I64 <"v_cmpx_f_u64">;
+defm V_CMPX_LT_U64 : VOPCX_I64 <"v_cmpx_lt_u64", "v_cmpx_gt_u64">;
+defm V_CMPX_EQ_U64 : VOPCX_I64 <"v_cmpx_eq_u64">;
+defm V_CMPX_LE_U64 : VOPCX_I64 <"v_cmpx_le_u64", "v_cmpx_ge_u64">;
+defm V_CMPX_GT_U64 : VOPCX_I64 <"v_cmpx_gt_u64">;
+defm V_CMPX_NE_U64 : VOPCX_I64 <"v_cmpx_ne_u64">;
+defm V_CMPX_GE_U64 : VOPCX_I64 <"v_cmpx_ge_u64">;
+defm V_CMPX_T_U64 : VOPCX_I64 <"v_cmpx_t_u64">;
+
+//===----------------------------------------------------------------------===//
+// Class instructions
+//===----------------------------------------------------------------------===//
+
+// Profile for the class-test compares: src0 has type vt and src1 is always
+// i32. In the 64-bit form only src0 takes a modifiers operand; clamp and omod
+// are disabled. The SDWA form lists modifier operands for both sources.
+//   sched - scheduling classes (forwarded to VOPC_Profile).
+//   vt    - value type of src0.
+class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
+ VOPC_Profile<sched, vt, i32> {
+ let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+ let Asm64 = "$sdst, $src0_modifiers, $src1";
+ let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0,
+ Int32InputMods:$src1_modifiers, Src1RC64:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
+ let HasSrc1Mods = 0;
+ let HasClamp = 0;
+ let HasOMod = 0;
+}
+
+// Builds the selection pattern for the 64-bit form of a class-test compare:
+// AMDGPUfp_class applied to src0 (matched through VOP3Mods0Clamp0OMod together
+// with its modifiers) and an unmodified src1, producing an i1 in $sdst. The
+// pattern list is returned in `ret`.
+class getVOPCClassPat64 <VOPProfile P> {
+ list<dag> ret =
+ [(set i1:$sdst,
+ (AMDGPUfp_class
+ (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)),
+ P.Src1VT:$src1))];
+}
+
+// Special case for class instructions which only have modifiers on
+// the 1st source operand.
+//
+// Defines NAME_e32, NAME_e64 and NAME_sdwa like VOPC_Pseudos, but without the
+// Commutable_REV mapping and without setting isCompare / isCommutable. The
+// _e64 pattern comes from getVOPCClassPat64.
+//   opName  - mnemonic.
+//   p       - profile (the instantiations in this file pass a
+//             VOPC_Class_Profile).
+//   DefExec - when set, EXEC is added to the implicit defs.
+multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
+ def _e32 : VOPC_Pseudo <opName, p> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = p.Schedule;
+ let isConvergent = DefExec;
+ }
+
+ // Explicit $sdst result, so VCC is not in Defs.
+ def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret> {
+ let Defs = !if(DefExec, [EXEC], []);
+ let SchedRW = p.Schedule;
+ }
+
+ def _sdwa : VOPC_SDWA_Pseudo <opName, p> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = p.Schedule;
+ let isConvergent = DefExec;
+ }
+}
+
+def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
+def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>;
+def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>;
+
+multiclass VOPC_CLASS_F16 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>;
+
+// The f16 class test takes an f16 src0, so it uses the F16 class profile (the
+// same one as VOPC_CLASS_F16). DefExec = 1, as for the other VOPCX_CLASS_*
+// wrappers.
+multiclass VOPCX_CLASS_F16 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>;
+
+multiclass VOPC_CLASS_F32 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>;
+
+multiclass VOPCX_CLASS_F32 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>;
+
+multiclass VOPC_CLASS_F64 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>;
+
+multiclass VOPCX_CLASS_F64 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 1>;
+
+// Class-compare pseudos (_e32, _e64 and _sdwa for each) in plain and cmpx
+// form for f32, f64 and f16.
+defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
+defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
+defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
+defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">;
+defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">;
+defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
+
+//===----------------------------------------------------------------------===//
+// V_ICMPIntrinsic Pattern.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isGCN] in {
+
+// Selects `inst` for an AMDGPUsetcc of two `vt` operands under condition
+// `cond`. The operands are passed to the instruction unmodified.
+class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+ (AMDGPUsetcc vt:$src0, vt:$src1, cond),
+ (inst $src0, $src1)
+>;
+
+// 32-bit integer compares. EQ/NE and the unsigned conditions select the U32
+// opcodes; the signed conditions select the I32 opcodes.
+def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
+def : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>;
+def : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>;
+def : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>;
+def : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>;
+def : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>;
+def : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>;
+def : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>;
+def : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>;
+def : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>;
+
+// 64-bit integer compares, same condition-to-opcode scheme as above.
+def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>;
+def : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>;
+def : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>;
+def : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>;
+def : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>;
+def : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>;
+def : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>;
+def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
+def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
+def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
+
+// Selects `inst` for an AMDGPUsetcc of two `vt` floating-point operands under
+// condition `cond`, with an i64 result. Both operands are matched through
+// VOP3Mods so their source modifiers are passed to the instruction; clamp and
+// omod are fixed to DSTCLAMP.NONE / DSTOMOD.NONE.
+class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+ (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Ordered f32 compares.
+// NOTE(review): COND_ONE here and COND_UNE further down both select
+// V_CMP_NEQ; whether the ordered form should select V_CMP_LG instead should
+// be confirmed against the ISA definition of NEQ vs. LG.
+def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
+def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>;
+def : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>;
+def : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>;
+def : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>;
+def : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>;
+
+// Ordered f64 compares (same COND_ONE note as for f32 applies).
+def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>;
+def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>;
+def : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>;
+def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>;
+def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>;
+def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>;
+
+// Unordered f32 compares, selected via the N-prefixed opcodes
+// (e.g. UGT -> NLE, ULT -> NGE).
+def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>;
+def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>;
+def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>;
+def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>;
+def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>;
+def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>;
+
+// Unordered f64 compares.
+def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>;
+def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>;
+def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>;
+def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
+def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
+def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
+
+} // End Predicates = [isGCN]
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+// Defines the encoded SI/CI instructions for the VOPC pseudos named
+// NAME#"_e32" and NAME#"_e64".
+// op - 9-bit opcode used for the VOP3 encoding; its low 8 bits are used for
+// the 32-bit VOPC encoding.
+multiclass VOPC_Real_si <bits<9> op> {
+ let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+ // 32-bit VOPC encoding.
+ def _e32_si :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOPCe<op{7-0}>;
+
+ // 64-bit VOP3 encoding, using the profile recorded on the pseudo (Pfl).
+ def _e64_si :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3a_si <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3
+ // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ }
+ // Alias between the _e64 pseudo and the SI e32 instruction, restricted to
+ // SI/CI. (VOPCInstAlias is defined elsewhere; presumably it lets the
+ // assembler accept the alternate operand syntax -- confirm.)
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
+ !cast<Instruction>(NAME#"_e32_si")> {
+ let AssemblerPredicate = isSICI;
+ }
+}
+
+defm V_CMP_F_F32 : VOPC_Real_si <0x0>;
+defm V_CMP_LT_F32 : VOPC_Real_si <0x1>;
+defm V_CMP_EQ_F32 : VOPC_Real_si <0x2>;
+defm V_CMP_LE_F32 : VOPC_Real_si <0x3>;
+defm V_CMP_GT_F32 : VOPC_Real_si <0x4>;
+defm V_CMP_LG_F32 : VOPC_Real_si <0x5>;
+defm V_CMP_GE_F32 : VOPC_Real_si <0x6>;
+defm V_CMP_O_F32 : VOPC_Real_si <0x7>;
+defm V_CMP_U_F32 : VOPC_Real_si <0x8>;
+defm V_CMP_NGE_F32 : VOPC_Real_si <0x9>;
+defm V_CMP_NLG_F32 : VOPC_Real_si <0xa>;
+defm V_CMP_NGT_F32 : VOPC_Real_si <0xb>;
+defm V_CMP_NLE_F32 : VOPC_Real_si <0xc>;
+defm V_CMP_NEQ_F32 : VOPC_Real_si <0xd>;
+defm V_CMP_NLT_F32 : VOPC_Real_si <0xe>;
+defm V_CMP_TRU_F32 : VOPC_Real_si <0xf>;
+
+defm V_CMPX_F_F32 : VOPC_Real_si <0x10>;
+defm V_CMPX_LT_F32 : VOPC_Real_si <0x11>;
+defm V_CMPX_EQ_F32 : VOPC_Real_si <0x12>;
+defm V_CMPX_LE_F32 : VOPC_Real_si <0x13>;
+defm V_CMPX_GT_F32 : VOPC_Real_si <0x14>;
+defm V_CMPX_LG_F32 : VOPC_Real_si <0x15>;
+defm V_CMPX_GE_F32 : VOPC_Real_si <0x16>;
+defm V_CMPX_O_F32 : VOPC_Real_si <0x17>;
+defm V_CMPX_U_F32 : VOPC_Real_si <0x18>;
+defm V_CMPX_NGE_F32 : VOPC_Real_si <0x19>;
+defm V_CMPX_NLG_F32 : VOPC_Real_si <0x1a>;
+defm V_CMPX_NGT_F32 : VOPC_Real_si <0x1b>;
+defm V_CMPX_NLE_F32 : VOPC_Real_si <0x1c>;
+defm V_CMPX_NEQ_F32 : VOPC_Real_si <0x1d>;
+defm V_CMPX_NLT_F32 : VOPC_Real_si <0x1e>;
+defm V_CMPX_TRU_F32 : VOPC_Real_si <0x1f>;
+
+defm V_CMP_F_F64 : VOPC_Real_si <0x20>;
+defm V_CMP_LT_F64 : VOPC_Real_si <0x21>;
+defm V_CMP_EQ_F64 : VOPC_Real_si <0x22>;
+defm V_CMP_LE_F64 : VOPC_Real_si <0x23>;
+defm V_CMP_GT_F64 : VOPC_Real_si <0x24>;
+defm V_CMP_LG_F64 : VOPC_Real_si <0x25>;
+defm V_CMP_GE_F64 : VOPC_Real_si <0x26>;
+defm V_CMP_O_F64 : VOPC_Real_si <0x27>;
+defm V_CMP_U_F64 : VOPC_Real_si <0x28>;
+defm V_CMP_NGE_F64 : VOPC_Real_si <0x29>;
+defm V_CMP_NLG_F64 : VOPC_Real_si <0x2a>;
+defm V_CMP_NGT_F64 : VOPC_Real_si <0x2b>;
+defm V_CMP_NLE_F64 : VOPC_Real_si <0x2c>;
+defm V_CMP_NEQ_F64 : VOPC_Real_si <0x2d>;
+defm V_CMP_NLT_F64 : VOPC_Real_si <0x2e>;
+defm V_CMP_TRU_F64 : VOPC_Real_si <0x2f>;
+
+defm V_CMPX_F_F64 : VOPC_Real_si <0x30>;
+defm V_CMPX_LT_F64 : VOPC_Real_si <0x31>;
+defm V_CMPX_EQ_F64 : VOPC_Real_si <0x32>;
+defm V_CMPX_LE_F64 : VOPC_Real_si <0x33>;
+defm V_CMPX_GT_F64 : VOPC_Real_si <0x34>;
+defm V_CMPX_LG_F64 : VOPC_Real_si <0x35>;
+defm V_CMPX_GE_F64 : VOPC_Real_si <0x36>;
+defm V_CMPX_O_F64 : VOPC_Real_si <0x37>;
+defm V_CMPX_U_F64 : VOPC_Real_si <0x38>;
+defm V_CMPX_NGE_F64 : VOPC_Real_si <0x39>;
+defm V_CMPX_NLG_F64 : VOPC_Real_si <0x3a>;
+defm V_CMPX_NGT_F64 : VOPC_Real_si <0x3b>;
+defm V_CMPX_NLE_F64 : VOPC_Real_si <0x3c>;
+defm V_CMPX_NEQ_F64 : VOPC_Real_si <0x3d>;
+defm V_CMPX_NLT_F64 : VOPC_Real_si <0x3e>;
+defm V_CMPX_TRU_F64 : VOPC_Real_si <0x3f>;
+
+defm V_CMPS_F_F32 : VOPC_Real_si <0x40>;
+defm V_CMPS_LT_F32 : VOPC_Real_si <0x41>;
+defm V_CMPS_EQ_F32 : VOPC_Real_si <0x42>;
+defm V_CMPS_LE_F32 : VOPC_Real_si <0x43>;
+defm V_CMPS_GT_F32 : VOPC_Real_si <0x44>;
+defm V_CMPS_LG_F32 : VOPC_Real_si <0x45>;
+defm V_CMPS_GE_F32 : VOPC_Real_si <0x46>;
+defm V_CMPS_O_F32 : VOPC_Real_si <0x47>;
+defm V_CMPS_U_F32 : VOPC_Real_si <0x48>;
+defm V_CMPS_NGE_F32 : VOPC_Real_si <0x49>;
+defm V_CMPS_NLG_F32 : VOPC_Real_si <0x4a>;
+defm V_CMPS_NGT_F32 : VOPC_Real_si <0x4b>;
+defm V_CMPS_NLE_F32 : VOPC_Real_si <0x4c>;
+defm V_CMPS_NEQ_F32 : VOPC_Real_si <0x4d>;
+defm V_CMPS_NLT_F32 : VOPC_Real_si <0x4e>;
+defm V_CMPS_TRU_F32 : VOPC_Real_si <0x4f>;
+
+defm V_CMPSX_F_F32 : VOPC_Real_si <0x50>;
+defm V_CMPSX_LT_F32 : VOPC_Real_si <0x51>;
+defm V_CMPSX_EQ_F32 : VOPC_Real_si <0x52>;
+defm V_CMPSX_LE_F32 : VOPC_Real_si <0x53>;
+defm V_CMPSX_GT_F32 : VOPC_Real_si <0x54>;
+defm V_CMPSX_LG_F32 : VOPC_Real_si <0x55>;
+defm V_CMPSX_GE_F32 : VOPC_Real_si <0x56>;
+defm V_CMPSX_O_F32 : VOPC_Real_si <0x57>;
+defm V_CMPSX_U_F32 : VOPC_Real_si <0x58>;
+defm V_CMPSX_NGE_F32 : VOPC_Real_si <0x59>;
+defm V_CMPSX_NLG_F32 : VOPC_Real_si <0x5a>;
+defm V_CMPSX_NGT_F32 : VOPC_Real_si <0x5b>;
+defm V_CMPSX_NLE_F32 : VOPC_Real_si <0x5c>;
+defm V_CMPSX_NEQ_F32 : VOPC_Real_si <0x5d>;
+defm V_CMPSX_NLT_F32 : VOPC_Real_si <0x5e>;
+defm V_CMPSX_TRU_F32 : VOPC_Real_si <0x5f>;
+
+defm V_CMPS_F_F64 : VOPC_Real_si <0x60>;
+defm V_CMPS_LT_F64 : VOPC_Real_si <0x61>;
+defm V_CMPS_EQ_F64 : VOPC_Real_si <0x62>;
+defm V_CMPS_LE_F64 : VOPC_Real_si <0x63>;
+defm V_CMPS_GT_F64 : VOPC_Real_si <0x64>;
+defm V_CMPS_LG_F64 : VOPC_Real_si <0x65>;
+defm V_CMPS_GE_F64 : VOPC_Real_si <0x66>;
+defm V_CMPS_O_F64 : VOPC_Real_si <0x67>;
+defm V_CMPS_U_F64 : VOPC_Real_si <0x68>;
+defm V_CMPS_NGE_F64 : VOPC_Real_si <0x69>;
+defm V_CMPS_NLG_F64 : VOPC_Real_si <0x6a>;
+defm V_CMPS_NGT_F64 : VOPC_Real_si <0x6b>;
+defm V_CMPS_NLE_F64 : VOPC_Real_si <0x6c>;
+defm V_CMPS_NEQ_F64 : VOPC_Real_si <0x6d>;
+defm V_CMPS_NLT_F64 : VOPC_Real_si <0x6e>;
+defm V_CMPS_TRU_F64 : VOPC_Real_si <0x6f>;
+
+defm V_CMPSX_F_F64 : VOPC_Real_si <0x70>;
+defm V_CMPSX_LT_F64 : VOPC_Real_si <0x71>;
+defm V_CMPSX_EQ_F64 : VOPC_Real_si <0x72>;
+defm V_CMPSX_LE_F64 : VOPC_Real_si <0x73>;
+defm V_CMPSX_GT_F64 : VOPC_Real_si <0x74>;
+defm V_CMPSX_LG_F64 : VOPC_Real_si <0x75>;
+defm V_CMPSX_GE_F64 : VOPC_Real_si <0x76>;
+defm V_CMPSX_O_F64 : VOPC_Real_si <0x77>;
+defm V_CMPSX_U_F64 : VOPC_Real_si <0x78>;
+defm V_CMPSX_NGE_F64 : VOPC_Real_si <0x79>;
+defm V_CMPSX_NLG_F64 : VOPC_Real_si <0x7a>;
+defm V_CMPSX_NGT_F64 : VOPC_Real_si <0x7b>;
+defm V_CMPSX_NLE_F64 : VOPC_Real_si <0x7c>;
+defm V_CMPSX_NEQ_F64 : VOPC_Real_si <0x7d>;
+defm V_CMPSX_NLT_F64 : VOPC_Real_si <0x7e>;
+defm V_CMPSX_TRU_F64 : VOPC_Real_si <0x7f>;
+
+defm V_CMP_F_I32 : VOPC_Real_si <0x80>;
+defm V_CMP_LT_I32 : VOPC_Real_si <0x81>;
+defm V_CMP_EQ_I32 : VOPC_Real_si <0x82>;
+defm V_CMP_LE_I32 : VOPC_Real_si <0x83>;
+defm V_CMP_GT_I32 : VOPC_Real_si <0x84>;
+defm V_CMP_NE_I32 : VOPC_Real_si <0x85>;
+defm V_CMP_GE_I32 : VOPC_Real_si <0x86>;
+defm V_CMP_T_I32 : VOPC_Real_si <0x87>;
+
+defm V_CMPX_F_I32 : VOPC_Real_si <0x90>;
+defm V_CMPX_LT_I32 : VOPC_Real_si <0x91>;
+defm V_CMPX_EQ_I32 : VOPC_Real_si <0x92>;
+defm V_CMPX_LE_I32 : VOPC_Real_si <0x93>;
+defm V_CMPX_GT_I32 : VOPC_Real_si <0x94>;
+defm V_CMPX_NE_I32 : VOPC_Real_si <0x95>;
+defm V_CMPX_GE_I32 : VOPC_Real_si <0x96>;
+defm V_CMPX_T_I32 : VOPC_Real_si <0x97>;
+
+defm V_CMP_F_I64 : VOPC_Real_si <0xa0>;
+defm V_CMP_LT_I64 : VOPC_Real_si <0xa1>;
+defm V_CMP_EQ_I64 : VOPC_Real_si <0xa2>;
+defm V_CMP_LE_I64 : VOPC_Real_si <0xa3>;
+defm V_CMP_GT_I64 : VOPC_Real_si <0xa4>;
+defm V_CMP_NE_I64 : VOPC_Real_si <0xa5>;
+defm V_CMP_GE_I64 : VOPC_Real_si <0xa6>;
+defm V_CMP_T_I64 : VOPC_Real_si <0xa7>;
+
+defm V_CMPX_F_I64 : VOPC_Real_si <0xb0>;
+defm V_CMPX_LT_I64 : VOPC_Real_si <0xb1>;
+defm V_CMPX_EQ_I64 : VOPC_Real_si <0xb2>;
+defm V_CMPX_LE_I64 : VOPC_Real_si <0xb3>;
+defm V_CMPX_GT_I64 : VOPC_Real_si <0xb4>;
+defm V_CMPX_NE_I64 : VOPC_Real_si <0xb5>;
+defm V_CMPX_GE_I64 : VOPC_Real_si <0xb6>;
+defm V_CMPX_T_I64 : VOPC_Real_si <0xb7>;
+
+defm V_CMP_F_U32 : VOPC_Real_si <0xc0>;
+defm V_CMP_LT_U32 : VOPC_Real_si <0xc1>;
+defm V_CMP_EQ_U32 : VOPC_Real_si <0xc2>;
+defm V_CMP_LE_U32 : VOPC_Real_si <0xc3>;
+defm V_CMP_GT_U32 : VOPC_Real_si <0xc4>;
+defm V_CMP_NE_U32 : VOPC_Real_si <0xc5>;
+defm V_CMP_GE_U32 : VOPC_Real_si <0xc6>;
+defm V_CMP_T_U32 : VOPC_Real_si <0xc7>;
+
+defm V_CMPX_F_U32 : VOPC_Real_si <0xd0>;
+defm V_CMPX_LT_U32 : VOPC_Real_si <0xd1>;
+defm V_CMPX_EQ_U32 : VOPC_Real_si <0xd2>;
+defm V_CMPX_LE_U32 : VOPC_Real_si <0xd3>;
+defm V_CMPX_GT_U32 : VOPC_Real_si <0xd4>;
+defm V_CMPX_NE_U32 : VOPC_Real_si <0xd5>;
+defm V_CMPX_GE_U32 : VOPC_Real_si <0xd6>;
+defm V_CMPX_T_U32 : VOPC_Real_si <0xd7>;
+
+defm V_CMP_F_U64 : VOPC_Real_si <0xe0>;
+defm V_CMP_LT_U64 : VOPC_Real_si <0xe1>;
+defm V_CMP_EQ_U64 : VOPC_Real_si <0xe2>;
+defm V_CMP_LE_U64 : VOPC_Real_si <0xe3>;
+defm V_CMP_GT_U64 : VOPC_Real_si <0xe4>;
+defm V_CMP_NE_U64 : VOPC_Real_si <0xe5>;
+defm V_CMP_GE_U64 : VOPC_Real_si <0xe6>;
+defm V_CMP_T_U64 : VOPC_Real_si <0xe7>;
+
+defm V_CMPX_F_U64 : VOPC_Real_si <0xf0>;
+defm V_CMPX_LT_U64 : VOPC_Real_si <0xf1>;
+defm V_CMPX_EQ_U64 : VOPC_Real_si <0xf2>;
+defm V_CMPX_LE_U64 : VOPC_Real_si <0xf3>;
+defm V_CMPX_GT_U64 : VOPC_Real_si <0xf4>;
+defm V_CMPX_NE_U64 : VOPC_Real_si <0xf5>;
+defm V_CMPX_GE_U64 : VOPC_Real_si <0xf6>;
+defm V_CMPX_T_U64 : VOPC_Real_si <0xf7>;
+
+defm V_CMP_CLASS_F32 : VOPC_Real_si <0x88>;
+defm V_CMPX_CLASS_F32 : VOPC_Real_si <0x98>;
+defm V_CMP_CLASS_F64 : VOPC_Real_si <0xa8>;
+defm V_CMPX_CLASS_F64 : VOPC_Real_si <0xb8>;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+// Defines the encoded VI instructions for the VOPC pseudos named
+// NAME#"_e32", NAME#"_e64" and NAME#"_sdwa".
+// op - 10-bit opcode used for the VOP3 encoding; its low 8 bits are used for
+// the 32-bit VOPC and SDWA encodings.
+multiclass VOPC_Real_vi <bits<10> op> {
+ let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ // 32-bit VOPC encoding.
+ def _e32_vi :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
+ VOPCe<op{7-0}>;
+
+ // 64-bit VOP3 encoding, using the profile recorded on the pseudo (Pfl).
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3a_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3
+ // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ }
+
+ // SDWA encoding. Defined outside the `let` above: VOP_SDWA_Real copies its
+ // assembler predicate and decoder namespace from the SDWA pseudo instead.
+ def _sdwa_vi :
+ VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ // Alias between the _e64 pseudo and the VI e32 instruction, restricted to VI.
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
+ !cast<Instruction>(NAME#"_e32_vi")> {
+ let AssemblerPredicate = isVI;
+ }
+}
+
+defm V_CMP_CLASS_F32 : VOPC_Real_vi <0x10>;
+defm V_CMPX_CLASS_F32 : VOPC_Real_vi <0x11>;
+defm V_CMP_CLASS_F64 : VOPC_Real_vi <0x12>;
+defm V_CMPX_CLASS_F64 : VOPC_Real_vi <0x13>;
+defm V_CMP_CLASS_F16 : VOPC_Real_vi <0x14>;
+defm V_CMPX_CLASS_F16 : VOPC_Real_vi <0x15>;
+
+defm V_CMP_F_F16 : VOPC_Real_vi <0x20>;
+defm V_CMP_LT_F16 : VOPC_Real_vi <0x21>;
+defm V_CMP_EQ_F16 : VOPC_Real_vi <0x22>;
+defm V_CMP_LE_F16 : VOPC_Real_vi <0x23>;
+defm V_CMP_GT_F16 : VOPC_Real_vi <0x24>;
+defm V_CMP_LG_F16 : VOPC_Real_vi <0x25>;
+defm V_CMP_GE_F16 : VOPC_Real_vi <0x26>;
+defm V_CMP_O_F16 : VOPC_Real_vi <0x27>;
+defm V_CMP_U_F16 : VOPC_Real_vi <0x28>;
+defm V_CMP_NGE_F16 : VOPC_Real_vi <0x29>;
+defm V_CMP_NLG_F16 : VOPC_Real_vi <0x2a>;
+defm V_CMP_NGT_F16 : VOPC_Real_vi <0x2b>;
+defm V_CMP_NLE_F16 : VOPC_Real_vi <0x2c>;
+defm V_CMP_NEQ_F16 : VOPC_Real_vi <0x2d>;
+defm V_CMP_NLT_F16 : VOPC_Real_vi <0x2e>;
+defm V_CMP_TRU_F16 : VOPC_Real_vi <0x2f>;
+
+defm V_CMPX_F_F16 : VOPC_Real_vi <0x30>;
+defm V_CMPX_LT_F16 : VOPC_Real_vi <0x31>;
+defm V_CMPX_EQ_F16 : VOPC_Real_vi <0x32>;
+defm V_CMPX_LE_F16 : VOPC_Real_vi <0x33>;
+defm V_CMPX_GT_F16 : VOPC_Real_vi <0x34>;
+defm V_CMPX_LG_F16 : VOPC_Real_vi <0x35>;
+defm V_CMPX_GE_F16 : VOPC_Real_vi <0x36>;
+defm V_CMPX_O_F16 : VOPC_Real_vi <0x37>;
+defm V_CMPX_U_F16 : VOPC_Real_vi <0x38>;
+defm V_CMPX_NGE_F16 : VOPC_Real_vi <0x39>;
+defm V_CMPX_NLG_F16 : VOPC_Real_vi <0x3a>;
+defm V_CMPX_NGT_F16 : VOPC_Real_vi <0x3b>;
+defm V_CMPX_NLE_F16 : VOPC_Real_vi <0x3c>;
+defm V_CMPX_NEQ_F16 : VOPC_Real_vi <0x3d>;
+defm V_CMPX_NLT_F16 : VOPC_Real_vi <0x3e>;
+defm V_CMPX_TRU_F16 : VOPC_Real_vi <0x3f>;
+
+defm V_CMP_F_F32 : VOPC_Real_vi <0x40>;
+defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>;
+defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>;
+defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>;
+defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>;
+defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>;
+defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>;
+defm V_CMP_O_F32 : VOPC_Real_vi <0x47>;
+defm V_CMP_U_F32 : VOPC_Real_vi <0x48>;
+defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>;
+defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>;
+defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>;
+defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>;
+defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>;
+defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>;
+defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>;
+
+defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>;
+defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>;
+defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>;
+defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>;
+defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>;
+defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>;
+defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>;
+defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>;
+defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>;
+defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>;
+defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>;
+defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>;
+defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>;
+defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>;
+defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>;
+defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>;
+
+defm V_CMP_F_F64 : VOPC_Real_vi <0x60>;
+defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>;
+defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>;
+defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>;
+defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>;
+defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>;
+defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>;
+defm V_CMP_O_F64 : VOPC_Real_vi <0x67>;
+defm V_CMP_U_F64 : VOPC_Real_vi <0x68>;
+defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>;
+defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>;
+defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>;
+defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>;
+defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>;
+defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>;
+defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>;
+
+defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>;
+defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>;
+defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>;
+defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>;
+defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>;
+defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>;
+defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>;
+defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>;
+defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>;
+defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>;
+defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>;
+defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>;
+defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>;
+defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>;
+defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>;
+defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>;
+
+defm V_CMP_F_I16 : VOPC_Real_vi <0xa0>;
+defm V_CMP_LT_I16 : VOPC_Real_vi <0xa1>;
+defm V_CMP_EQ_I16 : VOPC_Real_vi <0xa2>;
+defm V_CMP_LE_I16 : VOPC_Real_vi <0xa3>;
+defm V_CMP_GT_I16 : VOPC_Real_vi <0xa4>;
+defm V_CMP_NE_I16 : VOPC_Real_vi <0xa5>;
+defm V_CMP_GE_I16 : VOPC_Real_vi <0xa6>;
+defm V_CMP_T_I16 : VOPC_Real_vi <0xa7>;
+
+defm V_CMP_F_U16 : VOPC_Real_vi <0xa8>;
+defm V_CMP_LT_U16 : VOPC_Real_vi <0xa9>;
+defm V_CMP_EQ_U16 : VOPC_Real_vi <0xaa>;
+defm V_CMP_LE_U16 : VOPC_Real_vi <0xab>;
+defm V_CMP_GT_U16 : VOPC_Real_vi <0xac>;
+defm V_CMP_NE_U16 : VOPC_Real_vi <0xad>;
+defm V_CMP_GE_U16 : VOPC_Real_vi <0xae>;
+defm V_CMP_T_U16 : VOPC_Real_vi <0xaf>;
+
+defm V_CMPX_F_I16 : VOPC_Real_vi <0xb0>;
+defm V_CMPX_LT_I16 : VOPC_Real_vi <0xb1>;
+defm V_CMPX_EQ_I16 : VOPC_Real_vi <0xb2>;
+defm V_CMPX_LE_I16 : VOPC_Real_vi <0xb3>;
+defm V_CMPX_GT_I16 : VOPC_Real_vi <0xb4>;
+defm V_CMPX_NE_I16 : VOPC_Real_vi <0xb5>;
+defm V_CMPX_GE_I16 : VOPC_Real_vi <0xb6>;
+defm V_CMPX_T_I16 : VOPC_Real_vi <0xb7>;
+
+defm V_CMPX_F_U16 : VOPC_Real_vi <0xb8>;
+defm V_CMPX_LT_U16 : VOPC_Real_vi <0xb9>;
+defm V_CMPX_EQ_U16 : VOPC_Real_vi <0xba>;
+defm V_CMPX_LE_U16 : VOPC_Real_vi <0xbb>;
+defm V_CMPX_GT_U16 : VOPC_Real_vi <0xbc>;
+defm V_CMPX_NE_U16 : VOPC_Real_vi <0xbd>;
+defm V_CMPX_GE_U16 : VOPC_Real_vi <0xbe>;
+defm V_CMPX_T_U16 : VOPC_Real_vi <0xbf>;
+
+defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>;
+defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>;
+defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>;
+defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>;
+defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>;
+defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>;
+defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>;
+defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>;
+
+defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>;
+defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>;
+defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>;
+defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>;
+defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>;
+defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>;
+defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>;
+defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>;
+
+defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>;
+defm V_CMP_LT_I64 : VOPC_Real_vi <0xe1>;
+defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>;
+defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>;
+defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>;
+defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>;
+defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>;
+defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>;
+
+defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>;
+defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>;
+defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>;
+defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>;
+defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>;
+defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>;
+defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>;
+defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>;
+
+defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>;
+defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>;
+defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>;
+defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>;
+defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>;
+defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>;
+defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>;
+defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>;
+
+defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>;
+defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>;
+defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>;
+defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>;
+defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>;
+defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>;
+defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>;
+defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>;
+
+defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>;
+defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>;
+defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>;
+defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>;
+defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>;
+defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>;
+defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>;
+defm V_CMP_T_U64 : VOPC_Real_vi <0xef>;
+
+defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>;
+defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>;
+defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>;
+defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>;
+defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>;
+defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>;
+defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>;
+defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
new file mode 100644
index 000000000000..5f72f97d9e28
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -0,0 +1,350 @@
+//===-- VOPInstructions.td - Vector Instruction Definitions ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// dummies for outer let
+// Declares, without values, a set of fields that instruction records normally
+// carry. Presumably this lets records that are not instructions sit inside an
+// outer `let <field> = ... in { }` naming these fields -- confirm against the
+// users of this class.
+class LetDummies {
+ bit isCommutable;
+ bit isConvertibleToThreeAddress;
+ bit isMoveImm;
+ bit isReMaterializable;
+ bit isAsCheapAsAMove;
+ bit VOPAsmPrefer32Bit;
+ Predicate SubtargetPredicate;
+ string Constraints;
+ string DisableEncoding;
+ list<SchedReadWrite> SchedRW;
+ list<Register> Uses;
+ list<Register> Defs;
+}
+
+// Mixin that records the operation name in the OpName field.
+class VOP <string opName> {
+ string OpName = opName;
+}
+
+// Common base for VOP instructions: an InstSI flagged as VALU that neither
+// loads nor stores, has no side effects, uses the named-operand table and
+// implicitly reads EXEC.
+class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VALU = 1;
+ let Uses = [EXEC];
+}
+
+// Common base for instructions in the VOP3 encoding.
+// HasMods - when 1 (and VOP3Only is not 1), the asm match converter is
+// "cvtVOP3_2_mod".
+// VOP3Only - when 1, the asm match converter is "cvtVOP3".
+// With both 0 no converter is set.
+class VOP3Common <dag outs, dag ins, string asm = "",
+ list<dag> pattern = [], bit HasMods = 0,
+ bit VOP3Only = 0> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
+
+ // Using complex patterns gives VOP3 patterns a very high complexity rating,
+ // but standalone patterns are almost always preferred, so we need to adjust the
+ // priority lower. The goal is to use a high number to reduce complexity to
+ // zero (or less than zero).
+ let AddedComplexity = -1000;
+
+ let VOP3 = 1;
+
+ let AsmMatchConverter =
+ !if(!eq(VOP3Only,1),
+ "cvtVOP3",
+ !if(!eq(HasMods,1), "cvtVOP3_2_mod", ""));
+
+ let AsmVariantName = AMDGPUAsmVariants.VOP3;
+
+ // Not codegen-only, unlike the VOP3_Pseudo records.
+ let isCodeGenOnly = 0;
+
+ int Size = 8;
+
+ // Because SGPRs may be allowed if there are multiple operands, we
+ // need a post-isel hook to insert copies in order to avoid
+ // violating constant bus requirements.
+ let hasPostISelHook = 1;
+}
+
+// Pseudo (codegen-only, unencoded) instruction for the VOP3 form of `opName`.
+// P - operand profile; supplies the 64-bit outs/ins and asm operand
+// string, and is kept in Pfl for the real instructions to read.
+// pattern - selection pattern list (empty by default).
+// VOP3Only - when 1, the asm match converter is "cvtVOP3"; otherwise it is
+// "cvtVOP3_2_mod" if P.HasModifiers is 1, else none.
+// Registers opName#"_e64" as a mnemonic alias of opName.
+class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> :
+ InstSI <P.Outs64, P.Ins64, "", pattern>,
+ VOP <opName>,
+ SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
+ MnemonicAlias<opName#"_e64", opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ // The asm string passed to InstSI is empty; the mnemonic and operand string
+ // are kept here and concatenated by VOP3_Real.
+ string Mnemonic = opName;
+ string AsmOperands = P.Asm64;
+
+ let Size = 8;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SubtargetPredicate = isGCN;
+
+ // Because SGPRs may be allowed if there are multiple operands, we
+ // need a post-isel hook to insert copies in order to avoid
+ // violating constant bus requirements.
+ let hasPostISelHook = 1;
+
+ // Using complex patterns gives VOP3 patterns a very high complexity rating,
+ // but standalone patterns are almost always preferred, so we need to adjust the
+ // priority lower. The goal is to use a high number to reduce complexity to
+ // zero (or less than zero).
+ let AddedComplexity = -1000;
+
+ let VOP3 = 1;
+ let VALU = 1;
+ let Uses = [EXEC];
+
+ let AsmVariantName = AMDGPUAsmVariants.VOP3;
+ let AsmMatchConverter =
+ !if(!eq(VOP3Only,1),
+ "cvtVOP3",
+ !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", ""));
+
+ VOPProfile Pfl = P;
+}
+
+// Encoded counterpart of a VOP3_Pseudo for one encoding family.
+// Takes the operand lists from the pseudo, builds the asm string from the
+// pseudo's Mnemonic and AsmOperands, and is tied back to the pseudo through
+// SIMCInstr with ps.PseudoInstr.
+class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ // (Constraints and DisableEncoding are assigned once here; the earlier
+ // duplicate assignments of the same values were redundant.)
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
+// 64-bit VOP3a encoding fields shared by the SI and VI variants.
+// Each field is emitted only when the profile P says the operand or modifier
+// exists; otherwise the bits are 0. The opcode, clamp and destination bits are
+// placed by the derived classes.
+class VOP3a<VOPProfile P> : Enc64 {
+ bits<2> src0_modifiers;
+ bits<9> src0;
+ bits<2> src1_modifiers;
+ bits<9> src1;
+ bits<2> src2_modifiers;
+ bits<9> src2;
+ bits<1> clamp;
+ bits<2> omod;
+
+ // Bit 1 of each source's modifiers goes to Inst{8..10}. (VOP_DPPe in this
+ // file labels modifier bit 1 as abs and bit 0 as neg.)
+ let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
+ let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
+ let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
+
+ let Inst{31-26} = 0x34; //encoding
+ let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{60-59} = !if(P.HasOMod, omod, 0);
+ // Bit 0 of each source's modifiers goes to Inst{61..63}.
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
+}
+
+// SI placement for VOP3a: 9-bit opcode in Inst{25-17}, clamp in Inst{11}.
+class VOP3a_si <bits<9> op, VOPProfile P> : VOP3a<P> {
+ let Inst{25-17} = op;
+ let Inst{11} = !if(P.HasClamp, clamp{0}, 0);
+}
+
+// VI placement for VOP3a: 10-bit opcode in Inst{25-16}, clamp in Inst{15}.
+class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
+ let Inst{25-16} = op;
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+}
+
+// VOP3a_si plus a vector destination in Inst{7-0}, emitted only when
+// P.EmitDst is set.
+class VOP3e_si <bits<9> op, VOPProfile P> : VOP3a_si <op, P> {
+ bits<8> vdst;
+ let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+}
+
+// VOP3a_vi plus a vector destination in Inst{7-0}, emitted only when
+// P.EmitDst is set.
+class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
+ bits<8> vdst;
+ let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+}
+
+// 64-bit VOP3b encoding fields shared by the SI and VI variants.
+// Compared with VOP3a in this file: it carries both a vector destination
+// (Inst{7-0}) and a 7-bit scalar destination (Inst{14-8}), and only bit 0 of
+// each source's modifiers is encoded (Inst{61..63}); Inst{8..10} are taken by
+// sdst. The opcode bits are placed by the derived classes.
+class VOP3be <VOPProfile P> : Enc64 {
+ bits<8> vdst;
+ bits<2> src0_modifiers;
+ bits<9> src0;
+ bits<2> src1_modifiers;
+ bits<9> src1;
+ bits<2> src2_modifiers;
+ bits<9> src2;
+ bits<7> sdst;
+ bits<2> omod;
+
+ let Inst{7-0} = vdst;
+ let Inst{14-8} = sdst;
+ let Inst{31-26} = 0x34; //encoding
+ let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{60-59} = !if(P.HasOMod, omod, 0);
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
+}
+
+// SI placement for VOP3b: 9-bit opcode in Inst{25-17}. No clamp bit is
+// encoded in this variant.
+class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> {
+ let Inst{25-17} = op;
+}
+
+// VI placement for VOP3b: 10-bit opcode in Inst{25-16}, plus a clamp bit in
+// Inst{15} when the profile has clamp.
+class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> {
+ bits<1> clamp;
+ let Inst{25-16} = op;
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+}
+
+// Named constants for SDWA encoding fields. VOP_SDWAe uses SDWA.DWORD and
+// SDWA.UNUSED_PRESERVE as the values emitted when an operand or the
+// destination is absent.
+def SDWA {
+ // sdwa_sel
+ int BYTE_0 = 0;
+ int BYTE_1 = 1;
+ int BYTE_2 = 2;
+ int BYTE_3 = 3;
+ int WORD_0 = 4;
+ int WORD_1 = 5;
+ int DWORD = 6;
+
+ // dst_unused
+ int UNUSED_PAD = 0;
+ int UNUSED_SEXT = 1;
+ int UNUSED_PRESERVE = 2;
+}
+
+// Upper-dword (bits 63-32) encoding fields for SDWA instructions.
+// Fields for operands the profile P lacks are filled with defaults: selects
+// fall back to SDWA.DWORD, dst_unused to SDWA.UNUSED_PRESERVE, everything
+// else to 0.
+class VOP_SDWAe<VOPProfile P> : Enc64 {
+ bits<8> src0;
+ bits<3> src0_sel;
+ bits<2> src0_modifiers; // float: {abs,neg}, int {sext}
+ bits<3> src1_sel;
+ bits<2> src1_modifiers;
+ bits<3> dst_sel;
+ bits<2> dst_unused;
+ bits<1> clamp;
+
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ // Float modifiers take two bits (53-52); the integer modifier takes a
+ // separate single bit (51). Same layout for src1 at 61-60 / 59.
+ let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
+ let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+ let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
+}
+
+// Pseudo (codegen-only, unencoded) instruction for the SDWA form of `opName`.
+// P - operand profile; supplies the SDWA outs/ins and asm operand
+// string, and is kept in Pfl for the real instruction to read.
+// pattern - selection pattern list (empty by default).
+// Registers opName#"_sdwa" as a mnemonic alias of opName.
+class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+ InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>,
+ VOP <opName>,
+ SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>,
+ MnemonicAlias <opName#"_sdwa", opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ // The asm string passed to InstSI is empty; the mnemonic and operand string
+ // are kept here and concatenated by VOP_SDWA_Real.
+ string Mnemonic = opName;
+ string AsmOperands = P.AsmSDWA;
+
+ let Size = 8;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+
+ let VALU = 1;
+ let SDWA = 1;
+ let Uses = [EXEC];
+
+ let SubtargetPredicate = isVI;
+ // Profiles without HasExt get the DisableInst predicate and the Disable asm
+ // variant instead of the VI / SDWA ones.
+ let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+ let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
+ AMDGPUAsmVariants.Disable);
+ let DecoderNamespace = "SDWA";
+
+ VOPProfile Pfl = P;
+}
+
+// Encoded (VI) counterpart of a VOP_SDWA_Pseudo.
+// Takes the operand lists from the pseudo, builds the asm string from the
+// pseudo's Mnemonic and AsmOperands, and is tied back to the pseudo through
+// SIMCInstr with ps.PseudoInstr.
+class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ let Defs = ps.Defs;
+ let Uses = ps.Uses;
+ let SchedRW = ps.SchedRW;
+ let hasSideEffects = ps.hasSideEffects;
+
+ // Copy relevant pseudo op flags
+ // (Constraints and DisableEncoding are assigned once here; the earlier
+ // duplicate assignments of the same values were redundant.)
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AssemblerPredicate = ps.AssemblerPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let DecoderNamespace = ps.DecoderNamespace;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
+// Upper-dword (bits 63-32) encoding fields for DPP instructions.
+// src0 and the source modifier bits are emitted only when the profile P has
+// them; dpp_ctrl, bound_ctrl, bank_mask and row_mask are always emitted.
+class VOP_DPPe<VOPProfile P> : Enc64 {
+ bits<2> src0_modifiers;
+ bits<8> src0;
+ bits<2> src1_modifiers;
+ bits<9> dpp_ctrl;
+ bits<1> bound_ctrl;
+ bits<4> bank_mask;
+ bits<4> row_mask;
+
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{48-40} = dpp_ctrl;
+ let Inst{51} = bound_ctrl;
+ let Inst{52} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg
+ let Inst{53} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs
+ let Inst{54} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // src1_neg
+ let Inst{55} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // src1_abs
+ let Inst{59-56} = bank_mask;
+ let Inst{63-60} = row_mask;
+}
+
+// DPP form of a VOP instruction, combining the instruction definition with
+// the VOP_DPPe encoding fields.
+// OpName - mnemonic; the asm string is OpName followed by P.AsmDPP.
+// P - operand profile supplying the DPP outs/ins and asm operands.
+class VOP_DPP <string OpName, VOPProfile P> :
+ InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
+ VOP_DPPe<P> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+
+ let VALU = 1;
+ let DPP = 1;
+ let Size = 8;
+
+ // "cvtDPP" is used only when the profile has modifiers.
+ let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
+ let SubtargetPredicate = isVI;
+ // Profiles without HasExt get the DisableInst predicate and the Disable asm
+ // variant instead of the VI / DPP ones.
+ let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+ let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
+ AMDGPUAsmVariants.Disable);
+ let DecoderNamespace = "DPP";
+}
+
+include "VOPCInstructions.td"
+include "VOP1Instructions.td"
+include "VOP2Instructions.td"
+include "VOP3Instructions.td"
diff --git a/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
new file mode 100644
index 000000000000..89859ba063d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
@@ -0,0 +1,710 @@
+//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The Cortex-A15 processor employs a tracking scheme in its register renaming
+// in order to process each instruction's micro-ops speculatively and
+// out-of-order with appropriate forwarding. The ARM architecture allows VFP
+// instructions to read and write 32-bit S-registers. Each S-register
+// corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
+//
+// There are several instruction patterns which can be used to provide this
+// capability which can provide higher performance than other, potentially more
+// direct patterns, specifically around when one micro-op reads a D-register
+// operand that has recently been written as one or more S-register results.
+//
+// This file defines a pre-regalloc pass which looks for SPR producers which
+// are going to be used by DPR (or QPR) consumers and creates the more
+// optimized access pattern.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <map>
+#include <set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "a15-sd-optimizer"
+
+namespace {
+ // Machine function pass that rewrites SPR-to-DPR/QPR partial writes (COPY,
+ // INSERT_SUBREG, REG_SEQUENCE) into VDUP/VEXT based sequences. It only does
+ // work when runOnMachineFunction sees a Cortex-A15 subtarget with NEON.
+ struct A15SDOptimizer : public MachineFunctionPass {
+ static char ID;
+ A15SDOptimizer() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override { return "ARM A15 S->D optimizer"; }
+
+ private:
+ // Per-function state; all three are assigned at the start of
+ // runOnMachineFunction and are not initialized by the constructor.
+ const ARMBaseInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+ // Processes the D/Q registers read by MI; returns true if any operand was
+ // rewritten.
+ bool runOnInstruction(MachineInstr *MI);
+
+ //
+ // Instruction builder helpers
+ //
+ // Each helper creates a fresh virtual register, emits one instruction
+ // defining it before InsertBefore in MBB, and returns that register.
+ unsigned createDupLane(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned Reg, unsigned Lane,
+ bool QPR = false);
+
+ unsigned createExtractSubreg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned DReg,
+ unsigned Lane, const TargetRegisterClass *TRC);
+
+ unsigned createVExt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1);
+
+ unsigned createRegSequence(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned Reg1,
+ unsigned Reg2);
+
+ unsigned createInsertSubreg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned DReg,
+ unsigned Lane, unsigned ToInsert);
+
+ unsigned createImplicitDef(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL);
+
+ //
+ // Various property checkers
+ //
+ bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC);
+ bool hasPartialWrite(MachineInstr *MI);
+ SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI);
+ unsigned getDPRLaneFromSPR(unsigned SReg);
+
+ //
+ // Methods used for getting the definitions of partial registers
+ //
+
+ MachineInstr *elideCopies(MachineInstr *MI);
+ void elideCopiesAndPHIs(MachineInstr *MI,
+ SmallVectorImpl<MachineInstr*> &Outs);
+
+ //
+ // Pattern optimization methods
+ //
+ unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg);
+ unsigned optimizeSDPattern(MachineInstr *MI);
+ unsigned getPrefSPRLane(unsigned SReg);
+
+ //
+ // Sanitizing method - used to make sure we don't leave dead code around.
+ //
+ void eraseInstrWithNoUses(MachineInstr *MI);
+
+ //
+ // A map used to track the changes done by this pass.
+ //
+ // Replacements records, for each instruction already analyzed, the
+ // register returned by optimizeSDPattern. DeadInstr collects instructions
+ // to erase once the whole function has been processed. Both are cleared
+ // at the start of runOnMachineFunction.
+ std::map<MachineInstr*, unsigned> Replacements;
+ std::set<MachineInstr *> DeadInstr;
+ };
+ char A15SDOptimizer::ID = 0;
+} // end anonymous namespace
+
+// Returns true if MO is a register operand whose register matches TRC.
+// A virtual register matches when its register class satisfies
+// hasSuperClassEq(TRC); a physical register matches when TRC contains it.
+// Non-register operands never match.
+bool A15SDOptimizer::usesRegClass(MachineOperand &MO,
+                                  const TargetRegisterClass *TRC) {
+  if (!MO.isReg())
+    return false;
+
+  const unsigned OperandReg = MO.getReg();
+  const bool IsVirtual = TargetRegisterInfo::isVirtualRegister(OperandReg);
+  return IsVirtual ? MRI->getRegClass(OperandReg)->hasSuperClassEq(TRC)
+                   : TRC->contains(OperandReg);
+}
+
+// Picks the S sub-register index for SReg: ssub_1 when TRI reports a DPR
+// super-register that holds SReg as its ssub_1 part, ssub_0 otherwise.
+unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
+  const unsigned SuperD =
+      TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass);
+  if (SuperD == ARM::NoRegister)
+    return ARM::ssub_0;
+  return ARM::ssub_1;
+}
+
+// Get the subreg type that is most likely to be coalesced
+// for an SPR register that will be used in VDUP32d pseudo.
+//
+// Returns ARM::ssub_0 or ARM::ssub_1:
+//  * a physical SReg is mapped through getDPRLaneFromSPR;
+//  * a virtual SReg with no defining instruction, or whose defining
+//    instruction has no def operand for it, yields ssub_0;
+//  * otherwise, if the def is a COPY from an SPR-class operand, the lane is
+//    decided from the copy source: the def operand's sub-register index when
+//    the source is virtual, getDPRLaneFromSPR when it is physical.
+unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
+  if (!TRI->isVirtualRegister(SReg))
+    return getDPRLaneFromSPR(SReg);
+
+  MachineInstr *MI = MRI->getVRegDef(SReg);
+  if (!MI) return ARM::ssub_0;
+  MachineOperand *MO = MI->findRegisterDefOperand(SReg);
+
+  // The null check must come before the assert: the assert dereferences MO,
+  // so checking afterwards would make the guard dead code and crash on a
+  // missing def operand.
+  if (!MO) return ARM::ssub_0;
+  assert(MO->isReg() && "Non-register operand found!");
+
+  // Look through one COPY so the decision is based on the copied SPR value.
+  if (MI->isCopy() && usesRegClass(MI->getOperand(1),
+                                   &ARM::SPRRegClass)) {
+    SReg = MI->getOperand(1).getReg();
+  }
+
+  if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+    if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
+    return ARM::ssub_0;
+  }
+  return getDPRLaneFromSPR(SReg);
+}
+
+// MI is known to be dead. Figure out what instructions
+// are also made dead by this and mark them for removal.
+//
+// Nothing is erased here: instructions are only added to DeadInstr, which
+// runOnMachineFunction erases after the whole function has been visited.
+void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
+ SmallVector<MachineInstr *, 8> Front;
+ DeadInstr.insert(MI);
+
+ DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
+ Front.push_back(MI);
+
+ // NOTE(review): nothing is pushed onto Front inside this loop, so as written
+ // the body runs exactly once, for the initial MI -- confirm whether newly
+ // dead instructions were meant to be queued for further propagation.
+ while (Front.size() != 0) {
+ MI = Front.back();
+ Front.pop_back();
+
+ // MI is already known to be dead. We need to see
+ // if other instructions can also be removed.
+ for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ // Only virtual-register use operands are of interest.
+ if ((!MO.isReg()) || (!MO.isUse()))
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TRI->isVirtualRegister(Reg))
+ continue;
+ // NOTE(review): this searches MI's own operands for a def of Reg, so
+ // when it succeeds Op->getParent() below is MI itself (already in
+ // DeadInstr). It looks like the defining instruction of Reg was
+ // intended (e.g. MRI->getVRegDef(Reg)) -- confirm before relying on
+ // this routine to propagate deadness.
+ MachineOperand *Op = MI->findRegisterDefOperand(Reg);
+
+ if (!Op)
+ continue;
+
+ MachineInstr *Def = Op->getParent();
+
+ // We don't need to do anything if we have already marked
+ // this instruction as being dead.
+ if (DeadInstr.find(Def) != DeadInstr.end())
+ continue;
+
+ // Check if all the uses of this instruction are marked as
+ // dead. If so, we can also mark this instruction as being
+ // dead.
+ bool IsDead = true;
+ for (unsigned int j = 0; j < Def->getNumOperands(); ++j) {
+ MachineOperand &MODef = Def->getOperand(j);
+ if ((!MODef.isReg()) || (!MODef.isDef()))
+ continue;
+ unsigned DefReg = MODef.getReg();
+ // A def of a physical register keeps the instruction alive.
+ if (!TRI->isVirtualRegister(DefReg)) {
+ IsDead = false;
+ break;
+ }
+ // NOTE(review): the users walked here are those of Reg, not of
+ // DefReg, and the break below only leaves this inner loop, so the
+ // outer operand loop keeps running after IsDead is cleared.
+ for (MachineRegisterInfo::use_instr_iterator
+ II = MRI->use_instr_begin(Reg), EE = MRI->use_instr_end();
+ II != EE; ++II) {
+ // We don't care about self references.
+ if (&*II == Def)
+ continue;
+ if (DeadInstr.find(&*II) == DeadInstr.end()) {
+ IsDead = false;
+ break;
+ }
+ }
+ }
+
+ if (!IsDead) continue;
+
+ DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
+ DeadInstr.insert(Def);
+ }
+ }
+}
+
+// Creates the more optimized patterns and generally does all the code
+// transformations in this pass.
+//
+// MI must be a COPY, an INSERT_SUBREG, or a REG_SEQUENCE whose operand 1 is
+// of SPR class; any other instruction reaches the llvm_unreachable at the
+// end. Returns the register that should replace MI's result.
+unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
+ // COPY: rebuild the copied source value with the all-lanes pattern.
+ if (MI->isCopy()) {
+ return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg());
+ }
+
+ if (MI->isInsertSubreg()) {
+ // Operand 1 is the register being inserted into, operand 2 the inserted
+ // value.
+ unsigned DPRReg = MI->getOperand(1).getReg();
+ unsigned SPRReg = MI->getOperand(2).getReg();
+
+ if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
+ MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+ MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());
+
+ if (DPRMI && SPRMI) {
+ // See if the first operand of this insert_subreg is IMPLICIT_DEF
+ MachineInstr *ECDef = elideCopies(DPRMI);
+ if (ECDef && ECDef->isImplicitDef()) {
+ // Another corner case - if we're inserting something that is purely
+ // a subreg copy of a DPR, just use that DPR.
+
+ MachineInstr *EC = elideCopies(SPRMI);
+ // Is it a subreg copy of ssub_0?
+ if (EC && EC->isCopy() &&
+ EC->getOperand(1).getSubReg() == ARM::ssub_0) {
+ DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
+
+ // Find the thing we're subreg copying out of - is it of the same
+ // regclass as DPRMI? (i.e. a DPR or QPR).
+ // NOTE(review): the check above inspects EC, but the source
+ // register is read from SPRMI; the two differ whenever SPRMI is a
+ // full copy that elideCopies looked through -- confirm which is
+ // intended. getRegClass(FullReg) also presumes FullReg is a
+ // virtual register.
+ unsigned FullReg = SPRMI->getOperand(1).getReg();
+ const TargetRegisterClass *TRC =
+ MRI->getRegClass(MI->getOperand(1).getReg());
+ if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
+ DEBUG(dbgs() << "Subreg copy is compatible - returning ");
+ DEBUG(dbgs() << PrintReg(FullReg) << "\n");
+ eraseInstrWithNoUses(MI);
+ return FullReg;
+ }
+ }
+
+ // The rest of the destination is undefined, so only the inserted
+ // value needs to be rebuilt.
+ return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg());
+ }
+ }
+ }
+ // Otherwise rebuild the full result of the INSERT_SUBREG.
+ return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+ }
+
+ if (MI->isRegSequence() && usesRegClass(MI->getOperand(1),
+ &ARM::SPRRegClass)) {
+ // See if all bar one of the operands are IMPLICIT_DEF and insert the
+ // optimizer pattern accordingly.
+ unsigned NumImplicit = 0, NumTotal = 0;
+ unsigned NonImplicitReg = ~0U;
+
+ // Non-register explicit operands are skipped; only register operands are
+ // counted.
+ for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) {
+ if (!MI->getOperand(I).isReg())
+ continue;
+ ++NumTotal;
+ unsigned OpReg = MI->getOperand(I).getReg();
+
+ // NOTE(review): both breaks below stop counting after NumTotal has
+ // already been incremented. If that happens on the first register
+ // operand, NumImplicit == NumTotal - 1 holds while NonImplicitReg is
+ // still ~0U -- confirm this cannot occur for the inputs seen here.
+ if (!TRI->isVirtualRegister(OpReg))
+ break;
+
+ MachineInstr *Def = MRI->getVRegDef(OpReg);
+ if (!Def)
+ break;
+ if (Def->isImplicitDef())
+ ++NumImplicit;
+ else
+ NonImplicitReg = MI->getOperand(I).getReg();
+ }
+
+ if (NumImplicit == NumTotal - 1)
+ return optimizeAllLanesPattern(MI, NonImplicitReg);
+ else
+ return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+ }
+
+ llvm_unreachable("Unhandled update pattern!");
+}
+
+// Tells whether MI writes a scalar (SPR) value into a wider register.
+// Only COPY, INSERT_SUBREG and REG_SEQUENCE can do that; for each of them
+// the operand carrying the written value must be of SPR class.
+bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
+  // Index of the operand that carries the value being written.
+  unsigned ValueIdx;
+  if (MI->isCopy())
+    ValueIdx = 1;
+  else if (MI->isInsertSubreg())
+    ValueIdx = 2;
+  else if (MI->isRegSequence())
+    ValueIdx = 1;
+  else
+    return false;
+
+  return usesRegClass(MI->getOperand(ValueIdx), &ARM::SPRRegClass);
+}
+
+// Walks backwards through a chain of full copies starting at MI and returns
+// the first instruction that is not a full copy. Returns nullptr when the
+// chain reaches a copy from a non-virtual register or from a virtual
+// register with no defining instruction.
+MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
+  MachineInstr *Cur = MI;
+  while (Cur->isFullCopy()) {
+    const unsigned SrcReg = Cur->getOperand(1).getReg();
+    if (!TRI->isVirtualRegister(SrcReg))
+      return nullptr;
+
+    Cur = MRI->getVRegDef(SrcReg);
+    if (!Cur)
+      return nullptr;
+  }
+  return Cur;
+}
+
+// Collects into Outs every instruction that is neither a PHI nor a full copy
+// and can be reached from MI by following PHI inputs and full-copy sources
+// backwards. Sources that are not virtual registers, or that have no
+// defining instruction, are ignored.
+void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
+                                        SmallVectorImpl<MachineInstr*> &Outs) {
+  // PHIs can form cycles, so remember every instruction already expanded.
+  std::set<MachineInstr *> Visited;
+  SmallVector<MachineInstr *, 8> Worklist;
+  Worklist.push_back(MI);
+
+  while (!Worklist.empty()) {
+    MachineInstr *Cur = Worklist.back();
+    Worklist.pop_back();
+
+    // insert() reports whether Cur was new; skip anything seen before.
+    if (!Visited.insert(Cur).second)
+      continue;
+
+    if (Cur->isPHI()) {
+      // PHI inputs sit at the odd operand indices.
+      for (unsigned Idx = 1, End = Cur->getNumOperands(); Idx != End;
+           Idx += 2) {
+        const unsigned InReg = Cur->getOperand(Idx).getReg();
+        if (!TRI->isVirtualRegister(InReg))
+          continue;
+        if (MachineInstr *InDef = MRI->getVRegDef(InReg))
+          Worklist.push_back(InDef);
+      }
+      continue;
+    }
+
+    if (!Cur->isFullCopy()) {
+      DEBUG(dbgs() << "Found partial copy" << *Cur <<"\n");
+      Outs.push_back(Cur);
+      continue;
+    }
+
+    const unsigned SrcReg = Cur->getOperand(1).getReg();
+    if (!TRI->isVirtualRegister(SrcReg))
+      continue;
+    if (MachineInstr *SrcDef = MRI->getVRegDef(SrcReg))
+      Worklist.push_back(SrcDef);
+  }
+}
+
+// Lists the registers read by MI through use operands of DPR, QPR or DPair
+// class (DPair is treated like QPR). Copy-like instructions, INSERT_SUBREG,
+// REG_SEQUENCE and KILL produce an empty list.
+SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
+  SmallVector<unsigned, 8> ReadRegs;
+
+  const bool IsPseudoMove = MI->isCopyLike() || MI->isInsertSubreg() ||
+                            MI->isRegSequence() || MI->isKill();
+  if (IsPseudoMove)
+    return ReadRegs;
+
+  for (unsigned Idx = 0; Idx < MI->getNumOperands(); ++Idx) {
+    MachineOperand &Operand = MI->getOperand(Idx);
+    if (!Operand.isReg() || !Operand.isUse())
+      continue;
+
+    const bool IsWideReg = usesRegClass(Operand, &ARM::DPRRegClass) ||
+                           usesRegClass(Operand, &ARM::QPRRegClass) ||
+                           usesRegClass(Operand, &ARM::DPairRegClass);
+    if (IsWideReg)
+      ReadRegs.push_back(Operand.getReg());
+  }
+  return ReadRegs;
+}
+
+// Emits a VDUPLN32d (or VDUPLN32q when QPR is set) that reads Reg with the
+// immediate Lane, defining a new virtual register of DPR (or QPR) class.
+// Default predicate operands are appended. Returns the new register.
+unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator InsertBefore,
+                                       const DebugLoc &DL, unsigned Reg,
+                                       unsigned Lane, bool QPR) {
+  const TargetRegisterClass *ResultRC =
+      QPR ? &ARM::QPRRegClass : &ARM::DPRRegClass;
+  const unsigned DupOpcode = QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d;
+
+  const unsigned Result = MRI->createVirtualRegister(ResultRC);
+  MachineInstrBuilder Dup =
+      BuildMI(MBB, InsertBefore, DL, TII->get(DupOpcode), Result);
+  Dup.addReg(Reg).addImm(Lane);
+  AddDefaultPred(Dup);
+  return Result;
+}
+
+// Emits a COPY that reads sub-register index Lane of DReg into a new virtual
+// register of class TRC, and returns that register.
+unsigned A15SDOptimizer::createExtractSubreg(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    const DebugLoc &DL, unsigned DReg, unsigned Lane,
+    const TargetRegisterClass *TRC) {
+  const unsigned Extracted = MRI->createVirtualRegister(TRC);
+  MachineInstrBuilder Copy =
+      BuildMI(MBB, InsertBefore, DL, TII->get(TargetOpcode::COPY), Extracted);
+  Copy.addReg(DReg, 0, Lane);
+  return Extracted;
+}
+
+// Emits a REG_SEQUENCE that places Reg1 at dsub_0 and Reg2 at dsub_1 of a
+// new virtual register of QPR class, and returns that register.
+unsigned A15SDOptimizer::createRegSequence(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    const DebugLoc &DL, unsigned Reg1, unsigned Reg2) {
+  const unsigned Combined = MRI->createVirtualRegister(&ARM::QPRRegClass);
+  MachineInstrBuilder Seq = BuildMI(
+      MBB, InsertBefore, DL, TII->get(TargetOpcode::REG_SEQUENCE), Combined);
+  Seq.addReg(Reg1).addImm(ARM::dsub_0);
+  Seq.addReg(Reg2).addImm(ARM::dsub_1);
+  return Combined;
+}
+
+// Merges two previously VDUPed DPR values (Ssub0 and Ssub1) into one new DPR
+// virtual register by emitting a VEXTd32 with immediate 1 and default
+// predicate operands. Returns the new register.
+unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator InsertBefore,
+                                    const DebugLoc &DL, unsigned Ssub0,
+                                    unsigned Ssub1) {
+  const unsigned Merged = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  MachineInstrBuilder Ext =
+      BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Merged);
+  Ext.addReg(Ssub0).addReg(Ssub1).addImm(1);
+  AddDefaultPred(Ext);
+  return Merged;
+}
+
+// Emits an INSERT_SUBREG that inserts ToInsert into DReg at sub-register
+// index Lane, defining a new virtual register of DPR_VFP2 class. Returns the
+// new register.
+unsigned A15SDOptimizer::createInsertSubreg(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) {
+  const unsigned Inserted =
+      MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
+  MachineInstrBuilder Insert = BuildMI(
+      MBB, InsertBefore, DL, TII->get(TargetOpcode::INSERT_SUBREG), Inserted);
+  Insert.addReg(DReg);
+  Insert.addReg(ToInsert);
+  Insert.addImm(Lane);
+  return Inserted;
+}
+
+// Emits an IMPLICIT_DEF defining a new virtual register of DPR class and
+// returns that register.
+unsigned
+A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  const DebugLoc &DL) {
+  const unsigned Undefined = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  BuildMI(MBB, InsertBefore, DL, TII->get(TargetOpcode::IMPLICIT_DEF),
+          Undefined);
+  return Undefined;
+}
+
+// This function inserts instructions in order to optimize interactions between
+// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all
+// lanes, and then using VEXT instructions to recompose the result.
+//
+// New instructions are inserted immediately after MI, using MI's debug
+// location. Reg selects the pattern by its register class (QPR/DPair, DPR, or
+// SPR). Returns the virtual register holding the recomposed value.
+unsigned
+A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
+ MachineBasicBlock::iterator InsertPt(MI);
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock &MBB = *MI->getParent();
+ // Step past MI so that everything below is emitted after it.
+ InsertPt++;
+ unsigned Out;
+
+ // DPair has the same length as QPR and also has two DPRs as subreg.
+ // Treat DPair as QPR.
+ if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) ||
+ MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) {
+ // Split into the two D halves, rebuild each half from its two lanes, then
+ // glue the halves back together with a REG_SEQUENCE.
+ unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+ ARM::dsub_0, &ARM::DPRRegClass);
+ unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+ ARM::dsub_1, &ARM::DPRRegClass);
+
+ unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0);
+ unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1);
+ Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+ unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0);
+ unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1);
+ // Out2 is reused here to hold the rebuilt upper half.
+ Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4);
+
+ Out = createRegSequence(MBB, InsertPt, DL, Out, Out2);
+
+ } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) {
+ // Single D register: duplicate both lanes and merge them.
+ unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0);
+ unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1);
+ Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+ } else {
+ assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) &&
+ "Found unexpected regclass!");
+
+ // Translate the preferred sub-register index into a VDUP lane number.
+ unsigned PrefLane = getPrefSPRLane(Reg);
+ unsigned Lane;
+ switch (PrefLane) {
+ case ARM::ssub_0: Lane = 0; break;
+ case ARM::ssub_1: Lane = 1; break;
+ default: llvm_unreachable("Unknown preferred lane!");
+ }
+
+ // Treat DPair as QPR
+ bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) ||
+ usesRegClass(MI->getOperand(0), &ARM::DPairRegClass);
+
+ // Insert the S value into an undefined D register, then broadcast that
+ // lane into a D or Q result matching MI's destination.
+ Out = createImplicitDef(MBB, InsertPt, DL);
+ Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg);
+ Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR);
+ // Only this branch marks MI as dead; the QPR and DPR branches leave MI in
+ // place.
+ eraseInstrWithNoUses(MI);
+ }
+ return Out;
+}
+
+// Examines the D/Q registers read by MI, finds the partial-write producers
+// behind them, and redirects the uses of each producer's result to an
+// optimized replacement register. Returns true if any use was rewritten.
+bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
+ // We look for instructions that write S registers that are then read as
+ // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and
+ // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or
+ // merge two SPR values to form a DPR register. In order to avoid false
+ // positives we make sure that there is an SPR producer so we look past
+ // COPY and PHI nodes to find it.
+ //
+ // The best code pattern for when an SPR producer is going to be used by a
+ // DPR or QPR consumer depends on whether the other lanes of the
+ // corresponding DPR/QPR are currently defined.
+ //
+ // We can handle these efficiently, depending on the type of
+ // pseudo-instruction that is producing the pattern
+ //
+ // * COPY: * VDUP all lanes and merge the results together
+ // using VEXTs.
+ //
+ // * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR
+ // lane, and the other lane(s) of the DPR/QPR register
+ // that we are inserting in are undefined, use the
+ // original DPR/QPR value.
+ // * Otherwise, fall back on the same strategy as COPY.
+ //
+ // * REG_SEQUENCE: * If all except one of the input operands are
+ // IMPLICIT_DEFs, insert the VDUP pattern for just the
+ // defined input operand
+ // * Otherwise, fall back on the same strategy as COPY.
+ //
+
+ // First, get all the reads of D-registers done by this instruction.
+ SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
+ bool Modified = false;
+
+ for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end();
+ I != E; ++I) {
+ // Follow the def-use chain for this DPR through COPYs, and also through
+ // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
+ // we can end up with multiple defs of this DPR.
+
+ SmallVector<MachineInstr *, 8> DefSrcs;
+ // Physical registers and registers without a def cannot be traced.
+ if (!TRI->isVirtualRegister(*I))
+ continue;
+ MachineInstr *Def = MRI->getVRegDef(*I);
+ if (!Def)
+ continue;
+
+ elideCopiesAndPHIs(Def, DefSrcs);
+
+ for (SmallVectorImpl<MachineInstr *>::iterator II = DefSrcs.begin(),
+ EE = DefSrcs.end(); II != EE; ++II) {
+ // Note: this local MI shadows the function parameter for the rest of
+ // this loop body; it is the producer being examined, not the reader.
+ MachineInstr *MI = *II;
+
+ // If we've already analyzed and replaced this operand, don't do
+ // anything.
+ if (Replacements.find(MI) != Replacements.end())
+ continue;
+
+ // Now, work out if the instruction causes a SPR->DPR dependency.
+ if (!hasPartialWrite(MI))
+ continue;
+
+ // Collect all the uses of this MI's DPR def for updating later.
+ // They are snapshotted before optimizeSDPattern runs, so uses created by
+ // the newly inserted instructions are not in this list. (I and E here
+ // shadow the outer loop's iterators.)
+ SmallVector<MachineOperand*, 8> Uses;
+ unsigned DPRDefReg = MI->getOperand(0).getReg();
+ for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
+ E = MRI->use_end(); I != E; ++I)
+ Uses.push_back(&*I);
+
+ // We can optimize this.
+ unsigned NewReg = optimizeSDPattern(MI);
+
+ if (NewReg != 0) {
+ Modified = true;
+ for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
+ E = Uses.end(); I != E; ++I) {
+ // Make sure to constrain the register class of the new register to
+ // match what we're replacing. Otherwise we can optimize a DPR_VFP2
+ // reference into a plain DPR, and that will end poorly. NewReg is
+ // always virtual here, so there will always be a matching subclass
+ // to find.
+ // NOTE(review): the result of constrainRegClass is not checked.
+ MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
+
+ DEBUG(dbgs() << "Replacing operand "
+ << **I << " with "
+ << PrintReg(NewReg) << "\n");
+ (*I)->substVirtReg(NewReg, 0, *TRI);
+ }
+ }
+ // Remember the outcome (including 0) so this producer is not revisited.
+ Replacements[MI] = NewReg;
+ }
+ }
+ return Modified;
+}
+
+// Pass entry point. Does nothing (and returns false) when the function is
+// skipped or the subtarget is not a Cortex-A15 with NEON. Otherwise visits
+// every instruction of every block, then erases the instructions that were
+// marked dead along the way. Returns true if anything was rewritten.
+bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  // The pass can insert VDUP instructions, so it needs NEON in addition to
+  // the Cortex-A15 check.
+  const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
+  const bool Applicable = STI.isCortexA15() && STI.hasNEON();
+  if (!Applicable)
+    return false;
+
+  TII = STI.getInstrInfo();
+  TRI = STI.getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+
+  DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n");
+
+  DeadInstr.clear();
+  Replacements.clear();
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : Fn) {
+    MachineBasicBlock::iterator It = MBB.begin();
+    const MachineBasicBlock::iterator End = MBB.end();
+    while (It != End) {
+      // Advance first: new instructions are inserted right after the
+      // current one while it is being processed.
+      MachineInstr &Cur = *It;
+      ++It;
+      if (runOnInstruction(&Cur))
+        Changed = true;
+    }
+  }
+
+  // Deferred removal of everything marked dead during the walk.
+  for (MachineInstr *Dead : DeadInstr)
+    Dead->eraseFromParent();
+
+  return Changed;
+}
+
+// Factory for the pass: returns a newly allocated A15SDOptimizer. The caller
+// receives the raw pointer; presumably the pass manager takes ownership --
+// confirm at the call site.
+FunctionPass *llvm::createA15SDOptimizerPass() {
+ return new A15SDOptimizer();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h
new file mode 100644
index 000000000000..be3048252bbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARM.h
@@ -0,0 +1,59 @@
+//===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// ARM back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARM_H
+#define LLVM_LIB_TARGET_ARM_ARM_H
+
+#include "llvm/Support/CodeGen.h"
+#include "ARMBasicBlockInfo.h"
+#include <functional>
+
+namespace llvm {
+
+// Forward declarations, so this header does not need the full definitions.
+class ARMAsmPrinter;
+class ARMBaseTargetMachine;
+class Function;
+class FunctionPass;
+class ImmutablePass;
+class MachineInstr;
+class MCInst;
+class PassRegistry;
+class TargetLowering;
+class TargetMachine;
+
+// Factory functions for the ARM back-end passes. Each returns a raw
+// FunctionPass pointer.
+FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+FunctionPass *createA15SDOptimizerPass();
+FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
+FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMConstantIslandPass();
+FunctionPass *createMLxExpansionPass();
+FunctionPass *createThumb2ITBlockPass();
+FunctionPass *createARMOptimizeBarriersPass();
+// Ftor defaults to an empty std::function; presumably a per-function filter
+// deciding where the pass applies -- confirm in the pass implementation.
+FunctionPass *createThumb2SizeReductionPass(
+ std::function<bool(const Function &)> Ftor = nullptr);
+
+// Lowers MI into OutMI on behalf of the asm printer AP.
+void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+ ARMAsmPrinter &AP);
+
+// NOTE(review): MachineFunction, MachineBasicBlock, BasicBlockInfo and
+// std::vector are not declared in this header; they presumably come in via
+// ARMBasicBlockInfo.h -- confirm.
+void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB,
+ BasicBlockInfo &BBI);
+std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF);
+
+// Pass registration hooks.
+void initializeARMLoadStoreOptPass(PassRegistry &);
+void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td
new file mode 100644
index 000000000000..2a090faeee6a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARM.td
@@ -0,0 +1,873 @@
+//===-- ARM.td - Describe the ARM Target Machine -----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level TableGen description of the ARM target (features, archs, CPUs).
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// ARM Helper classes.
+//
+
+// A processor definition without scheduling itineraries: forwards `Name` and
+// `Features` to Processor, passing NoItineraries as the itinerary data.
+class ProcNoItin<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+// An ARM architecture version, modelled as a subtarget feature named `fname`.
+// It sets the "ARMArch" attribute to `aname`, uses "<aname> architecture" as
+// its description, and passes `features` as the list of implied features.
+class Architecture<string fname, string aname, list<SubtargetFeature> features >
+ : SubtargetFeature<fname, "ARMArch", aname,
+ !strconcat(aname, " architecture"), features>;
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget state.
+//
+
+def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true",
+ "Thumb mode">;
+
+def ModeSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+ "Use software floating point features.">;
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget features.
+//
+
+def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
+ "Enable VFP2 instructions">;
+def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
+ "Enable VFP3 instructions",
+ [FeatureVFP2]>;
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+ "Enable NEON instructions",
+ [FeatureVFP3]>;
+def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
+ "Enable Thumb2 instructions">;
+def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true",
+ "Does not support ARM mode execution",
+ [ModeThumb]>;
+def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
+ "Enable half-precision floating point">;
+def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
+ "Enable VFP4 instructions",
+ [FeatureVFP3, FeatureFP16]>;
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
+ "true", "Enable ARMv8 FP",
+ [FeatureVFP4]>;
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+ "Enable full half-precision floating point",
+ [FeatureFPARMv8]>;
+def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
+ "Restrict FP to 16 double registers">;
+def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
+ "Enable divide instructions">;
+def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm",
+ "HasHardwareDivideInARM", "true",
+ "Enable divide instructions in ARM mode">;
+def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
+ "Enable Thumb2 extract and pack instructions">;
+def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
+ "Has data barrier (dmb / dsb) instructions">;
+def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
+ "Has v7 clrex instruction">;
+def FeatureAcquireRelease : SubtargetFeature<"acquire-release",
+ "HasAcquireRelease", "true",
+ "Has v8 acquire/release (lda/ldaex etc) instructions">;
+def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
+ "FP compare + branch is slow">;
+def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
+ "Floating point unit supports single precision only">;
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+ "Enable support for Performance Monitor extensions">;
+def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
+ "Enable support for TrustZone security extensions">;
+def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
+ "Enable support for ARMv8-M Security Extensions">;
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+ "Enable support for Cryptography extensions",
+ [FeatureNEON]>;
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+ "Enable support for CRC instructions">;
+// Not to be confused with FeatureHasRetAddrStack (return address stack)
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+ "Enable Reliability, Availability and Serviceability extensions">;
+def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
+ "Enable fast computation of positive address offsets">;
+
+
+// Cyclone has preferred instructions for zeroing VFP registers, which can
+// execute in 0 cycles.
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+ "Has zero-cycle zeroing instructions">;
+
+// Whether or not it may be profitable to unpredicate certain instructions
+// during if conversion.
+def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr",
+ "IsProfitableToUnpredicate",
+ "true",
+ "Is profitable to unpredicate">;
+
+// Some targets (e.g. Swift) have microcoded VGETLNi32.
+def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32",
+ "HasSlowVGETLNi32", "true",
+ "Has slow VGETLNi32 - prefer VMOV">;
+
+// Some targets (e.g. Swift) have microcoded VDUP32.
+def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true",
+ "Has slow VDUP32 - prefer VMOV">;
+
+// Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON
+// for scalar FP, as this allows more effective execution domain optimization.
+def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR",
+ "true", "Prefer VMOVSR">;
+
+// Swift has ISHST barriers compatible with Atomic Release semantics but weaker
+// than ISH
+def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST",
+ "true", "Prefer ISHST barriers">;
+
+// Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU.
+def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true",
+ "Has muxed AGU and NEON/FPU">;
+
+// On some targets, a VLDM/VSTM starting with an odd register number needs more
+// microops than single VLDRS.
+def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister",
+ "true", "VLDM/VSTM starting with an odd register is slow">;
+
+// Some targets have a renaming dependency when loading into D subregisters.
+def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg",
+ "SlowLoadDSubregister", "true",
+ "Loading into D subregs is slow">;
+// Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD.
+def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs",
+ "DontWidenVMOVS", "true",
+ "Don't widen VMOVS to VMOVD">;
+
+// Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions.
+def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true",
+ "Expand VFP/NEON MLA/MLS instructions">;
+
+// Some targets have special RAW hazards for VFP/NEON VMLA/VMLS.
+def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards",
+ "true", "Has VMLx hazards">;
+
+// Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from
+// VFP to NEON, as an execution domain optimization.
+def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs",
+ "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">;
+
+// Some processors benefit from using NEON instructions for scalar
+// single-precision FP operations. This affects instruction selection and should
+// only be enabled if the handling of denormals is not important.
+def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
+ "true",
+ "Use NEON for single precision FP">;
+
+// On some processors, VLDn instructions that access unaligned data take one
+// extra cycle. Take that into account when computing operand latencies.
+def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign",
+ "true",
+ "Check for VLDn unaligned access">;
+
+// Some processors have a nonpipelined VFP coprocessor.
+def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
+ "NonpipelinedVFP", "true",
+ "VFP instructions are not pipelined">;
+
+// Some processors have FP multiply-accumulate instructions that don't
+// play nicely with other VFP / NEON instructions, and it's generally better
+// to just not use them.
+def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
+ "Disable VFP / NEON MAC instructions">;
+
+// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
+def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
+ "HasVMLxForwarding", "true",
+ "Has multiplier accumulator forwarding">;
+
+// Disable 32-bit to 16-bit narrowing for experimentation.
+def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
+ "Prefer 32-bit Thumb instrs">;
+
+/// Some instructions update CPSR partially, which can add false dependency for
+/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
+/// mapped to a separate physical register. Avoid partial CPSR update for these
+/// processors.
+def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
+ "AvoidCPSRPartialUpdate", "true",
+ "Avoid CPSR partial update for OOO execution">;
+
+def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
+ "AvoidMOVsShifterOperand", "true",
+ "Avoid movs instructions with shifter operand">;
+
+// Some processors perform return stack prediction. CodeGen should avoid issuing
+// "normal" call instructions to callees which do not return.
+def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
+ "Has return address stack">;
+
+/// DSP extension.
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
+ "Supports DSP instructions in ARM and/or Thumb2">;
+
+// Multiprocessing extension.
+def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
+ "Supports Multiprocessing extension">;
+
+// Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8).
+def FeatureVirtualization : SubtargetFeature<"virtualization",
+ "HasVirtualization", "true",
+ "Supports Virtualization extension",
+ [FeatureHWDiv, FeatureHWDivARM]>;
+
+// M-series ISA
+def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
+ "Is microcontroller profile ('M' series)">;
+
+// R-series ISA
+def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
+ "Is realtime profile ('R' series)">;
+
+// A-series ISA
+def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
+ "Is application profile ('A' series)">;
+
+// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
+// See ARMInstrInfo.td for details.
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
+ "NaCl trap">;
+
+def FeatureStrictAlign : SubtargetFeature<"strict-align",
+ "StrictAlign", "true",
+ "Disallow all unaligned memory "
+ "access">;
+
+def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true",
+ "Generate calls via indirect call "
+ "instructions">;
+
+def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true",
+ "Reserve R9, making it unavailable as "
+ "GPR">;
+
+def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
+ "Don't use movt/movw pairs for 32-bit "
+ "imms">;
+
+
+//===----------------------------------------------------------------------===//
+// ARM ISAs.
+//
+
+def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true",
+ "Support ARM v4T instructions">;
+def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true",
+ "Support ARM v5T instructions",
+ [HasV4TOps]>;
+def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true",
+ "Support ARM v5TE, v5TEj, and v5TExp instructions",
+ [HasV5TOps]>;
+def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true",
+ "Support ARM v6 instructions",
+ [HasV5TEOps]>;
+def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true",
+ "Support ARM v6M instructions",
+ [HasV6Ops]>;
+def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true",
+ "Support ARM v8M Baseline instructions",
+ [HasV6MOps]>;
+def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true",
+ "Support ARM v6k instructions",
+ [HasV6Ops]>;
+def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
+ "Support ARM v6t2 instructions",
+ [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>;
+def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
+ "Support ARM v7 instructions",
+ [HasV6T2Ops, FeaturePerfMon,
+ FeatureV7Clrex]>;
+def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true",
+ "Support ARM v8 instructions",
+ [HasV7Ops, FeatureAcquireRelease,
+ FeatureT2XtPk]>;
+def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
+ "Support ARM v8.1a instructions",
+ [HasV8Ops]>;
+def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+ "Support ARM v8.2a instructions",
+ [HasV8_1aOps]>;
+def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true",
+ "Support ARM v8M Mainline instructions",
+ [HasV7Ops]>;
+
+
+//===----------------------------------------------------------------------===//
+// ARM Processor subtarget features.
+//
+
+def ProcA5 : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5",
+ "Cortex-A5 ARM processors", []>;
+def ProcA7 : SubtargetFeature<"a7", "ARMProcFamily", "CortexA7",
+ "Cortex-A7 ARM processors", []>;
+def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
+ "Cortex-A8 ARM processors", []>;
+def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
+ "Cortex-A9 ARM processors", []>;
+def ProcA12 : SubtargetFeature<"a12", "ARMProcFamily", "CortexA12",
+ "Cortex-A12 ARM processors", []>;
+def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
+ "Cortex-A15 ARM processors", []>;
+def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17",
+ "Cortex-A17 ARM processors", []>;
+def ProcA32 : SubtargetFeature<"a32", "ARMProcFamily", "CortexA32",
+ "Cortex-A32 ARM processors", []>;
+def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+ "Cortex-A35 ARM processors", []>;
+def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+ "Cortex-A53 ARM processors", []>;
+def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+ "Cortex-A57 ARM processors", []>;
+def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
+ "Cortex-A72 ARM processors", []>;
+def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
+ "Cortex-A73 ARM processors", []>;
+
+def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
+ "Qualcomm ARM processors", []>;
+def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
+ "Swift ARM processors", []>;
+
+def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
+ "Samsung Exynos-M1 processors", []>;
+
+def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4",
+ "Cortex-R4 ARM processors", []>;
+def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
+ "Cortex-R5 ARM processors", []>;
+def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7",
+ "Cortex-R7 ARM processors", []>;
+def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52",
+ "Cortex-R52 ARM processors", []>;
+
+def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
+ "Cortex-M3 ARM processors", []>;
+
+//===----------------------------------------------------------------------===//
+// ARM schedules.
+//
+
+include "ARMSchedule.td"
+
+
+//===----------------------------------------------------------------------===//
+// ARM architectures
+//
+
+def ARMv2 : Architecture<"armv2", "ARMv2", []>;
+
+def ARMv2a : Architecture<"armv2a", "ARMv2a", []>;
+
+def ARMv3 : Architecture<"armv3", "ARMv3", []>;
+
+def ARMv3m : Architecture<"armv3m", "ARMv3m", []>;
+
+def ARMv4 : Architecture<"armv4", "ARMv4", []>;
+
+def ARMv4t : Architecture<"armv4t", "ARMv4t", [HasV4TOps]>;
+
+def ARMv5t : Architecture<"armv5t", "ARMv5t", [HasV5TOps]>;
+
+def ARMv5te : Architecture<"armv5te", "ARMv5te", [HasV5TEOps]>;
+
+def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>;
+
+def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>;
+
+def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops,
+ FeatureDSP,
+ FeatureT2XtPk]>;
+
+def ARMv6k : Architecture<"armv6k", "ARMv6k", [HasV6KOps]>;
+
+def ARMv6kz : Architecture<"armv6kz", "ARMv6kz", [HasV6KOps,
+ FeatureTrustZone]>;
+
+def ARMv6m : Architecture<"armv6-m", "ARMv6m", [HasV6MOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureMClass]>;
+
+def ARMv6sm : Architecture<"armv6s-m", "ARMv6sm", [HasV6MOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureMClass]>;
+
+def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops,
+ FeatureNEON,
+ FeatureDB,
+ FeatureDSP,
+ FeatureAClass,
+ FeatureT2XtPk]>;
+
+def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops,
+ FeatureDB,
+ FeatureDSP,
+ FeatureHWDiv,
+ FeatureRClass,
+ FeatureT2XtPk]>;
+
+def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops,
+ FeatureThumb2,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ FeatureMClass]>;
+
+def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops,
+ FeatureThumb2,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ FeatureMClass,
+ FeatureDSP,
+ FeatureT2XtPk]>;
+
+def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def ARMv81a : Architecture<"armv8.1-a", "ARMv81a", [HasV8_1aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureRAS]>;
+
+def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
+ FeatureRClass,
+ FeatureDB,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureDSP,
+ FeatureCRC,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureFPARMv8,
+ FeatureNEON]>;
+
+def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline",
+ [HasV8MBaselineOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ FeatureV7Clrex,
+ Feature8MSecExt,
+ FeatureAcquireRelease,
+ FeatureMClass]>;
+
+def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline",
+ [HasV8MMainlineOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ Feature8MSecExt,
+ FeatureAcquireRelease,
+ FeatureMClass]>;
+
+// Aliases
+// NOTE(review): armv6j sets ARMArch to "ARMv7a" although it only implies ARMv6;
+// every other alias reuses the ARMArch value of what it implies - confirm.
+def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>;
+def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>;
+def XScale : Architecture<"xscale", "ARMv5te", [ARMv5te]>;
+def ARMv6j : Architecture<"armv6j", "ARMv7a", [ARMv6]>;
+def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>;
+def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>;
+
+
+
+//===----------------------------------------------------------------------===//
+// ARM processors
+//
+
+// Dummy CPU, used to target architectures
+def : ProcNoItin<"generic", []>;
+
+def : ProcNoItin<"arm8", [ARMv4]>;
+def : ProcNoItin<"arm810", [ARMv4]>;
+def : ProcNoItin<"strongarm", [ARMv4]>;
+def : ProcNoItin<"strongarm110", [ARMv4]>;
+def : ProcNoItin<"strongarm1100", [ARMv4]>;
+def : ProcNoItin<"strongarm1110", [ARMv4]>;
+
+def : ProcNoItin<"arm7tdmi", [ARMv4t]>;
+def : ProcNoItin<"arm7tdmi-s", [ARMv4t]>;
+def : ProcNoItin<"arm710t", [ARMv4t]>;
+def : ProcNoItin<"arm720t", [ARMv4t]>;
+def : ProcNoItin<"arm9", [ARMv4t]>;
+def : ProcNoItin<"arm9tdmi", [ARMv4t]>;
+def : ProcNoItin<"arm920", [ARMv4t]>;
+def : ProcNoItin<"arm920t", [ARMv4t]>;
+def : ProcNoItin<"arm922t", [ARMv4t]>;
+def : ProcNoItin<"arm940t", [ARMv4t]>;
+def : ProcNoItin<"ep9312", [ARMv4t]>;
+
+def : ProcNoItin<"arm10tdmi", [ARMv5t]>;
+def : ProcNoItin<"arm1020t", [ARMv5t]>;
+
+def : ProcNoItin<"arm9e", [ARMv5te]>;
+def : ProcNoItin<"arm926ej-s", [ARMv5te]>;
+def : ProcNoItin<"arm946e-s", [ARMv5te]>;
+def : ProcNoItin<"arm966e-s", [ARMv5te]>;
+def : ProcNoItin<"arm968e-s", [ARMv5te]>;
+def : ProcNoItin<"arm10e", [ARMv5te]>;
+def : ProcNoItin<"arm1020e", [ARMv5te]>;
+def : ProcNoItin<"arm1022e", [ARMv5te]>;
+def : ProcNoItin<"xscale", [ARMv5te]>;
+def : ProcNoItin<"iwmmxt", [ARMv5te]>;
+
+def : Processor<"arm1136j-s", ARMV6Itineraries, [ARMv6]>;
+def : Processor<"arm1136jf-s", ARMV6Itineraries, [ARMv6,
+ FeatureVFP2,
+ FeatureHasSlowFPVMLx]>;
+
+def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m]>;
+def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m]>;
+def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>;
+def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>;
+
+def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>;
+def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz,
+ FeatureVFP2,
+ FeatureHasSlowFPVMLx]>;
+
+def : Processor<"mpcorenovfp", ARMV6Itineraries, [ARMv6k]>;
+def : Processor<"mpcore", ARMV6Itineraries, [ARMv6k,
+ FeatureVFP2,
+ FeatureHasSlowFPVMLx]>;
+
+def : Processor<"arm1156t2-s", ARMV6Itineraries, [ARMv6t2]>;
+def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2,
+ FeatureVFP2,
+ FeatureHasSlowFPVMLx]>;
+
+// FIXME: A5 has currently the same Schedule model as A8
+def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
+ FeatureHasRetAddrStack,
+ FeatureTrustZone,
+ FeatureSlowFPBrcc,
+ FeatureHasSlowFPVMLx,
+ FeatureVMLxForwarding,
+ FeatureMP,
+ FeatureVFP4]>;
+
+def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
+ FeatureHasRetAddrStack,
+ FeatureTrustZone,
+ FeatureSlowFPBrcc,
+ FeatureHasVMLxHazards,
+ FeatureHasSlowFPVMLx,
+ FeatureVMLxForwarding,
+ FeatureMP,
+ FeatureVFP4,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureVirtualization]>;
+
+def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
+ FeatureHasRetAddrStack,
+ FeatureNonpipelinedVFP,
+ FeatureTrustZone,
+ FeatureSlowFPBrcc,
+ FeatureHasVMLxHazards,
+ FeatureHasSlowFPVMLx,
+ FeatureVMLxForwarding]>;
+
+def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
+ FeatureHasRetAddrStack,
+ FeatureTrustZone,
+ FeatureHasVMLxHazards,
+ FeatureVMLxForwarding,
+ FeatureFP16,
+ FeatureAvoidPartialCPSR,
+ FeatureExpandMLx,
+ FeaturePreferVMOVSR,
+ FeatureMuxedUnits,
+ FeatureNEONForFPMovs,
+ FeatureCheckVLDnAlign,
+ FeatureMP]>;
+
+// FIXME: A12 has currently the same Schedule model as A9
+def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
+ FeatureHasRetAddrStack,
+ FeatureTrustZone,
+ FeatureVMLxForwarding,
+ FeatureVFP4,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureVirtualization,
+ FeatureMP]>;
+
+// FIXME: A15 has currently the same Schedule model as A9.
+def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
+ FeatureDontWidenVMOVS,
+ FeatureHasRetAddrStack,
+ FeatureMuxedUnits,
+ FeatureTrustZone,
+ FeatureVFP4,
+ FeatureMP,
+ FeatureCheckVLDnAlign,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureVirtualization]>;
+
+// FIXME: A17 has currently the same Schedule model as A9
+def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
+ FeatureHasRetAddrStack,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVMLxForwarding,
+ FeatureVFP4,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureVirtualization]>;
+
+// FIXME: krait has currently the same Schedule model as A9
+// FIXME: krait has currently the same features as A9 plus VFP4 and hardware
+// division features.
+def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
+ FeatureHasRetAddrStack,
+ FeatureMuxedUnits,
+ FeatureCheckVLDnAlign,
+ FeatureVMLxForwarding,
+ FeatureFP16,
+ FeatureAvoidPartialCPSR,
+ FeatureVFP4,
+ FeatureHWDiv,
+ FeatureHWDivARM]>;
+
+def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
+ FeatureHasRetAddrStack,
+ FeatureNEONForFP,
+ FeatureVFP4,
+ FeatureMP,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureAvoidMOVsShOp,
+ FeatureHasSlowFPVMLx,
+ FeatureHasVMLxHazards,
+ FeatureProfUnpredicate,
+ FeaturePrefISHSTBarrier,
+ FeatureSlowOddRegister,
+ FeatureSlowLoadDSubreg,
+ FeatureSlowVGETLNi32,
+ FeatureSlowVDUP32]>;
+
+// FIXME: R4 has currently the same ProcessorModel as A8.
+def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4,
+ FeatureHasRetAddrStack,
+ FeatureAvoidPartialCPSR]>;
+
+// FIXME: R4F has currently the same ProcessorModel as A8.
+def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
+ FeatureHasRetAddrStack,
+ FeatureSlowFPBrcc,
+ FeatureHasSlowFPVMLx,
+ FeatureVFP3,
+ FeatureD16,
+ FeatureAvoidPartialCPSR]>;
+
+// FIXME: R5 has currently the same ProcessorModel as A8.
+def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
+ FeatureHasRetAddrStack,
+ FeatureVFP3,
+ FeatureD16,
+ FeatureSlowFPBrcc,
+ FeatureHWDivARM,
+ FeatureHasSlowFPVMLx,
+ FeatureAvoidPartialCPSR]>;
+
+// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5.
+def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
+ FeatureHasRetAddrStack,
+ FeatureVFP3,
+ FeatureD16,
+ FeatureFP16,
+ FeatureMP,
+ FeatureSlowFPBrcc,
+ FeatureHWDivARM,
+ FeatureHasSlowFPVMLx,
+ FeatureAvoidPartialCPSR]>;
+
+def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
+ FeatureHasRetAddrStack,
+ FeatureVFP3,
+ FeatureD16,
+ FeatureFP16,
+ FeatureMP,
+ FeatureSlowFPBrcc,
+ FeatureHWDivARM,
+ FeatureHasSlowFPVMLx,
+ FeatureAvoidPartialCPSR]>;
+
+def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>;
+def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>;
+
+def : ProcNoItin<"cortex-m4", [ARMv7em,
+ FeatureVFP4,
+ FeatureVFPOnlySP,
+ FeatureD16]>;
+
+def : ProcNoItin<"cortex-m7", [ARMv7em,
+ FeatureFPARMv8,
+ FeatureD16]>;
+
+// ProcA32 tags this CPU with the CortexA32 ARMProcFamily, in the same way the
+// sibling Cortex-A entries list their own Proc* feature.
+def : ProcNoItin<"cortex-a32", [ARMv8a, ProcA32,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFPAO]>;
+
+def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFPAO]>;
+
+def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+// Cyclone is very similar to swift
+def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
+ FeatureHasRetAddrStack,
+ FeatureNEONForFP,
+ FeatureVFP4,
+ FeatureMP,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureAvoidMOVsShOp,
+ FeatureHasSlowFPVMLx,
+ FeatureCrypto,
+ FeatureZCZeroing]>;
+
+def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynosM1,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynosM1,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52,
+ FeatureFPAO]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "ARMRegisterInfo.td"
+
+include "ARMCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrInfo.td"
+
+def ARMInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def ARMAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ int PassSubtarget = 1;
+ int Variant = 0;
+ bit isMCAsmWriter = 1;
+}
+
+def ARMAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+ string Name = "ARM";
+ string BreakCharacters = ".";
+}
+
+def ARM : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = ARMInstrInfo;
+ let AssemblyWriters = [ARMAsmWriter];
+ let AssemblyParserVariants = [ARMAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
new file mode 100644
index 000000000000..f20768ab77a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -0,0 +1,2165 @@
+//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format ARM assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMAsmPrinter.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr), // AFI/MCP are per-function and get assigned in runOnMachineFunction()
+ InConstantPool(false), OptimizationGoals(-1) {} // -1 means "no optimization goal recorded yet" (see runOnMachineFunction)
+
+/// Close the data region of a constant pool that is still open when the end
+/// of the function body is reached. Does nothing if no pool is open.
+void ARMAsmPrinter::EmitFunctionBodyEnd() {
+  if (InConstantPool) {
+    // Leave constant-pool mode first, then tell the streamer that the data
+    // region is finished.
+    InConstantPool = false;
+    OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+  }
+}
+
+/// Emit the entry label of the current function, preceded by the assembler
+/// mode flag (16-bit for Thumb functions, 32-bit otherwise). Thumb functions
+/// are additionally marked with EmitThumbFunc.
+void ARMAsmPrinter::EmitFunctionEntryLabel() {
+  const bool ThumbFn = AFI->isThumbFunction();
+
+  OutStreamer->EmitAssemblerFlag(ThumbFn ? MCAF_Code16 : MCAF_Code32);
+  if (ThumbFn)
+    OutStreamer->EmitThumbFunc(CurrentFnSym);
+
+  OutStreamer->EmitLabel(CurrentFnSym);
+}
+
+/// Emit one entry of a static constructor/destructor list.
+///
+/// \p CV must resolve (after stripping pointer casts) to a GlobalValue. The
+/// entry is emitted as a reference to that global's symbol, using the
+/// VK_ARM_TARGET1 variant on ELF targets and a plain reference elsewhere.
+/// The \p DL parameter is not consulted; the printer's own data layout is.
+void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) {
+  const uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType());
+  assert(Size && "C++ constructor pointer had zero size!");
+
+  const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
+  assert(GV && "C++ constructor pointer was not a GlobalValue!");
+
+  // Pick the relocation variant for the reference.
+  MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+  if (Subtarget->isTargetELF())
+    Kind = MCSymbolRefExpr::VK_ARM_TARGET1;
+
+  MCSymbol *StructorSym = GetARMGVSymbol(GV, ARMII::MO_NO_FLAG);
+  const MCExpr *Ref = MCSymbolRefExpr::create(StructorSym, Kind, OutContext);
+
+  OutStreamer->EmitValue(Ref, Size);
+}
+
+/// Emit a global variable unless its storage was promoted into a constant
+/// pool, in which case it must not be emitted a second time.
+void ARMAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  const bool WasPromoted = PromotedGlobals.count(GV);
+  if (!WasPromoted)
+    AsmPrinter::EmitGlobalVariable(GV);
+}
+
+/// runOnMachineFunction - Print the assembly for one machine function.
+///
+/// Caches the per-function state (AFI, MCP, Subtarget), records globals whose
+/// storage was promoted into a constant pool, folds this function's
+/// optimization goal into the module-wide OptimizationGoals value, emits the
+/// COFF symbol definition on COFF targets, and finally emits the function
+/// body, its XRay table and any pending Thumb indirect-jump pads.
+///
+/// \returns false always: printing does not modify the function.
+bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  AFI = MF.getInfo<ARMFunctionInfo>();
+  MCP = MF.getConstantPool();
+  Subtarget = &MF.getSubtarget<ARMSubtarget>();
+
+  SetupMachineFunction(MF);
+  const Function *Fn = MF.getFunction();
+  const TargetMachine &Machine = MF.getTarget();
+
+  // Functions are emitted before variables, so by the time globals are
+  // printed PromotedGlobals holds the promoted globals of every function.
+  for (auto *Promoted : AFI->getGlobalsPromotedToConstantPool())
+    PromotedGlobals.insert(Promoted);
+
+  // Work out the optimization goal of this function.
+  const int FnGoal = [&]() -> int {
+    // For best debugging illusion, speed and small size sacrificed
+    if (Fn->hasFnAttribute(Attribute::OptimizeNone))
+      return 6;
+    // Aggressively for small size, speed and debug illusion sacrificed
+    if (Fn->optForMinSize())
+      return 4;
+    // For small size, but speed and debugging illusion preserved
+    if (Fn->optForSize())
+      return 3;
+    // Aggressively for speed, small size and debug illusion sacrificed
+    if (Machine.getOptLevel() == CodeGenOpt::Aggressive)
+      return 2;
+    // For speed, but small size and good debug illusion preserved
+    if (Machine.getOptLevel() > CodeGenOpt::None)
+      return 1;
+    // CodeGenOpt::None: for good debugging, but speed and small size
+    // preserved
+    return 5;
+  }();
+
+  // Merge with the goals seen so far: the first function sets the value, and
+  // any later disagreement collapses it to 0.
+  if (OptimizationGoals == -1)
+    OptimizationGoals = FnGoal;
+  else if (OptimizationGoals != FnGoal)
+    OptimizationGoals = 0;
+
+  if (Subtarget->isTargetCOFF()) {
+    const COFF::SymbolStorageClass StorageClass =
+        Fn->hasInternalLinkage() ? COFF::IMAGE_SYM_CLASS_STATIC
+                                 : COFF::IMAGE_SYM_CLASS_EXTERNAL;
+    const int SymType =
+        COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
+
+    OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
+    OutStreamer->EmitCOFFSymbolStorageClass(StorageClass);
+    OutStreamer->EmitCOFFSymbolType(SymType);
+    OutStreamer->EndCOFFSymbolDef();
+  }
+
+  // Emit the rest of the function body.
+  EmitFunctionBody();
+
+  // Emit the XRay table for this function.
+  EmitXRayTable();
+
+  // If we need V4T thumb mode Register Indirect Jump pads, emit them.
+  // These are created per function, rather than per TU, since it's
+  // relatively easy to exceed the thumb branch range within a TU.
+  if (!ThumbIndirectPads.empty()) {
+    OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+    EmitAlignment(1);
+    for (const auto &Pad : ThumbIndirectPads) {
+      OutStreamer->EmitLabel(Pad.second);
+      EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX)
+                                       .addReg(Pad.first)
+                                       // Add predicate operands.
+                                       .addImm(ARMCC::AL)
+                                       .addReg(0));
+    }
+    ThumbIndirectPads.clear();
+  }
+
+  // We didn't modify anything.
+  return false;
+}
+
+/// Print operand \p OpNum of \p MI to \p O.
+///
+/// Handles register, immediate, basic-block, global-address and
+/// constant-pool-index operands; any other operand kind is unreachable.
+/// Immediates are prefixed with '#', and the LO16/HI16 target flags add a
+/// ":lower16:" / ":upper16:" prefix.
+void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                 raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  const unsigned TargetFlags = MO.getTargetFlags();
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    // A GPR pair is printed as its first (gsub_0) sub-register.
+    if (ARM::GPRPairRegClass.contains(Reg))
+      Reg = MI->getParent()
+                ->getParent()
+                ->getSubtarget()
+                .getRegisterInfo()
+                ->getSubReg(Reg, ARM::gsub_0);
+    O << ARMInstPrinter::getRegisterName(Reg);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    // For immediates the flag word must match exactly.
+    const char *HalfPrefix = "";
+    if (TargetFlags == ARMII::MO_LO16)
+      HalfPrefix = ":lower16:";
+    else if (TargetFlags == ARMII::MO_HI16)
+      HalfPrefix = ":upper16:";
+    O << '#' << HalfPrefix << MO.getImm();
+    break;
+  }
+  case MachineOperand::MO_MachineBasicBlock:
+    MO.getMBB()->getSymbol()->print(O, MAI);
+    break;
+  case MachineOperand::MO_GlobalAddress: {
+    // For globals the LO16/HI16 bits are tested as a mask.
+    if (TargetFlags & ARMII::MO_LO16)
+      O << ":lower16:";
+    else if (TargetFlags & ARMII::MO_HI16)
+      O << ":upper16:";
+    GetARMGVSymbol(MO.getGlobal(), TargetFlags)->print(O, MAI);
+
+    printOffset(MO.getOffset(), O);
+    break;
+  }
+  case MachineOperand::MO_ConstantPoolIndex:
+    if (Subtarget->genExecuteOnly())
+      llvm_unreachable("execute-only should not generate constant pools");
+    GetCPISymbol(MO.getIndex())->print(O, MAI);
+    break;
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+}
+
+//===--------------------------------------------------------------------===//
+
+/// Return the symbol "<private-prefix>JTI<function-number>_<uid>" used as the
+/// label of jump table \p uid in the current function, creating it on first
+/// use.
+MCSymbol *ARMAsmPrinter::
+GetARMJTIPICJumpTableLabel(unsigned uid) const {
+  return OutContext.getOrCreateSymbol(
+      Twine(getDataLayout().getPrivateGlobalPrefix()) + "JTI" +
+      Twine(getFunctionNumber()) + "_" + Twine(uid));
+}
+
+/// PrintAsmOperand - Print one inline-asm operand, honouring an optional
+/// single-letter ARM operand modifier.
+///
+/// \param MI         the inline-asm machine instruction.
+/// \param OpNum      index of the operand to print.
+/// \param AsmVariant forwarded unchanged to the generic AsmPrinter handler.
+/// \param ExtraCode  modifier string; null or empty prints the plain operand.
+/// \param O          stream that receives the text.
+/// \returns true when the operand/modifier combination cannot be printed,
+///          false on success.
+bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                    unsigned AsmVariant, const char *ExtraCode,
+                                    raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+    case 'a': // Print as a memory address.
+      if (MI->getOperand(OpNum).isReg()) {
+        O << "["
+          << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg())
+          << "]";
+        return false;
+      }
+      LLVM_FALLTHROUGH;
+    case 'c': // Don't print "#" before an immediate operand.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << MI->getOperand(OpNum).getImm();
+      return false;
+    case 'P': // Print a VFP double precision register.
+    case 'q': // Print a NEON quad precision register.
+      printOperand(MI, OpNum, O);
+      return false;
+    case 'y': // Print a VFP single precision register as indexed double.
+      if (MI->getOperand(OpNum).isReg()) {
+        unsigned Reg = MI->getOperand(OpNum).getReg();
+        const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+        // Find the 'd' register that has this 's' register as a sub-register,
+        // and determine the lane number.
+        for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) {
+          if (!ARM::DPRRegClass.contains(*SR))
+            continue;
+          bool Lane0 = TRI->getSubReg(*SR, ARM::ssub_0) == Reg;
+          O << ARMInstPrinter::getRegisterName(*SR) << (Lane0 ? "[0]" : "[1]");
+          return false;
+        }
+      }
+      return true;
+    case 'B': // Bitwise inverse of integer or symbol without a preceding #.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << ~(MI->getOperand(OpNum).getImm());
+      return false;
+    case 'L': // The low 16 bits of an immediate constant.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << (MI->getOperand(OpNum).getImm() & 0xffff);
+      return false;
+    case 'M': { // A register range suitable for LDM/STM.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      unsigned RegBegin = MO.getReg();
+      // This takes advantage of the 2 operand-ness of ldm/stm and that we've
+      // already got the operands in registers that are operands to the
+      // inline asm statement.
+      O << "{";
+      if (ARM::GPRPairRegClass.contains(RegBegin)) {
+        const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+        unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
+        O << ARMInstPrinter::getRegisterName(Reg0) << ", ";
+        RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1);
+      }
+      O << ARMInstPrinter::getRegisterName(RegBegin);
+
+      // FIXME: The register allocator not only may not have given us the
+      // registers in sequence, but may not be in ascending registers. This
+      // will require changes in the register allocator that'll need to be
+      // propagated down here if the operands change.
+      //
+      // Stop at the end of the operand list as well as at the first
+      // non-register operand, so a register run that reaches the last
+      // operand never indexes past it.
+      const unsigned NumOps = MI->getNumOperands();
+      for (unsigned RegOps = OpNum + 1;
+           RegOps < NumOps && MI->getOperand(RegOps).isReg(); ++RegOps) {
+        O << ", "
+          << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg());
+      }
+
+      O << "}";
+
+      return false;
+    }
+    case 'R':   // The most significant register of a pair.
+    case 'Q': { // The least significant register of a pair.
+      if (OpNum == 0)
+        return true;
+      const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+      if (!FlagsOP.isImm())
+        return true;
+      unsigned Flags = FlagsOP.getImm();
+
+      // This operand may not be the one that actually provides the register. If
+      // it's tied to a previous one then we should refer instead to that one
+      // for registers and their classes.
+      unsigned TiedIdx;
+      if (InlineAsm::isUseOperandTiedToDef(Flags, TiedIdx)) {
+        for (OpNum = InlineAsm::MIOp_FirstOperand; TiedIdx; --TiedIdx) {
+          unsigned OpFlags = MI->getOperand(OpNum).getImm();
+          OpNum += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+        }
+        Flags = MI->getOperand(OpNum).getImm();
+
+        // Later code expects OpNum to be pointing at the register rather than
+        // the flags.
+        OpNum += 1;
+      }
+
+      unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      // RC is an out-parameter that is only meaningful when
+      // hasRegClassConstraint() reports that a class constraint is present,
+      // so only compare it in that case instead of reading it
+      // unconditionally.
+      unsigned RC = 0;
+      const bool HasRegClass = InlineAsm::hasRegClassConstraint(Flags, RC);
+      if (HasRegClass && RC == ARM::GPRPairRegClassID) {
+        if (NumVals != 1)
+          return true;
+        const MachineOperand &MO = MI->getOperand(OpNum);
+        if (!MO.isReg())
+          return true;
+        const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+        unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ?
+            ARM::gsub_0 : ARM::gsub_1);
+        O << ARMInstPrinter::getRegisterName(Reg);
+        return false;
+      }
+      if (NumVals != 2)
+        return true;
+      unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1;
+      if (RegOp >= MI->getNumOperands())
+        return true;
+      const MachineOperand &MO = MI->getOperand(RegOp);
+      if (!MO.isReg())
+        return true;
+      unsigned Reg = MO.getReg();
+      O << ARMInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+
+    case 'e':   // The low doubleword register of a NEON quad register.
+    case 'f': { // The high doubleword register of a NEON quad register.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      unsigned Reg = MI->getOperand(OpNum).getReg();
+      if (!ARM::QPRRegClass.contains(Reg))
+        return true;
+      const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+      unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ?
+                                       ARM::dsub_0 : ARM::dsub_1);
+      O << ARMInstPrinter::getRegisterName(SubReg);
+      return false;
+    }
+
+    // This modifier is not yet supported.
+    case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
+      return true;
+    case 'H': { // The highest-numbered register of a pair.
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      if (!MO.isReg())
+        return true;
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+      unsigned Reg = MO.getReg();
+      // NOTE(review): a non-pair register reports success without printing
+      // anything; kept as in the original -- confirm this is intended.
+      if (!ARM::GPRPairRegClass.contains(Reg))
+        return false;
+      Reg = TRI->getSubReg(Reg, ARM::gsub_1);
+      O << ARMInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+    }
+  }
+
+  printOperand(MI, OpNum, O);
+  return false;
+}
+
+/// Print an inline-asm memory operand.
+///
+/// Without a modifier the operand must be a register and is printed as
+/// "[reg]". The only supported modifier is 'm', which prints the bare base
+/// register; every other modifier (including 'A') is rejected.
+///
+/// \returns true on failure, false on success.
+bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNum, unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  const bool HasModifier = ExtraCode && ExtraCode[0];
+  if (HasModifier) {
+    // Only single-letter modifiers exist, and 'm' is the only one handled.
+    if (ExtraCode[1] != 0 || ExtraCode[0] != 'm')
+      return true; // Unknown modifier.
+
+    // 'm': the base register of a memory operand.
+    const MachineOperand &Base = MI->getOperand(OpNum);
+    if (!Base.isReg())
+      return true;
+    O << ARMInstPrinter::getRegisterName(Base.getReg());
+    return false;
+  }
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]";
+  return false;
+}
+
+/// Return whether the ModeThumb feature bit is set in \p STI.
+static bool isThumb(const MCSubtargetInfo& STI) {
+  const auto &Features = STI.getFeatureBits();
+  return Features[ARM::ModeThumb];
+}
+
+/// Restore the assembler mode after an inline-asm block.
+///
+/// If the mode at the end of the block is unknown (\p EndInfo is null) or
+/// differs from the mode in \p StartInfo, re-emit the start mode's flag.
+void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+                                     const MCSubtargetInfo *EndInfo) const {
+  const bool StartedInThumb = isThumb(StartInfo);
+
+  // Nothing to restore when the end mode is known and unchanged.
+  if (EndInfo && isThumb(*EndInfo) == StartedInThumb)
+    return;
+
+  OutStreamer->EmitAssemblerFlag(StartedInThumb ? MCAF_Code16 : MCAF_Code32);
+}
+
+/// Emit the file prologue: select unified syntax, emit the ARM build
+/// attributes for ELF output, and, when the module carries module-level
+/// inline asm and the triple denotes a Thumb target, switch to 16-bit code.
+void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  // Use unified assembler syntax.
+  OutStreamer->EmitAssemblerFlag(MCAF_SyntaxUnified);
+
+  const Triple &TT = TM.getTargetTriple();
+
+  // Emit ARM Build Attributes
+  if (TT.isOSBinFormatELF())
+    emitAttributes();
+
+  // The top-level code16 flag only matters for module-level inline asm.
+  if (M.getModuleInlineAsm().empty())
+    return;
+
+  // Use the triple's architecture and subarchitecture to determine
+  // if we're thumb for the purposes of the top level code16 assembler
+  // flag.
+  const auto Arch = TT.getArch();
+  const auto SubArch = TT.getSubArch();
+  const bool ThumbArch = Arch == Triple::thumb || Arch == Triple::thumbeb;
+  const bool ThumbOnlySubArch = SubArch == Triple::ARMSubArch_v7m ||
+                                SubArch == Triple::ARMSubArch_v6m;
+  if (ThumbArch || ThumbOnlySubArch)
+    OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+}
+
+/// Emit one non-lazy symbol pointer: the stub label, an .indirect_symbol
+/// attribute for the target symbol, and a 4-byte slot. The slot is zero when
+/// the stub's flag bit is set (symbol external to this translation unit) and
+/// holds a reference to the symbol otherwise.
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+                         MachineModuleInfoImpl::StubValueTy &MCSym) {
+  MCSymbol *Target = MCSym.getPointer();
+
+  // L_foo$stub:
+  OutStreamer.EmitLabel(StubLabel);
+  // .indirect_symbol _foo
+  OutStreamer.EmitSymbolAttribute(Target, MCSA_IndirectSymbol);
+
+  if (MCSym.getInt()) {
+    // External to current translation unit.
+    OutStreamer.EmitIntValue(0, 4 /*size*/);
+    return;
+  }
+
+  // Internal to current translation unit.
+  //
+  // When we place the LSDA into the TEXT section, the type info
+  // pointers need to be indirect and pc-rel. We accomplish this by
+  // using NLPs; however, sometimes the types are local to the file.
+  // We need to fill in the value for the NLP in those cases.
+  const MCExpr *Value =
+      MCSymbolRefExpr::create(Target, OutStreamer.getContext());
+  OutStreamer.EmitValue(Value, 4 /*size*/);
+}
+
+
+/// Emit the file epilogue.
+///
+/// Mach-O: emits the non-lazy and thread-local pointer stubs and the
+/// subsections-via-symbols flag. COFF: emits the collected linker flags into
+/// the .drectve section. All formats: emits ABI_optimization_goals (when a
+/// goal was recorded and the target is an AEABI flavour), resets the recorded
+/// goal, and finishes the attribute section.
+void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  const Triple &TT = TM.getTargetTriple();
+
+  if (TT.isOSBinFormatMachO()) {
+    // All darwin targets use mach-o.
+    const auto &MachOLowering =
+        static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+    MachineModuleInfoMachO &MachOInfo =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    // Output non-lazy-pointers for external and common global variables.
+    MachineModuleInfoMachO::SymbolListTy GVStubs = MachOInfo.GetGVStubList();
+    if (!GVStubs.empty()) {
+      // Switch with ".non_lazy_symbol_pointer" directive.
+      OutStreamer->SwitchSection(MachOLowering.getNonLazySymbolPointerSection());
+      EmitAlignment(2);
+
+      for (auto &Entry : GVStubs)
+        emitNonLazySymbolPointer(*OutStreamer, Entry.first, Entry.second);
+
+      OutStreamer->AddBlankLine();
+    }
+
+    // Same again for the thread-local stubs, in their own section.
+    MachineModuleInfoMachO::SymbolListTy TLVStubs =
+        MachOInfo.GetThreadLocalGVStubList();
+    if (!TLVStubs.empty()) {
+      OutStreamer->SwitchSection(MachOLowering.getThreadLocalPointerSection());
+      EmitAlignment(2);
+
+      for (auto &Entry : TLVStubs)
+        emitNonLazySymbolPointer(*OutStreamer, Entry.first, Entry.second);
+
+      OutStreamer->AddBlankLine();
+    }
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points). If this doesn't occur, the
+    // linker can safely perform dead code stripping. Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+  }
+
+  if (TT.isOSBinFormatCOFF()) {
+    const auto &COFFLowering =
+        static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering());
+
+    // Gather linker flags for functions, then globals, then aliases.
+    std::string LinkerFlags;
+    raw_string_ostream FlagStream(LinkerFlags);
+
+    for (const auto &Fn : M)
+      COFFLowering.emitLinkerFlagsForGlobal(FlagStream, &Fn);
+    for (const auto &Var : M.globals())
+      COFFLowering.emitLinkerFlagsForGlobal(FlagStream, &Var);
+    for (const auto &Alias : M.aliases())
+      COFFLowering.emitLinkerFlagsForGlobal(FlagStream, &Alias);
+
+    FlagStream.flush();
+
+    // Output collected flags
+    if (!LinkerFlags.empty()) {
+      OutStreamer->SwitchSection(COFFLowering.getDrectveSection());
+      OutStreamer->EmitBytes(LinkerFlags);
+    }
+  }
+
+  // The last attribute to be emitted is ABI_optimization_goals
+  ARMTargetStreamer &ATS =
+      static_cast<ARMTargetStreamer &>(*OutStreamer->getTargetStreamer());
+
+  // Subtarget is only consulted once a goal has been recorded, i.e. after
+  // runOnMachineFunction() has run and assigned it.
+  if (OptimizationGoals > 0) {
+    if (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
+        Subtarget->isTargetMuslAEABI())
+      ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals,
+                        OptimizationGoals);
+  }
+  OptimizationGoals = -1;
+
+  ATS.finishAttributeSection();
+}
+
+/// Return whether \p Subtarget is a v8-M target: either v8-M Mainline, or
+/// v8-M Baseline without the v6T2 operations.
+static bool isV8M(const ARMSubtarget *Subtarget) {
+  if (Subtarget->hasV8MMainlineOps())
+    return true;
+  // Note that v8M Baseline is a subset of v6T2!
+  return Subtarget->hasV8MBaselineOps() && !Subtarget->hasV6T2Ops();
+}
+
+//===----------------------------------------------------------------------===//
+// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+// FIXME:
+// The following seem like one-off assembler flags, but they actually need
+// to appear in the .ARM.attributes section in ELF.
+// Instead of subclassing the MCELFStreamer, we do the work here.
+
+/// Map a CPU name and subtarget onto the Tag_CPU_arch build-attribute value.
+///
+/// "xscale" is special-cased to v5TEJ; every other CPU is classified by the
+/// first matching architecture predicate, checked from newest to oldest,
+/// with v4 as the fallback.
+static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
+                                            const ARMSubtarget *Subtarget) {
+  if (CPU == "xscale")
+    return ARMBuildAttrs::v5TEJ;
+
+  if (Subtarget->hasV8Ops())
+    return Subtarget->isRClass() ? ARMBuildAttrs::v8_R : ARMBuildAttrs::v8_A;
+  if (Subtarget->hasV8MMainlineOps())
+    return ARMBuildAttrs::v8_M_Main;
+  if (Subtarget->hasV7Ops())
+    return (Subtarget->isMClass() && Subtarget->hasDSP())
+               ? ARMBuildAttrs::v7E_M
+               : ARMBuildAttrs::v7;
+  if (Subtarget->hasV6T2Ops())
+    return ARMBuildAttrs::v6T2;
+  if (Subtarget->hasV8MBaselineOps())
+    return ARMBuildAttrs::v8_M_Base;
+  if (Subtarget->hasV6MOps())
+    return ARMBuildAttrs::v6S_M;
+  if (Subtarget->hasV6Ops())
+    return ARMBuildAttrs::v6;
+  if (Subtarget->hasV5TEOps())
+    return ARMBuildAttrs::v5TE;
+  if (Subtarget->hasV5TOps())
+    return ARMBuildAttrs::v5T;
+  if (Subtarget->hasV4TOps())
+    return ARMBuildAttrs::v4T;
+  return ARMBuildAttrs::v4;
+}
+
+// Returns true if every function in the module has the string attribute
+// Attr set to Value. A module without functions also yields true.
+static bool checkFunctionsAttributeConsistency(const Module &M, StringRef Attr,
+                                               StringRef Value) {
+  for (const Function &F : M)
+    if (F.getFnAttribute(Attr).getValueAsString() != Value)
+      return false;
+  return true;
+}
+
+void ARMAsmPrinter::emitAttributes() { // Emits the ARM build attributes through the ARMTargetStreamer; called from EmitStartOfAsmFile() for ELF output.
+ MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+ ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+ ATS.emitTextAttribute(ARMBuildAttrs::conformance, "2.09");
+
+ ATS.switchVendor("aeabi");
+
+ // Compute ARM ELF Attributes based on the default subtarget that
+ // we'd have constructed. The existing ARM behavior isn't LTO clean
+ // anyhow.
+ // FIXME: For ifunc related functions we could iterate over and look
+ // for a feature string that doesn't match the default one.
+ const Triple &TT = TM.getTargetTriple();
+ StringRef CPU = TM.getTargetCPU();
+ StringRef FS = TM.getTargetFeatureString();
+ std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU);
+ if (!FS.empty()) { // append the explicit feature string to the triple-derived one
+ if (!ArchFS.empty())
+ ArchFS = (Twine(ArchFS) + "," + FS).str();
+ else
+ ArchFS = FS;
+ }
+ const ARMBaseTargetMachine &ATM =
+ static_cast<const ARMBaseTargetMachine &>(TM);
+ const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian()); // local subtarget built from the TargetMachine's triple/CPU/features, not from any function
+
+ const std::string &CPUString = STI.getCPUString();
+
+ if (!StringRef(CPUString).startswith("generic")) { // no CPU_name attribute for CPU strings starting with "generic"
+ // FIXME: remove krait check when GNU tools support krait cpu
+ if (STI.isKrait()) {
+ ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9");
+ // We consider krait as a "cortex-a9" + hwdiv CPU
+ // Enable hwdiv through ".arch_extension idiv"
+ if (STI.hasDivide() || STI.hasDivideInARMMode())
+ ATS.emitArchExtension(ARM::AEK_HWDIV | ARM::AEK_HWDIVARM);
+ } else
+ ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
+ }
+
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch, getArchForCPU(CPUString, &STI));
+
+ // Tag_CPU_arch_profile must have the default value of 0 when "Architecture
+ // profile is not applicable (e.g. pre v7, or cross-profile code)".
+ if (STI.hasV7Ops() || isV8M(&STI)) {
+ if (STI.isAClass()) {
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::ApplicationProfile);
+ } else if (STI.isRClass()) {
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::RealTimeProfile);
+ } else if (STI.isMClass()) {
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::MicroControllerProfile);
+ }
+ }
+
+ ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use,
+ STI.hasARMOps() ? ARMBuildAttrs::Allowed
+ : ARMBuildAttrs::Not_Allowed);
+ if (isV8M(&STI)) {
+ ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumbDerived);
+ } else if (STI.isThumb1Only()) {
+ ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed);
+ } else if (STI.hasThumb2()) {
+ ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumb32);
+ }
+
+ if (STI.hasNEON()) {
+ /* NEON is not exactly a VFP architecture, but GAS emits one of
+ * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
+ if (STI.hasFPARMv8()) {
+ if (STI.hasCrypto())
+ ATS.emitFPU(ARM::FK_CRYPTO_NEON_FP_ARMV8);
+ else
+ ATS.emitFPU(ARM::FK_NEON_FP_ARMV8);
+ } else if (STI.hasVFP4())
+ ATS.emitFPU(ARM::FK_NEON_VFPV4);
+ else
+ ATS.emitFPU(STI.hasFP16() ? ARM::FK_NEON_FP16 : ARM::FK_NEON);
+ // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture
+ if (STI.hasV8Ops())
+ ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+ STI.hasV8_1aOps() ? ARMBuildAttrs::AllowNeonARMv8_1a:
+ ARMBuildAttrs::AllowNeonARMv8);
+ } else { // no NEON: pick the .fpu value from the VFP level, D16 and single-precision-only flags
+ if (STI.hasFPARMv8())
+ // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
+ // FPU, but there are two different names for it depending on the CPU.
+ ATS.emitFPU(STI.hasD16()
+ ? (STI.isFPOnlySP() ? ARM::FK_FPV5_SP_D16 : ARM::FK_FPV5_D16)
+ : ARM::FK_FP_ARMV8);
+ else if (STI.hasVFP4())
+ ATS.emitFPU(STI.hasD16()
+ ? (STI.isFPOnlySP() ? ARM::FK_FPV4_SP_D16 : ARM::FK_VFPV4_D16)
+ : ARM::FK_VFPV4);
+ else if (STI.hasVFP3())
+ ATS.emitFPU(STI.hasD16()
+ // +d16
+ ? (STI.isFPOnlySP()
+ ? (STI.hasFP16() ? ARM::FK_VFPV3XD_FP16 : ARM::FK_VFPV3XD)
+ : (STI.hasFP16() ? ARM::FK_VFPV3_D16_FP16 : ARM::FK_VFPV3_D16))
+ // -d16
+ : (STI.hasFP16() ? ARM::FK_VFPV3_FP16 : ARM::FK_VFPV3));
+ else if (STI.hasVFP2())
+ ATS.emitFPU(ARM::FK_VFPV2);
+ }
+
+ // RW data addressing.
+ if (isPositionIndependent()) {
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data,
+ ARMBuildAttrs::AddressRWPCRel);
+ } else if (STI.isRWPI()) {
+ // RWPI specific attributes.
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data,
+ ARMBuildAttrs::AddressRWSBRel);
+ }
+
+ // RO data addressing.
+ if (isPositionIndependent() || STI.isROPI()) {
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RO_data,
+ ARMBuildAttrs::AddressROPCRel);
+ }
+
+ // GOT use.
+ if (isPositionIndependent()) {
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use,
+ ARMBuildAttrs::AddressGOT);
+ } else {
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use,
+ ARMBuildAttrs::AddressDirect);
+ }
+
+ // Set FP Denormals.
+ if (checkFunctionsAttributeConsistency(*MMI->getModule(), // NOTE: the helper also returns true for a module with no functions
+ "denormal-fp-math",
+ "preserve-sign") ||
+ TM.Options.FPDenormalMode == FPDenormal::PreserveSign)
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+ ARMBuildAttrs::PreserveFPSign);
+ else if (checkFunctionsAttributeConsistency(*MMI->getModule(),
+ "denormal-fp-math",
+ "positive-zero") ||
+ TM.Options.FPDenormalMode == FPDenormal::PositiveZero)
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+ ARMBuildAttrs::PositiveZero);
+ else if (!TM.Options.UnsafeFPMath)
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+ ARMBuildAttrs::IEEEDenormals);
+ else {
+ if (!STI.hasVFP2()) {
+ // When the target doesn't have an FPU (by design or
+ // intention), the assumptions made on the software support
+ // mirror that of the equivalent hardware support *if it
+ // existed*. For v7 and better we indicate that denormals are
+ // flushed preserving sign, and for V6 we indicate that
+ // denormals are flushed to positive zero.
+ if (STI.hasV7Ops())
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+ ARMBuildAttrs::PreserveFPSign);
+ } else if (STI.hasVFP3()) {
+ // In VFPv4, VFPv4U, VFPv3, or VFPv3U, it is preserved. That is,
+ // the sign bit of the zero matches the sign bit of the input or
+ // result that is being flushed to zero.
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+ ARMBuildAttrs::PreserveFPSign);
+ }
+ // For VFPv2 implementations it is implementation defined as
+ // to whether denormals are flushed to positive zero or to
+ // whatever the sign of zero is (ARM v7AR ARM 2.7.5). Historically
+ // LLVM has chosen to flush this to positive zero (most likely for
+ // GCC compatibility), so that's the chosen value here (the
+ // absence of its emission implies zero).
+ }
+
+ // Set FP exceptions and rounding
+ if (checkFunctionsAttributeConsistency(*MMI->getModule(),
+ "no-trapping-math", "true") ||
+ TM.Options.NoTrappingFPMath)
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
+ ARMBuildAttrs::Not_Allowed);
+ else if (!TM.Options.UnsafeFPMath) {
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, ARMBuildAttrs::Allowed);
+
+ // If the user has permitted this code to choose the IEEE 754
+ // rounding at run-time, emit the rounding attribute.
+ if (TM.Options.HonorSignDependentRoundingFPMathOption)
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_rounding, ARMBuildAttrs::Allowed);
+ }
+
+ // TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath is the
+ // equivalent of GCC's -ffinite-math-only flag.
+ if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath)
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+ ARMBuildAttrs::Allowed);
+ else
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+ ARMBuildAttrs::AllowIEE754);
+
+ if (STI.allowsUnalignedMem())
+ ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
+ ARMBuildAttrs::Allowed);
+ else
+ ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
+ ARMBuildAttrs::Not_Allowed);
+
+ // FIXME: add more flags to ARMBuildAttributes.h
+ // 8-bytes alignment stuff.
+ ATS.emitAttribute(ARMBuildAttrs::ABI_align_needed, 1);
+ ATS.emitAttribute(ARMBuildAttrs::ABI_align_preserved, 1);
+
+ // ABI_HardFP_use attribute to indicate single precision FP.
+ if (STI.isFPOnlySP())
+ ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
+ ARMBuildAttrs::HardFPSinglePrecision);
+
+ // Hard float. Use both S and D registers and conform to AAPCS-VFP.
+ if (STI.isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
+ ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS);
+
+ // FIXME: Should we signal R9 usage?
+
+ if (STI.hasFP16())
+ ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
+
+ // FIXME: To support emitting this build attribute as GCC does, the
+ // -mfp16-format option and associated plumbing must be
+ // supported. For now the __fp16 type is exposed by default, so this
+ // attribute should be emitted with value 1.
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_16bit_format,
+ ARMBuildAttrs::FP16FormatIEEE);
+
+ if (STI.hasMPExtension())
+ ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
+
+ // Hardware divide in ARM mode is part of base arch, starting from ARMv8.
+ // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M).
+ // It is not possible to produce DisallowDIV: if hwdiv is present in the base
+ // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits.
+ // AllowDIVExt is only emitted if hwdiv isn't available in the base arch;
+ // otherwise, the default value (AllowDIVIfExists) applies.
+ if (STI.hasDivideInARMMode() && !STI.hasV8Ops())
+ ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
+
+ if (STI.hasDSP() && isV8M(&STI))
+ ATS.emitAttribute(ARMBuildAttrs::DSP_extension, ARMBuildAttrs::Allowed);
+
+ if (MMI) { // NOTE(review): MMI is already dereferenced unconditionally above (MMI->getModule()), so this null check looks redundant -- confirm
+ if (const Module *SourceModule = MMI->getModule()) {
+ // ABI_PCS_wchar_t to indicate wchar_t width
+ // FIXME: There is no way to emit value 0 (wchar_t prohibited).
+ if (auto WCharWidthValue = mdconst::extract_or_null<ConstantInt>(
+ SourceModule->getModuleFlag("wchar_size"))) {
+ int WCharWidth = WCharWidthValue->getZExtValue();
+ assert((WCharWidth == 2 || WCharWidth == 4) &&
+ "wchar_t width must be 2 or 4 bytes");
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth);
+ }
+
+ // ABI_enum_size to indicate enum width
+ // FIXME: There is no way to emit value 0 (enums prohibited) or value 3
+ // (all enums contain a value needing 32 bits to encode).
+ if (auto EnumWidthValue = mdconst::extract_or_null<ConstantInt>(
+ SourceModule->getModuleFlag("min_enum_size"))) {
+ int EnumWidth = EnumWidthValue->getZExtValue();
+ assert((EnumWidth == 1 || EnumWidth == 4) &&
+ "Minimum enum width must be 1 or 4 bytes");
+ int EnumBuildAttr = EnumWidth == 1 ? 1 : 2; // 1-byte minimum -> attribute value 1; 4-byte minimum -> attribute value 2
+ ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr);
+ }
+ }
+ }
+
+ // We currently do not support using R9 as the TLS pointer.
+ if (STI.isRWPI())
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
+ ARMBuildAttrs::R9IsSB);
+ else if (STI.isR9Reserved())
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
+ ARMBuildAttrs::R9Reserved);
+ else
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
+ ARMBuildAttrs::R9IsGPR);
+
+ if (STI.hasTrustZone() && STI.hasVirtualization())
+ ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+ ARMBuildAttrs::AllowTZVirtualization);
+ else if (STI.hasTrustZone())
+ ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+ ARMBuildAttrs::AllowTZ);
+ else if (STI.hasVirtualization())
+ ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+ ARMBuildAttrs::AllowVirtualization);
+}
+
+//===----------------------------------------------------------------------===//
+
+ /// Return the symbol named "<Prefix>PC<FunctionNumber>_<LabelId>", creating
+ /// it in \p Ctx if it does not exist yet.
+ static MCSymbol *getPICLabel(StringRef Prefix, unsigned FunctionNumber,
+                              unsigned LabelId, MCContext &Ctx) {
+   return Ctx.getOrCreateSymbol(Twine(Prefix) + "PC" + Twine(FunctionNumber) +
+                                "_" + Twine(LabelId));
+ }
+
+ /// Translate an ARM constant-pool modifier into the MCSymbolRefExpr variant
+ /// kind used when referencing the symbol. Every enumerator is handled
+ /// explicitly; reaching the end of the function is a programming error.
+ static MCSymbolRefExpr::VariantKind
+ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
+   switch (Modifier) {
+   case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None;
+   case ARMCP::TLSGD:       return MCSymbolRefExpr::VK_TLSGD;
+   case ARMCP::TPOFF:       return MCSymbolRefExpr::VK_TPOFF;
+   case ARMCP::GOTTPOFF:    return MCSymbolRefExpr::VK_GOTTPOFF;
+   case ARMCP::SBREL:       return MCSymbolRefExpr::VK_ARM_SBREL;
+   case ARMCP::GOT_PREL:    return MCSymbolRefExpr::VK_ARM_GOT_PREL;
+   case ARMCP::SECREL:      return MCSymbolRefExpr::VK_SECREL;
+   }
+   llvm_unreachable("Invalid ARMCPModifier!");
+ }
+
+ /// Pick the MCSymbol used to reference \p GV for the current object format.
+ ///
+ /// MachO: the plain symbol, or the "$non_lazy_ptr" stub symbol when
+ ///        MO_NONLAZY is set and the subtarget reports the global as indirect
+ ///        (the stub entry is registered on first use).
+ /// COFF:  the plain symbol, or "__imp_<name>" when MO_DLLIMPORT is set.
+ /// ELF:   always the plain symbol.
+ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
+                                         unsigned char TargetFlags) {
+   if (Subtarget->isTargetMachO()) {
+     const bool UseNonLazyPtr =
+         (TargetFlags & ARMII::MO_NONLAZY) && Subtarget->isGVIndirectSymbol(GV);
+     if (!UseNonLazyPtr)
+       return getSymbol(GV);
+
+     // FIXME: Remove this when Darwin transition to @GOT like syntax.
+     MCSymbol *StubName = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+     MachineModuleInfoMachO &MachOInfo =
+         MMI->getObjFileInfo<MachineModuleInfoMachO>();
+     MachineModuleInfoImpl::StubValueTy &Stub =
+         GV->isThreadLocal() ? MachOInfo.getThreadLocalGVStubEntry(StubName)
+                             : MachOInfo.getGVStubEntry(StubName);
+
+     // Only fill in the stub entry the first time it is requested.
+     if (!Stub.getPointer())
+       Stub = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+                                                 !GV->hasInternalLinkage());
+     return StubName;
+   }
+
+   if (Subtarget->isTargetCOFF()) {
+     assert(Subtarget->isTargetWindows() &&
+            "Windows is the only supported COFF target");
+
+     if (!(TargetFlags & ARMII::MO_DLLIMPORT))
+       return getSymbol(GV);
+
+     // dllimport'ed globals are referenced through their import-table slot.
+     SmallString<128> ImportName;
+     ImportName = "__imp_";
+     getNameWithPrefix(ImportName, GV);
+     return OutContext.getOrCreateSymbol(ImportName);
+   }
+
+   if (Subtarget->isTargetELF())
+     return getSymbol(GV);
+
+   llvm_unreachable("unexpected target");
+ }
+
+ /// Emit one ARM-specific constant pool value.
+ ///
+ /// A promoted global is emitted as its initializer, preceded (the first time
+ /// it is seen only) by the global's own label. Any other entry is emitted as
+ /// a reference to the symbol it names, with the entry's modifier applied and,
+ /// when a PC adjustment is requested, made relative to the entry's PIC label.
+ void ARMAsmPrinter::
+ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+   const DataLayout &DL = getDataLayout();
+   int Size = DL.getTypeAllocSize(MCPV->getType());
+
+   ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
+
+   if (ACPV->isPromotedGlobal()) {
+     // This constant pool entry is actually a global whose storage has been
+     // promoted into the constant pool. This global may be referenced still
+     // by debug information, and due to the way AsmPrinter is set up, the debug
+     // info is immutable by the time we decide to promote globals to constant
+     // pools. Because of this, we need to ensure we emit a symbol for the global
+     // with private linkage (the default) so debug info can refer to it.
+     //
+     // However, if this global is promoted into several functions we must ensure
+     // we don't try and emit duplicate symbols!
+     auto *ACPC = cast<ARMConstantPoolConstant>(ACPV);
+     auto *GV = ACPC->getPromotedGlobal();
+     // insert() reports whether GV was newly added, so a single set lookup
+     // both records the global and tells us if its label is still missing.
+     if (EmittedPromotedGlobalLabels.insert(GV).second)
+       OutStreamer->EmitLabel(getSymbol(GV));
+     EmitGlobalConstant(DL, ACPC->getPromotedGlobalInit());
+     return;
+   }
+
+   // Work out which symbol this entry refers to.
+   MCSymbol *MCSym;
+   if (ACPV->isLSDA()) {
+     MCSym = getCurExceptionSym();
+   } else if (ACPV->isBlockAddress()) {
+     const BlockAddress *BA =
+         cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress();
+     MCSym = GetBlockAddressSymbol(BA);
+   } else if (ACPV->isGlobalValue()) {
+     const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
+
+     // On Darwin, const-pool entries may get the "FOO$non_lazy_ptr" mangling, so
+     // flag the global as MO_NONLAZY.
+     unsigned char TF = Subtarget->isTargetMachO() ? ARMII::MO_NONLAZY : 0;
+     MCSym = GetARMGVSymbol(GV, TF);
+   } else if (ACPV->isMachineBasicBlock()) {
+     const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB();
+     MCSym = MBB->getSymbol();
+   } else {
+     assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
+     auto Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+     MCSym = GetExternalSymbolSymbol(Sym);
+   }
+
+   // Create an MCSymbol for the reference.
+   const MCExpr *Expr =
+       MCSymbolRefExpr::create(MCSym, getModifierVariantKind(ACPV->getModifier()),
+                               OutContext);
+
+   if (ACPV->getPCAdjustment()) {
+     // Build "<sym> - (<PIC label> + <adjustment> [- .])".
+     MCSymbol *PCLabel =
+         getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+                     ACPV->getLabelId(), OutContext);
+     const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext);
+     PCRelExpr =
+         MCBinaryExpr::createAdd(PCRelExpr,
+                                 MCConstantExpr::create(ACPV->getPCAdjustment(),
+                                                        OutContext),
+                                 OutContext);
+     if (ACPV->mustAddCurrentAddress()) {
+       // We want "(<expr> - .)", but MC doesn't have a concept of the '.'
+       // label, so just emit a local label and reference that instead.
+       MCSymbol *DotSym = OutContext.createTempSymbol();
+       OutStreamer->EmitLabel(DotSym);
+       const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+       PCRelExpr = MCBinaryExpr::createSub(PCRelExpr, DotExpr, OutContext);
+     }
+     Expr = MCBinaryExpr::createSub(Expr, PCRelExpr, OutContext);
+   }
+   OutStreamer->EmitValue(Expr, Size);
+ }
+
+ /// Emit the jump table referenced by operand 1 of \p MI as a sequence of
+ /// 4-byte address entries, bracketed by data-in-code region markers.
+ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
+   const unsigned JTI = MI->getOperand(1).getIndex();
+
+   // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
+   // ARM mode tables.
+   EmitAlignment(2);
+
+   // Emit a label for the jump table.
+   MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
+   OutStreamer->EmitLabel(JTISymbol);
+
+   // Mark the jump table as data-in-code.
+   OutStreamer->EmitDataRegion(MCDR_DataRegionJT32);
+
+   const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+   const std::vector<MachineBasicBlock*> &JTBBs =
+       MJTI->getJumpTables()[JTI].MBBs;
+
+   // Entries are table-relative when the code must be position independent.
+   const bool TableRelative = isPositionIndependent() || Subtarget->isROPI();
+
+   // Emit each entry of the table.
+   for (MachineBasicBlock *MBB : JTBBs) {
+     // Construct an MCExpr for the entry. We want a value of the form:
+     // (BasicBlockAddr - TableBeginAddr)
+     //
+     // For example, a table with entries jumping to basic blocks BB0 and BB1
+     // would look like:
+     // LJTI_0_0:
+     //    .word (LBB0 - LJTI_0_0)
+     //    .word (LBB1 - LJTI_0_0)
+     const MCExpr *Entry = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+
+     if (TableRelative) {
+       const MCExpr *TableBase = MCSymbolRefExpr::create(JTISymbol, OutContext);
+       Entry = MCBinaryExpr::createSub(Entry, TableBase, OutContext);
+     } else if (AFI->isThumbFunction()) {
+       // If we're generating a table of Thumb addresses in static relocation
+       // model, we need to add one to keep interworking correctly.
+       Entry = MCBinaryExpr::createAdd(Entry,
+                                       MCConstantExpr::create(1, OutContext),
+                                       OutContext);
+     }
+     OutStreamer->EmitValue(Entry, 4);
+   }
+
+   // Mark the end of jump table data-in-code region.
+   OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ }
+
+ /// Emit the jump table referenced by operand 1 of \p MI as a sequence of
+ /// unconditional t2B branch instructions, one per target block.
+ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
+   const unsigned JTI = MI->getOperand(1).getIndex();
+
+   // Label the start of the table.
+   OutStreamer->EmitLabel(GetARMJTIPICJumpTableLabel(JTI));
+
+   const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+   const std::vector<MachineBasicBlock*> &JTBBs =
+       MJTI->getJumpTables()[JTI].MBBs;
+
+   // Emit each entry of the table.
+   for (MachineBasicBlock *MBB : JTBBs) {
+     const MCExpr *Target = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+     // If this isn't a TBB or TBH, the entries are direct branch instructions.
+     EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2B)
+       .addExpr(Target)
+       .addImm(ARMCC::AL)
+       .addReg(0));
+   }
+ }
+
+ /// Emit the TBB/TBH jump table referenced by operand 1 of \p MI.
+ ///
+ /// \param MI          Jump table pseudo; operand 0 is the id of the label
+ ///                    placed at the dispatch instruction, operand 1 the jump
+ ///                    table index.
+ /// \param OffsetWidth Size in bytes of each entry: 1 (TBB) or 2 (TBH).
+ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
+                                         unsigned OffsetWidth) {
+   assert((OffsetWidth == 1 || OffsetWidth == 2) && "invalid tbb/tbh width");
+   const MachineOperand &MO1 = MI->getOperand(1);
+   unsigned JTI = MO1.getIndex();
+
+   if (Subtarget->isThumb1Only())
+     EmitAlignment(2);
+
+   MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
+   OutStreamer->EmitLabel(JTISymbol);
+
+   const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+   const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+   const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+   // Mark the jump table as data-in-code.
+   OutStreamer->EmitDataRegion(OffsetWidth == 1 ? MCDR_DataRegionJT8
+                                                : MCDR_DataRegionJT16);
+
+   // Every entry is measured from the same point, "TBInstPC + 4", where
+   // TBInstPC is the label defined just before the dispatch instruction. That
+   // expression does not depend on the entry, so build it once up front.
+   MCSymbol *TBInstPC = GetCPISymbol(MI->getOperand(0).getImm());
+   const MCExpr *DispatchPC = MCBinaryExpr::createAdd(
+       MCSymbolRefExpr::create(TBInstPC, OutContext),
+       MCConstantExpr::create(4, OutContext), OutContext);
+
+   // Emit each entry of the table.
+   for (const MachineBasicBlock *MBB : JTBBs) {
+     const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::create(MBB->getSymbol(),
+                                                           OutContext);
+     // Each entry is an offset from the dispatch instruction. We want a value
+     // of the form:
+     //     (BasicBlockAddr - (TBBInstAddr + 4)) / 2
+     //
+     // For example, a TBB table with entries jumping to basic blocks BB0 and BB1
+     // would look like:
+     // LJTI_0_0:
+     //    .byte (LBB0 - (LCPI0_0 + 4)) / 2
+     //    .byte (LBB1 - (LCPI0_0 + 4)) / 2
+     // where LCPI0_0 is a label defined just before the TBB instruction using
+     // this table.
+     const MCExpr *Expr =
+         MCBinaryExpr::createSub(MBBSymbolExpr, DispatchPC, OutContext);
+     Expr = MCBinaryExpr::createDiv(Expr, MCConstantExpr::create(2, OutContext),
+                                    OutContext);
+     OutStreamer->EmitValue(Expr, OffsetWidth);
+   }
+   // Mark the end of jump table data-in-code region. 32-bit offsets use
+   // actual branch instructions here, so we don't mark those as a data-region
+   // at all.
+   OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+
+   // Make sure the next instruction is 2-byte aligned.
+   EmitAlignment(1);
+ }
+
+ /// Emit the ARM unwinding directive that corresponds to the frame-setup
+ /// instruction \p MI.
+ ///
+ /// Stores are treated as register saves onto the stack; everything else must
+ /// read SP and is treated as an SP adjustment, a frame pointer set-up, or a
+ /// copy of SP into another register. Directives are only emitted when the
+ /// exception handling type is ExceptionHandling::ARM. Any instruction shape
+ /// not listed here is reported as unreachable.
+ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
+   assert(MI->getFlag(MachineInstr::FrameSetup) &&
+       "Only instruction which are involved into frame setup code are allowed");
+
+   MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+   ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+   const MachineFunction &MF = *MI->getParent()->getParent();
+   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+   const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
+
+   unsigned FramePtr = RegInfo->getFrameRegister(MF);
+   unsigned Opc = MI->getOpcode();
+   unsigned SrcReg, DstReg;
+
+   if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) {
+     // Two special cases:
+     // 1) tPUSH does not have src/dst regs.
+     // 2) for Thumb1 code we sometimes materialize the constant via constpool
+     // load. Yes, this is pretty fragile, but for now I don't see better
+     // way... :(
+     SrcReg = DstReg = ARM::SP;
+   } else {
+     SrcReg = MI->getOperand(1).getReg();
+     DstReg = MI->getOperand(0).getReg();
+   }
+
+   // Try to figure out the unwinding opcode out of src / dst regs.
+   if (MI->mayStore()) {
+     // Register saves.
+     assert(DstReg == ARM::SP &&
+            "Only stack pointer as a destination reg is supported");
+
+     SmallVector<unsigned, 4> RegList;
+     // Skip src & dst reg, and pred ops.
+     unsigned StartOp = 2 + 2;
+     // Use all the operands.
+     unsigned NumOffset = 0;
+
+     switch (Opc) {
+     default:
+       MI->dump();
+       llvm_unreachable("Unsupported opcode for unwinding information");
+     case ARM::tPUSH:
+       // Special case here: no src & dst reg, but two extra imp ops.
+       StartOp = 2; NumOffset = 2;
+       // Intentional fall through: tPUSH shares the register-list walk below.
+     case ARM::STMDB_UPD:
+     case ARM::t2STMDB_UPD:
+     case ARM::VSTMDDB_UPD:
+       assert(SrcReg == ARM::SP &&
+              "Only stack pointer as a source reg is supported");
+       for (unsigned i = StartOp, NumOps = MI->getNumOperands() - NumOffset;
+            i != NumOps; ++i) {
+         const MachineOperand &MO = MI->getOperand(i);
+         // Actually, there should never be any impdef stuff here. Skip it
+         // temporary to workaround PR11902.
+         if (MO.isImplicit())
+           continue;
+         RegList.push_back(MO.getReg());
+       }
+       break;
+     case ARM::STR_PRE_IMM:
+     case ARM::STR_PRE_REG:
+     case ARM::t2STR_PRE:
+       assert(MI->getOperand(2).getReg() == ARM::SP &&
+              "Only stack pointer as a source reg is supported");
+       RegList.push_back(SrcReg);
+       break;
+     }
+     if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+       ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+   } else {
+     // Changes of stack / frame pointer.
+     if (SrcReg == ARM::SP) {
+       // Offset is positive when the instruction lowers SP ("sub") and
+       // negative when it raises it ("add").
+       int64_t Offset = 0;
+       switch (Opc) {
+       default:
+         MI->dump();
+         llvm_unreachable("Unsupported opcode for unwinding information");
+       case ARM::MOVr:
+       case ARM::tMOVr:
+         Offset = 0;
+         break;
+       case ARM::ADDri:
+       case ARM::t2ADDri:
+         Offset = -MI->getOperand(2).getImm();
+         break;
+       case ARM::SUBri:
+       case ARM::t2SUBri:
+         Offset = MI->getOperand(2).getImm();
+         break;
+       case ARM::tSUBspi:
+         Offset = MI->getOperand(2).getImm()*4;
+         break;
+       case ARM::tADDspi:
+       case ARM::tADDrSPi:
+         Offset = -MI->getOperand(2).getImm()*4;
+         break;
+       case ARM::tLDRpci: {
+         // Grab the constpool index and check, whether it corresponds to
+         // original or cloned constpool entry.
+         unsigned CPI = MI->getOperand(1).getIndex();
+         const MachineConstantPool *MCP = MF.getConstantPool();
+         if (CPI >= MCP->getConstants().size())
+           CPI = AFI.getOriginalCPIdx(CPI);
+         assert(CPI != -1U && "Invalid constpool index");
+
+         // Derive the actual offset.
+         const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI];
+         assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry");
+         // FIXME: Check for user, it should be "add" instruction!
+         Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue();
+         break;
+       }
+       }
+
+       if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) {
+         if (DstReg == FramePtr && FramePtr != ARM::SP)
+           // Set-up of the frame pointer. Positive values correspond to "add"
+           // instruction.
+           ATS.emitSetFP(FramePtr, ARM::SP, -Offset);
+         else if (DstReg == ARM::SP) {
+           // Change of SP by an offset. Positive values correspond to "sub"
+           // instruction.
+           ATS.emitPad(Offset);
+         } else {
+           // Move of SP to a register. Positive values correspond to an "add"
+           // instruction.
+           ATS.emitMovSP(DstReg, -Offset);
+         }
+       }
+     } else {
+       // The source is not SP: nothing here is supported, whatever the
+       // destination register is.
+       MI->dump();
+       llvm_unreachable("Unsupported opcode for unwinding information");
+     }
+   }
+ }
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "ARMGenMCPseudoLowering.inc"
+
+void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ const DataLayout &DL = getDataLayout();
+ MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+ ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+ // If we just ended a constant pool, mark it as such.
+ if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
+ OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ InConstantPool = false;
+ }
+
+ // Emit unwinding stuff for frame-related instructions
+ if (Subtarget->isTargetEHABICompatible() &&
+ MI->getFlag(MachineInstr::FrameSetup))
+ EmitUnwindingInstruction(MI);
+
+ // Do any auto-generated pseudo lowerings.
+ if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ return;
+
+ assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+ "Pseudo flag setting opcode should be expanded early");
+
+ // Check for manual lowerings.
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass");
+ case ARM::DBG_VALUE: llvm_unreachable("Should be handled by generic printing");
+ case ARM::LEApcrel:
+ case ARM::tLEApcrel:
+ case ARM::t2LEApcrel: {
+ // FIXME: Need to also handle globals and externals
+ MCSymbol *CPISymbol = GetCPISymbol(MI->getOperand(1).getIndex());
+ EmitToStreamer(*OutStreamer, MCInstBuilder(MI->getOpcode() ==
+ ARM::t2LEApcrel ? ARM::t2ADR
+ : (MI->getOpcode() == ARM::tLEApcrel ? ARM::tADR
+ : ARM::ADR))
+ .addReg(MI->getOperand(0).getReg())
+ .addExpr(MCSymbolRefExpr::create(CPISymbol, OutContext))
+ // Add predicate operands.
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(MI->getOperand(3).getReg()));
+ return;
+ }
+ case ARM::LEApcrelJT:
+ case ARM::tLEApcrelJT:
+ case ARM::t2LEApcrelJT: {
+ MCSymbol *JTIPICSymbol =
+ GetARMJTIPICJumpTableLabel(MI->getOperand(1).getIndex());
+ EmitToStreamer(*OutStreamer, MCInstBuilder(MI->getOpcode() ==
+ ARM::t2LEApcrelJT ? ARM::t2ADR
+ : (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR
+ : ARM::ADR))
+ .addReg(MI->getOperand(0).getReg())
+ .addExpr(MCSymbolRefExpr::create(JTIPICSymbol, OutContext))
+ // Add predicate operands.
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(MI->getOperand(3).getReg()));
+ return;
+ }
+ // Darwin call instructions are just normal call instructions with different
+ // clobber semantics (they clobber R9).
+ case ARM::BX_CALL: {
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr)
+ .addReg(ARM::LR)
+ .addReg(ARM::PC)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ // Add 's' bit operand (always reg0 for this)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BX)
+ .addReg(MI->getOperand(0).getReg()));
+ return;
+ }
+ case ARM::tBX_CALL: {
+ if (Subtarget->hasV5TOps())
+ llvm_unreachable("Expected BLX to be selected for v5t+");
+
+ // On ARM v4t, when doing a call from thumb mode, we need to ensure
+ // that the saved lr has its LSB set correctly (the arch doesn't
+ // have blx).
+ // So here we generate a bl to a small jump pad that does bx rN.
+ // The jump pads are emitted after the function body.
+
+ unsigned TReg = MI->getOperand(0).getReg();
+ MCSymbol *TRegSym = nullptr;
+ for (unsigned i = 0, e = ThumbIndirectPads.size(); i < e; i++) {
+ if (ThumbIndirectPads[i].first == TReg) {
+ TRegSym = ThumbIndirectPads[i].second;
+ break;
+ }
+ }
+
+ if (!TRegSym) {
+ TRegSym = OutContext.createTempSymbol();
+ ThumbIndirectPads.push_back(std::make_pair(TReg, TRegSym));
+ }
+
+ // Create a link-saving branch to the Reg Indirect Jump Pad.
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBL)
+ // Predicate comes first here.
+ .addImm(ARMCC::AL).addReg(0)
+ .addExpr(MCSymbolRefExpr::create(TRegSym, OutContext)));
+ return;
+ }
+ case ARM::BMOVPCRX_CALL: {
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr)
+ .addReg(ARM::LR)
+ .addReg(ARM::PC)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ // Add 's' bit operand (always reg0 for this)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr)
+ .addReg(ARM::PC)
+ .addReg(MI->getOperand(0).getReg())
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ // Add 's' bit operand (always reg0 for this)
+ .addReg(0));
+ return;
+ }
+ case ARM::BMOVPCB_CALL: {
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr)
+ .addReg(ARM::LR)
+ .addReg(ARM::PC)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ // Add 's' bit operand (always reg0 for this)
+ .addReg(0));
+
+ const MachineOperand &Op = MI->getOperand(0);
+ const GlobalValue *GV = Op.getGlobal();
+ const unsigned TF = Op.getTargetFlags();
+ MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
+ const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::Bcc)
+ .addExpr(GVSymExpr)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
+ case ARM::MOVi16_ga_pcrel:
+ case ARM::t2MOVi16_ga_pcrel: {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opc == ARM::MOVi16_ga_pcrel? ARM::MOVi16 : ARM::t2MOVi16);
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+
+ unsigned TF = MI->getOperand(1).getTargetFlags();
+ const GlobalValue *GV = MI->getOperand(1).getGlobal();
+ MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
+ const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
+
+ MCSymbol *LabelSym =
+ getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+ MI->getOperand(2).getImm(), OutContext);
+ const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
+ unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
+ const MCExpr *PCRelExpr =
+ ARMMCExpr::createLower16(MCBinaryExpr::createSub(GVSymExpr,
+ MCBinaryExpr::createAdd(LabelSymExpr,
+ MCConstantExpr::create(PCAdj, OutContext),
+ OutContext), OutContext), OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
+
+ // Add predicate operands.
+ TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
+ TmpInst.addOperand(MCOperand::createReg(0));
+ // Add 's' bit operand (always reg0 for this)
+ TmpInst.addOperand(MCOperand::createReg(0));
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case ARM::MOVTi16_ga_pcrel:
+ case ARM::t2MOVTi16_ga_pcrel: {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opc == ARM::MOVTi16_ga_pcrel
+ ? ARM::MOVTi16 : ARM::t2MOVTi16);
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(1).getReg()));
+
+ unsigned TF = MI->getOperand(2).getTargetFlags();
+ const GlobalValue *GV = MI->getOperand(2).getGlobal();
+ MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
+ const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
+
+ MCSymbol *LabelSym =
+ getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+ MI->getOperand(3).getImm(), OutContext);
+ const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
+ unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
+ const MCExpr *PCRelExpr =
+ ARMMCExpr::createUpper16(MCBinaryExpr::createSub(GVSymExpr,
+ MCBinaryExpr::createAdd(LabelSymExpr,
+ MCConstantExpr::create(PCAdj, OutContext),
+ OutContext), OutContext), OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
+ // Add predicate operands.
+ TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
+ TmpInst.addOperand(MCOperand::createReg(0));
+ // Add 's' bit operand (always reg0 for this)
+ TmpInst.addOperand(MCOperand::createReg(0));
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case ARM::tPICADD: {
+ // This is a pseudo op for a label + instruction sequence, which looks like:
+ // LPC0:
+ // add r0, pc
+ // This adds the address of LPC0 to r0.
+
+ // Emit the label.
+ OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+ getFunctionNumber(),
+ MI->getOperand(2).getImm(), OutContext));
+
+ // Form and emit the add.
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(ARM::PC)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
+ case ARM::PICADD: {
+ // This is a pseudo op for a label + instruction sequence, which looks like:
+ // LPC0:
+ // add r0, pc, r0
+ // This adds the address of LPC0 to r0.
+
+ // Emit the label.
+ OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+ getFunctionNumber(),
+ MI->getOperand(2).getImm(), OutContext));
+
+ // Form and emit the add.
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(ARM::PC)
+ .addReg(MI->getOperand(1).getReg())
+ // Add predicate operands.
+ .addImm(MI->getOperand(3).getImm())
+ .addReg(MI->getOperand(4).getReg())
+ // Add 's' bit operand (always reg0 for this)
+ .addReg(0));
+ return;
+ }
+ case ARM::PICSTR:
+ case ARM::PICSTRB:
+ case ARM::PICSTRH:
+ case ARM::PICLDR:
+ case ARM::PICLDRB:
+ case ARM::PICLDRH:
+ case ARM::PICLDRSB:
+ case ARM::PICLDRSH: {
+ // This is a pseudo op for a label + instruction sequence, which looks like:
+ // LPC0:
+ // OP r0, [pc, r0]
+ // The LCP0 label is referenced by a constant pool entry in order to get
+ // a PC-relative address at the ldr instruction.
+
+ // Emit the label.
+ OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+ getFunctionNumber(),
+ MI->getOperand(2).getImm(), OutContext));
+
+ // Form and emit the load
+ unsigned Opcode;
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case ARM::PICSTR: Opcode = ARM::STRrs; break;
+ case ARM::PICSTRB: Opcode = ARM::STRBrs; break;
+ case ARM::PICSTRH: Opcode = ARM::STRH; break;
+ case ARM::PICLDR: Opcode = ARM::LDRrs; break;
+ case ARM::PICLDRB: Opcode = ARM::LDRBrs; break;
+ case ARM::PICLDRH: Opcode = ARM::LDRH; break;
+ case ARM::PICLDRSB: Opcode = ARM::LDRSB; break;
+ case ARM::PICLDRSH: Opcode = ARM::LDRSH; break;
+ }
+ EmitToStreamer(*OutStreamer, MCInstBuilder(Opcode)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(ARM::PC)
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(0)
+ // Add predicate operands.
+ .addImm(MI->getOperand(3).getImm())
+ .addReg(MI->getOperand(4).getReg()));
+
+ return;
+ }
+ case ARM::CONSTPOOL_ENTRY: {
+ /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
+ /// in the function. The first operand is the ID# for this instruction, the
+ /// second is the index into the MachineConstantPool that this is, the third
+ /// is the size in bytes of this constant pool entry.
+ /// The required alignment is specified on the basic block holding this MI.
+ unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+ unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
+
+ // If this is the first entry of the pool, mark it.
+ if (!InConstantPool) {
+ OutStreamer->EmitDataRegion(MCDR_DataRegion);
+ InConstantPool = true;
+ }
+
+ OutStreamer->EmitLabel(GetCPISymbol(LabelId));
+
+ const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+ if (MCPE.isMachineConstantPoolEntry())
+ EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+ else
+ EmitGlobalConstant(DL, MCPE.Val.ConstVal);
+ return;
+ }
+ case ARM::JUMPTABLE_ADDRS:
+ EmitJumpTableAddrs(MI);
+ return;
+ case ARM::JUMPTABLE_INSTS:
+ EmitJumpTableInsts(MI);
+ return;
+ case ARM::JUMPTABLE_TBB:
+ case ARM::JUMPTABLE_TBH:
+ EmitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2);
+ return;
+ case ARM::t2BR_JT: {
+ // Lower and emit the instruction itself, then the jump table following it.
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
+ .addReg(ARM::PC)
+ .addReg(MI->getOperand(0).getReg())
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
+ case ARM::t2TBB_JT:
+ case ARM::t2TBH_JT: {
+ unsigned Opc = MI->getOpcode() == ARM::t2TBB_JT ? ARM::t2TBB : ARM::t2TBH;
+ // Lower and emit the PC label, then the instruction itself.
+ OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
+ case ARM::tTBB_JT:
+ case ARM::tTBH_JT: {
+
+ bool Is8Bit = MI->getOpcode() == ARM::tTBB_JT;
+ unsigned Base = MI->getOperand(0).getReg();
+ unsigned Idx = MI->getOperand(1).getReg();
+ assert(MI->getOperand(1).isKill() && "We need the index register as scratch!");
+
+ // Multiply up idx if necessary.
+ if (!Is8Bit)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri)
+ .addReg(Idx)
+ .addReg(ARM::CPSR)
+ .addReg(Idx)
+ .addImm(1)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ if (Base == ARM::PC) {
+ // TBB [base, idx] =
+ // ADDS idx, idx, base
+ // LDRB idx, [idx, #4] ; or LDRH if TBH
+ // LSLS idx, #1
+ // ADDS pc, pc, idx
+
+ // When using PC as the base, it's important that there is no padding
+ // between the last ADDS and the start of the jump table. The jump table
+ // is 4-byte aligned, so we ensure we're 4 byte aligned here too.
+ //
+ // FIXME: Ideally we could vary the LDRB index based on the padding
+ // between the sequence and jump table, however that relies on MCExprs
+ // for load indexes which are currently not supported.
+ OutStreamer->EmitCodeAlignment(4);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
+ .addReg(Idx)
+ .addReg(Idx)
+ .addReg(Base)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ unsigned Opc = Is8Bit ? ARM::tLDRBi : ARM::tLDRHi;
+ EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
+ .addReg(Idx)
+ .addReg(Idx)
+ .addImm(Is8Bit ? 4 : 2)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ } else {
+ // TBB [base, idx] =
+ // LDRB idx, [base, idx] ; or LDRH if TBH
+ // LSLS idx, #1
+ // ADDS pc, pc, idx
+
+ unsigned Opc = Is8Bit ? ARM::tLDRBr : ARM::tLDRHr;
+ EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
+ .addReg(Idx)
+ .addReg(Base)
+ .addReg(Idx)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ }
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri)
+ .addReg(Idx)
+ .addReg(ARM::CPSR)
+ .addReg(Idx)
+ .addImm(1)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
+ .addReg(ARM::PC)
+ .addReg(ARM::PC)
+ .addReg(Idx)
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
+ case ARM::tBR_JTr:
+ case ARM::BR_JTr: {
+ // Lower and emit the instruction itself, then the jump table following it.
+ // mov pc, target
+ MCInst TmpInst;
+ unsigned Opc = MI->getOpcode() == ARM::BR_JTr ?
+ ARM::MOVr : ARM::tMOVr;
+ TmpInst.setOpcode(Opc);
+ TmpInst.addOperand(MCOperand::createReg(ARM::PC));
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+ // Add predicate operands.
+ TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
+ TmpInst.addOperand(MCOperand::createReg(0));
+ // Add 's' bit operand (always reg0 for this)
+ if (Opc == ARM::MOVr)
+ TmpInst.addOperand(MCOperand::createReg(0));
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case ARM::BR_JTm: {
+ // Lower and emit the instruction itself, then the jump table following it.
+ // ldr pc, target
+ MCInst TmpInst;
+ if (MI->getOperand(1).getReg() == 0) {
+ // literal offset
+ TmpInst.setOpcode(ARM::LDRi12);
+ TmpInst.addOperand(MCOperand::createReg(ARM::PC));
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+ TmpInst.addOperand(MCOperand::createImm(MI->getOperand(2).getImm()));
+ } else {
+ TmpInst.setOpcode(ARM::LDRrs);
+ TmpInst.addOperand(MCOperand::createReg(ARM::PC));
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(1).getReg()));
+ TmpInst.addOperand(MCOperand::createImm(0));
+ }
+ // Add predicate operands.
+ TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
+ TmpInst.addOperand(MCOperand::createReg(0));
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case ARM::BR_JTadd: {
+ // Lower and emit the instruction itself, then the jump table following it.
+ // add pc, target, idx
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr)
+ .addReg(ARM::PC)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ // Add predicate operands.
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ // Add 's' bit operand (always reg0 for this)
+ .addReg(0));
+ return;
+ }
+ case ARM::SPACE:
+ OutStreamer->EmitZeros(MI->getOperand(1).getImm());
+ return;
+ case ARM::TRAP: {
+ // Non-Darwin binutils don't yet support the "trap" mnemonic.
+ // FIXME: Remove this special case when they do.
+ if (!Subtarget->isTargetMachO()) {
+ uint32_t Val = 0xe7ffdefeUL;
+ OutStreamer->AddComment("trap");
+ ATS.emitInst(Val);
+ return;
+ }
+ break;
+ }
+ case ARM::TRAPNaCl: {
+ uint32_t Val = 0xe7fedef0UL;
+ OutStreamer->AddComment("trap");
+ ATS.emitInst(Val);
+ return;
+ }
+ case ARM::tTRAP: {
+ // Non-Darwin binutils don't yet support the "trap" mnemonic.
+ // FIXME: Remove this special case when they do.
+ if (!Subtarget->isTargetMachO()) {
+ uint16_t Val = 0xdefe;
+ OutStreamer->AddComment("trap");
+ ATS.emitInst(Val, 'n');
+ return;
+ }
+ break;
+ }
+ case ARM::t2Int_eh_sjlj_setjmp:
+ case ARM::t2Int_eh_sjlj_setjmp_nofp:
+ case ARM::tInt_eh_sjlj_setjmp: {
+ // Two incoming args: GPR:$src, GPR:$val
+ // mov $val, pc
+ // adds $val, #7
+ // str $val, [$src, #4]
+ // movs r0, #0
+ // b LSJLJEH
+ // movs r0, #1
+ // LSJLJEH:
+ unsigned SrcReg = MI->getOperand(0).getReg();
+ unsigned ValReg = MI->getOperand(1).getReg();
+ MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true);
+ OutStreamer->AddComment("eh_setjmp begin");
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
+ .addReg(ValReg)
+ .addReg(ARM::PC)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDi3)
+ .addReg(ValReg)
+ // 's' bit operand
+ .addReg(ARM::CPSR)
+ .addReg(ValReg)
+ .addImm(7)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tSTRi)
+ .addReg(ValReg)
+ .addReg(SrcReg)
+ // The offset immediate is #4. The operand value is scaled by 4 for the
+ // tSTR instruction.
+ .addImm(1)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8)
+ .addReg(ARM::R0)
+ .addReg(ARM::CPSR)
+ .addImm(0)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ const MCExpr *SymbolExpr = MCSymbolRefExpr::create(Label, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tB)
+ .addExpr(SymbolExpr)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ OutStreamer->AddComment("eh_setjmp end");
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8)
+ .addReg(ARM::R0)
+ .addReg(ARM::CPSR)
+ .addImm(1)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ OutStreamer->EmitLabel(Label);
+ return;
+ }
+
+ case ARM::Int_eh_sjlj_setjmp_nofp:
+ case ARM::Int_eh_sjlj_setjmp: {
+ // Two incoming args: GPR:$src, GPR:$val
+ // add $val, pc, #8
+ // str $val, [$src, #+4]
+ // mov r0, #0
+ // add pc, pc, #0
+ // mov r0, #1
+ unsigned SrcReg = MI->getOperand(0).getReg();
+ unsigned ValReg = MI->getOperand(1).getReg();
+
+ OutStreamer->AddComment("eh_setjmp begin");
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri)
+ .addReg(ValReg)
+ .addReg(ARM::PC)
+ .addImm(8)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ // 's' bit operand (always reg0 for this).
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::STRi12)
+ .addReg(ValReg)
+ .addReg(SrcReg)
+ .addImm(4)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVi)
+ .addReg(ARM::R0)
+ .addImm(0)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ // 's' bit operand (always reg0 for this).
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri)
+ .addReg(ARM::PC)
+ .addReg(ARM::PC)
+ .addImm(0)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ // 's' bit operand (always reg0 for this).
+ .addReg(0));
+
+ OutStreamer->AddComment("eh_setjmp end");
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVi)
+ .addReg(ARM::R0)
+ .addImm(1)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ // 's' bit operand (always reg0 for this).
+ .addReg(0));
+ return;
+ }
+ case ARM::Int_eh_sjlj_longjmp: {
+ // ldr sp, [$src, #8]
+ // ldr $scratch, [$src, #4]
+ // ldr r7, [$src]
+ // bx $scratch
+ unsigned SrcReg = MI->getOperand(0).getReg();
+ unsigned ScratchReg = MI->getOperand(1).getReg();
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
+ .addReg(ARM::SP)
+ .addReg(SrcReg)
+ .addImm(8)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
+ .addReg(ScratchReg)
+ .addReg(SrcReg)
+ .addImm(4)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
+ .addReg(ARM::R7)
+ .addReg(SrcReg)
+ .addImm(0)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BX)
+ .addReg(ScratchReg)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
+ case ARM::tInt_eh_sjlj_longjmp: {
+ // ldr $scratch, [$src, #8]
+ // mov sp, $scratch
+ // ldr $scratch, [$src, #4]
+ // ldr r7, [$src]
+ // bx $scratch
+ unsigned SrcReg = MI->getOperand(0).getReg();
+ unsigned ScratchReg = MI->getOperand(1).getReg();
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
+ .addReg(ScratchReg)
+ .addReg(SrcReg)
+ // The offset immediate is #8. The operand value is scaled by 4 for the
+ // tLDR instruction.
+ .addImm(2)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
+ .addReg(ARM::SP)
+ .addReg(ScratchReg)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
+ .addReg(ScratchReg)
+ .addReg(SrcReg)
+ .addImm(1)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
+ .addReg(ARM::R7)
+ .addReg(SrcReg)
+ .addImm(0)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX)
+ .addReg(ScratchReg)
+ // Predicate.
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
+ case ARM::tInt_WIN_eh_sjlj_longjmp: {
+ // ldr.w r11, [$src, #0]
+ // ldr.w sp, [$src, #8]
+ // ldr.w pc, [$src, #4]
+
+ unsigned SrcReg = MI->getOperand(0).getReg();
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+ .addReg(ARM::R11)
+ .addReg(SrcReg)
+ .addImm(0)
+ // Predicate
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+ .addReg(ARM::SP)
+ .addReg(SrcReg)
+ .addImm(8)
+ // Predicate
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+ .addReg(ARM::PC)
+ .addReg(SrcReg)
+ .addImm(4)
+ // Predicate
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
+ case ARM::PATCHABLE_FUNCTION_ENTER:
+ LowerPATCHABLE_FUNCTION_ENTER(*MI);
+ return;
+ case ARM::PATCHABLE_FUNCTION_EXIT:
+ LowerPATCHABLE_FUNCTION_EXIT(*MI);
+ return;
+ case ARM::PATCHABLE_TAIL_CALL:
+ LowerPATCHABLE_TAIL_CALL(*MI);
+ return;
+ }
+
+ MCInst TmpInst;
+ LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
+
+ EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization: instantiate a RegisterAsmPrinter for each of
+// the four ARM/Thumb targets (little- and big-endian variants).
+extern "C" void LLVMInitializeARMAsmPrinter() {
+  RegisterAsmPrinter<ARMAsmPrinter> ArmLE(getTheARMLETarget());
+  RegisterAsmPrinter<ARMAsmPrinter> ArmBE(getTheARMBETarget());
+  RegisterAsmPrinter<ARMAsmPrinter> ThumbLE(getTheThumbLETarget());
+  RegisterAsmPrinter<ARMAsmPrinter> ThumbBE(getTheThumbBETarget());
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
new file mode 100644
index 000000000000..ce0b04d56d9e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -0,0 +1,161 @@
+//===-- ARMAsmPrinter.h - ARM implementation of AsmPrinter ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H
+#define LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H
+
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class ARMFunctionInfo;
+class MCOperand;
+class MachineConstantPool;
+class MachineOperand;
+class MCSymbol;
+
+namespace ARM {
+ /// DW_ISA - ISA identifiers returned by ARMAsmPrinter::getISAEncoding() to
+ /// distinguish Thumb functions from ARM functions.
+ enum DW_ISA {
+ // Value reported when the target triple is classified as Thumb.
+ DW_ISA_ARM_thumb = 1,
+ // Value reported otherwise (ARM mode).
+ DW_ISA_ARM_arm = 2
+ };
+}
+
+/// ARMAsmPrinter - ARM implementation of AsmPrinter. Declares the per-module
+/// and per-function state used while printing, the AsmPrinter hooks it
+/// overrides, and the ARM-specific lowering helpers.
+class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
+
+ /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+ /// make the right decision when printing asm code for different targets.
+ const ARMSubtarget *Subtarget;
+
+ /// AFI - Keep a pointer to ARMFunctionInfo for the current
+ /// MachineFunction.
+ ARMFunctionInfo *AFI;
+
+ /// MCP - Keep a pointer to constantpool entries of the current
+ /// MachineFunction.
+ const MachineConstantPool *MCP;
+
+ /// InConstantPool - Maintain state when emitting a sequence of constant
+ /// pool entries so we can properly mark them as data regions.
+ bool InConstantPool;
+
+ /// ThumbIndirectPads - These maintain a per-function list of jump pad
+ /// labels used for ARMv4t thumb code to make register indirect calls.
+ SmallVector<std::pair<unsigned, MCSymbol*>, 4> ThumbIndirectPads;
+
+ /// OptimizationGoals - Maintain a combined optimization goal for all
+ /// functions in a module: one of Tag_ABI_optimization_goals values,
+ /// -1 if uninitialized, 0 if conflicting goals
+ int OptimizationGoals;
+
+ /// List of globals that have had their storage promoted to a constant
+ /// pool. This lives between calls to runOnMachineFunction and collects
+ /// data from every MachineFunction. It is used during doFinalization
+ /// when all non-function globals are emitted.
+ SmallPtrSet<const GlobalVariable*,2> PromotedGlobals;
+ /// Set of globals in PromotedGlobals that we've emitted labels for.
+ /// We need to emit labels even for promoted globals so that DWARF
+ /// debug info can link properly.
+ SmallPtrSet<const GlobalVariable*,2> EmittedPromotedGlobalLabels;
+
+public:
+ explicit ARMAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer);
+
+ /// getPassName - Human-readable name of this pass.
+ StringRef getPassName() const override {
+ return "ARM Assembly Printer";
+ }
+
+ void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
+
+ // Operand-printing hooks overridden from AsmPrinter.
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+
+ void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+ const MCSubtargetInfo *EndInfo) const override;
+
+ // Jump table emission helpers and the main per-instruction entry points.
+ void EmitJumpTableAddrs(const MachineInstr *MI);
+ void EmitJumpTableInsts(const MachineInstr *MI);
+ void EmitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth);
+ void EmitInstruction(const MachineInstr *MI) override;
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ void EmitConstantPool() override {
+ // Intentionally empty: constant pools are emitted by custom code rather
+ // than through this hook.
+ }
+ void EmitFunctionBodyEnd() override;
+ void EmitFunctionEntryLabel() override;
+ void EmitStartOfAsmFile(Module &M) override;
+ void EmitEndOfAsmFile(Module &M) override;
+ void EmitXXStructor(const DataLayout &DL, const Constant *CV) override;
+ void EmitGlobalVariable(const GlobalVariable *GV) override;
+
+ // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+
+ //===------------------------------------------------------------------===//
+ // XRay implementation
+ //===------------------------------------------------------------------===//
+public:
+ // XRay-specific lowering for ARM.
+ void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
+ void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
+ void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+ // Helper function that emits the XRay sleds we've collected for a particular
+ // function.
+ void EmitXRayTable();
+
+private:
+ void EmitSled(const MachineInstr &MI, SledKind Kind);
+
+ // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+ void emitAttributes();
+
+ // Generic helper used to emit e.g. ARMv5 mul pseudos
+ void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc);
+
+ void EmitUnwindingInstruction(const MachineInstr *MI);
+
+ // emitPseudoExpansionLowering - tblgen'erated.
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+ const MachineInstr *MI);
+
+public:
+ /// getISAEncoding - Returns 0 when the target triple's object format is not
+ /// Mach-O. Otherwise returns ARM::DW_ISA_ARM_thumb when the triple's arch is
+ /// thumb/thumbeb or its sub-arch is v7m/v6m, and ARM::DW_ISA_ARM_arm in all
+ /// other cases.
+ unsigned getISAEncoding() override {
+ // ARM/Darwin adds ISA to the DWARF info for each function.
+ const Triple &TT = TM.getTargetTriple();
+ if (!TT.isOSBinFormatMachO())
+ return 0;
+ bool isThumb = TT.getArch() == Triple::thumb ||
+ TT.getArch() == Triple::thumbeb ||
+ TT.getSubArch() == Triple::ARMSubArch_v7m ||
+ TT.getSubArch() == Triple::ARMSubArch_v6m;
+ return isThumb ? ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm;
+ }
+
+private:
+ // Symbol lookup helpers used when lowering operands.
+ MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
+ MCSymbol *GetARMJTIPICJumpTableLabel(unsigned uid) const;
+
+ MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags);
+
+public:
+ /// EmitMachineConstantPoolValue - Print a machine constantpool value to
+ /// the .s file.
+ void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
new file mode 100644
index 000000000000..70a3246e34f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -0,0 +1,4707 @@
+//===-- ARMBaseInstrInfo.cpp - ARM Instruction Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMFeatures.h"
+#include "ARMHazardRecognizer.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-instrinfo"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "ARMGenInstrInfo.inc"
+
+static cl::opt<bool>
+EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
+ cl::desc("Enable ARM 2-addr to 3-addr conv"));
+
+/// ARM_MLxEntry - Record information about MLA / MLS instructions.
+/// Each entry ties one multiply-accumulate style opcode to the separate
+/// multiply and add / sub opcodes it can be expanded into.
+struct ARM_MLxEntry {
+ uint16_t MLxOpc; // MLA / MLS opcode
+ uint16_t MulOpc; // Expanded multiplication opcode
+ uint16_t AddSubOpc; // Expanded add / sub opcode
+ bool NegAcc; // True if the acc is negated before the add / sub.
+ bool HasLane; // True if instruction has an extra "lane" operand.
+};
+
+// Table of all known MLx opcodes and their expansions. The ARMBaseInstrInfo
+// constructor indexes this table by MLxOpc (and rejects duplicate MLxOpc
+// entries), and records every MulOpc / AddSubOpc as a hazard opcode.
+static const ARM_MLxEntry ARM_MLxTable[] = {
+ // MLxOpc, MulOpc, AddSubOpc, NegAcc, HasLane
+ // fp scalar ops
+ { ARM::VMLAS, ARM::VMULS, ARM::VADDS, false, false },
+ { ARM::VMLSS, ARM::VMULS, ARM::VSUBS, false, false },
+ { ARM::VMLAD, ARM::VMULD, ARM::VADDD, false, false },
+ { ARM::VMLSD, ARM::VMULD, ARM::VSUBD, false, false },
+ { ARM::VNMLAS, ARM::VNMULS, ARM::VSUBS, true, false },
+ { ARM::VNMLSS, ARM::VMULS, ARM::VSUBS, true, false },
+ { ARM::VNMLAD, ARM::VNMULD, ARM::VSUBD, true, false },
+ { ARM::VNMLSD, ARM::VMULD, ARM::VSUBD, true, false },
+
+ // fp SIMD ops
+ { ARM::VMLAfd, ARM::VMULfd, ARM::VADDfd, false, false },
+ { ARM::VMLSfd, ARM::VMULfd, ARM::VSUBfd, false, false },
+ { ARM::VMLAfq, ARM::VMULfq, ARM::VADDfq, false, false },
+ { ARM::VMLSfq, ARM::VMULfq, ARM::VSUBfq, false, false },
+ // Lane variants (HasLane == true).
+ { ARM::VMLAslfd, ARM::VMULslfd, ARM::VADDfd, false, true },
+ { ARM::VMLSslfd, ARM::VMULslfd, ARM::VSUBfd, false, true },
+ { ARM::VMLAslfq, ARM::VMULslfq, ARM::VADDfq, false, true },
+ { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true },
+};
+
+/// Builds the MLx lookup structures from ARM_MLxTable: MLxEntryMap maps each
+/// MLxOpc to its table index, and MLxHazardOpcodes collects every expanded
+/// multiply and add / sub opcode.
+ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
+    : ARMGenInstrInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+      Subtarget(STI) {
+  const unsigned NumEntries = array_lengthof(ARM_MLxTable);
+  for (unsigned Idx = 0; Idx != NumEntries; ++Idx) {
+    const ARM_MLxEntry &Entry = ARM_MLxTable[Idx];
+
+    // Each MLx opcode may appear in the table only once.
+    const bool Inserted =
+        MLxEntryMap.insert(std::make_pair(Entry.MLxOpc, Idx)).second;
+    if (!Inserted)
+      llvm_unreachable("Duplicated entries?");
+
+    MLxHazardOpcodes.insert(Entry.AddSubOpc);
+    MLxHazardOpcodes.insert(Entry.MulOpc);
+  }
+}
+
+// Prepass ARM scheduling uses a ScoreboardHazardRecognizer when
+// usePreRAHazardRecognizer() says so; otherwise defer to the TargetInstrInfo
+// implementation.
+ScheduleHazardRecognizer *
+ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+                                               const ScheduleDAG *DAG) const {
+  if (!usePreRAHazardRecognizer())
+    return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
+
+  // The scoreboard is driven by the ARM subtarget's itinerary data.
+  const ARMSubtarget *ARMSTI = static_cast<const ARMSubtarget *>(STI);
+  const InstrItineraryData *Itineraries = ARMSTI->getInstrItineraryData();
+  return new ScoreboardHazardRecognizer(Itineraries, DAG, "pre-RA-sched");
+}
+
+/// Post-RA hazard recognizer: an ARMHazardRecognizer for Thumb2 or VFP2
+/// subtargets, otherwise whatever TargetInstrInfo provides.
+ScheduleHazardRecognizer *ARMBaseInstrInfo::
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                   const ScheduleDAG *DAG) const {
+  if (!Subtarget.isThumb2() && !Subtarget.hasVFP2())
+    return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+
+  return static_cast<ScheduleHazardRecognizer *>(
+      new ARMHazardRecognizer(II, DAG));
+}
+
+/// convertToThreeAddress - Try to split a pre- or post-indexed load/store
+/// into an un-indexed memory instruction plus a separate add / sub that
+/// performs the base-register update.
+///
+/// Returns nullptr (and changes nothing) when the -enable-arm-3-addr-conv
+/// option is off, when MI is not pre/post-indexed, when no un-indexed opcode
+/// exists for it, or when an AddrMode2 immediate cannot be encoded as a
+/// so_imm. On success the two new instructions are inserted immediately
+/// before MI, LiveVariables (if provided) is updated, and the later of the
+/// two new instructions is returned. MI itself is not erased here.
+MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
+ MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
+ // FIXME: Thumb2 support.
+
+ if (!EnableARM3Addr)
+ return nullptr;
+
+ MachineFunction &MF = *MI.getParent()->getParent();
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+ bool isPre = false;
+ // Only pre- and post-indexed instructions are candidates.
+ switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
+ default: return nullptr;
+ case ARMII::IndexModePre:
+ isPre = true;
+ break;
+ case ARMII::IndexModePost:
+ break;
+ }
+
+ // Try splitting an indexed load/store to an un-indexed one plus an add/sub
+ // operation.
+ unsigned MemOpc = getUnindexedOpcode(MI.getOpcode());
+ if (MemOpc == 0)
+ return nullptr;
+
+ MachineInstr *UpdateMI = nullptr;
+ MachineInstr *MemMI = nullptr;
+ unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+ const MCInstrDesc &MCID = MI.getDesc();
+ unsigned NumOps = MCID.getNumOperands();
+ bool isLoad = !MI.mayStore();
+ // The write-back register is operand 1 for loads and operand 0 for stores;
+ // the offset register, offset immediate and predicate are the last three
+ // operands counted by the instruction description.
+ const MachineOperand &WB = isLoad ? MI.getOperand(1) : MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(2);
+ const MachineOperand &Offset = MI.getOperand(NumOps - 3);
+ unsigned WBReg = WB.getReg();
+ unsigned BaseReg = Base.getReg();
+ unsigned OffReg = Offset.getReg();
+ unsigned OffImm = MI.getOperand(NumOps - 2).getImm();
+ ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm();
+ // Build the add / sub that computes the updated base into WBReg.
+ switch (AddrMode) {
+ default: llvm_unreachable("Unknown indexed op!");
+ case ARMII::AddrMode2: {
+ bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+ unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+ if (OffReg == 0) {
+ if (ARM_AM::getSOImmVal(Amt) == -1)
+ // Can't encode it in a so_imm operand. This transformation will
+ // add more than 1 instruction. Abandon!
+ return nullptr;
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+ get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+ .addReg(BaseReg)
+ .addImm(Amt)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
+ } else if (Amt != 0) {
+ // Register offset with a non-zero shift amount: use the shifted-register
+ // form of add / sub.
+ ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+ unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+ get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg)
+ .addReg(BaseReg)
+ .addReg(OffReg)
+ .addReg(0)
+ .addImm(SOOpc)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
+ } else
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+ get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+ .addReg(BaseReg)
+ .addReg(OffReg)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
+ break;
+ }
+ case ARMII::AddrMode3 : {
+ bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
+ unsigned Amt = ARM_AM::getAM3Offset(OffImm);
+ if (OffReg == 0)
+ // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+ get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+ .addReg(BaseReg)
+ .addImm(Amt)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
+ else
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+ get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+ .addReg(BaseReg)
+ .addReg(OffReg)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
+ break;
+ }
+ }
+
+ // NewMIs holds the two new instructions in reverse program order:
+ // NewMIs[1] is inserted first and NewMIs[0] after it (see the inserts at the
+ // end of the function).
+ std::vector<MachineInstr*> NewMIs;
+ if (isPre) {
+ // Pre-indexed: the update executes first and the memory access then uses
+ // the updated base (WBReg).
+ if (isLoad)
+ MemMI =
+ BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg())
+ .addReg(WBReg)
+ .addImm(0)
+ .addImm(Pred);
+ else
+ MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc))
+ .addReg(MI.getOperand(1).getReg())
+ .addReg(WBReg)
+ .addReg(0)
+ .addImm(0)
+ .addImm(Pred);
+ NewMIs.push_back(MemMI);
+ NewMIs.push_back(UpdateMI);
+ } else {
+ // Post-indexed: the memory access uses the original base and the update
+ // executes afterwards.
+ if (isLoad)
+ MemMI =
+ BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg())
+ .addReg(BaseReg)
+ .addImm(0)
+ .addImm(Pred);
+ else
+ MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc))
+ .addReg(MI.getOperand(1).getReg())
+ .addReg(BaseReg)
+ .addReg(0)
+ .addImm(0)
+ .addImm(Pred);
+ // Carry a dead write-back over to the new update instruction's def.
+ if (WB.isDead())
+ UpdateMI->getOperand(0).setIsDead();
+ NewMIs.push_back(UpdateMI);
+ NewMIs.push_back(MemMI);
+ }
+
+ // Transfer LiveVariables states, kill / dead info.
+ if (LV) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+
+ LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+ if (MO.isDef()) {
+ // The write-back register is now defined by UpdateMI; any other def
+ // belongs to MemMI.
+ MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+ if (MO.isDead())
+ LV->addVirtualRegisterDead(Reg, *NewMI);
+ }
+ if (MO.isUse() && MO.isKill()) {
+ for (unsigned j = 0; j < 2; ++j) {
+ // Look at the two new MI's in reverse order.
+ MachineInstr *NewMI = NewMIs[j];
+ if (!NewMI->readsRegister(Reg))
+ continue;
+ LV->addVirtualRegisterKilled(Reg, *NewMI);
+ if (VI.removeKill(MI))
+ VI.Kills.push_back(NewMI);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // Insert both instructions before MI: NewMIs[1] first, then NewMIs[0].
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ MFI->insert(MBBI, NewMIs[1]);
+ MFI->insert(MBBI, NewMIs[0]);
+ return NewMIs[0];
+}
+
+// Branch analysis.
+//
+// Walks the terminators of MBB from the end backwards. Returns false when the
+// block's branching was understood: TBB / FBB receive the branch targets
+// (nullptr when absent) and Cond receives the two condition operands of a
+// conditional branch. Returns true when the terminators cannot be analyzed.
+// When AllowModify is set, instructions following an unpredicated
+// unconditional branch, indirect branch, jump-table branch or return are
+// erased.
+bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ TBB = nullptr;
+ FBB = nullptr;
+
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin())
+ return false; // Empty blocks are easy.
+ --I;
+
+ // Walk backwards from the end of the basic block until the branch is
+ // analyzed or we give up.
+ while (isPredicated(*I) || I->isTerminator() || I->isDebugValue()) {
+
+ // Flag to be raised on unanalyzeable instructions. This is useful in cases
+ // where we want to clean up on the end of the basic block before we bail
+ // out.
+ bool CantAnalyze = false;
+
+ // Skip over DEBUG values and predicated nonterminators.
+ while (I->isDebugValue() || !I->isTerminator()) {
+ if (I == MBB.begin())
+ return false;
+ --I;
+ }
+
+ if (isIndirectBranchOpcode(I->getOpcode()) ||
+ isJumpTableBranchOpcode(I->getOpcode())) {
+ // Indirect branches and jump tables can't be analyzed, but we still want
+ // to clean up any instructions at the tail of the basic block.
+ CantAnalyze = true;
+ } else if (isUncondBranchOpcode(I->getOpcode())) {
+ TBB = I->getOperand(0).getMBB();
+ } else if (isCondBranchOpcode(I->getOpcode())) {
+ // Bail out if we encounter multiple conditional branches.
+ if (!Cond.empty())
+ return true;
+
+ // A previously seen unconditional branch target (if any) becomes the
+ // false destination; this branch supplies the true destination.
+ assert(!FBB && "FBB should have been null.");
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(I->getOperand(1));
+ Cond.push_back(I->getOperand(2));
+ } else if (I->isReturn()) {
+ // Returns can't be analyzed, but we should run cleanup.
+ CantAnalyze = !isPredicated(*I);
+ } else {
+ // We encountered other unrecognized terminator. Bail out immediately.
+ return true;
+ }
+
+ // Cleanup code - to be run for unpredicated unconditional branches and
+ // returns.
+ if (!isPredicated(*I) &&
+ (isUncondBranchOpcode(I->getOpcode()) ||
+ isIndirectBranchOpcode(I->getOpcode()) ||
+ isJumpTableBranchOpcode(I->getOpcode()) ||
+ I->isReturn())) {
+ // Forget any previous condition branch information - it no longer applies.
+ Cond.clear();
+ FBB = nullptr;
+
+ // If we can modify the function, delete everything below this
+ // unconditional branch.
+ if (AllowModify) {
+ MachineBasicBlock::iterator DI = std::next(I);
+ while (DI != MBB.end()) {
+ // Advance before erasing so DI never refers to a deleted instruction.
+ MachineInstr &InstToDelete = *DI;
+ ++DI;
+ InstToDelete.eraseFromParent();
+ }
+ }
+ }
+
+ if (CantAnalyze)
+ return true;
+
+ if (I == MBB.begin())
+ return false;
+
+ --I;
+ }
+
+ // We made it past the terminators without bailing out - we must have
+ // analyzed this branch successfully.
+ return false;
+}
+
+
+/// Erases the trailing branch of MBB and, if the instruction that is then
+/// last is a conditional branch, erases that too. Returns the number of
+/// branches removed (0, 1 or 2). BytesRemoved must be null.
+unsigned ARMBaseInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                        int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr();
+  if (Last == MBB.end())
+    return 0;
+
+  // Nothing to do unless the block ends in a direct branch.
+  const unsigned LastOpc = Last->getOpcode();
+  const bool EndsInBranch =
+      isUncondBranchOpcode(LastOpc) || isCondBranchOpcode(LastOpc);
+  if (!EndsInBranch)
+    return 0;
+
+  // Remove the branch.
+  Last->eraseFromParent();
+
+  // Check whether a conditional branch now ends the block.
+  MachineBasicBlock::iterator Prev = MBB.end();
+  if (Prev == MBB.begin())
+    return 1;
+  --Prev;
+  if (!isCondBranchOpcode(Prev->getOpcode()))
+    return 1;
+
+  // Remove the branch.
+  Prev->eraseFromParent();
+  return 2;
+}
+
+/// Appends branch instructions to MBB: an unconditional branch to TBB when
+/// Cond is empty, a conditional branch to TBB when Cond is set, followed by
+/// an unconditional branch to FBB when FBB is non-null. Returns the number of
+/// instructions inserted (1 or 2). BytesAdded must be null.
+unsigned ARMBaseInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                        MachineBasicBlock *TBB,
+                                        MachineBasicBlock *FBB,
+                                        ArrayRef<MachineOperand> Cond,
+                                        const DebugLoc &DL,
+                                        int *BytesAdded) const {
+  assert(!BytesAdded && "code size not handled");
+  ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
+
+  // Pick the branch opcodes matching the function's instruction set.
+  int BOpc;
+  int BccOpc;
+  if (!AFI->isThumbFunction()) {
+    BOpc = ARM::B;
+    BccOpc = ARM::Bcc;
+  } else if (AFI->isThumb2Function()) {
+    BOpc = ARM::t2B;
+    BccOpc = ARM::t2Bcc;
+  } else {
+    BOpc = ARM::tB;
+    BccOpc = ARM::tBcc;
+  }
+  const bool IsThumb = AFI->isThumbFunction() || AFI->isThumb2Function();
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "ARM branch conditions have two components!");
+
+  // Unconditional branch; the Thumb forms carry explicit predicate operands.
+  auto EmitUncondBranch = [&](MachineBasicBlock *Dest) {
+    if (IsThumb)
+      BuildMI(&MBB, DL, get(BOpc)).addMBB(Dest).addImm(ARMCC::AL).addReg(0);
+    else
+      BuildMI(&MBB, DL, get(BOpc)).addMBB(Dest);
+  };
+
+  // For conditional branches, we use addOperand to preserve CPSR flags.
+  auto EmitCondBranch = [&]() {
+    BuildMI(&MBB, DL, get(BccOpc))
+        .addMBB(TBB)
+        .addImm(Cond[0].getImm())
+        .addOperand(Cond[1]);
+  };
+
+  if (FBB) {
+    // Two-way conditional branch.
+    EmitCondBranch();
+    EmitUncondBranch(FBB);
+    return 2;
+  }
+
+  if (Cond.empty())
+    EmitUncondBranch(TBB);
+  else
+    EmitCondBranch();
+  return 1;
+}
+
+/// Replaces the condition code stored in Cond[0] with its opposite.
+/// Always returns false.
+bool ARMBaseInstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  MachineOperand &CCOperand = Cond[0];
+  const int RawCC = static_cast<int>(CCOperand.getImm());
+  const ARMCC::CondCodes Current = static_cast<ARMCC::CondCodes>(RawCC);
+  CCOperand.setImm(ARMCC::getOppositeCondition(Current));
+  return false;
+}
+
+/// Returns true if MI carries a predicate operand whose condition is not AL.
+/// For a bundle, returns true if any instruction inside the bundle does.
+bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const {
+  auto HasNonALPredicate = [](const MachineInstr &Inst) {
+    const int PredIdx = Inst.findFirstPredOperandIdx();
+    return PredIdx != -1 && Inst.getOperand(PredIdx).getImm() != ARMCC::AL;
+  };
+
+  if (!MI.isBundle())
+    return HasNonALPredicate(MI);
+
+  // Scan the instructions that follow the bundle header and belong to it.
+  MachineBasicBlock::const_instr_iterator It = MI.getIterator();
+  MachineBasicBlock::const_instr_iterator End = MI.getParent()->instr_end();
+  for (++It; It != End && It->isInsideBundle(); ++It) {
+    if (HasNonALPredicate(*It))
+      return true;
+  }
+  return false;
+}
+
+/// Applies predicate Pred (condition-code immediate, predicate register) to
+/// MI. Unconditional branches are rewritten to the matching conditional
+/// branch opcode with the predicate appended; other instructions have their
+/// existing predicate operands overwritten. Returns false when MI has no
+/// predicate operand to update.
+bool ARMBaseInstrInfo::PredicateInstruction(
+    MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
+  const unsigned Opcode = MI.getOpcode();
+  if (isUncondBranchOpcode(Opcode)) {
+    MI.setDesc(get(getMatchingCondBranchOpcode(Opcode)));
+    MachineFunction &MF = *MI.getParent()->getParent();
+    MachineInstrBuilder(MF, MI)
+        .addImm(Pred[0].getImm())
+        .addReg(Pred[1].getReg());
+    return true;
+  }
+
+  const int PredIdx = MI.findFirstPredOperandIdx();
+  if (PredIdx == -1)
+    return false;
+
+  // The condition code is followed directly by the predicate register.
+  MI.getOperand(PredIdx).setImm(Pred[0].getImm());
+  MI.getOperand(PredIdx + 1).setReg(Pred[1].getReg());
+  return true;
+}
+
+/// Returns true if the condition in Pred1 subsumes the condition in Pred2:
+/// the codes are equal, Pred1 is AL, or Pred1/Pred2 is one of HS/HI, LS/LO,
+/// LS/EQ, GE/GT, LE/LT. Predicates with more than two operands never match.
+bool ARMBaseInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+                                         ArrayRef<MachineOperand> Pred2) const {
+  if (Pred1.size() > 2 || Pred2.size() > 2)
+    return false;
+
+  const ARMCC::CondCodes Outer =
+      static_cast<ARMCC::CondCodes>(Pred1[0].getImm());
+  const ARMCC::CondCodes Inner =
+      static_cast<ARMCC::CondCodes>(Pred2[0].getImm());
+
+  if (Outer == Inner || Outer == ARMCC::AL)
+    return true;
+  if (Outer == ARMCC::HS)
+    return Inner == ARMCC::HI;
+  if (Outer == ARMCC::LS)
+    return Inner == ARMCC::LO || Inner == ARMCC::EQ;
+  if (Outer == ARMCC::GE)
+    return Inner == ARMCC::GT;
+  if (Outer == ARMCC::LE)
+    return Inner == ARMCC::LT;
+  return false;
+}
+
+/// Appends to Pred every operand of MI that either defines CPSR or is a
+/// register mask clobbering CPSR. Returns true if at least one was found.
+bool ARMBaseInstrInfo::DefinesPredicate(
+    MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
+  bool SawCPSRDef = false;
+  for (const MachineOperand &MO : MI.operands()) {
+    const bool ClobbersViaMask =
+        MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR);
+    const bool DefinesDirectly =
+        MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR;
+    if (!ClobbersViaMask && !DefinesDirectly)
+      continue;
+    Pred.push_back(MO);
+    SawCPSRDef = true;
+  }
+  return SawCPSRDef;
+}
+
+/// Returns true if MI has a CPSR register operand that is a def and is not
+/// marked dead.
+static bool isCPSRDefined(const MachineInstr *MI) {
+  for (unsigned OpIdx = 0, NumOps = MI->getNumOperands(); OpIdx != NumOps;
+       ++OpIdx) {
+    const MachineOperand &MO = MI->getOperand(OpIdx);
+    if (!MO.isReg() || MO.getReg() != ARM::CPSR)
+      continue;
+    if (MO.isDef() && !MO.isDead())
+      return true;
+  }
+  return false;
+}
+
+/// Returns whether MI may be placed in an IT block. Every opcode is eligible
+/// except the ones listed below, which are eligible only when they do not
+/// produce a live definition of CPSR.
+static bool isEligibleForITBlock(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case ARM::tADC:   // ADC (register) T1
+  case ARM::tADDi3: // ADD (immediate) T1
+  case ARM::tADDi8: // ADD (immediate) T2
+  case ARM::tADDrr: // ADD (register) T1
+  case ARM::tAND:   // AND (register) T1
+  case ARM::tASRri: // ASR (immediate) T1
+  case ARM::tASRrr: // ASR (register) T1
+  case ARM::tBIC:   // BIC (register) T1
+  case ARM::tEOR:   // EOR (register) T1
+  case ARM::tLSLri: // LSL (immediate) T1
+  case ARM::tLSLrr: // LSL (register) T1
+  case ARM::tLSRri: // LSR (immediate) T1
+  case ARM::tLSRrr: // LSR (register) T1
+  case ARM::tMUL:   // MUL T1
+  case ARM::tMVN:   // MVN (register) T1
+  case ARM::tORR:   // ORR (register) T1
+  case ARM::tROR:   // ROR (register) T1
+  case ARM::tRSB:   // RSB (immediate) T1
+  case ARM::tSBC:   // SBC (register) T1
+  case ARM::tSUBi3: // SUB (immediate) T1
+  case ARM::tSUBi8: // SUB (immediate) T2
+  case ARM::tSUBrr: // SUB (register) T1
+    break;
+  default:
+    return true;
+  }
+
+  // Restricted opcode: only acceptable if it leaves CPSR alone.
+  return !isCPSRDefined(MI);
+}
+
+/// Decide whether \p MI may be predicated on this target.
+///
+/// The instruction must be generically predicable, must not be a bundle, and
+/// must pass isEligibleForITBlock. On top of that:
+///  - in a Thumb2 function with restrictIT() set, the answer is whatever
+///    isV8EligibleForIT says;
+///  - outside Thumb2 functions, NEON-domain instructions are rejected.
+bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
+  if (!MI.isPredicable() || MI.isBundle() || !isEligibleForITBlock(&MI))
+    return false;
+
+  MachineFunction *ParentFn = MI.getParent()->getParent();
+  ARMFunctionInfo *FuncInfo = ParentFn->getInfo<ARMFunctionInfo>();
+
+  // Not a Thumb2 function: only the NEON domain is excluded.
+  if (!FuncInfo->isThumb2Function())
+    return (MI.getDesc().TSFlags & ARMII::DomainMask) != ARMII::DomainNEON;
+
+  // Thumb2: the extra IT check only applies when IT blocks are restricted.
+  return !getSubtarget().restrictIT() || isV8EligibleForIT(&MI);
+}
+
+namespace llvm {
+/// MachineInstr specialization: return true when every non-undef definition
+/// of CPSR on \p MI is marked dead (vacuously true when there is none).
+template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
+  for (const MachineOperand &Op : MI->operands()) {
+    // Only non-undef register definitions are of interest.
+    const bool IsRealDef = Op.isReg() && !Op.isUndef() && !Op.isUse();
+    if (IsRealDef && Op.getReg() == ARM::CPSR && !Op.isDead())
+      return false; // found a live CPSR definition
+  }
+  // all definitions of CPSR are dead
+  return true;
+}
+} // namespace llvm
+
+/// Report how many bytes \p MI occupies.
+///
+/// A non-zero size recorded in the instruction descriptor wins. Otherwise the
+/// size is derived per opcode: inline asm is measured, a bundle is the sum of
+/// its members, several pseudos have fixed sizes, and the constant-pool,
+/// jump-table and SPACE pseudos carry their size in an immediate operand.
+/// Every other opcode is reported as zero bytes.
+unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  const MachineFunction *ParentFn = MI.getParent()->getParent();
+  const MCAsmInfo *AsmInfo = ParentFn->getTarget().getMCAsmInfo();
+
+  if (unsigned DescSize = MI.getDesc().getSize())
+    return DescSize;
+
+  switch (MI.getOpcode()) {
+  case ARM::INLINEASM:
+    // If this machine instr is an inline asm, measure it.
+    return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *AsmInfo);
+
+  case TargetOpcode::BUNDLE:
+    return getInstBundleLength(MI);
+
+  case ARM::MOVi16_ga_pcrel:
+  case ARM::MOVTi16_ga_pcrel:
+  case ARM::t2MOVi16_ga_pcrel:
+  case ARM::t2MOVTi16_ga_pcrel:
+    return 4;
+
+  case ARM::MOVi32imm:
+  case ARM::t2MOVi32imm:
+    return 8;
+
+  case ARM::CONSTPOOL_ENTRY:
+  case ARM::JUMPTABLE_INSTS:
+  case ARM::JUMPTABLE_ADDRS:
+  case ARM::JUMPTABLE_TBB:
+  case ARM::JUMPTABLE_TBH:
+    // If this machine instr is a constant pool entry, its size is recorded as
+    // operand #2.
+    return MI.getOperand(2).getImm();
+
+  case ARM::Int_eh_sjlj_longjmp:
+    return 16;
+  case ARM::tInt_eh_sjlj_longjmp:
+    return 10;
+  case ARM::tInt_WIN_eh_sjlj_longjmp:
+    return 12;
+
+  case ARM::Int_eh_sjlj_setjmp:
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+    return 20;
+
+  case ARM::tInt_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+    return 12;
+
+  case ARM::SPACE:
+    // The requested size is operand #1.
+    return MI.getOperand(1).getImm();
+
+  default:
+    // pseudo-instruction sizes are zero.
+    return 0;
+  }
+}
+
+/// Sum the sizes of the instructions that follow the bundle header \p MI and
+/// are flagged as being inside a bundle. Scanning stops at the first
+/// instruction outside the bundle or at the end of the block.
+unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr &MI) const {
+  const MachineBasicBlock::const_instr_iterator BlockEnd =
+      MI.getParent()->instr_end();
+  MachineBasicBlock::const_instr_iterator Member = MI.getIterator();
+
+  unsigned TotalBytes = 0;
+  // Step past the header first; it contributes no bytes itself.
+  for (++Member; Member != BlockEnd && Member->isInsideBundle(); ++Member) {
+    assert(!Member->isBundle() && "No nested bundle!");
+    TotalBytes += getInstSizeInBytes(*Member);
+  }
+  return TotalBytes;
+}
+
+/// Insert, before \p I, an MRS-family instruction that reads the status
+/// register into \p DestReg.
+///
+/// The opcode is MRS for ARM mode, t2MRS_M for Thumb on M-class and t2MRS_AR
+/// for other Thumb subtargets. CPSR is added as an implicit use, killed when
+/// \p KillSrc is set. The debug location is taken from \p I.
+void ARMBaseInstrInfo::copyFromCPSR(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned DestReg, bool KillSrc,
+                                    const ARMSubtarget &Subtarget) const {
+  const bool IsMClass = Subtarget.isMClass();
+
+  unsigned ReadOpc = ARM::MRS;
+  if (Subtarget.isThumb())
+    ReadOpc = IsMClass ? ARM::t2MRS_M : ARM::t2MRS_AR;
+
+  MachineInstrBuilder Builder =
+      BuildMI(MBB, I, I->getDebugLoc(), get(ReadOpc), DestReg);
+
+  // The single A/R-class MRS always refers to APSR and takes no selector.
+  // M-class cores have many possible sources, so an immediate is required.
+  if (IsMClass)
+    Builder.addImm(0x800);
+
+  AddDefaultPred(Builder);
+
+  const unsigned CPSRFlags = RegState::Implicit | getKillRegState(KillSrc);
+  Builder.addReg(ARM::CPSR, CPSRFlags);
+}
+
+/// Insert, before \p I, an MSR-family instruction that writes \p SrcReg into
+/// the status register.
+///
+/// The opcode is MSR for ARM mode, t2MSR_M for Thumb on M-class and t2MSR_AR
+/// for other Thumb subtargets. The leading immediate operand is 0x800 on
+/// M-class and 8 otherwise. CPSR is added as an implicit definition. The debug
+/// location is taken from \p I.
+void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned SrcReg, bool KillSrc,
+                                  const ARMSubtarget &Subtarget) const {
+  const bool IsMClass = Subtarget.isMClass();
+
+  unsigned WriteOpc = ARM::MSR;
+  if (Subtarget.isThumb())
+    WriteOpc = IsMClass ? ARM::t2MSR_M : ARM::t2MSR_AR;
+
+  MachineInstrBuilder Builder =
+      BuildMI(MBB, I, I->getDebugLoc(), get(WriteOpc));
+
+  // Operand order matters: selector immediate, source, predicate, then the
+  // implicit CPSR definition.
+  Builder.addImm(IsMClass ? 0x800 : 8);
+  Builder.addReg(SrcReg, getKillRegState(KillSrc));
+  AddDefaultPred(Builder);
+  Builder.addReg(ARM::CPSR, RegState::Implicit | RegState::Define);
+}
+
+/// Emit, before \p I, the instruction(s) that copy physical register
+/// \p SrcReg into \p DestReg.
+///
+/// - GPR to GPR becomes a single MOVr.
+/// - S<->S, S<->GPR, Q<->Q, and D<->D on subtargets that are not
+///   isFPOnlySP(), become a single VMOVS/VMOVRS/VMOVSR/VORRq/VMOVD.
+/// - Register tuples, GPR pairs, and D registers on isFPOnlySP() subtargets
+///   are copied one sub-register at a time.
+/// - Copies from/to CPSR are delegated to copyFromCPSR / copyToCPSR.
+///
+/// \param DL       debug location attached to every instruction emitted
+///                 here. The CPSR helpers do not receive it and take their
+///                 location from \p I instead.
+/// \param KillSrc  whether the copy is the last use of \p SrcReg.
+void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   const DebugLoc &DL, unsigned DestReg,
+                                   unsigned SrcReg, bool KillSrc) const {
+  bool GPRDest = ARM::GPRRegClass.contains(DestReg);
+  bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
+
+  if (GPRDest && GPRSrc) {
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
+                                    .addReg(SrcReg, getKillRegState(KillSrc))));
+    return;
+  }
+
+  bool SPRDest = ARM::SPRRegClass.contains(DestReg);
+  bool SPRSrc = ARM::SPRRegClass.contains(SrcReg);
+
+  // First try the copies that need exactly one instruction.
+  unsigned Opc = 0;
+  if (SPRDest && SPRSrc)
+    Opc = ARM::VMOVS;
+  else if (GPRDest && SPRSrc)
+    Opc = ARM::VMOVRS;
+  else if (SPRDest && GPRSrc)
+    Opc = ARM::VMOVSR;
+  else if (ARM::DPRRegClass.contains(DestReg, SrcReg) &&
+           !Subtarget.isFPOnlySP())
+    Opc = ARM::VMOVD;
+  else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VORRq;
+
+  if (Opc) {
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+    // VORR takes two source operands; a copy is "Src OR Src".
+    if (Opc == ARM::VORRq)
+      MIB.addReg(SrcReg, getKillRegState(KillSrc));
+    AddDefaultPred(MIB);
+    return;
+  }
+
+  // Handle register classes that require multiple instructions.
+  unsigned BeginIdx = 0; // first sub-register index to copy
+  unsigned SubRegs = 0;  // number of sub-registers to copy
+  int Spacing = 1;       // step between consecutive sub-register indices
+
+  // Use VORRq when possible.
+  if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 2;
+  } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 4;
+  // Fall back to VMOVD.
+  } else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+  } else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+  } else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+  } else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = Subtarget.isThumb2() ? ARM::tMOVr : ARM::MOVr;
+    BeginIdx = ARM::gsub_0;
+    SubRegs = 2;
+  } else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+    Spacing = 2;
+  } else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+    Spacing = 2;
+  } else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+    Spacing = 2;
+  } else if (ARM::DPRRegClass.contains(DestReg, SrcReg) &&
+             Subtarget.isFPOnlySP()) {
+    // No VMOVD on this subtarget: copy the two S halves instead.
+    Opc = ARM::VMOVS;
+    BeginIdx = ARM::ssub_0;
+    SubRegs = 2;
+  } else if (SrcReg == ARM::CPSR) {
+    copyFromCPSR(MBB, I, DestReg, KillSrc, Subtarget);
+    return;
+  } else if (DestReg == ARM::CPSR) {
+    copyToCPSR(MBB, I, SrcReg, KillSrc, Subtarget);
+    return;
+  }
+
+  assert(Opc && "Impossible reg-to-reg copy");
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineInstrBuilder Mov;
+
+  // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
+  if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+    BeginIdx = BeginIdx + ((SubRegs - 1) * Spacing);
+    Spacing = -Spacing;
+  }
+#ifndef NDEBUG
+  SmallSet<unsigned, 4> DstRegs;
+#endif
+  for (unsigned i = 0; i != SubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
+    assert(Dst && Src && "Bad sub-register");
+#ifndef NDEBUG
+    // A source must never have been overwritten by an earlier partial copy.
+    assert(!DstRegs.count(Src) && "destructive vector copy");
+    DstRegs.insert(Dst);
+#endif
+    // Use the caller-supplied DL, as every single-instruction path above
+    // does. Reading the location from *I would dereference the insertion
+    // point, which is not valid when I is MBB.end().
+    Mov = BuildMI(MBB, I, DL, get(Opc), Dst).addReg(Src);
+    // VORR takes two source operands.
+    if (Opc == ARM::VORRq)
+      Mov.addReg(Src);
+    Mov = AddDefaultPred(Mov);
+    // MOVr can set CC.
+    if (Opc == ARM::MOVr)
+      Mov = AddDefaultCC(Mov);
+  }
+  // Add implicit super-register defs and kills to the last instruction.
+  Mov->addRegisterDefined(DestReg, TRI);
+  if (KillSrc)
+    Mov->addRegisterKilled(SrcReg, TRI);
+}
+
+/// Append register \p Reg, narrowed by sub-register index \p SubIdx, to
+/// \p MIB with flags \p State.
+///
+/// For a physical register with a non-zero \p SubIdx, the concrete
+/// sub-register is looked up through \p TRI and added directly. In every
+/// other case \p Reg is added with \p SubIdx as its sub-register index (zero
+/// meaning none).
+///
+/// \returns the builder, to allow chaining.
+const MachineInstrBuilder &
+ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
+                          unsigned SubIdx, unsigned State,
+                          const TargetRegisterInfo *TRI) const {
+  const bool ResolveNow =
+      SubIdx != 0 && TargetRegisterInfo::isPhysicalRegister(Reg);
+  if (ResolveNow)
+    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+
+  // Either there is no sub-register (SubIdx == 0) or Reg is not physical and
+  // keeps the index on the operand.
+  return MIB.addReg(Reg, State, SubIdx);
+}
+
+/// Insert, before \p I, the instruction that stores \p SrcReg to the stack
+/// slot with frame index \p FI.
+///
+/// The store opcode is selected from the byte size of register class \p RC
+/// and, within a size, from which known ARM register class \p RC is a
+/// sub-class of. Unsupported sizes or classes hit llvm_unreachable.
+///
+/// \param isKill whether the stored register is killed by the store.
+/// \param TRI    used by AddDReg to resolve sub-registers of physical
+///               registers.
+void ARMBaseInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ // Only read a debug location when I points at a real instruction.
+ DebugLoc DL;
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned Align = MFI.getObjectAlignment(FI);
+
+ // Memory operand describing a store covering the whole stack object.
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), Align);
+
+ switch (RC->getSize()) {
+ case 4:
+ // 32-bit: core register (STRi12) or single-precision register (VSTRS).
+ if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 8:
+ // 64-bit: D register (VSTRD) or a GPR pair (STRD, or STMIA before v5TE).
+ if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+ if (Subtarget.hasV5TEOps()) {
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STRD));
+ // The kill flag is placed on the first half only.
+ AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+ AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+ MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
+
+ AddDefaultPred(MIB);
+ } else {
+ // Fallback to STM instruction, which has existed since the dawn of
+ // time.
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA))
+ .addFrameIndex(FI).addMemOperand(MMO));
+ AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+ AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 16:
+ // 128-bit: D-register pair.
+ if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+ // Use aligned spills if the stack can be realigned.
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64))
+ .addFrameIndex(FI).addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO));
+ } else {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO));
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 24:
+ // 192-bit: D-register triple.
+ if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+ // Use aligned spills if the stack can be realigned.
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
+ .addFrameIndex(FI).addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO));
+ } else {
+ // Otherwise store the three D sub-registers with one VSTMDIA.
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI))
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+ AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 32:
+ // 256-bit: Q-register pair or D-register quad.
+ if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ // FIXME: It's possible to only store part of the QQ register if the
+ // spilled def has a sub-register index.
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+ .addFrameIndex(FI).addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO));
+ } else {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI))
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+ AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 64:
+ // 512-bit: Q-register quad, always stored as eight D sub-registers.
+ if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI))
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
+ AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ default:
+ llvm_unreachable("Unknown reg class!");
+ }
+}
+
+/// Recognize a direct store of a register to a stack slot.
+///
+/// Only a fixed set of store opcodes is considered, and only when the address
+/// operand is a bare frame index: zero immediate offset, no offset register,
+/// and for the NEON forms no sub-register index on the stored operand.
+///
+/// \param [out] FrameIndex set to the frame index on a match; left untouched
+///                         otherwise.
+/// \returns the stored register on a match, 0 otherwise.
+unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                              int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  case ARM::STRrs:
+  case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
+    // Register-offset form: offset register must be absent, immediate zero.
+    if (!MI.getOperand(1).isFI() || !MI.getOperand(2).isReg() ||
+        !MI.getOperand(3).isImm() || MI.getOperand(2).getReg() != 0 ||
+        MI.getOperand(3).getImm() != 0)
+      return 0;
+    FrameIndex = MI.getOperand(1).getIndex();
+    return MI.getOperand(0).getReg();
+
+  case ARM::STRi12:
+  case ARM::t2STRi12:
+  case ARM::tSTRspi:
+  case ARM::VSTRD:
+  case ARM::VSTRS:
+    // Immediate-offset form: the immediate must be zero.
+    if (!MI.getOperand(1).isFI() || !MI.getOperand(2).isImm() ||
+        MI.getOperand(2).getImm() != 0)
+      return 0;
+    FrameIndex = MI.getOperand(1).getIndex();
+    return MI.getOperand(0).getReg();
+
+  case ARM::VST1q64:
+  case ARM::VST1d64TPseudo:
+  case ARM::VST1d64QPseudo:
+    // Address comes first here; the stored register is operand 2.
+    if (!MI.getOperand(0).isFI() || MI.getOperand(2).getSubReg() != 0)
+      return 0;
+    FrameIndex = MI.getOperand(0).getIndex();
+    return MI.getOperand(2).getReg();
+
+  case ARM::VSTMQIA:
+    if (!MI.getOperand(1).isFI() || MI.getOperand(0).getSubReg() != 0)
+      return 0;
+    FrameIndex = MI.getOperand(1).getIndex();
+    return MI.getOperand(0).getReg();
+
+  default:
+    return 0;
+  }
+}
+
+/// Return 1 when \p MI may store and hasStoreToStackSlot finds a stack-slot
+/// store on it (which also fills in \p FrameIndex), 0 otherwise.
+///
+/// Note that the result is a 0/1 flag, not a register number.
+unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
+                                                    int &FrameIndex) const {
+  if (!MI.mayStore())
+    return 0;
+
+  // The memory operand is required by the helper but not needed here.
+  const MachineMemOperand *IgnoredMMO;
+  return hasStoreToStackSlot(MI, IgnoredMMO, FrameIndex) ? 1 : 0;
+}
+
+/// Insert, before \p I, the instruction that reloads \p DestReg from the
+/// stack slot with frame index \p FI.
+///
+/// The load opcode is selected from the byte size of register class \p RC
+/// and, within a size, from which known ARM register class \p RC is a
+/// sub-class of. Unsupported sizes or classes hit llvm_unreachable.
+///
+/// When the load is built from individual sub-registers and \p DestReg is a
+/// physical register, an implicit definition of the full \p DestReg is added
+/// as well.
+///
+/// \param TRI used by AddDReg to resolve sub-registers of physical registers.
+void ARMBaseInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ // Only read a debug location when I points at a real instruction.
+ DebugLoc DL;
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned Align = MFI.getObjectAlignment(FI);
+ // Memory operand describing a load covering the whole stack object.
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), Align);
+
+ switch (RC->getSize()) {
+ case 4:
+ // 32-bit: core register (LDRi12) or single-precision register (VLDRS).
+ if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+
+ } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 8:
+ // 64-bit: D register (VLDRD) or a GPR pair (LDRD, or LDMIA before v5TE).
+ if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+ MachineInstrBuilder MIB;
+
+ if (Subtarget.hasV5TEOps()) {
+ MIB = BuildMI(MBB, I, DL, get(ARM::LDRD));
+ AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+ AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+ MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
+
+ AddDefaultPred(MIB);
+ } else {
+ // Fallback to LDM instruction, which has existed since the dawn of
+ // time.
+ MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDMIA))
+ .addFrameIndex(FI).addMemOperand(MMO));
+ MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+ }
+
+ // Both halves were defined separately; also mark the whole pair defined.
+ if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ MIB.addReg(DestReg, RegState::ImplicitDefine);
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 16:
+ // 128-bit: D-register pair. Use the aligned form when the slot is
+ // 16-byte aligned and the stack can be realigned.
+ if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
+ .addFrameIndex(FI).addImm(16)
+ .addMemOperand(MMO));
+ } else {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
+ .addFrameIndex(FI)
+ .addMemOperand(MMO));
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 24:
+ // 192-bit: D-register triple.
+ if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
+ .addFrameIndex(FI).addImm(16)
+ .addMemOperand(MMO));
+ } else {
+ // Otherwise load the three D sub-registers with one VLDMDIA.
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO));
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+ if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ MIB.addReg(DestReg, RegState::ImplicitDefine);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 32:
+ // 256-bit: Q-register pair or D-register quad.
+ if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+ .addFrameIndex(FI).addImm(16)
+ .addMemOperand(MMO));
+ } else {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI))
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+ if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ MIB.addReg(DestReg, RegState::ImplicitDefine);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 64:
+ // 512-bit: Q-register quad, always loaded as eight D sub-registers.
+ if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI))
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI);
+ if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ MIB.addReg(DestReg, RegState::ImplicitDefine);
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ default:
+ llvm_unreachable("Unknown regclass!");
+ }
+}
+
+/// Recognize a direct load of a register from a stack slot.
+///
+/// Only a fixed set of load opcodes is considered, and only when the address
+/// operand is a bare frame index: zero immediate offset, no offset register,
+/// and for the NEON forms no sub-register index on the loaded operand.
+///
+/// \param [out] FrameIndex set to the frame index on a match; left untouched
+///                         otherwise.
+/// \returns the loaded register on a match, 0 otherwise.
+unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                               int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  case ARM::LDRrs:
+  case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame.
+    // Register-offset form: offset register must be absent, immediate zero.
+    if (!MI.getOperand(1).isFI() || !MI.getOperand(2).isReg() ||
+        !MI.getOperand(3).isImm() || MI.getOperand(2).getReg() != 0 ||
+        MI.getOperand(3).getImm() != 0)
+      return 0;
+    break;
+
+  case ARM::LDRi12:
+  case ARM::t2LDRi12:
+  case ARM::tLDRspi:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+    // Immediate-offset form: the immediate must be zero.
+    if (!MI.getOperand(1).isFI() || !MI.getOperand(2).isImm() ||
+        MI.getOperand(2).getImm() != 0)
+      return 0;
+    break;
+
+  case ARM::VLD1q64:
+  case ARM::VLD1d64TPseudo:
+  case ARM::VLD1d64QPseudo:
+  case ARM::VLDMQIA:
+    // NEON forms: the destination must be a full register.
+    if (!MI.getOperand(1).isFI() || MI.getOperand(0).getSubReg() != 0)
+      return 0;
+    break;
+
+  default:
+    return 0;
+  }
+
+  // Every accepted form keeps the frame index in operand 1 and the loaded
+  // register in operand 0.
+  FrameIndex = MI.getOperand(1).getIndex();
+  return MI.getOperand(0).getReg();
+}
+
+/// Return 1 when \p MI may load and hasLoadFromStackSlot finds a stack-slot
+/// load on it (which also fills in \p FrameIndex), 0 otherwise.
+///
+/// Note that the result is a 0/1 flag, not a register number.
+unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+                                                     int &FrameIndex) const {
+  if (!MI.mayLoad())
+    return 0;
+
+  // The memory operand is required by the helper but not needed here.
+  const MachineMemOperand *IgnoredMMO;
+  return hasLoadFromStackSlot(MI, IgnoredMMO, FrameIndex) ? 1 : 0;
+}
+
+/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD
+/// depending on whether the result is used.
+///
+/// The load-multiple is inserted first and the store-multiple second, both in
+/// front of \p MI, and \p MI is then erased from its block. On Thumb1-only
+/// subtargets the writeback (_UPD) forms are always used.
+///
+/// NOTE(review): from the way they are used below, operands 0/1 look like the
+/// updated destination/source pointers and operands 2/3 like the incoming
+/// destination/source pointers; operand 4 is not touched here. Confirm
+/// against the MEMCPY pseudo definition.
+void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+ bool isThumb1 = Subtarget.isThumb1Only();
+ bool isThumb2 = Subtarget.isThumb2();
+ const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo();
+
+ DebugLoc dl = MI->getDebugLoc();
+ MachineBasicBlock *BB = MI->getParent();
+
+ MachineInstrBuilder LDM, STM;
+ // Load side: use the writeback form when operand 1 is live (or on Thumb1),
+ // in which case operand 1 is added first.
+ if (isThumb1 || !MI->getOperand(1).isDead()) {
+ LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
+ : isThumb1 ? ARM::tLDMIA_UPD
+ : ARM::LDMIA_UPD))
+ .addOperand(MI->getOperand(1));
+ } else {
+ LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
+ }
+
+ // Store side: same decision, driven by operand 0.
+ if (isThumb1 || !MI->getOperand(0).isDead()) {
+ STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
+ : isThumb1 ? ARM::tSTMIA_UPD
+ : ARM::STMIA_UPD))
+ .addOperand(MI->getOperand(0));
+ } else {
+ STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
+ }
+
+ // Base address operand followed by the default predicate.
+ AddDefaultPred(LDM.addOperand(MI->getOperand(3)));
+ AddDefaultPred(STM.addOperand(MI->getOperand(2)));
+
+ // Sort the scratch registers into ascending order.
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ llvm::SmallVector<unsigned, 6> ScratchRegs;
+ // Operands from index 5 onward are the scratch registers.
+ for(unsigned I = 5; I < MI->getNumOperands(); ++I)
+ ScratchRegs.push_back(MI->getOperand(I).getReg());
+ // Ordering is by hardware encoding value, not by LLVM register number.
+ std::sort(ScratchRegs.begin(), ScratchRegs.end(),
+ [&TRI](const unsigned &Reg1,
+ const unsigned &Reg2) -> bool {
+ return TRI.getEncodingValue(Reg1) <
+ TRI.getEncodingValue(Reg2);
+ });
+
+ // Each scratch register is defined by the load and killed by the store.
+ for (const auto &Reg : ScratchRegs) {
+ LDM.addReg(Reg, RegState::Define);
+ STM.addReg(Reg, RegState::Kill);
+ }
+
+ BB->erase(MI);
+}
+
+
+/// Expand the pseudo instructions this target handles after register
+/// allocation.
+///
+/// Three cases are handled:
+///  - LOAD_STACK_GUARD is expanded via expandLoadStackGuard and erased;
+///  - MEMCPY is expanded via expandMEMCPY;
+///  - a COPY between S registers that are both the ssub_0 half of a D
+///    register is rewritten in place into a VMOVD of the enclosing D
+///    registers, provided widening is enabled and legal.
+///
+/// \returns true if \p MI was expanded or rewritten, false if it was left
+/// alone.
+bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
+ assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() &&
+ "LOAD_STACK_GUARD currently supported only for MachO.");
+ expandLoadStackGuard(MI);
+ MI.getParent()->erase(MI);
+ return true;
+ }
+
+ if (MI.getOpcode() == ARM::MEMCPY) {
+ expandMEMCPY(MI);
+ return true;
+ }
+
+ // This hook gets to expand COPY instructions before they become
+ // copyPhysReg() calls. Look for VMOVS instructions that can legally be
+ // widened to VMOVD. We prefer the VMOVD when possible because it may be
+ // changed into a VORR that can go down the NEON pipeline.
+ if (!MI.isCopy() || Subtarget.dontWidenVMOVS() || Subtarget.isFPOnlySP())
+ return false;
+
+ // Look for a copy between even S-registers. That is where we keep floats
+ // when using NEON v2f32 instructions for f32 arithmetic.
+ unsigned DstRegS = MI.getOperand(0).getReg();
+ unsigned SrcRegS = MI.getOperand(1).getReg();
+ if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
+ return false;
+
+ // Find the D registers whose ssub_0 half is the S register in question;
+ // the result is 0 when there is none.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ unsigned DstRegD = TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0,
+ &ARM::DPRRegClass);
+ unsigned SrcRegD = TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0,
+ &ARM::DPRRegClass);
+ if (!DstRegD || !SrcRegD)
+ return false;
+
+ // We want to widen this into a DstRegD = VMOVD SrcRegD copy. This is only
+ // legal if the COPY already defines the full DstRegD, and it isn't a
+ // sub-register insertion.
+ if (!MI.definesRegister(DstRegD, TRI) || MI.readsRegister(DstRegD, TRI))
+ return false;
+
+ // A dead copy shouldn't show up here, but reject it just in case.
+ if (MI.getOperand(0).isDead())
+ return false;
+
+ // All clear, widen the COPY.
+ DEBUG(dbgs() << "widening: " << MI);
+ // Builder wrapping the existing instruction so operands can be appended.
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+
+ // Get rid of the old <imp-def> of DstRegD. Leave it if it defines a Q-reg
+ // or some other super-register.
+ int ImpDefIdx = MI.findRegisterDefOperandIdx(DstRegD);
+ if (ImpDefIdx != -1)
+ MI.RemoveOperand(ImpDefIdx);
+
+ // Change the opcode and operands.
+ MI.setDesc(get(ARM::VMOVD));
+ MI.getOperand(0).setReg(DstRegD);
+ MI.getOperand(1).setReg(SrcRegD);
+ AddDefaultPred(MIB);
+
+ // We are now reading SrcRegD instead of SrcRegS. This may upset the
+ // register scavenger and machine verifier, so we need to indicate that we
+ // are reading an undefined value from SrcRegD, but a proper value from
+ // SrcRegS.
+ MI.getOperand(1).setIsUndef();
+ MIB.addReg(SrcRegS, RegState::Implicit);
+
+ // SrcRegD may actually contain an unrelated value in the ssub_1
+ // sub-register. Don't kill it. Only kill the ssub_0 sub-register.
+ if (MI.getOperand(1).isKill()) {
+ MI.getOperand(1).setIsKill(false);
+ MI.addRegisterKilled(SrcRegS, TRI, true);
+ }
+
+ DEBUG(dbgs() << "replaced by: " << MI);
+ return true;
+}
+
+/// Clone the ARM constant-pool value at index \p CPI under a freshly
+/// allocated PIC label.
+///
+/// \param [in,out] CPI on entry the index of the entry to clone; on return
+///                     the index of the clone in the function's constant pool.
+/// \returns the new PIC label id.
+static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
+  MachineConstantPool *Pool = MF.getConstantPool();
+  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
+
+  const MachineConstantPoolEntry &Entry = Pool->getConstants()[CPI];
+  assert(Entry.isMachineConstantPoolEntry() &&
+         "Expecting a machine constantpool entry!");
+  ARMConstantPoolValue *OldCPV =
+      static_cast<ARMConstantPoolValue*>(Entry.Val.MachineCPVal);
+
+  const unsigned LabelId = FuncInfo->createPICLabelUId();
+
+  // FIXME: The below assumes PIC relocation model and that the function
+  // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and
+  // zero for non-PIC in ARM or Thumb. The callers are all of thumb LDR
+  // instructions, so that's probably OK, but is PIC always correct when
+  // we get here?
+  auto CloneWithLabel = [&]() -> ARMConstantPoolValue * {
+    if (OldCPV->isGlobalValue())
+      return ARMConstantPoolConstant::Create(
+          cast<ARMConstantPoolConstant>(OldCPV)->getGV(), LabelId,
+          ARMCP::CPValue, 4, OldCPV->getModifier(),
+          OldCPV->mustAddCurrentAddress());
+
+    if (OldCPV->isExtSymbol())
+      return ARMConstantPoolSymbol::Create(
+          MF.getFunction()->getContext(),
+          cast<ARMConstantPoolSymbol>(OldCPV)->getSymbol(), LabelId, 4);
+
+    if (OldCPV->isBlockAddress())
+      return ARMConstantPoolConstant::Create(
+          cast<ARMConstantPoolConstant>(OldCPV)->getBlockAddress(), LabelId,
+          ARMCP::CPBlockAddress, 4);
+
+    if (OldCPV->isLSDA())
+      return ARMConstantPoolConstant::Create(MF.getFunction(), LabelId,
+                                             ARMCP::CPLSDA, 4);
+
+    if (OldCPV->isMachineBasicBlock())
+      return ARMConstantPoolMBB::Create(
+          MF.getFunction()->getContext(),
+          cast<ARMConstantPoolMBB>(OldCPV)->getMBB(), LabelId, 4);
+
+    llvm_unreachable("Unexpected ARM constantpool value type!!");
+  };
+
+  // The clone keeps the alignment of the entry it was made from.
+  ARMConstantPoolValue *NewCPV = CloneWithLabel();
+  CPI = Pool->getConstantPoolIndex(NewCPV, Entry.getAlignment());
+  return LabelId;
+}
+
+/// Re-create \p Orig before \p I so that it defines \p DestReg (with
+/// sub-register index \p SubIdx).
+///
+/// tLDRpci_pic / t2LDRpci_pic are rebuilt against a duplicated constant-pool
+/// entry with a fresh PIC label, keeping the original's memory operands.
+/// Every other opcode is cloned and has its operand-0 register substituted.
+void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     unsigned DestReg, unsigned SubIdx,
+                                     const MachineInstr &Orig,
+                                     const TargetRegisterInfo &TRI) const {
+  const unsigned Opc = Orig.getOpcode();
+  MachineFunction &MF = *MBB.getParent();
+
+  const bool IsPICLiteralLoad =
+      Opc == ARM::tLDRpci_pic || Opc == ARM::t2LDRpci_pic;
+  if (!IsPICLiteralLoad) {
+    // Generic path: clone and retarget the definition.
+    MachineInstr *Clone = MF.CloneMachineInstr(&Orig);
+    Clone->substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx,
+                              TRI);
+    MBB.insert(I, Clone);
+    return;
+  }
+
+  // PIC literal load: give the new instruction its own constant-pool entry
+  // and label.
+  unsigned PoolIdx = Orig.getOperand(1).getIndex();
+  const unsigned LabelId = duplicateCPV(MF, PoolIdx);
+  MachineInstrBuilder Rebuilt =
+      BuildMI(MBB, I, Orig.getDebugLoc(), get(Opc), DestReg)
+          .addConstantPoolIndex(PoolIdx)
+          .addImm(LabelId);
+  Rebuilt->setMemRefs(Orig.memoperands_begin(), Orig.memoperands_end());
+}
+
+/// Duplicate \p Orig through TargetInstrInfo::duplicate and return the copy.
+///
+/// For tLDRpci_pic / t2LDRpci_pic the constant-pool entry is duplicated too.
+/// Note that it is \p Orig, not the returned copy, whose operands are
+/// repointed at the new entry and label; the copy keeps the old ones.
+MachineInstr *ARMBaseInstrInfo::duplicate(MachineInstr &Orig,
+                                          MachineFunction &MF) const {
+  MachineInstr *Copy = TargetInstrInfo::duplicate(Orig, MF);
+
+  const unsigned Opc = Orig.getOpcode();
+  if (Opc == ARM::tLDRpci_pic || Opc == ARM::t2LDRpci_pic) {
+    unsigned PoolIdx = Orig.getOperand(1).getIndex();
+    const unsigned LabelId = duplicateCPV(MF, PoolIdx);
+    Orig.getOperand(1).setIndex(PoolIdx);
+    Orig.getOperand(2).setImm(LabelId);
+  }
+
+  return Copy;
+}
+
+/// Decide whether \p MI0 and \p MI1 compute the same value.
+///
+/// Three families of opcodes get special treatment; everything else falls
+/// back to MachineInstr::isIdenticalTo ignoring virtual-register definitions.
+///  - Constant-pool loads compare the offset and the pool entries.
+///  - Global-address pc-relative pseudos compare the offset and the global,
+///    ignoring the PC labels.
+///  - PICLDR compares the address registers (following their definitions when
+///    \p MRI is available and both are virtual) and the operands from index 3
+///    onward.
+///
+/// \param MRI may be null, in which case differing PICLDR address registers
+///            are treated as different values.
+bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0,
+                                        const MachineInstr &MI1,
+                                        const MachineRegisterInfo *MRI) const {
+  const unsigned Opc = MI0.getOpcode();
+
+  // Classify the opcode once rather than repeating the opcode lists.
+  enum { VK_Generic, VK_ConstPoolLoad, VK_GlobalAddr, VK_PICLoad } Kind;
+  switch (Opc) {
+  case ARM::t2LDRpci:
+  case ARM::t2LDRpci_pic:
+  case ARM::tLDRpci:
+  case ARM::tLDRpci_pic:
+    Kind = VK_ConstPoolLoad;
+    break;
+  case ARM::LDRLIT_ga_pcrel:
+  case ARM::LDRLIT_ga_pcrel_ldr:
+  case ARM::tLDRLIT_ga_pcrel:
+  case ARM::MOV_ga_pcrel:
+  case ARM::MOV_ga_pcrel_ldr:
+  case ARM::t2MOV_ga_pcrel:
+    Kind = VK_GlobalAddr;
+    break;
+  case ARM::PICLDR:
+    Kind = VK_PICLoad;
+    break;
+  default:
+    Kind = VK_Generic;
+    break;
+  }
+
+  if (Kind == VK_Generic)
+    return MI0.isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+
+  // Every special case requires the same opcode and operand count.
+  if (MI1.getOpcode() != Opc)
+    return false;
+  if (MI0.getNumOperands() != MI1.getNumOperands())
+    return false;
+
+  if (Kind == VK_PICLoad) {
+    const unsigned Base0 = MI0.getOperand(1).getReg();
+    const unsigned Base1 = MI1.getOperand(1).getReg();
+    if (Base0 != Base1) {
+      const bool CanFollowDefs =
+          MRI && TargetRegisterInfo::isVirtualRegister(Base0) &&
+          TargetRegisterInfo::isVirtualRegister(Base1);
+      if (!CanFollowDefs)
+        return false;
+
+      // This assumes SSA form.
+      MachineInstr *Def0 = MRI->getVRegDef(Base0);
+      MachineInstr *Def1 = MRI->getVRegDef(Base1);
+      // Check if the loaded value, e.g. a constantpool of a global address,
+      // are the same.
+      if (!produceSameValue(*Def0, *Def1, MRI))
+        return false;
+    }
+
+    // %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg
+    for (unsigned Idx = 3, End = MI0.getNumOperands(); Idx != End; ++Idx)
+      if (!MI0.getOperand(Idx).isIdenticalTo(MI1.getOperand(Idx)))
+        return false;
+    return true;
+  }
+
+  // Constant-pool loads and global-address pseudos both key off operand 1.
+  const MachineOperand &Ref0 = MI0.getOperand(1);
+  const MachineOperand &Ref1 = MI1.getOperand(1);
+  if (Ref0.getOffset() != Ref1.getOffset())
+    return false;
+
+  if (Kind == VK_GlobalAddr)
+    // Ignore the PC labels.
+    return Ref0.getGlobal() == Ref1.getGlobal();
+
+  const MachineFunction *MF = MI0.getParent()->getParent();
+  const MachineConstantPool *Pool = MF->getConstantPool();
+  const int Idx0 = Ref0.getIndex();
+  const int Idx1 = Ref1.getIndex();
+  const MachineConstantPoolEntry &Entry0 = Pool->getConstants()[Idx0];
+  const MachineConstantPoolEntry &Entry1 = Pool->getConstants()[Idx1];
+
+  const bool IsTargetCP0 = Entry0.isMachineConstantPoolEntry();
+  const bool IsTargetCP1 = Entry1.isMachineConstantPoolEntry();
+  // A target-specific entry never matches a plain constant.
+  if (IsTargetCP0 != IsTargetCP1)
+    return false;
+
+  if (!IsTargetCP0)
+    return Entry0.Val.ConstVal == Entry1.Val.ConstVal;
+
+  ARMConstantPoolValue *ACPV0 =
+      static_cast<ARMConstantPoolValue*>(Entry0.Val.MachineCPVal);
+  ARMConstantPoolValue *ACPV1 =
+      static_cast<ARMConstantPoolValue*>(Entry1.Val.MachineCPVal);
+  return ACPV0->hasSameValue(ACPV1);
+}
+
+/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+/// determine if two loads are loading from the same base address. It should
+/// only return true if the base pointers are the same and the only difference
+/// between the two addresses is the offset. It also returns the offsets by
+/// reference; Offset1 and Offset2 are written only when the result is true.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1,
+ int64_t &Offset2) const {
+ // Don't worry about Thumb1: just ARM and Thumb2.
+ if (Subtarget.isThumb1Only()) return false;
+
+ if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+ return false;
+
+ switch (Load1->getMachineOpcode()) { // Load1 must be one of the load opcodes listed here.
+ default:
+ return false;
+ case ARM::LDRi12:
+ case ARM::LDRBi12:
+ case ARM::LDRD:
+ case ARM::LDRH:
+ case ARM::LDRSB:
+ case ARM::LDRSH:
+ case ARM::VLDRD:
+ case ARM::VLDRS:
+ case ARM::t2LDRi8:
+ case ARM::t2LDRBi8:
+ case ARM::t2LDRDi8:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRi12:
+ case ARM::t2LDRBi12:
+ case ARM::t2LDRSHi12:
+ break;
+ }
+
+ switch (Load2->getMachineOpcode()) { // Same filter for Load2. NOTE(review): ARM::t2LDRDi8 is accepted for Load1 above but is not listed here -- confirm the asymmetry is intended.
+ default:
+ return false;
+ case ARM::LDRi12:
+ case ARM::LDRBi12:
+ case ARM::LDRD:
+ case ARM::LDRH:
+ case ARM::LDRSB:
+ case ARM::LDRSH:
+ case ARM::VLDRD:
+ case ARM::VLDRS:
+ case ARM::t2LDRi8:
+ case ARM::t2LDRBi8:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRi12:
+ case ARM::t2LDRBi12:
+ case ARM::t2LDRSHi12:
+ break;
+ }
+
+ // Check if base addresses and chain operands match (operands 0 and 4).
+ if (Load1->getOperand(0) != Load2->getOperand(0) ||
+ Load1->getOperand(4) != Load2->getOperand(4))
+ return false;
+
+ // Index should be Reg0; here operand 3 is only required to be the same node.
+ if (Load1->getOperand(3) != Load2->getOperand(3))
+ return false;
+
+ // Determine the offsets: operand 1 of both loads must be a constant.
+ if (isa<ConstantSDNode>(Load1->getOperand(1)) &&
+ isa<ConstantSDNode>(Load2->getOperand(1))) {
+ Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue();
+ Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue();
+ return true;
+ }
+
+ return false;
+}
+
+/// shouldScheduleLoadsNear - Used by the pre-regalloc scheduler, in
+/// conjunction with areLoadsFromSameBasePtr, to decide whether two loads
+/// should be scheduled together. On some targets, loads from addresses in the
+/// same cache line are better scheduled next to each other. Offset1 and
+/// Offset2 are the load offsets from the common base address, and NumLoads is
+/// the number of loads that have already been scheduled after Load1. Returns
+/// true if scheduling the two loads together is considered desirable.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const {
+ // Thumb1 is not handled here: only ARM and Thumb2.
+ if (Subtarget.isThumb1Only())
+ return false;
+
+ assert(Offset2 > Offset1);
+
+ // Give up when the two offsets are too far apart.
+ const int64_t Distance = Offset2 - Offset1;
+ if (Distance / 8 > 64)
+ return false;
+
+ // Loads with different machine opcodes are not treated as sharing a base
+ // address, with one exception: t2LDRBi8 and t2LDRBi12 are different encoding
+ // forms of the same Thumb2 byte load, so that pairing is accepted.
+ const unsigned Opc1 = Load1->getMachineOpcode();
+ const unsigned Opc2 = Load2->getMachineOpcode();
+ if (Opc1 != Opc2) {
+ const bool IsT2ByteLoadPair =
+ (Opc1 == ARM::t2LDRBi8 && Opc2 == ARM::t2LDRBi12) ||
+ (Opc1 == ARM::t2LDRBi12 && Opc2 == ARM::t2LDRBi8);
+ if (!IsT2ByteLoadPair)
+ return false; // FIXME: overly conservative?
+ }
+
+ // Four loads in a row should be sufficient.
+ return NumLoads < 3;
+}
+
+bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+ // Debug values never form a boundary. This has to be stated explicitly
+ // because of the IT handling below: otherwise a dbg_value sitting in front of
+ // an IT would be reported as the hazard, when it should be the real
+ // instruction preceding the dbg_value(s), exactly as without debug info.
+ if (MI.isDebugValue())
+ return false;
+
+ // Nothing can be scheduled across terminators or labels.
+ if (MI.isTerminator() || MI.isPosition())
+ return true;
+
+ // The start of an IT block is a boundary, while t2IT itself is scheduled
+ // together with the instructions that follow it.
+ // FIXME: This is a big hammer. The alternative would be to add every
+ // potential true and anti dependency of the IT block instructions as implicit
+ // operands on t2IT, which does not seem worth the compile time and
+ // complexity.
+ MachineBasicBlock::const_iterator Next = MI;
+ // Step to the following instruction, skipping over any dbg_value.
+ do {
+ ++Next;
+ } while (Next != MBB->end() && Next->isDebugValue());
+ if (Next != MBB->end() && Next->getOpcode() == ARM::t2IT)
+ return true;
+
+ // Do not schedule around a (non-call) instruction that defines the stack
+ // pointer: it is unlikely to be profitable, and it saves compile time since
+ // stack slot references then need no dependency on the modification.
+ // Calls are excluded because they do not actually change SP even when they
+ // carry imp-defs; no ARM calling convention changes the stack pointer (X86
+ // calling conventions sometimes do).
+ return !MI.isCall() && MI.definesRegister(ARM::SP);
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+ unsigned NumCycles, unsigned ExtraPredCycles,
+ BranchProbability Probability) const {
+ if (NumCycles == 0)
+ return false;
+
+ // True when the first predecessor ends in "cmp <low reg>, #0; t2Bcc", i.e. a
+ // branch that the constant island lowering pass can turn into cbn?z.
+ auto PredBranchCanBecomeCBZ = [&MBB]() {
+ MachineBasicBlock *Pred = *MBB.pred_begin();
+ if (Pred->empty())
+ return false;
+ MachineInstr *LastMI = &*Pred->rbegin();
+ if (LastMI->getOpcode() != ARM::t2Bcc)
+ return false;
+ MachineBasicBlock::iterator CmpMI = LastMI;
+ if (CmpMI == Pred->begin())
+ return false;
+ --CmpMI;
+ const unsigned CmpOpc = CmpMI->getOpcode();
+ if (CmpOpc != ARM::tCMPi8 && CmpOpc != ARM::t2CMPri)
+ return false;
+ unsigned Reg = CmpMI->getOperand(0).getReg();
+ unsigned PredReg = 0;
+ ARMCC::CondCodes P = getInstrPredicate(*CmpMI, PredReg);
+ return P == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 &&
+ isARMLowRegister(Reg);
+ };
+
+ // When optimizing for size, keep the branch if it can be lowered to cbn?z:
+ // that gives a shorter instruction sequence than predication.
+ if (MBB.getParent()->getFunction()->optForSize() && PredBranchCanBecomeCBZ())
+ return false;
+
+ // Estimate the cost of branching versus predicating. Every component of the
+ // branching cost is scaled up to avoid precision issues when NumCycles is
+ // scaled by Probability.
+ const unsigned ScalingUpFactor = 1024;
+ unsigned BranchingCost = Probability.scale(NumCycles * ScalingUpFactor);
+ BranchingCost += ScalingUpFactor; // The branch itself
+ BranchingCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+
+ const unsigned PredicatedCost =
+ (NumCycles + ExtraPredCycles) * ScalingUpFactor;
+ return PredicatedCost <= BranchingCost;
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned TCycles, unsigned TExtra,
+ MachineBasicBlock &FMBB,
+ unsigned FCycles, unsigned FExtra,
+ BranchProbability Probability) const {
+ if (TCycles == 0 || FCycles == 0)
+ return false;
+
+ // Estimate the cost of branching versus predicating both sides. Every
+ // component of the branching cost is scaled up to avoid precision issues when
+ // TCycles/FCycles are scaled by Probability.
+ const unsigned ScalingUpFactor = 1024;
+ const unsigned TrueSideCost = Probability.scale(TCycles * ScalingUpFactor);
+ const unsigned FalseSideCost =
+ Probability.getCompl().scale(FCycles * ScalingUpFactor);
+ unsigned BranchingCost = TrueSideCost + FalseSideCost;
+ BranchingCost += 1 * ScalingUpFactor; // The branch itself
+ BranchingCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+
+ const unsigned PredicatedCost =
+ (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor;
+ return PredicatedCost <= BranchingCost;
+}
+
+bool
+ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+ MachineBasicBlock &FMBB) const {
+ // The decision is left to the subtarget; neither block is inspected. The aim
+ // is to reduce false anti-dependencies so that the target's out-of-order
+ // execution engine can do its thing.
+ const bool SubtargetPrefersUnpredicated =
+ Subtarget.isProfitableToUnpredicate();
+ return SubtargetPrefersUnpredicated;
+}
+
+/// getInstrPredicate - Return the predicate condition of MI, or AL when MI
+/// has no predicate operand. The condition code register is returned by
+/// reference in PredReg (0 when there is no predicate operand).
+ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI,
+ unsigned &PredReg) {
+ const int PredIdx = MI.findFirstPredOperandIdx();
+ if (PredIdx != -1) {
+ // The register operand directly follows the condition code immediate.
+ PredReg = MI.getOperand(PredIdx + 1).getReg();
+ return static_cast<ARMCC::CondCodes>(MI.getOperand(PredIdx).getImm());
+ }
+
+ PredReg = 0;
+ return ARMCC::AL;
+}
+
+
+unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) {
+ // Map each unconditional branch opcode to its conditional counterpart.
+ switch (Opc) {
+ case ARM::B:
+ return ARM::Bcc;
+ case ARM::tB:
+ return ARM::tBcc;
+ case ARM::t2B:
+ return ARM::t2Bcc;
+ default:
+ break;
+ }
+
+ llvm_unreachable("Unknown unconditional branch opcode!");
+}
+
+MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ // Everything except MOVCC goes straight to the generic implementation.
+ const unsigned Opc = MI.getOpcode();
+ if (Opc != ARM::MOVCCr && Opc != ARM::t2MOVCCr)
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+
+ // A MOVCC is commuted by swapping its operands and inverting the condition.
+ unsigned PredReg = 0;
+ const ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);
+ // MOVCC AL can't be inverted. Shouldn't happen.
+ if (CC == ARMCC::AL || PredReg != ARM::CPSR)
+ return nullptr;
+
+ MachineInstr *Swapped =
+ TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ if (Swapped) {
+ // The operands are swapped now; invert the condition to match.
+ const int CondIdx = Swapped->findFirstPredOperandIdx();
+ Swapped->getOperand(CondIdx).setImm(ARMCC::getOppositeCondition(CC));
+ }
+ return Swapped;
+}
+
+/// Return the instruction defining Reg if it can be folded into a MOVCC
+/// instruction, or nullptr if there is no such instruction.
+static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII) {
+ // Only a virtual register with exactly one non-debug use is a candidate.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) || !MRI.hasOneNonDBGUse(Reg))
+ return nullptr;
+
+ // Folding works by predicating the def, so it must exist and be predicable.
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ if (!Def || !Def->isPredicable())
+ return nullptr;
+
+ // Look for non-dead defs or physreg uses among the operands after the first.
+ // This also catches predicated instructions, which will be reading CPSR.
+ for (unsigned OpNo = 1, NumOps = Def->getNumOperands(); OpNo != NumOps;
+ ++OpNo) {
+ const MachineOperand &Op = Def->getOperand(OpNo);
+ // Reject frame index operands, PEI can't handle the predicated pseudos.
+ if (Op.isFI() || Op.isCPI() || Op.isJTI())
+ return nullptr;
+ if (!Op.isReg())
+ continue;
+ // Tied operands would conflict with predication; physical registers and
+ // live extra defs are rejected as well.
+ const bool BlocksFolding =
+ Op.isTied() || TargetRegisterInfo::isPhysicalRegister(Op.getReg()) ||
+ (Op.isDef() && !Op.isDead());
+ if (BlocksFolding)
+ return nullptr;
+ }
+
+ bool DontMoveAcrossStores = true;
+ return Def->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores)
+ ? Def
+ : nullptr;
+}
+
+bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr &MI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ unsigned &TrueOp, unsigned &FalseOp,
+ bool &Optimizable) const {
+ assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) &&
+ "Unknown select instruction");
+ // Operand layout of a MOVCC:
+ // 0: Def.
+ // 1: True use.
+ // 2: False use.
+ // 3: Condition code.
+ // 4: CPSR use.
+
+ // We can always fold a def.
+ Optimizable = true;
+
+ // Report the value operands, then the condition code and CPSR operands.
+ TrueOp = 1;
+ FalseOp = 2;
+ const MachineOperand &CondCodeOp = MI.getOperand(3);
+ const MachineOperand &CPSROp = MI.getOperand(4);
+ Cond.push_back(CondCodeOp);
+ Cond.push_back(CPSROp);
+ return false;
+}
+
+MachineInstr * // Returns the new predicated instruction that replaces MI, or nullptr if nothing was folded.
+ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool PreferFalse) const { // PreferFalse is not consulted in this body.
+ assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) &&
+ "Unknown select instruction");
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineInstr *DefMI = canFoldIntoMOVCC(MI.getOperand(2).getReg(), MRI, this); // Try the def of operand 2 first.
+ bool Invert = !DefMI; // Set when operand 2 is not foldable and operand 1 is tried instead.
+ if (!DefMI)
+ DefMI = canFoldIntoMOVCC(MI.getOperand(1).getReg(), MRI, this);
+ if (!DefMI)
+ return nullptr;
+
+ // Find new register class to use.
+ MachineOperand FalseReg = MI.getOperand(Invert ? 2 : 1); // The MOVCC input that is NOT being folded (a copy of the operand).
+ unsigned DestReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+ if (!MRI.constrainRegClass(DestReg, PreviousClass))
+ return nullptr;
+
+ // Create a new predicated version of DefMI, inserted at MI and defining
+ // DestReg. Rfalse is the first use.
+ MachineInstrBuilder NewMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), DefMI->getDesc(), DestReg);
+
+ // Copy all the DefMI operands, excluding its (null) predicate.
+ const MCInstrDesc &DefDesc = DefMI->getDesc();
+ for (unsigned i = 1, e = DefDesc.getNumOperands();
+ i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
+ NewMI.addOperand(DefMI->getOperand(i));
+
+ unsigned CondCode = MI.getOperand(3).getImm(); // Condition code of the MOVCC.
+ if (Invert)
+ NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode)));
+ else
+ NewMI.addImm(CondCode);
+ NewMI.addOperand(MI.getOperand(4)); // The MOVCC's operand 4 follows the condition code.
+
+ // DefMI is not the -S version that sets CPSR, so add an optional %noreg.
+ if (NewMI->hasOptionalDef())
+ AddDefaultCC(NewMI);
+
+ // The output register value when the predicate is false is an implicit
+ // register operand tied to the first def.
+ // The tie makes the register allocator ensure the FalseReg is allocated the
+ // same register as operand 0.
+ FalseReg.setImplicit();
+ NewMI.addOperand(FalseReg);
+ NewMI->tieOperands(0, NewMI->getNumOperands() - 1); // FalseReg was just appended, so it is the last operand.
+
+ // Update SeenMIs set: register newly created MI and erase removed DefMI.
+ SeenMIs.insert(NewMI);
+ SeenMIs.erase(DefMI);
+
+ // If MI is inside a loop, and DefMI is outside the loop, then kill flags on
+ // DefMI would be invalid when transferred inside the loop. Checking for a
+ // loop is expensive, but at least remove kill flags if they are in different
+ // BBs.
+ if (DefMI->getParent() != MI.getParent())
+ NewMI->clearKillInfo();
+
+ // The caller will erase MI, but not DefMI.
+ DefMI->eraseFromParent();
+ return NewMI;
+}
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether the
+/// instruction is encoded with an 'S' bit is determined by the optional CPSR
+/// def operand.
+///
+/// This will go away once we can teach tblgen how to set the optional CPSR def
+/// operand itself.
+struct AddSubFlagsOpcodePair {
+ uint16_t PseudoOpc; // Pseudo opcode; the key convertAddSubFlagsOpcode searches for.
+ uint16_t MachineOpc; // Opcode convertAddSubFlagsOpcode returns for that pseudo.
+};
+
+static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { // Each entry is {PseudoOpc, MachineOpc}.
+ {ARM::ADDSri, ARM::ADDri},
+ {ARM::ADDSrr, ARM::ADDrr},
+ {ARM::ADDSrsi, ARM::ADDrsi},
+ {ARM::ADDSrsr, ARM::ADDrsr},
+
+ {ARM::SUBSri, ARM::SUBri},
+ {ARM::SUBSrr, ARM::SUBrr},
+ {ARM::SUBSrsi, ARM::SUBrsi},
+ {ARM::SUBSrsr, ARM::SUBrsr},
+
+ {ARM::RSBSri, ARM::RSBri},
+ {ARM::RSBSrsi, ARM::RSBrsi},
+ {ARM::RSBSrsr, ARM::RSBrsr},
+
+ {ARM::t2ADDSri, ARM::t2ADDri},
+ {ARM::t2ADDSrr, ARM::t2ADDrr},
+ {ARM::t2ADDSrs, ARM::t2ADDrs},
+
+ {ARM::t2SUBSri, ARM::t2SUBri},
+ {ARM::t2SUBSrr, ARM::t2SUBrr},
+ {ARM::t2SUBSrs, ARM::t2SUBrs},
+
+ {ARM::t2RSBSri, ARM::t2RSBri},
+ {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
+ // Linear search of the pseudo -> machine opcode table; 0 means "no mapping".
+ for (const AddSubFlagsOpcodePair &Entry : AddSubFlagsOpcodeMap)
+ if (OldOpc == Entry.PseudoOpc)
+ return Entry.MachineOpc;
+ return 0;
+}
+
+void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII,
+ unsigned MIFlags) { // Builds MOVr/ADDri/SUBri instructions at MBBI that set DestReg to BaseReg plus NumBytes, predicated on Pred/PredReg.
+ if (NumBytes == 0 && DestReg != BaseReg) { // Zero offset into a different register: a single MOVr is enough.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+ .setMIFlags(MIFlags);
+ return;
+ }
+
+ bool isSub = NumBytes < 0; // Negative amounts are emitted as SUBri of the magnitude.
+ if (isSub) NumBytes = -NumBytes;
+
+ while (NumBytes) { // One ADDri/SUBri per chunk; nothing is emitted when NumBytes is already 0.
+ unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+ unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt); // Keep only the bits under a rotated 8-bit window.
+ assert(ThisVal && "Didn't extract field correctly");
+
+ // We will handle these bits from offset, clear them.
+ NumBytes &= ~ThisVal;
+
+ assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?");
+
+ // Build the new ADD / SUB.
+ unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+ .setMIFlags(MIFlags);
+ BaseReg = DestReg; // Later chunks accumulate on top of the partial result in DestReg.
+ }
+}
+
+bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
+ MachineFunction &MF, MachineInstr *MI,
+ unsigned NumBytes) {
+ // Try to fold an SP update of NumBytes into the push/pop MI by adding extra
+ // registers to its list; returns true if MI was rewritten. This optimisation
+ // potentially adds lots of load and store micro-operations, so it's only
+ // really a great benefit to code-size.
+ if (!MF.getFunction()->optForMinSize())
+ return false;
+
+ // If only one register is pushed/popped, LLVM can use an LDR/STR
+ // instead. We can't modify those so make sure we're dealing with an
+ // instruction we understand.
+ bool IsPop = isPopOpcode(MI->getOpcode());
+ bool IsPush = isPushOpcode(MI->getOpcode());
+ if (!IsPush && !IsPop)
+ return false;
+
+ bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD ||
+ MI->getOpcode() == ARM::VLDMDIA_UPD;
+ bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH ||
+ MI->getOpcode() == ARM::tPOP ||
+ MI->getOpcode() == ARM::tPOP_RET;
+
+ assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP &&
+ MI->getOperand(1).getReg() == ARM::SP)) &&
+ "trying to fold sp update into non-sp-updating push/pop");
+
+ // The VFP push & pop act on D-registers, so we can only fold an adjustment
+ // by a multiple of 8 bytes in correctly. Similarly rN is 4-bytes. Don't try
+ // if this is violated.
+ if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0)
+ return false;
+
+ // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+ // pred) so the list starts at 4. Thumb1 starts after the predicate.
+ int RegListIdx = IsT1PushPop ? 2 : 4;
+
+ // Calculate the space we'll need in terms of registers.
+ unsigned RegsNeeded;
+ const TargetRegisterClass *RegClass;
+ if (IsVFPPushPop) {
+ RegsNeeded = NumBytes / 8;
+ RegClass = &ARM::DPRRegClass;
+ } else {
+ RegsNeeded = NumBytes / 4;
+ RegClass = &ARM::GPRRegClass;
+ }
+
+ // We're going to have to strip all list operands off before
+ // re-adding them since the order matters, so save the existing ones
+ // for later.
+ SmallVector<MachineOperand, 4> RegList;
+
+ // We're also going to need the first register transferred by this
+ // instruction, which won't necessarily be the first register in the list.
+ unsigned FirstRegEnc = -1; // Wraps to the largest unsigned value, so any real encoding compares lower.
+
+ const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();
+ for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i) { // Walk the list backwards, so RegList holds it in reverse order.
+ MachineOperand &MO = MI->getOperand(i);
+ RegList.push_back(MO);
+
+ if (MO.isReg() && TRI->getEncodingValue(MO.getReg()) < FirstRegEnc)
+ FirstRegEnc = TRI->getEncodingValue(MO.getReg());
+ }
+
+ const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
+ // Now try to find enough space in the reglist to allocate NumBytes, walking
+ // downwards from just below the lowest encoding already in the list.
+ for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded;
+ --CurRegEnc) {
+ unsigned CurReg = RegClass->getRegister(CurRegEnc);
+ if (!IsPop) {
+ // Pushing any register is completely harmless, mark the
+ // register involved as undef since we don't care about it in
+ // the slightest.
+ RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
+ false, false, true));
+ --RegsNeeded;
+ continue;
+ }
+
+ // However, we can only pop an extra register if it's not live. For
+ // registers live within the function we might clobber a return value
+ // register; the other way a register can be live here is if it's
+ // callee-saved.
+ if (isCalleeSavedRegister(CurReg, CSRegs) ||
+ MI->getParent()->computeRegisterLiveness(TRI, CurReg, MI) !=
+ MachineBasicBlock::LQR_Dead) {
+ // VFP pops don't allow holes in the register list, so any skip is fatal
+ // for our transformation. GPR pops do, so we should just keep looking.
+ if (IsVFPPushPop)
+ return false;
+ else
+ continue;
+ }
+
+ // Mark the unimportant registers as <def,dead> in the POP.
+ RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, false,
+ true));
+ --RegsNeeded;
+ }
+
+ if (RegsNeeded > 0) // Ran out of candidate registers; MI has not been modified at this point.
+ return false;
+
+ // Finally we know we can profitably perform the optimisation so go
+ // ahead: strip all existing registers off and add them back again
+ // in the right order.
+ for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+ MI->RemoveOperand(i);
+
+ // Add the complete list back in (RegList is reversed, so iterate backwards).
+ MachineInstrBuilder MIB(MF, &*MI);
+ for (int i = RegList.size() - 1; i >= 0; --i)
+ MIB.addOperand(RegList[i]);
+
+ return true;
+}
+
+bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII) { // Folds Offset (plus the offset already encoded in MI) into MI's immediate; returns true iff nothing is left over, otherwise Offset holds the signed remainder.
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MI.getDesc();
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ bool isSub = false;
+
+ // Memory operands in inline assembly always use AddrMode2.
+ if (Opcode == ARM::INLINEASM)
+ AddrMode = ARMII::AddrMode2;
+
+ if (Opcode == ARM::ADDri) {
+ Offset += MI.getOperand(FrameRegIdx+1).getImm();
+ if (Offset == 0) {
+ // Turn it into a move.
+ MI.setDesc(TII.get(ARM::MOVr));
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.RemoveOperand(FrameRegIdx+1);
+ Offset = 0;
+ return true;
+ } else if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ MI.setDesc(TII.get(ARM::SUBri));
+ }
+
+ // Common case: small offset, fits into instruction.
+ if (ARM_AM::getSOImmVal(Offset) != -1) {
+ // Replace the FrameIndex with sp / fp
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, pull as much of the immediate into this ADDri/SUBri
+ // as possible.
+ unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+ unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+
+ // We will handle these bits from offset, clear them.
+ Offset &= ~ThisImmVal;
+
+ // Get the properly encoded SOImmVal field.
+ assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 &&
+ "Bit extraction didn't work?");
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+ } else {
+ unsigned ImmIdx = 0; // Index of the operand holding the encoded offset.
+ int InstrOffs = 0; // Signed offset currently encoded in MI, in units of Scale.
+ unsigned NumBits = 0; // Width of the offset field for this addressing mode.
+ unsigned Scale = 1; // Byte multiplier applied to the encoded offset.
+ switch (AddrMode) {
+ case ARMII::AddrMode_i12: {
+ ImmIdx = FrameRegIdx + 1;
+ InstrOffs = MI.getOperand(ImmIdx).getImm();
+ NumBits = 12;
+ break;
+ }
+ case ARMII::AddrMode2: {
+ ImmIdx = FrameRegIdx+2;
+ InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 12;
+ break;
+ }
+ case ARMII::AddrMode3: {
+ ImmIdx = FrameRegIdx+2;
+ InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ break;
+ }
+ case ARMII::AddrMode4:
+ case ARMII::AddrMode6:
+ // Can't fold any offset even if it's zero.
+ return false;
+ case ARMII::AddrMode5: {
+ ImmIdx = FrameRegIdx+1;
+ InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 4;
+ break;
+ }
+ default:
+ llvm_unreachable("Unsupported addressing mode!");
+ }
+
+ Offset += InstrOffs * Scale;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ }
+
+ // Attempt to fold address comp. if opcode has offset bits
+ if (NumBits > 0) {
+ // Common case: small offset, fits into instruction.
+ MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
+ if ((unsigned)Offset <= Mask * Scale) {
+ // Replace the FrameIndex with sp
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ // FIXME: When addrmode2 goes away, this will simplify (like the
+ // T2 version), as the LDR.i12 versions don't need the encoding
+ // tricks for the offset value.
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode_i12)
+ ImmedOffset = -ImmedOffset;
+ else
+ ImmedOffset |= 1 << NumBits; // Other modes mark subtraction with the bit just above the offset field.
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+ ImmedOffset = ImmedOffset & Mask;
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode_i12)
+ ImmedOffset = -ImmedOffset;
+ else
+ ImmedOffset |= 1 << NumBits;
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset &= ~(Mask*Scale);
+ }
+ }
+
+ Offset = (isSub) ? -Offset : Offset; // Give the leftover offset its original sign back.
+ return Offset == 0;
+}
+
+/// analyzeCompare - For a recognised comparison instruction, report its first
+/// source register in SrcReg, its second source register in SrcReg2 (0 when
+/// the second operand is an immediate), the mask in CmpMask and the value
+/// compared against in CmpValue. Returns true if the instruction could be
+/// analyzed; otherwise returns false and leaves the outputs untouched.
+bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const {
+ switch (MI.getOpcode()) {
+ case ARM::CMPri:
+ case ARM::t2CMPri:
+ case ARM::tCMPi8:
+ // Register against immediate: full mask, the immediate is the value.
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI.getOperand(1).getImm();
+ break;
+ case ARM::CMPrr:
+ case ARM::t2CMPrr:
+ // Register against register: full mask, no immediate value.
+ SrcReg2 = MI.getOperand(1).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ break;
+ case ARM::TSTri:
+ case ARM::t2TSTri:
+ // Test against immediate: the immediate is reported as the mask.
+ SrcReg2 = 0;
+ CmpMask = MI.getOperand(1).getImm();
+ CmpValue = 0;
+ break;
+ default:
+ return false;
+ }
+
+ // Every recognised form has its first source register in operand 0.
+ SrcReg = MI.getOperand(0).getReg();
+ return true;
+}
+
+/// isSuitableForMask - Return true if MI is an 'and' with immediate (ANDri or
+/// t2ANDri) whose mask equals CmpMask, i.e. the mask of a 'tst' instruction,
+/// and which involves SrcReg: as its operand 1 when CommonUse is set, as its
+/// operand 0 otherwise.
+static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
+ int CmpMask, bool CommonUse) {
+ const unsigned Opc = MI->getOpcode();
+ if (Opc != ARM::ANDri && Opc != ARM::t2ANDri)
+ return false;
+
+ // The 'and' has to apply exactly the mask of the compare.
+ if (CmpMask != MI->getOperand(2).getImm())
+ return false;
+
+ const unsigned RegOpIdx = CommonUse ? 1 : 0;
+ return SrcReg == MI->getOperand(RegOpIdx).getReg();
+}
+
+/// getSwappedCondition - Assuming the flags are set by MI(a,b), return the
+/// condition code to use once the instructions are modified so that the flags
+/// are set by MI(b,a) instead. Conditions without a swapped form yield AL.
+inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
+ switch (CC) {
+ // Equality tests do not depend on the operand order.
+ case ARMCC::EQ:
+ case ARMCC::NE:
+ return CC;
+ // Unsigned orderings.
+ case ARMCC::HS:
+ return ARMCC::LS;
+ case ARMCC::LS:
+ return ARMCC::HS;
+ case ARMCC::LO:
+ return ARMCC::HI;
+ case ARMCC::HI:
+ return ARMCC::LO;
+ // Signed orderings.
+ case ARMCC::GE:
+ return ARMCC::LE;
+ case ARMCC::LE:
+ return ARMCC::GE;
+ case ARMCC::LT:
+ return ARMCC::GT;
+ case ARMCC::GT:
+ return ARMCC::LT;
+ default:
+ break;
+ }
+ return ARMCC::AL;
+}
+
+/// isRedundantFlagInstr - Check whether CmpI, an instruction whose only
+/// purpose is to update flags, can be made redundant by OI.
+/// CMPrr can be made redundant by SUBrr if the operands are the same (in
+/// either order).
+/// CMPri can be made redundant by SUBri if the operands are the same.
+/// This function can be extended later on.
+inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
+ unsigned SrcReg2, int ImmValue,
+ MachineInstr *OI) {
+ const unsigned CmpOpc = CmpI->getOpcode();
+ const unsigned OtherOpc = OI->getOpcode();
+
+ // Register-register compare against a register-register subtract.
+ const bool CmpIsRegReg = CmpOpc == ARM::CMPrr || CmpOpc == ARM::t2CMPrr;
+ const bool SubIsRegReg = OtherOpc == ARM::SUBrr || OtherOpc == ARM::t2SUBrr;
+ if (CmpIsRegReg && SubIsRegReg) {
+ const unsigned SubLHS = OI->getOperand(1).getReg();
+ if ((SubLHS == SrcReg && OI->getOperand(2).getReg() == SrcReg2) ||
+ (SubLHS == SrcReg2 && OI->getOperand(2).getReg() == SrcReg))
+ return true;
+ }
+
+ // Register-immediate compare against a register-immediate subtract.
+ const bool CmpIsRegImm = CmpOpc == ARM::CMPri || CmpOpc == ARM::t2CMPri;
+ const bool SubIsRegImm = OtherOpc == ARM::SUBri || OtherOpc == ARM::t2SUBri;
+ return CmpIsRegImm && SubIsRegImm &&
+ OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getImm() == ImmValue;
+}
+
+/// optimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register;
+/// Remove a redundant Compare instruction if an earlier instruction can set the
+/// flags in the same way as Compare.
+/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). We also handle the case where two
+/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
+/// condition code of instructions which use the flags.
+bool ARMBaseInstrInfo::optimizeCompareInstr(
+ MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ int CmpValue, const MachineRegisterInfo *MRI) const {
+ /* Try to remove CmpInstr by making an earlier instruction in the same block
+  * set CPSR instead. The candidate is either the instruction defining SrcReg
+  * (only kept for a single-register compare against zero) or an instruction
+  * accepted by isRedundantFlagInstr (called Sub below). Returns true only
+  * after CmpInstr has been erased; every "return false" path is taken before
+  * any instruction is modified.
+  *
+  * Get the unique definition of SrcReg. */
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI) return false;
+
+ // Masked compares sometimes use the same register as the corresponding 'and'.
+ if (CmpMask != ~0) { // NOTE(review): ~0 appears to mean "no mask" - confirm against the caller.
+ if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(*MI)) {
+ MI = nullptr; // The def is unusable; look for a suitable user of SrcReg instead.
+ for (MachineRegisterInfo::use_instr_iterator
+ UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end();
+ UI != UE; ++UI) {
+ if (UI->getParent() != CmpInstr.getParent())
+ continue; // Only users in CmpInstr's own block are considered.
+ MachineInstr *PotentialAND = &*UI;
+ if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
+ isPredicated(*PotentialAND))
+ continue;
+ MI = PotentialAND; // First matching, unpredicated user wins.
+ break;
+ }
+ if (!MI) return false;
+ }
+ }
+
+ // Get ready to iterate backward from CmpInstr. E is captured from MI here,
+ // before MI may be reset to nullptr below, so it keeps marking the stop point.
+ MachineBasicBlock::iterator I = CmpInstr, E = MI,
+ B = CmpInstr.getParent()->begin();
+
+ // Early exit if CmpInstr is at the beginning of the BB.
+ if (I == B) return false;
+
+ // There are two possible candidates which can be changed to set CPSR:
+ // One is MI, the other is a SUB instruction.
+ // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+ // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
+ MachineInstr *Sub = nullptr;
+ if (SrcReg2 != 0)
+ // MI is not a candidate for CMPrr.
+ MI = nullptr;
+ else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
+ // Conservatively refuse to convert an instruction which isn't in the same
+ // BB as the comparison.
+ // For CMPri w/ CmpValue != 0, a Sub may still be a candidate.
+ // Thus we cannot return here.
+ if (CmpInstr.getOpcode() == ARM::CMPri ||
+ CmpInstr.getOpcode() == ARM::t2CMPri)
+ MI = nullptr;
+ else
+ return false;
+ }
+
+ // Check that CPSR isn't set between the comparison instruction and the one we
+ // want to change. At the same time, search for Sub.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ --I; // Start with the instruction immediately before CmpInstr.
+ for (; I != E; --I) {
+ const MachineInstr &Instr = *I;
+
+ if (Instr.modifiesRegister(ARM::CPSR, TRI) ||
+ Instr.readsRegister(ARM::CPSR, TRI))
+ // This instruction modifies or uses CPSR after the one we want to
+ // change. We can't do this transformation.
+ return false;
+
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+ Sub = &*I;
+ break;
+ }
+
+ if (I == B)
+ // The 'and' is below the comparison instruction.
+ return false;
+ }
+
+ // Return false if no candidates exist.
+ if (!MI && !Sub)
+ return false;
+
+ // The single candidate is called MI.
+ if (!MI) MI = Sub;
+
+ // We can't use a predicated instruction - it doesn't always write the flags.
+ if (isPredicated(*MI))
+ return false;
+
+ // Only the opcodes listed below are converted. The Thumb1 group sets
+ // IsThumb1 and then falls through to share the same handling.
+ bool IsThumb1 = false;
+ switch (MI->getOpcode()) {
+ default: break; // Any other opcode leaves the switch and returns false at the end.
+ case ARM::tLSLri:
+ case ARM::tLSRri:
+ case ARM::tLSLrr:
+ case ARM::tLSRrr:
+ case ARM::tSUBrr:
+ case ARM::tADDrr:
+ case ARM::tADDi3:
+ case ARM::tADDi8:
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ IsThumb1 = true;
+ LLVM_FALLTHROUGH;
+ case ARM::RSBrr:
+ case ARM::RSBri:
+ case ARM::RSCrr:
+ case ARM::RSCri:
+ case ARM::ADDrr:
+ case ARM::ADDri:
+ case ARM::ADCrr:
+ case ARM::ADCri:
+ case ARM::SUBrr:
+ case ARM::SUBri:
+ case ARM::SBCrr:
+ case ARM::SBCri:
+ case ARM::t2RSBri:
+ case ARM::t2ADDrr:
+ case ARM::t2ADDri:
+ case ARM::t2ADCrr:
+ case ARM::t2ADCri:
+ case ARM::t2SUBrr:
+ case ARM::t2SUBri:
+ case ARM::t2SBCrr:
+ case ARM::t2SBCri:
+ case ARM::ANDrr:
+ case ARM::ANDri:
+ case ARM::t2ANDrr:
+ case ARM::t2ANDri:
+ case ARM::ORRrr:
+ case ARM::ORRri:
+ case ARM::t2ORRrr:
+ case ARM::t2ORRri:
+ case ARM::EORrr:
+ case ARM::EORri:
+ case ARM::t2EORrr:
+ case ARM::t2EORri:
+ case ARM::t2LSRri:
+ case ARM::t2LSRrr:
+ case ARM::t2LSLri:
+ case ARM::t2LSLrr: {
+ // Scan forward for the use of CPSR
+ // When checking against MI: if it's a conditional code that requires
+ // checking of the V bit or C bit, then this is not safe to do.
+ // It is safe to remove CmpInstr if CPSR is redefined or killed.
+ // If we are done with the basic block, we need to check whether CPSR is
+ // live-out.
+ SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4>
+ OperandsToUpdate;
+ bool isSafe = false; // Becomes true once CPSR is redefined or clobbered after CmpInstr.
+ I = CmpInstr;
+ E = CmpInstr.getParent()->end();
+ while (!isSafe && ++I != E) {
+ const MachineInstr &Instr = *I;
+ for (unsigned IO = 0, EO = Instr.getNumOperands();
+ !isSafe && IO != EO; ++IO) {
+ const MachineOperand &MO = Instr.getOperand(IO);
+ if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) {
+ isSafe = true; // A register mask that clobbers CPSR ends the scan.
+ break;
+ }
+ if (!MO.isReg() || MO.getReg() != ARM::CPSR)
+ continue;
+ if (MO.isDef()) {
+ isSafe = true; // CPSR is redefined here; later readers see the new value.
+ break;
+ }
+ // Condition code is after the operand before CPSR except for VSELs.
+ ARMCC::CondCodes CC;
+ bool IsInstrVSel = true;
+ switch (Instr.getOpcode()) {
+ default:
+ IsInstrVSel = false;
+ CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
+ break;
+ case ARM::VSELEQD:
+ case ARM::VSELEQS:
+ CC = ARMCC::EQ;
+ break;
+ case ARM::VSELGTD:
+ case ARM::VSELGTS:
+ CC = ARMCC::GT;
+ break;
+ case ARM::VSELGED:
+ case ARM::VSELGES:
+ CC = ARMCC::GE;
+ break;
+ case ARM::VSELVSS:
+ case ARM::VSELVSD:
+ CC = ARMCC::VS;
+ break;
+ }
+
+ if (Sub) {
+ ARMCC::CondCodes NewCC = getSwappedCondition(CC);
+ if (NewCC == ARMCC::AL)
+ return false;
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
+ // on CMP needs to be updated to be based on SUB.
+ // Push the condition code operands to OperandsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // operands will be modified.
+ if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg) {
+ // VSel doesn't support condition code update.
+ if (IsInstrVSel)
+ return false;
+ OperandsToUpdate.push_back(
+ std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
+ }
+ } else {
+ // No Sub, so this is x = <op> y, z; cmp x, 0.
+ switch (CC) {
+ case ARMCC::EQ: // Z
+ case ARMCC::NE: // Z
+ case ARMCC::MI: // N
+ case ARMCC::PL: // N
+ case ARMCC::AL: // none
+ // CPSR can be used multiple times, we should continue.
+ break;
+ case ARMCC::HS: // C
+ case ARMCC::LO: // C
+ case ARMCC::VS: // V
+ case ARMCC::VC: // V
+ case ARMCC::HI: // C Z
+ case ARMCC::LS: // C Z
+ case ARMCC::GE: // N V
+ case ARMCC::LT: // N V
+ case ARMCC::GT: // Z N V
+ case ARMCC::LE: // Z N V
+ // The instruction uses the V bit or C bit which is not safe.
+ return false;
+ }
+ }
+ }
+ }
+
+ // If CPSR is not killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if (!isSafe) {
+ MachineBasicBlock *MBB = CmpInstr.getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(ARM::CPSR))
+ return false;
+ }
+
+ // Toggle the optional operand to CPSR (if it exists - in Thumb1 we always
+ // set CPSR so this is represented as an explicit output)
+ // NOTE(review): operand index 5 is assumed to be the optional CPSR def for
+ // every non-Thumb1 opcode listed above - confirm against the .td definitions.
+ if (!IsThumb1) {
+ MI->getOperand(5).setReg(ARM::CPSR);
+ MI->getOperand(5).setIsDef(true);
+ }
+ assert(!isPredicated(*MI) && "Can't use flags from predicated instruction");
+ CmpInstr.eraseFromParent();
+
+ // Modify the condition code of operands in OperandsToUpdate.
+ // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+ // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
+ OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
+ return true;
+ }
+ }
+
+ // Reached only when MI's opcode is not one of the handled cases.
+ return false;
+}
+
+bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ unsigned Reg,
+ MachineRegisterInfo *MRI) const {
+ /* Fold large immediates into add, sub, or, xor.
+  *
+  * DefMI must be a MOVi32imm / t2MOVi32imm that materializes an immediate in
+  * Reg, and UseMI must be the only non-debug user of Reg. When the immediate
+  * can be split into two encodable parts, a new register-immediate
+  * instruction applying the first part is inserted before UseMI, UseMI is
+  * rewritten in place to apply the second part to that result, and DefMI is
+  * erased. Returns true if the fold happened, false (with nothing changed)
+  * otherwise. */
+ unsigned DefOpc = DefMI.getOpcode();
+ if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm)
+ return false;
+ if (!DefMI.getOperand(1).isImm())
+ // Could be t2MOVi32imm <ga:xx>
+ return false;
+
+ // DefMI is erased at the end, so Reg must have no other (non-debug) user.
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return false;
+
+ const MCInstrDesc &DefMCID = DefMI.getDesc();
+ if (DefMCID.hasOptionalDef()) {
+ unsigned NumOps = DefMCID.getNumOperands();
+ const MachineOperand &MO = DefMI.getOperand(NumOps - 1); // Last declared operand is checked as the optional def.
+ if (MO.getReg() == ARM::CPSR && !MO.isDead())
+ // If DefMI defines CPSR and it is not dead, it's obviously not safe
+ // to delete DefMI.
+ return false;
+ }
+
+ const MCInstrDesc &UseMCID = UseMI.getDesc();
+ if (UseMCID.hasOptionalDef()) {
+ unsigned NumOps = UseMCID.getNumOperands();
+ if (UseMI.getOperand(NumOps - 1).getReg() == ARM::CPSR)
+ // If the instruction sets the flag, do not attempt this optimization
+ // since it may change the semantics of the code.
+ return false;
+ }
+
+ // Pick the register-immediate opcode and split the immediate into two parts.
+ unsigned UseOpc = UseMI.getOpcode();
+ unsigned NewUseOpc = 0;
+ uint32_t ImmVal = (uint32_t)DefMI.getOperand(1).getImm();
+ uint32_t SOImmValV1 = 0, SOImmValV2 = 0;
+ bool Commute = false;
+ switch (UseOpc) {
+ default: return false;
+ case ARM::SUBrr:
+ case ARM::ADDrr:
+ case ARM::ORRrr:
+ case ARM::EORrr:
+ case ARM::t2SUBrr:
+ case ARM::t2ADDrr:
+ case ARM::t2ORRrr:
+ case ARM::t2EORrr: {
+ Commute = UseMI.getOperand(2).getReg() != Reg; // True when Reg is not operand 2, so operand 2 is the register that is kept.
+ switch (UseOpc) {
+ default: break;
+ case ARM::ADDrr:
+ case ARM::SUBrr: {
+ if (UseOpc == ARM::SUBrr && Commute)
+ return false; // SUB is rejected when the immediate would be the first source.
+
+ // ADD/SUB are special because they're essentially the same operation, so
+ // we can handle a larger range of immediates.
+ if (ARM_AM::isSOImmTwoPartVal(ImmVal))
+ NewUseOpc = UseOpc == ARM::ADDrr ? ARM::ADDri : ARM::SUBri;
+ else if (ARM_AM::isSOImmTwoPartVal(-ImmVal)) {
+ ImmVal = -ImmVal; // Use the negated value and swap ADD <-> SUB.
+ NewUseOpc = UseOpc == ARM::ADDrr ? ARM::SUBri : ARM::ADDri;
+ } else
+ return false;
+ SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
+ SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
+ break;
+ }
+ case ARM::ORRrr:
+ case ARM::EORrr: {
+ if (!ARM_AM::isSOImmTwoPartVal(ImmVal))
+ return false;
+ SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
+ SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
+ switch (UseOpc) {
+ default: break;
+ case ARM::ORRrr: NewUseOpc = ARM::ORRri; break;
+ case ARM::EORrr: NewUseOpc = ARM::EORri; break;
+ }
+ break;
+ }
+ case ARM::t2ADDrr:
+ case ARM::t2SUBrr: {
+ if (UseOpc == ARM::t2SUBrr && Commute)
+ return false; // Same restriction as the ARM-mode SUB case above.
+
+ // ADD/SUB are special because they're essentially the same operation, so
+ // we can handle a larger range of immediates.
+ if (ARM_AM::isT2SOImmTwoPartVal(ImmVal))
+ NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2ADDri : ARM::t2SUBri;
+ else if (ARM_AM::isT2SOImmTwoPartVal(-ImmVal)) {
+ ImmVal = -ImmVal;
+ NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2SUBri : ARM::t2ADDri;
+ } else
+ return false;
+ SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
+ SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
+ break;
+ }
+ case ARM::t2ORRrr:
+ case ARM::t2EORrr: {
+ if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
+ return false;
+ SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
+ SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
+ switch (UseOpc) {
+ default: break;
+ case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break;
+ case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break;
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ // Emit "NewReg = <op> Reg1, part1" before UseMI, then turn UseMI into
+ // "<dst> = <op> NewReg, part2" and drop the now-unused immediate move.
+ unsigned OpIdx = Commute ? 2 : 1;
+ unsigned Reg1 = UseMI.getOperand(OpIdx).getReg();
+ bool isKill = UseMI.getOperand(OpIdx).isKill();
+ unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+ AddDefaultCC(
+ AddDefaultPred(BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(NewUseOpc), NewReg)
+ .addReg(Reg1, getKillRegState(isKill))
+ .addImm(SOImmValV1)));
+ UseMI.setDesc(get(NewUseOpc));
+ UseMI.getOperand(1).setReg(NewReg);
+ UseMI.getOperand(1).setIsKill(); // NewReg has no other user, so this use kills it.
+ UseMI.getOperand(2).ChangeToImmediate(SOImmValV2);
+ DefMI.eraseFromParent();
+ return true;
+}
+
+static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
+ const MachineInstr &MI) { // Micro-op count for a load/store; called from getNumMicroOps on Swift. Opcodes not listed use the itinerary value.
+ switch (MI.getOpcode()) {
+ default: {
+ const MCInstrDesc &Desc = MI.getDesc();
+ int UOps = ItinData->getNumMicroOps(Desc.getSchedClass());
+ assert(UOps >= 0 && "bad # UOps");
+ return UOps;
+ }
+
+ // Register-offset loads/stores: 1 micro-op when the offset is added and is
+ // either unshifted or shifted left by 1-3, otherwise 2.
+ case ARM::LDRrs:
+ case ARM::LDRBrs:
+ case ARM::STRrs:
+ case ARM::STRBrs: {
+ unsigned ShOpVal = MI.getOperand(3).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 1;
+ return 2;
+ }
+
+ // Halfword load/store: 1 micro-op when operand 2 holds no register;
+ // otherwise the same add / lsl #0-3 test as above decides between 1 and 2.
+ case ARM::LDRH:
+ case ARM::STRH: {
+ if (!MI.getOperand(2).getReg())
+ return 1;
+
+ unsigned ShOpVal = MI.getOperand(3).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 1;
+ return 2;
+ }
+
+ // Sign-extending loads: 3 micro-ops when the offset is subtracted, else 2.
+ case ARM::LDRSB:
+ case ARM::LDRSH:
+ return (ARM_AM::getAM3Op(MI.getOperand(3).getImm()) == ARM_AM::sub) ? 3 : 2;
+
+ // Post-indexed sign-extending loads: one extra micro-op when operands 0 and 3 are the same register.
+ case ARM::LDRSB_POST:
+ case ARM::LDRSH_POST: {
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
+ return (Rt == Rm) ? 4 : 3;
+ }
+
+ // Pre-indexed register-offset loads: 3 when Rt == Rm, else 2 or 3 by the add / lsl #0-3 test.
+ case ARM::LDR_PRE_REG:
+ case ARM::LDRB_PRE_REG: {
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
+ if (Rt == Rm)
+ return 3;
+ unsigned ShOpVal = MI.getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 2;
+ return 3;
+ }
+
+ // Pre-indexed register-offset stores: 2 or 3 by the add / lsl #0-3 test.
+ case ARM::STR_PRE_REG:
+ case ARM::STRB_PRE_REG: {
+ unsigned ShOpVal = MI.getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 2;
+ return 3;
+ }
+
+ // Pre-indexed halfword load/store: 2 with no offset register, 3 when Rt == Rm, else 3 for subtract and 2 for add.
+ case ARM::LDRH_PRE:
+ case ARM::STRH_PRE: {
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
+ if (!Rm)
+ return 2;
+ if (Rt == Rm)
+ return 3;
+ return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 3 : 2;
+ }
+
+ // Post-indexed loads: 3 when operands 0 and 3 are the same register, else 2.
+ case ARM::LDR_POST_REG:
+ case ARM::LDRB_POST_REG:
+ case ARM::LDRH_POST: {
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
+ return (Rt == Rm) ? 3 : 2;
+ }
+
+ // Fixed cost of 2 micro-ops regardless of operands.
+ case ARM::LDR_PRE_IMM:
+ case ARM::LDRB_PRE_IMM:
+ case ARM::LDR_POST_IMM:
+ case ARM::LDRB_POST_IMM:
+ case ARM::STRB_POST_IMM:
+ case ARM::STRB_POST_REG:
+ case ARM::STRB_PRE_IMM:
+ case ARM::STRH_POST:
+ case ARM::STR_POST_IMM:
+ case ARM::STR_POST_REG:
+ case ARM::STR_PRE_IMM:
+ return 2;
+
+ // NOTE(review): this case decodes operand 4 with the AM2 helpers, while the LDRSB/LDRSH case above uses getAM3Op - confirm which encoding applies.
+ case ARM::LDRSB_PRE:
+ case ARM::LDRSH_PRE: {
+ unsigned Rm = MI.getOperand(3).getReg();
+ if (Rm == 0)
+ return 3;
+ unsigned Rt = MI.getOperand(0).getReg();
+ if (Rt == Rm)
+ return 4;
+ unsigned ShOpVal = MI.getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 3;
+ return 4;
+ }
+
+ // Doubleword load: with an offset register, 4 for subtract and 3 for add; without one, 3 when Rt == Rn, else 2.
+ case ARM::LDRD: {
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(2).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
+ : 3;
+ return (Rt == Rn) ? 3 : 2;
+ }
+
+ // Doubleword store: with an offset register, 4 for subtract and 3 for add; otherwise 2.
+ case ARM::STRD: {
+ unsigned Rm = MI.getOperand(3).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
+ : 3;
+ return 2;
+ }
+
+ // Post-indexed doubleword loads: fixed 3.
+ case ARM::LDRD_POST:
+ case ARM::t2LDRD_POST:
+ return 3;
+
+ // Post-indexed doubleword stores: fixed 4.
+ case ARM::STRD_POST:
+ case ARM::t2STRD_POST:
+ return 4;
+
+ // Pre-indexed doubleword load: operand indices are shifted by one relative to LDRD; costs are one higher.
+ case ARM::LDRD_PRE: {
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(3).getReg();
+ unsigned Rm = MI.getOperand(4).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
+ : 4;
+ return (Rt == Rn) ? 4 : 3;
+ }
+
+ // Thumb2 pre-indexed doubleword load: 4 when operands 0 and 3 match, else 3.
+ case ARM::t2LDRD_PRE: {
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(3).getReg();
+ return (Rt == Rn) ? 4 : 3;
+ }
+
+ // Pre-indexed doubleword store: with an offset register, 5 for subtract and 4 for add; otherwise 3.
+ case ARM::STRD_PRE: {
+ unsigned Rm = MI.getOperand(4).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
+ : 4;
+ return 3;
+ }
+
+ // Thumb2 pre-indexed doubleword store: fixed 3.
+ case ARM::t2STRD_PRE:
+ return 3;
+
+ // Thumb2 loads with a fixed cost of 2.
+ case ARM::t2LDR_POST:
+ case ARM::t2LDRB_POST:
+ case ARM::t2LDRB_PRE:
+ case ARM::t2LDRSBi12:
+ case ARM::t2LDRSBi8:
+ case ARM::t2LDRSBpci:
+ case ARM::t2LDRSBs:
+ case ARM::t2LDRH_POST:
+ case ARM::t2LDRH_PRE:
+ case ARM::t2LDRSBT:
+ case ARM::t2LDRSB_POST:
+ case ARM::t2LDRSB_PRE:
+ case ARM::t2LDRSH_POST:
+ case ARM::t2LDRSH_PRE:
+ case ARM::t2LDRSHi12:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRSHpci:
+ case ARM::t2LDRSHs:
+ return 2;
+
+ // Thumb2 doubleword load: 3 when operands 0 and 2 are the same register, else 2.
+ case ARM::t2LDRDi8: {
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(2).getReg();
+ return (Rt == Rn) ? 3 : 2;
+ }
+
+ // Thumb2 stores with a fixed cost of 2.
+ case ARM::t2STRB_POST:
+ case ARM::t2STRB_PRE:
+ case ARM::t2STRBs:
+ case ARM::t2STRDi8:
+ case ARM::t2STRH_POST:
+ case ARM::t2STRH_PRE:
+ case ARM::t2STRHs:
+ case ARM::t2STR_POST:
+ case ARM::t2STR_PRE:
+ case ARM::t2STRs:
+ return 2;
+ }
+}
+
+// Report how many 32-bit words an LDM loads or an STM stores, computed as the
+// summed size of the instruction's MachineMemOperands divided by four. An
+// instruction with no memory operands therefore yields 0, which callers must
+// treat as "unknown".
+//
+// FIXME: MachineInstr is not designed for memory-access widths to be derived
+// from machine mem operands; the target is expected to supply that from the
+// opcode and operands instead. MachineMemOperand is nevertheless the best
+// option available today, for two reasons:
+//
+// 1) getNumMicroOps guesses the LDM width from the total MI operand count.
+// That is far more fragile than summing MachineMemOperand sizes because
+// CodeGen passes may insert or remove optional machine operands. It is simply
+// incorrect for preRA passes and looks wrong for postRA passes too.
+//
+// 2) getNumLDMAddresses is consumed only by the scheduling machine model, and
+// any machine model calling it should cope with the unknown (zero) result.
+//
+// Long term, a target hook should verify MachineMemOperand sizes during MC
+// lowering. The hook should live in MC lowering because it cannot be expected
+// to know about other MI forms. That would guarantee MachineMemOperands are
+// propagated correctly through every pass.
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr &MI) const {
+  unsigned TotalBytes = 0;
+  MachineInstr::mmo_iterator MMO = MI.memoperands_begin();
+  const MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+  while (MMO != MMOEnd) {
+    TotalBytes += (*MMO)->getSize();
+    ++MMO;
+  }
+  // Convert bytes to 32-bit words.
+  return TotalBytes / 4;
+}
+
+/// Micro-op count for a load/store multiple when each register issues
+/// separately: one micro-op for the address computation, one per register,
+/// plus extras for base-register writeback and for a write to pc.
+static unsigned getNumMicroOpsSingleIssuePlusExtras(unsigned Opc,
+                                                    unsigned NumRegs) {
+  // 1 for address computation.
+  const unsigned BaseUOps = 1 + NumRegs;
+
+  switch (Opc) {
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    // One more for the base register writeback.
+    return BaseUOps + 1;
+
+  case ARM::LDMIA_RET:
+  case ARM::tPOP_RET:
+  case ARM::t2LDMIA_RET:
+    // One for the base register writeback and one for the write to pc.
+    return BaseUOps + 2;
+
+  default:
+    return BaseUOps;
+  }
+}
+
+unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr &MI) const { // Number of micro-ops MI decodes into; 1 when no itinerary data is available.
+ if (!ItinData || ItinData->isEmpty())
+ return 1;
+
+ // A non-negative itinerary count is used as-is, except that Swift loads and
+ // stores are refined by getNumMicroOpsSwiftLdSt.
+ const MCInstrDesc &Desc = MI.getDesc();
+ unsigned Class = Desc.getSchedClass();
+ int ItinUOps = ItinData->getNumMicroOps(Class);
+ if (ItinUOps >= 0) {
+ if (Subtarget.isSwift() && (Desc.mayLoad() || Desc.mayStore()))
+ return getNumMicroOpsSwiftLdSt(ItinData, MI);
+
+ return ItinUOps;
+ }
+
+ // A negative itinerary count: derive the answer from the opcode and the
+ // number of operands beyond those declared in the instruction description.
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unexpected multi-uops instruction!");
+ case ARM::VLDMQIA:
+ case ARM::VSTMQIA:
+ return 2;
+
+ // The number of uOps for load / store multiple are determined by the number
+ // registers.
+ //
+ // On Cortex-A8, each pair of register loads / stores can be scheduled on the
+ // same cycle. The scheduling for the first load / store must be done
+ // separately by assuming the address is not 64-bit aligned.
+ //
+ // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address
+ // is not 64-bit aligned, then AGU would take an extra cycle. For VFP / NEON
+ // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1.
+ case ARM::VLDMDIA:
+ case ARM::VLDMDIA_UPD:
+ case ARM::VLDMDDB_UPD:
+ case ARM::VLDMSIA:
+ case ARM::VLDMSIA_UPD:
+ case ARM::VLDMSDB_UPD:
+ case ARM::VSTMDIA:
+ case ARM::VSTMDIA_UPD:
+ case ARM::VSTMDDB_UPD:
+ case ARM::VSTMSIA:
+ case ARM::VSTMSIA_UPD:
+ case ARM::VSTMSDB_UPD: {
+ unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands(); // Operands beyond the declared ones.
+ return (NumRegs / 2) + (NumRegs % 2) + 1;
+ }
+
+ // Integer load / store multiple: the cost depends on the subtarget's
+ // load/store-multiple timing model.
+ case ARM::LDMIA_RET:
+ case ARM::LDMIA:
+ case ARM::LDMDA:
+ case ARM::LDMDB:
+ case ARM::LDMIB:
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::STMIA:
+ case ARM::STMDA:
+ case ARM::STMDB:
+ case ARM::STMIB:
+ case ARM::STMIA_UPD:
+ case ARM::STMDA_UPD:
+ case ARM::STMDB_UPD:
+ case ARM::STMIB_UPD:
+ case ARM::tLDMIA:
+ case ARM::tLDMIA_UPD:
+ case ARM::tSTMIA_UPD:
+ case ARM::tPOP_RET:
+ case ARM::tPOP:
+ case ARM::tPUSH:
+ case ARM::t2LDMIA_RET:
+ case ARM::t2LDMIA:
+ case ARM::t2LDMDB:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMIA:
+ case ARM::t2STMDB:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD: {
+ unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands() + 1; // Unlike the VFP case above, one is added to the extra-operand count here.
+ switch (Subtarget.getLdStMultipleTiming()) {
+ case ARMSubtarget::SingleIssuePlusExtras:
+ return getNumMicroOpsSingleIssuePlusExtras(Opc, NumRegs);
+ case ARMSubtarget::SingleIssue:
+ // Assume the worst.
+ return NumRegs;
+ case ARMSubtarget::DoubleIssue: {
+ if (NumRegs < 4)
+ return 2; // Fewer than four registers always cost 2.
+ // 4 registers would be issued: 2, 2.
+ // 5 registers would be issued: 2, 2, 1.
+ unsigned UOps = (NumRegs / 2);
+ if (NumRegs % 2)
+ ++UOps;
+ return UOps;
+ }
+ case ARMSubtarget::DoubleIssueCheckUnalignedAccess: {
+ unsigned UOps = (NumRegs / 2);
+ // If there are odd number of registers or if it's not 64-bit aligned,
+ // then it takes an extra AGU (Address Generation Unit) cycle.
+ // An instruction without exactly one memory operand is treated the same way.
+ if ((NumRegs % 2) || !MI.hasOneMemOperand() ||
+ (*MI.memoperands_begin())->getAlignment() < 8)
+ ++UOps;
+ return UOps;
+ }
+ }
+ }
+ }
+ llvm_unreachable("Didn't find the number of microops");
+}
+
+/// Cycle in which a VFP / NEON load multiple defines the operand at DefIdx.
+///
+/// RegNo is the operand's 1-based position counted from the last operand
+/// declared in DefMCID. Operands before that (RegNo <= 0, e.g. the address
+/// writeback) are looked up in the itinerary instead.
+int
+ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &DefMCID,
+                                  unsigned DefClass,
+                                  unsigned DefIdx, unsigned DefAlign) const {
+  const int RegNo = (int)(DefIdx + 1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  // Cortex-A8 / A7: (regno / 2) + (regno % 2) + 1.
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7())
+    return RegNo / 2 + 1 + ((RegNo % 2) ? 1 : 0);
+
+  // Neither A9-like nor Swift: assume the worst.
+  if (!Subtarget.isLikeA9() && !Subtarget.isSwift())
+    return RegNo + 2;
+
+  // A9-like / Swift: one cycle per register, plus one more if an odd number
+  // of 'S' registers is loaded or the access is not 64-bit aligned.
+  const unsigned Opc = DefMCID.getOpcode();
+  const bool LoadsSRegs = Opc == ARM::VLDMSIA || Opc == ARM::VLDMSIA_UPD ||
+                          Opc == ARM::VLDMSDB_UPD;
+  const bool NeedsExtraCycle = (LoadsSRegs && (RegNo % 2)) || DefAlign < 8;
+  return NeedsExtraCycle ? RegNo + 1 : RegNo;
+}
+
+/// Cycle in which an integer load multiple defines the operand at DefIdx.
+///
+/// RegNo is the operand's 1-based position counted from the last operand
+/// declared in DefMCID. Operands before that (RegNo <= 0, e.g. the address
+/// writeback) are looked up in the itinerary instead.
+int
+ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &DefMCID,
+                                 unsigned DefClass,
+                                 unsigned DefIdx, unsigned DefAlign) const {
+  const int RegNo = (int)(DefIdx + 1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  const int HalfRegs = RegNo / 2;
+
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // 4 registers would be issued: 1, 2, 1.
+    // 5 registers would be issued: 1, 2, 2.
+    const int IssueCycle = (HalfRegs < 1) ? 1 : HalfRegs;
+    // Result latency is issue cycle + 2: E2.
+    return IssueCycle + 2;
+  }
+
+  // Neither A9-like nor Swift: assume the worst.
+  if (!Subtarget.isLikeA9() && !Subtarget.isSwift())
+    return RegNo + 2;
+
+  // An odd register count or an access that is not 64-bit aligned takes an
+  // extra AGU (Address Generation Unit) cycle.
+  const bool ExtraAGUCycle = (RegNo % 2) || DefAlign < 8;
+  // Result latency is AGU cycles + 2.
+  return HalfRegs + (ExtraAGUCycle ? 1 : 0) + 2;
+}
+
+/// Cycle in which a VFP / NEON store multiple reads the operand at UseIdx.
+///
+/// RegNo is the operand's 1-based position counted from the last operand
+/// declared in UseMCID. Operands before that (RegNo <= 0) are looked up in
+/// the itinerary instead.
+int
+ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &UseMCID,
+                                  unsigned UseClass,
+                                  unsigned UseIdx, unsigned UseAlign) const {
+  const int RegNo = (int)(UseIdx + 1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  // Cortex-A8 / A7: (regno / 2) + (regno % 2) + 1.
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7())
+    return RegNo / 2 + 1 + ((RegNo % 2) ? 1 : 0);
+
+  // Neither A9-like nor Swift: assume the worst.
+  if (!Subtarget.isLikeA9() && !Subtarget.isSwift())
+    return RegNo + 2;
+
+  // A9-like / Swift: one cycle per register, plus one more if an odd number
+  // of 'S' registers is stored or the access is not 64-bit aligned.
+  const unsigned Opc = UseMCID.getOpcode();
+  const bool StoresSRegs = Opc == ARM::VSTMSIA || Opc == ARM::VSTMSIA_UPD ||
+                           Opc == ARM::VSTMSDB_UPD;
+  const bool NeedsExtraCycle = (StoresSRegs && (RegNo % 2)) || UseAlign < 8;
+  return NeedsExtraCycle ? RegNo + 1 : RegNo;
+}
+
+/// Cycle in which an integer store multiple reads the operand at UseIdx.
+///
+/// RegNo is the operand's 1-based position counted from the last operand
+/// declared in UseMCID. Operands before that (RegNo <= 0) are looked up in
+/// the itinerary instead.
+int
+ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &UseMCID,
+                                 unsigned UseClass,
+                                 unsigned UseIdx, unsigned UseAlign) const {
+  const int RegNo = (int)(UseIdx + 1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  const int HalfRegs = RegNo / 2;
+
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    const int IssueCycle = (HalfRegs < 2) ? 2 : HalfRegs;
+    // Read in E3.
+    return IssueCycle + 2;
+  }
+
+  // Neither A9-like nor Swift: assume the worst.
+  if (!Subtarget.isLikeA9() && !Subtarget.isSwift())
+    return 1;
+
+  // An odd register count or an access that is not 64-bit aligned takes an
+  // extra AGU (Address Generation Unit) cycle.
+  const bool ExtraAGUCycle = (RegNo % 2) || UseAlign < 8;
+  return HalfRegs + (ExtraAGUCycle ? 1 : 0);
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MCInstrDesc &DefMCID,
+ unsigned DefIdx, unsigned DefAlign,
+ const MCInstrDesc &UseMCID,
+ unsigned UseIdx, unsigned UseAlign) const {
+ unsigned DefClass = DefMCID.getSchedClass();
+ unsigned UseClass = UseMCID.getSchedClass();
+
+ // Fast path: both operand indices are covered by the instruction
+ // descriptions, so the itinerary can answer directly.
+ if (DefIdx < DefMCID.getNumDefs() && UseIdx < UseMCID.getNumOperands())
+ return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+
+ // This may be a def / use of a variable_ops instruction, the operand
+ // latency might be determinable dynamically. Let the target try to
+ // figure it out.
+ int DefCycle = -1;
+ bool LdmBypass = false; // Set for the integer load-multiple group; changes which operand the forwarding query below uses.
+ switch (DefMCID.getOpcode()) {
+ default:
+ DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+ break;
+
+ // VFP / NEON load multiple.
+ case ARM::VLDMDIA:
+ case ARM::VLDMDIA_UPD:
+ case ARM::VLDMDDB_UPD:
+ case ARM::VLDMSIA:
+ case ARM::VLDMSIA_UPD:
+ case ARM::VLDMSDB_UPD:
+ DefCycle = getVLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
+ break;
+
+ // Integer load multiple.
+ case ARM::LDMIA_RET:
+ case ARM::LDMIA:
+ case ARM::LDMDA:
+ case ARM::LDMDB:
+ case ARM::LDMIB:
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::tLDMIA:
+ case ARM::tLDMIA_UPD:
+ case ARM::tPUSH: // NOTE(review): tPUSH sits in the load list while tPOP/tPOP_RET are in the store list below - confirm this is intended.
+ case ARM::t2LDMIA_RET:
+ case ARM::t2LDMIA:
+ case ARM::t2LDMDB:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ LdmBypass = 1;
+ DefCycle = getLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
+ break;
+ }
+
+ if (DefCycle == -1)
+ // We can't seem to determine the result latency of the def, assume it's 2.
+ DefCycle = 2;
+
+ int UseCycle = -1;
+ switch (UseMCID.getOpcode()) {
+ default:
+ UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+ break;
+
+ // VFP / NEON store multiple.
+ case ARM::VSTMDIA:
+ case ARM::VSTMDIA_UPD:
+ case ARM::VSTMDDB_UPD:
+ case ARM::VSTMSIA:
+ case ARM::VSTMSIA_UPD:
+ case ARM::VSTMSDB_UPD:
+ UseCycle = getVSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign);
+ break;
+
+ // Integer store multiple.
+ case ARM::STMIA:
+ case ARM::STMDA:
+ case ARM::STMDB:
+ case ARM::STMIB:
+ case ARM::STMIA_UPD:
+ case ARM::STMDA_UPD:
+ case ARM::STMDB_UPD:
+ case ARM::STMIB_UPD:
+ case ARM::tSTMIA_UPD:
+ case ARM::tPOP_RET: // NOTE(review): see the tPUSH note in the load list above.
+ case ARM::tPOP:
+ case ARM::t2STMIA:
+ case ARM::t2STMDB:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD:
+ UseCycle = getSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign);
+ break;
+ }
+
+ if (UseCycle == -1)
+ // Assume it's read in the first stage.
+ UseCycle = 1;
+
+ // From here on UseCycle is reused to hold the resulting latency.
+ UseCycle = DefCycle - UseCycle + 1;
+ if (UseCycle > 0) {
+ if (LdmBypass) {
+ // It's a variable_ops instruction so we can't use DefIdx here. Just use
+ // first def operand.
+ if (ItinData->hasPipelineForwarding(DefClass, DefMCID.getNumOperands()-1,
+ UseClass, UseIdx))
+ --UseCycle;
+ } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx,
+ UseClass, UseIdx)) {
+ --UseCycle;
+ }
+ }
+
+ // The value may be zero or negative; no clamping is applied here.
+ return UseCycle;
+}
+
+/// Locate the instruction inside the bundle at MI that defines Reg, scanning
+/// backward through the bundled instructions.
+///
+/// On return DefIdx holds the index of the defining operand and Dist the
+/// number of bundled instructions stepped over before the match. A definition
+/// is required to exist (asserted).
+static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI,
+                                           const MachineInstr *MI, unsigned Reg,
+                                           unsigned &DefIdx, unsigned &Dist) {
+  Dist = 0;
+
+  // Step to the next top-level instruction, then back up one raw instruction
+  // (presumably the last instruction bundled with MI).
+  MachineBasicBlock::const_iterator NextTop = MI;
+  ++NextTop;
+  MachineBasicBlock::const_instr_iterator Cur =
+      std::prev(NextTop.getInstrIterator());
+  assert(Cur->isInsideBundle() && "Empty bundle?");
+
+  int FoundIdx = -1;
+  for (; Cur->isInsideBundle(); --Cur, ++Dist) {
+    FoundIdx = Cur->findRegisterDefOperandIdx(Reg, false, true, TRI);
+    if (FoundIdx != -1)
+      break;
+  }
+
+  assert(FoundIdx != -1 && "Cannot find bundled definition!");
+  DefIdx = FoundIdx;
+  return &*Cur;
+}
+
+/// Locate the first instruction bundled after MI that uses Reg, scanning
+/// forward through the bundle.
+///
+/// On success UseIdx holds the index of the using operand and Dist the number
+/// of bundled instructions skipped, not counting t2IT. If no use is found,
+/// Dist is reset to 0, UseIdx is left untouched and nullptr is returned.
+static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
+                                           const MachineInstr &MI, unsigned Reg,
+                                           unsigned &UseIdx, unsigned &Dist) {
+  Dist = 0;
+
+  MachineBasicBlock::const_instr_iterator Cur = ++MI.getIterator();
+  assert(Cur->isInsideBundle() && "Empty bundle?");
+  MachineBasicBlock::const_instr_iterator End = MI.getParent()->instr_end();
+
+  // FIXME: This doesn't properly handle multiple uses.
+  for (; Cur != End && Cur->isInsideBundle(); ++Cur) {
+    const int FoundIdx = Cur->findRegisterUseOperandIdx(Reg, false, TRI);
+    if (FoundIdx != -1) {
+      UseIdx = FoundIdx;
+      return &*Cur;
+    }
+    // IT instructions do not count toward the distance.
+    if (Cur->getOpcode() != ARM::t2IT)
+      ++Dist;
+  }
+
+  // Ran off the bundle without finding a use.
+  Dist = 0;
+  return nullptr;
+}
+
+/// Return the number of cycles to add to (or subtract from) the static
+/// itinerary based on the def opcode and alignment. The caller will ensure that
+/// adjusted latency is at least one cycle.
+static int adjustDefLatency(const ARMSubtarget &Subtarget,
+ const MachineInstr &DefMI,
+ const MCInstrDesc &DefMCID, unsigned DefAlign) {
+ int Adjust = 0;
+ if (Subtarget.isCortexA8() || Subtarget.isLikeA9() || Subtarget.isCortexA7()) {
+ // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+ // variants are one cycle cheaper.
+ switch (DefMCID.getOpcode()) {
+ default: break;
+ case ARM::LDRrs:
+ case ARM::LDRBrs: {
+ unsigned ShOpVal = DefMI.getOperand(3).getImm();
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (ShImm == 0 ||
+ (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+ --Adjust;
+ break;
+ }
+ case ARM::t2LDRs:
+ case ARM::t2LDRBs:
+ case ARM::t2LDRHs:
+ case ARM::t2LDRSHs: {
+ // Thumb2 mode: lsl only.
+ unsigned ShAmt = DefMI.getOperand(3).getImm();
+ if (ShAmt == 0 || ShAmt == 2)
+ --Adjust;
+ break;
+ }
+ }
+ } else if (Subtarget.isSwift()) {
+ // FIXME: Properly handle all of the latency adjustments for address
+ // writeback.
+ switch (DefMCID.getOpcode()) {
+ default: break;
+ case ARM::LDRrs:
+ case ARM::LDRBrs: {
+ unsigned ShOpVal = DefMI.getOperand(3).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ Adjust -= 2;
+ else if (!isSub &&
+ ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+ --Adjust;
+ break;
+ }
+ case ARM::t2LDRs:
+ case ARM::t2LDRBs:
+ case ARM::t2LDRHs:
+ case ARM::t2LDRSHs: {
+ // Thumb2 mode: lsl only.
+ unsigned ShAmt = DefMI.getOperand(3).getImm();
+ if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3)
+ Adjust -= 2;
+ break;
+ }
+ }
+ }
+
+ if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment()) {
+ switch (DefMCID.getOpcode()) {
+ default: break;
+ case ARM::VLD1q8:
+ case ARM::VLD1q16:
+ case ARM::VLD1q32:
+ case ARM::VLD1q64:
+ case ARM::VLD1q8wb_fixed:
+ case ARM::VLD1q16wb_fixed:
+ case ARM::VLD1q32wb_fixed:
+ case ARM::VLD1q64wb_fixed:
+ case ARM::VLD1q8wb_register:
+ case ARM::VLD1q16wb_register:
+ case ARM::VLD1q32wb_register:
+ case ARM::VLD1q64wb_register:
+ case ARM::VLD2d8:
+ case ARM::VLD2d16:
+ case ARM::VLD2d32:
+ case ARM::VLD2q8:
+ case ARM::VLD2q16:
+ case ARM::VLD2q32:
+ case ARM::VLD2d8wb_fixed:
+ case ARM::VLD2d16wb_fixed:
+ case ARM::VLD2d32wb_fixed:
+ case ARM::VLD2q8wb_fixed:
+ case ARM::VLD2q16wb_fixed:
+ case ARM::VLD2q32wb_fixed:
+ case ARM::VLD2d8wb_register:
+ case ARM::VLD2d16wb_register:
+ case ARM::VLD2d32wb_register:
+ case ARM::VLD2q8wb_register:
+ case ARM::VLD2q16wb_register:
+ case ARM::VLD2q32wb_register:
+ case ARM::VLD3d8:
+ case ARM::VLD3d16:
+ case ARM::VLD3d32:
+ case ARM::VLD1d64T:
+ case ARM::VLD3d8_UPD:
+ case ARM::VLD3d16_UPD:
+ case ARM::VLD3d32_UPD:
+ case ARM::VLD1d64Twb_fixed:
+ case ARM::VLD1d64Twb_register:
+ case ARM::VLD3q8_UPD:
+ case ARM::VLD3q16_UPD:
+ case ARM::VLD3q32_UPD:
+ case ARM::VLD4d8:
+ case ARM::VLD4d16:
+ case ARM::VLD4d32:
+ case ARM::VLD1d64Q:
+ case ARM::VLD4d8_UPD:
+ case ARM::VLD4d16_UPD:
+ case ARM::VLD4d32_UPD:
+ case ARM::VLD1d64Qwb_fixed:
+ case ARM::VLD1d64Qwb_register:
+ case ARM::VLD4q8_UPD:
+ case ARM::VLD4q16_UPD:
+ case ARM::VLD4q32_UPD:
+ case ARM::VLD1DUPq8:
+ case ARM::VLD1DUPq16:
+ case ARM::VLD1DUPq32:
+ case ARM::VLD1DUPq8wb_fixed:
+ case ARM::VLD1DUPq16wb_fixed:
+ case ARM::VLD1DUPq32wb_fixed:
+ case ARM::VLD1DUPq8wb_register:
+ case ARM::VLD1DUPq16wb_register:
+ case ARM::VLD1DUPq32wb_register:
+ case ARM::VLD2DUPd8:
+ case ARM::VLD2DUPd16:
+ case ARM::VLD2DUPd32:
+ case ARM::VLD2DUPd8wb_fixed:
+ case ARM::VLD2DUPd16wb_fixed:
+ case ARM::VLD2DUPd32wb_fixed:
+ case ARM::VLD2DUPd8wb_register:
+ case ARM::VLD2DUPd16wb_register:
+ case ARM::VLD2DUPd32wb_register:
+ case ARM::VLD4DUPd8:
+ case ARM::VLD4DUPd16:
+ case ARM::VLD4DUPd32:
+ case ARM::VLD4DUPd8_UPD:
+ case ARM::VLD4DUPd16_UPD:
+ case ARM::VLD4DUPd32_UPD:
+ case ARM::VLD1LNd8:
+ case ARM::VLD1LNd16:
+ case ARM::VLD1LNd32:
+ case ARM::VLD1LNd8_UPD:
+ case ARM::VLD1LNd16_UPD:
+ case ARM::VLD1LNd32_UPD:
+ case ARM::VLD2LNd8:
+ case ARM::VLD2LNd16:
+ case ARM::VLD2LNd32:
+ case ARM::VLD2LNq16:
+ case ARM::VLD2LNq32:
+ case ARM::VLD2LNd8_UPD:
+ case ARM::VLD2LNd16_UPD:
+ case ARM::VLD2LNd32_UPD:
+ case ARM::VLD2LNq16_UPD:
+ case ARM::VLD2LNq32_UPD:
+ case ARM::VLD4LNd8:
+ case ARM::VLD4LNd16:
+ case ARM::VLD4LNd32:
+ case ARM::VLD4LNq16:
+ case ARM::VLD4LNq32:
+ case ARM::VLD4LNd8_UPD:
+ case ARM::VLD4LNd16_UPD:
+ case ARM::VLD4LNd32_UPD:
+ case ARM::VLD4LNq16_UPD:
+ case ARM::VLD4LNq32_UPD:
+ // If the address is not 64-bit aligned, the latency of these
+ // instructions increases by one.
+ ++Adjust;
+ break;
+ }
+ }
+ return Adjust;
+}
+
+/// MachineInstr flavour of the operand-latency query.
+///
+/// Looks through bundles on both the def and the use side and then defers to
+/// getOperandLatencyImpl. Returns -1 when no itinerary is available or when
+/// the use cannot be located inside its bundle (callers may then fall back to
+/// getInstrLatency), and 1 when the defining instruction is copy-like.
+int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                        const MachineInstr &DefMI,
+                                        unsigned DefIdx,
+                                        const MachineInstr &UseMI,
+                                        unsigned UseIdx) const {
+  const bool HaveItinerary = ItinData && !ItinData->isEmpty();
+  if (!HaveItinerary)
+    return -1;
+
+  const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
+  const unsigned Reg = DefMO.getReg();
+
+  // For a bundle, pick the bundled instruction that really defines Reg.
+  // DefIdx and DefAdj are updated by the lookup.
+  unsigned DefAdj = 0;
+  const MachineInstr *RealDef =
+      DefMI.isBundle()
+          ? getBundledDefMI(&getRegisterInfo(), &DefMI, Reg, DefIdx, DefAdj)
+          : &DefMI;
+
+  const bool DefIsCopyLike =
+      RealDef->isCopyLike() || RealDef->isInsertSubreg() ||
+      RealDef->isRegSequence() || RealDef->isImplicitDef();
+  if (DefIsCopyLike)
+    return 1;
+
+  // Same on the use side; here the lookup is allowed to fail.
+  unsigned UseAdj = 0;
+  const MachineInstr *RealUse = &UseMI;
+  if (UseMI.isBundle()) {
+    RealUse = getBundledUseMI(&getRegisterInfo(), UseMI, Reg, UseIdx, UseAdj);
+    if (RealUse == nullptr)
+      return -1;
+  }
+
+  const MCInstrDesc &DefDesc = RealDef->getDesc();
+  const MCInstrDesc &UseDesc = RealUse->getDesc();
+  return getOperandLatencyImpl(ItinData, *RealDef, DefIdx, DefDesc, DefAdj,
+                               DefMO, Reg, *RealUse, UseIdx, UseDesc, UseAdj);
+}
+
+/// Core of the MachineInstr operand-latency computation, run on the already
+/// unbundled def/use pair.
+///
+/// CPSR dependencies are special-cased; implicit operands yield -1. For
+/// everything else the itinerary latency is looked up and then corrected by
+/// \p DefAdj + \p UseAdj plus the def-side opcode adjustment, without ever
+/// letting the correction push the result to zero or below.
+int ARMBaseInstrInfo::getOperandLatencyImpl(
+    const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+    unsigned DefIdx, const MCInstrDesc &DefMCID, unsigned DefAdj,
+    const MachineOperand &DefMO, unsigned Reg, const MachineInstr &UseMI,
+    unsigned UseIdx, const MCInstrDesc &UseMCID, unsigned UseAdj) const {
+  if (Reg == ARM::CPSR) {
+    // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
+    if (DefMI.getOpcode() == ARM::FMSTAT) {
+      if (Subtarget.isLikeA9())
+        return 1;
+      return 20;
+    }
+
+    // A flag-setting instruction and a branch can issue in the same cycle.
+    if (UseMI.isBranch())
+      return 0;
+
+    // Otherwise charge the latency of the flag-setting instruction itself.
+    unsigned FlagLatency = getInstrLatency(ItinData, DefMI);
+    if (FlagLatency == 0 || !Subtarget.isThumb2())
+      return FlagLatency;
+
+    // Thumb2 at -Os: keep the CPSR def close to its user so that nothing
+    // scheduled in between prevents use of the 16-bit flag-setting encodings.
+    // FIXME: Use Function::optForSize().
+    const MachineFunction *MF = DefMI.getParent()->getParent();
+    if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
+      --FlagLatency;
+    return FlagLatency;
+  }
+
+  if (DefMO.isImplicit() || UseMI.getOperand(UseIdx).isImplicit())
+    return -1;
+
+  unsigned DefAlign = 0;
+  if (DefMI.hasOneMemOperand())
+    DefAlign = (*DefMI.memoperands_begin())->getAlignment();
+  unsigned UseAlign = 0;
+  if (UseMI.hasOneMemOperand())
+    UseAlign = (*UseMI.memoperands_begin())->getAlignment();
+
+  // Itinerary lookup (also handles variable_ops). A negative answer means
+  // "unknown"; hand it back so the caller can resort to getInstrLatency.
+  const int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
+                                        UseMCID, UseIdx, UseAlign);
+  if (Latency < 0)
+    return Latency;
+
+  // Position-in-IT-block adjustments from both sides, plus the dynamic
+  // def-side opcode variants the itinerary does not capture.
+  int Adj = DefAdj + UseAdj;
+  Adj += adjustDefLatency(Subtarget, DefMI, DefMCID, DefAlign);
+
+  // Apply a negative adjustment only if the result stays strictly positive;
+  // otherwise return the itinerary latency (which may be zero) untouched.
+  const bool WouldNotStayPositive = Adj < 0 && Latency <= -Adj;
+  return WouldNotStayPositive ? Latency : Latency + Adj;
+}
+
+/// SelectionDAG flavour of the operand-latency query: latency between result
+/// \p DefIdx of \p DefNode and operand \p UseIdx of \p UseNode.
+///
+/// \returns 1 if the def is not a machine node, 0 if its opcode is zero-cost,
+/// a fixed guess (3 for loads, 1 otherwise) when there is no usable itinerary,
+/// and otherwise the itinerary latency corrected for cheap addressing-mode
+/// shifts and for under-aligned NEON loads.
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    SDNode *DefNode, unsigned DefIdx,
+                                    SDNode *UseNode, unsigned UseIdx) const {
+  if (!DefNode->isMachineOpcode())
+    return 1;
+
+  const MCInstrDesc &DefMCID = get(DefNode->getMachineOpcode());
+
+  if (isZeroCost(DefMCID.Opcode))
+    return 0;
+
+  if (!ItinData || ItinData->isEmpty())
+    return DefMCID.mayLoad() ? 3 : 1;
+
+  // The user has not been selected yet: only the def side of the itinerary
+  // can be consulted, reduced by the subtarget's pre-ISel adjustment but
+  // never below one cycle.
+  if (!UseNode->isMachineOpcode()) {
+    int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
+    int Adj = Subtarget.getPreISelOperandLatencyAdjustment();
+    int Threshold = 1 + Adj;
+    return Latency <= Threshold ? 1 : Latency - Adj;
+  }
+
+  const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
+  // Both nodes are known to be machine nodes at this point, so use cast<>
+  // (which asserts) instead of dyn_cast<> followed by an unchecked
+  // dereference.
+  const MachineSDNode *DefMN = cast<MachineSDNode>(DefNode);
+  unsigned DefAlign = !DefMN->memoperands_empty()
+    ? (*DefMN->memoperands_begin())->getAlignment() : 0;
+  const MachineSDNode *UseMN = cast<MachineSDNode>(UseNode);
+  unsigned UseAlign = !UseMN->memoperands_empty()
+    ? (*UseMN->memoperands_begin())->getAlignment() : 0;
+  int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
+                                  UseMCID, UseIdx, UseAlign);
+
+  if (Latency > 1 &&
+      (Subtarget.isCortexA8() || Subtarget.isLikeA9() ||
+       Subtarget.isCortexA7())) {
+    // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+    // variants are one cycle cheaper.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      if (ShAmt == 0 || ShAmt == 2)
+        --Latency;
+      break;
+    }
+    }
+  } else if (DefIdx == 0 && Latency > 2 && Subtarget.isSwift()) {
+    // FIXME: Properly handle all of the latency adjustments for address
+    // writeback.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+           ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        Latency -= 2;
+      else if (ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl 0-3 only.
+      Latency -= 2;
+      break;
+    }
+    }
+  }
+
+  if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment())
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2q8Pseudo:
+    case ARM::VLD2q16Pseudo:
+    case ARM::VLD2q32Pseudo:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8PseudoWB_fixed:
+    case ARM::VLD2q16PseudoWB_fixed:
+    case ARM::VLD2q32PseudoWB_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8PseudoWB_register:
+    case ARM::VLD2q16PseudoWB_register:
+    case ARM::VLD2q32PseudoWB_register:
+    case ARM::VLD3d8Pseudo:
+    case ARM::VLD3d16Pseudo:
+    case ARM::VLD3d32Pseudo:
+    case ARM::VLD1d64TPseudo:
+    case ARM::VLD1d64TPseudoWB_fixed:
+    case ARM::VLD3d8Pseudo_UPD:
+    case ARM::VLD3d16Pseudo_UPD:
+    case ARM::VLD3d32Pseudo_UPD:
+    case ARM::VLD3q8Pseudo_UPD:
+    case ARM::VLD3q16Pseudo_UPD:
+    case ARM::VLD3q32Pseudo_UPD:
+    case ARM::VLD3q8oddPseudo:
+    case ARM::VLD3q16oddPseudo:
+    case ARM::VLD3q32oddPseudo:
+    case ARM::VLD3q8oddPseudo_UPD:
+    case ARM::VLD3q16oddPseudo_UPD:
+    case ARM::VLD3q32oddPseudo_UPD:
+    case ARM::VLD4d8Pseudo:
+    case ARM::VLD4d16Pseudo:
+    case ARM::VLD4d32Pseudo:
+    case ARM::VLD1d64QPseudo:
+    case ARM::VLD1d64QPseudoWB_fixed:
+    case ARM::VLD4d8Pseudo_UPD:
+    case ARM::VLD4d16Pseudo_UPD:
+    case ARM::VLD4d32Pseudo_UPD:
+    case ARM::VLD4q8Pseudo_UPD:
+    case ARM::VLD4q16Pseudo_UPD:
+    case ARM::VLD4q32Pseudo_UPD:
+    case ARM::VLD4q8oddPseudo:
+    case ARM::VLD4q16oddPseudo:
+    case ARM::VLD4q32oddPseudo:
+    case ARM::VLD4q8oddPseudo_UPD:
+    case ARM::VLD4q16oddPseudo_UPD:
+    case ARM::VLD4q32oddPseudo_UPD:
+    case ARM::VLD1DUPq8:
+    case ARM::VLD1DUPq16:
+    case ARM::VLD1DUPq32:
+    case ARM::VLD1DUPq8wb_fixed:
+    case ARM::VLD1DUPq16wb_fixed:
+    case ARM::VLD1DUPq32wb_fixed:
+    case ARM::VLD1DUPq8wb_register:
+    case ARM::VLD1DUPq16wb_register:
+    case ARM::VLD1DUPq32wb_register:
+    case ARM::VLD2DUPd8:
+    case ARM::VLD2DUPd16:
+    case ARM::VLD2DUPd32:
+    case ARM::VLD2DUPd8wb_fixed:
+    case ARM::VLD2DUPd16wb_fixed:
+    case ARM::VLD2DUPd32wb_fixed:
+    case ARM::VLD2DUPd8wb_register:
+    case ARM::VLD2DUPd16wb_register:
+    case ARM::VLD2DUPd32wb_register:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
+    case ARM::VLD1LNq8Pseudo:
+    case ARM::VLD1LNq16Pseudo:
+    case ARM::VLD1LNq32Pseudo:
+    case ARM::VLD1LNq8Pseudo_UPD:
+    case ARM::VLD1LNq16Pseudo_UPD:
+    case ARM::VLD1LNq32Pseudo_UPD:
+    case ARM::VLD2LNd8Pseudo:
+    case ARM::VLD2LNd16Pseudo:
+    case ARM::VLD2LNd32Pseudo:
+    case ARM::VLD2LNq16Pseudo:
+    case ARM::VLD2LNq32Pseudo:
+    case ARM::VLD2LNd8Pseudo_UPD:
+    case ARM::VLD2LNd16Pseudo_UPD:
+    case ARM::VLD2LNd32Pseudo_UPD:
+    case ARM::VLD2LNq16Pseudo_UPD:
+    case ARM::VLD2LNq32Pseudo_UPD:
+    case ARM::VLD4LNd8Pseudo:
+    case ARM::VLD4LNd16Pseudo:
+    case ARM::VLD4LNd32Pseudo:
+    case ARM::VLD4LNq16Pseudo:
+    case ARM::VLD4LNq32Pseudo:
+    case ARM::VLD4LNd8Pseudo_UPD:
+    case ARM::VLD4LNd16Pseudo_UPD:
+    case ARM::VLD4LNd32Pseudo_UPD:
+    case ARM::VLD4LNq16Pseudo_UPD:
+    case ARM::VLD4LNq32Pseudo_UPD:
+      // If the address is not 64-bit aligned, the latency of these
+      // instructions increases by one.
+      ++Latency;
+      break;
+    }
+
+  return Latency;
+}
+
+/// Extra latency incurred when \p MI is predicated: 1 for calls and for
+/// instructions that implicitly define CPSR, 0 for everything else
+/// (including copy-like instructions and bundles).
+unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr &MI) const {
+  const bool NeverCharged = MI.isCopyLike() || MI.isInsertSubreg() ||
+                            MI.isRegSequence() || MI.isImplicitDef() ||
+                            MI.isBundle();
+  if (NeverCharged)
+    return 0;
+
+  // When predicated, CPSR becomes an additional source operand of a
+  // CPSR-updating instruction, which apparently increases its latency.
+  const MCInstrDesc &Desc = MI.getDesc();
+  const bool PaysForPredicate =
+      Desc.isCall() || Desc.hasImplicitDefOfPhysReg(ARM::CPSR);
+  return PaysForPredicate ? 1 : 0;
+}
+
+/// Latency of a whole MachineInstr.
+///
+/// Copy-like instructions cost 1. A bundle costs the sum of its members,
+/// skipping t2IT. If \p PredCost is non-null it is set to 1 for calls and
+/// implicit CPSR definers (and left untouched otherwise). Without itinerary
+/// data the answer is a fixed guess; with it, the stage latency is corrected
+/// by the def-side opcode adjustment.
+unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                           const MachineInstr &MI,
+                                           unsigned *PredCost) const {
+  const bool IsCopyLike = MI.isCopyLike() || MI.isInsertSubreg() ||
+                          MI.isRegSequence() || MI.isImplicitDef();
+  if (IsCopyLike)
+    return 1;
+
+  // Schedulers normally see unbundled instructions, but other passes may ask
+  // about a bundle header; answer with the sum over the bundled instructions.
+  if (MI.isBundle()) {
+    unsigned Total = 0;
+    const MachineBasicBlock::const_instr_iterator End =
+        MI.getParent()->instr_end();
+    MachineBasicBlock::const_instr_iterator It = MI.getIterator();
+    for (++It; It != End && It->isInsideBundle(); ++It) {
+      if (It->getOpcode() == ARM::t2IT)
+        continue;
+      Total += getInstrLatency(ItinData, *It, PredCost);
+    }
+    return Total;
+  }
+
+  const MCInstrDesc &Desc = MI.getDesc();
+  // When predicated, CPSR is an extra source operand for CPSR-updating
+  // instructions, which apparently lengthens them by a cycle.
+  if (PredCost) {
+    if (Desc.isCall() || Desc.hasImplicitDefOfPhysReg(ARM::CPSR))
+      *PredCost = 1;
+  }
+
+  // Only a missing itinerary takes the fixed-guess path; an empty one still
+  // goes through getStageLatency below.
+  if (!ItinData) {
+    if (MI.mayLoad())
+      return 3;
+    return 1;
+  }
+
+  const unsigned SchedClass = Desc.getSchedClass();
+
+  // Instructions with a variable number of micro-ops: use that count.
+  if (!ItinData->isEmpty() && ItinData->getNumMicroOps(SchedClass) < 0)
+    return getNumMicroOps(ItinData, MI);
+
+  // Common case: the itinerary's stage latency ...
+  const unsigned StageLatency = ItinData->getStageLatency(SchedClass);
+
+  // ... adjusted for dynamic def-side opcode variants it does not capture.
+  unsigned DefAlign = 0;
+  if (MI.hasOneMemOperand())
+    DefAlign = (*MI.memoperands_begin())->getAlignment();
+  const int Adj = adjustDefLatency(Subtarget, MI, Desc, DefAlign);
+
+  // A negative adjustment is applied only if the result stays positive.
+  const bool WouldNotStayPositive = Adj < 0 && (int)StageLatency <= -Adj;
+  if (WouldNotStayPositive)
+    return StageLatency;
+  return StageLatency + Adj;
+}
+
+/// Latency of a SelectionDAG node: 1 for non-machine nodes or when no usable
+/// itinerary exists, 2 for VLDMQIA/VSTMQIA, otherwise the itinerary's stage
+/// latency for the node's scheduling class.
+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      SDNode *Node) const {
+  if (!Node->isMachineOpcode() || !ItinData || ItinData->isEmpty())
+    return 1;
+
+  const unsigned Opc = Node->getMachineOpcode();
+  if (Opc == ARM::VLDMQIA || Opc == ARM::VSTMQIA)
+    return 2;
+
+  return ItinData->getStageLatency(get(Opc).getSchedClass());
+}
+
+/// Returns true if the def->use edge is expensive: either the subtarget has a
+/// non-pipelined VFP unit and one end is a VFP instruction, or the operand
+/// latency is 4 or more and one end is a VFP or NEON instruction.
+bool ARMBaseInstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
+                                             const MachineRegisterInfo *MRI,
+                                             const MachineInstr &DefMI,
+                                             unsigned DefIdx,
+                                             const MachineInstr &UseMI,
+                                             unsigned UseIdx) const {
+  const unsigned DefDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask;
+  const unsigned UseDomain = UseMI.getDesc().TSFlags & ARMII::DomainMask;
+
+  const bool TouchesVFP =
+      DefDomain == ARMII::DomainVFP || UseDomain == ARMII::DomainVFP;
+  if (Subtarget.nonpipelinedVFP() && TouchesVFP)
+    return true;
+
+  // Hoist VFP / NEON instructions with 4 or higher latency.
+  const unsigned Cycles =
+      SchedModel.computeOperandLatency(&DefMI, DefIdx, &UseMI, UseIdx);
+  const bool TouchesNEON =
+      DefDomain == ARMII::DomainNEON || UseDomain == ARMII::DomainNEON;
+  return Cycles > 3 && (TouchesVFP || TouchesNEON);
+}
+
+/// Returns true if \p DefMI is a general-domain instruction whose result
+/// \p DefIdx is available within two cycles according to the itinerary.
+/// Always false when no usable itinerary exists.
+bool ARMBaseInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
+                                        const MachineInstr &DefMI,
+                                        unsigned DefIdx) const {
+  const InstrItineraryData *Itins = SchedModel.getInstrItineraries();
+  if (!Itins || Itins->isEmpty())
+    return false;
+
+  const MCInstrDesc &Desc = DefMI.getDesc();
+  if ((Desc.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral)
+    return false;
+
+  // -1 means the itinerary has no cycle recorded for this operand.
+  const int Cycle = Itins->getOperandCycle(Desc.getSchedClass(), DefIdx);
+  return Cycle != -1 && Cycle <= 2;
+}
+
+/// Machine verifier hook: rejects any opcode that convertAddSubFlagsOpcode
+/// recognises, setting \p ErrInfo to the diagnostic; accepts everything else.
+bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
+                                         StringRef &ErrInfo) const {
+  const bool IsPseudoFlagSetter = convertAddSubFlagsOpcode(MI.getOpcode());
+  if (!IsPseudoFlagSetter)
+    return true;
+
+  ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG";
+  return false;
+}
+
+// Expands a stack-guard load located at MI into: materialise the guard's
+// address with LoadImmOpc, optionally load through the GOT when the symbol is
+// indirect, then load the guard value itself with LoadOpc. All instructions
+// are inserted before MI and write MI's destination register.
+//
+// LoadStackGuard has so far only been implemented for MachO; other targets
+// need a different code sequence.
+void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
+                                                unsigned LoadImmOpc,
+                                                unsigned LoadOpc) const {
+  assert(!Subtarget.isROPI() && !Subtarget.isRWPI() &&
+         "ROPI/RWPI not currently supported with stack guard");
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  const unsigned Reg = MI->getOperand(0).getReg();
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+
+  // Step 1: address of the guard variable (or of its non-lazy pointer).
+  BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg)
+      .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
+
+  // Step 2: for an indirect symbol, fetch the real address from the GOT.
+  if (Subtarget.isGVIndirectSymbol(GV)) {
+    MachineInstrBuilder GOTLoad = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
+    GOTLoad.addReg(Reg, RegState::Kill).addImm(0);
+
+    MachineFunction &MF = *MBB.getParent();
+    auto Flags = MachineMemOperand::MOLoad |
+                 MachineMemOperand::MODereferenceable |
+                 MachineMemOperand::MOInvariant;
+    MachineMemOperand *MMO =
+        MF.getMachineMemOperand(MachinePointerInfo::getGOT(MF), Flags, 4, 4);
+    GOTLoad.addMemOperand(MMO);
+    AddDefaultPred(GOTLoad);
+  }
+
+  // Step 3: load the guard value, reusing the original memory operands.
+  MachineInstrBuilder GuardLoad = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
+  GuardLoad.addReg(Reg, RegState::Kill).addImm(0);
+  GuardLoad.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  AddDefaultPred(GuardLoad);
+}
+
+/// Looks \p Opcode up in the MLx table. On a hit, fills in the multiply
+/// opcode, the add/sub opcode and the NegAcc / HasLane flags of the entry and
+/// returns true; on a miss returns false and leaves the outputs untouched.
+bool
+ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                                     unsigned &AddSubOpc,
+                                     bool &NegAcc, bool &HasLane) const {
+  const auto Found = MLxEntryMap.find(Opcode);
+  if (Found == MLxEntryMap.end())
+    return false;
+
+  const ARM_MLxEntry &Row = ARM_MLxTable[Found->second];
+  MulOpc = Row.MulOpc;
+  AddSubOpc = Row.AddSubOpc;
+  NegAcc = Row.NegAcc;
+  HasLane = Row.HasLane;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Execution domains.
+//===----------------------------------------------------------------------===//
+//
+// Some instructions go down the NEON pipeline, some go down the VFP pipeline,
+// and some can go down both. The vmov instructions go down the VFP pipeline,
+// but they can be changed to vorr equivalents that are executed by the NEON
+// pipeline.
+//
+// We use the following execution domain numbering:
+//
+enum ARMExeDomain { // Values double as bit positions in the "available domains" mask built as (1 << ExeVFP) | (1 << ExeNEON).
+ ExeGeneric = 0, // Neither VFP nor NEON.
+ ExeVFP = 1, // Executes on the VFP pipeline.
+ ExeNEON = 2 // Executes on the NEON pipeline.
+};
+//
+// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
+//
+/// Returns (current domain, mask of domains MI may be moved to).
+///
+/// Only unpredicated VMOVD -- and, on subtargets that prefer NEON for FP
+/// moves, unpredicated VMOVRS / VMOVSR / VMOVS -- can be swizzled between VFP
+/// and NEON, and only when NEON is available. Every other instruction gets an
+/// empty mask and the domain recorded in its TSFlags.
+std::pair<uint16_t, uint16_t>
+ARMBaseInstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+  // Without NEON there is nothing to swizzle to.
+  if (Subtarget.hasNEON()) {
+    bool CandidateOpcode = false;
+    switch (MI.getOpcode()) {
+    case ARM::VMOVD:
+      CandidateOpcode = true;
+      break;
+    case ARM::VMOVRS:
+    case ARM::VMOVSR:
+    case ARM::VMOVS:
+      // CortexA9 is particularly picky about mixing the two domains and
+      // wants these converted.
+      CandidateOpcode = Subtarget.useNEONForFPMovs();
+      break;
+    default:
+      break;
+    }
+    if (CandidateOpcode && !isPredicated(MI))
+      return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON));
+  }
+
+  // Nothing else can change domain; just report where it lives.
+  const unsigned DomainFlags = MI.getDesc().TSFlags & ARMII::DomainMask;
+
+  // Instructions that can go either way on Cortex-A8 are treated as NEON.
+  const bool IsNEON =
+      (DomainFlags & ARMII::DomainNEON) ||
+      ((DomainFlags & ARMII::DomainNEONA8) && Subtarget.isCortexA8());
+  if (IsNEON)
+    return std::make_pair(ExeNEON, 0);
+
+  return std::make_pair((DomainFlags & ARMII::DomainVFP) ? ExeVFP : ExeGeneric,
+                        0);
+}
+
+/// Maps S-register \p SReg to the D-register containing it. \p Lane receives
+/// 0 if SReg is the ssub_0 half and 1 if it is the ssub_1 half.
+static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
+                                            unsigned SReg, unsigned &Lane) {
+  const unsigned AsLane0 =
+      TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass);
+  if (AsLane0 != ARM::NoRegister) {
+    Lane = 0;
+    return AsLane0;
+  }
+
+  Lane = 1;
+  const unsigned AsLane1 =
+      TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass);
+  assert(AsLane1 && "S-register with no D super-register?");
+  return AsLane1;
+}
+
+/// getImplicitSPRUseForDPRUse - Used when an instruction is rewritten from
+/// operating on an SPR to operating on DPR[Lane]. The new DPR use also covers
+/// the other lane (DPR[Lane^1]), which may conflict with an earlier def of
+/// that SPR.
+///
+/// On success (return value true) \p ImplicitSReg is the SPR that must be
+/// added as an implicit use, or zero if none is needed: the other SPR gets an
+/// implicit use only if it is live; if it is dead, or if \p MI already defines
+/// or reads the whole DPR, nothing needs to be added.
+///
+/// Returns false if liveness of the other SPR cannot be determined; in that
+/// case \p ImplicitSReg is left holding that SPR.
+static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
+                                       MachineInstr &MI, unsigned DReg,
+                                       unsigned Lane, unsigned &ImplicitSReg) {
+  // A def or use of the full DPR already chains the other lane correctly.
+  const bool DRegAlreadyTracked =
+      MI.definesRegister(DReg, TRI) || MI.readsRegister(DReg, TRI);
+  if (DRegAlreadyTracked) {
+    ImplicitSReg = 0;
+    return true;
+  }
+
+  // Otherwise find out whether the SPR in the *other* lane is live at MI.
+  const unsigned OtherLaneIdx = (Lane & 1) ? ARM::ssub_0 : ARM::ssub_1;
+  ImplicitSReg = TRI->getSubReg(DReg, OtherLaneIdx);
+
+  switch (MI.getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI)) {
+  case MachineBasicBlock::LQR_Live:
+    return true;
+  case MachineBasicBlock::LQR_Unknown:
+    return false;
+  default:
+    break;
+  }
+
+  // Known not to be live: no implicit use required.
+  ImplicitSReg = 0;
+  return true;
+}
+
+void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
+ unsigned Domain) const { // Rewrites MI in place into a NEON equivalent when Domain is ExeNEON; for any other Domain a handled opcode is left unchanged.
+ unsigned DstReg, SrcReg, DReg;
+ unsigned Lane;
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); // Builder over MI itself: operands added through MIB are appended to MI.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("cannot handle opcode!"); // Only VMOVD, VMOVRS, VMOVSR and VMOVS are handled below.
+ break;
+ case ARM::VMOVD:
+ if (Domain != ExeNEON)
+ break;
+
+ // Zap the predicate operands.
+ assert(!isPredicated(MI) && "Cannot predicate a VORRd");
+
+ // Make sure we've got NEON instructions.
+ assert(Subtarget.hasNEON() && "VORRd requires NEON");
+
+ // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i) // Remove the operands declared by the old descriptor, last to first.
+ MI.RemoveOperand(i - 1);
+
+ // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
+ MI.setDesc(get(ARM::VORRd));
+ AddDefaultPred(
+ MIB.addReg(DstReg, RegState::Define).addReg(SrcReg).addReg(SrcReg));
+ break;
+ case ARM::VMOVRS:
+ if (Domain != ExeNEON)
+ break;
+ assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
+
+ // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
+
+ DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
+
+ // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
+ // Note that DSrc has been widened and the other lane may be undef, which
+ // contaminates the entire register.
+ MI.setDesc(get(ARM::VGETLNi32));
+ AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+ .addReg(DReg, RegState::Undef)
+ .addImm(Lane));
+
+ // The old source should be an implicit use, otherwise we might think it
+ // was dead before here.
+ MIB.addReg(SrcReg, RegState::Implicit);
+ break;
+ case ARM::VMOVSR: {
+ if (Domain != ExeNEON)
+ break;
+ assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
+
+ // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+
+ DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
+
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
+ break; // Liveness of the other lane is unknown: leave MI as it is.
+
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
+
+ // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
+ // Again DDst may be undefined at the beginning of this instruction.
+ MI.setDesc(get(ARM::VSETLNi32));
+ MIB.addReg(DReg, RegState::Define)
+ .addReg(DReg, getUndefRegState(!MI.readsRegister(DReg, TRI)))
+ .addReg(SrcReg)
+ .addImm(Lane);
+ AddDefaultPred(MIB);
+
+ // The narrower destination must be marked as set to keep previous chains
+ // in place.
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
+ break;
+ }
+ case ARM::VMOVS: {
+ if (Domain != ExeNEON)
+ break;
+
+ // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+
+ unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
+ DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
+ DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
+
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg))
+ break; // Liveness of the other source lane is unknown: leave MI as it is.
+
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
+
+ if (DSrc == DDst) {
+ // Destination can be:
+ // %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
+ MI.setDesc(get(ARM::VDUPLN32d));
+ MIB.addReg(DDst, RegState::Define)
+ .addReg(DDst, getUndefRegState(!MI.readsRegister(DDst, TRI)))
+ .addImm(SrcLane);
+ AddDefaultPred(MIB);
+
+ // Neither the source or the destination are naturally represented any
+ // more, so add them in manually.
+ MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
+ MIB.addReg(SrcReg, RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
+ break;
+ }
+
+ // In general there's no single instruction that can perform an S <-> S
+ // move in NEON space, but a pair of VEXT instructions *can* do the
+ // job. It turns out that the VEXTs needed will only use DSrc once, with
+ // the position based purely on the combination of lane-0 and lane-1
+ // involved. For example
+ // vmov s0, s2 -> vext.32 d0, d0, d1, #1 vext.32 d0, d0, d0, #1
+ // vmov s1, s3 -> vext.32 d0, d1, d0, #1 vext.32 d0, d0, d0, #1
+ // vmov s0, s3 -> vext.32 d0, d0, d0, #1 vext.32 d0, d1, d0, #1
+ // vmov s1, s2 -> vext.32 d0, d0, d0, #1 vext.32 d0, d0, d1, #1
+ //
+ // Pattern of the MachineInstrs is:
+ // %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits)
+ MachineInstrBuilder NewMIB;
+ NewMIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::VEXTd32),
+ DDst); // First VEXT is a new instruction inserted before MI; MI itself becomes the second VEXT below.
+
+ // On the first instruction, both DSrc and DDst may be <undef> if present.
+ // Specifically when the original instruction didn't have them as an
+ // <imp-use>.
+ unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst;
+ bool CurUndef = !MI.readsRegister(CurReg, TRI);
+ NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
+ CurUndef = !MI.readsRegister(CurReg, TRI);
+ NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ NewMIB.addImm(1);
+ AddDefaultPred(NewMIB);
+
+ if (SrcLane == DstLane)
+ NewMIB.addReg(SrcReg, RegState::Implicit);
+
+ MI.setDesc(get(ARM::VEXTd32));
+ MIB.addReg(DDst, RegState::Define);
+
+ // On the second instruction, DDst has definitely been defined above, so
+ // it is not <undef>. DSrc, if present, can be <undef> as above.
+ CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst;
+ CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
+ MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
+ CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
+ MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ MIB.addImm(1);
+ AddDefaultPred(MIB);
+
+ if (SrcLane != DstLane)
+ MIB.addReg(SrcReg, RegState::Implicit);
+
+ // As before, the original destination is no longer represented, add it
+ // implicitly.
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
+ break;
+ }
+ }
+
+}
+
+//===----------------------------------------------------------------------===//
+// Partial register updates
+//===----------------------------------------------------------------------===//
+//
+// Swift renames NEON registers with 64-bit granularity. That means any
+// instruction writing an S-reg implicitly reads the containing D-reg. The
+// problem is mostly avoided by translating f32 operations to v2f32 operations
+// on D-registers, but f32 loads are still a problem.
+//
+// These instructions can load an f32 into a NEON register:
+//
+// VLDRS - Only writes S, partial D update.
+// VLD1LNd32 - Writes all D-regs, explicit partial D update, 2 uops.
+// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
+//
+// FCONSTD can be used as a dependency-breaking instruction.
+/// Returns the subtarget's partial-update clearance if operand \p OpNum of
+/// \p MI is a write that carries an unwanted dependency on the previous value
+/// of the containing D-register, and 0 otherwise (including when the
+/// subtarget reports a clearance of 0).
+unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance(
+    const MachineInstr &MI, unsigned OpNum,
+    const TargetRegisterInfo *TRI) const {
+  auto Clearance = Subtarget.getPartialUpdateClearance();
+  if (!Clearance)
+    return 0;
+
+  assert(TRI && "Need TRI instance");
+
+  const MachineOperand &MO = MI.getOperand(OpNum);
+  if (MO.readsReg())
+    return 0;
+  const unsigned Reg = MO.getReg();
+
+  // Locate the operand (if any) through which MI reads Reg.
+  int ReadIdx = -1;
+  switch (MI.getOpcode()) {
+  default:
+    // Not one of the opcodes known to cause partial updates.
+    return 0;
+
+  case ARM::VLD1LNd32:
+    // Explicitly reads the dependency.
+    ReadIdx = 3;
+    break;
+
+  // Normal instructions writing only an S-register.
+  case ARM::VLDRS:
+  case ARM::FCONSTS:
+  case ARM::VMOVSR:
+  case ARM::VMOVv8i8:
+  case ARM::VMOVv4i16:
+  case ARM::VMOVv2i32:
+  case ARM::VMOVv2f32:
+  case ARM::VMOVv1i64:
+    ReadIdx = MI.findRegisterUseOperandIdx(Reg, false, TRI);
+    break;
+  }
+
+  // A genuine read of Reg means the dependency is wanted.
+  if (ReadIdx != -1 && MI.getOperand(ReadIdx).readsReg())
+    return 0;
+
+  // Breaking the dependency requires being able to clobber the whole D-reg.
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // Virtual register must be a foo:ssub_0<def,undef> operand.
+    const bool IsUnreadSubRegDef =
+        MO.getSubReg() && !MI.readsVirtualRegister(Reg);
+    if (!IsUnreadSubRegDef)
+      return 0;
+    return Clearance;
+  }
+
+  if (ARM::SPRRegClass.contains(Reg)) {
+    // Physical register: MI must define the full D-reg.
+    const unsigned DReg =
+        TRI->getMatchingSuperReg(Reg, ARM::ssub_0, &ARM::DPRRegClass);
+    if (!DReg || !MI.definesRegister(DReg, TRI))
+      return 0;
+  }
+
+  // MI has an unwanted D-register dependency; ask for defs to be kept out of
+  // the previous N instructions.
+  return Clearance;
+}
+
+// Called after getPartialRegUpdateClearance returned non-zero: inserts an
+// FCONSTD writing the full D-register in front of MI so that MI's partial
+// write no longer depends on the register's previous value.
+void ARMBaseInstrInfo::breakPartialRegDependency(
+    MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+  assert(OpNum < MI.getDesc().getNumDefs() && "OpNum is not a def");
+  assert(TRI && "Need TRI instance");
+
+  const unsigned Reg = MI.getOperand(OpNum).getReg();
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         "Can't break virtual register dependencies.");
+
+  // An S-register def is widened to the D-register that contains it; the
+  // mapping relies on the S and D register enums being laid out in order.
+  unsigned DReg = Reg;
+  if (ARM::SPRRegClass.contains(Reg)) {
+    const unsigned SIndex = Reg - ARM::S0;
+    DReg = ARM::D0 + SIndex / 2;
+    assert(TRI->isSuperRegister(Reg, DReg) && "Register enums broken");
+  }
+
+  assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps");
+  assert(MI.definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
+
+  // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
+  // the full D-register by loading the same value to both lanes. The
+  // instruction is micro-coded with 2 uops, so don't do this until we can
+  // properly schedule micro-coded instructions. The dispatcher stalls cause
+  // too big regressions.
+
+  // Emit the dependency-breaking FCONSTD ahead of MI. The immediate 96
+  // encodes 0.5, but any value would do.
+  MachineInstrBuilder Breaker =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::FCONSTD), DReg);
+  AddDefaultPred(Breaker.addImm(96));
+
+  MI.addRegisterKilled(DReg, TRI, true);
+}
+
+/// Report whether the subtarget's feature bits include ARM::HasV6KOps, which
+/// this class uses as the test for having a NOP instruction.
+bool ARMBaseInstrInfo::hasNOP() const {
+  const auto &Features = Subtarget.getFeatureBits();
+  return Features[ARM::HasV6KOps];
+}
+
+/// Decide whether the shift-by-immediate encoded in operand 3 of \p MI is one
+/// of the shifts Swift executes faster: lsl 1, lsl 2 or lsr 1.
+///
+/// Instructions with fewer than four operands are reported as fast.
+bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const {
+  if (MI->getNumOperands() < 4)
+    return true;
+
+  const unsigned ShiftOperand = MI->getOperand(3).getImm();
+  const unsigned Amount = ARM_AM::getSORegOffset(ShiftOperand);
+
+  // Only shift amounts of 1 and 2 can ever qualify.
+  if (Amount != 1 && Amount != 2)
+    return false;
+
+  switch (ARM_AM::getSORegShOp(ShiftOperand)) {
+  case ARM_AM::lsr:
+    return Amount == 1;
+  case ARM_AM::lsl:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Describe the inputs of a REG_SEQUENCE-like \p MI as <Reg:SubReg, SubIdx>
+/// entries appended to \p InputRegs.
+///
+/// Only VMOVDRR is handled:
+///   dX = VMOVDRR rY, rZ
+/// is treated as
+///   dX = REG_SEQUENCE rY, ssub_0, rZ, ssub_1
+///
+/// \returns true for VMOVDRR; any other opcode is unreachable.
+bool ARMBaseInstrInfo::getRegSequenceLikeInputs(
+    const MachineInstr &MI, unsigned DefIdx,
+    SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
+  assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index");
+  assert(MI.isRegSequenceLike() && "Invalid kind of instruction");
+
+  if (MI.getOpcode() != ARM::VMOVDRR)
+    llvm_unreachable("Target dependent opcode missing");
+
+  // rY goes into ssub_0.
+  const MachineOperand &FirstSrc = MI.getOperand(1);
+  InputRegs.push_back(RegSubRegPairAndIdx(FirstSrc.getReg(),
+                                          FirstSrc.getSubReg(), ARM::ssub_0));
+
+  // rZ goes into ssub_1.
+  const MachineOperand &SecondSrc = MI.getOperand(2);
+  InputRegs.push_back(RegSubRegPairAndIdx(SecondSrc.getReg(),
+                                          SecondSrc.getSubReg(), ARM::ssub_1));
+  return true;
+}
+
+/// Describe the input of an EXTRACT_SUBREG-like \p MI for definition
+/// \p DefIdx, storing it in \p InputReg.
+///
+/// Only VMOVRRD is handled:
+///   rX, rY = VMOVRRD dZ
+/// is treated as
+///   rX = EXTRACT_SUBREG dZ, ssub_0
+///   rY = EXTRACT_SUBREG dZ, ssub_1
+///
+/// \returns true for VMOVRRD; any other opcode is unreachable.
+bool ARMBaseInstrInfo::getExtractSubregLikeInputs(
+    const MachineInstr &MI, unsigned DefIdx,
+    RegSubRegPairAndIdx &InputReg) const {
+  assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index");
+  assert(MI.isExtractSubregLike() && "Invalid kind of instruction");
+
+  if (MI.getOpcode() != ARM::VMOVRRD)
+    llvm_unreachable("Target dependent opcode missing");
+
+  // Operand 2 is the source dZ; definition 0 reads ssub_0, any other
+  // definition reads ssub_1.
+  const MachineOperand &Source = MI.getOperand(2);
+  InputReg.Reg = Source.getReg();
+  InputReg.SubReg = Source.getSubReg();
+  if (DefIdx == 0)
+    InputReg.SubIdx = ARM::ssub_0;
+  else
+    InputReg.SubIdx = ARM::ssub_1;
+  return true;
+}
+
+/// Describe the inputs of an INSERT_SUBREG-like \p MI, storing the base
+/// register in \p BaseReg and the inserted register in \p InsertedReg.
+///
+/// Only VSETLNi32 is handled:
+///   dX = VSETLNi32 dY, rZ, imm
+/// where dY is the base, rZ the inserted value, and imm selects ssub_0
+/// (imm == 0) or ssub_1 (any other value).
+///
+/// \returns true for VSETLNi32; any other opcode is unreachable.
+bool ARMBaseInstrInfo::getInsertSubregLikeInputs(
+    const MachineInstr &MI, unsigned DefIdx, RegSubRegPair &BaseReg,
+    RegSubRegPairAndIdx &InsertedReg) const {
+  assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index");
+  assert(MI.isInsertSubregLike() && "Invalid kind of instruction");
+
+  if (MI.getOpcode() != ARM::VSETLNi32)
+    llvm_unreachable("Target dependent opcode missing");
+
+  // dY
+  const MachineOperand &Base = MI.getOperand(1);
+  BaseReg.Reg = Base.getReg();
+  BaseReg.SubReg = Base.getSubReg();
+
+  // rZ, placed in the lane chosen by the immediate in operand 3.
+  const MachineOperand &Inserted = MI.getOperand(2);
+  const MachineOperand &Lane = MI.getOperand(3);
+  InsertedReg.Reg = Inserted.getReg();
+  InsertedReg.SubReg = Inserted.getSubReg();
+  if (Lane.getImm() == 0)
+    InsertedReg.SubIdx = ARM::ssub_0;
+  else
+    InsertedReg.SubIdx = ARM::ssub_1;
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
new file mode 100644
index 000000000000..b01d5c8ec85f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -0,0 +1,522 @@
+//===-- ARMBaseInstrInfo.h - ARM Base Instruction Information ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "ARMGenInstrInfo.inc"
+
+namespace llvm {
+ class ARMSubtarget;
+ class ARMBaseRegisterInfo;
+
+/// Base ARM implementation of the TargetInstrInfo interface, built on top of
+/// the generated ARMGenInstrInfo class (see ARMGenInstrInfo.inc). The class is
+/// meant to be subclassed only: its constructor is protected and several
+/// members are pure virtual.
+class ARMBaseInstrInfo : public ARMGenInstrInfo {
+ // Subtarget reference handed out by getSubtarget().
+ const ARMSubtarget &Subtarget;
+
+protected:
+ // Can be only subclassed.
+ explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
+
+ void expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
+ unsigned LoadImmOpc, unsigned LoadOpc) const;
+
+ /// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
+ /// and \p DefIdx.
+ /// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of
+ /// the list is modeled as <Reg:SubReg, SubIdx>.
+ /// E.g., REG_SEQUENCE vreg1:sub1, sub0, vreg2, sub1 would produce
+ /// two elements:
+ /// - vreg1:sub1, sub0
+ /// - vreg2<:0>, sub1
+ ///
+ /// \returns true if it is possible to build such an input sequence
+ /// with the pair \p MI, \p DefIdx. False otherwise.
+ ///
+ /// \pre MI.isRegSequenceLike().
+ bool getRegSequenceLikeInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const override;
+
+ /// Build the equivalent inputs of a EXTRACT_SUBREG for the given \p MI
+ /// and \p DefIdx.
+ /// \p [out] InputReg of the equivalent EXTRACT_SUBREG.
+ /// E.g., EXTRACT_SUBREG vreg1:sub1, sub0, sub1 would produce:
+ /// - vreg1:sub1, sub0
+ ///
+ /// \returns true if it is possible to build such an input sequence
+ /// with the pair \p MI, \p DefIdx. False otherwise.
+ ///
+ /// \pre MI.isExtractSubregLike().
+ bool getExtractSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
+ RegSubRegPairAndIdx &InputReg) const override;
+
+ /// Build the equivalent inputs of a INSERT_SUBREG for the given \p MI
+ /// and \p DefIdx.
+ /// \p [out] BaseReg and \p [out] InsertedReg contain
+ /// the equivalent inputs of INSERT_SUBREG.
+ /// E.g., INSERT_SUBREG vreg0:sub0, vreg1:sub1, sub3 would produce:
+ /// - BaseReg: vreg0:sub0
+ /// - InsertedReg: vreg1:sub1, sub3
+ ///
+ /// \returns true if it is possible to build such an input sequence
+ /// with the pair \p MI, \p DefIdx. False otherwise.
+ ///
+ /// \pre MI.isInsertSubregLike().
+ bool
+ getInsertSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
+ RegSubRegPair &BaseReg,
+ RegSubRegPairAndIdx &InsertedReg) const override;
+
+ /// Commutes the operands in the given instruction.
+ /// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+ ///
+ /// Do not call this method for a non-commutable instruction or for
+ /// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+ /// Even though the instruction is commutable, the method may still
+ /// fail to commute the operands, null pointer is returned in such cases.
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
+
+public:
+ // Return whether the target has an explicit NOP encoding.
+ bool hasNOP() const;
+
+ // Fill in the no-op used for ELF targets. Unless a subclass overrides this,
+ // it is whatever getNoopForMachoTarget produces.
+ virtual void getNoopForElfTarget(MCInst &NopInst) const {
+ getNoopForMachoTarget(NopInst);
+ }
+
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is not such an opcode.
+ virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
+
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineInstr &MI,
+ LiveVariables *LV) const override;
+
+ virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
+ const ARMSubtarget &getSubtarget() const { return Subtarget; }
+
+ ScheduleHazardRecognizer *
+ CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+ const ScheduleDAG *DAG) const override;
+
+ ScheduleHazardRecognizer *
+ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const override;
+
+ // Branch analysis.
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ // Predication support.
+ bool isPredicated(const MachineInstr &MI) const override;
+
+ /// Return the condition code stored in the first predicate operand of
+ /// \p MI, or ARMCC::AL when \p MI has no predicate operand.
+ ARMCC::CondCodes getPredicate(const MachineInstr &MI) const {
+ int PIdx = MI.findFirstPredOperandIdx();
+ return PIdx != -1 ? (ARMCC::CondCodes)MI.getOperand(PIdx).getImm()
+ : ARMCC::AL;
+ }
+
+ bool PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const override;
+
+ bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+ ArrayRef<MachineOperand> Pred2) const override;
+
+ bool DefinesPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred) const override;
+
+ bool isPredicable(MachineInstr &MI) const override;
+
+ /// getInstSizeInBytes - Returns the size of the specified MachineInstr.
+ ///
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ void copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool KillSrc,
+ const ARMSubtarget &Subtarget) const;
+ void copyFromCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, bool KillSrc,
+ const ARMSubtarget &Subtarget) const;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const override;
+
+ MachineInstr *duplicate(MachineInstr &Orig,
+ MachineFunction &MF) const override;
+
+ const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
+ unsigned SubIdx, unsigned State,
+ const TargetRegisterInfo *TRI) const;
+
+ bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1,
+ const MachineRegisterInfo *MRI) const override;
+
+ /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+ /// determine if two loads are loading from the same base address. It should
+ /// only return true if the base pointers are the same and the only
+ /// differences between the two addresses is the offset. It also returns the
+ /// offsets by reference.
+ bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+ int64_t &Offset2) const override;
+
+ /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+ /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+ /// should be scheduled together. On some targets if two loads are loading from
+ /// addresses in the same cache line, it's better if they are scheduled
+ /// together. This function takes two integers that represent the load offsets
+ /// from the common base address. It returns true if it decides it's desirable
+ /// to schedule the two loads together. "NumLoads" is the number of loads that
+ /// have already been scheduled after Load1.
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const override;
+
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB,
+ unsigned NumCycles, unsigned ExtraPredCycles,
+ BranchProbability Probability) const override;
+
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
+ unsigned ExtraT, MachineBasicBlock &FMBB,
+ unsigned NumF, unsigned ExtraF,
+ BranchProbability Probability) const override;
+
+ // Duplicating a block for if-conversion is reported as profitable only when
+ // NumCycles is exactly 1; MBB and Probability are not consulted.
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ BranchProbability Probability) const override {
+ return NumCycles == 1;
+ }
+
+ bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+ MachineBasicBlock &FMBB) const override;
+
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2 if having two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
+
+ /// optimizeCompareInstr - Convert the instruction to set the zero flag so
+ /// that we can remove a "comparison with zero"; Remove a redundant CMP
+ /// instruction if the flags can be updated in the same way by an earlier
+ /// instruction such as SUB.
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+
+ bool analyzeSelect(const MachineInstr &MI,
+ SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp,
+ unsigned &FalseOp, bool &Optimizable) const override;
+
+ MachineInstr *optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool) const override;
+
+ /// FoldImmediate - 'Reg' is known to be defined by a move immediate
+ /// instruction, try to fold the immediate into the use instruction.
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ MachineRegisterInfo *MRI) const override;
+
+ unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr &MI) const override;
+
+ int getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const override;
+ int getOperandLatency(const InstrItineraryData *ItinData,
+ SDNode *DefNode, unsigned DefIdx,
+ SDNode *UseNode, unsigned UseIdx) const override;
+
+ /// VFP/NEON execution domains.
+ std::pair<uint16_t, uint16_t>
+ getExecutionDomain(const MachineInstr &MI) const override;
+ void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+
+ unsigned
+ getPartialRegUpdateClearance(const MachineInstr &, unsigned,
+ const TargetRegisterInfo *) const override;
+ void breakPartialRegDependency(MachineInstr &, unsigned,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// Get the number of addresses by LDM or VLDM or zero for unknown.
+ unsigned getNumLDMAddresses(const MachineInstr &MI) const;
+
+private:
+ unsigned getInstBundleLength(const MachineInstr &MI) const;
+
+ int getVLDMDefCycle(const InstrItineraryData *ItinData,
+ const MCInstrDesc &DefMCID,
+ unsigned DefClass,
+ unsigned DefIdx, unsigned DefAlign) const;
+ int getLDMDefCycle(const InstrItineraryData *ItinData,
+ const MCInstrDesc &DefMCID,
+ unsigned DefClass,
+ unsigned DefIdx, unsigned DefAlign) const;
+ int getVSTMUseCycle(const InstrItineraryData *ItinData,
+ const MCInstrDesc &UseMCID,
+ unsigned UseClass,
+ unsigned UseIdx, unsigned UseAlign) const;
+ int getSTMUseCycle(const InstrItineraryData *ItinData,
+ const MCInstrDesc &UseMCID,
+ unsigned UseClass,
+ unsigned UseIdx, unsigned UseAlign) const;
+ int getOperandLatency(const InstrItineraryData *ItinData,
+ const MCInstrDesc &DefMCID,
+ unsigned DefIdx, unsigned DefAlign,
+ const MCInstrDesc &UseMCID,
+ unsigned UseIdx, unsigned UseAlign) const;
+
+ int getOperandLatencyImpl(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MCInstrDesc &DefMCID, unsigned DefAdj,
+ const MachineOperand &DefMO, unsigned Reg,
+ const MachineInstr &UseMI, unsigned UseIdx,
+ const MCInstrDesc &UseMCID, unsigned UseAdj) const;
+
+ unsigned getPredicationCost(const MachineInstr &MI) const override;
+
+ unsigned getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &MI,
+ unsigned *PredCost = nullptr) const override;
+
+ int getInstrLatency(const InstrItineraryData *ItinData,
+ SDNode *Node) const override;
+
+ bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const override;
+ bool hasLowDefLatency(const TargetSchedModel &SchedModel,
+ const MachineInstr &DefMI,
+ unsigned DefIdx) const override;
+
+ /// verifyInstruction - Perform target specific instruction verification.
+ bool verifyInstruction(const MachineInstr &MI,
+ StringRef &ErrInfo) const override;
+
+ virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI) const = 0;
+
+ void expandMEMCPY(MachineBasicBlock::iterator) const;
+
+private:
+ /// Modeling special VFP / NEON fp MLA / MLS hazards.
+
+ /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal
+ /// MLx table.
+ DenseMap<unsigned, unsigned> MLxEntryMap;
+
+ /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause
+ /// stalls when scheduled together with fp MLA / MLS opcodes.
+ SmallSet<unsigned, 16> MLxHazardOpcodes;
+
+public:
+ /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS
+ /// instruction.
+ bool isFpMLxInstruction(unsigned Opcode) const {
+ return MLxEntryMap.count(Opcode);
+ }
+
+ /// isFpMLxInstruction - This version also returns the multiply opcode and the
+ /// addition / subtraction opcode to expand to. Return true for 'HasLane' for
+ /// the MLX instructions with an extra lane operand.
+ bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+ unsigned &AddSubOpc, bool &NegAcc,
+ bool &HasLane) const;
+
+ /// canCauseFpMLxStall - Return true if an instruction of the specified opcode
+ /// will cause stalls when scheduled after (within 4-cycle window) a fp
+ /// MLA / MLS instruction.
+ bool canCauseFpMLxStall(unsigned Opcode) const {
+ return MLxHazardOpcodes.count(Opcode);
+ }
+
+ /// Returns true if the instruction has a shift by immediate that can be
+ /// executed in one cycle less.
+ bool isSwiftFastImmShift(const MachineInstr *MI) const;
+};
+
+/// Append the default predicate operands to \p MIB: the ARMCC::AL condition
+/// code as an immediate, followed by a register operand with register 0.
+/// \returns \p MIB, so calls can be chained.
+static inline const MachineInstrBuilder &
+AddDefaultPred(const MachineInstrBuilder &MIB) {
+  const int64_t DefaultCond = (int64_t)ARMCC::AL;
+  MIB.addImm(DefaultCond);
+  return MIB.addReg(0);
+}
+
+/// Append the default CC operand to \p MIB, which is a register operand with
+/// register 0.
+/// \returns \p MIB, so calls can be chained.
+static inline const MachineInstrBuilder &
+AddDefaultCC(const MachineInstrBuilder &MIB) {
+  const unsigned NoReg = 0;
+  return MIB.addReg(NoReg);
+}
+
+/// Append a CPSR definition to \p MIB. The operand is always flagged as a
+/// def, and additionally as dead when \p isDead is true.
+/// \returns \p MIB, so calls can be chained.
+static inline const MachineInstrBuilder &
+AddDefaultT1CC(const MachineInstrBuilder &MIB, bool isDead = false) {
+  const unsigned Flags = getDefRegState(true) | getDeadRegState(isDead);
+  return MIB.addReg(ARM::CPSR, Flags);
+}
+
+/// Append a register operand with register 0 to \p MIB in place of a
+/// Thumb1 CC operand.
+/// \returns \p MIB, so calls can be chained.
+static inline const MachineInstrBuilder &
+AddNoT1CC(const MachineInstrBuilder &MIB) {
+  const unsigned NoReg = 0;
+  return MIB.addReg(NoReg);
+}
+
+/// Return true if \p Opc is one of the unconditional branch opcodes
+/// B, tB or t2B.
+static inline bool isUncondBranchOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::B:
+  case ARM::tB:
+  case ARM::t2B:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true if \p Opc is one of the conditional branch opcodes
+/// Bcc, tBcc or t2Bcc.
+static inline bool isCondBranchOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::Bcc:
+  case ARM::tBcc:
+  case ARM::t2Bcc:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true if \p Opc is one of the jump-table branch opcodes
+/// BR_JTr, BR_JTm, BR_JTadd, tBR_JTr or t2BR_JT.
+static inline bool isJumpTableBranchOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::BR_JTr:
+  case ARM::BR_JTm:
+  case ARM::BR_JTadd:
+  case ARM::tBR_JTr:
+  case ARM::t2BR_JT:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true if \p Opc is one of the indirect branch opcodes
+/// BX, MOVPCRX or tBRIND.
+static inline bool isIndirectBranchOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::BX:
+  case ARM::MOVPCRX:
+  case ARM::tBRIND:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true if \p Opc is one of the opcodes this backend treats as a pop:
+/// tPOP_RET, LDMIA_RET, t2LDMIA_RET, tPOP, LDMIA_UPD, t2LDMIA_UPD or
+/// VLDMDIA_UPD.
+static inline bool isPopOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::tPOP_RET:
+  case ARM::LDMIA_RET:
+  case ARM::t2LDMIA_RET:
+  case ARM::tPOP:
+  case ARM::LDMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+  case ARM::VLDMDIA_UPD:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true if \p Opc is one of the opcodes this backend treats as a push:
+/// tPUSH, t2STMDB_UPD, STMDB_UPD or VSTMDDB_UPD.
+static inline bool isPushOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::tPUSH:
+  case ARM::t2STMDB_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+
+unsigned getMatchingCondBranchOpcode(unsigned Opc);
+
+/// Determine if MI can be folded into an ARM MOVCC instruction, and return the
+/// opcode of the SSA instruction representing the conditional MI.
+unsigned canFoldARMInstrIntoMOVCC(unsigned Reg,
+ MachineInstr *&MI,
+ const MachineRegisterInfo &MRI);
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether
+/// the instruction is encoded with an 'S' bit is determined by the optional
+/// CPSR def operand.
+unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
+
+/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
+/// instructions to materialize a destreg = basereg + immediate in ARM / Thumb2
+/// code.
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+
+void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo &MRI,
+ unsigned MIFlags = 0);
+
+/// Tries to add registers to the reglist of a given base-updating
+/// push/pop instruction to adjust the stack by an additional
+/// NumBytes. This can save a few bytes per function in code-size, but
+/// obviously generates more memory traffic. As such, it only takes
+/// effect in functions being optimised for size.
+bool tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
+ MachineFunction &MF, MachineInstr *MI,
+ unsigned NumBytes);
+
+/// rewriteARMFrameIndex / rewriteT2FrameIndex -
+/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
+/// offset could not be handled directly in MI, and return the left-over
+/// portion by reference.
+bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII);
+
+bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
new file mode 100644
index 000000000000..d995c631dd1c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -0,0 +1,843 @@
+//===-- ARMBaseRegisterInfo.cpp - ARM Register Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseRegisterInfo.h"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMFrameLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define DEBUG_TYPE "arm-register-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "ARMGenRegisterInfo.inc"
+
+using namespace llvm;
+
+// Construct the base ARM register info. The four arguments are forwarded to
+// the generated ARMGenRegisterInfo constructor (presumably LR is the
+// return-address register and PC the program counter; confirm against
+// ARMGenRegisterInfo.inc), and R6 is installed as the base pointer register.
+ARMBaseRegisterInfo::ARMBaseRegisterInfo()
+ : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), BasePtr(ARM::R6) {}
+
+/// Pick the frame pointer register for \p STI: R7 when the subtarget uses R7
+/// as frame pointer, R11 otherwise.
+static unsigned getFramePointerReg(const ARMSubtarget &STI) {
+  if (STI.useR7AsFramePointer())
+    return ARM::R7;
+  return ARM::R11;
+}
+
+/// Select the callee-saved register list for \p MF.
+///
+/// The choice depends, in this order, on the GHC calling convention, the
+/// "interrupt" function attribute, swifterror usage on Darwin, the
+/// CXX_FAST_TLS calling convention on Darwin, and finally on whether the
+/// target is Darwin or AAPCS (with or without split push/pop).
+const MCPhysReg*
+ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>();
+  const Function *F = MF->getFunction();
+
+  // AAPCS list, depending on whether the frame push/pop is split.
+  const bool SplitPush = STI.splitFramePushPop(*MF);
+  const MCPhysReg *AAPCSList =
+      SplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList;
+
+  // GHC set of callee saved regs is empty as all those regs are
+  // used for passing STG regs around.
+  if (F->getCallingConv() == CallingConv::GHC)
+    return CSR_NoRegs_SaveList;
+
+  if (F->hasFnAttribute("interrupt")) {
+    // M-class CPUs have hardware which saves the registers needed to allow a
+    // function conforming to the AAPCS to function as a handler.
+    if (STI.isMClass())
+      return AAPCSList;
+
+    // Fast interrupt mode gives the handler a private copy of R8-R14, so less
+    // need to be saved to restore user-mode state.
+    if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ")
+      return CSR_FIQ_SaveList;
+
+    // Generally only R13-R14 (i.e. SP, LR) are automatically preserved by
+    // exception handling.
+    return CSR_GenericInt_SaveList;
+  }
+
+  const bool IsDarwin = STI.isTargetDarwin();
+
+  if (IsDarwin && STI.getTargetLowering()->supportSwiftError() &&
+      F->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return CSR_iOS_SwiftError_SaveList;
+
+  if (IsDarwin && F->getCallingConv() == CallingConv::CXX_FAST_TLS) {
+    if (MF->getInfo<ARMFunctionInfo>()->isSplitCSR())
+      return CSR_iOS_CXX_TLS_PE_SaveList;
+    return CSR_iOS_CXX_TLS_SaveList;
+  }
+
+  return IsDarwin ? CSR_iOS_SaveList : AAPCSList;
+}
+
+/// Return the list of callee-saved registers that are saved via copy, or
+/// nullptr when there is none. A list is only returned for CXX_FAST_TLS
+/// functions whose ARMFunctionInfo reports split CSR.
+const MCPhysReg *ARMBaseRegisterInfo::getCalleeSavedRegsViaCopy(
+    const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+
+  if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
+    return nullptr;
+  if (!MF->getInfo<ARMFunctionInfo>()->isSplitCSR())
+    return nullptr;
+  return CSR_iOS_CXX_TLS_ViaCopy_SaveList;
+}
+
+/// Select the register mask of registers preserved across a call made from
+/// \p MF with calling convention \p CC.
+const uint32_t *
+ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                          CallingConv::ID CC) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+
+  // This is academic because all GHC calls are (supposed to be) tail calls.
+  if (CC == CallingConv::GHC)
+    return CSR_NoRegs_RegMask;
+
+  if (!STI.isTargetDarwin())
+    return CSR_AAPCS_RegMask;
+
+  // Darwin from here on.
+  if (STI.getTargetLowering()->supportSwiftError() &&
+      MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return CSR_iOS_SwiftError_RegMask;
+
+  if (CC == CallingConv::CXX_FAST_TLS)
+    return CSR_iOS_CXX_TLS_RegMask;
+
+  return CSR_iOS_RegMask;
+}
+
+/// Return the register mask in which no register is preserved
+/// (CSR_NoRegs_RegMask).
+const uint32_t *ARMBaseRegisterInfo::getNoPreservedMask() const {
+  const uint32_t *Mask = CSR_NoRegs_RegMask;
+  return Mask;
+}
+
+/// Return the register mask preserved across the special TLS call. Only
+/// Darwin subtargets are supported; this is asserted.
+const uint32_t *
+ARMBaseRegisterInfo::getTLSCallPreservedMask(const MachineFunction &MF) const {
+  assert(MF.getSubtarget<ARMSubtarget>().isTargetDarwin() &&
+         "only know about special TLS call on Darwin");
+  const uint32_t *Mask = CSR_iOS_TLSCall_RegMask;
+  return Mask;
+}
+
+/// Select the register mask preserved across an SjLj dispatch in \p MF.
+///
+/// CSR_FPRegs_RegMask is returned when the subtarget uses soft float, lacks
+/// VFP2, or is Thumb1-only; CSR_NoRegs_RegMask is returned otherwise.
+const uint32_t *
+ARMBaseRegisterInfo::getSjLjDispatchPreservedMask(const MachineFunction &MF) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  const bool UseFPRegsMask =
+      STI.useSoftFloat() || !STI.hasVFP2() || STI.isThumb1Only();
+  if (UseFPRegsMask)
+    return CSR_FPRegs_RegMask;
+  return CSR_NoRegs_RegMask;
+}
+
+
+/// Return the call-preserved mask for calls that return their first i32
+/// argument.
+///
+/// This should return a register mask that is the same as that returned by
+/// getCallPreservedMask but that additionally preserves the register used for
+/// the first i32 argument (which must also be the register used to return a
+/// single i32 return value).
+///
+/// In case that the calling convention does not use the same register for
+/// both or otherwise does not want to enable this optimization, the function
+/// should return NULL.
+const uint32_t *
+ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
+                                                CallingConv::ID CC) const {
+  // This is academic because all GHC calls are (supposed to be) tail calls.
+  if (CC == CallingConv::GHC)
+    return nullptr;
+
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  if (STI.isTargetDarwin())
+    return CSR_iOS_ThisReturn_RegMask;
+  return CSR_AAPCS_ThisReturn_RegMask;
+}
+
+/// Compute the set of reserved registers for \p MF.
+///
+/// SP, PC, FPSCR and APSR_NZCV are always reserved. The frame pointer, the
+/// base pointer and R9 are reserved when in use / requested, D16-D31 when the
+/// subtarget does not provide them, and finally every GPR pair that contains
+/// a reserved sub-register.
+BitVector ARMBaseRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+
+  // Unconditionally reserved registers.
+  const unsigned AlwaysReserved[] = {ARM::SP, ARM::PC, ARM::FPSCR,
+                                     ARM::APSR_NZCV};
+  for (unsigned Reg : AlwaysReserved)
+    markSuperRegs(Reserved, Reg);
+
+  if (TFI->hasFP(MF))
+    markSuperRegs(Reserved, getFramePointerReg(STI));
+  if (hasBasePointer(MF))
+    markSuperRegs(Reserved, BasePtr);
+
+  // Some targets reserve R9.
+  if (STI.isR9Reserved())
+    markSuperRegs(Reserved, ARM::R9);
+
+  // Reserve D16-D31 if the subtarget doesn't support them.
+  const bool HasUpperDRegs = STI.hasVFP3() && !STI.hasD16();
+  if (!HasUpperDRegs) {
+    static_assert(ARM::D31 == ARM::D16 + 15, "Register list not consecutive!");
+    for (unsigned DReg = ARM::D16; DReg <= ARM::D31; ++DReg)
+      markSuperRegs(Reserved, DReg);
+  }
+
+  // A GPR pair is reserved as soon as one of its sub-registers is.
+  const TargetRegisterClass &PairRC = ARM::GPRPairRegClass;
+  for (TargetRegisterClass::iterator Pair = PairRC.begin(),
+                                     PairEnd = PairRC.end();
+       Pair != PairEnd; ++Pair) {
+    for (MCSubRegIterator Sub(*Pair, this); Sub.isValid(); ++Sub) {
+      if (Reserved.test(*Sub))
+        markSuperRegs(Reserved, *Pair);
+    }
+  }
+
+  assert(checkAllSuperRegsMarked(Reserved));
+  return Reserved;
+}
+
+/// Walk \p RC and then its super-class list, and return the first class that
+/// is one of GPR, SPR, DPR, QPR, QQPR, QQQQPR or GPRPair. If the list is
+/// exhausted without a match, \p RC itself is returned.
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                               const MachineFunction &) const {
+  TargetRegisterClass::sc_iterator NextSuper = RC->getSuperClasses();
+  const TargetRegisterClass *Candidate = RC;
+
+  while (true) {
+    switch (Candidate->getID()) {
+    case ARM::GPRRegClassID:
+    case ARM::SPRRegClassID:
+    case ARM::DPRRegClassID:
+    case ARM::QPRRegClassID:
+    case ARM::QQPRRegClassID:
+    case ARM::QQQQPRRegClassID:
+    case ARM::GPRPairRegClassID:
+      return Candidate;
+    default:
+      break;
+    }
+
+    // Advance to the next super-class; a null entry ends the list.
+    Candidate = *NextSuper++;
+    if (!Candidate)
+      return RC;
+  }
+}
+
+/// Return the register class used to hold pointers.  This is always the GPR
+/// class; both \p MF and \p Kind are ignored.
+const TargetRegisterClass *ARMBaseRegisterInfo::getPointerRegClass(
+    const MachineFunction &MF, unsigned Kind) const {
+  return &ARM::GPRRegClass;
+}
+
+/// Return the class to use when copying a register of class \p RC.  CCR
+/// registers can't be copied, so rGPR is substituted for them; every other
+/// class is returned unchanged.
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  const bool IsCCR = (RC == &ARM::CCRRegClass);
+  return IsCCR ? &ARM::rGPRRegClass : RC;
+}
+
+/// Return the register pressure limit for \p RC in \p MF.
+///
+/// tGPR yields 4 or 5 and GPR yields 10 minus one for the frame pointer (when
+/// the function has one) and minus one for a reserved R9.  SPR and DPR both
+/// yield 32 - 10.  Any other class yields 0.
+unsigned
+ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                         MachineFunction &MF) const {
+  const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
+  const ARMFrameLowering *FrameLowering = getFrameLowering(MF);
+
+  const unsigned ClassID = RC->getID();
+
+  if (ClassID == ARM::tGPRRegClassID)
+    return FrameLowering->hasFP(MF) ? 4 : 5;
+
+  if (ClassID == ARM::GPRRegClassID) {
+    const unsigned FPCost = FrameLowering->hasFP(MF) ? 1 : 0;
+    const unsigned R9Cost = Subtarget.isR9Reserved() ? 1 : 0;
+    return 10 - FPCost - R9Cost;
+  }
+
+  // SPR is currently not used as 'rep' register class.
+  if (ClassID == ARM::SPRRegClassID || ClassID == ARM::DPRRegClassID)
+    return 32 - 10;
+
+  return 0;
+}
+
+// Look up the first GPRPair super-register of Reg and return its odd (gsub_1)
+// or even (gsub_0) half, as selected by Odd.  Returns 0 when Reg is not part
+// of any GPR pair.
+static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) {
+  const unsigned HalfIdx = Odd ? ARM::gsub_1 : ARM::gsub_0;
+  MCSuperRegIterator Super(Reg, RI);
+  while (Super.isValid()) {
+    if (ARM::GPRPairRegClass.contains(*Super))
+      return RI->getSubReg(*Super, HalfIdx);
+    ++Super;
+  }
+  return 0;
+}
+
+// Resolve the RegPairEven / RegPairOdd register allocator hints.
+//
+// For a virtual register carrying one of the pair hints, the physical register
+// matching the already-assigned partner (if any, and if present in Order) is
+// pushed first, followed by every register in Order with the requested parity
+// whose partner register exists and is not reserved.  Any other hint type is
+// forwarded to the generic implementation.
+void ARMBaseRegisterInfo::getRegAllocationHints(
+    unsigned VirtReg, ArrayRef<MCPhysReg> Order,
+    SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
+    const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
+
+  const bool WantsEven = Hint.first == (unsigned)ARMRI::RegPairEven;
+  const bool WantsOdd = Hint.first == (unsigned)ARMRI::RegPairOdd;
+  if (!WantsEven && !WantsOdd) {
+    // Not a pair hint: defer to the target-independent logic.
+    TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
+    return;
+  }
+
+  // This register should preferably be even (Odd == 0) or odd (Odd == 1).
+  const unsigned Odd = WantsOdd ? 1 : 0;
+
+  // Without a partner register there is nothing to hint.
+  const unsigned Partner = Hint.second;
+  if (Partner == 0)
+    return;
+
+  // Work out which physical register, if any, the partner implies for us.
+  unsigned PartnerPhys = 0;
+  if (TargetRegisterInfo::isPhysicalRegister(Partner))
+    PartnerPhys = Partner;
+  else if (VRM && VRM->hasPhys(Partner))
+    PartnerPhys = getPairedGPR(VRM->getPhys(Partner), Odd, this);
+
+  // First prefer the paired physreg.
+  if (PartnerPhys && is_contained(Order, PartnerPhys))
+    Hints.push_back(PartnerPhys);
+
+  // Then prefer even or odd registers.
+  for (unsigned Reg : Order) {
+    if (Reg == PartnerPhys || (getEncodingValue(Reg) & 1) != Odd)
+      continue;
+    // Don't provide hints that are paired to a reserved register.
+    const unsigned OtherHalf = getPairedGPR(Reg, !Odd, this);
+    if (!OtherHalf || MRI.isReserved(OtherHalf))
+      continue;
+    Hints.push_back(Reg);
+  }
+}
+
+/// Keep even/odd pair hints consistent when \p Reg is replaced by \p NewReg
+/// (e.g. through coalescing).
+///
+/// If \p Reg carries a pair hint pointing at a virtual partner register, and
+/// that partner still points back at \p Reg, the partner is redirected to
+/// \p NewReg.  When \p NewReg is virtual it receives the complementary hint
+/// pointing at the partner.
+void ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
+                                             MachineFunction &MF) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const std::pair<unsigned, unsigned> RegHint = MRI.getRegAllocationHint(Reg);
+
+  const bool IsPairHint = RegHint.first == (unsigned)ARMRI::RegPairOdd ||
+                          RegHint.first == (unsigned)ARMRI::RegPairEven;
+  if (!IsPairHint || !TargetRegisterInfo::isVirtualRegister(RegHint.second))
+    return;
+
+  const unsigned Partner = RegHint.second;
+  const std::pair<unsigned, unsigned> PartnerHint =
+      MRI.getRegAllocationHint(Partner);
+
+  // Make sure the pair has not already divorced.
+  if (PartnerHint.second != Reg)
+    return;
+
+  // The partner keeps its own parity but now refers to the new register.
+  MRI.setRegAllocationHint(Partner, PartnerHint.first, NewReg);
+
+  if (!TargetRegisterInfo::isVirtualRegister(NewReg))
+    return;
+
+  // The new register takes the opposite parity of its partner.
+  const unsigned NewRegHint =
+      PartnerHint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven
+                                                       : ARMRI::RegPairOdd;
+  MRI.setRegAllocationHint(NewReg, NewRegHint, Partner);
+}
+
+/// Decide whether \p MF needs a base pointer register in addition to SP/FP.
+bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
+  const ARMFrameLowering *FrameLowering = getFrameLowering(MF);
+
+  // When outgoing call frames are so large that we adjust the stack pointer
+  // around the call, we can no longer use the stack pointer to reach the
+  // emergency spill slot.
+  if (needsStackRealignment(MF) && !FrameLowering->hasReservedCallFrame(MF))
+    return true;
+
+  // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited
+  // negative range for ldr/str (255), and thumb1 is positive offsets only.
+  // It's going to be better to use the SP or Base Pointer instead. When there
+  // are variable sized objects, we can't reference off of the SP, so we
+  // reserve a Base Pointer.  Anything else does not need one.
+  if (!FuncInfo->isThumbFunction() || !FrameInfo.hasVarSizedObjects())
+    return false;
+
+  // Conservatively estimate whether the negative offset from the frame
+  // pointer will be sufficient to reach. If a function has a smallish
+  // frame, it's less likely to have lots of spills and callee saved
+  // space, so it's all more likely to be within range of the frame pointer.
+  // If it's wrong, the scavenger will still enable access to work, it just
+  // won't be optimal.
+  const bool SmallThumb2Frame =
+      FuncInfo->isThumb2Function() && FrameInfo.getLocalFrameSize() < 128;
+  return !SmallThumb2Frame;
+}
+
+/// Return true if the stack of \p MF may still be dynamically realigned.
+///
+/// Realignment is refused when the generic check fails, for Thumb1-only
+/// functions, when the frame pointer can no longer be reserved, or when a base
+/// pointer would be needed (no reserved call frame) but can no longer be
+/// reserved.
+bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
+  const ARMFrameLowering *FrameLowering = getFrameLowering(MF);
+
+  // Dynamic stack realignment may be explicitly disabled, and for Thumb1 it
+  // is not useful, so we don't bother.
+  if (!TargetRegisterInfo::canRealignStack(MF) ||
+      FuncInfo->isThumb1OnlyFunction())
+    return false;
+
+  // Stack realignment requires a frame pointer. If we already started
+  // register allocation with frame pointer elimination, it is too late now.
+  if (!MRI.canReserveReg(getFramePointerReg(MF.getSubtarget<ARMSubtarget>())))
+    return false;
+
+  // With a reserved call frame no base pointer is needed.  Otherwise (dynamic
+  // allocas or stack pointer adjustments around calls) a base pointer is
+  // required, so check that it isn't too late to reserve it.
+  return FrameLowering->hasReservedCallFrame(MF) || MRI.canReserveReg(BasePtr);
+}
+
+/// Return true if the frame of \p MF cannot be eliminated: frame pointer
+/// elimination is disabled and the function adjusts the stack, or the function
+/// has variable sized objects, has its frame address taken, or needs stack
+/// realignment.
+bool ARMBaseRegisterInfo::cannotEliminateFrame(
+    const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+  if (MF.getTarget().Options.DisableFramePointerElim(MF) &&
+      FrameInfo.adjustsStack())
+    return true;
+  if (FrameInfo.hasVarSizedObjects())
+    return true;
+  if (FrameInfo.isFrameAddressTaken())
+    return true;
+  return needsStackRealignment(MF);
+}
+
+/// Return the register used to address the frame of \p MF: the subtarget's
+/// frame pointer register when the function has a frame pointer, SP otherwise.
+unsigned
+ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
+  const ARMFrameLowering *FrameLowering = getFrameLowering(MF);
+
+  if (!FrameLowering->hasFP(MF))
+    return ARM::SP;
+  return getFramePointerReg(Subtarget);
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+///
+/// \p Val is entered into the function's constant pool as a 32-bit integer
+/// (alignment argument 4) and an LDRcp defining \p DestReg (sub-register index
+/// \p SubIdx) is inserted before \p MBBI, predicated on \p Pred / \p PredReg
+/// and tagged with \p MIFlags.
+void ARMBaseRegisterInfo::emitLoadConstPool(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+    const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
+    ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+  // Create (or reuse) the constant pool entry holding the immediate.
+  auto *Int32Ty = Type::getInt32Ty(MF.getFunction()->getContext());
+  const Constant *Imm = ConstantInt::get(Int32Ty, Val);
+  const unsigned CPIdx = MF.getConstantPool()->getConstantPoolIndex(Imm, 4);
+
+  // Build the predicated constant pool load.
+  MachineInstrBuilder Load = BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp));
+  Load.addReg(DestReg, getDefRegState(true), SubIdx);
+  Load.addConstantPoolIndex(CPIdx);
+  Load.addImm(0).addImm(Pred).addReg(PredReg);
+  Load.setMIFlags(MIFlags);
+}
+
+/// Register scavenging is requested unconditionally; \p MF is not consulted.
+bool ARMBaseRegisterInfo::requiresRegisterScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+/// Liveness tracking after register allocation is requested unconditionally;
+/// \p MF is not consulted.
+bool ARMBaseRegisterInfo::trackLivenessAfterRegAlloc(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+/// Frame index scavenging is requested unconditionally; \p MF is not
+/// consulted.
+bool ARMBaseRegisterInfo::requiresFrameIndexScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+/// Virtual base registers are requested unconditionally; \p MF is not
+/// consulted.
+bool ARMBaseRegisterInfo::requiresVirtualBaseRegisters(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+/// Return the byte offset already encoded in \p MI for the frame index
+/// operand at position \p Idx.
+///
+/// The immediate is decoded according to the instruction's addressing mode:
+/// its operand position, sign (add/sub) and scale (1 or 4) all depend on the
+/// mode.  Unsupported addressing modes are unreachable.
+int64_t ARMBaseRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
+                                                      int Idx) const {
+  const unsigned AddrMode = MI->getDesc().TSFlags & ARMII::AddrModeMask;
+
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i8:
+  case ARMII::AddrModeT2_i12:
+  case ARMII::AddrMode_i12:
+    // Plain immediate directly after the frame index operand, unscaled.
+    return MI->getOperand(Idx + 1).getImm();
+
+  case ARMII::AddrMode5: {
+    // VFP address mode: encoded immediate is scaled by 4.
+    const MachineOperand &ImmOp = MI->getOperand(Idx + 1);
+    int64_t Magnitude = ARM_AM::getAM5Offset(ImmOp.getImm());
+    if (ARM_AM::getAM5Op(ImmOp.getImm()) == ARM_AM::sub)
+      Magnitude = -Magnitude;
+    return Magnitude * 4;
+  }
+
+  case ARMII::AddrMode2: {
+    // Immediate lives two operands after the frame index, unscaled.
+    const MachineOperand &ImmOp = MI->getOperand(Idx + 2);
+    int64_t Magnitude = ARM_AM::getAM2Offset(ImmOp.getImm());
+    if (ARM_AM::getAM2Op(ImmOp.getImm()) == ARM_AM::sub)
+      Magnitude = -Magnitude;
+    return Magnitude;
+  }
+
+  case ARMII::AddrMode3: {
+    // Immediate lives two operands after the frame index, unscaled.
+    const MachineOperand &ImmOp = MI->getOperand(Idx + 2);
+    int64_t Magnitude = ARM_AM::getAM3Offset(ImmOp.getImm());
+    if (ARM_AM::getAM3Op(ImmOp.getImm()) == ARM_AM::sub)
+      Magnitude = -Magnitude;
+    return Magnitude;
+  }
+
+  case ARMII::AddrModeT1_s:
+    // Immediate directly after the frame index operand, scaled by 4.
+    return MI->getOperand(Idx + 1).getImm() * 4;
+
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+///
+/// \param MI     Instruction containing a frame index operand.
+/// \param Offset Offset of the referenced object relative to the SP value at
+///               function entry (so it is negative).
+/// \returns true when neither an FP-relative nor an SP-relative access is
+///          estimated to be encodable; false for anything that is not one of
+///          the handled load/store opcodes.
+bool ARMBaseRegisterInfo::
+needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  // Sanity-check that MI really has a frame index operand.  The index is
+  // bounds-checked before the operand at that index is inspected, the same
+  // way resolveFrameIndex and isFrameOffsetLegal do it.
+  unsigned i = 0;
+  while (!MI->getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!");
+  }
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::LDRi12: case ARM::LDRH: case ARM::LDRBi12:
+  case ARM::STRi12: case ARM::STRH: case ARM::STRBi12:
+  case ARM::t2LDRi12: case ARM::t2LDRi8:
+  case ARM::t2STRi12: case ARM::t2STRi8:
+  case ARM::VLDRS: case ARM::VLDRD:
+  case ARM::VSTRS: case ARM::VSTRD:
+  case ARM::tSTRspi: case ARM::tLDRspi:
+    break;
+  default:
+    return false;
+  }
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer,
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all callee-saved registers get pushed. R4-R6
+  // will be earlier than the FP, so we ignore those.
+  // R7, LR
+  int64_t FPOffset = Offset - 8;
+  // ARM and Thumb2 functions also need to consider R8-R11 and D8-D15
+  if (!AFI->isThumbFunction() || !AFI->isThumb1OnlyFunction())
+    FPOffset -= 80;
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += MFI.getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  //        and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there's a frame pointer and the addressing mode allows it, try using it.
+  // The FP is only available if there is no dynamic realignment. We
+  // don't know for sure yet whether we'll need that, so we guess based
+  // on whether there are any local variables that would trigger it.
+  unsigned StackAlign = TFI->getStackAlignment();
+  if (TFI->hasFP(MF) &&
+      !((MFI.getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
+    if (isFrameOffsetLegal(MI, getFrameRegister(MF), FPOffset))
+      return false;
+  }
+  // If we can reference via the stack pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (!MFI.hasVarSizedObjects() && isFrameOffsetLegal(MI, ARM::SP, Offset))
+    return false;
+
+  // The offset likely isn't legal, we want to allocate a virtual base register.
+  return true;
+}
+
+/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to
+/// be a pointer to FrameIdx at the beginning of the basic block.
+///
+/// The add opcode is chosen by function kind (ARM, Thumb1-only or Thumb2) and
+/// \p BaseReg is constrained to the register class of that opcode's first
+/// operand.  Except for Thumb1-only functions, default predicate and
+/// condition-code operands are appended.
+void ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                                       unsigned BaseReg,
+                                                       int FrameIdx,
+                                                       int64_t Offset) const {
+  MachineFunction &MF = *MBB->getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Pick the add-immediate opcode matching the instruction set in use.
+  unsigned ADDriOpc;
+  if (!AFI->isThumbFunction())
+    ADDriOpc = ARM::ADDri;
+  else if (AFI->isThumb1OnlyFunction())
+    ADDriOpc = ARM::tADDframe;
+  else
+    ADDriOpc = ARM::t2ADDri;
+
+  // Insert at the top of the block, borrowing the first instruction's
+  // debug location when the block is not empty.
+  MachineBasicBlock::iterator InsertPt = MBB->begin();
+  DebugLoc DL; // Defaults to "unknown"
+  if (InsertPt != MBB->end())
+    DL = InsertPt->getDebugLoc();
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const MCInstrDesc &MCID = TII.get(ADDriOpc);
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
+
+  MachineInstrBuilder MIB = BuildMI(*MBB, InsertPt, DL, MCID, BaseReg)
+                                .addFrameIndex(FrameIdx)
+                                .addImm(Offset);
+
+  if (AFI->isThumb1OnlyFunction())
+    return;
+  AddDefaultCC(AddDefaultPred(MIB));
+}
+
+/// Rewrite the frame index operand of \p MI to use \p BaseReg plus \p Offset.
+///
+/// Only ARM and Thumb2 functions are supported (Thumb1 is asserted against),
+/// and the rewrite is asserted to fully absorb the offset.
+void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                            int64_t Offset) const {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // ARM doesn't need the general 64-bit offsets.
+  int Off = Offset;
+
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This resolveFrameIndex does not support Thumb1!");
+
+  // Find the position of the frame index operand.
+  unsigned i = 0;
+  for (; !MI.getOperand(i).isFI();) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  // Dispatch to the rewriter for the instruction set in use.
+  bool Done = false;
+  if (AFI->isThumbFunction()) {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
+  } else {
+    Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII);
+  }
+  assert (Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+/// Return true if \p Offset (combined with the offset already encoded in
+/// \p MI) can be encoded in \p MI's immediate field when addressing relative
+/// to \p BaseReg.
+///
+/// \param MI      Instruction containing a frame index operand.
+/// \param BaseReg Base register the access would use; only consulted for
+///                AddrModeT1_s, where SP gets a wider immediate (8 vs 5 bits).
+/// \param Offset  Candidate byte offset from \p BaseReg.
+///
+/// AddrMode4 and AddrMode6 accept only a zero offset.  For the remaining
+/// supported modes the offset must be a multiple of the mode's scale and its
+/// magnitude must fit in the mode's immediate width.
+bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+                                             int64_t Offset) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  unsigned i = 0;
+
+  // Locate the frame index operand.
+  while (!MI->getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!");
+  }
+
+  // AddrMode4 and AddrMode6 cannot handle any offset.
+  if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+    return Offset == 0;
+
+  unsigned NumBits = 0;
+  unsigned Scale = 1;
+  bool isSigned = true;
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i8:
+  case ARMII::AddrModeT2_i12:
+    // i8 supports only negative, and i12 supports only positive, so
+    // based on Offset sign, consider the appropriate instruction
+    Scale = 1;
+    if (Offset < 0) {
+      NumBits = 8;
+      Offset = -Offset;
+    } else {
+      NumBits = 12;
+    }
+    break;
+  case ARMII::AddrMode5:
+    // VFP address mode.
+    NumBits = 8;
+    Scale = 4;
+    break;
+  case ARMII::AddrMode_i12:
+  case ARMII::AddrMode2:
+    NumBits = 12;
+    break;
+  case ARMII::AddrMode3:
+    NumBits = 8;
+    break;
+  case ARMII::AddrModeT1_s:
+    NumBits = (BaseReg == ARM::SP ? 8 : 5);
+    Scale = 4;
+    isSigned = false;
+    break;
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+
+  Offset += getFrameIndexInstrOffset(MI, i);
+  // Make sure the offset is encodable for instructions that scale the
+  // immediate.
+  if ((Offset & (Scale-1)) != 0)
+    return false;
+
+  if (isSigned && Offset < 0)
+    Offset = -Offset;
+
+  // Do the range check in 64 bits.  Narrowing Offset to 'unsigned' would drop
+  // its high bits, so an out-of-range offset whose low 32 bits happen to be
+  // small would be reported as legal.  A negative offset can only remain here
+  // for the unsigned mode and is never encodable.
+  const int64_t MaxOffset =
+      static_cast<int64_t>((1u << NumBits) - 1) * static_cast<int64_t>(Scale);
+  return Offset >= 0 && Offset <= MaxOffset;
+}
+
+/// Replace the frame index operand FIOperandNum of the instruction at II with
+/// a real base register and offset.
+///
+/// As much of the offset as possible is folded into the instruction by the
+/// ARM / Thumb2 rewriter.  If that is not enough, the address is materialized
+/// into a fresh virtual GPR and the operand is changed to that register.
+/// Thumb1-only functions are not supported (asserted).
+void
+ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This eliminateFrameIndex does not support Thumb1!");
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ unsigned FrameReg;
+
+ // FrameReg is uninitialized here and read below, so it is expected to be
+ // set by ResolveFrameIndexReference along with the returned offset.
+ int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
+
+ // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+ // call frame setup/destroy instructions have already been eliminated. That
+ // means the stack pointer cannot be used to access the emergency spill slot
+ // when !hasReservedCallFrame().
+#ifndef NDEBUG
+ if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
+ assert(TFI->hasReservedCallFrame(MF) &&
+ "Cannot use SP to access the emergency spill slot in "
+ "functions without a reserved call frame");
+ assert(!MF.getFrameInfo().hasVarSizedObjects() &&
+ "Cannot use SP to access the emergency spill slot in "
+ "functions with variable sized frame objects");
+ }
+#endif // NDEBUG
+
+ assert(!MI.isDebugValue() && "DBG_VALUEs should be handled in target-independent code");
+
+ // Modify MI as necessary to handle as much of 'Offset' as possible
+ // (Offset is passed to the rewriters and inspected again below).
+ bool Done = false;
+ if (!AFI->isThumbFunction())
+ Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+ else {
+ assert(AFI->isThumb2Function());
+ Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+ }
+ if (Done)
+ return;
+
+ // If we get here, the immediate doesn't fit into the instruction. We folded
+ // as much as possible above, handle the rest, providing a register that is
+ // SP+LargeImm.
+ assert((Offset ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) &&
+ "This code isn't needed if offset already handled!");
+
+ // Reuse MI's predicate, if it has one, for the instructions that compute
+ // the address; otherwise use ARMCC::AL with no predicate register.
+ unsigned ScratchReg = 0;
+ int PIdx = MI.findFirstPredOperandIdx();
+ ARMCC::CondCodes Pred = (PIdx == -1)
+ ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+ unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+ if (Offset == 0)
+ // Must be addrmode4/6.
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false);
+ else {
+ // Compute FrameReg + Offset into a new virtual GPR.
+ ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass);
+ if (!AFI->isThumbFunction())
+ emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+ Offset, Pred, PredReg, TII);
+ else {
+ assert(AFI->isThumb2Function());
+ emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+ Offset, Pred, PredReg, TII);
+ }
+ // Update the original instruction to use the scratch register.
+ // NOTE(review): the trailing 'true' presumably marks the scratch register
+ // as killed by MI -- confirm against MachineOperand::ChangeToRegister.
+ MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false,true);
+ }
+}
+
+/// Decide whether the coalescer may merge the registers of copy \p MI, which
+/// would morph \p SrcRC and \p DstRC into \p NewRC.
+///
+/// Copies that don't write a sub-register, copies between small classes, and
+/// copies where the source or destination class is heavier than \p NewRC are
+/// always allowed.  Otherwise a per-basic-block weight budget limits how many
+/// expensive registers get coalesced.
+bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
+                                         const TargetRegisterClass *SrcRC,
+                                         unsigned SubReg,
+                                         const TargetRegisterClass *DstRC,
+                                         unsigned DstSubReg,
+                                         const TargetRegisterClass *NewRC) const {
+  auto MBB = MI->getParent();
+  auto MF = MBB->getParent();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // If not copying into a sub-register this should be ok because we shouldn't
+  // need to split the reg.
+  if (!DstSubReg)
+    return true;
+
+  // Small registers don't frequently cause a problem, so we can coalesce them.
+  const bool AllSmall = NewRC->getSize() < 32 && DstRC->getSize() < 32 &&
+                        SrcRC->getSize() < 32;
+  if (AllSmall)
+    return true;
+
+  auto NewRCWeight = MRI.getTargetRegisterInfo()->getRegClassWeight(NewRC);
+  auto SrcRCWeight = MRI.getTargetRegisterInfo()->getRegClassWeight(SrcRC);
+  auto DstRCWeight = MRI.getTargetRegisterInfo()->getRegClassWeight(DstRC);
+
+  // If the source or the destination register class is more expensive than
+  // the resulting class, the coalescing is probably profitable.
+  if (SrcRCWeight.RegWeight > NewRCWeight.RegWeight ||
+      DstRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+
+  // If the register allocator isn't constrained, we can always allow
+  // coalescing; unfortunately we don't know yet if we will be constrained.
+  // The goal of this heuristic is to restrict how many expensive registers
+  // we allow to coalesce in a given basic block.
+  auto AFI = MF->getInfo<ARMFunctionInfo>();
+  auto It = AFI->getCoalescedWeight(MBB);
+
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
+               << It->second << "\n");
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
+               << NewRCWeight.RegWeight << "\n");
+
+  // The divisor below is the largest round number that meets these criteria:
+  //  (1) addresses PR18825
+  //  (2) generates better code in some test cases (like vldm-shed-a9.ll)
+  //  (3) Doesn't regress any test cases (in-tree, test-suite, and SPEC)
+  // In practice the SizeMultiplier will only factor in for straight line code
+  // that uses a lot of NEON vectors, which isn't terribly common.
+  unsigned SizeMultiplier = MBB->size()/100;
+  if (SizeMultiplier == 0)
+    SizeMultiplier = 1;
+
+  // Budget exhausted for this block: refuse.
+  if (!(It->second < NewRCWeight.WeightLimit * SizeMultiplier))
+    return false;
+
+  // Charge this coalesce against the block's budget.
+  It->second += NewRCWeight.RegWeight;
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
new file mode 100644
index 000000000000..330e1535e863
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -0,0 +1,201 @@
+//===-- ARMBaseRegisterInfo.h - ARM Register Information Impl ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "ARMGenRegisterInfo.inc"
+
+namespace llvm {
+/// Register allocation hints.
+/// These values are stored as the hint type of a virtual register; the hint's
+/// second field names the other register of the even/odd pair.
+namespace ARMRI {
+ enum {
+ // The register should preferably be assigned an odd physical register.
+ RegPairOdd = 1,
+ // The register should preferably be assigned an even physical register.
+ RegPairEven = 2
+ };
+}
+
+/// isARMArea1Register - Returns true if the register is a low register (r0-r7)
+/// or a stack/pc register that we should push/pop.  The high registers r8-r12
+/// also belong to this area unless \p isIOS is set.
+static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+
+  // For iOS we want r7 and lr to be next to each other, so the high
+  // registers are kept out of this area there.
+  const bool IsHighGPR =
+      Reg == R8 || Reg == R9 || Reg == R10 || Reg == R11 || Reg == R12;
+  if (IsHighGPR)
+    return !isIOS;
+
+  switch (Reg) {
+  case R0: case R1: case R2: case R3:
+  case R4: case R5: case R6: case R7:
+  case LR: case SP: case PC:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isARMArea2Register - Returns true if the register is one of r8-r12 and
+/// \p isIOS is set.  Only iOS has this second area.
+static inline bool isARMArea2Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  if (!isIOS)
+    return false;
+  return Reg == R8 || Reg == R9 || Reg == R10 || Reg == R11 || Reg == R12;
+}
+
+/// isARMArea3Register - Returns true if the register is one of the D
+/// registers d0-d31.  \p isIOS is not consulted.
+static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+  case D0:  case D1:  case D2:  case D3:
+  case D4:  case D5:  case D6:  case D7:
+  case D8:  case D9:  case D10: case D11:
+  case D12: case D13: case D14: case D15:
+  case D16: case D17: case D18: case D19:
+  case D20: case D21: case D22: case D23:
+  case D24: case D25: case D26: case D27:
+  case D28: case D29: case D30: case D31:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isCalleeSavedRegister - Returns true if \p Reg appears in \p CSRegs, a
+/// list of registers terminated by a 0 entry.
+static inline bool isCalleeSavedRegister(unsigned Reg,
+                                         const MCPhysReg *CSRegs) {
+  for (const MCPhysReg *Entry = CSRegs; *Entry; ++Entry) {
+    if (*Entry == Reg)
+      return true;
+  }
+  return false;
+}
+
+/// Base ARM implementation of the TargetRegisterInfo class, layered on top of
+/// the generated ARMGenRegisterInfo.  The constructor is protected, so the
+/// class can only be used through subclasses.
+class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
+protected:
+ /// BasePtr - ARM physical register used as a base ptr in complex stack
+ /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+ /// variable size stack objects.
+ unsigned BasePtr;
+
+ // Can be only subclassed.
+ explicit ARMBaseRegisterInfo();
+
+ // Return the opcode that implements 'Op', or 0 if no opcode
+ unsigned getOpcode(int Op) const;
+
+public:
+ /// Code Generation virtual methods...
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *
+ getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+ const uint32_t *getNoPreservedMask() const override;
+ const uint32_t *getTLSCallPreservedMask(const MachineFunction &MF) const;
+ const uint32_t *getSjLjDispatchPreservedMask(const MachineFunction &MF) const;
+
+ /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+ /// case that 'returned' is on an i32 first argument if the calling convention
+ /// is one that can (partially) model this attribute with a preserved mask
+ /// (i.e. it is a calling convention that uses the same register for the first
+ /// i32 argument and an i32 return value)
+ ///
+ /// Should return NULL in the case that the calling convention does not have
+ /// this property
+ const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const;
+
+ /// Returns the registers that may not be allocated in MF: SP, PC, FPSCR and
+ /// APSR_NZCV always, plus the frame pointer, base pointer, R9, D16-D31 and
+ /// affected GPR pairs depending on the function and subtarget.
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ /// Always returns the GPR register class.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+ /// Returns rGPR for the CCR class (CCR registers can't be copied) and RC
+ /// itself otherwise.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const override;
+
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
+
+ /// Resolves the ARMRI::RegPairEven / ARMRI::RegPairOdd hints of VirtReg into
+ /// physical register hints; other hint types use the generic implementation.
+ void getRegAllocationHints(unsigned VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+
+ /// Updates the pair hint of Reg's partner register when Reg is replaced by
+ /// NewReg (e.g. coalesced).
+ void updateRegAllocHint(unsigned Reg, unsigned NewReg,
+ MachineFunction &MF) const override;
+
+ /// Returns true if MF needs the base pointer: the stack needs realignment
+ /// without a reserved call frame, or a Thumb function has variable sized
+ /// objects (except Thumb2 functions whose local frame is under 128 bytes).
+ bool hasBasePointer(const MachineFunction &MF) const;
+
+ bool canRealignStack(const MachineFunction &MF) const override;
+ /// Returns the byte offset already encoded in MI for the frame index operand
+ /// at position Idx.
+ int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
+ int Idx) const override;
+ bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ unsigned BaseReg, int FrameIdx,
+ int64_t Offset) const override;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
+ bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ int64_t Offset) const override;
+
+ /// Returns true if frame pointer elimination is disabled and MF adjusts the
+ /// stack, or if MF has variable sized objects, has its frame address taken,
+ /// or needs stack realignment.
+ bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+ unsigned getBaseRegister() const { return BasePtr; }
+
+ bool isLowRegister(unsigned Reg) const;
+
+
+ /// emitLoadConstPool - Emits a load from constpool to materialize the
+ /// specified immediate.
+ virtual void
+ emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ int Val, ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0,
+ unsigned MIFlags = MachineInstr::NoFlags) const;
+
+ /// Code Generation virtual methods...
+ // The four predicates below all return true unconditionally.
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+ bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true
+ bool shouldCoalesce(MachineInstr *MI,
+ const TargetRegisterClass *SrcRC,
+ unsigned SubReg,
+ const TargetRegisterClass *DstRC,
+ unsigned DstSubReg,
+ const TargetRegisterClass *NewRC) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h b/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
new file mode 100644
index 000000000000..780544f865df
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -0,0 +1,110 @@
+//===-- ARMBasicBlockInfo.h - Basic Block Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility functions and data structure for computing block size.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+using namespace llvm;
+
+namespace llvm {
+
+/// UnknownPadding - Return the worst case padding that could result from
+/// unknown offset bits. This does not include alignment padding caused by
+/// known offset bits. Returns 0 when KnownBits >= LogAlign.
+///
+/// @param LogAlign log2(alignment)
+/// @param KnownBits Number of known low offset bits.
+inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
+ if (KnownBits < LogAlign)
+ return (1u << LogAlign) - (1u << KnownBits); // 2^LogAlign - 2^KnownBits bytes
+ return 0;
+}
+
+/// BasicBlockInfo - Information about the offset and size of a single
+/// basic block.
+struct BasicBlockInfo {
+ /// Offset - Distance from the beginning of the function to the beginning
+ /// of this basic block.
+ ///
+ /// Offsets are computed assuming worst case padding before an aligned
+ /// block. This means that subtracting basic block offsets always gives a
+ /// conservative estimate of the real distance which may be smaller.
+ ///
+ /// Because worst case padding is used, the computed offset of an aligned
+ /// block may not actually be aligned.
+ unsigned Offset;
+
+ /// Size - Size of the basic block in bytes. If the block contains
+ /// inline assembly, this is a worst case estimate.
+ ///
+ /// The size does not include any alignment padding whether from the
+ /// beginning of the block, or from an aligned jump table at the end.
+ unsigned Size;
+
+ /// KnownBits - The number of low bits in Offset that are known to be
+ /// exact. The remaining bits of Offset are an upper bound.
+ uint8_t KnownBits;
+
+ /// Unalign - When non-zero, the block contains instructions (inline asm)
+ /// of unknown size. The real size may be smaller than Size bytes by a
+ /// multiple of 1 << Unalign.
+ uint8_t Unalign;
+
+ /// PostAlign - When non-zero, the block terminator contains a .align
+ /// directive, so the end of the block is aligned to 1 << PostAlign
+ /// bytes.
+ uint8_t PostAlign;
+
+ BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0),
+ PostAlign(0) {}
+
+ /// Compute the number of known offset bits internally to this block.
+ /// This number should be used to predict worst case padding when
+ /// splitting the block.
+ unsigned internalKnownBits() const {
+ unsigned Bits = Unalign ? Unalign : KnownBits; // a non-zero Unalign takes precedence
+ // If the block size isn't a multiple of 1 << Bits, fall back to the number
+ // of trailing zero bits in Size (worst case padding).
+ if (Size & ((1u << Bits) - 1))
+ Bits = countTrailingZeros(Size);
+ return Bits;
+ }
+
+ /// Compute the offset immediately following this block. If LogAlign is
+ /// specified, return the offset the successor block will get if it has
+ /// this alignment.
+ unsigned postOffset(unsigned LogAlign = 0) const {
+ unsigned PO = Offset + Size;
+ unsigned LA = std::max(unsigned(PostAlign), LogAlign); // stricter of the two alignments
+ if (!LA)
+ return PO;
+ // Add worst case padding for the stricter of PostAlign and LogAlign.
+ return PO + UnknownPadding(LA, internalKnownBits());
+ }
+
+ /// Compute the number of known low bits of postOffset. If this block
+ /// contains inline asm, the number of known bits drops to the
+ /// instruction alignment. An aligned terminator may increase the number
+ /// of known bits.
+ /// If LogAlign is given, also consider the alignment of the next block.
+ unsigned postKnownBits(unsigned LogAlign = 0) const {
+ return std::max(std::max(unsigned(PostAlign), LogAlign),
+ internalKnownBits());
+ }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
new file mode 100644
index 000000000000..52c95b6244ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -0,0 +1,203 @@
+//===-- llvm/lib/Target/ARM/ARMCallLowering.cpp - Call lowering -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ARMCallLowering.h"
+
+#include "ARMBaseInstrInfo.h"
+#include "ARMISelLowering.h"
+
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI)
+ : CallLowering(&TLI) {} // hands the target lowering to the CallLowering base
+
+static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI,
+ Type *T) { // true iff T maps to a simple scalar integer of 8, 16 or 32 bits
+ EVT VT = TLI.getValueType(DL, T); // DL is taken by reference to avoid a copy
+ if (!VT.isSimple() || !VT.isInteger() || VT.isVector())
+ return false; // extended, non-integer and vector types are rejected
+
+ unsigned VTSize = VT.getSimpleVT().getSizeInBits();
+ return VTSize == 8 || VTSize == 16 || VTSize == 32;
+}
+
+namespace {
+struct FuncReturnHandler : public CallLowering::ValueHandler { // register-only handler
+ FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB)
+ : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ llvm_unreachable("Don't know how to get a stack address yet"); // stack locations unsupported
+ }
+
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ CCValAssign &VA) override {
+ assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
+ assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
+
+ assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
+ assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
+
+ assert(VA.getLocInfo() != CCValAssign::SExt &&
+ VA.getLocInfo() != CCValAssign::ZExt &&
+ "ABI extensions not supported yet");
+
+ MIRBuilder.buildCopy(PhysReg, ValVReg); // copy the value into its assigned physreg
+ MIB.addUse(PhysReg, RegState::Implicit); // and record it as an implicit use on MIB
+ }
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ llvm_unreachable("Don't know how to assign a value to an address yet"); // memory locations unsupported
+ }
+
+ MachineInstrBuilder &MIB; // instruction under construction that receives the implicit uses
+};
+} // End anonymous namespace.
+
+/// Lower the return value for the already existing \p Ret. This assumes that
+/// \p MIRBuilder's insertion point is correct. Returns false if unsupported.
+bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg,
+ MachineInstrBuilder &Ret) const {
+ if (!Val)
+ // Nothing to do here.
+ return true;
+
+ auto &MF = MIRBuilder.getMF();
+ const auto &F = *MF.getFunction();
+
+ const auto &DL = MF.getDataLayout(); // bind by reference; plain auto copied the DataLayout
+ auto &TLI = *getTLI<ARMTargetLowering>();
+ if (!isSupportedType(DL, TLI, Val->getType()))
+ return false;
+
+ CCAssignFn *AssignFn =
+ TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
+
+ ArgInfo RetInfo(VReg, Val->getType());
+ setArgFlags(RetInfo, AttributeSet::ReturnIndex, DL, F); // flags come from the return attributes
+
+ FuncReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
+ return handleAssignments(MIRBuilder, AssignFn, RetInfo, RetHandler);
+}
+
+bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg) const {
+ assert(!Val == !VReg && "Return value without a vreg"); // both present or both absent
+
+ auto Ret = AddDefaultPred(MIRBuilder.buildInstrNoInsert(ARM::BX_RET)); // built but not yet inserted
+
+ if (!lowerReturnVal(MIRBuilder, Val, VReg, Ret))
+ return false; // unsupported return value: Ret is never inserted
+
+ MIRBuilder.insertInstr(Ret); // insert the return only after the value has been lowered
+ return true;
+}
+
+namespace {
+struct FormalArgHandler : public CallLowering::ValueHandler { // loads/copies incoming arguments
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : ValueHandler(MIRBuilder, MRI) {}
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ assert(Size == 4 && "Unsupported size");
+
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+
+ int FI = MFI.CreateFixedObject(Size, Offset, true); // NOTE(review): 'true' presumably means immutable - confirm
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); // out-param describing the slot
+
+ unsigned AddrReg =
+ MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32));
+ MIRBuilder.buildFrameIndex(AddrReg, FI); // AddrReg receives the address of the fixed object
+
+ return AddrReg;
+ }
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ assert(Size == 4 && "Unsupported size");
+
+ auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad, Size, /* Alignment */ 0);
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO); // the argument is loaded from Addr into ValVReg
+ }
+
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ CCValAssign &VA) override {
+ assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
+ assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
+
+ assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
+ assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
+
+ MIRBuilder.getMBB().addLiveIn(PhysReg); // mark the physreg live-in to the current block
+ MIRBuilder.buildCopy(ValVReg, PhysReg); // then copy it into the argument's vreg
+ }
+};
+} // End anonymous namespace
+
+bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<unsigned> VRegs) const { // returns false when an argument is unsupported
+ // Quick exit if there aren't any args
+ if (F.arg_empty())
+ return true;
+
+ if (F.isVarArg())
+ return false; // variadic functions are not handled
+
+ const auto &DL = MIRBuilder.getMF().getDataLayout(); // by reference; plain auto copied it
+ auto &TLI = *getTLI<ARMTargetLowering>();
+
+ auto &Args = F.getArgumentList();
+ unsigned ArgIdx = 0;
+ for (auto &Arg : Args) { // first pass: reject anything we cannot lower
+ ArgIdx++;
+ if (!isSupportedType(DL, TLI, Arg.getType()))
+ return false;
+
+ // FIXME: This check as well as ArgIdx are going away as soon as we support
+ // loading values < 32 bits.
+ if (ArgIdx > 4 && Arg.getType()->getIntegerBitWidth() != 32)
+ return false;
+ }
+
+ CCAssignFn *AssignFn =
+ TLI.CCAssignFnForCall(F.getCallingConv(), F.isVarArg());
+
+ SmallVector<ArgInfo, 8> ArgInfos;
+ unsigned Idx = 0;
+ for (auto &Arg : Args) { // second pass: pair each argument with its vreg
+ ArgInfo AInfo(VRegs[Idx], Arg.getType());
+ setArgFlags(AInfo, Idx + 1, DL, F); // attribute index is the 0-based position plus one
+ ArgInfos.push_back(AInfo);
+ Idx++;
+ }
+
+ FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo());
+ return handleAssignments(MIRBuilder, AssignFn, ArgInfos, ArgHandler);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.h b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
new file mode 100644
index 000000000000..6a1b886b501f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
@@ -0,0 +1,42 @@
+//===-- llvm/lib/Target/ARM/ARMCallLowering.h - Call lowering -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMCALLLOWERING
+#define LLVM_LIB_TARGET_ARM_ARMCALLLOWERING
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+class ARMTargetLowering;
+class MachineInstrBuilder;
+
+class ARMCallLowering : public CallLowering { // ARM implementation of the GlobalISel call-lowering hooks
+public:
+ ARMCallLowering(const ARMTargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ unsigned VReg) const override; // Val may be null when there is nothing to return
+
+ bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<unsigned> VRegs) const override; // VRegs[i] belongs to the i-th argument of F
+
+private:
+ bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
+ unsigned VReg, MachineInstrBuilder &Ret) const; // helper used by lowerReturn
+};
+} // End of namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
new file mode 100644
index 000000000000..71b819362404
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
@@ -0,0 +1,288 @@
+//=== ARMCallingConv.h - ARM Custom Calling Convention Routines -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the ARM Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H
+#define LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+// APCS f64 goes in a pair of R0-R3, possibly split to stack; false if unhandled.
+static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ CCState &State, bool CanFail) {
+ static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+ // Try to get the first register.
+ if (unsigned Reg = State.AllocateReg(RegList))
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ else {
+ // Callers pass CanFail == false for the 2nd half of a v2f64: do not fail.
+ if (CanFail)
+ return false;
+
+ // Put the whole thing on the stack.
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+ State.AllocateStack(8, 4),
+ LocVT, LocInfo));
+ return true;
+ }
+
+ // Try to get the second register; otherwise the second half goes on the stack.
+ if (unsigned Reg = State.AllocateReg(RegList))
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ else
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+ State.AllocateStack(4, 4),
+ LocVT, LocInfo));
+ return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+ return false; // first half could not get a register: not handled here
+ if (LocVT == MVT::v2f64 &&
+ !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+ return false; // v2f64 needs a second f64 half, assigned with CanFail == false
+ return true; // we handled it
+}
+
+// AAPCS f64 is in aligned register pairs (R0/R1 or R2/R3), else on the stack.
+static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ CCState &State, bool CanFail) {
+ static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+ static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+ static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 };
+ static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+ unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList);
+ if (Reg == 0) {
+
+ // If only R3 was still unallocated, we must waste it anyway.
+ Reg = State.AllocateReg(GPRArgRegs);
+ assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64");
+
+ // Callers pass CanFail == false for the 2nd half of a v2f64: do not fail.
+ if (CanFail)
+ return false;
+
+ // Put the whole thing on the stack.
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+ State.AllocateStack(8, 8),
+ LocVT, LocInfo));
+ return true;
+ }
+
+ unsigned i; // index of Reg within HiRegList, used to pick the matching LoRegList entry
+ for (i = 0; i < 2; ++i)
+ if (HiRegList[i] == Reg)
+ break;
+
+ unsigned T = State.AllocateReg(LoRegList[i]);
+ (void)T; // T is only read by the assert below
+ assert(T == LoRegList[i] && "Could not allocate register");
+
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+ LocVT, LocInfo));
+ return true;
+}
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+ return false; // no register pair available for the first half: not handled here
+ if (LocVT == MVT::v2f64 &&
+ !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+ return false; // v2f64 needs a second f64 half, assigned with CanFail == false
+ return true; // we handled it
+}
+
+static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo, CCState &State) {
+ static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+ static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+
+ unsigned Reg = State.AllocateReg(HiRegList, LoRegList); // LoRegList is passed as the shadow list
+ if (Reg == 0)
+ return false; // we didn't handle it
+
+ unsigned i; // index of Reg within HiRegList, used to pick the matching LoRegList entry
+ for (i = 0; i < 2; ++i)
+ if (HiRegList[i] == Reg)
+ break;
+
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+ LocVT, LocInfo));
+ return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+ return false; // no register pair left for the first f64
+ if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+ return false; // v2f64 needs a second register pair
+ return true; // we handled it
+}
+
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+ State); // AAPCS returns simply reuse the APCS return routine
+}
+
+static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; // used for i32 members
+
+static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3,
+ ARM::S4, ARM::S5, ARM::S6, ARM::S7,
+ ARM::S8, ARM::S9, ARM::S10, ARM::S11,
+ ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; // used for f32 members
+static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; // used for f64 members
+static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; // used for v2f64 members
+
+
+// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
+// has InConsecutiveRegs set, and that the last member also has
+// InConsecutiveRegsLast set. We must process all members of the HA before
+// we can allocate it, as we need to know the total number of registers that
+// will be needed in order to (attempt to) allocate a contiguous block.
+static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // AAPCS HFAs must have 1-4 elements, all of the same type
+ if (PendingMembers.size() > 0)
+ assert(PendingMembers[0].getLocVT() == LocVT);
+
+ // Add the argument to the list to be allocated once we know the size of the
+ // aggregate. Store the type's required alignment as extra info for later: in
+ // the [N x i64] case all trace has been removed by the time we actually get
+ // to do allocation.
+ PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags.getOrigAlign()));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true; // not the last member yet: keep collecting
+
+ // Try to allocate a contiguous block of registers, each of the correct
+ // size to hold one member.
+ auto &DL = State.getMachineFunction().getDataLayout();
+ unsigned StackAlign = DL.getStackAlignment();
+ unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); // capped at the stack alignment
+
+ ArrayRef<MCPhysReg> RegList;
+ switch (LocVT.SimpleTy) {
+ case MVT::i32: {
+ RegList = RRegList;
+ unsigned RegIdx = State.getFirstUnallocated(RegList);
+
+ // First consume all registers that would give an unaligned object. Whether
+ // we go on stack or in regs, no-one will be using them in future.
+ unsigned RegAlign = alignTo(Align, 4) / 4; // alignment expressed in 4-byte registers
+ while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
+ State.AllocateReg(RegList[RegIdx++]);
+
+ break;
+ }
+ case MVT::f32:
+ RegList = SRegList;
+ break;
+ case MVT::f64:
+ RegList = DRegList;
+ break;
+ case MVT::v2f64:
+ RegList = QRegList;
+ break;
+ default:
+ llvm_unreachable("Unexpected member type for block aggregate");
+ break;
+ }
+
+ unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+ if (RegResult) {
+ for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin();
+ It != PendingMembers.end(); ++It) {
+ It->convertToReg(RegResult);
+ State.addLoc(*It);
+ ++RegResult; // members take consecutive register numbers starting at the block's first
+ }
+ PendingMembers.clear();
+ return true;
+ }
+
+ // Register allocation failed, we'll be needing the stack
+ unsigned Size = LocVT.getSizeInBits() / 8; // size of one member in bytes
+ if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) {
+ // If nothing else has used the stack until this point, a non-HFA aggregate
+ // can be split between regs and stack.
+ unsigned RegIdx = State.getFirstUnallocated(RegList);
+ for (auto &It : PendingMembers) {
+ if (RegIdx >= RegList.size())
+ It.convertToMem(State.AllocateStack(Size, Size));
+ else
+ It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
+
+ State.addLoc(It);
+ }
+ PendingMembers.clear();
+ return true;
+ } else if (LocVT != MVT::i32)
+ RegList = SRegList; // for non-i32 members, the S registers are the ones marked unavailable below
+
+ // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
+ for (auto Reg : RegList)
+ State.AllocateReg(Reg);
+
+ for (auto &It : PendingMembers) {
+ It.convertToMem(State.AllocateStack(Size, Align));
+ State.addLoc(It);
+
+ // After the first item has been allocated, the rest are packed as tightly
+ // as possible. (E.g. an incoming i64 would have starting Align of 8, but
+ // we'll be allocating a bunch of i32 slots).
+ Align = Size;
+ }
+
+ // All pending members have now been allocated
+ PendingMembers.clear();
+
+ // This will be allocated by the last member of the aggregate
+ return true;
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
new file mode 100644
index 000000000000..9c278a52a7ff
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -0,0 +1,310 @@
+//===-- ARMCallingConv.td - Calling Conventions for ARM ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for ARM architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Apply action A only if the arg's original alignment equals Align.
+class CCIfAlign<string Align, CCAction A>:
+ CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_ARM_APCS : CallingConv<[
+
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is passed in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
+ CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
+
+ CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+ CCIfType<[v2f64], CCAssignToStack<16, 4>>
+]>;
+
+def RetCC_ARM_APCS : CallingConv<[
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[f32], CCBitConvertToType<i32>>,
+
+ // A SwiftSelf is returned in R10 (a callee saved register).
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is returned in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
+
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for FastCC (when VFP2 or later is available)
+//===----------------------------------------------------------------------===//
+def FastCC_ARM_APCS : CallingConv<[
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+ CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+ S9, S10, S11, S12, S13, S14, S15]>>,
+
+ // CPRCs may be allocated to co-processor registers or the stack - they
+ // may never be allocated to core registers.
+ CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f64], CCAssignToStackWithShadow<8, 4, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[v2f64], CCAssignToStackWithShadow<16, 4, [Q0, Q1, Q2, Q3]>>,
+
+ CCDelegateTo<CC_ARM_APCS>
+]>;
+
+def RetFastCC_ARM_APCS : CallingConv<[
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+ CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+ S9, S10, S11, S12, S13, S14, S15]>>,
+ CCDelegateTo<RetCC_ARM_APCS>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for GHC
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_APCS_GHC : CallingConv<[
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+ CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>,
+ CCIfType<[f32], CCAssignToReg<[S16, S17, S18, S19, S20, S21, S22, S23]>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, SpLim
+ CCIfType<[i32], CCAssignToReg<[R4, R5, R6, R7, R8, R9, R10, R11]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention, common parts
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_Common : CallingConv<[
+
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // i64/f64 is passed in even pairs of GPRs
+ // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register
+ // (and the same is true for f64 if VFP is not enabled)
+ CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>,
+ CCIfType<[i32], CCIf<"ArgFlags.getOrigAlign() != 8",
+ CCAssignToReg<[R0, R1, R2, R3]>>>,
+
+ CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>,
+ CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>,
+ CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[v2f64], CCIfAlign<"16",
+ CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>,
+ CCIfType<[v2f64], CCAssignToStackWithShadow<16, 8, [Q0, Q1, Q2, Q3]>>
+]>;
+
+def RetCC_ARM_AAPCS_Common : CallingConv<[
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // The 'nest' parameter, if any, is passed in R12.
+ CCIfNest<CCAssignToReg<[R12]>>,
+
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is passed in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+ CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
+ CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS : CallingConv<[
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ // A SwiftSelf is returned in R10 (a callee saved register).
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is returned in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+ CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
+ CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS-VFP (EABI) Calling Convention
+// Also used for FastCC (when VFP2 or later is available)
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_VFP : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is passed in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+ // HFAs are passed in a contiguous block of registers, or on the stack
+ CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
+
+ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+ CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+ S9, S10, S11, S12, S13, S14, S15]>>,
+ CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS_VFP : CallingConv<[
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ // A SwiftSelf is returned in R10 (a callee saved register).
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is returned in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+ CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+ S9, S10, S11, S12, S13, S14, S15]>>,
+ CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+def CSR_FPRegs : CalleeSavedRegs<(add (sequence "D%u", 0, 31))>;
+
+def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
+ (sequence "D%u", 15, 8))>;
+
+// The order of callee-saved registers needs to match the order we actually push
+// them in FrameLowering, because this order is what's used by
+// PrologEpilogInserter to allocate frame index slots. So when R7 is the frame
+// pointer, we use this AAPCS alternative.
+def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
+ R11, R10, R9, R8,
+ (sequence "D%u", 15, 8))>;
+
+// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
+// and the pointer return value are both passed in R0 in these cases, this can
+// be partially modelled by treating R0 as a callee-saved register
+// Only the resulting RegMask is used; the SaveList is ignored
+def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
+ R5, R4, (sequence "D%u", 15, 8),
+ R0)>;
+
+// iOS ABI deviates from ARM standard ABI. R9 is not a callee-saved register.
+// Also save R7-R4 first to match the stack frame fixed spill areas.
+def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
+
+// R6 is used to pass swifterror, remove it from CSR.
+def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R6)>;
+
+def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
+ (sub CSR_AAPCS_ThisReturn, R9))>;
+
+def CSR_iOS_TLSCall : CalleeSavedRegs<(add LR, SP,
+ (sequence "R%u", 12, 1),
+ (sequence "D%u", 31, 0))>;
+
+// C++ TLS access function saves all registers except SP. Try to match
+// the order of CSRs in CSR_iOS.
+def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1),
+ (sequence "D%u", 31, 0))>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR, R12, R11, R7, R5, R4)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS,
+ CSR_iOS_CXX_TLS_PE)>;
+
+// The "interrupt" attribute is used to generate code that is acceptable in
+// exception-handlers of various kinds. It makes us use a different return
+// instruction (handled elsewhere) and affects which registers we must return to
+// our "caller" in the same state as we receive them.
+
+// For most interrupts, all registers except SP and LR are shared with
+// user-space. We mark LR to be saved anyway, since this is what the ARM backend
+// generally does rather than tracking its liveness as a normal register.
+def CSR_GenericInt : CalleeSavedRegs<(add LR, (sequence "R%u", 12, 0))>;
+
+// The fast interrupt handlers have more private state and get their own copies
+// of R8-R12, in addition to SP and LR. As before, mark LR for saving too.
+
+// FIXME: we mark R11 as callee-saved since it's often the frame-pointer, and
+// current frame lowering expects to encounter it while processing callee-saved
+// registers.
+def CSR_FIQ : CalleeSavedRegs<(add LR, R11, (sequence "R%u", 7, 0))>;
+
+
diff --git a/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp b/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp
new file mode 100644
index 000000000000..64f187d17e64
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp
@@ -0,0 +1,72 @@
+//===--- ARMComputeBlockSize.cpp - Compute machine block sizes ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBasicBlockInfo.h"
+using namespace llvm;
+
+namespace llvm {
+
+// mayOptimizeThumb2Instruction - Tells whether MI has one of the opcodes that
+// a later Thumb2 optimization step may shrink. The comment in front of each
+// group of opcodes names the step that deals with it.
+static bool
+mayOptimizeThumb2Instruction(const MachineInstr *MI) {
+  const unsigned Opcode = MI->getOpcode();
+  const bool MayShrink =
+      // Candidates for optimizeThumb2Instructions.
+      Opcode == ARM::t2LEApcrel ||
+      Opcode == ARM::t2LDRpci ||
+      // Candidates for optimizeThumb2Branches.
+      Opcode == ARM::t2B ||
+      Opcode == ARM::t2Bcc ||
+      Opcode == ARM::tBcc ||
+      // Candidate for optimizeThumb2JumpTables.
+      Opcode == ARM::t2BR_JT;
+  return MayShrink;
+}
+
+void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB,
+                      BasicBlockInfo &BBI) {
+  BBI.PostAlign = 0;
+  BBI.Unalign = 0;
+  BBI.Size = 0;
+  const bool InThumbMode = MF->getInfo<ARMFunctionInfo>()->isThumbFunction();
+  const auto *InstrInfo =
+      static_cast<const ARMBaseInstrInfo *>(MF->getSubtarget().getInstrInfo());
+  for (MachineInstr &Instr : *MBB) {
+    BBI.Size += InstrInfo->getInstSizeInBytes(Instr);
+    if (Instr.isInlineAsm()) {
+      // The size reported for inline asm is only a conservative estimate; the
+      // real size may be smaller, though still a multiple of the instr size.
+      BBI.Unalign = InThumbMode ? 1 : 2;
+    } else if (InThumbMode && mayOptimizeThumb2Instruction(&Instr)) {
+      // Instructions that may be shrunk later are taken into account too.
+      BBI.Unalign = 1;
+    }
+  }
+
+  // A block that ends in tBR_JTr carries a .align 2 directive.
+  if (MBB->empty() || MBB->back().getOpcode() != ARM::tBR_JTr)
+    return;
+  BBI.PostAlign = 2;
+  MBB->getParent()->ensureAlignment(2);
+}
+
+std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF) {
+  // One default-constructed slot per block ID, indexed by block number.
+  std::vector<BasicBlockInfo> Sizes(MF->getNumBlockIDs());
+  for (MachineBasicBlock &Block : *MF) {
+    BasicBlockInfo &Slot = Sizes[Block.getNumber()];
+    computeBlockSize(MF, &Block, Slot);
+  }
+  return Sizes;
+}
+
+} // end namespace
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
new file mode 100644
index 000000000000..be1a37e3e362
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -0,0 +1,2258 @@
+//===-- ARMConstantIslandPass.cpp - ARM constant islands ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that splits the constant pool up into 'islands'
+// which are scattered through-out the function. This is required due to the
+// limited pc-relative displacements that ARM has.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBasicBlockInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-cp-islands"
+
+STATISTIC(NumCPEs, "Number of constpool entries"); // Bumped once per entry placed by doInitialConstPlacement.
+STATISTIC(NumSplit, "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed, "Number of cond branches fixed");
+STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+STATISTIC(NumTBs, "Number of table branches generated");
+STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk");
+STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
+STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed");
+STATISTIC(NumJTMoved, "Number of jump table destination blocks moved");
+STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
+
+
+static cl::opt<bool> // Gates the jump table block reordering done at the start of runOnMachineFunction.
+AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
+ cl::desc("Adjust basic block layout to better use TB[BH]"));
+
+static cl::opt<unsigned> // Cap on constant pool placement rounds; from half of it on, closer water is requested.
+CPMaxIteration("arm-constant-island-max-iteration", cl::Hidden, cl::init(30),
+ cl::desc("The max number of iteration for converge"));
+
+static cl::opt<bool> SynthesizeThumb1TBB( // Lets Thumb1-only functions take part in TB[BH] generation.
+ "arm-synthesize-thumb-1-tbb", cl::Hidden, cl::init(true),
+ cl::desc("Use compressed jump tables in Thumb-1 by synthesizing an "
+ "equivalent to the TBB/TBH instructions"));
+
+namespace {
+ /// ARMConstantIslands - Due to limited PC-relative displacements, ARM
+ /// requires constant pool entries to be scattered among the instructions
+ /// inside a function. To do this, it completely ignores the normal LLVM
+ /// constant pool; instead, it places constants wherever it feels like with
+ /// special instructions.
+ ///
+ /// The terminology used in this pass includes:
+ /// Islands - Clumps of constants placed in the function.
+ /// Water - Potential places where an island could be formed.
+ /// CPE - A constant pool entry that has been placed somewhere, which
+ /// tracks a list of users.
+ class ARMConstantIslands : public MachineFunctionPass {
+
+ std::vector<BasicBlockInfo> BBInfo; // Per-block size/offset data, indexed by block number.
+
+ /// WaterList - A sorted list of basic blocks where islands could be placed
+ /// (i.e. blocks that don't fall through to the following block, due
+ /// to a return, unreachable, or unconditional branch).
+ std::vector<MachineBasicBlock*> WaterList;
+
+ /// NewWaterList - The subset of WaterList that was created since the
+ /// previous iteration by inserting unconditional branches.
+ SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+ typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+ /// CPUser - One user of a constant pool, keeping the machine instruction
+ /// pointer, the constant pool being referenced, and the max displacement
+ /// allowed from the instruction to the CP. The HighWaterMark records the
+ /// highest basic block where a new CPEntry can be placed. To ensure this
+ /// pass terminates, the CP entries are initially placed at the end of the
+ /// function and then move monotonically to lower addresses. The
+ /// exception to this rule is when the current CP entry for a particular
+ /// CPUser is out of range, but there is another CP entry for the same
+ /// constant value in range. We want to use the existing in-range CP
+ /// entry, but if it later moves out of range, the search for new water
+ /// should resume where it left off. The HighWaterMark is used to record
+ /// that point.
+ struct CPUser {
+ MachineInstr *MI;
+ MachineInstr *CPEMI;
+ MachineBasicBlock *HighWaterMark;
+ unsigned MaxDisp;
+ bool NegOk;
+ bool IsSoImm;
+ bool KnownAlignment;
+ CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+ bool neg, bool soimm)
+ : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm),
+ KnownAlignment(false) {
+ HighWaterMark = CPEMI->getParent(); // Starts at the block that holds the entry.
+ }
+ /// getMaxDisp - Returns the maximum displacement supported by MI, less a
+ /// conservative 2 bytes to handle weird alignment effects. A further
+ /// 2 bytes are given up while the alignment is unknown (!KnownAlignment).
+ unsigned getMaxDisp() const {
+ return (KnownAlignment ? MaxDisp : MaxDisp - 2) - 2;
+ }
+ };
+
+ /// CPUsers - Keep track of all of the machine instructions that use various
+ /// constant pools and their max displacement.
+ std::vector<CPUser> CPUsers;
+
+ /// CPEntry - One per constant pool entry, keeping the machine instruction
+ /// pointer, the constpool index, and the number of CPUser's which
+ /// reference this entry.
+ struct CPEntry {
+ MachineInstr *CPEMI;
+ unsigned CPI;
+ unsigned RefCount;
+ CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+ : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+ };
+
+ /// CPEntries - Keep track of all of the constant pool entry machine
+ /// instructions. For each original constpool index (i.e. those that existed
+ /// upon entry to this pass), it keeps a vector of entries. Original
+ /// elements are cloned as we go along; the clones are put in the vector of
+ /// the original element, but have distinct CPIs.
+ ///
+ /// The first half of CPEntries contains generic constants, the second half
+ /// contains jump tables. Use getCombinedIndex on a generic CPEMI to look up
+ /// which vector it will be in here.
+ std::vector<std::vector<CPEntry> > CPEntries;
+
+ /// Maps a JT index to the offset in CPEntries containing copies of that
+ /// table. The equivalent map for a CONSTPOOL_ENTRY is the identity.
+ DenseMap<int, int> JumpTableEntryIndices;
+
+ /// Maps a JT index to the position in CPUsers of the LEA that actually uses
+ /// the index to calculate its base address.
+ DenseMap<int, int> JumpTableUserIndices;
+
+ /// ImmBranch - One per immediate branch, keeping the machine instruction
+ /// pointer, conditional or unconditional, the max displacement,
+ /// and (if isCond is true) the corresponding unconditional branch
+ /// opcode.
+ struct ImmBranch {
+ MachineInstr *MI;
+ unsigned MaxDisp : 31;
+ bool isCond : 1;
+ unsigned UncondBr;
+ ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, unsigned ubr)
+ : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+ };
+
+ /// ImmBranches - Keep track of all the immediate branch instructions.
+ ///
+ std::vector<ImmBranch> ImmBranches;
+
+ /// PushPopMIs - Keep track of all the Thumb push / pop instructions.
+ ///
+ SmallVector<MachineInstr*, 4> PushPopMIs;
+
+ /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions.
+ SmallVector<MachineInstr*, 4> T2JumpTables;
+
+ /// HasFarJump - True if any far jump instruction has been emitted during
+ /// the branch fix up pass.
+ bool HasFarJump;
+
+ MachineFunction *MF; // Function being processed; set by runOnMachineFunction.
+ MachineConstantPool *MCP;
+ const ARMBaseInstrInfo *TII;
+ const ARMSubtarget *STI;
+ ARMFunctionInfo *AFI;
+ bool isThumb;
+ bool isThumb1;
+ bool isThumb2;
+ bool isPositionIndependentOrROPI;
+ public:
+ static char ID;
+ ARMConstantIslands() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "ARM constant island placement and branch shortening pass";
+ }
+
+ private:
+ void doInitialConstPlacement(std::vector<MachineInstr *> &CPEMIs);
+ void doInitialJumpTablePlacement(std::vector<MachineInstr *> &CPEMIs);
+ bool BBHasFallthrough(MachineBasicBlock *MBB);
+ CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+ unsigned getCPELogAlign(const MachineInstr *CPEMI);
+ void scanFunctionJumpTables();
+ void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+ MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+ void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+ void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+ bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+ unsigned getCombinedIndex(const MachineInstr *CPEMI);
+ int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+ bool findAvailableWater(CPUser&U, unsigned UserOffset,
+ water_iterator &WaterIter, bool CloserWater);
+ void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+ MachineBasicBlock *&NewMBB);
+ bool handleConstantPoolUser(unsigned CPUserIndex, bool CloserWater);
+ void removeDeadCPEMI(MachineInstr *CPEMI);
+ bool removeUnusedCPEntries();
+ bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+ MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+ bool DoDump = false);
+ bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+ CPUser &U, unsigned &Growth);
+ bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+ bool fixupImmediateBr(ImmBranch &Br);
+ bool fixupConditionalBr(ImmBranch &Br);
+ bool fixupUnconditionalBr(ImmBranch &Br);
+ bool undoLRSpillRestore();
+ bool optimizeThumb2Instructions();
+ bool optimizeThumb2Branches();
+ bool reorderThumb2JumpTables();
+ bool preserveBaseRegister(MachineInstr *JumpMI, MachineInstr *LEAMI,
+ unsigned &DeadSize, bool &CanDeleteLEA,
+ bool &BaseRegKill);
+ bool optimizeThumb2JumpTables();
+ MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB,
+ MachineBasicBlock *JTBB);
+
+ unsigned getOffsetOf(MachineInstr *MI) const;
+ unsigned getUserOffset(CPUser&) const;
+ void dumpBBs();
+ void verify();
+
+ bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+ unsigned Disp, bool NegativeOK, bool IsSoImm = false);
+ bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+ const CPUser &U) { // Convenience overload: takes the limits from U.
+ return isOffsetInRange(UserOffset, TrialOffset,
+ U.getMaxDisp(), U.NegOk, U.IsSoImm);
+ }
+ };
+ char ARMConstantIslands::ID = 0;
+}
+
+/// verify - Self check for builds without NDEBUG: blocks must be ordered by
+/// post-offset and every constant pool user must still reach its entry.
+void ARMConstantIslands::verify() {
+#ifndef NDEBUG
+  auto PostOffsetLess = [this](const MachineBasicBlock &LHS,
+                               const MachineBasicBlock &RHS) {
+    return BBInfo[LHS.getNumber()].postOffset() <
+           BBInfo[RHS.getNumber()].postOffset();
+  };
+  assert(std::is_sorted(MF->begin(), MF->end(), PostOffsetLess));
+  DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
+  for (unsigned Idx = 0, NumUsers = CPUsers.size(); Idx != NumUsers; ++Idx) {
+    CPUser &User = CPUsers[Idx];
+    const unsigned UserOffset = getUserOffset(User);
+    // Add the 2 byte safety margin back so the real max displacement is used.
+    const bool Reachable = isCPEntryInRange(User.MI, UserOffset, User.CPEMI,
+                                            User.getMaxDisp() + 2, User.NegOk,
+                                            /* DoDump = */ true);
+    if (!Reachable) {
+      DEBUG(dbgs() << "Out of range.\n");
+      dumpBBs();
+      DEBUG(MF->dump());
+      llvm_unreachable("Constant pool entry out of range!");
+    }
+    DEBUG(dbgs() << "OK\n");
+  }
+#endif
+}
+
+/// dumpBBs - Debugging aid: prints the offset, alignment fields and size of each BBInfo entry.
+void ARMConstantIslands::dumpBBs() {
+  DEBUG({
+    unsigned BlockNum = 0;
+    for (const BasicBlockInfo &Info : BBInfo) {
+      dbgs() << format("%08x BB#%u\t", Info.Offset, BlockNum)
+             << " kb=" << unsigned(Info.KnownBits) << " ua=" << unsigned(Info.Unalign)
+             << " pa=" << unsigned(Info.PostAlign)
+             << format(" size=%#x\n", Info.Size);
+      ++BlockNum;
+    }
+  });
+}
+
+/// createARMConstantIslandPass - Factory for the constpool island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+  FunctionPass *IslandPass = new ARMConstantIslands();
+  return IslandPass;
+}
+
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // Pass entry point; returns whether the function was changed.
+ MF = &mf;
+ MCP = mf.getConstantPool();
+
+ DEBUG(dbgs() << "***** ARMConstantIslands: "
+ << MCP->getConstants().size() << " CP entries, aligned to "
+ << MCP->getConstantPoolAlignment() << " bytes *****\n");
+
+ STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget());
+ TII = STI->getInstrInfo();
+ isPositionIndependentOrROPI =
+ STI->getTargetLowering()->isPositionIndependent() || STI->isROPI();
+ AFI = MF->getInfo<ARMFunctionInfo>();
+
+ isThumb = AFI->isThumbFunction();
+ isThumb1 = AFI->isThumb1OnlyFunction();
+ isThumb2 = AFI->isThumb2Function();
+
+ HasFarJump = false;
+ bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB); // Thumb1 takes part only when the option allows it.
+
+ // This pass invalidates liveness information when it splits basic blocks.
+ MF->getRegInfo().invalidateLiveness();
+
+ // Renumber all of the machine basic blocks in the function, guaranteeing that
+ // the numbers agree with the position of the block in the function.
+ MF->RenumberBlocks();
+
+ // Try to reorder and otherwise adjust the block layout to make good use
+ // of the TB[BH] instructions.
+ bool MadeChange = false;
+ if (GenerateTBB && AdjustJumpTableBlocks) {
+ scanFunctionJumpTables();
+ MadeChange |= reorderThumb2JumpTables();
+ // Data is out of date, so clear it. It'll be re-computed later.
+ T2JumpTables.clear();
+ // Blocks may have shifted around. Keep the numbering up to date.
+ MF->RenumberBlocks();
+ }
+
+ // Perform the initial placement of the constant pool entries. To start with,
+ // we put them all at the end of the function.
+ std::vector<MachineInstr*> CPEMIs;
+ if (!MCP->isEmpty())
+ doInitialConstPlacement(CPEMIs);
+
+ if (MF->getJumpTableInfo())
+ doInitialJumpTablePlacement(CPEMIs);
+
+ /// The next UID to take is the first unused one.
+ AFI->initPICLabelUId(CPEMIs.size());
+
+ // Do the initial scan of the function, building up information about the
+ // sizes of each block, the location of all the water, and finding all of the
+ // constant pool users.
+ initializeFunctionInfo(CPEMIs);
+ CPEMIs.clear();
+ DEBUG(dumpBBs());
+
+ // Functions with jump tables need an alignment of 4 because they use the ADR
+ // instruction, which aligns the PC to 4 bytes before adding an offset.
+ if (!T2JumpTables.empty())
+ MF->ensureAlignment(2); // Passed as log2(bytes), i.e. 4 bytes.
+
+ /// Remove dead constant pool entries.
+ MadeChange |= removeUnusedCPEntries();
+
+ // Iteratively place constant pool entries and fix up branches until there
+ // is no change.
+ unsigned NoCPIters = 0, NoBRIters = 0;
+ while (true) {
+ DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+ bool CPChange = false;
+ for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+ // For most inputs, it converges in no more than 5 iterations.
+ // If half of the CPMaxIteration budget is used up without converging, the
+ // input may have huge BB or many CPEs; CloserWater then selects different heuristics.
+ CPChange |= handleConstantPoolUser(i, NoCPIters >= CPMaxIteration / 2);
+ if (CPChange && ++NoCPIters > CPMaxIteration)
+ report_fatal_error("Constant Island pass failed to converge!");
+ DEBUG(dumpBBs());
+
+ // Clear NewWaterList now. If we split a block for branches, it should
+ // appear as "new water" for the next iteration of constant pool placement.
+ NewWaterList.clear();
+
+ DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+ bool BRChange = false;
+ for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+ BRChange |= fixupImmediateBr(ImmBranches[i]);
+ if (BRChange && ++NoBRIters > 30) // Fixed cap on branch fix-up rounds that made a change.
+ report_fatal_error("Branch Fix Up pass failed to converge!");
+ DEBUG(dumpBBs());
+
+ if (!CPChange && !BRChange)
+ break;
+ MadeChange = true;
+ }
+
+ // Shrink 32-bit Thumb2 load and store instructions.
+ if (isThumb2 && !STI->prefers32BitThumb())
+ MadeChange |= optimizeThumb2Instructions();
+
+ // Shrink 32-bit branch instructions.
+ if (isThumb && STI->hasV8MBaselineOps())
+ MadeChange |= optimizeThumb2Branches();
+
+ // Optimize jump tables using TBB / TBH.
+ if (GenerateTBB && !STI->genExecuteOnly())
+ MadeChange |= optimizeThumb2JumpTables();
+
+ // The body of verify() is compiled only when NDEBUG is not defined.
+ verify();
+
+ // If LR has been forced spilled and no far jump (i.e. BL) has been issued,
+ // undo the spill / restore of LR if possible.
+ if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
+ MadeChange |= undoLRSpillRestore();
+
+ // Save the mapping between original and cloned constpool entries.
+ for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+ for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) {
+ const CPEntry & CPE = CPEntries[i][j];
+ if (CPE.CPEMI && CPE.CPEMI->getOperand(1).isCPI()) // Jump table entries carry a JTI operand and are skipped.
+ AFI->recordCPEClone(i, CPE.CPI);
+ }
+ }
+
+ DEBUG(dbgs() << '\n'; dumpBBs());
+
+ BBInfo.clear(); // Drop all per-function state before returning.
+ WaterList.clear();
+ CPUsers.clear();
+ CPEntries.clear();
+ JumpTableEntryIndices.clear();
+ JumpTableUserIndices.clear();
+ ImmBranches.clear();
+ PushPopMIs.clear();
+ T2JumpTables.clear();
+
+ return MadeChange;
+}
+
+/// \brief Perform the initial placement of the regular constant pool entries.
+/// All of them go into one new block appended to the function; CPEMIs and
+/// CPEntries each receive one element per constant, in constpool index order.
+void
+ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) {
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+  MF->push_back(BB);
+
+  // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+  unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+  // Mark the basic block as required by the const-pool.
+  BB->setAlignment(MaxAlign);
+
+  // The function needs to be as aligned as the basic blocks. The linker may
+  // move functions around based on their alignment.
+  MF->ensureAlignment(BB->getAlignment());
+
+  // Order the entries in BB by descending alignment. That ensures correct
+  // alignment of all entries as long as BB is sufficiently aligned. Keep
+  // track of the insertion point for each alignment. We are going to bucket
+  // sort the entries as they are created.
+  SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+  const DataLayout &TD = MF->getDataLayout();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+    assert(Size >= 4 && "Too small constant pool entry");
+    unsigned Align = CPs[i].getAlignment();
+    assert(isPowerOf2_32(Align) && "Invalid alignment");
+    // Verify that all constant pool entries are a multiple of their alignment.
+    // If not, we would have to pad them out so that instructions stay aligned.
+    assert((Size % Align) == 0 && "CP Entry not multiple of its alignment!");
+
+    // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+    unsigned LogAlign = Log2_32(Align);
+    MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+    MachineInstr *CPEMI =
+        BuildMI(*BB, InsAt, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
+            .addImm(i).addConstantPoolIndex(i).addImm(Size);
+    CPEMIs.push_back(CPEMI);
+
+    // Ensure that future entries with higher alignment get inserted before
+    // CPEMI. This is bucket sort with iterators.
+    for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+      if (InsPoint[a] == InsAt)
+        InsPoint[a] = CPEMI;
+
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    CPEntries.emplace_back(1, CPEntry(CPEMI, i));
+    ++NumCPEs;
+    DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+                 << Size << ", align = " << Align <<'\n');
+  }
+  DEBUG(BB->dump());
+}
+
+/// \brief Do initial placement of the jump tables. Because Thumb2's TBB and TBH
+/// instructions can be made more efficient if the jump table immediately
+/// follows the instruction, it's best to place them immediately next to their
+/// jumps to begin with. In almost all cases they'll never be moved from that
+/// position. Each table gets a new block of its own right after its jump.
+void ARMConstantIslands::doInitialJumpTablePlacement(
+ std::vector<MachineInstr *> &CPEMIs) {
+ unsigned i = CPEntries.size(); // Jump table entries are numbered after the entries already present.
+ auto MJTI = MF->getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+
+ MachineBasicBlock *LastCorrectlyNumberedBB = nullptr;
+ for (MachineBasicBlock &MBB : *MF) {
+ auto MI = MBB.getLastNonDebugInstr();
+ if (MI == MBB.end())
+ continue;
+
+ unsigned JTOpcode; // Pseudo opcode used to represent the table itself.
+ switch (MI->getOpcode()) {
+ default:
+ continue; // Block does not end in a jump table branch.
+ case ARM::BR_JTadd:
+ case ARM::BR_JTr:
+ case ARM::tBR_JTr:
+ case ARM::BR_JTm:
+ JTOpcode = ARM::JUMPTABLE_ADDRS;
+ break;
+ case ARM::t2BR_JT:
+ JTOpcode = ARM::JUMPTABLE_INSTS;
+ break;
+ case ARM::tTBB_JT:
+ case ARM::t2TBB_JT:
+ JTOpcode = ARM::JUMPTABLE_TBB;
+ break;
+ case ARM::tTBH_JT:
+ case ARM::t2TBH_JT:
+ JTOpcode = ARM::JUMPTABLE_TBH;
+ break;
+ }
+
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ MachineOperand JTOp =
+ MI->getOperand(NumOps - (MI->isPredicable() ? 2 : 1)); // Last declared operand, or the one before it if predicable.
+ unsigned JTI = JTOp.getIndex();
+ unsigned Size = JT[JTI].MBBs.size() * sizeof(uint32_t); // Four bytes per destination block.
+ MachineBasicBlock *JumpTableBB = MF->CreateMachineBasicBlock();
+ MF->insert(std::next(MachineFunction::iterator(MBB)), JumpTableBB);
+ MachineInstr *CPEMI = BuildMI(*JumpTableBB, JumpTableBB->begin(),
+ DebugLoc(), TII->get(JTOpcode))
+ .addImm(i++)
+ .addJumpTableIndex(JTI)
+ .addImm(Size);
+ CPEMIs.push_back(CPEMI);
+ CPEntries.emplace_back(1, CPEntry(CPEMI, JTI));
+ JumpTableEntryIndices.insert(std::make_pair(JTI, CPEntries.size() - 1));
+ if (!LastCorrectlyNumberedBB)
+ LastCorrectlyNumberedBB = &MBB; // Only the first block that received a table is remembered.
+ }
+
+ // If we did anything then we need to renumber the subsequent blocks.
+ if (LastCorrectlyNumberedBB)
+ MF->RenumberBlocks(LastCorrectlyNumberedBB);
+}
+
+/// BBHasFallthrough - Report whether control can run off the end of MBB
+/// straight into the block laid out right after it.
+bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) {
+  // The last block of the function has nothing to fall into.
+  const MachineFunction::iterator Following = std::next(MBB->getIterator());
+  if (Following == MBB->getParent()->end())
+    return false;
+
+  // The block that follows in the layout must also be a successor.
+  if (!MBB->isSuccessor(&*Following))
+    return false;
+
+  // Even then the block may already end in an unconditional branch for
+  // whatever reason, so try to analyze the end of the block.
+  SmallVector<MachineOperand, 4> Cond;
+  MachineBasicBlock *TBB, *FBB;
+  if (TII->analyzeBranch(*MBB, TBB, FBB, Cond))
+    return true; // Too difficult to analyze.
+  return FBB == nullptr;
+}
+
+/// findConstPoolEntry - Look up the CPEntry recorded under constpool index
+/// CPI whose instruction is CPEMI; yields nullptr when there is none.
+ARMConstantIslands::CPEntry
+*ARMConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  // Each constpool index should hold only a few entries, hence the plain
+  // linear scan.
+  for (CPEntry &Candidate : CPEntries[CPI]) {
+    if (Candidate.CPEMI != CPEMI)
+      continue;
+    return &Candidate;
+  }
+  return nullptr;
+}
+
+/// getCPELogAlign - Alignment, in log2(bytes) units, required by the constant
+/// pool entry or jump table that CPEMI represents.
+unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+  switch (CPEMI->getOpcode()) {
+  case ARM::JUMPTABLE_ADDRS:
+    return 2;
+  case ARM::JUMPTABLE_INSTS:
+    return 1;
+  case ARM::JUMPTABLE_TBH:
+    return isThumb1 ? 2 : 1;
+  case ARM::JUMPTABLE_TBB:
+    return isThumb1 ? 2 : 0;
+  case ARM::CONSTPOOL_ENTRY:
+    break; // Looked up in the constant pool below.
+  default:
+    llvm_unreachable("unknown constpool entry kind");
+  }
+
+  const unsigned PoolIdx = getCombinedIndex(CPEMI);
+  assert(PoolIdx < MCP->getConstants().size() && "Invalid constant pool index.");
+  const unsigned ByteAlign = MCP->getConstants()[PoolIdx].getAlignment();
+  assert(isPowerOf2_32(ByteAlign) && "Invalid CPE alignment");
+  return Log2_32(ByteAlign);
+}
+
+/// scanFunctionJumpTables - Walk every instruction of the function and
+/// record each t2BR_JT / tBR_JTr branch in T2JumpTables.
+void ARMConstantIslands::scanFunctionJumpTables() {
+  for (MachineBasicBlock &Block : *MF)
+    for (MachineInstr &Instr : Block) {
+      const unsigned Opcode = Instr.getOpcode();
+      const bool IsJTBranch = Opcode == ARM::t2BR_JT || Opcode == ARM::tBR_JTr;
+      if (Instr.isBranch() && IsJTBranch)
+        T2JumpTables.push_back(&Instr);
+    }
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users. CPEMIs is indexed by CPEntries slot.
+void ARMConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+
+ BBInfo = computeAllBlockSizes(MF);
+
+ // The known bits of the entry block offset are determined by the function
+ // alignment.
+ BBInfo.front().KnownBits = MF->getAlignment();
+
+ // Compute block offsets and known bits.
+ adjustBBOffsetsAfter(&MF->front());
+
+ // Now go back through the instructions and build up our data structures.
+ for (MachineBasicBlock &MBB : *MF) {
+ // If this block doesn't fall through into the next MBB, then this is
+ // 'water' where a constant pool island could be placed.
+ if (!BBHasFallthrough(&MBB))
+ WaterList.push_back(&MBB);
+
+ for (MachineInstr &I : MBB) {
+ if (I.isDebugValue())
+ continue;
+
+ unsigned Opc = I.getOpcode();
+ if (I.isBranch()) {
+ bool isCond = false;
+ unsigned Bits = 0;
+ unsigned Scale = 1;
+ int UOpc = Opc; // Unconditional counterpart; stays Opc for unconditional branches.
+ switch (Opc) {
+ default:
+ continue; // Ignore other JT branches
+ case ARM::t2BR_JT:
+ case ARM::tBR_JTr:
+ T2JumpTables.push_back(&I);
+ continue; // Does not get an entry in ImmBranches
+ case ARM::Bcc:
+ isCond = true;
+ UOpc = ARM::B;
+ LLVM_FALLTHROUGH;
+ case ARM::B:
+ Bits = 24;
+ Scale = 4;
+ break;
+ case ARM::tBcc:
+ isCond = true;
+ UOpc = ARM::tB;
+ Bits = 8;
+ Scale = 2;
+ break;
+ case ARM::tB:
+ Bits = 11;
+ Scale = 2;
+ break;
+ case ARM::t2Bcc:
+ isCond = true;
+ UOpc = ARM::t2B;
+ Bits = 20;
+ Scale = 2;
+ break;
+ case ARM::t2B:
+ Bits = 24;
+ Scale = 2;
+ break;
+ }
+
+ // Record this immediate branch.
+ unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; // Largest (Bits-1)-bit value, scaled to bytes.
+ ImmBranches.push_back(ImmBranch(&I, MaxOffs, isCond, UOpc));
+ }
+
+ if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
+ PushPopMIs.push_back(&I);
+
+ if (Opc == ARM::CONSTPOOL_ENTRY || Opc == ARM::JUMPTABLE_ADDRS ||
+ Opc == ARM::JUMPTABLE_INSTS || Opc == ARM::JUMPTABLE_TBB ||
+ Opc == ARM::JUMPTABLE_TBH)
+ continue; // The entries themselves are not users.
+
+ // Scan the instructions for constant pool operands.
+ for (unsigned op = 0, e = I.getNumOperands(); op != e; ++op)
+ if (I.getOperand(op).isCPI() || I.getOperand(op).isJTI()) {
+ // We found one. The addressing mode tells us the max displacement
+ // from the PC that this instruction permits.
+
+ // The opcode selects the width and scale of the displacement field.
+ unsigned Bits = 0;
+ unsigned Scale = 1;
+ bool NegOk = false;
+ bool IsSoImm = false;
+
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unknown addressing mode for CP reference!");
+
+ // Taking the address of a CP entry.
+ case ARM::LEApcrel:
+ case ARM::LEApcrelJT:
+ // This takes a SoImm, which is an 8 bit immediate rotated. We'll
+ // pretend the maximum offset is 255 * 4. Since each instruction is
+ // 4 bytes wide, this is always correct. We'll check for other
+ // displacements that fit in a SoImm as well.
+ Bits = 8;
+ Scale = 4;
+ NegOk = true;
+ IsSoImm = true;
+ break;
+ case ARM::t2LEApcrel:
+ case ARM::t2LEApcrelJT:
+ Bits = 12;
+ NegOk = true;
+ break;
+ case ARM::tLEApcrel:
+ case ARM::tLEApcrelJT:
+ Bits = 8;
+ Scale = 4;
+ break;
+
+ case ARM::LDRBi12:
+ case ARM::LDRi12:
+ case ARM::LDRcp:
+ case ARM::t2LDRpci:
+ case ARM::t2LDRHpci:
+ Bits = 12; // +-offset_12
+ NegOk = true;
+ break;
+
+ case ARM::tLDRpci:
+ Bits = 8;
+ Scale = 4; // +(offset_8*4)
+ break;
+
+ case ARM::VLDRD:
+ case ARM::VLDRS:
+ Bits = 8;
+ Scale = 4; // +-(offset_8*4)
+ NegOk = true;
+ break;
+
+ case ARM::tLDRHi:
+ Bits = 5;
+ Scale = 2; // +(offset_5*2)
+ break;
+ }
+
+ // Remember that this is a user of a CP entry.
+ unsigned CPI = I.getOperand(op).getIndex();
+ if (I.getOperand(op).isJTI()) {
+ JumpTableUserIndices.insert(std::make_pair(CPI, CPUsers.size()));
+ CPI = JumpTableEntryIndices[CPI]; // Translate the JT index into its CPEntries slot.
+ }
+
+ MachineInstr *CPEMI = CPEMIs[CPI];
+ unsigned MaxOffs = ((1 << Bits)-1) * Scale; // Largest Bits-bit value, scaled to bytes.
+ CPUsers.push_back(CPUser(&I, CPEMI, MaxOffs, NegOk, IsSoImm));
+
+ // Increment corresponding CPEntry reference count.
+ CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+ assert(CPE && "Cannot find a corresponding CPEntry!");
+ CPE->RefCount++;
+
+ // Instructions can only use one CP entry, don't bother scanning the
+ // rest of the operands.
+ break;
+ }
+ }
+ }
+}
+
+/// getOffsetOf - Compute MI's current offset from the start of the function.
+/// The value changes as stuff is moved around inside the function.
+unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *Parent = MI->getParent();
+
+  // Start from the recorded offset of the enclosing block ...
+  unsigned Offset = BBInfo[Parent->getNumber()].Offset;
+
+  // ... then add the size of every instruction that precedes MI in it.
+  MachineBasicBlock::iterator Cursor = Parent->begin();
+  while (&*Cursor != MI) {
+    assert(Cursor != Parent->end() &&
+           "Didn't find MI in its own basic block?");
+    Offset += TII->getInstSizeInBytes(*Cursor);
+    ++Cursor;
+  }
+  return Offset;
+}
+
+/// CompareMBBNumbers - Strict-weak-ordering predicate over basic blocks that
+/// orders them by block number; used to keep the WaterList sorted.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  // LHS sorts first exactly when RHS carries the larger number.
+  return RHS->getNumber() > LHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - Bring the pass's bookkeeping back in sync
+/// after NewBB has been inserted into the machine function: renumber the
+/// blocks, open a slot for NewBB in BBInfo, and register NewBB as water.
+void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
+  // Block numbers must stay consecutive, so renumber from NewBB onwards.
+  MachineFunction *Fn = NewBB->getParent();
+  Fn->RenumberBlocks(NewBB);
+
+  // BBInfo is indexed by block number; give the new block a fresh entry at
+  // its (just assigned) position.
+  const int NewNum = NewBB->getNumber();
+  BBInfo.insert(BBInfo.begin() + NewNum, BasicBlockInfo());
+
+  // WaterList is kept sorted by block number; insert NewBB at the position
+  // that preserves that order.
+  WaterList.insert(std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                                    CompareMBBNumbers),
+                   NewBB);
+}
+
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch. Update data structures and renumber blocks to
+/// account for this change. MI and every instruction after it move to the new
+/// block, which is inserted directly after the original one and returned.
+MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = ++OrigBB->getIterator();
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B;
+  if (!isThumb)
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
+  else
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB)
+            .addImm(ARMCC::AL).addReg(0);
+  ++NumSplit;
+
+  // Update the CFG. All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as updateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList. Specifically, we need to add OrigMBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  // lower_bound yields end() when every water block is numbered below OrigBB,
+  // so IP has to be checked before it is dereferenced.
+  if (IP != WaterList.end() && *IP == OrigBB)
+    WaterList.insert(std::next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
+
+  // Figure out how large the OrigBB is. As the first half of the original
+  // block, it cannot contain a tablejump. The size includes
+  // the new jump we added. (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(MF, OrigBB, BBInfo[OrigBB->getNumber()]);
+
+  // Figure out how large the NewMBB is. As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(MF, NewBB, BBInfo[NewBB->getNumber()]);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+/// getUserOffset - Return the offset of U.MI in the form the hardware uses as
+/// the base of a PC-relative displacement. As a side effect,
+/// U.KnownAlignment is refreshed from the known bits of U.MI's current block.
+unsigned ARMConstantIslands::getUserOffset(CPUser &U) const {
+  const BasicBlockInfo &UserBBI = BBInfo[U.MI->getParent()->getNumber()];
+
+  // Inline assembly can leave the alignment (mod 4) of U.MI unknown. Record
+  // whether it is known so that U.getMaxDisp() can narrow the range if not.
+  U.KnownAlignment = (UserBBI.internalKnownBits() >= 2);
+
+  // Reading PC yields the instruction address plus 4 (Thumb) or 8 (ARM).
+  unsigned PCValue = getOffsetOf(U.MI) + (isThumb ? 4 : 8);
+
+  // On Thumb the hardware rounds offsets that are 2 mod 4 down before adding
+  // the displacement. Mirror that here when the alignment is known; when it
+  // is not, getMaxDisp() constrains the range instead.
+  if (isThumb && U.KnownAlignment)
+    PCValue &= ~3u;
+
+  return PCValue;
+}
+
+/// isOffsetInRange - Decide whether a constant pool entry placed at
+/// TrialOffset can be reached from a reference at UserOffset using a
+/// displacement of at most MaxDisp. Entries located before the user are only
+/// acceptable when NegativeOK is set.
+/// UserOffset must already include the PC adjustment (see getUserOffset). If
+/// its alignment mod 4 is unknown, the caller is expected to have reduced
+/// MaxDisp accordingly; CPUser::getMaxDisp() does that.
+bool ARMConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                         unsigned TrialOffset, unsigned MaxDisp,
+                                         bool NegativeOK, bool IsSoImm) {
+  // FIXME: Make use full range of soimm values.
+
+  // Entry at or after the user: a forward displacement is always permitted.
+  if (UserOffset <= TrialOffset)
+    return TrialOffset - UserOffset <= MaxDisp;
+
+  // Entry before the user: needs a negative displacement.
+  return NegativeOK && UserOffset - TrialOffset <= MaxDisp;
+}
+
+/// isWaterInRange - Check whether a CPE inserted right after the block Water
+/// would be reachable from the user U located at UserOffset.
+///
+/// Growth receives the number of bytes by which the function would grow if
+/// the CPE were placed there (0 when it fits in existing padding).
+bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
+                                        MachineBasicBlock* Water, CPUser &U,
+                                        unsigned &Growth) {
+  const BasicBlockInfo &WaterBBI = BBInfo[Water->getNumber()];
+  unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+
+  // Extent the CPE would occupy when placed after Water.
+  unsigned CPEOffset = WaterBBI.postOffset(CPELogAlign);
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // Where the block following Water begins, and how it is aligned. When Water
+  // is the last block, use the end of Water with no alignment requirement.
+  unsigned NextBlockOffset = WaterBBI.postOffset();
+  unsigned NextBlockAlignment = 0;
+  MachineFunction::const_iterator NextBlock = Water->getIterator();
+  ++NextBlock;
+  if (NextBlock != MF->end()) {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block, in which case nothing grows.
+  Growth = 0;
+  if (CPEEnd > NextBlockOffset) {
+    // It does not fit: the function grows by the overhang plus whatever
+    // padding is needed after the CPE to realign the next block.
+    Growth = CPEEnd - NextBlockOffset;
+    Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
+
+    // A CPE inserted ahead of the user pushes the user to a higher offset.
+    // Also allow for unknown alignment padding in the blocks in between.
+    if (CPEOffset < UserOffset)
+      UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign);
+  }
+
+  return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Check whether the constant pool entry instruction CPEMI
+/// lies within MaxDisp of the user MI at UserOffset (entries before the user
+/// count only if NegOk). With DoDump set, the details of the comparison are
+/// written to the debug stream.
+bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                                      MachineInstr *CPEMI, unsigned MaxDisp,
+                                      bool NegOk, bool DoDump) {
+  unsigned EntryOffset = getOffsetOf(CPEMI);
+
+  if (DoDump) {
+    DEBUG({
+      unsigned UserBBNum = MI->getParent()->getNumber();
+      const BasicBlockInfo &UserBBI = BBInfo[UserBBNum];
+      dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+             << " max delta=" << MaxDisp
+             << format(" insn address=%#x", UserOffset)
+             << " in BB#" << UserBBNum << ": "
+             << format("%#x-%x\t", UserBBI.Offset, UserBBI.postOffset()) << *MI
+             << format("CPE address=%#x offset=%+d: ", EntryOffset,
+                       int(EntryOffset-UserOffset));
+    });
+  }
+
+  return isOffsetInRange(UserOffset, EntryOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true if the specified basic block has exactly one
+/// predecessor and one successor, and the predecessor ends in an
+/// unconditional branch straight to that successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  const bool SinglePred = MBB->pred_size() == 1;
+  const bool SingleSucc = MBB->succ_size() == 1;
+  if (!SinglePred || !SingleSucc)
+    return false;
+
+  MachineBasicBlock *OnlySucc = *MBB->succ_begin();
+  MachineBasicBlock *OnlyPred = *MBB->pred_begin();
+  MachineInstr *LastMI = &OnlyPred->back();
+
+  // Only the three unconditional branch opcodes qualify.
+  switch (LastMI->getOpcode()) {
+  case ARM::B:
+  case ARM::tB:
+  case ARM::t2B:
+    return LastMI->getOperand(0).getMBB() == OnlySucc;
+  default:
+    return false;
+  }
+}
+#endif // NDEBUG
+
+/// adjustBBOffsetsAfter - Recompute the recorded Offset and KnownBits of the
+/// blocks that follow BB in layout order, stopping early once the values are
+/// found to be unchanged.
+void ARMConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  const unsigned StartNum = BB->getNumber();
+  const unsigned NumBlocks = MF->getNumBlockIDs();
+
+  for (unsigned Idx = StartNum + 1; Idx < NumBlocks; ++Idx) {
+    const BasicBlockInfo &Prev = BBInfo[Idx - 1];
+    BasicBlockInfo &Cur = BBInfo[Idx];
+
+    // Block Idx starts where its layout predecessor ends, rounded up to the
+    // alignment of block Idx itself.
+    unsigned LogAlign = MF->getBlockNumbered(Idx)->getAlignment();
+    unsigned NewOffset = Prev.postOffset(LogAlign);
+    unsigned NewKnownBits = Prev.postKnownBits(LogAlign);
+
+    // At most two blocks are changed before this function is called, so the
+    // first two are always refreshed. After that, an unchanged block means
+    // everything further down is already correct.
+    const bool PastForcedBlocks = Idx > StartNum + 2;
+    if (PastForcedBlocks && Cur.Offset == NewOffset &&
+        Cur.KnownBits == NewKnownBits)
+      break;
+
+    Cur.Offset = NewOffset;
+    Cur.KnownBits = NewKnownBits;
+  }
+}
+
+/// decrementCPEReferenceCount - Drop one reference from the constant pool
+/// entry identified by index CPI and instruction CPEMI. When that was the
+/// last reference, the instruction is removed and the entry is cleared.
+/// Returns true if the entry was removed, false if it is still in use.
+
+bool ARMConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+                                                    MachineInstr *CPEMI) {
+  CPEntry *Entry = findConstPoolEntry(CPI, CPEMI);
+  assert(Entry && "Unexpected!");
+
+  --Entry->RefCount;
+  if (Entry->RefCount != 0)
+    return false;
+
+  // Last user is gone: eliminate the entry.
+  removeDeadCPEMI(CPEMI);
+  Entry->CPEMI = nullptr;
+  --NumCPEs;
+  return true;
+}
+
+/// getCombinedIndex - Return the index used to look up CPEMI's entries: the
+/// index of operand 1 itself when that operand is a constant pool index,
+/// otherwise that index translated through JumpTableEntryIndices.
+unsigned ARMConstantIslands::getCombinedIndex(const MachineInstr *CPEMI) {
+  const MachineOperand &Ref = CPEMI->getOperand(1);
+  const unsigned RawIndex = Ref.getIndex();
+
+  if (!Ref.isCPI())
+    return JumpTableEntryIndices[RawIndex];
+  return RawIndex;
+}
+
+/// findInRangeCPEntry - See if the CPE currently referenced by U is in range;
+/// if not, look for an existing clone of that CPE which is in range and, if
+/// one exists, retarget the user at it. Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *OldCPEMI = U.CPEMI;
+
+  // The entry the user already points at may be fine as it is.
+  if (isCPEntryInRange(UserMI, UserOffset, OldCPEMI, U.getMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    return 1;
+  }
+
+  // Otherwise walk the clones that were created earlier for the same CPI.
+  unsigned CPI = getCombinedIndex(OldCPEMI);
+  for (CPEntry &Clone : CPEntries[CPI]) {
+    // Skip the entry we just tried, and slots emptied by earlier removals.
+    if (Clone.CPEMI == OldCPEMI || Clone.CPEMI == nullptr)
+      continue;
+    if (!isCPEntryInRange(UserMI, UserOffset, Clone.CPEMI, U.getMaxDisp(),
+                          U.NegOk))
+      continue;
+
+    DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                 << Clone.CPI << "\n");
+    // Point the CPUser node to the replacement.
+    U.CPEMI = Clone.CPEMI;
+    // Rewrite the first constant pool operand of the user to name the clone.
+    for (unsigned OpIdx = 0, NumOps = UserMI->getNumOperands();
+         OpIdx != NumOps; ++OpIdx) {
+      if (!UserMI->getOperand(OpIdx).isCPI())
+        continue;
+      UserMI->getOperand(OpIdx).setIndex(Clone.CPI);
+      break;
+    }
+    // The clone gains a reference and the original loses one. If the
+    // original survives, no addresses changed and no extra pass is needed.
+    Clone.RefCount++;
+    return decrementCPEReferenceCount(CPI, OldCPEMI) ? 2 : 1;
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Return the largest displacement that the given
+/// unconditional branch opcode can encode.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  if (Opc == ARM::tB)
+    return ((1<<10)-1)*2;
+  if (Opc == ARM::t2B)
+    return ((1<<23)-1)*2;
+
+  // Every other opcode gets the widest range.
+  return ((1<<23)-1)*4;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not (including when the WaterList is
+/// empty). If it returns true, WaterIter
+/// is set to the WaterList entry. For Thumb, prefer water that will not
+/// introduce padding to water that will. To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range. When CloserWater is set, return false up front if the post-offset of U's block exceeds half of U's maximum displacement, and accept U's own block as soon as it qualifies.
+bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+ water_iterator &WaterIter,
+ bool CloserWater) {
+ if (WaterList.empty())
+ return false;
+
+ unsigned BestGrowth = ~0u; // ~0u marks "no acceptable water recorded yet".
+ // The nearest water without splitting the UserBB is right after it.
+ // If the distance is still large (we have a big BB), then we need to split it
+ // if we don't converge after certain iterations. This helps the following
+ // situation to converge:
+ // BB0:
+ // Big BB
+ // BB1:
+ // Constant Pool
+ // When a CP access is out of range, BB0 may be used as water. However,
+ // inserting islands between BB0 and BB1 makes other accesses out of range.
+ MachineBasicBlock *UserBB = U.MI->getParent();
+ unsigned MinNoSplitDisp =
+ BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI));
+ if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2)
+ return false;
+ for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
+ --IP) {
+ MachineBasicBlock* WaterBB = *IP;
+ // Check if water is in range and is either at a lower address than the
+ // current "high water mark" or a new water block that was created since
+ // the previous iteration by inserting an unconditional branch. In the
+ // latter case, we want to allow resetting the high water mark back to
+ // this new water since we haven't seen it before. Inserting branches
+ // should be relatively uncommon and when it does happen, we want to be
+ // sure to take advantage of it for all the CPEs near that block, so that
+ // we don't insert more branches than necessary.
+ // When CloserWater is true, we try to find the lowest address after (or
+ // equal to) user MI's BB no matter of padding growth.
+ unsigned Growth;
+ if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+ (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+ NewWaterList.count(WaterBB) || WaterBB == U.MI->getParent()) &&
+ Growth < BestGrowth) {
+ // This is the least amount of required padding seen so far.
+ BestGrowth = Growth;
+ WaterIter = IP;
+ DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+ << " Growth=" << Growth << '\n');
+
+ if (CloserWater && WaterBB == U.MI->getParent())
+ return true;
+ // Keep looking unless it is perfect and we're not looking for the lowest
+ // possible address.
+ if (!CloserWater && BestGrowth == 0)
+ return true;
+ }
+ if (IP == B) // The first WaterList entry has just been examined; stop.
+ break;
+ }
+ return BestGrowth != ~0u; // True iff some candidate was recorded above.
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct. Otherwise the block is split to create a hole with an
+/// unconditional branch around it. In either case NewMBB is set to a
+/// block before which the new island can be inserted (the WaterList
+/// is not adjusted by the end-of-block path; the split path goes through splitBlockBeforeInstr).
+void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
+ unsigned UserOffset,
+ MachineBasicBlock *&NewMBB) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ unsigned CPELogAlign = getCPELogAlign(CPEMI);
+ MachineBasicBlock *UserMBB = UserMI->getParent();
+ const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+ // If the block does not end in an unconditional branch already, and if the
+ // end of the block is within range, make new water there. (The addition
+ // below is for the unconditional branch we will be adding: 4 bytes on ARM +
+ // Thumb2, 2 on Thumb1.)
+ if (BBHasFallthrough(UserMBB)) {
+ // Size of branch to insert.
+ unsigned Delta = isThumb1 ? 2 : 4;
+ // Compute the offset where the CPE will begin.
+ unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+ if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+ DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+ << format(", expected CPE offset %#x\n", CPEOffset));
+ NewMBB = &*++UserMBB->getIterator();
+ // Add an unconditional branch from UserMBB to fallthrough block. Record
+ // it for branch lengthening; this new branch will not get out of range,
+ // but if the preceding conditional branch is out of range, the targets
+ // will be exchanged, and the altered branch may be out of range, so the
+ // machinery has to know about it.
+ int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B;
+ if (!isThumb)
+ BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+ else
+ BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB)
+ .addImm(ARMCC::AL).addReg(0);
+ unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+ ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+ MaxDisp, false, UncondBr));
+ computeBlockSize(MF, UserMBB, BBInfo[UserMBB->getNumber()]);
+ adjustBBOffsetsAfter(UserMBB);
+ return;
+ }
+ }
+
+ // What a big block. Find a place within the block to split it. This is a
+ // little tricky on Thumb1 since instructions are 2 bytes and constant pool
+ // entries are 4 bytes: if instruction I references island CPE, and
+ // instruction I+1 references CPE', it will not work well to put CPE as far
+ // forward as possible, since then CPE' cannot immediately follow it (that
+ // location is 2 bytes farther away from I+1 than CPE was from I) and we'd
+ // need to create a new island. So, we make a first guess, then walk through
+ // the instructions between the one currently being looked at and the
+ // possible insertion point, and make sure any other instructions that
+ // reference CPEs will be able to use the same island area; if not, we back
+ // up the insertion point.
+
+ // Try to split the block so it's fully aligned. Compute the latest split
+ // point where we can add a 4-byte branch instruction, and then align to
+ // LogAlign which is the largest possible alignment in the function.
+ unsigned LogAlign = MF->getAlignment();
+ assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+ unsigned KnownBits = UserBBI.internalKnownBits();
+ unsigned UPad = UnknownPadding(LogAlign, KnownBits);
+ unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
+ DEBUG(dbgs() << format("Split in middle of big block before %#x",
+ BaseInsertOffset));
+
+ // The 4 in the following is for the unconditional branch we'll be inserting
+ // (allows for long branch on Thumb1). Alignment of the island is handled
+ // inside isOffsetInRange.
+ BaseInsertOffset -= 4;
+
+ DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+ << " la=" << LogAlign
+ << " kb=" << KnownBits
+ << " up=" << UPad << '\n');
+
+ // This could point off the end of the block if we've already got constant
+ // pool entries following this block; only the last one is in the water list.
+ // Back past any possible branches (allow for a conditional and a maximally
+ // long unconditional).
+ if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+ // Ensure BaseInsertOffset is larger than the offset of the instruction
+ // following UserMI so that the loop which searches for the split point
+ // iterates at least once.
+ BaseInsertOffset =
+ std::max(UserBBI.postOffset() - UPad - 8,
+ UserOffset + TII->getInstSizeInBytes(*UserMI) + 1);
+ DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+ }
+ unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
+ CPEMI->getOperand(2).getImm();
+ MachineBasicBlock::iterator MI = UserMI;
+ ++MI;
+ unsigned CPUIndex = CPUserIndex+1;
+ unsigned NumCPUsers = CPUsers.size();
+ MachineInstr *LastIT = nullptr;
+ for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI);
+ Offset < BaseInsertOffset;
+ Offset += TII->getInstSizeInBytes(*MI), MI = std::next(MI)) {
+ assert(MI != UserMBB->end() && "Fell off end of block");
+ if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == &*MI) {
+ CPUser &U = CPUsers[CPUIndex]; // Shadows the outer U inside this scope.
+ if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+ // Shift insertion point by one unit of alignment so it is within reach.
+ BaseInsertOffset -= 1u << LogAlign;
+ EndInsertOffset -= 1u << LogAlign;
+ }
+ // This is overly conservative, as we don't account for CPEMIs being
+ // reused within the block, but it doesn't matter much. Also assume CPEs
+ // are added in order with alignment padding. We may eventually be able
+ // to pack the aligned CPEs better.
+ EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+ CPUIndex++;
+ }
+
+ // Remember the last IT instruction.
+ if (MI->getOpcode() == ARM::t2IT)
+ LastIT = &*MI;
+ }
+
+ --MI; // Step back one instruction from where the scan stopped.
+
+ // Avoid splitting an IT block.
+ if (LastIT) {
+ unsigned PredReg = 0;
+ ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
+ if (CC != ARMCC::AL)
+ MI = LastIT;
+ }
+
+ // We really must not split an IT block.
+ DEBUG(unsigned PredReg;
+ assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
+
+ NewMBB = splitBlockBeforeInstr(&*MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range. If so, pick up the constant pool value and move it some
+/// place in-range. Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise. CloserWater is forwarded unchanged to findAvailableWater.
+bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
+ bool CloserWater) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ unsigned CPI = getCombinedIndex(CPEMI);
+ unsigned Size = CPEMI->getOperand(2).getImm();
+ // Compute this only once, it's expensive.
+ unsigned UserOffset = getUserOffset(U);
+
+ // See if the current entry is within range, or there is a clone of it
+ // in range.
+ int result = findInRangeCPEntry(U, UserOffset);
+ if (result==1) return false; // Usable entry found; no code was inserted or deleted.
+ else if (result==2) return true; // Usable entry found, but code was inserted or deleted.
+
+ // No existing clone of this CPE is within range.
+ // We will be generating a new clone. Get a UID for it.
+ unsigned ID = AFI->createPICLabelUId();
+
+ // Look for water where we can place this CPE.
+ MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *NewMBB;
+ water_iterator IP;
+ if (findAvailableWater(U, UserOffset, IP, CloserWater)) {
+ DEBUG(dbgs() << "Found water in range\n");
+ MachineBasicBlock *WaterBB = *IP;
+
+ // If the original WaterList entry was "new water" on this iteration,
+ // propagate that to the new island. This is just keeping NewWaterList
+ // updated to match the WaterList, which will be updated below.
+ if (NewWaterList.erase(WaterBB))
+ NewWaterList.insert(NewIsland);
+
+ // The new CPE goes before the following block (NewMBB).
+ NewMBB = &*++WaterBB->getIterator();
+ } else {
+ // No water found.
+ DEBUG(dbgs() << "No water found\n");
+ createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+ // splitBlockBeforeInstr adds to WaterList, which is important when it is
+ // called while handling branches so that the water will be seen on the
+ // next iteration for constant pools, but in this context, we don't want
+ // it. Check for this so it will be removed from the WaterList.
+ // Also remove any entry from NewWaterList.
+ MachineBasicBlock *WaterBB = &*--NewMBB->getIterator();
+ IP = find(WaterList, WaterBB);
+ if (IP != WaterList.end())
+ NewWaterList.erase(WaterBB);
+
+ // We are adding new water. Update NewWaterList.
+ NewWaterList.insert(NewIsland);
+ }
+
+ // Remove the original WaterList entry; we want subsequent insertions in
+ // this vicinity to go after the one we're about to insert. This
+ // considerably reduces the number of times we have to move the same CPE
+ // more than once and is also important to ensure the algorithm terminates.
+ if (IP != WaterList.end())
+ WaterList.erase(IP);
+
+ // Okay, we know we can put an island before NewMBB now, do it!
+ MF->insert(NewMBB->getIterator(), NewIsland);
+
+ // Update internal data structures to account for the newly inserted MBB.
+ updateForInsertedWaterBlock(NewIsland);
+
+ // Now that we have an island to add the CPE to, clone the original CPE and
+ // add it to the island.
+ U.HighWaterMark = NewIsland;
+ U.CPEMI = BuildMI(NewIsland, DebugLoc(), CPEMI->getDesc())
+ .addImm(ID).addOperand(CPEMI->getOperand(1)).addImm(Size);
+ CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+ ++NumCPEs;
+
+ // Decrement the old entry, and remove it if refcount becomes 0.
+ decrementCPEReferenceCount(CPI, CPEMI);
+
+ // Mark the basic block as aligned as required by the const-pool entry.
+ NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+
+ // Increase the size of the island block to account for the new entry.
+ BBInfo[NewIsland->getNumber()].Size += Size;
+ adjustBBOffsetsAfter(&*--NewIsland->getIterator());
+
+ // Finally, change the CPI in the instruction operand to be ID.
+ for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+ if (UserMI->getOperand(i).isCPI()) {
+ UserMI->getOperand(i).setIndex(ID);
+ break;
+ }
+
+ DEBUG(dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+ return true;
+}
+
+/// removeDeadCPEMI - Erase a constant pool entry instruction that is no
+/// longer referenced, then fix up the size and alignment of its island and
+/// the offsets of the blocks that follow.
+void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *Island = CPEMI->getParent();
+  // Read the entry's size before the instruction is destroyed.
+  const unsigned EntrySize = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+
+  BasicBlockInfo &IslandInfo = BBInfo[Island->getNumber()];
+  IslandInfo.Size -= EntrySize;
+
+  // All succeeding offsets have the current size value added in, fix this.
+  unsigned NewLogAlign = 0;
+  if (Island->empty()) {
+    // Nothing left: the block has no size and needs no alignment.
+    IslandInfo.Size = 0;
+  } else {
+    // Entries are sorted by descending alignment, so the first remaining
+    // entry dictates the alignment of the block.
+    NewLogAlign = getCPELogAlign(&*Island->begin());
+  }
+  Island->setAlignment(NewLogAlign);
+
+  adjustBBOffsetsAfter(Island);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(Island) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero. Each such entry has its instruction erased via removeDeadCPEMI
+/// and its CPEMI pointer cleared, so it is skipped on later scans.
+/// Returns true if at least one entry was removed.
+bool ARMConstantIslands::removeUnusedCPEntries() {
+  // A flag, so it is a bool (it was previously declared unsigned).
+  bool MadeChange = false;
+  for (std::vector<CPEntry> &CPEs : CPEntries) {
+    for (CPEntry &CPE : CPEs) {
+      // Skip entries that are still referenced or were already removed.
+      if (CPE.RefCount != 0 || !CPE.CPEMI)
+        continue;
+      removeDeadCPEMI(CPE.CPEMI);
+      CPE.CPEMI = nullptr;
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+/// isBBInRange - Return true if the start of DestBB is no more than MaxDisp
+/// bytes away, in either direction, from the PC-adjusted offset of the
+/// branch MI.
+bool ARMConstantIslands::isBBInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
+                                     unsigned MaxDisp) {
+  // The PC value seen by the branch is 4 (Thumb) or 8 (ARM) bytes ahead.
+  unsigned PCAdj = isThumb ? 4 : 8;
+  unsigned BrOffset = getOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  // Absolute distance between the branch and its destination.
+  unsigned Distance = (BrOffset <= DestOffset) ? DestOffset - BrOffset
+                                               : BrOffset - DestOffset;
+  return Distance <= MaxDisp;
+}
+
+/// fixupImmediateBr - If the destination of the immediate branch Br is out
+/// of reach of its displacement field, dispatch to the conditional or
+/// unconditional fix-up routine. Returns false when the branch is already in
+/// range, otherwise whatever the chosen routine returns.
+bool ARMConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+  MachineBasicBlock *Target = Br.MI->getOperand(0).getMBB();
+
+  // Nothing to do when the destination is already reachable.
+  if (isBBInRange(Br.MI, Target, Br.MaxDisp))
+    return false;
+
+  return Br.isCond ? fixupConditionalBr(Br) : fixupUnconditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field by turning it into a tBfar
+/// far jump. This is Thumb1 only. The containing block grows by 2 bytes and
+/// HasFarJump is set. Always returns true.
+bool
+ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+  if (!isThumb1)
+    llvm_unreachable("fixupUnconditionalBr is Thumb1 only!");
+
+  MachineInstr *Branch = Br.MI;
+  MachineBasicBlock *Parent = Branch->getParent();
+
+  // Use BL to implement far jump.
+  Br.MaxDisp = (1 << 21) * 2;
+  Branch->setDesc(TII->get(ARM::tBfar));
+
+  // Account for the 2 extra bytes and propagate the new offsets.
+  BBInfo[Parent->getNumber()].Size += 2;
+  adjustBBOffsetsAfter(Parent);
+
+  HasFarJump = true;
+  ++NumUBrFixed;
+
+  DEBUG(dbgs() << " Changed B to long jump " << *Branch);
+
+  return true;
+}
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination. Every path through this function returns true.
+bool
+ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+ // Add an unconditional branch to the destination and invert the branch
+ // condition to jump over it:
+ // blt L1
+ // =>
+ // bge L2
+ // b L1
+ // L2:
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm();
+ CC = ARMCC::getOppositeCondition(CC);
+ unsigned CCReg = MI->getOperand(2).getReg();
+
+ // If the branch is at the end of its MBB and that has a fall-through block,
+ // direct the updated conditional branch to the fall-through block. Otherwise,
+ // split the MBB before the next instruction.
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *BMI = &MBB->back();
+ bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+ ++NumCBrFixed;
+ if (BMI != MI) {
+ if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
+ BMI->getOpcode() == Br.UncondBr) {
+ // Last MI in the BB is an unconditional branch. Can we simply invert the
+ // condition and swap destinations:
+ // beq L1
+ // b L2
+ // =>
+ // bne L2
+ // b L1
+ MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+ if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+ DEBUG(dbgs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
+ BMI->getOperand(0).setMBB(DestBB);
+ MI->getOperand(0).setMBB(NewDest);
+ MI->getOperand(1).setImm(CC);
+ return true;
+ }
+ }
+ }
+
+ if (NeedSplit) {
+ splitBlockBeforeInstr(MI);
+ // No need for the branch to the next block. We're adding an unconditional
+ // branch to the destination.
+ int delta = TII->getInstSizeInBytes(MBB->back());
+ BBInfo[MBB->getNumber()].Size -= delta;
+ MBB->back().eraseFromParent();
+ // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+ }
+ MachineBasicBlock *NextBB = &*++MBB->getIterator();
+
+ DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
+ << " also invert condition and change dest. to BB#"
+ << NextBB->getNumber() << "\n");
+
+ // Insert a new conditional branch and a new unconditional branch.
+ // Also update the ImmBranch as well as adding a new entry for the new branch.
+ BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
+ .addMBB(NextBB).addImm(CC).addReg(CCReg);
+ Br.MI = &MBB->back(); // Br now tracks the newly built inverted conditional branch.
+ BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+ if (isThumb)
+ BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
+ .addImm(ARMCC::AL).addReg(0);
+ else
+ BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+ BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+ unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+ ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+ // Remove the old conditional branch. It may or may not still be in MBB.
+ BBInfo[MI->getParent()->getNumber()].Size -= TII->getInstSizeInBytes(*MI);
+ MI->eraseFromParent();
+ adjustBBOffsetsAfter(MBB);
+ return true;
+}
+
+/// undoLRSpillRestore - Replace every recorded Thumb tPOP_RET whose only
+/// explicit register operand is pc with a tBX_RET carrying the same predicate.
+/// Returns true if at least one instruction was replaced. FIXME: This is done
+/// here because it's only possible to do this if tBfar is not used.
+bool ARMConstantIslands::undoLRSpillRestore() {
+  bool Changed = false;
+  for (MachineInstr *PopMI : PushPopMIs) {
+    if (PopMI->getOpcode() != ARM::tPOP_RET)
+      continue;
+    // Operands 0 and 1 are the predicate; operand 2 is the first register.
+    if (PopMI->getOperand(2).getReg() != ARM::PC)
+      continue;
+    if (PopMI->getNumExplicitOperands() != 3)
+      continue;
+
+    // Build the replacement in the same block and reuse the old predicate.
+    MachineBasicBlock *ParentBB = PopMI->getParent();
+    BuildMI(ParentBB, PopMI->getDebugLoc(), TII->get(ARM::tBX_RET))
+        .addOperand(PopMI->getOperand(0))
+        .addOperand(PopMI->getOperand(1));
+    PopMI->eraseFromParent();
+    Changed = true;
+  }
+  return Changed;
+}
+
+/// optimizeThumb2Instructions - Replace t2LEApcrel / t2LDRpci constant-pool
+/// users with tLEApcrel / tLDRpci when the destination is a low register and
+/// the pool entry is within reach. Returns true if anything was shrunk.
+bool ARMConstantIslands::optimizeThumb2Instructions() {
+  bool Changed = false;
+
+  // Shrink ADR and LDR from constantpool.
+  for (CPUser &User : CPUsers) {
+    unsigned NarrowOpc = 0;
+    switch (User.MI->getOpcode()) {
+    case ARM::t2LEApcrel:
+      NarrowOpc = ARM::tLEApcrel;
+      break;
+    case ARM::t2LDRpci:
+      NarrowOpc = ARM::tLDRpci;
+      break;
+    default:
+      continue;
+    }
+
+    // Either replacement is only considered for a low destination register.
+    if (!isARMLowRegister(User.MI->getOperand(0).getReg()))
+      continue;
+
+    // Both replacements use an 8-bit offset field scaled by 4.
+    const unsigned Bits = 8;
+    const unsigned Scale = 4;
+
+    unsigned UserOffset = getUserOffset(User);
+    unsigned MaxOffs = ((1 << Bits) - 1) * Scale;
+
+    // Be conservative with inline asm.
+    if (!User.KnownAlignment)
+      MaxOffs -= 2;
+
+    // FIXME: Check if offset is multiple of scale if scale is not 4.
+    if (!isCPEntryInRange(User.MI, UserOffset, User.CPEMI, MaxOffs, false,
+                          true))
+      continue;
+
+    DEBUG(dbgs() << "Shrink: " << *User.MI);
+    User.MI->setDesc(TII->get(NarrowOpc));
+    MachineBasicBlock *ParentBB = User.MI->getParent();
+    BBInfo[ParentBB->getNumber()].Size -= 2;
+    adjustBBOffsetsAfter(ParentBB);
+    ++NumT2CPShrunk;
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool ARMConstantIslands::optimizeThumb2Branches() {
+ bool MadeChange = false;
+ // For each branch: first try to shrink it, then try to form cbz / cbnz.
+ // The order in which branches appear in ImmBranches is approximately their
+ // order within the function body. By visiting later branches first, we reduce
+ // the distance between earlier forward branches and their targets, making it
+ // more likely that the cbn?z optimization, which can only apply to forward
+ // branches, will succeed.
+ for (unsigned i = ImmBranches.size(); i != 0; --i) {
+ ImmBranch &Br = ImmBranches[i-1];
+ unsigned Opcode = Br.MI->getOpcode();
+ unsigned NewOpc = 0;
+ unsigned Scale = 1;
+ unsigned Bits = 0;
+ switch (Opcode) {
+ default: break;
+ case ARM::t2B:
+ NewOpc = ARM::tB;
+ Bits = 11;
+ Scale = 2;
+ break;
+ case ARM::t2Bcc: {
+ NewOpc = ARM::tBcc;
+ Bits = 8;
+ Scale = 2;
+ break;
+ }
+ }
+ if (NewOpc) {
+ unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; // Bits-1 magnitude bits
+ MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ if (isBBInRange(Br.MI, DestBB, MaxOffs)) {
+ DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
+ Br.MI->setDesc(TII->get(NewOpc));
+ MachineBasicBlock *MBB = Br.MI->getParent();
+ BBInfo[MBB->getNumber()].Size -= 2;
+ adjustBBOffsetsAfter(MBB);
+ ++NumT2BrShrunk;
+ MadeChange = true;
+ }
+ }
+ // Re-read the opcode: the branch may just have been rewritten to tBcc.
+ Opcode = Br.MI->getOpcode();
+ if (Opcode != ARM::tBcc)
+ continue;
+
+ // If the conditional branch doesn't kill CPSR, then CPSR can be liveout
+ // so this transformation is not safe.
+ if (!Br.MI->killsRegister(ARM::CPSR))
+ continue;
+
+ NewOpc = 0;
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg);
+ if (Pred == ARMCC::EQ)
+ NewOpc = ARM::tCBZ;
+ else if (Pred == ARMCC::NE)
+ NewOpc = ARM::tCBNZ;
+ if (!NewOpc)
+ continue;
+ MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ // Check if the distance is within 126. Subtract starting offset by 2
+ // because the cmp will be eliminated.
+ unsigned BrOffset = getOffsetOf(Br.MI) + 4 - 2;
+ unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+ if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) {
+ MachineBasicBlock::iterator CmpMI = Br.MI;
+ if (CmpMI != Br.MI->getParent()->begin()) {
+ --CmpMI; // the instruction immediately before the branch
+ if (CmpMI->getOpcode() == ARM::tCMPi8) {
+ unsigned Reg = CmpMI->getOperand(0).getReg();
+ Pred = getInstrPredicate(*CmpMI, PredReg);
+ if (Pred == ARMCC::AL &&
+ CmpMI->getOperand(1).getImm() == 0 &&
+ isARMLowRegister(Reg)) {
+ MachineBasicBlock *MBB = Br.MI->getParent();
+ DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+ MachineInstr *NewBR =
+ BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc))
+ .addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags());
+ CmpMI->eraseFromParent();
+ Br.MI->eraseFromParent();
+ Br.MI = NewBR; // keep the ImmBranch record pointing at a live instr
+ BBInfo[MBB->getNumber()].Size -= 2;
+ adjustBBOffsetsAfter(MBB);
+ ++NumCBZ;
+ MadeChange = true;
+ }
+ }
+ }
+ }
+ }
+
+ return MadeChange;
+}
+
+/// Returns true if \p I is a t2ADDrs whose operand 0 is \p EntryReg and whose
+/// operand 1 is \p BaseReg.
+static bool isSimpleIndexCalc(MachineInstr &I, unsigned EntryReg,
+                              unsigned BaseReg) {
+  // FIXME: what about CC and IdxReg?
+  return I.getOpcode() == ARM::t2ADDrs &&
+         I.getOperand(0).getReg() == EntryReg &&
+         I.getOperand(1).getReg() == BaseReg;
+}
+
+/// \brief While trying to form a TBB/TBH instruction, we may (if the table
+/// doesn't immediately follow the BR_JT) need access to the start of the
+/// jump-table. We know one instruction that produces such a register; this
+/// function works out whether that definition can be preserved to the BR_JT,
+/// possibly by removing an intervening addition (which is usually needed to
+/// calculate the actual entry to jump to).
+bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI,
+ MachineInstr *LEAMI,
+ unsigned &DeadSize, // out: grows by the size of a removed add
+ bool &CanDeleteLEA, // out: false if BaseReg is used before the add/jump
+ bool &BaseRegKill) { // out: true if one of those uses kills BaseReg
+ if (JumpMI->getParent() != LEAMI->getParent())
+ return false;
+
+ // Now we hope that we have at least these instructions in the basic block:
+ // BaseReg = t2LEA ...
+ // [...]
+ // EntryReg = t2ADDrs BaseReg, ...
+ // [...]
+ // t2BR_JT EntryReg
+ //
+ // We have to be very conservative about what we recognise here though. The
+ // main perturbing factors to watch out for are:
+ // + Spills at any point in the chain: not direct problems but we would
+ // expect a blocking Def of the spilled register so in practice what we
+ // can do is limited.
+ // + EntryReg == BaseReg: this is the one situation we should allow a Def
+ // of BaseReg, but only if the t2ADDrs can be removed.
+ // + Some instruction other than t2ADDrs computing the entry. Not seen in
+ // the wild, but we should be careful.
+ unsigned EntryReg = JumpMI->getOperand(0).getReg();
+ unsigned BaseReg = LEAMI->getOperand(0).getReg();
+
+ CanDeleteLEA = true;
+ BaseRegKill = false;
+ MachineInstr *RemovableAdd = nullptr;
+ MachineBasicBlock::iterator I(LEAMI);
+ for (++I; &*I != JumpMI; ++I) {
+ if (isSimpleIndexCalc(*I, EntryReg, BaseReg)) {
+ RemovableAdd = &*I;
+ break;
+ }
+
+ for (unsigned K = 0, E = I->getNumOperands(); K != E; ++K) {
+ const MachineOperand &MO = I->getOperand(K);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isDef() && MO.getReg() == BaseReg)
+ return false;
+ if (MO.isUse() && MO.getReg() == BaseReg) {
+ BaseRegKill = BaseRegKill || MO.isKill();
+ CanDeleteLEA = false;
+ }
+ }
+ }
+ // No candidate add: BaseReg reached the jump without being redefined.
+ if (!RemovableAdd)
+ return true;
+
+ // Check the add really is removable, and that nothing else in the block
+ // clobbers BaseReg.
+ for (++I; &*I != JumpMI; ++I) {
+ for (unsigned K = 0, E = I->getNumOperands(); K != E; ++K) {
+ const MachineOperand &MO = I->getOperand(K);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isDef() && MO.getReg() == BaseReg)
+ return false;
+ if (MO.isUse() && MO.getReg() == EntryReg)
+ RemovableAdd = nullptr; // EntryReg is read before the jump: keep the add
+ }
+ }
+
+ if (RemovableAdd) {
+ RemovableAdd->eraseFromParent();
+ DeadSize += isThumb2 ? 4 : 2; // bytes accounted for the erased add
+ } else if (BaseReg == EntryReg) {
+ // The add wasn't removable, but clobbered the base for the TBB. So we can't
+ // preserve it.
+ return false;
+ }
+
+ // We reached the end of the block without seeing another definition of
+ // BaseReg (except, possibly the t2ADDrs, which was removed). BaseReg can be
+ // used in the TBB/TBH if necessary.
+ return true;
+}
+
+/// \brief Returns true when the block laid out directly after JTMI's block
+/// exists, is non-empty, and begins with CPEMI (JTMI is assumed to be a TBB or
+/// TBH terminator). If so, we can switch the first register to PC and usually
+/// remove the address calculation that preceded it.
+static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) {
+  MachineBasicBlock *JumpBB = JTMI->getParent();
+  MachineFunction *Fn = JumpBB->getParent();
+  MachineFunction::iterator Following = std::next(JumpBB->getIterator());
+
+  if (Following == Fn->end())
+    return false;
+  if (Following->begin() == Following->end())
+    return false;
+  return &*Following->begin() == CPEMI;
+}
+
+/// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
+/// jumptables when it's possible.
+bool ARMConstantIslands::optimizeThumb2JumpTables() {
+ bool MadeChange = false;
+
+ // FIXME: After the tables are shrunk, can we get rid some of the
+ // constantpool tables?
+ MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ if (!MJTI) return false;
+
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+ MachineInstr *MI = T2JumpTables[i];
+ const MCInstrDesc &MCID = MI->getDesc();
+ unsigned NumOps = MCID.getNumOperands();
+ unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 2 : 1);
+ MachineOperand JTOP = MI->getOperand(JTOpIdx);
+ unsigned JTI = JTOP.getIndex();
+ assert(JTI < JT.size());
+ // Work out whether every target is within byte (TBB) or halfword (TBH) reach.
+ bool ByteOk = true;
+ bool HalfWordOk = true;
+ unsigned JTOffset = getOffsetOf(MI) + 4;
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+ for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+ MachineBasicBlock *MBB = JTBBs[j];
+ unsigned DstOffset = BBInfo[MBB->getNumber()].Offset;
+ // Negative offset is not ok. FIXME: We should change BB layout to make
+ // sure all the branches are forward.
+ if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2) // unsigned: wraps if backward
+ ByteOk = false;
+ unsigned TBHLimit = ((1<<16)-1)*2;
+ if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit)
+ HalfWordOk = false;
+ if (!ByteOk && !HalfWordOk)
+ break;
+ }
+
+ if (!ByteOk && !HalfWordOk)
+ continue;
+
+ CPUser &User = CPUsers[JumpTableUserIndices[JTI]];
+ MachineBasicBlock *MBB = MI->getParent();
+ if (!MI->getOperand(0).isKill()) // FIXME: needed now?
+ continue;
+ // DeadSize: bytes of instructions erased along the way (used in Delta below).
+ unsigned DeadSize = 0;
+ bool CanDeleteLEA = false;
+ bool BaseRegKill = false;
+
+ unsigned IdxReg = ~0U;
+ bool IdxRegKill = true;
+ if (isThumb2) {
+ IdxReg = MI->getOperand(1).getReg();
+ IdxRegKill = MI->getOperand(1).isKill();
+
+ bool PreservedBaseReg =
+ preserveBaseRegister(MI, User.MI, DeadSize, CanDeleteLEA, BaseRegKill);
+ if (!jumpTableFollowsTB(MI, User.CPEMI) && !PreservedBaseReg)
+ continue;
+ } else {
+ // We're in thumb-1 mode, so we must have something like:
+ // %idx = tLSLri %idx, 2
+ // %base = tLEApcrelJT
+ // %t = tLDRr %idx, %base
+ unsigned BaseReg = User.MI->getOperand(0).getReg();
+
+ if (User.MI->getIterator() == User.MI->getParent()->begin())
+ continue;
+ MachineInstr *Shift = User.MI->getPrevNode();
+ if (Shift->getOpcode() != ARM::tLSLri ||
+ Shift->getOperand(3).getImm() != 2 ||
+ !Shift->getOperand(2).isKill())
+ continue;
+ IdxReg = Shift->getOperand(2).getReg();
+ unsigned ShiftedIdxReg = Shift->getOperand(0).getReg();
+
+ MachineInstr *Load = User.MI->getNextNode();
+ if (Load->getOpcode() != ARM::tLDRr)
+ continue;
+ if (Load->getOperand(1).getReg() != ShiftedIdxReg ||
+ Load->getOperand(2).getReg() != BaseReg ||
+ !Load->getOperand(1).isKill())
+ continue;
+
+ // If we're in PIC mode, there should be another ADD following.
+ if (isPositionIndependentOrROPI) {
+ MachineInstr *Add = Load->getNextNode();
+ if (Add->getOpcode() != ARM::tADDrr ||
+ Add->getOperand(2).getReg() != Load->getOperand(0).getReg() ||
+ Add->getOperand(3).getReg() != BaseReg ||
+ !Add->getOperand(2).isKill())
+ continue;
+ if (Add->getOperand(0).getReg() != MI->getOperand(0).getReg())
+ continue;
+
+ Add->eraseFromParent();
+ DeadSize += 2;
+ } else {
+ if (Load->getOperand(0).getReg() != MI->getOperand(0).getReg())
+ continue;
+ }
+
+
+ // Now safe to delete the load and lsl. The LEA will be removed later.
+ CanDeleteLEA = true;
+ Shift->eraseFromParent();
+ Load->eraseFromParent();
+ DeadSize += 4;
+ }
+
+ DEBUG(dbgs() << "Shrink JT: " << *MI);
+ MachineInstr *CPEMI = User.CPEMI;
+ unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
+ if (!isThumb2)
+ Opc = ByteOk ? ARM::tTBB_JT : ARM::tTBH_JT;
+ // Build the TBB/TBH in front of the old jump; the old jump is erased below.
+ MachineBasicBlock::iterator MI_JT = MI;
+ MachineInstr *NewJTMI =
+ BuildMI(*MBB, MI_JT, MI->getDebugLoc(), TII->get(Opc))
+ .addReg(User.MI->getOperand(0).getReg(),
+ getKillRegState(BaseRegKill))
+ .addReg(IdxReg, getKillRegState(IdxRegKill))
+ .addJumpTableIndex(JTI, JTOP.getTargetFlags())
+ .addImm(CPEMI->getOperand(0).getImm());
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": " << *NewJTMI);
+
+ unsigned JTOpc = ByteOk ? ARM::JUMPTABLE_TBB : ARM::JUMPTABLE_TBH;
+ CPEMI->setDesc(TII->get(JTOpc));
+
+ if (jumpTableFollowsTB(MI, User.CPEMI)) {
+ NewJTMI->getOperand(0).setReg(ARM::PC);
+ NewJTMI->getOperand(0).setIsKill(false);
+
+ if (CanDeleteLEA) {
+ User.MI->eraseFromParent();
+ DeadSize += isThumb2 ? 4 : 2;
+
+ // The LEA was eliminated, the TBB instruction becomes the only new user
+ // of the jump table.
+ User.MI = NewJTMI;
+ User.MaxDisp = 4;
+ User.NegOk = false;
+ User.IsSoImm = false;
+ User.KnownAlignment = false;
+ } else {
+ // The LEA couldn't be eliminated, so we must add another CPUser to
+ // record the TBB or TBH use.
+ int CPEntryIdx = JumpTableEntryIndices[JTI];
+ auto &CPEs = CPEntries[CPEntryIdx];
+ auto Entry =
+ find_if(CPEs, [&](CPEntry &E) { return E.CPEMI == User.CPEMI; });
+ ++Entry->RefCount;
+ CPUsers.emplace_back(CPUser(NewJTMI, User.CPEMI, 4, false, false));
+ }
+ }
+ // Account for the size change of the jump plus everything erased above.
+ unsigned NewSize = TII->getInstSizeInBytes(*NewJTMI);
+ unsigned OrigSize = TII->getInstSizeInBytes(*MI);
+ MI->eraseFromParent();
+
+ int Delta = OrigSize - NewSize + DeadSize;
+ BBInfo[MBB->getNumber()].Size -= Delta;
+ adjustBBOffsetsAfter(MBB);
+
+ ++NumTBs;
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// reorderThumb2JumpTables - For every recorded jump-table instruction, try to
+/// get each target block that is numbered before the jump's block placed after
+/// it, since tbb and tbh can only branch forwards. Returns true if any target
+/// was processed.
+bool ARMConstantIslands::reorderThumb2JumpTables() {
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (!MJTI)
+    return false;
+
+  bool Changed = false;
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (MachineInstr *JumpMI : T2JumpTables) {
+    // The jump-table index operand is the last one, or the last one before
+    // the two predicate operands when the instruction is predicable.
+    const unsigned TrailingOps = JumpMI->isPredicable() ? 2 : 1;
+    const unsigned JTOpIdx = JumpMI->getDesc().getNumOperands() - TrailingOps;
+    unsigned JTI = JumpMI->getOperand(JTOpIdx).getIndex();
+    assert(JTI < JT.size());
+
+    // We prefer if target blocks for the jump table come after the jump
+    // instruction so we can use TB[BH]. Loop through the target blocks
+    // and try to adjust them such that that's true.
+    const int JumpBlockNum = JumpMI->getParent()->getNumber();
+    const std::vector<MachineBasicBlock *> &Targets = JT[JTI].MBBs;
+    for (unsigned Slot = 0, NumSlots = Targets.size(); Slot != NumSlots;
+         ++Slot) {
+      MachineBasicBlock *Target = Targets[Slot];
+      if (Target->getNumber() >= JumpBlockNum)
+        continue;
+
+      // The destination precedes the switch. Try to move the block forward
+      // so we have a positive offset. A non-null result is a new trampoline
+      // block that must take the old target's place in the table.
+      if (MachineBasicBlock *Replacement =
+              adjustJTTargetBlockForward(Target, JumpMI->getParent()))
+        MJTI->ReplaceMBBInJumpTable(JTI, Targets[Slot], Replacement);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+/// Make jump-table target \p BB reachable by a forward branch from \p JTBB.
+/// Either moves \p BB to just after \p JTBB (returns nullptr) or inserts a new
+/// block after \p JTBB that branches to \p BB (returns the new block).
+MachineBasicBlock *ARMConstantIslands::
+adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
+  // If the destination block is terminated by an unconditional branch,
+  // try to move it; otherwise, create a new block following the jump
+  // table that branches back to the actual target. This is a very simple
+  // heuristic. FIXME: We can definitely improve it.
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+  SmallVector<MachineOperand, 4> CondPrior;
+  MachineFunction::iterator OldPrior = std::prev(BB->getIterator());
+
+  // analyzeBranch returns true when the terminator could not be analyzed.
+  const bool Unanalyzable = TII->analyzeBranch(*BB, TBB, FBB, Cond);
+
+  // Moving is only attempted for a block ending in an unconditional branch
+  // that is not the function's entry block and whose predecessor in layout
+  // has an analyzable terminator too.
+  const bool CanMove =
+      !Unanalyzable && Cond.empty() && BB != &MF->front() &&
+      !TII->analyzeBranch(*OldPrior, TBB, FBB, CondPrior);
+  if (CanMove) {
+    BB->moveAfter(JTBB);
+    OldPrior->updateTerminator();
+    BB->updateTerminator();
+    // Update numbering to account for the block being moved.
+    MF->RenumberBlocks();
+    ++NumJTMoved;
+    return nullptr;
+  }
+
+  // Create a new MBB for the code after the jump BB.
+  MachineBasicBlock *Trampoline =
+      MF->CreateMachineBasicBlock(JTBB->getBasicBlock());
+  MachineFunction::iterator InsertPos = ++JTBB->getIterator();
+  MF->insert(InsertPos, Trampoline);
+
+  // Add an unconditional branch from the new block to BB.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond directly to anything in the source.
+  const unsigned BranchOpc = isThumb2 ? ARM::t2B : ARM::tB;
+  BuildMI(Trampoline, DebugLoc(), TII->get(BranchOpc))
+      .addMBB(BB)
+      .addImm(ARMCC::AL)
+      .addReg(0);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  MF->RenumberBlocks(Trampoline);
+
+  // Update the CFG.
+  Trampoline->addSuccessor(BB);
+  JTBB->replaceSuccessor(BB, Trampoline);
+
+  ++NumJTInserted;
+  return Trampoline;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
new file mode 100644
index 000000000000..2d1602873ce0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -0,0 +1,281 @@
+//===-- ARMConstantPoolValue.cpp - ARM constantpool value -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolValue
+//===----------------------------------------------------------------------===//
+
+/// Construct a value of IR type \p Ty, storing the label id, kind, PC
+/// adjustment, modifier and add-current-address flag as given.
+ARMConstantPoolValue::ARMConstantPoolValue(Type *Ty, unsigned id,
+                                           ARMCP::ARMCPKind kind,
+                                           unsigned char PCAdj,
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+    : MachineConstantPoolValue(Ty),
+      LabelId(id),
+      Kind(kind),
+      PCAdjust(PCAdj),
+      Modifier(modifier),
+      AddCurrentAddress(addCurrentAddress) {
+}
+
+/// Construct a value typed as the 32-bit integer type of context \p C; the
+/// remaining arguments are stored as given.
+ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id,
+                                           ARMCP::ARMCPKind kind,
+                                           unsigned char PCAdj,
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+    : MachineConstantPoolValue((Type*)Type::getInt32Ty(C)),
+      LabelId(id),
+      Kind(kind),
+      PCAdjust(PCAdj),
+      Modifier(modifier),
+      AddCurrentAddress(addCurrentAddress) {
+}
+
+// Out-of-line destructor; nothing to release beyond the members themselves.
+ARMConstantPoolValue::~ARMConstantPoolValue() = default;
+
+/// Returns the textual spelling of this value's modifier.
+StringRef ARMConstantPoolValue::getModifierText() const {
+  // FIXME: Are these case sensitive? It'd be nice to lower-case all the
+  // strings if that's legal.
+  switch (Modifier) {
+  case ARMCP::no_modifier: return "none";
+  case ARMCP::TLSGD:       return "tlsgd";
+  case ARMCP::GOT_PREL:    return "GOT_PREL";
+  case ARMCP::GOTTPOFF:    return "gottpoff";
+  case ARMCP::TPOFF:       return "tpoff";
+  case ARMCP::SECREL:      return "secrel32";
+  case ARMCP::SBREL:       return "SBREL";
+  }
+  llvm_unreachable("Unknown modifier!");
+}
+
+int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) {
+ llvm_unreachable("Shouldn't be calling this directly!");
+}
+
+/// Adds the label id followed by the PC adjustment to \p ID.
+void ARMConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddInteger(LabelId);
+  ID.AddInteger(PCAdjust);
+}
+
+/// Returns true if \p ACPV has the same kind, PC adjustment, modifier, label
+/// id and add-current-address flag as this value, and the kind is CPValue or
+/// CPExtSymbol.
+bool ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const bool SameFields = ACPV->Kind == Kind &&
+                          ACPV->PCAdjust == PCAdjust &&
+                          ACPV->Modifier == Modifier &&
+                          ACPV->LabelId == LabelId &&
+                          ACPV->AddCurrentAddress == AddCurrentAddress;
+  if (!SameFields)
+    return false;
+
+  // Two PC relative constpool entries containing the same GV address or
+  // external symbols. FIXME: What about blockaddress?
+  return Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol;
+}
+
+/// Writes this value to errs(), preceded by the same leading text as before.
+LLVM_DUMP_METHOD void ARMConstantPoolValue::dump() const {
+  raw_ostream &Err = errs();
+  Err << " ";
+  Err << *this;
+}
+
+/// Prints the parenthesised modifier (if one is set) followed by the
+/// "-(LPC<id>+<adj>[-.])" suffix (if the PC adjustment is non-zero).
+void ARMConstantPoolValue::print(raw_ostream &O) const {
+  if (Modifier != ARMCP::no_modifier)
+    O << "(" << getModifierText() << ")";
+
+  if (PCAdjust == 0)
+    return;
+
+  O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
+  if (AddCurrentAddress)
+    O << "-.";
+  O << ")";
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolConstant
+//===----------------------------------------------------------------------===//
+
+/// Construct an entry of explicit type \p Ty that wraps constant \p C.
+ARMConstantPoolConstant::ARMConstantPoolConstant(
+    Type *Ty, const Constant *C, unsigned ID, ARMCP::ARMCPKind Kind,
+    unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+    bool AddCurrentAddress)
+    : ARMConstantPoolValue(Ty, ID, Kind, PCAdj, Modifier, AddCurrentAddress),
+      CVal(C) {
+}
+
+/// Construct an entry wrapping constant \p C, typed as \p C's own type.
+ARMConstantPoolConstant::ARMConstantPoolConstant(
+    const Constant *C, unsigned ID, ARMCP::ARMCPKind Kind,
+    unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+    bool AddCurrentAddress)
+    : ARMConstantPoolValue((Type*)C->getType(), ID, Kind, PCAdj, Modifier,
+                           AddCurrentAddress),
+      CVal(C) {
+}
+
+/// Construct a CPPromotedGlobal entry: \p C is stored as the value and \p GV
+/// as the associated global variable; label id and PC adjustment are zero.
+ARMConstantPoolConstant::ARMConstantPoolConstant(const GlobalVariable *GV,
+                                                 const Constant *C)
+    : ARMConstantPoolValue((Type *)C->getType(), 0, ARMCP::CPPromotedGlobal,
+                           0, ARMCP::no_modifier, false),
+      CVal(C),
+      GVar(GV) {
+}
+
+/// Allocates a CPValue entry for \p C with label \p ID, no PC adjustment and
+/// no modifier.
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID) {
+  ARMConstantPoolConstant *Entry = new ARMConstantPoolConstant(
+      C, ID, ARMCP::CPValue, 0, ARMCP::no_modifier, false);
+  return Entry;
+}
+
+/// Allocates an entry for global variable \p GVar with value \p Initializer.
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const GlobalVariable *GVar,
+                                const Constant *Initializer) {
+  ARMConstantPoolConstant *Entry =
+      new ARMConstantPoolConstant(GVar, Initializer);
+  return Entry;
+}
+
+/// Allocates a 32-bit-integer-typed CPValue entry for \p GV carrying
+/// \p Modifier, with label id 0 and no PC adjustment.
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const GlobalValue *GV,
+                                ARMCP::ARMCPModifier Modifier) {
+  Type *Int32Ty = (Type*)Type::getInt32Ty(GV->getContext());
+  return new ARMConstantPoolConstant(Int32Ty, GV, 0, ARMCP::CPValue, 0,
+                                     Modifier, false);
+}
+
+/// Allocates an entry for \p C with the given label, kind and PC adjustment,
+/// and no modifier.
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj) {
+  ARMConstantPoolConstant *Entry = new ARMConstantPoolConstant(
+      C, ID, Kind, PCAdj, ARMCP::no_modifier, false);
+  return Entry;
+}
+
+/// Allocates an entry for \p C, forwarding every argument to the constructor.
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj,
+                                ARMCP::ARMCPModifier Modifier,
+                                bool AddCurrentAddress) {
+  ARMConstantPoolConstant *Entry = new ARMConstantPoolConstant(
+      C, ID, Kind, PCAdj, Modifier, AddCurrentAddress);
+  return Entry;
+}
+
+/// Returns the wrapped constant as a GlobalValue, or nullptr if it is null or
+/// of another kind.
+const GlobalValue *ARMConstantPoolConstant::getGV() const {
+  if (!CVal)
+    return nullptr;
+  return dyn_cast<GlobalValue>(CVal);
+}
+
+/// Returns the wrapped constant as a BlockAddress, or nullptr if it is null or
+/// of another kind.
+const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
+  if (!CVal)
+    return nullptr;
+  return dyn_cast<BlockAddress>(CVal);
+}
+
+/// Forwards to the shared lookup helper instantiated for this class and
+/// returns its result.
+int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                       unsigned Alignment) {
+  const int Index =
+      getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment);
+  return Index;
+}
+
+/// True if \p ACPV is an ARMConstantPoolConstant wrapping the same constant
+/// and the base-class comparison also succeeds.
+bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const auto *Other = dyn_cast<ARMConstantPoolConstant>(ACPV);
+  if (!Other || Other->CVal != CVal)
+    return false;
+  return ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+/// Adds the wrapped constant's pointer to \p ID, then the base-class fields.
+void ARMConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  const Constant *Wrapped = CVal;
+  ID.AddPointer(Wrapped);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+/// Prints the wrapped constant's name followed by the base-class suffix.
+void ARMConstantPoolConstant::print(raw_ostream &O) const {
+  const StringRef Name = CVal->getName();
+  O << Name;
+  ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolSymbol
+//===----------------------------------------------------------------------===//
+
+/// Construct a CPExtSymbol entry holding the symbol name \p s.
+ARMConstantPoolSymbol::ARMConstantPoolSymbol(
+    LLVMContext &C, StringRef s, unsigned id, unsigned char PCAdj,
+    ARMCP::ARMCPModifier Modifier, bool AddCurrentAddress)
+    : ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier,
+                           AddCurrentAddress),
+      S(s) {
+}
+
+/// Allocates a symbol entry for \p s with no modifier.
+ARMConstantPoolSymbol *ARMConstantPoolSymbol::Create(LLVMContext &C,
+                                                     StringRef s, unsigned ID,
+                                                     unsigned char PCAdj) {
+  ARMConstantPoolSymbol *Entry =
+      new ARMConstantPoolSymbol(C, s, ID, PCAdj, ARMCP::no_modifier, false);
+  return Entry;
+}
+
+/// Forwards to the shared lookup helper instantiated for this class and
+/// returns its result.
+int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                     unsigned Alignment) {
+  const int Index =
+      getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment);
+  return Index;
+}
+
+/// True if \p ACPV is an ARMConstantPoolSymbol with the same symbol name and
+/// the base-class comparison also succeeds.
+bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const auto *Other = dyn_cast<ARMConstantPoolSymbol>(ACPV);
+  if (!Other || !(Other->S == S))
+    return false;
+  return ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+/// Adds the symbol name to \p ID, then the base-class fields.
+void ARMConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddString(S);
+  this->ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+/// Prints the symbol name followed by the base-class suffix.
+void ARMConstantPoolSymbol::print(raw_ostream &O) const {
+  O << S;
+  this->ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolMBB
+//===----------------------------------------------------------------------===//
+
+/// Construct a CPMachineBasicBlock entry referring to block \p mbb.
+ARMConstantPoolMBB::ARMConstantPoolMBB(
+    LLVMContext &C, const MachineBasicBlock *mbb, unsigned id,
+    unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+    bool AddCurrentAddress)
+    : ARMConstantPoolValue(C, id, ARMCP::CPMachineBasicBlock, PCAdj,
+                           Modifier, AddCurrentAddress),
+      MBB(mbb) {
+}
+
+/// Allocates a basic-block entry for \p mbb with no modifier.
+ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
+                                               const MachineBasicBlock *mbb,
+                                               unsigned ID,
+                                               unsigned char PCAdj) {
+  ARMConstantPoolMBB *Entry =
+      new ARMConstantPoolMBB(C, mbb, ID, PCAdj, ARMCP::no_modifier, false);
+  return Entry;
+}
+
+/// Forwards to the shared lookup helper instantiated for this class and
+/// returns its result.
+int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                  unsigned Alignment) {
+  const int Index =
+      getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment);
+  return Index;
+}
+
+/// True if \p ACPV is an ARMConstantPoolMBB for the same block and the
+/// base-class comparison also succeeds.
+bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const auto *Other = dyn_cast<ARMConstantPoolMBB>(ACPV);
+  if (!Other || Other->MBB != MBB)
+    return false;
+  return ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+/// Adds the block pointer to \p ID, then the base-class fields.
+void ARMConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  const MachineBasicBlock *Block = MBB;
+  ID.AddPointer(Block);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+/// Prints "BB#<number>" for the referenced block, then the base-class suffix.
+void ARMConstantPoolMBB::print(raw_ostream &O) const {
+  const int BlockNum = MBB->getNumber();
+  O << "BB#" << BlockNum;
+  ARMConstantPoolValue::print(O);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
new file mode 100644
index 000000000000..5f61832aa740
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
@@ -0,0 +1,272 @@
+//===-- ARMConstantPoolValue.h - ARM constantpool value ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
+#define LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstddef>
+
+namespace llvm {
+
+class BlockAddress;
+class Constant;
+class GlobalValue;
+class GlobalVariable;
+class LLVMContext;
+class MachineBasicBlock;
+
+namespace ARMCP {
+ enum ARMCPKind { // Discriminator stored in ARMConstantPoolValue::Kind.
+ CPValue, ///< Tested by ARMConstantPoolValue::isGlobalValue().
+ CPExtSymbol, ///< Tested by ARMConstantPoolValue::isExtSymbol().
+ CPBlockAddress, ///< Tested by ARMConstantPoolValue::isBlockAddress().
+ CPLSDA, ///< Tested by ARMConstantPoolValue::isLSDA().
+ CPMachineBasicBlock, ///< Tested by ARMConstantPoolValue::isMachineBasicBlock().
+ CPPromotedGlobal ///< Tested by ARMConstantPoolValue::isPromotedGlobal().
+ };
+
+ enum ARMCPModifier { // no_modifier is what hasModifier() compares against.
+ no_modifier, ///< None
+ TLSGD, ///< Thread Local Storage (General Dynamic Mode)
+ GOT_PREL, ///< Global Offset Table, PC Relative
+ GOTTPOFF, ///< Global Offset Table, Thread Pointer Offset
+ TPOFF, ///< Thread Pointer Offset
+ SECREL, ///< Section Relative (Windows TLS)
+ SBREL, ///< Static Base Relative (RWPI)
+ };
+}
+
+/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
+/// represent PC-relative displacement between the address of the load
+/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)).
+class ARMConstantPoolValue : public MachineConstantPoolValue {
+  unsigned LabelId;              // Label id of the load.
+  ARMCP::ARMCPKind Kind;         // Kind of constant.
+  unsigned char PCAdjust;        // Extra adjustment if constantpool is
+                                 // pc-relative. 8 for ARM, 4 for Thumb.
+  ARMCP::ARMCPModifier Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8))
+  bool AddCurrentAddress;
+
+protected:
+  ARMConstantPoolValue(Type *Ty, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
+
+  ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
+
+  /// Shared search used by the subclasses' getExistingMachineCPValue().
+  /// Walks the pool's entries in order and returns the index of the first
+  /// machine constant pool entry whose alignment has none of the bits in
+  /// (Alignment - 1) set, which is a \p Derived, and for which
+  /// Derived::equals() holds. Returns -1 when there is no such entry.
+  template <typename Derived>
+  int getExistingMachineCPValueImpl(MachineConstantPool *CP,
+                                    unsigned Alignment) {
+    const unsigned AlignMask = Alignment - 1;
+    const std::vector<MachineConstantPoolEntry> &Entries = CP->getConstants();
+    const unsigned NumEntries = Entries.size();
+    for (unsigned Idx = 0; Idx != NumEntries; ++Idx) {
+      const MachineConstantPoolEntry &Entry = Entries[Idx];
+      if (!Entry.isMachineConstantPoolEntry())
+        continue;
+      if ((Entry.getAlignment() & AlignMask) != 0)
+        continue;
+
+      ARMConstantPoolValue *CPV =
+          static_cast<ARMConstantPoolValue *>(Entry.Val.MachineCPVal);
+      Derived *Candidate = dyn_cast<Derived>(CPV);
+      if (Candidate && cast<Derived>(this)->equals(Candidate))
+        return Idx;
+    }
+
+    return -1;
+  }
+
+public:
+  ~ARMConstantPoolValue() override;
+
+  ARMCP::ARMCPModifier getModifier() const { return Modifier; }
+  StringRef getModifierText() const;
+  bool hasModifier() const { return !(Modifier == ARMCP::no_modifier); }
+
+  bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+
+  unsigned getLabelId() const { return LabelId; }
+  unsigned char getPCAdjustment() const { return PCAdjust; }
+
+  // Kind predicates, one per ARMCP::ARMCPKind enumerator.
+  bool isGlobalValue() const { return Kind == ARMCP::CPValue; }
+  bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; }
+  bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; }
+  bool isLSDA() const { return Kind == ARMCP::CPLSDA; }
+  bool isMachineBasicBlock() const {
+    return Kind == ARMCP::CPMachineBasicBlock;
+  }
+  bool isPromotedGlobal() const { return Kind == ARMCP::CPPromotedGlobal; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  /// Field-wise comparison of the label id, the PC adjustment and the
+  /// modifier. No other member takes part in it.
+  bool equals(const ARMConstantPoolValue *A) const {
+    if (LabelId != A->LabelId)
+      return false;
+    if (PCAdjust != A->PCAdjust)
+      return false;
+    return Modifier == A->Modifier;
+  }
+
+  void print(raw_ostream &O) const override;
+
+  /// Pointer flavour of print(); does nothing when \p O is null.
+  void print(raw_ostream *O) const {
+    if (!O)
+      return;
+    print(*O);
+  }
+  void dump() const;
+};
+
+/// Stream insertion for ARM constant pool values: forwards to V.print(O) and
+/// returns the stream so insertions can be chained.
+inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
+  V.print(O);
+  return O;
+}
+
+/// ARMConstantPoolConstant - ARM-specific constant pool values for Constants,
+/// Functions, and BlockAddresses.
+class ARMConstantPoolConstant : public ARMConstantPoolValue {
+  const Constant *CVal;                 // Constant being loaded.
+  const GlobalVariable *GVar = nullptr; // Exposed via getPromotedGlobal().
+
+  // Construction goes through the static Create() overloads below.
+  ARMConstantPoolConstant(const Constant *C, unsigned ID,
+                          ARMCP::ARMCPKind Kind, unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+  ARMConstantPoolConstant(Type *Ty, const Constant *C, unsigned ID,
+                          ARMCP::ARMCPKind Kind, unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+  ARMConstantPoolConstant(const GlobalVariable *GV, const Constant *Init);
+
+public:
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID);
+  static ARMConstantPoolConstant *Create(const GlobalValue *GV,
+                                         ARMCP::ARMCPModifier Modifier);
+  static ARMConstantPoolConstant *Create(const GlobalVariable *GV,
+                                         const Constant *Initializer);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj,
+                                         ARMCP::ARMCPModifier Modifier,
+                                         bool AddCurrentAddress);
+
+  const GlobalValue *getGV() const;
+  const BlockAddress *getBlockAddress() const;
+
+  /// The stored global variable pointer; null unless one was recorded.
+  const GlobalVariable *getPromotedGlobal() const { return GVar; }
+
+  /// The stored constant (the same member that equals() compares).
+  const Constant *getPromotedGlobalInit() const { return CVal; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  void print(raw_ostream &O) const override;
+
+  /// isa/cast/dyn_cast support: this class covers the global-value,
+  /// block-address, LSDA and promoted-global kinds.
+  static bool classof(const ARMConstantPoolValue *APV) {
+    if (APV->isGlobalValue() || APV->isBlockAddress())
+      return true;
+    return APV->isLSDA() || APV->isPromotedGlobal();
+  }
+
+  /// Same constant pointer and equal base-class fields.
+  bool equals(const ARMConstantPoolConstant *A) const {
+    if (CVal != A->CVal)
+      return false;
+    return ARMConstantPoolValue::equals(A);
+  }
+};
+
+/// ARMConstantPoolSymbol - ARM-specific constantpool values for external
+/// symbols.
+class ARMConstantPoolSymbol : public ARMConstantPoolValue {
+  const std::string S; // ExtSymbol being loaded.
+
+  // Construction goes through the static Create() below.
+  ARMConstantPoolSymbol(LLVMContext &C, StringRef s, unsigned id,
+                        unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                        bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolSymbol *Create(LLVMContext &C, StringRef s,
+                                       unsigned ID, unsigned char PCAdj);
+
+  /// View of the stored symbol name; it refers to this object's own string.
+  StringRef getSymbol() const { return S; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV) override;
+
+  void print(raw_ostream &O) const override;
+
+  /// isa/cast/dyn_cast support: this class covers the external-symbol kind.
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isExtSymbol();
+  }
+
+  /// Same symbol text and equal base-class fields.
+  bool equals(const ARMConstantPoolSymbol *A) const {
+    if (!(S == A->S))
+      return false;
+    return ARMConstantPoolValue::equals(A);
+  }
+};
+
+/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
+/// block.
+class ARMConstantPoolMBB : public ARMConstantPoolValue {
+  const MachineBasicBlock *MBB; // Machine basic block.
+
+  // Construction goes through the static Create() below.
+  ARMConstantPoolMBB(LLVMContext &C, const MachineBasicBlock *mbb,
+                     unsigned id, unsigned char PCAdj,
+                     ARMCP::ARMCPModifier Modifier, bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolMBB *Create(LLVMContext &C,
+                                    const MachineBasicBlock *mbb, unsigned ID,
+                                    unsigned char PCAdj);
+
+  /// The basic block this entry refers to.
+  const MachineBasicBlock *getMBB() const { return MBB; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV) override;
+
+  void print(raw_ostream &O) const override;
+
+  /// isa/cast/dyn_cast support: this class covers the basic-block kind.
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isMachineBasicBlock();
+  }
+
+  /// Same block pointer and equal base-class fields.
+  bool equals(const ARMConstantPoolMBB *A) const {
+    if (MBB != A->MBB)
+      return false;
+    return ARMConstantPoolValue::equals(A);
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
new file mode 100644
index 000000000000..95fcc8dcb453
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -0,0 +1,1686 @@
+//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, and other late
+// optimizations. This pass should be run after register allocation but before
+// the post-regalloc scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-pseudo"
+
+// Hidden boolean command-line option "verify-arm-pseudo-expand".
+static cl::opt<bool> VerifyARMPseudo(
+    "verify-arm-pseudo-expand", cl::Hidden,
+    cl::desc("Verify machine code after expanding ARM pseudos"));
+
+namespace {
+  /// Machine function pass that expands ARM pseudo instructions. It declares
+  /// the NoVRegs machine function property as a requirement.
+  class ARMExpandPseudo : public MachineFunctionPass {
+  public:
+    static char ID;
+    ARMExpandPseudo() : MachineFunctionPass(ID) {}
+
+    // Per-function state; not initialised by the constructor.
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      MachineFunctionProperties Required;
+      Required.set(MachineFunctionProperties::Property::NoVRegs);
+      return Required;
+    }
+
+    StringRef getPassName() const override {
+      return "ARM pseudo instruction expansion pass";
+    }
+
+  private:
+    // Helper for moving implicit operands onto the expanded instructions.
+    void TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
+                        MachineInstrBuilder &DefMI);
+
+    // Per-instruction and per-block drivers.
+    bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  MachineBasicBlock::iterator &NextMBBI);
+    bool ExpandMBB(MachineBasicBlock &MBB);
+
+    // Expanders for individual pseudo families.
+    void ExpandVLD(MachineBasicBlock::iterator &MBBI);
+    void ExpandVST(MachineBasicBlock::iterator &MBBI);
+    void ExpandLaneOp(MachineBasicBlock::iterator &MBBI);
+    void ExpandVTBL(MachineBasicBlock::iterator &MBBI, unsigned Opc,
+                    bool IsExt);
+    void ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator &MBBI);
+    bool ExpandCMP_SWAP(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI, unsigned LdrexOp,
+                        unsigned StrexOp, unsigned UxtOp,
+                        MachineBasicBlock::iterator &NextMBBI);
+
+    bool ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           MachineBasicBlock::iterator &NextMBBI);
+  };
+  char ARMExpandPseudo::ID = 0;
+}
+
+/// TransferImpOps - Move the implicit operands of the pseudo instruction onto
+/// the instructions produced by its expansion. Implicit uses are appended to
+/// \p UseMI and every other implicit operand to \p DefMI; both builders may
+/// refer to the same instruction.
+void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
+                                     MachineInstrBuilder &UseMI,
+                                     MachineInstrBuilder &DefMI) {
+  // Operands past the count given by the instruction description are the
+  // ones being transferred.
+  const unsigned FirstImpOp = OldMI.getDesc().getNumOperands();
+  const unsigned NumOps = OldMI.getNumOperands();
+  for (unsigned OpNo = FirstImpOp; OpNo != NumOps; ++OpNo) {
+    const MachineOperand &ImpOp = OldMI.getOperand(OpNo);
+    assert(ImpOp.isReg() && ImpOp.getReg());
+    MachineInstrBuilder &Receiver = ImpOp.isUse() ? UseMI : DefMI;
+    Receiver.addOperand(ImpOp);
+  }
+}
+
+namespace {
+ // Constants for register spacing in NEON load/store instructions.
+ // For quad-register load-lane and store-lane pseudo instructions, the
+ // spacing is initially assumed to be EvenDblSpc, and that is changed to
+ // OddDblSpc depending on the lane number operand.
+ enum NEONRegSpacing {
+ SingleSpc, // GetDSubRegs picks dsub_0, dsub_1, dsub_2, dsub_3.
+ EvenDblSpc, // GetDSubRegs picks dsub_0, dsub_2, dsub_4, dsub_6.
+ OddDblSpc // GetDSubRegs picks dsub_1, dsub_3, dsub_5, dsub_7.
+ };
+
+ // Entries for NEON load/store information table. The table is sorted by
+ // PseudoOpc for fast binary-search lookups.
+ struct NEONLdStTableEntry {
+ uint16_t PseudoOpc; // Lookup key: opcode of the pseudo instruction.
+ uint16_t RealOpc; // Opcode of the instruction built in its place.
+ bool IsLoad; // ExpandVLD asserts that this is set.
+ bool isUpdating; // ExpandVLD copies one extra operand after the defs when set.
+ bool hasWritebackOperand; // ExpandVLD copies the am6offset operand when set.
+ uint8_t RegSpacing; // One of type NEONRegSpacing
+ uint8_t NumRegs; // D registers loaded or stored
+ uint8_t RegElts; // elements per D register; used for lane ops
+ // FIXME: Temporary flag to denote whether the real instruction takes
+ // a single register (like the encoding) or all of the registers in
+ // the list (like the asm syntax and the isel DAG). When all definitions
+ // are converted to take only the single encoded register, this will
+ // go away.
+ bool copyAllListRegs;
+
+ // Comparison methods for binary search of the table.
+ bool operator<(const NEONLdStTableEntry &TE) const {
+ return PseudoOpc < TE.PseudoOpc;
+ }
+ friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) {
+ return TE.PseudoOpc < PseudoOpc;
+ }
+ friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc,
+ const NEONLdStTableEntry &TE) {
+ return PseudoOpc < TE.PseudoOpc;
+ }
+ };
+}
+
+static const NEONLdStTableEntry NEONLdStTable[] = { // Columns: PseudoOpc, RealOpc, IsLoad, isUpdating, hasWritebackOperand, RegSpacing, NumRegs, RegElts, copyAllListRegs. Rows must stay sorted by PseudoOpc (checked by LookupNEONLdSt in asserts builds).
+{ ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, false, EvenDblSpc, 1, 4 ,true},
+{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, true, EvenDblSpc, 1, 4 ,true},
+{ ARM::VLD1LNq32Pseudo, ARM::VLD1LNd32, true, false, false, EvenDblSpc, 1, 2 ,true},
+{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true, true, EvenDblSpc, 1, 2 ,true},
+{ ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, false, EvenDblSpc, 1, 8 ,true},
+{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true},
+
+{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false},
+{ ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false},
+{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false},
+{ ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false},
+
+{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true},
+{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true},
+{ ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, false, SingleSpc, 2, 2 ,true},
+{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, true, SingleSpc, 2, 2 ,true},
+{ ARM::VLD2LNd8Pseudo, ARM::VLD2LNd8, true, false, false, SingleSpc, 2, 8 ,true},
+{ ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd8_UPD, true, true, true, SingleSpc, 2, 8 ,true},
+{ ARM::VLD2LNq16Pseudo, ARM::VLD2LNq16, true, false, false, EvenDblSpc, 2, 4 ,true},
+{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, true, EvenDblSpc, 2, 4 ,true},
+{ ARM::VLD2LNq32Pseudo, ARM::VLD2LNq32, true, false, false, EvenDblSpc, 2, 2 ,true},
+{ ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, true, EvenDblSpc, 2, 2 ,true},
+
+{ ARM::VLD2q16Pseudo, ARM::VLD2q16, true, false, false, SingleSpc, 4, 4 ,false},
+{ ARM::VLD2q16PseudoWB_fixed, ARM::VLD2q16wb_fixed, true, true, false, SingleSpc, 4, 4 ,false},
+{ ARM::VLD2q16PseudoWB_register, ARM::VLD2q16wb_register, true, true, true, SingleSpc, 4, 4 ,false},
+{ ARM::VLD2q32Pseudo, ARM::VLD2q32, true, false, false, SingleSpc, 4, 2 ,false},
+{ ARM::VLD2q32PseudoWB_fixed, ARM::VLD2q32wb_fixed, true, true, false, SingleSpc, 4, 2 ,false},
+{ ARM::VLD2q32PseudoWB_register, ARM::VLD2q32wb_register, true, true, true, SingleSpc, 4, 2 ,false},
+{ ARM::VLD2q8Pseudo, ARM::VLD2q8, true, false, false, SingleSpc, 4, 8 ,false},
+{ ARM::VLD2q8PseudoWB_fixed, ARM::VLD2q8wb_fixed, true, true, false, SingleSpc, 4, 8 ,false},
+{ ARM::VLD2q8PseudoWB_register, ARM::VLD2q8wb_register, true, true, true, SingleSpc, 4, 8 ,false},
+
+{ ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd16, true, false, false, SingleSpc, 3, 4,true},
+{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, true, SingleSpc, 3, 4,true},
+{ ARM::VLD3DUPd32Pseudo, ARM::VLD3DUPd32, true, false, false, SingleSpc, 3, 2,true},
+{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true, SingleSpc, 3, 2,true},
+{ ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, false, SingleSpc, 3, 8,true},
+{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true},
+
+{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true},
+{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true},
+{ ARM::VLD3LNd32Pseudo, ARM::VLD3LNd32, true, false, false, SingleSpc, 3, 2 ,true},
+{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, true, SingleSpc, 3, 2 ,true},
+{ ARM::VLD3LNd8Pseudo, ARM::VLD3LNd8, true, false, false, SingleSpc, 3, 8 ,true},
+{ ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd8_UPD, true, true, true, SingleSpc, 3, 8 ,true},
+{ ARM::VLD3LNq16Pseudo, ARM::VLD3LNq16, true, false, false, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, true, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3LNq32Pseudo, ARM::VLD3LNq32, true, false, false, EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, true, EvenDblSpc, 3, 2 ,true},
+
+{ ARM::VLD3d16Pseudo, ARM::VLD3d16, true, false, false, SingleSpc, 3, 4 ,true},
+{ ARM::VLD3d16Pseudo_UPD, ARM::VLD3d16_UPD, true, true, true, SingleSpc, 3, 4 ,true},
+{ ARM::VLD3d32Pseudo, ARM::VLD3d32, true, false, false, SingleSpc, 3, 2 ,true},
+{ ARM::VLD3d32Pseudo_UPD, ARM::VLD3d32_UPD, true, true, true, SingleSpc, 3, 2 ,true},
+{ ARM::VLD3d8Pseudo, ARM::VLD3d8, true, false, false, SingleSpc, 3, 8 ,true},
+{ ARM::VLD3d8Pseudo_UPD, ARM::VLD3d8_UPD, true, true, true, SingleSpc, 3, 8 ,true},
+
+{ ARM::VLD3q16Pseudo_UPD, ARM::VLD3q16_UPD, true, true, true, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3q16oddPseudo, ARM::VLD3q16, true, false, false, OddDblSpc, 3, 4 ,true},
+{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, true, OddDblSpc, 3, 4 ,true},
+{ ARM::VLD3q32Pseudo_UPD, ARM::VLD3q32_UPD, true, true, true, EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3q32oddPseudo, ARM::VLD3q32, true, false, false, OddDblSpc, 3, 2 ,true},
+{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, true, OddDblSpc, 3, 2 ,true},
+{ ARM::VLD3q8Pseudo_UPD, ARM::VLD3q8_UPD, true, true, true, EvenDblSpc, 3, 8 ,true},
+{ ARM::VLD3q8oddPseudo, ARM::VLD3q8, true, false, false, OddDblSpc, 3, 8 ,true},
+{ ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q8_UPD, true, true, true, OddDblSpc, 3, 8 ,true},
+
+{ ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd16, true, false, false, SingleSpc, 4, 4,true},
+{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, true, SingleSpc, 4, 4,true},
+{ ARM::VLD4DUPd32Pseudo, ARM::VLD4DUPd32, true, false, false, SingleSpc, 4, 2,true},
+{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true, SingleSpc, 4, 2,true},
+{ ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, false, SingleSpc, 4, 8,true},
+{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true},
+
+{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true},
+{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true},
+{ ARM::VLD4LNd32Pseudo, ARM::VLD4LNd32, true, false, false, SingleSpc, 4, 2 ,true},
+{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, true, SingleSpc, 4, 2 ,true},
+{ ARM::VLD4LNd8Pseudo, ARM::VLD4LNd8, true, false, false, SingleSpc, 4, 8 ,true},
+{ ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd8_UPD, true, true, true, SingleSpc, 4, 8 ,true},
+{ ARM::VLD4LNq16Pseudo, ARM::VLD4LNq16, true, false, false, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, true, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4LNq32Pseudo, ARM::VLD4LNq32, true, false, false, EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true, true, EvenDblSpc, 4, 2 ,true},
+
+{ ARM::VLD4d16Pseudo, ARM::VLD4d16, true, false, false, SingleSpc, 4, 4 ,true},
+{ ARM::VLD4d16Pseudo_UPD, ARM::VLD4d16_UPD, true, true, true, SingleSpc, 4, 4 ,true},
+{ ARM::VLD4d32Pseudo, ARM::VLD4d32, true, false, false, SingleSpc, 4, 2 ,true},
+{ ARM::VLD4d32Pseudo_UPD, ARM::VLD4d32_UPD, true, true, true, SingleSpc, 4, 2 ,true},
+{ ARM::VLD4d8Pseudo, ARM::VLD4d8, true, false, false, SingleSpc, 4, 8 ,true},
+{ ARM::VLD4d8Pseudo_UPD, ARM::VLD4d8_UPD, true, true, true, SingleSpc, 4, 8 ,true},
+
+{ ARM::VLD4q16Pseudo_UPD, ARM::VLD4q16_UPD, true, true, true, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4q16oddPseudo, ARM::VLD4q16, true, false, false, OddDblSpc, 4, 4 ,true},
+{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, true, OddDblSpc, 4, 4 ,true},
+{ ARM::VLD4q32Pseudo_UPD, ARM::VLD4q32_UPD, true, true, true, EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4q32oddPseudo, ARM::VLD4q32, true, false, false, OddDblSpc, 4, 2 ,true},
+{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, true, OddDblSpc, 4, 2 ,true},
+{ ARM::VLD4q8Pseudo_UPD, ARM::VLD4q8_UPD, true, true, true, EvenDblSpc, 4, 8 ,true},
+{ ARM::VLD4q8oddPseudo, ARM::VLD4q8, true, false, false, OddDblSpc, 4, 8 ,true},
+{ ARM::VLD4q8oddPseudo_UPD, ARM::VLD4q8_UPD, true, true, true, OddDblSpc, 4, 8 ,true},
+
+{ ARM::VST1LNq16Pseudo, ARM::VST1LNd16, false, false, false, EvenDblSpc, 1, 4 ,true},
+{ ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD, false, true, true, EvenDblSpc, 1, 4 ,true},
+{ ARM::VST1LNq32Pseudo, ARM::VST1LNd32, false, false, false, EvenDblSpc, 1, 2 ,true},
+{ ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD, false, true, true, EvenDblSpc, 1, 2 ,true},
+{ ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, false, EvenDblSpc, 1, 8 ,true},
+{ ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true},
+
+{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false},
+{ ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false},
+{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false},
+{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false},
+{ ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false},
+{ ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false},
+
+{ ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, false, SingleSpc, 2, 4 ,true},
+{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true, SingleSpc, 2, 4 ,true},
+{ ARM::VST2LNd32Pseudo, ARM::VST2LNd32, false, false, false, SingleSpc, 2, 2 ,true},
+{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, true, SingleSpc, 2, 2 ,true},
+{ ARM::VST2LNd8Pseudo, ARM::VST2LNd8, false, false, false, SingleSpc, 2, 8 ,true},
+{ ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd8_UPD, false, true, true, SingleSpc, 2, 8 ,true},
+{ ARM::VST2LNq16Pseudo, ARM::VST2LNq16, false, false, false, EvenDblSpc, 2, 4,true},
+{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, true, EvenDblSpc, 2, 4,true},
+{ ARM::VST2LNq32Pseudo, ARM::VST2LNq32, false, false, false, EvenDblSpc, 2, 2,true},
+{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, true, EvenDblSpc, 2, 2,true},
+
+{ ARM::VST2q16Pseudo, ARM::VST2q16, false, false, false, SingleSpc, 4, 4 ,false},
+{ ARM::VST2q16PseudoWB_fixed, ARM::VST2q16wb_fixed, false, true, false, SingleSpc, 4, 4 ,false},
+{ ARM::VST2q16PseudoWB_register, ARM::VST2q16wb_register, false, true, true, SingleSpc, 4, 4 ,false},
+{ ARM::VST2q32Pseudo, ARM::VST2q32, false, false, false, SingleSpc, 4, 2 ,false},
+{ ARM::VST2q32PseudoWB_fixed, ARM::VST2q32wb_fixed, false, true, false, SingleSpc, 4, 2 ,false},
+{ ARM::VST2q32PseudoWB_register, ARM::VST2q32wb_register, false, true, true, SingleSpc, 4, 2 ,false},
+{ ARM::VST2q8Pseudo, ARM::VST2q8, false, false, false, SingleSpc, 4, 8 ,false},
+{ ARM::VST2q8PseudoWB_fixed, ARM::VST2q8wb_fixed, false, true, false, SingleSpc, 4, 8 ,false},
+{ ARM::VST2q8PseudoWB_register, ARM::VST2q8wb_register, false, true, true, SingleSpc, 4, 8 ,false},
+
+{ ARM::VST3LNd16Pseudo, ARM::VST3LNd16, false, false, false, SingleSpc, 3, 4 ,true},
+{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, true, SingleSpc, 3, 4 ,true},
+{ ARM::VST3LNd32Pseudo, ARM::VST3LNd32, false, false, false, SingleSpc, 3, 2 ,true},
+{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, true, SingleSpc, 3, 2 ,true},
+{ ARM::VST3LNd8Pseudo, ARM::VST3LNd8, false, false, false, SingleSpc, 3, 8 ,true},
+{ ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd8_UPD, false, true, true, SingleSpc, 3, 8 ,true},
+{ ARM::VST3LNq16Pseudo, ARM::VST3LNq16, false, false, false, EvenDblSpc, 3, 4,true},
+{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, true, EvenDblSpc, 3, 4,true},
+{ ARM::VST3LNq32Pseudo, ARM::VST3LNq32, false, false, false, EvenDblSpc, 3, 2,true},
+{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, true, EvenDblSpc, 3, 2,true},
+
+{ ARM::VST3d16Pseudo, ARM::VST3d16, false, false, false, SingleSpc, 3, 4 ,true},
+{ ARM::VST3d16Pseudo_UPD, ARM::VST3d16_UPD, false, true, true, SingleSpc, 3, 4 ,true},
+{ ARM::VST3d32Pseudo, ARM::VST3d32, false, false, false, SingleSpc, 3, 2 ,true},
+{ ARM::VST3d32Pseudo_UPD, ARM::VST3d32_UPD, false, true, true, SingleSpc, 3, 2 ,true},
+{ ARM::VST3d8Pseudo, ARM::VST3d8, false, false, false, SingleSpc, 3, 8 ,true},
+{ ARM::VST3d8Pseudo_UPD, ARM::VST3d8_UPD, false, true, true, SingleSpc, 3, 8 ,true},
+
+{ ARM::VST3q16Pseudo_UPD, ARM::VST3q16_UPD, false, true, true, EvenDblSpc, 3, 4 ,true},
+{ ARM::VST3q16oddPseudo, ARM::VST3q16, false, false, false, OddDblSpc, 3, 4 ,true},
+{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, true, OddDblSpc, 3, 4 ,true},
+{ ARM::VST3q32Pseudo_UPD, ARM::VST3q32_UPD, false, true, true, EvenDblSpc, 3, 2 ,true},
+{ ARM::VST3q32oddPseudo, ARM::VST3q32, false, false, false, OddDblSpc, 3, 2 ,true},
+{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, true, OddDblSpc, 3, 2 ,true},
+{ ARM::VST3q8Pseudo_UPD, ARM::VST3q8_UPD, false, true, true, EvenDblSpc, 3, 8 ,true},
+{ ARM::VST3q8oddPseudo, ARM::VST3q8, false, false, false, OddDblSpc, 3, 8 ,true},
+{ ARM::VST3q8oddPseudo_UPD, ARM::VST3q8_UPD, false, true, true, OddDblSpc, 3, 8 ,true},
+
+{ ARM::VST4LNd16Pseudo, ARM::VST4LNd16, false, false, false, SingleSpc, 4, 4 ,true},
+{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, true, SingleSpc, 4, 4 ,true},
+{ ARM::VST4LNd32Pseudo, ARM::VST4LNd32, false, false, false, SingleSpc, 4, 2 ,true},
+{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, true, SingleSpc, 4, 2 ,true},
+{ ARM::VST4LNd8Pseudo, ARM::VST4LNd8, false, false, false, SingleSpc, 4, 8 ,true},
+{ ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd8_UPD, false, true, true, SingleSpc, 4, 8 ,true},
+{ ARM::VST4LNq16Pseudo, ARM::VST4LNq16, false, false, false, EvenDblSpc, 4, 4,true},
+{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, true, EvenDblSpc, 4, 4,true},
+{ ARM::VST4LNq32Pseudo, ARM::VST4LNq32, false, false, false, EvenDblSpc, 4, 2,true},
+{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, true, EvenDblSpc, 4, 2,true},
+
+{ ARM::VST4d16Pseudo, ARM::VST4d16, false, false, false, SingleSpc, 4, 4 ,true},
+{ ARM::VST4d16Pseudo_UPD, ARM::VST4d16_UPD, false, true, true, SingleSpc, 4, 4 ,true},
+{ ARM::VST4d32Pseudo, ARM::VST4d32, false, false, false, SingleSpc, 4, 2 ,true},
+{ ARM::VST4d32Pseudo_UPD, ARM::VST4d32_UPD, false, true, true, SingleSpc, 4, 2 ,true},
+{ ARM::VST4d8Pseudo, ARM::VST4d8, false, false, false, SingleSpc, 4, 8 ,true},
+{ ARM::VST4d8Pseudo_UPD, ARM::VST4d8_UPD, false, true, true, SingleSpc, 4, 8 ,true},
+
+{ ARM::VST4q16Pseudo_UPD, ARM::VST4q16_UPD, false, true, true, EvenDblSpc, 4, 4 ,true},
+{ ARM::VST4q16oddPseudo, ARM::VST4q16, false, false, false, OddDblSpc, 4, 4 ,true},
+{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, true, OddDblSpc, 4, 4 ,true},
+{ ARM::VST4q32Pseudo_UPD, ARM::VST4q32_UPD, false, true, true, EvenDblSpc, 4, 2 ,true},
+{ ARM::VST4q32oddPseudo, ARM::VST4q32, false, false, false, OddDblSpc, 4, 2 ,true},
+{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true, true, OddDblSpc, 4, 2 ,true},
+{ ARM::VST4q8Pseudo_UPD, ARM::VST4q8_UPD, false, true, true, EvenDblSpc, 4, 8 ,true},
+{ ARM::VST4q8oddPseudo, ARM::VST4q8, false, false, false, OddDblSpc, 4, 8 ,true},
+{ ARM::VST4q8oddPseudo_UPD, ARM::VST4q8_UPD, false, true, true, OddDblSpc, 4, 8 ,true}
+};
+
+/// LookupNEONLdSt - Binary-search NEONLdStTable for the entry whose PseudoOpc
+/// equals \p Opcode. Returns a pointer to that entry, or nullptr when the
+/// table has no such entry. In builds without NDEBUG the first call also
+/// asserts that the table is sorted.
+static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
+  const NEONLdStTableEntry *const TableBegin = std::begin(NEONLdStTable);
+  const NEONLdStTableEntry *const TableEnd = std::end(NEONLdStTable);
+
+#ifndef NDEBUG
+  // Make sure the table is sorted.
+  static bool TableChecked = false;
+  if (!TableChecked) {
+    assert(std::is_sorted(TableBegin, TableEnd) &&
+           "NEONLdStTable is not sorted!");
+    TableChecked = true;
+  }
+#endif
+
+  const NEONLdStTableEntry *Found =
+      std::lower_bound(TableBegin, TableEnd, Opcode);
+  if (Found == TableEnd || Found->PseudoOpc != Opcode)
+    return nullptr;
+  return Found;
+}
+
+/// GetDSubRegs - Fetch four D subregisters of \p Reg (a Q, QQ, or QQQQ
+/// register) into \p D0 .. \p D3, choosing the subregister indices from the
+/// requested spacing. Not every result is necessarily valid, e.g., a Q
+/// register only has 2 D subregisters.
+static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
+                        const TargetRegisterInfo *TRI, unsigned &D0,
+                        unsigned &D1, unsigned &D2, unsigned &D3) {
+  // Pick the four subregister indices first, then do the lookups once.
+  unsigned Idx0, Idx1, Idx2, Idx3;
+  switch (RegSpc) {
+  case SingleSpc:
+    Idx0 = ARM::dsub_0;
+    Idx1 = ARM::dsub_1;
+    Idx2 = ARM::dsub_2;
+    Idx3 = ARM::dsub_3;
+    break;
+  case EvenDblSpc:
+    Idx0 = ARM::dsub_0;
+    Idx1 = ARM::dsub_2;
+    Idx2 = ARM::dsub_4;
+    Idx3 = ARM::dsub_6;
+    break;
+  default:
+    assert(RegSpc == OddDblSpc && "unknown register spacing");
+    Idx0 = ARM::dsub_1;
+    Idx1 = ARM::dsub_3;
+    Idx2 = ARM::dsub_5;
+    Idx3 = ARM::dsub_7;
+    break;
+  }
+
+  D0 = TRI->getSubReg(Reg, Idx0);
+  D1 = TRI->getSubReg(Reg, Idx1);
+  D2 = TRI->getSubReg(Reg, Idx2);
+  D3 = TRI->getSubReg(Reg, Idx3);
+}
+
+/// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register
+/// operands to real VLD instructions with D register operands. The pseudo at \p MBBI is erased once the replacement has been built.
+void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+ assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+ unsigned NumRegs = TableEntry->NumRegs;
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(TableEntry->RealOpc)); // Real instruction is inserted before the pseudo.
+ unsigned OpIdx = 0; // Index of the next pseudo operand to consume.
+
+ bool DstIsDead = MI.getOperand(OpIdx).isDead();
+ unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+ unsigned D0, D1, D2, D3;
+ GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+ MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); // First D register is always defined.
+ if (NumRegs > 1 && TableEntry->copyAllListRegs) // Remaining D registers only when the real opcode lists them all.
+ MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 2 && TableEntry->copyAllListRegs)
+ MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 3 && TableEntry->copyAllListRegs)
+ MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+
+ if (TableEntry->isUpdating) // Updating forms carry one extra operand here (presumably the written-back base register; confirm against the .td definitions).
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Copy the addrmode6 operands.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ // Copy the am6offset operand.
+ if (TableEntry->hasWritebackOperand)
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // For an instruction writing double-spaced subregs, the pseudo instruction
+ // has an extra operand that is a use of the super-register. Record the
+ // operand index and skip over it.
+ unsigned SrcOpIdx = 0; // 0 doubles as "no super-register source operand".
+ if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc)
+ SrcOpIdx = OpIdx++;
+
+ // Copy the predicate operands.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Copy the super-register source operand used for double-spaced subregs over
+ // to the new instruction as an implicit operand.
+ if (SrcOpIdx != 0) {
+ MachineOperand MO = MI.getOperand(SrcOpIdx); // Work on a copy so the pseudo's operand is left untouched.
+ MO.setImplicit(true);
+ MIB.addOperand(MO);
+ }
+ // Add an implicit def for the super-register.
+ MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+ TransferImpOps(MI, MIB, MIB); // Implicit uses and defs all land on the single new instruction.
+
+ // Transfer memoperands.
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+ MI.eraseFromParent();
+}
+
+/// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register
+/// operands to real VST instructions with D register operands. \p MBBI points at the pseudo, which is erased.
+void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); // Table entry looked up by the pseudo opcode.
+ assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+ unsigned NumRegs = TableEntry->NumRegs;
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(TableEntry->RealOpc));
+ unsigned OpIdx = 0; // Index of the next operand of the pseudo to consume.
+ if (TableEntry->isUpdating) // Updating forms carry one extra leading operand (presumably the written-back address - confirm).
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Copy the addrmode6 operands.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ // Copy the am6offset operand.
+ if (TableEntry->hasWritebackOperand)
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+ bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
+ unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+ unsigned D0, D1, D2, D3;
+ GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3);
+ MIB.addReg(D0, getUndefRegState(SrcIsUndef)); // Only the undef flag is propagated to the sub-registers; kill is handled on the super-register below.
+ if (NumRegs > 1 && TableEntry->copyAllListRegs) // Further list registers are explicit only when the entry sets copyAllListRegs.
+ MIB.addReg(D1, getUndefRegState(SrcIsUndef));
+ if (NumRegs > 2 && TableEntry->copyAllListRegs)
+ MIB.addReg(D2, getUndefRegState(SrcIsUndef));
+ if (NumRegs > 3 && TableEntry->copyAllListRegs)
+ MIB.addReg(D3, getUndefRegState(SrcIsUndef));
+
+ // Copy the predicate operands.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg.
+ MIB->addRegisterKilled(SrcReg, TRI, true);
+ else if (!SrcIsUndef)
+ MIB.addReg(SrcReg, RegState::Implicit); // Add implicit uses for src reg.
+ TransferImpOps(MI, MIB, MIB);
+
+ // Transfer memoperands.
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+ MI.eraseFromParent();
+}
+
+/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ
+/// register operands to real instructions with D register operands. \p MBBI points at the pseudo, which is erased.
+void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); // Table entry looked up by the pseudo opcode.
+ assert(TableEntry && "NEONLdStTable lookup failed");
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+ unsigned NumRegs = TableEntry->NumRegs;
+ unsigned RegElts = TableEntry->RegElts; // Upper bound for a lane index within one register (presumably elements per D register - confirm).
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(TableEntry->RealOpc));
+ unsigned OpIdx = 0; // Index of the next operand of the pseudo to consume.
+ // The lane operand is always the 3rd from last operand, before the 2
+ // predicate operands.
+ unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm();
+
+ // Adjust the lane and spacing as needed for Q registers.
+ assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane");
+ if (RegSpc == EvenDblSpc && Lane >= RegElts) { // A lane at or past RegElts is remapped onto the odd-spaced registers.
+ RegSpc = OddDblSpc;
+ Lane -= RegElts;
+ }
+ assert(Lane < RegElts && "out of range lane for VLD/VST-lane");
+
+ unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0;
+ unsigned DstReg = 0;
+ bool DstIsDead = false;
+ if (TableEntry->IsLoad) { // Loads define the D sub-registers of the destination first.
+ DstIsDead = MI.getOperand(OpIdx).isDead();
+ DstReg = MI.getOperand(OpIdx++).getReg();
+ GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+ MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 1)
+ MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 2)
+ MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 3)
+ MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+ }
+
+ if (TableEntry->isUpdating) // Updating forms carry one extra operand here (presumably the written-back address - confirm).
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Copy the addrmode6 operands.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ // Copy the am6offset operand.
+ if (TableEntry->hasWritebackOperand)
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Grab the super-register source.
+ MachineOperand MO = MI.getOperand(OpIdx++); // Copied by value so it can be made implicit below without touching MI.
+ if (!TableEntry->IsLoad) // For loads D0..D3 were already derived from the destination register above.
+ GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3);
+
+ // Add the subregs as sources of the new instruction.
+ unsigned SrcFlags = (getUndefRegState(MO.isUndef()) |
+ getKillRegState(MO.isKill()));
+ MIB.addReg(D0, SrcFlags);
+ if (NumRegs > 1)
+ MIB.addReg(D1, SrcFlags);
+ if (NumRegs > 2)
+ MIB.addReg(D2, SrcFlags);
+ if (NumRegs > 3)
+ MIB.addReg(D3, SrcFlags);
+
+ // Add the lane number operand.
+ MIB.addImm(Lane);
+ OpIdx += 1; // Step over the pseudo's own lane operand; the possibly adjusted Lane was added instead.
+
+ // Copy the predicate operands.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Copy the super-register source to be an implicit source.
+ MO.setImplicit(true);
+ MIB.addOperand(MO);
+ if (TableEntry->IsLoad)
+ // Add an implicit def for the super-register.
+ MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+ TransferImpOps(MI, MIB, MIB);
+ // Transfer memoperands.
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MI.eraseFromParent();
+}
+
+/// ExpandVTBL - Replace a VTBL or VTBX pseudo instruction that takes a Q or QQ
+/// register operand with the real instruction \p Opc, which takes D register
+/// operands. When \p IsExt is set, one additional operand following the
+/// destination is copied as well. The pseudo at \p MBBI is erased.
+void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                                 unsigned Opc, bool IsExt) {
+  MachineInstr &Pseudo = *MBBI;
+  MachineBasicBlock &Block = *Pseudo.getParent();
+  MachineInstrBuilder Real =
+      BuildMI(Block, MBBI, Pseudo.getDebugLoc(), TII->get(Opc));
+
+  // Destination register operand, plus the extra operand of the IsExt form.
+  unsigned NextOp = 0;
+  const unsigned NumLeading = IsExt ? 2 : 1;
+  while (NextOp != NumLeading)
+    Real.addOperand(Pseudo.getOperand(NextOp++));
+
+  // The super-register source: only its first D sub-register becomes an
+  // explicit operand of the real instruction.
+  const MachineOperand &SuperOp = Pseudo.getOperand(NextOp++);
+  const bool SuperIsKill = SuperOp.isKill();
+  const unsigned SuperReg = SuperOp.getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(SuperReg, SingleSpc, TRI, D0, D1, D2, D3);
+  Real.addReg(D0);
+
+  // The other source register operand followed by the two predicate operands.
+  for (unsigned Copied = 0; Copied != 3; ++Copied)
+    Real.addOperand(Pseudo.getOperand(NextOp++));
+
+  // Add an implicit use (and kill, if the pseudo killed it) of the super-reg.
+  Real.addReg(SuperReg, RegState::Implicit | getKillRegState(SuperIsKill));
+  TransferImpOps(Pseudo, Real, Real);
+  Pseudo.eraseFromParent();
+}
+
+/// Return true unless \p MO is certainly not a symbol reference.
+static bool IsAnAddressOperand(const MachineOperand &MO) {
+  // This check is overly conservative. Unless we are certain that the machine
+  // operand is not a symbol reference, we return that it is a symbol reference.
+  // This is important as the load pair may not be split up on Windows.
+  switch (MO.getType()) {
+  // Operand kinds that are known not to be symbol references.
+  case MachineOperand::MO_Register:
+  case MachineOperand::MO_Immediate:
+  case MachineOperand::MO_CImmediate:
+  case MachineOperand::MO_FPImmediate:
+  case MachineOperand::MO_FrameIndex:
+  case MachineOperand::MO_RegisterMask:
+  case MachineOperand::MO_RegisterLiveOut:
+  case MachineOperand::MO_CFIIndex:
+    return false;
+
+  // Everything else that can appear here is treated as an address.
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_TargetIndex:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_BlockAddress:
+  case MachineOperand::MO_Metadata:
+  case MachineOperand::MO_MCSymbol:
+    return true;
+
+  case MachineOperand::MO_IntrinsicID:
+  case MachineOperand::MO_Predicate:
+    llvm_unreachable("should not exist post-isel");
+  }
+  llvm_unreachable("unhandled machine operand type");
+}
+
+void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI; // The 32-bit move pseudo; it is replaced by two real instructions (LO16, HI16) and erased.
+ unsigned Opcode = MI.getOpcode();
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
+ const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1); // The value to materialize: operand 2 for the conditional forms, operand 1 otherwise.
+ bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO); // On Windows, address-like operands get the pair bundled (see finalizeBundle below).
+ MachineInstrBuilder LO16, HI16;
+
+ if (!STI->hasV6T2Ops() &&
+ (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) {
+ // FIXME Windows CE supports older ARM CPUs
+ assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+");
+
+ // Expand into a movi + orr.
+ LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg);
+ HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg);
+
+ assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!");
+ unsigned ImmVal = (unsigned)MO.getImm();
+ unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); // The immediate is split into two parts, one per instruction.
+ unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+ LO16 = LO16.addImm(SOImmValV1);
+ HI16 = HI16.addImm(SOImmValV2);
+ LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ LO16.addImm(Pred).addReg(PredReg).addReg(0); // Predicate operands, then a null register (presumably the optional 's' bit operand - confirm).
+ HI16.addImm(Pred).addReg(PredReg).addReg(0);
+ TransferImpOps(MI, LO16, HI16);
+ MI.eraseFromParent();
+ return;
+ }
+
+ unsigned LO16Opc = 0;
+ unsigned HI16Opc = 0;
+ if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) { // Thumb2 pseudos use the t2 opcodes, everything else the ARM ones.
+ LO16Opc = ARM::t2MOVi16;
+ HI16Opc = ARM::t2MOVTi16;
+ } else {
+ LO16Opc = ARM::MOVi16;
+ HI16Opc = ARM::MOVTi16;
+ }
+
+ LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg);
+ HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg);
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Immediate: {
+ unsigned Imm = MO.getImm();
+ unsigned Lo16 = Imm & 0xffff; // Low half goes to the first instruction, high half to the second.
+ unsigned Hi16 = (Imm >> 16) & 0xffff;
+ LO16 = LO16.addImm(Lo16);
+ HI16 = HI16.addImm(Hi16);
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ const char *ES = MO.getSymbolName();
+ unsigned TF = MO.getTargetFlags();
+ LO16 = LO16.addExternalSymbol(ES, TF | ARMII::MO_LO16); // Same symbol on both instructions, tagged with the LO16 / HI16 target flag.
+ HI16 = HI16.addExternalSymbol(ES, TF | ARMII::MO_HI16);
+ break;
+ }
+ default: { // Every other operand type is handled as a global address.
+ const GlobalValue *GV = MO.getGlobal();
+ unsigned TF = MO.getTargetFlags();
+ LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
+ HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
+ break;
+ }
+ }
+
+ LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ LO16.addImm(Pred).addReg(PredReg);
+ HI16.addImm(Pred).addReg(PredReg);
+
+ if (RequiresBundling)
+ finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator()); // Bundle covers the instructions from LO16 up to, but not including, the pseudo.
+
+ TransferImpOps(MI, LO16, HI16);
+ MI.eraseFromParent();
+}
+
+/// Add every register currently recorded in \p LiveRegs to the live-in list of
+/// \p MBB.
+static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
+  for (const auto &LiveReg : LiveRegs)
+    MBB->addLiveIn(LiveReg);
+}
+
+/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
+/// possible. This only gets used at -O0 so we don't care about efficiency of the
+/// generated code. MBB is split into MBB -> LoadCmpBB -> StoreBB -> DoneBB; NextMBBI is set to MBB.end(). Always returns true.
+bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned LdrexOp, unsigned StrexOp,
+ unsigned UxtOp, // 0 when no extend instruction is to be emitted.
+ MachineBasicBlock::iterator &NextMBBI) {
+ bool IsThumb = STI->isThumb();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0); // Receives the loaded value.
+ unsigned StatusReg = MI.getOperand(1).getReg(); // Receives the strex status.
+ MachineOperand &Addr = MI.getOperand(2);
+ MachineOperand &Desired = MI.getOperand(3); // Value the loaded value is compared against.
+ MachineOperand &New = MI.getOperand(4); // Value stored when the comparison succeeds.
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I) // Step back from the end of MBB to just after MI: LiveRegs = registers live after the pseudo.
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB); // Lay the three new blocks out directly after MBB, in this order.
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ if (UxtOp) { // Rewrite Desired in place with UxtOp before the loop (presumably so narrow values compare equal to the loaded value - confirm).
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg())
+ .addReg(Desired.getReg(), RegState::Kill);
+ if (!IsThumb)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
+ }
+
+ // .Lloadcmp:
+ // ldrex rDest, [rAddr]
+ // cmp rDest, rDesired
+ // bne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(Dest.getReg());
+ LoadCmpBB->addLiveIn(Desired.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg());
+ MIB.addReg(Addr.getReg());
+ if (LdrexOp == ARM::t2LDREX)
+ MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset.
+ AddDefaultPred(MIB);
+
+ unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
+ AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+ .addOperand(Desired));
+ unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
+ BuildMI(LoadCmpBB, DL, TII->get(Bcc))
+ .addMBB(DoneBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // strex rStatus, rNew, [rAddr]
+ // cmp rStatus, #0
+ // bne .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(New.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+
+
+ MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg);
+ MIB.addOperand(New);
+ MIB.addOperand(Addr);
+ if (StrexOp == ARM::t2STREX)
+ MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
+ AddDefaultPred(MIB);
+
+ unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+ AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(StatusReg, RegState::Kill)
+ .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(Bcc))
+ .addMBB(LoadCmpBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); // Move MI and everything after it out of MBB into DoneBB.
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB); // MBB's only successor is now the loop entry.
+
+ NextMBBI = MBB.end(); // MBB now ends where the pseudo used to be.
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Append the 64-bit register operand \p Reg of an ldrexd/strexd to \p MIB.
+/// ARM's ldrexd/strexd take a consecutive register pair (represented as a
+/// single GPRPair register), so \p Reg is added as is; Thumb's take two
+/// separate registers, so its gsub_0 and gsub_1 sub-registers are added
+/// instead. \p Flags is applied to every register added, together with a kill
+/// flag when \p Reg is marked dead.
+static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
+                                unsigned Flags, bool IsThumb,
+                                const TargetRegisterInfo *TRI) {
+  const unsigned OperandFlags = Flags | getKillRegState(Reg.isDead());
+
+  if (!IsThumb) {
+    MIB.addReg(Reg.getReg(), OperandFlags);
+    return;
+  }
+
+  const unsigned LoHalf = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
+  const unsigned HiHalf = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
+  MIB.addReg(LoHalf, OperandFlags);
+  MIB.addReg(HiHalf, OperandFlags);
+}
+
+/// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop. MBB is split into MBB -> LoadCmpBB -> StoreBB -> DoneBB; NextMBBI is set to MBB.end(). Always returns true.
+bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ bool IsThumb = STI->isThumb();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0); // Receives the loaded 64-bit value.
+ unsigned StatusReg = MI.getOperand(1).getReg(); // Receives the strexd status.
+ MachineOperand &Addr = MI.getOperand(2);
+ MachineOperand &Desired = MI.getOperand(3); // Value the loaded value is compared against.
+ MachineOperand &New = MI.getOperand(4); // Value stored when the comparison succeeds.
+
+ unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); // Halves of the 64-bit operands, compared separately below.
+ unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
+ unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0);
+ unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1);
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I) // Step back from the end of MBB to just after MI: LiveRegs = registers live after the pseudo.
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB); // Lay the three new blocks out directly after MBB, in this order.
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ // .Lloadcmp:
+ // ldrexd rDestLo, rDestHi, [rAddr]
+ // cmp rDestLo, rDesiredLo
+ // cmpeq rDestHi, rDesiredHi
+ // bne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(Dest.getReg());
+ LoadCmpBB->addLiveIn(Desired.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
+ addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
+ MIB.addReg(Addr.getReg());
+ AddDefaultPred(MIB);
+
+ unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
+ AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(DestLo, getKillRegState(Dest.isDead()))
+ .addReg(DesiredLo, getKillRegState(Desired.isDead())));
+
+ BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) // Second compare is predicated on EQ, so it only runs when the low halves matched.
+ .addReg(DestHi, getKillRegState(Dest.isDead()))
+ .addReg(DesiredHi, getKillRegState(Desired.isDead()))
+ .addImm(ARMCC::EQ).addReg(ARM::CPSR, RegState::Kill);
+
+ unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
+ BuildMI(LoadCmpBB, DL, TII->get(Bcc))
+ .addMBB(DoneBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // strexd rStatus, rNewLo, rNewHi, [rAddr]
+ // cmp rStatus, #0
+ // bne .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(New.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+
+ unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
+ MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg);
+ addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
+ MIB.addOperand(Addr);
+ AddDefaultPred(MIB);
+
+ unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+ AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(StatusReg, RegState::Kill)
+ .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(Bcc))
+ .addMBB(LoadCmpBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); // Move MI and everything after it out of MBB into DoneBB.
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB); // MBB's only successor is now the loop entry.
+
+ NextMBBI = MBB.end(); // MBB now ends where the pseudo used to be.
+ MI.eraseFromParent();
+ return true;
+}
+
+
+bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ return false;
+
+ case ARM::TCRETURNdi:
+ case ARM::TCRETURNri: {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ assert(MBBI->isReturn() &&
+ "Can only insert epilog into returning blocks");
+ unsigned RetOpcode = MBBI->getOpcode();
+ DebugLoc dl = MBBI->getDebugLoc();
+ const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
+ MBB.getParent()->getSubtarget().getInstrInfo());
+
+ // Tail call return: adjust the stack pointer and jump to callee.
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+
+ // Jump to label or value in register.
+ if (RetOpcode == ARM::TCRETURNdi) {
+ unsigned TCOpcode =
+ STI->isThumb()
+ ? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND)
+ : ARM::TAILJMPd;
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
+ if (JumpTarget.isGlobal())
+ MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+ JumpTarget.getTargetFlags());
+ else {
+ assert(JumpTarget.isSymbol());
+ MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+ JumpTarget.getTargetFlags());
+ }
+
+ // Add the default predicate in Thumb mode.
+ if (STI->isThumb())
+ MIB.addImm(ARMCC::AL).addReg(0);
+ } else if (RetOpcode == ARM::TCRETURNri) {
+ BuildMI(MBB, MBBI, dl,
+ TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr))
+ .addReg(JumpTarget.getReg(), RegState::Kill);
+ }
+
+ auto NewMI = std::prev(MBBI);
+ for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+ NewMI->addOperand(MBBI->getOperand(i));
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+ MBBI = NewMI;
+ return true;
+ }
+ case ARM::VMOVScc:
+ case ARM::VMOVDcc: {
+ unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
+ MI.getOperand(1).getReg())
+ .addOperand(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .addOperand(MI.getOperand(4));
+
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::t2MOVCCr:
+ case ARM::MOVCCr: {
+ unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+ MI.getOperand(1).getReg())
+ .addOperand(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .addOperand(MI.getOperand(4))
+ .addReg(0); // 's' bit
+
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::MOVCCsi: {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+ (MI.getOperand(1).getReg()))
+ .addOperand(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm())
+ .addImm(MI.getOperand(4).getImm()) // 'pred'
+ .addOperand(MI.getOperand(5))
+ .addReg(0); // 's' bit
+
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::MOVCCsr: {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
+ (MI.getOperand(1).getReg()))
+ .addOperand(MI.getOperand(2))
+ .addOperand(MI.getOperand(3))
+ .addImm(MI.getOperand(4).getImm())
+ .addImm(MI.getOperand(5).getImm()) // 'pred'
+ .addOperand(MI.getOperand(6))
+ .addReg(0); // 's' bit
+
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::t2MOVCCi16:
+ case ARM::MOVCCi16: {
+ unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+ MI.getOperand(1).getReg())
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .addOperand(MI.getOperand(4));
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::t2MOVCCi:
+ case ARM::MOVCCi: {
+ unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+ MI.getOperand(1).getReg())
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .addOperand(MI.getOperand(4))
+ .addReg(0); // 's' bit
+
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::t2MVNCCi:
+ case ARM::MVNCCi: {
+ unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+ MI.getOperand(1).getReg())
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .addOperand(MI.getOperand(4))
+ .addReg(0); // 's' bit
+
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::t2MOVCClsl:
+ case ARM::t2MOVCClsr:
+ case ARM::t2MOVCCasr:
+ case ARM::t2MOVCCror: {
+ unsigned NewOpc;
+ switch (Opcode) {
+ case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break;
+ case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break;
+ case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break;
+ case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break;
+ default: llvm_unreachable("unexpeced conditional move");
+ }
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+ MI.getOperand(1).getReg())
+ .addOperand(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm())
+ .addImm(MI.getOperand(4).getImm()) // 'pred'
+ .addOperand(MI.getOperand(5))
+ .addReg(0); // 's' bit
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::Int_eh_sjlj_dispatchsetup: {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const ARMBaseInstrInfo *AII =
+ static_cast<const ARMBaseInstrInfo*>(TII);
+ const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+ // For functions using a base pointer, we rematerialize it (via the frame
+ // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it
+ // for us. Otherwise, expand to nothing.
+ if (RI.hasBasePointer(MF)) {
+ int32_t NumBytes = AFI->getFramePtrSpillOffset();
+ unsigned FramePtr = RI.getFrameRegister(MF);
+ assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
+ "base pointer without frame pointer?");
+
+ if (AFI->isThumb2Function()) {
+ emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+ FramePtr, -NumBytes, ARMCC::AL, 0, *TII);
+ } else if (AFI->isThumbFunction()) {
+ emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+ FramePtr, -NumBytes, *TII, RI);
+ } else {
+ emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+ FramePtr, -NumBytes, ARMCC::AL, 0,
+ *TII);
+ }
+ // If there's dynamic realignment, adjust for it.
+ if (RI.needsStackRealignment(MF)) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned MaxAlign = MFI.getMaxAlignment();
+ assert (!AFI->isThumb1OnlyFunction());
+ // Emit bic r6, r6, MaxAlign-1
+ assert(MaxAlign <= 256 && "The BIC instruction cannot encode "
+ "immediates larger than 256 with all lower "
+ "bits set.");
+ unsigned bicOpc = AFI->isThumbFunction() ?
+ ARM::t2BICri : ARM::BICri;
+ AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(bicOpc), ARM::R6)
+ .addReg(ARM::R6, RegState::Kill)
+ .addImm(MaxAlign-1)));
+ }
+
+ }
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case ARM::MOVsrl_flag:
+ case ARM::MOVsra_flag: {
+ // These are just fancy MOVs instructions.
+ AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+ MI.getOperand(0).getReg())
+ .addOperand(MI.getOperand(1))
+ .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ?
+ ARM_AM::lsr : ARM_AM::asr),
+ 1)))
+ .addReg(ARM::CPSR, RegState::Define);
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::RRX: {
+ // This encodes as "MOVs Rd, Rm, rrx
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),TII->get(ARM::MOVsi),
+ MI.getOperand(0).getReg())
+ .addOperand(MI.getOperand(1))
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
+ .addReg(0);
+ TransferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::tTPsoft:
+ case ARM::TPsoft: {
+ MachineInstrBuilder MIB;
+ if (Opcode == ARM::tTPsoft)
+ MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get( ARM::tBL))
+ .addImm((unsigned)ARMCC::AL).addReg(0)
+ .addExternalSymbol("__aeabi_read_tp", 0);
+ else
+ MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get( ARM::BL))
+ .addExternalSymbol("__aeabi_read_tp", 0);
+
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ TransferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::tLDRpci_pic:
+ case ARM::t2LDRpci_pic: {
+ unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
+ ? ARM::tLDRpci : ARM::t2LDRpci;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ MachineInstrBuilder MIB1 =
+ AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(NewLdOpc), DstReg)
+ .addOperand(MI.getOperand(1)));
+ MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(ARM::tPICADD))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addOperand(MI.getOperand(2));
+ TransferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case ARM::LDRLIT_ga_abs:
+ case ARM::LDRLIT_ga_pcrel:
+ case ARM::LDRLIT_ga_pcrel_ldr:
+ case ARM::tLDRLIT_ga_abs:
+ case ARM::tLDRLIT_ga_pcrel: {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ const MachineOperand &MO1 = MI.getOperand(1);
+ const GlobalValue *GV = MO1.getGlobal();
+ bool IsARM =
+ Opcode != ARM::tLDRLIT_ga_pcrel && Opcode != ARM::tLDRLIT_ga_abs;
+ bool IsPIC =
+ Opcode != ARM::LDRLIT_ga_abs && Opcode != ARM::tLDRLIT_ga_abs;
+ unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci;
+ unsigned PICAddOpc =
+ IsARM
+ ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
+ : ARM::tPICADD;
+
+ // We need a new const-pool entry to load from.
+ MachineConstantPool *MCP = MBB.getParent()->getConstantPool();
+ unsigned ARMPCLabelIndex = 0;
+ MachineConstantPoolValue *CPV;
+
+ if (IsPIC) {
+ unsigned PCAdj = IsARM ? 8 : 4;
+ ARMPCLabelIndex = AFI->createPICLabelUId();
+ CPV = ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj);
+ } else
+ CPV = ARMConstantPoolConstant::Create(GV, ARMCP::no_modifier);
+
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg)
+ .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
+ if (IsARM)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
+
+ if (IsPIC) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(PICAddOpc))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(ARMPCLabelIndex);
+
+ if (IsARM)
+ AddDefaultPred(MIB);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::MOV_ga_pcrel:
+ case ARM::MOV_ga_pcrel_ldr:
+ case ARM::t2MOV_ga_pcrel: {
+ // Expand into movw + movt. Also "add pc" / ldr [pc] in PIC mode.
+ unsigned LabelId = AFI->createPICLabelUId();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ const MachineOperand &MO1 = MI.getOperand(1);
+ const GlobalValue *GV = MO1.getGlobal();
+ unsigned TF = MO1.getTargetFlags();
+ bool isARM = Opcode != ARM::t2MOV_ga_pcrel;
+ unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel;
+ unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel :ARM::t2MOVTi16_ga_pcrel;
+ unsigned LO16TF = TF | ARMII::MO_LO16;
+ unsigned HI16TF = TF | ARMII::MO_HI16;
+ unsigned PICAddOpc = isARM
+ ? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
+ : ARM::tPICADD;
+ MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(LO16Opc), DstReg)
+ .addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF)
+ .addImm(LabelId);
+
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc), DstReg)
+ .addReg(DstReg)
+ .addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF)
+ .addImm(LabelId);
+
+ MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(PICAddOpc))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg).addImm(LabelId);
+ if (isARM) {
+ AddDefaultPred(MIB3);
+ if (Opcode == ARM::MOV_ga_pcrel_ldr)
+ MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ }
+ TransferImpOps(MI, MIB1, MIB3);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case ARM::MOVi32imm:
+ case ARM::MOVCCi32imm:
+ case ARM::t2MOVi32imm:
+ case ARM::t2MOVCCi32imm:
+ ExpandMOV32BitImm(MBB, MBBI);
+ return true;
+
+ case ARM::SUBS_PC_LR: {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)
+ .addReg(ARM::LR)
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addReg(ARM::CPSR, RegState::Undef);
+ TransferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::VLDMQIA: {
+ unsigned NewOpc = ARM::VLDMDIA;
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+ unsigned OpIdx = 0;
+
+ // Grab the Q register destination.
+ bool DstIsDead = MI.getOperand(OpIdx).isDead();
+ unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+
+ // Copy the source register.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Copy the predicate operands.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Add the destination operands (D subregs).
+ unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
+ unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
+ MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+
+ // Add an implicit def for the super-register.
+ MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+ TransferImpOps(MI, MIB, MIB);
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case ARM::VSTMQIA: {
+ unsigned NewOpc = ARM::VSTMDIA;
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+ unsigned OpIdx = 0;
+
+ // Grab the Q register source.
+ bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+ unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+
+ // Copy the destination register.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Copy the predicate operands.
+ MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.addOperand(MI.getOperand(OpIdx++));
+
+ // Add the source operands (D subregs).
+ unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
+ unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
+ MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0)
+ .addReg(D1, SrcIsKill ? RegState::Kill : 0);
+
+ if (SrcIsKill) // Add an implicit kill for the Q register.
+ MIB->addRegisterKilled(SrcReg, TRI, true);
+
+ TransferImpOps(MI, MIB, MIB);
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case ARM::VLD2q8Pseudo:
+ case ARM::VLD2q16Pseudo:
+ case ARM::VLD2q32Pseudo:
+ case ARM::VLD2q8PseudoWB_fixed:
+ case ARM::VLD2q16PseudoWB_fixed:
+ case ARM::VLD2q32PseudoWB_fixed:
+ case ARM::VLD2q8PseudoWB_register:
+ case ARM::VLD2q16PseudoWB_register:
+ case ARM::VLD2q32PseudoWB_register:
+ case ARM::VLD3d8Pseudo:
+ case ARM::VLD3d16Pseudo:
+ case ARM::VLD3d32Pseudo:
+ case ARM::VLD1d64TPseudo:
+ case ARM::VLD1d64TPseudoWB_fixed:
+ case ARM::VLD3d8Pseudo_UPD:
+ case ARM::VLD3d16Pseudo_UPD:
+ case ARM::VLD3d32Pseudo_UPD:
+ case ARM::VLD3q8Pseudo_UPD:
+ case ARM::VLD3q16Pseudo_UPD:
+ case ARM::VLD3q32Pseudo_UPD:
+ case ARM::VLD3q8oddPseudo:
+ case ARM::VLD3q16oddPseudo:
+ case ARM::VLD3q32oddPseudo:
+ case ARM::VLD3q8oddPseudo_UPD:
+ case ARM::VLD3q16oddPseudo_UPD:
+ case ARM::VLD3q32oddPseudo_UPD:
+ case ARM::VLD4d8Pseudo:
+ case ARM::VLD4d16Pseudo:
+ case ARM::VLD4d32Pseudo:
+ case ARM::VLD1d64QPseudo:
+ case ARM::VLD1d64QPseudoWB_fixed:
+ case ARM::VLD4d8Pseudo_UPD:
+ case ARM::VLD4d16Pseudo_UPD:
+ case ARM::VLD4d32Pseudo_UPD:
+ case ARM::VLD4q8Pseudo_UPD:
+ case ARM::VLD4q16Pseudo_UPD:
+ case ARM::VLD4q32Pseudo_UPD:
+ case ARM::VLD4q8oddPseudo:
+ case ARM::VLD4q16oddPseudo:
+ case ARM::VLD4q32oddPseudo:
+ case ARM::VLD4q8oddPseudo_UPD:
+ case ARM::VLD4q16oddPseudo_UPD:
+ case ARM::VLD4q32oddPseudo_UPD:
+ case ARM::VLD3DUPd8Pseudo:
+ case ARM::VLD3DUPd16Pseudo:
+ case ARM::VLD3DUPd32Pseudo:
+ case ARM::VLD3DUPd8Pseudo_UPD:
+ case ARM::VLD3DUPd16Pseudo_UPD:
+ case ARM::VLD3DUPd32Pseudo_UPD:
+ case ARM::VLD4DUPd8Pseudo:
+ case ARM::VLD4DUPd16Pseudo:
+ case ARM::VLD4DUPd32Pseudo:
+ case ARM::VLD4DUPd8Pseudo_UPD:
+ case ARM::VLD4DUPd16Pseudo_UPD:
+ case ARM::VLD4DUPd32Pseudo_UPD:
+ ExpandVLD(MBBI);
+ return true;
+
+ case ARM::VST2q8Pseudo:
+ case ARM::VST2q16Pseudo:
+ case ARM::VST2q32Pseudo:
+ case ARM::VST2q8PseudoWB_fixed:
+ case ARM::VST2q16PseudoWB_fixed:
+ case ARM::VST2q32PseudoWB_fixed:
+ case ARM::VST2q8PseudoWB_register:
+ case ARM::VST2q16PseudoWB_register:
+ case ARM::VST2q32PseudoWB_register:
+ case ARM::VST3d8Pseudo:
+ case ARM::VST3d16Pseudo:
+ case ARM::VST3d32Pseudo:
+ case ARM::VST1d64TPseudo:
+ case ARM::VST3d8Pseudo_UPD:
+ case ARM::VST3d16Pseudo_UPD:
+ case ARM::VST3d32Pseudo_UPD:
+ case ARM::VST1d64TPseudoWB_fixed:
+ case ARM::VST1d64TPseudoWB_register:
+ case ARM::VST3q8Pseudo_UPD:
+ case ARM::VST3q16Pseudo_UPD:
+ case ARM::VST3q32Pseudo_UPD:
+ case ARM::VST3q8oddPseudo:
+ case ARM::VST3q16oddPseudo:
+ case ARM::VST3q32oddPseudo:
+ case ARM::VST3q8oddPseudo_UPD:
+ case ARM::VST3q16oddPseudo_UPD:
+ case ARM::VST3q32oddPseudo_UPD:
+ case ARM::VST4d8Pseudo:
+ case ARM::VST4d16Pseudo:
+ case ARM::VST4d32Pseudo:
+ case ARM::VST1d64QPseudo:
+ case ARM::VST4d8Pseudo_UPD:
+ case ARM::VST4d16Pseudo_UPD:
+ case ARM::VST4d32Pseudo_UPD:
+ case ARM::VST1d64QPseudoWB_fixed:
+ case ARM::VST1d64QPseudoWB_register:
+ case ARM::VST4q8Pseudo_UPD:
+ case ARM::VST4q16Pseudo_UPD:
+ case ARM::VST4q32Pseudo_UPD:
+ case ARM::VST4q8oddPseudo:
+ case ARM::VST4q16oddPseudo:
+ case ARM::VST4q32oddPseudo:
+ case ARM::VST4q8oddPseudo_UPD:
+ case ARM::VST4q16oddPseudo_UPD:
+ case ARM::VST4q32oddPseudo_UPD:
+ ExpandVST(MBBI);
+ return true;
+
+ case ARM::VLD1LNq8Pseudo:
+ case ARM::VLD1LNq16Pseudo:
+ case ARM::VLD1LNq32Pseudo:
+ case ARM::VLD1LNq8Pseudo_UPD:
+ case ARM::VLD1LNq16Pseudo_UPD:
+ case ARM::VLD1LNq32Pseudo_UPD:
+ case ARM::VLD2LNd8Pseudo:
+ case ARM::VLD2LNd16Pseudo:
+ case ARM::VLD2LNd32Pseudo:
+ case ARM::VLD2LNq16Pseudo:
+ case ARM::VLD2LNq32Pseudo:
+ case ARM::VLD2LNd8Pseudo_UPD:
+ case ARM::VLD2LNd16Pseudo_UPD:
+ case ARM::VLD2LNd32Pseudo_UPD:
+ case ARM::VLD2LNq16Pseudo_UPD:
+ case ARM::VLD2LNq32Pseudo_UPD:
+ case ARM::VLD3LNd8Pseudo:
+ case ARM::VLD3LNd16Pseudo:
+ case ARM::VLD3LNd32Pseudo:
+ case ARM::VLD3LNq16Pseudo:
+ case ARM::VLD3LNq32Pseudo:
+ case ARM::VLD3LNd8Pseudo_UPD:
+ case ARM::VLD3LNd16Pseudo_UPD:
+ case ARM::VLD3LNd32Pseudo_UPD:
+ case ARM::VLD3LNq16Pseudo_UPD:
+ case ARM::VLD3LNq32Pseudo_UPD:
+ case ARM::VLD4LNd8Pseudo:
+ case ARM::VLD4LNd16Pseudo:
+ case ARM::VLD4LNd32Pseudo:
+ case ARM::VLD4LNq16Pseudo:
+ case ARM::VLD4LNq32Pseudo:
+ case ARM::VLD4LNd8Pseudo_UPD:
+ case ARM::VLD4LNd16Pseudo_UPD:
+ case ARM::VLD4LNd32Pseudo_UPD:
+ case ARM::VLD4LNq16Pseudo_UPD:
+ case ARM::VLD4LNq32Pseudo_UPD:
+ case ARM::VST1LNq8Pseudo:
+ case ARM::VST1LNq16Pseudo:
+ case ARM::VST1LNq32Pseudo:
+ case ARM::VST1LNq8Pseudo_UPD:
+ case ARM::VST1LNq16Pseudo_UPD:
+ case ARM::VST1LNq32Pseudo_UPD:
+ case ARM::VST2LNd8Pseudo:
+ case ARM::VST2LNd16Pseudo:
+ case ARM::VST2LNd32Pseudo:
+ case ARM::VST2LNq16Pseudo:
+ case ARM::VST2LNq32Pseudo:
+ case ARM::VST2LNd8Pseudo_UPD:
+ case ARM::VST2LNd16Pseudo_UPD:
+ case ARM::VST2LNd32Pseudo_UPD:
+ case ARM::VST2LNq16Pseudo_UPD:
+ case ARM::VST2LNq32Pseudo_UPD:
+ case ARM::VST3LNd8Pseudo:
+ case ARM::VST3LNd16Pseudo:
+ case ARM::VST3LNd32Pseudo:
+ case ARM::VST3LNq16Pseudo:
+ case ARM::VST3LNq32Pseudo:
+ case ARM::VST3LNd8Pseudo_UPD:
+ case ARM::VST3LNd16Pseudo_UPD:
+ case ARM::VST3LNd32Pseudo_UPD:
+ case ARM::VST3LNq16Pseudo_UPD:
+ case ARM::VST3LNq32Pseudo_UPD:
+ case ARM::VST4LNd8Pseudo:
+ case ARM::VST4LNd16Pseudo:
+ case ARM::VST4LNd32Pseudo:
+ case ARM::VST4LNq16Pseudo:
+ case ARM::VST4LNq32Pseudo:
+ case ARM::VST4LNd8Pseudo_UPD:
+ case ARM::VST4LNd16Pseudo_UPD:
+ case ARM::VST4LNd32Pseudo_UPD:
+ case ARM::VST4LNq16Pseudo_UPD:
+ case ARM::VST4LNq32Pseudo_UPD:
+ ExpandLaneOp(MBBI);
+ return true;
+
+ case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true;
+ case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
+ case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
+ case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
+
+ case ARM::CMP_SWAP_8:
+ if (STI->isThumb())
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB,
+ ARM::tUXTB, NextMBBI);
+ else
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB,
+ ARM::UXTB, NextMBBI);
+ case ARM::CMP_SWAP_16:
+ if (STI->isThumb())
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH,
+ ARM::tUXTH, NextMBBI);
+ else
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH,
+ ARM::UXTH, NextMBBI);
+ case ARM::CMP_SWAP_32:
+ if (STI->isThumb())
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,
+ NextMBBI);
+ else
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI);
+
+ case ARM::CMP_SWAP_64:
+ return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
+ }
+}
+
+/// Runs ExpandMI over every instruction of \p MBB and reports whether any
+/// call returned true.
+bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  MachineBasicBlock::iterator Cur = MBB.begin(), End = MBB.end();
+  for (; Cur != End;) {
+    // The successor is computed before ExpandMI runs and is handed to it as
+    // well; iteration then continues from whatever value it holds afterwards.
+    MachineBasicBlock::iterator Next = std::next(Cur);
+    if (ExpandMI(MBB, Cur, Next))
+      Changed = true;
+    Cur = Next;
+  }
+
+  return Changed;
+}
+
+/// Pass entry point: caches the subtarget, instruction info, register info
+/// and ARM function info for \p MF, expands every basic block, and runs the
+/// machine verifier afterwards when VerifyARMPseudo is set. Returns true if
+/// any block reported a change.
+bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  AFI = MF.getInfo<ARMFunctionInfo>();
+
+  bool Changed = false;
+  for (MachineBasicBlock &Block : MF)
+    Changed |= ExpandMBB(Block);
+
+  if (VerifyARMPseudo)
+    MF.verify(this, "After expanding ARM pseudo instructions.");
+  return Changed;
+}
+
+/// Factory for the ARM pseudo-instruction expansion pass: hands back a newly
+/// allocated ARMExpandPseudo through the generic FunctionPass interface.
+FunctionPass *llvm::createARMExpandPseudoPass() {
+  FunctionPass *ExpandPass = new ARMExpandPseudo();
+  return ExpandPass;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
new file mode 100644
index 000000000000..df4dcb375750
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -0,0 +1,3045 @@
+//===-- ARMFastISel.cpp - ARM FastISel implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// ARMGenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMCallingConv.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+  /// Describes a memory operand as a base plus a signed displacement. The
+  /// base is either a register or a frame index; BaseType records which
+  /// member of the Base union is in use.
+  struct Address {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    union {
+      unsigned Reg; // presumably meaningful when BaseType == RegBase
+      int FI;       // presumably meaningful when BaseType == FrameIndexBase
+    } Base;
+
+    int Offset;
+
+    // Innocuous defaults for our address: register base 0, offset 0.
+    Address()
+      : BaseType(RegBase), Offset(0) {
+      Base.Reg = 0;
+    }
+  };
+
+/// ARM implementation of FastISel. Declares the overridden FastISel hooks,
+/// the per-instruction Select* routines, and ARM-specific helpers for
+/// materializing constants, emitting loads/stores and lowering calls. The
+/// constructor caches the subtarget, module, target machine, instruction
+/// info, target lowering and per-function ARM info.
+class ARMFastISel final : public FastISel {
+
+ /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const ARMSubtarget *Subtarget;
+ Module &M;
+ const TargetMachine &TM;
+ const TargetInstrInfo &TII;
+ const TargetLowering &TLI;
+ ARMFunctionInfo *AFI;
+
+ // Convenience variables to avoid some queries.
+ // NOTE(review): despite its name, isThumb2 is initialised from
+ // AFI->isThumbFunction() in the constructor below.
+ bool isThumb2;
+ LLVMContext *Context;
+
+ public:
+ explicit ARMFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo)
+ : FastISel(funcInfo, libInfo),
+ Subtarget(
+ &static_cast<const ARMSubtarget &>(funcInfo.MF->getSubtarget())),
+ M(const_cast<Module &>(*funcInfo.Fn->getParent())),
+ TM(funcInfo.MF->getTarget()), TII(*Subtarget->getInstrInfo()),
+ TLI(*Subtarget->getTargetLowering()) {
+ AFI = funcInfo.MF->getInfo<ARMFunctionInfo>();
+ isThumb2 = AFI->isThumbFunction();
+ Context = &funcInfo.Fn->getContext();
+ }
+
+ // Code from FastISel.cpp.
+ // Each helper emits one machine instruction with the operand shape named by
+ // its suffix (r = register, i = immediate) and returns the result register.
+ private:
+ unsigned fastEmitInst_r(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill);
+ unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+ unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ uint64_t Imm);
+ unsigned fastEmitInst_i(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ uint64_t Imm);
+
+ // Backend specific FastISel code.
+ private:
+ bool fastSelectInstruction(const Instruction *I) override;
+ unsigned fastMaterializeConstant(const Constant *C) override;
+ unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+ bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) override;
+ bool fastLowerArguments() override;
+ private:
+ // Tablegen-generated target-specific selection code (see the file header).
+ #include "ARMGenFastISel.inc"
+
+ // Instruction selection routines.
+ private:
+ bool SelectLoad(const Instruction *I);
+ bool SelectStore(const Instruction *I);
+ bool SelectBranch(const Instruction *I);
+ bool SelectIndirectBr(const Instruction *I);
+ bool SelectCmp(const Instruction *I);
+ bool SelectFPExt(const Instruction *I);
+ bool SelectFPTrunc(const Instruction *I);
+ bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+ bool SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode);
+ bool SelectIToFP(const Instruction *I, bool isSigned);
+ bool SelectFPToI(const Instruction *I, bool isSigned);
+ bool SelectDiv(const Instruction *I, bool isSigned);
+ bool SelectRem(const Instruction *I, bool isSigned);
+ bool SelectCall(const Instruction *I, const char *IntrMemName);
+ bool SelectIntrinsicCall(const IntrinsicInst &I);
+ bool SelectSelect(const Instruction *I);
+ bool SelectRet(const Instruction *I);
+ bool SelectTrunc(const Instruction *I);
+ bool SelectIntExt(const Instruction *I);
+ bool SelectShift(const Instruction *I, ARM_AM::ShiftOpc ShiftTy);
+
+ // Utility routines.
+ private:
+ bool isPositionIndependent() const;
+ bool isTypeLegal(Type *Ty, MVT &VT);
+ bool isLoadTypeLegal(Type *Ty, MVT &VT);
+ bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
+ bool isZExt);
+ bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ unsigned Alignment = 0, bool isZExt = true,
+ bool allocReg = true);
+ bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+ unsigned Alignment = 0);
+ bool ARMComputeAddress(const Value *Obj, Address &Addr);
+ void ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3);
+ bool ARMIsMemCpySmall(uint64_t Len);
+ bool ARMTryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+ unsigned Alignment);
+ unsigned ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+ unsigned ARMMaterializeFP(const ConstantFP *CFP, MVT VT);
+ unsigned ARMMaterializeInt(const Constant *C, MVT VT);
+ unsigned ARMMaterializeGV(const GlobalValue *GV, MVT VT);
+ unsigned ARMMoveToFPReg(MVT VT, unsigned SrcReg);
+ unsigned ARMMoveToIntReg(MVT VT, unsigned SrcReg);
+ unsigned ARMSelectCallOp(bool UseReg);
+ unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT);
+
+ const TargetLowering *getTargetLowering() { return &TLI; }
+
+ // Call handling routines.
+ private:
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC,
+ bool Return,
+ bool isVarArg);
+ bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+ SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+ SmallVectorImpl<unsigned> &RegArgs,
+ CallingConv::ID CC,
+ unsigned &NumBytes,
+ bool isVarArg);
+ unsigned getLibcallReg(const Twine &Name);
+ bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+ const Instruction *I, CallingConv::ID CC,
+ unsigned &NumBytes, bool isVarArg);
+ bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
+
+ // OptionalDef handling routines.
+ private:
+ bool isARMNEONPred(const MachineInstr *MI);
+ bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR);
+ const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
+ void AddLoadStoreOperands(MVT VT, Address &Addr,
+ const MachineInstrBuilder &MIB,
+ MachineMemOperand::Flags Flags, bool useAM3);
+};
+
+} // end anonymous namespace
+
+#include "ARMGenCallingConv.inc"
+
+// DefinesOptionalPredicate - Reports whether MI has an optional def, i.e. a
+// place where a default CC register argument has to be appended. Implicit
+// defs are of no interest here. *CPSR is set to true when any register def
+// operand of MI is CPSR; it is never written otherwise.
+bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
+  if (!MI->hasOptionalDef())
+    return false;
+
+  // Scan every operand for a register def of CPSR.
+  const unsigned NumOps = MI->getNumOperands();
+  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
+    const MachineOperand &Op = MI->getOperand(Idx);
+    if (Op.isReg() && Op.isDef() && Op.getReg() == ARM::CPSR)
+      *CPSR = true;
+  }
+  return true;
+}
+
+// Decides whether default predicate operands should be appended to MI.
+// Instructions outside the NEON domain, and every instruction in a Thumb2
+// function, simply defer to MachineInstr::isPredicable(). For the remaining
+// ones the answer is whether the instruction description lists any
+// predicate operand.
+bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
+  const MCInstrDesc &Desc = MI->getDesc();
+
+  const bool InNEONDomain =
+      (Desc.TSFlags & ARMII::DomainMask) == ARMII::DomainNEON;
+  if (!InNEONDomain || AFI->isThumb2Function())
+    return MI->isPredicable();
+
+  const unsigned NumOps = Desc.getNumOperands();
+  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
+    if (Desc.OpInfo[Idx].isPredicate())
+      return true;
+  }
+  return false;
+}
+
+// Appends the default operands an instruction under construction still
+// needs: default predicate operands when isARMNEONPred() says so, and a
+// default CC operand when the instruction has an optional def.
+// TODO: If we want to support thumb1 then we'll need to deal with optional
+// CPSR defs that need to be added before the remaining operands. See s_cc_out
+// for descriptions why.
+const MachineInstrBuilder &
+ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
+  MachineInstr *Inst = &*MIB;
+
+  // Either the instruction is predicable, or it is a NEON instruction in ARM
+  // mode that carries a predicate operand without being predicable; both
+  // cases get the default predicate.
+  if (isARMNEONPred(Inst))
+    AddDefaultPred(MIB);
+
+  // Nothing more to add unless there is an optional def.
+  bool SetsCPSR = false;
+  if (!DefinesOptionalPredicate(Inst, &SetsCPSR))
+    return MIB;
+
+  // An optional def of CPSR takes the T1 form; all other optional defs in
+  // ARM are the CCR register.
+  if (SetsCPSR)
+    AddDefaultT1CC(MIB);
+  else
+    AddDefaultCC(MIB);
+  return MIB;
+}
+
+// Emits MachineInstOpcode with a single register input and returns the
+// virtual register (of class RC) that holds the result. When the instruction
+// has no explicit def, the result is copied out of its first implicit def.
+unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Op0, bool Op0IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &Desc = TII.get(MachineInstOpcode);
+
+  // The input must satisfy the register class constraint of operand 1.
+  Op0 = constrainOperandRegClass(Desc, Op0, 1);
+  const unsigned Op0Flags = Op0IsKill ? RegState::Kill : 0;
+
+  if (Desc.getNumDefs() < 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc)
+                        .addReg(Op0, Op0Flags));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(TargetOpcode::COPY), ResultReg)
+                        .addReg(Desc.ImplicitDefs[0]));
+    return ResultReg;
+  }
+
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc,
+                          ResultReg)
+                      .addReg(Op0, Op0Flags));
+  return ResultReg;
+}
+
+// Emits MachineInstOpcode with two register inputs and returns the virtual
+// register (of class RC) that holds the result. When the instruction has no
+// explicit def, the result is copied out of its first implicit def.
+unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &Desc = TII.get(MachineInstOpcode);
+
+  // The inputs must satisfy the register class constraints of operands 1
+  // and 2 respectively.
+  Op0 = constrainOperandRegClass(Desc, Op0, 1);
+  Op1 = constrainOperandRegClass(Desc, Op1, 2);
+  const unsigned Op0Flags = Op0IsKill ? RegState::Kill : 0;
+  const unsigned Op1Flags = Op1IsKill ? RegState::Kill : 0;
+
+  if (Desc.getNumDefs() < 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc)
+                        .addReg(Op0, Op0Flags)
+                        .addReg(Op1, Op1Flags));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(TargetOpcode::COPY), ResultReg)
+                        .addReg(Desc.ImplicitDefs[0]));
+    return ResultReg;
+  }
+
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc,
+                          ResultReg)
+                      .addReg(Op0, Op0Flags)
+                      .addReg(Op1, Op1Flags));
+  return ResultReg;
+}
+
+// Emits MachineInstOpcode with one register input followed by an immediate
+// and returns the virtual register (of class RC) that holds the result. When
+// the instruction has no explicit def, the result is copied out of its first
+// implicit def.
+unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &Desc = TII.get(MachineInstOpcode);
+
+  // The input must satisfy the register class constraint of operand 1.
+  Op0 = constrainOperandRegClass(Desc, Op0, 1);
+  const unsigned Op0Flags = Op0IsKill ? RegState::Kill : 0;
+
+  if (Desc.getNumDefs() < 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc)
+                        .addReg(Op0, Op0Flags)
+                        .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(TargetOpcode::COPY), ResultReg)
+                        .addReg(Desc.ImplicitDefs[0]));
+    return ResultReg;
+  }
+
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc,
+                          ResultReg)
+                      .addReg(Op0, Op0Flags)
+                      .addImm(Imm));
+  return ResultReg;
+}
+
+// Emits MachineInstOpcode with a single immediate operand and returns the
+// virtual register (of class RC) that holds the result. When the instruction
+// has no explicit def, the result is copied out of its first implicit def.
+unsigned ARMFastISel::fastEmitInst_i(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &Desc = TII.get(MachineInstOpcode);
+
+  if (Desc.getNumDefs() < 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc)
+                        .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(TargetOpcode::COPY), ResultReg)
+                        .addReg(Desc.ImplicitDefs[0]));
+    return ResultReg;
+  }
+
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc,
+                          ResultReg)
+                      .addImm(Imm));
+  return ResultReg;
+}
+
+// Moves SrcReg into a newly created register of VT's register class using
+// VMOVSR and returns that register; returns 0 for f64.
+// TODO: Don't worry about 64-bit now, but when this is fixed remove the
+// checks from the various callers.
+unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) {
+  if (VT == MVT::f64)
+    return 0;
+
+  const TargetRegisterClass *DestClass = TLI.getRegClassFor(VT);
+  unsigned FPReg = createResultReg(DestClass);
+  MachineInstrBuilder Move = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                     TII.get(ARM::VMOVSR), FPReg);
+  Move.addReg(SrcReg);
+  AddOptionalDefs(Move);
+  return FPReg;
+}
+
+// Moves SrcReg into a newly created register of VT's register class using
+// VMOVRS and returns that register; returns 0 for i64.
+unsigned ARMFastISel::ARMMoveToIntReg(MVT VT, unsigned SrcReg) {
+  if (VT == MVT::i64)
+    return 0;
+
+  const TargetRegisterClass *DestClass = TLI.getRegClassFor(VT);
+  unsigned IntReg = createResultReg(DestClass);
+  MachineInstrBuilder Move = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                     TII.get(ARM::VMOVRS), IntReg);
+  Move.addReg(SrcReg);
+  AddOptionalDefs(Move);
+  return IntReg;
+}
+
+// Materializes the floating-point constant CFP of type VT into a new virtual
+// register and returns it, or returns 0 when the constant cannot be handled.
+//
+// Constants accepted by TLI.isFPImmLegal() are emitted as an immediate with
+// FCONSTS/FCONSTD. Anything else is loaded from the constant pool with
+// VLDRS/VLDRD, which is only attempted when the subtarget has VFP2.
+unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
+  const APFloat Val = CFP->getValueAPF();
+  bool is64bit = VT == MVT::f64;
+
+  // This checks to see if we can use VFP3 instructions to materialize
+  // a constant, otherwise we have to go through the constant pool.
+  if (TLI.isFPImmLegal(Val, VT)) {
+    int Imm;
+    unsigned Opc;
+    if (is64bit) {
+      Imm = ARM_AM::getFP64Imm(Val);
+      Opc = ARM::FCONSTD;
+    } else {
+      Imm = ARM_AM::getFP32Imm(Val);
+      Opc = ARM::FCONSTS;
+    }
+    unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), DestReg).addImm(Imm));
+    return DestReg;
+  }
+
+  // Require VFP2 for loading fp constants. This function returns a register
+  // number, so failure is reported as 0 (not the boolean 'false'), matching
+  // the other materialization helpers.
+  if (!Subtarget->hasVFP2()) return 0;
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = DL.getTypeAllocSize(CFP->getType());
+  }
+  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+  unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS;
+
+  // The extra reg is for addrmode5.
+  AddOptionalDefs(
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+          .addConstantPoolIndex(Idx)
+          .addReg(0));
+  return DestReg;
+}
+
+// Materializes the integer constant C of type VT (i1/i8/i16/i32 only) into a
+// new virtual register and returns it, or returns 0 on failure. Strategies
+// are tried in this order:
+//   1. a 16-bit move immediate (needs V6T2 and a value that fits 16 bits),
+//   2. MVN of the bitwise-inverted value for negative i32 constants,
+//   3. the tablegen'd fastEmit_i path when movt may be used,
+//   4. a constant-pool load (i32 only).
+unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
+  const bool IsSupportedVT =
+      VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i1;
+  if (!IsSupportedVT)
+    return 0;
+
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  const bool HasV6T2 = Subtarget->hasV6T2Ops();
+  const TargetRegisterClass *GPRClass = isThumb2 ? &ARM::rGPRRegClass :
+                                                   &ARM::GPRRegClass;
+
+  // If we can do this in a single instruction without a constant pool entry
+  // do so now.
+  if (HasV6T2 && isUInt<16>(CI->getZExtValue())) {
+    unsigned MovOpc = isThumb2 ? ARM::t2MOVi16 : ARM::MOVi16;
+    unsigned MovReg = createResultReg(GPRClass);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(MovOpc), MovReg)
+                        .addImm(CI->getZExtValue()));
+    return MovReg;
+  }
+
+  // Use MVN to emit negative constants, provided the inverted value can be
+  // encoded as a shifter-operand immediate.
+  if (VT == MVT::i32 && HasV6T2 && CI->isNegative()) {
+    unsigned Inverted = static_cast<unsigned>(~(CI->getSExtValue()));
+    int Encoded = isThumb2 ? ARM_AM::getT2SOImmVal(Inverted)
+                           : ARM_AM::getSOImmVal(Inverted);
+    if (Encoded != -1) {
+      unsigned MvnOpc = isThumb2 ? ARM::t2MVNi : ARM::MVNi;
+      unsigned MvnReg = createResultReg(GPRClass);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(MvnOpc), MvnReg)
+                          .addImm(Inverted));
+      return MvnReg;
+    }
+  }
+
+  // Let the generated selector try when movt is available.
+  if (Subtarget->useMovt(*FuncInfo.MF)) {
+    if (unsigned MovtReg =
+            fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue()))
+      return MovtReg;
+  }
+
+  // Load from constant pool. For now 32-bit only.
+  if (VT != MVT::i32)
+    return 0;
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = DL.getPrefTypeAlignment(C->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = DL.getTypeAllocSize(C->getType());
+  }
+  unsigned Idx = MCP.getConstantPoolIndex(C, Align);
+  unsigned LoadReg = createResultReg(TLI.getRegClassFor(VT));
+
+  if (isThumb2) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::t2LDRpci), LoadReg)
+                        .addConstantPoolIndex(Idx));
+    return LoadReg;
+  }
+
+  // The extra immediate is for addrmode2.
+  LoadReg = constrainOperandRegClass(TII.get(ARM::LDRcp), LoadReg, 0);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(ARM::LDRcp), LoadReg)
+                      .addConstantPoolIndex(Idx)
+                      .addImm(0));
+  return LoadReg;
+}
+
+// Forwards the position-independence query to the target lowering object.
+bool ARMFastISel::isPositionIndependent() const {
+  const bool IsPIC = TLI.isPositionIndependent();
+  return IsPIC;
+}
+
+// Materializes the address of the global GV into a virtual register and
+// returns it, or returns 0 when the case is not handled (non-i32 VT,
+// thread-local globals, ROPI/RWPI). Two strategies are used: a movw/movt
+// style pseudo when the subtarget allows movt (and the target is MachO or
+// the code is not position independent), otherwise a constant-pool load,
+// followed by a PIC fix-up where needed. ELF PIC is delegated to
+// ARMLowerPICELF. Indirect symbols get one additional load at the end.
+unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
+ // For now 32-bit only.
+ if (VT != MVT::i32 || GV->isThreadLocal()) return 0;
+
+ // ROPI/RWPI not currently supported.
+ if (Subtarget->isROPI() || Subtarget->isRWPI())
+ return 0;
+
+ bool IsIndirect = Subtarget->isGVIndirectSymbol(GV);
+ const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
+ : &ARM::GPRRegClass;
+ // Note: the result register is created before the remaining bail-outs and
+ // before the ARMLowerPICELF hand-off below.
+ unsigned DestReg = createResultReg(RC);
+
+ // FastISel TLS support on non-MachO is broken, punt to SelectionDAG.
+ // NOTE(review): GV->isThreadLocal() already caused an early return above,
+ // so this check looks redundant - confirm before removing.
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ bool IsThreadLocal = GVar && GVar->isThreadLocal();
+ if (!Subtarget->isTargetMachO() && IsThreadLocal) return 0;
+
+ bool IsPositionIndependent = isPositionIndependent();
+ // Use movw+movt when possible, it avoids constant pool entries.
+ // Non-darwin targets only support static movt relocations in FastISel.
+ if (Subtarget->useMovt(*FuncInfo.MF) &&
+ (Subtarget->isTargetMachO() || !IsPositionIndependent)) {
+ unsigned Opc;
+ unsigned char TF = 0;
+ if (Subtarget->isTargetMachO())
+ TF = ARMII::MO_NONLAZY;
+
+ // PIC uses the pc-relative pseudo, static code the plain 32-bit immediate
+ // pseudo.
+ if (IsPositionIndependent)
+ Opc = isThumb2 ? ARM::t2MOV_ga_pcrel : ARM::MOV_ga_pcrel;
+ else
+ Opc = isThumb2 ? ARM::t2MOVi32imm : ARM::MOVi32imm;
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF));
+ } else {
+ // MachineConstantPool wants an explicit alignment.
+ unsigned Align = DL.getPrefTypeAlignment(GV->getType());
+ if (Align == 0) {
+ // TODO: Figure out if this is correct.
+ Align = DL.getTypeAllocSize(GV->getType());
+ }
+
+ if (Subtarget->isTargetELF() && IsPositionIndependent)
+ return ARMLowerPICELF(GV, Align, VT);
+
+ // Grab index.
+ // The PC adjustment is 4 in Thumb and 8 in ARM mode; it is only applied
+ // for position-independent code.
+ unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
+ unsigned Id = AFI->createPICLabelUId();
+ ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
+ ARMCP::CPValue,
+ PCAdj);
+ unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+
+ // Load value.
+ MachineInstrBuilder MIB;
+ if (isThumb2) {
+ unsigned Opc = IsPositionIndependent ? ARM::t2LDRpci_pic : ARM::t2LDRpci;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ DestReg).addConstantPoolIndex(Idx);
+ if (IsPositionIndependent)
+ MIB.addImm(Id);
+ AddOptionalDefs(MIB);
+ } else {
+ // The extra immediate is for addrmode2.
+ DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM::LDRcp), DestReg)
+ .addConstantPoolIndex(Idx)
+ .addImm(0);
+ AddOptionalDefs(MIB);
+
+ if (IsPositionIndependent) {
+ // In ARM-mode PIC the loaded value is combined with the PC label: via
+ // PICLDR for indirect symbols, via PICADD otherwise. This path returns
+ // directly and so skips the extra indirect load at the end.
+ unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
+ unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DbgLoc, TII.get(Opc), NewDestReg)
+ .addReg(DestReg)
+ .addImm(Id);
+ AddOptionalDefs(MIB);
+ return NewDestReg;
+ }
+ }
+ }
+
+ // For indirect symbols, load through the address materialized so far
+ // (offset 0) into a new register.
+ if (IsIndirect) {
+ MachineInstrBuilder MIB;
+ unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+ if (isThumb2)
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM::t2LDRi12), NewDestReg)
+ .addReg(DestReg)
+ .addImm(0);
+ else
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM::LDRi12), NewDestReg)
+ .addReg(DestReg)
+ .addImm(0);
+ DestReg = NewDestReg;
+ AddOptionalDefs(MIB);
+ }
+
+ return DestReg;
+}
+
+// Dispatches materialization of the constant C to the matching ARM helper
+// based on its kind (floating point, global value, integer). Returns the
+// helper's result, or 0 when the type is not simple or the kind is not
+// handled.
+unsigned ARMFastISel::fastMaterializeConstant(const Constant *C) {
+  EVT ConstVT = TLI.getValueType(DL, C->getType(), true);
+
+  // Only handle simple types.
+  if (!ConstVT.isSimple())
+    return 0;
+  MVT VT = ConstVT.getSimpleVT();
+
+  if (const ConstantFP *FPConst = dyn_cast<ConstantFP>(C))
+    return ARMMaterializeFP(FPConst, VT);
+  if (const GlobalValue *Global = dyn_cast<GlobalValue>(C))
+    return ARMMaterializeGV(Global, VT);
+  if (isa<ConstantInt>(C))
+    return ARMMaterializeInt(C, VT);
+
+  return 0;
+}
+
+// TODO: unsigned ARMFastISel::TargetMaterializeFloatZero(const ConstantFP *CF);
+
+/// Materialize the address of the static alloca \p AI into a register.
+///
+/// \returns the register holding the address, or 0 when \p AI is not in
+/// FuncInfo.StaticAllocaMap (a dynamic alloca) or its type is not accepted by
+/// isLoadTypeLegal.
+unsigned ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  // Don't handle dynamic allocas. A single lookup both answers "is this a
+  // static alloca?" and yields its frame index.
+  DenseMap<const AllocaInst*, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(AI);
+  if (SI == FuncInfo.StaticAllocaMap.end())
+    return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT))
+    return 0;
+
+  // Emit "ADD ResultReg, <frame index>, #0". This will get lowered later into
+  // the correct offsets and registers via rewriteXFrameIndex.
+  unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+  const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+  unsigned ResultReg = createResultReg(RC);
+  ResultReg = constrainOperandRegClass(TII.get(Opc), ResultReg, 0);
+
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg)
+                      .addFrameIndex(SI->second)
+                      .addImm(0));
+  return ResultReg;
+}
+
+/// Check whether \p Ty maps to a simple value type that the target lowering
+/// reports as legal. On a true return \p VT holds that type; \p VT is also
+/// written when the type is simple but not legal, and left untouched when the
+/// type is MVT::Other or not simple.
+bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  const EVT TyEVT = TLI.getValueType(DL, Ty, true);
+
+  // Only handle simple types.
+  if (TyEVT == MVT::Other || !TyEVT.isSimple())
+    return false;
+
+  VT = TyEVT.getSimpleVT();
+
+  // A legal type is one a register can hold directly.
+  const bool IsLegal = TLI.isTypeLegal(VT);
+  return IsLegal;
+}
+
+/// Like isTypeLegal, but additionally accepts i1, i8 and i16, which can be
+/// sign- or zero-extended to a basic operation.
+bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  // isTypeLegal runs first so that VT is populated before it is inspected.
+  return isTypeLegal(Ty, VT) || VT == MVT::i1 || VT == MVT::i8 ||
+         VT == MVT::i16;
+}
+
+// Computes an address (frame index or base register, plus a constant offset) for Obj into Addr; returns false if none could be formed.
+bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
+ // Classify Obj as an instruction or constant expression so that its opcode can be inspected below (boilerplate from the X86 FastISel).
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1; // placeholder: matches no case below, so the switch takes `default`
+ if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ // Don't walk into other basic blocks unless the object is an alloca from
+ // another block, otherwise it may not have a virtual register assigned.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces (numbered above 255).
+ return false;
+
+ switch (Opcode) {
+ default:
+ break; // not a recognised address computation; use the register fallback below
+ case Instruction::BitCast:
+ // Look through bitcasts.
+ return ARMComputeAddress(U->getOperand(0), Addr);
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs (source integer type equals the pointer type).
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return ARMComputeAddress(U->getOperand(0), Addr);
+ break;
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints (result integer type equals the pointer type).
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return ARMComputeAddress(U->getOperand(0), Addr);
+ break;
+ case Instruction::GetElementPtr: {
+ Address SavedAddr = Addr; // snapshot so a failed attempt can be undone
+ int TmpOffset = Addr.Offset;
+
+ // Iterate through the GEP folding the constants into offsets where
+ // we can. Indices start at operand 1; operand 0 is the base pointer.
+ gep_type_iterator GTI = gep_type_begin(U);
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+ i != e; ++i, ++GTI) {
+ const Value *Op = *i;
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ TmpOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); // element size the index is scaled by
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ TmpOffset += CI->getSExtValue() * S;
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ TmpOffset += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ // Unsupported: a non-constant index. Addr has not been modified yet.
+ goto unsupported_gep;
+ }
+ }
+ }
+
+ // Try to grab the base operand now, with the folded offset applied.
+ Addr.Offset = TmpOffset;
+ if (ARMComputeAddress(U->getOperand(0), Addr)) return true;
+
+ // We failed, restore everything and try the other options.
+ Addr = SavedAddr;
+
+ unsupported_gep:
+ break;
+ }
+ case Instruction::Alloca: {
+ const AllocaInst *AI = cast<AllocaInst>(Obj);
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ Addr.BaseType = Address::FrameIndexBase;
+ Addr.Base.FI = SI->second;
+ return true;
+ }
+ break; // alloca not in StaticAllocaMap: use the register fallback below
+ }
+ }
+
+ // Try to get this in a register if nothing else has worked.
+ if (Addr.Base.Reg == 0) Addr.Base.Reg = getRegForValue(Obj);
+ return Addr.Base.Reg != 0;
+}
+
+void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) { // Folds Addr.Offset into the base register when it does not fit the check for VT below.
+ bool needsLowering = false;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unhandled load/store type!");
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ if (!useAM3) {
+ // Integer loads/stores handle 12-bit offsets (0..4095).
+ needsLowering = ((Addr.Offset & 0xfff) != Addr.Offset);
+ // Handle negative offsets: on Thumb2 with V6T2 ops, offsets in (-256, 0) are left in place.
+ if (needsLowering && isThumb2)
+ needsLowering = !(Subtarget->hasV6T2Ops() && Addr.Offset < 0 &&
+ Addr.Offset > -256);
+ } else {
+ // ARM halfword load/stores and signed byte loads use +/-imm8 offsets.
+ needsLowering = (Addr.Offset > 255 || Addr.Offset < -255);
+ }
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ // Floating point operands handle 8-bit offsets (0..255 passes this check).
+ needsLowering = ((Addr.Offset & 0xff) != Addr.Offset);
+ break;
+ }
+
+ // If this is a stack pointer and the offset needs to be simplified then
+ // put the alloca address into a register, set the base type back to
+ // register and continue. This should almost never happen.
+ if (needsLowering && Addr.BaseType == Address::FrameIndexBase) {
+ const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass
+ : &ARM::GPRRegClass;
+ unsigned ResultReg = createResultReg(RC);
+ unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg)
+ .addFrameIndex(Addr.Base.FI)
+ .addImm(0)); // ResultReg = frame index + 0
+ Addr.Base.Reg = ResultReg;
+ Addr.BaseType = Address::RegBase;
+ }
+
+ // Since the offset is too large for the load/store instruction
+ // get the reg+offset into a register and clear the offset.
+ if (needsLowering) {
+ Addr.Base.Reg = fastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg,
+ /*Op0IsKill*/false, Addr.Offset, MVT::i32);
+ Addr.Offset = 0;
+ }
+}
+
+/// Append the address operands for a load/store of type \p VT to \p MIB.
+///
+/// Operand order is: base (frame index or register), then the offset
+/// operand(s), then, for frame-index bases only, a memory operand built with
+/// \p Flags. Optional defs are added last. \p Addr.Offset is divided by 4 in
+/// place for f32/f64.
+void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
+                                       const MachineInstrBuilder &MIB,
+                                       MachineMemOperand::Flags Flags,
+                                       bool useAM3) {
+  // addrmode5 output depends on the selection dag addressing dividing the
+  // offset by 4 that it then later multiplies. Do this here as well.
+  const bool IsFPType = VT.SimpleTy == MVT::f32 || VT.SimpleTy == MVT::f64;
+  if (IsFPType)
+    Addr.Offset /= 4;
+
+  // Base operand. A frame-index base also gets a memory operand, which is
+  // attached after the offset operands below.
+  MachineMemOperand *MMO = nullptr;
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    const int FI = Addr.Base.FI;
+    const int FIOffset = Addr.Offset;
+    MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, FIOffset), Flags,
+        MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+    MIB.addFrameIndex(FI);
+  } else {
+    MIB.addReg(Addr.Base.Reg);
+  }
+
+  // ARM halfword load/stores and signed byte loads need an additional
+  // operand: a zero register followed by the offset magnitude, with bit 8
+  // (0x100) set when the offset is negative.
+  if (useAM3) {
+    const int Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+    MIB.addReg(0);
+    MIB.addImm(Imm);
+  } else {
+    MIB.addImm(Addr.Offset);
+  }
+
+  if (MMO)
+    MIB.addMemOperand(MMO);
+  AddOptionalDefs(MIB);
+}
+
+/// Emit a load of type \p VT from \p Addr.
+///
+/// \param VT        Type to load; i1/i8/i16/i32/f32/f64 are handled.
+/// \param ResultReg Receives the result register. When \p allocReg is false
+///                  it must already name an allocated virtual register.
+/// \param Addr      Address to load from; may be rewritten to fit the
+///                  addressing mode.
+/// \param Alignment Alignment of the access; 0 skips the alignment checks.
+/// \param isZExt    Selects the zero- vs sign-extending opcode for i1/i8/i16.
+/// \param allocReg  When true a fresh result register is created.
+/// \returns false when the type, subtarget features or alignment rule out a
+/// fast-path load.
+bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                              unsigned Alignment, bool isZExt, bool allocReg) {
+  unsigned Opc;
+  bool useAM3 = false;
+  bool needVMOV = false;
+  const TargetRegisterClass *RC;
+
+  // Thumb2 offsets in (-256, 0) are left in place by ARMSimplifyAddress, so
+  // they must use the negative-imm8 ("i8") opcode form rather than the
+  // unsigned imm12 one.
+  const bool useT2NegImm8 = isThumb2 && Addr.Offset < 0 &&
+                            Addr.Offset > -256 && Subtarget->hasV6T2Ops();
+
+  switch (VT.SimpleTy) {
+    // This is mostly going to be Neon/vector support.
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+      if (isThumb2) {
+        if (useT2NegImm8)
+          Opc = isZExt ? ARM::t2LDRBi8 : ARM::t2LDRSBi8;
+        else
+          Opc = isZExt ? ARM::t2LDRBi12 : ARM::t2LDRSBi12;
+      } else {
+        if (isZExt) {
+          Opc = ARM::LDRBi12;
+        } else {
+          Opc = ARM::LDRSB;
+          useAM3 = true;
+        }
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::i16:
+      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (useT2NegImm8)
+          Opc = isZExt ? ARM::t2LDRHi8 : ARM::t2LDRSHi8;
+        else
+          Opc = isZExt ? ARM::t2LDRHi12 : ARM::t2LDRSHi12;
+      } else {
+        Opc = isZExt ? ARM::LDRH : ARM::LDRSH;
+        useAM3 = true;
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::i32:
+      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2)
+        Opc = useT2NegImm8 ? ARM::t2LDRi8 : ARM::t2LDRi12;
+      else
+        Opc = ARM::LDRi12;
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      // Unaligned loads need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        // Load the bits with an integer load and move them to an FP register
+        // afterwards. The address is simplified as an i32 access, so the
+        // opcode must follow the same negative-offset rule as the i32 case.
+        needVMOV = true;
+        VT = MVT::i32;
+        if (isThumb2)
+          Opc = useT2NegImm8 ? ARM::t2LDRi8 : ARM::t2LDRi12;
+        else
+          Opc = ARM::LDRi12;
+        RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      } else {
+        Opc = ARM::VLDRS;
+        RC = TLI.getRegClassFor(VT);
+      }
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned loads need special handling.  Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4)
+        return false;
+
+      Opc = ARM::VLDRD;
+      RC = TLI.getRegClassFor(VT);
+      break;
+  }
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT, useAM3);
+
+  // Create the base instruction, then add the operands.
+  if (allocReg)
+    ResultReg = createResultReg(RC);
+  assert (ResultReg > 255 && "Expected an allocated virtual register.");
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(Opc), ResultReg);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3);
+
+  // If we had an unaligned load of a float we've converted it to a regular
+  // load.  Now we must move from the GPR to the FP register.
+  if (needVMOV) {
+    unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::VMOVSR), MoveReg)
+                    .addReg(ResultReg));
+    ResultReg = MoveReg;
+  }
+  return true;
+}
+
+/// Select a non-atomic load instruction. Returns false for atomic loads,
+/// swifterror pointers, types rejected by isLoadTypeLegal, and addresses or
+/// loads that the helpers cannot handle.
+bool ARMFastISel::SelectLoad(const Instruction *I) {
+  const LoadInst *LI = cast<LoadInst>(I);
+
+  // Atomic loads need special handling.
+  if (LI->isAtomic())
+    return false;
+
+  const Value *Ptr = I->getOperand(0);
+  if (TLI.supportSwiftError()) {
+    // Swifterror values can come from either a function parameter with
+    // swifterror attribute or an alloca with swifterror attribute.
+    const Argument *Arg = dyn_cast<Argument>(Ptr);
+    if (Arg && Arg->hasSwiftErrorAttr())
+      return false;
+
+    const AllocaInst *Alloca = dyn_cast<AllocaInst>(Ptr);
+    if (Alloca && Alloca->isSwiftError())
+      return false;
+  }
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(Ptr, Addr))
+    return false;
+
+  unsigned ResultReg;
+  const bool Emitted = ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment());
+  if (!Emitted)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// Emit a store of \p SrcReg, of type \p VT, to \p Addr.
+///
+/// \param VT        Type to store; i1/i8/i16/i32/f32/f64 are handled. An i1
+///                  value is masked with 1 and stored as a byte.
+/// \param SrcReg    Register holding the value to store.
+/// \param Addr      Destination address; may be rewritten to fit the
+///                  addressing mode.
+/// \param Alignment Alignment of the access; 0 skips the alignment checks.
+/// \returns false when the type, subtarget features or alignment rule out a
+/// fast-path store.
+bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                               unsigned Alignment) {
+  unsigned StrOpc;
+  bool useAM3 = false;
+
+  // Thumb2 offsets in (-256, 0) are left in place by ARMSimplifyAddress, so
+  // they must use the negative-imm8 ("i8") opcode form rather than the
+  // unsigned imm12 one.
+  const bool useT2NegImm8 = isThumb2 && Addr.Offset < 0 &&
+                            Addr.Offset > -256 && Subtarget->hasV6T2Ops();
+
+  switch (VT.SimpleTy) {
+    // This is mostly going to be Neon/vector support.
+    default: return false;
+    case MVT::i1: {
+      // Mask the value down to its low bit, then store it as a byte.
+      unsigned Res = createResultReg(isThumb2 ? &ARM::tGPRRegClass
+                                              : &ARM::GPRRegClass);
+      unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
+      SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(Opc), Res)
+                      .addReg(SrcReg).addImm(1));
+      SrcReg = Res;
+      LLVM_FALLTHROUGH;
+    }
+    case MVT::i8:
+      if (isThumb2)
+        StrOpc = useT2NegImm8 ? ARM::t2STRBi8 : ARM::t2STRBi12;
+      else
+        StrOpc = ARM::STRBi12;
+      break;
+    case MVT::i16:
+      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        StrOpc = useT2NegImm8 ? ARM::t2STRHi8 : ARM::t2STRHi12;
+      } else {
+        StrOpc = ARM::STRH;
+        useAM3 = true;
+      }
+      break;
+    case MVT::i32:
+      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2)
+        StrOpc = useT2NegImm8 ? ARM::t2STRi8 : ARM::t2STRi12;
+      else
+        StrOpc = ARM::STRi12;
+      break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      // Unaligned stores need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        // Move the bits to an integer register and store them with an integer
+        // store. The address is simplified as an i32 access, so the opcode
+        // must follow the same negative-offset rule as the i32 case.
+        unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+        AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                TII.get(ARM::VMOVRS), MoveReg)
+                        .addReg(SrcReg));
+        SrcReg = MoveReg;
+        VT = MVT::i32;
+        if (isThumb2)
+          StrOpc = useT2NegImm8 ? ARM::t2STRi8 : ARM::t2STRi12;
+        else
+          StrOpc = ARM::STRi12;
+      } else {
+        StrOpc = ARM::VSTRS;
+      }
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned stores need special handling.  Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4)
+          return false;
+
+      StrOpc = ARM::VSTRD;
+      break;
+  }
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT, useAM3);
+
+  // Create the base instruction, then add the operands.
+  SrcReg = constrainOperandRegClass(TII.get(StrOpc), SrcReg, 0);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(StrOpc))
+                            .addReg(SrcReg);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore, useAM3);
+  return true;
+}
+
+/// Select a non-atomic store instruction. Returns false for atomic stores,
+/// swifterror pointers, value types rejected by isLoadTypeLegal, and values,
+/// addresses or stores that the helpers cannot handle.
+bool ARMFastISel::SelectStore(const Instruction *I) {
+  const StoreInst *SI = cast<StoreInst>(I);
+  Value *StoredVal = I->getOperand(0);
+
+  // Atomic stores need special handling.
+  if (SI->isAtomic())
+    return false;
+
+  const Value *Ptr = I->getOperand(1);
+  if (TLI.supportSwiftError()) {
+    // Swifterror values can come from either a function parameter with
+    // swifterror attribute or an alloca with swifterror attribute.
+    const Argument *Arg = dyn_cast<Argument>(Ptr);
+    if (Arg && Arg->hasSwiftErrorAttr())
+      return false;
+
+    const AllocaInst *Alloca = dyn_cast<AllocaInst>(Ptr);
+    if (Alloca && Alloca->isSwiftError())
+      return false;
+  }
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(StoredVal->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  unsigned SrcReg = getRegForValue(StoredVal);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(Ptr, Addr))
+    return false;
+
+  return ARMEmitStore(VT, SrcReg, Addr, SI->getAlignment());
+}
+
+/// Map an IR compare predicate to the ARM condition code used after the
+/// compare. ARMCC::AL is returned as the "cannot handle" marker for every
+/// predicate without a single-condition mapping.
+static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // Equality.
+    case CmpInst::ICMP_EQ:
+    case CmpInst::FCMP_OEQ:
+      return ARMCC::EQ;
+    case CmpInst::ICMP_NE:
+    case CmpInst::FCMP_UNE:
+      return ARMCC::NE;
+
+    // Conditions shared by signed-integer and floating-point predicates.
+    case CmpInst::ICMP_SGT:
+    case CmpInst::FCMP_OGT:
+      return ARMCC::GT;
+    case CmpInst::ICMP_SGE:
+    case CmpInst::FCMP_OGE:
+      return ARMCC::GE;
+    case CmpInst::ICMP_SLT:
+    case CmpInst::FCMP_ULT:
+      return ARMCC::LT;
+    case CmpInst::ICMP_SLE:
+    case CmpInst::FCMP_ULE:
+      return ARMCC::LE;
+
+    // Conditions shared by unsigned-integer and floating-point predicates.
+    case CmpInst::ICMP_UGT:
+    case CmpInst::FCMP_UGT:
+      return ARMCC::HI;
+    case CmpInst::ICMP_ULE:
+    case CmpInst::FCMP_OLE:
+      return ARMCC::LS;
+
+    // Unsigned-integer only.
+    case CmpInst::ICMP_UGE:
+      return ARMCC::HS;
+    case CmpInst::ICMP_ULT:
+      return ARMCC::LO;
+
+    // Floating-point only.
+    case CmpInst::FCMP_OLT:
+      return ARMCC::MI;
+    case CmpInst::FCMP_UGE:
+      return ARMCC::PL;
+    case CmpInst::FCMP_ORD:
+      return ARMCC::VC;
+    case CmpInst::FCMP_UNO:
+      return ARMCC::VS;
+
+    // FCMP_ONE and FCMP_UEQ need two compares; together with everything
+    // else not listed they yield AL, which is our "false" for now.
+    case CmpInst::FCMP_ONE:
+    case CmpInst::FCMP_UEQ:
+    default:
+      return ARMCC::AL;
+  }
+}
+
+/// Select a two-successor branch instruction.
+///
+/// Four condition shapes are handled, in this order:
+///  1. a single-use compare in the same block: the compare is emitted here
+///     and the branch uses its condition code directly;
+///  2. a single-use trunc in the same block: the low bit of the trunc's
+///     source is tested;
+///  3. a constant: an unconditional branch to the taken successor;
+///  4. anything else: the low bit of the condition's register is tested.
+/// When the true successor is the layout successor, the successors are
+/// swapped and the condition inverted so the fall-through can be used.
+///
+/// \returns false when the condition cannot be handled on the fast path.
+bool ARMFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // Simple branch support.
+
+  // If we can, avoid recomputing the compare - redoing it could lead to wonky
+  // behavior.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
+
+      // Get the compare predicate.
+      // Try to take advantage of fallthrough opportunities.
+      CmpInst::Predicate Predicate = CI->getPredicate();
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
+
+      ARMCC::CondCodes ARMPred = getComparePred(Predicate);
+
+      // We may not handle every CC for now.
+      if (ARMPred == ARMCC::AL) return false;
+
+      // Emit the compare.
+      if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+        return false;
+
+      unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+      .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR);
+      finishCondBranch(BI->getParent(), TBB, FBB);
+      return true;
+    }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    MVT SourceVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
+      unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+      unsigned OpReg = getRegForValue(TI->getOperand(0));
+      // getRegForValue reports failure as register 0; do not build a TST on
+      // it.
+      if (OpReg == 0) return false;
+      OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(TstOpc))
+                      .addReg(OpReg).addImm(1));
+
+      unsigned CCMode = ARMCC::NE;
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        CCMode = ARMCC::EQ;
+      }
+
+      unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+      .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+
+      finishCondBranch(BI->getParent(), TBB, FBB);
+      return true;
+    }
+  } else if (const ConstantInt *CI =
+             dyn_cast<ConstantInt>(BI->getCondition())) {
+    // Constant condition: branch straight to the successor it selects.
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    fastEmitBranch(Target, DbgLoc);
+    return true;
+  }
+
+  unsigned CmpReg = getRegForValue(BI->getCondition());
+  if (CmpReg == 0) return false;
+
+  // We've been divorced from our compare!  Our block was split, and
+  // now our compare lives in a predecessor block.  We mustn't
+  // re-compare here, as the children of the compare aren't guaranteed
+  // live across the block boundary (we *could* check for this).
+  // Regardless, the compare has been done in the predecessor block,
+  // and it left a value for us in a virtual register.  Ergo, we test
+  // the one-bit value left in the virtual register.
+  unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+  CmpReg = constrainOperandRegClass(TII.get(TstOpc), CmpReg, 0);
+  AddOptionalDefs(
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TstOpc))
+          .addReg(CmpReg)
+          .addImm(1));
+
+  unsigned CCMode = ARMCC::NE;
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    CCMode = ARMCC::EQ;
+  }
+
+  unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+                  .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+  finishCondBranch(BI->getParent(), TBB, FBB);
+  return true;
+}
+
+/// Select an indirectbr: branch through the register holding the target
+/// address and record every IR successor as a successor of the current
+/// machine block. Returns false when the address cannot be put in a register.
+bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
+  const unsigned TargetReg = getRegForValue(I->getOperand(0));
+  if (TargetReg == 0)
+    return false;
+
+  const unsigned BranchOpc = isThumb2 ? ARM::tBRIND : ARM::BX;
+  MachineInstrBuilder Branch =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+          .addReg(TargetReg);
+  AddOptionalDefs(Branch);
+
+  const IndirectBrInst *IndBr = cast<IndirectBrInst>(I);
+  for (const BasicBlock *Succ : IndBr->successors())
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]);
+
+  return true;
+}
+
+/// Emit a compare of \p Src1Value against \p Src2Value, leaving the result in
+/// the status flags (an FMSTAT is appended for floating-point compares).
+///
+/// \param Src1Value Left-hand operand; its type selects the compare opcode.
+/// \param Src2Value Right-hand operand. An encodable integer constant, or a
+///                  positive floating-point zero, is folded into the compare
+///                  instead of being placed in a register.
+/// \param isZExt    Selects zero- vs sign-extension for i1/i8/i16 operands
+///                  and for reading an integer constant.
+/// \returns false when the operand type is unsupported, VFP2 is missing for a
+/// floating-point compare, or an operand cannot be put in a register.
+bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                             bool isZExt) {
+  Type *Ty = Src1Value->getType();
+  EVT SrcEVT = TLI.getValueType(DL, Ty, true);
+  if (!SrcEVT.isSimple()) return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  // Check to see if the 2nd operand is a constant that we can encode directly
+  // in the compare.
+  int Imm = 0;
+  bool UseImm = false;
+  bool isNegativeImm = false;
+  // FIXME: At -O0 we don't have anything that canonicalizes operand order.
+  // Thus, Src1Value may be a ConstantInt, but we're missing it.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
+    if (SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8 ||
+        SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = (isZExt) ? (int)CIVal.getZExtValue() : (int)CIVal.getSExtValue();
+      // For INT_MIN/LONG_MIN (i.e., 0x80000000) we need to use a cmp, rather
+      // than a cmn, because there is no way to represent 2147483648 as a
+      // signed 32-bit int.
+      if (Imm < 0 && Imm != (int)0x80000000) {
+        isNegativeImm = true;
+        Imm = -Imm;
+      }
+      UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+        (ARM_AM::getSOImmVal(Imm) != -1);
+    }
+  } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
+    // Only +0.0 can be folded: the compare-with-zero opcodes take no operand.
+    if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
+      if (ConstFP->isZero() && !ConstFP->isNegative())
+        UseImm = true;
+  }
+
+  unsigned CmpOpc;
+  bool isICmp = true;
+  bool needsExt = false;
+  switch (SrcVT.SimpleTy) {
+    default: return false;
+    // TODO: Verify compares.
+    case MVT::f32:
+      isICmp = false;
+      CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
+      break;
+    case MVT::f64:
+      isICmp = false;
+      CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
+      break;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      needsExt = true;
+      LLVM_FALLTHROUGH;
+    case MVT::i32:
+      if (isThumb2) {
+        if (!UseImm)
+          CmpOpc = ARM::t2CMPrr;
+        else
+          CmpOpc = isNegativeImm ? ARM::t2CMNri : ARM::t2CMPri;
+      } else {
+        if (!UseImm)
+          CmpOpc = ARM::CMPrr;
+        else
+          CmpOpc = isNegativeImm ? ARM::CMNri : ARM::CMPri;
+      }
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(Src1Value);
+  if (SrcReg1 == 0) return false;
+
+  unsigned SrcReg2 = 0;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(Src2Value);
+    if (SrcReg2 == 0) return false;
+  }
+
+  // We have i1, i8, or i16, we need to either zero extend or sign extend.
+  if (needsExt) {
+    SrcReg1 = ARMEmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
+    if (SrcReg1 == 0) return false;
+    if (!UseImm) {
+      SrcReg2 = ARMEmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
+      if (SrcReg2 == 0) return false;
+    }
+  }
+
+  const MCInstrDesc &II = TII.get(CmpOpc);
+  SrcReg1 = constrainOperandRegClass(II, SrcReg1, 0);
+  if (!UseImm) {
+    SrcReg2 = constrainOperandRegClass(II, SrcReg2, 1);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                    .addReg(SrcReg1).addReg(SrcReg2));
+  } else {
+    MachineInstrBuilder MIB;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+      .addReg(SrcReg1);
+
+    // Only add immediate for icmp as the immediate for fcmp is an implicit 0.0.
+    if (isICmp)
+      MIB.addImm(Imm);
+    AddOptionalDefs(MIB);
+  }
+
+  // For floating point we need to move the result to a comparison register
+  // that we can then use for branches.
+  if (isFloat)
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::FMSTAT)));
+  return true;
+}
+
+/// Select a compare instruction whose result is needed as a value: emit the
+/// compare, then a conditional move that yields 1 when the predicate holds
+/// and the materialized zero otherwise.
+///
+/// \returns false when the predicate has no single condition code, the
+/// compare cannot be emitted, or the zero constant cannot be materialized.
+bool ARMFastISel::SelectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  // Get the compare predicate.
+  ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate());
+
+  // We may not handle every CC for now.
+  if (ARMPred == ARMCC::AL) return false;
+
+  // Emit the compare.
+  if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+    return false;
+
+  // Now set a register based on the comparison. Explicitly set the predicates
+  // here.
+  unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
+  const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
+                                           : &ARM::GPRRegClass;
+  unsigned DestReg = createResultReg(RC);
+  Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0);
+  unsigned ZeroReg = fastMaterializeConstant(Zero);
+  // fastMaterializeConstant reports failure as register 0; do not feed that
+  // into the conditional move.
+  if (ZeroReg == 0) return false;
+  // ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc), DestReg)
+          .addReg(ZeroReg).addImm(1)
+          .addImm(ARMPred).addReg(ARM::CPSR);
+
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+/// Select an fpext from float to double using VCVTDS. Returns false without
+/// VFP2, for any other type pair, or when the operand has no register.
+bool ARMFastISel::SelectFPExt(const Instruction *I) {
+  // Make sure we have VFP and that we're extending float to double.
+  if (!Subtarget->hasVFP2())
+    return false;
+
+  Value *Src = I->getOperand(0);
+  const bool IsFloatToDouble =
+      I->getType()->isDoubleTy() && Src->getType()->isFloatTy();
+  if (!IsFloatToDouble)
+    return false;
+
+  const unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  const unsigned DstReg = createResultReg(&ARM::DPRRegClass);
+  MachineInstrBuilder Cvt = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(ARM::VCVTDS), DstReg)
+                                .addReg(SrcReg);
+  AddOptionalDefs(Cvt);
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+/// Select an fptrunc from double to float using VCVTSD. Returns false without
+/// VFP2, for any other type pair, or when the operand has no register.
+bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
+  // Make sure we have VFP and that we're truncating double to float.
+  if (!Subtarget->hasVFP2())
+    return false;
+
+  Value *Src = I->getOperand(0);
+  const bool IsDoubleToFloat =
+      I->getType()->isFloatTy() && Src->getType()->isDoubleTy();
+  if (!IsDoubleToFloat)
+    return false;
+
+  const unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  const unsigned DstReg = createResultReg(&ARM::SPRRegClass);
+  MachineInstrBuilder Cvt = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(ARM::VCVTSD), DstReg)
+                                .addReg(SrcReg);
+  AddOptionalDefs(Cvt);
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+/// Select a signed (\p isSigned) or unsigned integer-to-floating-point
+/// conversion. Sources of type i8/i16/i32 and float/double results are
+/// handled; narrow sources are extended to i32 first.
+bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2())
+    return false;
+
+  Type *ResultTy = I->getType();
+  MVT DstVT;
+  if (!isTypeLegal(ResultTy, DstVT))
+    return false;
+
+  Value *Src = I->getOperand(0);
+  const EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true);
+  if (!SrcEVT.isSimple())
+    return false;
+  const MVT SrcVT = SrcEVT.getSimpleVT();
+  const bool IsNarrow = SrcVT == MVT::i16 || SrcVT == MVT::i8;
+  if (SrcVT != MVT::i32 && !IsNarrow)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // Widen narrow sources to i32, sign- or zero-extending to match the
+  // conversion's signedness.
+  if (IsNarrow) {
+    SrcReg = ARMEmitIntExt(SrcVT, SrcReg, MVT::i32, /*isZExt*/!isSigned);
+    if (SrcReg == 0)
+      return false;
+  }
+
+  // The conversion routine works on fp-reg to fp-reg and the operand above
+  // was an integer, move it to the fp registers if possible.
+  const unsigned FPSrcReg = ARMMoveToFPReg(MVT::f32, SrcReg);
+  if (FPSrcReg == 0)
+    return false;
+
+  unsigned CvtOpc;
+  if (ResultTy->isFloatTy())
+    CvtOpc = isSigned ? ARM::VSITOS : ARM::VUITOS;
+  else if (ResultTy->isDoubleTy())
+    CvtOpc = isSigned ? ARM::VSITOD : ARM::VUITOD;
+  else
+    return false;
+
+  const unsigned DstReg = createResultReg(TLI.getRegClassFor(DstVT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(CvtOpc), DstReg)
+                      .addReg(FPSrcReg));
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+/// Select a floating-point-to-integer conversion, signed when \p isSigned.
+/// float and double sources are handled; the conversion is done in an FP
+/// register and the result then moved to an integer register.
+bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2())
+    return false;
+
+  MVT DstVT;
+  if (!isTypeLegal(I->getType(), DstVT))
+    return false;
+
+  Value *Src = I->getOperand(0);
+  const unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  Type *SrcTy = Src->getType();
+  unsigned CvtOpc;
+  if (SrcTy->isFloatTy())
+    CvtOpc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS;
+  else if (SrcTy->isDoubleTy())
+    CvtOpc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
+  else
+    return false;
+
+  // f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg.
+  const unsigned FPResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(CvtOpc), FPResultReg)
+                      .addReg(SrcReg));
+
+  // This result needs to be in an integer register, but the conversion only
+  // takes place in fp-regs.
+  const unsigned IntResultReg = ARMMoveToIntReg(DstVT, FPResultReg);
+  if (IntResultReg == 0)
+    return false;
+
+  updateValueMap(I, IntResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectSelect(const Instruction *I) { // Selects an i32 `select` as a TST of the condition's low bit followed by a conditional move.
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ // Things need to be register sized for register moves.
+ if (VT != MVT::i32) return false;
+
+ unsigned CondReg = getRegForValue(I->getOperand(0));
+ if (CondReg == 0) return false;
+ unsigned Op1Reg = getRegForValue(I->getOperand(1)); // value chosen when the condition is true
+ if (Op1Reg == 0) return false;
+
+ // Check to see if we can use an immediate in the conditional move (only the false-side operand is considered).
+ int Imm = 0;
+ bool UseImm = false;
+ bool isNegativeImm = false;
+ if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(2))) {
+ assert (VT == MVT::i32 && "Expecting an i32.");
+ Imm = (int)ConstInt->getValue().getZExtValue();
+ if (Imm < 0) {
+ isNegativeImm = true;
+ Imm = ~Imm; // bitwise complement: the MVN-based opcode chosen below is used with this value
+ }
+ UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+ (ARM_AM::getSOImmVal(Imm) != -1);
+ }
+
+ unsigned Op2Reg = 0;
+ if (!UseImm) {
+ Op2Reg = getRegForValue(I->getOperand(2)); // value chosen when the condition is false
+ if (Op2Reg == 0) return false;
+ }
+
+ unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+ CondReg = constrainOperandRegClass(TII.get(TstOpc), CondReg, 0);
+ AddOptionalDefs(
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TstOpc))
+ .addReg(CondReg)
+ .addImm(1)); // test bit 0 of the condition
+
+ unsigned MovCCOpc;
+ const TargetRegisterClass *RC;
+ if (!UseImm) {
+ RC = isThumb2 ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+ MovCCOpc = isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr;
+ } else {
+ RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass;
+ if (!isNegativeImm)
+ MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
+ else
+ MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi;
+ }
+ unsigned ResultReg = createResultReg(RC);
+ if (!UseImm) {
+ Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1);
+ Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc),
+ ResultReg)
+ .addReg(Op2Reg)
+ .addReg(Op1Reg)
+ .addImm(ARMCC::NE) // register form: Op2Reg first, then Op1Reg, predicated on NE
+ .addReg(ARM::CPSR);
+ } else {
+ Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc),
+ ResultReg)
+ .addReg(Op1Reg)
+ .addImm(Imm)
+ .addImm(ARMCC::EQ) // immediate form: Op1Reg first, then the immediate, predicated on EQ
+ .addReg(ARM::CPSR);
+ }
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+/// Select an integer division (signed when \p isSigned) as a runtime library
+/// call. Returns false when the type is not legal or when the subtarget has
+/// a hardware divide.
+bool ARMFastISel::SelectDiv(const Instruction *I, bool isSigned) {
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  // If we have integer div support we should have selected this automagically.
+  // In case we have a real miss go ahead and return false and we'll pick
+  // it up later.
+  if (Subtarget->hasDivide())
+    return false;
+
+  // Otherwise emit a libcall, chosen by operand width and signedness.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+    LC = isSigned ? RTLIB::SDIV_I8 : RTLIB::UDIV_I8;
+    break;
+  case MVT::i16:
+    LC = isSigned ? RTLIB::SDIV_I16 : RTLIB::UDIV_I16;
+    break;
+  case MVT::i32:
+    LC = isSigned ? RTLIB::SDIV_I32 : RTLIB::UDIV_I32;
+    break;
+  case MVT::i64:
+    LC = isSigned ? RTLIB::SDIV_I64 : RTLIB::UDIV_I64;
+    break;
+  case MVT::i128:
+    LC = isSigned ? RTLIB::SDIV_I128 : RTLIB::UDIV_I128;
+    break;
+  default:
+    break;
+  }
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+// Select an integer remainder by emitting a call to the matching RTLIB
+// remainder routine.
+//
+// I is the remainder instruction being selected; isSigned chooses between the
+// signed (SREM_*) and unsigned (UREM_*) libcalls. Returns true if the libcall
+// was emitted, false if this routine declines the instruction.
+bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) {
+  MVT VT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  // Many ABIs do not provide a libcall for standalone remainder, so we need to
+  // use divrem (see the RTABI 4.3.1). Since FastISel can't handle non-double
+  // multi-reg returns, we'll have to bail out.
+  if (!TLI.hasStandaloneRem(VT)) {
+    return false;
+  }
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = isSigned ? RTLIB::SREM_I8 : RTLIB::UREM_I8;
+  else if (VT == MVT::i16)
+    LC = isSigned ? RTLIB::SREM_I16 : RTLIB::UREM_I16;
+  else if (VT == MVT::i32)
+    LC = isSigned ? RTLIB::SREM_I32 : RTLIB::UREM_I32;
+  else if (VT == MVT::i64)
+    LC = isSigned ? RTLIB::SREM_I64 : RTLIB::UREM_I64;
+  else if (VT == MVT::i128)
+    LC = isSigned ? RTLIB::SREM_I128 : RTLIB::UREM_I128;
+
+  // A legal type outside i8..i128 has no remainder libcall. Decline it
+  // explicitly: an assert alone is compiled out of release builds and would
+  // let UNKNOWN_LIBCALL reach ARMEmitLibcall.
+  if (LC == RTLIB::UNKNOWN_LIBCALL)
+    return false;
+
+  return ARMEmitLibcall(I, LC);
+}
+
+// Selects ADD/OR/SUB whose result type is i1, i8 or i16 by emitting the
+// register-register form of the instruction. Returns false for any other
+// result type or opcode, or when an operand cannot be put in a register.
+bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  EVT ResVT = TLI.getValueType(DL, I->getType(), true);
+  bool IsNarrowInt =
+      ResVT == MVT::i16 || ResVT == MVT::i8 || ResVT == MVT::i1;
+  if (!IsNarrowInt)
+    return false;
+
+  // Map the ISD opcode onto the ARM or Thumb2 register-register instruction.
+  unsigned Opc;
+  if (ISDOpcode == ISD::ADD)
+    Opc = isThumb2 ? ARM::t2ADDrr : ARM::ADDrr;
+  else if (ISDOpcode == ISD::OR)
+    Opc = isThumb2 ? ARM::t2ORRrr : ARM::ORRrr;
+  else if (ISDOpcode == ISD::SUB)
+    Opc = isThumb2 ? ARM::t2SUBrr : ARM::SUBrr;
+  else
+    return false;
+
+  unsigned LHSReg = getRegForValue(I->getOperand(0));
+  if (LHSReg == 0)
+    return false;
+
+  // TODO: Often the 2nd operand is an immediate, which can be encoded directly
+  // in the instruction, rather than materializing the value in a register.
+  unsigned RHSReg = getRegForValue(I->getOperand(1));
+  if (RHSReg == 0)
+    return false;
+
+  unsigned DstReg = createResultReg(&ARM::GPRnopcRegClass);
+  LHSReg = constrainOperandRegClass(TII.get(Opc), LHSReg, 1);
+  RHSReg = constrainOperandRegClass(TII.get(Opc), RHSReg, 2);
+
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DstReg);
+  MIB.addReg(LHSReg);
+  MIB.addReg(RHSReg);
+  AddOptionalDefs(MIB);
+
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+// Selects FADD/FSUB/FMUL on scalar types using the VFP instructions.
+// Returns false for vector or non-simple types, for float/double results on
+// a target without VFP2, for any other opcode, or when an operand cannot be
+// put in a register.
+bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
+  Type *ResTy = I->getType();
+  EVT ResEVT = TLI.getValueType(DL, ResTy, true);
+  if (!ResEVT.isSimple())
+    return false;
+  MVT VT = ResEVT.getSimpleVT();
+
+  // FIXME: Support vector types where possible.
+  if (VT.isVector())
+    return false;
+
+  // We can get here in the case when we want to use NEON for our fp
+  // operations, but can't figure out how to. Just use the vfp instructions
+  // if we have them.
+  // FIXME: It'd be nice to use NEON instructions.
+  if ((ResTy->isDoubleTy() || ResTy->isFloatTy()) && !Subtarget->hasVFP2())
+    return false;
+
+  // Choose the double- or single-precision form of the instruction.
+  bool IsWide = VT == MVT::f64 || VT == MVT::i64;
+  unsigned Opc;
+  if (ISDOpcode == ISD::FADD)
+    Opc = IsWide ? ARM::VADDD : ARM::VADDS;
+  else if (ISDOpcode == ISD::FSUB)
+    Opc = IsWide ? ARM::VSUBD : ARM::VSUBS;
+  else if (ISDOpcode == ISD::FMUL)
+    Opc = IsWide ? ARM::VMULD : ARM::VMULS;
+  else
+    return false;
+
+  unsigned LHSReg = getRegForValue(I->getOperand(0));
+  if (LHSReg == 0)
+    return false;
+
+  unsigned RHSReg = getRegForValue(I->getOperand(1));
+  if (RHSReg == 0)
+    return false;
+
+  unsigned DstReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DstReg);
+  MIB.addReg(LHSReg);
+  MIB.addReg(RHSReg);
+  AddOptionalDefs(MIB);
+
+  updateValueMap(I, DstReg);
+  return true;
+}
+
+// Call Handling Code
+
+// Returns the assignment function for calling convention CC: the RetCC_*/RetFastCC_* (return value) variant when Return is true, otherwise the CC_*/FastCC_* (argument) variant. This is largely taken directly from CCAssignFnForNode
+// TODO: We may not support all of this.
+CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
+ bool Return,
+ bool isVarArg) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::Fast:
+ if (Subtarget->hasVFP2() && !isVarArg) { // Only non-variadic fastcc on a VFP2 target is decided here; otherwise control falls through to the C handling.
+ if (!Subtarget->isAAPCS_ABI())
+ return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+ // For AAPCS ABI targets, just use VFP variant of the calling convention.
+ return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+ }
+ LLVM_FALLTHROUGH;
+ case CallingConv::C:
+ case CallingConv::CXX_FAST_TLS:
+ // Use target triple & subtarget features to do actual dispatch.
+ if (Subtarget->isAAPCS_ABI()) {
+ if (Subtarget->hasVFP2() &&
+ TM.Options.FloatABIType == FloatABI::Hard && !isVarArg) // VFP variant requires VFP2, the hard-float ABI option and a non-variadic function.
+ return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+ else
+ return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+ } else {
+ return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+ }
+ case CallingConv::ARM_AAPCS_VFP:
+ case CallingConv::Swift:
+ if (!isVarArg)
+ return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+ // Fall through to soft float variant, variadic functions don't
+ // use hard floating point ABI.
+ LLVM_FALLTHROUGH;
+ case CallingConv::ARM_AAPCS:
+ return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+ case CallingConv::ARM_APCS:
+ return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+ case CallingConv::GHC:
+ if (Return) // GHC has only an argument-assignment function; asking for the return variant is treated as unreachable.
+ llvm_unreachable("Can't return in GHC call convention");
+ else
+ return CC_ARM_APCS_GHC;
+ }
+}
+
+bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, // Args: IR values passed to the call, indexed by CCValAssign::getValNo().
+ SmallVectorImpl<unsigned> &ArgRegs, // ArgRegs: register currently holding each argument.
+ SmallVectorImpl<MVT> &ArgVTs, // ArgVTs: value type of each argument.
+ SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, // ArgFlags: per-argument flags handed to the calling-convention analysis.
+ SmallVectorImpl<unsigned> &RegArgs, // RegArgs (out): physical registers the arguments were placed in.
+ CallingConv::ID CC, // CC: calling convention used to assign locations.
+ unsigned &NumBytes, // NumBytes (out): stack bytes reported by the calling-convention state.
+ bool isVarArg) { // Returns false, before any code is emitted, if some argument cannot be handled; true once all arguments are placed.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, *FuncInfo.MF, ArgLocs, *Context);
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags,
+ CCAssignFnForCall(CC, false, isVarArg));
+
+ // Check that we can handle all of the arguments. If we can't, then bail out
+ // now before we add code to the MBB.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ MVT ArgVT = ArgVTs[VA.getValNo()];
+
+ // We don't handle NEON/vector parameters yet.
+ if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+ return false;
+
+ // Now copy/store arg to correct locations.
+ if (VA.isRegLoc() && !VA.needsCustom()) {
+ continue;
+ } else if (VA.needsCustom()) {
+ // TODO: We need custom lowering for vector (v2f64) args.
+ if (VA.getLocVT() != MVT::f64 ||
+ // TODO: Only handle register args for now.
+ !VA.isRegLoc() || !ArgLocs[++i].isRegLoc()) // ++i also consumes the second location of the custom f64 pair.
+ return false;
+ } else {
+ switch (ArgVT.SimpleTy) { // Remaining case: a non-custom, non-register location; only these scalar types are accepted.
+ default:
+ return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ break;
+ case MVT::f32:
+ if (!Subtarget->hasVFP2())
+ return false;
+ break;
+ case MVT::f64:
+ if (!Subtarget->hasVFP2())
+ return false;
+ break;
+ }
+ }
+ }
+
+ // At this point, we are able to handle the call's arguments in fast isel.
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ NumBytes = CCInfo.getNextStackOffset();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AdjStackDown))
+ .addImm(NumBytes));
+
+ // Process the args.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ const Value *ArgVal = Args[VA.getValNo()];
+ unsigned Arg = ArgRegs[VA.getValNo()];
+ MVT ArgVT = ArgVTs[VA.getValNo()];
+
+ assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) &&
+ "We don't handle NEON/vector parameters yet.");
+
+ // Handle arg promotion, etc.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt: {
+ MVT DestVT = VA.getLocVT();
+ Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/false);
+ assert (Arg != 0 && "Failed to emit a sext");
+ ArgVT = DestVT;
+ break;
+ }
+ case CCValAssign::AExt:
+ // Intentional fall-through. Handle AExt and ZExt.
+ case CCValAssign::ZExt: {
+ MVT DestVT = VA.getLocVT();
+ Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true);
+ assert (Arg != 0 && "Failed to emit a zext");
+ ArgVT = DestVT;
+ break;
+ }
+ case CCValAssign::BCvt: {
+ unsigned BC = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg,
+ /*TODO: Kill=*/false);
+ assert(BC != 0 && "Failed to emit a bitcast!");
+ Arg = BC;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ default: llvm_unreachable("Unknown arg promotion!");
+ }
+
+ // Now copy/store arg to correct locations.
+ if (VA.isRegLoc() && !VA.needsCustom()) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
+ RegArgs.push_back(VA.getLocReg());
+ } else if (VA.needsCustom()) {
+ // TODO: We need custom lowering for vector (v2f64) args.
+ assert(VA.getLocVT() == MVT::f64 &&
+ "Custom lowering for v2f64 args not available");
+
+ CCValAssign &NextVA = ArgLocs[++i]; // The f64 occupies two locations; advance past the second, mirroring the check loop above.
+
+ assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+ "We only handle register args!");
+
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM::VMOVRRD), VA.getLocReg())
+ .addReg(NextVA.getLocReg(), RegState::Define) // VMOVRRD defines both location registers from the single source Arg.
+ .addReg(Arg));
+ RegArgs.push_back(VA.getLocReg());
+ RegArgs.push_back(NextVA.getLocReg());
+ } else {
+ assert(VA.isMemLoc());
+ // Need to store on the stack.
+
+ // Don't emit stores for undef values.
+ if (isa<UndefValue>(ArgVal))
+ continue;
+
+ Address Addr; // Store target: SP plus the offset assigned to this argument.
+ Addr.BaseType = Address::RegBase;
+ Addr.Base.Reg = ARM::SP;
+ Addr.Offset = VA.getLocMemOffset();
+
+ bool EmitRet = ARMEmitStore(ArgVT, Arg, Addr); (void)EmitRet;
+ assert(EmitRet && "Could not emit a store for argument!");
+ }
+ }
+
+ return true;
+}
+
+bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, // Emits CALLSEQ_END and, for a non-void RetVT, copies the call result out of its physical register(s) into a new result register mapped to I.
+ const Instruction *I, CallingConv::ID CC, // I: instruction whose value map entry receives the result; CC: calling convention used to locate the return value.
+ unsigned &NumBytes, bool isVarArg) { // NumBytes: stack size passed to CALLSEQ_END. UsedRegs (out) collects the physical return registers read. Always returns true.
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AdjStackUp))
+ .addImm(NumBytes).addImm(0));
+
+ // Now the return value.
+ if (RetVT != MVT::isVoid) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+
+ // Copy all of the result registers out of their specified physreg.
+ if (RVLocs.size() == 2 && RetVT == MVT::f64) {
+ // For this move we copy into two registers and then move into the
+ // double fp reg we want.
+ MVT DestVT = RVLocs[0].getValVT();
+ const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT);
+ unsigned ResultReg = createResultReg(DstRC);
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM::VMOVDRR), ResultReg) // VMOVDRR combines the two location registers into ResultReg.
+ .addReg(RVLocs[0].getLocReg())
+ .addReg(RVLocs[1].getLocReg()));
+
+ UsedRegs.push_back(RVLocs[0].getLocReg());
+ UsedRegs.push_back(RVLocs[1].getLocReg());
+
+ // Finally update the result.
+ updateValueMap(I, ResultReg);
+ } else {
+ assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!");
+ MVT CopyVT = RVLocs[0].getValVT();
+
+ // Special handling for extended integers.
+ if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16)
+ CopyVT = MVT::i32; // Narrow integer results are copied using the i32 register class.
+
+ const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+
+ unsigned ResultReg = createResultReg(DstRC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(RVLocs[0].getLocReg());
+ UsedRegs.push_back(RVLocs[0].getLocReg());
+
+ // Finally update the result.
+ updateValueMap(I, ResultReg);
+ }
+ }
+
+ return true;
+}
+
+bool ARMFastISel::SelectRet(const Instruction *I) { // Selects a return: copies at most one register return value into its assigned physical register and emits the return instruction. Returns false for any case it does not handle.
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+ const Function &F = *I->getParent()->getParent();
+
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ if (TLI.supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError)) // Functions using the swifterror attribute are not handled here.
+ return false;
+
+ if (TLI.supportSplitCSR(FuncInfo.MF))
+ return false;
+
+ // Build a list of return value registers.
+ SmallVector<unsigned, 4> RetRegs;
+
+ CallingConv::ID CC = F.getCallingConv();
+ if (Ret->getNumOperands() > 0) {
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ValLocs;
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
+ CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */,
+ F.isVarArg()));
+
+ const Value *RV = Ret->getOperand(0);
+ unsigned Reg = getRegForValue(RV);
+ if (Reg == 0)
+ return false;
+
+ // Only handle a single return value for now.
+ if (ValLocs.size() != 1)
+ return false;
+
+ CCValAssign &VA = ValLocs[0];
+
+ // Don't bother handling odd stuff for now.
+ if (VA.getLocInfo() != CCValAssign::Full)
+ return false;
+ // Only handle register returns for now.
+ if (!VA.isRegLoc())
+ return false;
+
+ unsigned SrcReg = Reg + VA.getValNo(); // NOTE(review): with exactly one location, getValNo() is presumably 0 so SrcReg == Reg - confirm.
+ EVT RVEVT = TLI.getValueType(DL, RV->getType());
+ if (!RVEVT.isSimple()) return false;
+ MVT RVVT = RVEVT.getSimpleVT();
+ MVT DestVT = VA.getValVT();
+ // Special handling for extended integers.
+ if (RVVT != DestVT) {
+ if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+ return false;
+
+ assert(DestVT == MVT::i32 && "ARM should always ext to i32");
+
+ // Perform extension if flagged as either zext or sext. Otherwise, do
+ // nothing.
+ if (Outs[0].Flags.isZExt() || Outs[0].Flags.isSExt()) {
+ SrcReg = ARMEmitIntExt(RVVT, SrcReg, DestVT, Outs[0].Flags.isZExt()); // Zero-extend when flagged zext, otherwise sign-extend.
+ if (SrcReg == 0) return false;
+ }
+ }
+
+ // Make the copy.
+ unsigned DstReg = VA.getLocReg();
+ const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+ // Avoid a cross-class copy. This is very unlikely.
+ if (!SrcRC->contains(DstReg))
+ return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
+
+ // Add register to return instruction.
+ RetRegs.push_back(VA.getLocReg());
+ }
+
+ unsigned RetOpc = isThumb2 ? ARM::tBX_RET : ARM::BX_RET;
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(RetOpc));
+ AddOptionalDefs(MIB);
+ for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+ MIB.addReg(RetRegs[i], RegState::Implicit); // Each return register is attached to the return instruction as an implicit operand.
+ return true;
+}
+
+// Returns the call opcode to emit: the register-operand form when UseReg is
+// true, otherwise the direct form; the Thumb opcode is chosen when isThumb2
+// is set.
+unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) {
+  if (isThumb2)
+    return UseReg ? ARM::tBLXr : ARM::tBL;
+  return UseReg ? ARM::BLX : ARM::BL;
+}
+
+// Creates an external i32 global named Name and materializes its address
+// into a register via ARMMaterializeGV. Returns 0, without creating the
+// global, when the i32-pointer type has no simple value type.
+unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
+  // Manually compute the global's type to avoid building it when unnecessary.
+  Type *PtrTy = Type::getInt32PtrTy(*Context, /*AS=*/0);
+  EVT PtrVT = TLI.getValueType(DL, PtrTy);
+  if (!PtrVT.isSimple())
+    return 0;
+
+  Type *ElemTy = Type::getInt32Ty(*Context);
+  GlobalValue *GV = new GlobalVariable(M, ElemTy, false,
+                                       GlobalValue::ExternalLinkage, nullptr,
+                                       Name);
+  assert(GV->getType() == PtrTy && "We miscomputed the type for the global!");
+  return ARMMaterializeGV(GV, PtrVT.getSimpleVT());
+}
+
+// A quick function that will emit a call for a named libcall in F with the
+// vector of passed arguments for the Instruction in I. We can assume that we
+// can emit a call for any libcall we can produce. This is an abridged version
+// of the full call infrastructure since we won't need to worry about things
+// like computed function pointers or strange arguments at call sites.
+// TODO: Try to unify this and the normal call bits for ARM, then try to unify
+// with X86.
+bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // Every operand of I is passed as an argument; returns false if the return or an argument type is not handled.
+ CallingConv::ID CC = TLI.getLibcallCallingConv(Call);
+
+ // Handle *simple* calls for now.
+ Type *RetTy = I->getType();
+ MVT RetVT;
+ if (RetTy->isVoidTy())
+ RetVT = MVT::isVoid;
+ else if (!isTypeLegal(RetTy, RetVT))
+ return false;
+
+ // Can't handle non-double multi-reg retvals.
+ if (RetVT != MVT::isVoid && RetVT != MVT::i32) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false));
+ if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+ return false;
+ }
+
+ // Set up the argument vectors.
+ SmallVector<Value*, 8> Args;
+ SmallVector<unsigned, 8> ArgRegs;
+ SmallVector<MVT, 8> ArgVTs;
+ SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+ Args.reserve(I->getNumOperands());
+ ArgRegs.reserve(I->getNumOperands());
+ ArgVTs.reserve(I->getNumOperands());
+ ArgFlags.reserve(I->getNumOperands());
+ for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+ Value *Op = I->getOperand(i);
+ unsigned Arg = getRegForValue(Op);
+ if (Arg == 0) return false;
+
+ Type *ArgTy = Op->getType();
+ MVT ArgVT;
+ if (!isTypeLegal(ArgTy, ArgVT)) return false;
+
+ ISD::ArgFlagsTy Flags;
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Args.push_back(Op);
+ ArgRegs.push_back(Arg);
+ ArgVTs.push_back(ArgVT);
+ ArgFlags.push_back(Flags);
+ }
+
+ // Handle the arguments now that we've gotten them.
+ SmallVector<unsigned, 4> RegArgs;
+ unsigned NumBytes;
+ if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+ RegArgs, CC, NumBytes, false)) // Libcalls are always treated as non-variadic here.
+ return false;
+
+ unsigned CalleeReg = 0;
+ if (Subtarget->genLongCalls()) { // With long calls the callee address is materialized into a register and called indirectly.
+ CalleeReg = getLibcallReg(TLI.getLibcallName(Call));
+ if (CalleeReg == 0) return false;
+ }
+
+ // Issue the call.
+ unsigned CallOpc = ARMSelectCallOp(Subtarget->genLongCalls());
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DbgLoc, TII.get(CallOpc));
+ // BL / BLX don't take a predicate, but tBL / tBLX do.
+ if (isThumb2)
+ AddDefaultPred(MIB);
+ if (Subtarget->genLongCalls())
+ MIB.addReg(CalleeReg);
+ else
+ MIB.addExternalSymbol(TLI.getLibcallName(Call));
+
+ // Add implicit physical register uses to the call.
+ for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+ MIB.addReg(RegArgs[i], RegState::Implicit);
+
+ // Add a register mask with the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+ // Finish off the call including any return values.
+ SmallVector<unsigned, 4> UsedRegs;
+ if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false;
+
+ // Set all unused physreg defs as dead.
+ static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+ return true;
+}
+
+bool ARMFastISel::SelectCall(const Instruction *I, // Selects the call instruction I; returns false for anything outside the simple cases handled below.
+ const char *IntrMemName = nullptr) { // IntrMemName: when non-null, I is a memory intrinsic lowered as a call to the function with this name.
+ const CallInst *CI = cast<CallInst>(I);
+ const Value *Callee = CI->getCalledValue();
+
+ // Can't handle inline asm.
+ if (isa<InlineAsm>(Callee)) return false;
+
+ // Allow SelectionDAG isel to handle tail calls.
+ if (CI->isTailCall()) return false;
+
+ // Check the calling convention.
+ ImmutableCallSite CS(CI);
+ CallingConv::ID CC = CS.getCallingConv();
+
+ // TODO: Avoid some calling conventions?
+
+ FunctionType *FTy = CS.getFunctionType();
+ bool isVarArg = FTy->isVarArg();
+
+ // Handle *simple* calls for now.
+ Type *RetTy = I->getType();
+ MVT RetVT;
+ if (RetTy->isVoidTy())
+ RetVT = MVT::isVoid;
+ else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 && // i16/i8/i1 results are accepted even though isTypeLegal rejects them.
+ RetVT != MVT::i8 && RetVT != MVT::i1)
+ return false;
+
+ // Can't handle non-double multi-reg retvals.
+ if (RetVT != MVT::isVoid && RetVT != MVT::i1 && RetVT != MVT::i8 &&
+ RetVT != MVT::i16 && RetVT != MVT::i32) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+ if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+ return false;
+ }
+
+ // Set up the argument vectors.
+ SmallVector<Value*, 8> Args;
+ SmallVector<unsigned, 8> ArgRegs;
+ SmallVector<MVT, 8> ArgVTs;
+ SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+ unsigned arg_size = CS.arg_size();
+ Args.reserve(arg_size);
+ ArgRegs.reserve(arg_size);
+ ArgVTs.reserve(arg_size);
+ ArgFlags.reserve(arg_size);
+ for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ // If we're lowering a memory intrinsic instead of a regular call, skip the
+ // last two arguments, which shouldn't be passed to the underlying function.
+ if (IntrMemName && e-i <= 2)
+ break;
+
+ ISD::ArgFlagsTy Flags;
+ unsigned AttrInd = i - CS.arg_begin() + 1; // Attribute queries use the argument position plus one.
+ if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ Flags.setSExt();
+ if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ Flags.setZExt();
+
+ // FIXME: Only handle *easy* calls for now.
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+ CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+ CS.paramHasAttr(AttrInd, Attribute::SwiftSelf) ||
+ CS.paramHasAttr(AttrInd, Attribute::SwiftError) ||
+ CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+ CS.paramHasAttr(AttrInd, Attribute::ByVal))
+ return false;
+
+ Type *ArgTy = (*i)->getType();
+ MVT ArgVT;
+ if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 && // As for the return type, i16/i8/i1 arguments are accepted.
+ ArgVT != MVT::i1)
+ return false;
+
+ unsigned Arg = getRegForValue(*i);
+ if (Arg == 0)
+ return false;
+
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Args.push_back(*i);
+ ArgRegs.push_back(Arg);
+ ArgVTs.push_back(ArgVT);
+ ArgFlags.push_back(Flags);
+ }
+
+ // Handle the arguments now that we've gotten them.
+ SmallVector<unsigned, 4> RegArgs;
+ unsigned NumBytes;
+ if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+ RegArgs, CC, NumBytes, isVarArg))
+ return false;
+
+ bool UseReg = false;
+ const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+ if (!GV || Subtarget->genLongCalls()) UseReg = true; // Call through a register when the callee is not a global value or long calls are requested.
+
+ unsigned CalleeReg = 0;
+ if (UseReg) {
+ if (IntrMemName)
+ CalleeReg = getLibcallReg(IntrMemName);
+ else
+ CalleeReg = getRegForValue(Callee);
+
+ if (CalleeReg == 0) return false;
+ }
+
+ // Issue the call.
+ unsigned CallOpc = ARMSelectCallOp(UseReg);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DbgLoc, TII.get(CallOpc));
+
+ // ARM calls don't take a predicate, but tBL / tBLX do.
+ if(isThumb2)
+ AddDefaultPred(MIB);
+ if (UseReg)
+ MIB.addReg(CalleeReg);
+ else if (!IntrMemName)
+ MIB.addGlobalAddress(GV, 0, 0);
+ else
+ MIB.addExternalSymbol(IntrMemName, 0);
+
+ // Add implicit physical register uses to the call.
+ for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+ MIB.addReg(RegArgs[i], RegState::Implicit);
+
+ // Add a register mask with the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+ // Finish off the call including any return values.
+ SmallVector<unsigned, 4> UsedRegs;
+ if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg))
+ return false;
+
+ // Set all unused physreg defs as dead.
+ static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+ return true;
+}
+
+// Reports whether a memcpy of Len bytes is small enough (at most 16 bytes)
+// to be expanded inline.
+bool ARMFastISel::ARMIsMemCpySmall(uint64_t Len) {
+  const uint64_t MaxInlineBytes = 16;
+  return !(Len > MaxInlineBytes);
+}
+
+// Expands a small memcpy of Len bytes from Src to Dest as a sequence of
+// load/store pairs, picking the widest chunk (i32, i16 or i8) permitted by
+// the remaining length and by Alignment. Returns false, emitting nothing,
+// when ARMIsMemCpySmall rejects Len.
+bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
+                                        uint64_t Len, unsigned Alignment) {
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!ARMIsMemCpySmall(Len))
+    return false;
+
+  // An alignment of 0 is treated the same as an alignment of 4 or more.
+  const bool WordAligned = Alignment == 0 || Alignment >= 4;
+
+  while (Len != 0) {
+    MVT VT = MVT::i8;
+    if (WordAligned) {
+      if (Len >= 4) {
+        VT = MVT::i32;
+      } else if (Len >= 2) {
+        VT = MVT::i16;
+      } else {
+        assert (Len == 1 && "Expected a length of 1!");
+      }
+    } else if (Alignment == 2 && Len >= 2) {
+      // Bound based on alignment.
+      VT = MVT::i16;
+    }
+
+    // Copy one chunk through a temporary register.
+    unsigned TmpReg;
+    bool RV = ARMEmitLoad(VT, TmpReg, Src);
+    assert (RV == true && "Should be able to handle this load.");
+    RV = ARMEmitStore(VT, TmpReg, Dest);
+    assert (RV == true && "Should be able to handle this store.");
+    (void)RV;
+
+    // Advance both addresses past the bytes just copied.
+    unsigned ChunkBytes = VT.getSizeInBits()/8;
+    Len -= ChunkBytes;
+    Dest.Offset += ChunkBytes;
+    Src.Offset += ChunkBytes;
+  }
+
+  return true;
+}
+
+bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { // Selects the frameaddress, memcpy, memmove, memset and trap intrinsics; returns false for any other intrinsic or unsupported form.
+ // FIXME: Handle more intrinsics.
+ switch (I.getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::frameaddress: {
+ MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+
+ unsigned LdrOpc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
+ const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass
+ : &ARM::GPRRegClass;
+
+ const ARMBaseRegisterInfo *RegInfo =
+ static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo());
+ unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ unsigned SrcReg = FramePtr;
+
+ // Recursively load frame address
+ // ldr r0 [fp]
+ // ldr r0 [r0]
+ // ldr r0 [r0]
+ // ...
+ unsigned DestReg;
+ unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue(); // Operand 0 is the constant number of frames to walk up.
+ while (Depth--) {
+ DestReg = createResultReg(RC);
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(LdrOpc), DestReg)
+ .addReg(SrcReg).addImm(0));
+ SrcReg = DestReg;
+ }
+ updateValueMap(&I, SrcReg); // With Depth == 0 no load is emitted and the frame register itself is the result.
+ return true;
+ }
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove: {
+ const MemTransferInst &MTI = cast<MemTransferInst>(I);
+ // Don't handle volatile.
+ if (MTI.isVolatile())
+ return false;
+
+ // Disable inlining for memmove before calls to ComputeAddress. Otherwise,
+ // we would emit dead code because we don't currently handle memmoves.
+ bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+ if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+ // Small memcpy's are common enough that we want to do them without a call
+ // if possible.
+ uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+ if (ARMIsMemCpySmall(Len)) {
+ Address Dest, Src;
+ if (!ARMComputeAddress(MTI.getRawDest(), Dest) ||
+ !ARMComputeAddress(MTI.getRawSource(), Src))
+ return false;
+ unsigned Alignment = MTI.getAlignment();
+ if (ARMTryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+ return true;
+ }
+ }
+
+ if (!MTI.getLength()->getType()->isIntegerTy(32)) // The library-call path only accepts a 32-bit length operand.
+ return false;
+
+ if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+ return false;
+
+ const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+ return SelectCall(&I, IntrMemName);
+ }
+ case Intrinsic::memset: {
+ const MemSetInst &MSI = cast<MemSetInst>(I);
+ // Don't handle volatile.
+ if (MSI.isVolatile())
+ return false;
+
+ if (!MSI.getLength()->getType()->isIntegerTy(32)) // Same 32-bit length restriction as memcpy/memmove above.
+ return false;
+
+ if (MSI.getDestAddressSpace() > 255)
+ return false;
+
+ return SelectCall(&I, "memset");
+ }
+ case Intrinsic::trap: {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(
+ Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP)); // The NaCl trap opcode is used when the subtarget asks for it.
+ return true;
+ }
+ }
+}
+
+// Selects an integer truncate from i32/i16/i8 to i16/i8/i1. No instruction
+// is emitted: the result is mapped to the register that already holds the
+// source value. Returns false for other type combinations or when the source
+// cannot be put in a register.
+bool ARMFastISel::SelectTrunc(const Instruction *I) {
+  // The high bits for a type smaller than the register size are assumed to be
+  // undefined.
+  Value *Src = I->getOperand(0);
+
+  EVT FromVT = TLI.getValueType(DL, Src->getType(), true);
+  EVT ToVT = TLI.getValueType(DL, I->getType(), true);
+
+  bool FromOk =
+      FromVT == MVT::i32 || FromVT == MVT::i16 || FromVT == MVT::i8;
+  if (!FromOk)
+    return false;
+
+  bool ToOk = ToVT == MVT::i16 || ToVT == MVT::i8 || ToVT == MVT::i1;
+  if (!ToOk)
+    return false;
+
+  unsigned Reg = getRegForValue(Src);
+  if (Reg == 0)
+    return false;
+
+  // Because the high bits are undefined, a truncate doesn't generate
+  // any code.
+  updateValueMap(I, Reg);
+  return true;
+}
+
+unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ bool isZExt) {
+ if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+ return 0;
+ if (SrcVT != MVT::i16 && SrcVT != MVT::i8 && SrcVT != MVT::i1)
+ return 0;
+
+ // Table of which combinations can be emitted as a single instruction,
+ // and which will require two.
+ static const uint8_t isSingleInstrTbl[3][2][2][2] = {
+ // ARM Thumb
+ // !hasV6Ops hasV6Ops !hasV6Ops hasV6Ops
+ // ext: s z s z s z s z
+ /* 1 */ { { { 0, 1 }, { 0, 1 } }, { { 0, 0 }, { 0, 1 } } },
+ /* 8 */ { { { 0, 1 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } },
+ /* 16 */ { { { 0, 0 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } }
+ };
+
+ // Target registers for:
+ // - For ARM can never be PC.
+ // - For 16-bit Thumb are restricted to lower 8 registers.
+ // - For 32-bit Thumb are restricted to non-SP and non-PC.
+ static const TargetRegisterClass *RCTbl[2][2] = {
+ // Instructions: Two Single
+ /* ARM */ { &ARM::GPRnopcRegClass, &ARM::GPRnopcRegClass },
+ /* Thumb */ { &ARM::tGPRRegClass, &ARM::rGPRRegClass }
+ };
+
+ // Table governing the instruction(s) to be emitted.
+ static const struct InstructionTable {
+ uint32_t Opc : 16;
+ uint32_t hasS : 1; // Some instructions have an S bit, always set it to 0.
+ uint32_t Shift : 7; // For shift operand addressing mode, used by MOVsi.
+ uint32_t Imm : 8; // All instructions have either a shift or a mask.
+ } IT[2][2][3][2] = {
+ { // Two instructions (first is left shift, second is in this table).
+ { // ARM Opc S Shift Imm
+ /* 1 bit sext */ { { ARM::MOVsi , 1, ARM_AM::asr , 31 },
+ /* 1 bit zext */ { ARM::MOVsi , 1, ARM_AM::lsr , 31 } },
+ /* 8 bit sext */ { { ARM::MOVsi , 1, ARM_AM::asr , 24 },
+ /* 8 bit zext */ { ARM::MOVsi , 1, ARM_AM::lsr , 24 } },
+ /* 16 bit sext */ { { ARM::MOVsi , 1, ARM_AM::asr , 16 },
+ /* 16 bit zext */ { ARM::MOVsi , 1, ARM_AM::lsr , 16 } }
+ },
+ { // Thumb Opc S Shift Imm
+ /* 1 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift, 31 },
+ /* 1 bit zext */ { ARM::tLSRri , 0, ARM_AM::no_shift, 31 } },
+ /* 8 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift, 24 },
+ /* 8 bit zext */ { ARM::tLSRri , 0, ARM_AM::no_shift, 24 } },
+ /* 16 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift, 16 },
+ /* 16 bit zext */ { ARM::tLSRri , 0, ARM_AM::no_shift, 16 } }
+ }
+ },
+ { // Single instruction.
+ { // ARM Opc S Shift Imm
+ /* 1 bit sext */ { { ARM::KILL , 0, ARM_AM::no_shift, 0 },
+ /* 1 bit zext */ { ARM::ANDri , 1, ARM_AM::no_shift, 1 } },
+ /* 8 bit sext */ { { ARM::SXTB , 0, ARM_AM::no_shift, 0 },
+ /* 8 bit zext */ { ARM::ANDri , 1, ARM_AM::no_shift, 255 } },
+ /* 16 bit sext */ { { ARM::SXTH , 0, ARM_AM::no_shift, 0 },
+ /* 16 bit zext */ { ARM::UXTH , 0, ARM_AM::no_shift, 0 } }
+ },
+ { // Thumb Opc S Shift Imm
+ /* 1 bit sext */ { { ARM::KILL , 0, ARM_AM::no_shift, 0 },
+ /* 1 bit zext */ { ARM::t2ANDri, 1, ARM_AM::no_shift, 1 } },
+ /* 8 bit sext */ { { ARM::t2SXTB , 0, ARM_AM::no_shift, 0 },
+ /* 8 bit zext */ { ARM::t2ANDri, 1, ARM_AM::no_shift, 255 } },
+ /* 16 bit sext */ { { ARM::t2SXTH , 0, ARM_AM::no_shift, 0 },
+ /* 16 bit zext */ { ARM::t2UXTH , 0, ARM_AM::no_shift, 0 } }
+ }
+ }
+ };
+
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ unsigned DestBits = DestVT.getSizeInBits();
+ (void) DestBits;
+ assert((SrcBits < DestBits) && "can only extend to larger types");
+ assert((DestBits == 32 || DestBits == 16 || DestBits == 8) &&
+ "other sizes unimplemented");
+ assert((SrcBits == 16 || SrcBits == 8 || SrcBits == 1) &&
+ "other sizes unimplemented");
+
+ bool hasV6Ops = Subtarget->hasV6Ops();
+ unsigned Bitness = SrcBits / 8; // {1,8,16}=>{0,1,2}
+ assert((Bitness < 3) && "sanity-check table bounds");
+
+ bool isSingleInstr = isSingleInstrTbl[Bitness][isThumb2][hasV6Ops][isZExt];
+ const TargetRegisterClass *RC = RCTbl[isThumb2][isSingleInstr];
+ const InstructionTable *ITP = &IT[isSingleInstr][isThumb2][Bitness][isZExt];
+ unsigned Opc = ITP->Opc;
+ assert(ARM::KILL != Opc && "Invalid table entry");
+ unsigned hasS = ITP->hasS;
+ ARM_AM::ShiftOpc Shift = (ARM_AM::ShiftOpc) ITP->Shift;
+ assert(((Shift == ARM_AM::no_shift) == (Opc != ARM::MOVsi)) &&
+ "only MOVsi has shift operand addressing mode");
+ unsigned Imm = ITP->Imm;
+
+ // 16-bit Thumb instructions always set CPSR (unless they're in an IT block).
+ bool setsCPSR = &ARM::tGPRRegClass == RC;
+ unsigned LSLOpc = isThumb2 ? ARM::tLSLri : ARM::MOVsi;
+ unsigned ResultReg;
+ // MOVsi encodes shift and immediate in shift operand addressing mode.
+ // The following condition has the same value when emitting two
+ // instruction sequences: both are shifts.
+ bool ImmIsSO = (Shift != ARM_AM::no_shift);
+
+ // Either one or two instructions are emitted.
+ // They're always of the form:
+ // dst = in OP imm
+ // CPSR is set only by 16-bit Thumb instructions.
+ // Predicate, if any, is AL.
+ // S bit, if available, is always 0.
+ // When two are emitted the first's result will feed as the second's input,
+ // that value is then dead.
+ unsigned NumInstrsEmitted = isSingleInstr ? 1 : 2;
+ for (unsigned Instr = 0; Instr != NumInstrsEmitted; ++Instr) {
+ ResultReg = createResultReg(RC);
+ bool isLsl = (0 == Instr) && !isSingleInstr;
+ unsigned Opcode = isLsl ? LSLOpc : Opc;
+ ARM_AM::ShiftOpc ShiftAM = isLsl ? ARM_AM::lsl : Shift;
+ unsigned ImmEnc = ImmIsSO ? ARM_AM::getSORegOpc(ShiftAM, Imm) : Imm;
+ bool isKill = 1 == Instr;
+ MachineInstrBuilder MIB = BuildMI(
+ *FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opcode), ResultReg);
+ if (setsCPSR)
+ MIB.addReg(ARM::CPSR, RegState::Define);
+ SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR);
+ AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc));
+ if (hasS)
+ AddDefaultCC(MIB);
+ // Second instruction consumes the first's result.
+ SrcReg = ResultReg;
+ }
+
+ return ResultReg;
+}
+
+ /// Select a zext/sext instruction. Integer casts on ARM generally involve
+ /// types that are not legal, so this path handles promotable integers and
+ /// leaves the actual emission to ARMEmitIntExt.
+ /// Returns false when the operand has no register, when either type does not
+ /// map to a simple value type, or when ARMEmitIntExt yields no register.
+ bool ARMFastISel::SelectIntExt(const Instruction *I) {
+   Value *Operand = I->getOperand(0);
+   const bool isZExt = isa<ZExtInst>(I);
+
+   // Materialize the operand first; a zero register means it is unavailable.
+   unsigned InReg = getRegForValue(Operand);
+   if (InReg == 0)
+     return false;
+
+   // Both ends of the cast must map onto simple machine value types.
+   const EVT FromEVT = TLI.getValueType(DL, Operand->getType(), true);
+   const EVT ToEVT = TLI.getValueType(DL, I->getType(), true);
+   if (!FromEVT.isSimple() || !ToEVT.isSimple())
+     return false;
+
+   unsigned OutReg = ARMEmitIntExt(FromEVT.getSimpleVT(), InReg,
+                                   ToEVT.getSimpleVT(), isZExt);
+   if (!OutReg)
+     return false;
+
+   updateValueMap(I, OutReg);
+   return true;
+ }
+
+ /// Select an i32 shl/lshr/ashr in ARM mode, emitting MOVsi for a constant
+ /// shift amount and MOVsr for a register amount. \p ShiftTy is the shift kind
+ /// encoded into the shifter operand.
+ /// Returns false (leaving the instruction to another selector) for Thumb2,
+ /// non-i32 results, constant amounts of 0 or >= 32, or unavailable registers.
+ bool ARMFastISel::SelectShift(const Instruction *I,
+                               ARM_AM::ShiftOpc ShiftTy) {
+   // Thumb2 is handled by the target independent selector or SelectionDAG ISel.
+   if (isThumb2)
+     return false;
+
+   // Only i32 results are handled for now.
+   EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+   if (DestVT != MVT::i32)
+     return false;
+
+   // Decide between the immediate and the register form of the shift.
+   Value *AmountV = I->getOperand(1);
+   const ConstantInt *ConstAmount = dyn_cast<ConstantInt>(AmountV);
+   unsigned ShiftImm = 0;
+   if (ConstAmount) {
+     ShiftImm = ConstAmount->getZExtValue();
+
+     // A zero shift, or one at least as wide as the value type, falls back to
+     // selection DAG isel.
+     if (ShiftImm == 0 || ShiftImm >= 32)
+       return false;
+   }
+   const bool UseImmForm = ConstAmount != nullptr;
+   const unsigned Opc = UseImmForm ? ARM::MOVsi : ARM::MOVsr;
+
+   unsigned ValueReg = getRegForValue(I->getOperand(0));
+   if (!ValueReg)
+     return false;
+
+   // The shift amount only needs a register in the MOVsr form.
+   unsigned AmountReg = 0;
+   if (!UseImmForm) {
+     AmountReg = getRegForValue(AmountV);
+     if (!AmountReg)
+       return false;
+   }
+
+   unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+   if (!ResultReg)
+     return false;
+
+   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                     TII.get(Opc), ResultReg)
+                                 .addReg(ValueReg);
+   if (UseImmForm) {
+     MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, ShiftImm));
+   } else {
+     MIB.addReg(AmountReg);
+     MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, 0));
+   }
+
+   AddOptionalDefs(MIB);
+   updateValueMap(I, ResultReg);
+   return true;
+ }
+
+ // TODO: SoftFP support.
+ /// Dispatch \p I to the ARM-specific selector for its IR opcode.
+ /// Returns whatever the chosen selector returns, or false for any opcode that
+ /// has no handler in this switch.
+ bool ARMFastISel::fastSelectInstruction(const Instruction *I) {
+   switch (I->getOpcode()) {
+   // Memory access.
+   case Instruction::Load:
+     return SelectLoad(I);
+   case Instruction::Store:
+     return SelectStore(I);
+
+   // Control flow.
+   case Instruction::Br:
+     return SelectBranch(I);
+   case Instruction::IndirectBr:
+     return SelectIndirectBr(I);
+   case Instruction::Ret:
+     return SelectRet(I);
+   case Instruction::Call:
+     // Intrinsic calls get their own selector; everything else is a plain call.
+     if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+       return SelectIntrinsicCall(*II);
+     return SelectCall(I);
+
+   // Comparisons and selects.
+   case Instruction::ICmp:
+   case Instruction::FCmp:
+     return SelectCmp(I);
+   case Instruction::Select:
+     return SelectSelect(I);
+
+   // Floating-point conversions.
+   case Instruction::FPExt:
+     return SelectFPExt(I);
+   case Instruction::FPTrunc:
+     return SelectFPTrunc(I);
+   case Instruction::SIToFP:
+     return SelectIToFP(I, /*isSigned*/ true);
+   case Instruction::UIToFP:
+     return SelectIToFP(I, /*isSigned*/ false);
+   case Instruction::FPToSI:
+     return SelectFPToI(I, /*isSigned*/ true);
+   case Instruction::FPToUI:
+     return SelectFPToI(I, /*isSigned*/ false);
+
+   // Integer width changes.
+   case Instruction::Trunc:
+     return SelectTrunc(I);
+   case Instruction::ZExt:
+   case Instruction::SExt:
+     return SelectIntExt(I);
+
+   // Integer arithmetic and logic.
+   case Instruction::Add:
+     return SelectBinaryIntOp(I, ISD::ADD);
+   case Instruction::Sub:
+     return SelectBinaryIntOp(I, ISD::SUB);
+   case Instruction::Or:
+     return SelectBinaryIntOp(I, ISD::OR);
+   case Instruction::SDiv:
+     return SelectDiv(I, /*isSigned*/ true);
+   case Instruction::UDiv:
+     return SelectDiv(I, /*isSigned*/ false);
+   case Instruction::SRem:
+     return SelectRem(I, /*isSigned*/ true);
+   case Instruction::URem:
+     return SelectRem(I, /*isSigned*/ false);
+
+   // Shifts.
+   case Instruction::Shl:
+     return SelectShift(I, ARM_AM::lsl);
+   case Instruction::LShr:
+     return SelectShift(I, ARM_AM::lsr);
+   case Instruction::AShr:
+     return SelectShift(I, ARM_AM::asr);
+
+   // Floating-point arithmetic.
+   case Instruction::FAdd:
+     return SelectBinaryFPOp(I, ISD::FADD);
+   case Instruction::FSub:
+     return SelectBinaryFPOp(I, ISD::FSUB);
+   case Instruction::FMul:
+     return SelectBinaryFPOp(I, ISD::FMUL);
+
+   default:
+     break;
+   }
+
+   // No handler for this opcode.
+   return false;
+ }
+
+ namespace {
+ // This table describes the sign- and zero-extend instructions which
+ // tryToFoldLoadIntoMI can fold into a preceding load. Each of these extends
+ // carries an immediate (sometimes a mask and sometimes a shift) that is
+ // applied after extension; an entry only matches when that immediate agrees.
+ const struct FoldableLoadExtendsStruct {
+ uint16_t Opc[2]; // Extend opcode; tryToFoldLoadIntoMI indexes this with isThumb2 ([0] ARM, [1] Thumb).
+ uint8_t ExpectedImm; // Value the extend's immediate operand (operand 2) must hold for a match.
+ uint8_t isZExt : 1; // 1 for zero-extend entries, 0 for sign-extend; forwarded to ARMEmitLoad.
+ uint8_t ExpectedVT : 7; // MVT::SimpleValueType the folded load must produce.
+ } FoldableLoadExtends[] = {
+ { { ARM::SXTH, ARM::t2SXTH }, 0, 0, MVT::i16 },
+ { { ARM::UXTH, ARM::t2UXTH }, 0, 1, MVT::i16 },
+ { { ARM::ANDri, ARM::t2ANDri }, 255, 1, MVT::i8 },
+ { { ARM::SXTB, ARM::t2SXTB }, 0, 0, MVT::i8 },
+ { { ARM::UXTB, ARM::t2UXTB }, 0, 1, MVT::i8 }
+ };
+ } // end anonymous namespace
+
+ /// \brief Try to fold the load \p LI into \p MI, an instruction that consumes
+ /// the vreg produced by that load.
+ ///
+ /// \p MI must be one of the extends listed in FoldableLoadExtends. On a match
+ /// the load is re-emitted straight into the extend's destination register and
+ /// \p MI is erased. \p OpNo is not consulted by this implementation.
+ /// \returns true if the fold was performed.
+ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                       const LoadInst *LI) {
+   // Nothing can be folded unless the loaded type is legal.
+   MVT VT;
+   if (!isLoadTypeLegal(LI->getType(), VT))
+     return false;
+
+   // The pattern being combined is a load followed by a zero- or sign-extend:
+   //   ldrb r1, [r0]        ldrb r1, [r0]
+   //   uxtb r2, r1     =>
+   //   mov  r3, r2          mov  r3, r1
+   // Every foldable extend carries its immediate as operand 2.
+   const bool HasImmOperand =
+       MI->getNumOperands() >= 3 && MI->getOperand(2).isImm();
+   if (!HasImmOperand)
+     return false;
+   const uint64_t Imm = MI->getOperand(2).getImm();
+
+   // Scan the whole table; the last matching entry (if any) wins.
+   const FoldableLoadExtendsStruct *Match = nullptr;
+   for (const FoldableLoadExtendsStruct &Entry : FoldableLoadExtends) {
+     const bool SameOpcode = Entry.Opc[isThumb2] == MI->getOpcode();
+     const bool SameImm = (uint64_t)Entry.ExpectedImm == Imm;
+     const bool SameVT =
+         MVT((MVT::SimpleValueType)Entry.ExpectedVT) == VT;
+     if (SameOpcode && SameImm && SameVT)
+       Match = &Entry;
+   }
+   if (!Match)
+     return false;
+   const bool isZExt = Match->isZExt;
+
+   // See if we can handle this address.
+   Address Addr;
+   if (!ARMComputeAddress(LI->getOperand(0), Addr))
+     return false;
+
+   // Load directly into the register the extend used to define, then drop the
+   // now-redundant extend.
+   unsigned ResultReg = MI->getOperand(0).getReg();
+   const bool Emitted =
+       ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false);
+   if (!Emitted)
+     return false;
+
+   MI->eraseFromParent();
+   return true;
+ }
+
+ /// Materialize the address of \p GV for PIC ELF code and return the register
+ /// holding it.
+ ///
+ /// A PC-relative constant-pool entry is loaded and then fixed up with a
+ /// PICADD/PICLDR/tPICADD pseudo. When the global is not assumed DSO-local the
+ /// entry uses the GOT_PREL modifier, and in Thumb an extra t2LDRi12 is emitted
+ /// after the fix-up. \p Align is not referenced by this implementation.
+ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
+                                      unsigned Align, MVT VT) {
+   const bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
+   const bool IsThumb = Subtarget->isThumb();
+
+   // Describe the global as a PC-relative constant-pool value.
+   LLVMContext *Context = &MF->getFunction()->getContext();
+   unsigned PICLabelId = AFI->createPICLabelUId();
+   unsigned PCAdj = IsThumb ? 4 : 8;
+   ARMConstantPoolValue *PoolValue = ARMConstantPoolConstant::Create(
+       GV, PICLabelId, ARMCP::CPValue, PCAdj,
+       UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
+       /*AddCurrentAddress=*/UseGOT_PREL);
+
+   unsigned ConstAlign =
+       MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
+   unsigned PoolIdx =
+       MF->getConstantPool()->getConstantPoolIndex(PoolValue, ConstAlign);
+
+   // Load the constant-pool entry into a scratch register.
+   unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
+   const unsigned LoadOpc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp;
+   MachineInstrBuilder LoadMI =
+       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LoadOpc),
+               TempReg)
+           .addConstantPoolIndex(PoolIdx);
+   if (LoadOpc == ARM::LDRcp)
+     LoadMI.addImm(0);
+   AddDefaultPred(LoadMI);
+
+   // Fix the address by adding pc.
+   unsigned FixupOpc;
+   if (IsThumb)
+     FixupOpc = ARM::tPICADD;
+   else if (UseGOT_PREL)
+     FixupOpc = ARM::PICLDR;
+   else
+     FixupOpc = ARM::PICADD;
+
+   unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+   DestReg = constrainOperandRegClass(TII.get(FixupOpc), DestReg, 0);
+   MachineInstrBuilder FixupMI =
+       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(FixupOpc),
+               DestReg)
+           .addReg(TempReg)
+           .addImm(PICLabelId);
+   // Only the ARM-mode fix-up pseudos get the default predicate here.
+   if (!IsThumb)
+     AddDefaultPred(FixupMI);
+
+   // In Thumb with GOT_PREL, follow the fix-up with a t2LDRi12 at offset 0
+   // from the computed address.
+   if (UseGOT_PREL && IsThumb) {
+     unsigned LoadedReg = createResultReg(TLI.getRegClassFor(VT));
+     MachineInstrBuilder GotLoadMI =
+         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                 TII.get(ARM::t2LDRi12), LoadedReg)
+             .addReg(DestReg)
+             .addImm(0);
+     DestReg = LoadedReg;
+     AddOptionalDefs(GotLoadMI);
+   }
+   return DestReg;
+ }
+
+ /// Lower the formal arguments of the current function when they fit the
+ /// simple case this selector supports: at most four i8/i16/i32 scalars, with
+ /// none of the special parameter attributes, arriving in r0 - r3.
+ /// Returns false, without emitting anything, when any precondition fails.
+ bool ARMFastISel::fastLowerArguments() {
+   if (!FuncInfo.CanLowerReturn)
+     return false;
+
+   const Function *F = FuncInfo.Fn;
+   if (F->isVarArg())
+     return false;
+
+   // Only a fixed set of calling conventions is supported.
+   switch (F->getCallingConv()) {
+   case CallingConv::Fast:
+   case CallingConv::C:
+   case CallingConv::ARM_AAPCS_VFP:
+   case CallingConv::ARM_AAPCS:
+   case CallingConv::ARM_APCS:
+   case CallingConv::Swift:
+     break;
+   default:
+     return false;
+   }
+
+   // Parameter attributes that rule out the fast path.
+   static const Attribute::AttrKind UnsupportedAttrs[] = {
+     Attribute::InReg, Attribute::StructRet, Attribute::SwiftSelf,
+     Attribute::SwiftError, Attribute::ByVal
+   };
+
+   // Only handle simple cases. i.e. Up to 4 i8/i16/i32 scalar arguments
+   // which are passed in r0 - r3. Attribute indices are 1-based here.
+   unsigned AttrIdx = 1;
+   for (Function::const_arg_iterator ArgIt = F->arg_begin(),
+                                     ArgEnd = F->arg_end();
+        ArgIt != ArgEnd; ++ArgIt, ++AttrIdx) {
+     if (AttrIdx > 4)
+       return false;
+
+     for (Attribute::AttrKind Kind : UnsupportedAttrs)
+       if (F->getAttributes().hasAttribute(AttrIdx, Kind))
+         return false;
+
+     Type *ArgTy = ArgIt->getType();
+     if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+       return false;
+
+     EVT ArgVT = TLI.getValueType(DL, ArgTy);
+     if (!ArgVT.isSimple())
+       return false;
+     const MVT::SimpleValueType SimpleTy = ArgVT.getSimpleVT().SimpleTy;
+     if (SimpleTy != MVT::i8 && SimpleTy != MVT::i16 && SimpleTy != MVT::i32)
+       return false;
+   }
+
+   static const MCPhysReg GPRArgRegs[] = {
+     ARM::R0, ARM::R1, ARM::R2, ARM::R3
+   };
+
+   // Every argument passed the checks: mark its register live-in and copy it
+   // into a fresh virtual register.
+   const TargetRegisterClass *RC = &ARM::rGPRRegClass;
+   unsigned ArgNo = 0;
+   for (const Argument &Arg : F->args()) {
+     unsigned PhysReg = GPRArgRegs[ArgNo++];
+     unsigned LiveInReg = FuncInfo.MF->addLiveIn(PhysReg, RC);
+     // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+     // Without this, EmitLiveInCopies may eliminate the livein if its only
+     // use is a bitcast (which isn't turned into an instruction).
+     unsigned ResultReg = createResultReg(RC);
+     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+             TII.get(TargetOpcode::COPY), ResultReg)
+         .addReg(LiveInReg, getKillRegState(true));
+     updateValueMap(&Arg, ResultReg);
+   }
+
+   return true;
+ }
+
+ namespace llvm {
+   /// Factory for the ARM fast instruction selector. Returns a newly allocated
+   /// ARMFastISel when the function's subtarget reports useFastISel(), and
+   /// nullptr otherwise.
+   FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
+                                 const TargetLibraryInfo *libInfo) {
+     const ARMSubtarget &ST = funcInfo.MF->getSubtarget<ARMSubtarget>();
+     if (!ST.useFastISel())
+       return nullptr;
+
+     return new ARMFastISel(funcInfo, libInfo);
+   }
+ }
diff --git a/contrib/llvm/lib/Target/ARM/ARMFeatures.h b/contrib/llvm/lib/Target/ARM/ARMFeatures.h
new file mode 100644
index 000000000000..0c910ab6130f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFeatures.h
@@ -0,0 +1,97 @@
+//===-- ARMFeatures.h - Checks for ARM instruction features -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the code shared between ARM CodeGen and ARM MC
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMFEATURES_H
+#define LLVM_LIB_TARGET_ARM_ARMFEATURES_H
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+
+namespace llvm {
+
+ template<typename InstrType> // could be MachineInstr or MCInst; only declared in this header, no definition here
+ bool IsCPSRDead(InstrType *Instr); // consulted below for the opcodes that set CPSR outside an IT block
+
+ template<typename InstrType> // could be MachineInstr or MCInst
+ inline bool isV8EligibleForIT(InstrType *Instr) { // NOTE(review): presumably "may Instr sit in an IT block under the ARMv8 rules" — confirm against the ARM ARM
+ switch (Instr->getOpcode()) {
+ default: // any opcode not listed below is rejected
+ return false;
+ case ARM::tADC:
+ case ARM::tADDi3:
+ case ARM::tADDi8:
+ case ARM::tADDrr:
+ case ARM::tAND:
+ case ARM::tASRri:
+ case ARM::tASRrr:
+ case ARM::tBIC:
+ case ARM::tEOR:
+ case ARM::tLSLri:
+ case ARM::tLSLrr:
+ case ARM::tLSRri:
+ case ARM::tLSRrr:
+ case ARM::tMOVi8:
+ case ARM::tMUL:
+ case ARM::tMVN:
+ case ARM::tORR:
+ case ARM::tROR:
+ case ARM::tRSB:
+ case ARM::tSBC:
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ case ARM::tSUBrr:
+ // Outside of an IT block, these set CPSR.
+ return IsCPSRDead(Instr); // so the answer for this group is whatever IsCPSRDead reports
+ case ARM::tADDrSPi:
+ case ARM::tCMNz:
+ case ARM::tCMPi8:
+ case ARM::tCMPr:
+ case ARM::tLDRBi:
+ case ARM::tLDRBr:
+ case ARM::tLDRHi:
+ case ARM::tLDRHr:
+ case ARM::tLDRSB:
+ case ARM::tLDRSH:
+ case ARM::tLDRi:
+ case ARM::tLDRr:
+ case ARM::tLDRspi:
+ case ARM::tSTRBi:
+ case ARM::tSTRBr:
+ case ARM::tSTRHi:
+ case ARM::tSTRHr:
+ case ARM::tSTRi:
+ case ARM::tSTRr:
+ case ARM::tSTRspi:
+ case ARM::tTST:
+ return true; // this group is accepted unconditionally
+ // The remaining opcodes are "conditionally deprecated": each is accepted only when the register operands checked below are not PC.
+ case ARM::tADDspr:
+ case ARM::tBLXr:
+ return Instr->getOperand(2).getReg() != ARM::PC; // operand 2 must not be PC
+ // ADD PC, SP and BLX PC were always unpredictable,
+ // now on top of it they're deprecated
+ case ARM::tADDrSP:
+ case ARM::tBX:
+ return Instr->getOperand(0).getReg() != ARM::PC; // operand 0 must not be PC
+ case ARM::tADDhirr:
+ return Instr->getOperand(0).getReg() != ARM::PC && // neither operand 0 ...
+ Instr->getOperand(2).getReg() != ARM::PC; // ... nor operand 2 may be PC
+ case ARM::tCMPhir:
+ case ARM::tMOVr:
+ return Instr->getOperand(0).getReg() != ARM::PC && // neither operand 0 ...
+ Instr->getOperand(1).getReg() != ARM::PC; // ... nor operand 1 may be PC
+ }
+ }
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
new file mode 100644
index 000000000000..c72db8aca108
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -0,0 +1,2321 @@
+//===-- ARMFrameLowering.cpp - ARM Frame Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMFrameLowering.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define DEBUG_TYPE "arm-frame-lowering"
+
+using namespace llvm;
+
+ static cl::opt<bool>
+ SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true),
+ cl::desc("Align ARM NEON spills in prolog and epilog")); // hidden command-line flag, initialised to true
+
+ static MachineBasicBlock::iterator
+ skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
+ unsigned NumAlignedDPRCS2Regs); // forward declaration: emitPrologue calls this to advance past the aligned DPRCS2 spill code
+
+ ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
+ : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), // stack grows down, aligned per the subtarget; NOTE(review): 0 and 4 look like the local-area offset and transient stack alignment — confirm against TargetFrameLowering
+ STI(sti) {} // remember the subtarget; the rest of this file queries it through STI
+
+ /// Report whether frame pointer elimination must be suppressed for \p MF:
+ /// either the generic TargetFrameLowering check says so, or the subtarget is
+ /// using FastISel.
+ bool ARMFrameLowering::noFramePointerElim(const MachineFunction &MF) const {
+   if (TargetFrameLowering::noFramePointerElim(MF))
+     return true;
+
+   // iOS always has a FP for backtracking, force other targets to keep their FP
+   // when doing FastISel. The emitted code is currently superior, and in cases
+   // like test-suite's lencod FastISel isn't quite correct when FP is
+   // eliminated.
+   return MF.getSubtarget<ARMSubtarget>().useFastISel();
+ }
+
+ /// hasFP - Return true if the specified function should have a dedicated
+ /// frame pointer register. That is the case when frame pointer elimination is
+ /// disabled, when the stack needs realignment, when the function has variable
+ /// sized objects, or when its frame address is taken.
+ bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
+   // ABI-required frame pointer.
+   if (MF.getTarget().Options.DisableFramePointerElim(MF))
+     return true;
+
+   // Otherwise a frame pointer is only needed for use within this function.
+   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+   if (RegInfo->needsStackRealignment(MF))
+     return true;
+
+   const MachineFrameInfo &MFI = MF.getFrameInfo();
+   return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken();
+ }
+
+ /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+ /// not required, argument space for call sites is reserved immediately on
+ /// entry to the current function, which removes the need for add/sub sp
+ /// brackets around call sites. Returns true if the call frame is included as
+ /// part of the stack frame.
+ bool ARMFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+   const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+   // It's not always a good idea to include the call frame as part of the
+   // stack frame. ARM (especially Thumb) has small immediate offset to
+   // address the stack frame. So a large call frame can cause poor codegen
+   // and may even make it impossible to scavenge a register.
+   const unsigned HalfImm12 = ((1 << 12) - 1) / 2; // Half of imm12
+   const unsigned CallFrameSize = MFI.getMaxCallFrameSize();
+   const bool CallFrameIsSmall = CallFrameSize < HalfImm12;
+
+   return CallFrameIsSmall && !MFI.hasVarSizedObjects();
+ }
+
+ /// canSimplifyCallFramePseudos - The call frame pseudos can be simplified
+ /// when there is a reserved call frame or when the function has variable
+ /// sized objects. Unlike most targets, having a FP is not sufficient here
+ /// since some objects may still be referenced via SP even when FP is
+ /// available in Thumb2 mode.
+ bool
+ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+   if (hasReservedCallFrame(MF))
+     return true;
+
+   return MF.getFrameInfo().hasVarSizedObjects();
+ }
+
+ /// Return true if \p MI restores callee-saved registers listed in \p CSRegs.
+ ///
+ /// Two shapes are recognised: a pop whose operands from index 5 onward are all
+ /// callee-saved registers, and a post-indexed load (LDR_POST_IMM,
+ /// LDR_POST_REG, t2LDR_POST) of a callee-saved register whose operand 1 is SP.
+ /// \p TII is not referenced by this implementation.
+ static bool isCSRestore(MachineInstr &MI, const ARMBaseInstrInfo &TII,
+                         const MCPhysReg *CSRegs) {
+   // Integer spill area is handled with "pop".
+   if (isPopOpcode(MI.getOpcode())) {
+     // Skip the leading operands and require every remaining one to be a
+     // callee-saved register.
+     for (int OpIdx = 5, NumOps = MI.getNumOperands(); OpIdx != NumOps;
+          ++OpIdx) {
+       const unsigned Reg = MI.getOperand(OpIdx).getReg();
+       if (!isCalleeSavedRegister(Reg, CSRegs))
+         return false;
+     }
+     return true;
+   }
+
+   switch (MI.getOpcode()) {
+   case ARM::LDR_POST_IMM:
+   case ARM::LDR_POST_REG:
+   case ARM::t2LDR_POST:
+     // A single register restored through a post-indexed load off SP.
+     return isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs) &&
+            MI.getOperand(1).getReg() == ARM::SP;
+   default:
+     return false;
+   }
+ }
+
+ /// Emit code computing DestReg = SrcReg + NumBytes before \p MBBI, forwarding
+ /// to the ARM emitter when \p isARM is set and to the Thumb2 emitter
+ /// otherwise. The predicate and instruction flags are passed through
+ /// unchanged.
+ static void emitRegPlusImmediate(
+     bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+     const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
+     unsigned SrcReg, int NumBytes, unsigned MIFlags = MachineInstr::NoFlags,
+     ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+   if (!isARM) {
+     emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
+                            Pred, PredReg, TII, MIFlags);
+     return;
+   }
+
+   emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
+                           Pred, PredReg, TII, MIFlags);
+ }
+
+ /// Emit code adjusting the stack pointer by \p NumBytes before \p MBBI. This
+ /// is emitRegPlusImmediate with SP as both the source and the destination.
+ static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MBBI, const DebugLoc &dl,
+                          const ARMBaseInstrInfo &TII, int NumBytes,
+                          unsigned MIFlags = MachineInstr::NoFlags,
+                          ARMCC::CondCodes Pred = ARMCC::AL,
+                          unsigned PredReg = 0) {
+   const unsigned StackPtr = ARM::SP;
+   emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII,
+                        /*DestReg=*/StackPtr, /*SrcReg=*/StackPtr, NumBytes,
+                        MIFlags, Pred, PredReg);
+ }
+
+ /// Return the number of bytes by which the push-like instruction \p MI moves
+ /// SP. The single-register pre-indexed stores always account for 4 bytes; the
+ /// store-multiple forms account for one slot (4 or 8 bytes) per operand from
+ /// index 4 onward. Any other opcode is a programming error.
+ static int sizeOfSPAdjustment(const MachineInstr &MI) {
+   int BytesPerReg = 0;
+   switch (MI.getOpcode()) {
+   case ARM::t2STR_PRE:
+   case ARM::STR_PRE_IMM:
+     return 4;
+   case ARM::STMDB_UPD:
+   case ARM::t2STMDB_UPD:
+     BytesPerReg = 4;
+     break;
+   case ARM::VSTMDDB_UPD:
+     BytesPerReg = 8;
+     break;
+   default:
+     llvm_unreachable("Unknown push or pop like instruction");
+   }
+
+   // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+   // pred) so the register list starts at operand 4.
+   const int NumOps = MI.getNumOperands();
+   const int NumListRegs = NumOps > 4 ? NumOps - 4 : 0;
+   return NumListRegs * BytesPerReg;
+ }
+
+ /// Return true if a frame of \p StackSizeInBytes reaches the stack probe
+ /// threshold for \p MF. The threshold is 4096 bytes, or 4080 when the frame
+ /// has a stack protector index greater than zero, and is replaced by the
+ /// function's "stack-probe-size" attribute when that attribute is present
+ /// and parses as an integer.
+ static bool WindowsRequiresStackProbe(const MachineFunction &MF,
+                                       size_t StackSizeInBytes) {
+   unsigned StackProbeSize = 4096;
+   if (MF.getFrameInfo().getStackProtectorIndex() > 0)
+     StackProbeSize = 4080;
+
+   const Function *F = MF.getFunction();
+   const char *const AttrName = "stack-probe-size";
+   if (F->hasFnAttribute(AttrName)) {
+     // The parse result is deliberately ignored, as before.
+     F->getFnAttribute(AttrName)
+         .getValueAsString()
+         .getAsInteger(0, StackProbeSize);
+   }
+
+   return StackSizeInBytes >= StackProbeSize;
+ }
+
+ namespace {
+ /// Records the prologue instructions that adjust SP, together with the size
+ /// of each adjustment, so that matching .cfi_def_cfa_offset directives can be
+ /// emitted after them in one pass.
+ struct StackAdjustingInsts {
+   /// One SP-adjusting instruction.
+   struct InstInfo {
+     MachineBasicBlock::iterator I; // the adjusting instruction
+     unsigned SPAdjust;             // bytes this instruction moves SP by
+     bool BeforeFPSet;              // flag supplied by the caller of addInst
+   };
+
+   SmallVector<InstInfo, 4> Insts;
+
+   /// Append a record for instruction \p I.
+   void addInst(MachineBasicBlock::iterator I, unsigned SPAdjust,
+                bool BeforeFPSet = false) {
+     Insts.push_back(InstInfo{I, SPAdjust, BeforeFPSet});
+   }
+
+   /// Add \p ExtraBytes to the adjustment already recorded for \p I, which
+   /// must have been registered through addInst.
+   void addExtraBytes(const MachineBasicBlock::iterator I, unsigned ExtraBytes) {
+     auto IsSameInst = [&](InstInfo &Entry) { return Entry.I == I; };
+     auto Found = find_if(Insts, IsSameInst);
+     assert(Found != Insts.end() && "invalid sp adjusting instruction");
+     Found->SPAdjust += ExtraBytes;
+   }
+
+   /// Emit a def_cfa_offset CFI instruction right after each recorded
+   /// instruction, accumulating the adjustments in order. With \p HasFP set,
+   /// emission stops at the first record whose BeforeFPSet flag is clear.
+   void emitDefCFAOffsets(MachineBasicBlock &MBB, const DebugLoc &dl,
+                          const ARMBaseInstrInfo &TII, bool HasFP) {
+     MachineFunction &MF = *MBB.getParent();
+     unsigned CFAOffset = 0;
+     for (const InstInfo &Entry : Insts) {
+       const bool AfterFPSet = !Entry.BeforeFPSet;
+       if (HasFP && AfterFPSet)
+         return;
+
+       CFAOffset -= Entry.SPAdjust;
+       const unsigned CFIIndex = MF.addFrameInst(
+           MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+       BuildMI(MBB, std::next(Entry.I), dl,
+               TII.get(TargetOpcode::CFI_INSTRUCTION))
+           .addCFIIndex(CFIIndex)
+           .setMIFlags(MachineInstr::FrameSetup);
+     }
+   }
+ };
+ }
+
+ /// Emit an instruction sequence that aligns the address held in register
+ /// \p Reg to \p Alignment by zeroing its low bits. On versions of the
+ /// architecture that support Neon this must be a single instruction, because
+ /// skipAlignedDPRCS2Spills assumes so; that function only gets called when
+ /// optimizing spilling of D registers on a core with the Neon instruction set
+ /// present. Callers state that requirement through
+ /// \p MustBeSingleInstruction. Thumb1-only functions are not supported.
+ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
+                                      const TargetInstrInfo &TII,
+                                      MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI,
+                                      const DebugLoc &DL, const unsigned Reg,
+                                      const unsigned Alignment,
+                                      const bool MustBeSingleInstruction) {
+   const ARMSubtarget &AST =
+       static_cast<const ARMSubtarget &>(MF.getSubtarget());
+   const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops();
+   const unsigned AlignMask = Alignment - 1;
+   const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+   assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported");
+
+   if (AFI->isThumbFunction()) {
+     // Since this is only reached for Thumb-2 targets, the BFC instruction
+     // should always be available.
+     assert(CanUseBFC);
+     AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
+                        .addReg(Reg, RegState::Kill)
+                        .addImm(~AlignMask));
+     return;
+   }
+
+   // ARM mode, preferred form: clear the low bits with a single BFC
+   //   bfc Reg, #0, log2(Alignment)
+   if (CanUseBFC) {
+     AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
+                        .addReg(Reg, RegState::Kill)
+                        .addImm(~AlignMask));
+     return;
+   }
+
+   // Without BFC, use BIC when the mask fits the bic immediate field
+   //   bic Reg, Reg, Alignment-1
+   if (AlignMask <= 255) {
+     AddDefaultCC(
+         AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
+                            .addReg(Reg, RegState::Kill)
+                            .addImm(AlignMask)));
+     return;
+   }
+
+   // Last resort: shift the low bits out and back in
+   //   lsr Reg, Reg, log2(Alignment)
+   //   lsl Reg, Reg, log2(Alignment)
+   assert(!MustBeSingleInstruction &&
+          "Shouldn't call emitAligningInstructions demanding a single "
+          "instruction to be emitted for large stack alignment for a target "
+          "without BFC.");
+   const unsigned ShiftRightOp = ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero);
+   AddDefaultCC(AddDefaultPred(
+       BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+           .addReg(Reg, RegState::Kill)
+           .addImm(ShiftRightOp)));
+   const unsigned ShiftLeftOp = ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero);
+   AddDefaultCC(AddDefaultPred(
+       BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+           .addReg(Reg, RegState::Kill)
+           .addImm(ShiftLeftOp)));
+ }
+
+void ARMFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ MachineModuleInfo &MMI = MF.getMMI();
+ MCContext &Context = MMI.getContext();
+ const TargetMachine &TM = MF.getTarget();
+ const MCRegisterInfo *MRI = Context.getRegisterInfo();
+ const ARMBaseRegisterInfo *RegInfo = STI.getRegisterInfo();
+ const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This emitPrologue does not support Thumb1!");
+ bool isARM = !AFI->isThumbFunction();
+ unsigned Align = STI.getFrameLowering()->getStackAlignment();
+ unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ unsigned NumBytes = MFI.getStackSize();
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc dl;
+
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+ // Determine the sizes of each callee-save spill areas and record which frame
+ // belongs to which callee-save spill areas.
+ unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+ int FramePtrSpillFI = 0;
+ int D8SpillFI = 0;
+
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
+ StackAdjustingInsts DefCFAOffsetCandidates;
+ bool HasFP = hasFP(MF);
+
+ // Allocate the vararg register save area.
+ if (ArgRegsSaveSize) {
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize,
+ MachineInstr::FrameSetup);
+ DefCFAOffsetCandidates.addInst(std::prev(MBBI), ArgRegsSaveSize, true);
+ }
+
+ if (!AFI->hasStackFrame() &&
+ (!STI.isTargetWindows() || !WindowsRequiresStackProbe(MF, NumBytes))) {
+ if (NumBytes - ArgRegsSaveSize != 0) {
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize),
+ MachineInstr::FrameSetup);
+ DefCFAOffsetCandidates.addInst(std::prev(MBBI),
+ NumBytes - ArgRegsSaveSize, true);
+ }
+ DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP);
+ return;
+ }
+
+ // Determine spill area sizes.
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ int FI = CSI[i].getFrameIdx();
+ switch (Reg) {
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ case ARM::R12:
+ if (STI.splitFramePushPop(MF)) {
+ GPRCS2Size += 4;
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case ARM::R0:
+ case ARM::R1:
+ case ARM::R2:
+ case ARM::R3:
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ if (Reg == FramePtr)
+ FramePtrSpillFI = FI;
+ GPRCS1Size += 4;
+ break;
+ default:
+ // This is a DPR. Exclude the aligned DPRCS2 spills.
+ if (Reg == ARM::D8)
+ D8SpillFI = FI;
+ if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())
+ DPRCSSize += 8;
+ }
+ }
+
+ // Move past area 1.
+ MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push;
+ if (GPRCS1Size > 0) {
+ GPRCS1Push = LastPush = MBBI++;
+ DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true);
+ }
+
+ // Determine starting offsets of spill areas.
+ unsigned GPRCS1Offset = NumBytes - ArgRegsSaveSize - GPRCS1Size;
+ unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size;
+ unsigned DPRAlign = DPRCSSize ? std::min(8U, Align) : 4U;
+ unsigned DPRGapSize = (GPRCS1Size + GPRCS2Size + ArgRegsSaveSize) % DPRAlign;
+ unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize;
+ int FramePtrOffsetInPush = 0;
+ if (HasFP) {
+ FramePtrOffsetInPush =
+ MFI.getObjectOffset(FramePtrSpillFI) + ArgRegsSaveSize;
+ AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) +
+ NumBytes);
+ }
+ AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+ AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+ AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+ // Move past area 2.
+ if (GPRCS2Size > 0) {
+ GPRCS2Push = LastPush = MBBI++;
+ DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size);
+ }
+
+ // Prolog/epilog inserter assumes we correctly align DPRs on the stack, so our
+ // .cfi_offset operations will reflect that.
+ if (DPRGapSize) {
+ assert(DPRGapSize == 4 && "unexpected alignment requirements for DPRs");
+ if (LastPush != MBB.end() &&
+ tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, DPRGapSize))
+ DefCFAOffsetCandidates.addExtraBytes(LastPush, DPRGapSize);
+ else {
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize,
+ MachineInstr::FrameSetup);
+ DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize);
+ }
+ }
+
+ // Move past area 3.
+ if (DPRCSSize > 0) {
+ // Since vpush register list cannot have gaps, there may be multiple vpush
+ // instructions in the prologue.
+ while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) {
+ DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI));
+ LastPush = MBBI++;
+ }
+ }
+
+ // Move past the aligned DPRCS2 area.
+ if (AFI->getNumAlignedDPRCS2Regs() > 0) {
+ MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs());
+ // The code inserted by emitAlignedDPRCS2Spills realigns the stack, and
+ // leaves the stack pointer pointing to the DPRCS2 area.
+ //
+ // Adjust NumBytes to represent the stack slots below the DPRCS2 area.
+ NumBytes += MFI.getObjectOffset(D8SpillFI);
+ } else
+ NumBytes = DPRCSOffset;
+
+ if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) {
+ uint32_t NumWords = NumBytes >> 2;
+
+ if (NumWords < 65536)
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4)
+ .addImm(NumWords)
+ .setMIFlags(MachineInstr::FrameSetup));
+ else
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4)
+ .addImm(NumWords)
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ switch (TM.getCodeModel()) {
+ case CodeModel::Small:
+ case CodeModel::Medium:
+ case CodeModel::Default:
+ case CodeModel::Kernel:
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL))
+ .addImm((unsigned)ARMCC::AL).addReg(0)
+ .addExternalSymbol("__chkstk")
+ .addReg(ARM::R4, RegState::Implicit)
+ .setMIFlags(MachineInstr::FrameSetup);
+ break;
+ case CodeModel::Large:
+ case CodeModel::JITDefault:
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R12)
+ .addExternalSymbol("__chkstk")
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr))
+ .addImm((unsigned)ARMCC::AL).addReg(0)
+ .addReg(ARM::R12, RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit)
+ .setMIFlags(MachineInstr::FrameSetup);
+ break;
+ }
+
+ AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr),
+ ARM::SP)
+ .addReg(ARM::SP, RegState::Kill)
+ .addReg(ARM::R4, RegState::Kill)
+ .setMIFlags(MachineInstr::FrameSetup)));
+ NumBytes = 0;
+ }
+
+ if (NumBytes) {
+ // Adjust SP after all the callee-save spills.
+ if (AFI->getNumAlignedDPRCS2Regs() == 0 &&
+ tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, NumBytes))
+ DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes);
+ else {
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
+ MachineInstr::FrameSetup);
+ DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes);
+ }
+
+ if (HasFP && isARM)
+ // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
+ // Note it's not safe to do this in Thumb2 mode because it would have
+ // taken two instructions:
+ // mov sp, r7
+ // sub sp, #24
+ // If an interrupt is taken between the two instructions, then sp is in
+ // an inconsistent state (pointing to the middle of callee-saved area).
+ // The interrupt handler can end up clobbering the registers.
+ AFI->setShouldRestoreSPFromFP(true);
+ }
+
+ // Set FP to point to the stack slot that contains the previous FP.
+ // For iOS, FP is R7, which has now been stored in spill area 1.
+ // Otherwise, if this is not iOS, all the callee-saved registers go
+ // into spill area 1, including the FP in R11. In either case, it
+ // is in area one and the adjustment needs to take place just after
+ // that push.
+ if (HasFP) {
+ MachineBasicBlock::iterator AfterPush = std::next(GPRCS1Push);
+ unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push);
+ emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush,
+ dl, TII, FramePtr, ARM::SP,
+ PushSize + FramePtrOffsetInPush,
+ MachineInstr::FrameSetup);
+ if (FramePtrOffsetInPush + PushSize != 0) {
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, MRI->getDwarfRegNum(FramePtr, true),
+ -(ArgRegsSaveSize - FramePtrOffsetInPush)));
+ BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ } else {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+ BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ }
+
+ // Now that the prologue's actual instructions are finalised, we can insert
+ // the necessary DWARF cf instructions to describe the situation. Start by
+ // recording where each register ended up:
+ if (GPRCS1Size > 0) {
+ MachineBasicBlock::iterator Pos = std::next(GPRCS1Push);
+ int CFIIndex;
+ for (const auto &Entry : CSI) {
+ unsigned Reg = Entry.getReg();
+ int FI = Entry.getFrameIdx();
+ switch (Reg) {
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ case ARM::R12:
+ if (STI.splitFramePushPop(MF))
+ break;
+ LLVM_FALLTHROUGH;
+ case ARM::R0:
+ case ARM::R1:
+ case ARM::R2:
+ case ARM::R3:
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI)));
+ BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ break;
+ }
+ }
+ }
+
+ if (GPRCS2Size > 0) {
+ MachineBasicBlock::iterator Pos = std::next(GPRCS2Push);
+ for (const auto &Entry : CSI) {
+ unsigned Reg = Entry.getReg();
+ int FI = Entry.getFrameIdx();
+ switch (Reg) {
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ case ARM::R12:
+ if (STI.splitFramePushPop(MF)) {
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned Offset = MFI.getObjectOffset(FI);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ break;
+ }
+ }
+ }
+
+ if (DPRCSSize > 0) {
+ // Since vpush register list cannot have gaps, there may be multiple vpush
+ // instructions in the prologue.
+ MachineBasicBlock::iterator Pos = std::next(LastPush);
+ for (const auto &Entry : CSI) {
+ unsigned Reg = Entry.getReg();
+ int FI = Entry.getFrameIdx();
+ if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
+ (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned Offset = MFI.getObjectOffset(FI);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ }
+ }
+
+ // Now we can emit descriptions of where the canonical frame address was
+ // throughout the process. If we have a frame pointer, it takes over the job
+ // half-way through, so only the first few .cfi_def_cfa_offset instructions
+ // actually get emitted.
+ DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP);
+
+ if (STI.isTargetELF() && hasFP(MF))
+ MFI.setOffsetAdjustment(MFI.getOffsetAdjustment() -
+ AFI->getFramePtrSpillOffset());
+
+ AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+ AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+ AFI->setDPRCalleeSavedGapSize(DPRGapSize);
+ AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+ // If we need dynamic stack realignment, do it here. Be paranoid and make
+ // sure if we also have VLAs, we have a base pointer for frame access.
+ // If aligned NEON registers were spilled, the stack has already been
+ // realigned.
+ if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
+ unsigned MaxAlign = MFI.getMaxAlignment();
+ assert(!AFI->isThumb1OnlyFunction());
+ if (!AFI->isThumbFunction()) {
+ emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
+ false);
+ } else {
+ // We cannot use sp as source/dest register here, thus we're using r4 to
+ // perform the calculations. We're emitting the following sequence:
+ // mov r4, sp
+ // -- use emitAligningInstructions to produce best sequence to zero
+ // -- out lower bits in r4
+ // mov sp, r4
+ // FIXME: It will be better just to find spare register here.
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+ .addReg(ARM::SP, RegState::Kill));
+ emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
+ false);
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(ARM::R4, RegState::Kill));
+ }
+
+ AFI->setShouldRestoreSPFromFP(true);
+ }
+
+ // If we need a base pointer, set it up here. It's whatever the value
+ // of the stack pointer is at this point. Any variable size objects
+ // will be allocated after this, so we can still use the base pointer
+ // to reference locals.
+ // FIXME: Clarify FrameSetup flags here.
+ if (RegInfo->hasBasePointer(MF)) {
+ if (isARM)
+ BuildMI(MBB, MBBI, dl,
+ TII.get(ARM::MOVr), RegInfo->getBaseRegister())
+ .addReg(ARM::SP)
+ .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ else
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+ RegInfo->getBaseRegister())
+ .addReg(ARM::SP));
+ }
+
+ // If the frame has variable sized objects then the epilogue must restore
+ // the sp from fp. We can assume there's an FP here since hasFP already
+ // checks for hasVarSizedObjects.
+ if (MFI.hasVarSizedObjects())
+ AFI->setShouldRestoreSPFromFP(true);
+}
+
+void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const { // Inserts the SP adjustments that surround the callee-saved restores already present at the end of MBB (ARM and Thumb2 only).
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This emitEpilogue does not support Thumb1!");
+ bool isARM = !AFI->isThumbFunction();
+
+ unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ int NumBytes = (int)MFI.getStackSize();
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
+ // Start at the block's first terminator; all insertions below are relative to MBBI.
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ if (!AFI->hasStackFrame()) { // no frame: a single SP update (if non-zero) is all that is needed here
+ if (NumBytes - ArgRegsSaveSize != 0)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize);
+ } else {
+ // Walk MBBI backwards over the callee-saved restores so that it points at the first one (LDR / VLDRD).
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+ if (MBBI != MBB.begin()) {
+ do {
+ --MBBI;
+ } while (MBBI != MBB.begin() && isCSRestore(*MBBI, TII, CSRegs));
+ if (!isCSRestore(*MBBI, TII, CSRegs)) // stepped one instruction too far: move back onto the first restore
+ ++MBBI;
+ }
+
+ // Move SP to start of FP callee save spill area. After this, NumBytes excludes the argument-register, GPR, gap and DPR save areas.
+ NumBytes -= (ArgRegsSaveSize +
+ AFI->getGPRCalleeSavedArea1Size() +
+ AFI->getGPRCalleeSavedArea2Size() +
+ AFI->getDPRCalleeSavedGapSize() +
+ AFI->getDPRCalleeSavedAreaSize());
+
+ // Reset SP based on frame pointer only if the stack frame extends beyond
+ // frame pointer stack slot or target is ELF and the function has FP.
+ if (AFI->shouldRestoreSPFromFP()) {
+ NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; // distance from FP down to where SP must point
+ if (NumBytes) {
+ if (isARM)
+ emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+ ARMCC::AL, 0, TII);
+ else {
+ // It's not possible to restore SP from FP in a single instruction.
+ // For iOS, this looks like:
+ // mov sp, r7
+ // sub sp, #24
+ // This is bad, if an interrupt is taken after the mov, sp is in an
+ // inconsistent state.
+ // Use the first callee-saved register as a scratch register.
+ assert(!MFI.getPristineRegs(MF).test(ARM::R4) &&
+ "No scratch register to restore SP from FP!");
+ emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
+ ARMCC::AL, 0, TII);
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+ ARM::SP)
+ .addReg(ARM::R4));
+ }
+ } else {
+ // Thumb2 or ARM: the offset is zero, so a plain register move from FP to SP suffices.
+ if (isARM)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
+ .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ else
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+ ARM::SP)
+ .addReg(FramePtr));
+ }
+ } else if (NumBytes &&
+ !tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes)) // NOTE(review): MBBI is dereferenced without an MBB.end() check here, unlike the guarded uses below — confirm it is always valid on this path
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+
+ // Increment past our save areas.
+ if (MBBI != MBB.end() && AFI->getDPRCalleeSavedAreaSize()) {
+ MBBI++;
+ // Since vpop register list cannot have gaps, there may be multiple vpop
+ // instructions in the epilogue.
+ while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::VLDMDIA_UPD)
+ MBBI++;
+ }
+ if (AFI->getDPRCalleeSavedGapSize()) {
+ assert(AFI->getDPRCalleeSavedGapSize() == 4 &&
+ "unexpected DPR alignment gap");
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize());
+ }
+
+ if (AFI->getGPRCalleeSavedArea2Size()) MBBI++; // step over one instruction for GPR area 2
+ if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; // step over one instruction for GPR area 1
+ }
+
+ if (ArgRegsSaveSize) // finally release the argument-register save area
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize);
+}
+
+/// getFrameIndexReference - Debug-info entry point that yields a base register
+/// (returned through \p FrameReg) plus offset for frame index \p FI. For now
+/// it simply reuses the resolution logic used for code-gen references, with
+/// no extra SP adjustment applied.
+/// FIXME: This can go wrong when references are SP-relative and simple call
+/// frames aren't used.
+int ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                             unsigned &FrameReg) const {
+  // Debug-info queries are made without any pending call-frame adjustment.
+  const int NoSPAdjustment = 0;
+  return ResolveFrameIndexReference(MF, FI, FrameReg, NoSPAdjustment);
+}
+
+int
+ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
+ int FI, unsigned &FrameReg,
+ int SPAdj) const { // Chooses a base register (written to FrameReg: SP, FP or the base pointer) for frame index FI and returns the offset from it; SPAdj is folded into the SP-relative Offset only, not into FPOffset.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
+ const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize(); // object offset rebased by the total stack size; returned together with SP or the base pointer
+ int FPOffset = Offset - AFI->getFramePtrSpillOffset(); // the same slot expressed against the frame pointer; returned whenever FrameReg is the frame register
+ bool isFixed = MFI.isFixedObjectIndex(FI);
+
+ FrameReg = ARM::SP; // default answer unless a branch below picks another register
+ Offset += SPAdj;
+
+ // SP can move around if there are allocas. We may also lose track of SP
+ // when emergency spilling inside a non-reserved call frame setup.
+ bool hasMovingSP = !hasReservedCallFrame(MF);
+
+ // When dynamically realigning the stack, use the frame pointer for
+ // parameters, and the stack/base pointer for locals.
+ if (RegInfo->needsStackRealignment(MF)) {
+ assert (hasFP(MF) && "dynamic stack realignment without a FP!");
+ if (isFixed) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ Offset = FPOffset;
+ } else if (hasMovingSP) {
+ assert(RegInfo->hasBasePointer(MF) &&
+ "VLAs and dynamic stack alignment, but missing base pointer!");
+ FrameReg = RegInfo->getBaseRegister();
+ }
+ return Offset;
+ }
+
+ // If there is a frame pointer, use it when we can.
+ if (hasFP(MF) && AFI->hasStackFrame()) {
+ // Use frame pointer to reference fixed objects. Use it for locals if
+ // there are VLAs (and thus the SP isn't reliable as a base).
+ if (isFixed || (hasMovingSP && !RegInfo->hasBasePointer(MF))) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FPOffset;
+ } else if (hasMovingSP) {
+ assert(RegInfo->hasBasePointer(MF) && "missing base pointer!");
+ if (AFI->isThumb2Function()) {
+ // Try to use the frame pointer if we can, else use the base pointer
+ // since it's available. This is handy for the emergency spill slot, in
+ // particular.
+ if (FPOffset >= -255 && FPOffset < 0) { // i.e. FPOffset lies in [-255, -1]
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FPOffset;
+ }
+ }
+ } else if (AFI->isThumb2Function()) {
+ // Use add <rd>, sp, #<imm8>
+ // ldr <rd>, [sp, #<imm8>]
+ // if at all possible to save space.
+ if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020) // non-negative multiple of 4 no larger than 1020: keep SP as the base
+ return Offset;
+ // In Thumb2 mode, the negative offset is very limited. Try to avoid
+ // out of range references. ldr <rt>,[<rn>, #-<imm8>]
+ if (FPOffset >= -255 && FPOffset < 0) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FPOffset;
+ }
+ } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) { // Offset exceeds the absolute value of FPOffset
+ // Otherwise, use SP or FP, whichever is closer to the stack slot.
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FPOffset;
+ }
+ }
+ // Use the base pointer if we have one; otherwise FrameReg stays SP.
+ if (RegInfo->hasBasePointer(MF))
+ FrameReg = RegInfo->getBaseRegister();
+ return Offset;
+}
+
+void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ unsigned StmOpc, unsigned StrOpc,
+ bool NoGap,
+ bool(*Func)(unsigned, bool),
+ unsigned NumAlignedDPRCS2Regs,
+ unsigned MIFlags) const { // Emits push instructions before MI for the CSI registers accepted by Func: StmOpc stores a register list, StrOpc (when non-zero) stores a lone register; MIFlags is set on every emitted instruction.
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+
+ DebugLoc DL;
+
+ typedef std::pair<unsigned, bool> RegAndKill; // (register, whether to mark it killed by the push)
+ SmallVector<RegAndKill, 4> Regs;
+ unsigned i = CSI.size(); // CSI is consumed from its last entry towards the first; each outer iteration emits at most one push
+ while (i != 0) {
+ unsigned LastReg = 0;
+ for (; i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue; // register does not belong to the area selected by Func
+
+ // D-registers in the aligned area DPRCS2 are NOT spilled here.
+ if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
+ continue;
+
+ bool isLiveIn = MF.getRegInfo().isLiveIn(Reg);
+ if (!isLiveIn)
+ MBB.addLiveIn(Reg);
+ // If NoGap is true, push consecutive registers and then leave the rest
+ // for other instructions. e.g.
+ // vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11}
+ if (NoGap && LastReg && LastReg != Reg-1) // i is not decremented on break, so this register starts the next group
+ break;
+ LastReg = Reg;
+ // Do not set a kill flag on values that are also marked as live-in. This
+ // happens with the @llvm-returnaddress intrinsic and with arguments
+ // passed in callee saved registers.
+ // Omitting the kill flags is conservatively correct even if the live-in
+ // is not used after all.
+ Regs.push_back(std::make_pair(Reg, /*isKill=*/!isLiveIn));
+ }
+
+ if (Regs.empty())
+ continue;
+
+ std::sort(Regs.begin(), Regs.end(), [&](const RegAndKill &LHS,
+ const RegAndKill &RHS) { // order the list by ascending hardware encoding value
+ return TRI.getEncodingValue(LHS.first) < TRI.getEncodingValue(RHS.first);
+ });
+
+ if (Regs.size() > 1 || StrOpc== 0) {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
+ .addReg(ARM::SP).setMIFlags(MIFlags));
+ for (unsigned i = 0, e = Regs.size(); i < e; ++i) // note: this 'i' shadows the outer CSI cursor and does not modify it
+ MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second));
+ } else if (Regs.size() == 1) {
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc),
+ ARM::SP)
+ .addReg(Regs[0].first, getKillRegState(Regs[0].second))
+ .addReg(ARM::SP).setMIFlags(MIFlags)
+ .addImm(-4);
+ AddDefaultPred(MIB);
+ }
+ Regs.clear();
+
+ // Put any subsequent vpush instructions before this one: they will refer to
+ // higher register numbers so need to be pushed first in order to preserve
+ // monotonicity.
+ if (MI != MBB.begin())
+ --MI;
+ }
+}
+
+void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ unsigned LdmOpc, unsigned LdrOpc,
+ bool isVarArg, bool NoGap,
+ bool(*Func)(unsigned, bool),
+ unsigned NumAlignedDPRCS2Regs) const { // Emits pop instructions at MI for the CSI registers accepted by Func: LdmOpc loads a register list, LdrOpc (when non-zero) loads a lone register; an LR reload may be turned into a load of PC that replaces the return.
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ DebugLoc DL;
+ bool isTailCall = false;
+ bool isInterrupt = false;
+ bool isTrap = false;
+ if (MBB.end() != MI) { // classify the instruction at MI (presumably the block's return/terminator — verify against callers)
+ DL = MI->getDebugLoc();
+ unsigned RetOpcode = MI->getOpcode();
+ isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri);
+ isInterrupt =
+ RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
+ isTrap =
+ RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl ||
+ RetOpcode == ARM::tTRAP;
+ }
+
+ SmallVector<unsigned, 4> Regs;
+ unsigned i = CSI.size(); // CSI is consumed from its last entry towards the first; each outer iteration emits at most one pop
+ while (i != 0) {
+ unsigned LastReg = 0;
+ bool DeleteRet = false;
+ for (; i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue; // register does not belong to the area selected by Func
+
+ // The aligned reloads from area DPRCS2 are not inserted here.
+ if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
+ continue;
+
+ if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
+ !isTrap && STI.hasV5TOps()) {
+ if (MBB.succ_empty()) { // block has no successors: load the saved LR value straight into PC
+ Reg = ARM::PC;
+ DeleteRet = true; // the instruction at MI is erased once the LDM has been built
+ LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
+ } else
+ LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
+ // The LdmOpc chosen above folds the return instruction into the LDM when DeleteRet is set.
+ }
+
+ // If NoGap is true, pop consecutive registers and then leave the rest
+ // for other instructions. e.g.
+ // vpop {d8, d10, d11} -> vpop {d8}, vpop {d10, d11}
+ if (NoGap && LastReg && LastReg != Reg-1) // i is not decremented on break, so this register starts the next group
+ break;
+
+ LastReg = Reg;
+ Regs.push_back(Reg);
+ }
+
+ if (Regs.empty())
+ continue;
+
+ std::sort(Regs.begin(), Regs.end(), [&](unsigned LHS, unsigned RHS) { // order the list by ascending hardware encoding value
+ return TRI.getEncodingValue(LHS) < TRI.getEncodingValue(RHS);
+ });
+
+ if (Regs.size() > 1 || LdrOpc == 0) {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
+ .addReg(ARM::SP));
+ for (unsigned i = 0, e = Regs.size(); i < e; ++i) // note: this 'i' shadows the outer CSI cursor and does not modify it
+ MIB.addReg(Regs[i], getDefRegState(true));
+ if (DeleteRet && MI != MBB.end()) {
+ MIB.copyImplicitOps(*MI); // carry over the implicit operands of the instruction being replaced
+ MI->eraseFromParent();
+ }
+ MI = MIB; // continue from the newly built pop
+ } else if (Regs.size() == 1) {
+ // If we adjusted the reg to PC from LR above, switch it back here. We
+ // only do that for LDM.
+ if (Regs[0] == ARM::PC)
+ Regs[0] = ARM::LR;
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0])
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP);
+ // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
+ // that refactoring is complete (eventually).
+ if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) {
+ MIB.addReg(0);
+ MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
+ } else
+ MIB.addImm(4);
+ AddDefaultPred(MIB);
+ }
+ Regs.clear();
+
+ // Put any subsequent vpop instructions after this one: they will refer to
+ // higher register numbers so need to be popped afterwards.
+ if (MI != MBB.end())
+ ++MI;
+ }
+}
+
+/// Emit aligned spill instructions for NumAlignedDPRCS2Regs D-registers
+/// starting from d8, inserted before MI. Also insert stack realignment code and
+/// leave the stack pointer pointing to the d8 spill slot. r4 is the scratch register.
+static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned NumAlignedDPRCS2Regs,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Mark the D-register spill slots as properly aligned. Since MFI computes
+ // stack slot layout backwards, this can actually mean that the d-reg stack
+ // slot offsets can be wrong. The offset for d8 will always be correct.
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned DNum = CSI[i].getReg() - ARM::D8; // unsigned arithmetic: registers numbered below d8 wrap to a large value and are skipped by the check below
+ if (DNum > NumAlignedDPRCS2Regs - 1)
+ continue;
+ int FI = CSI[i].getFrameIdx();
+ // The even-numbered registers will be 16-byte aligned, the odd-numbered
+ // registers will be 8-byte aligned.
+ MFI.setObjectAlignment(FI, DNum % 2 ? 8 : 16);
+
+ // The stack slot for D8 needs to be maximally aligned because this is
+ // actually the point where we align the stack pointer. MachineFrameInfo
+ // computes all offsets relative to the incoming stack pointer which is a
+ // bit weird when realigning the stack. Any extra padding for this
+ // over-alignment is not realized because the code inserted below adjusts
+ // the stack pointer by numregs * 8 before aligning the stack pointer.
+ if (DNum == 0)
+ MFI.setObjectAlignment(FI, MFI.getMaxAlignment());
+ }
+
+ // Move the stack pointer to the d8 spill slot, and align it at the same
+ // time. Leave the stack slot address in the scratch register r4.
+ //
+ // sub r4, sp, #numregs * 8
+ // bic r4, r4, #align - 1
+ // mov sp, r4
+ //
+ bool isThumb = AFI->isThumbFunction();
+ assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
+ AFI->setShouldRestoreSPFromFP(true); // emitEpilogue rebuilds SP from the frame pointer when this flag is set
+
+ // sub r4, sp, #numregs * 8
+ // The immediate is <= 64, so it doesn't need any special encoding.
+ unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
+ AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+ .addReg(ARM::SP)
+ .addImm(8 * NumAlignedDPRCS2Regs)));
+
+ unsigned MaxAlign = MF.getFrameInfo().getMaxAlignment();
+ // We must set parameter MustBeSingleInstruction to true, since
+ // skipAlignedDPRCS2Spills expects exactly 3 instructions to perform
+ // stack alignment. Luckily, this can always be done since all ARM
+ // architecture versions that support Neon also support the BFC
+ // instruction.
+ emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ARM::R4, MaxAlign, true);
+
+ // mov sp, r4
+ // The stack pointer must be adjusted before spilling anything, otherwise
+ // the stack slots could be clobbered by an interrupt handler.
+ // Leave r4 live, it is used below.
+ Opc = isThumb ? ARM::tMOVr : ARM::MOVr;
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP)
+ .addReg(ARM::R4);
+ MIB = AddDefaultPred(MIB);
+ if (!isThumb) // only the ARM-mode MOVr gets the extra default CC operand here
+ AddDefaultCC(MIB);
+
+ // Now spill NumAlignedDPRCS2Regs registers starting from d8.
+ // r4 holds the stack slot address.
+ unsigned NextReg = ARM::D8;
+
+ // 16-byte aligned vst1.64 with 4 d-regs and address writeback.
+ // The writeback is only needed when emitting two vst1.64 instructions.
+ if (NumAlignedDPRCS2Regs >= 6) {
+ unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+ &ARM::QQPRRegClass);
+ MBB.addLiveIn(SupReg);
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed),
+ ARM::R4)
+ .addReg(ARM::R4, RegState::Kill).addImm(16)
+ .addReg(NextReg)
+ .addReg(SupReg, RegState::ImplicitKill));
+ NextReg += 4;
+ NumAlignedDPRCS2Regs -= 4; // from here on this counts the registers still to be spilled
+ }
+
+ // We won't modify r4 beyond this point. It currently points to the next
+ // register to be spilled.
+ unsigned R4BaseReg = NextReg; // register whose slot r4 addresses; used to compute the vstr offset below
+
+ // 16-byte aligned vst1.64 with 4 d-regs, no writeback.
+ if (NumAlignedDPRCS2Regs >= 4) {
+ unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+ &ARM::QQPRRegClass);
+ MBB.addLiveIn(SupReg);
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
+ .addReg(ARM::R4).addImm(16).addReg(NextReg)
+ .addReg(SupReg, RegState::ImplicitKill));
+ NextReg += 4;
+ NumAlignedDPRCS2Regs -= 4;
+ }
+
+ // 16-byte aligned vst1.64 with 2 d-regs.
+ if (NumAlignedDPRCS2Regs >= 2) {
+ unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+ &ARM::QPRRegClass);
+ MBB.addLiveIn(SupReg);
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
+ .addReg(ARM::R4).addImm(16).addReg(SupReg));
+ NextReg += 2;
+ NumAlignedDPRCS2Regs -= 2;
+ }
+
+ // Finally, use a vanilla vstr.64 for the odd last register.
+ if (NumAlignedDPRCS2Regs) {
+ MBB.addLiveIn(NextReg);
+ // vstr.64 uses addrmode5 which has an offset scale of 4.
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
+ .addReg(NextReg)
+ .addReg(ARM::R4).addImm((NextReg-R4BaseReg)*2)); // 8 bytes per d-reg divided by the scale of 4 gives 2 per register
+ }
+
+ // The last spill instruction inserted should kill the scratch register r4.
+ std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
+}
+
+/// Skip past the code inserted by emitAlignedDPRCS2Spills, and return an
+/// iterator to the following instruction.
+///
+/// \param MI  Iterator at the first instruction of the realignment sequence
+///            (the "sub r4, sp, ..." emitted by emitAlignedDPRCS2Spills).
+/// \param NumAlignedDPRCS2Regs  Number of D-registers, starting at d8, that
+///            were spilled by that sequence.
+/// \returns An iterator to the instruction after the last aligned spill.
+static MachineBasicBlock::iterator
+skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
+                        unsigned NumAlignedDPRCS2Regs) {
+  // Step over the three stack realignment instructions:
+  //   sub r4, sp, #numregs * 8
+  //   bic r4, r4, #align - 1
+  //   mov sp, r4
+  ++MI; ++MI; ++MI;
+  assert(MI->mayStore() && "Expecting spill instruction");
+
+  // The number of store instructions depends on the register count:
+  // 7 registers take three stores, 1, 2 or 4 registers take a single store,
+  // and the default case (e.g. 3, 5 or 6 registers) takes two. Every case
+  // deliberately falls through to the next one; the fallthroughs are
+  // annotated so that -Wimplicit-fallthrough stays quiet.
+  switch (NumAlignedDPRCS2Regs) {
+  case 7:
+    ++MI;
+    assert(MI->mayStore() && "Expecting spill instruction");
+    LLVM_FALLTHROUGH;
+  default:
+    ++MI;
+    assert(MI->mayStore() && "Expecting spill instruction");
+    LLVM_FALLTHROUGH;
+  case 1:
+  case 2:
+  case 4:
+    // MI now sits on the last spill, which must carry the kill of r4.
+    assert(MI->killsRegister(ARM::R4) && "Missed kill flag");
+    ++MI;
+  }
+  return MI;
+}
+
+/// Emit aligned reload instructions for NumAlignedDPRCS2Regs D-registers
+/// starting from d8, inserted before MI with r4 as the scratch address register. These
+/// instructions are assumed to execute while the stack is still aligned, unlike the code inserted by emitPopInst.
+static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned NumAlignedDPRCS2Regs,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+ // Find the frame index assigned to d8.
+ int D8SpillFI = 0; // stays 0 if d8 is not found in CSI — NOTE(review): confirm d8 is always present when this is called
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+ if (CSI[i].getReg() == ARM::D8) {
+ D8SpillFI = CSI[i].getFrameIdx();
+ break;
+ }
+
+ // Materialize the address of the d8 spill slot into the scratch register r4.
+ // This can be fairly complicated if the stack frame is large, so just use
+ // the normal frame index elimination mechanism to do it. This code runs as
+ // the initial part of the epilog where the stack and base pointers haven't
+ // been changed yet.
+ bool isThumb = AFI->isThumbFunction();
+ assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
+
+ unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
+ AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+ .addFrameIndex(D8SpillFI).addImm(0)));
+
+ // Now restore NumAlignedDPRCS2Regs registers starting from d8.
+ unsigned NextReg = ARM::D8;
+
+ // 16-byte aligned vld1.64 with 4 d-regs and writeback.
+ if (NumAlignedDPRCS2Regs >= 6) {
+ unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+ &ARM::QQPRRegClass);
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
+ .addReg(ARM::R4, RegState::Define)
+ .addReg(ARM::R4, RegState::Kill).addImm(16)
+ .addReg(SupReg, RegState::ImplicitDefine));
+ NextReg += 4;
+ NumAlignedDPRCS2Regs -= 4; // from here on this counts the registers still to be reloaded
+ }
+
+ // We won't modify r4 beyond this point. It currently points to the next
+ // register to be reloaded.
+ unsigned R4BaseReg = NextReg; // register whose slot r4 addresses; used to compute the vldr offset below
+
+ // 16-byte aligned vld1.64 with 4 d-regs, no writeback.
+ if (NumAlignedDPRCS2Regs >= 4) {
+ unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+ &ARM::QQPRRegClass);
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
+ .addReg(ARM::R4).addImm(16)
+ .addReg(SupReg, RegState::ImplicitDefine));
+ NextReg += 4;
+ NumAlignedDPRCS2Regs -= 4;
+ }
+
+ // 16-byte aligned vld1.64 with 2 d-regs.
+ if (NumAlignedDPRCS2Regs >= 2) {
+ unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+ &ARM::QPRRegClass);
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
+ .addReg(ARM::R4).addImm(16));
+ NextReg += 2;
+ NumAlignedDPRCS2Regs -= 2;
+ }
+
+ // Finally, use a vanilla vldr.64 for the remaining odd register.
+ if (NumAlignedDPRCS2Regs)
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
+ .addReg(ARM::R4).addImm(2*(NextReg-R4BaseReg))); // immediate is 2 per d-register past R4BaseReg, the same expression the spill code uses
+
+ // The last reload instruction inserted kills the scratch register r4.
+ std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
+}
+
+/// Insert the callee-saved register spills described by \p CSI before \p MI.
+/// GPR area 1 is pushed first, then GPR area 2, then the D-register area;
+/// D-registers that live in the aligned DPRCS2 area are spilled separately
+/// after a stack realignment sequence. Returns false when \p CSI is empty and
+/// true otherwise.
+bool ARMFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  // Nothing to spill.
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &Fn = *MBB.getParent();
+  ARMFunctionInfo *FuncInfo = Fn.getInfo<ARMFunctionInfo>();
+  const bool IsThumb = FuncInfo->isThumbFunction();
+  const unsigned AlignedDRegs = FuncInfo->getNumAlignedDPRCS2Regs();
+
+  // Opcodes for a multi-register GPR push, a single-register GPR push and a
+  // D-register push respectively.
+  const unsigned GPRPushOpc = IsThumb ? ARM::t2STMDB_UPD : ARM::STMDB_UPD;
+  const unsigned GPRPushOneOpc = IsThumb ? ARM::t2STR_PRE : ARM::STR_PRE_IMM;
+  const unsigned DPRPushOpc = ARM::VSTMDDB_UPD;
+
+  // The GPR areas may contain gaps; the D-register push may not, and it skips
+  // the registers that belong to the aligned DPRCS2 area.
+  emitPushInst(MBB, MI, CSI, GPRPushOpc, GPRPushOneOpc, /*NoGap=*/false,
+               &isARMArea1Register, /*NumAlignedDPRCS2Regs=*/0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, GPRPushOpc, GPRPushOneOpc, /*NoGap=*/false,
+               &isARMArea2Register, /*NumAlignedDPRCS2Regs=*/0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, DPRPushOpc, /*StrOpc=*/0, /*NoGap=*/true,
+               &isARMArea3Register, AlignedDRegs, MachineInstr::FrameSetup);
+
+  // The pushes above leave out the aligned DPRCS2 registers. Their spills,
+  // preceded by the stack realignment code, go after the push instructions.
+  if (AlignedDRegs != 0)
+    emitAlignedDPRCS2Spills(MBB, MI, AlignedDRegs, CSI, TRI);
+
+  return true;
+}
+
+/// Insert the callee-saved register reloads described by \p CSI at \p MI.
+/// The aligned DPRCS2 D-registers are reloaded first, then the D-register
+/// area, GPR area 2 and GPR area 1 are popped in that order. Returns false
+/// when \p CSI is empty and true otherwise.
+bool ARMFrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  // Nothing to restore.
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &Fn = *MBB.getParent();
+  ARMFunctionInfo *FuncInfo = Fn.getInfo<ARMFunctionInfo>();
+  // A non-empty argument-register save area is what marks the function as
+  // vararg for the purposes of emitPopInst.
+  const bool HasArgRegSaveArea = FuncInfo->getArgRegsSaveSize() > 0;
+  const unsigned AlignedDRegs = FuncInfo->getNumAlignedDPRCS2Regs();
+
+  // emitPopInst never reloads the aligned DPRCS2 registers, so handle them
+  // here before any pop is emitted.
+  if (AlignedDRegs != 0)
+    emitAlignedDPRCS2Restores(MBB, MI, AlignedDRegs, CSI, TRI);
+
+  // Opcodes for a multi-register GPR pop, a single-register GPR pop and a
+  // D-register pop respectively.
+  const bool IsThumb = FuncInfo->isThumbFunction();
+  const unsigned GPRPopOpc = IsThumb ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
+  const unsigned GPRPopOneOpc = IsThumb ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
+  const unsigned DPRPopOpc = ARM::VLDMDIA_UPD;
+
+  emitPopInst(MBB, MI, CSI, DPRPopOpc, /*LdrOpc=*/0, HasArgRegSaveArea,
+              /*NoGap=*/true, &isARMArea3Register, AlignedDRegs);
+  emitPopInst(MBB, MI, CSI, GPRPopOpc, GPRPopOneOpc, HasArgRegSaveArea,
+              /*NoGap=*/false, &isARMArea2Register,
+              /*NumAlignedDPRCS2Regs=*/0);
+  emitPopInst(MBB, MI, CSI, GPRPopOpc, GPRPopOneOpc, HasArgRegSaveArea,
+              /*NoGap=*/false, &isARMArea1Register,
+              /*NumAlignedDPRCS2Regs=*/0);
+
+  return true;
+}
+
+/// Sum TII.getInstSizeInBytes() over every instruction of every basic block
+/// in \p MF and return the total.
+// FIXME: Make generic?
+static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
+                                       const ARMBaseInstrInfo &TII) {
+  unsigned TotalBytes = 0;
+  for (const MachineBasicBlock &Block : MF)
+    for (const MachineInstr &Instr : Block)
+      TotalBytes += TII.getInstSizeInBytes(Instr);
+  return TotalBytes;
+}
+
+/// estimateRSStackSizeLimit - Scan every instruction that has a frame-index
+/// operand and return the stack size limit beyond which some of these
+/// instructions will require a scratch register during their expansion later.
+/// Returns 0 as soon as an addressing mode 4 or 6 instruction references a
+/// frame index.
+// FIXME: Move to TII?
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
+                                         const TargetFrameLowering *TFI) {
+  const ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
+  const unsigned Imm8Limit = (1U << 8) - 1;
+  unsigned Limit = (1 << 12) - 1;
+
+  for (auto &Block : MF) {
+    for (auto &Instr : Block) {
+      // Only instructions with a frame-index operand matter. At most one FI
+      // per instruction, so each instruction is classified once.
+      unsigned OpIdx = 0;
+      const unsigned NumOps = Instr.getNumOperands();
+      while (OpIdx != NumOps && !Instr.getOperand(OpIdx).isFI())
+        ++OpIdx;
+      if (OpIdx == NumOps)
+        continue;
+
+      // When using ADDri to get the address of a stack object, 255 is the
+      // largest offset guaranteed to fit in the immediate offset.
+      if (Instr.getOpcode() == ARM::ADDri) {
+        Limit = std::min(Limit, Imm8Limit);
+        continue;
+      }
+
+      // Otherwise the addressing mode decides.
+      switch (Instr.getDesc().TSFlags & ARMII::AddrModeMask) {
+      case ARMII::AddrMode3:
+      case ARMII::AddrModeT2_i8:
+        Limit = std::min(Limit, Imm8Limit);
+        break;
+      case ARMII::AddrMode5:
+      case ARMII::AddrModeT2_i8s4:
+        Limit = std::min(Limit, Imm8Limit * 4);
+        break;
+      case ARMII::AddrModeT2_i12:
+        // i12 supports only positive offset so these will be converted to
+        // i8 opcodes. See llvm::rewriteT2FrameIndex.
+        if (TFI->hasFP(MF) && FuncInfo->hasStackFrame())
+          Limit = std::min(Limit, Imm8Limit);
+        break;
+      case ARMII::AddrMode4:
+      case ARMII::AddrMode6:
+        // Addressing modes 4 & 6 (load/store) instructions can't encode an
+        // immediate offset for stack references.
+        return 0;
+      default:
+        break;
+      }
+    }
+  }
+
+  return Limit;
+}
+
+// In functions that realign the stack, spilling the callee-saved vector
+// registers after the realignment can be an advantage: the vst1 and vld1
+// instructions take alignment hints that can improve performance.
+//
+// This records in ARMFunctionInfo how many D-registers, counted from d8,
+// should be spilled that way (0 when the optimization does not apply) and
+// marks R4 as saved whenever that count is non-zero.
+static void
+checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
+  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
+  FuncInfo->setNumAlignedDPRCS2Regs(0);
+  if (!SpillAlignedNEONRegs)
+    return;
+
+  // Naked functions don't spill callee-saved registers.
+  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+    return;
+
+  const auto &Subtarget = MF.getSubtarget();
+
+  // We are planning to use NEON instructions vst1 / vld1.
+  if (!static_cast<const ARMSubtarget &>(Subtarget).hasNEON())
+    return;
+
+  // Don't bother if the default stack alignment is sufficiently high.
+  if (Subtarget.getFrameLowering()->getStackAlignment() >= 8)
+    return;
+
+  // Aligned spills require stack realignment.
+  const auto *RegInfo =
+      static_cast<const ARMBaseRegisterInfo *>(Subtarget.getRegisterInfo());
+  if (!RegInfo->canRealignStack(MF))
+    return;
+
+  // We always spill contiguous d-registers starting from d8, so count the
+  // length of the saved run d8, d9, ... (at most 8 registers). The register
+  // allocator will almost always use the callee-saved registers in order, but
+  // it can happen that there are holes in the range. Registers above the hole
+  // will be spilled to the standard DPRCS area.
+  unsigned RunLength = 0;
+  while (RunLength < 8 && SavedRegs.test(ARM::D8 + RunLength))
+    ++RunLength;
+
+  // Don't do this for just one d-register. It's not worth it.
+  if (RunLength < 2)
+    return;
+
+  // Spill the first RunLength D-registers after realigning the stack.
+  FuncInfo->setNumAlignedDPRCS2Regs(RunLength);
+
+  // A scratch register is required for the vst1 / vld1 instructions.
+  SavedRegs.set(ARM::R4);
+}
+
+/// Decide which callee-saved registers this function has to spill.
+///
+/// Starts from the generic TargetFrameLowering::determineCalleeSaves() result
+/// in \p SavedRegs and then adds ARM-specific spills: R4 as a scratch
+/// register, the base pointer, the frame pointer and LR, extra low registers
+/// needed by Thumb1 to save high registers, and extra registers for alignment
+/// padding or register scavenging. It also records on ARMFunctionInfo whether
+/// the function has a stack frame and whether LR is spilled only to allow a
+/// far jump, and may register an emergency spill slot with \p RS.
+void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+ // This tells PEI to spill the FP as if it is any other callee-save register
+ // to take advantage of the eliminateFrameIndex machinery. This also ensures
+ // it is spilled in the order specified by getCalleeSavedRegs() to make it
+ // easier to combine multiple loads / stores.
+ bool CanEliminateFrame = true;
+ bool CS1Spilled = false;
+ bool LRSpilled = false;
+ unsigned NumGPRSpills = 0;
+ unsigned NumFPRSpills = 0;
+ SmallVector<unsigned, 4> UnspilledCS1GPRs;
+ SmallVector<unsigned, 4> UnspilledCS2GPRs;
+ const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ (void)TRI; // Silence unused warning in non-assert builds.
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+ // Spill R4 if Thumb2 function requires stack realignment - it will be used as
+ // scratch register. Also spill R4 if Thumb2 function has varsized objects,
+ // since it's not always possible to restore sp from fp in a single
+ // instruction.
+ // FIXME: It will be better just to find spare register here.
+ if (AFI->isThumb2Function() &&
+ (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
+ SavedRegs.set(ARM::R4);
+
+ if (AFI->isThumb1OnlyFunction()) {
+ // Spill LR if Thumb1 function uses variable length argument lists.
+ if (AFI->getArgRegsSaveSize() > 0)
+ SavedRegs.set(ARM::LR);
+
+ // Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
+ // for sure what the stack size will be, but for this, an estimate is good
+ // enough. If anything changes it, it'll be a spill, which implies
+ // we've used all the registers and so R4 is already used, so not marking
+ // it here will be OK.
+ // FIXME: It will be better just to find spare register here.
+ unsigned StackSize = MFI.estimateStackSize(MF);
+ if (MFI.hasVarSizedObjects() || StackSize > 508)
+ SavedRegs.set(ARM::R4);
+ }
+
+ // See if we can spill vector registers to aligned stack.
+ checkNumAlignedDPRCS2Regs(MF, SavedRegs);
+
+ // Spill the BasePtr if it's used.
+ if (RegInfo->hasBasePointer(MF))
+ SavedRegs.set(RegInfo->getBaseRegister());
+
+ // Don't spill FP if the frame can be eliminated. This is determined
+ // by scanning the callee-save registers to see if any is modified.
+ // The same scan counts the GPR / FPR spills (FPR spills in 4-byte units)
+ // and sorts the unspilled GPRs into the CS1 and CS2 lists.
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ bool Spilled = false;
+ if (SavedRegs.test(Reg)) {
+ Spilled = true;
+ CanEliminateFrame = false;
+ }
+
+ if (!ARM::GPRRegClass.contains(Reg)) {
+ if (Spilled) {
+ if (ARM::SPRRegClass.contains(Reg))
+ NumFPRSpills++;
+ else if (ARM::DPRRegClass.contains(Reg))
+ NumFPRSpills += 2;
+ else if (ARM::QPRRegClass.contains(Reg))
+ NumFPRSpills += 4;
+ }
+ continue;
+ }
+
+ if (Spilled) {
+ NumGPRSpills++;
+
+ // Without a split push/pop every spilled GPR counts as a CS1 spill.
+ if (!STI.splitFramePushPop(MF)) {
+ if (Reg == ARM::LR)
+ LRSpilled = true;
+ CS1Spilled = true;
+ continue;
+ }
+
+ // Keep track of whether LR or any of the low registers R0-R7 is spilled.
+ switch (Reg) {
+ case ARM::LR:
+ LRSpilled = true;
+ LLVM_FALLTHROUGH;
+ case ARM::R0: case ARM::R1:
+ case ARM::R2: case ARM::R3:
+ case ARM::R4: case ARM::R5:
+ case ARM::R6: case ARM::R7:
+ CS1Spilled = true;
+ break;
+ default:
+ break;
+ }
+ } else {
+ if (!STI.splitFramePushPop(MF)) {
+ UnspilledCS1GPRs.push_back(Reg);
+ continue;
+ }
+
+ // With a split push/pop, R0-R7 and LR go to the CS1 list and every other
+ // GPR goes to the CS2 list.
+ switch (Reg) {
+ case ARM::R0: case ARM::R1:
+ case ARM::R2: case ARM::R3:
+ case ARM::R4: case ARM::R5:
+ case ARM::R6: case ARM::R7:
+ case ARM::LR:
+ UnspilledCS1GPRs.push_back(Reg);
+ break;
+ default:
+ UnspilledCS2GPRs.push_back(Reg);
+ break;
+ }
+ }
+ }
+
+ bool ForceLRSpill = false;
+ if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
+ unsigned FnSize = GetFunctionSizeInBytes(MF, TII);
+ // Force LR to be spilled if the Thumb function size is >= 2048. This
+ // enables use of BL to implement far jump. If it turns out that it's not
+ // needed then the branch fix up path will undo it.
+ if (FnSize >= (1 << 11)) {
+ CanEliminateFrame = false;
+ ForceLRSpill = true;
+ }
+ }
+
+ // If any of the stack slot references may be out of range of an immediate
+ // offset, make sure a register (or a spill slot) is available for the
+ // register scavenger. Note that if we're indexing off the frame pointer, the
+ // effective stack size is 4 bytes larger since the FP points to the stack
+ // slot of the previous FP. Also, if we have variable sized objects in the
+ // function, stack slot references will often be negative, and some of
+ // our instructions are positive-offset only, so conservatively consider
+ // that case to want a spill slot (or register) as well. Similarly, if
+ // the function adjusts the stack pointer during execution and the
+ // adjustments aren't already part of our stack size estimate, our offset
+ // calculations may be off, so be conservative.
+ // FIXME: We could add logic to be more precise about negative offsets
+ // and which instructions will need a scratch register for them. Is it
+ // worth the effort and added fragility?
+ unsigned EstimatedStackSize =
+ MFI.estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills);
+ if (hasFP(MF)) {
+ if (AFI->hasStackFrame())
+ EstimatedStackSize += 4;
+ } else {
+ // If FP is not used, SP will be used to access arguments, so count the
+ // size of arguments into the estimation.
+ EstimatedStackSize += MF.getInfo<ARMFunctionInfo>()->getArgumentStackSize();
+ }
+ EstimatedStackSize += 16; // For possible paddings.
+
+ bool BigStack = EstimatedStackSize >= estimateRSStackSizeLimit(MF, this) ||
+ MFI.hasVarSizedObjects() ||
+ (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF));
+ bool ExtraCSSpill = false;
+ if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
+ AFI->setHasStackFrame(true);
+
+ if (hasFP(MF)) {
+ SavedRegs.set(FramePtr);
+ // If the frame pointer is required by the ABI, also spill LR so that we
+ // emit a complete frame record.
+ if (MF.getTarget().Options.DisableFramePointerElim(MF) && !LRSpilled) {
+ SavedRegs.set(ARM::LR);
+ LRSpilled = true;
+ NumGPRSpills++;
+ auto LRPos = find(UnspilledCS1GPRs, ARM::LR);
+ if (LRPos != UnspilledCS1GPRs.end())
+ UnspilledCS1GPRs.erase(LRPos);
+ }
+ auto FPPos = find(UnspilledCS1GPRs, FramePtr);
+ if (FPPos != UnspilledCS1GPRs.end())
+ UnspilledCS1GPRs.erase(FPPos);
+ NumGPRSpills++;
+ if (FramePtr == ARM::R7)
+ CS1Spilled = true;
+ }
+
+ if (AFI->isThumb1OnlyFunction()) {
+ // For Thumb1-only targets, we need some low registers when we save and
+ // restore the high registers (which aren't allocatable, but could be
+ // used by inline assembly) because the push/pop instructions can not
+ // access high registers. If necessary, we might need to push more low
+ // registers to ensure that there is at least one free that can be used
+ // for the saving & restoring, and preferably we should ensure that as
+ // many as are needed are available so that fewer push/pop instructions
+ // are required.
+
+ // Low registers which are not currently pushed, but could be (r4-r7).
+ SmallVector<unsigned, 4> AvailableRegs;
+
+ // Unused argument registers (r0-r3) can be clobbered in the prologue for
+ // free.
+ int EntryRegDeficit = 0;
+ for (unsigned Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
+ if (!MF.getRegInfo().isLiveIn(Reg)) {
+ --EntryRegDeficit;
+ DEBUG(dbgs() << PrintReg(Reg, TRI)
+ << " is unused argument register, EntryRegDeficit = "
+ << EntryRegDeficit << "\n");
+ }
+ }
+
+ // Unused return registers can be clobbered in the epilogue for free.
+ int ExitRegDeficit = AFI->getReturnRegsCount() - 4;
+ DEBUG(dbgs() << AFI->getReturnRegsCount()
+ << " return regs used, ExitRegDeficit = " << ExitRegDeficit
+ << "\n");
+
+ int RegDeficit = std::max(EntryRegDeficit, ExitRegDeficit);
+ DEBUG(dbgs() << "RegDeficit = " << RegDeficit << "\n");
+
+ // r4-r6 can be used in the prologue if they are pushed by the first push
+ // instruction.
+ for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6}) {
+ if (SavedRegs.test(Reg)) {
+ --RegDeficit;
+ DEBUG(dbgs() << PrintReg(Reg, TRI)
+ << " is saved low register, RegDeficit = " << RegDeficit
+ << "\n");
+ } else {
+ AvailableRegs.push_back(Reg);
+ DEBUG(dbgs()
+ << PrintReg(Reg, TRI)
+ << " is non-saved low register, adding to AvailableRegs\n");
+ }
+ }
+
+ // r7 can be used if it is not being used as the frame pointer.
+ if (!hasFP(MF)) {
+ if (SavedRegs.test(ARM::R7)) {
+ --RegDeficit;
+ DEBUG(dbgs() << "%R7 is saved low register, RegDeficit = "
+ << RegDeficit << "\n");
+ } else {
+ AvailableRegs.push_back(ARM::R7);
+ DEBUG(dbgs()
+ << "%R7 is non-saved low register, adding to AvailableRegs\n");
+ }
+ }
+
+ // Each of r8-r11 needs to be copied to a low register, then pushed.
+ for (unsigned Reg : {ARM::R8, ARM::R9, ARM::R10, ARM::R11}) {
+ if (SavedRegs.test(Reg)) {
+ ++RegDeficit;
+ DEBUG(dbgs() << PrintReg(Reg, TRI)
+ << " is saved high register, RegDeficit = " << RegDeficit
+ << "\n");
+ }
+ }
+
+ // LR can only be used by PUSH, not POP, and can't be used at all if the
+ // llvm.returnaddress intrinsic is used. This is only worth doing if we
+ // are more limited at function entry than exit.
+ if ((EntryRegDeficit > ExitRegDeficit) &&
+ !(MF.getRegInfo().isLiveIn(ARM::LR) &&
+ MF.getFrameInfo().isReturnAddressTaken())) {
+ if (SavedRegs.test(ARM::LR)) {
+ --RegDeficit;
+ DEBUG(dbgs() << "%LR is saved register, RegDeficit = " << RegDeficit
+ << "\n");
+ } else {
+ AvailableRegs.push_back(ARM::LR);
+ DEBUG(dbgs() << "%LR is not saved, adding to AvailableRegs\n");
+ }
+ }
+
+ // If there are more high registers that need pushing than low registers
+ // available, push some more low registers so that we can use fewer push
+ // instructions. This might not reduce RegDeficit all the way to zero,
+ // because we can only guarantee that r4-r6 are available, but r8-r11 may
+ // need saving.
+ DEBUG(dbgs() << "Final RegDeficit = " << RegDeficit << "\n");
+ for (; RegDeficit > 0 && !AvailableRegs.empty(); --RegDeficit) {
+ unsigned Reg = AvailableRegs.pop_back_val();
+ DEBUG(dbgs() << "Spilling " << PrintReg(Reg, TRI)
+ << " to make up reg deficit\n");
+ SavedRegs.set(Reg);
+ NumGPRSpills++;
+ CS1Spilled = true;
+ ExtraCSSpill = true;
+ // NOTE(review): this erase assumes Reg is still present in
+ // UnspilledCS1GPRs (find() must not return end()) - confirm.
+ UnspilledCS1GPRs.erase(find(UnspilledCS1GPRs, Reg));
+ if (Reg == ARM::LR)
+ LRSpilled = true;
+ }
+ DEBUG(dbgs() << "After adding spills, RegDeficit = " << RegDeficit << "\n");
+ }
+
+ // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
+ // Spill LR as well so we can fold BX_RET to the registers restore (LDM).
+ if (!LRSpilled && CS1Spilled) {
+ SavedRegs.set(ARM::LR);
+ NumGPRSpills++;
+ SmallVectorImpl<unsigned>::iterator LRPos;
+ LRPos = find(UnspilledCS1GPRs, (unsigned)ARM::LR);
+ if (LRPos != UnspilledCS1GPRs.end())
+ UnspilledCS1GPRs.erase(LRPos);
+
+ ForceLRSpill = false;
+ ExtraCSSpill = true;
+ }
+
+ // If stack and double are 8-byte aligned and we are spilling an odd number
+ // of GPRs, spill one extra callee save GPR so we won't have to pad between
+ // the integer and double callee save areas.
+ DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n");
+ unsigned TargetAlign = getStackAlignment();
+ if (TargetAlign >= 8 && (NumGPRSpills & 1)) {
+ if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+ for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+ unsigned Reg = UnspilledCS1GPRs[i];
+ // Don't spill high register if the function is thumb. In the case of
+ // Windows on ARM, accept R11 (frame pointer)
+ if (!AFI->isThumbFunction() ||
+ (STI.isTargetWindows() && Reg == ARM::R11) ||
+ isARMLowRegister(Reg) || Reg == ARM::LR) {
+ SavedRegs.set(Reg);
+ DEBUG(dbgs() << "Spilling " << PrintReg(Reg, TRI)
+ << " to make up alignment\n");
+ if (!MRI.isReserved(Reg))
+ ExtraCSSpill = true;
+ break;
+ }
+ }
+ } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
+ unsigned Reg = UnspilledCS2GPRs.front();
+ SavedRegs.set(Reg);
+ DEBUG(dbgs() << "Spilling " << PrintReg(Reg, TRI)
+ << " to make up alignment\n");
+ if (!MRI.isReserved(Reg))
+ ExtraCSSpill = true;
+ }
+ }
+
+ // Estimate if we might need to scavenge a register at some point in order
+ // to materialize a stack offset. If so, either spill one additional
+ // callee-saved register or reserve a special spill slot to facilitate
+ // register scavenging. Thumb1 needs a spill slot for stack pointer
+ // adjustments also, even when the frame itself is small.
+ if (BigStack && !ExtraCSSpill) {
+ // If any non-reserved CS register isn't spilled, just spill one or two
+ // extra. That should take care of it!
+ unsigned NumExtras = TargetAlign / 4;
+ SmallVector<unsigned, 2> Extras;
+ while (NumExtras && !UnspilledCS1GPRs.empty()) {
+ unsigned Reg = UnspilledCS1GPRs.back();
+ UnspilledCS1GPRs.pop_back();
+ if (!MRI.isReserved(Reg) &&
+ (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
+ Reg == ARM::LR)) {
+ Extras.push_back(Reg);
+ NumExtras--;
+ }
+ }
+ // For non-Thumb1 functions, also check for hi-reg CS registers
+ if (!AFI->isThumb1OnlyFunction()) {
+ while (NumExtras && !UnspilledCS2GPRs.empty()) {
+ unsigned Reg = UnspilledCS2GPRs.back();
+ UnspilledCS2GPRs.pop_back();
+ if (!MRI.isReserved(Reg)) {
+ Extras.push_back(Reg);
+ NumExtras--;
+ }
+ }
+ }
+ // Only spill the extras if all the requested ones were found; otherwise
+ // fall back to a scavenging slot (non-Thumb1 only).
+ if (Extras.size() && NumExtras == 0) {
+ for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+ SavedRegs.set(Extras[i]);
+ }
+ } else if (!AFI->isThumb1OnlyFunction()) {
+ // note: Thumb1 functions spill to R12, not the stack. Reserve a slot
+ // closest to SP or frame pointer.
+ assert(RS && "Register scavenging not provided");
+ const TargetRegisterClass *RC = &ARM::GPRRegClass;
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
+ RC->getAlignment(),
+ false));
+ }
+ }
+ }
+
+ // LR is spilled purely to allow a far jump; record that on the function
+ // info.
+ if (ForceLRSpill) {
+ SavedRegs.set(ARM::LR);
+ AFI->setLRIsSpilledForFarJump(true);
+ }
+}
+
+/// Replace the call-frame pseudo instruction at \p I with a real SP update.
+///
+/// When the call frame is not reserved and the adjustment is non-zero,
+/// ADJCALLSTACKDOWN becomes "sub sp, sp, amount" and ADJCALLSTACKUP becomes
+/// "add sp, sp, amount", with the amount rounded by alignSPAdjust(). The
+/// pseudo itself is erased in every case.
+///
+/// \returns the result of MBB.erase(I).
+MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  // With a reserved call frame the pseudo simply disappears.
+  if (hasReservedCallFrame(MF))
+    return MBB.erase(I);
+
+  MachineInstr &Pseudo = *I;
+  DebugLoc Loc = Pseudo.getDebugLoc();
+  unsigned Bytes = Pseudo.getOperand(0).getImm();
+  if (Bytes == 0)
+    return MBB.erase(I);
+
+  // We need to keep the stack aligned properly. To do this, we round the
+  // amount of space needed for the outgoing arguments up to the next
+  // alignment boundary.
+  Bytes = alignSPAdjust(Bytes);
+
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This eliminateCallFramePseudoInstr does not support Thumb1!");
+  const bool IsARMMode = !AFI->isThumbFunction();
+
+  // Carry the pseudo's predicate over to the replacement instruction.
+  const unsigned Opc = Pseudo.getOpcode();
+  const int PredIdx = Pseudo.findFirstPredOperandIdx();
+  ARMCC::CondCodes Pred = ARMCC::AL;
+  if (PredIdx != -1)
+    Pred = (ARMCC::CondCodes)Pseudo.getOperand(PredIdx).getImm();
+
+  if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+    // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+    unsigned PredReg = Pseudo.getOperand(2).getReg();
+    emitSPUpdate(IsARMMode, MBB, I, Loc, TII, -Bytes, MachineInstr::NoFlags,
+                 Pred, PredReg);
+    return MBB.erase(I);
+  }
+
+  // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+  unsigned PredReg = Pseudo.getOperand(3).getReg();
+  assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+  emitSPUpdate(IsARMMode, MBB, I, Loc, TII, Bytes, MachineInstr::NoFlags,
+               Pred, PredReg);
+  return MBB.erase(I);
+}
+
+/// Round \p Value up to the nearest constant ARM can encode as an immediate.
+/// In ARM, such constants are exactly the values that can be produced by
+/// rotating an 8-bit value to the right by an even number of bits within a
+/// 32-bit word. Zero maps to zero.
+static uint32_t alignToARMConstant(uint32_t Value) {
+  if (!Value)
+    return 0;
+
+  // Normalise: shift left two bits at a time until one of the two most
+  // significant bits is set, remembering how far we went.
+  unsigned LeftShift = 0;
+  for (; (Value >> 30) == 0; LeftShift += 2)
+    Value <<= 2;
+
+  // Keep the top byte and round it up if any of the discarded low bits
+  // were set.
+  uint32_t TopByte = Value >> 24;
+  if ((Value & 0x00FFFFFF) != 0)
+    ++TopByte;
+
+  // The increment may have carried into bit 8.
+  if (TopByte & 0x0000100)
+    TopByte &= 0x000001FC;
+
+  // Undo the normalisation to put the byte back where it came from.
+  return LeftShift > 24 ? TopByte >> (LeftShift - 24)
+                        : TopByte << (24 - LeftShift);
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual
+// stack limit. adjustForSegmentedStacks() compares the rounded frame size
+// against this value to decide whether SP itself can be checked against the
+// limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+// Adjust the function prologue to enable split stacks. This currently only
+// supports android and linux.
+//
+// Five new basic blocks are created and inserted in front of PrologueMBB;
+// they check the stack limit and call __morestack when more stack is needed.
+// Nothing is emitted when the function's stack size is zero.
+//
+// The ABI of the segmented stack prologue is a little arbitrarily chosen, but
+// must be well defined in order to allow for consistent implementations of the
+// __morestack helper function. The ABI is also not a normal ABI in that it
+// doesn't follow the normal calling conventions because this allows the
+// prologue of each function to be optimized further.
+//
+// Currently, the ABI looks like (when calling __morestack)
+//
+// * r4 holds the minimum stack size requested for this function call
+// * r5 holds the stack size of the arguments to the function
+// * the beginning of the function is 3 instructions after the call to
+// __morestack
+//
+// Implementations of __morestack should use r4 to allocate a new stack, r5 to
+// place the arguments on to the new stack, and the 3-instruction knowledge to
+// jump directly to the body of the function when working on the new stack.
+//
+// An old (and possibly no longer compatible) implementation of __morestack for
+// ARM can be found at [1].
+//
+// [1] - https://github.com/mozilla/rust/blob/86efd9/src/rt/arch/arm/morestack.S
+void ARMFrameLowering::adjustForSegmentedStacks(
+ MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+ unsigned Opcode;
+ unsigned CFIIndex;
+ const ARMSubtarget *ST = &MF.getSubtarget<ARMSubtarget>();
+ bool Thumb = ST->isThumb();
+
+ // Sadly, this currently doesn't support varargs or platforms other than
+ // android/linux. Note that thumb1/thumb2 are supported for android/linux.
+ if (MF.getFunction()->isVarArg())
+ report_fatal_error("Segmented stacks do not support vararg functions.");
+ if (!ST->isTargetAndroid() && !ST->isTargetLinux())
+ report_fatal_error("Segmented stacks not supported on this platform.");
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ MCContext &Context = MMI.getContext();
+ const MCRegisterInfo *MRI = Context.getRegisterInfo();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
+ DebugLoc DL;
+
+ uint64_t StackSize = MFI.getStackSize();
+
+ // Do not generate a prologue for functions with a stack of size zero
+ if (StackSize == 0)
+ return;
+
+ // Use R4 and R5 as scratch registers.
+ // We save R4 and R5 before use and restore them before leaving the function.
+ unsigned ScratchReg0 = ARM::R4;
+ unsigned ScratchReg1 = ARM::R5;
+ uint64_t AlignedStackSize;
+
+ MachineBasicBlock *PrevStackMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *PostStackMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *AllocMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock();
+
+ // Grab everything that reaches PrologueMBB to update their liveness as well.
+ SmallPtrSet<MachineBasicBlock *, 8> BeforePrologueRegion;
+ SmallVector<MachineBasicBlock *, 2> WalkList;
+ WalkList.push_back(&PrologueMBB);
+
+ // Worklist walk over the predecessors, transitively.
+ do {
+ MachineBasicBlock *CurMBB = WalkList.pop_back_val();
+ for (MachineBasicBlock *PredBB : CurMBB->predecessors()) {
+ if (BeforePrologueRegion.insert(PredBB).second)
+ WalkList.push_back(PredBB);
+ }
+ } while (!WalkList.empty());
+
+ // The order in that list is important.
+ // The blocks will all be inserted before PrologueMBB using that order.
+ // Therefore the block that should appear first in the CFG should appear
+ // first in the list.
+ MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB,
+ PostStackMBB};
+
+ for (MachineBasicBlock *B : AddedBlocks)
+ BeforePrologueRegion.insert(B);
+
+ // Every live-in of PrologueMBB becomes a live-in of all blocks in the region,
+ // including the newly created ones.
+ for (const auto &LI : PrologueMBB.liveins()) {
+ for (MachineBasicBlock *PredBB : BeforePrologueRegion)
+ PredBB->addLiveIn(LI);
+ }
+
+ // Remove the newly added blocks from the list, since we know
+ // we do not have to do the following updates for them.
+ for (MachineBasicBlock *B : AddedBlocks) {
+ BeforePrologueRegion.erase(B);
+ MF.insert(PrologueMBB.getIterator(), B);
+ }
+
+ for (MachineBasicBlock *MBB : BeforePrologueRegion) {
+ // Make sure the LiveIns are still sorted and unique.
+ MBB->sortUniqueLiveIns();
+ // Replace the edges to PrologueMBB by edges to the sequences
+ // we are about to add.
+ MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]);
+ }
+
+ // The required stack size that is aligned to ARM constant criterion.
+ AlignedStackSize = alignToARMConstant(StackSize);
+
+ // When the frame size is less than 256 we just compare the stack
+ // boundary directly to the value of the stack pointer, per gcc.
+ bool CompareStackPointer = AlignedStackSize < kSplitStackAvailable;
+
+ // We will use two of the callee save registers as scratch registers so we
+ // need to save those registers onto the stack.
+ // We will use SR0 to hold stack limit and SR1 to hold the stack size
+ // requested and arguments for __morestack().
+ // SR0: Scratch Register #0
+ // SR1: Scratch Register #1
+ // push {SR0, SR1}
+ if (Thumb) {
+ AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH)))
+ .addReg(ScratchReg0).addReg(ScratchReg1);
+ } else {
+ AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define).addReg(ARM::SP))
+ .addReg(ScratchReg0).addReg(ScratchReg1);
+ }
+
+ // Emit the relevant DWARF information about the change in stack pointer as
+ // well as where to find both r4 and r5 (the callee-save registers)
+ CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -8));
+ BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4));
+ BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8));
+ BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // mov SR1, sp
+ if (Thumb) {
+ AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
+ .addReg(ARM::SP));
+ } else if (CompareStackPointer) {
+ AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
+ .addReg(ARM::SP)).addReg(0);
+ }
+
+ // sub SR1, sp, #StackSize
+ if (!CompareStackPointer && Thumb) {
+ AddDefaultPred(
+ AddDefaultCC(BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1))
+ .addReg(ScratchReg1).addImm(AlignedStackSize));
+ } else if (!CompareStackPointer) {
+ AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
+ .addReg(ARM::SP).addImm(AlignedStackSize)).addReg(0);
+ }
+
+ if (Thumb && ST->isThumb1Only()) {
+ // Thumb1: load the stack limit through a constant-pool entry for the
+ // __STACK_LIMIT symbol.
+ unsigned PCLabelId = ARMFI->createPICLabelUId();
+ ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create(
+ MF.getFunction()->getContext(), "__STACK_LIMIT", PCLabelId, 0);
+ MachineConstantPool *MCP = MF.getConstantPool();
+ unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4);
+
+ // ldr SR0, [pc, offset(STACK_LIMIT)]
+ AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
+ .addConstantPoolIndex(CPI));
+
+ // ldr SR0, [SR0]
+ AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
+ .addReg(ScratchReg0).addImm(0));
+ } else {
+ // NOTE(review): this branch is also taken for Thumb2 functions, yet it
+ // emits ARM::MRC and ARM::LDRi12 - confirm these opcodes are valid there.
+ // Get TLS base address from the coprocessor
+ // mrc p15, #0, SR0, c13, c0, #3
+ AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
+ .addImm(15)
+ .addImm(0)
+ .addImm(13)
+ .addImm(0)
+ .addImm(3));
+
+ // Use the last tls slot on android and a private field of the TCB on linux.
+ assert(ST->isTargetAndroid() || ST->isTargetLinux());
+ unsigned TlsOffset = ST->isTargetAndroid() ? 63 : 1;
+
+ // Get the stack limit from the right offset
+ // ldr SR0, [sr0, #4 * TlsOffset]
+ AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
+ .addReg(ScratchReg0).addImm(4 * TlsOffset));
+ }
+
+ // Compare stack limit with stack size requested.
+ // cmp SR0, SR1
+ Opcode = Thumb ? ARM::tCMPr : ARM::CMPrr;
+ AddDefaultPred(BuildMI(GetMBB, DL, TII.get(Opcode))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1));
+
+ // This jump is taken if StackLimit < SP - stack required.
+ Opcode = Thumb ? ARM::tBcc : ARM::Bcc;
+ BuildMI(GetMBB, DL, TII.get(Opcode)).addMBB(PostStackMBB)
+ .addImm(ARMCC::LO)
+ .addReg(ARM::CPSR);
+
+
+ // Calling __morestack(StackSize, Size of stack arguments).
+ // __morestack knows that the stack size requested is in SR0(r4)
+ // and amount size of stack arguments is in SR1(r5).
+
+ // Pass first argument for the __morestack by Scratch Register #0.
+ // The amount size of stack required
+ if (Thumb) {
+ AddDefaultPred(AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8),
+ ScratchReg0)).addImm(AlignedStackSize));
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
+ .addImm(AlignedStackSize)).addReg(0);
+ }
+ // Pass second argument for the __morestack by Scratch Register #1.
+ // The amount size of stack consumed to save function arguments.
+ if (Thumb) {
+ AddDefaultPred(
+ AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1))
+ .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())));
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
+ .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())))
+ .addReg(0);
+ }
+
+ // push {lr} - Save return address of this function.
+ if (Thumb) {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH)))
+ .addReg(ARM::LR);
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP))
+ .addReg(ARM::LR);
+ }
+
+ // Emit the DWARF info about the change in stack as well as where to find the
+ // previous link register
+ CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -12));
+ BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(ARM::LR, true), -12));
+ BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Call __morestack().
+ if (Thumb) {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tBL)))
+ .addExternalSymbol("__morestack");
+ } else {
+ BuildMI(AllocMBB, DL, TII.get(ARM::BL))
+ .addExternalSymbol("__morestack");
+ }
+
+ // pop {lr} - Restore return address of this original function.
+ if (Thumb) {
+ if (ST->isThumb1Only()) {
+ // Thumb1: pop into SR0 and then move SR0 into LR.
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+ .addReg(ScratchReg0);
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
+ .addReg(ScratchReg0));
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
+ .addReg(ARM::LR, RegState::Define)
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .addImm(4));
+ }
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP))
+ .addReg(ARM::LR);
+ }
+
+ // Restore SR0 and SR1 in case __morestack() was called.
+ // __morestack() will skip PostStackMBB block so we need to restore
+ // scratch registers from here.
+ // pop {SR0, SR1}
+ if (Thumb) {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
+ }
+
+ // Update the CFA offset now that we've popped
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+ BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // bx lr - Return from this function.
+ Opcode = Thumb ? ARM::tBX_RET : ARM::BX_RET;
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(Opcode)));
+
+ // Restore SR0 and SR1 in case __morestack() was not called.
+ // pop {SR0, SR1}
+ if (Thumb) {
+ AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP)))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
+ } else {
+ AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
+ }
+
+ // Update the CFA offset now that we've popped
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+ BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Tell debuggers that r4 and r5 are now the same as they were in the
+ // previous function, that they're the "Same Value".
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue(
+ nullptr, MRI->getDwarfRegNum(ScratchReg0, true)));
+ BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue(
+ nullptr, MRI->getDwarfRegNum(ScratchReg1, true)));
+ BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Organizing MBB lists
+ PostStackMBB->addSuccessor(&PrologueMBB);
+
+ AllocMBB->addSuccessor(PostStackMBB);
+
+ GetMBB->addSuccessor(PostStackMBB);
+ GetMBB->addSuccessor(AllocMBB);
+
+ McrMBB->addSuccessor(GetMBB);
+
+ PrevStackMBB->addSuccessor(McrMBB);
+
+#ifdef EXPENSIVE_CHECKS
+ MF.verify();
+#endif
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
new file mode 100644
index 000000000000..21cd78da395c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -0,0 +1,85 @@
+//===- ARMFrameLowering.h - Define frame lowering for ARM -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM implementation of TargetFrameLowering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
+#define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+ class ARMSubtarget;
+
+class ARMFrameLowering : public TargetFrameLowering {
+protected:
+ const ARMSubtarget &STI; // Subtarget reference; presumably bound from the constructor's argument.
+
+public:
+ explicit ARMFrameLowering(const ARMSubtarget &sti);
+
+ /// emitPrologue/emitEpilogue - These methods insert prologue and epilogue
+ /// code into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool noFramePointerElim(const MachineFunction &MF) const override;
+
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+ int ResolveFrameIndexReference(const MachineFunction &MF, int FI, // Non-virtual ARM-specific variant that additionally takes an SP adjustment.
+ unsigned &FrameReg, int SPAdj) const;
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
+
+ void adjustForSegmentedStacks(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override;
+
+ /// Returns true if the target will correctly handle shrink wrapping. The
+ /// ARM implementation unconditionally answers true.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ private:
+ void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, // Private helper; its definition is not in this header.
+ const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc,
+ unsigned StrOpc, bool NoGap,
+ bool(*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs,
+ unsigned MIFlags = 0) const;
+ void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, // Private helper; its definition is not in this header.
+ const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc,
+ unsigned LdrOpc, bool isVarArg, bool NoGap,
+ bool(*Func)(unsigned, bool),
+ unsigned NumAlignedDPRCS2Regs) const;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
new file mode 100644
index 000000000000..0d904ecb6296
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -0,0 +1,101 @@
+//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMHazardRecognizer.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+/// Returns true if \p MI is a VFP- or NEON-domain instruction that reads the
+/// register held in operand 0 of \p DefMI. Stores and the VMOVRS / VMOVRRD
+/// opcodes never count as a hazard.
+static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
+                         const TargetRegisterInfo &TRI) {
+  // FIXME: Detect integer instructions properly.
+  if (MI->mayStore())
+    return false;
+
+  const MCInstrDesc &Desc = MI->getDesc();
+  switch (Desc.getOpcode()) {
+  case ARM::VMOVRS:
+  case ARM::VMOVRRD:
+    return false;
+  default:
+    break;
+  }
+
+  // Only instructions in the VFP or NEON domain are of interest.
+  const unsigned Domain = Desc.TSFlags & ARMII::DomainMask;
+  if (!(Domain & ARMII::DomainVFP) && !(Domain & ARMII::DomainNEON))
+    return false;
+
+  return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
+}
+
+ScheduleHazardRecognizer::HazardType
+ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead");
+
+ MachineInstr *MI = SU->getInstr();
+
+ if (!MI->isDebugValue()) {
+ // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following
+ // a VMLA / VMLS will cause a 4 cycle stall.
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {
+ MachineInstr *DefMI = LastMI; // Candidate FP MLx def: the last emitted instruction by default.
+ const MCInstrDesc &LastMCID = LastMI->getDesc();
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
+ MF->getSubtarget().getInstrInfo());
+
+ // Skip over one general-domain (non-VFP / NEON) instruction, unless it is
+ // a barrier or, on subtargets with muxed units, a load or store.
+ if (!LastMI->isBarrier() &&
+ !(TII.getSubtarget().hasMuxedUnits() && LastMI->mayLoadOrStore()) &&
+ (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
+ MachineBasicBlock::iterator I = LastMI;
+ if (I != LastMI->getParent()->begin()) { // Only step back if LastMI is not the first instruction of its block.
+ I = std::prev(I);
+ DefMI = &*I;
+ }
+ }
+
+ if (TII.isFpMLxInstruction(DefMI->getOpcode()) &&
+ (TII.canCauseFpMLxStall(MI->getOpcode()) ||
+ hasRAWHazard(DefMI, MI, TII.getRegisterInfo()))) {
+ // Try to schedule another instruction for the next 4 cycles.
+ if (FpMLxStalls == 0) // Start the countdown only once; AdvanceCycle() decrements it.
+ FpMLxStalls = 4;
+ return Hazard;
+ }
+ }
+ }
+
+ return ScoreboardHazardRecognizer::getHazardType(SU, Stalls); // No FP MLx hazard detected; defer to the scoreboard.
+}
+
+/// Forget the last emitted instruction and any pending FP MLx stall count,
+/// then reset the underlying scoreboard recognizer.
+void ARMHazardRecognizer::Reset() {
+  FpMLxStalls = 0;
+  LastMI = nullptr;
+  ScoreboardHazardRecognizer::Reset();
+}
+
+/// Remember \p SU's instruction as the most recently emitted one and clear the
+/// FP MLx stall counter (debug values are ignored for both), then forward the
+/// notification to the scoreboard recognizer.
+void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
+  MachineInstr *Instr = SU->getInstr();
+  const bool IsRealInstr = !Instr->isDebugValue();
+  if (IsRealInstr) {
+    FpMLxStalls = 0;
+    LastMI = Instr;
+  }
+
+  ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+/// Count down a pending FP MLx stall by one cycle. When the countdown reaches
+/// zero the tracked instruction is dropped so it no longer reports a hazard.
+/// The scoreboard recognizer is advanced in every case.
+void ARMHazardRecognizer::AdvanceCycle() {
+  if (FpMLxStalls != 0) {
+    --FpMLxStalls;
+    // Stalled for 4 cycles but still can't schedule any other instructions.
+    if (FpMLxStalls == 0)
+      LastMI = nullptr;
+  }
+  ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void ARMHazardRecognizer::RecedeCycle() { // Reverse hazard checking is not implemented; reaching this is a bug.
+ llvm_unreachable("reverse ARM hazard checking unsupported");
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
new file mode 100644
index 000000000000..ccf09db69937
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
@@ -0,0 +1,49 @@
+//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling ARM functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H
+#define LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H
+
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+
+namespace llvm {
+
+class ARMBaseInstrInfo;
+class ARMBaseRegisterInfo;
+class ARMSubtarget;
+class MachineInstr;
+
+/// ARMHazardRecognizer handles special constraints that are not expressed in
+/// the scheduling itinerary. This is only used during postRA scheduling. The
+/// ARM preRA scheduler uses an unspecialized instance of the
+/// ScoreboardHazardRecognizer.
+class ARMHazardRecognizer : public ScoreboardHazardRecognizer {
+  /// Most recently emitted non-debug instruction, or null when none is
+  /// being tracked.
+  MachineInstr *LastMI;
+  /// Remaining cycles of an FP MLx stall being waited out; 0 when idle.
+  unsigned FpMLxStalls;
+
+public:
+  /// Both tracking members start out cleared so that AdvanceCycle() and
+  /// getHazardType() never read an indeterminate FpMLxStalls, even when they
+  /// run before the first Reset() or EmitInstruction().
+  ARMHazardRecognizer(const InstrItineraryData *ItinData,
+                      const ScheduleDAG *DAG)
+      : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"),
+        LastMI(nullptr), FpMLxStalls(0) {}
+
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void Reset() override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
new file mode 100644
index 000000000000..c3e9591d5c70
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -0,0 +1,4682 @@
+//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-isel"
+
+static cl::opt<bool>
+DisableShifterOp("disable-shifter-op", cl::Hidden,
+ cl::desc("Disable isel of shifter-op"),
+ cl::init(false)); // Off by default; when set, SelectImmShifterOperand / SelectRegShifterOperand return false.
+
+//===--------------------------------------------------------------------===//
+/// ARMDAGToDAGISel - ARM specific code to select ARM machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+
+enum AddrMode2Type {
+ AM2_BASE, // Simple addressing mode 2 (base +/- imm12)
+ AM2_SHOP // Addressing mode 2 with a shifter-op offset
+};
+
+class ARMDAGToDAGISel : public SelectionDAGISel {
+ /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const ARMSubtarget *Subtarget;
+
+public:
+ explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel) {} // Subtarget stays unset here; runOnMachineFunction() assigns it.
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Reset the subtarget each time through.
+ Subtarget = &MF.getSubtarget<ARMSubtarget>();
+ SelectionDAGISel::runOnMachineFunction(MF);
+ return true; // The base-class return value is discarded; true is always returned.
+ }
+
+ StringRef getPassName() const override { return "ARM Instruction Selection"; }
+
+ void PreprocessISelDAG() override;
+
+ /// getI32Imm - Return a target constant of type i32 with the specified
+ /// value.
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+ }
+
+ void Select(SDNode *N) override;
+
+ bool hasNoVMLxHazardUse(SDNode *N) const;
+ bool isShifterOpProfitable(const SDValue &Shift,
+ ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
+ bool SelectRegShifterOperand(SDValue N, SDValue &A,
+ SDValue &B, SDValue &C,
+ bool CheckProfitability = true);
+ bool SelectImmShifterOperand(SDValue N, SDValue &A,
+ SDValue &B, bool CheckProfitability = true);
+ bool SelectShiftRegShifterOperand(SDValue N, SDValue &A,
+ SDValue &B, SDValue &C) {
+ // Don't apply the profitability check
+ return SelectRegShifterOperand(N, A, B, C, false);
+ }
+ bool SelectShiftImmShifterOperand(SDValue N, SDValue &A,
+ SDValue &B) {
+ // Don't apply the profitability check
+ return SelectImmShifterOperand(N, A, B, false);
+ }
+
+ bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+ bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc);
+
+ AddrMode2Type SelectAddrMode2Worker(SDValue N, SDValue &Base,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode2Base(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Opc) {
+ return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_BASE; // Matches only the simple base form.
+ }
+
+ bool SelectAddrMode2ShOp(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Opc) {
+ return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_SHOP; // Matches only the shifter-op form.
+ }
+
+ bool SelectAddrMode2(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Opc) {
+ SelectAddrMode2Worker(N, Base, Offset, Opc);
+// The AM2_BASE / AM2_SHOP result of the worker is discarded here.
+ // This always matches one way or another.
+ return true;
+ }
+
+ bool SelectCMOVPred(SDValue N, SDValue &Pred, SDValue &Reg) {
+ const ConstantSDNode *CN = cast<ConstantSDNode>(N); // cast<> requires N to be a constant node.
+ Pred = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(N), MVT::i32);
+ Reg = CurDAG->getRegister(ARM::CPSR, MVT::i32);
+ return true;
+ }
+
+ bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrOffsetNone(SDValue N, SDValue &Base);
+ bool SelectAddrMode3(SDValue N, SDValue &Base,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode5(SDValue N, SDValue &Base,
+ SDValue &Offset);
+ bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
+ bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
+
+ bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label);
+
+ // Thumb Addressing Modes:
+ bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset);
+ bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm);
+
+ // Thumb 2 Addressing Modes:
+ bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+ bool SelectT2AddrModeImm8(SDValue N, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+ SDValue &OffImm);
+ bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
+ SDValue &OffReg, SDValue &ShImm);
+ bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
+
+ inline bool is_so_imm(unsigned Imm) const {
+ return ARM_AM::getSOImmVal(Imm) != -1; // -1 is getSOImmVal's "not encodable" result.
+ }
+
+ inline bool is_so_imm_not(unsigned Imm) const {
+ return ARM_AM::getSOImmVal(~Imm) != -1; // Same test applied to the bitwise complement.
+ }
+
+ inline bool is_t2_so_imm(unsigned Imm) const {
+ return ARM_AM::getT2SOImmVal(Imm) != -1; // -1 is getT2SOImmVal's "not encodable" result.
+ }
+
+ inline bool is_t2_so_imm_not(unsigned Imm) const {
+ return ARM_AM::getT2SOImmVal(~Imm) != -1; // Same test applied to the bitwise complement.
+ }
+
+ // Include the pieces autogenerated from the target description.
+#include "ARMGenDAGISel.inc"
+
+private:
+ void transferMemOperands(SDNode *Src, SDNode *Dst);
+
+ /// Indexed (pre/post inc/dec) load matching code for ARM.
+ bool tryARMIndexedLoad(SDNode *N);
+ bool tryT1IndexedLoad(SDNode *N);
+ bool tryT2IndexedLoad(SDNode *N);
+
+ /// SelectVLD - Select NEON load intrinsics. NumVecs should be
+ /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for
+ /// loads of D registers and even subregs and odd subregs of Q registers.
+ /// For NumVecs <= 2, QOpcodes1 is not used.
+ void SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1);
+
+ /// SelectVST - Select NEON store intrinsics. NumVecs should
+ /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for
+ /// stores of D registers and even subregs and odd subregs of Q registers.
+ /// For NumVecs <= 2, QOpcodes1 is not used.
+ void SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1);
+
+ /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should
+ /// be 2, 3 or 4. The opcode arrays specify the instructions used for
+ /// load/store of D registers and Q registers.
+ void SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+ unsigned NumVecs, const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes);
+
+ /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
+ /// should be 1, 2, 3 or 4. DOpcodes specifies the instructions used for
+ /// loading D registers; QOpcodes is optional (NOTE(review): confirm Q use).
+ void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes = nullptr);
+
+ /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
+ /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
+ /// generated to force the table registers to be consecutive.
+ void SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
+
+ /// Try to select SBFX/UBFX instructions for ARM.
+ bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+
+ // Select special operations if node forms integer ABS pattern
+ bool tryABSOp(SDNode *N);
+
+ bool tryReadRegister(SDNode *N);
+ bool tryWriteRegister(SDNode *N);
+
+ bool tryInlineAsm(SDNode *N);
+
+ void SelectConcatVector(SDNode *N);
+ void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI);
+
+ bool trySMLAWSMULW(SDNode *N);
+
+ void SelectCMP_SWAP(SDNode *N);
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+ // Form pairs of consecutive R, S, D, or Q registers.
+ SDNode *createGPRPairNode(EVT VT, SDValue V0, SDValue V1);
+ SDNode *createSRegPairNode(EVT VT, SDValue V0, SDValue V1);
+ SDNode *createDRegPairNode(EVT VT, SDValue V0, SDValue V1);
+ SDNode *createQRegPairNode(EVT VT, SDValue V0, SDValue V1);
+
+ // Form sequences of 4 consecutive S, D, or Q registers.
+ SDNode *createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+ SDNode *createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+ SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+
+ // Get the alignment operand for a NEON VLD or VST instruction.
+ SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs,
+ bool is64BitVector);
+
+ /// Returns the number of instructions required to materialize the given
+ /// constant in a register, or 3 if a literal pool load is needed.
+ unsigned ConstantMaterializationCost(unsigned Val) const;
+
+ /// Checks if N is a multiplication by a constant where we can extract out a
+ /// power of two from the constant so that it can be used in a shift, but only
+ /// if it simplifies the materialization of the constant. Returns true if it
+ /// is, and assigns to PowerOfTwo the power of two that should be extracted
+ /// out and to NewMulConst the new constant to be multiplied by.
+ bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift,
+ unsigned &PowerOfTwo, SDValue &NewMulConst) const;
+
+ /// Replace N with M in CurDAG, in a way that also ensures that M gets
+ /// selected when N would have been selected.
+ void replaceDAGValue(const SDValue &N, SDValue M);
+};
+}
+
+/// isInt32Immediate - Test whether \p N is an ISD::Constant node whose first
+/// result has type MVT::i32. On success \p Imm receives the constant's
+/// zero-extended value; otherwise \p Imm is left untouched.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  const bool IsI32Constant =
+      N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32;
+  if (!IsI32Constant)
+    return false;
+
+  Imm = cast<ConstantSDNode>(N)->getZExtValue();
+  return true;
+}
+
+// isInt32Immediate - This method tests to see if N is a 32-bit constant
+// operand. If so, Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+ return isInt32Immediate(N.getNode(), Imm); // Forward to the SDNode* overload.
+}
+
+// isOpcWithIntImmediate - Test whether the node has opcode Opc and a 32-bit
+// integer constant as its right-hand (index 1) operand.
+// If so, Imm receives that constant's value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  if (N->getOpcode() != Opc)
+    return false;
+
+  SDNode *RHS = N->getOperand(1).getNode();
+  return isInt32Immediate(RHS, Imm);
+}
+
+/// \brief Check whether a particular node is a constant value representable as
+/// (N * Scale) where N is in [\p RangeMin, \p RangeMax).
+///
+/// \param ScaledConstant [out] - On success, the pre-scaled constant value
+/// (the constant divided by \p Scale). It may also be overwritten when the
+/// node is a constant but the check fails.
+static bool isScaledConstantInRange(SDValue Node, int Scale,
+                                    int RangeMin, int RangeMax,
+                                    int &ScaledConstant) {
+  assert(Scale > 0 && "Invalid scale!");
+
+  // Anything that is not a constant node cannot match.
+  const ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Node);
+  if (Cst == nullptr)
+    return false;
+
+  // The raw value has to be an exact multiple of the scale.
+  ScaledConstant = (int) Cst->getZExtValue();
+  if (ScaledConstant % Scale)
+    return false;
+
+  ScaledConstant /= Scale;
+  const bool BelowMin = ScaledConstant < RangeMin;
+  const bool AtOrAboveMax = ScaledConstant >= RangeMax;
+  return !BelowMin && !AtOrAboveMax;
+}
+
+void ARMDAGToDAGISel::PreprocessISelDAG() {
+ if (!Subtarget->hasV6T2Ops()) // Presumably because the UBFX this rewrite aims for needs v6T2 - confirm.
+ return;
+
+ bool isThumb2 = Subtarget->isThumb(); // NOTE(review): named isThumb2 but tests isThumb(); v6T2 ops are already guaranteed above.
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ) {
+ SDNode *N = &*I++; // Advance the iterator before N is touched to avoid invalidation issues.
+
+ if (N->getOpcode() != ISD::ADD)
+ continue;
+
+ // Look for (add X1, (and (srl X2, c1), c2)) where c2 is constant with
+ // leading zeros, followed by consecutive set bits, followed by 1 or 2
+ // trailing zeros, e.g. 1020.
+ // Transform the expression to
+ // (add X1, (shl (and (srl X2, c1), (c2>>tz)), tz)) where tz is the number
+ // of trailing zeros of c2. The left shift would be folded as a shifter
+ // operand of 'add' and the 'and' and 'srl' would become a bits extraction
+ // node (UBFX).
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ unsigned And_imm = 0;
+ if (!isOpcWithIntImmediate(N1.getNode(), ISD::AND, And_imm)) {
+ if (isOpcWithIntImmediate(N0.getNode(), ISD::AND, And_imm))
+ std::swap(N0, N1); // Canonicalize so that the AND ends up in N1.
+ }
+ if (!And_imm) // No AND-with-constant operand was found (or its mask is zero).
+ continue;
+
+ // Check if the AND mask is an immediate of the form: 000.....1111111100
+ unsigned TZ = countTrailingZeros(And_imm);
+ if (TZ != 1 && TZ != 2)
+ // Be conservative here. Shifter operands aren't always free. e.g. On
+ // Swift, a left shifter operand of 1 / 2 is free but others are not.
+ // e.g.
+ // ubfx r3, r1, #16, #8
+ // ldr.w r3, [r0, r3, lsl #2]
+ // vs.
+ // mov.w r9, #1020
+ // and.w r2, r9, r1, lsr #14
+ // ldr r2, [r0, r2]
+ continue;
+ And_imm >>= TZ;
+ if (And_imm & (And_imm + 1)) // Reject unless the shifted mask is a contiguous run of low set bits.
+ continue;
+
+ // Look for (and (srl X, c1), c2).
+ SDValue Srl = N1.getOperand(0);
+ unsigned Srl_imm = 0;
+ if (!isOpcWithIntImmediate(Srl.getNode(), ISD::SRL, Srl_imm) ||
+ (Srl_imm <= 2))
+ continue;
+
+ // Make sure first operand is not a shifter operand which would prevent
+ // folding of the left shift.
+ SDValue CPTmp0;
+ SDValue CPTmp1;
+ SDValue CPTmp2;
+ if (isThumb2) {
+ if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1))
+ continue;
+ } else {
+ if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) ||
+ SelectRegShifterOperand(N0, CPTmp0, CPTmp1, CPTmp2))
+ continue;
+ }
+
+ // Now make the transformation: srl by (c1 + tz), and with (c2 >> tz),
+ // then shl by tz, and install the result as the add's second operand.
+ Srl = CurDAG->getNode(ISD::SRL, SDLoc(Srl), MVT::i32,
+ Srl.getOperand(0),
+ CurDAG->getConstant(Srl_imm + TZ, SDLoc(Srl),
+ MVT::i32));
+ N1 = CurDAG->getNode(ISD::AND, SDLoc(N1), MVT::i32,
+ Srl,
+ CurDAG->getConstant(And_imm, SDLoc(Srl), MVT::i32));
+ N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,
+ N1, CurDAG->getConstant(TZ, SDLoc(Srl), MVT::i32));
+ CurDAG->UpdateNodeOperands(N, N0, N1);
+ }
+}
+
+/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
+/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
+/// least on current ARM implementations) which should be avoided.
+bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
+  // Nothing to avoid when not optimizing or when the subtarget reports no
+  // VMLx hazards.
+  if (OptLevel == CodeGenOpt::None || !Subtarget->hasVMLxHazards())
+    return true;
+
+  if (!N->hasOneUse())
+    return false;
+
+  SDNode *User = *N->use_begin();
+  if (User->getOpcode() == ISD::CopyToReg)
+    return true;
+  if (!User->isMachineOpcode())
+    return false;
+
+  const ARMBaseInstrInfo *InstrInfo = static_cast<const ARMBaseInstrInfo *>(
+      CurDAG->getSubtarget().getInstrInfo());
+  const MCInstrDesc &Desc = InstrInfo->get(User->getMachineOpcode());
+  if (Desc.mayStore())
+    return true;
+
+  const unsigned UserOpc = Desc.getOpcode();
+  if (UserOpc == ARM::VMOVRS || UserOpc == ARM::VMOVRRD)
+    return true;
+
+  // vmlx feeding into another vmlx. We actually want to unfold
+  // the use later in the MLxExpansion pass. e.g.
+  // vmla
+  // vmla (stall 8 cycles)
+  //
+  // vmul (5 cycles)
+  // vadd (5 cycles)
+  // vmla
+  // This adds up to about 18 - 19 cycles.
+  //
+  // vmla
+  // vmul (stall 4 cycles)
+  // vadd adds up to about 14 cycles.
+  return InstrInfo->isFpMLxInstruction(UserOpc);
+}
+
+/// Decide whether folding \p Shift into a shifter operand is worthwhile.
+/// Always true unless the subtarget is A9-like or Swift; on those, the shift
+/// must either have a single use or be an LSL by 2 (or by 1 on Swift).
+bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
+                                            ARM_AM::ShiftOpc ShOpcVal,
+                                            unsigned ShAmt) {
+  const bool ShiftsMayCost = Subtarget->isLikeA9() || Subtarget->isSwift();
+  if (!ShiftsMayCost || Shift.hasOneUse())
+    return true;
+
+  // R << 2 is free.
+  if (ShOpcVal != ARM_AM::lsl)
+    return false;
+  if (ShAmt == 2)
+    return true;
+  return Subtarget->isSwift() && ShAmt == 1;
+}
+
+/// Returns the number of instructions needed to materialize \p Val in a
+/// register: 1 or 2 for the immediate forms recognised below, 2 when a
+/// MOVW + MOVT pair may be used, and 3 when a literal pool load is needed.
+unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
+  const bool HasV6T2 = Subtarget->hasV6T2Ops();
+
+  if (Subtarget->isThumb()) {
+    // One instruction: MOV, or (with v6T2) MOVW.
+    if (Val <= 255 ||
+        (HasV6T2 &&
+         (Val <= 0xffff || ARM_AM::getT2SOImmValSplatVal(Val) != -1)))
+      return 1;
+    // Two instructions: MOV + ADDi8, MOV + MVN, or MOV + LSL.
+    if (Val <= 510 || ~Val <= 255 || ARM_AM::isThumbImmShiftedVal(Val))
+      return 2;
+  } else {
+    // One instruction: MOV, MVN, or (with v6T2) MOVW.
+    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1 ||
+        (HasV6T2 && Val <= 0xffff))
+      return 1;
+    // Two instructions.
+    if (ARM_AM::isSOImmTwoPartVal(Val))
+      return 2;
+  }
+
+  // MOVW + MOVT when allowed, otherwise a literal pool load.
+  return Subtarget->useMovt(*MF) ? 2 : 3;
+}
+
+/// Checks whether \p N (which must be an ISD::MUL) multiplies by a constant
+/// from which a power of two, at most 2^MaxShift, can be split off so that the
+/// remaining constant is cheaper to materialize.
+///
+/// \param MaxShift    Largest shift amount to consider; must be in [1, 31].
+/// \param PowerOfTwo  [out] The shift amount that was split off.
+/// \param NewMulConst [out] Constant node holding the reduced multiplier.
+/// \returns true only if the reduced constant is strictly cheaper according
+/// to ConstantMaterializationCost(). Both out-parameters may be written even
+/// when false is returned.
+bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
+                                             unsigned MaxShift,
+                                             unsigned &PowerOfTwo,
+                                             SDValue &NewMulConst) const {
+  assert(N.getOpcode() == ISD::MUL);
+  assert(MaxShift > 0);
+  assert(MaxShift < 32 && "Shift amount must be smaller than the bit width");
+
+  // If the multiply is used in more than one place then changing the constant
+  // will make other uses incorrect, so don't.
+  if (!N.hasOneUse()) return false;
+  // Check if the multiply is by a constant
+  ConstantSDNode *MulConst = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!MulConst) return false;
+  // If the constant is used in more than one place then modifying it will mean
+  // we need to materialize two constants instead of one, which is a bad idea.
+  if (!MulConst->hasOneUse()) return false;
+  unsigned MulConstVal = MulConst->getZExtValue();
+  if (MulConstVal == 0) return false;
+
+  // Find the largest power of 2 that MulConstVal is a multiple of. The shift
+  // is done on an unsigned 1 so that a shift by 31 never goes through the
+  // sign bit of a signed int.
+  PowerOfTwo = MaxShift;
+  while ((MulConstVal % (1u << PowerOfTwo)) != 0) {
+    --PowerOfTwo;
+    if (PowerOfTwo == 0) return false;
+  }
+
+  // Only optimise if the new cost is better
+  unsigned NewMulConstVal = MulConstVal / (1u << PowerOfTwo);
+  NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32);
+  unsigned OldCost = ConstantMaterializationCost(MulConstVal);
+  unsigned NewCost = ConstantMaterializationCost(NewMulConstVal);
+  return NewCost < OldCost;
+}
+
+/// Replace every use of \p N with \p M. Before the replacement, M's node is
+/// passed to RepositionNode together with the list position of N's node.
+void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
+  SDNode *OldNode = N.getNode();
+  SDNode *NewNode = M.getNode();
+  CurDAG->RepositionNode(OldNode->getIterator(), NewNode);
+  CurDAG->ReplaceAllUsesWith(N, M);
+}
+
+/// Match \p N as a register shifted by an immediate. On success \p BaseReg is
+/// the value being shifted and \p Opc is a target constant built with
+/// ARM_AM::getSORegOpc. A multiply by a constant is also accepted when
+/// canExtractShiftFromMul() finds a power of two worth splitting off; the
+/// multiply's constant operand is then replaced in the DAG.
+///
+/// \p CheckProfitability is accepted but not consulted by this function.
+bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &Opc,
+                                              bool CheckProfitability) {
+  if (DisableShifterOp)
+    return false;
+
+  // If N is a multiply-by-constant and it's profitable to extract a shift and
+  // use it in a shifted operand do so.
+  if (N.getOpcode() == ISD::MUL) {
+    unsigned ShiftBy = 0;
+    SDValue ReducedMulConst;
+    if (canExtractShiftFromMul(N, 31, ShiftBy, ReducedMulConst)) {
+      // The handle keeps track of N across the replacement below.
+      HandleSDNode Handle(N);
+      replaceDAGValue(N.getOperand(1), ReducedMulConst);
+      BaseReg = Handle.getValue();
+      const unsigned Encoded = ARM_AM::getSORegOpc(ARM_AM::lsl, ShiftBy);
+      Opc = CurDAG->getTargetConstant(Encoded, SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  const ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+  if (ShOpcVal == ARM_AM::no_shift)
+    return false;
+
+  BaseReg = N.getOperand(0);
+
+  // Only a constant shift amount can be encoded as an immediate.
+  ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (ShiftAmt == nullptr)
+    return false;
+
+  const unsigned ShImmVal = ShiftAmt->getZExtValue() & 31;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  SDLoc(N), MVT::i32);
+  return true;
+}
+
+/// Match \p N as a register shifted by another register. On success
+/// \p BaseReg is the value being shifted, \p ShReg the shift-amount operand
+/// and \p Opc a target constant built with ARM_AM::getSORegOpc using an
+/// immediate of 0. Shifts by a constant are rejected here. When
+/// \p CheckProfitability is set, isShifterOpProfitable() must also agree.
+bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &ShReg,
+                                              SDValue &Opc,
+                                              bool CheckProfitability) {
+  if (DisableShifterOp)
+    return false;
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  const ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+  if (ShOpcVal == ARM_AM::no_shift)
+    return false;
+
+  BaseReg = N.getOperand(0);
+
+  // A constant shift amount belongs to the immediate form instead.
+  if (dyn_cast<ConstantSDNode>(N.getOperand(1)) != nullptr)
+    return false;
+
+  ShReg = N.getOperand(1);
+
+  // The register form carries no immediate shift amount.
+  const unsigned NoImmShift = 0;
+  if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, NoImmShift))
+    return false;
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, NoImmShift),
+                                  SDLoc(N), MVT::i32);
+  return true;
+}
+
+
+/// Match simple R + imm12 operands. This selector always succeeds: when no
+/// constant offset strictly between -0x1000 and 0x1000 can be split off,
+/// \p N itself (or the operand of a suitable ARMISD::Wrapper) becomes
+/// \p Base and \p OffImm is 0. Frame indices are turned into target frame
+/// indices.
+bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &OffImm) {
+  const unsigned Opcode = N.getOpcode();
+  const bool HasOffsetForm = Opcode == ISD::ADD || Opcode == ISD::SUB ||
+                             CurDAG->isBaseWithConstantOffset(N);
+
+  // Base only.
+  if (!HasOffsetForm) {
+    if (Opcode == ISD::FrameIndex) {
+      // Match frame index.
+      const int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    } else {
+      Base = N;
+      // Look through a wrapper unless it wraps a global address, an external
+      // symbol or a TLS global address.
+      if (Opcode == ARMISD::Wrapper) {
+        const unsigned Wrapped = N.getOperand(0).getOpcode();
+        if (Wrapped != ISD::TargetGlobalAddress &&
+            Wrapped != ISD::TargetExternalSymbol &&
+            Wrapped != ISD::TargetGlobalTLSAddress)
+          Base = N.getOperand(0);
+      }
+    }
+    OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    const int Raw = (int)Cst->getSExtValue();
+    const int Disp = Opcode == ISD::SUB ? -Raw : Raw;
+
+    if (Disp > -0x1000 && Disp < 0x1000) { // 12 bits
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        const int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(
+            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      }
+      OffImm = CurDAG->getTargetConstant(Disp, SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+  return true;
+}
+
+
+
+bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Opc) { // Match R +/- [possibly shifted] R; Opc receives the getAM2Opc(add/sub, shift amount, shift kind) encoding.
+ if (N.getOpcode() == ISD::MUL &&
+ ((!Subtarget->isLikeA9() && !Subtarget->isSwift()) || N.hasOneUse())) {
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ // X * [3,5,9] -> X + X * [2,4,8] etc. Base and Offset are both X, with the multiply expressed as an lsl.
+ int RHSC = (int)RHS->getZExtValue();
+ if (RHSC & 1) {
+ RHSC = RHSC & ~1; // Clear the low bit; the remaining factor must be a power of two below.
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (RHSC < 0) { // Negative factor: subtract the shifted value instead.
+ AddSub = ARM_AM::sub;
+ RHSC = - RHSC;
+ }
+ if (isPowerOf2_32(RHSC)) {
+ unsigned ShAmt = Log2_32(RHSC);
+ Base = Offset = N.getOperand(0);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+ ARM_AM::lsl),
+ SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+ }
+ }
+
+ if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+ // ISD::OR that is equivalent to an ISD::ADD.
+ !CurDAG->isBaseWithConstantOffset(N))
+ return false;
+
+ // Leave simple R +/- imm12 operands for LDRi12
+ if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) {
+ int RHSC; // Only written by isScaledConstantInRange; the value is not used afterwards here.
+ if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+ -0x1000+1, 0x1000, RHSC)) // 12 bits.
+ return false;
+ }
+
+ // Otherwise this is R +/- [possibly shifted] R.
+ ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add;
+ ARM_AM::ShiftOpc ShOpcVal =
+ ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
+ unsigned ShAmt = 0;
+
+ Base = N.getOperand(0);
+ Offset = N.getOperand(1);
+
+ if (ShOpcVal != ARM_AM::no_shift) {
+ // Check to see if the RHS of the shift is a constant, if not, we can't fold
+ // it.
+ if (ConstantSDNode *Sh =
+ dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+ ShAmt = Sh->getZExtValue();
+ if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+ Offset = N.getOperand(1).getOperand(0); // Fold the shift: the offset register is the shifted value.
+ else {
+ ShAmt = 0; // Not profitable: keep the whole shift node as a plain register offset.
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ } else {
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ }
+
+ // Try matching (R shl C) + (R). Only for non-SUB nodes when no shift was folded from operand 1.
+ if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+ !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+ N.getOperand(0).hasOneUse())) {
+ ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
+ if (ShOpcVal != ARM_AM::no_shift) {
+ // Check to see if the RHS of the shift is a constant, if not, we can't
+ // fold it.
+ if (ConstantSDNode *Sh =
+ dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+ ShAmt = Sh->getZExtValue();
+ if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
+ Offset = N.getOperand(0).getOperand(0); // Operands swap roles: shifted value is the offset,
+ Base = N.getOperand(1); // and the other operand becomes the base.
+ } else {
+ ShAmt = 0;
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ } else {
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ }
+ }
+
+ // If Offset is a multiply-by-constant and it's profitable to extract a shift
+ // and use it in a shifted operand do so.
+ if (Offset.getOpcode() == ISD::MUL && N.hasOneUse()) {
+ unsigned PowerOfTwo = 0;
+ SDValue NewMulConst;
+ if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) {
+ replaceDAGValue(Offset.getOperand(1), NewMulConst); // NOTE(review): presumably swaps the multiplier for the reduced constant in the DAG; confirm against the helper.
+ ShAmt = PowerOfTwo;
+ ShOpcVal = ARM_AM::lsl;
+ }
+ }
+
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+ SDLoc(N), MVT::i32);
+ return true;
+}
+
+
+//-----
+
+AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
+ SDValue &Base,
+ SDValue &Offset,
+ SDValue &Opc) { // Returns AM2_BASE for base / base +/- imm12 forms and AM2_SHOP for register (optionally shifted) offsets.
+ if (N.getOpcode() == ISD::MUL &&
+ (!(Subtarget->isLikeA9() || Subtarget->isSwift()) || N.hasOneUse())) {
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ // X * [3,5,9] -> X + X * [2,4,8] etc. Base and Offset are both X, with the multiply expressed as an lsl.
+ int RHSC = (int)RHS->getZExtValue();
+ if (RHSC & 1) {
+ RHSC = RHSC & ~1; // Clear the low bit; the remaining factor must be a power of two below.
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (RHSC < 0) { // Negative factor: subtract the shifted value instead.
+ AddSub = ARM_AM::sub;
+ RHSC = - RHSC;
+ }
+ if (isPowerOf2_32(RHSC)) {
+ unsigned ShAmt = Log2_32(RHSC);
+ Base = Offset = N.getOperand(0);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+ ARM_AM::lsl),
+ SDLoc(N), MVT::i32);
+ return AM2_SHOP;
+ }
+ }
+ }
+ }
+
+ if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+ // ISD::OR that is equivalent to an ADD.
+ !CurDAG->isBaseWithConstantOffset(N)) {
+ Base = N; // Base only; refined below for frame indexes and some wrappers.
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ } else if (N.getOpcode() == ARMISD::Wrapper && // Look through the wrapper unless it holds a global, external-symbol or TLS address.
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+ Base = N.getOperand(0);
+ }
+ Offset = CurDAG->getRegister(0, MVT::i32); // Register 0 marks "no offset register".
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+ ARM_AM::no_shift),
+ SDLoc(N), MVT::i32);
+ return AM2_BASE;
+ }
+
+ // Match simple R +/- imm12 operands.
+ if (N.getOpcode() != ISD::SUB) {
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+ -0x1000+1, 0x1000, RHSC)) { // 12 bits.
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+ Offset = CurDAG->getRegister(0, MVT::i32);
+
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (RHSC < 0) { // Encode a negative constant as sub with its magnitude.
+ AddSub = ARM_AM::sub;
+ RHSC = - RHSC;
+ }
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC,
+ ARM_AM::no_shift),
+ SDLoc(N), MVT::i32);
+ return AM2_BASE;
+ }
+ }
+
+ if ((Subtarget->isLikeA9() || Subtarget->isSwift()) && !N.hasOneUse()) {
+ // Compute R +/- (R << N) and reuse it. The whole node N becomes the base with no offset.
+ Base = N;
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+ ARM_AM::no_shift),
+ SDLoc(N), MVT::i32);
+ return AM2_BASE;
+ }
+
+ // Otherwise this is R +/- [possibly shifted] R.
+ ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub;
+ ARM_AM::ShiftOpc ShOpcVal =
+ ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
+ unsigned ShAmt = 0;
+
+ Base = N.getOperand(0);
+ Offset = N.getOperand(1);
+
+ if (ShOpcVal != ARM_AM::no_shift) {
+ // Check to see if the RHS of the shift is a constant, if not, we can't fold
+ // it.
+ if (ConstantSDNode *Sh =
+ dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+ ShAmt = Sh->getZExtValue();
+ if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+ Offset = N.getOperand(1).getOperand(0); // Fold the shift: the offset register is the shifted value.
+ else {
+ ShAmt = 0; // Not profitable: keep the whole shift node as a plain register offset.
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ } else {
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ }
+
+ // Try matching (R shl C) + (R). Only for non-SUB nodes when no shift was folded from operand 1.
+ if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+ !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+ N.getOperand(0).hasOneUse())) {
+ ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
+ if (ShOpcVal != ARM_AM::no_shift) {
+ // Check to see if the RHS of the shift is a constant, if not, we can't
+ // fold it.
+ if (ConstantSDNode *Sh =
+ dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+ ShAmt = Sh->getZExtValue();
+ if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
+ Offset = N.getOperand(0).getOperand(0); // Operands swap roles: shifted value is the offset,
+ Base = N.getOperand(1); // and the other operand becomes the base.
+ } else {
+ ShAmt = 0;
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ } else {
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ }
+ }
+
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+ SDLoc(N), MVT::i32);
+ return AM2_SHOP;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc) { // Register (optionally shifted) offset of an indexed load/store Op; N is the offset operand.
+ unsigned Opcode = Op->getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) // Any opcode other than LOAD is cast to a store node.
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) // Increment modes add; all others subtract.
+ ? ARM_AM::add : ARM_AM::sub;
+ int Val;
+ if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) // Constants accepted by this range check are rejected here, not matched as a register offset.
+ return false;
+
+ Offset = N;
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+ unsigned ShAmt = 0;
+ if (ShOpcVal != ARM_AM::no_shift) {
+ // Check to see if the RHS of the shift is a constant, if not, we can't fold
+ // it.
+ if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ ShAmt = Sh->getZExtValue();
+ if (isShifterOpProfitable(N, ShOpcVal, ShAmt))
+ Offset = N.getOperand(0); // Fold the shift: the offset register is the shifted value.
+ else {
+ ShAmt = 0; // Not profitable: keep N itself as a plain register offset.
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ } else {
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ }
+
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+ SDLoc(N), MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc) { // Immediate offset of an indexed load/store; Opc receives the signed value itself, not a getAM2Opc encoding.
+ unsigned Opcode = Op->getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) // Any opcode other than LOAD is cast to a store node.
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) // Increment modes add; all others subtract.
+ ? ARM_AM::add : ARM_AM::sub;
+ int Val;
+ if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+ if (AddSub == ARM_AM::sub) Val *= -1; // Decrement modes carry a negative immediate.
+ Offset = CurDAG->getRegister(0, MVT::i32); // Register 0: no offset register.
+ Opc = CurDAG->getTargetConstant(Val, SDLoc(Op), MVT::i32);
+ return true;
+ }
+
+ return false; // N is not a constant accepted by the range check above.
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc) { // Immediate offset of an indexed load/store; Opc receives getAM2Opc(add/sub, Val, no_shift).
+ unsigned Opcode = Op->getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) // Any opcode other than LOAD is cast to a store node.
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) // Increment modes add; all others subtract.
+ ? ARM_AM::add : ARM_AM::sub;
+ int Val;
+ if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+ Offset = CurDAG->getRegister(0, MVT::i32); // Register 0: no offset register.
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+ ARM_AM::no_shift),
+ SDLoc(Op), MVT::i32);
+ return true;
+ }
+
+ return false; // N is not a constant accepted by the range check above.
+}
+
+bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) { // Accepts any address: N is used unchanged as the base.
+ Base = N;
+ return true; // Never fails.
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
+ SDValue &Base, SDValue &Offset,
+ SDValue &Opc) { // Match base, base +/- imm8, or base +/- register; every path returns true. Opc receives the getAM3Opc encoding.
+ if (N.getOpcode() == ISD::SUB) {
+ // X - C is canonicalized to X + -C, no need to handle it here; a SUB is matched as base - register.
+ Base = N.getOperand(0);
+ Offset = N.getOperand(1);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0), SDLoc(N),
+ MVT::i32);
+ return true;
+ }
+
+ if (!CurDAG->isBaseWithConstantOffset(N)) {
+ Base = N; // Base only; a frame index is rewritten as a target frame index below.
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+ Offset = CurDAG->getRegister(0, MVT::i32); // Register 0: no offset register.
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N),
+ MVT::i32);
+ return true;
+ }
+
+ // If the RHS is +/- imm8, fold into addr mode.
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+ -256 + 1, 256, RHSC)) { // 8 bits.
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+ Offset = CurDAG->getRegister(0, MVT::i32);
+
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (RHSC < 0) { // Encode a negative constant as sub with its magnitude.
+ AddSub = ARM_AM::sub;
+ RHSC = -RHSC;
+ }
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC), SDLoc(N),
+ MVT::i32);
+ return true;
+ }
+
+ Base = N.getOperand(0); // Constant not accepted as imm8: use operand 1 as the offset operand instead.
+ Offset = N.getOperand(1);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N),
+ MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc) { // Offset of an indexed load/store: imm8 when N is an in-range constant, otherwise N as a register. Always returns true.
+ unsigned Opcode = Op->getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) // Any opcode other than LOAD is cast to a store node.
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) // Increment modes add; all others subtract.
+ ? ARM_AM::add : ARM_AM::sub;
+ int Val;
+ if (isScaledConstantInRange(N, /*Scale=*/1, 0, 256, Val)) { // 8 bits.
+ Offset = CurDAG->getRegister(0, MVT::i32); // Register 0: no offset register.
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), SDLoc(Op),
+ MVT::i32);
+ return true;
+ }
+
+ Offset = N; // Register offset; the immediate field of the encoding is 0.
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), SDLoc(Op),
+ MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
+ SDValue &Base, SDValue &Offset) { // Match base or base +/- scaled imm8; Offset receives the getAM5Opc encoding. Every path returns true.
+ if (!CurDAG->isBaseWithConstantOffset(N)) {
+ Base = N; // Base only; refined below for frame indexes and some wrappers.
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ } else if (N.getOpcode() == ARMISD::Wrapper && // Look through the wrapper unless it holds a global, external-symbol or TLS address.
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+ Base = N.getOperand(0);
+ }
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+ SDLoc(N), MVT::i32);
+ return true;
+ }
+
+ // If the RHS is +/- imm8, fold into addr mode. The constant is checked with a scale of 4.
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
+ -256 + 1, 256, RHSC)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (RHSC < 0) { // Encode a negative constant as sub with its magnitude.
+ AddSub = ARM_AM::sub;
+ RHSC = -RHSC;
+ }
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+ SDLoc(N), MVT::i32);
+ return true;
+ }
+
+ Base = N; // Constant not accepted: use the whole node as base with a zero offset.
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+ SDLoc(N), MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
+ SDValue &Align) { // Addr is N unchanged; Align is an i32 target constant derived from Parent's memory operand. Always returns true.
+ Addr = N;
+
+ unsigned Alignment = 0; // Stays 0 in the first branch unless the check below passes.
+
+ MemSDNode *MemN = cast<MemSDNode>(Parent); // Parent must be a memory node.
+
+ if (isa<LSBaseSDNode>(MemN) ||
+ ((MemN->getOpcode() == ARMISD::VST1_UPD ||
+ MemN->getOpcode() == ARMISD::VLD1_UPD) &&
+ MemN->getConstantOperandVal(MemN->getNumOperands() - 1) == 1)) {
+ // This case occurs only for VLD1-lane/dup and VST1-lane instructions.
+ // The maximum alignment is equal to the memory size being referenced.
+ unsigned MMOAlign = MemN->getAlignment();
+ unsigned MemSize = MemN->getMemoryVT().getSizeInBits() / 8; // Access size in bytes.
+ if (MMOAlign >= MemSize && MemSize > 1) // Single-byte accesses keep an alignment of 0.
+ Alignment = MemSize;
+ } else {
+ // All other uses of addrmode6 are for intrinsics. For now just record
+ // the raw alignment value; it will be refined later based on the legal
+ // alignment operands for the intrinsic.
+ Alignment = MemN->getAlignment();
+ }
+
+ Align = CurDAG->getTargetConstant(Alignment, SDLoc(N), MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode6Offset(SDNode *Op, SDValue N,
+ SDValue &Offset) { // Offset of a post-incremented load/store; any other addressing mode is rejected.
+ LSBaseSDNode *LdSt = cast<LSBaseSDNode>(Op);
+ ISD::MemIndexedMode AM = LdSt->getAddressingMode();
+ if (AM != ISD::POST_INC)
+ return false;
+ Offset = N;
+ if (ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N)) {
+ if (NC->getZExtValue() * 8 == LdSt->getMemoryVT().getSizeInBits()) // Increment equals the access size in bytes:
+ Offset = CurDAG->getRegister(0, MVT::i32); // use register 0 instead of the constant.
+ }
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
+ SDValue &Offset, SDValue &Label) { // Matches only a single-use ARMISD::PIC_ADD node.
+ if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
+ Offset = N.getOperand(0);
+ SDValue N1 = N.getOperand(1); // Must be a ConstantSDNode; the cast below asserts otherwise.
+ Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
+ SDLoc(N), MVT::i32);
+ return true;
+ }
+
+ return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Thumb Addressing Modes
+//===----------------------------------------------------------------------===//
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
+ SDValue &Base, SDValue &Offset){ // Match register + register: the two operands of an ADD or base-with-constant-offset node.
+ if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) {
+ ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
+ if (!NC || !NC->isNullValue()) // Besides two-operand nodes, only the constant zero is accepted.
+ return false;
+
+ Base = Offset = N; // Constant zero is used as both base and offset.
+ return true;
+ }
+
+ Base = N.getOperand(0);
+ Offset = N.getOperand(1);
+ return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
+ SDValue &Base, SDValue &OffImm) { // Match base + (imm5 * Scale); returns false when a register-offset form should be selected instead.
+ if (!CurDAG->isBaseWithConstantOffset(N)) {
+ if (N.getOpcode() == ISD::ADD) {
+ return false; // We want to select register offset instead
+ } else if (N.getOpcode() == ARMISD::Wrapper && // Look through the wrapper unless it holds a global, external-symbol, constant-pool or TLS address.
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+ N.getOperand(0).getOpcode() != ISD::TargetConstantPool &&
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+ Base = N.getOperand(0);
+ } else {
+ Base = N;
+ }
+
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); // Base only: zero offset.
+ return true;
+ }
+
+ // If the RHS is + imm5 * scale, fold into addr mode.
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) {
+ Base = N.getOperand(0);
+ OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); // RHSC as produced by isScaledConstantInRange (presumably already divided by Scale; confirm against the helper).
+ return true;
+ }
+
+ // Offset is too large, so use register offset instead.
+ return false;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+ SDValue &OffImm) { // SelectThumbAddrModeImm5S with a fixed Scale of 4.
+ return SelectThumbAddrModeImm5S(N, 4, Base, OffImm);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+ SDValue &OffImm) { // SelectThumbAddrModeImm5S with a fixed Scale of 2.
+ return SelectThumbAddrModeImm5S(N, 2, Base, OffImm);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+ SDValue &OffImm) { // SelectThumbAddrModeImm5S with a fixed Scale of 1.
+ return SelectThumbAddrModeImm5S(N, 1, Base, OffImm);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
+ SDValue &Base, SDValue &OffImm) { // Match a frame index, or frame index / SP plus a constant checked with scale 4. May raise the frame object's alignment to 4.
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ // Only multiples of 4 are allowed for the offset, so the frame object
+ // alignment must be at least 4.
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ if (MFI.getObjectAlignment(FI) < 4)
+ MFI.setObjectAlignment(FI, 4); // Side effect: modifies the function's frame info.
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+ }
+
+ if (!CurDAG->isBaseWithConstantOffset(N))
+ return false;
+
+ RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); // Null when operand 0 is not a register node.
+ if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
+ (LHSR && LHSR->getReg() == ARM::SP)) {
+ // If the RHS is + imm8 * scale, fold into addr mode.
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ // For LHS+RHS to result in an offset that's a multiple of 4 the object
+ // indexed by the LHS must be 4-byte aligned.
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ if (MFI.getObjectAlignment(FI) < 4)
+ MFI.setObjectAlignment(FI, 4); // Side effect: modifies the function's frame info.
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+
+ return false; // Base is neither a frame index nor SP, or the constant was not accepted.
+}
+
+
+//===----------------------------------------------------------------------===//
+// Thumb 2 Addressing Modes
+//===----------------------------------------------------------------------===//
+
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
+ SDValue &Base, SDValue &OffImm) {
+ // Match simple R + imm12 operands. Returns false only for constant-pool wrappers and for addresses SelectT2AddrModeImm8 accepts.
+
+ // Base only: N is neither ADD/SUB nor a base-with-constant-offset node.
+ if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+ !CurDAG->isBaseWithConstantOffset(N)) {
+ if (N.getOpcode() == ISD::FrameIndex) {
+ // Match frame index: use the target frame index as base with a zero offset.
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+ }
+
+ if (N.getOpcode() == ARMISD::Wrapper && // Look through the wrapper unless it holds a global, external-symbol or TLS address.
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::TargetConstantPool)
+ return false; // We want to select t2LDRpci instead.
+ } else
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+ }
+
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ if (SelectT2AddrModeImm8(N, Base, OffImm)) // Note: on success this call has already written Base and OffImm.
+ // Let t2LDRi8 handle (R - imm8).
+ return false;
+
+ int RHSC = (int)RHS->getZExtValue();
+ if (N.getOpcode() == ISD::SUB) // Treat R - C as R + (-C).
+ RHSC = -RHSC;
+
+ if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+
+ // Base only: the offset is not a constant or is out of range, so use N itself with a zero offset.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
+ SDValue &Base, SDValue &OffImm) {
+ // Match simple R - imm8 operands: only strictly negative offsets in -255..-1 are accepted.
+ if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+ !CurDAG->isBaseWithConstantOffset(N))
+ return false;
+
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getSExtValue();
+ if (N.getOpcode() == ISD::SUB) // Treat R - C as R + (-C).
+ RHSC = -RHSC;
+
+ if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative)
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); // The (negative) signed value is stored as-is.
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+ SDValue &OffImm){ // Immediate offset of an indexed load/store Op; OffImm is signed according to the addressing mode.
+ unsigned Opcode = Op->getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) // Any opcode other than LOAD is cast to a store node.
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ int RHSC;
+ if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits.
+ OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) // Increment modes keep the value; all others negate it.
+ ? CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32)
+ : CurDAG->getTargetConstant(-RHSC, SDLoc(N), MVT::i32);
+ return true;
+ }
+
+ return false; // N is not a constant accepted by the range check above.
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
+ SDValue &Base,
+ SDValue &OffReg, SDValue &ShImm) {
+ // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12. Only ADD-like nodes are considered here.
+ if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N))
+ return false;
+
+ // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8.
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned)
+ return false;
+ else if (RHSC < 0 && RHSC >= -255) // 8 bits
+ return false;
+ }
+
+ // Look for (R + R) or (R + (R << [1,2,3])).
+ unsigned ShAmt = 0;
+ Base = N.getOperand(0);
+ OffReg = N.getOperand(1);
+
+ // Swap if it is ((R << c) + R).
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg.getOpcode());
+ if (ShOpcVal != ARM_AM::lsl) { // Only lsl can be folded in this mode; look for it on the other operand.
+ ShOpcVal = ARM_AM::getShiftOpcForNode(Base.getOpcode());
+ if (ShOpcVal == ARM_AM::lsl)
+ std::swap(Base, OffReg);
+ }
+
+ if (ShOpcVal == ARM_AM::lsl) {
+ // Check to see if the RHS of the shift is a constant, if not, we can't fold
+ // it.
+ if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) {
+ ShAmt = Sh->getZExtValue();
+ if (ShAmt < 4 && isShifterOpProfitable(OffReg, ShOpcVal, ShAmt)) // Shift amounts of 4 or more are never folded.
+ OffReg = OffReg.getOperand(0);
+ else {
+ ShAmt = 0; // Keep the shift node itself as the offset register.
+ }
+ }
+ }
+
+ // If OffReg is a multiply-by-constant and it's profitable to extract a shift
+ // and use it in a shifted operand do so.
+ if (OffReg.getOpcode() == ISD::MUL && N.hasOneUse()) {
+ unsigned PowerOfTwo = 0;
+ SDValue NewMulConst;
+ if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) {
+ replaceDAGValue(OffReg.getOperand(1), NewMulConst); // NOTE(review): presumably swaps the multiplier for the reduced constant in the DAG; confirm against the helper.
+ ShAmt = PowerOfTwo;
+ }
+ }
+
+ ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32);
+
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
+ SDValue &OffImm) {
+ // This *must* succeed since it's used for the irreplaceable ldrex and strex
+ // instructions. The default set here (Base = N, OffImm = 0) is what every early return yields.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+
+ if (N.getOpcode() != ISD::ADD || !CurDAG->isBaseWithConstantOffset(N))
+ return true;
+
+ ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!RHS)
+ return true;
+
+ uint32_t RHSC = (int)RHS->getZExtValue();
+ if (RHSC > 1020 || RHSC % 4 != 0) // Only multiples of 4 up to 1020 are folded.
+ return true;
+
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+
+ OffImm = CurDAG->getTargetConstant(RHSC/4, SDLoc(N), MVT::i32); // The offset is stored divided by 4.
+ return true;
+}
+
+//===--------------------------------------------------------------------===//
+
+/// getAL - Returns an ARMCC::AL immediate node (an i32 target constant located at \p dl).
+static inline SDValue getAL(SelectionDAG *CurDAG, const SDLoc &dl) {
+ return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, dl, MVT::i32);
+}
+
+void ARMDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { // Attach N's memory operand to the machine node Result.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); // One-element array obtained from the MachineFunction.
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); // N must be a memory node.
+ cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); // Result must be a machine node.
+}
+
+bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) { // Try to select an ARM pre/post-indexed load for N; on success N is replaced and true is returned.
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED)
+ return false;
+
+ EVT LoadedVT = LD->getMemoryVT();
+ SDValue Offset, AMOpc;
+ bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ unsigned Opcode = 0;
+ bool Match = false;
+ if (LoadedVT == MVT::i32 && isPre && // i32: immediate forms are tried before the register form.
+ SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+ Opcode = ARM::LDR_PRE_IMM;
+ Match = true;
+ } else if (LoadedVT == MVT::i32 && !isPre &&
+ SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+ Opcode = ARM::LDR_POST_IMM;
+ Match = true;
+ } else if (LoadedVT == MVT::i32 &&
+ SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+ Opcode = isPre ? ARM::LDR_PRE_REG : ARM::LDR_POST_REG;
+ Match = true;
+
+ } else if (LoadedVT == MVT::i16 && // i16 loads use the addrmode3 offset selector.
+ SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = (LD->getExtensionType() == ISD::SEXTLOAD)
+ ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST)
+ : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST);
+ } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) {
+ if (LD->getExtensionType() == ISD::SEXTLOAD) { // Sign-extending byte loads use addrmode3; other byte loads use addrmode2.
+ if (SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST;
+ }
+ } else {
+ if (isPre &&
+ SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = ARM::LDRB_PRE_IMM;
+ } else if (!isPre &&
+ SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = ARM::LDRB_POST_IMM;
+ } else if (SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = isPre ? ARM::LDRB_PRE_REG : ARM::LDRB_POST_REG;
+ }
+ }
+ }
+
+ if (Match) {
+ if (Opcode == ARM::LDR_PRE_IMM || Opcode == ARM::LDRB_PRE_IMM) { // These two opcodes take no separate Offset operand.
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, AMOpc, getAL(CurDAG, SDLoc(N)),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+ MVT::Other, Ops);
+ transferMemOperands(N, New);
+ ReplaceNode(N, New);
+ return true;
+ } else { // Same as above, but with Offset passed ahead of AMOpc.
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG, SDLoc(N)),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+ MVT::Other, Ops);
+ transferMemOperands(N, New);
+ ReplaceNode(N, New);
+ return true;
+ }
+ }
+
+ return false; // No indexed form matched; N is left untouched.
+}
+
+bool ARMDAGToDAGISel::tryT1IndexedLoad(SDNode *N) { // Try to select a Thumb1 post-incremented i32 load for N; on success N is replaced and true is returned.
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT LoadedVT = LD->getMemoryVT();
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM != ISD::POST_INC || LD->getExtensionType() != ISD::NON_EXTLOAD || // Only non-extending, post-incremented i32 loads qualify.
+ LoadedVT.getSimpleVT().SimpleTy != MVT::i32)
+ return false;
+
+ auto *COffs = dyn_cast<ConstantSDNode>(LD->getOffset());
+ if (!COffs || COffs->getZExtValue() != 4) // The increment must be the constant 4.
+ return false;
+
+ // A T1 post-indexed load is just a single register LDM: LDM r0!, {r1}.
+ // The encoding of LDM is not how the rest of ISel expects a post-inc load to
+ // look however, so we use a pseudo here and switch it for a tLDMIA_UPD after
+ // ISel.
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, getAL(CurDAG, SDLoc(N)),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+ SDNode *New = CurDAG->getMachineNode(ARM::tLDR_postidx, SDLoc(N), MVT::i32,
+ MVT::i32, MVT::Other, Ops);
+ transferMemOperands(N, New);
+ ReplaceNode(N, New);
+ return true;
+}
+
+bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) { // Try to select a Thumb2 pre/post-indexed load for N; on success N is replaced and true is returned.
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED)
+ return false;
+
+ EVT LoadedVT = LD->getMemoryVT();
+ bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+ SDValue Offset;
+ bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ unsigned Opcode = 0;
+ bool Match = false;
+ if (SelectT2AddrModeImm8Offset(N, LD->getOffset(), Offset)) { // Only immediate offsets accepted by this selector are handled.
+ switch (LoadedVT.getSimpleVT().SimpleTy) { // Opcode depends on memory type, sign extension and pre/post indexing.
+ case MVT::i32:
+ Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST;
+ break;
+ case MVT::i16:
+ if (isSExtLd)
+ Opcode = isPre ? ARM::t2LDRSH_PRE : ARM::t2LDRSH_POST;
+ else
+ Opcode = isPre ? ARM::t2LDRH_PRE : ARM::t2LDRH_POST;
+ break;
+ case MVT::i8:
+ case MVT::i1:
+ if (isSExtLd)
+ Opcode = isPre ? ARM::t2LDRSB_PRE : ARM::t2LDRSB_POST;
+ else
+ Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
+ break;
+ default:
+ return false; // Any other memory type is not handled.
+ }
+ Match = true;
+ }
+
+ if (Match) {
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, Offset, getAL(CurDAG, SDLoc(N)),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+ MVT::Other, Ops);
+ transferMemOperands(N, New);
+ ReplaceNode(N, New);
+ return true;
+ }
+
+ return false; // The offset was not an acceptable immediate; N is left untouched.
+}
+
+/// \brief Form a GPRPair pseudo register from a pair of GPR regs: V0 goes in gsub_0 and V1 in gsub_1.
+SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
+ SDLoc dl(V0.getNode()); // The new node takes its location from V0.
+ SDValue RegClass =
+ CurDAG->getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
+ SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, dl, MVT::i32);
+ SDValue SubReg1 = CurDAG->getTargetConstant(ARM::gsub_1, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; // Register class ID, then (value, subregister index) pairs.
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form a D register from a pair of S registers: V0 goes in ssub_0 and V1 in ssub_1.
+SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+ SDLoc dl(V0.getNode()); // The new node takes its location from V0.
+ SDValue RegClass =
+ CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, dl, MVT::i32);
+ SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, dl, MVT::i32);
+ SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; // Register class ID, then (value, subregister index) pairs.
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form a quad register from a pair of D registers: V0 goes in dsub_0 and V1 in dsub_1.
+SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+ SDLoc dl(V0.getNode()); // The new node takes its location from V0.
+ SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, dl,
+ MVT::i32);
+ SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, dl, MVT::i32);
+ SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; // Register class ID, then (value, subregister index) pairs.
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive D registers from a pair of Q registers: V0 goes in qsub_0 and V1 in qsub_1.
+SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+ SDLoc dl(V0.getNode()); // The new node takes its location from V0.
+ SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl,
+ MVT::i32);
+ SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, dl, MVT::i32);
+ SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; // Register class ID, then (value, subregister index) pairs.
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive S registers. Returns a REG_SEQUENCE machine node of type VT (class QPR_VFP2) with V0..V3 placed in ssub_0..ssub_3.
+SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
+ SDValue V2, SDValue V3) {
+ SDLoc dl(V0.getNode()); // Location is taken from V0's node.
+ SDValue RegClass =
+ CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, dl, MVT::i32);
+ SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, dl, MVT::i32);
+ SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, dl, MVT::i32);
+ SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, dl, MVT::i32);
+ SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, // Class ID first, then (value, subreg index) pairs.
+ V2, SubReg2, V3, SubReg3 };
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive D registers. Returns a REG_SEQUENCE machine node of type VT (class QQPR) with V0..V3 placed in dsub_0..dsub_3.
+SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
+ SDValue V2, SDValue V3) {
+ SDLoc dl(V0.getNode()); // Location is taken from V0's node.
+ SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl,
+ MVT::i32);
+ SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, dl, MVT::i32);
+ SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, dl, MVT::i32);
+ SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, dl, MVT::i32);
+ SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, // Class ID first, then (value, subreg index) pairs.
+ V2, SubReg2, V3, SubReg3 };
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive Q registers. Returns a REG_SEQUENCE machine node of type VT (class QQQQPR) with V0..V3 placed in qsub_0..qsub_3.
+SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
+ SDValue V2, SDValue V3) {
+ SDLoc dl(V0.getNode()); // Location is taken from V0's node.
+ SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, dl,
+ MVT::i32);
+ SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, dl, MVT::i32);
+ SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, dl, MVT::i32);
+ SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, dl, MVT::i32);
+ SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, // Class ID first, then (value, subreg index) pairs.
+ V2, SubReg2, V3, SubReg3 };
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand
+/// of a NEON VLD or VST instruction. The supported values depend on the
+/// number of registers being loaded. Align is read via cast<ConstantSDNode>; the result is an i32 target constant holding 32, 16, 8 or 0 (0 when the requested alignment is below 8).
+SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl,
+ unsigned NumVecs, bool is64BitVector) {
+ unsigned NumRegs = NumVecs;
+ if (!is64BitVector && NumVecs < 3)
+ NumRegs *= 2; // With fewer than 3 vectors, each non-64-bit vector counts as two registers.
+
+ unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+ if (Alignment >= 32 && NumRegs == 4) // 32 is only produced for four registers.
+ Alignment = 32;
+ else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4)) // 16 is only produced for two or four registers.
+ Alignment = 16;
+ else if (Alignment >= 8)
+ Alignment = 8;
+ else
+ Alignment = 0; // Anything below 8 is encoded as 0.
+
+ return CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
+}
+
+static bool isVLDfixed(unsigned Opc) // Returns true iff Opc is one of the "_fixed" (fixed stride updating) VLD opcodes listed below.
+{
+ switch (Opc) {
+ default: return false; // Any opcode not listed is not treated as a fixed-stride VLD.
+ case ARM::VLD1d8wb_fixed : return true;
+ case ARM::VLD1d16wb_fixed : return true;
+ case ARM::VLD1d64Qwb_fixed : return true;
+ case ARM::VLD1d32wb_fixed : return true;
+ case ARM::VLD1d64wb_fixed : return true;
+ case ARM::VLD1d64TPseudoWB_fixed : return true;
+ case ARM::VLD1d64QPseudoWB_fixed : return true;
+ case ARM::VLD1q8wb_fixed : return true;
+ case ARM::VLD1q16wb_fixed : return true;
+ case ARM::VLD1q32wb_fixed : return true;
+ case ARM::VLD1q64wb_fixed : return true;
+ case ARM::VLD1DUPd8wb_fixed : return true;
+ case ARM::VLD1DUPd16wb_fixed : return true;
+ case ARM::VLD1DUPd32wb_fixed : return true;
+ case ARM::VLD1DUPq8wb_fixed : return true;
+ case ARM::VLD1DUPq16wb_fixed : return true;
+ case ARM::VLD1DUPq32wb_fixed : return true;
+ case ARM::VLD2d8wb_fixed : return true;
+ case ARM::VLD2d16wb_fixed : return true;
+ case ARM::VLD2d32wb_fixed : return true;
+ case ARM::VLD2q8PseudoWB_fixed : return true;
+ case ARM::VLD2q16PseudoWB_fixed : return true;
+ case ARM::VLD2q32PseudoWB_fixed : return true;
+ case ARM::VLD2DUPd8wb_fixed : return true;
+ case ARM::VLD2DUPd16wb_fixed : return true;
+ case ARM::VLD2DUPd32wb_fixed : return true;
+ }
+}
+
+static bool isVSTfixed(unsigned Opc) // Returns true iff Opc is one of the "_fixed" (fixed stride updating) VST opcodes listed below.
+{
+ switch (Opc) {
+ default: return false; // Any opcode not listed is not treated as a fixed-stride VST.
+ case ARM::VST1d8wb_fixed : return true;
+ case ARM::VST1d16wb_fixed : return true;
+ case ARM::VST1d32wb_fixed : return true;
+ case ARM::VST1d64wb_fixed : return true;
+ case ARM::VST1q8wb_fixed : return true;
+ case ARM::VST1q16wb_fixed : return true;
+ case ARM::VST1q32wb_fixed : return true;
+ case ARM::VST1q64wb_fixed : return true;
+ case ARM::VST1d64TPseudoWB_fixed : return true;
+ case ARM::VST1d64QPseudoWB_fixed : return true;
+ case ARM::VST2d8wb_fixed : return true;
+ case ARM::VST2d16wb_fixed : return true;
+ case ARM::VST2d32wb_fixed : return true;
+ case ARM::VST2q8PseudoWB_fixed : return true;
+ case ARM::VST2q16PseudoWB_fixed : return true;
+ case ARM::VST2q32PseudoWB_fixed : return true;
+ }
+}
+
+// Get the register stride update opcode of a VLD/VST instruction that
+// is otherwise equivalent to the given fixed stride updating instruction. Each "_fixed" opcode maps to its "_register" counterpart; an opcode with no entry is returned unchanged.
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
+ assert((isVLDfixed(Opc) || isVSTfixed(Opc))
+ && "Incorrect fixed stride updating instruction."); // Callers must pass an opcode accepted by isVLDfixed() or isVSTfixed().
+ switch (Opc) {
+ default: break; // No mapping: fall out to the unchanged-opcode return below.
+ case ARM::VLD1d8wb_fixed: return ARM::VLD1d8wb_register;
+ case ARM::VLD1d16wb_fixed: return ARM::VLD1d16wb_register;
+ case ARM::VLD1d32wb_fixed: return ARM::VLD1d32wb_register;
+ case ARM::VLD1d64wb_fixed: return ARM::VLD1d64wb_register;
+ case ARM::VLD1q8wb_fixed: return ARM::VLD1q8wb_register;
+ case ARM::VLD1q16wb_fixed: return ARM::VLD1q16wb_register;
+ case ARM::VLD1q32wb_fixed: return ARM::VLD1q32wb_register;
+ case ARM::VLD1q64wb_fixed: return ARM::VLD1q64wb_register;
+ case ARM::VLD1d64Twb_fixed: return ARM::VLD1d64Twb_register; // NOTE(review): VLD1d64Twb_fixed is not listed in isVLDfixed(), so the assert above would reject it - confirm intended.
+ case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register;
+ case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register;
+ case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register;
+ case ARM::VLD1DUPd8wb_fixed : return ARM::VLD1DUPd8wb_register;
+ case ARM::VLD1DUPd16wb_fixed : return ARM::VLD1DUPd16wb_register;
+ case ARM::VLD1DUPd32wb_fixed : return ARM::VLD1DUPd32wb_register;
+ case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register;
+ case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register;
+ case ARM::VLD1DUPq32wb_fixed : return ARM::VLD1DUPq32wb_register;
+
+ case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
+ case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
+ case ARM::VST1d32wb_fixed: return ARM::VST1d32wb_register;
+ case ARM::VST1d64wb_fixed: return ARM::VST1d64wb_register;
+ case ARM::VST1q8wb_fixed: return ARM::VST1q8wb_register;
+ case ARM::VST1q16wb_fixed: return ARM::VST1q16wb_register;
+ case ARM::VST1q32wb_fixed: return ARM::VST1q32wb_register;
+ case ARM::VST1q64wb_fixed: return ARM::VST1q64wb_register;
+ case ARM::VST1d64TPseudoWB_fixed: return ARM::VST1d64TPseudoWB_register;
+ case ARM::VST1d64QPseudoWB_fixed: return ARM::VST1d64QPseudoWB_register;
+
+ case ARM::VLD2d8wb_fixed: return ARM::VLD2d8wb_register;
+ case ARM::VLD2d16wb_fixed: return ARM::VLD2d16wb_register;
+ case ARM::VLD2d32wb_fixed: return ARM::VLD2d32wb_register;
+ case ARM::VLD2q8PseudoWB_fixed: return ARM::VLD2q8PseudoWB_register;
+ case ARM::VLD2q16PseudoWB_fixed: return ARM::VLD2q16PseudoWB_register;
+ case ARM::VLD2q32PseudoWB_fixed: return ARM::VLD2q32PseudoWB_register;
+
+ case ARM::VST2d8wb_fixed: return ARM::VST2d8wb_register;
+ case ARM::VST2d16wb_fixed: return ARM::VST2d16wb_register;
+ case ARM::VST2d32wb_fixed: return ARM::VST2d32wb_register;
+ case ARM::VST2q8PseudoWB_fixed: return ARM::VST2q8PseudoWB_register;
+ case ARM::VST2q16PseudoWB_fixed: return ARM::VST2q16PseudoWB_register;
+ case ARM::VST2q32PseudoWB_fixed: return ARM::VST2q32PseudoWB_register;
+
+ case ARM::VLD2DUPd8wb_fixed: return ARM::VLD2DUPd8wb_register;
+ case ARM::VLD2DUPd16wb_fixed: return ARM::VLD2DUPd16wb_register;
+ case ARM::VLD2DUPd32wb_fixed: return ARM::VLD2DUPd32wb_register;
+ }
+ return Opc; // If not one we handle, return it unchanged.
+}
+
+void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) { // Selects a VLD machine node for N loading NumVecs (1-4) vectors. The three tables are indexed by OpcodeIndex (derived from the vector type below); QOpcodes1 is only used for the second load of the two-instruction quad-register path.
+ assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+ SDLoc dl(N);
+
+ SDValue MemAddr, Align;
+ unsigned AddrOpIdx = isUpdating ? 1 : 2; // Address operand index: 1 for updating nodes, 2 otherwise.
+ if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+ return; // Address not matched: N is left untouched.
+
+ SDValue Chain = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ bool is64BitVector = VT.is64BitVector();
+ Align = GetVLDSTAlign(Align, dl, NumVecs, is64BitVector);
+
+ unsigned OpcodeIndex;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vld type");
+ // Double-register operations:
+ case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4i16: OpcodeIndex = 1; break;
+ case MVT::v2f32:
+ case MVT::v2i32: OpcodeIndex = 2; break;
+ case MVT::v1i64: OpcodeIndex = 3; break;
+ // Quad-register operations:
+ case MVT::v16i8: OpcodeIndex = 0; break;
+ case MVT::v8i16: OpcodeIndex = 1; break;
+ case MVT::v4f32:
+ case MVT::v4i32: OpcodeIndex = 2; break;
+ case MVT::v2f64:
+ case MVT::v2i64: OpcodeIndex = 3;
+ assert(NumVecs == 1 && "v2i64 type only supported for VLD1");
+ break;
+ }
+
+ EVT ResTy; // Type of the machine node's first result.
+ if (NumVecs == 1)
+ ResTy = VT;
+ else {
+ unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; // A 3-vector load uses a 4-element result type.
+ if (!is64BitVector)
+ ResTyElts *= 2; // Non-64-bit vectors need twice as many i64 elements.
+ ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+ }
+ std::vector<EVT> ResTys; // Result list: loaded value, optional i32 writeback, chain.
+ ResTys.push_back(ResTy);
+ if (isUpdating)
+ ResTys.push_back(MVT::i32);
+ ResTys.push_back(MVT::Other);
+
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDNode *VLd;
+ SmallVector<SDValue, 7> Ops;
+
+ // Double registers and VLD1/VLD2 quad registers are directly supported.
+ if (is64BitVector || NumVecs <= 2) {
+ unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+ QOpcodes0[OpcodeIndex]);
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1); // Increment operand follows the address operand.
+ // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
+ // case entirely when the rest are updated to that form, too.
+ if ((NumVecs <= 2) && !isa<ConstantSDNode>(Inc.getNode()))
+ Opc = getVLDSTRegisterUpdateOpcode(Opc); // Non-constant increment: switch to the register-update opcode.
+ // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+ // check for that explicitly too. Horribly hacky, but temporary.
+ if ((NumVecs > 2 && !isVLDfixed(Opc)) ||
+ !isa<ConstantSDNode>(Inc.getNode()))
+ Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); // Constant increment is passed as Reg0, otherwise Inc itself.
+ }
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ } else {
+ // Otherwise, quad registers are loaded with two separate instructions,
+ // where one loads the even registers and the other loads the odd registers.
+ EVT AddrTy = MemAddr.getValueType();
+
+ // Load the even subregs. This is always an updating load, so that it
+ // provides the address to the second load for the odd subregs.
+ SDValue ImplDef =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+ const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain };
+ SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+ ResTy, AddrTy, MVT::Other, OpsA);
+ Chain = SDValue(VLdA, 2); // The second load is chained after the first.
+
+ // Load the odd subregs.
+ Ops.push_back(SDValue(VLdA, 1)); // Address operand is VLdA's second result (typed AddrTy).
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ assert(isa<ConstantSDNode>(Inc.getNode()) &&
+ "only constant post-increment update allowed for VLD3/4");
+ (void)Inc; // Inc is only read by the assert above.
+ Ops.push_back(Reg0);
+ }
+ Ops.push_back(SDValue(VLdA, 0)); // The first load's value result is passed to the second load.
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, Ops);
+ }
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+
+ if (NumVecs == 1) {
+ ReplaceNode(N, VLd); // Single vector: VLd replaces N directly, no subregister extraction.
+ return;
+ }
+
+ // Extract out the subregisters.
+ SDValue SuperReg = SDValue(VLd, 0);
+ static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+ ARM::qsub_3 == ARM::qsub_0 + 3,
+ "Unexpected subreg numbering");
+ unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ ReplaceUses(SDValue(N, Vec),
+ CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); // VLd result 1 is the chain, or the i32 writeback value when updating (see ResTys).
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); // When updating, result 2 is the chain.
+ CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) { // Selects a VST machine node for N storing NumVecs (1-4) vectors. The three tables are indexed by OpcodeIndex (derived from the vector type below); QOpcodes1 is only used for the second store of the two-instruction quad-register path.
+ assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+ SDLoc dl(N);
+
+ SDValue MemAddr, Align;
+ unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+ if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+ return; // Address not matched: N is left untouched.
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+ SDValue Chain = N->getOperand(0);
+ EVT VT = N->getOperand(Vec0Idx).getValueType(); // Vector type is taken from the first stored vector operand.
+ bool is64BitVector = VT.is64BitVector();
+ Align = GetVLDSTAlign(Align, dl, NumVecs, is64BitVector);
+
+ unsigned OpcodeIndex;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst type");
+ // Double-register operations:
+ case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4i16: OpcodeIndex = 1; break;
+ case MVT::v2f32:
+ case MVT::v2i32: OpcodeIndex = 2; break;
+ case MVT::v1i64: OpcodeIndex = 3; break;
+ // Quad-register operations:
+ case MVT::v16i8: OpcodeIndex = 0; break;
+ case MVT::v8i16: OpcodeIndex = 1; break;
+ case MVT::v4f32:
+ case MVT::v4i32: OpcodeIndex = 2; break;
+ case MVT::v2f64:
+ case MVT::v2i64: OpcodeIndex = 3;
+ assert(NumVecs == 1 && "v2i64 type only supported for VST1");
+ break;
+ }
+
+ std::vector<EVT> ResTys; // Result list: optional i32 writeback, then chain.
+ if (isUpdating)
+ ResTys.push_back(MVT::i32);
+ ResTys.push_back(MVT::Other);
+
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SmallVector<SDValue, 7> Ops;
+
+ // Double registers and VST1/VST2 quad registers are directly supported.
+ if (is64BitVector || NumVecs <= 2) {
+ SDValue SrcReg;
+ if (NumVecs == 1) {
+ SrcReg = N->getOperand(Vec0Idx); // Single vector: store the operand as is.
+ } else if (is64BitVector) {
+ // Form a REG_SEQUENCE to force register allocation.
+ SDValue V0 = N->getOperand(Vec0Idx + 0);
+ SDValue V1 = N->getOperand(Vec0Idx + 1);
+ if (NumVecs == 2)
+ SrcReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0);
+ else {
+ SDValue V2 = N->getOperand(Vec0Idx + 2);
+ // If it's a vst3, form a quad D-register and leave the last part as
+ // an undef.
+ SDValue V3 = (NumVecs == 3)
+ ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
+ : N->getOperand(Vec0Idx + 3);
+ SrcReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+ }
+ } else {
+ // Form a QQ register.
+ SDValue Q0 = N->getOperand(Vec0Idx);
+ SDValue Q1 = N->getOperand(Vec0Idx + 1);
+ SrcReg = SDValue(createQRegPairNode(MVT::v4i64, Q0, Q1), 0);
+ }
+
+ unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+ QOpcodes0[OpcodeIndex]);
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1); // Increment operand follows the address operand.
+ // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
+ // case entirely when the rest are updated to that form, too.
+ if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+ Opc = getVLDSTRegisterUpdateOpcode(Opc); // Non-constant increment: switch to the register-update opcode.
+ // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
+ // check for that explicitly too. Horribly hacky, but temporary.
+ if (!isa<ConstantSDNode>(Inc.getNode()))
+ Ops.push_back(Inc);
+ else if (NumVecs > 2 && !isVSTfixed(Opc))
+ Ops.push_back(Reg0); // Constant increment on a non-"_fixed" opcode is passed as Reg0.
+ }
+ Ops.push_back(SrcReg);
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Transfer memoperands.
+ cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceNode(N, VSt);
+ return;
+ }
+
+ // Otherwise, quad registers are stored with two separate instructions,
+ // where one stores the even registers and the other stores the odd registers.
+
+ // Form the QQQQ REG_SEQUENCE.
+ SDValue V0 = N->getOperand(Vec0Idx + 0);
+ SDValue V1 = N->getOperand(Vec0Idx + 1);
+ SDValue V2 = N->getOperand(Vec0Idx + 2);
+ SDValue V3 = (NumVecs == 3) // For three vectors the fourth slot is an IMPLICIT_DEF.
+ ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+ : N->getOperand(Vec0Idx + 3);
+ SDValue RegSeq = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0);
+
+ // Store the even D registers. This is always an updating store, so that it
+ // provides the address to the second store for the odd subregs.
+ const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain };
+ SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+ MemAddr.getValueType(),
+ MVT::Other, OpsA);
+ cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1);
+ Chain = SDValue(VStA, 1); // The second store is chained after the first.
+
+ // Store the odd D registers.
+ Ops.push_back(SDValue(VStA, 0)); // Address operand is VStA's first result (typed like MemAddr).
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ assert(isa<ConstantSDNode>(Inc.getNode()) &&
+ "only constant post-increment update allowed for VST3/4");
+ (void)Inc; // Inc is only read by the assert above.
+ Ops.push_back(Reg0);
+ }
+ Ops.push_back(RegSeq);
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+ Ops);
+ cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1); // Both stores carry the same memory operand.
+ ReplaceNode(N, VStB);
+}
+
+void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+ unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes) { // Selects a lane load (IsLoad) or lane store machine node for N over NumVecs (2-4) vectors. DOpcodes/QOpcodes are indexed by OpcodeIndex (derived from the vector type below).
+ assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+ SDLoc dl(N);
+
+ SDValue MemAddr, Align;
+ unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+ if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+ return; // Address not matched: N is left untouched.
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+ SDValue Chain = N->getOperand(0);
+ unsigned Lane = // The lane index is the constant operand right after the vector operands.
+ cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+ EVT VT = N->getOperand(Vec0Idx).getValueType();
+ bool is64BitVector = VT.is64BitVector();
+
+ unsigned Alignment = 0;
+ if (NumVecs != 3) { // For NumVecs == 3 the alignment operand stays 0.
+ Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+ unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8; // Size in bytes of NumVecs scalar elements.
+ if (Alignment > NumBytes)
+ Alignment = NumBytes; // Never claim more alignment than NumBytes.
+ if (Alignment < 8 && Alignment < NumBytes)
+ Alignment = 0;
+ // Alignment must be a power of two; make sure of that.
+ Alignment = (Alignment & -Alignment); // Keeps only the lowest set bit.
+ if (Alignment == 1)
+ Alignment = 0;
+ }
+ Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
+
+ unsigned OpcodeIndex;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vld/vst lane type");
+ // Double-register operations:
+ case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4i16: OpcodeIndex = 1; break;
+ case MVT::v2f32:
+ case MVT::v2i32: OpcodeIndex = 2; break;
+ // Quad-register operations:
+ case MVT::v8i16: OpcodeIndex = 0; break;
+ case MVT::v4f32:
+ case MVT::v4i32: OpcodeIndex = 1; break;
+ }
+
+ std::vector<EVT> ResTys; // Result list: loaded value (loads only), optional i32 writeback, chain.
+ if (IsLoad) {
+ unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; // A 3-vector load uses a 4-element result type.
+ if (!is64BitVector)
+ ResTyElts *= 2;
+ ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(),
+ MVT::i64, ResTyElts));
+ }
+ if (isUpdating)
+ ResTys.push_back(MVT::i32);
+ ResTys.push_back(MVT::Other);
+
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); // Constant increment is passed as Reg0, otherwise Inc itself.
+ }
+
+ SDValue SuperReg; // The input vectors combined into one REG_SEQUENCE value.
+ SDValue V0 = N->getOperand(Vec0Idx + 0);
+ SDValue V1 = N->getOperand(Vec0Idx + 1);
+ if (NumVecs == 2) {
+ if (is64BitVector)
+ SuperReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0);
+ else
+ SuperReg = SDValue(createQRegPairNode(MVT::v4i64, V0, V1), 0);
+ } else {
+ SDValue V2 = N->getOperand(Vec0Idx + 2);
+ SDValue V3 = (NumVecs == 3) // For three vectors the fourth slot is an IMPLICIT_DEF.
+ ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+ : N->getOperand(Vec0Idx + 3);
+ if (is64BitVector)
+ SuperReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+ else
+ SuperReg = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0);
+ }
+ Ops.push_back(SuperReg);
+ Ops.push_back(getI32Imm(Lane, dl));
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+
+ unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+ QOpcodes[OpcodeIndex]);
+ SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); // Despite the name, this node is also the store when !IsLoad.
+ cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
+ if (!IsLoad) {
+ ReplaceNode(N, VLdLn); // Stores have no vector results to extract.
+ return;
+ }
+
+ // Extract the subregisters.
+ SuperReg = SDValue(VLdLn, 0);
+ static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+ ARM::qsub_3 == ARM::qsub_0 + 3,
+ "Unexpected subreg numbering");
+ unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ ReplaceUses(SDValue(N, Vec),
+ CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); // Result 1 is the chain, or the i32 writeback value when updating (see ResTys).
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); // When updating, result 2 is the chain.
+ CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes) { // Selects a VLD-dup machine node for N producing NumVecs (1-4) vectors. DOpcodes/QOpcodes are indexed by element type (see the switch on VT below).
+ assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
+ SDLoc dl(N);
+
+ SDValue MemAddr, Align;
+ if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) // The address is always operand 1 here.
+ return; // Address not matched: N is left untouched.
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+ SDValue Chain = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ unsigned Alignment = 0;
+ if (NumVecs != 3) { // For NumVecs == 3 the alignment operand stays 0.
+ Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+ unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8; // Size in bytes of NumVecs scalar elements.
+ if (Alignment > NumBytes)
+ Alignment = NumBytes; // Never claim more alignment than NumBytes.
+ if (Alignment < 8 && Alignment < NumBytes)
+ Alignment = 0;
+ // Alignment must be a power of two; make sure of that.
+ Alignment = (Alignment & -Alignment); // Keeps only the lowest set bit.
+ if (Alignment == 1)
+ Alignment = 0;
+ }
+ Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
+
+ unsigned Opc;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vld-dup type");
+ case MVT::v8i8: Opc = DOpcodes[0]; break;
+ case MVT::v16i8: Opc = QOpcodes[0]; break;
+ case MVT::v4i16: Opc = DOpcodes[1]; break;
+ case MVT::v8i16: Opc = QOpcodes[1]; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = DOpcodes[2]; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = QOpcodes[2]; break;
+ }
+
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ if (isUpdating) {
+ // fixed-stride update instructions don't have an explicit writeback
+ // operand. It's implicit in the opcode itself.
+ SDValue Inc = N->getOperand(2);
+ if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+ Opc = getVLDSTRegisterUpdateOpcode(Opc); // Non-constant increment: switch to the register-update opcode.
+ if (!isa<ConstantSDNode>(Inc.getNode()))
+ Ops.push_back(Inc);
+ // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
+ else if (NumVecs > 2)
+ Ops.push_back(Reg0); // Constant increment for 3 or 4 vectors is passed as Reg0.
+ }
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+
+ unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; // A 3-vector load uses a 4-element result type.
+ std::vector<EVT> ResTys; // Result list: loaded value, optional i32 writeback, chain.
+ ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
+ if (isUpdating)
+ ResTys.push_back(MVT::i32);
+ ResTys.push_back(MVT::Other);
+ SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+
+ // Extract the subregisters.
+ if (NumVecs == 1) {
+ ReplaceUses(SDValue(N, 0), SDValue(VLdDup, 0)); // Single vector: use the loaded value directly.
+ } else {
+ SDValue SuperReg = SDValue(VLdDup, 0);
+ static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
+ unsigned SubIdx = ARM::dsub_0; // Extraction always starts at dsub_0 in this function.
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ ReplaceUses(SDValue(N, Vec),
+ CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+ }
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); // Result 1 is the chain, or the i32 writeback value when updating (see ResTys).
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); // When updating, result 2 is the chain.
+ CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
+ unsigned Opc) { // Replaces N with machine node Opc whose NumVecs (2-4) table registers are combined into one REG_SEQUENCE operand; when IsExt, N's operand 1 is passed first.
+ assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ unsigned FirstTblReg = IsExt ? 2 : 1; // Table registers start one operand later when IsExt.
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SDValue RegSeq;
+ SDValue V0 = N->getOperand(FirstTblReg + 0);
+ SDValue V1 = N->getOperand(FirstTblReg + 1);
+ if (NumVecs == 2)
+ RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
+ else {
+ SDValue V2 = N->getOperand(FirstTblReg + 2);
+ // If it's a vtbl3, form a quad D-register and leave the last part as
+ // an undef.
+ SDValue V3 = (NumVecs == 3)
+ ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+ : N->getOperand(FirstTblReg + 3);
+ RegSeq = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+ }
+
+ SmallVector<SDValue, 6> Ops;
+ if (IsExt)
+ Ops.push_back(N->getOperand(1)); // Extra leading operand, only passed when IsExt.
+ Ops.push_back(RegSeq);
+ Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); // The operand that follows the table registers.
+ Ops.push_back(getAL(CurDAG, dl)); // predicate
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
+}
+
+bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { // Tries to select N as SBFX/UBFX (ARM or Thumb2 form) or as a plain right shift; returns true iff N was selected.
+ if (!Subtarget->hasV6T2Ops())
+ return false;
+
+ unsigned Opc = isSigned
+ ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
+ : (Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX);
+ SDLoc dl(N);
+
+ // For unsigned extracts, check for a shift right and mask
+ unsigned And_imm = 0;
+ if (N->getOpcode() == ISD::AND) {
+ if (isOpcWithIntImmediate(N, ISD::AND, And_imm)) {
+
+ // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+ if (And_imm & (And_imm + 1))
+ return false;
+
+ unsigned Srl_imm = 0;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL,
+ Srl_imm)) {
+ assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+
+ // Note: The width operand is encoded as width-1.
+ unsigned Width = countTrailingOnes(And_imm) - 1; // NOTE(review): And_imm == 0 passes the mask check above and would wrap Width - presumably never reaches here; confirm.
+ unsigned LSB = Srl_imm;
+
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+ if ((LSB + Width + 1) == N->getValueType(0).getSizeInBits()) {
+ // It's cheaper to use a right shift to extract the top bits.
+ if (Subtarget->isThumb()) {
+ Opc = isSigned ? ARM::t2ASRri : ARM::t2LSRri;
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ CurDAG->getTargetConstant(LSB, dl, MVT::i32),
+ getAL(CurDAG, dl), Reg0, Reg0 };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
+ }
+
+ // ARM models shift instructions as MOVsi with shifter operand.
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(ISD::SRL); // NOTE(review): always SRL here, while the Thumb path above consults isSigned - confirm intended.
+ SDValue ShOpc =
+ CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, LSB), dl,
+ MVT::i32);
+ SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
+ getAL(CurDAG, dl), Reg0, Reg0 };
+ CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
+ return true;
+ }
+
+ SDValue Ops[] = { N->getOperand(0).getOperand(0), // Operands: source, LSB, width-1, predicate, predicate register.
+ CurDAG->getTargetConstant(LSB, dl, MVT::i32),
+ CurDAG->getTargetConstant(Width, dl, MVT::i32),
+ getAL(CurDAG, dl), Reg0 };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
+ }
+ }
+ return false; // An AND that did not match the (srl, mask) pattern is not selected here.
+ }
+
+ // Otherwise, we're looking for a shift of a shift
+ unsigned Shl_imm = 0;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+ assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!");
+ unsigned Srl_imm = 0;
+ if (isInt32Immediate(N->getOperand(1), Srl_imm)) {
+ assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+ // Note: The width operand is encoded as width-1.
+ unsigned Width = 32 - Srl_imm - 1;
+ int LSB = Srl_imm - Shl_imm;
+ if (LSB < 0)
+ return false; // Right shift smaller than the left shift: no match.
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ CurDAG->getTargetConstant(LSB, dl, MVT::i32),
+ CurDAG->getTargetConstant(Width, dl, MVT::i32),
+ getAL(CurDAG, dl), Reg0 };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
+ }
+ }
+
+ // Or we are looking for a shift of an and, with a mask operand
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_imm) &&
+ isShiftedMask_32(And_imm)) {
+ unsigned Srl_imm = 0;
+ unsigned LSB = countTrailingZeros(And_imm);
+ // Shift must be the same as the ands lsb
+ if (isInt32Immediate(N->getOperand(1), Srl_imm) && Srl_imm == LSB) {
+ assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+ unsigned MSB = 31 - countLeadingZeros(And_imm); // Index of the highest set bit of the mask.
+ // Note: The width operand is encoded as width-1.
+ unsigned Width = MSB - LSB;
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32),
+ CurDAG->getTargetConstant(Width, dl, MVT::i32),
+ getAL(CurDAG, dl), Reg0 };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
+ }
+ }
+
+ if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); // Here Width is the full width; width-1 is applied when building Ops.
+ unsigned LSB = 0;
+ if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, LSB) &&
+ !isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRA, LSB))
+ return false; // The extended value must be a right shift by an immediate.
+
+ if (LSB + Width > 32)
+ return false; // The field would extend past bit 31.
+
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ CurDAG->getTargetConstant(LSB, dl, MVT::i32),
+ CurDAG->getTargetConstant(Width - 1, dl, MVT::i32),
+ getAL(CurDAG, dl), Reg0 };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
+ }
+
+ return false;
+}
+
+/// Target-specific DAG combining for ISD::XOR.
+/// Target-independent combining lowers SELECT_CC nodes of the form
+/// select_cc setg[ge] X, 0, X, -X
+/// select_cc setgt X, -1, X, -X
+/// select_cc setl[te] X, 0, -X, X
+/// select_cc setlt X, 1, -X, X
+/// which represent Integer ABS into:
+/// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
+/// ARM instruction selection detects the latter and matches it to
+/// ARM::ABS or ARM::t2ABS machine node. Returns true iff N was selected; never matches on Thumb1-only subtargets.
+bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
+ SDValue XORSrc0 = N->getOperand(0);
+ SDValue XORSrc1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ if (Subtarget->isThumb1Only())
+ return false;
+
+ if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA) // Only the operand order (add, sra) is recognized.
+ return false;
+
+ SDValue ADDSrc0 = XORSrc0.getOperand(0);
+ SDValue ADDSrc1 = XORSrc0.getOperand(1);
+ SDValue SRASrc0 = XORSrc1.getOperand(0);
+ SDValue SRASrc1 = XORSrc1.getOperand(1);
+ ConstantSDNode *SRAConstant = dyn_cast<ConstantSDNode>(SRASrc1); // Null when the shift amount is not a constant.
+ EVT XType = SRASrc0.getValueType();
+ unsigned Size = XType.getSizeInBits() - 1; // Expected shift amount: bit width of X minus one.
+
+ if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 && // The add must be X + Y, where Y is the same sra node that feeds the xor.
+ XType.isInteger() && SRAConstant != nullptr &&
+ Size == SRAConstant->getZExtValue()) {
+ unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
+ CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+ return true;
+ }
+
+ return false;
+}
+
+/// Inspect one multiplicand of a candidate SM{UL,LA}W{B,T} and decide which
+/// 16-bit half of which value it stands for.
+///
+/// \param SignExt     the operand to inspect.
+/// \param Opc         receives ARM::SMULWB / ARM::SMULWT, or ARM::SMLAWB /
+///                    ARM::SMLAWT when \p Accumulate is set; written only on
+///                    success.
+/// \param Src1        receives the value the half is taken from; written only
+///                    on success.
+/// \param Accumulate  pick the accumulating (SMLAW*) opcodes.
+/// \returns true when \p SignExt has one of the recognised shapes.
+static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
+                                 bool Accumulate) {
+  const unsigned BottomOpc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+  const unsigned TopOpc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
+  const unsigned Kind = SignExt.getOpcode();
+
+  // SM*WB: we need some form of sign extension producing an i32.
+  const bool IsSExtLike = Kind == ISD::SIGN_EXTEND ||
+                          Kind == ISD::SIGN_EXTEND_INREG ||
+                          Kind == ISD::AssertSext;
+  if (IsSExtLike && SignExt.getValueType() == MVT::i32) {
+    *Opc = BottomOpc;
+    Src1 = SignExt.getOperand(0);
+    return true;
+  }
+
+  // Anything else has to be an arithmetic shift right by exactly 16.
+  if (Kind != ISD::SRA)
+    return false;
+  ConstantSDNode *SraAmt = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
+  if (!SraAmt || SraAmt->getZExtValue() != 16)
+    return false;
+
+  SDValue Shifted = SignExt.getOperand(0);
+  if (Shifted.getOpcode() != ISD::SHL) {
+    // SM*WT: a plain (sra X, 16); Src1 becomes X.
+    *Opc = TopOpc;
+    Src1 = Shifted;
+    return true;
+  }
+
+  // (sra (shl X, 16), 16) is the shl/ashr spelling of a sign extension, so it
+  // selects the bottom-half form again, provided the shl is by exactly 16.
+  ConstantSDNode *ShlAmt = dyn_cast<ConstantSDNode>(Shifted.getOperand(1));
+  if (!ShlAmt || ShlAmt->getZExtValue() != 16)
+    return false;
+
+  *Opc = BottomOpc;
+  Src1 = Shifted.getOperand(0);
+  return true;
+}
+
+/// Recognise the wide-multiply half of an SMLAW[B|T] / SMULW[B|T] pattern:
+///   (or (srl (smul_lohi ?, ?):0, 16), (shl (smul_lohi ?, ?):1, 16))
+/// where both shifts read the same SMUL_LOHI node and one of its multiplicands
+/// is accepted by SearchSignedMulShort.
+///
+/// \param OR          the candidate OR value.
+/// \param Opc         forwarded to SearchSignedMulShort, which sets it on
+///                    success.
+/// \param Src0        receives the other SMUL_LOHI operand (the one not
+///                    matched as the 16-bit half).
+/// \param Src1        forwarded to SearchSignedMulShort, which sets it on
+///                    success.
+/// \param Accumulate  forwarded to SearchSignedMulShort.
+/// \returns true when the whole pattern matched.
+static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
+                                SDValue &Src1, bool Accumulate) {
+  if (OR.getOpcode() != ISD::OR)
+    return false;
+
+  // The srl and the shl may appear in either operand position of the or.
+  SDValue RightShift = OR.getOperand(0);
+  SDValue LeftShift = OR.getOperand(1);
+  if (RightShift.getOpcode() != ISD::SRL ||
+      LeftShift.getOpcode() != ISD::SHL) {
+    RightShift = OR.getOperand(1);
+    LeftShift = OR.getOperand(0);
+    if (RightShift.getOpcode() != ISD::SRL ||
+        LeftShift.getOpcode() != ISD::SHL)
+      return false;
+  }
+
+  // Both shift amounts must be the constant 16.
+  ConstantSDNode *SrlAmt = dyn_cast<ConstantSDNode>(RightShift.getOperand(1));
+  ConstantSDNode *ShlAmt = dyn_cast<ConstantSDNode>(LeftShift.getOperand(1));
+  const bool BothBy16 = SrlAmt && ShlAmt && SrlAmt->getZExtValue() == 16 &&
+                        ShlAmt->getZExtValue() == 16;
+  if (!BothBy16)
+    return false;
+
+  // The shifted values need to be the two results of one smul_lohi node:
+  // result 0 feeds the srl and result 1 feeds the shl.
+  SDValue SrlInput = RightShift.getOperand(0);
+  SDValue ShlInput = LeftShift.getOperand(0);
+  SDNode *Mul = SrlInput.getNode();
+  if (Mul != ShlInput.getNode() || SrlInput.getOpcode() != ISD::SMUL_LOHI)
+    return false;
+  if (SrlInput != SDValue(Mul, 0) || ShlInput != SDValue(Mul, 1))
+    return false;
+
+  // At this point we have
+  //   (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
+  // For SMLAW[B|T] the smul_lohi takes one 32-bit and one 16-bit argument.
+  // For the B form the 16-bit value is sign extended in some way; for the T
+  // form only the SRA is required. Try each multiplicand as the 16-bit one,
+  // operand 0 first.
+  for (unsigned ShortIdx = 0; ShortIdx != 2; ++ShortIdx) {
+    if (SearchSignedMulShort(Mul->getOperand(ShortIdx), Opc, Src1,
+                             Accumulate)) {
+      Src0 = Mul->getOperand(1 - ShortIdx);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Try to select an ADD or OR node as SMLAW[B|T] or SMULW[B|T] respectively.
+///
+/// An OR that SearchSignedMulLong accepts becomes the non-accumulating
+/// multiply; an ADD with such an OR as one operand becomes the accumulating
+/// multiply, with the other ADD operand as the accumulator. Requires v6 ops,
+/// and Thumb2 when in Thumb mode.
+///
+/// \returns true if \p N was selected.
+bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) {
+  const bool Supported = Subtarget->hasV6Ops() &&
+                         (!Subtarget->isThumb() || Subtarget->hasThumb2());
+  if (!Supported)
+    return false;
+
+  SDLoc dl(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue MulSrc0, MulSrc1;
+  unsigned Opc = 0;
+  const unsigned NodeKind = N->getOpcode();
+
+  // Non-accumulating form: the node itself is the OR of the two shifts.
+  if (NodeKind == ISD::OR) {
+    if (!SearchSignedMulLong(SDValue(N, 0), &Opc, MulSrc0, MulSrc1, false))
+      return false;
+    if (Opc == 0)
+      return false;
+
+    SDValue Ops[] = { MulSrc0, MulSrc1, getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32) };
+    CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+    return true;
+  }
+
+  if (NodeKind != ISD::ADD)
+    return false;
+
+  // Accumulating form: one ADD operand must be the OR, the other one is the
+  // accumulator. Bail out early when neither operand is an OR.
+  if (LHS.getOpcode() != ISD::OR && RHS.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue Acc;
+  if (SearchSignedMulLong(LHS, &Opc, MulSrc0, MulSrc1, true))
+    Acc = RHS;
+  else if (SearchSignedMulLong(RHS, &Opc, MulSrc0, MulSrc1, true))
+    Acc = LHS;
+  else
+    return false;
+
+  if (Opc == 0)
+    return false;
+
+  SDValue Ops[] = { MulSrc0, MulSrc1, Acc, getAL(CurDAG, dl),
+                    CurDAG->getRegister(0, MVT::i32) };
+  CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
+  return true;
+}
+
+/// Select a compare-and-swap node to one of the CMP_SWAP_{8,16,32}
+/// pseudo-instructions, chosen by the node's memory type.
+///
+/// The pseudo is built from operands 1, 2, 3 of \p N followed by operand 0,
+/// and inherits the memory operand of \p N. Value 0 of \p N is replaced by
+/// result 0 of the pseudo and value 1 of \p N by result 2; \p N is then
+/// removed from the DAG.
+void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+  MemSDNode *MemNode = cast<MemSDNode>(N);
+  EVT AccessVT = MemNode->getMemoryVT();
+
+  // Only 8-, 16- and 32-bit accesses have a pseudo.
+  unsigned PseudoOpc;
+  if (AccessVT == MVT::i32)
+    PseudoOpc = ARM::CMP_SWAP_32;
+  else if (AccessVT == MVT::i16)
+    PseudoOpc = ARM::CMP_SWAP_16;
+  else if (AccessVT == MVT::i8)
+    PseudoOpc = ARM::CMP_SWAP_8;
+  else
+    llvm_unreachable("Unknown AtomicCmpSwap type");
+
+  // Operand 0 of N goes last in the machine node's operand list.
+  SDValue PseudoOps[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
+                         N->getOperand(0)};
+  SDNode *Pseudo = CurDAG->getMachineNode(
+      PseudoOpc, SDLoc(N),
+      CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other), PseudoOps);
+
+  // Carry the memory operand over to the new machine node.
+  MachineSDNode::mmo_iterator Refs = MF->allocateMemRefsArray(1);
+  Refs[0] = MemNode->getMemOperand();
+  cast<MachineSDNode>(Pseudo)->setMemRefs(Refs, Refs + 1);
+
+  ReplaceUses(SDValue(N, 0), SDValue(Pseudo, 0));
+  ReplaceUses(SDValue(N, 1), SDValue(Pseudo, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+/// Select a CONCAT_VECTORS node as a D-register pair built from its two
+/// operands. Any other shape is treated as unreachable.
+void ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
+  EVT ResultVT = N->getValueType(0);
+
+  // A CONCAT_VECTORS can only have legal types when two 64-bit vectors are
+  // joined into one 128-bit vector.
+  const bool IsPairConcat =
+      ResultVT.is128BitVector() && N->getNumOperands() == 2;
+  if (!IsPairConcat)
+    llvm_unreachable("unexpected CONCAT_VECTORS");
+
+  SDNode *Pair =
+      createDRegPairNode(ResultVT, N->getOperand(0), N->getOperand(1));
+  ReplaceNode(N, Pair);
+}
+
+/// If the set bits of \p A form a single contiguous run, return the pair
+/// (index of the highest set bit, index of the lowest set bit); otherwise
+/// return an empty Optional.
+static Optional<std::pair<unsigned, unsigned>>
+getContiguousRangeOfSetBits(const APInt &A) {
+  using BitRange = std::pair<unsigned, unsigned>;
+
+  const unsigned HighestSet = A.getBitWidth() - A.countLeadingZeros() - 1;
+  const unsigned LowestSet = A.countTrailingZeros();
+
+  // The run is contiguous exactly when the number of set bits equals the
+  // distance spanned between the highest and the lowest one (inclusive).
+  const unsigned SpanLength = HighestSet - LowestSet + 1;
+  if (A.countPopulation() == SpanLength)
+    return BitRange(HighestSet, LowestSet);
+
+  return Optional<BitRange>();
+}
+
+/// Try to replace the AND feeding an ARMISD::CMPZ-against-zero with Thumb
+/// shift instruction(s) that leave the same "is zero" answer.
+///
+/// \param N                 the ARMISD::CMPZ node (asserted).
+/// \param SwitchEQNEToPLMI  always written: set to true only when a single
+///                          tested bit was shifted into bit 31, in which case
+///                          the user of the comparison has to test PL/MI
+///                          instead of EQ/NE; false otherwise.
+///
+/// Nothing is changed unless the subtarget is Thumb, operand 0 is a
+/// single-use AND of a single-use value with a constant whose set bits are
+/// contiguous, and operand 1 is the constant zero.
+void ARMDAGToDAGISel::SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI) {
+ assert(N->getOpcode() == ARMISD::CMPZ);
+ SwitchEQNEToPLMI = false;
+
+ if (!Subtarget->isThumb())
+ // FIXME: Work out whether it is profitable to do this in A32 mode - LSL and
+ // LSR don't exist as standalone instructions - they need the barrel shifter.
+ return;
+
+ // select (cmpz (and X, C), #0) -> (LSLS X) or (LSRS X) or (LSRS (LSLS X))
+ SDValue And = N->getOperand(0);
+ if (!And->hasOneUse())
+ return;
+
+ // Operand 1 must be the constant 0 and operand 0 must really be an AND.
+ SDValue Zero = N->getOperand(1);
+ if (!isa<ConstantSDNode>(Zero) || !cast<ConstantSDNode>(Zero)->isNullValue() ||
+ And->getOpcode() != ISD::AND)
+ return;
+ SDValue X = And.getOperand(0);
+ auto C = dyn_cast<ConstantSDNode>(And.getOperand(1));
+
+ if (!C || !X->hasOneUse())
+ return;
+ // Range->first is the index of the highest set bit of the mask and
+ // Range->second the index of the lowest one; no value means the set bits
+ // are not contiguous.
+ auto Range = getContiguousRangeOfSetBits(C->getAPIntValue());
+ if (!Range)
+ return;
+
+ // There are several ways to lower this:
+ SDNode *NewN;
+ SDLoc dl(N);
+
+ // Build a shift-by-immediate machine node for Src. Callers pass the Thumb1
+ // opcode (tLSLri or tLSRri); on Thumb2 it is mapped to t2LSLri / t2LSRri and
+ // a different operand layout is used. The Thumb1 layout passes the CPSR
+ // register as its first operand.
+ auto EmitShift = [&](unsigned Opc, SDValue Src, unsigned Imm) -> SDNode* {
+ if (Subtarget->isThumb2()) {
+ Opc = (Opc == ARM::tLSLri) ? ARM::t2LSLri : ARM::t2LSRri;
+ SDValue Ops[] = { Src, CurDAG->getTargetConstant(Imm, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops);
+ } else {
+ SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), Src,
+ CurDAG->getTargetConstant(Imm, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
+ return CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops);
+ }
+ };
+
+ // NOTE(review): the constants 31 below presumably assume a 32-bit mask;
+ // confirm that CMPZ operands are always i32 here.
+ if (Range->second == 0) {
+ // 1. Mask includes the LSB -> Simply shift the top N bits off
+ NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first);
+ ReplaceNode(And.getNode(), NewN);
+ } else if (Range->first == 31) {
+ // 2. Mask includes the MSB -> Simply shift the bottom N bits off
+ NewN = EmitShift(ARM::tLSRri, X, Range->second);
+ ReplaceNode(And.getNode(), NewN);
+ } else if (Range->first == Range->second) {
+ // 3. Only one bit is set. We can shift this into the sign bit and use a
+ // PL/MI comparison.
+ NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first);
+ ReplaceNode(And.getNode(), NewN);
+
+ SwitchEQNEToPLMI = true;
+ } else if (!Subtarget->hasV6T2Ops()) {
+ // 4. Do a double shift to clear bottom and top bits, but only in
+ // thumb-1 mode as in thumb-2 we can use UBFX.
+ // The left shift moves the highest mask bit to bit 31; the right shift
+ // then undoes that amount plus Range->second.
+ NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first);
+ NewN = EmitShift(ARM::tLSRri, SDValue(NewN, 0),
+ Range->second + (31 - Range->first));
+ ReplaceNode(And.getNode(), NewN);
+ }
+ // In every other case the AND is left untouched.
+
+}
+
+void ARMDAGToDAGISel::Select(SDNode *N) {
+ SDLoc dl(N);
+
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
+ return; // Already selected.
+ }
+
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::OR:
+ if (trySMLAWSMULW(N))
+ return;
+ break;
+ case ISD::WRITE_REGISTER:
+ if (tryWriteRegister(N))
+ return;
+ break;
+ case ISD::READ_REGISTER:
+ if (tryReadRegister(N))
+ return;
+ break;
+ case ISD::INLINEASM:
+ if (tryInlineAsm(N))
+ return;
+ break;
+ case ISD::XOR:
+ // Select special operations if XOR node forms integer ABS pattern
+ if (tryABSOp(N))
+ return;
+ // Other cases are autogenerated.
+ break;
+ case ISD::Constant: {
+ unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
+ // If we can't materialize the constant we need to use a literal pool
+ if (ConstantMaterializationCost(Val) > 2) {
+ SDValue CPIdx = CurDAG->getTargetConstantPool(
+ ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
+ TLI->getPointerTy(CurDAG->getDataLayout()));
+
+ SDNode *ResNode;
+ if (Subtarget->isThumb()) {
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
+ ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other,
+ Ops);
+ } else {
+ SDValue Ops[] = {
+ CPIdx,
+ CurDAG->getTargetConstant(0, dl, MVT::i32),
+ getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getEntryNode()
+ };
+ ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+ Ops);
+ }
+ ReplaceNode(N, ResNode);
+ return;
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+ case ISD::FrameIndex: {
+ // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ if (Subtarget->isThumb1Only()) {
+ // Set the alignment of the frame object to 4, to avoid having to generate
+ // more than one ADD
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ if (MFI.getObjectAlignment(FI) < 4)
+ MFI.setObjectAlignment(FI, 4);
+ CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
+ CurDAG->getTargetConstant(0, dl, MVT::i32));
+ return;
+ } else {
+ unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
+ ARM::t2ADDri : ARM::ADDri);
+ SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return;
+ }
+ }
+ case ISD::SRL:
+ if (tryV6T2BitfieldExtractOp(N, false))
+ return;
+ break;
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::SRA:
+ if (tryV6T2BitfieldExtractOp(N, true))
+ return;
+ break;
+ case ISD::MUL:
+ if (Subtarget->isThumb1Only())
+ break;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ unsigned RHSV = C->getZExtValue();
+ if (!RHSV) break;
+ if (isPowerOf2_32(RHSV-1)) { // 2^n+1?
+ unsigned ShImm = Log2_32(RHSV-1);
+ if (ShImm >= 32)
+ break;
+ SDValue V = N->getOperand(0);
+ ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+ SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
+ CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+ return;
+ } else {
+ SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
+ Reg0 };
+ CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+ return;
+ }
+ }
+ if (isPowerOf2_32(RHSV+1)) { // 2^n-1?
+ unsigned ShImm = Log2_32(RHSV+1);
+ if (ShImm >= 32)
+ break;
+ SDValue V = N->getOperand(0);
+ ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+ SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
+ CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+ return;
+ } else {
+ SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
+ Reg0 };
+ CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+ return;
+ }
+ }
+ }
+ break;
+ case ISD::AND: {
+ // Check for unsigned bitfield extract
+ if (tryV6T2BitfieldExtractOp(N, false))
+ return;
+
+ // If an immediate is used in an AND node, it is possible that the immediate
+ // can be more optimally materialized when negated. If this is the case we
+ // can negate the immediate and use a BIC instead.
+ auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (N1C && N1C->hasOneUse() && Subtarget->isThumb()) {
+ uint32_t Imm = (uint32_t) N1C->getZExtValue();
+
+ // In Thumb2 mode, an AND can take a 12-bit immediate. If this
+ // immediate can be negated and fit in the immediate operand of
+ // a t2BIC, don't do any manual transform here as this can be
+ // handled by the generic ISel machinery.
+ bool PreferImmediateEncoding =
+ Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm));
+ if (!PreferImmediateEncoding &&
+ ConstantMaterializationCost(Imm) >
+ ConstantMaterializationCost(~Imm)) {
+ // The current immediate costs more to materialize than a negated
+ // immediate, so negate the immediate and use a BIC.
+ SDValue NewImm =
+ CurDAG->getConstant(~N1C->getZExtValue(), dl, MVT::i32);
+ // If the new constant didn't exist before, reposition it in the topological
+ // ordering so it is just before N. Otherwise, don't touch its location.
+ if (NewImm->getNodeId() == -1)
+ CurDAG->RepositionNode(N->getIterator(), NewImm.getNode());
+
+ if (!Subtarget->hasThumb2()) {
+ SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32),
+ N->getOperand(0), NewImm, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)};
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::tBIC, dl, MVT::i32, Ops));
+ return;
+ } else {
+ SDValue Ops[] = {N->getOperand(0), NewImm, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32)};
+ ReplaceNode(N,
+ CurDAG->getMachineNode(ARM::t2BICrr, dl, MVT::i32, Ops));
+ return;
+ }
+ }
+ }
+
+ // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
+ // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
+ // are entirely contributed by c2 and lower 16-bits are entirely contributed
+ // by x. That's equal to (or (and x, 0xffff), (and c1, 0xffff0000)).
+ // Select it to: "movt x, ((c2 & 0xffff0000) >> 16)"
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32)
+ break;
+ unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2())
+ ? ARM::t2MOVTi16
+ : (Subtarget->hasV6T2Ops() ? ARM::MOVTi16 : 0);
+ if (!Opc)
+ break;
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ N1C = dyn_cast<ConstantSDNode>(N1);
+ if (!N1C)
+ break;
+ if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
+ SDValue N2 = N0.getOperand(1);
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+ if (!N2C)
+ break;
+ unsigned N1CVal = N1C->getZExtValue();
+ unsigned N2CVal = N2C->getZExtValue();
+ if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) &&
+ (N1CVal & 0xffffU) == 0xffffU &&
+ (N2CVal & 0xffffU) == 0x0U) {
+ SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16,
+ dl, MVT::i32);
+ SDValue Ops[] = { N0.getOperand(0), Imm16,
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
+ return;
+ }
+ }
+
+ break;
+ }
+ case ARMISD::VMOVRRD:
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
+ N->getOperand(0), getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)));
+ return;
+ case ISD::UMUL_LOHI: {
+ if (Subtarget->isThumb1Only())
+ break;
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops));
+ return;
+ } else {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
+ }
+ }
+ case ISD::SMUL_LOHI: {
+ if (Subtarget->isThumb1Only())
+ break;
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops));
+ return;
+ } else {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
+ }
+ }
+ case ARMISD::UMAAL: {
+ unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3),
+ getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::i32, Ops));
+ return;
+ }
+ case ARMISD::UMLAL:{
+ // UMAAL is similar to UMLAL but it adds two 32-bit values to the
+ // 64-bit multiplication result.
+ if (Subtarget->hasV6Ops() && Subtarget->hasDSP() &&
+ N->getOperand(2).getOpcode() == ARMISD::ADDC &&
+ N->getOperand(3).getOpcode() == ARMISD::ADDE) {
+
+ SDValue Addc = N->getOperand(2);
+ SDValue Adde = N->getOperand(3);
+
+ if (Adde.getOperand(2).getNode() == Addc.getNode()) {
+
+ ConstantSDNode *Op0 = dyn_cast<ConstantSDNode>(Adde.getOperand(0));
+ ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Adde.getOperand(1));
+
+ if (Op0 && Op1 && Op0->getZExtValue() == 0 && Op1->getZExtValue() == 0)
+ {
+ // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm
+ // RdLo = one operand to be added, lower 32-bits of res
+ // RdHi = other operand to be added, upper 32-bits of res
+ // Rn = first multiply operand
+ // Rm = second multiply operand
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ Addc.getOperand(0), Addc.getOperand(1),
+ getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32) };
+ unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+ CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops);
+ return;
+ }
+ }
+ }
+
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)};
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops));
+ return;
+ }else{
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::UMLAL : ARM::UMLALv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
+ }
+ }
+ case ARMISD::SMLAL:{
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)};
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops));
+ return;
+ }else{
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
+ }
+ }
+ case ARMISD::SUBE: {
+ if (!Subtarget->hasV6Ops())
+ break;
+ // Look for a pattern to match SMMLS
+ // (sube a, (smul_loHi a, b), (subc 0, (smul_LOhi(a, b))))
+ if (N->getOperand(1).getOpcode() != ISD::SMUL_LOHI ||
+ N->getOperand(2).getOpcode() != ARMISD::SUBC ||
+ !SDValue(N, 1).use_empty())
+ break;
+
+ if (Subtarget->isThumb())
+ assert(Subtarget->hasThumb2() &&
+ "This pattern should not be generated for Thumb");
+
+ SDValue SmulLoHi = N->getOperand(1);
+ SDValue Subc = N->getOperand(2);
+ auto *Zero = dyn_cast<ConstantSDNode>(Subc.getOperand(0));
+
+ if (!Zero || Zero->getZExtValue() != 0 ||
+ Subc.getOperand(1) != SmulLoHi.getValue(0) ||
+ N->getOperand(1) != SmulLoHi.getValue(1) ||
+ N->getOperand(2) != Subc.getValue(1))
+ break;
+
+ unsigned Opc = Subtarget->isThumb2() ? ARM::t2SMMLS : ARM::SMMLS;
+ SDValue Ops[] = { SmulLoHi.getOperand(0), SmulLoHi.getOperand(1),
+ N->getOperand(0), getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops));
+ return;
+ }
+ case ISD::LOAD: {
+ if (Subtarget->isThumb() && Subtarget->hasThumb2()) {
+ if (tryT2IndexedLoad(N))
+ return;
+ } else if (Subtarget->isThumb()) {
+ if (tryT1IndexedLoad(N))
+ return;
+ } else if (tryARMIndexedLoad(N))
+ return;
+ // Other cases are autogenerated.
+ break;
+ }
+ case ARMISD::BRCOND: {
+ // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+ // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+ // Pattern complexity = 6 cost = 1 size = 0
+
+ // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+ // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc)
+ // Pattern complexity = 6 cost = 1 size = 0
+
+ // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+ // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+ // Pattern complexity = 6 cost = 1 size = 0
+
+ unsigned Opc = Subtarget->isThumb() ?
+ ((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDValue N3 = N->getOperand(3);
+ SDValue InFlag = N->getOperand(4);
+ assert(N1.getOpcode() == ISD::BasicBlock);
+ assert(N2.getOpcode() == ISD::Constant);
+ assert(N3.getOpcode() == ISD::Register);
+
+ unsigned CC = (unsigned) cast<ConstantSDNode>(N2)->getZExtValue();
+
+ if (InFlag.getOpcode() == ARMISD::CMPZ) {
+ bool SwitchEQNEToPLMI;
+ SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI);
+ InFlag = N->getOperand(4);
+
+ if (SwitchEQNEToPLMI) {
+ switch ((ARMCC::CondCodes)CC) {
+ default: llvm_unreachable("CMPZ must be either NE or EQ!");
+ case ARMCC::NE:
+ CC = (unsigned)ARMCC::MI;
+ break;
+ case ARMCC::EQ:
+ CC = (unsigned)ARMCC::PL;
+ break;
+ }
+ }
+ }
+
+ SDValue Tmp2 = CurDAG->getTargetConstant(CC, dl, MVT::i32);
+ SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag };
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+ MVT::Glue, Ops);
+ Chain = SDValue(ResNode, 0);
+ if (N->getNumValues() == 2) {
+ InFlag = SDValue(ResNode, 1);
+ ReplaceUses(SDValue(N, 1), InFlag);
+ }
+ ReplaceUses(SDValue(N, 0),
+ SDValue(Chain.getNode(), Chain.getResNo()));
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
+
+ case ARMISD::CMPZ: {
+ // select (CMPZ X, #-C) -> (CMPZ (ADDS X, #C), #0)
+ // This allows us to avoid materializing the expensive negative constant.
+ // The CMPZ #0 is useless and will be peepholed away but we need to keep it
+ // for its glue output.
+ SDValue X = N->getOperand(0);
+ auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1).getNode());
+ if (C && C->getSExtValue() < 0 && Subtarget->isThumb()) {
+ int64_t Addend = -C->getSExtValue();
+
+ SDNode *Add = nullptr;
+ // In T2 mode, ADDS can be better than CMN if the immediate fits in a
+ // 16-bit ADDS, which means either [0,256) for tADDi8 or [0,8) for tADDi3.
+ // Outside that range we can just use a CMN which is 32-bit but has a
+ // 12-bit immediate range.
+ if (Subtarget->isThumb2() && Addend < 1<<8) {
+ SDValue Ops[] = { X, CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ Add = CurDAG->getMachineNode(ARM::t2ADDri, dl, MVT::i32, Ops);
+ } else if (!Subtarget->isThumb2() && Addend < 1<<8) {
+ // FIXME: Add T1 tADDi8 code.
+ SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
+ CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
+ Add = CurDAG->getMachineNode(ARM::tADDi8, dl, MVT::i32, Ops);
+ } else if (!Subtarget->isThumb2() && Addend < 1<<3) {
+ SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
+ CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
+ Add = CurDAG->getMachineNode(ARM::tADDi3, dl, MVT::i32, Ops);
+ }
+ if (Add) {
+ SDValue Ops2[] = {SDValue(Add, 0), CurDAG->getConstant(0, dl, MVT::i32)};
+ CurDAG->MorphNodeTo(N, ARMISD::CMPZ, CurDAG->getVTList(MVT::Glue), Ops2);
+ }
+ }
+ // Other cases are autogenerated.
+ break;
+ }
+
+ case ARMISD::CMOV: {
+ SDValue InFlag = N->getOperand(4);
+
+ if (InFlag.getOpcode() == ARMISD::CMPZ) {
+ bool SwitchEQNEToPLMI;
+ SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI);
+
+ if (SwitchEQNEToPLMI) {
+ SDValue ARMcc = N->getOperand(2);
+ ARMCC::CondCodes CC =
+ (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+ switch (CC) {
+ default: llvm_unreachable("CMPZ must be either NE or EQ!");
+ case ARMCC::NE:
+ CC = ARMCC::MI;
+ break;
+ case ARMCC::EQ:
+ CC = ARMCC::PL;
+ break;
+ }
+ SDValue NewARMcc = CurDAG->getConstant((unsigned)CC, dl, MVT::i32);
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(1), NewARMcc,
+ N->getOperand(3), N->getOperand(4)};
+ CurDAG->MorphNodeTo(N, ARMISD::CMOV, N->getVTList(), Ops);
+ }
+
+ }
+ // Other cases are autogenerated.
+ break;
+ }
+
+ case ARMISD::VZIP: {
+ unsigned Opc = 0;
+ EVT VT = N->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return;
+ case MVT::v8i8: Opc = ARM::VZIPd8; break;
+ case MVT::v4i16: Opc = ARM::VZIPd16; break;
+ case MVT::v2f32:
+ // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+ case MVT::v2i32: Opc = ARM::VTRNd32; break;
+ case MVT::v16i8: Opc = ARM::VZIPq8; break;
+ case MVT::v8i16: Opc = ARM::VZIPq16; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = ARM::VZIPq32; break;
+ }
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+ return;
+ }
+ case ARMISD::VUZP: {
+ unsigned Opc = 0;
+ EVT VT = N->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return;
+ case MVT::v8i8: Opc = ARM::VUZPd8; break;
+ case MVT::v4i16: Opc = ARM::VUZPd16; break;
+ case MVT::v2f32:
+ // vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+ case MVT::v2i32: Opc = ARM::VTRNd32; break;
+ case MVT::v16i8: Opc = ARM::VUZPq8; break;
+ case MVT::v8i16: Opc = ARM::VUZPq16; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = ARM::VUZPq32; break;
+ }
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+ return;
+ }
+ case ARMISD::VTRN: {
+ unsigned Opc = 0;
+ EVT VT = N->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return;
+ case MVT::v8i8: Opc = ARM::VTRNd8; break;
+ case MVT::v4i16: Opc = ARM::VTRNd16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VTRNd32; break;
+ case MVT::v16i8: Opc = ARM::VTRNq8; break;
+ case MVT::v8i16: Opc = ARM::VTRNq16; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = ARM::VTRNq32; break;
+ }
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+ return;
+ }
+ case ARMISD::BUILD_VECTOR: {
+ EVT VecVT = N->getValueType(0);
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned NumElts = VecVT.getVectorNumElements();
+ if (EltVT == MVT::f64) {
+ assert(NumElts == 2 && "unexpected type for BUILD_VECTOR");
+ ReplaceNode(
+ N, createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+ return;
+ }
+ assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR");
+ if (NumElts == 2) {
+ ReplaceNode(
+ N, createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+ return;
+ }
+ assert(NumElts == 4 && "unexpected type for BUILD_VECTOR");
+ ReplaceNode(N,
+ createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3)));
+ return;
+ }
+
+ case ARMISD::VLD1DUP: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8, ARM::VLD1DUPd16,
+ ARM::VLD1DUPd32 };
+ static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
+ ARM::VLD1DUPq32 };
+ SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case ARMISD::VLD2DUP: {
+ static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+ ARM::VLD2DUPd32 };
+ SelectVLDDup(N, false, 2, Opcodes);
+ return;
+ }
+
+ case ARMISD::VLD3DUP: {
+ static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
+ ARM::VLD3DUPd16Pseudo,
+ ARM::VLD3DUPd32Pseudo };
+ SelectVLDDup(N, false, 3, Opcodes);
+ return;
+ }
+
+ case ARMISD::VLD4DUP: {
+ static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
+ ARM::VLD4DUPd16Pseudo,
+ ARM::VLD4DUPd32Pseudo };
+ SelectVLDDup(N, false, 4, Opcodes);
+ return;
+ }
+
+ case ARMISD::VLD1DUP_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8wb_fixed,
+ ARM::VLD1DUPd16wb_fixed,
+ ARM::VLD1DUPd32wb_fixed };
+ static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
+ ARM::VLD1DUPq16wb_fixed,
+ ARM::VLD1DUPq32wb_fixed };
+ SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case ARMISD::VLD2DUP_UPD: {
+ static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
+ ARM::VLD2DUPd16wb_fixed,
+ ARM::VLD2DUPd32wb_fixed };
+ SelectVLDDup(N, true, 2, Opcodes);
+ return;
+ }
+
+ case ARMISD::VLD3DUP_UPD: {
+ static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
+ ARM::VLD3DUPd16Pseudo_UPD,
+ ARM::VLD3DUPd32Pseudo_UPD };
+ SelectVLDDup(N, true, 3, Opcodes);
+ return;
+ }
+
+ case ARMISD::VLD4DUP_UPD: {
+ static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
+ ARM::VLD4DUPd16Pseudo_UPD,
+ ARM::VLD4DUPd32Pseudo_UPD };
+ SelectVLDDup(N, true, 4, Opcodes);
+ return;
+ }
+
+ case ARMISD::VLD1_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1d8wb_fixed,
+ ARM::VLD1d16wb_fixed,
+ ARM::VLD1d32wb_fixed,
+ ARM::VLD1d64wb_fixed };
+ static const uint16_t QOpcodes[] = { ARM::VLD1q8wb_fixed,
+ ARM::VLD1q16wb_fixed,
+ ARM::VLD1q32wb_fixed,
+ ARM::VLD1q64wb_fixed };
+ SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case ARMISD::VLD2_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed,
+ ARM::VLD2d16wb_fixed,
+ ARM::VLD2d32wb_fixed,
+ ARM::VLD1q64wb_fixed};
+ static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
+ ARM::VLD2q16PseudoWB_fixed,
+ ARM::VLD2q32PseudoWB_fixed };
+ SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case ARMISD::VLD3_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD,
+ ARM::VLD3d16Pseudo_UPD,
+ ARM::VLD3d32Pseudo_UPD,
+ ARM::VLD1d64TPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+ ARM::VLD3q16Pseudo_UPD,
+ ARM::VLD3q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
+ ARM::VLD3q16oddPseudo_UPD,
+ ARM::VLD3q32oddPseudo_UPD };
+ SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case ARMISD::VLD4_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD,
+ ARM::VLD4d16Pseudo_UPD,
+ ARM::VLD4d32Pseudo_UPD,
+ ARM::VLD1d64QPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+ ARM::VLD4q16Pseudo_UPD,
+ ARM::VLD4q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
+ ARM::VLD4q16oddPseudo_UPD,
+ ARM::VLD4q32oddPseudo_UPD };
+ SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case ARMISD::VLD2LN_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD,
+ ARM::VLD2LNd16Pseudo_UPD,
+ ARM::VLD2LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
+ ARM::VLD2LNq32Pseudo_UPD };
+ SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case ARMISD::VLD3LN_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD,
+ ARM::VLD3LNd16Pseudo_UPD,
+ ARM::VLD3LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
+ ARM::VLD3LNq32Pseudo_UPD };
+ SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case ARMISD::VLD4LN_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD,
+ ARM::VLD4LNd16Pseudo_UPD,
+ ARM::VLD4LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
+ ARM::VLD4LNq32Pseudo_UPD };
+ SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case ARMISD::VST1_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VST1d8wb_fixed,
+ ARM::VST1d16wb_fixed,
+ ARM::VST1d32wb_fixed,
+ ARM::VST1d64wb_fixed };
+ static const uint16_t QOpcodes[] = { ARM::VST1q8wb_fixed,
+ ARM::VST1q16wb_fixed,
+ ARM::VST1q32wb_fixed,
+ ARM::VST1q64wb_fixed };
+ SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case ARMISD::VST2_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed,
+ ARM::VST2d16wb_fixed,
+ ARM::VST2d32wb_fixed,
+ ARM::VST1q64wb_fixed};
+ static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
+ ARM::VST2q16PseudoWB_fixed,
+ ARM::VST2q32PseudoWB_fixed };
+ SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case ARMISD::VST3_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo_UPD,
+ ARM::VST3d16Pseudo_UPD,
+ ARM::VST3d32Pseudo_UPD,
+ ARM::VST1d64TPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+ ARM::VST3q16Pseudo_UPD,
+ ARM::VST3q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
+ ARM::VST3q16oddPseudo_UPD,
+ ARM::VST3q32oddPseudo_UPD };
+ SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case ARMISD::VST4_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD,
+ ARM::VST4d16Pseudo_UPD,
+ ARM::VST4d32Pseudo_UPD,
+ ARM::VST1d64QPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+ ARM::VST4q16Pseudo_UPD,
+ ARM::VST4q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
+ ARM::VST4q16oddPseudo_UPD,
+ ARM::VST4q32oddPseudo_UPD };
+ SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case ARMISD::VST2LN_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD,
+ ARM::VST2LNd16Pseudo_UPD,
+ ARM::VST2LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
+ ARM::VST2LNq32Pseudo_UPD };
+ SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case ARMISD::VST3LN_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD,
+ ARM::VST3LNd16Pseudo_UPD,
+ ARM::VST3LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
+ ARM::VST3LNq32Pseudo_UPD };
+ SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case ARMISD::VST4LN_UPD: {
+ static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD,
+ ARM::VST4LNd16Pseudo_UPD,
+ ARM::VST4LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
+ ARM::VST4LNq32Pseudo_UPD };
+ SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+
+ case Intrinsic::arm_mrrc:
+ case Intrinsic::arm_mrrc2: {
+ SDLoc dl(N);
+ SDValue Chain = N->getOperand(0);
+ unsigned Opc;
+
+ if (Subtarget->isThumb())
+ Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::t2MRRC : ARM::t2MRRC2);
+ else
+ Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::MRRC : ARM::MRRC2);
+
+ SmallVector<SDValue, 5> Ops;
+ Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(), dl)); /* coproc */
+ Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), dl)); /* opc */
+ Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(4))->getZExtValue(), dl)); /* CRm */
+
+ // The mrrc2 instruction in ARM doesn't allow predicates, the top 4 bits of the encoded
+ // instruction will always be '1111' but it is possible in assembly language to specify
+ // AL as a predicate to mrrc2 but it doesn't make any difference to the encoded instruction.
+ if (Opc != ARM::MRRC2) {
+ Ops.push_back(getAL(CurDAG, dl));
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+ }
+
+ Ops.push_back(Chain);
+
+ // Writes to two registers.
+ const EVT RetType[] = {MVT::i32, MVT::i32, MVT::Other};
+
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, RetType, Ops));
+ return;
+ }
+ case Intrinsic::arm_ldaexd:
+ case Intrinsic::arm_ldrexd: {
+ SDLoc dl(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue MemAddr = N->getOperand(2);
+ bool isThumb = Subtarget->isThumb() && Subtarget->hasV8MBaselineOps();
+
+ bool IsAcquire = IntNo == Intrinsic::arm_ldaexd;
+ unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD)
+ : (IsAcquire ? ARM::LDAEXD : ARM::LDREXD);
+
+ // arm_ldrexd returns a i64 value in {i32, i32}
+ std::vector<EVT> ResTys;
+ if (isThumb) {
+ ResTys.push_back(MVT::i32);
+ ResTys.push_back(MVT::i32);
+ } else
+ ResTys.push_back(MVT::Untyped);
+ ResTys.push_back(MVT::Other);
+
+ // Place arguments in the right order.
+ SDValue Ops[] = {MemAddr, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32), Chain};
+ SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+ // Remap uses.
+ SDValue OutChain = isThumb ? SDValue(Ld, 2) : SDValue(Ld, 1);
+ if (!SDValue(N, 0).use_empty()) {
+ SDValue Result;
+ if (isThumb)
+ Result = SDValue(Ld, 0);
+ else {
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(ARM::gsub_0, dl, MVT::i32);
+ SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+ Result = SDValue(ResNode,0);
+ }
+ ReplaceUses(SDValue(N, 0), Result);
+ }
+ if (!SDValue(N, 1).use_empty()) {
+ SDValue Result;
+ if (isThumb)
+ Result = SDValue(Ld, 1);
+ else {
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(ARM::gsub_1, dl, MVT::i32);
+ SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+ Result = SDValue(ResNode,0);
+ }
+ ReplaceUses(SDValue(N, 1), Result);
+ }
+ ReplaceUses(SDValue(N, 2), OutChain);
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
+ case Intrinsic::arm_stlexd:
+ case Intrinsic::arm_strexd: {
+ SDLoc dl(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Val0 = N->getOperand(2);
+ SDValue Val1 = N->getOperand(3);
+ SDValue MemAddr = N->getOperand(4);
+
+ // Store exclusive double return a i32 value which is the return status
+ // of the issued store.
+ const EVT ResTys[] = {MVT::i32, MVT::Other};
+
+ bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+ // Place arguments in the right order.
+ SmallVector<SDValue, 7> Ops;
+ if (isThumb) {
+ Ops.push_back(Val0);
+ Ops.push_back(Val1);
+ } else
+ // arm_strexd uses GPRPair.
+ Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, Val0, Val1), 0));
+ Ops.push_back(MemAddr);
+ Ops.push_back(getAL(CurDAG, dl));
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+ Ops.push_back(Chain);
+
+ bool IsRelease = IntNo == Intrinsic::arm_stlexd;
+ unsigned NewOpc = isThumb ? (IsRelease ? ARM::t2STLEXD : ARM::t2STREXD)
+ : (IsRelease ? ARM::STLEXD : ARM::STREXD);
+
+ SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceNode(N, St);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld1: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16,
+ ARM::VLD1d32, ARM::VLD1d64 };
+ static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
+ ARM::VLD1q32, ARM::VLD1q64};
+ SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld2: {
+ static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
+ ARM::VLD2d32, ARM::VLD1q64 };
+ static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
+ ARM::VLD2q32Pseudo };
+ SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld3: {
+ static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo,
+ ARM::VLD3d16Pseudo,
+ ARM::VLD3d32Pseudo,
+ ARM::VLD1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+ ARM::VLD3q16Pseudo_UPD,
+ ARM::VLD3q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo,
+ ARM::VLD3q16oddPseudo,
+ ARM::VLD3q32oddPseudo };
+ SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld4: {
+ static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo,
+ ARM::VLD4d16Pseudo,
+ ARM::VLD4d32Pseudo,
+ ARM::VLD1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+ ARM::VLD4q16Pseudo_UPD,
+ ARM::VLD4q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo,
+ ARM::VLD4q16oddPseudo,
+ ARM::VLD4q32oddPseudo };
+ SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld2lane: {
+ static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
+ ARM::VLD2LNd16Pseudo,
+ ARM::VLD2LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo,
+ ARM::VLD2LNq32Pseudo };
+ SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld3lane: {
+ static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo,
+ ARM::VLD3LNd16Pseudo,
+ ARM::VLD3LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo,
+ ARM::VLD3LNq32Pseudo };
+ SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld4lane: {
+ static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo,
+ ARM::VLD4LNd16Pseudo,
+ ARM::VLD4LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo,
+ ARM::VLD4LNq32Pseudo };
+ SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst1: {
+ static const uint16_t DOpcodes[] = { ARM::VST1d8, ARM::VST1d16,
+ ARM::VST1d32, ARM::VST1d64 };
+ static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
+ ARM::VST1q32, ARM::VST1q64 };
+ SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst2: {
+ static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
+ ARM::VST2d32, ARM::VST1q64 };
+ static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+ ARM::VST2q32Pseudo };
+ SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst3: {
+ static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo,
+ ARM::VST3d16Pseudo,
+ ARM::VST3d32Pseudo,
+ ARM::VST1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+ ARM::VST3q16Pseudo_UPD,
+ ARM::VST3q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
+ ARM::VST3q16oddPseudo,
+ ARM::VST3q32oddPseudo };
+ SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst4: {
+ static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo,
+ ARM::VST4d16Pseudo,
+ ARM::VST4d32Pseudo,
+ ARM::VST1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+ ARM::VST4q16Pseudo_UPD,
+ ARM::VST4q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
+ ARM::VST4q16oddPseudo,
+ ARM::VST4q32oddPseudo };
+ SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst2lane: {
+ static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo,
+ ARM::VST2LNd16Pseudo,
+ ARM::VST2LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
+ ARM::VST2LNq32Pseudo };
+ SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst3lane: {
+ static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo,
+ ARM::VST3LNd16Pseudo,
+ ARM::VST3LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
+ ARM::VST3LNq32Pseudo };
+ SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst4lane: {
+ static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo,
+ ARM::VST4LNd16Pseudo,
+ ARM::VST4LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
+ ARM::VST4LNq32Pseudo };
+ SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+ return;
+ }
+ }
+ break;
+ }
+
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+
+ case Intrinsic::arm_neon_vtbl2:
+ SelectVTBL(N, false, 2, ARM::VTBL2);
+ return;
+ case Intrinsic::arm_neon_vtbl3:
+ SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+ return;
+ case Intrinsic::arm_neon_vtbl4:
+ SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+ return;
+
+ case Intrinsic::arm_neon_vtbx2:
+ SelectVTBL(N, true, 2, ARM::VTBX2);
+ return;
+ case Intrinsic::arm_neon_vtbx3:
+ SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+ return;
+ case Intrinsic::arm_neon_vtbx4:
+ SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+ return;
+ }
+ break;
+ }
+
+ case ARMISD::VTBL1: {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
+ getAL(CurDAG, dl), // Predicate
+ CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops));
+ return;
+ }
+ case ARMISD::VTBL2: {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = N->getOperand(1);
+ SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
+
+ SDValue Ops[] = {RegSeq, N->getOperand(2), getAL(CurDAG, dl), // Predicate
+ CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops));
+ return;
+ }
+
+ case ISD::CONCAT_VECTORS:
+ SelectConcatVector(N);
+ return;
+
+ case ISD::ATOMIC_CMP_SWAP:
+ SelectCMP_SWAP(N);
+ return;
+ }
+
+ SelectCode(N);
+}
+
+ // Parse a coprocessor register string of the form
+ //   cp<coprocessor>:<opc1>:c<CRn>:c<CRm>:<opc2>   (32bit) or
+ //   cp<coprocessor>:<opc1>:c<CRm>                 (64bit)
+ // and append one i32 target constant per ':'-separated field to Ops.
+ // A string that contains no ':' separator is not in this form; Ops is then
+ // left untouched, which callers use to detect "not a coprocessor string".
+ static void getIntOperandsFromRegisterString(StringRef RegString,
+                                              SelectionDAG *CurDAG,
+                                              const SDLoc &DL,
+                                              std::vector<SDValue> &Ops) {
+   SmallVector<StringRef, 5> Fields;
+   RegString.split(Fields, ':');
+
+   // A single field means there was no ':' at all: nothing to extract.
+   if (Fields.size() <= 1)
+     return;
+
+   bool AllIntFields = true;
+
+   for (StringRef Field : Fields) {
+     // Trim away the 'c'/'p' decoration (either case) and parse what is left
+     // as a decimal integer. IntField is zero-initialised so that a malformed
+     // field never feeds an indeterminate value into the DAG when the assert
+     // below is compiled out.
+     unsigned IntField = 0;
+     AllIntFields &= !Field.trim("CPcp").getAsInteger(10, IntField);
+     Ops.push_back(CurDAG->getTargetConstant(IntField, DL, MVT::i32));
+   }
+
+   assert(AllIntFields &&
+          "Unexpected non-integer value in special register string.");
+   // Only read by the assert; silence the unused-variable warning otherwise.
+   (void)AllIntFields;
+ }
+
+ // Translate a banked register name (matched case-insensitively) into the mask
+ // operand expected by the MRSbanked / MSRbanked instruction nodes. The mask
+ // identifies both the register (e.g. r8) and the mode whose copy is accessed
+ // (e.g. usr). Yields -1 when the name is not a known banked register.
+ static inline int getBankedRegisterMask(StringRef RegString) {
+   struct BankedRegEntry {
+     const char *Name;
+     int Mask;
+   };
+   static const BankedRegEntry BankedRegs[] = {
+       {"r8_usr", 0x00},   {"r9_usr", 0x01},   {"r10_usr", 0x02},
+       {"r11_usr", 0x03},  {"r12_usr", 0x04},  {"sp_usr", 0x05},
+       {"lr_usr", 0x06},   {"r8_fiq", 0x08},   {"r9_fiq", 0x09},
+       {"r10_fiq", 0x0a},  {"r11_fiq", 0x0b},  {"r12_fiq", 0x0c},
+       {"sp_fiq", 0x0d},   {"lr_fiq", 0x0e},   {"lr_irq", 0x10},
+       {"sp_irq", 0x11},   {"lr_svc", 0x12},   {"sp_svc", 0x13},
+       {"lr_abt", 0x14},   {"sp_abt", 0x15},   {"lr_und", 0x16},
+       {"sp_und", 0x17},   {"lr_mon", 0x1c},   {"sp_mon", 0x1d},
+       {"elr_hyp", 0x1e},  {"sp_hyp", 0x1f},   {"spsr_fiq", 0x2e},
+       {"spsr_irq", 0x30}, {"spsr_svc", 0x32}, {"spsr_abt", 0x34},
+       {"spsr_und", 0x36}, {"spsr_mon", 0x3c}, {"spsr_hyp", 0x3e}};
+
+   // Normalise once, then look for an exact match in the table.
+   const std::string Lowered = RegString.lower();
+   for (const BankedRegEntry &Entry : BankedRegs)
+     if (Lowered == Entry.Name)
+       return Entry.Mask;
+
+   return -1;
+ }
+
+ // Translate an M-class special register name (matched case-insensitively)
+ // into the SYSm value used as an operand of the t2MRS_M / t2MSR_M instruction
+ // nodes. Yields -1 when the name is not a known M-class special register.
+ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
+   struct SYSmEntry {
+     const char *Name;
+     int SYSm;
+   };
+   static const SYSmEntry SpecialRegs[] = {
+       {"apsr", 0x0},      {"iapsr", 0x1},        {"eapsr", 0x2},
+       {"xpsr", 0x3},      {"ipsr", 0x5},         {"epsr", 0x6},
+       {"iepsr", 0x7},     {"msp", 0x8},          {"psp", 0x9},
+       {"primask", 0x10},  {"basepri", 0x11},     {"basepri_max", 0x12},
+       {"faultmask", 0x13}, {"control", 0x14},    {"msplim", 0x0a},
+       {"psplim", 0x0b},   {"sp", 0x18}};
+
+   // Normalise once, then look for an exact match in the table.
+   const std::string Lowered = RegString.lower();
+   for (const SYSmEntry &Entry : SpecialRegs)
+     if (Lowered == Entry.Name)
+       return Entry.SYSm;
+
+   return -1;
+ }
+
+ // Decode the flags suffix shared by apsr on the A class cores and by the
+ // special registers on the M class cores. Bit 0 of the result stands for "g"
+ // and bit 1 for "nzcvq". With no suffix at all, "nzcvq" is implied and "g" is
+ // added when hasDSP is set. Yields -1 for an unrecognised suffix.
+ static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) {
+   if (Flags.empty())
+     return hasDSP ? 0x3 : 0x2;
+
+   if (Flags == "g")
+     return 0x1;
+   if (Flags == "nzcvq")
+     return 0x2;
+   if (Flags == "nzcvqg")
+     return 0x3;
+
+   return -1;
+ }
+
+ // Build the SYSm operand for an M-class MRS (IsRead) or MSR (!IsRead) from a
+ // special register name and an optional flags suffix, checking the request
+ // against the features of Subtarget. Yields -1 when the combination is not
+ // valid for this subtarget.
+ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
+                                  const ARMSubtarget *Subtarget) {
+   // The bare register name must be a known M-class special register.
+   int SYSm = getMClassRegisterSYSmValueMask(Reg);
+   if (SYSm == -1)
+     return -1;
+
+   // basepri, basepri_max and faultmask are only valid for V7m.
+   const bool NeedsV7 = SYSm >= 0x11 && SYSm <= 0x13;
+   if (NeedsV7 && !Subtarget->hasV7Ops())
+     return -1;
+
+   if (Subtarget->has8MSecExt()) {
+     // An "ns" suffix selects the non-secure alias: consume the suffix and set
+     // bit 7 of the SYSm value.
+     if (Flags.lower() == "ns") {
+       Flags = "";
+       SYSm |= 0x80;
+     }
+   } else if (SYSm == 0xa || SYSm == 0xb || SYSm > 0x14) {
+     // msplim, psplim and sp require the security extension.
+     return -1;
+   }
+
+   // These non-secure aliases additionally require v8-M mainline.
+   if (!Subtarget->hasV8MMainlineOps()) {
+     switch (SYSm) {
+     case 0x8a:
+     case 0x8b:
+     case 0x91:
+     case 0x93:
+       return -1;
+     default:
+       break;
+     }
+   }
+
+   // A read never carries flags, so the SYSm value is the complete answer.
+   if (IsRead)
+     return Flags.empty() ? SYSm : -1;
+
+   // From here on this is a write, so the flags have to be decoded too.
+   const bool HasDSP = Subtarget->hasDSP();
+   const int FlagBits = getMClassFlagsMask(Flags, HasDSP);
+
+   // Only apsr, iapsr, eapsr and xpsr (SYSm below 0x4) may carry flags.
+   const bool AcceptsFlags = SYSm < 0x4;
+   if (AcceptsFlags && FlagBits == -1)
+     return -1;
+   if (SYSm > 0x4 && !Flags.empty())
+     return -1;
+
+   // The _g and _nzcvqg forms need the DSP extension.
+   if (!HasDSP && (FlagBits & 0x1))
+     return -1;
+
+   // For the flag-carrying registers the flag bits live in bits 11-10 of the
+   // operand; every other register uses the SYSm value as is.
+   return AcceptsFlags ? (SYSm | FlagBits << 10) : SYSm;
+ }
+
+ // Build the mask operand of an A/R-class MSR from a register name ("apsr",
+ // "cpsr" or "spsr") and an optional flags suffix. Yields -1 when the register
+ // or the flags are not valid.
+ //
+ // The mask operand contains the special register (R Bit) in bit 4, whether
+ // the register is spsr (R bit is 1) or one of cpsr/apsr (R bit is 0), and
+ // bits 3-0 contains the fields to be accessed in the special register, set by
+ // the flags provided with the register.
+ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
+   if (Reg == "apsr") {
+     // The flags permitted for apsr are the same flags that are allowed in
+     // M class registers. We get the flag value and then shift the flags into
+     // the correct place to combine with the mask.
+     int ApsrMask = getMClassFlagsMask(Flags, true);
+     if (ApsrMask == -1)
+       return -1;
+     return ApsrMask << 2;
+   }
+
+   if (Reg != "cpsr" && Reg != "spsr")
+     return -1;
+
+   int Mask = 0;
+   if (Flags.empty() || Flags == "all") {
+     // This is the same as if the flags were "fc". Do not return from here:
+     // spsr still needs its R bit set below, otherwise a flagless spsr write
+     // would be encoded as a cpsr write.
+     Mask = 0x9;
+   } else {
+     // Inspect the supplied flags string and set the bits in the mask for
+     // the relevant and valid flags allowed for cpsr and spsr.
+     for (char Flag : Flags) {
+       int FlagVal;
+       switch (Flag) {
+       case 'c':
+         FlagVal = 0x1;
+         break;
+       case 'x':
+         FlagVal = 0x2;
+         break;
+       case 's':
+         FlagVal = 0x4;
+         break;
+       case 'f':
+         FlagVal = 0x8;
+         break;
+       default:
+         FlagVal = 0;
+       }
+
+       // Reject unknown flag characters, and strings where the same flag bit
+       // appears twice.
+       if (!FlagVal || (Mask & FlagVal))
+         return -1;
+       Mask |= FlagVal;
+     }
+   }
+
+   // If the register is spsr then we need to set the R bit.
+   if (Reg == "spsr")
+     Mask |= 0x10;
+
+   return Mask;
+ }
+
+ // Lower the read_register intrinsic to ARM specific DAG nodes
+ // using the supplied metadata string to select the instruction node to use
+ // and the registers/masks to construct as operands for the node.
+ //
+ // The register string is tried, in order, as:
+ //   1. a ':'-separated coprocessor field string -> MRC (5 fields) or
+ //      MRRC (3 fields),
+ //   2. a banked register name                   -> MRSbanked,
+ //   3. a VFP system register name               -> one of the VMRS opcodes,
+ //   4. on M-class targets, an M-class special register -> t2MRS_M,
+ //   5. apsr/cpsr -> MRS, spsr -> MRSsys.
+ // Returns true if N was replaced and false if the string could not be
+ // lowered for the current subtarget.
+ bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){
+   // Both values are dereferenced unconditionally below, so use cast<> (which
+   // asserts on a type mismatch) instead of an unchecked dyn_cast<>.
+   const MDNodeSDNode *MD = cast<MDNodeSDNode>(N->getOperand(1));
+   const MDString *RegString = cast<MDString>(MD->getMD()->getOperand(0));
+   bool IsThumb2 = Subtarget->isThumb2();
+   SDLoc DL(N);
+
+   std::vector<SDValue> Ops;
+   getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops);
+
+   if (!Ops.empty()) {
+     // If the special register string was constructed of fields (as defined
+     // in the ACLE) then need to lower to MRC node (32 bit) or
+     // MRRC node(64 bit), we can make the distinction based on the number of
+     // operands we have.
+     unsigned Opcode;
+     SmallVector<EVT, 3> ResTypes;
+     if (Ops.size() == 5){
+       Opcode = IsThumb2 ? ARM::t2MRC : ARM::MRC;
+       ResTypes.append({ MVT::i32, MVT::Other });
+     } else {
+       assert(Ops.size() == 3 &&
+              "Invalid number of fields in special register string.");
+       Opcode = IsThumb2 ? ARM::t2MRRC : ARM::MRRC;
+       ResTypes.append({ MVT::i32, MVT::i32, MVT::Other });
+     }
+
+     // Predicate (always), predicate register, then the incoming chain.
+     Ops.push_back(getAL(CurDAG, DL));
+     Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+     Ops.push_back(N->getOperand(0));
+     ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops));
+     return true;
+   }
+
+   // All remaining lookups are done on the lower-cased name.
+   std::string SpecialReg = RegString->getString().lower();
+
+   int BankedReg = getBankedRegisterMask(SpecialReg);
+   if (BankedReg != -1) {
+     Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32),
+             getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+             N->getOperand(0) };
+     ReplaceNode(
+         N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked,
+                                   DL, MVT::i32, MVT::Other, Ops));
+     return true;
+   }
+
+   // The VFP registers are read by creating SelectionDAG nodes with opcodes
+   // corresponding to the register that is being read from. So we switch on the
+   // string to find which opcode we need to use.
+   unsigned Opcode = StringSwitch<unsigned>(SpecialReg)
+                         .Case("fpscr", ARM::VMRS)
+                         .Case("fpexc", ARM::VMRS_FPEXC)
+                         .Case("fpsid", ARM::VMRS_FPSID)
+                         .Case("mvfr0", ARM::VMRS_MVFR0)
+                         .Case("mvfr1", ARM::VMRS_MVFR1)
+                         .Case("mvfr2", ARM::VMRS_MVFR2)
+                         .Case("fpinst", ARM::VMRS_FPINST)
+                         .Case("fpinst2", ARM::VMRS_FPINST2)
+                         .Default(0);
+
+   // If an opcode was found then we can lower the read to a VFP instruction.
+   if (Opcode) {
+     if (!Subtarget->hasVFP2())
+       return false;
+     if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8())
+       return false;
+
+     Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+             N->getOperand(0) };
+     ReplaceNode(N,
+                 CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops));
+     return true;
+   }
+
+   // If the target is M Class then need to validate that the register string
+   // is an acceptable value, so check that a mask can be constructed from the
+   // string.
+   if (Subtarget->isMClass()) {
+     // A trailing "_ns" is passed on separately as the "ns" flag.
+     StringRef Flags = "", Reg = SpecialReg;
+     if (Reg.endswith("_ns")) {
+       Flags = "ns";
+       Reg = Reg.drop_back(3);
+     }
+
+     int SYSmValue = getMClassRegisterMask(Reg, Flags, true, Subtarget);
+     if (SYSmValue == -1)
+       return false;
+
+     Ops = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
+             getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+             N->getOperand(0) };
+     ReplaceNode(
+         N, CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops));
+     return true;
+   }
+
+   // Here we know the target is not M Class so we need to check if it is one
+   // of the remaining possible values which are apsr, cpsr or spsr.
+   if (SpecialReg == "apsr" || SpecialReg == "cpsr") {
+     Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+             N->getOperand(0) };
+     ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS,
+                                           DL, MVT::i32, MVT::Other, Ops));
+     return true;
+   }
+
+   if (SpecialReg == "spsr") {
+     Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+             N->getOperand(0) };
+     ReplaceNode(
+         N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys, DL,
+                                   MVT::i32, MVT::Other, Ops));
+     return true;
+   }
+
+   return false;
+ }
+
+ // Lower the write_register intrinsic to ARM specific DAG nodes
+ // using the supplied metadata string to select the instruction node to use
+ // and the registers/masks to use in the nodes.
+ //
+ // The register string is tried, in order, as:
+ //   1. a ':'-separated coprocessor field string -> MCR (5 fields) or
+ //      MCRR (3 fields),
+ //   2. a banked register name                   -> MSRbanked,
+ //   3. a VFP system register name               -> one of the VMSR opcodes,
+ //   4. on M-class targets, an M-class special register -> t2MSR_M,
+ //   5. apsr/cpsr/spsr with optional flags       -> MSR.
+ // Returns true if N was replaced and false if the string could not be
+ // lowered for the current subtarget.
+ bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){
+   // Both values are dereferenced unconditionally below, so use cast<> (which
+   // asserts on a type mismatch) instead of an unchecked dyn_cast<>.
+   const MDNodeSDNode *MD = cast<MDNodeSDNode>(N->getOperand(1));
+   const MDString *RegString = cast<MDString>(MD->getMD()->getOperand(0));
+   bool IsThumb2 = Subtarget->isThumb2();
+   SDLoc DL(N);
+
+   std::vector<SDValue> Ops;
+   getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops);
+
+   if (!Ops.empty()) {
+     // If the special register string was constructed of fields (as defined
+     // in the ACLE) then need to lower to MCR node (32 bit) or
+     // MCRR node(64 bit), we can make the distinction based on the number of
+     // operands we have.
+     unsigned Opcode;
+     if (Ops.size() == 5) {
+       Opcode = IsThumb2 ? ARM::t2MCR : ARM::MCR;
+       // The value to write goes after the first two parsed fields.
+       Ops.insert(Ops.begin()+2, N->getOperand(2));
+     } else {
+       assert(Ops.size() == 3 &&
+              "Invalid number of fields in special register string.");
+       Opcode = IsThumb2 ? ARM::t2MCRR : ARM::MCRR;
+       // The two values to write go after the first two parsed fields.
+       SDValue WriteValue[] = { N->getOperand(2), N->getOperand(3) };
+       Ops.insert(Ops.begin()+2, WriteValue, WriteValue+2);
+     }
+
+     // Predicate (always), predicate register, then the incoming chain.
+     Ops.push_back(getAL(CurDAG, DL));
+     Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+     Ops.push_back(N->getOperand(0));
+
+     ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+     return true;
+   }
+
+   // All remaining lookups are done on the lower-cased name.
+   std::string SpecialReg = RegString->getString().lower();
+   int BankedReg = getBankedRegisterMask(SpecialReg);
+   if (BankedReg != -1) {
+     Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), N->getOperand(2),
+             getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+             N->getOperand(0) };
+     ReplaceNode(
+         N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked,
+                                   DL, MVT::Other, Ops));
+     return true;
+   }
+
+   // The VFP registers are written to by creating SelectionDAG nodes with
+   // opcodes corresponding to the register that is being written. So we switch
+   // on the string to find which opcode we need to use.
+   unsigned Opcode = StringSwitch<unsigned>(SpecialReg)
+                         .Case("fpscr", ARM::VMSR)
+                         .Case("fpexc", ARM::VMSR_FPEXC)
+                         .Case("fpsid", ARM::VMSR_FPSID)
+                         .Case("fpinst", ARM::VMSR_FPINST)
+                         .Case("fpinst2", ARM::VMSR_FPINST2)
+                         .Default(0);
+
+   if (Opcode) {
+     if (!Subtarget->hasVFP2())
+       return false;
+     Ops = { N->getOperand(2), getAL(CurDAG, DL),
+             CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
+     ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+     return true;
+   }
+
+   // Split "<reg>_<flags>" at the last underscore.
+   std::pair<StringRef, StringRef> Fields;
+   Fields = StringRef(SpecialReg).rsplit('_');
+   std::string Reg = Fields.first.str();
+   StringRef Flags = Fields.second;
+
+   // If the target was M Class then need to validate the special register value
+   // and retrieve the mask for use in the instruction node.
+   if (Subtarget->isMClass()) {
+     // basepri_max gets split so need to correct Reg and Flags.
+     if (SpecialReg == "basepri_max") {
+       Reg = SpecialReg;
+       Flags = "";
+     }
+     int SYSmValue = getMClassRegisterMask(Reg, Flags, false, Subtarget);
+     if (SYSmValue == -1)
+       return false;
+
+     Ops = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
+             N->getOperand(2), getAL(CurDAG, DL),
+             CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
+     ReplaceNode(N, CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops));
+     return true;
+   }
+
+   // We then check to see if a valid mask can be constructed for one of the
+   // register string values permitted for the A and R class cores. These values
+   // are apsr, spsr and cpsr; these are also valid on older cores.
+   int Mask = getARClassRegisterMask(Reg, Flags);
+   if (Mask != -1) {
+     Ops = { CurDAG->getTargetConstant(Mask, DL, MVT::i32), N->getOperand(2),
+             getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+             N->getOperand(0) };
+     ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
+                                           DL, MVT::Other, Ops));
+     return true;
+   }
+
+   return false;
+ }
+
+bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
+ std::vector<SDValue> AsmNodeOperands;
+ unsigned Flag, Kind;
+ bool Changed = false;
+ unsigned NumOps = N->getNumOperands();
+
+ // Normally, i64 data is bound to two arbitrary GPRs for "%r" constraint.
+ // However, some instructions (e.g. ldrexd/strexd in ARM mode) require
+ // (even/even+1) GPRs and use %n and %Hn to refer to the individual regs
+ // respectively. Since there is no constraint to explicitly specify a
+ // reg pair, we use GPRPair reg class for "%r" for 64-bit data. For Thumb,
+ // the 64-bit data may be referred by H, Q, R modifiers, so we still pack
+ // them into a GPRPair.
+
+ SDLoc dl(N);
+ SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
+ : SDValue(nullptr,0);
+
+ SmallVector<bool, 8> OpChanged;
+ // Glue node will be appended late.
+ for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) {
+ SDValue op = N->getOperand(i);
+ AsmNodeOperands.push_back(op);
+
+ if (i < InlineAsm::Op_FirstOperand)
+ continue;
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
+ Flag = C->getZExtValue();
+ Kind = InlineAsm::getKind(Flag);
+ }
+ else
+ continue;
+
+ // Immediate operands to inline asm in the SelectionDAG are modeled with
+ // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+ // the second is a constant with the value of the immediate. If we get here
+ // and we have a Kind_Imm, skip the next operand, and continue.
+ if (Kind == InlineAsm::Kind_Imm) {
+ SDValue op = N->getOperand(++i);
+ AsmNodeOperands.push_back(op);
+ continue;
+ }
+
+ unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+ if (NumRegs)
+ OpChanged.push_back(false);
+
+ unsigned DefIdx = 0;
+ bool IsTiedToChangedOp = false;
+ // If it's a use that is tied with a previous def, it has no
+ // reg class constraint.
+ if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
+ IsTiedToChangedOp = OpChanged[DefIdx];
+
+ // Memory operands to inline asm in the SelectionDAG are modeled with two
+ // operands: a constant of value InlineAsm::Kind_Mem followed by the input
+ // operand. If we get here and we have a Kind_Mem, skip the next operand (so
+ // it doesn't get misinterpreted), and continue. We do this here because
+ // it's important to update the OpChanged array correctly before moving on.
+ if (Kind == InlineAsm::Kind_Mem) {
+ SDValue op = N->getOperand(++i);
+ AsmNodeOperands.push_back(op);
+ continue;
+ }
+
+ if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
+ && Kind != InlineAsm::Kind_RegDefEarlyClobber)
+ continue;
+
+ unsigned RC;
+ bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
+ if ((!IsTiedToChangedOp && (!HasRC || RC != ARM::GPRRegClassID))
+ || NumRegs != 2)
+ continue;
+
+ assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
+ SDValue V0 = N->getOperand(i+1);
+ SDValue V1 = N->getOperand(i+2);
+ unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
+ unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
+ SDValue PairedReg;
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ if (Kind == InlineAsm::Kind_RegDef ||
+ Kind == InlineAsm::Kind_RegDefEarlyClobber) {
+ // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
+ // the original GPRs.
+
+ unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
+ SDValue Chain = SDValue(N,0);
+
+ SDNode *GU = N->getGluedUser();
+ SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::Untyped,
+ Chain.getValue(1));
+
+ // Extract values from a GPRPair reg and copy to the original GPR reg.
+ SDValue Sub0 = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32,
+ RegCopy);
+ SDValue Sub1 = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32,
+ RegCopy);
+ SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0,
+ RegCopy.getValue(1));
+ SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1));
+
+ // Update the original glue user.
+ std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
+ Ops.push_back(T1.getValue(1));
+ CurDAG->UpdateNodeOperands(GU, Ops);
+ }
+ else {
+ // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a
+ // GPRPair and then pass the GPRPair to the inline asm.
+ SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain];
+
+ // As REG_SEQ doesn't take RegisterSDNode, we copy them first.
+ SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32,
+ Chain.getValue(1));
+ SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32,
+ T0.getValue(1));
+ SDValue Pair = SDValue(createGPRPairNode(MVT::Untyped, T0, T1), 0);
+
+ // Copy REG_SEQ into a GPRPair-typed VR and replace the original two
+ // i32 VRs of inline asm with it.
+ unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
+ Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
+
+ AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+ Glue = Chain.getValue(1);
+ }
+
+ Changed = true;
+
+ if(PairedReg.getNode()) {
+ OpChanged[OpChanged.size() -1 ] = true;
+ Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
+ if (IsTiedToChangedOp)
+ Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
+ else
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
+ // Replace the current flag.
+ AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
+ Flag, dl, MVT::i32);
+ // Add the new register node and skip the original two GPRs.
+ AsmNodeOperands.push_back(PairedReg);
+ // Skip the next two GPRs.
+ i += 2;
+ }
+ }
+
+ if (Glue.getNode())
+ AsmNodeOperands.push_back(Glue);
+ if (!Changed)
+ return false;
+
+ SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
+ CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
+ New->setNodeId(-1);
+ ReplaceNode(N, New.getNode());
+ return true;
+}
+
+
+/// Select the address operand for an inline-asm memory constraint.
+///
+/// Every constraint handled here is treated the same way: \p Op is appended
+/// to \p OutOps unchanged and false is returned. Any other constraint ID is a
+/// programming error and hits llvm_unreachable.
+bool ARMDAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+  switch (ConstraintID) {
+  case InlineAsm::Constraint_i:
+    // FIXME: It seems strange that 'i' is needed here since it's supposed to
+    //        be an immediate and not a memory constraint.
+    LLVM_FALLTHROUGH;
+  case InlineAsm::Constraint_m:
+  case InlineAsm::Constraint_o:
+  case InlineAsm::Constraint_Q:
+  case InlineAsm::Constraint_Um:
+  case InlineAsm::Constraint_Un:
+  case InlineAsm::Constraint_Uq:
+  case InlineAsm::Constraint_Us:
+  case InlineAsm::Constraint_Ut:
+  case InlineAsm::Constraint_Uv:
+  case InlineAsm::Constraint_Uy:
+    // Accepted constraint: fall out of the switch to the shared handling.
+    break;
+  default:
+    llvm_unreachable("Unexpected asm memory constraint");
+  }
+
+  // Require the address to be in a register. That is safe for all ARM
+  // variants and it is hard to do anything much smarter without knowing
+  // how the operand is used.
+  OutOps.push_back(Op);
+  return false;
+}
+
+/// Factory for the ARM instruction-selection pass, which converts a legalized
+/// DAG into an ARM-specific DAG, ready for instruction scheduling.
+///
+/// \returns a newly allocated ARMDAGToDAGISel built from \p TM and
+/// \p OptLevel.
+FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel) {
+  FunctionPass *ISelPass = new ARMDAGToDAGISel(TM, OptLevel);
+  return ISelPass;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
new file mode 100644
index 000000000000..afba1587a743
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -0,0 +1,13504 @@
+//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMISelLowering.h"
+#include "ARMCallingConv.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMPerfectShuffle.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
+STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
+STATISTIC(NumConstpoolPromoted,
+ "Number of constants with their storage promoted into constant pools");
+ // Hidden command-line options for this file; cl::init(...) gives each default.
+static cl::opt<bool>
+ARMInterworking("arm-interworking", cl::Hidden,
+ cl::desc("Enable / disable ARM interworking (for debugging only)"),
+ cl::init(true)); // Default: enabled.
+
+static cl::opt<bool> EnableConstpoolPromotion(
+ "arm-promote-constant", cl::Hidden,
+ cl::desc("Enable / disable promotion of unnamed_addr constants into "
+ "constant pools"),
+ cl::init(true)); // Default: enabled.
+static cl::opt<unsigned> ConstpoolPromotionMaxSize(
+ "arm-promote-constant-max-size", cl::Hidden,
+ cl::desc("Maximum size of constant to promote into a constant pool"),
+ cl::init(64)); // Default limit: 64 (units not stated here; presumably bytes - confirm).
+static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
+ "arm-promote-constant-max-total", cl::Hidden,
+ cl::desc("Maximum size of ALL constants to promote into a constant pool"),
+ cl::init(128)); // Default limit: 128 (same unit caveat as the per-constant limit).
+
+namespace {
+  /// CCState variant whose users must say whether calling-convention analysis
+  /// is being done for a call site or for a function prologue.
+  class ARMCCState : public CCState {
+  public:
+    /// Forwards \p CC, \p isVarArg, \p MF, \p locs and \p C to CCState and
+    /// records \p PC in CallOrPrologue. \p PC must be either Call or
+    /// Prologue; anything else trips the assertion.
+    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+               SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+               ParmContext PC)
+        : CCState(CC, isVarArg, MF, locs, C) {
+      // The two adjacent literals are concatenated, so the first one must end
+      // in a space for the message to read "... is call or prologue ...".
+      assert(((PC == Call) || (PC == Prologue)) &&
+             "ARMCCState users must specify whether their context is call "
+             "or prologue generation.");
+      CallOrPrologue = PC;
+    }
+  };
+}
+
+/// Assign the ARM_AAPCS calling convention to every runtime library call
+/// listed in the table below.
+void ARMTargetLowering::InitLibcallCallingConvs() {
+  // The builtins on ARM always use AAPCS, irrespective of whether C is AAPCS
+  // or AAPCS_VFP.
+  static const RTLIB::Libcall AAPCSLibcalls[] = {
+    // Integer shifts.
+    RTLIB::SHL_I16, RTLIB::SHL_I32, RTLIB::SHL_I64, RTLIB::SHL_I128,
+    RTLIB::SRL_I16, RTLIB::SRL_I32, RTLIB::SRL_I64, RTLIB::SRL_I128,
+    RTLIB::SRA_I16, RTLIB::SRA_I32, RTLIB::SRA_I64, RTLIB::SRA_I128,
+    // Integer multiplication.
+    RTLIB::MUL_I8, RTLIB::MUL_I16, RTLIB::MUL_I32, RTLIB::MUL_I64,
+    RTLIB::MUL_I128,
+    RTLIB::MULO_I32, RTLIB::MULO_I64, RTLIB::MULO_I128,
+    // Integer division and remainder.
+    RTLIB::SDIV_I8, RTLIB::SDIV_I16, RTLIB::SDIV_I32, RTLIB::SDIV_I64,
+    RTLIB::SDIV_I128,
+    RTLIB::UDIV_I8, RTLIB::UDIV_I16, RTLIB::UDIV_I32, RTLIB::UDIV_I64,
+    RTLIB::UDIV_I128,
+    RTLIB::SREM_I8, RTLIB::SREM_I16, RTLIB::SREM_I32, RTLIB::SREM_I64,
+    RTLIB::SREM_I128,
+    RTLIB::UREM_I8, RTLIB::UREM_I16, RTLIB::UREM_I32, RTLIB::UREM_I64,
+    RTLIB::UREM_I128,
+    RTLIB::SDIVREM_I8, RTLIB::SDIVREM_I16, RTLIB::SDIVREM_I32,
+    RTLIB::SDIVREM_I64, RTLIB::SDIVREM_I128,
+    RTLIB::UDIVREM_I8, RTLIB::UDIVREM_I16, RTLIB::UDIVREM_I32,
+    RTLIB::UDIVREM_I64, RTLIB::UDIVREM_I128,
+    // Integer negation.
+    RTLIB::NEG_I32, RTLIB::NEG_I64,
+    // Floating-point arithmetic.
+    RTLIB::ADD_F32, RTLIB::ADD_F64, RTLIB::ADD_F80, RTLIB::ADD_F128,
+    RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80, RTLIB::SUB_F128,
+    RTLIB::MUL_F32, RTLIB::MUL_F64, RTLIB::MUL_F80, RTLIB::MUL_F128,
+    RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80, RTLIB::DIV_F128,
+    RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80, RTLIB::POWI_F128,
+    // Conversions between floating-point types.
+    RTLIB::FPEXT_F64_F128, RTLIB::FPEXT_F32_F128, RTLIB::FPEXT_F32_F64,
+    RTLIB::FPEXT_F16_F32,
+    RTLIB::FPROUND_F32_F16, RTLIB::FPROUND_F64_F16, RTLIB::FPROUND_F80_F16,
+    RTLIB::FPROUND_F128_F16,
+    RTLIB::FPROUND_F64_F32, RTLIB::FPROUND_F80_F32, RTLIB::FPROUND_F128_F32,
+    RTLIB::FPROUND_F80_F64, RTLIB::FPROUND_F128_F64,
+    // Floating-point to signed integer.
+    RTLIB::FPTOSINT_F32_I32, RTLIB::FPTOSINT_F32_I64,
+    RTLIB::FPTOSINT_F32_I128,
+    RTLIB::FPTOSINT_F64_I32, RTLIB::FPTOSINT_F64_I64,
+    RTLIB::FPTOSINT_F64_I128,
+    RTLIB::FPTOSINT_F80_I32, RTLIB::FPTOSINT_F80_I64,
+    RTLIB::FPTOSINT_F80_I128,
+    RTLIB::FPTOSINT_F128_I32, RTLIB::FPTOSINT_F128_I64,
+    RTLIB::FPTOSINT_F128_I128,
+    // Floating-point to unsigned integer.
+    RTLIB::FPTOUINT_F32_I32, RTLIB::FPTOUINT_F32_I64,
+    RTLIB::FPTOUINT_F32_I128,
+    RTLIB::FPTOUINT_F64_I32, RTLIB::FPTOUINT_F64_I64,
+    RTLIB::FPTOUINT_F64_I128,
+    RTLIB::FPTOUINT_F80_I32, RTLIB::FPTOUINT_F80_I64,
+    RTLIB::FPTOUINT_F80_I128,
+    RTLIB::FPTOUINT_F128_I32, RTLIB::FPTOUINT_F128_I64,
+    RTLIB::FPTOUINT_F128_I128,
+    // Signed integer to floating-point.
+    RTLIB::SINTTOFP_I32_F32, RTLIB::SINTTOFP_I32_F64,
+    RTLIB::SINTTOFP_I32_F80, RTLIB::SINTTOFP_I32_F128,
+    RTLIB::SINTTOFP_I64_F32, RTLIB::SINTTOFP_I64_F64,
+    RTLIB::SINTTOFP_I64_F80, RTLIB::SINTTOFP_I64_F128,
+    RTLIB::SINTTOFP_I128_F32, RTLIB::SINTTOFP_I128_F64,
+    RTLIB::SINTTOFP_I128_F80, RTLIB::SINTTOFP_I128_F128,
+    // Unsigned integer to floating-point.
+    RTLIB::UINTTOFP_I32_F32, RTLIB::UINTTOFP_I32_F64,
+    RTLIB::UINTTOFP_I32_F80, RTLIB::UINTTOFP_I32_F128,
+    RTLIB::UINTTOFP_I64_F32, RTLIB::UINTTOFP_I64_F64,
+    RTLIB::UINTTOFP_I64_F80, RTLIB::UINTTOFP_I64_F128,
+    RTLIB::UINTTOFP_I128_F32, RTLIB::UINTTOFP_I128_F64,
+    RTLIB::UINTTOFP_I128_F80, RTLIB::UINTTOFP_I128_F128,
+    // Floating-point comparisons.
+    RTLIB::OEQ_F32, RTLIB::OEQ_F64, RTLIB::OEQ_F128,
+    RTLIB::UNE_F32, RTLIB::UNE_F64, RTLIB::UNE_F128,
+    RTLIB::OGE_F32, RTLIB::OGE_F64, RTLIB::OGE_F128,
+    RTLIB::OLT_F32, RTLIB::OLT_F64, RTLIB::OLT_F128,
+    RTLIB::OLE_F32, RTLIB::OLE_F64, RTLIB::OLE_F128,
+    RTLIB::OGT_F32, RTLIB::OGT_F64, RTLIB::OGT_F128,
+    RTLIB::UO_F32, RTLIB::UO_F64, RTLIB::UO_F128,
+    RTLIB::O_F32, RTLIB::O_F64, RTLIB::O_F128,
+  };
+
+  for (const RTLIB::Libcall Call : AAPCSLibcalls)
+    setLibcallCallingConv(Call, CallingConv::ARM_AAPCS);
+}
+
+// The APCS parameter registers: the four core registers R0-R3, in ascending order.
+static const MCPhysReg GPRArgRegs[] = {
+ ARM::R0, ARM::R1, ARM::R2, ARM::R3
+};
+
+/// Configure the legalization actions for the NEON vector type \p VT.
+///
+/// \p PromotedLdStVT is the type loads and stores of \p VT are promoted to,
+/// and \p PromotedBitwiseVT the type AND/OR/XOR on an integer \p VT are
+/// promoted to. No promotion is set up when \p VT already equals the
+/// respective target type.
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+                                       MVT PromotedBitwiseVT) {
+  if (VT != PromotedLdStVT) {
+    for (unsigned MemOpc : {ISD::LOAD, ISD::STORE}) {
+      setOperationAction(MemOpc, VT, Promote);
+      AddPromotedToType(MemOpc, VT, PromotedLdStVT);
+    }
+  }
+
+  MVT EltVT = VT.getVectorElementType();
+  if (EltVT != MVT::f64)
+    setOperationAction(ISD::SETCC, VT, Custom);
+  for (unsigned EltOpc : {ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT})
+    setOperationAction(EltOpc, VT, Custom);
+
+  // Int <-> FP conversions are custom lowered for i32 elements only; every
+  // other element type gets them expanded.
+  const auto IntFPConvAction = (EltVT == MVT::i32) ? Custom : Expand;
+  for (unsigned ConvOpc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT,
+                           ISD::FP_TO_UINT})
+    setOperationAction(ConvOpc, VT, IntFPConvAction);
+
+  for (unsigned BuildOpc : {ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE})
+    setOperationAction(BuildOpc, VT, Custom);
+  for (unsigned SubVecOpc : {ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR})
+    setOperationAction(SubVecOpc, VT, Legal);
+  for (unsigned ExpandedOpc : {ISD::SELECT, ISD::SELECT_CC, ISD::VSELECT,
+                               ISD::SIGN_EXTEND_INREG})
+    setOperationAction(ExpandedOpc, VT, Expand);
+
+  const bool IsIntVT = VT.isInteger();
+  if (IsIntVT)
+    for (unsigned ShiftOpc : {ISD::SHL, ISD::SRA, ISD::SRL})
+      setOperationAction(ShiftOpc, VT, Custom);
+
+  // Promote all bit-wise operations.
+  if (IsIntVT && VT != PromotedBitwiseVT) {
+    for (unsigned BitOpc : {ISD::AND, ISD::OR, ISD::XOR}) {
+      setOperationAction(BitOpc, VT, Promote);
+      AddPromotedToType(BitOpc, VT, PromotedBitwiseVT);
+    }
+  }
+
+  // Neon does not support vector divide/remainder operations.
+  for (unsigned DivRemOpc : {ISD::SDIV, ISD::UDIV, ISD::FDIV, ISD::SREM,
+                             ISD::UREM, ISD::FREM})
+    setOperationAction(DivRemOpc, VT, Expand);
+
+  // Integer min/max is legal for every non-FP type except the 64-bit-element
+  // vectors.
+  if (VT.isFloatingPoint() || VT == MVT::v2i64 || VT == MVT::v1i64)
+    return;
+  for (unsigned MinMaxOpc : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+    setOperationAction(MinMaxOpc, VT, Legal);
+}
+
+/// Register \p VT in the DPR register class and apply the common NEON
+/// setup, promoting loads/stores to f64 and bitwise operations to v2i32.
+void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+  const MVT LdStVT = MVT::f64;
+  const MVT BitwiseVT = MVT::v2i32;
+
+  addRegisterClass(VT, &ARM::DPRRegClass);
+  addTypeForNEON(VT, LdStVT, BitwiseVT);
+}
+
+/// Register \p VT in the DPair register class and apply the common NEON
+/// setup, promoting loads/stores to v2f64 and bitwise operations to v4i32.
+void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+  const MVT LdStVT = MVT::v2f64;
+  const MVT BitwiseVT = MVT::v4i32;
+
+  addRegisterClass(VT, &ARM::DPairRegClass);
+  addTypeForNEON(VT, LdStVT, BitwiseVT);
+}
+
+ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
+ const ARMSubtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
+ RegInfo = Subtarget->getRegisterInfo();
+ Itins = Subtarget->getInstrItineraryData();
+
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ InitLibcallCallingConvs();
+
+ if (Subtarget->isTargetMachO()) {
+ // Uses VFP for Thumb libfuncs if available.
+ if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
+ Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const ISD::CondCode Cond;
+ } LibraryCalls[] = {
+ // Single-precision floating-point arithmetic.
+ { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
+
+ // Double-precision floating-point arithmetic.
+ { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
+
+ // Single-precision comparisons.
+ { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
+ { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
+ { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
+ { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
+ { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
+ { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
+ { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
+ { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },
+
+ // Double-precision comparisons.
+ { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
+ { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
+ { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
+ { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
+ { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
+ { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
+ { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
+ { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },
+
+ // Floating-point to integer conversions.
+ // i64 conversions are done via library routines even when generating VFP
+ // instructions, so use the same ones.
+ { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
+
+ // Conversions between floating types.
+ { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
+ { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
+
+ // Integer to floating-point conversions.
+ // i64 conversions are done via library routines even when generating VFP
+ // instructions, so use the same ones.
+ // FIXME: There appears to be some naming inconsistency in ARM libgcc:
+ // e.g., __floatunsidf vs. __floatunssidfvfp.
+ { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ if (LC.Cond != ISD::SETCC_INVALID)
+ setCmpLibcallCC(LC.Op, LC.Cond);
+ }
+ }
+
+ // Set the correct calling convention for ARMv7k WatchOS. It's just
+ // AAPCS_VFP for functions as simple as libcalls.
+ if (Subtarget->isTargetWatchABI()) {
+ for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
+ setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
+ }
+ }
+
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+
+ // RTLIB
+ if (Subtarget->isAAPCS_ABI() &&
+ (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
+ Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const CallingConv::ID CC;
+ const ISD::CondCode Cond;
+ } LibraryCalls[] = {
+ // Double-precision floating-point arithmetic helper functions
+ // RTABI chapter 4.1.2, Table 2
+ { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+ // Double-precision floating-point comparison helper functions
+ // RTABI chapter 4.1.2, Table 3
+ { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
+ { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
+
+ // Single-precision floating-point arithmetic helper functions
+ // RTABI chapter 4.1.2, Table 4
+ { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+ // Single-precision floating-point comparison helper functions
+ // RTABI chapter 4.1.2, Table 5
+ { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
+ { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
+
+ // Floating-point to integer conversions.
+ // RTABI chapter 4.1.2, Table 6
+ { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+ // Conversions between floating types.
+ // RTABI chapter 4.1.2, Table 7
+ { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+ // Integer to floating-point conversions.
+ // RTABI chapter 4.1.2, Table 8
+ { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+ // Long long helper functions
+ // RTABI chapter 4.2, Table 9
+ { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+ // Integer division functions
+ // RTABI chapter 4.3.1
+ { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ if (LC.Cond != ISD::SETCC_INVALID)
+ setCmpLibcallCC(LC.Op, LC.Cond);
+ }
+
+ // EABI dependent RTLIB
+ if (TM.Options.EABIVersion == EABI::EABI4 ||
+ TM.Options.EABIVersion == EABI::EABI5) {
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char *const Name;
+ const CallingConv::ID CC;
+ const ISD::CondCode Cond;
+ } MemOpsLibraryCalls[] = {
+ // Memory operations
+ // RTABI chapter 4.3.4
+ { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ };
+
+ for (const auto &LC : MemOpsLibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ if (LC.Cond != ISD::SETCC_INVALID)
+ setCmpLibcallCC(LC.Op, LC.Cond);
+ }
+ }
+ }
+
+ if (Subtarget->isTargetWindows()) {
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const CallingConv::ID CC;
+ } LibraryCalls[] = {
+ { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ }
+ }
+
+ // Use divmod compiler-rt calls for iOS 5.0 and later.
+ if (Subtarget->isTargetWatchOS() ||
+ (Subtarget->isTargetIOS() &&
+ !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
+ setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+ setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+ }
+
+ // The half <-> float conversion functions are always soft-float on
+ // non-watchos platforms, but are needed for some targets which use a
+ // hard-float calling convention by default.
+ if (!Subtarget->isTargetWatchABI()) {
+ if (Subtarget->isAAPCS_ABI()) {
+ setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
+ } else {
+ setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
+ setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
+ setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
+ }
+ }
+
+ // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
+ // a __gnu_ prefix (which is the default).
+ if (Subtarget->isTargetAEABI()) {
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h");
+ setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f");
+ }
+
+ if (Subtarget->isThumb1Only())
+ addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
+ else
+ addRegisterClass(MVT::i32, &ARM::GPRRegClass);
+ if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+ !Subtarget->isThumb1Only()) {
+ addRegisterClass(MVT::f32, &ARM::SPRRegClass);
+ addRegisterClass(MVT::f64, &ARM::DPRRegClass);
+ }
+
+ for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(VT, InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ }
+
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ }
+
+ setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
+
+ setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
+ setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
+
+ if (Subtarget->hasNEON()) {
+ addDRTypeForNEON(MVT::v2f32);
+ addDRTypeForNEON(MVT::v8i8);
+ addDRTypeForNEON(MVT::v4i16);
+ addDRTypeForNEON(MVT::v2i32);
+ addDRTypeForNEON(MVT::v1i64);
+
+ addQRTypeForNEON(MVT::v4f32);
+ addQRTypeForNEON(MVT::v2f64);
+ addQRTypeForNEON(MVT::v16i8);
+ addQRTypeForNEON(MVT::v8i16);
+ addQRTypeForNEON(MVT::v4i32);
+ addQRTypeForNEON(MVT::v2i64);
+
+ // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
+ // neither Neon nor VFP support any arithmetic operations on it.
+ // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
+ // supported for v4f32.
+ setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+ // FIXME: Code duplication: FDIV and FREM are expanded always, see
+ // ARMTargetLowering::addTypeForNEON method for details.
+ setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
+ setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+ // FIXME: Create unittest.
+ // In other words, find a case where "copysign" appears in the DAG with
+ // vector operands.
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
+ // FIXME: Code duplication: SETCC has custom operation action, see
+ // ARMTargetLowering::addTypeForNEON method for details.
+ setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
+ // FIXME: Create unittest for FNEG and for FABS.
+ setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
+ setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
+ setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
+ setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
+ // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
+ setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+ setOperationAction(ISD::FMA, MVT::v2f64, Expand);
+
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
+ setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
+ setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
+ setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
+
+ // Mark v2f32 intrinsics.
+ setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
+ setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
+ setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
+ setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
+
+ // Neon does not support some operations on v1i64 and v2i64 types.
+ setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+ // Custom handling for some quad-vector types to detect VMULL.
+ setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ // Custom handling for some vector types to avoid expensive expansions
+ setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
+ setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
+ setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
+ setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
+ // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
+ // a destination type that is wider than the source, and nor does
+ // it have a FP_TO_[SU]INT instruction with a narrower destination than
+ // source.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
+
+ // NEON does not have single instruction CTPOP for vectors with element
+ // types wider than 8-bits. However, custom lowering can leverage the
+ // v8i8/v16i8 vcnt instruction.
+ setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+
+ // NEON does not have single instruction CTTZ for vectors.
+ setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+
+ // NEON only has FMA instructions as of VFP4.
+ if (!Subtarget->hasVFP4()) {
+ setOperationAction(ISD::FMA, MVT::v2f32, Expand);
+ setOperationAction(ISD::FMA, MVT::v4f32, Expand);
+ }
+
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
+ setTargetDAGCombine(ISD::FP_TO_UINT);
+ setTargetDAGCombine(ISD::FDIV);
+ setTargetDAGCombine(ISD::LOAD);
+
+ // It is legal to extload from v4i8 to v4i16 or v4i32.
+ for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
+ MVT::v2i32}) {
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
+ }
+ }
+ }
+
+ // ARM and Thumb2 support UMLAL/SMLAL.
+ if (!Subtarget->isThumb1Only())
+ setTargetDAGCombine(ISD::ADDC);
+
+ if (Subtarget->isFPOnlySP()) {
+ // When targeting a floating-point unit with only single-precision
+ // operations, f64 is legal for the few double-precision instructions which
+ // are present. However, no double-precision operations other than moves,
+ // loads and stores are provided by the hardware.
+ setOperationAction(ISD::FADD, MVT::f64, Expand);
+ setOperationAction(ISD::FSUB, MVT::f64, Expand);
+ setOperationAction(ISD::FMUL, MVT::f64, Expand);
+ setOperationAction(ISD::FMA, MVT::f64, Expand);
+ setOperationAction(ISD::FDIV, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FNEG, MVT::f64, Expand);
+ setOperationAction(ISD::FABS, MVT::f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+ setOperationAction(ISD::FLOG, MVT::f64, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f64, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f64, Expand);
+ setOperationAction(ISD::FEXP, MVT::f64, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f64, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f64, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
+ setOperationAction(ISD::FRINT, MVT::f64, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+ }
+
+ computeRegisterProperties(Subtarget->getRegisterInfo());
+
+ // ARM does not have floating-point extending loads.
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ }
+
+ // ... or truncating stores
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+
+ // ARM does not have i1 sign extending load.
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+ // ARM supports all 4 flavors of integer indexed load / store.
+ if (!Subtarget->isThumb1Only()) {
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, MVT::i1, Legal);
+ setIndexedLoadAction(im, MVT::i8, Legal);
+ setIndexedLoadAction(im, MVT::i16, Legal);
+ setIndexedLoadAction(im, MVT::i32, Legal);
+ setIndexedStoreAction(im, MVT::i1, Legal);
+ setIndexedStoreAction(im, MVT::i8, Legal);
+ setIndexedStoreAction(im, MVT::i16, Legal);
+ setIndexedStoreAction(im, MVT::i32, Legal);
+ }
+ } else {
+ // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
+ setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
+ setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
+ }
+
+ setOperationAction(ISD::SADDO, MVT::i32, Custom);
+ setOperationAction(ISD::UADDO, MVT::i32, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+ setOperationAction(ISD::USUBO, MVT::i32, Custom);
+
+ // i64 operation support.
+ setOperationAction(ISD::MUL, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ if (Subtarget->isThumb1Only()) {
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ }
+ if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
+ || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL, MVT::i64, Custom);
+ setOperationAction(ISD::SRA, MVT::i64, Custom);
+
+ if (!Subtarget->isThumb1Only()) {
+ // FIXME: We should do this for Thumb1 as well.
+ setOperationAction(ISD::ADDC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBC, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ }
+
+ if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+
+ // ARM does not have ROTL.
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ }
+ setOperationAction(ISD::CTTZ, MVT::i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
+ setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+
+ // @llvm.readcyclecounter requires the Performance Monitors extension.
+ // Default to the 0 expansion on unsupported platforms.
+ // FIXME: Technically there are older ARM CPUs that have
+ // implementation-specific ways of obtaining this information.
+ if (Subtarget->hasPerfMon())
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
+
+ // Only ARMv6 and later have BSWAP.
+ if (!Subtarget->hasV6Ops())
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+ bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
+ : Subtarget->hasDivideInARMMode();
+ if (!hasDivide) {
+ // These are expanded into libcalls if the cpu doesn't have HW divider.
+ setOperationAction(ISD::SDIV, MVT::i32, LibCall);
+ setOperationAction(ISD::UDIV, MVT::i32, LibCall);
+ }
+
+ if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
+ setOperationAction(ISD::SDIV, MVT::i32, Custom);
+ setOperationAction(ISD::UDIV, MVT::i32, Custom);
+
+ setOperationAction(ISD::SDIV, MVT::i64, Custom);
+ setOperationAction(ISD::UDIV, MVT::i64, Custom);
+ }
+
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ // Register based DivRem for AEABI (RTABI 4.2)
+ if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
+ Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
+ Subtarget->isTargetWindows()) {
+ setOperationAction(ISD::SREM, MVT::i64, Custom);
+ setOperationAction(ISD::UREM, MVT::i64, Custom);
+ HasStandaloneRem = false;
+
+ for (const auto &LC :
+ {RTLIB::SDIVREM_I8, RTLIB::SDIVREM_I16, RTLIB::SDIVREM_I32})
+ setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_sdiv"
+ : "__aeabi_idivmod");
+ setLibcallName(RTLIB::SDIVREM_I64, Subtarget->isTargetWindows()
+ ? "__rt_sdiv64"
+ : "__aeabi_ldivmod");
+ for (const auto &LC :
+ {RTLIB::UDIVREM_I8, RTLIB::UDIVREM_I16, RTLIB::UDIVREM_I32})
+ setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_udiv"
+ : "__aeabi_uidivmod");
+ setLibcallName(RTLIB::UDIVREM_I64, Subtarget->isTargetWindows()
+ ? "__rt_udiv64"
+ : "__aeabi_uldivmod");
+
+ setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
+
+ setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
+ } else {
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ }
+
+ if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
+ for (auto &VT : {MVT::f32, MVT::f64})
+ setOperationAction(ISD::FPOWI, VT, Custom);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // Use the default implementation.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+ if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+ // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
+ // the default expansion.
+ InsertFencesForAtomic = false;
+ if (Subtarget->hasAnyDataBarrier() &&
+ (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
+ // ATOMIC_FENCE needs custom lowering; the others should have been expanded
+ // to ldrex/strex loops already.
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ if (!Subtarget->isThumb() || !Subtarget->isMClass())
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+ // On v8, we have particularly efficient implementations of atomic fences
+ // if they can be combined with nearby atomic loads and stores.
+ if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
+ // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
+ InsertFencesForAtomic = true;
+ }
+ } else {
+ // If there's anything we can use as a barrier, go through custom lowering
+ // for ATOMIC_FENCE.
+ // If target has DMB in thumb, Fences can be inserted.
+ if (Subtarget->hasDataBarrier())
+ InsertFencesForAtomic = true;
+
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
+ Subtarget->hasAnyDataBarrier() ? Custom : Expand);
+
+ // Set them all for expansion, which will force libcalls.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
+ // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
+ // Unordered/Monotonic case.
+ if (!InsertFencesForAtomic) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+ }
+ }
+
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+ // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
+ if (!Subtarget->hasV6Ops()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ }
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+ !Subtarget->isThumb1Only()) {
+ // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
+ // iff target supports vfp2.
+ setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+ setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+ }
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+ if (Subtarget->useSjLjEH())
+ setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+
+ setOperationAction(ISD::SETCC, MVT::i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+ // Thumb-1 cannot currently select ARMISD::SUBE.
+ if (!Subtarget->isThumb1Only())
+ setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+ setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+ // We don't support sin/cos/fmod/copysign/pow
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+ !Subtarget->isThumb1Only()) {
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+ }
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+
+ if (!Subtarget->hasVFP4()) {
+ setOperationAction(ISD::FMA, MVT::f64, Expand);
+ setOperationAction(ISD::FMA, MVT::f32, Expand);
+ }
+
+ // Various VFP goodness
+ if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
+ // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
+ if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+ }
+
+ // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
+ if (!Subtarget->hasFP16()) {
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ }
+ }
+
+ // Combine sin / cos into one node or libcall if possible.
+ if (Subtarget->hasSinCos()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ if (Subtarget->isTargetWatchABI()) {
+ setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
+ setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
+ }
+ if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
+ // For iOS, we don't want the normal expansion of a libcall to
+ // sincos. We want to issue a libcall to __sincos_stret.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ }
+ }
+
+ // FP-ARMv8 implements a lot of rounding-like FP operations.
+ if (Subtarget->hasFPARMv8()) {
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+
+ if (!Subtarget->isFPOnlySP()) {
+ setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+ setOperationAction(ISD::FROUND, MVT::f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ }
+ }
+
+ if (Subtarget->hasNEON()) {
+ // vmin and vmax aren't available in a scalar form, so we use
+ // a NEON instruction with an undef lane instead.
+ setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+ }
+
+ // We have target-specific dag combine patterns for the following nodes:
+ // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
+
+ if (Subtarget->hasV6Ops())
+ setTargetDAGCombine(ISD::SRL);
+
+ setStackPointerRegisterToSaveRestore(ARM::SP);
+
+ if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
+ !Subtarget->hasVFP2())
+ setSchedulingPreference(Sched::RegPressure);
+ else
+ setSchedulingPreference(Sched::Hybrid);
+
+ //// temporary - rewrite interface to use type
+ MaxStoresPerMemset = 8;
+ MaxStoresPerMemsetOptSize = 4;
+ MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = 2;
+ MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = 2;
+
+ // On ARM arguments smaller than 4 bytes are extended, so all arguments
+ // are at least 4 bytes aligned.
+ setMinStackArgumentAlignment(4);
+
+ // Prefer likely predicted branches to selects on out-of-order cores.
+ PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
+
+ setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
+}
+
+/// Reports whether soft-float is in use. The answer is taken directly from
+/// the subtarget; no additional logic is applied here.
+bool ARMTargetLowering::useSoftFloat() const {
+  const bool SubtargetUsesSoftFloat = Subtarget->useSoftFloat();
+  return SubtargetUsesSoftFloat;
+}
+
+// FIXME: It might make sense to define the representative register class as the
+// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
+// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
+// SPR's representative would be DPR_VFP2. This should work well if register
+// pressure tracking were modified such that a register use would increment the
+// pressure of the register class's representative and all of its super
+// classes' representatives transitively. We have not implemented this because
+// of the difficulty prior to coalescing of modeling operand register classes
+// due to the common occurrence of cross class copies and subregister insertions
+// and extractions.
+/// Picks the representative register class and its cost for \p VT.
+///
+/// Scalar floating-point types and every vector type listed below are
+/// represented by DPR; only the cost differs between them. Any other type is
+/// forwarded to the generic TargetLowering implementation.
+std::pair<const TargetRegisterClass *, uint8_t>
+ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
+  uint8_t Weight = 1;
+  switch (VT.SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+  case MVT::v8i8:
+  case MVT::v4i16:
+  case MVT::v2i32:
+  case MVT::v1i64:
+  case MVT::v2f32:
+    // There are 32 SPR registers and 32 DPR registers, so f32 and f64 both
+    // normally cost 1. When NEON is used for SP, only half of the register
+    // file is available because operations that define both SP and DP
+    // results are constrained to the VFP2 class (D0-D15). That constraint is
+    // currently modelled prior to coalescing by double-counting the SP regs;
+    // see the FIXME preceding this function.
+    Weight = Subtarget->useNEONForSinglePrecisionFP() ? 2 : 1;
+    break;
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v4f32:
+  case MVT::v2f64:
+    Weight = 2;
+    break;
+  case MVT::v4i64:
+    Weight = 4;
+    break;
+  case MVT::v8i64:
+    Weight = 8;
+    break;
+  default:
+    // Not a type this target treats specially.
+    return TargetLowering::findRepresentativeClass(TRI, VT);
+  }
+
+  // Every case that reaches this point shares the same representative class.
+  const TargetRegisterClass *Representative = &ARM::DPRRegClass;
+  return std::make_pair(Representative, Weight);
+}
+
+const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch ((ARMISD::NodeType)Opcode) {
+ case ARMISD::FIRST_NUMBER: break;
+ case ARMISD::Wrapper: return "ARMISD::Wrapper";
+ case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
+ case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
+ case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
+ case ARMISD::CALL: return "ARMISD::CALL";
+ case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
+ case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
+ case ARMISD::BRCOND: return "ARMISD::BRCOND";
+ case ARMISD::BR_JT: return "ARMISD::BR_JT";
+ case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
+ case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
+ case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
+ case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
+ case ARMISD::CMP: return "ARMISD::CMP";
+ case ARMISD::CMN: return "ARMISD::CMN";
+ case ARMISD::CMPZ: return "ARMISD::CMPZ";
+ case ARMISD::CMPFP: return "ARMISD::CMPFP";
+ case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
+ case ARMISD::BCC_i64: return "ARMISD::BCC_i64";
+ case ARMISD::FMSTAT: return "ARMISD::FMSTAT";
+
+ case ARMISD::CMOV: return "ARMISD::CMOV";
+
+ case ARMISD::SSAT: return "ARMISD::SSAT";
+
+ case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
+ case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
+ case ARMISD::RRX: return "ARMISD::RRX";
+
+ case ARMISD::ADDC: return "ARMISD::ADDC";
+ case ARMISD::ADDE: return "ARMISD::ADDE";
+ case ARMISD::SUBC: return "ARMISD::SUBC";
+ case ARMISD::SUBE: return "ARMISD::SUBE";
+
+ case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
+ case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
+
+ case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
+ case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
+ case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
+
+ case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN";
+
+ case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+
+ case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";
+
+ case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
+
+ case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
+
+ case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
+ case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
+
+ case ARMISD::VCEQ: return "ARMISD::VCEQ";
+ case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
+ case ARMISD::VCGE: return "ARMISD::VCGE";
+ case ARMISD::VCGEZ: return "ARMISD::VCGEZ";
+ case ARMISD::VCLEZ: return "ARMISD::VCLEZ";
+ case ARMISD::VCGEU: return "ARMISD::VCGEU";
+ case ARMISD::VCGT: return "ARMISD::VCGT";
+ case ARMISD::VCGTZ: return "ARMISD::VCGTZ";
+ case ARMISD::VCLTZ: return "ARMISD::VCLTZ";
+ case ARMISD::VCGTU: return "ARMISD::VCGTU";
+ case ARMISD::VTST: return "ARMISD::VTST";
+
+ case ARMISD::VSHL: return "ARMISD::VSHL";
+ case ARMISD::VSHRs: return "ARMISD::VSHRs";
+ case ARMISD::VSHRu: return "ARMISD::VSHRu";
+ case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
+ case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
+ case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
+ case ARMISD::VQSHLs: return "ARMISD::VQSHLs";
+ case ARMISD::VQSHLu: return "ARMISD::VQSHLu";
+ case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu";
+ case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs";
+ case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu";
+ case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu";
+ case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs";
+ case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu";
+ case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu";
+ case ARMISD::VSLI: return "ARMISD::VSLI";
+ case ARMISD::VSRI: return "ARMISD::VSRI";
+ case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
+ case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
+ case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM";
+ case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM";
+ case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM";
+ case ARMISD::VDUP: return "ARMISD::VDUP";
+ case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE";
+ case ARMISD::VEXT: return "ARMISD::VEXT";
+ case ARMISD::VREV64: return "ARMISD::VREV64";
+ case ARMISD::VREV32: return "ARMISD::VREV32";
+ case ARMISD::VREV16: return "ARMISD::VREV16";
+ case ARMISD::VZIP: return "ARMISD::VZIP";
+ case ARMISD::VUZP: return "ARMISD::VUZP";
+ case ARMISD::VTRN: return "ARMISD::VTRN";
+ case ARMISD::VTBL1: return "ARMISD::VTBL1";
+ case ARMISD::VTBL2: return "ARMISD::VTBL2";
+ case ARMISD::VMULLs: return "ARMISD::VMULLs";
+ case ARMISD::VMULLu: return "ARMISD::VMULLu";
+ case ARMISD::UMAAL: return "ARMISD::UMAAL";
+ case ARMISD::UMLAL: return "ARMISD::UMLAL";
+ case ARMISD::SMLAL: return "ARMISD::SMLAL";
+ case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
+ case ARMISD::BFI: return "ARMISD::BFI";
+ case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
+ case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
+ case ARMISD::VBSL: return "ARMISD::VBSL";
+ case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
+ case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP";
+ case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
+ case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
+ case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
+ case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD";
+ case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD";
+ case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD";
+ case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD";
+ case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
+ case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
+ case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
+ case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD";
+ case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
+ case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
+ case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
+ case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD";
+ case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD";
+ case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD";
+ case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD";
+ case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
+ case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
+ case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
+ }
+ return nullptr;
+}
+
+EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const { // Choose the result type of a SETCC whose operands have type VT.
+ if (!VT.isVector())
+ return getPointerTy(DL); // Non-vector compare: use the pointer type for DL.
+ return VT.changeVectorElementTypeToInteger(); // Vector compare: VT with integer elements.
+}
+
+/// getRegClassFor - Return the register class that should be used for value
+/// type \p VT; types not special-cased here defer to TargetLowering.
+const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
+ // Map v4i64 to QQ registers but do not make the type legal. Similarly map
+ // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
+ // load / store 4 to 8 consecutive D registers.
+ if (Subtarget->hasNEON()) { // The wide-vector mappings apply only with NEON.
+ if (VT == MVT::v4i64)
+ return &ARM::QQPRRegClass;
+ if (VT == MVT::v8i64)
+ return &ARM::QQQQPRRegClass;
+ }
+ return TargetLowering::getRegClassFor(VT); // Generic handling for every other type.
+}
+
+// memcpy and the other memory intrinsics typically try to use LDM/STM if the
+// source/dest is aligned and the copy size is large enough. We therefore want
+// to align such objects passed to memory intrinsics.
+bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+ unsigned &PrefAlign) const {
+ if (!isa<MemIntrinsic>(CI))
+ return false; // Not a memory intrinsic: MinSize/PrefAlign are left untouched.
+ MinSize = 8; // NOTE(review): presumably the smallest object size worth realigning - confirm.
+ // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
+ // cycle faster than 4-byte aligned LDM.
+ PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
+ return true;
+}
+
+// Create a fast isel object by forwarding both arguments to ARM::createFastISel.
+FastISel *
+ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ return ARM::createFastISel(funcInfo, libInfo);
+}
+
+Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { // Returns Sched::ILP or Sched::RegPressure for node N.
+ unsigned NumVals = N->getNumValues();
+ if (!NumVals)
+ return Sched::RegPressure; // A node that produces no values.
+
+ for (unsigned i = 0; i != NumVals; ++i) {
+ EVT VT = N->getValueType(i);
+ if (VT == MVT::Glue || VT == MVT::Other)
+ continue; // Glue and chain results do not influence the choice.
+ if (VT.isFloatingPoint() || VT.isVector())
+ return Sched::ILP; // Any floating-point or vector result selects ILP.
+ }
+
+ if (!N->isMachineOpcode())
+ return Sched::RegPressure; // No machine opcode yet, so no instruction description to query.
+
+ // Loads are scheduled for latency even if their instruction itinerary
+ // is not available.
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+
+ if (MCID.getNumDefs() == 0)
+ return Sched::RegPressure;
+ if (!Itins->isEmpty() &&
+ Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) // Operand 0's itinerary cycle exceeds 2.
+ return Sched::ILP;
+
+ return Sched::RegPressure;
+}
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// IntCCToARMCC - Convert a DAG integer condition code to the ARM CC it maps to.
+static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Unknown condition code!"); // Only the ten codes below are handled.
+ case ISD::SETNE: return ARMCC::NE;
+ case ISD::SETEQ: return ARMCC::EQ;
+ case ISD::SETGT: return ARMCC::GT;
+ case ISD::SETGE: return ARMCC::GE;
+ case ISD::SETLT: return ARMCC::LT;
+ case ISD::SETLE: return ARMCC::LE;
+ case ISD::SETUGT: return ARMCC::HI; // Unsigned comparisons use HI/HS/LO/LS.
+ case ISD::SETUGE: return ARMCC::HS;
+ case ISD::SETULT: return ARMCC::LO;
+ case ISD::SETULE: return ARMCC::LS;
+ }
+}
+
+/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC (two for SETONE/SETUEQ).
+static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+ ARMCC::CondCodes &CondCode2) {
+ CondCode2 = ARMCC::AL; // Default; only SETONE and SETUEQ below override it.
+ switch (CC) {
+ default: llvm_unreachable("Unknown FP condition!");
+ case ISD::SETEQ:
+ case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+ case ISD::SETGT:
+ case ISD::SETOGT: CondCode = ARMCC::GT; break;
+ case ISD::SETGE:
+ case ISD::SETOGE: CondCode = ARMCC::GE; break;
+ case ISD::SETOLT: CondCode = ARMCC::MI; break;
+ case ISD::SETOLE: CondCode = ARMCC::LS; break;
+ case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; // Needs two conditions.
+ case ISD::SETO: CondCode = ARMCC::VC; break;
+ case ISD::SETUO: CondCode = ARMCC::VS; break;
+ case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; // Needs two conditions.
+ case ISD::SETUGT: CondCode = ARMCC::HI; break;
+ case ISD::SETUGE: CondCode = ARMCC::PL; break;
+ case ISD::SETLT:
+ case ISD::SETULT: CondCode = ARMCC::LT; break;
+ case ISD::SETLE:
+ case ISD::SETULE: CondCode = ARMCC::LE; break;
+ case ISD::SETNE:
+ case ISD::SETUNE: CondCode = ARMCC::NE; break;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "ARMGenCallingConv.inc"
+
+/// getEffectiveCallingConv - Get the effective calling convention, taking into
+/// account presence of floating point hardware and calling convention
+/// limitations, such as support for variadic functions.
+CallingConv::ID
+ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
+ bool isVarArg) const {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::ARM_AAPCS:
+ case CallingConv::ARM_APCS:
+ case CallingConv::GHC:
+ return CC; // These conventions are used unchanged.
+ case CallingConv::PreserveMost:
+ return CallingConv::PreserveMost;
+ case CallingConv::ARM_AAPCS_VFP:
+ case CallingConv::Swift:
+ return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; // Variadic calls drop to base AAPCS.
+ case CallingConv::C:
+ if (!Subtarget->isAAPCS_ABI())
+ return CallingConv::ARM_APCS;
+ else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
+ getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
+ !isVarArg) // VFP variant needs VFP2, non-Thumb1, hard-float ABI and a non-variadic call.
+ return CallingConv::ARM_AAPCS_VFP;
+ else
+ return CallingConv::ARM_AAPCS;
+ case CallingConv::Fast:
+ case CallingConv::CXX_FAST_TLS:
+ if (!Subtarget->isAAPCS_ABI()) {
+ if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
+ return CallingConv::Fast;
+ return CallingConv::ARM_APCS;
+ } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) // Unlike the C case, FloatABIType is not consulted here.
+ return CallingConv::ARM_AAPCS_VFP;
+ else
+ return CallingConv::ARM_AAPCS;
+ }
+}
+
+CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool isVarArg) const { // Assignment function for a call's outgoing arguments.
+ return CCAssignFnForNode(CC, false, isVarArg); // Return = false selects the argument tables.
+}
+
+CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
+ bool isVarArg) const { // Assignment function for return values.
+ return CCAssignFnForNode(CC, true, isVarArg); // Return = true selects the return-value tables.
+}
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for the effective
+/// calling convention; \p Return picks the return-value variant.
+CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
+ bool Return,
+ bool isVarArg) const {
+ switch (getEffectiveCallingConv(CC, isVarArg)) { // Dispatch on the effective, not the requested, convention.
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::ARM_APCS:
+ return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+ case CallingConv::ARM_AAPCS:
+ return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+ case CallingConv::ARM_AAPCS_VFP:
+ return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+ case CallingConv::Fast:
+ return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+ case CallingConv::GHC:
+ return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); // GHC reuses the APCS return table.
+ case CallingConv::PreserveMost:
+ return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); // PreserveMost reuses the AAPCS tables.
+ }
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers. The lowered
+/// values are appended to \p InVals and the updated chain is returned.
+SDValue ARMTargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const {
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext(), Call);
+ CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign VA = RVLocs[i]; // A copy, reassigned below as extra locations are consumed.
+
+ // Pass 'this' value directly from the argument to return value, to avoid
+ // reg unit interference
+ if (i == 0 && isThisReturn) {
+ assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
+ "unexpected return calling convention register assignment");
+ InVals.push_back(ThisVal);
+ continue;
+ }
+
+ SDValue Val;
+ if (VA.needsCustom()) {
+ // Handle f64 or half of a v2f64.
+ SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+ InFlag);
+ Chain = Lo.getValue(1);
+ InFlag = Lo.getValue(2);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+ InFlag);
+ Chain = Hi.getValue(1);
+ InFlag = Hi.getValue(2);
+ if (!Subtarget->isLittle())
+ std::swap (Lo, Hi); // Big-endian: the two i32 halves arrive in the opposite order.
+ Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+
+ if (VA.getLocVT() == MVT::v2f64) {
+ SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+ DAG.getConstant(0, dl, MVT::i32)); // First f64 becomes element 0.
+
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+ Chain = Lo.getValue(1);
+ InFlag = Lo.getValue(2);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+ Chain = Hi.getValue(1);
+ InFlag = Hi.getValue(2);
+ if (!Subtarget->isLittle())
+ std::swap (Lo, Hi);
+ Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+ DAG.getConstant(1, dl, MVT::i32)); // Second f64 becomes element 1.
+ }
+ } else {
+ Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
+ InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ }
+
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); // Convert from the location type back to the value type.
+ break;
+ }
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack at StackPtr + VA's memory offset.
+SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+ SDValue Arg, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const { // Flags is not used in this body.
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff); // Destination address of the store.
+ return DAG.getStore(
+ Chain, dl, Arg, PtrOff,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
+}
+
+void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
+ SDValue Chain, SDValue &Arg,
+ RegsToPassVector &RegsToPass,
+ CCValAssign &VA, CCValAssign &NextVA,
+ SDValue &StackPtr,
+ SmallVectorImpl<SDValue> &MemOpChains,
+ ISD::ArgFlagsTy Flags) const { // Split an f64 argument into two i32 halves placed at VA and NextVA.
+
+ SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), Arg);
+ unsigned id = Subtarget->isLittle() ? 0 : 1; // Which VMOVRRD result goes to VA depends on endianness.
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
+
+ if (NextVA.isRegLoc())
+ RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
+ else {
+ assert(NextVA.isMemLoc()); // The other half goes to the stack.
+ if (!StackPtr.getNode()) // Materialize the stack pointer only if the caller has not already.
+ StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
+ getPointerTy(DAG.getDataLayout()));
+
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
+ dl, DAG, NextVA,
+ Flags));
+ }
+}
+
+/// LowerCall - Lowering a call into a callseq_start <-
+/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
+/// nodes.
+SDValue
+ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall; // A reference: updates below are written back into CLI.
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool doesNotRet = CLI.DoesNotReturn;
+ bool isVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+ bool isThisReturn = false;
+ bool isSibCall = false;
+ auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+
+ // Disable tail calls if they're not supported.
+ if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
+ isTailCall = false;
+
+ if (isTailCall) {
+ // Check if it's really possible to do a tail call.
+ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
+ Outs, OutVals, Ins, DAG);
+ if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+ // We don't support GuaranteedTailCallOpt for ARM, only automatically
+ // detected sibcalls.
+ if (isTailCall) {
+ ++NumTailCalls;
+ isSibCall = true;
+ }
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext(), Call);
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // For tail calls, memory operands are available in our caller's stack.
+ if (isSibCall)
+ NumBytes = 0;
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!isSibCall)
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+
+ SDValue StackPtr =
+ DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
+
+ RegsToPassVector RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // Walk the register/memloc assignments, inserting copies/loads. In the case
+ // of tail call optimization, arguments are handled later.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+ i != e;
+ ++i, ++realArgIdx) { // i indexes ArgLocs and may skip ahead; realArgIdx indexes Outs/OutVals.
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+ bool isByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
+ if (VA.needsCustom()) {
+ if (VA.getLocVT() == MVT::v2f64) {
+ SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
+ VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+
+ VA = ArgLocs[++i]; // skip ahead to next loc
+ if (VA.isRegLoc()) {
+ PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
+ VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+ } else {
+ assert(VA.isMemLoc());
+
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
+ dl, DAG, VA, Flags));
+ }
+ } else {
+ PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+ StackPtr, MemOpChains, Flags);
+ }
+ } else if (VA.isRegLoc()) {
+ if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
+ assert(VA.getLocVT() == MVT::i32 &&
+ "unexpected calling convention register assignment");
+ assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
+ "unexpected use of 'returned'");
+ isThisReturn = true;
+ }
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else if (isByVal) {
+ assert(VA.isMemLoc());
+ unsigned offset = 0; // Number of registers that carry the head of this byval argument.
+
+ // Check whether part of this byval aggregate is passed in registers:
+ // that holds while in-register byval parameters remain unprocessed.
+ unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
+ unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
+
+ if (CurByValIdx < ByValArgsCount) {
+
+ unsigned RegBegin, RegEnd;
+ CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
+
+ EVT PtrVT =
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ unsigned int i, j; // This i shadows the outer loop index inside this scope.
+ for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { // One 4-byte load per register in [RegBegin, RegEnd).
+ SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
+ SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+ SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+ MachinePointerInfo(),
+ DAG.InferPtrAlignment(AddArg));
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(j, Load));
+ }
+
+ // If the parameter extends beyond the register area, "offset" (the
+ // register count) is used below to locate the remainder for the stack copy.
+ offset = RegEnd - RegBegin;
+
+ CCInfo.nextInRegsParam();
+ }
+
+ if (Flags.getByValSize() > 4*offset) { // Some bytes are not covered by registers: copy them to the stack.
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+ SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
+ SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
+ SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
+ MVT::i32);
+ SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
+ MVT::i32);
+
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
+ MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
+ Ops));
+ }
+ } else if (!isSibCall) {
+ assert(VA.isMemLoc());
+
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ // Tail call byval lowering might overwrite argument registers so in case of
+ // tail call optimization the copies to registers are lowered later.
+ if (!isTailCall)
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // For tail calls lower the arguments to the 'real' stack slot.
+ if (isTailCall) {
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+
+ // Do not flag preceding copytoreg stuff together with the following stuff.
+ InFlag = SDValue();
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+ InFlag = SDValue();
+ }
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+ // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ bool isDirect = false;
+
+ const TargetMachine &TM = getTargetMachine();
+ const Module *Mod = MF.getFunction()->getParent();
+ const GlobalValue *GV = nullptr;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ GV = G->getGlobal();
+ bool isStub =
+ !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
+
+ bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
+ bool isLocalARMFunc = false;
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ auto PtrVt = getPointerTy(DAG.getDataLayout());
+
+ if (Subtarget->genLongCalls()) {
+ assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
+ "long-calls codegen is not position independent!");
+ // Handle a global address or an external symbol. If it's not one of
+ // those, the target's already in a register, so we don't need to do
+ // anything extra.
+ if (isa<GlobalAddressSDNode>(Callee)) {
+ // Create a constant pool entry for the callee address
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
+
+ // Get the address of the callee into a register
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ Callee = DAG.getLoad(
+ PtrVt, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const char *Sym = S->getSymbol();
+
+ // Create a constant pool entry for the callee address
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+ ARMPCLabelIndex, 0);
+ // Get the address of the callee into a register
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ Callee = DAG.getLoad(
+ PtrVt, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ }
+ } else if (isa<GlobalAddressSDNode>(Callee)) {
+ // If we're optimizing for minimum size and the function is called three or
+ // more times in this block, we can improve codesize by calling indirectly
+ // as BLXr has a 16-bit encoding.
+ auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ auto *BB = CLI.CS->getParent(); // NOTE(review): CLI.CS is null-checked before the musttail test above but dereferenced unconditionally here - confirm it cannot be null on this path.
+ bool PreferIndirect =
+ Subtarget->isThumb() && MF.getFunction()->optForMinSize() &&
+ count_if(GV->users(), [&BB](const User *U) {
+ return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
+ }) > 2;
+
+ if (!PreferIndirect) {
+ isDirect = true;
+ bool isDef = GV->isStrongDefinitionForLinker();
+
+ // ARM call to a local ARM function is predicable.
+ isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
+ // tBX takes a register source operand.
+ if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+ assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
+ Callee = DAG.getNode(
+ ARMISD::WrapperPIC, dl, PtrVt,
+ DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
+ Callee = DAG.getLoad(
+ PtrVt, dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+ } else if (Subtarget->isTargetCOFF()) {
+ assert(Subtarget->isTargetWindows() &&
+ "Windows is the only supported COFF target");
+ unsigned TargetFlags = GV->hasDLLImportStorageClass()
+ ? ARMII::MO_DLLIMPORT
+ : ARMII::MO_NO_FLAG;
+ Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
+ TargetFlags);
+ if (GV->hasDLLImportStorageClass()) // dllimport callees are reached through an extra load.
+ Callee =
+ DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
+ DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ } else {
+ Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
+ }
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ isDirect = true;
+ // tBX takes a register source operand.
+ const char *Sym = S->getSymbol();
+ if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+ ARMPCLabelIndex, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ Callee = DAG.getLoad(
+ PtrVt, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+ Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
+ } else {
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
+ }
+ }
+
+ // FIXME: handle tail calls differently.
+ unsigned CallOpc;
+ if (Subtarget->isThumb()) {
+ if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
+ CallOpc = ARMISD::CALL_NOLINK;
+ else
+ CallOpc = ARMISD::CALL;
+ } else {
+ if (!isDirect && !Subtarget->hasV5TOps())
+ CallOpc = ARMISD::CALL_NOLINK;
+ else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
+ // Emit regular call when code size is the priority
+ !MF.getFunction()->optForMinSize())
+ // "mov lr, pc; b _foo" to avoid confusing the RSP
+ CallOpc = ARMISD::CALL_NOLINK;
+ else
+ CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ if (!isTailCall) {
+ const uint32_t *Mask;
+ const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
+ if (isThisReturn) {
+ // For 'this' returns, use the R0-preserving mask if applicable
+ Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
+ if (!Mask) {
+ // Set isThisReturn to false if the calling convention is not one that
+ // allows 'returned' to be modeled in this way, so LowerCallResult does
+ // not try to pass 'this' straight through
+ isThisReturn = false;
+ Mask = ARI->getCallPreservedMask(MF, CallConv);
+ }
+ } else
+ Mask = ARI->getCallPreservedMask(MF, CallConv);
+
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ if (isTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); // Tail calls end here: no CALLSEQ_END, no result copies.
+ }
+
+ // Returns a chain and a flag for retval copy to use.
+ Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ if (!Ins.empty())
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+ InVals, isThisReturn,
+ isThisReturn ? OutVals[0] : SDValue());
+}
+
+/// HandleByVal - Every parameter *after* a byval parameter is passed
+/// on the stack. Remember the next parameter register to allocate,
+/// and then confiscate the rest of the parameter registers to ensure
+/// this. On return \p Size holds the number of bytes left for memory.
+void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
+ unsigned Align) const {
+ assert((State->getCallOrPrologue() == Prologue ||
+ State->getCallOrPrologue() == Call) &&
+ "unhandled ParmContext");
+
+ // Byval (as with any stack) slots are always at least 4 byte aligned.
+ Align = std::max(Align, 4U);
+
+ unsigned Reg = State->AllocateReg(GPRArgRegs);
+ if (!Reg)
+ return; // No argument register left: Size is left unchanged.
+
+ unsigned AlignInRegs = Align / 4;
+ unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; // Registers skipped to satisfy the alignment.
+ for (unsigned i = 0; i < Waste; ++i)
+ Reg = State->AllocateReg(GPRArgRegs);
+
+ if (!Reg)
+ return;
+
+ unsigned Excess = 4 * (ARM::R4 - Reg); // Bytes that fit in the registers from Reg up to (not including) R4.
+
+ // Special case when NSAA != SP and parameter size greater than size of
+ // all remaining GPR regs. In that case we can't split parameter, we must
+ // send it to stack. We also must set NCRN to R4, so waste all
+ // remaining registers.
+ const unsigned NSAAOffset = State->getNextStackOffset();
+ if (NSAAOffset != 0 && Size > Excess) {
+ while (State->AllocateReg(GPRArgRegs))
+ ;
+ return;
+ }
+
+ // First register for byval parameter is the first register that wasn't
+ // allocated before this method call, so it would be "reg".
+ // If parameter is small enough to be saved in range [reg, r4), then
+ // the end (first after last) register would be reg + param-size-in-regs,
+ // else parameter would be split between registers and stack,
+ // end register would be r4 in this case.
+ unsigned ByValRegBegin = Reg;
+ unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
+ State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
+ // Note, first register is allocated in the beginning of function already,
+ // allocate the remaining registers we need.
+ for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
+ State->AllocateReg(GPRArgRegs);
+ // A byval parameter that is split between registers and memory needs its
+ // size truncated here.
+ // In the case where the entire structure fits in registers, we set the
+ // size in memory to zero.
+ Size = std::max<int>(Size - Excess, 0); // The difference is taken as int so it can be clamped at zero.
+}
+
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack, i.e. it is a fixed object at \p Offset of equal size.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+ MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
+ const TargetInstrInfo *TII) {
+ unsigned Bytes = Arg.getValueSizeInBits() / 8;
+ int FI = INT_MAX; // Sentinel: no frame index found yet.
+ if (Arg.getOpcode() == ISD::CopyFromReg) {
+ unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(VR))
+ return false;
+ MachineInstr *Def = MRI->getVRegDef(VR);
+ if (!Def)
+ return false;
+ if (!Flags.isByVal()) {
+ if (!TII->isLoadFromStackSlot(*Def, FI)) // On success FI receives the stack slot's frame index.
+ return false;
+ } else {
+ return false;
+ }
+ } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+ if (Flags.isByVal())
+ // ByVal argument is passed in as a pointer but it's now being
+ // dereferenced. e.g.
+ // define @foo(%struct.X* %A) {
+ // tail call @bar(%struct.X* byval %A)
+ // }
+ return false;
+ SDValue Ptr = Ld->getBasePtr();
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+ if (!FINode)
+ return false; // The load's base pointer is not a frame index.
+ FI = FINode->getIndex();
+ } else
+ return false; // Neither a CopyFromReg nor a load: cannot match.
+
+ assert(FI != INT_MAX);
+ if (!MFI.isFixedObjectIndex(FI))
+ return false;
+ return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function. Returns true only when none of the checks below rejects the call.
+bool
+ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ bool isCalleeStructRet, // callee uses struct-return semantics
+ bool isCallerStructRet, // caller uses struct-return semantics
+ const SmallVectorImpl<ISD::OutputArg> &Outs, // outgoing argument descriptions
+ const SmallVectorImpl<SDValue> &OutVals, // outgoing argument values, indexed like Outs
+ const SmallVectorImpl<ISD::InputArg> &Ins, // used below for the result-compatibility check
+ SelectionDAG& DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *CallerF = MF.getFunction();
+ CallingConv::ID CallerCC = CallerF->getCallingConv();
+
+ assert(Subtarget->supportsTailCall()); // callers must only ask when the subtarget supports tail calls
+
+ // Look for obvious safe cases to perform tail call optimization that do not
+ // require ABI changes. This is what gcc calls sibcall.
+
+ // Exception-handling functions need a special set of instructions to indicate
+ // a return to the hardware. Tail-calling another function would probably
+ // break this.
+ if (CallerF->hasFnAttribute("interrupt"))
+ return false;
+
+ // Also avoid sibcall optimization if either caller or callee uses struct
+ // return semantics.
+ if (isCalleeStructRet || isCallerStructRet)
+ return false;
+
+ // Externally-defined functions with weak linkage should not be
+ // tail-called on ARM when the OS does not support dynamic
+ // pre-emption of symbols, as the AAELF spec requires normal calls
+ // to undefined weak functions to be replaced with a NOP or jump to the
+ // next instruction. The behaviour of branch instructions in this
+ // situation (as used for tail calls) is implementation-defined, so we
+ // cannot rely on the linker replacing the tail call with a return.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ const Triple &TT = getTargetMachine().getTargetTriple();
+ if (GV->hasExternalWeakLinkage() &&
+ (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
+ return false;
+ }
+
+ // Check that the call results are passed in the same way.
+ LLVMContext &C = *DAG.getContext();
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ CCAssignFnForReturn(CalleeCC, isVarArg),
+ CCAssignFnForReturn(CallerCC, isVarArg)))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ if (CalleeCC != CallerCC) { // identical conventions need no mask comparison
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+ }
+
+ // If Caller's vararg or byval argument has been split between registers and
+ // stack, do not perform tail call, since part of the argument is in caller's
+ // local frame.
+ const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
+ if (AFI_Caller->getArgRegsSaveSize())
+ return false;
+
+ // If the callee takes no arguments then go on to check the results of the
+ // call.
+ if (!Outs.empty()) {
+ // Check if stack adjustment is needed. For now, do not do this if any
+ // argument is passed on the stack.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+ if (CCInfo.getNextStackOffset()) { // non-zero: at least part of the arguments is assigned to the stack
+ // Check if the arguments are already laid out in the right way as
+ // the caller's fixed stack objects.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); // i walks ArgLocs; realArgIdx walks Outs/OutVals
+ i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = OutVals[realArgIdx];
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+ if (VA.getLocInfo() == CCValAssign::Indirect) // indirectly-passed arguments are rejected outright
+ return false;
+ if (VA.needsCustom()) {
+ // f64 and vector types are split into multiple registers or
+ // register/stack-slot combinations. The types will not match
+ // the registers; give up on memory f64 refs until we figure
+ // out what to do about this.
+ if (!VA.isRegLoc())
+ return false;
+ if (!ArgLocs[++i].isRegLoc()) // consumes the second location of the same argument
+ return false;
+ if (RegVT == MVT::v2f64) { // v2f64 occupies two further locations (four in total)
+ if (!ArgLocs[++i].isRegLoc())
+ return false;
+ if (!ArgLocs[++i].isRegLoc())
+ return false;
+ }
+ } else if (!VA.isRegLoc()) {
+ if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+ MFI, MRI, TII))
+ return false;
+ }
+ }
+ }
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) // presumably rejects arguments in caller-preserved registers whose value changed - confirm against parametersInCSRMatch
+ return false;
+ }
+
+ return true;
+}
+
+/// Report the result of CCState::CheckReturn for \p Outs under the return
+/// assignment function selected for \p CallConv and \p isVarArg.
+bool
+ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  // Scratch list of locations filled in by the calling-convention analysis.
+  SmallVector<CCValAssign, 16> ReturnLocs;
+  CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
+  auto *AssignFn = CCAssignFnForReturn(CallConv, isVarArg);
+  return ReturnState.CheckReturn(Outs, AssignFn);
+}
+
+/// Build the return node for a function carrying the "interrupt" attribute:
+/// the LR offset implied by the attribute value is inserted as operand #1 of
+/// \p RetOps, and the operand list becomes an ARMISD::INTRET_FLAG node.
+static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+                                    const SDLoc &DL, SelectionDAG &DAG) {
+  const Function *Fn = DAG.getMachineFunction().getFunction();
+  const StringRef Kind = Fn->getFnAttribute("interrupt").getValueAsString();
+
+  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
+  // version of the "preferred return address". These offsets affect the return
+  // instruction if this is a return from PL1 without hypervisor extensions.
+  //    IRQ/FIQ: +4     "subs pc, lr, #4"
+  //    SWI:     0      "subs pc, lr, #0"
+  //    ABORT:   +4     "subs pc, lr, #4"
+  //    UNDEF:   +4/+2  "subs pc, lr, #0"
+  // UNDEF varies depending on where the exception came from ARM or Thumb
+  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
+  const bool ReturnsAtLRMinus4 =
+      Kind == "" || Kind == "IRQ" || Kind == "FIQ" || Kind == "ABORT";
+  const bool ReturnsAtLR = Kind == "SWI" || Kind == "UNDEF";
+
+  int64_t LROffset;
+  if (ReturnsAtLRMinus4) {
+    LROffset = 4;
+  } else if (ReturnsAtLR) {
+    LROffset = 0;
+  } else {
+    report_fatal_error("Unsupported interrupt attribute. If present, value "
+                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
+  }
+
+  // The offset goes right after the chain (operand #0).
+  SDValue OffsetOp = DAG.getConstant(LROffset, DL, MVT::i32, false);
+  RetOps.insert(RetOps.begin() + 1, OffsetOp);
+
+  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
+}
+
+SDValue
+ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const { // Lowers a return: copies each value in OutVals into its assigned register(s) and emits RET_FLAG or an interrupt return.
+
+ // CCValAssign - represent the assignment of the return value to a location.
+ SmallVector<CCValAssign, 16> RVLocs;
+
+ // CCState - Info about the registers and stack slots.
+ ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext(), Call);
+
+ // Analyze outgoing return values.
+ CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
+
+ SDValue Flag; // glue linking consecutive copies; empty until the first copy is emitted
+ SmallVector<SDValue, 4> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ bool isLittleEndian = Subtarget->isLittle(); // selects which VMOVRRD result goes to which register of a pair
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ AFI->setReturnRegsCount(RVLocs.size()); // record how many return locations were assigned
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0, realRVLocIdx = 0; // i indexes RVLocs; realRVLocIdx indexes OutVals
+ i != RVLocs.size();
+ ++i, ++realRVLocIdx) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ SDValue Arg = OutVals[realRVLocIdx];
+
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.needsCustom()) {
+ if (VA.getLocVT() == MVT::v2f64) {
+ // Extract the first half and return it in two registers.
+ SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), Half);
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ HalfGPRs.getValue(isLittleEndian ? 0 : 1),
+ Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ VA = RVLocs[++i]; // skip ahead to next loc (NOTE: VA is a reference, so this copy-assigns the next entry over the current RVLocs slot rather than rebinding)
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ HalfGPRs.getValue(isLittleEndian ? 1 : 0),
+ Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ VA = RVLocs[++i]; // skip ahead to next loc
+
+ // Extract the 2nd half and fall through to handle it as an f64 value.
+ Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(1, dl, MVT::i32));
+ }
+ // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
+ // available.
+ SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), Arg);
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ fmrrd.getValue(isLittleEndian ? 0 : 1),
+ Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ fmrrd.getValue(isLittleEndian ? 1 : 0),
+ Flag);
+ } else
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+
+ // Guarantee that all emitted copies are
+ // stuck together, avoiding something bad. The glue and register operand of the last copy emitted above (custom or plain) are recorded here.
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+ const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) { // may be null; otherwise a zero-terminated register list
+ for (; *I; ++I) { // append each listed register as an extra return operand
+ if (ARM::GPRRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i32));
+ else if (ARM::DPRRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+
+ // Update chain and glue.
+ RetOps[0] = Chain;
+ if (Flag.getNode()) // only set when at least one copy was emitted
+ RetOps.push_back(Flag);
+
+ // CPUs which aren't M-class use a special sequence to return from
+ // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
+ // though we use "subs pc, lr, #N").
+ //
+ // M-class CPUs actually use a normal return sequence with a special
+ // (hardware-provided) value in LR, so the normal code path works.
+ if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
+ !Subtarget->isMClass()) {
+ if (Subtarget->isThumb1Only())
+ report_fatal_error("interrupt attribute is not supported in Thumb1");
+ return LowerInterruptReturn(RetOps, dl, DAG);
+ }
+
+ return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { // True if N's single result only reaches a return node through register copies; on success Chain is replaced by the chain feeding the first copy.
+ if (N->getNumValues() != 1)
+ return false;
+ if (!N->hasNUsesOfValue(1, 0)) // the result must have exactly one use
+ return false;
+
+ SDValue TCChain = Chain; // candidate chain, only committed to Chain on success
+ SDNode *Copy = *N->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg) {
+ // If the copy has a glue operand, we conservatively assume it isn't safe to
+ // perform a tail call.
+ if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+ return false;
+ TCChain = Copy->getOperand(0);
+ } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
+ SDNode *VMov = Copy;
+ // f64 returned in a pair of GPRs.
+ SmallPtrSet<SDNode*, 2> Copies;
+ for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); // first pass: every user must be a CopyToReg
+ UI != UE; ++UI) {
+ if (UI->getOpcode() != ISD::CopyToReg)
+ return false;
+ Copies.insert(*UI);
+ }
+ if (Copies.size() > 2)
+ return false;
+
+ for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); // second pass: tell the chained (second) copy from the first one
+ UI != UE; ++UI) {
+ SDValue UseChain = UI->getOperand(0);
+ if (Copies.count(UseChain.getNode())) // this copy's chain comes from the other copy
+ // Second CopyToReg
+ Copy = *UI;
+ else {
+ // We are at the top of this chain.
+ // If the copy has a glue operand, we conservatively assume it
+ // isn't safe to perform a tail call.
+ if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
+ return false;
+ // First CopyToReg
+ TCChain = UseChain;
+ }
+ }
+ } else if (Copy->getOpcode() == ISD::BITCAST) {
+ // f32 returned in a single GPR.
+ if (!Copy->hasOneUse())
+ return false;
+ Copy = *Copy->use_begin();
+ if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
+ return false;
+ // If the copy has a glue operand, we conservatively assume it isn't safe to
+ // perform a tail call.
+ if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+ return false;
+ TCChain = Copy->getOperand(0);
+ } else {
+ return false;
+ }
+
+ bool HasRet = false;
+ for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); // every user of the final copy must be a return node
+ UI != UE; ++UI) {
+ if (UI->getOpcode() != ARMISD::RET_FLAG &&
+ UI->getOpcode() != ARMISD::INTRET_FLAG)
+ return false;
+ HasRet = true;
+ }
+
+ if (!HasRet) // a copy without any user does not count
+ return false;
+
+ Chain = TCChain;
+ return true;
+}
+
+/// Return true when \p CI may be emitted as a tail call: the subtarget must
+/// support tail calls, the call must be marked as a tail call, and the
+/// enclosing function must not set "disable-tail-calls" to "true".
+bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!Subtarget->supportsTailCall())
+    return false;
+
+  // Fetch the attribute from the function that contains the call.
+  auto DisableAttr =
+      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
+  if (!CI->isTailCall())
+    return false;
+  return !(DisableAttr.getValueAsString() == "true");
+}
+
+// A 64-bit value cannot be written as a single operand here, so rebuild the
+// WRITE_REGISTER node with the value split into its low and high 32-bit
+// parts (in that order) after the original chain and register operands.
+static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue WriteValue = Op->getOperand(2);
+
+  // This function is only supposed to be called for i64 type argument.
+  assert(WriteValue.getValueType() == MVT::i64
+          && "LowerWRITE_REGISTER called for non-i64 type argument.");
+
+  // Extracts the 32-bit element with the given index (0 = low, 1 = high).
+  auto ExtractPart = [&](unsigned Index) {
+    return DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
+                       DAG.getConstant(Index, DL, MVT::i32));
+  };
+  SDValue LowPart = ExtractPart(0);
+  SDValue HighPart = ExtractPart(1);
+
+  SDValue NewOps[] = { Op->getOperand(0), Op->getOperand(1), LowPart,
+                       HighPart };
+  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, NewOps);
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol nodes are each
+// lowered to their target counterpart wrapped in an ARMISD::Wrapper node.
+// The wrapping is required because Select(N) would otherwise return N
+// unchanged, so the raw TargetGlobalAddress nodes, etc. can only be used to
+// form an addressing mode. These wrapped nodes will be selected into MOVi.
+static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+  // FIXME there is no actual debug info here
+  SDLoc dl(Op);
+  EVT PtrVT = Op.getValueType();
+  ConstantPoolSDNode *PoolNode = cast<ConstantPoolSDNode>(Op);
+
+  // Machine-specific entries and plain constants use different overloads.
+  SDValue TargetEntry =
+      PoolNode->isMachineConstantPoolEntry()
+          ? DAG.getTargetConstantPool(PoolNode->getMachineCPVal(), PtrVT,
+                                      PoolNode->getAlignment())
+          : DAG.getTargetConstantPool(PoolNode->getConstVal(), PtrVT,
+                                      PoolNode->getAlignment());
+  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, TargetEntry);
+}
+
+unsigned ARMTargetLowering::getJumpTableEncoding() const { // Reports the jump-table entry encoding used by this target.
+ return MachineJumpTableInfo::EK_Inline; // unconditionally EK_Inline
+}
+
+SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const { // Lowers a BlockAddress node to a load from a constant-pool entry, plus a PIC_ADD when position-independent or ROPI.
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ unsigned ARMPCLabelIndex = 0; // only assigned a fresh label id on the position-independent path
+ SDLoc DL(Op);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ SDValue CPAddr;
+ bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
+ if (!IsPositionIndependent) {
+ CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); // plain entry holding the block address, alignment 4
+ } else {
+ unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; // PC adjustment stored with the entry: 4 for Thumb, 8 otherwise
+ ARMPCLabelIndex = AFI->createPICLabelUId();
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
+ ARMCP::CPBlockAddress, PCAdj);
+ CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ }
+ CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
+ SDValue Result = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ if (!IsPositionIndependent)
+ return Result; // the loaded value is returned as-is
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
+ return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); // combine the loaded value with the PIC label
+}
+
+/// \brief Convert a TLS address reference into the correct sequence of loads
+/// and calls to compute the variable's address for Darwin, and return an
+/// SDValue containing the final node.
+///
+/// Darwin only has one TLS scheme which must be capable of dealing with the
+/// fully general situation, in the worst case. This means:
+/// + "extern __thread" declaration.
+/// + Defined in a possibly unknown dynamic library.
+///
+/// The general system is that each __thread variable has a [3 x i32] descriptor
+/// which contains information used by the runtime to calculate the address. The
+/// only part of this the compiler needs to know about is the first word, which
+/// contains a function pointer that must be called with the address of the
+/// entire descriptor in "r0".
+///
+/// Since this descriptor may be in a different unit, in general access must
+/// proceed along the usual ARM rules. A common sequence to produce is:
+///
+/// movw rT1, :lower16:_var$non_lazy_ptr
+/// movt rT1, :upper16:_var$non_lazy_ptr
+/// ldr r0, [rT1]
+/// ldr rT2, [r0]
+/// blx rT2
+/// [...address now in r0...]
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
+ SDLoc DL(Op);
+
+ // First step is to get the address of the actual global symbol. This is where
+ // the TLS descriptor lives.
+ SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
+
+ // The first entry in the descriptor is a function pointer that we must call
+ // to obtain the address of the variable.
+ SDValue Chain = DAG.getEntryNode();
+ SDValue FuncTLVGet = DAG.getLoad(
+ MVT::i32, DL, Chain, DescAddr,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ /* Alignment = */ 4,
+ MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+ Chain = FuncTLVGet.getValue(1); // continue on the load's output chain
+
+ MachineFunction &F = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = F.getFrameInfo();
+ MFI.setAdjustsStack(true); // flag the frame because a call is emitted below
+
+ // TLS calls preserve all registers except those that absolutely must be
+ // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
+ // silly).
+ auto TRI =
+ getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo();
+ auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
+ const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
+
+ // Finally, we can make the call. This is just a degenerate version of a
+ // normal ARM call node: r0 takes the address of the descriptor, and
+ // returns the address of the variable in this thread.
+ Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
+ Chain =
+ DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+ Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
+ DAG.getRegisterMask(Mask), Chain.getValue(1)); // last operand is the glue from the copy into R0
+ return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); // read the result back from R0, glued to the call
+}
+
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
+ SelectionDAG &DAG) const { // Computes a thread-local variable's address on Windows as: TLS block for this module + the variable's section-relative offset.
+ assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
+
+ SDValue Chain = DAG.getEntryNode();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc DL(Op);
+
+ // Load the current TEB (thread environment block)
+ SDValue Ops[] = {Chain,
+ DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+ DAG.getConstant(15, DL, MVT::i32), // presumably the coprocessor number and the remaining mrc operands in intrinsic order - confirm against the arm_mrc definition
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(13, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(2, DL, MVT::i32)};
+ SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+ DAG.getVTList(MVT::i32, MVT::Other), Ops);
+
+ SDValue TEB = CurrentTEB.getValue(0); // result 0: the value read
+ Chain = CurrentTEB.getValue(1); // result 1: the output chain, used by every load below
+
+ // Load the ThreadLocalStoragePointer from the TEB
+ // A pointer to the TLS array is located at offset 0x2c from the TEB.
+ SDValue TLSArray =
+ DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
+ TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
+
+ // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
+ // offset into the TLSArray.
+
+ // Load the TLS index from the C runtime
+ SDValue TLSIndex =
+ DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
+ TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
+ TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
+
+ SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
+ DAG.getConstant(2, DL, MVT::i32)); // shift left by 2 == index * 4
+ SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
+ DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
+ MachinePointerInfo());
+
+ // Get the offset of the start of the .tls section (section base)
+ const auto *GA = cast<GlobalAddressSDNode>(Op);
+ auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
+ SDValue Offset = DAG.getLoad(
+ PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
+ DAG.getTargetConstantPool(CPV, PtrVT, 4)),
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+ return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); // TLS data area base + offset of the variable
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model: build the TLSGD argument from a PC-relative constant-pool entry and return the result of calling __tls_get_addr with it.
+SDValue
+ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const {
+ SDLoc dl(GA);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; // PC adjustment stored with the entry: 4 for Thumb, 8 otherwise
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
+ SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
+ Argument = DAG.getLoad(
+ PtrVT, dl, DAG.getEntryNode(), Argument,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ SDValue Chain = Argument.getValue(1); // the call below is chained after this load
+
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+ Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
+
+ // call __tls_get_addr.
+ ArgListTy Args;
+ ArgListEntry Entry;
+ Entry.Node = Argument;
+ Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
+ Args.push_back(Entry); // single i32 argument
+
+ // FIXME: is there useful debug info available here?
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(Chain)
+ .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
+ DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
+
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ return CallResult.first; // only the first element of the pair is used; the second is dropped
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or
+// "local exec" model. Both produce ThreadPointer + Offset; they differ only in how Offset is obtained.
+SDValue
+ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ TLSModel::Model model) const {
+ const GlobalValue *GV = GA->getGlobal();
+ SDLoc dl(GA);
+ SDValue Offset;
+ SDValue Chain = DAG.getEntryNode();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ // Get the Thread Pointer
+ SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+
+ if (model == TLSModel::InitialExec) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+ // Initial exec model.
+ unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; // PC adjustment stored with the entry: 4 for Thumb, 8 otherwise
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
+ true);
+ Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+ Offset = DAG.getLoad(
+ PtrVT, dl, Chain, Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ Chain = Offset.getValue(1); // the second load below is chained after this one
+
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+ Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
+
+ Offset = DAG.getLoad( // NOTE(review): this load dereferences the PIC-adjusted address yet is still tagged as a constant-pool access - confirm intended
+ PtrVT, dl, Chain, Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ } else {
+ // local exec model
+ assert(model == TLSModel::LocalExec);
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
+ Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+ Offset = DAG.getLoad( // a single constant-pool load yields the offset directly
+ PtrVT, dl, Chain, Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ }
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+/// Dispatch the lowering of a thread-local global address to the scheme in
+/// use: Darwin, Windows, emulated TLS, or one of the ELF TLS models.
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+  // Darwin and Windows each have one dedicated lowering.
+  if (Subtarget->isTargetDarwin())
+    return LowerGlobalTLSAddressDarwin(Op, DAG);
+  if (Subtarget->isTargetWindows())
+    return LowerGlobalTLSAddressWindows(Op, DAG);
+
+  // TODO: implement the "local dynamic" model
+  assert(Subtarget->isTargetELF() && "Only ELF implemented here");
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(GA, DAG);
+
+  const TLSModel::Model Chosen =
+      getTargetMachine().getTLSModel(GA->getGlobal());
+
+  // Local dynamic shares the general dynamic lowering (see the TODO above).
+  if (Chosen == TLSModel::GeneralDynamic || Chosen == TLSModel::LocalDynamic)
+    return LowerToTLSGeneralDynamicModel(GA, DAG);
+  if (Chosen == TLSModel::InitialExec || Chosen == TLSModel::LocalExec)
+    return LowerToTLSExecModels(GA, DAG, Chosen);
+  llvm_unreachable("bogus TLS model");
+}
+
+/// Return true if every user of V, reached by looking through ConstantExprs,
+/// is an instruction located inside function F.
+static bool allUsersAreInFunction(const Value *V, const Function *F) {
+  SmallVector<const User*,4> Pending;
+  Pending.append(V->user_begin(), V->user_end());
+
+  while (!Pending.empty()) {
+    const User *Cur = Pending.pop_back_val();
+
+    if (!isa<ConstantExpr>(Cur)) {
+      // A real user: it has to be an instruction that belongs to F.
+      const auto *Inst = dyn_cast<Instruction>(Cur);
+      if (!Inst || Inst->getParent()->getParent() != F)
+        return false;
+      continue;
+    }
+
+    // Constant expressions are transparent: examine their users instead.
+    Pending.append(Cur->user_begin(), Cur->user_end());
+  }
+  return true;
+}
+
+/// Return true if every user of V, reached by looking through ConstantExprs,
+/// is an instruction (in whichever function). In other words, return false
+/// as soon as a global constant user is found.
+static bool allUsersAreInFunctions(const Value *V) {
+  SmallVector<const User*,4> Pending;
+  Pending.append(V->user_begin(), V->user_end());
+
+  while (!Pending.empty()) {
+    const User *Cur = Pending.pop_back_val();
+
+    if (!isa<ConstantExpr>(Cur)) {
+      // Anything that is neither a ConstantExpr nor an instruction fails.
+      if (!isa<Instruction>(Cur))
+        return false;
+      continue;
+    }
+
+    // Constant expressions are transparent: examine their users instead.
+    Pending.append(Cur->user_begin(), Cur->user_end());
+  }
+  return true;
+}
+
+// Return true if T is an integer or floating-point type, or an array/vector
+// whose element type is one of those. Only one level of nesting is examined.
+static bool isSimpleType(Type *T) {
+  auto IsIntOrFP = [](Type *Ty) {
+    return Ty->isIntegerTy() || Ty->isFloatingPointTy();
+  };
+
+  if (IsIntOrFP(T))
+    return true;
+  if (T->isArrayTy())
+    return IsIntOrFP(T->getArrayElementType());
+  if (T->isVectorTy())
+    return IsIntOrFP(T->getVectorElementType());
+  return false;
+}
+
+/// Try to emit the initializer of the constant global \p GV inline in the
+/// constant pool instead of referencing the global itself.
+///
+/// \param GV    the global whose address is being lowered.
+/// \param DAG   selection DAG of the function being lowered.
+/// \param PtrVT value type used for the constant-pool address.
+/// \param dl    debug location for the created wrapper node.
+/// \returns an ARMISD::Wrapper around the constant-pool entry, or an empty
+///          SDValue when the global is not promoted.
+static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
+                                     EVT PtrVT, SDLoc dl) {
+  // If we're creating a pool entry for a constant global with unnamed address,
+  // and the global is small enough, we can emit it inline into the constant pool
+  // to save ourselves an indirection.
+  //
+  // This is a win if the constant is only used in one function (so it doesn't
+  // need to be duplicated) or duplicating the constant wouldn't increase code
+  // size (implying the constant is no larger than 4 bytes).
+  const Function *F = DAG.getMachineFunction().getFunction();
+
+  // We rely on this decision to inline being idempotent and unrelated to the
+  // use-site. We know that if we inline a variable at one use site, we'll
+  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
+  // doesn't know about this optimization, so bail out if it's enabled else
+  // we could decide to inline here (and thus never emit the GV) but require
+  // the GV from fast-isel generated code.
+  if (!EnableConstpoolPromotion ||
+      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
+    return SDValue();
+
+  auto *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar || !GVar->hasInitializer() ||
+      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
+      !GVar->hasLocalLinkage())
+    return SDValue();
+
+  // Ensure that we don't try and inline any type that contains pointers. If
+  // we inline a value that contains relocations, we move the relocations from
+  // .data to .text which is not ideal.
+  auto *Init = GVar->getInitializer();
+  if (!isSimpleType(Init->getType()))
+    return SDValue();
+
+  // The constant islands pass can only really deal with alignment requests
+  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
+  // any type wanting greater alignment requirements than 4 bytes. We also
+  // can only promote constants that are multiples of 4 bytes in size or
+  // are paddable to a multiple of 4. Currently we only try and pad constants
+  // that are strings for simplicity.
+  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
+  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
+  unsigned Align = GVar->getAlignment();
+  // RequiredPadding == 4 means "already a multiple of 4, no padding needed".
+  unsigned RequiredPadding = 4 - (Size % 4);
+  bool PaddingPossible =
+    RequiredPadding == 4 || (CDAInit && CDAInit->isString());
+  // A zero-sized initializer also yields RequiredPadding == 4, but there is
+  // nothing to inline: reject it instead of creating an empty pool entry.
+  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
+      Size == 0)
+    return SDValue();
+
+  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // We can't bloat the constant pool too much, else the ConstantIslands pass
+  // may fail to converge. If we haven't promoted this global yet (it may have
+  // multiple uses), and promoting it would increase the constant pool size (Sz
+  // > 4), ensure we have space to do so up to MaxTotal.
+  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
+    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
+        ConstpoolPromotionMaxTotal)
+      return SDValue();
+
+  // This is only valid if all users are in a single function OR it has users
+  // in multiple functions but is no larger than a pointer. We also check if
+  // GVar has constant (non-ConstantExpr) users. If so, it essentially has its
+  // address taken.
+  if (!allUsersAreInFunction(GVar, F) &&
+      !(Size <= 4 && allUsersAreInFunctions(GVar)))
+    return SDValue();
+
+  // We're going to inline this global. Pad it out if needed.
+  if (RequiredPadding != 4) {
+    // PaddingPossible guarantees CDAInit is a non-null string on this path.
+    StringRef S = CDAInit->getAsString();
+
+    SmallVector<uint8_t,16> V(S.size());
+    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
+    while (RequiredPadding--)
+      V.push_back(0);
+    Init = ConstantDataArray::get(*DAG.getContext(), V);
+  }
+
+  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
+  SDValue CPAddr =
+    DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
+  // Account for the size increase only the first time this global is promoted.
+  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
+    AFI->markGlobalAsPromotedToConstantPool(GVar);
+    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
+                                      PaddedSize - 4);
+  }
+  ++NumConstpoolPromoted;
+  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+}
+
+/// Lower a GlobalAddress node for ELF targets.
+///
+/// Strategies are tried in this order:
+///  1. promoteToConstantPool, only when the global can be assumed DSO-local
+///     and the subtarget is not generating execute-only code;
+///  2. position-independent code: a PC-relative constant-pool entry followed
+///     by PIC_ADD, plus a load through the GOT when the global cannot be
+///     assumed DSO-local;
+///  3. ROPI with a read-only global: WrapperPIC around the target address;
+///  4. RWPI with a writable global: an SBREL constant-pool offset added to R9;
+///  5. otherwise a movw/movt Wrapper when the subtarget uses movt, else a
+///     plain constant-pool load.
+SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc dl(Op);
+  MachineFunction &MF = DAG.getMachineFunction();
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const TargetMachine &TM = getTargetMachine();
+  // Look through aliases so the checks below see the base object.
+  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+    GV = GA->getBaseObject();
+  // Constant global variables and functions are treated as read-only.
+  bool IsRO =
+      (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
+      isa<Function>(GV);
+
+  // Queried once: both the promotion check and the PIC path depend on it.
+  const bool IsDSOLocal = TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
+
+  // promoteToConstantPool only if not generating XO text section
+  if (IsDSOLocal && !Subtarget->genExecuteOnly())
+    if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl))
+      return V;
+
+  if (isPositionIndependent()) {
+    const bool UseGOT_PREL = !IsDSOLocal;
+
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+    // PC adjustment recorded in the constant-pool entry: 4 for Thumb, 8 for
+    // ARM.
+    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
+        GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj,
+        UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
+        /*AddCurrentAddress=*/UseGOT_PREL);
+    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue Result =
+        DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                    MachinePointerInfo::getConstantPool(MF));
+    SDValue Chain = Result.getValue(1);
+    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+    Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+    // With GOT_PREL the value computed so far is the GOT slot address; load
+    // through it, chained after the constant-pool load.
+    if (UseGOT_PREL)
+      Result = DAG.getLoad(PtrVT, dl, Chain, Result,
+                           MachinePointerInfo::getGOT(MF));
+    return Result;
+  }
+
+  if (Subtarget->isROPI() && IsRO) {
+    // PC-relative.
+    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+    return DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
+  }
+
+  if (Subtarget->isRWPI() && !IsRO) {
+    // SB-relative: load the SBREL offset from the constant pool and add it
+    // to the value copied from R9.
+    ARMConstantPoolValue *CPV =
+        ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
+    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue G = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                            MachinePointerInfo::getConstantPool(MF));
+    SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
+    return DAG.getNode(ISD::ADD, dl, PtrVT, SB, G);
+  }
+
+  // If we have T2 ops, we can materialize the address directly via movt/movw
+  // pair. This is always cheaper.
+  if (Subtarget->useMovt(MF)) {
+    ++NumMovwMovt;
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes.
+    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+  }
+
+  // Fallback: place the address in the constant pool and load it.
+  SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                     MachinePointerInfo::getConstantPool(MF));
+}
+
+/// Lower a GlobalAddress node for Darwin targets. ROPI/RWPI must be off
+/// (asserted). The address is a Wrapper/WrapperPIC around the target global
+/// address flagged MO_NONLAZY; globals the subtarget reports as indirect
+/// symbols are additionally loaded through that address.
+SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
+         "ROPI/RWPI not currently supported for Darwin");
+  EVT AddrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc Loc(Op);
+  const GlobalValue *Global = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Bump the movw/movt counter when this subtarget uses movt.
+  if (Subtarget->useMovt(MF))
+    ++NumMovwMovt;
+
+  // FIXME: Once remat is capable of dealing with instructions with register
+  // operands, expand this into multiple nodes
+  unsigned WrapperOpc = ARMISD::Wrapper;
+  if (isPositionIndependent())
+    WrapperOpc = ARMISD::WrapperPIC;
+
+  SDValue TargetAddr =
+      DAG.getTargetGlobalAddress(Global, Loc, AddrVT, 0, ARMII::MO_NONLAZY);
+  SDValue Wrapped = DAG.getNode(WrapperOpc, Loc, AddrVT, TargetAddr);
+
+  // Direct symbols need nothing more.
+  if (!Subtarget->isGVIndirectSymbol(Global))
+    return Wrapped;
+
+  // Indirect symbols: the wrapped value is the address of a pointer slot.
+  return DAG.getLoad(AddrVT, Loc, DAG.getEntryNode(), Wrapped,
+                     MachinePointerInfo::getGOT(MF));
+}
+
+/// Lower a GlobalAddress node for Windows targets. Asserts that the target is
+/// Windows, that movw/movt is in use, and that ROPI/RWPI are off. The address
+/// is a Wrapper around the target global address; dllimport globals carry
+/// MO_DLLIMPORT and are additionally loaded through that address.
+SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
+  assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
+         "Windows on ARM expects to use movw/movt");
+  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
+         "ROPI/RWPI not currently supported for Windows");
+
+  const GlobalValue *Global = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const bool IsDLLImport = Global->hasDLLImportStorageClass();
+  const ARMII::TOF Flags =
+      IsDLLImport ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG;
+  EVT AddrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc Loc(Op);
+
+  ++NumMovwMovt;
+
+  // FIXME: Once remat is capable of dealing with instructions with register
+  // operands, expand this into two nodes.
+  SDValue TargetAddr =
+      DAG.getTargetGlobalAddress(Global, Loc, AddrVT, /*Offset=*/0, Flags);
+  SDValue Wrapped = DAG.getNode(ARMISD::Wrapper, Loc, AddrVT, TargetAddr);
+  if (!IsDLLImport)
+    return Wrapped;
+
+  // dllimport: the wrapped value addresses a pointer slot; load through it.
+  return DAG.getLoad(AddrVT, Loc, DAG.getEntryNode(), Wrapped,
+                     MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+}
+
+/// Rewrite the generic SjLj setjmp node as ARMISD::EH_SJLJ_SETJMP, forwarding
+/// operands 0 and 1 and appending an i32 zero constant. The new node produces
+/// an i32 value and a chain (MVT::Other).
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
+  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
+  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, Loc, ResultTys, Op.getOperand(0),
+                     Op.getOperand(1), Zero);
+}
+
+/// Rewrite the generic SjLj longjmp node as ARMISD::EH_SJLJ_LONGJMP,
+/// forwarding operands 0 and 1 and appending an i32 zero constant. The new
+/// node only produces a chain (MVT::Other).
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  SDValue First = Op.getOperand(0);
+  SDValue Second = Op.getOperand(1);
+  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, Loc, MVT::Other, First, Second,
+                     DAG.getConstant(0, Loc, MVT::i32));
+}
+
+/// Rewrite the generic SjLj setup-dispatch node as
+/// ARMISD::EH_SJLJ_SETUP_DISPATCH, forwarding only operand 0. The new node
+/// only produces a chain (MVT::Other).
+SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+                                                       SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  SDValue First = Op.getOperand(0);
+  return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, Loc, MVT::Other, First);
+}
+
+/// Custom-lower a handful of chainless intrinsics; operand 0 holds the
+/// intrinsic ID. Returns an empty SDValue for every intrinsic not listed
+/// here, and for the unsigned NEON min/max intrinsics when the result type is
+/// floating point.
+SDValue
+ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+                                          const ARMSubtarget *Subtarget) const {
+  const unsigned IntrinsicID =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+
+  // Build a two-operand node of the given opcode from intrinsic operands 1
+  // and 2, keeping the intrinsic's own result type.
+  auto EmitBinary = [&](unsigned Opcode) {
+    return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
+                       Op.getOperand(2));
+  };
+
+  switch (IntrinsicID) {
+  default:
+    // Don't custom lower most intrinsics.
+    return SDValue();
+
+  case Intrinsic::arm_rbit:
+    assert(Op.getOperand(1).getValueType() == MVT::i32 &&
+           "RBIT intrinsic must have i32 type!");
+    return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
+
+  case Intrinsic::thread_pointer:
+    return DAG.getNode(ARMISD::THREAD_POINTER, dl,
+                       getPointerTy(DAG.getDataLayout()));
+
+  case Intrinsic::eh_sjlj_lsda: {
+    // Load the LSDA address from a constant-pool entry; in PIC mode the
+    // loaded value is additionally passed through PIC_ADD.
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    const unsigned LabelId = AFI->createPICLabelUId();
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    const bool IsPIC = isPositionIndependent();
+    unsigned PCAdj = 0;
+    if (IsPIC)
+      PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
+        MF.getFunction(), LabelId, ARMCP::CPLSDA, PCAdj);
+    SDValue CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32,
+                                 DAG.getTargetConstantPool(CPV, PtrVT, 4));
+    SDValue Loaded = DAG.getLoad(
+        PtrVT, dl, DAG.getEntryNode(), CPAddr,
+        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    if (!IsPIC)
+      return Loaded;
+    SDValue PICLabel = DAG.getConstant(LabelId, dl, MVT::i32);
+    return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Loaded, PICLabel);
+  }
+
+  case Intrinsic::arm_neon_vmulls:
+    return EmitBinary(ARMISD::VMULLs);
+  case Intrinsic::arm_neon_vmullu:
+    return EmitBinary(ARMISD::VMULLu);
+
+  case Intrinsic::arm_neon_vminnm:
+    return EmitBinary(ISD::FMINNUM);
+  case Intrinsic::arm_neon_vmaxnm:
+    return EmitBinary(ISD::FMAXNUM);
+
+  case Intrinsic::arm_neon_vminu:
+  case Intrinsic::arm_neon_vmaxu:
+    // Floating-point results are left to the default handling.
+    if (Op.getValueType().isFloatingPoint())
+      return SDValue();
+    return EmitBinary(IntrinsicID == Intrinsic::arm_neon_vminu ? ISD::UMIN
+                                                               : ISD::UMAX);
+
+  case Intrinsic::arm_neon_vmins:
+  case Intrinsic::arm_neon_vmaxs: {
+    // v{min,max}s is overloaded between signed integers and floats.
+    const bool IsMin = IntrinsicID == Intrinsic::arm_neon_vmins;
+    if (Op.getValueType().isFloatingPoint())
+      return EmitBinary(IsMin ? ISD::FMINNAN : ISD::FMAXNAN);
+    return EmitBinary(IsMin ? ISD::SMIN : ISD::SMAX);
+  }
+  }
+}
+
+/// Lower ISD::ATOMIC_FENCE. Without a data-barrier instruction the fence
+/// becomes ARMISD::MEMBARRIER_MCR; otherwise it becomes a call to the arm_dmb
+/// intrinsic whose barrier domain is SY on M-class, ISHST for Release fences
+/// on subtargets that prefer ISHST barriers, and ISH in all other cases.
+static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
+                                 const ARMSubtarget *Subtarget) {
+  // FIXME: handle "fence singlethread" more efficiently.
+  SDLoc Loc(Op);
+  SDValue InChain = Op.getOperand(0);
+
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
+           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
+    return DAG.getNode(ARMISD::MEMBARRIER_MCR, Loc, MVT::Other, InChain,
+                       DAG.getConstant(0, Loc, MVT::i32));
+  }
+
+  // Operand 1 carries the atomic ordering as a constant.
+  const AtomicOrdering Ordering = static_cast<AtomicOrdering>(
+      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+
+  ARM_MB::MemBOpt Domain;
+  if (Subtarget->isMClass())
+    // Only a full system barrier exists in the M-class architectures.
+    Domain = ARM_MB::SY;
+  else if (Subtarget->preferISHSTBarriers() &&
+           Ordering == AtomicOrdering::Release)
+    // Swift happens to implement ISHST barriers in a way that's compatible with
+    // Release semantics but weaker than ISH so we'd be fools not to use
+    // it. Beware: other processors probably don't!
+    Domain = ARM_MB::ISHST;
+  else
+    Domain = ARM_MB::ISH;
+
+  return DAG.getNode(ISD::INTRINSIC_VOID, Loc, MVT::Other, InChain,
+                     DAG.getConstant(Intrinsic::arm_dmb, Loc, MVT::i32),
+                     DAG.getConstant(Domain, Loc, MVT::i32));
+}
+
+/// Lower a prefetch node to ARMISD::PRELOAD. When the subtarget has no
+/// usable preload instruction the node is dropped by returning its operand 0
+/// unchanged. Operand 2 supplies the read/write bit and operand 4 the data
+/// bit; both flags are inverted for Thumb.
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
+                             const ARMSubtarget *Subtarget) {
+  // ARM pre v5TE and Thumb1 does not have preload instructions.
+  const bool HasPreload =
+      Subtarget->isThumb2() ||
+      (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps());
+  if (!HasPreload)
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  SDLoc Loc(Op);
+  // ReadFlag is 1 when bit 0 of operand 2 is clear, 0 otherwise.
+  unsigned ReadFlag =
+      (cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1) ? 0 : 1;
+  // ARMv7 with MP extension has PLDW.
+  if (!ReadFlag && !(Subtarget->hasV7Ops() && Subtarget->hasMPExtension()))
+    return Op.getOperand(0);
+
+  unsigned DataFlag = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (Subtarget->isThumb()) {
+    // Invert the bits.
+    ReadFlag = ~ReadFlag & 1;
+    DataFlag = ~DataFlag & 1;
+  }
+
+  SDValue ReadOp = DAG.getConstant(ReadFlag, Loc, MVT::i32);
+  SDValue DataOp = DAG.getConstant(DataFlag, Loc, MVT::i32);
+  return DAG.getNode(ARMISD::PRELOAD, Loc, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), ReadOp, DataOp);
+}
+
+/// Lower va_start: store the address of the function's VarArgsFrameIndex slot
+/// to the pointer in operand 1, chained on operand 0. Operand 2 is the source
+/// value used for the store's pointer info.
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const int VarArgsFI = MF.getInfo<ARMFunctionInfo>()->getVarArgsFrameIndex();
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  SDLoc Loc(Op);
+  EVT AddrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+  SDValue SlotAddr = DAG.getFrameIndex(VarArgsFI, AddrVT);
+  const Value *DestValue = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), Loc, SlotAddr, Op.getOperand(1),
+                      MachinePointerInfo(DestValue));
+}
+
+/// Rebuild an f64 formal argument from two i32 halves. The first half always
+/// arrives in the register described by VA; the second half is described by
+/// NextVA and is either loaded from a fixed 4-byte stack object or copied from
+/// a register. The halves are swapped on big-endian subtargets before being
+/// combined with ARMISD::VMOVDRR.
+SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
+                                                CCValAssign &NextVA,
+                                                SDValue &Root,
+                                                SelectionDAG &DAG,
+                                                const SDLoc &dl) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Thumb1-only functions use the tGPR register class.
+  const TargetRegisterClass *RC =
+      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+
+  // Transform the arguments stored in physical registers into virtual ones.
+  unsigned FirstVReg = MF.addLiveIn(VA.getLocReg(), RC);
+  SDValue FirstHalf = DAG.getCopyFromReg(Root, dl, FirstVReg, MVT::i32);
+
+  SDValue SecondHalf;
+  if (!NextVA.isMemLoc()) {
+    unsigned SecondVReg = MF.addLiveIn(NextVA.getLocReg(), RC);
+    SecondHalf = DAG.getCopyFromReg(Root, dl, SecondVReg, MVT::i32);
+  } else {
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
+
+    // Create load node to retrieve arguments from the stack.
+    SDValue SlotAddr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+    SecondHalf = DAG.getLoad(
+        MVT::i32, dl, Root, SlotAddr,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+  }
+
+  if (!Subtarget->isLittle())
+    std::swap(FirstHalf, SecondHalf);
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, FirstHalf, SecondHalf);
+}
+
+// The remaining GPRs hold either the beginning of variable-argument
+// data, or the beginning of an aggregate passed by value (usually
+// byval). Either way, we allocate stack slots adjacent to the data
+// provided by our caller, and store the unallocated registers there.
+// If this is a variadic function, the va_list pointer will begin with
+// these values; otherwise, this reassembles a (byval) structure that
+// was split between registers and memory.
+//
+// The register range is [RBegin, REnd). It comes from the in-regs parameter
+// record InRegsParamRecordIdx when such a record exists; otherwise it runs
+// from the first unallocated GPR argument register up to (not including) R4.
+// When the range is non-empty, ArgOffset is replaced by the negative offset
+// -4 * (R4 - RBegin). Chain is updated to a TokenFactor of the emitted
+// stores, if any.
+// Return: The frame index registers were stored into.
+int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
+                                      const SDLoc &dl, SDValue &Chain,
+                                      const Value *OrigArg,
+                                      unsigned InRegsParamRecordIdx,
+                                      int ArgOffset, unsigned ArgSize) const {
+  // Currently, two use-cases possible:
+  // Case #1. Non-var-args function, and we meet first byval parameter.
+  //          Setup first unallocated register as first byval register;
+  //          eat all remained registers
+  //          (these two actions are performed by HandleByVal method).
+  //          Then, here, we initialize stack frame with
+  //          "store-reg" instructions.
+  // Case #2. Var-args function, that doesn't contain byval parameters.
+  //          The same: eat all remained unallocated registers,
+  //          initialize stack frame.
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned RBegin, REnd;
+  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
+    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
+  } else {
+    // An index equal to the table length means every GPR argument register
+    // is already allocated; use R4 so that the range below is empty. The
+    // table length is used instead of a literal so this test matches the
+    // equivalent one in LowerFormalArguments.
+    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
+    RBegin = RBeginIdx == array_lengthof(GPRArgRegs)
+                 ? static_cast<unsigned>(ARM::R4)
+                 : GPRArgRegs[RBeginIdx];
+    REnd = ARM::R4;
+  }
+
+  // Registers to spill: place the object so that it ends where ArgOffset 0
+  // begins.
+  if (REnd != RBegin)
+    ArgOffset = -4 * (ARM::R4 - RBegin);
+
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
+  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
+
+  SmallVector<SDValue, 4> MemOps;
+  const TargetRegisterClass *RC =
+      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+
+  // Store each register of the range to consecutive 4-byte slots.
+  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
+    unsigned VReg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                 MachinePointerInfo(OrigArg, 4 * i));
+    MemOps.push_back(Store);
+    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
+  }
+
+  // Join all stores so later users of Chain are ordered after them.
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+  return FrameIndex;
+}
+
+// Set up the stack area that the va_list pointer will start from, and record
+// its frame index in the function info. ArgOffset, TotalArgRegsSaveSize and
+// ForceMutable are not read by this implementation.
+void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                                             const SDLoc &dl, SDValue &Chain,
+                                             unsigned ArgOffset,
+                                             unsigned TotalArgRegsSaveSize,
+                                             bool ForceMutable) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
+
+  // Try to store any remaining integer argument regs
+  // to their spots on the stack so that they may be loaded by dereferencing
+  // the result of va_next.
+  // If there is no regs to be stored, just point address after last
+  // argument passed via stack.
+  const int VarArgsFI = StoreByValRegs(
+      CCInfo, DAG, dl, Chain, /*OrigArg=*/nullptr,
+      /*InRegsParamRecordIdx=*/CCInfo.getInRegsParamsCount(),
+      /*ArgOffset=*/CCInfo.getNextStackOffset(), /*ArgSize=*/4);
+  FuncInfo->setVarArgsFrameIndex(VarArgsFI);
+}
+
+SDValue ARMTargetLowering::LowerFormalArguments( // Lower the incoming formal arguments of the current function.
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // Chain is threaded through copies/loads and returned.
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, // Ins describes each incoming argument piece.
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { // InVals receives the lowered argument values.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext(), Prologue);
+ CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
+
+ SmallVector<SDValue, 16> ArgValues;
+ SDValue ArgValue;
+ Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); // advanced in step with Ins[] original-argument indices
+ unsigned CurArgIdx = 0;
+
+ // Initially ArgRegsSaveSize is zero.
+ // Then we increase this value each time we meet byval parameter.
+ // We also increase this value in case of varargs function.
+ AFI->setArgRegsSaveSize(0);
+
+ // Calculate the amount of stack space that we need to allocate to store
+ // byval and variadic arguments that are passed in registers.
+ // We need to know this before we allocate the first byval or variadic
+ // argument, as they will be allocated a stack slot below the CFA (Canonical
+ // Frame Address, the stack pointer at entry to the function).
+ unsigned ArgRegBegin = ARM::R4; // lowered below to the lowest register used by a byval or variadic save
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
+ break; // every in-regs byval record has been visited
+
+ CCValAssign &VA = ArgLocs[i];
+ unsigned Index = VA.getValNo();
+ ISD::ArgFlagsTy Flags = Ins[Index].Flags;
+ if (!Flags.isByVal())
+ continue;
+
+ assert(VA.isMemLoc() && "unexpected byval pointer in reg");
+ unsigned RBegin, REnd;
+ CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
+ ArgRegBegin = std::min(ArgRegBegin, RBegin);
+
+ CCInfo.nextInRegsParam();
+ }
+ CCInfo.rewindByValRegsInfo(); // restart the in-regs record cursor for the main loop below
+
+ int lastInsIndex = -1; // Ins[] index of the most recently handled memory argument
+ if (isVarArg && MFI.hasVAStart()) {
+ unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
+ if (RegIdx != array_lengthof(GPRArgRegs)) // some GPR argument register is still unallocated
+ ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
+ }
+
+ unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); // 4 bytes per register from ArgRegBegin up to R4
+ AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (Ins[VA.getValNo()].isOrigArg()) {
+ std::advance(CurOrigArg,
+ Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
+ CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
+ }
+ // Arguments stored in registers.
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+
+ if (VA.needsCustom()) {
+ // f64 and vector types are split up into multiple registers or
+ // combinations of registers and stack slots.
+ if (VA.getLocVT() == MVT::v2f64) {
+ SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
+ Chain, DAG, dl);
+ VA = ArgLocs[++i]; // skip ahead to next loc. NOTE(review): VA is a reference, so this copy-assigns into the earlier ArgLocs slot rather than re-seating VA.
+ SDValue ArgValue2;
+ if (VA.isMemLoc()) {
+ int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FI));
+ } else {
+ ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
+ Chain, DAG, dl);
+ }
+ ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); // start from undef, then insert both f64 elements
+ ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+ ArgValue, ArgValue1,
+ DAG.getIntPtrConstant(0, dl));
+ ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+ ArgValue, ArgValue2,
+ DAG.getIntPtrConstant(1, dl));
+ } else
+ ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); // custom non-v2f64 case: one f64 from two locations
+
+ } else {
+ const TargetRegisterClass *RC; // register class chosen from the location type
+
+ if (RegVT == MVT::f32)
+ RC = &ARM::SPRRegClass;
+ else if (RegVT == MVT::f64)
+ RC = &ARM::DPRRegClass;
+ else if (RegVT == MVT::v2f64)
+ RC = &ARM::QPRRegClass;
+ else if (RegVT == MVT::i32)
+ RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
+ : &ARM::GPRRegClass;
+ else
+ llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+ // Transform the arguments in physical registers into virtual ones.
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+ }
+
+ // If this is an 8 or 16-bit value, it is really passed promoted
+ // to 32 bits. Insert an assert[sz]ext to capture this, then
+ // truncate to the right size.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::BCvt:
+ ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+ break;
+ case CCValAssign::SExt:
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ break;
+ case CCValAssign::ZExt:
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ break;
+ }
+
+ InVals.push_back(ArgValue);
+
+ } else { // !VA.isRegLoc(): the argument lives in memory
+
+ // sanity check
+ assert(VA.isMemLoc());
+ assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
+
+ int index = VA.getValNo();
+
+ // Some Ins[] entries become multiple ArgLoc[] entries.
+ // Process them only once.
+ if (index != lastInsIndex)
+ {
+ ISD::ArgFlagsTy Flags = Ins[index].Flags;
+ // FIXME: For now, all byval parameter objects are marked mutable.
+ // This can be changed with more analysis.
+ // In case of tail call optimization mark all arguments mutable.
+ // Since they could be overwritten by lowering of arguments in case of
+ // a tail call.
+ if (Flags.isByVal()) {
+ assert(Ins[index].isOrigArg() &&
+ "Byval arguments cannot be implicit");
+ unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
+
+ int FrameIndex = StoreByValRegs(
+ CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
+ VA.getLocMemOffset(), Flags.getByValSize());
+ InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); // a byval argument is represented by the frame index of its object
+ CCInfo.nextInRegsParam();
+ } else {
+ unsigned FIOffset = VA.getLocMemOffset();
+ int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
+ FIOffset, true);
+
+ // Create load nodes to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FI)));
+ }
+ lastInsIndex = index;
+ }
+ }
+ }
+
+ // varargs: only set up the register save area when the function uses va_start
+ if (isVarArg && MFI.hasVAStart())
+ VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
+ CCInfo.getNextStackOffset(),
+ TotalArgRegsSaveSize);
+
+ AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
+
+ return Chain;
+}
+
+/// Return true if Op is known to be +0.0. Three forms are recognised: an FP
+/// constant node, a load whose address is an ARMISD::Wrapper around a
+/// constant-pool entry holding an FP constant, and an f64 BITCAST of an
+/// ARMISD::VMOVIMM whose operand is a null constant.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *FPNode = dyn_cast<ConstantFPSDNode>(Op))
+    return FPNode->getValueAPF().isPosZero();
+
+  if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    SDValue Address = Op.getOperand(1);
+    if (Address.getOpcode() != ARMISD::Wrapper)
+      return false;
+    ConstantPoolSDNode *PoolEntry =
+        dyn_cast<ConstantPoolSDNode>(Address.getOperand(0));
+    if (!PoolEntry)
+      return false;
+    const ConstantFP *PoolFP = dyn_cast<ConstantFP>(PoolEntry->getConstVal());
+    return PoolFP && PoolFP->getValueAPF().isPosZero();
+  }
+
+  if (Op->getOpcode() != ISD::BITCAST || Op->getValueType(0) != MVT::f64)
+    return false;
+
+  // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
+  // created by LowerConstantFP().
+  SDValue Source = Op->getOperand(0);
+  return Source->getOpcode() == ARMISD::VMOVIMM &&
+         isNullConstant(Source->getOperand(0));
+}
+
+/// Build the ARM integer compare for (LHS CC RHS) and return it as a glue
+/// value; the matching ARM condition code is written to ARMcc. When RHS is a
+/// constant that is not a legal compare immediate, the comparison is
+/// rewritten against the neighbouring constant (C-1 or C+1, with the
+/// condition adjusted to match) if that neighbour is legal and the step
+/// cannot wrap. EQ/NE use CMPZ; every other condition uses CMP.
+SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                     SDValue &ARMcc, SelectionDAG &DAG,
+                                     const SDLoc &dl) const {
+  if (ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    const unsigned Imm = RHSConst->getZExtValue();
+
+    // Switch to (NewCC, NewImm) only if NewImm is a legal compare immediate.
+    auto RetargetTo = [&](unsigned NewImm, ISD::CondCode NewCC) {
+      if (!isLegalICmpImmediate(NewImm))
+        return;
+      CC = NewCC;
+      RHS = DAG.getConstant(NewImm, dl, MVT::i32);
+    };
+
+    if (!isLegalICmpImmediate(Imm)) {
+      // Constant does not fit, try adjusting it by one?
+      switch (CC) {
+      default:
+        break;
+      case ISD::SETLT:
+        if (Imm != 0x80000000)
+          RetargetTo(Imm - 1, ISD::SETLE);
+        break;
+      case ISD::SETGE:
+        if (Imm != 0x80000000)
+          RetargetTo(Imm - 1, ISD::SETGT);
+        break;
+      case ISD::SETULT:
+        if (Imm != 0)
+          RetargetTo(Imm - 1, ISD::SETULE);
+        break;
+      case ISD::SETUGE:
+        if (Imm != 0)
+          RetargetTo(Imm - 1, ISD::SETUGT);
+        break;
+      case ISD::SETLE:
+        if (Imm != 0x7fffffff)
+          RetargetTo(Imm + 1, ISD::SETLT);
+        break;
+      case ISD::SETGT:
+        if (Imm != 0x7fffffff)
+          RetargetTo(Imm + 1, ISD::SETGE);
+        break;
+      case ISD::SETULE:
+        if (Imm != 0xffffffff)
+          RetargetTo(Imm + 1, ISD::SETULT);
+        break;
+      case ISD::SETUGT:
+        if (Imm != 0xffffffff)
+          RetargetTo(Imm + 1, ISD::SETUGE);
+        break;
+      }
+    }
+  }
+
+  const ARMCC::CondCodes ARMCond = IntCCToARMCC(CC);
+  // EQ and NE use only the Z flag.
+  const bool OnlyNeedsZ = ARMCond == ARMCC::EQ || ARMCond == ARMCC::NE;
+  const ARMISD::NodeType CompareOpc = OnlyNeedsZ ? ARMISD::CMPZ : ARMISD::CMP;
+
+  ARMcc = DAG.getConstant(ARMCond, dl, MVT::i32);
+  return DAG.getNode(CompareOpc, dl, MVT::Glue, LHS, RHS);
+}
+
+/// Build the VFP compare for LHS and RHS and return the glue result of the
+/// FMSTAT node wrapped around it. A +0.0 right-hand side uses the one-operand
+/// CMPFPw0 form; anything else uses CMPFP.
+SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
+                                     SelectionDAG &DAG, const SDLoc &dl) const {
+  assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
+  SDValue FPCompare =
+      isFloatingPointZero(RHS)
+          ? DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS)
+          : DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
+  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, FPCompare);
+}
+
+/// Glue values can have only one use, so build a fresh copy of a comparison.
+/// CMP and CMPZ are re-created directly. Otherwise Cmp must be an FMSTAT,
+/// whose CMPFP or CMPFPw0 operand is re-created and wrapped in a new FMSTAT.
+SDValue
+ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
+  SDLoc DL(Cmp);
+  const unsigned OuterOpc = Cmp.getOpcode();
+  if (OuterOpc == ARMISD::CMP || OuterOpc == ARMISD::CMPZ)
+    return DAG.getNode(OuterOpc, DL, MVT::Glue, Cmp.getOperand(0),
+                       Cmp.getOperand(1));
+
+  assert(OuterOpc == ARMISD::FMSTAT && "unexpected comparison operation");
+  SDValue InnerCmp = Cmp.getOperand(0);
+  const unsigned InnerOpc = InnerCmp.getOpcode();
+
+  SDValue InnerCopy;
+  if (InnerOpc == ARMISD::CMPFP) {
+    InnerCopy = DAG.getNode(InnerOpc, DL, MVT::Glue, InnerCmp.getOperand(0),
+                            InnerCmp.getOperand(1));
+  } else {
+    assert(InnerOpc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
+    InnerCopy = DAG.getNode(InnerOpc, DL, MVT::Glue, InnerCmp.getOperand(0));
+  }
+  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, InnerCopy);
+}
+
+/// Expand an i32 overflow-reporting add/sub (SADDO, UADDO, SSUBO, USUBO) into
+/// the arithmetic result and a CMP glue value, returned as
+/// (value, compare). ARMcc receives VC for the signed forms and HS for the
+/// unsigned forms. Additions compare the result against LHS; subtractions
+/// compare LHS against RHS.
+std::pair<SDValue, SDValue>
+ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
+                                 SDValue &ARMcc) const {
+  assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDLoc dl(Op);
+  const unsigned Opcode = Op.getOpcode();
+
+  // FIXME: We are currently always generating CMPs because we don't support
+  // generating CMN through the backend. This is not as good as the natural
+  // CMP case because it causes a register dependency and cannot be folded
+  // later.
+
+  const bool IsAdd = Opcode == ISD::SADDO || Opcode == ISD::UADDO;
+  const bool IsSub = Opcode == ISD::SSUBO || Opcode == ISD::USUBO;
+  if (!IsAdd && !IsSub)
+    llvm_unreachable("Unknown overflow instruction!");
+  const bool IsSigned = Opcode == ISD::SADDO || Opcode == ISD::SSUBO;
+
+  ARMcc = DAG.getConstant(IsSigned ? ARMCC::VC : ARMCC::HS, dl, MVT::i32);
+  SDValue Result = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl,
+                               Op.getValueType(), LHS, RHS);
+  SDValue Flags = IsAdd
+                      ? DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Result, LHS)
+                      : DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
+
+  return std::make_pair(Result, Flags);
+}
+
+
+/// Lower an overflow-reporting arithmetic node into MERGE_VALUES of the
+/// arithmetic result and an i32 overflow flag produced by an ARMISD::CMOV
+/// over the constants 1 and 0. Returns an empty SDValue when the value type
+/// is not legal yet.
+SDValue
+ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+    return SDValue();
+
+  SDValue Result, Flags, ARMcc;
+  std::tie(Result, Flags) = getARMXALUOOp(Op, DAG, ARMcc);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDLoc dl(Op);
+  // We use 0 and 1 as false and true values.
+  SDValue One = DAG.getConstant(1, dl, MVT::i32);
+  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+
+  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, Op.getValueType(), One,
+                                 Zero, ARMcc, CCR, Flags);
+
+  return DAG.getNode(ISD::MERGE_VALUES, dl,
+                     DAG.getVTList(Op.getValueType(), MVT::i32), Result,
+                     Overflow);
+}
+
+
+/// Lower a select node. Two special forms are folded straight into a CMOV:
+/// a condition that is the overflow result (result number 1) of
+/// SADDO/UADDO/SSUBO/USUBO, and a single-use CMOV choosing between the
+/// constants 1/0 or 0/1. Any other condition is masked to its low bit and
+/// turned into a select_cc against zero.
+SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);
+  SDValue SelectTrue = Op.getOperand(1);
+  SDValue SelectFalse = Op.getOperand(2);
+  SDLoc dl(Op);
+  const unsigned CondOpc = Cond.getOpcode();
+
+  const bool IsOverflowOp = CondOpc == ISD::SADDO || CondOpc == ISD::UADDO ||
+                            CondOpc == ISD::SSUBO || CondOpc == ISD::USUBO;
+  if (Cond.getResNo() == 1 && IsOverflowOp) {
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+      return SDValue();
+
+    SDValue ArithResult, Flags, ARMcc;
+    std::tie(ArithResult, Flags) = getARMXALUOOp(Cond, DAG, ARMcc);
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    return getCMOV(dl, Op.getValueType(), SelectTrue, SelectFalse, ARMcc, CCR,
+                   Flags, DAG);
+  }
+
+  // Convert:
+  //
+  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
+  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
+  //
+  if (CondOpc == ARMISD::CMOV && Cond.hasOneUse()) {
+    const ConstantSDNode *FirstConst =
+        dyn_cast<ConstantSDNode>(Cond.getOperand(0));
+    const ConstantSDNode *SecondConst =
+        dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+
+    if (FirstConst && SecondConst) {
+      const unsigned FirstVal = FirstConst->getZExtValue();
+      const unsigned SecondVal = SecondConst->getZExtValue();
+      const bool SameSense = FirstVal == 1 && SecondVal == 0;
+      const bool Inverted = FirstVal == 0 && SecondVal == 1;
+
+      if (SameSense || Inverted) {
+        SDValue True = SameSense ? SelectTrue : SelectFalse;
+        SDValue False = SameSense ? SelectFalse : SelectTrue;
+
+        if (True.getNode() && False.getNode()) {
+          EVT VT = Op.getValueType();
+          SDValue ARMcc = Cond.getOperand(2);
+          SDValue CCR = Cond.getOperand(3);
+          // The glue compare cannot be shared, so make a fresh copy.
+          SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
+          assert(True.getValueType() == VT);
+          return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
+        }
+      }
+    }
+  }
+
+  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
+  // undefined bits before doing a full-word comparison with zero.
+  const EVT CondVT = Cond.getValueType();
+  SDValue LowBit =
+      DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
+
+  return DAG.getSelectCC(dl, LowBit, DAG.getConstant(0, dl, CondVT),
+                         SelectTrue, SelectFalse, ISD::SETNE);
+}
+
+/// Map a comparison condition onto what VSEL can express. For the conditions
+/// handled here, CondCode is set to GE, GT, VS or EQ, and swpCmpOps /
+/// swpVselOps are updated to say whether the compare operands and the VSEL
+/// operands have to be exchanged. Conditions not listed leave all three
+/// outputs untouched.
+///
+/// Reasoning behind each case:
+///  - GE is used for predicates that are true on equality, GT for those that
+///    are false on equality.
+///  - Only GE/GT are available, so a 'less' predicate swaps the compare
+///    operands.
+///  - GE/GT are false for unordered inputs, so an unordered predicate is
+///    implemented as the negation of its ordered opposite: the VSEL operands
+///    are toggled, the compare-operand swap is toggled, and GE/GT trade
+///    places.
+///  - 'ordered' is 'anything but unordered' (VS, VSEL operands swapped) and
+///    'unordered or not equal' is 'anything but equal' (EQ, VSEL operands
+///    swapped).
+static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                                 bool &swpCmpOps, bool &swpVselOps) {
+  switch (CC) {
+  default:
+    break;
+
+  // Ordered 'greater' predicates map directly.
+  case ISD::SETOGE:
+    CondCode = ARMCC::GE;
+    break;
+  case ISD::SETOGT:
+    CondCode = ARMCC::GT;
+    break;
+
+  // Ordered 'less' predicates: same code with the compare operands swapped.
+  case ISD::SETOLE:
+    CondCode = ARMCC::GE;
+    swpCmpOps = true;
+    break;
+  case ISD::SETOLT:
+    CondCode = ARMCC::GT;
+    swpCmpOps = true;
+    break;
+
+  // Unordered 'greater' predicates: opposite code, both swap flags toggled.
+  case ISD::SETUGE:
+    CondCode = ARMCC::GT;
+    swpCmpOps = !swpCmpOps;
+    swpVselOps = !swpVselOps;
+    break;
+  case ISD::SETUGT:
+    CondCode = ARMCC::GE;
+    swpCmpOps = !swpCmpOps;
+    swpVselOps = !swpVselOps;
+    break;
+
+  // Unordered 'less' predicates: the 'less' swap and the 'unordered' toggle
+  // cancel, so the compare operands end up unswapped.
+  case ISD::SETULE:
+    CondCode = ARMCC::GT;
+    swpCmpOps = false;
+    swpVselOps = !swpVselOps;
+    break;
+  case ISD::SETULT:
+    CondCode = ARMCC::GE;
+    swpCmpOps = false;
+    swpVselOps = !swpVselOps;
+    break;
+
+  case ISD::SETO:
+    CondCode = ARMCC::VS;
+    swpVselOps = true;
+    break;
+  case ISD::SETUNE:
+    CondCode = ARMCC::EQ;
+    swpVselOps = true;
+    break;
+  }
+}
+
+/// Build an ARMISD::CMOV choosing between \p FalseVal and \p TrueVal under the
+/// condition \p ARMcc, using the flags produced by \p Cmp.
+///
+/// When the subtarget reports isFPOnlySP() and \p VT is f64, both inputs are
+/// split into two i32 halves with VMOVRRD, each half gets its own CMOV, and the
+/// halves are recombined with VMOVDRR. The second CMOV is given a duplicate of
+/// \p Cmp rather than sharing the original node.
+SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
+                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
+                                   SDValue Cmp, SelectionDAG &DAG) const {
+  // Common case: a single conditional move on the value as-is.
+  if (!Subtarget->isFPOnlySP() || VT != MVT::f64)
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
+                       Cmp);
+
+  // Split both candidates into (low, high) i32 pairs.
+  SDVTList HalvesVTs = DAG.getVTList(MVT::i32, MVT::i32);
+  SDValue FalseHalves = DAG.getNode(ARMISD::VMOVRRD, dl, HalvesVTs, FalseVal);
+  SDValue TrueHalves = DAG.getNode(ARMISD::VMOVRRD, dl,
+                                   DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
+
+  // Select the low halves with the original compare.
+  SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
+                            FalseHalves.getValue(0), TrueHalves.getValue(0),
+                            ARMcc, CCR, Cmp);
+
+  // Select the high halves with a fresh copy of the compare.
+  SDValue CmpCopy = duplicateCmp(Cmp, DAG);
+  SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
+                             FalseHalves.getValue(1), TrueHalves.getValue(1),
+                             ARMcc, CCR, CmpCopy);
+
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
+}
+
+/// Return true when \p CC is ISD::SETGT or ISD::SETGE.
+static bool isGTorGE(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETGT:
+  case ISD::SETGE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true when \p CC is ISD::SETLT or ISD::SETLE.
+static bool isLTorLE(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETLT:
+  case ISD::SETLE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Decide whether the conditional (LHS CC RHS ? TrueVal : FalseVal) clamps its
+// variable from below to the bound K. The accepted shapes, together with
+// their <= and >= variants, are:
+//   x < k ? k : x
+//   x > k ? x : k
+//   k < x ? x : k
+//   k > x ? k : x
+static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
+                            const SDValue TrueVal, const SDValue FalseVal,
+                            const ISD::CondCode CC, const SDValue K) {
+  // "greater" compares: the bound is picked either when it is on the left and
+  // the condition holds, or when it is on the right and the condition fails.
+  if (isGTorGE(CC))
+    return (K == LHS && K == TrueVal) || (K == RHS && K == FalseVal);
+
+  // "less" compares: mirror image of the case above.
+  if (isLTorLE(CC))
+    return (K == RHS && K == TrueVal) || (K == LHS && K == FalseVal);
+
+  return false;
+}
+
+// Counterpart of isLowerSaturate(): decide whether the conditional
+// (LHS CC RHS ? TrueVal : FalseVal) clamps its variable from above to K.
+static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
+                            const SDValue TrueVal, const SDValue FalseVal,
+                            const ISD::CondCode CC, const SDValue K) {
+  // "greater" compares: the bound is picked either when it is on the right
+  // and the condition holds, or when it is on the left and the condition fails.
+  if (isGTorGE(CC))
+    return (K == RHS && K == TrueVal) || (K == LHS && K == FalseVal);
+
+  // "less" compares: mirror image of the case above.
+  if (isLTorLE(CC))
+    return (K == LHS && K == TrueVal) || (K == RHS && K == FalseVal);
+
+  return false;
+}
+
+// Check if two chained conditionals could be converted into SSAT.
+//
+// SSAT can replace a set of two conditional selectors that bound a number to an
+// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
+//
+// x < -k ? -k : (x > k ? k : x)
+// x < -k ? -k : (x < k ? x : k)
+// x > -k ? (x > k ? k : x) : -k
+// x < k ? (x < -k ? -k : x) : k
+// etc.
+//
+// It returns true if the conversion can be done, false otherwise.
+// Additionally, the variable is returned in parameter V and the constant in K.
+//
+// Op is read as a five-operand node (LHS, RHS, true value, false value,
+// condition code); the caller visible in this file passes the SELECT_CC node
+// being lowered. V and K are written only on the path that returns true.
+static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
+ uint64_t &K) {
+
+ SDValue LHS1 = Op.getOperand(0);
+ SDValue RHS1 = Op.getOperand(1);
+ SDValue TrueVal1 = Op.getOperand(2);
+ SDValue FalseVal1 = Op.getOperand(3);
+ ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+ // The inner conditional is whichever select arm of Op is not a constant; it
+ // has to be a SELECT_CC itself.
+ const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
+ if (Op2.getOpcode() != ISD::SELECT_CC)
+ return false;
+
+ SDValue LHS2 = Op2.getOperand(0);
+ SDValue RHS2 = Op2.getOperand(1);
+ SDValue TrueVal2 = Op2.getOperand(2);
+ SDValue FalseVal2 = Op2.getOperand(3);
+ ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
+
+ // Find out which are the constants and which are the variables
+ // in each conditional
+ // K1/K2 point at the constant compare operand of each conditional, or are
+ // null when neither compare operand is a constant.
+ SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
+ ? &RHS1
+ : NULL;
+ SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
+ ? &RHS2
+ : NULL;
+ // K2Tmp is the constant select arm of the inner conditional (FalseVal2 is
+ // used as a fallback when TrueVal2 is not a constant); V2 is the other arm.
+ SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
+ // V1Tmp/V2Tmp are the non-constant compare operands of each conditional.
+ SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
+ SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
+ SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
+
+ // We must detect cases where the original operations worked with 16- or
+ // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
+ // must work with sign-extended values but the select operations return
+ // the original non-extended value.
+ SDValue V2TmpReg = V2Tmp;
+ if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
+ V2TmpReg = V2Tmp->getOperand(0);
+
+ // Check that the registers and the constants have the correct values
+ // in both conditionals
+ if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
+ V2TmpReg != V2)
+ return false;
+
+ // Figure out which conditional is saturating the lower/upper bound.
+ // The results are pointers to the locals Op/Op2 so that they can be compared
+ // by identity below.
+ const SDValue *LowerCheckOp =
+ isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
+ ? &Op
+ : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
+ : NULL;
+ const SDValue *UpperCheckOp =
+ isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
+ ? &Op
+ : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
+ : NULL;
+
+ // Both bounds must be checked, and by different conditionals.
+ if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
+ return false;
+
+ // Check that the constant in the lower-bound check is
+ // the opposite of the constant in the upper-bound check
+ // in 1's complement.
+ int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
+ int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
+ int64_t PosVal = std::max(Val1, Val2);
+
+ // The larger constant must belong to the upper-bound check, the two constants
+ // must be bitwise complements, and the larger one plus one must be a power
+ // of two.
+ if (((Val1 > Val2 && UpperCheckOp == &Op) ||
+ (Val1 < Val2 && UpperCheckOp == &Op2)) &&
+ Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) {
+
+ V = V2;
+ K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
+ return true;
+ }
+
+ return false;
+}
+
+/// Lower a SELECT_CC node (operands: LHS, RHS, true value, false value,
+/// condition code) to ARM-specific nodes.
+///
+/// In order, this tries: folding a pair of saturating selects into
+/// ARMISD::SSAT; an integer compare followed by a CMOV; and finally a VFP
+/// compare followed by one or two CMOVs.
+SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+
+ // Try to convert two saturating conditional selects into a single SSAT
+ SDValue SatValue;
+ uint64_t SatConstant;
+ if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
+ isSaturatingConditional(Op, SatValue, SatConstant))
+ return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
+ DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+
+ // With isFPOnlySP() an f64 compare is softened first; LHS, RHS and CC are
+ // updated in place by softenSetCCOperands.
+ if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
+ DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
+ dl);
+
+ // If softenSetCCOperands only returned one value, we should compare it to
+ // zero.
+ if (!RHS.getNode()) {
+ RHS = DAG.getConstant(0, dl, LHS.getValueType());
+ CC = ISD::SETNE;
+ }
+ }
+
+ if (LHS.getValueType() == MVT::i32) {
+ // Try to generate VSEL on ARMv8.
+ // The VSEL instruction can't use all the usual ARM condition
+ // codes: it only has two bits to select the condition code, so it's
+ // constrained to use only GE, GT, VS and EQ.
+ //
+ // To implement all the various ISD::SETXXX opcodes, we sometimes need to
+ // swap the operands of the previous compare instruction (effectively
+ // inverting the compare condition, swapping 'less' and 'greater') and
+ // sometimes need to swap the operands to the VSEL (which inverts the
+ // condition in the sense of firing whenever the previous condition didn't)
+ if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
+ ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+ // For the codes outside {GE, GT, VS, EQ}, use the inverse condition and
+ // swap the selected values instead.
+ if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
+ CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
+ CC = ISD::getSetCCInverse(CC, true);
+ std::swap(TrueVal, FalseVal);
+ }
+ }
+
+ SDValue ARMcc;
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+ return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
+ }
+
+ // Floating-point compare: FPCCToARMCC fills in two condition codes; the
+ // second one is ARMCC::AL when a single conditional move is enough (see the
+ // test further down).
+ ARMCC::CondCodes CondCode, CondCode2;
+ FPCCToARMCC(CC, CondCode, CondCode2);
+
+ // On ARMv8 FP, adjust the condition so it fits the VSEL constraints (see
+ // checkVSELConstraints), swapping compare and/or select operands as needed.
+ if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
+ bool swpCmpOps = false;
+ bool swpVselOps = false;
+ checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
+
+ if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
+ CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
+ if (swpCmpOps)
+ std::swap(LHS, RHS);
+ if (swpVselOps)
+ std::swap(TrueVal, FalseVal);
+ }
+ }
+
+ SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
+ // A second condition code means a second CMOV layered on the first result:
+ // it keeps Result unless CondCode2 holds, in which case it picks TrueVal.
+ if (CondCode2 != ARMCC::AL) {
+ SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
+ // FIXME: Needs another CMP because flag can have but one use.
+ SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+ Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
+ }
+ return Result;
+}
+
+/// canChangeToInt - Decide whether the fp compare operand \p Op may be turned
+/// into an operand of an integer compare sequence.
+///
+/// \p SeenZero is set to true when \p Op is a floating-point zero; it is never
+/// reset here. Returns true for such a zero or for a normal load, provided the
+/// single-use and type/subtarget checks below pass.
+static bool canChangeToInt(SDValue Op, bool &SeenZero,
+                           const ARMSubtarget *Subtarget) {
+  SDNode *Node = Op.getNode();
+
+  // More than one use would require moving the value from fp to integer
+  // registers; a node without values cannot be used at all.
+  if (!Node->hasOneUse() || !Node->getNumValues())
+    return false;
+
+  // f32 case is generally profitable. f64 case only makes sense when vcmpe +
+  // vmrs are very slow, e.g. cortex-a8.
+  const bool IsF32 = Op.getValueType() == MVT::f32;
+  if (!IsF32 && !Subtarget->isFPBrccSlow())
+    return false;
+
+  if (!isFloatingPointZero(Op))
+    return ISD::isNormalLoad(Node);
+
+  SeenZero = true;
+  return true;
+}
+
+/// Produce an i32 value for the f32 compare operand \p Op: the constant 0 for
+/// a floating-point zero, or an i32 load from the same address for a load.
+/// Any other kind of operand is treated as unreachable.
+static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
+  SDLoc Loc(Op);
+
+  if (isFloatingPointZero(Op))
+    return DAG.getConstant(0, Loc, MVT::i32);
+
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+  if (!Load)
+    llvm_unreachable("Unknown VFP cmp argument!");
+
+  // Re-issue the load as i32, reusing the original chain, address, pointer
+  // info, alignment and memory-operand flags.
+  return DAG.getLoad(MVT::i32, Loc, Load->getChain(), Load->getBasePtr(),
+                     Load->getPointerInfo(), Load->getAlignment(),
+                     Load->getMemOperand()->getFlags());
+}
+
+/// Produce two i32 values for the f64 compare operand \p Op.
+///
+/// For a floating-point zero both outputs are the constant 0. For a load,
+/// \p RetVal1 is an i32 load from the original address and \p RetVal2 an i32
+/// load from that address plus 4. Any other operand is treated as unreachable.
+static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
+                           SDValue &RetVal1, SDValue &RetVal2) {
+  SDLoc dl(Op);
+
+  if (isFloatingPointZero(Op)) {
+    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
+    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
+    return;
+  }
+
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+  if (!Load)
+    llvm_unreachable("Unknown VFP cmp argument!");
+
+  // First word: same address, alignment and flags as the original load.
+  SDValue Base = Load->getBasePtr();
+  RetVal1 = DAG.getLoad(MVT::i32, dl, Load->getChain(), Base,
+                        Load->getPointerInfo(), Load->getAlignment(),
+                        Load->getMemOperand()->getFlags());
+
+  // Second word: four bytes further on, with the alignment reduced to what
+  // both the original alignment and the 4-byte offset allow.
+  EVT BaseVT = Base.getValueType();
+  unsigned SecondAlign = MinAlign(Load->getAlignment(), 4);
+  SDValue SecondPtr =
+      DAG.getNode(ISD::ADD, dl, BaseVT, Base, DAG.getConstant(4, dl, BaseVT));
+  RetVal2 = DAG.getLoad(MVT::i32, dl, Load->getChain(), SecondPtr,
+                        Load->getPointerInfo().getWithOffset(4), SecondAlign,
+                        Load->getMemOperand()->getFlags());
+}
+
+/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
+/// f32 and even f64 comparisons to integer ones.
+///
+/// \p Op is read as (chain, condition code, LHS, RHS, destination). Returns the
+/// replacement branch node, or an empty SDValue when the operands do not
+/// qualify. The unsafe-fp-math option and the EQ/NE restriction are not checked
+/// in this function; the caller visible in this file (LowerBR_CC) checks them.
+SDValue
+ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ SDLoc dl(Op);
+
+ // Both operands must be convertible, and at least one of them must be a
+ // floating-point zero.
+ bool LHSSeenZero = false;
+ bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
+ bool RHSSeenZero = false;
+ bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
+ if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
+ // If unsafe fp math optimization is enabled and there are no other uses of
+ // the CMP operands, and the condition code is EQ or NE, we can optimize it
+ // to an integer comparison.
+ if (CC == ISD::SETOEQ)
+ CC = ISD::SETEQ;
+ else if (CC == ISD::SETUNE)
+ CC = ISD::SETNE;
+
+ // Clears the top bit of a 32-bit word before the integer compare.
+ SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
+ SDValue ARMcc;
+ if (LHS.getValueType() == MVT::f32) {
+ LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
+ bitcastf32Toi32(LHS, DAG), Mask);
+ RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
+ bitcastf32Toi32(RHS, DAG), Mask);
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+ Chain, Dest, ARMcc, CCR, Cmp);
+ }
+
+ // f64: split each operand into two i32 words; only the second word of each
+ // operand is masked.
+ SDValue LHS1, LHS2;
+ SDValue RHS1, RHS2;
+ expandf64Toi32(LHS, DAG, LHS1, LHS2);
+ expandf64Toi32(RHS, DAG, RHS1, RHS2);
+ LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
+ RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
+ ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+ ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
+ SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
+ return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
+ }
+
+ return SDValue();
+}
+
+/// Lower a BR_CC node, read as (chain, condition code, LHS, RHS, destination),
+/// to one or two ARMISD::BRCOND nodes (or whatever OptimizeVFPBrcond returns
+/// when that shortcut applies).
+SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ SDLoc dl(Op);
+
+ // With isFPOnlySP() an f64 compare is softened first; LHS, RHS and CC are
+ // updated in place by softenSetCCOperands.
+ if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
+ DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
+ dl);
+
+ // If softenSetCCOperands only returned one value, we should compare it to
+ // zero.
+ if (!RHS.getNode()) {
+ RHS = DAG.getConstant(0, dl, LHS.getValueType());
+ CC = ISD::SETNE;
+ }
+ }
+
+ // Integer compare: a single compare plus conditional branch.
+ if (LHS.getValueType() == MVT::i32) {
+ SDValue ARMcc;
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+ Chain, Dest, ARMcc, CCR, Cmp);
+ }
+
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+ // Under UnsafeFPMath, equality tests may be rewritten as integer compares;
+ // fall through to the regular lowering if that does not apply.
+ if (getTargetMachine().Options.UnsafeFPMath &&
+ (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
+ CC == ISD::SETNE || CC == ISD::SETUNE)) {
+ if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
+ return Result;
+ }
+
+ ARMCC::CondCodes CondCode, CondCode2;
+ FPCCToARMCC(CC, CondCode, CondCode2);
+
+ SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
+ SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
+ // A second condition code (anything other than AL) needs a second branch to
+ // the same destination. It is chained on the first branch and consumes the
+ // first branch's second result (declared as Glue in VTList) instead of a new
+ // compare.
+ if (CondCode2 != ARMCC::AL) {
+ ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
+ SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
+ Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
+ }
+ return Res;
+}
+
+/// Lower a BR_JT node, read as (chain, jump table, index).
+///
+/// The table entry address is computed as Index * 4 + Table. Thumb2 and
+/// Thumb-mode ARMv8-M Baseline targets emit ARMISD::BR2_JT on that address;
+/// all other targets load the entry and emit ARMISD::BR_JT, adding the table
+/// address to the loaded value first when the code is position independent or
+/// ROPI.
+SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue TableOp = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+
+  EVT PTy = getPointerTy(DAG.getDataLayout());
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(TableOp);
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
+  SDValue Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
+  SDValue ScaledIndex =
+      DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, ScaledIndex, Table);
+
+  // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump
+  // table which does another jump to the destination. This also makes it
+  // easier to translate it to TBB / TBH later (Thumb2 only).
+  // FIXME: This might not work if the function is extremely large.
+  const bool UseTwoLevelJump =
+      Subtarget->isThumb2() ||
+      (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb());
+  if (UseTwoLevelJump)
+    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, Addr, Index,
+                       JTI);
+
+  // Otherwise load the table entry. In the position-independent / ROPI case
+  // the entry is loaded as i32 and the table address is added back to it.
+  const bool EntryIsTableRelative =
+      isPositionIndependent() || Subtarget->isROPI();
+  SDValue Entry =
+      DAG.getLoad(EntryIsTableRelative ? EVT(MVT::i32) : PTy, dl, Chain, Addr,
+                  MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
+  Chain = Entry.getValue(1);
+  SDValue Target = EntryIsTableRelative
+                       ? DAG.getNode(ISD::ADD, dl, PTy, Entry, Table)
+                       : Entry;
+  return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Target, JTI);
+}
+
+/// Custom-lower a vector fp-to-int conversion.
+///
+/// i32-element results are returned unchanged when the source elements are f32
+/// and unrolled otherwise. For the remaining case the source must be v4f32: a
+/// v4i16 result is produced by converting to v4i32 and truncating, any other
+/// result type is unrolled.
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  EVT ResVT = Op.getValueType();
+  EVT SrcVT = Op.getOperand(0).getValueType();
+  SDLoc dl(Op);
+
+  if (ResVT.getVectorElementType() == MVT::i32)
+    return SrcVT.getVectorElementType() == MVT::f32
+               ? Op
+               : DAG.UnrollVectorOp(Op.getNode());
+
+  assert(SrcVT == MVT::v4f32 &&
+         "Invalid type for custom lowering!");
+  if (ResVT != MVT::v4i16)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  // Convert at 32-bit element width, then narrow to the requested type.
+  SDValue Wide =
+      DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
+  return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Wide);
+}
+
+/// Lower FP_TO_SINT / FP_TO_UINT.
+///
+/// Vector results are delegated to LowerVectorFP_TO_INT. A scalar conversion
+/// from f64 on a subtarget reporting isFPOnlySP() becomes a runtime library
+/// call; everything else is returned unchanged.
+SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVectorFP_TO_INT(Op, DAG);
+
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  if (!Subtarget->isFPOnlySP() || SrcVT != MVT::f64)
+    return Op;
+
+  // Pick the libcall matching the signedness of the requested conversion.
+  const bool ToSigned = Op.getOpcode() == ISD::FP_TO_SINT;
+  RTLIB::Libcall LC = ToSigned ? RTLIB::getFPTOSINT(SrcVT, VT)
+                               : RTLIB::getFPTOUINT(SrcVT, VT);
+  return makeLibCall(DAG, LC, VT, Src, /*isSigned*/ false, SDLoc(Op)).first;
+}
+
+/// Custom-lower a vector int-to-fp conversion.
+///
+/// Conversions from i32 elements are returned unchanged when the result
+/// elements are f32 and unrolled otherwise. For the remaining case the source
+/// must be v4i16: a v4f32 result is produced by extending the source to v4i32
+/// (sign- or zero-extension to match the conversion) and converting that; any
+/// other result type is unrolled.
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  EVT ResVT = Op.getValueType();
+  EVT SrcVT = Op.getOperand(0).getValueType();
+  SDLoc dl(Op);
+
+  if (SrcVT.getVectorElementType() == MVT::i32)
+    return ResVT.getVectorElementType() == MVT::f32
+               ? Op
+               : DAG.UnrollVectorOp(Op.getNode());
+
+  assert(SrcVT == MVT::v4i16 &&
+         "Invalid type for custom lowering!");
+  if (ResVT != MVT::v4f32)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  // The widening step must preserve the signedness of the conversion.
+  const unsigned ConvOpc = Op.getOpcode();
+  unsigned ExtendOpc;
+  if (ConvOpc == ISD::SINT_TO_FP)
+    ExtendOpc = ISD::SIGN_EXTEND;
+  else if (ConvOpc == ISD::UINT_TO_FP)
+    ExtendOpc = ISD::ZERO_EXTEND;
+  else
+    llvm_unreachable("Invalid opcode!");
+
+  SDValue Wide = DAG.getNode(ExtendOpc, dl, MVT::v4i32, Op.getOperand(0));
+  return DAG.getNode(ConvOpc, dl, ResVT, Wide);
+}
+
+/// Lower SINT_TO_FP / UINT_TO_FP.
+///
+/// Vector results are delegated to LowerVectorINT_TO_FP. A scalar conversion
+/// to f64 on a subtarget reporting isFPOnlySP() becomes a runtime library
+/// call; everything else is returned unchanged.
+SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVectorINT_TO_FP(Op, DAG);
+
+  if (!Subtarget->isFPOnlySP() || VT != MVT::f64)
+    return Op;
+
+  // Pick the libcall matching the signedness of the source integer.
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  const bool FromSigned = Op.getOpcode() == ISD::SINT_TO_FP;
+  RTLIB::Libcall LC = FromSigned ? RTLIB::getSINTTOFP(SrcVT, VT)
+                                 : RTLIB::getUINTTOFP(SrcVT, VT);
+  return makeLibCall(DAG, LC, VT, Src, /*isSigned*/ false, SDLoc(Op)).first;
+}
+
+/// Lower FCOPYSIGN: the result is operand 0 with its sign bit replaced by the
+/// sign bit of operand 1. The two operands may have different types (f32/f64).
+///
+/// Two strategies are used: a NEON bitwise select when NEON is available and
+/// operand 0 is not produced by a BITCAST or VMOVDRR, and plain i32 integer
+/// masking otherwise.
+SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+ // Implement fcopysign with a fabs and a conditional fneg.
+ SDValue Tmp0 = Op.getOperand(0);
+ SDValue Tmp1 = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT SrcVT = Tmp1.getValueType();
+ // Operand 0 is treated as living in GPRs when it is produced by one of these
+ // two opcodes; NEON is not used in that case.
+ bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
+ Tmp0.getOpcode() == ARMISD::VMOVDRR;
+ bool UseNEON = !InGPR && Subtarget->hasNEON();
+
+ if (UseNEON) {
+ // Use VBSL to copy the sign bit.
+ unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
+ SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
+ DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
+ EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
+ // For f64 the mask is reinterpreted as v1i64 and shifted left by 32.
+ if (VT == MVT::f64)
+ Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+ DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
+ DAG.getConstant(32, dl, MVT::i32));
+ else /*if (VT == MVT::f32)*/
+ Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
+ // Bring the sign source to the same layout as the result: an f32 source
+ // feeding an f64 result is shifted left by 32, an f64 source feeding an f32
+ // result is shifted right by 32.
+ if (SrcVT == MVT::f32) {
+ Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
+ if (VT == MVT::f64)
+ Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+ DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
+ DAG.getConstant(32, dl, MVT::i32));
+ } else if (VT == MVT::f32)
+ Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
+ DAG.getConstant(32, dl, MVT::i32));
+ Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
+
+ // MaskNot = Mask XOR AllOnes.
+ SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
+ dl, MVT::i32);
+ AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
+ SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
+ DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
+
+ // Result = (sign source & Mask) | (magnitude source & ~Mask).
+ SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
+ DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
+ DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
+ if (VT == MVT::f32) {
+ Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getConstant(0, dl, MVT::i32));
+ } else {
+ Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
+ }
+
+ return Res;
+ }
+
+ // Bitcast operand 1 to i32.
+ // For an f64 source, only the second VMOVRRD result is used.
+ if (SrcVT == MVT::f64)
+ Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+ Tmp1).getValue(1);
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
+
+ // Or in the signbit with integer operations.
+ SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
+ SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
+ Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
+ if (VT == MVT::f32) {
+ Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+ DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
+ }
+
+ // f64: Or the high part with signbit and then combine two parts.
+ Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+ Tmp0);
+ SDValue Lo = Tmp0.getValue(0);
+ SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+ Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+ return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+}
+
+/// Lower RETURNADDR. Operand 0 is the constant frame depth.
+///
+/// Depth 0 yields a copy of LR, which is registered as a function live-in.
+/// A non-zero depth loads the word at offset 4 from the frame address that
+/// LowerFRAMEADDR computes for the same depth. An empty SDValue is returned
+/// when verifyReturnAddressArgumentIsConstant() reports a problem.
+SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getFrameInfo().setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  const unsigned Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  if (Depth == 0) {
+    // Return LR, which contains the return address. Mark it an implicit
+    // live-in.
+    unsigned LRVReg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, LRVReg, VT);
+  }
+
+  // Outer frame: read the slot 4 bytes above that frame's address.
+  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+  SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
+  SDValue SlotAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset);
+  return DAG.getLoad(VT, dl, DAG.getEntryNode(), SlotAddr,
+                     MachinePointerInfo());
+}
+
+/// Lower FRAMEADDR. Operand 0 is the constant frame depth.
+///
+/// Starts from the current frame register and performs one load through the
+/// frame address per level of depth.
+SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  const ARMBaseRegisterInfo &ARI =
+      *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getFrameInfo().setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);  // FIXME probably not meaningful
+  const unsigned Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  SDValue FrameAddr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARI.getFrameRegister(MF), VT);
+  // Each step replaces the address with the value stored at that address.
+  for (unsigned Level = 0; Level != Depth; ++Level)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo());
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+//
+// Translate a register name into its register number. Only "sp" is accepted;
+// any other name is reported as a fatal error.
+unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                              SelectionDAG &DAG) const {
+  const unsigned Reg = StringSwitch<unsigned>(RegName)
+                           .Case("sp", ARM::SP)
+                           .Default(0);
+  if (!Reg)
+    report_fatal_error(Twine("Invalid register name \""
+                             + StringRef(RegName) + "\"."));
+  return Reg;
+}
+
+// Expand a READ_REGISTER with an i64 result into a READ_REGISTER producing two
+// i32 values plus a chain.
+//
+// Two values are appended to Results, matching the results of N:
+//   1. the i64 value, rebuilt with BUILD_PAIR from the two i32 halves;
+//   2. the output chain of the new READ_REGISTER node.
+//
+// The chain pushed here must be the new node's own chain result (value 2 in
+// the VT list below), not its input chain: users of N's chain have to be
+// ordered after the register read.
+static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  // This function is only supposed to be called for i64 type destination.
+  assert(N->getValueType(0) == MVT::i64
+         && "ExpandREAD_REGISTER called for non-i64 type result.");
+
+  // Same operands as N (chain, register), but with the value split in two.
+  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
+                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
+                             N->getOperand(0),
+                             N->getOperand(1));
+
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
+                                Read.getValue(1)));
+  Results.push_back(Read.getValue(2)); // Output chain of the new read.
+}
+
+/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
+/// When \p DstVT, the destination type of \p BC, is on the vector
+/// register bank and the source of bitcast, \p Op, operates on the same bank,
+/// it might be possible to combine them, such that everything stays on the
+/// vector register bank.
+/// \return The node that would replace \p BC, if the combine
+/// is possible; an empty SDValue otherwise.
+static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
+ SelectionDAG &DAG) {
+ SDValue Op = BC->getOperand(0);
+ EVT DstVT = BC->getValueType(0);
+
+ // The only vector instruction that can produce a scalar (remember,
+ // since the bitcast was about to be turned into VMOVDRR, the source
+ // type is i64) from a vector is EXTRACT_VECTOR_ELT.
+ // Moreover, we can do this combine only if there is one use.
+ // Finally, if the destination type is not a vector, there is not
+ // much point on forcing everything on the vector bank.
+ if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !Op.hasOneUse())
+ return SDValue();
+
+ // If the index is not constant, we will introduce an additional
+ // multiply that will stick.
+ // Give up in that case.
+ ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!Index)
+ return SDValue();
+ unsigned DstNumElt = DstVT.getVectorNumElements();
+
+ // Compute the new index.
+ // The product is computed at the bit width of the original index constant.
+ const APInt &APIntIndex = Index->getAPIntValue();
+ APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
+ NewIndex *= APIntIndex;
+ // Check if the new constant index fits into i32.
+ // NOTE(review): this tests the bit width of the constant, not the magnitude
+ // of the product.
+ if (NewIndex.getBitWidth() > 32)
+ return SDValue();
+
+ // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
+ // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
+ SDLoc dl(Op);
+ SDValue ExtractSrc = Op.getOperand(0);
+ // Same element type as DstVT, with DstNumElt elements per element of the
+ // extract's source vector.
+ EVT VecVT = EVT::getVectorVT(
+ *DAG.getContext(), DstVT.getScalarType(),
+ ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
+ SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
+ DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
+}
+
+/// ExpandBITCAST - If the target supports VFP, this function is called to
+/// expand a bit convert where either the source or destination type is i64 to
+/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
+/// operand type is illegal (e.g., v2f32 for a target that doesn't support
+/// vectors), since the legalizer won't know what to do with that.
+///
+/// \return the replacement value, or an empty SDValue when neither of the two
+/// cases below applies (i.e. the non-i64 type is not legal).
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc dl(N);
+ SDValue Op = N->getOperand(0);
+
+ // This function is only supposed to be called for i64 types, either as the
+ // source or destination of the bit convert.
+ EVT SrcVT = Op.getValueType();
+ EVT DstVT = N->getValueType(0);
+ assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
+ "ExpandBITCAST called for non-i64 type");
+
+ // Turn i64->f64 into VMOVDRR.
+ if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
+ // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
+ // if we can combine the bitcast with its source.
+ if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
+ return Val;
+
+ // Split the i64 into its two i32 elements, build an f64 from them, and
+ // bitcast that to the requested destination type.
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+ DAG.getConstant(1, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, DstVT,
+ DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
+ }
+
+ // Turn f64->i64 into VMOVRRD.
+ if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
+ SDValue Cvt;
+ // On big-endian targets a multi-element vector source goes through VREV64
+ // before being split.
+ if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
+ SrcVT.getVectorNumElements() > 1)
+ Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32),
+ DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
+ else
+ Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), Op);
+ // Merge the pieces into a single i64 value.
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
+ }
+
+ return SDValue();
+}
+
+/// getZeroVector - Build an all-zero vector of type \p VT.
+/// Zero vectors are used to represent vector negation and in those cases
+/// will be implemented with the NEON VNEG instruction. However, VNEG does
+/// not support i64 elements, so sometimes the zero vectors will need to be
+/// explicitly constructed. Regardless, a canonical VMOV is used to create
+/// the zero vector, which is then bitcast to \p VT.
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // The canonical modified immediate encoding of a zero vector is....0!
+  SDValue ZeroEncoding = DAG.getTargetConstant(0, dl, MVT::i32);
+
+  // Materialise the zero as v4i32 for 128-bit vectors and as v2i32 otherwise.
+  EVT ImmVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+  SDValue ZeroImm = DAG.getNode(ARMISD::VMOVIMM, dl, ImmVT, ZeroEncoding);
+  return DAG.getNode(ISD::BITCAST, dl, VT, ZeroImm);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS or SRL_PARTS (both are accepted by
+/// the assertion below), which returns two
+/// i32 values and take a 2 x i32 value to shift plus a shift amount.
+///
+/// Operands are (low part, high part, shift amount). Each result half is
+/// computed twice, once for shift amounts below the part width and once for
+/// amounts at or above it, and a CMOV picks the right one at run time.
+SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue ARMcc;
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ // Arithmetic shift for SRA_PARTS, logical shift for SRL_PARTS.
+ unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+ assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+ // RevShAmt = VTBits - ShAmt; ExtraShAmt = ShAmt - VTBits.
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+ DAG.getConstant(VTBits, dl, MVT::i32));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+ // Low half, small shift: (Lo >> ShAmt) | (Hi << (VTBits - ShAmt)).
+ SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ // Low half, big shift: Hi shifted right by (ShAmt - VTBits).
+ SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+ // The big-shift value is the one selected when ExtraShAmt >= 0.
+ SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
+ ISD::SETGE, ARMcc, DAG, dl);
+ SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
+ ARMcc, CCR, CmpLo);
+
+
+ // High half, small shift: Hi shifted right by ShAmt.
+ SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ // High half, big shift: for SRA, Hi shifted right by VTBits - 1; for SRL,
+ // zero.
+ SDValue HiBigShift = Opc == ISD::SRA
+ ? DAG.getNode(Opc, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, dl, VT))
+ : DAG.getConstant(0, dl, VT);
+ // A separate compare node is built for the second CMOV.
+ SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
+ ISD::SETGE, ARMcc, DAG, dl);
+ SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
+ ARMcc, CCR, CmpHi);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+}
+
+ /// LowerShiftLeftParts - Lower SHL_PARTS.
+ /// The node's operands are (low part, high part, shift amount); the result is
+ /// the merged pair {Lo, Hi}. Each half is computed for both the "small"
+ /// (amount below the part width) and "big" (amount at or above it) case, and a
+ /// CMOV keyed on (amount - width) >= 0 selects the big one.
+ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                SelectionDAG &DAG) const {
+   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+   assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+   SDLoc dl(Op);
+   EVT VT = Op.getValueType();
+   const unsigned PartBits = VT.getSizeInBits();
+   SDValue LoPart = Op.getOperand(0);
+   SDValue HiPart = Op.getOperand(1);
+   SDValue Amount = Op.getOperand(2);
+   SDValue ARMcc; // Filled in by each getARMCmp call below.
+   SDValue CPSRReg = DAG.getRegister(ARM::CPSR, MVT::i32);
+   SDValue WidthC = DAG.getConstant(PartBits, dl, MVT::i32);
+
+   // High half, small shift: (lo >>u (width - amt)) | (hi << amt).
+   SDValue InvAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, WidthC, Amount);
+   SDValue LoSpill = DAG.getNode(ISD::SRL, dl, VT, LoPart, InvAmt);
+   SDValue HiShifted = DAG.getNode(ISD::SHL, dl, VT, HiPart, Amount);
+   SDValue HiIfSmall = DAG.getNode(ISD::OR, dl, VT, LoSpill, HiShifted);
+
+   // High half, big shift: lo << (amt - width).
+   SDValue OverAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, Amount, WidthC);
+   SDValue HiIfBig = DAG.getNode(ISD::SHL, dl, VT, LoPart, OverAmt);
+   SDValue ZeroI32 = DAG.getConstant(0, dl, MVT::i32);
+   SDValue HiFlags =
+       getARMCmp(OverAmt, ZeroI32, ISD::SETGE, ARMcc, DAG, dl);
+   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiIfSmall, HiIfBig, ARMcc,
+                            CPSRReg, HiFlags);
+
+   // Low half: lo << amt for small shifts, zero for big ones.
+   SDValue LoFlags =
+       getARMCmp(OverAmt, ZeroI32, ISD::SETGE, ARMcc, DAG, dl);
+   SDValue LoIfSmall = DAG.getNode(ISD::SHL, dl, VT, LoPart, Amount);
+   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoIfSmall,
+                            DAG.getConstant(0, dl, VT), ARMcc, CPSRReg,
+                            LoFlags);
+
+   SDValue Parts[2] = { Lo, Hi };
+   return DAG.getMergeValues(Parts, dl);
+ }
+
+ /// Lower FLT_ROUNDS_ by reading the rounding mode out of the FPSCR.
+ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                             SelectionDAG &DAG) const {
+   // The rounding mode lives in FPSCR bits 23:22.
+   // ARM rounding mode -> FLT_ROUNDS value: 0->1, 1->2, 2->3, 3->0.
+   // That is computed as ((FPSCR + (1 << 22)) >> 22) & 3, written this way so
+   // that the shift and the AND can be folded into a bitfield extract.
+   SDLoc dl(Op);
+   SDValue GetFPSCR =
+       DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32);
+   SDValue FPSCR =
+       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, GetFPSCR);
+   SDValue ModeIncrement = DAG.getConstant(1U << 22, dl, MVT::i32);
+   SDValue Bumped = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, ModeIncrement);
+   SDValue ModeShift = DAG.getConstant(22, dl, MVT::i32);
+   SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i32, Bumped, ModeShift);
+   SDValue ModeMask = DAG.getConstant(3, dl, MVT::i32);
+   return DAG.getNode(ISD::AND, dl, MVT::i32, Shifted, ModeMask);
+ }
+
+ /// Custom lowering for CTTZ / CTTZ_ZERO_UNDEF.
+ /// Scalars become ctlz(bitreverse(x)) when the subtarget has v6T2 ops, and an
+ /// empty SDValue otherwise. Vectors isolate the lowest set bit (x & -x) and
+ /// then either count the bits below it with CTPOP or, for i16/i32 elements of
+ /// CTTZ_ZERO_UNDEF, subtract its CTLZ from (element width - 1).
+ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+   SDLoc dl(N);
+   EVT VT = N->getValueType(0);
+
+   if (!VT.isVector()) {
+     if (!ST->hasV6T2Ops())
+       return SDValue();
+
+     SDValue Reversed = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
+     return DAG.getNode(ISD::CTLZ, dl, VT, Reversed);
+   }
+
+   assert(ST->hasNEON());
+
+   // Isolate the least significant set bit: LSB = X & -X.
+   SDValue Src = N->getOperand(0);
+   SDValue Negated =
+       DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), Src);
+   SDValue LowestSet = DAG.getNode(ISD::AND, dl, VT, Src, Negated);
+
+   EVT EltVT = VT.getVectorElementType();
+
+   // Builds LSB - 1 using a VMOVIMM splat of one.
+   auto SubtractSplatOne = [&]() -> SDValue {
+     SDValue Ones = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+                                DAG.getTargetConstant(1, dl, EltVT));
+     return DAG.getNode(ISD::SUB, dl, VT, LowestSet, Ones);
+   };
+
+   // i8 elements: cttz(x) = ctpop(lsb - 1), directly on the byte vector.
+   if (EltVT == MVT::i8)
+     return DAG.getNode(ISD::CTPOP, dl, VT, SubtractSplatOne());
+
+   // i16/i32 with undefined-on-zero semantics:
+   // cttz(x) = (width - 1) - ctlz(lsb), valid for x != 0.
+   const bool ZeroIsUndef = N->getOpcode() == ISD::CTTZ_ZERO_UNDEF;
+   if (ZeroIsUndef && (EltVT == MVT::i16 || EltVT == MVT::i32)) {
+     const unsigned EltBits = EltVT.getSizeInBits();
+     SDValue TopIndex =
+         DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+                     DAG.getTargetConstant(EltBits - 1, dl, EltVT));
+     SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, dl, VT, LowestSet);
+     return DAG.getNode(ISD::SUB, dl, VT, TopIndex, LeadingZeros);
+   }
+
+   // General case: cttz(x) = ctpop(lsb - 1). vcnt.8 only counts bits per
+   // byte, so the byte counts are gathered with pairwise widening adds
+   // (vpaddl) for i16, i32 and i64 elements.
+
+   // Compute LSB - 1.
+   SDValue MaskBelow;
+   if (EltVT == MVT::i64) {
+     // Add the all-ones constant 0xffff'ffff'ffff'ffff instead of
+     // subtracting one.
+     SDValue AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+                                   DAG.getTargetConstant(0x1eff, dl, MVT::i32));
+     MaskBelow = DAG.getNode(ISD::ADD, dl, VT, LowestSet, AllOnes);
+   } else {
+     MaskBelow = SubtractSplatOne();
+   }
+
+   const bool Is64BitReg = VT.is64BitVector();
+
+   // One vpaddlu step: pairwise-add adjacent elements into WideVT.
+   auto WidenPairwise = [&](EVT WideVT, SDValue Narrow) -> SDValue {
+     return DAG.getNode(
+         ISD::INTRINSIC_WO_CHAIN, dl, WideVT,
+         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+         Narrow);
+   };
+
+   // Count bits per byte with vcnt.8.
+   EVT BytesVT = Is64BitReg ? MVT::v8i8 : MVT::v16i8;
+   SDValue AsBytes = DAG.getNode(ISD::BITCAST, dl, BytesVT, MaskBelow);
+   SDValue ByteCounts = DAG.getNode(ISD::CTPOP, dl, BytesVT, AsBytes);
+
+   // Gather the byte counts up to the element width.
+   EVT HalfVT = Is64BitReg ? MVT::v4i16 : MVT::v8i16;
+   SDValue HalfCounts = WidenPairwise(HalfVT, ByteCounts);
+   if (EltVT == MVT::i16)
+     return HalfCounts;
+
+   EVT WordVT = Is64BitReg ? MVT::v2i32 : MVT::v4i32;
+   SDValue WordCounts = WidenPairwise(WordVT, HalfCounts);
+   if (EltVT == MVT::i32)
+     return WordCounts;
+
+   assert(EltVT == MVT::i64);
+   return WidenPairwise(VT, WordCounts);
+ }
+
+ /// getCTPOP16BitCounts - Produce a v8i8/v16i8 vector holding the population
+ /// count of every 16-bit element of N's operand, with the sequence of counts
+ /// repeated. vcnt supplies per-byte counts; the two bytes of each 16-bit
+ /// element are then summed and the sums gathered.
+ ///
+ /// Trace for v4i16:
+ ///   input    = [v0 v1 v2 v3]              (vi: 16-bit element)
+ ///   cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7]  (v0 = [w0 w1], wi: 8-bit element)
+ ///   vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7]  (bi = bit-count of wi)
+ ///   vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
+ ///   N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3]  (k0 = b0+b1 = bit-count of v0)
+ ///   vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  (each ki is 8 bits)
+ static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
+   SDLoc DL(N);
+   EVT VT = N->getValueType(0);
+
+   EVT BytesVT = MVT::v16i8;
+   if (VT.is64BitVector())
+     BytesVT = MVT::v8i8;
+
+   SDValue AsBytes = DAG.getNode(ISD::BITCAST, DL, BytesVT, N->getOperand(0));
+   SDValue ByteCounts = DAG.getNode(ISD::CTPOP, DL, BytesVT, AsBytes);
+   SDValue Swapped = DAG.getNode(ARMISD::VREV16, DL, BytesVT, ByteCounts);
+   SDValue PairSums = DAG.getNode(ISD::ADD, DL, BytesVT, ByteCounts, Swapped);
+   return DAG.getNode(ARMISD::VUZP, DL, BytesVT, PairSums, PairSums);
+ }
+
+ /// lowerCTPOP16BitElements - Produce a v4i16/v8i16 vector with the population
+ /// count of each 16-bit element of N's operand. The v4i16 and v8i16 cases
+ /// order the zero-extend and the subvector extract differently so that every
+ /// intermediate value fits NEON's 64/128-bit registers.
+ ///
+ /// Trace for v4i16:
+ ///   input            = [v0 v1 v2 v3]              (vi: 16-bit element)
+ ///   v8i8:  BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3]  (ki: bit-count of vi)
+ ///   v8i16: Extended  = [k0 k1 k2 k3 k0 k1 k2 k3]
+ ///   v4i16: Extracted = [k0 k1 k2 k3]
+ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+   SDLoc DL(N);
+   EVT VT = N->getValueType(0);
+   SDValue ByteWideCounts = getCTPOP16BitCounts(N, DAG);
+
+   if (!VT.is64BitVector()) {
+     // v8i16 result: take the low eight byte counts, then widen them.
+     SDValue LowHalf =
+         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, ByteWideCounts,
+                     DAG.getIntPtrConstant(0, DL));
+     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, LowHalf);
+   }
+
+   // v4i16 result: widen all eight byte counts, then keep the low four.
+   SDValue Widened =
+       DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, ByteWideCounts);
+   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Widened,
+                      DAG.getIntPtrConstant(0, DL));
+ }
+
+ /// lowerCTPOP32BitElements - Produce a v2i32/v4i32 vector with the population
+ /// count of each 32-bit element of N's operand. The operand is viewed as
+ /// 16-bit elements, counted with the 16-bit routine, and the two counts that
+ /// belong to each 32-bit element are then added together.
+ ///
+ /// Outline for v2i32 (v4i32 swaps the final extend/extract order):
+ ///   input    = [v0 v1]        (vi: 32-bit element)
+ ///   Bitcast  = [w0 w1 w2 w3]  (wi: 16-bit element, v0 = [w0 w1])
+ ///   Counts16 = [k0 k1 k2 k3]  (ki: bit-count of wi)
+ ///   vrev: N0 = [k1 k0 k3 k2]
+ ///   N1       = Counts16 + N0
+ ///   N2       = vuzp(N1, N1)
+ ///   result   = low half of N2, zero-extended to 32-bit elements
+ static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
+   SDLoc DL(N);
+   EVT VT = N->getValueType(0);
+   const bool Is64BitReg = VT.is64BitVector();
+
+   EVT HalvesVT = Is64BitReg ? MVT::v4i16 : MVT::v8i16;
+
+   SDValue AsHalves = DAG.getNode(ISD::BITCAST, DL, HalvesVT, N->getOperand(0));
+   SDValue HalfCounts = lowerCTPOP16BitElements(AsHalves.getNode(), DAG);
+   SDValue Swapped = DAG.getNode(ARMISD::VREV32, DL, HalvesVT, HalfCounts);
+   SDValue PairSums = DAG.getNode(ISD::ADD, DL, HalvesVT, HalfCounts, Swapped);
+   SDValue Gathered =
+       DAG.getNode(ARMISD::VUZP, DL, HalvesVT, PairSums, PairSums);
+
+   if (!Is64BitReg) {
+     // v4i32 result: take the low four sums, then widen them.
+     SDValue LowHalf =
+         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Gathered,
+                     DAG.getIntPtrConstant(0, DL));
+     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, LowHalf);
+   }
+
+   // v2i32 result: widen all four sums, then keep the low two.
+   SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Gathered);
+   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Widened,
+                      DAG.getIntPtrConstant(0, DL));
+ }
+
+ /// Custom CTPOP lowering for v2i32, v4i32, v4i16 and v8i16; dispatches on the
+ /// element width to the 32-bit or 16-bit helper.
+ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+                           const ARMSubtarget *ST) {
+   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+
+   EVT VT = N->getValueType(0);
+   assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
+           VT == MVT::v4i16 || VT == MVT::v8i16) &&
+          "Unexpected type for custom ctpop lowering");
+
+   const bool Has32BitElts = VT.getVectorElementType() == MVT::i32;
+   return Has32BitElts ? lowerCTPOP32BitElements(N, DAG)
+                       : lowerCTPOP16BitElements(N, DAG);
+ }
+
+ /// Lower a vector SHL/SRA/SRL to the NEON vshift intrinsics. Non-vector types
+ /// are not handled here and yield an empty SDValue.
+ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
+                           const ARMSubtarget *ST) {
+   EVT VT = N->getValueType(0);
+   if (!VT.isVector())
+     return SDValue();
+
+   // Vector shifts are lowered to NEON VSHL.
+   assert(ST->hasNEON() && "unexpected vector shift");
+
+   SDLoc dl(N);
+   const unsigned Opcode = N->getOpcode();
+   SDValue Value = N->getOperand(0);
+   SDValue Count = N->getOperand(1);
+
+   // A left shift maps straight onto the vshiftu intrinsic.
+   if (Opcode == ISD::SHL) {
+     SDValue IntID =
+         DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, MVT::i32);
+     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, IntID, Value, Count);
+   }
+
+   assert((Opcode == ISD::SRA || Opcode == ISD::SRL) &&
+          "unexpected vector shift opcode");
+
+   // The same intrinsics serve right shifts, which are expressed as shifts by
+   // a negative amount, so negate the vector of shift counts.
+   EVT CountVT = Count.getValueType();
+   SDValue NegCount = DAG.getNode(ISD::SUB, dl, CountVT,
+                                  getZeroVector(CountVT, DAG, dl), Count);
+
+   // Arithmetic shifts use the signed intrinsic, logical ones the unsigned.
+   Intrinsic::ID ShiftID = Intrinsic::arm_neon_vshiftu;
+   if (Opcode == ISD::SRA)
+     ShiftID = Intrinsic::arm_neon_vshifts;
+
+   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                      DAG.getConstant(ShiftID, dl, MVT::i32), Value, NegCount);
+ }
+
+ /// Expand an i64 SRL/SRA by exactly one into a flag-setting shift of the high
+ /// word followed by an RRX of the low word. Every other case returns an empty
+ /// SDValue so that generic lowering applies.
+ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
+                                 const ARMSubtarget *ST) {
+   SDLoc dl(N);
+   EVT VT = N->getValueType(0);
+
+   // Nodes such as "i32 = ISD::SHL i32, i64" can reach this point too.
+   if (VT != MVT::i64)
+     return SDValue();
+
+   const unsigned Opcode = N->getOpcode();
+   assert((Opcode == ISD::SRL || Opcode == ISD::SRA) &&
+          "Unknown shift to lower!");
+
+   // Only a shift amount of one is handled; the rest use generic lowering.
+   if (!isOneConstant(N->getOperand(1)))
+     return SDValue();
+
+   // RRX is not available in Thumb1 mode.
+   if (ST->isThumb1Only())
+     return SDValue();
+
+   // Split the 64-bit operand into its two 32-bit words.
+   SDValue Wide = N->getOperand(0);
+   SDValue LoWord = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Wide,
+                                DAG.getConstant(0, dl, MVT::i32));
+   SDValue HiWord = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Wide,
+                                DAG.getConstant(1, dl, MVT::i32));
+
+   // Shift the high word by one with SRL_FLAG/SRA_FLAG; its second result is
+   // the glue that carries the shifted-out bit.
+   unsigned FlagOpc = ARMISD::SRA_FLAG;
+   if (Opcode == ISD::SRL)
+     FlagOpc = ARMISD::SRL_FLAG;
+   HiWord = DAG.getNode(FlagOpc, dl, DAG.getVTList(MVT::i32, MVT::Glue), HiWord);
+
+   // RRX on the low word shifts that carry in.
+   LoWord = DAG.getNode(ARMISD::RRX, dl, MVT::i32, LoWord, HiWord.getValue(1));
+
+   // Reassemble the two words into one i64.
+   return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoWord, HiWord);
+ }
+
+ /// Lower a vector SETCC to the NEON compare nodes.
+ ///
+ /// Op's operands are (lhs, rhs, condition code). The comparison is mapped onto
+ /// one of the ARMISD compare opcodes, optionally with the operands swapped
+ /// and/or the result inverted. Integer i64 equality/inequality is handled by
+ /// comparing the 32-bit halves; every other i64 comparison returns an empty
+ /// SDValue. Comparisons against an all-zero build vector are folded to the
+ /// compare-against-zero forms, and "(and a, b) ==/!= 0" becomes VTST.
+ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+   SDValue TmpOp0, TmpOp1;
+   bool Invert = false;
+   bool Swap = false;
+   unsigned Opc = 0;
+
+   SDValue Op0 = Op.getOperand(0);
+   SDValue Op1 = Op.getOperand(1);
+   SDValue CC = Op.getOperand(2);
+   // The compare nodes produce an integer vector shaped like the operands.
+   EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
+   EVT VT = Op.getValueType();
+   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+   SDLoc dl(Op);
+
+   if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
+       (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
+     // Special-case integer 64-bit equality comparisons. They aren't legal,
+     // but they can be lowered with a few vector instructions: compare the
+     // 32-bit halves, then AND each half's result with its VREV64 partner so
+     // that a 64-bit lane is all-ones only when both halves matched.
+     unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
+     EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
+     SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
+     SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
+     SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
+                               DAG.getCondCode(ISD::SETEQ));
+     SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
+     SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
+     Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
+     if (SetCCOpcode == ISD::SETNE)
+       Merged = DAG.getNOT(dl, Merged, CmpVT);
+     Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
+     return Merged;
+   }
+
+   if (CmpVT.getVectorElementType() == MVT::i64)
+     // 64-bit comparisons are not legal in general.
+     return SDValue();
+
+   if (Op1.getValueType().isFloatingPoint()) {
+     switch (SetCCOpcode) {
+     default: llvm_unreachable("Illegal FP comparison");
+     case ISD::SETUNE:
+     case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
+     case ISD::SETOEQ:
+     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+     case ISD::SETOLT:
+     case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
+     case ISD::SETOGT:
+     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+     case ISD::SETOLE:
+     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
+     case ISD::SETOGE:
+     case ISD::SETGE:  Opc = ARMISD::VCGE; break;
+     case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
+     case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
+     case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
+     case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
+     case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
+     case ISD::SETONE:
+       // Expand this to (OLT | OGT).
+       TmpOp0 = Op0;
+       TmpOp1 = Op1;
+       Opc = ISD::OR;
+       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
+       Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
+       break;
+     case ISD::SETUO:
+       Invert = true;
+       LLVM_FALLTHROUGH;
+     case ISD::SETO:
+       // Expand this to (OLT | OGE).
+       TmpOp0 = Op0;
+       TmpOp1 = Op1;
+       Opc = ISD::OR;
+       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
+       Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
+       break;
+     }
+   } else {
+     // Integer comparisons. Each "less" form swaps the operands and then
+     // deliberately falls through to the matching "greater" form.
+     switch (SetCCOpcode) {
+     default: llvm_unreachable("Illegal integer comparison");
+     case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
+     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+     case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
+     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
+     case ISD::SETGE:  Opc = ARMISD::VCGE; break;
+     case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
+     case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
+     case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
+     case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
+     }
+
+     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
+     if (Opc == ARMISD::VCEQ) {
+
+       SDValue AndOp;
+       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+         AndOp = Op0;
+       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
+         AndOp = Op1;
+
+       // Ignore bitconvert.
+       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
+         AndOp = AndOp.getOperand(0);
+
+       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
+         Opc = ARMISD::VTST;
+         Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
+         Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
+         // VTST expresses the "ne" sense, the opposite of VCEQ.
+         Invert = !Invert;
+       }
+     }
+   }
+
+   if (Swap)
+     std::swap(Op0, Op1);
+
+   // If one of the operands is a constant vector zero, attempt to fold the
+   // comparison to a specialized compare-against-zero form.
+   SDValue SingleOp;
+   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+     SingleOp = Op0;
+   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
+     // Zero is on the left: (0 >= x) is (x <= 0) and (0 > x) is (x < 0).
+     if (Opc == ARMISD::VCGE)
+       Opc = ARMISD::VCLEZ;
+     else if (Opc == ARMISD::VCGT)
+       Opc = ARMISD::VCLTZ;
+     SingleOp = Op1;
+   }
+
+   SDValue Result;
+   if (SingleOp.getNode()) {
+     switch (Opc) {
+     case ARMISD::VCEQ:
+       Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
+     case ARMISD::VCGE:
+       Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
+     case ARMISD::VCLEZ:
+       Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
+     case ARMISD::VCGT:
+       Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
+     case ARMISD::VCLTZ:
+       Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
+     default:
+       // No compare-against-zero form for this opcode; keep both operands.
+       Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
+     }
+   } else {
+     Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
+   }
+
+   Result = DAG.getSExtOrTrunc(Result, dl, VT);
+
+   if (Invert)
+     Result = DAG.getNOT(dl, Result, VT);
+
+   return Result;
+ }
+
+ /// Lower SETCCE (integer compare with incoming carry): operands are
+ /// (lhs, rhs, carry, condition). A SUBE produces the flags, which are copied
+ /// into CPSR, and a CMOV then selects 1 or 0 according to the condition.
+ static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
+   SDLoc DL(Op);
+   SDValue LHS = Op.getOperand(0);
+   SDValue RHS = Op.getOperand(1);
+   SDValue Carry = Op.getOperand(2);
+   SDValue Cond = Op.getOperand(3);
+
+   assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
+
+   // Subtract with carry; the second (i32) result carries the flags.
+   SDVTList SubVTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+   SDValue Sub = DAG.getNode(ARMISD::SUBE, DL, SubVTs, LHS, RHS, Carry);
+
+   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+   SDValue One = DAG.getConstant(1, DL, MVT::i32);
+   const ISD::CondCode IntCC = cast<CondCodeSDNode>(Cond)->get();
+   SDValue ARMcc = DAG.getConstant(IntCCToARMCC(IntCC), DL, MVT::i32);
+   SDValue CPSRReg = DAG.getRegister(ARM::CPSR, MVT::i32);
+
+   // Route the flags into CPSR; the copy's glue result feeds the CMOV.
+   SDValue FlagCopy = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
+                                       Sub.getValue(1), SDValue());
+   // CMOV operands: false value, true value, condition, CPSR, glue.
+   return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), Zero, One, ARMcc,
+                      CPSRReg, FlagCopy.getValue(1));
+ }
+
+ /// isNEONModifiedImm - Check if the specified splat value corresponds to a
+ /// valid vector constant for a NEON instruction with a "modified immediate"
+ /// operand (e.g., VMOV). If so, return the encoded value as an i32 target
+ /// constant and set \p VT to the vector type the encoding applies to;
+ /// otherwise return an empty SDValue.
+ ///
+ /// \param SplatBits     the splat value, SplatBitSize bits wide.
+ /// \param SplatUndef    mask of undefined bits in the splat; only consulted
+ ///                      where the encoding requires bytes to be all-ones.
+ /// \param SplatBitSize  8, 16, 32 or 64; any other size is unreachable.
+ /// \param is128Bits     selects the 128-bit rather than the 64-bit vector type.
+ /// \param type          which instruction family the immediate is for; 8-bit
+ ///                      and 64-bit encodings are VMOV-only, and the 0xc/0xd
+ ///                      cmodes are rejected for OtherModImm.
+ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                                  unsigned SplatBitSize, SelectionDAG &DAG,
+                                  const SDLoc &dl, EVT &VT, bool is128Bits,
+                                  NEONModImmType type) {
+   unsigned OpCmode, Imm;
+
+   // SplatBitSize is set to the smallest size that splats the vector, so a
+   // zero vector will always have SplatBitSize == 8. However, NEON modified
+   // immediate instructions other than VMOV do not support the 8-bit encoding
+   // of a zero vector, and the default encoding of zero is supposed to be the
+   // 32-bit version.
+   if (SplatBits == 0)
+     SplatBitSize = 32;
+
+   switch (SplatBitSize) {
+   case 8:
+     if (type != VMOVModImm)
+       return SDValue();
+     // Any 1-byte value is OK. Op=0, Cmode=1110.
+     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+     OpCmode = 0xe;
+     Imm = SplatBits;
+     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+     break;
+
+   case 16:
+     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
+     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
+     if ((SplatBits & ~0xff) == 0) {
+       // Value = 0x00nn: Op=x, Cmode=100x.
+       OpCmode = 0x8;
+       Imm = SplatBits;
+       break;
+     }
+     if ((SplatBits & ~0xff00) == 0) {
+       // Value = 0xnn00: Op=x, Cmode=101x.
+       OpCmode = 0xa;
+       Imm = SplatBits >> 8;
+       break;
+     }
+     return SDValue();
+
+   case 32:
+     // NEON's 32-bit VMOV supports splat values where:
+     // * only one byte is nonzero, or
+     // * the least significant byte is 0xff and the second byte is nonzero, or
+     // * the least significant 2 bytes are 0xff and the third is nonzero.
+     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
+     if ((SplatBits & ~0xff) == 0) {
+       // Value = 0x000000nn: Op=x, Cmode=000x.
+       OpCmode = 0;
+       Imm = SplatBits;
+       break;
+     }
+     if ((SplatBits & ~0xff00) == 0) {
+       // Value = 0x0000nn00: Op=x, Cmode=001x.
+       OpCmode = 0x2;
+       Imm = SplatBits >> 8;
+       break;
+     }
+     if ((SplatBits & ~0xff0000) == 0) {
+       // Value = 0x00nn0000: Op=x, Cmode=010x.
+       OpCmode = 0x4;
+       Imm = SplatBits >> 16;
+       break;
+     }
+     if ((SplatBits & ~0xff000000) == 0) {
+       // Value = 0xnn000000: Op=x, Cmode=011x.
+       OpCmode = 0x6;
+       Imm = SplatBits >> 24;
+       break;
+     }
+
+     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
+     if (type == OtherModImm) return SDValue();
+
+     if ((SplatBits & ~0xffff) == 0 &&
+         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
+       // Value = 0x0000nnff: Op=x, Cmode=1100.
+       OpCmode = 0xc;
+       Imm = SplatBits >> 8;
+       break;
+     }
+
+     if ((SplatBits & ~0xffffff) == 0 &&
+         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
+       // Value = 0x00nnffff: Op=x, Cmode=1101.
+       OpCmode = 0xd;
+       Imm = SplatBits >> 16;
+       break;
+     }
+
+     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
+     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
+     // VMOV.I32. A (very) minor optimization would be to replicate the value
+     // and fall through here to test for a valid 64-bit splat. But, then the
+     // caller would also need to check and handle the change in size.
+     return SDValue();
+
+   case 64: {
+     if (type != VMOVModImm)
+       return SDValue();
+     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
+     // Build the 8-bit immediate with one bit per byte: set when the byte is
+     // all-ones (counting undef bits as ones), clear when it is zero. Any
+     // other byte value cannot be encoded.
+     uint64_t BitMask = 0xff;
+     unsigned ImmMask = 1;
+     Imm = 0;
+     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
+         Imm |= ImmMask;
+       } else if ((SplatBits & BitMask) != 0) {
+         return SDValue();
+       }
+       BitMask <<= 8;
+       ImmMask <<= 1;
+     }
+
+     if (DAG.getDataLayout().isBigEndian())
+       // swap higher and lower 32 bit word
+       Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
+
+     // Op=1, Cmode=1110.
+     OpCmode = 0x1e;
+     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
+     break;
+   }
+
+   default:
+     llvm_unreachable("unexpected size for isNEONModifiedImm");
+   }
+
+   unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
+   return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
+ }
+
+ /// Custom lowering for floating-point constants.
+ /// In order, this tries: an integer-based materialization when execute-only
+ /// code generation is on; leaving the node alone when it is a valid VMOV FP
+ /// immediate (or a VMOVFPIMM splat plus extract when NEON handles f32); and
+ /// finally a NEON VMOV.i32 or VMVN.i32 modified immediate. An empty SDValue
+ /// means "use the default lowering".
+ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
+                                            const ARMSubtarget *ST) const {
+   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
+   const APFloat &FPVal = CFP->getValueAPF();
+   const bool IsDouble = Op.getValueType() == MVT::f64;
+
+   // Prevent floating-point constants from using literal loads
+   // when execute-only is enabled.
+   if (ST->genExecuteOnly()) {
+     APInt Bits = FPVal.bitcastToAPInt();
+     SDLoc DL(CFP);
+     if (!IsDouble)
+       return DAG.getConstant(Bits, DL, MVT::i32);
+
+     // Build the double from its two 32-bit words, ordered by endianness.
+     SDValue Lo = DAG.getConstant(Bits.trunc(32), DL, MVT::i32);
+     SDValue Hi = DAG.getConstant(Bits.lshr(32).trunc(32), DL, MVT::i32);
+     if (!ST->isLittle())
+       std::swap(Lo, Hi);
+     return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
+   }
+
+   if (!ST->hasVFP3())
+     return SDValue();
+
+   // Use the default (constant pool) lowering for double constants when we have
+   // an SP-only FPU
+   if (IsDouble && Subtarget->isFPOnlySP())
+     return SDValue();
+
+   // Try splatting with a VMOV.f32...
+   const int ImmVal =
+       IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
+
+   if (ImmVal != -1) {
+     // A valid ConstantFP is already selectable as-is, so leave it untouched
+     // unless it is a float that should go through NEON.
+     if (IsDouble || !ST->useNEONForSinglePrecisionFP())
+       return Op;
+
+     // It's a float and we are trying to use NEON operations where
+     // possible. Lower it to a splat followed by an extract.
+     SDLoc DL(Op);
+     SDValue Encoded = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
+     SDValue Splat = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, Encoded);
+     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Splat,
+                        DAG.getConstant(0, DL, MVT::i32));
+   }
+
+   // The rest of our options are NEON only, make sure that's allowed before
+   // proceeding..
+   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
+     return SDValue();
+
+   EVT VMovVT;
+   const uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
+
+   // It wouldn't really be worth bothering for doubles except for one very
+   // important value, which does happen to match: 0.0. So only continue for
+   // doubles whose two 32-bit halves are identical.
+   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
+     return SDValue();
+
+   // Wraps an encoded immediate in the given vector-immediate node and turns
+   // it into the requested scalar: a bitcast for f64, or a bitcast to v2f32
+   // plus an extract of element 0 for f32.
+   auto MaterializeViaNEON = [&](unsigned VecOpc, SDValue Encoded) -> SDValue {
+     SDLoc DL(Op);
+     SDValue Vec = DAG.getNode(VecOpc, DL, VMovVT, Encoded);
+     if (IsDouble)
+       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vec);
+
+     SDValue AsV2F32 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, Vec);
+     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, AsV2F32,
+                        DAG.getConstant(0, DL, MVT::i32));
+   };
+
+   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
+   SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
+                                      VMovVT, false, VMOVModImm);
+   if (NewVal != SDValue())
+     return MaterializeViaNEON(ARMISD::VMOVIMM, NewVal);
+
+   // Finally, try a VMVN.i32
+   NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
+                              false, VMVNModImm);
+   if (NewVal != SDValue())
+     return MaterializeViaNEON(ARMISD::VMVNIMM, NewVal);
+
+   return SDValue();
+ }
+
+ // Check whether a VEXT instruction can implement the shuffle mask when both
+ // vector sources of the shuffle are the same. On success Imm holds the index
+ // of the first element.
+ static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+   // An UNDEF first index gives nothing to anchor on, so reject it.
+   if (M[0] < 0)
+     return false;
+
+   const unsigned NumElts = VT.getVectorNumElements();
+   Imm = M[0];
+
+   // For a VEXT shuffle the immediate is the first element's index and every
+   // later index must be the successor of the one before it, wrapping from
+   // the last element back to index zero.
+   unsigned Expected = Imm;
+   for (unsigned Pos = 1; Pos != NumElts; ++Pos) {
+     Expected = (Expected + 1 == NumElts) ? 0 : Expected + 1;
+
+     const int Idx = M[Pos];
+     if (Idx >= 0 && static_cast<unsigned>(Idx) != Expected)
+       return false; // A defined index that breaks the sequence.
+   }
+
+   return true;
+ }
+
+
+ // Check whether a VEXT instruction can implement a two-source shuffle mask.
+ // On success Imm holds the extraction index and ReverseVEXT says whether the
+ // two source operands must be swapped.
+ static bool isVEXTMask(ArrayRef<int> M, EVT VT,
+                        bool &ReverseVEXT, unsigned &Imm) {
+   ReverseVEXT = false;
+
+   // An UNDEF first index gives nothing to anchor on, so reject it.
+   if (M[0] < 0)
+     return false;
+
+   const unsigned NumElts = VT.getVectorNumElements();
+   const unsigned TotalElts = NumElts * 2;
+   Imm = M[0];
+
+   // For a VEXT shuffle the immediate is the first element's index and every
+   // later index must be the successor of the one before it.
+   unsigned Expected = Imm;
+   for (unsigned Pos = 1; Pos != NumElts; ++Pos) {
+     // Running off the end of the second source may still be a VEXT, but
+     // only with the source vectors swapped.
+     if (++Expected == TotalElts) {
+       Expected = 0;
+       ReverseVEXT = true;
+     }
+
+     const int Idx = M[Pos];
+     if (Idx >= 0 && static_cast<unsigned>(Idx) != Expected)
+       return false; // A defined index that breaks the sequence.
+   }
+
+   // With swapped sources the index is relative to the other operand.
+   if (ReverseVEXT)
+     Imm -= NumElts;
+
+   return true;
+ }
+
+ /// isVREVMask - Check if a vector shuffle corresponds to a VREV
+ /// instruction with the specified blocksize, i.e. one that reverses the
+ /// order of the elements inside each BlockSize-bit block of the vector.
+ static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
+          "Only possible block sizes for VREV are: 16, 32, 64");
+
+   const unsigned EltSz = VT.getScalarSizeInBits();
+   if (EltSz == 64)
+     return false;
+
+   const unsigned NumElts = VT.getVectorNumElements();
+   // The first index of a reversed block is (elements per block - 1). If it
+   // is UNDEF, optimistically assume the block size that was asked for.
+   const unsigned BlockElts = M[0] < 0 ? BlockSize / EltSz : M[0] + 1;
+
+   // The block must span more than one element and match the requested size.
+   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+     return false;
+
+   for (unsigned Pos = 0; Pos != NumElts; ++Pos) {
+     const int Idx = M[Pos];
+     if (Idx < 0)
+       continue; // UNDEF indices match anything.
+
+     // Mirror the position within its block.
+     const unsigned Within = Pos % BlockElts;
+     const unsigned BlockStart = Pos - Within;
+     const unsigned Mirrored = BlockStart + (BlockElts - 1 - Within);
+     if (static_cast<unsigned>(Idx) != Mirrored)
+       return false;
+   }
+
+   return true;
+ }
+
+ // Check whether a shuffle can be done with VTBL. Only <8 x i8> shuffles are
+ // handled. An out-of-range mask index puts 0 into the result, so nearly any
+ // 8-element mask is acceptable.
+ static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
+   if (VT != MVT::v8i8)
+     return false;
+   return M.size() == 8;
+ }
+
+ // Checks whether the shuffle mask represents a vector transpose (VTRN): each
+ // pair of mask elements must name the same index in the two source vectors,
+ // and the expected index advances by 2 from pair to pair.
+ // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
+ //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
+ //  v2={e,f,g,h}
+ // WhichResult is the offset added to every expected index, according to which
+ // of the two transpose results the mask describes.
+ //
+ // The transpose can be written either as two shuffles:
+ //   result1 = shufflevector v1, v2, result1_shuffle_mask
+ //   result2 = shufflevector v1, v2, result2_shuffle_mask
+ // where v1/v2 and the masks all have the same number of elements (WhichResult
+ // then reports which result was matched), or as a single shuffle:
+ //   results = shufflevector v1, v2, shuffle_mask
+ // where both results come back in one vector and the mask is twice as long as
+ // v1/v2. In that form the low and high halves of the mask are checked as if
+ // they were the two separate masks, and WhichResult is always 0 on success.
+ static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+   if (VT.getScalarSizeInBits() == 64)
+     return false;
+
+   const unsigned NumElts = VT.getVectorNumElements();
+   const bool BothResults = M.size() == NumElts * 2;
+   if (M.size() != NumElts && !BothResults)
+     return false;
+
+   // True when a defined mask index differs from the index it should have.
+   auto Mismatch = [](int Idx, unsigned Want) {
+     return Idx >= 0 && static_cast<unsigned>(Idx) != Want;
+   };
+
+   // With a double-length mask, each half is checked against its own
+   // WhichResult value.
+   // FIXME: A mask with only even values will be rejected in case the first
+   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
+   // M[0] is used to determine WhichResult
+   for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
+     if (BothResults)
+       WhichResult = Base / NumElts;
+     else
+       WhichResult = M[Base] == 0 ? 0 : 1;
+
+     for (unsigned Pair = 0; Pair < NumElts; Pair += 2) {
+       const unsigned FromFirst = Pair + WhichResult;
+       const unsigned FromSecond = Pair + NumElts + WhichResult;
+       if (Mismatch(M[Base + Pair], FromFirst) ||
+           Mismatch(M[Base + Pair + 1], FromSecond))
+         return false;
+     }
+   }
+
+   if (BothResults)
+     WhichResult = 0;
+
+   return true;
+ }
+
+/// isVTRN_v_undef_Mask - Variant of isVTRNMask for the canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where both entries
+/// of every pair refer to the first (only) input.
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+/// \p M may be as long as \p VT or twice as long; in the latter case each half
+/// is checked against its own result and \p WhichResult ends up 0 on success.
+static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  if (VT.getScalarSizeInBits() == 64)
+    return false;
+
+  const unsigned NumElts = VT.getVectorNumElements();
+  const bool BothResultsPacked = M.size() == NumElts * 2;
+  if (M.size() != NumElts && !BothResultsPacked)
+    return false;
+
+  // True when mask entry Pos is defined (non-negative) and differs from Want.
+  auto Mismatch = [&M](unsigned Pos, unsigned Want) {
+    return M[Pos] >= 0 && (unsigned)M[Pos] != Want;
+  };
+
+  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
+    WhichResult = BothResultsPacked ? Base / NumElts : (M[Base] == 0 ? 0 : 1);
+    for (unsigned Lane = 0; Lane < NumElts; Lane += 2) {
+      // Both slots of the pair must name the same lane of the single input.
+      const unsigned Want = Lane + WhichResult;
+      if (Mismatch(Base + Lane, Want))
+        return false;
+      if (Mismatch(Base + Lane + 1, Want))
+        return false;
+    }
+  }
+
+  if (BothResultsPacked)
+    WhichResult = 0;
+
+  return true;
+}
+
+// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
+// that the mask elements are either all even and in steps of size 2 or all odd
+// and in steps of size 2.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
+//  v2={e,f,g,h}
+//
+// \p M may have as many entries as \p VT has elements (a single result, which
+// is reported through \p WhichResult) or twice as many (both results packed
+// into one vector: the low half must be result 0 and the high half result 1,
+// and \p WhichResult is set to 0 on success). Undefined (negative) mask
+// entries match anything.
+//
+// Returns false for 64-bit elements, and for 64-bit vectors of 32-bit elements
+// where VUZP.32 is only an alias for VTRN.32.
+static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  const bool BothResultsPacked = M.size() == NumElts * 2;
+  if (M.size() != NumElts && !BothResultsPacked)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    // For a double-length mask the result being checked is fixed by the
+    // position of the half, as in isVTRNMask. Deriving it from M[i] would
+    // accept masks whose halves are swapped (e.g. [1, 3, 5, 7, 0, 2, 4, 6])
+    // while still reporting WhichResult == 0, and would reject valid masks
+    // whose first entry is undefined.
+    if (BothResultsPacked)
+      WhichResult = i / NumElts;
+    else
+      WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; ++j) {
+      if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
+        return false;
+    }
+  }
+
+  if (BothResultsPacked)
+    WhichResult = 0;
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+///
+/// \p M may have as many entries as \p VT has elements (a single result, which
+/// is reported through \p WhichResult) or twice as many (the low half must be
+/// result 0 and the high half result 1; \p WhichResult is set to 0 on
+/// success). Undefined (negative) mask entries match anything.
+///
+/// Returns false for 64-bit elements, and for 64-bit vectors of 32-bit
+/// elements where VUZP.32 is only an alias for VTRN.32.
+static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  const bool BothResultsPacked = M.size() == NumElts * 2;
+  if (M.size() != NumElts && !BothResultsPacked)
+    return false;
+
+  unsigned Half = NumElts / 2;
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    // For a double-length mask the result being checked is fixed by the
+    // position of the half, as in isVTRNMask; deriving it from M[i] would
+    // accept masks with swapped halves while reporting WhichResult == 0.
+    if (BothResultsPacked)
+      WhichResult = i / NumElts;
+    else
+      WhichResult = M[i] == 0 ? 0 : 1;
+    // Each half of the (sub)mask repeats the same even/odd lane sequence of
+    // the single input.
+    for (unsigned j = 0; j < NumElts; j += Half) {
+      unsigned Idx = WhichResult;
+      for (unsigned k = 0; k < Half; ++k) {
+        int MIdx = M[i + j + k];
+        if (MIdx >= 0 && (unsigned) MIdx != Idx)
+          return false;
+        Idx += 2;
+      }
+    }
+  }
+
+  if (BothResultsPacked)
+    WhichResult = 0;
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
+// that pairs of elements of the shufflemask represent the same index in each
+// vector incrementing sequentially through the vectors.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
+//  v2={e,f,g,h}
+//
+// \p M may have as many entries as \p VT has elements (a single result, which
+// is reported through \p WhichResult) or twice as many (both results packed
+// into one vector: the low half must be result 0 and the high half result 1,
+// and \p WhichResult is set to 0 on success). Undefined (negative) mask
+// entries match anything.
+//
+// Returns false for 64-bit elements, and for 64-bit vectors of 32-bit elements
+// where VZIP.32 is only an alias for VTRN.32.
+static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  const bool BothResultsPacked = M.size() == NumElts * 2;
+  if (M.size() != NumElts && !BothResultsPacked)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    // For a double-length mask the result being checked is fixed by the
+    // position of the half, as in isVTRNMask. Deriving it from M[i] would
+    // accept masks whose halves are swapped (e.g. [2, 6, 3, 7, 0, 4, 1, 5])
+    // while still reporting WhichResult == 0, and would reject valid masks
+    // whose first entry is undefined.
+    if (BothResultsPacked)
+      WhichResult = i / NumElts;
+    else
+      WhichResult = M[i] == 0 ? 0 : 1;
+    // Result 0 zips the low halves of the inputs, result 1 the high halves.
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
+        return false;
+      Idx += 1;
+    }
+  }
+
+  if (BothResultsPacked)
+    WhichResult = 0;
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+///
+/// \p M may have as many entries as \p VT has elements (a single result, which
+/// is reported through \p WhichResult) or twice as many (the low half must be
+/// result 0 and the high half result 1; \p WhichResult is set to 0 on
+/// success). Undefined (negative) mask entries match anything.
+///
+/// Returns false for 64-bit elements, and for 64-bit vectors of 32-bit
+/// elements where VZIP.32 is only an alias for VTRN.32.
+static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  const bool BothResultsPacked = M.size() == NumElts * 2;
+  if (M.size() != NumElts && !BothResultsPacked)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    // For a double-length mask the result being checked is fixed by the
+    // position of the half, as in isVTRNMask; deriving it from M[i] would
+    // accept masks with swapped halves while reporting WhichResult == 0.
+    if (BothResultsPacked)
+      WhichResult = i / NumElts;
+    else
+      WhichResult = M[i] == 0 ? 0 : 1;
+    // Result 0 starts at lane 0, result 1 at the middle lane of the input.
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
+        return false;
+      Idx += 1;
+    }
+  }
+
+  if (BothResultsPacked)
+    WhichResult = 0;
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// Classify \p ShuffleMask as one of the NEON two-result shuffles (VTRN, VUZP,
+/// VZIP). The two-input mask forms are tried first, then the "v, undef" forms;
+/// \p isV_UNDEF reports which family matched (it is left true when nothing
+/// matches) and \p WhichResult is whatever the last predicate tried stored.
+/// \return the matching ARMISD opcode, or 0 if the mask is none of them.
+static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
+                                           unsigned &WhichResult,
+                                           bool &isV_UNDEF) {
+  typedef bool (*MaskPredicate)(ArrayRef<int>, EVT, unsigned &);
+  struct Candidate {
+    MaskPredicate Matches;
+    unsigned Opcode;
+  };
+
+  // Predicates are listed in the order in which they must be tried.
+  const Candidate TwoInputForms[] = {
+    { isVTRNMask, ARMISD::VTRN },
+    { isVUZPMask, ARMISD::VUZP },
+    { isVZIPMask, ARMISD::VZIP },
+  };
+  const Candidate SingleInputForms[] = {
+    { isVTRN_v_undef_Mask, ARMISD::VTRN },
+    { isVUZP_v_undef_Mask, ARMISD::VUZP },
+    { isVZIP_v_undef_Mask, ARMISD::VZIP },
+  };
+
+  isV_UNDEF = false;
+  for (const Candidate &C : TwoInputForms)
+    if (C.Matches(ShuffleMask, VT, WhichResult))
+      return C.Opcode;
+
+  isV_UNDEF = true;
+  for (const Candidate &C : SingleInputForms)
+    if (C.Matches(ShuffleMask, VT, WhichResult))
+      return C.Opcode;
+
+  return 0;
+}
+
+/// \return true if \p M describes a full reversal of a vector of type \p VT,
+/// treating undefined (negative) mask entries as wildcards.
+static bool isReverseMask(ArrayRef<int> M, EVT VT) {
+  const unsigned NumElts = VT.getVectorNumElements();
+  // The mask must cover exactly one vector's worth of lanes.
+  if (M.size() != NumElts)
+    return false;
+
+  // Look for <15, ..., 3, -1, 1, 0>: position Pos must hold the mirrored lane.
+  int Mirrored = (int) (NumElts - 1);
+  for (unsigned Pos = 0; Pos != NumElts; ++Pos, --Mirrored) {
+    if (M[Pos] < 0)
+      continue; // undefined entry matches anything
+    if (M[Pos] != Mirrored)
+      return false;
+  }
+
+  return true;
+}
+
+// If N is an integer constant that can be moved into a register in one
+// instruction, return an i32 constant SDValue holding it (will become a MOV
+// instruction). Otherwise return a null SDValue.
+//
+// For Thumb1-only subtargets the value (or its complement) must be at most
+// 255; otherwise the value (or its complement) must be accepted by
+// ARM_AM::getSOImmVal.
+// NOTE(review): the complement is computed on the 64-bit zero-extended value,
+// so for a 32-bit constant "~Val <= 255" looks like it can never hold --
+// confirm whether that is intended.
+static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
+                                     const ARMSubtarget *ST, const SDLoc &dl) {
+  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+  if (!CN)
+    return SDValue();
+
+  const uint64_t Val = CN->getZExtValue();
+  const bool FitsOneInstr =
+      ST->isThumb1Only()
+          ? (Val <= 255 || ~Val <= 255)
+          : (ARM_AM::getSOImmVal(Val) != -1 ||
+             ARM_AM::getSOImmVal(~Val) != -1);
+  if (!FitsOneInstr)
+    return SDValue();
+
+  return DAG.getConstant(Val, dl, MVT::i32);
+}
+
+// Custom lowering of a BUILD_VECTOR node. The strategies below are tried in
+// order: constant splats through VMOV / VMVN / vmov.f32 immediates,
+// SCALAR_TO_VECTOR when only lane 0 is defined, VDUP / VDUPLANE of a dominant
+// value followed by per-lane inserts, reconstruction as a vector shuffle,
+// splitting a 128-bit vector into two recursively lowered halves,
+// ARMISD::BUILD_VECTOR for elements of 32 bits or more, and finally a chain of
+// INSERT_VECTOR_ELT nodes.
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.
+SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) const {
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+ // A splat whose every bit is undefined is simply UNDEF.
+ if (SplatUndef.isAllOnesValue())
+ return DAG.getUNDEF(VT);
+
+ if (SplatBitSize <= 64) {
+ // Check if an immediate VMOV works.
+ EVT VmovVT;
+ SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+ SplatUndef.getZExtValue(), SplatBitSize,
+ DAG, dl, VmovVT, VT.is128BitVector(),
+ VMOVModImm);
+ if (Val.getNode()) {
+ SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+ }
+
+ // Try an immediate VMVN.
+ uint64_t NegatedImm = (~SplatBits).getZExtValue();
+ Val = isNEONModifiedImm(NegatedImm,
+ SplatUndef.getZExtValue(), SplatBitSize,
+ DAG, dl, VmovVT, VT.is128BitVector(),
+ VMVNModImm);
+ if (Val.getNode()) {
+ SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+ }
+
+ // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
+ if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
+ int ImmVal = ARM_AM::getFP32Imm(SplatBits);
+ if (ImmVal != -1) {
+ SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
+ return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
+ }
+ }
+ }
+ }
+
+ // Scan through the operands to see if only one value is used.
+ //
+ // As an optimisation, even if more than one value is used it may be more
+ // profitable to splat with one value then change some lanes.
+ //
+ // Heuristically we decide to do this if the vector has a "dominant" value,
+ // defined as splatted to more than half of the lanes.
+ unsigned NumElts = VT.getVectorNumElements();
+ bool isOnlyLowElement = true;
+ bool usesOnlyOneValue = true;
+ bool hasDominantValue = false;
+ bool isConstant = true;
+
+ // Map of the number of times a particular SDValue appears in the
+ // element list.
+ DenseMap<SDValue, unsigned> ValueCounts;
+ SDValue Value;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ // Undefined lanes do not take part in any of the statistics below.
+ if (V.isUndef())
+ continue;
+ if (i > 0)
+ isOnlyLowElement = false;
+ if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+ isConstant = false;
+
+ // insert() is a no-op when V is already present, so Count below refers to
+ // the running tally for V.
+ ValueCounts.insert(std::make_pair(V, 0));
+ unsigned &Count = ValueCounts[V];
+
+ // Is this value dominant? (takes up more than half of the lanes)
+ if (++Count > (NumElts / 2)) {
+ hasDominantValue = true;
+ Value = V;
+ }
+ }
+ if (ValueCounts.size() != 1)
+ usesOnlyOneValue = false;
+ // Without a dominant value, fall back to an arbitrary defined operand.
+ if (!Value.getNode() && ValueCounts.size() > 0)
+ Value = ValueCounts.begin()->first;
+
+ // Every operand was undefined.
+ if (ValueCounts.size() == 0)
+ return DAG.getUNDEF(VT);
+
+ // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
+ // Keep going if we are hitting this case.
+ if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+ unsigned EltSize = VT.getScalarSizeInBits();
+
+ // Use VDUP for non-constant splats. For f32 constant splats, reduce to
+ // i32 and try again.
+ if (hasDominantValue && EltSize <= 32) {
+ if (!isConstant) {
+ SDValue N;
+
+ // If we are VDUPing a value that comes directly from a vector, that will
+ // cause an unnecessary move to and from a GPR, where instead we could
+ // just use VDUPLANE. We can only do this if the lane being extracted
+ // is at a constant index, as the VDUP from lane instructions only have
+ // constant-index forms.
+ ConstantSDNode *constIndex;
+ if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
+ // We need to create a new undef vector to use for the VDUPLANE if the
+ // size of the vector from which we get the value is different than the
+ // size of the vector that we need to create. We will insert the element
+ // such that the register coalescer will remove unnecessary copies.
+ if (VT != Value->getOperand(0).getValueType()) {
+ // Wrap the source lane number into the range of the result type.
+ unsigned index = constIndex->getAPIntValue().getLimitedValue() %
+ VT.getVectorNumElements();
+ N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
+ Value, DAG.getConstant(index, dl, MVT::i32)),
+ DAG.getConstant(index, dl, MVT::i32));
+ } else
+ N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+ Value->getOperand(0), Value->getOperand(1));
+ } else
+ N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+ if (!usesOnlyOneValue) {
+ // The dominant value was splatted as 'N', but we now have to insert
+ // all differing elements.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ if (Op.getOperand(I) == Value)
+ continue;
+ SmallVector<SDValue, 3> Ops;
+ Ops.push_back(N);
+ Ops.push_back(Op.getOperand(I));
+ Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
+ N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
+ }
+ }
+ return N;
+ }
+ // All-constant floating-point vector: bitcast every operand to i32 and
+ // retry the lowering on the resulting integer BUILD_VECTOR.
+ if (VT.getVectorElementType().isFloatingPoint()) {
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+ Op.getOperand(i)));
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
+ Val = LowerBUILD_VECTOR(Val, DAG, ST);
+ if (Val.getNode())
+ return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+ }
+ // Constant splat of a value that fits a single move: VDUP it.
+ if (usesOnlyOneValue) {
+ SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+ if (isConstant && Val.getNode())
+ return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+ }
+ }
+
+ // If all elements are constants and the case above didn't get hit, fall back
+ // to the default expansion, which will generate a load from the constant
+ // pool.
+ if (isConstant)
+ return SDValue();
+
+ // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+ if (NumElts >= 4) {
+ SDValue shuffle = ReconstructShuffle(Op, DAG);
+ if (shuffle != SDValue())
+ return shuffle;
+ }
+
+ if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
+ // If we haven't found an efficient lowering, try splitting a 128-bit vector
+ // into two 64-bit vectors; we might discover a better way to lower it.
+ SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
+ EVT ExtVT = VT.getVectorElementType();
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
+ SDValue Lower =
+ DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
+ if (Lower.getOpcode() == ISD::BUILD_VECTOR)
+ Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
+ SDValue Upper = DAG.getBuildVector(
+ HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
+ if (Upper.getOpcode() == ISD::BUILD_VECTOR)
+ Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
+ // Only use the split when both halves were lowered successfully.
+ if (Lower && Upper)
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
+ }
+
+ // Vectors with 32- or 64-bit elements can be built by directly assigning
+ // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
+ // will be legalized.
+ if (EltSize >= 32) {
+ // Do the expansion with floating-point types, since that is what the VFP
+ // registers are defined to use, and since i64 is not legal.
+ EVT EltVT = EVT::getFloatingPointVT(EltSize);
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
+ SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+ }
+
+ // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+ // know the default expansion would otherwise fall back on something even
+ // worse. For a vector with one or two non-undef values, that's
+ // scalar_to_vector for the elements followed by a shuffle (provided the
+ // shuffle is valid for the target) and materialization element by element
+ // on the stack followed by a load for everything else.
+ if (!isConstant && !usesOnlyOneValue) {
+ SDValue Vec = DAG.getUNDEF(VT);
+ for (unsigned i = 0 ; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.isUndef())
+ continue;
+ SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+ }
+ return Vec;
+ }
+
+ // Nothing applied: let the default expansion handle the node.
+ return SDValue();
+}
+
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+//
+// Op must be a BUILD_VECTOR. Every defined operand has to be an
+// EXTRACT_VECTOR_ELT with a constant index, drawn from at most two source
+// vectors. Sources whose total width is half or double that of the result are
+// padded with UNDEF or narrowed (EXTRACT_SUBVECTOR / VEXT), sources with a
+// different element type are bitcast, and the resulting mask must pass
+// isShuffleMaskLegal. Returns the shuffle bitcast to the result type, or a
+// null SDValue when any of these conditions fails.
+SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Book-keeping for one vector that elements are extracted from.
+ struct ShuffleSourceInfo {
+ // The original source vector.
+ SDValue Vec;
+ // Smallest and largest lane index extracted from Vec.
+ unsigned MinElt;
+ unsigned MaxElt;
+
+ // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+ // be compatible with the shuffle we intend to construct. As a result
+ // ShuffleVec will be some sliding window into the original Vec.
+ SDValue ShuffleVec;
+
+ // Code should guarantee that element i in Vec starts at element "WindowBase
+ // + i * WindowScale in ShuffleVec".
+ int WindowBase;
+ int WindowScale;
+
+ // Lets find() locate an entry by its original source vector.
+ bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+ ShuffleSourceInfo(SDValue Vec)
+ : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+ WindowScale(1) {}
+ };
+
+ // First gather all vectors used as an immediate source for this BUILD_VECTOR
+ // node.
+ SmallVector<ShuffleSourceInfo, 2> Sources;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.isUndef())
+ continue;
+ else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+ // A shuffle can only come from building a vector from various
+ // elements of other vectors.
+ return SDValue();
+ } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
+ // Furthermore, shuffles require a constant mask, whereas extractelts
+ // accept variable indices.
+ return SDValue();
+ }
+
+ // Add this element source to the list if it's not already there.
+ SDValue SourceVec = V.getOperand(0);
+ auto Source = find(Sources, SourceVec);
+ if (Source == Sources.end())
+ Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
+
+ // Update the minimum and maximum lane number seen.
+ unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ Source->MinElt = std::min(Source->MinElt, EltNo);
+ Source->MaxElt = std::max(Source->MaxElt, EltNo);
+ }
+
+ // Currently only do something sane when at most two source vectors
+ // are involved.
+ if (Sources.size() > 2)
+ return SDValue();
+
+ // Find out the smallest element size among result and two sources, and use
+ // it as element size to build the shuffle_vector.
+ EVT SmallestEltTy = VT.getVectorElementType();
+ for (auto &Source : Sources) {
+ EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
+ if (SrcEltTy.bitsLT(SmallestEltTy))
+ SmallestEltTy = SrcEltTy;
+ }
+ // Number of shuffle lanes that make up one element of the result.
+ unsigned ResMultiplier =
+ VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
+ // From here on NumElts counts lanes of the shuffle type, not of VT.
+ NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
+
+ // If the source vector is too wide or too narrow, we may nevertheless be able
+ // to construct a compatible shuffle either by concatenating it with UNDEF or
+ // extracting a suitable range of elements.
+ for (auto &Src : Sources) {
+ EVT SrcVT = Src.ShuffleVec.getValueType();
+
+ if (SrcVT.getSizeInBits() == VT.getSizeInBits())
+ continue;
+
+ // This stage of the search produces a source with the same element type as
+ // the original, but with a total width matching the BUILD_VECTOR output.
+ EVT EltVT = SrcVT.getVectorElementType();
+ unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+ EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
+
+ if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+ // Only a source of exactly half the result width can be padded.
+ if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
+ return SDValue();
+ // We can pad out the smaller vector for free, so if it's part of a
+ // shuffle...
+ Src.ShuffleVec =
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+ DAG.getUNDEF(Src.ShuffleVec.getValueType()));
+ continue;
+ }
+
+ // Only a source of exactly twice the result width can be narrowed.
+ if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
+ return SDValue();
+
+ if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
+ // Span too large for a VEXT to cope
+ return SDValue();
+ }
+
+ if (Src.MinElt >= NumSrcElts) {
+ // The extraction can just take the second half
+ Src.ShuffleVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(NumSrcElts, dl, MVT::i32));
+ Src.WindowBase = -NumSrcElts;
+ } else if (Src.MaxElt < NumSrcElts) {
+ // The extraction can just take the first half
+ Src.ShuffleVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, dl, MVT::i32));
+ } else {
+ // An actual VEXT is needed
+ SDValue VEXTSrc1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue VEXTSrc2 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(NumSrcElts, dl, MVT::i32));
+
+ Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
+ VEXTSrc2,
+ DAG.getConstant(Src.MinElt, dl, MVT::i32));
+ Src.WindowBase = -Src.MinElt;
+ }
+ }
+
+ // Another possible incompatibility occurs from the vector element types. We
+ // can fix this by bitcasting the source vectors to the same type we intend
+ // for the shuffle.
+ for (auto &Src : Sources) {
+ EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+ if (SrcEltTy == SmallestEltTy)
+ continue;
+ assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+ Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+ // Each original element now spans WindowScale shuffle lanes, so the window
+ // origin has to be rescaled as well.
+ Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ Src.WindowBase *= Src.WindowScale;
+ }
+
+ // Final sanity check before we try to actually produce a shuffle.
+ DEBUG(
+ for (auto Src : Sources)
+ assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+ );
+
+ // The stars all align, our next step is to produce the mask for the shuffle.
+ // Lanes that are never written below stay -1 (undefined).
+ SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+ int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+ SDValue Entry = Op.getOperand(i);
+ if (Entry.isUndef())
+ continue;
+
+ auto Src = find(Sources, Entry.getOperand(0));
+ int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+ // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+ // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
+ // segment.
+ EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+ int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
+ VT.getScalarSizeInBits());
+ int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+ // This source is expected to fill ResMultiplier lanes of the final shuffle,
+ // starting at the appropriate offset.
+ int *LaneMask = &Mask[i * ResMultiplier];
+
+ // Lane inside the source's window, then offset by NumElts for the second
+ // source so that it indexes the second shuffle operand.
+ int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+ ExtractBase += NumElts * (Src - Sources.begin());
+ for (int j = 0; j < LanesDefined; ++j)
+ LaneMask[j] = ExtractBase + j;
+ }
+
+ // Final check before we try to produce nonsense...
+ if (!isShuffleMaskLegal(Mask, ShuffleVT))
+ return SDValue();
+
+ // We can't handle more than two sources. This should have already
+ // been checked before this point.
+ assert(Sources.size() <= 2 && "Too many sources!");
+
+ // Operands without a corresponding source stay UNDEF.
+ SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+ for (unsigned i = 0; i < Sources.size(); ++i)
+ ShuffleOps[i] = Sources[i].ShuffleVec;
+
+ SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+ ShuffleOps[1], Mask);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+///
+/// This implementation accepts a mask \p M on type \p VT when a four-element
+/// 64/128-bit shuffle has a perfect-shuffle table cost of at most 4, when the
+/// elements are at least 32 bits wide, or when the mask matches one of the
+/// splat / VREV / VEXT / VTBL / two-result / full-reverse patterns.
+bool
+ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                      EVT VT) const {
+  const bool IsFourLaneVector =
+      VT.getVectorNumElements() == 4 &&
+      (VT.is128BitVector() || VT.is64BitVector());
+  if (IsFourLaneVector) {
+    // Compute the index in the perfect shuffle table: the four mask entries
+    // are base-9 digits, with 8 standing for an undefined entry.
+    unsigned PFTableIndex = 0;
+    for (unsigned Lane = 0; Lane != 4; ++Lane) {
+      unsigned Digit = M[Lane] < 0 ? 8 : M[Lane];
+      PFTableIndex = PFTableIndex * 9 + Digit;
+    }
+
+    // The cost is kept in the bits above bit 29 of the table entry.
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    if ((PFEntry >> 30) <= 4)
+      return true;
+  }
+
+  if (VT.getScalarSizeInBits() >= 32)
+    return true;
+
+  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT))
+    return true;
+
+  const unsigned RevBlockSizes[] = { 64, 32, 16 };
+  for (unsigned BlockSize : RevBlockSizes)
+    if (isVREVMask(M, VT, BlockSize))
+      return true;
+
+  // The out-parameters of the matchers below are not needed here.
+  bool ReverseVEXT;
+  unsigned Imm;
+  if (isVEXTMask(M, VT, ReverseVEXT, Imm))
+    return true;
+
+  if (isVTBLMask(M, VT))
+    return true;
+
+  unsigned WhichResult;
+  bool isV_UNDEF;
+  if (isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))
+    return true;
+
+  return (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT);
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+///
+/// \p PFEntry packs an operation number (bits 26-29) and two 13-bit table
+/// indices naming the sub-shuffles that produce the operation's left and right
+/// operands; those are expanded recursively from \p LHS and \p RHS.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      const SDLoc &dl) {
+  enum {
+    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VREV,
+    OP_VDUP0,
+    OP_VDUP1,
+    OP_VDUP2,
+    OP_VDUP3,
+    OP_VEXT1,
+    OP_VEXT2,
+    OP_VEXT3,
+    OP_VUZPL, // VUZP, left result
+    OP_VUZPR, // VUZP, right result
+    OP_VZIPL, // VZIP, left result
+    OP_VZIPR, // VZIP, right result
+    OP_VTRNL, // VTRN, left result
+    OP_VTRNR  // VTRN, right result
+  };
+
+  // Unpack the table entry.
+  const unsigned IDMask = (1 << 13)-1;
+  const unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  const unsigned LHSID = (PFEntry >> 13) & IDMask;
+  const unsigned RHSID = (PFEntry >>  0) & IDMask;
+
+  // A copy is the recursion's base case: it selects one of the two inputs.
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  // Expand both operands first (left before right).
+  SDValue OpLHS =
+      GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  SDValue OpRHS =
+      GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+  EVT VT = OpLHS.getValueType();
+
+  // Builds a two-result node on (OpLHS, OpRHS) and picks result ResNo.
+  auto TwoResult = [&](unsigned Opcode, unsigned ResNo) {
+    return DAG.getNode(Opcode, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS)
+        .getValue(ResNo);
+  };
+
+  if (OpNum == OP_VREV) {
+    // VREV divides the vector in half and swaps within the half.
+    if (VT.getVectorElementType() == MVT::i32 ||
+        VT.getVectorElementType() == MVT::f32)
+      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+    // vrev <4 x i16> -> VREV32
+    if (VT.getVectorElementType() == MVT::i16)
+      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
+    // vrev <4 x i8> -> VREV16
+    assert(VT.getVectorElementType() == MVT::i8);
+    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
+  }
+
+  if (OpNum >= OP_VDUP0 && OpNum <= OP_VDUP3) {
+    SDValue Lane = DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32);
+    return DAG.getNode(ARMISD::VDUPLANE, dl, VT, OpLHS, Lane);
+  }
+
+  if (OpNum >= OP_VEXT1 && OpNum <= OP_VEXT3) {
+    SDValue Amount = DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32);
+    return DAG.getNode(ARMISD::VEXT, dl, VT, OpLHS, OpRHS, Amount);
+  }
+
+  if (OpNum == OP_VUZPL || OpNum == OP_VUZPR)
+    return TwoResult(ARMISD::VUZP, OpNum-OP_VUZPL);
+
+  if (OpNum == OP_VZIPL || OpNum == OP_VZIPR)
+    return TwoResult(ARMISD::VZIP, OpNum-OP_VZIPL);
+
+  if (OpNum == OP_VTRNL || OpNum == OP_VTRNR)
+    return TwoResult(ARMISD::VTRN, OpNum-OP_VTRNL);
+
+  llvm_unreachable("Unknown shuffle opcode!");
+}
+
+/// Lower a v8i8 shuffle \p Op with mask \p ShuffleMask to a table lookup:
+/// ARMISD::VTBL1 when the second operand is undefined, ARMISD::VTBL2
+/// otherwise. The mask is passed along as a v8i8 BUILD_VECTOR of constants.
+static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
+                                       ArrayRef<int> ShuffleMask,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue Table0 = Op.getOperand(0);
+  SDValue Table1 = Op.getOperand(1);
+
+  // Turn every mask entry into an i32 constant operand of the index vector.
+  SmallVector<SDValue, 8> IndexOps;
+  for (int MaskIdx : ShuffleMask)
+    IndexOps.push_back(DAG.getConstant(MaskIdx, DL, MVT::i32));
+  SDValue IndexVec = DAG.getBuildVector(MVT::v8i8, DL, IndexOps);
+
+  // A single table suffices when the second input is undefined.
+  if (Table1.isUndef())
+    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, Table0, IndexVec);
+
+  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, Table0, Table1, IndexVec);
+}
+
+/// Lower a full-vector reverse of a v16i8 or v8i16 value (operand 0 of \p Op)
+/// as an ARMISD::VREV64 followed by an ARMISD::VEXT of the result with itself.
+static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
+                                                      SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue Src = Op.getOperand(0);
+  EVT VT = Src.getValueType();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
+         "Expect an v8i16/v16i8 type");
+
+  // Step 1: VREV64 -- presumably reverses the elements inside each 64-bit
+  // doubleword, leaving the two doublewords in their original places.
+  SDValue Reversed = DAG.getNode(ARMISD::VREV64, DL, VT, Src);
+
+  // Step 2: VEXT by half the lane count (8 bytes for v16i8, 4 halfwords for
+  // v8i16) on (Reversed, Reversed), which moves the first doubleword to the
+  // top and the last doubleword to the bottom.
+  const unsigned HalfLanes = (VT == MVT::v16i8) ? 8 : 4;
+  SDValue Amount = DAG.getConstant(HalfLanes, DL, MVT::i32);
+  return DAG.getNode(ARMISD::VEXT, DL, VT, Reversed, Reversed, Amount);
+}
+
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+ // Convert shuffles that are directly supported on NEON to target-specific
+ // DAG nodes, instead of keeping them as shuffles and matching them again
+ // during code selection. This is more efficient and avoids the possibility
+ // of inconsistencies between legalization and selection.
+ // FIXME: floating-point vectors should be canonicalized to integer vectors
+ // of the same type so that they get CSEd properly.
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+
+ unsigned EltSize = VT.getScalarSizeInBits();
+ if (EltSize <= 32) {
+ if (SVN->isSplat()) {
+ int Lane = SVN->getSplatIndex();
+ // If this is undef splat, generate it via "just" vdup, if possible.
+ if (Lane == -1) Lane = 0;
+
+ // Test if V1 is a SCALAR_TO_VECTOR.
+ if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+ }
+ // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
+ // (and probably will turn into a SCALAR_TO_VECTOR once legalization
+ // reaches it).
+ if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
+ !isa<ConstantSDNode>(V1.getOperand(0))) {
+ bool IsScalarToVector = true;
+ for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
+ if (!V1.getOperand(i).isUndef()) {
+ IsScalarToVector = false;
+ break;
+ }
+ if (IsScalarToVector)
+ return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+ }
+ return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
+ DAG.getConstant(Lane, dl, MVT::i32));
+ }
+
+ bool ReverseVEXT;
+ unsigned Imm;
+ if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+ if (ReverseVEXT)
+ std::swap(V1, V2);
+ return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
+ DAG.getConstant(Imm, dl, MVT::i32));
+ }
+
+ if (isVREVMask(ShuffleMask, VT, 64))
+ return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
+ if (isVREVMask(ShuffleMask, VT, 32))
+ return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
+ if (isVREVMask(ShuffleMask, VT, 16))
+ return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+
+ if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+ return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
+ DAG.getConstant(Imm, dl, MVT::i32));
+ }
+
+ // Check for Neon shuffles that modify both input vectors in place.
+ // If both results are used, i.e., if there are two shuffles with the same
+ // source operands and with masks corresponding to both results of one of
+ // these operations, DAG memoization will ensure that a single node is
+ // used for both shuffles.
+ unsigned WhichResult;
+ bool isV_UNDEF;
+ if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
+ ShuffleMask, VT, WhichResult, isV_UNDEF)) {
+ if (isV_UNDEF)
+ V2 = V1;
+ return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
+ .getValue(WhichResult);
+ }
+
+ // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
+ // shuffles that produce a result larger than their operands with:
+ // shuffle(concat(v1, undef), concat(v2, undef))
+ // ->
+ // shuffle(concat(v1, v2), undef)
+ // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
+ //
+ // This is useful in the general case, but there are special cases where
+ // native shuffles produce larger results: the two-result ops.
+ //
+ // Look through the concat when lowering them:
+ // shuffle(concat(v1, v2), undef)
+ // ->
+ // concat(VZIP(v1, v2):0, :1)
+ //
+ if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
+ SDValue SubV1 = V1->getOperand(0);
+ SDValue SubV2 = V1->getOperand(1);
+ EVT SubVT = SubV1.getValueType();
+
+ // We expect these to have been canonicalized to -1.
+ assert(all_of(ShuffleMask, [&](int i) {
+ return i < (int)VT.getVectorNumElements();
+ }) && "Unexpected shuffle index into UNDEF operand!");
+
+ if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
+ ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
+ if (isV_UNDEF)
+ SubV2 = SubV1;
+ assert((WhichResult == 0) &&
+ "In-place shuffle of concat can only have one result!");
+ SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
+ SubV1, SubV2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
+ Res.getValue(1));
+ }
+ }
+ }
+
+ // If the shuffle is not directly supported and it has 4 elements, use
+ // the PerfectShuffle-generated table to synthesize it from other shuffles.
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 4) {
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (ShuffleMask[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = ShuffleMask[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex =
+ PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ if (Cost <= 4)
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+
+ // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
+ if (EltSize >= 32) {
+ // Do the expansion with floating-point types, since that is what the VFP
+ // registers are defined to use, and since i64 is not legal.
+ EVT EltVT = EVT::getFloatingPointVT(EltSize);
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+ V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if (ShuffleMask[i] < 0)
+ Ops.push_back(DAG.getUNDEF(EltVT));
+ else
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+ ShuffleMask[i] < (int)NumElts ? V1 : V2,
+ DAG.getConstant(ShuffleMask[i] & (NumElts-1),
+ dl, MVT::i32)));
+ }
+ SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+ }
+
+ if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
+ return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
+
+ if (VT == MVT::v8i8)
+ if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
+ return NewOp;
+
+ return SDValue();
+}
+
+ /// Custom lowering hook for INSERT_VECTOR_ELT. The node is returned unchanged
+ /// when its lane index (operand 2) is a ConstantSDNode; otherwise an empty
+ /// SDValue is returned.
+ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+   // Only immediate lane indexes are legal for INSERT_VECTOR_ELT.
+   SDValue LaneIdx = Op.getOperand(2);
+   return isa<ConstantSDNode>(LaneIdx) ? Op : SDValue();
+ }
+
+ /// Custom lowering hook for EXTRACT_VECTOR_ELT. Returns an empty SDValue when
+ /// the lane index (operand 1) is not a ConstantSDNode. When the result type
+ /// is i32 and the source vector's scalar size is below 32 bits, the extract
+ /// is rewritten to ARMISD::VGETLANEu; otherwise the node is returned as-is.
+ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+   // Only immediate lane indexes are legal for EXTRACT_VECTOR_ELT.
+   SDValue LaneIdx = Op.getOperand(1);
+   if (!isa<ConstantSDNode>(LaneIdx))
+     return SDValue();
+
+   SDValue SrcVec = Op.getOperand(0);
+   const bool NeedsGetLane = Op.getValueType() == MVT::i32 &&
+                             SrcVec.getScalarValueSizeInBits() < 32;
+   if (!NeedsGetLane)
+     return Op;
+
+   SDLoc Loc(Op);
+   return DAG.getNode(ARMISD::VGETLANEu, Loc, MVT::i32, SrcVec, LaneIdx);
+ }
+
+ /// Lower a two-operand CONCAT_VECTORS through a v2f64 value: every non-undef
+ /// operand is bitcast to f64 and inserted at the lane matching its operand
+ /// position, and the resulting v2f64 is bitcast back to the node's own type.
+ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+   // The only time a CONCAT_VECTORS operation can have legal types is when
+   // two 64-bit vectors are concatenated to a 128-bit vector.
+   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
+          "unexpected CONCAT_VECTORS");
+   SDLoc Loc(Op);
+   SDValue Wide = DAG.getUNDEF(MVT::v2f64);
+   for (unsigned Half = 0; Half != 2; ++Half) {
+     SDValue Part = Op.getOperand(Half);
+     // An undef half simply leaves the corresponding lane undefined.
+     if (Part.isUndef())
+       continue;
+     SDValue PartAsF64 = DAG.getNode(ISD::BITCAST, Loc, MVT::f64, Part);
+     Wide = DAG.getNode(ISD::INSERT_VECTOR_ELT, Loc, MVT::v2f64, Wide,
+                        PartAsF64, DAG.getIntPtrConstant(Half, Loc));
+   }
+   return DAG.getNode(ISD::BITCAST, Loc, Op.getValueType(), Wide);
+ }
+
+ /// isExtendedBUILD_VECTOR - Return true if N is a constant BUILD_VECTOR whose
+ /// every element is representable in an integer type of half the element
+ /// width, interpreted as signed or unsigned according to isSigned. A v2i64
+ /// value that appears as a BITCAST of a v4i32 BUILD_VECTOR is also accepted
+ /// when the high 32-bit halves are the sign (or zero) extension of the lows.
+ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                    bool isSigned) {
+   EVT VT = N->getValueType(0);
+
+   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
+   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
+     SDNode *Src = N->getOperand(0).getNode();
+     if (Src->getValueType(0) != MVT::v4i32 ||
+         Src->getOpcode() != ISD::BUILD_VECTOR)
+       return false;
+
+     // Pick which i32 of each pair is the low half, based on endianness.
+     const unsigned LoIdx = DAG.getDataLayout().isBigEndian() ? 1 : 0;
+     const unsigned HiIdx = 1 - LoIdx;
+     auto *Lo0 = dyn_cast<ConstantSDNode>(Src->getOperand(LoIdx));
+     auto *Hi0 = dyn_cast<ConstantSDNode>(Src->getOperand(HiIdx));
+     auto *Lo1 = dyn_cast<ConstantSDNode>(Src->getOperand(LoIdx + 2));
+     auto *Hi1 = dyn_cast<ConstantSDNode>(Src->getOperand(HiIdx + 2));
+     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
+       return false;
+
+     if (!isSigned)
+       return Hi0->isNullValue() && Hi1->isNullValue();
+
+     // Signed: each high half must equal the sign bits of its low half.
+     return Hi0->getSExtValue() == (Lo0->getSExtValue() >> 32) &&
+            Hi1->getSExtValue() == (Lo1->getSExtValue() >> 32);
+   }
+
+   if (N->getOpcode() != ISD::BUILD_VECTOR)
+     return false;
+
+   const unsigned HalfSize = VT.getScalarSizeInBits() / 2;
+   for (unsigned Idx = 0, NumOps = N->getNumOperands(); Idx != NumOps; ++Idx) {
+     auto *C = dyn_cast<ConstantSDNode>(N->getOperand(Idx).getNode());
+     // Any non-constant element disqualifies the vector.
+     if (!C)
+       return false;
+     const bool FitsInHalf = isSigned ? isIntN(HalfSize, C->getSExtValue())
+                                      : isUIntN(HalfSize, C->getZExtValue());
+     if (!FitsInHalf)
+       return false;
+   }
+
+   return true;
+ }
+
+ /// isSignExtended - Return true if N is a SIGN_EXTEND node, a sign-extending
+ /// load, or a constant BUILD_VECTOR whose elements are sign-extended.
+ static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+   return N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N) ||
+          isExtendedBUILD_VECTOR(N, DAG, true);
+ }
+
+ /// isZeroExtended - Return true if N is a ZERO_EXTEND node, a zero-extending
+ /// load, or a constant BUILD_VECTOR whose elements are zero-extended.
+ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+   return N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N) ||
+          isExtendedBUILD_VECTOR(N, DAG, false);
+ }
+
+ /// Map a vector type narrower than 64 bits to the 64-bit vector type with the
+ /// same element count (v2i8/v2i16 -> v2i32, v4i8 -> v4i16). Types that are
+ /// already 64 bits or wider are returned unchanged; any other narrow type is
+ /// treated as unreachable.
+ static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+   if (OrigVT.getSizeInBits() >= 64)
+     return OrigVT;
+
+   assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+   switch (OrigVT.getSimpleVT().SimpleTy) {
+   case MVT::v4i8:
+     return MVT::v4i16;
+   case MVT::v2i8:
+   case MVT::v2i16:
+     return MVT::v2i32;
+   default:
+     break;
+   }
+   llvm_unreachable("Unexpected Vector Type");
+ }
+
+ /// AddRequiredExtensionForVMULL - VMULL needs 64-bit D-register operands. If
+ /// the pre-extension type OrigTy is narrower than 64 bits, wrap N in an
+ /// ExtOpcode node that widens it to the 64-bit type chosen by
+ /// getExtensionTo64Bits; otherwise return N untouched.
+ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
+                                             const EVT &OrigTy,
+                                             const EVT &ExtTy,
+                                             unsigned ExtOpcode) {
+   // N originally had type OrigTy and was then extended to ExtTy, which is
+   // expected to be 128 bits in total.
+   assert(ExtTy.is128BitVector() && "Unexpected extension size");
+
+   // Narrow sources must be widened to 64 bits to be usable by VMULL.
+   if (OrigTy.getSizeInBits() < 64)
+     return DAG.getNode(ExtOpcode, SDLoc(N), getExtensionTo64Bits(OrigTy), N);
+
+   return N;
+ }
+
+ /// SkipLoadExtensionForVMULL - Re-create the load LD so that it yields a
+ /// 64-bit vector. When the memory type is already at least 64 bits wide a
+ /// plain load of that type is emitted; otherwise an extending load (with LD's
+ /// own extension kind) to the 64-bit type from getExtensionTo64Bits is used.
+ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
+   const EVT MemVT = LD->getMemoryVT();
+   const EVT WideVT = getExtensionTo64Bits(MemVT);
+
+   if (WideVT != MemVT) {
+     // We need to create a zextload/sextload. We cannot just create a load
+     // followed by a zext/sext node because LowerMUL is also run during normal
+     // operation legalization where we can't create illegal types.
+     return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), WideVT,
+                           LD->getChain(), LD->getBasePtr(),
+                           LD->getPointerInfo(), MemVT, LD->getAlignment(),
+                           LD->getMemOperand()->getFlags());
+   }
+
+   // The load already has the right type.
+   return DAG.getLoad(MemVT, SDLoc(LD), LD->getChain(), LD->getBasePtr(),
+                      LD->getPointerInfo(), LD->getAlignment(),
+                      LD->getMemOperand()->getFlags());
+ }
+
+ /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
+ /// extending load, or BUILD_VECTOR with extended elements, return the
+ /// unextended value. The unextended vector should be 64 bits so that it can
+ /// be used as an operand to a VMULL instruction. If the original vector size
+ /// before extension is less than 64 bits we add an extension to resize
+ /// the vector to 64 bits.
+ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
+   const unsigned Opc = N->getOpcode();
+   if (Opc == ISD::SIGN_EXTEND || Opc == ISD::ZERO_EXTEND) {
+     SDValue Narrow = N->getOperand(0);
+     return AddRequiredExtensionForVMULL(Narrow, DAG, Narrow->getValueType(0),
+                                         N->getValueType(0), Opc);
+   }
+
+   if (auto *Load = dyn_cast<LoadSDNode>(N))
+     return SkipLoadExtensionForVMULL(Load, DAG);
+
+   // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
+   // have been legalized as a BITCAST from v4i32.
+   if (Opc == ISD::BITCAST) {
+     SDNode *Src = N->getOperand(0).getNode();
+     assert(Src->getOpcode() == ISD::BUILD_VECTOR &&
+            Src->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
+     // Keep only the low 32-bit half of each 64-bit element.
+     const unsigned LowIdx = DAG.getDataLayout().isBigEndian() ? 1 : 0;
+     SDValue LowHalves[] = {Src->getOperand(LowIdx),
+                            Src->getOperand(LowIdx + 2)};
+     return DAG.getBuildVector(MVT::v2i32, SDLoc(N), LowHalves);
+   }
+
+   // Construct a new BUILD_VECTOR with elements truncated to half the size.
+   assert(Opc == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+   EVT WideVT = N->getValueType(0);
+   const unsigned HalfEltBits = WideVT.getScalarSizeInBits() / 2;
+   const unsigned NumElts = WideVT.getVectorNumElements();
+   MVT HalfEltVT = MVT::getIntegerVT(HalfEltBits);
+   SDLoc Loc(N);
+   SmallVector<SDValue, 8> Elts;
+   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+     const APInt &Value =
+         cast<ConstantSDNode>(N->getOperand(Idx))->getAPIntValue();
+     // Element types smaller than 32 bits are not legal, so use i32 elements.
+     // The values are implicitly truncated so sext vs. zext doesn't matter.
+     Elts.push_back(DAG.getConstant(Value.zextOrTrunc(32), Loc, MVT::i32));
+   }
+   return DAG.getBuildVector(MVT::getVectorVT(HalfEltVT, NumElts), Loc, Elts);
+ }
+
+ /// Return true if N is an ADD or SUB whose two operands each have a single
+ /// use and are both sign-extended (per isSignExtended).
+ static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+   const unsigned Opc = N->getOpcode();
+   if (Opc != ISD::ADD && Opc != ISD::SUB)
+     return false;
+
+   SDNode *LHS = N->getOperand(0).getNode();
+   SDNode *RHS = N->getOperand(1).getNode();
+   if (!LHS->hasOneUse() || !RHS->hasOneUse())
+     return false;
+   return isSignExtended(LHS, DAG) && isSignExtended(RHS, DAG);
+ }
+
+ /// Return true if N is an ADD or SUB whose two operands each have a single
+ /// use and are both zero-extended (per isZeroExtended).
+ static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+   const unsigned Opc = N->getOpcode();
+   if (Opc != ISD::ADD && Opc != ISD::SUB)
+     return false;
+
+   SDNode *LHS = N->getOperand(0).getNode();
+   SDNode *RHS = N->getOperand(1).getNode();
+   if (!LHS->hasOneUse() || !RHS->hasOneUse())
+     return false;
+   return isZeroExtended(LHS, DAG) && isZeroExtended(RHS, DAG);
+ }
+
+ /// Custom lowering for 128-bit integer vector ISD::MUL. When both operands
+ /// are sign- or zero-extended the multiply becomes ARMISD::VMULLs/VMULLu on
+ /// the unextended operands. A multiply of an extended add/sub by an extended
+ /// value is distributed into two VMULLs combined with the original add/sub
+ /// opcode. If no VMULL form applies, v2i64 returns an empty SDValue (to be
+ /// expanded) and every other type returns Op unchanged.
+ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+   // Multiplications are only custom-lowered for 128-bit vectors so that
+   // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
+   EVT VT = Op.getValueType();
+   assert(VT.is128BitVector() && VT.isInteger() &&
+          "unexpected type for custom-lowering ISD::MUL");
+   SDNode *LHS = Op.getOperand(0).getNode();
+   SDNode *RHS = Op.getOperand(1).getNode();
+   unsigned MullOpc = 0;
+   bool SplitAddSub = false;
+   const bool LHSIsSExt = isSignExtended(LHS, DAG);
+   const bool RHSIsSExt = isSignExtended(RHS, DAG);
+   if (LHSIsSExt && RHSIsSExt) {
+     MullOpc = ARMISD::VMULLs;
+   } else {
+     const bool LHSIsZExt = isZeroExtended(LHS, DAG);
+     const bool RHSIsZExt = isZeroExtended(RHS, DAG);
+     if (LHSIsZExt && RHSIsZExt) {
+       MullOpc = ARMISD::VMULLu;
+     } else if (RHSIsSExt || RHSIsZExt) {
+       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+       if (RHSIsSExt && isAddSubSExt(LHS, DAG)) {
+         MullOpc = ARMISD::VMULLs;
+         SplitAddSub = true;
+       } else if (RHSIsZExt && isAddSubZExt(LHS, DAG)) {
+         MullOpc = ARMISD::VMULLu;
+         SplitAddSub = true;
+       } else if (LHSIsZExt && isAddSubZExt(RHS, DAG)) {
+         // Put the add/sub on the left so the code below can treat it
+         // uniformly.
+         std::swap(LHS, RHS);
+         MullOpc = ARMISD::VMULLu;
+         SplitAddSub = true;
+       }
+     }
+
+     // No VMULL form: v2i64 falls through to expansion (it is not legal);
+     // other vector multiplications are legal as they stand.
+     if (!MullOpc)
+       return VT == MVT::v2i64 ? SDValue() : Op;
+   }
+
+   // Legalize to a VMULL instruction.
+   SDLoc DL(Op);
+   SDValue NarrowRHS = SkipExtensionForVMULL(RHS, DAG);
+   if (!SplitAddSub) {
+     SDValue NarrowLHS = SkipExtensionForVMULL(LHS, DAG);
+     assert(NarrowLHS.getValueType().is64BitVector() &&
+            NarrowRHS.getValueType().is64BitVector() &&
+            "unexpected types for extended operands to VMULL");
+     return DAG.getNode(MullOpc, DL, VT, NarrowLHS, NarrowRHS);
+   }
+
+   // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
+   // isel lowering to take advantage of no-stall back to back vmul + vmla.
+   //   vmull q0, d4, d6
+   //   vmlal q0, d5, d6
+   // is faster than
+   //   vaddl q0, d4, d5
+   //   vmovl q1, d6
+   //   vmul  q0, q0, q1
+   SDValue TermA = SkipExtensionForVMULL(LHS->getOperand(0).getNode(), DAG);
+   SDValue TermB = SkipExtensionForVMULL(LHS->getOperand(1).getNode(), DAG);
+   EVT NarrowVT = NarrowRHS.getValueType();
+   return DAG.getNode(
+       LHS->getOpcode(), DL, VT,
+       DAG.getNode(MullOpc, DL, VT,
+                   DAG.getNode(ISD::BITCAST, DL, NarrowVT, TermA), NarrowRHS),
+       DAG.getNode(MullOpc, DL, VT,
+                   DAG.getNode(ISD::BITCAST, DL, NarrowVT, TermB), NarrowRHS));
+ }
+
+ /// Approximate a four-lane signed division X / Y in floating point: both
+ /// inputs are sign-extended to v4i32 and converted to v4f32, X is multiplied
+ /// by the NEON reciprocal estimate of Y, a fixed bias is added to the raw
+ /// bits, and the result is converted back and truncated to v4i16.
+ static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
+                               SelectionDAG &DAG) {
+   // TODO: Should this propagate fast-math-flags?
+
+   // Convert to float
+   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
+   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
+   SDValue XWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
+   SDValue YWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
+   SDValue XFloat = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, XWide);
+   SDValue YFloat = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, YWide);
+   // Get reciprocal estimate.
+   // float4 recip = vrecpeq_f32(yf);
+   SDValue Recip =
+       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
+                   YFloat);
+   // Because char has a smaller range than uchar, we can actually get away
+   // without any newton steps. This requires that we use a weird bias
+   // of 0xb000, however (again, this has been exhaustively tested).
+   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
+   SDValue Quotient = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, XFloat, Recip);
+   SDValue QuotientBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Quotient);
+   SDValue Bias = DAG.getConstant(0xb000, dl, MVT::v4i32);
+   SDValue BiasedBits =
+       DAG.getNode(ISD::ADD, dl, MVT::v4i32, QuotientBits, Bias);
+   SDValue Biased = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, BiasedBits);
+   // Convert back to short.
+   SDValue AsInt = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, Biased);
+   return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, AsInt);
+ }
+
+ /// Approximate a four-lane signed division N0 / N1 in floating point: both
+ /// inputs are sign-extended to v4i32 and converted to v4f32, the reciprocal
+ /// of N1 is estimated with vrecpe and refined once with vrecps, N0 is
+ /// multiplied by it, a fixed bias is added to the raw bits, and the result is
+ /// converted back and truncated to v4i16.
+ static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
+                                SelectionDAG &DAG) {
+   // TODO: Should this propagate fast-math-flags?
+
+   // Convert to float.
+   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
+   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
+   SDValue NumWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
+   SDValue DenWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
+   SDValue NumFloat = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, NumWide);
+   SDValue DenFloat = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, DenWide);
+
+   // Use reciprocal estimate and one refinement step.
+   // float4 recip = vrecpeq_f32(yf);
+   // recip *= vrecpsq_f32(yf, recip);
+   SDValue Estimate =
+       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
+                   DenFloat);
+   SDValue Step =
+       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
+                   DenFloat, Estimate);
+   SDValue Recip = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, Step, Estimate);
+   // Because short has a smaller range than ushort, we can actually get away
+   // with only a single newton step. This requires that we use a weird bias
+   // of 89, however (again, this has been exhaustively tested).
+   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
+   SDValue Quotient = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, NumFloat, Recip);
+   SDValue QuotientBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Quotient);
+   SDValue Bias = DAG.getConstant(0x89, dl, MVT::v4i32);
+   SDValue BiasedBits =
+       DAG.getNode(ISD::ADD, dl, MVT::v4i32, QuotientBits, Bias);
+   SDValue Biased = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, BiasedBits);
+   // Convert back to integer and return.
+   // return vmovn_s32(vcvt_s32_f32(result));
+   SDValue AsInt = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, Biased);
+   return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, AsInt);
+ }
+
+ /// Custom lowering for ISD::SDIV on v4i16 and v8i8. v4i16 is handed straight
+ /// to LowerSDIV_v4i16. v8i8 is sign-extended to v8i16, split into two v4i16
+ /// halves that are each divided with LowerSDIV_v4i8, re-concatenated, and
+ /// truncated back to v8i8.
+ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+   EVT VT = Op.getValueType();
+   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+          "unexpected type for custom-lowering ISD::SDIV");
+
+   SDLoc dl(Op);
+   SDValue Num = Op.getOperand(0);
+   SDValue Den = Op.getOperand(1);
+
+   if (VT != MVT::v8i8)
+     return LowerSDIV_v4i16(Num, Den, dl, DAG);
+
+   Num = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, Num);
+   Den = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, Den);
+
+   // Split both operands into high (lanes 4-7) and low (lanes 0-3) halves.
+   SDValue NumHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, Num,
+                               DAG.getIntPtrConstant(4, dl));
+   SDValue DenHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, Den,
+                               DAG.getIntPtrConstant(4, dl));
+   SDValue NumLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, Num,
+                               DAG.getIntPtrConstant(0, dl));
+   SDValue DenLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, Den,
+                               DAG.getIntPtrConstant(0, dl));
+
+   SDValue QuotLo = LowerSDIV_v4i8(NumLo, DenLo, dl, DAG); // v4i16
+   SDValue QuotHi = LowerSDIV_v4i8(NumHi, DenHi, dl, DAG); // v4i16
+
+   SDValue Quot =
+       DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, QuotLo, QuotHi);
+   Quot = LowerCONCAT_VECTORS(Quot, DAG);
+
+   return DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, Quot);
+ }
+
+ /// Custom lowering for ISD::UDIV on v4i16 and v8i8. v8i8 is zero-extended to
+ /// v8i16, split into two v4i16 halves that are each divided with
+ /// LowerSDIV_v4i16, re-concatenated, and narrowed to v8i8 with the
+ /// arm_neon_vqmovnsu intrinsic. v4i16 is divided in floating point using a
+ /// reciprocal estimate refined by two vrecps steps plus a 2-ulp correction.
+ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+   // TODO: Should this propagate fast-math-flags?
+   EVT VT = Op.getValueType();
+   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+          "unexpected type for custom-lowering ISD::UDIV");
+
+   SDLoc dl(Op);
+   SDValue Num = Op.getOperand(0);
+   SDValue Den = Op.getOperand(1);
+
+   if (VT == MVT::v8i8) {
+     Num = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, Num);
+     Den = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, Den);
+
+     // Split both operands into high (lanes 4-7) and low (lanes 0-3) halves.
+     SDValue NumHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, Num,
+                                 DAG.getIntPtrConstant(4, dl));
+     SDValue DenHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, Den,
+                                 DAG.getIntPtrConstant(4, dl));
+     SDValue NumLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, Num,
+                                 DAG.getIntPtrConstant(0, dl));
+     SDValue DenLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, Den,
+                                 DAG.getIntPtrConstant(0, dl));
+
+     SDValue QuotLo = LowerSDIV_v4i16(NumLo, DenLo, dl, DAG); // v4i16
+     SDValue QuotHi = LowerSDIV_v4i16(NumHi, DenHi, dl, DAG); // v4i16
+
+     SDValue Quot =
+         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, QuotLo, QuotHi);
+     Quot = LowerCONCAT_VECTORS(Quot, DAG);
+
+     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
+                        DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
+                                        MVT::i32),
+                        Quot);
+   }
+
+   // v4i16 udiv ... Convert to float.
+   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
+   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
+   SDValue NumWide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, Num);
+   SDValue DenWide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, Den);
+   SDValue NumFloat = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, NumWide);
+   SDValue DenFloat = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, DenWide);
+
+   // Use reciprocal estimate and two refinement steps.
+   // float4 recip = vrecpeq_f32(yf);
+   // recip *= vrecpsq_f32(yf, recip);
+   // recip *= vrecpsq_f32(yf, recip);
+   SDValue Recip =
+       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
+                   DenFloat);
+   SDValue Step =
+       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
+                   DenFloat, Recip);
+   Recip = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, Step, Recip);
+   Step =
+       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
+                   DenFloat, Recip);
+   Recip = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, Step, Recip);
+   // Simply multiplying by the reciprocal estimate can leave us a few ulps
+   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
+   // and that it will never cause us to return an answer too large).
+   // float4 result = as_float4(as_int4(xf*recip) + 2);
+   SDValue Quotient = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, NumFloat, Recip);
+   SDValue QuotientBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Quotient);
+   SDValue TwoUlps = DAG.getConstant(2, dl, MVT::v4i32);
+   SDValue AdjustedBits =
+       DAG.getNode(ISD::ADD, dl, MVT::v4i32, QuotientBits, TwoUlps);
+   SDValue Adjusted = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, AdjustedBits);
+   // Convert back to integer and return.
+   // return vmovn_u32(vcvt_s32_f32(result));
+   SDValue AsInt = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, Adjusted);
+   return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, AsInt);
+ }
+
+ /// Rewrite ISD::ADDC/ADDE/SUBC/SUBE as the matching ARMISD opcode. The new
+ /// node produces (original value type, i32) and takes the first two operands
+ /// of Op, plus the third operand for the ADDE/SUBE forms.
+ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+   EVT VT = Op.getNode()->getValueType(0);
+   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+   // Build the ARM node, forwarding the extra third operand when requested.
+   auto Emit = [&](unsigned ARMOpc, bool HasThirdOperand) {
+     if (HasThirdOperand)
+       return DAG.getNode(ARMOpc, SDLoc(Op), VTs, Op.getOperand(0),
+                          Op.getOperand(1), Op.getOperand(2));
+     return DAG.getNode(ARMOpc, SDLoc(Op), VTs, Op.getOperand(0),
+                        Op.getOperand(1));
+   };
+
+   switch (Op.getOpcode()) {
+   default:
+     llvm_unreachable("Invalid code");
+   case ISD::ADDC:
+     return Emit(ARMISD::ADDC, false);
+   case ISD::ADDE:
+     return Emit(ARMISD::ADDE, true);
+   case ISD::SUBC:
+     return Emit(ARMISD::SUBC, false);
+   case ISD::SUBE:
+     return Emit(ARMISD::SUBE, true);
+   }
+ }
+
+ /// Lower FSINCOS to a call to __sincos_stret / __sincosf_stret (Darwin only).
+ /// Under the APCS ABI the {sin, cos} pair is returned through an sret stack
+ /// slot and loaded back into a MERGE_VALUES node; otherwise the call's own
+ /// result is returned directly.
+ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+   assert(Subtarget->isTargetDarwin());
+
+   // For iOS, we want to call an alternative entry point: __sincos_stret,
+   // return values are passed via sret.
+   SDLoc dl(Op);
+   SDValue Arg = Op.getOperand(0);
+   EVT ArgVT = Arg.getValueType();
+   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+   auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+   // Pair of floats / doubles used to pass the result.
+   Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+   auto &DL = DAG.getDataLayout();
+
+   ArgListTy Args;
+   // Append one call argument with both extension flags cleared; the sret
+   // flag is only touched for the hidden result pointer.
+   auto AddArg = [&Args](SDValue Node, Type *Ty, bool MarkSRet) {
+     ArgListEntry E;
+     E.Node = Node;
+     E.Ty = Ty;
+     E.isSExt = false;
+     E.isZExt = false;
+     if (MarkSRet)
+       E.isSRet = true;
+     Args.push_back(E);
+   };
+
+   const bool ShouldUseSRet = Subtarget->isAPCS_ABI();
+   SDValue SRet;
+   if (ShouldUseSRet) {
+     // Create stack object for sret.
+     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
+     const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
+     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
+     SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
+
+     AddArg(SRet, RetTy->getPointerTo(), /*MarkSRet=*/true);
+     // The callee now returns nothing; the result comes back via memory.
+     RetTy = Type::getVoidTy(*DAG.getContext());
+   }
+
+   AddArg(Arg, ArgTy, /*MarkSRet=*/false);
+
+   const bool IsF64 = (ArgVT == MVT::f64);
+   const char *LibcallName = IsF64 ? "__sincos_stret" : "__sincosf_stret";
+   RTLIB::Libcall LC = IsF64 ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32;
+   CallingConv::ID CC = getLibcallCallingConv(LC);
+   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
+
+   TargetLowering::CallLoweringInfo CLI(DAG);
+   CLI.setDebugLoc(dl)
+       .setChain(DAG.getEntryNode())
+       .setCallee(CC, RetTy, Callee, std::move(Args))
+       .setDiscardResult(ShouldUseSRet);
+   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+   if (!ShouldUseSRet)
+     return CallResult.first;
+
+   // Read the sin field from the start of the sret slot, chained on the call.
+   SDValue LoadSin =
+       DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
+
+   // Address of cos field.
+   SDValue CosAddr = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
+                                 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
+   SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), CosAddr,
+                                 MachinePointerInfo());
+
+   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
+                      LoadSin.getValue(0), LoadCos.getValue(0));
+ }
+
+ /// Emit a call to the Windows runtime division helper (__rt_sdiv, __rt_udiv
+ /// or their 64-bit variants) for an i32/i64 divide. The call is chained on
+ /// Chain, passes operand 1 of Op before operand 0, and its result value is
+ /// returned.
+ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
+                                                   bool Signed,
+                                                   SDValue &Chain) const {
+   EVT VT = Op.getValueType();
+   assert((VT == MVT::i32 || VT == MVT::i64) &&
+          "unexpected type for custom lowering DIV");
+   SDLoc dl(Op);
+
+   const auto &DL = DAG.getDataLayout();
+   const auto &TLI = DAG.getTargetLoweringInfo();
+
+   const bool Is32Bit = (VT == MVT::i32);
+   const char *Name = Signed ? (Is32Bit ? "__rt_sdiv" : "__rt_sdiv64")
+                             : (Is32Bit ? "__rt_udiv" : "__rt_udiv64");
+
+   SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
+
+   // Arguments are pushed as operand 1 first, then operand 0.
+   ARMTargetLowering::ArgListTy Args;
+   const unsigned OperandOrder[] = {1, 0};
+   for (unsigned OpIdx : OperandOrder) {
+     ArgListEntry Arg;
+     Arg.Node = Op.getOperand(OpIdx);
+     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
+     Args.push_back(Arg);
+   }
+
+   CallLoweringInfo CLI(DAG);
+   CLI.setDebugLoc(dl)
+       .setChain(Chain)
+       .setCallee(CallingConv::ARM_AAPCS_VFP,
+                  VT.getTypeForEVT(*DAG.getContext()), ES, std::move(Args));
+
+   return LowerCallTo(CLI).first;
+ }
+
+ /// Lower an i32 divide on Windows: emit an ARMISD::WIN__DBZCHK on the
+ /// divisor (operand 1), chained from the entry node, and then make the
+ /// runtime division call chained on that check.
+ SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
+                                             bool Signed) const {
+   assert(Op.getValueType() == MVT::i32 &&
+          "unexpected type for custom lowering DIV");
+   SDLoc dl(Op);
+
+   SDValue Divisor = Op.getOperand(1);
+   SDValue ZeroCheck = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
+                                   DAG.getEntryNode(), Divisor);
+
+   return LowerWindowsDIVLibCall(Op, DAG, Signed, ZeroCheck);
+ }
+
+ /// Build an ARMISD::WIN__DBZCHK node, chained on InChain, for the divisor
+ /// (operand 1) of N. For an i32 result the divisor is checked directly;
+ /// otherwise its two i32 halves are OR-ed together and the OR is checked.
+ static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
+   SDLoc DL(N);
+   SDValue Divisor = N->getOperand(1);
+   if (N->getValueType(0) == MVT::i32)
+     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Divisor);
+
+   SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Divisor,
+                                DAG.getConstant(0, DL, MVT::i32));
+   SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Divisor,
+                                DAG.getConstant(1, DL, MVT::i32));
+   // The OR of the two halves is zero only when both halves are zero.
+   SDValue AnyBits = DAG.getNode(ISD::OR, DL, MVT::i32, LoHalf, HiHalf);
+   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, AnyBits);
+ }
+
+ /// Expand an i64 divide on Windows: emit the divide-by-zero check and the
+ /// runtime division call, then append the low and high i32 halves of the
+ /// i64 result to Results, in that order.
+ void ARMTargetLowering::ExpandDIV_Windows(
+     SDValue Op, SelectionDAG &DAG, bool Signed,
+     SmallVectorImpl<SDValue> &Results) const {
+   const auto &DL = DAG.getDataLayout();
+   const auto &TLI = DAG.getTargetLoweringInfo();
+
+   assert(Op.getValueType() == MVT::i64 &&
+          "unexpected type for custom lowering DIV");
+   SDLoc dl(Op);
+
+   SDValue ZeroCheck =
+       WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
+
+   SDValue Quotient = LowerWindowsDIVLibCall(Op, DAG, Signed, ZeroCheck);
+
+   // Low half: plain truncation. High half: shift right by 32, then truncate.
+   SDValue LoHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Quotient);
+   SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i64, Quotient,
+                                 DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
+   SDValue HiHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Shifted);
+
+   Results.push_back(LoHalf);
+   Results.push_back(HiHalf);
+ }
+
+ /// Custom lowering hook for atomic load/store nodes: returns Op unchanged
+ /// when its ordering is not stronger than monotonic, and an empty SDValue
+ /// otherwise.
+ static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
+   const auto Ordering = cast<AtomicSDNode>(Op)->getOrdering();
+
+   // Monotonic load/store is legal for all targets. Acquire/Release
+   // load/store is not legal for targets without a dmb or equivalent
+   // available.
+   return isStrongerThanMonotonic(Ordering) ? SDValue() : Op;
+ }
+
+ /// Replace the results of node N with an arm_mrc intrinsic read: the 32-bit
+ /// value is paired with a zero high half to form an i64, which is appended
+ /// to Results followed by the intrinsic's output chain.
+ static void ReplaceREADCYCLECOUNTER(SDNode *N,
+                                     SmallVectorImpl<SDValue> &Results,
+                                     SelectionDAG &DAG,
+                                     const ARMSubtarget *Subtarget) {
+   SDLoc DL(N);
+   // Under Power Management extensions, the cycle-count is:
+   //    mrc p15, #0, <Rt>, c9, c13, #0
+   SDValue MrcOperands[] = {
+       N->getOperand(0), // Chain
+       DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+       DAG.getConstant(15, DL, MVT::i32),
+       DAG.getConstant(0, DL, MVT::i32),
+       DAG.getConstant(9, DL, MVT::i32),
+       DAG.getConstant(13, DL, MVT::i32),
+       DAG.getConstant(0, DL, MVT::i32)};
+
+   SDValue LowWord = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                                 DAG.getVTList(MVT::i32, MVT::Other),
+                                 MrcOperands);
+   // Pair the 32-bit read with a zero upper word to form the i64 result.
+   SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LowWord,
+                                  DAG.getConstant(0, DL, MVT::i32));
+   Results.push_back(Cycles64);
+   Results.push_back(LowWord.getValue(1));
+ }
+
+ /// Build a REG_SEQUENCE machine node in the GPRPair register class from V:
+ /// V any-extended/truncated to i32 goes in gsub_0 and (V >> 32), likewise
+ /// converted to i32, goes in gsub_1.
+ /// NOTE(review): no endianness adjustment is made here - confirm whether
+ /// big-endian targets need the halves swapped.
+ static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
+   SDLoc dl(V.getNode());
+   SDValue LoWord = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
+   SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i64, V,
+                                 DAG.getConstant(32, dl, MVT::i32));
+   SDValue HiWord = DAG.getAnyExtOrTrunc(Shifted, dl, MVT::i32);
+
+   // REG_SEQUENCE operands: register class id, then (value, subreg) pairs.
+   const SDValue SeqOps[] = {
+       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32),
+       LoWord,
+       DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32),
+       HiWord,
+       DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32)};
+   SDNode *Pair =
+       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, SeqOps);
+   return SDValue(Pair, 0);
+ }
+
+ /// Replace a 64-bit atomic compare-and-swap node N with an ARM::CMP_SWAP_64
+ /// machine node. Operands 2 and 3 of N are packed into GPR pairs, N's memory
+ /// operand is attached to the new node, and Results receives the gsub_0 and
+ /// gsub_1 halves of the node's first result followed by its chain (value 2).
+ static void ReplaceCMP_SWAP_64Results(SDNode *N,
+                                       SmallVectorImpl<SDValue> & Results,
+                                       SelectionDAG &DAG) {
+   assert(N->getValueType(0) == MVT::i64 &&
+          "AtomicCmpSwap on types less than 64 should be legal");
+
+   // Operand order: operand 1, the two packed 64-bit values, then the chain.
+   SDValue CmpSwapOps[] = {N->getOperand(1),
+                           createGPRPairNode(DAG, N->getOperand(2)),
+                           createGPRPairNode(DAG, N->getOperand(3)),
+                           N->getOperand(0)};
+   SDVTList ResultTys = DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other);
+   SDNode *CmpSwap =
+       DAG.getMachineNode(ARM::CMP_SWAP_64, SDLoc(N), ResultTys, CmpSwapOps);
+
+   // Carry the original node's memory operand over to the machine node.
+   MachineFunction &MF = DAG.getMachineFunction();
+   MachineSDNode::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
+   MemRefs[0] = cast<MemSDNode>(N)->getMemOperand();
+   cast<MachineSDNode>(CmpSwap)->setMemRefs(MemRefs, MemRefs + 1);
+
+   SDValue Pair(CmpSwap, 0);
+   Results.push_back(
+       DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32, Pair));
+   Results.push_back(
+       DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32, Pair));
+   Results.push_back(SDValue(CmpSwap, 2));
+ }
+
+ /// Lower FPOWI for MSVCRT targets into a call to powf/pow: the integer
+ /// exponent (operand 1) is converted to the base's floating-point type with
+ /// SINT_TO_FP and both values are passed to the library routine. The call is
+ /// emitted as a tail call when the node is in tail-call position and the
+ /// enclosing function returns the same type.
+ static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
+                           SelectionDAG &DAG) {
+   const auto &TLI = DAG.getTargetLoweringInfo();
+
+   assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
+          "Custom lowering is MSVCRT specific!");
+
+   SDLoc dl(Op);
+   SDValue Base = Op.getOperand(0);
+   MVT BaseTy = Base->getSimpleValueType(0);
+   SDValue Exponent =
+       DAG.getNode(ISD::SINT_TO_FP, dl, BaseTy, Op.getOperand(1));
+   const char *PowName = (BaseTy == MVT::f32) ? "powf" : "pow";
+   SDValue Callee =
+       DAG.getExternalSymbol(PowName, TLI.getPointerTy(DAG.getDataLayout()));
+
+   TargetLowering::ArgListTy Args;
+   TargetLowering::ArgListEntry Entry;
+
+   // First argument: the base.
+   Entry.Node = Base;
+   Entry.Ty = Base.getValueType().getTypeForEVT(*DAG.getContext());
+   Entry.isZExt = true;
+   Args.push_back(Entry);
+
+   // Second argument: the exponent, already converted to floating point.
+   Entry.Node = Exponent;
+   Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
+   Entry.isZExt = true;
+   Args.push_back(Entry);
+
+   Type *LCRTy = Base.getValueType().getTypeForEVT(*DAG.getContext());
+
+   // The in-chain to the call is the entry node. If we are emitting a
+   // tailcall, the chain will be mutated if the node has a non-entry input
+   // chain.
+   SDValue InChain = DAG.getEntryNode();
+   SDValue TCChain = InChain;
+
+   const auto *F = DAG.getMachineFunction().getFunction();
+   const bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
+                     F->getReturnType() == LCRTy;
+   if (IsTC)
+     InChain = TCChain;
+
+   TargetLowering::CallLoweringInfo CLI(DAG);
+   CLI.setDebugLoc(dl)
+       .setChain(InChain)
+       .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
+       .setTailCall(IsTC);
+   std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);
+
+   // Return the chain (the DAG root) if it is a tail call
+   if (CI.second.getNode())
+     return CI.first;
+   return DAG.getRoot();
+ }
+
+ /// Dispatch a node marked for custom lowering to the matching Lower* /
+ /// Expand* routine, keyed on the node's opcode. Opcodes without an entry here
+ /// hit llvm_unreachable.
+ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *Node = Op.getNode();
+
+   switch (Op.getOpcode()) {
+   case ISD::WRITE_REGISTER:
+     return LowerWRITE_REGISTER(Op, DAG);
+   case ISD::ConstantPool: {
+     if (Subtarget->genExecuteOnly()) {
+       llvm_unreachable("execute-only should not generate constant pools");
+     }
+     return LowerConstantPool(Op, DAG);
+   }
+   case ISD::BlockAddress:
+     return LowerBlockAddress(Op, DAG);
+   case ISD::GlobalAddress:
+     // Global addresses are lowered per object file format.
+     switch (Subtarget->getTargetTriple().getObjectFormat()) {
+     case Triple::COFF:
+       return LowerGlobalAddressWindows(Op, DAG);
+     case Triple::ELF:
+       return LowerGlobalAddressELF(Op, DAG);
+     case Triple::MachO:
+       return LowerGlobalAddressDarwin(Op, DAG);
+     default:
+       llvm_unreachable("unknown object format");
+     }
+   case ISD::GlobalTLSAddress:
+     return LowerGlobalTLSAddress(Op, DAG);
+   case ISD::SELECT:
+     return LowerSELECT(Op, DAG);
+   case ISD::SELECT_CC:
+     return LowerSELECT_CC(Op, DAG);
+   case ISD::BR_CC:
+     return LowerBR_CC(Op, DAG);
+   case ISD::BR_JT:
+     return LowerBR_JT(Op, DAG);
+   case ISD::VASTART:
+     return LowerVASTART(Op, DAG);
+   case ISD::ATOMIC_FENCE:
+     return LowerATOMIC_FENCE(Op, DAG, Subtarget);
+   case ISD::PREFETCH:
+     return LowerPREFETCH(Op, DAG, Subtarget);
+   case ISD::SINT_TO_FP:
+   case ISD::UINT_TO_FP:
+     return LowerINT_TO_FP(Op, DAG);
+   case ISD::FP_TO_SINT:
+   case ISD::FP_TO_UINT:
+     return LowerFP_TO_INT(Op, DAG);
+   case ISD::FCOPYSIGN:
+     return LowerFCOPYSIGN(Op, DAG);
+   case ISD::RETURNADDR:
+     return LowerRETURNADDR(Op, DAG);
+   case ISD::FRAMEADDR:
+     return LowerFRAMEADDR(Op, DAG);
+   case ISD::EH_SJLJ_SETJMP:
+     return LowerEH_SJLJ_SETJMP(Op, DAG);
+   case ISD::EH_SJLJ_LONGJMP:
+     return LowerEH_SJLJ_LONGJMP(Op, DAG);
+   case ISD::EH_SJLJ_SETUP_DISPATCH:
+     return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
+   case ISD::INTRINSIC_WO_CHAIN:
+     return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget);
+   case ISD::BITCAST:
+     return ExpandBITCAST(Node, DAG);
+   case ISD::SHL:
+   case ISD::SRL:
+   case ISD::SRA:
+     return LowerShift(Node, DAG, Subtarget);
+   case ISD::SREM:
+   case ISD::UREM:
+     return LowerREM(Node, DAG);
+   case ISD::SHL_PARTS:
+     return LowerShiftLeftParts(Op, DAG);
+   case ISD::SRL_PARTS:
+   case ISD::SRA_PARTS:
+     return LowerShiftRightParts(Op, DAG);
+   case ISD::CTTZ:
+   case ISD::CTTZ_ZERO_UNDEF:
+     return LowerCTTZ(Node, DAG, Subtarget);
+   case ISD::CTPOP:
+     return LowerCTPOP(Node, DAG, Subtarget);
+   case ISD::SETCC:
+     return LowerVSETCC(Op, DAG);
+   case ISD::SETCCE:
+     return LowerSETCCE(Op, DAG);
+   case ISD::ConstantFP:
+     return LowerConstantFP(Op, DAG, Subtarget);
+   case ISD::BUILD_VECTOR:
+     return LowerBUILD_VECTOR(Op, DAG, Subtarget);
+   case ISD::VECTOR_SHUFFLE:
+     return LowerVECTOR_SHUFFLE(Op, DAG);
+   case ISD::INSERT_VECTOR_ELT:
+     return LowerINSERT_VECTOR_ELT(Op, DAG);
+   case ISD::EXTRACT_VECTOR_ELT:
+     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+   case ISD::CONCAT_VECTORS:
+     return LowerCONCAT_VECTORS(Op, DAG);
+   case ISD::FLT_ROUNDS_:
+     return LowerFLT_ROUNDS_(Op, DAG);
+   case ISD::MUL:
+     return LowerMUL(Op, DAG);
+   case ISD::SDIV:
+     // Windows targets use a dedicated division lowering.
+     if (!Subtarget->isTargetWindows())
+       return LowerSDIV(Op, DAG);
+     return LowerDIV_Windows(Op, DAG, /* Signed */ true);
+   case ISD::UDIV:
+     if (!Subtarget->isTargetWindows())
+       return LowerUDIV(Op, DAG);
+     return LowerDIV_Windows(Op, DAG, /* Signed */ false);
+   case ISD::ADDC:
+   case ISD::ADDE:
+   case ISD::SUBC:
+   case ISD::SUBE:
+     return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+   case ISD::SADDO:
+   case ISD::UADDO:
+   case ISD::SSUBO:
+   case ISD::USUBO:
+     return LowerXALUO(Op, DAG);
+   case ISD::ATOMIC_LOAD:
+   case ISD::ATOMIC_STORE:
+     return LowerAtomicLoadStore(Op, DAG);
+   case ISD::FSINCOS:
+     return LowerFSINCOS(Op, DAG);
+   case ISD::SDIVREM:
+   case ISD::UDIVREM:
+     return LowerDivRem(Op, DAG);
+   case ISD::DYNAMIC_STACKALLOC:
+     // Only custom-lowered for the Windows Itanium environment.
+     if (!Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) {
+       llvm_unreachable("Don't know how to custom lower this!");
+     }
+     return LowerDYNAMIC_STACKALLOC(Op, DAG);
+   case ISD::FP_ROUND:
+     return LowerFP_ROUND(Op, DAG);
+   case ISD::FP_EXTEND:
+     return LowerFP_EXTEND(Op, DAG);
+   case ISD::FPOWI:
+     return LowerFPOWI(Op, *Subtarget, DAG);
+   case ARMISD::WIN__DBZCHK:
+     return SDValue();
+   default:
+     llvm_unreachable("Don't know how to custom lower this!");
+   }
+ }
+
+ /// ReplaceNodeResults - Build replacement values for a node whose result type
+ /// is illegal, using the target's custom expansion code, and append them to
+ /// \p Results. An expansion that produces a null SDValue appends nothing.
+ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
+                                            SmallVectorImpl<SDValue> &Results,
+                                            SelectionDAG &DAG) const {
+   // Single-value expansions share this tail: record the value unless the
+   // expansion returned a null SDValue.
+   auto RecordIfSet = [&Results](SDValue V) {
+     if (V.getNode())
+       Results.push_back(V);
+   };
+
+   switch (N->getOpcode()) {
+   case ISD::READ_REGISTER:
+     ExpandREAD_REGISTER(N, Results, DAG);
+     return;
+   case ISD::BITCAST:
+     RecordIfSet(ExpandBITCAST(N, DAG));
+     return;
+   case ISD::SRL:
+   case ISD::SRA:
+     RecordIfSet(Expand64BitShift(N, DAG, Subtarget));
+     return;
+   case ISD::SREM:
+   case ISD::UREM:
+     RecordIfSet(LowerREM(N, DAG));
+     return;
+   case ISD::SDIVREM:
+   case ISD::UDIVREM: {
+     // The lowered node carries both results; forward them in order.
+     SDValue Res = LowerDivRem(SDValue(N, 0), DAG);
+     assert(Res.getNumOperands() == 2 && "DivRem needs two values");
+     Results.push_back(Res.getValue(0));
+     Results.push_back(Res.getValue(1));
+     return;
+   }
+   case ISD::READCYCLECOUNTER:
+     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
+     return;
+   case ISD::UDIV:
+   case ISD::SDIV: {
+     assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
+     const bool IsSigned = N->getOpcode() == ISD::SDIV;
+     ExpandDIV_Windows(SDValue(N, 0), DAG, IsSigned, Results);
+     return;
+   }
+   case ISD::ATOMIC_CMP_SWAP:
+     ReplaceCMP_SWAP_64Results(N, Results, DAG);
+     return;
+   default:
+     llvm_unreachable("Don't know how to custom expand this!");
+   }
+ }
+
+//===----------------------------------------------------------------------===//
+// ARM Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+ /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
+ /// registers the function context.
+ ///
+ /// Concretely, the emitted code loads the address of \p DispatchBB from a
+ /// constant-pool entry, makes it absolute with a PIC add, and stores it into
+ /// the frame object \p FI at immediate offset 36. In both Thumb modes the low
+ /// bit of the address is set before the store. All instructions are inserted
+ /// into \p MBB immediately before \p MI and use MI's debug location.
+ ///
+ /// NOTE(review): the assembly sketches in the three branches below show the
+ /// store going to "[$jbuf, #+4]", while every emitted store uses the
+ /// immediate 36 relative to FI. Presumably 36 is the offset of jbuf[1]
+ /// within the function context; confirm against the context layout.
+ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB,
+ int FI) const {
+ assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
+ "ROPI/RWPI not currently supported with SjLj");
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ MachineConstantPool *MCP = MF->getConstantPool();
+ ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+ const Function *F = MF->getFunction();
+
+ bool isThumb = Subtarget->isThumb();
+ bool isThumb2 = Subtarget->isThumb2();
+
+ // Create a constant-pool entry referring to DispatchBB. The PC adjustment
+ // passed along is 4 for either Thumb mode and 8 for ARM mode.
+ unsigned PCLabelId = AFI->createPICLabelUId();
+ unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
+ unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
+
+ // Thumb code is restricted to the tGPR register class for the temporaries.
+ const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
+ : &ARM::GPRRegClass;
+
+ // Grab constant pool and fixed stack memory operands.
+ MachineMemOperand *CPMMO =
+ MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+ MachineMemOperand::MOLoad, 4, 4);
+
+ MachineMemOperand *FIMMOSt =
+ MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
+ MachineMemOperand::MOStore, 4, 4);
+
+ // Load the address of the dispatch MBB into the jump buffer.
+ if (isThumb2) {
+ // Incoming value: jbuf
+ // ldr.n r5, LCPI1_1
+ // orr r5, r5, #1
+ // add r5, pc
+ // str r5, [$jbuf, #+4] ; &jbuf[1]
+ unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addMemOperand(CPMMO));
+ // Set the low bit because of thumb mode.
+ unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ AddDefaultCC(
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(0x01)));
+ unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
+ .addReg(NewVReg2, RegState::Kill)
+ .addImm(PCLabelId);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
+ .addReg(NewVReg3, RegState::Kill)
+ .addFrameIndex(FI)
+ .addImm(36) // &jbuf[1] :: pc
+ .addMemOperand(FIMMOSt));
+ } else if (isThumb) {
+ // Incoming value: jbuf
+ // ldr.n r1, LCPI1_4
+ // add r1, pc
+ // mov r2, #1
+ // orrs r1, r2
+ // add r2, $jbuf, #+4 ; &jbuf[1]
+ // str r1, [r2]
+ unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addMemOperand(CPMMO));
+ unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(PCLabelId);
+ // Set the low bit because of thumb mode.
+ unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addImm(1));
+ unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg2, RegState::Kill)
+ .addReg(NewVReg3, RegState::Kill));
+ // Materialize the destination address (FI + 36) in a register, then store
+ // through it with a zero offset.
+ unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
+ .addFrameIndex(FI)
+ .addImm(36); // &jbuf[1] :: pc
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
+ .addReg(NewVReg4, RegState::Kill)
+ .addReg(NewVReg5, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(FIMMOSt));
+ } else {
+ // Incoming value: jbuf
+ // ldr r1, LCPI1_1
+ // add r1, pc, r1
+ // str r1, [$jbuf, #+4] ; &jbuf[1]
+ unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addImm(0)
+ .addMemOperand(CPMMO));
+ unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(PCLabelId));
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
+ .addReg(NewVReg2, RegState::Kill)
+ .addFrameIndex(FI)
+ .addImm(36) // &jbuf[1] :: pc
+ .addMemOperand(FIMMOSt));
+ }
+ }
+
+ /// Build the SjLj dispatch code for the function that contains \p MI.
+ ///
+ /// The routine
+ ///  - collects every EH-pad block by call-site number and builds an inline
+ ///    jump table from them,
+ ///  - appends three new blocks to the function: DispatchBB (marked as an EH
+ ///    pad), DispContBB and TrapBB,
+ ///  - calls SetupEntryBlockForSjLj to emit the entry-block setup before MI,
+ ///  - emits, in DispatchBB, a load from the function-context frame slot, a
+ ///    compare against the number of landing pads and a conditional branch to
+ ///    TrapBB; and, in DispContBB, the jump-table branch,
+ ///  - rewires every invoke block so its EH-pad successors are replaced by
+ ///    DispatchBB, and adds implicit dead defs of callee-saved registers to
+ ///    the last call in each invoke block,
+ ///  - clears the EH-pad flag on the former landing pads and erases \p MI.
+ ///
+ /// \param MI  the pseudo instruction being expanded; erased on return.
+ /// \param MBB the block containing \p MI.
+ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
+                                               MachineBasicBlock *MBB) const {
+   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+   DebugLoc dl = MI.getDebugLoc();
+   MachineFunction *MF = MBB->getParent();
+   MachineRegisterInfo *MRI = &MF->getRegInfo();
+   MachineFrameInfo &MFI = MF->getFrameInfo();
+   int FI = MFI.getFunctionContextIndex();
+
+   const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
+                                                         : &ARM::GPRnopcRegClass;
+
+   // Get a mapping of the call site numbers to all of the landing pads they're
+   // associated with.
+   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
+   unsigned MaxCSNum = 0;
+   for (MachineBasicBlock &BB : *MF) {
+     if (!BB.isEHPad()) continue;
+
+     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
+     // pad.
+     for (MachineInstr &II : BB) {
+       if (!II.isEHLabel()) continue;
+
+       MCSymbol *Sym = II.getOperand(0).getMCSymbol();
+       if (!MF->hasCallSiteLandingPad(Sym)) continue;
+
+       SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
+       for (unsigned CSIdx : CallSiteIdxs) {
+         CallSiteNumToLPad[CSIdx].push_back(&BB);
+         MaxCSNum = std::max(MaxCSNum, CSIdx);
+       }
+       // Only the first EH label with call-site information is considered.
+       break;
+     }
+   }
+
+   // Get an ordered list of the machine basic blocks for the jump table.
+   std::vector<MachineBasicBlock*> LPadList;
+   SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
+   LPadList.reserve(CallSiteNumToLPad.size());
+   for (unsigned I = 1; I <= MaxCSNum; ++I) {
+     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
+     for (MachineBasicBlock *LPad : MBBList) {
+       LPadList.push_back(LPad);
+       // Every predecessor of a landing pad is treated as an invoke block.
+       InvokeBBs.insert(LPad->pred_begin(), LPad->pred_end());
+     }
+   }
+
+   assert(!LPadList.empty() &&
+          "No landing pad destinations for the dispatch jump table!");
+
+   // Create the jump table and associated information.
+   MachineJumpTableInfo *JTI =
+       MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
+   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+
+   // Create the MBBs for the dispatch code.
+
+   // Shove the dispatch's address into the return slot in the function context.
+   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+   DispatchBB->setIsEHPad();
+
+   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+   unsigned trap_opcode;
+   if (Subtarget->isThumb())
+     trap_opcode = ARM::tTRAP;
+   else
+     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
+
+   BuildMI(TrapBB, dl, TII->get(trap_opcode));
+   DispatchBB->addSuccessor(TrapBB);
+
+   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+   DispatchBB->addSuccessor(DispContBB);
+
+   // Insert the MBBs.
+   MF->insert(MF->end(), DispatchBB);
+   MF->insert(MF->end(), DispContBB);
+   MF->insert(MF->end(), TrapBB);
+
+   // Insert code into the entry block that creates and registers the function
+   // context.
+   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
+
+   MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
+       MachinePointerInfo::getFixedStack(*MF, FI),
+       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
+
+   MachineInstrBuilder MIB;
+   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+
+   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+   // Add a register mask with no preserved registers. This results in all
+   // registers being marked as clobbered. This can't work if the dispatch block
+   // is in a Thumb1 function and is linked with ARM code which uses the FP
+   // registers, as there is no way to preserve the FP registers in Thumb1 mode.
+   MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
+
+   // NOTE(review): each mode below compares the loaded index against NumLPads
+   // and branches to TrapBB on HI (unsigned greater-than), so an index equal
+   // to NumLPads is not trapped. Confirm whether that value can occur and
+   // whether HS was intended; the condition is left unchanged here.
+   bool IsPositionIndependent = isPositionIndependent();
+   unsigned NumLPads = LPadList.size();
+   if (Subtarget->isThumb2()) {
+     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
+                        .addFrameIndex(FI)
+                        .addImm(4)
+                        .addMemOperand(FIMMOLd));
+
+     if (NumLPads < 256) {
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
+                          .addReg(NewVReg1)
+                          .addImm(NumLPads));
+     } else {
+       // Materialize the count with MOVW, plus MOVT when the upper half is
+       // non-zero, and compare register against register.
+       unsigned VReg1 = MRI->createVirtualRegister(TRC);
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
+                          .addImm(NumLPads & 0xFFFF));
+
+       unsigned VReg2 = VReg1;
+       if ((NumLPads & 0xFFFF0000) != 0) {
+         VReg2 = MRI->createVirtualRegister(TRC);
+         AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
+                            .addReg(VReg1)
+                            .addImm(NumLPads >> 16));
+       }
+
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
+                          .addReg(NewVReg1)
+                          .addReg(VReg2));
+     }
+
+     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
+         .addMBB(TrapBB)
+         .addImm(ARMCC::HI)
+         .addReg(ARM::CPSR);
+
+     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
+                        .addJumpTableIndex(MJTI));
+
+     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+     AddDefaultCC(
+         AddDefaultPred(
+             BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
+                 .addReg(NewVReg3, RegState::Kill)
+                 .addReg(NewVReg1)
+                 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+
+     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
+         .addReg(NewVReg4, RegState::Kill)
+         .addReg(NewVReg1)
+         .addJumpTableIndex(MJTI);
+   } else if (Subtarget->isThumb()) {
+     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
+                        .addFrameIndex(FI)
+                        .addImm(1)
+                        .addMemOperand(FIMMOLd));
+
+     if (NumLPads < 256) {
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
+                          .addReg(NewVReg1)
+                          .addImm(NumLPads));
+     } else {
+       // The count does not fit the 8-bit immediate: load it from the
+       // constant pool instead.
+       MachineConstantPool *ConstantPool = MF->getConstantPool();
+       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
+
+       // MachineConstantPool wants an explicit alignment.
+       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
+       if (Align == 0)
+         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
+       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+       unsigned VReg1 = MRI->createVirtualRegister(TRC);
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
+                          .addReg(VReg1, RegState::Define)
+                          .addConstantPoolIndex(Idx));
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
+                          .addReg(NewVReg1)
+                          .addReg(VReg1));
+     }
+
+     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
+         .addMBB(TrapBB)
+         .addImm(ARMCC::HI)
+         .addReg(ARM::CPSR);
+
+     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
+                        .addReg(ARM::CPSR, RegState::Define)
+                        .addReg(NewVReg1)
+                        .addImm(2));
+
+     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
+                        .addJumpTableIndex(MJTI));
+
+     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
+                        .addReg(ARM::CPSR, RegState::Define)
+                        .addReg(NewVReg2, RegState::Kill)
+                        .addReg(NewVReg3));
+
+     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
+         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
+
+     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
+                        .addReg(NewVReg4, RegState::Kill)
+                        .addImm(0)
+                        .addMemOperand(JTMMOLd));
+
+     // In position-independent code the loaded table entry is added to the
+     // jump-table address before branching.
+     unsigned NewVReg6 = NewVReg5;
+     if (IsPositionIndependent) {
+       NewVReg6 = MRI->createVirtualRegister(TRC);
+       AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+                          .addReg(ARM::CPSR, RegState::Define)
+                          .addReg(NewVReg5, RegState::Kill)
+                          .addReg(NewVReg3));
+     }
+
+     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
+         .addReg(NewVReg6, RegState::Kill)
+         .addJumpTableIndex(MJTI);
+   } else {
+     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
+                        .addFrameIndex(FI)
+                        .addImm(4)
+                        .addMemOperand(FIMMOLd));
+
+     if (NumLPads < 256) {
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
+                          .addReg(NewVReg1)
+                          .addImm(NumLPads));
+     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
+       // NumLPads fits in 16 bits on this path, so a single MOVW materializes
+       // it completely; no MOVT is ever required.
+       unsigned VReg1 = MRI->createVirtualRegister(TRC);
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
+                          .addImm(NumLPads & 0xFFFF));
+
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+                          .addReg(NewVReg1)
+                          .addReg(VReg1));
+     } else {
+       MachineConstantPool *ConstantPool = MF->getConstantPool();
+       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
+
+       // MachineConstantPool wants an explicit alignment.
+       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
+       if (Align == 0)
+         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
+       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+       unsigned VReg1 = MRI->createVirtualRegister(TRC);
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
+                          .addReg(VReg1, RegState::Define)
+                          .addConstantPoolIndex(Idx)
+                          .addImm(0));
+       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+                          .addReg(NewVReg1)
+                          .addReg(VReg1, RegState::Kill));
+     }
+
+     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
+         .addMBB(TrapBB)
+         .addImm(ARMCC::HI)
+         .addReg(ARM::CPSR);
+
+     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+     AddDefaultCC(
+         AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
+                            .addReg(NewVReg1)
+                            .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
+                        .addJumpTableIndex(MJTI));
+
+     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
+         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
+     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+     AddDefaultPred(
+         BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
+             .addReg(NewVReg3, RegState::Kill)
+             .addReg(NewVReg4)
+             .addImm(0)
+             .addMemOperand(JTMMOLd));
+
+     if (IsPositionIndependent) {
+       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
+           .addReg(NewVReg5, RegState::Kill)
+           .addReg(NewVReg4)
+           .addJumpTableIndex(MJTI);
+     } else {
+       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
+           .addReg(NewVReg5, RegState::Kill)
+           .addJumpTableIndex(MJTI);
+     }
+   }
+
+   // Add the jump table entries as successors to the MBB. A landing pad that
+   // appears more than once in the table is only added once.
+   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
+   for (MachineBasicBlock *CurMBB : LPadList) {
+     if (SeenMBBs.insert(CurMBB).second)
+       DispContBB->addSuccessor(CurMBB);
+   }
+
+   // N.B. the order the invoke BBs are processed in doesn't matter here.
+   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
+   SmallVector<MachineBasicBlock*, 64> MBBLPads;
+   for (MachineBasicBlock *BB : InvokeBBs) {
+
+     // Remove the landing pad successor from the invoke block and replace it
+     // with the new dispatch block. A copy of the successor list is walked
+     // because removeSuccessor modifies the block's own list.
+     SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
+                                                   BB->succ_end());
+     while (!Successors.empty()) {
+       MachineBasicBlock *SMBB = Successors.pop_back_val();
+       if (SMBB->isEHPad()) {
+         BB->removeSuccessor(SMBB);
+         MBBLPads.push_back(SMBB);
+       }
+     }
+
+     BB->addSuccessor(DispatchBB, BranchProbability::getZero());
+     BB->normalizeSuccProbs();
+
+     // Find the invoke call and mark all of the callee-saved registers as
+     // 'implicit defined' so that they're spilled. This prevents code from
+     // moving instructions to before the EH block, where they will never be
+     // executed.
+     for (MachineBasicBlock::reverse_iterator
+              II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
+       if (!II->isCall()) continue;
+
+       // Registers already mentioned by any register operand of the call.
+       DenseMap<unsigned, bool> DefRegs;
+       for (MachineInstr::mop_iterator
+                OI = II->operands_begin(), OE = II->operands_end();
+            OI != OE; ++OI) {
+         if (!OI->isReg()) continue;
+         DefRegs[OI->getReg()] = true;
+       }
+
+       MachineInstrBuilder CallMIB(*MF, &*II);
+
+       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
+         unsigned Reg = SavedRegs[i];
+         // Skip callee-saved registers outside the classes considered for the
+         // current instruction-set mode.
+         if (Subtarget->isThumb2() &&
+             !ARM::tGPRRegClass.contains(Reg) &&
+             !ARM::hGPRRegClass.contains(Reg))
+           continue;
+         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
+           continue;
+         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
+           continue;
+         if (!DefRegs[Reg])
+           CallMIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+       }
+
+       // Only the last call in the block is annotated.
+       break;
+     }
+   }
+
+   // Mark all former landing pads as non-landing pads. The dispatch is the only
+   // landing pad now.
+   for (MachineBasicBlock *FormerLPad : MBBLPads)
+     FormerLPad->setIsEHPad(false);
+
+   // The instruction is gone now.
+   MI.eraseFromParent();
+ }
+
+ /// Return the first successor of \p MBB that is not \p Succ. Reaching the end
+ /// of the successor list without finding one is treated as unreachable.
+ static
+ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
+   MachineBasicBlock::succ_iterator Cur = MBB->succ_begin();
+   const MachineBasicBlock::succ_iterator End = MBB->succ_end();
+
+   // Skip over every successor equal to Succ.
+   while (Cur != End && *Cur == Succ)
+     ++Cur;
+
+   if (Cur != End)
+     return *Cur;
+   llvm_unreachable("Expecting a BB with two successors!");
+ }
+
+ /// Map a load size in bytes to a load opcode for the selected mode.
+ ///
+ /// Sizes of 8 and above select a NEON VLD1 opcode regardless of the Thumb
+ /// flags (16 and 8 are the only sizes handled). Sizes 4, 2 and 1 select the
+ /// Thumb1, Thumb2 or ARM opcode, with \p IsThumb1 checked before
+ /// \p IsThumb2. Any size without an entry yields 0.
+ static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
+   if (LdSize >= 8) {
+     switch (LdSize) {
+     case 16: return ARM::VLD1q32wb_fixed;
+     case 8:  return ARM::VLD1d32wb_fixed;
+     default: return 0;
+     }
+   }
+
+   if (IsThumb1) {
+     switch (LdSize) {
+     case 4:  return ARM::tLDRi;
+     case 2:  return ARM::tLDRHi;
+     case 1:  return ARM::tLDRBi;
+     default: return 0;
+     }
+   }
+
+   if (IsThumb2) {
+     switch (LdSize) {
+     case 4:  return ARM::t2LDR_POST;
+     case 2:  return ARM::t2LDRH_POST;
+     case 1:  return ARM::t2LDRB_POST;
+     default: return 0;
+     }
+   }
+
+   // ARM mode.
+   switch (LdSize) {
+   case 4:  return ARM::LDR_POST_IMM;
+   case 2:  return ARM::LDRH_POST;
+   case 1:  return ARM::LDRB_POST_IMM;
+   default: return 0;
+   }
+ }
+
+ /// Map a store size in bytes to a store opcode for the selected mode.
+ ///
+ /// Sizes of 8 and above select a NEON VST1 opcode regardless of the Thumb
+ /// flags (16 and 8 are the only sizes handled). Sizes 4, 2 and 1 select the
+ /// Thumb1, Thumb2 or ARM opcode, with \p IsThumb1 checked before
+ /// \p IsThumb2. Any size without an entry yields 0.
+ static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
+   if (StSize >= 8) {
+     switch (StSize) {
+     case 16: return ARM::VST1q32wb_fixed;
+     case 8:  return ARM::VST1d32wb_fixed;
+     default: return 0;
+     }
+   }
+
+   if (IsThumb1) {
+     switch (StSize) {
+     case 4:  return ARM::tSTRi;
+     case 2:  return ARM::tSTRHi;
+     case 1:  return ARM::tSTRBi;
+     default: return 0;
+     }
+   }
+
+   if (IsThumb2) {
+     switch (StSize) {
+     case 4:  return ARM::t2STR_POST;
+     case 2:  return ARM::t2STRH_POST;
+     case 1:  return ARM::t2STRB_POST;
+     default: return 0;
+     }
+   }
+
+   // ARM mode.
+   switch (StSize) {
+   case 4:  return ARM::STR_POST_IMM;
+   case 2:  return ARM::STRH_POST;
+   case 1:  return ARM::STRB_POST_IMM;
+   default: return 0;
+   }
+ }
+
+ /// Emit a load of \p LdSize bytes from \p AddrIn into \p Data, producing the
+ /// advanced address in \p AddrOut. Instructions are inserted into \p BB
+ /// before \p Pos. Thumb1 (for sizes below 8) uses a plain load followed by a
+ /// separate tADDi8; every other form uses a single instruction that defines
+ /// both \p Data and \p AddrOut.
+ static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
+                        const TargetInstrInfo *TII, const DebugLoc &dl,
+                        unsigned LdSize, unsigned Data, unsigned AddrIn,
+                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+   unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
+   assert(LdOpc != 0 && "Should have a load opcode");
+
+   const bool IsVector = LdSize >= 8;
+
+   if (!IsVector && IsThumb1) {
+     // load + update AddrIn
+     MachineInstrBuilder Load = BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data);
+     Load.addReg(AddrIn);
+     Load.addImm(0);
+     AddDefaultPred(Load);
+
+     MachineInstrBuilder Bump =
+         BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+     Bump = AddDefaultT1CC(Bump);
+     Bump.addReg(AddrIn);
+     Bump.addImm(LdSize);
+     AddDefaultPred(Bump);
+     return;
+   }
+
+   // Common prefix of the single-instruction forms: Data and AddrOut are
+   // defined, AddrIn is read.
+   MachineInstrBuilder Load = BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data);
+   Load.addReg(AddrOut, RegState::Define);
+   Load.addReg(AddrIn);
+
+   if (IsVector) {
+     Load.addImm(0);
+   } else if (IsThumb2) {
+     Load.addImm(LdSize);
+   } else { // arm
+     Load.addReg(0);
+     Load.addImm(LdSize);
+   }
+   AddDefaultPred(Load);
+ }
+
+ /// Emit a store of \p StSize bytes of \p Data to \p AddrIn, producing the
+ /// advanced address in \p AddrOut. Instructions are inserted into \p BB
+ /// before \p Pos. Thumb1 (for sizes below 8) uses a plain store followed by a
+ /// separate tADDi8; every other form uses a single instruction that defines
+ /// \p AddrOut.
+ static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
+                        const TargetInstrInfo *TII, const DebugLoc &dl,
+                        unsigned StSize, unsigned Data, unsigned AddrIn,
+                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+   unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
+   assert(StOpc != 0 && "Should have a store opcode");
+
+   // NEON form: address first, then a zero immediate, then the data register.
+   if (StSize >= 8) {
+     MachineInstrBuilder Store =
+         BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut);
+     Store.addReg(AddrIn);
+     Store.addImm(0);
+     Store.addReg(Data);
+     AddDefaultPred(Store);
+     return;
+   }
+
+   if (IsThumb1) {
+     // store + update AddrIn
+     MachineInstrBuilder Store = BuildMI(*BB, Pos, dl, TII->get(StOpc));
+     Store.addReg(Data);
+     Store.addReg(AddrIn);
+     Store.addImm(0);
+     AddDefaultPred(Store);
+
+     MachineInstrBuilder Bump =
+         BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+     Bump = AddDefaultT1CC(Bump);
+     Bump.addReg(AddrIn);
+     Bump.addImm(StSize);
+     AddDefaultPred(Bump);
+     return;
+   }
+
+   // Thumb2 and ARM share the leading operands; ARM additionally takes a
+   // zero register operand ahead of the increment.
+   MachineInstrBuilder Store = BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut);
+   Store.addReg(Data);
+   Store.addReg(AddrIn);
+   if (!IsThumb2) // arm
+     Store.addReg(0);
+   Store.addImm(StSize);
+   AddDefaultPred(Store);
+ }
+
+MachineBasicBlock *
+ARMTargetLowering::EmitStructByval(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // This pseudo instruction has 3 operands: dst, src, size
+ // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
+ // Otherwise, we will generate unrolled scalar copies.
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned src = MI.getOperand(1).getReg();
+ unsigned SizeVal = MI.getOperand(2).getImm();
+ unsigned Align = MI.getOperand(3).getImm();
+ DebugLoc dl = MI.getDebugLoc();
+
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned UnitSize = 0;
+ const TargetRegisterClass *TRC = nullptr;
+ const TargetRegisterClass *VecTRC = nullptr;
+
+ bool IsThumb1 = Subtarget->isThumb1Only();
+ bool IsThumb2 = Subtarget->isThumb2();
+ bool IsThumb = Subtarget->isThumb();
+
+ if (Align & 1) {
+ UnitSize = 1;
+ } else if (Align & 2) {
+ UnitSize = 2;
+ } else {
+ // Check whether we can use NEON instructions.
+ if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+ Subtarget->hasNEON()) {
+ if ((Align % 16 == 0) && SizeVal >= 16)
+ UnitSize = 16;
+ else if ((Align % 8 == 0) && SizeVal >= 8)
+ UnitSize = 8;
+ }
+ // Can't use NEON instructions.
+ if (UnitSize == 0)
+ UnitSize = 4;
+ }
+
+ // Select the correct opcode and register class for unit size load/store
+ bool IsNeon = UnitSize >= 8;
+ TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+ if (IsNeon)
+ VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
+ : UnitSize == 8 ? &ARM::DPRRegClass
+ : nullptr;
+
+ unsigned BytesLeft = SizeVal % UnitSize;
+ unsigned LoopSize = SizeVal - BytesLeft;
+
+ if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
+ // Use LDR and STR to copy.
+ // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
+ // [destOut] = STR_POST(scratch, destIn, UnitSize)
+ unsigned srcIn = src;
+ unsigned destIn = dest;
+ for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
+ unsigned srcOut = MRI.createVirtualRegister(TRC);
+ unsigned destOut = MRI.createVirtualRegister(TRC);
+ unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+ emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
+ IsThumb1, IsThumb2);
+ emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
+ IsThumb1, IsThumb2);
+ srcIn = srcOut;
+ destIn = destOut;
+ }
+
+ // Handle the leftover bytes with LDRB and STRB.
+ // [scratch, srcOut] = LDRB_POST(srcIn, 1)
+ // [destOut] = STRB_POST(scratch, destIn, 1)
+ for (unsigned i = 0; i < BytesLeft; i++) {
+ unsigned srcOut = MRI.createVirtualRegister(TRC);
+ unsigned destOut = MRI.createVirtualRegister(TRC);
+ unsigned scratch = MRI.createVirtualRegister(TRC);
+ emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
+ IsThumb1, IsThumb2);
+ emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
+ IsThumb1, IsThumb2);
+ srcIn = srcOut;
+ destIn = destOut;
+ }
+ MI.eraseFromParent(); // The instruction is gone now.
+ return BB;
+ }
+
+ // Expand the pseudo op to a loop.
+ // thisMBB:
+ // ...
+ // movw varEnd, # --> with thumb2
+ // movt varEnd, #
+ // ldrcp varEnd, idx --> without thumb2
+ // fallthrough --> loopMBB
+ // loopMBB:
+ // PHI varPhi, varEnd, varLoop
+ // PHI srcPhi, src, srcLoop
+ // PHI destPhi, dst, destLoop
+ // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+ // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+ // subs varLoop, varPhi, #UnitSize
+ // bne loopMBB
+ // fallthrough --> exitMBB
+ // exitMBB:
+ // epilogue to handle left-over bytes
+ // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+ // [destOut] = STRB_POST(scratch, destLoop, 1)
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, loopMBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Load an immediate to varEnd.
+ unsigned varEnd = MRI.createVirtualRegister(TRC);
+ if (Subtarget->useMovt(*MF)) {
+ unsigned Vtmp = varEnd;
+ if ((LoopSize & 0xFFFF0000) != 0)
+ Vtmp = MRI.createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(BB, dl,
+ TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16),
+ Vtmp).addImm(LoopSize & 0xFFFF));
+
+ if ((LoopSize & 0xFFFF0000) != 0)
+ AddDefaultPred(BuildMI(BB, dl,
+ TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16),
+ varEnd)
+ .addReg(Vtmp)
+ .addImm(LoopSize >> 16));
+ } else {
+ MachineConstantPool *ConstantPool = MF->getConstantPool();
+ Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+ const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+ // MachineConstantPool wants an explicit alignment.
+ unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
+ if (Align == 0)
+ Align = MF->getDataLayout().getTypeAllocSize(C->getType());
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+ if (IsThumb)
+ AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
+ varEnd, RegState::Define).addConstantPoolIndex(Idx));
+ else
+ AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
+ varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
+ }
+ BB->addSuccessor(loopMBB);
+
+ // Generate the loop body:
+ // varPhi = PHI(varLoop, varEnd)
+ // srcPhi = PHI(srcLoop, src)
+ // destPhi = PHI(destLoop, dst)
+ MachineBasicBlock *entryBB = BB;
+ BB = loopMBB;
+ unsigned varLoop = MRI.createVirtualRegister(TRC);
+ unsigned varPhi = MRI.createVirtualRegister(TRC);
+ unsigned srcLoop = MRI.createVirtualRegister(TRC);
+ unsigned srcPhi = MRI.createVirtualRegister(TRC);
+ unsigned destLoop = MRI.createVirtualRegister(TRC);
+ unsigned destPhi = MRI.createVirtualRegister(TRC);
+
+ BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
+ .addReg(varLoop).addMBB(loopMBB)
+ .addReg(varEnd).addMBB(entryBB);
+ BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
+ .addReg(srcLoop).addMBB(loopMBB)
+ .addReg(src).addMBB(entryBB);
+ BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
+ .addReg(destLoop).addMBB(loopMBB)
+ .addReg(dest).addMBB(entryBB);
+
+ // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+ // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+ unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+ emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
+ IsThumb1, IsThumb2);
+ emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
+ IsThumb1, IsThumb2);
+
+ // Decrement loop variable by UnitSize.
+ if (IsThumb1) {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
+ MIB = AddDefaultT1CC(MIB);
+ MIB.addReg(varPhi).addImm(UnitSize);
+ AddDefaultPred(MIB);
+ } else {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, BB->end(), dl,
+ TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+ AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+ MIB->getOperand(5).setReg(ARM::CPSR);
+ MIB->getOperand(5).setIsDef(true);
+ }
+ BuildMI(*BB, BB->end(), dl,
+ TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
+ .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+ // loopMBB can loop back to loopMBB or fall through to exitMBB.
+ BB->addSuccessor(loopMBB);
+ BB->addSuccessor(exitMBB);
+
+ // Add epilogue to handle BytesLeft.
+ BB = exitMBB;
+ auto StartOfExit = exitMBB->begin();
+
+ // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+ // [destOut] = STRB_POST(scratch, destLoop, 1)
+ unsigned srcIn = srcLoop;
+ unsigned destIn = destLoop;
+ for (unsigned i = 0; i < BytesLeft; i++) {
+ unsigned srcOut = MRI.createVirtualRegister(TRC);
+ unsigned destOut = MRI.createVirtualRegister(TRC);
+ unsigned scratch = MRI.createVirtualRegister(TRC);
+ emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
+ IsThumb1, IsThumb2);
+ emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
+ IsThumb1, IsThumb2);
+ srcIn = srcOut;
+ destIn = destOut;
+ }
+
+ MI.eraseFromParent(); // The instruction is gone now.
+ return BB;
+}
+
+/// Expand the Windows stack-probe pseudo \p MI inside \p MBB: emit a call to
+/// __chkstk (direct or through a register, depending on the code model),
+/// then subtract R4 from SP. The pseudo is erased and \p MBB is returned.
+MachineBasicBlock *
+ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
+                                       MachineBasicBlock *MBB) const {
+  assert(Subtarget->isTargetWindows() &&
+         "__chkstk is only supported on Windows");
+  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
+
+  const TargetMachine &TM = getTargetMachine();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  const DebugLoc Loc = MI.getDebugLoc();
+
+  // Calling convention of __chkstk: R4 carries the number of words to
+  // allocate on entry and the stack adjustment in bytes on return. Apart
+  // from lr, no other register is clobbered by the routine itself.
+  //
+  // IP (R12) is nominally call-clobbered, but nothing on this path touches
+  // it: Windows on ARM is Thumb-2 only, so the linker has no interworking
+  // veneer to insert; every module carries its own __chkstk, so there is no
+  // import thunk; and an out-of-range call (Thumb branches reach 32M) can be
+  // handled by the long-call form selected with -mcmodel=large rather than
+  // by a linker trampoline. R12 is therefore modelled as a dead definition.
+  auto AddImplicitCallRegs = [](const MachineInstrBuilder &Call) {
+    Call.addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+        .addReg(ARM::R12,
+                RegState::Implicit | RegState::Define | RegState::Dead);
+  };
+
+  switch (TM.getCodeModel()) {
+  case CodeModel::Small:
+  case CodeModel::Medium:
+  case CodeModel::Default:
+  case CodeModel::Kernel:
+    // Direct BL to the symbol.
+    AddImplicitCallRegs(BuildMI(*MBB, MI, Loc, TII.get(ARM::tBL))
+                            .addImm((unsigned)ARMCC::AL)
+                            .addReg(0)
+                            .addExternalSymbol("__chkstk"));
+    break;
+  case CodeModel::Large:
+  case CodeModel::JITDefault: {
+    // Materialize the address in a fresh register and call through it.
+    MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
+    unsigned TargetReg = RegInfo.createVirtualRegister(&ARM::rGPRRegClass);
+
+    BuildMI(*MBB, MI, Loc, TII.get(ARM::t2MOVi32imm), TargetReg)
+        .addExternalSymbol("__chkstk");
+    AddImplicitCallRegs(BuildMI(*MBB, MI, Loc, TII.get(ARM::tBLXr))
+                            .addImm((unsigned)ARMCC::AL)
+                            .addReg(0)
+                            .addReg(TargetReg, RegState::Kill));
+    break;
+  }
+  }
+
+  // SP = SP - R4, flagged as part of frame setup.
+  MachineInstrBuilder StackAdjust =
+      BuildMI(*MBB, MI, Loc, TII.get(ARM::t2SUBrr), ARM::SP)
+          .addReg(ARM::SP, RegState::Kill)
+          .addReg(ARM::R4, RegState::Kill)
+          .setMIFlags(MachineInstr::FrameSetup);
+  AddDefaultCC(AddDefaultPred(StackAdjust));
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+/// Expand the WIN__DBZCHK pseudo \p MI: split \p MBB after the pseudo,
+/// compare the pseudo's operand 0 against zero and branch to a new block
+/// holding t__brkdiv0 when it is equal. Returns the continuation block that
+/// received the instructions following the pseudo.
+MachineBasicBlock *
+ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
+                                       MachineBasicBlock *MBB) const {
+  MachineFunction *Fn = MBB->getParent();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  const DebugLoc Loc = MI.getDebugLoc();
+
+  // Continuation block, placed right after MBB; it takes over everything
+  // that followed the pseudo as well as MBB's successor edges.
+  MachineBasicBlock *TailBB = Fn->CreateMachineBasicBlock();
+  Fn->insert(std::next(MBB->getIterator()), TailBB);
+  MachineBasicBlock::iterator AfterPseudo =
+      std::next(MachineBasicBlock::iterator(MI));
+  TailBB->splice(TailBB->begin(), MBB, AfterPseudo, MBB->end());
+  TailBB->transferSuccessorsAndUpdatePHIs(MBB);
+  MBB->addSuccessor(TailBB);
+
+  // Trap block, appended at the end of the function.
+  MachineBasicBlock *BrkBB = Fn->CreateMachineBasicBlock();
+  BuildMI(BrkBB, Loc, TII->get(ARM::t__brkdiv0));
+  Fn->push_back(BrkBB);
+  MBB->addSuccessor(BrkBB);
+
+  // cmp <operand 0>, #0 ; beq BrkBB -- both inserted in front of the pseudo.
+  const unsigned CheckedReg = MI.getOperand(0).getReg();
+  AddDefaultPred(BuildMI(*MBB, MI, Loc, TII->get(ARM::tCMPi8))
+                     .addReg(CheckedReg)
+                     .addImm(0));
+  BuildMI(*MBB, MI, Loc, TII->get(ARM::t2Bcc))
+      .addMBB(BrkBB)
+      .addImm(ARMCC::EQ)
+      .addReg(ARM::CPSR);
+
+  MI.eraseFromParent();
+  return TailBB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+ bool isThumb2 = Subtarget->isThumb2();
+ switch (MI.getOpcode()) { // Dispatch on the pseudo opcode; every case returns a block (BB itself or one created here).
+ default: {
+ MI.dump(); // Print the offending instruction before aborting.
+ llvm_unreachable("Unexpected instr type to insert");
+ }
+
+ // Thumb1 post-indexed loads are really just single-register LDMs.
+ case ARM::tLDR_postidx: {
+ BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
+ .addOperand(MI.getOperand(1)) // Rn_wb
+ .addOperand(MI.getOperand(2)) // Rn
+ .addOperand(MI.getOperand(3)) // PredImm
+ .addOperand(MI.getOperand(4)) // PredReg
+ .addOperand(MI.getOperand(0)); // Rt
+ MI.eraseFromParent();
+ return BB;
+ }
+
+ // The Thumb2 pre-indexed stores have the same MI operands, they just
+ // define them differently in the .td files from the isel patterns, so
+ // they need pseudos. Only the instruction descriptor is swapped here.
+ case ARM::t2STR_preidx:
+ MI.setDesc(TII->get(ARM::t2STR_PRE));
+ return BB;
+ case ARM::t2STRB_preidx:
+ MI.setDesc(TII->get(ARM::t2STRB_PRE));
+ return BB;
+ case ARM::t2STRH_preidx:
+ MI.setDesc(TII->get(ARM::t2STRH_PRE));
+ return BB;
+
+ case ARM::STRi_preidx:
+ case ARM::STRBi_preidx: {
+ unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
+ : ARM::STRB_PRE_IMM;
+ // Decode the addrmode2 offset in operand 4 into a plain (possibly negated) immediate.
+ unsigned Offset = MI.getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
+ Offset = ARM_AM::getAM2Offset(Offset);
+ if (isSub)
+ Offset = -Offset;
+
+ MachineMemOperand *MMO = *MI.memoperands_begin(); // Carried over to the replacement store.
+ BuildMI(*BB, MI, dl, TII->get(NewOpc))
+ .addOperand(MI.getOperand(0)) // Rn_wb
+ .addOperand(MI.getOperand(1)) // Rt
+ .addOperand(MI.getOperand(2)) // Rn
+ .addImm(Offset) // offset (skip GPR==zero_reg)
+ .addOperand(MI.getOperand(5)) // pred
+ .addOperand(MI.getOperand(6))
+ .addMemOperand(MMO);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case ARM::STRr_preidx:
+ case ARM::STRBr_preidx:
+ case ARM::STRH_preidx: {
+ unsigned NewOpc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode!");
+ case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
+ case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
+ case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
+ }
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i) // Copy every operand of the pseudo unchanged.
+ MIB.addOperand(MI.getOperand(i));
+ MI.eraseFromParent();
+ return BB;
+ }
+
+ case ARM::tMOVCCr_pseudo: {
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC sinkMBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ BuildMI(BB, dl, TII->get(ARM::tBcc)) // Condition and flag register come from operands 3 and 4 of the pseudo.
+ .addMBB(sinkMBB)
+ .addImm(MI.getOperand(3).getImm())
+ .addReg(MI.getOperand(4).getReg());
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(thisMBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+ }
+
+ case ARM::BCCi64:
+ case ARM::BCCZi64: {
+ // Erase everything after the pseudo (e.g. an unconditional branch to the other successor).
+ BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
+
+ // Compare both parts that make up the double comparison separately for
+ // equality; the second compare is predicated on EQ from the first.
+ bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
+
+ unsigned LHS1 = MI.getOperand(1).getReg();
+ unsigned LHS2 = MI.getOperand(2).getReg();
+ if (RHSisZero) {
+ AddDefaultPred(BuildMI(BB, dl,
+ TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+ .addReg(LHS1).addImm(0));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+ .addReg(LHS2).addImm(0)
+ .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+ } else {
+ unsigned RHS1 = MI.getOperand(3).getReg();
+ unsigned RHS2 = MI.getOperand(4).getReg();
+ AddDefaultPred(BuildMI(BB, dl,
+ TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+ .addReg(LHS1).addReg(RHS1));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+ .addReg(LHS2).addReg(RHS2)
+ .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+ }
+
+ MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); // Branch target operand index depends on the pseudo form.
+ MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
+ if (MI.getOperand(0).getImm() == ARMCC::NE) // For NE, swap targets so the EQ branch below goes to the other successor.
+ std::swap(destMBB, exitMBB);
+
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+ .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+ if (isThumb2)
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
+ else
+ BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+ }
+
+ case ARM::Int_eh_sjlj_setjmp:
+ case ARM::Int_eh_sjlj_setjmp_nofp:
+ case ARM::tInt_eh_sjlj_setjmp:
+ case ARM::t2Int_eh_sjlj_setjmp:
+ case ARM::t2Int_eh_sjlj_setjmp_nofp:
+ return BB; // These pseudos are left untouched here.
+
+ case ARM::Int_eh_sjlj_setup_dispatch:
+ EmitSjLjDispatchBlock(MI, BB);
+ return BB;
+
+ case ARM::ABS:
+ case ARM::t2ABS: {
+ // To insert an ABS instruction, we have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // source vreg to test against 0, the destination vreg to set,
+ // the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ // It transforms
+ // V1 = ABS V0
+ // into
+ // CMP V0, #0
+ // BCC (branch to SinkBB on the opposite condition of MI)
+ // RSBBB: V3 = RSBri V0, 0 (negate V0)
+ // SinkBB: V1 = PHI(V3 from RSBBB, V0 from BB)
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator BBI = ++BB->getIterator();
+ MachineFunction *Fn = BB->getParent();
+ MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
+ Fn->insert(BBI, RSBBB);
+ Fn->insert(BBI, SinkBB);
+
+ unsigned int ABSSrcReg = MI.getOperand(1).getReg();
+ unsigned int ABSDstReg = MI.getOperand(0).getReg();
+ bool ABSSrcKIll = MI.getOperand(1).isKill();
+ bool isThumb2 = Subtarget->isThumb2(); // Shadows the function-level isThumb2; same value.
+ MachineRegisterInfo &MRI = Fn->getRegInfo();
+ // In Thumb mode S must not be specified if source register is the SP or
+ // PC and if destination register is the SP, so restrict register class
+ unsigned NewRsbDstReg =
+ MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
+
+ // Transfer the remainder of BB and its successor edges to SinkBB.
+ SinkBB->splice(SinkBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ SinkBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ BB->addSuccessor(RSBBB);
+ BB->addSuccessor(SinkBB);
+
+ // fall through to SinkBB
+ RSBBB->addSuccessor(SinkBB);
+
+ // insert a cmp at the end of BB
+ AddDefaultPred(BuildMI(BB, dl,
+ TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+ .addReg(ABSSrcReg).addImm(0));
+
+ // insert a bcc with opposite CC to ARMCC::MI at the end of BB
+ BuildMI(BB, dl,
+ TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
+ .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
+
+ // insert rsbri in RSBBB
+ // Note: BCC and rsbri will be converted into predicated rsbmi
+ // by if-conversion pass
+ BuildMI(*RSBBB, RSBBB->begin(), dl,
+ TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
+ .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
+ .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+
+ // insert PHI in SinkBB,
+ // reuse ABSDstReg to not change uses of ABS instruction
+ BuildMI(*SinkBB, SinkBB->begin(), dl,
+ TII->get(ARM::PHI), ABSDstReg)
+ .addReg(NewRsbDstReg).addMBB(RSBBB)
+ .addReg(ABSSrcReg).addMBB(BB);
+
+ // remove ABS instruction
+ MI.eraseFromParent();
+
+ // return last added BB
+ return SinkBB;
+ }
+ case ARM::COPY_STRUCT_BYVAL_I32:
+ ++NumLoopByVals; // Count this expansion before delegating.
+ return EmitStructByval(MI, BB);
+ case ARM::WIN__CHKSTK:
+ return EmitLowered__chkstk(MI, BB);
+ case ARM::WIN__DBZCHK:
+ return EmitLowered__dbzchk(MI, BB);
+ }
+}
+
+/// \brief Attaches vregs to MEMCPY that it will use as scratch registers
+/// when it is expanded into LDM/STM. This is done as a post-isel lowering
+/// instead of as a custom inserter because we need the use list from the SDNode.
+///
+/// \param Subtarget Selects the scratch register class: tGPR when the
+///                  subtarget is Thumb1-only, GPR otherwise.
+/// \param MI        The MEMCPY instruction; operands 0 and 1 may be marked
+///                  dead, and operand 4 holds the number of scratch
+///                  registers to append.
+/// \param Node      The SDNode \p MI was selected from; its result uses
+///                  decide which of operands 0 and 1 are dead.
+static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
+                                    MachineInstr &MI, const SDNode *Node) {
+  MachineFunction *MF = MI.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB(*MF, MI);
+
+  // If the new dst/src (results 0 and 1 of the node) is unused, mark the
+  // matching operand as dead.
+  for (unsigned ResNo = 0; ResNo != 2; ++ResNo)
+    if (!Node->hasAnyUseOfValue(ResNo))
+      MI.getOperand(ResNo).setIsDead(true);
+
+  const TargetRegisterClass *ScratchRC =
+      Subtarget->isThumb1Only() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+
+  // Read the register count once, before any operand is appended, rather
+  // than re-fetching operand 4 on every iteration while the operand list is
+  // growing. A signed counter also avoids a mixed-sign comparison.
+  const int64_t NumScratchRegs = MI.getOperand(4).getImm();
+
+  // The MEMCPY both defines and kills the scratch registers.
+  for (int64_t I = 0; I < NumScratchRegs; ++I)
+    MIB.addReg(MRI.createVirtualRegister(ScratchRC),
+               RegState::Define | RegState::Dead);
+}
+
+void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const {
+ if (MI.getOpcode() == ARM::MEMCPY) { // MEMCPY only needs its scratch registers attached.
+ attachMEMCPYScratchRegs(Subtarget, MI, Node);
+ return;
+ }
+
+ const MCInstrDesc *MCID = &MI.getDesc();
+ // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
+ // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
+ // operand is still set to noreg. If needed, set the optional operand's
+ // register to CPSR, and remove the redundant implicit def.
+ //
+ // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
+
+ // Rename pseudo opcodes (convertAddSubFlagsOpcode returns 0 when there is nothing to rename).
+ unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
+ if (NewOpc) {
+ const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
+ MCID = &TII->get(NewOpc);
+
+ assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
+ "converted opcode should be the same except for cc_out");
+
+ MI.setDesc(*MCID);
+
+ // Add the optional cc_out operand, initially noreg.
+ MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
+ }
+ unsigned ccOutIdx = MCID->getNumOperands() - 1; // cc_out, when present, is the last declared operand.
+
+ // Any ARM instruction that sets the 's' bit should specify an optional
+ // "cc_out" operand in the last operand position.
+ if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
+ assert(!NewOpc && "Optional cc_out operand required");
+ return;
+ }
+ // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
+ // since we already have an optional CPSR def. Only operands past the declared ones are scanned.
+ bool definesCPSR = false;
+ bool deadCPSR = false;
+ for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
+ ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
+ definesCPSR = true;
+ if (MO.isDead())
+ deadCPSR = true;
+ MI.RemoveOperand(i); // MO must not be used after this; the loop exits immediately.
+ break;
+ }
+ }
+ if (!definesCPSR) {
+ assert(!NewOpc && "Optional cc_out operand required");
+ return;
+ }
+ assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
+ if (deadCPSR) { // Flags are unused: leave cc_out as noreg.
+ assert(!MI.getOperand(ccOutIdx).getReg() &&
+ "expect uninitialized optional cc_out operand");
+ return;
+ }
+
+ // If this instruction was defined with an optional CPSR def and its dag node
+ // had a live implicit CPSR def, then activate the optional CPSR def.
+ MachineOperand &MO = MI.getOperand(ccOutIdx);
+ MO.setReg(ARM::CPSR);
+ MO.setIsDef(true);
+}
+
+//===----------------------------------------------------------------------===//
+// ARM Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+// Tests N against a single constant: the all-ones constant when AllOnes is
+// set, the null constant otherwise.
+static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
+  if (AllOnes)
+    return isAllOnesConstant(N);
+  return isNullConstant(N);
+}
+
+// Decide whether N evaluates, depending on a condition, to the null constant
+// (AllOnes clear) or the all-ones constant (AllOnes set). The recognized
+// forms, with cc an i1 value, are:
+//
+//   (select cc 0, y)    [AllOnes=0]
+//   (select cc y, 0)    [AllOnes=0]
+//   (zext cc)           [AllOnes=0]
+//   (sext cc)           [AllOnes=0/1]
+//   (select cc -1, y)   [AllOnes=1]
+//   (select cc y, -1)   [AllOnes=1]
+//
+// On success CC receives the condition, Invert is true when N takes the
+// constant value on a false condition, and OtherOp receives the value N
+// takes otherwise.
+static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
+                                       SDValue &CC, bool &Invert,
+                                       SDValue &OtherOp,
+                                       SelectionDAG &DAG) {
+  const unsigned Opc = N->getOpcode();
+
+  if (Opc == ISD::SELECT) {
+    CC = N->getOperand(0);
+    SDValue TrueArm = N->getOperand(1);
+    SDValue FalseArm = N->getOperand(2);
+    // The true arm is tried first; the false arm only if that fails.
+    const bool ConstOnTrue = isZeroOrAllOnes(TrueArm, AllOnes);
+    if (!ConstOnTrue && !isZeroOrAllOnes(FalseArm, AllOnes))
+      return false;
+    Invert = !ConstOnTrue;
+    OtherOp = ConstOnTrue ? FalseArm : TrueArm;
+    return true;
+  }
+
+  const bool IsZExt = Opc == ISD::ZERO_EXTEND;
+  if (!IsZExt && Opc != ISD::SIGN_EXTEND)
+    return false;
+
+  // (zext cc) can never be the all ones value.
+  if (IsZExt && AllOnes)
+    return false;
+
+  CC = N->getOperand(0);
+  if (CC.getValueType() != MVT::i1)
+    return false;
+
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  Invert = !AllOnes;
+  if (AllOnes) {
+    // Only sext reaches here: all ones on true, so the other value is 0.
+    OtherOp = DAG.getConstant(0, dl, VT);
+  } else if (IsZExt) {
+    // zext is 0 on false, 1 on true.
+    OtherOp = DAG.getConstant(1, dl, VT);
+  } else {
+    // sext is 0 on false, all ones on true.
+    OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
+                              VT);
+  }
+  return true;
+}
+
+// Fold a select with an identity-constant arm into the operation using it:
+//
+//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
+//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
+//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
+//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
+//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
+//
+// Nothing is done unless one select arm is the null constant (or the all
+// ones constant when AllOnes is set).
+//
+// Extensions from i1 are handled as well:
+//
+//   (add (zext cc), x) -> (select cc (add x, 1), x)
+//   (add (sext cc), x) -> (select cc (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
+//
+// @param N       The node to transform.
+// @param Slct    The N operand that is a select.
+// @param OtherOp The other N operand (x above).
+// @param DCI     Context.
+// @param AllOnes Require the select constant to be all ones instead of null.
+// @returns The new node, or SDValue() on failure.
+static
+SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            bool AllOnes = false) {
+  SelectionDAG &DAG = DCI.DAG;
+
+  SDValue Cond;
+  SDValue Alternative;
+  bool IdentityOnFalse;
+  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, Cond,
+                                  IdentityOnFalse, Alternative, DAG))
+    return SDValue();
+
+  const SDLoc dl(N);
+  const EVT VT = N->getValueType(0);
+
+  // When Slct is the identity constant N reduces to OtherOp; otherwise N is
+  // applied to OtherOp and the alternative value.
+  SDValue Applied = DAG.getNode(N->getOpcode(), dl, VT, OtherOp, Alternative);
+
+  // The identity case sits on the true arm unless the helper reported that
+  // the constant is taken when the condition is false.
+  SDValue OnTrue = IdentityOnFalse ? Applied : OtherOp;
+  SDValue OnFalse = IdentityOnFalse ? OtherOp : Applied;
+  return DAG.getNode(ISD::SELECT, dl, VT, Cond, OnTrue, OnFalse);
+}
+
+// For a commutative operator N, try combineSelectAndUse with operand 0 as
+// the select and then with operand 1 as the select. An operand is only
+// considered when its node has a single use. The first successful fold
+// wins; SDValue() is returned if neither applies.
+static
+SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  for (unsigned SlctIdx = 0; SlctIdx != 2; ++SlctIdx) {
+    SDValue Slct = N->getOperand(SlctIdx);
+    SDValue Other = N->getOperand(1 - SlctIdx);
+    if (!Slct.getNode()->hasOneUse())
+      continue;
+    SDValue Folded = combineSelectAndUse(N, Slct, Other, DCI, AllOnes);
+    if (Folded.getNode())
+      return Folded;
+  }
+  return SDValue();
+}
+
+// AddCombineToVPADDL - Recognize a pair-wise add spelled as an ADD of two
+// BUILD_VECTORs of EXTRACT_VECTOR_ELTs and emit the NEON vpaddl intrinsic
+// instead (only after legalization). Returns SDValue() when the pattern
+// does not match.
+static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  // Needs a legalized DAG and NEON.
+  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON())
+    return SDValue();
+
+  // Both addends have to be BUILD_VECTORs.
+  if (N0.getOpcode() != ISD::BUILD_VECTOR ||
+      N1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // VPADDL operand elements can only be 8, 16 or 32 bits wide, so the result
+  // must be an integer vector whose elements are not i64.
+  EVT VT = N->getValueType(0);
+  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Every BUILD_VECTOR operand must be an EXTRACT_VECTOR_ELT from one common
+  // source vector; that vector is taken from the first element of N0.
+  SDValue FirstElt = N0->getOperand(0);
+  if (FirstElt->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  SDValue Vec = FirstElt->getOperand(0);
+  SDNode *SrcNode = Vec.getNode();
+
+  // Lane i of N0 must extract element 2*i and lane i of N1 element 2*i+1,
+  // i.e. N0 gathers the even elements and N1 the odd ones.
+  for (unsigned Lane = 0, NumLanes = N0->getNumOperands(); Lane != NumLanes;
+       ++Lane) {
+    SDValue Elt0 = N0->getOperand(Lane);
+    if (Elt0->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+    SDValue Elt1 = N1->getOperand(Lane);
+    if (Elt1->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Operand 0 of each extract is the vector; it has to be the common one.
+    if (Elt0->getOperand(0).getNode() != SrcNode ||
+        Elt1->getOperand(0).getNode() != SrcNode)
+      return SDValue();
+
+    // Operand 1 is the index; it has to be the expected constant.
+    auto *Idx0 = dyn_cast<ConstantSDNode>(Elt0->getOperand(1));
+    auto *Idx1 = dyn_cast<ConstantSDNode>(Elt1->getOperand(1));
+    if (!Idx0 || !Idx1)
+      return SDValue();
+    const unsigned EvenIdx = 2 * Lane;
+    if (Idx0->getZExtValue() != EvenIdx ||
+        Idx1->getZExtValue() != EvenIdx + 1)
+      return SDValue();
+  }
+
+  // Pattern matched: build the VPADDL intrinsic node.
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDLoc dl(N);
+
+  // Operands: the intrinsic id followed by the source vector.
+  SDValue Ops[] = {
+      DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
+                      TLI.getPointerTy(DAG.getDataLayout())),
+      Vec};
+
+  // The intrinsic result has elements twice as wide as the source lanes and
+  // as many elements as the ADD's result type.
+  MVT WideEltVT;
+  EVT SrcEltVT = Vec.getValueType().getVectorElementType();
+  switch (SrcEltVT.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    WideEltVT = MVT::i16;
+    break;
+  case MVT::i16:
+    WideEltVT = MVT::i32;
+    break;
+  case MVT::i32:
+    WideEltVT = MVT::i64;
+    break;
+  default:
+    llvm_unreachable("Invalid vector element type for padd optimization.");
+  }
+  MVT WideVT = MVT::getVectorVT(WideEltVT, VT.getVectorNumElements());
+
+  // Extend or truncate the intrinsic result to the type of the ADD.
+  SDValue PairSum = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WideVT, Ops);
+  unsigned FixupOpc =
+      VT.bitsGT(PairSum.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
+  return DAG.getNode(FixupOpc, dl, VT, PairSum);
+}
+
+// Returns V unchanged when its node is a UMUL_LOHI or SMUL_LOHI, and an
+// empty SDValue otherwise.
+static SDValue findMUL_LOHI(SDValue V) {
+  switch (V->getOpcode()) {
+  case ISD::UMUL_LOHI:
+  case ISD::SMUL_LOHI:
+    return V;
+  default:
+    return SDValue();
+  }
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) { // Subtarget is not referenced in this function body.
+
+ // Look for multiply add opportunities.
+ // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
+ // each add node consumes a value from ISD::UMUL_LOHI and there is
+ // a glue link from the first add to the second add.
+ // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
+ // a S/UMLAL instruction.
+ // UMUL_LOHI
+ // / :lo \ :hi
+ // / \ [no multiline comment]
+ // loAdd -> ADDE |
+ // \ :glue /
+ // \ /
+ // ADDC <- hiAdd
+ //
+ assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+ SDValue AddcOp0 = AddcNode->getOperand(0);
+ SDValue AddcOp1 = AddcNode->getOperand(1);
+
+ // Bail out if both ADDC operands come from the same node.
+ if (AddcOp0.getNode() == AddcOp1.getNode())
+ return SDValue();
+
+ assert(AddcNode->getNumValues() == 2 &&
+ AddcNode->getValueType(0) == MVT::i32 &&
+ "Expect ADDC with two result values. First: i32");
+
+ // Check that we have a glued ADDC node.
+ if (AddcNode->getValueType(1) != MVT::Glue)
+ return SDValue();
+
+ // Bail out unless at least one ADDC operand is an S/UMUL_LOHI.
+ if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
+ AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
+ AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
+ AddcOp1->getOpcode() != ISD::SMUL_LOHI)
+ return SDValue();
+
+ // Look for the glued ADDE.
+ SDNode* AddeNode = AddcNode->getGluedUser();
+ if (!AddeNode)
+ return SDValue();
+
+ // Make sure it is really an ADDE.
+ if (AddeNode->getOpcode() != ISD::ADDE)
+ return SDValue();
+
+ assert(AddeNode->getNumOperands() == 3 &&
+ AddeNode->getOperand(2).getValueType() == MVT::Glue &&
+ "ADDE node has the wrong inputs");
+
+ // Check for the triangle shape.
+ SDValue AddeOp0 = AddeNode->getOperand(0);
+ SDValue AddeOp1 = AddeNode->getOperand(1);
+
+ // Make sure that the ADDE operands are not coming from the same node.
+ if (AddeOp0.getNode() == AddeOp1.getNode())
+ return SDValue();
+
+ // Find the MUL_LOHI node among the ADDE's two operands (operand 0 is tried first).
+ bool IsLeftOperandMUL = false;
+ SDValue MULOp = findMUL_LOHI(AddeOp0);
+ if (MULOp == SDValue())
+ MULOp = findMUL_LOHI(AddeOp1);
+ else
+ IsLeftOperandMUL = true;
+ if (MULOp == SDValue())
+ return SDValue();
+
+ // Figure out the right opcode: signed multiply gives SMLAL, otherwise UMLAL.
+ unsigned Opc = MULOp->getOpcode();
+ unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
+
+ // Figure out the high and low input values to the MLAL node.
+ SDValue* HiAdd = nullptr;
+ SDValue* LoMul = nullptr;
+ SDValue* LowAdd = nullptr;
+
+ // Ensure that one ADDE operand is the high result (value 1) of the MUL_LOHI.
+ if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+ return SDValue();
+
+ if (IsLeftOperandMUL) // The ADDE operand that is not the multiply is the high addend.
+ HiAdd = &AddeOp1;
+ else
+ HiAdd = &AddeOp0;
+
+
+ // Ensure that LoMul and LowAdd are taken from the same MUL_LOHI node,
+ // whose low result (value 0) is fed to the ADDC we are checking.
+
+ if (AddcOp0 == MULOp.getValue(0)) {
+ LoMul = &AddcOp0;
+ LowAdd = &AddcOp1;
+ }
+ if (AddcOp1 == MULOp.getValue(0)) {
+ LoMul = &AddcOp1;
+ LowAdd = &AddcOp0;
+ }
+
+ if (!LoMul) // Neither ADDC operand is the low result of this multiply.
+ return SDValue();
+
+ // Create the merged node.
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Build operand list: the two multiply inputs, then the low and high addends.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(LoMul->getOperand(0));
+ Ops.push_back(LoMul->getOperand(1));
+ Ops.push_back(*LowAdd);
+ Ops.push_back(*HiAdd);
+
+ SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode),
+ DAG.getVTList(MVT::i32, MVT::i32), Ops);
+
+ // Replace the ADDs' nodes uses by the MLAL node's values: result 1 stands in for the ADDE.
+ SDValue HiMLALResult(MLALNode.getNode(), 1);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+ SDValue LoMLALResult(MLALNode.getNode(), 0); // Result 0 stands in for the ADDC.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+
+ // Return original node to notify the driver to stop replacing.
+ SDValue resNode(AddcNode, 0);
+ return resNode;
+}
+
+ static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       const ARMSubtarget *Subtarget) {
+   // UMAAL is similar to UMLAL except that it adds two unsigned values.
+   // While trying to combine for the other MLAL nodes, first search for the
+   // chance to use UMAAL. Check if Addc uses another addc node which can first
+   // be combined into a UMLAL. The other pattern is AddcNode being combined
+   // into an UMLAL and then using another addc is handled in ISelDAGToDAG.
+   //
+   // Returns SDValue(AddcNode, 0) when a UMAAL was formed (the uses of the
+   // ADDC/ADDE results have already been rewritten at that point), forwards
+   // the result of the plain MLAL combine when there is no UMAAL candidate,
+   // and returns an empty SDValue otherwise.
+
+   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() ||
+       (Subtarget->isThumb() && !Subtarget->hasThumb2()))
+     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+   SDNode *PrevAddc = nullptr;
+   if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
+     PrevAddc = AddcNode->getOperand(0).getNode();
+   else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
+     PrevAddc = AddcNode->getOperand(1).getNode();
+
+   // If there is no addc chain, just return a search for any MLAL.
+   if (PrevAddc == nullptr)
+     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+   // Try to convert the addc operand to an MLAL and if that fails try to
+   // combine AddcNode.
+   SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
+   if (MLAL != SDValue(PrevAddc, 0))
+     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+   // Find the converted UMLAL or quit if it doesn't exist. The other ADDC
+   // operand becomes the second addend of the UMAAL.
+   SDNode *UmlalNode = nullptr;
+   SDValue AddHi;
+   if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
+     UmlalNode = AddcNode->getOperand(0).getNode();
+     AddHi = AddcNode->getOperand(1);
+   } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
+     UmlalNode = AddcNode->getOperand(1).getNode();
+     AddHi = AddcNode->getOperand(0);
+   } else {
+     return SDValue();
+   }
+
+   // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
+   // the ADDC as well as Zero. Operand 3 of the UMLAL is its high addend and
+   // has to be the constant 0 for the rewrite to be valid.
+   auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
+
+   if (!Zero || Zero->getZExtValue() != 0)
+     return SDValue();
+
+   // Check that we have a glued ADDC node.
+   if (AddcNode->getValueType(1) != MVT::Glue)
+     return SDValue();
+
+   // Look for the glued ADDE.
+   SDNode* AddeNode = AddcNode->getGluedUser();
+   if (!AddeNode)
+     return SDValue();
+
+   // Make sure it is really an ADDE before its operands are inspected and its
+   // result is replaced; AddCombineTo64bitMLAL performs the same check.
+   if (AddeNode->getOpcode() != ISD::ADDE)
+     return SDValue();
+
+   if ((AddeNode->getOperand(0).getNode() == Zero &&
+        AddeNode->getOperand(1).getNode() == UmlalNode) ||
+       (AddeNode->getOperand(0).getNode() == UmlalNode &&
+        AddeNode->getOperand(1).getNode() == Zero)) {
+
+     SelectionDAG &DAG = DCI.DAG;
+     SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
+                       UmlalNode->getOperand(2), AddHi };
+     SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
+                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);
+
+     // Replace the ADDs' nodes uses by the UMAAL node's values.
+     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
+     DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
+
+     // Return original node to notify the driver to stop replacing.
+     return SDValue(AddcNode, 0);
+   }
+   return SDValue();
+ }
+
+ /// PerformADDCCombine - Target-specific DAG combine for ISD::ADDC. Tries to
+ /// turn
+ ///   ISD::ADDC, ISD::ADDE and ISD::MUL_LOHI  into an MLAL node, or
+ ///   ISD::ADDC, ISD::ADDE and ARMISD::UMLAL  into ARMISD::UMAAL.
+ static SDValue PerformADDCCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const ARMSubtarget *Subtarget) {
+   // Thumb1-only subtargets are excluded, and the patterns are only looked
+   // for once legalization has run.
+   if (Subtarget->isThumb1Only() || DCI.isBeforeLegalize())
+     return SDValue();
+
+   return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
+ }
+
+ /// PerformADDCombineWithOperands - Run the ADD combines for one particular
+ /// operand order (N0, N1). PerformADDCombine calls this helper first with the
+ /// operands as they appear on the node and, if nothing matched, a second
+ /// time with them swapped.
+ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                              TargetLowering::DAGCombinerInfo &DCI,
+                                              const ARMSubtarget *Subtarget) {
+   // Attempt to create vpaddl for this add.
+   SDValue Paddl = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
+   if (Paddl.getNode())
+     return Paddl;
+
+   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
+   // Only attempted when N0 has a single use.
+   if (N0.getNode()->hasOneUse()) {
+     SDValue Selected = combineSelectAndUse(N, N0, N1, DCI);
+     if (Selected.getNode())
+       return Selected;
+   }
+
+   return SDValue();
+ }
+
+ /// PerformADDCombine - Target-specific DAG combine for ISD::ADD. The
+ /// operand-order-sensitive folds are tried in both orders.
+ static SDValue PerformADDCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+   const SDValue LHS = N->getOperand(0);
+   const SDValue RHS = N->getOperand(1);
+
+   // Natural operand order first ...
+   SDValue Folded = PerformADDCombineWithOperands(N, LHS, RHS, DCI, Subtarget);
+   if (Folded.getNode())
+     return Folded;
+
+   // ... then the commuted one.
+   return PerformADDCombineWithOperands(N, RHS, LHS, DCI, Subtarget);
+ }
+
+ /// PerformSUBCombine - Target-specific DAG combine for ISD::SUB.
+ ///
+ /// The only fold attempted here pushes the subtraction into the right-hand
+ /// operand, and only when that operand has a single use.
+ static SDValue PerformSUBCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+   const SDValue Minuend = N->getOperand(0);
+   const SDValue Subtrahend = N->getOperand(1);
+
+   if (!Subtrahend.getNode()->hasOneUse())
+     return SDValue();
+
+   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+   SDValue Folded = combineSelectAndUse(N, Subtrahend, Minuend, DCI);
+   if (Folded.getNode())
+     return Folded;
+
+   return SDValue();
+ }
+
+ /// PerformVMULCombine
+ /// Rewrite (A + B) * C as (A * C) + (B * C) (likewise for SUB/FADD/FSUB) so
+ /// that the special multiplier accumulator forwarding can be used:
+ ///   vmul d3, d0, d2
+ ///   vmla d3, d1, d2
+ /// is faster than
+ ///   vadd d3, d0, d1
+ ///   vmul d3, d3, d2
+ // The rewrite is skipped for (A + B) * (A + B), because
+ //   vadd d2, d0, d1
+ //   vmul d3, d0, d2
+ //   vmla d3, d1, d2
+ // is slower than
+ //   vadd d2, d0, d1
+ //   vmul d3, d2, d2
+ static SDValue PerformVMULCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const ARMSubtarget *Subtarget) {
+   if (!Subtarget->hasVMLxForwarding())
+     return SDValue();
+
+   // Opcodes whose operands the multiply may be distributed over.
+   auto IsDistributable = [](unsigned Opc) {
+     return Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::FADD ||
+            Opc == ISD::FSUB;
+   };
+
+   // Arrange for Sum to be the add/sub operand and Factor the other one. The
+   // first operand wins when both qualify.
+   SDValue Sum = N->getOperand(0);
+   SDValue Factor = N->getOperand(1);
+   if (!IsDistributable(Sum.getOpcode())) {
+     if (!IsDistributable(Factor.getOpcode()))
+       return SDValue();
+     std::swap(Sum, Factor);
+   }
+
+   // Squaring the same value: leave it alone (see the note above).
+   if (Sum == Factor)
+     return SDValue();
+
+   SelectionDAG &DAG = DCI.DAG;
+   const unsigned Opcode = Sum.getOpcode();
+   const EVT VT = N->getValueType(0);
+   const SDLoc DL(N);
+   const SDValue SumLHS = Sum->getOperand(0);
+   const SDValue SumRHS = Sum->getOperand(1);
+   return DAG.getNode(Opcode, DL, VT,
+                      DAG.getNode(ISD::MUL, DL, VT, SumLHS, Factor),
+                      DAG.getNode(ISD::MUL, DL, VT, SumRHS, Factor));
+ }
+
+ /// PerformMULCombine - Target-specific DAG combine for ISD::MUL.
+ ///
+ /// 64/128-bit vector multiplies are handed to PerformVMULCombine. An i32
+ /// multiply by a constant of the form +/-(2^N +/- 1) * 2^M is rewritten as a
+ /// shift plus an add/sub (followed by a final shift by M). On success the
+ /// node is replaced through DCI.CombineTo and an empty SDValue is returned.
+ static SDValue PerformMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+   if (Subtarget->isThumb1Only())
+     return SDValue();
+
+   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+     return SDValue();
+
+   const EVT VT = N->getValueType(0);
+   if (VT.is64BitVector() || VT.is128BitVector())
+     return PerformVMULCombine(N, DCI, Subtarget);
+   if (VT != MVT::i32)
+     return SDValue();
+
+   auto *Multiplier = dyn_cast<ConstantSDNode>(N->getOperand(1));
+   if (!Multiplier)
+     return SDValue();
+
+   // Split the constant into an odd part (MulAmt) and a power of two
+   // (1 << ShiftAmt); the shift amount is kept within 0..31.
+   int64_t MulAmt = Multiplier->getSExtValue();
+   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
+   ShiftAmt &= (32 - 1);
+   MulAmt >>= ShiftAmt;
+
+   SelectionDAG &DAG = DCI.DAG;
+   const SDLoc DL(N);
+   const SDValue X = N->getOperand(0);
+
+   // Builds (shl x, Amt).
+   auto ShlX = [&](unsigned Amt) {
+     return DAG.getNode(ISD::SHL, DL, VT, X,
+                        DAG.getConstant(Amt, DL, MVT::i32));
+   };
+
+   SDValue Res;
+   if (MulAmt < 0) {
+     const uint64_t MulAmtAbs = -MulAmt;
+     if (isPowerOf2_32(MulAmtAbs + 1)) {
+       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+       Res = DAG.getNode(ISD::SUB, DL, VT, X, ShlX(Log2_32(MulAmtAbs + 1)));
+     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
+       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+       Res = DAG.getNode(ISD::ADD, DL, VT, X, ShlX(Log2_32(MulAmtAbs - 1)));
+       Res = DAG.getNode(ISD::SUB, DL, VT,
+                         DAG.getConstant(0, DL, MVT::i32), Res);
+     } else {
+       return SDValue();
+     }
+   } else {
+     if (isPowerOf2_32(MulAmt - 1)) {
+       // (mul x, 2^N + 1) => (add (shl x, N), x)
+       Res = DAG.getNode(ISD::ADD, DL, VT, X, ShlX(Log2_32(MulAmt - 1)));
+     } else if (isPowerOf2_32(MulAmt + 1)) {
+       // (mul x, 2^N - 1) => (sub (shl x, N), x)
+       Res = DAG.getNode(ISD::SUB, DL, VT, ShlX(Log2_32(MulAmt + 1)), X);
+     } else {
+       return SDValue();
+     }
+   }
+
+   // Re-apply the power-of-two factor that was peeled off above.
+   if (ShiftAmt != 0)
+     Res = DAG.getNode(ISD::SHL, DL, VT,
+                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
+
+   // Do not add new nodes to DAG combiner worklist.
+   DCI.CombineTo(N, Res, false);
+   return SDValue();
+ }
+
+ /// PerformANDCombine - Target-specific DAG combine for ISD::AND.
+ ///
+ /// Tries the immediate form of VBIC when the second operand is a constant
+ /// splat, and otherwise (outside Thumb1-only) folds the AND into a select.
+ static SDValue PerformANDCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+   SelectionDAG &DAG = DCI.DAG;
+   const EVT VT = N->getValueType(0);
+
+   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+     return SDValue();
+
+   // Attempt to use immediate-form VBIC
+   if (auto *Splat = dyn_cast<BuildVectorSDNode>(N->getOperand(1))) {
+     APInt SplatBits, SplatUndef;
+     unsigned SplatBitSize;
+     bool HasAnyUndefs;
+     if (Splat->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                HasAnyUndefs) &&
+         SplatBitSize <= 64) {
+       SDLoc dl(N);
+       EVT VbicVT;
+       // VBIC clears the bits set in its immediate, hence the inverted splat.
+       SDValue Imm = isNEONModifiedImm((~SplatBits).getZExtValue(),
+                                       SplatUndef.getZExtValue(), SplatBitSize,
+                                       DAG, dl, VbicVT, VT.is128BitVector(),
+                                       OtherModImm);
+       if (Imm.getNode()) {
+         SDValue Input =
+             DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
+         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Imm);
+         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
+       }
+     }
+   }
+
+   if (Subtarget->isThumb1Only())
+     return SDValue();
+
+   // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
+   SDValue Folded = combineSelectAndUseCommutative(N, true, DCI);
+   if (Folded.getNode())
+     return Folded;
+
+   return SDValue();
+ }
+
+ /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
+ ///
+ /// Patterns tried, in order: immediate-form VORR for a constant-splat second
+ /// operand, folding the OR into a select, VBSL for
+ /// (or (and B, A), (and C, ~A)) with constant A, and finally several BFI
+ /// (bitfield insert) forms on i32. The BFI forms replace N via
+ /// DCI.CombineTo and return an empty SDValue; the others return the new node.
+ static SDValue PerformORCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Attempt to use immediate-form VORR
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SelectionDAG &DAG = DCI.DAG;
+
+ if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (BVN && Subtarget->hasNEON() &&
+ BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+ if (SplatBitSize <= 64) {
+ EVT VorrVT;
+ SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+ SplatUndef.getZExtValue(), SplatBitSize,
+ DAG, dl, VorrVT, VT.is128BitVector(),
+ OtherModImm);
+ if (Val.getNode()) {
+ SDValue Input =
+ DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
+ SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
+ }
+ }
+ }
+
+ if (!Subtarget->isThumb1Only()) {
+ // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
+ if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
+ return Result;
+ }
+
+ // The code below optimizes (or (and X, Y), Z).
+ // The AND operand needs to have a single user to make these optimizations
+ // profitable.
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+ return SDValue();
+ SDValue N1 = N->getOperand(1);
+
+ // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+ if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ APInt SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+
+ APInt SplatBits0, SplatBits1;
+ BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+ BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+ // Ensure that the second operand of both ands are constants
+ // (constant splats without any undef elements).
+ if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+ HasAnyUndefs) && !HasAnyUndefs) {
+ if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+ HasAnyUndefs) && !HasAnyUndefs) {
+ // Ensure that the bit width of the constants are the same and that
+ // the splat arguments are logical inverses as per the pattern we
+ // are trying to simplify.
+ if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
+ SplatBits0 == ~SplatBits1) {
+ // Canonicalize the vector type to make instruction selection
+ // simpler.
+ EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+ SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+ N0->getOperand(1),
+ N0->getOperand(0),
+ N1->getOperand(0));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ }
+ }
+ }
+ }
+
+ // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
+ // reasonable.
+
+ // BFI is only available on V6T2+
+ if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
+ return SDValue();
+
+ SDLoc DL(N);
+ // 1) or (and A, mask), val => ARMbfi A, val, mask
+ // iff (val & ~mask) == val
+ //
+ // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+ // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
+ // && mask == ~mask2
+ // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
+ // && ~mask == mask2
+ // (i.e., copy a bitfield value into another bitfield of the same width)
+
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+
+ // The value and the mask need to be constants so we can verify this is
+ // actually a bitfield set. If the mask is 0xffff, we can do better
+ // via a movt instruction, so don't use BFI in that case.
+ SDValue MaskOp = N0.getOperand(1);
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
+ if (!MaskC)
+ return SDValue();
+ unsigned Mask = MaskC->getZExtValue();
+ if (Mask == 0xffff)
+ return SDValue();
+ SDValue Res;
+ // Case (1): or (and A, mask), val => ARMbfi A, val, mask
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N1C) {
+ unsigned Val = N1C->getZExtValue();
+ // Every set bit of val must fall inside the bits that mask clears.
+ if ((Val & ~Mask) != Val)
+ return SDValue();
+
+ if (ARM::isBitFieldInvertedMask(Mask)) {
+ // Shift val down so that it starts at bit 0 of the inserted field.
+ Val >>= countTrailingZeros(~Mask);
+
+ Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
+ DAG.getConstant(Val, DL, MVT::i32),
+ DAG.getConstant(Mask, DL, MVT::i32));
+
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, Res, false);
+ return SDValue();
+ }
+ } else if (N1.getOpcode() == ISD::AND) {
+ // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+ ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+ if (!N11C)
+ return SDValue();
+ unsigned Mask2 = N11C->getZExtValue();
+
+ // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
+ // as is to match.
+ if (ARM::isBitFieldInvertedMask(Mask) &&
+ (Mask == ~Mask2)) {
+ // The pack halfword instruction works better for masks that fit it,
+ // so use that when it's available.
+ if (Subtarget->hasT2ExtractPack() &&
+ (Mask == 0xffff || Mask == 0xffff0000))
+ return SDValue();
+ // 2a
+ unsigned amt = countTrailingZeros(Mask2);
+ Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
+ DAG.getConstant(amt, DL, MVT::i32));
+ Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
+ DAG.getConstant(Mask, DL, MVT::i32));
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, Res, false);
+ return SDValue();
+ } else if (ARM::isBitFieldInvertedMask(~Mask) &&
+ (~Mask == Mask2)) {
+ // The pack halfword instruction works better for masks that fit it,
+ // so use that when it's available.
+ if (Subtarget->hasT2ExtractPack() &&
+ (Mask2 == 0xffff || Mask2 == 0xffff0000))
+ return SDValue();
+ // 2b
+ unsigned lsb = countTrailingZeros(Mask);
+ Res = DAG.getNode(ISD::SRL, DL, VT, N00,
+ DAG.getConstant(lsb, DL, MVT::i32));
+ Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
+ DAG.getConstant(Mask2, DL, MVT::i32));
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, Res, false);
+ return SDValue();
+ }
+ }
+
+ // Reached when none of the forms above replaced N (including case (1) with
+ // a mask that is not a bitfield-inverted mask).
+ if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
+ N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
+ ARM::isBitFieldInvertedMask(~Mask)) {
+ // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
+ // where lsb(mask) == #shamt and masked bits of B are known zero.
+ SDValue ShAmt = N00.getOperand(1);
+ unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+ unsigned LSB = countTrailingZeros(Mask);
+ if (ShAmtC != LSB)
+ return SDValue();
+
+ Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
+ DAG.getConstant(~Mask, DL, MVT::i32));
+
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, Res, false);
+ }
+
+ // An empty SDValue is returned both when nothing matched and after case (3)
+ // has already replaced N through CombineTo.
+ return SDValue();
+ }
+
+ /// PerformXORCombine - Target-specific DAG combine for ISD::XOR.
+ ///
+ /// Only folds the XOR into a select, and only for legal result types on
+ /// subtargets that are not Thumb1-only.
+ static SDValue PerformXORCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+   SelectionDAG &DAG = DCI.DAG;
+   const EVT VT = N->getValueType(0);
+
+   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) ||
+       Subtarget->isThumb1Only())
+     return SDValue();
+
+   // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
+   SDValue Folded = combineSelectAndUseCommutative(N, false, DCI);
+   if (Folded.getNode())
+     return Folded;
+
+   return SDValue();
+ }
+
+ // ParseBFI - decompose the BFI node N. Returns the "from" value (Rn) and sets
+ // ToMask to the bits of the destination (Rd) that N writes and FromMask to
+ // the (consecutive) bits of the returned value that are copied into them.
+ static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
+   assert(N->getOpcode() == ARMISD::BFI);
+
+   // Operand 2 is the inverted destination mask; the source bits start out as
+   // the same number of bits counted from bit 0.
+   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
+   FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
+
+   SDValue Source = N->getOperand(1);
+   const bool IsConstantSRL = Source->getOpcode() == ISD::SRL &&
+                              isa<ConstantSDNode>(Source->getOperand(1));
+   if (!IsConstantSRL)
+     return Source;
+
+   // The source is (srl X, #C): look through the shift and describe the field
+   // in terms of X, i.e. C bits further up.
+   APInt Shift = cast<ConstantSDNode>(Source->getOperand(1))->getAPIntValue();
+   assert(Shift.getLimitedValue() < 32 && "Shift too large!");
+   FromMask <<= Shift.getLimitedValue(31);
+   return Source->getOperand(0);
+ }
+
+ // Given masks A and B that each hold one contiguous run of set bits, report
+ // whether B's run ends directly below the start of A's run, so that A | B is
+ // the concatenation A . B.
+ //
+ // Neither A nor B must be zero.
+ static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
+   const unsigned LowestSetBitOfA = A.countTrailingZeros();
+   const unsigned HighestSetBitOfB =
+       B.getBitWidth() - B.countLeadingZeros() - 1;
+   // Unsigned wrap-around keeps this identical to comparing
+   // LowestSetBitOfA - 1 against HighestSetBitOfB.
+   return HighestSetBitOfB + 1 == LowestSetBitOfA;
+ }
+
+ // Starting from the BFI node N, walk up the chain of BFIs feeding its
+ // destination operand and return the first one that N can be merged with:
+ // same source value, and destination/source bit ranges that sit directly
+ // next to N's. Returns an empty SDValue when there is none.
+ static SDValue FindBFIToCombineWith(SDNode *N) {
+   APInt ToMask, FromMask;
+   const SDValue From = ParseBFI(N, ToMask, FromMask);
+
+   // Destination bits written by N and by every BFI walked past so far. A BFI
+   // with a different source may be skipped, but a candidate that writes any
+   // of these bits cannot be hoisted over the nodes in between.
+   APInt SeenToBits = ToMask;
+
+   for (SDValue Cur = N->getOperand(0); Cur.getOpcode() == ARMISD::BFI;
+        Cur = Cur.getOperand(0)) {
+     APInt CurToMask, CurFromMask;
+     const SDValue CurFrom = ParseBFI(Cur.getNode(), CurToMask, CurFromMask);
+
+     if (CurFrom == From) {
+       // Conflicting bits - bail out because going further is unsafe.
+       if ((CurToMask & SeenToBits).getBoolValue())
+         return SDValue();
+
+       // Mergeable when both the destination and the source ranges are
+       // adjacent, with Cur's range either below or above N's.
+       if ((BitsProperlyConcatenate(ToMask, CurToMask) &&
+            BitsProperlyConcatenate(FromMask, CurFromMask)) ||
+           (BitsProperlyConcatenate(CurToMask, ToMask) &&
+            BitsProperlyConcatenate(CurFromMask, FromMask)))
+         return Cur;
+     }
+
+     // Remember what this BFI wrote and keep climbing.
+     SeenToBits |= CurToMask;
+   }
+
+   return SDValue();
+ }
+
+ /// PerformBFICombine - Target-specific DAG combine for ARMISD::BFI.
+ ///
+ /// Handles two shapes: a BFI whose inserted value is masked by a redundant
+ /// AND, and a BFI stacked on other BFIs where two of them insert
+ /// neighbouring bits of the same source and can be merged into one.
+ static SDValue PerformBFICombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+   SelectionDAG &DAG = DCI.DAG;
+   const SDValue Inserted = N->getOperand(1);
+
+   if (Inserted.getOpcode() == ISD::AND) {
+     // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
+     // the bits being cleared by the AND are not demanded by the BFI.
+     auto *AndMaskC = dyn_cast<ConstantSDNode>(Inserted.getOperand(1));
+     if (!AndMaskC)
+       return SDValue();
+
+     // Width of the inserted field, derived from the inverted BFI mask.
+     const unsigned InvMask =
+         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+     const unsigned LSB = countTrailingZeros(~InvMask);
+     const unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
+     assert(Width <
+                static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+            "undefined behavior");
+     const unsigned DemandedBits = (1u << Width) - 1;
+     const unsigned AndMask = AndMaskC->getZExtValue();
+
+     // The AND is only droppable when it keeps every demanded bit.
+     if ((DemandedBits & ~AndMask) != 0)
+       return SDValue();
+     return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
+                        N->getOperand(0), Inserted.getOperand(0),
+                        N->getOperand(2));
+   }
+
+   if (N->getOperand(0).getOpcode() != ARMISD::BFI)
+     return SDValue();
+
+   // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
+   // Keep track of any consecutive bits set that all come from the same base
+   // value. We can combine these together into a single BFI.
+   SDValue Partner = FindBFIToCombineWith(N);
+   if (!Partner.getNode())
+     return SDValue();
+
+   APInt ToMask1, FromMask1;
+   SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
+
+   APInt ToMask2, FromMask2;
+   SDValue From2 = ParseBFI(Partner.getNode(), ToMask2, FromMask2);
+   assert(From1 == From2);
+   (void)From2;
+
+   // First, unlink the partner BFI from the chain. This has to happen before
+   // N's destination operand is read again below.
+   DAG.ReplaceAllUsesWith(Partner, Partner.getOperand(0));
+
+   // Then create a new BFI, combining the two together.
+   const APInt NewFromMask = FromMask1 | FromMask2;
+   const APInt NewToMask = ToMask1 | ToMask2;
+
+   const EVT VT = N->getValueType(0);
+   SDLoc dl(N);
+
+   // BFI takes its field from bit 0 of the source, so shift the source down
+   // when the merged field does not start there.
+   if (!NewFromMask[0])
+     From1 = DAG.getNode(
+         ISD::SRL, dl, VT, From1,
+         DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
+   return DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
+                      DAG.getConstant(~NewToMask, dl, VT));
+ }
+
+ /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
+ /// ARMISD::VMOVRRD.
+ ///
+ /// Two folds are performed:
+ ///   * vmovrrd(vmovdrr x, y) -> x, y (not when the subtarget's FPU is
+ ///     single-precision only), and
+ ///   * vmovrrd(load f64) -> two i32 loads, for a single-use, non-volatile,
+ ///     normal load whose address is a frame index.
+ static SDValue PerformVMOVRRDCombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const ARMSubtarget *Subtarget) {
+   // vmovrrd(vmovdrr x, y) -> x,y
+   SDValue InDouble = N->getOperand(0);
+   if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
+     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+
+   // vmovrrd(load f64) -> (load i32), (load i32)
+   SDNode *InNode = InDouble.getNode();
+   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
+       InNode->getValueType(0) == MVT::f64 &&
+       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
+       !cast<LoadSDNode>(InNode)->isVolatile()) {
+     // TODO: Should this be done for non-FrameIndex operands?
+     LoadSDNode *LD = cast<LoadSDNode>(InNode);
+
+     SelectionDAG &DAG = DCI.DAG;
+     SDLoc DL(LD);
+     SDValue BasePtr = LD->getBasePtr();
+     SDValue NewLD1 =
+         DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
+                     LD->getAlignment(), LD->getMemOperand()->getFlags());
+
+     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                     DAG.getConstant(4, DL, MVT::i32));
+     // The second word lives 4 bytes past the first: describe that in the
+     // pointer info so the memory operand does not claim the first word's
+     // location. Its alignment is the original one capped at 4; halving the
+     // original alignment would turn a 1-byte alignment into 0 and needlessly
+     // pessimize a 4-byte-aligned load.
+     SDValue NewLD2 = DAG.getLoad(
+         MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
+         LD->getPointerInfo().getWithOffset(4),
+         std::min(4U, LD->getAlignment()), LD->getMemOperand()->getFlags());
+
+     // Users of the old load's chain now depend on the last new load.
+     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
+     // On big-endian targets the two result words are exchanged.
+     if (DCI.DAG.getDataLayout().isBigEndian())
+       std::swap (NewLD1, NewLD2);
+     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
+     return Result;
+   }
+
+   return SDValue();
+ }
+
+ /// PerformVMOVDRRCombine - Target-specific DAG combine for ARMISD::VMOVDRR.
+ /// PerformBUILD_VECTORCombine also calls it for two-operand BUILD_VECTORs.
+ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
+   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
+
+   // Look through at most one bitcast on each operand.
+   auto StripBitcast = [](SDValue V) -> SDValue {
+     if (V.getOpcode() == ISD::BITCAST)
+       return V.getOperand(0);
+     return V;
+   };
+   const SDValue First = StripBitcast(N->getOperand(0));
+   const SDValue Second = StripBitcast(N->getOperand(1));
+
+   // Both operands must be results 0 and 1, in that order, of one VMOVRRD.
+   const bool RebuildsVMOVRRDInput =
+       First.getOpcode() == ARMISD::VMOVRRD &&
+       First.getNode() == Second.getNode() &&
+       First.getResNo() == 0 && Second.getResNo() == 1;
+   if (!RebuildsVMOVRRDInput)
+     return SDValue();
+
+   return DAG.getNode(ISD::BITCAST, SDLoc(N),
+                      N->getValueType(0), First.getOperand(0));
+ }
+
+ /// hasNormalLoadOperand - Return true if at least one of the vector-element
+ /// operands of the BUILD_VECTOR node N is a normal, non-volatile load. In
+ /// that case it pays off to bitcast an i64 vector to f64 elements, because
+ /// the value can then be loaded directly into a VFP register.
+ static bool hasNormalLoadOperand(SDNode *N) {
+   const unsigned EltCount = N->getValueType(0).getVectorNumElements();
+   unsigned Idx = 0;
+   while (Idx != EltCount) {
+     SDNode *Operand = N->getOperand(Idx).getNode();
+     ++Idx;
+     if (!ISD::isNormalLoad(Operand))
+       continue;
+     if (!cast<LoadSDNode>(Operand)->isVolatile())
+       return true;
+   }
+   return false;
+ }
+
+ /// PerformBUILD_VECTORCombine - Target-specific DAG combine for
+ /// ISD::BUILD_VECTOR.
+ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           const ARMSubtarget *Subtarget) {
+   SelectionDAG &DAG = DCI.DAG;
+
+   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
+   // VMOVRRD appears when i64 types are legalized and forces the i64 value
+   // into a pair of GPRs. That is fine for scalar uses, but when the i64 ends
+   // up in a vector the VMOVRRD has to be undone.
+   if (N->getNumOperands() == 2) {
+     SDValue Undone = PerformVMOVDRRCombine(N, DAG);
+     if (Undone.getNode())
+       return Undone;
+   }
+
+   // Load i64 elements as f64 values so that type legalization does not split
+   // them up into i32 values.
+   const EVT VT = N->getValueType(0);
+   if (VT.getVectorElementType() != MVT::i64)
+     return SDValue();
+   if (!hasNormalLoadOperand(N))
+     return SDValue();
+
+   SDLoc dl(N);
+   const unsigned EltCount = VT.getVectorNumElements();
+   SmallVector<SDValue, 8> F64Elts;
+   for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
+     SDValue Cast = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(Idx));
+     F64Elts.push_back(Cast);
+     // Make the DAGCombiner fold the bitcast.
+     DCI.AddToWorklist(Cast.getNode());
+   }
+
+   const EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, EltCount);
+   SDValue FloatVec = DAG.getBuildVector(FloatVT, dl, F64Elts);
+   return DAG.getNode(ISD::BITCAST, dl, VT, FloatVec);
+ }
+
+ /// \brief Target-specific DAG combine for ARMISD::BUILD_VECTOR.
+ ///
+ /// ARMISD::BUILD_VECTOR is created while legalizing ISD::BUILD_VECTOR, and
+ /// integer-to-float bitcasts may have been inserted at that point. When such
+ /// bitcasts are still around, rebuild the vector as a chain of i32
+ /// INSERT_VECTOR_ELTs so that the lowering is not forced through floating
+ /// point types.
+ static SDValue
+ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+   // Changing the vector's type is possible iff:
+   // 1. The vector is only used in a bitcast to an integer type, i.e.
+   //    1.1. it has exactly one use, and
+   //    1.2. that use is a bit convert to an integer type.
+   // 2. Its elements are 32 bits wide (64-bit elements are not legal).
+   const EVT VT = N->getValueType(0);
+   const EVT EltVT = VT.getVectorElementType();
+
+   // Conditions 1.1 and 2.
+   if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
+     return SDValue();
+
+   // By construction, the input type must be float.
+   assert(EltVT == MVT::f32 && "Unexpected type!");
+
+   // Condition 1.2.
+   SDNode *OnlyUser = *N->use_begin();
+   if (OnlyUser->getOpcode() != ISD::BITCAST ||
+       OnlyUser->getValueType(0).isFloatingPoint())
+     return SDValue();
+
+   // Profitability: the rewrite is done when more than half of the relevant
+   // operands are bitcasts from i32. Operands that are cast statically (undef
+   // and constants) are not relevant.
+   const unsigned NumElts = VT.getVectorNumElements();
+   unsigned BitcastFromI32 = 0;
+   unsigned RelevantElts = NumElts;
+   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+     const SDValue Elt = N->getOperand(Idx);
+     if (Elt->getOpcode() != ISD::BITCAST) {
+       if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
+         --RelevantElts;
+       continue;
+     }
+     // Assume only bit cast to i32 will go away.
+     if (Elt->getOperand(0).getValueType() == MVT::i32)
+       ++BitcastFromI32;
+   }
+
+   if (BitcastFromI32 <= RelevantElts / 2)
+     return SDValue();
+
+   SelectionDAG &DAG = DCI.DAG;
+   // The integer vector type has to be legal.
+   const EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+   if (!TLI.isTypeLegal(VecVT))
+     return SDValue();
+
+   // Combine:
+   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
+   // => BITCAST INSERT_VECTOR_ELT
+   //      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
+   //      (BITCAST EN), N.
+   SDValue Vec = DAG.getUNDEF(VecVT);
+   SDLoc dl(N);
+   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+     SDValue Lane = N->getOperand(Idx);
+     // Undef lanes stay undef in the new vector.
+     if (Lane.isUndef())
+       continue;
+
+     const bool IsCastFromI32 =
+         Lane.getOpcode() == ISD::BITCAST &&
+         Lane->getOperand(0).getValueType() == MVT::i32;
+     if (IsCastFromI32) {
+       // Fold obvious case.
+       Lane = Lane.getOperand(0);
+     } else {
+       Lane = DAG.getNode(ISD::BITCAST, SDLoc(Lane), MVT::i32, Lane);
+       // Make the DAGCombiner fold the bitcasts.
+       DCI.AddToWorklist(Lane.getNode());
+     }
+
+     SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
+     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, Lane, LaneIdx);
+   }
+
+   Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+   // Make the DAGCombiner fold the bitcasts.
+   DCI.AddToWorklist(Vec.getNode());
+   return Vec;
+ }
+
+ /// PerformInsertEltCombine - Target-specific DAG combine for
+ /// ISD::INSERT_VECTOR_ELT.
+ ///
+ /// An i64 element that comes from a normal, non-volatile load is inserted as
+ /// f64 instead; otherwise the i64 value would be legalized to a pair of i32
+ /// values.
+ static SDValue PerformInsertEltCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+   const EVT VT = N->getValueType(0);
+   if (VT.getVectorElementType() != MVT::i64)
+     return SDValue();
+
+   SDNode *Inserted = N->getOperand(1).getNode();
+   if (!ISD::isNormalLoad(Inserted) || cast<LoadSDNode>(Inserted)->isVolatile())
+     return SDValue();
+
+   SelectionDAG &DAG = DCI.DAG;
+   SDLoc dl(N);
+   const EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                        VT.getVectorNumElements());
+
+   // View both the vector and the new element as f64.
+   SDValue FloatVec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
+   SDValue FloatElt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
+   // Make the DAGCombiner fold the bitcasts.
+   DCI.AddToWorklist(FloatVec.getNode());
+   DCI.AddToWorklist(FloatElt.getNode());
+
+   SDValue Inserted64 = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
+                                    FloatVec, FloatElt, N->getOperand(2));
+   return DAG.getNode(ISD::BITCAST, dl, VT, Inserted64);
+ }
+
+/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
+/// ISD::VECTOR_SHUFFLE.
+///
+/// Rewrites
+///   shuffle(concat(v1, undef), concat(v2, undef))
+/// into
+///   shuffle(concat(v1, v2), undef)
+/// and returns the new shuffle, or an empty SDValue if the pattern does not
+/// match or one of the involved types is not legal.
+static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
+  // ISD::VECTOR_SHUFFLE requires its operands to be as long as the mask, so
+  // shorter shufflevector operands get padded by concatenating them with
+  // undef vectors. For NEON it is better to concatenate the two
+  // double-register operands into one quad-register vector instead.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  const bool BothConcats = LHS.getOpcode() == ISD::CONCAT_VECTORS &&
+                           RHS.getOpcode() == ISD::CONCAT_VECTORS;
+  if (!BothConcats)
+    return SDValue();
+  if (LHS.getNumOperands() != 2 || RHS.getNumOperands() != 2)
+    return SDValue();
+
+  // The upper half of each concatenation has to be undef padding.
+  SDValue LHSHigh = LHS.getOperand(1);
+  SDValue RHSHigh = RHS.getOperand(1);
+  if (!(LHSHigh.isUndef() && RHSHigh.isUndef()))
+    return SDValue();
+
+  // Skip the transformation if any of the types are illegal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = N->getValueType(0);
+  const bool AllLegal = TLI.isTypeLegal(VT) &&
+                        TLI.isTypeLegal(LHSHigh.getValueType()) &&
+                        TLI.isTypeLegal(RHSHigh.getValueType());
+  if (!AllLegal)
+    return SDValue();
+
+  SDLoc Loc(N);
+  SDValue Merged = DAG.getNode(ISD::CONCAT_VECTORS, Loc, VT,
+                               LHS.getOperand(0), RHS.getOperand(0));
+
+  // Translate the shuffle mask: lanes taken from the low half of the first
+  // input keep their index, lanes taken from the low half of the second input
+  // move into the upper half of the merged vector, everything else is undef.
+  const int NumElts = VT.getVectorNumElements();
+  const int HalfElts = NumElts / 2;
+  auto RemapLane = [NumElts, HalfElts](int Src) -> int {
+    if (Src < HalfElts)
+      return Src;
+    if (Src >= NumElts && Src < NumElts + HalfElts)
+      return HalfElts + Src - NumElts;
+    return -1;
+  };
+
+  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N);
+  SmallVector<int, 16> Remapped;
+  for (int Lane = 0; Lane < NumElts; ++Lane)
+    Remapped.push_back(RemapLane(Shuffle->getMaskElt(Lane)));
+
+  return DAG.getVectorShuffle(VT, Loc, Merged, DAG.getUNDEF(VT), Remapped);
+}
+
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
+/// NEON load/store intrinsics, and generic vector load/stores, to merge
+/// base address updates.
+/// For generic load/stores, the memory type is assumed to be a vector.
+/// The caller is assumed to have checked legality.
+///
+/// The users of N's address operand are scanned for an ISD::ADD that is
+/// neither a predecessor nor a successor of N. For the first such ADD whose
+/// increment is acceptable, N is replaced (through DCI.CombineTo) by the
+/// matching ARMISD::*_UPD node and the ADD by that node's i32 result. The
+/// function itself always returns an empty SDValue.
+static SDValue CombineBaseUpdate(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+ N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+ const bool isStore = N->getOpcode() == ISD::STORE;
+ // Intrinsics keep the intrinsic ID in operand 1 and stores keep the stored
+ // value there, so for both the address is operand 2; otherwise it is
+ // operand 1.
+ const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
+ SDValue Addr = N->getOperand(AddrOpIdx);
+ MemSDNode *MemN = cast<MemSDNode>(N);
+ SDLoc dl(N);
+
+ // Search for a use of the address operand that is an increment.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::ADD ||
+ UI.getUse().getResNo() != Addr.getResNo())
+ continue;
+
+ // Check that the add is independent of the load/store. Otherwise, folding
+ // it would create a cycle.
+ if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+ continue;
+
+ // Find the new opcode for the updating load/store.
+ bool isLoadOp = true;
+ bool isLaneOp = false;
+ unsigned NewOpc = 0;
+ unsigned NumVecs = 0;
+ if (isIntrinsic) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: llvm_unreachable("unexpected intrinsic for Neon base update");
+ case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
+ NumVecs = 1; break;
+ case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
+ NumVecs = 2; break;
+ case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
+ NumVecs = 3; break;
+ case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
+ NumVecs = 4; break;
+ case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+ NumVecs = 2; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+ NumVecs = 3; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+ NumVecs = 4; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
+ NumVecs = 1; isLoadOp = false; break;
+ case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
+ NumVecs = 2; isLoadOp = false; break;
+ case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
+ NumVecs = 3; isLoadOp = false; break;
+ case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
+ NumVecs = 4; isLoadOp = false; break;
+ case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+ NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+ NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+ NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
+ }
+ } else {
+ // Non-intrinsic nodes default to lane ops (the VLDnDUP cases); the generic
+ // LOAD/STORE cases below reset the flag.
+ isLaneOp = true;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("unexpected opcode for Neon base update");
+ case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
+ case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+ case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+ case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+ case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
+ NumVecs = 1; isLaneOp = false; break;
+ case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
+ NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
+ }
+ }
+
+ // Find the size of memory referenced by the load/store.
+ EVT VecTy;
+ if (isLoadOp) {
+ VecTy = N->getValueType(0);
+ } else if (isIntrinsic) {
+ VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+ } else {
+ assert(isStore && "Node has to be a load, a store, or an intrinsic!");
+ VecTy = N->getOperand(1).getValueType();
+ }
+
+ // A lane operation only touches one element of each of the NumVecs vectors.
+ unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+ if (isLaneOp)
+ NumBytes /= VecTy.getVectorNumElements();
+
+ // If the increment is a constant, it must match the memory ref size.
+ SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+ if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+ uint64_t IncVal = CInc->getZExtValue();
+ if (IncVal != NumBytes)
+ continue;
+ } else if (NumBytes >= 3 * 16) {
+ // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+ // separate instructions that make it harder to use a non-constant update.
+ continue;
+ }
+
+ // OK, we found an ADD we can fold into the base update.
+ // Now, create a _UPD node, taking care of not breaking alignment.
+
+ EVT AlignedVecTy = VecTy;
+ unsigned Alignment = MemN->getAlignment();
+
+ // If this is a less-than-standard-aligned load/store, change the type to
+ // match the standard alignment.
+ // The alignment is overlooked when selecting _UPD variants; and it's
+ // easier to introduce bitcasts here than fix that.
+ // There are 3 ways to get to this base-update combine:
+ // - intrinsics: they are assumed to be properly aligned (to the standard
+ // alignment of the memory type), so we don't need to do anything.
+ // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+ // intrinsics, so, likewise, there's nothing to do.
+ // - generic load/store instructions: the alignment is specified as an
+ // explicit operand, rather than implicitly as the standard alignment
+ // of the memory type (like the intrinsics). We need to change the
+ // memory type to match the explicit alignment. That way, we don't
+ // generate non-standard-aligned ARMISD::VLDx nodes.
+ if (isa<LSBaseSDNode>(N)) {
+ if (Alignment == 0)
+ Alignment = 1;
+ if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+ // Re-express the access as a vector of Alignment-byte integer elements
+ // covering the same NumBytes.
+ MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+ assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+ assert(!isLaneOp && "Unexpected generic load/store lane.");
+ unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+ AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+ }
+ // Don't set an explicit alignment on regular load/stores that we want
+ // to transform to VLD/VST 1_UPD nodes.
+ // This matches the behavior of regular load/stores, which only get an
+ // explicit alignment if the MMO alignment is larger than the standard
+ // alignment of the memory type.
+ // Intrinsics, however, always get an explicit alignment, set to the
+ // alignment of the MMO.
+ Alignment = 1;
+ }
+
+ // Create the new updating load/store node.
+ // First, create an SDVTList for the new updating node's results.
+ // Layout: NumResultVecs vectors (at most 4), then the i32 updated address,
+ // then the chain.
+ EVT Tys[6];
+ unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+ unsigned n;
+ for (n = 0; n < NumResultVecs; ++n)
+ Tys[n] = AlignedVecTy;
+ Tys[n++] = MVT::i32;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
+
+ // Then, gather the new node's operands.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // incoming chain
+ Ops.push_back(N->getOperand(AddrOpIdx));
+ Ops.push_back(Inc);
+
+ if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+ // Try to match the intrinsic's signature
+ Ops.push_back(StN->getValue());
+ } else {
+ // Loads (and of course intrinsics) match the intrinsics' signature,
+ // so just add all but the alignment operand.
+ for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
+ Ops.push_back(N->getOperand(i));
+ }
+
+ // For all node types, the alignment operand is always the last one.
+ Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+
+ // If this is a non-standard-aligned STORE, the penultimate operand is the
+ // stored value. Bitcast it to the aligned type.
+ if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+ SDValue &StVal = Ops[Ops.size()-2];
+ StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+ }
+
+ // The new node reuses N's memory operand; lane ops use a single element as
+ // the memory type.
+ EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
+ MemN->getMemOperand());
+
+ // Update the uses.
+ SmallVector<SDValue, 5> NewResults;
+ for (unsigned i = 0; i < NumResultVecs; ++i)
+ NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+ // If this is a non-standard-aligned LOAD, the first result is the loaded
+ // value. Bitcast it to the expected result type.
+ if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
+ SDValue &LdVal = NewResults[0];
+ LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
+ }
+
+ // N's results map onto the vector results plus the chain; the ADD is
+ // replaced by the i32 result that sits between them.
+ NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+ DCI.CombineTo(N, NewResults);
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+ // Only the first foldable ADD is combined.
+ break;
+ }
+ return SDValue();
+}
+
+/// PerformVLDCombine - Forward N to CombineBaseUpdate, but only when the
+/// combiner is running after legalization and was not invoked by the
+/// legalizer; otherwise return an empty SDValue.
+static SDValue PerformVLDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (!DCI.isBeforeLegalize() && !DCI.isCalledByLegalizer())
+    return CombineBaseUpdate(N, DCI);
+
+  return SDValue();
+}
+
+/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
+/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
+/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
+/// return true.
+///
+/// Returns false, without modifying the DAG, when N's type is not a 64-bit
+/// vector, when the source is not one of the vld2lane/vld3lane/vld4lane
+/// intrinsics, or when some non-chain user is not a VDUPLANE of the loaded
+/// lane.
+static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ // vldN-dup instructions only support 64-bit vectors for N > 1.
+ if (!VT.is64BitVector())
+ return false;
+
+ // Check if the VDUPLANE operand is a vldN-lane intrinsic.
+ SDNode *VLD = N->getOperand(0).getNode();
+ if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+ unsigned NumVecs = 0;
+ unsigned NewOpc = 0;
+ unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+ if (IntNo == Intrinsic::arm_neon_vld2lane) {
+ NumVecs = 2;
+ NewOpc = ARMISD::VLD2DUP;
+ } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+ NumVecs = 3;
+ NewOpc = ARMISD::VLD3DUP;
+ } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+ NumVecs = 4;
+ NewOpc = ARMISD::VLD4DUP;
+ } else {
+ return false;
+ }
+
+ // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+ // numbers match the load.
+ // The lane number of the load is read from operand NumVecs+3.
+ unsigned VLDLaneNo =
+ cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
+ for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+ UI != UE; ++UI) {
+ // Ignore uses of the chain result.
+ if (UI.getUse().getResNo() == NumVecs)
+ continue;
+ SDNode *User = *UI;
+ if (User->getOpcode() != ARMISD::VDUPLANE ||
+ VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+ return false;
+ }
+
+ // Create the vldN-dup node.
+ // Result list: NumVecs (at most 4) vectors of type VT followed by the chain.
+ EVT Tys[5];
+ unsigned n;
+ for (n = 0; n < NumVecs; ++n)
+ Tys[n] = VT;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
+ // Operands: the chain (operand 0) and operand 2 of the vldN-lane node; the
+ // memory type and memory operand are taken over unchanged.
+ SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+ MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+ SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
+ Ops, VLDMemInt->getMemoryVT(),
+ VLDMemInt->getMemOperand());
+
+ // Update the uses.
+ // Each VDUPLANE user is replaced by the vldN-dup result with the same
+ // result number as the vldN-lane result it consumed.
+ for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+ UI != UE; ++UI) {
+ unsigned ResNo = UI.getUse().getResNo();
+ // Ignore uses of the chain result.
+ if (ResNo == NumVecs)
+ continue;
+ SDNode *User = *UI;
+ DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+ }
+
+ // Now the vldN-lane intrinsic is dead except for its chain result.
+ // Update uses of the chain.
+ std::vector<SDValue> VLDDupResults;
+ for (unsigned n = 0; n < NumVecs; ++n)
+ VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+ VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+ DCI.CombineTo(VLD, VLDDupResults);
+
+ return true;
+}
+
+/// PerformVDUPLANECombine - Target-specific dag combine xforms for
+/// ARMISD::VDUPLANE.
+///
+/// Two rewrites are attempted, in this order:
+///  - fold a vldN-lane source whose vector results are only consumed by
+///    VDUPLANEs into a vldN-dup (see CombineVLDDUP);
+///  - replace a VDUPLANE of a VMOVIMM/VMVNIMM splat by a bitcast of the splat.
+static SDValue PerformVDUPLANECombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Src = N->getOperand(0);
+
+  // vldN-lane + VDUPLANEs -> vldN-dup.
+  if (CombineVLDDUP(N, DCI))
+    return SDValue(N, 0);
+
+  // Look through bit_converts for now; the element sizes are compared below.
+  for (; Src.getOpcode() == ISD::BITCAST; Src = Src.getOperand(0)) {
+  }
+
+  // A VDUPLANE of something that already is a VMOVIMM or VMVNIMM splat is
+  // redundant; anything else is left alone.
+  const unsigned SrcOpc = Src.getOpcode();
+  const bool IsImmSplat =
+      SrcOpc == ARMISD::VMOVIMM || SrcOpc == ARMISD::VMVNIMM;
+  if (!IsImmSplat)
+    return SDValue();
+
+  // The splat element size must not be bigger than the VDUPLANE elements.
+  // The canonical VMOV for a zero vector uses a 32-bit element size, so an
+  // immediate that decodes to zero is treated as having 8-bit elements.
+  const unsigned ModImm =
+      cast<ConstantSDNode>(Src.getOperand(0))->getZExtValue();
+  unsigned DecodedEltBits;
+  const bool IsZeroSplat = ARM_AM::decodeNEONModImm(ModImm, DecodedEltBits) == 0;
+  const unsigned SplatEltSize = IsZeroSplat ? 8 : Src.getScalarValueSizeInBits();
+
+  const EVT ResVT = N->getValueType(0);
+  if (SplatEltSize > ResVT.getScalarSizeInBits())
+    return SDValue();
+
+  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), ResVT, Src);
+}
+
+/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
+///
+/// Matches VDUP(LOAD) -> VLD1DUP when the load is unindexed, has a single use
+/// and loads exactly one element of the VDUP result type. Returns the new
+/// VLD1DUP node, or an empty SDValue when the pattern does not match.
+static SDValue PerformVDUPCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Src = N->getOperand(0);
+
+  // This is matched here rather than at isel time because the transform is
+  // only legal for unindexed loads.
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(Src.getNode());
+  if (!Load || !Src.hasOneUse() || !Load->isUnindexed())
+    return SDValue();
+
+  const EVT DupVT = N->getValueType(0);
+  if (Load->getMemoryVT() != DupVT.getVectorElementType())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc Loc(N);
+
+  // Operands of the VLD1DUP: chain, address and alignment of the load.
+  SDValue AlignOp = DAG.getConstant(Load->getAlignment(), Loc, MVT::i32);
+  SDValue Ops[] = { Load->getOperand(0), Load->getOperand(1), AlignOp };
+  SDVTList ResultTys = DAG.getVTList(DupVT, MVT::Other);
+  SDValue VLDDup =
+      DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, Loc, ResultTys, Ops,
+                              Load->getMemoryVT(), Load->getMemOperand());
+
+  // Users of the load's chain now hang off the chain of the new node.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), VLDDup.getValue(1));
+  return VLDDup;
+}
+
+/// PerformLOADCombine - Target-specific dag combine xforms for ISD::LOAD.
+///
+/// A normal load of a legal vector type is handed to CombineBaseUpdate, which
+/// tries to turn it into a VLD1_UPD; every other load is left untouched.
+static SDValue PerformLOADCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  const EVT LoadedVT = N->getValueType(0);
+
+  if (!ISD::isNormalLoad(N))
+    return SDValue();
+  if (!LoadedVT.isVector())
+    return SDValue();
+
+  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(LoadedVT))
+    return SDValue();
+
+  return CombineBaseUpdate(N, DCI);
+}
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+///
+/// Volatile stores are never touched. Otherwise the following rewrites are
+/// tried in order:
+///  1. a truncating vector store is turned into a shuffle that packs the
+///     truncated elements at the bottom of the register, followed by one or
+///     more stores of the largest legal integer type;
+///  2. a normal store of a single-use ARMISD::VMOVDRR is split into two
+///     stores of its two operands;
+///  3. a normal store of an i64 EXTRACT_VECTOR_ELT is redone through f64;
+///  4. a normal store of a legal vector is handed to CombineBaseUpdate.
+/// Returns the replacement value, or an empty SDValue if nothing was done.
+static SDValue PerformSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  if (St->isVolatile())
+    return SDValue();
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store. First,
+  // pack all of the elements in one place. Next, store to memory in fewer
+  // chunks.
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+  if (St->isTruncatingStore() && VT.isVector()) {
+    SelectionDAG &DAG = DCI.DAG;
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    EVT StVT = St->getMemoryVT();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromEltSz = VT.getScalarSizeInBits();
+    unsigned ToEltSz = StVT.getScalarSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
+      return SDValue();
+
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromEltSz) % ToEltSz)
+      return SDValue();
+
+    unsigned SizeRatio = FromEltSz / ToEltSz;
+    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle.
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+                                     NumElems * SizeRatio);
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    // Can't shuffle using an illegal type. This and the store-type search
+    // below run before any node is created, so a bail-out leaves no dead
+    // nodes behind.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (MVT Tp : MVT::integer_valuetypes()) {
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+        StoreType = Tp;
+    }
+    // Didn't find a legal store type.
+    if (!TLI.isTypeLegal(StoreType))
+      return SDValue();
+
+    SDLoc DL(St);
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; ++i)
+      ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
+                          ? (i + 1) * SizeRatio - 1
+                          : i * SizeRatio;
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
+                                         DAG.getUNDEF(WideVec.getValueType()),
+                                         ShuffleVec);
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(
+        *DAG.getContext(), StoreType,
+        VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    const unsigned StoreBytes = StoreType.getSizeInBits() / 8;
+    SDValue Increment = DAG.getConstant(StoreBytes, DL,
+                                        TLI.getPointerTy(DAG.getDataLayout()));
+    SDValue BasePtr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
+    for (unsigned I = 0; I < E; I++) {
+      // Each chunk lives Offset bytes past the original address, so both the
+      // pointer info and the alignment have to account for that offset.
+      const unsigned Offset = I * StoreBytes;
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(I, DL));
+      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
+                                St->getPointerInfo().getWithOffset(Offset),
+                                MinAlign(St->getAlignment(), Offset),
+                                St->getMemOperand()->getFlags());
+      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+                            Increment);
+      Chains.push_back(Ch);
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  }
+
+  if (!ISD::isNormalStore(St))
+    return SDValue();
+
+  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
+  // ARM stores of arguments in the same cache line.
+  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
+      StVal.getNode()->hasOneUse()) {
+    SelectionDAG &DAG = DCI.DAG;
+    bool isBigEndian = DAG.getDataLayout().isBigEndian();
+    SDLoc DL(St);
+    SDValue BasePtr = St->getBasePtr();
+    SDValue NewST1 = DAG.getStore(
+        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
+        BasePtr, St->getPointerInfo(), St->getAlignment(),
+        St->getMemOperand()->getFlags());
+
+    // The second half is stored 4 bytes further on. Describe that location
+    // precisely: offset the pointer info and derive the alignment from the
+    // original alignment and the offset. Halving the original alignment
+    // would produce 0 for a 1-byte-aligned store, which is not a valid
+    // description of the access.
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, DL, MVT::i32));
+    return DAG.getStore(NewST1.getValue(0), DL,
+                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
+                        OffsetPtr, St->getPointerInfo().getWithOffset(4),
+                        MinAlign(St->getAlignment(), 4),
+                        St->getMemOperand()->getFlags());
+  }
+
+  if (StVal.getValueType() == MVT::i64 &&
+      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+
+    // Bitcast an i64 store extracted from a vector to f64.
+    // Otherwise, the i64 value will be legalized to a pair of i32 values.
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc dl(StVal);
+    SDValue IntVec = StVal.getOperand(0);
+    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                   IntVec.getValueType().getVectorNumElements());
+    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
+    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                                 Vec, StVal.getOperand(1));
+    dl = SDLoc(N);
+    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
+    // Make the DAGCombiner fold the bitcasts.
+    DCI.AddToWorklist(Vec.getNode());
+    DCI.AddToWorklist(ExtElt.getNode());
+    DCI.AddToWorklist(V.getNode());
+    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags(), St->getAAInfo());
+  }
+
+  // If this is a legal vector store, try to combine it into a VST1_UPD.
+  if (ISD::isNormalStore(N) && VT.isVector() &&
+      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return CombineBaseUpdate(N, DCI);
+
+  return SDValue();
+}
+
+/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
+/// can replace combinations of VMUL and VCVT (floating-point to integer)
+/// when the VMUL has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+///  vmul.f32 d16, d17, d16
+///  vcvt.s32.f32 d16, d16
+/// becomes:
+///  vcvt.s32.f32 d16, d16, #3
+///
+/// N is the float-to-integer conversion node; ISD::FP_TO_SINT selects the
+/// signed intrinsic, anything else the unsigned one. Returns the replacement
+/// value, or an empty SDValue when the combine does not apply.
+static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  // Both the source and the result type are queried through
+  // getSimpleValueType() below, so both have to be simple.
+  SDValue Op = N->getOperand(0);
+  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+      !N->getValueType(0).isSimple() || Op.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  SDValue ConstVec = Op->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 2 && NumLanes != 4)) {
+    // These instructions only exist converting from f32 to i32. We can handle
+    // smaller integers by generating an extra truncate, but larger ones would
+    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
+    // these instructions only support v2i32/v4i32 types; the intrinsic built
+    // below is always typed as one of the two.
+    return SDValue();
+  }
+
+  // C is the log2 of the splatted multiplier; it becomes the immediate
+  // operand of the fixed-point conversion.
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
+  if (C == -1 || C == 0 || C > 32)
+    return SDValue();
+
+  SDLoc dl(N);
+  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
+  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
+                                        Intrinsic::arm_neon_vcvtfp2fxu;
+  SDValue FixConv = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
+      DAG.getConstant(C, dl, MVT::i32));
+
+  // Narrower integer results are produced by truncating the i32 conversion.
+  if (IntBits < FloatBits)
+    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
+
+  return FixConv;
+}
+
+/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
+/// can replace combinations of VCVT (integer to floating-point) and VDIV
+/// when the VDIV has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+///  vcvt.f32.s32 d16, d16
+///  vdiv.f32 d16, d16, d17
+/// becomes:
+///  vcvt.f32.s32 d16, d16, #3
+///
+/// N is the division node: operand 0 must be a SINT_TO_FP/UINT_TO_FP and
+/// operand 1 the constant splat divisor. Returns the replacement value, or an
+/// empty SDValue when the combine does not apply.
+static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  unsigned OpOpcode = Op.getNode()->getOpcode();
+  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
+      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
+    return SDValue();
+
+  // The integer source type is queried through getSimpleValueType() below,
+  // so it has to be simple as well.
+  if (!Op.getOperand(0).getValueType().isSimple())
+    return SDValue();
+
+  SDValue ConstVec = N->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
+  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 2 && NumLanes != 4)) {
+    // These instructions only exist converting from i32 to f32. We can handle
+    // smaller integers by generating an extra extend, but larger ones would
+    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
+    // these instructions only support v2i32/v4i32 types; the extend built
+    // below is always typed as one of the two.
+    return SDValue();
+  }
+
+  // C is the log2 of the splatted divisor; it becomes the immediate operand
+  // of the fixed-point conversion.
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
+  if (C == -1 || C == 0 || C > 32)
+    return SDValue();
+
+  SDLoc dl(N);
+  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
+  SDValue ConvInput = Op.getOperand(0);
+  // Narrower integer inputs are first widened to i32 lanes, using the
+  // signedness of the original conversion.
+  if (IntBits < FloatBits)
+    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                            ConvInput);
+
+  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
+                                        Intrinsic::arm_neon_vcvtfxu2fp;
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
+                     Op.getValueType(),
+                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
+                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
+}
+
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+///
+/// On success the sign-extended splat value is stored in Cnt and true is
+/// returned; on failure Cnt is left untouched and false is returned.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Look through any bit_converts wrapped around the operand.
+  for (; Op.getOpcode() == ISD::BITCAST; Op = Op.getOperand(0)) {
+  }
+
+  BuildVectorSDNode *BuildVec = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  if (!BuildVec)
+    return false;
+
+  APInt SplatValue, SplatUndefMask;
+  unsigned SplatWidth;
+  bool SawUndefs;
+  if (!BuildVec->isConstantSplat(SplatValue, SplatUndefMask, SplatWidth,
+                                 SawUndefs, ElementBits))
+    return false;
+
+  // The splat must not be wider than one vector element.
+  if (SplatWidth > ElementBits)
+    return false;
+
+  Cnt = SplatValue.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation. That value must be in the range:
+///   0 <= Value < ElementBits for a left shift; or
+///   0 <= Value <= ElementBits for a long left shift.
+/// Cnt receives the splat value whenever Op is a constant splat, even if the
+/// value then turns out to be out of range.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  const int64_t EltBits = VT.getScalarSizeInBits();
+  if (!getVShiftImm(Op, EltBits, Cnt))
+    return false;
+
+  if (Cnt < 0)
+    return false;
+  // A long shift may shift by the full element width.
+  const int64_t MaxCnt = isLong ? EltBits : EltBits - 1;
+  return Cnt <= MaxCnt;
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation. For a shift opcode, the value
+/// is positive, but for an intrinsic the value count must be negative. The
+/// absolute value must be in the range:
+///   1 <= |Value| <= ElementBits for a right shift; or
+///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
+/// For an accepted intrinsic count, Cnt is negated so that it holds the
+/// positive shift amount on return.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+                         int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  const int64_t EltBits = VT.getScalarSizeInBits();
+  if (!getVShiftImm(Op, EltBits, Cnt))
+    return false;
+
+  const int64_t MaxShift = isNarrow ? EltBits / 2 : EltBits;
+  if (isIntrinsic) {
+    // Intrinsics encode a right shift as a negative count.
+    if (Cnt > -1 || Cnt < -MaxShift)
+      return false;
+    Cnt = -Cnt;
+    return true;
+  }
+  return Cnt >= 1 && Cnt <= MaxShift;
+}
+
+/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
+///
+/// Operand 0 of N is the intrinsic ID. For the NEON vector shift intrinsics
+/// handled below, a shift count that is a constant splat in the valid range
+/// is turned into the corresponding ARMISD immediate-shift node; vshiftins is
+/// turned into VSLI or VSRI. An empty SDValue is returned when nothing is
+/// changed.
+static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ // Don't do anything for most intrinsics.
+ break;
+
+ // Vector shifts: check for immediate versions and lower them.
+ // Note: This is done during DAG combining instead of DAG legalizing because
+ // the build_vectors for 64-bit vector element shift counts are generally
+ // not legal, and it is hard to see their values after they get legalized to
+ // loads from a constant pool.
+ case Intrinsic::arm_neon_vshifts:
+ case Intrinsic::arm_neon_vshiftu:
+ case Intrinsic::arm_neon_vrshifts:
+ case Intrinsic::arm_neon_vrshiftu:
+ case Intrinsic::arm_neon_vrshiftn:
+ case Intrinsic::arm_neon_vqshifts:
+ case Intrinsic::arm_neon_vqshiftu:
+ case Intrinsic::arm_neon_vqshiftsu:
+ case Intrinsic::arm_neon_vqshiftns:
+ case Intrinsic::arm_neon_vqshiftnu:
+ case Intrinsic::arm_neon_vqshiftnsu:
+ case Intrinsic::arm_neon_vqrshiftns:
+ case Intrinsic::arm_neon_vqrshiftnu:
+ case Intrinsic::arm_neon_vqrshiftnsu: {
+ // Operand 1 is the shifted vector and operand 2 the shift count.
+ EVT VT = N->getOperand(1).getValueType();
+ int64_t Cnt;
+ unsigned VShiftOpc = 0;
+
+ // First switch: validate the shift count and extract it into Cnt. The
+ // right-shift checks pass isIntrinsic=true, so a negative count is
+ // expected and Cnt comes back as the positive shift amount.
+ switch (IntNo) {
+ case Intrinsic::arm_neon_vshifts:
+ case Intrinsic::arm_neon_vshiftu:
+ if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
+ VShiftOpc = ARMISD::VSHL;
+ break;
+ }
+ if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
+ VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
+ ARMISD::VSHRs : ARMISD::VSHRu);
+ break;
+ }
+ return SDValue();
+
+ case Intrinsic::arm_neon_vrshifts:
+ case Intrinsic::arm_neon_vrshiftu:
+ if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
+ break;
+ return SDValue();
+
+ case Intrinsic::arm_neon_vqshifts:
+ case Intrinsic::arm_neon_vqshiftu:
+ if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+ break;
+ return SDValue();
+
+ case Intrinsic::arm_neon_vqshiftsu:
+ if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+ break;
+ llvm_unreachable("invalid shift count for vqshlu intrinsic");
+
+ case Intrinsic::arm_neon_vrshiftn:
+ case Intrinsic::arm_neon_vqshiftns:
+ case Intrinsic::arm_neon_vqshiftnu:
+ case Intrinsic::arm_neon_vqshiftnsu:
+ case Intrinsic::arm_neon_vqrshiftns:
+ case Intrinsic::arm_neon_vqrshiftnu:
+ case Intrinsic::arm_neon_vqrshiftnsu:
+ // Narrowing shifts require an immediate right shift.
+ if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
+ break;
+ llvm_unreachable("invalid shift count for narrowing vector shift "
+ "intrinsic");
+
+ default:
+ llvm_unreachable("unhandled vector shift");
+ }
+
+ // Second switch: pick the ARMISD opcode for the intrinsics whose opcode
+ // was not already chosen above.
+ switch (IntNo) {
+ case Intrinsic::arm_neon_vshifts:
+ case Intrinsic::arm_neon_vshiftu:
+ // Opcode already set above.
+ break;
+ case Intrinsic::arm_neon_vrshifts:
+ VShiftOpc = ARMISD::VRSHRs; break;
+ case Intrinsic::arm_neon_vrshiftu:
+ VShiftOpc = ARMISD::VRSHRu; break;
+ case Intrinsic::arm_neon_vrshiftn:
+ VShiftOpc = ARMISD::VRSHRN; break;
+ case Intrinsic::arm_neon_vqshifts:
+ VShiftOpc = ARMISD::VQSHLs; break;
+ case Intrinsic::arm_neon_vqshiftu:
+ VShiftOpc = ARMISD::VQSHLu; break;
+ case Intrinsic::arm_neon_vqshiftsu:
+ VShiftOpc = ARMISD::VQSHLsu; break;
+ case Intrinsic::arm_neon_vqshiftns:
+ VShiftOpc = ARMISD::VQSHRNs; break;
+ case Intrinsic::arm_neon_vqshiftnu:
+ VShiftOpc = ARMISD::VQSHRNu; break;
+ case Intrinsic::arm_neon_vqshiftnsu:
+ VShiftOpc = ARMISD::VQSHRNsu; break;
+ case Intrinsic::arm_neon_vqrshiftns:
+ VShiftOpc = ARMISD::VQRSHRNs; break;
+ case Intrinsic::arm_neon_vqrshiftnu:
+ VShiftOpc = ARMISD::VQRSHRNu; break;
+ case Intrinsic::arm_neon_vqrshiftnsu:
+ VShiftOpc = ARMISD::VQRSHRNsu; break;
+ }
+
+ // Build the immediate-shift node: the shifted vector plus the count as an
+ // i32 constant.
+ SDLoc dl(N);
+ return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
+ N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
+ }
+
+ case Intrinsic::arm_neon_vshiftins: {
+ // The shift count is operand 3 here; operands 1 and 2 are forwarded to the
+ // new node unchanged.
+ EVT VT = N->getOperand(1).getValueType();
+ int64_t Cnt;
+ unsigned VShiftOpc = 0;
+
+ if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
+ VShiftOpc = ARMISD::VSLI;
+ else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
+ VShiftOpc = ARMISD::VSRI;
+ else {
+ llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
+ }
+
+ SDLoc dl(N);
+ return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
+ N->getOperand(1), N->getOperand(2),
+ DAG.getConstant(Cnt, dl, MVT::i32));
+ }
+
+ case Intrinsic::arm_neon_vqrshifts:
+ case Intrinsic::arm_neon_vqrshiftu:
+ // No immediate versions of these to check for.
+ break;
+ }
+
+ return SDValue();
+}
+
+ /// PerformShiftCombine - Target-specific DAG combining for ISD::SHL, ISD::SRA
+ /// and ISD::SRL. Two rewrites are attempted:
+ ///  * a scalar i32 (srl (bswap x), 16) on V6+ is canonicalized to
+ ///    (rotr (bswap x), 16) when the high 16 bits of x are known zero, and
+ ///  * a legal vector shift by a suitable immediate is lowered to the
+ ///    corresponding ARMISD::VSHL / VSHRs / VSHRu node.
+ /// As with the vector shift intrinsics, the vector case is handled during DAG
+ /// combining rather than legalization: build_vectors holding 64-bit element
+ /// shift counts are generally not legal, and their values are hard to recover
+ /// once they have been legalized into constant-pool loads.
+ /// Returns an empty SDValue when no rewrite applies.
+ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+   const unsigned Opcode = N->getOpcode();
+   EVT ResultVT = N->getValueType(0);
+
+   if (Opcode == ISD::SRL && ResultVT == MVT::i32 && ST->hasV6Ops()) {
+     // rev followed by lsr #16 can become rev16 once expressed as a rotate,
+     // provided the upper half of the swapped value is zero.
+     SDValue Amount = N->getOperand(1);
+     if (auto *AmountC = dyn_cast<ConstantSDNode>(Amount)) {
+       SDValue Shifted = N->getOperand(0);
+       const bool IsBSwapBy16 =
+           AmountC->getZExtValue() == 16 && Shifted.getOpcode() == ISD::BSWAP;
+       if (IsBSwapBy16 &&
+           DAG.MaskedValueIsZero(Shifted.getOperand(0),
+                                 APInt::getHighBitsSet(32, 16)))
+         return DAG.getNode(ISD::ROTR, SDLoc(N), ResultVT, Shifted, Amount);
+     }
+   }
+
+   // Beyond the rotate canonicalization, scalar shifts are left alone.
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+   if (!ResultVT.isVector() || !TLI.isTypeLegal(ResultVT))
+     return SDValue();
+
+   assert(ST->hasNEON() && "unexpected vector shift");
+
+   // Pick the immediate-shift opcode, bailing out if the shift amount is not
+   // an immediate acceptable for that direction.
+   int64_t ShiftImm;
+   unsigned VShiftOpc;
+   if (Opcode == ISD::SHL) {
+     if (!isVShiftLImm(N->getOperand(1), ResultVT, false, ShiftImm))
+       return SDValue();
+     VShiftOpc = ARMISD::VSHL;
+   } else if (Opcode == ISD::SRA || Opcode == ISD::SRL) {
+     if (!isVShiftRImm(N->getOperand(1), ResultVT, false, false, ShiftImm))
+       return SDValue();
+     VShiftOpc = (Opcode == ISD::SRA) ? ARMISD::VSHRs : ARMISD::VSHRu;
+   } else {
+     llvm_unreachable("unexpected shift opcode");
+   }
+
+   SDLoc dl(N);
+   return DAG.getNode(VShiftOpc, dl, ResultVT, N->getOperand(0),
+                      DAG.getConstant(ShiftImm, dl, MVT::i32));
+ }
+
+ /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
+ /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
+ ///
+ /// Recognizes an extension to i32 of an 8- or 16-bit element extracted from a
+ /// legal vector at a constant lane and replaces it with ARMISD::VGETLANEs
+ /// (sign extension) or ARMISD::VGETLANEu (zero/any extension). Returns an
+ /// empty SDValue when the pattern does not match.
+ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                     const ARMSubtarget *ST) {
+   SDValue Src = N->getOperand(0);
+
+   // NEON supports sign- and zero-extending lane extracts of 8- and 16-bit
+   // elements directly. The match is done here because type legalization
+   // promotes these to 32-bit types, after which the pattern is messy to
+   // recognize.
+   if (!ST->hasNEON() || Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+     return SDValue();
+
+   SDValue Vec = Src.getOperand(0);
+   SDValue Lane = Src.getOperand(1);
+   EVT DestVT = N->getValueType(0);
+   EVT EltVT = Src.getValueType();
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+   const bool IsNarrowElt = (EltVT == MVT::i8 || EltVT == MVT::i16);
+   if (DestVT != MVT::i32 || !IsNarrowElt ||
+       !TLI.isTypeLegal(Vec.getValueType()) || !isa<ConstantSDNode>(Lane))
+     return SDValue();
+
+   unsigned LaneOpc = 0;
+   const unsigned ExtOpc = N->getOpcode();
+   if (ExtOpc == ISD::SIGN_EXTEND)
+     LaneOpc = ARMISD::VGETLANEs;
+   else if (ExtOpc == ISD::ZERO_EXTEND || ExtOpc == ISD::ANY_EXTEND)
+     LaneOpc = ARMISD::VGETLANEu;
+   else
+     llvm_unreachable("unexpected opcode");
+
+   return DAG.getNode(LaneOpc, SDLoc(N), DestVT, Vec, Lane);
+ }
+
+ /// Compute the known-zero / known-one bits of \p Op, looking through the
+ /// ARM-specific BFI and CMOV nodes before deferring to the generic
+ /// SelectionDAG analysis.
+ ///
+ /// \param DAG       the DAG that owns \p Op.
+ /// \param Op        the value to analyse.
+ /// \param KnownZero in/out: on return, bits known to be zero in \p Op.
+ /// \param KnownOne  in/out: on return, bits known to be one in \p Op.
+ static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
+                              APInt &KnownOne) {
+   if (Op.getOpcode() == ARMISD::BFI) {
+     // Conservatively, we can recurse down the first operand
+     // and just mask out all affected bits.
+     computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
+
+     // The operand to BFI is already a mask suitable for removing the bits it
+     // sets.
+     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
+     const APInt &Mask = CI->getAPIntValue();
+     KnownZero &= Mask;
+     KnownOne &= Mask;
+     return;
+   }
+   if (Op.getOpcode() == ARMISD::CMOV) {
+     // The two values a CMOV selects between are operands 0 and 1; operand 2
+     // is the condition code and must not be analysed as a data value. A bit
+     // is known only if it is known identically on both selectable values.
+     APInt KZ2(KnownZero.getBitWidth(), 0);
+     APInt KO2(KnownOne.getBitWidth(), 0);
+     computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
+     computeKnownBits(DAG, Op.getOperand(1), KZ2, KO2);
+
+     KnownZero &= KZ2;
+     KnownOne &= KO2;
+     return;
+   }
+   return DAG.computeKnownBits(Op, KnownZero, KnownOne);
+ }
+
+ /// Try to turn a CMOV / OR / AND combination into a chain of BFI nodes.
+ ///
+ /// \param CMOV an ARMISD::CMOV node whose operand 4 is an ARMISD::CMPZ.
+ /// \param DAG  the DAG in which replacement nodes are created.
+ /// \returns the value produced by the last BFI, or an empty SDValue if the
+ ///          pattern does not match or is not considered profitable.
+ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
+   // If we have a CMOV, OR and AND combination such as:
+   //   if (x & CN)
+   //     y |= CM;
+   //
+   // And:
+   //   * CN is a single bit;
+   //   * All bits covered by CM are known zero in y
+   //
+   // Then we can convert this into a sequence of BFI instructions. This will
+   // always be a win if CM is a single bit, will always be no worse than the
+   // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
+   // three bits (due to the extra IT instruction).
+
+   SDValue Op0 = CMOV->getOperand(0);
+   SDValue Op1 = CMOV->getOperand(1);
+   auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
+   auto CC = CCNode->getAPIntValue().getLimitedValue();
+   SDValue CmpZ = CMOV->getOperand(4);
+
+   // Check the node kind before looking at any of its operands.
+   assert(CmpZ->getOpcode() == ARMISD::CMPZ);
+
+   // The compare must be against zero.
+   if (!isNullConstant(CmpZ->getOperand(1)))
+     return SDValue();
+
+   // The compared value must be (and X, CN) with CN a single bit.
+   SDValue And = CmpZ->getOperand(0);
+   if (And->getOpcode() != ISD::AND)
+     return SDValue();
+   ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1));
+   if (!AndC || !AndC->getAPIntValue().isPowerOf2())
+     return SDValue();
+   SDValue X = And->getOperand(0);
+
+   if (CC == ARMCC::EQ) {
+     // We're performing an "equal to zero" compare. Swap the operands so we
+     // canonicalize on a "not equal to zero" compare.
+     std::swap(Op0, Op1);
+   } else {
+     assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
+   }
+
+   // After canonicalization Op1 must be (or Y, CM) and Op0 must be Y itself.
+   if (Op1->getOpcode() != ISD::OR)
+     return SDValue();
+
+   ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
+   if (!OrC)
+     return SDValue();
+   SDValue Y = Op1->getOperand(0);
+
+   if (Op0 != Y)
+     return SDValue();
+
+   // Now, is it profitable to continue?
+   APInt OrCI = OrC->getAPIntValue();
+   unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
+   if (OrCI.countPopulation() > Heuristic)
+     return SDValue();
+
+   // Lastly, can we determine that the bits defined by OrCI
+   // are zero in Y?
+   APInt KnownZero, KnownOne;
+   computeKnownBits(DAG, Y, KnownZero, KnownOne);
+   if ((OrCI & KnownZero) != OrCI)
+     return SDValue();
+
+   // OK, we can do the combine.
+   SDValue V = Y;
+   SDLoc dl(X);
+   EVT VT = X.getValueType();
+   unsigned BitInX = AndC->getAPIntValue().logBase2();
+
+   if (BitInX != 0) {
+     // We must shift X first, so that the tested bit sits in bit 0.
+     X = DAG.getNode(ISD::SRL, dl, VT, X,
+                     DAG.getConstant(BitInX, dl, VT));
+   }
+
+   // Emit one BFI per bit set in CM, each inserting X into that bit of V.
+   for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
+        BitInY < NumActiveBits; ++BitInY) {
+     if (OrCI[BitInY] == 0)
+       continue;
+     APInt Mask(VT.getSizeInBits(), 0);
+     Mask.setBit(BitInY);
+     V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
+                     // Confusingly, the operand is an *inverted* mask.
+                     DAG.getConstant(~Mask, dl, VT));
+   }
+
+   return V;
+ }
+
+ /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
+ ///
+ /// Folds a branch on the materialized 0/1 result of a CMOV back into a branch
+ /// on the CMOV's own condition:
+ ///   (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
+ ///     -> (brcond Chain BB CC CPSR Cmp)
+ /// Returns an empty SDValue when the pattern does not match.
+ SDValue
+ ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
+   SDValue Cmp = N->getOperand(4);
+   // Only branches fed by a CMPZ are of interest.
+   if (Cmp.getOpcode() != ARMISD::CMPZ)
+     return SDValue();
+
+   const auto CC =
+       (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+   if (CC != ARMCC::NE)
+     return SDValue();
+
+   // True iff V is a constant node holding exactly Val.
+   auto IsConstant = [](SDValue V, uint64_t Val) {
+     auto *C = dyn_cast<ConstantSDNode>(V);
+     return C && C->getZExtValue() == Val;
+   };
+
+   // The compared value must be a single-use AND of a single-use CMOV.
+   SDValue And = Cmp.getOperand(0);
+   if (And.getOpcode() != ISD::AND || !And->hasOneUse())
+     return SDValue();
+   SDValue InnerCMov = And->getOperand(0);
+   if (InnerCMov->getOpcode() != ARMISD::CMOV || !InnerCMov->hasOneUse())
+     return SDValue();
+
+   // The CMOV must select between 0 and 1, be masked with 1, and the result
+   // must be compared against 0.
+   if (!IsConstant(InnerCMov->getOperand(0), 0) ||
+       !IsConstant(InnerCMov->getOperand(1), 1) ||
+       !IsConstant(And->getOperand(1), 1) ||
+       !IsConstant(Cmp.getOperand(1), 0))
+     return SDValue();
+
+   // Branch directly on the inner CMOV's condition, flags register and compare.
+   SDLoc dl(N);
+   return DAG.getNode(ARMISD::BRCOND, dl, N->getValueType(0),
+                      /*Chain=*/N->getOperand(0), /*BB=*/N->getOperand(1),
+                      InnerCMov->getOperand(2), InnerCMov->getOperand(3),
+                      InnerCMov->getOperand(4));
+ }
+
+ /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
+ ///
+ /// Only CMOVs controlled by an ARMISD::CMPZ (i.e. EQ / NE conditions) are
+ /// considered. In order, the routine tries:
+ ///  1. the CMOV-to-BFI rewrite (non-Thumb1 V6T2+ only),
+ ///  2. reusing the compared register as one of the selected values, and
+ ///  3. collapsing a CMOV that tests the 0/1 result of another CMOV.
+ /// Returns an empty SDValue when nothing applies.
+ SDValue
+ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
+   SDValue Cmp = N->getOperand(4);
+   if (Cmp.getOpcode() != ARMISD::CMPZ)
+     return SDValue();
+
+   SDLoc dl(N);
+   EVT VT = N->getValueType(0);
+   SDValue FalseVal = N->getOperand(0);
+   SDValue TrueVal = N->getOperand(1);
+   SDValue ARMcc = N->getOperand(2);
+   SDValue CmpLHS = Cmp.getOperand(0);
+   SDValue CmpRHS = Cmp.getOperand(1);
+   const auto CC =
+       (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+   // BFI is only available on V6T2+.
+   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
+     if (SDValue R = PerformCMOVToBFICombine(N, DAG))
+       return R;
+   }
+
+   // When one of the selected values is the very value being compared
+   // against, the register holding the compared operand can stand in for it:
+   //
+   //   mov r1, r0            mov r1, r0
+   //   cmp r1, x             cmp r1, x
+   //   mov r0, y             mov r0, x
+   //   moveq r0, x           movne r0, y
+   //
+   // both become
+   //
+   //   cmp r0, x
+   //   movne r0, y
+   //
+   /// FIXME: Turn this into a target neutral optimization?
+   SDValue Res;
+   if (CC == ARMCC::NE && FalseVal == CmpRHS && FalseVal != CmpLHS) {
+     Res = DAG.getNode(ARMISD::CMOV, dl, VT, CmpLHS, TrueVal, ARMcc,
+                       N->getOperand(3), Cmp);
+   } else if (CC == ARMCC::EQ && TrueVal == CmpRHS) {
+     // Re-issue the compare with the inverted (NE) condition.
+     SDValue NECode;
+     SDValue NECmp = getARMCmp(CmpLHS, CmpRHS, ISD::SETNE, NECode, DAG, dl);
+     Res = DAG.getNode(ARMISD::CMOV, dl, VT, CmpLHS, FalseVal, NECode,
+                       N->getOperand(3), NECmp);
+   }
+
+   // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
+   //   -> (cmov F T CC CPSR Cmp)
+   if (CC == ARMCC::NE && CmpLHS.getOpcode() == ARMISD::CMOV &&
+       CmpLHS->hasOneUse()) {
+     // True iff V is a constant node holding exactly Val.
+     auto IsConstant = [](SDValue V, uint64_t Val) {
+       auto *C = dyn_cast<ConstantSDNode>(V);
+       return C && C->getZExtValue() == Val;
+     };
+     if (IsConstant(CmpLHS->getOperand(0), 0) &&
+         IsConstant(CmpLHS->getOperand(1), 1) && IsConstant(CmpRHS, 0))
+       return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
+                          CmpLHS->getOperand(2), CmpLHS->getOperand(3),
+                          CmpLHS->getOperand(4));
+   }
+
+   if (Res.getNode()) {
+     // Capture demanded bits information that would be otherwise lost: if the
+     // original node was known to be a zero-extended i1/i8/i16, assert that on
+     // the replacement.
+     APInt KnownZero, KnownOne;
+     DAG.computeKnownBits(SDValue(N, 0), KnownZero, KnownOne);
+
+     MVT NarrowVT = MVT::Other;
+     if (KnownZero == 0xfffffffe)
+       NarrowVT = MVT::i1;
+     else if (KnownZero == 0xffffff00)
+       NarrowVT = MVT::i8;
+     else if (KnownZero == 0xffff0000)
+       NarrowVT = MVT::i16;
+
+     if (NarrowVT.SimpleTy != MVT::Other)
+       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                         DAG.getValueType(NarrowVT));
+   }
+
+   return Res;
+ }
+
+ /// Entry point for ARM target-specific DAG combines: dispatches on the opcode
+ /// of \p N to the matching Perform*Combine helper. Returns an empty SDValue
+ /// for opcodes (and memory intrinsics) that have no handler here.
+ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+   SelectionDAG &DAG = DCI.DAG;
+
+   switch (N->getOpcode()) {
+   default:
+     break;
+
+   // Integer arithmetic and logic.
+   case ISD::ADDC:
+     return PerformADDCCombine(N, DCI, Subtarget);
+   case ISD::ADD:
+     return PerformADDCombine(N, DCI, Subtarget);
+   case ISD::SUB:
+     return PerformSUBCombine(N, DCI);
+   case ISD::MUL:
+     return PerformMULCombine(N, DCI, Subtarget);
+   case ISD::OR:
+     return PerformORCombine(N, DCI, Subtarget);
+   case ISD::XOR:
+     return PerformXORCombine(N, DCI, Subtarget);
+   case ISD::AND:
+     return PerformANDCombine(N, DCI, Subtarget);
+   case ARMISD::BFI:
+     return PerformBFICombine(N, DCI);
+
+   // Register-pair moves.
+   case ARMISD::VMOVRRD:
+     return PerformVMOVRRDCombine(N, DCI, Subtarget);
+   case ARMISD::VMOVDRR:
+     return PerformVMOVDRRCombine(N, DAG);
+
+   // Memory and vector construction.
+   case ISD::STORE:
+     return PerformSTORECombine(N, DCI);
+   case ISD::BUILD_VECTOR:
+     return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
+   case ISD::INSERT_VECTOR_ELT:
+     return PerformInsertEltCombine(N, DCI);
+   case ISD::VECTOR_SHUFFLE:
+     return PerformVECTOR_SHUFFLECombine(N, DAG);
+   case ARMISD::VDUPLANE:
+     return PerformVDUPLANECombine(N, DCI);
+   case ARMISD::VDUP:
+     return PerformVDUPCombine(N, DCI);
+
+   // Floating point.
+   case ISD::FP_TO_SINT:
+   case ISD::FP_TO_UINT:
+     return PerformVCVTCombine(N, DAG, Subtarget);
+   case ISD::FDIV:
+     return PerformVDIVCombine(N, DAG, Subtarget);
+
+   // Chainless intrinsics, shifts and extensions.
+   case ISD::INTRINSIC_WO_CHAIN:
+     return PerformIntrinsicCombine(N, DAG);
+   case ISD::SHL:
+   case ISD::SRA:
+   case ISD::SRL:
+     return PerformShiftCombine(N, DAG, Subtarget);
+   case ISD::SIGN_EXTEND:
+   case ISD::ZERO_EXTEND:
+   case ISD::ANY_EXTEND:
+     return PerformExtendCombine(N, DAG, Subtarget);
+
+   // Conditional move / branch.
+   case ARMISD::CMOV:
+     return PerformCMOVCombine(N, DAG);
+   case ARMISD::BRCOND:
+     return PerformBRCONDCombine(N, DAG);
+
+   // Loads, including the duplicating NEON loads.
+   case ISD::LOAD:
+     return PerformLOADCombine(N, DCI);
+   case ARMISD::VLD1DUP:
+   case ARMISD::VLD2DUP:
+   case ARMISD::VLD3DUP:
+   case ARMISD::VLD4DUP:
+     return PerformVLDCombine(N, DCI);
+   case ARMISD::BUILD_VECTOR:
+     return PerformARMBUILD_VECTORCombine(N, DCI);
+
+   // Chained intrinsics: only the NEON structured load/store intrinsics are
+   // handled; the intrinsic ID is operand 1.
+   case ISD::INTRINSIC_VOID:
+   case ISD::INTRINSIC_W_CHAIN: {
+     const uint64_t IntNo =
+         cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+     switch (IntNo) {
+     case Intrinsic::arm_neon_vld1:
+     case Intrinsic::arm_neon_vld2:
+     case Intrinsic::arm_neon_vld3:
+     case Intrinsic::arm_neon_vld4:
+     case Intrinsic::arm_neon_vld2lane:
+     case Intrinsic::arm_neon_vld3lane:
+     case Intrinsic::arm_neon_vld4lane:
+     case Intrinsic::arm_neon_vst1:
+     case Intrinsic::arm_neon_vst2:
+     case Intrinsic::arm_neon_vst3:
+     case Intrinsic::arm_neon_vst4:
+     case Intrinsic::arm_neon_vst2lane:
+     case Intrinsic::arm_neon_vst3lane:
+     case Intrinsic::arm_neon_vst4lane:
+       return PerformVLDCombine(N, DCI);
+     default:
+       break;
+     }
+     break;
+   }
+   }
+   return SDValue();
+ }
+
+ /// Returns true only for f32 loads and stores; every other opcode / type
+ /// combination is reported as not desirable to transform to an integer op.
+ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
+                                                           EVT VT) const {
+   if (VT != MVT::f32)
+     return false;
+   return Opc == ISD::LOAD || Opc == ISD::STORE;
+ }
+
+ /// Reports whether a misaligned access of type \p VT is supported.
+ ///
+ /// \param VT   the accessed type; must be a simple value type.
+ /// \param Fast optional out-parameter; when non-null and the access is
+ ///             allowed, it is set to whether the access is also fast.
+ /// The two unnamed unsigned parameters are ignored.
+ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                        unsigned,
+                                                        unsigned,
+                                                        bool *Fast) const {
+   // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus.
+   const bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+   const MVT::SimpleValueType Ty = VT.getSimpleVT().SimpleTy;
+
+   if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
+     // Unaligned access can use (for example) LDRB, LDRH, LDR.
+     if (!AllowsUnaligned)
+       return false;
+     if (Fast)
+       *Fast = Subtarget->hasV7Ops();
+     return true;
+   }
+
+   if (Ty == MVT::f64 || Ty == MVT::v2f64) {
+     // For any little-endian targets with neon, we can support unaligned ld/st
+     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+     // A big-endian target may also explicitly support unaligned accesses.
+     if (!Subtarget->hasNEON())
+       return false;
+     if (!AllowsUnaligned && !Subtarget->isLittle())
+       return false;
+     if (Fast)
+       *Fast = true;
+     return true;
+   }
+
+   // Any other type: misaligned access is not supported.
+   return false;
+ }
+
+ /// Returns true when both the source and the destination alignment are
+ /// compatible with \p AlignCheck. An alignment of 0 is always accepted;
+ /// otherwise the alignment must be a multiple of \p AlignCheck.
+ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+                        unsigned AlignCheck) {
+   if (SrcAlign != 0 && SrcAlign % AlignCheck != 0)
+     return false;
+   return DstAlign == 0 || DstAlign % AlignCheck == 0;
+ }
+
+ /// Picks the value type used to lower a memory operation of \p Size bytes.
+ ///
+ /// v2f64 or f64 is chosen when NEON may be used (not a non-zero memset, NEON
+ /// available, function not marked NoImplicitFloat) and the operation is
+ /// large enough and either suitably aligned or fast when misaligned.
+ /// Otherwise i32 / i16 is chosen by size, and MVT::Other leaves the decision
+ /// to the target-independent logic.
+ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
+                                            unsigned DstAlign, unsigned SrcAlign,
+                                            bool IsMemset, bool ZeroMemset,
+                                            bool MemcpyStrSrc,
+                                            MachineFunction &MF) const {
+   const Function *F = MF.getFunction();
+
+   // True iff a chunk of the given type / byte width can be used: the
+   // operation must be at least that large and either aligned to the width or
+   // on a subtarget where the misaligned access is both allowed and fast.
+   auto IsUsable = [&](MVT Ty, unsigned Bytes) {
+     if (Size < Bytes)
+       return false;
+     if (memOpAlign(SrcAlign, DstAlign, Bytes))
+       return true;
+     bool Fast;
+     return allowsMisalignedMemoryAccesses(Ty, 0, 1, &Fast) && Fast;
+   };
+
+   // See if we can use NEON instructions for this...
+   const bool MayUseNEON = (!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
+                           !F->hasFnAttribute(Attribute::NoImplicitFloat);
+   if (MayUseNEON) {
+     if (IsUsable(MVT::v2f64, 16))
+       return MVT::v2f64;
+     if (IsUsable(MVT::f64, 8))
+       return MVT::f64;
+   }
+
+   // Lowering to i32/i16 if the size permits.
+   if (Size >= 4)
+     return MVT::i32;
+   if (Size >= 2)
+     return MVT::i16;
+
+   // Let the target-independent logic figure it out.
+   return MVT::Other;
+ }
+
+ /// Returns true when zero-extending \p Val to \p VT2 costs nothing: \p Val
+ /// must be a load of a simple i1, i8 or i16 value and \p VT2 a simple integer
+ /// type.
+ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+   if (Val.getOpcode() != ISD::LOAD)
+     return false;
+
+   EVT VT1 = Val.getValueType();
+   const bool BothSimpleInts = VT1.isSimple() && VT1.isInteger() &&
+                               VT2.isSimple() && VT2.isInteger();
+   if (!BothSimpleInts)
+     return false;
+
+   // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
+   const MVT::SimpleValueType LoadTy = VT1.getSimpleVT().SimpleTy;
+   return LoadTy == MVT::i1 || LoadTy == MVT::i8 || LoadTy == MVT::i16;
+ }
+
+ /// Decides whether forming an extending vector load for \p ExtVal is
+ /// desirable. It is not when the result type is illegal, nor when the sole
+ /// user is an ADD, SUB, SHL or ARMISD::VSHL, into which the extension can be
+ /// folded as a wide/long instruction.
+ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+   if (!isTypeLegal(ExtVal.getValueType()))
+     return false;
+
+   // If there's more than one user instruction, the loadext is desirable no
+   // matter what. There can be two uses by the same instruction.
+   if (ExtVal->use_empty() ||
+       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
+     return true;
+
+   // Don't create a loadext if we can fold the extension into a wide/long
+   // instruction.
+   SDNode *User = *ExtVal->use_begin();
+   switch (User->getOpcode()) {
+   case ISD::ADD:
+   case ISD::SUB:
+   case ISD::SHL:
+   case ARMISD::VSHL:
+     return false;
+   default:
+     return true;
+   }
+ }
+
+ /// Returns true when truncating from \p Ty1 to \p Ty2 is acceptable for a
+ /// tail call: both must be integer types and \p Ty1 must be a legal type.
+ bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+   const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
+   if (!BothIntegers || !isTypeLegal(EVT::getEVT(Ty1)))
+     return false;
+
+   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+   // Assuming the caller doesn't have a zeroext or signext return parameter,
+   // truncation all the way down to i1 is valid.
+   return true;
+ }
+
+ /// Returns the cost of the scaling factor in addressing mode \p AM:
+ /// -1 if the mode is not legal, 1 for a negative scale on subtargets
+ /// reporting hasFPAO(), and 0 otherwise.
+ int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
+                                             const AddrMode &AM, Type *Ty,
+                                             unsigned AS) const {
+   if (!isLegalAddressingMode(DL, AM, Ty, AS))
+     return -1;
+
+   // positive offsets execute faster
+   const bool PenalizeNegative = Subtarget->hasFPAO() && AM.Scale < 0;
+   return PenalizeNegative ? 1 : 0;
+ }
+
+
+ /// Returns true if \p V is a legal Thumb1 load/store offset for type \p VT:
+ /// non-negative, a multiple of the access size (1, 2 or 4 bytes for
+ /// i1/i8, i16 and i32 respectively), and at most 5 bits once scaled down.
+ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
+   if (V < 0)
+     return false;
+
+   int64_t Scale;
+   const MVT::SimpleValueType Ty = VT.getSimpleVT().SimpleTy;
+   if (Ty == MVT::i1 || Ty == MVT::i8)
+     Scale = 1;
+   else if (Ty == MVT::i16)
+     Scale = 2;
+   else if (Ty == MVT::i32)
+     Scale = 4;
+   else
+     return false;
+
+   // The offset must be a whole number of elements...
+   if (V % Scale != 0)
+     return false;
+
+   // ...and the element count must fit in an unsigned 5-bit field.
+   const int64_t Scaled = V / Scale;
+   return Scaled < (1LL << 5);
+ }
+
+ /// Returns true if \p V is a legal Thumb2 load/store offset for type \p VT.
+ /// Integer accesses accept +imm12 or -imm8; f32/f64 accesses require VFP2
+ /// and a multiple of 4 whose magnitude divided by 4 fits in 8 bits.
+ static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
+                                       const ARMSubtarget *Subtarget) {
+   const bool IsNegative = V < 0;
+   if (IsNegative)
+     V = -V;
+
+   // True iff X is representable in the low Bits bits.
+   auto FitsInBits = [](int64_t X, unsigned Bits) {
+     return X == (X & ((1LL << Bits) - 1));
+   };
+
+   const MVT::SimpleValueType Ty = VT.getSimpleVT().SimpleTy;
+
+   if (Ty == MVT::i1 || Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
+     // + imm12 or - imm8
+     return FitsInBits(V, IsNegative ? 8 : 12);
+   }
+
+   if (Ty == MVT::f32 || Ty == MVT::f64) {
+     // Same as ARM mode. FIXME: NEON?
+     if (!Subtarget->hasVFP2())
+       return false;
+     if ((V & 3) != 0)
+       return false;
+     return FitsInBits(V >> 2, 8);
+   }
+
+   return false;
+ }
+
+ /// isLegalAddressImmediate - Return true if the integer value can be used
+ /// as the offset of the target addressing mode for load / store of the
+ /// given type. A zero offset is always legal; non-simple types never accept
+ /// a non-zero offset. Thumb1 and Thumb2 defer to their dedicated helpers.
+ static bool isLegalAddressImmediate(int64_t V, EVT VT,
+                                     const ARMSubtarget *Subtarget) {
+   if (V == 0)
+     return true;
+   if (!VT.isSimple())
+     return false;
+
+   if (Subtarget->isThumb1Only())
+     return isLegalT1AddressImmediate(V, VT);
+   if (Subtarget->isThumb2())
+     return isLegalT2AddressImmediate(V, VT, Subtarget);
+
+   // ARM mode: offsets are sign + magnitude, so only the magnitude matters.
+   if (V < 0)
+     V = -V;
+
+   // True iff X is representable in the low Bits bits.
+   auto FitsInBits = [](int64_t X, unsigned Bits) {
+     return X == (X & ((1LL << Bits) - 1));
+   };
+
+   const MVT::SimpleValueType Ty = VT.getSimpleVT().SimpleTy;
+
+   // +- imm12
+   if (Ty == MVT::i1 || Ty == MVT::i8 || Ty == MVT::i32)
+     return FitsInBits(V, 12);
+
+   // +- imm8
+   if (Ty == MVT::i16)
+     return FitsInBits(V, 8);
+
+   if (Ty == MVT::f32 || Ty == MVT::f64) {
+     if (!Subtarget->hasVFP2()) // FIXME: NEON?
+       return false;
+     // Word-aligned offset whose word count fits in 8 bits.
+     if ((V & 3) != 0)
+       return false;
+     return FitsInBits(V >> 2, 8);
+   }
+
+   return false;
+ }
+
+ /// Returns true if the scaled-register part of \p AM is legal in Thumb2 for
+ /// an access of type \p VT (which must be a simple type). Negative scales
+ /// are never accepted.
+ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
+                                                       EVT VT) const {
+   int Scale = AM.Scale;
+   if (Scale < 0)
+     return false;
+
+   const MVT::SimpleValueType Ty = VT.getSimpleVT().SimpleTy;
+
+   if (Ty == MVT::i1 || Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
+     if (Scale == 1)
+       return true;
+     // r + r << imm
+     const int EvenScale = Scale & ~1;
+     return EvenScale == 2 || EvenScale == 4 || EvenScale == 8;
+   }
+
+   if (Ty == MVT::i64) {
+     // r + r
+     return ((unsigned)AM.HasBaseReg + Scale) <= 2;
+   }
+
+   if (Ty == MVT::isVoid) {
+     // Note, we allow "void" uses (basically, uses that aren't loads or
+     // stores), because arm allows folding a scale into many arithmetic
+     // operations. This should be made more precise and revisited later.
+
+     // Allow r << imm, but the imm has to be a multiple of two.
+     return (Scale & 1) == 0 && isPowerOf2_32(Scale);
+   }
+
+   return false;
+ }
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                               const AddrMode &AM, Type *Ty,
+                                               unsigned AS) const {
+   EVT VT = getValueType(DL, Ty, true);
+   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
+     return false;
+
+   // Can never fold addr of global into load/store.
+   if (AM.BaseGV)
+     return false;
+
+   // No scale reg, must be "r+i" or "r", or "i".
+   if (AM.Scale == 0)
+     return true;
+
+   // Thumb1 rejects a scale of exactly 1; other scales are checked below.
+   if (AM.Scale == 1 && Subtarget->isThumb1Only())
+     return false;
+
+   // ARM doesn't support any R+R*scale+imm addr modes.
+   if (AM.BaseOffs)
+     return false;
+
+   if (!VT.isSimple())
+     return false;
+
+   if (Subtarget->isThumb2())
+     return isLegalT2ScaledAddressingMode(AM, VT);
+
+   int Scale = AM.Scale;
+   const MVT::SimpleValueType SimpleTy = VT.getSimpleVT().SimpleTy;
+
+   if (SimpleTy == MVT::i1 || SimpleTy == MVT::i8 || SimpleTy == MVT::i32) {
+     // The register offset may be added or subtracted.
+     if (Scale < 0)
+       Scale = -Scale;
+     if (Scale == 1)
+       return true;
+     // r + r << imm
+     return isPowerOf2_32(Scale & ~1);
+   }
+
+   if (SimpleTy == MVT::i16 || SimpleTy == MVT::i64) {
+     // r + r
+     return ((unsigned)AM.HasBaseReg + Scale) <= 2;
+   }
+
+   if (SimpleTy == MVT::isVoid) {
+     // Note, we allow "void" uses (basically, uses that aren't loads or
+     // stores), because arm allows folding a scale into many arithmetic
+     // operations. This should be made more precise and revisited later.
+
+     // Allow r << imm, but the imm has to be a multiple of two.
+     if (Scale & 1)
+       return false;
+     return isPowerOf2_32(Scale);
+   }
+
+   return false;
+ }
+
+ /// isLegalICmpImmediate - Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can compare
+ /// a register against the immediate without having to materialize the
+ /// immediate into a register.
+ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+   if (Subtarget->isThumb()) {
+     // Thumb1 doesn't have cmn, and only 8-bit immediates.
+     if (!Subtarget->isThumb2())
+       return 0 <= Imm && Imm <= 255;
+     // Thumb2 can use cmn for negative immediates.
+     return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
+   }
+   // ARM mode can use cmn for negative immediates.
+   return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
+ }
+
+ /// isLegalAddImmediate - Return true if the specified immediate is a legal add
+ /// *or sub* immediate, that is the target has add or sub instructions which can
+ /// add a register with the immediate without having to materialize the
+ /// immediate into a register.
+ bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+   // Same encoding for add/sub, just flip the sign.
+   const int64_t Magnitude = std::abs(Imm);
+
+   if (Subtarget->isThumb()) {
+     if (Subtarget->isThumb2())
+       return ARM_AM::getT2SOImmVal(Magnitude) != -1;
+     // Thumb1 only has 8-bit unsigned immediate.
+     return Magnitude >= 0 && Magnitude <= 255;
+   }
+   return ARM_AM::getSOImmVal(Magnitude) != -1;
+ }
+
+ /// Splits an ADD/SUB address computation \p Ptr into base and offset for an
+ /// ARM-mode indexed load/store of memory type \p VT.
+ ///
+ /// \param isSEXTLoad true for a sign-extending load (selects addressing mode
+ ///                   3 for i8/i1).
+ /// \param Base,Offset,isInc outputs, written only when true is returned.
+ /// \returns false if \p Ptr is not an ADD/SUB or \p VT is not one of
+ ///          i1/i8/i16/i32.
+ static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
+                                       bool isSEXTLoad, SDValue &Base,
+                                       SDValue &Offset, bool &isInc,
+                                       SelectionDAG &DAG) {
+   const unsigned PtrOpc = Ptr->getOpcode();
+   if (PtrOpc != ISD::ADD && PtrOpc != ISD::SUB)
+     return false;
+
+   const bool IsByteLike = (VT == MVT::i8 || VT == MVT::i1);
+   // AddressingMode 3: halfwords and sign-extending byte loads.
+   const bool UsesMode3 = (VT == MVT::i16) || (IsByteLike && isSEXTLoad);
+   // AddressingMode 2: words and the remaining byte accesses.
+   const bool UsesMode2 = !UsesMode3 && (VT == MVT::i32 || IsByteLike);
+
+   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
+   if (!UsesMode3 && !UsesMode2)
+     return false;
+
+   // A small negative constant addend becomes a decrement by its magnitude.
+   // The magnitude limit is 8 bits in mode 3 and 12 bits in mode 2.
+   if (auto *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+     const int RHSC = (int)RHS->getZExtValue();
+     const int Limit = UsesMode3 ? 256 : 0x1000;
+     if (RHSC < 0 && RHSC > -Limit) {
+       assert(Ptr->getOpcode() == ISD::ADD);
+       isInc = false;
+       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
+       Base = Ptr->getOperand(0);
+       return true;
+     }
+   }
+
+   if (UsesMode2 && PtrOpc == ISD::ADD) {
+     // If the first addend is a shift, use it as the offset and the other
+     // addend as the base.
+     isInc = true;
+     const ARM_AM::ShiftOpc ShOpcVal =
+         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
+     const bool FirstIsShift = (ShOpcVal != ARM_AM::no_shift);
+     Base = Ptr->getOperand(FirstIsShift ? 1 : 0);
+     Offset = Ptr->getOperand(FirstIsShift ? 0 : 1);
+     return true;
+   }
+
+   // Plain base +/- offset.
+   isInc = (PtrOpc == ISD::ADD);
+   Base = Ptr->getOperand(0);
+   Offset = Ptr->getOperand(1);
+   return true;
+ }
+
+ /// Splits an ADD/SUB address computation \p Ptr into base and offset for a
+ /// Thumb2 indexed load/store. Only a non-zero constant offset whose
+ /// magnitude fits in 8 bits is accepted.
+ ///
+ /// \p Base is written as soon as \p Ptr is known to be an ADD/SUB, even if
+ /// false is returned afterwards; \p Offset and \p isInc are written only on
+ /// success. \p VT and \p isSEXTLoad are not consulted.
+ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
+                                      bool isSEXTLoad, SDValue &Base,
+                                      SDValue &Offset, bool &isInc,
+                                      SelectionDAG &DAG) {
+   const unsigned PtrOpc = Ptr->getOpcode();
+   if (PtrOpc != ISD::ADD && PtrOpc != ISD::SUB)
+     return false;
+
+   Base = Ptr->getOperand(0);
+
+   auto *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1));
+   if (!RHS)
+     return false;
+
+   const int RHSC = (int)RHS->getZExtValue();
+   // 8 bit, no zero.
+   if (RHSC == 0 || RHSC <= -0x100 || RHSC >= 0x100)
+     return false;
+
+   if (RHSC < 0) {
+     // Adding a negative constant is a decrement by its magnitude.
+     assert(Ptr->getOpcode() == ISD::ADD);
+     isInc = false;
+     Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
+   } else {
+     isInc = (PtrOpc == ISD::ADD);
+     Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
+   }
+   return true;
+ }
+
+ /// getPreIndexedAddressParts - returns true by value, base pointer and
+ /// offset pointer and addressing mode by reference if the node's address
+ /// can be legally represented as pre-indexed load / store address.
+ /// Always returns false on Thumb1-only subtargets and for nodes that are
+ /// neither loads nor stores.
+ bool
+ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                              SDValue &Offset,
+                                              ISD::MemIndexedMode &AM,
+                                              SelectionDAG &DAG) const {
+   if (Subtarget->isThumb1Only())
+     return false;
+
+   // Pull the address, memory type and sign-extension flag out of the node.
+   EVT MemVT;
+   SDValue Addr;
+   bool isSEXTLoad = false;
+   if (auto *LD = dyn_cast<LoadSDNode>(N)) {
+     Addr = LD->getBasePtr();
+     MemVT = LD->getMemoryVT();
+     isSEXTLoad = (LD->getExtensionType() == ISD::SEXTLOAD);
+   } else if (auto *ST = dyn_cast<StoreSDNode>(N)) {
+     Addr = ST->getBasePtr();
+     MemVT = ST->getMemoryVT();
+   } else {
+     return false;
+   }
+
+   // Let the mode-specific helper split the address.
+   bool isInc;
+   const bool Matched =
+       Subtarget->isThumb2()
+           ? getT2IndexedAddressParts(Addr.getNode(), MemVT, isSEXTLoad, Base,
+                                      Offset, isInc, DAG)
+           : getARMIndexedAddressParts(Addr.getNode(), MemVT, isSEXTLoad, Base,
+                                       Offset, isInc, DAG);
+   if (!Matched)
+     return false;
+
+   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
+   return true;
+ }
+
+ /// getPostIndexedAddressParts - returns true by value, base pointer and
+ /// offset pointer and addressing mode by reference if this node can be
+ /// combined with a load / store to form a post-indexed load / store.
+ /// \p N is the load or store; \p Op is the ADD/SUB that updates its address.
+ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                    SDValue &Base,
+                                                    SDValue &Offset,
+                                                    ISD::MemIndexedMode &AM,
+                                                    SelectionDAG &DAG) const {
+   // Pull the address, memory type and extension/truncation flags out of N.
+   EVT MemVT;
+   SDValue Addr;
+   bool isSEXTLoad = false;
+   bool isNonExt;
+   if (auto *LD = dyn_cast<LoadSDNode>(N)) {
+     MemVT = LD->getMemoryVT();
+     Addr = LD->getBasePtr();
+     const ISD::LoadExtType ExtTy = LD->getExtensionType();
+     isSEXTLoad = (ExtTy == ISD::SEXTLOAD);
+     isNonExt = (ExtTy == ISD::NON_EXTLOAD);
+   } else if (auto *ST = dyn_cast<StoreSDNode>(N)) {
+     MemVT = ST->getMemoryVT();
+     Addr = ST->getBasePtr();
+     isNonExt = !ST->isTruncatingStore();
+   } else {
+     return false;
+   }
+
+   if (Subtarget->isThumb1Only()) {
+     // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
+     // must be non-extending/truncating, i32, with an offset of 4.
+     assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
+     if (Op->getOpcode() != ISD::ADD || !isNonExt)
+       return false;
+     auto *Step = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+     if (!Step || Step->getZExtValue() != 4)
+       return false;
+
+     Offset = Op->getOperand(1);
+     Base = Op->getOperand(0);
+     AM = ISD::POST_INC;
+     return true;
+   }
+
+   // Let the mode-specific helper split the update expression.
+   bool isInc;
+   const bool Matched =
+       Subtarget->isThumb2()
+           ? getT2IndexedAddressParts(Op, MemVT, isSEXTLoad, Base, Offset,
+                                      isInc, DAG)
+           : getARMIndexedAddressParts(Op, MemVT, isSEXTLoad, Base, Offset,
+                                       isInc, DAG);
+   if (!Matched)
+     return false;
+
+   // Swap base ptr and offset to catch more post-index load / store when
+   // it's legal. In Thumb2 mode, offset must be an immediate.
+   if (Addr != Base && Addr == Offset && Op->getOpcode() == ISD::ADD &&
+       !Subtarget->isThumb2())
+     std::swap(Base, Offset);
+
+   // Post-indexed load / store update the base pointer.
+   if (Addr != Base)
+     return false;
+
+   AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+   return true;
+ }
+
+ /// Computes known-zero / known-one bits for ARM-specific nodes. Both outputs
+ /// are first reset to "nothing known" at the bit width of \p KnownOne, then
+ /// refined for:
+ ///  * the second result of ARMISD::ADDC/ADDE/SUBC/SUBE (a boolean),
+ ///  * ARMISD::CMOV (bits known on both selectable values), and
+ ///  * the arm_ldaex / arm_ldrex intrinsics (bits above the memory width).
+ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                       APInt &KnownZero,
+                                                       APInt &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+   const unsigned BitWidth = KnownOne.getBitWidth();
+   KnownZero = KnownOne = APInt(BitWidth, 0);
+
+   switch (Op.getOpcode()) {
+   default:
+     break;
+
+   case ARMISD::ADDC:
+   case ARMISD::ADDE:
+   case ARMISD::SUBC:
+   case ARMISD::SUBE:
+     // These nodes' second result is a boolean, so all bits above bit 0 are
+     // zero. Nothing is known about the first result.
+     if (Op.getResNo() != 0)
+       KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+     break;
+
+   case ARMISD::CMOV: {
+     // Bits are known zero/one if known on the LHS and RHS.
+     DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth + 1);
+     // Nothing known on one side means nothing known overall.
+     if (KnownZero == 0 && KnownOne == 0)
+       return;
+
+     APInt OtherZero, OtherOne;
+     DAG.computeKnownBits(Op.getOperand(1), OtherZero, OtherOne, Depth + 1);
+     KnownZero &= OtherZero;
+     KnownOne &= OtherOne;
+     return;
+   }
+
+   case ISD::INTRINSIC_W_CHAIN: {
+     const auto IntID = static_cast<Intrinsic::ID>(
+         cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue());
+     if (IntID != Intrinsic::arm_ldaex && IntID != Intrinsic::arm_ldrex)
+       return;
+
+     // Every bit above the width of the loaded memory type is zero.
+     EVT MemVT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+     const unsigned MemBits = MemVT.getScalarSizeInBits();
+     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+     return;
+   }
+   }
+ }
+
+//===----------------------------------------------------------------------===//
+// ARM Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// Try to replace a recognised inline-asm call with equivalent IR.  The only
+/// pattern handled is a single "rev $0, $1" statement with "=l,l"
+/// constraints on a 32-bit integer, which is lowered to a byte swap.
+/// Returns true if the call was replaced.
+bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
+  // "rev" is only available on V6 and later.
+  if (!Subtarget->hasV6Ops())
+    return false;
+
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+  std::string AsmStr = IA->getAsmString();
+  SmallVector<StringRef, 4> AsmPieces;
+  SplitString(AsmStr, AsmPieces, ";\n");
+
+  // Anything other than exactly one statement is left alone.
+  if (AsmPieces.size() != 1)
+    return false;
+
+  // Re-split the single statement into mnemonic and operands.
+  AsmStr = AsmPieces[0];
+  AsmPieces.clear();
+  SplitString(AsmStr, AsmPieces, " \t,");
+
+  // rev $0, $1
+  const bool IsRevStatement = AsmPieces.size() == 3 &&
+                              AsmPieces[0] == "rev" &&
+                              AsmPieces[1] == "$0" && AsmPieces[2] == "$1";
+  if (!IsRevStatement)
+    return false;
+  if (IA->getConstraintString().compare(0, 4, "=l,l") != 0)
+    return false;
+
+  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty || Ty->getBitWidth() != 32)
+    return false;
+  return IntrinsicLowering::LowerToByteSwap(CI);
+}
+
+/// Pick a concrete constraint ("r" or "w") to stand in for the permissive
+/// "X" constraint.
+///
+/// This forces the operand into a register even though "X" would allow
+/// more.  That is correct (anything may be emitted for "X"), but it may be
+/// less efficient than what some users expect.
+const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+  // Without VFP2 only "r" is ever returned.
+  if (!Subtarget->hasVFP2())
+    return "r";
+
+  if (ConstraintVT.isFloatingPoint())
+    return "w";
+
+  // 64-bit and 128-bit vectors get "w" when NEON is available.
+  if (ConstraintVT.isVector() && Subtarget->hasNEON()) {
+    const unsigned Bits = ConstraintVT.getSizeInBits();
+    if (Bits == 64 || Bits == 128)
+      return "w";
+  }
+
+  return "r";
+}
+
+/// getConstraintType - Classify an inline-asm constraint string for this
+/// target.  Constraints not recognised here are forwarded to the generic
+/// TargetLowering implementation.
+ARMTargetLowering::ConstraintType
+ARMTargetLowering::getConstraintType(StringRef Constraint) const {
+  switch (Constraint.size()) {
+  case 1:
+    switch (Constraint[0]) {
+    case 'l':
+    case 'w':
+    case 'h':
+    case 'x':
+    case 't':
+      return C_RegisterClass;
+    case 'j': // Constant for movw.
+      return C_Other;
+    case 'Q':
+      // An address with a single base register. Due to the way we
+      // currently handle addresses it is the same as an 'r' memory
+      // constraint.
+      return C_Memory;
+    default:
+      break;
+    }
+    break;
+  case 2:
+    // All 'U+' constraints are addresses.
+    if (Constraint[0] == 'U')
+      return C_Memory;
+    break;
+  default:
+    break;
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Compute how well the operand described by \p info matches the single
+/// constraint \p constraint.  \p info must already carry the operand type
+/// and the currently selected alternative.
+TargetLowering::ConstraintWeight
+ARMTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  // Without a value there is nothing to match against; accept it at the
+  // lowest weight.
+  Value *Operand = info.CallOperandVal;
+  if (!Operand)
+    return CW_Default;
+
+  Type *OperandTy = Operand->getType();
+  switch (*constraint) {
+  case 'l':
+    if (!OperandTy->isIntegerTy())
+      return CW_Invalid;
+    return Subtarget->isThumb() ? CW_SpecificReg : CW_Register;
+  case 'w':
+    return OperandTy->isFloatingPointTy() ? CW_Register : CW_Invalid;
+  default:
+    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+  }
+}
+
+typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
+/// Map an inline-asm register constraint to a (register, register class)
+/// pair.  The single-letter GCC ARM constraints and "{cc}" are handled
+/// here; anything unmatched is forwarded to the generic implementation.
+RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  // Choose between a single-, double- and quad-word class based on VT.
+  // Yields nullptr when VT is MVT::Other or has no matching size.
+  auto PickBySize = [&](const TargetRegisterClass *Single,
+                        const TargetRegisterClass *Double,
+                        const TargetRegisterClass *Quad)
+      -> const TargetRegisterClass * {
+    if (VT == MVT::Other)
+      return nullptr;
+    if (VT == MVT::f32)
+      return Single;
+    if (VT.getSizeInBits() == 64)
+      return Double;
+    if (VT.getSizeInBits() == 128)
+      return Quad;
+    return nullptr;
+  };
+
+  if (Constraint.size() == 1) {
+    // GCC ARM Constraint Letters
+    switch (Constraint[0]) {
+    case 'l': { // Low regs or general regs.
+      const TargetRegisterClass *RC = &ARM::GPRRegClass;
+      if (Subtarget->isThumb())
+        RC = &ARM::tGPRRegClass;
+      return RCPair(0U, RC);
+    }
+    case 'h': // High regs or no regs.
+      if (Subtarget->isThumb())
+        return RCPair(0U, &ARM::hGPRRegClass);
+      break;
+    case 'r': {
+      const TargetRegisterClass *RC = &ARM::GPRRegClass;
+      if (Subtarget->isThumb1Only())
+        RC = &ARM::tGPRRegClass;
+      return RCPair(0U, RC);
+    }
+    case 'w':
+      if (const TargetRegisterClass *RC = PickBySize(
+              &ARM::SPRRegClass, &ARM::DPRRegClass, &ARM::QPRRegClass))
+        return RCPair(0U, RC);
+      break;
+    case 'x':
+      if (const TargetRegisterClass *RC = PickBySize(
+              &ARM::SPR_8RegClass, &ARM::DPR_8RegClass, &ARM::QPR_8RegClass))
+        return RCPair(0U, RC);
+      break;
+    case 't':
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPRRegClass);
+      break;
+    }
+  }
+
+  if (StringRef("{cc}").equals_lower(Constraint))
+    return RCPair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+///
+/// Only single-letter constraints are looked at; longer constraint strings
+/// return without touching Ops.  The immediate constraints 'j' and 'I'..'O'
+/// require \p Op to be a constant that fits in 32 bits and that satisfies
+/// the per-letter, per-subtarget range test below; on success a target
+/// constant is appended to \p Ops.  Every other letter is forwarded to the
+/// generic TargetLowering implementation.
+void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'j':
+  case 'I': case 'J': case 'K': case 'L':
+  case 'M': case 'N': case 'O':
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C)
+      return;
+
+    int64_t CVal64 = C->getSExtValue();
+    int CVal = (int) CVal64;
+    // None of these constraints allow values larger than 32 bits.  Check
+    // that the value fits in an int.
+    if (CVal != CVal64)
+      return;
+
+    // Same bit pattern as CVal.  Used where the test needs negation or
+    // "minus one", which would be signed-overflow undefined behaviour on
+    // CVal itself when it is INT_MIN.
+    const unsigned UVal = static_cast<unsigned>(CVal);
+
+    switch (ConstraintLetter) {
+      case 'j':
+        // Constant suitable for movw, must be between 0 and
+        // 65535.
+        if (Subtarget->hasV6T2Ops())
+          if (CVal >= 0 && CVal <= 65535)
+            break;
+        return;
+      case 'I':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between 0 and 255, for ADD
+          // immediates.
+          if (CVal >= 0 && CVal <= 255)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getT2SOImmVal(CVal) != -1)
+            break;
+        } else {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getSOImmVal(CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'J':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between -255 and -1, for negated ADD
+          // immediates. This can be used in GCC with an "n" modifier that
+          // prints the negated value, for use with SUB instructions. It is
+          // not useful otherwise but is implemented for compatibility.
+          if (CVal >= -255 && CVal <= -1)
+            break;
+        } else {
+          // This must be a constant between -4095 and 4095. It is not clear
+          // what this constraint is intended for. Implemented for
+          // compatibility with GCC.
+          if (CVal >= -4095 && CVal <= 4095)
+            break;
+        }
+        return;
+
+      case 'K':
+        if (Subtarget->isThumb1Only()) {
+          // A 32-bit value where only one byte has a nonzero value. Exclude
+          // zero to match GCC. This constraint is used by GCC internally for
+          // constants that can be loaded with a move/shift combination.
+          // It is not useful otherwise but is implemented for compatibility.
+          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
+            break;
+        } else {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getSOImmVal(~CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'L':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between -7 and 7 (both inclusive),
+          // for 3-operand ADD/SUB immediate instructions.
+          if (CVal >= -7 && CVal <= 7)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.  Negate in unsigned arithmetic so INT_MIN is
+          // well defined.
+          if (ARM_AM::getT2SOImmVal(0U - UVal) != -1)
+            break;
+        } else {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.  Negate in unsigned arithmetic so INT_MIN is
+          // well defined.
+          if (ARM_AM::getSOImmVal(0U - UVal) != -1)
+            break;
+        }
+        return;
+
+      case 'M':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a multiple of 4 between 0 and 1020, for
+          // ADD sp + immediate.
+          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
+            break;
+        } else {
+          // A power of two or a constant between 0 and 32.  This is used in
+          // GCC for the shift amount on shifted register operands, but it is
+          // useful in general for any shift amounts.  The power-of-two test
+          // is done on the unsigned bit pattern to avoid signed overflow.
+          if ((CVal >= 0 && CVal <= 32) || ((UVal & (UVal - 1)) == 0))
+            break;
+        }
+        return;
+
+      case 'N':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a constant between 0 and 31, for shift amounts.
+          if (CVal >= 0 && CVal <= 31)
+            break;
+        }
+        return;
+
+      case 'O':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a multiple of 4 between -508 and 508, for
+          // ADD/SUB sp = sp + immediate.
+          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
+            break;
+        }
+        return;
+    }
+    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
+    break;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+/// Select the combined div/rem runtime library call for node \p N operating
+/// on integer type \p SVT.  SDIVREM and SREM use the signed helpers; UDIVREM
+/// and UREM use the unsigned ones.
+static RTLIB::Libcall getDivRemLibcall(
+    const SDNode *N, MVT::SimpleValueType SVT) {
+  const unsigned Opc = N->getOpcode();
+  assert((Opc == ISD::SDIVREM || Opc == ISD::UDIVREM ||
+          Opc == ISD::SREM || Opc == ISD::UREM) &&
+         "Unhandled Opcode in getDivRemLibcall");
+  const bool Signed = Opc == ISD::SDIVREM || Opc == ISD::SREM;
+
+  switch (SVT) {
+  case MVT::i8:
+    return Signed ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
+  case MVT::i16:
+    return Signed ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
+  case MVT::i32:
+    return Signed ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
+  case MVT::i64:
+    return Signed ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
+  default:
+    llvm_unreachable("Unexpected request for libcall!");
+  }
+}
+
+/// Build the libcall argument list for the div/rem node \p N: one entry per
+/// operand, sign-extended for the signed opcodes and zero-extended
+/// otherwise.  On Windows targets the first two arguments are swapped.
+static TargetLowering::ArgListTy getDivRemArgList(
+    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
+  const unsigned Opc = N->getOpcode();
+  assert((Opc == ISD::SDIVREM || Opc == ISD::UDIVREM ||
+          Opc == ISD::SREM || Opc == ISD::UREM) &&
+         "Unhandled Opcode in getDivRemArgList");
+  const bool Signed = Opc == ISD::SDIVREM || Opc == ISD::SREM;
+
+  TargetLowering::ArgListTy Args;
+  for (unsigned Idx = 0, NumOps = N->getNumOperands(); Idx != NumOps; ++Idx) {
+    const SDValue Operand = N->getOperand(Idx);
+
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = Operand;
+    Entry.Ty = Operand.getValueType().getTypeForEVT(*Context);
+    Entry.isSExt = Signed;
+    Entry.isZExt = !Signed;
+    Args.push_back(Entry);
+  }
+
+  if (Subtarget->isTargetWindows() && Args.size() >= 2)
+    std::swap(Args[0], Args[1]);
+  return Args;
+}
+
+// Lower an SDIVREM/UDIVREM node.  For i32 on subtargets with hardware divide
+// the node is expanded inline into divide, multiply and subtract; otherwise
+// a call to the combined div/rem runtime helper is emitted and the call's
+// result value is returned.
+SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
+ assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
+ Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
+ Subtarget->isTargetWindows()) &&
+ "Register-based DivRem lowering only");
+ unsigned Opcode = Op->getOpcode();
+ assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
+ "Invalid opcode for Div/Rem lowering");
+ bool isSigned = (Opcode == ISD::SDIVREM);
+ EVT VT = Op->getValueType(0);
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+ SDLoc dl(Op);
+
+ // If the target has hardware divide, use divide + multiply + subtract:
+ // div = a / b
+ // rem = a - b * div
+ // return {div, rem}
+ // This should be lowered into UDIV/SDIV + MLS later on.
+ if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() &&
+ Op->getSimpleValueType(0) == MVT::i32) {
+ unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+ const SDValue Dividend = Op->getOperand(0);
+ const SDValue Divisor = Op->getOperand(1);
+ SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
+ SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
+
+ SDValue Values[2] = {Div, Rem};
+ return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
+ }
+
+ // Libcall path: pick the helper matching the signedness and value type.
+ RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
+ VT.getSimpleVT().SimpleTy);
+ SDValue InChain = DAG.getEntryNode();
+
+ TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
+ DAG.getContext(),
+ Subtarget);
+
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy(DAG.getDataLayout()));
+
+ // The call's return type is a two-element struct whose members both have
+ // the type of result 0.
+ Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
+
+ // On Windows the chain is routed through WinDBZCheckDenominator first.
+ if (Subtarget->isTargetWindows())
+ InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(InChain)
+ .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
+ // Only the call's result value is returned; the output chain is dropped.
+ std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+ return CallInfo.first;
+}
+
+// Lower SREM/UREM by calling the combined divmod runtime helper and keeping
+// only the remainder half of its result (see RTABI section 4.2/4.3).
+SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
+  LLVMContext &Ctx = *DAG.getContext();
+  const MVT::SimpleValueType SVT = N->getValueType(0).getSimpleVT().SimpleTy;
+
+  // The helper returns a {div, rem} struct whose members share one type.
+  Type *EltTy;
+  switch (SVT) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:  EltTy = Type::getInt8Ty(Ctx);  break;
+  case MVT::i16: EltTy = Type::getInt16Ty(Ctx); break;
+  case MVT::i32: EltTy = Type::getInt32Ty(Ctx); break;
+  case MVT::i64: EltTy = Type::getInt64Ty(Ctx); break;
+  }
+  Type *RetTyParams[] = {EltTy, EltTy};
+  Type *RetTy = StructType::get(Ctx, RetTyParams);
+
+  RTLIB::Libcall LC = getDivRemLibcall(N, SVT);
+  SDValue InChain = DAG.getEntryNode();
+  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
+                                                    Subtarget);
+  const bool Signed = N->getOpcode() == ISD::SREM;
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy(DAG.getDataLayout()));
+
+  // On Windows the chain is routed through WinDBZCheckDenominator first.
+  if (Subtarget->isTargetWindows())
+    InChain = WinDBZCheckDenominator(DAG, N, InChain);
+
+  // Lower call
+  CallLoweringInfo CLI(DAG);
+  CLI.setChain(InChain);
+  CLI.setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args));
+  CLI.setSExtResult(Signed);
+  CLI.setZExtResult(!Signed);
+  CLI.setDebugLoc(SDLoc(N));
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  // Operand 0 of the result node is the quotient, operand 1 the remainder.
+  SDNode *ResNode = CallResult.first.getNode();
+  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
+  return ResNode->getOperand(1);
+}
+
+/// Lower DYNAMIC_STACKALLOC on Windows: the requested size, shifted right
+/// by two, is copied into R4, a WIN__CHKSTK node is emitted, and the new SP
+/// is read back.  Returns the merged {new SP, chain} pair.
+SDValue
+ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWindows() && "unsupported target platform");
+  SDLoc DL(Op);
+
+  // Operand 0 is the incoming chain, operand 1 the allocation size.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+
+  SDValue ShiftAmt = DAG.getConstant(2, DL, MVT::i32);
+  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, ShiftAmt);
+
+  // Glue the register copy to the stack check.
+  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, SDValue());
+  SDValue Glue = Chain.getValue(1);
+  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL,
+                      DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue);
+
+  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
+  SDValue Ops[2] = { NewSP, NewSP.getValue(1) };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+/// Custom-lower FP_EXTEND to f64 on single-precision-only FPUs by emitting
+/// the matching runtime library call.
+SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
+         "Unexpected type for custom-lowering FP_EXTEND");
+
+  SDValue SrcVal = Op.getOperand(0);
+  const EVT DstVT = Op.getValueType();
+  const RTLIB::Libcall LC = RTLIB::getFPEXT(SrcVal.getValueType(), DstVT);
+
+  return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false,
+                     SDLoc(Op)).first;
+}
+
+/// Custom-lower FP_ROUND from f64 on single-precision-only FPUs by emitting
+/// the matching runtime library call.
+SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getOperand(0).getValueType() == MVT::f64 &&
+         Subtarget->isFPOnlySP() &&
+         "Unexpected type for custom-lowering FP_ROUND");
+
+  SDValue SrcVal = Op.getOperand(0);
+  const EVT DstVT = Op.getValueType();
+  const RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVal.getValueType(), DstVT);
+
+  return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false,
+                     SDLoc(Op)).first;
+}
+
+/// Offsets are never folded into global addresses: the ARM target isn't yet
+/// aware of offsets, so this always answers false.
+bool ARMTargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode * /*GA*/) const {
+  return false;
+}
+
+/// Return true if \p v is an inverted bit-field mask: there can be 1's on
+/// either or both "outsides" and all the "inside" bits must be 0's.  The
+/// all-ones value is rejected.
+bool ARM::isBitFieldInvertedMask(unsigned v) {
+  return v != 0xffffffff && isShiftedMask_32(~v);
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively.  If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+///
+/// Requires VFP3.  f32 immediates must be encodable per getFP32Imm; f64
+/// immediates must be encodable per getFP64Imm and are rejected on
+/// single-precision-only FPUs.
+bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (!Subtarget->hasVFP3())
+    return false;
+
+  if (VT == MVT::f32)
+    return ARM_AM::getFP32Imm(Imm) != -1;
+
+  if (VT == MVT::f64)
+    return !Subtarget->isFPOnlySP() && ARM_AM::getFP64Imm(Imm) != -1;
+
+  return false;
+}
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+///
+/// The exclusive load/store intrinsics (ldrex/ldaex/strex/stlex and their
+/// 64-bit "d" forms) are described here as well. Fills in \p Info and
+/// returns true for a handled intrinsic; returns false for any other ID.
+bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ unsigned Intrinsic) const {
+ switch (Intrinsic) {
+ case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ // The memory type is a vector of i64 with one element per 64 bits of the
+ // call's return type.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ // The last argument is a constant giving the alignment.
+ Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+ Info.vol = false; // volatile loads with NEON intrinsics not supported
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ unsigned NumElts = 0;
+ // Sum the sizes of the vector arguments, starting at argument 1 and
+ // stopping at the first argument that is not a vector.
+ for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ if (!ArgTy->isVectorTy())
+ break;
+ NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+ }
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ // The last argument is a constant giving the alignment.
+ Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+ Info.vol = false; // volatile stores with NEON intrinsics not supported
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ case Intrinsic::arm_ldaex:
+ case Intrinsic::arm_ldrex: {
+ // Exclusive load: memory type and ABI alignment come from the pointee
+ // type of argument 0. The access is marked volatile.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.vol = true;
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::arm_stlex:
+ case Intrinsic::arm_strex: {
+ // Exclusive store: the pointer is argument 1; memory type and ABI
+ // alignment come from its pointee type. The access is marked volatile.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.ptrVal = I.getArgOperand(1);
+ Info.offset = 0;
+ Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.vol = true;
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ case Intrinsic::arm_stlexd:
+ case Intrinsic::arm_strexd: {
+ // 64-bit exclusive store: the pointer is argument 2, the memory type is
+ // i64 and the alignment is fixed at 8.
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i64;
+ Info.ptrVal = I.getArgOperand(2);
+ Info.offset = 0;
+ Info.align = 8;
+ Info.vol = true;
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ case Intrinsic::arm_ldaexd:
+ case Intrinsic::arm_ldrexd: {
+ // 64-bit exclusive load: the pointer is argument 0, the memory type is
+ // i64 and the alignment is fixed at 8.
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i64;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = 8;
+ Info.vol = true;
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ default:
+ break;
+ }
+
+ return false;
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+///
+/// The answer is yes for integer types whose primitive size is non-zero and
+/// at most 32 bits.  \p Imm is not inspected.
+bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                          Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  const unsigned Bits = Ty->getPrimitiveSizeInBits();
+  return Bits != 0 && Bits <= 32;
+}
+
+/// Extracting a subvector of type \p ResVT is considered cheap only when
+/// EXTRACT_SUBVECTOR is legal or custom for that type and \p Index is
+/// either 0 or equal to the number of elements in \p ResVT.
+bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  if (Index == 0)
+    return true;
+  return Index == ResVT.getVectorNumElements();
+}
+
+/// Emit a data memory barrier for \p Domain at the builder's insertion
+/// point and return the created call.  Subtargets without a DMB instruction
+/// fall back to an mcr-based barrier where one exists.
+Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
+                                        ARM_MB::MemBOpt Domain) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+
+  if (Subtarget->hasDataBarrier()) {
+    Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
+    // Only a full system barrier exists in the M-class architectures.
+    if (Subtarget->isMClass())
+      Domain = ARM_MB::SY;
+    Constant *CDomain = Builder.getInt32(Domain);
+    return Builder.CreateCall(DMB, CDomain);
+  }
+
+  // No DMB available; see what fallback we can use.
+  // Some ARMv6 cpus can support data barriers with an mcr instruction.
+  // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+  // here.
+  if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
+    Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
+    Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
+                      Builder.getInt32(0), Builder.getInt32(7),
+                      Builder.getInt32(10), Builder.getInt32(5)};
+    return Builder.CreateCall(MCR, args);
+  }
+
+  // Instead of using barriers, atomic accesses on these subtargets use
+  // libcalls.
+  llvm_unreachable("makeDMB on a target so old that it has no barriers");
+}
+
+// Emit the barrier, if any, that must precede an atomic access with
+// ordering Ord.  Returns nullptr when no barrier is needed.
+// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+                                                 AtomicOrdering Ord, bool IsStore,
+                                                 bool IsLoad) const {
+  switch (Ord) {
+  case AtomicOrdering::NotAtomic:
+  case AtomicOrdering::Unordered:
+    llvm_unreachable("Invalid fence: unordered/non-atomic");
+
+  case AtomicOrdering::Monotonic:
+  case AtomicOrdering::Acquire:
+    return nullptr; // Nothing to do
+
+  case AtomicOrdering::SequentiallyConsistent:
+    // Only sequentially consistent stores need a leading barrier.
+    if (!IsStore)
+      return nullptr; // Nothing to do
+    /*FALLTHROUGH*/
+  case AtomicOrdering::Release:
+  case AtomicOrdering::AcquireRelease:
+    // FIXME: add a comment with a link to documentation justifying the
+    // ISHST choice.
+    return makeDMB(Builder, Subtarget->preferISHSTBarriers() ? ARM_MB::ISHST
+                                                            : ARM_MB::ISH);
+  }
+  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
+}
+
+// Emit the barrier, if any, that must follow an atomic access with ordering
+// Ord.  Acquire and stronger orderings get an ISH barrier; Monotonic and
+// Release need nothing and yield nullptr.
+Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+                                                  AtomicOrdering Ord, bool IsStore,
+                                                  bool IsLoad) const {
+  switch (Ord) {
+  case AtomicOrdering::Acquire:
+  case AtomicOrdering::AcquireRelease:
+  case AtomicOrdering::SequentiallyConsistent:
+    return makeDMB(Builder, ARM_MB::ISH);
+
+  case AtomicOrdering::Monotonic:
+  case AtomicOrdering::Release:
+    return nullptr; // Nothing to do
+
+  case AtomicOrdering::NotAtomic:
+  case AtomicOrdering::Unordered:
+    llvm_unreachable("Invalid fence: unordered/not-atomic");
+  }
+  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
+}
+
+// Only 64-bit atomic stores are expanded in IR, and only off M-class.
+// Stores narrower than 64 bits are already atomic; wider ones are left to
+// the default libcall.  Cortex M has no ldrexd/strexd, so nothing is
+// emitted for it.
+bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  if (Subtarget->isMClass())
+    return false;
+
+  Type *StoredTy = SI->getValueOperand()->getType();
+  return StoredTy->getPrimitiveSizeInBits() == 64;
+}
+
+// Only 64-bit atomic loads are expanded (as load-linked only), and only off
+// M-class.  Loads narrower than 64 bits are already atomic; wider ones are
+// left to the default libcall.  Cortex M has no ldrexd/strexd, so nothing is
+// emitted for it.
+// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
+// guarantee, see DDI0406C ARM architecture reference manual,
+// sections A8.8.72-74 LDRD)
+TargetLowering::AtomicExpansionKind
+ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  const unsigned Size = LI->getType()->getPrimitiveSizeInBits();
+  if (Size == 64 && !Subtarget->isMClass())
+    return AtomicExpansionKind::LLOnly;
+  return AtomicExpansionKind::None;
+}
+
+// For the real atomic operations, we have ldrex/strex up to 32 bits,
+// and up to 64 bits on the non-M profiles.  Read-modify-write operations
+// within that limit are expanded to an LL/SC loop unless the subtarget is
+// Thumb without the v8-M baseline ops.
+TargetLowering::AtomicExpansionKind
+ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  const unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  const unsigned MaxSize = Subtarget->isMClass() ? 32U : 64U;
+  const bool HasLLSC =
+      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
+
+  if (HasLLSC && Size <= MaxSize)
+    return AtomicExpansionKind::LLSC;
+  return AtomicExpansionKind::None;
+}
+
+/// Decide whether cmpxchg is expanded in IR.  It is not expanded at -O0, nor
+/// on Thumb subtargets without the v8-M baseline ops.
+bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
+    AtomicCmpXchgInst *AI) const {
+  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+  // implement cmpxchg without spilling. If the address being exchanged is
+  // also on the stack and close enough to the spill slot, this can lead to a
+  // situation where the monitor always gets cleared and the atomic operation
+  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+  if (getTargetMachine().getOptLevel() == 0)
+    return false;
+
+  return !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
+}
+
+/// Report the InsertFencesForAtomic setting.  The instruction argument is
+/// not consulted.
+bool ARMTargetLowering::shouldInsertFencesForAtomic(
+    const Instruction * /*I*/) const {
+  return InsertFencesForAtomic;
+}
+
+// The load-stack-guard node is used only on MachO targets; it has so far
+// only been implemented there.
+bool ARMTargetLowering::useLoadStackGuardNode() const {
+  const bool IsMachO = Subtarget->isTargetMachO();
+  return IsMachO;
+}
+
+/// Return true if a store of an element extracted from \p VectorTy at index
+/// \p Idx can be combined into a single operation.  \p Cost is set to 0 on
+/// success and left untouched otherwise.
+bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                                  unsigned &Cost) const {
+  // If we do not have NEON, vector types are not natively supported.
+  if (!Subtarget->hasNEON())
+    return false;
+
+  // Floating point values and vector values map to the same register file.
+  // Therefore, although we could do a store extract of a vector type, this
+  // is better to leave at float as we have more freedom in the addressing
+  // mode for those.
+  if (VectorTy->isFPOrFPVectorTy())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+
+  // We can do a store + vector extract on any vector that fits perfectly in
+  // a D or Q register.
+  const unsigned Bits = cast<VectorType>(VectorTy)->getBitWidth();
+  if (Bits != 64 && Bits != 128)
+    return false;
+
+  Cost = 0;
+  return true;
+}
+
+/// Speculating cttz is reported as cheap exactly when the subtarget has the
+/// V6T2 ops.
+bool ARMTargetLowering::isCheapToSpeculateCttz() const {
+  const bool HasV6T2 = Subtarget->hasV6T2Ops();
+  return HasV6T2;
+}
+
+/// Speculating ctlz is reported as cheap exactly when the subtarget has the
+/// V6T2 ops.
+bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
+  const bool HasV6T2 = Subtarget->hasV6T2Ops();
+  return HasV6T2;
+}
+
+/// Emit a load-linked (ldrex / ldaex) of the value at \p Addr and return the
+/// loaded value with the pointee type of \p Addr.  The acquire form is used
+/// when \p Ord is acquire or stronger.
+Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                         AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+  const bool IsAcquire = isAcquireOrStronger(Ord);
+
+  if (ValTy->getPrimitiveSizeInBits() != 64) {
+    // Overloaded intrinsic, specialised on the pointer type.
+    Type *Tys[] = { Addr->getType() };
+    Intrinsic::ID Int =
+        IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
+    Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+    Value *Loaded = Builder.CreateCall(Ldrex, Addr);
+    return Builder.CreateTruncOrBitCast(Loaded, ValTy);
+  }
+
+  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
+  // intrinsic must return {i32, i32} and we have to recombine them into a
+  // single i64 here.
+  Intrinsic::ID Int =
+      IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
+  Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
+
+  Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+  Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
+
+  Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+  Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+  // On big-endian subtargets the two halves come back in the other order.
+  if (!Subtarget->isLittle())
+    std::swap (Lo, Hi);
+  Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+  Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+  Value *HiShifted = Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32));
+  return Builder.CreateOr(Lo, HiShifted, "val64");
+}
+
+/// Emit a call to the arm_clrex intrinsic at the builder's insertion point.
+/// Nothing is emitted on subtargets without the V7 ops.
+void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
+    IRBuilder<> &Builder) const {
+  if (!Subtarget->hasV7Ops())
+    return;
+
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *Clrex = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex);
+  Builder.CreateCall(Clrex);
+}
+
+Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+ Value *Addr,
+ AtomicOrdering Ord) const { // returns the strex/stlex-family intrinsic call
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ bool IsRelease = isReleaseOrStronger(Ord); // selects the stlex* intrinsics below
+
+ // Since the intrinsics must have legal type, the i64 intrinsics take two
+ // value parameters: "i32, i32". We must marshal Val into the appropriate
+ // form before the call.
+ if (Val->getType()->getPrimitiveSizeInBits() == 64) {
+ Intrinsic::ID Int =
+ IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
+ Function *Strex = Intrinsic::getDeclaration(M, Int);
+ Type *Int32Ty = Type::getInt32Ty(M->getContext());
+
+ Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); // low 32 bits of Val
+ Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); // high 32 bits of Val
+ if (!Subtarget->isLittle())
+ std::swap (Lo, Hi); // not little-endian: the halves are passed in the other order
+ Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); // the call is made on an i8* address
+ return Builder.CreateCall(Strex, {Lo, Hi, Addr});
+ }
+
+ Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
+ Type *Tys[] = { Addr->getType() }; // the intrinsic declaration is requested for Addr's pointer type
+ Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
+
+ return Builder.CreateCall(
+ Strex, {Builder.CreateZExtOrBitCast( // Val is converted to the intrinsic's first parameter type
+ Val, Strex->getFunctionType()->getParamType(0)),
+ Addr});
+}
+
+/// \brief Lower an interleaved load into a vldN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
+/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
+/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
+///
+/// Into:
+/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
+/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+bool ARMTargetLowering::lowerInterleavedLoad(
+ LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices, unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+ assert(!Shuffles.empty() && "Empty shufflevector input");
+ assert(Shuffles.size() == Indices.size() &&
+ "Unmatched number of shufflevectors and indices");
+
+ VectorType *VecTy = Shuffles[0]->getType(); // the first shuffle's type is used as the per-lane vector type
+ Type *EltTy = VecTy->getVectorElementType();
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+ bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
+
+ // Skip if we do not have NEON and skip illegal vector types and vector types
+ // with i64/f64 elements (vldN doesn't support i64/f64 elements).
+ if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
+ return false; // nothing has been modified at this point
+
+ // A pointer vector cannot be the return type of the vldN intrinsics. Need to
+ // load integer vectors first and then convert to pointer vectors.
+ if (EltTy->isPointerTy())
+ VecTy =
+ VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+
+ static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+ Intrinsic::arm_neon_vld3,
+ Intrinsic::arm_neon_vld4};
+
+ IRBuilder<> Builder(LI);
+ SmallVector<Value *, 2> Ops;
+
+ Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+ Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); // operand 0: the address as i8*
+ Ops.push_back(Builder.getInt32(LI->getAlignment())); // operand 1: the load's alignment
+
+ Type *Tys[] = { VecTy, Int8Ptr };
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); // Factor 2/3/4 -> vld2/vld3/vld4
+ CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+
+ // Replace uses of each shufflevector with the corresponding vector loaded
+ // by vldN.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SV = Shuffles[i];
+ unsigned Index = Indices[i];
+
+ Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+
+ SV->replaceAllUsesWith(SubVec); // the shuffle itself is not erased here
+ }
+
+ return true;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>, built as i32 constants.
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+ unsigned NumElts) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < NumElts; i++)
+ Mask.push_back(Builder.getInt32(Start + i));
+
+ return ConstantVector::get(Mask);
+}
+
+/// \brief Lower an interleaved store into a vstN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
+///
+/// Into:
+/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
+/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
+/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
+/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate one
+/// vst3 instruction in CodeGen.
+///
+/// Example for a more general valid mask (Factor 3). Lower:
+/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+/// store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
+/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
+/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
+/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
+bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+ ShuffleVectorInst *SVI,
+ unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
+ VectorType *VecTy = SVI->getType();
+ assert(VecTy->getVectorNumElements() % Factor == 0 &&
+ "Invalid interleaved store");
+
+ unsigned LaneLen = VecTy->getVectorNumElements() / Factor; // elements per sub vector
+ Type *EltTy = VecTy->getVectorElementType();
+ VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+ bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
+
+ // Skip if we do not have NEON and skip illegal vector types and vector types
+ // with i64/f64 elements (vstN doesn't support i64/f64 elements).
+ if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
+ EltIs64Bits)
+ return false; // nothing has been modified at this point
+
+ Value *Op0 = SVI->getOperand(0);
+ Value *Op1 = SVI->getOperand(1);
+ IRBuilder<> Builder(SI);
+
+ // vstN intrinsics don't support pointer vectors as arguments. Convert pointer
+ // vectors to integer vectors.
+ if (EltTy->isPointerTy()) {
+ Type *IntTy = DL.getIntPtrType(EltTy);
+
+ // Convert to the corresponding integer vector.
+ Type *IntVecTy =
+ VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
+ Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+ Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+ SubVecTy = VectorType::get(IntTy, LaneLen);
+ }
+
+ static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+ Intrinsic::arm_neon_vst3,
+ Intrinsic::arm_neon_vst4};
+ SmallVector<Value *, 6> Ops;
+
+ Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+ Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); // first operand: the address as i8*
+
+ Type *Tys[] = { Int8Ptr, SubVecTy };
+ Function *VstNFunc = Intrinsic::getDeclaration(
+ SI->getModule(), StoreInts[Factor - 2], Tys); // Factor 2/3/4 -> vst2/vst3/vst4
+
+ // Split the shufflevector operands into sub vectors for the new vstN call.
+ auto Mask = SVI->getShuffleMask();
+ for (unsigned i = 0; i < Factor; i++) {
+ if (Mask[i] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) { // j = 0 is Mask[i], already known to be negative
+ if (Mask[j*Factor + i] >= 0) {
+ StartMask = Mask[j*Factor + i] - j; // step back j positions to the sub vector's first index
+ break;
+ }
+ }
+ // Note: If all elements in a chunk are undefs, StartMask stays 0, i.e. we
+ // default to using elements starting from index 0.
+ // Note: Filling undef gaps with arbitrary elements is ok, since those
+ // elements were being written anyway (with undefs).
+ // Note: StartMask cannot be negative, it's checked in isReInterleaveMask.
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+ }
+ }
+
+ Ops.push_back(Builder.getInt32(SI->getAlignment())); // last operand: the store's alignment
+ Builder.CreateCall(VstNFunc, Ops);
+ return true;
+}
+
+enum HABaseType { // base element kind tracked while classifying an aggregate
+ HA_UNKNOWN = 0, // no base kind chosen yet
+ HA_FLOAT, // float members
+ HA_DOUBLE, // double members
+ HA_VECT64, // vector members with a bit width of 64
+ HA_VECT128 // vector members with a bit width of 128
+};
+
+static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, // Base is in/out and shared across the recursion
+ uint64_t &Members) { // Members is accumulated into for structs/arrays and set to 1 for leaves
+ if (auto *ST = dyn_cast<StructType>(Ty)) {
+ for (unsigned i = 0; i < ST->getNumElements(); ++i) {
+ uint64_t SubMembers = 0;
+ if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
+ return false;
+ Members += SubMembers;
+ }
+ } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
+ uint64_t SubMembers = 0;
+ if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
+ return false;
+ Members += SubMembers * AT->getNumElements(); // the element's member count, once per array element
+ } else if (Ty->isFloatTy()) {
+ if (Base != HA_UNKNOWN && Base != HA_FLOAT)
+ return false; // a different base kind was already chosen
+ Members = 1;
+ Base = HA_FLOAT;
+ } else if (Ty->isDoubleTy()) {
+ if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
+ return false; // a different base kind was already chosen
+ Members = 1;
+ Base = HA_DOUBLE;
+ } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
+ Members = 1;
+ switch (Base) { // every case returns directly, bypassing the range check at the end
+ case HA_FLOAT:
+ case HA_DOUBLE:
+ return false;
+ case HA_VECT64:
+ return VT->getBitWidth() == 64;
+ case HA_VECT128:
+ return VT->getBitWidth() == 128;
+ case HA_UNKNOWN:
+ switch (VT->getBitWidth()) {
+ case 64:
+ Base = HA_VECT64;
+ return true;
+ case 128:
+ Base = HA_VECT128;
+ return true;
+ default:
+ return false;
+ }
+ }
+ }
+
+ return (Members > 0 && Members <= 4); // any other type leaves Members untouched, so this fails when the caller passed 0
+}
+
+/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or an
+/// array whose element type is an integer (e.g. [N x i32] or [N x i64]). This
+/// allows front-ends to skip emitting padding when passing per AAPCS rules.
+bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+ if (getEffectiveCallingConv(CallConv, isVarArg) !=
+ CallingConv::ARM_AAPCS_VFP)
+ return false; // only the effective ARM_AAPCS_VFP convention is considered
+
+ HABaseType Base = HA_UNKNOWN;
+ uint64_t Members = 0;
+ bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
+ DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+
+ bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); // no check on the integer width
+ return IsHA || IsIntArray;
+}
+
+unsigned ARMTargetLowering::getExceptionPointerRegister(
+ const Constant *PersonalityFn) const { // PersonalityFn is not consulted
+ // Platforms which do not use SjLj EH may return values in these registers
+ // via the personality function.
+ return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; // SjLj EH: no register; otherwise R0
+}
+
+unsigned ARMTargetLowering::getExceptionSelectorRegister(
+ const Constant *PersonalityFn) const { // PersonalityFn is not consulted
+ // Platforms which do not use SjLj EH may return values in these registers
+ // via the personality function.
+ return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; // SjLj EH: no register; otherwise R1
+}
+
+void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+ // Set IsSplitCSR on the ARMFunctionInfo of the function that owns Entry.
+ ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
+ AFI->setIsSplitCSR(true);
+}
+
+void ARMTargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return; // null list: nothing to copy
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin(); // entry copies are inserted before this position
+ for (const MCPhysReg *I = IStart; *I; ++I) { // the walk stops at the first zero entry
+ const TargetRegisterClass *RC = nullptr;
+ if (ARM::GPRRegClass.contains(*I))
+ RC = &ARM::GPRRegClass;
+ else if (ARM::DPRRegClass.contains(*I))
+ RC = &ARM::DPRRegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ // FIXME: this currently does not emit CFI pseudo-instructions, it works
+ // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+ // nounwind. If we want to generalize this later, we may need to emit
+ // CFI pseudo-instructions.
+ assert(Entry->getParent()->getFunction()->hasFnAttribute(
+ Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions before each exit block's first terminator.
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
new file mode 100644
index 000000000000..5255d82d647a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -0,0 +1,720 @@
+//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
+#define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include <vector>
+
+namespace llvm {
+ class ARMConstantPoolValue;
+ class ARMSubtarget;
+
+ namespace ARMISD {
+ // ARM Specific DAG Nodes
+ enum NodeType : unsigned {
+ // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ Wrapper, // Wrapper - A wrapper node for TargetConstantPool,
+ // TargetExternalSymbol, and TargetGlobalAddress.
+ WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in
+ // PIC mode.
+ WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable
+
+ // Add pseudo op to model memcpy for struct byval.
+ COPY_STRUCT_BYVAL,
+
+ CALL, // Function call.
+ CALL_PRED, // Function call that's predicable.
+ CALL_NOLINK, // Function call with branch not branch-and-link.
+ BRCOND, // Conditional branch.
+ BR_JT, // Jumptable branch.
+ BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
+ RET_FLAG, // Return with a flag operand.
+ INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand.
+
+ PIC_ADD, // Add with a PC operand and a PIC label.
+
+ CMP, // ARM compare instructions.
+ CMN, // ARM CMN instructions.
+ CMPZ, // ARM compare that sets only Z flag.
+ CMPFP, // ARM VFP compare instruction, sets FPSCR.
+ CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR.
+ FMSTAT, // ARM fmstat instruction.
+
+ CMOV, // ARM conditional move instructions.
+
+ SSAT, // Signed saturation
+
+ BCC_i64,
+
+ SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out.
+ SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out.
+ RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag.
+
+ ADDC, // Add with carry
+ ADDE, // Add using carry
+ SUBC, // Sub with carry
+ SUBE, // Sub using carry
+
+ VMOVRRD, // double to two gprs.
+ VMOVDRR, // Two gprs to double.
+
+ EH_SJLJ_SETJMP, // SjLj exception handling setjmp.
+ EH_SJLJ_LONGJMP, // SjLj exception handling longjmp.
+ EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch.
+
+ TC_RETURN, // Tail call return pseudo.
+
+ THREAD_POINTER,
+
+ DYN_ALLOC, // Dynamic allocation on the stack.
+
+ MEMBARRIER_MCR, // Memory barrier (MCR)
+
+ PRELOAD, // Preload
+
+ WIN__CHKSTK, // Windows' __chkstk call to do stack probing.
+ WIN__DBZCHK, // Windows' divide by zero check
+
+ VCEQ, // Vector compare equal.
+ VCEQZ, // Vector compare equal to zero.
+ VCGE, // Vector compare greater than or equal.
+ VCGEZ, // Vector compare greater than or equal to zero.
+ VCLEZ, // Vector compare less than or equal to zero.
+ VCGEU, // Vector compare unsigned greater than or equal.
+ VCGT, // Vector compare greater than.
+ VCGTZ, // Vector compare greater than zero.
+ VCLTZ, // Vector compare less than zero.
+ VCGTU, // Vector compare unsigned greater than.
+ VTST, // Vector test bits.
+
+ // Vector shift by immediate:
+ VSHL, // ...left
+ VSHRs, // ...right (signed)
+ VSHRu, // ...right (unsigned)
+
+ // Vector rounding shift by immediate:
+ VRSHRs, // ...right (signed)
+ VRSHRu, // ...right (unsigned)
+ VRSHRN, // ...right narrow
+
+ // Vector saturating shift by immediate:
+ VQSHLs, // ...left (signed)
+ VQSHLu, // ...left (unsigned)
+ VQSHLsu, // ...left (signed to unsigned)
+ VQSHRNs, // ...right narrow (signed)
+ VQSHRNu, // ...right narrow (unsigned)
+ VQSHRNsu, // ...right narrow (signed to unsigned)
+
+ // Vector saturating rounding shift by immediate:
+ VQRSHRNs, // ...right narrow (signed)
+ VQRSHRNu, // ...right narrow (unsigned)
+ VQRSHRNsu, // ...right narrow (signed to unsigned)
+
+ // Vector shift and insert:
+ VSLI, // ...left
+ VSRI, // ...right
+
+ // Vector get lane (VMOV scalar to ARM core register)
+ // (These are used for 8- and 16-bit element types only.)
+ VGETLANEu, // zero-extend vector extract element
+ VGETLANEs, // sign-extend vector extract element
+
+ // Vector move immediate and move negated immediate:
+ VMOVIMM,
+ VMVNIMM,
+
+ // Vector move f32 immediate:
+ VMOVFPIMM,
+
+ // Vector duplicate:
+ VDUP,
+ VDUPLANE,
+
+ // Vector shuffles:
+ VEXT, // extract
+ VREV64, // reverse elements within 64-bit doublewords
+ VREV32, // reverse elements within 32-bit words
+ VREV16, // reverse elements within 16-bit halfwords
+ VZIP, // zip (interleave)
+ VUZP, // unzip (deinterleave)
+ VTRN, // transpose
+ VTBL1, // 1-register shuffle with mask
+ VTBL2, // 2-register shuffle with mask
+
+ // Vector multiply long:
+ VMULLs, // ...signed
+ VMULLu, // ...unsigned
+
+ UMLAL, // 64-bit Unsigned Accumulate Multiply
+ SMLAL, // 64-bit Signed Accumulate Multiply
+ UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply
+
+ // Operands of the standard BUILD_VECTOR node are not legalized, which
+ // is fine if BUILD_VECTORs are always lowered to shuffles or other
+ // operations, but for ARM some BUILD_VECTORs are legal as-is and their
+ // operands need to be legalized. Define an ARM-specific version of
+ // BUILD_VECTOR for this purpose.
+ BUILD_VECTOR,
+
+ // Bit-field insert
+ BFI,
+
+ // Vector OR with immediate
+ VORRIMM,
+ // Vector AND with NOT of immediate
+ VBICIMM,
+
+ // Vector bitwise select
+ VBSL,
+
+ // Pseudo-instruction representing a memory copy using ldm/stm
+ // instructions.
+ MEMCPY,
+
+ // Vector load N-element structure to all lanes:
+ VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, // enumerators after this one continue counting from this value
+ VLD2DUP,
+ VLD3DUP,
+ VLD4DUP,
+
+ // NEON loads with post-increment base updates:
+ VLD1_UPD,
+ VLD2_UPD,
+ VLD3_UPD,
+ VLD4_UPD,
+ VLD2LN_UPD,
+ VLD3LN_UPD,
+ VLD4LN_UPD,
+ VLD1DUP_UPD,
+ VLD2DUP_UPD,
+ VLD3DUP_UPD,
+ VLD4DUP_UPD,
+
+ // NEON stores with post-increment base updates:
+ VST1_UPD,
+ VST2_UPD,
+ VST3_UPD,
+ VST4_UPD,
+ VST2LN_UPD,
+ VST3LN_UPD,
+ VST4LN_UPD
+ };
+ }
+
+ /// Predicates used for node matching (declaration only at this point).
+ namespace ARM {
+ bool isBitFieldInvertedMask(unsigned v);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // ARMTargetLowering - ARM Implementation of the TargetLowering interface
+
+ class ARMTargetLowering : public TargetLowering {
+ public:
+ explicit ARMTargetLowering(const TargetMachine &TM,
+ const ARMSubtarget &STI);
+
+ unsigned getJumpTableEncoding() const override;
+ bool useSoftFloat() const override;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// ReplaceNodeResults - Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ bool isSelectSupported(SelectSupportKind Kind) const override {
+ // ARM does not support scalar condition selects on vectors.
+ return (Kind != ScalarCondVectorVal);
+ }
+
+ /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const override;
+
+ SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
+ SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
+ SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;
+
+ /// allowsMisalignedMemoryAccesses - Returns true if the target allows
+ /// unaligned memory accesses of the specified type. Returns whether it
+ /// is "fast" by reference in the second argument.
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+ unsigned Align,
+ bool *Fast) const override;
+
+ EVT getOptimalMemOpType(uint64_t Size,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ using TargetLowering::isZExtFree;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
+
+ bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
+
+ /// getScalingFactorCost - Return the cost of the scaling used in
+ /// addressing mode represented by AM.
+ /// If the AM is supported, the return value must be >= 0.
+ /// If the AM is not supported, the return value must be negative.
+ int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
+
+ /// isLegalICmpImmediate - Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+
+ /// isLegalAddImmediate - Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can
+ /// add a register and the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalAddImmediate(int64_t Imm) const override;
+
+ /// getPreIndexedAddressParts - returns true by value, base pointer and
+ /// offset pointer and addressing mode by reference if the node's address
+ /// can be legally represented as pre-indexed load / store address.
+ bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ /// getPostIndexedAddressParts - returns true by value, base pointer and
+ /// offset pointer and addressing mode by reference if this node can be
+ /// combined with a load / store to form a post-indexed load / store.
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+ SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
+
+ bool ExpandInlineAsm(CallInst *CI) const override;
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const override;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops. If hasMemory is
+ /// true it means one of the asm constraint of the inline asm instruction
+ /// being processed is 'm'.
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "Q")
+ return InlineAsm::Constraint_Q;
+ else if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
+ else if (ConstraintCode.size() == 2) {
+ if (ConstraintCode[0] == 'U') {
+ switch(ConstraintCode[1]) {
+ default:
+ break;
+ case 'm':
+ return InlineAsm::Constraint_Um;
+ case 'n':
+ return InlineAsm::Constraint_Un;
+ case 'q':
+ return InlineAsm::Constraint_Uq;
+ case 's':
+ return InlineAsm::Constraint_Us;
+ case 't':
+ return InlineAsm::Constraint_Ut;
+ case 'v':
+ return InlineAsm::Constraint_Uv;
+ case 'y':
+ return InlineAsm::Constraint_Uy;
+ }
+ }
+ }
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ const ARMSubtarget* getSubtarget() const {
+ return Subtarget;
+ }
+
+ /// getRegClassFor - Return the register class that should be used for the
+ /// specified value type.
+ const TargetRegisterClass *getRegClassFor(MVT VT) const override;
+
+ /// Returns true if a cast between SrcAS and DestAS is a noop.
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Addrspacecasts are always noops.
+ return true;
+ }
+
+ bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+ unsigned &PrefAlign) const override;
+
+ /// createFastISel - This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
+
+ Sched::Preference getSchedulingPreference(SDNode *N) const override;
+
+ bool
+ isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ /// isFPImmLegal - Returns true if the target can instruction select the
+ /// specified FP immediate natively. If false, the legalizer will
+ /// materialize the FP immediate as a load from a constant pool.
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ unsigned Intrinsic) const override;
+
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+ /// with this index.
+ bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
+ /// \brief Returns true if an argument of type Ty needs to be passed in a
+ /// contiguous block of registers in calling convention CallConv.
+ bool functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+ Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
+ Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) const override;
+ Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+ Value *Addr, AtomicOrdering Ord) const override;
+
+ void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+
+ Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+ bool IsStore, bool IsLoad) const override;
+ Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+ bool IsStore, bool IsLoad) const override;
+
+ unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+ bool lowerInterleavedLoad(LoadInst *LI,
+ ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices,
+ unsigned Factor) const override;
+ bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const override;
+
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+ bool useLoadStackGuardNode() const override;
+
+ bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+ unsigned &Cost) const override;
+
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+
+ bool supportSwiftError() const override {
+ return true;
+ }
+
+ bool hasStandaloneRem(EVT VT) const override {
+ return HasStandaloneRem;
+ }
+
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
+ CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;
+
+ protected:
+ std::pair<const TargetRegisterClass *, uint8_t>
+ findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const override;
+
+ private:
+ /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const ARMSubtarget *Subtarget;
+
+ const TargetRegisterInfo *RegInfo;
+
+ const InstrItineraryData *Itins;
+
+ /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
+ ///
+ unsigned ARMPCLabelIndex;
+
+ // TODO: remove this, and have shouldInsertFencesForAtomic do the proper
+ // check.
+ bool InsertFencesForAtomic;
+
+ bool HasStandaloneRem = true;
+
+ void InitLibcallCallingConvs();
+
+ void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
+ void addDRTypeForNEON(MVT VT);
+ void addQRTypeForNEON(MVT VT);
+ std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
+
+ typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
+ void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain,
+ SDValue &Arg, RegsToPassVector &RegsToPass,
+ CCValAssign &VA, CCValAssign &NextVA,
+ SDValue &StackPtr,
+ SmallVectorImpl<SDValue> &MemOpChains,
+ ISD::ArgFlagsTy Flags) const;
+ SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+ SDValue &Root, SelectionDAG &DAG,
+ const SDLoc &dl) const;
+
+ CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC,
+ bool isVarArg) const;
+ CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
+ bool isVarArg) const;
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const;
+ SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ TLSModel::Model model) const;
+ SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) const;
+ SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
+ SmallVectorImpl<SDValue> &Results) const;
+ SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed,
+ SDValue &Chain) const;
+ SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ ///
+ /// ARM supports both fused and unfused multiply-add operations; we already
+ /// lower a pair of fmul and fadd to the latter so it's not clear that there
+ /// would be a gain or that the gain would be worthwhile enough to risk
+ /// correctness bugs. This override therefore always returns false.
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; } // VT is ignored.
+
+ SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const;
+
+ bool supportSplitCSR(MachineFunction *MF) const override { // True only when both tests below hold.
+ return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && // calling convention test
+ MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); // function must also be nounwind
+ }
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl,
+ SDValue &Chain, const Value *OrigArg,
+ unsigned InRegsParamRecordIdx, int ArgOffset,
+ unsigned ArgSize) const;
+
+ void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue &Chain,
+ unsigned ArgOffset, unsigned TotalArgRegsSaveSize,
+ bool ForceMutable = false) const;
+
+ SDValue
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ /// HandleByVal - Target-specific cleanup for ByVal support.
+ void HandleByVal(CCState *, unsigned &, unsigned) const override;
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization. Targets which want to do tail call
+ /// optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ bool isCalleeStructRet,
+ bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
+
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+
+ SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
+ SDValue ARMcc, SDValue CCR, SDValue Cmp,
+ SelectionDAG &DAG) const;
+ SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
+ SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+ const SDLoc &dl) const;
+ SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
+
+ SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
+
+ void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB, int FI) const;
+
+ void EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const;
+
+ bool RemapAddSubWithFlags(MachineInstr &MI, MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitStructByval(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *EmitLowered__chkstk(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+ MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+ };
+
+ enum NEONModImmType { // NOTE(review): presumably tags the kind of NEON modified-immediate being matched; confirm against users.
+ VMOVModImm, // implicit value 0
+ VMVNModImm, // implicit value 1
+ OtherModImm // implicit value 2
+ };
+
+ namespace ARM { // Nested namespace; holds a single declaration in this header.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo, // Declaration only; the body is not in this header.
+ const TargetLibraryInfo *libInfo);
+ }
+}
+
+#endif // ARMISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
new file mode 100644
index 000000000000..488439fc24e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -0,0 +1,2531 @@
+//===-- ARMInstrFormats.td - ARM Instruction Formats -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// ARM Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter. Each Format record wraps a single 6-bit identifier.
+class Format<bits<6> val> {
+ bits<6> Value = val; // InstTemplate copies this into TSFlags{12-7}.
+}
+
+def Pseudo : Format<0>; // InstTemplate checks for this record by name to default isCodeGenOnly.
+def MulFrm : Format<1>;
+def BrFrm : Format<2>;
+def BrMiscFrm : Format<3>;
+
+def DPFrm : Format<4>;
+def DPSoRegRegFrm : Format<5>;
+
+def LdFrm : Format<6>;
+def StFrm : Format<7>;
+def LdMiscFrm : Format<8>;
+def StMiscFrm : Format<9>;
+def LdStMulFrm : Format<10>;
+
+def LdStExFrm : Format<11>;
+
+def ArithMiscFrm : Format<12>;
+def SatFrm : Format<13>;
+def ExtFrm : Format<14>;
+
+def VFPUnaryFrm : Format<15>;
+def VFPBinaryFrm : Format<16>;
+def VFPConv1Frm : Format<17>;
+def VFPConv2Frm : Format<18>;
+def VFPConv3Frm : Format<19>;
+def VFPConv4Frm : Format<20>;
+def VFPConv5Frm : Format<21>;
+def VFPLdStFrm : Format<22>;
+def VFPLdStMulFrm : Format<23>;
+def VFPMiscFrm : Format<24>;
+
+def ThumbFrm : Format<25>;
+def MiscFrm : Format<26>;
+
+def NGetLnFrm : Format<27>;
+def NSetLnFrm : Format<28>;
+def NDupFrm : Format<29>;
+def NLdStFrm : Format<30>;
+def N1RegModImmFrm: Format<31>;
+def N2RegFrm : Format<32>;
+def NVCVTFrm : Format<33>;
+def NVDupLnFrm : Format<34>;
+def N2RegVShLFrm : Format<35>;
+def N2RegVShRFrm : Format<36>;
+def N3RegFrm : Format<37>;
+def N3RegVShFrm : Format<38>;
+def NVExtFrm : Format<39>;
+def NVMulSLFrm : Format<40>;
+def NVTBLFrm : Format<41>;
+def DPSoRegImmFrm : Format<42>; // Largest id in this list; ids must fit the 6-bit Value field (< 64).
+
+// Misc flags.
+
+// InstTemplate defaults isUnaryDataProc to 0 (instruction has an Rn operand).
+// UnaryDP - Indicates this is a unary data processing instruction, i.e.
+// it doesn't have an Rn operand.
+class UnaryDP { bit isUnaryDataProc = 1; }
+
+// Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+// a 16-bit Thumb instruction if certain conditions are met (TSFlags{14}).
+class Xform16Bit { bit canXformTo16Bit = 1; }
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction flags. These need to match ARMBaseInstrInfo.h.
+//
+
+// FIXME: Once the JIT is MC-ized, these can go away.
+// Addressing mode. Each AddrMode record wraps a single 5-bit identifier.
+class AddrMode<bits<5> val> {
+ bits<5> Value = val; // InstTemplate copies this into TSFlags{4-0}.
+}
+def AddrModeNone : AddrMode<0>; // Used by the helper classes that carry no addressing mode (AI, ABI, JTI, ...).
+def AddrMode1 : AddrMode<1>;
+def AddrMode2 : AddrMode<2>;
+def AddrMode3 : AddrMode<3>;
+def AddrMode4 : AddrMode<4>;
+def AddrMode5 : AddrMode<5>;
+def AddrMode6 : AddrMode<6>;
+def AddrModeT1_1 : AddrMode<7>;
+def AddrModeT1_2 : AddrMode<8>;
+def AddrModeT1_4 : AddrMode<9>;
+def AddrModeT1_s : AddrMode<10>;
+def AddrModeT2_i12 : AddrMode<11>;
+def AddrModeT2_i8 : AddrMode<12>;
+def AddrModeT2_so : AddrMode<13>;
+def AddrModeT2_pc : AddrMode<14>;
+def AddrModeT2_i8s4 : AddrMode<15>;
+def AddrMode_i12 : AddrMode<16>; // First id that needs the fifth bit of the bits<5> field.
+
+// Load / store index mode. Each IndexMode record wraps a 2-bit identifier.
+class IndexMode<bits<2> val> {
+ bits<2> Value = val; // Reaches TSFlags{6-5} through InstTemplate.IndexModeBits.
+}
+def IndexModeNone : IndexMode<0>;
+def IndexModePre : IndexMode<1>;
+def IndexModePost : IndexMode<2>;
+def IndexModeUpd : IndexMode<3>; // All four 2-bit values are now assigned.
+
+// Instruction execution domain. Each Domain record wraps a 3-bit identifier.
+class Domain<bits<3> val> {
+ bits<3> Value = val; // InstTemplate copies this into TSFlags{17-15}.
+}
+def GenericDomain : Domain<0>;
+def VFPDomain : Domain<1>; // Instructions in VFP domain only
+def NeonDomain : Domain<2>; // Instructions in Neon domain only
+def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains
+def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8 (value 4 is not defined here)
+
+//===----------------------------------------------------------------------===//
+// ARM special operands.
+//
+
+// ARM imod and iflag operands, used only by the CPS instruction.
+def imod_op : Operand<i32> { // Only a printer hook is set; no ParserMatchClass.
+ let PrintMethod = "printCPSIMod";
+}
+
+def ProcIFlagsOperand : AsmOperandClass { // Assembly-matcher class referenced by iflags_op.
+ let Name = "ProcIFlags";
+ let ParserMethod = "parseProcIFlagsOperand"; // Custom parse hook name.
+}
+def iflags_op : Operand<i32> { // Has both a printer hook and a parser match class.
+ let PrintMethod = "printCPSIFlag";
+ let ParserMatchClass = ProcIFlagsOperand;
+}
+
+// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
+// register whose default is 0 (no register).
+def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; } // Match class used by pred.
+def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm), // Two sub-operands: condition code, CC register.
+ (ops (i32 14), (i32 zero_reg))> { // Defaults for those two sub-operands.
+ let PrintMethod = "printPredicateOperand";
+ let ParserMatchClass = CondCodeOperand;
+ let DecoderMethod = "DecodePredicateOperand";
+}
+
+// Selectable predicate operand for CMOV instructions. We can't use a normal
+// predicate because the default values interfere with instruction selection. In
+// all other respects it is identical though: pseudo-instruction expansion
+// relies on the MachineOperands being compatible.
+def cmovpred : Operand<i32>, PredicateOp,
+ ComplexPattern<i32, 2, "SelectCMOVPred"> { // Selected via the SelectCMOVPred hook, yielding 2 operands.
+ let MIOperandInfo = (ops i32imm, i32imm); // Same sub-operand shape as pred.
+ let PrintMethod = "printPredicateOperand"; // Same printer as pred.
+}
+
+// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+def CCOutOperand : AsmOperandClass { let Name = "CCOut"; } // Shared by cc_out and s_cc_out.
+def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> { // Default operand is zero_reg.
+ let EncoderMethod = "getCCOutOpValue";
+ let PrintMethod = "printSBitModifierOperand";
+ let ParserMatchClass = CCOutOperand;
+ let DecoderMethod = "DecodeCCOutOperand";
+}
+
+// Same as cc_out except it defaults to setting CPSR. All four hooks match cc_out.
+def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> { // Default operand is CPSR.
+ let EncoderMethod = "getCCOutOpValue";
+ let PrintMethod = "printSBitModifierOperand";
+ let ParserMatchClass = CCOutOperand;
+ let DecoderMethod = "DecodeCCOutOperand";
+}
+
+// ARM special operands for disassembly only.
+// NOTE(review): the operands below also set parser hooks, so the heading may be stale; confirm.
+def SetEndAsmOperand : ImmAsmOperand { // Match class referenced by setend_op.
+ let Name = "SetEndImm";
+ let ParserMethod = "parseSetEndImm"; // Custom parse hook name.
+}
+def setend_op : Operand<i32> {
+ let PrintMethod = "printSetendOperand";
+ let ParserMatchClass = SetEndAsmOperand;
+}
+
+def MSRMaskOperand : AsmOperandClass { // Match class referenced by msr_mask.
+ let Name = "MSRMask";
+ let ParserMethod = "parseMSRMaskOperand"; // Custom parse hook name.
+}
+def msr_mask : Operand<i32> { // Sets printer, decoder and parser hooks; no encoder hook.
+ let PrintMethod = "printMSRMaskOperand";
+ let DecoderMethod = "DecodeMSRMask";
+ let ParserMatchClass = MSRMaskOperand;
+}
+
+def BankedRegOperand : AsmOperandClass { // Match class referenced by banked_reg.
+ let Name = "BankedReg";
+ let ParserMethod = "parseBankedRegOperand"; // Custom parse hook name.
+}
+def banked_reg : Operand<i32> { // Sets printer, decoder and parser hooks; no encoder hook.
+ let PrintMethod = "printBankedRegOperand";
+ let DecoderMethod = "DecodeBankedReg";
+ let ParserMatchClass = BankedRegOperand;
+}
+
+// Shift Right Immediate - A shift right immediate is encoded differently from
+// other shift immediates. The imm6 field is encoded like so:
+//
+// Offset Encoding
+// 8 imm6<5:3> = '001', 8 - <imm> is encoded in imm6<2:0>
+// 16 imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0>
+// 32 imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0>
+// 64 64 - <imm> is encoded in imm6<5:0>
+def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; }
+def shr_imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 8; }]> { // Accepts 1..8 inclusive.
+ let EncoderMethod = "getShiftRight8Imm";
+ let DecoderMethod = "DecodeShiftRight8Imm";
+ let ParserMatchClass = shr_imm8_asm_operand;
+}
+def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; }
+def shr_imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }]> { // Accepts 1..16 inclusive.
+ let EncoderMethod = "getShiftRight16Imm";
+ let DecoderMethod = "DecodeShiftRight16Imm";
+ let ParserMatchClass = shr_imm16_asm_operand;
+}
+def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; }
+def shr_imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> { // Accepts 1..32 inclusive.
+ let EncoderMethod = "getShiftRight32Imm";
+ let DecoderMethod = "DecodeShiftRight32Imm";
+ let ParserMatchClass = shr_imm32_asm_operand;
+}
+def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; }
+def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> { // Accepts 1..64 inclusive.
+ let EncoderMethod = "getShiftRight64Imm";
+ let DecoderMethod = "DecodeShiftRight64Imm";
+ let ParserMatchClass = shr_imm64_asm_operand;
+}
+
+
+// ARM Assembler operand for ldr Rd, =expression which generates an offset
+// to a constant pool entry or a MOV depending on the value of expression.
+def const_pool_asm_operand : AsmOperandClass { let Name = "ConstPoolAsmImm"; }
+def const_pool_asm_imm : Operand<i32> { // Parser match class only; no print/encode/decode hooks are set.
+ let ParserMatchClass = const_pool_asm_operand;
+}
+
+
+//===----------------------------------------------------------------------===//
+// ARM Assembler alias templates.
+//
+// Note: When EmitPriority == 1, the alias will be used for printing (default 0).
+class ARMInstAlias<string Asm, dag Result, bit EmitPriority = 0> // Each class below forwards its arguments unchanged to InstAlias.
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[IsARM]>;
+class tInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb]>;
+class t2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb2]>;
+class VFP2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2]>;
+class VFP2DPInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2,HasDPVFP]>; // The only alias class here with two predicates.
+class VFP3InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP3]>;
+class NEONInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasNEON]>;
+
+
+class VFP2MnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>, // src/dst forwarded unchanged.
+ Requires<[HasVFP2]>;
+class NEONMnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>, // Same shape, gated on HasNEON.
+ Requires<[HasNEON]>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction templates.
+//
+
+
+class InstTemplate<AddrMode am, int sz, IndexMode im, // Common base: stores the template arguments as fields
+ Format f, Domain d, string cstr, InstrItinClass itin> // and packs several of them into TSFlags.
+ : Instruction {
+ let Namespace = "ARM";
+
+ AddrMode AM = am;
+ int Size = sz; // The helper classes in this file pass 4 (ARM-mode) or 0 (pseudos, JTI).
+ IndexMode IM = im;
+ bits<2> IndexModeBits = IM.Value;
+ Format F = f;
+ bits<6> Form = F.Value;
+ Domain D = d;
+ bit isUnaryDataProc = 0; // The UnaryDP class declares this field as 1.
+ bit canXformTo16Bit = 0; // The Xform16Bit class declares this field as 1.
+ // The instruction is a 16-bit flag setting Thumb instruction. Used
+ // by the parser to determine whether to require the 'S' suffix on the
+ // mnemonic (when not in an IT block) or preclude it (when in an IT block).
+ bit thumbArithFlagSetting = 0;
+
+ // If this is a pseudo instruction, mark it isCodeGenOnly.
+ let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); // Compares the Format record's name, not its Value.
+
+ // The layout of TSFlags should be kept in sync with ARMBaseInfo.h.
+ let TSFlags{4-0} = AM.Value;
+ let TSFlags{6-5} = IndexModeBits;
+ let TSFlags{12-7} = Form;
+ let TSFlags{13} = isUnaryDataProc;
+ let TSFlags{14} = canXformTo16Bit;
+ let TSFlags{17-15} = D.Value;
+ let TSFlags{18} = thumbArithFlagSetting; // Highest TSFlags bit assigned in this class.
+
+ let Constraints = cstr;
+ let Itinerary = itin;
+}
+
+class Encoding { // Mixed into InstARM; InstThumb does not inherit it.
+ field bits<32> Inst; // 32-bit encoding; subclasses assign individual bit ranges.
+ // Mask of bits that cause an encoding to be UNPREDICTABLE.
+ // If a bit is set, then if the corresponding bit in the
+ // target encoding differs from its value in the "Inst" field,
+ // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
+ field bits<32> Unpredictable = 0;
+ // SoftFail is the generic name for this field, but we alias it so
+ // as to make it more obvious what it means in ARM-land.
+ field bits<32> SoftFail = Unpredictable;
+}
+
+class InstARM<AddrMode am, int sz, IndexMode im, // InstTemplate plus the 32-bit Encoding fields.
+ Format f, Domain d, string cstr, InstrItinClass itin>
+ : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding {
+ let DecoderNamespace = "ARM";
+}
+
+// This Encoding-less class is used by Thumb1 to specify the encoding bits later
+// on by adding flavors to specific instructions. Arguments pass straight to InstTemplate.
+class InstThumb<AddrMode am, int sz, IndexMode im,
+ Format f, Domain d, string cstr, InstrItinClass itin>
+ : InstTemplate<am, sz, im, f, d, cstr, itin> {
+ let DecoderNamespace = "Thumb";
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class AsmPseudoInst<string asm, dag iops, dag oops = (outs)> // oops defaults to an empty output list.
+ : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+ "", NoItinerary> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
+ let Pattern = []; // No selection pattern.
+ let isCodeGenOnly = 0; // So we get asm matcher for it.
+ let AsmString = asm;
+ let isPseudo = 1;
+}
+
+class ARMAsmPseudo<string asm, dag iops, dag oops = (outs)> // Each class below is AsmPseudoInst plus one predicate.
+ : AsmPseudoInst<asm, iops, oops>, Requires<[IsARM]>;
+class tAsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb]>;
+class t2AsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb2]>;
+class VFP2AsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[HasVFP2]>;
+class NEONAsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[HasNEON]>;
+
+// Pseudo instructions for the code generator.
+class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern> // Note: outputs precede inputs here, unlike AsmPseudoInst.
+ : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo,
+ GenericDomain, "", itin> { // Size 0 here; the per-mode subclasses below override Size.
+ let OutOperandList = oops;
+ let InOperandList = iops;
+ let Pattern = pattern;
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+}
+
+// PseudoInst that's ARM-mode only.
+class ARMPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+ list<dag> pattern>
+ : PseudoInst<oops, iops, itin, pattern> {
+ let Size = sz; // Overrides the 0 that PseudoInst passes to InstTemplate.
+ list<Predicate> Predicates = [IsARM];
+}
+
+// PseudoInst that's Thumb-mode only.
+class tPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+ list<dag> pattern>
+ : PseudoInst<oops, iops, itin, pattern> {
+ let Size = sz; // Overrides the 0 that PseudoInst passes to InstTemplate.
+ list<Predicate> Predicates = [IsThumb];
+}
+
+// PseudoInst that's in ARMv8-M baseline (somewhere between Thumb and Thumb2).
+class t2basePseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+ list<dag> pattern>
+ : PseudoInst<oops, iops, itin, pattern> {
+ let Size = sz; // Overrides the 0 that PseudoInst passes to InstTemplate.
+ list<Predicate> Predicates = [IsThumb,HasV8MBaseline]; // Both predicates are listed.
+}
+
+// PseudoInst that's Thumb2-mode only.
+class t2PseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+ list<dag> pattern>
+ : PseudoInst<oops, iops, itin, pattern> {
+ let Size = sz; // Overrides the 0 that PseudoInst passes to InstTemplate.
+ list<Predicate> Predicates = [IsThumb2];
+}
+
+class ARMPseudoExpand<dag oops, dag iops, int sz, // ARMPseudoInst combined with PseudoInstExpansion<Result>.
+ InstrItinClass itin, list<dag> pattern,
+ dag Result>
+ : ARMPseudoInst<oops, iops, sz, itin, pattern>,
+ PseudoInstExpansion<Result>;
+
+class tPseudoExpand<dag oops, dag iops, int sz, // tPseudoInst combined with PseudoInstExpansion<Result>.
+ InstrItinClass itin, list<dag> pattern,
+ dag Result>
+ : tPseudoInst<oops, iops, sz, itin, pattern>,
+ PseudoInstExpansion<Result>;
+
+class t2PseudoExpand<dag oops, dag iops, int sz, // t2PseudoInst combined with PseudoInstExpansion<Result>.
+ InstrItinClass itin, list<dag> pattern,
+ dag Result>
+ : t2PseudoInst<oops, iops, sz, itin, pattern>,
+ PseudoInstExpansion<Result>;
+
+// Almost all ARM instructions are predicable.
+class I<dag oops, dag iops, AddrMode am, int sz,
+ IndexMode im, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr,
+ list<dag> pattern>
+ : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+ bits<4> p; // Predicate operand, encoded in the top four bits.
+ let Inst{31-28} = p;
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ins pred:$p)); // Appends the predicate after the caller's inputs.
+ let AsmString = !strconcat(opc, "${p}", asm); // Predicate text goes between mnemonic and operands.
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsARM];
+}
+
+// A few are not predicable: no pred operand is appended and Inst{31-28} is left unset here.
+class InoP<dag oops, dag iops, AddrMode am, int sz,
+ IndexMode im, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr,
+ list<dag> pattern>
+ : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
+ let AsmString = !strconcat(opc, asm); // No "${p}" inserted, unlike I.
+ let Pattern = pattern;
+ let isPredicable = 0;
+ list<Predicate> Predicates = [IsARM];
+}
+
+// Same as I except it can optionally modify CPSR. Note it's modeled as an input
+// operand since by default it's a zero register. It will become an implicit def
+// once it's "flipped".
+class sI<dag oops, dag iops, AddrMode am, int sz,
+ IndexMode im, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr,
+ list<dag> pattern>
+ : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+ bits<4> p; // Predicate operand
+ bits<1> s; // condition-code set flag ('1' if the insn should set the flags)
+ let Inst{31-28} = p;
+ let Inst{20} = s;
+
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ins pred:$p, cc_out:$s)); // Appends predicate, then cc_out, after the caller's inputs.
+ let AsmString = !strconcat(opc, "${s}${p}", asm); // 's' modifier is printed before the predicate.
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsARM];
+}
+
+// Special cases: the caller supplies the full AsmString; no predicate is appended.
+class XI<dag oops, dag iops, AddrMode am, int sz,
+ IndexMode im, Format f, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern> // Note: no opc parameter, unlike I/InoP/sI.
+ : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
+ let AsmString = asm;
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsARM];
+}
+
+class AI<dag oops, dag iops, Format f, InstrItinClass itin, // I with AddrModeNone, size 4, IndexModeNone, "" constraints.
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+class AsI<dag oops, dag iops, Format f, InstrItinClass itin, // sI with the same fixed arguments.
+ string opc, string asm, list<dag> pattern>
+ : sI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+class AXI<dag oops, dag iops, Format f, InstrItinClass itin, // XI with the same fixed arguments.
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+ asm, "", pattern>;
+class AXIM<dag oops, dag iops, AddrMode am, Format f, InstrItinClass itin, // Like AXI but the addressing mode is a parameter.
+ string asm, list<dag> pattern>
+ : XI<oops, iops, am, 4, IndexModeNone, f, itin,
+ asm, "", pattern>;
+class AInoP<dag oops, dag iops, Format f, InstrItinClass itin, // InoP with the same fixed arguments as AI.
+ string opc, string asm, list<dag> pattern>
+ : InoP<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+
+// Ctrl flow instructions (predicated form; Format fixed to BrFrm).
+class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin,
+ opc, asm, "", pattern> {
+ let Inst{27-24} = opcod; // 4-bit opcode directly below the predicate bits.
+}
+class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, // XI-based counterpart of ABI (no predicate appended).
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin,
+ asm, "", pattern> {
+ let Inst{27-24} = opcod; // Same opcode field as ABI.
+}
+
+// BR_JT instructions. Size is passed as 0 and no Inst bits are assigned here.
+class JTI<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, 0, IndexModeNone, BrMiscFrm, itin,
+ asm, "", pattern>;
+
+class AIldr_ex_or_acq<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, // Shared load encoding for AIldrex/AIldaex/AIldracq.
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
+ opc, asm, "", pattern> {
+ bits<4> Rt;
+ bits<4> addr;
+ let Inst{27-23} = 0b00011;
+ let Inst{22-21} = opcod;
+ let Inst{20} = 1; // 1 here, 0 in the store counterpart AIstr_ex_or_rel.
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt;
+ let Inst{11-10} = 0b11;
+ let Inst{9-8} = opcod2; // Subclasses fix this to 0b11, 0b10 or 0b00.
+ let Inst{7-0} = 0b10011111;
+}
+class AIstr_ex_or_rel<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, // Shared store encoding for AIstrex/AIstlex/AIstrrel.
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
+ opc, asm, "", pattern> {
+ bits<4> Rt;
+ bits<4> addr;
+ let Inst{27-23} = 0b00011;
+ let Inst{22-21} = opcod;
+ let Inst{20} = 0; // 0 here, 1 in the load counterpart AIldr_ex_or_acq.
+ let Inst{19-16} = addr;
+ let Inst{11-10} = 0b11; // Inst{15-12} is left for subclasses (Rd, or 0b1111 in AIstrrel).
+ let Inst{9-8} = opcod2;
+ let Inst{7-4} = 0b1001;
+ let Inst{3-0} = Rt; // Rt sits in the low nibble here, not in Inst{15-12} as for loads.
+}
+// Atomic load/store instructions. These fix opcod2 (Inst{9-8}) to 0b11.
+class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AIldr_ex_or_acq<opcod, 0b11, oops, iops, itin, opc, asm, pattern>;
+
+class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, // Store counterpart of AIldrex (opcod2 = 0b11).
+ string opc, string asm, list<dag> pattern>
+ : AIstr_ex_or_rel<opcod, 0b11, oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd;
+ let Inst{15-12} = Rd; // Fills the field the base class leaves unassigned.
+}
+
+// Exclusive load/store instructions gated on HasAcquireRelease and HasV7Clrex.
+
+class AIldaex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, // Fixes opcod2 (Inst{9-8}) to 0b10.
+ string opc, string asm, list<dag> pattern>
+ : AIldr_ex_or_acq<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsARM, HasAcquireRelease, HasV7Clrex]>;
+
+class AIstlex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, // Store counterpart of AIldaex (opcod2 = 0b10).
+ string opc, string asm, list<dag> pattern>
+ : AIstr_ex_or_rel<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsARM, HasAcquireRelease, HasV7Clrex]> {
+ bits<4> Rd;
+ let Inst{15-12} = Rd; // Fills the field the base class leaves unassigned.
+}
+
+class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern> // Fixed operand string and NoItinerary; b selects Inst{22}.
+ : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> {
+ bits<4> Rt;
+ bits<4> Rt2;
+ bits<4> addr;
+ let Inst{27-23} = 0b00010;
+ let Inst{22} = b;
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt;
+ let Inst{11-4} = 0b00001001;
+ let Inst{3-0} = Rt2;
+
+ let Unpredictable{11-8} = 0b1111; // Marks Inst{11-8} as soft-fail bits (see the Encoding class).
+ let DecoderMethod = "DecodeSwap";
+}
+// Acquire/Release load/store instructions. These fix opcod2 (Inst{9-8}) to 0b00.
+class AIldracq<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AIldr_ex_or_acq<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsARM, HasAcquireRelease]>; // No HasV7Clrex requirement, unlike AIldaex.
+
+class AIstrrel<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, // Store counterpart of AIldracq (opcod2 = 0b00).
+ string opc, string asm, list<dag> pattern>
+ : AIstr_ex_or_rel<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsARM, HasAcquireRelease]> {
+ let Inst{15-12} = 0b1111; // Constant here; AIstrex/AIstlex put Rd in this field instead.
+}
+
+// addrmode1 instructions (predicated; built on I with AddrMode1).
+class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
+ let Inst{24-21} = opcod;
+ let Inst{27-26} = 0b00; // Inst{25} is not assigned in this class.
+}
+class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, // Same as AI1 but built on sI (adds the optional 's' operand).
+ string opc, string asm, list<dag> pattern>
+ : sI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
+ let Inst{24-21} = opcod;
+ let Inst{27-26} = 0b00; // Inst{25} is not assigned in this class.
+}
+class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, // Same as AI1 but built on XI (no predicate appended).
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+ asm, "", pattern> {
+ let Inst{24-21} = opcod;
+ let Inst{27-26} = 0b00; // Inst{25} is not assigned in this class.
+}
+
+// loads and stores
+
+// LDR/LDRB/STR/STRB/... (isLd picks load vs store, isByte picks byte vs word)
+class AI2ldst<bits<3> op, bit isLd, bit isByte, dag oops, dag iops, AddrMode am,
+ Format f, InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : I<oops, iops, am, 4, IndexModeNone, f, itin, opc, asm,
+ "", pattern> {
+ let Inst{27-25} = op;
+ let Inst{24} = 1; // 24 == P
+ // 23 == U (not assigned in this class)
+ let Inst{22} = isByte;
+ let Inst{21} = 0; // 21 == W
+ let Inst{20} = isLd;
+}
+// Indexed load/stores. Inst{25}, Inst{23}, Inst{19-16} and Inst{11-0} are left to subclasses.
+class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops,
+ IndexMode im, Format f, InstrItinClass itin, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, 4, im, f, itin,
+ opc, asm, cstr, pattern> {
+ bits<4> Rt;
+ let Inst{27-26} = 0b01;
+ let Inst{24} = isPre; // P bit
+ let Inst{22} = isByte; // B bit
+ let Inst{21} = isPre; // W bit (always equal to the P bit in this class)
+ let Inst{20} = isLd; // L bit
+ let Inst{15-12} = Rt;
+}
+class AI2stridx_reg<bit isByte, bit isPre, dag oops, dag iops, // Store-only: passes isLd = 0 to AI2ldstidx.
+ IndexMode im, Format f, InstrItinClass itin, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+ pattern> {
+ // AM2 store w/ two operands: (GPR, am2offset)
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset; // Only bits 12, 11-5 and 3-0 are read below; bits 13 and 4 are not encoded.
+ bits<4> Rn;
+ let Inst{25} = 1; // Register-offset form (AI2stridx_imm uses 0 here).
+ let Inst{23} = offset{12};
+ let Inst{19-16} = Rn;
+ let Inst{11-5} = offset{11-5};
+ let Inst{4} = 0; // Forced to 0 instead of taking offset{4}.
+ let Inst{3-0} = offset{3-0};
+}
+
+ class AI2stridx_imm<bit isByte, bit isPre, dag oops, dag iops, IndexMode im,
+     Format f, InstrItinClass itin, string opc, string asm, string cstr,
+     list<dag> pattern>
+   : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+                pattern> {
+   // Indexed AM2 store with two operands, (GPR, am2offset).
+   // Layout of the offset operand:
+   //   {12}   isAdd
+   //   {11-0} imm12/Rm
+   bits<14> offset;
+   bits<4> Rn;
+   let Inst{25} = 0;
+   let Inst{23} = offset{12};
+   let Inst{19-16} = Rn;
+   let Inst{11-0} = offset{11-0};
+ }
+
+
+// FIXME: Merge with the above class when addrmode2 gets used for STR, STRB
+// but for now use this class for STRT and STRBT.
+ class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops, IndexMode im,
+     Format f, InstrItinClass itin, string opc, string asm, string cstr,
+     list<dag> pattern>
+   : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+                pattern> {
+   // AM2 store with two operands, (GPR, am2offset), packed into one
+   // 18-bit addr value:
+   //   {17-14} Rn
+   //   {13}    1 == Rm, 0 == imm12
+   //   {12}    isAdd
+   //   {11-0}  imm12/Rm
+   bits<18> addr;
+   let Inst{25} = addr{13};
+   let Inst{23} = addr{12};
+   let Inst{19-16} = addr{17-14};
+   let Inst{11-0} = addr{11-0};
+ }
+
+// addrmode3 instructions
+ class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin, opc, asm, "",
+       pattern> {
+   // AddrMode3 format with IndexModeNone; the 14-bit addr operand is
+   // scattered over the U bit, the imm8/Rm selector, Rn and the two
+   // offset nibbles.
+   bits<14> addr;
+   bits<4> Rt;
+   let DecoderMethod = "DecodeAddrMode3Instruction";
+
+   let Inst{27-25} = 0b000;
+   let Inst{24} = 1;             // P
+   let Inst{23} = addr{8};       // U
+   let Inst{22} = addr{13};      // 1 == imm8, 0 == Rm
+   let Inst{21} = 0;             // W
+   let Inst{20} = op20;          // L
+   let Inst{19-16} = addr{12-9}; // Rn
+   let Inst{15-12} = Rt;         // Rt
+   let Inst{11-8} = addr{7-4};   // imm7_4/zero
+   let Inst{7-4} = op;
+   let Inst{3-0} = addr{3-0};    // imm3_0/Rm
+ }
+
+ class AI3ldstidx<bits<4> op, bit op20, bit isPre, dag oops, dag iops,
+     IndexMode im, Format f, InstrItinClass itin, string opc, string asm,
+     string cstr, list<dag> pattern>
+   : I<oops, iops, AddrMode3, 4, im, f, itin, opc, asm, cstr, pattern> {
+   // Indexed AddrMode3 format. Only Rt and the fixed/opcode bits are set
+   // here; isPre drives both the P bit and the W bit.
+   bits<4> Rt;
+   let Inst{27-25} = 0b000;
+   let Inst{24} = isPre;  // P
+   let Inst{21} = isPre;  // W
+   let Inst{20} = op20;   // L
+   let Inst{15-12} = Rt;  // Rt
+   let Inst{7-4} = op;
+ }
+
+ // FIXME: Merge with the above class when addrmode2 gets used for LDR, LDRB
+ // but for now use this class for LDRSBT, LDRHT, LDRSHT.
+ class AI3ldstidxT<bits<4> op, bit isLoad, dag oops, dag iops, IndexMode im,
+     Format f, InstrItinClass itin, string opc, string asm, string cstr,
+     list<dag> pattern>
+   : I<oops, iops, AddrMode3, 4, im, f, itin, opc, asm, cstr, pattern> {
+   // Layout historically documented for the address operand:
+   //   {13}   1 == imm8, 0 == Rm
+   //   {12-9} Rn
+   //   {8}    isAdd
+   //   {7-4}  imm7_4/zero
+   //   {3-0}  imm3_0/Rm
+   // NOTE(review): addr is declared with only 4 bits and is copied whole
+   // into the Rn field below, so the layout above does not describe this
+   // field as written -- confirm against the users of this class.
+   bits<4> addr;
+   bits<4> Rt;
+   let Inst{27-25} = 0b000;
+   let Inst{24} = 0;       // P
+   let Inst{21} = 1;
+   let Inst{20} = isLoad;  // L
+   let Inst{19-16} = addr; // Rn
+   let Inst{15-12} = Rt;   // Rt
+   let Inst{7-4} = op;
+ }
+
+// stores
+ class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin,
+     string opc, string asm, list<dag> pattern>
+   : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin, opc, asm, "",
+       pattern> {
+   // AddrMode3 store format with IndexModeNone; the L bit is fixed to 0.
+   bits<14> addr;
+   bits<4> Rt;
+   let DecoderMethod = "DecodeAddrMode3Instruction";
+
+   let Inst{27-25} = 0b000;
+   let Inst{24} = 1;             // P
+   let Inst{23} = addr{8};       // U
+   let Inst{22} = addr{13};      // 1 == imm8, 0 == Rm
+   let Inst{21} = 0;             // W
+   let Inst{20} = 0;             // L
+   let Inst{19-16} = addr{12-9}; // Rn
+   let Inst{15-12} = Rt;         // Rt
+   let Inst{11-8} = addr{7-4};   // imm7_4/zero
+   let Inst{7-4} = op;
+   let Inst{3-0} = addr{3-0};    // imm3_0/Rm
+ }
+
+// addrmode4 instructions
+ class AXI4<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin,
+     string asm, string cstr, list<dag> pattern>
+   : XI<oops, iops, AddrMode4, 4, im, f, itin, asm, cstr, pattern> {
+   // AddrMode4 format: predicate in the top nibble, base register Rn in
+   // bits 19-16 and a 16-bit regs value in the low half-word.
+   bits<4> p;
+   bits<16> regs;
+   bits<4> Rn;
+   let Inst{31-28} = p;
+   let Inst{27-25} = 0b100;
+   let Inst{22} = 0;      // S
+   let Inst{19-16} = Rn;
+   let Inst{15-0} = regs;
+ }
+
+// Unsigned multiply, multiply-accumulate instructions.
+ class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+     string opc, string asm, list<dag> pattern>
+   : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin, opc, asm,
+       "", pattern> {
+   // MulFrm format built on I with the S bit hard-wired to 0.
+   let Inst{27-21} = opcod;
+   let Inst{20} = 0;        // S
+   let Inst{7-4} = 0b1001;
+ }
+ class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+     string opc, string asm, list<dag> pattern>
+   : sI<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin, opc, asm,
+        "", pattern> {
+   // MulFrm format built on sI; bit 20 is not assigned by this class.
+   let Inst{27-21} = opcod;
+   let Inst{7-4} = 0b1001;
+ }
+
+// Most significant word multiply
+ class AMul2I<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin, opc, asm,
+       "", pattern> {
+   // Three-register multiply format: Rd in 19-16, Rm in 11-8, Rn in 3-0.
+   bits<4> Rd;
+   bits<4> Rn;
+   bits<4> Rm;
+   let Inst{27-21} = opcod;
+   let Inst{20} = 1;
+   let Inst{19-16} = Rd;
+   let Inst{11-8} = Rm;
+   let Inst{7-4} = opc7_4;
+   let Inst{3-0} = Rn;
+ }
+ // MSW multiply w/ Ra operand
+ class AMul2Ia<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : AMul2I<opcod, opc7_4, oops, iops, itin, opc, asm, pattern> {
+   // Extends AMul2I with an Ra operand encoded in bits 15-12.
+   bits<4> Ra;
+   let Inst{15-12} = Ra;
+ }
+
+// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y>
+ class AMulxyIbase<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin, opc, asm,
+       "", pattern> {
+   // Common part of the halfword multiply formats: bit 7 set, bit 4
+   // clear, bit6_5 in between; Rm in 11-8 and Rn in 3-0.
+   bits<4> Rn;
+   bits<4> Rm;
+   let Inst{27-21} = opcod;
+   let Inst{20} = 0;
+   let Inst{11-8} = Rm;
+   let Inst{7} = 1;
+   let Inst{6-5} = bit6_5;
+   let Inst{4} = 0;
+   let Inst{3-0} = Rn;
+ }
+ class AMulxyI<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+   // Adds the destination register Rd in bits 19-16.
+   bits<4> Rd;
+   let Inst{19-16} = Rd;
+ }
+
+// AMulxyI with Ra operand
+ class AMulxyIa<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : AMulxyI<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+   // Adds an Ra operand in bits 15-12 on top of AMulxyI.
+   bits<4> Ra;
+   let Inst{15-12} = Ra;
+ }
+// SMLAL*
+ class AMulxyI64<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+   // Register pair result: RdHi in bits 19-16, RdLo in bits 15-12.
+   bits<4> RdLo;
+   bits<4> RdHi;
+   let Inst{19-16} = RdHi;
+   let Inst{15-12} = RdLo;
+ }
+
+// Extend instructions.
+ class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+     string opc, string asm, list<dag> pattern>
+   : I<oops, iops, AddrModeNone, 4, IndexModeNone, ExtFrm, itin, opc, asm,
+       "", pattern> {
+   // Every AExtI instruction carries Rd and Rm register operands.
+   bits<4> Rd;
+   bits<4> Rm;
+   let Inst{27-20} = opcod;
+   let Inst{15-12} = Rd;
+   let Inst{9-8} = 0b00;
+   let Inst{7-4} = 0b0111;
+   let Inst{3-0} = Rm;
+
+   // Bits 9-8 are encoded as zero above and flagged in Unpredictable.
+   let Unpredictable{9-8} = 0b11;
+ }
+
+// Misc Arithmetic instructions.
+ class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin, opc,
+       asm, "", pattern> {
+   // Two-register ArithMiscFrm format; bits 19-16 and 11-8 are all ones.
+   bits<4> Rd;
+   bits<4> Rm;
+   let Inst{27-20} = opcod;
+   let Inst{19-16} = 0b1111;
+   let Inst{15-12} = Rd;
+   let Inst{11-8} = 0b1111;
+   let Inst{7-4} = opc7_4;
+   let Inst{3-0} = Rm;
+ }
+
+// Division instructions.
+ class ADivA1I<bits<3> opcod, dag oops, dag iops, InstrItinClass itin,
+     string opc, string asm, list<dag> pattern>
+   : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin, opc,
+       asm, "", pattern> {
+   // Three-register format: Rd in 19-16, Rm in 11-8, Rn in 3-0; bits
+   // 15-12 are all ones.
+   bits<4> Rd;
+   bits<4> Rn;
+   bits<4> Rm;
+   let Inst{27-23} = 0b01110;
+   let Inst{22-20} = opcod;
+   let Inst{19-16} = Rd;
+   let Inst{15-12} = 0b1111;
+   let Inst{11-8} = Rm;
+   let Inst{7-4} = 0b0001;
+   let Inst{3-0} = Rn;
+ }
+
+// PKH instructions
+ def PKHLSLAsmOperand : ImmAsmOperand {
+   // Asm operand class named "PKHLSLImm", parsed by parsePKHLSLImm.
+   let ParserMethod = "parsePKHLSLImm";
+   let Name = "PKHLSLImm";
+ }
+ def pkh_lsl_amt
+   : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 32; }]> {
+   // i32 immediate operand; the leaf predicate accepts 0 <= Imm < 32.
+   let ParserMatchClass = PKHLSLAsmOperand;
+   let PrintMethod = "printPKHLSLShiftImm";
+ }
+ def PKHASRAsmOperand : AsmOperandClass {
+   // Asm operand class named "PKHASRImm", parsed by parsePKHASRImm.
+   let ParserMethod = "parsePKHASRImm";
+   let Name = "PKHASRImm";
+ }
+ def pkh_asr_amt
+   : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
+   // i32 immediate operand; the leaf predicate accepts 0 < Imm <= 32.
+   let ParserMatchClass = PKHASRAsmOperand;
+   let PrintMethod = "printPKHASRShiftImm";
+ }
+
+ class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
+     string opc, string asm, list<dag> pattern>
+   : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin, opc,
+       asm, "", pattern> {
+   // PKH format: three registers plus a 5-bit shift amount; tb goes into
+   // bit 6.
+   bits<4> Rd;
+   bits<4> Rn;
+   bits<4> Rm;
+   bits<5> sh;
+   let Inst{27-20} = opcod;
+   let Inst{19-16} = Rn;
+   let Inst{15-12} = Rd;
+   let Inst{11-7} = sh;
+   let Inst{6} = tb;
+   let Inst{5-4} = 0b01;
+   let Inst{3-0} = Rm;
+ }
+
+//===----------------------------------------------------------------------===//
+
+// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode.
+ class ARMPat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the IsARM predicate.
+   list<Predicate> Predicates = [IsARM];
+ }
+ class ARMV5TPat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the IsARM and HasV5T predicates.
+   list<Predicate> Predicates = [IsARM, HasV5T];
+ }
+ class ARMV5TEPat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the IsARM and HasV5TE predicates.
+   list<Predicate> Predicates = [IsARM, HasV5TE];
+ }
+// ARMV5MOPat - Same as ARMV5TEPat with UseMulOps.
+ class ARMV5MOPat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by IsARM, HasV5TE and UseMulOps.
+   list<Predicate> Predicates = [IsARM, HasV5TE, UseMulOps];
+ }
+ class ARMV6Pat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the IsARM and HasV6 predicates.
+   list<Predicate> Predicates = [IsARM, HasV6];
+ }
+ class VFPPat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the HasVFP2 predicate.
+   list<Predicate> Predicates = [HasVFP2];
+ }
+ class VFPNoNEONPat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by HasVFP2 and DontUseNEONForFP.
+   list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP];
+ }
+ class Thumb2DSPPat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the IsThumb2 and HasDSP predicates.
+   list<Predicate> Predicates = [IsThumb2, HasDSP];
+ }
+ class Thumb2DSPMulPat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by IsThumb2, UseMulOps and HasDSP.
+   list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP];
+ }
+ class Thumb2ExtractPat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the IsThumb2 and HasT2ExtractPack predicates.
+   list<Predicate> Predicates = [IsThumb2, HasT2ExtractPack];
+ }
+//===----------------------------------------------------------------------===//
+// Thumb Instruction Format Definitions.
+//
+
+ class ThumbI<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin,
+     string asm, string cstr, list<dag> pattern>
+   : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+   // Base Thumb format: operands, asm string and pattern are forwarded
+   // unchanged; the instruction is guarded by IsThumb.
+   let Pattern = pattern;
+   let AsmString = asm;
+   let InOperandList = iops;
+   let OutOperandList = oops;
+   list<Predicate> Predicates = [IsThumb];
+ }
+
+ // TI - Thumb instruction: ThumbI with AddrModeNone, size 2 and no
+ // operand constraint.
+ class TI<dag oops, dag iops, InstrItinClass itin, string asm,
+     list<dag> pattern>
+   : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>;
+
+ // TIt - two-address Thumb instruction: like TI, but with the operand
+ // constraint "$lhs = $dst".
+ class TIt<dag oops, dag iops, InstrItinClass itin, string asm,
+     list<dag> pattern>
+   : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "$lhs = $dst", pattern>;
+
+ // TIx2 - 32-bit Thumb instructions (tBL, tBX): size 4, with three opcode
+ // fields placed directly into the encoding.
+ class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+     InstrItinClass itin, string asm, list<dag> pattern>
+   : ThumbI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>, Encoding {
+   let Inst{31-27} = opcod1;
+   let Inst{15-14} = opcod2;
+   let Inst{12} = opcod3;
+ }
+
+ // TJTI - BR_JT instructions: ThumbI with AddrModeNone and a size of 0.
+ class TJTI<dag oops, dag iops, InstrItinClass itin, string asm,
+     list<dag> pattern>
+   : ThumbI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
+
+ // Thumb1I - Thumb1-only format: guarded by IsThumb and IsThumb1Only.
+ class Thumb1I<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin,
+     string asm, string cstr, list<dag> pattern>
+   : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+   // Operands, asm string and pattern are forwarded unchanged.
+   let Pattern = pattern;
+   let AsmString = asm;
+   let InOperandList = iops;
+   let OutOperandList = oops;
+   list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+ }
+
+ // T1I - Thumb1I with AddrModeNone, size 2 and no operand constraint.
+ class T1I<dag oops, dag iops, InstrItinClass itin, string asm,
+     list<dag> pattern>
+   : Thumb1I<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>;
+ // T1Ix2 - Thumb1I with AddrModeNone, size 4 and no operand constraint.
+ class T1Ix2<dag oops, dag iops, InstrItinClass itin, string asm,
+     list<dag> pattern>
+   : Thumb1I<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>;
+
+ // T1It - two-address Thumb1 instruction; the operand constraint is
+ // supplied by the caller through cstr.
+ class T1It<dag oops, dag iops, InstrItinClass itin, string asm, string cstr,
+     list<dag> pattern>
+   : Thumb1I<oops, iops, AddrModeNone, 2, itin, asm, cstr, pattern>;
+
+ // Thumb1sI - Thumb1 instruction that can either be predicated or set
+ // CPSR.
+ class Thumb1sI<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin,
+     string opc, string asm, string cstr, list<dag> pattern>
+   : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+   // An s_cc_out:$s output and a pred:$p input are appended to the
+   // caller's operand lists; the mnemonic is opc + "${s}${p}" + asm.
+   let OutOperandList = !con(oops, (outs s_cc_out:$s));
+   let InOperandList = !con(iops, (ins pred:$p));
+   let AsmString = !strconcat(opc, "${s}${p}", asm);
+   let Pattern = pattern;
+
+   let DecoderNamespace = "ThumbSBit";
+   let thumbArithFlagSetting = 1;
+   list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+ }
+
+ // T1sI - Thumb1sI with AddrModeNone, size 2 and no operand constraint.
+ class T1sI<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>;
+
+ // T1sIt - two-address form of T1sI, using the constraint "$Rn = $Rdn".
+ class T1sIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm, "$Rn = $Rdn",
+              pattern>;
+
+ // Thumb1pI - Thumb1 instruction that can be predicated.
+ class Thumb1pI<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin,
+     string opc, string asm, string cstr, list<dag> pattern>
+   : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+   // A pred:$p input is appended to the caller's inputs; the mnemonic is
+   // opc + "${p}" + asm.
+   let InOperandList = !con(iops, (ins pred:$p));
+   let OutOperandList = oops;
+   let AsmString = !strconcat(opc, "${p}", asm);
+   let Pattern = pattern;
+   list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+ }
+
+ // T1pI - Thumb1pI with AddrModeNone, size 2 and no operand constraint.
+ class T1pI<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>;
+
+ // T1pIt - two-address form of T1pI, using the constraint "$Rn = $Rdn".
+ class T1pIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm, "$Rn = $Rdn",
+              pattern>;
+
+ // T1pIs - Thumb1pI using AddrModeT1_s, size 2 and no operand constraint.
+ class T1pIs<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb1pI<oops, iops, AddrModeT1_s, 2, itin, opc, asm, "", pattern>;
+
+ class Encoding16
+   : Encoding {
+   // 16-bit encodings leave the upper half-word of Inst zeroed.
+   let Inst{31-16} = 0x0000;
+ }
+
+// A6.2 16-bit Thumb instruction encoding
+ class T1Encoding<bits<6> opcode>
+   : Encoding16 {
+   // The 6-bit opcode occupies the top of the 16-bit encoding.
+   let Inst{15-10} = opcode;
+ }
+
+// A6.2.1 Shift (immediate), add, subtract, move, and compare encoding.
+ class T1General<bits<5> opcode>
+   : Encoding16 {
+   // Bits 15-14 are fixed to 00; the 5-bit opcode follows in 13-9.
+   let Inst{15-14} = 0b00;
+   let Inst{13-9} = opcode;
+ }
+
+// A6.2.2 Data-processing encoding.
+ class T1DataProcessing<bits<4> opcode>
+   : Encoding16 {
+   // Fixed prefix 010000 in bits 15-10; the 4-bit opcode sits in 9-6.
+   let Inst{15-10} = 0b010000;
+   let Inst{9-6} = opcode;
+ }
+
+// A6.2.3 Special data instructions and branch and exchange encoding.
+ class T1Special<bits<4> opcode>
+   : Encoding16 {
+   // Fixed prefix 010001 in bits 15-10; the 4-bit opcode sits in 9-6.
+   let Inst{15-10} = 0b010001;
+   let Inst{9-6} = opcode;
+ }
+
+// A6.2.4 Load/store single data item encoding.
+ class T1LoadStore<bits<4> opA, bits<3> opB>
+   : Encoding16 {
+   // opA fills bits 15-12 and opB fills bits 11-9.
+   let Inst{15-12} = opA;
+   let Inst{11-9} = opB;
+ }
+ // SP-relative form: T1LoadStore with opA fixed to 0b1001.
+ class T1LdStSP<bits<3> opB> : T1LoadStore<0b1001, opB>;
+
+ class T1BranchCond<bits<4> opcode>
+   : Encoding16 {
+   // The 4-bit opcode occupies bits 15-12.
+   let Inst{15-12} = opcode;
+ }
+
+// Helper classes to encode Thumb1 loads and stores. For immediates, the
+// following bits are used for "opA" (see A6.2.4):
+//
+// 0b0110 => Immediate, 4 bytes
+// 0b1000 => Immediate, 2 bytes
+// 0b0111 => Immediate, 1 byte
+ class T1pILdStEncode<bits<3> opcode, dag oops, dag iops, AddrMode am,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>,
+     T1LoadStore<0b0101, opcode> {
+   // Register-offset load/store: opA is fixed to 0b0101 and addr carries
+   // Rm in its bits 5-3 and Rn in its bits 2-0.
+   bits<3> Rt;
+   bits<8> addr;
+   let Inst{8-6} = addr{5-3}; // Rm
+   let Inst{5-3} = addr{2-0}; // Rn
+   let Inst{2-0} = Rt;
+ }
+ class T1pILdStEncodeImm<bits<4> opA, bit opB, dag oops, dag iops,
+     AddrMode am, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>,
+     T1LoadStore<opA, {opB,?,?}> {
+   // Immediate-offset load/store: only the top bit of opB is supplied to
+   // T1LoadStore; Inst{10-9} are then overwritten by the top of imm5.
+   bits<3> Rt;
+   bits<8> addr;
+   let Inst{10-6} = addr{7-3}; // imm5
+   let Inst{5-3} = addr{2-0};  // Rn
+   let Inst{2-0} = Rt;
+ }
+
+// A6.2.5 Miscellaneous 16-bit instructions encoding.
+ class T1Misc<bits<7> opcode>
+   : Encoding16 {
+   // Fixed prefix 1011 in bits 15-12; the 7-bit opcode sits in 11-5.
+   let Inst{15-12} = 0b1011;
+   let Inst{11-5} = opcode;
+ }
+
+// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable.
+ class Thumb2I<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin,
+     string opc, string asm, string cstr, list<dag> pattern>
+   : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+   // A pred:$p input is appended to the caller's inputs; the mnemonic is
+   // opc + "${p}" + asm.
+   let InOperandList = !con(iops, (ins pred:$p));
+   let OutOperandList = oops;
+   let AsmString = !strconcat(opc, "${p}", asm);
+   let Pattern = pattern;
+   let DecoderNamespace = "Thumb2";
+   list<Predicate> Predicates = [IsThumb2];
+ }
+
+// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an
+// input operand since by default it's a zero register. It will become an
+// implicit def once it's "flipped".
+//
+// FIXME: This uses unified syntax so {s} comes before {p}. We should make it
+// more consistent.
+ class Thumb2sI<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin,
+     string opc, string asm, string cstr, list<dag> pattern>
+   : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+   // Condition-code set flag ('1' if the insn should set the flags),
+   // encoded in bit 20.
+   bits<1> s;
+   let Inst{20} = s;
+
+   // Both pred:$p and cc_out:$s are appended to the caller's inputs; the
+   // mnemonic places ${s} before ${p}.
+   let InOperandList = !con(iops, (ins pred:$p, cc_out:$s));
+   let OutOperandList = oops;
+   let AsmString = !strconcat(opc, "${s}${p}", asm);
+   let Pattern = pattern;
+   let DecoderNamespace = "Thumb2";
+   list<Predicate> Predicates = [IsThumb2];
+ }
+
+// Special cases
+ class Thumb2XI<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin,
+     string asm, string cstr, list<dag> pattern>
+   : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+   // Thumb2 format without an appended predicate operand: operands and
+   // asm string are used exactly as given.
+   let Pattern = pattern;
+   let AsmString = asm;
+   let InOperandList = iops;
+   let OutOperandList = oops;
+   let DecoderNamespace = "Thumb2";
+   list<Predicate> Predicates = [IsThumb2];
+ }
+
+ class ThumbXI<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin,
+     string asm, string cstr, list<dag> pattern>
+   : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+   // Thumb1-only format without an appended predicate operand; decoded in
+   // the "Thumb" namespace.
+   let Pattern = pattern;
+   let AsmString = asm;
+   let InOperandList = iops;
+   let OutOperandList = oops;
+   let DecoderNamespace = "Thumb";
+   list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+ }
+
+ // T2I - Thumb2I with AddrModeNone, size 4 and no operand constraint.
+ class T2I<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb2I<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>;
+ // T2Ii12 - Thumb2I using AddrModeT2_i12, size 4, no operand constraint.
+ class T2Ii12<dag oops, dag iops, InstrItinClass itin, string opc,
+     string asm, list<dag> pattern>
+   : Thumb2I<oops, iops, AddrModeT2_i12, 4, itin, opc, asm, "", pattern>;
+ // T2Ii8 - Thumb2I using AddrModeT2_i8, size 4, no operand constraint.
+ class T2Ii8<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb2I<oops, iops, AddrModeT2_i8, 4, itin, opc, asm, "", pattern>;
+ // T2Iso - Thumb2I using AddrModeT2_so, size 4, no operand constraint.
+ class T2Iso<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb2I<oops, iops, AddrModeT2_so, 4, itin, opc, asm, "", pattern>;
+ // T2Ipc - Thumb2I using AddrModeT2_pc, size 4, no operand constraint.
+ class T2Ipc<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb2I<oops, iops, AddrModeT2_pc, 4, itin, opc, asm, "", pattern>;
+ class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, string cstr,
+     list<dag> pattern>
+   : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr, pattern> {
+   // Two transfer registers plus a 13-bit addr: addr{12-9} goes to bits
+   // 19-16, addr{8} to bit 23 and addr{7-0} to the low byte.
+   bits<4> Rt;
+   bits<4> Rt2;
+   bits<13> addr;
+   let Inst{31-25} = 0b1110100;
+   let Inst{24} = P;
+   let Inst{23} = addr{8};
+   let Inst{22} = 1;
+   let Inst{21} = W;
+   let Inst{20} = isLoad;
+   let Inst{19-16} = addr{12-9};
+   let Inst{15-12} = Rt{3-0};
+   let Inst{11-8} = Rt2{3-0};
+   let Inst{7-0} = addr{7-0};
+ }
+ class T2Ii8s4post<bit P, bit W, bit isLoad, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, string cstr,
+     list<dag> pattern>
+   : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr, pattern> {
+   // Variant with the address split into a 4-bit addr (bits 19-16) and a
+   // separate 9-bit imm: imm{8} goes to bit 23, imm{7-0} to the low byte.
+   bits<4> Rt;
+   bits<4> Rt2;
+   bits<4> addr;
+   bits<9> imm;
+   let Inst{31-25} = 0b1110100;
+   let Inst{24} = P;
+   let Inst{23} = imm{8};
+   let Inst{22} = 1;
+   let Inst{21} = W;
+   let Inst{20} = isLoad;
+   let Inst{19-16} = addr;
+   let Inst{15-12} = Rt{3-0};
+   let Inst{11-8} = Rt2{3-0};
+   let Inst{7-0} = imm{7-0};
+ }
+
+ // T2sI - Thumb2sI with AddrModeNone, size 4 and no operand constraint.
+ class T2sI<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+     list<dag> pattern>
+   : Thumb2sI<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>;
+
+ // T2XI - Thumb2XI with AddrModeNone, size 4 and no operand constraint.
+ class T2XI<dag oops, dag iops, InstrItinClass itin, string asm,
+     list<dag> pattern>
+   : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>;
+ // T2JTI - Thumb2XI with AddrModeNone and a size of 0.
+ class T2JTI<dag oops, dag iops, InstrItinClass itin, string asm,
+     list<dag> pattern>
+   : Thumb2XI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
+
+// Move to/from coprocessor instructions
+ class T2Cop<bits<4> opc, dag oops, dag iops, string opcstr, string asm,
+     list<dag> pattern>
+   : T2I <oops, iops, NoItinerary, opcstr, asm, pattern>,
+     Requires<[IsThumb2]> {
+   // T2I without an itinerary; opc fills the top nibble of the encoding.
+   let Inst{31-28} = opc;
+ }
+
+ // T2XIt - two-address form of T2XI; the operand constraint is supplied
+ // by the caller through cstr.
+ class T2XIt<dag oops, dag iops, InstrItinClass itin, string asm, string cstr,
+     list<dag> pattern>
+   : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, cstr, pattern>;
+
+// T2Ipreldst - Thumb2 pre-indexed load / store instructions.
+ class T2Ipreldst<bit signed, bits<2> opcod, bit load, bit pre, dag oops,
+     dag iops, AddrMode am, IndexMode im, InstrItinClass itin, string opc,
+     string asm, string cstr, list<dag> pattern>
+   : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+   // A pred:$p input is appended to the caller's inputs; the mnemonic is
+   // opc + "${p}" + asm.
+   let InOperandList = !con(iops, (ins pred:$p));
+   let OutOperandList = oops;
+   let AsmString = !strconcat(opc, "${p}", asm);
+   let Pattern = pattern;
+   let DecoderNamespace = "Thumb2";
+   let DecoderMethod = "DecodeT2LdStPre";
+   list<Predicate> Predicates = [IsThumb2];
+
+   // The base register and signed 9-bit offset share one 13-bit addr.
+   bits<4> Rt;
+   bits<13> addr;
+   let Inst{31-27} = 0b11111;
+   let Inst{26-25} = 0b00;
+   let Inst{24} = signed;
+   let Inst{23} = 0;
+   let Inst{22-21} = opcod;
+   let Inst{20} = load;
+   let Inst{19-16} = addr{12-9};
+   let Inst{15-12} = Rt{3-0};
+   let Inst{11} = 1;
+   // (P, W) = (1, 1) is pre-indexed, (0, 1) is post-indexed.
+   let Inst{10} = pre;       // P
+   let Inst{9} = addr{8};    // sign
+   let Inst{8} = 1;          // W
+   let Inst{7-0} = addr{7-0};
+ }
+
+// T2Ipostldst - Thumb2 post-indexed load / store instructions.
+ class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre, dag oops,
+     dag iops, AddrMode am, IndexMode im, InstrItinClass itin, string opc,
+     string asm, string cstr, list<dag> pattern>
+   : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+   // A pred:$p input is appended to the caller's inputs; the mnemonic is
+   // opc + "${p}" + asm.
+   let InOperandList = !con(iops, (ins pred:$p));
+   let OutOperandList = oops;
+   let AsmString = !strconcat(opc, "${p}", asm);
+   let Pattern = pattern;
+   let DecoderNamespace = "Thumb2";
+   // Decoding goes through the same DecodeT2LdStPre method name that the
+   // pre-indexed format uses.
+   let DecoderMethod = "DecodeT2LdStPre";
+   list<Predicate> Predicates = [IsThumb2];
+
+   // Base register and signed 9-bit offset are separate operands here.
+   bits<4> Rt;
+   bits<4> Rn;
+   bits<9> offset;
+   let Inst{31-27} = 0b11111;
+   let Inst{26-25} = 0b00;
+   let Inst{24} = signed;
+   let Inst{23} = 0;
+   let Inst{22-21} = opcod;
+   let Inst{20} = load;
+   let Inst{19-16} = Rn;
+   let Inst{15-12} = Rt{3-0};
+   let Inst{11} = 1;
+   // (P, W) = (1, 1) is pre-indexed, (0, 1) is post-indexed.
+   let Inst{10} = pre;         // P
+   let Inst{9} = offset{8};    // sign
+   let Inst{8} = 1;            // W
+   let Inst{7-0} = offset{7-0};
+ }
+
+// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
+ class T1Pat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the IsThumb and IsThumb1Only predicates.
+   list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+ }
+
+// T2v6Pat - Same as Pat<>, but requires V6T2 Thumb2 mode.
+ class T2v6Pat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the IsThumb2 and HasV6T2 predicates.
+   list<Predicate> Predicates = [IsThumb2, HasV6T2];
+ }
+
+// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
+ class T2Pat<dag pattern, dag result>
+   : Pat<pattern, result> {
+   // Pattern guarded by the IsThumb2 predicate.
+   list<Predicate> Predicates = [IsThumb2];
+ }
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM VFP Instruction templates.
+//
+
+// Almost all VFP instructions are predicable.
+ class VFPI<dag oops, dag iops, AddrMode am, int sz, IndexMode im, Format f,
+     InstrItinClass itin, string opc, string asm, string cstr,
+     list<dag> pattern>
+   : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+   // The 4-bit predicate occupies the top nibble of the encoding.
+   bits<4> p;
+   let Inst{31-28} = p;
+
+   // A pred:$p input is appended to the caller's inputs; the mnemonic is
+   // opc + "${p}" + asm.
+   let InOperandList = !con(iops, (ins pred:$p));
+   let OutOperandList = oops;
+   let AsmString = !strconcat(opc, "${p}", asm);
+   let Pattern = pattern;
+
+   let DecoderNamespace = "VFP";
+   let PostEncoderMethod = "VFPThumb2PostEncoder";
+   list<Predicate> Predicates = [HasVFP2];
+ }
+
+// Special cases
+ class VFPXI<dag oops, dag iops, AddrMode am, int sz, IndexMode im, Format f,
+     InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+   : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+   // The top nibble is still taken from a field named p, but no pred
+   // operand is appended and the asm string is used verbatim.
+   bits<4> p;
+   let Inst{31-28} = p;
+
+   let InOperandList = iops;
+   let OutOperandList = oops;
+   let AsmString = asm;
+   let Pattern = pattern;
+
+   let DecoderNamespace = "VFP";
+   let PostEncoderMethod = "VFPThumb2PostEncoder";
+   list<Predicate> Predicates = [HasVFP2];
+ }
+
+ class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin, string opc,
+     string asm, list<dag> pattern>
+   : VFPI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, opc, asm, "",
+          pattern> {
+   // VFPI with AddrModeNone, size 4 and IndexModeNone. The post-encoder
+   // is assigned again here with the same value VFPI uses.
+   let PostEncoderMethod = "VFPThumb2PostEncoder";
+ }
+
+// ARM VFP addrmode5 loads and stores
+ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : VFPI<oops, iops, AddrMode5, 4, IndexModeNone, VFPLdStFrm, itin, opc,
+          asm, "", pattern> {
+   // Operands: a 5-bit D register and a 13-bit addrmode5 address.
+   bits<5> Dd;
+   bits<13> addr;
+
+   // Loads & stores operate on both NEON and VFP pipelines.
+   let D = VFPNeonDomain;
+
+   // Encoding, most significant field first.
+   let Inst{27-24} = opcod1;
+   let Inst{23} = addr{8};       // U (add = (U == '1'))
+   let Inst{22} = Dd{4};
+   let Inst{21-20} = opcod2;
+   let Inst{19-16} = addr{12-9}; // Rn
+   let Inst{15-12} = Dd{3-0};
+   let Inst{11-9} = 0b101;
+   let Inst{8} = 1;              // Double precision
+   let Inst{7-0} = addr{7-0};    // imm8
+ }
+
+ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : VFPI<oops, iops, AddrMode5, 4, IndexModeNone, VFPLdStFrm, itin, opc,
+          asm, "", pattern> {
+   // Operands: a 5-bit S register and a 13-bit addrmode5 address.
+   bits<5> Sd;
+   bits<13> addr;
+
+   // Loads & stores operate on both NEON and VFP pipelines.
+   let D = VFPNeonDomain;
+
+   // Encoding, most significant field first. The low bit of Sd lands in
+   // bit 22 and its upper four bits in 15-12.
+   let Inst{27-24} = opcod1;
+   let Inst{23} = addr{8};       // U (add = (U == '1'))
+   let Inst{22} = Sd{0};
+   let Inst{21-20} = opcod2;
+   let Inst{19-16} = addr{12-9}; // Rn
+   let Inst{15-12} = Sd{4-1};
+   let Inst{11-9} = 0b101;
+   let Inst{8} = 0;              // Single precision
+   let Inst{7-0} = addr{7-0};    // imm8
+ }
+
+ class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+     InstrItinClass itin, string opc, string asm, list<dag> pattern>
+   : VFPI<oops, iops, AddrMode5, 4, IndexModeNone, VFPLdStFrm, itin, opc,
+          asm, "", pattern> {
+   // Guarded by HasFullFP16 in this class.
+   list<Predicate> Predicates = [HasFullFP16];
+
+   // Operands: a 5-bit S register and a 13-bit addrmode5 address.
+   bits<5> Sd;
+   bits<13> addr;
+
+   // Loads & stores operate on both NEON and VFP pipelines.
+   let D = VFPNeonDomain;
+
+   // Encoding, most significant field first.
+   let Inst{27-24} = opcod1;
+   let Inst{23} = addr{8};       // U (add = (U == '1'))
+   let Inst{22} = Sd{0};
+   let Inst{21-20} = opcod2;
+   let Inst{19-16} = addr{12-9}; // Rn
+   let Inst{15-12} = Sd{4-1};
+   let Inst{11-8} = 0b1001;      // Half precision
+   let Inst{7-0} = addr{7-0};    // imm8
+ }
+
+// VFP Load / store multiple pseudo instructions.
+ class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr,
+     list<dag> pattern>
+   : InstARM<AddrMode4, 4, IndexModeNone, Pseudo, VFPNeonDomain, cstr,
+             itin> {
+   // Pseudo format: a pred:$p input is appended to the caller's inputs
+   // and no asm string or encoding bits are set here.
+   let InOperandList = !con(iops, (ins pred:$p));
+   let OutOperandList = oops;
+   let Pattern = pattern;
+   list<Predicate> Predicates = [HasVFP2];
+ }
+
+// Load / store multiple
+
+// Unknown precision
+ class AXXI4<dag oops, dag iops, IndexMode im, string asm, string cstr,
+     list<dag> pattern>
+   : VFPXI<oops, iops, AddrMode4, 4, im, VFPLdStFrm, NoItinerary, asm, cstr,
+           pattern> {
+   // Operands: base register and a 13-bit register-list value. regs{12}
+   // and regs{0} are not encoded; bit 22 is 0 and bit 0 is 1.
+   bits<4> Rn;
+   bits<13> regs;
+
+   let Inst{27-25} = 0b110;
+   let Inst{22} = 0;
+   let Inst{19-16} = Rn;
+   let Inst{15-12} = regs{11-8};
+   let Inst{11-8} = 0b1011;
+   let Inst{7-1} = regs{7-1};
+   let Inst{0} = 1;
+ }
+
+// Double precision
+ class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
+     string asm, string cstr, list<dag> pattern>
+   : VFPXI<oops, iops, AddrMode4, 4, im, VFPLdStMulFrm, itin, asm, cstr,
+           pattern> {
+   // Operands: base register and a 13-bit register-list value. regs{12}
+   // goes to bit 22; regs{0} is not encoded and bit 0 is fixed to 0.
+   bits<4> Rn;
+   bits<13> regs;
+
+   let Inst{27-25} = 0b110;
+   let Inst{22} = regs{12};
+   let Inst{19-16} = Rn;
+   let Inst{15-12} = regs{11-8};
+   let Inst{11-9} = 0b101;
+   let Inst{8} = 1;            // Double precision
+   let Inst{7-1} = regs{7-1};
+   let Inst{0} = 0;
+ }
+
+// Single Precision
+ class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
+     string asm, string cstr, list<dag> pattern>
+   : VFPXI<oops, iops, AddrMode4, 4, im, VFPLdStMulFrm, itin, asm, cstr,
+           pattern> {
+   // Operands: base register and a 13-bit register-list value. regs{8}
+   // goes to bit 22, regs{12-9} to bits 15-12, regs{7-0} to the low byte.
+   bits<4> Rn;
+   bits<13> regs;
+
+   let Inst{27-25} = 0b110;
+   let Inst{22} = regs{8};
+   let Inst{19-16} = Rn;
+   let Inst{15-12} = regs{12-9};
+   let Inst{11-9} = 0b101;
+   let Inst{8} = 0;            // Single precision
+   let Inst{7-0} = regs{7-0};
+ }
+
+// Double precision, unary
+ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+     bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+     string asm, list<dag> pattern>
+   : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+   // Operands: destination and source D registers, 5 bits each.
+   bits<5> Dd;
+   bits<5> Dm;
+
+   let Predicates = [HasVFP2, HasDPVFP];
+
+   // Encoding, most significant field first. Each register's top bit is
+   // stored apart from its low nibble.
+   let Inst{27-23} = opcod1;
+   let Inst{22} = Dd{4};
+   let Inst{21-20} = opcod2;
+   let Inst{19-16} = opcod3;
+   let Inst{15-12} = Dd{3-0};
+   let Inst{11-9} = 0b101;
+   let Inst{8} = 1;            // Double precision
+   let Inst{7-6} = opcod4;
+   let Inst{5} = Dm{4};
+   let Inst{4} = opcod5;
+   let Inst{3-0} = Dm{3-0};
+ }
+
+// Double precision, unary, not-predicated
+ class ADuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+     bit opcod5, dag oops, dag iops, InstrItinClass itin, string asm,
+     list<dag> pattern>
+   : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPUnaryFrm, itin,
+           asm, "", pattern> {
+   // Operands: destination and source D registers, 5 bits each.
+   bits<5> Dd;
+   bits<5> Dm;
+
+   // Not predicated: the top nibble is hard-wired to all ones.
+   let Inst{31-28} = 0b1111;
+
+   // Remaining encoding, most significant field first.
+   let Inst{27-23} = opcod1;
+   let Inst{22} = Dd{4};
+   let Inst{21-20} = opcod2;
+   let Inst{19-16} = opcod3;
+   let Inst{15-12} = Dd{3-0};
+   let Inst{11-9} = 0b101;
+   let Inst{8} = 1;            // Double precision
+   let Inst{7-6} = opcod4;
+   let Inst{5} = Dm{4};
+   let Inst{4} = opcod5;
+   let Inst{3-0} = Dm{3-0};
+ }
+
+// Double precision, binary: three 5-bit operands (Dd, Dn, Dm) plus opcode bits.
+class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+ dag iops, InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+ // Instruction operands (5-bit register numbers).
+ bits<5> Dd;
+ bits<5> Dn;
+ bits<5> Dm;
+
+ // Encode instruction operands: bits {3-0} and bit {4} go to separate fields.
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+ let Inst{19-16} = Dn{3-0};
+ let Inst{7} = Dn{4};
+ let Inst{15-12} = Dd{3-0};
+ let Inst{22} = Dd{4};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-9} = 0b101;
+ let Inst{8} = 1; // Double precision
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+
+ let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Double precision, binary, not predicated: VFPXI-based, Inst{31-28} is fixed.
+class ADbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+ InstrItinClass itin, string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPBinaryFrm, itin,
+ asm, "", pattern>
+{
+ // Instruction operands (5-bit register numbers).
+ bits<5> Dd;
+ bits<5> Dn;
+ bits<5> Dm;
+
+ let Inst{31-28} = 0b1111; // constant top nibble, not an operand
+
+ // Encode instruction operands: bits {3-0} and bit {4} go to separate fields.
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+ let Inst{19-16} = Dn{3-0};
+ let Inst{7} = Dn{4};
+ let Inst{15-12} = Dd{3-0};
+ let Inst{22} = Dd{4};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-9} = 0b101;
+ let Inst{8} = 1; // double precision
+ let Inst{6} = opcod3;
+ let Inst{4} = 0; // always zero in this template (no op4 parameter)
+
+ let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Single precision, unary, predicated: two 5-bit operands (Sd, Sm) via VFPAI.
+class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+ bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+ string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+ // Instruction operands (5-bit register numbers).
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands: bits {4-1} form the 4-bit field, bit {0} is separate.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{19-16} = opcod3;
+ let Inst{11-9} = 0b101;
+ let Inst{8} = 0; // Single precision
+ let Inst{7-6} = opcod4;
+ let Inst{4} = opcod5;
+}
+
+// Single precision, unary, non-predicated: VFPXI-based, Inst{31-28} is fixed.
+class ASuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+ bit opcod5, dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+ VFPUnaryFrm, itin, asm, "", pattern> {
+ // Instruction operands (5-bit register numbers).
+ bits<5> Sd;
+ bits<5> Sm;
+
+ let Inst{31-28} = 0b1111; // constant top nibble, not an operand
+
+ // Encode instruction operands: bits {4-1} form the 4-bit field, bit {0} is separate.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{19-16} = opcod3;
+ let Inst{11-9} = 0b101;
+ let Inst{8} = 0; // Single precision
+ let Inst{7-6} = opcod4;
+ let Inst{4} = opcod5;
+}
+
+// Single precision unary, if no NEON. Same as ASuI (all arguments are forwarded
+// unchanged) except not available if NEON is enabled.
+class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+ bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+ string asm, list<dag> pattern>
+ : ASuI<opcod1, opcod2, opcod3, opcod4, opcod5, oops, iops, itin, opc, asm,
+ pattern> {
+ list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; // the only addition over ASuI
+}
+
+// Single precision, binary: three 5-bit operands (Sd, Sn, Sm) plus opcode bits.
+class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+ // Instruction operands (5-bit register numbers).
+ bits<5> Sd;
+ bits<5> Sn;
+ bits<5> Sm;
+
+ // Encode instruction operands: bits {4-1} form the 4-bit field, bit {0} is separate.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-9} = 0b101;
+ let Inst{8} = 0; // Single precision
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+}
+
+// Single precision, binary, not predicated: VFPXI-based, Inst{31-28} is fixed.
+class ASbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+ InstrItinClass itin, string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+ VFPBinaryFrm, itin, asm, "", pattern>
+{
+ // Instruction operands (5-bit register numbers).
+ bits<5> Sd;
+ bits<5> Sn;
+ bits<5> Sm;
+
+ let Inst{31-28} = 0b1111; // constant top nibble, not an operand
+
+ // Encode instruction operands: bits {4-1} form the 4-bit field, bit {0} is separate.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-9} = 0b101;
+ let Inst{8} = 0; // Single precision
+ let Inst{6} = opcod3;
+ let Inst{4} = 0; // always zero in this template (no op4 parameter)
+}
+
+// Single precision binary, if no NEON. Same as ASbI (all arguments are
+// forwarded unchanged) except not available if NEON is enabled.
+class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+ dag iops, InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> {
+ list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+
+ // Instruction operands. NOTE(review): same three fields as declared in ASbI.
+ bits<5> Sd;
+ bits<5> Sn;
+ bits<5> Sm;
+
+ // Encode instruction operands. NOTE(review): identical to ASbI's assignments.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+}
+
+// Half precision, unary, predicated: VFPAI-based, requires HasFullFP16.
+class AHuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+ bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+ string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands (5-bit register numbers, named Sd/Sm here).
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands: bits {4-1} form the 4-bit field, bit {0} is separate.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{19-16} = opcod3;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{7-6} = opcod4;
+ let Inst{4} = opcod5;
+}
+
+// Half precision, unary, non-predicated: VFPXI-based, requires HasFullFP16.
+class AHuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+ bit opcod5, dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+ VFPUnaryFrm, itin, asm, "", pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands (5-bit register numbers, named Sd/Sm here).
+ bits<5> Sd;
+ bits<5> Sm;
+
+ let Inst{31-28} = 0b1111; // constant top nibble, not an operand
+
+ // Encode instruction operands: bits {4-1} form the 4-bit field, bit {0} is separate.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{19-16} = opcod3;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{7-6} = opcod4;
+ let Inst{4} = opcod5;
+}
+
+// Half precision, binary: VFPAI-based, three 5-bit operands, requires HasFullFP16.
+class AHbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands (5-bit register numbers, named Sd/Sn/Sm here).
+ bits<5> Sd;
+ bits<5> Sn;
+ bits<5> Sm;
+
+ // Encode instruction operands: bits {4-1} form the 4-bit field, bit {0} is separate.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+}
+
+// Half precision, binary, not predicated: VFPXI-based, requires HasFullFP16.
+class AHbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+ InstrItinClass itin, string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+ VFPBinaryFrm, itin, asm, "", pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands (5-bit register numbers, named Sd/Sn/Sm here).
+ bits<5> Sd;
+ bits<5> Sn;
+ bits<5> Sm;
+
+ let Inst{31-28} = 0b1111; // constant top nibble, not an operand
+
+ // Encode instruction operands: bits {4-1} form the 4-bit field, bit {0} is separate.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{6} = opcod3;
+ let Inst{4} = 0; // always zero in this template (no op4 parameter)
+}
+
+// VFP conversion instructions (VFPConv1Frm): opcode bits only, no operand fields.
+class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+ dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> {
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{19-16} = opcod3;
+ let Inst{11-8} = opcod4;
+ let Inst{6} = 1; // fixed for every AVConv1I-derived instruction
+ let Inst{4} = 0; // fixed for every AVConv1I-derived instruction
+}
+
+// VFP conversion between floating-point and fixed-point; adds sx and fbits to AVConv1I.
+class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5,
+ dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> {
+ bits<5> fbits; // 5-bit operand: bit {0} -> Inst{5}, bits {4-1} -> Inst{3-0}
+ // size (fixed-point number): sx == 0 ? 16 : 32
+ let Inst{7} = op5; // sx
+ let Inst{5} = fbits{0};
+ let Inst{3-0} = fbits{4-1};
+}
+
+// VFP conversion instructions, if no NEON: AVConv1I with a predicate list added.
+class AVConv1In<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; // the only addition over AVConv1I
+}
+
+class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f,
+ InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, f, itin, opc, asm, pattern> { // format 'f' chosen by the subclass
+ let Inst{27-20} = opcod1; // 8-bit opcode field
+ let Inst{11-8} = opcod2; // 4-bit opcode field
+ let Inst{4} = 1; // fixed for every AVConvXI-derived instruction
+}
+
+class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>; // AVConvXI with VFPConv2Frm
+
+class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>; // AVConvXI with VFPConv3Frm
+
+class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>; // AVConvXI with VFPConv4Frm
+
+class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>; // AVConvXI with VFPConv5Frm
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM NEON Instruction templates.
+//
+
+class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+ InstrItinClass itin, string opc, string dt, string asm, string cstr,
+ list<dag> pattern>
+ : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ins pred:$p)); // pred:$p is appended to the inputs
+ let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm); // opc${p}.dt<TAB>asm
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasNEON];
+ let DecoderNamespace = "NEON";
+}
+
+// Same as NeonI except it has no "data type" specifier (no dt parameter or ".dt").
+class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+ InstrItinClass itin, string opc, string asm, string cstr,
+ list<dag> pattern>
+ : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ins pred:$p)); // pred:$p is appended to the inputs
+ let AsmString = !strconcat(opc, "${p}", "\t", asm); // opc${p}<TAB>asm
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasNEON];
+ let DecoderNamespace = "NEON";
+}
+
+// Same as NeonI except it is not predicated: no pred:$p operand, no "${p}" in asm.
+class NeonInp<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+ InstrItinClass itin, string opc, string dt, string asm, string cstr,
+ list<dag> pattern>
+ : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = iops; // used as given
+ let AsmString = !strconcat(opc, ".", dt, "\t", asm); // opc.dt<TAB>asm
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasNEON];
+ let DecoderNamespace = "NEON";
+
+ let Inst{31-28} = 0b1111; // constant top nibble
+}
+
+class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr, list<dag> pattern>
+ : NeonI<oops, iops, AddrMode6, IndexModeNone, NLdStFrm, itin, opc, dt, asm,
+ cstr, pattern> {
+ let Inst{31-24} = 0b11110100; // constant top byte for this format
+ let Inst{23} = op23;
+ let Inst{21-20} = op21_20;
+ let Inst{11-8} = op11_8;
+ let Inst{7-4} = op7_4;
+
+ let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder";
+ let DecoderNamespace = "NEONLoadStore"; // overrides NeonI's "NEON"
+
+ bits<5> Vd;
+ bits<6> Rn; // 6 bits wide; only Rn{3-0} is placed into Inst by this class
+ bits<4> Rm;
+
+ let Inst{22} = Vd{4};
+ let Inst{15-12} = Vd{3-0};
+ let Inst{19-16} = Rn{3-0};
+ let Inst{3-0} = Rm{3-0};
+}
+
+class NLdStLn<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr, list<dag> pattern>
+ : NLdSt<op23, op21_20, op11_8, op7_4, oops, iops, itin, opc,
+ dt, asm, cstr, pattern> {
+ bits<3> lane; // declared only; this class does not assign it to any Inst bits
+}
+
+class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr>
+ : InstARM<AddrMode6, 4, IndexModeNone, Pseudo, NeonDomain, cstr,
+ itin> { // Pseudo format with AddrMode6; no AsmString or Pattern is set here
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ins pred:$p)); // pred:$p is appended to the inputs
+ list<Predicate> Predicates = [HasNEON];
+}
+
+class PseudoNeonI<dag oops, dag iops, InstrItinClass itin, string cstr,
+ list<dag> pattern>
+ : InstARM<AddrModeNone, 4, IndexModeNone, Pseudo, NeonDomain, cstr,
+ itin> { // Pseudo format with AddrModeNone; no AsmString is set here
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ins pred:$p)); // pred:$p is appended to the inputs
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasNEON];
+}
+
+class NDataI<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr, list<dag> pattern>
+ : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr,
+ pattern> {
+ let Inst{31-25} = 0b1111001; // constant top seven bits for this format
+ let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+ let DecoderNamespace = "NEONData"; // overrides NeonI's "NEON"
+}
+
+class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm,
+ cstr, pattern> {
+ let Inst{31-25} = 0b1111001; // constant top seven bits for this format
+ let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+ let DecoderNamespace = "NEONData"; // overrides NeonXI's "NEON"
+}
+
+// NEON "one register and a modified immediate" format (N1RegModImmFrm).
+class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
+ bit op5, bit op4,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr,
+ list<dag> pattern>
+ : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> {
+ let Inst{23} = op23;
+ let Inst{21-19} = op21_19;
+ let Inst{11-8} = op11_8;
+ let Inst{7} = op7;
+ let Inst{6} = op6;
+ let Inst{5} = op5;
+ let Inst{4} = op4;
+
+ // Instruction operands. Only SIMM{7-0} is placed into Inst by this class.
+ bits<5> Vd;
+ bits<13> SIMM;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{24} = SIMM{7};
+ let Inst{18-16} = SIMM{6-4};
+ let Inst{3-0} = SIMM{3-0};
+ let DecoderMethod = "DecodeNEONModImmInstruction";
+}
+
+// NEON 2 vector register format (N2RegFrm): operands Vd and Vm, with data type.
+class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+ bits<5> op11_7, bit op6, bit op4,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> {
+ let Inst{24-23} = op24_23;
+ let Inst{21-20} = op21_20;
+ let Inst{19-18} = op19_18;
+ let Inst{17-16} = op17_16;
+ let Inst{11-7} = op11_7;
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+
+ // Instruction operands (5-bit register numbers; bit {4} is encoded separately).
+ bits<5> Vd;
+ bits<5> Vm;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{3-0} = Vm{3-0};
+ let Inst{5} = Vm{4};
+}
+
+// Same as N2V but not predicated: NeonInp-based, asm operands fixed to "$Vd, $Vm".
+class N2Vnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+ dag oops, dag iops, InstrItinClass itin, string OpcodeStr,
+ string Dt, list<dag> pattern>
+ : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N2RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vm", "", pattern> {
+ bits<5> Vd;
+ bits<5> Vm;
+
+ // Encode instruction operands (bit {4} of each register is encoded separately)
+ let Inst{22} = Vd{4};
+ let Inst{15-12} = Vd{3-0};
+ let Inst{5} = Vm{4};
+ let Inst{3-0} = Vm{3-0};
+
+ // Encode constant bits (Inst{31-28} is already set by NeonInp)
+ let Inst{27-23} = 0b00111;
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = op19_18;
+ let Inst{17-16} = op17_16;
+ let Inst{11} = 0;
+ let Inst{10-8} = op10_8;
+ let Inst{7} = op7;
+ let Inst{6} = op6;
+ let Inst{4} = 0;
+
+ let DecoderNamespace = "NEON";
+}
+
+// Same as N2V except it doesn't have a datatype suffix (NDataXI-based, no dt).
+class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+ bits<5> op11_7, bit op6, bit op4,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> {
+ let Inst{24-23} = op24_23;
+ let Inst{21-20} = op21_20;
+ let Inst{19-18} = op19_18;
+ let Inst{17-16} = op17_16;
+ let Inst{11-7} = op11_7;
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+
+ // Instruction operands (5-bit register numbers; bit {4} is encoded separately).
+ bits<5> Vd;
+ bits<5> Vm;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{3-0} = Vm{3-0};
+ let Inst{5} = Vm{4};
+}
+
+// NEON 2 vector register with immediate: Vd, Vm and a 6-bit SIMM in Inst{21-16}.
+class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+ dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+ let Inst{24} = op24;
+ let Inst{23} = op23;
+ let Inst{11-8} = op11_8;
+ let Inst{7} = op7;
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+
+ // Instruction operands (register bit {4} is encoded separately).
+ bits<5> Vd;
+ bits<5> Vm;
+ bits<6> SIMM;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{3-0} = Vm{3-0};
+ let Inst{5} = Vm{4};
+ let Inst{21-16} = SIMM{5-0};
+}
+
+// NEON 3 vector register format.
+// N3VCommon sets only the fixed opcode bits; subclasses add the operand fields.
+class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+ bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr,
+ list<dag> pattern>
+ : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+ let Inst{24} = op24;
+ let Inst{23} = op23;
+ let Inst{21-20} = op21_20;
+ let Inst{11-8} = op11_8;
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+}
+
+class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
+ dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr, list<dag> pattern>
+ : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+ oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+ // Instruction operands: three full 5-bit registers (bit {4} encoded separately).
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{19-16} = Vn{3-0};
+ let Inst{7} = Vn{4};
+ let Inst{3-0} = Vm{3-0};
+ let Inst{5} = Vm{4};
+}
+
+class N3Vnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+ bit op4, dag oops, dag iops,Format f, InstrItinClass itin,
+ string OpcodeStr, string Dt, list<dag> pattern>
+ : NeonInp<oops, iops, AddrModeNone, IndexModeNone, f, itin, OpcodeStr,
+ Dt, "$Vd, $Vn, $Vm", "", pattern> { // asm operands fixed to "$Vd, $Vn, $Vm"
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+
+ // Encode instruction operands (bit {4} of each register is encoded separately)
+ let Inst{22} = Vd{4};
+ let Inst{15-12} = Vd{3-0};
+ let Inst{19-16} = Vn{3-0};
+ let Inst{7} = Vn{4};
+ let Inst{5} = Vm{4};
+ let Inst{3-0} = Vm{3-0};
+
+ // Encode constant bits (Inst{31-28} is already set by NeonInp)
+ let Inst{27-23} = op27_23;
+ let Inst{21-20} = op21_20;
+ let Inst{11-8} = op11_8;
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+}
+
+class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+ bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr,
+ list<dag> pattern>
+ : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+ oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+
+ // Instruction operands. Only Vm{3-0} is encoded; Inst{5} carries the lane bit.
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+ bit lane;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{19-16} = Vn{3-0};
+ let Inst{7} = Vn{4};
+ let Inst{3-0} = Vm{3-0};
+ let Inst{5} = lane;
+}
+
+class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+ bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr,
+ list<dag> pattern>
+ : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+ oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+
+ // Instruction operands. Only Vm{2-0} is encoded; Inst{5} and Inst{3} carry lane.
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+ bits<2> lane;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{19-16} = Vn{3-0};
+ let Inst{7} = Vn{4};
+ let Inst{2-0} = Vm{2-0};
+ let Inst{5} = lane{1};
+ let Inst{3} = lane{0};
+}
+
+// Same as N3V except it doesn't have a data type suffix (NDataXI-based, no dt).
+class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+ bit op4,
+ dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> {
+ let Inst{24} = op24;
+ let Inst{23} = op23;
+ let Inst{21-20} = op21_20;
+ let Inst{11-8} = op11_8;
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+
+ // Instruction operands: three full 5-bit registers (bit {4} encoded separately).
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{19-16} = Vn{3-0};
+ let Inst{7} = Vn{4};
+ let Inst{3-0} = Vm{3-0};
+ let Inst{5} = Vm{4};
+}
+
+// NEON VMOVs between scalar and core registers; base of NVGetLane/NVSetLane/NVDup.
+class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+ dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string dt, string asm, list<dag> pattern>
+ : InstARM<AddrModeNone, 4, IndexModeNone, f, NeonDomain,
+ "", itin> {
+ let Inst{27-20} = opcod1;
+ let Inst{11-8} = opcod2;
+ let Inst{6-5} = opcod3;
+ let Inst{4} = 1;
+ // A8.6.303, A8.6.328, A8.6.329
+ let Inst{3-0} = 0b0000;
+
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ins pred:$p)); // pred:$p is appended to the inputs
+ let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm); // opc${p}.dt<TAB>asm
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasNEON];
+
+ let PostEncoderMethod = "NEONThumb2DupPostEncoder";
+ let DecoderNamespace = "NEONDup";
+
+ bits<5> V;
+ bits<4> R;
+ bits<4> p;
+ bits<4> lane; // declared only; this class does not assign it to any Inst bits
+
+ let Inst{31-28} = p{3-0};
+ let Inst{7} = V{4};
+ let Inst{19-16} = V{3-0};
+ let Inst{15-12} = R{3-0};
+}
+class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string dt, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NGetLnFrm, itin, // NVLaneOp with NGetLnFrm
+ opc, dt, asm, pattern>;
+class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string dt, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NSetLnFrm, itin, // NVLaneOp with NSetLnFrm
+ opc, dt, asm, pattern>;
+class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string dt, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NDupFrm, itin, // NVLaneOp with NDupFrm
+ opc, dt, asm, pattern>;
+
+// Vector Duplicate Lane (from scalar to all elements); NDataI with NVDupLnFrm.
+class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops,
+ InstrItinClass itin, string opc, string dt, string asm,
+ list<dag> pattern>
+ : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> {
+ let Inst{24-23} = 0b11;
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = op19_16;
+ let Inst{11-7} = 0b11000;
+ let Inst{6} = op6;
+ let Inst{4} = 0;
+
+ bits<5> Vd; // bit {4} -> Inst{22}, bits {3-0} -> Inst{15-12}
+ bits<5> Vm; // bit {4} -> Inst{5}, bits {3-0} -> Inst{3-0}
+
+ let Inst{22} = Vd{4};
+ let Inst{15-12} = Vd{3-0};
+ let Inst{5} = Vm{4};
+ let Inst{3-0} = Vm{3-0};
+}
+
+// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON
+// for single-precision FP (predicates HasNEON and UseNEONForFP).
+class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [HasNEON,UseNEONForFP];
+}
+
+// VFP/NEON Instruction aliases for type suffixes.
+// Note: When EmitPriority == 1, the alias will be used for printing
+class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result, bit EmitPriority = 0> :
+ InstAlias<!strconcat(opc, dt, "\t", asm), Result, EmitPriority>, Requires<[HasVFP2]>; // asm string is opc+dt, a tab, then asm
+
+// One alias per size suffix. When EmitPriority == 1, the alias is used for printing.
+multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result, bit EmitPriority = 0> {
+ def : VFPDataTypeInstAlias<opc, ".8", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".16", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".32", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".64", asm, Result, EmitPriority>;
+}
+
+// As VFPDTAnyInstAlias, wrapped in HasNEON. EmitPriority == 1 selects it for printing.
+multiclass NEONDTAnyInstAlias<string opc, string asm, dag Result, bit EmitPriority = 0> {
+ let Predicates = [HasNEON] in {
+ def : VFPDataTypeInstAlias<opc, ".8", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".16", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".32", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".64", asm, Result, EmitPriority>;
+}
+}
+
+// The same alias classes using AsmPseudoInst instead, for the more complex
+// stuff in NEON that InstAlias can't quite handle.
+// Note that we can't use anonymous defm references here like we can
+// above, as we care about the ultimate instruction enum names generated, unlike
+// for InstAlias defs.
+class NEONDataTypeAsmPseudoInst<string opc, string dt, string asm, dag iops> :
+ AsmPseudoInst<!strconcat(opc, dt, "\t", asm), iops>, Requires<[HasNEON]>;
+
+// Data type suffix token aliases. Implements Table A7-3 in the ARM ARM.
+def : TokenAlias<".s8", ".i8">;
+def : TokenAlias<".u8", ".i8">;
+def : TokenAlias<".s16", ".i16">;
+def : TokenAlias<".u16", ".i16">;
+def : TokenAlias<".s32", ".i32">;
+def : TokenAlias<".u32", ".i32">;
+def : TokenAlias<".s64", ".i64">;
+def : TokenAlias<".u64", ".i64">;
+// The .iN suffixes are in turn paired with the bare size suffix .N.
+def : TokenAlias<".i8", ".8">;
+def : TokenAlias<".i16", ".16">;
+def : TokenAlias<".i32", ".32">;
+def : TokenAlias<".i64", ".64">;
+// .p8 and .p16 are paired with the bare size suffix as well.
+def : TokenAlias<".p8", ".8">;
+def : TokenAlias<".p16", ".16">;
+// .f32/.f64 pair with the bare size suffix; .f and .d pair with .f32 and .f64.
+def : TokenAlias<".f32", ".32">;
+def : TokenAlias<".f64", ".64">;
+def : TokenAlias<".f", ".f32">;
+def : TokenAlias<".d", ".f64">;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
new file mode 100644
index 000000000000..27b64322dfa9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -0,0 +1,136 @@
+//===-- ARMInstrInfo.cpp - ARM Instruction Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstrInfo.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+using namespace llvm;
+
+ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) // STI is forwarded to the base class
+ : ARMBaseInstrInfo(STI), RI() {} // RI is value-initialized
+
+/// getNoopForMachoTarget - Fill in NopInst with the instruction to use for a noop.
+void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+ if (hasNOP()) { // HINT with immediate 0, then ARMCC::AL and register 0
+ NopInst.setOpcode(ARM::HINT);
+ NopInst.addOperand(MCOperand::createImm(0));
+ NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
+ NopInst.addOperand(MCOperand::createReg(0));
+ } else { // otherwise MOVr with R0 as both register operands
+ NopInst.setOpcode(ARM::MOVr);
+ NopInst.addOperand(MCOperand::createReg(ARM::R0));
+ NopInst.addOperand(MCOperand::createReg(ARM::R0));
+ NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
+ NopInst.addOperand(MCOperand::createReg(0));
+ NopInst.addOperand(MCOperand::createReg(0));
+ }
+}
+
+unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { // maps *_PRE/*_POST opcodes to a plain load/store
+ switch (Opc) {
+ default: // any opcode not listed below falls out to 'return 0'
+ break;
+ case ARM::LDR_PRE_IMM:
+ case ARM::LDR_PRE_REG:
+ case ARM::LDR_POST_IMM:
+ case ARM::LDR_POST_REG:
+ return ARM::LDRi12;
+ case ARM::LDRH_PRE:
+ case ARM::LDRH_POST:
+ return ARM::LDRH;
+ case ARM::LDRB_PRE_IMM:
+ case ARM::LDRB_PRE_REG:
+ case ARM::LDRB_POST_IMM:
+ case ARM::LDRB_POST_REG:
+ return ARM::LDRBi12;
+ case ARM::LDRSH_PRE:
+ case ARM::LDRSH_POST:
+ return ARM::LDRSH;
+ case ARM::LDRSB_PRE:
+ case ARM::LDRSB_POST:
+ return ARM::LDRSB;
+ case ARM::STR_PRE_IMM:
+ case ARM::STR_PRE_REG:
+ case ARM::STR_POST_IMM:
+ case ARM::STR_POST_REG:
+ return ARM::STRi12;
+ case ARM::STRH_PRE:
+ case ARM::STRH_POST:
+ return ARM::STRH;
+ case ARM::STRB_PRE_IMM:
+ case ARM::STRB_PRE_REG:
+ case ARM::STRB_POST_IMM:
+ case ARM::STRB_POST_REG:
+ return ARM::STRBi12;
+ }
+
+ return 0; // no mapping for Opc
+}
+
+void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
+ const TargetMachine &TM = MF.getTarget();
+ // No MOVT: pick the LDRLIT_ga_* opcode by position independence, then stop.
+ if (!Subtarget.useMovt(MF)) {
+ if (TM.isPositionIndependent())
+ expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12);
+ else
+ expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12);
+ return;
+ }
+ // MOVT usable and not position independent: MOVi32imm form.
+ if (!TM.isPositionIndependent()) {
+ expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12);
+ return;
+ }
+ // Position independent: the global is read from MI's first memory operand.
+ const GlobalValue *GV =
+ cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+ // Globals that are not indirect symbols take the MOV_ga_pcrel form.
+ if (!Subtarget.isGVIndirectSymbol(GV)) {
+ expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12);
+ return;
+ }
+ // Indirect symbol: build MOV_ga_pcrel_ldr into Reg, then an LDRi12 through Reg.
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Reg = MI->getOperand(0).getReg();
+ MachineInstrBuilder MIB;
+
+ MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
+ .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
+ auto Flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant;
+ MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
+ MIB.addMemOperand(MMO); // attaches the GOT memory operand to the first instruction
+ MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
+ MIB.addReg(Reg, RegState::Kill).addImm(0); // load from [Reg, #0] back into Reg
+ MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); // reuse MI's memory operands
+ AddDefaultPred(MIB);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
new file mode 100644
index 000000000000..4b1b7097b18d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -0,0 +1,47 @@
+//===-- ARMInstrInfo.h - ARM Instruction Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMINSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMINSTRINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "ARMRegisterInfo.h"
+
+namespace llvm {
+ class ARMSubtarget;
+
+// ARM implementation of the TargetInstrInfo interface, layered on top of
+// ARMBaseInstrInfo. Owns the ARMRegisterInfo returned by getRegisterInfo().
+class ARMInstrInfo : public ARMBaseInstrInfo {
+ ARMRegisterInfo RI;
+public:
+ explicit ARMInstrInfo(const ARMSubtarget &STI);
+
+ /// getNoopForMachoTarget - Fill in NopInst with the instruction to use as
+ /// a noop.
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of the register info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
+
+private:
+ // Expands the stack-guard load pseudo at MI (see the definition in
+ // ARMInstrInfo.cpp for the opcode selection).
+ void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
new file mode 100644
index 000000000000..c47393990e97
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -0,0 +1,5857 @@
+//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM specific DAG Nodes.
+//
+
+// Type profiles.
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
+def SDT_ARMStructByVal : SDTypeProfile<0, 4,
+ [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
+
+def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+
+def SDT_ARMCMov : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisVT<3, i32>]>;
+
+def SDT_ARMBrcond : SDTypeProfile<0, 2,
+ [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+
+def SDT_ARMBrJT : SDTypeProfile<0, 2,
+ [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def SDT_ARMBr2JT : SDTypeProfile<0, 3,
+ [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+
+def SDT_ARMBCC_i64 : SDTypeProfile<0, 6,
+ [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>, SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>, SDTCisVT<4, i32>,
+ SDTCisVT<5, OtherVT>]>;
+
+def SDT_ARMAnd : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+
+def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
+
+def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
+ SDTCisInt<2>]>;
+def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
+def SDT_ARMEH_SJLJ_SetupDispatch: SDTypeProfile<0, 0, []>;
+
+def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
+ SDTCisInt<1>]>;
+
+def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def SDT_ARMMEMCPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<4, i32>]>;
+
+// Node definitions.
+def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
+def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
+def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntUnaryOp>;
+
+def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd,
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPOptInGlue, SDNPOutGlue]>;
+def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
+ SDT_ARMStructByVal,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad]>;
+
+def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
+def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
+ [SDNPInGlue]>;
+
+def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
+
+def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+
+def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
+ [SDNPHasChain]>;
+def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
+ [SDNPHasChain]>;
+
+def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64,
+ [SDNPHasChain]>;
+
+def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp,
+ [SDNPOutGlue]>;
+
+def ARMcmn : SDNode<"ARMISD::CMN", SDT_ARMCmp,
+ [SDNPOutGlue]>;
+
+def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
+ [SDNPOutGlue, SDNPCommutative]>;
+
+def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
+
+def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>;
+
+def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>;
+def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>;
+def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>;
+
+def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
+def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
+ SDT_ARMEH_SJLJ_Setjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
+def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
+ SDT_ARMEH_SJLJ_Longjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
+def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH",
+ SDT_ARMEH_SJLJ_SetupDispatch,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
+ [SDNPHasChain, SDNPSideEffect]>;
+def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+
+def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
+
+def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad]>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasV4T : Predicate<"Subtarget->hasV4TOps()">,
+ AssemblerPredicate<"HasV4TOps", "armv4t">;
+def NoV4T : Predicate<"!Subtarget->hasV4TOps()">;
+def HasV5T : Predicate<"Subtarget->hasV5TOps()">,
+ AssemblerPredicate<"HasV5TOps", "armv5t">;
+def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
+ AssemblerPredicate<"HasV5TEOps", "armv5te">;
+def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
+ AssemblerPredicate<"HasV6Ops", "armv6">;
+def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6M : Predicate<"Subtarget->hasV6MOps()">,
+ AssemblerPredicate<"HasV6MOps",
+ "armv6m or armv6t2">;
+def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">,
+ AssemblerPredicate<"HasV8MBaselineOps",
+ "armv8m.base">;
+def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">,
+ AssemblerPredicate<"HasV8MMainlineOps",
+ "armv8m.main">;
+def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
+ AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
+def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
+def HasV6K : Predicate<"Subtarget->hasV6KOps()">,
+ AssemblerPredicate<"HasV6KOps", "armv6k">;
+def NoV6K : Predicate<"!Subtarget->hasV6KOps()">;
+def HasV7 : Predicate<"Subtarget->hasV7Ops()">,
+ AssemblerPredicate<"HasV7Ops", "armv7">;
+def HasV8 : Predicate<"Subtarget->hasV8Ops()">,
+ AssemblerPredicate<"HasV8Ops", "armv8">;
+def PreV8 : Predicate<"!Subtarget->hasV8Ops()">,
+ AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
+def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
+ AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
+ AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+def NoVFP : Predicate<"!Subtarget->hasVFP2()">;
+def HasVFP2 : Predicate<"Subtarget->hasVFP2()">,
+ AssemblerPredicate<"FeatureVFP2", "VFP2">;
+def HasVFP3 : Predicate<"Subtarget->hasVFP3()">,
+ AssemblerPredicate<"FeatureVFP3", "VFP3">;
+def HasVFP4 : Predicate<"Subtarget->hasVFP4()">,
+ AssemblerPredicate<"FeatureVFP4", "VFP4">;
+def HasDPVFP : Predicate<"!Subtarget->isFPOnlySP()">,
+ AssemblerPredicate<"!FeatureVFPOnlySP",
+ "double precision VFP">;
+def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
+ AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
+def HasNEON : Predicate<"Subtarget->hasNEON()">,
+ AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
+ AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC : Predicate<"Subtarget->hasCRC()">,
+ AssemblerPredicate<"FeatureCRC", "crc">;
+def HasRAS : Predicate<"Subtarget->hasRAS()">,
+ AssemblerPredicate<"FeatureRAS", "ras">;
+def HasFP16 : Predicate<"Subtarget->hasFP16()">,
+ AssemblerPredicate<"FeatureFP16","half-float conversions">;
+def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
+ AssemblerPredicate<"FeatureFullFP16","full half-float">;
+def HasDivide : Predicate<"Subtarget->hasDivide()">,
+ AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
+def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
+ AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
+def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
+ AssemblerPredicate<"FeatureT2XtPk",
+ "pack/extract">;
+def HasDSP : Predicate<"Subtarget->hasDSP()">,
+ AssemblerPredicate<"FeatureDSP", "dsp">;
+def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
+ AssemblerPredicate<"FeatureDB",
+ "data-barriers">;
+def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
+ AssemblerPredicate<"FeatureV7Clrex",
+ "v7 clrex">;
+def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
+ AssemblerPredicate<"FeatureAcquireRelease",
+ "acquire/release">;
+def HasMP : Predicate<"Subtarget->hasMPExtension()">,
+ AssemblerPredicate<"FeatureMP",
+ "mp-extensions">;
+def HasVirtualization: Predicate<"false">,
+ AssemblerPredicate<"FeatureVirtualization",
+ "virtualization-extensions">;
+def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
+ AssemblerPredicate<"FeatureTrustZone",
+ "TrustZone">;
+def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">,
+ AssemblerPredicate<"Feature8MSecExt",
+ "ARMv8-M Security Extensions">;
+def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
+def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
+def IsThumb : Predicate<"Subtarget->isThumb()">,
+ AssemblerPredicate<"ModeThumb", "thumb">;
+def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
+def IsThumb2 : Predicate<"Subtarget->isThumb2()">,
+ AssemblerPredicate<"ModeThumb,FeatureThumb2",
+ "thumb2">;
+def IsMClass : Predicate<"Subtarget->isMClass()">,
+ AssemblerPredicate<"FeatureMClass", "armv*m">;
+def IsNotMClass : Predicate<"!Subtarget->isMClass()">,
+ AssemblerPredicate<"!FeatureMClass",
+ "!armv*m">;
+def IsARM : Predicate<"!Subtarget->isThumb()">,
+ AssemblerPredicate<"!ModeThumb", "arm-mode">;
+def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
+def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
+def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
+def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
+def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">;
+def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
+ AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
+def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
+
+// FIXME: Eventually this will be just "hasV6T2Ops".
+def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
+def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
+def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
+def UseMulOps : Predicate<"Subtarget->useMulOps()">;
+
+// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
+// But only select them if more precision in FP computation is allowed, i.e.
+// when TM.Options.AllowFPOpFusion is FPOpFusion::Fast and the subtarget has
+// VFP4.
+// Do not use them for Darwin platforms.
+def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
+ " FPOpFusion::Fast && "
+ " Subtarget->hasVFP4()) && "
+ "!Subtarget->isTargetDarwin()">;
+// Exact logical negation of UseFusedMAC: true when fusion is not enabled,
+// VFP4 is missing, or the target is Darwin.
+def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion =="
+ " FPOpFusion::Fast &&"
+ " Subtarget->hasVFP4()) || "
+ "Subtarget->isTargetDarwin()">;
+
+def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
+
+def HasFastVDUP32 : Predicate<"!Subtarget->hasSlowVDUP32()">;
+def HasSlowVDUP32 : Predicate<"Subtarget->hasSlowVDUP32()">;
+
+def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||"
+ "!Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&"
+ "Subtarget->useNEONForSinglePrecisionFP()">;
+
+def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
+def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
+
+def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
+
+//===----------------------------------------------------------------------===//
+// ARM Flag Definitions.
+
+class RegConstraint<string C> {
+ string Constraints = C;
+}
+
+//===----------------------------------------------------------------------===//
+// ARM specific transformation functions and pattern fragments.
+//
+
+// imm_neg_XFORM - Return the negation of an i32 immediate value.
+def imm_neg_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(-(int)N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+// imm_not_XFORM - Return the complement of a i32 immediate value.
+def imm_not_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
+def imm16_31 : ImmLeaf<i32, [{
+ return (int32_t)Imm >= 16 && (int32_t)Imm < 32;
+}]>;
+
+// sext_16_node predicate - True if the SDNode is known to be sign-extended
+// from 16 bits: either the DAG reports at least 17 sign bits, or the node is
+// literally (sra (shl x, 16), 16).
+def sext_16_node : PatLeaf<(i32 GPR:$a), [{
+  // Fast path: enough known sign bits.
+  if (CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17)
+    return true;
+
+  // Helper: is this shift amount the constant 16?
+  auto IsConst16 = [](SDValue Amt) {
+    auto *C = dyn_cast<ConstantSDNode>(Amt);
+    return C && C->getZExtValue() == 16;
+  };
+
+  // Slow path: match the explicit shift-left / arithmetic-shift-right pair.
+  if (N->getOpcode() != ISD::SRA)
+    return false;
+  SDValue Inner = N->getOperand(0);
+  return Inner.getOpcode() == ISD::SHL &&
+         IsConst16(N->getOperand(1)) &&
+         IsConst16(Inner.getOperand(1));
+}]>;
+
+/// Split a 32-bit immediate into two 16 bit parts.
+def hi16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, SDLoc(N),
+ MVT::i32);
+}]>;
+
+def lo16AllZero : PatLeaf<(i32 imm), [{
+ // Returns true if all low 16-bits are 0.
+ return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
+}], hi16>;
+
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+
+// An 'xor' node with a single use.
+def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+
+// An 'fmul' node with a single use.
+def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
+ return N->hasOneUse();
+}]>;
+
+// An 'fadd' node which checks for single non-hazardous use.
+def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
+ return hasNoVMLxHazardUse(N);
+}]>;
+
+// An 'fsub' node which checks for single non-hazardous use.
+def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
+ return hasNoVMLxHazardUse(N);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// Immediate operands with a shared generic asm render method.
+class ImmAsmOperand : AsmOperandClass { let RenderMethod = "addImmOperands"; }
+
+// Operands that are part of a memory addressing mode.
+class MemOperand : Operand<i32> { let OperandType = "OPERAND_MEMORY"; }
+
+// Branch target.
+// FIXME: rename brtarget to t2_brtarget
+def brtarget : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValue";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeT2BROperand";
+}
+
+// Branches targeting ARM-mode must be divisible by 4 if they're a raw
+// immediate.
+def ARMBranchTarget : AsmOperandClass {
+ let Name = "ARMBranchTarget";
+}
+
+// Branches targeting Thumb-mode must be divisible by 2 if they're a raw
+// immediate.
+def ThumbBranchTarget : AsmOperandClass {
+ let Name = "ThumbBranchTarget";
+}
+
+def arm_br_target : Operand<OtherVT> {
+ let ParserMatchClass = ARMBranchTarget;
+ let EncoderMethod = "getARMBranchTargetOpValue";
+ let OperandType = "OPERAND_PCREL";
+}
+
+// Call target for ARM. Handles conditional/unconditional
+// FIXME: rename bl_target to t2_bltarget?
+def arm_bl_target : Operand<i32> {
+ let ParserMatchClass = ARMBranchTarget;
+ let EncoderMethod = "getARMBLTargetOpValue";
+ let OperandType = "OPERAND_PCREL";
+}
+
+// Target for BLX *from* ARM mode.
+def arm_blx_target : Operand<i32> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getARMBLXTargetOpValue";
+ let OperandType = "OPERAND_PCREL";
+}
+
+// A list of registers separated by comma. Used by load/store multiple.
+def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
+def reglist : Operand<i32> {
+ let EncoderMethod = "getRegisterListOpValue";
+ let ParserMatchClass = RegListAsmOperand;
+ let PrintMethod = "printRegisterList";
+ let DecoderMethod = "DecodeRegListOperand";
+}
+
+def GPRPairOp : RegisterOperand<GPRPair, "printGPRPairOperand">;
+
+def DPRRegListAsmOperand : AsmOperandClass { let Name = "DPRRegList"; }
+def dpr_reglist : Operand<i32> {
+ let EncoderMethod = "getRegisterListOpValue";
+ let ParserMatchClass = DPRRegListAsmOperand;
+ let PrintMethod = "printRegisterList";
+ let DecoderMethod = "DecodeDPRRegListOperand";
+}
+
+def SPRRegListAsmOperand : AsmOperandClass { let Name = "SPRRegList"; }
+def spr_reglist : Operand<i32> {
+ let EncoderMethod = "getRegisterListOpValue";
+ let ParserMatchClass = SPRRegListAsmOperand;
+ let PrintMethod = "printRegisterList";
+ let DecoderMethod = "DecodeSPRRegListOperand";
+}
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+ let PrintMethod = "printCPInstOperand";
+}
+
+// Local PC labels.
+def pclabel : Operand<i32> {
+ let PrintMethod = "printPCLabel";
+}
+
+// ADR instruction labels.
+def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; }
+def adrlabel : Operand<i32> {
+ let EncoderMethod = "getAdrLabelOpValue";
+ let ParserMatchClass = AdrLabelAsmOperand;
+ let PrintMethod = "printAdrLabelOperand<0>";
+}
+
+def neon_vcvt_imm32 : Operand<i32> {
+ let EncoderMethod = "getNEONVcvtImm32OpValue";
+ let DecoderMethod = "DecodeVCVTImmOperand";
+}
+
+// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
+def rot_imm_XFORM: SDNodeXForm<imm, [{
+  // Translate a rotation of 0, 8, 16 or 24 bits into the field value
+  // 0, 1, 2 or 3 (rotation / 8). Any other amount is a selection bug.
+  const uint64_t Rot = N->getZExtValue();
+  switch (Rot) {
+  case 0:
+  case 8:
+  case 16:
+  case 24:
+    return CurDAG->getTargetConstant(Rot / 8, SDLoc(N), MVT::i32);
+  default:
+    llvm_unreachable(nullptr);
+  }
+}]>;
+def RotImmAsmOperand : AsmOperandClass {
+ let Name = "RotImm";
+ let ParserMethod = "parseRotImm";
+}
+// Rotate-amount operand: as a pattern leaf it matches only the constants
+// 8, 16 and 24, and is rewritten through rot_imm_XFORM.
+def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
+    switch ((int32_t)N->getZExtValue()) {
+    case 8:
+    case 16:
+    case 24:
+      return true;
+    default:
+      return false;
+    } }],
+    rot_imm_XFORM> {
+  let PrintMethod = "printRotImmOperand";
+  let ParserMatchClass = RotImmAsmOperand;
+}
+
+// shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+// {5} 0 ==> lsl
+// 1 asr
+// {4-0} imm5 shift amount.
+// asr #32 encoded as imm5 == 0.
+def ShifterImmAsmOperand : AsmOperandClass {
+ let Name = "ShifterImm";
+ let ParserMethod = "parseShifterImm";
+}
+def shift_imm : Operand<i32> {
+ let PrintMethod = "printShiftImmOperand";
+ let ParserMatchClass = ShifterImmAsmOperand;
+}
+
+// shifter_operand operands: so_reg_reg, so_reg_imm, and mod_imm.
+def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
+def so_reg_reg : Operand<i32>, // reg reg imm
+ ComplexPattern<i32, 3, "SelectRegShifterOperand",
+ [shl, srl, sra, rotr]> {
+ let EncoderMethod = "getSORegRegOpValue";
+ let PrintMethod = "printSORegRegOperand";
+ let DecoderMethod = "DecodeSORegRegOperand";
+ let ParserMatchClass = ShiftedRegAsmOperand;
+ let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm);
+}
+
+def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; }
+def so_reg_imm : Operand<i32>, // reg imm
+ ComplexPattern<i32, 2, "SelectImmShifterOperand",
+ [shl, srl, sra, rotr]> {
+ let EncoderMethod = "getSORegImmOpValue";
+ let PrintMethod = "printSORegImmOperand";
+ let DecoderMethod = "DecodeSORegImmOperand";
+ let ParserMatchClass = ShiftedImmAsmOperand;
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_reg : Operand<i32>, // reg reg imm
+ ComplexPattern<i32, 3, "SelectShiftRegShifterOperand",
+ [shl,srl,sra,rotr]> {
+ let EncoderMethod = "getSORegRegOpValue";
+ let PrintMethod = "printSORegRegOperand";
+ let DecoderMethod = "DecodeSORegRegOperand";
+ let ParserMatchClass = ShiftedRegAsmOperand;
+ let MIOperandInfo = (ops GPR, GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+// Register shifted by an immediate: a two-result ComplexPattern whose machine
+// operands are (GPR, i32imm).
+def shift_so_reg_imm : Operand<i32>, // reg imm
+ ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
+ [shl,srl,sra,rotr]> {
+ let EncoderMethod = "getSORegImmOpValue";
+ let PrintMethod = "printSORegImmOperand";
+ let DecoderMethod = "DecodeSORegImmOperand";
+ let ParserMatchClass = ShiftedImmAsmOperand;
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// mod_imm: match a 32-bit immediate operand, which can be encoded into
+// a 12-bit immediate; an 8-bit integer and a 4-bit rotator (See ARMARM
+// - "Modified Immediate Constants"). Within the MC layer we keep this
+// immediate in its encoded form.
+def ModImmAsmOperand: AsmOperandClass {
+ let Name = "ModImm";
+ let ParserMethod = "parseModImm";
+}
+def mod_imm : Operand<i32>, ImmLeaf<i32, [{
+ return ARM_AM::getSOImmVal(Imm) != -1;
+ }]> {
+ let EncoderMethod = "getModImmOpValue";
+ let PrintMethod = "printModImmOperand";
+ let ParserMatchClass = ModImmAsmOperand;
+}
+
+// Note: the patterns mod_imm_not and mod_imm_neg do not require an encoder
+// method and such, as they are only used on aliases (Pat<> and InstAlias<>).
+// The actual parsing, encoding, decoding are handled by the destination
+// instructions, which use mod_imm.
+
+def ModImmNotAsmOperand : AsmOperandClass { let Name = "ModImmNot"; }
+def mod_imm_not : Operand<i32>, PatLeaf<(imm), [{
+ return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
+ }], imm_not_XFORM> {
+ let ParserMatchClass = ModImmNotAsmOperand;
+}
+
+def ModImmNegAsmOperand : AsmOperandClass { let Name = "ModImmNeg"; }
+def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+ unsigned Value = -(unsigned)N->getZExtValue();
+ return Value && ARM_AM::getSOImmVal(Value) != -1;
+ }], imm_neg_XFORM> {
+ let ParserMatchClass = ModImmNegAsmOperand;
+}
+
+/// arm_i32imm - True when the subtarget uses MOVT for this function, or
+/// otherwise when ARM_AM::isSOImmTwoPartVal() accepts the immediate.
+def arm_i32imm : PatLeaf<(imm), [{
+  return Subtarget->useMovt(*MF) ||
+         ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
+}]>;
+
+/// imm0_1 predicate - Immediate in the range [0,1].
+def Imm0_1AsmOperand: ImmAsmOperand { let Name = "Imm0_1"; }
+def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
+
+/// imm0_3 predicate - Immediate in the range [0,3].
+def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
+def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
+
+/// imm0_7 predicate - Immediate in the range [0,7].
+def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
+def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 0 && Imm < 8;
+}]> {
+ let ParserMatchClass = Imm0_7AsmOperand;
+}
+
+/// imm8 predicate - Immediate is exactly 8.
+def Imm8AsmOperand: ImmAsmOperand { let Name = "Imm8"; }
+def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
+ let ParserMatchClass = Imm8AsmOperand;
+}
+
+/// imm16 predicate - Immediate is exactly 16.
+def Imm16AsmOperand: ImmAsmOperand { let Name = "Imm16"; }
+def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
+ let ParserMatchClass = Imm16AsmOperand;
+}
+
+/// imm32 predicate - Immediate is exactly 32.
+def Imm32AsmOperand: ImmAsmOperand { let Name = "Imm32"; }
+def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
+ let ParserMatchClass = Imm32AsmOperand;
+}
+
+def imm8_or_16 : ImmLeaf<i32, [{ return Imm == 8 || Imm == 16;}]>;
+
+/// imm1_7 predicate - Immediate in the range [1,7].
+def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; }
+def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
+ let ParserMatchClass = Imm1_7AsmOperand;
+}
+
+/// imm1_15 predicate - Immediate in the range [1,15].
+def Imm1_15AsmOperand: ImmAsmOperand { let Name = "Imm1_15"; }
+def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
+ let ParserMatchClass = Imm1_15AsmOperand;
+}
+
+/// imm1_31 predicate - Immediate in the range [1,31].
+def Imm1_31AsmOperand: ImmAsmOperand { let Name = "Imm1_31"; }
+def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
+ let ParserMatchClass = Imm1_31AsmOperand;
+}
+
+/// imm0_15 predicate - Immediate in the range [0,15].
+def Imm0_15AsmOperand: ImmAsmOperand {
+ let Name = "Imm0_15";
+ let DiagnosticType = "ImmRange0_15";
+}
+def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 0 && Imm < 16;
+}]> {
+ let ParserMatchClass = Imm0_15AsmOperand;
+}
+
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def Imm0_31AsmOperand: ImmAsmOperand { let Name = "Imm0_31"; }
+def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 0 && Imm < 32;
+}]> {
+ let ParserMatchClass = Imm0_31AsmOperand;
+}
+
+/// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
+/// Both bounds are inclusive: unlike imm0_31, the value 32 itself is accepted.
+def Imm0_32AsmOperand: ImmAsmOperand { let Name = "Imm0_32"; }
+def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm <= 32;
+}]> {
+  let ParserMatchClass = Imm0_32AsmOperand;
+}
+
+/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
+def Imm0_63AsmOperand: ImmAsmOperand { let Name = "Imm0_63"; }
+def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 0 && Imm < 64;
+}]> {
+ let ParserMatchClass = Imm0_63AsmOperand;
+}
+
+/// imm0_239 predicate - Immediate in the range [0,239].
+def Imm0_239AsmOperand : ImmAsmOperand {
+ let Name = "Imm0_239";
+ let DiagnosticType = "ImmRange0_239";
+}
+def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
+ let ParserMatchClass = Imm0_239AsmOperand;
+}
+
+/// imm0_255 predicate - Immediate in the range [0,255].
+def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
+ let ParserMatchClass = Imm0_255AsmOperand;
+}
+
+/// imm0_65535 - An immediate in the range [0,65535].
+def Imm0_65535AsmOperand: ImmAsmOperand { let Name = "Imm0_65535"; }
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 0 && Imm < 65536;
+}]> {
+ let ParserMatchClass = Imm0_65535AsmOperand;
+}
+
+// imm0_65535_neg - An immediate whose negated value is in the range
+// [0,65535].
+def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
+ return -Imm >= 0 && -Imm < 65536;
+}]>;
+
+// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
+// a relocatable expression.
+//
+// FIXME: This really needs a Thumb version separate from the ARM version.
+// While the range is the same, and can thus use the same match class,
+// the encoding is different so it should have a different encoder method.
+def Imm0_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm0_65535Expr"; }
+def imm0_65535_expr : Operand<i32> {
+ let EncoderMethod = "getHiLo16ImmOpValue";
+ let ParserMatchClass = Imm0_65535ExprAsmOperand;
+}
+
+def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def imm256_65535_expr : Operand<i32> {
+ let ParserMatchClass = Imm256_65535ExprAsmOperand;
+}
+
+/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
+def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
+def imm24b : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 0 && Imm <= 0xffffff;
+}]> {
+ let ParserMatchClass = Imm24bitAsmOperand;
+}
+
+
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
+/// e.g., 0xf000ffff
+def BitfieldAsmOperand : AsmOperandClass {
+ let Name = "Bitfield";
+ let ParserMethod = "parseBitfield";
+}
+
+def bf_inv_mask_imm : Operand<i32>,
+ PatLeaf<(imm), [{
+ return ARM::isBitFieldInvertedMask(N->getZExtValue());
+}] > {
+ let EncoderMethod = "getBitfieldInvertedMaskOpValue";
+ let PrintMethod = "printBitfieldInvMaskImmOperand";
+ let DecoderMethod = "DecodeBitfieldMaskOperand";
+ let ParserMatchClass = BitfieldAsmOperand;
+}
+
+def imm1_32_XFORM: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
+ MVT::i32);
+}]>; // Transform: emits the matched constant minus one as an i32 target constant.
+def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
+ uint64_t Imm = N->getZExtValue();
+ return Imm > 0 && Imm <= 32;
+ }],
+ imm1_32_XFORM> { // Accepts constants in [1,32]; imm1_32_XFORM then stores value - 1.
+ let PrintMethod = "printImmPlusOneOperand";
+ let ParserMatchClass = Imm1_32AsmOperand;
+}
+
+// imm1_16 - An immediate in the range [1,16]. When selected, imm1_16_XFORM
+// rewrites it to the value minus one; it is printed via printImmPlusOneOperand.
+def imm1_16_XFORM: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
+ MVT::i32);
+}]>;
+def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
+def imm1_16 : Operand<i32>, PatLeaf<(imm), [{
+ // A PatLeaf predicate is only handed the node N (ImmLeaf is what provides an
+ // implicit Imm), so read the constant from the node as imm1_32 does.
+ uint64_t Imm = N->getZExtValue();
+ return Imm > 0 && Imm <= 16;
+ }],
+ imm1_16_XFORM> {
+ let PrintMethod = "printImmPlusOneOperand";
+ let ParserMatchClass = Imm1_16AsmOperand;
+}
+
+// Define ARM specific addressing modes.
+// addrmode_imm12 := reg +/- imm12
+//
+def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; }
+class AddrMode_Imm12 : MemOperand, // shared base for addrmode_imm12 and addrmode_imm12_pre
+ ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
+ // 12-bit immediate operand. Instructions using this operand encode #0 and
+ // #-0 differently, so #-0 is represented by the magic value INT32_MIN. All
+ // other immediate values are represented as themselves.
+
+ let EncoderMethod = "getAddrModeImm12OpValue";
+ let DecoderMethod = "DecodeAddrModeImm12Operand";
+ let ParserMatchClass = MemImm12OffsetAsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); // base register + immediate offset
+}
+
+def addrmode_imm12 : AddrMode_Imm12 { // differs from addrmode_imm12_pre only in the printer's template argument
+ let PrintMethod = "printAddrModeImm12Operand<false>";
+}
+
+def addrmode_imm12_pre : AddrMode_Imm12 { // NOTE(review): presumably the pre-indexed print form — confirm in the printer
+ let PrintMethod = "printAddrModeImm12Operand<true>";
+}
+
+// ldst_so_reg := reg +/- reg shop imm
+//
+// Memory operand made of three machine operands: base register, offset
+// register and shift immediate.
+def MemRegOffsetAsmOperand : AsmOperandClass {
+ let Name = "MemRegOffset";
+}
+def ldst_so_reg
+ : MemOperand, ComplexPattern<i32, 3, "SelectLdStSOReg", []> {
+ let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift);
+ let ParserMatchClass = MemRegOffsetAsmOperand;
+ let EncoderMethod = "getLdStSORegOpValue";
+ let DecoderMethod = "DecodeSORegMemOperand";
+ // FIXME: Simplify the printer
+ let PrintMethod = "printAddrMode2Operand";
+}
+
+// postidx_imm8 := +/- [0,255]
+//
+// 9 bit value:
+// {8} 1 if imm8 is non-negative. 0 otherwise.
+// {7-0} [0,255] imm8 value.
+def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
+def postidx_imm8 : MemOperand {
+ let PrintMethod = "printPostIdxImm8Operand";
+ let ParserMatchClass = PostIdxImm8AsmOperand;
+ let MIOperandInfo = (ops i32imm); // single immediate machine operand
+}
+
+// postidx_imm8s4 := +/- [0,1020]
+//
+// 9 bit value:
+// {8} 1 if imm8 is non-negative. 0 otherwise.
+// {7-0} [0,255] imm8 value, scaled by 4.
+def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; }
+def postidx_imm8s4 : MemOperand {
+ let PrintMethod = "printPostIdxImm8s4Operand";
+ let ParserMatchClass = PostIdxImm8s4AsmOperand;
+ let MIOperandInfo = (ops i32imm); // single immediate machine operand
+}
+
+
+// postidx_reg := +/- reg
+//
+// Carried as two machine operands: a GPRnopc register and an i32 immediate.
+def PostIdxRegAsmOperand : AsmOperandClass {
+ let ParserMethod = "parsePostIdxReg";
+ let Name = "PostIdxReg";
+}
+def postidx_reg : MemOperand {
+ let MIOperandInfo = (ops GPRnopc, i32imm);
+ let ParserMatchClass = PostIdxRegAsmOperand;
+ let PrintMethod = "printPostIdxRegOperand";
+ let EncoderMethod = "getPostIdxRegOpValue";
+ let DecoderMethod = "DecodePostIdxReg";
+}
+
+
+// addrmode2 := reg +/- imm12
+// := reg +/- reg shop imm
+//
+// FIXME: Finish refactoring addrmode2 so that every user goes through the
+// explicit immediate and register operands defined above (addrmode_imm12 and
+// ldst_so_reg).
+def AddrMode2AsmOperand : AsmOperandClass {
+ let Name = "AddrMode2";
+}
+def addrmode2
+ : MemOperand, ComplexPattern<i32, 3, "SelectAddrMode2", []> {
+ let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+ let ParserMatchClass = AddrMode2AsmOperand;
+ let PrintMethod = "printAddrMode2Operand";
+ let EncoderMethod = "getAddrMode2OpValue";
+}
+
+def PostIdxRegShiftedAsmOperand : AsmOperandClass {
+ let Name = "PostIdxRegShifted";
+ let ParserMethod = "parsePostIdxReg"; // same parser method as PostIdxRegAsmOperand
+}
+def am2offset_reg : MemOperand, // register-offset variant; shares encoder/printer with am2offset_imm
+ ComplexPattern<i32, 2, "SelectAddrMode2OffsetReg",
+ [], [SDNPWantRoot]> {
+ let EncoderMethod = "getAddrMode2OffsetOpValue";
+ let PrintMethod = "printAddrMode2OffsetOperand";
+ // When using this for assembly, it's always as a post-index offset.
+ let ParserMatchClass = PostIdxRegShiftedAsmOperand;
+ let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+// FIXME: am2offset_imm should only need the immediate, not the GPR. Having
+// the GPR is purely vestigial at this point.
+def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; }
+def am2offset_imm : MemOperand, // immediate-offset variant; shares encoder/printer with am2offset_reg
+ ComplexPattern<i32, 2, "SelectAddrMode2OffsetImm",
+ [], [SDNPWantRoot]> {
+ let EncoderMethod = "getAddrMode2OffsetOpValue";
+ let PrintMethod = "printAddrMode2OffsetOperand";
+ let ParserMatchClass = AM2OffsetImmAsmOperand;
+ let MIOperandInfo = (ops GPRnopc, i32imm); // the GPRnopc slot is the vestigial register noted above
+}
+
+
+// addrmode3 := reg +/- reg
+// addrmode3 := reg +/- imm8
+//
+// AddrMode3 holds everything common to the two printable variants below,
+// which differ only in the template argument given to the printer.
+//
+// FIXME: split into imm vs. reg versions.
+def AddrMode3AsmOperand : AsmOperandClass {
+ let Name = "AddrMode3";
+}
+class AddrMode3
+ : MemOperand, ComplexPattern<i32, 3, "SelectAddrMode3", []> {
+ let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+ let ParserMatchClass = AddrMode3AsmOperand;
+ let EncoderMethod = "getAddrMode3OpValue";
+}
+
+def addrmode3 : AddrMode3 {
+ let PrintMethod = "printAddrMode3Operand<false>";
+}
+
+def addrmode3_pre : AddrMode3 {
+ let PrintMethod = "printAddrMode3Operand<true>";
+}
+
+// am3offset - Offset operand selected by SelectAddrMode3Offset, carried as a
+// GPR plus an i32 immediate.
+//
+// FIXME: split into imm vs. reg versions.
+// FIXME: parser method to handle +/- register.
+def AM3OffsetAsmOperand : AsmOperandClass {
+ let ParserMethod = "parseAM3Offset";
+ let Name = "AM3Offset";
+}
+def am3offset
+ : MemOperand,
+ ComplexPattern<i32, 2, "SelectAddrMode3Offset", [], [SDNPWantRoot]> {
+ let MIOperandInfo = (ops GPR, i32imm);
+ let ParserMatchClass = AM3OffsetAsmOperand;
+ let PrintMethod = "printAddrMode3OffsetOperand";
+ let EncoderMethod = "getAddrMode3OffsetOpValue";
+}
+
+// ldstm_mode := {ia, ib, da, db}
+//
+def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> { // optional operand; default value is (i32 1)
+ let EncoderMethod = "getLdStmModeOpValue";
+ let PrintMethod = "printLdStmModeOperand";
+}
+
+// addrmode5 := reg +/- imm8*4
+//
+// AddrMode5 holds everything common to the two printable variants below,
+// which differ only in the template argument given to the printer.
+def AddrMode5AsmOperand : AsmOperandClass {
+ let Name = "AddrMode5";
+}
+class AddrMode5
+ : MemOperand, ComplexPattern<i32, 2, "SelectAddrMode5", []> {
+ let MIOperandInfo = (ops GPR:$base, i32imm);
+ let ParserMatchClass = AddrMode5AsmOperand;
+ let EncoderMethod = "getAddrMode5OpValue";
+ let DecoderMethod = "DecodeAddrMode5Operand";
+}
+
+def addrmode5 : AddrMode5 {
+ let PrintMethod = "printAddrMode5Operand<false>";
+}
+
+def addrmode5_pre : AddrMode5 {
+ let PrintMethod = "printAddrMode5Operand<true>";
+}
+
+// addrmode5fp16 := reg +/- imm8*2
+//
+def AddrMode5FP16AsmOperand : AsmOperandClass { let Name = "AddrMode5FP16"; }
+class AddrMode5FP16 : Operand<i32>, // NOTE(review): sibling AddrMode5 derives from MemOperand — confirm Operand<i32> is intended here
+ ComplexPattern<i32, 2, "SelectAddrMode5FP16", []> {
+ let EncoderMethod = "getAddrMode5FP16OpValue";
+ let DecoderMethod = "DecodeAddrMode5FP16Operand";
+ let ParserMatchClass = AddrMode5FP16AsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm); // base register + immediate offset
+}
+
+def addrmode5fp16 : AddrMode5FP16 { // only a <false> printer variant is defined for FP16
+ let PrintMethod = "printAddrMode5FP16Operand<false>";
+}
+
+// addrmode6 := reg with optional alignment
+//
+// Carried as an address register ($addr) plus an alignment immediate ($align).
+def AddrMode6AsmOperand : AsmOperandClass {
+ let Name = "AlignedMemory";
+}
+def addrmode6
+ : MemOperand,
+ ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]> {
+ let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
+ let ParserMatchClass = AddrMode6AsmOperand;
+ let PrintMethod = "printAddrMode6Operand";
+ let EncoderMethod = "getAddrMode6AddressOpValue";
+ let DecoderMethod = "DecodeAddrMode6Operand";
+}
+
+def am6offset : MemOperand, // offset operand consisting of a single GPR
+ ComplexPattern<i32, 1, "SelectAddrMode6Offset",
+ [], [SDNPWantRoot]> {
+ let PrintMethod = "printAddrMode6OffsetOperand";
+ let MIOperandInfo = (ops GPR);
+ let EncoderMethod = "getAddrMode6OffsetOpValue";
+ let DecoderMethod = "DecodeGPRRegisterClass"; // decoded with the GPR register-class decoder
+}
+
+// addrmode6oneL32 - addrmode6 variant with its own encoder for the alignment
+// field of size-32 VST1/VLD1 (single element from one lane).
+def addrmode6oneL32
+ : MemOperand,
+ ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]> {
+ let MIOperandInfo = (ops GPR:$addr, i32imm);
+ let PrintMethod = "printAddrMode6Operand";
+ let EncoderMethod = "getAddrMode6OneLane32AddressOpValue";
+}
+
+// Base class for the addrmode6align* operands, which each add a ParserMatchClass.
+class AddrMode6Align : MemOperand,
+ ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+ let PrintMethod = "printAddrMode6Operand";
+ let MIOperandInfo = (ops GPR:$addr, i32imm:$align); // address register + alignment immediate
+ let EncoderMethod = "getAddrMode6AddressOpValue";
+ let DecoderMethod = "DecodeAddrMode6Operand";
+}
+
+// Version of addrmode6 for VLD/VST instructions that allow no alignment
+// encoding; checks that no alignment is specified.
+def AddrMode6AlignNoneAsmOperand : AsmOperandClass {
+ let Name = "AlignedMemoryNone";
+ let DiagnosticType = "AlignedMemoryRequiresNone";
+}
+def addrmode6alignNone : AddrMode6Align {
+ // The alignment specifier can only be omitted.
+ let ParserMatchClass = AddrMode6AlignNoneAsmOperand;
+}
+
+// Version of addrmode6 for VLD/VST instructions with a 16-bit alignment
+// encoding; checks the alignment value.
+def AddrMode6Align16AsmOperand : AsmOperandClass {
+ let Name = "AlignedMemory16";
+ let DiagnosticType = "AlignedMemoryRequires16";
+}
+def addrmode6align16 : AddrMode6Align {
+ // The alignment specifier can only be 16 or omitted.
+ let ParserMatchClass = AddrMode6Align16AsmOperand;
+}
+
+// Version of addrmode6 for VLD/VST instructions with a 32-bit alignment
+// encoding; checks the alignment value.
+def AddrMode6Align32AsmOperand : AsmOperandClass {
+ let Name = "AlignedMemory32";
+ let DiagnosticType = "AlignedMemoryRequires32";
+}
+def addrmode6align32 : AddrMode6Align {
+ // The alignment specifier can only be 32 or omitted.
+ let ParserMatchClass = AddrMode6Align32AsmOperand;
+}
+
+// Version of addrmode6 for VLD/VST instructions with a 64-bit alignment
+// encoding; checks the alignment value.
+def AddrMode6Align64AsmOperand : AsmOperandClass {
+ let Name = "AlignedMemory64";
+ let DiagnosticType = "AlignedMemoryRequires64";
+}
+def addrmode6align64 : AddrMode6Align {
+ // The alignment specifier can only be 64 or omitted.
+ let ParserMatchClass = AddrMode6Align64AsmOperand;
+}
+
+// Version of addrmode6 for VLD/VST instructions with a 64-bit or 128-bit
+// alignment encoding; checks the alignment value.
+def AddrMode6Align64or128AsmOperand : AsmOperandClass {
+ let Name = "AlignedMemory64or128";
+ let DiagnosticType = "AlignedMemoryRequires64or128";
+}
+def addrmode6align64or128 : AddrMode6Align {
+ // The alignment specifier can only be 64, 128 or omitted.
+ let ParserMatchClass = AddrMode6Align64or128AsmOperand;
+}
+
+// Version of addrmode6 for VLD/VST instructions with a 64-bit, 128-bit or
+// 256-bit alignment encoding; checks the alignment value.
+def AddrMode6Align64or128or256AsmOperand : AsmOperandClass {
+ let Name = "AlignedMemory64or128or256";
+ let DiagnosticType = "AlignedMemoryRequires64or128or256";
+}
+def addrmode6align64or128or256 : AddrMode6Align {
+ // The alignment specifier can only be 64, 128, 256 or omitted.
+ let ParserMatchClass = AddrMode6Align64or128or256AsmOperand;
+}
+
+// addrmode6dup - addrmode6 variant with its own encoder for the alignment
+// field of VLD-dup instructions, specifically VLD4-dup.
+def addrmode6dup
+ : MemOperand,
+ ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]> {
+ let MIOperandInfo = (ops GPR:$addr, i32imm);
+ let PrintMethod = "printAddrMode6Operand";
+ let EncoderMethod = "getAddrMode6DupAddressOpValue";
+ // FIXME: This is close, but not quite right. The alignment specifier is
+ // different.
+ let ParserMatchClass = AddrMode6AsmOperand;
+}
+
+// Base class for the addrmode6dupalign* operands, which each add a ParserMatchClass.
+class AddrMode6DupAlign : MemOperand,
+ ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+ let PrintMethod = "printAddrMode6Operand";
+ let MIOperandInfo = (ops GPR:$addr, i32imm); // address register + immediate
+ let EncoderMethod = "getAddrMode6DupAddressOpValue"; // same encoder as addrmode6dup
+}
+
+// Version of addrmode6 for VLD-dup instructions that allow no alignment
+// encoding; checks that no alignment is specified.
+def AddrMode6dupAlignNoneAsmOperand : AsmOperandClass {
+ let Name = "DupAlignedMemoryNone";
+ let DiagnosticType = "DupAlignedMemoryRequiresNone";
+}
+def addrmode6dupalignNone : AddrMode6DupAlign {
+ // The alignment specifier can only be omitted.
+ let ParserMatchClass = AddrMode6dupAlignNoneAsmOperand;
+}
+
+// Version of addrmode6 for VLD-dup instructions with a 16-bit alignment
+// encoding; checks the alignment value.
+def AddrMode6dupAlign16AsmOperand : AsmOperandClass {
+ let Name = "DupAlignedMemory16";
+ let DiagnosticType = "DupAlignedMemoryRequires16";
+}
+def addrmode6dupalign16 : AddrMode6DupAlign {
+ // The alignment specifier can only be 16 or omitted.
+ let ParserMatchClass = AddrMode6dupAlign16AsmOperand;
+}
+
+// Version of addrmode6 for VLD-dup instructions with a 32-bit alignment
+// encoding; checks the alignment value.
+def AddrMode6dupAlign32AsmOperand : AsmOperandClass {
+ let Name = "DupAlignedMemory32";
+ let DiagnosticType = "DupAlignedMemoryRequires32";
+}
+def addrmode6dupalign32 : AddrMode6DupAlign {
+ // The alignment specifier can only be 32 or omitted.
+ let ParserMatchClass = AddrMode6dupAlign32AsmOperand;
+}
+
+// Version of addrmode6 for VLD-dup instructions with a 64-bit alignment
+// encoding; checks the alignment value.
+def AddrMode6dupAlign64AsmOperand : AsmOperandClass {
+ let Name = "DupAlignedMemory64";
+ let DiagnosticType = "DupAlignedMemoryRequires64";
+}
+def addrmode6dupalign64 : AddrMode6DupAlign {
+ // The alignment specifier can only be 64 or omitted.
+ let ParserMatchClass = AddrMode6dupAlign64AsmOperand;
+}
+
+// Version of addrmode6 for VLD-dup instructions with a 64-bit or 128-bit
+// alignment encoding; checks the alignment value.
+def AddrMode6dupAlign64or128AsmOperand : AsmOperandClass {
+ let Name = "DupAlignedMemory64or128";
+ let DiagnosticType = "DupAlignedMemoryRequires64or128";
+}
+def addrmode6dupalign64or128 : AddrMode6DupAlign {
+ // The alignment specifier can only be 64, 128 or omitted.
+ let ParserMatchClass = AddrMode6dupAlign64or128AsmOperand;
+}
+
+// addrmodepc := pc + reg
+//
+// Carried as a GPR plus an i32 immediate; only a printer is attached here.
+def addrmodepc
+ : MemOperand, ComplexPattern<i32, 2, "SelectAddrModePC", []> {
+ let MIOperandInfo = (ops GPR, i32imm);
+ let PrintMethod = "printAddrModePCOperand";
+}
+
+// addr_offset_none := reg
+//
+// Memory operand that is just a base register with no offset.
+def MemNoOffsetAsmOperand : AsmOperandClass {
+ let Name = "MemNoOffset";
+}
+def addr_offset_none
+ : MemOperand, ComplexPattern<i32, 1, "SelectAddrOffsetNone", []> {
+ let MIOperandInfo = (ops GPR:$base);
+ let ParserMatchClass = MemNoOffsetAsmOperand;
+ let PrintMethod = "printAddrMode7Operand";
+ let DecoderMethod = "DecodeAddrMode7Operand";
+}
+
+def nohash_imm : Operand<i32> { // i32 operand whose only customisation is its print method
+ let PrintMethod = "printNoHashImmediate";
+}
+
+// p_imm - i32 operand parsed through the CoprocNum assembler class.
+def CoprocNumAsmOperand : AsmOperandClass {
+ let ParserMethod = "parseCoprocNumOperand";
+ let Name = "CoprocNum";
+}
+def p_imm : Operand<i32> {
+ let ParserMatchClass = CoprocNumAsmOperand;
+ let PrintMethod = "printPImmediate";
+ let DecoderMethod = "DecodeCoprocessor";
+}
+
+def CoprocRegAsmOperand : AsmOperandClass {
+ let Name = "CoprocReg";
+ let ParserMethod = "parseCoprocRegOperand"; // custom parser for this operand class
+}
+def c_imm : Operand<i32> { // i32 operand parsed through the CoprocReg assembler class
+ let PrintMethod = "printCImmediate";
+ let ParserMatchClass = CoprocRegAsmOperand;
+}
+// coproc_option_imm - i32 operand parsed through the CoprocOption assembler
+// class.
+def CoprocOptionAsmOperand : AsmOperandClass {
+ let ParserMethod = "parseCoprocOptionOperand";
+ let Name = "CoprocOption";
+}
+def coproc_option_imm : Operand<i32> {
+ let ParserMatchClass = CoprocOptionAsmOperand;
+ let PrintMethod = "printCoprocOptionImm";
+}
+
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+/// AsI1_bin_irs - Defines a set of (op r, {mod_imm|r|so_reg}) patterns for a
+/// binop that produces a value: ri, rr, rsi and rsr, one per second-operand form.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AsI1_bin_irs<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, // itineraries for ri, rr, and rsi/rsr respectively
+ SDPatternOperator opnode, bit Commutable = 0> {
+ // The register-immediate version is re-materializable. This is useful
+ // in particular for taking the address of a local.
+ let isReMaterializable = 1 in {
+ def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm,
+ iii, opc, "\t$Rd, $Rn, $imm",
+ [(set GPR:$Rd, (opnode GPR:$Rn, mod_imm:$imm))]>,
+ Sched<[WriteALU, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> imm;
+ let Inst{25} = 1; // 1 only in this immediate form; the register forms below use 0
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-0} = imm;
+ }
+ }
+ def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+ iir, opc, "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+ let Inst{25} = 0;
+ let isCommutable = Commutable; // only the rr form takes the Commutable flag
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-4} = 0b00000000; // bits 11-4 are all zero in the plain register form
+ let Inst{3-0} = Rm;
+ }
+
+ def rsi : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]>,
+ Sched<[WriteALUsi, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0; // 0 here, 1 in the rsr form below
+ let Inst{3-0} = shift{3-0};
+ }
+
+ def rsr : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]>,
+ Sched<[WriteALUsr, ReadALUsr]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0; // shift{7} is not copied; this bit is fixed to 0
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1; // 1 here, 0 in the rsi form above
+ let Inst{3-0} = shift{3-0};
+ }
+}
+
+/// AsI1_rbin_irs - Same as AsI1_bin_irs except that the order of the operands
+/// in the selection pattern is reversed. The 'rr' form carries no pattern and
+/// is only defined for the disassembler; for codegen it is equivalent to the
+/// AsI1_bin_irs counterpart.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+ SDNode opnode, bit Commutable = 0> {
+ // Only the register-immediate form is re-materializable; that matters most
+ // when taking the address of a local.
+ let isReMaterializable = 1 in {
+ def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm,
+ iii, opc, "\t$Rd, $Rn, $imm",
+ [(set GPR:$Rd, (opnode mod_imm:$imm, GPR:$Rn))]>,
+ Sched<[WriteALU, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> imm;
+ let Inst{25} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-0} = imm;
+ }
+ }
+
+ // Register-register form: encoding only, no selection pattern.
+ def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+ iir, opc, "\t$Rd, $Rn, $Rm",
+ [/* pattern left blank */]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-4} = 0b00000000;
+ let Inst{3-0} = Rm;
+ }
+
+ // Register shifted by an immediate.
+ def rsi : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]>,
+ Sched<[WriteALUsi, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
+ }
+
+ // Register shifted by a register.
+ def rsr : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]>,
+ Sched<[WriteALUsr, ReadALUsr]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
+ }
+}
+
+/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default.
+/// The defs are pseudo-instructions (ARMPseudoInst) carrying a pred:$p operand.
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in { // applies to every def in the multiclass
+multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
+ InstrItinClass iis, SDNode opnode,
+ bit Commutable = 0> {
+ def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
+ 4, iii,
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm))]>,
+ Sched<[WriteALU, ReadALU]>;
+
+ def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, pred:$p),
+ 4, iir,
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
+ let isCommutable = Commutable; // only the rr form takes the Commutable flag
+ }
+ def rsi : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
+ 4, iis,
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
+ so_reg_imm:$shift))]>,
+ Sched<[WriteALUsi, ReadALU]>;
+
+ def rsr : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
+ 4, iis,
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
+ so_reg_reg:$shift))]>,
+ Sched<[WriteALUSsr, ReadALUsr]>;
+}
+}
+
+/// AsI1_rbin_s_is - Same as AsI1_bin_s_irs, except selection DAG
+/// operands are reversed and no rr form is defined (only ri, rsi and rsr).
+let hasPostISelHook = 1, Defs = [CPSR] in { // applies to every def in the multiclass
+multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir, // iir is not referenced in the body
+ InstrItinClass iis, SDNode opnode,
+ bit Commutable = 0> { // Commutable is not referenced in the body either
+ def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
+ 4, iii,
+ [(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn))]>,
+ Sched<[WriteALU, ReadALU]>;
+
+ def rsi : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
+ 4, iis,
+ [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift,
+ GPR:$Rn))]>,
+ Sched<[WriteALUsi, ReadALU]>;
+
+ def rsr : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
+ 4, iis,
+ [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift,
+ GPR:$Rn))]>,
+ Sched<[WriteALUSsr, ReadALUsr]>;
+}
+}
+
+/// AI1_cmp_irs - Defines a set of (op r, {mod_imm|r|so_reg}) cmp / test
+/// patterns. Similar to AsI1_bin_irs except the instruction does not produce
+/// an explicit result; it only implicitly sets CPSR.
+let isCompare = 1, Defs = [CPSR] in {
+multiclass AI1_cmp_irs<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+ SDPatternOperator opnode, bit Commutable = 0,
+ string rrDecoderMethod = ""> { // rrDecoderMethod is applied to the rr form only
+ def ri : AI1<opcod, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, iii,
+ opc, "\t$Rn, $imm",
+ [(opnode GPR:$Rn, mod_imm:$imm)]>,
+ Sched<[WriteCMP, ReadALU]> {
+ bits<4> Rn;
+ bits<12> imm;
+ let Inst{25} = 1; // 1 only in this immediate form; the register forms below use 0
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000; // there is no $Rd output; these bits are encoded as zero
+ let Inst{11-0} = imm;
+
+ let Unpredictable{15-12} = 0b1111;
+ }
+ def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir,
+ opc, "\t$Rn, $Rm",
+ [(opnode GPR:$Rn, GPR:$Rm)]>,
+ Sched<[WriteCMP, ReadALU, ReadALU]> {
+ bits<4> Rn;
+ bits<4> Rm;
+ let isCommutable = Commutable; // only the rr form takes the Commutable flag
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-4} = 0b00000000;
+ let Inst{3-0} = Rm;
+ let DecoderMethod = rrDecoderMethod;
+
+ let Unpredictable{15-12} = 0b1111;
+ }
+ def rsi : AI1<opcod, (outs),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis,
+ opc, "\t$Rn, $shift",
+ [(opnode GPR:$Rn, so_reg_imm:$shift)]>,
+ Sched<[WriteCMPsi, ReadALU]> {
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0; // 0 here, 1 in the rsr form below
+ let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{15-12} = 0b1111;
+ }
+ def rsr : AI1<opcod, (outs),
+ (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis, // this form uses GPRnopc for $Rn, unlike the others
+ opc, "\t$Rn, $shift",
+ [(opnode GPRnopc:$Rn, so_reg_reg:$shift)]>,
+ Sched<[WriteCMPsr, ReadALU]> {
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1; // 1 here, 0 in the rsi form above
+ let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{15-12} = 0b1111;
+ }
+
+}
+}
+
+/// AI_ext_rrot - A unary operation applied to a register rotated right by the
+/// rot_imm operand (8/16/24, or zero for the plain register form).
+/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
+class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
+ : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+ IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+ [(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+ Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
+ bits<4> Rd;
+ bits<4> Rm;
+ bits<2> rot;
+ let Inst{19-16} = 0b1111; // fixed value; the binary variant AI_exta_rrot puts Rn here
+ let Inst{15-12} = Rd;
+ let Inst{11-10} = rot; // 2-bit rotation field
+ let Inst{3-0} = Rm;
+}
+
+/// AI_ext_rrot_np - Pattern-less counterpart of AI_ext_rrot. Only the rot field
+/// and the fixed 0b1111 in Inst{19-16} are assigned here.
+class AI_ext_rrot_np<bits<8> opcod, string opc>
+ : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+ IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>,
+ Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
+ bits<2> rot;
+ let Inst{11-10} = rot;
+ let Inst{19-16} = 0b1111;
+}
+
+/// AI_exta_rrot - A binary operation whose second operand is a register
+/// rotated right by the rot_imm operand (8/16/24, or zero for no rotation).
+class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
+ : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+ IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot",
+ [(set GPRnopc:$Rd, (opnode GPR:$Rn,
+ (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+ Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
+ bits<4> Rd;
+ bits<4> Rm;
+ bits<4> Rn;
+ bits<2> rot;
+ let Inst{19-16} = Rn; // first source operand; the unary AI_ext_rrot fixes this to 0b1111
+ let Inst{15-12} = Rd;
+ let Inst{11-10} = rot; // 2-bit rotation field
+ let Inst{9-4} = 0b000111;
+ let Inst{3-0} = Rm;
+}
+
+class AI_exta_rrot_np<bits<8> opcod, string opc> // pattern-less counterpart of AI_exta_rrot
+ : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+ IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+ Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
+ bits<4> Rn;
+ bits<2> rot;
+ let Inst{19-16} = Rn; // only the Rn and rot fields are assigned in this class
+ let Inst{11-10} = rot;
+}
+
+/// AI1_adde_sube_irs - Instructions and patterns for adde and sube. Every form
+/// both reads and writes CPSR, and all of them are restricted to ARM mode.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, SDNode opnode,
+ bit Commutable = 0> {
+ let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+ // Register + modified immediate.
+ def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
+ DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm, CPSR))]>,
+ Requires<[IsARM]>,
+ Sched<[WriteALU, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> imm;
+ let Inst{25} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-0} = imm;
+ }
+ // Register + register; the only form that can be marked commutable.
+ def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>,
+ Requires<[IsARM]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+ let isCommutable = Commutable;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-4} = 0b00000000;
+ let Inst{3-0} = Rm;
+ }
+ // Register + register shifted by an immediate.
+ def rsi : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift),
+ DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>,
+ Requires<[IsARM]>,
+ Sched<[WriteALUsi, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
+ }
+ // Register + register shifted by a register; uses GPRnopc for Rd and Rn.
+ def rsr : AsI1<opcod, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, so_reg_reg:$shift),
+ DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+ [(set GPRnopc:$Rd, CPSR,
+ (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>,
+ Requires<[IsARM]>,
+ Sched<[WriteALUsr, ReadALUsr]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
+ }
+ }
+}
+
+/// AI1_rsc_irs - Instructions and patterns for rsc. The selection patterns
+/// take the flexible operand first and $Rn second; the rr form carries no
+/// pattern. Every form both reads and writes CPSR.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AI1_rsc_irs<bits<4> opcod, string opc, SDNode opnode> {
+ let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+ // Register + modified immediate.
+ def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
+ DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+ [(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn, CPSR))]>,
+ Requires<[IsARM]>,
+ Sched<[WriteALU, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> imm;
+ let Inst{25} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-0} = imm;
+ }
+ // Register + register: encoding only, no selection pattern.
+ def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+ [/* pattern left blank */]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-4} = 0b00000000;
+ let Inst{3-0} = Rm;
+ }
+ // Register + register shifted by an immediate.
+ def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift),
+ DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>,
+ Requires<[IsARM]>,
+ Sched<[WriteALUsi, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
+ }
+ // Register + register shifted by a register.
+ def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift),
+ DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>,
+ Requires<[IsARM]>,
+ Sched<[WriteALUsr, ReadALUsr]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
+ }
+ }
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in { // both flags apply to the i12 and rs defs
+multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii, // load multiclass: i12 (imm12 offset) and rs (register offset) forms
+ InstrItinClass iir, PatFrag opnode> {
+ // Note: We use the complex addrmode_imm12 rather than just an input
+ // GPR and a constrained immediate so that we can use this to match
+ // frame index references and avoid matching constant pool references.
+ def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+ AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+ [(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> {
+ bits<4> Rt;
+ bits<17> addr; // packed operand: {16-13} Rn, {12} U, {11-0} imm12
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = addr{11-0}; // imm12
+ }
+ def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift),
+ AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+ [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> {
+ bits<4> Rt;
+ bits<17> shift; // packed operand: {16-13} Rn, {12} U, {11-0} copied to Inst{11-0}
+ let shift{4} = 0; // Inst{4} = 0
+ let Inst{23} = shift{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = shift{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = shift{11-0}; // low 12 bits of the ldst_so_reg encoding
+ }
+}
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in { // both flags apply to the i12 and rs defs
+multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii, // as AI_ldr1, but $Rt is GPRnopc
+ InstrItinClass iir, PatFrag opnode> {
+ // Note: We use the complex addrmode_imm12 rather than just an input
+ // GPR and a constrained immediate so that we can use this to match
+ // frame index references and avoid matching constant pool references.
+ def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt),
+ (ins addrmode_imm12:$addr),
+ AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+ [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> {
+ bits<4> Rt;
+ bits<17> addr; // packed operand: {16-13} Rn, {12} U, {11-0} imm12
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = addr{11-0}; // imm12
+ }
+ def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt),
+ (ins ldst_so_reg:$shift),
+ AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+ [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> {
+ bits<4> Rt;
+ bits<17> shift; // packed operand: {16-13} Rn, {12} U, {11-0} copied to Inst{11-0}
+ let shift{4} = 0; // Inst{4} = 0
+ let Inst{23} = shift{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = shift{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = shift{11-0}; // low 12 bits of the ldst_so_reg encoding
+ }
+}
+}
+
+
+multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
+ InstrItinClass iir, PatFrag opnode> { // iii: itinerary of the i12 form, iir: itinerary of the rs form
+ // Note: We use the complex addrmode_imm12 rather than just an input
+ // GPR and a constrained immediate so that we can use this to match
+ // frame index references and avoid matching constant pool references.
+ def i12 : AI2ldst<0b010, 0, isByte, (outs),
+ (ins GPR:$Rt, addrmode_imm12:$addr),
+ AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+ [(opnode GPR:$Rt, addrmode_imm12:$addr)]> {
+ bits<4> Rt;
+ bits<17> addr; // packed operand: {16-13} Rn, {12} U, {11-0} imm12
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = addr{11-0}; // imm12
+ }
+ def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift),
+ AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+ [(opnode GPR:$Rt, ldst_so_reg:$shift)]> {
+ bits<4> Rt;
+ bits<17> shift; // packed operand: {16-13} Rn, {12} U, {11-0} copied as-is into Inst{11-0}
+ let shift{4} = 0; // Inst{4} = 0 (shift{4} feeds Inst{4} through the Inst{11-0} assignment below)
+ let Inst{23} = shift{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = shift{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = shift{11-0};
+ }
+} // multiclass AI_str1
+
+multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
+ InstrItinClass iir, PatFrag opnode> { // iii: itinerary of the i12 form, iir: itinerary of the rs form
+ // Note: We use the complex addrmode_imm12 rather than just an input
+ // GPR and a constrained immediate so that we can use this to match
+ // frame index references and avoid matching constant pool references.
+ def i12 : AI2ldst<0b010, 0, isByte, (outs),
+ (ins GPRnopc:$Rt, addrmode_imm12:$addr),
+ AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+ [(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> { // stored register is GPRnopc, not GPR
+ bits<4> Rt;
+ bits<17> addr; // packed operand: {16-13} Rn, {12} U, {11-0} imm12
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = addr{11-0}; // imm12
+ }
+ def rs : AI2ldst<0b011, 0, isByte, (outs),
+ (ins GPRnopc:$Rt, ldst_so_reg:$shift),
+ AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+ [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> { // stored register is GPRnopc, not GPR
+ bits<4> Rt;
+ bits<17> shift; // packed operand: {16-13} Rn, {12} U, {11-0} copied as-is into Inst{11-0}
+ let shift{4} = 0; // Inst{4} = 0 (shift{4} feeds Inst{4} through the Inst{11-0} assignment below)
+ let Inst{23} = shift{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = shift{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = shift{11-0};
+ }
+} // multiclass AI_str1nopc
+
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+/// the function. The first operand is the ID# for this instruction, the second
+/// is the index into the MachineConstantPool that this is, the third is the
+/// size in bytes of this constant pool entry.
+let hasSideEffects = 0, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+ i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of direct 32-bit addresses of the destination basic
+/// blocks (either absolute, or relative to the start of the jump-table in PIC
+/// mode). Used mostly in ARM and Thumb-1 modes.
+def JUMPTABLE_ADDRS :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+ i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of 32-bit jump instructions. Used for Thumb-2 tables
+/// that cannot be optimised to use TBB or TBH.
+def JUMPTABLE_INSTS :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+ i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of 8-bit unsigned integers representing offsets from
+/// a TBB instruction.
+def JUMPTABLE_TBB :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+ i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of 16-bit unsigned integers representing offsets from
+/// a TBH instruction.
+def JUMPTABLE_TBH :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+ i32imm:$size), NoItinerary, []>;
+
+
+// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
+// from removing one half of the matched pairs. That breaks PEI, which assumes
+// these will always be in pairs, and asserts if it finds otherwise. Better way?
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKUP :
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
+ [(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
+
+def ADJCALLSTACKDOWN :
+PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
+ [(ARMcallseq_start timm:$amt)]>;
+}
+
+def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
+ "hint", "\t$imm", [(int_arm_hint imm0_239:$imm)]>,
+ Requires<[IsARM, HasV6]> {
+ bits<8> imm; // hint number; the InstAliases that follow map nop/yield/wfe/wfi/sev/sevl/esb to fixed values
+ let Inst{27-8} = 0b00110010000011110000; // fixed opcode bits
+ let Inst{7-0} = imm;
+ let DecoderMethod = "DecodeHINTInstruction";
+}
+
+def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
+def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>;
+
+def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
+ "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+ let Inst{3-0} = Rm;
+ let Inst{15-12} = Rd;
+ let Inst{19-16} = Rn;
+ let Inst{27-20} = 0b01101000;
+ let Inst{7-4} = 0b1011;
+ let Inst{11-8} = 0b1111;
+ let Unpredictable{11-8} = 0b1111;
+}
+
+// The 16-bit operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+ "bkpt", "\t$val", []>, Requires<[IsARM]> {
+ bits<16> val; // split across the encoding: val{15-4} -> Inst{19-8}, val{3-0} -> Inst{3-0}
+ let Inst{3-0} = val{3-0};
+ let Inst{19-8} = val{15-4};
+ let Inst{27-20} = 0b00010010;
+ let Inst{31-28} = 0xe; // AL
+ let Inst{7-4} = 0b0111; // fixed bits sitting between the two halves of val
+}
+// default immediate for breakpoint mnemonic
+def : InstAlias<"bkpt", (BKPT 0), 0>, Requires<[IsARM]>;
+
+def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+ "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
+ bits<16> val; // split across the encoding: val{15-4} -> Inst{19-8}, val{3-0} -> Inst{3-0}
+ let Inst{3-0} = val{3-0};
+ let Inst{19-8} = val{15-4};
+ let Inst{27-20} = 0b00010000;
+ let Inst{31-28} = 0xe; // AL
+ let Inst{7-4} = 0b0111; // fixed bits sitting between the two halves of val
+}
+
+// Change Processor State
+// FIXME: We should use InstAlias to handle the optional operands.
+class CPS<dag iops, string asm_ops>
+ : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops),
+ []>, Requires<[IsARM]> { // assembly string is "cps" followed by asm_ops
+ bits<2> imod; // -> Inst{19-18}
+ bits<3> iflags; // -> Inst{8-6}
+ bits<5> mode; // -> Inst{4-0}
+ bit M; // 1 in the variants that take a $mode operand (CPS3p, CPS1p), 0 in CPS2p
+
+ let Inst{31-28} = 0b1111;
+ let Inst{27-20} = 0b00010000;
+ let Inst{19-18} = imod;
+ let Inst{17} = M; // Enabled if mode is set
+ let Inst{16-9} = 0b00000000;
+ let Inst{8-6} = iflags;
+ let Inst{5} = 0;
+ let Inst{4-0} = mode;
+}
+
+let DecoderMethod = "DecodeCPSInstruction" in {
+let M = 1 in // three-operand form: imod, iflags and mode all come from operands
+ def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode),
+ "$imod\t$iflags, $mode">;
+let mode = 0, M = 0 in // two-operand form: no mode operand, mode field zeroed
+ def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">;
+
+let imod = 0, iflags = 0, M = 1 in // one-operand form: only mode is given, imod/iflags zeroed
+ def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">;
+} // DecoderMethod = "DecodeCPSInstruction"
+
+// Preload signals the memory system of possible future data/instruction access.
+multiclass APreLoad<bits<1> read, bits<1> data, string opc> { // read -> Inst{22}, data -> Inst{24}; both are also passed to ARMPreload
+
+ def i12 : AXIM<(outs), (ins addrmode_imm12:$addr), AddrMode_i12, MiscFrm,
+ IIC_Preload, !strconcat(opc, "\t$addr"),
+ [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>,
+ Sched<[WritePreLd]> {
+ bits<4> Rt; // NOTE(review): not referenced by any `let` in this def (Inst{15-12} is fixed) — confirm it is needed
+ bits<17> addr; // packed operand: {16-13} Rn, {12} U, {11-0} imm12
+ let Inst{31-26} = 0b111101;
+ let Inst{25} = 0; // 0 for immediate form
+ let Inst{24} = data;
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{22} = read;
+ let Inst{21-20} = 0b01;
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{15-12} = 0b1111;
+ let Inst{11-0} = addr{11-0}; // imm12
+ }
+
+ def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload,
+ !strconcat(opc, "\t$shift"),
+ [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]>,
+ Sched<[WritePreLd]> {
+ bits<17> shift; // packed operand: {16-13} Rn, {12} U, {11-0} copied into Inst{11-0}
+ let Inst{31-26} = 0b111101;
+ let Inst{25} = 1; // 1 for register form
+ let Inst{24} = data;
+ let Inst{23} = shift{12}; // U (add = ('U' == 1))
+ let Inst{22} = read;
+ let Inst{21-20} = 0b01;
+ let Inst{19-16} = shift{16-13}; // Rn
+ let Inst{15-12} = 0b1111;
+ let Inst{11-0} = shift{11-0};
+ let Inst{4} = 0; // bit 4 is forced to 0 for the register form
+ }
+} // multiclass APreLoad
+
+defm PLD : APreLoad<1, 1, "pld">, Requires<[IsARM]>; // read = 1, data = 1
+defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>; // read = 0, data = 1
+defm PLI : APreLoad<1, 0, "pli">, Requires<[IsARM,HasV7]>; // read = 1, data = 0
+
+def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
+ "setend\t$end", []>, Requires<[IsARM]>, Deprecated<HasV8Ops> {
+ bits<1> end;
+ let Inst{31-10} = 0b1111000100000001000000;
+ let Inst{9} = end;
+ let Inst{8-0} = 0;
+}
+
+def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt",
+ [(int_arm_dbg imm0_15:$opt)]>, Requires<[IsARM, HasV7]> {
+ bits<4> opt;
+ let Inst{27-4} = 0b001100100000111100001111;
+ let Inst{3-0} = opt;
+}
+
+// A8.8.247 UDF - Undefined (Encoding A1)
+def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary,
+ "udf", "\t$imm16", [(int_arm_undefined imm0_65535:$imm16)]> {
+ bits<16> imm16;
+ let Inst{31-28} = 0b1110; // AL
+ let Inst{27-25} = 0b011;
+ let Inst{24-20} = 0b11111;
+ let Inst{19-8} = imm16{15-4};
+ let Inst{7-4} = 0b1111;
+ let Inst{3-0} = imm16{3-0};
+}
+
+/*
+ * A5.4 Permanently UNDEFINED instructions.
+ *
+ * For most targets use UDF #65006, for which the OS will generate SIGTRAP.
+ * Other UDF encodings generate SIGILL.
+ *
+ * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb.
+ * Encoding A1:
+ * 1110 0111 1111 iiii iiii iiii 1111 iiii
+ * Encoding T1:
+ * 1101 1110 iiii iiii
+ * It uses the following encoding:
+ * 1110 0111 1111 1110 1101 1110 1111 0000
+ * - In ARM: UDF #60896;
+ * - In Thumb: UDF #254 followed by a branch-to-self.
+ */
+let isBarrier = 1, isTerminator = 1 in
+def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary,
+ "trap", [(trap)]>,
+ Requires<[IsARM,UseNaClTrap]> {
+ let Inst = 0xe7fedef0;
+}
+let isBarrier = 1, isTerminator = 1 in
+def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
+ "trap", [(trap)]>,
+ Requires<[IsARM,DontUseNaClTrap]> {
+ let Inst = 0xe7ffdefe;
+}
+
+// Address computation and loads and stores in PIC mode.
+let isNotDuplicable = 1 in {
+def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
+ 4, IIC_iALUr,
+ [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>,
+ Sched<[WriteALU, ReadALU]>;
+
+let AddedComplexity = 10 in {
+def PICLDR : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+ 4, IIC_iLoad_r,
+ [(set GPR:$dst, (load addrmodepc:$addr))]>;
+
+def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+ 4, IIC_iLoad_bh_r,
+ [(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+ 4, IIC_iLoad_bh_r,
+ [(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>;
+
+def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+ 4, IIC_iLoad_bh_r,
+ [(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+ 4, IIC_iLoad_bh_r,
+ [(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>;
+}
+let AddedComplexity = 10 in {
+def PICSTR : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+ 4, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+ 4, IIC_iStore_bh_r, [(truncstorei16 GPR:$src,
+ addrmodepc:$addr)]>;
+
+def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+ 4, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
+}
+} // isNotDuplicable = 1
+
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+let hasSideEffects = 0, isReMaterializable = 1 in
+// The 'adr' mnemonic encodes differently if the label is before or after
+// the instruction. The {24-21} opcode bits are set by the fixup, as we don't
+// know until then which form of the instruction will be used.
+def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
+ MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []>,
+ Sched<[WriteALU, ReadALU]> {
+ bits<4> Rd;
+ bits<14> label; // {13-12} -> Inst{23-22}, {11-0} -> Inst{11-0}
+ let Inst{27-25} = 0b001;
+ let Inst{24} = 0;
+ let Inst{23-22} = label{13-12}; // the two middle bits of the {24-21} field discussed above
+ let Inst{21} = 0;
+ let Inst{20} = 0;
+ let Inst{19-16} = 0b1111; // fixed; presumably Rn = PC for a pc-relative address — confirm
+ let Inst{15-12} = Rd;
+ let Inst{11-0} = label{11-0};
+}
+
+let hasSideEffects = 1 in {
+def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
+ 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
+
+def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
+ (ins i32imm:$label, pred:$p),
+ 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+ // ARMV4T and above
+ def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+ "bx", "\tlr", [(ARMretflag)]>,
+ Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
+ let Inst{27-0} = 0b0001001011111111111100011110;
+ }
+
+ // ARMV4 only
+ def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+ "mov", "\tpc, lr", [(ARMretflag)]>,
+ Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> {
+ let Inst{27-0} = 0b0001101000001111000000001110;
+ }
+
+ // Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets
+ // the user-space one).
+ def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p),
+ 4, IIC_Br,
+ [(ARMintretflag imm:$offset)]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ // ARMV4T and above
+ def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
+ [(brind GPR:$dst)]>,
+ Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
+ bits<4> dst;
+ let Inst{31-4} = 0b1110000100101111111111110001;
+ let Inst{3-0} = dst;
+ }
+
+ def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br,
+ "bx", "\t$dst", [/* pattern left blank */]>,
+ Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
+ bits<4> dst;
+ let Inst{27-4} = 0b000100101111111111110001;
+ let Inst{3-0} = dst;
+ }
+}
+
+// SP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead.
+let isCall = 1,
+ // FIXME: Do we really need a non-predicated version? If so, it should
+ // at least be a pseudo instruction expanding to the predicated version
+ // at MC lowering time.
+ Defs = [LR], Uses = [SP] in {
+ def BL : ABXI<0b1011, (outs), (ins arm_bl_target:$func),
+ IIC_Br, "bl\t$func",
+ [(ARMcall tglobaladdr:$func)]>,
+ Requires<[IsARM]>, Sched<[WriteBrL]> {
+ let Inst{31-28} = 0b1110;
+ bits<24> func;
+ let Inst{23-0} = func;
+ let DecoderMethod = "DecodeBranchImmInstruction";
+ }
+
+ def BL_pred : ABI<0b1011, (outs), (ins arm_bl_target:$func),
+ IIC_Br, "bl", "\t$func",
+ [(ARMcall_pred tglobaladdr:$func)]>,
+ Requires<[IsARM]>, Sched<[WriteBrL]> {
+ bits<24> func;
+ let Inst{23-0} = func;
+ let DecoderMethod = "DecodeBranchImmInstruction";
+ }
+
+ // ARMv5T and above
+ def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm,
+ IIC_Br, "blx\t$func",
+ [(ARMcall GPR:$func)]>,
+ Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
+ bits<4> func;
+ let Inst{31-4} = 0b1110000100101111111111110011;
+ let Inst{3-0} = func;
+ }
+
+ def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm,
+ IIC_Br, "blx", "\t$func",
+ [(ARMcall_pred GPR:$func)]>,
+ Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
+ bits<4> func;
+ let Inst{27-4} = 0b000100101111111111110011;
+ let Inst{3-0} = func;
+ }
+
+ // ARMv4T
+ // Note: Restrict $func to the tGPR regclass to prevent it being in LR.
+ def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
+ 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+ Requires<[IsARM, HasV4T]>, Sched<[WriteBr]>;
+
+ // ARMv4
+ def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
+ 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+ Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
+
+ // mov lr, pc; b if callee is marked noreturn to avoid confusing the
+ // return stack predictor.
+ def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func),
+ 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
+ Requires<[IsARM]>, Sched<[WriteBr]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+ // FIXME: should be able to write a pattern for ARMBrcond, but can't use
+ // a two-value operand where a dag node expects two operands. :(
+ def Bcc : ABI<0b1010, (outs), (ins arm_br_target:$target),
+ IIC_Br, "b", "\t$target",
+ [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>,
+ Sched<[WriteBr]> {
+ bits<24> target;
+ let Inst{23-0} = target;
+ let DecoderMethod = "DecodeBranchImmInstruction";
+ }
+
+ let isBarrier = 1 in {
+ // B is "predicable" since it's just a Bcc with an 'always' condition.
+ let isPredicable = 1 in
+ // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly
+ // should be sufficient.
+ // FIXME: Is B really a Barrier? That doesn't seem right.
+ def B : ARMPseudoExpand<(outs), (ins arm_br_target:$target), 4, IIC_Br,
+ [(br bb:$target)], (Bcc arm_br_target:$target,
+ (ops 14, zero_reg))>, // expands to Bcc with the 'always' condition noted above
+ Sched<[WriteBr]>;
+
+ let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
+ def BR_JTr : ARMPseudoInst<(outs),
+ (ins GPR:$target, i32imm:$jt),
+ 0, IIC_Br, // NOTE(review): size argument is 0 here; the enclosing `let Size = 4` presumably overrides it — confirm
+ [(ARMbrjt GPR:$target, tjumptable:$jt)]>,
+ Sched<[WriteBr]>;
+ // FIXME: This shouldn't use the generic "addrmode2," but rather be split
+ // into i12 and rs suffixed versions.
+ def BR_JTm : ARMPseudoInst<(outs),
+ (ins addrmode2:$target, i32imm:$jt),
+ 0, IIC_Br,
+ [(ARMbrjt (i32 (load addrmode2:$target)),
+ tjumptable:$jt)]>, Sched<[WriteBrTbl]>;
+ def BR_JTadd : ARMPseudoInst<(outs),
+ (ins GPR:$target, GPR:$idx, i32imm:$jt),
+ 0, IIC_Br,
+ [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt)]>,
+ Sched<[WriteBrTbl]>;
+ } // Size = 4, isNotDuplicable = 1, isIndirectBranch = 1
+ } // isBarrier = 1
+
+} // isBranch = 1, isTerminator = 1
+
+// BLX (immediate)
+def BLXi : AXI<(outs), (ins arm_blx_target:$target), BrMiscFrm, NoItinerary,
+ "blx\t$target", []>,
+ Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
+ let Inst{31-25} = 0b1111101;
+ bits<25> target; // 25-bit operand: bit 0 -> Inst{24}, bits 24-1 -> Inst{23-0}
+ let Inst{23-0} = target{24-1};
+ let Inst{24} = target{0};
+ let isCall = 1;
+}
+
+// Branch and Exchange Jazelle
+def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
+ [/* pattern left blank */]>, Sched<[WriteBr]> {
+ bits<4> func;
+ let Inst{23-20} = 0b0010;
+ let Inst{19-8} = 0xfff;
+ let Inst{7-4} = 0b0010;
+ let Inst{3-0} = func;
+ let isBranch = 1;
+}
+
+// Tail calls.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+ def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>,
+ Sched<[WriteBr]>;
+
+ def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
+ Sched<[WriteBr]>;
+
+ def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst),
+ 4, IIC_Br, [],
+ (Bcc arm_br_target:$dst, (ops 14, zero_reg))>,
+ Requires<[IsARM]>, Sched<[WriteBr]>;
+
+ def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
+ 4, IIC_Br, [],
+ (BX GPR:$dst)>, Sched<[WriteBr]>,
+ Requires<[IsARM]>;
+}
+
+// Secure Monitor Call is a system instruction.
+def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+ []>, Requires<[IsARM, HasTrustZone]> {
+ bits<4> opt;
+ let Inst{23-4} = 0b01100000000000000111;
+ let Inst{3-0} = opt;
+}
+def : MnemonicAlias<"smi", "smc">;
+
+// Supervisor Call (Software Interrupt)
+let isCall = 1, Uses = [SP] in {
+def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []>,
+ Sched<[WriteBr]> {
+ bits<24> svc;
+ let Inst{23-0} = svc;
+}
+}
+
+// Store Return State
+class SRSI<bit wb, string asm>
+ : XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm,
+ NoItinerary, asm, "", []> {
+ bits<5> mode;
+ let Inst{31-28} = 0b1111;
+ let Inst{27-25} = 0b100;
+ let Inst{22} = 1;
+ let Inst{21} = wb; // 1 for the *_UPD defs, whose asm strings use "sp!"
+ let Inst{20} = 0;
+ let Inst{19-16} = 0b1101; // SP
+ let Inst{15-5} = 0b00000101000;
+ let Inst{4-0} = mode;
+} // Inst{24-23} is not set here; each SRS* def assigns it
+
+def SRSDA : SRSI<0, "srsda\tsp, $mode"> {
+ let Inst{24-23} = 0;
+}
+def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> {
+ let Inst{24-23} = 0;
+}
+def SRSDB : SRSI<0, "srsdb\tsp, $mode"> {
+ let Inst{24-23} = 0b10;
+}
+def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> {
+ let Inst{24-23} = 0b10;
+}
+def SRSIA : SRSI<0, "srsia\tsp, $mode"> {
+ let Inst{24-23} = 0b01;
+}
+def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> {
+ let Inst{24-23} = 0b01;
+}
+def SRSIB : SRSI<0, "srsib\tsp, $mode"> {
+ let Inst{24-23} = 0b11;
+}
+def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> {
+ let Inst{24-23} = 0b11;
+}
+
+def : ARMInstAlias<"srsda $mode", (SRSDA imm0_31:$mode)>;
+def : ARMInstAlias<"srsda $mode!", (SRSDA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsdb $mode", (SRSDB imm0_31:$mode)>;
+def : ARMInstAlias<"srsdb $mode!", (SRSDB_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsia $mode", (SRSIA imm0_31:$mode)>;
+def : ARMInstAlias<"srsia $mode!", (SRSIA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsib $mode", (SRSIB imm0_31:$mode)>;
+def : ARMInstAlias<"srsib $mode!", (SRSIB_UPD imm0_31:$mode)>;
+
+// Return From Exception
+class RFEI<bit wb, string asm>
+ : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm,
+ NoItinerary, asm, "", []> {
+ bits<4> Rn;
+ let Inst{31-28} = 0b1111;
+ let Inst{27-25} = 0b100;
+ let Inst{22} = 0;
+ let Inst{21} = wb; // 1 for the *_UPD defs, whose asm strings use "$Rn!"
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-0} = 0xa00;
+} // Inst{24-23} is not set here; each RFE* def assigns it
+
+def RFEDA : RFEI<0, "rfeda\t$Rn"> {
+ let Inst{24-23} = 0;
+}
+def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> {
+ let Inst{24-23} = 0;
+}
+def RFEDB : RFEI<0, "rfedb\t$Rn"> {
+ let Inst{24-23} = 0b10;
+}
+def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> {
+ let Inst{24-23} = 0b10;
+}
+def RFEIA : RFEI<0, "rfeia\t$Rn"> {
+ let Inst{24-23} = 0b01;
+}
+def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> {
+ let Inst{24-23} = 0b01;
+}
+def RFEIB : RFEI<0, "rfeib\t$Rn"> {
+ let Inst{24-23} = 0b11;
+}
+def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> {
+ let Inst{24-23} = 0b11;
+}
+
+// Hypervisor Call is a system instruction
+let isCall = 1 in {
+def HVC : AInoP< (outs), (ins imm0_65535:$imm), BrFrm, NoItinerary,
+ "hvc", "\t$imm", []>,
+ Requires<[IsARM, HasVirtualization]> {
+ bits<16> imm; // split across the encoding: imm{15-4} -> Inst{19-8}, imm{3-0} -> Inst{3-0}
+
+ // Even though HVC isn't predicable, its encoding includes a condition field.
+ // The instruction is undefined if the condition field is 0xf; otherwise it is
+ // unpredictable if it isn't condition AL (0xe).
+ let Inst{31-28} = 0b1110;
+ let Unpredictable{31-28} = 0b1111;
+ let Inst{27-24} = 0b0001;
+ let Inst{23-20} = 0b0100;
+ let Inst{19-8} = imm{15-4};
+ let Inst{7-4} = 0b0111;
+ let Inst{3-0} = imm{3-0};
+}
+} // isCall = 1
+
+// Return from exception in Hypervisor mode.
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
+def ERET : ABI<0b0001, (outs), (ins), NoItinerary, "eret", "", []>,
+ Requires<[IsARM, HasVirtualization]> {
+ let Inst{23-0} = 0b011000000000000001101110;
+}
+
+//===----------------------------------------------------------------------===//
+// Load / Store Instructions.
+//
+
+// Load
+
+
+defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, load>;
+defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
+ zextloadi8>;
+defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, store>;
+defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
+ truncstorei8>;
+
+// Special LDR for loads from non-pc-relative constpools.
+let canFoldAsLoad = 1, mayLoad = 1, hasSideEffects = 0,
+ isReMaterializable = 1, isCodeGenOnly = 1 in
+def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+ AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
+ []> { // no selection pattern
+ bits<4> Rt;
+ bits<17> addr; // only {12} (U) and {11-0} (imm12) are used below
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = 0b1111; // hard-wired instead of being taken from addr{16-13}
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = addr{11-0}; // imm12
+}
+
+// Loads with zero extension
+def LDRH : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+ IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr",
+ [(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>;
+
+// Loads with sign extension
+def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+ IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr",
+ [(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>;
+
+def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+ IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr",
+ [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+ // Load doubleword
+ def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr),
+ LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>,
+ Requires<[IsARM, HasV5TE]>;
+}
+
+def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "lda", "\t$Rt, $addr", []>;
+def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldab", "\t$Rt, $addr", []>;
+def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldah", "\t$Rt, $addr", []>;
+
+// Indexed loads
+multiclass AI2_ldridx<bit isByte, string opc,
+ InstrItinClass iii, InstrItinClass iir> {
+ def _PRE_IMM : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addrmode_imm12_pre:$addr), IndexModePre, LdFrm, iii,
+ opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+ bits<17> addr;
+ let Inst{25} = 0;
+ let Inst{23} = addr{12};
+ let Inst{19-16} = addr{16-13};
+ let Inst{11-0} = addr{11-0};
+ let DecoderMethod = "DecodeLDRPreImm";
+ }
+
+ def _PRE_REG : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins ldst_so_reg:$addr), IndexModePre, LdFrm, iir,
+ opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+ bits<17> addr;
+ let Inst{25} = 1;
+ let Inst{23} = addr{12};
+ let Inst{19-16} = addr{16-13};
+ let Inst{11-0} = addr{11-0};
+ let Inst{4} = 0;
+ let DecoderMethod = "DecodeLDRPreReg";
+ }
+
+ def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_reg:$offset),
+ IndexModePost, LdFrm, iir,
+ opc, "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 1;
+ let Inst{23} = offset{12};
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+ let Inst{4} = 0;
+
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+ }
+
+ def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, LdFrm, iii,
+ opc, "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 0;
+ let Inst{23} = offset{12};
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+ }
+
+}
+
+let mayLoad = 1, hasSideEffects = 0 in {
+// FIXME: for LDR_PRE_REG etc. the itinerary should be either IIC_iLoad_ru or
+// IIC_iLoad_siu depending on whether the offset register is shifted.
+defm LDR : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
+defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>;
+} // mayLoad = 1, hasSideEffects = 0
+
+multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> { // defines a _PRE and a _POST indexed form
+ def _PRE : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addrmode3_pre:$addr), IndexModePre,
+ LdMiscFrm, itin,
+ opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+ bits<14> addr; // packed operand: {13} imm8/Rm select, {12-9} Rn, {8} U, {7-0} imm8 (or Rm in {3-0})
+ let Inst{23} = addr{8}; // U bit
+ let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{11-8} = addr{7-4}; // imm7_4/zero
+ let Inst{3-0} = addr{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
+ }
+ def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am3offset:$offset),
+ IndexModePost, LdMiscFrm, itin,
+ opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+ []> {
+ bits<10> offset; // packed operand: {9} imm8/Rm select, {8} U, {7-0} imm8 (or Rm in {3-0})
+ bits<4> addr; // base register, written to Inst{19-16}
+ let Inst{23} = offset{8}; // U bit
+ let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
+ let Inst{19-16} = addr;
+ let Inst{11-8} = offset{7-4}; // imm7_4/zero
+ let Inst{3-0} = offset{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
+ }
+} // multiclass AI3_ldridx
+
+let mayLoad = 1, hasSideEffects = 0 in {
+defm LDRH : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
+defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
+defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;
+let hasExtraDefRegAllocReq = 1 in {
+def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+ (ins addrmode3_pre:$addr), IndexModePre,
+ LdMiscFrm, IIC_iLoad_d_ru,
+ "ldrd", "\t$Rt, $Rt2, $addr!",
+ "$addr.base = $Rn_wb", []> {
+ bits<14> addr;
+ let Inst{23} = addr{8}; // U bit
+ let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{11-8} = addr{7-4}; // imm7_4/zero
+ let Inst{3-0} = addr{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am3offset:$offset),
+ IndexModePost, LdMiscFrm, IIC_iLoad_d_ru,
+ "ldrd", "\t$Rt, $Rt2, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ bits<10> offset;
+ bits<4> addr;
+ let Inst{23} = offset{8}; // U bit
+ let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
+ let Inst{19-16} = addr;
+ let Inst{11-8} = offset{7-4}; // imm7_4/zero
+ let Inst{3-0} = offset{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+} // hasExtraDefRegAllocReq = 1
+} // mayLoad = 1, hasSideEffects = 0
+
+// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT.
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_reg:$offset),
+ IndexModePost, LdFrm, IIC_iLoad_ru,
+ "ldrt", "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 1;
+ let Inst{23} = offset{12};
+ let Inst{21} = 1; // overwrite
+ let Inst{19-16} = addr;
+ let Inst{11-5} = offset{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = offset{3-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRT_POST_IMM
+ : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, LdFrm, IIC_iLoad_ru,
+ "ldrt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 0;
+ let Inst{23} = offset{12};
+ let Inst{21} = 1; // overwrite
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+// LDRBT, post-indexed, offset taken from an am2offset_reg operand.
+def LDRBT_POST_REG
+    : AI2ldstidx<1, 1, 0,
+                 (outs GPR:$Rt, GPR:$Rn_wb),
+                 (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                 IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+                 "ldrbt", "\t$Rt, $addr, $offset",
+                 "$addr.base = $Rn_wb", []> {
+  // $offset layout:
+  //   {12}   isAdd
+  //   {11-0} imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  let Inst{3-0} = offset{3-0};
+  let Inst{4} = 0;
+  let Inst{11-5} = offset{11-5};
+  let Inst{19-16} = addr;
+  let Inst{21} = 1; // overwrite
+  let Inst{23} = offset{12};
+  let Inst{25} = 1;
+}
+
+// LDRBT, post-indexed, offset taken from an am2offset_imm operand.
+def LDRBT_POST_IMM
+    : AI2ldstidx<1, 1, 0,
+                 (outs GPR:$Rt, GPR:$Rn_wb),
+                 (ins addr_offset_none:$addr, am2offset_imm:$offset),
+                 IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+                 "ldrbt", "\t$Rt, $addr, $offset",
+                 "$addr.base = $Rn_wb", []> {
+  // $offset layout:
+  //   {12}   isAdd
+  //   {11-0} imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  let Inst{11-0} = offset{11-0};
+  let Inst{19-16} = addr;
+  let Inst{21} = 1; // overwrite
+  let Inst{23} = offset{12};
+  let Inst{25} = 0;
+}
+
+// Post-indexed load multiclass over AI3ldstidxT. Produces two defs:
+//   i - offset is a postidx_imm8 operand (Inst{22} = 1)
+//   r - offset is a postidx_reg operand  (Inst{22} = 0)
+multiclass AI3ldrT<bits<4> op, string opc> {
+  def i
+      : AI3ldstidxT<op, 1,
+                    (outs GPR:$Rt, GPR:$base_wb),
+                    (ins addr_offset_none:$addr, postidx_imm8:$offset),
+                    IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                    "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{3-0} = offset{3-0};
+    let Inst{11-8} = offset{7-4};
+    let Inst{22} = 1;
+    let Inst{23} = offset{8};
+  }
+  def r
+      : AI3ldstidxT<op, 1,
+                    (outs GPRnopc:$Rt, GPRnopc:$base_wb),
+                    (ins addr_offset_none:$addr, postidx_reg:$Rm),
+                    IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                    "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let DecoderMethod = "DecodeLDR";
+    let Inst{3-0} = Rm{3-0};
+    // Bits 11-8 are encoded as zero and flagged in the Unpredictable mask.
+    let Inst{11-8} = 0;
+    let Unpredictable{11-8} = 0b1111;
+    let Inst{22} = 0;
+    let Inst{23} = Rm{4};
+  }
+}
+
+// Instantiations of AI3ldrT; each one yields the "i" and "r" defs.
+defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
+defm LDRHT  : AI3ldrT<0b1011, "ldrht">;
+defm LDRSHT : AI3ldrT<0b1111, "ldrsht">;
+}
+
+// Assembler pseudo for "ldrt" written with only a base address operand.
+def LDRT_POST : ARMAsmPseudo<"ldrt${q} $Rt, $addr",
+                             (ins addr_offset_none:$addr, pred:$q),
+                             (outs GPR:$Rt)>;
+
+// Assembler pseudo for "ldrbt" written with only a base address operand.
+def LDRBT_POST : ARMAsmPseudo<"ldrbt${q} $Rt, $addr",
+                              (ins addr_offset_none:$addr, pred:$q),
+                              (outs GPR:$Rt)>;
+
+// Assembler pseudo for the "ldr Rt, =immediate" spelling; the immediate is a
+// const_pool_asm_imm operand.
+def LDRConstPool : ARMAsmPseudo<"ldr${q} $Rt, $immediate",
+                                (ins const_pool_asm_imm:$immediate, pred:$q),
+                                (outs GPR:$Rt)>;
+
+// Store
+
+// Stores with truncate
+// STRH is selected for a truncating i16 store of $Rt to an addrmode3 address.
+def STRH
+    : AI3str<0b1011, (outs),
+             (ins GPR:$Rt, addrmode3:$addr),
+             StMiscFrm, IIC_iStore_bh_r,
+             "strh", "\t$Rt, $addr",
+             [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>;
+
+// Store doubleword: no selection pattern, so the memory/side-effect flags are
+// given explicitly. Requires ARM mode with V5TE.
+let hasExtraSrcRegAllocReq = 1, mayStore = 1, hasSideEffects = 0 in {
+  def STRD
+      : AI3str<0b1111, (outs),
+               (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
+               StMiscFrm, IIC_iStore_d_r,
+               "strd", "\t$Rt, $Rt2, $addr", []>,
+        Requires<[IsARM, HasV5TE]> {
+    let Inst{21} = 0;
+  }
+}
+
+// Indexed stores
+//
+// Expands to four writeback store defs over AI2ldstidx:
+//   _PRE_IMM / _PRE_REG   - pre-indexed, address in one complex operand
+//   _POST_REG / _POST_IMM - post-indexed, base and offset as separate operands
+// `iii` is the itinerary used by the immediate forms, `iir` by the register
+// forms.
+multiclass AI2_stridx<bit isByte, string opc,
+                      InstrItinClass iii, InstrItinClass iir> {
+  def _PRE_IMM
+      : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addrmode_imm12_pre:$addr),
+                   IndexModePre, StFrm, iii,
+                   opc, "\t$Rt, $addr!",
+                   "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    bits<17> addr;
+    let DecoderMethod = "DecodeSTRPreImm";
+    let Inst{11-0} = addr{11-0};     // imm12
+    let Inst{19-16} = addr{16-13};   // Rn
+    let Inst{23} = addr{12};         // U (add = ('U' == 1))
+    let Inst{25} = 0;
+  }
+
+  def _PRE_REG
+      : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, ldst_so_reg:$addr),
+                   IndexModePre, StFrm, iir,
+                   opc, "\t$Rt, $addr!",
+                   "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    bits<17> addr;
+    let DecoderMethod = "DecodeSTRPreReg";
+    let Inst{25} = 1;
+    let Inst{23} = addr{12};         // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};   // Rn
+    // Order matters for the next two: bit 4 is forced to zero after the
+    // 12-bit field has been copied in.
+    let Inst{11-0} = addr{11-0};
+    let Inst{4} = 0;
+  }
+
+  def _POST_REG
+      : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, iir,
+                   opc, "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    // $offset layout:
+    //   {12}   isAdd
+    //   {11-0} imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+    let Inst{25} = 1;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    // Order matters: bit 4 is cleared after the 12-bit field is copied in.
+    let Inst{11-0} = offset{11-0};
+    let Inst{4} = 0;
+  }
+
+  def _POST_IMM
+      : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                   IndexModePost, StFrm, iii,
+                   opc, "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    // $offset layout:
+    //   {12}   isAdd
+    //   {11-0} imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+    let Inst{11-0} = offset{11-0};
+    let Inst{19-16} = addr;
+    let Inst{23} = offset{12};
+    let Inst{25} = 0;
+  }
+}
+
+let mayStore = 1, hasSideEffects = 0 in {
+// FIXME: for STR_PRE_REG etc. the itinerary should be either IIC_iStore_ru or
+// IIC_iStore_siu depending on whether the offset register is shifted.
+// Word and byte instantiations of AI2_stridx (immediate itinerary first,
+// register itinerary second).
+defm STR  : AI2_stridx<0, "str",  IIC_iStore_iu,    IIC_iStore_ru>;
+defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>;
+}
+
+// Selection patterns mapping post-indexed word and truncating-byte stores onto
+// the STR/STRB _POST_REG and _POST_IMM defs.
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+             (STR_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_reg:$offset)>;
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+             (STR_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_imm:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_reg:$offset),
+             (STRB_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_reg:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_imm:$offset),
+             (STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_imm:$offset)>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+// All five pseudos share the custom-inserter flag and the $Rn/$Rn_wb tie with
+// an early-clobber on the written-back base.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+  // Word store, immediate offset.
+  def STRi_preidx
+      : ARMPseudoInst<(outs GPR:$Rn_wb),
+                      (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+                      4, IIC_iStore_ru,
+                      [(set GPR:$Rn_wb,
+                            (pre_store GPR:$Rt, GPR:$Rn,
+                                       am2offset_imm:$offset))]>;
+  // Word store, register offset.
+  def STRr_preidx
+      : ARMPseudoInst<(outs GPR:$Rn_wb),
+                      (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+                      4, IIC_iStore_ru,
+                      [(set GPR:$Rn_wb,
+                            (pre_store GPR:$Rt, GPR:$Rn,
+                                       am2offset_reg:$offset))]>;
+  // Truncating byte store, immediate offset.
+  def STRBi_preidx
+      : ARMPseudoInst<(outs GPR:$Rn_wb),
+                      (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+                      4, IIC_iStore_ru,
+                      [(set GPR:$Rn_wb,
+                            (pre_truncsti8 GPR:$Rt, GPR:$Rn,
+                                           am2offset_imm:$offset))]>;
+  // Truncating byte store, register offset.
+  def STRBr_preidx
+      : ARMPseudoInst<(outs GPR:$Rn_wb),
+                      (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+                      4, IIC_iStore_ru,
+                      [(set GPR:$Rn_wb,
+                            (pre_truncsti8 GPR:$Rt, GPR:$Rn,
+                                           am2offset_reg:$offset))]>;
+  // Truncating halfword store, am3offset.
+  def STRH_preidx
+      : ARMPseudoInst<(outs GPR:$Rn_wb),
+                      (ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p),
+                      4, IIC_iStore_ru,
+                      [(set GPR:$Rn_wb,
+                            (pre_truncsti16 GPR:$Rt, GPR:$Rn,
+                                            am3offset:$offset))]>;
+}
+
+
+
+// Pre-indexed STRH with writeback; the whole address is one addrmode3_pre
+// operand.
+def STRH_PRE
+    : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
+                 (ins GPR:$Rt, addrmode3_pre:$addr),
+                 IndexModePre, StMiscFrm, IIC_iStore_bh_ru,
+                 "strh", "\t$Rt, $addr!",
+                 "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+  bits<14> addr;
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+  let Inst{3-0} = addr{3-0};      // imm3_0/Rm
+  let Inst{11-8} = addr{7-4};     // imm7_4/zero
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{22} = addr{13};        // 1 == imm8, 0 == Rm
+  let Inst{23} = addr{8};         // U bit
+}
+
+// Post-indexed STRH with writeback; selected directly from post_truncsti16.
+def STRH_POST
+    : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
+                 (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset),
+                 IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
+                 "strh", "\t$Rt, $addr, $offset",
+                 "$addr.base = $Rn_wb,@earlyclobber $Rn_wb",
+                 [(set GPR:$Rn_wb,
+                       (post_truncsti16 GPR:$Rt, addr_offset_none:$addr,
+                                        am3offset:$offset))]> {
+  bits<10> offset;
+  bits<4> addr;
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+  let Inst{3-0} = offset{3-0};    // imm3_0/Rm
+  let Inst{11-8} = offset{7-4};   // imm7_4/zero
+  let Inst{19-16} = addr;
+  let Inst{22} = offset{9};       // 1 == imm8, 0 == Rm
+  let Inst{23} = offset{8};       // U bit
+}
+
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
+// Pre-indexed STRD with writeback; the whole address is one addrmode3_pre
+// operand.
+def STRD_PRE
+    : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
+                 (ins GPR:$Rt, GPR:$Rt2, addrmode3_pre:$addr),
+                 IndexModePre, StMiscFrm, IIC_iStore_d_ru,
+                 "strd", "\t$Rt, $Rt2, $addr!",
+                 "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+  let Inst{3-0} = addr{3-0};      // imm3_0/Rm
+  let Inst{11-8} = addr{7-4};     // imm7_4/zero
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{22} = addr{13};        // 1 == imm8, 0 == Rm
+  let Inst{23} = addr{8};         // U bit
+}
+
+// Post-indexed STRD with writeback; base and am3offset are separate operands.
+def STRD_POST
+    : AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
+                 (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr,
+                      am3offset:$offset),
+                 IndexModePost, StMiscFrm, IIC_iStore_d_ru,
+                 "strd", "\t$Rt, $Rt2, $addr, $offset",
+                 "$addr.base = $Rn_wb", []> {
+  bits<10> offset;
+  bits<4> addr;
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+  let Inst{3-0} = offset{3-0};    // imm3_0/Rm
+  let Inst{11-8} = offset{7-4};   // imm7_4/zero
+  let Inst{19-16} = addr;
+  let Inst{22} = offset{9};       // 1 == imm8, 0 == Rm
+  let Inst{23} = offset{8};       // U bit
+}
+} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
+
+// STRT, STRBT, and STRHT
+
+// STRBT, post-indexed, offset taken from an am2offset_reg operand.
+// The def carries no selection pattern, so its memory flags cannot be
+// inferred; mark it as a store with no other side effects, the same way the
+// STRT_POST_REG / STRT_POST_IMM defs are marked.
+let mayStore = 1, hasSideEffects = 0 in
+def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, IIC_iStore_bh_ru,
+                   "strbt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12} isAdd
+  // {11-0} imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+// STRBT, post-indexed, offset taken from an am2offset_imm operand.
+// The def carries no selection pattern, so its memory flags cannot be
+// inferred; mark it as a store with no other side effects, the same way the
+// STRT_POST_REG / STRT_POST_IMM defs are marked.
+let mayStore = 1, hasSideEffects = 0 in
+def STRBT_POST_IMM
+    : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+                 (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                 IndexModePost, StFrm, IIC_iStore_bh_ru,
+                 "strbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+  // {12} isAdd
+  // {11-0} imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+// Assembler pseudo for "strbt" written with only a base address operand.
+def STRBT_POST : ARMAsmPseudo<"strbt${q} $Rt, $addr",
+                              (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
+
+let mayStore = 1, hasSideEffects = 0 in {
+// STRT, post-indexed, offset taken from an am2offset_reg operand.
+def STRT_POST_REG
+    : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+                 (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                 IndexModePost, StFrm, IIC_iStore_ru,
+                 "strt", "\t$Rt, $addr, $offset",
+                 "$addr.base = $Rn_wb", []> {
+  // $offset layout:
+  //   {12}   isAdd
+  //   {11-0} imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  let Inst{3-0} = offset{3-0};
+  let Inst{4} = 0;
+  let Inst{11-5} = offset{11-5};
+  let Inst{19-16} = addr;
+  let Inst{21} = 1; // overwrite
+  let Inst{23} = offset{12};
+  let Inst{25} = 1;
+}
+
+// STRT, post-indexed, offset taken from an am2offset_imm operand.
+def STRT_POST_IMM
+    : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+                 (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                 IndexModePost, StFrm, IIC_iStore_ru,
+                 "strt", "\t$Rt, $addr, $offset",
+                 "$addr.base = $Rn_wb", []> {
+  // $offset layout:
+  //   {12}   isAdd
+  //   {11-0} imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  let Inst{11-0} = offset{11-0};
+  let Inst{19-16} = addr;
+  let Inst{21} = 1; // overwrite
+  let Inst{23} = offset{12};
+  let Inst{25} = 0;
+}
+}
+
+// Assembler pseudo for "strt" written with only a base address operand.
+def STRT_POST : ARMAsmPseudo<"strt${q} $Rt, $addr",
+                             (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
+
+// Post-indexed store multiclass over AI3ldstidxT. Produces two defs:
+//   i - offset is a postidx_imm8 operand (Inst{22} = 1)
+//   r - offset is a postidx_reg operand  (Inst{22} = 0)
+multiclass AI3strT<bits<4> op, string opc> {
+  def i
+      : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                    (ins GPR:$Rt, addr_offset_none:$addr, postidx_imm8:$offset),
+                    IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                    "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{3-0} = offset{3-0};
+    let Inst{11-8} = offset{7-4};
+    let Inst{22} = 1;
+    let Inst{23} = offset{8};
+  }
+  def r
+      : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                    (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
+                    IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                    "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let Inst{3-0} = Rm{3-0};
+    let Inst{11-8} = 0;
+    let Inst{22} = 0;
+    let Inst{23} = Rm{4};
+  }
+}
+
+
+// STRHT post-indexed forms (the "i" and "r" defs of AI3strT). Neither def has
+// a selection pattern, so flag them as stores with no other side effects,
+// mirroring the mayLoad/hasSideEffects scope that encloses the AI3ldrT
+// instantiations (LDRSBT, LDRHT, LDRSHT).
+let mayStore = 1, hasSideEffects = 0 in
+defm STRHT : AI3strT<0b1011, "strht">;
+
+// AIstrrel stores ("stl", "stlb", "stlh"); address is a plain base register
+// operand, no itinerary, no selection pattern.
+def STL  : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stl", "\t$Rt, $addr", []>;
+def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlb", "\t$Rt, $addr", []>;
+def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlh", "\t$Rt, $addr", []>;
+
+//===----------------------------------------------------------------------===//
+// Load / store multiple Instructions.
+//
+
+// Load/store-multiple multiclass. For each of the four addressing variants
+// (IA, DA, DB, IB) it emits a plain def and an _UPD def with base writeback.
+//   asm / sfx      - mnemonic stem and text appended after the register list
+//   L_bit / P_bit  - values placed in Inst{20} and Inst{22}
+//   itin / itin_upd - itineraries for the plain and writeback forms
+multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  // IA is the default mode, so the mnemonic carries no explicit suffix here;
+  // the suffix-less form is the canonical spelling.
+  def IA
+      : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+             IndexModeNone, f, itin,
+             !strconcat(asm, "${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{20} = L_bit;
+    let Inst{21} = 0;          // No writeback
+    let Inst{22} = P_bit;
+    let Inst{24-23} = 0b01;    // Increment After
+  }
+  def IA_UPD
+      : AXI4<(outs GPR:$wb),
+             (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+             IndexModeUpd, f, itin_upd,
+             !strconcat(asm, "${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+    let Inst{20} = L_bit;
+    let Inst{21} = 1;          // Writeback
+    let Inst{22} = P_bit;
+    let Inst{24-23} = 0b01;    // Increment After
+  }
+  def DA
+      : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+             IndexModeNone, f, itin,
+             !strconcat(asm, "da${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{20} = L_bit;
+    let Inst{21} = 0;          // No writeback
+    let Inst{22} = P_bit;
+    let Inst{24-23} = 0b00;    // Decrement After
+  }
+  def DA_UPD
+      : AXI4<(outs GPR:$wb),
+             (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+             IndexModeUpd, f, itin_upd,
+             !strconcat(asm, "da${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+    let Inst{20} = L_bit;
+    let Inst{21} = 1;          // Writeback
+    let Inst{22} = P_bit;
+    let Inst{24-23} = 0b00;    // Decrement After
+  }
+  def DB
+      : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+             IndexModeNone, f, itin,
+             !strconcat(asm, "db${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{20} = L_bit;
+    let Inst{21} = 0;          // No writeback
+    let Inst{22} = P_bit;
+    let Inst{24-23} = 0b10;    // Decrement Before
+  }
+  def DB_UPD
+      : AXI4<(outs GPR:$wb),
+             (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+             IndexModeUpd, f, itin_upd,
+             !strconcat(asm, "db${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+    let Inst{20} = L_bit;
+    let Inst{21} = 1;          // Writeback
+    let Inst{22} = P_bit;
+    let Inst{24-23} = 0b10;    // Decrement Before
+  }
+  def IB
+      : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+             IndexModeNone, f, itin,
+             !strconcat(asm, "ib${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{20} = L_bit;
+    let Inst{21} = 0;          // No writeback
+    let Inst{22} = P_bit;
+    let Inst{24-23} = 0b11;    // Increment Before
+  }
+  def IB_UPD
+      : AXI4<(outs GPR:$wb),
+             (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+             IndexModeUpd, f, itin_upd,
+             !strconcat(asm, "ib${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+    let Inst{20} = L_bit;
+    let Inst{21} = 1;          // Writeback
+    let Inst{22} = P_bit;
+    let Inst{24-23} = 0b11;    // Increment Before
+  }
+}
+
+// LDM/STM families from arm_ldst_mult (L_bit 1/0, P_bit 0, empty suffix),
+// each with its complex deprecation predicate attached.
+let hasSideEffects = 0 in {
+
+let hasExtraDefRegAllocReq = 1, mayLoad = 1 in
+defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm,
+                         IIC_iLoad_m, IIC_iLoad_mu>,
+           ComplexDeprecationPredicate<"ARMLoad">;
+
+let hasExtraSrcRegAllocReq = 1, mayStore = 1 in
+defm STM : arm_ldst_mult<"stm", "", 0, 0, LdStMulFrm,
+                         IIC_iStore_m, IIC_iStore_mu>,
+           ComplexDeprecationPredicate<"ARMStore">;
+
+} // hasSideEffects
+
+// FIXME: remove when we have a way of marking an MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+// Code-gen-only return pseudo that expands to LDMIA_UPD with $Rn tied to $wb.
+let isCodeGenOnly = 1, hasExtraDefRegAllocReq = 1, mayLoad = 1,
+    isReturn = 1, isTerminator = 1, isBarrier = 1 in
+def LDMIA_RET
+    : ARMPseudoExpand<(outs GPR:$wb),
+                      (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+                      4, IIC_iLoad_mBr, [],
+                      (LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
+      RegConstraint<"$Rn = $wb">;
+
+// Variants of LDM/STM with P_bit = 1 and " ^" appended after the register
+// list in the assembly string.
+let hasExtraDefRegAllocReq = 1, mayLoad = 1 in
+defm sysLDM : arm_ldst_mult<"ldm", " ^", 1, 1, LdStMulFrm,
+                            IIC_iLoad_m, IIC_iLoad_mu>;
+
+let hasExtraSrcRegAllocReq = 1, mayStore = 1 in
+defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm,
+                            IIC_iStore_m, IIC_iStore_mu>;
+
+
+
+//===----------------------------------------------------------------------===//
+// Move Instructions.
+//
+
+// Register-to-register move; no selection pattern.
+let hasSideEffects = 0 in
+def MOVr
+    : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
+           "mov", "\t$Rd, $Rm", []>,
+      UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{3-0} = Rm;
+  let Inst{11-4} = 0b00000000;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{25} = 0;
+}
+
+// Same move as MOVr, restricted to the tcGPR (tail call) register class.
+let hasSideEffects = 0 in
+def MOVr_TC
+    : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm, IIC_iMOVr,
+           "mov", "\t$Rd, $Rm", []>,
+      UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{3-0} = Rm;
+  let Inst{11-4} = 0b00000000;
+  let Inst{15-12} = Rd;
+  let Inst{25} = 0;
+}
+
+// Move of a register-shifted-by-register operand (shift_so_reg_reg).
+def MOVsr
+    : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
+           DPSoRegRegFrm, IIC_iMOVsr,
+           "mov", "\t$Rd, $src",
+           [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>,
+      UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{3-0} = src{3-0};
+  let Inst{4} = 1;
+  let Inst{6-5} = src{6-5};
+  let Inst{7} = 0;
+  let Inst{11-8} = src{11-8};
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{25} = 0;
+}
+
+// Move of a register-shifted-by-immediate operand (shift_so_reg_imm).
+def MOVsi
+    : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
+           DPSoRegImmFrm, IIC_iMOVsr,
+           "mov", "\t$Rd, $src",
+           [(set GPR:$Rd, shift_so_reg_imm:$src)]>,
+      UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{3-0} = src{3-0};
+  let Inst{4} = 0;
+  let Inst{11-5} = src{11-5};
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{25} = 0;
+}
+
+// Move of a mod_imm immediate; flagged rematerializable, as cheap as a move,
+// and a move-immediate.
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in
+def MOVi
+    : AsI1<0b1101, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm, IIC_iMOVi,
+           "mov", "\t$Rd, $imm",
+           [(set GPR:$Rd, mod_imm:$imm)]>,
+      UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> imm;
+  let Inst{11-0} = imm;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{25} = 1;
+}
+
+// "movw": 16-bit immediate move, split across Inst{19-16} and Inst{11-0}.
+// Requires ARM mode with V6T2.
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in
+def MOVi16
+    : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
+          DPFrm, IIC_iMOVi,
+          "movw", "\t$Rd, $imm",
+          [(set GPR:$Rd, imm0_65535:$imm)]>,
+      Requires<[IsARM, HasV6T2]>, UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<16> imm;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
+  let Inst{11-0} = imm{11-0};
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+}
+
+// Accept the "mov" spelling for a MOVi16 operand set (ARM mode, V6T2).
+def : InstAlias<"mov${p} $Rd, $imm",
+                (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p),
+                0>,
+      Requires<[IsARM, HasV6T2]>;
+
+// Pseudo taking an i32imm address and a pclabel id; no selection pattern.
+def MOVi16_ga_pcrel
+    : PseudoInst<(outs GPR:$Rd),
+                 (ins i32imm:$addr, pclabel:$id),
+                 IIC_iMOVi, []>,
+      Sched<[WriteALU]>;
+
+// Both defs below read $src and write $Rd, which are tied together.
+let Constraints = "$src = $Rd" in {
+// "movt": selected for (or (and $src, 0xffff), imm) where the immediate
+// matches lo16AllZero. Requires ARM mode with V6T2.
+def MOVTi16
+    : AI1<0b1010, (outs GPRnopc:$Rd),
+          (ins GPR:$src, imm0_65535_expr:$imm),
+          DPFrm, IIC_iMOVi,
+          "movt", "\t$Rd, $imm",
+          [(set GPRnopc:$Rd,
+                (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>,
+      UnaryDP, Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<16> imm;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
+  let Inst{11-0} = imm{11-0};
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+}
+
+// Pseudo counterpart taking an i32imm address and a pclabel id.
+def MOVTi16_ga_pcrel
+    : PseudoInst<(outs GPR:$Rd),
+                 (ins GPR:$src, i32imm:$addr, pclabel:$id),
+                 IIC_iMOVi, []>,
+      Sched<[WriteALU]>;
+
+} // Constraints
+
+// OR with 0xffff0000 is selected as MOVTi16 with immediate 0xffff.
+def : ARMPat<(or GPR:$src, 0xffff0000),
+             (MOVTi16 GPR:$src, 0xffff)>,
+      Requires<[IsARM, HasV6T2]>;
+
+// Pseudo selected for ARMrrx; reads CPSR.
+let Uses = [CPSR] in
+def RRX
+    : PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
+                 [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>,
+      UnaryDP, Requires<[IsARM]>, Sched<[WriteALU]>;
+
+// Not true mov instructions: they are modelled this way because of their flag
+// operands. Both pseudos define CPSR.
+
+let Defs = [CPSR] in {
+def MOVsrl_flag
+    : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                 [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>,
+      UnaryDP, Sched<[WriteALU]>, Requires<[IsARM]>;
+def MOVsra_flag
+    : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                 [(set GPR:$dst, (ARMsra_flag GPR:$src))]>,
+      UnaryDP, Sched<[WriteALU]>, Requires<[IsARM]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Extend Instructions.
+//
+
+// Sign extenders
+
+// SXTB/SXTH match sext_inreg of i8/i16; SXTAB/SXTAH match an add whose right
+// operand is such an extension.
+def SXTB  : AI_ext_rrot<0b01101010, "sxtb",
+                        UnOpFrag<(sext_inreg node:$Src, i8)>>;
+def SXTH  : AI_ext_rrot<0b01101011, "sxth",
+                        UnOpFrag<(sext_inreg node:$Src, i16)>>;
+
+def SXTAB : AI_exta_rrot<0b01101010, "sxtab",
+                         BinOpFrag<(add node:$LHS,
+                                        (sext_inreg node:$RHS, i8))>>;
+def SXTAH : AI_exta_rrot<0b01101011, "sxtah",
+                         BinOpFrag<(add node:$LHS,
+                                        (sext_inreg node:$RHS,i16))>>;
+
+// Forms where the extended operand is first shifted right by a rotate amount.
+def : ARMV6Pat<(add rGPR:$Rn,
+                    (sext_inreg (srl rGPR:$Rm, rot_imm:$rot), i8)),
+               (SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : ARMV6Pat<(add rGPR:$Rn,
+                    (sext_inreg (srl rGPR:$Rm, imm8_or_16:$rot), i16)),
+               (SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+
+// Pattern-less ("_np") 16-bit-lane variants.
+def SXTB16  : AI_ext_rrot_np<0b01101000, "sxtb16">;
+
+def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
+
+// Zero extenders
+
+// Zero extenders are expressed as AND masks; everything in this scope gets
+// AddedComplexity = 16.
+let AddedComplexity = 16 in {
+def UXTB   : AI_ext_rrot<0b01101110, "uxtb",
+                         UnOpFrag<(and node:$Src, 0x000000FF)>>;
+def UXTH   : AI_ext_rrot<0b01101111, "uxth",
+                         UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+def UXTB16 : AI_ext_rrot<0b01101100, "uxtb16",
+                         UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
+// The transformation should probably be done as a combiner action
+// instead so we can include a check for masking back in the upper
+// eight bits of the source into the lower eight bits of the result.
+//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
+// (UXTB16r_rot GPR:$Src, 3)>;
+def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
+               (UXTB16 GPR:$Src, 1)>;
+
+def UXTAB : AI_exta_rrot<0b01101110, "uxtab",
+                         BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+def UXTAH : AI_exta_rrot<0b01101111, "uxtah",
+                         BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+
+// Forms where the masked operand is first shifted right by a rotate amount.
+def : ARMV6Pat<(add rGPR:$Rn,
+                    (and (srl rGPR:$Rm, rot_imm:$rot), 0xFF)),
+               (UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : ARMV6Pat<(add rGPR:$Rn,
+                    (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)),
+               (UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+}
+
+// Defined without a pattern: matching it to a plain add is not safe in
+// general, because the add is two 16-bit units rather than one 32-bit add.
+def UXTAB16
+    : AI_exta_rrot_np<0b01101100, "uxtab16">;
+
+
+// "sbfx" $Rd, $Rn, $lsb, $width; no selection pattern. ARM mode, V6T2.
+def SBFX
+    : I<(outs GPRnopc:$Rd),
+        (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
+        AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+        "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
+      Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
+  let Inst{3-0} = Rn;
+  let Inst{6-4} = 0b101;
+  let Inst{11-7} = lsb;
+  let Inst{15-12} = Rd;
+  let Inst{20-16} = width;
+  let Inst{27-21} = 0b0111101;
+}
+
+// "ubfx" $Rd, $Rn, $lsb, $width; no selection pattern. ARM mode, V6T2.
+def UBFX
+    : I<(outs GPRnopc:$Rd),
+        (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
+        AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+        "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
+      Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
+  let Inst{3-0} = Rn;
+  let Inst{6-4} = 0b101;
+  let Inst{11-7} = lsb;
+  let Inst{15-12} = Rd;
+  let Inst{20-16} = width;
+  let Inst{27-21} = 0b0111111;
+}
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions.
+//
+
+// ADD and SUB over AsI1_bin_irs; only ADD carries isAdd and the trailing 1
+// argument.
+let isAdd = 1 in
+defm ADD : AsI1_bin_irs<0b0100, "add", IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                        add, 1>;
+defm SUB : AsI1_bin_irs<0b0010, "sub", IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                        sub>;
+
+// ADD and SUB with 's' bit set.
+//
+// Currently, ADDS/SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real ADD/SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+// Flag-setting pseudo forms, matched from ARMaddc / ARMsubc.
+let isAdd = 1 in
+defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                           ARMaddc, 1>;
+defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                           ARMsubc>;
+
+// With-carry forms, matched from ARMadde / ARMsube.
+let isAdd = 1 in
+defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>;
+defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>;
+
+// Reverse subtract.
+defm RSB : AsI1_rbin_irs<0b0011, "rsb", IIC_iALUi, IIC_iALUr, IIC_iALUsr, sub>;
+
+// FIXME: Eliminate them if we can write def : Pat patterns which define
+// CPSR and the implicit def of CPSR is not needed.
+// Flag-setting reverse subtract (from ARMsubc) and reverse subtract with
+// carry (from ARMsube).
+defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                           ARMsubc>;
+
+defm RSC  : AI1_rsc_irs<0b0111, "rsc", ARMsube>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
+// add/addc of a negated modified-immediate becomes SUBri/SUBSri.
+def : ARMPat<(add GPR:$src, mod_imm_neg:$imm),
+             (SUBri GPR:$src, mod_imm_neg:$imm)>;
+def : ARMPat<(ARMaddc GPR:$src, mod_imm_neg:$imm),
+             (SUBSri GPR:$src, mod_imm_neg:$imm)>;
+
+// With V6T2, a negated 16-bit immediate is materialized by MOVi16 (through
+// imm_neg_XFORM) and subtracted in register form.
+def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm),
+             (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+      Requires<[IsARM, HasV6T2]>;
+def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
+             (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+      Requires<[IsARM, HasV6T2]>;
+
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
+// ARMadde with a bitwise-not modified-immediate becomes SBCri; the 16-bit
+// variant builds the operand with MOVi16 through imm_not_XFORM (V6T2 only).
+def : ARMPat<(ARMadde GPR:$src, mod_imm_not:$imm, CPSR),
+             (SBCri GPR:$src, mod_imm_not:$imm)>;
+def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR),
+             (SBCrr GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>,
+      Requires<[IsARM, HasV6T2]>;
+
+// Note: These are implemented in C++ code, because they have to generate
+// ADD/SUBrs instructions, which use a complex pattern that a xform function
+// cannot produce.
+// (mul X, 2^n+1) -> (add (X << n), X)
+// (mul X, 2^n-1) -> (rsb X, (X << n))
+
+// ARM Arithmetic Instruction
+// GPR:$dst = GPR:$a op GPR:$b
+//
+// op27_20 and op11_4 fill Inst{27-20} and Inst{11-4}. The pattern, input
+// operand list and assembly operand string are optional and default to an
+// empty pattern and the "$Rd, $Rn, $Rm" ordering.
+class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
+          list<dag> pattern = [],
+          dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
+          string asm = "\t$Rd, $Rn, $Rm">
+    : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern>,
+      Sched<[WriteALU, ReadALU, ReadALU]> {
+  bits<4> Rn;
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Unpredictable{11-8} = 0b1111;
+
+  let Inst{3-0} = Rm;
+  let Inst{11-4} = op11_4;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{27-20} = op27_20;
+}
+
+// Saturating add/subtract
+//
+// These four list their inputs as ($Rm, $Rn) and print "$Rd, $Rm, $Rn". Only
+// QADD gets the custom decoder method; QADD and QSUB are selected from the
+// int_arm_qadd / int_arm_qsub intrinsics.
+
+let DecoderMethod = "DecodeQADDInstruction" in
+def QADD  : AAI<0b00010000, 0b00000101, "qadd",
+                [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))],
+                (ins GPRnopc:$Rm, GPRnopc:$Rn),
+                "\t$Rd, $Rm, $Rn">;
+
+def QSUB  : AAI<0b00010010, 0b00000101, "qsub",
+                [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))],
+                (ins GPRnopc:$Rm, GPRnopc:$Rn),
+                "\t$Rd, $Rm, $Rn">;
+def QDADD : AAI<0b00010100, 0b00000101, "qdadd", [],
+                (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+def QDSUB : AAI<0b00010110, 0b00000101, "qdsub", [],
+                (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+
+// Pattern-less AAI defs with the default ($Rn, $Rm) operands.
+// Arguments: Inst{27-20}, Inst{11-4}, mnemonic.
+def QADD16  : AAI<0b01100010, 0b11110001, "qadd16">;
+def QADD8   : AAI<0b01100010, 0b11111001, "qadd8">;
+def QASX    : AAI<0b01100010, 0b11110011, "qasx">;
+def QSAX    : AAI<0b01100010, 0b11110101, "qsax">;
+def QSUB16  : AAI<0b01100010, 0b11110111, "qsub16">;
+def QSUB8   : AAI<0b01100010, 0b11111111, "qsub8">;
+def UQADD16 : AAI<0b01100110, 0b11110001, "uqadd16">;
+def UQADD8  : AAI<0b01100110, 0b11111001, "uqadd8">;
+def UQASX   : AAI<0b01100110, 0b11110011, "uqasx">;
+def UQSAX   : AAI<0b01100110, 0b11110101, "uqsax">;
+def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">;
+def UQSUB8  : AAI<0b01100110, 0b11111111, "uqsub8">;
+
+// Signed/Unsigned add/subtract
+// Pattern-less AAI defs; arguments are Inst{27-20}, Inst{11-4}, mnemonic.
+
+def SASX   : AAI<0b01100001, 0b11110011, "sasx">;
+def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">;
+def SADD8  : AAI<0b01100001, 0b11111001, "sadd8">;
+def SSAX   : AAI<0b01100001, 0b11110101, "ssax">;
+def SSUB16 : AAI<0b01100001, 0b11110111, "ssub16">;
+def SSUB8  : AAI<0b01100001, 0b11111111, "ssub8">;
+def UASX   : AAI<0b01100101, 0b11110011, "uasx">;
+def UADD16 : AAI<0b01100101, 0b11110001, "uadd16">;
+def UADD8  : AAI<0b01100101, 0b11111001, "uadd8">;
+def USAX   : AAI<0b01100101, 0b11110101, "usax">;
+def USUB16 : AAI<0b01100101, 0b11110111, "usub16">;
+def USUB8  : AAI<0b01100101, 0b11111111, "usub8">;
+
+// Signed/Unsigned halving add/subtract
+// Pattern-less AAI defs; arguments are Inst{27-20}, Inst{11-4}, mnemonic.
+
+def SHASX   : AAI<0b01100011, 0b11110011, "shasx">;
+def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">;
+def SHADD8  : AAI<0b01100011, 0b11111001, "shadd8">;
+def SHSAX   : AAI<0b01100011, 0b11110101, "shsax">;
+def SHSUB16 : AAI<0b01100011, 0b11110111, "shsub16">;
+def SHSUB8  : AAI<0b01100011, 0b11111111, "shsub8">;
+def UHASX   : AAI<0b01100111, 0b11110011, "uhasx">;
+def UHADD16 : AAI<0b01100111, 0b11110001, "uhadd16">;
+def UHADD8  : AAI<0b01100111, 0b11111001, "uhadd8">;
+def UHSAX   : AAI<0b01100111, 0b11110101, "uhsax">;
+def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">;
+def UHSUB8  : AAI<0b01100111, 0b11111111, "uhsub8">;
+
+// Unsigned Sum of Absolute Differences [and Accumulate].
+//
+// USAD8 is the non-accumulating form: Inst{15-12} is fixed to 0b1111.
+
+def USAD8
+    : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+         MulFrm /* for convenience */, NoItinerary,
+         "usad8", "\t$Rd, $Rn, $Rm", []>,
+      Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{3-0} = Rn;
+  let Inst{7-4} = 0b0001;
+  let Inst{11-8} = Rm;
+  let Inst{15-12} = 0b1111;
+  let Inst{19-16} = Rd;
+  let Inst{27-20} = 0b01111000;
+}
+// Accumulating form of usad8: the extra $Ra operand is encoded in Inst{15-12}.
+def USADA8
+    : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+         MulFrm /* for convenience */, NoItinerary,
+         "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>,
+      Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<4> Ra;
+  let Inst{3-0} = Rn;
+  let Inst{7-4} = 0b0001;
+  let Inst{11-8} = Rm;
+  let Inst{15-12} = Ra;
+  let Inst{19-16} = Rd;
+  let Inst{27-20} = 0b01111000;
+}
+
+// Signed/Unsigned saturate
+//
+// SSAT: saturation position is imm1_32; the shift_imm operand $sh supplies
+// Inst{11-7} (sh{4-0}) and Inst{6} (sh{5}).
+
+def SSAT
+    : AI<(outs GPRnopc:$Rd),
+         (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+         SatFrm, NoItinerary,
+         "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+      Requires<[IsARM,HasV6]>{
+  bits<4> Rd;
+  bits<5> sat_imm;
+  bits<4> Rn;
+  bits<8> sh;
+  let Inst{3-0} = Rn;
+  let Inst{5-4} = 0b01;
+  let Inst{6} = sh{5};
+  let Inst{11-7} = sh{4-0};
+  let Inst{15-12} = Rd;
+  let Inst{20-16} = sat_imm;
+  let Inst{27-21} = 0b0110101;
+}
+
+// SSAT16: saturation position is imm1_16, encoded in Inst{19-16}; no shift
+// operand.
+def SSAT16
+    : AI<(outs GPRnopc:$Rd),
+         (ins imm1_16:$sat_imm, GPRnopc:$Rn),
+         SatFrm, NoItinerary,
+         "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
+      Requires<[IsARM,HasV6]>{
+  bits<4> Rd;
+  bits<4> sat_imm;
+  bits<4> Rn;
+  let Inst{3-0} = Rn;
+  let Inst{11-4} = 0b11110011;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = sat_imm;
+  let Inst{27-20} = 0b01101010;
+}
+
+// usat Rd, #sat_imm, Rn{shift} -- ARM mode, requires HasV6. Same field layout
+// as SSAT, but Inst{27-21} is 0b0110111 and the saturate position operand is
+// imm0_31 rather than imm1_32.
+def USAT : AI<(outs GPRnopc:$Rd),
+ (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+ SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+ Requires<[IsARM,HasV6]> {
+ bits<4> Rd;
+ bits<5> sat_imm;
+ bits<4> Rn;
+ bits<8> sh; // declared 8 bits wide, but only sh{5} and sh{4-0} are encoded below
+ let Inst{27-21} = 0b0110111;
+ let Inst{5-4} = 0b01;
+ let Inst{15-12} = Rd;
+ let Inst{11-7} = sh{4-0};
+ let Inst{6} = sh{5};
+ let Inst{20-16} = sat_imm;
+ let Inst{3-0} = Rn;
+}
+
+// usat16 Rd, #sat_imm, Rn -- ARM mode, requires HasV6. Same layout as SSAT16
+// with Inst{27-20} = 0b01101110 and an imm0_15 saturate position.
+def USAT16 : AI<(outs GPRnopc:$Rd),
+ (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm,
+ NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+ Requires<[IsARM,HasV6]>{
+ bits<4> Rd;
+ bits<4> sat_imm;
+ bits<4> Rn;
+ let Inst{27-20} = 0b01101110;
+ let Inst{11-4} = 0b11110011; // fixed; no shift field in this form
+ let Inst{15-12} = Rd;
+ let Inst{19-16} = sat_imm;
+ let Inst{3-0} = Rn;
+}
+
+// Select the saturation intrinsics onto SSAT/USAT with the shift operand set
+// to 0.
+def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos),
+ (SSAT imm1_32:$pos, GPRnopc:$a, 0)>;
+def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
+ (USAT imm0_31:$pos, GPRnopc:$a, 0)>;
+// NOTE(review): this pattern feeds an imm0_31 operand into SSAT, whose
+// $sat_imm operand is declared as imm1_32 -- confirm the value arrives in the
+// form SSAT's encoder expects.
+def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+ (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+
+//===----------------------------------------------------------------------===//
+// Bitwise Instructions.
+//
+
+// Each defm instantiates AsI1_bin_irs with a 4-bit opcode value, the mnemonic,
+// three itineraries and the DAG operator to match. AND/ORR/EOR pass a trailing
+// 1 that BIC omits. NOTE(review): presumably the commutable flag -- confirm
+// against the AsI1_bin_irs declaration, which is outside this excerpt.
+defm AND : AsI1_bin_irs<0b0000, "and",
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr, and, 1>;
+defm ORR : AsI1_bin_irs<0b1100, "orr",
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr, or, 1>;
+defm EOR : AsI1_bin_irs<0b0001, "eor",
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr, xor, 1>;
+// BIC matches an AND whose right-hand operand is complemented: LHS & ~RHS.
+defm BIC : AsI1_bin_irs<0b1110, "bic",
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just
+// like in the actual instruction encoding. The complexity of mapping the mask
+// to the lsb/msb pair should be handled by ISel, not encapsulated in the
+// instruction description.
+//
+// bfc Rd, #imm -- ARM mode, requires HasV6T2. Selected for an AND of $src
+// with a bf_inv_mask_imm; the "$src = $Rd" constraint ties the input to the
+// result register. The 10-bit encoded operand is split into two 5-bit fields.
+def BFC : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
+ AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+ "bfc", "\t$Rd, $imm", "$src = $Rd",
+ [(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
+ Requires<[IsARM, HasV6T2]> {
+ bits<4> Rd;
+ bits<10> imm;
+ let Inst{27-21} = 0b0111110;
+ let Inst{6-0} = 0b0011111; // fixed; BFI uses Inst{3-0} for Rn instead
+ let Inst{15-12} = Rd;
+ let Inst{11-7} = imm{4-0}; // lsb
+ let Inst{20-16} = imm{9-5}; // msb
+}
+
+// A8.6.18 BFI - Bitfield insert (Encoding A1)
+// bfi Rd, Rn, #imm -- ARM mode, requires HasV6T2. Selected from ARMbfi; the
+// "$src = $Rd" constraint ties the input to the result register. Uses the
+// same bf_inv_mask_imm operand and the same 5+5-bit split as BFC.
+def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
+ AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+ "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
+ [(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn,
+ bf_inv_mask_imm:$imm))]>,
+ Requires<[IsARM, HasV6T2]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<10> imm;
+ let Inst{27-21} = 0b0111110;
+ let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15
+ let Inst{15-12} = Rd;
+ let Inst{11-7} = imm{4-0}; // lsb
+ let Inst{20-16} = imm{9-5}; // msb (same field BFC labels msb; not a width)
+ let Inst{3-0} = Rn;
+}
+
+// mvn Rd, Rm -- plain register form; selected for (not Rm).
+def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
+ "mvn", "\t$Rd, $Rm",
+ [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP, Sched<[WriteALU]> {
+ bits<4> Rd;
+ bits<4> Rm;
+ let Inst{25} = 0; // 0 in the register forms; MVNi sets this bit to 1
+ let Inst{19-16} = 0b0000;
+ let Inst{11-4} = 0b00000000; // no shift applied
+ let Inst{15-12} = Rd;
+ let Inst{3-0} = Rm;
+}
+// mvn Rd, <so_reg_imm> -- selected for (not so_reg_imm). The 12-bit encoded
+// operand is copied through except for bit 4, which is forced to 0.
+def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
+ DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+ [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP,
+ Sched<[WriteALU]> {
+ bits<4> Rd;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = 0b0000;
+ let Inst{15-12} = Rd;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0; // MVNsr sets this bit to 1
+ let Inst{3-0} = shift{3-0};
+}
+// mvn Rd, <so_reg_reg> -- selected for (not so_reg_reg). Bits 7 and 4 of the
+// instruction are fixed; the remaining operand bits are copied through.
+def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
+ DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+ [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
+ Sched<[WriteALU]> {
+ bits<4> Rd;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = 0b0000;
+ let Inst{15-12} = Rd;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1; // MVNsi sets this bit to 0
+ let Inst{3-0} = shift{3-0};
+}
+// mvn Rd, #imm -- immediate form, flagged rematerializable, as cheap as a
+// move, and a move-immediate. Selected when the constant matches mod_imm_not.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm,
+ IIC_iMVNi, "mvn", "\t$Rd, $imm",
+ [(set GPR:$Rd, mod_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> {
+ bits<4> Rd;
+ bits<12> imm;
+ let Inst{25} = 1; // 1 here; the register forms of MVN use 0
+ let Inst{19-16} = 0b0000;
+ let Inst{15-12} = Rd;
+ let Inst{11-0} = imm;
+}
+
+// An AND with an immediate that matches mod_imm_not is selected as BICri.
+def : ARMPat<(and GPR:$src, mod_imm_not:$imm),
+ (BICri GPR:$src, mod_imm_not:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Multiply Instructions.
+//
+// Helper over AsMul1I for multiplies with one 32-bit result: places Rd, Rm
+// and Rn in the instruction word. Inst{15-12} is left for each instruction to
+// set (MUL fixes it to zero, MLA stores Ra there).
+class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd;
+ bits<4> Rm;
+ bits<4> Rn;
+ let Inst{19-16} = Rd;
+ let Inst{11-8} = Rm;
+ let Inst{3-0} = Rn;
+}
+// Helper over AsMul1I for multiplies with a 64-bit result split across two
+// registers: RdHi goes in Inst{19-16} and RdLo in Inst{15-12}.
+class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+ bits<4> RdLo;
+ bits<4> RdHi;
+ bits<4> Rm;
+ bits<4> Rn;
+ let Inst{19-16} = RdHi;
+ let Inst{15-12} = RdLo;
+ let Inst{11-8} = Rm;
+ let Inst{3-0} = Rn;
+}
+// Helper over AsMul1I used by the 64-bit multiply-accumulate definitions
+// (SMLAL, UMLAL). Its fields and bit assignments are identical to AsMul1I64.
+class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+ bits<4> RdLo;
+ bits<4> RdHi;
+ bits<4> Rm;
+ bits<4> Rn;
+ let Inst{19-16} = RdHi;
+ let Inst{15-12} = RdLo;
+ let Inst{11-8} = Rm;
+ let Inst{3-0} = Rn;
+}
+
+// FIXME: The v5 pseudos are only necessary for the additional Constraint
+// property. Remove them when it's possible to add those properties
+// on an individual MachineInstr, not just an instruction description.
+//
+// MUL is the real instruction (requires HasV6). MULv5 is a pseudo carrying the
+// same pattern for NoV6 targets; it adds an @earlyclobber constraint on $Rd
+// and expands to MUL.
+let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in {
+def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
+ [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{15-12} = 0b0000; // no accumulator operand; MLA stores Ra here
+ let Unpredictable{15-12} = 0b1111; // the same four bits are flagged in the Unpredictable mask
+}
+
+let Constraints = "@earlyclobber $Rd" in
+def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
+ pred:$p, cc_out:$s),
+ 4, IIC_iMUL32,
+ [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
+ (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6, UseMulOps]>;
+}
+
+// mla Rd, Rn, Rm, Ra -- selected for (Rn * Rm) + Ra. Requires HasV6 and
+// UseMulOps; adds the Ra field on top of the AsMul1I32 layout.
+def MLA : AsMul1I32<0b0000001, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra),
+ IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))]>,
+ Requires<[IsARM, HasV6, UseMulOps]> {
+ bits<4> Ra;
+ let Inst{15-12} = Ra;
+}
+
+// Pseudo for NoV6 targets: same pattern as MLA plus an @earlyclobber
+// constraint on $Rd; expands to MLA.
+// NOTE(review): MLA and MULv5 both list UseMulOps in Requires, but this
+// definition does not -- confirm whether that is intentional.
+let Constraints = "@earlyclobber $Rd" in
+def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
+ pred:$p, cc_out:$s), 4, IIC_iMAC32,
+ [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))],
+ (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
+
+// mls Rd, Rn, Rm, Ra -- selected for Ra - (Rn * Rm). Requires HasV6T2 and
+// UseMulOps. Built directly on AMul1I, so all four register fields are
+// declared here; the bit positions match those used by MLA.
+def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
+ Requires<[IsARM, HasV6T2, UseMulOps]> {
+ bits<4> Rd;
+ bits<4> Rm;
+ bits<4> Rn;
+ bits<4> Ra;
+ let Inst{19-16} = Rd;
+ let Inst{15-12} = Ra;
+ let Inst{11-8} = Rm;
+ let Inst{3-0} = Rn;
+}
+
+// Extra precision multiplies with low / high results
+let hasSideEffects = 0 in {
+let isCommutable = 1 in {
+// smull / umull RdLo, RdHi, Rn, Rm -- two result registers, requires HasV6.
+// Both have empty pattern lists; they differ only in the 7-bit opcode value.
+def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+ "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ Requires<[IsARM, HasV6]>;
+
+def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+ "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ Requires<[IsARM, HasV6]>;
+
+// NoV6 pseudos for SMULL/UMULL: both result registers are marked
+// @earlyclobber, and each pseudo expands to the corresponding real instruction.
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+ 4, IIC_iMUL64, [],
+ (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
+
+def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+ 4, IIC_iMUL64, [],
+ (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
+}
+}
+
+// Multiply + accumulate
+// smlal / umlal RdLo, RdHi, Rn, Rm -- requires HasV6. The accumulator inputs
+// $RLo/$RHi are tied to the outputs $RdLo/$RdHi via RegConstraint, so only
+// the output names appear in the assembly string.
+def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
+ "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
+ "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+
+// umaal RdLo, RdHi, Rn, Rm -- requires HasV6, empty pattern list. The inputs
+// $RLo/$RHi are tied to the outputs as in SMLAL/UMLAL. Built on AMul1I, so the
+// register fields are declared here with the same bit positions as AsMla1I64.
+def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ IIC_iMAC64,
+ "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]> {
+ bits<4> RdLo;
+ bits<4> RdHi;
+ bits<4> Rm;
+ bits<4> Rn;
+ let Inst{19-16} = RdHi;
+ let Inst{15-12} = RdLo;
+ let Inst{11-8} = Rm;
+ let Inst{3-0} = Rn;
+}
+
+// NoV6 pseudos for SMLAL/UMLAL. The constraint string combines @earlyclobber
+// on both outputs with the input/output ties that SMLAL/UMLAL express through
+// RegConstraint. Each pseudo expands to the corresponding real instruction.
+let Constraints =
+ "@earlyclobber $RdLo,@earlyclobber $RdHi,$RLo = $RdLo,$RHi = $RdHi" in {
+def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
+ 4, IIC_iMAC64, [],
+ (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+ pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
+def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
+ 4, IIC_iMAC64, [],
+ (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+ pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
+}
+
+} // hasSideEffects
+
+// Most significant word multiply
+// smmul Rd, Rn, Rm -- selected for mulhs; requires HasV6.
+def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{15-12} = 0b1111; // fixed all-ones in the non-accumulating form
+}
+
+// smmulr Rd, Rn, Rm -- same as SMMUL except for the second opcode field
+// (0b0011 instead of 0b0001); no selection pattern is attached.
+def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{15-12} = 0b1111; // fixed all-ones in the non-accumulating form
+}
+
+// Accumulating most-significant-word multiplies, all built on AMul2Ia with
+// the same first opcode field as SMMUL. Only SMMLA carries a selection
+// pattern: mulhs(Rn, Rm) + Ra.
+// NOTE(review): SMMLA and SMMLS require UseMulOps while the rounding forms
+// SMMLAR and SMMLSR do not -- confirm this is intended.
+def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
+ Requires<[IsARM, HasV6, UseMulOps]>;
+
+def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+ Requires<[IsARM, HasV6]>;
+
+def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
+ Requires<[IsARM, HasV6, UseMulOps]>;
+
+def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+ Requires<[IsARM, HasV6]>;
+
+// Halfword multiplies (requires HasV5TE). Each member appends a suffix to the
+// mnemonic prefix `opc`. In the BB/BT/TB/TT patterns the first suffix letter
+// describes how Rn is read and the second how Rm is read: B matches
+// (sext_inreg x, i16) and T matches (sra x, 16). WB and WT use a different
+// first opcode field (0b0001001) and carry no selection pattern.
+multiclass AI_smul<string opc> {
+ def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
+ (sext_inreg GPR:$Rm, i16)))]>,
+ Requires<[IsARM, HasV5TE]>;
+
+ def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
+ (sra GPR:$Rm, (i32 16))))]>,
+ Requires<[IsARM, HasV5TE]>;
+
+ def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
+ (sext_inreg GPR:$Rm, i16)))]>,
+ Requires<[IsARM, HasV5TE]>;
+
+ def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
+ (sra GPR:$Rm, (i32 16))))]>,
+ Requires<[IsARM, HasV5TE]>;
+
+ // The W forms share 0b0001001 with AI_smla's WB/WT; only the 2-bit field
+ // differs (0b01/0b11 here versus 0b00/0b10 there).
+ def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
+ []>,
+ Requires<[IsARM, HasV5TE]>;
+
+ def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
+ []>,
+ Requires<[IsARM, HasV5TE]>;
+}
+
+
+// Halfword multiply-accumulates (requires HasV5TE and UseMulOps). Every
+// member uses the DecodeSMLAInstruction decoder method. The BB/BT/TB/TT
+// patterns add $Ra to the same halfword products AI_smul matches: B matches
+// (sext_inreg x, i16) and T matches (sra x, 16). WB and WT carry no pattern.
+multiclass AI_smla<string opc> {
+ let DecoderMethod = "DecodeSMLAInstruction" in {
+ def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPRnopc:$Rd, (add GPR:$Ra,
+ (mul (sext_inreg GPRnopc:$Rn, i16),
+ (sext_inreg GPRnopc:$Rm, i16))))]>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+ def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16),
+ (sra GPRnopc:$Rm, (i32 16)))))]>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+ def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
+ (sext_inreg GPRnopc:$Rm, i16))))]>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+ def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
+ (sra GPRnopc:$Rm, (i32 16)))))]>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+ // The W forms share 0b0001001 with AI_smul's WB/WT; only the 2-bit field
+ // differs (0b00/0b10 here versus 0b01/0b11 there).
+ def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
+ []>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+ def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
+ []>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
+ }
+}
+
+// Instantiate the halfword multiply families; the defm name is prefixed to
+// each member name (BB, BT, TB, TT, WB, WT).
+defm SMUL : AI_smul<"smul">;
+defm SMLA : AI_smla<"smla">;
+
+// Halfword multiply accumulate long: SMLAL<x><y>.
+// The four variants differ only in the 2-bit field passed to AMulxyI64. All
+// require HasV5TE and have empty pattern lists.
+// NOTE(review): unlike SMLAL/UMLAL above, no accumulator inputs tied to
+// $RdLo/$RdHi are declared here -- confirm whether that is intended.
+def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ Requires<[IsARM, HasV5TE]>;
+
+def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ Requires<[IsARM, HasV5TE]>;
+
+def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ Requires<[IsARM, HasV5TE]>;
+
+def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ Requires<[IsARM, HasV5TE]>;
+
+// Helper class for AI_smld.
+// Common encoding for the dual-multiply instructions (ARM mode, HasV6, no
+// selection pattern). The three bit parameters are written straight into the
+// instruction word; Inst{19-12} is left for the derived classes to fill in.
+class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm>
+ : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> {
+ bits<4> Rn;
+ bits<4> Rm;
+ let Inst{27-23} = 0b01110;
+ let Inst{22} = long; // 1 for the 64-bit (LD/LDX) members of AI_smld
+ let Inst{21-20} = 0b00;
+ let Inst{11-8} = Rm;
+ let Inst{7} = 0;
+ let Inst{6} = sub; // 0 for the smla*/smua* families, 1 for smls*/smus*
+ let Inst{5} = swap; // 1 for the members whose mnemonic ends in "x"
+ let Inst{4} = 1;
+ let Inst{3-0} = Rn;
+}
+// Dual multiply without an accumulator operand: Rd in Inst{19-16} and
+// Inst{15-12} fixed to all-ones (AMulDualIa stores Ra there).
+class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm>
+ : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+ bits<4> Rd;
+ let Inst{15-12} = 0b1111;
+ let Inst{19-16} = Rd;
+}
+// Dual multiply with a 32-bit accumulator operand: Rd in Inst{19-16} and Ra
+// in Inst{15-12}.
+class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm>
+ : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+ bits<4> Ra;
+ bits<4> Rd;
+ let Inst{19-16} = Rd;
+ let Inst{15-12} = Ra;
+}
+// Dual multiply with a 64-bit register pair: RdHi in Inst{19-16} and RdLo in
+// Inst{15-12}.
+class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm>
+ : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+ bits<4> RdLo;
+ bits<4> RdHi;
+ let Inst{19-16} = RdHi;
+ let Inst{15-12} = RdLo;
+}
+
+// Dual multiply-accumulate family. `sub` is forwarded to the encoding and
+// `opc` is the mnemonic prefix. D/DX take a 32-bit accumulator $Ra; LD/LDX use
+// a 64-bit register pair. The X members pass swap = 1.
+multiclass AI_smld<bit sub, string opc> {
+
+ def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
+
+ def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
+
+ def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
+ !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
+
+ def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
+ !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
+
+}
+
+// smlad/smladx/smlald/smlaldx (sub = 0) and smlsd/smlsdx/smlsld/smlsldx
+// (sub = 1).
+defm SMLA : AI_smld<0, "smla">;
+defm SMLS : AI_smld<1, "smls">;
+
+// Dual multiply family without an accumulator operand. `sub` is forwarded to
+// the encoding and `opc` is the mnemonic prefix; DX passes swap = 1.
+multiclass AI_sdml<bit sub, string opc> {
+
+ def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+ def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
+ NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
+}
+
+// smuad/smuadx (sub = 0) and smusd/smusdx (sub = 1).
+defm SMUA : AI_sdml<0, "smua">;
+defm SMUS : AI_sdml<1, "smus">;
+
+//===----------------------------------------------------------------------===//
+// Division Instructions (ARMv7-A with virtualization extension)
+//
+// sdiv / udiv Rd, Rn, Rm -- selected for the sdiv and udiv nodes; both are
+// gated on HasDivideInARM and differ only in the 3-bit opcode value.
+def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+ "sdiv", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasDivideInARM]>;
+
+def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+ "udiv", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasDivideInARM]>;
+
+//===----------------------------------------------------------------------===//
+// Misc. Arithmetic Instructions.
+//
+
+// clz Rd, Rm -- selected for ctlz; requires HasV5T.
+def CLZ : AMiscA1I<0b00010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
+ IIC_iUNAr, "clz", "\t$Rd, $Rm",
+ [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>,
+ Sched<[WriteALU]>;
+
+// rbit Rd, Rm -- selected for bitreverse; requires HasV6T2.
+def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+ IIC_iUNAr, "rbit", "\t$Rd, $Rm",
+ [(set GPR:$Rd, (bitreverse GPR:$Rm))]>,
+ Requires<[IsARM, HasV6T2]>,
+ Sched<[WriteALU]>;
+
+// rev Rd, Rm -- selected for bswap; requires HasV6.
+def REV : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+ IIC_iUNAr, "rev", "\t$Rd, $Rm",
+ [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>,
+ Sched<[WriteALU]>;
+
+// rev16 Rd, Rm -- selected for a bswap followed by a rotate right by 16.
+// AddedComplexity = 5 raises this pattern's priority during selection.
+let AddedComplexity = 5 in
+def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+ IIC_iUNAr, "rev16", "\t$Rd, $Rm",
+ [(set GPR:$Rd, (rotr (bswap GPR:$Rm), (i32 16)))]>,
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteALU]>;
+
+// Halfword byte-swap around memory: a 16-bit extending load that is
+// byte-swapped and shifted right by 16 becomes LDRH + REV16, and the matching
+// truncating 16-bit store becomes REV16 + STRH.
+def : ARMV6Pat<(srl (bswap (extloadi16 addrmode3:$addr)), (i32 16)),
+ (REV16 (LDRH addrmode3:$addr))>;
+def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr),
+ (STRH (REV16 GPR:$Rn), addrmode3:$addr)>;
+
+// revsh Rd, Rm -- selected for a bswap followed by an arithmetic shift right
+// by 16. AddedComplexity = 5 raises this pattern's priority during selection.
+let AddedComplexity = 5 in
+def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+ IIC_iUNAr, "revsh", "\t$Rd, $Rm",
+ [(set GPR:$Rd, (sra (bswap GPR:$Rm), (i32 16)))]>,
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteALU]>;
+
+// Alternate shift/mask spelling that is also selected as REVSH:
+// ((Rm << 24) >>s 16) | ((Rm >>u 8) & 0xFF).
+def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)),
+ (and (srl GPR:$Rm, (i32 8)), 0xFF)),
+ (REVSH GPR:$Rm)>;
+
+// pkhbt Rd, Rn, Rm{shift} -- selected for
+// (Rn & 0xFFFF) | ((Rm << sh) & 0xFFFF0000), where sh is a pkh_lsl_amt.
+def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_lsl_amt:$sh),
+ IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+ [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF),
+ (and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh),
+ 0xFFFF0000)))]>,
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+// First pattern: no shift node present, so the shift operand is 0. Second
+// pattern: a left shift by an imm16_31 amount with no explicit 0xFFFF0000 mask.
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)),
+ (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (shl GPRnopc:$Rm, imm16_31:$sh)),
+ (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, imm16_31:$sh)>;
+
+// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
+// will match the pattern below.
+// pkhtb Rd, Rn, Rm{shift} -- selected for
+// (Rn & 0xFFFF0000) | ((Rm >>s sh) & 0xFFFF), where sh is a pkh_asr_amt.
+def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_asr_amt:$sh),
+ IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+ [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000),
+ (and (sra GPRnopc:$Rm, pkh_asr_amt:$sh),
+ 0xFFFF)))]>,
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHTB where identities eliminate some nodes. Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
+// The three patterns below cover: a logical shift by imm16, an arithmetic
+// shift by imm16_31, and a masked logical shift by imm1_15.
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+ (srl GPRnopc:$src2, imm16:$sh)),
+ (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+ (sra GPRnopc:$src2, imm16_31:$sh)),
+ (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+ (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)),
+ (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
+
+//===----------------------------------------------------------------------===//
+// CRC Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W} 0x04C11DB7
+// + CRC32C{B,H,W} 0x1EDC6F41
+//
+
+// Shared description of the CRC32 instructions (ARM mode, HasV8 and HasCRC).
+//   C       - written to Inst{9}; 1 for the crc32c* definitions.
+//   sz      - written to Inst{22-21}; 0b00/0b01/0b10 for the b/h/w suffixes.
+//   suffix  - appended to "crc32" to form the mnemonic.
+//   builtin - operator matched by the selection pattern (Rn, Rm) -> Rd.
+// The instruction is not predicable (AInoP); Inst{31-28} is fixed to 0b1110.
+class AI_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+ : AInoP<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), MiscFrm, NoItinerary,
+ !strconcat("crc32", suffix), "\t$Rd, $Rn, $Rm",
+ [(set GPRnopc:$Rd, (builtin GPRnopc:$Rn, GPRnopc:$Rm))]>,
+ Requires<[IsARM, HasV8, HasCRC]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+
+ let Inst{31-28} = 0b1110;
+ let Inst{27-23} = 0b00010;
+ let Inst{22-21} = sz;
+ let Inst{20} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-10} = 0b00;
+ let Inst{9} = C;
+ let Inst{8} = 0;
+ let Inst{7-4} = 0b0100;
+ let Inst{3-0} = Rm;
+
+ let Unpredictable{11-8} = 0b1101; // flags bits 11, 10 and 8; bit 9 holds C and is excluded
+}
+
+// One definition per (C, size) combination; each is matched from the
+// like-named int_arm_crc32* intrinsic.
+def CRC32B : AI_crc32<0, 0b00, "b", int_arm_crc32b>;
+def CRC32CB : AI_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def CRC32H : AI_crc32<0, 0b01, "h", int_arm_crc32h>;
+def CRC32CH : AI_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def CRC32W : AI_crc32<0, 0b10, "w", int_arm_crc32w>;
+def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
+// ARMv8.1a Privilege Access Never extension
+//
+// SETPAN #imm1
+
+// setpan #imm -- ARM mode, requires HasV8 and HasV8_1a; not predicable and
+// has no selection pattern. The single-bit operand lands in Inst{9}; every
+// other field is fixed.
+def SETPAN : AInoP<(outs), (ins imm0_1:$imm), MiscFrm, NoItinerary, "setpan",
+ "\t$imm", []>, Requires<[IsARM, HasV8, HasV8_1a]> {
+ bits<1> imm;
+
+ let Inst{31-28} = 0b1111;
+ let Inst{27-20} = 0b00010001;
+ let Inst{19-16} = 0b0000;
+ let Inst{15-10} = 0b000000;
+ let Inst{9} = imm;
+ let Inst{8} = 0b0;
+ let Inst{7-4} = 0b0000;
+ let Inst{3-0} = 0b0000;
+
+ // The Unpredictable mask covers the zero fields 19-16, 15-10, 8 and 3-0.
+ // Inst{7-4} is also fixed to zero but is not included in the mask.
+ let Unpredictable{19-16} = 0b1111;
+ let Unpredictable{15-10} = 0b111111;
+ let Unpredictable{8} = 0b1;
+ let Unpredictable{3-0} = 0b1111;
+}
+
+//===----------------------------------------------------------------------===//
+// Comparison Instructions...
+//
+
+// cmp in its immediate / register / shifted-register forms, matched from
+// ARMcmp. The patterns below refer to the results as CMPri, CMPrr, CMPrsi and
+// CMPrsr.
+defm CMP : AI1_cmp_irs<0b1010, "cmp",
+ IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, ARMcmp>;
+
+// ARMcmpZ can re-use the above instruction definitions.
+// One pattern per operand kind: mod_imm, plain register, so_reg_imm and
+// so_reg_reg.
+def : ARMPat<(ARMcmpZ GPR:$src, mod_imm:$imm),
+ (CMPri GPR:$src, mod_imm:$imm)>;
+def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs),
+ (CMPrr GPR:$src, GPR:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs),
+ (CMPrsi GPR:$src, so_reg_imm:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
+ (CMPrsr GPR:$src, so_reg_reg:$rhs)>;
+
+// CMN register-integer
+let isCompare = 1, Defs = [CPSR] in {
+// cmn Rn, #imm -- matched from ARMcmn with a mod_imm operand. There is no
+// destination register, so Inst{15-12} is zero and flagged Unpredictable.
+def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, IIC_iCMPi,
+ "cmn", "\t$Rn, $imm",
+ [(ARMcmn GPR:$Rn, mod_imm:$imm)]>,
+ Sched<[WriteCMP, ReadALU]> {
+ bits<4> Rn;
+ bits<12> imm;
+ let Inst{25} = 1; // 1 in this immediate form; the register forms below use 0
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-0} = imm;
+
+ let Unpredictable{15-12} = 0b1111;
+}
+
+// CMN register-register/shift
+// cmn Rn, Rm -- matched from ARMcmpZ(Rn, ineg(Rm)), i.e. a compare against a
+// negated register. Marked commutable.
+def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr,
+ "cmn", "\t$Rn, $Rm",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPR:$Rn, GPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
+ bits<4> Rn;
+ bits<4> Rm;
+ let isCommutable = 1;
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-4} = 0b00000000; // no shift applied
+ let Inst{3-0} = Rm;
+
+ let Unpredictable{15-12} = 0b1111;
+}
+
+// cmn Rn, <so_reg_imm> -- matched from ARMcmpZ(Rn, ineg(shifted operand)).
+// The 12-bit encoded operand is copied through except for bit 4, which is
+// forced to 0.
+def CMNzrsi : AI1<0b1011, (outs),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr,
+ "cmn", "\t$Rn, $shift",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPR:$Rn, so_reg_imm:$shift)]>,
+ Sched<[WriteCMPsi, ReadALU]> {
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0; // CMNzrsr sets this bit to 1
+ let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{15-12} = 0b1111;
+}
+
+// cmn Rn, <so_reg_reg> -- matched from ARMcmpZ(Rn, ineg(shifted operand)).
+// Rn is GPRnopc here, whereas the other CMN forms accept GPR. Bits 7 and 4 of
+// the instruction are fixed; the remaining operand bits are copied through.
+def CMNzrsr : AI1<0b1011, (outs),
+ (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr,
+ "cmn", "\t$Rn, $shift",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPRnopc:$Rn, so_reg_reg:$shift)]>,
+ Sched<[WriteCMPsr, ReadALU]> {
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1; // CMNzrsi sets this bit to 0
+ let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{15-12} = 0b1111;
+}
+
+}
+
+// A compare (ARMcmp or ARMcmpZ) against an immediate that matches mod_imm_neg
+// is selected as CMNri.
+def : ARMPat<(ARMcmp GPR:$src, mod_imm_neg:$imm),
+ (CMNri GPR:$src, mod_imm_neg:$imm)>;
+
+def : ARMPat<(ARMcmpZ GPR:$src, mod_imm_neg:$imm),
+ (CMNri GPR:$src, mod_imm_neg:$imm)>;
+
+// Note that TST/TEQ don't set all the same flags that CMP does!
+// Both are matched from ARMcmpZ of a single-use logical op (and_su / xor_su)
+// against 0. TST additionally names a custom decoder method; TEQ does not.
+// NOTE(review): the trailing 1 is presumably the commutable flag of
+// AI1_cmp_irs -- confirm against its declaration.
+defm TST : AI1_cmp_irs<0b1000, "tst",
+ IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+ BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1,
+ "DecodeTSTInstruction">;
+defm TEQ : AI1_cmp_irs<0b1001, "teq",
+ IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+ BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>;
+
+// Pseudo i64 compares for some floating point compares.
+// Both pseudos are terminator branches that define CPSR and are expanded by a
+// custom inserter. BCCi64 compares two register pairs; BCCZi64 matches the
+// same ARMBcci64 node when both right-hand halves are the constant 0.
+let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
+ Defs = [CPSR] in {
+def BCCi64 : PseudoInst<(outs),
+ (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst),
+ IIC_Br,
+ [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>,
+ Sched<[WriteBr]>;
+
+def BCCZi64 : PseudoInst<(outs),
+ (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br,
+ [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>,
+ Sched<[WriteBr]>;
+} // usesCustomInserter
+
+
+// Conditional moves
+let hasSideEffects = 0 in {
+
+// Conditional register move pseudo, matched from ARMcmov. $false is tied to
+// $Rd. Flagged commutable and as a select.
+let isCommutable = 1, isSelect = 1 in
+def MOVCCr : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, GPR:$Rm, cmovpred:$p),
+ 4, IIC_iCMOVr,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+// Conditional move pseudo whose source is an so_reg_imm operand; $false is
+// tied to $Rd.
+def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, so_reg_imm:$shift, cmovpred:$p),
+ 4, IIC_iCMOVsr,
+ [(set GPR:$Rd,
+ (ARMcmov GPR:$false, so_reg_imm:$shift,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+// Conditional move pseudo whose source is an so_reg_reg operand; $false is
+// tied to $Rd.
+def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, so_reg_reg:$shift, cmovpred:$p),
+ 4, IIC_iCMOVsr,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+
+// Conditional move of an immediate matching imm0_65535 (requires HasV6T2).
+// The instruction operand is imm0_65535_expr while the selection pattern
+// matches imm0_65535. $false is tied to $Rd.
+let isMoveImm = 1 in
+def MOVCCi16
+ : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+ 4, IIC_iMOVi,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, imm0_65535:$imm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>,
+ Sched<[WriteALU]>;
+
+// Conditional move of a mod_imm immediate; $false is tied to $Rd.
+let isMoveImm = 1 in
+def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, mod_imm:$imm, cmovpred:$p),
+ 4, IIC_iCMOVi,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm:$imm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+// Two instruction predicate mov immediate.
+// Conditional move of an arbitrary 32-bit immediate (requires HasV6T2). The
+// pseudo is declared with size 8 and the IIC_iCMOVix2 itinerary.
+// NOTE(review): unlike the other MOVCC* pseudos, no Sched<[WriteALU]> is
+// attached here -- confirm this is intended.
+let isMoveImm = 1 in
+def MOVCCi32imm
+ : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, i32imm:$src, cmovpred:$p),
+ 8, IIC_iCMOVix2,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, imm:$src,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
+
+// Conditional move-not of an immediate: the instruction operand is mod_imm
+// while the selection pattern matches mod_imm_not. $false is tied to $Rd.
+let isMoveImm = 1 in
+def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, mod_imm:$imm, cmovpred:$p),
+ 4, IIC_iCMOVi,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm_not:$imm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+} // hasSideEffects
+
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+// Operand used by DMB and DSB for the barrier option. The asm operand class
+// names the parser hook; the operand itself names the printer and decoder
+// hooks.
+def MemBarrierOptOperand : AsmOperandClass {
+ let Name = "MemBarrierOpt";
+ let ParserMethod = "parseMemBarrierOptOperand";
+}
+def memb_opt : Operand<i32> {
+ let PrintMethod = "printMemBOption";
+ let ParserMatchClass = MemBarrierOptOperand;
+ let DecoderMethod = "DecodeMemBarrierOption";
+}
+
+// Operand used by ISB for the barrier option; structured like memb_opt but
+// with its own parser, printer and decoder hooks.
+def InstSyncBarrierOptOperand : AsmOperandClass {
+ let Name = "InstSyncBarrierOpt";
+ let ParserMethod = "parseInstSyncBarrierOptOperand";
+}
+def instsyncb_opt : Operand<i32> {
+ let PrintMethod = "printInstSyncBOption";
+ let ParserMatchClass = InstSyncBarrierOptOperand;
+ let DecoderMethod = "DecodeInstSyncBarrierOption";
+}
+
+// Memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+// dmb <opt> -- matched from int_arm_dmb with an imm0_15 option; requires
+// HasDB and is not predicable.
+def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+ "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
+ Requires<[IsARM, HasDB]> {
+ bits<4> opt;
+ let Inst{31-4} = 0xf57ff05; // upper 28 bits are fixed; DSB and ISB differ only in the last hex digit
+ let Inst{3-0} = opt;
+}
+
+// dsb <opt> -- matched from int_arm_dsb with an imm0_15 option; requires
+// HasDB and is not predicable.
+def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+ "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+ Requires<[IsARM, HasDB]> {
+ bits<4> opt;
+ let Inst{31-4} = 0xf57ff04; // upper 28 bits are fixed
+ let Inst{3-0} = opt;
+}
+
+// ISB has only full system option
+// isb <opt> -- matched from int_arm_isb with an imm0_15 option; uses the
+// instsyncb_opt operand rather than memb_opt. Requires HasDB.
+def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
+ "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
+ Requires<[IsARM, HasDB]> {
+ bits<4> opt;
+ let Inst{31-4} = 0xf57ff06; // upper 28 bits are fixed
+ let Inst{3-0} = opt;
+}
+}
+
+// ABS is expanded by a custom inserter and is declared as defining CPSR. It
+// has size 8 and no selection pattern.
+let usesCustomInserter = 1, Defs = [CPSR] in {
+
+// Pseudo instruction that combines movs + predicated rsbmi
+// to implement integer ABS
+ def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
+}
+
+// Pseudo matched from ARMcopystructbyval (destination, source, size and
+// alignment operands); it produces no results and is expanded by a custom
+// inserter.
+let usesCustomInserter = 1 in {
+ def COPY_STRUCT_BYVAL_I32 : PseudoInst<
+ (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
+ NoItinerary,
+ [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
+}
+
+// The Constraints string ties each result to its input ($newdst to $dst,
+// $newsrc to $src); hasPostISelHook requests a post-selection callback.
+let hasPostISelHook = 1, Constraints = "$newdst = $dst, $newsrc = $src" in {
+ // %newdst, %newsrc = MEMCPY %dst, %src, N, ...N scratch regs...
+ // Copies N registers worth of memory from address %src to address %dst
+ // and returns the incremented addresses. N scratch registers will
+ // be attached for the copy to use.
+ def MEMCPY : PseudoInst<
+ (outs GPR:$newdst, GPR:$newsrc),
+ (ins GPR:$dst, GPR:$src, i32imm:$nreg, variable_ops),
+ NoItinerary,
+ [(set GPR:$newdst, GPR:$newsrc,
+ (ARMmemcopy GPR:$dst, GPR:$src, imm:$nreg))]>;
+}
+
+// Fragments that match int_arm_ldrex only when the node's memory VT is i8,
+// i16 or i32 respectively; the predicate reads it from the MemIntrinsicSDNode.
+def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldrex_2 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldrex_4 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def strex_1 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm_strex node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def strex_2 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm_strex node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def strex_4 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm_strex node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldaex_1 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldaex_2 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldaex_4 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stlex_1 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm_stlex node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stlex_2 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm_stlex node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stlex_4 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm_stlex node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+let mayLoad = 1 in { // every exclusive load in this scope is marked mayLoad
+def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldrexb", "\t$Rt, $addr",
+ [(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>; // byte form: selected via ldrex_1
+def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldrexh", "\t$Rt, $addr",
+ [(set GPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>; // halfword form: selected via ldrex_2
+def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldrex", "\t$Rt, $addr",
+ [(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>; // word form: selected via ldrex_4
+let hasExtraDefRegAllocReq = 1 in
+def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr), // result is a GPRPairOp, not a single GPR
+ NoItinerary, "ldrexd", "\t$Rt, $addr", []> { // no selection pattern
+ let DecoderMethod = "DecodeDoubleRegLoad"; // disassembly goes through this custom decoder
+}
+
+def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldaexb", "\t$Rt, $addr",
+ [(set GPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>; // byte form: selected via ldaex_1
+def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldaexh", "\t$Rt, $addr",
+ [(set GPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>; // halfword form: selected via ldaex_2
+def LDAEX : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldaex", "\t$Rt, $addr",
+ [(set GPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>; // word form: selected via ldaex_4
+let hasExtraDefRegAllocReq = 1 in
+def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr), // result is a GPRPairOp, not a single GPR
+ NoItinerary, "ldaexd", "\t$Rt, $addr", []> { // no selection pattern
+ let DecoderMethod = "DecodeDoubleRegLoad"; // same custom decoder as LDREXD
+}
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in { // all exclusive stores: mayStore, with $Rd constrained as early-clobber
+def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "strexb", "\t$Rd, $Rt, $addr",
+ [(set GPR:$Rd, (strex_1 GPR:$Rt, // byte form: selected via strex_1
+ addr_offset_none:$addr))]>;
+def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "strexh", "\t$Rd, $Rt, $addr",
+ [(set GPR:$Rd, (strex_2 GPR:$Rt, // halfword form: selected via strex_2
+ addr_offset_none:$addr))]>;
+def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "strex", "\t$Rd, $Rt, $addr",
+ [(set GPR:$Rd, (strex_4 GPR:$Rt, // word form: selected via strex_4
+ addr_offset_none:$addr))]>;
+let hasExtraSrcRegAllocReq = 1 in
+def STREXD : AIstrex<0b01, (outs GPR:$Rd),
+ (ins GPRPairOp:$Rt, addr_offset_none:$addr), // stored value is a GPRPairOp
+ NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> { // no selection pattern
+ let DecoderMethod = "DecodeDoubleRegStore"; // disassembly goes through this custom decoder
+}
+def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stlexb", "\t$Rd, $Rt, $addr",
+ [(set GPR:$Rd,
+ (stlex_1 GPR:$Rt, addr_offset_none:$addr))]>; // byte form: selected via stlex_1
+def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stlexh", "\t$Rd, $Rt, $addr",
+ [(set GPR:$Rd,
+ (stlex_2 GPR:$Rt, addr_offset_none:$addr))]>; // halfword form: selected via stlex_2
+def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stlex", "\t$Rd, $Rt, $addr",
+ [(set GPR:$Rd,
+ (stlex_4 GPR:$Rt, addr_offset_none:$addr))]>; // word form: selected via stlex_4
+let hasExtraSrcRegAllocReq = 1 in
+def STLEXD : AIstlex<0b01, (outs GPR:$Rd),
+ (ins GPRPairOp:$Rt, addr_offset_none:$addr), // stored value is a GPRPairOp
+ NoItinerary, "stlexd", "\t$Rd, $Rt, $addr", []> { // no selection pattern
+ let DecoderMethod = "DecodeDoubleRegStore"; // same custom decoder as STREXD
+}
+}
+
+def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", // no operands at all
+ [(int_arm_clrex)]>, // matched from the int_arm_clrex intrinsic
+ Requires<[IsARM, HasV6K]> {
+ let Inst{31-0} = 0b11110101011111111111000000011111; // the whole 32-bit encoding is fixed
+}
+
+def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr), // byte store of a 0xff-masked value...
+ (STREXB GPR:$Rt, addr_offset_none:$addr)>; // ...becomes STREXB on the unmasked register (the 'and' is dropped)
+def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), // halfword store of a 0xffff-masked value...
+ (STREXH GPR:$Rt, addr_offset_none:$addr)>; // ...becomes STREXH on the unmasked register
+
+def : ARMPat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr), // same folding for the stlex byte form
+ (STLEXB GPR:$Rt, addr_offset_none:$addr)>;
+def : ARMPat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), // same folding for the stlex halfword form
+ (STLEXH GPR:$Rt, addr_offset_none:$addr)>;
+
+class acquiring_load<PatFrag base> // wraps 'base' and adds an ordering predicate
+ : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return isAcquireOrStronger(Ordering);
+}]>; // matches only when isAcquireOrStronger() holds for the node's ordering
+
+def atomic_load_acquire_8 : acquiring_load<atomic_load_8>; // acquire-or-stronger atomic_load_8
+def atomic_load_acquire_16 : acquiring_load<atomic_load_16>; // acquire-or-stronger atomic_load_16
+def atomic_load_acquire_32 : acquiring_load<atomic_load_32>; // acquire-or-stronger atomic_load_32
+
+class releasing_store<PatFrag base> // wraps 'base' and adds an ordering predicate
+ : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return isReleaseOrStronger(Ordering);
+}]>; // matches only when isReleaseOrStronger() holds for the node's ordering
+
+def atomic_store_release_8 : releasing_store<atomic_store_8>; // release-or-stronger atomic_store_8
+def atomic_store_release_16 : releasing_store<atomic_store_16>; // release-or-stronger atomic_store_16
+def atomic_store_release_32 : releasing_store<atomic_store_32>; // release-or-stronger atomic_store_32
+
+let AddedComplexity = 8 in { // raises the complexity of the six patterns in this scope by 8
+ def : ARMPat<(atomic_load_acquire_8 addr_offset_none:$addr), (LDAB addr_offset_none:$addr)>; // 8-bit acquire load -> LDAB
+ def : ARMPat<(atomic_load_acquire_16 addr_offset_none:$addr), (LDAH addr_offset_none:$addr)>; // 16-bit acquire load -> LDAH
+ def : ARMPat<(atomic_load_acquire_32 addr_offset_none:$addr), (LDA addr_offset_none:$addr)>; // 32-bit acquire load -> LDA
+ def : ARMPat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (STLB GPR:$val, addr_offset_none:$addr)>; // operands swap order: (addr, val) -> (val, addr)
+ def : ARMPat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (STLH GPR:$val, addr_offset_none:$addr)>; // 16-bit release store -> STLH
+ def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL GPR:$val, addr_offset_none:$addr)>; // 32-bit release store -> STL
+}
+
+// SWP/SWPB are deprecated in V6/V7.
+let mayLoad = 1, mayStore = 1 in { // both swap forms are marked as loading and storing
+def SWP : AIswp<0, (outs GPRnopc:$Rt), // first class argument is 0 for SWP, 1 for SWPB
+ (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>, // no selection pattern
+ Requires<[PreV8]>;
+def SWPB: AIswp<1, (outs GPRnopc:$Rt),
+ (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>, // no selection pattern
+ Requires<[PreV8]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor Instructions.
+//
+
+def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, // no results; all six operands are immediates
+ c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+ NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+ [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, // matched from the int_arm_cdp intrinsic
+ imm:$CRm, imm:$opc2)]>,
+ Requires<[PreV8]> {
+ bits<4> opc1;
+ bits<4> CRn;
+ bits<4> CRd;
+ bits<4> cop;
+ bits<3> opc2;
+ bits<4> CRm;
+
+ let Inst{3-0} = CRm;
+ let Inst{4} = 0; // bit 4 is fixed to 0 here (MovRCopro sets it to 1)
+ let Inst{7-5} = opc2;
+ let Inst{11-8} = cop;
+ let Inst{15-12} = CRd;
+ let Inst{19-16} = CRn;
+ let Inst{23-20} = opc1; // opc1 is 4 bits wide in CDP
+}
+
+def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, // same operand list as CDP, built on ABXI
+ c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+ NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", // mnemonic and operands form one asm string
+ [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, // matched from the int_arm_cdp2 intrinsic
+ imm:$CRm, imm:$opc2)]>,
+ Requires<[PreV8]> {
+ let Inst{31-28} = 0b1111; // top nibble is fixed
+ bits<4> opc1;
+ bits<4> CRn;
+ bits<4> CRd;
+ bits<4> cop;
+ bits<3> opc2;
+ bits<4> CRm;
+
+ let Inst{3-0} = CRm;
+ let Inst{4} = 0; // bit 4 is fixed to 0, as in CDP
+ let Inst{7-5} = opc2;
+ let Inst{11-8} = cop;
+ let Inst{15-12} = CRd;
+ let Inst{19-16} = CRn;
+ let Inst{23-20} = opc1;
+}
+
+class ACI<dag oops, dag iops, string opc, string asm, // base class used by the LdStCop multiclass
+ list<dag> pattern, IndexMode im = IndexModeNone> // index mode defaults to IndexModeNone
+ : I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+ opc, asm, "", pattern> {
+ let Inst{27-25} = 0b110; // the only encoding bits fixed at this level
+}
+class ACInoP<dag oops, dag iops, string opc, string asm, // counterpart of ACI built on InoP; used by LdSt2Cop
+ list<dag> pattern, IndexMode im = IndexModeNone> // index mode defaults to IndexModeNone
+ : InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+ opc, asm, "", pattern> {
+ let Inst{31-28} = 0b1111; // top nibble is fixed here, unlike ACI
+ let Inst{27-25} = 0b110; // same fixed bits as ACI
+}
+multiclass LdStCop<bit load, bit Dbit, string asm, list<dag> pattern> { // load -> Inst{20}, Dbit -> Inst{22}; only _OFFSET receives 'pattern'
+ def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), // P=1, W=0 form
+ asm, "\t$cop, $CRd, $addr", pattern> {
+ bits<13> addr; // split below into addr{12-9}, addr{8} and addr{7-0}
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8}; // bit 23 (labelled U in _OPTION) comes from the address operand
+ let Inst{22} = Dbit;
+ let Inst{21} = 0; // W = 0
+ let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr), // P=1, W=1 form; asm string ends in '!'
+ asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
+ bits<13> addr; // same field split as _OFFSET
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 1; // W = 1
+ let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, // P=0, W=1 form with a separate offset operand
+ postidx_imm8s4:$offset),
+ asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
+ bits<9> offset; // offset{8} goes to bit 23, offset{7-0} to the low byte
+ bits<4> addr; // only 4 bits here, placed in Inst{19-16}
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 0; // P = 0
+ let Inst{23} = offset{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 1; // W = 1
+ let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = offset{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _OPTION : ACI<(outs), // P=0, U=1, W=0 form carrying an 8-bit option
+ (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ coproc_option_imm:$option),
+ asm, "\t$cop, $CRd, $addr, $option", []> {
+ bits<8> option; // stored verbatim in Inst{7-0}
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 0; // P = 0
+ let Inst{23} = 1; // U = 1
+ let Inst{22} = Dbit;
+ let Inst{21} = 0; // W = 0
+ let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = option;
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+}
+multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> { // same four variants as LdStCop, built on ACInoP instead of ACI
+ def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), // P=1, W=0 form; the only variant given 'pattern'
+ asm, "\t$cop, $CRd, $addr", pattern> {
+ bits<13> addr; // split below into addr{12-9}, addr{8} and addr{7-0}
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8}; // bit 23 (labelled U in _OPTION) comes from the address operand
+ let Inst{22} = Dbit;
+ let Inst{21} = 0; // W = 0
+ let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr), // P=1, W=1 form; asm string ends in '!'
+ asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
+ bits<13> addr; // same field split as _OFFSET
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 1; // W = 1
+ let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, // P=0, W=1 form with a separate offset operand
+ postidx_imm8s4:$offset),
+ asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
+ bits<9> offset; // offset{8} goes to bit 23, offset{7-0} to the low byte
+ bits<4> addr; // only 4 bits here, placed in Inst{19-16}
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 0; // P = 0
+ let Inst{23} = offset{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 1; // W = 1
+ let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = offset{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _OPTION : ACInoP<(outs), // P=0, U=1, W=0 form carrying an 8-bit option
+ (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ coproc_option_imm:$option),
+ asm, "\t$cop, $CRd, $addr, $option", []> {
+ bits<8> option; // stored verbatim in Inst{7-0}
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 0; // P = 0
+ let Inst{23} = 1; // U = 1
+ let Inst{22} = Dbit;
+ let Inst{21} = 0; // W = 0
+ let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = option;
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+}
+
+defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>; // load=1, Dbit=0
+defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; // load=1, Dbit=1
+defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>; // load=1, Dbit=0; the "2" forms carry Requires<[PreV8]>
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>; // load=1, Dbit=1
+
+defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>; // load=0, Dbit=0
+defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; // load=0, Dbit=1
+defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>; // load=0, Dbit=0
+defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>; // load=0, Dbit=1
+
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register.
+//
+
+class MovRCopro<string opc, bit direction, dag oops, dag iops, // shared encoding for MCR/MRC; 'direction' lands in Inst{20}
+ list<dag> pattern>
+ : ABI<0b1110, oops, iops, NoItinerary, opc,
+ "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> {
+ let Inst{20} = direction;
+ let Inst{4} = 1; // bit 4 is fixed to 1 (CDP fixes it to 0)
+
+ bits<4> Rt;
+ bits<4> cop;
+ bits<3> opc1; // 3 bits here, versus 4 bits in CDP
+ bits<3> opc2;
+ bits<4> CRm;
+ bits<4> CRn;
+
+ let Inst{15-12} = Rt;
+ let Inst{11-8} = cop;
+ let Inst{23-21} = opc1;
+ let Inst{7-5} = opc2;
+ let Inst{3-0} = CRm;
+ let Inst{19-16} = CRn;
+}
+
+def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
+ (outs), // no results: $Rt is an input
+ (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+ c_imm:$CRm, imm0_7:$opc2),
+ [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, // matched from the int_arm_mcr intrinsic
+ imm:$CRm, imm:$opc2)]>,
+ ComplexDeprecationPredicate<"MCR">;
+def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", // five-operand spelling without $opc2...
+ (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+ c_imm:$CRm, 0, pred:$p)>; // ...which supplies 0 for $opc2
+def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
+ (outs GPRwithAPSR:$Rt), // result register class is GPRwithAPSR
+ (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+ imm0_7:$opc2), []>; // empty pattern; the ARMPat on int_arm_mrc below selects it
+def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm", // five-operand spelling without $opc2...
+ (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ c_imm:$CRm, 0, pred:$p)>; // ...which supplies 0 for $opc2
+
+def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), // int_arm_mrc intrinsic -> MRC
+ (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class MovRCopro2<string opc, bit direction, dag oops, dag iops, // shared encoding for MCR2/MRC2; 'direction' lands in Inst{20}
+ list<dag> pattern>
+ : ABXI<0b1110, oops, iops, NoItinerary,
+ !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> { // mnemonic is concatenated into one asm string
+ let Inst{31-24} = 0b11111110; // the top byte is fully fixed
+ let Inst{20} = direction;
+ let Inst{4} = 1; // bit 4 is fixed to 1
+
+ bits<4> Rt;
+ bits<4> cop;
+ bits<3> opc1;
+ bits<3> opc2;
+ bits<4> CRm;
+ bits<4> CRn;
+
+ let Inst{15-12} = Rt;
+ let Inst{11-8} = cop;
+ let Inst{23-21} = opc1;
+ let Inst{7-5} = opc2;
+ let Inst{3-0} = CRm;
+ let Inst{19-16} = CRn;
+}
+
+def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
+ (outs), // no results: $Rt is an input
+ (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+ c_imm:$CRm, imm0_7:$opc2),
+ [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, // matched from the int_arm_mcr2 intrinsic
+ imm:$CRm, imm:$opc2)]>,
+ Requires<[PreV8]>;
+def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm", // five-operand spelling; no ${p} in this alias string
+ (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+ c_imm:$CRm, 0)>; // supplies 0 for $opc2
+def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
+ (outs GPRwithAPSR:$Rt), // result register class is GPRwithAPSR
+ (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+ imm0_7:$opc2), []>, // empty pattern; the ARMV5TPat below selects it
+ Requires<[PreV8]>;
+def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm", // five-operand spelling; no ${p} in this alias string
+ (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ c_imm:$CRm, 0)>; // supplies 0 for $opc2
+
+def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, // int_arm_mrc2 intrinsic -> MRC2
+ imm:$CRm, imm:$opc2),
+ (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag> // shared encoding for MCRR/MRRC (two core registers)
+ pattern = []> // pattern defaults to empty
+ : ABI<0b1100, oops, iops, NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm",
+ pattern> {
+
+ let Inst{23-21} = 0b010; // fixed bits
+ let Inst{20} = direction;
+
+ bits<4> Rt;
+ bits<4> Rt2;
+ bits<4> cop;
+ bits<4> opc1; // 4 bits wide here, placed in Inst{7-4}
+ bits<4> CRm;
+
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = Rt2;
+ let Inst{11-8} = cop;
+ let Inst{7-4} = opc1;
+ let Inst{3-0} = CRm;
+}
+
+def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
+ (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, // both core registers are inputs, class GPRnopc
+ GPRnopc:$Rt2, c_imm:$CRm),
+ [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, // matched from the int_arm_mcrr intrinsic
+ GPRnopc:$Rt2, imm:$CRm)]>;
+def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */,
+ (outs GPRnopc:$Rt, GPRnopc:$Rt2), // both core registers are results
+ (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>; // no selection pattern
+
+class MovRRCopro2<string opc, bit direction, dag oops, dag iops, // shared encoding for MCRR2/MRRC2
+ list<dag> pattern = []> // pattern defaults to empty
+ : ABXI<0b1100, oops, iops, NoItinerary,
+ !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>, // mnemonic is concatenated into one asm string
+ Requires<[PreV8]> { // the predicate sits on the class, so both users inherit it
+ let Inst{31-28} = 0b1111; // top nibble is fixed
+ let Inst{23-21} = 0b010;
+ let Inst{20} = direction;
+
+ bits<4> Rt;
+ bits<4> Rt2;
+ bits<4> cop;
+ bits<4> opc1;
+ bits<4> CRm;
+
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = Rt2;
+ let Inst{11-8} = cop;
+ let Inst{7-4} = opc1;
+ let Inst{3-0} = CRm;
+
+ let DecoderMethod = "DecoderForMRRC2AndMCRR2"; // disassembly goes through this custom decoder
+}
+
+def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
+ (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, // both core registers are inputs
+ GPRnopc:$Rt2, c_imm:$CRm),
+ [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, // matched from the int_arm_mcrr2 intrinsic
+ GPRnopc:$Rt2, imm:$CRm)]>;
+
+def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */,
+ (outs GPRnopc:$Rt, GPRnopc:$Rt2), // both core registers are results
+ (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>; // no selection pattern
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register
+//
+
+// Move to ARM core register from Special Register
+def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary, // only operand is the destination register
+ "mrs", "\t$Rd, apsr", []> { // "apsr" is literal text in the asm string; no selection pattern
+ bits<4> Rd;
+ let Inst{23-16} = 0b00001111; // bit 22 is 0 here; MRSsys sets it
+ let Unpredictable{19-17} = 0b111;
+
+ let Inst{15-12} = Rd;
+
+ let Inst{11-0} = 0b000000000000; // low 12 bits are all zero
+ let Unpredictable{11-0} = 0b110100001111;
+}
+
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p), 0>, // "cpsr" spelling maps to the same MRS
+ Requires<[IsARM]>;
+
+// The MRSsys instruction is the MRS instruction from the ARM ARM,
+// section B9.3.9, with the R bit set to 1.
+def MRSsys : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary, // only operand is the destination register
+ "mrs", "\t$Rd, spsr", []> { // "spsr" is literal text in the asm string; no selection pattern
+ bits<4> Rd;
+ let Inst{23-16} = 0b01001111; // same as MRS except that bit 22 is 1
+ let Unpredictable{19-16} = 0b1111;
+
+ let Inst{15-12} = Rd;
+
+ let Inst{11-0} = 0b000000000000; // low 12 bits are all zero
+ let Unpredictable{11-0} = 0b110100001111;
+}
+
+// However, the MRS (banked register) system instruction (ARMv7VE) *does* have a
+// separate encoding (distinguished by bit 5).
+def MRSbanked : ABI<0b0001, (outs GPRnopc:$Rd), (ins banked_reg:$banked), // reads the register named by the 6-bit $banked operand
+ NoItinerary, "mrs", "\t$Rd, $banked", []>, // no selection pattern
+ Requires<[IsARM, HasVirtualization]> {
+ bits<6> banked; // split below: {5} -> Inst{22}, {4} -> Inst{8}, {3-0} -> Inst{19-16}
+ bits<4> Rd;
+
+ let Inst{23} = 0;
+ let Inst{22} = banked{5}; // R bit
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = banked{3-0};
+ let Inst{15-12} = Rd;
+ let Inst{11-9} = 0b001; // bit 9 is set here, whereas MRS/MRSsys define Inst{11-0} as zero
+ let Inst{8} = banked{4};
+ let Inst{7-0} = 0b00000000;
+}
+
+// Move from ARM core register to Special Register
+//
+// No need to have both system and application versions of MSR (immediate) or
+// MSR (register), the encodings are the same and the assembly parser has no way
+// to distinguish between them. The mask operand contains the special register
+// (R Bit) in bit 4 and bits 3-0 contains the mask with the fields to be
+// accessed in the special register.
+let Defs = [CPSR] in // MSR is modelled as defining CPSR
+def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary, // register form: mask plus source register
+ "msr", "\t$mask, $Rn", []> { // no selection pattern
+ bits<5> mask; // mask{4} is the R bit, mask{3-0} the field mask
+ bits<4> Rn;
+
+ let Inst{23} = 0;
+ let Inst{22} = mask{4}; // R bit
+ let Inst{21-20} = 0b10;
+ let Inst{19-16} = mask{3-0};
+ let Inst{15-12} = 0b1111;
+ let Inst{11-4} = 0b00000000;
+ let Inst{3-0} = Rn;
+}
+
+let Defs = [CPSR] in // MSRi is modelled as defining CPSR
+def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, mod_imm:$imm), NoItinerary, // immediate form: mask plus mod_imm operand
+ "msr", "\t$mask, $imm", []> { // no selection pattern
+ bits<5> mask; // mask{4} is the R bit, mask{3-0} the field mask
+ bits<12> imm;
+
+ let Inst{23} = 0;
+ let Inst{22} = mask{4}; // R bit
+ let Inst{21-20} = 0b10;
+ let Inst{19-16} = mask{3-0};
+ let Inst{15-12} = 0b1111;
+ let Inst{11-0} = imm; // the 12-bit immediate takes the place of MSR's zero bits and Rn
+}
+
+// However, the MSR (banked register) system instruction (ARMv7VE) *does* have a
+// separate encoding (distinguished by bit 5).
+def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn), // writes $Rn to the register named by $banked
+ NoItinerary, "msr", "\t$banked, $Rn", []>, // no selection pattern
+ Requires<[IsARM, HasVirtualization]> {
+ bits<6> banked; // split below: {5} -> Inst{22}, {4} -> Inst{8}, {3-0} -> Inst{19-16}
+ bits<4> Rn;
+
+ let Inst{23} = 0;
+ let Inst{22} = banked{5}; // R bit
+ let Inst{21-20} = 0b10;
+ let Inst{19-16} = banked{3-0};
+ let Inst{15-12} = 0b1111;
+ let Inst{11-9} = 0b001; // bit 9 is set here, whereas MSR defines Inst{11-4} as zero
+ let Inst{8} = banked{4};
+ let Inst{7-4} = 0b0000;
+ let Inst{3-0} = Rn;
+}
+
+// Dynamic stack allocation yields a _chkstk for Windows targets. These calls
+// are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having a separate instruction is to capture extra
+// unmodelled effects (compared to ordinary calls), like the stack pointer change.
+
+def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, // DAG node for ARMISD::WIN__CHKSTK
+ [SDNPHasChain, SDNPSideEffect]>;
+let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in // reads R4; defines R4 and SP
+ def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; // operand-less pseudo selected from win__chkstk
+
+def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, // DAG node for ARMISD::WIN__DBZCHK
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; // additionally produces glue
+let usesCustomInserter = 1, Defs = [CPSR] in // defines CPSR
+ def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary, // single tGPR input
+ [(win__dbzchk tGPR:$divisor)]>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right,
+// complete with fixup for the aeabi_read_tp function.
+// TPsoft is valid for ARM mode only, in case of Thumb mode a tTPsoft pattern
+// is defined in "ARMInstrThumb.td".
+let isCall = 1, // modelled as a call
+ Defs = [R0, R12, LR, CPSR], Uses = [SP] in { // defines R0, R12, LR and CPSR; reads SP
+ def TPsoft : ARMPseudoInst<(outs), (ins), 4, IIC_Br,
+ [(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>; // ARMthread_pointer is selected with its result in R0
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+// eh_sjlj_setjmp() is an instruction sequence to store the return
+// address and save #0 in R0 for the non-longjmp case.
+// Since by its nature we may be coming from some other function to get
+// here, and we're using the stack frame for the containing function to
+// save/restore registers, we can't keep anything live in regs across
+// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+// when we get here from a longjmp(). We force everything out of registers
+// except for our own input by listing the relevant registers in Defs. By
+// doing so, we also cause the prologue/epilogue code to actively preserve
+// all of the callee-saved registers, which is exactly what we want.
+// A constant value is passed in $val, and we use the location as a scratch.
+//
+// These are pseudo-instructions and are lowered to individual MC-insts, so
+// no encoding information is necessary.
+let Defs = // variant for VFP targets: Q0-Q15 are listed as defined too
+ [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR,
+ Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 ],
+ hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+ def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+ NoItinerary,
+ [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, // result of the setjmp node is placed in R0
+ Requires<[IsARM, HasVFP2]>; // chosen when HasVFP2 holds; the _nofp twin requires NoVFP
+}
+
+let Defs = // variant without VFP: only core registers, LR and CPSR are listed
+ [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ],
+ hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+ def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+ NoItinerary,
+ [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, // same pattern as Int_eh_sjlj_setjmp
+ Requires<[IsARM, NoVFP]>; // chosen when NoVFP holds
+}
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, // a terminator, unlike the setjmp pseudos
+ Defs = [ R7, LR, SP ] in { // defines R7, LR and SP
+def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
+ NoItinerary,
+ [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, // selected from the ARMeh_sjlj_longjmp node
+ Requires<[IsARM]>;
+}
+
+let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in // expanded by a custom inserter
+def Int_eh_sjlj_setup_dispatch : PseudoInst<(outs), (ins), NoItinerary, // operand-less pseudo
+ [(ARMeh_sjlj_setup_dispatch)]>; // selected from the ARMeh_sjlj_setup_dispatch node
+
+// eh.sjlj.dispatchsetup pseudo-instruction.
+// This pseudo is used for both ARM and Thumb. Any differences are handled when
+// the pseudo is expanded (which happens before any passes that need the
+// instruction size).
+let isBarrier = 1 in
+def Int_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>; // operand-less pseudo with no selection pattern
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// ARMv4 indirect branch using (MOVr PC, dst)
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in // flagged as an indirect, terminating branch
+ def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst),
+ 4, IIC_Br, [(brind GPR:$dst)], // selected from brind
+ (MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>, // expands to a MOVr whose destination is PC
+ Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
+
+// Large immediate handling.
+
+// 32-bit immediate using two piece mod_imms or movw + movt.
+// This is a single pseudo instruction, the benefit is that it can be remat'd
+// as a single unit instead of having to handle reg inputs.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in // rematerializable and flagged as a move-immediate
+def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+ [(set GPR:$dst, (arm_i32imm:$src))]>, // matches immediates accepted by arm_i32imm
+ Requires<[IsARM]>;
+
+def LDRLIT_ga_abs : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iLoad_i,
+ [(set GPR:$dst, (ARMWrapper tglobaladdr:$src))]>, // global address wrapped in ARMWrapper
+ Requires<[IsARM, DontUseMovt]>; // chosen when DontUseMovt holds (the UseMovt case maps to MOVi32imm elsewhere in this file)
+
+// Pseudo instruction that combines movw + movt + add pc (if PIC).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly handle the instructions.
+let isReMaterializable = 1 in { // the four pseudos in this scope are rematerializable
+def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+ IIC_iMOVix2addpc,
+ [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, // PIC-wrapped global address
+ Requires<[IsARM, UseMovt]>; // UseMovt variant
+
+def LDRLIT_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+ IIC_iLoadiALU,
+ [(set GPR:$dst,
+ (ARMWrapperPIC tglobaladdr:$addr))]>, // same pattern as MOV_ga_pcrel
+ Requires<[IsARM, DontUseMovt]>; // DontUseMovt variant
+
+let AddedComplexity = 10 in // the load-folding form gets extra pattern complexity
+def LDRLIT_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+ NoItinerary,
+ [(set GPR:$dst,
+ (load (ARMWrapperPIC tglobaladdr:$addr)))]>, // a load from the PIC-wrapped address
+ Requires<[IsARM, DontUseMovt]>;
+
+let AddedComplexity = 10 in // the load-folding form gets extra pattern complexity
+def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+ IIC_iMOVix2ld,
+ [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>, // a load from the PIC-wrapped address
+ Requires<[IsARM, UseMovt]>;
+} // isReMaterializable
+
+// The many different faces of TLS access.
+def : ARMPat<(ARMWrapper tglobaltlsaddr :$dst), // absolute TLS address, UseMovt -> MOVi32imm
+ (MOVi32imm tglobaltlsaddr :$dst)>,
+ Requires<[IsARM, UseMovt]>;
+
+def : Pat<(ARMWrapper tglobaltlsaddr:$src), // absolute TLS address, DontUseMovt -> LDRLIT_ga_abs
+ (LDRLIT_ga_abs tglobaltlsaddr:$src)>,
+ Requires<[IsARM, DontUseMovt]>;
+
+def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), // PIC TLS address, UseMovt -> MOV_ga_pcrel
+ (MOV_ga_pcrel tglobaltlsaddr:$addr)>, Requires<[IsARM, UseMovt]>;
+
+def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), // PIC TLS address, DontUseMovt -> LDRLIT_ga_pcrel
+ (LDRLIT_ga_pcrel tglobaltlsaddr:$addr)>,
+ Requires<[IsARM, DontUseMovt]>;
+let AddedComplexity = 10 in
+def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)), // load of a PIC TLS address, UseMovt -> MOV_ga_pcrel_ldr
+ (MOV_ga_pcrel_ldr tglobaltlsaddr:$addr)>,
+ Requires<[IsARM, UseMovt]>;
+
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; // constant-pool address -> LEApcrel
+def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, // global address -> MOVi32imm (UseMovt)
+ Requires<[IsARM, UseMovt]>;
+def : ARMPat<(ARMWrapper texternalsym :$dst), (MOVi32imm texternalsym :$dst)>, // external symbol -> MOVi32imm (UseMovt)
+ Requires<[IsARM, UseMovt]>;
+def : ARMPat<(ARMWrapperJT tjumptable:$dst), // jump-table address -> LEApcrelJT
+ (LEApcrelJT tjumptable:$dst)>;
+
+// TODO: add,sub,and, 3-instr forms?
+
+// Tail calls. These patterns also apply to Thumb mode.
+def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>; // register target -> TCRETURNri
+def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>; // global-address target -> TCRETURNdi
+def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>; // external-symbol target -> TCRETURNdi
+
+// Direct calls
+def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>; // call to an external symbol -> BL
+def : ARMPat<(ARMcall_nolink texternalsym:$func), // no-link call to an external symbol -> BMOVPCB_CALL
+ (BMOVPCB_CALL texternalsym:$func)>;
+
+// zextload i1 -> zextload i8
+def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; // i1 zero-extending load -> byte load (imm12 addressing)
+def : ARMPat<(zextloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; // same, ldst_so_reg addressing
+
+// extload -> zextload
+def : ARMPat<(extloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; // i1 any-extending load -> LDRBi12
+def : ARMPat<(extloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; // i1 any-extending load -> LDRBrs
+def : ARMPat<(extloadi8 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; // i8 any-extending load -> LDRBi12
+def : ARMPat<(extloadi8 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; // i8 any-extending load -> LDRBrs
+
+def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; // i16 any-extending load -> LDRH
+
+def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>; // pc-relative i8 load -> PICLDRB
+def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>; // pc-relative i16 load -> PICLDRH
+
+// smul* and smla*
+def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), // both operands match sext_16_node -> SMULBB
+ (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))), // second operand is $b shifted right arithmetically by 16 -> SMULBT
+ (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b), // first operand is $a shifted right arithmetically by 16 -> SMULTB
+ (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5MOPat<(add GPR:$acc, // accumulate forms: add of $acc and the matching multiply
+ (mul sext_16_node:$a, sext_16_node:$b)),
+ (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; // the accumulator becomes the last operand
+def : ARMV5MOPat<(add GPR:$acc,
+ (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
+ (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+ (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
+ (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+
+// Pre-v7 uses MCR for synchronization barriers.
+def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>, // MCR with cop=15, opc1=0, CRn=7, CRm=10, opc2=5
+ Requires<[IsARM, HasV6]>;
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in { // the five mask patterns in this scope get 16 extra complexity
+def : ARMV6Pat<(and GPR:$Src, 0x000000FF), (UXTB GPR:$Src, 0)>; // mask with 0xFF -> UXTB, rotate operand 0
+def : ARMV6Pat<(and GPR:$Src, 0x0000FFFF), (UXTH GPR:$Src, 0)>; // mask with 0xFFFF -> UXTH
+def : ARMV6Pat<(and GPR:$Src, 0x00FF00FF), (UXTB16 GPR:$Src, 0)>; // mask with 0x00FF00FF -> UXTB16
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0x00FF)), // add of a 0xFF-masked value -> UXTAB
+ (UXTAB GPR:$Rn, GPR:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0xFFFF)), // add of a 0xFFFF-masked value -> UXTAH
+ (UXTAH GPR:$Rn, GPR:$Rm, 0)>;
+}
+
+def : ARMV6Pat<(sext_inreg GPR:$Src, i8), (SXTB GPR:$Src, 0)>; // in-register sign extension from i8 -> SXTB, rotate operand 0
+def : ARMV6Pat<(sext_inreg GPR:$Src, i16), (SXTH GPR:$Src, 0)>; // in-register sign extension from i16 -> SXTH
+
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i8)), // add of an i8 sign-extended value -> SXTAB
+ (SXTAB GPR:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i16)), // add of an i16 sign-extended value -> SXTAH
+ (SXTAH GPR:$Rn, GPRnopc:$Rm, 0)>;
+
+// Atomic load/store patterns
+def : ARMPat<(atomic_load_8 ldst_so_reg:$src),
+ (LDRBrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_8 addrmode_imm12:$src),
+ (LDRBi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_load_16 addrmode3:$src),
+ (LDRH addrmode3:$src)>;
+def : ARMPat<(atomic_load_32 ldst_so_reg:$src),
+ (LDRrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_32 addrmode_imm12:$src),
+ (LDRi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_store_8 ldst_so_reg:$ptr, GPR:$val),
+ (STRBrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_8 addrmode_imm12:$ptr, GPR:$val),
+ (STRBi12 GPR:$val, addrmode_imm12:$ptr)>;
+def : ARMPat<(atomic_store_16 addrmode3:$ptr, GPR:$val),
+ (STRH GPR:$val, addrmode3:$ptr)>;
+def : ARMPat<(atomic_store_32 ldst_so_reg:$ptr, GPR:$val),
+ (STRrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_32 addrmode_imm12:$ptr, GPR:$val),
+ (STRi12 GPR:$val, addrmode_imm12:$ptr)>;
+
+
+//===----------------------------------------------------------------------===//
+// Thumb Support
+//
+
+include "ARMInstrThumb.td"
+
+//===----------------------------------------------------------------------===//
+// Thumb2 Support
+//
+
+include "ARMInstrThumb2.td"
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//
+
+include "ARMInstrVFP.td"
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON) Support
+//
+
+include "ARMInstrNEON.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Memory barriers
+def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>;
+
+// System instructions
+def : MnemonicAlias<"swi", "svc">;
+
+// Load / Store Multiple
+def : MnemonicAlias<"ldmfd", "ldm">;
+def : MnemonicAlias<"ldmia", "ldm">;
+def : MnemonicAlias<"ldmea", "ldmdb">;
+def : MnemonicAlias<"stmfd", "stmdb">;
+def : MnemonicAlias<"stmia", "stm">;
+def : MnemonicAlias<"stmea", "stm">;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the
+// input operands swapped when the shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+ (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p), 0>,
+ Requires<[IsARM, HasV6]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+ (PKHBT GPRnopc:$Rd, GPRnopc:$Rm, GPRnopc:$Rn, 0, pred:$p), 0>,
+ Requires<[IsARM, HasV6]>;
+
+// PUSH/POP aliases for STM/LDM
+def : ARMInstAlias<"push${p} $regs", (STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : ARMInstAlias<"pop${p} $regs", (LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// SSAT/USAT optional shift operand.
+def : ARMInstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+ (SSAT GPRnopc:$Rd, imm1_32:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+def : ARMInstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+ (USAT GPRnopc:$Rd, imm0_31:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+
+
+// Extend instruction optional rotate operand.
+def : ARMInstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+ (SXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+ (SXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+ (SXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb${p} $Rd, $Rm",
+ (SXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb16${p} $Rd, $Rm",
+ (SXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxth${p} $Rd, $Rm",
+ (SXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+def : ARMInstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+ (UXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+ (UXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+ (UXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb${p} $Rd, $Rm",
+ (UXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb16${p} $Rd, $Rm",
+ (UXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxth${p} $Rd, $Rm",
+ (UXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+
+// RFE aliases
+def : MnemonicAlias<"rfefa", "rfeda">;
+def : MnemonicAlias<"rfeea", "rfedb">;
+def : MnemonicAlias<"rfefd", "rfeia">;
+def : MnemonicAlias<"rfeed", "rfeib">;
+def : MnemonicAlias<"rfe", "rfeia">;
+
+// SRS aliases
+def : MnemonicAlias<"srsfa", "srsib">;
+def : MnemonicAlias<"srsea", "srsia">;
+def : MnemonicAlias<"srsfd", "srsdb">;
+def : MnemonicAlias<"srsed", "srsda">;
+def : MnemonicAlias<"srs", "srsia">;
+
+// QSAX == QSUBADDX
+def : MnemonicAlias<"qsubaddx", "qsax">;
+// SASX == SADDSUBX
+def : MnemonicAlias<"saddsubx", "sasx">;
+// SHASX == SHADDSUBX
+def : MnemonicAlias<"shaddsubx", "shasx">;
+// SHSAX == SHSUBADDX
+def : MnemonicAlias<"shsubaddx", "shsax">;
+// SSAX == SSUBADDX
+def : MnemonicAlias<"ssubaddx", "ssax">;
+// UASX == UADDSUBX
+def : MnemonicAlias<"uaddsubx", "uasx">;
+// UHASX == UHADDSUBX
+def : MnemonicAlias<"uhaddsubx", "uhasx">;
+// UHSAX == UHSUBADDX
+def : MnemonicAlias<"uhsubaddx", "uhsax">;
+// UQASX == UQADDSUBX
+def : MnemonicAlias<"uqaddsubx", "uqasx">;
+// UQSAX == UQSUBADDX
+def : MnemonicAlias<"uqsubaddx", "uqsax">;
+// USAX == USUBADDX
+def : MnemonicAlias<"usubaddx", "usax">;
+
+// "mov Rd, mod_imm_not" can be handled via "mvn" in assembly, just like
+// for isel.
+def : ARMInstAlias<"mov${s}${p} $Rd, $imm",
+ (MVNi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"mvn${s}${p} $Rd, $imm",
+ (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+// Same for AND <--> BIC
+def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+ (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"bic${s}${p} $Rdn, $imm",
+ (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm",
+ (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"and${s}${p} $Rdn, $imm",
+ (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+
+// Likewise, "add Rd, mod_imm_neg" -> sub
+def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm",
+ (SUBri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"add${s}${p} $Rd, $imm",
+ (SUBri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+// Same for CMP <--> CMN via mod_imm_neg
+def : ARMInstAlias<"cmp${p} $Rd, $imm",
+ (CMNri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
+def : ARMInstAlias<"cmn${p} $Rd, $imm",
+ (CMPri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
+
+// The shifter forms of the MOV instruction are aliased to the ASR, LSL,
+// LSR, ROR, and RRX instructions.
+// FIXME: We need C++ parser hooks to map the alias to the MOV
+// encoding. It seems we should be able to do that sort of thing
+// in tblgen, but it could get ugly.
+let TwoOperandAliasConstraint = "$Rm = $Rd" in {
+def ASRi : ARMAsmPseudo<"asr${s}${p} $Rd, $Rm, $imm",
+ (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
+ cc_out:$s)>;
+def LSRi : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rm, $imm",
+ (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
+ cc_out:$s)>;
+def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm",
+ (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
+ cc_out:$s)>;
+def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm",
+ (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
+ cc_out:$s)>;
+}
+def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm",
+ (ins GPR:$Rd, GPR:$Rm, pred:$p, cc_out:$s)>;
+let TwoOperandAliasConstraint = "$Rn = $Rd" in {
+def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm",
+ (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+ cc_out:$s)>;
+def LSRr : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rn, $Rm",
+ (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+ cc_out:$s)>;
+def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm",
+ (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+ cc_out:$s)>;
+def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm",
+ (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+ cc_out:$s)>;
+}
+
+// "neg" is an alias for "rsb rd, rm, #0"
+def : ARMInstAlias<"neg${s}${p} $Rd, $Rm",
+ (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>;
+
+// Pre-v6, 'mov r0, r0' was used as a NOP encoding.
+def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>,
+ Requires<[IsARM, NoV6]>;
+
+// MUL/MLA/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but
+// the instruction definitions need different constraints pre-v6.
+// Use these aliases for the assembly parsing on pre-v6.
+def : InstAlias<"mul${s}${p} $Rd, $Rn, $Rm",
+ (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s), 0>,
+ Requires<[IsARM, NoV6]>;
+def : InstAlias<"mla${s}${p} $Rd, $Rn, $Rm, $Ra",
+ (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
+ pred:$p, cc_out:$s), 0>,
+ Requires<[IsARM, NoV6]>;
+def : InstAlias<"smlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+ (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
+ Requires<[IsARM, NoV6]>;
+def : InstAlias<"umlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+ (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
+ Requires<[IsARM, NoV6]>;
+def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+ (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
+ Requires<[IsARM, NoV6]>;
+def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+ (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
+ Requires<[IsARM, NoV6]>;
+
+// 'it' blocks in ARM mode just validate the predicates. The IT itself
+// is discarded.
+def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
+ ComplexDeprecationPredicate<"IT">;
+
+let mayLoad = 1, mayStore =1, hasSideEffects = 1 in
+def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
+ NoItinerary,
+ [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// The fast register allocator used during -O0 inserts spills to cover any VRegs
+// live across basic block boundaries. When this happens between an LDREX and a
+// STREX it can clear the exclusive monitor, causing all cmpxchg attempts to
+// fail.
+
+// Unfortunately, this means we have to have an alternative (expanded
+// post-regalloc) path for -O0 compilations. Fortunately this path can be
+// significantly more naive than the standard expansion: we conservatively
+// assume seq_cst, strong cmpxchg and omit clrex on failure.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+ mayLoad = 1, mayStore = 1 in {
+def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status),
+ (ins GPR:$addr, GPRPair:$desired, GPRPair:$new),
+ NoItinerary, []>, Sched<[]>;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
new file mode 100644
index 000000000000..b5fa8e999e2a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -0,0 +1,8191 @@
+//===-- ARMInstrNEON.td - NEON support for ARM -------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM NEON instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// NEON-specific Operands.
+//===----------------------------------------------------------------------===//
+def nModImm : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+}
+
+def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; }
+def nImmSplatI8 : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmSplatI8AsmOperand;
+}
+def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; }
+def nImmSplatI16 : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmSplatI16AsmOperand;
+}
+def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; }
+def nImmSplatI32 : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmSplatI32AsmOperand;
+}
+def nImmSplatNotI16AsmOperand : AsmOperandClass { let Name = "NEONi16splatNot"; }
+def nImmSplatNotI16 : Operand<i32> {
+ let ParserMatchClass = nImmSplatNotI16AsmOperand;
+}
+def nImmSplatNotI32AsmOperand : AsmOperandClass { let Name = "NEONi32splatNot"; }
+def nImmSplatNotI32 : Operand<i32> {
+ let ParserMatchClass = nImmSplatNotI32AsmOperand;
+}
+def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; }
+def nImmVMOVI32 : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmVMOVI32AsmOperand;
+}
+
+def nImmVMOVI16AsmOperandByteReplicate :
+ AsmOperandClass {
+ let Name = "NEONi16vmovByteReplicate";
+ let PredicateMethod = "isNEONi16ByteReplicate";
+ let RenderMethod = "addNEONvmovByteReplicateOperands";
+}
+def nImmVMOVI32AsmOperandByteReplicate :
+ AsmOperandClass {
+ let Name = "NEONi32vmovByteReplicate";
+ let PredicateMethod = "isNEONi32ByteReplicate";
+ let RenderMethod = "addNEONvmovByteReplicateOperands";
+}
+def nImmVMVNI16AsmOperandByteReplicate :
+ AsmOperandClass {
+ let Name = "NEONi16invByteReplicate";
+ let PredicateMethod = "isNEONi16ByteReplicate";
+ let RenderMethod = "addNEONinvByteReplicateOperands";
+}
+def nImmVMVNI32AsmOperandByteReplicate :
+ AsmOperandClass {
+ let Name = "NEONi32invByteReplicate";
+ let PredicateMethod = "isNEONi32ByteReplicate";
+ let RenderMethod = "addNEONinvByteReplicateOperands";
+}
+
+def nImmVMOVI16ByteReplicate : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmVMOVI16AsmOperandByteReplicate;
+}
+def nImmVMOVI32ByteReplicate : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmVMOVI32AsmOperandByteReplicate;
+}
+def nImmVMVNI16ByteReplicate : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmVMVNI16AsmOperandByteReplicate;
+}
+def nImmVMVNI32ByteReplicate : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmVMVNI32AsmOperandByteReplicate;
+}
+
+def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
+def nImmVMOVI32Neg : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmVMOVI32NegAsmOperand;
+}
+def nImmVMOVF32 : Operand<i32> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
+}
+def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; }
+def nImmSplatI64 : Operand<i32> {
+ let PrintMethod = "printNEONModImmOperand";
+ let ParserMatchClass = nImmSplatI64AsmOperand;
+}
+
+def VectorIndex8Operand : AsmOperandClass { let Name = "VectorIndex8"; }
+def VectorIndex16Operand : AsmOperandClass { let Name = "VectorIndex16"; }
+def VectorIndex32Operand : AsmOperandClass { let Name = "VectorIndex32"; }
+def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint64_t)Imm) < 8;
+}]> {
+ let ParserMatchClass = VectorIndex8Operand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint64_t)Imm) < 4;
+}]> {
+ let ParserMatchClass = VectorIndex16Operand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint64_t)Imm) < 2;
+}]> {
+ let ParserMatchClass = VectorIndex32Operand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i32imm);
+}
+
+// Register list of one D register.
+def VecListOneDAsmOperand : AsmOperandClass {
+ let Name = "VecListOneD";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListOneD : RegisterOperand<DPR, "printVectorListOne"> {
+ let ParserMatchClass = VecListOneDAsmOperand;
+}
+// Register list of two sequential D registers.
+def VecListDPairAsmOperand : AsmOperandClass {
+ let Name = "VecListDPair";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListDPair : RegisterOperand<DPair, "printVectorListTwo"> {
+ let ParserMatchClass = VecListDPairAsmOperand;
+}
+// Register list of three sequential D registers.
+def VecListThreeDAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeD";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListThreeD : RegisterOperand<DPR, "printVectorListThree"> {
+ let ParserMatchClass = VecListThreeDAsmOperand;
+}
+// Register list of four sequential D registers.
+def VecListFourDAsmOperand : AsmOperandClass {
+ let Name = "VecListFourD";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListFourD : RegisterOperand<DPR, "printVectorListFour"> {
+ let ParserMatchClass = VecListFourDAsmOperand;
+}
+// Register list of two D registers spaced by 2 (two sequential Q registers).
+def VecListDPairSpacedAsmOperand : AsmOperandClass {
+ let Name = "VecListDPairSpaced";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListDPairSpaced : RegisterOperand<DPair, "printVectorListTwoSpaced"> {
+ let ParserMatchClass = VecListDPairSpacedAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three Q registers).
+def VecListThreeQAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeQ";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQ : RegisterOperand<DPR, "printVectorListThreeSpaced"> {
+ let ParserMatchClass = VecListThreeQAsmOperand;
+}
+// Register list of four D registers spaced by 2 (four Q registers).
+def VecListFourQAsmOperand : AsmOperandClass {
+ let Name = "VecListFourQ";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListFourQ : RegisterOperand<DPR, "printVectorListFourSpaced"> {
+ let ParserMatchClass = VecListFourQAsmOperand;
+}
+
+// Register list of one D register, with "all lanes" subscripting.
+def VecListOneDAllLanesAsmOperand : AsmOperandClass {
+ let Name = "VecListOneDAllLanes";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListOneDAllLanes : RegisterOperand<DPR, "printVectorListOneAllLanes"> {
+ let ParserMatchClass = VecListOneDAllLanesAsmOperand;
+}
+// Register list of two D registers, with "all lanes" subscripting.
+def VecListDPairAllLanesAsmOperand : AsmOperandClass {
+ let Name = "VecListDPairAllLanes";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListDPairAllLanes : RegisterOperand<DPair,
+ "printVectorListTwoAllLanes"> {
+ let ParserMatchClass = VecListDPairAllLanesAsmOperand;
+}
+// Register list of two D registers spaced by 2 (two sequential Q registers).
+def VecListDPairSpacedAllLanesAsmOperand : AsmOperandClass {
+ let Name = "VecListDPairSpacedAllLanes";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListDPairSpacedAllLanes : RegisterOperand<DPair,
+ "printVectorListTwoSpacedAllLanes"> {
+ let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand;
+}
+// Register list of three D registers, with "all lanes" subscripting.
+def VecListThreeDAllLanesAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeDAllLanes";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListThreeDAllLanes : RegisterOperand<DPR,
+ "printVectorListThreeAllLanes"> {
+ let ParserMatchClass = VecListThreeDAllLanesAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three sequential Q regs).
+def VecListThreeQAllLanesAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeQAllLanes";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQAllLanes : RegisterOperand<DPR,
+ "printVectorListThreeSpacedAllLanes"> {
+ let ParserMatchClass = VecListThreeQAllLanesAsmOperand;
+}
+// Register list of four D registers, with "all lanes" subscripting.
+def VecListFourDAllLanesAsmOperand : AsmOperandClass {
+ let Name = "VecListFourDAllLanes";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListFourDAllLanes : RegisterOperand<DPR, "printVectorListFourAllLanes"> {
+ let ParserMatchClass = VecListFourDAllLanesAsmOperand;
+}
+// Register list of four D registers spaced by 2 (four sequential Q regs).
+def VecListFourQAllLanesAsmOperand : AsmOperandClass {
+ let Name = "VecListFourQAllLanes";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListFourQAllLanes : RegisterOperand<DPR,
+ "printVectorListFourSpacedAllLanes"> {
+ let ParserMatchClass = VecListFourQAllLanesAsmOperand;
+}
+
+
+// Register list of one D register, with byte lane subscripting.
+def VecListOneDByteIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListOneDByteIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDByteIndexed : Operand<i32> {
+ let ParserMatchClass = VecListOneDByteIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListOneDHWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListOneDHWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDHWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListOneDHWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListOneDWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListOneDWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListOneDWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+// Register list of two D registers with byte lane subscripting.
+def VecListTwoDByteIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListTwoDByteIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDByteIndexed : Operand<i32> {
+ let ParserMatchClass = VecListTwoDByteIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListTwoDHWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListTwoDHWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDHWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListTwoDHWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListTwoDWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListTwoDWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListTwoDWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of two Q registers with half-word lane subscripting.
+def VecListTwoQHWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListTwoQHWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoQHWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListTwoQHWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListTwoQWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListTwoQWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoQWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListTwoQWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+
+// Register list of three D registers with byte lane subscripting.
+def VecListThreeDByteIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeDByteIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDByteIndexed : Operand<i32> {
+ let ParserMatchClass = VecListThreeDByteIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListThreeDHWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeDHWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDHWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListThreeDHWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListThreeDWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeDWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListThreeDWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of three Q registers with half-word lane subscripting.
+def VecListThreeQHWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeQHWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeQHWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListThreeQHWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListThreeQWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeQWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeQWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListThreeQWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+// Register list of four D registers with byte lane subscripting.
+def VecListFourDByteIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListFourDByteIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDByteIndexed : Operand<i32> {
+ let ParserMatchClass = VecListFourDByteIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListFourDHWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListFourDHWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDHWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListFourDHWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListFourDWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListFourDWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListFourDWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of four Q registers with half-word lane subscripting.
+def VecListFourQHWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListFourQHWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourQHWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListFourQHWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListFourQWordIndexAsmOperand : AsmOperandClass {
+ let Name = "VecListFourQWordIndexed";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourQWordIndexed : Operand<i32> {
+ let ParserMatchClass = VecListFourQWordIndexAsmOperand;
+ let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 8;
+}]>;
+def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() == 4;
+}]>;
+def word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() == 4;
+}]>;
+def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() == 2;
+}]>;
+def hword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() == 2;
+}]>;
+def byte_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() == 1;
+}]>;
+def byte_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() == 1;
+}]>;
+def non_word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// NEON-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+def SDTARMVCMPZ : SDTypeProfile<1, 1, []>;
+
+def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
+def NEONvceqz : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>;
+def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
+def NEONvcgez : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>;
+def NEONvclez : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>;
+def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
+def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
+def NEONvcgtz : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>;
+def NEONvcltz : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
+def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
+def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+
+// Types for vector shift by immediates. The "SHX" version is for long and
+// narrow operations where the source and destination vectors have different
+// types. The "SHINS" version is for shift and insert operations.
+def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>; // result and operand 1 share one integer type; operand 2 is i32
+def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>; // result and operand 1 are integer but not tied together; operand 2 is i32
+def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; // result and operands 1-2 share one integer type; operand 3 is i32
+
+def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>;
+def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
+def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
+def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>; // the "n"-suffixed nodes in this group use the SHX profile
+
+def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
+def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>;
+def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>;
+
+def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>;
+def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>;
+def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>;
+def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>;
+def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>;
+def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>;
+
+def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>;
+def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>;
+def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>;
+
+def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>; // the two shift-and-insert nodes use the SHINS profile
+def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
+
+def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>; // i32 result; operand 1 is integer; operand 2 is i32
+def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
+def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
+
+def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; // vector result from a single i32 operand
+def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
+def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
+def NEONvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;
+
+def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>; // vector result with the same type as operand 1; operand 2 is i32
+def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
+def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
+
+def NEONvbsl : SDNode<"ARMISD::VBSL", // profile is declared inline instead of through a named SDT record
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>>; // vector result; all three operands have the result type
+
+def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; // vector result; the single operand is left unconstrained
+
+// VDUPLANE can produce a quad-register result from a double-register source,
+// so the result is not constrained to match the source.
+def NEONvduplane : SDNode<"ARMISD::VDUPLANE",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisVT<2, i32>]>>; // result and operand 1 are both vectors (not tied); operand 2 is i32
+
+def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; // vector result; operands 1-2 have the result type; operand 3 is i32
+def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>;
+
+def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; // vector result and one operand of the same type
+def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
+def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
+def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
+
+def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, // two results and two operands...
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>; // ...all four values constrained to one vector type
+def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
+def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
+def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
+
+def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>; // integer result; the two integer operands share a type that is not tied to the result
+def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
+def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+
+def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ // leaf: a NEONvmovImm whose immediate passes the check below
+ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); // operand 0 is the immediate
+ unsigned EltBits = 0;
+ uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); // EltBits is tested below, so presumably filled in by the decoder
+ return (EltBits == 32 && EltVal == 0); // accept only a 32-bit element size with element value 0
+}]>;
+
+def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{ // leaf: a NEONvmovImm whose immediate passes the check below
+ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); // operand 0 is the immediate
+ unsigned EltBits = 0;
+ uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); // EltBits is tested below, so presumably filled in by the decoder
+ return (EltBits == 8 && EltVal == 0xff); // accept only an 8-bit element size with element value 0xff
+}]>;
+
+//===----------------------------------------------------------------------===//
+// NEON load / store instructions
+//===----------------------------------------------------------------------===//
+
+// Use VLDM to load a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VLDMD after reg alloc.
+def VLDMQIA
+ : PseudoVFPLdStM<(outs DPair:$dst), (ins GPR:$Rn),
+ IIC_fpLoad_m, "", // empty string argument: nothing supplied in that slot for the pseudo
+ [(set DPair:$dst, (v2f64 (word_alignedload GPR:$Rn)))]>; // selected for a v2f64 word_alignedload from $Rn
+
+// Use VSTM to store a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VSTMD after reg alloc.
+def VSTMQIA
+ : PseudoVFPLdStM<(outs), (ins DPair:$src, GPR:$Rn), // no outputs: value and address are both inputs
+ IIC_fpStore_m, "",
+ [(word_alignedstore (v2f64 DPair:$src), GPR:$Rn)]>; // selected for a v2f64 word_alignedstore of $src to $Rn
+
+// Classes for VLD* pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VLDQPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), itin, "">; // plain load into one QPR, no constraint string
+class VLDQWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset), itin, // writeback form taking an am6offset operand
+ "$addr.addr = $wb">; // constraint string tying $wb to the address register
+class VLDQWBfixedPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr), itin, // writeback form with no offset operand
+ "$addr.addr = $wb">;
+class VLDQWBregisterPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, rGPR:$offset), itin, // writeback form whose offset is an rGPR register
+ "$addr.addr = $wb">;
+
+class VLDQQPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), itin, "">; // QQPR counterpart of VLDQPseudo
+class VLDQQWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset), itin, // writeback form taking an am6offset operand
+ "$addr.addr = $wb">; // constraint string tying $wb to the address register
+class VLDQQWBfixedPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr), itin, // writeback form with no offset operand
+ "$addr.addr = $wb">;
+class VLDQQWBregisterPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, rGPR:$offset), itin, // writeback form whose offset is an rGPR register
+ "$addr.addr = $wb">;
+
+
+class VLDQQQQPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin, // unlike the Q/QQ classes, takes the destination tuple as an input too
+ "$src = $dst">; // constraint string tying input $src to output $dst
+class VLDQQQQWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
+ "$addr.addr = $wb, $src = $dst">; // both ties: writeback register and source/destination tuple
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+
+// VLD1 : Vector Load (multiple single elements)
+class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode>
+ : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd), // one-D-register list form
+ (ins AddrMode:$Rn), IIC_VLD1,
+ "vld1", Dt, "$Vd, $Rn", "", []> { // no constraint string and no selection pattern
+ let Rm = 0b1111; // fixed Rm value used by the forms in this file that take no offset/writeback operand
+ let Inst{4} = Rn{4}; // encoding bit 4 is taken from bit 4 of the $Rn operand value
+ let DecoderMethod = "DecodeVLDST1Instruction";
+}
+class VLD1Q<bits<4> op7_4, string Dt, Operand AddrMode>
+ : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd), // D-register-pair list form
+ (ins AddrMode:$Rn), IIC_VLD1x2,
+ "vld1", Dt, "$Vd, $Rn", "", []> { // no constraint string and no selection pattern
+ let Rm = 0b1111; // same fixed Rm value as VLD1D
+ let Inst{5-4} = Rn{5-4}; // two encoding bits come from the $Rn operand value here, versus one in VLD1D
+ let DecoderMethod = "DecodeVLDST1Instruction";
+}
+
+def VLD1d8 : VLD1D<{0,0,0,?}, "8", addrmode6align64>; // first two op7_4 bits step 00,01,10,11 alongside Dt "8","16","32","64"
+def VLD1d16 : VLD1D<{0,1,0,?}, "16", addrmode6align64>;
+def VLD1d32 : VLD1D<{1,0,0,?}, "32", addrmode6align64>;
+def VLD1d64 : VLD1D<{1,1,0,?}, "64", addrmode6align64>;
+
+def VLD1q8 : VLD1Q<{0,0,?,?}, "8", addrmode6align64or128>; // pair forms leave two op7_4 bits as '?', matching Inst{5-4} in VLD1Q
+def VLD1q16 : VLD1Q<{0,1,?,?}, "16", addrmode6align64or128>;
+def VLD1q32 : VLD1Q<{1,0,?,?}, "32", addrmode6align64or128>;
+def VLD1q64 : VLD1Q<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with address register writeback:
+multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> { // emits a _fixed and a _register record per defm
+ def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn), IIC_VLD1u,
+ "vld1", Dt, "$Vd, $Rn!", // "!" assembly form with no offset operand
+ "$Rn.addr = $wb", []> { // constraint string tying $wb to the address register
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+ def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u, // $Rm is an explicit operand here, so no fixed "let Rm"
+ "vld1", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+}
+multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> { // D-pair counterpart of VLD1DWB
+ def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn), IIC_VLD1x2u,
+ "vld1", Dt, "$Vd, $Rn!", // "!" assembly form with no offset operand
+ "$Rn.addr = $wb", []> { // constraint string tying $wb to the address register
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+ def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, // $Rm is an explicit operand here, so no fixed "let Rm"
+ "vld1", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+}
+
+defm VLD1d8wb : VLD1DWB<{0,0,0,?}, "8", addrmode6align64>; // same op7_4/Dt/AddrMode arguments as the non-writeback VLD1d8..VLD1q64 defs
+defm VLD1d16wb : VLD1DWB<{0,1,0,?}, "16", addrmode6align64>;
+defm VLD1d32wb : VLD1DWB<{1,0,0,?}, "32", addrmode6align64>;
+defm VLD1d64wb : VLD1DWB<{1,1,0,?}, "64", addrmode6align64>;
+defm VLD1q8wb : VLD1QWB<{0,0,?,?}, "8", addrmode6align64or128>;
+defm VLD1q16wb : VLD1QWB<{0,1,?,?}, "16", addrmode6align64or128>;
+defm VLD1q32wb : VLD1QWB<{1,0,?,?}, "32", addrmode6align64or128>;
+defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with 3 registers
+class VLD1D3<bits<4> op7_4, string Dt, Operand AddrMode>
+ : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd), // three-D-register list form
+ (ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt,
+ "$Vd, $Rn", "", []> { // no constraint string and no selection pattern
+ let Rm = 0b1111; // same fixed Rm value as the other non-writeback VLD1 classes
+ let Inst{4} = Rn{4}; // encoding bit 4 is taken from bit 4 of the $Rn operand value
+ let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> { // writeback variants of VLD1D3
+ def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn), IIC_VLD1x2u, // NOTE(review): "x2u" itinerary on a three-register form (VLD1D3 uses IIC_VLD1x3) - confirm intended
+ "vld1", Dt, "$Vd, $Rn!",
+ "$Rn.addr = $wb", []> { // constraint string tying $wb to the address register
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+ def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, // same itinerary as _fixed; $Rm is an explicit operand
+ "vld1", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+}
+
+def VLD1d8T : VLD1D3<{0,0,0,?}, "8", addrmode6align64>; // "T" suffix names the three-register records
+def VLD1d16T : VLD1D3<{0,1,0,?}, "16", addrmode6align64>;
+def VLD1d32T : VLD1D3<{1,0,0,?}, "32", addrmode6align64>;
+def VLD1d64T : VLD1D3<{1,1,0,?}, "64", addrmode6align64>;
+
+defm VLD1d8Twb : VLD1D3WB<{0,0,0,?}, "8", addrmode6align64>; // each defm yields _fixed and _register records
+defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>;
+defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>;
+defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+
+def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>; // pseudos exist only for the 64-bit element size here
+def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>; // NOTE(review): writeback pseudos reuse the non-"u" itinerary - confirm intended
+def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>;
+
+// ...with 4 registers
+class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode>
+ : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd), // four-D-register list form
+ (ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt,
+ "$Vd, $Rn", "", []> { // no constraint string and no selection pattern
+ let Rm = 0b1111; // same fixed Rm value as the other non-writeback VLD1 classes
+ let Inst{5-4} = Rn{5-4}; // two encoding bits come from the $Rn operand value
+ let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> { // writeback variants of VLD1D4
+ def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn), IIC_VLD1x2u, // NOTE(review): "x2u" itinerary on a four-register form (VLD1D4 uses IIC_VLD1x4) - confirm intended
+ "vld1", Dt, "$Vd, $Rn!",
+ "$Rn.addr = $wb", []> { // constraint string tying $wb to the address register
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+ def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, // same itinerary as _fixed; $Rm is an explicit operand
+ "vld1", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+}
+
+def VLD1d8Q : VLD1D4<{0,0,?,?}, "8", addrmode6align64or128or256>; // "Q" suffix names the four-register records
+def VLD1d16Q : VLD1D4<{0,1,?,?}, "16", addrmode6align64or128or256>;
+def VLD1d32Q : VLD1D4<{1,0,?,?}, "32", addrmode6align64or128or256>;
+def VLD1d64Q : VLD1D4<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+defm VLD1d8Qwb : VLD1D4WB<{0,0,?,?}, "8", addrmode6align64or128or256>; // each defm yields _fixed and _register records
+defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>; // pseudos exist only for the 64-bit element size here
+def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>; // NOTE(review): writeback pseudos reuse the non-"u" itinerary - confirm intended
+def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>;
+
+// VLD2 : Vector Load (multiple 2-element structures)
+class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, // register-list type and itinerary are parameters here
+ InstrItinClass itin, Operand AddrMode>
+ : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd),
+ (ins AddrMode:$Rn), itin,
+ "vld2", Dt, "$Vd, $Rn", "", []> { // no constraint string and no selection pattern
+ let Rm = 0b1111; // fixed Rm value for the form without offset/writeback operand
+ let Inst{5-4} = Rn{5-4}; // two encoding bits come from the $Rn operand value
+ let DecoderMethod = "DecodeVLDST2Instruction";
+}
+
+def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2, // d-forms: op11_8 0b1000, D-pair list
+ addrmode6align64or128>;
+def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2,
+ addrmode6align64or128>;
+def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2,
+ addrmode6align64or128>;
+
+def VLD2q8 : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2, // q-forms: op11_8 0b0011, four-D list
+ addrmode6align64or128or256>;
+def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2,
+ addrmode6align64or128or256>;
+def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2,
+ addrmode6align64or128or256>;
+
+def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>; // QQPR pseudos sharing the q-form itinerary
+def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+
+// ...with address register writeback:
+multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt, // emits a _fixed and a _register record per defm
+ RegisterOperand VdTy, InstrItinClass itin, Operand AddrMode> {
+ def _fixed : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn), itin,
+ "vld2", Dt, "$Vd, $Rn!", // "!" assembly form with no offset operand
+ "$Rn.addr = $wb", []> { // constraint string tying $wb to the address register
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST2Instruction";
+ }
+ def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm), itin, // $Rm is an explicit operand here, so no fixed "let Rm"
+ "vld2", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST2Instruction";
+ }
+}
+
+defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u, // same encodings as VLD2d*, with the "u" itinerary
+ addrmode6align64or128>;
+defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u,
+ addrmode6align64or128>;
+defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u,
+ addrmode6align64or128>;
+
+defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u, // same encodings as VLD2q*, with the "u" itinerary
+ addrmode6align64or128or256>;
+defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u,
+ addrmode6align64or128or256>;
+defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u,
+ addrmode6align64or128or256>;
+
+def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; // one pseudo per element size and writeback flavour
+def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+
+// ...with double-spaced registers
+def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2, // differs from VLD2d8 in op11_8 (0b1001) and the spaced list type
+ addrmode6align64or128>;
+def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2,
+ addrmode6align64or128>;
+def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2,
+ addrmode6align64or128>;
+defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u, // writeback variants of the spaced forms
+ addrmode6align64or128>;
+defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u,
+ addrmode6align64or128>;
+defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u,
+ addrmode6align64or128>;
+
+// VLD3 : Vector Load (multiple 3-element structures)
+class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), // three separate DPR outputs instead of a register-list operand
+ (ins addrmode6:$Rn), IIC_VLD3,
+ "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { // braces of the register list are spelled out in the asm string
+ let Rm = 0b1111; // fixed Rm value for the form without offset/writeback operand
+ let Inst{4} = Rn{4}; // encoding bit 4 is taken from bit 4 of the $Rn operand value
+ let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">; // d-forms use op11_8 0b0100; only the last op7_4 bit is left '?'
+def VLD3d16 : VLD3D<0b0100, {0,1,0,?}, "16">;
+def VLD3d32 : VLD3D<0b0100, {1,0,0,?}, "32">;
+
+def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>; // QQPR pseudos with the same itinerary as VLD3D
+def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>;
+def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>;
+
+// ...with address register writeback:
+class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> // single class (not a _fixed/_register multiclass) using am6offset
+ : NLdSt<0, 0b10, op11_8, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u,
+ "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", // $Rn and $Rm are printed back to back with no separator in the asm string
+ "$Rn.addr = $wb", []> { // constraint string tying $wb to the address register
+ let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">; // writeback records mirror VLD3d8/16/32 argument for argument
+def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">;
+def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">;
+
+def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+
+// ...with double-spaced registers:
+def VLD3q8 : VLD3D<0b0101, {0,0,0,?}, "8">; // same classes as the d-forms; only op11_8 differs (0b0101 vs 0b0100)
+def VLD3q16 : VLD3D<0b0101, {0,1,0,?}, "16">;
+def VLD3q32 : VLD3D<0b0101, {1,0,0,?}, "32">;
+def VLD3q8_UPD : VLD3DWB<0b0101, {0,0,0,?}, "8">;
+def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">;
+def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">;
+
+def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; // q pseudos use the QQQQPR classes, which tie $src to $dst
+def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>;
+
+def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; // same class and itinerary as the non-odd _UPD pseudos above
+def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+
+// VLD4 : Vector Load (multiple 4-element structures)
+class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdSt<0, 0b10, op11_8, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), // four separate DPR outputs instead of a register-list operand
+ (ins addrmode6:$Rn), IIC_VLD4,
+ "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { // no constraint string and no selection pattern
+ let Rm = 0b1111; // fixed Rm value for the form without offset/writeback operand
+ let Inst{5-4} = Rn{5-4}; // two encoding bits come from the $Rn operand value (VLD3D takes only one)
+ let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">; // d-forms use op11_8 0b0000; the last two op7_4 bits are left '?'
+def VLD4d16 : VLD4D<0b0000, {0,1,?,?}, "16">;
+def VLD4d32 : VLD4D<0b0000, {1,0,?,?}, "32">;
+
+def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>; // QQPR pseudos with the same itinerary as VLD4D
+def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>;
+def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>;
+
+// ...with address register writeback:
+class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> // single class (not a _fixed/_register multiclass) using am6offset
+ : NLdSt<0, 0b10, op11_8, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u,
+ "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", // $Rn and $Rm are printed back to back with no separator in the asm string
+ "$Rn.addr = $wb", []> { // constraint string tying $wb to the address register
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">; // writeback records mirror VLD4d8/16/32 argument for argument
+def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">;
+def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">;
+
+def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+
+// ...with double-spaced registers:
+def VLD4q8 : VLD4D<0b0001, {0,0,?,?}, "8">; // same classes as the d-forms; only op11_8 differs (0b0001 vs 0b0000)
+def VLD4q16 : VLD4D<0b0001, {0,1,?,?}, "16">;
+def VLD4q32 : VLD4D<0b0001, {1,0,?,?}, "32">;
+def VLD4q8_UPD : VLD4DWB<0b0001, {0,0,?,?}, "8">;
+def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">;
+def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">;
+
+def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; // q pseudos use the QQQQPR classes, which tie $src to $dst
+def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>;
+
+def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; // same class and itinerary as the non-odd _UPD pseudos above
+def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
+
+// Classes for VLD*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VLDQLNPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QPR:$dst),
+ (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane), // takes the previous register value plus a lane index
+ itin, "$src = $dst">; // constraint string tying input $src to output $dst
+class VLDQLNWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+ nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; // writeback variant: also ties $wb to the address register
+class VLDQQLNPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst),
+ (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane), // QQPR counterpart of VLDQLNPseudo
+ itin, "$src = $dst">;
+class VLDQQLNWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+ nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+class VLDQQQQLNPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQQQPR:$dst),
+ (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane), // QQQQPR counterpart of VLDQLNPseudo
+ itin, "$src = $dst">;
+class VLDQQQQLNWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+ nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+
+// VLD1LN : Vector Load (single element to one lane)
+class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+ PatFrag LoadOp> // LoadOp is the load fragment used inside the selection pattern
+ : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+ (ins addrmode6:$Rn, DPR:$src, nohash_imm:$lane),
+ IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+ "$src = $Vd", // constraint string tying input $src to output $Vd
+ [(set DPR:$Vd, (vector_insert (Ty DPR:$src), // pattern: insert the loaded i32 into lane $lane of $src
+ (i32 (LoadOp addrmode6:$Rn)),
+ imm:$lane))]> {
+ let Rm = 0b1111; // fixed Rm value for the form without offset/writeback operand
+ let DecoderMethod = "DecodeVLD1LN";
+}
+class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, // same shape as VLD1LN except for the address operand class
+ PatFrag LoadOp>
+ : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+ (ins addrmode6oneL32:$Rn, DPR:$src, nohash_imm:$lane), // addrmode6oneL32 replaces addrmode6 here
+ IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+ "$src = $Vd", // constraint string tying input $src to output $Vd
+ [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+ (i32 (LoadOp addrmode6oneL32:$Rn)), // ...and in the pattern as well
+ imm:$lane))]> {
+ let Rm = 0b1111;
+ let DecoderMethod = "DecodeVLD1LN";
+}
+class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> { // QPR pseudo that supplies its own selection pattern
+ let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src), // same vector_insert-of-load shape as VLD1LN, on QPR operands
+ (i32 (LoadOp addrmode6:$addr)),
+ imm:$lane))];
+}
+
+def VLD1LNd8 : VLD1LN<0b0000, {?,?,?,0}, "8", v8i8, extloadi8> {
+ let Inst{7-5} = lane{2-0}; // three lane-index bits go to encoding bits 7-5
+}
+def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> {
+ let Inst{7-6} = lane{1-0}; // two lane-index bits go to encoding bits 7-6
+ let Inst{5-4} = Rn{5-4}; // encoding bits 5-4 come from the $Rn operand value
+}
+def VLD1LNd32 : VLD1LN32<0b1000, {?,0,?,?}, "32", v2i32, load> { // the 32-bit form uses the VLD1LN32 class
+ let Inst{7} = lane{0}; // single lane-index bit goes to encoding bit 7
+ let Inst{5-4} = Rn{5-4};
+}
+
+def VLD1LNq8Pseudo : VLD1QLNPseudo<v16i8, extloadi8>; // QPR pseudos use the same load fragments with the wider vector types
+def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>;
+def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>;
+
+def : Pat<(vector_insert (v2f32 DPR:$src), // f32 lane insert of a loaded value reuses the 32-bit integer instruction
+ (f32 (load addrmode6:$addr)), imm:$lane),
+ (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; // NOTE(review): VLD1LNd32 declares addrmode6oneL32:$Rn; this pattern passes addrmode6 - confirm
+def : Pat<(vector_insert (v4f32 QPR:$src), // v4f32 counterpart, selected to the QPR pseudo
+ (f32 (load addrmode6:$addr)), imm:$lane),
+ (VLD1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+
+// ...with address register writeback:
+class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> // no Ty/LoadOp parameters: this class carries no selection pattern
+ : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm,
+ DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt,
+ "\\{$Vd[$lane]\\}, $Rn$Rm", // $Rn and $Rm are printed back to back with no separator in the asm string
+ "$src = $Vd, $Rn.addr = $wb", []> { // ties $src to $Vd and $wb to the address register
+ let DecoderMethod = "DecodeVLD1LN";
+}
+
+def VLD1LNd8_UPD : VLD1LNWB<0b0000, {?,?,?,0}, "8"> {
+ let Inst{7-5} = lane{2-0}; // three lane-index bits go to encoding bits 7-5
+}
+def VLD1LNd16_UPD : VLD1LNWB<0b0100, {?,?,0,?}, "16"> {
+ let Inst{7-6} = lane{1-0}; // two lane-index bits go to encoding bits 7-6
+ let Inst{4} = Rn{4}; // only bit 4 is taken from $Rn here (VLD1LNd16 copies Rn{5-4})
+}
+def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> {
+ let Inst{7} = lane{0}; // single lane-index bit goes to encoding bit 7
+ let Inst{5} = Rn{4}; // encoding bits 5 and 4 are both driven by the same operand bit, Rn{4}
+ let Inst{4} = Rn{4}; // NOTE(review): VLD1LNd32 uses Rn{5-4} with addrmode6oneL32 instead - confirm the difference is intended
+}
+
+def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; // QPR writeback pseudos, one per element size
+def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+
+// VLD2LN : Vector Load (single 2-element structure to one lane)
+class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
+ (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane), // one source register per destination register
+ IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn",
+ "$src1 = $Vd, $src2 = $dst2", []> { // each source is tied to its destination; no selection pattern
+ let Rm = 0b1111; // fixed Rm value for the form without offset/writeback operand
+ let Inst{4} = Rn{4}; // encoding bit 4 is taken from bit 4 of the $Rn operand value
+ let DecoderMethod = "DecodeVLD2LN";
+}
+
+def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8"> { // all four op7_4 bits are '?'; set via Inst{7-5} here and Inst{4} in the class
+ let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16"> {
+ let Inst{7-6} = lane{1-0}; // two lane-index bits go to encoding bits 7-6
+}
+def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> {
+ let Inst{7} = lane{0}; // single lane-index bit goes to encoding bit 7
+}
+
+def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>; // d-form pseudos use the QPR lane class
+def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+
+// ...with double-spaced registers:
+def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> { // differs from VLD2LNd16 only in the third op7_4 bit (1 vs 0)
+ let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> { // differs from VLD2LNd32 only in the second op7_4 bit (1 vs 0)
+ let Inst{7} = lane{0};
+}
+
+def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>; // q-form pseudos use the QQPR lane class; no 8-bit q-form is defined here
+def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+
+// ...with address register writeback:
+class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm, // $Rm is an explicit am6offset operand, so there is no fixed "let Rm"
+ DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2lnu, "vld2", Dt,
+ "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm",
+ "$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> { // source/destination ties plus the writeback tie
+ let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD2LN";
+}
+
+def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8"> { // writeback records mirror the non-writeback VLD2LN defs
+ let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16"> {
+ let Inst{7-6} = lane{1-0};
+}
+def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> {
+ let Inst{7} = lane{0};
+}
+
+def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; // d-form pseudos use the QPR writeback lane class
+def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+
+def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> { // q-forms: same op7_4 bit flips as VLD2LNq16/q32
+ let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> {
+ let Inst{7} = lane{0};
+}
+
+def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>; // q-form pseudos use the QQPR writeback lane class
+def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+
+// VLD3LN : Vector Load (single 3-element structure to one lane)
+class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+ (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, // one source register per destination register
+ nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt,
+ "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn",
+ "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> { // each source is tied to its destination; no selection pattern
+ let Rm = 0b1111; // fixed Rm value; unlike VLD2LN, no Inst bit is taken from $Rn in this class
+ let DecoderMethod = "DecodeVLD3LN";
+}
+
+def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8"> { // last op7_4 bit is fixed to 0 in every VLD3LN def here
+ let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16"> {
+ let Inst{7-6} = lane{1-0}; // two lane-index bits go to encoding bits 7-6
+}
+def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> {
+ let Inst{7} = lane{0}; // single lane-index bit goes to encoding bit 7
+}
+
+def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; // d-form pseudos use the QQPR lane class
+def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+
+// ...with double-spaced registers:
+def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> { // differs from VLD3LNd16 only in the third op7_4 bit (1 vs 0)
+ let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> { // differs from VLD3LNd32 only in the second op7_4 bit (1 vs 0)
+ let Inst{7} = lane{0};
+}
+
+def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>; // q-form pseudos use the QQQQPR lane class
+def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+
+// ...with address register writeback:
+class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdStLn<1, 0b10, op11_8, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm, // $Rm is an explicit am6offset operand, so there is no fixed "let Rm"
+ DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane),
+ IIC_VLD3lnu, "vld3", Dt,
+ "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm",
+ "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb", // source/destination ties plus the writeback tie
+ []> {
+ let DecoderMethod = "DecodeVLD3LN";
+}
+
+def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8"> { // writeback records mirror the non-writeback VLD3LN defs
+ let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16"> {
+ let Inst{7-6} = lane{1-0};
+}
+def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> {
+ let Inst{7} = lane{0};
+}
+
+def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; // d-form pseudos use the QQPR writeback lane class
+def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+
+def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> { // q-forms: same op7_4 bit flips as VLD3LNq16/q32
+ let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> {
+ let Inst{7} = lane{0};
+}
+
+def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>; // q-form pseudos use the QQQQPR writeback lane class
+def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+
+// VLD4LN : Vector Load (single 4-element structure to one lane)
+class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdStLn<1, 0b10, op11_8, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, // one source register per destination register
+ nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt,
+ "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn",
+ "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> { // each source is tied to its destination
+ let Rm = 0b1111; // fixed Rm value for the form without offset/writeback operand
+ let Inst{4} = Rn{4}; // encoding bit 4 is taken from bit 4 of the $Rn operand value
+ let DecoderMethod = "DecodeVLD4LN";
+}
+
+def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8"> { // all four op7_4 bits are '?'; set via Inst{7-5} here and Inst{4} in the class
+ let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16"> {
+ let Inst{7-6} = lane{1-0}; // two lane-index bits go to encoding bits 7-6
+}
+def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> {
+ let Inst{7} = lane{0}; // single lane-index bit goes to encoding bit 7
+ let Inst{5} = Rn{5}; // the 32-bit forms additionally take encoding bit 5 from $Rn
+}
+
+def VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; // d-form pseudos use the QQPR lane class
+def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+
+// ...with double-spaced registers:
+def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> { // differs from VLD4LNd16 only in the third op7_4 bit (1 vs 0)
+ let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> { // differs from VLD4LNd32 only in the second op7_4 bit (1 vs 0)
+ let Inst{7} = lane{0};
+ let Inst{5} = Rn{5};
+}
+
+def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>; // q-form pseudos use the QQQQPR lane class
+def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+
+// ...with address register writeback:
+class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdStLn<1, 0b10, op11_8, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm, // $Rm is an explicit am6offset operand, so there is no fixed "let Rm"
+ DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
+ IIC_VLD4lnu, "vld4", Dt,
+"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn$Rm",
+"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb", // source/destination ties plus the writeback tie
+ []> {
+ let Inst{4} = Rn{4}; // encoding bit 4 is taken from bit 4 of the $Rn operand value
+ let DecoderMethod = "DecodeVLD4LN" ;
+}
+
+def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8"> { // writeback records mirror the non-writeback VLD4LN defs
+ let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16"> {
+ let Inst{7-6} = lane{1-0};
+}
+def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> {
+ let Inst{7} = lane{0};
+ let Inst{5} = Rn{5}; // the 32-bit forms additionally take encoding bit 5 from $Rn
+}
+
+def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; // d-form pseudos use the QQPR writeback lane class
+def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+
+def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> { // q-forms: same op7_4 bit flips as VLD4LNq16/q32
+ let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
+ let Inst{7} = lane{0};
+ let Inst{5} = Rn{5};
+}
+
+def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; // q-form pseudos use the QQQQPR writeback lane class
+def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
+
+// VLD1DUP : Vector Load (single element to all lanes)
+class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
+ Operand AddrMode>
+ : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd), // op11_8 is fixed to 0b1100 for this class
+ (ins AddrMode:$Rn),
+ IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
+ [(set VecListOneDAllLanes:$Vd,
+ (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> { // pattern: NEONvdup of the loaded i32 value, typed as Ty
+ let Rm = 0b1111; // fixed Rm value for the form without offset/writeback operand
+ let Inst{4} = Rn{4}; // encoding bit 4 is taken from bit 4 of the $Rn operand value
+ let DecoderMethod = "DecodeVLD1DupInstruction";
+}
+def VLD1DUPd8 : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8, // each element size has its own addrmode6dupalign* operand class
+ addrmode6dupalignNone>;
+def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16,
+ addrmode6dupalign16>;
+def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load,
+ addrmode6dupalign32>;
+
+def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), // f32 splat of a loaded value reuses the 32-bit integer instruction
+ (VLD1DUPd32 addrmode6:$addr)>; // NOTE(review): source uses addrmode6dup but the result names addrmode6 - confirm intended
+
+// VLD1QDUP: same shape as VLD1DUP, but the result operand is a
+// VecListDPairAllLanes register list. Rm is hard-wired to 0b1111 and Inst{4}
+// is taken from Rn{4}.
+class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
+ Operand AddrMode>
+ : NLdSt<1, 0b10, 0b1100, op7_4,
+ (outs VecListDPairAllLanes:$Vd),
+ (ins AddrMode:$Rn),
+ IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
+ [(set VecListDPairAllLanes:$Vd,
+ (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
+ let DecoderMethod = "DecodeVLD1DupInstruction";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1111;
+}
+
+// Register-pair VLD1DUP instantiations; op7_4 bit 5 is 1 here where the
+// d-register defs pass 0.
+def VLD1DUPq8 : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8,
+ addrmode6dupalignNone>;
+def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16,
+ addrmode6dupalign16>;
+def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load,
+ addrmode6dupalign32>;
+
+// Select a v4f32 dup of an f32 load with the 32-bit integer instruction.
+def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+ (VLD1DUPq32 addrmode6:$addr)>;
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+// ...with address register writeback:
+// VLD1DUPWB produces two "vld1" all-lanes defs whose extra GPR:$wb result is
+// tied to $Rn.addr: _fixed ("$Rn!" syntax, Rm hard-wired to 0b1101) and
+// _register (explicit rGPR:$Rm operand). Neither carries a selection pattern.
+multiclass VLD1DUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+ def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
+ (outs VecListOneDAllLanes:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn),
+ IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLD1DupInstruction";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ }
+ def _register : NLdSt<1, 0b10, 0b1100, op7_4,
+ (outs VecListOneDAllLanes:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm),
+ IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLD1DupInstruction";
+ let Inst{4} = Rn{4};
+ }
+}
+// VLD1QDUPWB: writeback "vld1" all-lanes defs for a VecListDPairAllLanes
+// result. _fixed uses the "$Rn!" syntax with Rm hard-wired to 0b1101;
+// _register takes an explicit rGPR:$Rm. Both tie $Rn.addr to $wb.
+multiclass VLD1QDUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+ def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
+ (outs VecListDPairAllLanes:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn),
+ IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLD1DupInstruction";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ }
+ def _register : NLdSt<1, 0b10, 0b1100, op7_4,
+ (outs VecListDPairAllLanes:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm),
+ IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLD1DupInstruction";
+ let Inst{4} = Rn{4};
+ }
+}
+
+// Writeback VLD1DUP instantiations; each defm yields a _fixed and a _register
+// def. NOTE(review): the 8-bit forms pass a literal 0 for op7_4 bit 4 while
+// the multiclass bodies also set Inst{4} = Rn{4} — presumably the body `let`
+// takes precedence; confirm against NLdSt.
+defm VLD1DUPd8wb : VLD1DUPWB<{0,0,0,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32", addrmode6dupalign32>;
+
+defm VLD1DUPq8wb : VLD1QDUPWB<{0,0,1,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32", addrmode6dupalign32>;
+
+// VLD2DUP : Vector Load (single 2-element structure to all lanes)
+// The result register-list operand (VdTy) and the address operand (AddrMode)
+// are supplied per instantiation. Rm is hard-wired to 0b1111 and Inst{4} is
+// taken from Rn{4}; no selection pattern is attached.
+class VLD2DUP<bits<4> op7_4, string Dt, RegisterOperand VdTy, Operand AddrMode>
+ : NLdSt<1, 0b10, 0b1101, op7_4,
+ (outs VdTy:$Vd),
+ (ins AddrMode:$Rn),
+ IIC_VLD2dup, "vld2", Dt, "$Vd, $Rn", "", []> {
+ let DecoderMethod = "DecodeVLD2DupInstruction";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1111;
+}
+
+// VLD2DUP instantiations using a consecutive register pair
+// (VecListDPairAllLanes).
+def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8", VecListDPairAllLanes,
+ addrmode6dupalign16>;
+def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes,
+ addrmode6dupalign32>;
+def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes,
+ addrmode6dupalign64>;
+
+// HACK: VLD2DUPd8x2 must be changed at the same time as VLD2b8, or
+// "vld2.8 {d0[], d2[]}, [r4:32]" will become "vld2.8 {d0, d2}, [r4:32]".
+// ...with double-spaced registers
+def VLD2DUPd8x2 : VLD2DUP<{0,0,1,?}, "8", VecListDPairSpacedAllLanes,
+ addrmode6dupalign16>;
+def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
+ addrmode6dupalign32>;
+def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
+ addrmode6dupalign64>;
+
+// ...with address register writeback:
+// VLD2DUPWB produces the _fixed ("$Rn!" syntax, Rm hard-wired to 0b1101) and
+// _register (explicit rGPR:$Rm) "vld2" all-lanes defs. Both add a GPR:$wb
+// result tied to $Rn.addr and carry no selection pattern.
+multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
+ Operand AddrMode> {
+ def _fixed : NLdSt<1, 0b10, 0b1101, op7_4,
+ (outs VdTy:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn),
+ IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLD2DupInstruction";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ }
+ def _register : NLdSt<1, 0b10, 0b1101, op7_4,
+ (outs VdTy:$Vd, GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm),
+ IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLD2DupInstruction";
+ let Inst{4} = Rn{4};
+ }
+}
+
+// Writeback VLD2DUP instantiations for consecutive register pairs; each defm
+// yields a _fixed and a _register def.
+defm VLD2DUPd8wb : VLD2DUPWB<{0,0,0,0}, "8", VecListDPairAllLanes,
+ addrmode6dupalign16>;
+defm VLD2DUPd16wb : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes,
+ addrmode6dupalign32>;
+defm VLD2DUPd32wb : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes,
+ addrmode6dupalign64>;
+
+// Same, for double-spaced register pairs (op7_4 bit 5 is 1).
+defm VLD2DUPd8x2wb : VLD2DUPWB<{0,0,1,0}, "8", VecListDPairSpacedAllLanes,
+ addrmode6dupalign16>;
+defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
+ addrmode6dupalign32>;
+defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
+ addrmode6dupalign64>;
+
+// VLD3DUP : Vector Load (single 3-element structure to all lanes)
+// Three separate DPR results printed as "{$Vd[], $dst2[], $dst3[]}". Rm is
+// hard-wired to 0b1111 and Inst{4} is forced to 0; no selection pattern is
+// attached.
+class VLD3DUP<bits<4> op7_4, string Dt>
+ : NLdSt<1, 0b10, 0b1110, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+ (ins addrmode6dup:$Rn),
+ IIC_VLD3dup, "vld3", Dt,
+ "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
+ let DecoderMethod = "DecodeVLD3DupInstruction";
+ let Inst{4} = 0;
+ let Rm = 0b1111;
+}
+
+// VLD3DUP instantiations for 8-, 16- and 32-bit elements.
+def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">;
+def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">;
+def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">;
+
+// Pseudo counterparts (VLDQQPseudo is defined outside this excerpt).
+def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">;
+def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
+def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;
+
+// ...with address register writeback:
+// VLD3DUPWB adds an am6offset:$Rm input and a GPR:$wb result tied to
+// $Rn.addr. Unlike VLD3DUP, the address operand class is a template
+// parameter. Inst{4} is forced to 0.
+class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
+ : NLdSt<1, 0b10, 0b1110, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+ (ins AddrMode:$Rn, am6offset:$Rm),
+ IIC_VLD3dupu, "vld3", Dt,
+ "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
+ "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLD3DupInstruction";
+ let Inst{4} = 0;
+}
+
+// Writeback VLD3DUP instantiations; all of them use addrmode6dupalign64 as
+// the address operand.
+def VLD3DUPd8_UPD : VLD3DUPWB<{0,0,0,0}, "8", addrmode6dupalign64>;
+def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16", addrmode6dupalign64>;
+def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32", addrmode6dupalign64>;
+
+// "q" variants: op7_4 bit 5 is 1.
+def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8", addrmode6dupalign64>;
+def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>;
+def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>;
+
+// Pseudo counterparts of the d-register writeback forms.
+def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+
+// VLD4DUP : Vector Load (single 4-element structure to all lanes)
+// Four separate DPR results printed as "{$Vd[], $dst2[], $dst3[], $dst4[]}".
+// Rm is hard-wired to 0b1111 and Inst{4} is taken from Rn{4}; no selection
+// pattern is attached.
+class VLD4DUP<bits<4> op7_4, string Dt>
+ : NLdSt<1, 0b10, 0b1111, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6dup:$Rn),
+ IIC_VLD4dup, "vld4", Dt,
+ "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> {
+ let DecoderMethod = "DecodeVLD4DupInstruction";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1111;
+}
+
+// VLD4DUP instantiations. The 32-bit forms leave op7_4 bit 6 unset and fill
+// it from Rn{5}.
+def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">;
+def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">;
+def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+// Pseudo counterparts (VLDQQPseudo is defined outside this excerpt).
+def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD4DUPq8 : VLD4DUP<{0,0,1,?}, "8">;
+def VLD4DUPq16 : VLD4DUP<{0,1,1,?}, "16">;
+def VLD4DUPq32 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+// ...with address register writeback:
+// VLD4DUPWB adds an am6offset:$Rm input and a GPR:$wb result tied to
+// $Rn.addr. Inst{4} is taken from Rn{4}; no selection pattern is attached.
+class VLD4DUPWB<bits<4> op7_4, string Dt>
+ : NLdSt<1, 0b10, 0b1111, op7_4,
+ (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+ (ins addrmode6dup:$Rn, am6offset:$Rm),
+ IIC_VLD4dupu, "vld4", Dt,
+ "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
+ "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLD4DupInstruction";
+ let Inst{4} = Rn{4};
+}
+
+// Writeback VLD4DUP instantiations. The 32-bit forms leave op7_4 bit 6 unset
+// and fill it from Rn{5}.
+def VLD4DUPd8_UPD : VLD4DUPWB<{0,0,0,0}, "8">;
+def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16">;
+def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+// "q" variants: op7_4 bit 5 is 1.
+def VLD4DUPq8_UPD : VLD4DUPWB<{0,0,1,0}, "8">;
+def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">;
+def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+// Pseudo counterparts of the d-register writeback forms.
+def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
+
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
+
+// Classes for VST* pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+//
+// Naming: the Q/QQ/QQQQ part selects the source register class (QPR, QQPR,
+// QQQQPR). "WB" variants add a GPR:$wb result tied to $addr.addr; WBfixed has
+// no offset operand, WBregister takes an rGPR offset, and plain WB takes an
+// am6offset offset.
+
+// Single Q register source.
+class VSTQPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs),
+ (ins addrmode6:$addr, QPR:$src),
+ itin, "">;
+class VSTQWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QPR:$src),
+ itin, "$addr.addr = $wb">;
+class VSTQWBfixedPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, QPR:$src),
+ itin, "$addr.addr = $wb">;
+class VSTQWBregisterPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, rGPR:$offset, QPR:$src),
+ itin, "$addr.addr = $wb">;
+
+// QQ register source.
+class VSTQQPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs),
+ (ins addrmode6:$addr, QQPR:$src),
+ itin, "">;
+class VSTQQWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQPR:$src),
+ itin, "$addr.addr = $wb">;
+class VSTQQWBfixedPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, QQPR:$src),
+ itin, "$addr.addr = $wb">;
+class VSTQQWBregisterPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, rGPR:$offset, QQPR:$src),
+ itin, "$addr.addr = $wb">;
+
+// QQQQ register source.
+class VSTQQQQPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs),
+ (ins addrmode6:$addr, QQQQPR:$src),
+ itin, "">;
+class VSTQQQQWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src),
+ itin, "$addr.addr = $wb">;
+
+// VST1 : Vector Store (multiple single elements)
+// VST1D stores a VecListOneD register list. Rm is hard-wired to 0b1111 (this
+// form has no writeback result) and Inst{4} is taken from Rn{4}.
+class VST1D<bits<4> op7_4, string Dt, Operand AddrMode>
+ : NLdSt<0, 0b00, 0b0111, op7_4,
+ (outs),
+ (ins AddrMode:$Rn, VecListOneD:$Vd),
+ IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1111;
+}
+// VST1Q stores a VecListDPair register list. Rm is hard-wired to 0b1111 and
+// Inst{5-4} are taken from Rn{5-4}.
+class VST1Q<bits<4> op7_4, string Dt, Operand AddrMode>
+ : NLdSt<0, 0b00, 0b1010, op7_4,
+ (outs),
+ (ins AddrMode:$Rn, VecListDPair:$Vd),
+ IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ let Inst{5-4} = Rn{5-4};
+ let Rm = 0b1111;
+}
+
+// One-register VST1 instantiations for 8/16/32/64-bit elements; the upper two
+// bits of op7_4 vary with the element size.
+def VST1d8 : VST1D<{0,0,0,?}, "8", addrmode6align64>;
+def VST1d16 : VST1D<{0,1,0,?}, "16", addrmode6align64>;
+def VST1d32 : VST1D<{1,0,0,?}, "32", addrmode6align64>;
+def VST1d64 : VST1D<{1,1,0,?}, "64", addrmode6align64>;
+
+// Register-pair VST1 instantiations; the low two bits of op7_4 are left unset
+// and filled from Rn{5-4} by VST1Q.
+def VST1q8 : VST1Q<{0,0,?,?}, "8", addrmode6align64or128>;
+def VST1q16 : VST1Q<{0,1,?,?}, "16", addrmode6align64or128>;
+def VST1q32 : VST1Q<{1,0,?,?}, "32", addrmode6align64or128>;
+def VST1q64 : VST1Q<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with address register writeback:
+// VST1DWB produces the _fixed ("$Rn!" syntax, Rm hard-wired to 0b1101) and
+// _register (explicit rGPR:$Rm) one-register "vst1" defs. Both return GPR:$wb
+// tied to $Rn.addr.
+// NOTE(review): both defs use the load itinerary IIC_VLD1u even though they
+// are stores — confirm whether a store itinerary was intended.
+multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+ def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u,
+ "vst1", Dt, "$Vd, $Rn!",
+ "$Rn.addr = $wb", []> {
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+ def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd),
+ IIC_VLD1u,
+ "vst1", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+}
+// VST1QWB: writeback "vst1" defs for a VecListDPair register list. _fixed
+// uses the "$Rn!" syntax with Rm hard-wired to 0b1101; _register takes an
+// explicit rGPR:$Rm. Both tie $Rn.addr to $wb and fill Inst{5-4} from Rn{5-4}.
+// NOTE(review): both defs use the load itinerary IIC_VLD1x2u even though they
+// are stores — confirm whether a store itinerary was intended.
+multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+ def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u,
+ "vst1", Dt, "$Vd, $Rn!",
+ "$Rn.addr = $wb", []> {
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+ def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd),
+ IIC_VLD1x2u,
+ "vst1", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+}
+
+// Writeback VST1 instantiations; each defm yields a _fixed and a _register
+// def. The bit patterns and address operands mirror the non-writeback
+// VST1d*/VST1q* defs.
+defm VST1d8wb : VST1DWB<{0,0,0,?}, "8", addrmode6align64>;
+defm VST1d16wb : VST1DWB<{0,1,0,?}, "16", addrmode6align64>;
+defm VST1d32wb : VST1DWB<{1,0,0,?}, "32", addrmode6align64>;
+defm VST1d64wb : VST1DWB<{1,1,0,?}, "64", addrmode6align64>;
+
+defm VST1q8wb : VST1QWB<{0,0,?,?}, "8", addrmode6align64or128>;
+defm VST1q16wb : VST1QWB<{0,1,?,?}, "16", addrmode6align64or128>;
+defm VST1q32wb : VST1QWB<{1,0,?,?}, "32", addrmode6align64or128>;
+defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with 3 registers
+// VST1D3 stores a VecListThreeD register list. Rm is hard-wired to 0b1111 and
+// Inst{4} is taken from Rn{4}; no selection pattern is attached.
+class VST1D3<bits<4> op7_4, string Dt, Operand AddrMode>
+ : NLdSt<0, 0b00, 0b0110, op7_4,
+ (outs),
+ (ins AddrMode:$Rn, VecListThreeD:$Vd),
+ IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1111;
+}
+// VST1D3WB: writeback "vst1" defs for a VecListThreeD register list. _fixed
+// uses the "$Rn!" syntax with Rm hard-wired to 0b1101; _register takes an
+// explicit rGPR:$Rm. Both tie $Rn.addr to $wb.
+// NOTE(review): these defs fill Inst{5-4} from Rn{5-4}, whereas the
+// non-writeback VST1D3 class only fills Inst{4} — confirm which is intended.
+// NOTE(review): both defs use the load itinerary IIC_VLD1x3u even though they
+// are stores — confirm whether a store itinerary was intended.
+multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+ def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u,
+ "vst1", Dt, "$Vd, $Rn!",
+ "$Rn.addr = $wb", []> {
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+ def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
+ IIC_VLD1x3u,
+ "vst1", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+}
+
+// Three-register VST1 instantiations for 8/16/32/64-bit elements.
+def VST1d8T : VST1D3<{0,0,0,?}, "8", addrmode6align64>;
+def VST1d16T : VST1D3<{0,1,0,?}, "16", addrmode6align64>;
+def VST1d32T : VST1D3<{1,0,0,?}, "32", addrmode6align64>;
+def VST1d64T : VST1D3<{1,1,0,?}, "64", addrmode6align64>;
+
+// Writeback variants; each defm yields a _fixed and a _register def.
+defm VST1d8Twb : VST1D3WB<{0,0,0,?}, "8", addrmode6align64>;
+defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>;
+defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>;
+defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+
+// Pseudo forms taking a QQPR source, for the 64-bit element size only.
+// NOTE(review): the _register pseudo is built from VSTQQWBPseudo (am6offset
+// offset operand) rather than VSTQQWBregisterPseudo (rGPR offset operand),
+// unlike the VST2q*PseudoWB_register defs — confirm this is intended.
+def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>;
+def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>;
+def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>;
+
+// ...with 4 registers
+// VST1D4 stores a VecListFourD register list. Rm is hard-wired to 0b1111 and
+// Inst{5-4} are taken from Rn{5-4}; no selection pattern is attached.
+class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode>
+ : NLdSt<0, 0b00, 0b0010, op7_4,
+ (outs),
+ (ins AddrMode:$Rn, VecListFourD:$Vd),
+ IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "",
+ []> {
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ let Inst{5-4} = Rn{5-4};
+ let Rm = 0b1111;
+}
+// VST1D4WB: writeback "vst1" defs for a VecListFourD register list. _fixed
+// uses the "$Rn!" syntax with Rm hard-wired to 0b1101; _register takes an
+// explicit rGPR:$Rm. Both tie $Rn.addr to $wb and fill Inst{5-4} from Rn{5-4}.
+// NOTE(review): both defs use the load itinerary IIC_VLD1x4u even though they
+// are stores — confirm whether a store itinerary was intended.
+multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+ def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u,
+ "vst1", Dt, "$Vd, $Rn!",
+ "$Rn.addr = $wb", []> {
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+ def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+ IIC_VLD1x4u,
+ "vst1", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST1Instruction";
+ }
+}
+
+// Four-register VST1 instantiations for 8/16/32/64-bit elements; the low two
+// bits of op7_4 are left unset and filled from Rn{5-4} by VST1D4.
+def VST1d8Q : VST1D4<{0,0,?,?}, "8", addrmode6align64or128or256>;
+def VST1d16Q : VST1D4<{0,1,?,?}, "16", addrmode6align64or128or256>;
+def VST1d32Q : VST1D4<{1,0,?,?}, "32", addrmode6align64or128or256>;
+def VST1d64Q : VST1D4<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+// Writeback variants; each defm yields a _fixed and a _register def.
+defm VST1d8Qwb : VST1D4WB<{0,0,?,?}, "8", addrmode6align64or128or256>;
+defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+// Pseudo forms taking a QQPR source, for the 64-bit element size only.
+// NOTE(review): the _register pseudo is built from VSTQQWBPseudo (am6offset
+// offset operand) rather than VSTQQWBregisterPseudo — confirm this is
+// intended.
+def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>;
+def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>;
+def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>;
+
+// VST2 : Vector Store (multiple 2-element structures)
+// The op11_8 field, register-list operand (VdTy), itinerary and address
+// operand are all supplied per instantiation. Rm is hard-wired to 0b1111 and
+// Inst{5-4} are taken from Rn{5-4}; no selection pattern is attached.
+class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
+ InstrItinClass itin, Operand AddrMode>
+ : NLdSt<0, 0b00, op11_8, op7_4,
+ (outs),
+ (ins AddrMode:$Rn, VdTy:$Vd),
+ itin, "vst2", Dt, "$Vd, $Rn", "", []> {
+ let DecoderMethod = "DecodeVLDST2Instruction";
+ let Inst{5-4} = Rn{5-4};
+ let Rm = 0b1111;
+}
+
+// Two-register VST2 instantiations (op11_8 = 0b1000, VecListDPair).
+def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2,
+ addrmode6align64or128>;
+def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2,
+ addrmode6align64or128>;
+def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2,
+ addrmode6align64or128>;
+
+// Four-register VST2 instantiations (op11_8 = 0b0011, VecListFourD).
+def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2,
+ addrmode6align64or128or256>;
+def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2,
+ addrmode6align64or128or256>;
+def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2,
+ addrmode6align64or128or256>;
+
+// Pseudo counterparts of the four-register forms, taking a QQPR source.
+def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>;
+def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>;
+def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>;
+
+// ...with address register writeback:
+// VST2DWB produces the _fixed ("$Rn!" syntax, Rm hard-wired to 0b1101) and
+// _register (explicit rGPR:$Rm) "vst2" defs for a caller-supplied op11_8 and
+// register-list operand. Both tie $Rn.addr to $wb.
+// NOTE(review): both defs use the load itinerary IIC_VLD1u even though they
+// are vst2 stores — confirm whether a store itinerary was intended.
+multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
+ RegisterOperand VdTy, Operand AddrMode> {
+ def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u,
+ "vst2", Dt, "$Vd, $Rn!",
+ "$Rn.addr = $wb", []> {
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST2Instruction";
+ }
+ def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
+ "vst2", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST2Instruction";
+ }
+}
+// VST2QWB: writeback "vst2" defs with op11_8 fixed at 0b0011 and a
+// VecListFourD register list. _fixed uses the "$Rn!" syntax with Rm
+// hard-wired to 0b1101; _register takes an explicit rGPR:$Rm.
+// NOTE(review): both defs use the load itinerary IIC_VLD1u even though they
+// are vst2 stores — confirm whether a store itinerary was intended.
+multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+ def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u,
+ "vst2", Dt, "$Vd, $Rn!",
+ "$Rn.addr = $wb", []> {
+ let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST2Instruction";
+ }
+ def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+ IIC_VLD1u,
+ "vst2", Dt, "$Vd, $Rn, $Rm",
+ "$Rn.addr = $wb", []> {
+ let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDST2Instruction";
+ }
+}
+
+// Writeback VST2 instantiations for two-register lists; each defm yields a
+// _fixed and a _register def.
+defm VST2d8wb : VST2DWB<0b1000, {0,0,?,?}, "8", VecListDPair,
+ addrmode6align64or128>;
+defm VST2d16wb : VST2DWB<0b1000, {0,1,?,?}, "16", VecListDPair,
+ addrmode6align64or128>;
+defm VST2d32wb : VST2DWB<0b1000, {1,0,?,?}, "32", VecListDPair,
+ addrmode6align64or128>;
+
+// Writeback VST2 instantiations for four-register lists.
+defm VST2q8wb : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>;
+defm VST2q16wb : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VST2q32wb : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+
+// Pseudo counterparts of the four-register writeback forms: the _fixed ones
+// take no offset operand, the _register ones take an rGPR offset.
+def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q8PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+
+// ...with double-spaced registers
+// These use op11_8 = 0b1001 and the VecListDPairSpaced operand; the defm
+// lines add the _fixed/_register writeback variants.
+def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VST2,
+ addrmode6align64or128>;
+def VST2b16 : VST2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VST2,
+ addrmode6align64or128>;
+def VST2b32 : VST2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VST2,
+ addrmode6align64or128>;
+defm VST2b8wb : VST2DWB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced,
+ addrmode6align64or128>;
+defm VST2b16wb : VST2DWB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced,
+ addrmode6align64or128>;
+defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced,
+ addrmode6align64or128>;
+
+// VST3 : Vector Store (multiple 3-element structures)
+// VST3D takes three separate DPR sources printed as "{$Vd, $src2, $src3}".
+// Rm is hard-wired to 0b1111 and Inst{4} is taken from Rn{4}; no selection
+// pattern is attached.
+class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdSt<0, 0b00, op11_8, op7_4,
+ (outs),
+ (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3),
+ IIC_VST3, "vst3", Dt,
+ "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
+ let DecoderMethod = "DecodeVLDST3Instruction";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1111;
+}
+
+// VST3 instantiations with op11_8 = 0b0100 for 8/16/32-bit elements.
+def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">;
+def VST3d16 : VST3D<0b0100, {0,1,0,?}, "16">;
+def VST3d32 : VST3D<0b0100, {1,0,0,?}, "32">;
+
+// Pseudo counterparts taking a QQPR source.
+def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>;
+def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>;
+def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>;
+
+// ...with address register writeback:
+// VST3DWB adds an am6offset:$Rm input and returns GPR:$wb tied to $Rn.addr.
+// Inst{4} is taken from Rn{4}; no selection pattern is attached.
+class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdSt<0, 0b00, op11_8, op7_4,
+ (outs GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm,
+ DPR:$Vd, DPR:$src2, DPR:$src3),
+ IIC_VST3u, "vst3", Dt,
+ "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
+ "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLDST3Instruction";
+ let Inst{4} = Rn{4};
+}
+
+// Writeback VST3 instantiations with op11_8 = 0b0100.
+def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">;
+def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">;
+def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">;
+
+// Pseudo counterparts taking a QQPR source.
+def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+
+// ...with double-spaced registers:
+// Same classes as above, instantiated with op11_8 = 0b0101.
+def VST3q8 : VST3D<0b0101, {0,0,0,?}, "8">;
+def VST3q16 : VST3D<0b0101, {0,1,0,?}, "16">;
+def VST3q32 : VST3D<0b0101, {1,0,0,?}, "32">;
+def VST3q8_UPD : VST3DWB<0b0101, {0,0,0,?}, "8">;
+def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">;
+def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">;
+
+// Pseudo counterparts taking a QQQQPR source.
+def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>;
+def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>;
+def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>;
+
+def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+
+// VST4 : Vector Store (multiple 4-element structures)
+// VST4D takes four separate DPR sources printed as
+// "{$Vd, $src2, $src3, $src4}". Rm is hard-wired to 0b1111 and Inst{5-4} are
+// taken from Rn{5-4}; no selection pattern is attached.
+class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdSt<0, 0b00, op11_8, op7_4,
+ (outs),
+ (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
+ IIC_VST4, "vst4", Dt,
+ "\\{$Vd, $src2, $src3, $src4\\}, $Rn",
+ "", []> {
+ let DecoderMethod = "DecodeVLDST4Instruction";
+ let Inst{5-4} = Rn{5-4};
+ let Rm = 0b1111;
+}
+
+// VST4 instantiations with op11_8 = 0b0000 for 8/16/32-bit elements; the low
+// two bits of op7_4 are left unset and filled from Rn{5-4} by VST4D.
+def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">;
+def VST4d16 : VST4D<0b0000, {0,1,?,?}, "16">;
+def VST4d32 : VST4D<0b0000, {1,0,?,?}, "32">;
+
+// Pseudo counterparts taking a QQPR source.
+def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>;
+def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>;
+def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>;
+
+// ...with address register writeback:
+// VST4DWB adds an am6offset:$Rm input and returns GPR:$wb tied to $Rn.addr.
+// Inst{5-4} are taken from Rn{5-4}; no selection pattern is attached.
+class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdSt<0, 0b00, op11_8, op7_4,
+ (outs GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm,
+ DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
+ IIC_VST4u, "vst4", Dt,
+ "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
+ "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLDST4Instruction";
+ let Inst{5-4} = Rn{5-4};
+}
+
+// Writeback VST4 instantiations with op11_8 = 0b0000.
+def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">;
+def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">;
+def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">;
+
+// Pseudo counterparts taking a QQPR source.
+def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+
+// ...with double-spaced registers:
+// Same classes as above, instantiated with op11_8 = 0b0001.
+def VST4q8 : VST4D<0b0001, {0,0,?,?}, "8">;
+def VST4q16 : VST4D<0b0001, {0,1,?,?}, "16">;
+def VST4q32 : VST4D<0b0001, {1,0,?,?}, "32">;
+def VST4q8_UPD : VST4DWB<0b0001, {0,0,?,?}, "8">;
+def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">;
+def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">;
+
+// Pseudo counterparts taking a QQQQPR source.
+def VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>;
+def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>;
+def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>;
+
+def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+
+} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
+
+// Classes for VST*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+//
+// Each class takes the source register (QPR, QQPR or QQQQPR) plus a
+// nohash_imm:$lane operand. The "WB" variants add an am6offset:$offset input
+// and a GPR:$wb result tied to $addr.addr.
+class VSTQLNPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs),
+ (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane),
+ itin, "">;
+class VSTQLNWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+ nohash_imm:$lane),
+ itin, "$addr.addr = $wb">;
+class VSTQQLNPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs),
+ (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane),
+ itin, "">;
+class VSTQQLNWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+ nohash_imm:$lane),
+ itin, "$addr.addr = $wb">;
+class VSTQQQQLNPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs),
+ (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane),
+ itin, "">;
+class VSTQQQQLNWBPseudo<InstrItinClass itin>
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+ nohash_imm:$lane),
+ itin, "$addr.addr = $wb">;
+
+// VST1LN : Vector Store (single element from one lane)
+// Selects on StoreOp applied to ExtractOp of lane $lane from a Ty-typed DPR
+// value, stored to AddrMode:$Rn. Rm is hard-wired to 0b1111; the lane and
+// alignment bits of Inst are filled in by each instantiation.
+class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+ PatFrag StoreOp, SDNode ExtractOp, Operand AddrMode>
+ : NLdStLn<1, 0b00, op11_8, op7_4,
+ (outs),
+ (ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane),
+ IIC_VST1ln, "vst1", Dt,
+ "\\{$Vd[$lane]\\}, $Rn", "",
+ [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> {
+ let DecoderMethod = "DecodeVST1LN";
+ let Rm = 0b1111;
+}
+// VST1QLNPseudo: a VSTQLNPseudo (itinerary IIC_VST1ln) whose selection
+// pattern stores lane $lane extracted from a Ty-typed QPR value to
+// addrmode6:$addr.
+class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+ : VSTQLNPseudo<IIC_VST1ln> {
+ let Pattern =
+ [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), addrmode6:$addr)];
+}
+
+// VST1LN instantiations. The '?' bits of op7_4 are filled by the Inst{...}
+// overrides: lane index in the high bits, and Rn alignment bits for the 16-
+// and 32-bit forms.
+def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
+ NEONvgetlaneu, addrmode6> {
+ let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
+ NEONvgetlaneu, addrmode6> {
+ let Inst{7-6} = lane{1-0};
+ let Inst{4} = Rn{4};
+}
+
+def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
+ addrmode6oneL32> {
+ let Inst{7} = lane{0};
+ let Inst{5-4} = Rn{5-4};
+}
+
+// Q-register pseudo forms carrying the matching selection patterns.
+def VST1LNq8Pseudo : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>;
+def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>;
+def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>;
+
+// Floating-point lane stores reuse the 32-bit integer instructions.
+def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
+ (VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
+ (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+// ...with address register writeback:
+// VST1LNWB: "vst1" single-lane store that also returns GPR:$wb, tied to
+// $Rn.addr. The selection pattern sets $wb from StoreOp applied to the
+// extracted lane, the address operand and the am6offset:$Rm offset.
+// The address-operand template parameter is spelled AddrMode, matching VST1LN
+// (template arguments are positional, so instantiations are unaffected).
+class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+ PatFrag StoreOp, SDNode ExtractOp, Operand AddrMode>
+ : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+ (ins AddrMode:$Rn, am6offset:$Rm,
+ DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt,
+ "\\{$Vd[$lane]\\}, $Rn$Rm",
+ "$Rn.addr = $wb",
+ [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
+ AddrMode:$Rn, am6offset:$Rm))]> {
+ let DecoderMethod = "DecodeVST1LN";
+}
+// VST1QLNWBPseudo: a VSTQLNWBPseudo (itinerary IIC_VST1lnu) whose selection
+// pattern sets $wb from StoreOp applied to lane $lane of a Ty-typed QPR
+// value, addrmode6:$addr and am6offset:$offset.
+class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+ : VSTQLNWBPseudo<IIC_VST1lnu> {
+ let Pattern =
+ [(set GPR:$wb,
+ (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+ addrmode6:$addr, am6offset:$offset))];
+}
+
+// Writeback VST1LN instantiations, using the post-indexed store fragments
+// (post_truncsti8, post_truncsti16, post_store). Lane and alignment bits are
+// filled in the same way as for the non-writeback defs.
+def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
+ NEONvgetlaneu, addrmode6> {
+ let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
+ NEONvgetlaneu, addrmode6> {
+ let Inst{7-6} = lane{1-0};
+ let Inst{4} = Rn{4};
+}
+def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
+ extractelt, addrmode6oneL32> {
+ let Inst{7} = lane{0};
+ let Inst{5-4} = Rn{5-4};
+}
+
+// Q-register pseudo forms carrying the matching selection patterns.
+def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>;
+def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>;
+def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>;
+
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
+
+// VST2LN : Vector Store (single 2-element structure from one lane)
+// Takes two DPR sources printed as "{$Vd[$lane], $src2[$lane]}". Rm is
+// hard-wired to 0b1111 and Inst{4} is taken from Rn{4}; no selection pattern
+// is attached.
+class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdStLn<1, 0b00, op11_8, op7_4,
+ (outs),
+ (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane),
+ IIC_VST2ln, "vst2", Dt,
+ "\\{$Vd[$lane], $src2[$lane]\\}, $Rn",
+ "", []> {
+ let DecoderMethod = "DecodeVST2LN";
+ let Inst{4} = Rn{4};
+ let Rm = 0b1111;
+}
+
+// VST2LN instantiations. The lane index fills the high bits of Inst{7-4};
+// Inst{4} = Rn{4} comes from the VST2LN class.
+def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8"> {
+ let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16"> {
+ let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> {
+ let Inst{7} = lane{0};
+}
+
+// Pseudo counterparts taking a QPR source.
+def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+
+// ...with double-spaced registers:
+// Same op11_8 values as the d16/d32 forms, with op7_4 bit 5 (16-bit) or bit 6
+// (32-bit) set to 1. Inst{4} = Rn{4} is inherited from the VST2LN class, so it
+// is not repeated here.
+def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> {
+ let Inst{7-6} = lane{1-0};
+}
+def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> {
+ let Inst{7} = lane{0};
+}
+
+// Pseudo counterparts taking a QQPR source.
+def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+
+// ...with address register writeback:
+// VST2LNWB adds an am6offset:$Rm input and returns GPR:$wb tied to $Rn.addr.
+// Inst{4} is taken from Rn{4}; no selection pattern is attached.
+class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdStLn<1, 0b00, op11_8, op7_4,
+ (outs GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm,
+ DPR:$Vd, DPR:$src2, nohash_imm:$lane),
+ IIC_VST2lnu, "vst2", Dt,
+ "\\{$Vd[$lane], $src2[$lane]\\}, $Rn$Rm",
+ "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVST2LN";
+ let Inst{4} = Rn{4};
+}
+
+// Writeback VST2LN instantiations; bit patterns mirror the non-writeback
+// defs, and Inst{4} = Rn{4} comes from the VST2LNWB class.
+def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8"> {
+ let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16"> {
+ let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> {
+ let Inst{7} = lane{0};
+}
+
+// Pseudo counterparts taking a QPR source.
+def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+
+// "q" variants: op7_4 bit 5 (16-bit) or bit 6 (32-bit) is 1.
+def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> {
+ let Inst{7-6} = lane{1-0};
+}
+def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> {
+ let Inst{7} = lane{0};
+}
+
+// Pseudo counterparts taking a QQPR source.
+def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+
+// VST3LN : Vector Store (single 3-element structure from one lane)
+// Takes three DPR sources printed as
+// "{$Vd[$lane], $src2[$lane], $src3[$lane]}". Rm is hard-wired to 0b1111; no
+// Inst bit is taken from Rn here, and no selection pattern is attached.
+class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+ : NLdStLn<1, 0b00, op11_8, op7_4,
+ (outs),
+ (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3,
+ nohash_imm:$lane),
+ IIC_VST3ln, "vst3", Dt,
+ "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> {
+ let DecoderMethod = "DecodeVST3LN";
+ let Rm = 0b1111;
+}
+
+def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8"> { // "8" variant: three-bit lane index in Inst{7-5}; last op7_4 element fixed to 0
+ let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16"> { // "16" variant: two-bit lane index in Inst{7-6}
+ let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> { // "32" variant: one-bit lane index in Inst{7}
+ let Inst{7} = lane{0};
+}
+
+def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>; // Pseudo defs named after the VST3LNd stores; VSTQQLNPseudo is declared outside this chunk.
+def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>; // all three use the IIC_VST3ln itinerary
+
+// ...with double-spaced registers (same opcodes as the d-forms; one op7_4 bit is 1 instead of 0):
+def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> { // VST3LNd16 passes {?,?,0,0}
+ let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32"> { // VST3LNd32 passes {?,0,0,0}
+ let Inst{7} = lane{0};
+}
+
+def VST3LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>; // Pseudo defs for the double-spaced forms; base is VSTQQQQLNPseudo (the d-form pseudos use VSTQQLNPseudo).
+def VST3LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>;
+
+// ...with address register writeback:
+class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> // Lane vst3 with an extra GPR result $wb and an am6offset:$Rm input.
+ : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm,
+ DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane),
+ IIC_VST3lnu, "vst3", Dt,
+ "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm",
+ "$Rn.addr = $wb", []> { // constraint ties the address operand to the $wb result; the pattern list is empty
+ let DecoderMethod = "DecodeVST3LN"; // same decoder method name as the non-writeback VST3LN class
+}
+
+def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8"> { // writeback "8" variant: three-bit lane index in Inst{7-5}
+ let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16"> { // writeback "16" variant: two-bit lane index in Inst{7-6}
+ let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> { // writeback "32" variant: one-bit lane index in Inst{7}
+ let Inst{7} = lane{0};
+}
+
+def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; // Pseudo defs for the writeback d-forms; VSTQQLNWBPseudo is declared outside this chunk.
+def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; // all three use the IIC_VST3lnu itinerary
+
+def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> { // writeback q-form: {?,?,1,0} where VST3LNd16_UPD passes {?,?,0,0}
+ let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> { // writeback q-form: {?,1,0,0} where VST3LNd32_UPD passes {?,0,0,0}
+ let Inst{7} = lane{0};
+}
+
+def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>; // Pseudo defs for the writeback q-forms; base is VSTQQQQLNWBPseudo (declared outside this chunk).
+def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+
+// VST4LN : Vector Store (single 4-element structure from one lane)
+class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> // Four DPR sources plus a lane immediate; no results.
+ : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+ (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4,
+ nohash_imm:$lane), IIC_VST4ln, "vst4", Dt,
+ "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn",
+ "", []> { // no constraint string, empty pattern list
+ let Rm = 0b1111; // Rm is a constant here; VST4LNWB takes it as an am6offset operand instead
+ let Inst{4} = Rn{4}; // encoding bit 4 is taken from bit 4 of the $Rn operand value
+ let DecoderMethod = "DecodeVST4LN";
+}
+
+def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8"> { // "8" variant: three-bit lane index in Inst{7-5}
+ let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16"> { // "16" variant: two-bit lane index in Inst{7-6}
+ let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> { // "32" variant: one-bit lane index in Inst{7}
+ let Inst{7} = lane{0};
+ let Inst{5} = Rn{5}; // only the "32" variants also take Inst{5} from the $Rn operand value
+}
+
+def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>; // Pseudo defs named after the VST4LNd stores; VSTQQLNPseudo is declared outside this chunk.
+def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>; // all three use the IIC_VST4ln itinerary
+
+// ...with double-spaced registers (same opcodes as the d-forms; one op7_4 bit is 1 instead of 0):
+def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> { // VST4LNd16 passes {?,?,0,?}
+ let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> { // VST4LNd32 passes {?,0,?,?}
+ let Inst{7} = lane{0};
+ let Inst{5} = Rn{5}; // as in VST4LNd32, Inst{5} comes from the $Rn operand value
+}
+
+def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>; // Pseudo defs for the double-spaced forms; base is VSTQQQQLNPseudo (the d-form pseudos use VSTQQLNPseudo).
+def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+
+// ...with address register writeback:
+class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> // Lane vst4 with an extra GPR result $wb and an am6offset:$Rm input.
+ : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+ (ins addrmode6:$Rn, am6offset:$Rm,
+ DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
+ IIC_VST4lnu, "vst4", Dt,
+ "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm",
+ "$Rn.addr = $wb", []> { // constraint ties the address operand to the $wb result; the pattern list is empty
+ let Inst{4} = Rn{4}; // encoding bit 4 is taken from bit 4 of the $Rn operand value
+ let DecoderMethod = "DecodeVST4LN"; // same decoder method name as the non-writeback VST4LN class
+}
+
+def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8"> { // writeback "8" variant: three-bit lane index in Inst{7-5}
+ let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16"> { // writeback "16" variant: two-bit lane index in Inst{7-6}
+ let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> { // writeback "32" variant: one-bit lane index in Inst{7}
+ let Inst{7} = lane{0};
+ let Inst{5} = Rn{5}; // only the "32" variants also take Inst{5} from the $Rn operand value
+}
+
+def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; // Pseudo defs for the writeback d-forms; VSTQQLNWBPseudo is declared outside this chunk.
+def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; // all three use the IIC_VST4lnu itinerary
+
+def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> { // writeback q-form: {?,?,1,?} where VST4LNd16_UPD passes {?,?,0,?}
+ let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> { // writeback q-form: {?,1,?,?} where VST4LNd32_UPD passes {?,0,?,?}
+ let Inst{7} = lane{0};
+ let Inst{5} = Rn{5}; // as in VST4LNd32_UPD, Inst{5} comes from the $Rn operand value
+}
+
+def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>; // Pseudo defs for the writeback q-forms; base is VSTQQQQLNWBPseudo (declared outside this chunk).
+def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+
+} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
+
+// Use vld1/vst1 for unaligned f64 load / store
+def : Pat<(f64 (hword_alignedload addrmode6:$addr)), // hword-aligned f64: selects the d16 forms, little-endian only
+ (VLD1d16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (f64 DPR:$value), addrmode6:$addr),
+ (VST1d16 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (byte_alignedload addrmode6:$addr)), // byte-aligned f64: selects the d8 forms, little-endian only
+ (VLD1d8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (f64 DPR:$value), addrmode6:$addr),
+ (VST1d8 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (non_word_alignedload addrmode6:$addr)), // big-endian: a single pair of patterns selecting the d64 forms
+ (VLD1d64 addrmode6:$addr)>, Requires<[IsBE]>;
+def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
+ (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+
+// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64
+// load / store if it's legal. Only the dword-aligned pair below has no Requires<> predicate.
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)), // dword-aligned: q64 forms, no endianness predicate
+ (VLD1q64 addrmode6:$addr)>;
+def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q64 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (word_alignedload addrmode6:$addr)), // word-aligned: q32 forms, little-endian only
+ (VLD1q32 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q32 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)), // hword-aligned: q16 forms, little-endian only
+ (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)), // byte-aligned: q8 forms, little-endian only
+ (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+
+//===----------------------------------------------------------------------===//
+// NEON pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Extract D sub-registers of Q registers.
+// i8 form: the immediate divided by 8, offset from ARM::dsub_0, becomes an
+// i32 target constant.
+def DSubReg_i8_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ const uint64_t DSubIdx = ARM::dsub_0 + N->getZExtValue() / 8;
+ return CurDAG->getTargetConstant(DSubIdx, SDLoc(N), MVT::i32);
+}]>;
+// i16 form: the immediate divided by 4, offset from ARM::dsub_0, becomes an
+// i32 target constant.
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ const uint64_t DSubIdx = ARM::dsub_0 + N->getZExtValue() / 4;
+ return CurDAG->getTargetConstant(DSubIdx, SDLoc(N), MVT::i32);
+}]>;
+// i32 form: the immediate divided by 2, offset from ARM::dsub_0, becomes an
+// i32 target constant.
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ const uint64_t DSubIdx = ARM::dsub_0 + N->getZExtValue() / 2;
+ return CurDAG->getTargetConstant(DSubIdx, SDLoc(N), MVT::i32);
+}]>;
+// f64 form: the immediate is used undivided as the offset from ARM::dsub_0.
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ const uint64_t DSubIdx = ARM::dsub_0 + N->getZExtValue();
+ return CurDAG->getTargetConstant(DSubIdx, SDLoc(N), MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+// The immediate is used directly as the offset from ARM::ssub_0.
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+ assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+ const uint64_t SSubIdx = ARM::ssub_0 + N->getZExtValue();
+ return CurDAG->getTargetConstant(SSubIdx, SDLoc(N), MVT::i32);
+}]>;
+
+// Translate lane numbers from Q registers to D subregs.
+// i8 form: keep the low three bits of the lane number.
+def SubReg_i8_lane : SDNodeXForm<imm, [{
+ const uint64_t LaneInDReg = N->getZExtValue() & 7;
+ return CurDAG->getTargetConstant(LaneInDReg, SDLoc(N), MVT::i32);
+}]>;
+// i16 form: keep the low two bits of the lane number.
+def SubReg_i16_lane : SDNodeXForm<imm, [{
+ const uint64_t LaneInDReg = N->getZExtValue() & 3;
+ return CurDAG->getTargetConstant(LaneInDReg, SDLoc(N), MVT::i32);
+}]>;
+// i32 form: keep the lowest bit of the lane number.
+def SubReg_i32_lane : SDNodeXForm<imm, [{
+ const uint64_t LaneInDReg = N->getZExtValue() & 1;
+ return CurDAG->getTargetConstant(LaneInDReg, SDLoc(N), MVT::i32);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Classes
+//===----------------------------------------------------------------------===//
+
+// Basic 2-register operations: double- and quad-register.
+class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // D form: one DPR in, one DPR out, IIC_VUNAD itinerary
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), // literal 0 before op4 (N2VQ passes 1)
+ (ins DPR:$Vm), IIC_VUNAD, OpcodeStr, Dt,"$Vd, $Vm", "",
+ [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm))))]>; // selects OpNode applied to $Vm
+class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Q form: same shape on QPR, IIC_VUNAQ itinerary
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+ (ins QPR:$Vm), IIC_VUNAQ, OpcodeStr, Dt,"$Vd, $Vm", "",
+ [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm))))]>;
+
+// Basic 2-register intrinsics, both double- and quad-register.
+class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // D form; unlike N2VD the itinerary is a parameter
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> // operator is an SDPatternOperator rather than an SDNode
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+ (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Q form: same shape on QPR, literal 1 before op4
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+ (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as above, but not predicated. These derive from N2Vnp and pass no asm operand string or constraint string.
+class N2VDIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, // D form: forwards a literal 0 after op7
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2Vnp<op19_18, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm),
+ itin, OpcodeStr, Dt,
+ [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+
+class N2VQIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, // Q form: forwards a literal 1 after op7
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2Vnp<op19_18, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm),
+ itin, OpcodeStr, Dt,
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Similar to N2VQIntnp with some more encoding bits exposed (crypto).
+class N2VQIntXnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6, // op6 is a parameter here
+ bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2Vnp<op19_18, op17_16, op10_8, op7, op6, (outs QPR:$Vd), (ins QPR:$Vm), // op6 goes where N2VQIntnp passes a literal 1
+ itin, OpcodeStr, Dt,
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as N2VQIntXnp but with Vd as a src register.
+class N2VQIntX2np<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6, // two-input form: IntOp($src, $Vm)
+ bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2Vnp<op19_18, op17_16, op10_8, op7, op6,
+ (outs QPR:$Vd), (ins QPR:$src, QPR:$Vm),
+ itin, OpcodeStr, Dt,
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vm))))]> {
+ let Constraints = "$src = $Vd"; // ties the extra input $src to the result $Vd
+}
+
+// Narrow 2-register operations.
+class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // QPR input, DPR result; op6 is a parameter
+ bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyD, ValueType TyQ, SDNode OpNode> // TyD types the result, TyQ the operand
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+ (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set DPR:$Vd, (TyD (OpNode (TyQ QPR:$Vm))))]>;
+
+// Narrow 2-register intrinsics.
+class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // as N2VN, but the operator is an SDPatternOperator
+ bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyD, ValueType TyQ, SDPatternOperator IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+ (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>; // QPR operand typed TyQ, DPR result typed TyD
+
+// Long 2-register operations (currently only used for VMOVL).
+class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // DPR input, QPR result (the reverse of N2VN)
+ bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode OpNode> // note the TyQ-then-TyD parameter order
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+ (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vm))))]>;
+
+// Long 2-register intrinsics.
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // as N2VL, but the operator is an SDPatternOperator
+ bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+ (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>; // DPR operand typed TyD, QPR result typed TyQ
+
+// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
+class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt> // D form: itinerary fixed to IIC_VPERMD
+ : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$Vd, DPR:$Vm), // two results
+ (ins DPR:$src1, DPR:$src2), IIC_VPERMD,
+ OpcodeStr, Dt, "$Vd, $Vm",
+ "$src1 = $Vd, $src2 = $Vm", []>; // each input is tied to one result; the pattern list is empty
+class N2VQShuffle<bits<2> op19_18, bits<5> op11_7, // Q form: itinerary is a parameter
+ InstrItinClass itin, string OpcodeStr, string Dt>
+ : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$Vd, QPR:$Vm),
+ (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$Vd, $Vm",
+ "$src1 = $Vd, $src2 = $Vm", []>;
+
+// Basic 3-register operations: double- and quad-register.
+class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // D form: two DPR inputs, one DPR result
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4, // literal 0 before op4 (N3VQ passes 1)
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable; // commutability is decided by the instantiating def
+}
+// Same as N3VD but no data type.
+class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // no Dt parameter; derives from N3VX
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy,
+ SDNode OpNode, bit Commutable>
+ : N3VX<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, "$Vd, $Vn, $Vm", "",
+ [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable; // commutability is decided by the instantiating def
+}
+
+class N3VDSL<bits<2> op21_20, bits<4> op11_8, // D form whose second operand is one lane of $Vm duplicated via NEONvduplane
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType Ty, SDNode ShOp>
+ : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), // $Vm is restricted to DPR_VFP2
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set (Ty DPR:$Vd),
+ (Ty (ShOp (Ty DPR:$Vn),
+ (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = 0; // never commutable: the two operands have different forms
+}
+class N3VDSL16<bits<2> op21_20, bits<4> op11_8, // 16-bit-lane variant of N3VDSL; no itin parameter
+ string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
+ : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), // $Vm is restricted to DPR_8
+ NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane","", // itinerary fixed to IIC_VMULi16D
+ [(set (Ty DPR:$Vd),
+ (Ty (ShOp (Ty DPR:$Vn),
+ (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = 0;
+}
+
+class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Q form of N3VD: QPR operands, literal 1 before op4
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable; // commutability is decided by the instantiating def
+}
+class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Q form of N3VDX: no Dt parameter, derives from N3VX
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+ : N3VX<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable; // commutability is decided by the instantiating def
+}
+class N3VQSL<bits<2> op21_20, bits<4> op11_8, // Q form of N3VDSL: QPR result and $Vn, lane source is still a D register
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDNode ShOp> // ResTy types the Q values, OpTy the D-register lane source
+ : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set (ResTy QPR:$Vd),
+ (ResTy (ShOp (ResTy QPR:$Vn),
+ (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ imm:$lane)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = 0;
+}
+class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, // 16-bit-lane variant of N3VQSL; no itin parameter
+ ValueType ResTy, ValueType OpTy, SDNode ShOp>
+ : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), // $Vm is restricted to DPR_8
+ NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane", "", // itinerary fixed to IIC_VMULi16Q
+ [(set (ResTy QPR:$Vd),
+ (ResTy (ShOp (ResTy QPR:$Vn),
+ (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+ imm:$lane)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = 0;
+}
+
+// Basic 3-register intrinsics, both double- and quad-register.
+class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // D form; unlike N3VD the Format is a parameter
+ Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, // f is forwarded to N3V
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable; // commutability is decided by the instantiating def
+}
+
+// 3-register D-register intrinsic built on N3Vnp (the "np" counterpart of
+// N3VDInt; compare the "not predicated" N2V*Intnp classes).
+//   op27_23, op21_20, op11_8, op6, op4 - encoding fields forwarded to N3Vnp.
+//   f    - instruction Format, forwarded to N3Vnp exactly as N3VQIntnp does.
+//          It used to be dropped in favour of a hard-coded N3RegFrm, so
+//          instantiations that pass N3RegFrm are unaffected.
+//   itin, OpcodeStr, Dt - itinerary, mnemonic and data-type suffix.
+//   ResTy, OpTy, IntOp  - pattern: $Vd = ResTy(IntOp(OpTy $Vn, OpTy $Vm)).
+//   Commutable - accepted for signature parity but not referenced here.
+//          NOTE(review): confirm whether isCommutable should be set from it.
+class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+ bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+ string Dt, ValueType ResTy, ValueType OpTy,
+ SDPatternOperator IntOp, bit Commutable>
+ : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, OpcodeStr, Dt,
+ [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
+class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // by-lane D intrinsic: second operand is a duplicated lane of $Vm
+ string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
+ : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set (Ty DPR:$Vd),
+ (Ty (IntOp (Ty DPR:$Vn),
+ (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+ imm:$lane)))))]> {
+ let isCommutable = 0; // unlike N3VDSL, no TwoOperandAliasConstraint is set in this class
+}
+
+class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // 16-bit-lane variant of N3VDIntSL
+ string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
+ : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), // $Vm is restricted to DPR_8
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set (Ty DPR:$Vd),
+ (Ty (IntOp (Ty DPR:$Vn),
+ (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // like N3VDInt but with $Vm and $Vn swapped
+ Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin, // $Vm comes first in ins, asm string and pattern
+ OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+ [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> {
+ let TwoOperandAliasConstraint = "$Vm = $Vd"; // the alias ties $Vm (not $Vn) to $Vd
+ let isCommutable = 0;
+}
+
+class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Q form of N3VDInt: QPR operands, literal 1 before op4
+ Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable; // commutability is decided by the instantiating def
+}
+
+class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, // Q-register intrinsic built on N3Vnp
+ bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+ string Dt, ValueType ResTy, ValueType OpTy,
+ SDPatternOperator IntOp, bit Commutable> // NOTE(review): Commutable is not referenced in this class - confirm intent
+ : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+ (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt, // no asm operand string or constraint string is passed
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
+// Same as N3VQIntnp but with Vd as a src register.
+class N3VQInt3np<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, // three-input form: IntOp($src, $Vn, $Vm)
+ bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+ string Dt, ValueType ResTy, ValueType OpTy,
+ SDPatternOperator IntOp, bit Commutable> // NOTE(review): Commutable is not referenced in this class - confirm intent
+ : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+ (outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm),
+ f, itin, OpcodeStr, Dt,
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vn),
+ (OpTy QPR:$Vm))))]> {
+ let Constraints = "$src = $Vd"; // ties the extra input $src to the result $Vd
+}
+
+class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // Q form of N3VDIntSL: lane source is still a D register
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> // ResTy types the Q values, OpTy the D-register lane source
+ : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set (ResTy QPR:$Vd),
+ (ResTy (IntOp (ResTy QPR:$Vn),
+ (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // 16-bit-lane variant of N3VQIntSL
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), // $Vm is restricted to DPR_8
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set (ResTy QPR:$Vd),
+ (ResTy (IntOp (ResTy QPR:$Vn),
+ (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Q form of N3VDIntSh: $Vm and $Vn swapped
+ Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin, // $Vm comes first in ins, asm string and pattern
+ OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> {
+ let TwoOperandAliasConstraint = "$Vm = $Vd"; // the alias ties $Vm (not $Vn) to $Vd
+ let isCommutable = 0;
+}
+
+// Multiply-Add/Sub operations: double- and quad-register.
+class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // D form: $Vd = OpNode($src1, MulOp($Vn, $Vm))
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", // $src1 is tied to $Vd and does not appear in the asm string
+ [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+ (Ty (MulOp DPR:$Vn, DPR:$Vm)))))]>;
+
+class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // by-lane N3VDMulOp: the multiplier is a duplicated lane of $Vm
+ string OpcodeStr, string Dt,
+ ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> // ShOp combines $src1 with the product
+ : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$Vd),
+ (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set (Ty DPR:$Vd),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (MulOp DPR:$Vn,
+ (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+ imm:$lane)))))))]>;
+class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // 16-bit-lane variant of N3VDMulOpSL
+ string OpcodeStr, string Dt,
+ ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp>
+ : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$Vd),
+ (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), // $Vm is restricted to DPR_8
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set (Ty DPR:$Vd),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (MulOp DPR:$Vn,
+ (Ty (NEONvduplane (Ty DPR_8:$Vm),
+ imm:$lane)))))))]>;
+
+class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Q form of N3VDMulOp: QPR operands, literal 1 before op4
+ InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty,
+ SDPatternOperator MulOp, SDPatternOperator OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+ (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>;
+class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // Q form of N3VDMulOpSL: lane source is still a D register
+ string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+ SDPatternOperator MulOp, SDPatternOperator ShOp> // ResTy types the Q values, OpTy the D-register lane source
+ : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd),
+ (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set (ResTy QPR:$Vd),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (MulOp QPR:$Vn,
+ (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ imm:$lane)))))))]>;
+class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // 16-bit-lane variant of N3VQMulOpSL
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy,
+ SDPatternOperator MulOp, SDPatternOperator ShOp>
+ : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd),
+ (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), // $Vm is restricted to DPR_8
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set (ResTy QPR:$Vd),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (MulOp QPR:$Vn,
+ (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+ imm:$lane)))))))]>;
+
+// Neon Intrinsic-Op instructions (VABA): double- and quad-register.
+class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // D form: $Vd = OpNode($src1, IntOp($Vn, $Vm))
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+ (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>;
+class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Q form: same shape on QPR, literal 1 before op4
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+ (Ty (IntOp (Ty QPR:$Vn), (Ty QPR:$Vm))))))]>;
+
+// Neon 3-argument intrinsics, both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // D form: $Vd = IntOp($src1, $Vn, $Vm)
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$src1),
+ (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Q form: same shape on QPR, literal 1 before op4
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src1),
+ (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
+// Long Multiply-Add/Sub operations.
+class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // QPR accumulator/result, DPR multiplicands
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+ (TyQ (MulOp (TyD DPR:$Vn),
+ (TyD DPR:$Vm)))))]>; // MulOp takes TyD inputs and yields a TyQ value
+class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8, // by-lane N3VLMulOp; op24 is a parameter forwarded to N3VLane32
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+ : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+ (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set QPR:$Vd,
+ (OpNode (TyQ QPR:$src1),
+ (TyQ (MulOp (TyD DPR:$Vn),
+ (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),
+ imm:$lane))))))]>;
+class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8, // 16-bit-lane variant of N3VLMulOpSL
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+ : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+ (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), // $Vm is restricted to DPR_8
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set QPR:$Vd,
+ (OpNode (TyQ QPR:$src1),
+ (TyQ (MulOp (TyD DPR:$Vn),
+ (TyD (NEONvduplane (TyD DPR_8:$Vm),
+ imm:$lane))))))]>;
+
+// Long Intrinsic-Op vector operations with explicit extend (VABAL).
+class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // $Vd = OpNode($src1, ExtOp(IntOp($Vn, $Vm)))
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
+ SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+ (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn), // IntOp result is TyD; ExtOp turns it into TyQ
+ (TyD DPR:$Vm)))))))]>;
+
+// Neon Long 3-argument intrinsic. The destination register is
+// a quad-register and is also used as the first source operand register.
+class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // $Vd = IntOp($src1 : TyQ, $Vn : TyD, $Vm : TyD)
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set QPR:$Vd,
+ (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>;
+class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // by-lane N3VLInt3: third operand is a duplicated lane of $Vm
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> // ResTy types the QPR values, OpTy the DPR values
+ : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd),
+ (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set (ResTy QPR:$Vd),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (OpTy DPR:$Vn),
+ (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ imm:$lane)))))]>;
+class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, // 16-bit-lane variant of N3VLInt3SL
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd),
+ (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), // $Vm is restricted to DPR_8
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set (ResTy QPR:$Vd),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (OpTy DPR:$Vn),
+ (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+ imm:$lane)))))]>;
+
+// Narrowing 3-register intrinsics: D-register result (TyD) from IntOp on two Q-register sources (TyQ).
+class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ,
+ SDPatternOperator IntOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D, // itinerary is fixed, not a parameter
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vn), (TyQ QPR:$Vm))))]> {
+ let isCommutable = Commutable; // forwarded from the class argument
+}
+
+// Long 3-register operations: Q-register result (TyQ) from OpNode on two D-register sources (TyD).
+class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
+ let isCommutable = Commutable; // forwarded from the class argument
+}
+
+class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8, // Long operation whose second operand is NEONvduplane($Vm, $lane); N3VLane32 form.
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode OpNode>
+ : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set QPR:$Vd,
+ (TyQ (OpNode (TyD DPR:$Vn),
+ (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>;
+class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8, // Long lane operation built on N3VLane16 (DPR_8:$Vm, VectorIndex16:$lane).
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode OpNode>
+ : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set QPR:$Vd,
+ (TyQ (OpNode (TyD DPR:$Vn),
+ (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>;
+
+// Long 3-register operations with explicitly extended operands: Vd = OpNode(ExtOp(Vn), ExtOp(Vm)).
+class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
+ bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (OpNode (TyQ (ExtOp (TyD DPR:$Vn))), // each D source is extended to TyQ before OpNode
+ (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
+ let isCommutable = Commutable; // forwarded from the class argument
+}
+
+// Long 3-register intrinsics with explicit extend (VABDL): Vd = ExtOp(IntOp(Vn, Vm)).
+class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
+ bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn), // IntOp yields TyD, which ExtOp widens to TyQ
+ (TyD DPR:$Vm))))))]> {
+ let isCommutable = Commutable; // forwarded from the class argument
+}
+
+// Long 3-register intrinsics: Q-register result (TyQ) from IntOp on two D-register sources (TyD).
+class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
+ let isCommutable = Commutable; // forwarded from the class argument
+}
+
+// Same as N3VLInt, but not predicated (built on N3Vnp).
+// ResTy is the Q-register result type and OpTy the type of both D-register
+// sources.
+class N3VLIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+ bit op4, InstrItinClass itin, string OpcodeStr,
+ string Dt, ValueType ResTy, ValueType OpTy,
+ SDPatternOperator IntOp, bit Commutable>
+ : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+ // Honor the Commutable argument, as the predicated N3VLInt class does;
+ // previously the argument was accepted but ignored.
+ let isCommutable = Commutable;
+}
+
+class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // Long intrinsic whose second operand is NEONvduplane($Vm, $lane); N3VLane32 form.
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set (ResTy QPR:$Vd),
+ (ResTy (IntOp (OpTy DPR:$Vn),
+ (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ imm:$lane)))))]>;
+class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, // Long lane intrinsic built on N3VLane16 (DPR_8:$Vm, VectorIndex16:$lane).
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+ [(set (ResTy QPR:$Vd),
+ (ResTy (IntOp (OpTy DPR:$Vn),
+ (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+ imm:$lane)))))]>;
+
+// Wide 3-register operations: Vd = OpNode(Vn, ExtOp(Vm)) with Q-register $Vn and D-register $Vm.
+class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
+ SDNode OpNode, SDNode ExtOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$Vd), (ins QPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VSUBiD, // itinerary is fixed, not a parameter
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn),
+ (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable; // forwarded from the class argument
+}
+
+// Pairwise long 2-register intrinsics, both double- and quad-register.
+class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // D-register form: passes 0 as the argument before op4 to N2V
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+ (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", // itinerary is fixed, not a parameter
+ [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Q-register form of N2VDPLInt: passes 1 as the argument before op4 to N2V
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+ (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", // NOTE(review): same itinerary as the D form (IIC_VSHLiD) - confirm this is intended
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Pairwise long 2-register accumulate intrinsics,
+// both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // D-register form
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD,
+ OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set DPR:$Vd, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$Vm))))]>; // $src1 has the result type, $Vm the operand type
+class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Q-register form of N2VDPLInt2 (uses IIC_VPALiQ)
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ,
+ OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set QPR:$Vd, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$Vm))))]>;
+
+// Shift by immediate: Vd = OpNode(Vm, SIMM),
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in { // applies to both classes in this block
+class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+ Format f, InstrItinClass itin, Operand ImmTy,
+ string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode>
+ : N2VImm<op24, op23, op11_8, op7, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), f, itin,
+ OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+ [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>; // shift amount is matched as an i32 immediate
+class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+ Format f, InstrItinClass itin, Operand ImmTy,
+ string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode>
+ : N2VImm<op24, op23, op11_8, op7, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin,
+ OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+ [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>;
+}
+
+// Long shift by immediate: Q-register result (ResTy) from a D-register source (OpTy).
+class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+ string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, Operand ImmTy,
+ SDPatternOperator OpNode>
+ : N2VImm<op24, op23, op11_8, op7, op6, op4,
+ (outs QPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), N2RegVShLFrm,
+ IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", // format and itinerary are fixed, not parameters
+ [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm), ImmTy:$SIMM)))]>; // the pattern matches the immediate through ImmTy rather than (i32 imm)
+
+// Narrow shift by immediate: D-register result (ResTy) from a Q-register source (OpTy).
+class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, Operand ImmTy,
+ SDPatternOperator OpNode>
+ : N2VImm<op24, op23, op11_8, op7, op6, op4,
+ (outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin, // format is fixed to N2RegVShRFrm
+ OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+ [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm),
+ (i32 ImmTy:$SIMM))))]>;
+
+// Shift right by immediate and accumulate: Vd = add(src1, ShOp(Vm, SIMM)),
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in { // applies to both classes in this block
+class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+ Operand ImmTy, string OpcodeStr, string Dt,
+ ValueType Ty, SDNode ShOp>
+ : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+ (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD,
+ OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", // accumulator $src1 is tied to $Vd
+ [(set DPR:$Vd, (Ty (add DPR:$src1,
+ (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>;
+class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+ Operand ImmTy, string OpcodeStr, string Dt,
+ ValueType Ty, SDNode ShOp>
+ : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+ (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD, // NOTE(review): same itinerary as the D form (IIC_VPALiD) - confirm this is intended
+ OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+ [(set QPR:$Vd, (Ty (add QPR:$src1,
+ (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>;
+}
+
+// Shift by immediate and insert: Vd = ShOp(src1, Vm, SIMM),
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in { // applies to both classes in this block
+class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+ Operand ImmTy, Format f, string OpcodeStr, string Dt,
+ ValueType Ty,SDNode ShOp>
+ : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+ (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiD,
+ OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>;
+class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+ Operand ImmTy, Format f, string OpcodeStr, string Dt,
+ ValueType Ty,SDNode ShOp>
+ : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+ (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ,
+ OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", // $src1 is tied to $Vd
+ [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>;
+}
+
+// Convert, with fractional bits immediate,
+// both double- and quad-register.
+class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // D-register convert: Vd = IntOp(Vm, SIMM)
+ string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+ SDPatternOperator IntOp>
+ : N2VImm<op24, op23, op11_8, op7, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, // immediate operand type is fixed to neon_vcvt_imm32
+ IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+ [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>;
+class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // Q-register convert: Vd = IntOp(Vm, SIMM); uses IIC_VUNAQ
+ string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+ SDPatternOperator IntOp>
+ : N2VImm<op24, op23, op11_8, op7, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, // immediate operand type is fixed to neon_vcvt_imm32
+ IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>;
+
+//===----------------------------------------------------------------------===//
+// Multiclasses
+//===----------------------------------------------------------------------===//
+
+// Abbreviations used in multiclass suffixes:
+// Q = quarter int (8 bit) elements
+// H = half int (16 bit) elements
+// S = single int (32 bit) elements
+// D = double int (64 bit) elements
+
+// Neon 2-register vector operations and intrinsics.
+
+// Neon 2-register comparisons (single source register; OpNode is applied to $Vm only).
+// source operand element sizes of 8, 16 and 32 bits, plus f32 and f16 variants:
+multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op4, string opc, string Dt,
+ string asm, SDNode OpNode> {
+ // 64-bit vector types.
+ def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+ opc, !strconcat(Dt, "8"), asm, "",
+ [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>;
+ def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+ opc, !strconcat(Dt, "16"), asm, "",
+ [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>;
+ def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+ opc, !strconcat(Dt, "32"), asm, "",
+ [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>;
+ def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+ opc, "f32", asm, "", // data type suffix is the literal "f32"; Dt is not used
+ [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> { // FP source, integer-vector result
+ let Inst{10} = 1; // overwrite F = 1
+ }
+ def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+ opc, "f16", asm, "",
+ [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>,
+ Requires<[HasNEON,HasFullFP16]> { // f16 variant is additionally gated on HasFullFP16
+ let Inst{10} = 1; // overwrite F = 1
+ }
+
+ // 128-bit vector types.
+ def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+ opc, !strconcat(Dt, "8"), asm, "",
+ [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>;
+ def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+ opc, !strconcat(Dt, "16"), asm, "",
+ [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>;
+ def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+ opc, !strconcat(Dt, "32"), asm, "",
+ [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>;
+ def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+ opc, "f32", asm, "",
+ [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> { // FP source, integer-vector result
+ let Inst{10} = 1; // overwrite F = 1
+ }
+ def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+ opc, "f16", asm, "",
+ [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>,
+ Requires<[HasNEON,HasFullFP16]> { // f16 variant is additionally gated on HasFullFP16
+ let Inst{10} = 1; // overwrite F = 1
+ }
+}
+
+
+// Neon 2-register vector intrinsics (result type equals operand type),
+// element sizes of 8, 16 and 32 bits:
+multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op4,
+ InstrItinClass itinD, InstrItinClass itinQ, // itinD for the N2VDInt defs, itinQ for the N2VQInt defs
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+ // 64-bit vector types.
+ def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+ def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>;
+ def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>;
+
+ // 128-bit vector types.
+ def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>;
+ def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>;
+ def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>;
+}
+
+
+// Neon Narrowing 2-register vector operations,
+// source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op6, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ SDNode OpNode> {
+ def v8i8 : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, // defs are named after the result type; the Dt suffix is the source element size
+ itin, OpcodeStr, !strconcat(Dt, "16"),
+ v8i8, v8i16, OpNode>;
+ def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+ itin, OpcodeStr, !strconcat(Dt, "32"),
+ v4i16, v4i32, OpNode>;
+ def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+ itin, OpcodeStr, !strconcat(Dt, "64"),
+ v2i32, v2i64, OpNode>;
+}
+
+// Neon Narrowing 2-register vector intrinsics,
+// source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op6, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ SDPatternOperator IntOp> {
+ def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, // defs are named after the result type; the Dt suffix is the source element size
+ itin, OpcodeStr, !strconcat(Dt, "16"),
+ v8i8, v8i16, IntOp>;
+ def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+ itin, OpcodeStr, !strconcat(Dt, "32"),
+ v4i16, v4i32, IntOp>;
+ def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+ itin, OpcodeStr, !strconcat(Dt, "64"),
+ v2i32, v2i64, IntOp>;
+}
+
+
+// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
+// source operand element sizes of 8, 16 and 32 bits:
+multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
+ string OpcodeStr, string Dt, SDNode OpNode> {
+ def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD, // defs are named after the result type; the three 2-bit fields after op24_23 are fixed per def
+ OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>;
+ def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+ OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+ def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+ OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+
+// Neon 3-register vector operations.
+
+// First with only element sizes of 8, 16 and 32 bits:
+multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32, // the *16 itineraries serve the 8- and 16-bit defs, the *32 ones the 32-bit defs
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt,
+ SDNode OpNode, bit Commutable = 0> {
+ // 64-bit vector types.
+ def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i8, v8i8, OpNode, Commutable>;
+ def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i16, v4i16, OpNode, Commutable>;
+ def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i32, v2i32, OpNode, Commutable>;
+
+ // 128-bit vector types.
+ def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v16i8, v16i8, OpNode, Commutable>;
+ def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v8i16, v8i16, OpNode, Commutable>;
+ def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v4i32, v4i32, OpNode, Commutable>;
+}
+
+multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> { // 16- and 32-bit defs built on the N3V*SL / N3V*SL16 classes; data type strings are the literals "i16"/"i32"
+ def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, "i16", v4i16, ShOp>;
+ def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, "i32", v2i32, ShOp>;
+ def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, "i16", v8i16, v4i16, ShOp>;
+ def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, "i32",
+ v4i32, v2i32, ShOp>;
+}
+
+// ....then also with element size 64 bits (inherits the 8/16/32-bit defs from N3V_QHS):
+multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD, InstrItinClass itinQ,
+ string OpcodeStr, string Dt,
+ SDNode OpNode, bit Commutable = 0>
+ : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ, // the same D and Q itinerary is passed for both the 16- and 32-bit slots
+ OpcodeStr, Dt, OpNode, Commutable> {
+ def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD,
+ OpcodeStr, !strconcat(Dt, "64"),
+ v1i64, v1i64, OpNode, Commutable>;
+ def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ,
+ OpcodeStr, !strconcat(Dt, "64"),
+ v2i64, v2i64, OpNode, Commutable>;
+}
+
+
+// Neon 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+ InstrItinClass itinD16, InstrItinClass itinD32, // one itinerary per register width (D/Q) and element size (16/32)
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt,
+ SDPatternOperator IntOp, bit Commutable = 0> {
+ // 64-bit vector types.
+ def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i16, v4i16, IntOp, Commutable>;
+ def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i32, v2i32, IntOp, Commutable>;
+
+ // 128-bit vector types.
+ def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v8i16, v8i16, IntOp, Commutable>;
+ def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v4i32, v4i32, IntOp, Commutable>;
+}
+multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, // same shape as N3VInt_HS but instantiates N3VDIntSh/N3VQIntSh and takes no Commutable argument
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt,
+ SDPatternOperator IntOp> {
+ // 64-bit vector types.
+ def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i16, v4i16, IntOp>;
+ def v2i32 : N3VDIntSh<op24, op23, 0b10, op11_8, op4, f, itinD32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i32, v2i32, IntOp>;
+
+ // 128-bit vector types.
+ def v8i16 : N3VQIntSh<op24, op23, 0b01, op11_8, op4, f, itinQ16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v8i16, v8i16, IntOp>;
+ def v4i32 : N3VQIntSh<op24, op23, 0b10, op11_8, op4, f, itinQ32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v4i32, v4i32, IntOp>;
+}
+
+multiclass N3VIntSL_HS<bits<4> op11_8, // 16- and 32-bit defs built on the N3V*IntSL / N3V*IntSL16 classes
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+ def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16,
+ OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>;
+ def v2i32 : N3VDIntSL<0b10, op11_8, itinD32,
+ OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp>;
+ def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16,
+ OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>; // Q forms pass two value types (v8i16, v4i16)
+ def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32,
+ OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits (inherits the 16/32-bit defs from N3VInt_HS):
+multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt,
+ SDPatternOperator IntOp, bit Commutable = 0>
+ : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+ OpcodeStr, Dt, IntOp, Commutable> {
+ def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16, // the 8-bit defs reuse the 16-bit itineraries
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i8, v8i8, IntOp, Commutable>;
+ def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v16i8, v16i8, IntOp, Commutable>;
+}
+multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, // adds the 8-bit defs on top of N3VInt_HSSh
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt,
+ SDPatternOperator IntOp>
+ : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+ OpcodeStr, Dt, IntOp> {
+ def v8i8 : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16, // the 8-bit defs reuse the 16-bit itineraries
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i8, v8i8, IntOp>;
+ def v16i8 : N3VQIntSh<op24, op23, 0b00, op11_8, op4, f, itinQ16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v16i8, v16i8, IntOp>;
+}
+
+
+// ....then also with element size of 64 bits (inherits the 8/16/32-bit defs from N3VInt_QHS):
+multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt,
+ SDPatternOperator IntOp, bit Commutable = 0>
+ : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+ OpcodeStr, Dt, IntOp, Commutable> {
+ def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32, // the 64-bit defs reuse the 32-bit itineraries
+ OpcodeStr, !strconcat(Dt, "64"),
+ v1i64, v1i64, IntOp, Commutable>;
+ def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, f, itinQ32,
+ OpcodeStr, !strconcat(Dt, "64"),
+ v2i64, v2i64, IntOp, Commutable>;
+}
+multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, // adds the 64-bit defs on top of N3VInt_QHSSh
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt,
+ SDPatternOperator IntOp>
+ : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+ OpcodeStr, Dt, IntOp> {
+ def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32, // the 64-bit defs reuse the 32-bit itineraries
+ OpcodeStr, !strconcat(Dt, "64"),
+ v1i64, v1i64, IntOp>;
+ def v2i64 : N3VQIntSh<op24, op23, 0b11, op11_8, op4, f, itinQ32,
+ OpcodeStr, !strconcat(Dt, "64"),
+ v2i64, v2i64, IntOp>;
+}
+
+// Neon Narrowing 3-register vector intrinsics,
+// source operand element sizes of 16, 32 and 64 bits:
+multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, string Dt,
+ SDPatternOperator IntOp, bit Commutable = 0> {
+ def v8i8 : N3VNInt<op24, op23, 0b00, op11_8, op4, // defs are named after the result type; the Dt suffix is the source element size
+ OpcodeStr, !strconcat(Dt, "16"),
+ v8i8, v8i16, IntOp, Commutable>;
+ def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v4i16, v4i32, IntOp, Commutable>;
+ def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4,
+ OpcodeStr, !strconcat(Dt, "64"),
+ v2i32, v2i64, IntOp, Commutable>;
+}
+
+
+// Neon Long 3-register vector operations.
+
+multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // N3VL defs for 8-, 16- and 32-bit source elements, named after the result type
+ InstrItinClass itin16, InstrItinClass itin32,
+ string OpcodeStr, string Dt,
+ SDNode OpNode, bit Commutable = 0> {
+ def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16, // itin16 serves both the 8- and 16-bit source defs
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i16, v8i8, OpNode, Commutable>;
+ def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i32, v4i16, OpNode, Commutable>;
+ def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i64, v2i32, OpNode, Commutable>;
+}
+
+multiclass N3VLSL_HS<bit op24, bits<4> op11_8, // lane forms of the long operations; unlike N3VL_QHS, defs here are named after the source type
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ SDNode OpNode> {
+ def v4i16 : N3VLSL16<op24, 0b01, op11_8, itin, OpcodeStr,
+ !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+ def v2i32 : N3VLSL<op24, 0b10, op11_8, itin, OpcodeStr,
+ !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // N3VLExt defs for 8-, 16- and 32-bit source elements, named after the result type
+ InstrItinClass itin16, InstrItinClass itin32,
+ string OpcodeStr, string Dt,
+ SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+ def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16, // itin16 serves both the 8- and 16-bit source defs
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i16, v8i8, OpNode, ExtOp, Commutable>;
+ def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i32, v4i16, OpNode, ExtOp, Commutable>;
+ def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i64, v2i32, OpNode, ExtOp, Commutable>;
+}
+
+// Neon Long 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits (source elements; defs are named after the result type):
+multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin16, InstrItinClass itin32,
+ string OpcodeStr, string Dt,
+ SDPatternOperator IntOp, bit Commutable = 0> {
+ def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i32, v4i16, IntOp, Commutable>;
+ def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i64, v2i32, IntOp, Commutable>;
+}
+
+multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, // lane forms of the long intrinsics; unlike N3VLInt_HS, defs here are named after the source type
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ SDPatternOperator IntOp> {
+ def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
+ OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+ def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
+ OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits (inherits the 16/32-bit defs from N3VLInt_HS):
+multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin16, InstrItinClass itin32,
+ string OpcodeStr, string Dt,
+ SDPatternOperator IntOp, bit Commutable = 0>
+ : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt,
+ IntOp, Commutable> {
+ def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16, // the 8-bit source def reuses itin16
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i16, v8i8, IntOp, Commutable>;
+}
+
+// ....with explicit extend (VABDL); source element sizes of 8, 16 and 32 bits.
+multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt, // a single itinerary is used for all three element sizes
+ SDPatternOperator IntOp, SDNode ExtOp, bit Commutable = 0> {
+ def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i16, v8i8, IntOp, ExtOp, Commutable>;
+ def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i32, v4i16, IntOp, ExtOp, Commutable>;
+ def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i64, v2i32, IntOp, ExtOp, Commutable>;
+}
+
+
+// Neon Wide 3-register vector operations (SDNode OpNode with an explicit ExtOp on the D operand),
+// source operand element sizes of 8, 16 and 32 bits:
+multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, string Dt,
+ SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+ def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4, // defs are named after the Q-register type; the Dt suffix is the D-operand element size
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i16, v8i8, OpNode, ExtOp, Commutable>;
+ def v4i32 : N3VW<op24, op23, 0b01, op11_8, op4,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i32, v4i16, OpNode, ExtOp, Commutable>;
+ def v2i64 : N3VW<op24, op23, 0b10, op11_8, op4,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i64, v2i32, OpNode, ExtOp, Commutable>;
+}
+
+
+// Neon Multiply-Op vector operations (the multiply node is fixed to `mul`; OpNode is the combining node),
+// element sizes of 8, 16 and 32 bits:
+multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32, // the *16 itineraries serve the 8- and 16-bit defs
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt, SDNode OpNode> {
+ // 64-bit vector types.
+ def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "8"), v8i8, mul, OpNode>;
+ def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, OpNode>;
+ def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32,
+ OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, OpNode>;
+
+ // 128-bit vector types.
+ def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "8"), v16i8, mul, OpNode>;
+ def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "16"), v8i16, mul, OpNode>;
+ def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32,
+ OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>;
+}
+
+multiclass N3VMulOpSL_HS<bits<4> op11_8, // 16- and 32-bit defs built on the N3V*MulOpSL / N3V*MulOpSL16 classes; the multiply node is fixed to `mul`
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt, SDPatternOperator ShOp> {
+ def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16,
+ OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>;
+ def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32,
+ OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, ShOp>;
+ def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16,
+ OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, // Q forms pass two value types (v8i16, v4i16)
+ mul, ShOp>;
+ def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32,
+ OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32,
+ mul, ShOp>;
+}
+
+// Neon Intrinsic-Op vector operations (IntOp and OpNode are both forwarded to N3VDIntOp/N3VQIntOp),
+// element sizes of 8, 16 and 32 bits:
+multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD, InstrItinClass itinQ, // one itinerary per register width, shared by all element sizes
+ string OpcodeStr, string Dt, SDPatternOperator IntOp,
+ SDNode OpNode> {
+ // 64-bit vector types.
+ def v8i8 : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD,
+ OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>;
+ def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD,
+ OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>;
+ def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD,
+ OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>;
+
+ // 128-bit vector types.
+ def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ,
+ OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>;
+ def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ,
+ OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>;
+ def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ,
+ OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>;
+}
+
+// Neon 3-argument intrinsics,
+// element sizes of 16 and 32 bits:
+// Emits v4i16/v2i32 (N3VDInt3) and v8i16/v4i32 (N3VQInt3). The same vector
+// type is passed twice to the base class, so these are same-width (not
+// long/narrow) operations.
+multiclass N3VInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+ // 64-bit vector types.
+ def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>;
+ def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD32,
+ OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>;
+
+ // 128-bit vector types.
+ def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>;
+ def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ32,
+ OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>;
+}
+
+// Neon 3-argument intrinsics,
+// element sizes of 8, 16 and 32 bits:
+// Inherits the 16- and 32-bit records from N3VInt3_HS and adds the 8-bit
+// ones (v8i8, v16i8), which reuse the 16-bit itineraries.
+multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt, SDPatternOperator IntOp>
+ :N3VInt3_HS <op24, op23, op11_8, op4, itinD16, itinD32,
+ itinQ16, itinQ32, OpcodeStr, Dt, IntOp>{
+ // 64-bit vector types.
+ def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+ // 128-bit vector types.
+ def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>;
+}
+
+// Neon Long Multiply-Op vector operations,
+// element sizes of 8, 16 and 32 bits:
+// Records are named after the (wide) result type; the type that follows it
+// is the half-width source type (e.g. v8i16 result from v8i8 sources). The
+// Dt suffix is the source element size. The 8-bit form reuses itin16.
+multiclass N3VLMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin16, InstrItinClass itin32,
+ string OpcodeStr, string Dt, SDNode MulOp,
+ SDNode OpNode> {
+ def v8i16 : N3VLMulOp<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr,
+ !strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>;
+ def v4i32 : N3VLMulOp<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr,
+ !strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>;
+ def v2i64 : N3VLMulOp<op24, op23, 0b10, op11_8, op4, itin32, OpcodeStr,
+ !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+// Neon Long Multiply-Op by-scalar ("SL") vector operations,
+// element sizes of 16 and 32 bits.
+// NOTE: unlike N3VLMulOp_QHS, the records here are named after the *source*
+// type (v4i16, v2i32); the wide result type is the first type argument.
+// The itineraries are fixed to IIC_VMACi16D / IIC_VMACi32D.
+multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr,
+ string Dt, SDNode MulOp, SDNode OpNode> {
+ def v4i16 : N3VLMulOpSL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr,
+ !strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>;
+ def v2i32 : N3VLMulOpSL<op24, 0b10, op11_8, IIC_VMACi32D, OpcodeStr,
+ !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+
+// Neon Long 3-argument intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+// Records are named after the wide result type (v4i32, v2i64); the second
+// type is the half-width source type and the Dt suffix is its element size.
+multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin16, InstrItinClass itin32,
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+ def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16,
+ OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+ def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32,
+ OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// By-scalar ("SL") variant of the long 3-argument intrinsics, 16 and 32 bits.
+// NOTE: records are named after the *source* type (v4i16, v2i32), unlike
+// N3VLInt3_HS which names them after the result type. Itineraries are fixed
+// to IIC_VMACi16D / IIC_VMACi32D.
+multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+ def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D,
+ OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>;
+ def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D,
+ OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+// Inherits v4i32 / v2i64 from N3VLInt3_HS and adds v8i16 (from v8i8
+// sources), which reuses itin16.
+multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin16, InstrItinClass itin32,
+ string OpcodeStr, string Dt, SDPatternOperator IntOp>
+ : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> {
+ def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16,
+ OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
+}
+
+// ....with explicit extend (VABAL).
+// Element sizes of 8, 16 and 32 bits; records are named after the wide
+// result type. IntOp, ExtOp and OpNode are forwarded unchanged to
+// N3VLIntExtOp, and a single itinerary is used for all sizes.
+multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> {
+ def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8,
+ IntOp, ExtOp, OpNode>;
+ def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16,
+ IntOp, ExtOp, OpNode>;
+ def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32,
+ IntOp, ExtOp, OpNode>;
+}
+
+
+// Neon Pairwise long 2-register intrinsics,
+// element sizes of 8, 16 and 32 bits:
+// Records are named after the *source* type. The result type (listed first)
+// has half as many elements, each twice as wide (e.g. v8i8 -> v4i16), so the
+// overall register width is unchanged.
+multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op4,
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+ // 64-bit vector types.
+ def v8i8 : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+ def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+ def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+ // 128-bit vector types.
+ def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+ def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+ def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon Pairwise long 2-register accumulate intrinsics,
+// element sizes of 8, 16 and 32 bits:
+// Same type layout as N2VPLInt_QHS (records named after the source type,
+// result type listed first with half the elements at twice the width), but
+// built on the N2VDPLInt2 / N2VQPLInt2 base classes.
+multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op4,
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+ // 64-bit vector types.
+ def v8i8 : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+ def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+ def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+ // 128-bit vector types.
+ def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+ def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+ def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon 2-register vector shift by immediate,
+// element sizes of 8, 16, 32 and 64 bits.
+// Left-shift flavour: the format is fixed to N2RegVShLFrm and the immediate
+// operand is a plain i32imm for every element size.
+// The argument after op11_8 is 1 only for the 64-bit forms (presumably the
+// bit that extends imm6 -- confirm against N2VDSh/N2VQSh); the narrower
+// forms instead pin the top bits of imm6 via Inst{21-19}/{21-20}/{21}.
+multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ SDNode OpNode> {
+ // 64-bit vector types.
+ def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+ OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+ OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+ OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm,
+ OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>;
+ // imm6 = xxxxxx
+
+ // 128-bit vector types.
+ def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+ OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+ OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+ OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm,
+ OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
+ // imm6 = xxxxxx
+}
+// Neon 2-register vector shift right by immediate,
+// element sizes of 8, 16, 32 and 64 bits.
+// Right-shift flavour: the format is fixed to N2RegVShRFrm and the immediate
+// operand is the width-specific shr_imm8/16/32/64.
+// NOTE(review): the baseOpc parameter is not referenced anywhere in this
+// body; it is kept only so existing defm call sites keep compiling.
+multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ string baseOpc, SDNode OpNode> {
+ // 64-bit vector types.
+ def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8,
+ OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16,
+ OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32,
+ OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64,
+ OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>;
+ // imm6 = xxxxxx
+
+ // 128-bit vector types.
+ def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8,
+ OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16,
+ OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32,
+ OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64,
+ OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
+ // imm6 = xxxxxx
+}
+
+// Neon Shift-Accumulate vector operations,
+// element sizes of 8, 16, 32 and 64 bits:
+// All forms take a width-specific right-shift immediate (shr_imm8/16/32/64)
+// and forward ShOp to N2VDShAdd / N2VQShAdd. No itinerary or format is
+// passed here; the base classes must supply them.
+multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, string Dt, SDNode ShOp> {
+ // 64-bit vector types.
+ def v8i8 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm8,
+ OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm16,
+ OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm32,
+ OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, shr_imm64,
+ OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>;
+ // imm6 = xxxxxx
+
+ // 128-bit vector types.
+ def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm8,
+ OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm16,
+ OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm32,
+ OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, shr_imm64,
+ OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>;
+ // imm6 = xxxxxx
+}
+
+// Neon Shift-Insert vector operations,
+// element sizes of 8, 16, 32 and 64 bits.
+// Left flavour: format N2RegVShLFrm, plain i32imm immediate, and the
+// selection node is fixed to NEONvsli. The data-type string is the bare
+// element size ("8", "16", ...) with no signedness prefix.
+multiclass N2VShInsL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr> {
+ // 64-bit vector types.
+ def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+ N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+ N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+ N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, i32imm,
+ N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>;
+ // imm6 = xxxxxx
+
+ // 128-bit vector types.
+ def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+ N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+ N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+ N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, i32imm,
+ N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>;
+ // imm6 = xxxxxx
+}
+// Neon Shift-Insert vector operations, right flavour,
+// element sizes of 8, 16, 32 and 64 bits.
+// Format N2RegVShRFrm, width-specific shr_imm8/16/32/64 immediate, and the
+// selection node is fixed to NEONvsri.
+multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr> {
+ // 64-bit vector types.
+ def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm8,
+ N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm16,
+ N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm32,
+ N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, shr_imm64,
+ N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>;
+ // imm6 = xxxxxx
+
+ // 128-bit vector types.
+ def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm8,
+ N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm16,
+ N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm32,
+ N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+ def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, shr_imm64,
+ N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>;
+ // imm6 = xxxxxx
+}
+
+// Neon Shift Long operations,
+// element sizes of 8, 16, 32 bits:
+// Records are named after the wide result type; the Dt suffix and the
+// second type are the source element size/type. The immediate operand
+// (imm1_7 / imm1_15 / imm1_31) is chosen per source element width.
+multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+ bit op4, string OpcodeStr, string Dt,
+ SDPatternOperator OpNode> {
+ def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+ OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, imm1_7, OpNode> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+ OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, imm1_15, OpNode> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+ OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, imm1_31, OpNode> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+}
+
+// Neon Shift Narrow operations,
+// element sizes of 16, 32, 64 bits:
+// Records are named after the narrow *result* type, while the Dt suffix is
+// the (wider) source element size, e.g. record v8i8 prints "<Dt>16" and
+// narrows v8i16 -> v8i8. The immediate operand (shr_imm8/16/32) and the
+// imm6 constraint follow the result element width.
+multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+ bit op4, InstrItinClass itin, string OpcodeStr, string Dt,
+ SDPatternOperator OpNode> {
+ def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v8i8, v8i16, shr_imm8, OpNode> {
+ let Inst{21-19} = 0b001; // imm6 = 001xxx
+ }
+ def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v4i16, v4i32, shr_imm16, OpNode> {
+ let Inst{21-20} = 0b01; // imm6 = 01xxxx
+ }
+ def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+ OpcodeStr, !strconcat(Dt, "64"),
+ v2i32, v2i64, shr_imm32, OpNode> {
+ let Inst{21} = 0b1; // imm6 = 1xxxxx
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+//===----------------------------------------------------------------------===//
+
+// Vector Add Operations.
+
+// VADD : Vector Add (integer and floating-point)
+// Integer forms come from the N3V_QHSD multiclass; the f32 and f16 forms are
+// individual records selecting `fadd`. The f16 forms additionally require
+// HasFullFP16.
+defm VADD : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd", "i",
+ add, 1>;
+def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32",
+ v2f32, v2f32, fadd, 1>;
+def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
+ v4f32, v4f32, fadd, 1>;
+def VADDhd : N3VD<0, 0, 0b01, 0b1101, 0, IIC_VBIND, "vadd", "f16",
+ v4f16, v4f16, fadd, 1>,
+ Requires<[HasNEON,HasFullFP16]>;
+def VADDhq : N3VQ<0, 0, 0b01, 0b1101, 0, IIC_VBINQ, "vadd", "f16",
+ v8f16, v8f16, fadd, 1>,
+ Requires<[HasNEON,HasFullFP16]>;
+// VADDL : Vector Add Long (Q = D + D)
+// The s/u variants differ in op24 (0/1) and in the extend node (sext/zext).
+defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+ "vaddl", "s", add, sext, 1>;
+defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+ "vaddl", "u", add, zext, 1>;
+// VADDW : Vector Add Wide (Q = Q + D)
+// The trailing argument is 0 here but 1 for VADDL above (presumably the
+// commutable flag -- the wide form's operands have different widths).
+defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
+defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;
+// VHADD : Vector Halving Add
+// All of the halving/saturating adds below select NEON intrinsics rather
+// than generic nodes, and use IIC_VBINi4D / IIC_VBINi4Q for every element
+// size.
+defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vhadd", "s", int_arm_neon_vhadds, 1>;
+defm VHADDu : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vhadd", "u", int_arm_neon_vhaddu, 1>;
+// VRHADD : Vector Rounding Halving Add
+defm VRHADDs : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vrhadd", "s", int_arm_neon_vrhadds, 1>;
+defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vrhadd", "u", int_arm_neon_vrhaddu, 1>;
+// VQADD : Vector Saturating Add
+// Uses the _QHSD multiclass, i.e. also covers 64-bit elements.
+defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vqadd", "s", int_arm_neon_vqadds, 1>;
+defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vqadd", "u", int_arm_neon_vqaddu, 1>;
+// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q)
+// Defined with null_frag, so the multiclass attaches no selection pattern;
+// the explicit Pats below select it instead.
+defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>;
+// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
+defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i",
+ int_arm_neon_vraddhn, 1>;
+
+// Select VADDHN from trunc((a + b) >>u N), where N (8/16/32) is the width of
+// the narrow result element, i.e. half the source element width.
+def : Pat<(v8i8 (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+ (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+ (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+ (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
+// Vector Multiply Operations.
+
+// VMUL : Vector Multiply (integer, polynomial and floating-point)
+// - VMUL*      : integer, via N3V_QHS selecting `mul`
+// - VMULpd/pq  : polynomial ("p8"), selecting int_arm_neon_vmulp
+// - VMULf*/h*  : f32 / f16, selecting `fmul` (f16 requires HasFullFP16)
+// - VMULsl*    : by-scalar ("sl") forms
+defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>;
+def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul",
+ "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>;
+def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul",
+ "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>;
+def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32",
+ v2f32, v2f32, fmul, 1>;
+def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32",
+ v4f32, v4f32, fmul, 1>;
+def VMULhd : N3VD<1, 0, 0b01, 0b1101, 1, IIC_VFMULD, "vmul", "f16",
+ v4f16, v4f16, fmul, 1>,
+ Requires<[HasNEON,HasFullFP16]>;
+def VMULhq : N3VQ<1, 0, 0b01, 0b1101, 1, IIC_VFMULQ, "vmul", "f16",
+ v8f16, v8f16, fmul, 1>,
+ Requires<[HasNEON,HasFullFP16]>;
+defm VMULsl : N3VSL_HS<0b1000, "vmul", mul>;
+// NOTE(review): the f32 by-scalar forms use IIC_VBIND/IIC_VBINQ while the
+// non-scalar f32 forms above use IIC_VFMULD/IIC_VFMULQ -- confirm intended.
+def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>;
+def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32,
+ v2f32, fmul>;
+def VMULslhd : N3VDSL16<0b01, 0b1001, "vmul", "f16", v4f16, fmul>,
+ Requires<[HasNEON,HasFullFP16]>;
+def VMULslhq : N3VQSL16<0b01, 0b1001, "vmul", "f16", v8f16,
+ v4f16, fmul>,
+ Requires<[HasNEON,HasFullFP16]>;
+
+// Multiply by a lane duplicated from a Q register. The by-scalar
+// instructions take the scalar from a D-sized operand, so each pattern
+// extracts a D sub-register of $src2 (index computed from $lane by
+// DSubReg_*_reg) and passes a lane number transformed by SubReg_*_lane
+// (presumably the lane index within that D register -- confirm against the
+// SDNodeXForm definitions).
+def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+ (v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
+ (v4i16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+ (v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
+ (v2i32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
+ (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
+ (v4f32 (VMULslfq (v4f32 QPR:$src1),
+ (v2f32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+
+// Multiply by a scalar f32 broadcast with NEONvdup: place the S register in
+// ssub_0 of an otherwise undefined D register and use the by-scalar multiply
+// with lane 0, instead of materialising the duplicated vector.
+def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+ (VMULslfd DPR:$Rn,
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+ (i32 0))>;
+def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+ (VMULslfq QPR:$Rn,
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+ (i32 0))>;
+
+
+// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
+// 16- and 32-bit elements only (_HS multiclasses); both the vector and the
+// by-scalar forms select int_arm_neon_vqdmulh.
+defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqdmulh", "s", int_arm_neon_vqdmulh, 1>;
+defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqdmulh", "s", int_arm_neon_vqdmulh>;
+// Q-register lane operand: extract the D sub-register selected by $lane and
+// use the by-scalar instruction.
+def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+ imm:$lane)))),
+ (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
+ (v4i16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+ imm:$lane)))),
+ (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
+ (v2i32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
+// Mirrors VQDMULH: op24 is 1 instead of 0 for the vector form, and the
+// by-scalar opcode is 0b1101 instead of 0b1100.
+defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm,
+ IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q,
+ "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>;
+defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqrdmulh", "s", int_arm_neon_vqrdmulh>;
+// Q-register lane operand: extract the D sub-register selected by $lane and
+// use the by-scalar instruction.
+def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+ imm:$lane)))),
+ (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
+ (v4i16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+ imm:$lane)))),
+ (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
+ (v2i32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
+// The `let` applies the NEONThumb2DataIPostEncoder post-encoder and the
+// "NEONData" decoder namespace to the four vector-by-vector records only;
+// the by-scalar VMULLsl* records below are outside it.
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+ DecoderNamespace = "NEONData" in {
+ defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+ "vmull", "s", NEONvmulls, 1>;
+ defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+ "vmull", "u", NEONvmullu, 1>;
+ def VMULLp8 : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
+ v8i16, v8i8, int_arm_neon_vmullp, 1>;
+ // 64-bit polynomial multiply: gated on HasV8 and HasCrypto.
+ def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
+ "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
+ Requires<[HasV8, HasCrypto]>;
+}
+defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
+defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
+
+// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
+// 16- and 32-bit source elements only; both forms select
+// int_arm_neon_vqdmull.
+defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
+ "vqdmull", "s", int_arm_neon_vqdmull, 1>;
+defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D,
+ "vqdmull", "s", int_arm_neon_vqdmull>;
+
+// Vector Multiply-Accumulate and Multiply-Subtract Operations.
+
+// VMLA : Vector Multiply Accumulate (integer and floating-point)
+// The non-fused FP vector forms are gated on UseFPVMLx and DontUseFusedMAC;
+// the FP by-scalar forms are gated on UseFPVMLx only. f16 forms additionally
+// require HasFullFP16.
+defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
+ v2f32, fmul_su, fadd_mlx>,
+ Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
+ v4f32, fmul_su, fadd_mlx>,
+ Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16",
+ v4f16, fmul_su, fadd_mlx>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16",
+ v8f16, fmul_su, fadd_mlx>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
+ v2f32, fmul_su, fadd_mlx>,
+ Requires<[HasNEON, UseFPVMLx]>;
+def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
+ v4f32, v2f32, fmul_su, fadd_mlx>,
+ Requires<[HasNEON, UseFPVMLx]>;
+// NOTE(review): the f16 by-scalar forms use plain fmul/fadd, whereas every
+// other FP VMLA record here uses fmul_su/fadd_mlx -- confirm intended.
+def VMLAslhd : N3VDMulOpSL16<0b01, 0b0001, IIC_VMACD, "vmla", "f16",
+ v4f16, fmul, fadd>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+def VMLAslhq : N3VQMulOpSL16<0b01, 0b0001, IIC_VMACQ, "vmla", "f16",
+ v8f16, v4f16, fmul, fadd>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+
+// Multiply-accumulate where the multiplier is a lane duplicated from a Q
+// register: extract the D sub-register selected by $lane from $src3 and use
+// the by-scalar VMLA with the transformed lane number.
+def : Pat<(v8i16 (add (v8i16 QPR:$src1),
+ (mul (v8i16 QPR:$src2),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
+ (v4i16 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (add (v4i32 QPR:$src1),
+ (mul (v4i32 QPR:$src2),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
+ (v2i32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+// f32 variant; carries the same predicates as VMLAslfq itself.
+def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
+ (fmul_su (v4f32 QPR:$src2),
+ (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (VMLAslfq (v4f32 QPR:$src1),
+ (v4f32 QPR:$src2),
+ (v2f32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>,
+ Requires<[HasNEON, UseFPVMLx]>;
+
+// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
+// Built from the long multiply nodes (NEONvmulls / NEONvmullu) combined
+// with `add`. The s/u variants differ only in op24 and the multiply node.
+defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+ "vmlal", "s", NEONvmulls, add>;
+defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+ "vmlal", "u", NEONvmullu, add>;
+
+// By-scalar forms (16- and 32-bit source elements only).
+defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
+defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
+
+let Predicates = [HasNEON, HasV8_1a] in {
+ // v8.1a Neon Rounding Doubling Multiply-Op vector operations,
+ // VQRDMLAH : Vector Saturating Rounding Doubling Multiply Accumulate
+ // Returning High Half (same-width: D += D * D or Q += Q * Q; not a long op)
+ defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
+ null_frag>;
+ def : Pat<(v4i16 (int_arm_neon_vqadds
+ (v4i16 DPR:$src1),
+ (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))))),
+ (v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
+ def : Pat<(v2i32 (int_arm_neon_vqadds
+ (v2i32 DPR:$src1),
+ (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))))),
+ (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
+ def : Pat<(v8i16 (int_arm_neon_vqadds
+ (v8i16 QPR:$src1),
+ (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
+ (v8i16 QPR:$Vm))))),
+ (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
+ def : Pat<(v4i32 (int_arm_neon_vqadds
+ (v4i32 QPR:$src1),
+ (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
+ (v4i32 QPR:$Vm))))),
+ (v4i32 (VQRDMLAHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
+
+ defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
+ null_frag>;
+ def : Pat<(v4i16 (int_arm_neon_vqadds
+ (v4i16 DPR:$src1),
+ (v4i16 (int_arm_neon_vqrdmulh
+ (v4i16 DPR:$Vn),
+ (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ imm:$lane)))))),
+ (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm,
+ imm:$lane))>;
+ def : Pat<(v2i32 (int_arm_neon_vqadds
+ (v2i32 DPR:$src1),
+ (v2i32 (int_arm_neon_vqrdmulh
+ (v2i32 DPR:$Vn),
+ (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ imm:$lane)))))),
+ (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
+ imm:$lane))>;
+ def : Pat<(v8i16 (int_arm_neon_vqadds
+ (v8i16 QPR:$src1),
+ (v8i16 (int_arm_neon_vqrdmulh
+ (v8i16 QPR:$src2),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src3),
+ imm:$lane)))))),
+ (v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1),
+ (v8i16 QPR:$src2),
+ (v4i16 (EXTRACT_SUBREG
+ QPR:$src3,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+ def : Pat<(v4i32 (int_arm_neon_vqadds
+ (v4i32 QPR:$src1),
+ (v4i32 (int_arm_neon_vqrdmulh
+ (v4i32 QPR:$src2),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src3),
+ imm:$lane)))))),
+ (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1),
+ (v4i32 QPR:$src2),
+ (v2i32 (EXTRACT_SUBREG
+ QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+ // VQRDMLSH : Vector Saturating Rounding Doubling Multiply Subtract
+ // Returning High Half (same-width: D -= D * D or Q -= Q * Q; not a long op)
+ defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
+ null_frag>;
+ def : Pat<(v4i16 (int_arm_neon_vqsubs
+ (v4i16 DPR:$src1),
+ (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))))),
+ (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
+ def : Pat<(v2i32 (int_arm_neon_vqsubs
+ (v2i32 DPR:$src1),
+ (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))))),
+ (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
+ def : Pat<(v8i16 (int_arm_neon_vqsubs
+ (v8i16 QPR:$src1),
+ (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
+ (v8i16 QPR:$Vm))))),
+ (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
+ def : Pat<(v4i32 (int_arm_neon_vqsubs
+ (v4i32 QPR:$src1),
+ (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
+ (v4i32 QPR:$Vm))))),
+ (v4i32 (VQRDMLSHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
+
+ defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
+ null_frag>;
+ def : Pat<(v4i16 (int_arm_neon_vqsubs
+ (v4i16 DPR:$src1),
+ (v4i16 (int_arm_neon_vqrdmulh
+ (v4i16 DPR:$Vn),
+ (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ imm:$lane)))))),
+ (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>;
+ def : Pat<(v2i32 (int_arm_neon_vqsubs
+ (v2i32 DPR:$src1),
+ (v2i32 (int_arm_neon_vqrdmulh
+ (v2i32 DPR:$Vn),
+ (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ imm:$lane)))))),
+ (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
+ imm:$lane))>;
+ def : Pat<(v8i16 (int_arm_neon_vqsubs
+ (v8i16 QPR:$src1),
+ (v8i16 (int_arm_neon_vqrdmulh
+ (v8i16 QPR:$src2),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src3),
+ imm:$lane)))))),
+ (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1),
+ (v8i16 QPR:$src2),
+ (v4i16 (EXTRACT_SUBREG
+ QPR:$src3,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+ def : Pat<(v4i32 (int_arm_neon_vqsubs
+ (v4i32 QPR:$src1),
+ (v4i32 (int_arm_neon_vqrdmulh
+ (v4i32 QPR:$src2),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src3),
+ imm:$lane)))))),
+ (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1),
+ (v4i32 QPR:$src2),
+ (v2i32 (EXTRACT_SUBREG
+ QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+}
+// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
+defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+ "vqdmlal", "s", null_frag>; // null_frag is passed as the operator; the explicit Pats below do the matching
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>; // "sl" = by-lane forms, also given null_frag
+
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), // vqadds(acc, vqdmull(Vn, Vm)) on 16-bit sources
+ (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))))),
+ (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), // same shape on 32-bit sources
+ (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))))),
+ (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), // by-lane: second multiplicand is a NEONvduplane splat
+ (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+ (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ imm:$lane)))))),
+ (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; // lane source restricted to DPR_8
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), // by-lane, 32-bit sources
+ (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+ (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ imm:$lane)))))),
+ (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; // lane source restricted to DPR_VFP2
+
+// VMLS : Vector Multiply Subtract (integer and floating-point)
+defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
+ v2f32, fmul_su, fsub_mlx>,
+ Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
+ v4f32, fmul_su, fsub_mlx>,
+ Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16",
+ v4f16, fmul, fsub>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16",
+ v8f16, fmul, fsub>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
+ v2f32, fmul_su, fsub_mlx>,
+ Requires<[HasNEON, UseFPVMLx]>;
+def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
+ v4f32, v2f32, fmul_su, fsub_mlx>,
+ Requires<[HasNEON, UseFPVMLx]>;
+def VMLSslhd : N3VDMulOpSL16<0b01, 0b0101, IIC_VMACD, "vmls", "f16",
+ v4f16, fmul, fsub>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+def VMLSslhq : N3VQMulOpSL16<0b01, 0b0101, IIC_VMACQ, "vmls", "f16",
+ v8f16, v4f16, fmul, fsub>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+
+def : Pat<(v8i16 (sub (v8i16 QPR:$src1), // Q-register by-lane VMLS: src1 - src2 * splat(src3[lane])
+ (mul (v8i16 QPR:$src2),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
+ (v4i16 (EXTRACT_SUBREG QPR:$src3, // the instruction takes a D-sized lane source, so a D subregister of src3 is extracted
+ (DSubReg_i16_reg imm:$lane))), // presumably picks the D half containing the lane -- helper defined elsewhere
+ (SubReg_i16_lane imm:$lane)))>; // presumably the lane index within that D half -- helper defined elsewhere
+
+def : Pat<(v4i32 (sub (v4i32 QPR:$src1), // same rewrite for 32-bit integer elements
+ (mul (v4i32 QPR:$src2),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
+ (v2i32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1), // f32 variant, built from the fsub_mlx / fmul_su fragments
+ (fmul_su (v4f32 QPR:$src2),
+ (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
+ (v2f32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>,
+ Requires<[HasNEON, UseFPVMLx]>; // same predicate list as the VMLSslfq definition
+
+// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
+defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+ "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+ "vmlsl", "u", NEONvmullu, sub>;
+
+defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
+
+// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
+defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
+ "vqdmlsl", "s", null_frag>; // null_frag is passed as the operator; the explicit Pats below do the matching
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>; // "sl" = by-lane forms, also given null_frag
+
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), // vqsubs(acc, vqdmull(Vn, Vm)) on 16-bit sources
+ (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))))),
+ (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), // same shape on 32-bit sources
+ (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))))),
+ (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), // by-lane: second multiplicand is a NEONvduplane splat
+ (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+ (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ imm:$lane)))))),
+ (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; // lane source restricted to DPR_8
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), // by-lane, 32-bit sources
+ (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+ (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ imm:$lane)))))),
+ (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; // lane source restricted to DPR_VFP2
+
+// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
+def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
+ v2f32, fmul_su, fadd_mlx>,
+ Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+
+def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32",
+ v4f32, fmul_su, fadd_mlx>,
+ Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+def VFMAhd : N3VDMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACD, "vfma", "f16",
+ v4f16, fmul, fadd>,
+ Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+
+def VFMAhq : N3VQMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACQ, "vfma", "f16",
+ v8f16, fmul, fadd>,
+ Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+
+// Fused Vector Multiply Subtract (floating-point)
+def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32",
+ v2f32, fmul_su, fsub_mlx>,
+ Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
+ v4f32, fmul_su, fsub_mlx>,
+ Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+def VFMShd : N3VDMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACD, "vfms", "f16",
+ v4f16, fmul, fsub>,
+ Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+def VFMShq : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16",
+ v8f16, fmul, fsub>,
+ Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics: fma(Vn, Vm, src1) selects VFMA/VFMS with the addend src1 moved to the first operand
+def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), // D-register f32
+ (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasVFP4]>;
+def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)), // Q-register f32
+ (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasVFP4]>;
+def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)), // negated first multiplicand selects the subtract form
+ (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasVFP4]>;
+def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)), // Q-register subtract form
+ (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasVFP4]>; // NOTE(review): only HasVFP4 is listed here, unlike the VFMA/VFMS definitions -- confirm intended
+
+// Vector Subtract Operations.
+
+// VSUB : Vector Subtract (integer and floating-point)
+defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ,
+ "vsub", "i", sub, 0>;
+def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32",
+ v2f32, v2f32, fsub, 0>;
+def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
+ v4f32, v4f32, fsub, 0>;
+def VSUBhd : N3VD<0, 0, 0b11, 0b1101, 0, IIC_VBIND, "vsub", "f16",
+ v4f16, v4f16, fsub, 0>,
+ Requires<[HasNEON,HasFullFP16]>;
+def VSUBhq : N3VQ<0, 0, 0b11, 0b1101, 0, IIC_VBINQ, "vsub", "f16",
+ v8f16, v8f16, fsub, 0>,
+ Requires<[HasNEON,HasFullFP16]>;
+// VSUBL : Vector Subtract Long (Q = D - D)
+defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+ "vsubl", "s", sub, sext, 0>;
+defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+ "vsubl", "u", sub, zext, 0>;
+// VSUBW : Vector Subtract Wide (Q = Q - D)
+defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
+defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
+// VHSUB : Vector Halving Subtract
+defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vhsub", "s", int_arm_neon_vhsubs, 0>;
+defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vhsub", "u", int_arm_neon_vhsubu, 0>;
+// VQSUB : Vector Saturating Subtract
+defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vqsub", "s", int_arm_neon_vqsubs, 0>;
+defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vqsub", "u", int_arm_neon_vqsubu, 0>;
+// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
+defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>; // null_frag is passed; the trunc/shift Pats below do the matching
+// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
+defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
+ int_arm_neon_vrsubhn, 0>; // rounding form is given its intrinsic directly
+
+def : Pat<(v8i8 (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))), // difference shifted right by 8, then truncated to 8-bit lanes
+ (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))), // 32-bit lanes: shift by 16, truncate to 16-bit lanes
+ (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), // 64-bit lanes: shift by 32, truncate to 32-bit lanes
+ (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
+// Vector Comparisons.
+
+// VCEQ : Vector Compare Equal
+defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>;
+def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
+ NEONvceq, 1>;
+def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
+ NEONvceq, 1>;
+def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
+ NEONvceq, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
+ NEONvceq, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in
+defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
+ "$Vd, $Vm, #0", NEONvceqz>;
+
+// VCGE : Vector Compare Greater Than or Equal
+defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>;
+defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>;
+def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
+ NEONvcge, 0>;
+def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
+ NEONvcge, 0>;
+def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
+ NEONvcge, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
+ NEONvcge, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
+ "$Vd, $Vm, #0", NEONvcgez>;
+defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
+ "$Vd, $Vm, #0", NEONvclez>;
+}
+
+// VCGT : Vector Compare Greater Than
+defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>;
+defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>;
+def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
+ NEONvcgt, 0>;
+def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
+ NEONvcgt, 0>;
+def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
+ NEONvcgt, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
+ NEONvcgt, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
+ "$Vd, $Vm, #0", NEONvcgtz>;
+defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
+ "$Vd, $Vm, #0", NEONvcltz>;
+}
+
+// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
+def VACGEfd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
+ "f32", v2i32, v2f32, int_arm_neon_vacge, 0>;
+def VACGEfq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
+ "f32", v4i32, v4f32, int_arm_neon_vacge, 0>;
+def VACGEhd : N3VDInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
+ "f16", v4i16, v4f16, int_arm_neon_vacge, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VACGEhq : N3VQInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
+ "f16", v8i16, v8f16, int_arm_neon_vacge, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+// VACGT : Vector Absolute Compare Greater Than (aka VCAGT)
+def VACGTfd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
+ "f32", v2i32, v2f32, int_arm_neon_vacgt, 0>;
+def VACGTfq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+ "f32", v4i32, v4f32, int_arm_neon_vacgt, 0>;
+def VACGThd : N3VDInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
+ "f16", v4i16, v4f16, int_arm_neon_vacgt, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+// Q-register f16 form of vacgt. The result type is the integer vector v8i16
+// (source type v8f16), consistent with VACGEhq, VACGThd and the f32 forms,
+// which all pair an integer result vector with a floating-point source.
+def VACGThq : N3VQInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+ "f16", v8i16, v8f16, int_arm_neon_vacgt, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+// VTST : Vector Test Bits
+defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
+
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", // vaclt is emitted as VACGT with $Vn and $Vm swapped
+ (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", // Q-register form of the same alias
+ (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", // vacle is emitted as VACGE with $Vn and $Vm swapped
+ (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", // Q-register form of the same alias
+ (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in { // f16 aliases carry the same predicates as the f16 instructions
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm",
+ (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm",
+ (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm",
+ (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm",
+ (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+}
+
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", // two-operand spelling: $Vd is also a source, placed last (swapped order)
+ (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", // Q-register form
+ (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", // same scheme, mapped onto VACGE
+ (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", // Q-register form
+ (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in { // f16 aliases carry the same predicates as the f16 instructions
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm",
+ (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm",
+ (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
+ (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
+ (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+}
+
+// Vector Bitwise Operations.
+
+def vnotd : PatFrag<(ops node:$in), // bitwise NOT written as xor with an all-ones vector (v8i8 form, used with DPR operands)
+ (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>;
+def vnotq : PatFrag<(ops node:$in), // same, with a v16i8 all-ones vector (used with QPR operands)
+ (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>;
+
+
+// VAND : Vector Bitwise AND
+def VANDd : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand",
+ v2i32, v2i32, and, 1>;
+def VANDq : N3VQX<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand",
+ v4i32, v4i32, and, 1>;
+
+// VEOR : Vector Bitwise Exclusive OR
+def VEORd : N3VDX<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor",
+ v2i32, v2i32, xor, 1>;
+def VEORq : N3VQX<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor",
+ v4i32, v4i32, xor, 1>;
+
+// VORR : Vector Bitwise OR
+def VORRd : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr",
+ v2i32, v2i32, or, 1>;
+def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr",
+ v4i32, v4i32, or, 1>;
+
+def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1,
+ (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
+ IIC_VMOVImm,
+ "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+ [(set DPR:$Vd,
+ (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1,
+ (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src),
+ IIC_VMOVImm,
+ "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+ [(set DPR:$Vd,
+ (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+ let Inst{10-9} = SIMM{10-9};
+}
+
+def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1,
+ (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src),
+ IIC_VMOVImm,
+ "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+ [(set QPR:$Vd,
+ (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1,
+ (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src),
+ IIC_VMOVImm,
+ "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+ [(set QPR:$Vd,
+ (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+ let Inst{10-9} = SIMM{10-9};
+}
+
+
+// VBIC : Vector Bitwise Bit Clear (AND NOT): the patterns compute Vn & ~Vm
+let TwoOperandAliasConstraint = "$Vn = $Vd" in { // applies to both VBICd and VBICq
+def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+ (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+ "vbic", "$Vd, $Vn, $Vm", "",
+ [(set DPR:$Vd, (v2i32 (and DPR:$Vn,
+ (vnotd DPR:$Vm))))]>; // only $Vm is inverted
+def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+ (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+ "vbic", "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (v4i32 (and QPR:$Vn,
+ (vnotq QPR:$Vm))))]>; // Q-register form uses vnotq
+}
+
+def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
+ (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
+ IIC_VMOVImm,
+ "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+ [(set DPR:$Vd,
+ (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1,
+ (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src),
+ IIC_VMOVImm,
+ "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+ [(set DPR:$Vd,
+ (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+ let Inst{10-9} = SIMM{10-9};
+}
+
+def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1,
+ (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src),
+ IIC_VMOVImm,
+ "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+ [(set QPR:$Vd,
+ (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1,
+ (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src),
+ IIC_VMOVImm,
+ "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+ [(set QPR:$Vd,
+ (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+ let Inst{10-9} = SIMM{10-9};
+}
+
+// VORN : Vector Bitwise OR NOT
+def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$Vd),
+ (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+ "vorn", "$Vd, $Vn, $Vm", "",
+ [(set DPR:$Vd, (v2i32 (or DPR:$Vn,
+ (vnotd DPR:$Vm))))]>;
+def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$Vd),
+ (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+ "vorn", "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (v4i32 (or QPR:$Vn,
+ (vnotq QPR:$Vm))))]>;
+
+// VMVN : Vector Bitwise NOT (Immediate)
+
+let isReMaterializable = 1 in {
+
+def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd),
+ (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+ "vmvn", "i16", "$Vd, $SIMM", "",
+ [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd),
+ (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+ "vmvn", "i16", "$Vd, $SIMM", "",
+ [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd),
+ (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+ "vmvn", "i32", "$Vd, $SIMM", "",
+ [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> {
+ let Inst{11-8} = SIMM{11-8};
+}
+
+def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd),
+ (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+ "vmvn", "i32", "$Vd, $SIMM", "",
+ [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> {
+ let Inst{11-8} = SIMM{11-8};
+}
+}
+
+// VMVN : Vector Bitwise NOT (register form), matched through the vnotd/vnotq fragments
+def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
+ (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSUBiD,
+ "vmvn", "$Vd, $Vm", "",
+ [(set DPR:$Vd, (v2i32 (vnotd DPR:$Vm)))]>;
+def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
+ (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD, // NOTE(review): Q form uses the D itinerary IIC_VSUBiD -- confirm intended
+ "vmvn", "$Vd, $Vm", "",
+ [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>;
+def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>; // same source pattern as the VMVNd definition above
+def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>; // same source pattern as the VMVNq definition above
+
+// VBSL : Vector Bitwise Select
+def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+ (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+ N3RegFrm, IIC_VCNTiD,
+ "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ [(set DPR:$Vd,
+ (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
+ (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
+ (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
+ (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1),
+ (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
+ (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+
+def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
+ (and DPR:$Vm, (vnotd DPR:$Vd)))),
+ (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+
+def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
+ (and DPR:$Vm, (vnotd DPR:$Vd)))),
+ (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+
+def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+ (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+ N3RegFrm, IIC_VCNTiQ,
+ "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ [(set QPR:$Vd,
+ (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+
+def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
+ (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
+ (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
+ (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1),
+ (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
+ (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+
+def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
+ (and QPR:$Vm, (vnotq QPR:$Vd)))),
+ (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd),
+ (and QPR:$Vm, (vnotq QPR:$Vd)))),
+ (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+
+// VBIF : Vector Bitwise Insert if False
+// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
+def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1,
+ (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+ N3RegFrm, IIC_VBINiD,
+ "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ []>;
+def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1,
+ (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+ N3RegFrm, IIC_VBINiQ,
+ "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ []>;
+
+// VBIT : Vector Bitwise Insert if True
+// like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
+def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1,
+ (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+ N3RegFrm, IIC_VBINiD,
+ "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ []>;
+def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1,
+ (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+ N3RegFrm, IIC_VBINiQ,
+ "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ []>;
+
+// VBIT/VBIF are not yet used by instruction selection (their definitions
+// above have empty pattern lists). The TwoAddress pass will not go looking
+// for equivalent operations with different register constraints; it just
+// inserts copies.
+
+// Vector Absolute Differences.
+
+// VABD : Vector Absolute Difference
+defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vabd", "s", int_arm_neon_vabds, 1>;
+defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vabd", "u", int_arm_neon_vabdu, 1>;
+def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
+ "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>;
+def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
+ "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>;
+def VABDhd : N3VDInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBIND,
+ "vabd", "f16", v4f16, v4f16, int_arm_neon_vabds, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VABDhq : N3VQInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBINQ,
+ "vabd", "f16", v8f16, v8f16, int_arm_neon_vabds, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// VABDL : Vector Absolute Difference Long (Q = | D - D |)
+defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
+ "vabdl", "s", int_arm_neon_vabds, zext, 1>;
+defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
+ "vabdl", "u", int_arm_neon_vabdu, zext, 1>;
+
+def abd_shr : // fragment: (zext(in1) - zext(in2)) shifted right by $shift through NEONvshrs
+ PatFrag<(ops node:$in1, node:$in2, node:$shift),
+ (NEONvshrs (sub (zext node:$in1),
+ (zext node:$in2)), (i32 $shift))>;
+
+def : Pat<(xor (v4i32 (bitconvert (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15)))),
+ (v4i32 (bitconvert (v8i16 (add (sub (zext (v8i8 DPR:$opA)),
+ (zext (v8i8 DPR:$opB))),
+ (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15))))))),
+ (VABDLuv8i16 DPR:$opA, DPR:$opB)>; // matches s ^ (d + s) with d = zext(A) - zext(B), s = abd_shr(A, B, 15); shift is element width - 1
+
+def : Pat<(xor (v4i32 (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)),
+ (v4i32 (add (sub (zext (v4i16 DPR:$opA)),
+ (zext (v4i16 DPR:$opB))),
+ (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)))),
+ (VABDLuv4i32 DPR:$opA, DPR:$opB)>; // same idiom on 32-bit results (shift 31); no bitconvert needed, already v4i32
+
+def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
+ (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
+ (zext (v2i32 DPR:$opB))),
+ (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))),
+ (VABDLuv2i64 DPR:$opA, DPR:$opB)>; // same idiom on 64-bit results (shift 63), bitconverted to v4i32 for the xor
+
+// VABA : Vector Absolute Difference and Accumulate
+defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+ "vaba", "s", int_arm_neon_vabds, add>;
+defm VABAu : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+ "vaba", "u", int_arm_neon_vabdu, add>;
+
+// VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
+defm VABALs : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD,
+ "vabal", "s", int_arm_neon_vabds, zext, add>;
+defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD,
+ "vabal", "u", int_arm_neon_vabdu, zext, add>;
+
+// Vector Maximum and Minimum.
+
+// VMAX : Vector Maximum
+defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vmax", "s", smax, 1>;
+defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vmax", "u", umax, 1>;
+def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND,
+ "vmax", "f32",
+ v2f32, v2f32, fmaxnan, 1>;
+def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+ "vmax", "f32",
+ v4f32, v4f32, fmaxnan, 1>;
+def VMAXhd : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND,
+ "vmax", "f16",
+ v4f16, v4f16, fmaxnan, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+ "vmax", "f16",
+ v8f16, v8f16, fmaxnan, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// VMAXNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+ def VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f32",
+ v2f32, v2f32, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON]>;
+ def VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f32",
+ v4f32, v4f32, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON]>;
+ def VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f16",
+ v4f16, v4f16, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f16",
+ v8f16, v8f16, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+}
+
+// VMIN : Vector Minimum
+defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vmin", "s", smin, 1>;
+defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
+ IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+ "vmin", "u", umin, 1>;
+def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND,
+ "vmin", "f32",
+ v2f32, v2f32, fminnan, 1>;
+def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+ "vmin", "f32",
+ v4f32, v4f32, fminnan, 1>;
+def VMINhd : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND,
+ "vmin", "f16",
+ v4f16, v4f16, fminnan, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+ "vmin", "f16",
+ v8f16, v8f16, fminnan, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// VMINNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+ def VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f32",
+ v2f32, v2f32, fminnum, 1>,
+ Requires<[HasV8, HasNEON]>;
+ def VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f32",
+ v4f32, v4f32, fminnum, 1>,
+ Requires<[HasV8, HasNEON]>;
+ def VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f16",
+ v4f16, v4f16, fminnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def VMINNMNQh : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f16",
+ v8f16, v8f16, fminnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+}
+
+// Vector Pairwise Operations.
+
+// VPADD : Vector Pairwise Add
+def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+ "vpadd", "i8",
+ v8i8, v8i8, int_arm_neon_vpadd, 0>;
+def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+ "vpadd", "i16",
+ v4i16, v4i16, int_arm_neon_vpadd, 0>;
+def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+ "vpadd", "i32",
+ v2i32, v2i32, int_arm_neon_vpadd, 0>;
+def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm,
+ IIC_VPBIND, "vpadd", "f32",
+ v2f32, v2f32, int_arm_neon_vpadd, 0>;
+def VPADDh : N3VDInt<1, 0, 0b01, 0b1101, 0, N3RegFrm,
+ IIC_VPBIND, "vpadd", "f16",
+ v4f16, v4f16, int_arm_neon_vpadd, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// VPADDL : Vector Pairwise Add Long
+defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s",
+ int_arm_neon_vpaddls>;
+defm VPADDLu : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl", "u",
+ int_arm_neon_vpaddlu>;
+
+// VPADAL : Vector Pairwise Add and Accumulate Long
+defm VPADALs : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01100, 0, "vpadal", "s",
+ int_arm_neon_vpadals>;
+defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u",
+ int_arm_neon_vpadalu>;
+
+// VPMAX : Vector Pairwise Maximum
+// Signed and unsigned integer forms (8/16/32-bit elements) plus f32 and f16
+// forms, all on 64-bit vectors. The integer forms select from
+// int_arm_neon_vpmaxs / int_arm_neon_vpmaxu according to signedness; the
+// floating-point forms reuse int_arm_neon_vpmaxs with floating-point vector
+// types. The f16 form additionally requires HasFullFP16.
+def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+ "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>;
+def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+ "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>;
+def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+ "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>;
+def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+ "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>;
+def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+ "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>;
+def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+ "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>;
+def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
+ "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>;
+def VPMAXh : N3VDInt<1, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
+ "f16", v4f16, v4f16, int_arm_neon_vpmaxs, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// VPMIN : Vector Pairwise Minimum
+// Mirrors the VPMAX group: signed/unsigned integer forms select from
+// int_arm_neon_vpmins / int_arm_neon_vpminu, and the f32/f16 forms reuse
+// int_arm_neon_vpmins with floating-point vector types. The integer forms
+// differ from their VPMAX counterparts only in the fifth template argument
+// (1 here, 0 there); the floating-point forms differ in the third one.
+def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+ "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>;
+def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+ "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>;
+def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+ "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>;
+def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+ "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>;
+def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+ "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>;
+def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+ "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>;
+def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
+ "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>;
+def VPMINh : N3VDInt<1, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
+ "f16", v4f16, v4f16, int_arm_neon_vpmins, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
+
+// VRECPE : Vector Reciprocal Estimate
+// Three element flavours (u32, f32, f16), each in a 64-bit (...d) and a
+// 128-bit (...q) variant, all selected from int_arm_neon_vrecpe. The u32 and
+// f32 defs differ only in the fifth template argument; the f16 defs change the
+// third argument instead and additionally require HasFullFP16.
+def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+ IIC_VUNAD, "vrecpe", "u32",
+ v2i32, v2i32, int_arm_neon_vrecpe>;
+def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+ IIC_VUNAQ, "vrecpe", "u32",
+ v4i32, v4i32, int_arm_neon_vrecpe>;
+def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+ IIC_VUNAD, "vrecpe", "f32",
+ v2f32, v2f32, int_arm_neon_vrecpe>;
+def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+ IIC_VUNAQ, "vrecpe", "f32",
+ v4f32, v4f32, int_arm_neon_vrecpe>;
+def VRECPEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0,
+ IIC_VUNAD, "vrecpe", "f16",
+ v4f16, v4f16, int_arm_neon_vrecpe>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VRECPEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0,
+ IIC_VUNAQ, "vrecpe", "f16",
+ v8f16, v8f16, int_arm_neon_vrecpe>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// VRECPS : Vector Reciprocal Step
+// Three-operand floating-point forms only (f32 and f16), in 64-bit and 128-bit
+// variants, selected from int_arm_neon_vrecps. Unlike the pairwise defs above,
+// the final template argument here is 1.
+// NOTE(review): that last argument is presumably the commutativity flag of
+// N3VDInt/N3VQInt -- confirm against the class definitions.
+def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
+ IIC_VRECSD, "vrecps", "f32",
+ v2f32, v2f32, int_arm_neon_vrecps, 1>;
+def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
+ IIC_VRECSQ, "vrecps", "f32",
+ v4f32, v4f32, int_arm_neon_vrecps, 1>;
+def VRECPShd : N3VDInt<0, 0, 0b01, 0b1111, 1, N3RegFrm,
+ IIC_VRECSD, "vrecps", "f16",
+ v4f16, v4f16, int_arm_neon_vrecps, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VRECPShq : N3VQInt<0, 0, 0b01, 0b1111, 1, N3RegFrm,
+ IIC_VRECSQ, "vrecps", "f16",
+ v8f16, v8f16, int_arm_neon_vrecps, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// VRSQRTE : Vector Reciprocal Square Root Estimate
+// Same layout as the VRECPE group: u32, f32 and f16 flavours in 64-bit and
+// 128-bit variants, all selected from int_arm_neon_vrsqrte. Each def's fifth
+// template argument is the matching VRECPE value with its lowest bit set.
+// The f16 defs additionally require HasFullFP16.
+def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+ IIC_VUNAD, "vrsqrte", "u32",
+ v2i32, v2i32, int_arm_neon_vrsqrte>;
+def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+ IIC_VUNAQ, "vrsqrte", "u32",
+ v4i32, v4i32, int_arm_neon_vrsqrte>;
+def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+ IIC_VUNAD, "vrsqrte", "f32",
+ v2f32, v2f32, int_arm_neon_vrsqrte>;
+def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+ IIC_VUNAQ, "vrsqrte", "f32",
+ v4f32, v4f32, int_arm_neon_vrsqrte>;
+def VRSQRTEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0,
+ IIC_VUNAD, "vrsqrte", "f16",
+ v4f16, v4f16, int_arm_neon_vrsqrte>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VRSQRTEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0,
+ IIC_VUNAQ, "vrsqrte", "f16",
+ v8f16, v8f16, int_arm_neon_vrsqrte>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// VRSQRTS : Vector Reciprocal Square Root Step
+// Floating-point forms only (f32 and f16), in 64-bit and 128-bit variants,
+// selected from int_arm_neon_vrsqrts. The defs parallel the VRECPS group and
+// differ from it in the third template argument (0b10/0b11 here versus
+// 0b00/0b01 there). The f16 defs additionally require HasFullFP16.
+def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
+ IIC_VRECSD, "vrsqrts", "f32",
+ v2f32, v2f32, int_arm_neon_vrsqrts, 1>;
+def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
+ IIC_VRECSQ, "vrsqrts", "f32",
+ v4f32, v4f32, int_arm_neon_vrsqrts, 1>;
+def VRSQRTShd : N3VDInt<0, 0, 0b11, 0b1111, 1, N3RegFrm,
+ IIC_VRECSD, "vrsqrts", "f16",
+ v4f16, v4f16, int_arm_neon_vrsqrts, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VRSQRTShq : N3VQInt<0, 0, 0b11, 0b1111, 1, N3RegFrm,
+ IIC_VRECSQ, "vrsqrts", "f16",
+ v8f16, v8f16, int_arm_neon_vrsqrts, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// Vector Shifts.
+
+// VSHL : Vector Shift
+// Register-operand forms. Signed and unsigned families are expanded by
+// N3VInt_QHSDSh (defined outside this chunk) in the N3RegVShFrm format and
+// are selected from the int_arm_neon_vshifts / int_arm_neon_vshiftu
+// intrinsics.
+defm VSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm,
+ IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
+ "vshl", "s", int_arm_neon_vshifts>;
+defm VSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm,
+ IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
+ "vshl", "u", int_arm_neon_vshiftu>;
+
+// VSHL : Vector Shift Left (Immediate)
+// A single sign-agnostic family ("i" type suffix) selected from NEONvshl.
+defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>;
+
+// VSHR : Vector Shift Right (Immediate)
+// Signed and unsigned families selected from NEONvshrs / NEONvshru. The extra
+// string argument ("VSHRs" / "VSHRu") repeats the defm name.
+// NOTE(review): how N2VShR_QHSD uses that string is not visible here.
+defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs",
+ NEONvshrs>;
+defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu",
+ NEONvshru>;
+
+// VSHLL : Vector Shift Left Long
+// Signed and unsigned families. The selection fragment is an inline PatFrag:
+// NEONvshl applied to the sign-extended (VSHLLs) or zero-extended (VSHLLu)
+// left operand.
+defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s",
+ PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (sext node:$LHS), node:$RHS)>>;
+defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u",
+ PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (zext node:$LHS), node:$RHS)>>;
+
+// VSHLL : Vector Shift Left Long (with maximum shift count)
+// N2VLShMax wraps N2VLSh with no selection pattern of its own (null_frag),
+// overrides instruction bits 21-16 with the op21_16 argument, and routes
+// disassembly through the custom "DecodeVSHLMaxInstruction" decoder method.
+class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy,
+ ValueType OpTy, Operand ImmTy>
+ : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt,
+ ResTy, OpTy, ImmTy, null_frag> {
+ let Inst{21-16} = op21_16;
+ let DecoderMethod = "DecodeVSHLMaxInstruction";
+}
+// One def per source element width; the result type has elements twice as
+// wide as the operand type.
+def VSHLLi8 : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8",
+ v8i16, v8i8, imm8>;
+def VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16",
+ v4i32, v4i16, imm16>;
+def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32",
+ v2i64, v2i32, imm32>;
+
+// Selection patterns for the maximum-shift forms. A NEONvshl by exactly the
+// source element width (8, 16 or 32) of either a zero-extended or a
+// sign-extended operand selects the same VSHLLi* instruction.
+def : Pat<(v8i16 (NEONvshl (zext (v8i8 DPR:$Rn)), (i32 8))),
+ (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (zext (v4i16 DPR:$Rn)), (i32 16))),
+ (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (zext (v2i32 DPR:$Rn)), (i32 32))),
+ (VSHLLi32 DPR:$Rn, 32)>;
+def : Pat<(v8i16 (NEONvshl (sext (v8i8 DPR:$Rn)), (i32 8))),
+ (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (sext (v4i16 DPR:$Rn)), (i32 16))),
+ (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))),
+ (VSHLLi32 DPR:$Rn, 32)>;
+
+// VSHRN : Vector Shift Right and Narrow
+// The defm's own fragment matches a truncate of a *signed* shift right
+// (NEONvshrs).
+defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
+ PatFrag<(ops node:$Rn, node:$amt),
+ (trunc (NEONvshrs node:$Rn, node:$amt))>>;
+
+// A truncate of an *unsigned* shift right (NEONvshru) selects the same
+// VSHRN instructions; the shr_imm8/16/32 operands constrain the shift amount.
+def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))),
+ (VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>;
+def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))),
+ (VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>;
+def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))),
+ (VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>;
+
+// VRSHL : Vector Rounding Shift
+// Register-operand forms, selected from int_arm_neon_vrshifts /
+// int_arm_neon_vrshiftu. Same multiclass and format as VSHLs/VSHLu, with a
+// different third template argument and the IIC_VSHLi4* itineraries.
+defm VRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm,
+ IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+ "vrshl", "s", int_arm_neon_vrshifts>;
+defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm,
+ IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+ "vrshl", "u", int_arm_neon_vrshiftu>;
+// VRSHR : Vector Rounding Shift Right
+// Immediate forms, selected from NEONvrshrs / NEONvrshru.
+defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs",
+ NEONvrshrs>;
+defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu",
+ NEONvrshru>;
+
+// VRSHRN : Vector Rounding Shift Right and Narrow
+// Single sign-agnostic family ("i" suffix), selected from NEONvrshrn.
+defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i",
+ NEONvrshrn>;
+
+// VQSHL : Vector Saturating Shift
+// Register-operand forms, selected from int_arm_neon_vqshifts /
+// int_arm_neon_vqshiftu.
+defm VQSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm,
+ IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+ "vqshl", "s", int_arm_neon_vqshifts>;
+defm VQSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm,
+ IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+ "vqshl", "u", int_arm_neon_vqshiftu>;
+// VQSHL : Vector Saturating Shift Left (Immediate)
+// Selected from the NEONvqshls / NEONvqshlu nodes.
+defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>;
+defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>;
+
+// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned)
+// Printed with the "vqshlu" mnemonic and an "s" type suffix; selected from
+// NEONvqshlsu.
+defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>;
+
+// VQSHRN : Vector Saturating Shift Right and Narrow
+// Signed and unsigned families, selected from NEONvqshrns / NEONvqshrnu.
+defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s",
+ NEONvqshrns>;
+defm VQSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u",
+ NEONvqshrnu>;
+
+// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned)
+// Printed as "vqshrun" with an "s" type suffix; selected from NEONvqshrnsu.
+defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s",
+ NEONvqshrnsu>;
+
+// VQRSHL : Vector Saturating Rounding Shift
+// Register-operand forms, selected from int_arm_neon_vqrshifts /
+// int_arm_neon_vqrshiftu.
+defm VQRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm,
+ IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+ "vqrshl", "s", int_arm_neon_vqrshifts>;
+defm VQRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm,
+ IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+ "vqrshl", "u", int_arm_neon_vqrshiftu>;
+
+// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow
+// Signed and unsigned families, selected from NEONvqrshrns / NEONvqrshrnu.
+// The template arguments match VQSHRNs/VQSHRNu except for the fifth, which is
+// 1 here.
+defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s",
+ NEONvqrshrns>;
+defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u",
+ NEONvqrshrnu>;
+
+// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
+// Printed as "vqrshrun" with an "s" type suffix; selected from NEONvqrshrnsu.
+defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s",
+ NEONvqrshrnsu>;
+
+// VSRA : Vector Shift Right and Accumulate
+// Signed/unsigned families built on the same shift nodes as VSHRs/VSHRu
+// (NEONvshrs / NEONvshru), via the N2VShAdd_QHSD multiclass.
+defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>;
+defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>;
+// VRSRA : Vector Rounding Shift Right and Accumulate
+// Same multiclass, built on the rounding shift nodes NEONvrshrs / NEONvrshru.
+defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>;
+defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>;
+
+// VSLI : Vector Shift Left and Insert
+// No selection node is passed here; any patterns come from N2VShInsL_QHSD
+// itself (defined outside this chunk).
+defm VSLI : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">;
+
+// VSRI : Vector Shift Right and Insert
+// Right-shift counterpart of VSLI, using N2VShInsR_QHSD.
+defm VSRI : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">;
+
+// Vector Absolute and Saturating Absolute.
+
+// VABS : Vector Absolute Value
+// Integer forms come from the N2VInt_QHS multiclass and are selected from
+// int_arm_neon_vabs. Floating-point forms (f32, f16; 64-bit and 128-bit) are
+// individual defs selected from the generic fabs node; the f16 ones require
+// HasFullFP16.
+defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
+ IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s",
+ int_arm_neon_vabs>;
+def VABSfd : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+ "vabs", "f32",
+ v2f32, v2f32, fabs>;
+def VABSfq : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+ "vabs", "f32",
+ v4f32, v4f32, fabs>;
+def VABShd : N2VD<0b11, 0b11, 0b01, 0b01, 0b01110, 0,
+ "vabs", "f16",
+ v4f16, v4f16, fabs>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VABShq : N2VQ<0b11, 0b11, 0b01, 0b01, 0b01110, 0,
+ "vabs", "f16",
+ v8f16, v8f16, fabs>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// Recognise the expanded integer absolute-value idiom
+//   xor (sra x, N-1), (add x, (sra x, N-1))
+// where N is the element width (8, 16 or 32), and select VABS for it. The xor
+// is matched on v2i32/v4i32, so the 8-bit and 16-bit element forms look
+// through a bitconvert on each xor operand.
+def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))),
+ (v2i32 (bitconvert (v8i8 (add DPR:$src,
+ (NEONvshrs DPR:$src, (i32 7))))))),
+ (VABSv8i8 DPR:$src)>;
+def : Pat<(xor (v2i32 (bitconvert (v4i16 (NEONvshrs DPR:$src, (i32 15))))),
+ (v2i32 (bitconvert (v4i16 (add DPR:$src,
+ (NEONvshrs DPR:$src, (i32 15))))))),
+ (VABSv4i16 DPR:$src)>;
+def : Pat<(xor (v2i32 (NEONvshrs DPR:$src, (i32 31))),
+ (v2i32 (add DPR:$src, (NEONvshrs DPR:$src, (i32 31))))),
+ (VABSv2i32 DPR:$src)>;
+def : Pat<(xor (v4i32 (bitconvert (v16i8 (NEONvshrs QPR:$src, (i32 7))))),
+ (v4i32 (bitconvert (v16i8 (add QPR:$src,
+ (NEONvshrs QPR:$src, (i32 7))))))),
+ (VABSv16i8 QPR:$src)>;
+def : Pat<(xor (v4i32 (bitconvert (v8i16 (NEONvshrs QPR:$src, (i32 15))))),
+ (v4i32 (bitconvert (v8i16 (add QPR:$src,
+ (NEONvshrs QPR:$src, (i32 15))))))),
+ (VABSv8i16 QPR:$src)>;
+def : Pat<(xor (v4i32 (NEONvshrs QPR:$src, (i32 31))),
+ (v4i32 (add QPR:$src, (NEONvshrs QPR:$src, (i32 31))))),
+ (VABSv4i32 QPR:$src)>;
+
+// VQABS : Vector Saturating Absolute Value
+// Integer forms only, selected from int_arm_neon_vqabs.
+defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
+ IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s",
+ int_arm_neon_vqabs>;
+
+// Vector Negate.
+
+// Negation fragments: an integer negate is matched as "all-zeros vector minus
+// operand". The zero vector is spelled as a bitconvert of a v2i32 (64-bit) or
+// v4i32 (128-bit) NEONimmAllZerosV so one fragment serves every element type.
+def vnegd : PatFrag<(ops node:$in),
+ (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>;
+def vnegq : PatFrag<(ops node:$in),
+ (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>;
+
+// Integer negate instruction classes for DPR (VNEGD) and QPR (VNEGQ)
+// operands. `size` is the only template argument forwarded into the N2V
+// numeric arguments; the two classes otherwise differ in register class,
+// itinerary, pattern fragment and the sixth numeric argument (0 vs 1).
+class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+ : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm),
+ IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set DPR:$Vd, (Ty (vnegd DPR:$Vm)))]>;
+class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+ : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm),
+ IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set QPR:$Vd, (Ty (vnegq QPR:$Vm)))]>;
+
+// VNEG : Vector Negate (integer)
+def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>;
+def VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>;
+def VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>;
+def VNEGs8q : VNEGQ<0b00, "vneg", "s8", v16i8>;
+def VNEGs16q : VNEGQ<0b01, "vneg", "s16", v8i16>;
+def VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>;
+
+// VNEG : Vector Negate (floating-point)
+// Written directly against N2V and selected from the generic fneg node. The
+// f16 defs additionally require HasFullFP16.
+def VNEGfd : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+ (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD,
+ "vneg", "f32", "$Vd, $Vm", "",
+ [(set DPR:$Vd, (v2f32 (fneg DPR:$Vm)))]>;
+def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
+ (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ,
+ "vneg", "f32", "$Vd, $Vm", "",
+ [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>;
+def VNEGhd : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 0, 0,
+ (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD,
+ "vneg", "f16", "$Vd, $Vm", "",
+ [(set DPR:$Vd, (v4f16 (fneg DPR:$Vm)))]>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VNEGhq : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 1, 0,
+ (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ,
+ "vneg", "f16", "$Vd, $Vm", "",
+ [(set QPR:$Vd, (v8f16 (fneg QPR:$Vm)))]>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// Explicitly typed selection patterns mapping each integer vector type's
+// negate fragment to the matching VNEGs* instruction.
+def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>;
+def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>;
+def : Pat<(v2i32 (vnegd DPR:$src)), (VNEGs32d DPR:$src)>;
+def : Pat<(v16i8 (vnegq QPR:$src)), (VNEGs8q QPR:$src)>;
+def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>;
+def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>;
+
+// VQNEG : Vector Saturating Negate
+// Integer forms only, selected from int_arm_neon_vqneg.
+defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
+ IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s",
+ int_arm_neon_vqneg>;
+
+// Vector Bit Counting Operations.
+
+// VCLS : Vector Count Leading Sign Bits
+// Selected from the int_arm_neon_vcls intrinsic.
+defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
+ IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s",
+ int_arm_neon_vcls>;
+// VCLZ : Vector Count Leading Zeros
+// Selected from the generic ctlz node rather than a NEON intrinsic.
+defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
+ IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
+ ctlz>;
+// VCNT : Vector Count One Bits
+// Only byte-element forms are defined (v8i8 and v16i8), selected from the
+// generic ctpop node; the type suffix is the bare string "8".
+def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+ IIC_VCNTiD, "vcnt", "8",
+ v8i8, v8i8, ctpop>;
+def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+ IIC_VCNTiQ, "vcnt", "8",
+ v16i8, v16i8, ctpop>;
+
+// Vector Swap
+// Both registers are outputs and each is tied to an input through the
+// constraint string "$in1 = $Vd, $in2 = $Vm". The pattern list is empty, so
+// these defs carry no selection pattern, and no itinerary is attached
+// (NoItinerary).
+def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0,
+ (outs DPR:$Vd, DPR:$Vm), (ins DPR:$in1, DPR:$in2),
+ NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm",
+ []>;
+def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0,
+ (outs QPR:$Vd, QPR:$Vm), (ins QPR:$in1, QPR:$in2),
+ NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm",
+ []>;
+
+// Vector Move Operations.
+
+// VMOV : Vector Move (Register)
+// There is no separate register-move instruction here: "vmov Vd, Vm" is an
+// assembler alias for VORR with $Vm supplied as both source operands, for the
+// D-register and Q-register forms respectively.
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+ (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+ (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+// VMOV : Vector Move (Immediate)
+
+// Although VMOVs are not strictly speaking cheap, they are as expensive
+// as their copy counterpart (VORR), so we should prefer rematerialization
+// over splitting when it applies.
+//
+// Each def takes a modified-immediate operand ($SIMM) whose operand class
+// depends on the element size, and is selected from NEONvmovImm (integer
+// types) or NEONvmovFPImm (f32 types). D-register and Q-register variants of
+// one element size differ in the register class, the result vector type and
+// the fifth N1ModImm template argument (0 vs 1).
+let isReMaterializable = 1, isAsCheapAsAMove=1 in {
+def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd),
+ (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+ "vmov", "i8", "$Vd, $SIMM", "",
+ [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd),
+ (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+ "vmov", "i8", "$Vd, $SIMM", "",
+ [(set QPR:$Vd, (v16i8 (NEONvmovImm timm:$SIMM)))]>;
+
+// 16-bit elements: one bit of the third template argument is left unset (?)
+// and instruction bit 9 is filled from bit 9 of the immediate operand.
+def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd),
+ (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+ "vmov", "i16", "$Vd, $SIMM", "",
+ [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd),
+ (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+ "vmov", "i16", "$Vd, $SIMM", "",
+ [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+// 32-bit elements: all four bits of the third template argument are left
+// unset and instruction bits 11-8 are filled from bits 11-8 of the immediate.
+def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd),
+ (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+ "vmov", "i32", "$Vd, $SIMM", "",
+ [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> {
+ let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd),
+ (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+ "vmov", "i32", "$Vd, $SIMM", "",
+ [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> {
+ let Inst{11-8} = SIMM{11-8};
+}
+
+// 64-bit elements: same third argument as the i8 forms, with the sixth
+// template argument set to 1 instead of 0.
+def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd),
+ (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+ "vmov", "i64", "$Vd, $SIMM", "",
+ [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd),
+ (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+ "vmov", "i64", "$Vd, $SIMM", "",
+ [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>;
+
+// f32 elements: selected from NEONvmovFPImm rather than NEONvmovImm.
+def VMOVv2f32 : N1ModImm<1, 0b000, 0b1111, 0, 0, 0, 1, (outs DPR:$Vd),
+ (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+ "vmov", "f32", "$Vd, $SIMM", "",
+ [(set DPR:$Vd, (v2f32 (NEONvmovFPImm timm:$SIMM)))]>;
+def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
+ (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+ "vmov", "f32", "$Vd, $SIMM", "",
+ [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>;
+} // isReMaterializable, isAsCheapAsAMove
+
+// Add support for the byte replication feature, for GAS compatibility.
+// E.g. instructions below:
+// "vmov.i32 d0, 0xffffffff"
+// "vmov.i32 d0, 0xabababab"
+// "vmov.i16 d0, 0xabab"
+// are incorrect, but we could deal with such cases.
+// For last two instructions, for example, it should emit:
+// "vmov.i8 d0, 0xab"
+// Each alias accepts a .i16 or .i32 spelling whose immediate matches the
+// corresponding *ByteReplicate operand class and emits the i8 VMOV form
+// (VMOVv8i8 for D registers, VMOVv16i8 for Q registers).
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+ (VMOVv8i8 DPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+ (VMOVv8i8 DPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+ (VMOVv16i8 QPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+ (VMOVv16i8 QPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+
+// Also add same support for VMVN instructions. So instruction:
+// "vmvn.i32 d0, 0xabababab"
+// actually means:
+// "vmov.i8 d0, 0x54"
+// These aliases also emit the i8 VMOV instructions (not a VMVN instruction);
+// they use the nImmVMVN*ByteReplicate operand classes.
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+ (VMOVv8i8 DPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+ (VMOVv8i8 DPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+ (VMOVv16i8 QPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+ (VMOVv16i8 QPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+
+// On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0"
+// require zero cycles to execute so they should be used wherever possible for
+// setting a register to zero.
+
+// Even without these pseudo-insts we would probably end up with the correct
+// instruction, but we could not mark the general ones with "isAsCheapAsAMove"
+// since they are sometimes rather expensive (in general).
+
+// Both pseudos match an all-zeros vector (NEONimmAllZerosV), take no inputs,
+// and expand to the 32-bit-element VMOV immediate with an immediate of 0.
+// They are only available when HasZCZ holds, and AddedComplexity = 50 is set
+// on their patterns.
+// NOTE(review): "(ops 14, zero_reg)" is presumably the always-execute
+// predicate operand pair -- confirm against the predicate operand definition.
+let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
+ def VMOVD0 : ARMPseudoExpand<(outs DPR:$Vd), (ins), 4, IIC_VMOVImm,
+ [(set DPR:$Vd, (v2i32 NEONimmAllZerosV))],
+ (VMOVv2i32 DPR:$Vd, 0, (ops 14, zero_reg))>,
+ Requires<[HasZCZ]>;
+ def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm,
+ [(set QPR:$Vd, (v4i32 NEONimmAllZerosV))],
+ (VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>,
+ Requires<[HasZCZ]>;
+}
+
+// VMOV : Vector Get Lane (move scalar to ARM core register)
+
+// The instruction defs below read one lane of a D register into a GPR. The
+// lane index is an operand whose bits are scattered into the encoding by the
+// "let Inst{...} = lane{...}" lines: three lane bits for 8-bit elements, two
+// for 16-bit elements, one for 32-bit elements. The s8/s16 forms are selected
+// from NEONvgetlanes and the u8/u16 forms from NEONvgetlaneu.
+def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
+ (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+ IIC_VMOVSI, "vmov", "s8", "$R, $V$lane",
+ [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V),
+ imm:$lane))]> {
+ let Inst{21} = lane{2};
+ let Inst{6-5} = lane{1-0};
+}
+def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
+ (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+ IIC_VMOVSI, "vmov", "s16", "$R, $V$lane",
+ [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V),
+ imm:$lane))]> {
+ let Inst{21} = lane{1};
+ let Inst{6} = lane{0};
+}
+def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
+ (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+ IIC_VMOVSI, "vmov", "u8", "$R, $V$lane",
+ [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V),
+ imm:$lane))]> {
+ let Inst{21} = lane{2};
+ let Inst{6-5} = lane{1-0};
+}
+def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1},
+ (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+ IIC_VMOVSI, "vmov", "u16", "$R, $V$lane",
+ [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V),
+ imm:$lane))]> {
+ let Inst{21} = lane{1};
+ let Inst{6} = lane{0};
+}
+// The 32-bit form is selected from the generic extractelt node and is only
+// used when HasFastVGETLNi32 holds; the HasSlowVGETLNi32 patterns further
+// down avoid this instruction.
+def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
+ (outs GPR:$R), (ins DPR:$V, VectorIndex32:$lane),
+ IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
+ [(set GPR:$R, (extractelt (v2i32 DPR:$V),
+ imm:$lane))]>,
+ Requires<[HasVFP2, HasFastVGETLNi32]> {
+ let Inst{21} = lane{0};
+}
+// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
+// Q-register sources: extract the D sub-register that holds the lane
+// (DSubReg_*_reg), then use the D-register instruction with the lane index
+// rewritten by SubReg_*_lane.
+def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
+ (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i8_reg imm:$lane))),
+ (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
+ (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
+ (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i8_reg imm:$lane))),
+ (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
+ (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane))>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+ (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane))>,
+ Requires<[HasNEON, HasFastVGETLNi32]>;
+// HasSlowVGETLNi32: take the 32-bit lane as an S sub-register of the vector
+// and copy it to the GPR class instead of using VGETLNi32.
+def : Pat<(extractelt (v2i32 DPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+ Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+ Requires<[HasNEON, HasSlowVGETLNi32]>;
+// Floating-point lanes need no instruction: the source is first copied into
+// the *_VFP2 register class and the lane is then read as an S sub-register.
+def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
+ (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)),
+ (SSubReg_f32_reg imm:$src2))>;
+def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
+ (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1),QPR_VFP2)),
+ (SSubReg_f32_reg imm:$src2))>;
+// The v2i64 pattern below is disabled (commented out); only v2f64 is handled.
+//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
+// (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
+ (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+
+
+// VMOV : Vector Set Lane (move ARM core register to scalar)
+
+// The destination vector $V is tied to the incoming vector $src1, so each
+// instruction is modelled as "old vector in, updated vector out". The lane
+// index bits are scattered into the encoding as in the get-lane defs.
+let Constraints = "$src1 = $V" in {
+def VSETLNi8 : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$V),
+ (ins DPR:$src1, GPR:$R, VectorIndex8:$lane),
+ IIC_VMOVISL, "vmov", "8", "$V$lane, $R",
+ [(set DPR:$V, (vector_insert (v8i8 DPR:$src1),
+ GPR:$R, imm:$lane))]> {
+ let Inst{21} = lane{2};
+ let Inst{6-5} = lane{1-0};
+}
+def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$V),
+ (ins DPR:$src1, GPR:$R, VectorIndex16:$lane),
+ IIC_VMOVISL, "vmov", "16", "$V$lane, $R",
+ [(set DPR:$V, (vector_insert (v4i16 DPR:$src1),
+ GPR:$R, imm:$lane))]> {
+ let Inst{21} = lane{1};
+ let Inst{6} = lane{0};
+}
+def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
+ (ins DPR:$src1, GPR:$R, VectorIndex32:$lane),
+ IIC_VMOVISL, "vmov", "32", "$V$lane, $R",
+ [(set DPR:$V, (insertelt (v2i32 DPR:$src1),
+ GPR:$R, imm:$lane))]>,
+ Requires<[HasVFP2]> {
+ let Inst{21} = lane{0};
+ // This instruction is equivalent to
+ // $V = INSERT_SUBREG $src1, $R, translateImmToSubIdx($imm)
+ let isInsertSubreg = 1;
+}
+}
+// Q-register destinations: extract the D half that contains the lane, set the
+// lane in it with the D-register instruction, then insert that half back into
+// the original Q register at the same D sub-register index.
+def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
+ (v16i8 (INSERT_SUBREG QPR:$src1,
+ (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
+ (DSubReg_i8_reg imm:$lane))),
+ GPR:$src2, (SubReg_i8_lane imm:$lane))),
+ (DSubReg_i8_reg imm:$lane)))>;
+def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
+ (v8i16 (INSERT_SUBREG QPR:$src1,
+ (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+ (DSubReg_i16_reg imm:$lane))),
+ GPR:$src2, (SubReg_i16_lane imm:$lane))),
+ (DSubReg_i16_reg imm:$lane)))>;
+def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
+ (v4i32 (INSERT_SUBREG QPR:$src1,
+ (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
+ (DSubReg_i32_reg imm:$lane))),
+ GPR:$src2, (SubReg_i32_lane imm:$lane))),
+ (DSubReg_i32_reg imm:$lane)))>;
+
+// Floating-point lane inserts are plain sub-register inserts of an SPR into
+// the vector, after copying the vector into the *_VFP2 register class.
+def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
+ (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)),
+ SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
+ (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
+ SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+
+// The v2i64 pattern below is disabled (commented out); only v2f64 is handled.
+//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+ (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+
+// scalar_to_vector of a floating-point scalar: insert it as sub-register 0
+// (ssub_0 / dsub_0) of an otherwise undefined vector.
+def : Pat<(v2f32 (scalar_to_vector SPR:$src)),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
+def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+
+// scalar_to_vector of a GPR into a 64-bit integer vector: set lane 0 of an
+// undefined vector.
+def : Pat<(v8i8 (scalar_to_vector GPR:$src)),
+ (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v4i16 (scalar_to_vector GPR:$src)),
+ (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v2i32 (scalar_to_vector GPR:$src)),
+ (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+
+// 128-bit integer vectors: build the 64-bit result as above and insert it as
+// dsub_0 of an undefined Q register.
+def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+ dsub_0)>;
+def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+ dsub_0)>;
+def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+ dsub_0)>;
+
+// VDUP : Vector Duplicate (from ARM core register to all elements)
+
+// VDUPD / VDUPQ wrap NVDup for D-register and Q-register destinations. Both
+// take a GPR source and are selected from NEONvdup of an i32; they differ
+// only in the destination register class.
+class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+ : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R),
+ IIC_VMOVIS, "vdup", Dt, "$V, $R",
+ [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+ : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R),
+ IIC_VMOVIS, "vdup", Dt, "$V, $R",
+ [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+
+// Of the six defs only VDUP32d carries a Requires<> list (HasFastVDUP32); the
+// slow-VDUP.32 alternative for 64-bit vectors is given by the VMOVDRR
+// patterns below.
+def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>;
+def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>;
+def VDUP32d : VDUPD<0b11101000, 0b00, "32", v2i32>,
+ Requires<[HasNEON, HasFastVDUP32]>;
+def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>;
+def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>;
+def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>;
+
+// NEONvdup patterns for uarchs with fast VDUP.32.
+// These cover an f32 value that is a bitconvert of a GPR. The v4f32 pattern
+// has no Requires<> list, matching the ungated VDUP32q def above.
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
+ Requires<[HasNEON,HasFastVDUP32]>;
+def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
+
+// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
+// VMOVDRR is given the same GPR for both of its source operands.
+def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
+ Requires<[HasNEON,HasSlowVDUP32]>;
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
+ Requires<[HasNEON,HasSlowVDUP32]>;
+
+// VDUP : Vector Duplicate Lane (from scalar to all elements)
+
+// VDUPLND: D-register result duplicated from a lane of a D register of the
+// same type. The lane operand class is supplied by IdxTy.
+class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
+ ValueType Ty, Operand IdxTy>
+ : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+ IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane",
+ [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>;
+
+// VDUPLNQ: Q-register result duplicated from a lane of a D register, hence
+// the separate result (ResTy) and operand (OpTy) types.
+// NOTE(review): the selection pattern names the lane as VectorIndex32:$lane
+// for every element size, whereas the ins list uses IdxTy and VDUPLND uses
+// imm:$lane. Confirm whether this is intentional before changing it.
+class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, Operand IdxTy>
+ : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+ IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane",
+ [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm),
+ VectorIndex32:$lane)))]>;
+
+// Inst{19-16} is partially specified depending on the element size.
+// The fixed low bits of op19_16 mark the element size (1, 10, 100 for 8-,
+// 16- and 32-bit elements) and the remaining high bits, left as ?, are filled
+// from the lane index by each def.
+
+def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8, VectorIndex8> {
+ bits<3> lane;
+ let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16, VectorIndex16> {
+ bits<2> lane;
+ let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32, VectorIndex32> {
+ bits<1> lane;
+ let Inst{19} = lane{0};
+}
+def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8, VectorIndex8> {
+ bits<3> lane;
+ let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16, VectorIndex16> {
+ bits<2> lane;
+ let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> {
+ bits<1> lane;
+ let Inst{19} = lane{0};
+}
+
+// f32 lane duplicates reuse the 32-bit integer instructions.
+def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+ (VDUPLN32d DPR:$Vm, imm:$lane)>;
+
+def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+ (VDUPLN32q DPR:$Vm, imm:$lane)>;
+
+// Q-register sources: extract the D sub-register holding the lane and
+// duplicate from it, with the lane index rewritten by SubReg_*_lane.
+def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+ (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i8_reg imm:$lane))),
+ (SubReg_i8_lane imm:$lane)))>;
+def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+ (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+ (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+ (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+// NEONvdup of an f32 held in an SPR: place it in ssub_0 of an undefined
+// v2f32 and duplicate lane 0.
+def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))),
+ (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+ SPR:$src, ssub_0), (i32 0)))>;
+def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))),
+ (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+ SPR:$src, ssub_0), (i32 0)))>;
+
+// VMOVN : Vector Narrowing Move
+// Selected from the generic trunc node.
+defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN,
+ "vmovn", "i", trunc>;
+// VQMOVN : Vector Saturating Narrowing Move
+// Three families: signed, unsigned, and the "vqmovun" form (type suffix "s"),
+// selected from int_arm_neon_vqmovns / vqmovnu / vqmovnsu respectively.
+defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD,
+ "vqmovn", "s", int_arm_neon_vqmovns>;
+defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD,
+ "vqmovn", "u", int_arm_neon_vqmovnu>;
+defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD,
+ "vqmovun", "s", int_arm_neon_vqmovnsu>;
+// VMOVL : Vector Lengthening Move
+// Signed form selected from sext, unsigned form from zext. The extra patterns
+// select the unsigned instructions for anyext.
+defm VMOVLs : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>;
+defm VMOVLu : N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>;
+def : Pat<(v8i16 (anyext (v8i8 DPR:$Vm))), (VMOVLuv8i16 DPR:$Vm)>;
+def : Pat<(v4i32 (anyext (v4i16 DPR:$Vm))), (VMOVLuv4i32 DPR:$Vm)>;
+def : Pat<(v2i64 (anyext (v2i32 DPR:$Vm))), (VMOVLuv2i64 DPR:$Vm)>;
+
+// Vector Conversions.
+
+// VCVT : Vector Convert Between Floating-Point and Integers
+// Naming: VCVT<src>2<dst><reg>, where f = f32, h = f16, s/u = signed/unsigned
+// integer and the trailing d/q is the register width. Each def is selected
+// for the generic fp_to_sint / fp_to_uint / sint_to_fp / uint_to_fp node on
+// the listed vector types.
+
+// f32 <-> i32, D registers (2 elements).
+def VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+ v2i32, v2f32, fp_to_sint>;
+def VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+ v2i32, v2f32, fp_to_uint>;
+def VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+ v2f32, v2i32, sint_to_fp>;
+def VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+ v2f32, v2i32, uint_to_fp>;
+
+// f32 <-> i32, Q registers (4 elements).
+def VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+ v4i32, v4f32, fp_to_sint>;
+def VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+ v4i32, v4f32, fp_to_uint>;
+def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+ v4f32, v4i32, sint_to_fp>;
+def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+ v4f32, v4i32, uint_to_fp>;
+
+// f16 <-> i16, D registers (4 elements). Same arguments as the f32 forms
+// except the third one (0b01 instead of 0b10); requires HasFullFP16.
+def VCVTh2sd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16",
+ v4i16, v4f16, fp_to_sint>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTh2ud : N2VD<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16",
+ v4i16, v4f16, fp_to_uint>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTs2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16",
+ v4f16, v4i16, sint_to_fp>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTu2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16",
+ v4f16, v4i16, uint_to_fp>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// f16 <-> i16, Q registers (8 elements); requires HasFullFP16.
+def VCVTh2sq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16",
+ v8i16, v8f16, fp_to_sint>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTh2uq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16",
+ v8i16, v8f16, fp_to_uint>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTs2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16",
+ v8f16, v8i16, sint_to_fp>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTu2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16",
+ v8f16, v8i16, uint_to_fp>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+// VCVT{A, N, P, M}
+// VCVT_FPI builds the eight float-to-integer variants of one "vcvt<op>"
+// instruction: {S,U} (signed/unsigned result) x {D,Q} (register width) x
+// {f,h} (f32/f16 source).
+//   op     - mnemonic suffix appended to "vcvt" ("a", "n", "p" or "m" below).
+//   op10_8 - 3-bit value forwarded to the base class for every variant.
+//   IntS   - pattern operator used for the signed-result variants.
+//   IntU   - pattern operator used for the unsigned-result variants.
+// All variants require HasV8 and HasNEON; the f16 ones also need HasFullFP16.
+multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS,
+ SDPatternOperator IntU> {
+ let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+ // f32 source; the fourth argument is 0 for signed and 1 for unsigned.
+ def SDf : N2VDIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+ "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>;
+ def SQf : N2VQIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+ "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>;
+ def UDf : N2VDIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+ "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>;
+ def UQf : N2VQIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+ "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>;
+ // f16 source; differs from the f32 forms in the first argument (0b01).
+ def SDh : N2VDIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+ "s16.f16", v4i16, v4f16, IntS>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def SQh : N2VQIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+ "s16.f16", v8i16, v8f16, IntS>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def UDh : N2VDIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+ "u16.f16", v4i16, v4f16, IntU>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def UQh : N2VQIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+ "u16.f16", v8i16, v8f16, IntU>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ }
+}
+
+// One instantiation per suffix, each bound to its signed/unsigned intrinsics.
+defm VCVTAN : VCVT_FPI<"a", 0b000, int_arm_neon_vcvtas, int_arm_neon_vcvtau>;
+defm VCVTNN : VCVT_FPI<"n", 0b001, int_arm_neon_vcvtns, int_arm_neon_vcvtnu>;
+defm VCVTPN : VCVT_FPI<"p", 0b010, int_arm_neon_vcvtps, int_arm_neon_vcvtpu>;
+defm VCVTMN : VCVT_FPI<"m", 0b011, int_arm_neon_vcvtms, int_arm_neon_vcvtmu>;
+
+// VCVT : Vector Convert Between Floating-Point and Fixed-Point.
+// Naming: 'x' marks the fixed-point side (f2xs = f32 to signed fixed-point,
+// xs2f = signed fixed-point to f32, and so on). All defs are selected through
+// the int_arm_neon_vcvt{fp2fx,fx2fp}{s,u} intrinsics.
+
+// D-register forms; decoded by the custom DecodeVCVTD method.
+let DecoderMethod = "DecodeVCVTD" in {
+def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+ v2i32, v2f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+ v2i32, v2f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+ v2f32, v2i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+ v2f32, v2i32, int_arm_neon_vcvtfxu2fp>;
+// Half-precision forms are only available with full FP16 support.
+let Predicates = [HasNEON, HasFullFP16] in {
+def VCVTh2xsd : N2VCvtD<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16",
+ v4i16, v4f16, int_arm_neon_vcvtfp2fxs>;
+def VCVTh2xud : N2VCvtD<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16",
+ v4i16, v4f16, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2hd : N2VCvtD<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16",
+ v4f16, v4i16, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2hd : N2VCvtD<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16",
+ v4f16, v4i16, int_arm_neon_vcvtfxu2fp>;
+} // Predicates = [HasNEON, HasFullFP16]
+}
+
+// Q-register forms; decoded by the custom DecodeVCVTQ method.
+let DecoderMethod = "DecodeVCVTQ" in {
+def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+ v4i32, v4f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+ v4i32, v4f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+ v4f32, v4i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+ v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
+// Half-precision forms are only available with full FP16 support.
+let Predicates = [HasNEON, HasFullFP16] in {
+def VCVTh2xsq : N2VCvtQ<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16",
+ v8i16, v8f16, int_arm_neon_vcvtfp2fxs>;
+def VCVTh2xuq : N2VCvtQ<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16",
+ v8i16, v8f16, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2hq : N2VCvtQ<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16",
+ v8f16, v8i16, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2hq : N2VCvtQ<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16",
+ v8f16, v8i16, int_arm_neon_vcvtfxu2fp>;
+} // Predicates = [HasNEON, HasFullFP16]
+}
+
+// Assembly aliases: a "vcvt" written with an explicit trailing "#0" operand is
+// accepted and mapped onto the plain integer <-> floating-point VCVT
+// instruction of the same data types (not onto the fixed-point forms).
+
+// f32 <-> i32, D registers.
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0",
+ (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0",
+ (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0",
+ (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0",
+ (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+// f32 <-> i32, Q registers.
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0",
+ (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0",
+ (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0",
+ (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0",
+ (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+
+// f16 <-> i16, D registers.
+def : NEONInstAlias<"vcvt${p}.s16.f16 $Dd, $Dm, #0",
+ (VCVTh2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u16.f16 $Dd, $Dm, #0",
+ (VCVTh2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.s16 $Dd, $Dm, #0",
+ (VCVTs2hd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.u16 $Dd, $Dm, #0",
+ (VCVTu2hd DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+// f16 <-> i16, Q registers.
+def : NEONInstAlias<"vcvt${p}.s16.f16 $Qd, $Qm, #0",
+ (VCVTh2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u16.f16 $Qd, $Qm, #0",
+ (VCVTh2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.s16 $Qd, $Qm, #0",
+ (VCVTs2hq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.u16 $Qd, $Qm, #0",
+ (VCVTu2hq QPR:$Qd, QPR:$Qm, pred:$p)>;
+
+
+// VCVT : Vector Convert Between Half-Precision and Single-Precision.
+// The half-precision side is typed as v4i16 here (not v4f16); both directions
+// are selected through intrinsics and require HasFP16.
+// VCVTf2h narrows v4f32 to four half values; VCVTh2f widens them back.
+def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0,
+ IIC_VUNAQ, "vcvt", "f16.f32",
+ v4i16, v4f32, int_arm_neon_vcvtfp2hf>,
+ Requires<[HasNEON, HasFP16]>;
+def VCVTh2f : N2VLInt<0b11, 0b11, 0b01, 0b10, 0b01110, 0, 0,
+ IIC_VUNAQ, "vcvt", "f32.f16",
+ v4f32, v4i16, int_arm_neon_vcvthf2fp>,
+ Requires<[HasNEON, HasFP16]>;
+
+// Vector Reverse.
+
+// VREV64 : Vector Reverse elements within 64-bit doublewords
+
+// VREV64D / VREV64Q: D- and Q-register forms, selected for NEONvrev64 on Ty.
+//   op19_18   - 2-bit value forwarded to N2V (0b00/0b01/0b10 for the 8-, 16-
+//               and 32-bit instantiations below).
+//   OpcodeStr - mnemonic; Dt - data-type suffix; Ty - vector value type.
+// The two classes differ in the register class, the itinerary and the
+// second-to-last N2V bit argument (0 for D, 1 for Q).
+class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd),
+ (ins DPR:$Vm), IIC_VMOVD,
+ OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd),
+ (ins QPR:$Vm), IIC_VMOVQ,
+ OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>;
+
+def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>;
+def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>;
+def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>;
+// f32 vectors reuse the 32-bit integer instruction.
+def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>;
+
+def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>;
+// f32 vectors reuse the 32-bit integer instruction.
+def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
+
+// VREV32 : Vector Reverse elements within 32-bit words
+
+// VREV32D / VREV32Q: D- and Q-register forms, selected for NEONvrev32 on Ty.
+// Parameters have the same meaning as for the VREV64 classes; only 8- and
+// 16-bit element sizes are instantiated.
+class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd),
+ (ins DPR:$Vm), IIC_VMOVD,
+ OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd),
+ (ins QPR:$Vm), IIC_VMOVQ,
+ OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>;
+
+def VREV32d8 : VREV32D<0b00, "vrev32", "8", v8i8>;
+def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>;
+
+def VREV32q8 : VREV32Q<0b00, "vrev32", "8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>;
+
+// VREV16 : Vector Reverse elements within 16-bit halfwords
+
+// VREV16D / VREV16Q: D- and Q-register forms, selected for NEONvrev16 on Ty.
+// Only the 8-bit element size is instantiated.
+class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd),
+ (ins DPR:$Vm), IIC_VMOVD,
+ OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd),
+ (ins QPR:$Vm), IIC_VMOVQ,
+ OpcodeStr, Dt, "$Vd, $Vm", "",
+ [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>;
+
+def VREV16d8 : VREV16D<0b00, "vrev16", "8", v8i8>;
+def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>;
+
+// Other Vector Shuffles.
+
+// Aligned extractions: really just dropping registers
+
+// AlignedVEXTq: selects vector_extract_subvec of a Q-register vector (SrcTy)
+// into a D-sized vector (DestTy) as a plain EXTRACT_SUBREG, so no instruction
+// is emitted. LaneCVT turns the starting element index into the subregister
+// index operand.
+// NOTE(review): the DSubReg_*_reg transforms are defined elsewhere;
+// presumably they map the start index to the low or high D half -- confirm.
+class AlignedVEXTq<ValueType DestTy, ValueType SrcTy, SDNodeXForm LaneCVT>
+ : Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))),
+ (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>;
+
+def : AlignedVEXTq<v8i8, v16i8, DSubReg_i8_reg>;
+
+def : AlignedVEXTq<v4i16, v8i16, DSubReg_i16_reg>;
+
+def : AlignedVEXTq<v2i32, v4i32, DSubReg_i32_reg>;
+
+def : AlignedVEXTq<v1i64, v2i64, DSubReg_f64_reg>;
+
+// f32 elements use the 32-bit integer index transform.
+def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>;
+
+
+// VEXT : Vector Extract
+
+
+// All of these have a two-operand InstAlias.
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
+// VEXTd: D-register "vext", selected for NEONvext on Ty.
+//   OpcodeStr - mnemonic; Dt - data-type suffix; Ty - vector value type.
+//   immTy     - operand class of $index; it bounds the index accepted for
+//               this element size.
+// The class-level default encodes a 3-bit index in Inst{10-8} with Inst{11}
+// cleared; individual defs override the field placement per element size.
+class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
+ : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd),
+ (ins DPR:$Vn, DPR:$Vm, immTy:$index), NVExtFrm,
+ IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+ [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn),
+ (Ty DPR:$Vm), imm:$index)))]> {
+ bits<3> index;
+ let Inst{11} = 0b0;
+ let Inst{10-8} = index{2-0};
+}
+
+// VEXTq: Q-register "vext", selected for NEONvext on Ty. Parameters are as
+// for VEXTd. The $index operand uses immTy (as VEXTd does) rather than a
+// fixed imm0_15: the per-size defs only encode the low index bits, so an
+// index outside immTy's range must be rejected instead of being truncated.
+// The class-level default encodes a 4-bit index in Inst{11-8}.
+class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
+ : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd),
+ (ins QPR:$Vn, QPR:$Vm, immTy:$index), NVExtFrm,
+ IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+ [(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn),
+ (Ty QPR:$Vm), imm:$index)))]> {
+ bits<4> index;
+ let Inst{11-8} = index{3-0};
+}
+}
+
+// D-register VEXT instantiations. Wider elements place fewer index bits at
+// the top of Inst{10-8} and force the remaining low bits to zero.
+def VEXTd8 : VEXTd<"vext", "8", v8i8, imm0_7> {
+ let Inst{10-8} = index{2-0};
+}
+def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
+ let Inst{10-9} = index{1-0};
+ let Inst{8} = 0b0;
+}
+def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
+ let Inst{10} = index{0};
+ let Inst{9-8} = 0b00;
+}
+// v2f32 reuses the 32-bit integer instruction.
+def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn),
+ (v2f32 DPR:$Vm),
+ (i32 imm:$index))),
+ (VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>;
+
+// Q-register VEXT instantiations. Same scheme over Inst{11-8}: the index
+// occupies the top bits and the low bits are zeroed for wider elements.
+def VEXTq8 : VEXTq<"vext", "8", v16i8, imm0_15> {
+ let Inst{11-8} = index{3-0};
+}
+def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> {
+ let Inst{11-9} = index{2-0};
+ let Inst{8} = 0b0;
+}
+def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> {
+ let Inst{11-10} = index{1-0};
+ let Inst{9-8} = 0b00;
+}
+def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> {
+ let Inst{11} = index{0};
+ let Inst{10-8} = 0b000;
+}
+// v4f32 reuses the 32-bit integer instruction.
+def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn),
+ (v4f32 QPR:$Vm),
+ (i32 imm:$index))),
+ (VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>;
+
+// VTRN : Vector Transpose
+// For all three shuffle families below, the first argument (0b00/0b01/0b10)
+// varies with the element size (8/16/32) and the second is fixed per family.
+
+def VTRNd8 : N2VDShuffle<0b00, 0b00001, "vtrn", "8">;
+def VTRNd16 : N2VDShuffle<0b01, 0b00001, "vtrn", "16">;
+def VTRNd32 : N2VDShuffle<0b10, 0b00001, "vtrn", "32">;
+
+def VTRNq8 : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn", "8">;
+def VTRNq16 : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn", "16">;
+def VTRNq32 : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn", "32">;
+
+// VUZP : Vector Unzip (Deinterleave)
+
+def VUZPd8 : N2VDShuffle<0b00, 0b00010, "vuzp", "8">;
+def VUZPd16 : N2VDShuffle<0b01, 0b00010, "vuzp", "16">;
+// vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+// There is therefore no VUZPd32 def; the assembler alias emits VTRNd32.
+def : NEONInstAlias<"vuzp${p}.32 $Dd, $Dm",
+ (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def VUZPq8 : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp", "8">;
+def VUZPq16 : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp", "16">;
+def VUZPq32 : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp", "32">;
+
+// VZIP : Vector Zip (Interleave)
+
+def VZIPd8 : N2VDShuffle<0b00, 0b00011, "vzip", "8">;
+def VZIPd16 : N2VDShuffle<0b01, 0b00011, "vzip", "16">;
+// vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+// There is therefore no VZIPd32 def; the assembler alias emits VTRNd32.
+def : NEONInstAlias<"vzip${p}.32 $Dd, $Dm",
+ (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def VZIPq8 : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip", "8">;
+def VZIPq16 : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip", "16">;
+def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">;
+
+// Vector Table Lookup and Table Extension.
+
+// VTBL : Vector Table Lookup
+// VTBL1..4 differ in the table operand ($Vn: a list of one to four D
+// registers) and in the 4-bit N3V argument (0b1000..0b1011). Only VTBL1
+// carries a selection pattern; VTBL2..4 have empty pattern lists.
+// Everything up to the closing brace uses the DecodeTBLInstruction decoder.
+let DecoderMethod = "DecodeTBLInstruction" in {
+def VTBL1
+ : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd),
+ (ins VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1,
+ "vtbl", "8", "$Vd, $Vn, $Vm", "",
+ [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 VecListOneD:$Vn, DPR:$Vm)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def VTBL2
+ : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd),
+ (ins VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB2,
+ "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+def VTBL3
+ : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$Vd),
+ (ins VecListThreeD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB3,
+ "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+def VTBL4
+ : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$Vd),
+ (ins VecListFourD:$Vn, DPR:$Vm),
+ NVTBLFrm, IIC_VTB4,
+ "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+} // hasExtraSrcRegAllocReq = 1
+
+// Pseudo forms of the 3- and 4-register lookups that take the table as a
+// single QQPR operand.
+// NOTE(review): presumably these are rewritten into VTBL3/VTBL4 by code
+// outside this file once registers are assigned -- confirm.
+def VTBL3Pseudo
+ : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB3, "", []>;
+def VTBL4Pseudo
+ : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB4, "", []>;
+
+// VTBX : Vector Table Extension
+// Same shape as VTBL, plus an $orig input tied to the destination
+// ("$orig = $Vd"). The fifth N3V argument is 1 instead of 0.
+def VTBX1
+ : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$Vd),
+ (ins DPR:$orig, VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX1,
+ "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd",
+ [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbx1
+ DPR:$orig, VecListOneD:$Vn, DPR:$Vm)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def VTBX2
+ : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$Vd),
+ (ins DPR:$orig, VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX2,
+ "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd", []>;
+def VTBX3
+ : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$Vd),
+ (ins DPR:$orig, VecListThreeD:$Vn, DPR:$Vm),
+ NVTBLFrm, IIC_VTBX3,
+ "vtbx", "8", "$Vd, $Vn, $Vm",
+ "$orig = $Vd", []>;
+def VTBX4
+ : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd),
+ (ins DPR:$orig, VecListFourD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX4,
+ "vtbx", "8", "$Vd, $Vn, $Vm",
+ "$orig = $Vd", []>;
+} // hasExtraSrcRegAllocReq = 1
+
+// Pseudo forms of the 3- and 4-register extensions, with the table in a QQPR
+// operand and $orig tied to $dst.
+def VTBX3Pseudo
+ : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+ IIC_VTBX3, "$orig = $dst", []>;
+def VTBX4Pseudo
+ : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+ IIC_VTBX4, "$orig = $dst", []>;
+} // DecoderMethod = "DecodeTBLInstruction"
+
+// VRINT : Vector Rounding
+// VRINT_FPI builds the four variants of one "vrint<op>" instruction:
+// {D,Q} register width x {f,h} (f32/f16 elements), plus assembler aliases.
+//   op    - mnemonic suffix appended to "vrint".
+//   op9_7 - 3-bit value written to Inst{9-7} of every variant.
+//   Int   - pattern operator used to select the instruction.
+// The base classes are given a fixed 0b100 as their third argument and
+// Inst{9-7} is then overridden with op9_7 in each def body.
+multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
+ let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+ def Df : N2VDIntnp<0b10, 0b10, 0b100, 0, NoItinerary,
+ !strconcat("vrint", op), "f32",
+ v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> {
+ let Inst{9-7} = op9_7;
+ }
+ def Qf : N2VQIntnp<0b10, 0b10, 0b100, 0, NoItinerary,
+ !strconcat("vrint", op), "f32",
+ v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> {
+ let Inst{9-7} = op9_7;
+ }
+ // Half-precision variants additionally require HasFullFP16.
+ def Dh : N2VDIntnp<0b01, 0b10, 0b100, 0, NoItinerary,
+ !strconcat("vrint", op), "f16",
+ v4f16, v4f16, Int>,
+ Requires<[HasV8, HasNEON, HasFullFP16]> {
+ let Inst{9-7} = op9_7;
+ }
+ def Qh : N2VQIntnp<0b01, 0b10, 0b100, 0, NoItinerary,
+ !strconcat("vrint", op), "f16",
+ v8f16, v8f16, Int>,
+ Requires<[HasV8, HasNEON, HasFullFP16]> {
+ let Inst{9-7} = op9_7;
+ }
+ }
+
+ // Aliases accepting the two-type spelling (".f32.f32" / ".f16.f16") for the
+ // instructions defined above; NAME is the defm prefix.
+ def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
+ (!cast<Instruction>(NAME#"Df") DPR:$Dd, DPR:$Dm)>;
+ def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
+ (!cast<Instruction>(NAME#"Qf") QPR:$Qd, QPR:$Qm)>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Dd, $Dm"),
+ (!cast<Instruction>(NAME#"Dh") DPR:$Dd, DPR:$Dm)>;
+ def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Qd, $Qm"),
+ (!cast<Instruction>(NAME#"Qh") QPR:$Qd, QPR:$Qm)>;
+ }
+}
+
+// One instantiation per suffix; note that op9_7 values 0b100 and 0b110 are
+// not used here.
+defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>;
+defm VRINTXN : VRINT_FPI<"x", 0b001, int_arm_neon_vrintx>;
+defm VRINTAN : VRINT_FPI<"a", 0b010, int_arm_neon_vrinta>;
+defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>;
+defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>;
+defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
+
+// Cryptography instructions
+// Helper classes for the AES and SHA instructions. All of them require HasV8
+// and HasCrypto, live in the "v8Crypto" decoder namespace, use the
+// NEONThumb2DataIPostEncoder post-encoder and are marked side-effect free.
+// In every class 'op' is the mnemonic suffix (after "aes" / "sha") and 'Int'
+// is the pattern operator used for selection (null_frag means the def gets
+// no pattern of its own).
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+ DecoderNamespace = "v8Crypto", hasSideEffects = 0 in {
+ // AES / AES2Op: v16i8 forms built on N2VQIntXnp and N2VQIntX2np respectively.
+ // Note the parameters are declared as (op7, op6) but forwarded to the base
+ // class in the order (op6, op7).
+ class AES<string op, bit op7, bit op6, SDPatternOperator Int>
+ : N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+ !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+ Requires<[HasV8, HasCrypto]>;
+ class AES2Op<string op, bit op7, bit op6, SDPatternOperator Int>
+ : N2VQIntX2np<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+ !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+ Requires<[HasV8, HasCrypto]>;
+ // N2SHA / N2SHA2Op: v4i32 forms on the same two base classes, with
+ // op17_16 and op10_8 supplied by the instantiation.
+ class N2SHA<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+ SDPatternOperator Int>
+ : N2VQIntXnp<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+ !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+ Requires<[HasV8, HasCrypto]>;
+ class N2SHA2Op<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+ SDPatternOperator Int>
+ : N2VQIntX2np<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+ !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+ Requires<[HasV8, HasCrypto]>;
+ // N3SHA3Op: v4i32 form built on N3VQInt3np (N3RegFrm), parameterised by
+ // op27_23 and op21_20.
+ class N3SHA3Op<string op, bits<5> op27_23, bits<2> op21_20, SDPatternOperator Int>
+ : N3VQInt3np<op27_23, op21_20, 0b1100, 1, 0, N3RegFrm, NoItinerary,
+ !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>,
+ Requires<[HasV8, HasCrypto]>;
+}
+
+// AES instructions: aesd/aese use the AES2Op class, aesimc/aesmc use AES.
+def AESD : AES2Op<"d", 0, 1, int_arm_neon_aesd>;
+def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>;
+def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>;
+def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>;
+
+// SHA instructions. SHA1H, SHA1C, SHA1M and SHA1P are defined with null_frag;
+// their intrinsics take an i32 operand and are matched by separate patterns.
+def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, null_frag>;
+def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>;
+def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>;
+def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, null_frag>;
+def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, null_frag>;
+def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, null_frag>;
+def SHA1SU0 : N3SHA3Op<"1su0", 0b00100, 0b11, int_arm_neon_sha1su0>;
+def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>;
+def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>;
+def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>;
+
+// int_arm_neon_sha1h takes and returns an i32, while SHA1H works on vector
+// registers. Copy the GPR value into an SPR, place it in ssub_0 of the
+// instruction's operand with SUBREG_TO_REG, run SHA1H, then extract ssub_0 of
+// the result and copy it back to a GPR.
+def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)),
+ (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG
+ (SHA1H (SUBREG_TO_REG (i64 0),
+ (f32 (COPY_TO_REGCLASS i32:$Rn, SPR)),
+ ssub_0)),
+ ssub_0)), GPR)>;
+
+// sha1c / sha1m / sha1p: the i32 $hash_e operand is moved into an SPR and
+// placed in ssub_0 of the second register operand; the two v4i32 operands are
+// passed through unchanged.
+def : Pat<(v4i32 (int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+ (SHA1C v4i32:$hash_abcd,
+ (SUBREG_TO_REG (i64 0),
+ (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+ ssub_0),
+ v4i32:$wk)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+ (SHA1M v4i32:$hash_abcd,
+ (SUBREG_TO_REG (i64 0),
+ (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+ ssub_0),
+ v4i32:$wk)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+ (SHA1P v4i32:$hash_abcd,
+ (SUBREG_TO_REG (i64 0),
+ (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+ ssub_0),
+ v4i32:$wk)>;
+
+//===----------------------------------------------------------------------===//
+// NEON instructions for single-precision FP math
+//===----------------------------------------------------------------------===//
+
+// N2VSPat: select a unary scalar f32 operation (OpNode) with the two-element
+// NEON instruction Inst. The scalar is inserted as ssub_0 of an undefined
+// v2f32 constrained to DPR_VFP2, Inst is applied, and ssub_0 of the result
+// (also constrained to DPR_VFP2) is extracted. The other lane is never read.
+class N2VSPat<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode SPR:$a)),
+ (EXTRACT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (Inst
+ (INSERT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+ SPR:$a, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+// N3VSPat: binary counterpart of N2VSPat. Both f32 operands are inserted as
+// ssub_0 of separate undefined DPR_VFP2 registers, the vector instruction
+// Inst is applied, and ssub_0 of the result is extracted.
+class N3VSPat<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)),
+ (EXTRACT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (Inst
+ (INSERT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+ SPR:$a, ssub_0),
+ (INSERT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+ SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+// N3VSMulOpPat: select the scalar f32 expression
+// OpNode($acc, MulNode($a, $b)) with a three-operand vector instruction Inst.
+// The accumulator and both multiplicands are each inserted as ssub_0 of an
+// undefined DPR_VFP2 register, passed to Inst in the order (acc, a, b), and
+// ssub_0 of the result is extracted.
+class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
+ (EXTRACT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (Inst
+ (INSERT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+ SPR:$acc, ssub_0),
+ (INSERT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+ SPR:$a, ssub_0),
+ (INSERT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+ SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+// NVCVTIFPat: select a scalar integer-to-f32 conversion (OpNode on a GPR) with
+// the vector conversion Inst. The GPR value is copied into the SPR class,
+// inserted as ssub_0 of an undefined v2f32, converted, and ssub_0 of the
+// result is extracted as f32. Unlike the N*VSPat classes, no DPR_VFP2
+// register-class copies are spelled out here.
+class NVCVTIFPat<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode GPR:$a)),
+ (f32 (EXTRACT_SUBREG
+ (v2f32 (Inst
+ (INSERT_SUBREG
+ (v2f32 (IMPLICIT_DEF)),
+ (i32 (COPY_TO_REGCLASS GPR:$a, SPR)), ssub_0))),
+ ssub_0))>;
+// NVCVTFIPat: select a scalar f32-to-integer conversion (OpNode on an SPR)
+// with the vector conversion Inst. The scalar is inserted as ssub_0 of an
+// undefined v2f32, converted, and ssub_0 of the result is extracted as i32.
+class NVCVTFIPat<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(i32 (OpNode SPR:$a)),
+ (i32 (EXTRACT_SUBREG
+ (v2f32 (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+ SPR:$a, ssub_0))),
+ ssub_0))>;
+
+// Scalar f32 operations mapped onto the D-register (two-lane) NEON
+// instructions via the helper pattern classes.
+def : N3VSPat<fadd, VADDfd>;
+def : N3VSPat<fsub, VSUBfd>;
+def : N3VSPat<fmul, VMULfd>;
+// Multiply-accumulate: the non-fused VMLA/VMLS forms are used when
+// DontUseFusedMAC holds, the fused VFMA/VFMS forms when UseFusedMAC holds
+// (with HasVFP4).
+def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
+ Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
+ Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
+ Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
+ Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
+def : N2VSPat<fabs, VABSfd>;
+def : N2VSPat<fneg, VNEGfd>;
+// fmaxnan / fminnan are gated on HasNEON alone.
+def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>;
+def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>;
+// Scalar f32 <-> i32 conversions through the two-lane VCVT instructions.
+def : NVCVTFIPat<fp_to_sint, VCVTf2sd>;
+def : NVCVTFIPat<fp_to_uint, VCVTf2ud>;
+def : NVCVTIFPat<sint_to_fp, VCVTs2fd>;
+def : NVCVTIFPat<uint_to_fp, VCVTu2fd>;
+
+// NEON doesn't have any f64 conversions, so provide patterns to make
+// sure the VFP conversions match when extracting from a vector.
+// The i32 lane is read directly as an S subregister of the D or Q source
+// (index computed by SSubReg_f32_reg from the lane number) and fed to the
+// VFP VSITOD / VUITOD instruction.
+def : VFPPat<(f64 (sint_to_fp (extractelt (v2i32 DPR:$src), imm:$lane))),
+ (VSITOD (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+def : VFPPat<(f64 (sint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))),
+ (VSITOD (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+def : VFPPat<(f64 (uint_to_fp (extractelt (v2i32 DPR:$src), imm:$lane))),
+ (VUITOD (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+def : VFPPat<(f64 (uint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))),
+ (VUITOD (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+
+
+// Prefer VMOVDRR for i32 -> f32 bitcasts, it can write all DPR registers.
+// The same GPR is passed for both halves of the D register and only ssub_0
+// of the result is used. Applies when DontUseVMOVSR holds.
+def : Pat<(f32 (bitconvert GPR:$a)),
+ (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
+ Requires<[HasNEON, DontUseVMOVSR]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// bit_convert
+// 64-bit (D-register) bitconverts that need no instruction: the source
+// register is simply reinterpreted as the destination type.
+// Conversions that change the element size are only free on little-endian
+// targets and are wrapped in IsLE. The unpredicated patterns keep the element
+// size (v1i64 <-> f64, v2i32 <-> v2f32) and apply to both endiannesses.
+let Predicates = [IsLE] in {
+ def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>;
+}
+def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>;
+let Predicates = [IsLE] in {
+ def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
+}
+def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
+let Predicates = [IsLE] in {
+ def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
+}
+def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
+let Predicates = [IsLE] in {
+ def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+}
+def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
+let Predicates = [IsLE] in {
+ def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
+}
+
+// 128-bit (Q-register) bitconverts that need no instruction, following the
+// same scheme as the 64-bit ones: conversions that change the element size
+// are restricted to little-endian (IsLE), while the same-element-size pairs
+// (v2i64 <-> v2f64, v4i32 <-> v4f32) are unpredicated.
+let Predicates = [IsLE] in {
+ def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+}
+def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+let Predicates = [IsLE] in {
+ def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+}
+def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+let Predicates = [IsLE] in {
+ def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+}
+def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+let Predicates = [IsLE] in {
+ def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+}
+def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+let Predicates = [IsLE] in {
+ def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
+}
+
+let Predicates = [IsBE] in { // big-endian only: every bitconvert listed here is selected to a VREV
+ // 64 bit conversions (DPR): VREV<N>d<M>, N = wider and M = narrower lane size of the two types
+ def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (VREV64d8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
+
+ // 128 bit conversions (QPR): same rule as above, using the VREV<N>q<M> forms
+ def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+} // end Predicates = [IsBE]; pairs with equal lane size (e.g. v2i32 <-> v2f32) have no pattern in this block
+
+// Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+ (VREV64q8 (VLD1q8 addrmode6:$addr))>, Requires<[IsBE]>; // load with VLD1q8, then VREV64q8
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q8 addrmode6:$addr, (VREV64q8 QPR:$value))>, Requires<[IsBE]>; // VREV64q8, then store with VST1q8
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+ (VREV64q16 (VLD1q16 addrmode6:$addr))>, Requires<[IsBE]>; // load with VLD1q16, then VREV64q16
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q16 addrmode6:$addr, (VREV64q16 QPR:$value))>, Requires<[IsBE]>; // VREV64q16, then store with VST1q16
+
+// Fold extracting an element out of a v2i32 into a vfp register.
+def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))),
+ (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>; // $lane is turned into a sub-register index by SSubReg_f32_reg
+
+// Vector lengthening move with load, matching extending loads.
+
+// extload, zextload and sextload for a standard lengthening load. Example:
+// Lengthen_Single<"8", "i16", "8"> =
+// Pat<(v8i16 (extloadvi8 addrmode6:$addr)),
+// (VMOVLuv8i16 (VLD1d8 addrmode6:$addr))>;
+// extload and zextload both select VMOVLu; sextload selects VMOVLs.
+multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> { // SrcTy is the bare element width, e.g. "8"
+ let AddedComplexity = 10 in {
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadvi" # SrcTy) addrmode6:$addr)),
+ (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
+ (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadvi" # SrcTy) addrmode6:$addr)),
+ (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
+ (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadvi" # SrcTy) addrmode6:$addr)),
+ (!cast<Instruction>("VMOVLsv" # DestLanes # DestTy)
+ (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+ }
+}
+
+// extload, zextload and sextload for a lengthening load which only uses
+// half the lanes available. Example:
+// Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16"> =
+// Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)),
+// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+// (f64 (IMPLICIT_DEF)), (i32 0))),
+// dsub_0)>;
+multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy,
+ string InsnLanes, string InsnTy> { // InsnLanes/InsnTy name the VMOVL variant; SrcTy includes the "i", e.g. "i8"
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0)>; // only the dsub_0 half of the VMOVL result is used
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0)>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0)>;
+}
+
+// The following multiclass is basically a copy of the
+// Lengthen_HalfSingle definition above, however with an additional parameter
+// "RevLanes" to select the correct VREV32dXX instruction. This is to convert
+// data loaded by VLD1LN into proper vector format in big endian mode.
+multiclass Lengthen_HalfSingle_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+ string InsnLanes, string InsnTy, string RevLanes> { // RevLanes is appended to "VREV32d", e.g. "8" -> VREV32d8
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)>;
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)>;
+}
+
+// extload, zextload and sextload implemented as a load followed by two
+// lengthening moves (VMOVL), to quadruple the initial length.
+//
+// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32"> =
+// Pat<(v4i32 (extloadvi8 addrmode6oneL32:$addr)),
+// (VMOVLuv4i32
+// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+// (f64 (IMPLICIT_DEF)),
+// (i32 0))),
+// dsub_0))>;
+// The second VMOVL result is used whole; there is no outer EXTRACT_SUBREG.
+multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
+ string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+ string Insn2Ty> { // Insn1* names the inner (first) VMOVL, Insn2* the outer (second) one
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0))>;
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0))>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0))>;
+}
+
+// The following multiclass is basically a copy of the
+// Lengthen_Double definition above, however with an additional parameter
+// "RevLanes" to select the correct VREV32dXX instruction. This is to convert
+// data loaded by VLD1LN into proper vector format in big endian mode.
+multiclass Lengthen_Double_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+ string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+ string Insn2Ty, string RevLanes> { // RevLanes is appended to "VREV32d"; the VREV sits between the load and the first VMOVL
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0))>;
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0))>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0))>;
+}
+
+// extload, zextload and sextload implemented as a load followed by two
+// lengthening moves (VMOVL), to quadruple the initial length, but which ends
+// up only requiring half the available lanes (a 64-bit outcome instead of a 128-bit).
+//
+// Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> =
+// Pat<(v2i32 (extloadvi8 addrmode6:$addr))
+// (EXTRACT_SUBREG (VMOVLuv4i32
+// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd16 addrmode6:$addr,
+// (f64 (IMPLICIT_DEF)), (i32 0))),
+// dsub_0)),
+// dsub_0)>;
+multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy,
+ string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+ string Insn2Ty> { // Insn1* names the inner (first) VMOVL, Insn2* the outer (second) one
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0)),
+ dsub_0)>; // dsub_0 is taken after each VMOVL
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0)),
+ dsub_0)>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0)),
+ dsub_0)>;
+}
+
+// Big-endian counterpart of Lengthen_HalfDouble. The pattern chain is the
+// same, except that the value produced by VLD1LNd16 is first passed through
+// VREV16d8 to convert the data loaded by VLD1LN into proper vector format
+// in big endian mode.
+multiclass Lengthen_HalfDouble_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+ string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+ string Insn2Ty> {
+ // Any-extending load: uses the unsigned VMOVL forms.
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG
+ (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (VREV16d8
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)),
+ dsub_0)>;
+ // Zero-extending load: uses the unsigned VMOVL forms.
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG
+ (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (VREV16d8
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)),
+ dsub_0)>;
+ // Sign-extending load: uses the signed VMOVL forms.
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG
+ (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+ (VREV16d8
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)),
+ dsub_0)>;
+}
+
+defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16
+defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32
+defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64
+
+let Predicates = [IsLE] in { // little-endian: no VREV in the pattern
+ defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
+ defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
+
+ // Double lengthening - v4i8 -> v4i16 -> v4i32
+ defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">;
+ // v2i8 -> v2i16 -> v2i32
+ defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">;
+ // v2i16 -> v2i32 -> v2i64
+ defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
+}
+
+let Predicates = [IsBE] in { // big-endian: same conversions, last argument (where present) picks the VREV32d variant
+ defm : Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8">; // v4i8 -> v4i16, via VREV32d8
+ defm : Lengthen_HalfSingle_Big_Endian<"2", "i32", "i16", "4", "i32", "16">; // v2i16 -> v2i32, via VREV32d16
+
+ // Double lengthening - v4i8 -> v4i16 -> v4i32 (via VREV32d8)
+ defm : Lengthen_Double_Big_Endian<"4", "i32", "i8", "8", "i16", "4", "i32", "8">;
+ // v2i8 -> v2i16 -> v2i32 (VREV16d8 is fixed inside the multiclass)
+ defm : Lengthen_HalfDouble_Big_Endian<"2", "i32", "i8", "8", "i16", "4", "i32">;
+ // v2i16 -> v2i32 -> v2i64 (via VREV32d16)
+ defm : Lengthen_Double_Big_Endian<"2", "i64", "i16", "4", "i32", "2", "i64", "16">;
+}
+
+// Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64: VLD1LNd16 followed by three VMOVLs.
+let Predicates = [IsLE] in {
+ def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
+ (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; // any-extend: unsigned VMOVL chain
+ def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
+ (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; // zero-extend: unsigned VMOVL chain
+ def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
+ (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; // sign-extend: signed VMOVL chain
+}
+// Big-endian triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64.
+// These mirror the little-endian patterns, except that the value produced by
+// VLD1LNd16 is first passed through VREV16d8 to convert the data loaded by
+// VLD1LN into proper vector format in big endian mode.
+let Predicates = [IsBE] in {
+ // Any-extending load: unsigned VMOVL chain.
+ def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
+ (VMOVLuv2i64
+ (EXTRACT_SUBREG
+ (VMOVLuv4i32
+ (EXTRACT_SUBREG
+ (VMOVLuv8i16
+ (VREV16d8
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)),
+ dsub_0))>;
+ // Zero-extending load: unsigned VMOVL chain.
+ def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
+ (VMOVLuv2i64
+ (EXTRACT_SUBREG
+ (VMOVLuv4i32
+ (EXTRACT_SUBREG
+ (VMOVLuv8i16
+ (VREV16d8
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)),
+ dsub_0))>;
+ // Sign-extending load: signed VMOVL chain.
+ def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
+ (VMOVLsv2i64
+ (EXTRACT_SUBREG
+ (VMOVLsv4i32
+ (EXTRACT_SUBREG
+ (VMOVLsv8i16
+ (VREV16d8
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)),
+ dsub_0))>;
+}
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+def : VFP2InstAlias<"fmdhr${p} $Dd, $Rn",
+ (VSETLNi32 DPR:$Dd, GPR:$Rn, 1, pred:$p)>; // fmdhr: VSETLNi32 with lane operand 1
+def : VFP2InstAlias<"fmdlr${p} $Dd, $Rn",
+ (VSETLNi32 DPR:$Dd, GPR:$Rn, 0, pred:$p)>; // fmdlr: VSETLNi32 with lane operand 0
+
+// VAND/VBIC/VEOR/VORR accept but do not require a type suffix. Each mnemonic gets a D-register and a Q-register alias.
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm",
+ (VANDd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm",
+ (VANDq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm",
+ (VBICd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm",
+ (VBICq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm",
+ (VEORd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm",
+ (VEORq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
+ (VORRd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
+ (VORRq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+// ... two-operand aliases: $Vdn is passed as both destination and first source (vand/veor/vorr only in this group)
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
+ (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
+ (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm",
+ (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm",
+ (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
+ (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
+ (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+// ... immediates: "vand" with an immediate is matched to the VBIC-immediate instructions through the nImmSplatNot* operands
+def : NEONInstAlias<"vand${p}.i16 $Vd, $imm",
+ (VBICiv4i16 DPR:$Vd, nImmSplatNotI16:$imm, pred:$p)>;
+def : NEONInstAlias<"vand${p}.i32 $Vd, $imm",
+ (VBICiv2i32 DPR:$Vd, nImmSplatNotI32:$imm, pred:$p)>;
+def : NEONInstAlias<"vand${p}.i16 $Vd, $imm",
+ (VBICiv8i16 QPR:$Vd, nImmSplatNotI16:$imm, pred:$p)>;
+def : NEONInstAlias<"vand${p}.i32 $Vd, $imm",
+ (VBICiv4i32 QPR:$Vd, nImmSplatNotI32:$imm, pred:$p)>;
+
+
+// VLD1 single-lane pseudo-instructions. These need special handling for the lane index that
+// an InstAlias can't handle, so we use these instead. Address operand: .8 alignNone, .16 align16, .32 align32.
+def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr",
+ (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr",
+ (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+ pred:$p)>;
+def VLD1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr",
+ (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+// "WB" variants: "_fixed" uses the "$list, $addr!" syntax, "_register" takes an extra rGPR:$Rm operand.
+def VLD1LNdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr!",
+ (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD1LNdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr!",
+ (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+ pred:$p)>;
+def VLD1LNdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr!",
+ (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VLD1LNdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD1LNdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD1LNdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+
+// VST1 single-lane pseudo-instructions. These need special handling for the lane index that
+// an InstAlias can't handle, so we use these instead. Address operand: .8 alignNone, .16 align16, .32 align32.
+def VST1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr",
+ (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr",
+ (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+ pred:$p)>;
+def VST1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr",
+ (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+// "WB" variants: "_fixed" uses the "$list, $addr!" syntax, "_register" takes an extra rGPR:$Rm operand.
+def VST1LNdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr!",
+ (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST1LNdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr!",
+ (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+ pred:$p)>;
+def VST1LNdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr!",
+ (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VST1LNdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST1LNdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST1LNdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+// VLD2 single-lane pseudo-instructions. These need special handling for the lane index that
+// an InstAlias can't handle, so we use these instead. Address operand: .8 align16, .16 align32, .32 align64.
+def VLD2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr",
+ (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+ pred:$p)>;
+def VLD2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
+ (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VLD2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
+ (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
+ (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VLD2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
+ (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+// "WB" variants: "_fixed" uses the "$list, $addr!" syntax, "_register" takes an extra rGPR:$Rm operand.
+def VLD2LNdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr!",
+ (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+ pred:$p)>;
+def VLD2LNdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
+ (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VLD2LNdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
+ (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VLD2LNqWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
+ (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VLD2LNqWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
+ (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VLD2LNdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD2LNdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD2LNdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD2LNqWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD2LNqWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+
+// VST2 single-lane pseudo-instructions. These need special handling for the lane index that
+// an InstAlias can't handle, so we use these instead. Address operand: .8 align16, .16 align32, .32 align64.
+def VST2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr",
+ (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+ pred:$p)>;
+def VST2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
+ (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VST2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
+ (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VST2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
+ (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VST2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
+ (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+// "WB" variants: "_fixed" uses the "$list, $addr!" syntax, "_register" takes an extra rGPR:$Rm operand.
+def VST2LNdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr!",
+ (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+ pred:$p)>;
+def VST2LNdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
+ (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VST2LNdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
+ (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VST2LNqWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
+ (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VST2LNqWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
+ (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VST2LNdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST2LNdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
+ (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST2LNdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST2LNqWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
+ (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST2LNqWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+// VLD3 all-lanes pseudo-instructions. These need special handling that an InstAlias can't handle, so we use these
+// instead. NOTE(review): the original text said "lane index", but the operands are *AllLanes lists - confirm wording.
+def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+ (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+ (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+ (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+ (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+ (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+ (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+// "WB" variants: "_fixed" uses the "$list, $addr!" syntax, "_register" takes an extra rGPR:$Rm operand.
+def VLD3DUPdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+ pred:$p)>;
+def VLD3DUPdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+
+// VLD3 single-lane pseudo-instructions. These need special handling for the lane index that
+// an InstAlias can't handle, so we use these instead. Every variant here uses addrmode6alignNone.
+def VLD3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+ (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+ (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+ (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+ (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+ (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+// "WB" variants: "_fixed" uses the "$list, $addr!" syntax, "_register" takes an extra rGPR:$Rm operand.
+def VLD3LNdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD3LNdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD3LNdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD3LNqWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD3LNqWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VLD3LNdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3LNdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeDHWordIndexed:$list,
+ addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VLD3LNdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3LNqWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeQHWordIndexed:$list,
+ addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VLD3LNqWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+// VLD3 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VLD3dAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+
+def VLD3dWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeD:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3dWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeD:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3dWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeD:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+// VST3 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+ (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+ (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+ (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+ (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+ (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+
+def VST3LNdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST3LNdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST3LNdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST3LNqWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST3LNqWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+ pred:$p)>;
+def VST3LNdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST3LNdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeDHWordIndexed:$list,
+ addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VST3LNdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST3LNqWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeQHWordIndexed:$list,
+ addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VST3LNqWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+
+// VST3 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VST3dAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+
+def VST3dWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeD:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST3dWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeD:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST3dWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeD:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeQ:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+// VLD4 all-lanes pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+ (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+ pred:$p)>;
+def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+ (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+ pred:$p)>;
+def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+ (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr,
+ pred:$p)>;
+def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+ (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+ pred:$p)>;
+def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+ (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+ pred:$p)>;
+def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+ (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr,
+ pred:$p)>;
+
+def VLD4DUPdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+ (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+ pred:$p)>;
+def VLD4DUPdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+ (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+ pred:$p)>;
+def VLD4DUPdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+ (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr,
+ pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+ (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+ pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+ (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+ pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+ (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr,
+ pred:$p)>;
+def VLD4DUPdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4DUPdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4DUPdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourDAllLanes:$list,
+ addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourQAllLanes:$list,
+ addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>;
+
+
+// VLD4 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+ (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VLD4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+ (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VLD4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+ (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+ pred:$p)>;
+def VLD4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+ (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VLD4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+ (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+ pred:$p)>;
+
+def VLD4LNdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+ (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VLD4LNdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+ (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VLD4LNdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+ (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+ pred:$p)>;
+def VLD4LNqWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+ (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VLD4LNqWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+ (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+ pred:$p)>;
+def VLD4LNdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4LNdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4LNdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourDWordIndexed:$list,
+ addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+def VLD4LNqWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4LNqWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourQWordIndexed:$list,
+ addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+
+
+
+// VLD4 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VLD4dAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4dAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4dAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4qAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4qAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4qAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+
+def VLD4dWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4dWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4dWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4qWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4qWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4qWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VLD4dWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4dWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4dWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+// VST4 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+ (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VST4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+ (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VST4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+ (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+ pred:$p)>;
+def VST4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+ (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VST4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+ (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+ pred:$p)>;
+
+def VST4LNdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+ (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+ pred:$p)>;
+def VST4LNdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+ (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VST4LNdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+ (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+ pred:$p)>;
+def VST4LNqWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+ (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+ pred:$p)>;
+def VST4LNqWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+ (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+ pred:$p)>;
+def VST4LNdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST4LNdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST4LNdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourDWordIndexed:$list,
+ addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+def VST4LNqWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST4LNqWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourQWordIndexed:$list,
+ addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+
+
+// VST4 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VST4dAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4dAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4dAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4qAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4qAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4qAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+
+def VST4dWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4dWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4dWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4qWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4qWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4qWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ pred:$p)>;
+def VST4dWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+ rGPR:$Rm, pred:$p)>;
+
+// VMOV/VMVN take an optional datatype suffix. VMOV maps to VORR Vd, Vm, Vm.
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+ (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+ (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+ (VMVNd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+ (VMVNq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
+// VCLE (register) is an assembler alias for VCGE w/ the operands reversed.
+// D-register versions.
+def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm",
+ (VCGEsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s16 $Dd, $Dn, $Dm",
+ (VCGEsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s32 $Dd, $Dn, $Dm",
+ (VCGEsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u8 $Dd, $Dn, $Dm",
+ (VCGEuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u16 $Dd, $Dn, $Dm",
+ (VCGEuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm",
+ (VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm",
+ (VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vcle${p}.f16 $Dd, $Dn, $Dm",
+ (VCGEhd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+// Q-register versions.
+def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm",
+ (VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s16 $Qd, $Qn, $Qm",
+ (VCGEsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s32 $Qd, $Qn, $Qm",
+ (VCGEsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u8 $Qd, $Qn, $Qm",
+ (VCGEuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u16 $Qd, $Qn, $Qm",
+ (VCGEuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm",
+ (VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm",
+ (VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vcle${p}.f16 $Qd, $Qn, $Qm",
+ (VCGEhq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+
+// VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
+// D-register versions.
+def : NEONInstAlias<"vclt${p}.s8 $Dd, $Dn, $Dm",
+ (VCGTsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s16 $Dd, $Dn, $Dm",
+ (VCGTsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s32 $Dd, $Dn, $Dm",
+ (VCGTsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u8 $Dd, $Dn, $Dm",
+ (VCGTuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u16 $Dd, $Dn, $Dm",
+ (VCGTuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm",
+ (VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm",
+ (VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vclt${p}.f16 $Dd, $Dn, $Dm",
+ (VCGThd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+// Q-register versions.
+def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm",
+ (VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s16 $Qd, $Qn, $Qm",
+ (VCGTsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s32 $Qd, $Qn, $Qm",
+ (VCGTsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u8 $Qd, $Qn, $Qm",
+ (VCGTuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u16 $Qd, $Qn, $Qm",
+ (VCGTuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm",
+ (VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm",
+ (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vclt${p}.f16 $Qd, $Qn, $Qm",
+ (VCGThq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+
+// VSWP allows, but does not require, a type suffix.
+defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
+ (VSWPd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
+ (VSWPq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
+// VBIF, VBIT, and VBSL allow, but do not require, a type suffix.
+defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm",
+ (VBIFd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm",
+ (VBITd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm",
+ (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm",
+ (VBIFq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm",
+ (VBITq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm",
+ (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+
+// "vmov Vd, #-imm" can be handled via "vmvn", and "vmvn Vd, #-imm" via "vmov".
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm",
+ (VMVNv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm",
+ (VMVNv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm",
+ (VMOVv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm",
+ (VMOVv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+
+// 'gas' compatibility aliases for quad-word instructions. Strictly speaking,
+// these should restrict to just the Q register variants, but the register
+// classes are enough to match correctly regardless, so we keep it simple
+// and just use MnemonicAlias.
+def : NEONMnemonicAlias<"vbicq", "vbic">;
+def : NEONMnemonicAlias<"vandq", "vand">;
+def : NEONMnemonicAlias<"veorq", "veor">;
+def : NEONMnemonicAlias<"vorrq", "vorr">;
+
+def : NEONMnemonicAlias<"vmovq", "vmov">;
+def : NEONMnemonicAlias<"vmvnq", "vmvn">;
+// Explicit versions for floating point so that the FPImm variants get
+// handled early. The parser gets confused otherwise.
+def : NEONMnemonicAlias<"vmovq.f32", "vmov.f32">;
+def : NEONMnemonicAlias<"vmovq.f64", "vmov.f64">;
+
+def : NEONMnemonicAlias<"vaddq", "vadd">;
+def : NEONMnemonicAlias<"vsubq", "vsub">;
+
+def : NEONMnemonicAlias<"vminq", "vmin">;
+def : NEONMnemonicAlias<"vmaxq", "vmax">;
+
+def : NEONMnemonicAlias<"vmulq", "vmul">;
+
+def : NEONMnemonicAlias<"vabsq", "vabs">;
+
+def : NEONMnemonicAlias<"vshlq", "vshl">;
+def : NEONMnemonicAlias<"vshrq", "vshr">;
+
+def : NEONMnemonicAlias<"vcvtq", "vcvt">;
+
+def : NEONMnemonicAlias<"vcleq", "vcle">;
+def : NEONMnemonicAlias<"vceqq", "vceq">;
+
+def : NEONMnemonicAlias<"vzipq", "vzip">;
+def : NEONMnemonicAlias<"vswpq", "vswp">;
+
+def : NEONMnemonicAlias<"vrecpeq.f32", "vrecpe.f32">;
+def : NEONMnemonicAlias<"vrecpeq.u32", "vrecpe.u32">;
+
+
+// Alias for loading floating point immediates that aren't representable
+// using the vmov.f32 encoding but whose bit pattern is representable using
+// the .i32 encoding.
+def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
+ (VMOVv4i32 QPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
+ (VMOVv2i32 DPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
new file mode 100644
index 000000000000..a681f64b05e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -0,0 +1,1610 @@
+//===-- ARMInstrThumb.td - Thumb support for ARM -----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Thumb specific DAG Nodes.
+//
+
+def imm_sr_XFORM: SDNodeXForm<imm, [{
+ unsigned Imm = N->getZExtValue();
+ return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32); // 32 is emitted as 0; every other value passes through
+}]>;
+def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; }
+def imm_sr : Operand<i32>, PatLeaf<(imm), [{
+ uint64_t Imm = N->getZExtValue();
+ return Imm > 0 && Imm <= 32; // accepts 1..32 inclusive
+}], imm_sr_XFORM> {
+ let PrintMethod = "printThumbSRImm";
+ let ParserMatchClass = ThumbSRImmAsmOperand;
+}
+
+def imm_comp_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N),
+ MVT::i32);
+}]>;
+
+def imm0_7_neg : PatLeaf<(i32 imm), [{
+ return (uint32_t)-N->getZExtValue() < 8; // negated value, truncated to 32 bits, must be 0..7
+}], imm_neg_XFORM>;
+
+def imm0_255_comp : PatLeaf<(i32 imm), [{
+ return ~((uint32_t)N->getZExtValue()) < 256; // bitwise complement of the low 32 bits must be 0..255
+}]>;
+
+def imm8_255 : ImmLeaf<i32, [{
+ return Imm >= 8 && Imm < 256;
+}]>;
+def imm8_255_neg : PatLeaf<(i32 imm), [{
+ unsigned Val = -N->getZExtValue(); // the range check below applies to the negated constant
+ return Val >= 8 && Val < 256;
+}], imm_neg_XFORM>;
+
+// Break imm's up into two pieces: an immediate + a left shift. This uses
+// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt
+// to get the val/shift pieces.
+def thumb_immshifted : PatLeaf<(imm), [{
+ return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
+}]>;
+
+def thumb_immshifted_val : SDNodeXForm<imm, [{
+ unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue());
+ return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+def thumb_immshifted_shamt : SDNodeXForm<imm, [{
+ unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue());
+ return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+def imm256_510 : ImmLeaf<i32, [{
+ return Imm >= 256 && Imm < 511; // 256..510 inclusive
+}]>;
+
+def thumb_imm256_510_addend : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() - 255, SDLoc(N), MVT::i32); // the constant minus 255
+}]>;
+
+// Scaled 4 immediate.
+def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
+def t_imm0_1020s4 : Operand<i32> {
+ let PrintMethod = "printThumbS4ImmOperand";
+ let ParserMatchClass = t_imm0_1020s4_asmoperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; }
+def t_imm0_508s4 : Operand<i32> {
+ let PrintMethod = "printThumbS4ImmOperand";
+ let ParserMatchClass = t_imm0_508s4_asmoperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+// Alias use only, so no printer is necessary.
+def t_imm0_508s4_neg_asmoperand: AsmOperandClass { let Name = "Imm0_508s4Neg"; }
+def t_imm0_508s4_neg : Operand<i32> {
+ let ParserMatchClass = t_imm0_508s4_neg_asmoperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Define Thumb specific addressing modes.
+
+// unsigned 8-bit, 2-scaled memory offset
+class OperandUnsignedOffset_b8s2 : AsmOperandClass {
+ let Name = "UnsignedOffset_b8s2";
+ let PredicateMethod = "isUnsignedOffset<8, 2>";
+}
+
+def UnsignedOffset_b8s2 : OperandUnsignedOffset_b8s2;
+
+// Thumb-style PC-relative operand: signed, 8 bits of magnitude,
+// two bits of shift. Can be written as either [pc, #imm], #imm,
+// or a relocatable expression.
+def ThumbMemPC : AsmOperandClass {
+ let Name = "ThumbMemPC";
+}
+
+let OperandType = "OPERAND_PCREL" in {
+def t_brtarget : Operand<OtherVT> {
+ let EncoderMethod = "getThumbBRTargetOpValue";
+ let DecoderMethod = "DecodeThumbBROperand";
+}
+
+// ADR instruction labels.
+def t_adrlabel : Operand<i32> {
+ let EncoderMethod = "getThumbAdrLabelOpValue";
+ let PrintMethod = "printAdrLabelOperand<2>";
+ let ParserMatchClass = UnsignedOffset_b8s2;
+}
+
+
+def thumb_br_target : Operand<OtherVT> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbBranchTargetOpValue";
+ let OperandType = "OPERAND_PCREL"; // same value the enclosing let already applies
+}
+
+def thumb_bl_target : Operand<i32> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbBLTargetOpValue";
+ let DecoderMethod = "DecodeThumbBLTargetOperand";
+}
+
+// Target for BLX *from* Thumb mode.
+def thumb_blx_target : Operand<i32> {
+ let ParserMatchClass = ARMBranchTarget;
+ let EncoderMethod = "getThumbBLXTargetOpValue";
+ let DecoderMethod = "DecodeThumbBLXOffset";
+}
+
+def thumb_bcc_target : Operand<OtherVT> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbBCCTargetOpValue";
+ let DecoderMethod = "DecodeThumbBCCTargetOperand";
+}
+
+def thumb_cb_target : Operand<OtherVT> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbCBTargetOpValue";
+ let DecoderMethod = "DecodeThumbCmpBROperand";
+}
+
+// t_addrmode_pc := <label> => pc + imm8 * 4
+//
+def t_addrmode_pc : MemOperand {
+ let EncoderMethod = "getAddrModePCOpValue";
+ let DecoderMethod = "DecodeThumbAddrModePC";
+ let PrintMethod = "printThumbLdrLabelOperand";
+ let ParserMatchClass = ThumbMemPC;
+}
+} // OperandType = "OPERAND_PCREL"
+
+// t_addrmode_rr := reg + reg
+//
+def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; }
+def t_addrmode_rr : MemOperand,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
+ let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+ let PrintMethod = "printThumbAddrModeRROperand";
+ let DecoderMethod = "DecodeThumbAddrModeRR";
+ let ParserMatchClass = t_addrmode_rr_asm_operand;
+ let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_rrs := reg + reg
+//
+// We use separate scaled versions because the Select* functions need
+// to explicitly check for a matching constant and return false here so that
+// the reg+imm forms will match instead. This is a horrible way to do that,
+// as it forces tight coupling between the methods, but it's how selectiondag
+// currently works.
+def t_addrmode_rrs1 : MemOperand,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> {
+ let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+ let PrintMethod = "printThumbAddrModeRROperand";
+ let DecoderMethod = "DecodeThumbAddrModeRR";
+ let ParserMatchClass = t_addrmode_rr_asm_operand;
+ let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+def t_addrmode_rrs2 : MemOperand,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> {
+ let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeRR";
+ let PrintMethod = "printThumbAddrModeRROperand";
+ let ParserMatchClass = t_addrmode_rr_asm_operand;
+ let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+def t_addrmode_rrs4 : MemOperand,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> {
+ let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeRR";
+ let PrintMethod = "printThumbAddrModeRROperand";
+ let ParserMatchClass = t_addrmode_rr_asm_operand;
+ let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_is4 := reg + imm5 * 4
+//
+def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; }
+def t_addrmode_is4 : MemOperand,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> {
+ let EncoderMethod = "getAddrModeISOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeIS";
+ let PrintMethod = "printThumbAddrModeImm5S4Operand";
+ let ParserMatchClass = t_addrmode_is4_asm_operand;
+ let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_is2 := reg + imm5 * 2
+//
+def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; }
+def t_addrmode_is2 : MemOperand,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> {
+ let EncoderMethod = "getAddrModeISOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeIS";
+ let PrintMethod = "printThumbAddrModeImm5S2Operand";
+ let ParserMatchClass = t_addrmode_is2_asm_operand;
+ let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_is1 := reg + imm5
+//
+def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; }
+def t_addrmode_is1 : MemOperand,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> {
+ let EncoderMethod = "getAddrModeISOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeIS";
+ let PrintMethod = "printThumbAddrModeImm5S1Operand";
+ let ParserMatchClass = t_addrmode_is1_asm_operand;
+ let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_sp := sp + imm8 * 4
+//
+// FIXME: This really shouldn't have an explicit SP operand at all. It should
+// be implicit, just like in the instruction encoding itself.
+def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; }
+def t_addrmode_sp : MemOperand,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
+ let EncoderMethod = "getAddrModeThumbSPOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeSP";
+ let PrintMethod = "printThumbAddrModeSPOperand";
+ let ParserMatchClass = t_addrmode_sp_asm_operand;
+ let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
+// from removing one half of the matched pairs. That breaks PEI, which assumes
+// these always come in pairs and asserts if it finds otherwise. Better way?
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def tADJCALLSTACKUP :
+ PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
+ [(ARMcallseq_end imm:$amt1, imm:$amt2)]>,
+ Requires<[IsThumb, IsThumb1Only]>;
+
+def tADJCALLSTACKDOWN :
+ PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
+ [(ARMcallseq_start imm:$amt)]>,
+ Requires<[IsThumb, IsThumb1Only]>;
+} // Defs = [SP], Uses = [SP], hasSideEffects = 1
+
+class T1SystemEncoding<bits<8> opc>
+ : T1Encoding<0b101111> {
+ let Inst{9-8} = 0b11;
+ let Inst{7-0} = opc;
+}
+
+def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm",
+ [(int_arm_hint imm0_15:$imm)]>,
+ T1SystemEncoding<0x00>,
+ Requires<[IsThumb, HasV6M]> {
+ bits<4> imm;
+ let Inst{7-4} = imm; // replaces the upper nibble of the 0x00 opc byte; Inst{3-0} stays 0
+}
+
+// Note: When EmitPriority == 1, the alias will be used for printing
+class tHintAlias<string Asm, dag Result, bit EmitPriority = 0> : tInstAlias<Asm, Result, EmitPriority> {
+ let Predicates = [IsThumb, HasV6M];
+}
+
+def : tHintAlias<"nop$p", (tHINT 0, pred:$p), 1>; // A8.6.110
+def : tHintAlias<"yield$p", (tHINT 1, pred:$p), 1>; // A8.6.410
+def : tHintAlias<"wfe$p", (tHINT 2, pred:$p), 1>; // A8.6.408
+def : tHintAlias<"wfi$p", (tHINT 3, pred:$p), 1>; // A8.6.409
+def : tHintAlias<"sev$p", (tHINT 4, pred:$p), 1>; // A8.6.157
+def : tInstAlias<"sevl$p", (tHINT 5, pred:$p), 1> {
+ let Predicates = [IsThumb2, HasV8]; // not a tHintAlias: these differ from its [IsThumb, HasV6M]
+}
+
+// The imm operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
+ []>,
+ T1Encoding<0b101111> {
+ let Inst{9-8} = 0b10;
+ // A8.6.22
+ bits<8> val;
+ let Inst{7-0} = val;
+}
+// default immediate for breakpoint mnemonic
+def : InstAlias<"bkpt", (tBKPT 0), 0>, Requires<[IsThumb]>;
+
+def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
+ []>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
+ let Inst{9-6} = 0b1010;
+ bits<6> val;
+ let Inst{5-0} = val;
+}
+
+def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
+ []>, T1Encoding<0b101101>, Requires<[IsNotMClass]>, Deprecated<HasV8Ops> {
+ bits<1> end;
+ // A8.6.156
+ let Inst{9-5} = 0b10010;
+ let Inst{4} = 1;
+ let Inst{3} = end;
+ let Inst{2-0} = 0b000;
+}
+
+// Change Processor State is a system instruction -- for disassembly only.
+def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
+ NoItinerary, "cps$imod $iflags", []>,
+ T1Misc<0b0110011> {
+ // A8.6.38 & B6.1.1
+ bit imod;
+ bits<3> iflags;
+
+ let Inst{4} = imod;
+ let Inst{3} = 0;
+ let Inst{2-0} = iflags;
+ let DecoderMethod = "DecodeThumbCPS";
+}
+
+// For both thumb1 and thumb2.
+let isNotDuplicable = 1, isCodeGenOnly = 1 in
+def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
+ [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
+ T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+ // A8.6.6
+ bits<3> dst;
+ let Inst{6-3} = 0b1111; // Rm = pc
+ let Inst{2-0} = dst;
+}
+
+// ADD <Rd>, sp, #<imm8>
+// FIXME: This should not be marked as having side effects, and it should be
+// rematerializable. Clearing the side effect bit causes miscompilations,
+// probably because the instruction can be moved around.
+def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
+ IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
+ T1Encoding<{1,0,1,0,1,?}>, Sched<[WriteALU]> {
+ // A6.2 & A8.6.8
+ bits<3> dst;
+ bits<8> imm;
+ let Inst{10-8} = dst;
+ let Inst{7-0} = imm;
+ let DecoderMethod = "DecodeThumbAddSpecialReg";
+}
+
+// Thumb1 frame lowering is rather fragile, we hope to be able to use
+// tADDrSPi, but we may need to insert a sequence that clobbers CPSR.
+def tADDframe : PseudoInst<(outs tGPR:$dst), (ins i32imm:$base, i32imm:$offset),
+ NoItinerary, []>,
+ Requires<[IsThumb, IsThumb1Only]> {
+ let Defs = [CPSR];
+}
+
+// ADD sp, sp, #<imm7>
+def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+ IIC_iALUi, "add", "\t$Rdn, $imm", []>,
+ T1Misc<{0,0,0,0,0,?,?}>, Sched<[WriteALU]> {
+ // A6.2.5 & A8.6.8
+ bits<7> imm;
+ let Inst{6-0} = imm;
+ let DecoderMethod = "DecodeThumbAddSPImm";
+}
+
+// SUB sp, sp, #<imm7>
+// FIXME: The encoding and the ASM string don't match up.
+def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+ IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
+ T1Misc<{0,0,0,0,1,?,?}>, Sched<[WriteALU]> {
+ // A6.2.5 & A8.6.214
+ bits<7> imm;
+ let Inst{6-0} = imm;
+ let DecoderMethod = "DecodeThumbAddSPImm";
+}
+
+def : tInstAlias<"add${p} sp, $imm",
+ (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
+def : tInstAlias<"add${p} sp, sp, $imm",
+ (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
+
+// Can optionally specify SP as a three operand instruction.
+def : tInstAlias<"add${p} sp, sp, $imm",
+ (tADDspi SP, t_imm0_508s4:$imm, pred:$p)>;
+def : tInstAlias<"sub${p} sp, sp, $imm",
+ (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>;
+
+// ADD <Rdn>, sp, <Rn>
+def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
+ "add", "\t$Rdn, $sp, $Rn", []>,
+ T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+ // A8.6.9 Encoding T1
+ bits<4> Rdn; // the only register field encoded here; $Rn gets no bits of its own
+ let Inst{7} = Rdn{3};
+ let Inst{6-3} = 0b1101; // Rm = sp
+ let Inst{2-0} = Rdn{2-0};
+ let DecoderMethod = "DecodeThumbAddSPReg";
+}
+
+// ADD sp, <Rm>
+def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
+ "add", "\t$Rdn, $Rm", []>,
+ T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+ // A8.6.9 Encoding T2
+ bits<4> Rm;
+ let Inst{7} = 1; // Inst{7} and Inst{2-0} together hard-code Rdn = 0b1101 (sp)
+ let Inst{6-3} = Rm;
+ let Inst{2-0} = 0b101;
+ let DecoderMethod = "DecodeThumbAddSPReg";
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
+ T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
+ // A6.2.3 & A8.6.25
+ bits<4> Rm;
+ let Inst{6-3} = Rm;
+ let Inst{2-0} = 0b000;
+ let Unpredictable{2-0} = 0b111;
+ }
+ def tBXNS : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bxns${p}\t$Rm", []>,
+ Requires<[IsThumb, Has8MSecExt]>,
+ T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
+ bits<4> Rm;
+ let Inst{6-3} = Rm;
+ let Inst{2-0} = 0b100;
+ let Unpredictable{1-0} = 0b11;
+ }
+}
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+ def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
+ [(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>;
+
+ // Alternative return instruction used by vararg functions.
+ def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
+ 2, IIC_Br, [],
+ (tBX GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
+}
+
+// All calls clobber the non-callee saved registers. SP is marked as a use to
+// prevent stack-pointer assignments that appear immediately before calls from
+// potentially appearing dead.
+let isCall = 1,
+ Defs = [LR], Uses = [SP] in {
+ // Also used for Thumb2
+ def tBL : TIx2<0b11110, 0b11, 1,
+ (outs), (ins pred:$p, thumb_bl_target:$func), IIC_Br,
+ "bl${p}\t$func",
+ [(ARMcall tglobaladdr:$func)]>,
+ Requires<[IsThumb]>, Sched<[WriteBrL]> {
+ bits<24> func;
+ let Inst{26} = func{23}; // S
+ let Inst{25-16} = func{20-11}; // imm10
+ let Inst{13} = func{22}; // J1
+ let Inst{11} = func{21}; // J2
+ let Inst{10-0} = func{10-0}; // imm11
+ }
+
+ // ARMv5T and above, also used for Thumb2
+ def tBLXi : TIx2<0b11110, 0b11, 0,
+ (outs), (ins pred:$p, thumb_blx_target:$func), IIC_Br,
+ "blx${p}\t$func", []>,
+ Requires<[IsThumb, HasV5T, IsNotMClass]>, Sched<[WriteBrL]> {
+ bits<24> func;
+ let Inst{26} = func{23}; // S
+ let Inst{25-16} = func{20-11}; // imm10H
+ let Inst{13} = func{22}; // J1
+ let Inst{11} = func{21}; // J2
+ let Inst{10-1} = func{10-1}; // imm10L
+ let Inst{0} = 0; // func{0} is assumed zero
+ }
+
+ // Also used for Thumb2
+ def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
+ "blx${p}\t$func",
+ [(ARMcall GPR:$func)]>,
+ Requires<[IsThumb, HasV5T]>,
+ T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24
+ bits<4> func;
+ let Inst{6-3} = func;
+ let Inst{2-0} = 0b000;
+ }
+
+ // ARMv8-M Security Extensions
+ def tBLXNSr : TI<(outs), (ins pred:$p, GPRnopc:$func), IIC_Br,
+ "blxns${p}\t$func", []>,
+ Requires<[IsThumb, Has8MSecExt]>,
+ T1Special<{1,1,1,?}>, Sched<[WriteBrL]> {
+ bits<4> func;
+ let Inst{6-3} = func;
+ let Inst{2-0} = 0b100; // differs from tBLXr only in these low bits
+ let Unpredictable{1-0} = 0b11;
+ }
+
+ // ARMv4T
+ def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
+ 4, IIC_Br,
+ [(ARMcall_nolink tGPR:$func)]>,
+ Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
+} // isCall = 1, Defs = [LR], Uses = [SP]
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+ let isPredicable = 1 in
+ def tB : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
+ "b", "\t$target", [(br bb:$target)]>,
+ T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> {
+ bits<11> target;
+ let Inst{10-0} = target;
+ let AsmMatchConverter = "cvtThumbBranches";
+ }
+
+ // Far jump
+ // Just a pseudo for a tBL instruction. Needed to let regalloc know about
+ // the clobber of LR.
+ let Defs = [LR] in
+ def tBfar : tPseudoExpand<(outs), (ins thumb_bl_target:$target, pred:$p),
+ 4, IIC_Br, [],
+ (tBL pred:$p, thumb_bl_target:$target)>,
+ Sched<[WriteBrTbl]>;
+
+ def tBR_JTr : tPseudoInst<(outs),
+ (ins tGPR:$target, i32imm:$jt),
+ 0, IIC_Br,
+ [(ARMbrjt tGPR:$target, tjumptable:$jt)]>,
+ Sched<[WriteBrTbl]> {
+ let Size = 2;
+ list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+ }
+}
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects two operands. :(
+let isBranch = 1, isTerminator = 1 in
+ def tBcc : T1I<(outs), (ins thumb_bcc_target:$target, pred:$p), IIC_Br,
+ "b${p}\t$target",
+ [/*(ARMbrcond bb:$target, imm:$cc)*/]>,
+ T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> {
+ bits<4> p;
+ bits<8> target;
+ let Inst{11-8} = p;
+ let Inst{7-0} = target;
+ let AsmMatchConverter = "cvtThumbBranches";
+}
+
+
+// Tail calls
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+ // IOS versions.
+ let Uses = [SP] in {
+ def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst),
+ 4, IIC_Br, [],
+ (tBX GPR:$dst, (ops 14, zero_reg))>,
+ Requires<[IsThumb]>, Sched<[WriteBr]>;
+ }
+ // tTAILJMPd: MachO version uses a Thumb2 branch (no Thumb1 tail calls
+ // on MachO), so it's in ARMInstrThumb2.td.
+ // Non-MachO version:
+ let Uses = [SP] in {
+ def tTAILJMPdND : tPseudoExpand<(outs),
+ (ins t_brtarget:$dst, pred:$p),
+ 4, IIC_Br, [],
+ (tB t_brtarget:$dst, pred:$p)>,
+ Requires<[IsThumb, IsNotMachO]>, Sched<[WriteBr]>;
+ }
+}
+
+
+// A8.6.218 Supervisor Call (Software Interrupt)
+// A8.6.16 B: Encoding T1
+// If Inst{11-8} == 0b1111 then SEE SVC
+let isCall = 1, Uses = [SP] in
+def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
+ "svc", "\t$imm", []>, Encoding16, Sched<[WriteBr]> {
+ bits<8> imm;
+ let Inst{15-12} = 0b1101;
+ let Inst{11-8} = 0b1111;
+ let Inst{7-0} = imm;
+}
+
+// The assembler uses 0xDEFE for a trap instruction.
+let isBarrier = 1, isTerminator = 1 in
+def tTRAP : TI<(outs), (ins), IIC_Br,
+ "trap", [(trap)]>, Encoding16, Sched<[WriteBr]> {
+ let Inst = 0xdefe;
+}
+
+//===----------------------------------------------------------------------===//
+// Load Store Instructions.
+//
+
+// PC-relative loads need to be matched first as constant pool accesses need to
+// always be PC-relative. We do this using AddedComplexity, as the pattern is
+// simpler than the patterns of the other load instructions.
+let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in
+def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
+ "ldr", "\t$Rt, $addr",
+ [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
+ T1Encoding<{0,1,0,0,1,?}> {
+ // A6.2 & A8.6.59
+ bits<3> Rt;
+ bits<8> addr;
+ let Inst{10-8} = Rt;
+ let Inst{7-0} = addr;
+}
+
+// SP-relative loads should be matched before standard immediate-offset loads as
+// it means we avoid having to move SP to another register.
+let canFoldAsLoad = 1 in
+def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
+ "ldr", "\t$Rt, $addr",
+ [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
+ T1LdStSP<{1,?,?}> {
+ bits<3> Rt;
+ bits<8> addr;
+ let Inst{10-8} = Rt;
+ let Inst{7-0} = addr;
+}
+
+// Loads: reg/reg and reg/imm5
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+ Operand AddrMode_r, Operand AddrMode_i,
+ AddrMode am, InstrItinClass itin_r,
+ InstrItinClass itin_i, string asm,
+ PatFrag opnode> {
+ // Immediate-offset loads should be matched before register-offset loads as
+ // when the offset is a constant it's simpler to first check if it fits in the
+ // immediate offset field then fall back to register-offset if it doesn't.
+ def i : // reg/imm5
+ T1pILdStEncodeImm<imm_opc, 1 /* Load */,
+ (outs tGPR:$Rt), (ins AddrMode_i:$addr),
+ am, itin_i, asm, "\t$Rt, $addr",
+ [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>;
+ // Register-offset loads are matched last.
+ def r : // reg/reg
+ T1pILdStEncode<reg_opc,
+ (outs tGPR:$Rt), (ins AddrMode_r:$addr),
+ am, itin_r, asm, "\t$Rt, $addr",
+ [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
+}
+// Stores: reg/reg and reg/imm5
+multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+ Operand AddrMode_r, Operand AddrMode_i,
+ AddrMode am, InstrItinClass itin_r,
+ InstrItinClass itin_i, string asm,
+ PatFrag opnode> {
+ def i : // reg/imm5
+ T1pILdStEncodeImm<imm_opc, 0 /* Store */,
+ (outs), (ins tGPR:$Rt, AddrMode_i:$addr),
+ am, itin_i, asm, "\t$Rt, $addr",
+ [(opnode tGPR:$Rt, AddrMode_i:$addr)]>;
+ def r : // reg/reg
+ T1pILdStEncode<reg_opc,
+ (outs), (ins tGPR:$Rt, AddrMode_r:$addr),
+ am, itin_r, asm, "\t$Rt, $addr",
+ [(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
+}
+
+// A8.6.57 & A8.6.60
+defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr,
+ t_addrmode_is4, AddrModeT1_4,
+ IIC_iLoad_r, IIC_iLoad_i, "ldr",
+ load>;
+
+// A8.6.64 & A8.6.61
+defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr,
+ t_addrmode_is1, AddrModeT1_1,
+ IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
+ zextloadi8>;
+
+// A8.6.76 & A8.6.73
+defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr,
+ t_addrmode_is2, AddrModeT1_2,
+ IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
+ zextloadi16>;
+
+let AddedComplexity = 10 in
+def tLDRSB : // A8.6.80
+ T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+ AddrModeT1_1, IIC_iLoad_bh_r,
+ "ldrsb", "\t$Rt, $addr",
+ [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>;
+
+let AddedComplexity = 10 in
+def tLDRSH : // A8.6.84
+ T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+ AddrModeT1_2, IIC_iLoad_bh_r,
+ "ldrsh", "\t$Rt, $addr",
+ [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>;
+
+
+def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
+ "str", "\t$Rt, $addr",
+ [(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
+ T1LdStSP<{0,?,?}> {
+ bits<3> Rt;
+ bits<8> addr;
+ let Inst{10-8} = Rt;
+ let Inst{7-0} = addr;
+}
+
+// A8.6.194 & A8.6.192
+defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr,
+ t_addrmode_is4, AddrModeT1_4,
+ IIC_iStore_r, IIC_iStore_i, "str",
+ store>;
+
+// A8.6.197 & A8.6.195
+defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr,
+ t_addrmode_is1, AddrModeT1_1,
+ IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
+ truncstorei8>;
+
+// A8.6.207 & A8.6.205
+defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
+ t_addrmode_is2, AddrModeT1_2,
+ IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
+ truncstorei16>;
+
+
+//===----------------------------------------------------------------------===//
+// Load / store multiple Instructions.
+//
+
+// The base register must either be written back or be one of the loaded regs.
+let hasSideEffects = 0 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
+ bits<3> Rn;
+ bits<8> regs;
+ let Inst{10-8} = Rn;
+ let Inst{7-0} = regs;
+}
+
+// Writeback version is just a pseudo, as there's no encoding difference.
+// Writeback happens iff the base register is not in the destination register
+// list.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDMIA_UPD :
+ InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+ "$Rn = $wb", IIC_iLoad_mu>,
+ PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> {
+ let Size = 2;
+ let OutOperandList = (outs GPR:$wb);
+ let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops);
+ let Pattern = [];
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+ list<Predicate> Predicates = [IsThumb];
+}
+
+// There is no non-writeback version of STM for Thumb.
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
+ (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ AddrModeNone, 2, IIC_iStore_mu,
+ "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>,
+ T1Encoding<{1,1,0,0,0,?}> {
+ bits<3> Rn;
+ bits<8> regs;
+ let Inst{10-8} = Rn;
+ let Inst{7-0} = regs;
+}
+
+} // hasSideEffects
+
+def : InstAlias<"ldm${p} $Rn!, $regs",
+ (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>,
+ Requires<[IsThumb, IsThumb1Only]>;
+
+let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
+def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+ IIC_iPop,
+ "pop${p}\t$regs", []>,
+ T1Misc<{1,1,0,?,?,?,?}> {
+ bits<16> regs;
+ let Inst{8} = regs{15}; // only list bit 15 (r15/pc, assuming bit i = ri) is encoded beyond r0-r7
+ let Inst{7-0} = regs{7-0};
+}
+
+let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
+def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+ IIC_iStore_m,
+ "push${p}\t$regs", []>,
+ T1Misc<{0,1,0,?,?,?,?}> {
+ bits<16> regs;
+ let Inst{8} = regs{14}; // only list bit 14 (r14/lr, assuming bit i = ri) is encoded beyond r0-r7
+ let Inst{7-0} = regs{7-0};
+}
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions.
+//
+
+// Helper classes for encoding T1pI patterns:
+class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T1pI<oops, iops, itin, opc, asm, pattern>,
+ T1DataProcessing<opA> {
+ bits<3> Rm;
+ bits<3> Rn;
+ let Inst{5-3} = Rm;
+ let Inst{2-0} = Rn;
+}
+class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T1pI<oops, iops, itin, opc, asm, pattern>,
+ T1Misc<opA> {
+ bits<3> Rm;
+ bits<3> Rd;
+ let Inst{5-3} = Rm;
+ let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sI patterns:
+class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T1sI<oops, iops, itin, opc, asm, pattern>,
+ T1DataProcessing<opA> {
+ bits<3> Rd;
+ bits<3> Rn;
+ let Inst{5-3} = Rn;
+ let Inst{2-0} = Rd;
+}
+class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T1sI<oops, iops, itin, opc, asm, pattern>,
+ T1General<opA> {
+ bits<3> Rm;
+ bits<3> Rn;
+ bits<3> Rd;
+ let Inst{8-6} = Rm;
+ let Inst{5-3} = Rn;
+ let Inst{2-0} = Rd;
+}
+class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T1sI<oops, iops, itin, opc, asm, pattern>,
+ T1General<opA> { // immediate bits are placed by each def (e.g. Inst{8-6} or Inst{10-6})
+ bits<3> Rd;
+ bits<3> Rm;
+ let Inst{5-3} = Rm;
+ let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sIt patterns:
+class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T1sIt<oops, iops, itin, opc, asm, pattern>,
+ T1DataProcessing<opA> {
+ bits<3> Rdn;
+ bits<3> Rm;
+ let Inst{5-3} = Rm;
+ let Inst{2-0} = Rdn;
+}
+class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T1sIt<oops, iops, itin, opc, asm, pattern>,
+ T1General<opA> {
+ bits<3> Rdn;
+ bits<8> imm8;
+ let Inst{10-8} = Rdn;
+ let Inst{7-0} = imm8;
+}
+
+let isAdd = 1 in {
+ // Add with carry register
+ let isCommutable = 1, Uses = [CPSR] in
+ def tADC : // A8.6.2
+ T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
+ "adc", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+ // Add immediate
+ def tADDi3 : // A8.6.4 T1
+ T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+ IIC_iALUi,
+ "add", "\t$Rd, $Rm, $imm3",
+ [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]>,
+ Sched<[WriteALU]> {
+ bits<3> imm3;
+ let Inst{8-6} = imm3;
+ }
+
+ def tADDi8 : // A8.6.4 T2
+ T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
+ (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
+ "add", "\t$Rdn, $imm8",
+ [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>, // pattern takes 8..255 only; the operand itself is imm0_255
+ Sched<[WriteALU]>;
+
+ // Add register
+ let isCommutable = 1 in
+ def tADDrr : // A8.6.6 T1
+ T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iALUr,
+ "add", "\t$Rd, $Rn, $Rm",
+ [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+ let hasSideEffects = 0 in
+ def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
+ "add", "\t$Rdn, $Rm", []>,
+ T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+ // A8.6.6 T2
+ bits<4> Rdn;
+ bits<4> Rm;
+ let Inst{7} = Rdn{3}; // the 4-bit Rdn is split across Inst{7} and Inst{2-0}
+ let Inst{6-3} = Rm;
+ let Inst{2-0} = Rdn{2-0};
+ }
+} // isAdd = 1
+
+// AND register
+let isCommutable = 1 in
+def tAND : // A8.6.12
+ T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iBITr,
+ "and", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// ASR immediate
+def tASRri : // A8.6.14
+ T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
+ IIC_iMOVsi,
+ "asr", "\t$Rd, $Rm, $imm5",
+ [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+ Sched<[WriteALU]> {
+ bits<5> imm5;
+ let Inst{10-6} = imm5;
+}
+
+// ASR register
+def tASRrr : // A8.6.15
+ T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iMOVsr,
+ "asr", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// BIC register
+def tBIC : // A8.6.20
+ T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iBITr,
+ "bic", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>,
+ Sched<[WriteALU]>;
+
+// CMN register
+let isCompare = 1, Defs = [CPSR] in {
+//FIXME: Disable CMN, as CCodes are backwards from compare expectations
+// Compare-to-zero still works out, just not the relationals
+//def tCMN : // A8.6.33
+// T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs),
+// IIC_iCMPr,
+// "cmn", "\t$lhs, $rhs",
+// [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
+
+def tCMNz : // A8.6.33
+ T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iCMPr,
+ "cmn", "\t$Rn, $Rm",
+ [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>, Sched<[WriteCMP]>;
+
+} // isCompare = 1, Defs = [CPSR]
+
+// CMP immediate
+let isCompare = 1, Defs = [CPSR] in {
+def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
+ "cmp", "\t$Rn, $imm8",
+ [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
+ T1General<{1,0,1,?,?}>, Sched<[WriteCMP]> {
+ // A8.6.35
+ bits<3> Rn;
+ bits<8> imm8;
+ let Inst{10-8} = Rn;
+ let Inst{7-0} = imm8;
+}
+
+// CMP register
+def tCMPr : // A8.6.36 T1
+ T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iCMPr,
+ "cmp", "\t$Rn, $Rm",
+ [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>, Sched<[WriteCMP]>;
+
+def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr,
+ "cmp", "\t$Rn, $Rm", []>,
+ T1Special<{0,1,?,?}>, Sched<[WriteCMP]> {
+ // A8.6.36 T2
+ bits<4> Rm;
+ bits<4> Rn;
+ let Inst{7} = Rn{3};
+ let Inst{6-3} = Rm;
+ let Inst{2-0} = Rn{2-0};
+}
+} // isCompare = 1, Defs = [CPSR]
+
+
+// XOR register
+let isCommutable = 1 in
+def tEOR : // A8.6.45
+ T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iBITr,
+ "eor", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// LSL immediate
+def tLSLri : // A8.6.88
+ T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
+ IIC_iMOVsi,
+ "lsl", "\t$Rd, $Rm, $imm5",
+ [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]>,
+ Sched<[WriteALU]> {
+ bits<5> imm5;
+ let Inst{10-6} = imm5;
+}
+
+// LSL register
+def tLSLrr : // A8.6.89
+ T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iMOVsr,
+ "lsl", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// LSR immediate
+def tLSRri : // A8.6.90
+ T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
+ IIC_iMOVsi,
+ "lsr", "\t$Rd, $Rm, $imm5",
+ [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+ Sched<[WriteALU]> {
+ bits<5> imm5;
+ let Inst{10-6} = imm5;
+}
+
+// LSR register
+def tLSRrr : // A8.6.91
+ T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iMOVsr,
+ "lsr", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// Move immediate (the register-to-register forms follow below)
+let isMoveImm = 1 in
+def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
+ "mov", "\t$Rd, $imm8",
+ [(set tGPR:$Rd, imm0_255:$imm8)]>,
+ T1General<{1,0,0,?,?}>, Sched<[WriteALU]> {
+ // A8.6.96
+ bits<3> Rd;
+ bits<8> imm8;
+ let Inst{10-8} = Rd;
+ let Inst{7-0} = imm8;
+}
+// Because we have an explicit tMOVSr below, we need an alias to handle
+// the immediate "movs" form here. Blech.
+def : tInstAlias <"movs $Rdn, $imm",
+ (tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>;
+
+// A7-73: MOV(2) - mov setting flag.
+
+let hasSideEffects = 0 in {
+def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
+ 2, IIC_iMOVr,
+ "mov", "\t$Rd, $Rm", "", []>,
+ T1Special<{1,0,?,?}>, Sched<[WriteALU]> {
+ // A8.6.97
+ bits<4> Rd;
+ bits<4> Rm;
+ let Inst{7} = Rd{3};
+ let Inst{6-3} = Rm;
+ let Inst{2-0} = Rd{2-0};
+}
+let Defs = [CPSR] in
+def tMOVSr : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
+ "movs\t$Rd, $Rm", []>, Encoding16, Sched<[WriteALU]> {
+ // A8.6.97
+ bits<3> Rd;
+ bits<3> Rm;
+ let Inst{15-6} = 0b0000000000;
+ let Inst{5-3} = Rm;
+ let Inst{2-0} = Rd;
+}
+} // hasSideEffects
+
+// Multiply register
+let isCommutable = 1 in
+def tMUL : // A8.6.105 T1
+ Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2,
+ IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd",
+ [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>,
+ T1DataProcessing<0b1101> {
+ bits<3> Rd;
+ bits<3> Rn;
+ let Inst{5-3} = Rn;
+ let Inst{2-0} = Rd;
+ let AsmMatchConverter = "cvtThumbMultiply";
+}
+
+def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
+ pred:$p)>;
+
+// Move inverse register
+def tMVN : // A8.6.107
+ T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr,
+ "mvn", "\t$Rd, $Rn",
+ [(set tGPR:$Rd, (not tGPR:$Rn))]>, Sched<[WriteALU]>;
+
+// Bitwise or register
+let isCommutable = 1 in
+def tORR : // A8.6.114
+ T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iBITr,
+ "orr", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// Swaps
+def tREV : // A8.6.134
+ T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+ IIC_iUNAr,
+ "rev", "\t$Rd, $Rm",
+ [(set tGPR:$Rd, (bswap tGPR:$Rm))]>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+def tREV16 : // A8.6.135
+ T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+ IIC_iUNAr,
+ "rev16", "\t$Rd, $Rm",
+ [(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+def tREVSH : // A8.6.136
+ T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+ IIC_iUNAr,
+ "revsh", "\t$Rd, $Rm",
+ [(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+// Rotate right register
+def tROR : // A8.6.139
+ T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iMOVsr,
+ "ror", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>,
+ Sched<[WriteALU]>;
+
+// Negate register
+def tRSB : // A8.6.141
+ T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn),
+ IIC_iALUi,
+ "rsb", "\t$Rd, $Rn, #0",
+ [(set tGPR:$Rd, (ineg tGPR:$Rn))]>, Sched<[WriteALU]>;
+
+// Subtract with carry register
+let Uses = [CPSR] in
+def tSBC : // A8.6.151
+ T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iALUr,
+ "sbc", "\t$Rdn, $Rm",
+ [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>,
+ Sched<[WriteALU]>;
+
+// Subtract immediate
+def tSUBi3 : // A8.6.210 T1
+ T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+ IIC_iALUi,
+ "sub", "\t$Rd, $Rm, $imm3",
+ [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]>,
+ Sched<[WriteALU]> {
+ bits<3> imm3;
+ let Inst{8-6} = imm3;
+}
+
+def tSUBi8 : // A8.6.210 T2
+ T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
+ (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
+ "sub", "\t$Rdn, $imm8",
+ [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
+ Sched<[WriteALU]>;
+
+// Subtract register
+def tSUBrr : // A8.6.212
+ T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+ IIC_iALUr,
+ "sub", "\t$Rd, $Rn, $Rm",
+ [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
+ Sched<[WriteALU]>;
+
+// Sign-extend byte
+def tSXTB : // A8.6.222
+ T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+ IIC_iUNAr,
+ "sxtb", "\t$Rd, $Rm",
+ [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>,
+ Sched<[WriteALU]>;
+
+// Sign-extend short
+def tSXTH : // A8.6.224
+ T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+ IIC_iUNAr,
+ "sxth", "\t$Rd, $Rm",
+ [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>,
+ Sched<[WriteALU]>;
+
+// Test
+let isCompare = 1, isCommutable = 1, Defs = [CPSR] in
+def tTST : // A8.6.230
+ T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr,
+ "tst", "\t$Rn, $Rm",
+ [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>,
+ Sched<[WriteALU]>;
+
+// A8.8.247 UDF - Undefined (Encoding T1)
+def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8",
+ [(int_arm_undefined imm0_255:$imm8)]>, Encoding16 {
+ bits<8> imm8;
+ let Inst{15-12} = 0b1101;
+ let Inst{11-8} = 0b1110;
+ let Inst{7-0} = imm8;
+}
+
+def t__brkdiv0 : TI<(outs), (ins), IIC_Br, "__brkdiv0",
+ [(int_arm_undefined 249)]>, Encoding16,
+ Requires<[IsThumb, IsWindows]> {
+ let Inst = 0xdef9;
+ let isTerminator = 1;
+}
+
+// Zero-extend byte
+def tUXTB : // A8.6.262
+ T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+ IIC_iUNAr,
+ "uxtb", "\t$Rd, $Rm",
+ [(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>,
+ Sched<[WriteALU]>;
+
+// Zero-extend short
+def tUXTH : // A8.6.264
+ T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+ IIC_iUNAr,
+ "uxth", "\t$Rd, $Rm",
+ [(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation.
+// Expanded after instruction selection into a branch sequence.
+let usesCustomInserter = 1 in // Expanded after instruction selection.
+ def tMOVCCr_pseudo :
+ PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, cmovpred:$p),
+ NoItinerary,
+ [(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, cmovpred:$p))]>;
+
+// tLEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+
+def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
+ IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
+ T1Encoding<{1,0,1,0,0,?}>, Sched<[WriteALU]> {
+ bits<3> Rd;
+ bits<8> addr;
+ let Inst{10-8} = Rd;
+ let Inst{7-0} = addr;
+ let DecoderMethod = "DecodeThumbAddSpecialReg";
+}
+
+let hasSideEffects = 0, isReMaterializable = 1 in
+def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
+ 2, IIC_iALUi, []>, Sched<[WriteALU]>;
+
+let hasSideEffects = 1 in
+def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
+ (ins i32imm:$label, pred:$p),
+ 2, IIC_iALUi, []>, Sched<[WriteALU]>;
+
+// Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
+// and make use of the same compressed jump table format as Thumb-2.
+let Size = 2 in {
+def tTBB_JT : tPseudoInst<(outs),
+ (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+ Sched<[WriteBr]>;
+
+def tTBH_JT : tPseudoInst<(outs),
+ (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+ Sched<[WriteBr]>;
+}
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right,
+// complete with fixup for the aeabi_read_tp function.
+let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in
+def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
+ [(set R0, ARMthread_pointer)]>,
+ Sched<[WriteBr]>;
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//
+
+// eh_sjlj_setjmp() is an instruction sequence to store the return address and
+// save #0 in R0 for the non-longjmp case. Since by its nature we may be coming
+// from some other function to get here, and we're using the stack frame for the
+// containing function to save/restore registers, we can't keep anything live in
+// regs across the eh_sjlj_setjmp(), else it will almost certainly have been
+// tromped upon when we get here from a longjmp(). We force everything out of
+// registers except for our own input by listing the relevant registers in
+// Defs. By doing so, we also cause the prologue/epilogue code to actively
+// preserve all of the callee-saved registers, which is exactly what we want.
+// $val is a scratch register for our use.
+let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ],
+ hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in
+def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
+ AddrModeNone, 0, NoItinerary, "","",
+ [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+ Defs = [ R7, LR, SP ] in
+def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
+ AddrModeNone, 0, IndexModeNone,
+ Pseudo, NoItinerary, "", "",
+ [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+ Requires<[IsThumb,IsNotWindows]>;
+
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+ Defs = [ R11, LR, SP ] in
+def tInt_WIN_eh_sjlj_longjmp
+ : XI<(outs), (ins GPR:$src, GPR:$scratch), AddrModeNone, 0, IndexModeNone,
+ Pseudo, NoItinerary, "", "", [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+ Requires<[IsThumb,IsWindows]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// Comparisons
+def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
+ (tCMPi8 tGPR:$Rn, imm0_255:$imm8)>;
+def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
+ (tCMPr tGPR:$Rn, tGPR:$Rm)>;
+
+// Add with carry
+def : T1Pat<(addc tGPR:$lhs, imm0_7:$rhs),
+ (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
+def : T1Pat<(addc tGPR:$lhs, imm8_255:$rhs),
+ (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
+def : T1Pat<(addc tGPR:$lhs, tGPR:$rhs),
+ (tADDrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Subtract with carry
+def : T1Pat<(addc tGPR:$lhs, imm0_7_neg:$rhs),
+ (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
+def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs),
+ (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
+def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs),
+ (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Bswap 16 with load/store
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
+ (tREV16 (tLDRHi t_addrmode_is2:$addr))>;
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)),
+ (tREV16 (tLDRHr t_addrmode_rr:$addr))>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+ t_addrmode_is2:$addr),
+ (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+ t_addrmode_rr:$addr),
+ (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rr:$addr)>;
+
+// ConstantPool
+def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
+
+// GlobalAddress
+def tLDRLIT_ga_pcrel : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr),
+ IIC_iLoadiALU,
+ [(set tGPR:$dst,
+ (ARMWrapperPIC tglobaladdr:$addr))]>,
+ Requires<[IsThumb, DontUseMovt]>;
+
+def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src),
+ IIC_iLoad_i,
+ [(set tGPR:$dst,
+ (ARMWrapper tglobaladdr:$src))]>,
+ Requires<[IsThumb, DontUseMovt]>;
+
+// TLS globals
+def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr),
+ (tLDRLIT_ga_pcrel tglobaltlsaddr:$addr)>,
+ Requires<[IsThumb, DontUseMovt]>;
+def : Pat<(ARMWrapper tglobaltlsaddr:$addr),
+ (tLDRLIT_ga_abs tglobaltlsaddr:$addr)>,
+ Requires<[IsThumb, DontUseMovt]>;
+
+
+// JumpTable
+def : T1Pat<(ARMWrapperJT tjumptable:$dst),
+ (tLEApcrelJT tjumptable:$dst)>;
+
+// Direct calls
+def : T1Pat<(ARMcall texternalsym:$func), (tBL texternalsym:$func)>,
+ Requires<[IsThumb]>;
+
+// zextload i1 -> zextload i8
+def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
+ (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(zextloadi1 t_addrmode_rr:$addr),
+ (tLDRBr t_addrmode_rr:$addr)>;
+
+// extload from the stack -> word load from the stack, as it avoids having to
+// materialize the base in a separate register. This only works when a word
+// load puts the byte/halfword value in the same place in the register that the
+// byte/halfword load would, i.e. when little-endian.
+def : T1Pat<(extloadi1 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+ Requires<[IsThumb, IsThumb1Only, IsLE]>;
+def : T1Pat<(extloadi8 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+ Requires<[IsThumb, IsThumb1Only, IsLE]>;
+def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+ Requires<[IsThumb, IsThumb1Only, IsLE]>;
+
+// extload -> zextload
+def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi1 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
+def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>;
+
+// post-inc loads and stores
+
+// post-inc LDR -> LDM r0!, {r1}. The way operands are laid out in LDMs is
+// different from how ISel expects them for a post-inc load, so use a pseudo
+// and expand it just after ISel.
+let usesCustomInserter = 1,
+ Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
+ def tLDR_postidx: tPseudoInst<(outs rGPR:$Rt, rGPR:$Rn_wb),
+ (ins rGPR:$Rn, pred:$p),
+ 4, IIC_iStore_ru,
+ []>;
+
+// post-inc STR -> STM r0!, {r1}. The layout of this (because it doesn't def
+// multiple registers) is the same in ISel as MachineInstr, so there's no need
+// for a pseudo.
+def : T1Pat<(post_store rGPR:$Rt, rGPR:$Rn, 4),
+ (tSTMIA_UPD rGPR:$Rn, rGPR:$Rt)>;
+
+// If it's impossible to use [r,r] address mode for sextload, select to
+// ldr{b|h} + sxt{b|h} instead.
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+ (tSXTB (tLDRBi t_addrmode_is1:$addr))>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
+ (tSXTB (tLDRBr t_addrmode_rr:$addr))>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+ (tSXTH (tLDRHi t_addrmode_is2:$addr))>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
+ (tSXTH (tLDRHr t_addrmode_rr:$addr))>,
+ Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+ (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
+ (tASRri (tLSLri (tLDRBr t_addrmode_rr:$addr), 24), 24)>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+ (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
+def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
+ (tASRri (tLSLri (tLDRHr t_addrmode_rr:$addr), 16), 16)>;
+
+def : T1Pat<(atomic_load_8 t_addrmode_is1:$src),
+ (tLDRBi t_addrmode_is1:$src)>;
+def : T1Pat<(atomic_load_8 t_addrmode_rr:$src),
+ (tLDRBr t_addrmode_rr:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_is2:$src),
+ (tLDRHi t_addrmode_is2:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_rr:$src),
+ (tLDRHr t_addrmode_rr:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_is4:$src),
+ (tLDRi t_addrmode_is4:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_rr:$src),
+ (tLDRr t_addrmode_rr:$src)>;
+def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val),
+ (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>;
+def : T1Pat<(atomic_store_8 t_addrmode_rr:$ptr, tGPR:$val),
+ (tSTRBr tGPR:$val, t_addrmode_rr:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val),
+ (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_rr:$ptr, tGPR:$val),
+ (tSTRHr tGPR:$val, t_addrmode_rr:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val),
+ (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_rr:$ptr, tGPR:$val),
+ (tSTRr tGPR:$val, t_addrmode_rr:$ptr)>;
+
+// Large immediate handling.
+
+// Two piece imms.
+def : T1Pat<(i32 thumb_immshifted:$src),
+ (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
+ (thumb_immshifted_shamt imm:$src))>;
+
+def : T1Pat<(i32 imm0_255_comp:$src),
+ (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+
+def : T1Pat<(i32 imm256_510:$src),
+ (tADDi8 (tMOVi8 255),
+ (thumb_imm256_510_addend imm:$src))>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let isReMaterializable = 1 in
+def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+ NoItinerary,
+ [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+ imm:$cp))]>,
+ Requires<[IsThumb, IsThumb1Only]>;
+
+// Pseudo-instruction for merged POP and return.
+// FIXME: remove when we have a way to mark an MI with these properties.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+ hasExtraDefRegAllocReq = 1 in
+def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+ 2, IIC_iPop_Br, [],
+ (tPOP pred:$p, reglist:$regs)>, Sched<[WriteBrL]>;
+
+// Indirect branch using "mov pc, $Rm"
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p),
+ 2, IIC_Br, [(brind GPR:$Rm)],
+ (tMOVr PC, GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
+}
+
+
+// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
+// encoding is available on ARMv6K, but we don't differentiate that finely.
+def : InstAlias<"nop", (tMOVr R8, R8, 14, 0), 0>, Requires<[IsThumb, IsThumb1Only]>;
+
+
+// For round-trip assembly/disassembly, we have to handle a CPS instruction
+// without any iflags. That's not, strictly speaking, valid syntax, but it's
+// a useful extension and assembles to defined behaviour (the insn does
+// nothing).
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+
+// "neg" is an alias for "rsb rd, rn, #0"
+def : tInstAlias<"neg${s}${p} $Rd, $Rm",
+ (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>;
+
+
+// Implied destination operand forms for shifts.
+def : tInstAlias<"lsl${s}${p} $Rdm, $imm",
+ (tLSLri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm0_31:$imm, pred:$p)>;
+def : tInstAlias<"lsr${s}${p} $Rdm, $imm",
+ (tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
+def : tInstAlias<"asr${s}${p} $Rdm, $imm",
+ (tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
+
+// Pseudo instruction ldr Rt, =immediate
+def tLDRConstPool
+ : tAsmPseudo<"ldr${p} $Rt, $immediate",
+ (ins tGPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
new file mode 100644
index 000000000000..603d66403e65
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -0,0 +1,4673 @@
+//===-- ARMInstrThumb2.td - Thumb2 support for ARM ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb2 instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// IT block predicate field
+def it_pred_asmoperand : AsmOperandClass {
+ let Name = "ITCondCode";
+ let ParserMethod = "parseITCondCode";
+}
+def it_pred : Operand<i32> {
+ let PrintMethod = "printMandatoryPredicateOperand";
+ let ParserMatchClass = it_pred_asmoperand;
+}
+
+// IT block condition mask
+def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; }
+def it_mask : Operand<i32> {
+ let PrintMethod = "printThumbITMask";
+ let ParserMatchClass = it_mask_asmoperand;
+}
+
+// t2_shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+// {5} 0 ==> lsl
+// 1 asr
+// {4-0} imm5 shift amount.
+// asr #32 not allowed
+def t2_shift_imm : Operand<i32> {
+ let PrintMethod = "printShiftImmOperand";
+ let ParserMatchClass = ShifterImmAsmOperand;
+ let DecoderMethod = "DecodeT2ShifterImmOperand";
+}
+
+// Shifted operands. No register controlled shifts for Thumb2.
+// Note: We do not support rrx shifted operands yet.
+def t2_so_reg : Operand<i32>, // reg imm
+ ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
+ [shl,srl,sra,rotr]> {
+ let EncoderMethod = "getT2SORegOpValue";
+ let PrintMethod = "printT2SOOperand";
+ let DecoderMethod = "DecodeSORegImmOperand";
+ let ParserMatchClass = ShiftedImmAsmOperand;
+ let MIOperandInfo = (ops rGPR, i32imm);
+}
+
+// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value
+def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N),
+ MVT::i32);
+}]>;
+
+// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value
+def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(-((int)N->getZExtValue()), SDLoc(N),
+ MVT::i32);
+}]>;
+
+// t2_so_imm_notSext16_XFORM - Return the complement of the immediate after
+// sign-extending its low 16 bits to 32 bits; used by the t2_so_imm_notSext
+// def below.
+def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{
+ APInt apIntN = N->getAPIntValue();
+ unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+ return CurDAG->getTargetConstant(~N16bitSignExt, SDLoc(N), MVT::i32);
+}]>;
+
+// t2_so_imm - Match a 32-bit immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
+// immediate splatted into multiple bytes of the word.
+def t2_so_imm_asmoperand : ImmAsmOperand { let Name = "T2SOImm"; }
+def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
+ return ARM_AM::getT2SOImmVal(Imm) != -1;
+ }]> {
+ let ParserMatchClass = t2_so_imm_asmoperand;
+ let EncoderMethod = "getT2SOImmOpValue";
+ let DecoderMethod = "DecodeT2SOImm";
+}
+
+// t2_so_imm_not - Match an immediate that is a complement
+// of a t2_so_imm.
+// Note: this pattern doesn't require an encoder method and such, as it's
+// only used on aliases (Pat<> and InstAlias<>). The actual encoding
+// is handled by the destination instructions, which use t2_so_imm.
+def t2_so_imm_not_asmoperand : AsmOperandClass { let Name = "T2SOImmNot"; }
+def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{
+ return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1;
+}], t2_so_imm_not_XFORM> {
+ let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
+// t2_so_imm_notSext - match an immediate that is a complement of a t2_so_imm
+// if the upper 16 bits are zero.
+def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{
+ APInt apIntN = N->getAPIntValue();
+ if (!apIntN.isIntN(16)) return false;
+ unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+ return ARM_AM::getT2SOImmVal(~N16bitSignExt) != -1;
+ }], t2_so_imm_notSext16_XFORM> {
+ let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
+// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
+def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; }
+def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+ int64_t Value = -(int)N->getZExtValue();
+ return Value && ARM_AM::getT2SOImmVal(Value) != -1;
+}], t2_so_imm_neg_XFORM> {
+ let ParserMatchClass = t2_so_imm_neg_asmoperand;
+}
+
+/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095].
+def imm0_4095_asmoperand: ImmAsmOperand { let Name = "Imm0_4095"; }
+def imm0_4095 : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 0 && Imm < 4096;
+}]> {
+ let ParserMatchClass = imm0_4095_asmoperand;
+}
+
+def imm0_4095_neg_asmoperand: AsmOperandClass { let Name = "Imm0_4095Neg"; }
+def imm0_4095_neg : Operand<i32>, PatLeaf<(i32 imm), [{
+ return (uint32_t)(-N->getZExtValue()) < 4096;
+}], imm_neg_XFORM> {
+ let ParserMatchClass = imm0_4095_neg_asmoperand;
+}
+
+def imm1_255_neg : PatLeaf<(i32 imm), [{
+ uint32_t Val = -N->getZExtValue();
+ return (Val > 0 && Val < 255);
+}], imm_neg_XFORM>;
+
+def imm0_255_not : PatLeaf<(i32 imm), [{
+ return (uint32_t)(~N->getZExtValue()) < 255;
+}], imm_comp_XFORM>;
+
+def lo5AllOne : PatLeaf<(i32 imm), [{
+ // Returns true if all low 5-bits are 1.
+ return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL;
+}]>;
+
+// Define Thumb2 specific addressing modes.
+
+// t2addrmode_imm12 := reg + imm12
+def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";}
+def t2addrmode_imm12 : MemOperand,
+ ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> {
+ let PrintMethod = "printAddrModeImm12Operand<false>";
+ let EncoderMethod = "getAddrModeImm12OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm12";
+ let ParserMatchClass = t2addrmode_imm12_asmoperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2ldrlabel := imm12
+def t2ldrlabel : Operand<i32> {
+ let EncoderMethod = "getAddrModeImm12OpValue";
+ let PrintMethod = "printThumbLdrLabelOperand";
+}
+
+def t2ldr_pcrel_imm12_asmoperand : AsmOperandClass {let Name = "MemPCRelImm12";}
+def t2ldr_pcrel_imm12 : Operand<i32> {
+ let ParserMatchClass = t2ldr_pcrel_imm12_asmoperand;
+ // used for assembler pseudo instruction and maps to t2ldrlabel, so
+ // doesn't need encoder or print methods of its own.
+}
+
+// ADR instruction labels.
+def t2adrlabel : Operand<i32> {
+ let EncoderMethod = "getT2AdrLabelOpValue";
+ let PrintMethod = "printAdrLabelOperand<0>";
+}
+
+// t2addrmode_posimm8 := reg + imm8
+def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
+def t2addrmode_posimm8 : MemOperand {
+ let PrintMethod = "printT2AddrModeImm8Operand<false>";
+ let EncoderMethod = "getT2AddrModeImm8OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm8";
+ let ParserMatchClass = MemPosImm8OffsetAsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_negimm8 := reg - imm8
+def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
+def t2addrmode_negimm8 : MemOperand,
+ ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+ let PrintMethod = "printT2AddrModeImm8Operand<false>";
+ let EncoderMethod = "getT2AddrModeImm8OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm8";
+ let ParserMatchClass = MemNegImm8OffsetAsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_imm8 := reg +/- imm8
+def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
+class T2AddrMode_Imm8 : MemOperand,
+ ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+ let EncoderMethod = "getT2AddrModeImm8OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm8";
+ let ParserMatchClass = MemImm8OffsetAsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def t2addrmode_imm8 : T2AddrMode_Imm8 {
+ let PrintMethod = "printT2AddrModeImm8Operand<false>";
+}
+
+def t2addrmode_imm8_pre : T2AddrMode_Imm8 {
+ let PrintMethod = "printT2AddrModeImm8Operand<true>";
+}
+
+def t2am_imm8_offset : MemOperand,
+ ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset",
+ [], [SDNPWantRoot]> {
+ let PrintMethod = "printT2AddrModeImm8OffsetOperand";
+ let EncoderMethod = "getT2AddrModeImm8OffsetOpValue";
+ let DecoderMethod = "DecodeT2Imm8";
+}
+
+// t2addrmode_imm8s4 := reg +/- (imm8 << 2)
+def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
+class T2AddrMode_Imm8s4 : MemOperand {
+ let EncoderMethod = "getT2AddrModeImm8s4OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm8s4";
+ let ParserMatchClass = MemImm8s4OffsetAsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def t2addrmode_imm8s4 : T2AddrMode_Imm8s4 {
+ let PrintMethod = "printT2AddrModeImm8s4Operand<false>";
+}
+
+def t2addrmode_imm8s4_pre : T2AddrMode_Imm8s4 {
+ let PrintMethod = "printT2AddrModeImm8s4Operand<true>";
+}
+
+def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; }
+def t2am_imm8s4_offset : MemOperand {
+ let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
+ let EncoderMethod = "getT2Imm8s4OpValue";
+ let DecoderMethod = "DecodeT2Imm8S4";
+}
+
+// t2addrmode_imm0_1020s4 := reg + (imm8 << 2)
+def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass {
+ let Name = "MemImm0_1020s4Offset";
+}
+def t2addrmode_imm0_1020s4 : MemOperand,
+ ComplexPattern<i32, 2, "SelectT2AddrModeExclusive"> {
+ let PrintMethod = "printT2AddrModeImm0_1020s4Operand";
+ let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm0_1020s4";
+ let ParserMatchClass = MemImm0_1020s4OffsetAsmOperand;
+ let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_so_reg := reg + (reg << imm2)
+def t2addrmode_so_reg_asmoperand : AsmOperandClass {let Name="T2MemRegOffset";} // asm-parser class for the operand below
+def t2addrmode_so_reg : MemOperand,
+ ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> { // selection is delegated to SelectT2AddrModeSoReg
+ let PrintMethod = "printT2AddrModeSoRegOperand";
+ let EncoderMethod = "getT2AddrModeSORegOpValue";
+ let DecoderMethod = "DecodeT2AddrModeSOReg";
+ let ParserMatchClass = t2addrmode_so_reg_asmoperand;
+ let MIOperandInfo = (ops GPRnopc:$base, rGPR:$offsreg, i32imm:$offsimm); // three sub-operands: base, offset register, shift immediate
+}
+
+// Addresses for the TBB/TBH instructions.
+def addrmode_tbb_asmoperand : AsmOperandClass { let Name = "MemTBB"; } // asm-parser class for addrmode_tbb
+def addrmode_tbb : MemOperand { // sets only printer, parser class and sub-operands; no encoder/decoder method here
+ let PrintMethod = "printAddrModeTBB";
+ let ParserMatchClass = addrmode_tbb_asmoperand;
+ let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm); // two register sub-operands: $Rn and $Rm
+}
+def addrmode_tbh_asmoperand : AsmOperandClass { let Name = "MemTBH"; } // asm-parser class for addrmode_tbh
+def addrmode_tbh : MemOperand { // same shape as addrmode_tbb, with its own printer and parser class
+ let PrintMethod = "printAddrModeTBH";
+ let ParserMatchClass = addrmode_tbh_asmoperand;
+ let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm); // two register sub-operands: $Rn and $Rm
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+
+class T2OneRegImm<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // register operand, placed in Inst{11-8}
+ bits<12> imm; // 12-bit immediate, split over Inst{26}, Inst{14-12} and Inst{7-0}
+ // Field placement, highest instruction bit first.
+ let Inst{26} = imm{11};
+ let Inst{14-12} = imm{10-8};
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = imm{7-0};
+}
+
+
+class T2sOneRegImm<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2sI<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rn; // declared, but this class assigns it to no Inst bits
+ bits<12> imm; // split over Inst{26}, Inst{14-12} and Inst{7-0}
+ // Field placement, highest instruction bit first (T2sI-based twin of T2OneRegImm).
+ let Inst{26} = imm{11};
+ let Inst{14-12} = imm{10-8};
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = imm{7-0};
+}
+
+class T2OneRegCmpImm<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rn; // register operand, placed in Inst{19-16}; this class encodes no Rd
+ bits<12> imm; // split over Inst{26}, Inst{14-12} and Inst{7-0}
+ // Field placement, highest instruction bit first.
+ let Inst{26} = imm{11};
+ let Inst{19-16} = Rn;
+ let Inst{14-12} = imm{10-8};
+ let Inst{7-0} = imm{7-0};
+}
+
+
+class T2OneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<12> ShiftedRm; // packed operand; bits {11-9}, {8-7}, {6-5} and {3-0} are used, bit {4} is not
+ // Field placement, highest instruction bit first.
+ let Inst{14-12} = ShiftedRm{11-9};
+ let Inst{11-8} = Rd;
+ let Inst{7-6} = ShiftedRm{8-7};
+ let Inst{5-4} = ShiftedRm{6-5};
+ let Inst{3-0} = ShiftedRm{3-0};
+}
+
+class T2sOneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2sI<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<12> ShiftedRm; // packed operand; bits {11-9}, {8-7}, {6-5} and {3-0} are used, bit {4} is not
+ // Field placement, highest instruction bit first (T2sI-based twin of T2OneRegShiftedReg).
+ let Inst{14-12} = ShiftedRm{11-9};
+ let Inst{11-8} = Rd;
+ let Inst{7-6} = ShiftedRm{8-7};
+ let Inst{5-4} = ShiftedRm{6-5};
+ let Inst{3-0} = ShiftedRm{3-0};
+}
+
+class T2OneRegCmpShiftedReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rn; // placed in Inst{19-16}; this class encodes no Rd
+ bits<12> ShiftedRm; // packed operand; bits {11-9}, {8-7}, {6-5} and {3-0} are used, bit {4} is not
+ // Field placement, highest instruction bit first.
+ let Inst{19-16} = Rn;
+ let Inst{14-12} = ShiftedRm{11-9};
+ let Inst{7-6} = ShiftedRm{8-7};
+ let Inst{5-4} = ShiftedRm{6-5};
+ let Inst{3-0} = ShiftedRm{3-0};
+}
+
+class T2TwoReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rm; // placed in Inst{3-0}
+ // Only the two register fields are encoded here; all other bits are left to the instantiating def.
+ let Inst{11-8} = Rd;
+ let Inst{3-0} = Rm;
+}
+
+class T2sTwoReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2sI<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rm; // placed in Inst{3-0}
+ // Same field placement as T2TwoReg, but derived from T2sI instead of T2I.
+ let Inst{11-8} = Rd;
+ let Inst{3-0} = Rm;
+}
+
+class T2TwoRegCmp<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<4> Rm; // placed in Inst{3-0}
+ // No Rd field: T2I_cmp_irs hard-codes Inst{11-8} to 0b1111 in the def that uses this class.
+ let Inst{19-16} = Rn;
+ let Inst{3-0} = Rm;
+}
+
+
+class T2TwoRegImm<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<12> imm; // split over Inst{26}, Inst{14-12} and Inst{7-0}
+ // Field placement, highest instruction bit first.
+ let Inst{26} = imm{11};
+ let Inst{19-16} = Rn;
+ let Inst{14-12} = imm{10-8};
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = imm{7-0};
+}
+
+class T2sTwoRegImm<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2sI<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<12> imm; // split over Inst{26}, Inst{14-12} and Inst{7-0}
+ // Field placement, highest instruction bit first (T2sI-based twin of T2TwoRegImm).
+ let Inst{26} = imm{11};
+ let Inst{19-16} = Rn;
+ let Inst{14-12} = imm{10-8};
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = imm{7-0};
+}
+
+class T2TwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rm; // placed in Inst{3-0}
+ bits<5> imm; // 5-bit amount: upper three bits to Inst{14-12}, lower two to Inst{7-6}
+ // Field placement, highest instruction bit first.
+ let Inst{14-12} = imm{4-2};
+ let Inst{11-8} = Rd;
+ let Inst{7-6} = imm{1-0};
+ let Inst{3-0} = Rm;
+}
+
+class T2sTwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2sI<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rm; // placed in Inst{3-0}
+ bits<5> imm; // 5-bit amount: upper three bits to Inst{14-12}, lower two to Inst{7-6}
+ // Field placement, highest instruction bit first (T2sI-based twin of T2TwoRegShiftImm).
+ let Inst{14-12} = imm{4-2};
+ let Inst{11-8} = Rd;
+ let Inst{7-6} = imm{1-0};
+ let Inst{3-0} = Rm;
+}
+
+class T2ThreeReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<4> Rm; // placed in Inst{3-0}
+ // Field placement, highest instruction bit first.
+ let Inst{19-16} = Rn;
+ let Inst{11-8} = Rd;
+ let Inst{3-0} = Rm;
+}
+
+class T2ThreeRegNoP<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : T2XI<oops, iops, itin, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<4> Rm; // placed in Inst{3-0}
+ // Same register layout as T2ThreeReg, but built on T2XI and taking no separate opc string.
+ let Inst{19-16} = Rn;
+ let Inst{11-8} = Rd;
+ let Inst{3-0} = Rm;
+}
+
+class T2sThreeReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2sI<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<4> Rm; // placed in Inst{3-0}
+ // Field placement, highest instruction bit first (T2sI-based twin of T2ThreeReg).
+ let Inst{19-16} = Rn;
+ let Inst{11-8} = Rd;
+ let Inst{3-0} = Rm;
+}
+
+class T2TwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<12> ShiftedRm; // packed operand; bits {11-9}, {8-7}, {6-5} and {3-0} are used, bit {4} is not
+ // Field placement, highest instruction bit first.
+ let Inst{19-16} = Rn;
+ let Inst{14-12} = ShiftedRm{11-9};
+ let Inst{11-8} = Rd;
+ let Inst{7-6} = ShiftedRm{8-7};
+ let Inst{5-4} = ShiftedRm{6-5};
+ let Inst{3-0} = ShiftedRm{3-0};
+}
+
+class T2sTwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2sI<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<12> ShiftedRm; // packed operand; bits {11-9}, {8-7}, {6-5} and {3-0} are used, bit {4} is not
+ // Field placement, highest instruction bit first (T2sI-based twin of T2TwoRegShiftedReg).
+ let Inst{19-16} = Rn;
+ let Inst{14-12} = ShiftedRm{11-9};
+ let Inst{11-8} = Rd;
+ let Inst{7-6} = ShiftedRm{8-7};
+ let Inst{5-4} = ShiftedRm{6-5};
+ let Inst{3-0} = ShiftedRm{3-0};
+}
+
+class T2FourReg<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<4> Rm; // placed in Inst{3-0}
+ bits<4> Ra; // placed in Inst{15-12}
+ // Four register fields; Inst{7-4} and the opcode bits are left to the instantiating def.
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Ra;
+ let Inst{11-8} = Rd;
+ let Inst{3-0} = Rm;
+}
+
+class T2MulLong<bits<3> opc22_20, bits<4> opc7_4, // the two opcode fields, named after the Inst bits they fill
+ string opc, list<dag> pattern>
+ : T2I<(outs rGPR:$RdLo, rGPR:$RdHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64, // two results, two sources
+ opc, "\t$RdLo, $RdHi, $Rn, $Rm", pattern> {
+ bits<4> RdLo; // placed in Inst{15-12}
+ bits<4> RdHi; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<4> Rm; // placed in Inst{3-0}
+ // Complete 32-bit layout: fixed prefix, two opcode fields and four registers.
+ let Inst{31-23} = 0b111110111;
+ let Inst{22-20} = opc22_20;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = RdLo;
+ let Inst{11-8} = RdHi;
+ let Inst{7-4} = opc7_4;
+ let Inst{3-0} = Rm;
+}
+class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4, string opc> // like T2MulLong, but with tied accumulator inputs and no pattern parameter
+ : T2I<(outs rGPR:$RdLo, rGPR:$RdHi),
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64, // $RLo/$RHi are extra inputs, not printed in the asm string
+ opc, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi"> { // ties each extra input to the corresponding result register
+ bits<4> RdLo; // placed in Inst{15-12}
+ bits<4> RdHi; // placed in Inst{11-8}
+ bits<4> Rn; // placed in Inst{19-16}
+ bits<4> Rm; // placed in Inst{3-0}
+ // Same bit layout as T2MulLong; RLo/RHi get no encoding fields of their own.
+ let Inst{31-23} = 0b111110111;
+ let Inst{22-20} = opc22_20;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = RdLo;
+ let Inst{11-8} = RdHi;
+ let Inst{7-4} = opc7_4;
+ let Inst{3-0} = Rm;
+}
+
+
+/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// binary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_bin_irs<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, // itineraries for the ri, rr and rs defs respectively
+ SDPatternOperator opnode, bit Commutable = 0, // Commutable is applied to the rr def only
+ string wide = ""> { // suffix inserted into the rr/rs asm strings and their aliases; T2I_bin_w_irs passes ".w"
+ // shifted imm
+ def ri : T2sTwoRegImm<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii,
+ opc, "\t$Rd, $Rn, $imm", // the immediate form does not use "wide"
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
+ Sched<[WriteALU, ReadALU]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24-21} = opcod;
+ let Inst{15} = 0;
+ }
+ // register
+ def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir,
+ opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"),
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
+ let isCommutable = Commutable;
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{14-12} = 0b000; // imm3
+ let Inst{7-6} = 0b00; // imm2
+ let Inst{5-4} = 0b00; // type
+ }
+ // shifted register
+ def rs : T2sTwoRegShiftedReg<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis,
+ opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"),
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
+ Sched<[WriteALUsi, ReadALU]> {
+ let Inst{31-27} = 0b11101; // same fixed bits as rr; imm3/imm2/type come from ShiftedRm via the format class
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ }
+ // Assembly aliases for optional destination operand when it's the same
+ // as the source operand.
+ def : t2InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, // $Rdn is passed as both destination and first source
+ t2_so_imm:$imm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"),
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn,
+ rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"),
+ (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn,
+ t2_so_reg:$shift, pred:$p,
+ cc_out:$s)>;
+}
+
+/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
+/// the ".w" suffix to indicate that they are wide.
+multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+ SDPatternOperator opnode, bit Commutable = 0> :
+ T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w"> { // inherits ri/rr/rs with wide = ".w"
+ // Assembler aliases w/ the ".w" suffix.
+ def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rd, $Rn, $imm"), // the inherited ri asm string has no ".w", so accept it here
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p,
+ cc_out:$s)>;
+ // Assembler aliases w/o the ".w" suffix.
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"), // the inherited rr/rs asm strings carry ".w", so accept the bare spelling
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $shift"),
+ (!cast<Instruction>(NAME#"rs") rGPR:$Rd, rGPR:$Rn, t2_so_reg:$shift,
+ pred:$p, cc_out:$s)>;
+
+ // and with the optional destination operand, too.
+ def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rdn, $imm"),
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm,
+ pred:$p, cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $shift"),
+ (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$shift,
+ pred:$p, cc_out:$s)>;
+}
+
+/// T2I_rbin_irs - Same as T2I_bin_irs except the order of operands is
+/// reversed. The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the T2I_bin_irs counterpart.
+multiclass T2I_rbin_irs<bits<4> opcod, string opc, SDNode opnode> {
+ // shifted imm
+ def ri : T2sTwoRegImm<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
+ opc, ".w\t$Rd, $Rn, $imm",
+ [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]>, // DAG operands reversed: immediate first, then $Rn
+ Sched<[WriteALU, ReadALU]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24-21} = opcod;
+ let Inst{15} = 0;
+ }
+ // register
+ def rr : T2sThreeReg<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+ opc, "\t$Rd, $Rn, $Rm",
+ [/* For disassembly only; pattern left blank */]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{14-12} = 0b000; // imm3
+ let Inst{7-6} = 0b00; // imm2
+ let Inst{5-4} = 0b00; // type
+ }
+ // shifted register
+ def rs : T2sTwoRegShiftedReg<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+ IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm",
+ [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]>, // DAG operands reversed: shifted register first, then $Rn
+ Sched<[WriteALUsi, ReadALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ }
+}
+
+/// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the
+/// instruction modifies the CPSR register.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in { // applies to every def produced by the multiclass below
+multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
+ InstrItinClass iis, SDNode opnode,
+ bit Commutable = 0> { // Commutable is applied to the rr pseudo only
+ // shifted imm
+ def ri : t2PseudoInst<(outs rGPR:$Rd), // pseudo-instruction: no Inst bits are assigned in this multiclass
+ (ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p),
+ 4, iii,
+ [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+ t2_so_imm:$imm))]>,
+ Sched<[WriteALU, ReadALU]>;
+ // register
+ def rr : t2PseudoInst<(outs rGPR:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm, pred:$p),
+ 4, iir,
+ [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+ rGPR:$Rm))]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
+ let isCommutable = Commutable;
+ }
+ // shifted register
+ def rs : t2PseudoInst<(outs rGPR:$Rd),
+ (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+ 4, iis,
+ [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+ t2_so_reg:$ShiftedRm))]>,
+ Sched<[WriteALUsi, ReadALUsr]>; // NOTE(review): the other rs defs in this section pair WriteALUsi with ReadALU - confirm ReadALUsr is intended
+}
+}
+
+/// T2I_rbin_s_is - Same as T2I_bin_s_irs, except selection DAG
+/// operands are reversed.
+let hasPostISelHook = 1, Defs = [CPSR] in { // applies to every def produced by the multiclass below
+multiclass T2I_rbin_s_is<SDNode opnode> { // only ri and rs forms; there is no rr pseudo here
+ // shifted imm
+ def ri : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
+ 4, IIC_iALUi,
+ [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm, // immediate is the first DAG operand
+ rGPR:$Rn))]>,
+ Sched<[WriteALU, ReadALU]>;
+ // shifted register
+ def rs : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+ 4, IIC_iALUsi,
+ [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm, // shifted register is the first DAG operand
+ rGPR:$Rn))]>,
+ Sched<[WriteALUsi, ReadALU]>;
+}
+}
+
+/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg})
+/// patterns for a binary operation that produces a value.
+multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode,
+ bit Commutable = 0> { // Commutable is applied to the rr def only
+ // shifted imm
+ // The register-immediate version is re-materializable. This is useful
+ // in particular for taking the address of a local.
+ let isReMaterializable = 1 in {
+ def ri : T2sTwoRegImm<
+ (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
+ opc, ".w\t$Rd, $Rn, $imm",
+ [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>,
+ Sched<[WriteALU, ReadALU]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24} = 1; // fixed; together with op23_21 it fills Inst{24-21}
+ let Inst{23-21} = op23_21;
+ let Inst{15} = 0;
+ }
+ }
+ // 12-bit imm
+ def ri12 : T2I< // built directly on T2I, so the full field layout is spelled out in the body
+ (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
+ !strconcat(opc, "w"), "\t$Rd, $Rn, $imm", // mnemonic is opc with a "w" appended
+ [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>,
+ Sched<[WriteALU, ReadALU]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> imm;
+ let Inst{31-27} = 0b11110;
+ let Inst{26} = imm{11};
+ let Inst{25-24} = 0b10;
+ let Inst{23-21} = op23_21;
+ let Inst{20} = 0; // The S bit.
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14-12} = imm{10-8};
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = imm{7-0};
+ }
+ // register
+ def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
+ IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
+ [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
+ let isCommutable = Commutable;
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24} = 1;
+ let Inst{23-21} = op23_21;
+ let Inst{14-12} = 0b000; // imm3
+ let Inst{7-6} = 0b00; // imm2
+ let Inst{5-4} = 0b00; // type
+ }
+ // shifted register
+ def rs : T2sTwoRegShiftedReg<
+ (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
+ IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+ [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]>,
+ Sched<[WriteALUsi, ReadALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24} = 1;
+ let Inst{23-21} = op23_21;
+ }
+}
+
+/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
+/// for a binary operation that produces a value and use the carry
+/// bit. It's not predicable.
+let Defs = [CPSR], Uses = [CPSR] in { // NOTE(review): the defs use the same T2s* format classes that T2I_bin_irs describes as predicable - confirm the "not predicable" remark still holds
+multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, SDNode opnode,
+ bit Commutable = 0> { // Commutable is applied to the rr def only
+ // shifted imm
+ def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
+ IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+ [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>, // CPSR appears as both a result and an input of the pattern
+ Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24-21} = opcod;
+ let Inst{15} = 0;
+ }
+ // register
+ def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+ opc, ".w\t$Rd, $Rn, $Rm",
+ [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>,
+ Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU, ReadALU]> {
+ let isCommutable = Commutable;
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{14-12} = 0b000; // imm3
+ let Inst{7-6} = 0b00; // imm2
+ let Inst{5-4} = 0b00; // type
+ }
+ // shifted register
+ def rs : T2sTwoRegShiftedReg<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+ IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+ [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>,
+ Requires<[IsThumb2]>, Sched<[WriteALUsi, ReadALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ }
+}
+}
+
+/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
+/// rotate operation that produces a value.
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, SDNode opnode> { // ty: operand type of the immediate shift amount
+ // 5-bit imm
+ def ri : T2sTwoRegShiftImm<
+ (outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
+ opc, ".w\t$Rd, $Rm, $imm",
+ [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]>,
+ Sched<[WriteALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-21} = 0b010010;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{5-4} = opcod; // in the immediate form opcod goes to Inst{5-4}
+ }
+ // register
+ def rr : T2sThreeReg<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr,
+ opc, ".w\t$Rd, $Rn, $Rm",
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+ Sched<[WriteALU]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-21} = opcod; // in the register form opcod goes to Inst{22-21}
+ let Inst{15-12} = 0b1111;
+ let Inst{7-4} = 0b0000;
+ }
+
+ // Optional destination register
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $imm"), // two-operand ".w" spelling; $Rdn is destination and source
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $Rm"),
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+
+ // Assembler aliases w/o the ".w" suffix.
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $imm"),
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, ty:$imm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+
+ // and with the optional destination operand, too.
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $imm"),
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+}
+
+/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to T2I_bin_irs except the instruction does not produce
+/// an explicit result; it only implicitly sets CPSR.
+multiclass T2I_cmp_irs<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, // itineraries for the ri, rr and rs defs respectively
+ SDPatternOperator opnode> {
+let isCompare = 1, Defs = [CPSR] in { // covers the three defs; the aliases after the closing brace are outside
+ // shifted imm
+ def ri : T2OneRegCmpImm<
+ (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
+ opc, ".w\t$Rn, $imm",
+ [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24-21} = opcod;
+ let Inst{20} = 1; // The S bit.
+ let Inst{15} = 0;
+ let Inst{11-8} = 0b1111; // Rd
+ }
+ // register
+ def rr : T2TwoRegCmp<
+ (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
+ opc, ".w\t$Rn, $Rm",
+ [(opnode GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{20} = 1; // The S bit.
+ let Inst{14-12} = 0b000; // imm3
+ let Inst{11-8} = 0b1111; // Rd
+ let Inst{7-6} = 0b00; // imm2
+ let Inst{5-4} = 0b00; // type
+ }
+ // shifted register
+ def rs : T2OneRegCmpShiftedReg<
+ (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
+ opc, ".w\t$Rn, $ShiftedRm",
+ [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+ Sched<[WriteCMPsi]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{20} = 1; // The S bit.
+ let Inst{11-8} = 0b1111; // Rd
+ }
+}
+
+ // Assembler aliases w/o the ".w" suffix.
+ // No alias here for 'rr' version as not all instantiations of this
+ // multiclass want one (CMP in particular, does not).
+ def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"), // no ${s} / cc_out here, unlike the T2I_bin_irs aliases
+ (!cast<Instruction>(NAME#"ri") GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+ def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"),
+ (!cast<Instruction>(NAME#"rs") GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
+}
+
+/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
+multiclass T2I_ld<bit signed, bits<2> opcod, string opc, // signed goes to Inst{24}, opcod to Inst{22-21} in every form
+ InstrItinClass iii, InstrItinClass iis, RegisterClass target, // iii: i12/i8/pci forms, iis: s form; target: class of $Rt
+ PatFrag opnode> {
+ def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii,
+ opc, ".w\t$Rt, $addr",
+ [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> {
+ bits<4> Rt;
+ bits<17> addr; // packed: {16-13} = Rn, {11-0} = imm; bit {12} is not encoded by this def
+ let Inst{31-25} = 0b1111100;
+ let Inst{24} = signed;
+ let Inst{23} = 1;
+ let Inst{22-21} = opcod;
+ let Inst{20} = 1; // load
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = addr{11-0}; // imm
+
+ let DecoderMethod = "DecodeT2LoadImm12";
+ }
+ def i8 : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
+ opc, "\t$Rt, $addr",
+ [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> {
+ bits<4> Rt;
+ bits<13> addr; // packed: {12-9} = Rn, {8} = U, {7-0} = imm
+ let Inst{31-27} = 0b11111;
+ let Inst{26-25} = 0b00;
+ let Inst{24} = signed;
+ let Inst{23} = 0;
+ let Inst{22-21} = opcod;
+ let Inst{20} = 1; // load
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11} = 1;
+ // Offset: index==TRUE, wback==FALSE
+ let Inst{10} = 1; // The P bit.
+ let Inst{9} = addr{8}; // U
+ let Inst{8} = 0; // The W bit.
+ let Inst{7-0} = addr{7-0}; // imm
+
+ let DecoderMethod = "DecodeT2LoadImm8";
+ }
+ def s : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
+ opc, ".w\t$Rt, $addr",
+ [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-25} = 0b00;
+ let Inst{24} = signed;
+ let Inst{23} = 0;
+ let Inst{22-21} = opcod;
+ let Inst{20} = 1; // load
+ let Inst{11-6} = 0b000000;
+
+ bits<4> Rt;
+ let Inst{15-12} = Rt;
+
+ bits<10> addr; // packed: {9-6} = Rn, {5-2} = Rm, {1-0} = imm
+ let Inst{19-16} = addr{9-6}; // Rn
+ let Inst{3-0} = addr{5-2}; // Rm
+ let Inst{5-4} = addr{1-0}; // imm
+
+ let DecoderMethod = "DecodeT2LoadShift";
+ }
+
+ // pci variant is very similar to i12, but supports negative offsets
+ // from the PC.
+ def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii,
+ opc, ".w\t$Rt, $addr",
+ [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> { // matches loads from a wrapped constant-pool address
+ let isReMaterializable = 1;
+ let Inst{31-27} = 0b11111;
+ let Inst{26-25} = 0b00;
+ let Inst{24} = signed;
+ let Inst{22-21} = opcod;
+ let Inst{20} = 1; // load
+ let Inst{19-16} = 0b1111; // Rn
+
+ bits<4> Rt;
+ let Inst{15-12} = Rt{3-0};
+
+ bits<13> addr; // packed: {12} = add flag, {11-0} = offset bits
+ let Inst{23} = addr{12}; // add = (U == '1')
+ let Inst{11-0} = addr{11-0};
+
+ let DecoderMethod = "DecodeT2LoadLabel";
+ }
+}
+
+/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
+multiclass T2I_st<bits<2> opcod, string opc, // opcod goes to Inst{22-21} in every form
+ InstrItinClass iii, InstrItinClass iis, RegisterClass target, // iii: i12/i8 forms, iis: s form; target: class of $Rt
+ PatFrag opnode> {
+ def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii,
+ opc, ".w\t$Rt, $addr",
+ [(opnode target:$Rt, t2addrmode_imm12:$addr)]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0001; // sets Inst{23} to 1; Inst{23} is assigned again from addr{12} further down
+ let Inst{22-21} = opcod;
+ let Inst{20} = 0; // !load
+
+ bits<4> Rt;
+ let Inst{15-12} = Rt;
+
+ bits<17> addr; // packed: {16-13} = Rn, {12} = U (forced to 1 on the next line), {11-0} = imm
+ let addr{12} = 1; // add = TRUE
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{23} = addr{12}; // U
+ let Inst{11-0} = addr{11-0}; // imm
+ }
+ def i8 : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii,
+ opc, "\t$Rt, $addr",
+ [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0000;
+ let Inst{22-21} = opcod;
+ let Inst{20} = 0; // !load
+ let Inst{11} = 1;
+ // Offset: index==TRUE, wback==FALSE
+ let Inst{10} = 1; // The P bit.
+ let Inst{8} = 0; // The W bit.
+
+ bits<4> Rt;
+ let Inst{15-12} = Rt;
+
+ bits<13> addr; // packed: {12-9} = Rn, {8} = U, {7-0} = imm
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{9} = addr{8}; // U
+ let Inst{7-0} = addr{7-0}; // imm
+ }
+ def s : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis,
+ opc, ".w\t$Rt, $addr",
+ [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0000;
+ let Inst{22-21} = opcod;
+ let Inst{20} = 0; // !load
+ let Inst{11-6} = 0b000000;
+
+ bits<4> Rt;
+ let Inst{15-12} = Rt;
+
+ bits<10> addr; // packed: {9-6} = Rn, {5-2} = Rm, {1-0} = imm
+ let Inst{19-16} = addr{9-6}; // Rn
+ let Inst{3-0} = addr{5-2}; // Rm
+ let Inst{5-4} = addr{1-0}; // imm
+ }
+}
+
+/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
+ : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+ opc, ".w\t$Rd, $Rm$rot", // $rot is printed directly after $Rm, with no separator in the format string
+ [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+ Requires<[IsThumb2]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 1;
+ // Rd and Rm are placed by T2TwoReg; only the rotation field is added here.
+ bits<2> rot;
+ let Inst{5-4} = rot{1-0}; // rotate
+}
+
+// UXTB16 - Requires T2ExtractPack, does not need the .w qualifier.
+class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
+ : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
+ IIC_iEXTr, opc, "\t$Rd, $Rm$rot", // same operands as T2I_ext_rrot, without ".w" in the asm string
+ [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+ Requires<[HasT2ExtractPack, IsThumb2]> {
+ bits<2> rot; // placed in Inst{5-4}
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 1;
+ let Inst{5-4} = rot;
+}
+
+// SXTB16 - Requires T2ExtractPack, does not need the .w qualifier, no pattern
+// supported yet.
+class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc> // no opnode parameter: the pattern list below is empty
+ : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+ opc, "\t$Rd, $Rm$rot", []>,
+ Requires<[IsThumb2, HasT2ExtractPack]> {
+ bits<2> rot; // placed in Inst{5-4}
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 1;
+ let Inst{5-4} = rot;
+}
+
+/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+ : T2ThreeReg<(outs rGPR:$Rd),
+ (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
+ IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>, // only $Rm is rotated; $Rn is passed through unchanged
+ Requires<[HasT2ExtractPack, IsThumb2]> {
+ bits<2> rot; // placed in Inst{5-4}
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{15-12} = 0b1111; // Inst{19-16} is not fixed here; T2ThreeReg fills it with Rn
+ let Inst{7} = 1;
+ let Inst{5-4} = rot;
+}
+
+class T2I_exta_rrot_np<bits<3> opcod, string opc> // same encoding as T2I_exta_rrot, with an empty pattern list
+ : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
+ IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+ Requires<[HasT2ExtractPack, IsThumb2]> {
+ bits<2> rot; // placed in Inst{5-4}
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 1;
+ let Inst{5-4} = rot;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : T2XI<oops, iops, itin, asm, pattern> {
+ bits<4> Rd; // placed in Inst{11-8}
+ bits<12> label; // 12-bit field split over Inst{26}, Inst{14-12} and Inst{7-0}
+ // Field placement, highest instruction bit first.
+ let Inst{26} = label{11};
+ let Inst{14-12} = label{10-8};
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = label{7-0};
+}
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
+ (ins t2adrlabel:$addr, pred:$p),
+ IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []>,
+ Sched<[WriteALU, ReadALU]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25-24} = 0b10;
+ // Inst{23} and Inst{21} are both '1' (add = FALSE) or both '0' (add = TRUE)
+ let Inst{22} = 0;
+ let Inst{20} = 0;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15} = 0;
+ // The fields below re-assign the Inst bits that T2PCOneRegImm fills from "label", now sourced from "addr".
+ bits<4> Rd;
+ bits<13> addr; // packed: {12} drives Inst{23} and Inst{21}, {11-0} is the split 12-bit field
+ let Inst{11-8} = Rd;
+ let Inst{23} = addr{12};
+ let Inst{21} = addr{12};
+ let Inst{26} = addr{11};
+ let Inst{14-12} = addr{10-8};
+ let Inst{7-0} = addr{7-0};
+
+ let DecoderMethod = "DecodeT2Adr";
+}
+
+let hasSideEffects = 0, isReMaterializable = 1 in // flags apply to t2LEApcrel only (single-def "let ... in")
+def t2LEApcrel : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p), // pseudo with an empty pattern list
+ 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
+let hasSideEffects = 1 in // unlike t2LEApcrel: side effects flagged, and not marked re-materializable
+def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
+ (ins i32imm:$label, pred:$p), // same operand list as t2LEApcrel
+ 4, IIC_iALUi,
+ []>, Sched<[WriteALU, ReadALU]>;
+
+
+//===----------------------------------------------------------------------===//
+// Load / store Instructions.
+//
+
+// Load
+let canFoldAsLoad = 1, isReMaterializable = 1 in // flags apply to the t2LDR expansion only
+defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR, load>; // signed=0, opcod=0b10; yields the i12, i8, s and pci forms
+
+// Loads with zero extension
+defm t2LDRH : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si, // signed=0, opcod=0b01
+ GPRnopc, zextloadi16>;
+defm t2LDRB : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si, // signed=0, opcod=0b00
+ GPRnopc, zextloadi8>;
+
+// Loads with sign extension
+defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si, // signed=1, opcod=0b01
+ GPRnopc, sextloadi16>;
+defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si, // signed=1, opcod=0b00
+ GPRnopc, sextloadi8>;
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+// Load doubleword
+def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2), // two result registers
+ (ins t2addrmode_imm8s4:$addr),
+ IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>; // empty constraint string and empty pattern list
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
+
+// zextload i1 -> zextload i8
+def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr), // one pattern per T2I_ld form (i12, i8, s, pci), all mapped to t2LDRB
+ (t2LDRBi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_negimm8:$addr),
+ (t2LDRBi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr),
+ (t2LDRBs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)),
+ (t2LDRBpci tconstpool:$addr)>;
+
+// extload -> zextload
+// FIXME: Reduce the number of patterns by legalizing extload to zextload
+// earlier?
+def : T2Pat<(extloadi1 t2addrmode_imm12:$addr),
+ (t2LDRBi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi1 t2addrmode_negimm8:$addr),
+ (t2LDRBi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi1 t2addrmode_so_reg:$addr),
+ (t2LDRBs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi1 (ARMWrapper tconstpool:$addr)),
+ (t2LDRBpci tconstpool:$addr)>;
+
+def : T2Pat<(extloadi8 t2addrmode_imm12:$addr),
+ (t2LDRBi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi8 t2addrmode_negimm8:$addr),
+ (t2LDRBi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi8 t2addrmode_so_reg:$addr),
+ (t2LDRBs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi8 (ARMWrapper tconstpool:$addr)),
+ (t2LDRBpci tconstpool:$addr)>;
+
+def : T2Pat<(extloadi16 t2addrmode_imm12:$addr),
+ (t2LDRHi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_negimm8:$addr),
+ (t2LDRHi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr),
+ (t2LDRHs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
+ (t2LDRHpci tconstpool:$addr)>;
+
+// FIXME: The destination register of the loads and stores can't be PC, but
+// can be SP. We need another regclass (similar to rGPR) to represent
+// that. Not a pressing issue since these are selected manually,
+// not via pattern.
+
+// Indexed loads
+
+// Pre- and post-indexed loads. Every def has two outputs: the loaded value
+// $Rt and the written-back base $Rn_wb, which the constraint string ties to
+// the base register input ("$addr.base = $Rn_wb" for the pre-indexed forms,
+// "$Rn = $Rn_wb" for the post-indexed forms). The pattern lists are empty,
+// so mayLoad / hasSideEffects are given explicitly.
+// The _PRE forms take a single t2addrmode_imm8_pre operand and print
+// "$addr!"; the _POST forms take the base and a t2am_imm8_offset separately.
+let mayLoad = 1, hasSideEffects = 0 in {
+def t2LDR_PRE : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins t2addrmode_imm8_pre:$addr),
+ AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
+ "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
+def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
+ "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins t2addrmode_imm8_pre:$addr),
+ AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+ "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
+def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+ "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins t2addrmode_imm8_pre:$addr),
+ AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+ "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
+def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+ "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+// Sign-extending variants: first template argument is 1 instead of 0.
+def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins t2addrmode_imm8_pre:$addr),
+ AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+ "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+ []>;
+
+def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+ "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins t2addrmode_imm8_pre:$addr),
+ AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+ "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+ []>;
+
+def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+ "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+} // mayLoad = 1, hasSideEffects = 0
+
+// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
+// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
+//
+// signed - Inst{24}; 1 for the sign-extending forms.
+// type - Inst{22-21}; 0b00 byte, 0b01 halfword, 0b10 word (see the defs
+// that instantiate this class).
+// opc - assembly mnemonic.
+// ii - scheduling itinerary class.
+// No selection pattern is attached.
+class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
+ : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc,
+ "\t$Rt, $addr", []> {
+ bits<4> Rt;
+ // 13-bit encoded address operand: addr{12-9} is the base register and
+ // addr{7-0} the immediate; addr{8} is not placed in the encoding here.
+ bits<13> addr;
+ let Inst{31-27} = 0b11111;
+ let Inst{26-25} = 0b00;
+ let Inst{24} = signed;
+ let Inst{23} = 0;
+ let Inst{22-21} = type;
+ let Inst{20} = 1; // load
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = Rt;
+ let Inst{11} = 1;
+ let Inst{10-8} = 0b110; // PUW.
+ let Inst{7-0} = addr{7-0};
+
+ let DecoderMethod = "DecodeT2LoadT";
+}
+
+// T2IldT instantiations: <signed, type, mnemonic, itinerary>.
+def t2LDRT : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>;
+def t2LDRBT : T2IldT<0, 0b00, "ldrbt", IIC_iLoad_bh_i>;
+def t2LDRHT : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>;
+def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
+def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
+
+// Encoding base class for the load-acquire instructions (lda/ldab/ldah),
+// available only with IsThumb and HasAcquireRelease.
+// bits23_20 - value placed in Inst{23-20}.
+// bit54 - value placed in Inst{5-4}.
+// oops/iops - output and input operand lists.
+// opc/asm - mnemonic and operand assembly string.
+// pattern - selection pattern list.
+class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary,
+ opc, asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
+ bits<4> Rt;
+ bits<4> addr;
+
+ // Operand fields: transfer register and base register.
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = addr;
+
+ // Opcode bits, listed from the most significant end.
+ // Inst{31-27} = 0b11101 and Inst{26-24} = 0b000, written as one field.
+ let Inst{31-24} = 0b11101000;
+ let Inst{23-20} = bits23_20;
+ let Inst{11-6} = 0b111110;
+ let Inst{5-4} = bit54;
+ let Inst{3-0} = 0b1111;
+}
+
+// Load-acquire word / byte / halfword. All three use 0b1101 for Inst{23-20}
+// and differ only in Inst{5-4} (0b10 word, 0b00 byte, 0b01 halfword).
+// The address is a bare base register (addr_offset_none); no patterns.
+def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt),
+ (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>;
+def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt),
+ (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>;
+def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
+ (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
+
+// Store
+// T2I_st (defined elsewhere) is instantiated for word, byte and halfword
+// stores; the first argument is the 2-bit size field used by the loads as
+// well (0b10 word, 0b00 byte, 0b01 halfword). The word store accepts GPR,
+// the truncating stores accept rGPR.
+defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR, store>;
+defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
+ rGPR, truncstorei8>;
+defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
+ rGPR, truncstorei16>;
+
+// Store doubleword
+// Both transfer registers are inputs; there are no outputs and no pattern,
+// so mayStore / hasSideEffects are given explicitly.
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
+def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
+ (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
+ IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
+
+// Indexed stores
+
+// Pre-indexed stores with base writeback. The only output is the updated
+// base $Rn_wb, tied to the base inside $addr and marked @earlyclobber.
+// The pattern lists are empty (matching is done through the *_preidx pseudos
+// defined later in this file), so mayStore / hasSideEffects are explicit.
+// NOTE(review): t2STRH_PRE uses IIC_iStore_iu while t2STRB_PRE and
+// t2STRH_POST use IIC_iStore_bh_iu -- confirm whether the halfword
+// pre-indexed store was meant to use the byte/halfword itinerary.
+let mayStore = 1, hasSideEffects = 0 in {
+def t2STR_PRE : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
+ (ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr),
+ AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+ "str", "\t$Rt, $addr!",
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+
+def t2STRH_PRE : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
+ AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+ "strh", "\t$Rt, $addr!",
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+
+def t2STRB_PRE : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
+ AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
+ "strb", "\t$Rt, $addr!",
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+} // mayStore = 1, hasSideEffects = 0
+
+// Post-indexed stores with base writeback. Unlike the pre-indexed forms,
+// these carry selection patterns directly (post_store / post_truncsti16 /
+// post_truncsti8), because base and offset are already separate operands.
+// The written-back base $Rn_wb is tied to $Rn and marked @earlyclobber.
+def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
+ (ins GPRnopc:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
+ "str", "\t$Rt, $Rn$offset",
+ "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+ [(set GPRnopc:$Rn_wb,
+ (post_store GPRnopc:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset))]>;
+
+def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+ "strh", "\t$Rt, $Rn$offset",
+ "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+ [(set GPRnopc:$Rn_wb,
+ (post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset))]>;
+
+def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+ "strb", "\t$Rt, $Rn$offset",
+ "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+ [(set GPRnopc:$Rn_wb,
+ (post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset))]>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+// All three are flagged usesCustomInserter = 1, are 4 bytes in size, and tie
+// the written-back base $Rn_wb to $Rn (early-clobber). Note that $Rt is rGPR
+// in every pseudo, including the word-sized t2STR_preidx.
+let usesCustomInserter = 1,
+ Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPRnopc:$Rn_wb,
+ (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPRnopc:$Rn_wb,
+ (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPRnopc:$Rn_wb,
+ (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+}
+
+// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
+// only.
+// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
+//
+// $Rt is the register being stored, i.e. a source operand, so it is listed
+// in (ins) rather than (outs); listing it as an output would describe the
+// store as defining $Rt. Operands are numbered outs-first, so with an empty
+// (outs) list $Rt remains the first operand, followed by $addr.
+//
+// type - Inst{22-21}; 0b00 byte, 0b01 halfword, 0b10 word.
+// opc - assembly mnemonic.
+// ii - scheduling itinerary class.
+class T2IstT<bits<2> type, string opc, InstrItinClass ii>
+ : T2Ii8<(outs), (ins rGPR:$Rt, t2addrmode_imm8:$addr), ii, opc,
+ "\t$Rt, $addr", []> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-25} = 0b00;
+ let Inst{24} = 0; // not signed
+ let Inst{23} = 0;
+ let Inst{22-21} = type;
+ let Inst{20} = 0; // store
+ let Inst{11} = 1;
+ let Inst{10-8} = 0b110; // PUW
+
+ bits<4> Rt;
+ // 13-bit encoded address operand: addr{12-9} is the base register and
+ // addr{7-0} the immediate; addr{8} is not placed in the encoding here.
+ bits<13> addr;
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = addr{12-9};
+ let Inst{7-0} = addr{7-0};
+}
+
+// T2IstT instantiations: <type, mnemonic, itinerary>.
+def t2STRT : T2IstT<0b10, "strt", IIC_iStore_i>;
+def t2STRBT : T2IstT<0b00, "strbt", IIC_iStore_bh_i>;
+def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
+
+// ldrd / strd pre / post variants
+// All four write the updated base back through the extra $wb output, tied
+// to the base with "$addr.base = $wb". None has a selection pattern, so
+// mayLoad / mayStore are set explicitly. The pre-indexed forms name custom
+// decoder methods; the post-indexed forms use T2Ii8s4post with the offset as
+// a separate t2am_imm8s4_offset operand.
+
+let mayLoad = 1 in
+def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+ (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
+ "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+ let DecoderMethod = "DecodeT2LDRDPreInstruction";
+}
+
+let mayLoad = 1 in
+def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+ (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
+ IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
+ "$addr.base = $wb", []>;
+
+let mayStore = 1 in
+def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
+ (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
+ IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
+ "$addr.base = $wb", []> {
+ let DecoderMethod = "DecodeT2STRDPreInstruction";
+}
+
+let mayStore = 1 in
+def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
+ (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
+ t2am_imm8s4_offset:$imm),
+ IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
+ "$addr.base = $wb", []>;
+
+// Encoding base class for the store-release instructions (stl/stlb/stlh),
+// available only with IsThumb and HasAcquireRelease.
+// bit54 - value placed in Inst{5-4}.
+// oops/iops - output and input operand lists.
+// opc/asm - mnemonic and operand assembly string.
+// pattern - selection pattern list.
+class T2Istrrel<bits<2> bit54, dag oops, dag iops,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
+ asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
+ bits<4> Rt;
+ bits<4> addr;
+
+ // Operand fields: transfer register and base register.
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = addr;
+
+ // Opcode bits, listed from the most significant end.
+ let Inst{31-27} = 0b11101;
+ // Inst{26-20} = 0b0001100, written as two fields.
+ let Inst{26-24} = 0b000;
+ let Inst{23-20} = 0b1100;
+ let Inst{11-6} = 0b111110;
+ let Inst{5-4} = bit54;
+ let Inst{3-0} = 0b1111;
+}
+
+// Store-release word / byte / halfword; they differ only in Inst{5-4}
+// (0b10 word, 0b00 byte, 0b01 halfword). $Rt is an input, there are no
+// outputs, and the address is a bare base register. No patterns.
+def t2STL : T2Istrrel<0b10, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+ "stl", "\t$Rt, $addr", []>;
+def t2STLB : T2Istrrel<0b00, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+ "stlb", "\t$Rt, $addr", []>;
+def t2STLH : T2Istrrel<0b01, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+ "stlh", "\t$Rt, $addr", []>;
+
+// T2Ipl (Preload Data/Instruction) signals the memory system of possible future
+// data/instruction access.
+// instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0),
+// (prefetch 1) -> (preload 2), (prefetch 2) -> (preload 1).
+//
+// write - placed in Inst{21} and passed to ARMPreload as (i32 write).
+// instr - placed in Inst{24} and passed to ARMPreload as (i32 instr).
+// opc - assembly mnemonic.
+// Expands to three defs, one per addressing form: i12, i8 and s.
+multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
+
+ // Base register plus 12-bit immediate (Inst{23} fixed to 1).
+ def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc,
+ "\t$addr",
+ [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]>,
+ Sched<[WritePreLd]> {
+ let Inst{31-25} = 0b1111100;
+ let Inst{24} = instr;
+ let Inst{23} = 1;
+ let Inst{22} = 0;
+ let Inst{21} = write;
+ let Inst{20} = 1;
+ let Inst{15-12} = 0b1111;
+
+ // Only addr{16-13} and addr{11-0} are placed in the encoding.
+ bits<17> addr;
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{11-0} = addr{11-0}; // imm12
+
+ let DecoderMethod = "DecodeT2LoadImm12";
+ }
+
+ // Base register minus 8-bit immediate (Inst{23} fixed to 0).
+ def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc,
+ "\t$addr",
+ [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]>,
+ Sched<[WritePreLd]> {
+ let Inst{31-25} = 0b1111100;
+ let Inst{24} = instr;
+ let Inst{23} = 0; // U = 0
+ let Inst{22} = 0;
+ let Inst{21} = write;
+ let Inst{20} = 1;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-8} = 0b1100;
+
+ // Only addr{12-9} and addr{7-0} are placed in the encoding.
+ bits<13> addr;
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{7-0} = addr{7-0}; // imm8
+
+ let DecoderMethod = "DecodeT2LoadImm8";
+ }
+
+ // Base register plus index register shifted left by a 2-bit immediate.
+ def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc,
+ "\t$addr",
+ [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]>,
+ Sched<[WritePreLd]> {
+ let Inst{31-25} = 0b1111100;
+ let Inst{24} = instr;
+ let Inst{23} = 0; // add = TRUE for T1
+ let Inst{22} = 0;
+ let Inst{21} = write;
+ let Inst{20} = 1;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-6} = 0b000000;
+
+ bits<10> addr;
+ let Inst{19-16} = addr{9-6}; // Rn
+ let Inst{3-0} = addr{5-2}; // Rm
+ let Inst{5-4} = addr{1-0}; // imm2
+
+ let DecoderMethod = "DecodeT2LoadShift";
+ }
+}
+
+// T2Ipl instantiations: <write, instr, mnemonic>, each with its own
+// subtarget predicate list.
+defm t2PLD : T2Ipl<0, 0, "pld">, Requires<[IsThumb2]>;
+defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
+defm t2PLI : T2Ipl<0, 1, "pli">, Requires<[IsThumb2,HasV7]>;
+
+// pci variant is very similar to i12, but supports negative offsets
+// from the PC. Only PLD and PLI have pci variants (not PLDW)
+//
+// inst - placed in Inst{24} and passed to ARMPreload as (i32 inst); the
+// write argument of ARMPreload is fixed to 0.
+// opc - assembly mnemonic.
+// The base register field Inst{19-16} is fixed to 0b1111.
+class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
+ IIC_Preload, opc, "\t$addr",
+ [(ARMPreload (ARMWrapper tconstpool:$addr),
+ (i32 0), (i32 inst))]>, Sched<[WritePreLd]> {
+ let Inst{31-25} = 0b1111100;
+ let Inst{24} = inst;
+ let Inst{22-20} = 0b001;
+ let Inst{19-16} = 0b1111;
+ let Inst{15-12} = 0b1111;
+
+ // addr{12} selects add/subtract, addr{11-0} is the offset magnitude.
+ bits<13> addr;
+ let Inst{23} = addr{12}; // add = (U == '1')
+ let Inst{11-0} = addr{11-0}; // imm12
+
+ let DecoderMethod = "DecodeT2LoadLabel";
+}
+
+// PC-relative preload data / preload instruction.
+def t2PLDpci : T2Iplpci<0, "pld">, Requires<[IsThumb2]>;
+def t2PLIpci : T2Iplpci<1, "pli">, Requires<[IsThumb2,HasV7]>;
+
+//===----------------------------------------------------------------------===//
+// Load / store multiple Instructions.
+//
+
+// Load-multiple multiclass. Expands to four defs: IA and DB (no writeback)
+// and IA_UPD and DB_UPD (writeback, with $wb tied to $Rn).
+// asm - mnemonic stem; ".w" or "db" plus the predicate are appended.
+// itin - itinerary for the non-writeback forms.
+// itin_upd - itinerary for the writeback forms.
+// L_bit - placed in Inst{20}.
+// All 16 bits of the register list are encoded in Inst{15-0}.
+multiclass thumb2_ld_mult<string asm, InstrItinClass itin,
+ InstrItinClass itin_upd, bit L_bit> {
+ // Increment after, no writeback.
+ def IA :
+ T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{22} = 0;
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15-0} = regs;
+ }
+ // Increment after, base register updated.
+ def IA_UPD :
+ T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{22} = 0;
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15-0} = regs;
+ }
+ // Decrement before, no writeback.
+ def DB :
+ T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{22} = 0;
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15-0} = regs;
+ }
+ // Decrement before, base register updated.
+ def DB_UPD :
+ T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{22} = 0;
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15-0} = regs;
+ }
+}
+
+// Everything up to the closing brace is marked hasSideEffects = 0: the
+// t2LDM instantiation, the store-multiple multiclass and t2STM.
+let hasSideEffects = 0 in {
+
+// Load multiple: L_bit = 1.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
+
+// Store-multiple multiclass. Same four variants and parameters as
+// thumb2_ld_mult, except that register-list bits 15 and 13 are forced to 0
+// in the encoding; only regs{14} and regs{12-0} are taken from the operand.
+// NOTE(review): bits 15 and 13 presumably correspond to PC and SP -- confirm.
+multiclass thumb2_st_mult<string asm, InstrItinClass itin,
+ InstrItinClass itin_upd, bit L_bit> {
+ // Increment after, no writeback.
+ def IA :
+ T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{22} = 0;
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14} = regs{14};
+ let Inst{13} = 0;
+ let Inst{12-0} = regs{12-0};
+ }
+ // Increment after, base register updated.
+ def IA_UPD :
+ T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{22} = 0;
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14} = regs{14};
+ let Inst{13} = 0;
+ let Inst{12-0} = regs{12-0};
+ }
+ // Decrement before, no writeback.
+ def DB :
+ T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{22} = 0;
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14} = regs{14};
+ let Inst{13} = 0;
+ let Inst{12-0} = regs{12-0};
+ }
+ // Decrement before, base register updated.
+ def DB_UPD :
+ T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{22} = 0;
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14} = regs{14};
+ let Inst{13} = 0;
+ let Inst{12-0} = regs{12-0};
+ }
+}
+
+
+// Store multiple: L_bit = 0.
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
+
+} // hasSideEffects
+
+
+//===----------------------------------------------------------------------===//
+// Move Instructions.
+//
+
+// Register-to-register move (wide encoding, printed with ".w"). No selection
+// pattern is attached, so hasSideEffects is given explicitly. The Rn field
+// Inst{19-16} is fixed to 0b1111.
+let hasSideEffects = 0 in
+def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+ "mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = 0b0010;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{14-12} = 0b000;
+ let Inst{7-4} = 0b0000;
+}
+// Assembly aliases for t2MOVr. The last alias operand is the cc_out value:
+// zero_reg for "mov", CPSR for the flag-setting "movs" spellings.
+def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+ pred:$p, zero_reg)>;
+def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+ pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+ pred:$p, CPSR)>;
+
+// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
+// Move of a t2_so_imm immediate; rematerializable and as cheap as a move.
+// The Rn field Inst{19-16} is fixed to 0b1111.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+ AddedComplexity = 1 in
+def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
+ "mov", ".w\t$Rd, $imm",
+ [(set rGPR:$Rd, t2_so_imm:$imm)]>, Sched<[WriteALU]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24-21} = 0b0010;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15} = 0;
+}
+
+// cc_out is handled as part of the explicit mnemonic in the parser for 'mov'.
+// Use aliases to get that to play nice here.
+// "movs" spellings pass CPSR as cc_out; "mov" spellings pass zero_reg.
+def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+ pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+ pred:$p, CPSR)>;
+
+def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+ pred:$p, zero_reg)>;
+def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+ pred:$p, zero_reg)>;
+
+// movw: move a 16-bit immediate (imm0_65535) into $Rd. The immediate is
+// scattered over four encoding fields: imm{15-12} -> Inst{19-16},
+// imm{11} -> Inst{26}, imm{10-8} -> Inst{14-12}, imm{7-0} -> Inst{7-0}.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
+ "movw", "\t$Rd, $imm",
+ [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 1;
+ let Inst{24-21} = 0b0010;
+ let Inst{20} = 0; // The S bit.
+ let Inst{15} = 0;
+
+ bits<4> Rd;
+ bits<16> imm;
+
+ let Inst{11-8} = Rd;
+ let Inst{19-16} = imm{15-12};
+ let Inst{26} = imm{11};
+ let Inst{14-12} = imm{10-8};
+ let Inst{7-0} = imm{7-0};
+ let DecoderMethod = "DecodeT2MOVTWInstruction";
+}
+
+// Accept plain "mov" with an imm256_65535_expr immediate as t2MOVi16. The
+// trailing 0 is the alias's emit flag, so the alias is parse-only.
+def : InstAlias<"mov${p} $Rd, $imm",
+ (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p), 0>,
+ Requires<[IsThumb, HasV8MBaseline]>;
+
+// Pseudo taking an address operand and a pclabel id; no pattern.
+// NOTE(review): presumably the PC-relative global-address form of movw that
+// is expanded later -- confirm where it is lowered.
+def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+ (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+
+// movt and its pcrel pseudo: $Rd is tied to $src, since only part of the
+// register is replaced.
+let Constraints = "$src = $Rd" in {
+// Pattern: keep the low 16 bits of $src and OR in an immediate whose low 16
+// bits are zero (lo16AllZero). The immediate uses the same scattered
+// encoding as t2MOVi16.
+def t2MOVTi16 : T2I<(outs rGPR:$Rd),
+ (ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi,
+ "movt", "\t$Rd, $imm",
+ [(set rGPR:$Rd,
+ (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]>,
+ Sched<[WriteALU]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 1;
+ let Inst{24-21} = 0b0110;
+ let Inst{20} = 0; // The S bit.
+ let Inst{15} = 0;
+
+ bits<4> Rd;
+ bits<16> imm;
+
+ let Inst{11-8} = Rd;
+ let Inst{19-16} = imm{15-12};
+ let Inst{26} = imm{11};
+ let Inst{14-12} = imm{10-8};
+ let Inst{7-0} = imm{7-0};
+ let DecoderMethod = "DecodeT2MOVTWInstruction";
+}
+
+def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+ Sched<[WriteALU]>, Requires<[IsThumb, HasV8MBaseline]>;
+} // Constraints
+
+// OR-ing with 0xffff0000 sets the whole upper half, which is a movt of 0xffff.
+def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
+
+//===----------------------------------------------------------------------===//
+// Extend Instructions.
+//
+
+// Sign extenders
+// Plain extends match sext_inreg of i8 / i16; the "extend and add" forms
+// (sxtab, sxtah) match an add whose right operand is the sext_inreg.
+// t2SXTB16 and t2SXTAB16 are instantiated from classes that take no
+// pattern fragment here.
+
+def t2SXTB : T2I_ext_rrot<0b100, "sxtb",
+ UnOpFrag<(sext_inreg node:$Src, i8)>>;
+def t2SXTH : T2I_ext_rrot<0b000, "sxth",
+ UnOpFrag<(sext_inreg node:$Src, i16)>>;
+def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
+
+def t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
+ BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+def t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
+ BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
+
+// A simple right-shift can also be used in most cases (the exception is the
+// SXTH operations with a rotate of 24: there the non-contiguous bits are
+// relevant).
+// The last two patterns cover that rotate-by-24 case, once written as rotr
+// and once as the equivalent (srl 24) | (shl 8); both select rotation 3.
+def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+ (srl rGPR:$Rm, rot_imm:$rot), i8)),
+ (t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+ (srl rGPR:$Rm, imm8_or_16:$rot), i16)),
+ (t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+ (rotr rGPR:$Rm, (i32 24)), i16)),
+ (t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
+def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+ (or (srl rGPR:$Rm, (i32 24)),
+ (shl rGPR:$Rm, (i32 8))), i16)),
+ (t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
+
+// Zero extenders
+// Zero extension is matched as an AND with a constant mask. Everything in
+// this group gets AddedComplexity = 16.
+
+let AddedComplexity = 16 in {
+def t2UXTB : T2I_ext_rrot<0b101, "uxtb",
+ UnOpFrag<(and node:$Src, 0x000000FF)>>;
+def t2UXTH : T2I_ext_rrot<0b001, "uxth",
+ UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
+ UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
+// The transformation should probably be done as a combiner action
+// instead so we can include a check for masking back in the upper
+// eight bits of the source into the lower eight bits of the result.
+//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
+// (t2UXTB16 rGPR:$Src, 3)>,
+// Requires<[HasT2ExtractPack, IsThumb2]>;
+// A logical shift right by 8 followed by the 0xFF00FF mask selects uxtb16
+// with rotation operand 1.
+def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
+ (t2UXTB16 rGPR:$Src, 1)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+
+def t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
+ BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
+ BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
+
+// Shift-then-mask forms of the extend-and-add instructions.
+def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot),
+ 0xFF)),
+ (t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot),
+ 0xFFFF)),
+ (t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions.
+//
+
+// Plain add / sub. Only t2ADD is covered by "let isAdd = 1 in" (no braces,
+// so it applies to the single following defm). The trailing 1 passed to the
+// add-style defms is not passed to the sub-style ones.
+// NOTE(review): that trailing argument is presumably the "commutable" flag
+// -- confirm against the multiclass definitions.
+let isAdd = 1 in
+defm t2ADD : T2I_bin_ii12rs<0b000, "add", add, 1>;
+defm t2SUB : T2I_bin_ii12rs<0b101, "sub", sub>;
+
+// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
+//
+// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMaddc, 1>;
+defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMsubc>;
+
+// Add / subtract with carry; both request the post-ISel hook.
+let hasPostISelHook = 1 in {
+defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>;
+defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>;
+}
+
+// RSB
+defm t2RSB : T2I_rbin_irs <0b1110, "rsb", sub>;
+
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
+defm t2RSBS : T2I_rbin_s_is <ARMsubc>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
+// The AddedComplexity gives the first variant priority over the others,
+// since it can be shrunk to a 16-bit wide encoding while the others cannot.
+// Each "let AddedComplexity = 1 in" below has no braces and therefore
+// applies only to the single pattern that immediately follows it.
+let AddedComplexity = 1 in
+def : T2Pat<(add GPR:$src, imm1_255_neg:$imm),
+ (t2SUBri GPR:$src, imm1_255_neg:$imm)>;
+def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm),
+ (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm),
+ (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>;
+// Fallback: materialize the negated 16-bit immediate with movw, then
+// subtract it as a register.
+def : T2Pat<(add GPR:$src, imm0_65535_neg:$imm),
+ (t2SUBrr GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+
+// Same idea for the carry-producing add (ARMaddc -> t2SUBS*).
+let AddedComplexity = 1 in
+def : T2Pat<(ARMaddc rGPR:$src, imm1_255_neg:$imm),
+ (t2SUBSri rGPR:$src, imm1_255_neg:$imm)>;
+def : T2Pat<(ARMaddc rGPR:$src, t2_so_imm_neg:$imm),
+ (t2SUBSri rGPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(ARMaddc rGPR:$src, imm0_65535_neg:$imm),
+ (t2SUBSrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
+// NOTE(review): the last pattern below is guarded by imm0_65535_neg but
+// transforms the immediate with imm_not_XFORM, while the two patterns before
+// it use *_not predicates -- confirm the predicate/transform pairing.
+let AddedComplexity = 1 in
+def : T2Pat<(ARMadde rGPR:$src, imm0_255_not:$imm, CPSR),
+ (t2SBCri rGPR:$src, imm0_255_not:$imm)>;
+def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR),
+ (t2SBCri rGPR:$src, t2_so_imm_not:$imm)>;
+def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR),
+ (t2SBCrr rGPR:$src, (t2MOVi16 (imm_not_XFORM imm:$imm)))>;
+
+// Select Bytes -- for disassembly only
+// Three-register instruction with an empty pattern list and no itinerary;
+// requires IsThumb2 and HasDSP. Only the fixed opcode bits are set here; the
+// register fields are not assigned in this def.
+
+def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>,
+ Requires<[IsThumb2, HasDSP]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-24} = 0b010;
+ let Inst{23} = 0b1;
+ let Inst{22-20} = 0b010;
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 0b1;
+ let Inst{6-4} = 0b000;
+}
+
+// A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned)
+// and miscellaneous operations. The pattern defaults to empty (disassembly
+// only), but an instantiation may supply one.
+//
+// op22_20 - value placed in Inst{22-20}.
+// op7_4 - value placed in Inst{7-4}.
+// opc - assembly mnemonic.
+// pat - selection pattern list; empty by default.
+// iops - input operand list; defaults to ($Rn, $Rm).
+// asm - operand assembly string; defaults to "$Rd, $Rn, $Rm".
+// Requires IsThumb2 and HasDSP.
+class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc,
+ list<dag> pat = [/* For disassembly only; pattern left blank */],
+ dag iops = (ins rGPR:$Rn, rGPR:$Rm),
+ string asm = "\t$Rd, $Rn, $Rm">
+ : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>,
+ Requires<[IsThumb2, HasDSP]> {
+ // Register operand fields.
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+
+ // Encoding, listed from the most significant end.
+ // Inst{31-27} = 0b11111 and Inst{26-23} = 0b0101, written as one field.
+ let Inst{31-23} = 0b111110101;
+ let Inst{22-20} = op22_20;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-8} = Rd;
+ let Inst{7-4} = op7_4;
+ let Inst{3-0} = Rm;
+}
+
+// Saturating add/subtract -- for disassembly only, except t2QADD and t2QSUB,
+// which carry int_arm_qadd / int_arm_qsub selection patterns.
+// qadd, qdadd, qdsub and qsub override the default operand list and print
+// their sources as "$Rd, $Rm, $Rn"; the others use the T2I_pam defaults.
+
+def t2QADD : T2I_pam<0b000, 0b1000, "qadd",
+ [(set rGPR:$Rd, (int_arm_qadd rGPR:$Rn, rGPR:$Rm))],
+ (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QADD16 : T2I_pam<0b001, 0b0001, "qadd16">;
+def t2QADD8 : T2I_pam<0b000, 0b0001, "qadd8">;
+def t2QASX : T2I_pam<0b010, 0b0001, "qasx">;
+def t2QDADD : T2I_pam<0b000, 0b1001, "qdadd", [],
+ (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QDSUB : T2I_pam<0b000, 0b1011, "qdsub", [],
+ (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QSAX : T2I_pam<0b110, 0b0001, "qsax">;
+def t2QSUB : T2I_pam<0b000, 0b1010, "qsub",
+ [(set rGPR:$Rd, (int_arm_qsub rGPR:$Rn, rGPR:$Rm))],
+ (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QSUB16 : T2I_pam<0b101, 0b0001, "qsub16">;
+def t2QSUB8 : T2I_pam<0b100, 0b0001, "qsub8">;
+def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">;
+def t2UQADD8 : T2I_pam<0b000, 0b0101, "uqadd8">;
+def t2UQASX : T2I_pam<0b010, 0b0101, "uqasx">;
+def t2UQSAX : T2I_pam<0b110, 0b0101, "uqsax">;
+def t2UQSUB16 : T2I_pam<0b101, 0b0101, "uqsub16">;
+def t2UQSUB8 : T2I_pam<0b100, 0b0101, "uqsub8">;
+
+// Signed/Unsigned add/subtract -- for disassembly only
+// (op7_4 = 0b0000 for the signed forms, 0b0100 for the unsigned forms.)
+
+def t2SASX : T2I_pam<0b010, 0b0000, "sasx">;
+def t2SADD16 : T2I_pam<0b001, 0b0000, "sadd16">;
+def t2SADD8 : T2I_pam<0b000, 0b0000, "sadd8">;
+def t2SSAX : T2I_pam<0b110, 0b0000, "ssax">;
+def t2SSUB16 : T2I_pam<0b101, 0b0000, "ssub16">;
+def t2SSUB8 : T2I_pam<0b100, 0b0000, "ssub8">;
+def t2UASX : T2I_pam<0b010, 0b0100, "uasx">;
+def t2UADD16 : T2I_pam<0b001, 0b0100, "uadd16">;
+def t2UADD8 : T2I_pam<0b000, 0b0100, "uadd8">;
+def t2USAX : T2I_pam<0b110, 0b0100, "usax">;
+def t2USUB16 : T2I_pam<0b101, 0b0100, "usub16">;
+def t2USUB8 : T2I_pam<0b100, 0b0100, "usub8">;
+
+// Signed/Unsigned halving add/subtract -- for disassembly only
+// (op7_4 = 0b0010 for the signed forms, 0b0110 for the unsigned forms.)
+
+def t2SHASX : T2I_pam<0b010, 0b0010, "shasx">;
+def t2SHADD16 : T2I_pam<0b001, 0b0010, "shadd16">;
+def t2SHADD8 : T2I_pam<0b000, 0b0010, "shadd8">;
+def t2SHSAX : T2I_pam<0b110, 0b0010, "shsax">;
+def t2SHSUB16 : T2I_pam<0b101, 0b0010, "shsub16">;
+def t2SHSUB8 : T2I_pam<0b100, 0b0010, "shsub8">;
+def t2UHASX : T2I_pam<0b010, 0b0110, "uhasx">;
+def t2UHADD16 : T2I_pam<0b001, 0b0110, "uhadd16">;
+def t2UHADD8 : T2I_pam<0b000, 0b0110, "uhadd8">;
+def t2UHSAX : T2I_pam<0b110, 0b0110, "uhsax">;
+def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">;
+def t2UHSUB8 : T2I_pam<0b100, 0b0110, "uhsub8">;
+
+// Helper class for disassembly only
+// A6.3.16 & A6.3.17
+// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions.
+//
+// Three-register flavour: fixes the opcode bits shared by this group.
+// long - value placed in Inst{23}.
+// op22_20 - value placed in Inst{22-20}.
+// op7_4 - value placed in Inst{7-4}.
+// The remaining parameters are forwarded unchanged to T2ThreeReg.
+class T2ThreeReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+ dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
+ // Variable opcode fields supplied by the instantiation.
+ let Inst{7-4} = op7_4;
+ let Inst{22-20} = op22_20;
+ let Inst{23} = long;
+ // Fixed prefix: Inst{31-27} = 0b11111 and Inst{26-24} = 0b011.
+ let Inst{31-24} = 0b11111011;
+}
+
+// T2FourReg_mac - Four-register counterpart of T2ThreeReg_mac, layered on
+// T2FourReg. It fixes Inst{31-27} = 0b11111 and Inst{26-24} = 0b011, and fills
+// Inst{23} from 'long', Inst{22-20} from 'op22_20' and Inst{7-4} from 'op7_4'.
+// The remaining arguments are forwarded unchanged to T2FourReg.
+class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+ dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : T2FourReg<oops, iops, itin, opc, asm, pattern> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-24} = 0b011;
+ let Inst{23} = long;
+ let Inst{22-20} = op22_20;
+ let Inst{7-4} = op7_4;
+}
+
+// Unsigned Sum of Absolute Differences [and Accumulate].
+def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+ (ins rGPR:$Rn, rGPR:$Rm),
+ NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>,
+ Requires<[IsThumb2, HasDSP]> {
+ let Inst{15-12} = 0b1111;
+}
+def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary,
+ "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>,
+ Requires<[IsThumb2, HasDSP]>;
+
+// Signed/Unsigned saturate.
+// T2SatI - Common operand encoding for the saturate instructions defined from
+// this class (t2SSAT, t2SSAT16, t2USAT, t2USAT16). Rd is placed in Inst{11-8},
+// Rn in Inst{19-16} and the saturation immediate in Inst{4-0}. The opcode bits
+// are supplied by each def.
+class T2SatI<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<5> sat_imm;
+ bits<7> sh;
+
+ let Inst{11-8} = Rd;
+ let Inst{19-16} = Rn;
+ let Inst{4-0} = sat_imm;
+ // Only sh{5-0} are encoded: sh{5} goes to Inst{21} and the low five bits are
+ // split across Inst{14-12} and Inst{7-6}. sh{6} is declared but not placed
+ // into Inst by this class.
+ let Inst{21} = sh{5};
+ let Inst{14-12} = sh{4-2};
+ let Inst{7-6} = sh{1-0};
+}
+
+def t2SSAT: T2SatI<
+ (outs rGPR:$Rd),
+ (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+ NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+ Requires<[IsThumb2]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25-22} = 0b1100;
+ let Inst{20} = 0;
+ let Inst{15} = 0;
+ let Inst{5} = 0;
+}
+
+// t2SSAT16 - "ssat16 Rd, #sat_imm, Rn". Takes no shift operand, so the bits
+// that T2SatI fills from 'sh' are overridden with constants below.
+def t2SSAT16: T2SatI<
+ (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
+ "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
+ Requires<[IsThumb2, HasDSP]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25-22} = 0b1100;
+ let Inst{20} = 0;
+ let Inst{15} = 0;
+ let Inst{21} = 1; // sh = '1'
+ let Inst{14-12} = 0b000; // imm3 = '000'
+ let Inst{7-6} = 0b00; // imm2 = '00'
+ // Inst{4} is also assigned from sat_imm{4} by T2SatI; this later 'let' takes
+ // precedence, so only the low four sat_imm bits reach the encoding.
+ let Inst{5-4} = 0b00;
+}
+
+def t2USAT: T2SatI<
+ (outs rGPR:$Rd),
+ (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+ NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+ Requires<[IsThumb2]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25-22} = 0b1110;
+ let Inst{20} = 0;
+ let Inst{15} = 0;
+}
+
+// t2USAT16 - "usat16 Rd, #sat_imm, Rn". Takes no shift operand, so the bits
+// that T2SatI fills from 'sh' are overridden with constants below.
+def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
+ NoItinerary,
+ "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+ Requires<[IsThumb2, HasDSP]> {
+ let Inst{31-22} = 0b1111001110;
+ let Inst{20} = 0;
+ let Inst{15} = 0;
+ let Inst{21} = 1; // sh = '1'
+ let Inst{14-12} = 0b000; // imm3 = '000'
+ let Inst{7-6} = 0b00; // imm2 = '00'
+ // Inst{4} is also assigned from sat_imm{4} by T2SatI; this later 'let' takes
+ // precedence, so only the low four sat_imm bits reach the encoding.
+ let Inst{5-4} = 0b00;
+}
+
+def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
+def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>;
+def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+ (t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+
+//===----------------------------------------------------------------------===//
+// Shift and rotate Instructions.
+//
+
+defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31, shl>;
+defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr, srl>;
+defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr, sra>;
+defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, rotr>;
+
+// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
+def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
+ (t2RORrr rGPR:$lhs, rGPR:$rhs)>;
+
+let Uses = [CPSR] in {
+def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+ "rrx", "\t$Rd, $Rm",
+ [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]>, Sched<[WriteALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = 0b0010;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{14-12} = 0b000;
+ let Inst{7-4} = 0b0011;
+}
+}
+
+let isCodeGenOnly = 1, Defs = [CPSR] in {
+def t2MOVsrl_flag : T2TwoRegShiftImm<
+ (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+ "lsrs", ".w\t$Rd, $Rm, #1",
+ [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]>,
+ Sched<[WriteALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = 0b0010;
+ let Inst{20} = 1; // The S bit.
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{5-4} = 0b01; // Shift type.
+ // Shift amount = Inst{14-12:7-6} = 1.
+ let Inst{14-12} = 0b000;
+ let Inst{7-6} = 0b01;
+}
+def t2MOVsra_flag : T2TwoRegShiftImm<
+ (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+ "asrs", ".w\t$Rd, $Rm, #1",
+ [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]>,
+ Sched<[WriteALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = 0b0010;
+ let Inst{20} = 1; // The S bit.
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{5-4} = 0b10; // Shift type.
+ // Shift amount = Inst{14-12:7-6} = 1.
+ let Inst{14-12} = 0b000;
+ let Inst{7-6} = 0b01;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Bitwise Instructions.
+//
+
+defm t2AND : T2I_bin_w_irs<0b0000, "and",
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi, and, 1>;
+defm t2ORR : T2I_bin_w_irs<0b0010, "orr",
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi, or, 1>;
+defm t2EOR : T2I_bin_w_irs<0b0100, "eor",
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi, xor, 1>;
+
+defm t2BIC : T2I_bin_w_irs<0b0001, "bic",
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+// T2BitFI - Operand encoding shared by the bit-field instructions below.
+// Rd is placed in Inst{11-8}, the 5-bit 'msb' value in Inst{4-0}, and the
+// 5-bit 'lsb' value is split: lsb{4-2} into Inst{14-12}, lsb{1-0} into
+// Inst{7-6}.
+class T2BitFI<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd;
+ bits<5> msb;
+ bits<5> lsb;
+
+ let Inst{11-8} = Rd;
+ let Inst{4-0} = msb{4-0};
+ let Inst{14-12} = lsb{4-2};
+ let Inst{7-6} = lsb{1-0};
+}
+
+// T2TwoRegBitFI - T2BitFI plus a source register Rn encoded in Inst{19-16}.
+class T2TwoRegBitFI<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2BitFI<oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rn;
+
+ let Inst{19-16} = Rn;
+}
+
+// t2BFC - "bfc Rd, #imm". Selected for (and $src, bf_inv_mask_imm); $src is
+// tied to $Rd, so the instruction updates the register in place.
+let Constraints = "$src = $Rd" in
+def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
+ IIC_iUNAsi, "bfc", "\t$Rd, $imm",
+ [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{26} = 0; // should be 0.
+ let Inst{25} = 1;
+ let Inst{24-20} = 0b10110;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15} = 0;
+ let Inst{5} = 0; // should be 0.
+
+ // The single 10-bit operand carries both T2BitFI fields: the high five bits
+ // feed 'msb' and the low five bits feed 'lsb'.
+ bits<10> imm;
+ let msb{4-0} = imm{9-5};
+ let lsb{4-0} = imm{4-0};
+}
+
+def t2SBFX: T2TwoRegBitFI<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
+ IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 1;
+ let Inst{24-20} = 0b10100;
+ let Inst{15} = 0;
+}
+
+def t2UBFX: T2TwoRegBitFI<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
+ IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 1;
+ let Inst{24-20} = 0b11100;
+ let Inst{15} = 0;
+}
+
+// A8.8.247 UDF - Undefined (Encoding T2)
+def t2UDF : T2XI<(outs), (ins imm0_65535:$imm16), IIC_Br, "udf.w\t$imm16",
+ [(int_arm_undefined imm0_65535:$imm16)]> {
+ bits<16> imm16;
+ let Inst{31-29} = 0b111;
+ let Inst{28-27} = 0b10;
+ let Inst{26-20} = 0b1111111;
+ let Inst{19-16} = imm16{15-12};
+ let Inst{15} = 0b1;
+ let Inst{14-12} = 0b010;
+ let Inst{11-0} = imm16{11-0};
+}
+
+// A8.6.18 BFI - Bitfield insert (Encoding T1)
+// $src is tied to $Rd: the destination register is also read as the value
+// passed as the first ARMbfi operand.
+let Constraints = "$src = $Rd" in {
+ def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd),
+ (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm),
+ IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm",
+ [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn,
+ bf_inv_mask_imm:$imm))]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{26} = 0; // should be 0.
+ let Inst{25} = 1;
+ let Inst{24-20} = 0b10110;
+ let Inst{15} = 0;
+ let Inst{5} = 0; // should be 0.
+
+ // The single 10-bit operand carries both T2BitFI fields: the high five bits
+ // feed 'msb' and the low five bits feed 'lsb'.
+ bits<10> imm;
+ let msb{4-0} = imm{9-5};
+ let lsb{4-0} = imm{4-0};
+ }
+}
+
+defm t2ORN : T2I_bin_irs<0b0011, "orn",
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+ BinOpFrag<(or node:$LHS, (not node:$RHS))>, 0, "">;
+
+/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// unary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_un_irs<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+ PatFrag opnode,
+ bit Cheap = 0, bit ReMat = 0, bit MoveImm = 0> {
+ // shifted imm
+ def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
+ opc, "\t$Rd, $imm",
+ [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]>, Sched<[WriteALU]> {
+ let isAsCheapAsAMove = Cheap;
+ let isReMaterializable = ReMat;
+ let isMoveImm = MoveImm;
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24-21} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15} = 0;
+ }
+ // register
+ def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
+ opc, ".w\t$Rd, $Rm",
+ [(set rGPR:$Rd, (opnode rGPR:$Rm))]>, Sched<[WriteALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{14-12} = 0b000; // imm3
+ let Inst{7-6} = 0b00; // imm2
+ let Inst{5-4} = 0b00; // type
+ }
+ // shifted register
+ def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
+ opc, ".w\t$Rd, $ShiftedRm",
+ [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]>,
+ Sched<[WriteALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ }
+}
+
+// Prefer over t2EORri ra, rb, -1 because mvn has a 16-bit version
+let AddedComplexity = 1 in
+defm t2MVN : T2I_un_irs <0b0011, "mvn",
+ IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
+ not, 1, 1, 1>;
+
+let AddedComplexity = 1 in
+def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm),
+ (t2BICri rGPR:$src, t2_so_imm_not:$imm)>;
+
+// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise
+def top16Zero: PatLeaf<(i32 rGPR:$src), [{
+ return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16));
+ }]>;
+
+// t2_so_imm_notSext is needed instead of t2_so_imm_not, as the value of imm
+// will match the extended, not the original bitWidth for $src.
+def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm),
+ (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>;
+
+
+// FIXME: Disable this pattern on Darwin to work around an assembler bug.
+def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm),
+ (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>,
+ Requires<[IsThumb2]>;
+
+def : T2Pat<(t2_so_imm_not:$src),
+ (t2MVNi t2_so_imm_not:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Multiply Instructions.
+//
+let isCommutable = 1 in
+def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+ "mul", "\t$Rd, $Rn, $Rm",
+ [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0110;
+ let Inst{22-20} = 0b000;
+ let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+ let Inst{7-4} = 0b0000; // Multiply
+}
+
+// T2FourRegMLA - Four-register format (Rd, Rn, Rm, Ra) shared by t2MLA and
+// t2MLS. 'op7_4' fills Inst{7-4} and distinguishes the two; all other opcode
+// bits are fixed here. Gated on IsThumb2 and UseMulOps.
+class T2FourRegMLA<bits<4> op7_4, string opc, list<dag> pattern>
+ : T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+ opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
+ Requires<[IsThumb2, UseMulOps]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0110;
+ let Inst{22-20} = 0b000;
+ let Inst{7-4} = op7_4;
+}
+
+def t2MLA : T2FourRegMLA<0b0000, "mla",
+ [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm),
+ rGPR:$Ra))]>;
+def t2MLS: T2FourRegMLA<0b0001, "mls",
+ [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn,
+ rGPR:$Rm)))]>;
+
+// Extra precision multiplies with low / high results
+let hasSideEffects = 0 in {
+let isCommutable = 1 in {
+def t2SMULL : T2MulLong<0b000, 0b0000, "smull", []>;
+def t2UMULL : T2MulLong<0b010, 0b0000, "umull", []>;
+} // isCommutable
+
+// Multiply + accumulate
+def t2SMLAL : T2MlaLong<0b100, 0b0000, "smlal">;
+def t2UMLAL : T2MlaLong<0b110, 0b0000, "umlal">;
+def t2UMAAL : T2MlaLong<0b110, 0b0110, "umaal">, Requires<[IsThumb2, HasDSP]>;
+} // hasSideEffects
+
+// Rounding variants of the below included for disassembly only
+
+// Most significant word multiply
+class T2SMMUL<bits<4> op7_4, string opc, list<dag> pattern>
+ : T2ThreeReg<(outs rGPR:$Rd),
+ (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+ opc, "\t$Rd, $Rn, $Rm", pattern>,
+ Requires<[IsThumb2, HasDSP]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0110;
+ let Inst{22-20} = 0b101;
+ let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+ let Inst{7-4} = op7_4;
+}
+def t2SMMUL : T2SMMUL<0b0000, "smmul", [(set rGPR:$Rd, (mulhs rGPR:$Rn,
+ rGPR:$Rm))]>;
+def t2SMMULR : T2SMMUL<0b0001, "smmulr", []>;
+
+class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
+ list<dag> pattern>
+ : T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+ opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0110;
+ let Inst{22-20} = op22_20;
+ let Inst{7-4} = op7_4;
+}
+
+def t2SMMLA : T2FourRegSMMLA<0b101, 0b0000, "smmla",
+ [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>;
+def t2SMMLAR: T2FourRegSMMLA<0b101, 0b0001, "smmlar", []>;
+def t2SMMLS: T2FourRegSMMLA<0b110, 0b0000, "smmls", []>;
+def t2SMMLSR: T2FourRegSMMLA<0b110, 0b0001, "smmlsr", []>;
+
+// T2ThreeRegSMUL - Three-register format for the SMUL<x><y> / SMULW<y> defs
+// below. 'op22_20' fills Inst{22-20} and 'op5_4' fills Inst{5-4}; together
+// they select the variant. Inst{15-12} is fixed to 0b1111 (no accumulate
+// register). Gated on IsThumb2 and HasDSP.
+class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
+ list<dag> pattern>
+ : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, opc,
+ "\t$Rd, $Rn, $Rm", pattern>,
+ Requires<[IsThumb2, HasDSP]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0110;
+ let Inst{22-20} = op22_20;
+ let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+ let Inst{7-6} = 0b00;
+ let Inst{5-4} = op5_4;
+}
+
+def t2SMULBB : T2ThreeRegSMUL<0b001, 0b00, "smulbb",
+ [(set rGPR:$Rd, (mul (sext_inreg rGPR:$Rn, i16),
+ (sext_inreg rGPR:$Rm, i16)))]>;
+def t2SMULBT : T2ThreeRegSMUL<0b001, 0b01, "smulbt",
+ [(set rGPR:$Rd, (mul (sext_inreg rGPR:$Rn, i16),
+ (sra rGPR:$Rm, (i32 16))))]>;
+def t2SMULTB : T2ThreeRegSMUL<0b001, 0b10, "smultb",
+ [(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
+ (sext_inreg rGPR:$Rm, i16)))]>;
+def t2SMULTT : T2ThreeRegSMUL<0b001, 0b11, "smultt",
+ [(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
+ (sra rGPR:$Rm, (i32 16))))]>;
+def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb", []>;
+def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt", []>;
+
+def : Thumb2DSPPat<(mul sext_16_node:$Rm, sext_16_node:$Rn),
+ (t2SMULBB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(mul sext_16_node:$Rn, (sra rGPR:$Rm, (i32 16))),
+ (t2SMULBT rGPR:$Rn, rGPR:$Rm)>;
+def : Thumb2DSPPat<(mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm),
+ (t2SMULTB rGPR:$Rn, rGPR:$Rm)>;
+
+// T2FourRegSMLA - Four-register (accumulating) counterpart of T2ThreeRegSMUL,
+// used by the SMLA<x><y> / SMLAW<y> defs below. 'op22_20' fills Inst{22-20}
+// and 'op5_4' fills Inst{5-4}. Gated on IsThumb2, HasDSP and UseMulOps.
+class T2FourRegSMLA<bits<3> op22_20, bits<2> op5_4, string opc,
+ list<dag> pattern>
+ : T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMUL16,
+ opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0110;
+ let Inst{22-20} = op22_20;
+ let Inst{7-6} = 0b00;
+ let Inst{5-4} = op5_4;
+}
+
+def t2SMLABB : T2FourRegSMLA<0b001, 0b00, "smlabb",
+ [(set rGPR:$Rd, (add rGPR:$Ra,
+ (mul (sext_inreg rGPR:$Rn, i16),
+ (sext_inreg rGPR:$Rm, i16))))]>;
+def t2SMLABT : T2FourRegSMLA<0b001, 0b01, "smlabt",
+ [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sext_inreg rGPR:$Rn, i16),
+ (sra rGPR:$Rm, (i32 16)))))]>;
+def t2SMLATB : T2FourRegSMLA<0b001, 0b10, "smlatb",
+ [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
+ (sext_inreg rGPR:$Rm, i16))))]>;
+def t2SMLATT : T2FourRegSMLA<0b001, 0b11, "smlatt",
+ [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
+ (sra rGPR:$Rm, (i32 16)))))]>;
+def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb", []>;
+def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", []>;
+
+def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, sext_16_node:$Rm)),
+ (t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
+def : Thumb2DSPMulPat<(add rGPR:$Ra,
+ (mul sext_16_node:$Rn, (sra rGPR:$Rm, (i32 16)))),
+ (t2SMLABT rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
+def : Thumb2DSPMulPat<(add rGPR:$Ra,
+ (mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm)),
+ (t2SMLATB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
+
+// T2SMLAL - Format for the SMLAL<x><y> defs below, built on T2FourReg_mac with
+// the 'long' bit (Inst{23}) set. It has two outputs ($Ra, $Rd) and two inputs
+// ($Rn, $Rm). 'op22_20' and 'op7_4' are passed through to T2FourReg_mac.
+// 'pattern' is forwarded to the base class so that a selection pattern given
+// by a def is not silently discarded; every current user passes [].
+class T2SMLAL<bits<3> op22_20, bits<4> op7_4, string opc, list<dag> pattern>
+ : T2FourReg_mac<1, op22_20, op7_4,
+ (outs rGPR:$Ra, rGPR:$Rd),
+ (ins rGPR:$Rn, rGPR:$Rm),
+ IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", pattern>,
+ Requires<[IsThumb2, HasDSP]>;
+
+// Halfword multiply accumulate long: SMLAL<x><y>
+def t2SMLALBB : T2SMLAL<0b100, 0b1000, "smlalbb", []>;
+def t2SMLALBT : T2SMLAL<0b100, 0b1001, "smlalbt", []>;
+def t2SMLALTB : T2SMLAL<0b100, 0b1010, "smlaltb", []>;
+def t2SMLALTT : T2SMLAL<0b100, 0b1011, "smlaltt", []>;
+
+// T2DualHalfMul - Three-register format (Rd, Rn, Rm) for the SMUAD/SMUSD
+// family, built on T2ThreeReg_mac with the 'long' bit clear. Inst{15-12} is
+// fixed to 0b1111. No selection pattern is attached.
+class T2DualHalfMul<bits<3> op22_20, bits<4> op7_4, string opc>
+ : T2ThreeReg_mac<0, op22_20, op7_4,
+ (outs rGPR:$Rd),
+ (ins rGPR:$Rn, rGPR:$Rm),
+ IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm", []>,
+ Requires<[IsThumb2, HasDSP]> {
+ let Inst{15-12} = 0b1111;
+}
+
+// Dual halfword multiply: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+def t2SMUAD: T2DualHalfMul<0b010, 0b0000, "smuad">;
+def t2SMUADX: T2DualHalfMul<0b010, 0b0001, "smuadx">;
+def t2SMUSD: T2DualHalfMul<0b100, 0b0000, "smusd">;
+def t2SMUSDX: T2DualHalfMul<0b100, 0b0001, "smusdx">;
+
+// T2DualHalfMulAdd - Four-register format (Rd, Rn, Rm, Ra) for the SMLAD/SMLSD
+// family, built on T2FourReg_mac with the 'long' bit clear. No selection
+// pattern is attached.
+class T2DualHalfMulAdd<bits<3> op22_20, bits<4> op7_4, string opc>
+ : T2FourReg_mac<0, op22_20, op7_4,
+ (outs rGPR:$Rd),
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra),
+ IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm, $Ra", []>,
+ Requires<[IsThumb2, HasDSP]>;
+
+def t2SMLAD : T2DualHalfMulAdd<0b010, 0b0000, "smlad">;
+def t2SMLADX : T2DualHalfMulAdd<0b010, 0b0001, "smladx">;
+def t2SMLSD : T2DualHalfMulAdd<0b100, 0b0000, "smlsd">;
+def t2SMLSDX : T2DualHalfMulAdd<0b100, 0b0001, "smlsdx">;
+
+// T2DualHalfMulAddLong - Format for the SMLALD/SMLSLD family, built on
+// T2FourReg_mac with the 'long' bit set. It has two outputs ($Ra, $Rd) and two
+// inputs ($Rn, $Rm). No selection pattern is attached.
+class T2DualHalfMulAddLong<bits<3> op22_20, bits<4> op7_4, string opc>
+ : T2FourReg_mac<1, op22_20, op7_4,
+ (outs rGPR:$Ra, rGPR:$Rd),
+ (ins rGPR:$Rn, rGPR:$Rm),
+ IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>,
+ Requires<[IsThumb2, HasDSP]>;
+
+def t2SMLALD : T2DualHalfMulAddLong<0b100, 0b1100, "smlald">;
+def t2SMLALDX : T2DualHalfMulAddLong<0b100, 0b1101, "smlaldx">;
+def t2SMLSLD : T2DualHalfMulAddLong<0b101, 0b1100, "smlsld">;
+def t2SMLSLDX : T2DualHalfMulAddLong<0b101, 0b1101, "smlsldx">;
+
+//===----------------------------------------------------------------------===//
+// Division Instructions.
+// Signed and unsigned division on v7-M
+//
+def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
+ "sdiv", "\t$Rd, $Rn, $Rm",
+ [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
+ Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-21} = 0b011100;
+ let Inst{20} = 0b1;
+ let Inst{15-12} = 0b1111;
+ let Inst{7-4} = 0b1111;
+}
+
+def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
+ "udiv", "\t$Rd, $Rn, $Rm",
+ [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
+ Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-21} = 0b011101;
+ let Inst{20} = 0b1;
+ let Inst{15-12} = 0b1111;
+ let Inst{7-4} = 0b1111;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc. Arithmetic Instructions.
+//
+
+// T2I_misc - Format for the single-source operations defined below (clz, rbit,
+// rev, rev16, revsh). 'op1' fills Inst{21-20} and 'op2' fills Inst{5-4}; the
+// remaining opcode bits are fixed here.
+class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-22} = 0b01010;
+ let Inst{21-20} = op1;
+ let Inst{15-12} = 0b1111;
+ let Inst{7-6} = 0b10;
+ let Inst{5-4} = op2;
+ // The users of this class only take $Rm as input; the Rn field inherited
+ // from T2ThreeReg is set to the same register as Rm.
+ let Rn{3-0} = Rm;
+}
+
+def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+ "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>,
+ Sched<[WriteALU]>;
+
+def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+ "rbit", "\t$Rd, $Rm",
+ [(set rGPR:$Rd, (bitreverse rGPR:$Rm))]>,
+ Sched<[WriteALU]>;
+
+def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+ "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>,
+ Sched<[WriteALU]>;
+
+def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+ "rev16", ".w\t$Rd, $Rm",
+ [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>,
+ Sched<[WriteALU]>;
+
+def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+ "revsh", ".w\t$Rd, $Rm",
+ [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>,
+ Sched<[WriteALU]>;
+
+def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)),
+ (and (srl rGPR:$Rm, (i32 8)), 0xFF)),
+ (t2REVSH rGPR:$Rm)>;
+
+def t2PKHBT : T2ThreeReg<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_lsl_amt:$sh),
+ IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+ [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
+ (and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
+ 0xFFFF0000)))]>,
+ Requires<[HasT2ExtractPack, IsThumb2]>,
+ Sched<[WriteALUsi, ReadALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-20} = 0b01100;
+ let Inst{5} = 0; // BT form
+ let Inst{4} = 0;
+
+ bits<5> sh;
+ let Inst{14-12} = sh{4-2};
+ let Inst{7-6} = sh{1-0};
+}
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
+ (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
+ (t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
+// will match the pattern below.
+def t2PKHTB : T2ThreeReg<
+ (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_asr_amt:$sh),
+ IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+ [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
+ (and (sra rGPR:$Rm, pkh_asr_amt:$sh),
+ 0xFFFF)))]>,
+ Requires<[HasT2ExtractPack, IsThumb2]>,
+ Sched<[WriteALUsi, ReadALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-20} = 0b01100;
+ let Inst{5} = 1; // TB form
+ let Inst{4} = 0;
+
+ bits<5> sh;
+ let Inst{14-12} = sh{4-2};
+ let Inst{7-6} = sh{1-0};
+}
+
+// Alternate cases for PKHTB where identities eliminate some nodes. Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16:$sh)),
+ (t2PKHTB rGPR:$src1, rGPR:$src2, imm16:$sh)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (sra rGPR:$src2, imm16_31:$sh)),
+ (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
+ (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
+ (t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+
+//===----------------------------------------------------------------------===//
+// CRC32 Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W} 0x04C11DB7
+// + CRC32C{B,H,W} 0x1EDC6F41
+//
+
+// T2I_crc32 - Format for the CRC32 defs below. 'C' fills Inst{20} (set for the
+// crc32c* variants), 'sz' fills Inst{5-4} (0b00/0b01/0b10 for the b/h/w
+// variants), and 'suffix' is appended to "crc32" to form the mnemonic. The
+// selection pattern maps 'builtin' applied to ($Rn, $Rm) onto $Rd.
+// Gated on IsThumb2, HasV8 and HasCRC.
+class T2I_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+ : T2ThreeRegNoP<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary,
+ !strconcat("crc32", suffix, "\t$Rd, $Rn, $Rm"),
+ [(set rGPR:$Rd, (builtin rGPR:$Rn, rGPR:$Rm))]>,
+ Requires<[IsThumb2, HasV8, HasCRC]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-21} = 0b010110;
+ let Inst{20} = C;
+ let Inst{15-12} = 0b1111;
+ let Inst{7-6} = 0b10;
+ let Inst{5-4} = sz;
+}
+
+def t2CRC32B : T2I_crc32<0, 0b00, "b", int_arm_crc32b>;
+def t2CRC32CB : T2I_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def t2CRC32H : T2I_crc32<0, 0b01, "h", int_arm_crc32h>;
+def t2CRC32CH : T2I_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def t2CRC32W : T2I_crc32<0, 0b10, "w", int_arm_crc32w>;
+def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
+// Comparison Instructions...
+//
+defm t2CMP : T2I_cmp_irs<0b1101, "cmp",
+ IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, ARMcmp>;
+
+def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_imm:$imm),
+ (t2CMPri GPRnopc:$lhs, t2_so_imm:$imm)>;
+def : T2Pat<(ARMcmpZ GPRnopc:$lhs, rGPR:$rhs),
+ (t2CMPrr GPRnopc:$lhs, rGPR:$rhs)>;
+def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_reg:$rhs),
+ (t2CMPrs GPRnopc:$lhs, t2_so_reg:$rhs)>;
+
+let isCompare = 1, Defs = [CPSR] in {
+ // shifted imm
+ def t2CMNri : T2OneRegCmpImm<
+ (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi,
+ "cmn", ".w\t$Rn, $imm",
+ [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]>,
+ Sched<[WriteCMP, ReadALU]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24-21} = 0b1000;
+ let Inst{20} = 1; // The S bit.
+ let Inst{15} = 0;
+ let Inst{11-8} = 0b1111; // Rd
+ }
+ // register
+ def t2CMNzrr : T2TwoRegCmp<
+ (outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr,
+ "cmn", ".w\t$Rn, $Rm",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = 0b1000;
+ let Inst{20} = 1; // The S bit.
+ let Inst{14-12} = 0b000; // imm3
+ let Inst{11-8} = 0b1111; // Rd
+ let Inst{7-6} = 0b00; // imm2
+ let Inst{5-4} = 0b00; // type
+ }
+ // shifted register
+ def t2CMNzrs : T2OneRegCmpShiftedReg<
+ (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi,
+ "cmn", ".w\t$Rn, $ShiftedRm",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+ Sched<[WriteCMPsi, ReadALU, ReadALU]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = 0b1000;
+ let Inst{20} = 1; // The S bit.
+ let Inst{11-8} = 0b1111; // Rd
+ }
+}
+
+// Assembler aliases w/o the ".w" suffix.
+// No alias here for 'rr' version as not all instantiations of this multiclass
+// want one (CMP in particular, does not).
+def : t2InstAlias<"cmn${p} $Rn, $imm",
+ (t2CMNri GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+def : t2InstAlias<"cmn${p} $Rn, $shift",
+ (t2CMNzrs GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
+
+def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm),
+ (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>;
+
+def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm),
+ (t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>;
+
+defm t2TST : T2I_cmp_irs<0b0000, "tst",
+ IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+ BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>;
+defm t2TEQ : T2I_cmp_irs<0b0100, "teq",
+ IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+ BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
+
+// Conditional moves
+let hasSideEffects = 0 in {
+
+let isCommutable = 1, isSelect = 1 in
+def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$false, rGPR:$Rm, cmovpred:$p),
+ 4, IIC_iCMOVr,
+ [(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+let isMoveImm = 1 in
+def t2MOVCCi
+ : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
+ 4, IIC_iCMOVi,
+ [(set rGPR:$Rd, (ARMcmov rGPR:$false,t2_so_imm:$imm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+let isCodeGenOnly = 1 in {
+let isMoveImm = 1 in
+def t2MOVCCi16
+ : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+ 4, IIC_iCMOVi,
+ [(set rGPR:$Rd, (ARMcmov rGPR:$false, imm0_65535:$imm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+let isMoveImm = 1 in
+def t2MVNCCi
+ : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
+ 4, IIC_iCMOVi,
+ [(set rGPR:$Rd,
+ (ARMcmov rGPR:$false, t2_so_imm_not:$imm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+// MOVCCShPseudo - Pseudo-instruction template for a conditional move whose
+// second ARMcmov operand is $Rm shifted by an immediate. 'opnode' is the shift
+// or rotate node used in the pattern and 'ty' is the immediate operand type
+// it is matched with. $false is tied to $Rd.
+class MOVCCShPseudo<SDPatternOperator opnode, Operand ty>
+ : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$false, rGPR:$Rm, i32imm:$imm, cmovpred:$p),
+ 4, IIC_iCMOVsi,
+ [(set rGPR:$Rd, (ARMcmov rGPR:$false,
+ (opnode rGPR:$Rm, (i32 ty:$imm)),
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+def t2MOVCClsl : MOVCCShPseudo<shl, imm0_31>;
+def t2MOVCClsr : MOVCCShPseudo<srl, imm_sr>;
+def t2MOVCCasr : MOVCCShPseudo<sra, imm_sr>;
+def t2MOVCCror : MOVCCShPseudo<rotr, imm0_31>;
+
+let isMoveImm = 1 in
+def t2MOVCCi32imm
+ : t2PseudoInst<(outs rGPR:$dst),
+ (ins rGPR:$false, i32imm:$src, cmovpred:$p),
+ 8, IIC_iCMOVix2,
+ [(set rGPR:$dst, (ARMcmov rGPR:$false, imm:$src,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $dst">;
+} // isCodeGenOnly = 1
+
+} // hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+// memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+ "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
+ Requires<[IsThumb, HasDB]> {
+ bits<4> opt;
+ let Inst{31-4} = 0xf3bf8f5;
+ let Inst{3-0} = opt;
+}
+
+def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+ "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+ Requires<[IsThumb, HasDB]> {
+ bits<4> opt;
+ let Inst{31-4} = 0xf3bf8f4;
+ let Inst{3-0} = opt;
+}
+
+def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
+ "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
+ Requires<[IsThumb, HasDB]> {
+ bits<4> opt;
+ let Inst{31-4} = 0xf3bf8f6;
+ let Inst{3-0} = opt;
+}
+}
+
+// T2I_ldrex - Encoding helper for the load-exclusive defs below. 'opcod'
+// fills Inst{7-4} and 'rt2' (default 0b1111) fills Inst{11-8}; Inst{3-0} is
+// fixed to 0b1111. The 'addr' operand is encoded in Inst{19-16} and Rt in
+// Inst{15-12}. The remaining arguments are forwarded to Thumb2I.
+class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
+ InstrItinClass itin, string opc, string asm, string cstr,
+ list<dag> pattern, bits<4> rt2 = 0b1111>
+ : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-20} = 0b0001101;
+ let Inst{11-8} = rt2;
+ let Inst{7-4} = opcod;
+ let Inst{3-0} = 0b1111;
+
+ bits<4> addr;
+ bits<4> Rt;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt;
+}
+// T2I_strex - Encoding helper for the store-exclusive defs below. 'opcod'
+// fills Inst{7-4} and 'rt2' (default 0b1111) fills Inst{11-8}. Rd is encoded
+// in Inst{3-0}, the 'addr' operand in Inst{19-16} and Rt in Inst{15-12}. The
+// remaining arguments are forwarded to Thumb2I.
+class T2I_strex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
+ InstrItinClass itin, string opc, string asm, string cstr,
+ list<dag> pattern, bits<4> rt2 = 0b1111>
+ : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-20} = 0b0001100;
+ let Inst{11-8} = rt2;
+ let Inst{7-4} = opcod;
+
+ bits<4> Rd;
+ bits<4> addr;
+ bits<4> Rt;
+ let Inst{3-0} = Rd;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt;
+}
+
+// Exclusive-load instructions (ldrex*/ldaex* mnemonics). Every def in this
+// scope is marked mayLoad. The byte/halfword/doubleword forms derive from
+// T2I_ldrex with a 4-bit opcode argument; the word forms (t2LDREX, t2LDAEX)
+// derive from Thumb2I and spell out their encoding bits directly.
+let mayLoad = 1 in {
+// ldrexb: selected for ldrex_1 on an offset-less address.
+def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "ldrexb", "\t$Rt, $addr", "",
+ [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
+// ldrexh: selected for ldrex_2 on an offset-less address.
+def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "ldrexh", "\t$Rt, $addr", "",
+ [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
+def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
+ (ins rGPR:$Rt, addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "strexb", "\t$Rd, $Rt, $addr", "",
+ [(set rGPR:$Rd,
+ (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
+def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
+ (ins rGPR:$Rt, addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "strexh", "\t$Rd, $Rt, $addr", "",
+ [(set rGPR:$Rd,
+ (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
+
+def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+ t2addrmode_imm0_1020s4:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "strex", "\t$Rd, $Rt, $addr", "",
+ [(set rGPR:$Rd,
+ (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
+ bits<4> Rd;
+ bits<4> Rt;
+ bits<12> addr;
+ let Inst{31-27} = 0b11101;
+ let Inst{26-20} = 0b0000100;
+ let Inst{19-16} = addr{11-8};
+ let Inst{15-12} = Rt;
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = addr{7-0};
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd),
+ (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+ {?, ?, ?, ?}>,
+ Requires<[IsThumb2, IsNotMClass]> {
+ bits<4> Rt2;
+ let Inst{11-8} = Rt2;
+}
+def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
+ (ins rGPR:$Rt, addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "stlexb", "\t$Rd, $Rt, $addr", "",
+ [(set rGPR:$Rd,
+ (stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasAcquireRelease,
+ HasV7Clrex]>;
+
+def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
+ (ins rGPR:$Rt, addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "stlexh", "\t$Rd, $Rt, $addr", "",
+ [(set rGPR:$Rd,
+ (stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasAcquireRelease,
+ HasV7Clrex]>;
+
+def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+ addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "stlex", "\t$Rd, $Rt, $addr", "",
+ [(set rGPR:$Rd,
+ (stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> {
+ bits<4> Rd;
+ bits<4> Rt;
+ bits<4> addr;
+ let Inst{31-27} = 0b11101;
+ let Inst{26-20} = 0b0001100;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt;
+ let Inst{11-4} = 0b11111110;
+ let Inst{3-0} = Rd;
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd),
+ (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+ {?, ?, ?, ?}>, Requires<[IsThumb, HasAcquireRelease,
+ HasV7Clrex, IsNotMClass]> {
+ bits<4> Rt2;
+ let Inst{11-8} = Rt2;
+}
+}
+
+def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
+ Requires<[IsThumb, HasV7Clrex]> {
+ let Inst{31-16} = 0xf3bf;
+ let Inst{15-14} = 0b10;
+ let Inst{13} = 0;
+ let Inst{12} = 0;
+ let Inst{11-8} = 0b1111;
+ let Inst{7-4} = 0b0010;
+ let Inst{3-0} = 0b1111;
+}
+
+def : T2Pat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
+ (t2LDREXB addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
+def : T2Pat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
+ (t2LDREXH addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
+def : T2Pat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+ (t2STREXB GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
+def : T2Pat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+ (t2STREXH GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
+
+def : T2Pat<(and (ldaex_1 addr_offset_none:$addr), 0xff),
+ (t2LDAEXB addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+def : T2Pat<(and (ldaex_2 addr_offset_none:$addr), 0xffff),
+ (t2LDAEXH addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+def : T2Pat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+ (t2STLEXB GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+ (t2STLEXH GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+// eh_sjlj_setjmp() is an instruction sequence to store the return
+// address and save #0 in R0 for the non-longjmp case.
+// Since by its nature we may be coming from some other function to get
+// here, and we're using the stack frame for the containing function to
+// save/restore registers, we can't keep anything live in regs across
+// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+// when we get here from a longjmp(). We force everything out of registers
+// except for our own input by listing the relevant registers in Defs. By
+// doing so, we also cause the prologue/epilogue code to actively preserve
+// all of the callee-saved resgisters, which is exactly what we want.
+// $val is a scratch register for our use.
+let Defs =
+ [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR,
+ Q0, Q1, Q2, Q3, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
+ hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in {
+ def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+ AddrModeNone, 0, NoItinerary, "", "",
+ [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+ Requires<[IsThumb2, HasVFP2]>;
+}
+
+let Defs =
+ [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ],
+ hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in {
+ def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+ AddrModeNone, 0, NoItinerary, "", "",
+ [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+ Requires<[IsThumb2, NoVFP]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Control-Flow Instructions
+//
+
+// FIXME: remove when we have a way to marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+ hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+ reglist:$regs, variable_ops),
+ 4, IIC_iLoad_mBr, [],
+ (t2LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
+ RegConstraint<"$Rn = $wb">;
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+let isPredicable = 1 in
+def t2B : T2I<(outs), (ins thumb_br_target:$target), IIC_Br,
+ "b", ".w\t$target",
+ [(br bb:$target)]>, Sched<[WriteBr]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{15-14} = 0b10;
+ let Inst{12} = 1;
+
+ bits<24> target;
+ let Inst{26} = target{23};
+ let Inst{13} = target{22};
+ let Inst{11} = target{21};
+ let Inst{25-16} = target{20-11};
+ let Inst{10-0} = target{10-0};
+ let DecoderMethod = "DecodeT2BInstruction";
+ let AsmMatchConverter = "cvtThumbBranches";
+}
+
+let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
+
+// available in both v8-M.Baseline and Thumb2 targets
+def t2BR_JT : t2basePseudoInst<(outs),
+ (ins GPR:$target, GPR:$index, i32imm:$jt),
+ 0, IIC_Br,
+ [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt)]>,
+ Sched<[WriteBr]>;
+
+// FIXME: Add a case that can be predicated.
+def t2TBB_JT : t2PseudoInst<(outs),
+ (ins GPR:$base, GPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+ Sched<[WriteBr]>;
+
+def t2TBH_JT : t2PseudoInst<(outs),
+ (ins GPR:$base, GPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+ Sched<[WriteBr]>;
+
+def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
+ "tbb", "\t$addr", []>, Sched<[WriteBrTbl]> {
+ bits<4> Rn;
+ bits<4> Rm;
+ let Inst{31-20} = 0b111010001101;
+ let Inst{19-16} = Rn;
+ let Inst{15-5} = 0b11110000000;
+ let Inst{4} = 0; // B form
+ let Inst{3-0} = Rm;
+
+ let DecoderMethod = "DecodeThumbTableBranch";
+}
+
+def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
+ "tbh", "\t$addr", []>, Sched<[WriteBrTbl]> {
+ bits<4> Rn;
+ bits<4> Rm;
+ let Inst{31-20} = 0b111010001101;
+ let Inst{19-16} = Rn;
+ let Inst{15-5} = 0b11110000000;
+ let Inst{4} = 1; // H form
+ let Inst{3-0} = Rm;
+
+ let DecoderMethod = "DecodeThumbTableBranch";
+}
+} // isNotDuplicable, isIndirectBranch
+
+} // isBranch, isTerminator, isBarrier
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects ", "two operands. :(
+let isBranch = 1, isTerminator = 1 in
+def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
+ "b", ".w\t$target",
+ [/*(ARMbrcond bb:$target, imm:$cc)*/]>, Sched<[WriteBr]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{15-14} = 0b10;
+ let Inst{12} = 0;
+
+ bits<4> p;
+ let Inst{25-22} = p;
+
+ bits<21> target;
+ let Inst{26} = target{20};
+ let Inst{11} = target{19};
+ let Inst{13} = target{18};
+ let Inst{21-16} = target{17-12};
+ let Inst{10-0} = target{11-1};
+
+ let DecoderMethod = "DecodeThumb2BCCInstruction";
+ let AsmMatchConverter = "cvtThumbBranches";
+}
+
+// Tail calls. The MachO version of thumb tail calls uses a t2 branch, so
+// it goes here.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+ // IOS version.
+ let Uses = [SP] in
+ def tTAILJMPd: tPseudoExpand<(outs),
+ (ins thumb_br_target:$dst, pred:$p),
+ 4, IIC_Br, [],
+ (t2B thumb_br_target:$dst, pred:$p)>,
+ Requires<[IsThumb2, IsMachO]>, Sched<[WriteBr]>;
+}
+
+// IT block
+let Defs = [ITSTATE] in
+def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
+ AddrModeNone, 2, IIC_iALUx,
+ "it$mask\t$cc", "", []>,
+ ComplexDeprecationPredicate<"IT"> {
+ // 16-bit instruction.
+ let Inst{31-16} = 0x0000;
+ let Inst{15-8} = 0b10111111;
+
+ bits<4> cc;
+ bits<4> mask;
+ let Inst{7-4} = cc;
+ let Inst{3-0} = mask;
+
+ let DecoderMethod = "DecodeIT";
+}
+
+// Branch and Exchange Jazelle -- for disassembly only
+// Rm = Inst{19-16}
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
+def t2BXJ : T2I<(outs), (ins GPRnopc:$func), NoItinerary, "bxj", "\t$func", []>,
+ Sched<[WriteBr]>, Requires<[IsThumb2, IsNotMClass]> {
+ bits<4> func;
+ let Inst{31-27} = 0b11110;
+ let Inst{26} = 0;
+ let Inst{25-20} = 0b111100;
+ let Inst{19-16} = func;
+ let Inst{15-0} = 0b1000111100000000;
+}
+
+// Compare and branch on zero / non-zero
+let isBranch = 1, isTerminator = 1 in {
+ def tCBZ : T1I<(outs), (ins tGPR:$Rn, thumb_cb_target:$target), IIC_Br,
+ "cbz\t$Rn, $target", []>,
+ T1Misc<{0,0,?,1,?,?,?}>,
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteBr]> {
+ // A8.6.27
+ bits<6> target;
+ bits<3> Rn;
+ let Inst{9} = target{5};
+ let Inst{7-3} = target{4-0};
+ let Inst{2-0} = Rn;
+ }
+
+ def tCBNZ : T1I<(outs), (ins tGPR:$Rn, thumb_cb_target:$target), IIC_Br,
+ "cbnz\t$Rn, $target", []>,
+ T1Misc<{1,0,?,1,?,?,?}>,
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteBr]> {
+ // A8.6.27
+ bits<6> target;
+ bits<3> Rn;
+ let Inst{9} = target{5};
+ let Inst{7-3} = target{4-0};
+ let Inst{2-0} = Rn;
+ }
+}
+
+
+// Change Processor State is a system instruction.
+// FIXME: Since the asm parser has currently no clean way to handle optional
+// operands, create 3 versions of the same instruction. Once there's a clean
+// framework to represent optional operands, change this behavior.
+class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
+ !strconcat("cps", asm_op), []>,
+ Requires<[IsThumb2, IsNotMClass]> {
+ bits<2> imod;
+ bits<3> iflags;
+ bits<5> mode;
+ bit M;
+
+ let Inst{31-11} = 0b111100111010111110000;
+ let Inst{10-9} = imod;
+ let Inst{8} = M;
+ let Inst{7-5} = iflags;
+ let Inst{4-0} = mode;
+ let DecoderMethod = "DecodeT2CPSInstruction";
+}
+
+let M = 1 in
+ def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
+ "$imod\t$iflags, $mode">;
+let mode = 0, M = 0 in
+ def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
+ "$imod.w\t$iflags">;
+let imod = 0, iflags = 0, M = 1 in
+ def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">;
+
+def : t2InstAlias<"cps$imod.w $iflags, $mode",
+ (t2CPS3p imod_op:$imod, iflags_op:$iflags, i32imm:$mode), 0>;
+def : t2InstAlias<"cps.w $mode", (t2CPS1p imm0_31:$mode), 0>;
+
+// A6.3.4 Branches and miscellaneous control
+// Table A6-14 Change Processor State, and hint instructions
+def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",
+ [(int_arm_hint imm0_239:$imm)]> {
+ bits<8> imm;
+ let Inst{31-3} = 0b11110011101011111000000000000;
+ let Inst{7-0} = imm;
+}
+
+def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p), 0>;
+def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p), 1>;
+def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p), 1>;
+def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p), 1>;
+def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p), 1>;
+def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p), 1>;
+def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p), 1> {
+ let Predicates = [IsThumb2, HasV8];
+}
+def : t2InstAlias<"esb$p.w", (t2HINT 16, pred:$p), 1> {
+ let Predicates = [IsThumb2, HasRAS];
+}
+def : t2InstAlias<"esb$p", (t2HINT 16, pred:$p), 0> {
+ let Predicates = [IsThumb2, HasRAS];
+}
+
+def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt",
+ [(int_arm_dbg imm0_15:$opt)]> {
+ bits<4> opt;
+ let Inst{31-20} = 0b111100111010;
+ let Inst{19-16} = 0b1111;
+ let Inst{15-8} = 0b10000000;
+ let Inst{7-4} = 0b1111;
+ let Inst{3-0} = opt;
+}
+
+// Secure Monitor Call is a system instruction.
+// Option = Inst{19-16}
+let isCall = 1, Uses = [SP] in
+def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+ []>, Requires<[IsThumb2, HasTrustZone]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{26-20} = 0b1111111;
+ let Inst{15-12} = 0b1000;
+
+ bits<4> opt;
+ let Inst{19-16} = opt;
+}
+
+class T2DCPS<bits<2> opt, string opc>
+ : T2I<(outs), (ins), NoItinerary, opc, "", []>, Requires<[IsThumb2, HasV8]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{26-20} = 0b1111000;
+ let Inst{19-16} = 0b1111;
+ let Inst{15-12} = 0b1000;
+ let Inst{11-2} = 0b0000000000;
+ let Inst{1-0} = opt;
+}
+
+def t2DCPS1 : T2DCPS<0b01, "dcps1">;
+def t2DCPS2 : T2DCPS<0b10, "dcps2">;
+def t2DCPS3 : T2DCPS<0b11, "dcps3">;
+
+class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsThumb2,IsNotMClass]> {
+ bits<5> mode;
+ let Inst{31-25} = 0b1110100;
+ let Inst{24-23} = Op;
+ let Inst{22} = 0;
+ let Inst{21} = W;
+ let Inst{20-16} = 0b01101;
+ let Inst{15-5} = 0b11000000000;
+ let Inst{4-0} = mode{4-0};
+}
+
+// Store Return State is a system instruction.
+def t2SRSDB_UPD : T2SRS<0b00, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+ "srsdb", "\tsp!, $mode", []>;
+def t2SRSDB : T2SRS<0b00, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+ "srsdb","\tsp, $mode", []>;
+def t2SRSIA_UPD : T2SRS<0b11, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+ "srsia","\tsp!, $mode", []>;
+def t2SRSIA : T2SRS<0b11, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+ "srsia","\tsp, $mode", []>;
+
+
+def : t2InstAlias<"srsdb${p} $mode", (t2SRSDB imm0_31:$mode, pred:$p)>;
+def : t2InstAlias<"srsdb${p} $mode!", (t2SRSDB_UPD imm0_31:$mode, pred:$p)>;
+
+def : t2InstAlias<"srsia${p} $mode", (t2SRSIA imm0_31:$mode, pred:$p)>;
+def : t2InstAlias<"srsia${p} $mode!", (t2SRSIA_UPD imm0_31:$mode, pred:$p)>;
+
+// Return From Exception is a system instruction.
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
+class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsThumb2,IsNotMClass]> {
+ let Inst{31-20} = op31_20{11-0};
+
+ bits<4> Rn;
+ let Inst{19-16} = Rn;
+ let Inst{15-0} = 0xc000;
+}
+
+def t2RFEDBW : T2RFE<0b111010000011,
+ (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!",
+ [/* For disassembly only; pattern left blank */]>;
+def t2RFEDB : T2RFE<0b111010000001,
+ (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn",
+ [/* For disassembly only; pattern left blank */]>;
+def t2RFEIAW : T2RFE<0b111010011011,
+ (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!",
+ [/* For disassembly only; pattern left blank */]>;
+def t2RFEIA : T2RFE<0b111010011001,
+ (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn",
+ [/* For disassembly only; pattern left blank */]>;
+
+// B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction.
+// Exception return instruction is "subs pc, lr, #imm".
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
+def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
+ "subs", "\tpc, lr, $imm",
+ [(ARMintretflag imm0_255:$imm)]>,
+ Requires<[IsThumb2,IsNotMClass]> {
+ let Inst{31-8} = 0b111100111101111010001111;
+
+ bits<8> imm;
+ let Inst{7-0} = imm;
+}
+
+// Hypervisor Call is a system instruction.
+let isCall = 1 in {
+def t2HVC : T2XI <(outs), (ins imm0_65535:$imm16), IIC_Br, "hvc.w\t$imm16", []>,
+ Requires<[IsThumb2, HasVirtualization]>, Sched<[WriteBr]> {
+ bits<16> imm16;
+ let Inst{31-20} = 0b111101111110;
+ let Inst{19-16} = imm16{15-12};
+ let Inst{15-12} = 0b1000;
+ let Inst{11-0} = imm16{11-0};
+}
+}
+
+// Alias for HVC without the ".w" optional width specifier
+def : t2InstAlias<"hvc\t$imm16", (t2HVC imm0_65535:$imm16)>;
+
+// ERET - Return from exception in Hypervisor mode.
+// B9.3.3, B9.3.20: ERET is an alias for "SUBS PC, LR, #0" in an implementation that
+// includes virtualization extensions.
+def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p), 1>,
+ Requires<[IsThumb2, HasVirtualization]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+ [(set rGPR:$dst, (i32 imm:$src))]>,
+ Requires<[IsThumb, UseMovt]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if pic).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly the instructions.
+let isReMaterializable = 1 in {
+def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
+ IIC_iMOVix2addpc,
+ [(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
+
+}
+
+def : T2Pat<(ARMWrapperPIC tglobaltlsaddr :$dst),
+ (t2MOV_ga_pcrel tglobaltlsaddr:$dst)>,
+ Requires<[IsThumb2, UseMovt]>;
+def : T2Pat<(ARMWrapper tglobaltlsaddr:$dst),
+ (t2MOVi32imm tglobaltlsaddr:$dst)>,
+ Requires<[IsThumb2, UseMovt]>;
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
+def : T2Pat<(ARMWrapper texternalsym :$dst), (t2MOVi32imm texternalsym :$dst)>,
+ Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
+def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
+ Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
+
+def : T2Pat<(ARMWrapperJT tjumptable:$dst), (t2LEApcrelJT tjumptable:$dst)>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+ IIC_iLoadiALU,
+ [(set rGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+ imm:$cp))]>,
+ Requires<[IsThumb2]>;
+
+// Pseudo isntruction that combines movs + predicated rsbmi
+// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR] in {
+def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
+ NoItinerary, []>, Requires<[IsThumb2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor load/store -- for disassembly only
+//
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, NoItinerary, opc, asm, pattern> {
+ let Inst{31-28} = op31_28;
+ let Inst{27-25} = 0b110;
+}
+
+multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm, list<dag> pattern> {
+ def _OFFSET : T2CI<op31_28,
+ (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+ asm, "\t$cop, $CRd, $addr", pattern> {
+ bits<13> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 0; // W = 0
+ let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _PRE : T2CI<op31_28,
+ (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
+ asm, "\t$cop, $CRd, $addr!", []> {
+ bits<13> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 1; // W = 1
+ let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _POST: T2CI<op31_28,
+ (outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ postidx_imm8s4:$offset),
+ asm, "\t$cop, $CRd, $addr, $offset", []> {
+ bits<9> offset;
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 0; // P = 0
+ let Inst{23} = offset{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 1; // W = 1
+ let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = offset{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _OPTION : T2CI<op31_28, (outs),
+ (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ coproc_option_imm:$option),
+ asm, "\t$cop, $CRd, $addr, $option", []> {
+ bits<8> option;
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 0; // P = 0
+ let Inst{23} = 1; // U = 1
+ let Inst{22} = Dbit;
+ let Inst{21} = 0; // W = 0
+ let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = option;
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+}
+
+defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+
+defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register -- for disassembly only
+//
+// Move to ARM core register from Special Register
+
+// A/R class MRS.
+//
+// A/R class can only move from CPSR or SPSR.
+def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
+ []>, Requires<[IsThumb2,IsNotMClass]> {
+ bits<4> Rd;
+ let Inst{31-12} = 0b11110011111011111000;
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = 0b00000000;
+}
+
+def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
+
+def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
+ []>, Requires<[IsThumb2,IsNotMClass]> {
+ bits<4> Rd;
+ let Inst{31-12} = 0b11110011111111111000;
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = 0b00000000;
+}
+
+def t2MRSbanked : T2I<(outs rGPR:$Rd), (ins banked_reg:$banked),
+ NoItinerary, "mrs", "\t$Rd, $banked", []>,
+ Requires<[IsThumb, HasVirtualization]> {
+ bits<6> banked;
+ bits<4> Rd;
+
+ let Inst{31-21} = 0b11110011111;
+ let Inst{20} = banked{5}; // R bit
+ let Inst{19-16} = banked{3-0};
+ let Inst{15-12} = 0b1000;
+ let Inst{11-8} = Rd;
+ let Inst{7-5} = 0b001;
+ let Inst{4} = banked{4};
+ let Inst{3-0} = 0b0000;
+}
+
+
+// M class MRS.
+//
+// This MRS has a mask field in bits 7-0 and can take more values than
+// the A/R class (a full msr_mask).
+def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$SYSm), NoItinerary,
+ "mrs", "\t$Rd, $SYSm", []>,
+ Requires<[IsThumb,IsMClass]> {
+ bits<4> Rd;
+ bits<8> SYSm;
+ let Inst{31-12} = 0b11110011111011111000;
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = SYSm;
+
+ let Unpredictable{20-16} = 0b11111;
+ let Unpredictable{13} = 0b1;
+}
+
+
+// Move from ARM core register to Special Register
+//
+// A/R class MSR.
+//
+// No need to have both system and application versions, the encodings are the
+// same and the assembly parser has no way to distinguish between them. The mask
+// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
+// the mask with the fields to be accessed in the special register.
+let Defs = [CPSR] in
+def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
+ NoItinerary, "msr", "\t$mask, $Rn", []>,
+ Requires<[IsThumb2,IsNotMClass]> {
+ bits<5> mask;
+ bits<4> Rn;
+ let Inst{31-21} = 0b11110011100;
+ let Inst{20} = mask{4}; // R Bit
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b1000;
+ let Inst{11-8} = mask{3-0};
+ let Inst{7-0} = 0;
+}
+
+// However, the MSR (banked register) system instruction (ARMv7VE) *does* have a
+// separate encoding (distinguished by bit 5.
+def t2MSRbanked : T2I<(outs), (ins banked_reg:$banked, rGPR:$Rn),
+ NoItinerary, "msr", "\t$banked, $Rn", []>,
+ Requires<[IsThumb, HasVirtualization]> {
+ bits<6> banked;
+ bits<4> Rn;
+
+ let Inst{31-21} = 0b11110011100;
+ let Inst{20} = banked{5}; // R bit
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b1000;
+ let Inst{11-8} = banked{3-0};
+ let Inst{7-5} = 0b001;
+ let Inst{4} = banked{4};
+ let Inst{3-0} = 0b0000;
+}
+
+
+// M class MSR.
+//
+// Move from ARM core register to Special Register
+let Defs = [CPSR] in
+def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
+ NoItinerary, "msr", "\t$SYSm, $Rn", []>,
+ Requires<[IsThumb,IsMClass]> {
+ bits<12> SYSm;
+ bits<4> Rn;
+ let Inst{31-21} = 0b11110011100;
+ let Inst{20} = 0b0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b1000;
+ let Inst{11-10} = SYSm{11-10};
+ let Inst{9-8} = 0b00;
+ let Inst{7-0} = SYSm{7-0};
+
+ let Unpredictable{20} = 0b1;
+ let Unpredictable{13} = 0b1;
+ let Unpredictable{9-8} = 0b11;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register
+//
+
+class t2MovRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
+ list<dag> pattern>
+ : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2",
+ pattern> {
+ let Inst{27-24} = 0b1110;
+ let Inst{20} = direction;
+ let Inst{4} = 1;
+
+ bits<4> Rt;
+ bits<4> cop;
+ bits<3> opc1;
+ bits<3> opc2;
+ bits<4> CRm;
+ bits<4> CRn;
+
+ let Inst{15-12} = Rt;
+ let Inst{11-8} = cop;
+ let Inst{23-21} = opc1;
+ let Inst{7-5} = opc2;
+ let Inst{3-0} = CRm;
+ let Inst{19-16} = CRn;
+}
+
+class t2MovRRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
+ list<dag> pattern = []>
+ : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
+ let Inst{27-24} = 0b1100;
+ let Inst{23-21} = 0b010;
+ let Inst{20} = direction;
+
+ bits<4> Rt;
+ bits<4> Rt2;
+ bits<4> cop;
+ bits<4> opc1;
+ bits<4> CRm;
+
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = Rt2;
+ let Inst{11-8} = cop;
+ let Inst{7-4} = opc1;
+ let Inst{3-0} = CRm;
+}
+
+/* from ARM core register to coprocessor */
+def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
+ (outs),
+ (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+ c_imm:$CRm, imm0_7:$opc2),
+ [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+ imm:$CRm, imm:$opc2)]>,
+ ComplexDeprecationPredicate<"MCR">;
+def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
+ (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+ c_imm:$CRm, 0, pred:$p)>;
+def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
+ (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+ c_imm:$CRm, imm0_7:$opc2),
+ [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+ imm:$CRm, imm:$opc2)]> {
+ let Predicates = [IsThumb2, PreV8];
+}
+def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
+ (t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+ c_imm:$CRm, 0, pred:$p)>;
+
+/* from coprocessor to ARM core register */
+def t2MRC : t2MovRCopro<0b1110, "mrc", 1,
+ (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ c_imm:$CRm, imm0_7:$opc2), []>;
+def : t2InstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
+ (t2MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ c_imm:$CRm, 0, pred:$p)>;
+
+def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
+ (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ c_imm:$CRm, imm0_7:$opc2), []> {
+ let Predicates = [IsThumb2, PreV8];
+}
+def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
+ (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ c_imm:$CRm, 0, pred:$p)>;
+
+def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+ (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+ (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+
+/* from ARM core register to coprocessor */
+def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0, (outs),
+ (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
+ c_imm:$CRm),
+ [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+ imm:$CRm)]>;
+def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, (outs),
+ (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
+ c_imm:$CRm),
+ [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
+ GPR:$Rt2, imm:$CRm)]> {
+ let Predicates = [IsThumb2, PreV8];
+}
+
+/* from coprocessor to ARM core register */
+def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1, (outs GPR:$Rt, GPR:$Rt2),
+ (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm)>;
+
+def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1, (outs GPR:$Rt, GPR:$Rt2),
+ (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm)> {
+ let Predicates = [IsThumb2, PreV8];
+}
+
+//===----------------------------------------------------------------------===//
+// Other Coprocessor Instructions.
+//
+
+def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, // "cdp": no results, six immediate-style operands
+ c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+ "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+ [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, // selected from the int_arm_cdp intrinsic
+ imm:$CRm, imm:$opc2)]> {
+ let Inst{27-24} = 0b1110;
+
+ bits<4> opc1; // 4 bits wide, matching the imm0_15 operand
+ bits<4> CRn;
+ bits<4> CRd;
+ bits<4> cop;
+ bits<3> opc2; // 3 bits wide, matching the imm0_7 operand
+ bits<4> CRm;
+
+ let Inst{3-0} = CRm; // operand fields are packed from bit 0 upwards
+ let Inst{4} = 0; // fixed zero bit between CRm and opc2
+ let Inst{7-5} = opc2;
+ let Inst{11-8} = cop;
+ let Inst{15-12} = CRd;
+ let Inst{19-16} = CRn;
+ let Inst{23-20} = opc1;
+
+ let Predicates = [IsThumb2, PreV8]; // gated on both the IsThumb2 and PreV8 predicates
+}
+
+def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1, // "cdp2": same operands as t2CDP, T2Cop argument 0b1111 instead of 0b1110
+ c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+ "cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+ [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, // selected from the int_arm_cdp2 intrinsic
+ imm:$CRm, imm:$opc2)]> {
+ let Inst{27-24} = 0b1110; // same value as in t2CDP; NOTE(review): presumably the T2Cop argument is what distinguishes the two encodings - confirm against T2Cop
+
+ bits<4> opc1; // 4 bits wide, matching the imm0_15 operand
+ bits<4> CRn;
+ bits<4> CRd;
+ bits<4> cop;
+ bits<3> opc2; // 3 bits wide, matching the imm0_7 operand
+ bits<4> CRm;
+
+ let Inst{3-0} = CRm; // operand fields are packed from bit 0 upwards
+ let Inst{4} = 0; // fixed zero bit between CRm and opc2
+ let Inst{7-5} = opc2;
+ let Inst{11-8} = cop;
+ let Inst{15-12} = CRd;
+ let Inst{19-16} = CRn;
+ let Inst{23-20} = opc1;
+
+ let Predicates = [IsThumb2, PreV8]; // gated on both the IsThumb2 and PreV8 predicates
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// ARMv8.1 Privileged Access Never (PAN) extension
+//
+// SETPAN #imm1
+
+def t2SETPAN : T1I<(outs), (ins imm0_1:$imm), NoItinerary, "setpan\t$imm", []>, // no results and no selection pattern
+ T1Misc<0b0110000>, Requires<[IsThumb2, HasV8, HasV8_1a]> {
+ bits<1> imm; // one bit wide, matching the imm0_1 operand
+
+ let Inst{4} = 0b1;
+ let Inst{3} = imm; // the immediate is encoded in bit 3
+ let Inst{2-0} = 0b000;
+
+ let Unpredictable{4} = 0b1; // bits 4 and 2-0 (the fixed bits set above) are flagged in the Unpredictable mask
+ let Unpredictable{2-0} = 0b111;
+}
+
+//===----------------------------------------------------------------------===//
+// ARMv8-M Security Extensions instructions
+//
+
+let hasSideEffects = 1 in // explicitly marked as having side effects
+def t2SG : T2I<(outs), (ins), NoItinerary, "sg", "", []>, // "sg": no operands, no selection pattern
+ Requires<[Has8MSecExt]> {
+ let Inst = 0xe97fe97f; // the whole encoding is this fixed constant
+}
+
+class T2TT<bits<2> at, string asm, list<dag> pattern> // shared encoding for tt/ttt/tta/ttat; 'at' lands in Inst{7-6}
+ : T2I<(outs rGPR:$Rt), (ins GPRnopc:$Rn), NoItinerary, asm, "\t$Rt, $Rn",
+ pattern> {
+ bits<4> Rn;
+ bits<4> Rt;
+
+ let Inst{31-20} = 0b111010000100;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-8} = Rt;
+ let Inst{7-6} = at; // 00 = tt, 01 = ttt, 10 = tta, 11 = ttat (per the defs below)
+ let Inst{5-0} = 0b000000;
+
+ let Unpredictable{5-0} = 0b111111; // the six fixed low bits are flagged in the Unpredictable mask
+}
+
+def t2TT : T2TT<0b00, "tt", []>, Requires<[IsThumb,Has8MSecExt]>; // all four variants have empty patterns
+def t2TTT : T2TT<0b01, "ttt", []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTA : T2TT<0b10, "tta", []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTAT : T2TT<0b11, "ttat", []>, Requires<[IsThumb,Has8MSecExt]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>,
+ Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>,
+ Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)),
+ (t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)),
+ (t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+}
+
+def : T2Pat<(sext_inreg rGPR:$Src, i8), (t2SXTB rGPR:$Src, 0)>,
+ Requires<[IsThumb2]>;
+def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>,
+ Requires<[IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)),
+ (t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)),
+ (t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// Atomic load/store patterns
+def : T2Pat<(atomic_load_8 t2addrmode_imm12:$addr),
+ (t2LDRBi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_8 t2addrmode_negimm8:$addr),
+ (t2LDRBi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_8 t2addrmode_so_reg:$addr),
+ (t2LDRBs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_16 t2addrmode_imm12:$addr),
+ (t2LDRHi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_16 t2addrmode_negimm8:$addr),
+ (t2LDRHi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_16 t2addrmode_so_reg:$addr),
+ (t2LDRHs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_32 t2addrmode_imm12:$addr),
+ (t2LDRi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_32 t2addrmode_negimm8:$addr),
+ (t2LDRi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_32 t2addrmode_so_reg:$addr),
+ (t2LDRs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_8 t2addrmode_imm12:$addr, GPR:$val),
+ (t2STRBi12 GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_8 t2addrmode_negimm8:$addr, GPR:$val),
+ (t2STRBi8 GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_8 t2addrmode_so_reg:$addr, GPR:$val),
+ (t2STRBs GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_imm12:$addr, GPR:$val),
+ (t2STRHi12 GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_negimm8:$addr, GPR:$val),
+ (t2STRHi8 GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_so_reg:$addr, GPR:$val),
+ (t2STRHs GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_imm12:$addr, GPR:$val),
+ (t2STRi12 GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
+ (t2STRi8 GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
+ (t2STRs GPR:$val, t2addrmode_so_reg:$addr)>;
+
+let AddedComplexity = 8 in {
+ def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr), (t2LDAB addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (t2STLB GPR:$val, addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL GPR:$val, addr_offset_none:$addr)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Aliases for ADC without the ".w" optional width specifier.
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $Rm",
+ (t2ADCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $ShiftedRm",
+ (t2ADCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+
+// Aliases for SBC without the ".w" optional width specifier.
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $Rm",
+ (t2SBCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
+ (t2SBCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+
+// Aliases for ADD without the ".w" optional width specifier.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+ (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+ (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
+ (t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm",
+ (t2ADDrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+ (t2ADDri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rdn, $imm",
+ (t2ADDri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $Rm",
+ (t2ADDrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm",
+ (t2ADDrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+
+// add w/ negative immediates is just a sub.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+ (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+ (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+ (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rdn, $imm",
+ (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm",
+ (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rd, $Rn, $imm",
+ (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p}.w $Rdn, $imm",
+ (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rdn, $imm",
+ (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+
+// Aliases for SUB without the ".w" optional width specifier.
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm",
+ (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rd, $Rn, $imm",
+ (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm",
+ (t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm",
+ (t2SUBrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"sub${s}${p} $Rdn, $imm",
+ (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rdn, $imm",
+ (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p}.w $Rdn, $Rm",
+ (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rdn, $Rm",
+ (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm",
+ (t2SUBrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+
+// Alias for compares without the ".w" optional width specifier.
+def : t2InstAlias<"cmn${p} $Rn, $Rm",
+ (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"teq${p} $Rn, $Rm",
+ (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"tst${p} $Rn, $Rm",
+ (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+
+// Memory barriers
+def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+
+// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+ (t2LDRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+ (t2LDRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+ (t2LDRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+ (t2LDRSBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+ (t2LDRSHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+ (t2LDRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+ (t2LDRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+ (t2LDRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+ (t2LDRSBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+ (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+ (t2LDRpci GPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+ (t2LDRBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+ (t2LDRHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+ (t2LDRSBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+ (t2LDRSHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+
+// Alias for MVN with(out) the ".w" optional width specifier.
+def : t2InstAlias<"mvn${s}${p}.w $Rd, $imm",
+ (t2MVNi rGPR:$Rd, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $Rm",
+ (t2MVNr rGPR:$Rd, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
+ (t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the
+// input operands swapped when the shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+ (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+ (t2PKHBT rGPR:$Rd, rGPR:$Rm, rGPR:$Rn, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// PUSH/POP aliases for STM/LDM
+def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"push${p} $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p}.w $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p} $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// STMIA/STMIA_UPD aliases w/o the optional .w suffix
+def : t2InstAlias<"stm${p} $Rn, $regs",
+ (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"stm${p} $Rn!, $regs",
+ (t2STMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// LDMIA/LDMIA_UPD aliases w/o the optional .w suffix
+def : t2InstAlias<"ldm${p} $Rn, $regs",
+ (t2LDMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"ldm${p} $Rn!, $regs",
+ (t2LDMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// STMDB/STMDB_UPD aliases w/ the optional .w suffix
+def : t2InstAlias<"stmdb${p}.w $Rn, $regs",
+ (t2STMDB GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"stmdb${p}.w $Rn!, $regs",
+ (t2STMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// LDMDB/LDMDB_UPD aliases w/ the optional .w suffix
+def : t2InstAlias<"ldmdb${p}.w $Rn, $regs",
+ (t2LDMDB GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"ldmdb${p}.w $Rn!, $regs",
+ (t2LDMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for REV/REV16/REVSH without the ".w" optional width specifier.
+def : t2InstAlias<"rev${p} $Rd, $Rm", (t2REV rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"rev16${p} $Rd, $Rm", (t2REV16 rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"revsh${p} $Rd, $Rm", (t2REVSH rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+
+
+// Alias for RSB without the ".w" optional width specifier, and with optional
+// implied destination register.
+def : t2InstAlias<"rsb${s}${p} $Rd, $Rn, $imm",
+ (t2RSBri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $imm",
+ (t2RSBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $Rm",
+ (t2RSBrr rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $ShiftedRm",
+ (t2RSBrs rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$ShiftedRm, pred:$p,
+ cc_out:$s)>;
+
+// SSAT/USAT optional shift operand.
+def : t2InstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+ (t2SSAT rGPR:$Rd, imm1_32:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+def : t2InstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+ (t2USAT rGPR:$Rd, imm0_31:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+
+// STM w/o the .w suffix.
+def : t2InstAlias<"stm${p} $Rn, $regs",
+ (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for STR, STRB, and STRH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"str${p} $Rt, $addr",
+ (t2STRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+ (t2STRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+ (t2STRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"str${p} $Rt, $addr",
+ (t2STRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+ (t2STRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+ (t2STRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+// Extend instruction optional rotate operand.
+def : InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+ (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+ (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+ (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"sxtb16${p} $Rd, $Rm",
+ (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm",
+ (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm",
+ (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb${p}.w $Rd, $Rm",
+ (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
+ (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+ (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+ (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+ (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"uxtb16${p} $Rd, $Rm",
+ (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+
+def : t2InstAlias<"uxtb${p} $Rd, $Rm",
+ (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm",
+ (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb${p}.w $Rd, $Rm",
+ (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
+ (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+// Extend instruction w/o the ".w" optional width specifier.
+def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
+ (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : InstAlias<"uxtb16${p} $Rd, $Rm$rot",
+ (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
+ (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
+ (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : InstAlias<"sxtb16${p} $Rd, $Rm$rot",
+ (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
+ (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+
+// "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like
+// for isel.
+def : t2InstAlias<"mov${p} $Rd, $imm",
+ (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+def : t2InstAlias<"mvn${p} $Rd, $imm",
+ (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+// Same for AND <--> BIC
+def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+ (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
+ (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
+ (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstAlias<"and${s}${p} $Rdn, $imm",
+ (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+// Likewise, "add Rd, t2_so_imm_neg" -> sub
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+ (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $imm",
+ (t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm,
+ pred:$p, cc_out:$s)>;
+// Same for CMP <--> CMN via t2_so_imm_neg
+def : t2InstAlias<"cmp${p} $Rd, $imm",
+ (t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
+def : t2InstAlias<"cmn${p} $Rd, $imm",
+ (t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
+
+
+// Wide 'mul' encoding can be specified with only two operands.
+def : t2InstAlias<"mul${p} $Rn, $Rm",
+ (t2MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p)>;
+
+// "neg" is an alias for "rsb rd, rn, #0"
+def : t2InstAlias<"neg${s}${p} $Rd, $Rm",
+ (t2RSBri rGPR:$Rd, rGPR:$Rm, 0, pred:$p, cc_out:$s)>;
+
+// MOV so_reg assembler pseudos. InstAlias isn't expressive enough for
+// these, unfortunately.
+def t2MOVsi: t2AsmPseudo<"mov${p} $Rd, $shift",
+ (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+def t2MOVSsi: t2AsmPseudo<"movs${p} $Rd, $shift",
+ (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+
+def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift",
+ (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift",
+ (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+
+// ADR w/o the .w suffix
+def : t2InstAlias<"adr${p} $Rd, $addr",
+ (t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>;
+
+// LDR(literal) w/ alternate [pc, #imm] syntax.
+def t2LDRpcrel : t2AsmPseudo<"ldr${p} $Rt, $addr",
+ (ins GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRBpcrel : t2AsmPseudo<"ldrb${p} $Rt, $addr",
+ (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRHpcrel : t2AsmPseudo<"ldrh${p} $Rt, $addr",
+ (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRSBpcrel : t2AsmPseudo<"ldrsb${p} $Rt, $addr",
+ (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRSHpcrel : t2AsmPseudo<"ldrsh${p} $Rt, $addr",
+ (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+ // Version w/ the .w suffix.
+def : t2InstAlias<"ldr${p}.w $Rt, $addr",
+ (t2LDRpcrel GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>;
+def : t2InstAlias<"ldrb${p}.w $Rt, $addr",
+ (t2LDRBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p}.w $Rt, $addr",
+ (t2LDRHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p}.w $Rt, $addr",
+ (t2LDRSBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p}.w $Rt, $addr",
+ (t2LDRSHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"add${p} $Rd, pc, $imm",
+ (t2ADR rGPR:$Rd, imm0_4095:$imm, pred:$p)>;
+
+// Pseudo instruction ldr Rt, =immediate
+def t2LDRConstPool
+ : t2AsmPseudo<"ldr${p} $Rt, $immediate",
+ (ins GPRnopc:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
+// Version w/ the .w suffix.
+def : t2InstAlias<"ldr${p}.w $Rt, $immediate",
+ (t2LDRConstPool GPRnopc:$Rt,
+ const_pool_asm_imm:$immediate, pred:$p)>;
+
+// PLD/PLDW/PLI with alternate literal form.
+def : t2InstAlias<"pld${p} $addr",
+ (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : InstAlias<"pli${p} $addr",
+ (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
new file mode 100644
index 000000000000..e99048645685
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -0,0 +1,2308 @@
+//===-- ARMInstrVFP.td - VFP support for ARM ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM VFP instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>]>;
+
+def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>;
+def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>;
+def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
+def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+ let Name = "FPImm";
+ let ParserMethod = "parseFPImm";
+}
+
+def vfp_f16imm : Operand<f16>, // f16 FP immediate operand; leaf predicate + transform to its encoded form
+ PatLeaf<(f16 fpimm), [{
+ return ARM_AM::getFP16Imm(N->getValueAPF()) != -1; // reject constants for which getFP16Imm returns -1
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM_AM::getFP16Imm(InVal); // encoded immediate value
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); // pass SDLoc(N), as the f32/f64 immediate operands do
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
+}
+
+def vfp_f32imm : Operand<f32>, // f32 FP immediate operand; leaf predicate + transform to its encoded form
+ PatLeaf<(f32 fpimm), [{
+ return ARM_AM::getFP32Imm(N->getValueAPF()) != -1; // reject constants for which getFP32Imm returns -1
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM_AM::getFP32Imm(InVal); // encoded immediate value
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); // emitted as an i32 target constant
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
+}
+
+def vfp_f64imm : Operand<f64>, // f64 FP immediate operand; leaf predicate + transform to its encoded form
+ PatLeaf<(f64 fpimm), [{
+ return ARM_AM::getFP64Imm(N->getValueAPF()) != -1; // reject constants for which getFP64Imm returns -1
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM_AM::getFP64Imm(InVal); // encoded immediate value
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); // emitted as an i32 target constant, even for f64
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
+}
+
+def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ // a 'load' accepted only when the predicate below holds
+ return cast<LoadSDNode>(N)->getAlignment() >= 4; // alignment of at least 4 (presumably bytes, i.e. 32-bit aligned - confirm)
+}]>;
+
+def alignedstore32 : PatFrag<(ops node:$val, node:$ptr), // store counterpart of alignedload32
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 4; // same alignment threshold as the load fragment
+}]>;
+
+// The VCVT to/from fixed-point instructions encode the 'fbits' operand
+// (the number of fixed bits) differently than it appears in the assembly
+// source. It's encoded as "Size - fbits" where Size is the size of the
+// fixed-point representation (32 or 16) and fbits is the value appearing
+// in the assembly source, an integer in [0,16] (Size 16) or (0,32] (Size 32).
+def fbits32_asm_operand : AsmOperandClass { let Name = "FBits32"; }
+def fbits32 : Operand<i32> {
+ let PrintMethod = "printFBits32";
+ let ParserMatchClass = fbits32_asm_operand;
+}
+
+def fbits16_asm_operand : AsmOperandClass { let Name = "FBits16"; }
+def fbits16 : Operand<i32> {
+ let PrintMethod = "printFBits16";
+ let ParserMatchClass = fbits16_asm_operand;
+}
+
+//===----------------------------------------------------------------------===//
+// Load / store Instructions.
+//
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in { // applies to all three VLDR variants below
+
+def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr), // double-register load
+ IIC_fpLoad64, "vldr", "\t$Dd, $addr",
+ [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>; // selected only for loads passing alignedload32
+
+def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr), // single-register load
+ IIC_fpLoad32, "vldr", "\t$Sd, $addr",
+ [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]> {
+ // Some single precision VFP instructions may be executed on both NEON and VFP
+ // pipelines.
+ let D = VFPNeonDomain;
+}
+
+def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr), // ".16" form; uses addrmode5fp16
+ IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
+ []>, // no selection pattern
+ Requires<[HasFullFP16]>;
+
+} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
+
+def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr), // double-register store; second ADI5 argument is 0b00 (0b01 for VLDRD)
+ IIC_fpStore64, "vstr", "\t$Dd, $addr",
+ [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>; // selected only for stores passing alignedstore32
+
+def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr), // single-register store
+ IIC_fpStore32, "vstr", "\t$Sd, $addr",
+ [(alignedstore32 SPR:$Sd, addrmode5:$addr)]> {
+ // Some single precision VFP instructions may be executed on both NEON and VFP
+ // pipelines.
+ let D = VFPNeonDomain;
+}
+
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr), // ".16" form; uses addrmode5fp16
+ IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
+ []>, // no selection pattern
+ Requires<[HasFullFP16]>;
+
+//===----------------------------------------------------------------------===//
+// Load / store multiple Instructions.
+//
+
+multiclass vfp_ldst_mult<string asm, bit L_bit, // asm: mnemonic stem ("ia"/"db" is appended); L_bit: value placed in Inst{20}
+ InstrItinClass itin, InstrItinClass itin_upd> { // itin: non-writeback forms; itin_upd: writeback (_UPD) forms
+ // Double Precision
+ def DIA : // dpr_reglist, increment-after, no writeback
+ AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+ IndexModeNone, itin,
+ !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+ }
+ def DIA_UPD : // as DIA, with writeback: $wb result tied to $Rn
+ AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+ variable_ops),
+ IndexModeUpd, itin_upd,
+ !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ }
+ def DDB_UPD : // decrement-before exists only in writeback form
+ AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+ variable_ops),
+ IndexModeUpd, itin_upd,
+ !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ }
+
+ // Single Precision
+ def SIA : // spr_reglist, increment-after, no writeback
+ AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+ IndexModeNone, itin,
+ !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines.
+ let D = VFPNeonDomain;
+ }
+ def SIA_UPD : // as SIA, with writeback: $wb result tied to $Rn
+ AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+ variable_ops),
+ IndexModeUpd, itin_upd,
+ !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines.
+ let D = VFPNeonDomain;
+ }
+ def SDB_UPD : // decrement-before exists only in writeback form
+ AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+ variable_ops),
+ IndexModeUpd, itin_upd,
+ !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines.
+ let D = VFPNeonDomain;
+ }
+}
+
+let hasSideEffects = 0 in { // applies to every VLDM*/VSTM* record instantiated below
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>; // L_bit = 1 for the load forms
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpStore_m, IIC_fpStore_mu>; // L_bit = 0 for the store forms
+
+} // hasSideEffects
+
+def : MnemonicAlias<"vldm", "vldmia">; // a bare "vldm"/"vstm" mnemonic maps to the "ia" form
+def : MnemonicAlias<"vstm", "vstmia">;
+
+
+//===----------------------------------------------------------------------===//
+// Lazy load / store multiple Instructions
+//
+let mayLoad = 1 in
+def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, // only a base register operand; no register list, no pattern
+ IIC_fpLoad_m, "vlldm${p}\t$Rn", "", []>,
+ Requires<[HasV8MMainline, Has8MSecExt]> {
+ let Inst{24-23} = 0b00;
+ let Inst{22} = 0;
+ let Inst{21} = 1;
+ let Inst{20} = 1; // 1 here, 0 in VLSTM
+ let Inst{15-12} = 0;
+ let Inst{7-0} = 0;
+ let mayLoad = 1; // redundant with the enclosing 'let mayLoad = 1 in'
+}
+
+let mayStore = 1 in
+def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, // only a base register operand; no register list, no pattern
+ IIC_fpStore_m, "vlstm${p}\t$Rn", "", []>,
+ Requires<[HasV8MMainline, Has8MSecExt]> {
+ let Inst{24-23} = 0b00;
+ let Inst{22} = 0;
+ let Inst{21} = 1;
+ let Inst{20} = 0; // 0 here, 1 in VLLDM
+ let Inst{15-12} = 0;
+ let Inst{7-0} = 0;
+ let mayStore = 1; // redundant with the enclosing 'let mayStore = 1 in'
+}
+
+
+// FLDM/FSTM - Load / Store multiple single / double precision registers for
+// pre-ARMv6 cores.
+// These instructions are deprecated!
+// Each old mnemonic is aliased to the corresponding vldm/vstm spelling.
+// Loads: the "ia" and "fd" suffixes map to vldmia, "db" and "ea" to vldmdb.
+def : VFP2MnemonicAlias<"fldmias", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbs", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmeas", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfds", "vldmia">;
+def : VFP2MnemonicAlias<"fldmiad", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbd", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmead", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfdd", "vldmia">;
+
+// Stores: the "ia" and "ea" suffixes map to vstmia, "db" and "fd" to vstmdb.
+def : VFP2MnemonicAlias<"fstmias", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbs", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmeas", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfds", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmiad", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbd", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmead", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfdd", "vstmdb">;
+
+// vpush / vpop assembly aliases. vpush maps to the VSTM{D,S}DB_UPD
+// instructions and vpop to the VLDM{D,S}IA_UPD instructions, in both cases
+// with SP as the base register. One alias exists per register-list kind
+// (dpr_reglist and spr_reglist). The trailing 0 is the last InstAlias
+// argument (presumably the "emit" priority flag -- confirm against the
+// InstAlias class).
+def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>,
+ Requires<[HasVFP2]>;
+def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r), 0>,
+ Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r", (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r), 0>,
+ Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r", (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r), 0>,
+ Requires<[HasVFP2]>;
+// The same four mappings again through the VFPDTAnyInstAlias multiclass
+// (defined outside this section).
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+ (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+ (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+ (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+ (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>;
+
+// FLDMX, FSTMX - Load and store multiple unknown precision registers for
+// pre-armv6 cores.
+// These instructions are deprecated so we don't want them to get selected.
+//
+// Parameters:
+//   asm   - mnemonic stem; the addressing-mode suffix ("iax" / "dbx") is
+//           appended by each def.
+//   L_bit - value written to Inst{20} of every def.
+// All three defs have an empty pattern list.
+multiclass vfp_ldstx_mult<string asm, bit L_bit> {
+ // Unknown precision
+ // XIA: increment after, no writeback (no output operand).
+ def XIA :
+ AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+ IndexModeNone, !strconcat(asm, "iax${p}\t$Rn, $regs"), "", []> {
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+ }
+ // XIA_UPD: increment after with writeback; $wb is constrained to $Rn.
+ def XIA_UPD :
+ AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+ IndexModeUpd, !strconcat(asm, "iax${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ }
+ // XDB_UPD: decrement before with writeback; $wb is constrained to $Rn.
+ def XDB_UPD :
+ AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+ IndexModeUpd, !strconcat(asm, "dbx${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ }
+}
+
+// Instantiate the unknown-precision forms: FLDM* with L_bit = 1 and
+// FSTM* with L_bit = 0.
+defm FLDM : vfp_ldstx_mult<"fldm", 1>;
+defm FSTM : vfp_ldstx_mult<"fstm", 0>;
+
+// "ea"/"fd" spellings of the loads map to the "db"/"ia" forms respectively.
+def : VFP2MnemonicAlias<"fldmeax", "fldmdbx">;
+def : VFP2MnemonicAlias<"fldmfdx", "fldmiax">;
+
+// For the stores the mapping is the other way round: "ea" -> "ia",
+// "fd" -> "db".
+def : VFP2MnemonicAlias<"fstmeax", "fstmiax">;
+def : VFP2MnemonicAlias<"fstmfdx", "fstmdbx">;
+
+//===----------------------------------------------------------------------===//
+// FP Binary Operations.
+//
+
+// VADD in f64 (VADDD), f32 (VADDS) and f16 (VADDH) forms. Each def sets a
+// TwoOperandAliasConstraint relating the first source operand to the
+// destination. The f64 and f32 forms are selected for fadd; the f16 form has
+// an empty pattern list.
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VADDD : ADbI<0b11100, 0b11, 0, 0,
+ (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+ IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VADDS : ASbIn<0b11100, 0b11, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+// Half-precision form: assembly/encoding only, no selection pattern.
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VADDH : AHbI<0b11100, 0b11, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
+// VSUB in f64 (VSUBD), f32 (VSUBS) and f16 (VSUBH) forms. The class
+// arguments match the VADD defs except for the third one, which is 1 here.
+// The f64 and f32 forms are selected for fsub; the f16 form has no pattern.
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VSUBD : ADbI<0b11100, 0b11, 1, 0,
+ (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+ IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+// Half-precision form: assembly/encoding only, no selection pattern.
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VSUBH : AHbI<0b11100, 0b11, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
+// VDIV in f64 (VDIVD), f32 (VDIVS) and f16 (VDIVH) forms. The f64 and f32
+// forms are selected for fdiv; the f16 form has no pattern. Unlike
+// VADDS/VSUBS/VMULS, VDIVS uses ASbI and does not override the domain D.
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VDIVD : ADbI<0b11101, 0b00, 0, 0,
+ (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+ IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VDIVS : ASbI<0b11101, 0b00, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+
+// Half-precision form: assembly/encoding only, no selection pattern.
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VDIVH : AHbI<0b11101, 0b00, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
+// VMUL in f64 (VMULD), f32 (VMULS) and f16 (VMULH) forms. The f64 and f32
+// forms are selected for fmul; the f16 form has no pattern.
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VMULD : ADbI<0b11100, 0b10, 0, 0,
+ (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+ IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VMULS : ASbIn<0b11100, 0b10, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+// Half-precision form: assembly/encoding only, no selection pattern.
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VMULH : AHbI<0b11100, 0b10, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
+// VNMUL in f64 (VNMULD), f32 (VNMULS) and f16 (VNMULH) forms. The f64 and
+// f32 forms are selected for fneg(fmul(a, b)); the f16 form has no pattern.
+// These defs set no TwoOperandAliasConstraint.
+def VNMULD : ADbI<0b11100, 0b10, 1, 0,
+ (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+ IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>;
+
+def VNMULS : ASbI<0b11100, 0b10, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+// Half-precision form: assembly/encoding only, no selection pattern.
+def VNMULH : AHbI<0b11100, 0b10, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
+// VSEL<cc>: conditional select, defined in f16 (H), f32 (S) and f64 (D)
+// forms.
+// Parameters:
+//   op  - condition suffix appended to "vsel" in the assembly string.
+//   opc - 2-bit value passed as the second argument of the instruction class.
+//   CC  - integer condition code used as the last operand of the ARMcmov
+//         pattern.
+// All forms use CPSR and live in the "VFPV8" decoder namespace with no
+// post-encoder method. Note the ARMcmov patterns list $Sm/$Dm before $Sn/$Dn.
+multiclass vsel_inst<string op, bits<2> opc, int CC> {
+ let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
+ Uses = [CPSR], AddedComplexity = 4 in {
+ // f16 form: no selection pattern.
+ def H : AHbInp<0b11100, opc, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"),
+ []>,
+ Requires<[HasFullFP16]>;
+
+ def S : ASbInp<0b11100, opc, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
+ [(set SPR:$Sd, (ARMcmov SPR:$Sm, SPR:$Sn, CC))]>,
+ Requires<[HasFPARMv8]>;
+
+ def D : ADbInp<0b11100, opc, 0,
+ (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+ NoItinerary, !strconcat("vsel", op, ".f64\t$Dd, $Dn, $Dm"),
+ [(set DPR:$Dd, (ARMcmov (f64 DPR:$Dm), (f64 DPR:$Dn), CC))]>,
+ Requires<[HasFPARMv8, HasDPVFP]>;
+ }
+}
+
+// The CC constants here match ARMCC::CondCodes.
+// Only the gt, ge, eq and vs conditions are instantiated.
+defm VSELGT : vsel_inst<"gt", 0b11, 12>;
+defm VSELGE : vsel_inst<"ge", 0b10, 10>;
+defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
+defm VSELVS : vsel_inst<"vs", 0b01, 6>;
+
+// VMAXNM / VMINNM in f16 (H), f32 (S) and f64 (D) forms.
+// Parameters:
+//   op  - full mnemonic ("vmaxnm" / "vminnm").
+//   opc - single bit passed as the third argument of the instruction class.
+//   SD  - SelectionDAG node matched by the S and D patterns.
+// The H form has no selection pattern.
+multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
+ let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+ def H : AHbInp<0b11101, 0b00, opc,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
+ []>,
+ Requires<[HasFullFP16]>;
+
+ def S : ASbInp<0b11101, 0b00, opc,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"),
+ [(set SPR:$Sd, (SD SPR:$Sn, SPR:$Sm))]>,
+ Requires<[HasFPARMv8]>;
+
+ def D : ADbInp<0b11101, 0b00, opc,
+ (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+ NoItinerary, !strconcat(op, ".f64\t$Dd, $Dn, $Dm"),
+ [(set DPR:$Dd, (f64 (SD (f64 DPR:$Dn), (f64 DPR:$Dm))))]>,
+ Requires<[HasFPARMv8, HasDPVFP]>;
+ }
+}
+
+// vmaxnm matches fmaxnum (opc = 0); vminnm matches fminnum (opc = 1).
+defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>;
+defm VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>;
+
+// Match reassociated forms only if not sign dependent rounding.
+// fmul(fneg(a), b) is selected as VNMUL a, b. The f64 pattern additionally
+// requires HasDPVFP.
+def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
+ (VNMULD DPR:$a, DPR:$b)>,
+ Requires<[NoHonorSignDependentRounding,HasDPVFP]>;
+def : Pat<(fmul (fneg SPR:$a), SPR:$b),
+ (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+
+// These are encoded as unary instructions.
+// Register-register compares. They have no outputs and define FPSCR_NZCV.
+// The "vcmpe" f64/f32 forms are selected for arm_cmpfp; the "vcmp" forms and
+// all f16 forms have empty pattern lists.
+let Defs = [FPSCR_NZCV] in {
+def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
+ (outs), (ins DPR:$Dd, DPR:$Dm),
+ IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
+ [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
+
+def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
+ (outs), (ins SPR:$Sd, SPR:$Sm),
+ IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
+ [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
+ (outs), (ins SPR:$Sd, SPR:$Sm),
+ IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
+ []>;
+
+
+// FIXME: Verify encoding after integrated assembler is working.
+// The "vcmp" forms differ from the "vcmpe" forms above only in the fourth
+// class argument (0b01 instead of 0b11).
+def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
+ (outs), (ins DPR:$Dd, DPR:$Dm),
+ IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
+ [/* For disassembly only; pattern left blank */]>;
+
+def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
+ (outs), (ins SPR:$Sd, SPR:$Sm),
+ IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
+ [/* For disassembly only; pattern left blank */]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
+ (outs), (ins SPR:$Sd, SPR:$Sm),
+ IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
+ []>;
+} // Defs = [FPSCR_NZCV]
+
+//===----------------------------------------------------------------------===//
+// FP Unary Operations.
+//
+
+// VABS in f64 (VABSD), f32 (VABSS) and f16 (VABSH) forms. The f64 and f32
+// forms are selected for fabs; the f16 form has no pattern.
+def VABSD : ADuI<0b11101, 0b11, 0b0000, 0b11, 0,
+ (outs DPR:$Dd), (ins DPR:$Dm),
+ IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm",
+ [(set DPR:$Dd, (fabs (f64 DPR:$Dm)))]>;
+
+def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm",
+ [(set SPR:$Sd, (fabs SPR:$Sm))]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+// Half-precision form: assembly/encoding only, no selection pattern.
+def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm",
+ []>;
+
+// Compares against #0. Each def takes a single register operand, has no
+// outputs, defines FPSCR_NZCV, and forces Inst{3-0} and Inst{5} to zero.
+// The "vcmpe" f64/f32 forms are selected for arm_cmpfp0; the "vcmp" forms and
+// all f16 forms have empty pattern lists.
+let Defs = [FPSCR_NZCV] in {
+def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
+ (outs), (ins DPR:$Dd),
+ IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
+ [(arm_cmpfp0 (f64 DPR:$Dd))]> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+}
+
+def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
+ (outs), (ins SPR:$Sd),
+ IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
+ [(arm_cmpfp0 SPR:$Sd)]> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
+ (outs), (ins SPR:$Sd),
+ IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
+ []> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+}
+
+// FIXME: Verify encoding after integrated assembler is working.
+// The "vcmp" forms differ from the "vcmpe" forms above only in the fourth
+// class argument (0b01 instead of 0b11).
+def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
+ (outs), (ins DPR:$Dd),
+ IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
+ [/* For disassembly only; pattern left blank */]> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+}
+
+def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
+ (outs), (ins SPR:$Sd),
+ IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
+ [/* For disassembly only; pattern left blank */]> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
+ (outs), (ins SPR:$Sd),
+ IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
+ []> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+}
+} // Defs = [FPSCR_NZCV]
+
+// VCVTDS: f32 -> f64 conversion, selected for fpextend. The destination is a
+// DPR and the source an SPR, and both operand fields are declared and encoded
+// explicitly in the body. Requires HasVFP2 and HasDPVFP.
+def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
+ (outs DPR:$Dd), (ins SPR:$Sm),
+ IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm",
+ [(set DPR:$Dd, (fpextend SPR:$Sm))]> {
+ // Instruction operands.
+ bits<5> Dd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ // SPR source: upper four bits in Inst{3-0}, lowest bit in Inst{5}.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ // DPR destination: lower four bits in Inst{15-12}, top bit in Inst{22}.
+ let Inst{15-12} = Dd{3-0};
+ let Inst{22} = Dd{4};
+
+ let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Special case encoding: bits 11-8 is 0b1011.
+// VCVTSD: f64 -> f32 conversion, selected for fpround. Derived directly from
+// VFPAI, so every fixed opcode field is set explicitly in the body. Requires
+// HasVFP2 and HasDPVFP.
+def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
+ IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm",
+ [(set SPR:$Sd, (fpround DPR:$Dm))]> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Dm;
+
+ // Encode instruction operands.
+ // DPR source: lower four bits in Inst{3-0}, top bit in Inst{5}.
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+ // SPR destination: upper four bits in Inst{15-12}, lowest bit in Inst{22}.
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ // Fixed opcode bits.
+ let Inst{27-23} = 0b11101;
+ let Inst{21-16} = 0b110111;
+ let Inst{11-8} = 0b1011;
+ let Inst{7-6} = 0b11;
+ let Inst{4} = 0;
+
+ let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Between half, single and double-precision. For disassembly only.
+// Naming used by the four defs below: B/T is the "vcvtb"/"vcvtt" mnemonic,
+// HS is the ".f32.f16" form and SH is the ".f16.f32" form. All four operate
+// on SPR registers and require HasFP16. The B and T variants differ in the
+// fourth class argument (0b01 vs 0b11); HS and SH differ in the third
+// (0b0010 vs 0b0011).
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+ /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
+ [/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFP16]>;
+
+def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+ /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
+ [/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFP16]>;
+
+def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+ /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
+ [/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFP16]>;
+
+def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+ /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
+ [/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFP16]>;
+
+// Half <-> double conversions ("vcvtb"/"vcvtt" with .f64.f16 / .f16.f64).
+// All four require HasFPARMv8 and HasDPVFP and have no selection pattern in
+// the def itself. The mixed SPR/DPR operands are encoded explicitly in each
+// body.
+
+// VCVTBHD: vcvtb.f64.f16, SPR source -> DPR destination. Only the Sm field is
+// declared and encoded here.
+def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
+ (outs DPR:$Dd), (ins SPR:$Sm),
+ NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
+ // Instruction operands.
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+}
+
+// VCVTBDH: vcvtb.f16.f64, DPR source -> SPR destination.
+def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
+ (outs SPR:$Sd), (ins DPR:$Dm),
+ NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Dm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+}
+
+// VCVTTHD: vcvtt.f64.f16, SPR source -> DPR destination. Only the Sm field is
+// declared and encoded here.
+def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
+ (outs DPR:$Dd), (ins SPR:$Sm),
+ NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm",
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
+ // Instruction operands.
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+}
+
+// VCVTTDH: vcvtt.f16.f64, DPR source -> SPR destination.
+def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
+ (outs SPR:$Sd), (ins DPR:$Dm),
+ NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm",
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Dm;
+
+ // Encode instruction operands.
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+}
+
+// Selection patterns for fp_to_f16 / f16_to_fp. The half value is held in a
+// GPR at the DAG level, so each pattern copies between the GPR and SPR
+// register classes around the "vcvtb" instruction.
+
+// f32 -> f16: convert with VCVTBSH, then copy the result to a GPR.
+def : Pat<(fp_to_f16 SPR:$a),
+ (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+
+// f64 -> f16: convert with VCVTBDH, then copy the result to a GPR.
+def : Pat<(fp_to_f16 (f64 DPR:$a)),
+ (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
+
+// f16 -> f32: copy the GPR to an SPR, then convert with VCVTBHS.
+def : Pat<(f16_to_fp GPR:$a),
+ (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+// f16 -> f64: copy the GPR to an SPR, then convert with VCVTBHD.
+def : Pat<(f64 (f16_to_fp GPR:$a)),
+ (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+// VCVT{A,N,P,M}: float -> 32-bit integer conversions whose mnemonic carries a
+// suffix letter.
+// Parameters:
+//   opc  - suffix letter appended to "vcvt" in the assembly string.
+//   rm   - 2-bit value written to Inst{17-16} of every def.
+//   node - optional DAG operator; the patterns at the end match
+//          fp_to_sint/fp_to_uint applied to (node x). Defaults to null_frag.
+// Def suffixes: first letter S/U is the .s32/.u32 result, second letter H/S/D
+// is the f16/f32/f64 source. S* and U* defs differ in the fourth class
+// argument (0b11 vs 0b01).
+multiclass vcvt_inst<string opc, bits<2> rm,
+ SDPatternOperator node = null_frag> {
+ let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+ def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{17-16} = rm;
+ }
+
+ def UH : AHuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vcvt", opc, ".u32.f16\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{17-16} = rm;
+ }
+
+ def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFPARMv8]> {
+ let Inst{17-16} = rm;
+ }
+
+ def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFPARMv8]> {
+ let Inst{17-16} = rm;
+ }
+
+ // The f64-source forms still derive from ASuInp, so the DPR source is
+ // encoded explicitly and Inst{8} is set to 1.
+ def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+ (outs SPR:$Sd), (ins DPR:$Dm),
+ NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"),
+ []>,
+ Requires<[HasFPARMv8, HasDPVFP]> {
+ bits<5> Dm;
+
+ let Inst{17-16} = rm;
+
+ // Encode instruction operands
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+ let Inst{8} = 1;
+ }
+
+ def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+ (outs SPR:$Sd), (ins DPR:$Dm),
+ NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"),
+ []>,
+ Requires<[HasFPARMv8, HasDPVFP]> {
+ bits<5> Dm;
+
+ let Inst{17-16} = rm;
+
+ // Encode instruction operands
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+ let Inst{8} = 1;
+ }
+ }
+
+ // Selection patterns: (fp_to_[su]int (node x)) becomes a single conversion
+ // whose SPR result is copied to a GPR. No patterns are given for the f16
+ // source forms.
+ let Predicates = [HasFPARMv8] in {
+ def : Pat<(i32 (fp_to_sint (node SPR:$a))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"SS") SPR:$a),
+ GPR)>;
+ def : Pat<(i32 (fp_to_uint (node SPR:$a))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"US") SPR:$a),
+ GPR)>;
+ }
+ let Predicates = [HasFPARMv8, HasDPVFP] in {
+ def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"SD") DPR:$a),
+ GPR)>;
+ def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"UD") DPR:$a),
+ GPR)>;
+ }
+}
+
+// a -> fround, p -> fceil, m -> ffloor. VCVTN passes no node, so it uses the
+// null_frag default.
+defm VCVTA : vcvt_inst<"a", 0b00, fround>;
+defm VCVTN : vcvt_inst<"n", 0b01>;
+defm VCVTP : vcvt_inst<"p", 0b10, fceil>;
+defm VCVTM : vcvt_inst<"m", 0b11, ffloor>;
+
+// VNEG in f64 (VNEGD), f32 (VNEGS) and f16 (VNEGH) forms. The f64 and f32
+// forms are selected for fneg; the f16 form has no pattern.
+def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
+ (outs DPR:$Dd), (ins DPR:$Dm),
+ IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm",
+ [(set DPR:$Dd, (fneg (f64 DPR:$Dm)))]>;
+
+def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm",
+ [(set SPR:$Sd, (fneg SPR:$Sm))]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+// Half-precision form: assembly/encoding only, no selection pattern.
+def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
+ []>;
+
+// VRINT{Z,R,X}: round-to-integral in f16 (H), f32 (S) and f64 (D) forms.
+// Parameters:
+//   opc  - suffix letter appended to "vrint" to form the mnemonic.
+//   op   - bit written to Inst{16}.
+//   op2  - bit written to Inst{7}.
+//   node - DAG operator matched by the S and D patterns (H has no pattern).
+// These defs use the predicated instruction classes; the aliases below pass
+// pred:$p through.
+multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
+ def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vrint", opc), ".f16\t$Sd, $Sm",
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{7} = op2;
+ let Inst{16} = op;
+ }
+
+ def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
+ [(set (f32 SPR:$Sd), (node (f32 SPR:$Sm)))]>,
+ Requires<[HasFPARMv8]> {
+ let Inst{7} = op2;
+ let Inst{16} = op;
+ }
+ def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0,
+ (outs DPR:$Dd), (ins DPR:$Dm),
+ NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm",
+ [(set (f64 DPR:$Dd), (node (f64 DPR:$Dm)))]>,
+ Requires<[HasFPARMv8, HasDPVFP]> {
+ let Inst{7} = op2;
+ let Inst{16} = op;
+ }
+
+ // Aliases accepting the doubled type suffix (.f16.f16, .f32.f32, .f64.f64).
+ def : InstAlias<!strconcat("vrint", opc, "$p.f16.f16\t$Sd, $Sm"),
+ (!cast<Instruction>(NAME#"H") SPR:$Sd, SPR:$Sm, pred:$p), 0>,
+ Requires<[HasFullFP16]>;
+ def : InstAlias<!strconcat("vrint", opc, "$p.f32.f32\t$Sd, $Sm"),
+ (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p), 0>,
+ Requires<[HasFPARMv8]>;
+ def : InstAlias<!strconcat("vrint", opc, "$p.f64.f64\t$Dd, $Dm"),
+ (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p), 0>,
+ Requires<[HasFPARMv8,HasDPVFP]>;
+}
+
+// z -> ftrunc, r -> fnearbyint, x -> frint.
+defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc>;
+defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint>;
+defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>;
+
+// VRINT{A,N,P,M}: round-to-integral in f16 (H), f32 (S) and f64 (D) forms.
+// Parameters:
+//   opc  - suffix letter appended to "vrint" in the assembly string.
+//   rm   - 2-bit value written to Inst{17-16} of every def.
+//   node - optional DAG operator matched by the S and D patterns; defaults to
+//          null_frag. The H def has no pattern.
+// The defs live in the "VFPV8" decoder namespace with no post-encoder method,
+// and the aliases below take no predicate operand.
+multiclass vrint_inst_anpm<string opc, bits<2> rm,
+ SDPatternOperator node = null_frag> {
+ let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+ def H : AHuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vrint", opc, ".f16\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{17-16} = rm;
+ }
+ def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
+ [(set (f32 SPR:$Sd), (node (f32 SPR:$Sm)))]>,
+ Requires<[HasFPARMv8]> {
+ let Inst{17-16} = rm;
+ }
+ def D : ADuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+ (outs DPR:$Dd), (ins DPR:$Dm),
+ NoItinerary, !strconcat("vrint", opc, ".f64\t$Dd, $Dm"),
+ [(set (f64 DPR:$Dd), (node (f64 DPR:$Dm)))]>,
+ Requires<[HasFPARMv8, HasDPVFP]> {
+ let Inst{17-16} = rm;
+ }
+ }
+
+ // Aliases accepting the doubled type suffix (.f32.f32 and .f64.f64).
+ // NOTE(review): there is no ".f16.f16" alias for the H def here, unlike
+ // vrint_inst_zrx -- confirm whether that is intentional.
+ def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"),
+ (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm), 0>,
+ Requires<[HasFPARMv8]>;
+ def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"),
+ (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm), 0>,
+ Requires<[HasFPARMv8,HasDPVFP]>;
+}
+
+// a -> fround, p -> fceil, m -> ffloor. VRINTN passes no node, so it uses the
+// null_frag default.
+defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>;
+defm VRINTN : vrint_inst_anpm<"n", 0b01>;
+defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>;
+defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
+
+// VSQRT in f64 (VSQRTD), f32 (VSQRTS) and f16 (VSQRTH) forms. The f64 and
+// f32 forms are selected for fsqrt; the f16 form has no pattern. VSQRTS does
+// not override the execution domain D.
+def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
+ (outs DPR:$Dd), (ins DPR:$Dm),
+ IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm",
+ [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>;
+
+def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
+ [(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
+
+// Half-precision form: assembly/encoding only, no selection pattern.
+def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
+ []>;
+
+// Register-to-register moves. None of these defs has a selection pattern, and
+// all are marked hasSideEffects = 0.
+let hasSideEffects = 0 in {
+def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
+ (outs DPR:$Dd), (ins DPR:$Dm),
+ IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>;
+
+def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+
+// Half-precision forms, requiring HasFullFP16. Note that the def named VMOVH
+// carries the assembly string "vmovx.f16"; VINSH is "vins.f16".
+let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+def VMOVH : ASuInp<0b11101, 0b11, 0b0000, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vmovx.f16\t$Sd, $Sm", []>,
+ Requires<[HasFullFP16]>;
+
+def VINSH : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vins.f16\t$Sd, $Sm", []>,
+ Requires<[HasFullFP16]>;
+} // PostEncoderMethod
+} // hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// FP <-> GPR Copies. Int <-> FP Conversions.
+//
+
+// VMOVRS: move an SPR to a GPR; selected for a bitconvert from SPR to GPR.
+def VMOVRS : AVConv2I<0b11100001, 0b1010,
+ (outs GPR:$Rt), (ins SPR:$Sn),
+ IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
+ [(set GPR:$Rt, (bitconvert SPR:$Sn))]> {
+ // Instruction operands.
+ bits<4> Rt;
+ bits<5> Sn;
+
+ // Encode instruction operands.
+ // SPR: upper four bits in Inst{19-16}, lowest bit in Inst{7}.
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Rt;
+
+ // Remaining low bits are fixed at zero.
+ let Inst{6-5} = 0b00;
+ let Inst{3-0} = 0b0000;
+
+ // Some single precision VFP instructions may be executed on both NEON and VFP
+ // pipelines.
+ let D = VFPNeonDomain;
+}
+
+// Bitcast i32 -> f32. NEON prefers to use VMOVDRR.
+// VMOVSR: move a GPR to an SPR; selected for a bitconvert from GPR to SPR
+// when HasVFP2 and UseVMOVSR hold.
+def VMOVSR : AVConv4I<0b11100000, 0b1010,
+ (outs SPR:$Sn), (ins GPR:$Rt),
+ IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
+ [(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
+ Requires<[HasVFP2, UseVMOVSR]> {
+ // Instruction operands.
+ bits<5> Sn;
+ bits<4> Rt;
+
+ // Encode instruction operands.
+ // SPR: upper four bits in Inst{19-16}, lowest bit in Inst{7}.
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Rt;
+
+ // Remaining low bits are fixed at zero.
+ let Inst{6-5} = 0b00;
+ let Inst{3-0} = 0b0000;
+
+ // Some single precision VFP instructions may be executed on both NEON and VFP
+ // pipelines.
+ let D = VFPNeonDomain;
+}
+
+// Moves from VFP registers into a pair of GPRs. Neither def has a selection
+// pattern; both are marked hasSideEffects = 0.
+let hasSideEffects = 0 in {
+// VMOVRRD: one DPR -> two GPRs.
+def VMOVRRD : AVConv3I<0b11000101, 0b1011,
+ (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
+ IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
+ [/* FIXME: Can't write pattern for multiple result instr*/]> {
+ // Instruction operands.
+ bits<5> Dm;
+ bits<4> Rt;
+ bits<4> Rt2;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = Rt2;
+
+ let Inst{7-6} = 0b00;
+
+ // Some single precision VFP instructions may be executed on both NEON and VFP
+ // pipelines.
+ let D = VFPNeonDomain;
+
+ // This instruction is equivalent to
+ // $Rt = EXTRACT_SUBREG $Dm, ssub_0
+ // $Rt2 = EXTRACT_SUBREG $Dm, ssub_1
+ let isExtractSubreg = 1;
+}
+
+// VMOVRRS: two SPRs -> two GPRs. Only $src1 is declared and encoded in this
+// body; $src2 has no field here, and decoding goes through the custom
+// DecodeVMOVRRS method.
+def VMOVRRS : AVConv3I<0b11000101, 0b1010,
+ (outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
+ IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
+ [/* For disassembly only; pattern left blank */]> {
+ bits<5> src1;
+ bits<4> Rt;
+ bits<4> Rt2;
+
+ // Encode instruction operands.
+ let Inst{3-0} = src1{4-1};
+ let Inst{5} = src1{0};
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = Rt2;
+
+ let Inst{7-6} = 0b00;
+
+ // Some single precision VFP instructions may be executed on both NEON and VFP
+ // pipelines.
+ let D = VFPNeonDomain;
+ let DecoderMethod = "DecodeVMOVRRS";
+}
+} // hasSideEffects
+
+// FMDHR: GPR -> SPR
+// FMDLR: GPR -> SPR
+
+// VMOVDRR: two GPRs -> one DPR; selected for arm_fmdrr.
+def VMOVDRR : AVConv5I<0b11000100, 0b1011,
+ (outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2),
+ IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2",
+ [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> {
+ // Instruction operands.
+ bits<5> Dm;
+ bits<4> Rt;
+ bits<4> Rt2;
+
+ // Encode instruction operands.
+ // DPR: lower four bits in Inst{3-0}, top bit in Inst{5}.
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = Rt2;
+
+ let Inst{7-6} = 0b00;
+
+ // Some single precision VFP instructions may be executed on both NEON and VFP
+ // pipelines.
+ let D = VFPNeonDomain;
+
+ // This instruction is equivalent to
+ // $Dm = REG_SEQUENCE $Rt, ssub_0, $Rt2, ssub_1
+ let isRegSequence = 1;
+}
+
+// Hoist an fabs or a fneg of a value coming from integer registers
+// and do the fabs/fneg on the integer value. This is never a loss
+// and could enable the conversion to float to be removed completely.
+// Only the second arm_fmdrr operand ($Rh) is modified; $Rl is passed through
+// unchanged.
+
+// fabs: apply BFC / t2BFC with the immediate 0x7FFFFFFF to $Rh.
+def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+ (VMOVDRR GPR:$Rl, (BFC GPR:$Rh, (i32 0x7FFFFFFF)))>,
+ Requires<[IsARM, HasV6T2]>;
+def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+ (VMOVDRR GPR:$Rl, (t2BFC GPR:$Rh, (i32 0x7FFFFFFF)))>,
+ Requires<[IsThumb2, HasV6T2]>;
+// fneg: apply EORri / t2EORri with the immediate 0x80000000 to $Rh.
+def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+ (VMOVDRR GPR:$Rl, (EORri GPR:$Rh, (i32 0x80000000)))>,
+ Requires<[IsARM]>;
+def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+ (VMOVDRR GPR:$Rl, (t2EORri GPR:$Rh, (i32 0x80000000)))>,
+ Requires<[IsThumb2]>;
+
+// VMOVSRR: two GPRs -> two SPRs. No selection pattern. Only $dst1 is declared
+// and encoded in this body; $dst2 has no field here, and decoding goes
+// through the custom DecodeVMOVSRR method.
+let hasSideEffects = 0 in
+def VMOVSRR : AVConv5I<0b11000100, 0b1010,
+ (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
+ IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
+ [/* For disassembly only; pattern left blank */]> {
+ // Instruction operands.
+ bits<5> dst1;
+ bits<4> src1;
+ bits<4> src2;
+
+ // Encode instruction operands.
+ let Inst{3-0} = dst1{4-1};
+ let Inst{5} = dst1{0};
+ let Inst{15-12} = src1;
+ let Inst{19-16} = src2;
+
+ let Inst{7-6} = 0b00;
+
+ // Some single precision VFP instructions may be executed on both NEON and VFP
+ // pipelines.
+ let D = VFPNeonDomain;
+
+ let DecoderMethod = "DecodeVMOVSRR";
+}
+
+// Move H->R, clearing top 16 bits
+// VMOVRH: "vmov.f16 $Rt, $Sn". No selection pattern; requires HasFullFP16.
+// The operand encoding is laid out the same way as in VMOVRS; the second
+// class argument is 0b1001 instead of 0b1010.
+def VMOVRH : AVConv2I<0b11100001, 0b1001,
+ (outs GPR:$Rt), (ins SPR:$Sn),
+ IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
+ []>,
+ Requires<[HasFullFP16]> {
+ // Instruction operands.
+ bits<4> Rt;
+ bits<5> Sn;
+
+ // Encode instruction operands.
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Rt;
+
+ let Inst{6-5} = 0b00;
+ let Inst{3-0} = 0b0000;
+}
+
+// Move R->H, clearing top 16 bits
+// VMOVHR: "vmov.f16 $Sn, $Rt". No selection pattern; requires HasFullFP16.
+// The operand encoding is laid out the same way as in VMOVSR; the second
+// class argument is 0b1001 instead of 0b1010.
+def VMOVHR : AVConv4I<0b11100000, 0b1001,
+ (outs SPR:$Sn), (ins GPR:$Rt),
+ IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
+ []>,
+ Requires<[HasFullFP16]> {
+ // Instruction operands.
+ bits<5> Sn;
+ bits<4> Rt;
+
+ // Encode instruction operands.
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Rt;
+
+ let Inst{6-5} = 0b00;
+ let Inst{3-0} = 0b0000;
+}
+
+// FMRDH: SPR -> GPR
+// FMRDL: SPR -> GPR
+// FMRRS: SPR -> GPR
+// FMRX: SPR system reg -> GPR
+// FMSRR: GPR -> SPR
+// FMXR: GPR -> VFP system reg
+
+
+// Int -> FP:
+
+// AVConv1IDs_Encode: AVConv1I with a Dd destination field and an Sm source
+// field, both encoded explicitly. All template arguments are forwarded to
+// AVConv1I unchanged. Requires HasVFP2 and HasDPVFP. Used below by the
+// int -> f64 conversions.
+class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Dd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ // Sm: upper four bits in Inst{3-0}, lowest bit in Inst{5}.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ // Dd: lower four bits in Inst{15-12}, top bit in Inst{22}.
+ let Inst{15-12} = Dd{3-0};
+ let Inst{22} = Dd{4};
+
+ let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// AVConv1InSs_Encode: AVConv1In with Sd destination and Sm source fields,
+// both encoded explicitly. All template arguments are forwarded to AVConv1In
+// unchanged. This class sets no Predicates of its own. Used below by the
+// int -> f32 conversions.
+class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ // Both fields: upper four bits in the 4-bit slot, lowest bit in the
+ // single-bit slot.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+}
+
+// AVConv1IHs_Encode: AVConv1I with Sd destination and Sm source fields, both
+// encoded explicitly. All template arguments are forwarded to AVConv1I
+// unchanged. Requires HasFullFP16. Used below by the int -> f16 conversions.
+class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Predicates = [HasFullFP16];
+}
+
+// VSITOD: "vcvt.f64.s32", signed int (held in an SPR) -> f64. The def itself
+// has no pattern; selection happens through the VFPPat records below.
+def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+ (outs DPR:$Dd), (ins SPR:$Sm),
+ IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
+ []> {
+ let Inst{7} = 1; // s32
+}
+
+let Predicates=[HasVFP2, HasDPVFP] in {
+ // Integer in a GPR: copy it into the SPR class first.
+ def : VFPPat<(f64 (sint_to_fp GPR:$a)),
+ (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+ // Integer coming from an aligned 32-bit load: load it directly into an SPR
+ // with VLDRS.
+ def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+ (VSITOD (VLDRS addrmode5:$a))>;
+}
+
+// VSITOS: "vcvt.f32.s32", signed int (held in an SPR) -> f32. The def itself
+// has no pattern; selection happens through the VFPNoNEONPat records below.
+def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+ (outs SPR:$Sd),(ins SPR:$Sm),
+ IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // s32
+
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+// Integer in a GPR: copy it into the SPR class first.
+def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)),
+ (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+// Integer coming from an aligned 32-bit load: load it directly with VLDRS.
+def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+ (VSITOS (VLDRS addrmode5:$a))>;
+
+// VSITOH: "vcvt.f16.s32", signed int (held in an SPR) -> f16. No selection
+// pattern; HasFullFP16 comes from AVConv1IHs_Encode.
+def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // s32
+}
+
+// VUITOD: "vcvt.f64.u32", unsigned int (held in an SPR) -> f64. Same class
+// arguments as VSITOD; only Inst{7} differs (0 here). The def itself has no
+// pattern; selection happens through the VFPPat records below.
+def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+ (outs DPR:$Dd), (ins SPR:$Sm),
+ IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
+ []> {
+ let Inst{7} = 0; // u32
+}
+
+let Predicates=[HasVFP2, HasDPVFP] in {
+ // Integer in a GPR: copy it into the SPR class first.
+ def : VFPPat<(f64 (uint_to_fp GPR:$a)),
+ (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+ // Integer coming from an aligned 32-bit load: load it directly into an SPR
+ // with VLDRS.
+ def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+ (VUITOD (VLDRS addrmode5:$a))>;
+}
+
+// VUITOS: "vcvt.f32.u32", unsigned int (held in an SPR) -> f32. Same class
+// arguments as VSITOS; only Inst{7} differs (0 here). The def itself has no
+// pattern; selection happens through the VFPNoNEONPat records below.
+def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // u32
+
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+// Integer in a GPR: copy it into the SPR class first.
+def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)),
+ (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+// Integer coming from an aligned 32-bit load: load it directly with VLDRS.
+def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+ (VUITOS (VLDRS addrmode5:$a))>;
+
+// VUITOH: "vcvt.f16.u32", unsigned int (held in an SPR) -> f16. Same class
+// arguments as VSITOH; only Inst{7} differs (0 here). No selection pattern.
+def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // u32
+}
+
+// FP -> Int:
+
+class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Dm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Dm{3-0};
+ let Inst{5} = Dm{4};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Predicates = [HasVFP2, HasDPVFP];
+}
+
+class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+}
+
+class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Predicates = [HasFullFP16];
+}
+
+// Always set Z bit in the instruction, i.e. "round towards zero" variants.
+def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+ (outs SPR:$Sd), (ins DPR:$Dm),
+ IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm",
+ []> {
+ let Inst{7} = 1; // Z bit
+}
+
+let Predicates=[HasVFP2, HasDPVFP] in {
+ def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))),
+ (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>;
+
+ def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr),
+ (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
+}
+
+def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // Z bit
+
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)),
+ (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>;
+
+def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
+ addrmode5:$ptr),
+ (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
+
+def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // Z bit
+}
+
+def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+ (outs SPR:$Sd), (ins DPR:$Dm),
+ IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
+ []> {
+ let Inst{7} = 1; // Z bit
+}
+
+let Predicates=[HasVFP2, HasDPVFP] in {
+ def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))),
+ (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>;
+
+ def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr),
+ (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
+}
+
+def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // Z bit
+
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)),
+ (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>;
+
+def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
+ addrmode5:$ptr),
+ (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
+
+def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // Z bit
+}
+
+// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
+let Uses = [FPSCR] in {
+// FIXME: Verify encoding after integrated assembler is working.
+def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+ (outs SPR:$Sd), (ins DPR:$Dm),
+ IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm",
+ [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{
+ let Inst{7} = 0; // Z bit
+}
+
+def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm",
+ [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> {
+ let Inst{7} = 0; // Z bit
+}
+
+def VTOSIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvtr", ".s32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+ (outs SPR:$Sd), (ins DPR:$Dm),
+ IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm",
+ [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{
+ let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm",
+ [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> {
+ let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvtr", ".u32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // Z bit
+}
+}
+
+// Convert between floating-point and fixed-point
+// Data type for fixed-point naming convention:
+// S16 (U=0, sx=0) -> SH
+// U16 (U=1, sx=0) -> UH
+// S32 (U=0, sx=1) -> SL
+// U32 (U=1, sx=1) -> UL
+
+let Constraints = "$a = $dst" in {
+
+// FP to Fixed-Point:
+
+// Single Precision register
+class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+ bit op5, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+ Sched<[WriteCvtFP]> {
+ bits<5> dst;
+ // "if dp_operation then UInt(D:Vd) else UInt(Vd:D)": this is the Vd:D form.
+ let Inst{22} = dst{0}; // D = lowest bit of the 5-bit register number
+ let Inst{15-12} = dst{4-1}; // Vd = upper four bits
+}
+
+// Double Precision register
+class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+ bit op5, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+ Sched<[WriteCvtFP]> {
+ bits<5> dst;
+ // "if dp_operation then UInt(D:Vd) else UInt(Vd:D)": this is the D:Vd form.
+ let Inst{22} = dst{4}; // D = highest bit of the 5-bit register number
+ let Inst{15-12} = dst{3-0}; // Vd = lower four bits
+
+ let Predicates = [HasVFP2, HasDPVFP];
+}
+
+def VTOSHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTHI, "vcvt", ".s16.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOUHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTHI, "vcvt", ".u16.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOSLH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTHI, "vcvt", ".s32.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOULH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTHI, "vcvt", ".u32.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VTOSHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 0,
+ (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+ IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>;
+
+def VTOUHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 0,
+ (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+ IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>;
+
+def VTOSLD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 1,
+ (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+ IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>;
+
+def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1,
+ (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+ IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>;
+
+// Fixed-Point to FP:
+
+def VSHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.s16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VUHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.u16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VSLTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.s32\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VULTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.u32\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VSHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 0,
+ (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+ IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>;
+
+def VUHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 0,
+ (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+ IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>;
+
+def VSLTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 1,
+ (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+ IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>;
+
+def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,
+ (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+ IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>;
+
+} // End of 'let Constraints = "$a = $dst" in'
+
+//===----------------------------------------------------------------------===//
+// FP Multiply-Accumulate Operations.
+//
+
+def VMLAD : ADbI<0b11100, 0b00, 0, 0,
+ (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+ IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+ (f64 DPR:$Ddin)))]>,
+ RegConstraint<"$Ddin = $Dd">,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
+ SPR:$Sdin))]>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VMLAH : AHbI<0b11100, 0b00, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+ (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+ (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+ Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLSD : ADbI<0b11100, 0b00, 1, 0,
+ (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+ IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+ (f64 DPR:$Ddin)))]>,
+ RegConstraint<"$Ddin = $Dd">,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+ SPR:$Sdin))]>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VMLSH : AHbI<0b11100, 0b00, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+ (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+ (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
+ (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+ IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+ (f64 DPR:$Ddin)))]>,
+ RegConstraint<"$Ddin = $Dd">,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+ SPR:$Sdin))]>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
+def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
+ (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
+ (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
+ (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+ IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+ (f64 DPR:$Ddin)))]>,
+ RegConstraint<"$Ddin = $Dd">,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines on A8.
+ let D = VFPNeonA8Domain;
+}
+
+def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
+def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
+ (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
+ (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+//===----------------------------------------------------------------------===//
+// Fused FP Multiply-Accumulate Operations.
+//
+def VFMAD : ADbI<0b11101, 0b10, 0, 0,
+ (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+ IIC_fpFMAC64, "vfma", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+ (f64 DPR:$Ddin)))]>,
+ RegConstraint<"$Ddin = $Dd">,
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC32, "vfma", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
+ SPR:$Sdin))]>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines.
+}
+
+def VFMAH : AHbI<0b11101, 0b10, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+ (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+ (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma x, y, z) -> (vfma z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
+ (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+ Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
+ (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+ Requires<[HasVFP4]>;
+
+def VFMSD : ADbI<0b11101, 0b10, 1, 0,
+ (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+ IIC_fpFMAC64, "vfms", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+ (f64 DPR:$Ddin)))]>,
+ RegConstraint<"$Ddin = $Dd">,
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+ SPR:$Sdin))]>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines.
+}
+
+def VFMSH : AHbI<0b11101, 0b10, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+ (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+ (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma (fneg x), y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
+ (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+ Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
+ (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+ Requires<[HasVFP4]>;
+// (fma x, (fneg y), z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
+ (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+ Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
+ (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+ Requires<[HasVFP4]>;
+
+def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
+ (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+ IIC_fpFMAC64, "vfnma", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+ (f64 DPR:$Ddin)))]>,
+ RegConstraint<"$Ddin = $Dd">,
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC32, "vfnma", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+ SPR:$Sdin))]>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines.
+}
+
+def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
+def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
+ (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
+ (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>,
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fneg (fma x, y, z)) -> (vfnma z, x, y)
+def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
+ (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+ Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
+ (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+ Requires<[HasVFP4]>;
+// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
+ (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+ Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
+ (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+ Requires<[HasVFP4]>;
+
+def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
+ (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+ IIC_fpFMAC64, "vfnms", ".f64\t$Dd, $Dn, $Dm",
+ [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+ (f64 DPR:$Ddin)))]>,
+ RegConstraint<"$Ddin = $Dd">,
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm",
+ [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ // Some single precision VFP instructions may be executed on both NEON and
+ // VFP pipelines.
+}
+
+def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
+def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
+ (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
+ (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>,
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+
+// (fma x, y, (fneg z)) -> (vfnms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
+ (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+ Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
+ (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+ Requires<[HasVFP4]>;
+// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
+ (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+ Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
+ (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+ Requires<[HasVFP4]>;
+// (fneg (fma x, (fneg y), z)) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
+ (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+ Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
+ (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+ Requires<[HasVFP4]>;
+
+//===----------------------------------------------------------------------===//
+// FP Conditional moves.
+//
+
+let hasSideEffects = 0 in {
+def VMOVDcc : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p),
+ IIC_fpUNA64,
+ [(set (f64 DPR:$Dd),
+ (ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>,
+ RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>;
+
+def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
+ IIC_fpUNA32,
+ [(set (f32 SPR:$Sd),
+ (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
+ RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
+} // hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// Move from VFP System Register to ARM core register.
+//
+
+class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+ list<dag> pattern>:
+ VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+ // Instruction operand: 4-bit register field placed in Inst{15-12}.
+ bits<4> Rt;
+
+ let Inst{27-20} = 0b11101111;
+ let Inst{19-16} = opc19_16; // per-def selector, e.g. 0b0001 for fpscr
+ let Inst{15-12} = Rt;
+ let Inst{11-8} = 0b1010;
+ let Inst{7} = 0;
+ let Inst{6-5} = 0b00;
+ let Inst{4} = 1;
+ let Inst{3-0} = 0b0000;
+}
+
+// APSR is the application level alias of CPSR. This moves the FPSCR N, Z, C,
+// V flags to APSR.
+let Defs = [CPSR], Uses = [FPSCR_NZCV], Rt = 0b1111 /* apsr_nzcv */ in
+def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins),
+ "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>;
+
+// Application level FPSCR -> GPR
+let hasSideEffects = 1, Uses = [FPSCR] in
+def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, fpscr",
+ [(set GPR:$Rt, (int_arm_get_fpscr))]>;
+
+// System level FPEXC, FPSID -> GPR
+let Uses = [FPSCR] in {
+ def VMRS_FPEXC : MovFromVFP<0b1000 /* fpexc */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, fpexc", []>;
+ def VMRS_FPSID : MovFromVFP<0b0000 /* fpsid */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, fpsid", []>;
+ def VMRS_MVFR0 : MovFromVFP<0b0111 /* mvfr0 */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, mvfr0", []>;
+ def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, mvfr1", []>;
+ def VMRS_MVFR2 : MovFromVFP<0b0101 /* mvfr2 */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, mvfr2", []>, Requires<[HasFPARMv8]>;
+ def VMRS_FPINST : MovFromVFP<0b1001 /* fpinst */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, fpinst", []>;
+ def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, fpinst2", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move from ARM core register to VFP System Register.
+//
+
+class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+ list<dag> pattern>:
+ VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+ // Instruction operand: 4-bit source register field.
+ bits<4> src;
+
+ // Encode instruction operand in Inst{15-12}.
+ let Inst{15-12} = src;
+
+ let Inst{27-20} = 0b11101110;
+ let Inst{19-16} = opc19_16; // per-def selector, e.g. 0b0001 for fpscr
+ let Inst{11-8} = 0b1010;
+ let Inst{7} = 0;
+ let Inst{4} = 1;
+}
+
+let Defs = [FPSCR] in {
+ // Application level GPR -> FPSCR
+ def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPR:$src),
+ "vmsr", "\tfpscr, $src", [(int_arm_set_fpscr GPR:$src)]>;
+ // System level GPR -> FPEXC
+ def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPR:$src),
+ "vmsr", "\tfpexc, $src", []>;
+ // System level GPR -> FPSID
+ def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src),
+ "vmsr", "\tfpsid, $src", []>;
+
+ def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPR:$src),
+ "vmsr", "\tfpinst, $src", []>;
+ def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPR:$src),
+ "vmsr", "\tfpinst2, $src", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc.
+//
+
+// Materialize FP immediates. VFP3 only.
+let isReMaterializable = 1 in {
+def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
+ VFPMiscFrm, IIC_fpUNA64,
+ "vmov", ".f64\t$Dd, $imm",
+ [(set DPR:$Dd, vfp_f64imm:$imm)]>,
+ Requires<[HasVFP3,HasDPVFP]> {
+ bits<5> Dd;
+ bits<8> imm;
+
+ let Inst{27-23} = 0b11101;
+ let Inst{22} = Dd{4};
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{7-4};
+ let Inst{15-12} = Dd{3-0};
+ let Inst{11-9} = 0b101;
+ let Inst{8} = 1; // Double precision.
+ let Inst{7-4} = 0b0000;
+ let Inst{3-0} = imm{3-0};
+}
+
+def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
+ VFPMiscFrm, IIC_fpUNA32,
+ "vmov", ".f32\t$Sd, $imm",
+ [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
+ bits<5> Sd;
+ bits<8> imm;
+
+ let Inst{27-23} = 0b11101;
+ let Inst{22} = Sd{0};
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{7-4};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{11-9} = 0b101;
+ let Inst{8} = 0; // Single precision.
+ let Inst{7-4} = 0b0000;
+ let Inst{3-0} = imm{3-0};
+}
+
+def FCONSTH : VFPAI<(outs SPR:$Sd), (ins vfp_f16imm:$imm),
+ VFPMiscFrm, IIC_fpUNA16,
+ "vmov", ".f16\t$Sd, $imm",
+ []>, Requires<[HasFullFP16]> {
+ bits<5> Sd;
+ bits<8> imm;
+
+ let Inst{27-23} = 0b11101;
+ let Inst{22} = Sd{0};
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{7-4};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{7-4} = 0b0000;
+ let Inst{3-0} = imm{3-0};
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases.
+//
+// A few mnemonic aliases for pre-unified syntax. We don't guarantee to
+// support them all, but supporting at least some of the basics is
+// good to be friendly.
+def : VFP2MnemonicAlias<"flds", "vldr">;
+def : VFP2MnemonicAlias<"fldd", "vldr">;
+def : VFP2MnemonicAlias<"fmrs", "vmov">;
+def : VFP2MnemonicAlias<"fmsr", "vmov">;
+def : VFP2MnemonicAlias<"fsqrts", "vsqrt">;
+def : VFP2MnemonicAlias<"fsqrtd", "vsqrt">;
+def : VFP2MnemonicAlias<"fadds", "vadd.f32">;
+def : VFP2MnemonicAlias<"faddd", "vadd.f64">;
+def : VFP2MnemonicAlias<"fmrdd", "vmov">;
+def : VFP2MnemonicAlias<"fmrds", "vmov">;
+def : VFP2MnemonicAlias<"fmrrd", "vmov">;
+def : VFP2MnemonicAlias<"fmdrr", "vmov">;
+def : VFP2MnemonicAlias<"fmuls", "vmul.f32">;
+def : VFP2MnemonicAlias<"fmuld", "vmul.f64">;
+def : VFP2MnemonicAlias<"fnegs", "vneg.f32">;
+def : VFP2MnemonicAlias<"fnegd", "vneg.f64">;
+def : VFP2MnemonicAlias<"ftosizd", "vcvt.s32.f64">;
+def : VFP2MnemonicAlias<"ftosid", "vcvtr.s32.f64">;
+def : VFP2MnemonicAlias<"ftosizs", "vcvt.s32.f32">;
+def : VFP2MnemonicAlias<"ftosis", "vcvtr.s32.f32">;
+def : VFP2MnemonicAlias<"ftouizd", "vcvt.u32.f64">;
+def : VFP2MnemonicAlias<"ftouid", "vcvtr.u32.f64">;
+def : VFP2MnemonicAlias<"ftouizs", "vcvt.u32.f32">;
+def : VFP2MnemonicAlias<"ftouis", "vcvtr.u32.f32">;
+def : VFP2MnemonicAlias<"fsitod", "vcvt.f64.s32">;
+def : VFP2MnemonicAlias<"fsitos", "vcvt.f32.s32">;
+def : VFP2MnemonicAlias<"fuitod", "vcvt.f64.u32">;
+def : VFP2MnemonicAlias<"fuitos", "vcvt.f32.u32">;
+def : VFP2MnemonicAlias<"fsts", "vstr">;
+def : VFP2MnemonicAlias<"fstd", "vstr">;
+def : VFP2MnemonicAlias<"fmacd", "vmla.f64">;
+def : VFP2MnemonicAlias<"fmacs", "vmla.f32">;
+def : VFP2MnemonicAlias<"fcpys", "vmov.f32">;
+def : VFP2MnemonicAlias<"fcpyd", "vmov.f64">;
+def : VFP2MnemonicAlias<"fcmps", "vcmp.f32">;
+def : VFP2MnemonicAlias<"fcmpd", "vcmp.f64">;
+def : VFP2MnemonicAlias<"fdivs", "vdiv.f32">;
+def : VFP2MnemonicAlias<"fdivd", "vdiv.f64">;
+def : VFP2MnemonicAlias<"fmrx", "vmrs">;
+def : VFP2MnemonicAlias<"fmxr", "vmsr">;
+
+// Be friendly and accept the old form of zero-compare
+def : VFP2DPInstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
+def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>;
+
+
+def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
+def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm",
+ (VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"faddd${p} $Dd, $Dn, $Dm",
+ (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2InstAlias<"fsubs${p} $Sd, $Sn, $Sm",
+ (VSUBS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"fsubd${p} $Dd, $Dn, $Dm",
+ (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+
+// No need for the size suffix on VSQRT. It's implied by the register classes.
+def : VFP2InstAlias<"vsqrt${p} $Sd, $Sm", (VSQRTS SPR:$Sd, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+// VLDR/VSTR accept an optional type suffix.
+def : VFP2InstAlias<"vldr${p}.32 $Sd, $addr",
+ (VLDRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.32 $Sd, $addr",
+ (VSTRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vldr${p}.64 $Dd, $addr",
+ (VLDRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.64 $Dd, $addr",
+ (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+
+// VMOV can accept an optional data type suffix of 32 bits or less.
+def : VFP2InstAlias<"vmov${p}.8 $Rt, $Sn",
+ (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Rt, $Sn",
+ (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Rt, $Sn",
+ (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.8 $Sn, $Rt",
+ (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Sn, $Rt",
+ (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Sn, $Rt",
+ (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+
+def : VFP2InstAlias<"vmov${p}.f64 $Rt, $Rt2, $Dn",
+ (VMOVRRD GPR:$Rt, GPR:$Rt2, DPR:$Dn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.f64 $Dn, $Rt, $Rt2",
+ (VMOVDRR DPR:$Dn, GPR:$Rt, GPR:$Rt2, pred:$p)>;
+
+// VMOVS doesn't need the .f32 to disambiguate from the NEON encoding the way
+// VMOVD does.
+def : VFP2InstAlias<"vmov${p} $Sd, $Sm",
+ (VMOVS SPR:$Sd, SPR:$Sm, pred:$p)>;
+
+// FCONSTD/FCONSTS alias for vmov.f64/vmov.f32
+// These aliases provide added functionality over vmov.f instructions by
+// allowing users to write assembly containing encoded floating point constants
+// (e.g. #0x70 vs #1.0). Without these aliases there is no way for the
+// assembler to accept encoded fp constants (but the equivalent fp-literal is
+// accepted directly by vmov.f32/vmov.f64).
+def : VFP3InstAlias<"fconstd${p} $Dd, $val",
+ (FCONSTD DPR:$Dd, vfp_f64imm:$val, pred:$p)>;
+def : VFP3InstAlias<"fconsts${p} $Sd, $val",
+ (FCONSTS SPR:$Sd, vfp_f32imm:$val, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
new file mode 100644
index 000000000000..2bdbe4fca3de
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -0,0 +1,109 @@
+//===- ARMInstructionSelector.cpp ----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstructionSelector.h"
+#include "ARMRegisterBankInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "arm-isel"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+/// Bind the selector to the instruction info and register info obtained from
+/// \p STI and to the register bank info \p RBI. Only references are stored.
+ARMInstructionSelector::ARMInstructionSelector(const ARMSubtarget &STI,
+ const ARMRegisterBankInfo &RBI)
+ : InstructionSelector(), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+
+/// Handle a COPY whose destination is operand 0 and whose source is operand 1.
+///
+/// A physical destination needs no work. A virtual destination must live in
+/// the GPR register bank and is constrained to the GPR register class; the
+/// source is deliberately left alone, since it gets constrained when one of
+/// its other uses or its def is visited.
+///
+/// \returns true on success, false if the destination could not be
+/// constrained.
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+                       const RegisterBankInfo &RBI) {
+  const unsigned Dst = I.getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Dst))
+    return true;
+
+  const RegisterBank *Bank = RBI.getRegBank(Dst, MRI, TRI);
+  (void)Bank;
+  assert(Bank && "Can't get reg bank for virtual register");
+
+  const unsigned DstBits = MRI.getType(Dst).getSizeInBits();
+  (void)DstBits;
+  const unsigned Src = I.getOperand(1).getReg();
+  const unsigned SrcBits = RBI.getSizeInBits(Src, MRI, TRI);
+  (void)SrcBits;
+  // Copies are a means to setup initial types, so a copy from a physical
+  // register may legitimately narrow the value.
+  assert((DstBits == SrcBits ||
+          (TargetRegisterInfo::isPhysicalRegister(Src) &&
+           DstBits <= SrcBits)) &&
+         "Copy with different width?!");
+
+  assert(Bank->getID() == ARM::GPRRegBankID && "Unsupported reg bank");
+
+  // Copies themselves do not carry register class constraints; only the
+  // destination is pinned here.
+  if (RBI.constrainGenericRegister(Dst, ARM::GPRRegClass, MRI))
+    return true;
+
+  DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+               << " operand\n");
+  return false;
+}
+
+/// Select \p I in place.
+///
+/// Non-generic instructions are accepted as they are, except that copies go
+/// through selectCopy(). Of the generic opcodes only G_ADD, G_FRAME_INDEX and
+/// G_LOAD are handled: the instruction descriptor is swapped for the ARM
+/// opcode, the missing immediate / predicate / CC operands are appended, and
+/// the register operands are constrained.
+///
+/// \returns false for unsupported generic opcodes or if constraining fails.
+bool ARMInstructionSelector::select(MachineInstr &I) const {
+  assert(I.getParent() && "Instruction should be in a basic block!");
+  assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+  MachineFunction &Func = *I.getParent()->getParent();
+  MachineRegisterInfo &RegInfo = Func.getRegInfo();
+  const unsigned Opc = I.getOpcode();
+
+  if (!isPreISelGenericOpcode(Opc))
+    return !I.isCopy() || selectCopy(I, TII, RegInfo, TRI, RBI);
+
+  MachineInstrBuilder Builder{Func, I};
+
+  switch (Opc) {
+  case TargetOpcode::G_ADD:
+    I.setDesc(TII.get(ARM::ADDrr));
+    AddDefaultCC(AddDefaultPred(Builder));
+    break;
+  case TargetOpcode::G_FRAME_INDEX:
+    // Add 0 to the given frame index and hope it will eventually be folded
+    // into the user(s).
+    I.setDesc(TII.get(ARM::ADDri));
+    AddDefaultCC(AddDefaultPred(Builder.addImm(0)));
+    break;
+  case TargetOpcode::G_LOAD:
+    I.setDesc(TII.get(ARM::LDRi12));
+    AddDefaultPred(Builder.addImm(0));
+    break;
+  default:
+    return false;
+  }
+
+  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h
new file mode 100644
index 000000000000..5072cdd60ce4
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h
@@ -0,0 +1,39 @@
+//===- ARMInstructionSelector ------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the InstructionSelector class for ARM.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMINSTRUCTIONSELECTOR_H
+#define LLVM_LIB_TARGET_ARM_ARMINSTRUCTIONSELECTOR_H
+
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+
+namespace llvm {
+class ARMBaseInstrInfo;
+class ARMBaseRegisterInfo;
+class ARMBaseTargetMachine;
+class ARMRegisterBankInfo;
+class ARMSubtarget;
+
+/// InstructionSelector implementation for the ARM target.
+class ARMInstructionSelector : public InstructionSelector {
+public:
+  /// \param STI Subtarget that supplies the instruction and register info.
+  /// \param RBI Register bank info used when constraining registers.
+  ARMInstructionSelector(const ARMSubtarget &STI,
+                         const ARMRegisterBankInfo &RBI);
+
+  /// Select \p I in place.
+  /// \returns true on success, false if \p I could not be selected.
+  bool select(MachineInstr &I) const override;
+
+private:
+  // All three are references to objects owned elsewhere.
+  const ARMBaseInstrInfo &TII;
+  const ARMBaseRegisterInfo &TRI;
+  const ARMRegisterBankInfo &RBI;
+};
+
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
new file mode 100644
index 000000000000..255ea4bc7198
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -0,0 +1,44 @@
+//===- ARMLegalizerInfo.cpp --------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the Machinelegalizer class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "ARMLegalizerInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Target/TargetOpcodes.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+/// Register the legal (opcode, type) combinations and build the lookup
+/// tables:
+///  - G_FRAME_INDEX producing a 32-bit pointer in address space 0,
+///  - G_LOAD of a 32-bit scalar through such a pointer (type index 1),
+///  - G_ADD on 8-, 16- and 32-bit scalars.
+ARMLegalizerInfo::ARMLegalizerInfo() {
+  using namespace TargetOpcode;
+
+  const LLT Ptr32 = LLT::pointer(0, 32);
+  const LLT Scalar32 = LLT::scalar(32);
+
+  setAction({G_FRAME_INDEX, Ptr32}, Legal);
+
+  setAction({G_LOAD, Scalar32}, Legal);
+  setAction({G_LOAD, 1, Ptr32}, Legal);
+
+  for (unsigned Bits : {8u, 16u, 32u})
+    setAction({G_ADD, LLT::scalar(Bits)}, Legal);
+
+  computeTables();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
new file mode 100644
index 000000000000..ca3eea81271b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
@@ -0,0 +1,29 @@
+//===- ARMLegalizerInfo ------------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the Machinelegalizer class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class LLVMContext;
+
+/// This class provides the legalization information for the ARM target.
+class ARMLegalizerInfo : public LegalizerInfo {
+public:
+ ARMLegalizerInfo();
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
new file mode 100644
index 000000000000..48ab491b5be9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -0,0 +1,2389 @@
+//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a pass that performs load / store related peephole
+/// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "ThumbRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-ldst-opt"
+
+STATISTIC(NumLDMGened , "Number of ldm instructions generated");
+STATISTIC(NumSTMGened , "Number of stm instructions generated");
+STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
+STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
+STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
+STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
+STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
+STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm");
+STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
+STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
+STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");
+
+/// This switch disables formation of double/multi instructions that could
+/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
+/// disabled. This can be used to create libraries that are robust even when
+/// users provoke undefined behaviour by supplying misaligned pointers.
+/// \see mayCombineMisaligned()
+static cl::opt<bool>
+AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
+ cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
+
+#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"
+
+namespace {
+ /// Post- register allocation pass that combines load / store instructions to
+ /// form ldm / stm instructions.
+ struct ARMLoadStoreOpt : public MachineFunctionPass {
+ static char ID;
+ ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
+
+ const MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const ARMSubtarget *STI;
+ const TargetLowering *TL;
+ ARMFunctionInfo *AFI;
+ /// Registers live at LiveRegPos; maintained by moveLiveRegsBefore().
+ LivePhysRegs LiveRegs;
+ RegisterClassInfo RegClassInfo;
+ /// Position in the current block up to which LiveRegs has been stepped
+ /// backward.
+ MachineBasicBlock::const_iterator LiveRegPos;
+ /// False until LiveRegs / LiveRegPos have been initialized for the block
+ /// being queried.
+ bool LiveRegsValid;
+ /// False until RegClassInfo has been run on the function (see findFreeReg).
+ bool RegClassInfoValid;
+ bool isThumb1, isThumb2;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ // The pass requires that no virtual registers remain.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override { return ARM_LOAD_STORE_OPT_NAME; }
+
+ private:
+ /// A set of load/store MachineInstrs with same base register sorted by
+ /// offset.
+ struct MemOpQueueEntry {
+ MachineInstr *MI;
+ int Offset; ///< Load/Store offset.
+ unsigned Position; ///< Position as counted from end of basic block.
+ MemOpQueueEntry(MachineInstr &MI, int Offset, unsigned Position)
+ : MI(&MI), Offset(Offset), Position(Position) {}
+ };
+ typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
+
+ /// A set of MachineInstrs that fulfill (nearly all) conditions to get
+ /// merged into a LDM/STM.
+ struct MergeCandidate {
+ /// List of instructions ordered by load/store offset.
+ SmallVector<MachineInstr*, 4> Instrs;
+ /// Index in Instrs of the instruction being latest in the schedule.
+ unsigned LatestMIIdx;
+ /// Index in Instrs of the instruction being earliest in the schedule.
+ unsigned EarliestMIIdx;
+ /// Index into the basic block where the merged instruction will be
+ /// inserted. (See MemOpQueueEntry.Position)
+ unsigned InsertPos;
+ /// Whether the instructions can be merged into a ldm/stm instruction.
+ bool CanMergeToLSMulti;
+ /// Whether the instructions can be merged into a ldrd/strd instruction.
+ bool CanMergeToLSDouble;
+ };
+ SpecificBumpPtrAllocator<MergeCandidate> Allocator;
+ SmallVector<const MergeCandidate*,4> Candidates;
+ SmallVector<MachineInstr*,4> MergeBaseCandidates;
+
+ void moveLiveRegsBefore(const MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator Before);
+ unsigned findFreeReg(const TargetRegisterClass &RegClass);
+ void UpdateBaseRegUses(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ unsigned Base, unsigned WordOffset,
+ ARMCC::CondCodes Pred, unsigned PredReg);
+ MachineInstr *CreateLoadStoreMulti(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs);
+ MachineInstr *CreateLoadStoreDouble(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs) const;
+ void FormCandidates(const MemOpQueue &MemOps);
+ MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
+ bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI);
+ bool MergeBaseUpdateLoadStore(MachineInstr *MI);
+ bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
+ bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
+ bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+ bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+ bool CombineMovBx(MachineBasicBlock &MBB);
+ };
+ char ARMLoadStoreOpt::ID = 0;
+}
+
+INITIALIZE_PASS(ARMLoadStoreOpt, "arm-ldst-opt", ARM_LOAD_STORE_OPT_NAME, false,
+ false)
+
+/// Return true if \p MI has a register operand that defines CPSR and is not
+/// marked dead. Such an instruction has a live CPSR def, so it is not safe to
+/// fold it into a load / store.
+static bool definesCPSR(const MachineInstr &MI) {
+  for (const MachineOperand &Op : MI.operands()) {
+    const bool IsLiveCPSRDef =
+        Op.isReg() && Op.isDef() && Op.getReg() == ARM::CPSR && !Op.isDead();
+    if (IsLiveCPSRDef)
+      return true;
+  }
+  return false;
+}
+
+/// Decode the signed byte offset of load/store \p MI.
+///
+/// The raw offset field is read from the operand three positions before the
+/// end of the instruction descriptor's operand list. How it is interpreted
+/// depends on the opcode:
+///  - ARM / Thumb2 immediate forms: the field is the offset itself,
+///  - Thumb1 immediate forms: the field is scaled by 4,
+///  - LDRD / STRD: addressing-mode-3 encoding,
+///  - everything else: addressing-mode-5 encoding (scaled by 4).
+static int getMemoryOpOffset(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  const unsigned ImmIdx = MI.getDesc().getNumOperands() - 3;
+  const unsigned Encoded = MI.getOperand(ImmIdx).getImm();
+
+  switch (Opc) {
+  case ARM::t2LDRi12:
+  case ARM::t2LDRi8:
+  case ARM::t2STRi12:
+  case ARM::t2STRi8:
+  case ARM::t2LDRDi8:
+  case ARM::t2STRDi8:
+  case ARM::LDRi12:
+  case ARM::STRi12:
+    return Encoded;
+
+  // Thumb1 immediate offsets are scaled by 4
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
+    return Encoded * 4;
+
+  default:
+    break;
+  }
+
+  int Magnitude;
+  ARM_AM::AddrOpc Direction;
+  if (Opc == ARM::LDRD || Opc == ARM::STRD) {
+    Magnitude = ARM_AM::getAM3Offset(Encoded);
+    Direction = ARM_AM::getAM3Op(Encoded);
+  } else {
+    Magnitude = ARM_AM::getAM5Offset(Encoded) * 4;
+    Direction = ARM_AM::getAM5Op(Encoded);
+  }
+
+  return Direction == ARM_AM::sub ? -Magnitude : Magnitude;
+}
+
+/// Return the base-register operand of load/store \p MI, i.e. operand 1.
+static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
+  const unsigned BaseOpIdx = 1;
+  return MI.getOperand(BaseOpIdx);
+}
+
+/// Return the transferred-register operand of load/store \p MI, i.e.
+/// operand 0.
+static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
+  const unsigned RegOpIdx = 0;
+  return MI.getOperand(RegOpIdx);
+}
+
+/// Map the single load/store opcode \p Opcode and addressing submode \p Mode
+/// to the corresponding load/store-multiple opcode.
+///
+/// Returns 0 for the VFP opcodes in db mode, where only a writeback (_UPD)
+/// form exists. Unhandled opcodes or submodes hit llvm_unreachable.
+///
+/// Side effect: the matching Num*Gened statistic is incremented on every call
+/// before the submode is inspected, i.e. also when 0 is returned.
+/// NOTE(review): CreateLoadStoreMulti calls this both as a support check and
+/// to obtain the final opcode, so the counters can be bumped more than once
+/// per generated instruction.
+static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unhandled opcode!");
+ case ARM::LDRi12:
+ ++NumLDMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::LDMIA;
+ case ARM_AM::da: return ARM::LDMDA;
+ case ARM_AM::db: return ARM::LDMDB;
+ case ARM_AM::ib: return ARM::LDMIB;
+ }
+ case ARM::STRi12:
+ ++NumSTMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::STMIA;
+ case ARM_AM::da: return ARM::STMDA;
+ case ARM_AM::db: return ARM::STMDB;
+ case ARM_AM::ib: return ARM::STMIB;
+ }
+ case ARM::tLDRi:
+ case ARM::tLDRspi:
+ // tLDMIA is writeback-only - unless the base register is in the input
+ // reglist.
+ ++NumLDMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::tLDMIA;
+ }
+ case ARM::tSTRi:
+ case ARM::tSTRspi:
+ // There is no non-writeback tSTMIA either.
+ ++NumSTMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::tSTMIA_UPD;
+ }
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ ++NumLDMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::t2LDMIA;
+ case ARM_AM::db: return ARM::t2LDMDB;
+ }
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ ++NumSTMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::t2STMIA;
+ case ARM_AM::db: return ARM::t2STMDB;
+ }
+ case ARM::VLDRS:
+ ++NumVLDMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::VLDMSIA;
+ case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
+ }
+ case ARM::VSTRS:
+ ++NumVSTMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::VSTMSIA;
+ case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
+ }
+ case ARM::VLDRD:
+ ++NumVLDMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::VLDMDIA;
+ case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
+ }
+ case ARM::VSTRD:
+ ++NumVSTMGened;
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::VSTMDIA;
+ case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
+ }
+ }
+}
+
+/// Return the addressing submode (ia, da, db or ib) implied by the
+/// load/store-multiple opcode \p Opcode. Opcodes not listed below hit
+/// llvm_unreachable.
+static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unhandled opcode!");
+ case ARM::LDMIA_RET:
+ case ARM::LDMIA:
+ case ARM::LDMIA_UPD:
+ case ARM::STMIA:
+ case ARM::STMIA_UPD:
+ case ARM::tLDMIA:
+ case ARM::tLDMIA_UPD:
+ case ARM::tSTMIA_UPD:
+ case ARM::t2LDMIA_RET:
+ case ARM::t2LDMIA:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2STMIA:
+ case ARM::t2STMIA_UPD:
+ case ARM::VLDMSIA:
+ case ARM::VLDMSIA_UPD:
+ case ARM::VSTMSIA:
+ case ARM::VSTMSIA_UPD:
+ case ARM::VLDMDIA:
+ case ARM::VLDMDIA_UPD:
+ case ARM::VSTMDIA:
+ case ARM::VSTMDIA_UPD:
+ return ARM_AM::ia;
+
+ case ARM::LDMDA:
+ case ARM::LDMDA_UPD:
+ case ARM::STMDA:
+ case ARM::STMDA_UPD:
+ return ARM_AM::da;
+
+ case ARM::LDMDB:
+ case ARM::LDMDB_UPD:
+ case ARM::STMDB:
+ case ARM::STMDB_UPD:
+ case ARM::t2LDMDB:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMDB:
+ case ARM::t2STMDB_UPD:
+ case ARM::VLDMSDB_UPD:
+ case ARM::VSTMSDB_UPD:
+ case ARM::VLDMDDB_UPD:
+ case ARM::VSTMDDB_UPD:
+ return ARM_AM::db;
+
+ case ARM::LDMIB:
+ case ARM::LDMIB_UPD:
+ case ARM::STMIB:
+ case ARM::STMIB_UPD:
+ return ARM_AM::ib;
+ }
+}
+
+/// True for the Thumb1 word loads tLDRi and tLDRspi.
+static bool isT1i32Load(unsigned Opc) {
+  switch (Opc) {
+  case ARM::tLDRi:
+  case ARM::tLDRspi:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// True for the Thumb2 word loads t2LDRi12 and t2LDRi8.
+static bool isT2i32Load(unsigned Opc) {
+  switch (Opc) {
+  case ARM::t2LDRi12:
+  case ARM::t2LDRi8:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// True for any integer word load: ARM LDRi12 or one of the Thumb1 / Thumb2
+/// word loads.
+static bool isi32Load(unsigned Opc) {
+  if (Opc == ARM::LDRi12)
+    return true;
+  return isT1i32Load(Opc) || isT2i32Load(Opc);
+}
+
+/// True for the Thumb1 word stores tSTRi and tSTRspi.
+static bool isT1i32Store(unsigned Opc) {
+  switch (Opc) {
+  case ARM::tSTRi:
+  case ARM::tSTRspi:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// True for the Thumb2 word stores t2STRi12 and t2STRi8.
+static bool isT2i32Store(unsigned Opc) {
+  switch (Opc) {
+  case ARM::t2STRi12:
+  case ARM::t2STRi8:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// True for any integer word store: ARM STRi12 or one of the Thumb1 / Thumb2
+/// word stores.
+static bool isi32Store(unsigned Opc) {
+  if (Opc == ARM::STRi12)
+    return true;
+  return isT1i32Store(Opc) || isT2i32Store(Opc);
+}
+
+/// True for any single-register load handled by this pass: the integer word
+/// loads plus VLDRS and VLDRD.
+static bool isLoadSingle(unsigned Opc) {
+  switch (Opc) {
+  case ARM::VLDRS:
+  case ARM::VLDRD:
+    return true;
+  default:
+    return isi32Load(Opc);
+  }
+}
+
+/// Return the factor that converts a word count into immediate-offset units
+/// for the Thumb1 load/store \p Opc: 1 for word accesses, 2 for halfword
+/// accesses and 4 for byte accesses. Any other opcode hits llvm_unreachable.
+static unsigned getImmScale(unsigned Opc) {
+  const bool IsWord = Opc == ARM::tLDRi || Opc == ARM::tSTRi ||
+                      Opc == ARM::tLDRspi || Opc == ARM::tSTRspi;
+  if (IsWord)
+    return 1;
+
+  if (Opc == ARM::tLDRHi || Opc == ARM::tSTRHi)
+    return 2;
+
+  if (Opc == ARM::tLDRBi || Opc == ARM::tSTRBi)
+    return 4;
+
+  llvm_unreachable("Unhandled opcode!");
+}
+
+/// Return the number of bytes transferred by load/store \p MI, or 0 for
+/// opcodes that are not handled.
+///
+/// Single transfers are 4 bytes (8 for VLDRD / VSTRD). For the multiple
+/// forms the register count is taken as the number of operands beyond the
+/// instruction descriptor's operand list, plus one, and multiplied by the
+/// per-register size.
+static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
+  unsigned BytesPerReg;
+
+  switch (MI->getOpcode()) {
+  default:
+    return 0;
+
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return 4;
+
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return 8;
+
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::VLDMSIA:
+  case ARM::VSTMSIA:
+    BytesPerReg = 4;
+    break;
+
+  case ARM::VLDMDIA:
+  case ARM::VSTMDIA:
+    BytesPerReg = 8;
+    break;
+  }
+
+  const unsigned NumRegs =
+      MI->getNumOperands() - MI->getDesc().getNumOperands() + 1;
+  return NumRegs * BytesPerReg;
+}
+
+/// Update future uses of the base register with the offset introduced
+/// due to writeback. This function only works on Thumb1.
+///
+/// Scans forward from \p MBBI. Immediate offsets of later Thumb1 loads/stores
+/// and ADDS/SUBS of \p Base are adjusted by \p WordOffset words where
+/// possible; at the first instruction that cannot be adjusted, a SUBS that
+/// subtracts WordOffset * 4 from \p Base is inserted before it and the scan
+/// stops. The scan also stops once \p Base is killed or redefined.
+/// \p Pred / \p PredReg are the predicate operands given to an inserted SUBS.
+void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned Base,
+ unsigned WordOffset,
+ ARMCC::CondCodes Pred,
+ unsigned PredReg) {
+ assert(isThumb1 && "Can only update base register uses for Thumb1!");
+ // Start updating any instructions with immediate offsets. Insert a SUB before
+ // the first non-updateable instruction (if any).
+ for (; MBBI != MBB.end(); ++MBBI) {
+ bool InsertSub = false;
+ unsigned Opc = MBBI->getOpcode();
+
+ if (MBBI->readsRegister(Base)) {
+ int Offset;
+ bool IsLoad =
+ Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi;
+ bool IsStore =
+ Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi;
+
+ if (IsLoad || IsStore) {
+ // Loads and stores with immediate offsets can be updated, but only if
+ // the new offset isn't negative.
+ // The MachineOperand containing the offset immediate is the last one
+ // before predicates.
+ MachineOperand &MO =
+ MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
+ // The offsets are scaled by 1, 2 or 4 depending on the Opcode.
+ Offset = MO.getImm() - WordOffset * getImmScale(Opc);
+
+ // If storing the base register, it needs to be reset first.
+ unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();
+
+ if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
+ MO.setImm(Offset);
+ else
+ InsertSub = true;
+
+ } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
+ !definesCPSR(*MBBI)) {
+ // SUBS/ADDS using this register, with a dead def of the CPSR.
+ // Merge it with the update; if the merged offset is too large,
+ // insert a new sub instead.
+ MachineOperand &MO =
+ MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
+ Offset = (Opc == ARM::tSUBi8) ?
+ MO.getImm() + WordOffset * 4 :
+ MO.getImm() - WordOffset * 4 ;
+ if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) {
+ // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if
+ // Offset == 0.
+ MO.setImm(Offset);
+ // The base register has now been reset, so exit early.
+ return;
+ } else {
+ InsertSub = true;
+ }
+
+ } else {
+ // Can't update the instruction.
+ InsertSub = true;
+ }
+
+ } else if (definesCPSR(*MBBI) || MBBI->isCall() || MBBI->isBranch()) {
+ // Since SUBS sets the condition flags, we can't place the base reset
+ // after an instruction that has a live CPSR def.
+ // The base register might also contain an argument for a function call.
+ InsertSub = true;
+ }
+
+ if (InsertSub) {
+ // An instruction above couldn't be updated, so insert a sub.
+ AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
+ .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
+ return;
+ }
+
+ if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base))
+ // Register got killed. Stop updating.
+ return;
+ }
+
+ // End of block was reached.
+ if (MBB.succ_size() > 0) {
+ // FIXME: Because of a bug, live registers are sometimes missing from
+ // the successor blocks' live-in sets. This means we can't trust that
+ // information and *always* have to reset at the end of a block.
+ // See PR21029.
+ // NOTE(review): the loop above only falls through when MBBI == MBB.end()
+ // (every other exit is a return), so this decrement never executes and the
+ // SUBS is always inserted at the very end of the block.
+ if (MBBI != MBB.end()) --MBBI;
+ AddDefaultT1CC(
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
+ .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
+ }
+}
+
+/// Return the first register of class \p RegClass, in the order given by
+/// RegClassInfo.getOrder(), that is not contained in LiveRegs; return 0 if
+/// every register of the class is live. LiveRegs must already describe the
+/// program point of interest (see moveLiveRegsBefore()).
+unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
+ // RegClassInfo is computed lazily, on the first query.
+ if (!RegClassInfoValid) {
+ RegClassInfo.runOnMachineFunction(*MF);
+ RegClassInfoValid = true;
+ }
+
+ for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
+ if (!LiveRegs.contains(Reg))
+ return Reg;
+ return 0;
+}
+
+/// Bring LiveRegs to the state just before instruction \p Before (in normal
+/// schedule direction) within \p MBB.
+///
+/// Liveness is tracked by stepping backward from the end of the block, so
+/// successive queries in the same block have to be issued in reverse
+/// program order.
+void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
+    MachineBasicBlock::const_iterator Before) {
+  // First query in this block: seed the set with the block's live-outs and
+  // start at the end of the block.
+  if (!LiveRegsValid) {
+    LiveRegs.init(*TRI);
+    LiveRegs.addLiveOuts(MBB);
+    LiveRegPos = MBB.end();
+    LiveRegsValid = true;
+  }
+
+  // Step over one instruction at a time until the requested position.
+  while (LiveRegPos != Before)
+    LiveRegs.stepBackward(*--LiveRegPos);
+}
+
+/// Return true if some entry of \p Regs has \p Reg as its first element.
+static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
+                        unsigned Reg) {
+  for (size_t Idx = 0, End = Regs.size(); Idx != End; ++Idx) {
+    if (Regs[Idx].first == Reg)
+      return true;
+  }
+  return false;
+}
+
+/// Create and insert a LDM or STM with Base as base register and registers in
+/// Regs as the register operands that would be loaded / stored.
+///
+/// \p Offset is the offset of the first transfer relative to \p Base; if no
+/// addressing submode covers it, a new base register is materialized with an
+/// ADD/SUB first (only when more than two registers are merged). \p Regs
+/// holds (register, kill flag) pairs. \p Opcode is the opcode of the single
+/// loads/stores being merged.
+///
+/// \returns the newly created instruction, or nullptr if the merge was not
+/// performed.
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs) {
+ unsigned NumRegs = Regs.size();
+ assert(NumRegs > 1);
+
+ // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
+ // Compute liveness information for that register to make the decision.
+ bool SafeToClobberCPSR = !isThumb1 ||
+ (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
+ MachineBasicBlock::LQR_Dead);
+
+ bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
+
+ // Exception: If the base register is in the input reglist, Thumb1 LDM is
+ // non-writeback.
+ // It's also not possible to merge an STR of the base register in Thumb1.
+ // This check must not be limited to loads: otherwise the tSTRi bail-out
+ // below could never be reached.
+ if (isThumb1 && ContainsReg(Regs, Base)) {
+ assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
+ if (Opcode == ARM::tLDRi) {
+ Writeback = false;
+ } else if (Opcode == ARM::tSTRi) {
+ return nullptr;
+ }
+ }
+
+ ARM_AM::AMSubMode Mode = ARM_AM::ia;
+ // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
+ bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+ bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;
+
+ if (Offset == 4 && haveIBAndDA) {
+ Mode = ARM_AM::ib;
+ } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
+ Mode = ARM_AM::da;
+ } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
+ // VLDM/VSTM do not support DB mode without also updating the base reg.
+ Mode = ARM_AM::db;
+ } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
+ // Check if this is a supported opcode before inserting instructions to
+ // calculate a new base register.
+ if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;
+
+ // If starting offset isn't zero, insert a MI to materialize a new base.
+ // But only do so if it is cost effective, i.e. merging more than two
+ // loads / stores.
+ if (NumRegs <= 2)
+ return nullptr;
+
+ // On Thumb1, it's not worth materializing a new base register without
+ // clobbering the CPSR (i.e. not using ADDS/SUBS).
+ if (!SafeToClobberCPSR)
+ return nullptr;
+
+ unsigned NewBase;
+ if (isi32Load(Opcode)) {
+ // If it is a load, then just use one of the destination registers
+ // as the new base. Will no longer be writeback in Thumb1.
+ NewBase = Regs[NumRegs-1].first;
+ Writeback = false;
+ } else {
+ // Find a free register that we can use as scratch register.
+ moveLiveRegsBefore(MBB, InsertBefore);
+ // The merged instruction does not exist yet but will use several Regs if
+ // it is a Store.
+ if (!isLoadSingle(Opcode))
+ for (const std::pair<unsigned, bool> &R : Regs)
+ LiveRegs.addReg(R.first);
+
+ NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
+ if (NewBase == 0)
+ return nullptr;
+ }
+
+ int BaseOpc =
+ isThumb2 ? ARM::t2ADDri :
+ (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
+ (isThumb1 && Offset < 8) ? ARM::tADDi3 :
+ isThumb1 ? ARM::tADDi8 : ARM::ADDri;
+
+ if (Offset < 0) {
+ Offset = - Offset;
+ BaseOpc =
+ isThumb2 ? ARM::t2SUBri :
+ (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
+ isThumb1 ? ARM::tSUBi8 : ARM::SUBri;
+ }
+
+ if (!TL->isLegalAddImmediate(Offset))
+ // FIXME: Try add with register operand?
+ return nullptr; // Probably not worth it then.
+
+ // We can only append a kill flag to the add/sub input if the value is not
+ // used in the register list of the stm as well.
+ bool KillOldBase = BaseKill &&
+ (!isi32Store(Opcode) || !ContainsReg(Regs, Base));
+
+ if (isThumb1) {
+ // Thumb1: depending on immediate size, use either
+ // ADDS NewBase, Base, #imm3
+ // or
+ // MOV NewBase, Base
+ // ADDS NewBase, #imm8.
+ if (Base != NewBase &&
+ (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
+ // Need to insert a MOV to the new base first.
+ if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
+ !STI->hasV6Ops()) {
+ // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
+ if (Pred != ARMCC::AL)
+ return nullptr;
+ BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
+ .addReg(Base, getKillRegState(KillOldBase));
+ } else
+ BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
+ .addReg(Base, getKillRegState(KillOldBase))
+ .addImm(Pred).addReg(PredReg);
+
+ // The following ADDS/SUBS becomes an update.
+ Base = NewBase;
+ KillOldBase = true;
+ }
+ if (BaseOpc == ARM::tADDrSPi) {
+ assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
+ BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+ .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
+ .addImm(Pred).addReg(PredReg);
+ } else
+ AddDefaultT1CC(
+ BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
+ .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
+ .addImm(Pred).addReg(PredReg);
+ } else {
+ BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+ .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
+ .addImm(Pred).addReg(PredReg).addReg(0);
+ }
+ Base = NewBase;
+ BaseKill = true; // New base is always killed straight away.
+ }
+
+ bool isDef = isLoadSingle(Opcode);
+
+ // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
+ // base register writeback.
+ Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
+ if (!Opcode)
+ return nullptr;
+
+ // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
+ // - There is no writeback (LDM of base register),
+ // - the base register is killed by the merged instruction,
+ // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
+ // to reset the base register.
+ // Otherwise, don't merge.
+ // It's safe to return here since the code to materialize a new base register
+ // above is also conditional on SafeToClobberCPSR.
+ if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
+ return nullptr;
+
+ MachineInstrBuilder MIB;
+
+ if (Writeback) {
+ assert(isThumb1 && "expected Writeback only inThumb1");
+ if (Opcode == ARM::tLDMIA) {
+ assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs");
+ // Update tLDMIA with writeback if necessary.
+ Opcode = ARM::tLDMIA_UPD;
+ }
+
+ MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
+
+ // Thumb1: we might need to set base writeback when building the MI.
+ MIB.addReg(Base, getDefRegState(true))
+ .addReg(Base, getKillRegState(BaseKill));
+
+ // The base isn't dead after a merged instruction with writeback.
+ // Insert a sub instruction after the newly formed instruction to reset.
+ if (!BaseKill)
+ UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);
+
+ } else {
+ // No writeback, simply build the MachineInstr.
+ MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
+ MIB.addReg(Base, getKillRegState(BaseKill));
+ }
+
+ MIB.addImm(Pred).addReg(PredReg);
+
+ for (const std::pair<unsigned, bool> &R : Regs)
+ MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));
+
+ return MIB.getInstr();
+}
+
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble( // Builds one t2LDRDi8/t2STRDi8 before InsertBefore for the two registers in Regs and returns it.
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs) const { // Each Regs entry is (register, kill flag); Opcode is the single load/store being merged.
+ bool IsLoad = isi32Load(Opcode);
+ assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
+ unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
+
+ assert(Regs.size() == 2); // A double load/store transfers exactly two registers.
+ MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
+ TII->get(LoadStoreOpcode));
+ if (IsLoad) {
+ MIB.addReg(Regs[0].first, RegState::Define) // Loads: both transfer registers are defs.
+ .addReg(Regs[1].first, RegState::Define);
+ } else {
+ MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second)) // Stores: both are uses; carry over the recorded kill flags.
+ .addReg(Regs[1].first, getKillRegState(Regs[1].second));
+ }
+ MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); // NOTE(review): BaseKill is not applied to the base operand here.
+ return MIB.getInstr();
+}
+
+/// Merge the instructions of \p Cand into a single load/store double or multiple. On success the original instructions are erased, implicit super-register defs (loads) or kill flags (stores) in the affected range are fixed up, and the new instruction is returned; returns nullptr if no merged instruction was created.
+MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
+ const MachineInstr *First = Cand.Instrs.front();
+ unsigned Opcode = First->getOpcode();
+ bool IsLoad = isLoadSingle(Opcode);
+ SmallVector<std::pair<unsigned, bool>, 8> Regs; // (register, kill flag) per merged instruction, in Cand.Instrs order.
+ SmallVector<unsigned, 4> ImpDefs; // Implicit (super-register) defs collected from the loads.
+ DenseSet<unsigned> KilledRegs; // NOTE(review): only inserted into below; never read in this function.
+ DenseSet<unsigned> UsedRegs; // Every transferred register; used to strip kill flags in the store case.
+ // Determine list of registers and list of implicit super-register defs.
+ for (const MachineInstr *MI : Cand.Instrs) {
+ const MachineOperand &MO = getLoadStoreRegOp(*MI);
+ unsigned Reg = MO.getReg();
+ bool IsKill = MO.isKill();
+ if (IsKill)
+ KilledRegs.insert(Reg);
+ Regs.push_back(std::make_pair(Reg, IsKill));
+ UsedRegs.insert(Reg);
+
+ if (IsLoad) {
+ // Collect any implicit defs of super-registers, after merging we can't
+ // be sure anymore that we properly preserved these live ranges and must
+ // remove these implicit operands.
+ for (const MachineOperand &MO : MI->implicit_operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.isDead())
+ continue;
+ assert(MO.isImplicit());
+ unsigned DefReg = MO.getReg();
+
+ if (is_contained(ImpDefs, DefReg)) // Already recorded; keep ImpDefs free of duplicates.
+ continue;
+ // We can ignore cases where the super-reg is read and written.
+ if (MI->readsRegister(DefReg))
+ continue;
+ ImpDefs.push_back(DefReg);
+ }
+ }
+ }
+
+ // Attempt the merge.
+ typedef MachineBasicBlock::iterator iterator;
+ MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
+ iterator InsertBefore = std::next(iterator(LatestMI)); // The merged instruction is placed right after the latest original.
+ MachineBasicBlock &MBB = *LatestMI->getParent();
+ unsigned Offset = getMemoryOpOffset(*First); // NOTE(review): held as unsigned here, but CreateLoadStoreDouble takes an int Offset.
+ unsigned Base = getLoadStoreBaseOp(*First).getReg();
+ bool BaseKill = LatestMI->killsRegister(Base);
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
+ DebugLoc DL = First->getDebugLoc();
+ MachineInstr *Merged = nullptr;
+ if (Cand.CanMergeToLSDouble) // Prefer the double form; fall back to the multiple form below.
+ Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
+ Opcode, Pred, PredReg, DL, Regs);
+ if (!Merged && Cand.CanMergeToLSMulti)
+ Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
+ Opcode, Pred, PredReg, DL, Regs);
+ if (!Merged)
+ return nullptr;
+
+ // Determine earliest instruction that will get removed. We then keep an
+ // iterator just above it so the following erases don't invalidate it.
+ iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
+ bool EarliestAtBegin = false;
+ if (EarliestI == MBB.begin()) {
+ EarliestAtBegin = true; // Nothing precedes it; re-read MBB.begin() after the erases instead.
+ } else {
+ EarliestI = std::prev(EarliestI);
+ }
+
+ // Remove instructions which have been merged.
+ for (MachineInstr *MI : Cand.Instrs)
+ MBB.erase(MI);
+
+ // Determine range between the earliest removed instruction and the new one.
+ if (EarliestAtBegin)
+ EarliestI = MBB.begin();
+ else
+ EarliestI = std::next(EarliestI);
+ auto FixupRange = make_range(EarliestI, iterator(Merged)); // Half-open: ends just before the merged instruction.
+
+ if (isLoadSingle(Opcode)) {
+ // If the previous loads defined a super-reg, then we have to mark earlier
+ // operands undef; Replicate the super-reg def on the merged instruction.
+ for (MachineInstr &MI : FixupRange) {
+ for (unsigned &ImpDefReg : ImpDefs) { // By reference: entries may be cleared to 0 below.
+ for (MachineOperand &MO : MI.implicit_operands()) {
+ if (!MO.isReg() || MO.getReg() != ImpDefReg)
+ continue;
+ if (MO.readsReg())
+ MO.setIsUndef();
+ else if (MO.isDef())
+ ImpDefReg = 0; // Another instruction in the range defines it; the entry becomes register 0.
+ }
+ }
+ }
+
+ MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
+ for (unsigned ImpDef : ImpDefs)
+ MIB.addReg(ImpDef, RegState::ImplicitDefine);
+ } else {
+ // Remove kill flags: We are possibly storing the values later now.
+ assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
+ for (MachineInstr &MI : FixupRange) {
+ for (MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || !MO.isKill())
+ continue;
+ if (UsedRegs.count(MO.getReg()))
+ MO.setIsKill(false);
+ }
+ }
+ assert(ImpDefs.empty()); // ImpDefs is only filled when IsLoad is true.
+ }
+
+ return Merged;
+}
+
+static bool isValidLSDoubleOffset(int Offset) { // Returns true if Offset fits the t2LDRDi8/t2STRDi8 immediate described below.
+ unsigned Value = abs(Offset); // Only the magnitude is checked; the sign is ignored.
+ // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally
+ // multiplied by 4.
+ return (Value % 4) == 0 && Value < 1024; // I.e. a multiple of 4 no larger than 1020 (255 * 4).
+}
+
+/// Return true for loads/stores that can be combined to a double/multi
+/// operation without increasing the requirements for alignment.
+static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
+ const MachineInstr &MI) {
+ // vldr/vstr trap on misaligned pointers anyway, forming vldm makes no
+ // difference.
+ unsigned Opcode = MI.getOpcode();
+ if (!isi32Load(Opcode) && !isi32Store(Opcode)) // Anything that is not an integer load/store is accepted here.
+ return true;
+
+ // Stack pointer alignment is out of the programmers control so we can trust
+ // SP-relative loads/stores.
+ if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
+ STI.getFrameLowering()->getTransientStackAlignment() >= 4) // Requires at least 4-byte transient stack alignment.
+ return true;
+ return false; // Integer access with a non-SP base (or weaker stack alignment): do not combine.
+}
+
+/// Find candidates for load/store multiple merge in list of MemOpQueueEntries. The queue is split into consecutive chains; one MergeCandidate is pushed onto Candidates per chain, including chains of length one (which are marked as not mergeable).
+void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
+ const MachineInstr *FirstMI = MemOps[0].MI; // Indexes element 0 unconditionally, so MemOps must be non-empty.
+ unsigned Opcode = FirstMI->getOpcode();
+ bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+ unsigned Size = getLSMultipleTransferSize(FirstMI);
+
+ unsigned SIndex = 0;
+ unsigned EIndex = MemOps.size();
+ do {
+ // Look at the first instruction.
+ const MachineInstr *MI = MemOps[SIndex].MI;
+ int Offset = MemOps[SIndex].Offset;
+ const MachineOperand &PMO = getLoadStoreRegOp(*MI);
+ unsigned PReg = PMO.getReg();
+ unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg); // An undef register is given encoding UINT_MAX.
+ unsigned Latest = SIndex;
+ unsigned Earliest = SIndex;
+ unsigned Count = 1;
+ bool CanMergeToLSDouble =
+ STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
+ // ARM errata 602117: LDRD with base in list may result in incorrect base
+ // register when interrupted or faulted.
+ if (STI->isCortexM3() && isi32Load(Opcode) &&
+ PReg == getLoadStoreBaseOp(*MI).getReg())
+ CanMergeToLSDouble = false;
+
+ bool CanMergeToLSMulti = true;
+ // On subtargets with slow odd registers (e.g. Swift), avoid vldm/vstm
+ // starting with an odd register number, as that needs more uops than single vldrs.
+ if (STI->hasSlowOddRegister() && !isNotVFP && (PRegNum % 2) == 1)
+ CanMergeToLSMulti = false;
+
+ // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it
+ // deprecated; LDM to PC is fine but cannot happen here.
+ if (PReg == ARM::SP || PReg == ARM::PC)
+ CanMergeToLSMulti = CanMergeToLSDouble = false;
+
+ // Should we be conservative?
+ if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
+ CanMergeToLSMulti = CanMergeToLSDouble = false;
+
+ // Merge following instructions where possible.
+ for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
+ int NewOffset = MemOps[I].Offset;
+ if (NewOffset != Offset + (int)Size) // The next access must be exactly one transfer size further on.
+ break;
+ const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
+ unsigned Reg = MO.getReg();
+ if (Reg == ARM::SP || Reg == ARM::PC)
+ break;
+
+ // See if the current load/store may be part of a multi load/store.
+ unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+ bool PartOfLSMulti = CanMergeToLSMulti;
+ if (PartOfLSMulti) {
+ // Register numbers must be in ascending order.
+ if (RegNum <= PRegNum)
+ PartOfLSMulti = false;
+ // For VFP / NEON load/store multiples, the registers must be
+ // consecutive and within the limit on the number of registers per
+ // instruction.
+ else if (!isNotVFP && RegNum != PRegNum+1)
+ PartOfLSMulti = false;
+ }
+ // See if the current load/store may be part of a double load/store.
+ bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1; // Only the second access of a chain can complete a double.
+
+ if (!PartOfLSMulti && !PartOfLSDouble)
+ break;
+ CanMergeToLSMulti &= PartOfLSMulti;
+ CanMergeToLSDouble &= PartOfLSDouble;
+ // Track MemOp with latest and earliest position (Positions are
+ // counted in reverse).
+ unsigned Position = MemOps[I].Position;
+ if (Position < MemOps[Latest].Position)
+ Latest = I;
+ else if (Position > MemOps[Earliest].Position)
+ Earliest = I;
+ // Prepare for next MemOp.
+ Offset += Size;
+ PRegNum = RegNum;
+ }
+
+ // Form a candidate from the Ops collected so far.
+ MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
+ for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
+ Candidate->Instrs.push_back(MemOps[C].MI);
+ Candidate->LatestMIIdx = Latest - SIndex; // Indices are relative to Candidate->Instrs.
+ Candidate->EarliestMIIdx = Earliest - SIndex;
+ Candidate->InsertPos = MemOps[Latest].Position;
+ if (Count == 1) // A single instruction cannot be merged with anything.
+ CanMergeToLSMulti = CanMergeToLSDouble = false;
+ Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
+ Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
+ Candidates.push_back(Candidate);
+ // Continue after the chain.
+ SIndex += Count;
+ } while (SIndex < EIndex);
+}
+
+static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, // Maps a load/store-multiple opcode plus addressing sub-mode to the matching base-updating (_UPD) opcode.
+ ARM_AM::AMSubMode Mode) {
+ switch (Opc) {
+ default: llvm_unreachable("Unhandled opcode!");
+ case ARM::LDMIA: // ARM-mode LDM: any of the four variants maps by Mode alone.
+ case ARM::LDMDA:
+ case ARM::LDMDB:
+ case ARM::LDMIB:
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::LDMIA_UPD;
+ case ARM_AM::ib: return ARM::LDMIB_UPD;
+ case ARM_AM::da: return ARM::LDMDA_UPD;
+ case ARM_AM::db: return ARM::LDMDB_UPD;
+ }
+ case ARM::STMIA: // ARM-mode STM: same scheme as LDM above.
+ case ARM::STMDA:
+ case ARM::STMDB:
+ case ARM::STMIB:
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::STMIA_UPD;
+ case ARM_AM::ib: return ARM::STMIB_UPD;
+ case ARM_AM::da: return ARM::STMDA_UPD;
+ case ARM_AM::db: return ARM::STMDB_UPD;
+ }
+ case ARM::t2LDMIA: // Thumb2 forms: only ia and db are handled.
+ case ARM::t2LDMDB:
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::t2LDMIA_UPD;
+ case ARM_AM::db: return ARM::t2LDMDB_UPD;
+ }
+ case ARM::t2STMIA:
+ case ARM::t2STMDB:
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::t2STMIA_UPD;
+ case ARM_AM::db: return ARM::t2STMDB_UPD;
+ }
+ case ARM::VLDMSIA: // VFP forms: the IA opcode is the only input; ia and db are the handled modes.
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::VLDMSIA_UPD;
+ case ARM_AM::db: return ARM::VLDMSDB_UPD;
+ }
+ case ARM::VLDMDIA:
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::VLDMDIA_UPD;
+ case ARM_AM::db: return ARM::VLDMDDB_UPD;
+ }
+ case ARM::VSTMSIA:
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::VSTMSIA_UPD;
+ case ARM_AM::db: return ARM::VSTMSDB_UPD;
+ }
+ case ARM::VSTMDIA:
+ switch (Mode) {
+ default: llvm_unreachable("Unhandled submode!");
+ case ARM_AM::ia: return ARM::VSTMDIA_UPD;
+ case ARM_AM::db: return ARM::VSTMDDB_UPD;
+ }
+ }
+}
+
+/// Check if the given instruction increments or decrements a register and
+/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
+/// generated by the instruction are possibly read as well. Also returns 0 if \p MI is not one of the add/sub-immediate opcodes listed below, if it does not both define and read \p Reg, or if its predicate differs from \p Pred / \p PredReg.
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
+ ARMCC::CondCodes Pred, unsigned PredReg) {
+ bool CheckCPSRDef;
+ int Scale; // Signed multiplier applied to the immediate: negative for subtracts.
+ switch (MI.getOpcode()) {
+ case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break;
+ case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break;
+ case ARM::t2SUBri:
+ case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break;
+ case ARM::t2ADDri:
+ case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break;
+ case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break;
+ case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
+ default: return 0;
+ }
+
+ unsigned MIPredReg;
+ if (MI.getOperand(0).getReg() != Reg || // Destination must be Reg ...
+ MI.getOperand(1).getReg() != Reg || // ... and so must the source (an in-place update).
+ getInstrPredicate(MI, MIPredReg) != Pred ||
+ MIPredReg != PredReg)
+ return 0;
+
+ if (CheckCPSRDef && definesCPSR(MI)) // Reject flag-setting forms for the opcodes that request the check.
+ return 0;
+ return MI.getOperand(2).getImm() * Scale;
+}
+
+/// Searches for an increment or decrement of \p Reg before \p MBBI. Only the closest preceding instruction (debug values skipped) is examined. On a match, \p Offset receives the signed amount and the matching iterator is returned; otherwise \p Offset is 0 and the block's end() is returned.
+static MachineBasicBlock::iterator
+findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
+ ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+ Offset = 0;
+ MachineBasicBlock &MBB = *MBBI->getParent();
+ MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+ MachineBasicBlock::iterator EndMBBI = MBB.end();
+ if (MBBI == BeginMBBI) // Nothing precedes the first instruction.
+ return EndMBBI;
+
+ // Skip debug values, stopping at the first instruction of the block.
+ MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+ while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
+ --PrevMBBI;
+
+ Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
+ return Offset == 0 ? EndMBBI : PrevMBBI;
+}
+
+/// Searches for an increment or decrement of \p Reg after \p MBBI. Only the closest following instruction (debug values skipped) is examined. On a match, \p Offset receives the signed amount and the matching iterator is returned; otherwise \p Offset is 0 and the block's end() is returned.
+static MachineBasicBlock::iterator
+findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
+ ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+ Offset = 0;
+ MachineBasicBlock &MBB = *MBBI->getParent();
+ MachineBasicBlock::iterator EndMBBI = MBB.end();
+ MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+ // Skip debug values.
+ while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+ ++NextMBBI;
+ if (NextMBBI == EndMBBI) // Only debug values (or nothing) follow MBBI.
+ return EndMBBI;
+
+ Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
+ return Offset == 0 ? EndMBBI : NextMBBI;
+}
+
+/// Fold preceding/trailing inc/dec of base register into the
+/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
+///
+/// stmia rn, <ra, rb, rc>
+/// rn := rn + 4 * 3;
+/// =>
+/// stmia rn!, <ra, rb, rc>
+///
+/// rn := rn - 4 * 3;
+/// ldmia rn, <ra, rb, rc>
+/// =>
+/// ldmdb rn!, <ra, rb, rc>
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { // Returns true if MI was replaced by an updating form (MI is erased in that case).
+ // Thumb1 is already using updating loads/stores.
+ if (isThumb1) return false;
+
+ const MachineOperand &BaseOP = MI->getOperand(0);
+ unsigned Base = BaseOP.getReg();
+ bool BaseKill = BaseOP.isKill();
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+ unsigned Opcode = MI->getOpcode();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Can't use an updating ld/st if the base register is also a dest
+ // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
+ for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
+ if (MI->getOperand(i).getReg() == Base)
+ return false;
+
+ int Bytes = getLSMultipleTransferSize(MI);
+ MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock::iterator MBBI(MI);
+ int Offset;
+ MachineBasicBlock::iterator MergeInstr
+ = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+ ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
+ if (Mode == ARM_AM::ia && Offset == -Bytes) { // A preceding decrement by the transfer size turns ia into db ...
+ Mode = ARM_AM::db;
+ } else if (Mode == ARM_AM::ib && Offset == -Bytes) { // ... and ib into da.
+ Mode = ARM_AM::da;
+ } else {
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) && // Not an incrementing mode followed by +Bytes ...
+ ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) { // ... nor a decrementing mode followed by -Bytes.
+
+ // We couldn't find an inc/dec to merge. But if the base is dead, we
+ // can still change to a writeback form as that will save us 2 bytes
+ // of code size. It can create WAW hazards though, so only do it if
+ // we're minimizing code size.
+ if (!MBB.getParent()->getFunction()->optForMinSize() || !BaseKill)
+ return false;
+
+ bool HighRegsUsed = false;
+ for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
+ if (MI->getOperand(i).getReg() >= ARM::R8) {
+ HighRegsUsed = true;
+ break;
+ }
+
+ if (!HighRegsUsed)
+ MergeInstr = MBB.end(); // Nothing to erase: only switch to the writeback form.
+ else
+ return false;
+ }
+ }
+ if (MergeInstr != MBB.end())
+ MBB.erase(MergeInstr); // The add/sub is folded into the writeback, so drop it.
+
+ unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
+ .addReg(Base, getDefRegState(true)) // WB base register
+ .addReg(Base, getKillRegState(BaseKill))
+ .addImm(Pred).addReg(PredReg);
+
+ // Transfer the rest of operands.
+ for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
+ MIB.addOperand(MI->getOperand(OpNum));
+
+ // Transfer memoperands.
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ MBB.erase(MBBI); // Remove the original non-updating instruction.
+ return true;
+}
+
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc, // Maps a single load/store opcode to the opcode used when a base update is folded in before the access.
+ ARM_AM::AddrOpc Mode) { // Mode (add/sub) only affects the VFP cases.
+ switch (Opc) {
+ case ARM::LDRi12:
+ return ARM::LDR_PRE_IMM;
+ case ARM::STRi12:
+ return ARM::STR_PRE_IMM;
+ case ARM::VLDRS: // VFP loads/stores map to updating load/store-multiple opcodes.
+ return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+ case ARM::VLDRD:
+ return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+ case ARM::VSTRS:
+ return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+ case ARM::VSTRD:
+ return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ return ARM::t2LDR_PRE;
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ return ARM::t2STR_PRE;
+ default: llvm_unreachable("Unhandled opcode!");
+ }
+}
+
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc, // Maps a single load/store opcode to the opcode used when a base update is folded in after the access.
+ ARM_AM::AddrOpc Mode) { // Mode (add/sub) only affects the VFP cases.
+ switch (Opc) {
+ case ARM::LDRi12:
+ return ARM::LDR_POST_IMM;
+ case ARM::STRi12:
+ return ARM::STR_POST_IMM;
+ case ARM::VLDRS: // The VFP cases return the same opcodes as getPreIndexedLoadStoreOpcode.
+ return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+ case ARM::VLDRD:
+ return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+ case ARM::VSTRS:
+ return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+ case ARM::VSTRD:
+ return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ return ARM::t2LDR_POST;
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ return ARM::t2STR_POST;
+ default: llvm_unreachable("Unhandled opcode!");
+ }
+}
+
+/// Fold preceding/trailing inc/dec of base register into the
+/// LDR/STR/FLD{D|S}/FST{D|S} op when possible. Returns true if \p MI was replaced by a pre-/post-indexed (or single-register updating multiple) form; \p MI and the folded add/sub are erased in that case.
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
+ // Thumb1 doesn't have updating LDR/STR.
+ // FIXME: Use LDM/STM with single register instead.
+ if (isThumb1) return false;
+
+ unsigned Base = getLoadStoreBaseOp(*MI).getReg();
+ bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
+ unsigned Opcode = MI->getOpcode();
+ DebugLoc DL = MI->getDebugLoc();
+ bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
+ Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
+ bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
+ if (isi32Load(Opcode) || isi32Store(Opcode))
+ if (MI->getOperand(2).getImm() != 0) // Only accesses with a zero immediate offset are folded.
+ return false;
+ if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
+ return false;
+
+ // Can't do the merge if the destination register is the same as the would-be
+ // writeback register.
+ if (MI->getOperand(0).getReg() == Base)
+ return false;
+
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+ int Bytes = getLSMultipleTransferSize(MI);
+ MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock::iterator MBBI(MI);
+ int Offset;
+ MachineBasicBlock::iterator MergeInstr
+ = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+ unsigned NewOpc;
+ if (!isAM5 && Offset == Bytes) { // Preceding increment: pre-indexed add (not used for the VFP opcodes).
+ NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+ } else if (Offset == -Bytes) { // Preceding decrement: pre-indexed sub.
+ NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+ } else {
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ if (Offset == Bytes) { // Trailing increment: post-indexed add.
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+ } else if (!isAM5 && Offset == -Bytes) { // Trailing decrement: post-indexed sub (not used for the VFP opcodes).
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+ } else
+ return false;
+ }
+ MBB.erase(MergeInstr); // Every path reaching here found a matching add/sub, so MergeInstr is valid.
+
+ ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+
+ bool isLd = isLoadSingle(Opcode);
+ if (isAM5) {
+ // VLDM[SD]_UPD, VSTM[SD]_UPD
+ // (There are no base-updating versions of VLDR/VSTR instructions, but the
+ // updating load/store-multiple instructions can be used with only one
+ // register.)
+ MachineOperand &MO = MI->getOperand(0);
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
+ .addReg(Base, getDefRegState(true)) // WB base register
+ .addReg(Base, getKillRegState(isLd ? BaseKill : false))
+ .addImm(Pred).addReg(PredReg)
+ .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
+ getKillRegState(MO.isKill())));
+ } else if (isLd) {
+ if (isAM2) {
+ // LDR_PRE, LDR_POST
+ if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { // NOTE(review): the opcode helpers in this chunk never return LDRB_PRE_IMM.
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ } else {
+ int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+ }
+ } else {
+ // t2LDR_PRE, t2LDR_POST
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ }
+ } else {
+ MachineOperand &MO = MI->getOperand(0);
+ // FIXME: post-indexed stores use am2offset_imm, which still encodes
+ // the vestigial zero-reg offset register. When that's fixed, this clause
+ // can be removed entirely.
+ if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
+ int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ // STR_POST (STR_PRE takes the else branch below)
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+ } else {
+ // STR_PRE, t2STR_PRE, t2STR_POST
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ }
+ }
+ MBB.erase(MBBI); // Remove the original load/store now that its replacement is in place.
+
+ return true;
+}
+
+bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { // Folds an adjacent +/-8 update of the base into a t2LDRDi8/t2STRDi8, producing the _PRE/_POST form; returns true if MI was replaced.
+ unsigned Opcode = MI.getOpcode();
+ assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
+ "Must have t2STRDi8 or t2LDRDi8");
+ if (MI.getOperand(3).getImm() != 0) // Only accesses with a zero immediate offset are folded.
+ return false;
+
+ // Behaviour for writeback is undefined if base register is the same as one
+ // of the others.
+ const MachineOperand &BaseOp = MI.getOperand(2);
+ unsigned Base = BaseOp.getReg();
+ const MachineOperand &Reg0Op = MI.getOperand(0);
+ const MachineOperand &Reg1Op = MI.getOperand(1);
+ if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
+ return false;
+
+ unsigned PredReg;
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ MachineBasicBlock::iterator MBBI(MI);
+ MachineBasicBlock &MBB = *MI.getParent();
+ int Offset;
+ MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
+ PredReg, Offset);
+ unsigned NewOpc;
+ if (Offset == 8 || Offset == -8) { // Preceding update by 8 bytes: pre-indexed form.
+ NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
+ } else {
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ if (Offset == 8 || Offset == -8) { // Trailing update by 8 bytes: post-indexed form.
+ NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
+ } else
+ return false;
+ }
+ MBB.erase(MergeInstr); // The add/sub is folded into the writeback, so drop it.
+
+ DebugLoc DL = MI.getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+ if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
+ MIB.addOperand(Reg0Op).addOperand(Reg1Op) // Loads: the two loaded registers come first, then the written-back base.
+ .addReg(BaseOp.getReg(), RegState::Define);
+ } else {
+ assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
+ MIB.addReg(BaseOp.getReg(), RegState::Define) // Stores: the written-back base comes first, then the two stored registers.
+ .addOperand(Reg0Op).addOperand(Reg1Op);
+ }
+ MIB.addReg(BaseOp.getReg(), RegState::Kill) // The base use is always marked killed here.
+ .addImm(Offset).addImm(Pred).addReg(PredReg);
+ assert(TII->get(Opcode).getNumOperands() == 6 &&
+ TII->get(NewOpc).getNumOperands() == 7 &&
+ "Unexpected number of operands in Opcode specification.");
+
+ // Transfer implicit operands.
+ for (const MachineOperand &MO : MI.implicit_operands())
+ MIB.addOperand(MO);
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+ MBB.erase(MBBI); // Remove the original instruction.
+ return true;
+}
+
+/// Returns true if instruction is a memory operation that this pass is capable
+/// of operating on: one of the single load/store opcodes below, with exactly one non-volatile memory operand aligned to at least 4, and neither the transferred register nor the base marked undef.
+static bool isMemoryOp(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case ARM::VLDRS:
+ case ARM::VSTRS:
+ case ARM::VLDRD:
+ case ARM::VSTRD:
+ case ARM::LDRi12:
+ case ARM::STRi12:
+ case ARM::tLDRi:
+ case ARM::tSTRi:
+ case ARM::tLDRspi:
+ case ARM::tSTRspi:
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ break;
+ default:
+ return false;
+ }
+ if (!MI.getOperand(1).isReg()) // Operand 1 (the base) must be a register operand.
+ return false;
+
+ // When no memory operands are present, conservatively assume unaligned,
+ // volatile, unfoldable. (More than one memory operand is rejected as well.)
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand &MMO = **MI.memoperands_begin();
+
+ // Don't touch volatile memory accesses - we may be changing their order.
+ if (MMO.isVolatile())
+ return false;
+
+ // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
+ // not.
+ if (MMO.getAlignment() < 4)
+ return false;
+
+ // str <undef> could probably be eliminated entirely, but for now we just want
+ // to avoid making a mess of it.
+ // FIXME: Use str <undef> as a wildcard to enable better stm folding.
+ if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef())
+ return false;
+
+ // Likewise don't mess with references to undefined addresses.
+ if (MI.getOperand(1).isUndef())
+ return false;
+
+ return true;
+}
+
+static void InsertLDR_STR(MachineBasicBlock &MBB, // Inserts one single load (isDef) or store of Reg at [BaseReg, #Offset] before MBBI using opcode NewOpc.
+ MachineBasicBlock::iterator &MBBI, int Offset,
+ bool isDef, const DebugLoc &DL, unsigned NewOpc,
+ unsigned Reg, bool RegDeadKill, bool RegUndef,
+ unsigned BaseReg, bool BaseKill, bool BaseUndef,
+ bool OffKill, bool OffUndef, ARMCC::CondCodes Pred,
+ unsigned PredReg, const TargetInstrInfo *TII,
+ bool isT2) { // NOTE(review): DL, OffKill, OffUndef and isT2 are not used in the body; the debug location is taken from MBBI instead.
+ if (isDef) {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+ TII->get(NewOpc))
+ .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill)) // Load: RegDeadKill is interpreted as "dead"; RegUndef is ignored.
+ .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+ } else {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+ TII->get(NewOpc))
+ .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef)) // Store: RegDeadKill is interpreted as "kill".
+ .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+ }
+}
+
+bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, // Rewrites an LDRD/STRD/t2LDRDi8 at MBBI that has an invalid register pair (or hits errata 602117) into an LDM/STM or two single loads/stores.
+ MachineBasicBlock::iterator &MBBI) { // On success MBBI is advanced past the erased instruction and true is returned.
+ MachineInstr *MI = &*MBBI;
+ unsigned Opcode = MI->getOpcode();
+ if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8) // NOTE(review): t2STRDi8 is filtered out here, so the t2STRDi8 checks below cannot be true.
+ return false;
+
+ const MachineOperand &BaseOp = MI->getOperand(2);
+ unsigned BaseReg = BaseOp.getReg();
+ unsigned EvenReg = MI->getOperand(0).getReg();
+ unsigned OddReg = MI->getOperand(1).getReg();
+ unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
+ unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
+
+ // ARM errata 602117: LDRD with base in list may result in incorrect base
+ // register when interrupted or faulted.
+ bool Errata602117 = EvenReg == BaseReg &&
+ (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3();
+ // ARM LDRD/STRD needs consecutive registers.
+ bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) &&
+ (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum); // First register must be even-numbered and the second its immediate successor.
+
+ if (!Errata602117 && !NonConsecutiveRegs) // The instruction is fine as it is.
+ return false;
+
+ bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
+ bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
+ bool EvenDeadKill = isLd ?
+ MI->getOperand(0).isDead() : MI->getOperand(0).isKill(); // "Dead" flag for loads, "kill" flag for stores.
+ bool EvenUndef = MI->getOperand(0).isUndef();
+ bool OddDeadKill = isLd ?
+ MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
+ bool OddUndef = MI->getOperand(1).isUndef();
+ bool BaseKill = BaseOp.isKill();
+ bool BaseUndef = BaseOp.isUndef();
+ bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
+ bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
+ int OffImm = getMemoryOpOffset(*MI);
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+
+ if (OddRegNum > EvenRegNum && OffImm == 0) {
+ // Ascending register numbers and no offset. It's safe to change it to a
+ // ldm or stm.
+ unsigned NewOpc = (isLd)
+ ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
+ : (isT2 ? ARM::t2STMIA : ARM::STMIA);
+ if (isLd) {
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+ .addReg(BaseReg, getKillRegState(BaseKill))
+ .addImm(Pred).addReg(PredReg)
+ .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
+ .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
+ ++NumLDRD2LDM;
+ } else {
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+ .addReg(BaseReg, getKillRegState(BaseKill))
+ .addImm(Pred).addReg(PredReg)
+ .addReg(EvenReg,
+ getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
+ .addReg(OddReg,
+ getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
+ ++NumSTRD2STM;
+ }
+ } else {
+ // Split into two instructions.
+ unsigned NewOpc = (isLd)
+ ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
+ : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
+ // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
+ // so adjust and use t2LDRi12 here for that.
+ unsigned NewOpc2 = (isLd)
+ ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
+ : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); // Opcode for the second word at OffImm+4.
+ DebugLoc dl = MBBI->getDebugLoc();
+ // If this is a load and base register is killed, it may have been
+ // re-defed by the load, make sure the first load does not clobber it.
+ if (isLd &&
+ (BaseKill || OffKill) &&
+ (TRI->regsOverlap(EvenReg, BaseReg))) {
+ assert(!TRI->regsOverlap(OddReg, BaseReg));
+ InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, // Load the second word first, while the base is still intact.
+ OddReg, OddDeadKill, false,
+ BaseReg, false, BaseUndef, false, OffUndef,
+ Pred, PredReg, TII, isT2);
+ InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+ EvenReg, EvenDeadKill, false,
+ BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
+ Pred, PredReg, TII, isT2);
+ } else {
+ if (OddReg == EvenReg && EvenDeadKill) {
+ // If the two source operands are the same, the kill marker is
+ // probably on the first one. e.g.
+ // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
+ EvenDeadKill = false;
+ OddDeadKill = true;
+ }
+ // Never kill the base register in the first instruction.
+ if (EvenReg == BaseReg)
+ EvenDeadKill = false;
+ InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, // First word; the base kill flag is deferred to the second access.
+ EvenReg, EvenDeadKill, EvenUndef,
+ BaseReg, false, BaseUndef, false, OffUndef,
+ Pred, PredReg, TII, isT2);
+ InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
+ OddReg, OddDeadKill, OddUndef,
+ BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
+ Pred, PredReg, TII, isT2);
+ }
+ if (isLd)
+ ++NumLDRD2LDR;
+ else
+ ++NumSTRD2STR;
+ }
+
+ MBBI = MBB.erase(MBBI); // Drop the original instruction; MBBI now refers to whatever followed it.
+ return true;
+}
+
+/// An optimization pass to turn multiple LDR / STR ops of the same base and
+/// incrementing offset into LDM / STM ops.
+///
+/// The block is walked from its end towards its beginning. Memory ops that
+/// share opcode, base register and predicate are collected into a chain
+/// (MemOps, kept sorted by ascending offset). Whenever the chain is broken it
+/// is handed to FormCandidates(). Afterwards all candidates are sorted by
+/// InsertPos and merged, and base-register updates are folded where possible.
+///
+/// \returns true if a merge or a load/store base-update fold reported a
+/// change.
+/// NOTE(review): the results of FixInvalidRegPairOp() and of the
+/// MergeBaseUpdateLSDouble() calls are not folded into the return value --
+/// confirm this is intended.
+bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
+ MemOpQueue MemOps;
+ // State of the chain currently being built; CurrBase == 0 means "no chain".
+ unsigned CurrBase = 0;
+ unsigned CurrOpc = ~0u;
+ ARMCC::CondCodes CurrPred = ARMCC::AL;
+ // Counts the instructions visited so far by the reverse walk below.
+ unsigned Position = 0;
+ assert(Candidates.size() == 0);
+ assert(MergeBaseCandidates.size() == 0);
+ LiveRegsValid = false;
+
+ for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
+ I = MBBI) {
+ // The instruction in front of the iterator is the one we look at.
+ MBBI = std::prev(I);
+ // If the instruction was rewritten, MBBI has been updated by the callee;
+ // resume the walk from there.
+ if (FixInvalidRegPairOp(MBB, MBBI))
+ continue;
+ ++Position;
+
+ if (isMemoryOp(*MBBI)) {
+ unsigned Opcode = MBBI->getOpcode();
+ const MachineOperand &MO = MBBI->getOperand(0);
+ unsigned Reg = MO.getReg();
+ unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
+ int Offset = getMemoryOpOffset(*MBBI);
+ if (CurrBase == 0) {
+ // Start of a new chain.
+ CurrBase = Base;
+ CurrOpc = Opcode;
+ CurrPred = Pred;
+ MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
+ continue;
+ }
+ // Note: No need to match PredReg in the next if.
+ if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+ // Watch out for:
+ // r4 := ldr [r0, #8]
+ // r4 := ldr [r0, #4]
+ // or
+ // r0 := ldr [r0]
+ // If a load overrides the base register or a register loaded by
+ // another load in our chain, we cannot take this instruction.
+ bool Overlap = false;
+ if (isLoadSingle(Opcode)) {
+ Overlap = (Base == Reg);
+ if (!Overlap) {
+ for (const MemOpQueueEntry &E : MemOps) {
+ if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
+ Overlap = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!Overlap) {
+ // Check offset and sort memory operation into the current chain.
+ if (Offset > MemOps.back().Offset) {
+ // Larger than every offset seen so far: append.
+ MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
+ continue;
+ } else {
+ // Find the first entry with a larger offset and insert before it.
+ MemOpQueue::iterator MI, ME;
+ for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
+ if (Offset < MI->Offset) {
+ // Found a place to insert.
+ break;
+ }
+ if (Offset == MI->Offset) {
+ // Collision, abort.
+ MI = ME;
+ break;
+ }
+ }
+ if (MI != MemOps.end()) {
+ MemOps.insert(MI, MemOpQueueEntry(*MBBI, Offset, Position));
+ continue;
+ }
+ }
+ }
+ }
+
+ // Don't advance the iterator; The op will start a new chain next.
+ MBBI = I;
+ --Position;
+ // Fallthrough to look into existing chain.
+ } else if (MBBI->isDebugValue()) {
+ // Debug values neither extend nor break a chain.
+ continue;
+ } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
+ MBBI->getOpcode() == ARM::t2STRDi8) {
+ // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions
+ // remember them because we may still be able to merge add/sub into them.
+ MergeBaseCandidates.push_back(&*MBBI);
+ }
+
+
+ // If we are here then the chain is broken; Extract candidates for a merge.
+ if (MemOps.size() > 0) {
+ FormCandidates(MemOps);
+ // Reset for the next chain.
+ CurrBase = 0;
+ CurrOpc = ~0u;
+ CurrPred = ARMCC::AL;
+ MemOps.clear();
+ }
+ }
+ // Flush the chain that was still open when the start of the block was
+ // reached.
+ if (MemOps.size() > 0)
+ FormCandidates(MemOps);
+
+ // Sort candidates so they get processed from end to begin of the basic
+ // block later; This is necessary for liveness calculation.
+ auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
+ return M0->InsertPos < M1->InsertPos;
+ };
+ std::sort(Candidates.begin(), Candidates.end(), LessThan);
+
+ // Go through list of candidates and merge.
+ bool Changed = false;
+ for (const MergeCandidate *Candidate : Candidates) {
+ if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
+ MachineInstr *Merged = MergeOpsUpdate(*Candidate);
+ // Merge preceding/trailing base inc/dec into the merged op.
+ if (Merged) {
+ Changed = true;
+ unsigned Opcode = Merged->getOpcode();
+ if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
+ MergeBaseUpdateLSDouble(*Merged);
+ else
+ MergeBaseUpdateLSMultiple(Merged);
+ } else {
+ // The merge failed; still try to fold a base update into each of the
+ // individual loads / stores.
+ for (MachineInstr *MI : Candidate->Instrs) {
+ if (MergeBaseUpdateLoadStore(MI))
+ Changed = true;
+ }
+ }
+ } else {
+ // A candidate that cannot be merged holds exactly one instruction.
+ assert(Candidate->Instrs.size() == 1);
+ if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
+ Changed = true;
+ }
+ }
+ Candidates.clear();
+ // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
+ for (MachineInstr *MI : MergeBaseCandidates)
+ MergeBaseUpdateLSDouble(*MI);
+ MergeBaseCandidates.clear();
+
+ return Changed;
+}
+
+/// If this is an exit BB, try merging the return ops ("bx lr" and
+/// "mov pc, lr") into the preceding stack restore so that it directly
+/// restores the value of LR into pc.
+/// ldmfd sp!, {..., lr}
+/// bx lr
+/// or
+/// ldmfd sp!, {..., lr}
+/// mov pc, lr
+/// =>
+/// ldmfd sp!, {..., pc}
+///
+/// \returns true if the return was folded into the load-multiple.
+bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+ // Thumb1 LDM doesn't allow high registers.
+ if (isThumb1) return false;
+ if (MBB.empty()) return false;
+
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ // The return must exist and must have at least one instruction before it.
+ if (MBBI != MBB.begin() && MBBI != MBB.end() &&
+ (MBBI->getOpcode() == ARM::BX_RET ||
+ MBBI->getOpcode() == ARM::tBX_RET ||
+ MBBI->getOpcode() == ARM::MOVPCLR)) {
+ MachineBasicBlock::iterator PrevI = std::prev(MBBI);
+ // Ignore any DBG_VALUE instructions.
+ while (PrevI->isDebugValue() && PrevI != MBB.begin())
+ --PrevI;
+ MachineInstr &PrevMI = *PrevI;
+ unsigned Opcode = PrevMI.getOpcode();
+ if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
+ Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
+ Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
+ // Only fold when the last operand of the load-multiple is LR.
+ MachineOperand &MO = PrevMI.getOperand(PrevMI.getNumOperands() - 1);
+ if (MO.getReg() != ARM::LR)
+ return false;
+ unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
+ // Although several addressing variants are matched above, only the
+ // increment-after forms have a *_RET counterpart here.
+ assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
+ Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
+ // Rewrite the load in place: new opcode, load into PC instead of LR, take
+ // over the return's implicit operands, then drop the return itself.
+ PrevMI.setDesc(TII->get(NewOpc));
+ MO.setReg(ARM::PC);
+ PrevMI.copyImplicitOps(*MBB.getParent(), *MBBI);
+ MBB.erase(MBBI);
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Fold "tMOVr lr, rN; tBX_RET" at the end of \p MBB into a single "tBX rN".
+///
+/// \returns true if the pair was replaced, false if the block does not end in
+/// that pattern.
+bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
+  MachineBasicBlock::iterator RetI = MBB.getFirstTerminator();
+  // We need a tBX_RET terminator that has an instruction in front of it.
+  const bool HasPrecedingInstr = RetI != MBB.begin() && RetI != MBB.end();
+  if (!HasPrecedingInstr || RetI->getOpcode() != ARM::tBX_RET)
+    return false;
+
+  MachineBasicBlock::iterator MovI = std::prev(RetI);
+  const bool IsMovIntoLR =
+      MovI->getOpcode() == ARM::tMOVr && MovI->definesRegister(ARM::LR);
+  if (!IsMovIntoLR)
+    return false;
+
+  // Branch through the first use operand of the move that carries a kill
+  // flag.
+  for (const MachineOperand &Src : MovI->uses()) {
+    if (!Src.isKill())
+      continue;
+
+    const unsigned TargetReg = Src.getReg();
+    AddDefaultPred(BuildMI(MBB, RetI, RetI->getDebugLoc(), TII->get(ARM::tBX))
+                       .addReg(TargetReg, RegState::Kill))
+        .copyImplicitOps(*RetI);
+    MBB.erase(RetI);
+    MBB.erase(MovI);
+    return true;
+  }
+
+  llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?");
+}
+
+/// Pass entry point: caches the per-function target hooks and then runs the
+/// load/store optimizations on every basic block of \p Fn.
+///
+/// \returns true if any block was modified.
+bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  // Cache everything the per-block helpers read through member variables.
+  MF = &Fn;
+  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+  TL = STI->getTargetLowering();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+
+  RegClassInfoValid = false;
+  isThumb2 = AFI->isThumb2Function();
+  isThumb1 = AFI->isThumbFunction() && !isThumb2;
+
+  bool Modified = false;
+  for (MachineBasicBlock &MBB : Fn) {
+    if (LoadStoreMultipleOpti(MBB))
+      Modified = true;
+    // Folding the return into an LDM is only attempted on v5T and later.
+    if (STI->hasV5TOps() && MergeReturnIntoLDM(MBB))
+      Modified = true;
+    if (isThumb1 && CombineMovBx(MBB))
+      Modified = true;
+  }
+
+  // Release everything allocated from the pass-local allocator.
+  Allocator.DestroyAll();
+  return Modified;
+}
+
+#define ARM_PREALLOC_LOAD_STORE_OPT_NAME \
+ "ARM pre- register allocation load / store optimization pass"
+
+namespace {
+  /// Pre- register allocation pass that moves loads / stores from consecutive
+  /// locations close together to make it more likely they will be combined
+  /// later.
+  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
+    static char ID;
+    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
+
+    // Per-function state; (re)assigned at the top of runOnMachineFunction().
+    const DataLayout *TD = nullptr;
+    const TargetInstrInfo *TII = nullptr;
+    const TargetRegisterInfo *TRI = nullptr;
+    const ARMSubtarget *STI = nullptr;
+    MachineRegisterInfo *MRI = nullptr;
+    MachineFunction *MF = nullptr;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    StringRef getPassName() const override {
+      return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
+    }
+
+  private:
+    /// Checks whether \p Op0 and \p Op1 can be combined into one double-word
+    /// load / store and, if so, fills in the out-parameters describing it.
+    /// The parameter names match the out-of-line definition: FirstReg and
+    /// SecondReg are operand 0 of Op0 and Op1 respectively.
+    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
+                          unsigned &NewOpc, unsigned &FirstReg,
+                          unsigned &SecondReg, unsigned &BaseReg,
+                          int &Offset,
+                          unsigned &PredReg, ARMCC::CondCodes &Pred,
+                          bool &isT2);
+    /// Tries to move the loads (isLd) or stores in \p Ops, which all use
+    /// \p Base, next to each other. \p MI2LocMap gives each instruction's
+    /// position within the block.
+    bool RescheduleOps(MachineBasicBlock *MBB,
+                       SmallVectorImpl<MachineInstr *> &Ops,
+                       unsigned Base, bool isLd,
+                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
+    /// Groups the loads / stores of \p MBB by base register and reschedules
+    /// each group.
+    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+  };
+  char ARMPreAllocLoadStoreOpt::ID = 0;
+}
+
+INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+ ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+
+/// Pass entry point: caches the per-function target hooks and reschedules the
+/// loads / stores of every basic block of \p Fn.
+///
+/// \returns true if any block was modified.
+bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  // Nothing to do when misaligned accesses are assumed or the function is
+  // to be skipped.
+  if (AssumeMisalignedLoadStores)
+    return false;
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  MF = &Fn;
+  TD = &Fn.getDataLayout();
+  MRI = &Fn.getRegInfo();
+  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+
+  bool Modified = false;
+  for (MachineBasicBlock &MBB : Fn) {
+    if (RescheduleLoadStoreInstrs(&MBB))
+      Modified = true;
+  }
+
+  return Modified;
+}
+
+/// Decide whether the memory ops in \p MemOps can be moved together across
+/// the instructions strictly between \p I and \p E.
+///
+/// \param isLd    true when the ops being moved are loads, false for stores.
+/// \param Base    base register shared by the ops being moved.
+/// \param MemOps  the instructions being moved; they are ignored by the scan.
+/// \param MemRegs operand-0 registers of the instructions being moved.
+/// \returns false if an intervening instruction makes the move unsafe, or if
+/// the estimated register pressure increase is too large.
+static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
+                                      MachineBasicBlock::iterator I,
+                                      MachineBasicBlock::iterator E,
+                                      SmallPtrSetImpl<MachineInstr*> &MemOps,
+                                      SmallSet<unsigned, 4> &MemRegs,
+                                      const TargetRegisterInfo *TRI) {
+  // Registers touched by the intervening instructions that are neither the
+  // base nor one of the moved ops' registers.
+  SmallSet<unsigned, 4> ExtraRegs;
+
+  // Are there stores / loads / calls between them?
+  // FIXME: This is overly conservative. We should make use of alias information
+  // some day.
+  for (++I; I != E; ++I) {
+    if (I->isDebugValue() || MemOps.count(&*I))
+      continue;
+    if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
+      return false;
+
+    if (isLd) {
+      if (I->mayStore())
+        return false;
+    } else if (I->mayLoad() || I->mayStore()) {
+      // Besides loads, other stores also block moving stores: it's not safe
+      // to move the first 'str' down.
+      //   str r1, [r0]
+      //   strh r5, [r0]
+      //   str r4, [r0, #+4]
+      return false;
+    }
+
+    for (const MachineOperand &MO : I->operands()) {
+      if (!MO.isReg())
+        continue;
+      const unsigned Reg = MO.getReg();
+      // A redefinition of (part of) the base register blocks the move.
+      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
+        return false;
+      if (Reg != Base && !MemRegs.count(Reg))
+        ExtraRegs.insert(Reg);
+    }
+  }
+
+  // Estimate register pressure increase due to the transformation. Moving a
+  // small number of instructions is always considered OK.
+  return MemRegs.size() <= 4 || ExtraRegs.size() <= MemRegs.size() * 2;
+}
+
+/// Decide whether the single loads / stores \p Op0 and \p Op1 can be replaced
+/// by one LDRD / STRD (ARM) or t2LDRDi8 / t2STRDi8 (Thumb2) instruction.
+///
+/// Opcode, memory operand (volatility, alignment), offset, base register,
+/// predicate and debug location are all taken from \p Op0; \p Op1 only
+/// supplies the second register.
+///
+/// On success the out-parameters describe the instruction to build:
+/// \p NewOpc, \p FirstReg / \p SecondReg (operand 0 of Op0 / Op1),
+/// \p BaseReg, \p Offset (the plain immediate for Thumb2, the AM3-encoded
+/// value for ARM), \p Pred / \p PredReg and \p dl. Note that \p isT2 is only
+/// ever set to true here, so the caller has to initialise it to false, and
+/// that some out-parameters may already have been written when false is
+/// returned.
+///
+/// \returns true if the pair can be formed.
+bool
+ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
+ DebugLoc &dl, unsigned &NewOpc,
+ unsigned &FirstReg,
+ unsigned &SecondReg,
+ unsigned &BaseReg, int &Offset,
+ unsigned &PredReg,
+ ARMCC::CondCodes &Pred,
+ bool &isT2) {
+ // Make sure we're allowed to generate LDRD/STRD.
+ if (!STI->hasV5TEOps())
+ return false;
+
+ // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
+ // Scale is the granularity of the immediate offset of the new instruction.
+ unsigned Scale = 1;
+ unsigned Opcode = Op0->getOpcode();
+ if (Opcode == ARM::LDRi12) {
+ NewOpc = ARM::LDRD;
+ } else if (Opcode == ARM::STRi12) {
+ NewOpc = ARM::STRD;
+ } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
+ NewOpc = ARM::t2LDRDi8;
+ Scale = 4;
+ isT2 = true;
+ } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
+ NewOpc = ARM::t2STRDi8;
+ Scale = 4;
+ isT2 = true;
+ } else {
+ return false;
+ }
+
+ // Make sure the base address satisfies i64 ld / st alignment requirement.
+ // At the moment, we ignore the memoryoperand's value.
+ // If we want to use AliasAnalysis, we should check it accordingly.
+ if (!Op0->hasOneMemOperand() ||
+ (*Op0->memoperands_begin())->isVolatile())
+ return false;
+
+ unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+ const Function *Func = MF->getFunction();
+ unsigned ReqAlign = STI->hasV6Ops()
+ ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
+ : 8; // Pre-v6 need 8-byte align
+ if (Align < ReqAlign)
+ return false;
+
+ // Then make sure the immediate offset fits.
+ int OffImm = getMemoryOpOffset(*Op0);
+ if (isT2) {
+ // Thumb2: signed offset, must be a multiple of Scale and lie strictly
+ // between -Limit and Limit.
+ int Limit = (1 << 8) * Scale;
+ if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
+ return false;
+ Offset = OffImm;
+ } else {
+ // ARM: the offset is encoded as an add/sub flag plus a magnitude.
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (OffImm < 0) {
+ AddSub = ARM_AM::sub;
+ OffImm = - OffImm;
+ }
+ int Limit = (1 << 8) * Scale;
+ if (OffImm >= Limit || (OffImm & (Scale-1)))
+ return false;
+ Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
+ }
+ FirstReg = Op0->getOperand(0).getReg();
+ SecondReg = Op1->getOperand(0).getReg();
+ // Both ops must use distinct registers.
+ if (FirstReg == SecondReg)
+ return false;
+ BaseReg = Op0->getOperand(1).getReg();
+ Pred = getInstrPredicate(*Op0, PredReg);
+ dl = Op0->getDebugLoc();
+ return true;
+}
+
+/// Try to move the loads (\p isLd) or stores in \p Ops, which all use base
+/// register \p Base, next to each other so that a later pass can merge them.
+/// A pair that qualifies (see CanFormLdStDWord) is turned into a single
+/// LDRD / STRD right away.
+///
+/// \param MBB       block containing all instructions in \p Ops.
+/// \param Ops       candidate instructions; sorted here and consumed from the
+///                  back as groups are processed.
+/// \param Base      base register shared by all of \p Ops.
+/// \param isLd      true for loads, false for stores.
+/// \param MI2LocMap position of each instruction within the block.
+/// \returns true if any instruction was moved or combined.
+bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
+                                 SmallVectorImpl<MachineInstr *> &Ops,
+                                 unsigned Base, bool isLd,
+                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
+  bool RetVal = false;
+
+  // Sort by offset (in reverse order).
+  std::sort(Ops.begin(), Ops.end(),
+            [](const MachineInstr *LHS, const MachineInstr *RHS) {
+              int LOffset = getMemoryOpOffset(*LHS);
+              int ROffset = getMemoryOpOffset(*RHS);
+              assert(LHS == RHS || LOffset != ROffset);
+              return LOffset > ROffset;
+            });
+
+  // The loads / stores of the same base are in order. Scan them from first to
+  // last and check for the following:
+  // 1. Any def of base.
+  // 2. Any gaps.
+  while (Ops.size() > 1) {
+    unsigned FirstLoc = ~0U;
+    unsigned LastLoc = 0;
+    MachineInstr *FirstOp = nullptr;
+    MachineInstr *LastOp = nullptr;
+    int LastOffset = 0;
+    unsigned LastOpcode = 0;
+    unsigned LastBytes = 0;
+    unsigned NumMove = 0;
+    // Walk from the back of Ops (lowest offset) and collect the run of ops
+    // that have the same kind and contiguous offsets.
+    for (int i = Ops.size() - 1; i >= 0; --i) {
+      MachineInstr *Op = Ops[i];
+
+      // Make sure each operation has the same kind.
+      unsigned LSMOpcode
+        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
+      if (LastOpcode && LSMOpcode != LastOpcode)
+        break;
+
+      // Check that we have a continuous set of offsets.
+      int Offset = getMemoryOpOffset(*Op);
+      unsigned Bytes = getLSMultipleTransferSize(Op);
+      if (LastBytes) {
+        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
+          break;
+      }
+
+      // The op is part of the run. Only now record its location, so that an
+      // op rejected above can never widen [FirstLoc, LastLoc] or become
+      // FirstOp / LastOp.
+      unsigned Loc = MI2LocMap[Op];
+      if (Loc <= FirstLoc) {
+        FirstLoc = Loc;
+        FirstOp = Op;
+      }
+      if (Loc >= LastLoc) {
+        LastLoc = Loc;
+        LastOp = Op;
+      }
+
+      LastOffset = Offset;
+      LastBytes = Bytes;
+      LastOpcode = LSMOpcode;
+      if (++NumMove == 8) // FIXME: Tune this limit.
+        break;
+    }
+
+    if (NumMove <= 1)
+      Ops.pop_back();
+    else {
+      // The run consists of the LAST NumMove entries of Ops (the ones the
+      // scan above accepted and the ones popped below), so those are the
+      // instructions / registers to record.
+      SmallPtrSet<MachineInstr*, 4> MemOps;
+      SmallSet<unsigned, 4> MemRegs;
+      for (unsigned i = Ops.size() - NumMove, e = Ops.size(); i != e; ++i) {
+        MemOps.insert(Ops[i]);
+        MemRegs.insert(Ops[i]->getOperand(0).getReg());
+      }
+
+      // Be conservative, if the instructions are too far apart, don't
+      // move them. We want to limit the increase of register pressure.
+      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
+      if (DoMove)
+        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
+                                           MemOps, MemRegs, TRI);
+      if (!DoMove) {
+        for (unsigned i = 0; i != NumMove; ++i)
+          Ops.pop_back();
+      } else {
+        // This is the new location for the loads / stores.
+        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
+        while (InsertPos != MBB->end() &&
+               (MemOps.count(&*InsertPos) || InsertPos->isDebugValue()))
+          ++InsertPos;
+
+        // If we are moving a pair of loads / stores, see if it makes sense
+        // to try to allocate a pair of registers that can form register pairs.
+        MachineInstr *Op0 = Ops.back();
+        MachineInstr *Op1 = Ops[Ops.size()-2];
+        unsigned FirstReg = 0, SecondReg = 0;
+        unsigned BaseReg = 0, PredReg = 0;
+        ARMCC::CondCodes Pred = ARMCC::AL;
+        bool isT2 = false;
+        unsigned NewOpc = 0;
+        int Offset = 0;
+        DebugLoc dl;
+        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
+                                             FirstReg, SecondReg, BaseReg,
+                                             Offset, PredReg, Pred, isT2)) {
+          Ops.pop_back();
+          Ops.pop_back();
+
+          const MCInstrDesc &MCID = TII->get(NewOpc);
+          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+          MRI->constrainRegClass(FirstReg, TRC);
+          MRI->constrainRegClass(SecondReg, TRC);
+
+          // Form the pair instruction.
+          if (isLd) {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(FirstReg, RegState::Define)
+              .addReg(SecondReg, RegState::Define)
+              .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always be reg0 since we're transforming LDRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumLDRDFormed;
+          } else {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(FirstReg)
+              .addReg(SecondReg)
+              .addReg(BaseReg);
+            // FIXME: We're converting from STRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always be reg0 since we're transforming STRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumSTRDFormed;
+          }
+          MBB->erase(Op0);
+          MBB->erase(Op1);
+
+          if (!isT2) {
+            // Add register allocation hints to form register pairs.
+            MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg);
+            MRI->setRegAllocationHint(SecondReg,  ARMRI::RegPairOdd, FirstReg);
+          }
+        } else {
+          // No pair instruction: just move the ops next to each other.
+          for (unsigned i = 0; i != NumMove; ++i) {
+            MachineInstr *Op = Ops.back();
+            Ops.pop_back();
+            MBB->splice(InsertPos, MBB, Op);
+          }
+        }
+
+        NumLdStMoved += NumMove;
+        RetVal = true;
+      }
+    }
+  }
+
+  return RetVal;
+}
+
+/// Scan \p MBB and reschedule loads / stores that share a base register.
+///
+/// The block is processed in regions. A region ends at a call or terminator,
+/// or when a load (store) is seen whose base+offset combination was already
+/// recorded for a load (store) in the current region. Within a region, the
+/// unpredicated instructions accepted by isMemoryOp() are grouped by base
+/// register, separately for loads and stores, and every group with more than
+/// one entry is handed to RescheduleOps().
+///
+/// \returns true if RescheduleOps() reported a change.
+bool
+ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
+ bool RetVal = false;
+
+ // Position of each (non-debug) instruction within the block.
+ DenseMap<MachineInstr*, unsigned> MI2LocMap;
+ // Loads / stores of the current region, keyed by base register.
+ DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
+ DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
+ // Base registers in the order they were first seen, so the groups are
+ // visited in a fixed order rather than in DenseMap iteration order.
+ SmallVector<unsigned, 4> LdBases;
+ SmallVector<unsigned, 4> StBases;
+
+ unsigned Loc = 0;
+ MachineBasicBlock::iterator MBBI = MBB->begin();
+ MachineBasicBlock::iterator E = MBB->end();
+ while (MBBI != E) {
+ // Collect one region.
+ for (; MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ if (MI.isCall() || MI.isTerminator()) {
+ // Stop at barriers.
+ ++MBBI;
+ break;
+ }
+
+ if (!MI.isDebugValue())
+ MI2LocMap[&MI] = ++Loc;
+
+ if (!isMemoryOp(MI))
+ continue;
+ // Only unpredicated memory ops are considered.
+ unsigned PredReg = 0;
+ if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
+ continue;
+
+ int Opc = MI.getOpcode();
+ bool isLd = isLoadSingle(Opc);
+ unsigned Base = MI.getOperand(1).getReg();
+ int Offset = getMemoryOpOffset(MI);
+
+ bool StopHere = false;
+ if (isLd) {
+ DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+ Base2LdsMap.find(Base);
+ if (BI != Base2LdsMap.end()) {
+ // Known base: end the region if this offset was already loaded from.
+ for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+ if (Offset == getMemoryOpOffset(*BI->second[i])) {
+ StopHere = true;
+ break;
+ }
+ }
+ if (!StopHere)
+ BI->second.push_back(&MI);
+ } else {
+ Base2LdsMap[Base].push_back(&MI);
+ LdBases.push_back(Base);
+ }
+ } else {
+ DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+ Base2StsMap.find(Base);
+ if (BI != Base2StsMap.end()) {
+ // Known base: end the region if this offset was already stored to.
+ for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+ if (Offset == getMemoryOpOffset(*BI->second[i])) {
+ StopHere = true;
+ break;
+ }
+ }
+ if (!StopHere)
+ BI->second.push_back(&MI);
+ } else {
+ Base2StsMap[Base].push_back(&MI);
+ StBases.push_back(Base);
+ }
+ }
+
+ if (StopHere) {
+ // Found a duplicate (a base+offset combination that's seen earlier).
+ // Backtrack.
+ // MBBI is not advanced, so this instruction is visited again as the
+ // first instruction of the next region and gets its Loc then.
+ --Loc;
+ break;
+ }
+ }
+
+ // Re-schedule loads.
+ for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
+ unsigned Base = LdBases[i];
+ SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
+ if (Lds.size() > 1)
+ RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
+ }
+
+ // Re-schedule stores.
+ for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
+ unsigned Base = StBases[i];
+ SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
+ if (Sts.size() > 1)
+ RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
+ }
+
+ // More of the block is left: start the next region with empty groups.
+ if (MBBI != E) {
+ Base2LdsMap.clear();
+ Base2StsMap.clear();
+ LdBases.clear();
+ StBases.clear();
+ }
+ }
+
+ return RetVal;
+}
+
+
+/// Factory for the ARM load / store optimization passes.
+///
+/// \param PreAlloc selects the pre-register-allocation variant when true,
+/// otherwise the ARMLoadStoreOpt pass is created.
+/// \returns a newly allocated pass instance.
+FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
+  if (!PreAlloc)
+    return new ARMLoadStoreOpt();
+  return new ARMPreAllocLoadStoreOpt();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
new file mode 100644
index 000000000000..293a527b09e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -0,0 +1,263 @@
+//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower ARM MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAsmPrinter.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+
+/// Build the MC operand that refers to \p Symbol for machine operand \p MO.
+///
+/// The plain symbol reference is wrapped in a lower16 / upper16 expression
+/// when \p MO carries the MO_LO16 / MO_HI16 target flag, and the operand's
+/// offset (if non-zero, and \p MO is not a jump-table index) is added on top.
+MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
+                                      const MCSymbol *Symbol) {
+  // The bare symbol reference is the same for every flag, so it is created
+  // exactly once here and only wrapped below.
+  const MCExpr *Expr =
+      MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+  switch (MO.getTargetFlags() & ARMII::MO_OPTION_MASK) {
+  default:
+    llvm_unreachable("Unknown target flag on symbol operand");
+  case ARMII::MO_NO_FLAG:
+    break;
+  case ARMII::MO_LO16:
+    Expr = ARMMCExpr::createLower16(Expr, OutContext);
+    break;
+  case ARMII::MO_HI16:
+    Expr = ARMMCExpr::createUpper16(Expr, OutContext);
+    break;
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(MO.getOffset(),
+                                                          OutContext),
+                                   OutContext);
+  return MCOperand::createExpr(Expr);
+}
+
+/// Lower machine operand \p MO into \p MCOp.
+///
+/// \returns true if \p MCOp was filled in, false if the operand has no MC
+/// counterpart (implicit non-CPSR registers and register masks) and must be
+/// dropped.
+bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                 MCOperand &MCOp) {
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("unknown operand type");
+
+  case MachineOperand::MO_Register:
+    // Ignore all non-CPSR implicit register operands.
+    if (MO.isImplicit() && MO.getReg() != ARM::CPSR)
+      return false;
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    MCOp = MCOperand::createReg(MO.getReg());
+    return true;
+
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::createImm(MO.getImm());
+    return true;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::createExpr(
+        MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), OutContext));
+    return true;
+
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = GetSymbolRef(MO,
+                        GetARMGVSymbol(MO.getGlobal(), MO.getTargetFlags()));
+    return true;
+
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+    return true;
+
+  case MachineOperand::MO_JumpTableIndex:
+    MCOp = GetSymbolRef(MO, GetJTISymbol(MO.getIndex()));
+    return true;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    if (Subtarget->genExecuteOnly())
+      llvm_unreachable("execute-only should not generate constant pools");
+    MCOp = GetSymbolRef(MO, GetCPISymbol(MO.getIndex()));
+    return true;
+
+  case MachineOperand::MO_BlockAddress:
+    MCOp = GetSymbolRef(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
+    return true;
+
+  case MachineOperand::MO_FPImmediate: {
+    // The MC operand carries a double; convert the value to IEEE double,
+    // rounding toward zero.
+    APFloat FPVal = MO.getFPImm()->getValueAPF();
+    bool LosesInfo;
+    FPVal.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &LosesInfo);
+    MCOp = MCOperand::createFPImm(FPVal.convertToDouble());
+    return true;
+  }
+
+  case MachineOperand::MO_RegisterMask:
+    // Ignore call clobbers.
+    return false;
+  }
+}
+
+/// Lower \p MI into \p OutMI: copy the opcode and append every operand that
+/// \p AP can lower. For the opcodes listed below, immediate operands are
+/// replaced by their modified-immediate encoding when one exists.
+void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        ARMAsmPrinter &AP) {
+  const unsigned Opc = MI->getOpcode();
+  OutMI.setOpcode(Opc);
+
+  // In the MC layer, we keep modified immediates in their encoded form
+  bool EncodeImms;
+  switch (Opc) {
+  case ARM::MOVi:
+  case ARM::MVNi:
+  case ARM::CMPri:
+  case ARM::CMNri:
+  case ARM::TSTri:
+  case ARM::TEQri:
+  case ARM::MSRi:
+  case ARM::ADCri:
+  case ARM::ADDri:
+  case ARM::ADDSri:
+  case ARM::SBCri:
+  case ARM::SUBri:
+  case ARM::SUBSri:
+  case ARM::ANDri:
+  case ARM::ORRri:
+  case ARM::EORri:
+  case ARM::BICri:
+  case ARM::RSBri:
+  case ARM::RSBSri:
+  case ARM::RSCri:
+    EncodeImms = true;
+    break;
+  default:
+    EncodeImms = false;
+    break;
+  }
+
+  for (const MachineOperand &MO : MI->operands()) {
+    MCOperand MCOp;
+    // Operands without an MC counterpart are simply dropped.
+    if (!AP.lowerOperand(MO, MCOp))
+      continue;
+
+    if (EncodeImms && MCOp.isImm()) {
+      // getSOImmVal() yields -1 when the value has no encoding; the operand
+      // is left untouched in that case.
+      const int32_t Enc = ARM_AM::getSOImmVal(MCOp.getImm());
+      if (Enc != -1)
+        MCOp.setImm(Enc);
+    }
+    OutMI.addOperand(MCOp);
+  }
+}
+
+/// Emit an XRay sled of kind \p Kind for \p MI and record it. Reports an
+/// error on \p MI and emits nothing when the enclosing function is a Thumb
+/// function.
+void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
+  const auto *ParentFn = MI.getParent()->getParent();
+  if (ParentFn->getInfo<ARMFunctionInfo>()->isThumbFunction()) {
+    MI.emitError("An attempt to perform XRay instrumentation for a"
+                 " Thumb function (not supported). Detected when emitting a sled.");
+    return;
+  }
+
+  static const int8_t NoopsInSledCount = 6;
+  // The sled has the following layout:
+  //
+  //   .Lxray_sled_N:
+  //     ALIGN
+  //     B #20
+  //     ; 6 NOP instructions (24 bytes)
+  //   .tmpN
+  //
+  // The 24 bytes (6 instructions) of NOPs are needed because at runtime the
+  // full 28 bytes (7 instructions) get patched with this sequence:
+  //
+  //   PUSH{ r0, lr }
+  //   MOVW r0, #<lower 16 bits of function ID>
+  //   MOVT r0, #<higher 16 bits of function ID>
+  //   MOVW ip, #<lower 16 bits of address of __xray_FunctionEntry/Exit>
+  //   MOVT ip, #<higher 16 bits of address of __xray_FunctionEntry/Exit>
+  //   BLX ip
+  //   POP{ r0, lr }
+  //
+  OutStreamer->EmitCodeAlignment(4);
+  auto SledLabel = OutContext.createTempSymbol("xray_sled_", true);
+  OutStreamer->EmitLabel(SledLabel);
+  auto EndLabel = OutContext.createTempSymbol();
+
+  // "B #20" jumps over the next 24 bytes, because register pc is 8 bytes
+  // ahead of the jump instruction by the time the CPU executes it.
+  // Built by analogy to ARMAsmPrinter::emitPseudoExpansionLowering()
+  // |case ARM::B|. It is not clear why the trailing |addReg(0)| is needed.
+  EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::Bcc)
+                                   .addImm(20)
+                                   .addImm(ARMCC::AL)
+                                   .addReg(0));
+
+  MCInst Noop;
+  Subtarget->getInstrInfo()->getNoopForElfTarget(Noop);
+  for (int8_t Remaining = NoopsInSledCount; Remaining > 0; --Remaining)
+    OutStreamer->EmitInstruction(Noop, getSubtargetInfo());
+
+  OutStreamer->EmitLabel(EndLabel);
+  recordSled(SledLabel, MI, Kind);
+}
+
+/// Emit the XRay sled for a function-entry instrumentation point.
+void ARMAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) {
+  const SledKind Kind = SledKind::FUNCTION_ENTER;
+  EmitSled(MI, Kind);
+}
+
+/// Emit the XRay sled for a function-exit instrumentation point.
+void ARMAsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI) {
+  const SledKind Kind = SledKind::FUNCTION_EXIT;
+  EmitSled(MI, Kind);
+}
+
+/// Emit the XRay sled for a tail-call instrumentation point.
+void ARMAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) {
+  const SledKind Kind = SledKind::TAIL_CALL;
+  EmitSled(MI, Kind);
+}
+
+/// Emit one "xray_instr_map" entry per recorded sled and then clear the list
+/// of sleds. Does nothing when no sled was recorded.
+///
+/// Each entry is 16 bytes: sled address (4), function address (4), sled kind
+/// (1), always-instrument flag (1) and 6 bytes of zero padding.
+void ARMAsmPrinter::EmitXRayTable() {
+  if (Sleds.empty())
+    return;
+
+  // Pick the section that holds the map for the current object format.
+  MCSection *MapSection = nullptr;
+  if (Subtarget->isTargetELF())
+    MapSection = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+                                          ELF::SHF_ALLOC | ELF::SHF_GROUP |
+                                              ELF::SHF_MERGE,
+                                          0, CurrentFnSym->getName());
+  else if (Subtarget->isTargetMachO())
+    MapSection = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+                                            SectionKind::getReadOnlyWithRel());
+  else
+    llvm_unreachable("Unsupported target");
+
+  // Emit the entries into the map section, then return to where we were.
+  auto SavedSection = OutStreamer->getCurrentSectionOnly();
+  OutStreamer->SwitchSection(MapSection);
+  for (const auto &Entry : Sleds) {
+    OutStreamer->EmitSymbolValue(Entry.Sled, 4);
+    OutStreamer->EmitSymbolValue(CurrentFnSym, 4);
+
+    const uint8_t KindByte = static_cast<uint8_t>(Entry.Kind);
+    const char *KindPtr = reinterpret_cast<const char *>(&KindByte);
+    OutStreamer->EmitBytes(StringRef(KindPtr, 1));
+
+    const char *AlwaysPtr =
+        reinterpret_cast<const char *>(&Entry.AlwaysInstrument);
+    OutStreamer->EmitBytes(StringRef(AlwaysPtr, 1));
+
+    OutStreamer->EmitZeros(6);
+  }
+  OutStreamer->SwitchSection(SavedSection);
+
+  Sleds.clear();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..50d8f0941460
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -0,0 +1,24 @@
+//===-- ARMMachineFunctionInfo.cpp - ARM machine function info ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMachineFunctionInfo.h"
+
+using namespace llvm;
+
+// Out-of-line definition of the (intentionally empty) virtual anchor()
+// method, giving the class a virtual member defined in this file.
+void ARMFunctionInfo::anchor() {}
+
+/// Creates the ARM-specific info for \p MF. isThumb and hasThumb2 are taken
+/// from the function's ARMSubtarget; every other counter, offset and size
+/// starts at 0 and every other flag at false.
+ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
+ : isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()),
+ hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()),
+ StByValParamsPadding(0), ArgRegsSaveSize(0), ReturnRegsCount(0),
+ HasStackFrame(false), RestoreSPFromFP(false), LRSpilledForFarJump(false),
+ FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+ GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), PICLabelUId(0),
+ VarArgsFrameIndex(0), HasITBlocks(false), ArgumentStackSize(0),
+ IsSplitCSR(false), PromotedGlobalsIncrease(0) {}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
new file mode 100644
index 000000000000..8c485e89bf54
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -0,0 +1,255 @@
+//===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
+
+#include "ARMSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+
+ /// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+ /// contains private ARM-specific information for each MachineFunction.
+ class ARMFunctionInfo : public MachineFunctionInfo {
+   virtual void anchor();
+
+   /// isThumb - True if this function is compiled under Thumb mode.
+   /// Used to initialize Align, so must precede it.
+   bool isThumb;
+
+   /// hasThumb2 - True if the target architecture supports Thumb2. Do not use
+   /// to determine if function is compiled under Thumb mode, for that use
+   /// 'isThumb'.
+   bool hasThumb2;
+
+   /// StByValParamsPadding - For parameter that is split between
+   /// GPRs and memory; while recovering GPRs part, when
+   /// StackAlignment > 4, and GPRs-part-size mod StackAlignment != 0,
+   /// we need to insert gap before parameter start address. It allows to
+   /// "attach" GPR-part to the part that was passed via stack.
+   unsigned StByValParamsPadding;
+
+   /// ArgRegsSaveSize - Size of the register save area for vararg functions.
+   ///
+   unsigned ArgRegsSaveSize;
+
+   /// ReturnRegsCount - Number of registers used up in the return.
+   unsigned ReturnRegsCount;
+
+   /// HasStackFrame - True if this function has a stack frame. Set by
+   /// determineCalleeSaves().
+   bool HasStackFrame;
+
+   /// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by
+   /// emitPrologue.
+   bool RestoreSPFromFP;
+
+   /// LRSpilledForFarJump - True if the LR register has been spilled to
+   /// enable far jump.
+   bool LRSpilledForFarJump;
+
+   /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
+   /// spill stack offset.
+   unsigned FramePtrSpillOffset;
+
+   /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
+   /// register spills areas. For Mac OS X:
+   ///
+   /// GPR callee-saved (1) : r4, r5, r6, r7, lr
+   /// --------------------------------------------
+   /// GPR callee-saved (2) : r8, r10, r11
+   /// --------------------------------------------
+   /// DPR callee-saved : d8 - d15
+   ///
+   /// Also see AlignedDPRCSRegs below. Not all D-regs need to go in area 3.
+   /// Some may be spilled after the stack has been realigned.
+   unsigned GPRCS1Offset;
+   unsigned GPRCS2Offset;
+   unsigned DPRCSOffset;
+
+   /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
+   /// areas.
+   /// NOTE(review): DPRCSAlignGapSize is presumably the size of an alignment gap
+   /// associated with the DPR area - confirm against the frame lowering code.
+   unsigned GPRCS1Size;
+   unsigned GPRCS2Size;
+   unsigned DPRCSAlignGapSize;
+   unsigned DPRCSSize;
+
+   /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
+   /// the aligned portion of the stack frame. This is always a contiguous
+   /// sequence of D-registers starting from d8.
+   ///
+   /// We do not keep track of the frame indices used for these registers - they
+   /// behave like any other frame index in the aligned stack frame. These
+   /// registers also aren't included in DPRCSSize above.
+   unsigned NumAlignedDPRCS2Regs;
+
+   /// PICLabelUId - Counter handed out (and post-incremented) by
+   /// createPICLabelUId().
+   unsigned PICLabelUId;
+
+   /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+   int VarArgsFrameIndex;
+
+   /// HasITBlocks - True if IT blocks have been inserted.
+   bool HasITBlocks;
+
+   /// CPEClones - Track constant pool entries clones created by Constant Island
+   /// pass. Keyed by clone index, mapping to the original index.
+   DenseMap<unsigned, unsigned> CPEClones;
+
+   /// ArgumentStackSize - amount of bytes on stack consumed by the arguments
+   /// being passed on the stack
+   unsigned ArgumentStackSize;
+
+   /// CoalescedWeights - mapping of basic blocks to the rolling counter of
+   /// coalesced weights.
+   DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
+
+   /// True if this function has a subset of CSRs that is handled explicitly via
+   /// copies.
+   bool IsSplitCSR;
+
+   /// Globals that have had their storage promoted into the constant pool.
+   SmallPtrSet<const GlobalVariable*,2> PromotedGlobals;
+
+   /// The amount the literal pool has been increased by due to promoted globals.
+   int PromotedGlobalsIncrease;
+
+ public:
+   /// Default constructor: non-Thumb, with every scalar member set to
+   /// zero/false. StByValParamsPadding and ArgumentStackSize are initialized
+   /// here too, so that their getters never return an indeterminate value.
+   /// Initializers follow the member declaration order.
+   ARMFunctionInfo() :
+     isThumb(false),
+     hasThumb2(false),
+     StByValParamsPadding(0),
+     ArgRegsSaveSize(0), ReturnRegsCount(0), HasStackFrame(false),
+     RestoreSPFromFP(false),
+     LRSpilledForFarJump(false),
+     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+     GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0),
+     NumAlignedDPRCS2Regs(0), PICLabelUId(0),
+     VarArgsFrameIndex(0), HasITBlocks(false), ArgumentStackSize(0),
+     IsSplitCSR(false), PromotedGlobalsIncrease(0) {}
+
+   explicit ARMFunctionInfo(MachineFunction &MF);
+
+   bool isThumbFunction() const { return isThumb; }
+   bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
+   bool isThumb2Function() const { return isThumb && hasThumb2; }
+
+   unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; }
+   void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; }
+
+   unsigned getArgRegsSaveSize() const { return ArgRegsSaveSize; }
+   void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; }
+
+   unsigned getReturnRegsCount() const { return ReturnRegsCount; }
+   void setReturnRegsCount(unsigned s) { ReturnRegsCount = s; }
+
+   bool hasStackFrame() const { return HasStackFrame; }
+   void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+   bool shouldRestoreSPFromFP() const { return RestoreSPFromFP; }
+   void setShouldRestoreSPFromFP(bool s) { RestoreSPFromFP = s; }
+
+   bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
+   void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
+
+   unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
+   void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
+
+   unsigned getNumAlignedDPRCS2Regs() const { return NumAlignedDPRCS2Regs; }
+   void setNumAlignedDPRCS2Regs(unsigned n) { NumAlignedDPRCS2Regs = n; }
+
+   unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
+   unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
+   unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; }
+
+   void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
+   void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
+   void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; }
+
+   unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
+   unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+   unsigned getDPRCalleeSavedGapSize() const { return DPRCSAlignGapSize; }
+   unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; }
+
+   void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
+   void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+   void setDPRCalleeSavedGapSize(unsigned s) { DPRCSAlignGapSize = s; }
+   void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; }
+
+   unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+   void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+   /// Reset the PIC label counter to \p UId.
+   void initPICLabelUId(unsigned UId) {
+     PICLabelUId = UId;
+   }
+
+   /// Current value of the PIC label counter.
+   unsigned getNumPICLabels() const {
+     return PICLabelUId;
+   }
+
+   /// Return the current PIC label id and advance the counter.
+   unsigned createPICLabelUId() {
+     return PICLabelUId++;
+   }
+
+   int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+   void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+   bool hasITBlocks() const { return HasITBlocks; }
+   void setHasITBlocks(bool h) { HasITBlocks = h; }
+
+   bool isSplitCSR() const { return IsSplitCSR; }
+   void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+   /// Record that constant pool entry \p CPCloneIdx is a clone of \p CPIdx.
+   /// Recording the same clone index twice hits llvm_unreachable.
+   void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) {
+     if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second)
+       llvm_unreachable("Duplicate entries!");
+   }
+
+   /// Return the original constant pool index recorded for \p CloneIdx, or -1U
+   /// if \p CloneIdx was never recorded as a clone.
+   unsigned getOriginalCPIdx(unsigned CloneIdx) const {
+     DenseMap<unsigned, unsigned>::const_iterator I = CPEClones.find(CloneIdx);
+     if (I != CPEClones.end())
+       return I->second;
+     else
+       return -1U;
+   }
+
+   /// Return an iterator to the coalesced-weight entry for \p MBB, inserting a
+   /// zero-valued entry first if none exists yet.
+   DenseMap<const MachineBasicBlock*, unsigned>::iterator getCoalescedWeight(
+       MachineBasicBlock* MBB) {
+     auto It = CoalescedWeights.find(MBB);
+     if (It == CoalescedWeights.end()) {
+       It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first;
+     }
+     return It;
+   }
+
+   /// Indicate to the backend that \c GV has had its storage changed to inside
+   /// a constant pool. This means it no longer needs to be emitted as a
+   /// global variable.
+   void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV) {
+     PromotedGlobals.insert(GV);
+   }
+   SmallPtrSet<const GlobalVariable*, 2>& getGlobalsPromotedToConstantPool() {
+     return PromotedGlobals;
+   }
+   int getPromotedConstpoolIncrease() const {
+     return PromotedGlobalsIncrease;
+   }
+   void setPromotedConstpoolIncrease(int Sz) {
+     PromotedGlobalsIncrease = Sz;
+   }
+ };
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
new file mode 100644
index 000000000000..581d5fe159fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -0,0 +1,105 @@
+ //===-- ARMOptimizeBarriersPass - two DMBs without a memory access in between,
+ // remove one -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===------------------------------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "double barriers"
+
+STATISTIC(NumDMBsRemoved, "Number of DMBs removed");
+
+ namespace {
+ /// Machine function pass declared for this file; its work is done in
+ /// runOnMachineFunction, and it requires that the function has no virtual
+ /// registers left (NoVRegs property).
+ class ARMOptimizeBarriersPass : public MachineFunctionPass {
+ public:
+   static char ID;
+
+   ARMOptimizeBarriersPass() : MachineFunctionPass(ID) {}
+
+   /// Human-readable pass name.
+   StringRef getPassName() const override { return "optimise barriers pass"; }
+
+   /// The pass requires the NoVRegs machine function property.
+   MachineFunctionProperties getRequiredProperties() const override {
+     MachineFunctionProperties Required;
+     Required.set(MachineFunctionProperties::Property::NoVRegs);
+     return Required;
+   }
+
+   bool runOnMachineFunction(MachineFunction &Fn) override;
+ };
+
+ char ARMOptimizeBarriersPass::ID = 0;
+ } // end anonymous namespace
+
+ // Decides whether MI may safely be moved across a DMB instruction.
+ // The check is conservative: MI qualifies if and only if it may neither load
+ // nor store, has no unmodeled side effects, and is neither a call nor a
+ // return.
+ static bool CanMovePastDMB(const MachineInstr *MI) {
+   if (MI->mayLoad() || MI->mayStore())
+     return false;
+   if (MI->hasUnmodeledSideEffects())
+     return false;
+   return !MI->isCall() && !MI->isReturn();
+ }
+
+ /// Scan every basic block of \p MF for a DMB that follows an earlier DMB of
+ /// the same type with no intervening instruction that blocks reordering (see
+ /// CanMovePastDMB), and erase each such redundant second DMB.
+ ///
+ /// \returns true if at least one DMB was erased from \p MF, false otherwise
+ /// (including when the function is skipped).
+ bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
+   if (skipFunction(*MF.getFunction()))
+     return false;
+
+   // Vector to store the DMBs we will remove after the first iteration
+   std::vector<MachineInstr *> ToRemove;
+   // DMBType is the Imm value of the first operand. It determines whether it's a
+   // DMB ish, dmb sy, dmb osh, etc
+   int64_t DMBType = -1;
+
+   // Find a dmb. If we can move it until the next dmb, tag the second one for
+   // removal
+   for (auto &MBB : MF) {
+     // Will be true when we have seen a DMB, and not seen any instruction since
+     // that cannot move past a DMB
+     bool IsRemovableNextDMB = false;
+     for (auto &MI : MBB) {
+       if (MI.getOpcode() == ARM::DMB) {
+         if (IsRemovableNextDMB) {
+           // If the Imm of this DMB is the same as that of the last DMB, we can
+           // tag this second DMB for removal
+           if (MI.getOperand(0).getImm() == DMBType) {
+             ToRemove.push_back(&MI);
+           } else {
+             // If it has a different DMBType, we cannot remove it, but will scan
+             // for the next DMB, recording this DMB's type as last seen DMB type
+             DMBType = MI.getOperand(0).getImm();
+           }
+         } else {
+           // After we see a DMB, a next one is removable
+           IsRemovableNextDMB = true;
+           DMBType = MI.getOperand(0).getImm();
+         }
+       } else if (!CanMovePastDMB(&MI)) {
+         // If we find an instruction unable to pass past a DMB, a next DMB is
+         // not removable
+         IsRemovableNextDMB = false;
+       }
+     }
+   }
+
+   // Whether *this* function is modified. NumDMBsRemoved is a file-scope
+   // statistic that keeps accumulating across every function the pass runs on,
+   // so it must not be used to decide the return value.
+   const bool Changed = !ToRemove.empty();
+
+   // Remove the tagged DMBs
+   for (MachineInstr *MI : ToRemove) {
+     MI->eraseFromParent();
+     ++NumDMBsRemoved;
+   }
+
+   return Changed;
+ }
+
+ /// createARMOptimizeBarriersPass - Allocates a new ARMOptimizeBarriersPass
+ /// (the remove-double-barriers pass) and returns it as a FunctionPass pointer.
+ FunctionPass *llvm::createARMOptimizeBarriersPass() {
+   FunctionPass *Pass = new ARMOptimizeBarriersPass();
+   return Pass;
+ }
diff --git a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
new file mode 100644
index 000000000000..3ff0bee7e5bf
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
@@ -0,0 +1,6591 @@
+//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using neon instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMPERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_ARM_ARMPERFECTSHUFFLE_H
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+ 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+ 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+ 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+ 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+ 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+ 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+ 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+ 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+ 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+ 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+ 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+ 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+ 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+ 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+ 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+ 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+ 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+ 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+ 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+ 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+ 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+ 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+ 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+ 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+ 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+ 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+ 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+ 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+ 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+ 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+ 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+ 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+ 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+ 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+ 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+ 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+ 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+ 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+ 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+ 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+ 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+ 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+ 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+ 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+ 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+ 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+ 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+ 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+ 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+ 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+ 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+ 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+ 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+ 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+ 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+ 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+ 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+ 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+ 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+ 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+ 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+ 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+ 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+ 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+ 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+ 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+ 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+ 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+ 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+ 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+ 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+ 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+ 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+ 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+ 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+ 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+ 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+ 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+ 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+ 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+ 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+ 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+ 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+ 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+ 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+ 835584U, // <0,1,2,3>: Cost 0 copy LHS
+ 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+ 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+ 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+ 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+ 835584U, // <0,1,2,u>: Cost 0 copy LHS
+ 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+ 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+ 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+ 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+ 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+ 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+ 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+ 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+ 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+ 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+ 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+ 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+ 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+ 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+ 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+ 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+ 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+ 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+ 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+ 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+ 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+ 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+ 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+ 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+ 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+ 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+ 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+ 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+ 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+ 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+ 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+ 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+ 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+ 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+ 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+ 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+ 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+ 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+ 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+ 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+ 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+ 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+ 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+ 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+ 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+ 835584U, // <0,1,u,3>: Cost 0 copy LHS
+ 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+ 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+ 835584U, // <0,1,u,u>: Cost 0 copy LHS
+ 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+ 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+ 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+ 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+ 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+ 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+ 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+ 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+ 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+ 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+ 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+ 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+ 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+ 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+ 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+ 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+ 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+ 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+ 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+ 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+ 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+ 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+ 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+ 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+ 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+ 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+ 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+ 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+ 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+ 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+ 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+ 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+ 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+ 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+ 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+ 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+ 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+ 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+ 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+ 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+ 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+ 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+ 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+ 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+ 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+ 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+ 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+ 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+ 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+ 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+ 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+ 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+ 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+ 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+ 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+ 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+ 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+ 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+ 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+ 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+ 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+ 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+ 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+ 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+ 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+ 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+ 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+ 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+ 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+ 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+ 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+ 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+ 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+ 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+ 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+ 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+ 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+ 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+ 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+ 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+ 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+ 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+ 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+ 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+ 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+ 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+ 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+ 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+ 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+ 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+ 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+ 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+ 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+ 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+ 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+ 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+ 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+ 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+ 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+ 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+ 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+ 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+ 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+ 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+ 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+ 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+ 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+ 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+ 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+ 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+ 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+ 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+ 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+ 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+ 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+ 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+ 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+ 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+ 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+ 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+ 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+ 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+ 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+ 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+ 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+ 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+ 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+ 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+ 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+ 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+ 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+ 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+ 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+ 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+ 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+ 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+ 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+ 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+ 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+ 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+ 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+ 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+ 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+ 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+ 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+ 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+ 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+ 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+ 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+ 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+ 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+ 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+ 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+ 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+ 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+ 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+ 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+ 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+ 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+ 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+ 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+ 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+ 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+ 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+ 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+ 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+ 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+ 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+ 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+ 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+ 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+ 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+ 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+ 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+ 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+ 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+ 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+ 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+ 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+ 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+ 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+ 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+ 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+ 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+ 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+ 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+ 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+ 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+ 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+ 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+ 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+ 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+ 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+ 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+ 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+ 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+ 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+ 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+ 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+ 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+ 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+ 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+ 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+ 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+ 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+ 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+ 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+ 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+ 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+ 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+ 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+ 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+ 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+ 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+ 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+ 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+ 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+ 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+ 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+ 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+ 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+ 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+ 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+ 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+ 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+ 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+ 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+ 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+ 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+ 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+ 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+ 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+ 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+ 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+ 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+ 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+ 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+ 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+ 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+ 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+ 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+ 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+ 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+ 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+ 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+ 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+ 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+ 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+ 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+ 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+ 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+ 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+ 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+ 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+ 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+ 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+ 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+ 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+ 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+ 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+ 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+ 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+ 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+ 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+ 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+ 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+ 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+ 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+ 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+ 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+ 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+ 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+ 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+ 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+ 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+ 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+ 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+ 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+ 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+ 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+ 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+ 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+ 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+ 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+ 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+ 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+ 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+ 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+ 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+ 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+ 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+ 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+ 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+ 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+ 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+ 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+ 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+ 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+ 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+ 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+ 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+ 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+ 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+ 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+ 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+ 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+ 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+ 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+ 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+ 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+ 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+ 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+ 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+ 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+ 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+ 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+ 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+ 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+ 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+ 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+ 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+ 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+ 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+ 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+ 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+ 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+ 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+ 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+ 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+ 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+ 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+ 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+ 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+ 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+ 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+ 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+ 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+ 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+ 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+ 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+ 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+ 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+ 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+ 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+ 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+ 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+ 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+ 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+ 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+ 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+ 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+ 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+ 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+ 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+ 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+ 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+ 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+ 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+ 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+ 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+ 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+ 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+ 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+ 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+ 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+ 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+ 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+ 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+ 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+ 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+ 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+ 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+ 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+ 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+ 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+ 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+ 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+ 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+ 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+ 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+ 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+ 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+ 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+ 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+ 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+ 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+ 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+ 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+ 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+ 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+ 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+ 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+ 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+ 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+ 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+ 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+ 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+ 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+ 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+ 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+ 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+ 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+ 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+ 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+ 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+ 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+ 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+ 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+ 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+ 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+ 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+ 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+ 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+ 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+ 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+ 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+ 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+ 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+ 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+ 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+ 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+ 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+ 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+ 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+ 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+ 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+ 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+ 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+ 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+ 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+ 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+ 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+ 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+ 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+ 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+ 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+ 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+ 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+ 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+ 835584U, // <0,u,2,3>: Cost 0 copy LHS
+ 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+ 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+ 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+ 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+ 835584U, // <0,u,2,u>: Cost 0 copy LHS
+ 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+ 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+ 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+ 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+ 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+ 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+ 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+ 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+ 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+ 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+ 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+ 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+ 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+ 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+ 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+ 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+ 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+ 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+ 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+ 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+ 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+ 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+ 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+ 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+ 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+ 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+ 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+ 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+ 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+ 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+ 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+ 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+ 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+ 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+ 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+ 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+ 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+ 835584U, // <0,u,u,3>: Cost 0 copy LHS
+ 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+ 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+ 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+ 835584U, // <0,u,u,u>: Cost 0 copy LHS
+ 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+ 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+ 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+ 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+ 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+ 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+ 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+ 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+ 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+ 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+ 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+ 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+ 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+ 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+ 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+ 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+ 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+ 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+ 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+ 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+ 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+ 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+ 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+ 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+ 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+ 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+ 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+ 67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+ 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+ 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+ 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+ 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+ 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+ 68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+ 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+ 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+ 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+ 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+ 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+ 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+ 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+ 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+ 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+ 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+ 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+ 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+ 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+ 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+ 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+ 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+ 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+ 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+ 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+ 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+ 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+ 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+ 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+ 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+ 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+ 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+ 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+ 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+ 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+ 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+ 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+ 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+ 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+ 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+ 67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+ 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+ 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+ 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+ 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+ 68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+ 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+ 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+ 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+ 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+ 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+ 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+ 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+ 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+ 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+ 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+ 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+ 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+ 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+ 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+ 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+ 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+ 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+ 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+ 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+ 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+ 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+ 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+ 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+ 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+ 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+ 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+ 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+ 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+ 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+ 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+ 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+ 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+ 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+ 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+ 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+ 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+ 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+ 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+ 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+ 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+ 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+ 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+ 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+ 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+ 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+ 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+ 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+ 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+ 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+ 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+ 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+ 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+ 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+ 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+ 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+ 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+ 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+ 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+ 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+ 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+ 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+ 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+ 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+ 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+ 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+ 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+ 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+ 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+ 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+ 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+ 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+ 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+ 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+ 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+ 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+ 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+ 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+ 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+ 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+ 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+ 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+ 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+ 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+ 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+ 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+ 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+ 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+ 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+ 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+ 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+ 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+ 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+ 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+ 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+ 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+ 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+ 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+ 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+ 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+ 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+ 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+ 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+ 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+ 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+ 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+ 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+ 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+ 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+ 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+ 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+ 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+ 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+ 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+ 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+ 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+ 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+ 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+ 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+ 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+ 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+ 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+ 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+ 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+ 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+ 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+ 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+ 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+ 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+ 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+ 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+ 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+ 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+ 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+ 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+ 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+ 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+ 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+ 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+ 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+ 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+ 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+ 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+ 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+ 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+ 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+ 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+ 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+ 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+ 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+ 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+ 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+ 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+ 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+ 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+ 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+ 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+ 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+ 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+ 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+ 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+ 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+ 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+ 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+ 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+ 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+ 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+ 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+ 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+ 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+ 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+ 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+ 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+ 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+ 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+ 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+ 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+ 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+ 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+ 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+ 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+ 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+ 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+ 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+ 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+ 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+ 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+ 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+ 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+ 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+ 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+ 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+ 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+ 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+ 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+ 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+ 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+ 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+ 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+ 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+ 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+ 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+ 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+ 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+ 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+ 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+ 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+ 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+ 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+ 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+ 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+ 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+ 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+ 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+ 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+ 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+ 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+ 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+ 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+ 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+ 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+ 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+ 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+ 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+ 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+ 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+ 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+ 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+ 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+ 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+ 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+ 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+ 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+ 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+ 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+ 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+ 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+ 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+ 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+ 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+ 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+ 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+ 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+ 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+ 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+ 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+ 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+ 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+ 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+ 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+ 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+ 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+ 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+ 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+ 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+ 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+ 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+ 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+ 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+ 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+ 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+ 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+ 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+ 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+ 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+ 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+ 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+ 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+ 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+ 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+ 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+ 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+ 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+ 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+ 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+ 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+ 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+ 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+ 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+ 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+ 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+ 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+ 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+ 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+ 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+ 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+ 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+ 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+ 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+ 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+ 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+ 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+ 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+ 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+ 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+ 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+ 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+ 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+ 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+ 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+ 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+ 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+ 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+ 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+ 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+ 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+ 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+ 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+ 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+ 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+ 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+ 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+ 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+ 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+ 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+ 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+ 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+ 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+ 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+ 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+ 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+ 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+ 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+ 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+ 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+ 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+ 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+ 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+ 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+ 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+ 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+ 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+ 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+ 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+ 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+ 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+ 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+ 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+ 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+ 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+ 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+ 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+ 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+ 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+ 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+ 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+ 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+ 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+ 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+ 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+ 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+ 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+ 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+ 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+ 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+ 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+ 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+ 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+ 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+ 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+ 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+ 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+ 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+ 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+ 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+ 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+ 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+ 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+ 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+ 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+ 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+ 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+ 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+ 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+ 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+ 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+ 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+ 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+ 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+ 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+ 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+ 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+ 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+ 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+ 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+ 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+ 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+ 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+ 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+ 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+ 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+ 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+ 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+ 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+ 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+ 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+ 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+ 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+ 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+ 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+ 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+ 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+ 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+ 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+ 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+ 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+ 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+ 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+ 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+ 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+ 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+ 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+ 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+ 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+ 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+ 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+ 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+ 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+ 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+ 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+ 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+ 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+ 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+ 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+ 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+ 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+ 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+ 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+ 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+ 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+ 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+ 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+ 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+ 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+ 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+ 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+ 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+ 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+ 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+ 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+ 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+ 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+ 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+ 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+ 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+ 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+ 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+ 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+ 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+ 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+ 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+ 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+ 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+ 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+ 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+ 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+ 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+ 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+ 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+ 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+ 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+ 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+ 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+ 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+ 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+ 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+ 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+ 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+ 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+ 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+ 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+ 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+ 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+ 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+ 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+ 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+ 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+ 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+ 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+ 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+ 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+ 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+ 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+ 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+ 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+ 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+ 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+ 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+ 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+ 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+ 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+ 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+ 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+ 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+ 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+ 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+ 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+ 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+ 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+ 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+ 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+ 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+ 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+ 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+ 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+ 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+ 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+ 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+ 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+ 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+ 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+ 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+ 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+ 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+ 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+ 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+ 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+ 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+ 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+ 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+ 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+ 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+ 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+ 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+ 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+ 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+ 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+ 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+ 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+ 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+ 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+ 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+ 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+ 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+ 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+ 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+ 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+ 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+ 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+ 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+ 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+ 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+ 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+ 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+ 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+ 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+ 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+ 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+ 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+ 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+ 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+ 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+ 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+ 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+ 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+ 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+ 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+ 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+ 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+ 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+ 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+ 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+ 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+ 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+ 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+ 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+ 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+ 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+ 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+ 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+ 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+ 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+ 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+ 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+ 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+ 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+ 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+ 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+ 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+ 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+ 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+ 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+ 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+ 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+ 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+ 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+ 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+ 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+ 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+ 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+ 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+ 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+ 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+ 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+ 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+ 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+ 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+ 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+ 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+ 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+ 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+ 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+ 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+ 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+ 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+ 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+ 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+ 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+ 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+ 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+ 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+ 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+ 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+ 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+ 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+ 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+ 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+ 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+ 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+ 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+ 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+ 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+ 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+ 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+ 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+ 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+ 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+ 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+ 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+ 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+ 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+ 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+ 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+ 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+ 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+ 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+ 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+ 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+ 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+ 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+ 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+ 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+ 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+ 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+ 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+ 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+ 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+ 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+ 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+ 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+ 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+ 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+ 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+ 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+ 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+ 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+ 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+ 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+ 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+ 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+ 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+ 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+ 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+ 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+ 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+ 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+ 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+ 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+ 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+ 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+ 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+ 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+ 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+ 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+ 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+ 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+ 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+ 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+ 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+ 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+ 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+ 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+ 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+ 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+ 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+ 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+ 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+ 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+ 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+ 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+ 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+ 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+ 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+ 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+ 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+ 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+ 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+ 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+ 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+ 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+ 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+ 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+ 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+ 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+ 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+ 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+ 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+ 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+ 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+ 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+ 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+ 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+ 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+ 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+ 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+ 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+ 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+ 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+ 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+ 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+ 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+ 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+ 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+ 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+ 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+ 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+ 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+ 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+ 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+ 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+ 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+ 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+ 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+ 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+ 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+ 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+ 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+ 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+ 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+ 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+ 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+ 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+ 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+ 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+ 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+ 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+ 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+ 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+ 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+ 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+ 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+ 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+ 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+ 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+ 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+ 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+ 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+ 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+ 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+ 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+ 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+ 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+ 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+ 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+ 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+ 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+ 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+ 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+ 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+ 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+ 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+ 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+ 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+ 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+ 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+ 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+ 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+ 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+ 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+ 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+ 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+ 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+ 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+ 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+ 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+ 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+ 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+ 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+ 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+ 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+ 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+ 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+ 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+ 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+ 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+ 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+ 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+ 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+ 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+ 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+ 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+ 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+ 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+ 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+ 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+ 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+ 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+ 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+ 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+ 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+ 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+ 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+ 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+ 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+ 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+ 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+ 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+ 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+ 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+ 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+ 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+ 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+ 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+ 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+ 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+ 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+ 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+ 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+ 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+ 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+ 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+ 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+ 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+ 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+ 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+ 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+ 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+ 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+ 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+ 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+ 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+ 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+ 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+ 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+ 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+ 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+ 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+ 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+ 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+ 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+ 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+ 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+ 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+ 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+ 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+ 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+ 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+ 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+ 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+ 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+ 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+ 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+ 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+ 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+ 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+ 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+ 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+ 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+ 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+ 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+ 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+ 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+ 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+ 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+ 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+ 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+ 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+ 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+ 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+ 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+ 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+ 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+ 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+ 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+ 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+ 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+ 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+ 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+ 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+ 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+ 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+ 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+ 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+ 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+ 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+ 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+ 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+ 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+ 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+ 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+ 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+ 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+ 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+ 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+ 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+ 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+ 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+ 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+ 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+ 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+ 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+ 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+ 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+ 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+ 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+ 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+ 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+ 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+ 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+ 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+ 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+ 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+ 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+ 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+ 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+ 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+ 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+ 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+ 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+ 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+ 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+ 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+ 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+ 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+ 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+ 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+ 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+ 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+ 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+ 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+ 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+ 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+ 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+ 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+ 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+ 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+ 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+ 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+ 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+ 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+ 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+ 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+ 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+ 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+ 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+ 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+ 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+ 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+ 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+ 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+ 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+ 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+ 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+ 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+ 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+ 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+ 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+ 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+ 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+ 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+ 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+ 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+ 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+ 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+ 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+ 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+ 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+ 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+ 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+ 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+ 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+ 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+ 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+ 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+ 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+ 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+ 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+ 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+ 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+ 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+ 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+ 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+ 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+ 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+ 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+ 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+ 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+ 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+ 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+ 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+ 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+ 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+ 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+ 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+ 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+ 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+ 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+ 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+ 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+ 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+ 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+ 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+ 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+ 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+ 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+ 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+ 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+ 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+ 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+ 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+ 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+ 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+ 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+ 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+ 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+ 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+ 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+ 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+ 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+ 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+ 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+ 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+ 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+ 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+ 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+ 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+ 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+ 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+ 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+ 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+ 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+ 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+ 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+ 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+ 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+ 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+ 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+ 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+ 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+ 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+ 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+ 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+ 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+ 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+ 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+ 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+ 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+ 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+ 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+ 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+ 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+ 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+ 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+ 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+ 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+ 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+ 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+ 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+ 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+ 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+ 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+ 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+ 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+ 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+ 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+ 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+ 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+ 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+ 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+ 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+ 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+ 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+ 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+ 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+ 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+ 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+ 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+ 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+ 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+ 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+ 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+ 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+ 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+ 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+ 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+ 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+ 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+ 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+ 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+ 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+ 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+ 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+ 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+ 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+ 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+ 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+ 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+ 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+ 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+ 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+ 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+ 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+ 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+ 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+ 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+ 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+ 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+ 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+ 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+ 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+ 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+ 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+ 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+ 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+ 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+ 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+ 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+ 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+ 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+ 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+ 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+ 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+ 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+ 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+ 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+ 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+ 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+ 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+ 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+ 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+ 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+ 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+ 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+ 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+ 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+ 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+ 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+ 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+ 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+ 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+ 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+ 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+ 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+ 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+ 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+ 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+ 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+ 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+ 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+ 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+ 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+ 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+ 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+ 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+ 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+ 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+ 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+ 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+ 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+ 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+ 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+ 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+ 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+ 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+ 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+ 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+ 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+ 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+ 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+ 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+ 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+ 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+ 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+ 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+ 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+ 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+ 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+ 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+ 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+ 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+ 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+ 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+ 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+ 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+ 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+ 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+ 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+ 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+ 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+ 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+ 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+ 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+ 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+ 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+ 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+ 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+ 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+ 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+ 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+ 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+ 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+ 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+ 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+ 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+ 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+ 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+ 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+ 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+ 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+ 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+ 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+ 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+ 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+ 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+ 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+ 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+ 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+ 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+ 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+ 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+ 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+ 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+ 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+ 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+ 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+ 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+ 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+ 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+ 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+ 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+ 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+ 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+ 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+ 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+ 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+ 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+ 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+ 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+ 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+ 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+ 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+ 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+ 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+ 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+ 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+ 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+ 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+ 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+ 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+ 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+ 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+ 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+ 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+ 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+ 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+ 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+ 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+ 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+ 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+ 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+ 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+ 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+ 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+ 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+ 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+ 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+ 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+ 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+ 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+ 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+ 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+ 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+ 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+ 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+ 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+ 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+ 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+ 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+ 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+ 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+ 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+ 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+ 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+ 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+ 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+ 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+ 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+ 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+ 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+ 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+ 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+ 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+ 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+ 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+ 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+ 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+ 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+ 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+ 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+ 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+ 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+ 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+ 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+ 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+ 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+ 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+ 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+ 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+ 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+ 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+ 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+ 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+ 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+ 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+ 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+ 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+ 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+ 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+ 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+ 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+ 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+ 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+ 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+ 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+ 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+ 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+ 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+ 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+ 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+ 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+ 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+ 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+ 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+ 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+ 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+ 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+ 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+ 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+ 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+ 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+ 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+ 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+ 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+ 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+ 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+ 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+ 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+ 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+ 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+ 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+ 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+ 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+ 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+ 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+ 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+ 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+ 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+ 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+ 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+ 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+ 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+ 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+ 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+ 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+ 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+ 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+ 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+ 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+ 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+ 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+ 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+ 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+ 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+ 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+ 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+ 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+ 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+ 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+ 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+ 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+ 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+ 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+ 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+ 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+ 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+ 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+ 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+ 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+ 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+ 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+ 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+ 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+ 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+ 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+ 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+ 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+ 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+ 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+ 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+ 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+ 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+ 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+ 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+ 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+ 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+ 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+ 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+ 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+ 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+ 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+ 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+ 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+ 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+ 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+ 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+ 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+ 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+ 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+ 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+ 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+ 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+ 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+ 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+ 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+ 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+ 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+ 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+ 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+ 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+ 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+ 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+ 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+ 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+ 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+ 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+ 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+ 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+ 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+ 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+ 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+ 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+ 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+ 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+ 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+ 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+ 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+ 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+ 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+ 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+ 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+ 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+ 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+ 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+ 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+ 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+ 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+ 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+ 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+ 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+ 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+ 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+ 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+ 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+ 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+ 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+ 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+ 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+ 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+ 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+ 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+ 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+ 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+ 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+ 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+ 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+ 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+ 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+ 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+ 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+ 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+ 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+ 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+ 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+ 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+ 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+ 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+ 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+ 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+ 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+ 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+ 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+ 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+ 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+ 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+ 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+ 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+ 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+ 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+ 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+ 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+ 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+ 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+ 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+ 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+ 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+ 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+ 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+ 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+ 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+ 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+ 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+ 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+ 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+ 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+ 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+ 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+ 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+ 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+ 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+ 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+ 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+ 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+ 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+ 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+ 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+ 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+ 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+ 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+ 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+ 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+ 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+ 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+ 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+ 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+ 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+ 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+ 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+ 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+ 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+ 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+ 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+ 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+ 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+ 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+ 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+ 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+ 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+ 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+ 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+ 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+ 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+ 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+ 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+ 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+ 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+ 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+ 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+ 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+ 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+ 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+ 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+ 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+ 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+ 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+ 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+ 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+ 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+ 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+ 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+ 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+ 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+ 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+ 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+ 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+ 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+ 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+ 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+ 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+ 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+ 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+ 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+ 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+ 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+ 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+ 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+ 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+ 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+ 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+ 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+ 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+ 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+ 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+ 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+ 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+ 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+ 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+ 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+ 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+ 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+ 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+ 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+ 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+ 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+ 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+ 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+ 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+ 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+ 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+ 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+ 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+ 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+ 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+ 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+ 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+ 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+ 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+ 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+ 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+ 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+ 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+ 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+ 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+ 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+ 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+ 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+ 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+ 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+ 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+ 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+ 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+ 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+ 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+ 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+ 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+ 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+ 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+ 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+ 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+ 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+ 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+ 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+ 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+ 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+ 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+ 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+ 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+ 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+ 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+ 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+ 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+ 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+ 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+ 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+ 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+ 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+ 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+ 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+ 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+ 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+ 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+ 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+ 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+ 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+ 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+ 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+ 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+ 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+ 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+ 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+ 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+ 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+ 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+ 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+ 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+ 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+ 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+ 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+ 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+ 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+ 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+ 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+ 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+ 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+ 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+ 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+ 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+ 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+ 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+ 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+ 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+ 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+ 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+ 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+ 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+ 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+ 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+ 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+ 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+ 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+ 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+ 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+ 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+ 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+ 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+ 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+ 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+ 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+ 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+ 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+ 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+ 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+ 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+ 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+ 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+ 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+ 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+ 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+ 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+ 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+ 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+ 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+ 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+ 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+ 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+ 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+ 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+ 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+ 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+ 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+ 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+ 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+ 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+ 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+ 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+ 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+ 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+ 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+ 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+ 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+ 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+ 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+ 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+ 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+ 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+ 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+ 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+ 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+ 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+ 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+ 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+ 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+ 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+ 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+ 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+ 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+ 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+ 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+ 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+ 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+ 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+ 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+ 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+ 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+ 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+ 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+ 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+ 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+ 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+ 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+ 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+ 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+ 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+ 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+ 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+ 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+ 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+ 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+ 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+ 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+ 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+ 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+ 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+ 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+ 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+ 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+ 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+ 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+ 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+ 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+ 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+ 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+ 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+ 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+ 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+ 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+ 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+ 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+ 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+ 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+ 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+ 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+ 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+ 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+ 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+ 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+ 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+ 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+ 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+ 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+ 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+ 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+ 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+ 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+ 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+ 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+ 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+ 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+ 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+ 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+ 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+ 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+ 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+ 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+ 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+ 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+ 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+ 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+ 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+ 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+ 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+ 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+ 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+ 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+ 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+ 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+ 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+ 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+ 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+ 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+ 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+ 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+ 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+ 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+ 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+ 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+ 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+ 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+ 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+ 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+ 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+ 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+ 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+ 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+ 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+ 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+ 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+ 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+ 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+ 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+ 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+ 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+ 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+ 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+ 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+ 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+ 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+ 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+ 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+ 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+ 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+ 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+ 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+ 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+ 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+ 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+ 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+ 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+ 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+ 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+ 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+ 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+ 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+ 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+ 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+ 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+ 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+ 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+ 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+ 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+ 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+ 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+ 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+ 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+ 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+ 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+ 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+ 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+ 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+ 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+ 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+ 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+ 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+ 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+ 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+ 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+ 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+ 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+ 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+ 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+ 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+ 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+ 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+ 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+ 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+ 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+ 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+ 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+ 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+ 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+ 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+ 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+ 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+ 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+ 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+ 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+ 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+ 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+ 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+ 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+ 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+ 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+ 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+ 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+ 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+ 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+ 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+ 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+ 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+ 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+ 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+ 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+ 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+ 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+ 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+ 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+ 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+ 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+ 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+ 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+ 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+ 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+ 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+ 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+ 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+ 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+ 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+ 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+ 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+ 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+ 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+ 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+ 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+ 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+ 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+ 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+ 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+ 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+ 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+ 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+ 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+ 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+ 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+ 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+ 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+ 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+ 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+ 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+ 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+ 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+ 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+ 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+ 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+ 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+ 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+ 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+ 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+ 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+ 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+ 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+ 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+ 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+ 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+ 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+ 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+ 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+ 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+ 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+ 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+ 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+ 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+ 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+ 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+ 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+ 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+ 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+ 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+ 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+ 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+ 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+ 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+ 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+ 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+ 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+ 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+ 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+ 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+ 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+ 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+ 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+ 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+ 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+ 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+ 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+ 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+ 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+ 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+ 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+ 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+ 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+ 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+ 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+ 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+ 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+ 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+ 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+ 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+ 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+ 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+ 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+ 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+ 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+ 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+ 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+ 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+ 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+ 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+ 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+ 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+ 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+ 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+ 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+ 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+ 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+ 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+ 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+ 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+ 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+ 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+ 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+ 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+ 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+ 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+ 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+ 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+ 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+ 27705344U, // <4,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,6,u>: Cost 0 copy RHS
+ 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+ 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+ 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+ 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+ 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+ 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+ 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+ 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+ 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+ 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+ 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 27705344U, // <4,u,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,u,u>: Cost 0 copy RHS
+ 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+ 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+ 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+ 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+ 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+ 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+ 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+ 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+ 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+ 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+ 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+ 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+ 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+ 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+ 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+ 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+ 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+ 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+ 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+ 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+ 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+ 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+ 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+ 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+ 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+ 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+ 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+ 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+ 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+ 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+ 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+ 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+ 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+ 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+ 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+ 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+ 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+ 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+ 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+ 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+ 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+ 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+ 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+ 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+ 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+ 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+ 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+ 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+ 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+ 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+ 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+ 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+ 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+ 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+ 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+ 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+ 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+ 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+ 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+ 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+ 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+ 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+ 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+ 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+ 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+ 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+ 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+ 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+ 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+ 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+ 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+ 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+ 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+ 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+ 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+ 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+ 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+ 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+ 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+ 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+ 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+ 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+ 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+ 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+ 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+ 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+ 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+ 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+ 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+ 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+ 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+ 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+ 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+ 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+ 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+ 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+ 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+ 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+ 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+ 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+ 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+ 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+ 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+ 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+ 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+ 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+ 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+ 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+ 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+ 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+ 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+ 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+ 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+ 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+ 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+ 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+ 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+ 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+ 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+ 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+ 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+ 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+ 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+ 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+ 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+ 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+ 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+ 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+ 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+ 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+ 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+ 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+ 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+ 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+ 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+ 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+ 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+ 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+ 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+ 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+ 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+ 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+ 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+ 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+ 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+ 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+ 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+ 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+ 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+ 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+ 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+ 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+ 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+ 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+ 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+ 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+ 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+ 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+ 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+ 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+ 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+ 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+ 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+ 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+ 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+ 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+ 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+ 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+ 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+ 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+ 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+ 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+ 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+ 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+ 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+ 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+ 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+ 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+ 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+ 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+ 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+ 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+ 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+ 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+ 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+ 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+ 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+ 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+ 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+ 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+ 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+ 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+ 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+ 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+ 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+ 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+ 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+ 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+ 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+ 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+ 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+ 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+ 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+ 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+ 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+ 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+ 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+ 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+ 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+ 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+ 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+ 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+ 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+ 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+ 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+ 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+ 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+ 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+ 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+ 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+ 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+ 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+ 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+ 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+ 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+ 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+ 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+ 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+ 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+ 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+ 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+ 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+ 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+ 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+ 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+ 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+ 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+ 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+ 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+ 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+ 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+ 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+ 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+ 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+ 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+ 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+ 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+ 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+ 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+ 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+ 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+ 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+ 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+ 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+ 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+ 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+ 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+ 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+ 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+ 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+ 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+ 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+ 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+ 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+ 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+ 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+ 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+ 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+ 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+ 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+ 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+ 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+ 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+ 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+ 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+ 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+ 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+ 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+ 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+ 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+ 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+ 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+ 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+ 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+ 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+ 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+ 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+ 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+ 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+ 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+ 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+ 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+ 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+ 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+ 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+ 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+ 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+ 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+ 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+ 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+ 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+ 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+ 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+ 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+ 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+ 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+ 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+ 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+ 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+ 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+ 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+ 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+ 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+ 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+ 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+ 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+ 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+ 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+ 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+ 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+ 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+ 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+ 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+ 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+ 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+ 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+ 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+ 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+ 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+ 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+ 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+ 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+ 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+ 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+ 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+ 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+ 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+ 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+ 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+ 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+ 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+ 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+ 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+ 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+ 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+ 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+ 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+ 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+ 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+ 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+ 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+ 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+ 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+ 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+ 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+ 94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+ 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+ 94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+ 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+ 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+ 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+ 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+ 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+ 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+ 94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+ 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+ 94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+ 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+ 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+ 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+ 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+ 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+ 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+ 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+ 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+ 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+ 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+ 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+ 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+ 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+ 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+ 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+ 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+ 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+ 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+ 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+ 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+ 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+ 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+ 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+ 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+ 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+ 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+ 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+ 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+ 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+ 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+ 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+ 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+ 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+ 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+ 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+ 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+ 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+ 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+ 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+ 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+ 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+ 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+ 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+ 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+ 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+ 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+ 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+ 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+ 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+ 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+ 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+ 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+ 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+ 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+ 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+ 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+ 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+ 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+ 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+ 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+ 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+ 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+ 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+ 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+ 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+ 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+ 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+ 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+ 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+ 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+ 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+ 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+ 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+ 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+ 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+ 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+ 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+ 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+ 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+ 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+ 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+ 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+ 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+ 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+ 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+ 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+ 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+ 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+ 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+ 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+ 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+ 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+ 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+ 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+ 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+ 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+ 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+ 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+ 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+ 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+ 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+ 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+ 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+ 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+ 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+ 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+ 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+ 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+ 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+ 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+ 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+ 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+ 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+ 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+ 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+ 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+ 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+ 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+ 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+ 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+ 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+ 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+ 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+ 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+ 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+ 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+ 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+ 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+ 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+ 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+ 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+ 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+ 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+ 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+ 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+ 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+ 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+ 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+ 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+ 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+ 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+ 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+ 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+ 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+ 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+ 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+ 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+ 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+ 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+ 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+ 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+ 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+ 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+ 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+ 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+ 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+ 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+ 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+ 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+ 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+ 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+ 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+ 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+ 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+ 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+ 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+ 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+ 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+ 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+ 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+ 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+ 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+ 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+ 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+ 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+ 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+ 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+ 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+ 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+ 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+ 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+ 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+ 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+ 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+ 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+ 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+ 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+ 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+ 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+ 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+ 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+ 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+ 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+ 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+ 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+ 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+ 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+ 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+ 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+ 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+ 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+ 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+ 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+ 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+ 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+ 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+ 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+ 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+ 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+ 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+ 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+ 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+ 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+ 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+ 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+ 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+ 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+ 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+ 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+ 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+ 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+ 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+ 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+ 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+ 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+ 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+ 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+ 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+ 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+ 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+ 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+ 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+ 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+ 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+ 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+ 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+ 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+ 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+ 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+ 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+ 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+ 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+ 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+ 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+ 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+ 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+ 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+ 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+ 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+ 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+ 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+ 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+ 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+ 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+ 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+ 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+ 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+ 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+ 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+ 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+ 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+ 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+ 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+ 118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+ 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+ 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+ 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+ 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+ 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+ 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+ 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+ 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+ 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+ 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+ 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+ 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+ 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+ 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+ 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+ 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+ 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+ 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+ 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+ 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+ 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+ 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+ 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+ 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+ 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+ 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+ 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+ 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+ 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+ 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+ 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+ 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+ 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+ 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+ 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+ 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+ 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+ 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+ 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+ 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+ 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+ 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+ 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+ 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+ 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+ 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+ 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+ 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+ 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+ 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+ 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+ 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+ 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+ 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+ 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+ 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+ 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+ 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+ 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+ 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+ 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+ 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+ 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+ 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+ 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+ 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+ 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+ 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+ 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+ 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+ 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+ 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+ 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+ 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+ 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+ 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+ 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+ 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+ 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+ 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+ 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+ 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+ 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+ 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+ 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+ 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+ 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+ 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+ 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+ 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+ 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+ 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+ 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+ 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+ 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+ 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+ 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+ 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+ 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+ 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+ 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+ 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+ 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+ 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+ 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+ 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+ 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+ 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+ 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+ 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+ 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+ 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+ 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+ 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+ 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+ 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+ 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+ 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+ 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+ 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+ 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+ 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+ 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+ 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+ 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+ 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+ 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+ 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+ 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+ 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+ 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+ 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+ 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+ 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+ 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+ 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+ 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+ 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+ 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+ 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+ 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+ 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+ 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+ 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+ 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+ 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+ 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+ 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+ 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+ 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+ 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+ 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+ 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+ 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+ 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+ 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+ 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+ 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+ 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+ 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+ 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+ 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+ 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+ 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+ 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+ 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+ 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+ 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+ 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+ 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+ 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+ 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+ 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+ 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+ 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+ 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+ 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+ 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+ 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+ 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+ 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+ 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+ 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+ 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+ 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+ 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+ 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+ 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+ 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+ 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+ 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+ 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+ 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+ 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+ 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+ 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+ 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+ 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+ 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+ 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+ 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+ 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+ 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+ 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+ 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+ 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+ 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+ 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+ 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+ 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+ 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+ 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+ 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+ 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+ 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+ 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+ 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+ 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+ 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+ 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+ 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+ 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+ 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+ 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+ 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+ 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+ 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+ 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+ 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+ 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+ 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+ 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+ 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+ 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+ 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+ 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+ 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+ 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+ 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+ 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+ 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+ 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+ 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+ 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+ 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+ 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+ 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+ 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+ 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+ 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+ 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+ 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+ 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+ 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+ 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+ 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+ 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+ 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+ 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+ 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+ 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+ 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+ 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+ 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+ 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+ 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+ 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+ 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+ 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+ 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+ 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+ 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+ 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+ 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+ 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+ 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+ 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+ 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+ 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+ 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+ 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+ 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+ 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+ 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+ 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+ 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+ 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+ 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+ 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+ 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+ 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+ 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+ 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+ 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+ 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+ 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+ 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+ 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+ 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+ 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+ 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+ 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+ 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+ 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+ 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+ 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+ 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+ 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+ 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+ 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+ 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+ 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+ 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+ 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+ 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+ 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+ 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+ 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+ 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+ 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+ 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+ 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+ 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+ 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+ 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+ 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+ 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+ 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+ 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+ 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+ 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+ 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+ 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+ 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+ 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+ 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+ 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+ 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+ 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+ 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+ 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+ 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+ 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+ 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+ 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+ 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+ 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+ 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+ 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+ 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+ 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+ 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+ 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+ 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+ 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+ 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+ 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+ 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+ 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+ 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+ 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+ 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+ 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+ 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+ 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+ 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+ 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+ 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+ 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+ 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+ 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+ 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+ 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+ 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+ 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+ 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+ 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+ 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+ 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+ 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+ 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+ 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+ 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+ 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+ 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+ 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+ 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+ 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+ 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+ 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+ 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+ 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+ 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+ 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+ 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+ 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+ 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+ 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+ 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+ 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+ 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+ 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+ 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+ 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+ 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+ 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+ 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+ 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+ 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+ 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+ 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+ 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+ 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+ 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+ 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+ 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+ 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+ 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+ 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+ 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+ 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+ 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+ 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+ 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+ 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+ 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+ 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+ 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+ 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+ 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+ 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+ 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+ 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+ 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+ 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+ 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+ 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+ 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+ 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+ 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+ 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+ 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+ 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+ 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+ 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+ 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+ 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+ 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+ 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+ 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+ 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+ 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+ 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+ 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+ 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+ 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+ 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+ 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+ 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+ 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+ 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+ 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+ 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+ 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+ 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+ 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+ 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+ 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+ 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+ 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+ 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+ 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+ 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+ 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+ 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+ 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+ 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+ 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+ 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+ 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+ 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+ 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+ 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+ 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+ 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+ 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+ 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+ 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+ 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+ 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+ 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+ 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+ 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+ 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+ 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+ 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+ 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+ 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+ 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+ 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+ 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+ 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+ 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+ 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+ 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+ 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+ 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+ 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+ 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+ 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+ 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+ 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+ 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+ 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+ 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+ 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+ 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+ 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+ 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+ 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+ 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+ 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+ 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+ 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+ 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+ 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+ 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+ 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+ 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+ 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+ 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+ 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+ 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+ 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+ 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+ 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+ 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+ 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+ 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+ 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+ 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+ 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+ 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+ 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+ 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+ 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+ 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+ 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+ 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+ 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+ 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+ 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+ 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+ 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+ 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+ 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+ 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+ 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+ 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+ 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+ 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+ 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+ 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+ 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+ 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+ 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+ 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+ 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+ 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+ 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+ 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+ 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+ 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+ 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+ 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+ 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+ 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+ 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+ 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+ 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+ 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+ 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+ 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+ 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+ 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+ 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+ 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+ 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+ 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+ 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+ 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+ 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+ 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+ 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+ 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+ 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+ 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+ 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+ 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+ 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+ 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+ 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+ 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+ 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+ 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+ 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+ 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+ 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+ 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+ 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+ 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+ 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+ 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+ 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+ 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+ 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+ 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+ 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+ 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+ 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+ 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+ 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+ 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+ 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+ 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+ 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+ 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+ 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+ 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+ 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+ 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+ 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+ 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+ 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+ 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+ 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+ 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+ 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+ 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+ 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+ 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+ 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+ 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+ 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+ 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+ 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+ 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+ 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+ 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+ 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+ 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+ 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+ 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+ 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+ 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+ 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+ 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+ 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+ 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+ 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+ 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+ 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+ 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+ 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+ 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+ 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+ 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+ 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+ 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+ 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+ 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+ 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+ 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+ 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+ 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+ 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+ 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+ 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+ 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+ 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+ 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+ 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+ 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+ 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+ 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+ 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+ 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+ 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+ 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+ 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+ 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+ 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+ 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+ 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+ 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+ 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+ 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+ 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+ 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+ 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+ 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+ 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+ 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+ 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+ 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+ 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+ 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+ 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+ 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+ 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+ 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+ 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+ 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+ 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+ 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+ 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+ 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+ 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+ 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+ 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+ 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+ 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+ 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+ 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+ 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+ 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+ 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+ 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+ 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+ 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+ 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+ 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+ 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+ 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+ 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+ 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+ 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+ 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+ 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+ 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+ 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+ 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+ 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+ 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+ 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+ 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+ 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+ 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+ 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+ 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+ 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+ 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+ 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+ 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+ 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+ 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+ 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+ 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+ 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+ 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+ 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+ 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+ 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+ 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+ 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+ 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+ 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+ 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+ 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+ 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+ 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+ 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+ 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+ 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+ 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+ 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+ 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+ 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+ 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+ 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+ 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+ 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+ 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+ 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+ 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+ 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+ 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+ 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+ 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+ 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+ 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+ 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+ 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+ 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+ 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+ 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+ 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+ 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+ 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+ 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+ 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+ 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+ 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+ 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+ 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+ 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+ 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+ 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+ 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+ 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+ 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+ 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+ 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+ 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+ 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+ 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+ 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+ 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+ 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+ 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+ 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+ 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+ 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+ 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+ 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+ 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+ 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+ 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+ 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+ 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+ 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+ 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+ 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+ 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+ 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+ 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+ 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+ 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+ 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+ 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+ 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+ 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+ 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+ 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+ 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+ 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+ 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+ 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+ 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+ 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+ 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+ 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+ 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+ 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+ 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+ 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+ 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+ 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+ 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+ 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+ 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+ 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+ 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+ 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+ 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+ 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+ 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+ 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+ 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+ 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+ 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+ 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+ 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+ 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+ 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+ 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+ 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+ 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+ 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+ 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+ 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+ 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+ 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+ 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+ 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+ 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+ 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+ 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+ 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+ 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+ 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+ 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+ 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+ 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+ 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+ 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+ 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+ 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+ 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+ 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+ 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+ 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+ 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+ 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+ 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+ 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+ 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+ 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+ 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+ 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+ 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+ 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+ 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+ 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+ 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+ 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+ 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+ 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+ 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+ 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+ 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+ 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+ 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+ 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+ 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+ 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+ 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+ 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+ 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+ 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+ 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+ 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+ 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+ 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+ 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+ 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+ 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+ 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+ 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+ 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+ 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+ 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+ 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+ 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+ 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+ 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+ 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+ 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+ 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+ 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+ 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+ 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+ 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+ 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+ 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+ 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+ 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+ 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+ 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+ 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+ 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+ 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+ 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+ 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+ 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+ 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+ 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+ 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+ 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+ 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+ 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+ 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+ 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+ 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+ 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+ 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+ 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+ 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+ 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+ 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+ 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+ 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+ 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+ 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+ 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+ 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+ 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+ 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+ 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+ 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+ 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+ 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+ 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+ 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+ 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+ 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+ 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+ 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+ 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+ 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+ 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+ 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+ 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+ 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+ 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+ 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+ 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+ 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+ 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+ 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+ 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+ 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+ 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+ 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+ 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+ 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+ 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+ 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+ 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+ 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+ 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+ 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+ 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+ 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+ 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+ 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+ 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+ 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+ 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+ 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+ 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+ 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+ 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+ 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+ 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+ 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+ 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+ 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+ 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+ 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+ 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+ 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+ 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+ 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+ 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+ 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+ 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+ 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+ 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+ 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+ 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+ 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+ 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+ 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+ 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+ 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+ 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+ 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+ 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+ 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+ 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+ 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+ 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+ 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+ 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+ 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+ 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+ 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+ 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+ 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+ 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+ 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+ 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+ 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+ 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+ 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+ 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+ 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+ 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+ 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+ 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+ 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+ 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+ 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+ 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+ 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+ 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+ 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+ 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+ 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+ 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+ 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+ 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+ 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+ 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+ 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+ 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+ 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+ 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+ 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+ 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+ 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+ 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+ 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+ 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+ 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+ 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+ 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+ 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+ 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+ 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+ 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+ 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+ 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+ 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+ 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+ 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+ 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+ 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+ 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+ 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+ 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+ 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+ 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+ 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+ 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+ 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+ 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+ 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+ 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+ 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+ 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+ 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+ 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+ 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+ 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+ 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+ 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+ 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+ 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+ 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+ 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+ 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+ 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+ 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+ 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+ 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+ 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+ 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+ 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+ 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+ 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+ 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+ 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+ 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+ 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+ 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+ 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+ 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+ 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+ 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+ 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+ 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+ 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+ 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+ 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+ 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+ 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+ 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+ 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+ 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+ 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+ 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+ 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+ 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+ 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+ 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+ 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+ 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+ 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+ 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+ 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+ 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+ 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+ 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+ 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+ 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+ 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+ 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+ 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+ 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+ 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+ 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+ 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+ 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+ 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+ 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+ 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+ 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+ 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+ 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+ 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+ 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+ 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+ 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+ 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+ 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+ 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+ 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+ 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+ 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+ 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+ 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+ 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+ 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+ 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+ 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+ 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+ 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+ 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+ 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+ 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+ 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+ 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+ 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+ 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+ 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+ 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+ 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+ 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+ 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+ 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+ 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+ 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+ 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+ 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+ 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+ 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+ 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+ 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+ 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+ 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+ 73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+ 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+ 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+ 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+ 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+ 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+ 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+ 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+ 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+ 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+ 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+ 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+ 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+ 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+ 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+ 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+ 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+ 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+ 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+ 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+ 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+ 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+ 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+ 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+ 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+ 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+ 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+ 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+ 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+ 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+ 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+ 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+ 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+ 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+ 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+ 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+ 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+ 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+ 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+ 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+ 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+ 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+ 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+ 835584U, // <u,1,2,3>: Cost 0 copy LHS
+ 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+ 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+ 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+ 835584U, // <u,1,2,u>: Cost 0 copy LHS
+ 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+ 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+ 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+ 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+ 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+ 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+ 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+ 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+ 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+ 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+ 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+ 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+ 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+ 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+ 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+ 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+ 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+ 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+ 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+ 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+ 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+ 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+ 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+ 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+ 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+ 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+ 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+ 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+ 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+ 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+ 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+ 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+ 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+ 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 835584U, // <u,1,u,3>: Cost 0 copy LHS
+ 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+ 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+ 835584U, // <u,1,u,u>: Cost 0 copy LHS
+ 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+ 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+ 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+ 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+ 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+ 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+ 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+ 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+ 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+ 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+ 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+ 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+ 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+ 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+ 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+ 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+ 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+ 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+ 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+ 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+ 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+ 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+ 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+ 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+ 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+ 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+ 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+ 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+ 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+ 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+ 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+ 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+ 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+ 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+ 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+ 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+ 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+ 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+ 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+ 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+ 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+ 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+ 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+ 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+ 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+ 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+ 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+ 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+ 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+ 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+ 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+ 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+ 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+ 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+ 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+ 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+ 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+ 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+ 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+ 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+ 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+ 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+ 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+ 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+ 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+ 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+ 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+ 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+ 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+ 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+ 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+ 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+ 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+ 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+ 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+ 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+ 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+ 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+ 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+ 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+ 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+ 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+ 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+ 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+ 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+ 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+ 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+ 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+ 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+ 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+ 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+ 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+ 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+ 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+ 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+ 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+ 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+ 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+ 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+ 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+ 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+ 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+ 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+ 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+ 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+ 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+ 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+ 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+ 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+ 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+ 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+ 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+ 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+ 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+ 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+ 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+ 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+ 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+ 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+ 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+ 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+ 96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+ 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+ 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+ 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+ 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+ 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+ 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+ 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+ 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+ 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+ 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+ 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+ 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+ 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+ 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+ 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+ 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+ 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+ 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+ 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+ 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+ 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+ 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+ 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+ 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+ 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+ 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+ 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+ 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+ 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+ 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+ 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+ 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+ 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+ 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+ 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+ 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+ 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+ 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+ 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+ 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+ 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+ 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+ 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+ 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+ 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+ 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+ 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+ 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+ 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+ 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+ 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+ 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+ 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 27705344U, // <u,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,6,u>: Cost 0 copy RHS
+ 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+ 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+ 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+ 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+ 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+ 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+ 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+ 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+ 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+ 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+ 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 27705344U, // <u,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,u,u>: Cost 0 copy RHS
+ 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+ 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+ 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+ 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+ 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+ 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+ 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+ 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+ 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+ 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+ 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+ 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+ 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+ 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+ 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+ 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+ 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+ 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+ 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+ 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+ 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+ 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+ 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+ 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+ 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+ 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+ 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+ 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+ 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+ 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+ 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+ 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+ 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+ 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+ 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+ 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+ 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+ 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+ 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+ 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+ 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+ 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+ 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+ 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+ 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+ 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+ 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+ 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+ 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+ 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+ 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+ 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+ 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+ 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+ 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+ 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+ 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+ 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+ 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+ 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+ 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+ 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+ 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+ 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+ 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+ 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+ 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+ 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+ 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+ 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+ 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+ 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+ 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+ 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+ 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+ 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+ 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+ 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+ 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+ 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+ 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+ 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+ 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+ 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+ 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+ 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+ 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+ 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+ 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+ 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+ 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+ 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+ 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+ 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+ 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+ 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+ 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+ 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+ 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+ 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+ 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+ 835584U, // <u,u,2,3>: Cost 0 copy LHS
+ 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+ 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 835584U, // <u,u,2,u>: Cost 0 copy LHS
+ 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+ 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+ 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+ 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+ 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+ 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+ 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+ 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+ 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+ 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+ 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+ 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+ 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+ 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+ 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+ 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+ 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+ 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+ 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+ 27705344U, // <u,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,u,6,u>: Cost 0 copy RHS
+ 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+ 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+ 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+ 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+ 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+ 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+ 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+ 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+ 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+ 835584U, // <u,u,u,3>: Cost 0 copy LHS
+ 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+ 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+ 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+ 27705344U, // <u,u,u,7>: Cost 0 copy RHS
+ 835584U, // <u,u,u,u>: Cost 0 copy LHS
+ 0
+};
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
new file mode 100644
index 000000000000..9bd036a1eace
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -0,0 +1,127 @@
+//===- ARMRegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "ARMRegisterBankInfo.h"
+#include "ARMInstrInfo.h" // For the register classes
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+// FIXME: TableGen this.
+// If it grows too much and TableGen still isn't ready to do the job, extract it
+// into an ARMGenRegisterBankInfo.def (similar to AArch64).
+namespace llvm {
+namespace ARM {
+RegisterBank GPRRegBank; // The only bank so far; the constructor names it GPRB.
+RegisterBank *RegBanks[] = {&GPRRegBank}; // Passed to the RegisterBankInfo base.
+// NOTE(review): arguments are presumably {start bit, bit length, bank} - confirm.
+RegisterBankInfo::PartialMapping GPRPartialMapping{0, 32, GPRRegBank};
+// Three identical entries, each pointing at GPRPartialMapping with a count of 1.
+RegisterBankInfo::ValueMapping ValueMappings[] = {
+ {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}};
+} // end namespace ARM
+} // end namespace llvm
+
+ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
+ : RegisterBankInfo(ARM::RegBanks, ARM::NumRegisterBanks) {
+ static bool AlreadyInit = false;
+ // There is a single set of register banks whatever the subtarget is, and
+ // the table holding them (ARM::RegBanks) is a global shared by every
+ // instance, so only the first constructor call populates it.
+ // NOTE(review): the guard is a plain static bool; no locking is visible here.
+ // Once the banks are tablegen'ed this whole constructor becomes empty.
+ if (AlreadyInit)
+ return;
+ AlreadyInit = true;
+
+ // Initialize the GPR bank.
+ createRegisterBank(ARM::GPRRegBankID, "GPRB");
+ // Two classes are registered explicitly; the asserts below expect more.
+ addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRRegClassID, TRI);
+ addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRwithAPSRRegClassID, TRI);
+ const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID);
+ (void)RBGPR; // RBGPR is otherwise referenced only from the asserts.
+ assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnopcRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::rGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
+}
+
+/// Map the register class \p RC onto the register bank that holds it.
+///
+/// Only the GPR bank exists so far. Every class that the constructor either
+/// registers with GPRRegBankID or asserts to be covered by it is accepted
+/// here, so the two functions agree on what belongs to the bank. Any other
+/// class is a programming error and ends in llvm_unreachable.
+const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
+    const TargetRegisterClass &RC) const {
+  using namespace ARM;
+
+  switch (RC.getID()) {
+  case GPRRegClassID:
+  case GPRwithAPSRRegClassID:
+  case GPRnopcRegClassID:
+  case rGPRRegClassID:
+  case tGPRRegClassID:
+  case tcGPRRegClassID:
+  case tGPR_and_tcGPRRegClassID:
+    return getRegBank(ARM::GPRRegBankID);
+  default:
+    llvm_unreachable("Unsupported register kind");
+  }
+
+  // Never reached: each switch path above returns or is marked unreachable.
+  llvm_unreachable("Switch should handle all register classes");
+}
+
+/// Choose the operand-to-bank mapping for \p MI.
+///
+/// Opcodes that are not pre-isel generic are first offered to
+/// getInstrMappingImpl, whose answer is returned whenever it is valid.
+/// Otherwise only G_ADD, G_LOAD and G_FRAME_INDEX are mapped, all based on
+/// ARM::ValueMappings[0]; any other opcode yields a default-constructed
+/// InstructionMapping.
+RegisterBankInfo::InstructionMapping
+ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+  const auto Opcode = MI.getOpcode();
+
+  // Copies, and instructions that already have some operands assigned to
+  // banks, are served by the default logic.
+  if (!isPreISelGenericOpcode(Opcode)) {
+    InstructionMapping Generic = getInstrMappingImpl(MI);
+    if (Generic.isValid())
+      return Generic;
+  }
+
+  const ValueMapping *Operands = nullptr;
+
+  switch (Opcode) {
+  case TargetOpcode::G_ADD:
+  case TargetOpcode::G_LOAD:
+    // FIXME: We're abusing the fact that everything lives in a GPR for now; in
+    // the real world we would use different mappings.
+    Operands = &ARM::ValueMappings[0];
+    break;
+  case TargetOpcode::G_FRAME_INDEX:
+    Operands = getOperandsMapping({&ARM::ValueMappings[0], nullptr});
+    break;
+  default:
+    // Opcode not handled here: hand back a default-constructed mapping.
+    return InstructionMapping{};
+  }
+
+  return InstructionMapping{DefaultMappingID, /*Cost=*/1, Operands,
+                            MI.getNumOperands()};
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
new file mode 100644
index 000000000000..773920ee57a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -0,0 +1,41 @@
+//===- ARMRegisterBankInfo ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+namespace ARM {
+enum { // Identifiers of the ARM register banks.
+ GPRRegBankID = 0, // General purpose registers
+ NumRegisterBanks, // Number of banks listed above; keep this entry last.
+};
+} // end namespace ARM
+
+/// This class provides the information for the target register banks.
+class ARMRegisterBankInfo final : public RegisterBankInfo {
+public:
+ ARMRegisterBankInfo(const TargetRegisterInfo &TRI); // Sets up the GPR bank.
+ /// Returns the bank holding \p RC; unsupported classes are unreachable.
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+ /// Returns the mapping for \p MI, default-constructed if it is not handled.
+ InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
new file mode 100644
index 000000000000..e6e8cdf965e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -0,0 +1,19 @@
+//===-- ARMRegisterInfo.cpp - ARM Register Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMRegisterInfo.h"
+using namespace llvm;
+
+void ARMRegisterInfo::anchor() { } // Deliberately empty out-of-line definition.
+// The constructor only default-constructs the ARMBaseRegisterInfo base.
+ARMRegisterInfo::ARMRegisterInfo() : ARMBaseRegisterInfo() {}
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h
new file mode 100644
index 000000000000..e2e650e4af93
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h
@@ -0,0 +1,31 @@
+//===-- ARMRegisterInfo.h - ARM Register Information Impl -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMREGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMREGISTERINFO_H
+
+#include "ARMBaseRegisterInfo.h"
+
+namespace llvm {
+
+class ARMSubtarget;
+
+struct ARMRegisterInfo : public ARMBaseRegisterInfo {
+ virtual void anchor(); // Defined, with an empty body, in ARMRegisterInfo.cpp.
+public: // Redundant in a struct, whose members are public by default.
+ ARMRegisterInfo();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
new file mode 100644
index 000000000000..02cbfb1fa9f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -0,0 +1,430 @@
+//===-- ARMRegisterInfo.td - ARM Register defs -------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the ARM register file
+//===----------------------------------------------------------------------===//
+
+// Registers carry a hardware encoding (a 16-bit field; values in this file are 0-31).
+class ARMReg<bits<16> Enc, string n, list<Register> subregs = []> : Register<n> {
+ let HWEncoding = Enc;
+ let Namespace = "ARM";
+ let SubRegs = subregs;
+ // All bits of ARM registers with sub-registers are covered by sub-registers.
+ let CoveredBySubRegs = 1;
+}
+
+class ARMFReg<bits<16> Enc, string n> : Register<n> { // Sets no SubRegs; used below for S0-S31 and D16-D31.
+ let HWEncoding = Enc;
+ let Namespace = "ARM";
+}
+
+// Subregister indices.
+let Namespace = "ARM" in {
+def qqsub_0 : SubRegIndex<256>;
+def qqsub_1 : SubRegIndex<256, 256>;
+
+// Note: Code depends on these having consecutive numbers.
+def qsub_0 : SubRegIndex<128>;
+def qsub_1 : SubRegIndex<128, 128>;
+def qsub_2 : ComposedSubRegIndex<qqsub_1, qsub_0>;
+def qsub_3 : ComposedSubRegIndex<qqsub_1, qsub_1>;
+
+def dsub_0 : SubRegIndex<64>;
+def dsub_1 : SubRegIndex<64, 64>;
+def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
+def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
+def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
+def dsub_5 : ComposedSubRegIndex<qsub_2, dsub_1>;
+def dsub_6 : ComposedSubRegIndex<qsub_3, dsub_0>;
+def dsub_7 : ComposedSubRegIndex<qsub_3, dsub_1>;
+
+def ssub_0 : SubRegIndex<32>;
+def ssub_1 : SubRegIndex<32, 32>;
+def ssub_2 : ComposedSubRegIndex<dsub_1, ssub_0>;
+def ssub_3 : ComposedSubRegIndex<dsub_1, ssub_1>;
+
+def gsub_0 : SubRegIndex<32>;
+def gsub_1 : SubRegIndex<32, 32>;
+// Let TableGen synthesize the remaining 12 ssub_* indices.
+// We don't need to name them.
+}
+
+// Integer registers
+def R0 : ARMReg< 0, "r0">, DwarfRegNum<[0]>;
+def R1 : ARMReg< 1, "r1">, DwarfRegNum<[1]>;
+def R2 : ARMReg< 2, "r2">, DwarfRegNum<[2]>;
+def R3 : ARMReg< 3, "r3">, DwarfRegNum<[3]>;
+def R4 : ARMReg< 4, "r4">, DwarfRegNum<[4]>;
+def R5 : ARMReg< 5, "r5">, DwarfRegNum<[5]>;
+def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>;
+def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>;
+// These require 32-bit instructions.
+let CostPerUse = 1 in {
+def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>;
+def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>;
+def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
+def SP : ARMReg<13, "sp">, DwarfRegNum<[13]>;
+def LR : ARMReg<14, "lr">, DwarfRegNum<[14]>;
+def PC : ARMReg<15, "pc">, DwarfRegNum<[15]>;
+}
+
+// Float registers
+def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">;
+def S2 : ARMFReg< 2, "s2">; def S3 : ARMFReg< 3, "s3">;
+def S4 : ARMFReg< 4, "s4">; def S5 : ARMFReg< 5, "s5">;
+def S6 : ARMFReg< 6, "s6">; def S7 : ARMFReg< 7, "s7">;
+def S8 : ARMFReg< 8, "s8">; def S9 : ARMFReg< 9, "s9">;
+def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">;
+def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">;
+def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">;
+def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">;
+def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">;
+def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">;
+def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">;
+def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
+def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
+def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
+def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
+
+// Aliases of the S* registers used to hold 64-bit fp values (doubles)
+let SubRegIndices = [ssub_0, ssub_1] in {
+def D0 : ARMReg< 0, "d0", [S0, S1]>, DwarfRegNum<[256]>;
+def D1 : ARMReg< 1, "d1", [S2, S3]>, DwarfRegNum<[257]>;
+def D2 : ARMReg< 2, "d2", [S4, S5]>, DwarfRegNum<[258]>;
+def D3 : ARMReg< 3, "d3", [S6, S7]>, DwarfRegNum<[259]>;
+def D4 : ARMReg< 4, "d4", [S8, S9]>, DwarfRegNum<[260]>;
+def D5 : ARMReg< 5, "d5", [S10, S11]>, DwarfRegNum<[261]>;
+def D6 : ARMReg< 6, "d6", [S12, S13]>, DwarfRegNum<[262]>;
+def D7 : ARMReg< 7, "d7", [S14, S15]>, DwarfRegNum<[263]>;
+def D8 : ARMReg< 8, "d8", [S16, S17]>, DwarfRegNum<[264]>;
+def D9 : ARMReg< 9, "d9", [S18, S19]>, DwarfRegNum<[265]>;
+def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>;
+def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>;
+def D12 : ARMReg<12, "d12", [S24, S25]>, DwarfRegNum<[268]>;
+def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>;
+def D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>;
+def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>;
+}
+
+// VFP3 defines 16 additional double registers
+def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>;
+def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>;
+def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>;
+def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>;
+def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>;
+def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>;
+def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>;
+def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>;
+def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>;
+def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>;
+def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>;
+def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>;
+def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>;
+def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>;
+def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>;
+def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>;
+
+// Advanced SIMD (NEON) defines 16 quad-word aliases
+let SubRegIndices = [dsub_0, dsub_1] in {
+def Q0 : ARMReg< 0, "q0", [D0, D1]>;
+def Q1 : ARMReg< 1, "q1", [D2, D3]>;
+def Q2 : ARMReg< 2, "q2", [D4, D5]>;
+def Q3 : ARMReg< 3, "q3", [D6, D7]>;
+def Q4 : ARMReg< 4, "q4", [D8, D9]>;
+def Q5 : ARMReg< 5, "q5", [D10, D11]>;
+def Q6 : ARMReg< 6, "q6", [D12, D13]>;
+def Q7 : ARMReg< 7, "q7", [D14, D15]>;
+}
+let SubRegIndices = [dsub_0, dsub_1] in {
+def Q8 : ARMReg< 8, "q8", [D16, D17]>;
+def Q9 : ARMReg< 9, "q9", [D18, D19]>;
+def Q10 : ARMReg<10, "q10", [D20, D21]>;
+def Q11 : ARMReg<11, "q11", [D22, D23]>;
+def Q12 : ARMReg<12, "q12", [D24, D25]>;
+def Q13 : ARMReg<13, "q13", [D26, D27]>;
+def Q14 : ARMReg<14, "q14", [D28, D29]>;
+def Q15 : ARMReg<15, "q15", [D30, D31]>;
+}
+
+// Current Program Status Register.
+// We model fpscr with two registers: FPSCR models the control bits and will be
+// reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV
+// models the APSR when it's accessed by some special instructions. In such cases
+// it has the same encoding as PC.
+def CPSR : ARMReg<0, "cpsr">;
+def APSR : ARMReg<1, "apsr">;
+def APSR_NZCV : ARMReg<15, "apsr_nzcv">;
+def SPSR : ARMReg<2, "spsr">;
+def FPSCR : ARMReg<3, "fpscr">;
+def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> {
+ let Aliases = [FPSCR];
+}
+def ITSTATE : ARMReg<4, "itstate">;
+
+// Special Registers - only available in privileged mode.
+def FPSID : ARMReg<0, "fpsid">;
+def MVFR2 : ARMReg<5, "mvfr2">;
+def MVFR1 : ARMReg<6, "mvfr1">;
+def MVFR0 : ARMReg<7, "mvfr0">;
+def FPEXC : ARMReg<8, "fpexc">;
+def FPINST : ARMReg<9, "fpinst">;
+def FPINST2 : ARMReg<10, "fpinst2">;
+
+// Register classes.
+//
+// pc == Program Counter
+// lr == Link Register
+// sp == Stack Pointer
+// r12 == ip (scratch)
+// r7 == Frame Pointer (thumb-style backtraces)
+// r9 == May be reserved as Thread Register
+// r11 == Frame Pointer (arm-style backtraces)
+// r10 == Stack Limit
+//
+def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
+ SP, LR, PC)> {
+ // Allocate LR as the first CSR since it is always saved anyway.
+ // For Thumb1 mode, we don't want to allocate hi regs at all, as we don't
+ // know how to spill them. If we make our prologue/epilogue code smarter at
+ // some point, we can go back to using the above allocation orders for the
+ // Thumb1 instructions that know how to use hi regs.
+ let AltOrders = [(add LR, GPR), (trunc GPR, 8)]; // Select value 1: LR first; value 2 (Thumb1-only): first 8 members.
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// GPRs without the PC. Some ARM instructions do not allow the PC in
+// certain operand slots, particularly as the destination. Primarily
+// useful for disassembly.
+def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
+ let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// GPRs without the PC but with APSR. Some instructions allow accessing the
+// APSR, while actually encoding PC in the register field. This is useful
+// for assembly and disassembly only.
+def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)> {
+ let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)]; // NOTE(review): both orders derive from GPRnopc, so APSR_NZCV is in neither - confirm intended.
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
+// implied SP argument list.
+// FIXME: It would be better to not use this at all and refactor the
+// instructions to not have SP as an explicit argument. That makes
+// frame index resolution a bit trickier, though.
+def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)>;
+
+// restricted GPR register class. Many Thumb2 instructions allow the full
+// register range for operands, but have undefined behaviours when PC
+// or SP (R13 or R15) are used. The ARM ISA refers to these operands
+// via the BadReg() pseudo-code description.
+def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
+ let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// Thumb registers are R0-R7 normally. Some instructions can still use
+// the general GPR register class above (MOV, e.g.)
+def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>;
+
+// The high registers in thumb mode, R8-R15.
+def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of
+// this class and the preceding one(!) This is what we want.
+def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
+ let AltOrders = [(and tcGPR, tGPR)]; // Order used when the selector below returns 1 (Thumb1-only): members shared with tGPR.
+ let AltOrderSelect = [{
+ return MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// Condition code registers.
+def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+
+// Scalar single precision floating point register class.
+// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
+// to avoid partial-write dependencies on D or Q (depending on platform)
+// registers (S registers are renamed as portions of D/Q registers).
+def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
+ let AltOrders = [(add (decimate SPR, 2), SPR),
+ (add (decimate SPR, 4),
+ (decimate SPR, 2),
+ (decimate (rotl SPR, 1), 4),
+ (decimate (rotl SPR, 1), 2))];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ }];
+}
+
+// Subset of SPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
+
+// Scalar double precision floating point / generic 64-bit vector register
+// class.
+// ARM requires only word alignment for double. It's more performant if it
+// is double-word alignment though.
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+ (sequence "D%u", 0, 31)> {
+ // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on
+ // Darwin platforms.
+ let AltOrders = [(rotl DPR, 16),
+ (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ }];
+}
+
+// Subset of DPR that are accessible with VFP2 (and so that also have
+// 32-bit SPR subregs).
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+ (trunc DPR, 16)>;
+
+// Subset of DPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+ (trunc DPR, 8)>;
+
+// Generic 128-bit vector register class.
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
+ (sequence "Q%u", 0, 15)> {
+ // Allocate non-VFP2 aliases Q8-Q15 first.
+ let AltOrders = [(rotl QPR, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+// Subset of QPR that have 32-bit SPR subregs.
+def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (trunc QPR, 8)>;
+
+// Subset of QPR that have DPR_8 and SPR_8 subregs.
+def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (trunc QPR, 4)>;
+
+// Pseudo-registers representing odd-even pairs of D registers. The even-odd
+// pairs are already represented by the Q registers.
+// These are needed by NEON instructions requiring two consecutive D registers.
+// There is no D31_D0 register as that is always an UNPREDICTABLE encoding.
+def TuplesOE2D : RegisterTuples<[dsub_0, dsub_1],
+ [(decimate (shl DPR, 1), 2),
+ (decimate (shl DPR, 2), 2)]>;
+
+// Register class representing a pair of consecutive D registers.
+// Use the Q registers for the even-odd pairs.
+def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (interleave QPR, TuplesOE2D)> {
+ // Allocate starting at non-VFP2 registers D16-D31 first.
+ // Prefer even-odd pairs as they are easier to copy.
+ let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16))];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+// Pseudo-registers representing even-odd pairs of GPRs from R0 to R13/SP.
+// These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs.
+def Tuples2R : RegisterTuples<[gsub_0, gsub_1],
+ [(add R0, R2, R4, R6, R8, R10, R12),
+ (add R1, R3, R5, R7, R9, R11, SP)]>;
+
+// Register class representing a pair of even-odd GPRs.
+def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> {
+ let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+}
+
+// Pseudo-registers representing 3 consecutive D registers.
+def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
+ [(shl DPR, 0),
+ (shl DPR, 1),
+ (shl DPR, 2)]>;
+
+// 3 consecutive D registers.
+def DTriple : RegisterClass<"ARM", [untyped], 64, (add Tuples3D)> {
+ let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+// Pseudo 256-bit registers to represent pairs of Q registers. These should
+// never be present in the emitted code.
+// These are used for NEON load / store instructions, e.g., vld4, vst3.
+def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], [(shl QPR, 0), (shl QPR, 1)]>;
+
+// Pseudo 256-bit vector register class to model pairs of Q registers
+// (4 consecutive D registers).
+def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> {
+ // Allocate non-VFP2 aliases first.
+ let AltOrders = [(rotl QQPR, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+// Tuples of 4 D regs that aren't also pairs of Q regs.
+def TuplesOE4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
+ [(decimate (shl DPR, 1), 2),
+ (decimate (shl DPR, 2), 2),
+ (decimate (shl DPR, 3), 2),
+ (decimate (shl DPR, 4), 2)]>;
+
+// 4 consecutive D registers.
+def DQuad : RegisterClass<"ARM", [v4i64], 256,
+ (interleave Tuples2Q, TuplesOE4D)>;
+
+// Pseudo 512-bit registers to represent four consecutive Q registers.
+def Tuples2QQ : RegisterTuples<[qqsub_0, qqsub_1],
+ [(shl QQPR, 0), (shl QQPR, 2)]>;
+
+// Pseudo 512-bit vector register class to model 4 consecutive Q registers
+// (8 consecutive D registers).
+def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> { // NOTE(review): 256 here is presumably the alignment operand, not the 512-bit size - confirm.
+ // Allocate non-VFP2 aliases first.
+ let AltOrders = [(rotl QQQQPR, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+
+// Pseudo-registers representing 2-spaced consecutive D registers.
+def Tuples2DSpc : RegisterTuples<[dsub_0, dsub_2],
+ [(shl DPR, 0),
+ (shl DPR, 2)]>;
+
+// Spaced pairs of D registers.
+def DPairSpc : RegisterClass<"ARM", [v2i64], 64, (add Tuples2DSpc)>;
+
+def Tuples3DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4],
+ [(shl DPR, 0),
+ (shl DPR, 2),
+ (shl DPR, 4)]>;
+
+// Spaced triples of D registers.
+def DTripleSpc : RegisterClass<"ARM", [untyped], 64, (add Tuples3DSpc)> {
+ let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+def Tuples4DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4, dsub_6],
+ [(shl DPR, 0),
+ (shl DPR, 2),
+ (shl DPR, 4),
+ (shl DPR, 6)]>;
+
+// Spaced quads of D registers.
+def DQuadSpc : RegisterClass<"ARM", [v4i64], 64, (add Tuples3DSpc)>; // NOTE(review): built from Tuples3DSpc (triples); Tuples4DSpc above is not referenced in this file - confirm intended.
diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
new file mode 100644
index 000000000000..b7d2d34614df
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
@@ -0,0 +1,367 @@
+//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Instruction scheduling annotations for out-of-order CPUs.
+// These annotations are independent of the itinerary class defined below.
+// Here we define the subtarget independent read/write per-operand resources.
+// The subtarget schedule definitions will then map these to the subtarget's
+// resource usages.
+// For example:
+// The instruction cycle timings table might contain an entry for an operation
+// like the following:
+// Rd <- ADD Rn, Rm, <shift> Rs
+// Uops | Latency from register | Uops - resource requirements - latency
+// 2 | Rn: 1 Rm: 4 Rs: 4 | uop T0, Rm, Rs - P01 - 3
+// | | uopc Rd, Rn, T0 - P01 - 1
+// This is telling us that the result will be available in destination register
+// Rd after a minimum of three cycles after the result in Rm and Rs is available
+// and one cycle after the result in Rn is available. The micro-ops can execute
+// on resource P01.
+// To model this, we need to express that we need to dispatch two micro-ops,
+// that the resource P01 is needed and that the latency to Rn is different than
+// the latency to Rm and Rs. The scheduler can decrease Rn's producer latency by
+// two.
+// We will do this by assigning (abstract) resources to register defs/uses.
+// ARMSchedule.td:
+// def WriteALUsr : SchedWrite;
+// def ReadAdvanceALUsr : SchedRead;
+//
+// ARMInstrInfo.td:
+// def ADDrs : I<>, Sched<[WriteALUsr, ReadAdvanceALUsr, ReadDefault,
+// ReadDefault]> { ...}
+// ReadAdvance read resources allow us to define "pipeline by-passes" or
+// shorter latencies to certain registers as needed in the example above.
+// The "ReadDefault" can be omitted.
+// Next, the subtarget td file assigns resources to the abstract resources
+// defined here.
+// ARMScheduleSubtarget.td:
+// // Resources.
+// def P01 : ProcResource<3>; // ALU unit (3 of it).
+// ...
+// // Resource usages.
+// def : WriteRes<WriteALUsr, [P01, P01]> {
+// Latency = 4; // Latency of 4.
+// NumMicroOps = 2; // Dispatch 2 micro-ops.
+// // The two instances of resource P01 are occupied for one cycle. It is one
+// // cycle because these resources happen to be pipelined.
+// ResourceCycles = [1, 1];
+// }
+// def : ReadAdvance<ReadAdvanceALUsr, 3>;
+
+// Basic ALU operation.
+def WriteALU : SchedWrite;
+def ReadALU : SchedRead;
+
+// Basic ALU with shifts.
+def WriteALUsi : SchedWrite; // Shift by immediate.
+def WriteALUsr : SchedWrite; // Shift by register.
+def WriteALUSsr : SchedWrite; // Shift by register (flag setting).
+def ReadALUsr : SchedRead; // Some operands are read later.
+
+// Compares.
+def WriteCMP : SchedWrite;
+def WriteCMPsi : SchedWrite;
+def WriteCMPsr : SchedWrite;
+
+// Division.
+def WriteDiv : SchedWrite;
+
+// Loads.
+def WriteLd : SchedWrite;
+def WritePreLd : SchedWrite;
+
+// Branches.
+def WriteBr : SchedWrite;
+def WriteBrL : SchedWrite;
+def WriteBrTbl : SchedWrite;
+
+// Fixpoint conversions.
+def WriteCvtFP : SchedWrite;
+
+// Noop.
+def WriteNoop : SchedWrite;
+
+// Define TII (SchedModel's instr info cast to ARMBaseInstrInfo) for use in SchedVariant Predicates.
+def : PredicateProlog<[{
+ const ARMBaseInstrInfo *TII =
+ static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo());
+ (void)TII;
+}]>;
+
+def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>; // Holds when TII reports *MI as predicated.
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for ARM
+//
+def IIC_iALUx : InstrItinClass;
+def IIC_iALUi : InstrItinClass;
+def IIC_iALUr : InstrItinClass;
+def IIC_iALUsi : InstrItinClass;
+def IIC_iALUsir : InstrItinClass;
+def IIC_iALUsr : InstrItinClass;
+def IIC_iBITi : InstrItinClass;
+def IIC_iBITr : InstrItinClass;
+def IIC_iBITsi : InstrItinClass;
+def IIC_iBITsr : InstrItinClass;
+def IIC_iUNAr : InstrItinClass;
+def IIC_iUNAsi : InstrItinClass;
+def IIC_iEXTr : InstrItinClass;
+def IIC_iEXTAr : InstrItinClass;
+def IIC_iEXTAsr : InstrItinClass;
+def IIC_iCMPi : InstrItinClass;
+def IIC_iCMPr : InstrItinClass;
+def IIC_iCMPsi : InstrItinClass;
+def IIC_iCMPsr : InstrItinClass;
+def IIC_iTSTi : InstrItinClass;
+def IIC_iTSTr : InstrItinClass;
+def IIC_iTSTsi : InstrItinClass;
+def IIC_iTSTsr : InstrItinClass;
+def IIC_iMOVi : InstrItinClass;
+def IIC_iMOVr : InstrItinClass;
+def IIC_iMOVsi : InstrItinClass;
+def IIC_iMOVsr : InstrItinClass;
+def IIC_iMOVix2 : InstrItinClass;
+def IIC_iMOVix2addpc : InstrItinClass;
+def IIC_iMOVix2ld : InstrItinClass;
+def IIC_iMVNi : InstrItinClass;
+def IIC_iMVNr : InstrItinClass;
+def IIC_iMVNsi : InstrItinClass;
+def IIC_iMVNsr : InstrItinClass;
+def IIC_iCMOVi : InstrItinClass;
+def IIC_iCMOVr : InstrItinClass;
+def IIC_iCMOVsi : InstrItinClass;
+def IIC_iCMOVsr : InstrItinClass;
+def IIC_iCMOVix2 : InstrItinClass;
+def IIC_iMUL16 : InstrItinClass;
+def IIC_iMAC16 : InstrItinClass;
+def IIC_iMUL32 : InstrItinClass;
+def IIC_iMAC32 : InstrItinClass;
+def IIC_iMUL64 : InstrItinClass;
+def IIC_iMAC64 : InstrItinClass;
+def IIC_iDIV : InstrItinClass;
+def IIC_iLoad_i : InstrItinClass;
+def IIC_iLoad_r : InstrItinClass;
+def IIC_iLoad_si : InstrItinClass;
+def IIC_iLoad_iu : InstrItinClass;
+def IIC_iLoad_ru : InstrItinClass;
+def IIC_iLoad_siu : InstrItinClass;
+def IIC_iLoad_bh_i : InstrItinClass;
+def IIC_iLoad_bh_r : InstrItinClass;
+def IIC_iLoad_bh_si : InstrItinClass;
+def IIC_iLoad_bh_iu : InstrItinClass;
+def IIC_iLoad_bh_ru : InstrItinClass;
+def IIC_iLoad_bh_siu : InstrItinClass;
+def IIC_iLoad_d_i : InstrItinClass;
+def IIC_iLoad_d_r : InstrItinClass;
+def IIC_iLoad_d_ru : InstrItinClass;
+def IIC_iLoad_m : InstrItinClass;
+def IIC_iLoad_mu : InstrItinClass;
+def IIC_iLoad_mBr : InstrItinClass;
+def IIC_iPop : InstrItinClass;
+def IIC_iPop_Br : InstrItinClass;
+def IIC_iLoadiALU : InstrItinClass;
+def IIC_iStore_i : InstrItinClass;
+def IIC_iStore_r : InstrItinClass;
+def IIC_iStore_si : InstrItinClass;
+def IIC_iStore_iu : InstrItinClass;
+def IIC_iStore_ru : InstrItinClass;
+def IIC_iStore_siu : InstrItinClass;
+def IIC_iStore_bh_i : InstrItinClass;
+def IIC_iStore_bh_r : InstrItinClass;
+def IIC_iStore_bh_si : InstrItinClass;
+def IIC_iStore_bh_iu : InstrItinClass;
+def IIC_iStore_bh_ru : InstrItinClass;
+def IIC_iStore_bh_siu : InstrItinClass;
+def IIC_iStore_d_i : InstrItinClass;
+def IIC_iStore_d_r : InstrItinClass;
+def IIC_iStore_d_ru : InstrItinClass;
+def IIC_iStore_m : InstrItinClass;
+def IIC_iStore_mu : InstrItinClass;
+def IIC_Preload : InstrItinClass;
+def IIC_Br : InstrItinClass;
+def IIC_fpSTAT : InstrItinClass;
+def IIC_fpUNA16 : InstrItinClass;
+def IIC_fpUNA32 : InstrItinClass;
+def IIC_fpUNA64 : InstrItinClass;
+def IIC_fpCMP16 : InstrItinClass;
+def IIC_fpCMP32 : InstrItinClass;
+def IIC_fpCMP64 : InstrItinClass;
+def IIC_fpCVTSD : InstrItinClass;
+def IIC_fpCVTDS : InstrItinClass;
+def IIC_fpCVTSH : InstrItinClass;
+def IIC_fpCVTHS : InstrItinClass;
+def IIC_fpCVTIH : InstrItinClass;
+def IIC_fpCVTIS : InstrItinClass;
+def IIC_fpCVTID : InstrItinClass;
+def IIC_fpCVTHI : InstrItinClass;
+def IIC_fpCVTSI : InstrItinClass;
+def IIC_fpCVTDI : InstrItinClass;
+def IIC_fpMOVIS : InstrItinClass;
+def IIC_fpMOVID : InstrItinClass;
+def IIC_fpMOVSI : InstrItinClass;
+def IIC_fpMOVDI : InstrItinClass;
+def IIC_fpALU16 : InstrItinClass;
+def IIC_fpALU32 : InstrItinClass;
+def IIC_fpALU64 : InstrItinClass;
+def IIC_fpMUL16 : InstrItinClass;
+def IIC_fpMUL32 : InstrItinClass;
+def IIC_fpMUL64 : InstrItinClass;
+def IIC_fpMAC16 : InstrItinClass;
+def IIC_fpMAC32 : InstrItinClass;
+def IIC_fpMAC64 : InstrItinClass;
+def IIC_fpFMAC16 : InstrItinClass;
+def IIC_fpFMAC32 : InstrItinClass;
+def IIC_fpFMAC64 : InstrItinClass;
+def IIC_fpDIV16 : InstrItinClass;
+def IIC_fpDIV32 : InstrItinClass;
+def IIC_fpDIV64 : InstrItinClass;
+def IIC_fpSQRT16 : InstrItinClass;
+def IIC_fpSQRT32 : InstrItinClass;
+def IIC_fpSQRT64 : InstrItinClass;
+def IIC_fpLoad16 : InstrItinClass;
+def IIC_fpLoad32 : InstrItinClass;
+def IIC_fpLoad64 : InstrItinClass;
+def IIC_fpLoad_m : InstrItinClass;
+def IIC_fpLoad_mu : InstrItinClass;
+def IIC_fpStore16 : InstrItinClass;
+def IIC_fpStore32 : InstrItinClass;
+def IIC_fpStore64 : InstrItinClass;
+def IIC_fpStore_m : InstrItinClass;
+def IIC_fpStore_mu : InstrItinClass;
+def IIC_VLD1 : InstrItinClass;
+def IIC_VLD1x2 : InstrItinClass;
+def IIC_VLD1x3 : InstrItinClass;
+def IIC_VLD1x4 : InstrItinClass;
+def IIC_VLD1u : InstrItinClass;
+def IIC_VLD1x2u : InstrItinClass;
+def IIC_VLD1x3u : InstrItinClass;
+def IIC_VLD1x4u : InstrItinClass;
+def IIC_VLD1ln : InstrItinClass;
+def IIC_VLD1lnu : InstrItinClass;
+def IIC_VLD1dup : InstrItinClass;
+def IIC_VLD1dupu : InstrItinClass;
+def IIC_VLD2 : InstrItinClass;
+def IIC_VLD2x2 : InstrItinClass;
+def IIC_VLD2u : InstrItinClass;
+def IIC_VLD2x2u : InstrItinClass;
+def IIC_VLD2ln : InstrItinClass;
+def IIC_VLD2lnu : InstrItinClass;
+def IIC_VLD2dup : InstrItinClass;
+def IIC_VLD2dupu : InstrItinClass;
+def IIC_VLD3 : InstrItinClass;
+def IIC_VLD3ln : InstrItinClass;
+def IIC_VLD3u : InstrItinClass;
+def IIC_VLD3lnu : InstrItinClass;
+def IIC_VLD3dup : InstrItinClass;
+def IIC_VLD3dupu : InstrItinClass;
+def IIC_VLD4 : InstrItinClass;
+def IIC_VLD4ln : InstrItinClass;
+def IIC_VLD4u : InstrItinClass;
+def IIC_VLD4lnu : InstrItinClass;
+def IIC_VLD4dup : InstrItinClass;
+def IIC_VLD4dupu : InstrItinClass;
+def IIC_VST1 : InstrItinClass;
+def IIC_VST1x2 : InstrItinClass;
+def IIC_VST1x3 : InstrItinClass;
+def IIC_VST1x4 : InstrItinClass;
+def IIC_VST1u : InstrItinClass;
+def IIC_VST1x2u : InstrItinClass;
+def IIC_VST1x3u : InstrItinClass;
+def IIC_VST1x4u : InstrItinClass;
+def IIC_VST1ln : InstrItinClass;
+def IIC_VST1lnu : InstrItinClass;
+def IIC_VST2 : InstrItinClass;
+def IIC_VST2x2 : InstrItinClass;
+def IIC_VST2u : InstrItinClass;
+def IIC_VST2x2u : InstrItinClass;
+def IIC_VST2ln : InstrItinClass;
+def IIC_VST2lnu : InstrItinClass;
+def IIC_VST3 : InstrItinClass;
+def IIC_VST3u : InstrItinClass;
+def IIC_VST3ln : InstrItinClass;
+def IIC_VST3lnu : InstrItinClass;
+def IIC_VST4 : InstrItinClass;
+def IIC_VST4u : InstrItinClass;
+def IIC_VST4ln : InstrItinClass;
+def IIC_VST4lnu : InstrItinClass;
+def IIC_VUNAD : InstrItinClass;
+def IIC_VUNAQ : InstrItinClass;
+def IIC_VBIND : InstrItinClass;
+def IIC_VBINQ : InstrItinClass;
+def IIC_VPBIND : InstrItinClass;
+def IIC_VFMULD : InstrItinClass;
+def IIC_VFMULQ : InstrItinClass;
+def IIC_VMOV : InstrItinClass;
+def IIC_VMOVImm : InstrItinClass;
+def IIC_VMOVD : InstrItinClass;
+def IIC_VMOVQ : InstrItinClass;
+def IIC_VMOVIS : InstrItinClass;
+def IIC_VMOVID : InstrItinClass;
+def IIC_VMOVISL : InstrItinClass;
+def IIC_VMOVSI : InstrItinClass;
+def IIC_VMOVDI : InstrItinClass;
+def IIC_VMOVN : InstrItinClass;
+def IIC_VPERMD : InstrItinClass;
+def IIC_VPERMQ : InstrItinClass;
+def IIC_VPERMQ3 : InstrItinClass;
+def IIC_VMACD : InstrItinClass;
+def IIC_VMACQ : InstrItinClass;
+def IIC_VFMACD : InstrItinClass;
+def IIC_VFMACQ : InstrItinClass;
+def IIC_VRECSD : InstrItinClass;
+def IIC_VRECSQ : InstrItinClass;
+def IIC_VCNTiD : InstrItinClass;
+def IIC_VCNTiQ : InstrItinClass;
+def IIC_VUNAiD : InstrItinClass;
+def IIC_VUNAiQ : InstrItinClass;
+def IIC_VQUNAiD : InstrItinClass;
+def IIC_VQUNAiQ : InstrItinClass;
+def IIC_VBINiD : InstrItinClass;
+def IIC_VBINiQ : InstrItinClass;
+def IIC_VSUBiD : InstrItinClass;
+def IIC_VSUBiQ : InstrItinClass;
+def IIC_VBINi4D : InstrItinClass;
+def IIC_VBINi4Q : InstrItinClass;
+def IIC_VSUBi4D : InstrItinClass;
+def IIC_VSUBi4Q : InstrItinClass;
+def IIC_VABAD : InstrItinClass;
+def IIC_VABAQ : InstrItinClass;
+def IIC_VSHLiD : InstrItinClass;
+def IIC_VSHLiQ : InstrItinClass;
+def IIC_VSHLi4D : InstrItinClass;
+def IIC_VSHLi4Q : InstrItinClass;
+def IIC_VPALiD : InstrItinClass;
+def IIC_VPALiQ : InstrItinClass;
+def IIC_VMULi16D : InstrItinClass;
+def IIC_VMULi32D : InstrItinClass;
+def IIC_VMULi16Q : InstrItinClass;
+def IIC_VMULi32Q : InstrItinClass;
+def IIC_VMACi16D : InstrItinClass;
+def IIC_VMACi32D : InstrItinClass;
+def IIC_VMACi16Q : InstrItinClass;
+def IIC_VMACi32Q : InstrItinClass;
+def IIC_VEXTD : InstrItinClass;
+def IIC_VEXTQ : InstrItinClass;
+def IIC_VTB1 : InstrItinClass;
+def IIC_VTB2 : InstrItinClass;
+def IIC_VTB3 : InstrItinClass;
+def IIC_VTB4 : InstrItinClass;
+def IIC_VTBX1 : InstrItinClass;
+def IIC_VTBX2 : InstrItinClass;
+def IIC_VTBX3 : InstrItinClass;
+def IIC_VTBX4 : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "ARMScheduleV6.td"
+include "ARMScheduleA8.td"
+include "ARMScheduleA9.td"
+include "ARMScheduleSwift.td"
+include "ARMScheduleR52.td"
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
new file mode 100644
index 000000000000..ba380cba100f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
@@ -0,0 +1,1075 @@
+//=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A8 processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from "Cortex-A8 Technical Reference Manual".
+// Functional Units.
+def A8_Pipe0 : FuncUnit; // integer pipeline 0 (the only issue pipe used by multiplies and load/store multiple)
+def A8_Pipe1 : FuncUnit; // integer pipeline 1
+def A8_LSPipe : FuncUnit; // Load / store pipeline
+def A8_NPipe : FuncUnit; // NEON ALU/MUL pipe
+def A8_NLSPipe : FuncUnit; // NEON LS pipe
+//
+// Dual issue pipeline represented by A8_Pipe0 | A8_Pipe1
+//
+def CortexA8Itineraries : ProcessorItineraries<
+ [A8_Pipe0, A8_Pipe1, A8_LSPipe, A8_NPipe, A8_NLSPipe],
+ [], [
+ // Two fully-pipelined integer ALU pipelines
+ //
+ // No operand cycles
+ InstrItinData<IIC_iALUx , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>,
+ //
+ // Binary Instructions that produce a result
+ InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
+ InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+ InstrItinData<IIC_iALUsir,[InstrStage<1,[A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>,
+ InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
+ //
+ // Bitwise Instructions that produce a result
+ InstrItinData<IIC_iBITi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iBITr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
+ InstrItinData<IIC_iBITsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+ InstrItinData<IIC_iBITsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
+ //
+ // Unary Instructions that produce a result
+ InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+ //
+ // Zero and sign extension instructions
+ InstrItinData<IIC_iEXTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+ InstrItinData<IIC_iEXTAr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+ InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>],[2, 2, 1, 1]>,
+ //
+ // Compare instructions
+ InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+ InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iCMPsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+ //
+ // Test instructions
+ InstrItinData<IIC_iTSTi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+ InstrItinData<IIC_iTSTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iTSTsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iTSTsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+ //
+ // Move instructions, unconditional
+ InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
+ InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+ InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+ InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
+ InstrItinData<IIC_iMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+ InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3]>,
+ InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<1, [A8_LSPipe]>], [5]>,
+ //
+ // Move instructions, conditional
+ InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+ InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+ InstrItinData<IIC_iCMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3, 1]>,
+ //
+ // MVN instructions
+ InstrItinData<IIC_iMVNi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
+ InstrItinData<IIC_iMVNr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+ InstrItinData<IIC_iMVNsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+ InstrItinData<IIC_iMVNsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
+
+ // Integer multiply pipeline
+ // Result written in E5, but that is relative to the last cycle of multicycle,
+ // so we use 6 for those cases
+ //
+ InstrItinData<IIC_iMUL16 , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>,
+ InstrItinData<IIC_iMAC16 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+ InstrItinData<IIC_iMUL32 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>,
+ InstrItinData<IIC_iMAC32 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+ InstrItinData<IIC_iMUL64 , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+ InstrItinData<IIC_iMAC64 , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+
+ // Integer load pipeline
+ //
+ // Immediate offset
+ InstrItinData<IIC_iLoad_i , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+ InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+ InstrItinData<IIC_iLoad_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iLoad_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+ InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+ //
+ // Scaled register offset, issues over 2 cycles
+ // FIXME: lsl by 2 takes 1 cycle.
+ InstrItinData<IIC_iLoad_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
+ InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+ InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+ //
+ // Scaled register offset with update, issues over 2 cycles
+ InstrItinData<IIC_iLoad_siu , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+ //
+ // Load multiple, def is the 5th operand. Pipeline 0 only.
+ // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+ InstrItinData<IIC_iLoad_m , [InstrStage<2, [A8_Pipe0], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Load multiple + update, defs are the 1st and 5th operands.
+ InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Load multiple plus branch
+ InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>,
+ InstrStage<3, [A8_LSPipe]>,
+ InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+ [1, 2, 1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Pop, def is the 3rd operand.
+ InstrItinData<IIC_iPop , [InstrStage<3, [A8_Pipe0], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Pop plus branch, def is the 3rd operand.
+ InstrItinData<IIC_iPop_Br, [InstrStage<3, [A8_Pipe0], 0>,
+ InstrStage<3, [A8_LSPipe]>,
+ InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+ [1, 1, 3], [], -1>, // dynamic uops
+ //
+ // iLoadi + iALUr for t2LDRpci_pic.
+ InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>,
+ InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [4, 1]>,
+
+
+ // Integer store pipeline
+ //
+ // Immediate offset
+ InstrItinData<IIC_iStore_i , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+ InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+ InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iStore_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+ InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+ //
+ // Scaled register offset, issues over 2 cycles
+ InstrItinData<IIC_iStore_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iStore_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+ InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iStore_ru , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+ InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+ //
+ // Scaled register offset with update, issues over 2 cycles
+ InstrItinData<IIC_iStore_siu, [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+ //
+ // Store multiple. Pipeline 0 only.
+ // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+ InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [], [], -1>, // dynamic uops
+ //
+ // Store multiple + update
+ InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2], [], -1>, // dynamic uops
+ //
+ // Preload
+ InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+
+ // Branch
+ //
+ // no delay slots, so the latency of a branch is unimportant
+ InstrItinData<IIC_Br , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>,
+
+ // VFP
+ // Issue through integer pipeline, and execute in NEON unit. We assume
+ // RunFast mode so that NFP pipeline is used for single-precision when
+ // possible.
+ //
+ // FP Special Register to Integer Register File Move
+ InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>], [20]>,
+ //
+ // Single-precision FP Unary
+ InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP Unary
+ InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NPipe], 0>,
+ InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
+ //
+ // Single-precision FP Compare
+ InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [1, 1]>,
+ //
+ // Double-precision FP Compare
+ InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NPipe], 0>,
+ InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
+ //
+ // Single to Double FP Convert
+ InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<7, [A8_NPipe], 0>,
+ InstrStage<7, [A8_NLSPipe]>], [7, 1]>,
+ //
+ // Double to Single FP Convert
+ InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<5, [A8_NPipe], 0>,
+ InstrStage<5, [A8_NLSPipe]>], [5, 1]>,
+ //
+ // Single-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [7, 1]>,
+ //
+ // Double-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<8, [A8_NPipe], 0>,
+ InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
+ //
+ // Integer to Single-Precision FP Convert
+ InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [7, 1]>,
+ //
+ // Integer to Double-Precision FP Convert
+ InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<8, [A8_NPipe], 0>,
+ InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
+ //
+ // Single-precision FP ALU
+ InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
+ //
+ // Double-precision FP ALU
+ InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<9, [A8_NPipe], 0>,
+ InstrStage<9, [A8_NLSPipe]>], [9, 1, 1]>,
+ //
+ // Single-precision FP Multiply
+ InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
+ //
+ // Double-precision FP Multiply
+ InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<11, [A8_NPipe], 0>,
+ InstrStage<11, [A8_NLSPipe]>], [11, 1, 1]>,
+ //
+ // Single-precision FP MAC
+ InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+ //
+ // Double-precision FP MAC
+ InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<19, [A8_NPipe], 0>,
+ InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+ //
+ // Single-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+ //
+ // Double-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<19, [A8_NPipe], 0>,
+ InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+ //
+ // Single-precision FP DIV
+ InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<20, [A8_NPipe], 0>,
+ InstrStage<20, [A8_NLSPipe]>], [20, 1, 1]>,
+ //
+ // Double-precision FP DIV
+ InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<29, [A8_NPipe], 0>,
+ InstrStage<29, [A8_NLSPipe]>], [29, 1, 1]>,
+ //
+ // Single-precision FP SQRT
+ InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<19, [A8_NPipe], 0>,
+ InstrStage<19, [A8_NLSPipe]>], [19, 1]>,
+ //
+ // Double-precision FP SQRT
+ InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<29, [A8_NPipe], 0>,
+ InstrStage<29, [A8_NLSPipe]>], [29, 1]>,
+
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>],
+ [2, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_fpMOVID, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>],
+ [2, 1, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>],
+ [20, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>],
+ [20, 20, 1]>,
+
+ //
+ // Single-precision FP Load
+ InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>],
+ [2, 1]>,
+ //
+ // Double-precision FP Load
+ InstrItinData<IIC_fpLoad64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>],
+ [2, 1]>,
+ //
+ // FP Load Multiple
+ // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+ InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>],
+ [1, 1, 1, 2], [], -1>, // dynamic uops
+ //
+ // FP Load Multiple + update
+ InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>],
+ [2, 1, 1, 1, 2], [], -1>, // dynamic uops
+ //
+ // Single-precision FP Store
+ InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>],
+ [1, 1]>,
+ //
+ // Double-precision FP Store
+ InstrItinData<IIC_fpStore64,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>],
+ [1, 1]>,
+ //
+ // FP Store Multiple
+ InstrItinData<IIC_fpStore_m,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>],
+ [1, 1, 1, 1], [], -1>, // dynamic uops
+ //
+ // FP Store Multiple + update
+ InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>,
+ InstrStage<1, [A8_NLSPipe], 0>,
+ InstrStage<1, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1], [], -1>, // dynamic uops
+ // NEON
+ // Issue through integer pipeline, and execute in NEON unit.
+ //
+ // VLD1
+ InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 1]>,
+ // VLD1x2
+ InstrItinData<IIC_VLD1x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 2, 1]>,
+ //
+ // VLD1x3
+ InstrItinData<IIC_VLD1x3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 1]>,
+ //
+ // VLD1x4
+ InstrItinData<IIC_VLD1x4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 3, 1]>,
+ //
+ // VLD1u
+ InstrItinData<IIC_VLD1u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 2, 1]>,
+ //
+ // VLD1x2u
+ InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 2, 2, 1]>,
+ //
+ // VLD1x3u
+ InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 2, 1]>,
+ //
+ // VLD1x4u
+ InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 3, 2, 1]>,
+ //
+ // VLD1ln
+ InstrItinData<IIC_VLD1ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [3, 1, 1, 1]>,
+ //
+ // VLD1lnu
+ InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [3, 2, 1, 1, 1, 1]>,
+ //
+ // VLD1dup
+ InstrItinData<IIC_VLD1dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 1]>,
+ //
+ // VLD1dupu
+ InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 2, 1, 1]>,
+ //
+ // VLD2
+ InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 2, 1]>,
+ //
+ // VLD2x2
+ InstrItinData<IIC_VLD2x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 3, 1]>,
+ //
+ // VLD2ln
+ InstrItinData<IIC_VLD2ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [3, 3, 1, 1, 1, 1]>,
+ //
+ // VLD2u
+ InstrItinData<IIC_VLD2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 2, 2, 1, 1, 1]>,
+ //
+ // VLD2x2u
+ InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 3, 2, 1]>,
+ //
+ // VLD2lnu
+ InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [3, 3, 2, 1, 1, 1, 1, 1]>,
+ //
+ // VLD2dup
+ InstrItinData<IIC_VLD2dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 2, 1]>,
+ //
+ // VLD2dupu
+ InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 2, 2, 1, 1]>,
+ //
+ // VLD3
+ InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [3, 3, 4, 1]>,
+ //
+ // VLD3ln
+ InstrItinData<IIC_VLD3ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<5, [A8_NLSPipe], 0>,
+ InstrStage<5, [A8_LSPipe]>],
+ [4, 4, 5, 1, 1, 1, 1, 2]>,
+ //
+ // VLD3u
+ InstrItinData<IIC_VLD3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [3, 3, 4, 2, 1]>,
+ //
+ // VLD3lnu
+ InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<5, [A8_NLSPipe], 0>,
+ InstrStage<5, [A8_LSPipe]>],
+ [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>,
+ //
+ // VLD3dup
+ InstrItinData<IIC_VLD3dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 1]>,
+ //
+ // VLD3dupu
+ InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 2, 1, 1]>,
+ //
+ // VLD4
+ InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [3, 3, 4, 4, 1]>,
+ //
+ // VLD4ln
+ InstrItinData<IIC_VLD4ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<5, [A8_NLSPipe], 0>,
+ InstrStage<5, [A8_LSPipe]>],
+ [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VLD4u
+ InstrItinData<IIC_VLD4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [3, 3, 4, 4, 2, 1]>,
+ //
+ // VLD4lnu
+ InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<5, [A8_NLSPipe], 0>,
+ InstrStage<5, [A8_LSPipe]>],
+ [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VLD4dup
+ InstrItinData<IIC_VLD4dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 3, 1]>,
+ //
+ // VLD4dupu
+ InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 2, 3, 3, 2, 1, 1]>,
+ //
+ // VST1
+ InstrItinData<IIC_VST1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [1, 1, 1]>,
+ //
+ // VST1x2
+ InstrItinData<IIC_VST1x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [1, 1, 1, 1]>,
+ //
+ // VST1x3
+ InstrItinData<IIC_VST1x3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [1, 1, 1, 1, 2]>,
+ //
+ // VST1x4
+ InstrItinData<IIC_VST1x4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [1, 1, 1, 1, 2, 2]>,
+ //
+ // VST1u
+ InstrItinData<IIC_VST1u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1]>,
+ //
+ // VST1x2u
+ InstrItinData<IIC_VST1x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1]>,
+ //
+ // VST1x3u
+ InstrItinData<IIC_VST1x3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1, 2]>,
+ //
+ // VST1x4u
+ InstrItinData<IIC_VST1x4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VST1ln
+ InstrItinData<IIC_VST1ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [1, 1, 1]>,
+ //
+ // VST1lnu
+ InstrItinData<IIC_VST1lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1]>,
+ //
+ // VST2
+ InstrItinData<IIC_VST2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [1, 1, 1, 1]>,
+ //
+ // VST2x2
+ InstrItinData<IIC_VST2x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [1, 1, 1, 1, 2, 2]>,
+ //
+ // VST2u
+ InstrItinData<IIC_VST2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1]>,
+ //
+ // VST2x2u
+ InstrItinData<IIC_VST2x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VST2ln
+ InstrItinData<IIC_VST2ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [1, 1, 1, 1]>,
+ //
+ // VST2lnu
+ InstrItinData<IIC_VST2lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<2, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1]>,
+ //
+ // VST3
+ InstrItinData<IIC_VST3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [1, 1, 1, 1, 2]>,
+ //
+ // VST3u
+ InstrItinData<IIC_VST3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1, 2]>,
+ //
+ // VST3ln
+ InstrItinData<IIC_VST3ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [1, 1, 1, 1, 2]>,
+ //
+ // VST3lnu
+ InstrItinData<IIC_VST3lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<3, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1, 2]>,
+ //
+ // VST4
+ InstrItinData<IIC_VST4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [1, 1, 1, 1, 2, 2]>,
+ //
+ // VST4u
+ InstrItinData<IIC_VST4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VST4ln
+ InstrItinData<IIC_VST4ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [1, 1, 1, 1, 2, 2]>,
+ //
+ // VST4lnu
+ InstrItinData<IIC_VST4lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<4, [A8_NLSPipe], 0>,
+ InstrStage<4, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1, 1, 2, 2]>,
+ //
+ // Double-register FP Unary
+ InstrItinData<IIC_VUNAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [5, 2]>,
+ //
+ // Quad-register FP Unary
+ // Result written in N5, but that is relative to the last cycle of multicycle,
+ // so we use 6 for those cases
+ InstrItinData<IIC_VUNAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [6, 2]>,
+ //
+ // Double-register FP Binary
+ InstrItinData<IIC_VBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+ //
+ // VPADD, etc.
+ InstrItinData<IIC_VPBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+ //
+ // Double-register FP VMUL
+ InstrItinData<IIC_VFMULD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [5, 2, 1]>,
+
+ //
+ // Quad-register FP Binary
+ // Result written in N5, but that is relative to the last cycle of multicycle,
+ // so we use 6 for those cases
+ InstrItinData<IIC_VBINQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [6, 2, 2]>,
+ //
+ // Quad-register FP VMUL
+ InstrItinData<IIC_VFMULQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [6, 2, 1]>,
+ //
+ // Move
+ InstrItinData<IIC_VMOV, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [1, 1]>,
+ //
+ // Move Immediate
+ InstrItinData<IIC_VMOVImm, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [3]>,
+ //
+ // Double-register Permute Move
+ InstrItinData<IIC_VMOVD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
+ //
+ // Quad-register Permute Move
+ // Result written in N2, but that is relative to the last cycle of multicycle,
+ // so we use 3 for those cases
+ InstrItinData<IIC_VMOVQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [3, 1]>,
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_VMOVIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_VMOVID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_VMOVSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>], [20, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_VMOVDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>], [20, 20, 1]>,
+ //
+ // Integer to Lane Move
+ InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
+ //
+ // Vector narrow move
+ InstrItinData<IIC_VMOVN , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [2, 1]>,
+ //
+ // Double-register Permute
+ InstrItinData<IIC_VPERMD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>], [2, 2, 1, 1]>,
+ //
+ // Quad-register Permute
+ // Result written in N2, but that is relative to the last cycle of multicycle,
+ // so we use 3 for those cases
+ InstrItinData<IIC_VPERMQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [3, 3, 1, 1]>,
+ //
+ // Quad-register Permute (3 cycle issue)
+ // Result written in N2, but that is relative to the last cycle of multicycle,
+ // so we use 4 for those cases
+ InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>,
+ InstrStage<1, [A8_NPipe], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [4, 4, 1, 1]>,
+ //
+ // Double-register FP Multiply-Accumulate
+ InstrItinData<IIC_VMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+ //
+ // Quad-register FP Multiply-Accumulate
+ // Result written in N9, but that is relative to the last cycle of multicycle,
+ // so we use 10 for those cases
+ InstrItinData<IIC_VMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+ //
+ // Double-register Fused FP Multiply-Accumulate
+ InstrItinData<IIC_VFMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+ //
+ // Quad-register Fused FP Multiply-Accumulate
+ // Result written in N9, but that is relative to the last cycle of multicycle,
+ // so we use 10 for those cases
+ InstrItinData<IIC_VFMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+ //
+ // Double-register Reciprocal Step
+ InstrItinData<IIC_VRECSD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
+ //
+ // Quad-register Reciprocal Step
+ InstrItinData<IIC_VRECSQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [10, 2, 2]>,
+ //
+ // Double-register Integer Count
+ InstrItinData<IIC_VCNTiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+ //
+ // Quad-register Integer Count
+ // Result written in N3, but that is relative to the last cycle of multicycle,
+ // so we use 4 for those cases
+ InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [4, 2, 2]>,
+ //
+ // Double-register Integer Unary
+ InstrItinData<IIC_VUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [4, 2]>,
+ //
+ // Quad-register Integer Unary
+ InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [4, 2]>,
+ //
+ // Double-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [4, 1]>,
+ //
+ // Quad-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [4, 1]>,
+ //
+ // Double-register Integer Binary
+ InstrItinData<IIC_VBINiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+ //
+ // Quad-register Integer Binary
+ InstrItinData<IIC_VBINiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+ //
+ // Double-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+ //
+ // Quad-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+
+ //
+ // Double-register Integer Subtract
+ InstrItinData<IIC_VSUBiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+ //
+ // Quad-register Integer Subtract
+ InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+ //
+ // Double-register Integer Subtract (4 cycle)
+ InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+ //
+ // Quad-register Integer Subtract (4 cycle)
+ InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+ //
+ // Double-register Integer Shift
+ InstrItinData<IIC_VSHLiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [3, 1, 1]>,
+ //
+ // Quad-register Integer Shift
+ InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [4, 1, 1]>,
+ //
+ // Double-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [4, 1, 1]>,
+ //
+ // Quad-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [5, 1, 1]>,
+ //
+ // Double-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [6, 3, 1]>,
+ //
+ // Quad-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [7, 3, 1]>,
+ //
+ // Double-register Absolute Difference and Accumulate
+ InstrItinData<IIC_VABAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [6, 3, 2, 1]>,
+ //
+ // Quad-register Absolute Difference and Accumulate
+ InstrItinData<IIC_VABAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [6, 3, 2, 1]>,
+
+ //
+ // Double-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [6, 2, 2]>,
+ //
+ // Double-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [7, 2, 1]>,
+ //
+ // Quad-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [7, 2, 2]>,
+ //
+ // Quad-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_NPipe]>], [9, 2, 1]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [6, 3, 2, 2]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [7, 3, 2, 1]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [7, 3, 2, 2]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>,
+ InstrStage<2, [A8_NLSPipe], 0>,
+ InstrStage<3, [A8_NPipe]>], [9, 3, 2, 1]>,
+ //
+ // Double-register VEXT
+ InstrItinData<IIC_VEXTD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
+ //
+ // Quad-register VEXT
+ InstrItinData<IIC_VEXTQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
+ //
+ // VTB
+ InstrItinData<IIC_VTB1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [3, 2, 1]>,
+ InstrItinData<IIC_VTB2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [3, 2, 2, 1]>,
+ InstrItinData<IIC_VTB3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>,
+ InstrStage<1, [A8_NPipe], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 1]>,
+ InstrItinData<IIC_VTB4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>,
+ InstrStage<1, [A8_NPipe], 0>,
+ InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>,
+ //
+ // VTBX
+ InstrItinData<IIC_VTBX1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 1]>,
+ InstrItinData<IIC_VTBX2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 2, 1]>,
+ InstrItinData<IIC_VTBX3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>,
+ InstrStage<1, [A8_NPipe], 0>,
+ InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>,
+ InstrItinData<IIC_VTBX4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NLSPipe]>,
+ InstrStage<1, [A8_NPipe], 0>,
+ InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simple machine model which
+// will replace itineraries.
+
+// Cortex-A8 machine model for scheduling and other instruction cost heuristics.
+def CortexA8Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let LoadLatency = 2; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+ let CompleteModel = 0; // This model is not marked as complete.
+
+ let Itineraries = CortexA8Itineraries; // Attach the Cortex-A8 itinerary data.
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
new file mode 100644
index 000000000000..519e595bd184
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -0,0 +1,2529 @@
+//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A9 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+//
+// Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
+// Reference Manual".
+//
+// Functional units referenced by the Cortex-A9 itinerary stages below
+def A9_Issue0 : FuncUnit; // Issue unit 0 (first stage of the itineraries)
+def A9_Issue1 : FuncUnit; // Issue unit 1 (first stage of the itineraries)
+def A9_Branch : FuncUnit; // Branch unit
+def A9_ALU0 : FuncUnit; // ALU / MUL pipeline 0 (multiplies use only this)
+def A9_ALU1 : FuncUnit; // ALU pipeline 1
+def A9_AGU : FuncUnit; // Address generation unit for ld / st
+def A9_NPipe : FuncUnit; // NEON pipeline (VFP ops also execute here)
+def A9_MUX0 : FuncUnit; // AGU + NEON/FPU multiplexer
+def A9_LSUnit : FuncUnit; // Load/store unit
+def A9_DRegsVFP: FuncUnit; // Artificial FU: FP register set, VFP side
+def A9_DRegsN : FuncUnit; // Artificial FU: FP register set, NEON side
+
+// Bypasses (named in the per-operand bypass lists of the itineraries)
+def A9_LdBypass : Bypass;
+
+def CortexA9Itineraries : ProcessorItineraries<
+ [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0,
+ A9_LSUnit, A9_DRegsVFP, A9_DRegsN],
+ [A9_LdBypass], [
+ // Two fully-pipelined integer ALU pipelines
+
+ //
+ // Move instructions, unconditional
+ InstrItinData<IIC_iMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+ InstrItinData<IIC_iMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+ InstrItinData<IIC_iMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+ InstrItinData<IIC_iMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+ InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+ InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>,
+ InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>], [5]>,
+ //
+ // MVN instructions
+ InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>],
+ [1, 1], [NoBypass, A9_LdBypass]>,
+ InstrItinData<IIC_iMVNsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>],
+ [2, 1]>,
+ InstrItinData<IIC_iMVNsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<3, [A9_ALU0, A9_ALU1]>],
+ [3, 1, 1]>,
+ //
+ // No operand cycles
+ InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>]>,
+ //
+ // Binary Instructions that produce a result
+ InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>],
+ [1, 1], [NoBypass, A9_LdBypass]>,
+ InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>],
+ [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
+ InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>],
+ [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
+ InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>],
+ [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
+ InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<3, [A9_ALU0, A9_ALU1]>],
+ [3, 1, 1, 1],
+ [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
+ //
+ // Bitwise Instructions that produce a result
+ InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+ InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
+ InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+ InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
+ //
+ // Unary Instructions that produce a result
+
+ // CLZ, RBIT, etc.
+ InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+
+ // BFC, BFI, UBFX, SBFX
+ InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+
+ //
+ // Zero and sign extension instructions
+ InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+ InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>,
+ InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
+ //
+ // Compare instructions
+ InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>],
+ [1], [A9_LdBypass]>,
+ InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>],
+ [1, 1], [A9_LdBypass, A9_LdBypass]>,
+ InstrItinData<IIC_iCMPsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>],
+ [1, 1], [A9_LdBypass, NoBypass]>,
+ InstrItinData<IIC_iCMPsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<3, [A9_ALU0, A9_ALU1]>],
+ [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
+ //
+ // Test instructions
+ InstrItinData<IIC_iTSTi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+ InstrItinData<IIC_iTSTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+ InstrItinData<IIC_iTSTsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+ InstrItinData<IIC_iTSTsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
+ //
+ // Move instructions, conditional
+ // FIXME: Correctly model the extra input dep on the destination.
+ InstrItinData<IIC_iCMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+ InstrItinData<IIC_iCMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+ InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+ InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+ InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>,
+ InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+
+ // Integer multiply pipeline
+ //
+ InstrItinData<IIC_iMUL16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0]>], [3, 1, 1]>,
+ InstrItinData<IIC_iMAC16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0]>], [4, 1, 1]>,
+ InstrItinData<IIC_iMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<2, [A9_ALU0]>],
+ [4, 1, 1, 1]>,
+ InstrItinData<IIC_iMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>,
+ InstrItinData<IIC_iMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<3, [A9_ALU0]>],
+ [4, 5, 1, 1]>,
+ // Integer load pipeline
+ // FIXME: The timings are some rough approximations
+ //
+ // Immediate offset
+ InstrItinData<IIC_iLoad_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 1], [A9_LdBypass]>,
+ InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [4, 1], [A9_LdBypass]>,
+ // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+ InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 3, 1], [A9_LdBypass]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iLoad_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 1, 1], [A9_LdBypass]>,
+ InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [4, 1, 1], [A9_LdBypass]>,
+ InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 3, 1, 1], [A9_LdBypass]>,
+ //
+ // Scaled register offset
+ InstrItinData<IIC_iLoad_si , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit], 0>],
+ [4, 1, 1], [A9_LdBypass]>,
+ InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [5, 1, 1], [A9_LdBypass]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 2, 1], [A9_LdBypass]>,
+ InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [4, 3, 1], [A9_LdBypass]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 2, 1, 1], [A9_LdBypass]>,
+ InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [4, 3, 1, 1], [A9_LdBypass]>,
+ InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 3, 1, 1], [A9_LdBypass]>,
+ //
+ // Scaled register offset with update
+ InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [4, 3, 1, 1], [A9_LdBypass]>,
+ InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [5, 4, 1, 1], [A9_LdBypass]>,
+ //
+ // Load multiple, def is the 5th operand.
+ // FIXME: This assumes 3 to 4 registers.
+ InstrItinData<IIC_iLoad_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1, 3],
+ [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
+ //
+ // Load multiple + update, defs are the 1st and 5th operands.
+ InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1, 3],
+ [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
+ //
+ // Load multiple plus branch
+ InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 1>,
+ InstrStage<2, [A9_LSUnit]>,
+ InstrStage<1, [A9_Branch]>],
+ [1, 2, 1, 1, 3],
+ [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
+ //
+ // Pop, def is the 3rd operand.
+ InstrItinData<IIC_iPop , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 3],
+ [NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
+ //
+ // Pop + branch, def is the 3rd operand.
+ InstrItinData<IIC_iPop_Br, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<2, [A9_LSUnit]>,
+ InstrStage<1, [A9_Branch]>],
+ [1, 1, 3],
+ [NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
+ //
+ // iLoadi + iALUr for t2LDRpci_pic.
+ InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>,
+ InstrStage<1, [A9_ALU0, A9_ALU1]>],
+ [2, 1]>,
+
+ // Integer store pipeline
+ ///
+ // Immediate offset
+ InstrItinData<IIC_iStore_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+ InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+ // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+ InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iStore_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+ InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+ //
+ // Scaled register offset
+ InstrItinData<IIC_iStore_si , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iStore_iu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iStore_ru , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 1, 1, 1]>,
+ //
+ // Scaled register offset with update
+ InstrItinData<IIC_iStore_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_AGU], 1>,
+ InstrStage<1, [A9_LSUnit]>],
+ [3, 1, 1, 1]>,
+ //
+ // Store multiple
+ InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [], [], -1>, // dynamic uops
+ //
+ // Store multiple + update
+ InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_AGU], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2], [], -1>, // dynamic uops
+ //
+ // Preload
+ InstrItinData<IIC_Preload, [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
+
+ // Branch
+ //
+ // no delay slots, so the latency of a branch is unimportant
+ InstrItinData<IIC_Br , [InstrStage<1, [A9_Issue0], 0>,
+ InstrStage<1, [A9_Issue1], 0>,
+ InstrStage<1, [A9_Branch]>]>,
+
+ // VFP and NEON share the same register file. This means that every VFP
+ // instruction should wait for full completion of the consecutive NEON
+ // instruction and vice-versa. We model this behavior with two artificial FUs:
+ // DRegsVFP and DRegsN.
+ //
+ // Every VFP instruction:
+ // - Acquires DRegsVFP resource for 1 cycle
+ // - Reserves DRegsN resource for the whole duration (including time to
+ // register file writeback!).
+ // Every NEON instruction does the same but with FUs swapped.
+ //
+ // Since the reserved FU cannot be acquired, this models precisely
+ // "cross-domain" stalls.
+
+ // VFP
+ // Issue through integer pipeline, and execute in NEON unit.
+
+ // FP Special Register to Integer Register File Move
+ InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<2, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1]>,
+ //
+ // Single-precision FP Unary
+ InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ // Extra latency cycles since wbck is 2 cycles
+ InstrStage<3, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1, 1]>,
+ //
+ // Double-precision FP Unary
+ InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ // Extra latency cycles since wbck is 2 cycles
+ InstrStage<3, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1, 1]>,
+
+ //
+ // Single-precision FP Compare
+ InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ // Extra latency cycles since wbck is 4 cycles
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1, 1]>,
+ //
+ // Double-precision FP Compare
+ InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ // Extra latency cycles since wbck is 4 cycles
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1, 1]>,
+ //
+ // Single to Double FP Convert
+ InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1]>,
+ //
+ // Double to Single FP Convert
+ InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1]>,
+
+ //
+ // Single to Half FP Convert
+ InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1]>,
+ //
+ // Half to Single FP Convert
+ InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<3, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [2, 1]>,
+
+ //
+ // Single-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1]>,
+ //
+ // Double-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1]>,
+ //
+ // Integer to Single-Precision FP Convert
+ InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1]>,
+ //
+ // Integer to Double-Precision FP Convert
+ InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1]>,
+ //
+ // Single-precision FP ALU
+ InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1, 1]>,
+ //
+ // Double-precision FP ALU
+ InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<5, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1, 1]>,
+ //
+ // Single-precision FP Multiply
+ InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<6, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [5, 1, 1]>,
+ //
+ // Double-precision FP Multiply
+ InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<7, [A9_DRegsN], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [6, 1, 1]>,
+ //
+ // Single-precision FP MAC
+ InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<9, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [8, 1, 1, 1]>,
+ //
+ // Double-precision FP MAC
+ InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<10, [A9_DRegsN], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [9, 1, 1, 1]>,
+ //
+ // Single-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<9, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [8, 1, 1, 1]>,
+ //
+ // Double-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<10, [A9_DRegsN], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [9, 1, 1, 1]>,
+ //
+ // Single-precision FP DIV
+ InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<16, [A9_DRegsN], 0, Reserved>,
+ InstrStage<10, [A9_NPipe]>],
+ [15, 1, 1]>,
+ //
+ // Double-precision FP DIV
+ InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<26, [A9_DRegsN], 0, Reserved>,
+ InstrStage<20, [A9_NPipe]>],
+ [25, 1, 1]>,
+ //
+ // Single-precision FP SQRT
+ InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<18, [A9_DRegsN], 0, Reserved>,
+ InstrStage<13, [A9_NPipe]>],
+ [17, 1]>,
+ //
+ // Double-precision FP SQRT
+ InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<33, [A9_DRegsN], 0, Reserved>,
+ InstrStage<28, [A9_NPipe]>],
+ [32, 1]>,
+
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ // Extra 1 latency cycle since wbck is 2 cycles
+ InstrStage<3, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ // Extra 1 latency cycle since wbck is 2 cycles
+ InstrStage<3, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1, 1, 1]>,
+ //
+ // Single-precision to Integer Move
+ //
+ // On A9 move-from-VFP is free to issue with no stall if other VFP
+ // operations are in flight. I assume it still can't dual-issue though.
+ InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>],
+ [2, 1]>,
+ //
+ // Double-precision to Integer Move
+ //
+ // On A9 move-from-VFP is free to issue with no stall if other VFP
+ // operations are in flight. I assume it still can't dual-issue though.
+ InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>],
+ [2, 1, 1]>,
+ //
+ // Single-precision FP Load
+ InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<2, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1]>,
+ //
+ // Double-precision FP Load
+ // FIXME: Result latency is 1 if address is 64-bit aligned.
+ InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<2, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 1]>,
+ //
+ // FP Load Multiple
+ // FIXME: assumes 2 doubles which requires 2 LS cycles.
+ InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<2, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1], [], -1>, // dynamic uops
+ //
+ // FP Load Multiple + update
+ // FIXME: assumes 2 doubles which requires 2 LS cycles.
+ InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<2, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1], [], -1>, // dynamic uops
+ //
+ // Single-precision FP Store
+ InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<2, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1]>,
+ //
+ // Double-precision FP Store
+ InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<2, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1]>,
+ //
+ // FP Store Multiple
+ // FIXME: assumes 2 doubles which requires 2 LS cycles.
+ InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<2, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1], [], -1>, // dynamic uops
+ //
+ // FP Store Multiple + update
+ // FIXME: assumes 2 doubles which requires 2 LS cycles.
+ InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<2, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1], [], -1>, // dynamic uops
+ // NEON
+ // VLD1
+ InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1]>,
+ // VLD1x2
+ InstrItinData<IIC_VLD1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1, 1]>,
+ // VLD1x3
+ InstrItinData<IIC_VLD1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 2, 1]>,
+ // VLD1x4
+ InstrItinData<IIC_VLD1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 2, 2, 1]>,
+ // VLD1u
+ InstrItinData<IIC_VLD1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 2, 1]>,
+ // VLD1x2u
+ InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1, 2, 1]>,
+ // VLD1x3u
+ InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 2, 2, 1]>,
+ // VLD1x4u
+ InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 2, 2, 2, 1]>,
+ //
+ // VLD1ln
+ InstrItinData<IIC_VLD1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [3, 1, 1, 1]>,
+ //
+ // VLD1lnu
+ InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [3, 2, 1, 1, 1, 1]>,
+ //
+ // VLD1dup
+ InstrItinData<IIC_VLD1dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 1]>,
+ //
+ // VLD1dupu
+ InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 2, 1, 1]>,
+ //
+ // VLD2
+ InstrItinData<IIC_VLD2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 2, 1]>,
+ //
+ // VLD2x2
+ InstrItinData<IIC_VLD2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 3, 2, 3, 1]>,
+ //
+ // VLD2ln
+ InstrItinData<IIC_VLD2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [3, 3, 1, 1, 1, 1]>,
+ //
+ // VLD2u
+ InstrItinData<IIC_VLD2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 2, 2, 1, 1, 1]>,
+ //
+ // VLD2x2u
+ InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 3, 2, 3, 2, 1]>,
+ //
+ // VLD2lnu
+ InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [3, 3, 2, 1, 1, 1, 1, 1]>,
+ //
+ // VLD2dup
+ InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 2, 1]>,
+ //
+ // VLD2dupu
+ InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 2, 2, 1, 1]>,
+ //
+ // VLD3
+ InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [3, 3, 4, 1]>,
+ //
+ // VLD3ln
+ InstrItinData<IIC_VLD3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+ InstrStage<5, [A9_NPipe], 0>,
+ InstrStage<5, [A9_LSUnit]>],
+ [5, 5, 6, 1, 1, 1, 1, 2]>,
+ //
+ // VLD3u
+ InstrItinData<IIC_VLD3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [3, 3, 4, 2, 1]>,
+ //
+ // VLD3lnu
+ InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+ InstrStage<5, [A9_NPipe], 0>,
+ InstrStage<5, [A9_LSUnit]>],
+ [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>,
+ //
+ // VLD3dup
+ InstrItinData<IIC_VLD3dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [3, 3, 4, 1]>,
+ //
+ // VLD3dupu
+ InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [3, 3, 4, 2, 1, 1]>,
+ //
+ // VLD4
+ InstrItinData<IIC_VLD4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [3, 3, 4, 4, 1]>,
+ //
+ // VLD4ln
+ InstrItinData<IIC_VLD4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+ InstrStage<4, [A9_NPipe], 0>,
+ InstrStage<4, [A9_LSUnit]>],
+ [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VLD4u
+ InstrItinData<IIC_VLD4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [3, 3, 4, 4, 2, 1]>,
+ //
+ // VLD4lnu
+ InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+ InstrStage<4, [A9_NPipe], 0>,
+ InstrStage<4, [A9_LSUnit]>],
+ [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VLD4dup
+ InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 2, 3, 3, 1]>,
+ //
+ // VLD4dupu
+ InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 2, 3, 3, 2, 1, 1]>,
+ //
+ // VST1
+ InstrItinData<IIC_VST1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1, 1]>,
+ //
+ // VST1x2
+ InstrItinData<IIC_VST1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1, 1, 1]>,
+ //
+ // VST1x3
+ InstrItinData<IIC_VST1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1, 2]>,
+ //
+ // VST1x4
+ InstrItinData<IIC_VST1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1, 2, 2]>,
+ //
+ // VST1u
+ InstrItinData<IIC_VST1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1]>,
+ //
+ // VST1x2u
+ InstrItinData<IIC_VST1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1]>,
+ //
+ // VST1x3u
+ InstrItinData<IIC_VST1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1, 2]>,
+ //
+ // VST1x4u
+ InstrItinData<IIC_VST1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VST1ln
+ InstrItinData<IIC_VST1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1, 1]>,
+ //
+ // VST1lnu
+ InstrItinData<IIC_VST1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1]>,
+ //
+ // VST2
+ InstrItinData<IIC_VST2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1, 1, 1]>,
+ //
+ // VST2x2
+ InstrItinData<IIC_VST2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [1, 1, 1, 1, 2, 2]>,
+ //
+ // VST2u
+ InstrItinData<IIC_VST2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1]>,
+ //
+ // VST2x2u
+ InstrItinData<IIC_VST2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VST2ln
+ InstrItinData<IIC_VST2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [1, 1, 1, 1]>,
+ //
+ // VST2lnu
+ InstrItinData<IIC_VST2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe], 0>,
+ InstrStage<1, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1]>,
+ //
+ // VST3
+ InstrItinData<IIC_VST3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1, 2]>,
+ //
+ // VST3u
+ InstrItinData<IIC_VST3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1, 2]>,
+ //
+ // VST3ln
+ InstrItinData<IIC_VST3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [1, 1, 1, 1, 2]>,
+ //
+ // VST3lnu
+ InstrItinData<IIC_VST3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe], 0>,
+ InstrStage<3, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1, 2]>,
+ //
+ // VST4
+ InstrItinData<IIC_VST4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1, 2, 2]>,
+ //
+ // VST4u
+ InstrItinData<IIC_VST4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1, 2, 2]>,
+ //
+ // VST4ln
+ InstrItinData<IIC_VST4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1, 2, 2]>,
+ //
+ // VST4lnu
+ InstrItinData<IIC_VST4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe], 0>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1, 1, 1, 2, 2]>,
+
+ //
+ // Double-register Integer Unary
+ InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 2]>,
+ //
+ // Quad-register Integer Unary
+ InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 2]>,
+ //
+ // Double-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1]>,
+ //
+ // Quad-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1]>,
+ //
+ // Double-register Integer Binary
+ InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [3, 2, 2]>,
+ //
+ // Quad-register Integer Binary
+ InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [3, 2, 2]>,
+ //
+ // Double-register Integer Subtract
+ InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [3, 2, 1]>,
+ //
+ // Quad-register Integer Subtract
+ InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [3, 2, 1]>,
+ //
+ // Double-register Integer Shift
+ InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [3, 1, 1]>,
+ //
+ // Quad-register Integer Shift
+ InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [3, 1, 1]>,
+ //
+ // Double-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 1, 1]>,
+ //
+ // Double-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 2, 2]>,
+ //
+ // Quad-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 2, 2]>,
+ //
+ // Double-register Integer Subtract (4 cycle)
+ InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 2, 1]>,
+ //
+ // Quad-register Integer Subtract (4 cycle)
+ InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [4, 2, 1]>,
+
+ //
+ // Double-register Integer Count
+ InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [3, 2, 2]>,
+ //
+ // Quad-register Integer Count
+ // Result written in N3, but that is relative to the last cycle of a
+ // multicycle instruction, so we use 4 for those cases
+ InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [4, 2, 2]>,
+ //
+ // Double-register Absolute Difference and Accumulate
+ InstrItinData<IIC_VABAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [6, 3, 2, 1]>,
+ //
+ // Quad-register Absolute Difference and Accumulate
+ InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [6, 3, 2, 1]>,
+ //
+ // Double-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [6, 3, 1]>,
+ //
+ // Quad-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [6, 3, 1]>,
+
+ //
+ // Double-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [6, 2, 2]>,
+ //
+ // Quad-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [7, 2, 2]>,
+
+ //
+ // Double-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [7, 2, 1]>,
+ //
+ // Quad-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 9 cycles
+ InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<4, [A9_NPipe]>],
+ [9, 2, 1]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [6, 3, 2, 2]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [7, 3, 2, 1]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [7, 3, 2, 2]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 9 cycles
+ InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<4, [A9_NPipe]>],
+ [9, 3, 2, 1]>,
+
+ //
+ // Move
+ InstrItinData<IIC_VMOV, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1,1]>,
+ //
+ // Move Immediate
+ InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [3]>,
+ //
+ // Double-register Permute Move
+ InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [2, 1]>,
+ //
+ // Quad-register Permute Move
+ InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [2, 1]>,
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [1, 1, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [2, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [2, 2, 1]>,
+ //
+ // Integer to Lane Move
+ InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [3, 1, 1]>,
+
+ //
+ // Vector narrow move
+ InstrItinData<IIC_VMOVN, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [3, 1]>,
+ //
+ // Double-register FP Unary
+ InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [5, 2]>,
+ //
+ // Quad-register FP Unary
+ // Result written in N5, but that is relative to the last cycle of multicycle,
+ // so we use 6 for those cases
+ InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [6, 2]>,
+ //
+ // Double-register FP Binary
+ // FIXME: We're using this itin for many instructions and [2, 2] here is too
+ // optimistic.
+ InstrItinData<IIC_VBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [5, 2, 2]>,
+
+ //
+ // VPADD, etc.
+ InstrItinData<IIC_VPBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [5, 1, 1]>,
+ //
+ // Double-register FP VMUL
+ InstrItinData<IIC_VFMULD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [5, 2, 1]>,
+ //
+ // Quad-register FP Binary
+ // Result written in N5, but that is relative to the last cycle of multicycle,
+ // so we use 6 for those cases
+ // FIXME: We're using this itin for many instructions and [2, 2] here is too
+ // optimistic.
+ InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [6, 2, 2]>,
+ //
+ // Quad-register FP VMUL
+ InstrItinData<IIC_VFMULQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [6, 2, 1]>,
+ //
+ // Double-register FP Multiply-Accumulate
+ InstrItinData<IIC_VMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [6, 3, 2, 1]>,
+ //
+ // Quad-register FP Multiply-Accumulate
+ // Result written in N9, but that is relative to the last cycle of multicycle,
+ // so we use 10 for those cases
+ InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 9 cycles
+ InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<4, [A9_NPipe]>],
+ [8, 4, 2, 1]>,
+ //
+ // Double-register Fused FP Multiply-Accumulate
+ InstrItinData<IIC_VFMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [6, 3, 2, 1]>,
+ //
+ // Quad-register Fused FP Multiply-Accumulate
+ // Result written in N9, but that is relative to the last cycle of multicycle,
+ // so we use 10 for those cases
+ InstrItinData<IIC_VFMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 9 cycles
+ InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<4, [A9_NPipe]>],
+ [8, 4, 2, 1]>,
+ //
+ // Double-register Reciprocal Step
+ InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 10 cycles
+ InstrStage<11, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [9, 2, 2]>,
+ //
+ // Quad-register Reciprocal Step
+ InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 11 cycles
+ InstrStage<12, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [10, 2, 2]>,
+ //
+ // Double-register Permute
+ InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [2, 2, 1, 1]>,
+ //
+ // Quad-register Permute
+ // Result written in N2, but that is relative to the last cycle of multicycle,
+ // so we use 3 for those cases
+ InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [3, 3, 1, 1]>,
+ //
+ // Quad-register Permute (3 cycle issue)
+ // Result written in N2, but that is relative to the last cycle of multicycle,
+ // so we use 4 for those cases
+ InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 8 cycles
+ InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe]>],
+ [4, 4, 1, 1]>,
+
+ //
+ // Double-register VEXT
+ InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 6 cycles
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register VEXT
+ InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [3, 1, 2]>,
+ //
+ // VTB
+ InstrItinData<IIC_VTB1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [3, 2, 1]>,
+ InstrItinData<IIC_VTB2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [3, 2, 2, 1]>,
+ InstrItinData<IIC_VTB3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<2, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 8 cycles
+ InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe]>],
+ [4, 2, 2, 3, 1]>,
+ InstrItinData<IIC_VTB4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 8 cycles
+ InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe]>],
+ [4, 2, 2, 3, 3, 1]>,
+ //
+ // VTBX
+ InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [3, 1, 2, 1]>,
+ InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [3, 1, 2, 2, 1]>,
+ InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 8 cycles
+ InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<3, [A9_NPipe]>],
+ [4, 1, 2, 2, 3, 1]>,
+ InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 8 cycles
+ InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler and will eventually replace itineraries.
+
+class A9WriteLMOpsListType<list<WriteSequence> writes> { // Named holder for a list of write sequences.
+ list <WriteSequence> Writes = writes; // Sliced by the LDM/VLDM variants, e.g. Writes[0-1].
+ SchedMachineModel SchedModel = ?; // Unset here. NOTE(review): presumably bound by an enclosing `let SchedModel` -- confirm.
+}
+
+// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
+def CortexA9Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MicroOpBufferSize = 56; // Based on available renamed registers.
+ let LoadLatency = 2; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 8; // Based on estimate of pipeline depth.
+
+ let Itineraries = CortexA9Itineraries; // The itinerary tables stay attached to this model.
+
+ // FIXME: Many vector operations were never given an itinerary. We
+ // haven't mapped these to the new model either, hence CompleteModel = 0.
+ let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+//
+// The AGU unit has BufferSize=1 so that the latency between operations
+// that use it are considered to stall other operations.
+//
+// The FP unit has BufferSize=0 so that it is a hard dispatch
+// hazard. No instruction may be dispatched while the unit is reserved.
+
+let SchedModel = CortexA9Model in {
+
+def A9UnitALU : ProcResource<2>; // Integer ALU: two units.
+def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; } // Multiplier: one unit, with the ALU as its Super resource.
+def A9UnitAGU : ProcResource<1> { let BufferSize = 1; } // Address generation: one unit, BufferSize = 1.
+def A9UnitLS : ProcResource<1>; // Load/store: one unit, used by the load and store write types.
+def A9UnitFP : ProcResource<1> { let BufferSize = 0; } // FP/NEON: one unit, BufferSize = 0 (hard dispatch hazard).
+def A9UnitB : ProcResource<1>; // Branch: one unit, used by A9WriteB.
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write types with their resources and latency on A9.
+
+// Consume an issue slot, but no processor resources. This is useful when all
+// other writes associated with the operand have NumMicroOps = 0.
+def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; }
+
+// Write an integer register.
+def A9WriteI : SchedWriteRes<[A9UnitALU]>;
+// Write an integer register, operand shifted by register (2-cycle latency).
+def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
+
+// Basic ALU.
+def A9WriteALU : SchedWriteRes<[A9UnitALU]>;
+// ALU with operand shifted by immediate (binds the existing WriteALUsi type).
+def : WriteRes<WriteALUsi, [A9UnitALU]> { let Latency = 2; }
+// ALU with operand shifted by register.
+def A9WriteALUsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; }
+
+// Multiplication. Each *Hi type has one more cycle of latency and no micro-op.
+def A9WriteM : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; } // NOTE(review): A9UnitMul listed twice -- presumably a 2-cycle reservation; confirm.
+def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
+ let NumMicroOps = 0; }
+def A9WriteM16 : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
+def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
+ let NumMicroOps = 0; }
+
+// Floating-point
+// Only one FP or AGU instruction may issue per cycle. We model this
+// by having every FP/NEON write below consume the AGU resource as well.
+def A9WriteF : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; } // Conversions and FP ALU ops.
+def A9WriteFMov : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; } // Moves, unary ops, compares.
+def A9WriteFMulS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; } // IIC_fpMUL32
+def A9WriteFMulD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; } // IIC_fpMUL64
+def A9WriteFMAS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; } // IIC_fpMAC32
+def A9WriteFMAD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; } // IIC_fpMAC64
+def A9WriteFDivS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; } // IIC_fpDIV32
+def A9WriteFDivD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; } // IIC_fpDIV64
+def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; } // IIC_fpSQRT32
+def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; } // IIC_fpSQRT64
+
+// NEON has an odd mix of latencies. Simply name the write types by latency (no V8 is defined).
+def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; }
+def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; }
+def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
+def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
+
+// Reserve A9UnitFP for 2 consecutive cycles; latencies 4, 7 and 9 are provided.
+def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+ let Latency = 4;
+ let ResourceCycles = [2]; // NOTE(review): one entry for two resources; presumably applies to A9UnitFP -- confirm.
+}
+def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+ let Latency = 7;
+ let ResourceCycles = [2];
+}
+def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+ let Latency = 9;
+ let ResourceCycles = [2];
+}
+
+// Branches don't have a def operand but still consume resources.
+def A9WriteB : SchedWriteRes<[A9UnitB]>;
+
+// Address generation: occupies the AGU but adds no micro-op.
+def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
+
+// Load Integer.
+def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
+// Load the upper 32 bits using the same micro-op (no resources, latency only).
+def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
+ let NumMicroOps = 0; }
+// Offset shifted by register.
+def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+// Load (and zero extend) a byte.
+def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; } // One cycle more than A9WriteLb; used for the *_bh_si classes.
+
+// Load or Store Float, aligned. Uses the LS and FP units together.
+def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; }
+
+// Store Integer.
+def A9WriteS : SchedWriteRes<[A9UnitLS]>;
+
+//===----------------------------------------------------------------------===//
+// Define resources dynamically for load multiple variants.
+
+// Define helpers for extra latency without consuming resources or micro-ops.
+def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; }
+foreach NumCycles = 2-8 in { // Defines A9WriteCycle2 .. A9WriteCycle8.
+def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>; // A9WriteCycle1 repeated NumCycles times.
+} // foreach NumCycles
+
+// Define address generation sequences and predicates for 8 flavors of LDMs.
+foreach NumAddr = 1-8 in {
+
+// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive
+// latency for instructions that generate multiple loads or stores.
+def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
+
+// Predicate selecting the LDM flavor: (number of memory addresses + 1) / 2 == NumAddr.
+def A9LMAdr#NumAddr#Pred :
+ SchedPredicate<"(TII->getNumLDMAddresses(*MI)+1)/2 == "#NumAddr>;
+
+} // foreach NumAddr
+
+// Fall-back for unknown LDMs, i.e. when the reported address count is 0.
+def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == 0">;
+
+// LDM/VLDM/VLDn address generation latency & resources.
+// Dynamically select the A9WriteAdrN sequence using the matching A9LMAdrNPred.
+def A9WriteLMAdr : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteAdr1]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteAdr2]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteAdr3]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteAdr4]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteAdr5]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteAdr6]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteAdr7]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteAdr8]>,
+ // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers (falls back to A9WriteAdr2).
+ SchedVar<A9LMUnknownPred, [A9WriteAdr2]>]>;
+
+// Define LDM Resources.
+// These take no issue resource, so they can be combined with other
+// writes like WriteB.
+// A9WriteLMLo takes a single LS resource and 2 cycles.
+def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2;
+ let NumMicroOps = 0; }
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency (no resource is listed for it).
+def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2;
+ let NumMicroOps = 0; }
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+foreach NumAddr = 1-8 in {
+def A9WriteL#NumAddr : WriteSequence< // Lower half: A9WriteLMLo plus NumAddr latency-only cycles.
+ [A9WriteLMLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+def A9WriteL#NumAddr#Hi : WriteSequence< // Upper half: same latency, no LS resource.
+ [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// LDM: Load multiple into 32-bit integer registers.
+
+def A9WriteLMOpsList : A9WriteLMOpsListType< // (Lo, Hi) pairs in order; sliced by A9WriteLM.
+ [A9WriteL1, A9WriteL1Hi, // 0-1
+ A9WriteL2, A9WriteL2Hi, // 2-3
+ A9WriteL3, A9WriteL3Hi, // 4-5
+ A9WriteL4, A9WriteL4Hi, // 6-7
+ A9WriteL5, A9WriteL5Hi, // 8-9
+ A9WriteL6, A9WriteL6Hi, // 10-11
+ A9WriteL7, A9WriteL7Hi, // 12-13
+ A9WriteL8, A9WriteL8Hi]>; // 14-15
+
+// A9WriteLM variants expand into a pair of writes for each 64-bit
+// value loaded. When the number of registers is odd, the last
+// A9WriteLnHi is naturally ignored because the instruction has no
+// following def operands. These variants take no issue resource, so
+// they may need to be part of a WriteSequence that includes A9WriteIssue.
+def A9WriteLM : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, A9WriteLMOpsList.Writes[0-1]>, // Flavor N takes the first N (Lo, Hi) pairs.
+ SchedVar<A9LMAdr2Pred, A9WriteLMOpsList.Writes[0-3]>,
+ SchedVar<A9LMAdr3Pred, A9WriteLMOpsList.Writes[0-5]>,
+ SchedVar<A9LMAdr4Pred, A9WriteLMOpsList.Writes[0-7]>,
+ SchedVar<A9LMAdr5Pred, A9WriteLMOpsList.Writes[0-9]>,
+ SchedVar<A9LMAdr6Pred, A9WriteLMOpsList.Writes[0-11]>,
+ SchedVar<A9LMAdr7Pred, A9WriteLMOpsList.Writes[0-13]>,
+ SchedVar<A9LMAdr8Pred, A9WriteLMOpsList.Writes[0-15]>,
+ // For unknown LDMs, define the maximum number of writes, but only
+ // make the first two consume resources.
+ SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3Hi, A9WriteL3Hi, // Hi types only from here on: they list no LS resource.
+ A9WriteL4Hi, A9WriteL4Hi,
+ A9WriteL5Hi, A9WriteL5Hi,
+ A9WriteL6Hi, A9WriteL6Hi,
+ A9WriteL7Hi, A9WriteL7Hi,
+ A9WriteL8Hi, A9WriteL8Hi]>]> {
+ let Variadic = 1; // The number of def operands covered varies per instruction.
+}
+
+//===----------------------------------------------------------------------===//
+// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support.
+
+// A9WriteLfpOp is the same as A9WriteLSfp but takes no issue resources,
+// so it can be used in WriteSequences for single-issue instructions that
+// encapsulate multiple loads.
+def A9WriteLfpOp : SchedWriteRes<[A9UnitLS, A9UnitFP]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+}
+
+foreach NumAddr = 1-8 in {
+
+// Helper for A9WriteLfp1-8: A sequence of NumAddr fp loads with no micro-ops.
+def A9WriteLfp#NumAddr#Seq : WriteSequence<[A9WriteLfpOp], NumAddr>;
+
+// A9WriteLfp1-8 definitions are statically expanded into a sequence of
+// A9WriteLfpOps with additive latency that takes a single issue slot.
+// Used directly to describe NEON VLDn.
+def A9WriteLfp#NumAddr : WriteSequence< // A9WriteIssue supplies the single issue slot.
+ [A9WriteIssue, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+// A9WriteLfp1-8Mov adds a cycle of latency and FP resource for
+// permuting loaded values.
+def A9WriteLfp#NumAddr#Mov : WriteSequence< // Starts with A9WriteF instead of A9WriteIssue.
+ [A9WriteF, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+} // foreach NumAddr
+
+// Define VLDM/VSTM PreRA resources.
+// A9WriteLMfpPreRA is dynamically expanded into the correct
+// A9WriteLfp1-8 sequence based on a predicate. This supports the
+// preRA VLDM variants in which all 64-bit loads are written to the
+// same tuple of either single or double precision registers.
+def A9WriteLMfpPreRA : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteLfp1]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteLfp2]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteLfp3]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteLfp4]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteLfp5]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteLfp6]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteLfp7]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteLfp8]>,
+ // For unknown VLDM/VSTM PreRA, assume 2xS registers (falls back to A9WriteLfp2).
+ SchedVar<A9LMUnknownPred, [A9WriteLfp2]>]>;
+
+// Define VLDM/VSTM PostRA Resources.
+// A9WriteLMfpLo takes a LS and FP resource and one issue slot but no latency.
+def A9WriteLMfpLo : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 0; }
+
+foreach NumAddr = 1-8 in {
+
+// Each A9WriteLMfp#N variant adds N cycles of latency without consuming
+// additional resources.
+def A9WriteLMfp#NumAddr : WriteSequence<
+ [A9WriteLMfpLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency as A9WriteLMHi plus N cycles.
+def A9WriteLMfp#NumAddr#Hi : WriteSequence<
+ [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+} // foreach NumAddr
+
+// VLDM PostRA Variants. These variants expand A9WriteLMfpPostRA into a
+// pair of writes for each 64-bit data loaded. When the number of
+// registers is odd, the last WriteLMfpnHi is naturally ignored because
+// the instruction has no following def operands.
+
+def A9WriteLMfpPostRAOpsList : A9WriteLMOpsListType< // Indices 0-7: Lo writes; 8-22: Hi writes.
+ [A9WriteLMfp1, A9WriteLMfp2, // 0-1
+ A9WriteLMfp3, A9WriteLMfp4, // 2-3
+ A9WriteLMfp5, A9WriteLMfp6, // 4-5
+ A9WriteLMfp7, A9WriteLMfp8, // 6-7
+ A9WriteLMfp1Hi, // 8-8
+ A9WriteLMfp2Hi, A9WriteLMfp2Hi, // 9-10
+ A9WriteLMfp3Hi, A9WriteLMfp3Hi, // 11-12
+ A9WriteLMfp4Hi, A9WriteLMfp4Hi, // 13-14
+ A9WriteLMfp5Hi, A9WriteLMfp5Hi, // 15-16
+ A9WriteLMfp6Hi, A9WriteLMfp6Hi, // 17-18
+ A9WriteLMfp7Hi, A9WriteLMfp7Hi, // 19-20
+ A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22
+
+def A9WriteLMfpPostRA : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, A9WriteLMfpPostRAOpsList.Writes[0-0, 8-8]>, // Flavor N: N Lo writes, then N Hi writes.
+ SchedVar<A9LMAdr2Pred, A9WriteLMfpPostRAOpsList.Writes[0-1, 9-10]>,
+ SchedVar<A9LMAdr3Pred, A9WriteLMfpPostRAOpsList.Writes[0-2, 10-12]>,
+ SchedVar<A9LMAdr4Pred, A9WriteLMfpPostRAOpsList.Writes[0-3, 11-14]>,
+ SchedVar<A9LMAdr5Pred, A9WriteLMfpPostRAOpsList.Writes[0-4, 12-16]>,
+ SchedVar<A9LMAdr6Pred, A9WriteLMfpPostRAOpsList.Writes[0-5, 13-18]>,
+ SchedVar<A9LMAdr7Pred, A9WriteLMfpPostRAOpsList.Writes[0-6, 14-20]>,
+ SchedVar<A9LMAdr8Pred, A9WriteLMfpPostRAOpsList.Writes[0-7, 15-22]>,
+ // For unknown LDMs, define the maximum number of writes, but only
+ // make the first two consume resources. We are optimizing for the case
+ // where the operands are DPRs, and this determines the first eight
+ // types. The remaining eight types are filled to cover the case
+ // where the operands are SPRs.
+ SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp2,
+ A9WriteLMfp3Hi, A9WriteLMfp4Hi, // Hi types only from here on.
+ A9WriteLMfp5Hi, A9WriteLMfp6Hi,
+ A9WriteLMfp7Hi, A9WriteLMfp8Hi,
+ A9WriteLMfp5Hi, A9WriteLMfp5Hi,
+ A9WriteLMfp6Hi, A9WriteLMfp6Hi,
+ A9WriteLMfp7Hi, A9WriteLMfp7Hi,
+ A9WriteLMfp8Hi, A9WriteLMfp8Hi]>]> {
+ let Variadic = 1; // The number of def operands covered varies per instruction.
+}
+
+// Distinguish between our multiple MI-level forms of the same
+// VLDM/VSTM instructions by inspecting the register in operand 0.
+def A9PreRA : SchedPredicate< // Operand 0 is a virtual register.
+ "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+def A9PostRA : SchedPredicate< // Operand 0 is a physical register.
+ "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+
+// VLDM represents all destination registers as a single register
+// tuple, unlike LDM. So the number of write operands is not variadic.
+def A9WriteLMfp : SchedWriteVariant<[
+ SchedVar<A9PreRA, [A9WriteLMfpPreRA]>, // Nested variant, resolved by the A9LMAdrNPred predicates.
+ SchedVar<A9PostRA, [A9WriteLMfpPostRA]>]>;
+
+//===----------------------------------------------------------------------===//
+// Resources for other (non-LDM/VLDM) Variants.
+
+// These mov immediate writers are unconditionally expanded with
+// additive latency.
+def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>; // Two integer writes.
+def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, WriteALU]>; // ... followed by an ALU op.
+def A9WriteI2ld : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>; // ... followed by an integer load.
+
+// Some ALU operations can read loaded integer values one cycle early.
+def A9ReadALU : SchedReadAdvance<1, // Advance of 1 cycle, limited to the load writes listed here.
+ [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi,
+ A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4,
+ A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8,
+ A9WriteL1Hi, A9WriteL2Hi, A9WriteL3Hi, A9WriteL4Hi,
+ A9WriteL5Hi, A9WriteL6Hi, A9WriteL7Hi, A9WriteL8Hi]>;
+
+// Read types for operands that are unconditionally read in cycle N
+// after the instruction issues, decreases producer latency by N-1.
+def A9Read2 : SchedReadAdvance<1>; // Read in cycle 2; no producer list is given.
+def A9Read3 : SchedReadAdvance<2>; // Read in cycle 3.
+def A9Read4 : SchedReadAdvance<3>; // Read in cycle 4.
+
+//===----------------------------------------------------------------------===//
+// Map itinerary classes to scheduler read/write resources per operand.
+//
+// For ARM, we piggyback scheduler resources on the Itinerary classes
+// to avoid perturbing the existing instruction definitions.
+
+// This table follows the ARM Cortex-A9 Technical Reference Manuals,
+// mostly in order.
+
+def :ItinRW<[WriteALU], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi, // Moves and conditional moves.
+ IIC_iMVNi,IIC_iMVNsi,
+ IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
+def :ItinRW<[WriteALU, A9ReadALU],[IIC_iMVNr]>;
+def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>; // Shift-by-register forms.
+
+def :ItinRW<[A9WriteI2], [IIC_iMOVix2,IIC_iCMOVix2]>; // Two-part immediate moves.
+def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>;
+def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>;
+
+def :ItinRW<[WriteALU], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>;
+def :ItinRW<[WriteALU, A9ReadALU], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>; // One source may use the early load read.
+def :ItinRW<[WriteALU, A9ReadALU, A9ReadALU],[IIC_iALUr,IIC_iCMPr]>; // Both sources may use it.
+def :ItinRW<[WriteALUsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>;
+def :ItinRW<[WriteALUsi, A9ReadALU], [IIC_iALUsi]>;
+def :ItinRW<[WriteALUsi, ReadDefault, A9ReadALU], [IIC_iALUsir]>; // RSB
+def :ItinRW<[A9WriteALUsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>;
+def :ItinRW<[A9WriteALUsr, A9ReadALU], [IIC_iALUsr,IIC_iCMPsr]>;
+
+// A9WriteMHi ignored for MUL32.
+def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32,
+ IIC_iMUL64,IIC_iMAC64]>;
+// FIXME: SMLALxx needs itin classes
+def :ItinRW<[A9WriteM16, A9WriteM16Hi], [IIC_iMUL16,IIC_iMAC16]>;
+
+// TODO: For floating-point ops, we model the pipeline forwarding
+// latencies here. WAW latencies are sometimes longer.
+
+def :ItinRW<[A9WriteFMov], [IIC_fpSTAT, IIC_fpMOVIS, IIC_fpMOVID, IIC_fpMOVSI, // 1-cycle FP writes.
+ IIC_fpUNA32, IIC_fpUNA64,
+ IIC_fpCMP32, IIC_fpCMP64]>;
+def :ItinRW<[A9WriteFMov, A9WriteFMov], [IIC_fpMOVDI]>; // Two writes are listed for this class.
+def :ItinRW<[A9WriteF], [IIC_fpCVTSD, IIC_fpCVTDS, IIC_fpCVTSH, IIC_fpCVTHS, // 4-cycle FP writes.
+ IIC_fpCVTIS, IIC_fpCVTID, IIC_fpCVTSI, IIC_fpCVTDI,
+ IIC_fpALU32, IIC_fpALU64]>;
+def :ItinRW<[A9WriteFMulS], [IIC_fpMUL32]>;
+def :ItinRW<[A9WriteFMulD], [IIC_fpMUL64]>;
+def :ItinRW<[A9WriteFMAS], [IIC_fpMAC32]>;
+def :ItinRW<[A9WriteFMAD], [IIC_fpMAC64]>;
+def :ItinRW<[A9WriteFDivS], [IIC_fpDIV32]>;
+def :ItinRW<[A9WriteFDivD], [IIC_fpDIV64]>;
+def :ItinRW<[A9WriteFSqrtS], [IIC_fpSQRT32]>;
+def :ItinRW<[A9WriteFSqrtD], [IIC_fpSQRT64]>;
+
+def :ItinRW<[A9WriteB], [IIC_Br]>;
+
+// A9 PLD is processed in a dedicated unit, so no writes or reads are listed.
+def :ItinRW<[], [IIC_Preload]>;
+
+// Note: We must assume that loads are aligned, since the machine
+// model cannot know this statically and A9 ignores alignment hints.
+
+// A9WriteAdr consumes AGU regardless of address writeback. But its
+// latency is only relevant for users of an updated address.
+def :ItinRW<[A9WriteL, A9WriteAdr], [IIC_iLoad_i,IIC_iLoad_r,
+ IIC_iLoad_iu,IIC_iLoad_ru]>;
+def :ItinRW<[A9WriteLsi, A9WriteAdr], [IIC_iLoad_si,IIC_iLoad_siu]>;
+def :ItinRW<[A9WriteLb, A9WriteAdr2], [IIC_iLoad_bh_i,IIC_iLoad_bh_r, // The bh classes use the 2-step address sequence.
+ IIC_iLoad_bh_iu,IIC_iLoad_bh_ru]>;
+def :ItinRW<[A9WriteLbsi, A9WriteAdr2], [IIC_iLoad_bh_si,IIC_iLoad_bh_siu]>;
+def :ItinRW<[A9WriteL, A9WriteLHi, A9WriteAdr], [IIC_iLoad_d_i,IIC_iLoad_d_r, // Two loaded defs, then the address.
+ IIC_iLoad_d_ru]>;
+// Store either has no def operands, or the one def for address writeback.
+def :ItinRW<[A9WriteAdr, A9WriteS], [IIC_iStore_i, IIC_iStore_r,
+ IIC_iStore_iu, IIC_iStore_ru,
+ IIC_iStore_d_i, IIC_iStore_d_r,
+ IIC_iStore_d_ru]>;
+def :ItinRW<[A9WriteAdr2, A9WriteS], [IIC_iStore_si, IIC_iStore_siu,
+ IIC_iStore_bh_i, IIC_iStore_bh_r,
+ IIC_iStore_bh_iu, IIC_iStore_bh_ru]>;
+def :ItinRW<[A9WriteAdr3, A9WriteS], [IIC_iStore_bh_si, IIC_iStore_bh_siu]>;
+
+// A9WriteLM will be expanded into a separate write for each def
+// operand. Address generation consumes resources, but A9WriteLMAdr
+// is listed after all def operands, so has no effective latency.
+//
+// Note: A9WriteLM expands into an even number of def operands. The
+// actual number of def operands may be less by one.
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteIssue], [IIC_iLoad_m, IIC_iPop]>;
+
+// Load multiple with address writeback has an extra def operand in
+// front of the loaded registers.
+//
+// Reuse the load-multiple variants for store-multiple because the
+// resources are identical. For stores only the address writeback
+// has a def operand so the WriteL latencies are unused.
+def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu,
+ IIC_iStore_m,
+ IIC_iStore_mu]>;
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>; // A9WriteB in place of A9WriteIssue.
+def :ItinRW<[A9WriteL, A9WriteAdr, WriteALU], [IIC_iLoadiALU]>;
+
+def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>;
+
+def :ItinRW<[A9WriteLMfp, A9WriteLMAdr], [IIC_fpLoad_m]>;
+def :ItinRW<[A9WriteLMAdr, A9WriteLMfp], [IIC_fpLoad_mu]>; // Address write listed first for the mu form.
+def :ItinRW<[A9WriteAdr, A9WriteLSfp], [IIC_fpStore32, IIC_fpStore64,
+ IIC_fpStore_m, IIC_fpStore_mu]>;
+
+// Note: Unlike VLDM, VLD1 expects the writeback operand after the
+// normal writes.
+def :ItinRW<[A9WriteLfp1, A9WriteAdr1], [IIC_VLD1, IIC_VLD1u,
+ IIC_VLD1x2, IIC_VLD1x2u]>;
+def :ItinRW<[A9WriteLfp2, A9WriteAdr2], [IIC_VLD1x3, IIC_VLD1x3u,
+ IIC_VLD1x4, IIC_VLD1x4u,
+ IIC_VLD4dup, IIC_VLD4dupu]>;
+def :ItinRW<[A9WriteLfp1Mov, A9WriteAdr1], [IIC_VLD1dup, IIC_VLD1dupu,
+ IIC_VLD2, IIC_VLD2u,
+ IIC_VLD2dup, IIC_VLD2dupu]>;
+def :ItinRW<[A9WriteLfp2Mov, A9WriteAdr1], [IIC_VLD1ln, IIC_VLD1lnu,
+ IIC_VLD2x2, IIC_VLD2x2u,
+ IIC_VLD2ln, IIC_VLD2lnu]>;
+def :ItinRW<[A9WriteLfp3Mov, A9WriteAdr3], [IIC_VLD3, IIC_VLD3u,
+ IIC_VLD3dup, IIC_VLD3dupu]>;
+def :ItinRW<[A9WriteLfp4Mov, A9WriteAdr4], [IIC_VLD4, IIC_VLD4u,
+ IIC_VLD4ln, IIC_VLD4lnu]>;
+def :ItinRW<[A9WriteLfp5Mov, A9WriteAdr5], [IIC_VLD3ln, IIC_VLD3lnu]>;
+
+// Vector stores use similar resources to vector loads, so use the
+// same write types. The address write must be first for stores with
+// address writeback.
+def :ItinRW<[A9WriteAdr1, A9WriteLfp1], [IIC_VST1, IIC_VST1u,
+ IIC_VST1x2, IIC_VST1x2u,
+ IIC_VST1ln, IIC_VST1lnu,
+ IIC_VST2, IIC_VST2u,
+ IIC_VST2x2, IIC_VST2x2u,
+ IIC_VST2ln, IIC_VST2lnu]>;
+def :ItinRW<[A9WriteAdr2, A9WriteLfp2], [IIC_VST1x3, IIC_VST1x3u,
+ IIC_VST1x4, IIC_VST1x4u,
+ IIC_VST3, IIC_VST3u,
+ IIC_VST3ln, IIC_VST3lnu,
+ IIC_VST4, IIC_VST4u,
+ IIC_VST4ln, IIC_VST4lnu]>;
+
+// NEON moves.
+def :ItinRW<[A9WriteV2], [IIC_VMOVSI, IIC_VMOVDI, IIC_VMOVD, IIC_VMOVQ]>;
+def :ItinRW<[A9WriteV1], [IIC_VMOV, IIC_VMOVIS, IIC_VMOVID]>;
+def :ItinRW<[A9WriteV3], [IIC_VMOVISL, IIC_VMOVN]>;
+
+// NEON integer arithmetic
+//
+// VADD/VAND/VORR/VEOR/VBIC/VORN/VBIT/VBIF/VBSL
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VBINiD, IIC_VBINiQ]>;
+// VSUB/VMVN/VCLSD/VCLZD/VCNTD
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>;
+// VADDL/VSUBL/VNEG are mapped later under IIC_SHLi.
+// ...
+// VHADD/VRHADD/VQADD/VTST/VADH/VRADH
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>;
+
+// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>;
+// VQNEG/VQABS
+def :ItinRW<[A9WriteV4], [IIC_VQUNAiD, IIC_VQUNAiQ]>;
+// VABS
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VUNAiD, IIC_VUNAiQ]>;
+// VPADD/VPADDL are mapped later under IIC_SHLi.
+// ...
+// VCLSQ/VCLZQ/VCNTQ, takes two cycles.
+def :ItinRW<[A9Write2V4, A9Read3], [IIC_VCNTiQ]>;
+// VMOVimm/VMVNimm/VORRimm/VBICimm
+def :ItinRW<[A9WriteV3], [IIC_VMOVImm]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2], [IIC_VABAD, IIC_VABAQ]>;
+def :ItinRW<[A9WriteV6, A9Read3], [IIC_VPALiD, IIC_VPALiQ]>;
+
+// NEON integer multiply
+//
+// Note: these don't quite match the timing docs, but they do match
+// the original A9 itinerary.
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VMULi16D]>;
+def :ItinRW<[A9WriteV7, A9Read2, A9Read2], [IIC_VMULi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read2], [IIC_VMULi32D]>;
+def :ItinRW<[A9Write2V9, A9Read2], [IIC_VMULi32Q]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2, A9Read2], [IIC_VMACi16D]>;
+def :ItinRW<[A9WriteV7, A9Read3, A9Read2, A9Read2], [IIC_VMACi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read3, A9Read2], [IIC_VMACi32D]>;
+def :ItinRW<[A9Write2V9, A9Read3, A9Read2], [IIC_VMACi32Q]>;
+
+// NEON integer shift
+// TODO: Q,Q,Q shifts should actually reserve FP for 2 cycles.
+def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>;
+def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>;
+
+// NEON permute
+def :ItinRW<[A9WriteV2, A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>;
+def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2],
+ [IIC_VPERMQ3, IIC_VEXTQ]>;
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>;
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VTB2]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3], [IIC_VTB3]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3, A9Read3], [IIC_VTB4]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2], [IIC_VTBX1]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2, A9Read2], [IIC_VTBX2]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3], [IIC_VTBX3]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3, A9Read3],
+ [IIC_VTBX4]>;
+
+// NEON floating-point
+def :ItinRW<[A9WriteV5, A9Read2, A9Read2], [IIC_VBIND]>;
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VBINQ]>;
+def :ItinRW<[A9WriteV5, A9Read2], [IIC_VUNAD, IIC_VFMULD]>;
+def :ItinRW<[A9WriteV6, A9Read2], [IIC_VUNAQ, IIC_VFMULQ]>;
+def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>;
+def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>;
+def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>;
+def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>;
+
+// Map SchedRWs that are identical for cortexa9 to existing resources.
+def : SchedAlias<WriteALU, A9WriteALU>;
+def : SchedAlias<WriteALUsr, A9WriteALUsr>;
+def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
+def : SchedAlias<ReadALU, A9ReadALU>;
+def : SchedAlias<ReadALUsr, A9ReadALU>;
+def : InstRW< [WriteALU],
+ (instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
+ "BICrr")>;
+def : InstRW< [WriteALUsi], (instregex "ANDrsi", "ORRrsi", "EORrsi", "BICrsi")>;
+def : InstRW< [WriteALUsr], (instregex "ANDrsr", "ORRrsr", "EORrsr", "BICrsr")>;
+
+
+def : SchedAlias<WriteCMP, A9WriteALU>;
+def : SchedAlias<WriteCMPsi, A9WriteALU>;
+def : SchedAlias<WriteCMPsr, A9WriteALU>;
+
+def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi",
+ "MOVCCsr")>;
+def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>;
+def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm",
+ "MOV_ga_dyn")>;
+def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
+
+def : InstRW< [WriteALU], (instregex "SEL")>;
+
+def : InstRW< [WriteALUsi], (instregex "BFC", "BFI", "UBFX", "SBFX")>;
+
+def : InstRW< [A9WriteM],
+ (instregex "MUL", "MULv5", "SMMUL", "SMMULR", "MLA", "MLAv5", "MLS",
+ "SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>;
+def : InstRW< [A9WriteM, A9WriteMHi],
+ (instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL",
+ "UMAAL", "SMLALv5", "UMLALv5", "UMAALv5", "SMLALBB", "SMLALBT", "SMLALTB",
+ "SMLALTT")>;
+// FIXME: These instructions used to have NoItinerary. Just copied the one from above.
+// Every name listed here receives the A9WriteM / A9WriteMHi write pair.
+// "SMLSLDX" accompanies "SMLSLD" in the same way "SMLALDX" accompanies
+// "SMLALD"; it was previously misspelled and named no instruction.
+def : InstRW< [A9WriteM, A9WriteMHi],
+ (instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX",
+ "SMLSLD", "SMLSLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
+
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+ (instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>;
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+ (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLAWB", "SMLAWT")>;
+
+def : InstRW<[A9WriteL], (instregex "LDRi12", "PICLDR$")>;
+def : InstRW<[A9WriteLsi], (instregex "LDRrs")>;
+def : InstRW<[A9WriteLb],
+ (instregex "LDRBi12", "PICLDRH", "PICLDRB", "PICLDRSH", "PICLDRSB",
+ "LDRH", "LDRSH", "LDRSB")>;
+// Byte load with a shifted-register index. The word form "LDRrs" is already
+// mapped to A9WriteLsi a few lines up, so this entry has to name the byte
+// form (LDRBrs) for A9WriteLbsi to apply to anything distinct.
+def : InstRW<[A9WriteLbsi], (instregex "LDRBrs")>;
+
+def : WriteRes<WriteDiv, []> { let Latency = 0; }
+
+def : WriteRes<WriteBr, [A9UnitB]>;
+def : WriteRes<WriteBrL, [A9UnitB]>;
+def : WriteRes<WriteBrTbl, [A9UnitB]>;
+def : WriteRes<WritePreLd, []>;
+def : SchedAlias<WriteCvtFP, A9WriteF>;
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+} // SchedModel = CortexA9Model
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td b/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
new file mode 100644
index 000000000000..1b40742a093b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
@@ -0,0 +1,983 @@
+//==- ARMScheduleR52.td - Cortex-R52 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SchedRead/Write data for the ARM Cortex-R52 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The Cortex-R52 is an in-order, pipelined, superscalar microprocessor with
+// an 8-stage pipeline. It can issue at most two instructions per cycle.
+// There are two ALUs, one LDST, one MUL and a non-pipelined integer DIV.
+// A number of forwarding paths enable results of computations to be input
+// to subsequent operations before they are written to registers.
+// This scheduler is a MachineScheduler. See TargetSchedule.td for details.
+
+// Top-level machine model record for Cortex-R52. The WriteRes, ReadAdvance
+// and InstRW records further down are attached to it through
+// `let SchedModel = CortexR52Model in { ... }`.
+def CortexR52Model : SchedMachineModel {
+ let MicroOpBufferSize = 0; // R52 is in-order processor
+ let IssueWidth = 2; // 2 micro-ops dispatched per cycle
+ let LoadLatency = 1; // Optimistic, assuming no misses
+ let MispredictPenalty = 8; // A branch direction mispredict, including PFU
+ let PostRAScheduler = 1; // Enable PostRA scheduler pass.
+ let CompleteModel = 0; // Covers instructions applicable to cortex-r52.
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
+// Cortex-R52 is an in-order processor.
+
+def R52UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def R52UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def R52UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
+def R52UnitLd : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def R52UnitB : ProcResource<1> { let BufferSize = 0; } // Branch
+def R52UnitFPALU : ProcResource<2> { let BufferSize = 0; } // FP ALU
+def R52UnitFPMUL : ProcResource<2> { let BufferSize = 0; } // FP MUL
+def R52UnitFPDIV : ProcResource<1> { let BufferSize = 0; } // FP DIV
+
+// Cortex-R52 specific SchedReads
+def R52Read_ISS : SchedRead;
+def R52Read_EX1 : SchedRead;
+def R52Read_EX2 : SchedRead;
+def R52Read_WRI : SchedRead;
+def R52Read_F0 : SchedRead; // F0 maps to ISS stage of integer pipe
+def R52Read_F1 : SchedRead;
+def R52Read_F2 : SchedRead;
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which map ProcResources and set latency.
+
+let SchedModel = CortexR52Model in {
+
+// ALU - Write occurs in Late EX2 (independent of whether shift was required)
+def : WriteRes<WriteALU, [R52UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteALUsi, [R52UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteALUsr, [R52UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteALUSsr, [R52UnitALU]> { let Latency = 3; }
+
+// Compares
+def : WriteRes<WriteCMP, [R52UnitALU]> { let Latency = 0; }
+def : WriteRes<WriteCMPsi, [R52UnitALU]> { let Latency = 0; }
+def : WriteRes<WriteCMPsr, [R52UnitALU]> { let Latency = 0; }
+
+// Div - may stall 0-9 cycles depending on input (i.e. WRI+(0-9)/2)
+def : WriteRes<WriteDiv, [R52UnitDiv]> {
+ let Latency = 8; let ResourceCycles = [8]; // not pipelined
+}
+
+// Loads
+def : WriteRes<WriteLd, [R52UnitLd]> { let Latency = 4; }
+def : WriteRes<WritePreLd, [R52UnitLd]> { let Latency = 4; }
+
+// Branches - LR written in Late EX2
+def : WriteRes<WriteBr, [R52UnitB]> { let Latency = 0; }
+def : WriteRes<WriteBrL, [R52UnitB]> { let Latency = 0; }
+def : WriteRes<WriteBrTbl, [R52UnitALU]> { let Latency = 0; }
+
+// Misc
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+def : WriteRes<WriteCvtFP, [R52UnitALU]> { let Latency = 3; }
+
+def : ReadAdvance<ReadALU, 1>; // Operand needed in EX1 stage
+def : ReadAdvance<ReadALUsr, 0>; // Shift operands needed in ISS
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedReadWrites.
+
+// Forwarding information - based on when an operand is read
+def : ReadAdvance<R52Read_ISS, 0>;
+def : ReadAdvance<R52Read_EX1, 1>;
+def : ReadAdvance<R52Read_EX2, 2>;
+def : ReadAdvance<R52Read_F0, 0>;
+def : ReadAdvance<R52Read_F1, 1>;
+def : ReadAdvance<R52Read_F2, 2>;
+
+
+// Cortex-R52 specific SchedWrites for use with InstRW
+def R52WriteMAC : SchedWriteRes<[R52UnitMAC]> { let Latency = 4; }
+def R52WriteDIV : SchedWriteRes<[R52UnitDiv]> {
+ let Latency = 8; let ResourceCycles = [8]; // not pipelined
+}
+def R52WriteLd : SchedWriteRes<[R52UnitLd]> { let Latency = 4; }
+def R52WriteST : SchedWriteRes<[R52UnitLd]> { let Latency = 4; }
+def R52WriteAdr : SchedWriteRes<[]> { let Latency = 0; }
+def R52WriteCC : SchedWriteRes<[]> { let Latency = 0; }
+def R52WriteALU_EX1 : SchedWriteRes<[R52UnitALU]> { let Latency = 2; }
+def R52WriteALU_EX2 : SchedWriteRes<[R52UnitALU]> { let Latency = 3; }
+def R52WriteALU_WRI : SchedWriteRes<[R52UnitALU]> { let Latency = 4; }
+
+def R52WriteNoRSRC_EX2 : SchedWriteRes<[]> { let Latency = 3; }
+def R52WriteNoRSRC_WRI : SchedWriteRes<[]> { let Latency = 4; }
+
+def R52WriteFPALU_F3 : SchedWriteRes<[R52UnitFPALU]> { let Latency = 4; }
+def R52Write2FPALU_F3 : SchedWriteRes<[R52UnitFPALU, R52UnitFPALU]> {
+ let Latency = 4;
+}
+def R52WriteFPALU_F4 : SchedWriteRes<[R52UnitFPALU]> { let Latency = 5; }
+def R52Write2FPALU_F4 : SchedWriteRes<[R52UnitFPALU, R52UnitFPALU]> {
+ let Latency = 5;
+}
+def R52WriteFPALU_F5 : SchedWriteRes<[R52UnitFPALU]> { let Latency = 6; }
+def R52Write2FPALU_F5 : SchedWriteRes<[R52UnitFPALU, R52UnitFPALU]> {
+ let Latency = 6;
+}
+def R52WriteFPMUL_F5 : SchedWriteRes<[R52UnitFPMUL]> { let Latency = 6; }
+def R52Write2FPMUL_F5 : SchedWriteRes<[R52UnitFPMUL, R52UnitFPMUL]> {
+ let Latency = 6;
+}
+def R52WriteFPMAC_F5 : SchedWriteRes<[R52UnitFPMUL, R52UnitFPALU]> {
+ let Latency = 11; // as it is internally two insns (MUL then ADD)
+}
+def R52Write2FPMAC_F5 : SchedWriteRes<[R52UnitFPMUL, R52UnitFPMUL,
+ R52UnitFPALU, R52UnitFPALU]> {
+ let Latency = 11;
+}
+
+def R52WriteFPLd_F4 : SchedWriteRes<[R52UnitLd]> { let Latency = 5; }
+def R52WriteFPST_F4 : SchedWriteRes<[R52UnitLd]> { let Latency = 5; }
+
+def R52WriteFPDIV_SP : SchedWriteRes<[R52UnitFPDIV]> {
+ let Latency = 7; // FP div takes fixed #cycles
+ let ResourceCycles = [7]; // is not pipelined
+ }
+def R52WriteFPDIV_DP : SchedWriteRes<[R52UnitFPDIV]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific - map operands to SchedReadWrites
+
+def : InstRW<[WriteALU], (instrs COPY)>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS],
+ (instregex "SXTB", "SXTH", "SXTB16", "UXTB", "UXTH", "UXTB16",
+ "t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH", "t2UXTB16")>;
+
+def : InstRW<[R52WriteALU_EX1, R52Read_ISS],
+ (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi",
+ "t2MOVi", "t2MOV_ga_dyn")>;
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1],
+ (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel")>;
+def : InstRW<[R52WriteLd,R52Read_ISS],
+ (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_EX1], (instregex "SEL", "t2SEL")>;
+
+def : InstRW< [R52WriteALU_EX2, R52Read_ISS, R52Read_ISS],
+ (instregex "BFC", "BFI", "UBFX", "SBFX", "(t|t2)BFC", "(t|t2)BFI",
+ "(t|t2)UBFX", "(t|t2)SBFX")>;
+
+// Saturating arithmetic
+def : InstRW< [R52WriteALU_WRI, R52Read_EX1, R52Read_EX1],
+ (instregex "QADD", "QSUB", "QDADD", "QDSUB", "SSAT", "SSAT16", "USAT",
+ "QADD8", "QADD16", "QSUB8", "QSUB16", "QASX", "QSAX",
+ "UQADD8", "UQADD16","UQSUB8","UQSUB16","UQASX","UQSAX", "t2QADD",
+ "t2QSUB", "t2QDADD", "t2QDSUB", "t2SSAT", "t2SSAT16", "t2USAT",
+ "t2QADD8", "t2QADD16", "t2QSUB8", "t2QSUB16", "t2QASX", "t2QSAX",
+ "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX","t2ABS")>;
+
+// Parallel arithmetic
+def : InstRW< [R52WriteALU_EX2, R52Read_EX1, R52Read_EX1],
+ (instregex "SADD8", "SADD16", "SSUB8", "SSUB16", "SASX", "SSAX",
+ "UADD8", "UADD16", "USUB8", "USUB16", "UASX", "USAX", "t2SADD8",
+ "t2SADD16", "t2SSUB8", "t2SSUB16", "t2SASX", "t2SSAX", "t2UADD8",
+ "t2UADD16", "t2USUB8", "t2USUB16", "t2UASX", "t2USAX")>;
+
+// Flag setting.
+def : InstRW< [R52WriteALU_EX2, R52Read_EX1, R52Read_EX1],
+ (instregex "SHADD8", "SHADD16", "SHSUB8", "SHSUB16", "SHASX", "SHSAX",
+ "SXTAB", "SXTAB16", "SXTAH", "UHADD8", "UHADD16", "UHSUB8", "UHSUB16",
+ "UHASX", "UHSAX", "UXTAB", "UXTAB16", "UXTAH", "t2SHADD8", "t2SHADD16",
+ "t2SHSUB8", "t2SHSUB16", "t2SHASX", "t2SHSAX", "t2SXTAB", "t2SXTAB16",
+ "t2SXTAH", "t2UHADD8", "t2UHADD16", "t2UHSUB8", "t2UHSUB16", "t2UHASX",
+ "t2UHSAX", "t2UXTAB", "t2UXTAB16", "t2UXTAH")>;
+
+// Sum of Absolute Difference
+def : InstRW< [R52WriteALU_WRI, R52Read_ISS, R52Read_ISS, R52Read_ISS],
+ (instregex "USAD8", "t2USAD8", "tUSAD8","USADA8", "t2USADA8", "tUSADA8") >;
+
+// Integer Multiply
+// All names below use the MAC unit write (R52WriteMAC) and read both source
+// operands at ISS. "SMUSDX" was previously spelled "SMUSDXi", which names
+// no instruction.
+def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS],
+ (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+ "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDX", "t2MUL",
+ "t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
+ "t2SMULWB", "t2SMULWT", "t2SMUSD")>;
+
+// Multiply Accumulate
+// Even for 64-bit accumulation (or Long), the single MAC is used (not ALUs).
+// The store pipeline is used partly for 64-bit operations.
+// Three source operands are read at ISS. The SMLALBB entries (ARM and
+// Thumb2) were previously missing their leading "S" and named no
+// instruction; they now line up with SMLALBT/SMLALTB/SMLALTT.
+def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS, R52Read_ISS],
+ (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+ "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS", "t2SMMLSR",
+ "SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX",
+ "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
+ "SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
+ "t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT",
+ "SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX",
+ "SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$",
+ "SMLALS", "UMLALS", "SMLAL", "UMLAL", "SMLALBB", "SMLALBT",
+ "SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
+ "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2SMLALBB",
+ "t2SMLALBT", "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX",
+ "t2SMLSLD", "t2SMLSLDX", "t2UMAAL")>;
+
+def : InstRW <[R52WriteDIV, R52Read_ISS, R52Read_ISS],
+ (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+
+// Loads (except POST) with SHL > 2, or ror, require 2 extra cycles.
+// However, that's non-trivial to specify, so we keep it uniform
+def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS],
+ (instregex "LDR(i12|rs)$", "LDRB(i12|rs)$", "t2LDR(i8|i12|s|pci)",
+ "t2LDR(H|B)(i8|i12|s|pci)", "LDREX", "t2LDREX",
+ "tLDR[BH](r|i|spi|pci|pciASM)", "tLDR(r|i|spi|pci|pciASM)",
+ "LDRH$", "PICLDR$", "PICLDR(H|B)$", "LDRcp$",
+ "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
+ "t2LDRpci_pic", "tLDRS(B|H)", "t2LDRDi8", "LDRD$", "LDA", "t2LDA")>;
+def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_ISS],
+ (instregex "LD(RB|R)(_|T_)(POST|PRE)_(IMM|REG)", "LDRH(_PRE|_POST)",
+ "LDRBT_POST$", "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)",
+ "t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T",
+ "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
+ "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T",
+ "LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "MOVS?sr", "t2MOVS?sr")>;
+def : InstRW<[R52WriteALU_WRI, R52Read_EX2], (instregex "MOVT", "t2MOVT")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "AD(C|D)S?ri","ANDS?ri",
+ "BICS?ri", "CLZ", "EORri", "MVNS?r", "ORRri", "RSBS?ri", "RSCri", "SBCri",
+ "t2AD(C|D)S?ri", "t2ANDS?ri", "t2BICS?ri","t2CLZ", "t2EORri", "t2MVN",
+ "t2ORRri", "t2RSBS?ri", "t2SBCri")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_EX1], (instregex "AD(C|D)S?rr",
+ "ANDS?rr", "BICS?rr", "CRC*", "EORrr", "ORRrr", "RSBrr", "RSCrr", "SBCrr",
+ "t2AD(C|D)S?rr", "t2ANDS?rr", "t2BICS?rr", "t2CRC", "t2EORrr", "t2SBCrr")>;
+
+// ALU operations with an immediate-shifted register operand: the first
+// source is read at EX1, the shifted one at ISS. The Thumb2 ADC/ADD pattern
+// uses the same (C|D) alternation as the ARM pattern; it previously read
+// "(|D)", which left out the ADC spelling.
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS], (instregex "AD(C|D)S?rsi",
+ "ANDS?rsi", "BICS?rsi", "EORrsi", "ORRrsi", "RSBrsi", "RSCrsi", "SBCrsi",
+ "t2AD(C|D)S?rsi", "t2ANDS?rsi", "t2BICS?rsi", "t2EORrsi", "t2ORRrsi", "t2RSBrsi", "t2SBCrsi")>;
+
+// ALU operations with a register-shifted register operand: the first source
+// is read at EX1, the shifted register and the shift amount at ISS.
+// "ORRrsr" was previously spelled "ORRrsrr", which cannot match ORRrsr, so
+// ORR was the only operation of this group left out of the mapping.
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS, R52Read_ISS],
+ (instregex "AD(C|D)S?rsr", "ANDS?rsr", "BICS?rsr", "EORrsr", "MVNS?sr",
+ "ORRrsr", "RSBrsr", "RSCrsr", "SBCrsr")>;
+
+def : InstRW<[R52WriteALU_EX1],
+ (instregex "ADR", "MOVSi", "MOVSsi", "MOVST?i16*", "MVNS?s?i", "t2MOVS?si")>;
+
+def : InstRW<[R52WriteALU_EX1, R52Read_ISS], (instregex "ASRi", "RORS?i")>;
+def : InstRW<[R52WriteALU_EX1, R52Read_ISS, R52Read_ISS],
+ (instregex "ASRr", "RORS?r", "LSR", "LSL")>;
+
+def : InstRW<[R52WriteCC, R52Read_EX1], (instregex "CMPri", "CMNri")>;
+def : InstRW<[R52WriteCC, R52Read_EX1, R52Read_EX1], (instregex "CMPrr", "CMNzrr")>;
+def : InstRW<[R52WriteCC, R52Read_EX1, R52Read_ISS], (instregex "CMPrsi", "CMNzrsi")>;
+def : InstRW<[R52WriteCC, R52Read_EX1, R52Read_ISS, R52Read_ISS], (instregex "CMPrsr", "CMNzrsr")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_ISS],
+ (instregex "t2LDC", "RBIT", "REV", "REV16", "REVSH", "RRX")>;
+
+def : InstRW<[R52WriteCC, R52Read_ISS], (instregex "TST")>;
+
+def : InstRW<[R52WriteLd], (instregex "MRS", "MRSbanked")>;
+def : InstRW<[R52WriteLd, R52Read_EX1], (instregex "MSR", "MSRbanked")>;
+
+//def : InstRW<[R52WriteLd, R52Read_ISS], (instregex "^LDRB?(_PRE_IMM|_POST_IMM)", "LDRrs")>;
+//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_PRE_REG", "LDRB?rr")>;
+//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_POST_REG")>;
+
+//def : InstRW<[R52WriteST, R52Read_ISS], (instregex "STRi12", "PICSTR")>;
+//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_PRE_REG", "STRB?_PRE_REG")>;
+//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_POST_REG", "STRB?_POST_REG")>;
+
+
+// Integer Load, Multiple.
+foreach Lat = 3-25 in {
+ def R52WriteILDM#Lat#Cy : SchedWriteRes<[R52UnitLd]> {
+ let Latency = Lat;
+ }
+ def R52WriteILDM#Lat#CyNo : SchedWriteRes<[]> {
+ let Latency = Lat;
+ let NumMicroOps = 0;
+ }
+}
+foreach NAddr = 1-16 in {
+ def R52ILDMAddr#NAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == "#NAddr>;
+}
+def R52WriteILDMAddrNoWB : SchedWriteRes<[R52UnitLd]> { let Latency = 0; }
+def R52WriteILDMAddrWB : SchedWriteRes<[R52UnitLd]>;
+// Integer load-multiple write variant. Each R52ILDMAddr<N>Pred entry
+// (TII->getNumLDMAddresses(*MI) == N) lists N writes with latencies
+// 4, 5, ..., N+3 cycles, all of them R52WriteILDM<Lat>Cy (R52UnitLd).
+// Variadic=1 is set on the variant.
+// The 16-address entry is keyed on R52ILDMAddr16Pred; it previously repeated
+// R52ILDMAddr15Pred and so could never be selected.
+def R52WriteILDM : SchedWriteVariant<[
+ SchedVar<R52ILDMAddr2Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy]>,
+
+ SchedVar<R52ILDMAddr3Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy]>,
+ SchedVar<R52ILDMAddr4Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy]>,
+
+ SchedVar<R52ILDMAddr5Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy]>,
+ SchedVar<R52ILDMAddr6Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy]>,
+
+ SchedVar<R52ILDMAddr7Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy]>,
+ SchedVar<R52ILDMAddr8Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy, R52WriteILDM11Cy]>,
+
+ SchedVar<R52ILDMAddr9Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy, R52WriteILDM11Cy,
+ R52WriteILDM12Cy]>,
+ SchedVar<R52ILDMAddr10Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy, R52WriteILDM11Cy,
+ R52WriteILDM12Cy, R52WriteILDM13Cy]>,
+
+ SchedVar<R52ILDMAddr11Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy, R52WriteILDM11Cy,
+ R52WriteILDM12Cy, R52WriteILDM13Cy,
+ R52WriteILDM14Cy]>,
+ SchedVar<R52ILDMAddr12Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy, R52WriteILDM11Cy,
+ R52WriteILDM12Cy, R52WriteILDM13Cy,
+ R52WriteILDM14Cy, R52WriteILDM15Cy]>,
+
+ SchedVar<R52ILDMAddr13Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy, R52WriteILDM11Cy,
+ R52WriteILDM12Cy, R52WriteILDM13Cy,
+ R52WriteILDM14Cy, R52WriteILDM15Cy,
+ R52WriteILDM16Cy]>,
+ SchedVar<R52ILDMAddr14Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy, R52WriteILDM11Cy,
+ R52WriteILDM12Cy, R52WriteILDM13Cy,
+ R52WriteILDM14Cy, R52WriteILDM15Cy,
+ R52WriteILDM16Cy, R52WriteILDM17Cy]>,
+
+ SchedVar<R52ILDMAddr15Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy, R52WriteILDM11Cy,
+ R52WriteILDM12Cy, R52WriteILDM13Cy,
+ R52WriteILDM14Cy, R52WriteILDM15Cy,
+ R52WriteILDM16Cy, R52WriteILDM17Cy,
+ R52WriteILDM18Cy]>,
+ SchedVar<R52ILDMAddr16Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6Cy, R52WriteILDM7Cy,
+ R52WriteILDM8Cy, R52WriteILDM9Cy,
+ R52WriteILDM10Cy, R52WriteILDM11Cy,
+ R52WriteILDM12Cy, R52WriteILDM13Cy,
+ R52WriteILDM14Cy, R52WriteILDM15Cy,
+ R52WriteILDM16Cy, R52WriteILDM17Cy,
+ R52WriteILDM18Cy, R52WriteILDM19Cy]>,
+
+// Unknown number of registers, just use resources for two registers.
+// Only the first two writes reserve R52UnitLd; every later write uses the
+// CyNo form (empty resource list, NumMicroOps = 0) so that it contributes a
+// latency only. The last two entries previously used the Cy form, which
+// reserved the load/store unit for four registers instead of two.
+ SchedVar<NoSchedPred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+ R52WriteILDM6CyNo, R52WriteILDM7CyNo,
+ R52WriteILDM8CyNo, R52WriteILDM9CyNo,
+ R52WriteILDM10CyNo, R52WriteILDM11CyNo,
+ R52WriteILDM12CyNo, R52WriteILDM13CyNo,
+ R52WriteILDM14CyNo, R52WriteILDM15CyNo,
+ R52WriteILDM16CyNo, R52WriteILDM17CyNo,
+ R52WriteILDM18CyNo, R52WriteILDM19CyNo]>
+]> { let Variadic=1; }
+
+// Integer Store, Multiple
+def R52WriteIStIncAddr : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+foreach NumAddr = 1-16 in {
+ def R52WriteISTM#NumAddr : WriteSequence<[R52WriteIStIncAddr], NumAddr>;
+}
+// Integer store-multiple write variant. It reuses the R52ILDMAddr<N>Pred
+// predicates (TII->getNumLDMAddresses(*MI) == N) and maps N addresses to
+// R52WriteISTM<N>, i.e. a WriteSequence of N R52WriteIStIncAddr writes.
+// R52ILDMAddr1Pred is not listed, so only the NoSchedPred entry can apply
+// to a single-address store-multiple.
+def R52WriteISTM : SchedWriteVariant<[
+ SchedVar<R52ILDMAddr2Pred, [R52WriteISTM2]>,
+ SchedVar<R52ILDMAddr3Pred, [R52WriteISTM3]>,
+ SchedVar<R52ILDMAddr4Pred, [R52WriteISTM4]>,
+ SchedVar<R52ILDMAddr5Pred, [R52WriteISTM5]>,
+ SchedVar<R52ILDMAddr6Pred, [R52WriteISTM6]>,
+ SchedVar<R52ILDMAddr7Pred, [R52WriteISTM7]>,
+ SchedVar<R52ILDMAddr8Pred, [R52WriteISTM8]>,
+ SchedVar<R52ILDMAddr9Pred, [R52WriteISTM9]>,
+ SchedVar<R52ILDMAddr10Pred,[R52WriteISTM10]>,
+ SchedVar<R52ILDMAddr11Pred,[R52WriteISTM11]>,
+ SchedVar<R52ILDMAddr12Pred,[R52WriteISTM12]>,
+ SchedVar<R52ILDMAddr13Pred,[R52WriteISTM13]>,
+ SchedVar<R52ILDMAddr14Pred,[R52WriteISTM14]>,
+ SchedVar<R52ILDMAddr15Pred,[R52WriteISTM15]>,
+ SchedVar<R52ILDMAddr16Pred,[R52WriteISTM16]>,
+ // Unknown number of registers, just use resources for two registers.
+ SchedVar<NoSchedPred, [R52WriteISTM2]>
+]>;
+
+def : InstRW<[R52WriteILDM, R52Read_ISS],
+ (instregex "LDM(IA|DA|DB|IB)$", "t2LDM(IA|DA|DB|IB)$",
+ "(t|sys)LDM(IA|DA|DB|IB)$")>;
+def : InstRW<[R52WriteILDM, R52WriteAdr, R52Read_ISS],
+ (instregex "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
+def : InstRW<[R52WriteILDM, R52WriteAdr, R52Read_ISS],
+ (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+
+// Integer Store, Single Element
+def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_EX2],
+ (instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX", "SRS", "t2SRS",
+ "t2SRSDB", "t2STREX", "t2STREXB", "t2STREXD", "t2STREXH", "t2STR(i12|i8|s)$",
+ "RFE", "t2RFE", "t2STR[BH](i12|i8|s)$", "tSTR[BH](i|r)", "tSTR(i|r)", "tSTRspi")>;
+
+def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_EX2],
+ (instregex "STR(B_|_|BT_|T_)(PRE_IMM|PRE_REG|POST_REG|POST_IMM)",
+ "STR(i|r)_preidx", "STRB(i|r)_preidx", "STRH_preidx", "STR(H_|HT_)(PRE|POST)",
+ "STR(BT|HT|T)", "t2STR_(PRE|POST)", "t2STR[BH]_(PRE|POST)",
+ "t2STR_preidx", "t2STR[BH]_preidx", "t2ST(RB|RH|R)T")>;
+
+// Integer Store, Dual
+def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_EX2],
+ (instregex "STRD$", "t2STRDi8", "STL", "t2STRD$", "t2STL")>;
+def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_EX2],
+ (instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
+
+def : InstRW<[R52WriteISTM, R52Read_ISS, R52Read_EX2],
+ (instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
+def : InstRW<[R52WriteISTM, R52WriteAdr, R52Read_ISS, R52Read_EX2],
+ (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
+ "PUSH", "tPUSH")>;
+
+// LDRLIT pseudo instructions, they expand to LDR + PICADD
+def : InstRW<[R52WriteLd],
+ (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>;
+// LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR
+def : InstRW<[R52WriteLd], (instregex "LDRLIT_ga_pcrel_ldr")>;
+
+
+
+//===----------------------------------------------------------------------===//
+// VFP, Floating Point Support
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "VABD(fd|hd)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1], (instregex "VABD(fq|hq)")>;
+
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VABS(D|S|H)")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VABS(fd|hd)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1], (instregex "VABS(fq|hq)")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VACGT)(fd|hd)")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VACGT)(fq|hq)")>;
+
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(D|S|H|fd|hd)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(fq|hq)")>;
+
+def : InstRW<[R52WriteFPDIV_SP, R52Read_F0, R52Read_F0], (instregex "VDIV(S|H)")>;
+def : InstRW<[R52WriteFPDIV_DP, R52Read_F0, R52Read_F0], (instregex "VDIVD")>;
+
+def : InstRW<[R52WriteFPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1],
+ (instregex "(VFMA|VFMS|VFNMA|VFNMS)(D|H|S)")>;
+
+def : InstRW<[R52WriteFPLd_F4, R52Read_ISS, R52Read_F1], (instregex "VLDR")>;
+def : InstRW<[R52WriteFPST_F4, R52Read_ISS, R52Read_F1], (instregex "VSTR")>;
+
+
+//===----------------------------------------------------------------------===//
+// Neon Support
+
+// vector multiple load stores
+foreach NumAddr = 1-16 in {
+ def R52LMAddrPred#NumAddr :
+ SchedPredicate<"MI->getNumOperands() == "#NumAddr>;
+}
+foreach Lat = 1-32 in {
+ def R52WriteLM#Lat#Cy : SchedWriteRes<[]> {
+ let Latency = Lat;
+ }
+}
+foreach Num = 1-32 in { // reserve LdSt resource, no dual-issue
+ def R52ReserveLd#Num#Cy : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 0;
+ let NumMicroOps = Num;
+ let ResourceCycles = [Num];
+ }
+}
+// Vector load-multiple write variant, keyed on the R52LMAddrPred<N>
+// predicates (MI->getNumOperands() == N). Two consecutive predicates share
+// each "k D reg" group. A group lists k latency-only writes
+// R52WriteLM5Cy .. R52WriteLM<k+4>Cy followed by one R52ReserveLd<k+4>Cy,
+// which holds R52UnitLd for k+4 cycles. Variadic=1 is set on the variant.
+//
+// Two entries were corrected to follow that pattern:
+//  * the first "3 D reg" entry reserved R52ReserveLd4Cy, unlike its sibling
+//    (R52ReserveLd7Cy);
+//  * the "8 D reg" group was keyed on predicates 14 and 15, duplicating the
+//    predicate already used by "7 D reg" and leaving R52LMAddrPred16 unused.
+def R52WriteVLDM : SchedWriteVariant<[
+ // 1 D reg
+ SchedVar<R52LMAddrPred1, [R52WriteLM5Cy,
+ R52ReserveLd5Cy]>,
+ SchedVar<R52LMAddrPred2, [R52WriteLM5Cy,
+ R52ReserveLd5Cy]>,
+
+ // 2 D reg
+ SchedVar<R52LMAddrPred3, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52ReserveLd6Cy]>,
+ SchedVar<R52LMAddrPred4, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52ReserveLd6Cy]>,
+
+ // 3 D reg
+ SchedVar<R52LMAddrPred5, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy,
+ R52ReserveLd7Cy]>,
+ SchedVar<R52LMAddrPred6, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy,
+ R52ReserveLd7Cy]>,
+
+ // 4 D reg
+ SchedVar<R52LMAddrPred7, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52ReserveLd8Cy]>,
+ SchedVar<R52LMAddrPred8, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52ReserveLd8Cy]>,
+
+ // 5 D reg
+ SchedVar<R52LMAddrPred9, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52WriteLM9Cy,
+ R52ReserveLd9Cy]>,
+ SchedVar<R52LMAddrPred10, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52WriteLM9Cy,
+ R52ReserveLd9Cy]>,
+
+ // 6 D reg
+ SchedVar<R52LMAddrPred11, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52WriteLM9Cy, R52WriteLM10Cy,
+ R52ReserveLd10Cy]>,
+ SchedVar<R52LMAddrPred12, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52WriteLM9Cy, R52WriteLM10Cy,
+ R52ReserveLd10Cy]>,
+
+ // 7 D reg
+ SchedVar<R52LMAddrPred13, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52WriteLM9Cy, R52WriteLM10Cy,
+ R52WriteLM11Cy,
+ R52ReserveLd11Cy]>,
+ SchedVar<R52LMAddrPred14, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52WriteLM9Cy, R52WriteLM10Cy,
+ R52WriteLM11Cy,
+ R52ReserveLd11Cy]>,
+
+ // 8 D reg
+ SchedVar<R52LMAddrPred15, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52WriteLM9Cy, R52WriteLM10Cy,
+ R52WriteLM11Cy, R52WriteLM12Cy,
+ R52ReserveLd12Cy]>,
+ SchedVar<R52LMAddrPred16, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52WriteLM9Cy, R52WriteLM10Cy,
+ R52WriteLM11Cy, R52WriteLM12Cy,
+ R52ReserveLd12Cy]>,
+ // unknown number of reg.
+ SchedVar<NoSchedPred, [R52WriteLM5Cy, R52WriteLM6Cy,
+ R52WriteLM7Cy, R52WriteLM8Cy,
+ R52WriteLM9Cy, R52WriteLM10Cy,
+ R52WriteLM11Cy, R52WriteLM12Cy,
+ R52ReserveLd5Cy]>
+]> { let Variadic=1;}
+
+// variable stores. Cannot dual-issue. R52WriteSTM<L>: Latency L, ResourceCycles [L-4], NumMicroOps 2*(L-4).
+def R52WriteSTM5 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1];
+}
+def R52WriteSTM6 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2];
+}
+def R52WriteSTM7 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 7;
+ let NumMicroOps = 6;
+ let ResourceCycles = [3];
+}
+def R52WriteSTM8 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4];
+}
+def R52WriteSTM9 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 9;
+ let NumMicroOps = 10;
+ let ResourceCycles = [5];
+}
+def R52WriteSTM10 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 10;
+ let NumMicroOps = 12;
+ let ResourceCycles = [6];
+}
+def R52WriteSTM11 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 11;
+ let NumMicroOps = 14;
+ let ResourceCycles = [7];
+}
+def R52WriteSTM12 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 12;
+ let NumMicroOps = 16;
+ let ResourceCycles = [8];
+}
+def R52WriteSTM13 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 13;
+ let NumMicroOps = 18;
+ let ResourceCycles = [9];
+}
+def R52WriteSTM14 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 14;
+ let NumMicroOps = 20;
+ let ResourceCycles = [10];
+}
+def R52WriteSTM15 : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 15;
+ let NumMicroOps = 22;
+ let ResourceCycles = [11];
+}
+
+def R52WriteSTM : SchedWriteVariant<[ // selected by MI->getNumOperands() via R52LMAddrPred<N>
+ SchedVar<R52LMAddrPred1, [R52WriteSTM5]>,
+ SchedVar<R52LMAddrPred2, [R52WriteSTM5]>,
+ SchedVar<R52LMAddrPred3, [R52WriteSTM6]>,
+ SchedVar<R52LMAddrPred4, [R52WriteSTM6]>,
+ SchedVar<R52LMAddrPred5, [R52WriteSTM7]>,
+ SchedVar<R52LMAddrPred6, [R52WriteSTM7]>,
+ SchedVar<R52LMAddrPred7, [R52WriteSTM8]>,
+ SchedVar<R52LMAddrPred8, [R52WriteSTM8]>,
+ SchedVar<R52LMAddrPred9, [R52WriteSTM9]>,
+ SchedVar<R52LMAddrPred10, [R52WriteSTM9]>,
+ SchedVar<R52LMAddrPred11, [R52WriteSTM10]>,
+ SchedVar<R52LMAddrPred12, [R52WriteSTM10]>,
+ SchedVar<R52LMAddrPred13, [R52WriteSTM11]>,
+ SchedVar<R52LMAddrPred14, [R52WriteSTM11]>,
+ SchedVar<R52LMAddrPred15, [R52WriteSTM12]>,
+ SchedVar<R52LMAddrPred16, [R52WriteSTM12]>,
+ // unknown number of registers, just use resources for two
+ SchedVar<NoSchedPred, [R52WriteSTM6]>
+]>; // NOTE(review): R52WriteVLDM sets Variadic=1, this variant does not - confirm intended
+
+// Vector Load/Stores. Can issue only in slot-0. Can dual-issue with
+// another instruction in slot-1, but only in the last issue.
+def R52WriteVLD1Mem : SchedWriteRes<[R52UnitLd]> { let Latency = 5;} // only Latency is overridden
+def R52WriteVLD2Mem : SchedWriteRes<[R52UnitLd]> { // VLD<n>Mem, n = 2..4: Latency n+4, 2n-1 micro-ops, n resource cycles
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2];
+}
+def R52WriteVLD3Mem : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [3];
+}
+def R52WriteVLD4Mem : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 8;
+ let NumMicroOps = 7;
+ let ResourceCycles = [4];
+}
+def R52WriteVST1Mem : SchedWriteRes<[R52UnitLd]> { // VST<n>Mem, n = 1..5: Latency n+4, 2n-1 micro-ops, n resource cycles
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def R52WriteVST2Mem : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2];
+}
+def R52WriteVST3Mem : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [3];
+}
+def R52WriteVST4Mem : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 8;
+ let NumMicroOps = 7;
+ let ResourceCycles = [4];
+}
+def R52WriteVST5Mem : SchedWriteRes<[R52UnitLd]> {
+ let Latency = 9;
+ let NumMicroOps = 9;
+ let ResourceCycles = [5];
+}
+
+
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VABA(u|s)(v8i8|v4i16|v2i32)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VABA(u|s)(v16i8|v8i16|v4i32)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VABAL(u|s)(v8i16|v4i32|v2i64)")>;
+// VABDL uses the same widened result suffixes as VABAL above.
+def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VABD(u|s)(v8i8|v4i16|v2i32)")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1], (instregex "VABD(u|s)(v16i8|v8i16|v4i32)")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1], (instregex "VABDL(u|s)(v8i16|v4i32|v2i64)")>;
+
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1], (instregex "VABS(v16i8|v8i16|v4i32)")>;
+// Integer add/subtract: the first entry uses the single FPALU write, the wider suffixes the 2x write.
+def : InstRW<[R52WriteFPALU_F4, R52Read_F2, R52Read_F2],
+ (instregex "(VADD|VSUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F2, R52Read_F2],
+ (instregex "(VADD|VSUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F2, R52Read_F2],
+ (instregex "(VADDHN|VRADDHN|VSUBHN|VRSUBHN)(v8i8|v4i16|v2i32)")>;
+
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1],
+ (instregex "VADDL", "VADDW", "VSUBL", "VSUBW")>;
+// Bitwise logic: "d" names use the single FPALU write, "q" names the 2x write.
+def : InstRW<[R52WriteFPALU_F3, R52Read_F2, R52Read_F2], (instregex "(VAND|VBIC|VEOR)d")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F2, R52Read_F2], (instregex "(VAND|VBIC|VEOR)q")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F2], (instregex "VBICi(v4i16|v2i32)")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)d")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)q")>;
+
+def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>; // NOTE(review): exact duplicate of the VBICi(v8i16|v4i32) entry above
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1],
+ (instregex "(VCEQ|VCGE|VCGT|VCLE|VCLT|VCLZ|VCMP|VCMPE|VCNT)")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1],
+ (instregex "VCVT", "VSITO", "VUITO", "VTO")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_ISS], (instregex "VDUP(8|16|32)d")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_ISS], (instregex "VDUP(8|16|32)q")>;
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1], (instregex "VDUPLN(8|16|32)d")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F1], (instregex "VDUPLN(8|16|32)q")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VEXTd(8|16|32)", "VSEL")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F1], (instregex "VEXTq(8|16|32|64)")>;
+
+def : InstRW<[R52WriteFPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "(VFMA|VFMS)(f|h)d")>;
+def : InstRW<[R52Write2FPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "(VFMA|VFMS)(f|h)q")>;
+
+def : InstRW<[R52WriteFPALU_F4, R52Read_F2, R52Read_F2], (instregex "(VHADD|VHSUB)(u|s)(v8i8|v4i16|v2i32)")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F2, R52Read_F2], (instregex "(VHADD|VHSUB)(u|s)(v16i8|v8i16|v4i32)")>;
+// VLDM[SD](IA|DB) uses the operand-count variant write R52WriteVLDM with no explicit reads.
+def : InstRW<[R52WriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
+def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VMAX", "VMIN", "VPMAX", "VPMIN")>;
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VMOV", "VORR", "VORN", "VREV")>;
+def : InstRW<[R52WriteNoRSRC_WRI], (instregex "VMRS")>;
+def : InstRW<[R52WriteFPMUL_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VMUL", "VNMUL", "VMLA")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VNEG")>;
+def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VPADDi")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1], (instregex "VPADAL", "VPADDL")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VQABS(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1], (instregex "VQABS(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F2, R52Read_F2],
+ (instregex "(VQADD|VQSUB)(u|s)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F2, R52Read_F2],
+ (instregex "(VQADD|VQSUB)(u|s)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[R52Write2FPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VQDMLAL", "VQDMLSL")>;
+def : InstRW<[R52WriteFPMUL_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VQDMUL","VQRDMUL")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1],
+ (instregex "VQMOVN", "VQNEG", "VQSHL", "VQSHRN")>;
+def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VRSHL", "VRSHR", "VRSHRN", "VTB")>;
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;
+
+//---
+// VLDx. Vector Loads
+//---
+// 1-element structure load (d, q, T and Q names map to the VLD1..VLD4 memory writes)
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD1q(8|16|32|64)$")>;
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)T$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Q$")>;
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d64TPseudo$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d64QPseudo$")>;
+// LN and DUP forms all use R52WriteVLD1Mem.
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)d(8|16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1LNdAsm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo$")>;
+// wb / WB / _UPD forms: same memory write plus an R52WriteAdr write.
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)wb")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1q(8|16|32|64)wb")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Twb")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Qwb")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64TPseudoWB")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64QPseudoWB")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1DUP(d|q)(8|16|32)wb")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo_UPD")>;
+
+// 2-element structure load (d/b names use the VLD2 memory write, q names the VLD4 one)
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)wb")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)wb")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)PseudoWB")>;
+// LN and DUP forms all use R52WriteVLD1Mem.
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNqAsm_(16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo")>;
+// _UPD / WB / wb forms of the LN and DUP entries add an R52WriteAdr write.
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)_UPD")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNqWB_(fixed|register)_Asm_(16|32)")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)wb")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2wb")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo_UPD")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo_UPD")>;
+
+// 3-element structure load (full-structure forms use the VLD3 memory write)
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+// LN and DUP forms use R52WriteVLD2Mem.
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; // NOTE(review): exact duplicate of the previous entry
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+
+// 4-element structure load (full-structure forms use the VLD4 memory write)
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+// LN and DUP forms use R52WriteVLD2Mem.
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4DUPd(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+
+//---
+// VSTx. Vector Stores
+//---
+// 1-element structure store (d, q, T and Q names map to the VST1..VST4 memory writes)
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST1q(8|16|32|64)$")>;
+def : InstRW<[R52WriteVST3Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)T$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)Q$")>;
+def : InstRW<[R52WriteVST3Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d64TPseudo$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d64QPseudo$")>;
+// LN forms use R52WriteVST1Mem.
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST1LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST1LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST1LNq(8|16|32)Pseudo$")>;
+// wb / WB / _UPD forms add an R52WriteAdr write.
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)wb")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1q(8|16|32|64)wb")>;
+def : InstRW<[R52WriteVST3Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)Twb")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)Qwb")>;
+def : InstRW<[R52WriteVST3Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d64TPseudoWB")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d64QPseudoWB")>;
+
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1LNq(8|16|32)Pseudo_UPD")>;
+
+// 2-element structure store (d/b names use the VST2 memory write, q names the VST4 one)
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST2(d|b)(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST2q(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST2q(8|16|32)Pseudo$")>;
+// LN forms use R52WriteVST1Mem.
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNd(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNq(16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNqAsm_(16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNq(16|32)Pseudo$")>;
+// wb / WB / _UPD forms add an R52WriteAdr write.
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2(d|b)(8|16|32)wb")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2q(8|16|32)wb")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2q(8|16|32)PseudoWB")>;
+
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNd(8|16|32)Pseudo_UPD")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNq(16|32)_UPD")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNqWB_(fixed|register)_Asm_(16|32)")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNq(16|32)Pseudo_UPD")>;
+
+// 3-element structure store (full-structure forms use the VST4 memory write)
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+// LN forms use R52WriteVST2Mem.
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNd(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNq(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNqAsm_(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNq(16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)(8|16|32)_UPD$")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)WB_(fixed|register)_Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNd(8|16|32)_UPD$")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNd(8|16|32)Pseudo_UPD$")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNq(16|32)_UPD$")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNqWB_(fixed|register)_Asm_(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNq(16|32)Pseudo_UPD$")>;
+
+// 4-element structure store (full-structure forms use the VST5 memory write)
+def : InstRW<[R52WriteVST5Mem, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVST5Mem, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST5Mem, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
+// LN forms use R52WriteVST2Mem.
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNd(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNq(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNqAsm_(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNq(16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVST5Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVST5Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST5Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNd(8|16|32)Pseudo_UPD")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNq(16|32)_UPD")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNqWB_(fixed|register)_Asm_(16|32)")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNq(16|32)Pseudo_UPD")>;
+} // R52 SchedModel
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
new file mode 100644
index 000000000000..ea2bf4b578f0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -0,0 +1,1046 @@
+//=- ARMScheduleSwift.td - Swift Scheduling Definitions -*- tablegen -*----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Swift processor.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+def SW_DIS0 : FuncUnit;
+def SW_DIS1 : FuncUnit;
+def SW_DIS2 : FuncUnit;
+// NOTE(review): SW_DIS* presumably name the three dispatch slots; the rest are execution units - confirm.
+def SW_ALU0 : FuncUnit;
+def SW_ALU1 : FuncUnit;
+def SW_LS : FuncUnit;
+def SW_IDIV : FuncUnit;
+def SW_FDIV : FuncUnit;
+
+// FIXME: Need bypasses.
+// FIXME: Model the multiple stages of IIC_iMOVix2, IIC_iMOVix2addpc, and
+// IIC_iMOVix2ld better.
+// FIXME: Model the special immediate shifts that are not microcoded.
+// FIXME: Do we need to model the fact that uses of r15 in a micro-op force it
+// to issue on pipe 1?
+// FIXME: Model the pipelined behavior of CMP / TST instructions.
+// FIXME: Better model the microcode stages of multiply instructions, especially
+// conditional variants.
+// FIXME: Add preload instruction when it is documented.
+// FIXME: Model non-pipelined nature of FP div / sqrt unit.
+
+// Machine model for Swift, used for scheduling and instruction cost heuristics.
+def SwiftModel : SchedMachineModel {
+ let CompleteModel = 0; // FIXME: Remove if all instructions are covered.
+ let MispredictPenalty = 14; // A branch direction mispredict.
+ let LoadLatency = 3;
+ let MicroOpBufferSize = 45; // Based on NEON renamed registers.
+ let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
+}
+
+// Swift predicates. IsFastImmShiftSwiftPred evaluates TII->isSwiftFastImmShift(MI).
+def IsFastImmShiftSwiftPred : SchedPredicate<[{TII->isSwiftFastImmShift(MI)}]>;
+
+// Swift resource mapping.
+let SchedModel = SwiftModel in {
+ // Processor resources. P0 and P1 are declared with Super = SwiftUnitP01.
+ def SwiftUnitP01 : ProcResource<2>; // ALU unit.
+ def SwiftUnitP0 : ProcResource<1> { let Super = SwiftUnitP01; } // Mul unit.
+ def SwiftUnitP1 : ProcResource<1> { let Super = SwiftUnitP01; } // Br unit.
+ def SwiftUnitP2 : ProcResource<1>; // LS unit.
+ def SwiftUnitDiv : ProcResource<1>; // No Super is set for this resource.
+
+ // Generic resource requirements. Writes with no Latency override keep the SchedWriteRes default.
+ def SwiftWriteP0OneCycle : SchedWriteRes<[SwiftUnitP0]>;
+ def SwiftWriteP0TwoCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 2; }
+ def SwiftWriteP0FourCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 4; }
+ def SwiftWriteP0SixCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 6; }
+ def SwiftWriteP0P1FourCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+ let Latency = 4;
+ }
+ def SwiftWriteP0P1SixCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+ let Latency = 6;
+ }
+ def SwiftWriteP01OneCycle : SchedWriteRes<[SwiftUnitP01]>;
+ def SwiftWriteP1TwoCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 2; }
+ def SwiftWriteP1FourCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 4; }
+ def SwiftWriteP1SixCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 6; }
+ def SwiftWriteP1EightCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 8; }
+ def SwiftWriteP1TwelveCyc : SchedWriteRes<[SwiftUnitP1]> { let Latency = 12; }
+ def SwiftWriteP01OneCycle2x : WriteSequence<[SwiftWriteP01OneCycle], 2>;
+ def SwiftWriteP01OneCycle3x : WriteSequence<[SwiftWriteP01OneCycle], 3>;
+ def SwiftWriteP01TwoCycle : SchedWriteRes<[SwiftUnitP01]> { let Latency = 2; }
+ def SwiftWriteP01ThreeCycleTwoUops : SchedWriteRes<[SwiftUnitP01,
+ SwiftUnitP01]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ }
+ def SwiftWriteP0ThreeCycleThreeUops : SchedWriteRes<[SwiftUnitP0]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+ }
+ // Plain load without writeback.
+ def SwiftWriteP2ThreeCycle : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = 3;
+ }
+ def SwiftWriteP2FourCycle : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = 4;
+ }
+ // A store does not write to a register.
+ def SwiftWriteP2 : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = 0;
+ }
+ foreach Num = 1-4 in { // defines SwiftWrite1xP2 .. SwiftWrite4xP2
+ def SwiftWrite#Num#xP2 : WriteSequence<[SwiftWriteP2], Num>;
+ }
+ def SwiftWriteP01OneCycle2x_load : WriteSequence<[SwiftWriteP01OneCycle,
+ SwiftWriteP01OneCycle,
+ SwiftWriteP2ThreeCycle]>;
+ // 4.2.4 Arithmetic and Logical.
+ // ALU operation register shifted by immediate variant.
+ def SwiftWriteALUsi : SchedWriteVariant<[
+ // lsl #2, lsl #1, or lsr #1.
+ SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01TwoCycle]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+ ]>;
+ def SwiftWriteALUsr : SchedWriteVariant<[ // predicated: 3-cycle/2-uop P01 write, otherwise the 2-cycle P01 write
+ SchedVar<IsPredicatedPred, [SwiftWriteP01ThreeCycleTwoUops]>,
+ SchedVar<NoSchedPred, [SwiftWriteP01TwoCycle]>
+ ]>;
+ def SwiftWriteALUSsr : SchedWriteVariant<[ // predicated: 3-cycle/3-uop P0 write, otherwise the 2-cycle P01 write
+ SchedVar<IsPredicatedPred, [SwiftWriteP0ThreeCycleThreeUops]>,
+ SchedVar<NoSchedPred, [SwiftWriteP01TwoCycle]>
+ ]>;
+ def SwiftReadAdvanceALUsr : SchedReadVariant<[ // predicated: SchedReadAdvance<2>, otherwise NoReadAdvance
+ SchedVar<IsPredicatedPred, [SchedReadAdvance<2>]>,
+ SchedVar<NoSchedPred, [NoReadAdvance]>
+ ]>;
+ // ADC,ADD,NEG,RSB,RSC,SBC,SUB,ADR
+ // AND,BIC,EOR,ORN,ORR
+ // CLZ,RBIT,REV,REV16,REVSH,PKH
+ def : WriteRes<WriteALU, [SwiftUnitP01]>;
+ def : SchedAlias<WriteALUsi, SwiftWriteALUsi>;
+ def : SchedAlias<WriteALUsr, SwiftWriteALUsr>;
+ def : SchedAlias<WriteALUSsr, SwiftWriteALUSsr>;
+ def : ReadAdvance<ReadALU, 0>;
+ def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>;
+
+
+ def SwiftChooseShiftKindP01OneOrTwoCycle : SchedWriteVariant<[ // fast immediate shift: one-cycle write, otherwise two-cycle
+ SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01OneCycle]>,
+ SchedVar<NoSchedPred, [SwiftWriteP01TwoCycle]>
+ ]>;
+
+ // 4.2.5 Integer comparison
+ def : WriteRes<WriteCMP, [SwiftUnitP01]>;
+ def : SchedAlias<WriteCMPsi, SwiftChooseShiftKindP01OneOrTwoCycle>;
+ def : SchedAlias<WriteCMPsr, SwiftWriteP01TwoCycle>;
+
+ // 4.2.6 Shift, Move
+ // Shift
+ // ASR,LSL,ROR,RRX
+ // MOV(register-shiftedregister) MVN(register-shiftedregister)
+ // Move
+ // MOV,MVN
+ // MOVT
+ // Sign/Zero extension
+ def : InstRW<[SwiftWriteP01OneCycle],
+ (instregex "SXTB", "SXTH", "SXTB16", "UXTB", "UXTH", "UXTB16",
+ "t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH",
+ "t2UXTB16")>;
+ // Pseudo instructions.
+ def : InstRW<[SwiftWriteP01OneCycle2x],
+ (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi32imm",
+ "t2MOVi32imm", "t2MOV_ga_dyn")>;
+ def : InstRW<[SwiftWriteP01OneCycle3x],
+ (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel", "t2MOVi16_ga_pcrel")>;
+ def : InstRW<[SwiftWriteP01OneCycle2x_load],
+ (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+
+ def SwiftWriteP0TwoCyleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>; // NOTE(review): "Cyle" is the spelling used in this name
+ // Predicated: the two-write sequence above; otherwise a single SwiftWriteP0OneCycle.
+ def SwiftPredP0OneOrTwoCycle : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCyleTwoUops ]>,
+ SchedVar<NoSchedPred, [ SwiftWriteP0OneCycle ]>
+ ]>;
+
+ // 4.2.7 Select (uses the predication-dependent SwiftPredP0OneOrTwoCycle write)
+ // SEL
+ def : InstRW<[SwiftPredP0OneOrTwoCycle], (instregex "SEL", "t2SEL")>;
+
+ // 4.2.8 Bitfield (two-cycle P01 write)
+ // BFI,BFC, SBFX,UBFX
+ def : InstRW< [SwiftWriteP01TwoCycle],
+ (instregex "BFC", "BFI", "UBFX", "SBFX", "(t|t2)BFC", "(t|t2)BFI",
+ "(t|t2)UBFX", "(t|t2)SBFX")>;
+
+ // 4.2.9 Saturating arithmetic (two-cycle P01 write)
+ def : InstRW< [SwiftWriteP01TwoCycle],
+ (instregex "QADD", "QSUB", "QDADD", "QDSUB", "SSAT", "SSAT16", "USAT",
+ "USAT16", "QADD8", "QADD16", "QSUB8", "QSUB16", "QASX", "QSAX",
+ "UQADD8", "UQADD16","UQSUB8","UQSUB16","UQASX","UQSAX", "t2QADD",
+ "t2QSUB", "t2QDADD", "t2QDSUB", "t2SSAT", "t2SSAT16", "t2USAT",
+ "t2QADD8", "t2QADD16", "t2QSUB8", "t2QSUB16", "t2QASX", "t2QSAX",
+ "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX")>;
+
+ // 4.2.10 Parallel Arithmetic (plain and t2-prefixed opcode names)
+ // Not flag setting. These use the predication-dependent SwiftWriteALUsr write.
+ def : InstRW< [SwiftWriteALUsr],
+ (instregex "SADD8", "SADD16", "SSUB8", "SSUB16", "SASX", "SSAX",
+ "UADD8", "UADD16", "USUB8", "USUB16", "UASX", "USAX", "t2SADD8",
+ "t2SADD16", "t2SSUB8", "t2SSUB16", "t2SASX", "t2SSAX", "t2UADD8",
+ "t2UADD16", "t2USUB8", "t2USUB16", "t2UASX", "t2USAX")>;
+ // Flag setting. These use the fixed two-cycle P01 write.
+ def : InstRW< [SwiftWriteP01TwoCycle],
+ (instregex "SHADD8", "SHADD16", "SHSUB8", "SHSUB16", "SHASX", "SHSAX",
+ "SXTAB", "SXTAB16", "SXTAH", "UHADD8", "UHADD16", "UHSUB8", "UHSUB16",
+ "UHASX", "UHSAX", "UXTAB", "UXTAB16", "UXTAH", "t2SHADD8", "t2SHADD16",
+ "t2SHSUB8", "t2SHSUB16", "t2SHASX", "t2SHSAX", "t2SXTAB", "t2SXTAB16",
+ "t2SXTAH", "t2UHADD8", "t2UHADD16", "t2UHSUB8", "t2UHSUB16", "t2UHASX",
+ "t2UHSAX", "t2UXTAB", "t2UXTAB16", "t2UXTAH")>;
+
+ // 4.2.11 Sum of Absolute Difference
+ def : InstRW< [SwiftWriteP0P1FourCycle], (instregex "USAD8") >;
+ def : InstRW<[SwiftWriteP0P1FourCycle, ReadALU, ReadALU, SchedReadAdvance<2>],
+ (instregex "USADA8")>;
+
+ // 4.2.12 Integer Multiply (32-bit result)
+ // Two sources.
+ def : InstRW< [SwiftWriteP0FourCycle],
+ (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+ "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+ "t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
+ "t2SMULWB", "t2SMULWT", "t2SMUSD")>;
+
+ // Write occupying SwiftUnitP0 and SwiftUnitP01 with a 5-cycle latency.
+ // NOTE(review): the name says "TwoUops" but NumMicroOps is not set here, so
+ // the SchedWriteRes default applies — confirm that is intended.
+ def SwiftWriteP0P01FiveCycleTwoUops :
+ SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+ let Latency = 5;
+ }
+
+ // Write variant: SwiftWriteP0P01FiveCycleTwoUops when IsPredicatedPred
+ // holds, SwiftWriteP0FourCycle otherwise.
+ def SwiftPredP0P01FourFiveCycle : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [ SwiftWriteP0P01FiveCycleTwoUops ]>,
+ SchedVar<NoSchedPred, [ SwiftWriteP0FourCycle ]>
+ ]>;
+
+ // Read variant: SchedReadAdvance<4> when IsPredicatedPred holds, plain
+ // ReadALU otherwise.
+ def SwiftReadAdvanceFourCyclesPred : SchedReadVariant<[
+ SchedVar<IsPredicatedPred, [SchedReadAdvance<4>]>,
+ SchedVar<NoSchedPred, [ReadALU]>
+ ]>;
+
+ // Multiply accumulate, three sources
+ def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+ SwiftReadAdvanceFourCyclesPred],
+ (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+ "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
+ "t2SMMLSR")>;
+
+ // 4.2.13 Integer Multiply (32-bit result, Q flag)
+ def : InstRW< [SwiftWriteP0FourCycle],
+ (instregex "SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX")>;
+ def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+ SwiftReadAdvanceFourCyclesPred],
+ (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
+ "SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
+ "t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT")>;
+ def : InstRW< [SwiftPredP0P01FourFiveCycle],
+ (instregex "SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX")>;
+
+ // Three micro-ops with a 5-cycle latency; the resource cycles [2, 1] are
+ // listed against SwiftUnitP0 and SwiftUnitP01 in that order.
+ def SwiftP0P0P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+ let Latency = 5;
+ }
+ // Latency-only writes: empty resource list and zero micro-ops, with result
+ // latencies of 1, 5 and 6 cycles.
+ def SwiftWrite1Cycle : SchedWriteRes<[]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+ def SwiftWrite5Cycle : SchedWriteRes<[]> {
+ let NumMicroOps = 0;
+ let Latency = 5;
+ }
+ def SwiftWrite6Cycle : SchedWriteRes<[]> {
+ let NumMicroOps = 0;
+ let Latency = 6;
+ }
+
+ // 4.2.14 Integer Multiply, Long
+ def : InstRW< [SwiftP0P0P01FiveCycle, SwiftWrite5Cycle],
+ (instregex "SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$")>;
+
+ // Five micro-ops; resource cycles [2, 3] are listed against SwiftUnitP0 and
+ // SwiftUnitP01 in that order.
+ // NOTE(review): the name says "FiveCycle" but Latency is 7 — confirm which
+ // one is intended before relying on the name.
+ def Swift2P03P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 3];
+ }
+
+ // 4.2.15 Integer Multiply Accumulate, Long
+ // 4.2.16 Integer Multiply Accumulate, Dual
+ // 4.2.17 Integer Multiply Accumulate Accumulate, Long
+ // We are being a bit inaccurate here.
+ // Long multiply-accumulate forms, ARM and Thumb2 (t2) spellings. Every
+ // pattern is written with its full mnemonic so each one names a real opcode.
+ def : InstRW< [SwiftWrite5Cycle, Swift2P03P01FiveCycle, ReadALU, ReadALU,
+ SchedReadAdvance<4>, SchedReadAdvance<3>],
+ (instregex "SMLALS", "UMLALS", "SMLAL", "UMLAL", "SMLALBB", "SMLALBT",
+ "SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
+ "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2SMLALBB", "t2SMLALBT",
+ "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX", "t2SMLSLD", "t2SMLSLDX",
+ "t2UMAAL")>;
+
+ // Single micro-op, 14-cycle latency; resource cycles [1, 14] are listed
+ // against SwiftUnitP0 and SwiftUnitDiv in that order.
+ def SwiftDiv : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+ let Latency = 14;
+ let ResourceCycles = [1, 14];
+ let NumMicroOps = 1;
+ }
+ // 4.2.18 Integer Divide
+ def : WriteRes<WriteDiv, [SwiftUnitDiv]>; // Workaround.
+ def : InstRW <[SwiftDiv],
+ (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+
+ // 4.2.19 Integer Load Single Element
+ // 4.2.20 Integer Load Signextended
+ // Load write resources: each record lists the units it uses, its micro-op
+ // count and its result latency.
+ def SwiftWriteP2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+ let NumMicroOps = 2;
+ let Latency = 3;
+ }
+ def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+ let NumMicroOps = 2;
+ let Latency = 4;
+ }
+ def SwiftWriteP2P01P01FourCycle :
+ SchedWriteRes<[SwiftUnitP2, SwiftUnitP01, SwiftUnitP01]> {
+ let NumMicroOps = 3;
+ let Latency = 4;
+ }
+ def SwiftWriteP2P2ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2]> {
+ let NumMicroOps = 2;
+ let Latency = 3;
+ }
+ def SwiftWriteP2P2P01ThreeCycle :
+ SchedWriteRes<[SwiftUnitP2, SwiftUnitP2, SwiftUnitP01]> {
+ let NumMicroOps = 3;
+ let Latency = 3;
+ }
+ // Latency-only writes: empty resource list and zero micro-ops.
+ def SwiftWrBackOne : SchedWriteRes<[]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+ def SwiftWriteLdFour : SchedWriteRes<[]> {
+ let NumMicroOps = 0;
+ let Latency = 4;
+ }
+ // Not accurate.
+ def : InstRW<[SwiftWriteP2ThreeCycle],
+ (instregex "LDR(i12|rs)$", "LDRB(i12|rs)$", "t2LDR(i8|i12|s|pci)",
+ "t2LDR(H|B)(i8|i12|s|pci)", "LDREX", "tLDR[BH](r|i|spi|pci|pciASM)",
+ "tLDR(r|i|spi|pci|pciASM)")>;
+ def : InstRW<[SwiftWriteP2ThreeCycle],
+ (instregex "LDRH$", "PICLDR$", "PICLDR(H|B)$", "LDRcp$")>;
+ def : InstRW<[SwiftWriteP2P01FourCyle],
+ (instregex "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
+ "t2LDRpci_pic", "tLDRS(B|H)")>;
+ def : InstRW<[SwiftWriteP2P01ThreeCycle, SwiftWrBackOne],
+ (instregex "LD(RB|R)(_|T_)(POST|PRE)_(IMM|REG)", "LDRH(_PRE|_POST)",
+ "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)",
+ "t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T")>;
+ def : InstRW<[SwiftWriteP2P01P01FourCycle, SwiftWrBackOne],
+ (instregex "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
+ "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T")>;
+
+ // 4.2.21 Integer Dual Load
+ // Not accurate.
+ def : InstRW<[SwiftWriteP2P2ThreeCycle, SwiftWriteLdFour],
+ (instregex "t2LDRDi8", "LDRD$")>;
+ def : InstRW<[SwiftWriteP2P2P01ThreeCycle, SwiftWriteLdFour, SwiftWrBackOne],
+ (instregex "LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>;
+
+ // 4.2.22 Integer Load, Multiple
+ // NumReg = 1 .. 16
+ // For each latency Lat in 3..25 define two writes: SwiftWriteLM<Lat>Cy uses
+ // SwiftUnitP2, while SwiftWriteLM<Lat>CyNo has no resource and zero
+ // micro-ops (latency only).
+ foreach Lat = 3-25 in {
+ def SwiftWriteLM#Lat#Cy : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = Lat;
+ }
+ def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> {
+ let Latency = Lat;
+ let NumMicroOps = 0;
+ }
+ }
+ // Predicates: SwiftLMAddr<N>Pred holds when TII->getNumLDMAddresses(*MI)
+ // equals N, for N in 1..16.
+ foreach NumAddr = 1-16 in {
+ def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == "#NumAddr>;
+ }
+ // Address writes for load-multiple: the NoWB form uses one SwiftUnitP01 with
+ // zero latency; the WB form lists SwiftUnitP01 twice and keeps the default
+ // latency.
+ def SwiftWriteLDMAddrNoWB : SchedWriteRes<[SwiftUnitP01]> { let Latency = 0; }
+ def SwiftWriteLDMAddrWB : SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]>;
+ def SwiftWriteLM : SchedWriteVariant<[
+ SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy]>,
+ SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy]>,
+ SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy]>,
+ SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy]>,
+ SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy]>,
+ SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy]>,
+ SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy]>,
+ SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy]>,
+ SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy]>,
+ SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy]>,
+ SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy]>,
+ SchedVar<SwiftLMAddr13Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy]>,
+ SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy, SwiftWriteLM16Cy]>,
+ SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+ SwiftWriteLM17Cy]>,
+ SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+ SwiftWriteLM17Cy, SwiftWriteLM18Cy]>,
+ // Unknown number of registers, just use resources for two registers.
+ SchedVar<NoSchedPred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5CyNo, SwiftWriteLM6CyNo,
+ SwiftWriteLM7CyNo, SwiftWriteLM8CyNo,
+ SwiftWriteLM9CyNo, SwiftWriteLM10CyNo,
+ SwiftWriteLM11CyNo, SwiftWriteLM12CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM15CyNo, SwiftWriteLM16CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo]>
+
+ ]> { let Variadic=1; }
+
+ def : InstRW<[SwiftWriteLM, SwiftWriteLDMAddrNoWB],
+ (instregex "LDM(IA|DA|DB|IB)$", "t2LDM(IA|DA|DB|IB)$",
+ "(t|sys)LDM(IA|DA|DB|IB)$")>;
+ def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM],
+ (instregex /*"t2LDMIA_RET", "tLDMIA_RET", "LDMIA_RET",*/
+ "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
+ def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM, SwiftWriteP1TwoCycle],
+ (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+ // 4.2.23 Integer Store, Single Element
+ def : InstRW<[SwiftWriteP2],
+ (instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX",
+ "t2STR(i12|i8|s)$", "t2STR[BH](i12|i8|s)$", "tSTR[BH](i|r)", "tSTR(i|r)", "tSTRspi")>;
+
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2],
+ (instregex "STR(B_|_|BT_|T_)(PRE_IMM|PRE_REG|POST_REG|POST_IMM)",
+ "STR(i|r)_preidx", "STRB(i|r)_preidx", "STRH_preidx", "STR(H_|HT_)(PRE|POST)",
+ "STR(BT|HT|T)", "t2STR_(PRE|POST)", "t2STR[BH]_(PRE|POST)",
+ "t2STR_preidx", "t2STR[BH]_preidx", "t2ST(RB|RH|R)T")>;
+
+ // 4.2.24 Integer Store, Dual
+ def : InstRW<[SwiftWriteP2, SwiftWriteP2, SwiftWriteP01OneCycle],
+ (instregex "STRD$", "t2STRDi8")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2, SwiftWriteP2,
+ SwiftWriteP01OneCycle],
+ (instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
+
+ // 4.2.25 Integer Store, Multiple
+ // One store step: two micro-ops on SwiftUnitP2 and SwiftUnitP01 with zero
+ // latency.
+ def SwiftWriteStIncAddr : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+ let NumMicroOps = 2;
+ let Latency = 0;
+ }
+ // SwiftWriteSTM<N> repeats SwiftWriteStIncAddr N times, for N in 1..16.
+ foreach NumAddr = 1-16 in {
+ def SwiftWriteSTM#NumAddr : WriteSequence<[SwiftWriteStIncAddr], NumAddr>;
+ }
+ def SwiftWriteSTM : SchedWriteVariant<[
+ SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM2]>,
+ SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM3]>,
+ SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM4]>,
+ SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM5]>,
+ SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM6]>,
+ SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM7]>,
+ SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM8]>,
+ SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM9]>,
+ SchedVar<SwiftLMAddr10Pred,[SwiftWriteSTM10]>,
+ SchedVar<SwiftLMAddr11Pred,[SwiftWriteSTM11]>,
+ SchedVar<SwiftLMAddr12Pred,[SwiftWriteSTM12]>,
+ SchedVar<SwiftLMAddr13Pred,[SwiftWriteSTM13]>,
+ SchedVar<SwiftLMAddr14Pred,[SwiftWriteSTM14]>,
+ SchedVar<SwiftLMAddr15Pred,[SwiftWriteSTM15]>,
+ SchedVar<SwiftLMAddr16Pred,[SwiftWriteSTM16]>,
+ // Unknown number of registers, just use resources for two registers.
+ SchedVar<NoSchedPred, [SwiftWriteSTM2]>
+ ]>;
+ def : InstRW<[SwiftWriteSTM],
+ (instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteSTM],
+ (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
+ "PUSH", "tPUSH")>;
+
+ // LDRLIT pseudo instructions, they expand to LDR + PICADD
+ def : InstRW<[SwiftWriteP2ThreeCycle, WriteALU],
+ (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>;
+ // LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR
+ def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2ThreeCycle],
+ (instregex "LDRLIT_ga_pcrel_ldr")>;
+
+ // 4.2.26 Branch
+ def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; }
+ def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; }
+ def : WriteRes<WriteBrTbl, [SwiftUnitP1, SwiftUnitP2]> { let Latency = 0; }
+
+ // 4.2.27 Not issued
+ def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+ def : InstRW<[WriteNoop], (instregex "t2IT", "IT", "NOP")>;
+
+ // 4.2.28 Advanced SIMD, Integer, 2 cycle
+ def : InstRW<[SwiftWriteP0TwoCycle],
+ (instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL",
+ "VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi",
+ "VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST",
+ "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL", "VQSHLU", "VBIF",
+ "VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>;
+
+ def : InstRW<[SwiftWriteP1TwoCycle],
+ (instregex "VEXT", "VREV16", "VREV32", "VREV64")>;
+
+ // 4.2.29 Advanced SIMD, Integer, 4 cycle
+ // 4.2.30 Advanced SIMD, Integer with Accumulate
+ def : InstRW<[SwiftWriteP0FourCycle],
+ (instregex "VABA", "VABAL", "VPADAL", "VRSRA", "VSRA", "VACGE", "VACGT",
+ "VACLE", "VACLT", "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL",
+ "VQRSHL", "VRSHR(u|s)", "VABS(f|v)", "VQABS", "VQNEG", "VQADD",
+ "VQSUB")>;
+ def : InstRW<[SwiftWriteP1FourCycle],
+ (instregex "VRECPE", "VRSQRTE")>;
+
+ // 4.2.31 Advanced SIMD, Add and Shift with Narrow
+ def : InstRW<[SwiftWriteP0P1FourCycle],
+ (instregex "VADDHN", "VSUBHN", "VSHRN")>;
+ def : InstRW<[SwiftWriteP0P1SixCycle],
+ (instregex "VRADDHN", "VRSUBHN", "VRSHRN", "VQSHRN", "VQSHRUN",
+ "VQRSHRN", "VQRSHRUN")>;
+
+ // 4.2.32 Advanced SIMD, Vector Table Lookup
+ foreach Num = 1-4 in {
+ def SwiftWrite#Num#xP1TwoCycle : WriteSequence<[SwiftWriteP1TwoCycle], Num>;
+ }
+ def : InstRW<[SwiftWrite1xP1TwoCycle],
+ (instregex "VTB(L|X)1")>;
+ def : InstRW<[SwiftWrite2xP1TwoCycle],
+ (instregex "VTB(L|X)2")>;
+ def : InstRW<[SwiftWrite3xP1TwoCycle],
+ (instregex "VTB(L|X)3")>;
+ def : InstRW<[SwiftWrite4xP1TwoCycle],
+ (instregex "VTB(L|X)4")>;
+
+ // 4.2.33 Advanced SIMD, Transpose
+ def : InstRW<[SwiftWriteP1FourCycle, SwiftWriteP1FourCycle,
+ SwiftWriteP1TwoCycle/*RsrcOnly*/, SchedReadAdvance<2>],
+ (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;
+
+ // 4.2.34 Advanced SIMD and VFP, Floating Point
+ def : InstRW<[SwiftWriteP0TwoCycle], (instregex "VABS(S|D)$", "VNEG(S|D)$")>;
+ def : InstRW<[SwiftWriteP0FourCycle],
+ (instregex "VCMP(D|S|ZD|ZS)$", "VCMPE(D|S|ZD|ZS)")>;
+ def : InstRW<[SwiftWriteP0FourCycle],
+ (instregex "VADD(S|f)", "VSUB(S|f)", "VABD", "VPADDf", "VMAX", "VMIN", "VPMAX",
+ "VPMIN")>;
+ def : InstRW<[SwiftWriteP0SixCycle], (instregex "VADDD$", "VSUBD$")>;
+ def : InstRW<[SwiftWriteP1EightCycle], (instregex "VRECPS", "VRSQRTS")>;
+
+ // 4.2.35 Advanced SIMD and VFP, Multiply
+ def : InstRW<[SwiftWriteP1FourCycle],
+ (instregex "VMUL(S|v|p|f|s)", "VNMULS", "VQDMULH", "VQRDMULH",
+ "VMULL", "VQDMULL")>;
+ def : InstRW<[SwiftWriteP1SixCycle],
+ (instregex "VMULD", "VNMULD")>;
+ def : InstRW<[SwiftWriteP1FourCycle],
+ (instregex "VMLA", "VMLS", "VNMLA", "VNMLS", "VFMA(S|D)", "VFMS(S|D)",
+ "VFNMA", "VFNMS", "VMLAL", "VMLSL","VQDMLAL", "VQDMLSL")>;
+ def : InstRW<[SwiftWriteP1EightCycle], (instregex "VFMAfd", "VFMSfd")>;
+ def : InstRW<[SwiftWriteP1TwelveCyc], (instregex "VFMAfq", "VFMSfq")>;
+
+ // 4.2.36 Advanced SIMD and VFP, Convert
+ def : InstRW<[SwiftWriteP1FourCycle], (instregex "VCVT", "V(S|U)IT", "VTO(S|U)")>;
+ // Fixpoint conversions.
+ def : WriteRes<WriteCvtFP, [SwiftUnitP1]> { let Latency = 4; }
+
+ // 4.2.37 Advanced SIMD and VFP, Move
+ def : InstRW<[SwiftWriteP0TwoCycle],
+ (instregex "VMOVv", "VMOV(S|D)$", "VMOV(S|D)cc",
+ "VMVNv", "VMVN(d|q)", "VMVN(S|D)cc",
+ "FCONST(D|S)")>;
+ def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VMOVN", "VMOVL")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP0FourCycle, SwiftWriteP1TwoCycle]>],
+ (instregex "VQMOVN")>;
+ def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN", "VDUPf")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>],
+ (instregex "VDUP(8|16|32)")>;
+ def : InstRW<[SwiftWriteP2ThreeCycle], (instregex "VMOVRS$")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP0TwoCycle]>],
+ (instregex "VMOVSR$", "VSETLN")>;
+ def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2FourCycle],
+ (instregex "VMOVRR(D|S)$")>;
+ def : InstRW<[SwiftWriteP2FourCycle], (instregex "VMOVDRR$")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>,
+ WriteSequence<[SwiftWrite1Cycle, SwiftWriteP2FourCycle,
+ SwiftWriteP1TwoCycle]>],
+ (instregex "VMOVSRR$")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle]>],
+ (instregex "VGETLN(u|i)")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle,
+ SwiftWriteP01OneCycle]>],
+ (instregex "VGETLNs")>;
+
+ // 4.2.38 Advanced SIMD and VFP, Move FPSCR
+ // Serializing instructions.
+ // One record per unit (P0, P1, P2): each holds its unit for 15 resource
+ // cycles and reports a 15-cycle latency.
+ def SwiftWaitP0For15Cy : SchedWriteRes<[SwiftUnitP0]> {
+ let ResourceCycles = [15];
+ let Latency = 15;
+ }
+ def SwiftWaitP1For15Cy : SchedWriteRes<[SwiftUnitP1]> {
+ let ResourceCycles = [15];
+ let Latency = 15;
+ }
+ def SwiftWaitP2For15Cy : SchedWriteRes<[SwiftUnitP2]> {
+ let ResourceCycles = [15];
+ let Latency = 15;
+ }
+ def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+ (instregex "VMRS")>;
+ def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+ (instregex "VMSR")>;
+ // Not serializing.
+ def : InstRW<[SwiftWriteP0TwoCycle], (instregex "FMSTAT")>;
+
+ // 4.2.39 Advanced SIMD and VFP, Load Single Element
+ def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDRD$", "VLDRS$")>;
+
+ // 4.2.40 Advanced SIMD and VFP, Store Single Element
+ def : InstRW<[SwiftWriteLM4Cy], (instregex "VSTRD$", "VSTRS$")>;
+
+ // 4.2.41 Advanced SIMD and VFP, Load Multiple
+ // 4.2.42 Advanced SIMD and VFP, Store Multiple
+
+ // Resource requirement for permuting, just reserves the resources.
+ // SwiftVLDMPerm<Num>, for Num in 1..28: Num micro-ops holding SwiftUnitP1 for
+ // Num resource cycles, with zero latency.
+ foreach Num = 1-28 in {
+ def SwiftVLDMPerm#Num : SchedWriteRes<[SwiftUnitP1]> {
+ let Latency = 0;
+ let NumMicroOps = Num;
+ let ResourceCycles = [Num];
+ }
+ }
+
+ // Pre RA pseudos - load/store to a Q register as a D register pair.
+ def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDMQIA$", "VSTMQIA$")>;
+
+ // Post RA not modelled accurately. We assume that register use of width 64
+ // bit maps to a D register, 128 maps to a Q register. Not all different kinds
+ // are accurately represented.
+ def SwiftWriteVLDM : SchedWriteVariant<[
+ // Load of one S register.
+ SchedVar<SwiftLMAddr1Pred, [SwiftWriteLM4Cy]>,
+ // Load of one D register.
+ SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo]>,
+ // Load of 3 S register.
+ SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm3]>,
+ // Load of a Q register (not necessarily true). We should not be mapping to
+ // 4 S registers, either.
+ SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo,
+ SwiftWriteLM4CyNo, SwiftWriteLM4CyNo]>,
+ // Load of 5 S registers.
+ SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm5]>,
+ // Load of 3 D registers. (Must also be able to handle s register list -
+ // though, not accurate)
+ SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM10Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+ // Load of 7 S registers.
+ SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm7]>,
+ // Load of two Q registers.
+ SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm2]>,
+ // Load of 9 S registers.
+ SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm9]>,
+ // Load of 5 D registers.
+ SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+ // Inaccurate: reuse description from 9 S registers.
+ SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm9]>,
+ // Load of three Q registers.
+ SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+ SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+ SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+ SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+ SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm3]>,
+ // Inaccurate: reuse description from 9 S registers.
+ SchedVar<SwiftLMAddr13Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm9]>,
+ // Load of 7 D registers inaccurate.
+ SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM14Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm7]>,
+ SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM17Cy, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm9]>,
+ // Load of 4 Q registers.
+ SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM7Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy, SwiftWriteLM18CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm4]>,
+ // Unknown number of registers, just use resources for two registers.
+ SchedVar<NoSchedPred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm2]>
+ ]> { let Variadic = 1; }
+
+ def : InstRW<[SwiftWriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
+
+ def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVLDM],
+ (instregex "VLDM[SD](IA|DB)_UPD$")>;
+
+ def SwiftWriteVSTM : SchedWriteVariant<[
+ // One S register.
+ SchedVar<SwiftLMAddr1Pred, [SwiftWriteSTM1]>,
+ // One D register.
+ SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM1]>,
+ // Three S registers.
+ SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM4]>,
+ // Assume one Q register.
+ SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM1]>,
+ SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM6]>,
+ // Assume three D registers.
+ SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM4]>,
+ SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM8]>,
+ // Assume two Q registers.
+ SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM3]>,
+ SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM10]>,
+ // Assume 5 D registers.
+ SchedVar<SwiftLMAddr10Pred, [SwiftWriteSTM6]>,
+ SchedVar<SwiftLMAddr11Pred, [SwiftWriteSTM12]>,
+ // Assume three Q registers.
+ SchedVar<SwiftLMAddr12Pred, [SwiftWriteSTM4]>,
+ SchedVar<SwiftLMAddr13Pred, [SwiftWriteSTM14]>,
+ // Assume 7 D registers.
+ SchedVar<SwiftLMAddr14Pred, [SwiftWriteSTM8]>,
+ SchedVar<SwiftLMAddr15Pred, [SwiftWriteSTM16]>,
+ // Assume four Q registers.
+ SchedVar<SwiftLMAddr16Pred, [SwiftWriteSTM5]>,
+ // Assume two Q registers.
+ SchedVar<NoSchedPred, [SwiftWriteSTM3]>
+ ]> { let Variadic = 1; }
+
+ def : InstRW<[SwiftWriteVSTM], (instregex "VSTM[SD](IA|DB)$")>;
+
+ def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVSTM],
+ (instregex "VSTM[SD](IA|DB)_UPD")>;
+
+ // 4.2.43 Advanced SIMD, Element or Structure Load and Store
+ // 4-cycle-latency writes that hold SwiftUnitP2 for two and three resource
+ // cycles respectively.
+ def SwiftWrite2xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+ let ResourceCycles = [2];
+ let Latency = 4;
+ }
+ def SwiftWrite3xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+ let ResourceCycles = [3];
+ let Latency = 4;
+ }
+ // SwiftExt<Num>xP0, for Num in 1..2: Num micro-ops holding SwiftUnitP0 for
+ // Num resource cycles, with zero latency.
+ foreach Num = 1-2 in {
+ def SwiftExt#Num#xP0 : SchedWriteRes<[SwiftUnitP0]> {
+ let ResourceCycles = [Num];
+ let NumMicroOps = Num;
+ let Latency = 0;
+ }
+ }
+ // VLDx
+ // Multiple structures.
+ // Single element structure loads.
+ // We assume aligned.
+ // Single/two register.
+ def : InstRW<[SwiftWriteLM4Cy], (instregex "VLD1(d|q)(8|16|32|64)$")>;
+ def : InstRW<[SwiftWriteLM4Cy, SwiftWriteP01OneCycle],
+ (instregex "VLD1(d|q)(8|16|32|64)wb")>;
+ // Three register.
+ def : InstRW<[SwiftWrite3xP2FourCy],
+ (instregex "VLD1(d|q)(8|16|32|64)T$", "VLD1d64TPseudo")>;
+ def : InstRW<[SwiftWrite3xP2FourCy, SwiftWriteP01OneCycle],
+ (instregex "VLD1(d|q)(8|16|32|64)Twb")>;
+ // Four register.
+ def : InstRW<[SwiftWrite2xP2FourCy],
+ (instregex "VLD1(d|q)(8|16|32|64)Q$", "VLD1d64QPseudo")>;
+ def : InstRW<[SwiftWrite2xP2FourCy, SwiftWriteP01OneCycle],
+ (instregex "VLD1(d|q)(8|16|32|64)Qwb")>;
+ // Two element structure loads.
+ // Two/four register.
+ def : InstRW<[SwiftWriteLM9Cy, SwiftExt2xP0, SwiftVLDMPerm2],
+ (instregex "VLD2(d|q|b)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+ SwiftVLDMPerm2],
+ (instregex "VLD2(d|q|b)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
+ // Three element structure.
+ def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+ SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+ (instregex "VLD3(d|q)(8|16|32)$")>;
+ def : InstRW<[SwiftWriteLM9Cy, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
+
+ def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+ (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
+ def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm3,
+ SwiftWrite3xP2FourCy],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+ // Four element structure loads.
+ def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+ SwiftWriteLM11Cy, SwiftExt2xP0, SwiftVLDMPerm4,
+ SwiftWrite3xP2FourCy],
+ (instregex "VLD4(d|q)(8|16|32)$")>;
+ def : InstRW<[SwiftWriteLM11Cy, SwiftExt2xP0, SwiftVLDMPerm4,
+ SwiftWrite3xP2FourCy],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
+ def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+ SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+ SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+ (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+ def : InstRW<[SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+ SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+ // Single all/lane loads.
+ // One element structure.
+ def : InstRW<[SwiftWriteLM6Cy, SwiftVLDMPerm2],
+ (instregex "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm2],
+ (instregex "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)",
+ "VLD1LNq(8|16|32)Pseudo_UPD")>;
+ // Two element structure.
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftExt1xP0, SwiftVLDMPerm2],
+ (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
+ "VLD2LN(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftWriteP01OneCycle,
+ SwiftExt1xP0, SwiftVLDMPerm2],
+ (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+ SwiftExt1xP0, SwiftVLDMPerm2],
+ (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb")>;
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+ SwiftExt1xP0, SwiftVLDMPerm2],
+ (instregex "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
+ // Three element structure.
+ def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy, SwiftExt1xP0,
+ SwiftVLDMPerm3],
+ (instregex "VLD3(DUP|LN)(d|q)(8|16|32)$",
+ "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy,
+ SwiftWriteP01OneCycle, SwiftExt1xP0, SwiftVLDMPerm3],
+ (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
+ def : InstRW<[SwiftWriteLM7Cy, SwiftWriteP01OneCycle, SwiftWriteLM8Cy,
+ SwiftWriteLM8Cy, SwiftExt1xP0, SwiftVLDMPerm3],
+ (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+ // Four element structure.
+ def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+ SwiftWriteLM10CyNo, SwiftExt1xP0, SwiftVLDMPerm5],
+ (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$",
+ "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+ SwiftWriteLM10CyNo, SwiftWriteP01OneCycle, SwiftExt1xP0,
+ SwiftVLDMPerm5],
+ (instregex "VLD4(DUP|LN)(d|q)(8|16|32)_UPD")>;
+ def : InstRW<[SwiftWriteLM8Cy, SwiftWriteP01OneCycle, SwiftWriteLM9Cy,
+ SwiftWriteLM10CyNo, SwiftWriteLM10CyNo, SwiftExt1xP0,
+ SwiftVLDMPerm5],
+ (instregex "VLD4(DUP|LN)(d|q)(8|16|32)Pseudo_UPD")>;
+ // VSTx
+ // Multiple structures.
+ // Single element structure store.
+ def : InstRW<[SwiftWrite1xP2], (instregex "VST1d(8|16|32|64)$")>;
+ def : InstRW<[SwiftWrite2xP2], (instregex "VST1q(8|16|32|64)$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2],
+ (instregex "VST1d(8|16|32|64)wb")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2],
+ (instregex "VST1q(8|16|32|64)wb")>;
+ def : InstRW<[SwiftWrite3xP2],
+ (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite3xP2],
+ (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
+ def : InstRW<[SwiftWrite4xP2],
+ (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2],
+ (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
+ // Two element structure store.
+ def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+ (instregex "VST2(d|b)(8|16|32)$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+ (instregex "VST2(b|d)(8|16|32)wb")>;
+ def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+ (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
+ // NOTE(review): unlike the other write-back (wb/WB/_UPD) store entries in
+ // this section, this one does not list SwiftWriteP01OneCycle first — confirm
+ // whether that omission is intentional.
+ def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+ (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
+ // Three element structure store.
+ def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST3(d|q)(8|16|32)_UPD",
+ "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+ // Four element structure store.
+ // NOTE(review): the plain form below uses SwiftVLDMPerm2 while the _UPD form
+ // uses SwiftVLDMPerm4 — confirm which permute cost is intended.
+ def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm4],
+ (instregex "VST4(d|q)(8|16|32)_UPD",
+ "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+ // Single/all lane store.
+ // One element structure.
+ def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+ (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+ (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
+ // Two element structure.
+ def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm2],
+ (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm2],
+ (instregex "VST2LN(d|q)(8|16|32)_UPD",
+ "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
+ // Three element structure.
+ def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST3LN(d|q)(8|16|32)_UPD",
+ "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
+ // Four element structure.
+ def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+ (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2, SwiftVLDMPerm2],
+ (instregex "VST4LN(d|q)(8|16|32)_UPD",
+ "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+ // 4.2.44 VFP, Divide and Square Root
+ // Single micro-op with a 17-cycle latency; resource cycles [1, 15] are
+ // listed against SwiftUnitP0 and SwiftUnitDiv in that order.
+ def SwiftDiv17 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+ let Latency = 17;
+ let ResourceCycles = [1, 15];
+ let NumMicroOps = 1;
+ }
+ // Single micro-op with a 32-cycle latency; resource cycles [1, 30] are
+ // listed against SwiftUnitP0 and SwiftUnitDiv in that order.
+ def SwiftDiv32 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+ let Latency = 32;
+ let ResourceCycles = [1, 30];
+ let NumMicroOps = 1;
+ }
+ def : InstRW<[SwiftDiv17], (instregex "VDIVS", "VSQRTS")>;
+ def : InstRW<[SwiftDiv32], (instregex "VDIVD", "VSQRTD")>;
+
+ // Not specified.
+ def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
+ // Preload.
+ def : WriteRes<WritePreLd, [SwiftUnitP2]> { let Latency = 0;
+ let ResourceCycles = [0];
+ }
+
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
new file mode 100644
index 000000000000..57d0bfb65049
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
@@ -0,0 +1,300 @@
+//===-- ARMScheduleV6.td - ARM v6 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM v6 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Model based on ARM1176
+//
+// Functional Units
+def V6_Pipe : FuncUnit; // pipeline
+
+// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual"
+//
+def ARMV6Itineraries : ProcessorItineraries<
+ [V6_Pipe], [], [
+ //
+ // No operand cycles
+ InstrItinData<IIC_iALUx , [InstrStage<1, [V6_Pipe]>]>,
+ //
+ // Binary Instructions that produce a result
+ InstrItinData<IIC_iALUi , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+ InstrItinData<IIC_iALUr , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+ InstrItinData<IIC_iALUsi , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+ InstrItinData<IIC_iALUsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+ //
+ // Bitwise Instructions that produce a result
+ InstrItinData<IIC_iBITi , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+ InstrItinData<IIC_iBITr , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+ InstrItinData<IIC_iBITsi , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+ InstrItinData<IIC_iBITsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+ //
+ // Unary Instructions that produce a result
+ InstrItinData<IIC_iUNAr , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+ InstrItinData<IIC_iUNAsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+ //
+ // Zero and sign extension instructions
+ InstrItinData<IIC_iEXTr , [InstrStage<1, [V6_Pipe]>], [1, 1]>,
+ InstrItinData<IIC_iEXTAr , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+ InstrItinData<IIC_iEXTAsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+ //
+ // Compare instructions
+ InstrItinData<IIC_iCMPi , [InstrStage<1, [V6_Pipe]>], [2]>,
+ InstrItinData<IIC_iCMPr , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+ InstrItinData<IIC_iCMPsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+ InstrItinData<IIC_iCMPsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+ //
+ // Test instructions
+ InstrItinData<IIC_iTSTi , [InstrStage<1, [V6_Pipe]>], [2]>,
+ InstrItinData<IIC_iTSTr , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+ InstrItinData<IIC_iTSTsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+ InstrItinData<IIC_iTSTsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+ //
+ // Move instructions, unconditional
+ InstrItinData<IIC_iMOVi , [InstrStage<1, [V6_Pipe]>], [2]>,
+ InstrItinData<IIC_iMOVr , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+ InstrItinData<IIC_iMOVsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+ InstrItinData<IIC_iMOVsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+ InstrItinData<IIC_iMOVix2 , [InstrStage<1, [V6_Pipe]>,
+ InstrStage<1, [V6_Pipe]>], [2]>,
+ InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [V6_Pipe]>,
+ InstrStage<1, [V6_Pipe]>,
+ InstrStage<1, [V6_Pipe]>], [3]>,
+ InstrItinData<IIC_iMOVix2ld , [InstrStage<1, [V6_Pipe]>,
+ InstrStage<1, [V6_Pipe]>,
+ InstrStage<1, [V6_Pipe]>], [5]>,
+ //
+ // Move instructions, conditional
+ InstrItinData<IIC_iCMOVi , [InstrStage<1, [V6_Pipe]>], [3]>,
+ InstrItinData<IIC_iCMOVr , [InstrStage<1, [V6_Pipe]>], [3, 2]>,
+ InstrItinData<IIC_iCMOVsi , [InstrStage<1, [V6_Pipe]>], [3, 1]>,
+ InstrItinData<IIC_iCMOVsr , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+ InstrItinData<IIC_iCMOVix2 , [InstrStage<1, [V6_Pipe]>,
+ InstrStage<1, [V6_Pipe]>], [4]>,
+ //
+ // MVN instructions
+ InstrItinData<IIC_iMVNi , [InstrStage<1, [V6_Pipe]>], [2]>,
+ InstrItinData<IIC_iMVNr , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+ InstrItinData<IIC_iMVNsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+ InstrItinData<IIC_iMVNsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+
+ // Integer multiply pipeline
+ //
+ InstrItinData<IIC_iMUL16 , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+ InstrItinData<IIC_iMAC16 , [InstrStage<1, [V6_Pipe]>], [4, 1, 1, 2]>,
+ InstrItinData<IIC_iMUL32 , [InstrStage<2, [V6_Pipe]>], [5, 1, 1]>,
+ InstrItinData<IIC_iMAC32 , [InstrStage<2, [V6_Pipe]>], [5, 1, 1, 2]>,
+ InstrItinData<IIC_iMUL64 , [InstrStage<3, [V6_Pipe]>], [6, 1, 1]>,
+ InstrItinData<IIC_iMAC64 , [InstrStage<3, [V6_Pipe]>], [6, 1, 1, 2]>,
+
+ // Integer load pipeline
+ //
+ // Immediate offset
+ InstrItinData<IIC_iLoad_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+ InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+ InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iLoad_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+ InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+ //
+ // Scaled register offset, issues over 2 cycles
+ InstrItinData<IIC_iLoad_si , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+ InstrItinData<IIC_iLoad_bh_si, [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iLoad_iu , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+ InstrItinData<IIC_iLoad_bh_iu, [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iLoad_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_ru, [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+ InstrItinData<IIC_iLoad_d_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+ //
+ // Scaled register offset with update, issues over 2 cycles
+ InstrItinData<IIC_iLoad_siu, [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+ InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+
+ //
+ // Load multiple, def is the 5th operand.
+ InstrItinData<IIC_iLoad_m , [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4]>,
+ //
+ // Load multiple + update, defs are the 1st and 5th operands.
+ InstrItinData<IIC_iLoad_mu , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 1, 4]>,
+ //
+ // Load multiple plus branch
+ InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [V6_Pipe]>,
+ InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>,
+
+ //
+ // iLoadi + iALUr for t2LDRpci_pic.
+ InstrItinData<IIC_iLoadiALU, [InstrStage<1, [V6_Pipe]>,
+ InstrStage<1, [V6_Pipe]>], [3, 1]>,
+
+ //
+ // Pop, def is the 3rd operand.
+ InstrItinData<IIC_iPop , [InstrStage<3, [V6_Pipe]>], [1, 1, 4]>,
+ //
+ // Pop + branch, def is the 3rd operand.
+ InstrItinData<IIC_iPop_Br, [InstrStage<3, [V6_Pipe]>,
+ InstrStage<1, [V6_Pipe]>], [1, 2, 4]>,
+
+ // Integer store pipeline
+ //
+ // Immediate offset
+ InstrItinData<IIC_iStore_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+ InstrItinData<IIC_iStore_bh_i, [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+ InstrItinData<IIC_iStore_d_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iStore_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_r, [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+ InstrItinData<IIC_iStore_d_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+ //
+ // Scaled register offset, issues over 2 cycles
+ InstrItinData<IIC_iStore_si , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+ InstrItinData<IIC_iStore_bh_si, [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iStore_iu , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+ InstrItinData<IIC_iStore_bh_iu, [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iStore_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+ InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+ //
+ // Scaled register offset with update, issues over 2 cycles
+ InstrItinData<IIC_iStore_siu, [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+ InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+ //
+ // Store multiple
+ InstrItinData<IIC_iStore_m , [InstrStage<3, [V6_Pipe]>]>,
+ //
+ // Store multiple + update
+ InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>,
+
+ // Branch
+ //
+ // no delay slots, so the latency of a branch is unimportant
+ InstrItinData<IIC_Br , [InstrStage<1, [V6_Pipe]>]>,
+
+ // VFP
+ // Issue through integer pipeline, and execute in NEON unit. We assume
+ // RunFast mode so that NFP pipeline is used for single-precision when
+ // possible.
+ //
+ // FP Special Register to Integer Register File Move
+ InstrItinData<IIC_fpSTAT , [InstrStage<1, [V6_Pipe]>], [3]>,
+ //
+ // Single-precision FP Unary
+ InstrItinData<IIC_fpUNA32 , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+ //
+ // Double-precision FP Unary
+ InstrItinData<IIC_fpUNA64 , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+ //
+ // Single-precision FP Compare
+ InstrItinData<IIC_fpCMP32 , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+ //
+ // Double-precision FP Compare
+ InstrItinData<IIC_fpCMP64 , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+ //
+ // Single to Double FP Convert
+ InstrItinData<IIC_fpCVTSD , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+ //
+ // Double to Single FP Convert
+ InstrItinData<IIC_fpCVTDS , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+ //
+ // Single-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTSI , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+ //
+ // Double-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTDI , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+ //
+ // Integer to Single-Precision FP Convert
+ InstrItinData<IIC_fpCVTIS , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+ //
+ // Integer to Double-Precision FP Convert
+ InstrItinData<IIC_fpCVTID , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+ //
+ // Single-precision FP ALU
+ InstrItinData<IIC_fpALU32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+ //
+ // Double-precision FP ALU
+ InstrItinData<IIC_fpALU64 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+ //
+ // Single-precision FP Multiply
+ InstrItinData<IIC_fpMUL32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+ //
+ // Double-precision FP Multiply
+ InstrItinData<IIC_fpMUL64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2]>,
+ //
+ // Single-precision FP MAC
+ InstrItinData<IIC_fpMAC32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+ //
+ // Double-precision FP MAC
+ InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+ //
+ // Single-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+ //
+ // Double-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+ //
+ // Single-precision FP DIV
+ InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
+ //
+ // Double-precision FP DIV
+ InstrItinData<IIC_fpDIV64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>,
+ //
+ // Single-precision FP SQRT
+ InstrItinData<IIC_fpSQRT32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
+ //
+ // Double-precision FP SQRT
+ InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>,
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_fpMOVIS, [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_fpMOVID, [InstrStage<1, [V6_Pipe]>], [10, 1, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_fpMOVSI, [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_fpMOVDI, [InstrStage<1, [V6_Pipe]>], [10, 10, 1]>,
+ //
+ // Single-precision FP Load
+ InstrItinData<IIC_fpLoad32 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
+ //
+ // Double-precision FP Load
+ InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
+ //
+ // FP Load Multiple
+ InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>,
+ //
+ // FP Load Multiple + update
+ InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>,
+ //
+ // Single-precision FP Store
+ InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+ //
+ // Double-precision FP Store
+ // use FU_Issue to enforce the 1 load/store per cycle limit (stale note: this model declares only V6_Pipe, so no FU_Issue stage appears below)
+ InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+ //
+ // FP Store Multiple
+ InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>,
+ //
+ // FP Store Multiple + update
+ InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]>
+]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..3b99762f7157
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -0,0 +1,261 @@
+//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-selectiondag-info"
+
+// Emit, if possible, a specialized AEABI version of the given Libcall. Typically
+// this means selecting the most-aligned variant (plain, 4- or 8-byte), but we also
+// convert memset of 0 into memclr. Returns an empty SDValue when LC is not handled here.
+SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
+ const ARMSubtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+ const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
+
+ // Only use a specialized AEABI function if the default version of this
+ // Libcall is an AEABI function.
+ if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
+ return SDValue();
+
+ // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
+ // able to translate memset to memclr and use the value to index the function
+ // name array.
+ enum {
+ AEABI_MEMCPY = 0,
+ AEABI_MEMMOVE,
+ AEABI_MEMSET,
+ AEABI_MEMCLR
+ } AEABILibcall;
+ switch (LC) {
+ case RTLIB::MEMCPY:
+ AEABILibcall = AEABI_MEMCPY;
+ break;
+ case RTLIB::MEMMOVE:
+ AEABILibcall = AEABI_MEMMOVE;
+ break;
+ case RTLIB::MEMSET:
+ AEABILibcall = AEABI_MEMSET;
+ if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src)) // for MEMSET, Src is the fill value
+ if (ConstantSrc->getZExtValue() == 0)
+ AEABILibcall = AEABI_MEMCLR;
+ break;
+ default:
+ return SDValue();
+ }
+
+ // Choose the most-aligned libcall variant that we can
+ enum {
+ ALIGN1 = 0,
+ ALIGN4,
+ ALIGN8
+ } AlignVariant;
+ if ((Align & 7) == 0) // NOTE(review): Align == 0 also takes this branch
+ AlignVariant = ALIGN8;
+ else if ((Align & 3) == 0)
+ AlignVariant = ALIGN4;
+ else
+ AlignVariant = ALIGN1;
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Entry.Node = Dst;
+ Args.push_back(Entry);
+ if (AEABILibcall == AEABI_MEMCLR) {
+ Entry.Node = Size; // memclr takes (ptr, size) only; the zero value is implied
+ Args.push_back(Entry);
+ } else if (AEABILibcall == AEABI_MEMSET) {
+ // Adjust parameters for memset, EABI uses format (ptr, size, value),
+ // GNU library uses (ptr, value, size)
+ // See RTABI section 4.3.4
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ // Extend or truncate the argument to be an i32 value for the call.
+ if (Src.getValueType().bitsGT(MVT::i32))
+ Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+ else if (Src.getValueType().bitsLT(MVT::i32))
+ Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+
+ Entry.Node = Src;
+ Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+ Entry.isSExt = false;
+ Args.push_back(Entry);
+ } else {
+ Entry.Node = Src; // memcpy / memmove: (dst, src, size)
+ Args.push_back(Entry);
+
+ Entry.Node = Size;
+ Args.push_back(Entry);
+ }
+
+ static const char *const FunctionNames[4][3] = { // immutable table, indexed [AEABILibcall][AlignVariant]
+ { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
+ { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
+ { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
+ { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
+ };
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(
+ TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
+ TLI->getPointerTy(DAG.getDataLayout())),
+ std::move(Args))
+ .setDiscardResult();
+ std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+
+ return CallResult.second; // presumably the output chain of the lowered call
+}
+
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { // Expands memcpy into ARMISD::MEMCPY word copies plus a 1-3 byte tail, or defers to an AEABI libcall / returns an empty SDValue.
+ const ARMSubtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+ // Do repeated 4-byte loads and stores. To be improved.
+ // This requires 4-byte alignment.
+ if ((Align & 3) != 0)
+ return SDValue(); // decline: empty SDValue
+ // This requires the copy size to be a constant, preferably
+ // within a subtarget-specific limit.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+ RTLIB::MEMCPY);
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+ RTLIB::MEMCPY);
+
+ unsigned BytesLeft = SizeVal & 3; // tail bytes not covered by whole 4-byte words
+ unsigned NumMemOps = SizeVal >> 2; // number of 4-byte words to copy
+ unsigned EmittedNumMemOps = 0;
+ EVT VT = MVT::i32;
+ unsigned VTSize = 4;
+ unsigned i = 0;
+ // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
+ const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
+ SDValue TFOps[6]; // only used for the tail copies below (at most two entries for 1-3 bytes)
+ SDValue Loads[6];
+ uint64_t SrcOff = 0, DstOff = 0;
+
+ // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
+ // VLDM/VSTM and make this code emit it when appropriate. This would reduce
+ // pressure on the general purpose registers. However this seems harder to map
+ // onto the register allocator's view of the world.
+
+ // The number of MEMCPY pseudo-instructions to emit. We use up to
+ // MaxLoadsInLDM registers per MEMCPY, which will get lowered into ldm/stm
+ // later on. This is a lower bound on the number of MEMCPY operations we must
+ // emit.
+ unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
+
+ // Code size optimisation: do not inline memcpy if expansion results in
+ // more instructions than the library call.
+ if (NumMEMCPYs > 1 && DAG.getMachineFunction().getFunction()->optForMinSize()) {
+ return SDValue();
+ }
+
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
+
+ for (unsigned I = 0; I != NumMEMCPYs; ++I) {
+ // Evenly distribute registers among MEMCPY operations to reduce register
+ // pressure.
+ unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
+ unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
+
+ Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+ DAG.getConstant(NumRegs, dl, MVT::i32));
+ Src = Dst.getValue(1); // result 1 of the MEMCPY node becomes the next Src
+ Chain = Dst.getValue(2); // result 2 (MVT::Other) is the chain
+
+ DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
+ SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
+
+ EmittedNumMemOps = NextEmittedNumMemOps;
+ }
+
+ if (BytesLeft == 0)
+ return Chain;
+
+ // Issue loads / stores for the trailing (1 - 3) bytes.
+ unsigned BytesLeftSave = BytesLeft; // the store loop below replays the same size sequence
+ i = 0;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ Loads[i] = DAG.getLoad(VT, dl, Chain,
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+ DAG.getConstant(SrcOff, dl, MVT::i32)),
+ SrcPtrInfo.getWithOffset(SrcOff));
+ TFOps[i] = Loads[i].getValue(1);
+ ++i;
+ SrcOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ makeArrayRef(TFOps, i)); // all tail loads are joined before any tail store is issued
+
+ i = 0;
+ BytesLeft = BytesLeftSave;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+ DAG.getConstant(DstOff, dl, MVT::i32)),
+ DstPtrInfo.getWithOffset(DstOff));
+ ++i;
+ DstOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ makeArrayRef(TFOps, i));
+}
+
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { // No inline expansion: always defers to the specialized-libcall helper.
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+ RTLIB::MEMMOVE); // isVolatile and the pointer infos are not used here
+}
+
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const { // No inline expansion: always defers to the specialized-libcall helper; Src is the fill value.
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+ RTLIB::MEMSET); // isVolatile and DstPtrInfo are not used here
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
new file mode 100644
index 000000000000..2ddb42c95397
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -0,0 +1,69 @@
+//===-- ARMSelectionDAGInfo.h - ARM SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H
+
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+namespace ARM_AM {
+ static inline ShiftOpc getShiftOpcForNode(unsigned Opcode) { // Maps an ISD shift/rotate opcode to the ARM_AM shift kind.
+ switch (Opcode) {
+ default: return ARM_AM::no_shift; // every opcode not listed below
+ case ISD::SHL: return ARM_AM::lsl;
+ case ISD::SRL: return ARM_AM::lsr;
+ case ISD::SRA: return ARM_AM::asr;
+ case ISD::ROTR: return ARM_AM::ror;
+ //case ISD::ROTL: // Only if imm -> turn into ROTR.
+ // Can't handle RRX here, because it would require folding a flag into
+ // the addressing mode. :( This causes us to miss certain things.
+ //case ARMISD::RRX: return ARM_AM::rrx;
+ }
+ }
+} // end namespace ARM_AM
+
+class ARMSelectionDAGInfo : public SelectionDAGTargetInfo { // ARM overrides for memcpy / memmove / memset lowering
+public:
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+
+ SDValue
+ EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+ SDValue Dst, SDValue Src, SDValue Size,
+ unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+
+ // Adjust parameters for memset, see RTABI section 4.3.4. Src is the fill value.
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
+
+ SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl, // shared helper used by the three hooks above
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ RTLIB::Libcall LC) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
new file mode 100644
index 000000000000..e2df0bddd0d1
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -0,0 +1,382 @@
+//===-- ARMSubtarget.cpp - ARM Subtarget Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/TargetParser.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "ARMGenSubtargetInfo.inc"
+
+static cl::opt<bool>
+UseFusedMulOps("arm-use-mulops",
+ cl::init(true), cl::Hidden); // hidden flag, on by default; copied into UseMulOps by the ARMSubtarget constructor
+
+enum ITMode {
+ DefaultIT, // RestrictIT follows hasV8Ops()
+ RestrictedIT, // RestrictIT forced to true
+ NoRestrictedIT // RestrictIT forced to false
+};
+
+static cl::opt<ITMode>
+IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), // no option name is given; presumably each value below is selected by its own flag name
+ cl::ZeroOrMore,
+ cl::values(clEnumValN(DefaultIT, "arm-default-it",
+ "Generate IT block based on arch"),
+ clEnumValN(RestrictedIT, "arm-restrict-it",
+ "Disallow deprecated IT based on ARMv8"),
+ clEnumValN(NoRestrictedIT, "arm-no-restrict-it",
+ "Allow IT blocks based on ARMv7")));
+
+/// ForceFastISel - Use the fast-isel, even for subtargets where it is not
+/// currently supported (for testing only).
+static cl::opt<bool>
+ForceFastISel("arm-force-fast-isel",
+ cl::init(false), cl::Hidden);
+
+/// initializeSubtargetDependencies - Initializes using a CPU and feature string
+/// so that we can use initializer lists for subtarget initialization.
+ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ initializeEnvironment(); // sets UseSjLjEH from the target triple
+ initSubtargetFeatures(CPU, FS); // parses features and derives the tuning fields
+ return *this; // returned by reference so the call can be used inside a member initializer
+}
+
+/// EnableExecuteOnly - Enables the generation of execute-only code on supported
+/// targets
+static cl::opt<bool>
+EnableExecuteOnly("arm-execute-only"); // copied into GenExecuteOnly by the ARMSubtarget constructor
+
+ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
+ StringRef FS) { // Parses the subtarget features, then allocates the matching frame-lowering object.
+ ARMSubtarget &STI = initializeSubtargetDependencies(CPU, FS); // must run before isThumb1Only() is queried
+ if (STI.isThumb1Only())
+ return (ARMFrameLowering *)new Thumb1FrameLowering(STI);
+
+ return new ARMFrameLowering(STI); // raw new; the constructor stores the result in FrameLowering
+}
+
+ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS,
+ const ARMBaseTargetMachine &TM, bool IsLittle) // Builds the subtarget; feature parsing runs as a side effect of initializing FrameLowering.
+ : ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps),
+ GenExecuteOnly(EnableExecuteOnly), CPUString(CPU), IsLittle(IsLittle),
+ TargetTriple(TT), Options(TM.Options), TM(TM),
+ FrameLowering(initializeFrameLowering(CPU, FS)),
+ // At this point initializeSubtargetDependencies has been called so
+ // we can query directly.
+ InstrInfo(isThumb1Only()
+ ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this)
+ : !isThumb()
+ ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this)
+ : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)),
+ TLInfo(TM, *this), GISel() {} // GISel starts default-initialized; the GlobalISel accessors assert that it has been set
+
+const CallLowering *ARMSubtarget::getCallLowering() const { // Forwards to the GISel accessor object.
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getCallLowering();
+}
+
+const InstructionSelector *ARMSubtarget::getInstructionSelector() const { // Forwards to the GISel accessor object.
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getInstructionSelector();
+}
+
+const LegalizerInfo *ARMSubtarget::getLegalizerInfo() const { // Forwards to the GISel accessor object.
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getLegalizerInfo();
+}
+
+const RegisterBankInfo *ARMSubtarget::getRegBankInfo() const { // Forwards to the GISel accessor object.
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getRegBankInfo();
+}
+
+bool ARMSubtarget::isXRaySupported() const {
+ // We don't currently support Thumb, but Windows requires Thumb.
+ return hasV6Ops() && hasARMOps() && !isTargetWindows();
+}
+
+void ARMSubtarget::initializeEnvironment() {
+ // MCAsmInfo isn't always present (e.g. in opt) so we can't initialize this
+ // directly from it, but we can try to make sure they're consistent when both
+ // available.
+ UseSjLjEH = isTargetDarwin() && !isTargetWatchABI(); // SjLj exceptions on Darwin, except for the watch ABI
+ assert((!TM.getMCAsmInfo() ||
+ (TM.getMCAsmInfo()->getExceptionHandlingType() ==
+ ExceptionHandling::SjLj) == UseSjLjEH) &&
+ "inconsistent sjlj choice between CodeGen and MC");
+}
+
+void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // Resolves the CPU name, parses the feature string, then derives per-subtarget tuning fields.
+ if (CPUString.empty()) {
+ CPUString = "generic";
+
+ if (isTargetDarwin()) {
+ StringRef ArchName = TargetTriple.getArchName();
+ unsigned ArchKind = llvm::ARM::parseArch(ArchName);
+ if (ArchKind == llvm::ARM::AK_ARMV7S)
+ // Default to the Swift CPU when targeting armv7s/thumbv7s.
+ CPUString = "swift";
+ else if (ArchKind == llvm::ARM::AK_ARMV7K)
+ // Default to the Cortex-a7 CPU when targeting armv7k/thumbv7k.
+ // ARMv7k does not use SjLj exception handling.
+ CPUString = "cortex-a7";
+ }
+ }
+
+ // Insert the architecture feature derived from the target triple into the
+ // feature string. This is important for setting features that are implied
+ // based on the architecture version.
+ std::string ArchFS = ARM_MC::ParseARMTriple(TargetTriple, CPUString);
+ if (!FS.empty()) {
+ if (!ArchFS.empty())
+ ArchFS = (Twine(ArchFS) + "," + FS).str(); // triple-derived features first, then the explicit ones
+ else
+ ArchFS = FS;
+ }
+ ParseSubtargetFeatures(CPUString, ArchFS);
+
+ // FIXME: This used to enable V6T2 support implicitly for Thumb2 mode.
+ // Assert this for now to make the change obvious.
+ assert(hasV6T2Ops() || !hasThumb2());
+
+ // Execute only support requires movt support
+ if (genExecuteOnly()) // NOTE: the check below is an assert, so it is compiled out under NDEBUG
+ assert(hasV8MBaselineOps() && !NoMovt && "Cannot generate execute-only code for this target");
+
+ // Keep a pointer to static instruction cost data for the specified CPU.
+ SchedModel = getSchedModelForCPU(CPUString);
+
+ // Initialize scheduling itinerary for the specified CPU.
+ InstrItins = getInstrItineraryForCPU(CPUString);
+
+ // FIXME: this is invalid for WindowsCE
+ if (isTargetWindows())
+ NoARM = true;
+
+ if (isAAPCS_ABI())
+ stackAlignment = 8;
+ if (isTargetNaCl() || isAAPCS16_ABI()) // checked second, so 16 overrides the 8 set just above
+ stackAlignment = 16;
+
+ // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
+ // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
+ // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
+ // support in the assembler and linker to be used. This would need to be
+ // fixed to fully support tail calls in Thumb1.
+ //
+ // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
+ // LR. This means if we need to reload LR, it takes an extra instruction,
+ // which outweighs the value of the tail call; but here we don't know yet
+ // whether LR is going to be used. Probably the right approach is to
+ // generate the tail call here and turn it back into CALL/RET in
+ // emitEpilogue if LR is used.
+
+ // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
+ // but we need to make sure there are enough registers; the only valid
+ // registers are the 4 used for parameters. We don't currently do this
+ // case.
+
+ SupportsTailCall = !isThumb() || hasV8MBaselineOps();
+
+ if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
+ SupportsTailCall = false;
+
+ switch (IT) { // command-line override for IT block restriction
+ case DefaultIT:
+ RestrictIT = hasV8Ops();
+ break;
+ case RestrictedIT:
+ RestrictIT = true;
+ break;
+ case NoRestrictedIT:
+ RestrictIT = false;
+ break;
+ }
+
+ // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
+ const FeatureBitset &Bits = getFeatureBits();
+ if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
+ (Options.UnsafeFPMath || isTargetDarwin()))
+ UseNEONForSinglePrecisionFP = true;
+
+ if (isRWPI())
+ ReserveR9 = true;
+
+ // FIXME: Teach TableGen to deal with these instead of doing it manually here.
+ switch (ARMProcFamily) { // per-family tuning; families that only 'break' keep whatever values the fields already hold
+ case Others:
+ case CortexA5:
+ break;
+ case CortexA7:
+ LdStMultipleTiming = DoubleIssue;
+ break;
+ case CortexA8:
+ LdStMultipleTiming = DoubleIssue;
+ break;
+ case CortexA9:
+ LdStMultipleTiming = DoubleIssueCheckUnalignedAccess;
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ case CortexA12:
+ break;
+ case CortexA15:
+ MaxInterleaveFactor = 2;
+ PreISelOperandLatencyAdjustment = 1;
+ PartialUpdateClearance = 12;
+ break;
+ case CortexA17:
+ case CortexA32:
+ case CortexA35:
+ case CortexA53:
+ case CortexA57:
+ case CortexA72:
+ case CortexA73:
+ case CortexR4:
+ case CortexR4F:
+ case CortexR5:
+ case CortexR7:
+ case CortexM3:
+ case ExynosM1:
+ case CortexR52:
+ break;
+ case Krait:
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ case Swift:
+ MaxInterleaveFactor = 2;
+ LdStMultipleTiming = SingleIssuePlusExtras;
+ PreISelOperandLatencyAdjustment = 1;
+ PartialUpdateClearance = 12;
+ break;
+ }
+}
+
+bool ARMSubtarget::isAPCS_ABI() const { // True iff the target machine's ABI is ARM_ABI_APCS.
+ assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN); // the ABI must not be unknown when queried
+ return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_APCS;
+}
+bool ARMSubtarget::isAAPCS_ABI() const { // True for both ARM_ABI_AAPCS and ARM_ABI_AAPCS16.
+ assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN); // the ABI must not be unknown when queried
+ return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS ||
+ TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16; // AAPCS16 is also reported as AAPCS
+}
+bool ARMSubtarget::isAAPCS16_ABI() const { // True iff the target machine's ABI is exactly ARM_ABI_AAPCS16.
+ assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN); // the ABI must not be unknown when queried
+ return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+}
+
+bool ARMSubtarget::isROPI() const { // True for the ROPI and the combined ROPI_RWPI relocation models.
+ return TM.getRelocationModel() == Reloc::ROPI ||
+ TM.getRelocationModel() == Reloc::ROPI_RWPI;
+}
+bool ARMSubtarget::isRWPI() const { // True for the RWPI and the combined ROPI_RWPI relocation models.
+ return TM.getRelocationModel() == Reloc::RWPI ||
+ TM.getRelocationModel() == Reloc::ROPI_RWPI;
+}
+
+bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const { // True if GV will be accessed via an indirect symbol.
+ if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) // not assumed DSO-local => indirect
+ return true;
+
+ // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
+ // the section that is being relocated. This means we have to use a load even
+ // for GVs that are known to be local to the dso.
+ if (isTargetMachO() && TM.isPositionIndependent() &&
+ (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
+ return true;
+
+ return false;
+}
+
+unsigned ARMSubtarget::getMispredictionPenalty() const { // Returns the MispredictPenalty field of this subtarget's SchedModel.
+ return SchedModel.MispredictPenalty;
+}
+
+bool ARMSubtarget::hasSinCos() const { // True on watchOS, and on iOS unless the OS version is below 7.0.
+ return isTargetWatchOS() ||
+ (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0));
+}
+
+bool ARMSubtarget::enableMachineScheduler() const { // True only for Swift with an out-of-order scheduling model.
+ // Enable the MachineScheduler before register allocation for out-of-order
+ // architectures where we do not use the PostRA scheduler anymore (for now
+ // restricted to Swift).
+ return getSchedModel().isOutOfOrder() && isSwift();
+}
+
+// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
+bool ARMSubtarget::enablePostRAScheduler() const {
+ // No need for PostRA scheduling on out-of-order CPUs (for now restricted to
+ // Swift); this is the exact condition under which enableMachineScheduler() is true.
+ if (getSchedModel().isOutOfOrder() && isSwift())
+ return false;
+ return (!isThumb() || hasThumb2()); // i.e. everything except Thumb mode without Thumb2
+}
+
+bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier(); } // Expand atomics iff hasAnyDataBarrier() holds.
+
+bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
+ // For general targets, the prologue can grow when VFPs are allocated with
+ // stride 4 (more vpush instructions). But WatchOS uses a compact unwind
+ // format, and getting that right is more important.
+ return isTargetWatchABI() || (isSwift() && !MF.getFunction()->optForMinSize()); // Watch ABI always; Swift unless optimizing for minimum size
+}
+
+bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
+ // NOTE Windows on ARM needs to use MOVW/MOVT pairs to materialise 32-bit
+ // immediates as it is inherently position independent, and may be out of
+ // range otherwise.
+ return !NoMovt && hasV8MBaselineOps() && // prerequisites: MOVT not disabled and v8-M Baseline ops available
+ (isTargetWindows() || !MF.getFunction()->optForMinSize() || genExecuteOnly()); // then: Windows, not optimizing for min size, or execute-only
+}
+
+bool ARMSubtarget::useFastISel() const { // True if fast-isel is used for this subtarget.
+ // Enable fast-isel for any target, for testing only.
+ if (ForceFastISel)
+ return true;
+
+ // Limit fast-isel to the targets that are or have been tested.
+ if (!hasV6Ops())
+ return false;
+
+ // MachO: ARM and Thumb2 (anything but Thumb1-only); Linux and NaCl: ARM mode only.
+ return TM.Options.EnableFastISel &&
+ ((isTargetMachO() && !isThumb1Only()) ||
+ (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
new file mode 100644
index 000000000000..8c8218d0f432
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -0,0 +1,661 @@
+//===-- ARMSubtarget.h - Define Subtarget for the ARM ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
+#define LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
+
+
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "ARMGenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetOptions;
+class ARMBaseTargetMachine;
+
+class ARMSubtarget : public ARMGenSubtargetInfo { // ARM subtarget description: ISA/feature flags, tuning parameters and target queries.
+protected:
+ enum ARMProcFamilyEnum {
+ Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
+ CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexR52, CortexM3,
+ CortexA32, CortexA35, CortexA53, CortexA57, CortexA72, CortexA73,
+ Krait, Swift, ExynosM1
+ };
+ enum ARMProcClassEnum {
+ None, AClass, RClass, MClass
+ };
+ enum ARMArchEnum {
+ ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
+ ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
+ ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a, ARMv8mMainline, ARMv8mBaseline,
+ ARMv8r
+ };
+
+public:
+ /// What kind of timing do load multiple/store multiple instructions have.
+ enum ARMLdStMultipleTiming {
+ /// Can load/store 2 registers/cycle.
+ DoubleIssue,
+ /// Can load/store 2 registers/cycle, but needs an extra cycle if the access
+ /// is not 64-bit aligned.
+ DoubleIssueCheckUnalignedAccess,
+ /// Can load/store 1 register/cycle.
+ SingleIssue,
+ /// Can load/store 1 register/cycle, but needs an extra cycle for address
+ /// computation and potentially also for register writeback.
+ SingleIssuePlusExtras,
+ };
+
+protected:
+ /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
+ ARMProcFamilyEnum ARMProcFamily = Others;
+
+ /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
+ ARMProcClassEnum ARMProcClass = None;
+
+ /// ARMArch - ARM architecture
+ ARMArchEnum ARMArch = ARMv4t;
+
+ /// HasV4TOps, HasV5TOps, HasV5TEOps,
+ /// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
+ /// Specify whether the target supports specific ARM ISA variants.
+ bool HasV4TOps = false;
+ bool HasV5TOps = false;
+ bool HasV5TEOps = false;
+ bool HasV6Ops = false;
+ bool HasV6MOps = false;
+ bool HasV6KOps = false;
+ bool HasV6T2Ops = false;
+ bool HasV7Ops = false;
+ bool HasV8Ops = false;
+ bool HasV8_1aOps = false;
+ bool HasV8_2aOps = false;
+ bool HasV8MBaselineOps = false;
+ bool HasV8MMainlineOps = false;
+
+ /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
+ /// floating point ISAs are supported.
+ bool HasVFPv2 = false;
+ bool HasVFPv3 = false;
+ bool HasVFPv4 = false;
+ bool HasFPARMv8 = false;
+ bool HasNEON = false;
+
+ /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
+ /// specified. Use the method useNEONForSinglePrecisionFP() to
+ /// determine if NEON should actually be used.
+ bool UseNEONForSinglePrecisionFP = false;
+
+ /// UseMulOps - True if non-microcoded fused integer multiply-add and
+ /// multiply-subtract instructions should be used.
+ bool UseMulOps = false;
+
+ /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
+ /// whether the FP VML[AS] instructions are slow (if so, don't use them).
+ bool SlowFPVMLx = false;
+
+ /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
+ /// forwarding to allow mul + mla being issued back to back.
+ bool HasVMLxForwarding = false;
+
+ /// SlowFPBrcc - True if floating point compare + branch is slow.
+ bool SlowFPBrcc = false;
+
+ /// InThumbMode - True if compiling for Thumb, false for ARM.
+ bool InThumbMode = false;
+
+ /// UseSoftFloat - True if we're using software floating point features.
+ bool UseSoftFloat = false;
+
+ /// HasThumb2 - True if Thumb2 instructions are supported.
+ bool HasThumb2 = false;
+
+ /// NoARM - True if subtarget does not support ARM mode execution.
+ bool NoARM = false;
+
+ /// ReserveR9 - True if R9 is not available as a general purpose register.
+ bool ReserveR9 = false;
+
+ /// NoMovt - True if MOVT / MOVW pairs are not used for materialization of
+ /// 32-bit imms (including global addresses).
+ bool NoMovt = false;
+
+ /// SupportsTailCall - True if the OS supports tail call. The dynamic linker
+ /// must be able to synthesize call stubs for interworking between ARM and
+ /// Thumb.
+ bool SupportsTailCall = false;
+
+ /// HasFP16 - True if subtarget supports half-precision FP conversions
+ bool HasFP16 = false;
+
+ /// HasFullFP16 - True if subtarget supports half-precision FP operations
+ bool HasFullFP16 = false;
+
+ /// HasD16 - True if subtarget is limited to 16 double precision
+ /// FP registers for VFPv3.
+ bool HasD16 = false;
+
+ /// HasHardwareDivide - True if subtarget supports [su]div
+ bool HasHardwareDivide = false;
+
+ /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
+ bool HasHardwareDivideInARM = false;
+
+ /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
+ /// instructions.
+ bool HasT2ExtractPack = false;
+
+ /// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
+ /// instructions.
+ bool HasDataBarrier = false;
+
+ /// HasV7Clrex - True if the subtarget supports CLREX instructions
+ bool HasV7Clrex = false;
+
+ /// HasAcquireRelease - True if the subtarget supports v8 atomics (LDA/LDAEX etc)
+ /// instructions
+ bool HasAcquireRelease = false;
+
+ /// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions
+ /// over 16-bit ones.
+ bool Pref32BitThumb = false;
+
+ /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions
+ /// that partially update CPSR and add false dependency on the previous
+ /// CPSR setting instruction.
+ bool AvoidCPSRPartialUpdate = false;
+
+ /// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
+ /// movs with shifter operand (i.e. asr, lsl, lsr).
+ bool AvoidMOVsShifterOperand = false;
+
+ /// HasRetAddrStack - Some processors perform return stack prediction. CodeGen should
+ /// avoid issuing "normal" call instructions to callees which do not return.
+ bool HasRetAddrStack = false;
+
+ /// HasMPExtension - True if the subtarget supports Multiprocessing
+ /// extension (ARMv7 only).
+ bool HasMPExtension = false;
+
+ /// HasVirtualization - True if the subtarget supports the Virtualization
+ /// extension.
+ bool HasVirtualization = false;
+
+ /// FPOnlySP - If true, the floating point unit only supports single
+ /// precision.
+ bool FPOnlySP = false;
+
+ /// If true, the processor supports the Performance Monitor Extensions. These
+ /// include a generic cycle-counter as well as more fine-grained (often
+ /// implementation-specific) events.
+ bool HasPerfMon = false;
+
+ /// HasTrustZone - if true, processor supports TrustZone security extensions
+ bool HasTrustZone = false;
+
+ /// Has8MSecExt - if true, processor supports ARMv8-M Security Extensions
+ bool Has8MSecExt = false;
+
+ /// HasCrypto - if true, processor supports Cryptography extensions
+ bool HasCrypto = false;
+
+ /// HasCRC - if true, processor supports CRC instructions
+ bool HasCRC = false;
+
+ /// HasRAS - if true, the processor supports RAS extensions
+ bool HasRAS = false;
+
+ /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
+ /// particularly effective at zeroing a VFP register.
+ bool HasZeroCycleZeroing = false;
+
+ /// HasFPAO - if true, processor does positive address offset computation faster
+ bool HasFPAO = false;
+
+ /// If true, if conversion may decide to leave some instructions unpredicated.
+ bool IsProfitableToUnpredicate = false;
+
+ /// If true, VMOV will be favored over VGETLNi32.
+ bool HasSlowVGETLNi32 = false;
+
+ /// If true, VMOV will be favored over VDUP.
+ bool HasSlowVDUP32 = false;
+
+ /// If true, VMOVSR will be favored over VMOVDRR.
+ bool PreferVMOVSR = false;
+
+ /// If true, ISHST barriers will be used for Release semantics.
+ bool PreferISHST = false;
+
+ /// If true, a VLDM/VSTM starting with an odd register number is considered to
+ /// take more microops than single VLDRS/VSTRS.
+ bool SlowOddRegister = false;
+
+ /// If true, loading into a D subregister will be penalized.
+ bool SlowLoadDSubregister = false;
+
+ /// If true, the AGU and NEON/FPU units are multiplexed.
+ bool HasMuxedUnits = false;
+
+ /// If true, VMOVS will never be widened to VMOVD
+ bool DontWidenVMOVS = false;
+
+ /// If true, run the MLx expansion pass.
+ bool ExpandMLx = false;
+
+ /// If true, VFP/NEON VMLA/VMLS have special RAW hazards.
+ bool HasVMLxHazards = false;
+
+ /// If true, VMOVRS, VMOVSR and VMOVS will be converted from VFP to NEON.
+ bool UseNEONForFPMovs = false;
+
+ /// If true, VLDn instructions take an extra cycle for unaligned accesses.
+ bool CheckVLDnAlign = false;
+
+ /// If true, VFP instructions are not pipelined.
+ bool NonpipelinedVFP = false;
+
+ /// StrictAlign - If true, the subtarget disallows unaligned memory
+ /// accesses for some types. For details, see
+ /// ARMTargetLowering::allowsMisalignedMemoryAccesses().
+ bool StrictAlign = false;
+
+ /// RestrictIT - If true, the subtarget disallows generation of deprecated IT
+ /// blocks to conform to ARMv8 rule.
+ bool RestrictIT = false;
+
+ /// HasDSP - If true, the subtarget supports the DSP (saturating arith
+ /// and such) instructions.
+ bool HasDSP = false;
+
+ /// NaCl TRAP instruction is generated instead of the regular TRAP.
+ bool UseNaClTrap = false;
+
+ /// Generate calls via indirect call instructions.
+ bool GenLongCalls = false;
+
+ /// Generate code that does not contain data access to code sections.
+ bool GenExecuteOnly = false;
+
+ /// Target machine allowed unsafe FP math (such as use of NEON fp)
+ bool UnsafeFPMath = false;
+
+ /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS).
+ bool UseSjLjEH = false;
+
+ /// stackAlignment - The minimum alignment known to hold of the stack frame on
+ /// entry to the function and which must be maintained by every function.
+ unsigned stackAlignment = 4;
+
+ /// CPUString - String name of used CPU.
+ std::string CPUString;
+
+ unsigned MaxInterleaveFactor = 1;
+
+ /// Clearance before partial register updates (in number of instructions)
+ unsigned PartialUpdateClearance = 0;
+
+ /// What kind of timing do load multiple/store multiple have (double issue,
+ /// single issue etc).
+ ARMLdStMultipleTiming LdStMultipleTiming = SingleIssue;
+
+ /// The adjustment that we need to apply to get the operand latency from the
+ /// operand cycle returned by the itinerary data for pre-ISel operands.
+ int PreISelOperandLatencyAdjustment = 2;
+
+ /// IsLittle - The target is Little Endian
+ bool IsLittle; // no in-class default; NOTE(review): presumably set from the constructor's IsLittle argument - confirm
+
+ /// TargetTriple - What processor and OS we're targeting.
+ Triple TargetTriple;
+
+ /// SchedModel - Processor specific instruction costs.
+ MCSchedModel SchedModel;
+
+ /// Selected instruction itineraries (one entry per itinerary class.)
+ InstrItineraryData InstrItins;
+
+ /// Options passed via command line that could influence the target
+ const TargetOptions &Options;
+
+ const ARMBaseTargetMachine &TM;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ ///
+ ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+ const ARMBaseTargetMachine &TM, bool IsLittle);
+
+ /// This object will take ownership of \p GISelAccessor.
+ void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); }
+
+ /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const {
+ return 64;
+ }
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ /// initializeSubtargetDependencies - Initializes using a CPU and feature string
+ /// so that we can use initializer lists for subtarget initialization.
+ ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+ const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const ARMBaseInstrInfo *getInstrInfo() const override {
+ return InstrInfo.get();
+ }
+ const ARMTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const ARMFrameLowering *getFrameLowering() const override {
+ return FrameLowering.get();
+ }
+ const ARMBaseRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo->getRegisterInfo();
+ }
+
+ const CallLowering *getCallLowering() const override;
+ const InstructionSelector *getInstructionSelector() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
+
+private:
+ ARMSelectionDAGInfo TSInfo;
+ // Either Thumb1FrameLowering or ARMFrameLowering.
+ std::unique_ptr<ARMFrameLowering> FrameLowering;
+ // Either Thumb1InstrInfo or Thumb2InstrInfo.
+ std::unique_ptr<ARMBaseInstrInfo> InstrInfo;
+ ARMTargetLowering TLInfo;
+
+ /// Gather the accessor points to GlobalISel-related APIs.
+ /// This is used to avoid ifndefs spreading around while GISel is
+ /// an optional library.
+ std::unique_ptr<GISelAccessor> GISel;
+
+ void initializeEnvironment();
+ void initSubtargetFeatures(StringRef CPU, StringRef FS);
+ ARMFrameLowering *initializeFrameLowering(StringRef CPU, StringRef FS);
+
+public:
+ void computeIssueWidth();
+
+ bool hasV4TOps() const { return HasV4TOps; }
+ bool hasV5TOps() const { return HasV5TOps; }
+ bool hasV5TEOps() const { return HasV5TEOps; }
+ bool hasV6Ops() const { return HasV6Ops; }
+ bool hasV6MOps() const { return HasV6MOps; }
+ bool hasV6KOps() const { return HasV6KOps; }
+ bool hasV6T2Ops() const { return HasV6T2Ops; }
+ bool hasV7Ops() const { return HasV7Ops; }
+ bool hasV8Ops() const { return HasV8Ops; }
+ bool hasV8_1aOps() const { return HasV8_1aOps; }
+ bool hasV8_2aOps() const { return HasV8_2aOps; }
+ bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
+ bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
+
+ /// @{
+ /// These functions are obsolete, please consider adding subtarget features
+ /// or properties instead of calling them.
+ bool isCortexA5() const { return ARMProcFamily == CortexA5; }
+ bool isCortexA7() const { return ARMProcFamily == CortexA7; }
+ bool isCortexA8() const { return ARMProcFamily == CortexA8; }
+ bool isCortexA9() const { return ARMProcFamily == CortexA9; }
+ bool isCortexA15() const { return ARMProcFamily == CortexA15; }
+ bool isSwift() const { return ARMProcFamily == Swift; }
+ bool isCortexM3() const { return ARMProcFamily == CortexM3; }
+ bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
+ bool isCortexR5() const { return ARMProcFamily == CortexR5; }
+ bool isKrait() const { return ARMProcFamily == Krait; }
+ /// @}
+
+ bool hasARMOps() const { return !NoARM; }
+
+ bool hasVFP2() const { return HasVFPv2; }
+ bool hasVFP3() const { return HasVFPv3; }
+ bool hasVFP4() const { return HasVFPv4; }
+ bool hasFPARMv8() const { return HasFPARMv8; }
+ bool hasNEON() const { return HasNEON; }
+ bool hasCrypto() const { return HasCrypto; }
+ bool hasCRC() const { return HasCRC; }
+ bool hasRAS() const { return HasRAS; }
+ bool hasVirtualization() const { return HasVirtualization; }
+ bool useNEONForSinglePrecisionFP() const { // the flag only takes effect when NEON is present
+ return hasNEON() && UseNEONForSinglePrecisionFP;
+ }
+
+ bool hasDivide() const { return HasHardwareDivide; }
+ bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
+ bool hasT2ExtractPack() const { return HasT2ExtractPack; }
+ bool hasDataBarrier() const { return HasDataBarrier; }
+ bool hasV7Clrex() const { return HasV7Clrex; }
+ bool hasAcquireRelease() const { return HasAcquireRelease; }
+ bool hasAnyDataBarrier() const { // the data-barrier feature, or v6 ops outside Thumb mode
+ return HasDataBarrier || (hasV6Ops() && !isThumb());
+ }
+ bool useMulOps() const { return UseMulOps; }
+ bool useFPVMLx() const { return !SlowFPVMLx; }
+ bool hasVMLxForwarding() const { return HasVMLxForwarding; }
+ bool isFPBrccSlow() const { return SlowFPBrcc; }
+ bool isFPOnlySP() const { return FPOnlySP; }
+ bool hasPerfMon() const { return HasPerfMon; }
+ bool hasTrustZone() const { return HasTrustZone; }
+ bool has8MSecExt() const { return Has8MSecExt; }
+ bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+ bool hasFPAO() const { return HasFPAO; }
+ bool isProfitableToUnpredicate() const { return IsProfitableToUnpredicate; }
+ bool hasSlowVGETLNi32() const { return HasSlowVGETLNi32; }
+ bool hasSlowVDUP32() const { return HasSlowVDUP32; }
+ bool preferVMOVSR() const { return PreferVMOVSR; }
+ bool preferISHSTBarriers() const { return PreferISHST; }
+ bool expandMLx() const { return ExpandMLx; }
+ bool hasVMLxHazards() const { return HasVMLxHazards; }
+ bool hasSlowOddRegister() const { return SlowOddRegister; }
+ bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; }
+ bool hasMuxedUnits() const { return HasMuxedUnits; }
+ bool dontWidenVMOVS() const { return DontWidenVMOVS; }
+ bool useNEONForFPMovs() const { return UseNEONForFPMovs; }
+ bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; }
+ bool nonpipelinedVFP() const { return NonpipelinedVFP; }
+ bool prefers32BitThumb() const { return Pref32BitThumb; }
+ bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
+ bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
+ bool hasRetAddrStack() const { return HasRetAddrStack; }
+ bool hasMPExtension() const { return HasMPExtension; }
+ bool hasDSP() const { return HasDSP; }
+ bool useNaClTrap() const { return UseNaClTrap; }
+ bool useSjLjEH() const { return UseSjLjEH; }
+ bool genLongCalls() const { return GenLongCalls; }
+ bool genExecuteOnly() const { return GenExecuteOnly; }
+
+ bool hasFP16() const { return HasFP16; }
+ bool hasD16() const { return HasD16; }
+ bool hasFullFP16() const { return HasFullFP16; }
+
+ const Triple &getTargetTriple() const { return TargetTriple; }
+
+ bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+ bool isTargetIOS() const { return TargetTriple.isiOS(); }
+ bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); }
+ bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); }
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); }
+ bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+ bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+ // ARM EABI is the bare-metal EABI described in ARM ABI documents and
+ // can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
+ // FIXME: Add a flag for bare-metal for that target and set Triple::EABI
+ // even for GNUEABI, so we can make a distinction here and still conform to
+ // the EABI on GNU (and Android) mode. This requires change in Clang, too.
+ // FIXME: The Darwin exception is temporary, while we move users to
+ // "*-*-*-macho" triples as quickly as possible.
+ bool isTargetAEABI() const {
+ return (TargetTriple.getEnvironment() == Triple::EABI ||
+ TargetTriple.getEnvironment() == Triple::EABIHF) &&
+ !isTargetDarwin() && !isTargetWindows();
+ }
+ bool isTargetGNUAEABI() const {
+ return (TargetTriple.getEnvironment() == Triple::GNUEABI ||
+ TargetTriple.getEnvironment() == Triple::GNUEABIHF) &&
+ !isTargetDarwin() && !isTargetWindows();
+ }
+ bool isTargetMuslAEABI() const {
+ return (TargetTriple.getEnvironment() == Triple::MuslEABI ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF) &&
+ !isTargetDarwin() && !isTargetWindows();
+ }
+
+ // ARM Targets that support EHABI exception handling standard
+ // Darwin uses SjLj. Other targets might need more checks.
+ bool isTargetEHABICompatible() const {
+ return (TargetTriple.getEnvironment() == Triple::EABI ||
+ TargetTriple.getEnvironment() == Triple::GNUEABI ||
+ TargetTriple.getEnvironment() == Triple::MuslEABI ||
+ TargetTriple.getEnvironment() == Triple::EABIHF ||
+ TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
+ isTargetAndroid()) &&
+ !isTargetDarwin() && !isTargetWindows();
+ }
+
+ bool isTargetHardFloat() const {
+ // FIXME: this is invalid for WindowsCE
+ return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
+ TargetTriple.getEnvironment() == Triple::EABIHF ||
+ isTargetWindows() || isAAPCS16_ABI();
+ }
+ bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+
+ virtual bool isXRaySupported() const override; // NOTE(review): 'virtual' is redundant next to 'override'
+
+ bool isAPCS_ABI() const;
+ bool isAAPCS_ABI() const;
+ bool isAAPCS16_ABI() const;
+
+ bool isROPI() const;
+ bool isRWPI() const;
+
+ bool useSoftFloat() const { return UseSoftFloat; }
+ bool isThumb() const { return InThumbMode; }
+ bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
+ bool isThumb2() const { return InThumbMode && HasThumb2; }
+ bool hasThumb2() const { return HasThumb2; }
+ bool isMClass() const { return ARMProcClass == MClass; }
+ bool isRClass() const { return ARMProcClass == RClass; }
+ bool isAClass() const { return ARMProcClass == AClass; }
+
+ bool isR9Reserved() const { // on MachO, R9 is additionally reserved whenever v6 ops are absent
+ return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9;
+ }
+
+ bool useR7AsFramePointer() const { // Darwin always; otherwise Thumb mode on non-Windows targets
+ return isTargetDarwin() || (!isTargetWindows() && isThumb());
+ }
+ /// Returns true if the frame setup is split into two separate pushes (first
+ /// r0-r7,lr then r8-r11), principally so that the frame pointer is adjacent
+ /// to lr. This is always required on Thumb1-only targets, as the push and
+ /// pop instructions can't access the high registers.
+ bool splitFramePushPop(const MachineFunction &MF) const {
+ return (useR7AsFramePointer() &&
+ MF.getTarget().Options.DisableFramePointerElim(MF)) ||
+ isThumb1Only();
+ }
+
+ bool useStride4VFPs(const MachineFunction &MF) const;
+
+ bool useMovt(const MachineFunction &MF) const;
+
+ bool supportsTailCall() const { return SupportsTailCall; }
+
+ bool allowsUnalignedMem() const { return !StrictAlign; }
+
+ bool restrictIT() const { return RestrictIT; }
+
+ const std::string & getCPUString() const { return CPUString; }
+
+ bool isLittle() const { return IsLittle; }
+
+ unsigned getMispredictionPenalty() const;
+
+ /// This function returns true if the target has sincos() routine in its
+ /// compiler runtime or math libraries.
+ bool hasSinCos() const;
+
+ /// Returns true if machine scheduler should be enabled.
+ bool enableMachineScheduler() const override;
+
+ /// True for some subtargets at > -O0.
+ bool enablePostRAScheduler() const override;
+
+ /// enableAtomicExpand - True if we need to expand our atomics.
+ bool enableAtomicExpand() const override;
+
+ /// getInstrItineraryData - Return the instruction itineraries based on
+ /// subtarget selection.
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ /// getStackAlignment - Returns the minimum alignment known to hold of the
+ /// stack frame on entry to the function and which must be maintained by every
+ /// function for this subtarget.
+ unsigned getStackAlignment() const { return stackAlignment; }
+
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+
+ unsigned getPartialUpdateClearance() const { return PartialUpdateClearance; }
+
+ ARMLdStMultipleTiming getLdStMultipleTiming() const {
+ return LdStMultipleTiming;
+ }
+
+ int getPreISelOperandLatencyAdjustment() const {
+ return PreISelOperandLatencyAdjustment;
+ }
+
+ /// True if the GV will be accessed via an indirect symbol.
+ bool isGVIndirectSymbol(const GlobalValue *GV) const;
+
+ /// True if fast-isel is used.
+ bool useFastISel() const;
+};
+} // End llvm namespace
+
+#endif // LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
new file mode 100644
index 000000000000..70c9567d99f8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -0,0 +1,544 @@
+//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file implements the ARM specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetMachine.h"
+#include "ARM.h"
+#include "ARMCallLowering.h"
+#include "ARMFrameLowering.h"
+#include "ARMInstructionSelector.h"
+#include "ARMLegalizerInfo.h"
+#include "ARMRegisterBankInfo.h"
+#include "ARMTargetObjectFile.h"
+#include "ARMTargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::opt<bool>
+DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
+ cl::desc("Inhibit optimization of S->D register accesses on A15"),
+ cl::init(false));
+
+static cl::opt<bool>
+EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden,
+ cl::desc("Run SimplifyCFG after expanding atomic operations"
+ " to make use of cmpxchg flow-based information"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableARMLoadStoreOpt("arm-load-store-opt", cl::Hidden,
+ cl::desc("Enable ARM load/store optimization pass"),
+ cl::init(true));
+
+// FIXME: Unify control over GlobalMerge.
+static cl::opt<cl::boolOrDefault>
+EnableGlobalMerge("arm-global-merge", cl::Hidden,
+ cl::desc("Enable the global merge pass"));
+
+extern "C" void LLVMInitializeARMTarget() {
+ // Register the ARM and Thumb target machines, little and big endian.
+ RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
+ RegisterTargetMachine<ARMBETargetMachine> Y(getTheARMBETarget());
+ RegisterTargetMachine<ThumbLETargetMachine> A(getTheThumbLETarget());
+ RegisterTargetMachine<ThumbBETargetMachine> B(getTheThumbBETarget());
+
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeGlobalISel(Registry);
+ initializeARMLoadStoreOptPass(Registry);
+ initializeARMPreAllocLoadStoreOptPass(Registry);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO())
+ return make_unique<TargetLoweringObjectFileMachO>();
+ if (TT.isOSWindows())
+ return make_unique<TargetLoweringObjectFileCOFF>();
+ return make_unique<ARMElfTargetObjectFile>();
+}
+
+static ARMBaseTargetMachine::ARMABI
+computeTargetABI(const Triple &TT, StringRef CPU,
+ const TargetOptions &Options) {
+ if (Options.MCOptions.getABIName() == "aapcs16") // An explicit ABI name wins.
+ return ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+ else if (Options.MCOptions.getABIName().startswith("aapcs"))
+ return ARMBaseTargetMachine::ARM_ABI_AAPCS;
+ else if (Options.MCOptions.getABIName().startswith("apcs"))
+ return ARMBaseTargetMachine::ARM_ABI_APCS;
+
+ assert(Options.MCOptions.getABIName().empty() &&
+ "Unknown target-abi option!");
+
+ ARMBaseTargetMachine::ARMABI TargetABI =
+ ARMBaseTargetMachine::ARM_ABI_UNKNOWN;
+
+ unsigned ArchKind = llvm::ARM::parseCPUArch(CPU);
+ StringRef ArchName = llvm::ARM::getArchName(ArchKind);
+ // FIXME: This is duplicated code from the front end and should be unified.
+ if (TT.isOSBinFormatMachO()) {
+ if (TT.getEnvironment() == llvm::Triple::EABI ||
+ (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
+ llvm::ARM::parseArchProfile(ArchName) == llvm::ARM::PK_M) {
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+ } else if (TT.isWatchABI()) {
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+ } else {
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+ }
+ } else if (TT.isOSWindows()) {
+ // FIXME: this is invalid for WindowsCE
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+ } else {
+ // Select the default based on the triple's environment.
+ switch (TT.getEnvironment()) {
+ case llvm::Triple::Android:
+ case llvm::Triple::GNUEABI:
+ case llvm::Triple::GNUEABIHF:
+ case llvm::Triple::MuslEABI:
+ case llvm::Triple::MuslEABIHF:
+ case llvm::Triple::EABIHF:
+ case llvm::Triple::EABI:
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+ break;
+ case llvm::Triple::GNU:
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+ break;
+ default: // Any other environment: APCS on NetBSD, AAPCS elsewhere.
+ if (TT.isOSNetBSD())
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+ else
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+ break;
+ }
+ }
+
+ return TargetABI;
+}
+
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+ const TargetOptions &Options,
+ bool isLittle) {
+ auto ABI = computeTargetABI(TT, CPU, Options);
+ std::string Ret = "";
+
+ if (isLittle)
+ // Little endian.
+ Ret += "e";
+ else
+ // Big endian.
+ Ret += "E";
+
+ Ret += DataLayout::getManglingComponent(TT);
+
+ // Pointers are 32 bits and aligned to 32 bits.
+ Ret += "-p:32:32";
+
+ // ABIs other than APCS have 64 bit integers with natural alignment.
+ if (ABI != ARMBaseTargetMachine::ARM_ABI_APCS)
+ Ret += "-i64:64";
+
+ // We have 64-bit floats. The APCS ABI requires them to be aligned to 32
+ // bits, others to 64 bits. We always try to align to 64 bits.
+ if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS)
+ Ret += "-f64:32:64";
+
+ // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
+ // to 64. We always try to give them natural alignment.
+ if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS)
+ Ret += "-v64:32:64-v128:32:128";
+ else if (ABI != ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+ Ret += "-v128:64:128";
+
+ // Try to align aggregates to 32 bits (the default is 64 bits, which has no
+ // particular hardware support on 32-bit ARM).
+ Ret += "-a:0:32";
+
+ // Integer registers are 32 bits.
+ Ret += "-n32";
+
+ // The stack is 128 bit aligned on NaCl and AAPCS16, 64 bit aligned on AAPCS
+ // and 32 bit aligned everywhere else.
+ if (TT.isOSNaCl() || ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+ Ret += "-S128";
+ else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS)
+ Ret += "-S64";
+ else
+ Ret += "-S32";
+
+ return Ret;
+}
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ // Default relocation model is PIC for Mach-O (Darwin), static otherwise.
+ return TT.isOSBinFormatMachO() ? Reloc::PIC_ : Reloc::Static;
+
+ if (*RM == Reloc::ROPI || *RM == Reloc::RWPI || *RM == Reloc::ROPI_RWPI)
+ assert(TT.isOSBinFormatELF() &&
+ "ROPI/RWPI currently only supported for ELF");
+
+ // DynamicNoPIC is only used on Darwin; elsewhere fall back to static.
+ if (*RM == Reloc::DynamicNoPIC && !TT.isOSDarwin())
+ return Reloc::Static;
+
+ return *RM;
+}
+
+/// Create an ARM architecture model.
+/// Data layout and ABI are computed from TT, CPU and Options; see below.
+ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle)
+ : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
+ CPU, FS, Options, getEffectiveRelocModel(TT, RM), CM,
+ OL),
+ TargetABI(computeTargetABI(TT, CPU, Options)),
+ TLOF(createTLOF(getTargetTriple())),
+ Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) {
+
+ // Default to triple-appropriate float ABI
+ if (Options.FloatABIType == FloatABI::Default)
+ this->Options.FloatABIType =
+ Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft;
+
+ // Default to triple-appropriate EABI
+ if (Options.EABIVersion == EABI::Default ||
+ Options.EABIVersion == EABI::Unknown) {
+ // musl is compatible with glibc with regard to EABI version
+ if (Subtarget.isTargetGNUAEABI() || Subtarget.isTargetMuslAEABI())
+ this->Options.EABIVersion = EABI::GNU;
+ else
+ this->Options.EABIVersion = EABI::EABI5;
+ }
+}
+
+ARMBaseTargetMachine::~ARMBaseTargetMachine() {}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct ARMGISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+} // End anonymous namespace.
+#endif
+
+const ARMSubtarget *
+ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString().str()
+ : TargetCPU;
+ std::string FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString().str()
+ : TargetFS;
+
+ // FIXME: This is related to the code below to reset the target options,
+ // we need to know whether or not the soft float flag is set on the
+ // function before we can generate a subtarget. We also need to use
+ // it as a key for the subtarget since that can be the only difference
+ // between two functions.
+ bool SoftFloat =
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ // If the soft float attribute is set on the function turn on the soft float
+ // subtarget feature.
+ if (SoftFloat)
+ FS += FS.empty() ? "+soft-float" : ",+soft-float";
+
+ auto &I = SubtargetMap[CPU + FS]; // Cache key: CPU name followed by features.
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(new ARMCallLowering(*I->getTargetLowering()));
+ GISel->Legalizer.reset(new ARMLegalizerInfo());
+
+ auto *RBI = new ARMRegisterBankInfo(*I->getRegisterInfo());
+
+ // FIXME: At this point, we can't rely on Subtarget having RBI.
+ // It's awkward to mix passing RBI and the Subtarget; should we pass
+ // TII/TRI as well?
+ GISel->InstSelector.reset(new ARMInstructionSelector(*I, *RBI));
+
+ GISel->RegBankInfo.reset(RBI);
+#endif
+ I->setGISelAccessor(*GISel); // presumably takes ownership of GISel - TODO confirm
+ }
+ return I.get();
+}
+
+TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(ARMTTIImpl(this, F));
+ });
+}
+
+void ARMTargetMachine::anchor() {}
+
+ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL,
+ bool isLittle)
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
+ initAsmInfo();
+ if (!Subtarget.hasARMOps())
+ report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
+ "support ARM mode execution!");
+}
+
+void ARMLETargetMachine::anchor() {}
+
+ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void ARMBETargetMachine::anchor() {}
+
+ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void ThumbTargetMachine::anchor() {}
+
+ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle)
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
+ initAsmInfo();
+}
+
+void ThumbLETargetMachine::anchor() {}
+
+ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void ThumbBETargetMachine::anchor() {}
+
+ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+namespace {
+/// ARM Code Generator Pass Configuration Options.
+class ARMPassConfig : public TargetPassConfig {
+public:
+ ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ ARMBaseTargetMachine &getARMTargetMachine() const {
+ return getTM<ARMBaseTargetMachine>();
+ }
+
+ void addIRPasses() override;
+ bool addPreISel() override;
+ bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
+#endif
+ void addPreRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new ARMPassConfig(this, PM);
+}
+
+void ARMPassConfig::addIRPasses() {
+ if (TM->Options.ThreadModel == ThreadModel::Single)
+ addPass(createLowerAtomicPass());
+ else
+ addPass(createAtomicExpandPass(TM));
+
+ // Cmpxchg instructions are often used with a subsequent comparison to
+ // determine whether it succeeded. We can exploit existing control-flow in
+ // ldrex/strex loops to simplify this, but it needs tidying up.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+ addPass(createCFGSimplificationPass(-1, [this](const Function &F) {
+ const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);
+ return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
+ }));
+
+ TargetPassConfig::addIRPasses();
+
+ // Match interleaved memory accesses to ldN/stN intrinsics.
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createInterleavedAccessPass(TM));
+}
+
+bool ARMPassConfig::addPreISel() {
+ if ((TM->getOptLevel() != CodeGenOpt::None &&
+ EnableGlobalMerge == cl::BOU_UNSET) ||
+ EnableGlobalMerge == cl::BOU_TRUE) {
+ // FIXME: This is using the thumb1 only constant value for
+ // maximal global offset for merging globals. We may want
+ // to look into using the old value for non-thumb1 code of
+ // 4095 based on the TargetMachine, but this starts to become
+ // tricky when doing code gen per function.
+ bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) &&
+ (EnableGlobalMerge == cl::BOU_UNSET);
+ // Merging of extern globals is enabled by default on non-Mach-O as we
+ // expect it to be generally either beneficial or harmless. On Mach-O it
+ // is disabled as we emit the .subsections_via_symbols directive which
+ // means that merging extern globals is not safe.
+ bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO();
+ addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize, // 127: see FIXME.
+ MergeExternalByDefault));
+ }
+
+ return false;
+}
+
+bool ARMPassConfig::addInstSelector() {
+ addPass(createARMISelDag(getARMTargetMachine(), getOptLevel()));
+ return false;
+}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool ARMPassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+
+bool ARMPassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
+ return false;
+}
+
+bool ARMPassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+
+bool ARMPassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
+ return false;
+}
+#endif
+
+void ARMPassConfig::addPreRegAlloc() {
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createMLxExpansionPass());
+
+ if (EnableARMLoadStoreOpt)
+ addPass(createARMLoadStoreOptimizationPass(/* pre-register alloc */ true));
+
+ if (!DisableA15SDOptimization)
+ addPass(createA15SDOptimizerPass());
+ }
+}
+
+void ARMPassConfig::addPreSched2() {
+ if (getOptLevel() != CodeGenOpt::None) {
+ if (EnableARMLoadStoreOpt)
+ addPass(createARMLoadStoreOptimizationPass());
+
+ addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
+ }
+
+ // Expand some pseudo instructions into multiple instructions to allow
+ // proper scheduling.
+ addPass(createARMExpandPseudoPass());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ // In v8, IfConversion depends on Thumb instruction widths, so size
+ // reduction (predicated on restrictIT()) is added before the if-converter.
+ addPass(createThumb2SizeReductionPass([this](const Function &F) {
+ return this->TM->getSubtarget<ARMSubtarget>(F).restrictIT();
+ }));
+
+ addPass(createIfConverter([](const MachineFunction &MF) {
+ return !MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }));
+ }
+ addPass(createThumb2ITBlockPass());
+}
+
+void ARMPassConfig::addPreEmitPass() {
+ addPass(createThumb2SizeReductionPass());
+
+ // The constant island pass works on unbundled instructions.
+ addPass(createUnpackMachineBundles([](const MachineFunction &MF) {
+ return MF.getSubtarget<ARMSubtarget>().isThumb2();
+ }));
+
+ // Don't optimize barriers at -O0.
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createARMOptimizeBarriersPass());
+
+ addPass(createARMConstantIslandPass());
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
new file mode 100644
index 000000000000..c6b70b953162
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -0,0 +1,131 @@
+//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
+#define LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
+
+#include "ARMInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class ARMBaseTargetMachine : public LLVMTargetMachine {
+public:
+ enum ARMABI {
+ ARM_ABI_UNKNOWN,
+ ARM_ABI_APCS,
+ ARM_ABI_AAPCS, // ARM EABI
+ ARM_ABI_AAPCS16
+ } TargetABI; // Set by the constructor from the triple, CPU and options.
+
+protected:
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ ARMSubtarget Subtarget; // Default subtarget built from the TM's CPU and FS.
+ bool isLittle;
+ mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap; // Keyed by CPU + features.
+
+public:
+ ARMBaseTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle);
+ ~ARMBaseTargetMachine() override;
+
+ const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ const ARMSubtarget *getSubtargetImpl(const Function &F) const override;
+ bool isLittleEndian() const { return isLittle; }
+
+ /// \brief Get the TargetIRAnalysis for this target.
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+
+/// ARM target machine.
+///
+class ARMTargetMachine : public ARMBaseTargetMachine {
+ virtual void anchor();
+ public:
+ ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle);
+};
+
+/// ARM little endian target machine.
+///
+class ARMLETargetMachine : public ARMTargetMachine {
+ void anchor() override;
+public:
+ ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+/// ARM big endian target machine.
+///
+class ARMBETargetMachine : public ARMTargetMachine {
+ void anchor() override;
+public:
+ ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+/// Thumb target machine.
+/// Due to the way architectures are handled, this represents both
+/// Thumb-1 and Thumb-2.
+///
+class ThumbTargetMachine : public ARMBaseTargetMachine {
+ virtual void anchor();
+public:
+ ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle);
+};
+
+/// Thumb little endian target machine.
+///
+class ThumbLETargetMachine : public ThumbTargetMachine {
+ void anchor() override;
+public:
+ ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+/// Thumb big endian target machine.
+///
+class ThumbBETargetMachine : public ThumbTargetMachine {
+ void anchor() override;
+public:
+ ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
new file mode 100644
index 000000000000..625c4280e1a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -0,0 +1,92 @@
+//===-- llvm/Target/ARMTargetObjectFile.cpp - ARM Object Info Impl --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetObjectFile.h"
+#include "ARMTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+using namespace dwarf;
+
+//===----------------------------------------------------------------------===//
+// ELF Target
+//===----------------------------------------------------------------------===//
+
+void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ // TM may be an ARM or a Thumb machine; ARMBaseTargetMachine is their common base.
+ const auto &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM);
+ bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS;
+ genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly();
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(isAAPCS_ABI);
+
+ if (isAAPCS_ABI) {
+ LSDASection = nullptr; // No LSDA section is kept under the AAPCS ABI.
+ }
+
+ AttributesSection =
+ getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
+
+ // Make code section unreadable when in execute-only mode
+ if (genExecuteOnly) {
+ unsigned Type = ELF::SHT_PROGBITS;
+ unsigned Flags = ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_ARM_PURECODE;
+ // Since we cannot modify flags for an existing section, we create a new
+ // section with the right flags, and use 0 as the unique ID for
+ // execute-only text
+ TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U);
+ }
+}
+
+const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+ MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ if (TM.getMCAsmInfo()->getExceptionHandlingType() != ExceptionHandling::ARM)
+ return TargetLoweringObjectFileELF::getTTypeGlobalReference(
+ GV, Encoding, TM, MMI, Streamer);
+
+ assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
+
+ return MCSymbolRefExpr::create(TM.getSymbol(GV),
+ MCSymbolRefExpr::VK_ARM_TARGET2, getContext());
+}
+
+const MCExpr *ARMElfTargetObjectFile::
+getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+ return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_ARM_TLSLDO,
+ getContext());
+}
+
+MCSection *
+ARMElfTargetObjectFile::getExplicitSectionGlobal(const GlobalObject *GO,
+ SectionKind SK, const TargetMachine &TM) const {
+ // Set execute-only access for the explicit section
+ if (genExecuteOnly && SK.isText())
+ SK = SectionKind::getExecuteOnly();
+
+ return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM);
+}
+
+MCSection *
+ARMElfTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO,
+ SectionKind SK, const TargetMachine &TM) const {
+ // Place the global in the execute-only text section
+ if (genExecuteOnly && SK.isText())
+ SK = SectionKind::getExecuteOnly();
+
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, SK, TM);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
new file mode 100644
index 000000000000..24e755ddac27
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
@@ -0,0 +1,50 @@
+//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+class MCContext;
+class TargetMachine;
+
+class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
+ mutable bool genExecuteOnly = false; // Set in Initialize() from the default subtarget.
+protected:
+ const MCSection *AttributesSection; // Set to .ARM.attributes by Initialize().
+public:
+ ARMElfTargetObjectFile()
+ : TargetLoweringObjectFileELF(), AttributesSection(nullptr) {
+ PLTRelativeVariantKind = MCSymbolRefExpr::VK_ARM_PREL31;
+ }
+
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+ const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+ unsigned Encoding,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+
+ /// \brief Describe a TLS variable address within debug info.
+ const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+
+ MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
+
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
new file mode 100644
index 000000000000..10e6297ef1ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -0,0 +1,538 @@
+//===-- ARMTargetTransformInfo.cpp - ARM specific TTI ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "armtti"
+
+/// Estimate the cost of materializing \p Imm as a value of integer type
+/// \p Ty on the current subtarget.
+///
+/// Returns 4 when the type reports a primitive size of zero or the immediate
+/// has 64 or more active bits; otherwise 1, 2 or 3 depending on the
+/// instruction set in use.
+int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  const unsigned TypeBits = Ty->getPrimitiveSizeInBits();
+  if (TypeBits == 0 || Imm.getActiveBits() >= 64)
+    return 4;
+
+  const int64_t Signed = Imm.getSExtValue();
+  const uint64_t Unsigned = Imm.getZExtValue();
+  // Non-negative values below 2^16 cost 1 in both ARM and Thumb2 mode.
+  const bool IsSmallNonNegative = Signed >= 0 && Signed < 65536;
+
+  // ARM mode: also accept a shifter-operand immediate or its complement.
+  if (!ST->isThumb()) {
+    const bool SingleInst = IsSmallNonNegative ||
+                            ARM_AM::getSOImmVal(Unsigned) != -1 ||
+                            ARM_AM::getSOImmVal(~Unsigned) != -1;
+    if (SingleInst)
+      return 1;
+    return ST->hasV6T2Ops() ? 2 : 3;
+  }
+
+  // Thumb2: same scheme, using the Thumb2 modified-immediate encoding.
+  if (ST->isThumb2()) {
+    const bool SingleInst = IsSmallNonNegative ||
+                            ARM_AM::getT2SOImmVal(Unsigned) != -1 ||
+                            ARM_AM::getT2SOImmVal(~Unsigned) != -1;
+    if (SingleInst)
+      return 1;
+    return ST->hasV6T2Ops() ? 2 : 3;
+  }
+
+  // Thumb1.
+  if (Signed >= 0 && Signed < 256)
+    return 1;
+  // NOTE(review): ~Signed is negative for every non-negative Signed, so the
+  // signed comparison below also holds for all remaining non-negative values,
+  // not only for values whose complement fits in eight bits. Confirm whether
+  // that is intended.
+  if (~Signed < 256 || ARM_AM::isThumbImmShiftedVal(Unsigned))
+    return 2;
+  // Load from constantpool.
+  return 3;
+}
+
+
+// Code-size cost of an immediate operand: non-negative constants below 256
+// fit in the immediate field of Thumb1 instructions and are free (0); any
+// other constant costs 1. Opcode, Idx and Ty are not consulted.
+int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
+                                      const APInt &Imm, Type *Ty) {
+  const bool FitsImmField = Imm.isNonNegative() && Imm.getLimitedValue() < 256;
+  return FitsImmField ? 0 : 1;
+}
+
+/// Cost of using \p Imm as operand number \p Idx of an instruction with the
+/// given \p Opcode. A few opcodes get special treatment; everything else is
+/// charged the plain materialization cost from getIntImmCost(Imm, Ty).
+int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                              Type *Ty) {
+  switch (Opcode) {
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    // Division by a constant can be turned into multiplication, but only if
+    // we know it's constant. So it's not so much that the immediate is cheap
+    // (it's not), but that the alternative is worse.
+    // FIXME: this is probably unneeded with GlobalISel.
+    if (Idx == 1)
+      return 0;
+    break;
+
+  case Instruction::And:
+    // Conversion to BIC is free, and means we can use ~Imm instead.
+    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+
+  case Instruction::Add:
+    // Conversion to SUB is free, and means we can use -Imm instead.
+    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));
+
+  case Instruction::ICmp:
+    // Comparisons against a negative 32-bit constant can use the negated
+    // value when it is small enough.
+    if (Imm.isNegative() && Ty->getIntegerBitWidth() == 32) {
+      const int64_t Negated = -Imm.getSExtValue();
+      // icmp X, #-C -> cmn X, #C
+      if (ST->isThumb2() && Negated < 1<<12)
+        return 0;
+      // icmp X, #-C -> adds X, #C
+      if (ST->isThumb() && Negated < 1<<8)
+        return 0;
+    }
+    break;
+
+  default:
+    break;
+  }
+
+  return getIntImmCost(Imm, Ty);
+}
+
+
+// Estimate the cost of the cast instruction Opcode converting a value of type
+// Src into type Dst.
+//
+// Several static lookup tables are consulted one after another: vector
+// fptrunc/fpext, NEON vector conversions, scalar float <-> integer
+// conversions (all only when the subtarget has NEON) and finally a small
+// scalar integer table. The first table that yields an entry decides the
+// result; anything that is not covered is delegated to
+// BaseT::getCastInstrCost.
+int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // Single to/from double precision conversions.
+ static const CostTblEntry NEONFltDblTbl[] = {
+ // Vector fptrunc/fpext conversions.
+ { ISD::FP_ROUND, MVT::v2f64, 2 },
+ { ISD::FP_EXTEND, MVT::v2f32, 2 },
+ { ISD::FP_EXTEND, MVT::v4f32, 4 }
+ };
+
+ // This table is keyed on the legalized source type, and the table cost is
+ // scaled by the legalization factor LT.first.
+ if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
+ ISD == ISD::FP_EXTEND)) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
+
+ // The remaining tables are keyed on simple value types, so bail out to the
+ // generic implementation when either side is not simple.
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+ // Some arithmetic, load and store operations have specific instructions
+ // to cast up/down their types automatically at no extra cost.
+ // TODO: Get these tables to know at least what the related operations are.
+ // Each row is { ISD opcode, destination type, source type, cost }.
+ static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
+
+ // The number of vmovl instructions for the extension.
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+
+ // Operations that we legalize using splitting.
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
+
+ // Vector float <-> i32 conversions.
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
+
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
+
+ // Vector double <-> i32 conversions.
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
+ // NOTE: the next two rows repeat the v2f64 <- v2i32 pair listed at the top
+ // of this group (same opcode, types and cost).
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
+ };
+
+ // Unlike the fptrunc/fpext table above, this cost is returned as-is and is
+ // not scaled by a legalization factor.
+ if (SrcTy.isVector() && ST->hasNEON()) {
+ if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ // Scalar float to integer conversions.
+ static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
+ { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
+ { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
+ { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
+ { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
+ { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
+ { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
+ { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
+ { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
+ { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
+ { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
+ { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
+ { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
+ };
+ if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
+ if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ // Scalar integer to float conversions.
+ static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
+ };
+
+ if (SrcTy.isInteger() && ST->hasNEON()) {
+ if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
+ ISD, DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ // Scalar integer conversion costs.
+ static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
+ // i16 -> i64 requires two dependent operations.
+ { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
+
+ // Truncates on i64 are assumed to be free.
+ { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
+ { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
+ { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
+ { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
+ };
+
+ // This last table is consulted regardless of NEON availability.
+ if (SrcTy.isInteger()) {
+ if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
+/// Cost of an insertelement/extractelement (or other \p Opcode) on \p ValTy.
+/// Anything not specially penalized here is delegated to
+/// BaseT::getVectorInstrCost.
+int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                   unsigned Index) {
+  const bool IsInsert = Opcode == Instruction::InsertElement;
+  const bool IsExtract = Opcode == Instruction::ExtractElement;
+  // Kept as a callable so the type is only queried at the points where the
+  // result is actually needed.
+  auto IsNarrowVector = [ValTy]() {
+    return ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32;
+  };
+
+  // Penalize inserting into a D-subregister. We end up with a three times
+  // lower estimated throughput on swift.
+  if (ST->hasSlowLoadDSubregister() && IsInsert && IsNarrowVector())
+    return 3;
+
+  if (IsInsert || IsExtract) {
+    // Cross-class copies are expensive on many microarchitectures,
+    // so assume they are expensive by default.
+    if (ValTy->getVectorElementType()->isIntegerTy())
+      return 3;
+
+    // Even if it's not a cross class copy, this likely leads to mixing
+    // of NEON and VFP code and should be therefore penalized.
+    if (IsNarrowVector())
+      return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
+  }
+
+  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+}
+
+/// Cost of a compare or select. Only vector selects on NEON subtargets are
+/// handled here; every other query goes to BaseT::getCmpSelInstrCost.
+int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+  const int ISDOpc = TLI->InstructionOpcodeToISD(Opcode);
+
+  const bool IsNEONVectorSelect =
+      ST->hasNEON() && ValTy->isVectorTy() && ISDOpc == ISD::SELECT;
+  if (!IsNEONVectorSelect)
+    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+
+  // On NEON a vector select gets lowered to vbsl.
+  // Lowering of some vector selects is currently far from perfect, so those
+  // get explicit (high) costs.
+  static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
+    { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
+    { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
+    { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
+  };
+
+  const EVT CondVT = TLI->getValueType(DL, CondTy);
+  const EVT ValVT = TLI->getValueType(DL, ValTy);
+  if (CondVT.isSimple() && ValVT.isSimple()) {
+    const auto *Entry = ConvertCostTableLookup(
+        NEONVectorSelectTbl, ISDOpc, CondVT.getSimpleVT(), ValVT.getSimpleVT());
+    if (Entry)
+      return Entry->Cost;
+  }
+
+  // No table entry: charge the legalization factor of the value type.
+  std::pair<int, MVT> Legal = TLI->getTypeLegalizationCost(DL, ValTy);
+  return Legal.first;
+}
+
+/// Cost of computing an address for an access of type \p Ty: 10 for a complex
+/// address of a vector type, 1 otherwise.
+int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+  // Address computations in vectorized code with non-consecutive addresses
+  // will likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  const int NumVectorInstToHideOverhead = 10;
+
+  const bool IsComplexVectorAddress = Ty->isVectorTy() && IsComplex;
+
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode, hence the baseline cost of 1.
+  return IsComplexVectorAddress ? NumVectorInstToHideOverhead : 1;
+}
+
+/// Cost class of a floating-point operation on \p Ty: TCC_Basic when the
+/// subtarget can handle the type with VFP, TCC_Expensive otherwise.
+int ARMTTIImpl::getFPOpCost(Type *Ty) {
+  // Use similar logic that's in ARMISelLowering:
+  // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access
+  // to VFP.
+  const bool HasUsableVFP = ST->hasVFP2() && !ST->isThumb1Only();
+  if (!HasUsableVFP)
+    return TargetTransformInfo::TCC_Expensive;
+
+  if (Ty->isFloatTy())
+    return TargetTransformInfo::TCC_Basic;
+
+  // Doubles are only cheap when the FPU is not single-precision-only.
+  if (Ty->isDoubleTy() && !ST->isFPOnlySP())
+    return TargetTransformInfo::TCC_Basic;
+
+  return TargetTransformInfo::TCC_Expensive;
+}
+
+/// Cost of a shuffle of kind \p Kind on type \p Tp. Only reverse and
+/// alternate shuffles have ARM-specific tables; a table hit is scaled by the
+/// type-legalization factor, and everything else is delegated to
+/// BaseT::getShuffleCost.
+int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                               Type *SubTp) {
+  switch (Kind) {
+  case TTI::SK_Reverse: {
+    static const CostTblEntry NEONShuffleTbl[] = {
+        // Reverse shuffle cost one instruction if we are shuffling within a
+        // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
+
+    std::pair<int, MVT> Legal = TLI->getTypeLegalizationCost(DL, Tp);
+    const auto *Hit =
+        CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, Legal.second);
+    if (Hit)
+      return Legal.first * Hit->Cost;
+    break;
+  }
+
+  case TTI::SK_Alternate: {
+    static const CostTblEntry NEONAltShuffleTbl[] = {
+        // Alt shuffle cost table for ARM. Cost is the number of instructions
+        // required to create the shuffled vector.
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+
+    std::pair<int, MVT> Legal = TLI->getTypeLegalizationCost(DL, Tp);
+    const auto *Hit =
+        CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, Legal.second);
+    if (Hit)
+      return Legal.first * Hit->Cost;
+    break;
+  }
+
+  default:
+    // We only handle costs of reverse and alternate shuffles for now.
+    break;
+  }
+
+  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+/// Cost of an arithmetic instruction on type \p Ty. On NEON subtargets vector
+/// integer division and remainder get explicit table costs (scaled by the
+/// legalization factor); everything else starts from
+/// BaseT::getArithmeticInstrCost, with an extra penalty for v2i64 operations
+/// whose second operand is a uniform constant.
+int ARMTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+
+  const int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
+  std::pair<int, MVT> Legal = TLI->getTypeLegalizationCost(DL, Ty);
+
+  const unsigned FunctionCallDivCost = 20;
+  const unsigned ReciprocalDivCost = 10;
+  static const CostTblEntry CostTbl[] = {
+    // Division.
+    // These costs are somewhat random. Choose a cost of 20 to indicate that
+    // vectorizing division (added function call) is going to be very
+    // expensive.
+    // Double register types.
+    { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
+    { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
+    { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
+    { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
+    { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
+    // Quad register types.
+    { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
+  };
+
+  if (ST->hasNEON()) {
+    const auto *Hit = CostTableLookup(CostTbl, ISDOpcode, Legal.second);
+    if (Hit)
+      return Legal.first * Hit->Cost;
+  }
+
+  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                           Opd1PropInfo, Opd2PropInfo);
+
+  // This is somewhat of a hack. The problem that we are facing is that SROA
+  // creates a sequence of shift, and, or instructions to construct values.
+  // These sequences are recognized by the ISel and have zero-cost. Not so for
+  // the vectorized code. Because we have support for v2i64 but not i64 those
+  // sequences look particularly beneficial to vectorize.
+  // To work around this we increase the cost of v2i64 operations to make them
+  // seem less beneficial.
+  const bool PenalizeV2I64 =
+      Legal.second == MVT::v2i64 &&
+      Op2Info == TargetTransformInfo::OK_UniformConstantValue;
+  if (PenalizeV2I64)
+    Cost += 4;
+
+  return Cost;
+}
+
+/// Cost of a load or store of type \p Src: the type-legalization factor,
+/// multiplied by four for vectors of double whose alignment is not 16.
+/// Opcode and AddressSpace are not consulted.
+int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                unsigned AddressSpace) {
+  std::pair<int, MVT> Legal = TLI->getTypeLegalizationCost(DL, Src);
+
+  // Unaligned loads/stores are extremely inefficient.
+  // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
+  const bool IsUnalignedF64Vector = Src->isVectorTy() && Alignment != 16 &&
+                                    Src->getVectorElementType()->isDoubleTy();
+  if (IsUnalignedF64Vector)
+    return Legal.first * 4;
+
+  return Legal.first;
+}
+
+/// Cost of an interleaved load/store group of \p Factor members over
+/// \p VecTy. Returns \p Factor when the group maps onto vldN/vstN, otherwise
+/// whatever BaseT::getInterleavedMemoryOpCost reports.
+int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  assert(Factor >= 2 && "Invalid interleave factor");
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+  // vldN/vstN doesn't support vector types of i64/f64 element.
+  const bool Has64BitElts =
+      DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
+
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !Has64BitElts) {
+    const unsigned EltCount = VecTy->getVectorNumElements();
+    Type *MemberTy =
+        VectorType::get(VecTy->getScalarType(), EltCount / Factor);
+    const unsigned MemberBits = DL.getTypeSizeInBits(MemberTy);
+
+    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
+    const bool DividesEvenly = EltCount % Factor == 0;
+    const bool LegalMemberSize = MemberBits == 64 || MemberBits == 128;
+    if (DividesEvenly && LegalMemberSize)
+      return Factor;
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
new file mode 100644
index 000000000000..d83228afb0ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -0,0 +1,139 @@
+//===-- ARMTargetTransformInfo.h - ARM specific TTI -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfo::Concept conforming object specific to the
+/// ARM target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
+
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+/// ARM implementation of the TTI hooks, built on BasicTTIImplBase (CRTP).
+/// Queries are answered from the function's subtarget and its target
+/// lowering object.
+class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
+  using BaseT = BasicTTIImplBase<ARMTTIImpl>;
+  using TTI = TargetTransformInfo;
+  friend BaseT;
+
+  const ARMSubtarget *ST;
+  const ARMTargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+  const ARMSubtarget *getST() const { return ST; }
+  const ARMTargetLowering *getTLI() const { return TLI; }
+
+public:
+  /// Bind to the subtarget that \p TM selects for \p F and to that
+  /// subtarget's lowering object; the data layout comes from F's module.
+  explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()),
+        ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  bool enableInterleavedAccessVectorization() { return true; }
+
+  /// Floating-point computation using ARMv8 AArch32 Advanced
+  /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
+  /// is IEEE-754 compliant, but it's not covered in this target.
+  bool isFPVectorizationPotentiallyUnsafe() {
+    return !ST->isTargetDarwin();
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                            Type *Ty);
+
+  using BaseT::getIntImmCost;
+  int getIntImmCost(const APInt &Imm, Type *Ty);
+
+  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  /// 16 vector registers with NEON and none without; 8 scalar registers on
+  /// Thumb1-only subtargets and 13 otherwise.
+  unsigned getNumberOfRegisters(bool Vector) {
+    if (Vector)
+      return ST->hasNEON() ? 16 : 0;
+    return ST->isThumb1Only() ? 8 : 13;
+  }
+
+  /// 128-bit vector registers with NEON (0 without); scalars are 32 bits.
+  unsigned getRegisterBitWidth(bool Vector) {
+    if (!Vector)
+      return 32;
+    return ST->hasNEON() ? 128 : 0;
+  }
+
+  /// The interleave factor is taken from the subtarget; \p VF is ignored.
+  unsigned getMaxInterleaveFactor(unsigned VF) {
+    return ST->getMaxInterleaveFactor();
+  }
+
+  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+
+  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+  int getAddressComputationCost(Type *Val, bool IsComplex);
+
+  int getFPOpCost(Type *Ty);
+
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+                                 ArrayRef<unsigned> Indices, unsigned Alignment,
+                                 unsigned AddressSpace);
+
+  /// In the ROPI and RWPI relocation models we can't have pointers to global
+  /// variables or functions in constant data, so don't convert switches to
+  /// lookup tables if any of the values would need relocation. Outside those
+  /// models every constant is acceptable.
+  bool shouldBuildLookupTablesForConstant(Constant *C) const {
+    const bool PositionIndependentData = ST->isROPI() || ST->isRWPI();
+    return !PositionIndependentData || !C->needsRelocation();
+  }
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
new file mode 100644
index 000000000000..c243a2d35979
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -0,0 +1,10352 @@
+//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMFeatures.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserUtils.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Policy for conditional instructions that appear outside of an IT block,
+/// selected on the command line with -arm-implicit-it:
+///   Always    - accept in both ISAs, emit implicit ITs in Thumb
+///   Never     - warn in ARM, reject in Thumb
+///   ARMOnly   - accept in ARM, reject in Thumb
+///   ThumbOnly - warn in ARM, emit implicit ITs in Thumb
+enum class ImplicitItModeTy { Always, Never, ARMOnly, ThumbOnly };
+
+// The option defaults to ARMOnly.
+static cl::opt<ImplicitItModeTy> ImplicitItMode(
+    "arm-implicit-it", cl::init(ImplicitItModeTy::ARMOnly),
+    cl::desc("Allow conditional instructions outside of an IT block"),
+    cl::values(clEnumValN(ImplicitItModeTy::Always, "always",
+                          "Accept in both ISAs, emit implicit ITs in Thumb"),
+               clEnumValN(ImplicitItModeTy::Never, "never",
+                          "Warn in ARM, reject in Thumb"),
+               clEnumValN(ImplicitItModeTy::ARMOnly, "arm",
+                          "Accept in ARM, reject in Thumb"),
+               clEnumValN(ImplicitItModeTy::ThumbOnly, "thumb",
+                          "Warn in ARM, emit implicit ITs in Thumb")));
+
+class ARMOperand;
+
+// NOTE(review): presumably describes which lanes of a vector register operand
+// are addressed (none, all, or one indexed lane) - confirm against the users
+// of this enum.
+enum VectorLaneTy {
+  NoLanes,
+  AllLanes,
+  IndexedLane
+};
+
+/// Records where the unwind directives of the current function were seen
+/// (.fnstart, .cantunwind, .personality, .personalityindex, .handlerdata) and
+/// which register was saved as the frame pointer, so that notes can later be
+/// attached at those source locations.
+class UnwindContext {
+  MCAsmParser &Parser;
+
+  using Locs = SmallVector<SMLoc, 4>;
+
+  Locs FnStartLocs;
+  Locs CantUnwindLocs;
+  Locs PersonalityLocs;
+  Locs PersonalityIndexLocs;
+  Locs HandlerDataLocs;
+  int FPReg;
+
+public:
+  /// The frame-pointer register starts out as SP.
+  UnwindContext(MCAsmParser &P) : Parser(P), FPReg(ARM::SP) {}
+
+  // Queries: has the corresponding directive been recorded at least once?
+  bool hasFnStart() const { return !FnStartLocs.empty(); }
+  bool cantUnwind() const { return !CantUnwindLocs.empty(); }
+  bool hasHandlerData() const { return !HandlerDataLocs.empty(); }
+  bool hasPersonality() const {
+    return !PersonalityLocs.empty() || !PersonalityIndexLocs.empty();
+  }
+
+  // Recorders: remember one more location for the given directive.
+  void recordFnStart(SMLoc L) { FnStartLocs.push_back(L); }
+  void recordCantUnwind(SMLoc L) { CantUnwindLocs.push_back(L); }
+  void recordPersonality(SMLoc L) { PersonalityLocs.push_back(L); }
+  void recordHandlerData(SMLoc L) { HandlerDataLocs.push_back(L); }
+  void recordPersonalityIndex(SMLoc L) { PersonalityIndexLocs.push_back(L); }
+
+  void saveFPReg(int Reg) { FPReg = Reg; }
+  int getFPReg() const { return FPReg; }
+
+  /// Emit one note per recorded .fnstart location.
+  void emitFnStartLocNotes() const {
+    for (const SMLoc &Loc : FnStartLocs)
+      Parser.Note(Loc, ".fnstart was specified here");
+  }
+
+  /// Emit one note per recorded .cantunwind location.
+  void emitCantUnwindLocNotes() const {
+    for (const SMLoc &Loc : CantUnwindLocs)
+      Parser.Note(Loc, ".cantunwind was specified here");
+  }
+
+  /// Emit one note per recorded .handlerdata location.
+  void emitHandlerDataLocNotes() const {
+    for (const SMLoc &Loc : HandlerDataLocs)
+      Parser.Note(Loc, ".handlerdata was specified here");
+  }
+
+  /// Emit notes for .personality and .personalityindex locations, merging the
+  /// two lists so that each step emits whichever pending location has the
+  /// lower source pointer.
+  void emitPersonalityLocNotes() const {
+    auto PI = PersonalityLocs.begin();
+    const auto PE = PersonalityLocs.end();
+    auto PII = PersonalityIndexLocs.begin();
+    const auto PIE = PersonalityIndexLocs.end();
+
+    while (PI != PE || PII != PIE) {
+      if (PI != PE && (PII == PIE || PI->getPointer() < PII->getPointer()))
+        Parser.Note(*PI++, ".personality was specified here");
+      else if (PII != PIE && (PI == PE || PII->getPointer() < PI->getPointer()))
+        Parser.Note(*PII++, ".personalityindex was specified here");
+      else
+        llvm_unreachable(".personality and .personalityindex cannot be "
+                         "at the same location");
+    }
+  }
+
+  /// Forget every recorded location and go back to SP as frame pointer.
+  void reset() {
+    FnStartLocs.clear();
+    CantUnwindLocs.clear();
+    PersonalityLocs.clear();
+    HandlerDataLocs.clear();
+    PersonalityIndexLocs.clear();
+    FPReg = ARM::SP;
+  }
+};
+
+class ARMAsmParser : public MCTargetAsmParser {
+ const MCInstrInfo &MII;
+ const MCRegisterInfo *MRI;
+ UnwindContext UC;
+
+ /// Return the target streamer attached to the parser's streamer, viewed as
+ /// an ARMTargetStreamer. Asserts that one is attached.
+ ARMTargetStreamer &getTargetStreamer() {
+   MCTargetStreamer *Attached = getParser().getStreamer().getTargetStreamer();
+   assert(Attached && "do not have a target streamer");
+   return static_cast<ARMTargetStreamer &>(*Attached);
+ }
+
+ // Map of register aliases created via the .req directive.
+ StringMap<unsigned> RegisterReqs;
+
+ bool NextSymbolIsThumb;
+
+ /// True when -arm-implicit-it is "always" or "thumb".
+ bool useImplicitITThumb() const {
+   switch (ImplicitItMode) {
+   case ImplicitItModeTy::Always:
+   case ImplicitItModeTy::ThumbOnly:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// True when -arm-implicit-it is "always" or "arm".
+ bool useImplicitITARM() const {
+   switch (ImplicitItMode) {
+   case ImplicitItModeTy::Always:
+   case ImplicitItModeTy::ARMOnly:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// State of the IT block that is currently being parsed or assembled.
+ struct {
+   /// Condition for IT block.
+   ARMCC::CondCodes Cond;
+
+   /// Condition mask for instructions, read starting at the first 1 (from
+   /// the lsb):
+   ///   '1'  condition as indicated in IT.
+   ///   '0'  inverse of condition (else).
+   /// Count of instructions in IT block is 4 - trailingzeroes(mask).
+   /// Note that this does not have the same encoding as in the IT
+   /// instruction, which also depends on the low bit of the condition code.
+   unsigned Mask:4;
+
+   /// Current position in parsing of IT block. In range [0,4], with 0 being
+   /// the IT instruction itself. Initialized according to count of
+   /// instructions in block. ~0U if no active IT block.
+   unsigned CurPosition;
+
+   /// true  - The IT instruction was present in the input, we should not
+   ///         modify it.
+   /// false - The IT instruction was added implicitly, we can extend it if
+   ///         that would be legal.
+   bool IsExplicit;
+ } ITState;
+
+ llvm::SmallVector<MCInst, 4> PendingConditionalInsts;
+
+ /// If an implicit IT block is open, emit its IT instruction followed by the
+ /// conditional instructions queued for it, then close the block. Outside an
+ /// implicit IT block nothing is emitted (and nothing may be pending).
+ void flushPendingInstructions(MCStreamer &Out) override {
+   if (!inImplicitITBlock()) {
+     assert(PendingConditionalInsts.size() == 0);
+     return;
+   }
+
+   // Build and emit the IT instruction: condition first, then the encoded
+   // mask.
+   const unsigned EncodedMask = getITMaskEncoding();
+   MCInst IT;
+   IT.setOpcode(ARM::t2IT);
+   IT.addOperand(MCOperand::createImm(ITState.Cond));
+   IT.addOperand(MCOperand::createImm(EncodedMask));
+   Out.EmitInstruction(IT, getSTI());
+
+   // Emit the conditional instructions in the order they were queued.
+   assert(PendingConditionalInsts.size() <= 4);
+   for (const MCInst &Pending : PendingConditionalInsts)
+     Out.EmitInstruction(Pending, getSTI());
+   PendingConditionalInsts.clear();
+
+   // Clear the IT state: empty mask, no active block.
+   ITState.Mask = 0;
+   ITState.CurPosition = ~0U;
+ }
+
+ /// True while an IT block is active (CurPosition is not the ~0U sentinel).
+ bool inITBlock() {
+   return ITState.CurPosition != ~0U;
+ }
+ /// True while an IT block is active and it was written explicitly.
+ bool inExplicitITBlock() {
+   if (!inITBlock())
+     return false;
+   return ITState.IsExplicit;
+ }
+ /// True while an IT block is active and it was created implicitly.
+ bool inImplicitITBlock() {
+   if (!inITBlock())
+     return false;
+   return !ITState.IsExplicit;
+ }
+ /// True if the current position equals the slot count implied by the mask,
+ /// i.e. 4 minus its number of trailing zero bits.
+ bool lastInITBlock() {
+   const auto SlotCount = 4 - countTrailingZeros(ITState.Mask);
+   return ITState.CurPosition == SlotCount;
+ }
+ /// Advance to the next slot of the active IT block. An explicit block is
+ /// closed once the position moves past its last slot; an implicit block is
+ /// left open until an instruction that cannot be added to it is found.
+ void forwardITPosition() {
+   if (inITBlock()) {
+     const unsigned TrailingZeros = countTrailingZeros(ITState.Mask);
+     ++ITState.CurPosition;
+     const bool PastLastSlot = ITState.CurPosition == 5 - TrailingZeros;
+     if (PastLastSlot && ITState.IsExplicit)
+       ITState.CurPosition = ~0U; // Done with the IT block after this.
+   }
+ }
+
+ // Step the current implicit IT block back by one slot, removing its last
+ // slot. Only valid when the block is beyond its first slot.
+ void rewindImplicitITPosition() {
+   assert(inImplicitITBlock());
+   assert(ITState.CurPosition > 1);
+   --ITState.CurPosition;
+   const unsigned TZ = countTrailingZeros(ITState.Mask);
+   // Keep the condition bits above the removed slot and place the
+   // terminating 1 one bit higher than before.
+   const unsigned KeptBits = ITState.Mask & (0xC << TZ);
+   const unsigned Terminator = 0x2 << TZ;
+   ITState.Mask = KeptBits | Terminator;
+ }
+
+ // Abandon the current implicit IT block. Only valid while the block is
+ // still at its first slot; afterwards no IT block is active.
+ void discardImplicitITBlock() {
+   assert(inImplicitITBlock());
+   assert(ITState.CurPosition == 1);
+   ITState.CurPosition = ~0U;
+ }
+
+ // Compute the IT mask in the form used by the IT instruction. When the low
+ // bit of the block condition is clear, the bits above the terminating 1 are
+ // flipped; otherwise the stored mask is returned unchanged.
+ unsigned getITMaskEncoding() {
+   assert(inITBlock());
+   unsigned Encoded = ITState.Mask;
+   if ((ITState.Cond & 1) != 0)
+     return Encoded;
+   const unsigned TZ = countTrailingZeros(Encoded);
+   assert(Encoded && TZ <= 3 && "illegal IT mask value!");
+   Encoded ^= (0xE << TZ) & 0xF;
+   return Encoded;
+ }
+
+ // Condition code that applies to the current IT block slot: the block's own
+ // condition for slot 1 or when the slot's mask bit is set, and the opposite
+ // condition otherwise.
+ ARMCC::CondCodes currentITCond() {
+   const unsigned Pos = ITState.CurPosition;
+   const bool UsesBlockCond =
+       Pos == 1 || ((ITState.Mask >> (5 - Pos)) & 1) != 0;
+   if (UsesBlockCond)
+     return ITState.Cond;
+   return ARMCC::getOppositeCondition(ITState.Cond);
+ }
+
+ // Flip the condition of the current IT block slot only: slot 1 inverts the
+ // block condition itself, any other slot toggles its bit in the mask.
+ void invertCurrentITCondition() {
+   if (ITState.CurPosition != 1) {
+     ITState.Mask ^= 1 << (5 - ITState.CurPosition);
+     return;
+   }
+   ITState.Cond = ARMCC::getOppositeCondition(ITState.Cond);
+ }
+
+ // True when an IT block is active and the lowest mask bit is set, meaning
+ // all 4 slots are in use.
+ bool isITBlockFull() {
+   if (!inITBlock())
+     return false;
+   return (ITState.Mask & 1) != 0;
+ }
+
+ // Append one slot with condition Cond to the current implicit IT block.
+ // Cond must be the block condition or its opposite, and the block must not
+ // already be full.
+ void extendImplicitITBlock(ARMCC::CondCodes Cond) {
+   assert(inImplicitITBlock());
+   assert(!isITBlockFull());
+   assert(Cond == ITState.Cond ||
+          Cond == ARMCC::getOppositeCondition(ITState.Cond));
+   const unsigned TZ = countTrailingZeros(ITState.Mask);
+   // Condition bits that are already in the mask.
+   const unsigned KeptBits = ITState.Mask & (0xE << TZ);
+   // Bit for the new slot, placed where the terminating 1 used to be.
+   const unsigned NewCondBit = (Cond == ITState.Cond) << TZ;
+   // The terminating 1 moves down by one bit.
+   const unsigned Terminator = 1 << (TZ - 1);
+   ITState.Mask = KeptBits | NewCondBit | Terminator;
+ }
+
+ // Open a fresh implicit IT block at slot 1, using AL as a placeholder
+ // condition and a mask of 8.
+ void startImplicitITBlock() {
+   assert(!inITBlock());
+   ITState.IsExplicit = false;
+   ITState.CurPosition = 1;
+   ITState.Mask = 8;
+   ITState.Cond = ARMCC::AL;
+ }
+
+ // Open an explicit IT block with the given condition and mask, positioned
+ // at 0 (the IT instruction itself). The mask should be in the parsed
+ // format, with a 1 implying 't', regardless of the low bit of the
+ // condition.
+ void startExplicitITBlock(ARMCC::CondCodes Cond, unsigned Mask) {
+   assert(!inITBlock());
+   ITState.IsExplicit = true;
+   ITState.CurPosition = 0;
+   ITState.Mask = Mask;
+   ITState.Cond = Cond;
+ }
+
+ /// Forward a note diagnostic to the underlying parser.
+ void Note(SMLoc L, const Twine &Msg, SMRange Range = None) {
+   MCAsmParser &Parser = getParser();
+   Parser.Note(L, Msg, Range);
+ }
+ /// Forward a warning diagnostic to the underlying parser and return its
+ /// result.
+ bool Warning(SMLoc L, const Twine &Msg, SMRange Range = None) {
+   MCAsmParser &Parser = getParser();
+   return Parser.Warning(L, Msg, Range);
+ }
+ bool Error(SMLoc L, const Twine &Msg, SMRange Range = None) {
+ return getParser().Error(L, Msg, Range);
+ }
+
+ bool validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands,
+ unsigned ListNo, bool IsARPop = false);
+ bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
+ unsigned ListNo);
+
+ int tryParseRegister();
+ bool tryParseRegisterWithWriteBack(OperandVector &);
+ int tryParseShiftRegister(OperandVector &);
+ bool parseRegisterList(OperandVector &);
+ bool parseMemory(OperandVector &);
+ bool parseOperand(OperandVector &, StringRef Mnemonic);
+ bool parsePrefix(ARMMCExpr::VariantKind &RefKind);
+ bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
+ unsigned &ShiftAmount);
+ bool parseLiteralValues(unsigned Size, SMLoc L);
+ bool parseDirectiveThumb(SMLoc L);
+ bool parseDirectiveARM(SMLoc L);
+ bool parseDirectiveThumbFunc(SMLoc L);
+ bool parseDirectiveCode(SMLoc L);
+ bool parseDirectiveSyntax(SMLoc L);
+ bool parseDirectiveReq(StringRef Name, SMLoc L);
+ bool parseDirectiveUnreq(SMLoc L);
+ bool parseDirectiveArch(SMLoc L);
+ bool parseDirectiveEabiAttr(SMLoc L);
+ bool parseDirectiveCPU(SMLoc L);
+ bool parseDirectiveFPU(SMLoc L);
+ bool parseDirectiveFnStart(SMLoc L);
+ bool parseDirectiveFnEnd(SMLoc L);
+ bool parseDirectiveCantUnwind(SMLoc L);
+ bool parseDirectivePersonality(SMLoc L);
+ bool parseDirectiveHandlerData(SMLoc L);
+ bool parseDirectiveSetFP(SMLoc L);
+ bool parseDirectivePad(SMLoc L);
+ bool parseDirectiveRegSave(SMLoc L, bool IsVector);
+ bool parseDirectiveInst(SMLoc L, char Suffix = '\0');
+ bool parseDirectiveLtorg(SMLoc L);
+ bool parseDirectiveEven(SMLoc L);
+ bool parseDirectivePersonalityIndex(SMLoc L);
+ bool parseDirectiveUnwindRaw(SMLoc L);
+ bool parseDirectiveTLSDescSeq(SMLoc L);
+ bool parseDirectiveMovSP(SMLoc L);
+ bool parseDirectiveObjectArch(SMLoc L);
+ bool parseDirectiveArchExtension(SMLoc L);
+ bool parseDirectiveAlign(SMLoc L);
+ bool parseDirectiveThumbSet(SMLoc L);
+
+ StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
+ bool &CarrySetting, unsigned &ProcessorIMod,
+ StringRef &ITMask);
+ void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+ bool &CanAcceptCarrySet,
+ bool &CanAcceptPredicationCode);
+
+ void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting,
+ OperandVector &Operands);
+ /// True if the ARM::ModeThumb subtarget feature bit is set.
+ bool isThumb() const {
+   // FIXME: Can tablegen auto-generate this?
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::ModeThumb];
+ }
+ /// True in Thumb mode when the ARM::FeatureThumb2 bit is not set.
+ bool isThumbOne() const {
+   if (!isThumb())
+     return false;
+   return !getSTI().getFeatureBits()[ARM::FeatureThumb2];
+ }
+ /// True in Thumb mode when the ARM::FeatureThumb2 bit is set.
+ bool isThumbTwo() const {
+   if (!isThumb())
+     return false;
+   return getSTI().getFeatureBits()[ARM::FeatureThumb2];
+ }
+ /// True if the ARM::HasV4TOps subtarget feature bit is set.
+ bool hasThumb() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::HasV4TOps];
+ }
+ /// True if the ARM::FeatureThumb2 subtarget feature bit is set.
+ bool hasThumb2() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::FeatureThumb2];
+ }
+ /// True if the ARM::HasV6Ops subtarget feature bit is set.
+ bool hasV6Ops() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::HasV6Ops];
+ }
+ /// True if the ARM::HasV6T2Ops subtarget feature bit is set.
+ bool hasV6T2Ops() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::HasV6T2Ops];
+ }
+ /// True if the ARM::HasV6MOps subtarget feature bit is set.
+ bool hasV6MOps() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::HasV6MOps];
+ }
+ /// True if the ARM::HasV7Ops subtarget feature bit is set.
+ bool hasV7Ops() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::HasV7Ops];
+ }
+ /// True if the ARM::HasV8Ops subtarget feature bit is set.
+ bool hasV8Ops() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::HasV8Ops];
+ }
+ /// True if the ARM::HasV8MBaselineOps subtarget feature bit is set.
+ bool hasV8MBaseline() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::HasV8MBaselineOps];
+ }
+ /// True if the ARM::HasV8MMainlineOps subtarget feature bit is set.
+ bool hasV8MMainline() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::HasV8MMainlineOps];
+ }
+ /// True if the ARM::Feature8MSecExt subtarget feature bit is set.
+ bool has8MSecExt() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::Feature8MSecExt];
+ }
+ /// True unless the ARM::FeatureNoARM subtarget feature bit is set.
+ bool hasARM() const {
+   const bool NoARM = getSTI().getFeatureBits()[ARM::FeatureNoARM];
+   return !NoARM;
+ }
+ /// True if the ARM::FeatureDSP subtarget feature bit is set.
+ bool hasDSP() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::FeatureDSP];
+ }
+ /// True if the ARM::FeatureD16 subtarget feature bit is set.
+ bool hasD16() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::FeatureD16];
+ }
+ /// True if the ARM::HasV8_1aOps subtarget feature bit is set.
+ bool hasV8_1aOps() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::HasV8_1aOps];
+ }
+ /// True if the ARM::FeatureRAS subtarget feature bit is set.
+ bool hasRAS() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::FeatureRAS];
+ }
+
+ /// Toggle the ARM::ModeThumb feature on a copy of the subtarget info and
+ /// recompute the set of available features from the result.
+ void SwitchMode() {
+   MCSubtargetInfo &STI = copySTI();
+   const auto &Toggled = STI.ToggleFeature(ARM::ModeThumb);
+   setAvailableFeatures(ComputeAvailableFeatures(Toggled));
+ }
+ void FixModeAfterArchChange(bool WasThumb, SMLoc Loc);
+ /// True if the ARM::FeatureMClass subtarget feature bit is set.
+ bool isMClass() const {
+   const auto &Bits = getSTI().getFeatureBits();
+   return Bits[ARM::FeatureMClass];
+ }
+
+ /// @name Auto-generated Match Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "ARMGenAsmMatcher.inc"
+
+ /// }
+
+ OperandMatchResultTy parseITCondCode(OperandVector &);
+ OperandMatchResultTy parseCoprocNumOperand(OperandVector &);
+ OperandMatchResultTy parseCoprocRegOperand(OperandVector &);
+ OperandMatchResultTy parseCoprocOptionOperand(OperandVector &);
+ OperandMatchResultTy parseMemBarrierOptOperand(OperandVector &);
+ OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &);
+ OperandMatchResultTy parseProcIFlagsOperand(OperandVector &);
+ OperandMatchResultTy parseMSRMaskOperand(OperandVector &);
+ OperandMatchResultTy parseBankedRegOperand(OperandVector &);
+ OperandMatchResultTy parsePKHImm(OperandVector &O, StringRef Op, int Low,
+ int High);
+ /// Parse a PKH "lsl" immediate via parsePKHImm with bounds 0 and 31.
+ OperandMatchResultTy parsePKHLSLImm(OperandVector &O) {
+   const int Low = 0;
+   const int High = 31;
+   return parsePKHImm(O, "lsl", Low, High);
+ }
+ /// Parse a PKH "asr" immediate via parsePKHImm with bounds 1 and 32.
+ OperandMatchResultTy parsePKHASRImm(OperandVector &O) {
+   const int Low = 1;
+   const int High = 32;
+   return parsePKHImm(O, "asr", Low, High);
+ }
+ OperandMatchResultTy parseSetEndImm(OperandVector &);
+ OperandMatchResultTy parseShifterImm(OperandVector &);
+ OperandMatchResultTy parseRotImm(OperandVector &);
+ OperandMatchResultTy parseModImm(OperandVector &);
+ OperandMatchResultTy parseBitfield(OperandVector &);
+ OperandMatchResultTy parsePostIdxReg(OperandVector &);
+ OperandMatchResultTy parseAM3Offset(OperandVector &);
+ OperandMatchResultTy parseFPImm(OperandVector &);
+ OperandMatchResultTy parseVectorList(OperandVector &);
+ OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index,
+ SMLoc &EndLoc);
+
+ // Asm Match Converter Methods
+ void cvtThumbMultiply(MCInst &Inst, const OperandVector &);
+ void cvtThumbBranches(MCInst &Inst, const OperandVector &);
+
+ bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
+ bool processInstruction(MCInst &Inst, const OperandVector &Ops, MCStreamer &Out);
+ bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands);
+ bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
+ bool isITBlockTerminator(MCInst &Inst) const;
+
+public:
+ enum ARMMatchResultTy {
+ Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
+ Match_RequiresNotITBlock,
+ Match_RequiresV6,
+ Match_RequiresThumb2,
+ Match_RequiresV8,
+ Match_RequiresFlagSetting,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "ARMGenAsmMatcher.inc"
+
+ };
+
+ /// Construct the parser: attach it to the generic parser, cache the
+ /// register info, compute the available features from the subtarget, and
+ /// start with no active IT block.
+ ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+              const MCInstrInfo &MII, const MCTargetOptions &Options)
+     : MCTargetAsmParser(Options, STI), MII(MII), UC(Parser) {
+   MCAsmParserExtension::Initialize(Parser);
+
+   // Start outside of any IT block, with NextSymbolIsThumb cleared.
+   NextSymbolIsThumb = false;
+   ITState.CurPosition = ~0U;
+
+   // Keep a cached pointer to the context's MCRegisterInfo.
+   MRI = getContext().getRegisterInfo();
+
+   // Derive the initial set of available features from the subtarget.
+   setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+
+ // Implementation of the MCTargetAsmParser interface:
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+ bool ParseDirective(AsmToken DirectiveID) override;
+
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
+ unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+ unsigned MatchInstruction(OperandVector &Operands, MCInst &Inst,
+ uint64_t &ErrorInfo, bool MatchingInlineAsm,
+ bool &EmitInITBlock, MCStreamer &Out);
+ void onLabelParsed(MCSymbol *Symbol) override;
+};
+} // end anonymous namespace
+
+namespace {
+
+/// ARMOperand - Instances of this class represent a parsed ARM machine
+/// operand.
+class ARMOperand : public MCParsedAsmOperand {
+ enum KindTy {
+ k_CondCode,
+ k_CCOut,
+ k_ITCondMask,
+ k_CoprocNum,
+ k_CoprocReg,
+ k_CoprocOption,
+ k_Immediate,
+ k_MemBarrierOpt,
+ k_InstSyncBarrierOpt,
+ k_Memory,
+ k_PostIndexRegister,
+ k_MSRMask,
+ k_BankedReg,
+ k_ProcIFlags,
+ k_VectorIndex,
+ k_Register,
+ k_RegisterList,
+ k_DPRRegisterList,
+ k_SPRRegisterList,
+ k_VectorList,
+ k_VectorListAllLanes,
+ k_VectorListIndexed,
+ k_ShiftedRegister,
+ k_ShiftedImmediate,
+ k_ShifterImmediate,
+ k_RotateImmediate,
+ k_ModifiedImmediate,
+ k_ConstantPoolImmediate,
+ k_BitfieldDescriptor,
+ k_Token,
+ } Kind;
+
+ SMLoc StartLoc, EndLoc, AlignmentLoc;
+ SmallVector<unsigned, 8> Registers;
+
+ struct CCOp {
+ ARMCC::CondCodes Val;
+ };
+
+ struct CopOp {
+ unsigned Val;
+ };
+
+ struct CoprocOptionOp {
+ unsigned Val;
+ };
+
+ struct ITMaskOp {
+ unsigned Mask:4;
+ };
+
+ struct MBOptOp {
+ ARM_MB::MemBOpt Val;
+ };
+
+ struct ISBOptOp {
+ ARM_ISB::InstSyncBOpt Val;
+ };
+
+ struct IFlagsOp {
+ ARM_PROC::IFlags Val;
+ };
+
+ struct MMaskOp {
+ unsigned Val;
+ };
+
+ struct BankedRegOp {
+ unsigned Val;
+ };
+
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegOp {
+ unsigned RegNum;
+ };
+
+ // A vector register list is a sequential list of 1 to 4 registers.
+ struct VectorListOp {
+ unsigned RegNum;
+ unsigned Count;
+ unsigned LaneIndex;
+ bool isDoubleSpaced;
+ };
+
+ struct VectorIndexOp {
+ unsigned Val;
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ /// Combined record for all forms of ARM address expressions.
+ struct MemoryOp {
+ unsigned BaseRegNum;
+ // Offset is in OffsetReg or OffsetImm. If both are zero, no offset
+ // was specified.
+ const MCConstantExpr *OffsetImm; // Offset immediate value
+ unsigned OffsetRegNum; // Offset register num, when OffsetImm == NULL
+ ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg
+ unsigned ShiftImm; // shift for OffsetReg.
+ unsigned Alignment; // 0 = no alignment specified
+ // n = alignment in bytes (2, 4, 8, 16, or 32)
+ unsigned isNegative : 1; // Negated OffsetReg? (~'U' bit)
+ };
+
+ struct PostIdxRegOp {
+ unsigned RegNum;
+ bool isAdd;
+ ARM_AM::ShiftOpc ShiftTy;
+ unsigned ShiftImm;
+ };
+
+ struct ShifterImmOp {
+ bool isASR;
+ unsigned Imm;
+ };
+
+ struct RegShiftedRegOp {
+ ARM_AM::ShiftOpc ShiftTy;
+ unsigned SrcReg;
+ unsigned ShiftReg;
+ unsigned ShiftImm;
+ };
+
+ struct RegShiftedImmOp {
+ ARM_AM::ShiftOpc ShiftTy;
+ unsigned SrcReg;
+ unsigned ShiftImm;
+ };
+
+ struct RotImmOp {
+ unsigned Imm;
+ };
+
+ struct ModImmOp {
+ unsigned Bits;
+ unsigned Rot;
+ };
+
+ struct BitfieldOp {
+ unsigned LSB;
+ unsigned Width;
+ };
+
+ union {
+ struct CCOp CC;
+ struct CopOp Cop;
+ struct CoprocOptionOp CoprocOption;
+ struct MBOptOp MBOpt;
+ struct ISBOptOp ISBOpt;
+ struct ITMaskOp ITMask;
+ struct IFlagsOp IFlags;
+ struct MMaskOp MMask;
+ struct BankedRegOp BankedReg;
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct VectorListOp VectorList;
+ struct VectorIndexOp VectorIndex;
+ struct ImmOp Imm;
+ struct MemoryOp Memory;
+ struct PostIdxRegOp PostIdxReg;
+ struct ShifterImmOp ShifterImm;
+ struct RegShiftedRegOp RegShiftedReg;
+ struct RegShiftedImmOp RegShiftedImm;
+ struct RotImmOp RotImm;
+ struct ModImmOp ModImm;
+ struct BitfieldOp Bitfield;
+ };
+
+public:
+ ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
+ SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
+ /// getAlignmentLoc - Get the location of the Alignment token of this operand.
+ SMLoc getAlignmentLoc() const {
+ assert(Kind == k_Memory && "Invalid access!");
+ return AlignmentLoc;
+ }
+
+ ARMCC::CondCodes getCondCode() const {
+ assert(Kind == k_CondCode && "Invalid access!");
+ return CC.Val;
+ }
+
+ unsigned getCoproc() const {
+ assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!");
+ return Cop.Val;
+ }
+
+ StringRef getToken() const {
+ assert(Kind == k_Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getReg() const override {
+ assert((Kind == k_Register || Kind == k_CCOut) && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ const SmallVectorImpl<unsigned> &getRegList() const {
+ assert((Kind == k_RegisterList || Kind == k_DPRRegisterList ||
+ Kind == k_SPRRegisterList) && "Invalid access!");
+ return Registers;
+ }
+
+ const MCExpr *getImm() const {
+ assert(isImm() && "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getConstantPoolImm() const {
+ assert(isConstantPoolImm() && "Invalid access!");
+ return Imm.Val;
+ }
+
+ unsigned getVectorIndex() const {
+ assert(Kind == k_VectorIndex && "Invalid access!");
+ return VectorIndex.Val;
+ }
+
+ ARM_MB::MemBOpt getMemBarrierOpt() const {
+ assert(Kind == k_MemBarrierOpt && "Invalid access!");
+ return MBOpt.Val;
+ }
+
+ ARM_ISB::InstSyncBOpt getInstSyncBarrierOpt() const {
+ assert(Kind == k_InstSyncBarrierOpt && "Invalid access!");
+ return ISBOpt.Val;
+ }
+
+ ARM_PROC::IFlags getProcIFlags() const {
+ assert(Kind == k_ProcIFlags && "Invalid access!");
+ return IFlags.Val;
+ }
+
+ unsigned getMSRMask() const {
+ assert(Kind == k_MSRMask && "Invalid access!");
+ return MMask.Val;
+ }
+
+ unsigned getBankedReg() const {
+ assert(Kind == k_BankedReg && "Invalid access!");
+ return BankedReg.Val;
+ }
+
+ bool isCoprocNum() const { return Kind == k_CoprocNum; }
+ bool isCoprocReg() const { return Kind == k_CoprocReg; }
+ bool isCoprocOption() const { return Kind == k_CoprocOption; }
+ bool isCondCode() const { return Kind == k_CondCode; }
+ bool isCCOut() const { return Kind == k_CCOut; }
+ bool isITMask() const { return Kind == k_ITCondMask; }
+ bool isITCondCode() const { return Kind == k_CondCode; }
+ bool isImm() const override {
+ return Kind == k_Immediate;
+ }
+
+ /// True for an immediate that is either a non-constant expression or a
+ /// constant divisible by 4.
+ bool isARMBranchTarget() const {
+   if (!isImm())
+     return false;
+   const auto *Const = dyn_cast<MCConstantExpr>(getImm());
+   return !Const || Const->getValue() % 4 == 0;
+ }
+
+
+ /// True for an immediate that is either a non-constant expression or a
+ /// constant divisible by 2.
+ bool isThumbBranchTarget() const {
+   if (!isImm())
+     return false;
+   const auto *Const = dyn_cast<MCConstantExpr>(getImm());
+   return !Const || Const->getValue() % 2 == 0;
+ }
+
+ // Checks whether this operand is an unsigned offset that fits in a field
+ // of the specified width and is scaled by the specified number of bits.
+ // A plain symbol reference is always accepted.
+ template<unsigned width, unsigned scale>
+ bool isUnsignedOffset() const {
+   if (!isImm())
+     return false;
+   if (isa<MCSymbolRefExpr>(Imm.Val))
+     return true;
+   const auto *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+   if (!CE)
+     return false;
+   const int64_t Offset = CE->getValue();
+   const int64_t Step = 1LL << scale;
+   const int64_t Limit = Step * ((1LL << width) - 1);
+   if (Offset < 0 || Offset > Limit)
+     return false;
+   return Offset % Step == 0;
+ }
+ // Checks whether this operand is a signed offset that fits in a field of
+ // the specified width and is scaled by the specified number of bits.
+ // A plain symbol reference is always accepted.
+ template<unsigned width, unsigned scale>
+ bool isSignedOffset() const {
+   if (!isImm())
+     return false;
+   if (isa<MCSymbolRefExpr>(Imm.Val))
+     return true;
+   const auto *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+   if (!CE)
+     return false;
+   const int64_t Offset = CE->getValue();
+   const int64_t Step = 1LL << scale;
+   const int64_t Highest = Step * ((1LL << (width-1)) - 1);
+   const int64_t Lowest = -Step * (1LL << (width-1));
+   if (Offset < Lowest || Offset > Highest)
+     return false;
+   return Offset % Step == 0;
+ }
+
+ // Checks whether this operand is a memory operand computed as an offset
+ // applied to PC. Textually it may be either [pc, #imm], #imm or a symbol
+ // reference. A constant offset must be a multiple of 4 in [0, 1020].
+ bool isThumbMemPC() const {
+   int64_t Offset = 0;
+   if (isImm()) {
+     if (isa<MCSymbolRefExpr>(Imm.Val))
+       return true;
+     const auto *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+     if (!CE)
+       return false;
+     Offset = CE->getValue();
+   } else if (isMem()) {
+     // Only "[pc, #imm]" qualifies: immediate offset, no offset register.
+     const bool ImmOffsetOnly = Memory.OffsetImm && !Memory.OffsetRegNum;
+     if (!ImmOffsetOnly || Memory.BaseRegNum != ARM::PC)
+       return false;
+     Offset = Memory.OffsetImm->getValue();
+   } else {
+     return false;
+   }
+   return Offset >= 0 && Offset <= 1020 && (Offset % 4) == 0;
+ }
+ /// True for a constant immediate for which ARM_AM::getFP32Imm, applied to
+ /// its 32-bit APInt form, does not return -1.
+ bool isFPImm() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   if (!CE)
+     return false;
+   return ARM_AM::getFP32Imm(APInt(32, CE->getValue())) != -1;
+ }
+ /// True for a constant immediate in [0, 16].
+ bool isFBits16() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 16;
+ }
+ /// True for a constant immediate in [1, 32].
+ bool isFBits32() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 32;
+ }
+ /// True for a constant immediate in [-1020, 1020] whose low two bits are
+ /// clear.
+ bool isImm8s4() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   if (!CE)
+     return false;
+   const int64_t V = CE->getValue();
+   return (V & 3) == 0 && V >= -1020 && V <= 1020;
+ }
+ /// True for a constant immediate in [0, 1020] whose low two bits are clear.
+ bool isImm0_1020s4() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   if (!CE)
+     return false;
+   const int64_t V = CE->getValue();
+   return (V & 3) == 0 && V >= 0 && V <= 1020;
+ }
+ /// True for a constant immediate in [0, 508] whose low two bits are clear.
+ bool isImm0_508s4() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   if (!CE)
+     return false;
+   const int64_t V = CE->getValue();
+   return (V & 3) == 0 && V >= 0 && V <= 508;
+ }
+ /// True for a constant immediate whose negation is a non-zero multiple of 4
+ /// no larger than 508, i.e. the constant itself is a multiple of 4 in
+ /// [-508, -4].
+ bool isImm0_508s4Neg() const {
+   if (!isImm()) return false;
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+   if (!CE) return false;
+   // Test the un-negated value: negating INT64_MIN would overflow, which is
+   // undefined behaviour for a signed integer. A value is a multiple of 4
+   // exactly when its negation is, so the low-bit test is unchanged.
+   const int64_t Value = CE->getValue();
+   // explicitly exclude zero. we want that to use the normal 0_508 version.
+   return ((Value & 3) == 0) && Value < 0 && Value >= -508;
+ }
+ /// True for a constant immediate in [0, 239].
+ bool isImm0_239() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 239;
+ }
+ /// True for a constant immediate in [0, 255].
+ bool isImm0_255() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 255;
+ }
+ /// True for a constant immediate in [0, 4095].
+ bool isImm0_4095() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 4095;
+ }
+ /// True for a constant immediate whose negation lies in [1, 4095], i.e. the
+ /// constant itself lies in [-4095, -1].
+ bool isImm0_4095Neg() const {
+   if (!isImm()) return false;
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+   if (!CE) return false;
+   // Compare the un-negated value: negating INT64_MIN would overflow, which
+   // is undefined behaviour for a signed integer.
+   const int64_t Value = CE->getValue();
+   return Value < 0 && Value > -4096;
+ }
+ /// True for a constant immediate in [0, 1].
+ bool isImm0_1() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 1;
+ }
+ /// True for a constant immediate in [0, 3].
+ bool isImm0_3() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 3;
+ }
+ /// True for a constant immediate in [0, 7].
+ bool isImm0_7() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 7;
+ }
+ /// True for a constant immediate in [0, 15].
+ bool isImm0_15() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 15;
+ }
+ /// True for a constant immediate in [0, 31].
+ bool isImm0_31() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 31;
+ }
+ /// True for a constant immediate in [0, 63].
+ bool isImm0_63() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 63;
+ }
+ /// True for a constant immediate equal to 8.
+ bool isImm8() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() == 8;
+ }
+ /// True for a constant immediate equal to 16.
+ bool isImm16() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() == 16;
+ }
+ /// True for a constant immediate equal to 32.
+ bool isImm32() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() == 32;
+ }
+ /// True for a constant immediate in [1, 8].
+ bool isShrImm8() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 8;
+ }
+ /// True for a constant immediate in [1, 16].
+ bool isShrImm16() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 16;
+ }
+ /// True for a constant immediate in [1, 32].
+ bool isShrImm32() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 32;
+ }
+ /// True for a constant immediate in [1, 64].
+ bool isShrImm64() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 64;
+ }
+ /// True for a constant immediate in [1, 7].
+ bool isImm1_7() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 7;
+ }
+ /// True for a constant immediate in [1, 15].
+ bool isImm1_15() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 15;
+ }
+ /// True for a constant immediate in [1, 31].
+ bool isImm1_31() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 31;
+ }
+ /// True for a constant immediate in [1, 16].
+ bool isImm1_16() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 16;
+ }
+ /// True for a constant immediate in [1, 32].
+ bool isImm1_32() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 32;
+ }
+ /// True for a constant immediate in [0, 32].
+ bool isImm0_32() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 32;
+ }
+ /// True for a constant immediate in [0, 65535].
+ bool isImm0_65535() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 65535;
+ }
+ /// True for an immediate that is either a non-constant expression or a
+ /// constant in [256, 65535].
+ bool isImm256_65535Expr() const {
+   if (!isImm())
+     return false;
+   const auto *CE = dyn_cast<MCConstantExpr>(getImm());
+   // If it's not a constant expression, it'll generate a fixup and be
+   // handled later.
+   return !CE || (CE->getValue() >= 256 && CE->getValue() <= 65535);
+ }
+ /// True for an immediate that is either a non-constant expression or a
+ /// constant in [0, 65535].
+ bool isImm0_65535Expr() const {
+   if (!isImm())
+     return false;
+   const auto *CE = dyn_cast<MCConstantExpr>(getImm());
+   // If it's not a constant expression, it'll generate a fixup and be
+   // handled later.
+   return !CE || (CE->getValue() >= 0 && CE->getValue() <= 65535);
+ }
+ /// True for a constant immediate in [0, 0xffffff].
+ bool isImm24bit() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 0xffffff;
+ }
+ /// True for a constant immediate in [1, 32].
+ bool isImmThumbSR() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 32;
+ }
+ /// True for a constant immediate in [0, 31].
+ bool isPKHLSLImm() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 0 && CE->getValue() <= 31;
+ }
+ /// True for a constant immediate in [1, 32].
+ bool isPKHASRImm() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && CE->getValue() >= 1 && CE->getValue() <= 32;
+ }
+ /// True for an immediate that is either a non-constant expression or a
+ /// constant for which ARM_AM::getSOImmVal accepts the value or its
+ /// negation.
+ bool isAdrLabel() const {
+   if (!isImm()) return false;
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+   // If we have an immediate that's not a constant, treat it as a label
+   // reference needing a fixup.
+   if (!CE) return true;
+
+   // If it is a constant, it must fit into a modified immediate encoding.
+   const int64_t Value = CE->getValue();
+   // Negate in unsigned arithmetic: "-Value" on the signed type overflows
+   // (undefined behaviour) when Value is INT64_MIN. The bit pattern handed
+   // to getSOImmVal is the same for every other value.
+   const uint64_t Negated = 0 - static_cast<uint64_t>(Value);
+   return (ARM_AM::getSOImmVal(Value) != -1 ||
+           ARM_AM::getSOImmVal(Negated) != -1);
+ }
+ /// True for a constant immediate that ARM_AM::getT2SOImmVal accepts.
+ bool isT2SOImm() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   if (!CE)
+     return false;
+   return ARM_AM::getT2SOImmVal(CE->getValue()) != -1;
+ }
+ /// True for a constant immediate that ARM_AM::getT2SOImmVal rejects as-is
+ /// but accepts once bitwise inverted.
+ bool isT2SOImmNot() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   if (!CE)
+     return false;
+   const int64_t V = CE->getValue();
+   if (ARM_AM::getT2SOImmVal(V) != -1)
+     return false;
+   return ARM_AM::getT2SOImmVal(~V) != -1;
+ }
+ /// True for a constant immediate that ARM_AM::getT2SOImmVal rejects as-is
+ /// but accepts once negated.
+ bool isT2SOImmNeg() const {
+   if (!isImm()) return false;
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+   if (!CE) return false;
+   const int64_t Value = CE->getValue();
+   // Negate in unsigned arithmetic: "-Value" on the signed type overflows
+   // (undefined behaviour) when Value is INT64_MIN. The bit pattern handed
+   // to getT2SOImmVal is the same for every other value.
+   const uint64_t Negated = 0 - static_cast<uint64_t>(Value);
+   // Only use this when not representable as a plain so_imm.
+   return ARM_AM::getT2SOImmVal(Value) == -1 &&
+          ARM_AM::getT2SOImmVal(Negated) != -1;
+ }
+ /// True for a constant immediate equal to 0 or 1.
+ bool isSetEndImm() const {
+   const auto *CE = isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
+   return CE && (CE->getValue() == 1 || CE->getValue() == 0);
+ }
+ // Operand-kind predicates: each returns true exactly when Kind equals the
+ // k_* enumerator it names. Those marked `override` replace a virtual
+ // declared in a base class that is not visible in this chunk.
+ bool isReg() const override { return Kind == k_Register; }
+ bool isRegList() const { return Kind == k_RegisterList; }
+ bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
+ bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
+ bool isToken() const override { return Kind == k_Token; }
+ bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
+ bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
+ bool isMem() const override { return Kind == k_Memory; }
+ bool isShifterImm() const { return Kind == k_ShifterImmediate; }
+ bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
+ bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
+ bool isRotImm() const { return Kind == k_RotateImmediate; }
+ bool isModImm() const { return Kind == k_ModifiedImmediate; }
+ bool isModImmNot() const {
+   // A constant whose bitwise complement is accepted by
+   // ARM_AM::getSOImmVal (i.e. the call does not return -1).
+   if (!isImm())
+     return false;
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm())) {
+     const int64_t Imm = ConstExpr->getValue();
+     return ARM_AM::getSOImmVal(~Imm) != -1;
+   }
+   return false;
+ }
+ bool isModImmNeg() const {
+   // A constant that getSOImmVal rejects as written but accepts once it has
+   // been arithmetically negated.
+   if (!isImm())
+     return false;
+   const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm());
+   if (ConstExpr == nullptr)
+     return false;
+   const int64_t Imm = ConstExpr->getValue();
+   if (ARM_AM::getSOImmVal(Imm) != -1)
+     return false;
+   return ARM_AM::getSOImmVal(-Imm) != -1;
+ }
+ // Further operand-kind predicates; each compares Kind against a single
+ // k_* enumerator.
+ bool isConstantPoolImm() const { return Kind == k_ConstantPoolImmediate; }
+ bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
+ bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
+ bool isPostIdxReg() const {
+   // A post-index register operand that carries no shift.
+   if (Kind != k_PostIndexRegister)
+     return false;
+   return PostIdxReg.ShiftTy == ARM_AM::no_shift;
+ }
+ bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const {
+   // A memory operand with neither a register offset nor an immediate
+   // offset. Unless alignOK is set, the recorded alignment must also equal
+   // the requested Alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.OffsetImm != nullptr)
+     return false;
+   if (alignOK)
+     return true;
+   return Memory.Alignment == Alignment;
+ }
+ bool isMemPCRelImm12() const {
+   // Memory operand with PC as base, no offset register and no alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+     return false;
+   if (Memory.BaseRegNum != ARM::PC)
+     return false;
+   // An absent immediate is fine; a present one must lie in [-4095, 4095]
+   // or be the INT32_MIN sentinel.
+   if (Memory.OffsetImm == nullptr)
+     return true;
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   return Offset == INT32_MIN || (Offset >= -4095 && Offset <= 4095);
+ }
+ // Offset-free memory operand; the alignment value is not checked.
+ bool isAlignedMemory() const { return isMemNoOffset(/*alignOK=*/true); }
+ // Offset-free memory operand whose recorded alignment is 0.
+ bool isAlignedMemoryNone() const {
+   const unsigned NoAlignment = 0;
+   return isMemNoOffset(/*alignOK=*/false, NoAlignment);
+ }
+ // Offset-free memory operand whose recorded alignment is 0.
+ bool isDupAlignedMemoryNone() const {
+   const unsigned NoAlignment = 0;
+   return isMemNoOffset(/*alignOK=*/false, NoAlignment);
+ }
+ bool isAlignedMemory16() const {
+   // Offset-free memory operand with alignment 2 (bytes, i.e. 16 bits) or
+   // with no alignment recorded.
+   return isMemNoOffset(false, 2) || isMemNoOffset(false, 0);
+ }
+ bool isDupAlignedMemory16() const {
+   // Offset-free memory operand with alignment 2 (bytes, i.e. 16 bits) or
+   // with no alignment recorded.
+   return isMemNoOffset(false, 2) || isMemNoOffset(false, 0);
+ }
+ bool isAlignedMemory32() const {
+   // Offset-free memory operand with alignment 4 (bytes, i.e. 32 bits) or
+   // with no alignment recorded.
+   return isMemNoOffset(false, 4) || isMemNoOffset(false, 0);
+ }
+ bool isDupAlignedMemory32() const {
+   // Offset-free memory operand with alignment 4 (bytes, i.e. 32 bits) or
+   // with no alignment recorded.
+   return isMemNoOffset(false, 4) || isMemNoOffset(false, 0);
+ }
+ bool isAlignedMemory64() const {
+   // Offset-free memory operand with alignment 8 (bytes, i.e. 64 bits) or
+   // with no alignment recorded.
+   return isMemNoOffset(false, 8) || isMemNoOffset(false, 0);
+ }
+ bool isDupAlignedMemory64() const {
+   // Offset-free memory operand with alignment 8 (bytes, i.e. 64 bits) or
+   // with no alignment recorded.
+   return isMemNoOffset(false, 8) || isMemNoOffset(false, 0);
+ }
+ bool isAlignedMemory64or128() const {
+   // Offset-free memory operand aligned to 8 bytes (64 bits) or 16 bytes
+   // (128 bits), or with no alignment recorded.
+   return isMemNoOffset(false, 8) || isMemNoOffset(false, 16) ||
+          isMemNoOffset(false, 0);
+ }
+ bool isDupAlignedMemory64or128() const {
+   // Offset-free memory operand aligned to 8 bytes (64 bits) or 16 bytes
+   // (128 bits), or with no alignment recorded.
+   return isMemNoOffset(false, 8) || isMemNoOffset(false, 16) ||
+          isMemNoOffset(false, 0);
+ }
+ bool isAlignedMemory64or128or256() const {
+   // Offset-free memory operand aligned to 8, 16 or 32 bytes (64, 128 or
+   // 256 bits), or with no alignment recorded.
+   return isMemNoOffset(false, 8) || isMemNoOffset(false, 16) ||
+          isMemNoOffset(false, 32) || isMemNoOffset(false, 0);
+ }
+ bool isAddrMode2() const {
+   // Requires a memory operand with no alignment recorded.
+   if (!isMem())
+     return false;
+   if (Memory.Alignment != 0)
+     return false;
+   // A register offset, or no offset at all, is always acceptable.
+   if (Memory.OffsetRegNum || !Memory.OffsetImm)
+     return true;
+   // Otherwise the immediate offset must lie in [-4095, 4095].
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   return Offset >= -4095 && Offset <= 4095;
+ }
+ bool isAM2OffsetImm() const {
+   // Constant immediate in [-4095, 4095], or the INT32_MIN sentinel.
+   if (!isImm())
+     return false;
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm())) {
+     const int64_t Offset = ConstExpr->getValue();
+     if (Offset == INT32_MIN)
+       return true;
+     return Offset >= -4095 && Offset <= 4095;
+   }
+   return false;
+ }
+ bool isAddrMode3() const {
+   // A non-constant immediate is taken to be a label reference that will
+   // need a fixup, so it matches. A constant immediate is rejected further
+   // down because it is not a memory operand.
+   if (isImm() && !isa<MCConstantExpr>(getImm()))
+     return true;
+   if (!isMem())
+     return false;
+   // No alignment and no shift of any kind are permitted for AM3.
+   if (Memory.Alignment != 0 || Memory.ShiftType != ARM_AM::no_shift)
+     return false;
+   // A register offset, or no offset at all, is acceptable.
+   if (Memory.OffsetRegNum || !Memory.OffsetImm)
+     return true;
+   // Immediate offsets must lie in [-255, 255]; #-0 is represented as
+   // INT32_MIN and is accepted as well.
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   return Offset == INT32_MIN || (Offset >= -255 && Offset <= 255);
+ }
+ bool isAM3Offset() const {
+   // A post-index register qualifies when it carries no shift.
+   if (Kind == k_PostIndexRegister)
+     return PostIdxReg.ShiftTy == ARM_AM::no_shift;
+   // The only other accepted kind is an immediate.
+   if (Kind != k_Immediate)
+     return false;
+   const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm());
+   if (ConstExpr == nullptr)
+     return false;
+   // Constant must lie in [-255, 255]; #-0 is represented as INT32_MIN.
+   const int64_t Offset = ConstExpr->getValue();
+   return Offset == INT32_MIN || (Offset >= -255 && Offset <= 255);
+ }
+ bool isAddrMode5() const {
+   // A non-constant immediate is taken to be a label reference that will
+   // need a fixup, so it matches. A constant immediate is rejected further
+   // down because it is not a memory operand.
+   if (isImm() && !isa<MCConstantExpr>(getImm()))
+     return true;
+   if (!isMem())
+     return false;
+   // Neither an alignment nor a register offset is allowed.
+   if (Memory.Alignment != 0 || Memory.OffsetRegNum)
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // Immediate must be a multiple of 4 in [-1020, 1020], or INT32_MIN.
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset == INT32_MIN)
+     return true;
+   return (Offset & 3) == 0 && Offset >= -1020 && Offset <= 1020;
+ }
+ bool isAddrMode5FP16() const {
+   // A non-constant immediate is taken to be a label reference that will
+   // need a fixup, so it matches. A constant immediate is rejected further
+   // down because it is not a memory operand.
+   if (isImm() && !isa<MCConstantExpr>(getImm()))
+     return true;
+   if (!isMem())
+     return false;
+   // Neither an alignment nor a register offset is allowed.
+   if (Memory.Alignment != 0 || Memory.OffsetRegNum)
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // Immediate must be a multiple of 2 in [-510, 510], or INT32_MIN.
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset == INT32_MIN)
+     return true;
+   return (Offset & 1) == 0 && Offset >= -510 && Offset <= 510;
+ }
+ bool isMemTBB() const {
+   // Memory operand with a non-negated, unshifted register offset and no
+   // alignment.
+   return isMem() && Memory.OffsetRegNum && !Memory.isNegative &&
+          Memory.ShiftType == ARM_AM::no_shift && Memory.Alignment == 0;
+ }
+ bool isMemTBH() const {
+   // Memory operand with a non-negated register offset shifted by exactly
+   // "lsl #1", and no alignment.
+   return isMem() && Memory.OffsetRegNum && !Memory.isNegative &&
+          Memory.ShiftType == ARM_AM::lsl && Memory.ShiftImm == 1 &&
+          Memory.Alignment == 0;
+ }
+ bool isMemRegOffset() const {
+   // Memory operand that has an offset register and no alignment.
+   return isMem() && Memory.OffsetRegNum && Memory.Alignment == 0;
+ }
+ bool isT2MemRegOffset() const {
+   if (!isMem())
+     return false;
+   // Needs a non-negated offset register, no alignment, and a base
+   // register other than PC.
+   if (!Memory.OffsetRegNum || Memory.isNegative)
+     return false;
+   if (Memory.Alignment != 0 || Memory.BaseRegNum == ARM::PC)
+     return false;
+   // The offset register may be unshifted or shifted by lsl #0..#3.
+   return Memory.ShiftType == ARM_AM::no_shift ||
+          (Memory.ShiftType == ARM_AM::lsl && Memory.ShiftImm <= 3);
+ }
+ bool isMemThumbRR() const {
+   // Thumb reg+reg form: a base register plus an offset register, with no
+   // negation, no shift and no alignment.
+   if (!isMem())
+     return false;
+   if (!Memory.OffsetRegNum || Memory.isNegative)
+     return false;
+   if (Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+     return false;
+   // The offset register is known to be present here, so both registers
+   // simply have to be low registers.
+   return isARMLowRegister(Memory.BaseRegNum) &&
+          isARMLowRegister(Memory.OffsetRegNum);
+ }
+ bool isMemThumbRIs4() const {
+   // Low base register, no offset register, no alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
+       !isARMLowRegister(Memory.BaseRegNum))
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // The immediate must be a multiple of 4 within [0, 124].
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset < 0 || Offset > 124)
+     return false;
+   return (Offset % 4) == 0;
+ }
+ bool isMemThumbRIs2() const {
+   // Low base register, no offset register, no alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
+       !isARMLowRegister(Memory.BaseRegNum))
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // The immediate must be a multiple of 2 within [0, 62].
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset < 0 || Offset > 62)
+     return false;
+   return (Offset % 2) == 0;
+ }
+ bool isMemThumbRIs1() const {
+   // Low base register, no offset register, no alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
+       !isARMLowRegister(Memory.BaseRegNum))
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // The immediate must lie within [0, 31].
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   return !(Offset < 0 || Offset > 31);
+ }
+ bool isMemThumbSPI() const {
+   // SP as base register, no offset register, no alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
+       Memory.BaseRegNum != ARM::SP)
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // The immediate must be a multiple of 4 within [0, 1020].
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset < 0 || Offset > 1020)
+     return false;
+   return (Offset % 4) == 0;
+ }
+ bool isMemImm8s4Offset() const {
+   // A non-constant immediate is taken to be a label reference that will
+   // need a fixup, so it matches. A constant immediate is rejected further
+   // down because it is not a memory operand.
+   if (isImm() && !isa<MCConstantExpr>(getImm()))
+     return true;
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // Multiple of 4 in [-1020, 1020]; #-0 is represented as INT32_MIN.
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset == INT32_MIN)
+     return true;
+   return (Offset & 3) == 0 && Offset >= -1020 && Offset <= 1020;
+ }
+ bool isMemImm0_1020s4Offset() const {
+   // Memory operand without an offset register or alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // The immediate must be a multiple of 4 within [0, 1020].
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset < 0 || Offset > 1020)
+     return false;
+   return (Offset & 3) == 0;
+ }
+ bool isMemImm8Offset() const {
+   // Memory operand without an offset register or alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+     return false;
+   // PC is not permitted as the base register for these encodings.
+   if (Memory.BaseRegNum == ARM::PC)
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // Immediate in [-255, 255], or the INT32_MIN sentinel.
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset == INT32_MIN)
+     return true;
+   return Offset >= -255 && Offset <= 255;
+ }
+ bool isMemPosImm8Offset() const {
+   // Memory operand without an offset register or alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // The immediate must lie within [0, 255].
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   return !(Offset < 0 || Offset > 255);
+ }
+ bool isMemNegImm8Offset() const {
+   // Memory operand without an offset register or alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+     return false;
+   // PC is not permitted as the base register for these encodings.
+   if (Memory.BaseRegNum == ARM::PC)
+     return false;
+   // Unlike the other forms, an absent immediate does not match.
+   if (!Memory.OffsetImm)
+     return false;
+   // Immediate in [-255, -1], or the INT32_MIN sentinel.
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset == INT32_MIN)
+     return true;
+   return Offset >= -255 && Offset <= -1;
+ }
+ bool isMemUImm12Offset() const {
+   // Memory operand without an offset register or alignment.
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // The immediate must lie within [0, 4095].
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   return !(Offset < 0 || Offset > 4095);
+ }
+ bool isMemImm12Offset() const {
+   // A non-constant immediate is taken to be a label reference that will
+   // need a fixup, so it matches. A constant immediate is rejected further
+   // down because it is not a memory operand.
+   if (isImm() && !isa<MCConstantExpr>(getImm()))
+     return true;
+   if (!isMem())
+     return false;
+   if (Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+     return false;
+   if (!Memory.OffsetImm)
+     return true;
+   // Immediate in [-4095, 4095], or the INT32_MIN sentinel.
+   const int64_t Offset = Memory.OffsetImm->getValue();
+   if (Offset == INT32_MIN)
+     return true;
+   return Offset >= -4095 && Offset <= 4095;
+ }
+ bool isConstPoolAsmImm() const {
+   // Matches constant-pool immediates only, and nothing else; their
+   // processing is deferred and they later turn into a constant.
+   return Kind == k_ConstantPoolImmediate;
+ }
+ bool isPostIdxImm8() const {
+   // Constant immediate in [-255, 255], or the INT32_MIN sentinel.
+   if (!isImm())
+     return false;
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm())) {
+     const int64_t Offset = ConstExpr->getValue();
+     if (Offset == INT32_MIN)
+       return true;
+     return Offset >= -255 && Offset <= 255;
+   }
+   return false;
+ }
+ bool isPostIdxImm8s4() const {
+   // Constant immediate that is a multiple of 4 in [-1020, 1020], or the
+   // INT32_MIN sentinel.
+   if (!isImm())
+     return false;
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm())) {
+     const int64_t Offset = ConstExpr->getValue();
+     if (Offset == INT32_MIN)
+       return true;
+     return Offset >= -1020 && Offset <= 1020 && (Offset & 3) == 0;
+   }
+   return false;
+ }
+
+ // Operand-kind predicates; each compares Kind against a single k_*
+ // enumerator.
+ bool isMSRMask() const { return Kind == k_MSRMask; }
+ bool isBankedReg() const { return Kind == k_BankedReg; }
+ bool isProcIFlags() const { return Kind == k_ProcIFlags; }
+
+ // NEON operands.
+ bool isSingleSpacedVectorList() const {
+   // A plain vector list whose double-spaced flag is clear.
+   if (Kind != k_VectorList)
+     return false;
+   return !VectorList.isDoubleSpaced;
+ }
+ bool isDoubleSpacedVectorList() const {
+   // A plain vector list whose double-spaced flag is set.
+   if (Kind != k_VectorList)
+     return false;
+   return VectorList.isDoubleSpaced;
+ }
+ bool isVecListOneD() const {
+   // Single-spaced vector list holding exactly one register.
+   return isSingleSpacedVectorList() && VectorList.Count == 1;
+ }
+
+ bool isVecListDPair() const {
+   // Single-spaced vector list whose register belongs to the DPair
+   // register class.
+   return isSingleSpacedVectorList() &&
+          ARMMCRegisterClasses[ARM::DPairRegClassID].contains(
+              VectorList.RegNum);
+ }
+
+ bool isVecListThreeD() const {
+   // Single-spaced vector list holding exactly three registers.
+   return isSingleSpacedVectorList() && VectorList.Count == 3;
+ }
+
+ bool isVecListFourD() const {
+   // Single-spaced vector list holding exactly four registers.
+   return isSingleSpacedVectorList() && VectorList.Count == 4;
+ }
+
+ bool isVecListDPairSpaced() const {
+   // A vector list that is not single-spaced and whose register belongs to
+   // the DPairSpc register class.
+   if (Kind != k_VectorList || isSingleSpacedVectorList())
+     return false;
+   const bool InDPairSpc =
+       ARMMCRegisterClasses[ARM::DPairSpcRegClassID].contains(
+           VectorList.RegNum);
+   return InDPairSpc;
+ }
+
+ bool isVecListThreeQ() const {
+   // Double-spaced vector list holding exactly three registers.
+   return isDoubleSpacedVectorList() && VectorList.Count == 3;
+ }
+
+ bool isVecListFourQ() const {
+   // Double-spaced vector list holding exactly four registers.
+   return isDoubleSpacedVectorList() && VectorList.Count == 4;
+ }
+
+ bool isSingleSpacedVectorAllLanes() const {
+   // An all-lanes vector list whose double-spaced flag is clear.
+   if (Kind != k_VectorListAllLanes)
+     return false;
+   return !VectorList.isDoubleSpaced;
+ }
+ bool isDoubleSpacedVectorAllLanes() const {
+   // An all-lanes vector list whose double-spaced flag is set.
+   if (Kind != k_VectorListAllLanes)
+     return false;
+   return VectorList.isDoubleSpaced;
+ }
+ bool isVecListOneDAllLanes() const {
+   // Single-spaced all-lanes list holding exactly one register.
+   return isSingleSpacedVectorAllLanes() && VectorList.Count == 1;
+ }
+
+ bool isVecListDPairAllLanes() const {
+   // Single-spaced all-lanes list whose register belongs to the DPair
+   // register class.
+   return isSingleSpacedVectorAllLanes() &&
+          ARMMCRegisterClasses[ARM::DPairRegClassID].contains(
+              VectorList.RegNum);
+ }
+
+ bool isVecListDPairSpacedAllLanes() const {
+   // Double-spaced all-lanes list holding exactly two registers.
+   return isDoubleSpacedVectorAllLanes() && VectorList.Count == 2;
+ }
+
+ bool isVecListThreeDAllLanes() const {
+   // Single-spaced all-lanes list holding exactly three registers.
+   return isSingleSpacedVectorAllLanes() && VectorList.Count == 3;
+ }
+
+ bool isVecListThreeQAllLanes() const {
+   // Double-spaced all-lanes list holding exactly three registers.
+   return isDoubleSpacedVectorAllLanes() && VectorList.Count == 3;
+ }
+
+ bool isVecListFourDAllLanes() const {
+   // Single-spaced all-lanes list holding exactly four registers.
+   return isSingleSpacedVectorAllLanes() && VectorList.Count == 4;
+ }
+
+ bool isVecListFourQAllLanes() const {
+   // Double-spaced all-lanes list holding exactly four registers.
+   return isDoubleSpacedVectorAllLanes() && VectorList.Count == 4;
+ }
+
+ bool isSingleSpacedVectorIndexed() const {
+   // An indexed vector list whose double-spaced flag is clear.
+   if (Kind != k_VectorListIndexed)
+     return false;
+   return !VectorList.isDoubleSpaced;
+ }
+ bool isDoubleSpacedVectorIndexed() const {
+   // An indexed vector list whose double-spaced flag is set.
+   if (Kind != k_VectorListIndexed)
+     return false;
+   return VectorList.isDoubleSpaced;
+ }
+ bool isVecListOneDByteIndexed() const {
+   // Single-spaced indexed list of one register, lane index at most 7.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 1 &&
+          VectorList.LaneIndex <= 7;
+ }
+
+ bool isVecListOneDHWordIndexed() const {
+   // Single-spaced indexed list of one register, lane index at most 3.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 1 &&
+          VectorList.LaneIndex <= 3;
+ }
+
+ bool isVecListOneDWordIndexed() const {
+   // Single-spaced indexed list of one register, lane index at most 1.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 1 &&
+          VectorList.LaneIndex <= 1;
+ }
+
+ bool isVecListTwoDByteIndexed() const {
+   // Single-spaced indexed list of two registers, lane index at most 7.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 2 &&
+          VectorList.LaneIndex <= 7;
+ }
+
+ bool isVecListTwoDHWordIndexed() const {
+   // Single-spaced indexed list of two registers, lane index at most 3.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 2 &&
+          VectorList.LaneIndex <= 3;
+ }
+
+ bool isVecListTwoQWordIndexed() const {
+   // Double-spaced indexed list of two registers, lane index at most 1.
+   return isDoubleSpacedVectorIndexed() && VectorList.Count == 2 &&
+          VectorList.LaneIndex <= 1;
+ }
+
+ bool isVecListTwoQHWordIndexed() const {
+   // Double-spaced indexed list of two registers, lane index at most 3.
+   return isDoubleSpacedVectorIndexed() && VectorList.Count == 2 &&
+          VectorList.LaneIndex <= 3;
+ }
+
+ bool isVecListTwoDWordIndexed() const {
+   // Single-spaced indexed list of two registers, lane index at most 1.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 2 &&
+          VectorList.LaneIndex <= 1;
+ }
+
+ bool isVecListThreeDByteIndexed() const {
+   // Single-spaced indexed list of three registers, lane index at most 7.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 3 &&
+          VectorList.LaneIndex <= 7;
+ }
+
+ bool isVecListThreeDHWordIndexed() const {
+   // Single-spaced indexed list of three registers, lane index at most 3.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 3 &&
+          VectorList.LaneIndex <= 3;
+ }
+
+ bool isVecListThreeQWordIndexed() const {
+   // Double-spaced indexed list of three registers, lane index at most 1.
+   return isDoubleSpacedVectorIndexed() && VectorList.Count == 3 &&
+          VectorList.LaneIndex <= 1;
+ }
+
+ bool isVecListThreeQHWordIndexed() const {
+   // Double-spaced indexed list of three registers, lane index at most 3.
+   return isDoubleSpacedVectorIndexed() && VectorList.Count == 3 &&
+          VectorList.LaneIndex <= 3;
+ }
+
+ bool isVecListThreeDWordIndexed() const {
+   // Single-spaced indexed list of three registers, lane index at most 1.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 3 &&
+          VectorList.LaneIndex <= 1;
+ }
+
+ bool isVecListFourDByteIndexed() const {
+   // Single-spaced indexed list of four registers, lane index at most 7.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 4 &&
+          VectorList.LaneIndex <= 7;
+ }
+
+ bool isVecListFourDHWordIndexed() const {
+   // Single-spaced indexed list of four registers, lane index at most 3.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 4 &&
+          VectorList.LaneIndex <= 3;
+ }
+
+ bool isVecListFourQWordIndexed() const {
+   // Double-spaced indexed list of four registers, lane index at most 1.
+   return isDoubleSpacedVectorIndexed() && VectorList.Count == 4 &&
+          VectorList.LaneIndex <= 1;
+ }
+
+ bool isVecListFourQHWordIndexed() const {
+   // Double-spaced indexed list of four registers, lane index at most 3.
+   return isDoubleSpacedVectorIndexed() && VectorList.Count == 4 &&
+          VectorList.LaneIndex <= 3;
+ }
+
+ bool isVecListFourDWordIndexed() const {
+   // Single-spaced indexed list of four registers, lane index at most 1.
+   return isSingleSpacedVectorIndexed() && VectorList.Count == 4 &&
+          VectorList.LaneIndex <= 1;
+ }
+
+ bool isVectorIndex8() const {
+   // Vector index operand whose value is below 8.
+   return Kind == k_VectorIndex && VectorIndex.Val < 8;
+ }
+ bool isVectorIndex16() const {
+   // Vector index operand whose value is below 4.
+   return Kind == k_VectorIndex && VectorIndex.Val < 4;
+ }
+ bool isVectorIndex32() const {
+   // Vector index operand whose value is below 2.
+   return Kind == k_VectorIndex && VectorIndex.Val < 2;
+ }
+
+ bool isNEONi8splat() const {
+   // The immediate must be a constant; for an i8 splat it is just the byte
+   // value itself, so it has to lie in [0, 255].
+   if (!isImm())
+     return false;
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm())) {
+     const int64_t Byte = ConstExpr->getValue();
+     return !(Byte < 0 || Byte > 255);
+   }
+   return false;
+ }
+
+ bool isNEONi16splat() const {
+   // Values made of one byte repeated twice are left to the
+   // byte-replicate operand class, so they are refused here.
+   if (isNEONByteReplicate(2))
+     return false;
+   if (!isImm())
+     return false;
+   // Only constants can be checked.
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm())) {
+     const unsigned Imm = ConstExpr->getValue();
+     return ARM_AM::isNEONi16splat(Imm);
+   }
+   return false;
+ }
+
+ bool isNEONi16splatNot() const {
+   // Constant whose complemented low 16 bits pass ARM_AM::isNEONi16splat.
+   if (!isImm())
+     return false;
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm())) {
+     const unsigned Imm = ConstExpr->getValue();
+     const unsigned Inverted = ~Imm & 0xffff;
+     return ARM_AM::isNEONi16splat(Inverted);
+   }
+   return false;
+ }
+
+ bool isNEONi32splat() const {
+   // Values made of one byte repeated four times are left to the
+   // byte-replicate operand class, so they are refused here.
+   if (isNEONByteReplicate(4))
+     return false;
+   if (!isImm())
+     return false;
+   // Only constants can be checked.
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm())) {
+     const unsigned Imm = ConstExpr->getValue();
+     return ARM_AM::isNEONi32splat(Imm);
+   }
+   return false;
+ }
+
+ bool isNEONi32splatNot() const {
+   // Constant whose bitwise complement (as an unsigned value) passes
+   // ARM_AM::isNEONi32splat.
+   if (!isImm())
+     return false;
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm())) {
+     const unsigned Imm = ConstExpr->getValue();
+     const unsigned Inverted = ~Imm;
+     return ARM_AM::isNEONi32splat(Inverted);
+   }
+   return false;
+ }
+
+ bool isNEONByteReplicate(unsigned NumBytes) const {
+   // True when the operand is a non-zero constant whose lowest NumBytes
+   // bytes are all equal to one another.
+   if (!isImm())
+     return false;
+   const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm());
+   if (ConstExpr == nullptr)
+     return false; // Only constants can be inspected.
+   int64_t Remaining = ConstExpr->getValue();
+   if (Remaining == 0)
+     return false; // Zero is deliberately not treated as a replicate.
+
+   const unsigned char LowByte = Remaining & 0xff;
+   // Compare each of the following NumBytes - 1 bytes against the lowest.
+   for (unsigned Left = NumBytes; Left > 1; --Left) {
+     Remaining >>= 8;
+     if ((Remaining & 0xff) != LowByte)
+       return false;
+   }
+   return true;
+ }
+ // Fixed-width wrappers around isNEONByteReplicate for 2 and 4 bytes.
+ bool isNEONi16ByteReplicate() const { return isNEONByteReplicate(2); }
+ bool isNEONi32ByteReplicate() const { return isNEONByteReplicate(4); }
+ bool isNEONi32vmov() const {
+   // Byte-replicated values are classified by the byte-replicate operand
+   // class instead, so refuse them here.
+   if (isNEONByteReplicate(4))
+     return false;
+   if (!isImm())
+     return false;
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+   // Must be a constant.
+   if (!CE)
+     return false;
+   const uint64_t Value = CE->getValue();
+   // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+   // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+   //
+   // Each form is tested with an exact mask. The previous range checks
+   // (e.g. 0x0100 <= Value <= 0xff00) also let through values such as
+   // 0x0101 that have bits set in more than one byte, which contradicts
+   // the contract stated above.
+   return (Value & 0xffffffffffffff00ULL) == 0 ||      // 000X
+          (Value & 0xffffffffffff00ffULL) == 0 ||      // 00X0
+          (Value & 0xffffffffff00ffffULL) == 0 ||      // 0X00
+          (Value & 0xffffffff00ffffffULL) == 0 ||      // X000
+          (Value & 0xffffffffffff00ffULL) == 0xff ||   // 00Xf
+          (Value & 0xffffffffff00ffffULL) == 0xffff;   // 0Xff
+ }
+ bool isNEONi32vmovNeg() const {
+   if (!isImm())
+     return false;
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+   // Must be a constant.
+   if (!CE)
+     return false;
+   // The test is applied to the bitwise complement of the written value.
+   const uint64_t Value = ~CE->getValue();
+   // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+   // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+   //
+   // Each form is tested with an exact mask. The previous range checks
+   // (e.g. 0x0100 <= Value <= 0xff00) also let through values such as
+   // 0x0101 that have bits set in more than one byte, which contradicts
+   // the contract stated above.
+   return (Value & 0xffffffffffffff00ULL) == 0 ||      // 000X
+          (Value & 0xffffffffffff00ffULL) == 0 ||      // 00X0
+          (Value & 0xffffffffff00ffffULL) == 0 ||      // 0X00
+          (Value & 0xffffffff00ffffffULL) == 0 ||      // X000
+          (Value & 0xffffffffffff00ffULL) == 0xff ||   // 00Xf
+          (Value & 0xffffffffff00ffffULL) == 0xffff;   // 0Xff
+ }
+
+ bool isNEONi64splat() const {
+   // Constant 64-bit value in which every byte is either 0x00 or 0xff.
+   if (!isImm())
+     return false;
+   const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm());
+   if (ConstExpr == nullptr)
+     return false; // Only constants can be inspected.
+   uint64_t Bits = ConstExpr->getValue();
+   for (unsigned ByteNo = 0; ByteNo != 8; ++ByteNo) {
+     const uint64_t Byte = Bits & 0xff;
+     if (Byte != 0 && Byte != 0xff)
+       return false;
+     Bits >>= 8;
+   }
+   return true;
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+   // A null expression stands for the immediate 0.
+   if (!Expr) {
+     Inst.addOperand(MCOperand::createImm(0));
+     return;
+   }
+   // Constants are folded into an immediate operand.
+   if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Expr)) {
+     Inst.addOperand(MCOperand::createImm(ConstExpr->getValue()));
+     return;
+   }
+   // Anything else is kept as an expression operand.
+   Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ void addARMBranchTargetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The target is appended via addExpr (immediate when constant,
+   // expression otherwise).
+   const MCExpr *Target = getImm();
+   addExpr(Inst, Target);
+ }
+
+ void addThumbBranchTargetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The target is appended via addExpr (immediate when constant,
+   // expression otherwise).
+   const MCExpr *Target = getImm();
+   addExpr(Inst, Target);
+ }
+
+ void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   // First operand: the condition code as an immediate.
+   Inst.addOperand(MCOperand::createImm(unsigned(getCondCode())));
+   // Second operand: register 0 for the AL condition, CPSR for any other.
+   unsigned FlagsReg = ARM::CPSR;
+   if (getCondCode() == ARMCC::AL)
+     FlagsReg = 0;
+   Inst.addOperand(MCOperand::createReg(FlagsReg));
+ }
+
+ void addCoprocNumOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The value from getCoproc() is appended as an immediate operand.
+   const MCOperand CoprocOp = MCOperand::createImm(getCoproc());
+   Inst.addOperand(CoprocOp);
+ }
+
+ void addCoprocRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The value from getCoproc() is appended as an immediate operand.
+   const MCOperand CoprocOp = MCOperand::createImm(getCoproc());
+   Inst.addOperand(CoprocOp);
+ }
+
+ void addCoprocOptionOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The stored coprocessor option value is appended as an immediate.
+   const MCOperand OptionOp = MCOperand::createImm(CoprocOption.Val);
+   Inst.addOperand(OptionOp);
+ }
+
+ void addITMaskOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The stored IT mask is appended as an immediate operand.
+   const MCOperand MaskOp = MCOperand::createImm(ITMask.Mask);
+   Inst.addOperand(MaskOp);
+ }
+
+ void addITCondCodeOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // Only the condition code itself is appended, as an immediate.
+   const MCOperand CondOp = MCOperand::createImm(unsigned(getCondCode()));
+   Inst.addOperand(CondOp);
+ }
+
+ void addCCOutOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The register from getReg() is appended as a register operand.
+   const MCOperand RegOp = MCOperand::createReg(getReg());
+   Inst.addOperand(RegOp);
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The register from getReg() is appended as a register operand.
+   const MCOperand RegOp = MCOperand::createReg(getReg());
+   Inst.addOperand(RegOp);
+ }
+
+ void addRegShiftedRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 3 && "Invalid number of operands!");
+   assert(isRegShiftedReg() &&
+          "addRegShiftedRegOperands() on non-RegShiftedReg!");
+   // Three operands: source register, shift register, and the shift
+   // type/amount combined by ARM_AM::getSORegOpc.
+   const auto ShiftOpc =
+       ARM_AM::getSORegOpc(RegShiftedReg.ShiftTy, RegShiftedReg.ShiftImm);
+   Inst.addOperand(MCOperand::createReg(RegShiftedReg.SrcReg));
+   Inst.addOperand(MCOperand::createReg(RegShiftedReg.ShiftReg));
+   Inst.addOperand(MCOperand::createImm(ShiftOpc));
+ }
+
+ void addRegShiftedImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   assert(isRegShiftedImm() &&
+          "addRegShiftedImmOperands() on non-RegShiftedImm!");
+   Inst.addOperand(MCOperand::createReg(RegShiftedImm.SrcReg));
+   // A shift amount of 32 is passed on as 0 (where that is permitted).
+   unsigned ShiftAmount = RegShiftedImm.ShiftImm;
+   if (ShiftAmount == 32)
+     ShiftAmount = 0;
+   Inst.addOperand(MCOperand::createImm(
+       ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, ShiftAmount)));
+ }
+
+ void addShifterImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // Bit 5 carries the isASR flag; the shift amount is OR-ed in below it.
+   Inst.addOperand(
+       MCOperand::createImm(ShifterImm.Imm | (ShifterImm.isASR << 5)));
+ }
+
+ void addRegListOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // Every register in the list becomes its own register operand, in
+   // list order.
+   const SmallVectorImpl<unsigned> &Regs = getRegList();
+   for (unsigned Reg : Regs)
+     Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ // Handled exactly like a generic register list.
+ void addDPRRegListOperands(MCInst &Inst, unsigned N) const {
+   return addRegListOperands(Inst, N);
+ }
+
+ // Handled exactly like a generic register list.
+ void addSPRRegListOperands(MCInst &Inst, unsigned N) const {
+   return addRegListOperands(Inst, N);
+ }
+
+ void addRotImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The operand holds the rotation shifted right by 3; showing it as
+   // 8, 16 or 24 is the printer's job.
+   const auto Encoded = RotImm.Imm >> 3;
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ void addModImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+
+   if (!isImm()) {
+     // Pack the stored bits with the rotation placed from bit 7 upward.
+     Inst.addOperand(MCOperand::createImm(ModImm.Bits | (ModImm.Rot << 7)));
+     return;
+   }
+
+   // Plain immediates go through the generic path, which supports
+   // fixups (MCFixup).
+   addImmOperands(Inst, N);
+ }
+
+ void addModImmNotOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   // Encode the bitwise complement of the written constant.
+   uint32_t Enc = ARM_AM::getSOImmVal(~CE->getValue());
+   Inst.addOperand(MCOperand::createImm(Enc));
+ }
+
+ void addModImmNegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   // Encode the arithmetic negation of the written constant.
+   uint32_t Enc = ARM_AM::getSOImmVal(-CE->getValue());
+   Inst.addOperand(MCOperand::createImm(Enc));
+ }
+
+ void addBitfieldOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // Turn the lsb/width pair into a 32-bit mask in which the referenced
+   // bits are clear and every other bit is set.
+   const unsigned LowBit = Bitfield.LSB;
+   const unsigned NumBits = Bitfield.Width;
+   uint32_t Field = (uint32_t)0xffffffff >> LowBit;
+   Field = Field << (32 - NumBits);
+   Field = Field >> (32 - (LowBit + NumBits));
+   const uint32_t Mask = ~Field;
+   Inst.addOperand(MCOperand::createImm(Mask));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // addExpr decides between an immediate and an expression operand.
+   const MCExpr *Value = getImm();
+   addExpr(Inst, Value);
+ }
+
+ void addFBits16Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   // The operand stored is 16 minus the written constant.
+   Inst.addOperand(MCOperand::createImm(16 - CE->getValue()));
+ }
+
+ void addFBits32Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   // The operand stored is 32 minus the written constant.
+   Inst.addOperand(MCOperand::createImm(32 - CE->getValue()));
+ }
+
+ void addFPImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   // The constant is wrapped in a 32-bit APInt and converted by
+   // ARM_AM::getFP32Imm; its result becomes the operand.
+   int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
+   Inst.addOperand(MCOperand::createImm(Val));
+ }
+
+ void addImm8s4Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // FIXME: We really want to scale the value here, but the LDRD/STRD
+   // instruction don't encode operands that way yet.
+   //
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ }
+
+ void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate is scaled by four in the encoding and is stored
+   // in the MCInst as such, so divide by four here.
+   //
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
+ }
+
+ void addImm0_508s4NegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate is scaled by four in the encoding and is stored
+   // in the MCInst as such, so divide by four here and negate the result.
+   //
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(-(CE->getValue() / 4)));
+ }
+
+ void addImm0_508s4Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate is scaled by four in the encoding and is stored
+   // in the MCInst as such, so divide by four here.
+   //
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
+ }
+
+ void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The constant encodes as the immediate-1, and we store in the instruction
+   // the bits as encoded, so subtract off one here.
+   //
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(CE->getValue() - 1));
+ }
+
+ void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The constant encodes as the immediate-1, and we store in the instruction
+   // the bits as encoded, so subtract off one here.
+   //
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(CE->getValue() - 1));
+ }
+
+ void addImmThumbSROperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The constant encodes as the immediate, except for 32, which encodes as
+   // zero.
+   //
+   // The expression is dereferenced unconditionally, so use cast<> (which
+   // asserts on a non-constant expression) instead of an unchecked
+   // dyn_cast<> that could yield a null pointer.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   unsigned Imm = CE->getValue();
+   Inst.addOperand(MCOperand::createImm((Imm == 32 ? 0 : Imm)));
+ }
+
+ /// Append the PKH ASR immediate operand to \p Inst.
+ ///
+ /// An ASR value of 32 encodes as 0, so that is how it is added to the
+ /// instruction as well.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addPKHASRImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   int Val = CE->getValue();
+   Inst.addOperand(MCOperand::createImm(Val == 32 ? 0 : Val));
+ }
+
+ /// Append the bitwise complement of the parsed constant to \p Inst.
+ ///
+ /// The operand is actually a t2_so_imm, but the assembly source carries its
+ /// bitwise negation, so it is flipped back here.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addT2SOImmNotOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(~CE->getValue()));
+ }
+
+ /// Append the arithmetic negation of the parsed constant to \p Inst.
+ ///
+ /// The operand is actually a t2_so_imm, but the assembly source carries its
+ /// negation, so it is negated back here.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addT2SOImmNegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(-CE->getValue()));
+ }
+
+ /// Append the arithmetic negation of the parsed constant to \p Inst.
+ ///
+ /// The operand is actually an imm0_4095, but the assembly source carries its
+ /// negation, so it is negated back here.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addImm0_4095NegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(-CE->getValue()));
+ }
+
+ /// Append the unsigned, scaled-by-four offset operand to \p Inst.
+ ///
+ /// A constant immediate is added shifted right by two bits; otherwise the
+ /// immediate must be a symbol reference and is added as an expression.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addUnsignedOffset_b8s2Operands(MCInst &Inst, unsigned N) const {
+   // Exactly one operand is appended on either path below, so check the count
+   // the same way the other single-operand adders do.
+   assert(N == 1 && "Invalid number of operands!");
+   if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
+     Inst.addOperand(MCOperand::createImm(CE->getValue() >> 2));
+     return;
+   }
+
+   const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+   assert(SR && "Unknown value type!");
+   Inst.addOperand(MCOperand::createExpr(SR));
+ }
+
+ /// Append the Thumb PC-relative memory operand to \p Inst.
+ ///
+ /// A memory operand contributes its constant offset. An immediate operand
+ /// contributes either its constant value or, failing that, its symbol
+ /// reference as an expression.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addThumbMemPCOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   if (!isImm()) {
+     assert(isMem() && "Unknown value type!");
+     assert(isa<MCConstantExpr>(Memory.OffsetImm) && "Unknown value type!");
+     Inst.addOperand(MCOperand::createImm(Memory.OffsetImm->getValue()));
+     return;
+   }
+
+   if (const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(getImm())) {
+     Inst.addOperand(MCOperand::createImm(Const->getValue()));
+     return;
+   }
+
+   const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+   assert(SymRef && "Unknown value type!");
+   Inst.addOperand(MCOperand::createExpr(SymRef));
+ }
+
+ /// Append the memory-barrier option, as an unsigned immediate, to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Option = static_cast<unsigned>(getMemBarrierOpt());
+   Inst.addOperand(MCOperand::createImm(Option));
+ }
+
+ /// Append the instruction-sync-barrier option, as an unsigned immediate, to
+ /// \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addInstSyncBarrierOptOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Option = static_cast<unsigned>(getInstSyncBarrierOpt());
+   Inst.addOperand(MCOperand::createImm(Option));
+ }
+
+ /// Append only the base register of the memory operand to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCOperand Base = MCOperand::createReg(Memory.BaseRegNum);
+   Inst.addOperand(Base);
+ }
+
+ /// Append the offset of a PC-relative memory operand to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // Memory.OffsetImm may be absent; the sibling memory adders all treat a
+   // missing offset as zero, so do the same rather than dereference null.
+   int32_t Imm = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+   Inst.addOperand(MCOperand::createImm(Imm));
+ }
+
+ /// Append the ADR label operand to \p Inst.
+ ///
+ /// A constant immediate is added as an immediate value (narrowed to int);
+ /// anything else is treated as a label reference needing a fixup and is
+ /// added as an expression.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   assert(isImm() && "Not an immediate!");
+
+   if (const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(getImm())) {
+     int Val = Const->getValue();
+     Inst.addOperand(MCOperand::createImm(Val));
+     return;
+   }
+
+   // Not a constant: emit the expression itself so a fixup can resolve it.
+   Inst.addOperand(MCOperand::createExpr(getImm()));
+ }
+
+ /// Append the base register followed by the alignment of the memory operand
+ /// to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addAlignedMemoryOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const MCOperand Base = MCOperand::createReg(Memory.BaseRegNum);
+   const MCOperand Align = MCOperand::createImm(Memory.Alignment);
+   Inst.addOperand(Base);
+   Inst.addOperand(Align);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addDupAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addAlignedMemory16Operands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addDupAlignedMemory16Operands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addAlignedMemory32Operands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addDupAlignedMemory32Operands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addAlignedMemory64Operands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addDupAlignedMemory64Operands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addDupAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addAlignedMemoryOperands.
+ void addAlignedMemory64or128or256Operands(MCInst &Inst, unsigned N) const {
+   this->addAlignedMemoryOperands(Inst, N);
+ }
+
+ /// Append base register, offset register and the AM2 opcode word to \p Inst.
+ ///
+ /// With no offset register the opcode word packs the add/sub direction and
+ /// the magnitude of the immediate offset; with one it packs the direction,
+ /// shift amount and shift type.
+ /// \param N number of MCInst operands to add; must be 3.
+ void addAddrMode2Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 3 && "Invalid number of operands!");
+   int32_t Opc;
+   if (Memory.OffsetRegNum) {
+     // Register offset: shift type and negation flag go into the opcode word.
+     const ARM_AM::AddrOpc Dir = Memory.isNegative ? ARM_AM::sub : ARM_AM::add;
+     Opc = ARM_AM::getAM2Opc(Dir, Memory.ShiftImm, Memory.ShiftType);
+   } else {
+     int32_t Offset = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+     const ARM_AM::AddrOpc Dir = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+     // INT32_MIN stands for #-0: direction is sub, magnitude is zero.
+     if (Offset == INT32_MIN)
+       Offset = 0;
+     if (Offset < 0)
+       Offset = -Offset;
+     Opc = ARM_AM::getAM2Opc(Dir, Offset, ARM_AM::no_shift);
+   }
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+   Inst.addOperand(MCOperand::createImm(Opc));
+ }
+
+ /// Append a zero register and the AM2 opcode word for a constant offset to
+ /// \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addAM2OffsetImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+   assert(CE && "non-constant AM2OffsetImm operand!");
+   int32_t Offset = CE->getValue();
+   const ARM_AM::AddrOpc Dir = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+   // INT32_MIN stands for #-0: direction is sub, magnitude is zero.
+   if (Offset == INT32_MIN)
+     Offset = 0;
+   if (Offset < 0)
+     Offset = -Offset;
+   const int32_t Opc = ARM_AM::getAM2Opc(Dir, Offset, ARM_AM::no_shift);
+   Inst.addOperand(MCOperand::createReg(0));
+   Inst.addOperand(MCOperand::createImm(Opc));
+ }
+
+ /// Append base register, offset register and the AM3 opcode word to \p Inst.
+ ///
+ /// An immediate operand is emitted as an expression (a label reference
+ /// needing a fixup) followed by a zero register and a zero immediate.
+ /// \param N number of MCInst operands to add; must be 3.
+ void addAddrMode3Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 3 && "Invalid number of operands!");
+   if (isImm()) {
+     Inst.addOperand(MCOperand::createExpr(getImm()));
+     Inst.addOperand(MCOperand::createReg(0));
+     Inst.addOperand(MCOperand::createImm(0));
+     return;
+   }
+
+   int32_t Opc;
+   if (Memory.OffsetRegNum) {
+     // Register offset: only the add/sub direction is encoded here.
+     const ARM_AM::AddrOpc Dir = Memory.isNegative ? ARM_AM::sub : ARM_AM::add;
+     Opc = ARM_AM::getAM3Opc(Dir, 0);
+   } else {
+     int32_t Offset = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+     const ARM_AM::AddrOpc Dir = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+     // INT32_MIN stands for #-0: direction is sub, magnitude is zero.
+     if (Offset == INT32_MIN)
+       Offset = 0;
+     if (Offset < 0)
+       Offset = -Offset;
+     Opc = ARM_AM::getAM3Opc(Dir, Offset);
+   }
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+   Inst.addOperand(MCOperand::createImm(Opc));
+ }
+
+ /// Append the AM3 post-index offset (register or constant form) to \p Inst.
+ ///
+ /// For a post-index register operand the register and an opcode word that
+ /// carries only the add/sub direction are added. Otherwise the operand must
+ /// be a constant: a zero register and an opcode word carrying the direction
+ /// and magnitude are added.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addAM3OffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   if (Kind == k_PostIndexRegister) {
+     int32_t Val =
+         ARM_AM::getAM3Opc(PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub, 0);
+     Inst.addOperand(MCOperand::createReg(PostIdxReg.RegNum));
+     Inst.addOperand(MCOperand::createImm(Val));
+     return;
+   }
+
+   // Constant offset. cast<> verifies the dynamic type in asserts builds; the
+   // previous static_cast downcast was entirely unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   int32_t Val = CE->getValue();
+   ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+   // Special case for #-0
+   if (Val == INT32_MIN) Val = 0;
+   if (Val < 0) Val = -Val;
+   Val = ARM_AM::getAM3Opc(AddSub, Val);
+   Inst.addOperand(MCOperand::createReg(0));
+   Inst.addOperand(MCOperand::createImm(Val));
+ }
+
+ /// Append the base register and the AM5 opcode word to \p Inst.
+ ///
+ /// An immediate operand is emitted as an expression (a label reference
+ /// needing a fixup) followed by a zero immediate.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addAddrMode5Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   // If we have an immediate that's not a constant, treat it as a label
+   // reference needing a fixup. If it is a constant, it's something else
+   // and we reject it.
+   if (isImm()) {
+     Inst.addOperand(MCOperand::createExpr(getImm()));
+     Inst.addOperand(MCOperand::createImm(0));
+     return;
+   }
+
+   const int64_t Offset = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+   // The lower two bits are always zero and as such are not encoded.
+   int32_t Val = Offset / 4;
+   ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+   // Special case for #-0. The INT32_MIN sentinel has to be recognised on the
+   // unscaled offset: once divided by four it no longer compares equal to
+   // INT32_MIN. The scaled value is still guarded so the negation below
+   // cannot overflow.
+   if (Offset == INT32_MIN || Val == INT32_MIN) Val = 0;
+   if (Val < 0) Val = -Val;
+   Val = ARM_AM::getAM5Opc(AddSub, Val);
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createImm(Val));
+ }
+
+ /// Append the base register and the AM5-FP16 opcode word to \p Inst.
+ ///
+ /// An immediate operand is emitted as an expression (a label reference
+ /// needing a fixup) followed by a zero immediate.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addAddrMode5FP16Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   // If we have an immediate that's not a constant, treat it as a label
+   // reference needing a fixup. If it is a constant, it's something else
+   // and we reject it.
+   if (isImm()) {
+     Inst.addOperand(MCOperand::createExpr(getImm()));
+     Inst.addOperand(MCOperand::createImm(0));
+     return;
+   }
+
+   const int64_t Offset = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+   // The lower bit is always zero and as such is not encoded.
+   int32_t Val = Offset / 2;
+   ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+   // Special case for #-0. The INT32_MIN sentinel has to be recognised on the
+   // unscaled offset: once divided by two it no longer compares equal to
+   // INT32_MIN. The scaled value is still guarded so the negation below
+   // cannot overflow.
+   if (Offset == INT32_MIN || Val == INT32_MIN) Val = 0;
+   if (Val < 0) Val = -Val;
+   Val = ARM_AM::getAM5FP16Opc(AddSub, Val);
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createImm(Val));
+ }
+
+ /// Append base register and (unscaled) immediate offset to \p Inst.
+ ///
+ /// An immediate operand is emitted as an expression (a label reference
+ /// needing a fixup) followed by a zero immediate.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   if (!isImm()) {
+     const int64_t Offset =
+         Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+     Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+     Inst.addOperand(MCOperand::createImm(Offset));
+     return;
+   }
+
+   // Label reference: expression first, then a placeholder zero offset.
+   Inst.addOperand(MCOperand::createExpr(getImm()));
+   Inst.addOperand(MCOperand::createImm(0));
+ }
+
+ /// Append base register and the offset divided by four to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   // The two low bits are always zero, so they are dropped from the operand.
+   int32_t Scaled = 0;
+   if (Memory.OffsetImm)
+     Scaled = Memory.OffsetImm->getValue() / 4;
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createImm(Scaled));
+ }
+
+ /// Append base register and immediate offset (zero when absent) to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   int64_t Offset = 0;
+   if (Memory.OffsetImm)
+     Offset = Memory.OffsetImm->getValue();
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createImm(Offset));
+ }
+
+ /// Forwards unchanged to addMemImm8OffsetOperands.
+ void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+   this->addMemImm8OffsetOperands(Inst, N);
+ }
+
+ /// Forwards unchanged to addMemImm8OffsetOperands.
+ void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+   this->addMemImm8OffsetOperands(Inst, N);
+ }
+
+ /// Append either a label reference plus zero, or base register plus
+ /// immediate offset, to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   if (!isImm()) {
+     // Ordinary memory reg+offset form.
+     const int64_t Offset =
+         Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+     Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+     Inst.addOperand(MCOperand::createImm(Offset));
+     return;
+   }
+
+   // An immediate here is a label reference.
+   addExpr(Inst, getImm());
+   Inst.addOperand(MCOperand::createImm(0));
+ }
+
+ /// Append either a label reference plus zero, or base register plus
+ /// immediate offset, to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   if (!isImm()) {
+     // Ordinary memory reg+offset form.
+     const int64_t Offset =
+         Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+     Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+     Inst.addOperand(MCOperand::createImm(Offset));
+     return;
+   }
+
+   // An immediate here is a label reference.
+   addExpr(Inst, getImm());
+   Inst.addOperand(MCOperand::createImm(0));
+ }
+
+ /// Append the constant-pool immediate expression to \p Inst via addExpr.
+ ///
+ /// This operand is the container for the immediate from which the constant
+ /// pool entry will be created.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addConstPoolAsmImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   addExpr(Inst, getConstantPoolImm());
+ }
+
+ /// Append base register then offset register to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemTBBOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const MCOperand Base = MCOperand::createReg(Memory.BaseRegNum);
+   const MCOperand Index = MCOperand::createReg(Memory.OffsetRegNum);
+   Inst.addOperand(Base);
+   Inst.addOperand(Index);
+ }
+
+ /// Append base register then offset register to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemTBHOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const MCOperand Base = MCOperand::createReg(Memory.BaseRegNum);
+   const MCOperand Index = MCOperand::createReg(Memory.OffsetRegNum);
+   Inst.addOperand(Base);
+   Inst.addOperand(Index);
+ }
+
+ /// Append base register, offset register and the AM2 opcode word (direction,
+ /// shift amount, shift type) to \p Inst.
+ /// \param N number of MCInst operands to add; must be 3.
+ void addMemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 3 && "Invalid number of operands!");
+   const ARM_AM::AddrOpc Dir = Memory.isNegative ? ARM_AM::sub : ARM_AM::add;
+   const unsigned Opc =
+       ARM_AM::getAM2Opc(Dir, Memory.ShiftImm, Memory.ShiftType);
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+   Inst.addOperand(MCOperand::createImm(Opc));
+ }
+
+ /// Append base register, offset register and shift amount to \p Inst.
+ /// \param N number of MCInst operands to add; must be 3.
+ void addT2MemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 3 && "Invalid number of operands!");
+   const MCOperand Base = MCOperand::createReg(Memory.BaseRegNum);
+   const MCOperand Index = MCOperand::createReg(Memory.OffsetRegNum);
+   const MCOperand Shift = MCOperand::createImm(Memory.ShiftImm);
+   Inst.addOperand(Base);
+   Inst.addOperand(Index);
+   Inst.addOperand(Shift);
+ }
+
+ /// Append base register then offset register to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemThumbRROperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const MCOperand Base = MCOperand::createReg(Memory.BaseRegNum);
+   const MCOperand Index = MCOperand::createReg(Memory.OffsetRegNum);
+   Inst.addOperand(Base);
+   Inst.addOperand(Index);
+ }
+
+ /// Append base register and the offset divided by four to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemThumbRIs4Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   int64_t Scaled = 0;
+   if (Memory.OffsetImm)
+     Scaled = Memory.OffsetImm->getValue() / 4;
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createImm(Scaled));
+ }
+
+ /// Append base register and the offset divided by two to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemThumbRIs2Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   int64_t Scaled = 0;
+   if (Memory.OffsetImm)
+     Scaled = Memory.OffsetImm->getValue() / 2;
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createImm(Scaled));
+ }
+
+ /// Append base register and the unscaled offset to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemThumbRIs1Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   int64_t Offset = 0;
+   if (Memory.OffsetImm)
+     Offset = Memory.OffsetImm->getValue();
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createImm(Offset));
+ }
+
+ /// Append base register and the offset divided by four to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addMemThumbSPIOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   int64_t Scaled = 0;
+   if (Memory.OffsetImm)
+     Scaled = Memory.OffsetImm->getValue() / 4;
+   Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+   Inst.addOperand(MCOperand::createImm(Scaled));
+ }
+
+ /// Append the post-index imm8 operand to \p Inst.
+ ///
+ /// The stored value is the magnitude of the constant with bit 8 set when the
+ /// constant is non-negative (add) and clear otherwise.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addPostIdxImm8Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+   assert(CE && "non-constant post-idx-imm8 operand!");
+   int Offset = CE->getValue();
+   const bool IsAdd = Offset >= 0;
+   // INT32_MIN has magnitude zero here, with the add bit left clear.
+   if (Offset == INT32_MIN)
+     Offset = 0;
+   const int Magnitude = Offset < 0 ? -Offset : Offset;
+   const int Encoded = Magnitude | (static_cast<int>(IsAdd) << 8);
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Append the post-index imm8s4 operand to \p Inst.
+ ///
+ /// The stored value is the magnitude of the constant divided by four, with
+ /// bit 8 set when the constant is non-negative (add) and clear otherwise.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addPostIdxImm8s4Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+   assert(CE && "non-constant post-idx-imm8s4 operand!");
+   int Offset = CE->getValue();
+   const bool IsAdd = Offset >= 0;
+   // INT32_MIN has magnitude zero here, with the add bit left clear.
+   if (Offset == INT32_MIN)
+     Offset = 0;
+   // Immediate is scaled by 4.
+   const int Scaled = (Offset < 0 ? -Offset : Offset) / 4;
+   const int Encoded = Scaled | (static_cast<int>(IsAdd) << 8);
+   Inst.addOperand(MCOperand::createImm(Encoded));
+ }
+
+ /// Append the post-index register and its add flag to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addPostIdxRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const MCOperand Reg = MCOperand::createReg(PostIdxReg.RegNum);
+   const MCOperand AddFlag = MCOperand::createImm(PostIdxReg.isAdd);
+   Inst.addOperand(Reg);
+   Inst.addOperand(AddFlag);
+ }
+
+ /// Append the post-index register and a packed sign/shift word to \p Inst.
+ ///
+ /// Sign, shift type and shift amount share one operand, built with the AM2
+ /// encoding helper.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addPostIdxRegShiftedOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   Inst.addOperand(MCOperand::createReg(PostIdxReg.RegNum));
+   const ARM_AM::AddrOpc Dir = PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub;
+   const unsigned Packed =
+       ARM_AM::getAM2Opc(Dir, PostIdxReg.ShiftImm, PostIdxReg.ShiftTy);
+   Inst.addOperand(MCOperand::createImm(Packed));
+ }
+
+ /// Append the MSR mask, as an unsigned immediate, to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Mask = static_cast<unsigned>(getMSRMask());
+   Inst.addOperand(MCOperand::createImm(Mask));
+ }
+
+ /// Append the banked-register value, as an unsigned immediate, to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addBankedRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Banked = static_cast<unsigned>(getBankedReg());
+   Inst.addOperand(MCOperand::createImm(Banked));
+ }
+
+ /// Append the processor interrupt flags, as an unsigned immediate, to
+ /// \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addProcIFlagsOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Flags = static_cast<unsigned>(getProcIFlags());
+   Inst.addOperand(MCOperand::createImm(Flags));
+ }
+
+ /// Append the vector list's register to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addVecListOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCOperand ListReg = MCOperand::createReg(VectorList.RegNum);
+   Inst.addOperand(ListReg);
+ }
+
+ /// Append the vector list's register and lane index to \p Inst.
+ /// \param N number of MCInst operands to add; must be 2.
+ void addVecListIndexedOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const MCOperand ListReg = MCOperand::createReg(VectorList.RegNum);
+   const MCOperand Lane = MCOperand::createImm(VectorList.LaneIndex);
+   Inst.addOperand(ListReg);
+   Inst.addOperand(Lane);
+ }
+
+ /// Append the vector index as an immediate to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addVectorIndex8Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCOperand Index = MCOperand::createImm(getVectorIndex());
+   Inst.addOperand(Index);
+ }
+
+ /// Append the vector index as an immediate to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addVectorIndex16Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCOperand Index = MCOperand::createImm(getVectorIndex());
+   Inst.addOperand(Index);
+ }
+
+ /// Append the vector index as an immediate to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addVectorIndex32Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCOperand Index = MCOperand::createImm(getVectorIndex());
+   Inst.addOperand(Index);
+ }
+
+ /// Append the NEON i8 splat immediate to \p Inst.
+ ///
+ /// The immediate encodes the type of constant as well as the value; 0xe00 is
+ /// OR-ed in to mark this as an i8 splat.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONi8splatOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   Inst.addOperand(MCOperand::createImm(CE->getValue() | 0xe00));
+ }
+
+ /// Append the NEON i16 splat immediate, encoded by
+ /// ARM_AM::encodeNEONi16splat, to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONi16splatOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate encodes the type of constant as well as the value.
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   unsigned Value = CE->getValue();
+   Value = ARM_AM::encodeNEONi16splat(Value);
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Append the NEON i16 splat encoding of the complemented constant (masked
+ /// to 16 bits) to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONi16splatNotOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate encodes the type of constant as well as the value.
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   unsigned Value = CE->getValue();
+   Value = ARM_AM::encodeNEONi16splat(~Value & 0xffff);
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Append the NEON i32 splat immediate, encoded by
+ /// ARM_AM::encodeNEONi32splat, to \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONi32splatOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate encodes the type of constant as well as the value.
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   unsigned Value = CE->getValue();
+   Value = ARM_AM::encodeNEONi32splat(Value);
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Append the NEON i32 splat encoding of the complemented constant to
+ /// \p Inst.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONi32splatNotOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate encodes the type of constant as well as the value.
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   unsigned Value = CE->getValue();
+   Value = ARM_AM::encodeNEONi32splat(~Value);
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Append the complemented low byte of the constant, tagged with cmode
+ /// 0b1110, to \p Inst.
+ ///
+ /// Asserts that \p Inst already has opcode VMOVv8i8 or VMOVv16i8.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONinvByteReplicateOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate encodes the type of constant as well as the value.
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   unsigned Value = CE->getValue();
+   assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+           Inst.getOpcode() == ARM::VMOVv16i8) &&
+          "All vmvn instructions that wants to replicate non-zero byte "
+          "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+   unsigned B = ((~Value) & 0xff);
+   B |= 0xe00; // cmode = 0b1110
+   Inst.addOperand(MCOperand::createImm(B));
+ }
+ /// Append the NEON i32 VMOV immediate to \p Inst.
+ ///
+ /// Values below 256 pass through unchanged. Larger values are reduced to
+ /// their top significant byte combined with a tag selected by the value's
+ /// magnitude and, for the two middle ranges, by whether the low byte is
+ /// non-zero.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate encodes the type of constant as well as the value.
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   unsigned Value = CE->getValue();
+   if (Value >= 256 && Value <= 0xffff)
+     Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+   else if (Value > 0xffff && Value <= 0xffffff)
+     Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+   else if (Value > 0xffffff)
+     Value = (Value >> 24) | 0x600;
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Append the low byte of the constant, tagged with cmode 0b1110, to
+ /// \p Inst.
+ ///
+ /// Asserts that \p Inst already has opcode VMOVv8i8 or VMOVv16i8.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONvmovByteReplicateOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate encodes the type of constant as well as the value.
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   unsigned Value = CE->getValue();
+   assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+           Inst.getOpcode() == ARM::VMOVv16i8) &&
+          "All instructions that wants to replicate non-zero byte "
+          "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+   unsigned B = Value & 0xff;
+   B |= 0xe00; // cmode = 0b1110
+   Inst.addOperand(MCOperand::createImm(B));
+ }
+ /// Append the NEON i32 VMOV immediate for the complemented constant to
+ /// \p Inst.
+ ///
+ /// The constant is bitwise-complemented first. Values below 256 then pass
+ /// through unchanged; larger values are reduced to their top significant
+ /// byte combined with a tag selected by the value's magnitude and, for the
+ /// two middle ranges, by whether the low byte is non-zero.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate encodes the type of constant as well as the value.
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   unsigned Value = ~CE->getValue();
+   if (Value >= 256 && Value <= 0xffff)
+     Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+   else if (Value > 0xffff && Value <= 0xffffff)
+     Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+   else if (Value > 0xffffff)
+     Value = (Value >> 24) | 0x600;
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Append the NEON i64 splat immediate to \p Inst.
+ ///
+ /// Bit i of the result is the lowest bit of byte i of the 64-bit constant;
+ /// 0x1e00 is OR-ed in on top of those eight bits.
+ /// \param N number of MCInst operands to add; must be 1.
+ void addNEONi64splatOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   // The immediate encodes the type of constant as well as the value.
+   // cast<> asserts on a non-constant expression; dyn_cast<> would hand back
+   // a null pointer that is then dereferenced unchecked.
+   const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+   uint64_t Value = CE->getValue();
+   unsigned Imm = 0;
+   for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
+     Imm |= (Value & 1) << i;
+   }
+   Inst.addOperand(MCOperand::createImm(Imm | 0x1e00));
+ }
+
+ void print(raw_ostream &OS) const override;
+
+ /// Build a k_ITCondMask operand holding \p Mask; start and end locations
+ /// are both \p S.
+ static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_ITCondMask);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->ITMask.Mask = Mask;
+   return Result;
+ }
+
+ /// Build a k_CondCode operand holding \p CC; start and end locations are
+ /// both \p S.
+ static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC,
+                                                   SMLoc S) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_CondCode);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->CC.Val = CC;
+   return Result;
+ }
+
+ /// Build a k_CoprocNum operand holding \p CopVal; start and end locations
+ /// are both \p S.
+ static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_CoprocNum);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->Cop.Val = CopVal;
+   return Result;
+ }
+
+ /// Build a k_CoprocReg operand holding \p CopVal; start and end locations
+ /// are both \p S.
+ static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_CoprocReg);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->Cop.Val = CopVal;
+   return Result;
+ }
+
+ /// Build a k_CoprocOption operand holding \p Val, spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S,
+                                                       SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_CoprocOption);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->Cop.Val = Val;
+   return Result;
+ }
+
+ /// Build a k_CCOut operand for register \p RegNum; start and end locations
+ /// are both \p S.
+ static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_CCOut);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->Reg.RegNum = RegNum;
+   return Result;
+ }
+
+ /// Build a k_Token operand that records the data pointer and length of
+ /// \p Str (the characters are not copied); start and end locations are both
+ /// \p S.
+ static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_Token);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->Tok.Data = Str.data();
+   Result->Tok.Length = Str.size();
+   return Result;
+ }
+
+ /// Build a k_Register operand for \p RegNum, spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S,
+                                              SMLoc E) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_Register);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->Reg.RegNum = RegNum;
+   return Result;
+ }
+
+ /// Build a k_ShiftedRegister operand from the shift type, source register,
+ /// shift register and shift immediate, spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand>
+ CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
+                       unsigned ShiftReg, unsigned ShiftImm, SMLoc S,
+                       SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_ShiftedRegister);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->RegShiftedReg.SrcReg = SrcReg;
+   Result->RegShiftedReg.ShiftReg = ShiftReg;
+   Result->RegShiftedReg.ShiftTy = ShTy;
+   Result->RegShiftedReg.ShiftImm = ShiftImm;
+   return Result;
+ }
+
+ /// Build a k_ShiftedImmediate operand from the shift type, source register
+ /// and shift immediate, spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand>
+ CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
+                        unsigned ShiftImm, SMLoc S, SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_ShiftedImmediate);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->RegShiftedImm.SrcReg = SrcReg;
+   Result->RegShiftedImm.ShiftTy = ShTy;
+   Result->RegShiftedImm.ShiftImm = ShiftImm;
+   return Result;
+ }
+
+ /// Build a k_ShifterImmediate operand from the ASR flag and immediate,
+ /// spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm,
+                                                     SMLoc S, SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_ShifterImmediate);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->ShifterImm.Imm = Imm;
+   Result->ShifterImm.isASR = isASR;
+   return Result;
+ }
+
+ /// Build a k_RotateImmediate operand holding \p Imm, spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S,
+                                                 SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_RotateImmediate);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->RotImm.Imm = Imm;
+   return Result;
+ }
+
+ /// Build a k_ModifiedImmediate operand from \p Bits and \p Rot, spanning
+ /// \p S to \p E.
+ static std::unique_ptr<ARMOperand> CreateModImm(unsigned Bits, unsigned Rot,
+                                                 SMLoc S, SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_ModifiedImmediate);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->ModImm.Rot = Rot;
+   Result->ModImm.Bits = Bits;
+   return Result;
+ }
+
+ /// Build a k_ConstantPoolImmediate operand wrapping expression \p Val,
+ /// spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand>
+ CreateConstantPoolImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_ConstantPoolImmediate);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->Imm.Val = Val;
+   return Result;
+ }
+
+ /// Build a k_BitfieldDescriptor operand from \p LSB and \p Width, spanning
+ /// \p S to \p E.
+ static std::unique_ptr<ARMOperand>
+ CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_BitfieldDescriptor);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->Bitfield.Width = Width;
+   Result->Bitfield.LSB = LSB;
+   return Result;
+ }
+
+ /// Build a register-list operand from \p Regs.
+ ///
+ /// The operand kind is chosen from the register in the first entry (as given
+ /// by the caller, before sorting): DPR list, SPR list, or the generic
+ /// register list. \p Regs is then sorted in place and the second member of
+ /// each pair is appended to the operand's register vector.
+ static std::unique_ptr<ARMOperand>
+ CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
+               SMLoc StartLoc, SMLoc EndLoc) {
+   assert (Regs.size() > 0 && "RegList contains no registers?");
+
+   // Classify on the first register before the sort below reorders entries.
+   const unsigned FirstReg = Regs.front().second;
+   KindTy ListKind = k_RegisterList;
+   if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(FirstReg)) {
+     ListKind = k_DPRRegisterList;
+   } else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(FirstReg)) {
+     ListKind = k_SPRRegisterList;
+   }
+
+   // Sort based on the register encoding values.
+   array_pod_sort(Regs.begin(), Regs.end());
+
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(ListKind);
+   for (const std::pair<unsigned, unsigned> &Entry : Regs)
+     Result->Registers.push_back(Entry.second);
+   Result->StartLoc = StartLoc;
+   Result->EndLoc = EndLoc;
+   return Result;
+ }
+
+ /// Build a k_VectorList operand from the first register, register count and
+ /// double-spacing flag, spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand> CreateVectorList(unsigned RegNum,
+                                                     unsigned Count,
+                                                     bool isDoubleSpaced,
+                                                     SMLoc S, SMLoc E) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_VectorList);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->VectorList.RegNum = RegNum;
+   Result->VectorList.Count = Count;
+   Result->VectorList.isDoubleSpaced = isDoubleSpaced;
+   return Result;
+ }
+
+ /// Build a k_VectorListAllLanes operand from the first register, register
+ /// count and double-spacing flag, spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand>
+ CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced,
+                          SMLoc S, SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_VectorListAllLanes);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->VectorList.RegNum = RegNum;
+   Result->VectorList.Count = Count;
+   Result->VectorList.isDoubleSpaced = isDoubleSpaced;
+   return Result;
+ }
+
+ /// Build a k_VectorListIndexed operand from the first register, register
+ /// count, lane index and double-spacing flag, spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand>
+ CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index,
+                         bool isDoubleSpaced, SMLoc S, SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_VectorListIndexed);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->VectorList.RegNum = RegNum;
+   Result->VectorList.Count = Count;
+   Result->VectorList.isDoubleSpaced = isDoubleSpaced;
+   Result->VectorList.LaneIndex = Index;
+   return Result;
+ }
+
+ /// Build a k_VectorIndex operand holding \p Idx, spanning \p S to \p E.
+ /// \p Ctx is accepted but not used by this function.
+ static std::unique_ptr<ARMOperand>
+ CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_VectorIndex);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->VectorIndex.Val = Idx;
+   return Result;
+ }
+
+ /// Build a k_Immediate operand wrapping expression \p Val, spanning \p S to
+ /// \p E.
+ static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                              SMLoc E) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_Immediate);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->Imm.Val = Val;
+   return Result;
+ }
+
+ /// Build a k_Memory operand.
+ ///
+ /// Every argument is stored verbatim into the matching Memory field; the
+ /// source range is \p S to \p E and \p AlignmentLoc (default-constructed
+ /// when omitted) is recorded separately.
+ static std::unique_ptr<ARMOperand>
+ CreateMem(unsigned BaseRegNum, const MCConstantExpr *OffsetImm,
+           unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType,
+           unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S,
+           SMLoc E, SMLoc AlignmentLoc = SMLoc()) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_Memory);
+   // Source locations.
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->AlignmentLoc = AlignmentLoc;
+   // Base and offset.
+   Result->Memory.BaseRegNum = BaseRegNum;
+   Result->Memory.OffsetRegNum = OffsetRegNum;
+   Result->Memory.OffsetImm = OffsetImm;
+   Result->Memory.isNegative = isNegative;
+   // Shift and alignment.
+   Result->Memory.ShiftType = ShiftType;
+   Result->Memory.ShiftImm = ShiftImm;
+   Result->Memory.Alignment = Alignment;
+   return Result;
+ }
+
+ /// Build a k_PostIndexRegister operand from the register, add flag, shift
+ /// type and shift immediate, spanning \p S to \p E.
+ static std::unique_ptr<ARMOperand>
+ CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy,
+                  unsigned ShiftImm, SMLoc S, SMLoc E) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_PostIndexRegister);
+   Result->StartLoc = S;
+   Result->EndLoc = E;
+   Result->PostIdxReg.RegNum = RegNum;
+   Result->PostIdxReg.isAdd = isAdd;
+   Result->PostIdxReg.ShiftImm = ShiftImm;
+   Result->PostIdxReg.ShiftTy = ShiftTy;
+   return Result;
+ }
+
+ /// Build a k_MemBarrierOpt operand holding \p Opt; start and end locations
+ /// are both \p S.
+ static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt,
+                                                        SMLoc S) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_MemBarrierOpt);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->MBOpt.Val = Opt;
+   return Result;
+ }
+
+ /// Build a k_InstSyncBarrierOpt operand holding \p Opt; start and end
+ /// locations are both \p S.
+ static std::unique_ptr<ARMOperand>
+ CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) {
+   std::unique_ptr<ARMOperand> Result =
+       make_unique<ARMOperand>(k_InstSyncBarrierOpt);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->ISBOpt.Val = Opt;
+   return Result;
+ }
+
+ /// Build a k_ProcIFlags operand holding \p IFlags; start and end locations
+ /// are both \p S.
+ static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags,
+                                                     SMLoc S) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_ProcIFlags);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->IFlags.Val = IFlags;
+   return Result;
+ }
+
+ /// Build a k_MSRMask operand holding \p MMask; start and end locations are
+ /// both \p S.
+ static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) {
+   std::unique_ptr<ARMOperand> Result = make_unique<ARMOperand>(k_MSRMask);
+   Result->StartLoc = S;
+   Result->EndLoc = S;
+   Result->MMask.Val = MMask;
+   return Result;
+ }
+
+ /// Build a banked register operand holding \p Reg. Start and end location
+ /// are both set to \p S.
+ static std::unique_ptr<ARMOperand> CreateBankedReg(unsigned Reg, SMLoc S) {
+   std::unique_ptr<ARMOperand> BankedOp = make_unique<ARMOperand>(k_BankedReg);
+   BankedOp->StartLoc = S;
+   BankedOp->EndLoc = S;
+   BankedOp->BankedReg.Val = Reg;
+   return BankedOp;
+ }
+};
+
+} // end anonymous namespace.
+
+/// Write a short human-readable description of this operand to \p OS.
+/// Every operand kind other than tokens and plain immediates is rendered as
+/// an angle-bracketed tag.
+void ARMOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_CondCode:
+    OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
+    break;
+  case k_CCOut:
+    OS << "<ccout " << getReg() << ">";
+    break;
+  case k_ITCondMask: {
+    static const char *const MaskStr[] = {
+      "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)",
+      "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)"
+    };
+    assert((ITMask.Mask & 0xf) == ITMask.Mask);
+    // The table has 15 entries but a 4-bit mask can be 15, so guard the
+    // lookup instead of reading one element past the end of the array.
+    const unsigned MaskIdx = ITMask.Mask;
+    OS << "<it-mask ";
+    if (MaskIdx < sizeof(MaskStr) / sizeof(MaskStr[0]))
+      OS << MaskStr[MaskIdx];
+    else
+      OS << "(unknown)";
+    OS << ">";
+    break;
+  }
+  case k_CoprocNum:
+    OS << "<coprocessor number: " << getCoproc() << ">";
+    break;
+  case k_CoprocReg:
+    OS << "<coprocessor register: " << getCoproc() << ">";
+    break;
+  case k_CoprocOption:
+    OS << "<coprocessor option: " << CoprocOption.Val << ">";
+    break;
+  case k_MSRMask:
+    OS << "<mask: " << getMSRMask() << ">";
+    break;
+  case k_BankedReg:
+    OS << "<banked reg: " << getBankedReg() << ">";
+    break;
+  case k_Immediate:
+    OS << *getImm();
+    break;
+  case k_MemBarrierOpt:
+    OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt(), false) << ">";
+    break;
+  case k_InstSyncBarrierOpt:
+    OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">";
+    break;
+  case k_Memory:
+    OS << "<memory "
+       << " base:" << Memory.BaseRegNum;
+    OS << ">";
+    break;
+  case k_PostIndexRegister:
+    // Open the tag so that it balances the closing '>' below.
+    OS << "<post-idx register " << (PostIdxReg.isAdd ? "" : "-")
+       << PostIdxReg.RegNum;
+    if (PostIdxReg.ShiftTy != ARM_AM::no_shift)
+      OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " "
+         << PostIdxReg.ShiftImm;
+    OS << ">";
+    break;
+  case k_ProcIFlags: {
+    OS << "<ARM_PROC::";
+    unsigned IFlags = getProcIFlags();
+    // Print the name of every set flag among bits 2..0, highest bit first.
+    for (int i=2; i >= 0; --i)
+      if (IFlags & (1 << i))
+        OS << ARM_PROC::IFlagsToString(1 << i);
+    OS << ">";
+    break;
+  }
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_ShifterImmediate:
+    OS << "<shift " << (ShifterImm.isASR ? "asr" : "lsl")
+       << " #" << ShifterImm.Imm << ">";
+    break;
+  case k_ShiftedRegister:
+    OS << "<so_reg_reg "
+       << RegShiftedReg.SrcReg << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy)
+       << " " << RegShiftedReg.ShiftReg << ">";
+    break;
+  case k_ShiftedImmediate:
+    OS << "<so_reg_imm "
+       << RegShiftedImm.SrcReg << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy)
+       << " #" << RegShiftedImm.ShiftImm << ">";
+    break;
+  case k_RotateImmediate:
+    OS << "<ror " << " #" << (RotImm.Imm * 8) << ">";
+    break;
+  case k_ModifiedImmediate:
+    // No stray ')' before the closing '>'.
+    OS << "<mod_imm #" << ModImm.Bits << ", #"
+       << ModImm.Rot << ">";
+    break;
+  case k_ConstantPoolImmediate:
+    // Close the tag that is opened here.
+    OS << "<constant_pool_imm #" << *getConstantPoolImm() << ">";
+    break;
+  case k_BitfieldDescriptor:
+    OS << "<bitfield " << "lsb: " << Bitfield.LSB
+       << ", width: " << Bitfield.Width << ">";
+    break;
+  case k_RegisterList:
+  case k_DPRRegisterList:
+  case k_SPRRegisterList: {
+    OS << "<register_list ";
+
+    // Comma-separate the entries, without a trailing separator.
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ) {
+      OS << *I;
+      if (++I < E) OS << ", ";
+    }
+
+    OS << ">";
+    break;
+  }
+  case k_VectorList:
+    OS << "<vector_list " << VectorList.Count << " * "
+       << VectorList.RegNum << ">";
+    break;
+  case k_VectorListAllLanes:
+    OS << "<vector_list(all lanes) " << VectorList.Count << " * "
+       << VectorList.RegNum << ">";
+    break;
+  case k_VectorListIndexed:
+    OS << "<vector_list(lane " << VectorList.LaneIndex << ") "
+       << VectorList.Count << " * " << VectorList.RegNum << ">";
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+// Forward declaration only; tryParseRegister() treats a result of 0 as "no match". NOTE(review): definition presumably comes from the generated matcher - confirm.
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+/// Parse a register at the current token. \p StartLoc and \p EndLoc receive
+/// the range of the token that was current on entry, \p RegNo receives the
+/// result of tryParseRegister(). Returns true when no register was found.
+bool ARMAsmParser::ParseRegister(unsigned &RegNo,
+                                 SMLoc &StartLoc, SMLoc &EndLoc) {
+  const AsmToken &CurTok = getParser().getTok();
+  StartLoc = CurTok.getLoc();
+  EndLoc = CurTok.getEndLoc();
+
+  // tryParseRegister() reports failure as -1, which shows up here as the
+  // all-ones unsigned value.
+  RegNo = tryParseRegister();
+  const bool NoRegister = (RegNo == (unsigned)-1);
+  return NoRegister;
+}
+
+/// Try to parse a register name at the current token.
+///
+/// The lookup is case insensitive and tries, in order: the generated register
+/// name matcher, a fixed set of alias names, and finally the aliases recorded
+/// in RegisterReqs (.req). On a match the identifier token is consumed and
+/// the register number is returned; otherwise nothing is consumed and -1 is
+/// returned.
+int ARMAsmParser::tryParseRegister() {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return -1;
+
+  // Canonicalize to lower case. Register names are case insensitive, and
+  // that is also how .req entries were passed in from MC/MCParser/AsmParser.
+  const std::string Name = Tok.getString().lower();
+
+  unsigned RegNum = MatchRegisterName(Name);
+  if (RegNum == 0)
+    RegNum = StringSwitch<unsigned>(Name)
+      .Case("r13", ARM::SP)
+      .Case("r14", ARM::LR)
+      .Case("r15", ARM::PC)
+      .Case("ip", ARM::R12)
+      // Additional register name aliases for 'gas' compatibility.
+      .Case("a1", ARM::R0)
+      .Case("a2", ARM::R1)
+      .Case("a3", ARM::R2)
+      .Case("a4", ARM::R3)
+      .Case("v1", ARM::R4)
+      .Case("v2", ARM::R5)
+      .Case("v3", ARM::R6)
+      .Case("v4", ARM::R7)
+      .Case("v5", ARM::R8)
+      .Case("v6", ARM::R9)
+      .Case("v7", ARM::R10)
+      .Case("v8", ARM::R11)
+      .Case("sb", ARM::R9)
+      .Case("sl", ARM::R10)
+      .Case("fp", ARM::R11)
+      .Default(0);
+
+  if (RegNum != 0) {
+    // Some FPUs only have 16 D registers, so D16-D31 are invalid.
+    const bool IsUpperDReg = RegNum >= ARM::D16 && RegNum <= ARM::D31;
+    if (hasD16() && IsUpperDReg)
+      return -1;
+
+    Parser.Lex(); // Eat identifier token.
+    return RegNum;
+  }
+
+  // Neither a built-in name nor a fixed alias: fall back to .req aliases.
+  StringMap<unsigned>::const_iterator Alias = RegisterReqs.find(Name);
+  if (Alias == RegisterReqs.end())
+    return -1;
+
+  Parser.Lex(); // Eat identifier token.
+  return Alias->getValue();
+}
+
+// Try to parse a shifter (e.g., "lsl <amt>"). On success, return 0.
+// If a recoverable error occurs, return 1. If an irrecoverable error
+// occurs, return -1. An irrecoverable error is one where tokens have been
+// consumed in the process of trying to parse the shifter (i.e., when it is
+// indeed a shifter operand, but malformed).
+//
+// On success the operand at the back of Operands (the shift's source
+// register) is replaced by a single shifted-register or shifted-immediate
+// operand.
+int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return -1;
+
+  std::string lowerCase = Tok.getString().lower();
+  ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase)
+      .Case("asl", ARM_AM::lsl)
+      .Case("lsl", ARM_AM::lsl)
+      .Case("lsr", ARM_AM::lsr)
+      .Case("asr", ARM_AM::asr)
+      .Case("ror", ARM_AM::ror)
+      .Case("rrx", ARM_AM::rrx)
+      .Default(ARM_AM::no_shift);
+
+  // Not a shift operator: nothing has been consumed, so this is recoverable.
+  if (ShiftTy == ARM_AM::no_shift)
+    return 1;
+
+  Parser.Lex(); // Eat the operator.
+
+  // The source register for the shift has already been added to the
+  // operand list, so we need to pop it off and combine it into the shifted
+  // register operand instead.
+  std::unique_ptr<ARMOperand> PrevOp(
+      (ARMOperand *)Operands.pop_back_val().release());
+  if (!PrevOp->isReg()) {
+    // The operator token is already consumed and the previous operand has
+    // been removed, so this must be reported as irrecoverable (-1). Returning
+    // the result of Error() directly would yield 1, i.e. "recoverable".
+    Error(PrevOp->getStartLoc(), "shift must be of a register");
+    return -1;
+  }
+  int SrcReg = PrevOp->getReg();
+
+  SMLoc EndLoc;
+  int64_t Imm = 0;
+  int ShiftReg = 0;
+  if (ShiftTy == ARM_AM::rrx) {
+    // RRX Doesn't have an explicit shift amount. The encoder expects
+    // the shift register to be the same as the source register. Seems odd,
+    // but OK.
+    ShiftReg = SrcReg;
+  } else {
+    // Figure out if this is shifted by a constant or a register (for non-RRX).
+    if (Parser.getTok().is(AsmToken::Hash) ||
+        Parser.getTok().is(AsmToken::Dollar)) {
+      Parser.Lex(); // Eat the '#' or '$'.
+      SMLoc ImmLoc = Parser.getTok().getLoc();
+      const MCExpr *ShiftExpr = nullptr;
+      if (getParser().parseExpression(ShiftExpr, EndLoc)) {
+        Error(ImmLoc, "invalid immediate shift value");
+        return -1;
+      }
+      // The expression must be evaluatable as an immediate.
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftExpr);
+      if (!CE) {
+        Error(ImmLoc, "invalid immediate shift value");
+        return -1;
+      }
+      // Range check the immediate.
+      // lsl, ror: 0 <= imm <= 31
+      // lsr, asr: 0 <= imm <= 32
+      Imm = CE->getValue();
+      if (Imm < 0 ||
+          ((ShiftTy == ARM_AM::lsl || ShiftTy == ARM_AM::ror) && Imm > 31) ||
+          ((ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr) && Imm > 32)) {
+        Error(ImmLoc, "immediate shift value out of range");
+        return -1;
+      }
+      // shift by zero is a nop. Always send it through as lsl.
+      // ('as' compatibility)
+      if (Imm == 0)
+        ShiftTy = ARM_AM::lsl;
+    } else if (Parser.getTok().is(AsmToken::Identifier)) {
+      SMLoc L = Parser.getTok().getLoc();
+      EndLoc = Parser.getTok().getEndLoc();
+      ShiftReg = tryParseRegister();
+      if (ShiftReg == -1) {
+        Error(L, "expected immediate or register in shift operand");
+        return -1;
+      }
+    } else {
+      Error(Parser.getTok().getLoc(),
+            "expected immediate or register in shift operand");
+      return -1;
+    }
+  }
+
+  // A register shift amount yields a shifted-register operand. An immediate
+  // amount, and RRX (which has no amount), yield a shifted-immediate operand.
+  if (ShiftReg && ShiftTy != ARM_AM::rrx)
+    Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg,
+                                                         ShiftReg, Imm,
+                                                         S, EndLoc));
+  else
+    Operands.push_back(ARMOperand::CreateShiftedImmediate(ShiftTy, SrcReg, Imm,
+                                                          S, EndLoc));
+
+  return 0;
+}
+
+
+/// Try to parse a register name. The token must be an Identifier when called.
+/// If it's a register, an AsmOperand is created. Another AsmOperand is created
+/// if there is a "writeback" ('!') or a bracketed vector index after it.
+/// Returns 'true' if it's not a register or if the vector index is malformed,
+/// 'false' otherwise.
+///
+/// TODO this is likely to change to allow different register types and or to
+/// parse for a specific register type.
+bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  // Capture the register token's range by value *before* parsing. getTok()
+  // returns a reference to the parser's current token, which no longer
+  // describes the register once tryParseRegister() has eaten the identifier.
+  const SMLoc RegStartLoc = Parser.getTok().getLoc();
+  const SMLoc RegEndLoc = Parser.getTok().getEndLoc();
+  int RegNo = tryParseRegister();
+  if (RegNo == -1)
+    return true;
+
+  Operands.push_back(ARMOperand::CreateReg(RegNo, RegStartLoc, RegEndLoc));
+
+  const AsmToken &ExclaimTok = Parser.getTok();
+  if (ExclaimTok.is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(),
+                                               ExclaimTok.getLoc()));
+    Parser.Lex(); // Eat exclaim token
+    return false;
+  }
+
+  // Also check for an index operand. This is only legal for vector registers,
+  // but that'll get caught OK in operand matching, so we don't need to
+  // explicitly filter everything else out here.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return true;
+    // Only a constant expression is accepted as the index.
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE)
+      return TokError("immediate value expected for vector index");
+
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+
+    SMLoc E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(),
+                                                     SIdx, E,
+                                                     getContext()));
+  }
+
+  return false;
+}
+
+/// MatchCoprocessorOperandName - Try to parse an coprocessor related
+/// instruction with a symbolic operand name.
+/// We accept "crN" syntax for GAS compatibility.
+/// <operand-name> ::= <prefix><number>
+/// If CoprocOp is 'c', then:
+/// <prefix> ::= c | cr
+/// If CoprocOp is 'p', then :
+/// <prefix> ::= p
+/// <number> ::= integer in range [0, 15]
+///
+/// Returns the number, or -1 when \p Name does not have this form.
+/// NOTE(review): an 'r' after the first character is skipped for either
+/// value of \p CoprocOp, so "prN" is matched as well; confirm this is wanted.
+static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
+  if (Name.size() < 2 || Name[0] != CoprocOp)
+    return -1;
+
+  // Strip the prefix: the leading character plus an optional 'r'.
+  const StringRef Digits = Name.drop_front(Name[1] == 'r' ? 2 : 1);
+
+  // One digit: 0-9.
+  if (Digits.size() == 1) {
+    const char D = Digits[0];
+    if (D < '0' || D > '9')
+      return -1;
+    return D - '0';
+  }
+
+  // Two digits: 10-15 only.
+  // CP10 and CP11 are VFP/NEON and so vector instructions should be used.
+  // However, old cores (v5/v6) did use them in that way.
+  if (Digits.size() == 2 && Digits[0] == '1') {
+    const char D = Digits[1];
+    if (D < '0' || D > '5')
+      return -1;
+    return 10 + (D - '0');
+  }
+
+  return -1;
+}
+
+/// parseITCondCode - Try to parse a condition code for an IT instruction.
+/// The identifier is matched case insensitively. On a match the token is
+/// consumed and a condition code operand is appended to \p Operands;
+/// otherwise nothing is consumed and MatchOperand_NoMatch is returned.
+OperandMatchResultTy
+ARMAsmParser::parseITCondCode(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  const SMLoc CondLoc = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  // Sentinel for "not a condition code name".
+  const unsigned NoCondCode = ~0U;
+  const unsigned CondCode = StringSwitch<unsigned>(Tok.getString().lower())
+    .Case("eq", ARMCC::EQ)
+    .Case("ne", ARMCC::NE)
+    .Case("hs", ARMCC::HS)
+    .Case("cs", ARMCC::HS)
+    .Case("lo", ARMCC::LO)
+    .Case("cc", ARMCC::LO)
+    .Case("mi", ARMCC::MI)
+    .Case("pl", ARMCC::PL)
+    .Case("vs", ARMCC::VS)
+    .Case("vc", ARMCC::VC)
+    .Case("hi", ARMCC::HI)
+    .Case("ls", ARMCC::LS)
+    .Case("ge", ARMCC::GE)
+    .Case("lt", ARMCC::LT)
+    .Case("gt", ARMCC::GT)
+    .Case("le", ARMCC::LE)
+    .Case("al", ARMCC::AL)
+    .Default(NoCondCode);
+  if (CondCode == NoCondCode)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat the token.
+  Operands.push_back(
+      ARMOperand::CreateCondCode(ARMCC::CondCodes(CondCode), CondLoc));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocNumOperand - Try to parse an coprocessor number operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+/// In every other case nothing is consumed and MatchOperand_NoMatch is
+/// returned.
+OperandMatchResultTy
+ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  const SMLoc NumLoc = Tok.getLoc();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  const int CoprocNum = MatchCoprocessorOperandName(Tok.getString(), 'p');
+  if (CoprocNum == -1)
+    return MatchOperand_NoMatch;
+
+  // ARMv7 and v8 don't allow cp10/cp11 due to VFP/NEON specific instructions
+  if (hasV7Ops() || hasV8Ops()) {
+    if (CoprocNum == 10 || CoprocNum == 11)
+      return MatchOperand_NoMatch;
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocNum(CoprocNum, NumLoc));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocRegOperand - Try to parse an coprocessor register operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// register name, the token is eaten and the operand is added to the operand
+/// list. Otherwise nothing is consumed and MatchOperand_NoMatch is returned.
+OperandMatchResultTy
+ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  const SMLoc RegLoc = Tok.getLoc();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  const int CoprocReg = MatchCoprocessorOperandName(Tok.getString(), 'c');
+  if (CoprocReg == -1)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocReg(CoprocReg, RegLoc));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocOptionOperand - Try to parse an coprocessor option operand.
+/// coproc_option : '{' imm0_255 '}'
+///
+/// Returns MatchOperand_NoMatch (consuming nothing) when the current token is
+/// not '{'. Once the '{' has been eaten every failure is reported with a
+/// diagnostic and MatchOperand_ParseFail.
+OperandMatchResultTy
+ARMAsmParser::parseCoprocOptionOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+
+  // If this isn't a '{', this isn't a coprocessor immediate operand.
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the '{'
+
+  const MCExpr *Expr;
+  SMLoc Loc = Parser.getTok().getLoc();
+  if (getParser().parseExpression(Expr)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+  if (!CE || CE->getValue() < 0 || CE->getValue() > 255) {
+    Error(Loc, "coprocessor option must be an immediate in range [0, 255]");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+
+  // Check for and consume the closing '}'. Emit a diagnostic when it is
+  // missing so that the parse failure is not silent.
+  if (Parser.getTok().isNot(AsmToken::RCurly)) {
+    Error(Parser.getTok().getLoc(), "'}' expected");
+    return MatchOperand_ParseFail;
+  }
+  SMLoc E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat the '}'
+
+  Operands.push_back(ARMOperand::CreateCoprocOption(Val, S, E));
+  return MatchOperand_Success;
+}
+
+// For register list parsing, we need to map from raw GPR register numbering
+// to the enumeration values. The enumeration values aren't sorted by
+// register number due to our using "sp", "lr" and "pc" as canonical names.
+//
+// Returns the register that follows Reg: for GPRs the successor in the
+// explicit order below (wrapping from PC back to R0), for everything else
+// simply Reg + 1.
+static unsigned getNextRegister(unsigned Reg) {
+  // Only GPRs need the manual mapping; the other reg-classes have a sane
+  // enumeration order.
+  const bool IsGPR = ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg);
+  if (!IsGPR)
+    return Reg + 1;
+
+  // GPRs in architectural order.
+  static const unsigned GPROrder[] = {
+    ARM::R0,  ARM::R1,  ARM::R2,  ARM::R3,
+    ARM::R4,  ARM::R5,  ARM::R6,  ARM::R7,
+    ARM::R8,  ARM::R9,  ARM::R10, ARM::R11,
+    ARM::R12, ARM::SP,  ARM::LR,  ARM::PC
+  };
+  const unsigned NumGPRs = sizeof(GPROrder) / sizeof(GPROrder[0]);
+
+  for (unsigned Idx = 0; Idx != NumGPRs; ++Idx)
+    if (GPROrder[Idx] == Reg)
+      return GPROrder[(Idx + 1) % NumGPRs];
+
+  llvm_unreachable("Invalid GPR number!");
+}
+
+// Return the low-subreg of a given Q register, i.e. Qn maps to D(2n).
+// Any argument that is not one of Q0-Q15 is a programming error.
+static unsigned getDRegFromQReg(unsigned QReg) {
+  // {Q register, its low D sub-register}
+  static const unsigned QToLowD[][2] = {
+    {ARM::Q0,  ARM::D0},  {ARM::Q1,  ARM::D2},
+    {ARM::Q2,  ARM::D4},  {ARM::Q3,  ARM::D6},
+    {ARM::Q4,  ARM::D8},  {ARM::Q5,  ARM::D10},
+    {ARM::Q6,  ARM::D12}, {ARM::Q7,  ARM::D14},
+    {ARM::Q8,  ARM::D16}, {ARM::Q9,  ARM::D18},
+    {ARM::Q10, ARM::D20}, {ARM::Q11, ARM::D22},
+    {ARM::Q12, ARM::D24}, {ARM::Q13, ARM::D26},
+    {ARM::Q14, ARM::D28}, {ARM::Q15, ARM::D30}
+  };
+
+  for (const auto &Entry : QToLowD)
+    if (Entry[0] == QReg)
+      return Entry[1];
+
+  llvm_unreachable("expected a Q register!");
+}
+
+/// Parse a '{'-delimited register list (entries separated by ',' or '-' for ranges) into one reglist operand; returns false on success.
+bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ if (Parser.getTok().isNot(AsmToken::LCurly))
+ return TokError("Token is not a Left Curly Brace");
+ SMLoc S = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat '{' token.
+ SMLoc RegLoc = Parser.getTok().getLoc();
+
+ // Check the first register in the list to see what register class
+ // this is a list of.
+ int Reg = tryParseRegister();
+ if (Reg == -1)
+ return Error(RegLoc, "register expected");
+ // Registers collects (encoding value, register) pairs; EReg is scratch for the encoding.
+ // The reglist instructions have at most 16 registers, so reserve
+ // space for that many.
+ int EReg = 0;
+ SmallVector<std::pair<unsigned, unsigned>, 16> Registers;
+
+ // Allow Q regs and just interpret them as the two D sub-registers.
+ if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+ Reg = getDRegFromQReg(Reg);
+ EReg = MRI->getEncodingValue(Reg);
+ Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ ++Reg; // second D register of the pair; it is stored by the common code below
+ }
+ const MCRegisterClass *RC; // class of the first register; all later entries must belong to it
+ if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+ RC = &ARMMCRegisterClasses[ARM::GPRRegClassID];
+ else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg))
+ RC = &ARMMCRegisterClasses[ARM::DPRRegClassID];
+ else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg))
+ RC = &ARMMCRegisterClasses[ARM::SPRRegClassID];
+ else
+ return Error(RegLoc, "invalid register in register list");
+
+ // Store the register.
+ EReg = MRI->getEncodingValue(Reg);
+ Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+
+ // This starts immediately after the first register token in the list,
+ // so we can see either a comma or a minus (range separator) as a legal
+ // next token.
+ while (Parser.getTok().is(AsmToken::Comma) ||
+ Parser.getTok().is(AsmToken::Minus)) {
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ Parser.Lex(); // Eat the minus.
+ SMLoc AfterMinusLoc = Parser.getTok().getLoc();
+ int EndReg = tryParseRegister();
+ if (EndReg == -1)
+ return Error(AfterMinusLoc, "register expected");
+ // Allow Q regs and just interpret them as the two D sub-registers.
+ if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
+ EndReg = getDRegFromQReg(EndReg) + 1; // a Q end register ends the range at its second D register
+ // If the register is the same as the start reg, there's nothing
+ // more to do.
+ if (Reg == EndReg)
+ continue;
+ // The register must be in the same register class as the first.
+ if (!RC->contains(EndReg))
+ return Error(AfterMinusLoc, "invalid register in register list");
+ // Ranges must go from low to high.
+ if (MRI->getEncodingValue(Reg) > MRI->getEncodingValue(EndReg))
+ return Error(AfterMinusLoc, "bad range in register list");
+ // Reg is the last register already stored, so the walk below starts with its successor.
+ // Add all the registers in the range to the register list.
+ while (Reg != EndReg) {
+ Reg = getNextRegister(Reg);
+ EReg = MRI->getEncodingValue(Reg);
+ Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ }
+ continue;
+ }
+ Parser.Lex(); // Eat the comma.
+ RegLoc = Parser.getTok().getLoc();
+ int OldReg = Reg; // previous register, used by the ordering and contiguity checks
+ const AsmToken RegTok = Parser.getTok(); // copied so its text is still available after the token is eaten
+ Reg = tryParseRegister();
+ if (Reg == -1)
+ return Error(RegLoc, "register expected");
+ // Allow Q regs and just interpret them as the two D sub-registers.
+ bool isQReg = false;
+ if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+ Reg = getDRegFromQReg(Reg);
+ isQReg = true;
+ }
+ // The register must be in the same register class as the first.
+ if (!RC->contains(Reg))
+ return Error(RegLoc, "invalid register in register list");
+ // List must be monotonically increasing (only a warning for GPRs, an error otherwise).
+ if (MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) {
+ if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+ Warning(RegLoc, "register list not in ascending order");
+ else
+ return Error(RegLoc, "register list not in ascending order");
+ }
+ if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) {
+ Warning(RegLoc, "duplicated register (" + RegTok.getString() +
+ ") in register list");
+ continue; // the duplicate is not added a second time
+ }
+ // VFP register lists must also be contiguous.
+ if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
+ Reg != OldReg + 1)
+ return Error(RegLoc, "non-contiguous register range");
+ EReg = MRI->getEncodingValue(Reg);
+ Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ if (isQReg) { // a Q register also contributes its second D register
+ EReg = MRI->getEncodingValue(++Reg);
+ Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ }
+ }
+ // The loop stops at the first token that is neither ',' nor '-'; only '}' is acceptable here.
+ if (Parser.getTok().isNot(AsmToken::RCurly))
+ return Error(Parser.getTok().getLoc(), "'}' expected");
+ SMLoc E = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat '}' token.
+
+ // Push the register list operand.
+ Operands.push_back(ARMOperand::CreateRegList(Registers, S, E));
+
+ // The ARM system instruction variants for LDM/STM have a '^' token here.
+ if (Parser.getTok().is(AsmToken::Caret)) {
+ Operands.push_back(ARMOperand::CreateToken("^",Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat '^' token.
+ }
+
+ return false;
+}
+
+// Helper function to parse the lane index for vector lists.
+//
+// Recognizes an optional lane specifier at the current token:
+//   (nothing)   -> LaneKind = NoLanes
+//   '[' ']'     -> LaneKind = AllLanes
+//   '[' idx ']' -> LaneKind = IndexedLane, Index = idx (0-7, '#' prefix allowed)
+// Index is always written (0 unless an index was parsed). EndLoc is only
+// updated when a closing ']' was consumed. Failures emit a diagnostic and
+// return MatchOperand_ParseFail.
+OperandMatchResultTy ARMAsmParser::
+parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) {
+  MCAsmParser &Parser = getParser();
+  Index = 0; // Always return a defined index value.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    Parser.Lex(); // Eat the '['.
+    if (Parser.getTok().is(AsmToken::RBrac)) {
+      // "Dn[]" is the 'all lanes' syntax.
+      LaneKind = AllLanes;
+      EndLoc = Parser.getTok().getEndLoc();
+      Parser.Lex(); // Eat the ']'.
+      return MatchOperand_Success;
+    }
+
+    // There's an optional '#' token here. Normally there wouldn't be, but
+    // inline assemble puts one in, and it's friendly to accept that.
+    if (Parser.getTok().is(AsmToken::Hash))
+      Parser.Lex(); // Eat the '#'.
+
+    const MCExpr *LaneIndex;
+    SMLoc Loc = Parser.getTok().getLoc();
+    if (getParser().parseExpression(LaneIndex)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LaneIndex);
+    if (!CE) {
+      Error(Loc, "lane index must be empty or an integer");
+      return MatchOperand_ParseFail;
+    }
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(Parser.getTok().getLoc(), "']' expected");
+      return MatchOperand_ParseFail;
+    }
+    EndLoc = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat the ']'.
+    int64_t Val = CE->getValue();
+
+    // FIXME: Make this range check context sensitive for .8, .16, .32.
+    if (Val < 0 || Val > 7) {
+      // Report at the index expression. The ']' has already been eaten, so
+      // the current token is whatever follows the lane specifier.
+      Error(Loc, "lane index out of range");
+      return MatchOperand_ParseFail;
+    }
+    Index = Val;
+    LaneKind = IndexedLane;
+    return MatchOperand_Success;
+  }
+  LaneKind = NoLanes;
+  return MatchOperand_Success;
+}
+
+// Parse a vector register list: a bare D/Q register (gas extension) or a '{...}' list of D/Q registers, each with an optional lane specifier.
+OperandMatchResultTy
+ARMAsmParser::parseVectorList(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ VectorLaneTy LaneKind;
+ unsigned LaneIndex;
+ SMLoc S = Parser.getTok().getLoc();
+ // As an extension (to match gas), support a plain D register or Q register
+ // (without enclosing curly braces) as a single or double entry list,
+ // respectively.
+ if (Parser.getTok().is(AsmToken::Identifier)) {
+ SMLoc E = Parser.getTok().getEndLoc();
+ int Reg = tryParseRegister();
+ if (Reg == -1)
+ return MatchOperand_NoMatch;
+ if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) {
+ OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E);
+ if (Res != MatchOperand_Success)
+ return Res;
+ switch (LaneKind) {
+ case NoLanes:
+ Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, false, S, E));
+ break;
+ case AllLanes:
+ Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 1, false,
+ S, E));
+ break;
+ case IndexedLane:
+ Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 1,
+ LaneIndex,
+ false, S, E));
+ break;
+ }
+ return MatchOperand_Success;
+ }
+ if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+ Reg = getDRegFromQReg(Reg);
+ OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E);
+ if (Res != MatchOperand_Success)
+ return Res;
+ switch (LaneKind) {
+ case NoLanes:
+ Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
+ &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+ Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, false, S, E));
+ break;
+ case AllLanes:
+ Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
+ &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+ Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 2, false,
+ S, E));
+ break;
+ case IndexedLane: // unlike the two cases above, the D register itself is used here
+ Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 2,
+ LaneIndex,
+ false, S, E));
+ break;
+ }
+ return MatchOperand_Success;
+ }
+ Error(S, "vector register expected");
+ return MatchOperand_ParseFail;
+ }
+ // Not a bare register, so the operand has to be a brace-enclosed list.
+ if (Parser.getTok().isNot(AsmToken::LCurly))
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat '{' token.
+ SMLoc RegLoc = Parser.getTok().getLoc();
+
+ int Reg = tryParseRegister();
+ if (Reg == -1) {
+ Error(RegLoc, "register expected");
+ return MatchOperand_ParseFail;
+ }
+ unsigned Count = 1; // number of D registers in the list so far
+ int Spacing = 0; // 0 = not known yet, 1 = single spaced, 2 = double spaced
+ unsigned FirstReg = Reg;
+ // The list is of D registers, but we also allow Q regs and just interpret
+ // them as the two D sub-registers.
+ if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+ FirstReg = Reg = getDRegFromQReg(Reg);
+ Spacing = 1; // double-spacing requires explicit D registers, otherwise
+ // it's ambiguous with four-register single spaced.
+ ++Reg;
+ ++Count;
+ }
+ // The lane specifier of the first entry sets LaneKind/LaneIndex; every later entry must match it.
+ SMLoc E;
+ if (parseVectorLane(LaneKind, LaneIndex, E) != MatchOperand_Success)
+ return MatchOperand_ParseFail;
+
+ while (Parser.getTok().is(AsmToken::Comma) ||
+ Parser.getTok().is(AsmToken::Minus)) {
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ if (!Spacing)
+ Spacing = 1; // Register range implies a single spaced list.
+ else if (Spacing == 2) {
+ Error(Parser.getTok().getLoc(),
+ "sequential registers in double spaced list");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat the minus.
+ SMLoc AfterMinusLoc = Parser.getTok().getLoc();
+ int EndReg = tryParseRegister();
+ if (EndReg == -1) {
+ Error(AfterMinusLoc, "register expected");
+ return MatchOperand_ParseFail;
+ }
+ // Allow Q regs and just interpret them as the two D sub-registers.
+ if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
+ EndReg = getDRegFromQReg(EndReg) + 1;
+ // If the register is the same as the start reg, there's nothing
+ // more to do.
+ if (Reg == EndReg)
+ continue;
+ // The register must be in the same register class as the first.
+ if (!ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg)) {
+ Error(AfterMinusLoc, "invalid register in register list");
+ return MatchOperand_ParseFail;
+ }
+ // Ranges must go from low to high.
+ if (Reg > EndReg) {
+ Error(AfterMinusLoc, "bad range in register list");
+ return MatchOperand_ParseFail;
+ }
+ // Parse the lane specifier if present.
+ VectorLaneTy NextLaneKind;
+ unsigned NextLaneIndex;
+ if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+ MatchOperand_Success)
+ return MatchOperand_ParseFail;
+ if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+ Error(AfterMinusLoc, "mismatched lane index in register list");
+ return MatchOperand_ParseFail;
+ }
+
+ // Add all the registers in the range to the register list.
+ Count += EndReg - Reg; // only the count is tracked; the operand records FirstReg and Count
+ Reg = EndReg;
+ continue;
+ }
+ Parser.Lex(); // Eat the comma.
+ RegLoc = Parser.getTok().getLoc();
+ int OldReg = Reg;
+ Reg = tryParseRegister();
+ if (Reg == -1) {
+ Error(RegLoc, "register expected");
+ return MatchOperand_ParseFail;
+ }
+ // vector register lists must be contiguous.
+ // It's OK to use the enumeration values directly here rather, as the
+ // VFP register classes have the enum sorted properly.
+ //
+ // The list is of D registers, but we also allow Q regs and just interpret
+ // them as the two D sub-registers.
+ if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+ if (!Spacing)
+ Spacing = 1; // A Q register implies a single spaced list.
+ else if (Spacing == 2) {
+ Error(RegLoc,
+ "invalid register in double-spaced list (must be 'D' register')");
+ return MatchOperand_ParseFail;
+ }
+ Reg = getDRegFromQReg(Reg);
+ if (Reg != OldReg + 1) {
+ Error(RegLoc, "non-contiguous register range");
+ return MatchOperand_ParseFail;
+ }
+ ++Reg;
+ Count += 2;
+ // Parse the lane specifier if present.
+ VectorLaneTy NextLaneKind;
+ unsigned NextLaneIndex;
+ SMLoc LaneLoc = Parser.getTok().getLoc();
+ if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+ MatchOperand_Success)
+ return MatchOperand_ParseFail;
+ if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+ Error(LaneLoc, "mismatched lane index in register list");
+ return MatchOperand_ParseFail;
+ }
+ continue;
+ }
+ // Normal D register.
+ // Figure out the register spacing (single or double) of the list if
+ // we don't know it already.
+ if (!Spacing)
+ Spacing = 1 + (Reg == OldReg + 2); // 2 when exactly one register is skipped, otherwise 1
+
+ // Just check that it's contiguous and keep going.
+ if (Reg != OldReg + Spacing) {
+ Error(RegLoc, "non-contiguous register range");
+ return MatchOperand_ParseFail;
+ }
+ ++Count;
+ // Parse the lane specifier if present.
+ VectorLaneTy NextLaneKind;
+ unsigned NextLaneIndex;
+ SMLoc EndLoc = Parser.getTok().getLoc(); // despite the name: start of the lane specifier, used for the diagnostic below
+ if (parseVectorLane(NextLaneKind, NextLaneIndex, E) != MatchOperand_Success)
+ return MatchOperand_ParseFail;
+ if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+ Error(EndLoc, "mismatched lane index in register list");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ if (Parser.getTok().isNot(AsmToken::RCurly)) {
+ Error(Parser.getTok().getLoc(), "'}' expected");
+ return MatchOperand_ParseFail;
+ }
+ E = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat '}' token.
+ // Build the operand variant that corresponds to the lane syntax used in the list.
+ switch (LaneKind) {
+ case NoLanes:
+ // Two-register operands have been converted to the
+ // composite register classes.
+ if (Count == 2) {
+ const MCRegisterClass *RC = (Spacing == 1) ?
+ &ARMMCRegisterClasses[ARM::DPairRegClassID] :
+ &ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
+ FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
+ }
+
+ Operands.push_back(ARMOperand::CreateVectorList(FirstReg, Count,
+ (Spacing == 2), S, E));
+ break;
+ case AllLanes:
+ // Two-register operands have been converted to the
+ // composite register classes.
+ if (Count == 2) {
+ const MCRegisterClass *RC = (Spacing == 1) ?
+ &ARMMCRegisterClasses[ARM::DPairRegClassID] :
+ &ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
+ FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
+ }
+ Operands.push_back(ARMOperand::CreateVectorListAllLanes(FirstReg, Count,
+ (Spacing == 2),
+ S, E));
+ break;
+ case IndexedLane:
+ Operands.push_back(ARMOperand::CreateVectorListIndexed(FirstReg, Count,
+ LaneIndex,
+ (Spacing == 2),
+ S, E));
+ break;
+ }
+ return MatchOperand_Success;
+}
+
+ /// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
+ ///
+ /// The operand is either a symbolic option name (compared case-insensitively)
+ /// or a constant immediate in the range [0, 15], optionally introduced by '#'
+ /// or '$'. On success the parsed option is appended to \p Operands.
+ ///
+ /// \returns MatchOperand_NoMatch, without consuming any token, when the
+ /// current token is an identifier that is not an option accepted here;
+ /// MatchOperand_ParseFail for a malformed, non-constant or out-of-range
+ /// immediate and for any other token kind; MatchOperand_Success otherwise.
+ OperandMatchResultTy
+ ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   SMLoc S = Parser.getTok().getLoc();
+   const AsmToken &Tok = Parser.getTok();
+   unsigned Opt;
+
+   if (Tok.is(AsmToken::Identifier)) {
+     StringRef OptStr = Tok.getString();
+
+     Opt = StringSwitch<unsigned>(OptStr.lower())
+       .Case("sy", ARM_MB::SY)
+       .Case("st", ARM_MB::ST)
+       .Case("ld", ARM_MB::LD)
+       .Case("sh", ARM_MB::ISH)
+       .Case("ish", ARM_MB::ISH)
+       .Case("shst", ARM_MB::ISHST)
+       .Case("ishst", ARM_MB::ISHST)
+       .Case("ishld", ARM_MB::ISHLD)
+       .Case("nsh", ARM_MB::NSH)
+       .Case("un", ARM_MB::NSH)
+       .Case("nshst", ARM_MB::NSHST)
+       .Case("nshld", ARM_MB::NSHLD)
+       .Case("unst", ARM_MB::NSHST)
+       .Case("osh", ARM_MB::OSH)
+       .Case("oshst", ARM_MB::OSHST)
+       .Case("oshld", ARM_MB::OSHLD)
+       .Default(~0U);
+
+     // ishld, oshld, nshld and ld are only available from ARMv8.
+     if (!hasV8Ops() && (Opt == ARM_MB::ISHLD || Opt == ARM_MB::OSHLD ||
+                         Opt == ARM_MB::NSHLD || Opt == ARM_MB::LD))
+       Opt = ~0U;
+
+     if (Opt == ~0U)
+       return MatchOperand_NoMatch;
+
+     Parser.Lex(); // Eat identifier token.
+   } else if (Tok.is(AsmToken::Hash) ||
+              Tok.is(AsmToken::Dollar) ||
+              Tok.is(AsmToken::Integer)) {
+     if (Parser.getTok().isNot(AsmToken::Integer))
+       Parser.Lex(); // Eat '#' or '$'.
+     SMLoc Loc = Parser.getTok().getLoc();
+
+     const MCExpr *MemBarrierID;
+     if (getParser().parseExpression(MemBarrierID)) {
+       Error(Loc, "illegal expression");
+       return MatchOperand_ParseFail;
+     }
+
+     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(MemBarrierID);
+     if (!CE) {
+       Error(Loc, "constant expression expected");
+       return MatchOperand_ParseFail;
+     }
+
+     // Range-check the full 64-bit value. Narrowing it to 'int' first would
+     // let constants whose low 32 bits happen to lie in [0, 15] slip through.
+     int64_t Val = CE->getValue();
+     if (Val < 0 || Val > 15) {
+       Error(Loc, "immediate value out of range");
+       return MatchOperand_ParseFail;
+     }
+
+     Opt = ARM_MB::RESERVED_0 + static_cast<unsigned>(Val);
+   } else
+     return MatchOperand_ParseFail;
+
+   Operands.push_back(ARMOperand::CreateMemBarrierOpt((ARM_MB::MemBOpt)Opt, S));
+   return MatchOperand_Success;
+ }
+
+ /// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options.
+ ///
+ /// The operand is either the identifier "sy" (compared case-insensitively) or
+ /// a constant immediate in the range [0, 15], optionally introduced by '#' or
+ /// '$'. On success the parsed option is appended to \p Operands.
+ ///
+ /// \returns MatchOperand_NoMatch, without consuming any token, for any other
+ /// identifier; MatchOperand_ParseFail for a malformed, non-constant or
+ /// out-of-range immediate and for any other token kind; MatchOperand_Success
+ /// otherwise.
+ OperandMatchResultTy
+ ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   SMLoc S = Parser.getTok().getLoc();
+   const AsmToken &Tok = Parser.getTok();
+   unsigned Opt;
+
+   if (Tok.is(AsmToken::Identifier)) {
+     StringRef OptStr = Tok.getString();
+
+     if (OptStr.equals_lower("sy"))
+       Opt = ARM_ISB::SY;
+     else
+       return MatchOperand_NoMatch;
+
+     Parser.Lex(); // Eat identifier token.
+   } else if (Tok.is(AsmToken::Hash) ||
+              Tok.is(AsmToken::Dollar) ||
+              Tok.is(AsmToken::Integer)) {
+     if (Parser.getTok().isNot(AsmToken::Integer))
+       Parser.Lex(); // Eat '#' or '$'.
+     SMLoc Loc = Parser.getTok().getLoc();
+
+     const MCExpr *ISBarrierID;
+     if (getParser().parseExpression(ISBarrierID)) {
+       Error(Loc, "illegal expression");
+       return MatchOperand_ParseFail;
+     }
+
+     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ISBarrierID);
+     if (!CE) {
+       Error(Loc, "constant expression expected");
+       return MatchOperand_ParseFail;
+     }
+
+     // Range-check the full 64-bit value. Narrowing it to 'int' first would
+     // let constants whose low 32 bits happen to lie in [0, 15] slip through.
+     int64_t Val = CE->getValue();
+     if (Val < 0 || Val > 15) {
+       Error(Loc, "immediate value out of range");
+       return MatchOperand_ParseFail;
+     }
+
+     Opt = ARM_ISB::RESERVED_0 + static_cast<unsigned>(Val);
+   } else
+     return MatchOperand_ParseFail;
+
+   Operands.push_back(ARMOperand::CreateInstSyncBarrierOpt(
+       (ARM_ISB::InstSyncBOpt)Opt, S));
+   return MatchOperand_Success;
+ }
+
+
+ /// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
+ ///
+ /// The operand is a single identifier: either the literal "none" or a
+ /// sequence made only of the letters 'a', 'i' and 'f', each used at most once.
+ /// Any other token or spelling yields MatchOperand_NoMatch and leaves the
+ /// token stream untouched.
+ OperandMatchResultTy
+ ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   const SMLoc StartLoc = Parser.getTok().getLoc();
+   const AsmToken &Tok = Parser.getTok();
+   if (Tok.isNot(AsmToken::Identifier))
+     return MatchOperand_NoMatch;
+   const StringRef IFlagsStr = Tok.getString();
+
+   // "none" means that none of the AIF bits are set. Not a terribly useful
+   // instruction, but a valid encoding.
+   unsigned IFlags = 0;
+   if (IFlagsStr != "none") {
+     for (char Letter : IFlagsStr) {
+       unsigned Bit;
+       switch (Letter) {
+       case 'a': Bit = ARM_PROC::A; break;
+       case 'i': Bit = ARM_PROC::I; break;
+       case 'f': Bit = ARM_PROC::F; break;
+       default:
+         // Not an iflag letter at all.
+         return MatchOperand_NoMatch;
+       }
+
+       // A bit that is already set means the letter was repeated, which is
+       // not acceptable.
+       if (IFlags & Bit)
+         return MatchOperand_NoMatch;
+
+       IFlags |= Bit;
+     }
+   }
+
+   Parser.Lex(); // Eat identifier token.
+   Operands.push_back(
+       ARMOperand::CreateProcIFlags((ARM_PROC::IFlags)IFlags, StartLoc));
+   return MatchOperand_Success;
+ }
+
+ /// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
+ ///
+ /// For M-class targets the whole identifier (compared case-insensitively) is
+ /// looked up in a table of special-register names, and the result is then
+ /// filtered by the features the target provides. For other targets the
+ /// identifier is split at the first '_' into a special register (apsr, cpsr or
+ /// spsr, compared case-insensitively) and a case-sensitive flag suffix.
+ ///
+ /// \returns MatchOperand_NoMatch, without consuming any token, when the
+ /// current token is not an identifier or does not spell a mask accepted here;
+ /// MatchOperand_Success after appending the mask operand to \p Operands.
+ OperandMatchResultTy
+ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   SMLoc S = Parser.getTok().getLoc();
+   const AsmToken &Tok = Parser.getTok();
+   if (!Tok.is(AsmToken::Identifier))
+     return MatchOperand_NoMatch;
+   StringRef Mask = Tok.getString();
+
+   if (isMClass()) {
+     // See ARMv6-M 10.1.1
+     std::string Name = Mask.lower();
+     unsigned FlagsVal = StringSwitch<unsigned>(Name)
+       // Note: in the documentation:
+       // ARM deprecates using MSR APSR without a _<bits> qualifier as an alias
+       // for MSR APSR_nzcvq.
+       // but we do make it an alias here. This is so to get the "mask encoding"
+       // bits correct on MSR APSR writes.
+       //
+       // FIXME: Note the 0xc00 "mask encoding" bits version of the registers
+       // should really only be allowed when writing a special register. Note
+       // they get dropped in the MRS instruction reading a special register as
+       // the SYSm field is only 8 bits.
+       .Case("apsr", 0x800)
+       .Case("apsr_nzcvq", 0x800)
+       .Case("apsr_g", 0x400)
+       .Case("apsr_nzcvqg", 0xc00)
+       .Case("iapsr", 0x801)
+       .Case("iapsr_nzcvq", 0x801)
+       .Case("iapsr_g", 0x401)
+       .Case("iapsr_nzcvqg", 0xc01)
+       .Case("eapsr", 0x802)
+       .Case("eapsr_nzcvq", 0x802)
+       .Case("eapsr_g", 0x402)
+       .Case("eapsr_nzcvqg", 0xc02)
+       .Case("xpsr", 0x803)
+       .Case("xpsr_nzcvq", 0x803)
+       .Case("xpsr_g", 0x403)
+       .Case("xpsr_nzcvqg", 0xc03)
+       .Case("ipsr", 0x805)
+       .Case("epsr", 0x806)
+       .Case("iepsr", 0x807)
+       .Case("msp", 0x808)
+       .Case("psp", 0x809)
+       .Case("primask", 0x810)
+       .Case("basepri", 0x811)
+       .Case("basepri_max", 0x812)
+       .Case("faultmask", 0x813)
+       .Case("control", 0x814)
+       .Case("msplim", 0x80a)
+       .Case("psplim", 0x80b)
+       .Case("msp_ns", 0x888)
+       .Case("psp_ns", 0x889)
+       .Case("msplim_ns", 0x88a)
+       .Case("psplim_ns", 0x88b)
+       .Case("primask_ns", 0x890)
+       .Case("basepri_ns", 0x891)
+       .Case("basepri_max_ns", 0x892)
+       .Case("faultmask_ns", 0x893)
+       .Case("control_ns", 0x894)
+       .Case("sp_ns", 0x898)
+       .Default(~0U);
+
+     if (FlagsVal == ~0U)
+       return MatchOperand_NoMatch;
+
+     if (!hasDSP() && (FlagsVal & 0x400))
+       // The _g and _nzcvqg versions are only valid if the DSP extension is
+       // available.
+       return MatchOperand_NoMatch;
+
+     if (!hasV7Ops() && FlagsVal >= 0x811 && FlagsVal <= 0x813)
+       // basepri, basepri_max and faultmask only valid for V7m.
+       return MatchOperand_NoMatch;
+
+     // msplim, psplim and every encoding above "control" (the *_ns names)
+     // are rejected unless has8MSecExt() holds.
+     if (!has8MSecExt() && (FlagsVal == 0x80a || FlagsVal == 0x80b ||
+                            (FlagsVal > 0x814 && FlagsVal < 0xc00)))
+       return MatchOperand_NoMatch;
+
+     // msplim_ns, psplim_ns, basepri_ns, basepri_max_ns and faultmask_ns are
+     // additionally rejected unless hasV8MMainline() holds.
+     if (!hasV8MMainline() && (FlagsVal == 0x88a || FlagsVal == 0x88b ||
+                               (FlagsVal > 0x890 && FlagsVal <= 0x893)))
+       return MatchOperand_NoMatch;
+
+     Parser.Lex(); // Eat identifier token.
+     Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+     return MatchOperand_Success;
+   }
+
+   // Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf"
+   size_t Start = 0, Next = Mask.find('_');
+   StringRef Flags = "";
+   std::string SpecReg = Mask.slice(Start, Next).lower();
+   if (Next != StringRef::npos)
+     Flags = Mask.slice(Next+1, Mask.size());
+
+   // FlagsVal contains the complete mask:
+   // 3-0: Mask
+   // 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+   unsigned FlagsVal = 0;
+
+   if (SpecReg == "apsr") {
+     FlagsVal = StringSwitch<unsigned>(Flags)
+       .Case("nzcvq", 0x8) // same as CPSR_f
+       .Case("g", 0x4) // same as CPSR_s
+       .Case("nzcvqg", 0xc) // same as CPSR_fs
+       .Default(~0U);
+
+     if (FlagsVal == ~0U) {
+       if (!Flags.empty())
+         return MatchOperand_NoMatch;
+       else
+         FlagsVal = 8; // No flag
+     }
+   } else if (SpecReg == "cpsr" || SpecReg == "spsr") {
+     // cpsr_all is an alias for cpsr_fc, as is plain cpsr.
+     if (Flags == "all" || Flags == "")
+       Flags = "fc";
+     for (int i = 0, e = Flags.size(); i != e; ++i) {
+       unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1))
+         .Case("c", 1)
+         .Case("x", 2)
+         .Case("s", 4)
+         .Case("f", 8)
+         .Default(~0U);
+
+       // Reject letters that are not mask flags at all. The test has to be on
+       // Flag, not on the accumulated FlagsVal: otherwise a lone unknown
+       // letter would be OR'ed in and accepted as an all-ones mask.
+       // If some specific flag is already set, it means that some letter is
+       // present more than once, this is not acceptable.
+       if (Flag == ~0U || (FlagsVal & Flag))
+         return MatchOperand_NoMatch;
+       FlagsVal |= Flag;
+     }
+   } else // No match for special register.
+     return MatchOperand_NoMatch;
+
+   // Special register without flags is NOT equivalent to "fc" flags.
+   // NOTE: This is a divergence from gas' behavior. Uncommenting the following
+   // two lines would enable gas compatibility at the expense of breaking
+   // round-tripping.
+   //
+   // if (!FlagsVal)
+   // FlagsVal = 0x9;
+
+   // Bit 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+   if (SpecReg == "spsr")
+     FlagsVal |= 16;
+
+   Parser.Lex(); // Eat identifier token.
+   Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+   return MatchOperand_Success;
+ }
+
+ /// parseBankedRegOperand - Try to parse a banked register (e.g. "lr_irq") for
+ /// use in the MRS/MSR instructions added to support virtualization.
+ ///
+ /// The current token must be an identifier naming one of the banked registers
+ /// listed below (compared case-insensitively); otherwise MatchOperand_NoMatch
+ /// is returned and no token is consumed.
+ OperandMatchResultTy
+ ARMAsmParser::parseBankedRegOperand(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   const SMLoc StartLoc = Parser.getTok().getLoc();
+   const AsmToken &Tok = Parser.getTok();
+   if (Tok.isNot(AsmToken::Identifier))
+     return MatchOperand_NoMatch;
+
+   // Normalise the spelling once so the table only needs lower-case entries.
+   const std::string LowerName = Tok.getString().lower();
+
+   // The values here come from B9.2.3 of the ARM ARM, where bits 4-0 are SysM
+   // and bit 5 is R.
+   const unsigned Encoding = StringSwitch<unsigned>(LowerName)
+     .Case("r8_usr", 0x00)
+     .Case("r9_usr", 0x01)
+     .Case("r10_usr", 0x02)
+     .Case("r11_usr", 0x03)
+     .Case("r12_usr", 0x04)
+     .Case("sp_usr", 0x05)
+     .Case("lr_usr", 0x06)
+     .Case("r8_fiq", 0x08)
+     .Case("r9_fiq", 0x09)
+     .Case("r10_fiq", 0x0a)
+     .Case("r11_fiq", 0x0b)
+     .Case("r12_fiq", 0x0c)
+     .Case("sp_fiq", 0x0d)
+     .Case("lr_fiq", 0x0e)
+     .Case("lr_irq", 0x10)
+     .Case("sp_irq", 0x11)
+     .Case("lr_svc", 0x12)
+     .Case("sp_svc", 0x13)
+     .Case("lr_abt", 0x14)
+     .Case("sp_abt", 0x15)
+     .Case("lr_und", 0x16)
+     .Case("sp_und", 0x17)
+     .Case("lr_mon", 0x1c)
+     .Case("sp_mon", 0x1d)
+     .Case("elr_hyp", 0x1e)
+     .Case("sp_hyp", 0x1f)
+     .Case("spsr_fiq", 0x2e)
+     .Case("spsr_irq", 0x30)
+     .Case("spsr_svc", 0x32)
+     .Case("spsr_abt", 0x34)
+     .Case("spsr_und", 0x36)
+     .Case("spsr_mon", 0x3c)
+     .Case("spsr_hyp", 0x3e)
+     .Default(~0U);
+
+   // ~0U is the "not in the table" sentinel.
+   if (Encoding == ~0U)
+     return MatchOperand_NoMatch;
+
+   Parser.Lex(); // Eat identifier token.
+   Operands.push_back(ARMOperand::CreateBankedReg(Encoding, StartLoc));
+   return MatchOperand_Success;
+ }
+
+ /// parsePKHImm - Parse a shift operand of the form "<Op> #<amount>".
+ ///
+ /// \param Operands  Receives the shift amount as an immediate operand.
+ /// \param Op        Expected shift operator; accepted only when spelled
+ ///                  entirely in lower case or entirely in upper case.
+ /// \param Low       Smallest accepted shift amount (inclusive).
+ /// \param High      Largest accepted shift amount (inclusive).
+ ///
+ /// \returns MatchOperand_Success on success; otherwise a diagnostic is emitted
+ /// and MatchOperand_ParseFail is returned.
+ OperandMatchResultTy
+ ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op, int Low,
+                           int High) {
+   MCAsmParser &Parser = getParser();
+   const AsmToken &Tok = Parser.getTok();
+   if (Tok.isNot(AsmToken::Identifier)) {
+     Error(Parser.getTok().getLoc(), Op + " operand expected.");
+     return MatchOperand_ParseFail;
+   }
+   StringRef ShiftName = Tok.getString();
+   std::string LowerOp = Op.lower();
+   std::string UpperOp = Op.upper();
+   if (ShiftName != LowerOp && ShiftName != UpperOp) {
+     Error(Parser.getTok().getLoc(), Op + " operand expected.");
+     return MatchOperand_ParseFail;
+   }
+   Parser.Lex(); // Eat shift type token.
+
+   // There must be a '#' and a shift amount.
+   if (Parser.getTok().isNot(AsmToken::Hash) &&
+       Parser.getTok().isNot(AsmToken::Dollar)) {
+     Error(Parser.getTok().getLoc(), "'#' expected");
+     return MatchOperand_ParseFail;
+   }
+   Parser.Lex(); // Eat hash token.
+
+   const MCExpr *ShiftAmount;
+   SMLoc Loc = Parser.getTok().getLoc();
+   SMLoc EndLoc;
+   if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+     Error(Loc, "illegal expression");
+     return MatchOperand_ParseFail;
+   }
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+   if (!CE) {
+     Error(Loc, "constant expression expected");
+     return MatchOperand_ParseFail;
+   }
+   // Compare the full 64-bit value: the operand created below carries CE
+   // itself, so checking a value narrowed to 'int' could accept a constant
+   // that is actually far outside [Low, High].
+   int64_t Val = CE->getValue();
+   if (Val < Low || Val > High) {
+     Error(Loc, "immediate value out of range");
+     return MatchOperand_ParseFail;
+   }
+
+   Operands.push_back(ARMOperand::CreateImm(CE, Loc, EndLoc));
+
+   return MatchOperand_Success;
+ }
+
+ /// parseSetEndImm - Parse an endianness operand: the identifier "be" or "le"
+ /// (compared case-insensitively).
+ ///
+ /// On success an immediate operand with value 1 for "be" or 0 for "le" is
+ /// appended to \p Operands. Any other token produces a diagnostic and
+ /// MatchOperand_ParseFail.
+ OperandMatchResultTy
+ ARMAsmParser::parseSetEndImm(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   const AsmToken &Tok = Parser.getTok();
+   SMLoc S = Tok.getLoc();
+   if (Tok.isNot(AsmToken::Identifier)) {
+     Error(S, "'be' or 'le' operand expected");
+     return MatchOperand_ParseFail;
+   }
+   int Val = StringSwitch<int>(Tok.getString().lower())
+     .Case("be", 1)
+     .Case("le", 0)
+     .Default(-1);
+   // Tok is a reference to the parser's current token, so it no longer
+   // describes this identifier once Lex() has advanced. Record the end
+   // location now so the operand's source range covers the right token.
+   SMLoc E = Tok.getEndLoc();
+   Parser.Lex(); // Eat the token.
+
+   if (Val == -1) {
+     Error(S, "'be' or 'le' operand expected");
+     return MatchOperand_ParseFail;
+   }
+   Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::create(Val,
+                                                                   getContext()),
+                                            S, E));
+   return MatchOperand_Success;
+ }
+
+ /// parseShifterImm - Parse the shifter immediate operand for SSAT/USAT
+ /// instructions. Legal values are:
+ /// lsl #n 'n' in [0,31]
+ /// asr #n 'n' in [1,32]
+ /// n == 32 encoded as n == 0.
+ ///
+ /// The shift operator is accepted only when spelled entirely in lower case or
+ /// entirely in upper case. Every failure emits a diagnostic and returns
+ /// MatchOperand_ParseFail.
+ OperandMatchResultTy
+ ARMAsmParser::parseShifterImm(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   const AsmToken &Tok = Parser.getTok();
+   SMLoc S = Tok.getLoc();
+   if (Tok.isNot(AsmToken::Identifier)) {
+     Error(S, "shift operator 'asr' or 'lsl' expected");
+     return MatchOperand_ParseFail;
+   }
+   StringRef ShiftName = Tok.getString();
+   bool isASR;
+   if (ShiftName == "lsl" || ShiftName == "LSL")
+     isASR = false;
+   else if (ShiftName == "asr" || ShiftName == "ASR")
+     isASR = true;
+   else {
+     Error(S, "shift operator 'asr' or 'lsl' expected");
+     return MatchOperand_ParseFail;
+   }
+   Parser.Lex(); // Eat the operator.
+
+   // A '#' and a shift amount.
+   if (Parser.getTok().isNot(AsmToken::Hash) &&
+       Parser.getTok().isNot(AsmToken::Dollar)) {
+     Error(Parser.getTok().getLoc(), "'#' expected");
+     return MatchOperand_ParseFail;
+   }
+   Parser.Lex(); // Eat hash token.
+   SMLoc ExLoc = Parser.getTok().getLoc();
+
+   const MCExpr *ShiftAmount;
+   SMLoc EndLoc;
+   if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+     Error(ExLoc, "malformed shift expression");
+     return MatchOperand_ParseFail;
+   }
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+   if (!CE) {
+     Error(ExLoc, "shift amount must be an immediate");
+     return MatchOperand_ParseFail;
+   }
+
+   int64_t Val = CE->getValue();
+   if (isASR) {
+     // Shift amount must be in [1,32]
+     if (Val < 1 || Val > 32) {
+       Error(ExLoc, "'asr' shift amount must be in range [1,32]");
+       return MatchOperand_ParseFail;
+     }
+     // asr #32 encoded as asr #0, but is not allowed in Thumb2 mode.
+     if (isThumb() && Val == 32) {
+       Error(ExLoc, "'asr #32' shift amount not allowed in Thumb mode");
+       return MatchOperand_ParseFail;
+     }
+     if (Val == 32) Val = 0;
+   } else {
+     // Shift amount must be in [0,31]. This branch handles 'lsl', so the
+     // diagnostic names that operator.
+     if (Val < 0 || Val > 31) {
+       Error(ExLoc, "'lsl' shift amount must be in range [0,31]");
+       return MatchOperand_ParseFail;
+     }
+   }
+
+   Operands.push_back(ARMOperand::CreateShifterImm(isASR, Val, S, EndLoc));
+
+   return MatchOperand_Success;
+ }
+
+ /// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family
+ /// of instructions. Legal values are:
+ /// ror #n 'n' in {0, 8, 16, 24}
+ ///
+ /// Returns MatchOperand_NoMatch, without consuming tokens, unless the current
+ /// token is the identifier "ror" or "ROR". Once the operator has been eaten,
+ /// every failure is reported and yields MatchOperand_ParseFail.
+ OperandMatchResultTy
+ ARMAsmParser::parseRotImm(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   const AsmToken &Tok = Parser.getTok();
+   const SMLoc StartLoc = Tok.getLoc();
+   if (!Tok.is(AsmToken::Identifier))
+     return MatchOperand_NoMatch;
+   const StringRef OperatorName = Tok.getString();
+   const bool IsRor = OperatorName == "ror" || OperatorName == "ROR";
+   if (!IsRor)
+     return MatchOperand_NoMatch;
+   Parser.Lex(); // Eat the operator.
+
+   // The rotate amount must be introduced by '#' (or '$').
+   const bool HasImmPrefix = Parser.getTok().is(AsmToken::Hash) ||
+                             Parser.getTok().is(AsmToken::Dollar);
+   if (!HasImmPrefix) {
+     Error(Parser.getTok().getLoc(), "'#' expected");
+     return MatchOperand_ParseFail;
+   }
+   Parser.Lex(); // Eat hash token.
+   const SMLoc AmountLoc = Parser.getTok().getLoc();
+
+   const MCExpr *AmountExpr;
+   SMLoc EndLoc;
+   if (getParser().parseExpression(AmountExpr, EndLoc)) {
+     Error(AmountLoc, "malformed rotate expression");
+     return MatchOperand_ParseFail;
+   }
+   const MCConstantExpr *Amount = dyn_cast<MCConstantExpr>(AmountExpr);
+   if (!Amount) {
+     Error(AmountLoc, "rotate amount must be an immediate");
+     return MatchOperand_ParseFail;
+   }
+
+   // Zero is an undocumented extension; normally a zero rotation is written
+   // in asm by omitting the rotate operand entirely.
+   const int64_t Val = Amount->getValue();
+   switch (Val) {
+   case 0:
+   case 8:
+   case 16:
+   case 24:
+     break;
+   default:
+     Error(AmountLoc, "'ror' rotate amount must be 8, 16, or 24");
+     return MatchOperand_ParseFail;
+   }
+
+   Operands.push_back(ARMOperand::CreateRotImm(Val, StartLoc, EndLoc));
+
+   return MatchOperand_Success;
+ }
+
+ /// parseModImm - Parse a modified-immediate operand.
+ ///
+ /// Two spellings are handled: a single expression, or an explicit
+ /// "#bits, #rot" pair. The leading '#' (or '$') is optional in both. A single
+ /// expression that ends the statement becomes a mod_imm operand when it is a
+ /// constant that ARM_AM::getSOImmVal can encode and a plain immediate operand
+ /// otherwise; a non-constant expression always becomes a plain immediate.
+ ///
+ /// Returns MatchOperand_NoMatch, without consuming tokens, when the operand
+ /// starts with an identifier, with ':' or with '#' followed by ':'.
+ OperandMatchResultTy
+ ARMAsmParser::parseModImm(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   MCAsmLexer &Lexer = getLexer();
+
+   const SMLoc S = Parser.getTok().getLoc();
+
+   // 1) A mod_imm operand can appear in the place of a register name:
+   // add r0, #mod_imm
+   // add r0, r0, #mod_imm
+   // to correctly handle the latter, we bail out as soon as we see an
+   // identifier.
+   //
+   // 2) Similarly, we do not want to parse into complex operands:
+   // mov r0, #mod_imm
+   // mov r0, :lower16:(_foo)
+   if (Parser.getTok().is(AsmToken::Identifier) ||
+       Parser.getTok().is(AsmToken::Colon))
+     return MatchOperand_NoMatch;
+
+   // Hash (dollar) is optional as per the ARMARM
+   if (Parser.getTok().is(AsmToken::Hash) ||
+       Parser.getTok().is(AsmToken::Dollar)) {
+     // Avoid parsing into complex operands (#:)
+     if (Lexer.peekTok().is(AsmToken::Colon))
+       return MatchOperand_NoMatch;
+
+     // Eat the hash (dollar)
+     Parser.Lex();
+   }
+
+   const SMLoc Sx1 = Parser.getTok().getLoc();
+   SMLoc Ex1;
+   const MCExpr *Imm1Exp;
+   if (getParser().parseExpression(Imm1Exp, Ex1)) {
+     Error(Sx1, "malformed expression");
+     return MatchOperand_ParseFail;
+   }
+
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm1Exp);
+   if (!CE) {
+     // Operands like #(l1 - l2) can only be evaluated at a later stage (via an
+     // MCFixup). Fallback to a plain immediate.
+     Operands.push_back(ARMOperand::CreateImm(Imm1Exp, Sx1, Ex1));
+     return MatchOperand_Success;
+   }
+
+   const int64_t Imm1 = CE->getValue();
+   if (Parser.getTok().is(AsmToken::EndOfStatement)) {
+     const int Enc = ARM_AM::getSOImmVal(Imm1);
+     if (Enc != -1) {
+       // We have a match!
+       Operands.push_back(ARMOperand::CreateModImm((Enc & 0xFF),
+                                                   (Enc & 0xF00) >> 7,
+                                                   Sx1, Ex1));
+     } else {
+       // We have parsed an immediate which is not for us, fallback to a plain
+       // immediate. This can happen for instruction aliases. For an example,
+       // ARMInstrInfo.td defines the alias [mov <-> mvn] which can transform
+       // a mov (mvn) with a mod_imm_neg/mod_imm_not operand into the opposite
+       // instruction with a mod_imm operand. The alias is defined such that
+       // the parser method is shared, that's why we have to do this here.
+       Operands.push_back(ARMOperand::CreateImm(Imm1Exp, Sx1, Ex1));
+     }
+     return MatchOperand_Success;
+   }
+
+   // From this point onward, we expect the input to be a (#bits, #rot) pair
+   if (!Parser.getTok().is(AsmToken::Comma)) {
+     Error(Sx1, "expected modified immediate operand: #[0, 255], #even[0-30]");
+     return MatchOperand_ParseFail;
+   }
+
+   if (Imm1 & ~0xFF) {
+     Error(Sx1, "immediate operand must a number in the range [0, 255]");
+     return MatchOperand_ParseFail;
+   }
+
+   // Eat the comma
+   Parser.Lex();
+
+   // Repeat for #rot
+   const SMLoc Sx2 = Parser.getTok().getLoc();
+   SMLoc Ex2;
+
+   // Eat the optional hash (dollar)
+   if (Parser.getTok().is(AsmToken::Hash) ||
+       Parser.getTok().is(AsmToken::Dollar))
+     Parser.Lex();
+
+   const MCExpr *Imm2Exp;
+   if (getParser().parseExpression(Imm2Exp, Ex2)) {
+     Error(Sx2, "malformed expression");
+     return MatchOperand_ParseFail;
+   }
+
+   CE = dyn_cast<MCConstantExpr>(Imm2Exp);
+   if (!CE) {
+     Error(Sx2, "constant expression expected");
+     return MatchOperand_ParseFail;
+   }
+
+   // Only the bits of an even value in [0, 30] may be set.
+   const int64_t Imm2 = CE->getValue();
+   if (Imm2 & ~0x1E) {
+     Error(Sx2, "immediate operand must an even number in the range [0, 30]");
+     return MatchOperand_ParseFail;
+   }
+
+   // We have a match!
+   Operands.push_back(ARMOperand::CreateModImm(Imm1, Imm2, S, Ex2));
+   return MatchOperand_Success;
+ }
+
+ /// parseBitfield - Parse a bitfield descriptor of the form "#lsb, #width".
+ ///
+ /// Both parts must be constant expressions introduced by '#' (or '$'), with
+ /// lsb in [0,31] and width in [1,32-lsb]. On success a single bitfield operand
+ /// covering both values is appended to \p Operands. Every failure emits a
+ /// diagnostic and returns MatchOperand_ParseFail.
+ OperandMatchResultTy
+ ARMAsmParser::parseBitfield(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   SMLoc S = Parser.getTok().getLoc();
+   // The bitfield descriptor is really two operands, the LSB and the width.
+   if (Parser.getTok().isNot(AsmToken::Hash) &&
+       Parser.getTok().isNot(AsmToken::Dollar)) {
+     Error(Parser.getTok().getLoc(), "'#' expected");
+     return MatchOperand_ParseFail;
+   }
+   Parser.Lex(); // Eat hash token.
+
+   const MCExpr *LSBExpr;
+   SMLoc LSBLoc = Parser.getTok().getLoc();
+   if (getParser().parseExpression(LSBExpr)) {
+     Error(LSBLoc, "malformed immediate expression");
+     return MatchOperand_ParseFail;
+   }
+   const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LSBExpr);
+   if (!CE) {
+     Error(LSBLoc, "'lsb' operand must be an immediate");
+     return MatchOperand_ParseFail;
+   }
+
+   int64_t LSB = CE->getValue();
+   // The LSB must be in the range [0,31]
+   if (LSB < 0 || LSB > 31) {
+     Error(LSBLoc, "'lsb' operand must be in the range [0,31]");
+     return MatchOperand_ParseFail;
+   }
+
+   // Expect another immediate operand.
+   if (Parser.getTok().isNot(AsmToken::Comma)) {
+     Error(Parser.getTok().getLoc(), "too few operands");
+     return MatchOperand_ParseFail;
+   }
+   Parser.Lex(); // Eat the comma.
+   if (Parser.getTok().isNot(AsmToken::Hash) &&
+       Parser.getTok().isNot(AsmToken::Dollar)) {
+     Error(Parser.getTok().getLoc(), "'#' expected");
+     return MatchOperand_ParseFail;
+   }
+   Parser.Lex(); // Eat hash token.
+
+   const MCExpr *WidthExpr;
+   // Report width problems at the width expression itself rather than at the
+   // comma that precedes it.
+   SMLoc WidthLoc = Parser.getTok().getLoc();
+   SMLoc EndLoc;
+   if (getParser().parseExpression(WidthExpr, EndLoc)) {
+     Error(WidthLoc, "malformed immediate expression");
+     return MatchOperand_ParseFail;
+   }
+   CE = dyn_cast<MCConstantExpr>(WidthExpr);
+   if (!CE) {
+     Error(WidthLoc, "'width' operand must be an immediate");
+     return MatchOperand_ParseFail;
+   }
+
+   int64_t Width = CE->getValue();
+   // The width must be in the range [1,32-lsb]
+   if (Width < 1 || Width > 32 - LSB) {
+     Error(WidthLoc, "'width' operand must be in the range [1,32-lsb]");
+     return MatchOperand_ParseFail;
+   }
+
+   Operands.push_back(ARMOperand::CreateBitfield(LSB, Width, S, EndLoc));
+
+   return MatchOperand_Success;
+ }
+
+ /// parsePostIdxReg - Parse a post-index addressing register operand:
+ ///   postidx_reg := '+' register {, shift}
+ ///                | '-' register {, shift}
+ ///                | register {, shift}
+ ///
+ /// This method must return MatchOperand_NoMatch without consuming any tokens
+ /// in the case where there is no match, as other alternatives take other
+ /// parse methods. Once a sign has been eaten, a missing register is a hard
+ /// error.
+ OperandMatchResultTy
+ ARMAsmParser::parsePostIdxReg(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   const AsmToken FirstTok = Parser.getTok();
+   const SMLoc StartLoc = FirstTok.getLoc();
+
+   // An optional leading sign selects between adding and subtracting.
+   const bool SawPlus = FirstTok.is(AsmToken::Plus);
+   const bool SawMinus = !SawPlus && FirstTok.is(AsmToken::Minus);
+   const bool haveEaten = SawPlus || SawMinus;
+   const bool isAdd = !SawMinus;
+   if (haveEaten)
+     Parser.Lex(); // Eat the '+' or '-' token.
+
+   SMLoc EndLoc = Parser.getTok().getEndLoc();
+   const int Reg = tryParseRegister();
+   if (Reg == -1) {
+     if (haveEaten) {
+       Error(Parser.getTok().getLoc(), "register expected");
+       return MatchOperand_ParseFail;
+     }
+     return MatchOperand_NoMatch;
+   }
+
+   ARM_AM::ShiftOpc ShiftTy = ARM_AM::no_shift;
+   unsigned ShiftImm = 0;
+   if (Parser.getTok().is(AsmToken::Comma)) {
+     Parser.Lex(); // Eat the ','.
+     if (parseMemRegOffsetShift(ShiftTy, ShiftImm))
+       return MatchOperand_ParseFail;
+
+     // FIXME: Only approximates end...may include intervening whitespace.
+     EndLoc = Parser.getTok().getLoc();
+   }
+
+   Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ShiftTy,
+                                                   ShiftImm, StartLoc, EndLoc));
+
+   return MatchOperand_Success;
+ }
+
+ /// parseAM3Offset - Parse a post-index offset that is either a register or an
+ /// immediate:
+ ///   am3offset := '+' register
+ ///              | '-' register
+ ///              | register
+ ///              | # imm
+ ///              | # + imm
+ ///              | # - imm
+ ///
+ /// This method must return MatchOperand_NoMatch without consuming any tokens
+ /// in the case where there is no match, as other alternatives take other
+ /// parse methods.
+ OperandMatchResultTy
+ ARMAsmParser::parseAM3Offset(OperandVector &Operands) {
+   MCAsmParser &Parser = getParser();
+   const AsmToken FirstTok = Parser.getTok();
+   const SMLoc StartLoc = FirstTok.getLoc();
+
+   // Do immediates first, as we always parse those if we have a '#'.
+   if (Parser.getTok().is(AsmToken::Hash) ||
+       Parser.getTok().is(AsmToken::Dollar)) {
+     Parser.Lex(); // Eat '#' or '$'.
+     // Explicitly look for a '-', as we need to encode negative zero
+     // differently.
+     const bool isNegative = Parser.getTok().is(AsmToken::Minus);
+     const MCExpr *OffsetExpr;
+     SMLoc EndLoc;
+     if (getParser().parseExpression(OffsetExpr, EndLoc))
+       return MatchOperand_ParseFail;
+     const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(OffsetExpr);
+     if (!Const) {
+       Error(StartLoc, "constant expression expected");
+       return MatchOperand_ParseFail;
+     }
+     // Negative zero is encoded as the flag value INT32_MIN.
+     // NOTE(review): the constant is narrowed to 32 bits here before any
+     // range check is visible - confirm the operand predicates cope.
+     int32_t Val = Const->getValue();
+     if (isNegative && Val == 0)
+       Val = INT32_MIN;
+
+     Operands.push_back(ARMOperand::CreateImm(
+         MCConstantExpr::create(Val, getContext()), StartLoc, EndLoc));
+
+     return MatchOperand_Success;
+   }
+
+   // Register form: an optional sign, then the register itself.
+   const bool SawPlus = FirstTok.is(AsmToken::Plus);
+   const bool SawMinus = !SawPlus && FirstTok.is(AsmToken::Minus);
+   const bool haveEaten = SawPlus || SawMinus;
+   const bool isAdd = !SawMinus;
+   if (haveEaten)
+     Parser.Lex(); // Eat the '+' or '-' token.
+
+   // Copy the register token so its locations stay available after parsing.
+   const AsmToken RegTok = Parser.getTok();
+   const int Reg = tryParseRegister();
+   if (Reg == -1) {
+     if (haveEaten) {
+       Error(RegTok.getLoc(), "register expected");
+       return MatchOperand_ParseFail;
+     }
+     return MatchOperand_NoMatch;
+   }
+
+   Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ARM_AM::no_shift,
+                                                   0, StartLoc,
+                                                   RegTok.getEndLoc()));
+
+   return MatchOperand_Success;
+ }
+
+ /// Convert parsed operands to MCInst. Needed here because this instruction
+ /// only has two register operands, but multiplication is commutative so
+ /// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN".
+ ///
+ /// Operands are appended to \p Inst in this order: the register from
+ /// Operands[3], the cc_out from Operands[1], the chosen source register, a
+ /// copy of the instruction's first operand, and the condition code from
+ /// Operands[2].
+ void ARMAsmParser::cvtThumbMultiply(MCInst &Inst,
+                                     const OperandVector &Operands) {
+   ARMOperand &DestReg = static_cast<ARMOperand &>(*Operands[3]);
+   ARMOperand &CCOut = static_cast<ARMOperand &>(*Operands[1]);
+   ARMOperand &CondCode = static_cast<ARMOperand &>(*Operands[2]);
+
+   DestReg.addRegOperands(Inst, 1);
+   CCOut.addCCOutOperands(Inst, 1);
+
+   // If we have a three-operand form, make sure to set Rn to be the operand
+   // that isn't the same as Rd.
+   const bool FirstSourceIsDest =
+       Operands.size() == 6 &&
+       static_cast<ARMOperand &>(*Operands[4]).getReg() == DestReg.getReg();
+   const unsigned RegOp = FirstSourceIsDest ? 5 : 4;
+   static_cast<ARMOperand &>(*Operands[RegOp]).addRegOperands(Inst, 1);
+
+   Inst.addOperand(Inst.getOperand(0));
+   CondCode.addCondCodeOperands(Inst, 2);
+ }
+
+ /// Convert the parsed operands of a Thumb branch (tB, tBcc, t2B or t2Bcc) to
+ /// an MCInst, first settling which of those opcodes the instruction should
+ /// really use:
+ ///   1. conditional vs. unconditional, from the IT-block state and the parsed
+ ///      condition code;
+ ///   2. narrow vs. wide, from whether the target fits the narrow offset range.
+ /// The immediate operand is added first, followed by the condition code.
+ void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
+                                     const OperandVector &Operands) {
+   int CondOp = -1, ImmOp = -1;
+   bool IsWide = false;
+   switch (Inst.getOpcode()) {
+   case ARM::tB:
+   case ARM::tBcc:
+     CondOp = 1;
+     ImmOp = 2;
+     break;
+
+   case ARM::t2B:
+   case ARM::t2Bcc:
+     CondOp = 1;
+     ImmOp = 3;
+     IsWide = true;
+     break;
+
+   default: llvm_unreachable("Unexpected instruction in cvtThumbBranches");
+   }
+
+   ARMOperand &CondOperand = static_cast<ARMOperand &>(*Operands[CondOp]);
+   ARMOperand &Target = static_cast<ARMOperand &>(*Operands[ImmOp]);
+
+   // First decide whether or not the branch should be conditional by looking
+   // at its location relative to an IT block. Inside an IT block we cannot
+   // have any conditional branches, so the unconditional form is always used.
+   // Outside IT blocks we can only have unconditional branches with AL
+   // condition code or conditional branches with non-AL condition code.
+   const bool IsConditional =
+       !inITBlock() && CondOperand.getCondCode() != ARMCC::AL;
+   if (IsWide)
+     Inst.setOpcode(IsConditional ? ARM::t2Bcc : ARM::t2B);
+   else
+     Inst.setOpcode(IsConditional ? ARM::tBcc : ARM::tB);
+
+   // Now decide on encoding size based on branch target range: a narrow
+   // branch whose target does not fit is switched to the wide opcode when
+   // isThumb() and hasV8MBaseline() both hold.
+   const unsigned Opcode = Inst.getOpcode();
+   if (Opcode == ARM::tB) {
+     if (!Target.isSignedOffset<11, 1>() && isThumb() && hasV8MBaseline())
+       Inst.setOpcode(ARM::t2B);
+   } else if (Opcode == ARM::tBcc) {
+     if (!Target.isSignedOffset<8, 1>() && isThumb() && hasV8MBaseline())
+       Inst.setOpcode(ARM::t2Bcc);
+   }
+
+   Target.addImmOperands(Inst, 1);
+   CondOperand.addCondCodeOperands(Inst, 2);
+ }
+
+/// Parse an ARM memory operand and append it to \p Operands. The current
+/// token must be a '[' when called.
+///
+/// Accepted forms, each optionally followed by a '!' writeback marker (which
+/// is pushed as a separate "!" token operand):
+///   [Rn]
+///   [Rn:align] or [Rn, :align]   - align is 16, 32, 64, 128 or 256 (bits)
+///   [Rn, #imm]                   - introduced by '#', '$', or a bare integer
+///                                  (gas compatibility)
+///   [Rn, {+|-}Rm{, shift}]
+///
+/// \returns false if successful, true on failure.
+bool ARMAsmParser::parseMemory(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S, E;
+  if (Parser.getTok().isNot(AsmToken::LBrac))
+    return TokError("Token is not a Left Bracket");
+  S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat left bracket token.
+
+  // Expect and eat the closing ']', recording its end location in E.
+  // Returns true (after reporting the error) if the current token is not ']'.
+  auto ParseRightBracket = [&]() -> bool {
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+    E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+    return false;
+  };
+
+  // If there's a pre-indexing writeback marker, '!', just add it as a token
+  // operand. After a plain '[Rn]' that is rather odd, but syntactically valid.
+  auto ParseOptionalWriteback = [&]() {
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(
+          ARMOperand::CreateToken("!", Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+  };
+
+  const AsmToken &BaseRegTok = Parser.getTok();
+  int BaseRegNum = tryParseRegister();
+  if (BaseRegNum == -1)
+    return Error(BaseRegTok.getLoc(), "register expected");
+
+  // The next token must either be a comma, a colon or a closing bracket.
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Colon) && !Tok.is(AsmToken::Comma) &&
+      !Tok.is(AsmToken::RBrac))
+    return Error(Tok.getLoc(), "malformed memory operand");
+
+  if (Tok.is(AsmToken::RBrac)) {
+    E = Tok.getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
+                                             ARM_AM::no_shift, 0, 0, false,
+                                             S, E));
+    ParseOptionalWriteback();
+    return false;
+  }
+
+  assert((Tok.is(AsmToken::Colon) || Tok.is(AsmToken::Comma)) &&
+         "Lost colon or comma in memory operand?!");
+  if (Tok.is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the comma.
+  }
+
+  // If we have a ':', it's an alignment specifier.
+  if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat the ':'.
+    E = Parser.getTok().getLoc();
+    SMLoc AlignmentLoc = Tok.getLoc();
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+      return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    // The specifier is written in bits; the operand stores it in bytes.
+    unsigned Align = 0;
+    switch (CE->getValue()) {
+    default:
+      return Error(E,
+                   "alignment specifier must be 16, 32, 64, 128, or 256 bits");
+    case 16:  Align = 2; break;
+    case 32:  Align = 4; break;
+    case 64:  Align = 8; break;
+    case 128: Align = 16; break;
+    case 256: Align = 32; break;
+    }
+
+    // Now we should have the closing ']'
+    if (ParseRightBracket())
+      return true;
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
+                                             ARM_AM::no_shift, 0, Align,
+                                             false, S, E, AlignmentLoc));
+    ParseOptionalWriteback();
+    return false;
+  }
+
+  // If we have a '#', it's an immediate offset, else assume it's a register
+  // offset. Be friendly and also accept a plain integer (without a leading
+  // hash) for gas compatibility.
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar) ||
+      Parser.getTok().is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    E = Parser.getTok().getLoc();
+
+    bool isNegative = getParser().getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    if (getParser().parseExpression(Offset))
+      return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    // If the constant was #-0, represent it as INT32_MIN. Test the full
+    // 64-bit value: narrowing to int32_t first would make any negated
+    // multiple of 2^32 (e.g. #-0x100000000) look like #-0 too, and an
+    // out-of-range offset would be silently accepted.
+    if (isNegative && CE->getValue() == 0)
+      CE = MCConstantExpr::create(INT32_MIN, getContext());
+
+    // Now we should have the closing ']'
+    if (ParseRightBracket())
+      return true;
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, CE, 0,
+                                             ARM_AM::no_shift, 0, 0,
+                                             false, S, E));
+    ParseOptionalWriteback();
+    return false;
+  }
+
+  // The register offset is optionally preceded by a '+' or '-'
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex(); // Eat the '-'.
+  } else if (Parser.getTok().is(AsmToken::Plus)) {
+    // Nothing to do.
+    Parser.Lex(); // Eat the '+'.
+  }
+
+  E = Parser.getTok().getLoc();
+  int OffsetRegNum = tryParseRegister();
+  if (OffsetRegNum == -1)
+    return Error(E, "register expected");
+
+  // If there's a shift operator, handle it.
+  ARM_AM::ShiftOpc ShiftType = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftType, ShiftImm))
+      return true;
+  }
+
+  // Now we should have the closing ']'
+  if (ParseRightBracket())
+    return true;
+
+  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, OffsetRegNum,
+                                           ShiftType, ShiftImm, 0, isNegative,
+                                           S, E));
+  ParseOptionalWriteback();
+  return false;
+}
+
+/// parseMemRegOffsetShift - Parse the shift applied to the register offset of
+/// a memory operand. The caller has already eaten the preceding ','. One of
+/// these two:
+///   ( lsl | asl | lsr | asr | ror ) ( '#' | '$' ) shift_amount
+///   rrx
+/// Shift names are accepted in all-lowercase or all-uppercase spelling; "asl"
+/// is treated as "lsl".
+///
+/// \param St     receives the parsed shift kind.
+/// \param Amount receives the shift amount: 0 for rrx, and 0 for lsr/asr #32.
+/// \returns false if a shift was parsed, true on failure.
+bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
+                                          unsigned &Amount) {
+  MCAsmParser &Parser = getParser();
+  SMLoc Loc = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  // Report the problem rather than failing silently: returning true without a
+  // diagnostic would leave the user with no explanation for the rejection.
+  if (Tok.isNot(AsmToken::Identifier))
+    return Error(Loc, "illegal shift operator");
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName == "lsl" || ShiftName == "LSL" ||
+      ShiftName == "asl" || ShiftName == "ASL")
+    St = ARM_AM::lsl;
+  else if (ShiftName == "lsr" || ShiftName == "LSR")
+    St = ARM_AM::lsr;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    St = ARM_AM::asr;
+  else if (ShiftName == "ror" || ShiftName == "ROR")
+    St = ARM_AM::ror;
+  else if (ShiftName == "rrx" || ShiftName == "RRX")
+    St = ARM_AM::rrx;
+  else
+    return Error(Loc, "illegal shift operator");
+  Parser.Lex(); // Eat shift type token.
+
+  // rrx stands alone.
+  Amount = 0;
+  if (St != ARM_AM::rrx) {
+    Loc = Parser.getTok().getLoc();
+    // A '#' (or '$') and a shift amount.
+    const AsmToken &HashTok = Parser.getTok();
+    if (HashTok.isNot(AsmToken::Hash) &&
+        HashTok.isNot(AsmToken::Dollar))
+      return Error(HashTok.getLoc(), "'#' expected");
+    Parser.Lex(); // Eat hash token.
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+      return true;
+    // Range check the immediate.
+    // lsl, ror: 0 <= imm <= 31
+    // lsr, asr: 0 <= imm <= 32
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error(Loc, "shift amount must be an immediate");
+    int64_t Imm = CE->getValue();
+    if (Imm < 0 ||
+        ((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) ||
+        ((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32))
+      return Error(Loc, "immediate shift value out of range");
+    // A zero shift amount is canonicalised to 'lsl #0', whatever shift kind
+    // was written.
+    if (Imm == 0)
+      St = ARM_AM::lsl;
+    // For consistency, treat lsr #32 and asr #32 as having immediate value 0.
+    if (Imm == 32)
+      Imm = 0;
+    Amount = Imm;
+  }
+
+  return false;
+}
+
+/// parseFPImm - Custom parser for a floating point immediate operand, i.e.
+/// '#' or '$' followed by an optionally negated literal.
+///
+/// Everything that can take a floating point constant has to come through
+/// here because the regular parseExpression only understands integers. The
+/// result is still a generic Immediate operand, holding the bit pattern of
+/// the value; the individual operand classes validate it via their is*()
+/// predicates.
+OperandMatchResultTy
+ARMAsmParser::parseFPImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const SMLoc StartLoc = Parser.getTok().getLoc();
+
+  // Without a leading '#' or '$' this is not an immediate at all.
+  if (!(Parser.getTok().is(AsmToken::Hash) ||
+        Parser.getTok().is(AsmToken::Dollar)))
+    return MatchOperand_NoMatch;
+
+  // Disambiguate the VMOV forms that can accept an FP immediate.
+  // vmov.f32 <sreg>, #imm
+  // vmov.f64 <dreg>, #imm
+  // vmov.f32 <dreg>, #imm  @ vector f32x2
+  // vmov.f32 <qreg>, #imm  @ vector f32x4
+  //
+  // The NEON VMOV instructions taking an integer constant must not be parsed
+  // as an FPImm:
+  // vmov.i{8|16|32|64} <dreg|qreg>, #imm
+  ARMOperand &SuffixOp = static_cast<ARMOperand &>(*Operands[2]);
+  bool HasFPSuffix = false;
+  if (SuffixOp.isToken()) {
+    StringRef Suffix = SuffixOp.getToken();
+    HasFPSuffix = Suffix == ".f32" || Suffix == ".f64" || Suffix == ".f16";
+  }
+
+  ARMOperand &MnemonicOp = static_cast<ARMOperand &>(*Operands[0]);
+  bool IsFconst = false;
+  if (MnemonicOp.isToken()) {
+    StringRef Name = MnemonicOp.getToken();
+    IsFconst = Name == "fconstd" || Name == "fconsts";
+  }
+
+  if (!HasFPSuffix && !IsFconst)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat '#' or '$'.
+
+  // A leading '-' arrives as its own token; remember it and step past it.
+  const bool Negate = Parser.getTok().is(AsmToken::Minus);
+  if (Negate)
+    Parser.Lex();
+
+  const AsmToken &Literal = Parser.getTok();
+  SMLoc LiteralLoc = Literal.getLoc();
+
+  // A real literal, only for the typed (.f16/.f32/.f64) forms.
+  if (Literal.is(AsmToken::Real) && HasFPSuffix) {
+    APFloat Parsed(APFloat::IEEEsingle(), Literal.getString());
+    uint64_t Bits = Parsed.bitcastToAPInt().getZExtValue();
+    // If we had a '-' in front, toggle the sign bit.
+    if (Negate)
+      Bits ^= (uint64_t)1 << 31;
+    Parser.Lex(); // Eat the token.
+    Operands.push_back(ARMOperand::CreateImm(
+        MCConstantExpr::create(Bits, getContext()),
+        StartLoc, Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+
+  // A plain integer, only for fconsts/fconstd: it is the raw encoded 8-bit
+  // value, which is expanded to the float it denotes.
+  if (Literal.is(AsmToken::Integer) && IsFconst) {
+    int64_t Encoded = Literal.getIntVal();
+    Parser.Lex(); // Eat the token.
+    if (Encoded > 255 || Encoded < 0) {
+      Error(LiteralLoc, "encoded floating point value out of range");
+      return MatchOperand_ParseFail;
+    }
+    float Decoded = ARM_AM::getFPImmFloat(Encoded);
+    Encoded = APFloat(Decoded).bitcastToAPInt().getZExtValue();
+
+    Operands.push_back(ARMOperand::CreateImm(
+        MCConstantExpr::create(Encoded, getContext()), StartLoc,
+        Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+
+  Error(LiteralLoc, "invalid floating point immediate");
+  return MatchOperand_ParseFail;
+}
+
+/// Parse an ARM instruction operand and append it to \p Operands. A custom
+/// operand parser for \p Mnemonic is tried first; otherwise the operand is
+/// parsed generically from the kind of the current token (register, label or
+/// expression, memory operand, register list, immediate, :lower16:/:upper16:
+/// prefixed expression, or the '=' form of the ldr pseudo-instruction).
+///
+/// \returns false on success, true on failure.
+bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S, E;
+
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return true;
+  case AsmToken::Identifier: {
+    // If we've seen a branch mnemonic, the next operand must be a label.  This
+    // is true even if the label is a register name.  So "b r1" means branch to
+    // label "r1".
+    bool ExpectLabel = Mnemonic == "b" || Mnemonic == "bl";
+    if (!ExpectLabel) {
+      if (!tryParseRegisterWithWriteBack(Operands))
+        return false;
+      int Res = tryParseShiftRegister(Operands);
+      if (Res == 0) // success
+        return false;
+      else if (Res == -1) // irrecoverable error
+        return true;
+      // If this is VMRS, check for the apsr_nzcv operand.
+      if (Mnemonic == "vmrs" &&
+          Parser.getTok().getString().equals_lower("apsr_nzcv")) {
+        S = Parser.getTok().getLoc();
+        Parser.Lex();
+        Operands.push_back(ARMOperand::CreateToken("APSR_nzcv", S));
+        return false;
+      }
+    }
+
+    // Fall through for the Identifier case that is not a register or a
+    // special name.
+    LLVM_FALLTHROUGH;
+  }
+  case AsmToken::LParen:  // parenthesized expressions like (_strcmp-4)
+  case AsmToken::Integer: // things like 1f and 2b as a branch targets
+  case AsmToken::String:  // quoted label names.
+  case AsmToken::Dot: {   // . as a branch target
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    S = Parser.getTok().getLoc();
+    if (getParser().parseExpression(IdVal))
+      return true;
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(ARMOperand::CreateImm(IdVal, S, E));
+    return false;
+  }
+  case AsmToken::LBrac:
+    return parseMemory(Operands);
+  case AsmToken::LCurly:
+    return parseRegisterList(Operands);
+  case AsmToken::Dollar:
+  case AsmToken::Hash: {
+    // #42 -> immediate.
+    S = Parser.getTok().getLoc();
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Colon)) {
+      bool isNegative = Parser.getTok().is(AsmToken::Minus);
+      const MCExpr *ImmVal;
+      if (getParser().parseExpression(ImmVal))
+        return true;
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
+      // Represent #-0 as INT32_MIN. Test the full 64-bit value: narrowing to
+      // int32_t first would make any negated multiple of 2^32 (e.g.
+      // #-0x100000000) look like #-0 as well.
+      if (CE && isNegative && CE->getValue() == 0)
+        ImmVal = MCConstantExpr::create(INT32_MIN, getContext());
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E));
+
+      // There can be a trailing '!' on operands that we want as a separate
+      // '!' Token operand. Handle that here. For example, the compatibility
+      // alias for 'srsdb sp!, #imm' is 'srsdb #imm!'.
+      if (Parser.getTok().is(AsmToken::Exclaim)) {
+        Operands.push_back(ARMOperand::CreateToken(Parser.getTok().getString(),
+                                                   Parser.getTok().getLoc()));
+        Parser.Lex(); // Eat exclaim token
+      }
+      return false;
+    }
+    // w/ a ':' after the '#', it's just like a plain ':'.
+    LLVM_FALLTHROUGH;
+  }
+  case AsmToken::Colon: {
+    S = Parser.getTok().getLoc();
+    // ":lower16:" and ":upper16:" expression prefixes
+    // FIXME: Check it's an expression prefix,
+    // e.g. (FOO - :lower16:BAR) isn't legal.
+    ARMMCExpr::VariantKind RefKind;
+    if (parsePrefix(RefKind))
+      return true;
+
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+
+    const MCExpr *ExprVal = ARMMCExpr::create(RefKind, SubExprVal,
+                                              getContext());
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E));
+    return false;
+  }
+  case AsmToken::Equal: {
+    S = Parser.getTok().getLoc();
+    if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
+      return Error(S, "unexpected token in operand");
+    Parser.Lex(); // Eat '='
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    // execute-only: we assume that assembly programmers know what they are
+    // doing and allow literal pool creation here
+    Operands.push_back(ARMOperand::CreateConstantPoolImm(SubExprVal, S, E));
+    return false;
+  }
+  }
+}
+
+// parsePrefix - Parse an ARM 16-bit relocation expression prefix, i.e.
+// :lower16: or :upper16:, optionally preceded by a '#'. On success RefKind
+// holds the matching variant kind and the whole ":name:" has been consumed.
+// Returns false on success, true after reporting an error.
+bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
+  MCAsmParser &Parser = getParser();
+  RefKind = ARMMCExpr::VK_ARM_None;
+
+  // GNU compatibility: tolerate a '#' in front of the prefix.
+  if (getLexer().is(AsmToken::Hash))
+    Parser.Lex();
+
+  // The prefix proper starts with ':'.
+  assert(getLexer().is(AsmToken::Colon) && "expected a :");
+  Parser.Lex(); // Eat ':'
+
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), "expected prefix identifier in operand");
+    return true;
+  }
+
+  // One bit per object file format, used to record which formats can
+  // represent a given prefix.
+  enum {
+    COFF = (1 << MCObjectFileInfo::IsCOFF),
+    ELF = (1 << MCObjectFileInfo::IsELF),
+    MACHO = (1 << MCObjectFileInfo::IsMachO)
+  };
+  static const struct PrefixEntry {
+    const char *Spelling;
+    ARMMCExpr::VariantKind VariantKind;
+    uint8_t SupportedFormats;
+  } PrefixEntries[] = {
+    { "lower16", ARMMCExpr::VK_ARM_LO16, COFF | ELF | MACHO },
+    { "upper16", ARMMCExpr::VK_ARM_HI16, COFF | ELF | MACHO },
+  };
+
+  StringRef IDVal = Parser.getTok().getIdentifier();
+
+  // Look the identifier up in the table of known prefixes.
+  const PrefixEntry *Match = nullptr;
+  for (const PrefixEntry &Candidate : PrefixEntries) {
+    if (Candidate.Spelling == IDVal) {
+      Match = &Candidate;
+      break;
+    }
+  }
+  if (!Match) {
+    Error(Parser.getTok().getLoc(), "unexpected prefix in operand");
+    return true;
+  }
+
+  // Translate the object file type being emitted into its format bit.
+  uint8_t CurrentFormat;
+  switch (getContext().getObjectFileInfo()->getObjectFileType()) {
+  case MCObjectFileInfo::IsMachO:
+    CurrentFormat = MACHO;
+    break;
+  case MCObjectFileInfo::IsELF:
+    CurrentFormat = ELF;
+    break;
+  case MCObjectFileInfo::IsCOFF:
+    CurrentFormat = COFF;
+    break;
+  }
+
+  // Reject the prefix if the current format is not among the supported ones.
+  if ((Match->SupportedFormats & CurrentFormat) != CurrentFormat) {
+    Error(Parser.getTok().getLoc(),
+          "cannot represent relocation in the current file format");
+    return true;
+  }
+
+  RefKind = Match->VariantKind;
+  Parser.Lex(); // Eat the prefix identifier
+
+  if (getLexer().isNot(AsmToken::Colon)) {
+    Error(Parser.getTok().getLoc(), "unexpected token after prefix");
+    return true;
+  }
+  Parser.Lex(); // Eat the last ':'
+
+  return false;
+}
+
+/// \brief Given a mnemonic, split out possible predication code and carry
+/// setting letters to form a canonical mnemonic and flags. Returns the canonical mnemonic.
+// Outputs: PredicationCode (ARMCC::AL unless a condition suffix is split off), CarrySetting (true if a trailing 's' is split off), ProcessorIMod (defaults to 0; set from a "cps" ie/id suffix) and ITMask (assigned only for mnemonics starting with "it"; otherwise left untouched).
+// FIXME: Would be nice to autogen this.
+// FIXME: This is a bit of a maze of special cases.
+StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
+ unsigned &PredicationCode,
+ bool &CarrySetting,
+ unsigned &ProcessorIMod,
+ StringRef &ITMask) {
+ PredicationCode = ARMCC::AL;
+ CarrySetting = false;
+ ProcessorIMod = 0;
+
+ // Ignore some mnemonics we know aren't predicated forms. Their spelling
+ // merely ends in letters that look like a condition code or a carry-setting
+ // 's'; they are returned unchanged with the default outputs set above.
+ if ((Mnemonic == "movs" && isThumb()) ||
+ Mnemonic == "teq" || Mnemonic == "vceq" || Mnemonic == "svc" ||
+ Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" ||
+ Mnemonic == "vmls" || Mnemonic == "vnmls" || Mnemonic == "vacge" ||
+ Mnemonic == "vcge" || Mnemonic == "vclt" || Mnemonic == "vacgt" ||
+ Mnemonic == "vaclt" || Mnemonic == "vacle" || Mnemonic == "hlt" ||
+ Mnemonic == "vcgt" || Mnemonic == "vcle" || Mnemonic == "smlal" ||
+ Mnemonic == "umaal" || Mnemonic == "umlal" || Mnemonic == "vabal" ||
+ Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" ||
+ Mnemonic == "fmuls" || Mnemonic == "vmaxnm" || Mnemonic == "vminnm" ||
+ Mnemonic == "vcvta" || Mnemonic == "vcvtn" || Mnemonic == "vcvtp" ||
+ Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" ||
+ Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" ||
+ Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" ||
+ Mnemonic == "bxns" || Mnemonic == "blxns")
+ return Mnemonic;
+
+ // First, split out any predication code (the last two characters). Ignore mnemonics we know aren't
+ // predicated but do have a carry-set and so weren't caught above (e.g. "adcs" ends in "cs").
+ if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" &&
+ Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" &&
+ Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" &&
+ Mnemonic != "sbcs" && Mnemonic != "rscs") {
+ unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2))
+ .Case("eq", ARMCC::EQ)
+ .Case("ne", ARMCC::NE)
+ .Case("hs", ARMCC::HS)
+ .Case("cs", ARMCC::HS) // "cs" is a synonym for "hs"
+ .Case("lo", ARMCC::LO)
+ .Case("cc", ARMCC::LO) // "cc" is a synonym for "lo"
+ .Case("mi", ARMCC::MI)
+ .Case("pl", ARMCC::PL)
+ .Case("vs", ARMCC::VS)
+ .Case("vc", ARMCC::VC)
+ .Case("hi", ARMCC::HI)
+ .Case("ls", ARMCC::LS)
+ .Case("ge", ARMCC::GE)
+ .Case("lt", ARMCC::LT)
+ .Case("gt", ARMCC::GT)
+ .Case("le", ARMCC::LE)
+ .Case("al", ARMCC::AL)
+ .Default(~0U); // ~0U marks "no condition suffix"
+ if (CC != ~0U) {
+ Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2);
+ PredicationCode = CC;
+ }
+ }
+
+ // Next, determine if we have a carry setting bit. We explicitly ignore all
+ // the instructions we know end in 's'. This runs on the mnemonic with any condition suffix already removed.
+ if (Mnemonic.endswith("s") &&
+ !(Mnemonic == "cps" || Mnemonic == "mls" ||
+ Mnemonic == "mrs" || Mnemonic == "smmls" || Mnemonic == "vabs" ||
+ Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vmrs" ||
+ Mnemonic == "vnmls" || Mnemonic == "vqabs" || Mnemonic == "vrecps" ||
+ Mnemonic == "vrsqrts" || Mnemonic == "srs" || Mnemonic == "flds" ||
+ Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
+ Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
+ Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
+ Mnemonic == "vfms" || Mnemonic == "vfnms" || Mnemonic == "fconsts" ||
+ Mnemonic == "bxns" || Mnemonic == "blxns" ||
+ (Mnemonic == "movs" && isThumb()))) {
+ Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
+ CarrySetting = true;
+ }
+
+ // The "cps" instruction can have an interrupt mode operand which is glued into
+ // the mnemonic ("cpsie"/"cpsid"). Check if this is the case, split it and record the imod op
+ if (Mnemonic.startswith("cps")) {
+ // Split out any imod code (the last two characters).
+ unsigned IMod =
+ StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2, 2))
+ .Case("ie", ARM_PROC::IE)
+ .Case("id", ARM_PROC::ID)
+ .Default(~0U);
+ if (IMod != ~0U) {
+ Mnemonic = Mnemonic.slice(0, Mnemonic.size()-2);
+ ProcessorIMod = IMod;
+ }
+ }
+
+ // The "it" instruction has the condition mask on the end of the mnemonic; everything after "it" goes to ITMask.
+ if (Mnemonic.startswith("it")) {
+ ITMask = Mnemonic.slice(2, Mnemonic.size());
+ Mnemonic = Mnemonic.slice(0, 2);
+ }
+
+ return Mnemonic;
+}
+
+/// \brief For a canonical mnemonic, report whether the instruction can ever
+/// carry a carry-setting ('s') operand and whether it can ever carry a
+/// predication code operand.
+///
+/// \p FullInst is the complete mnemonic as written; it is only consulted to
+/// recognise the vmull ... .p64 form.
+//
+// FIXME: It would be nice to autogen this.
+void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                                         bool &CanAcceptCarrySet,
+                                         bool &CanAcceptPredicationCode) {
+  const bool InThumbMode = isThumb();
+
+  // Mnemonics that take a carry-set operand in every mode.
+  const bool CarrySetAnyMode =
+      Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+      Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
+      Mnemonic == "add" || Mnemonic == "adc" || Mnemonic == "mul" ||
+      Mnemonic == "bic" || Mnemonic == "asr" || Mnemonic == "orr" ||
+      Mnemonic == "mvn" || Mnemonic == "rsb" || Mnemonic == "rsc" ||
+      Mnemonic == "orn" || Mnemonic == "sbc" || Mnemonic == "eor" ||
+      Mnemonic == "neg" || Mnemonic == "vfm" || Mnemonic == "vfnm";
+  // Mnemonics that take one only outside Thumb mode.
+  const bool CarrySetARMOnly =
+      Mnemonic == "smull" || Mnemonic == "mov" || Mnemonic == "mla" ||
+      Mnemonic == "smlal" || Mnemonic == "umlal" || Mnemonic == "umull";
+  CanAcceptCarrySet = CarrySetAnyMode || (!InThumbMode && CarrySetARMOnly);
+
+  // These mnemonics are never predicable ("cps" itself is covered by the
+  // startswith("cps") test).
+  const bool NeverPredicable =
+      Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" ||
+      Mnemonic == "it" || Mnemonic == "cbz" || Mnemonic == "trap" ||
+      Mnemonic == "hlt" || Mnemonic == "udf" ||
+      Mnemonic.startswith("crc32") || Mnemonic.startswith("cps") ||
+      Mnemonic.startswith("vsel") || Mnemonic == "vmaxnm" ||
+      Mnemonic == "vminnm" || Mnemonic == "vcvta" || Mnemonic == "vcvtn" ||
+      Mnemonic == "vcvtp" || Mnemonic == "vcvtm" || Mnemonic == "vrinta" ||
+      Mnemonic == "vrintn" || Mnemonic == "vrintp" || Mnemonic == "vrintm" ||
+      Mnemonic.startswith("aes") || Mnemonic == "hvc" ||
+      Mnemonic == "setpan" || Mnemonic.startswith("sha1") ||
+      Mnemonic.startswith("sha256") ||
+      (FullInst.startswith("vmull") && FullInst.endswith(".p64")) ||
+      Mnemonic == "vmovx" || Mnemonic == "vins";
+  if (NeverPredicable) {
+    CanAcceptPredicationCode = false;
+    return;
+  }
+
+  if (!InThumbMode) {
+    // Some instructions are only predicable in Thumb mode
+    const bool UnpredicableInARM =
+        Mnemonic == "cdp2" || Mnemonic == "clrex" || Mnemonic == "mcr2" ||
+        Mnemonic == "mcrr2" || Mnemonic == "mrc2" || Mnemonic == "mrrc2" ||
+        Mnemonic == "dmb" || Mnemonic == "dsb" || Mnemonic == "isb" ||
+        Mnemonic == "pld" || Mnemonic == "pli" || Mnemonic == "pldw" ||
+        Mnemonic == "ldc2" || Mnemonic == "ldc2l" || Mnemonic == "stc2" ||
+        Mnemonic == "stc2l" || Mnemonic.startswith("rfe") ||
+        Mnemonic.startswith("srs");
+    CanAcceptPredicationCode = !UnpredicableInARM;
+    return;
+  }
+
+  if (isThumbOne()) {
+    // "movs" is never predicable here; "nop" only when v6M ops are available.
+    CanAcceptPredicationCode =
+        Mnemonic != "movs" && (hasV6MOps() || Mnemonic != "nop");
+    return;
+  }
+
+  // Any other Thumb configuration: everything that is left is predicable.
+  CanAcceptPredicationCode = true;
+}
+
+// \brief Some Thumb instructions have two operand forms that are not
+// available as three operand; rewrite the parsed operand list to the two
+// operand form where that is possible.
+//
+// Only operand lists of exactly six entries are considered, with Operands[3]
+// and Operands[4] both registers. When the rewrite applies, Operands[3] is
+// erased (after exchanging Operands[4] and Operands[5] if the match was found
+// through commutativity).
+//
+// FIXME: We would really like to be able to tablegen'erate this.
+void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
+                                                 bool CarrySetting,
+                                                 OperandVector &Operands) {
+  if (Operands.size() != 6)
+    return;
+
+  const auto &DstOp = static_cast<ARMOperand &>(*Operands[3]);
+  auto &LhsOp = static_cast<ARMOperand &>(*Operands[4]);
+  auto &RhsOp = static_cast<ARMOperand &>(*Operands[5]);
+  if (!(DstOp.isReg() && LhsOp.isReg()))
+    return;
+
+  const auto DstReg = DstOp.getReg();
+  const auto LhsReg = LhsOp.getReg();
+
+  if (isThumbTwo()) {
+    // For most Thumb2 cases we just generate the 3 operand form and reduce
+    // it in processInstruction(), but the 3 operand form of ADD (t2ADDrr)
+    // won't accept SP or PC so we do the transformation here taking care
+    // with immediate range in the 'add sp, sp, #imm' case.
+    if (Mnemonic != "add")
+      return;
+
+    // True if any of the three operands is the register Reg.
+    auto Involves = [&](unsigned Reg) {
+      return DstReg == Reg || LhsReg == Reg ||
+             (RhsOp.isReg() && RhsOp.getReg() == Reg);
+    };
+    const bool SPAdjustOutOfRange = DstReg == ARM::SP && LhsReg == ARM::SP &&
+                                    RhsOp.isImm() && !RhsOp.isImm0_508s4();
+    if (!Involves(ARM::PC) && !(Involves(ARM::SP) && !SPAdjustOutOfRange))
+      return;
+  } else if (!isThumbOne()) {
+    return;
+  }
+
+  const bool HasTwoOperandForm =
+      Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
+      Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+      Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
+      Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic";
+  if (!HasTwoOperandForm)
+    return;
+
+  const bool IsAdd = Mnemonic == "add";
+  const bool IsSub = Mnemonic == "sub";
+
+  // If first 2 operands of a 3 operand instruction are the same
+  // then transform to 2 operand version of the same instruction
+  // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'
+  bool Collapse = DstReg == LhsReg;
+
+  // For commutative operations, we might be able to transform if we swap
+  // the two source operands. The 'ADD Rdm, SP, Rdm' form is already handled
+  // specially as tADDrsp.
+  const ARMOperand *Remaining = &RhsOp;
+  bool NeedSwap = false;
+  if (!Collapse && RhsOp.isReg() && DstReg == RhsOp.getReg()) {
+    const bool Commutes = (IsAdd && LhsReg != ARM::SP) ||
+                          Mnemonic == "and" || Mnemonic == "eor" ||
+                          Mnemonic == "adc" || Mnemonic == "orr";
+    if (Commutes) {
+      NeedSwap = true;
+      Remaining = &LhsOp;
+      Collapse = true;
+    }
+  }
+  if (!Collapse)
+    return;
+
+  // Don't transform 'adds Rd, Rd, Rm' or 'sub{s} Rd, Rd, Rm' because the
+  // 2 operand forms don't exist.
+  if (((IsAdd && CarrySetting) || IsSub) && Remaining->isReg())
+    return;
+
+  // Don't transform 'add/sub{s} Rd, Rd, #imm' if the immediate fits into
+  // 3-bits because the ARMARM says not to.
+  if ((IsAdd || IsSub) && Remaining->isImm0_7())
+    return;
+
+  // Drop the duplicated register, putting the surviving source operand last.
+  if (NeedSwap)
+    std::swap(LhsOp, RhsOp);
+  Operands.erase(Operands.begin() + 3);
+}
+
+bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
+ OperandVector &Operands) {
+ // Returns true if the parsed operand list matches an instruction form that has no cc_out operand. Every case below that inspects Operands[1] requires its getReg() to be 0, i.e. the defaulted (non-setting) cc_out.
+ // FIXME: This is all horribly hacky. We really need a better way to deal with optional operands like this in the matcher table.
+
+ // The 'mov' mnemonic is special. One variant has a cc_out operand, while
+ // another does not. Specifically, the MOVW instruction does not. So we
+ // special case it here and remove the defaulted (non-setting) cc_out
+ // operand if that's the instruction we're trying to match.
+ //
+ // We do this as post-processing of the explicit operands rather than just
+ // conditionally adding the cc_out in the first place because we need
+ // to check the type of the parsed immediate operand.
+ if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
+ !static_cast<ARMOperand &>(*Operands[4]).isModImm() &&
+ static_cast<ARMOperand &>(*Operands[4]).isImm0_65535Expr() &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
+ return true;
+
+ // Register-register 'add' for thumb does not have a cc_out operand
+ // when there are only two register operands.
+ if (isThumb() && Mnemonic == "add" && Operands.size() == 5 &&
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
+ return true;
+ // Register-register 'add' for thumb does not have a cc_out operand
+ // when it's an ADD Rdm, SP, {Rdm|#imm} instruction (the immediate is checked with isImm0_1020s4()). We do
+ // have to check the immediate range here since Thumb2 has a variant
+ // that can handle a different range and has a cc_out operand. The same test also covers Thumb2 'sub' with such an immediate.
+ if (((isThumb() && Mnemonic == "add") ||
+ (isThumbTwo() && Mnemonic == "sub")) &&
+ Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::SP &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+ ((Mnemonic == "add" && static_cast<ARMOperand &>(*Operands[5]).isReg()) ||
+ static_cast<ARMOperand &>(*Operands[5]).isImm0_1020s4()))
+ return true;
+ // For Thumb2, add/sub immediate does not have a cc_out operand for the
+ // imm0_4095 variant. That's the least-preferred variant when
+ // selecting via the generic "add" mnemonic, so to know that we
+ // should remove the cc_out operand, we have to explicitly check that
+ // it's not one of the other variants. Ugh.
+ if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
+ Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+ // Nest conditions rather than one big 'if' statement for readability.
+ //
+ // If both registers are low, we're in an IT block, and the immediate is
+ // in range, we should use encoding T1 instead, which has a cc_out.
+ if (inITBlock() &&
+ isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) &&
+ isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) &&
+ static_cast<ARMOperand &>(*Operands[5]).isImm0_7())
+ return false;
+ // Check against T3. If the second register is the PC, this is an
+ // alternate form of ADR, which uses encoding T4, so check for that too.
+ if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC &&
+ static_cast<ARMOperand &>(*Operands[5]).isT2SOImm())
+ return false;
+
+ // Otherwise, we use encoding T4, which does not have a cc_out
+ // operand.
+ return true;
+ }
+
+ // The thumb2 multiply instruction doesn't have a CCOut register, so
+ // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to
+ // use the 16-bit encoding or not.
+ if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[5]).isReg() &&
+ // If the registers aren't low regs, the destination reg isn't the
+ // same as one of the source regs, or the cc_out operand is zero
+ // outside of an IT block, we have to use the 32-bit encoding, so
+ // remove the cc_out operand.
+ (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
+ !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
+ !isARMLowRegister(static_cast<ARMOperand &>(*Operands[5]).getReg()) ||
+ !inITBlock() || (static_cast<ARMOperand &>(*Operands[3]).getReg() !=
+ static_cast<ARMOperand &>(*Operands[5]).getReg() &&
+ static_cast<ARMOperand &>(*Operands[3]).getReg() !=
+ static_cast<ARMOperand &>(*Operands[4]).getReg())))
+ return true;
+
+ // Also check the 'mul' syntax variant that doesn't specify an explicit
+ // destination register (five operands instead of six).
+ if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ // If the registers aren't low regs or the cc_out operand is zero
+ // outside of an IT block, we have to use the 32-bit encoding, so
+ // remove the cc_out operand.
+ (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
+ !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
+ !inITBlock()))
+ return true;
+
+
+
+ // Register-register 'add/sub' for thumb does not have a cc_out operand
+ // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also
+ // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't
+ // right, this will result in better diagnostics (which operand is off)
+ // anyway.
+ if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") &&
+ (Operands.size() == 5 || Operands.size() == 6) &&
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::SP &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+ (static_cast<ARMOperand &>(*Operands[4]).isImm() ||
+ (Operands.size() == 6 &&
+ static_cast<ARMOperand &>(*Operands[5]).isImm())))
+ return true;
+
+ return false; // No special case matched: keep the cc_out operand.
+}
+
+/// Decide whether the predicate operand that ParseInstruction added for
+/// \p Mnemonic has to be removed again before matching.
+///
+/// VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON. The
+/// operand list is inspected for a ".f32"/".f16" suffix token followed by a
+/// register from the D or Q register class; only then is the predicate
+/// dropped.
+///
+/// \param Mnemonic Base mnemonic with condition code and suffixes split off.
+/// \param Operands Operands parsed so far; only read, never modified here.
+/// \returns true if the caller should erase the predicate operand.
+bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic,
+                                              OperandVector &Operands) {
+  if (Mnemonic != "vrintz" && Mnemonic != "vrintx" && Mnemonic != "vrintr")
+    return false;
+
+  // Malformed input (a missing suffix, or no operands at all) can leave a
+  // non-token operand, or nothing, in the slots inspected below, so every
+  // access is bounds- and kind-checked before getToken()/getReg() is used.
+  auto IsFPSuffixAt = [&Operands](unsigned Idx) {
+    if (Idx >= Operands.size())
+      return false;
+    ARMOperand &Op = static_cast<ARMOperand &>(*Operands[Idx]);
+    return Op.isToken() &&
+           (Op.getToken() == ".f32" || Op.getToken() == ".f16");
+  };
+
+  if (!IsFPSuffixAt(2))
+    return false;
+
+  // A second suffix token shifts the first register one slot further back.
+  const unsigned RegIdx = IsFPSuffixAt(3) ? 4 : 3;
+  if (RegIdx >= Operands.size())
+    return false;
+
+  ARMOperand &RegOp = static_cast<ARMOperand &>(*Operands[RegIdx]);
+  if (!RegOp.isReg())
+    return false;
+
+  return ARMMCRegisterClasses[ARM::DPRRegClassID].contains(RegOp.getReg()) ||
+         ARMMCRegisterClasses[ARM::QPRRegClassID].contains(RegOp.getReg());
+}
+
+/// Return true if \p Tok is exactly one of the recognised data type suffix
+/// tokens (leading '.' included).
+static bool isDataTypeToken(StringRef Tok) {
+  static const char *const DataTypeSuffixes[] = {
+      ".8",  ".16",  ".32",  ".64",
+      ".i8", ".i16", ".i32", ".i64",
+      ".u8", ".u16", ".u32", ".u64",
+      ".s8", ".s16", ".s32", ".s64",
+      ".p8", ".p16", ".f32", ".f64",
+      ".f",  ".d"};
+
+  for (const char *Suffix : DataTypeSuffixes)
+    if (Tok == Suffix)
+      return true;
+  return false;
+}
+
+// Return true for mnemonics whose data type suffix is simply discarded; at
+// present those are the ones beginning with "vldm" or "vstm". The suffix
+// itself (DT) is not consulted.
+//
+// FIXME: This bit should probably be handled via an explicit match class
+// in the .td files that matches the suffix instead of having it be
+// a literal string token the way it is now.
+static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) {
+  if (Mnemonic.startswith("vldm"))
+    return true;
+  return Mnemonic.startswith("vstm");
+}
+static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features,
+ unsigned VariantID);
+
+/// Recognise the "fldm"/"fstm" mnemonics whose register list has to be
+/// checked against a specific precision.
+///
+/// \p Inst must consist of "fldm" or "fstm", a two-character addressing mode
+/// ("ia", "db", "ea" or "fd") and at least one more character. On a match the
+/// seventh character selects the precision: 's' sets
+/// \p AcceptSinglePrecisionOnly, 'd' or 'x' sets \p AcceptDoublePrecisionOnly.
+/// Both output flags are left untouched when false is returned.
+static bool RequiresVFPRegListValidation(StringRef Inst,
+                                         bool &AcceptSinglePrecisionOnly,
+                                         bool &AcceptDoublePrecisionOnly) {
+  if (Inst.size() < 7)
+    return false;
+  if (!Inst.startswith("fldm") && !Inst.startswith("fstm"))
+    return false;
+
+  StringRef Mode = Inst.substr(4, 2);
+  const bool IsKnownMode =
+      Mode == "ia" || Mode == "db" || Mode == "ea" || Mode == "fd";
+  if (!IsKnownMode)
+    return false;
+
+  const char Precision = Inst[6];
+  AcceptSinglePrecisionOnly = Precision == 's';
+  AcceptDoublePrecisionOnly = Precision == 'd' || Precision == 'x';
+  return true;
+}
+
+/// Parse an arm instruction mnemonic followed by its operands.
+///
+/// The mnemonic in \p Name is split into the base mnemonic, predication code,
+/// carry-setting flag and any '.'-separated suffix tokens. Those pieces and
+/// the operands that follow on the line are appended to \p Operands, after
+/// which a series of ARM-specific fixups reshapes the list into the form the
+/// table-driven matcher expects.
+///
+/// \param Info     Not referenced by this implementation.
+/// \param Name     Mnemonic as written, including any suffixes.
+/// \param NameLoc  Source location of the mnemonic.
+/// \param Operands Receives the parsed operands; the mnemonic token is first.
+/// \returns true if an error was reported, or if the statement turned out to
+///          be a ".req" directive (which is consumed here); false otherwise.
+bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  // FIXME: Can this be done via tablegen in some fashion?
+  // The two precision flags are only written when the check is required, so
+  // give them a defined value up front.
+  bool AcceptSinglePrecisionOnly = false;
+  bool AcceptDoublePrecisionOnly = false;
+  bool RequireVFPRegisterListCheck =
+      RequiresVFPRegListValidation(Name, AcceptSinglePrecisionOnly,
+                                   AcceptDoublePrecisionOnly);
+
+  // Apply mnemonic aliases before doing anything else, as the destination
+  // mnemonic may include suffices and we want to handle them normally.
+  // The generic tblgen'erated code does this later, at the start of
+  // MatchInstructionImpl(), but that's too late for aliases that include
+  // any sort of suffix.
+  uint64_t AvailableFeatures = getAvailableFeatures();
+  unsigned AssemblerDialect = getParser().getAssemblerDialect();
+  applyMnemonicAliases(Name, AvailableFeatures, AssemblerDialect);
+
+  // First check for the ARM-specific .req directive.
+  if (Parser.getTok().is(AsmToken::Identifier) &&
+      Parser.getTok().getIdentifier() == ".req") {
+    parseDirectiveReq(Name, NameLoc);
+    // We always return 'error' for this, as we're done with this
+    // statement and don't need to match the 'instruction."
+    return true;
+  }
+
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Mnemonic = Name.slice(Start, Next);
+
+  // Split out the predication code and carry setting flag from the mnemonic.
+  unsigned PredicationCode;
+  unsigned ProcessorIMod;
+  bool CarrySetting;
+  StringRef ITMask;
+  Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting,
+                           ProcessorIMod, ITMask);
+
+  // In Thumb1, only the branch (B) instruction can be predicated.
+  if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") {
+    return Error(NameLoc, "conditional execution not supported in Thumb1");
+  }
+
+  Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc));
+
+  // Handle the IT instruction ITMask. Convert it to a bitmask. This
+  // is the mask as it will be for the IT encoding if the conditional
+  // encoding has a '1' as it's bit0 (i.e. 't' ==> '1'). In the case
+  // where the conditional bit0 is zero, the instruction post-processing
+  // will adjust the mask accordingly.
+  if (Mnemonic == "it") {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2);
+    if (ITMask.size() > 3) {
+      return Error(Loc, "too many conditions on IT instruction");
+    }
+    unsigned Mask = 8;
+    // Walk the mask from its last character to its first, shifting each
+    // 't'/'e' into the top bit of the 4-bit mask.
+    for (unsigned i = ITMask.size(); i != 0; --i) {
+      char pos = ITMask[i - 1];
+      if (pos != 't' && pos != 'e') {
+        return Error(Loc, "illegal IT block condition mask '" + ITMask + "'");
+      }
+      Mask >>= 1;
+      if (pos == 't')
+        Mask |= 8;
+    }
+    Operands.push_back(ARMOperand::CreateITMask(Mask, Loc));
+  }
+
+  // FIXME: This is all a pretty gross hack. We should automatically handle
+  // optional operands like this via tblgen.
+
+  // Next, add the CCOut and ConditionCode operands, if needed.
+  //
+  // For mnemonics which can ever incorporate a carry setting bit or predication
+  // code, our matching model involves us always generating CCOut and
+  // ConditionCode operands to match the mnemonic "as written" and then we let
+  // the matcher deal with finding the right instruction or generating an
+  // appropriate error.
+  bool CanAcceptCarrySet, CanAcceptPredicationCode;
+  getMnemonicAcceptInfo(Mnemonic, Name, CanAcceptCarrySet,
+                        CanAcceptPredicationCode);
+
+  // If we had a carry-set on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptCarrySet && CarrySetting) {
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                          "' can not set flags, but 's' suffix specified");
+  }
+  // If we had a predication code on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptPredicationCode && PredicationCode != ARMCC::AL) {
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                          "' is not predicable, but condition code specified");
+  }
+
+  // Add the carry setting operand, if necessary.
+  if (CanAcceptCarrySet) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
+    Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
+                                               Loc));
+  }
+
+  // Add the predication code operand, if necessary.
+  if (CanAcceptPredicationCode) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
+                                      CarrySetting);
+    Operands.push_back(ARMOperand::CreateCondCode(
+                         ARMCC::CondCodes(PredicationCode), Loc));
+  }
+
+  // Add the processor imod operand, if necessary.
+  if (ProcessorIMod) {
+    Operands.push_back(ARMOperand::CreateImm(
+          MCConstantExpr::create(ProcessorIMod, getContext()),
+                                 NameLoc, NameLoc));
+  } else if (Mnemonic == "cps" && isMClass()) {
+    return Error(NameLoc, "instruction 'cps' requires effect for M-class");
+  }
+
+  // Add the remaining tokens in the mnemonic.
+  while (Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    StringRef ExtraToken = Name.slice(Start, Next);
+
+    // Some NEON instructions have an optional datatype suffix that is
+    // completely ignored. Check for that.
+    if (isDataTypeToken(ExtraToken) &&
+        doesIgnoreDataTypeSuffix(Mnemonic, ExtraToken))
+      continue;
+
+    // For ARM mode generate an error if the .n qualifier is used.
+    if (ExtraToken == ".n" && !isThumb()) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      return Error(Loc, "instruction with .n (narrow) qualifier not allowed in "
+                   "arm mode");
+    }
+
+    // The .n qualifier is always discarded as that is what the tables
+    // and matcher expect. In ARM mode the .w qualifier has no effect,
+    // so discard it to avoid errors that can be caused by the matcher.
+    if (ExtraToken != ".n" && (isThumb() || ExtraToken != ".w")) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc));
+    }
+  }
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Mnemonic)) {
+      return true;
+    }
+
+    while (parseOptionalToken(AsmToken::Comma)) {
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Mnemonic)) {
+        return true;
+      }
+    }
+  }
+
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
+    return true;
+
+  if (RequireVFPRegisterListCheck) {
+    ARMOperand &Op = static_cast<ARMOperand &>(*Operands.back());
+    if (AcceptSinglePrecisionOnly && !Op.isSPRRegList())
+      return Error(Op.getStartLoc(),
+                   "VFP/Neon single precision register expected");
+    if (AcceptDoublePrecisionOnly && !Op.isDPRRegList())
+      return Error(Op.getStartLoc(),
+                   "VFP/Neon double precision register expected");
+  }
+
+  tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+
+  // Some instructions, mostly Thumb, have forms for the same mnemonic that
+  // do and don't have a cc_out optional-def operand. With some spot-checks
+  // of the operand list, we can figure out which variant we're trying to
+  // parse and adjust accordingly before actually matching. We shouldn't ever
+  // try to remove a cc_out operand that was explicitly set on the
+  // mnemonic, of course (CarrySetting == true). Reason number #317 the
+  // table driven matcher doesn't fit well with the ARM instruction set.
+  if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands))
+    Operands.erase(Operands.begin() + 1);
+
+  // Some instructions have the same mnemonic, but don't always
+  // have a predicate. Distinguish them here and delete the
+  // predicate if needed.
+  if (shouldOmitPredicateOperand(Mnemonic, Operands))
+    Operands.erase(Operands.begin() + 1);
+
+  // ARM mode 'blx' need special handling, as the register operand version
+  // is predicable, but the label operand version is not. So, we can't rely
+  // on the Mnemonic based checking to correctly figure out when to put
+  // a k_CondCode operand in the list. If we're trying to match the label
+  // version, remove the k_CondCode operand here.
+  if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
+      static_cast<ARMOperand &>(*Operands[2]).isImm())
+    Operands.erase(Operands.begin() + 1);
+
+  // Adjust operands of ldrexd/strexd to MCK_GPRPair.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when parsing from asm, the two GRPs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (!isThumb() && Operands.size() > 4 &&
+      (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+       Mnemonic == "stlexd")) {
+    bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
+    unsigned Idx = isLoad ? 2 : 3;
+    ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
+
+    const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID);
+    // Adjust only if Op1 and Op2 are GPRs.
+    if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
+        MRC.contains(Op2.getReg())) {
+      unsigned Reg1 = Op1.getReg();
+      unsigned Reg2 = Op2.getReg();
+      unsigned Rt = MRI->getEncodingValue(Reg1);
+      unsigned Rt2 = MRI->getEncodingValue(Reg2);
+
+      // Rt2 must be Rt + 1 and Rt must be even.
+      if (Rt + 1 != Rt2 || (Rt & 1)) {
+        return Error(Op2.getStartLoc(),
+                     isLoad ? "destination operands must be sequential"
+                            : "source operands must be sequential");
+      }
+      unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0,
+          &(MRI->getRegClass(ARM::GPRPairRegClassID)));
+      Operands[Idx] =
+          ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
+      Operands.erase(Operands.begin() + Idx + 1);
+    }
+  }
+
+  // GNU Assembler extension (compatibility): when only the first register of
+  // an ldrd/strd is written and it is directly followed by the memory
+  // operand, synthesize the second register of the pair.
+  //
+  // The fixup is applied only if the operand list really has that shape and
+  // the register starts a GPR pair. Anything else (too few operands, a
+  // non-register first operand, a register with no matching pair) is left
+  // untouched so that the matcher reports the problem, instead of indexing
+  // past the end of Operands or inserting a bogus register.
+  if ((Mnemonic == "ldrd" || Mnemonic == "strd") && Operands.size() > 3) {
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
+    ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+    if (Op2.isReg() && Op3.isMem()) {
+      unsigned SuperReg = MRI->getMatchingSuperReg(
+          Op2.getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID));
+
+      if (SuperReg) {
+        unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1);
+
+        Operands.insert(
+            Operands.begin() + 3,
+            ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(),
+                                  Op2.getEndLoc()));
+      }
+    }
+  }
+
+  // FIXME: As said above, this is all a pretty gross hack. This instruction
+  // does not fit with other "subs" and tblgen.
+  // Adjust operands of B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction
+  // so the Mnemonic is the original name "subs" and delete the predicate
+  // operand so it will match the table entry.
+  if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::PC &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::LR &&
+      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+    Operands.front() = ARMOperand::CreateToken(Name, NameLoc);
+    Operands.erase(Operands.begin() + 1);
+  }
+  return false;
+}
+
+// Validate context-sensitive operand constraints.
+
+// Scan the register operands of Inst from operand OpNo onwards.
+//
+// Returns 'true' as soon as a register is found that is neither a low
+// register nor HiReg (a HiReg of 0 means no high register is tolerated), and
+// 'false' if the whole list is acceptable. 'containsReg' is set to true if
+// Reg was seen among the operands examined before the scan stopped.
+static bool checkLowRegisterList(const MCInst &Inst, unsigned OpNo,
+                                 unsigned Reg, unsigned HiReg,
+                                 bool &containsReg) {
+  containsReg = false;
+  for (unsigned Idx = OpNo, NumOps = Inst.getNumOperands(); Idx < NumOps;
+       ++Idx) {
+    const unsigned Current = Inst.getOperand(Idx).getReg();
+    if (Current == Reg)
+      containsReg = true;
+
+    const bool IsPermittedHiReg = HiReg && Current == HiReg;
+    if (isARMLowRegister(Current) || IsPermittedHiReg)
+      continue;
+    // Anything other than a low register isn't legal here.
+    return true;
+  }
+  return false;
+}
+
+// Return true if Reg appears among the register operands of Inst, looking
+// at operand number OpNo and everything after it.
+static bool listContainsReg(const MCInst &Inst, unsigned OpNo, unsigned Reg) {
+  const unsigned NumOps = Inst.getNumOperands();
+  unsigned Idx = OpNo;
+  while (Idx < NumOps) {
+    if (Inst.getOperand(Idx).getReg() == Reg)
+      return true;
+    ++Idx;
+  }
+  return false;
+}
+
+// BKPT and HLT (ARM and Thumb encodings) are allowed inside IT blocks even
+// though they are not predicable. Return true if Inst is one of them.
+static bool instIsBreakpoint(const MCInst &Inst) {
+  switch (Inst.getOpcode()) {
+  case ARM::tBKPT:
+  case ARM::BKPT:
+  case ARM::tHLT:
+  case ARM::HLT:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Check the register list of a Thumb load-multiple style instruction.
+///
+/// Reports an error (and returns its result) when the list starting at
+/// instruction operand \p ListNo contains SP while \p IsARPop is false,
+/// contains both PC and LR, or contains PC while inside an IT block but not
+/// at its last instruction. Returns false when none of these apply.
+///
+/// When the parsed operand at \p ListNo is a "!" token, the diagnostic is
+/// attached to the operand that follows it.
+bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst,
+                                       const OperandVector &Operands,
+                                       unsigned ListNo, bool IsARPop) {
+  const ARMOperand &ListOp =
+      static_cast<const ARMOperand &>(*Operands[ListNo]);
+  const bool SkipWriteback = ListOp.isToken() && ListOp.getToken() == "!";
+
+  const bool HasSP = listContainsReg(Inst, ListNo, ARM::SP);
+  const bool HasLR = listContainsReg(Inst, ListNo, ARM::LR);
+  const bool HasPC = listContainsReg(Inst, ListNo, ARM::PC);
+
+  // Pick the first applicable diagnostic, in priority order.
+  const char *Msg = nullptr;
+  if (!IsARPop && HasSP)
+    Msg = "SP may not be in the register list";
+  else if (HasPC && HasLR)
+    Msg = "PC and LR may not be in the register list simultaneously";
+  else if (inITBlock() && !lastInITBlock() && HasPC)
+    Msg = "instruction must be outside of IT block or the last "
+          "instruction in an IT block";
+
+  if (!Msg)
+    return false;
+  return Error(Operands[ListNo + SkipWriteback]->getStartLoc(), Msg);
+}
+
+/// Check the register list of a Thumb store-multiple style instruction.
+///
+/// Reports an error (and returns its result) when the list starting at
+/// instruction operand \p ListNo contains SP, PC, or both; returns false
+/// otherwise. When the parsed operand at \p ListNo is a "!" token, the
+/// diagnostic is attached to the operand that follows it.
+bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst,
+                                       const OperandVector &Operands,
+                                       unsigned ListNo) {
+  const ARMOperand &ListOp =
+      static_cast<const ARMOperand &>(*Operands[ListNo]);
+  const bool SkipWriteback = ListOp.isToken() && ListOp.getToken() == "!";
+
+  const bool HasSP = listContainsReg(Inst, ListNo, ARM::SP);
+  const bool HasPC = listContainsReg(Inst, ListNo, ARM::PC);
+  if (!HasSP && !HasPC)
+    return false;
+
+  const char *Msg;
+  if (HasSP && HasPC)
+    Msg = "SP and PC may not be in the register list";
+  else if (HasSP)
+    Msg = "SP may not be in the register list";
+  else
+    Msg = "PC may not be in the register list";
+  return Error(Operands[ListNo + SkipWriteback]->getStartLoc(), Msg);
+}
+
+// FIXME: We would really like to be able to tablegen'erate this.
+bool ARMAsmParser::validateInstruction(MCInst &Inst, // Inst: the already-matched instruction to check.
+ const OperandVector &Operands) { // Operands: parsed operands, used for diagnostic locations (and a few token checks).
+ const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); // Static description of Inst's opcode.
+ SMLoc Loc = Operands[0]->getStartLoc(); // Default diagnostic location: the first parsed operand.
+
+ // Check the IT block state first.
+ // NOTE: BKPT and HLT instructions have the interesting property of being
+ // allowed in IT blocks, but not being predicable. They just always execute.
+ if (inITBlock() && !instIsBreakpoint(Inst)) {
+ // The instruction must be predicable.
+ if (!MCID.isPredicable())
+ return Error(Loc, "instructions in IT block must be predicable");
+ unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm(); // Condition code carried by Inst.
+ if (Cond != currentITCond()) {
+ // Find the condition code Operand to get its SMLoc information.
+ SMLoc CondLoc;
+ for (unsigned I = 1; I < Operands.size(); ++I)
+ if (static_cast<ARMOperand &>(*Operands[I]).isCondCode())
+ CondLoc = Operands[I]->getStartLoc(); // No break: the last condition-code operand found wins.
+ return Error(CondLoc, "incorrect condition in IT block; got '" +
+ StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
+ "', but expected '" +
+ ARMCondCodeToString(ARMCC::CondCodes(currentITCond())) + "'");
+ }
+ // Check for non-'al' condition codes outside of the IT block.
+ } else if (isThumbTwo() && MCID.isPredicable() &&
+ Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
+ ARMCC::AL && Inst.getOpcode() != ARM::tBcc &&
+ Inst.getOpcode() != ARM::t2Bcc) { // Conditional branches are exempt from the IT block requirement.
+ return Error(Loc, "predicated instructions must be in IT block");
+ } else if (!isThumb() && !useImplicitITARM() && MCID.isPredicable() &&
+ Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
+ ARMCC::AL) {
+ return Warning(Loc, "predicated instructions should be in IT block"); // NOTE(review): returning here skips the opcode-specific checks below - confirm this is intended.
+ }
+
+ const unsigned Opcode = Inst.getOpcode(); // Opcode-specific constraints follow.
+ switch (Opcode) {
+ case ARM::LDRD:
+ case ARM::LDRD_PRE:
+ case ARM::LDRD_POST: {
+ const unsigned RtReg = Inst.getOperand(0).getReg();
+
+ // Rt can't be R14.
+ if (RtReg == ARM::LR)
+ return Error(Operands[3]->getStartLoc(),
+ "Rt can't be R14");
+
+ const unsigned Rt = MRI->getEncodingValue(RtReg);
+ // Rt must be even-numbered.
+ if ((Rt & 1) == 1)
+ return Error(Operands[3]->getStartLoc(),
+ "Rt must be even-numbered");
+
+ // Rt2 must be Rt + 1.
+ const unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ if (Rt2 != Rt + 1)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands must be sequential");
+
+ if (Opcode == ARM::LDRD_PRE || Opcode == ARM::LDRD_POST) {
+ const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+ // For addressing modes with writeback, the base register needs to be
+ // different from the destination registers.
+ if (Rn == Rt || Rn == Rt2)
+ return Error(Operands[3]->getStartLoc(),
+ "base register needs to be different from destination "
+ "registers");
+ }
+
+ return false;
+ }
+ case ARM::t2LDRDi8:
+ case ARM::t2LDRD_PRE:
+ case ARM::t2LDRD_POST: {
+ // Rt2 must be different from Rt.
+ unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ if (Rt2 == Rt)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands can't be identical");
+ return false;
+ }
+ case ARM::t2BXJ: {
+ const unsigned RmReg = Inst.getOperand(0).getReg();
+ // Rm = SP is no longer unpredictable in v8-A
+ if (RmReg == ARM::SP && !hasV8Ops())
+ return Error(Operands[2]->getStartLoc(),
+ "r13 (SP) is an unpredictable operand to BXJ");
+ return false;
+ }
+ case ARM::STRD: {
+ // Rt2 must be Rt + 1.
+ unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ if (Rt2 != Rt + 1)
+ return Error(Operands[3]->getStartLoc(),
+ "source operands must be sequential");
+ return false;
+ }
+ case ARM::STRD_PRE:
+ case ARM::STRD_POST: {
+ // Rt2 must be Rt + 1.
+ unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg()); // Source registers are read from operands 1 and 2 here, not 0 and 1 as for STRD.
+ unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+ if (Rt2 != Rt + 1)
+ return Error(Operands[3]->getStartLoc(),
+ "source operands must be sequential");
+ return false;
+ }
+ case ARM::STR_PRE_IMM:
+ case ARM::STR_PRE_REG:
+ case ARM::STR_POST_IMM:
+ case ARM::STR_POST_REG:
+ case ARM::STRH_PRE:
+ case ARM::STRH_POST:
+ case ARM::STRB_PRE_IMM:
+ case ARM::STRB_PRE_REG:
+ case ARM::STRB_POST_IMM:
+ case ARM::STRB_POST_REG: {
+ // Rt must be different from Rn.
+ const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+
+ if (Rt == Rn)
+ return Error(Operands[3]->getStartLoc(),
+ "source register and base register can't be identical");
+ return false;
+ }
+ case ARM::LDR_PRE_IMM:
+ case ARM::LDR_PRE_REG:
+ case ARM::LDR_POST_IMM:
+ case ARM::LDR_POST_REG:
+ case ARM::LDRH_PRE:
+ case ARM::LDRH_POST:
+ case ARM::LDRSH_PRE:
+ case ARM::LDRSH_POST:
+ case ARM::LDRB_PRE_IMM:
+ case ARM::LDRB_PRE_REG:
+ case ARM::LDRB_POST_IMM:
+ case ARM::LDRB_POST_REG:
+ case ARM::LDRSB_PRE:
+ case ARM::LDRSB_POST: {
+ // Rt must be different from Rn.
+ const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+
+ if (Rt == Rn)
+ return Error(Operands[3]->getStartLoc(),
+ "destination register and base register can't be identical");
+ return false;
+ }
+ case ARM::SBFX:
+ case ARM::UBFX: {
+ // Width must be in range [1, 32-lsb].
+ unsigned LSB = Inst.getOperand(2).getImm();
+ unsigned Widthm1 = Inst.getOperand(3).getImm(); // Presumably width minus one, going by the name - confirm against the instruction definition.
+ if (Widthm1 >= 32 - LSB)
+ return Error(Operands[5]->getStartLoc(),
+ "bitfield width must be in range [1,32-lsb]");
+ return false;
+ }
+ // Notionally handles ARM::tLDMIA_UPD too.
+ case ARM::tLDMIA: {
+ // If we're parsing Thumb2, the .w variant is available and handles
+ // most cases that are normally illegal for a Thumb1 LDM instruction.
+ // We'll make the transformation in processInstruction() if necessary.
+ //
+ // Thumb LDM instructions are writeback iff the base register is not
+ // in the register list.
+ unsigned Rn = Inst.getOperand(0).getReg();
+ bool HasWritebackToken =
+ (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+ bool ListContainsBase;
+ if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo())
+ return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
+ "registers must be in range r0-r7");
+ // If we should have writeback, then there should be a '!' token.
+ if (!ListContainsBase && !HasWritebackToken && !isThumbTwo())
+ return Error(Operands[2]->getStartLoc(),
+ "writeback operator '!' expected");
+ // If we should not have writeback, there must not be a '!'. This is
+ // true even for the 32-bit wide encodings.
+ if (ListContainsBase && HasWritebackToken)
+ return Error(Operands[3]->getStartLoc(),
+ "writeback operator '!' not allowed when base register "
+ "in register list");
+
+ if (validatetLDMRegList(Inst, Operands, 3))
+ return true;
+ break;
+ }
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::LDMDA_UPD:
+ // ARM variants loading and updating the same register are only officially
+ // UNPREDICTABLE on v7 upwards. Goodness knows what they did before.
+ if (!hasV7Ops())
+ break;
+ if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+ return Error(Operands.back()->getStartLoc(),
+ "writeback register not allowed in register list");
+ break;
+ case ARM::t2LDMIA:
+ case ARM::t2LDMDB:
+ if (validatetLDMRegList(Inst, Operands, 3))
+ return true;
+ break;
+ case ARM::t2STMIA:
+ case ARM::t2STMDB:
+ if (validatetSTMRegList(Inst, Operands, 3))
+ return true;
+ break;
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD: {
+ if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+ return Error(Operands.back()->getStartLoc(),
+ "writeback register not allowed in register list");
+
+ if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) { // Loads and stores share the writeback check above but use different list validators.
+ if (validatetLDMRegList(Inst, Operands, 3))
+ return true;
+ } else {
+ if (validatetSTMRegList(Inst, Operands, 3))
+ return true;
+ }
+ break;
+ }
+ case ARM::sysLDMIA_UPD:
+ case ARM::sysLDMDA_UPD:
+ case ARM::sysLDMDB_UPD:
+ case ARM::sysLDMIB_UPD:
+ if (!listContainsReg(Inst, 3, ARM::PC))
+ return Error(Operands[4]->getStartLoc(),
+ "writeback register only allowed on system LDM "
+ "if PC in register-list");
+ break;
+ case ARM::sysSTMIA_UPD:
+ case ARM::sysSTMDA_UPD:
+ case ARM::sysSTMDB_UPD:
+ case ARM::sysSTMIB_UPD:
+ return Error(Operands[2]->getStartLoc(), // Unconditional: these opcodes are always rejected.
+ "system STM cannot have writeback register");
+ case ARM::tMUL: {
+ // The second source operand must be the same register as the destination
+ // operand.
+ //
+ // In this case, we must directly check the parsed operands because the
+ // cvtThumbMultiply() function is written in such a way that it guarantees
+ // this first statement is always true for the new Inst. Essentially, the
+ // destination is unconditionally copied into the second source operand
+ // without checking to see if it matches what we actually parsed.
+ if (Operands.size() == 6 && (((ARMOperand &)*Operands[3]).getReg() !=
+ ((ARMOperand &)*Operands[5]).getReg()) &&
+ (((ARMOperand &)*Operands[3]).getReg() !=
+ ((ARMOperand &)*Operands[4]).getReg())) { // Error only if the destination matches neither source.
+ return Error(Operands[3]->getStartLoc(),
+ "destination register must match source register");
+ }
+ break;
+ }
+ // Like for ldm/stm, push and pop have hi-reg handling version in Thumb2,
+ // so only issue a diagnostic for thumb1. The instructions will be
+ // switched to the t2 encodings in processInstruction() if necessary.
+ case ARM::tPOP: {
+ bool ListContainsBase; // Required by checkLowRegisterList; its value is not used in this case.
+ if (checkLowRegisterList(Inst, 2, 0, ARM::PC, ListContainsBase) &&
+ !isThumbTwo())
+ return Error(Operands[2]->getStartLoc(),
+ "registers must be in range r0-r7 or pc");
+ if (validatetLDMRegList(Inst, Operands, 2, !isMClass()))
+ return true;
+ break;
+ }
+ case ARM::tPUSH: {
+ bool ListContainsBase; // Required by checkLowRegisterList; its value is not used in this case.
+ if (checkLowRegisterList(Inst, 2, 0, ARM::LR, ListContainsBase) &&
+ !isThumbTwo())
+ return Error(Operands[2]->getStartLoc(),
+ "registers must be in range r0-r7 or lr");
+ if (validatetSTMRegList(Inst, Operands, 2))
+ return true;
+ break;
+ }
+ case ARM::tSTMIA_UPD: {
+ bool ListContainsBase, InvalidLowList;
+ InvalidLowList = checkLowRegisterList(Inst, 4, Inst.getOperand(0).getReg(),
+ 0, ListContainsBase);
+ if (InvalidLowList && !isThumbTwo())
+ return Error(Operands[4]->getStartLoc(),
+ "registers must be in range r0-r7");
+
+ // This would be converted to a 32-bit stm, but that's not valid if the
+ // writeback register is in the list.
+ if (InvalidLowList && ListContainsBase)
+ return Error(Operands[4]->getStartLoc(),
+ "writeback operator '!' not allowed when base register "
+ "in register list");
+
+ if (validatetSTMRegList(Inst, Operands, 4))
+ return true;
+ break;
+ }
+ case ARM::tADDrSP: {
+ // If the non-SP source operand and the destination operand are not the
+ // same, we need thumb2 (for the wide encoding), or we have an error.
+ if (!isThumbTwo() &&
+ Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+ return Error(Operands[4]->getStartLoc(),
+ "source register must be the same as destination");
+ }
+ break;
+ }
+ // Final range checking for Thumb unconditional branch instructions.
+ case ARM::tB:
+ if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>())
+ return Error(Operands[2]->getStartLoc(), "branch target out of range");
+ break;
+ case ARM::t2B: {
+ int op = (Operands[2]->isImm()) ? 2 : 3; // The target is operand 2 if that is an immediate, otherwise operand 3.
+ if (!static_cast<ARMOperand &>(*Operands[op]).isSignedOffset<24, 1>())
+ return Error(Operands[op]->getStartLoc(), "branch target out of range");
+ break;
+ }
+ // Final range checking for Thumb conditional branch instructions.
+ case ARM::tBcc:
+ if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<8, 1>())
+ return Error(Operands[2]->getStartLoc(), "branch target out of range");
+ break;
+ case ARM::t2Bcc: {
+ int Op = (Operands[2]->isImm()) ? 2 : 3; // The target is operand 2 if that is an immediate, otherwise operand 3.
+ if (!static_cast<ARMOperand &>(*Operands[Op]).isSignedOffset<20, 1>())
+ return Error(Operands[Op]->getStartLoc(), "branch target out of range");
+ break;
+ }
+ case ARM::tCBZ:
+ case ARM::tCBNZ: {
+ if (!static_cast<ARMOperand &>(*Operands[2]).isUnsignedOffset<6, 1>())
+ return Error(Operands[2]->getStartLoc(), "branch target out of range");
+ break;
+ }
+ case ARM::MOVi16:
+ case ARM::t2MOVi16:
+ case ARM::t2MOVTi16:
+ {
+ // We want to avoid misleadingly allowing something like "mov r0, <symbol>"
+ // especially when we turn it into a movw and the expression <symbol> does
+ // not have a :lower16: or :upper16 as part of the expression. We don't
+ // want the behavior of silently truncating, which can be unexpected and
+ // lead to bugs that are difficult to find since this is an easy mistake
+ // to make.
+ int i = (Operands[3]->isImm()) ? 3 : 4; // The immediate is operand 3 if that is an immediate, otherwise operand 4.
+ ARMOperand &Op = static_cast<ARMOperand &>(*Operands[i]);
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
+ if (CE) break; // Constant immediates need no :lower16:/:upper16: check.
+ const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
+ if (!E) break;
+ const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
+ if (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 &&
+ ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16))
+ return Error(
+ Op.getStartLoc(),
+ "immediate expression for mov requires :lower16: or :upper16");
+ break;
+ }
+ case ARM::HINT:
+ case ARM::t2HINT: {
+ if (hasRAS()) {
+ // ESB is not predicable (pred must be AL)
+ unsigned Imm8 = Inst.getOperand(0).getImm();
+ unsigned Pred = Inst.getOperand(1).getImm();
+ if (Imm8 == 0x10 && Pred != ARMCC::AL) // Hint immediate 0x10 is the one reported as 'esb' below.
+ return Error(Operands[1]->getStartLoc(), "instruction 'esb' is not "
+ "predicable, but condition "
+ "code specified");
+ }
+ // Without the RAS extension, this behaves as any other unallocated hint.
+ break;
+ }
+ }
+
+ return false; // No constraint was violated.
+}
+
+ /// Translate a NEON VST assembly-alias pseudo opcode into the real
+ /// instruction opcode it stands for.
+ ///
+ /// \param Opc      One of the VST*Asm* / VST*WB_fixed_Asm* /
+ ///                 VST*WB_register_Asm* pseudo opcodes listed below. Any other
+ ///                 value hits llvm_unreachable.
+ /// \param Spacing  [out] Register-number stride between consecutive registers
+ ///                 of the instruction's register list. processInstruction
+ ///                 builds the remaining list registers as Vd + Spacing * k.
+ ///                 The 'd'-named aliases report 1 and the 'q'-named aliases
+ ///                 report 2.
+ /// \returns The real opcode (the "_UPD" form for the writeback aliases).
+ ///
+ /// Every 'q'-named alias must report Spacing = 2 regardless of its addressing
+ /// form: the fixed-writeback, register-writeback and non-writeback aliases of
+ /// one element size all map onto the same kind of real 'q' opcode, so they
+ /// have to describe the same register list.
+ static unsigned getRealVSTOpcode(unsigned Opc, unsigned &Spacing) {
+   switch(Opc) {
+   default: llvm_unreachable("unexpected opcode!");
+   // VST1LN
+   case ARM::VST1LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VST1LNd8_UPD;
+   case ARM::VST1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD;
+   case ARM::VST1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD;
+   case ARM::VST1LNdWB_register_Asm_8: Spacing = 1; return ARM::VST1LNd8_UPD;
+   case ARM::VST1LNdWB_register_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD;
+   case ARM::VST1LNdWB_register_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD;
+   case ARM::VST1LNdAsm_8: Spacing = 1; return ARM::VST1LNd8;
+   case ARM::VST1LNdAsm_16: Spacing = 1; return ARM::VST1LNd16;
+   case ARM::VST1LNdAsm_32: Spacing = 1; return ARM::VST1LNd32;
+
+   // VST2LN
+   case ARM::VST2LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VST2LNd8_UPD;
+   case ARM::VST2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD;
+   case ARM::VST2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD;
+   case ARM::VST2LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD;
+   case ARM::VST2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD;
+
+   case ARM::VST2LNdWB_register_Asm_8: Spacing = 1; return ARM::VST2LNd8_UPD;
+   case ARM::VST2LNdWB_register_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD;
+   case ARM::VST2LNdWB_register_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD;
+   case ARM::VST2LNqWB_register_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD;
+   case ARM::VST2LNqWB_register_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD;
+
+   case ARM::VST2LNdAsm_8: Spacing = 1; return ARM::VST2LNd8;
+   case ARM::VST2LNdAsm_16: Spacing = 1; return ARM::VST2LNd16;
+   case ARM::VST2LNdAsm_32: Spacing = 1; return ARM::VST2LNd32;
+   case ARM::VST2LNqAsm_16: Spacing = 2; return ARM::VST2LNq16;
+   case ARM::VST2LNqAsm_32: Spacing = 2; return ARM::VST2LNq32;
+
+   // VST3LN
+   case ARM::VST3LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VST3LNd8_UPD;
+   case ARM::VST3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD;
+   case ARM::VST3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD;
+   case ARM::VST3LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VST3LNq16_UPD;
+   case ARM::VST3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD;
+   case ARM::VST3LNdWB_register_Asm_8: Spacing = 1; return ARM::VST3LNd8_UPD;
+   case ARM::VST3LNdWB_register_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD;
+   case ARM::VST3LNdWB_register_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD;
+   case ARM::VST3LNqWB_register_Asm_16: Spacing = 2; return ARM::VST3LNq16_UPD;
+   case ARM::VST3LNqWB_register_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD;
+   case ARM::VST3LNdAsm_8: Spacing = 1; return ARM::VST3LNd8;
+   case ARM::VST3LNdAsm_16: Spacing = 1; return ARM::VST3LNd16;
+   case ARM::VST3LNdAsm_32: Spacing = 1; return ARM::VST3LNd32;
+   case ARM::VST3LNqAsm_16: Spacing = 2; return ARM::VST3LNq16;
+   case ARM::VST3LNqAsm_32: Spacing = 2; return ARM::VST3LNq32;
+
+   // VST3
+   case ARM::VST3dWB_fixed_Asm_8: Spacing = 1; return ARM::VST3d8_UPD;
+   case ARM::VST3dWB_fixed_Asm_16: Spacing = 1; return ARM::VST3d16_UPD;
+   case ARM::VST3dWB_fixed_Asm_32: Spacing = 1; return ARM::VST3d32_UPD;
+   case ARM::VST3qWB_fixed_Asm_8: Spacing = 2; return ARM::VST3q8_UPD;
+   case ARM::VST3qWB_fixed_Asm_16: Spacing = 2; return ARM::VST3q16_UPD;
+   case ARM::VST3qWB_fixed_Asm_32: Spacing = 2; return ARM::VST3q32_UPD;
+   case ARM::VST3dWB_register_Asm_8: Spacing = 1; return ARM::VST3d8_UPD;
+   case ARM::VST3dWB_register_Asm_16: Spacing = 1; return ARM::VST3d16_UPD;
+   case ARM::VST3dWB_register_Asm_32: Spacing = 1; return ARM::VST3d32_UPD;
+   case ARM::VST3qWB_register_Asm_8: Spacing = 2; return ARM::VST3q8_UPD;
+   case ARM::VST3qWB_register_Asm_16: Spacing = 2; return ARM::VST3q16_UPD;
+   case ARM::VST3qWB_register_Asm_32: Spacing = 2; return ARM::VST3q32_UPD;
+   case ARM::VST3dAsm_8: Spacing = 1; return ARM::VST3d8;
+   case ARM::VST3dAsm_16: Spacing = 1; return ARM::VST3d16;
+   case ARM::VST3dAsm_32: Spacing = 1; return ARM::VST3d32;
+   case ARM::VST3qAsm_8: Spacing = 2; return ARM::VST3q8;
+   case ARM::VST3qAsm_16: Spacing = 2; return ARM::VST3q16;
+   case ARM::VST3qAsm_32: Spacing = 2; return ARM::VST3q32;
+
+   // VST4LN
+   case ARM::VST4LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VST4LNd8_UPD;
+   case ARM::VST4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD;
+   case ARM::VST4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD;
+   case ARM::VST4LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VST4LNq16_UPD;
+   case ARM::VST4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD;
+   case ARM::VST4LNdWB_register_Asm_8: Spacing = 1; return ARM::VST4LNd8_UPD;
+   case ARM::VST4LNdWB_register_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD;
+   case ARM::VST4LNdWB_register_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD;
+   case ARM::VST4LNqWB_register_Asm_16: Spacing = 2; return ARM::VST4LNq16_UPD;
+   case ARM::VST4LNqWB_register_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD;
+   case ARM::VST4LNdAsm_8: Spacing = 1; return ARM::VST4LNd8;
+   case ARM::VST4LNdAsm_16: Spacing = 1; return ARM::VST4LNd16;
+   case ARM::VST4LNdAsm_32: Spacing = 1; return ARM::VST4LNd32;
+   case ARM::VST4LNqAsm_16: Spacing = 2; return ARM::VST4LNq16;
+   case ARM::VST4LNqAsm_32: Spacing = 2; return ARM::VST4LNq32;
+
+   // VST4
+   case ARM::VST4dWB_fixed_Asm_8: Spacing = 1; return ARM::VST4d8_UPD;
+   case ARM::VST4dWB_fixed_Asm_16: Spacing = 1; return ARM::VST4d16_UPD;
+   case ARM::VST4dWB_fixed_Asm_32: Spacing = 1; return ARM::VST4d32_UPD;
+   case ARM::VST4qWB_fixed_Asm_8: Spacing = 2; return ARM::VST4q8_UPD;
+   case ARM::VST4qWB_fixed_Asm_16: Spacing = 2; return ARM::VST4q16_UPD;
+   case ARM::VST4qWB_fixed_Asm_32: Spacing = 2; return ARM::VST4q32_UPD;
+   case ARM::VST4dWB_register_Asm_8: Spacing = 1; return ARM::VST4d8_UPD;
+   case ARM::VST4dWB_register_Asm_16: Spacing = 1; return ARM::VST4d16_UPD;
+   case ARM::VST4dWB_register_Asm_32: Spacing = 1; return ARM::VST4d32_UPD;
+   case ARM::VST4qWB_register_Asm_8: Spacing = 2; return ARM::VST4q8_UPD;
+   case ARM::VST4qWB_register_Asm_16: Spacing = 2; return ARM::VST4q16_UPD;
+   case ARM::VST4qWB_register_Asm_32: Spacing = 2; return ARM::VST4q32_UPD;
+   case ARM::VST4dAsm_8: Spacing = 1; return ARM::VST4d8;
+   case ARM::VST4dAsm_16: Spacing = 1; return ARM::VST4d16;
+   case ARM::VST4dAsm_32: Spacing = 1; return ARM::VST4d32;
+   case ARM::VST4qAsm_8: Spacing = 2; return ARM::VST4q8;
+   case ARM::VST4qAsm_16: Spacing = 2; return ARM::VST4q16;
+   case ARM::VST4qAsm_32: Spacing = 2; return ARM::VST4q32;
+   }
+ }
+
+ /// Translate a NEON VLD assembly-alias pseudo opcode into the real
+ /// instruction opcode it stands for.
+ ///
+ /// \param Opc      One of the VLD*Asm* / VLD*WB_fixed_Asm* /
+ ///                 VLD*WB_register_Asm* pseudo opcodes listed below. Any other
+ ///                 value hits llvm_unreachable.
+ /// \param Spacing  [out] Register-number stride between consecutive registers
+ ///                 of the instruction's register list; the caller adds
+ ///                 multiples of it to the first list register. The 'd'-named
+ ///                 aliases report 1 and the 'q'-named aliases report 2.
+ /// \returns The real opcode (the "_UPD" form for the writeback aliases).
+ ///
+ /// Every 'q'-named alias must report Spacing = 2 regardless of its addressing
+ /// form: the fixed-writeback, register-writeback and non-writeback aliases of
+ /// one element size all map onto the same kind of real 'q' opcode, so they
+ /// have to describe the same register list.
+ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
+   switch(Opc) {
+   default: llvm_unreachable("unexpected opcode!");
+   // VLD1LN
+   case ARM::VLD1LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD1LNd8_UPD;
+   case ARM::VLD1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD;
+   case ARM::VLD1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD;
+   case ARM::VLD1LNdWB_register_Asm_8: Spacing = 1; return ARM::VLD1LNd8_UPD;
+   case ARM::VLD1LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD;
+   case ARM::VLD1LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD;
+   case ARM::VLD1LNdAsm_8: Spacing = 1; return ARM::VLD1LNd8;
+   case ARM::VLD1LNdAsm_16: Spacing = 1; return ARM::VLD1LNd16;
+   case ARM::VLD1LNdAsm_32: Spacing = 1; return ARM::VLD1LNd32;
+
+   // VLD2LN
+   case ARM::VLD2LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD2LNd8_UPD;
+   case ARM::VLD2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD;
+   case ARM::VLD2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD;
+   case ARM::VLD2LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD2LNq16_UPD;
+   case ARM::VLD2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD;
+   case ARM::VLD2LNdWB_register_Asm_8: Spacing = 1; return ARM::VLD2LNd8_UPD;
+   case ARM::VLD2LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD;
+   case ARM::VLD2LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD;
+   case ARM::VLD2LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD2LNq16_UPD;
+   case ARM::VLD2LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD;
+   case ARM::VLD2LNdAsm_8: Spacing = 1; return ARM::VLD2LNd8;
+   case ARM::VLD2LNdAsm_16: Spacing = 1; return ARM::VLD2LNd16;
+   case ARM::VLD2LNdAsm_32: Spacing = 1; return ARM::VLD2LNd32;
+   case ARM::VLD2LNqAsm_16: Spacing = 2; return ARM::VLD2LNq16;
+   case ARM::VLD2LNqAsm_32: Spacing = 2; return ARM::VLD2LNq32;
+
+   // VLD3DUP
+   case ARM::VLD3DUPdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPd8_UPD;
+   case ARM::VLD3DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+   case ARM::VLD3DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+   case ARM::VLD3DUPqWB_fixed_Asm_8: Spacing = 2; return ARM::VLD3DUPq8_UPD;
+   case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD;
+   case ARM::VLD3DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+   case ARM::VLD3DUPdWB_register_Asm_8: Spacing = 1; return ARM::VLD3DUPd8_UPD;
+   case ARM::VLD3DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+   case ARM::VLD3DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+   case ARM::VLD3DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD3DUPq8_UPD;
+   case ARM::VLD3DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD;
+   case ARM::VLD3DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+   case ARM::VLD3DUPdAsm_8: Spacing = 1; return ARM::VLD3DUPd8;
+   case ARM::VLD3DUPdAsm_16: Spacing = 1; return ARM::VLD3DUPd16;
+   case ARM::VLD3DUPdAsm_32: Spacing = 1; return ARM::VLD3DUPd32;
+   case ARM::VLD3DUPqAsm_8: Spacing = 2; return ARM::VLD3DUPq8;
+   case ARM::VLD3DUPqAsm_16: Spacing = 2; return ARM::VLD3DUPq16;
+   case ARM::VLD3DUPqAsm_32: Spacing = 2; return ARM::VLD3DUPq32;
+
+   // VLD3LN
+   case ARM::VLD3LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3LNd8_UPD;
+   case ARM::VLD3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD;
+   case ARM::VLD3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD;
+   case ARM::VLD3LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3LNq16_UPD;
+   case ARM::VLD3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD;
+   case ARM::VLD3LNdWB_register_Asm_8: Spacing = 1; return ARM::VLD3LNd8_UPD;
+   case ARM::VLD3LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD;
+   case ARM::VLD3LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD;
+   case ARM::VLD3LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD3LNq16_UPD;
+   case ARM::VLD3LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD;
+   case ARM::VLD3LNdAsm_8: Spacing = 1; return ARM::VLD3LNd8;
+   case ARM::VLD3LNdAsm_16: Spacing = 1; return ARM::VLD3LNd16;
+   case ARM::VLD3LNdAsm_32: Spacing = 1; return ARM::VLD3LNd32;
+   case ARM::VLD3LNqAsm_16: Spacing = 2; return ARM::VLD3LNq16;
+   case ARM::VLD3LNqAsm_32: Spacing = 2; return ARM::VLD3LNq32;
+
+   // VLD3
+   case ARM::VLD3dWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3d8_UPD;
+   case ARM::VLD3dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD;
+   case ARM::VLD3dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD;
+   case ARM::VLD3qWB_fixed_Asm_8: Spacing = 2; return ARM::VLD3q8_UPD;
+   case ARM::VLD3qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD;
+   case ARM::VLD3qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD;
+   case ARM::VLD3dWB_register_Asm_8: Spacing = 1; return ARM::VLD3d8_UPD;
+   case ARM::VLD3dWB_register_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD;
+   case ARM::VLD3dWB_register_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD;
+   case ARM::VLD3qWB_register_Asm_8: Spacing = 2; return ARM::VLD3q8_UPD;
+   case ARM::VLD3qWB_register_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD;
+   case ARM::VLD3qWB_register_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD;
+   case ARM::VLD3dAsm_8: Spacing = 1; return ARM::VLD3d8;
+   case ARM::VLD3dAsm_16: Spacing = 1; return ARM::VLD3d16;
+   case ARM::VLD3dAsm_32: Spacing = 1; return ARM::VLD3d32;
+   case ARM::VLD3qAsm_8: Spacing = 2; return ARM::VLD3q8;
+   case ARM::VLD3qAsm_16: Spacing = 2; return ARM::VLD3q16;
+   case ARM::VLD3qAsm_32: Spacing = 2; return ARM::VLD3q32;
+
+   // VLD4LN
+   case ARM::VLD4LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4LNd8_UPD;
+   case ARM::VLD4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
+   case ARM::VLD4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD;
+   case ARM::VLD4LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD;
+   case ARM::VLD4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD;
+   case ARM::VLD4LNdWB_register_Asm_8: Spacing = 1; return ARM::VLD4LNd8_UPD;
+   case ARM::VLD4LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
+   case ARM::VLD4LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD;
+   case ARM::VLD4LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD;
+   case ARM::VLD4LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD;
+   case ARM::VLD4LNdAsm_8: Spacing = 1; return ARM::VLD4LNd8;
+   case ARM::VLD4LNdAsm_16: Spacing = 1; return ARM::VLD4LNd16;
+   case ARM::VLD4LNdAsm_32: Spacing = 1; return ARM::VLD4LNd32;
+   case ARM::VLD4LNqAsm_16: Spacing = 2; return ARM::VLD4LNq16;
+   case ARM::VLD4LNqAsm_32: Spacing = 2; return ARM::VLD4LNq32;
+
+   // VLD4DUP
+   case ARM::VLD4DUPdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4DUPd8_UPD;
+   case ARM::VLD4DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD;
+   case ARM::VLD4DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD;
+   case ARM::VLD4DUPqWB_fixed_Asm_8: Spacing = 2; return ARM::VLD4DUPq8_UPD;
+   case ARM::VLD4DUPqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4DUPq16_UPD;
+   case ARM::VLD4DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD;
+   case ARM::VLD4DUPdWB_register_Asm_8: Spacing = 1; return ARM::VLD4DUPd8_UPD;
+   case ARM::VLD4DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD;
+   case ARM::VLD4DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD;
+   case ARM::VLD4DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD4DUPq8_UPD;
+   case ARM::VLD4DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD4DUPq16_UPD;
+   case ARM::VLD4DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD;
+   case ARM::VLD4DUPdAsm_8: Spacing = 1; return ARM::VLD4DUPd8;
+   case ARM::VLD4DUPdAsm_16: Spacing = 1; return ARM::VLD4DUPd16;
+   case ARM::VLD4DUPdAsm_32: Spacing = 1; return ARM::VLD4DUPd32;
+   case ARM::VLD4DUPqAsm_8: Spacing = 2; return ARM::VLD4DUPq8;
+   case ARM::VLD4DUPqAsm_16: Spacing = 2; return ARM::VLD4DUPq16;
+   case ARM::VLD4DUPqAsm_32: Spacing = 2; return ARM::VLD4DUPq32;
+
+   // VLD4
+   case ARM::VLD4dWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4d8_UPD;
+   case ARM::VLD4dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD;
+   case ARM::VLD4dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD;
+   case ARM::VLD4qWB_fixed_Asm_8: Spacing = 2; return ARM::VLD4q8_UPD;
+   case ARM::VLD4qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD;
+   case ARM::VLD4qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD;
+   case ARM::VLD4dWB_register_Asm_8: Spacing = 1; return ARM::VLD4d8_UPD;
+   case ARM::VLD4dWB_register_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD;
+   case ARM::VLD4dWB_register_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD;
+   case ARM::VLD4qWB_register_Asm_8: Spacing = 2; return ARM::VLD4q8_UPD;
+   case ARM::VLD4qWB_register_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD;
+   case ARM::VLD4qWB_register_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD;
+   case ARM::VLD4dAsm_8: Spacing = 1; return ARM::VLD4d8;
+   case ARM::VLD4dAsm_16: Spacing = 1; return ARM::VLD4d16;
+   case ARM::VLD4dAsm_32: Spacing = 1; return ARM::VLD4d32;
+   case ARM::VLD4qAsm_8: Spacing = 2; return ARM::VLD4q8;
+   case ARM::VLD4qAsm_16: Spacing = 2; return ARM::VLD4q16;
+   case ARM::VLD4qAsm_32: Spacing = 2; return ARM::VLD4q32;
+   }
+ }
+
+bool ARMAsmParser::processInstruction(MCInst &Inst,
+ const OperandVector &Operands,
+ MCStreamer &Out) {
+ switch (Inst.getOpcode()) {
+ // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
+ case ARM::LDRT_POST:
+ case ARM::LDRBT_POST: {
+ const unsigned Opcode =
+ (Inst.getOpcode() == ARM::LDRT_POST) ? ARM::LDRT_POST_IMM
+ : ARM::LDRBT_POST_IMM;
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::createReg(0));
+ TmpInst.addOperand(MCOperand::createImm(0));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
+ // Alias for alternate form of 'str{,b}t Rt, [Rn], #imm' instruction.
+ case ARM::STRT_POST:
+ case ARM::STRBT_POST: {
+ const unsigned Opcode =
+ (Inst.getOpcode() == ARM::STRT_POST) ? ARM::STRT_POST_IMM
+ : ARM::STRBT_POST_IMM;
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::createReg(0));
+ TmpInst.addOperand(MCOperand::createImm(0));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
+ // Alias for alternate form of 'ADR Rd, #imm' instruction.
+ case ARM::ADDri: {
+ if (Inst.getOperand(1).getReg() != ARM::PC ||
+ Inst.getOperand(5).getReg() != 0 ||
+ !(Inst.getOperand(2).isExpr() || Inst.getOperand(2).isImm()))
+ return false;
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::ADR);
+ TmpInst.addOperand(Inst.getOperand(0));
+ if (Inst.getOperand(2).isImm()) {
+ // Immediate (mod_imm) will be in its encoded form, we must unencode it
+ // before passing it to the ADR instruction.
+ unsigned Enc = Inst.getOperand(2).getImm();
+ TmpInst.addOperand(MCOperand::createImm(
+ ARM_AM::rotr32(Enc & 0xFF, (Enc & 0xF00) >> 7)));
+ } else {
+ // Turn PC-relative expression into absolute expression.
+ // Reading PC provides the start of the current instruction + 8 and
+ // the transform to adr is biased by that.
+ MCSymbol *Dot = getContext().createTempSymbol();
+ Out.EmitLabel(Dot);
+ const MCExpr *OpExpr = Inst.getOperand(2).getExpr();
+ const MCExpr *InstPC = MCSymbolRefExpr::create(Dot,
+ MCSymbolRefExpr::VK_None,
+ getContext());
+ const MCExpr *Const8 = MCConstantExpr::create(8, getContext());
+ const MCExpr *ReadPC = MCBinaryExpr::createAdd(InstPC, Const8,
+ getContext());
+ const MCExpr *FixupAddr = MCBinaryExpr::createAdd(ReadPC, OpExpr,
+ getContext());
+ TmpInst.addOperand(MCOperand::createExpr(FixupAddr));
+ }
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+ // Aliases for alternate PC+imm syntax of LDR instructions.
+ case ARM::t2LDRpcrel:
+ // Select the narrow version if the immediate will fit.
+ if (Inst.getOperand(1).getImm() > 0 &&
+ Inst.getOperand(1).getImm() <= 0xff &&
+ !(static_cast<ARMOperand &>(*Operands[2]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w"))
+ Inst.setOpcode(ARM::tLDRpci);
+ else
+ Inst.setOpcode(ARM::t2LDRpci);
+ return true;
+ case ARM::t2LDRBpcrel:
+ Inst.setOpcode(ARM::t2LDRBpci);
+ return true;
+ case ARM::t2LDRHpcrel:
+ Inst.setOpcode(ARM::t2LDRHpci);
+ return true;
+ case ARM::t2LDRSBpcrel:
+ Inst.setOpcode(ARM::t2LDRSBpci);
+ return true;
+ case ARM::t2LDRSHpcrel:
+ Inst.setOpcode(ARM::t2LDRSHpci);
+ return true;
+ case ARM::LDRConstPool:
+ case ARM::tLDRConstPool:
+ case ARM::t2LDRConstPool: {
+ // Pseudo instruction ldr rt, =immediate is converted to a
+ // MOV rt, immediate if immediate is known and representable
+ // otherwise we create a constant pool entry that we load from.
+ MCInst TmpInst;
+ if (Inst.getOpcode() == ARM::LDRConstPool)
+ TmpInst.setOpcode(ARM::LDRi12);
+ else if (Inst.getOpcode() == ARM::tLDRConstPool)
+ TmpInst.setOpcode(ARM::tLDRpci);
+ else if (Inst.getOpcode() == ARM::t2LDRConstPool)
+ TmpInst.setOpcode(ARM::t2LDRpci);
+ const ARMOperand &PoolOperand =
+ (static_cast<ARMOperand &>(*Operands[2]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w") ?
+ static_cast<ARMOperand &>(*Operands[4]) :
+ static_cast<ARMOperand &>(*Operands[3]);
+ const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm();
+ // If SubExprVal is a constant we may be able to use a MOV
+ if (isa<MCConstantExpr>(SubExprVal) &&
+ Inst.getOperand(0).getReg() != ARM::PC &&
+ Inst.getOperand(0).getReg() != ARM::SP) {
+ int64_t Value =
+ (int64_t) (cast<MCConstantExpr>(SubExprVal))->getValue();
+ bool UseMov = true;
+ bool MovHasS = true;
+ if (Inst.getOpcode() == ARM::LDRConstPool) {
+ // ARM Constant
+ if (ARM_AM::getSOImmVal(Value) != -1) {
+ Value = ARM_AM::getSOImmVal(Value);
+ TmpInst.setOpcode(ARM::MOVi);
+ }
+ else if (ARM_AM::getSOImmVal(~Value) != -1) {
+ Value = ARM_AM::getSOImmVal(~Value);
+ TmpInst.setOpcode(ARM::MVNi);
+ }
+ else if (hasV6T2Ops() &&
+ Value >=0 && Value < 65536) {
+ TmpInst.setOpcode(ARM::MOVi16);
+ MovHasS = false;
+ }
+ else
+ UseMov = false;
+ }
+ else {
+ // Thumb/Thumb2 Constant
+ if (hasThumb2() &&
+ ARM_AM::getT2SOImmVal(Value) != -1)
+ TmpInst.setOpcode(ARM::t2MOVi);
+ else if (hasThumb2() &&
+ ARM_AM::getT2SOImmVal(~Value) != -1) {
+ TmpInst.setOpcode(ARM::t2MVNi);
+ Value = ~Value;
+ }
+ else if (hasV8MBaseline() &&
+ Value >=0 && Value < 65536) {
+ TmpInst.setOpcode(ARM::t2MOVi16);
+ MovHasS = false;
+ }
+ else
+ UseMov = false;
+ }
+ if (UseMov) {
+ TmpInst.addOperand(Inst.getOperand(0)); // Rt
+ TmpInst.addOperand(MCOperand::createImm(Value)); // Immediate
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ if (MovHasS)
+ TmpInst.addOperand(MCOperand::createReg(0)); // S
+ Inst = TmpInst;
+ return true;
+ }
+ }
+ // No opportunity to use MOV/MVN create constant pool
+ const MCExpr *CPLoc =
+ getTargetStreamer().addConstantPoolEntry(SubExprVal,
+ PoolOperand.getStartLoc());
+ TmpInst.addOperand(Inst.getOperand(0)); // Rt
+ TmpInst.addOperand(MCOperand::createExpr(CPLoc)); // offset to constpool
+ if (TmpInst.getOpcode() == ARM::LDRi12)
+ TmpInst.addOperand(MCOperand::createImm(0)); // unused offset
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ Inst = TmpInst;
+ return true;
+ }
+ // Handle NEON VST complex aliases.
+ case ARM::VST1LNdWB_register_Asm_8:
+ case ARM::VST1LNdWB_register_Asm_16:
+ case ARM::VST1LNdWB_register_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(4)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(6));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST2LNdWB_register_Asm_8:
+ case ARM::VST2LNdWB_register_Asm_16:
+ case ARM::VST2LNdWB_register_Asm_32:
+ case ARM::VST2LNqWB_register_Asm_16:
+ case ARM::VST2LNqWB_register_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(4)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(6));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST3LNdWB_register_Asm_8:
+ case ARM::VST3LNdWB_register_Asm_16:
+ case ARM::VST3LNdWB_register_Asm_32:
+ case ARM::VST3LNqWB_register_Asm_16:
+ case ARM::VST3LNqWB_register_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(4)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(6));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST4LNdWB_register_Asm_8:
+ case ARM::VST4LNdWB_register_Asm_16:
+ case ARM::VST4LNdWB_register_Asm_32:
+ case ARM::VST4LNqWB_register_Asm_16:
+ case ARM::VST4LNqWB_register_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(4)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(6));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST1LNdWB_fixed_Asm_8:
+ case ARM::VST1LNdWB_fixed_Asm_16:
+ case ARM::VST1LNdWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST2LNdWB_fixed_Asm_8:
+ case ARM::VST2LNdWB_fixed_Asm_16:
+ case ARM::VST2LNdWB_fixed_Asm_32:
+ case ARM::VST2LNqWB_fixed_Asm_16:
+ case ARM::VST2LNqWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST3LNdWB_fixed_Asm_8:
+ case ARM::VST3LNdWB_fixed_Asm_16:
+ case ARM::VST3LNdWB_fixed_Asm_32:
+ case ARM::VST3LNqWB_fixed_Asm_16:
+ case ARM::VST3LNqWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST4LNdWB_fixed_Asm_8:
+ case ARM::VST4LNdWB_fixed_Asm_16:
+ case ARM::VST4LNdWB_fixed_Asm_32:
+ case ARM::VST4LNqWB_fixed_Asm_16:
+ case ARM::VST4LNqWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST1LNdAsm_8:
+ case ARM::VST1LNdAsm_16:
+ case ARM::VST1LNdAsm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST2LNdAsm_8:
+ case ARM::VST2LNdAsm_16:
+ case ARM::VST2LNdAsm_32:
+ case ARM::VST2LNqAsm_16:
+ case ARM::VST2LNqAsm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST3LNdAsm_8:
+ case ARM::VST3LNdAsm_16:
+ case ARM::VST3LNdAsm_32:
+ case ARM::VST3LNqAsm_16:
+ case ARM::VST3LNqAsm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST4LNdAsm_8:
+ case ARM::VST4LNdAsm_16:
+ case ARM::VST4LNdAsm_32:
+ case ARM::VST4LNqAsm_16:
+ case ARM::VST4LNqAsm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ // Handle NEON VLD complex aliases.
+ case ARM::VLD1LNdWB_register_Asm_8:
+ case ARM::VLD1LNdWB_register_Asm_16:
+ case ARM::VLD1LNdWB_register_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(4)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(6));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD2LNdWB_register_Asm_8:
+ case ARM::VLD2LNdWB_register_Asm_16:
+ case ARM::VLD2LNdWB_register_Asm_32:
+ case ARM::VLD2LNqWB_register_Asm_16:
+ case ARM::VLD2LNqWB_register_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(4)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(6));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD3LNdWB_register_Asm_8:
+ case ARM::VLD3LNdWB_register_Asm_16:
+ case ARM::VLD3LNdWB_register_Asm_32:
+ case ARM::VLD3LNqWB_register_Asm_16:
+ case ARM::VLD3LNqWB_register_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(4)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(6));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD4LNdWB_register_Asm_8:
+ case ARM::VLD4LNdWB_register_Asm_16:
+ case ARM::VLD4LNdWB_register_Asm_32:
+ case ARM::VLD4LNqWB_register_Asm_16:
+ case ARM::VLD4LNqWB_register_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(4)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(6));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD1LNdWB_fixed_Asm_8:
+ case ARM::VLD1LNdWB_fixed_Asm_16:
+ case ARM::VLD1LNdWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD2LNdWB_fixed_Asm_8:
+ case ARM::VLD2LNdWB_fixed_Asm_16:
+ case ARM::VLD2LNdWB_fixed_Asm_32:
+ case ARM::VLD2LNqWB_fixed_Asm_16:
+ case ARM::VLD2LNqWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD3LNdWB_fixed_Asm_8:
+ case ARM::VLD3LNdWB_fixed_Asm_16:
+ case ARM::VLD3LNdWB_fixed_Asm_32:
+ case ARM::VLD3LNqWB_fixed_Asm_16:
+ case ARM::VLD3LNqWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD4LNdWB_fixed_Asm_8:
+ case ARM::VLD4LNdWB_fixed_Asm_16:
+ case ARM::VLD4LNdWB_fixed_Asm_32:
+ case ARM::VLD4LNqWB_fixed_Asm_16:
+ case ARM::VLD4LNqWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD1LNdAsm_8:
+ case ARM::VLD1LNdAsm_16:
+ case ARM::VLD1LNdAsm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD2LNdAsm_8:
+ case ARM::VLD2LNdAsm_16:
+ case ARM::VLD2LNdAsm_32:
+ case ARM::VLD2LNqAsm_16:
+ case ARM::VLD2LNqAsm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD3LNdAsm_8:
+ case ARM::VLD3LNdAsm_16:
+ case ARM::VLD3LNdAsm_32:
+ case ARM::VLD3LNqAsm_16:
+ case ARM::VLD3LNqAsm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD4LNdAsm_8:
+ case ARM::VLD4LNdAsm_16:
+ case ARM::VLD4LNdAsm_32:
+ case ARM::VLD4LNqAsm_16:
+ case ARM::VLD4LNqAsm_32: {
+ MCInst TmpInst;
+ // Shuffle the operands around so the lane index operand is in the
+ // right place.
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(2)); // Rn
+ TmpInst.addOperand(Inst.getOperand(3)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // lane
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ // VLD3DUP single 3-element structure to all lanes instructions.
+ case ARM::VLD3DUPdAsm_8:
+ case ARM::VLD3DUPdAsm_16:
+ case ARM::VLD3DUPdAsm_32:
+ case ARM::VLD3DUPqAsm_8:
+ case ARM::VLD3DUPqAsm_16:
+ case ARM::VLD3DUPqAsm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD3DUPdWB_fixed_Asm_8:
+ case ARM::VLD3DUPdWB_fixed_Asm_16:
+ case ARM::VLD3DUPdWB_fixed_Asm_32:
+ case ARM::VLD3DUPqWB_fixed_Asm_8:
+ case ARM::VLD3DUPqWB_fixed_Asm_16:
+ case ARM::VLD3DUPqWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD3DUPdWB_register_Asm_8:
+ case ARM::VLD3DUPdWB_register_Asm_16:
+ case ARM::VLD3DUPdWB_register_Asm_32:
+ case ARM::VLD3DUPqWB_register_Asm_8:
+ case ARM::VLD3DUPqWB_register_Asm_16:
+ case ARM::VLD3DUPqWB_register_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // Rm
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ // VLD3 multiple 3-element structure instructions.
+ case ARM::VLD3dAsm_8:
+ case ARM::VLD3dAsm_16:
+ case ARM::VLD3dAsm_32:
+ case ARM::VLD3qAsm_8:
+ case ARM::VLD3qAsm_16:
+ case ARM::VLD3qAsm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD3dWB_fixed_Asm_8:
+ case ARM::VLD3dWB_fixed_Asm_16:
+ case ARM::VLD3dWB_fixed_Asm_32:
+ case ARM::VLD3qWB_fixed_Asm_8:
+ case ARM::VLD3qWB_fixed_Asm_16:
+ case ARM::VLD3qWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD3dWB_register_Asm_8:
+ case ARM::VLD3dWB_register_Asm_16:
+ case ARM::VLD3dWB_register_Asm_32:
+ case ARM::VLD3qWB_register_Asm_8:
+ case ARM::VLD3qWB_register_Asm_16:
+ case ARM::VLD3qWB_register_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // Rm
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ // VLD4DUP single 4-element structure to all lanes instructions.
+ case ARM::VLD4DUPdAsm_8:
+ case ARM::VLD4DUPdAsm_16:
+ case ARM::VLD4DUPdAsm_32:
+ case ARM::VLD4DUPqAsm_8:
+ case ARM::VLD4DUPqAsm_16:
+ case ARM::VLD4DUPqAsm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD4DUPdWB_fixed_Asm_8:
+ case ARM::VLD4DUPdWB_fixed_Asm_16:
+ case ARM::VLD4DUPdWB_fixed_Asm_32:
+ case ARM::VLD4DUPqWB_fixed_Asm_8:
+ case ARM::VLD4DUPqWB_fixed_Asm_16:
+ case ARM::VLD4DUPqWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD4DUPdWB_register_Asm_8:
+ case ARM::VLD4DUPdWB_register_Asm_16:
+ case ARM::VLD4DUPdWB_register_Asm_32:
+ case ARM::VLD4DUPqWB_register_Asm_8:
+ case ARM::VLD4DUPqWB_register_Asm_16:
+ case ARM::VLD4DUPqWB_register_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // Rm
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ // VLD4 multiple 4-element structure instructions.
+ case ARM::VLD4dAsm_8:
+ case ARM::VLD4dAsm_16:
+ case ARM::VLD4dAsm_32:
+ case ARM::VLD4qAsm_8:
+ case ARM::VLD4qAsm_16:
+ case ARM::VLD4qAsm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD4dWB_fixed_Asm_8:
+ case ARM::VLD4dWB_fixed_Asm_16:
+ case ARM::VLD4dWB_fixed_Asm_32:
+ case ARM::VLD4qWB_fixed_Asm_8:
+ case ARM::VLD4qWB_fixed_Asm_16:
+ case ARM::VLD4qWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD4dWB_register_Asm_8:
+ case ARM::VLD4dWB_register_Asm_16:
+ case ARM::VLD4dWB_register_Asm_32:
+ case ARM::VLD4qWB_register_Asm_8:
+ case ARM::VLD4qWB_register_Asm_16:
+ case ARM::VLD4qWB_register_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // Rm
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ // VST3 multiple 3-element structure instructions.
+ case ARM::VST3dAsm_8:
+ case ARM::VST3dAsm_16:
+ case ARM::VST3dAsm_32:
+ case ARM::VST3qAsm_8:
+ case ARM::VST3qAsm_16:
+ case ARM::VST3qAsm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST3dWB_fixed_Asm_8:
+ case ARM::VST3dWB_fixed_Asm_16:
+ case ARM::VST3dWB_fixed_Asm_32:
+ case ARM::VST3qWB_fixed_Asm_8:
+ case ARM::VST3qWB_fixed_Asm_16:
+ case ARM::VST3qWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST3dWB_register_Asm_8:
+ case ARM::VST3dWB_register_Asm_16:
+ case ARM::VST3dWB_register_Asm_32:
+ case ARM::VST3qWB_register_Asm_8:
+ case ARM::VST3qWB_register_Asm_16:
+ case ARM::VST3qWB_register_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ // VST4 multiple 4-element structure instructions.
+ case ARM::VST4dAsm_8:
+ case ARM::VST4dAsm_16:
+ case ARM::VST4dAsm_32:
+ case ARM::VST4qAsm_8:
+ case ARM::VST4qAsm_16:
+ case ARM::VST4qAsm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST4dWB_fixed_Asm_8:
+ case ARM::VST4dWB_fixed_Asm_16:
+ case ARM::VST4dWB_fixed_Asm_32:
+ case ARM::VST4qWB_fixed_Asm_8:
+ case ARM::VST4qWB_fixed_Asm_16:
+ case ARM::VST4qWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VST4dWB_register_Asm_8:
+ case ARM::VST4dWB_register_Asm_16:
+ case ARM::VST4dWB_register_Asm_32:
+ case ARM::VST4qWB_register_Asm_8:
+ case ARM::VST4qWB_register_Asm_16:
+ case ARM::VST4qWB_register_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // Rm
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+ Spacing * 3));
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
+ // Handle encoding choice for the shift-immediate instructions.
+ case ARM::t2LSLri:
+ case ARM::t2LSRri:
+ case ARM::t2ASRri: {
+ if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+ Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ !(static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) {
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode");
+ case ARM::t2LSLri: NewOpc = ARM::tLSLri; break;
+ case ARM::t2LSRri: NewOpc = ARM::tLSRri; break;
+ case ARM::t2ASRri: NewOpc = ARM::tASRri; break;
+ }
+ // The Thumb1 operands aren't in the same order. Awesome, eh?
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(5));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+ return false;
+ }
+
+ // Handle the Thumb2 mode MOV complex aliases.
+ case ARM::t2MOVsr:
+ case ARM::t2MOVSsr: {
+ // Which instruction to expand to depends on the CCOut operand and
+ // whether we're in an IT block if the register operands are low
+ // registers.
+ bool isNarrow = false;
+ if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ isARMLowRegister(Inst.getOperand(2).getReg()) &&
+ Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+ inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr))
+ isNarrow = true;
+ MCInst TmpInst;
+ unsigned newOpc;
+ switch(ARM_AM::getSORegShOp(Inst.getOperand(3).getImm())) {
+ default: llvm_unreachable("unexpected opcode!");
+ case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRrr : ARM::t2ASRrr; break;
+ case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRrr : ARM::t2LSRrr; break;
+ case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLrr : ARM::t2LSLrr; break;
+ case ARM_AM::ror: newOpc = isNarrow ? ARM::tROR : ARM::t2RORrr; break;
+ }
+ TmpInst.setOpcode(newOpc);
+ TmpInst.addOperand(Inst.getOperand(0)); // Rd
+ if (isNarrow)
+ TmpInst.addOperand(MCOperand::createReg(
+ Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // Rm
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ if (!isNarrow)
+ TmpInst.addOperand(MCOperand::createReg(
+ Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+ Inst = TmpInst;
+ return true;
+ }
+ case ARM::t2MOVsi:
+ case ARM::t2MOVSsi: {
+ // Which instruction to expand to depends on the CCOut operand and
+ // whether we're in an IT block if the register operands are low
+ // registers.
+ bool isNarrow = false;
+ if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi))
+ isNarrow = true;
+ MCInst TmpInst;
+ unsigned newOpc;
+ switch(ARM_AM::getSORegShOp(Inst.getOperand(2).getImm())) {
+ default: llvm_unreachable("unexpected opcode!");
+ case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
+ case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
+ case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
+ case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
+ case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
+ }
+ unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm());
+ if (Amount == 32) Amount = 0;
+ TmpInst.setOpcode(newOpc);
+ TmpInst.addOperand(Inst.getOperand(0)); // Rd
+ if (isNarrow)
+ TmpInst.addOperand(MCOperand::createReg(
+ Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ if (newOpc != ARM::t2RRX)
+ TmpInst.addOperand(MCOperand::createImm(Amount));
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ if (!isNarrow)
+ TmpInst.addOperand(MCOperand::createReg(
+ Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+ Inst = TmpInst;
+ return true;
+ }
+ // Handle the ARM mode MOV complex aliases.
+ case ARM::ASRr:
+ case ARM::LSRr:
+ case ARM::LSLr:
+ case ARM::RORr: {
+ ARM_AM::ShiftOpc ShiftTy;
+ switch(Inst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode!");
+ case ARM::ASRr: ShiftTy = ARM_AM::asr; break;
+ case ARM::LSRr: ShiftTy = ARM_AM::lsr; break;
+ case ARM::LSLr: ShiftTy = ARM_AM::lsl; break;
+ case ARM::RORr: ShiftTy = ARM_AM::ror; break;
+ }
+ unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, 0);
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::MOVsr);
+ TmpInst.addOperand(Inst.getOperand(0)); // Rd
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // Rm
+ TmpInst.addOperand(MCOperand::createImm(Shifter)); // Shift value and ty
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ TmpInst.addOperand(Inst.getOperand(5)); // cc_out
+ Inst = TmpInst;
+ return true;
+ }
+ case ARM::ASRi:
+ case ARM::LSRi:
+ case ARM::LSLi:
+ case ARM::RORi: {
+ ARM_AM::ShiftOpc ShiftTy;
+ switch(Inst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode!");
+ case ARM::ASRi: ShiftTy = ARM_AM::asr; break;
+ case ARM::LSRi: ShiftTy = ARM_AM::lsr; break;
+ case ARM::LSLi: ShiftTy = ARM_AM::lsl; break;
+ case ARM::RORi: ShiftTy = ARM_AM::ror; break;
+ }
+ // A shift by zero is a plain MOVr, not a MOVsi.
+ unsigned Amt = Inst.getOperand(2).getImm();
+ unsigned Opc = Amt == 0 ? ARM::MOVr : ARM::MOVsi;
+ // A shift by 32 should be encoded as 0 when permitted
+ if (Amt == 32 && (ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr))
+ Amt = 0;
+ unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, Amt);
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opc);
+ TmpInst.addOperand(Inst.getOperand(0)); // Rd
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ if (Opc == ARM::MOVsi)
+ TmpInst.addOperand(MCOperand::createImm(Shifter)); // Shift value and ty
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ TmpInst.addOperand(Inst.getOperand(5)); // cc_out
+ Inst = TmpInst;
+ return true;
+ }
+ case ARM::RRXi: {
+ unsigned Shifter = ARM_AM::getSORegOpc(ARM_AM::rrx, 0);
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::MOVsi);
+ TmpInst.addOperand(Inst.getOperand(0)); // Rd
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(MCOperand::createImm(Shifter)); // Shift value and ty
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4)); // cc_out
+ Inst = TmpInst;
+ return true;
+ }
+ case ARM::t2LDMIA_UPD: {
+ // If this is a load of a single register, then we should use
+ // a post-indexed LDR instruction instead, per the ARM ARM.
+ if (Inst.getNumOperands() != 5)
+ return false;
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::t2LDR_POST);
+ TmpInst.addOperand(Inst.getOperand(4)); // Rt
+ TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(MCOperand::createImm(4));
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
+ case ARM::t2STMDB_UPD: {
+ // If this is a store of a single register, then we should use
+ // a pre-indexed STR instruction instead, per the ARM ARM.
+ if (Inst.getNumOperands() != 5)
+ return false;
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::t2STR_PRE);
+ TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(4)); // Rt
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(MCOperand::createImm(-4));
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
+ case ARM::LDMIA_UPD:
+ // If this is a load of a single register via a 'pop', then we should use
+ // a post-indexed LDR instruction instead, per the ARM ARM.
+ if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "pop" &&
+ Inst.getNumOperands() == 5) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::LDR_POST_IMM);
+ TmpInst.addOperand(Inst.getOperand(4)); // Rt
+ TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(MCOperand::createReg(0)); // am2offset
+ TmpInst.addOperand(MCOperand::createImm(4));
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
+ break;
+ case ARM::STMDB_UPD:
+ // If this is a store of a single register via a 'push', then we should use
+ // a pre-indexed STR instruction instead, per the ARM ARM.
+ if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "push" &&
+ Inst.getNumOperands() == 5) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::STR_PRE_IMM);
+ TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(4)); // Rt
+ TmpInst.addOperand(Inst.getOperand(1)); // addrmode_imm12
+ TmpInst.addOperand(MCOperand::createImm(-4));
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ }
+ break;
+ case ARM::t2ADDri12:
+ // If the immediate fits for encoding T3 (t2ADDri) and the generic "add"
+ // mnemonic was used (not "addw"), encoding T3 is preferred.
+ if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "add" ||
+ ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
+ break;
+ Inst.setOpcode(ARM::t2ADDri);
+ Inst.addOperand(MCOperand::createReg(0)); // cc_out
+ break;
+ case ARM::t2SUBri12:
+ // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub"
+ // mnemonic was used (not "subw"), encoding T3 is preferred.
+ if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "sub" ||
+ ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
+ break;
+ Inst.setOpcode(ARM::t2SUBri);
+ Inst.addOperand(MCOperand::createReg(0)); // cc_out
+ break;
+ case ARM::tADDi8:
+ // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+ // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+ // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+ // to encoding T1 if <Rd> is omitted."
+ if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+ Inst.setOpcode(ARM::tADDi3);
+ return true;
+ }
+ break;
+ case ARM::tSUBi8:
+ // If the immediate is in the range 0-7, we want tSUBi3 iff Rd was
+ // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+ // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+ // to encoding T1 if <Rd> is omitted."
+ if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+ Inst.setOpcode(ARM::tSUBi3);
+ return true;
+ }
+ break;
+ case ARM::t2ADDri:
+ case ARM::t2SUBri: {
+ // If the destination and first source operand are the same, and
+ // the flags are compatible with the current IT status, use encoding T2
+ // instead of T3. For compatibility with the system 'as'. Make sure the
+ // wide encoding wasn't explicit.
+ if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
+ !isARMLowRegister(Inst.getOperand(0).getReg()) ||
+ (unsigned)Inst.getOperand(2).getImm() > 255 ||
+ ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) ||
+ (inITBlock() && Inst.getOperand(5).getReg() != 0)) ||
+ (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+ break;
+ MCInst TmpInst;
+ TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ?
+ ARM::tADDi8 : ARM::tSUBi8);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(5));
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+ case ARM::t2ADDrr: {
+ // If the destination and first source operand are the same, and
+ // there's no setting of the flags, use encoding T2 instead of T3.
+ // Note that this is only for ADD, not SUB. This mirrors the system
+ // 'as' behaviour. Also take advantage of ADD being commutative.
+ // Make sure the wide encoding wasn't explicit.
+ bool Swap = false;
+ auto DestReg = Inst.getOperand(0).getReg();
+ bool Transform = DestReg == Inst.getOperand(1).getReg();
+ if (!Transform && DestReg == Inst.getOperand(2).getReg()) {
+ Transform = true;
+ Swap = true;
+ }
+ if (!Transform ||
+ Inst.getOperand(5).getReg() != 0 ||
+ (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+ break;
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::tADDhirr);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(Swap ? 1 : 2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+ case ARM::tADDrSP: {
+ // If the non-SP source operand and the destination operand are not the
+ // same, we need to use the 32-bit encoding if it's available.
+ if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+ Inst.setOpcode(ARM::t2ADDrr);
+ Inst.addOperand(MCOperand::createReg(0)); // cc_out
+ return true;
+ }
+ break;
+ }
+ case ARM::tB:
+ // A Thumb conditional branch outside of an IT block is a tBcc.
+ if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) {
+ Inst.setOpcode(ARM::tBcc);
+ return true;
+ }
+ break;
+ case ARM::t2B:
+ // A Thumb2 conditional branch outside of an IT block is a t2Bcc.
+ if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()){
+ Inst.setOpcode(ARM::t2Bcc);
+ return true;
+ }
+ break;
+ case ARM::t2Bcc:
+ // If the conditional is AL or we're in an IT block, we really want t2B.
+ if (Inst.getOperand(1).getImm() == ARMCC::AL || inITBlock()) {
+ Inst.setOpcode(ARM::t2B);
+ return true;
+ }
+ break;
+ case ARM::tBcc:
+ // If the conditional is AL, we really want tB.
+ if (Inst.getOperand(1).getImm() == ARMCC::AL) {
+ Inst.setOpcode(ARM::tB);
+ return true;
+ }
+ break;
+ case ARM::tLDMIA: {
+ // If the register list contains any high registers, or if the writeback
+ // doesn't match what tLDMIA can do, we need to use the 32-bit encoding
+ // instead if we're in Thumb2. Otherwise, this should have generated
+ // an error in validateInstruction().
+ unsigned Rn = Inst.getOperand(0).getReg();
+ bool hasWritebackToken =
+ (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+ bool listContainsBase;
+ if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) ||
+ (!listContainsBase && !hasWritebackToken) ||
+ (listContainsBase && hasWritebackToken)) {
+ // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+ assert (isThumbTwo());
+ Inst.setOpcode(hasWritebackToken ? ARM::t2LDMIA_UPD : ARM::t2LDMIA);
+ // If we're switching to the updating version, we need to insert
+ // the writeback tied operand.
+ if (hasWritebackToken)
+ Inst.insert(Inst.begin(),
+ MCOperand::createReg(Inst.getOperand(0).getReg()));
+ return true;
+ }
+ break;
+ }
+ case ARM::tSTMIA_UPD: {
+ // If the register list contains any high registers, we need to use
+ // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+ // should have generated an error in validateInstruction().
+ unsigned Rn = Inst.getOperand(0).getReg();
+ bool listContainsBase;
+ if (checkLowRegisterList(Inst, 4, Rn, 0, listContainsBase)) {
+ // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+ assert (isThumbTwo());
+ Inst.setOpcode(ARM::t2STMIA_UPD);
+ return true;
+ }
+ break;
+ }
+ case ARM::tPOP: {
+ bool listContainsBase;
+ // If the register list contains any high registers, we need to use
+ // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+ // should have generated an error in validateInstruction().
+ if (!checkLowRegisterList(Inst, 2, 0, ARM::PC, listContainsBase))
+ return false;
+ assert (isThumbTwo());
+ Inst.setOpcode(ARM::t2LDMIA_UPD);
+ // Add the base register and writeback operands.
+ Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP));
+ Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP));
+ return true;
+ }
+ case ARM::tPUSH: {
+ bool listContainsBase;
+ if (!checkLowRegisterList(Inst, 2, 0, ARM::LR, listContainsBase))
+ return false;
+ assert (isThumbTwo());
+ Inst.setOpcode(ARM::t2STMDB_UPD);
+ // Add the base register and writeback operands.
+ Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP));
+ Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP));
+ return true;
+ }
+ case ARM::t2MOVi: {
+ // If we can use the 16-bit encoding and the user didn't explicitly
+ // request the 32-bit variant, transform it here.
+ if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ (unsigned)Inst.getOperand(1).getImm() <= 255 &&
+ ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL &&
+ Inst.getOperand(4).getReg() == ARM::CPSR) ||
+ (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
+ (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+ static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+ // The operands aren't in the same order for tMOVi8...
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::tMOVi8);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(4));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
+ break;
+ }
+ case ARM::t2MOVr: {
+ // If we can use the 16-bit encoding and the user didn't explicitly
+ // request the 32-bit variant, transform it here.
+ if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ Inst.getOperand(2).getImm() == ARMCC::AL &&
+ Inst.getOperand(4).getReg() == ARM::CPSR &&
+ (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+ static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+ // The operands aren't the same for tMOV[S]r... (no cc_out)
+ MCInst TmpInst;
+ TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
+ break;
+ }
+ case ARM::t2SXTH:
+ case ARM::t2SXTB:
+ case ARM::t2UXTH:
+ case ARM::t2UXTB: {
+ // If we can use the 16-bit encoding and the user didn't explicitly
+ // request the 32-bit variant, transform it here.
+ if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ Inst.getOperand(2).getImm() == 0 &&
+ (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+ static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Illegal opcode!");
+ case ARM::t2SXTH: NewOpc = ARM::tSXTH; break;
+ case ARM::t2SXTB: NewOpc = ARM::tSXTB; break;
+ case ARM::t2UXTH: NewOpc = ARM::tUXTH; break;
+ case ARM::t2UXTB: NewOpc = ARM::tUXTB; break;
+ }
+ // The operands aren't the same for thumb1 (no rotate operand).
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+ break;
+ }
+ case ARM::MOVsi: {
+ ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm());
+ // rrx shifts and asr/lsr of #32 is encoded as 0
+ if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr)
+ return false;
+ if (ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()) == 0) {
+ // Shifting by zero is accepted as a vanilla 'MOVr'
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::MOVr);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+ return false;
+ }
+ case ARM::ANDrsi:
+ case ARM::ORRrsi:
+ case ARM::EORrsi:
+ case ARM::BICrsi:
+ case ARM::SUBrsi:
+ case ARM::ADDrsi: {
+ unsigned newOpc;
+ ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(3).getImm());
+ if (SOpc == ARM_AM::rrx) return false;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode!");
+ case ARM::ANDrsi: newOpc = ARM::ANDrr; break;
+ case ARM::ORRrsi: newOpc = ARM::ORRrr; break;
+ case ARM::EORrsi: newOpc = ARM::EORrr; break;
+ case ARM::BICrsi: newOpc = ARM::BICrr; break;
+ case ARM::SUBrsi: newOpc = ARM::SUBrr; break;
+ case ARM::ADDrsi: newOpc = ARM::ADDrr; break;
+ }
+ // If the shift is by zero, use the non-shifted instruction definition.
+ // The exception is for right shifts, where 0 == 32
+ if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0 &&
+ !(SOpc == ARM_AM::lsr || SOpc == ARM_AM::asr)) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(newOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(4));
+ TmpInst.addOperand(Inst.getOperand(5));
+ TmpInst.addOperand(Inst.getOperand(6));
+ Inst = TmpInst;
+ return true;
+ }
+ return false;
+ }
+ case ARM::ITasm:
+ case ARM::t2IT: {
+ MCOperand &MO = Inst.getOperand(1);
+ unsigned Mask = MO.getImm();
+ ARMCC::CondCodes Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm());
+
+ // Set up the IT block state according to the IT instruction we just
+ // matched.
+ assert(!inITBlock() && "nested IT blocks?!");
+ startExplicitITBlock(Cond, Mask);
+ MO.setImm(getITMaskEncoding());
+ break;
+ }
+ case ARM::t2LSLrr:
+ case ARM::t2LSRrr:
+ case ARM::t2ASRrr:
+ case ARM::t2SBCrr:
+ case ARM::t2RORrr:
+ case ARM::t2BICrr:
+ {
+ // Assemblers should use the narrow encodings of these instructions when permissible.
+ if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ isARMLowRegister(Inst.getOperand(2).getReg())) &&
+ Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+ ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+ (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+ (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
+ !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
+ ".w"))) {
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode");
+ case ARM::t2LSLrr: NewOpc = ARM::tLSLrr; break;
+ case ARM::t2LSRrr: NewOpc = ARM::tLSRrr; break;
+ case ARM::t2ASRrr: NewOpc = ARM::tASRrr; break;
+ case ARM::t2SBCrr: NewOpc = ARM::tSBC; break;
+ case ARM::t2RORrr: NewOpc = ARM::tROR; break;
+ case ARM::t2BICrr: NewOpc = ARM::tBIC; break;
+ }
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(5));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+ return false;
+ }
+ case ARM::t2ANDrr:
+ case ARM::t2EORrr:
+ case ARM::t2ADCrr:
+ case ARM::t2ORRrr:
+ {
+ // Assemblers should use the narrow encodings of these instructions when permissible.
+ // These instructions are special in that they are commutable, so shorter encodings
+ // are available more often.
+ if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ isARMLowRegister(Inst.getOperand(2).getReg())) &&
+ (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
+ Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
+ ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+ (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+ (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
+ !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
+ ".w"))) {
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode");
+ case ARM::t2ADCrr: NewOpc = ARM::tADC; break;
+ case ARM::t2ANDrr: NewOpc = ARM::tAND; break;
+ case ARM::t2EORrr: NewOpc = ARM::tEOR; break;
+ case ARM::t2ORRrr: NewOpc = ARM::tORR; break;
+ }
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(5));
+ if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) {
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ } else {
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(1));
+ }
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+ return false;
+ }
+ }
+ return false;
+}
+
+// Performs extra legality checks on an already-matched instruction that
+// depend on the current mode (Thumb1/Thumb2), the available architecture
+// features and whether we are inside an IT block.
+//
+// Returns Match_Success when the instruction is acceptable, otherwise one of
+// Match_RequiresFlagSetting, Match_RequiresITBlock, Match_RequiresNotITBlock,
+// Match_RequiresThumb2, Match_RequiresV6, Match_RequiresV8 or
+// Match_InvalidOperand describing why it was rejected.
+unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+  // 16-bit thumb arithmetic instructions either require or preclude the 'S'
+  // suffix depending on whether they're in an IT block or not.
+  unsigned Opc = Inst.getOpcode();
+  const MCInstrDesc &MCID = MII.get(Opc);
+  if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
+    assert(MCID.hasOptionalDef() &&
+           "optionally flag setting instruction missing optional def operand");
+    assert(MCID.NumOperands == Inst.getNumOperands() &&
+           "operand count mismatch!");
+    // Find the optional-def operand (cc_out). The bound is tested before
+    // OpInfo is indexed so the scan never reads past the operand table.
+    unsigned OpNo;
+    for (OpNo = 0;
+         OpNo < MCID.NumOperands && !MCID.OpInfo[OpNo].isOptionalDef();
+         ++OpNo)
+      ;
+    // If we're parsing Thumb1, reject it completely.
+    if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR)
+      return Match_RequiresFlagSetting;
+    // If we're parsing Thumb2, which form is legal depends on whether we're
+    // in an IT block.
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR &&
+        !inITBlock())
+      return Match_RequiresITBlock;
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
+        inITBlock())
+      return Match_RequiresNotITBlock;
+  } else if (isThumbOne()) {
+    // Some high-register supporting Thumb1 encodings only allow both registers
+    // to be from r0-r7 when in Thumb2.
+    if (Opc == ARM::tADDhirr && !hasV6MOps() &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        isARMLowRegister(Inst.getOperand(2).getReg()))
+      return Match_RequiresThumb2;
+    // Others only require ARMv6 or later.
+    else if (Opc == ARM::tMOVr && !hasV6Ops() &&
+             isARMLowRegister(Inst.getOperand(0).getReg()) &&
+             isARMLowRegister(Inst.getOperand(1).getReg()))
+      return Match_RequiresV6;
+  }
+
+  // Operands constrained to rGPR get an explicit SP/PC check here.
+  for (unsigned I = 0; I < MCID.NumOperands; ++I)
+    if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) {
+      // rGPRRegClass excludes PC, and also excluded SP before ARMv8
+      if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops())
+        return Match_RequiresV8;
+      else if (Inst.getOperand(I).getReg() == ARM::PC)
+        return Match_InvalidOperand;
+    }
+
+  return Match_Success;
+}
+
+namespace llvm {
+// MCInst specialization of IsCPSRDead: instructions coming from assembly
+// source are taken at face value, so CPSR is always reported as dead.
+template <>
+inline bool IsCPSRDead<MCInst>(MCInst * /*Instr*/) {
+  return true;
+}
+} // end namespace llvm
+
+// Reports whether Inst would be unpredictable anywhere in an IT block other
+// than the last slot, i.e. whether it has to be the final instruction of the
+// block.
+bool ARMAsmParser::isITBlockTerminator(MCInst &Inst) const {
+  const unsigned Opcode = Inst.getOpcode();
+  const MCInstrDesc &Desc = MII.get(Opcode);
+
+  // Every kind of branch, call or return ends the block.
+  const bool IsControlFlow = Desc.isTerminator() || Desc.isCall() ||
+                             Desc.isReturn() || Desc.isBranch() ||
+                             Desc.isIndirectBranch();
+  if (IsControlFlow)
+    return true;
+
+  // So does any instruction with PC among its explicit definitions...
+  for (unsigned DefIdx = 0, NumDefs = Desc.getNumDefs(); DefIdx != NumDefs;
+       ++DefIdx) {
+    const MCOperand &Def = Inst.getOperand(DefIdx);
+    if (Def.isReg() && Def.getReg() == ARM::PC)
+      return true;
+  }
+
+  // ...or among its implicit ones.
+  if (Desc.hasImplicitDefOfPhysReg(ARM::PC, MRI))
+    return true;
+
+  // Finally, instructions that write to a variable register list. Only Thumb
+  // opcodes matter here because ARM instructions cannot sit in an IT block.
+  // ListStart is the operand index passed to listContainsReg for the opcode.
+  unsigned ListStart;
+  switch (Opcode) {
+  case ARM::t2LDMIA:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMDB_UPD:
+    ListStart = 3;
+    break;
+  case ARM::tPOP:
+    ListStart = 2;
+    break;
+  default:
+    return false;
+  }
+
+  return listContainsReg(Inst, ListStart, ARM::PC);
+}
+
+// Matches Operands into Inst, trying in turn: an extension of the currently
+// open implicit IT block, no IT block at all, and a freshly started implicit
+// IT block. EmitInITBlock is set to true when the instruction was matched as
+// part of an implicit IT block and to false otherwise (it is left untouched
+// when implicit IT blocks cannot be used). The return value is the matcher
+// result; on failure it is the result of the attempt made outside any IT
+// block.
+unsigned ARMAsmParser::MatchInstruction(OperandVector &Operands, MCInst &Inst,
+                                        uint64_t &ErrorInfo,
+                                        bool MatchingInlineAsm,
+                                        bool &EmitInITBlock,
+                                        MCStreamer &Out) {
+  // Implicit IT blocks are only an option in Thumb2, when enabled, and when
+  // no explicit IT block is open. Otherwise this is a plain match.
+  const bool CanUseImplicitIT =
+      !inExplicitITBlock() && isThumbTwo() && useImplicitITThumb();
+  if (!CanUseImplicitIT)
+    return MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+  // Reads the condition code out of the instruction that was just matched.
+  auto matchedCond = [&Inst](const MCInstrDesc &Desc) {
+    return (ARMCC::CondCodes)Inst.getOperand(Desc.findFirstPredOperandIdx())
+        .getImm();
+  };
+
+  // Attempt 1: grow the implicit IT block that is already open, if any.
+  if (inImplicitITBlock()) {
+    extendImplicitITBlock(ITState.Cond);
+    const unsigned ExtendedResult =
+        MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+    if (ExtendedResult == Match_Success) {
+      // A successful match is not enough: the instruction's condition has to
+      // be the block's condition or its exact opposite.
+      const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+      if (Desc.isPredicable()) {
+        const ARMCC::CondCodes InstCond = matchedCond(Desc);
+        const ARMCC::CondCodes BlockCond = currentITCond();
+        const bool SameCond = InstCond == BlockCond;
+        if (SameCond ||
+            InstCond == ARMCC::getOppositeCondition(BlockCond)) {
+          if (!SameCond)
+            invertCurrentITCondition();
+          EmitInITBlock = true;
+          return Match_Success;
+        }
+      }
+    }
+    rewindImplicitITPosition();
+  }
+
+  // Attempt 2: close whatever block is pending and match with no IT block.
+  flushPendingInstructions(Out);
+  const unsigned PlainMatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+  if (PlainMatchResult == Match_Success) {
+    const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+    bool NeedsITBlock = false;
+    if (Desc.isPredicable()) {
+      const ARMCC::CondCodes InstCond = matchedCond(Desc);
+      // tBcc/t2Bcc carry their own condition field, so they may execute
+      // conditionally without an IT block.
+      const unsigned Opcode = Inst.getOpcode();
+      const bool HasOwnCondField =
+          Opcode == ARM::tBcc || Opcode == ARM::t2Bcc;
+      NeedsITBlock = !HasOwnCondField && InstCond != ARMCC::AL;
+    }
+    if (!NeedsITBlock) {
+      EmitInITBlock = false;
+      return Match_Success;
+    }
+  }
+
+  // Attempt 3: open a new implicit IT block. The matcher ignores the actual
+  // condition, so the block starts with a placeholder that is overwritten
+  // once the instruction's real condition is known.
+  startImplicitITBlock();
+  if (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm) ==
+      Match_Success) {
+    const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+    if (Desc.isPredicable()) {
+      ITState.Cond = matchedCond(Desc);
+      EmitInITBlock = true;
+      return Match_Success;
+    }
+  }
+  discardImplicitITBlock();
+
+  // Nothing worked; report the diagnostic from the attempt made outside any
+  // IT block.
+  EmitInITBlock = false;
+  return PlainMatchResult;
+}
+
+static const char *getSubtargetFeatureName(uint64_t Val);
+bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out, uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+ unsigned MatchResult;
+ bool PendConditionalInstruction = false;
+
+ MatchResult = MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
+ PendConditionalInstruction, Out);
+
+ switch (MatchResult) {
+ case Match_Success:
+ // Context sensitive operand constraints aren't handled by the matcher,
+ // so check them here.
+ if (validateInstruction(Inst, Operands)) {
+ // Still progress the IT block, otherwise one wrong condition causes
+ // nasty cascading errors.
+ forwardITPosition();
+ return true;
+ }
+
+ { // processInstruction() updates inITBlock state, we need to save it away
+ bool wasInITBlock = inITBlock();
+
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other. E.g.,
+ // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2LDR_POST(sp,r8)
+ while (processInstruction(Inst, Operands, Out))
+ ;
+
+ // Only after the instruction is fully processed, we can validate it
+ if (wasInITBlock && hasV8Ops() && isThumb() &&
+ !isV8EligibleForIT(&Inst)) {
+ Warning(IDLoc, "deprecated instruction in IT block");
+ }
+ }
+
+ // Only move forward at the very end so that everything in validate
+ // and process gets a consistent answer about whether we're in an IT
+ // block.
+ forwardITPosition();
+
+ // ITasm is an ARM mode pseudo-instruction that just sets the ITblock and
+ // doesn't actually encode.
+ if (Inst.getOpcode() == ARM::ITasm)
+ return false;
+
+ Inst.setLoc(IDLoc);
+ if (PendConditionalInstruction) {
+ PendingConditionalInsts.push_back(Inst);
+ if (isITBlockFull() || isITBlockTerminator(Inst))
+ flushPendingInstructions(Out);
+ } else {
+ Out.EmitInstruction(Inst, getSTI());
+ }
+ return false;
+ case Match_MissingFeature: {
+ assert(ErrorInfo && "Unknown missing feature!");
+ // Special case the error message for the very common case where only
+ // a single subtarget feature is missing (Thumb vs. ARM, e.g.).
+ std::string Msg = "instruction requires:";
+ uint64_t Mask = 1;
+ for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+ if (ErrorInfo & Mask) {
+ Msg += " ";
+ Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+ }
+ Mask <<= 1;
+ }
+ return Error(IDLoc, Msg);
+ }
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ }
+
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ case Match_MnemonicFail:
+ return Error(IDLoc, "invalid instruction",
+ ((ARMOperand &)*Operands[0]).getLocRange());
+ case Match_RequiresNotITBlock:
+ return Error(IDLoc, "flag setting instruction only valid outside IT block");
+ case Match_RequiresITBlock:
+ return Error(IDLoc, "instruction only valid inside IT block");
+ case Match_RequiresV6:
+ return Error(IDLoc, "instruction variant requires ARMv6 or later");
+ case Match_RequiresThumb2:
+ return Error(IDLoc, "instruction variant requires Thumb2");
+ case Match_RequiresV8:
+ return Error(IDLoc, "instruction variant requires ARMv8 or later");
+ case Match_RequiresFlagSetting:
+ return Error(IDLoc, "no flag-preserving variant of this instruction available");
+ case Match_ImmRange0_15: {
+ SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
+ }
+ case Match_ImmRange0_239: {
+ SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
+ }
+ case Match_AlignedMemoryRequiresNone:
+ case Match_DupAlignedMemoryRequiresNone:
+ case Match_AlignedMemoryRequires16:
+ case Match_DupAlignedMemoryRequires16:
+ case Match_AlignedMemoryRequires32:
+ case Match_DupAlignedMemoryRequires32:
+ case Match_AlignedMemoryRequires64:
+ case Match_DupAlignedMemoryRequires64:
+ case Match_AlignedMemoryRequires64or128:
+ case Match_DupAlignedMemoryRequires64or128:
+ case Match_AlignedMemoryRequires64or128or256:
+ {
+ SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getAlignmentLoc();
+ if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ switch (MatchResult) {
+ default:
+ llvm_unreachable("Missing Match_Aligned type");
+ case Match_AlignedMemoryRequiresNone:
+ case Match_DupAlignedMemoryRequiresNone:
+ return Error(ErrorLoc, "alignment must be omitted");
+ case Match_AlignedMemoryRequires16:
+ case Match_DupAlignedMemoryRequires16:
+ return Error(ErrorLoc, "alignment must be 16 or omitted");
+ case Match_AlignedMemoryRequires32:
+ case Match_DupAlignedMemoryRequires32:
+ return Error(ErrorLoc, "alignment must be 32 or omitted");
+ case Match_AlignedMemoryRequires64:
+ case Match_DupAlignedMemoryRequires64:
+ return Error(ErrorLoc, "alignment must be 64 or omitted");
+ case Match_AlignedMemoryRequires64or128:
+ case Match_DupAlignedMemoryRequires64or128:
+ return Error(ErrorLoc, "alignment must be 64, 128 or omitted");
+ case Match_AlignedMemoryRequires64or128or256:
+ return Error(ErrorLoc, "alignment must be 64, 128, 256 or omitted");
+ }
+ }
+ }
+
+ llvm_unreachable("Implement any new match types added!");
+}
+
+ /// ParseDirective - dispatch an ARM specific directive to its handler.
+ /// Returns false once a handler has been invoked and true for directives
+ /// that are not handled here.  '.align' is the exception: the result of its
+ /// handler is returned directly.
+ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
+   const StringRef Directive = DirectiveID.getIdentifier();
+   const SMLoc Loc = DirectiveID.getLoc();
+
+   if (Directive == ".align")
+     return parseDirectiveAlign(Loc); // Use Generic on failure.
+
+   // Directives accepted for every object file format.
+   if (Directive == ".word")
+     parseLiteralValues(4, Loc);
+   else if (Directive == ".short" || Directive == ".hword")
+     parseLiteralValues(2, Loc);
+   else if (Directive == ".thumb")
+     parseDirectiveThumb(Loc);
+   else if (Directive == ".arm")
+     parseDirectiveARM(Loc);
+   else if (Directive == ".thumb_func")
+     parseDirectiveThumbFunc(Loc);
+   else if (Directive == ".code")
+     parseDirectiveCode(Loc);
+   else if (Directive == ".syntax")
+     parseDirectiveSyntax(Loc);
+   else if (Directive == ".unreq")
+     parseDirectiveUnreq(Loc);
+   else if (Directive == ".fnend")
+     parseDirectiveFnEnd(Loc);
+   else if (Directive == ".cantunwind")
+     parseDirectiveCantUnwind(Loc);
+   else if (Directive == ".personality")
+     parseDirectivePersonality(Loc);
+   else if (Directive == ".handlerdata")
+     parseDirectiveHandlerData(Loc);
+   else if (Directive == ".setfp")
+     parseDirectiveSetFP(Loc);
+   else if (Directive == ".pad")
+     parseDirectivePad(Loc);
+   else if (Directive == ".save")
+     parseDirectiveRegSave(Loc, false);
+   else if (Directive == ".vsave")
+     parseDirectiveRegSave(Loc, true);
+   else if (Directive == ".ltorg" || Directive == ".pool")
+     parseDirectiveLtorg(Loc);
+   else if (Directive == ".even")
+     parseDirectiveEven(Loc);
+   else if (Directive == ".personalityindex")
+     parseDirectivePersonalityIndex(Loc);
+   else if (Directive == ".unwind_raw")
+     parseDirectiveUnwindRaw(Loc);
+   else if (Directive == ".movsp")
+     parseDirectiveMovSP(Loc);
+   else if (Directive == ".arch_extension")
+     parseDirectiveArchExtension(Loc);
+   else if (Directive == ".thumb_set")
+     parseDirectiveThumbSet(Loc);
+   else {
+     // The remaining directives are rejected for MachO and COFF objects.
+     const MCObjectFileInfo::Environment Format =
+         getContext().getObjectFileInfo()->getObjectFileType();
+     if (Format == MCObjectFileInfo::IsMachO ||
+         Format == MCObjectFileInfo::IsCOFF)
+       return true;
+
+     if (Directive == ".arch")
+       parseDirectiveArch(Loc);
+     else if (Directive == ".cpu")
+       parseDirectiveCPU(Loc);
+     else if (Directive == ".eabi_attribute")
+       parseDirectiveEabiAttr(Loc);
+     else if (Directive == ".fpu")
+       parseDirectiveFPU(Loc);
+     else if (Directive == ".fnstart")
+       parseDirectiveFnStart(Loc);
+     else if (Directive == ".inst")
+       parseDirectiveInst(Loc);
+     else if (Directive == ".inst.n")
+       parseDirectiveInst(Loc, 'n');
+     else if (Directive == ".inst.w")
+       parseDirectiveInst(Loc, 'w');
+     else if (Directive == ".object_arch")
+       parseDirectiveObjectArch(Loc);
+     else if (Directive == ".tlsdescseq")
+       parseDirectiveTLSDescSeq(Loc);
+     else
+       return true;
+   }
+   return false;
+ }
+
+ /// parseLiteralValues
+ /// ::= .hword expression [, expression]*
+ /// ::= .short expression [, expression]*
+ /// ::= .word expression [, expression]*
+ ///
+ /// Each parsed expression is emitted as a value of \p Size bytes, using
+ /// \p L as its location.  Returns true if parsing failed.
+ bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   auto EmitOneValue = [&]() -> bool {
+     const MCExpr *Expr;
+     if (Parser.parseExpression(Expr))
+       return true;
+     Parser.getStreamer().EmitValue(Expr, Size, L);
+     return false;
+   };
+   return parseMany(EmitOneValue);
+ }
+
+ /// parseDirectiveThumb
+ /// ::= .thumb
+ ///
+ /// Switches the parser to Thumb mode if it is not already in it and emits
+ /// the MCAF_Code16 assembler flag.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
+   if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+     return true;
+   if (!hasThumb())
+     return Error(L, "target does not support Thumb mode");
+
+   const bool AlreadyThumb = isThumb();
+   if (!AlreadyThumb)
+     SwitchMode();
+
+   getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+   return false;
+ }
+
+ /// parseDirectiveARM
+ /// ::= .arm
+ ///
+ /// Switches the parser to ARM mode if it is currently in Thumb mode and
+ /// emits the MCAF_Code32 assembler flag.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
+   if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+     return true;
+   if (!hasARM())
+     return Error(L, "target does not support ARM mode");
+
+   const bool CurrentlyThumb = isThumb();
+   if (CurrentlyThumb)
+     SwitchMode();
+
+   getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+   return false;
+ }
+
+ /// Label hook: flushes pending instructions and, when a preceding
+ /// '.thumb_func' without a symbol name asked for it, marks \p Symbol as a
+ /// Thumb function.
+ void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
+   // It is not legal to branch into an IT block, so the current implicit IT
+   // block has to be flushed whenever a label is seen.
+   flushPendingInstructions(getStreamer());
+
+   if (!NextSymbolIsThumb)
+     return;
+
+   getParser().getStreamer().EmitThumbFunc(Symbol);
+   NextSymbolIsThumb = false;
+ }
+
+ /// parseDirectiveThumbFunc
+ /// ::= .thumb_func [symbol_name]
+ ///
+ /// For MachO the directive may be followed by a symbol name, which is then
+ /// marked as a Thumb function right away.  Otherwise the marking is deferred
+ /// to the next label via NextSymbolIsThumb.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   const bool IsMachO =
+       getContext().getObjectFileInfo()->getObjectFileType() ==
+       MCObjectFileInfo::IsMachO;
+
+   // Darwin asm has (optionally) a function name after the .thumb_func
+   // directive, ELF doesn't.
+   const bool HasSymbolName =
+       IsMachO && (Parser.getTok().is(AsmToken::Identifier) ||
+                   Parser.getTok().is(AsmToken::String));
+
+   if (HasSymbolName) {
+     MCSymbol *Func = getParser().getContext().getOrCreateSymbol(
+         Parser.getTok().getIdentifier());
+     getParser().getStreamer().EmitThumbFunc(Func);
+     Parser.Lex();
+     return parseToken(AsmToken::EndOfStatement,
+                       "unexpected token in '.thumb_func' directive");
+   }
+
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.thumb_func' directive"))
+     return true;
+
+   NextSymbolIsThumb = true;
+   return false;
+ }
+
+ /// parseDirectiveSyntax
+ /// ::= .syntax unified | divided
+ ///
+ /// Only the unified syntax is accepted; 'divided' is diagnosed as
+ /// unsupported.  Returns true if an error was reported.
+ bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   const AsmToken &Tok = Parser.getTok();
+   // Propagate the failure, as every other error path of this directive does,
+   // instead of reporting an error and then claiming success.
+   if (Tok.isNot(AsmToken::Identifier))
+     return Error(L, "unexpected token in .syntax directive");
+
+   StringRef Mode = Tok.getString();
+   Parser.Lex();
+   if (check(Mode == "divided" || Mode == "DIVIDED", L,
+             "'.syntax divided' arm assembly not supported") ||
+       check(Mode != "unified" && Mode != "UNIFIED", L,
+             "unrecognized syntax mode in .syntax directive") ||
+       parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+     return true;
+
+   // TODO tell the MC streamer the mode
+   // getParser().getStreamer().Emit???();
+   return false;
+ }
+
+ /// parseDirectiveCode
+ /// ::= .code 16 | 32
+ ///
+ /// '.code 16' selects Thumb mode and '.code 32' selects ARM mode, provided
+ /// the target supports the requested mode.  Returns true if an error was
+ /// reported.
+ bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   const AsmToken &Tok = Parser.getTok();
+   if (Tok.isNot(AsmToken::Integer))
+     return Error(L, "unexpected token in .code directive");
+   int64_t Val = Parser.getTok().getIntVal();
+   // Propagate the failure like the other error paths of this directive
+   // instead of reporting an error and then claiming success.
+   if (Val != 16 && Val != 32)
+     return Error(L, "invalid operand to .code directive");
+   Parser.Lex();
+
+   if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+     return true;
+
+   if (Val == 16) {
+     if (!hasThumb())
+       return Error(L, "target does not support Thumb mode");
+
+     if (!isThumb())
+       SwitchMode();
+     getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+   } else {
+     if (!hasARM())
+       return Error(L, "target does not support ARM mode");
+
+     if (isThumb())
+       SwitchMode();
+     getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+   }
+
+   return false;
+ }
+
+ /// parseDirectiveReq
+ /// ::= name .req registername
+ ///
+ /// Records \p Name as an alias of the parsed register.  Defining the same
+ /// name again is accepted only when it maps to the same register.  Returns
+ /// true if an error was reported.
+ bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   Parser.Lex(); // Eat the '.req' token.
+   unsigned Reg;
+   SMLoc SRegLoc, ERegLoc;
+   // SRegLoc is handed to ParseRegister to be filled in, so the call has to
+   // finish before SRegLoc is read for the diagnostic.  Passing both to a
+   // single check(...) call left the order of the two unspecified.
+   if (ParseRegister(Reg, SRegLoc, ERegLoc))
+     return Error(SRegLoc, "register name expected");
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected input in .req directive."))
+     return true;
+
+   // insert() leaves an existing entry untouched, so a mismatch here means
+   // the name was previously bound to a different register.
+   if (RegisterReqs.insert(std::make_pair(Name, Reg)).first->second != Reg)
+     return Error(SRegLoc,
+                  "redefinition of '" + Name + "' does not match original.");
+
+   return false;
+ }
+
+ /// parseDirectiveUnreq
+ /// ::= .unreq registername
+ ///
+ /// Removes the lower-cased identifier from the set of register aliases.
+ /// Returns true on error.
+ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   const AsmToken &AliasTok = Parser.getTok();
+   if (!AliasTok.is(AsmToken::Identifier))
+     return Error(L, "unexpected input in .unreq directive.");
+
+   RegisterReqs.erase(AliasTok.getIdentifier().lower());
+   Parser.Lex(); // Eat the identifier.
+   return parseToken(AsmToken::EndOfStatement,
+                     "unexpected input in '.unreq' directive");
+ }
+
+ // After changing arch/CPU, try to put the ARM/Thumb mode back to what it was
+ // before, if supported by the new target, or emit mapping symbols for the mode
+ // switch.
+ void ARMAsmParser::FixModeAfterArchChange(bool WasThumb, SMLoc Loc) {
+   // Nothing to do when the mode did not change.
+   if (WasThumb == isThumb())
+     return;
+
+   // Switch back whenever the new target still supports the previous mode.
+   const bool OldModeSupported = WasThumb ? hasThumb() : hasARM();
+   if (OldModeSupported) {
+     SwitchMode();
+     return;
+   }
+
+   // Mode switch forced, because the new arch doesn't support the old mode.
+   getParser().getStreamer().EmitAssemblerFlag(isThumb() ? MCAF_Code16
+                                                         : MCAF_Code32);
+   // Warn about the implicit mode switch. GAS does not switch modes here,
+   // but instead stays in the old mode, reporting an error on any following
+   // instructions as the mode does not exist on the target.
+   const char *OldMode = WasThumb ? "thumb" : "arm";
+   const char *NewMode = WasThumb ? "arm" : "thumb";
+   Warning(Loc, Twine("new target does not support ") + OldMode +
+                    " mode, switching to " + NewMode + " mode");
+ }
+
+ /// parseDirectiveArch
+ /// ::= .arch token
+ ///
+ /// Resets the subtarget features to those of the named architecture, fixes
+ /// up the ARM/Thumb mode afterwards and forwards the architecture to the
+ /// target streamer.  Returns true if the name is not recognised.
+ bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
+   StringRef Arch = getParser().parseStringToEndOfStatement().trim();
+   unsigned ID = ARM::parseArch(Arch);
+
+   if (ID == ARM::AK_INVALID)
+     return Error(L, "Unknown arch name");
+
+   // Remember the mode in effect before the feature set is replaced.
+   bool WasThumb = isThumb();
+   MCSubtargetInfo &STI = copySTI();
+   STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str());
+   setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+   FixModeAfterArchChange(WasThumb, L);
+
+   getTargetStreamer().emitArch(ID);
+   return false;
+ }
+
+ /// parseDirectiveEabiAttr
+ /// ::= .eabi_attribute int, int [, "str"]
+ /// ::= .eabi_attribute Tag_name, int [, "str"]
+ ///
+ /// The tag may be given by name or as a constant expression.  The tag decides
+ /// whether an integer value, a string value or both are expected.  Returns
+ /// true if an error was reported.
+ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   int64_t Tag;
+   // Location of the tag token, valid for both the named and numeric form.
+   SMLoc TagLoc = Parser.getTok().getLoc();
+   if (Parser.getTok().is(AsmToken::Identifier)) {
+     StringRef Name = Parser.getTok().getIdentifier();
+     Tag = ARMBuildAttrs::AttrTypeFromString(Name);
+     // Propagate the failure like the other error paths of this directive
+     // instead of reporting an error and then claiming success.
+     if (Tag == -1)
+       return Error(TagLoc, "attribute name not recognised: " + Name);
+     Parser.Lex();
+   } else {
+     const MCExpr *AttrExpr;
+
+     if (Parser.parseExpression(AttrExpr))
+       return true;
+
+     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(AttrExpr);
+     if (check(!CE, TagLoc, "expected numeric constant"))
+       return true;
+
+     Tag = CE->getValue();
+   }
+
+   if (Parser.parseToken(AsmToken::Comma, "comma expected"))
+     return true;
+
+   StringRef StringValue = "";
+   bool IsStringValue = false;
+
+   int64_t IntegerValue = 0;
+   bool IsIntegerValue = false;
+
+   // Work out which kind(s) of value the tag takes.
+   if (Tag == ARMBuildAttrs::CPU_raw_name || Tag == ARMBuildAttrs::CPU_name)
+     IsStringValue = true;
+   else if (Tag == ARMBuildAttrs::compatibility) {
+     IsStringValue = true;
+     IsIntegerValue = true;
+   } else if (Tag < 32 || Tag % 2 == 0)
+     IsIntegerValue = true;
+   else if (Tag % 2 == 1)
+     IsStringValue = true;
+   else
+     llvm_unreachable("invalid tag type");
+
+   if (IsIntegerValue) {
+     const MCExpr *ValueExpr;
+     SMLoc ValueExprLoc = Parser.getTok().getLoc();
+     if (Parser.parseExpression(ValueExpr))
+       return true;
+
+     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ValueExpr);
+     if (!CE)
+       return Error(ValueExprLoc, "expected numeric constant");
+     IntegerValue = CE->getValue();
+   }
+
+   // The compatibility tag carries both values, separated by a comma.
+   if (Tag == ARMBuildAttrs::compatibility) {
+     if (Parser.parseToken(AsmToken::Comma, "comma expected"))
+       return true;
+   }
+
+   if (IsStringValue) {
+     if (Parser.getTok().isNot(AsmToken::String))
+       return Error(Parser.getTok().getLoc(), "bad string constant");
+
+     StringValue = Parser.getTok().getStringContents();
+     Parser.Lex();
+   }
+
+   if (Parser.parseToken(AsmToken::EndOfStatement,
+                         "unexpected token in '.eabi_attribute' directive"))
+     return true;
+
+   if (IsIntegerValue && IsStringValue) {
+     assert(Tag == ARMBuildAttrs::compatibility);
+     getTargetStreamer().emitIntTextAttribute(Tag, IntegerValue, StringValue);
+   } else if (IsIntegerValue)
+     getTargetStreamer().emitAttribute(Tag, IntegerValue);
+   else if (IsStringValue)
+     getTargetStreamer().emitTextAttribute(Tag, StringValue);
+   return false;
+ }
+
+ /// parseDirectiveCPU
+ /// ::= .cpu str
+ ///
+ /// Emits the CPU_name text attribute, then resets the subtarget features to
+ /// the defaults of the named CPU and fixes up the ARM/Thumb mode.  Returns
+ /// true if the CPU name is not valid.
+ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
+   const StringRef CPUName = getParser().parseStringToEndOfStatement().trim();
+   getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPUName);
+
+   // FIXME: This is using table-gen data, but should be moved to
+   // ARMTargetParser once that is table-gen'd.
+   const bool IsKnownCPU = getSTI().isCPUStringValid(CPUName);
+   if (!IsKnownCPU)
+     return Error(L, "Unknown CPU name");
+
+   const bool WasThumb = isThumb();
+   MCSubtargetInfo &NewSTI = copySTI();
+   NewSTI.setDefaultFeatures(CPUName, "");
+   setAvailableFeatures(ComputeAvailableFeatures(NewSTI.getFeatureBits()));
+   FixModeAfterArchChange(WasThumb, L);
+
+   return false;
+ }
+ /// parseDirectiveFPU
+ /// ::= .fpu str
+ ///
+ /// Applies the feature flags of the named FPU to a copy of the subtarget
+ /// info and forwards the FPU to the target streamer.  Returns true if the
+ /// FPU name is not recognised.
+ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
+   const SMLoc NameLoc = getTok().getLoc();
+   const StringRef Name = getParser().parseStringToEndOfStatement().trim();
+
+   const unsigned FPUKind = ARM::parseFPU(Name);
+   std::vector<StringRef> FPUFeatures;
+   if (!ARM::getFPUFeatures(FPUKind, FPUFeatures))
+     return Error(NameLoc, "Unknown FPU name");
+
+   MCSubtargetInfo &NewSTI = copySTI();
+   for (StringRef Flag : FPUFeatures)
+     NewSTI.ApplyFeatureFlag(Flag);
+   setAvailableFeatures(ComputeAvailableFeatures(NewSTI.getFeatureBits()));
+
+   getTargetStreamer().emitFPU(FPUKind);
+   return false;
+ }
+
+ /// parseDirectiveFnStart
+ /// ::= .fnstart
+ ///
+ /// Opens a new unwind region.  A '.fnstart' seen while a previous one is
+ /// still open is an error.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.fnstart' directive"))
+     return true;
+
+   if (!UC.hasFnStart()) {
+     // Reset the unwind directives parser state
+     UC.reset();
+
+     getTargetStreamer().emitFnStart();
+
+     UC.recordFnStart(L);
+     return false;
+   }
+
+   Error(L, ".fnstart starts before the end of previous one");
+   UC.emitFnStartLocNotes();
+   return true;
+ }
+
+ /// parseDirectiveFnEnd
+ /// ::= .fnend
+ ///
+ /// Closes the unwind region opened by '.fnstart'.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) {
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.fnend' directive"))
+     return true;
+
+   // Check the ordering of unwind directives
+   if (UC.hasFnStart()) {
+     getTargetStreamer().emitFnEnd();
+
+     // Reset the unwind directives parser state
+     UC.reset();
+     return false;
+   }
+
+   return Error(L, ".fnstart must precede .fnend directive");
+ }
+
+ /// parseDirectiveCantUnwind
+ /// ::= .cantunwind
+ ///
+ /// Records the directive, then rejects it when it is used outside an unwind
+ /// region or together with '.handlerdata' or '.personality'.  Returns true
+ /// on error.
+ bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) {
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.cantunwind' directive"))
+     return true;
+
+   UC.recordCantUnwind(L);
+
+   // Check the ordering of unwind directives
+   if (!UC.hasFnStart())
+     return Error(L, ".fnstart must precede .cantunwind directive");
+
+   const bool ClashesWithHandlerData = UC.hasHandlerData();
+   if (ClashesWithHandlerData) {
+     Error(L, ".cantunwind can't be used with .handlerdata directive");
+     UC.emitHandlerDataLocNotes();
+     return true;
+   }
+
+   const bool ClashesWithPersonality = UC.hasPersonality();
+   if (ClashesWithPersonality) {
+     Error(L, ".cantunwind can't be used with .personality directive");
+     UC.emitPersonalityLocNotes();
+     return true;
+   }
+
+   getTargetStreamer().emitCantUnwind();
+   return false;
+ }
+
+ /// parseDirectivePersonality
+ /// ::= .personality name
+ ///
+ /// Records the directive, validates its position relative to the other
+ /// unwind directives and emits the personality routine symbol.  Returns true
+ /// on error.
+ bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   // Sampled before this directive itself is recorded below.
+   const bool AlreadyHadPersonality = UC.hasPersonality();
+
+   // Parse the name of the personality routine
+   const AsmToken &NameTok = Parser.getTok();
+   if (!NameTok.is(AsmToken::Identifier))
+     return Error(L, "unexpected input in .personality directive.");
+   const StringRef RoutineName(NameTok.getIdentifier());
+   Parser.Lex();
+
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.personality' directive"))
+     return true;
+
+   UC.recordPersonality(L);
+
+   // Check the ordering of unwind directives
+   if (!UC.hasFnStart())
+     return Error(L, ".fnstart must precede .personality directive");
+
+   if (UC.cantUnwind()) {
+     Error(L, ".personality can't be used with .cantunwind directive");
+     UC.emitCantUnwindLocNotes();
+     return true;
+   }
+
+   if (UC.hasHandlerData()) {
+     Error(L, ".personality must precede .handlerdata directive");
+     UC.emitHandlerDataLocNotes();
+     return true;
+   }
+
+   if (AlreadyHadPersonality) {
+     Error(L, "multiple personality directives");
+     UC.emitPersonalityLocNotes();
+     return true;
+   }
+
+   MCSymbol *Routine = getParser().getContext().getOrCreateSymbol(RoutineName);
+   getTargetStreamer().emitPersonality(Routine);
+   return false;
+ }
+
+ /// parseDirectiveHandlerData
+ /// ::= .handlerdata
+ ///
+ /// Records the directive and rejects it when it is used outside an unwind
+ /// region or together with '.cantunwind'.  Returns true if an error was
+ /// reported.
+ bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) {
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.handlerdata' directive"))
+     return true;
+
+   UC.recordHandlerData(L);
+   // Check the ordering of unwind directives.  The diagnostic names this
+   // directive; it previously named '.personality' by mistake.
+   if (!UC.hasFnStart())
+     return Error(L, ".fnstart must precede .handlerdata directive");
+   if (UC.cantUnwind()) {
+     Error(L, ".handlerdata can't be used with .cantunwind directive");
+     UC.emitCantUnwindLocNotes();
+     return true;
+   }
+
+   getTargetStreamer().emitHandlerData();
+   return false;
+ }
+
+ /// parseDirectiveSetFP
+ /// ::= .setfp fpreg, spreg [, offset]
+ ///
+ /// The second register must be sp or the most recently saved frame pointer
+ /// register.  The optional offset is introduced by '#' or '$' and must be a
+ /// constant.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+
+   // Check the ordering of unwind directives
+   if (!UC.hasFnStart())
+     return Error(L, ".fnstart must precede .setfp directive");
+   if (UC.hasHandlerData())
+     return Error(L, ".setfp must precede .handlerdata directive");
+
+   // Parse fpreg
+   const SMLoc FPRegLoc = Parser.getTok().getLoc();
+   const int FPReg = tryParseRegister();
+   if (FPReg == -1)
+     return Error(FPRegLoc, "frame pointer register expected");
+   if (Parser.parseToken(AsmToken::Comma, "comma expected"))
+     return true;
+
+   // Parse spreg
+   const SMLoc SPRegLoc = Parser.getTok().getLoc();
+   const int SPReg = tryParseRegister();
+   if (SPReg == -1)
+     return Error(SPRegLoc, "stack pointer register expected");
+   if (SPReg != ARM::SP && SPReg != UC.getFPReg())
+     return Error(SPRegLoc,
+                  "register should be either $sp or the latest fp register");
+
+   // Update the frame pointer register
+   UC.saveFPReg(FPReg);
+
+   // Parse offset
+   int64_t Offset = 0;
+   if (Parser.parseOptionalToken(AsmToken::Comma)) {
+     const AsmToken &Marker = Parser.getTok();
+     if (!Marker.is(AsmToken::Hash) && !Marker.is(AsmToken::Dollar))
+       return Error(Parser.getTok().getLoc(), "'#' expected");
+     Parser.Lex(); // skip hash token.
+
+     const SMLoc ExprLoc = Parser.getTok().getLoc();
+     const MCExpr *OffsetExpr;
+     SMLoc ExprEnd;
+     if (getParser().parseExpression(OffsetExpr, ExprEnd))
+       return Error(ExprLoc, "malformed setfp offset");
+
+     const MCConstantExpr *OffsetCE = dyn_cast<MCConstantExpr>(OffsetExpr);
+     if (!OffsetCE)
+       return Error(ExprLoc, "setfp offset must be an immediate");
+     Offset = OffsetCE->getValue();
+   }
+
+   if (Parser.parseToken(AsmToken::EndOfStatement))
+     return true;
+
+   getTargetStreamer().emitSetFP(static_cast<unsigned>(FPReg),
+                                 static_cast<unsigned>(SPReg), Offset);
+   return false;
+ }
+
+ /// parseDirectivePad
+ /// ::= .pad offset
+ ///
+ /// The offset is introduced by '#' or '$' and must be a constant.  Returns
+ /// true on error.
+ bool ARMAsmParser::parseDirectivePad(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+
+   // Check the ordering of unwind directives
+   if (!UC.hasFnStart())
+     return Error(L, ".fnstart must precede .pad directive");
+   if (UC.hasHandlerData())
+     return Error(L, ".pad must precede .handlerdata directive");
+
+   // Parse the offset
+   const AsmToken &Marker = Parser.getTok();
+   if (!Marker.is(AsmToken::Hash) && !Marker.is(AsmToken::Dollar))
+     return Error(Parser.getTok().getLoc(), "'#' expected");
+   Parser.Lex(); // skip hash token.
+
+   const SMLoc ExprLoc = Parser.getTok().getLoc();
+   const MCExpr *OffsetExpr;
+   SMLoc ExprEnd;
+   if (getParser().parseExpression(OffsetExpr, ExprEnd))
+     return Error(ExprLoc, "malformed pad offset");
+
+   const MCConstantExpr *OffsetCE = dyn_cast<MCConstantExpr>(OffsetExpr);
+   if (OffsetCE == nullptr)
+     return Error(ExprLoc, "pad offset must be an immediate");
+
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.pad' directive"))
+     return true;
+
+   getTargetStreamer().emitPad(OffsetCE->getValue());
+   return false;
+ }
+
+ /// parseDirectiveRegSave
+ /// ::= .save { registers }
+ /// ::= .vsave { registers }
+ ///
+ /// \p IsVector selects the '.vsave' form.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
+   // Check the ordering of unwind directives
+   if (!UC.hasFnStart())
+     return Error(L, ".fnstart must precede .save or .vsave directives");
+   if (UC.hasHandlerData())
+     return Error(L, ".save or .vsave must precede .handlerdata directive");
+
+   // RAII object to make sure parsed operands are deleted.
+   SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
+
+   // Parse the register list
+   if (parseRegisterList(Operands))
+     return true;
+   if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+     return true;
+
+   ARMOperand &RegList = static_cast<ARMOperand &>(*Operands[0]);
+   if (IsVector) {
+     if (!RegList.isDPRRegList())
+       return Error(L, ".vsave expects DPR registers");
+   } else if (!RegList.isRegList()) {
+     return Error(L, ".save expects GPR registers");
+   }
+
+   getTargetStreamer().emitRegSave(RegList.getRegList(), IsVector);
+   return false;
+ }
+
+ /// parseDirectiveInst
+ /// ::= .inst opcode [, ...]
+ /// ::= .inst.n opcode [, ...]
+ /// ::= .inst.w opcode [, ...]
+ ///
+ /// \p Suffix is 'n', 'w' or 0.  In Thumb mode a suffix is required; in ARM
+ /// mode it is rejected.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) {
+   int Width = 4;
+
+   if (!isThumb()) {
+     if (Suffix)
+       return Error(Loc, "width suffixes are invalid in ARM mode");
+   } else if (Suffix == 'n') {
+     Width = 2;
+   } else if (Suffix != 'w') {
+     return Error(Loc, "cannot determine Thumb instruction size, "
+                       "use inst.n/inst.w instead");
+   }
+
+   auto EmitOneOpcode = [&]() -> bool {
+     const MCExpr *Expr;
+     if (getParser().parseExpression(Expr))
+       return true;
+
+     const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
+     if (Value == nullptr)
+       return Error(Loc, "expected constant expression");
+
+     // Reject values that exceed the selected encoding width.
+     const int64_t Bits = Value->getValue();
+     if (Width == 2) {
+       if (Bits > 0xffff)
+         return Error(Loc, "inst.n operand is too big, use inst.w instead");
+     } else if (Width == 4) {
+       if (Bits > 0xffffffff)
+         return Error(Loc, StringRef(Suffix ? "inst.w" : "inst") +
+                               " operand is too big");
+     } else {
+       llvm_unreachable("only supported widths are 2 and 4");
+     }
+
+     getTargetStreamer().emitInst(Bits, Suffix);
+     return false;
+   };
+
+   // At least one opcode is required.
+   if (parseOptionalToken(AsmToken::EndOfStatement))
+     return Error(Loc, "expected expression following directive");
+   return parseMany(EmitOneOpcode);
+ }
+
+ /// parseDirectiveLtorg
+ /// ::= .ltorg | .pool
+ ///
+ /// Asks the target streamer to emit the current constant pool.  Returns
+ /// true if the directive is followed by unexpected tokens.
+ bool ARMAsmParser::parseDirectiveLtorg(SMLoc L) {
+   const bool Malformed =
+       parseToken(AsmToken::EndOfStatement, "unexpected token in directive");
+   if (!Malformed)
+     getTargetStreamer().emitCurrentConstantPool();
+   return Malformed;
+ }
+
+ /// parseDirectiveEven
+ /// ::= .even
+ ///
+ /// Emits a 2-byte alignment in the current section, initialising the
+ /// sections first when there is no current section.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
+   const MCSection *CurSection = getStreamer().getCurrentSectionOnly();
+
+   if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+     return true;
+
+   if (CurSection == nullptr) {
+     getStreamer().InitSections(false);
+     CurSection = getStreamer().getCurrentSectionOnly();
+   }
+
+   assert(CurSection && "must have section to emit alignment");
+   const bool IsCodeSection = CurSection->UseCodeAlign();
+   if (IsCodeSection)
+     getStreamer().EmitCodeAlignment(2);
+   else
+     getStreamer().EmitValueToAlignment(2);
+
+   return false;
+ }
+
+ /// parseDirectivePersonalityIndex
+ /// ::= .personalityindex index
+ ///
+ /// The index must be a constant that is non-negative and below
+ /// ARM::EHABI::NUM_PERSONALITY_INDEX.  Returns true on error.
+ bool ARMAsmParser::parseDirectivePersonalityIndex(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   // Sampled before this directive itself is recorded below.
+   const bool AlreadyHadPersonality = UC.hasPersonality();
+
+   const SMLoc IndexLoc = Parser.getTok().getLoc();
+   const MCExpr *IndexExpr;
+   if (Parser.parseExpression(IndexExpr))
+     return true;
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.personalityindex' directive"))
+     return true;
+
+   UC.recordPersonalityIndex(L);
+
+   // Check the ordering of unwind directives
+   if (!UC.hasFnStart())
+     return Error(L, ".fnstart must precede .personalityindex directive");
+
+   if (UC.cantUnwind()) {
+     Error(L, ".personalityindex cannot be used with .cantunwind");
+     UC.emitCantUnwindLocNotes();
+     return true;
+   }
+
+   if (UC.hasHandlerData()) {
+     Error(L, ".personalityindex must precede .handlerdata directive");
+     UC.emitHandlerDataLocNotes();
+     return true;
+   }
+
+   if (AlreadyHadPersonality) {
+     Error(L, "multiple personality directives");
+     UC.emitPersonalityLocNotes();
+     return true;
+   }
+
+   const MCConstantExpr *IndexCE = dyn_cast<MCConstantExpr>(IndexExpr);
+   if (IndexCE == nullptr)
+     return Error(IndexLoc, "index must be a constant number");
+
+   const int64_t Index = IndexCE->getValue();
+   if (Index < 0 || Index >= ARM::EHABI::NUM_PERSONALITY_INDEX)
+     return Error(IndexLoc,
+                  "personality routine index should be in range [0-3]");
+
+   getTargetStreamer().emitPersonalityIndex(Index);
+   return false;
+ }
+
+ /// parseDirectiveUnwindRaw
+ /// ::= .unwind_raw offset, opcode [, opcode...]
+ ///
+ /// The offset and every opcode must be constants, and each opcode must fit
+ /// in eight bits.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   const SMLoc OffsetLoc = getLexer().getLoc();
+
+   if (!UC.hasFnStart())
+     return Error(L, ".fnstart must precede .unwind_raw directives");
+
+   const MCExpr *OffsetExpr;
+   if (getParser().parseExpression(OffsetExpr))
+     return Error(OffsetLoc, "expected expression");
+
+   const MCConstantExpr *OffsetCE = dyn_cast<MCConstantExpr>(OffsetExpr);
+   if (OffsetCE == nullptr)
+     return Error(OffsetLoc, "offset must be a constant");
+   const int64_t StackOffset = OffsetCE->getValue();
+
+   if (Parser.parseToken(AsmToken::Comma, "expected comma"))
+     return true;
+
+   SmallVector<uint8_t, 16> Opcodes;
+
+   // Parses a single opcode and appends it to Opcodes.
+   auto ParseOneOpcode = [&]() -> bool {
+     const SMLoc OpcodeLoc = getLexer().getLoc();
+     const MCExpr *OpcodeExpr;
+     if (getLexer().is(AsmToken::EndOfStatement) ||
+         Parser.parseExpression(OpcodeExpr))
+       return Error(OpcodeLoc, "expected opcode expression");
+
+     const MCConstantExpr *OpcodeCE = dyn_cast<MCConstantExpr>(OpcodeExpr);
+     if (OpcodeCE == nullptr)
+       return Error(OpcodeLoc, "opcode value must be a constant");
+
+     const int64_t Opcode = OpcodeCE->getValue();
+     if ((Opcode & ~0xff) != 0)
+       return Error(OpcodeLoc, "invalid opcode");
+
+     Opcodes.push_back(uint8_t(Opcode));
+     return false;
+   };
+
+   // Must have at least 1 element
+   const SMLoc FirstOpcodeLoc = getLexer().getLoc();
+   if (parseOptionalToken(AsmToken::EndOfStatement))
+     return Error(FirstOpcodeLoc, "expected opcode expression");
+   if (parseMany(ParseOneOpcode))
+     return true;
+
+   getTargetStreamer().emitUnwindRaw(StackOffset, Opcodes);
+   return false;
+ }
+
+ /// parseDirectiveTLSDescSeq
+ /// ::= .tlsdescseq tls-variable
+ ///
+ /// Builds a VK_ARM_TLSDESCSEQ symbol reference for the named variable and
+ /// hands it to the target streamer.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+
+   if (!getLexer().is(AsmToken::Identifier))
+     return TokError("expected variable after '.tlsdescseq' directive");
+
+   const StringRef VarName = Parser.getTok().getIdentifier();
+   const MCSymbolRefExpr *SymRef = MCSymbolRefExpr::create(
+       VarName, MCSymbolRefExpr::VK_ARM_TLSDESCSEQ, getContext());
+   Lex();
+
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.tlsdescseq' directive"))
+     return true;
+
+   getTargetStreamer().AnnotateTLSDescriptorSequence(SymRef);
+   return false;
+ }
+
+ /// parseDirectiveMovSP
+ /// ::= .movsp reg [, #offset]
+ ///
+ /// Only accepted while the saved frame pointer register is still sp.  The
+ /// register may be neither sp nor pc, and the optional offset must be a
+ /// constant.  Returns true on error.
+ bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+
+   if (!UC.hasFnStart())
+     return Error(L, ".fnstart must precede .movsp directives");
+   if (UC.getFPReg() != ARM::SP)
+     return Error(L, "unexpected .movsp directive");
+
+   const SMLoc RegLoc = Parser.getTok().getLoc();
+   const int NewSPReg = tryParseRegister();
+   if (NewSPReg == -1)
+     return Error(RegLoc, "register expected");
+   if (NewSPReg == ARM::SP || NewSPReg == ARM::PC)
+     return Error(RegLoc, "sp and pc are not permitted in .movsp directive");
+
+   int64_t Offset = 0;
+   if (Parser.parseOptionalToken(AsmToken::Comma)) {
+     if (Parser.parseToken(AsmToken::Hash, "expected #constant"))
+       return true;
+
+     const SMLoc OffsetLoc = Parser.getTok().getLoc();
+     const MCExpr *OffsetExpr;
+     if (Parser.parseExpression(OffsetExpr))
+       return Error(OffsetLoc, "malformed offset expression");
+
+     const MCConstantExpr *OffsetCE = dyn_cast<MCConstantExpr>(OffsetExpr);
+     if (OffsetCE == nullptr)
+       return Error(OffsetLoc, "offset must be an immediate constant");
+
+     Offset = OffsetCE->getValue();
+   }
+
+   if (parseToken(AsmToken::EndOfStatement,
+                  "unexpected token in '.movsp' directive"))
+     return true;
+
+   getTargetStreamer().emitMovSP(NewSPReg, Offset);
+   UC.saveFPReg(NewSPReg);
+
+   return false;
+ }
+
+ /// parseDirectiveObjectArch
+ /// ::= .object_arch name
+ ///
+ /// Forwards the named architecture to the target streamer.  Returns true if
+ /// the name is missing or unknown, or if extra tokens follow it.
+ bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) {
+   MCAsmParser &Parser = getParser();
+   if (!getLexer().is(AsmToken::Identifier))
+     return Error(getLexer().getLoc(), "unexpected token");
+
+   const StringRef ArchName = Parser.getTok().getString();
+   const SMLoc ArchLoc = Parser.getTok().getLoc();
+   Lex();
+
+   const unsigned ArchKind = ARM::parseArch(ArchName);
+   if (ArchKind == ARM::AK_INVALID)
+     return Error(ArchLoc, "unknown architecture '" + ArchName + "'");
+
+   if (parseToken(AsmToken::EndOfStatement))
+     return true;
+
+   getTargetStreamer().emitObjectArch(ArchKind);
+   return false;
+ }
+
+ /// parseDirectiveAlign
+ /// ::= .align
+ ///
+ /// Handles only the operand-less form.  Returns true for any other form so
+ /// that the target agnostic handling is used instead.
+ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
+   // NOTE: if this is not the end of the statement, fall back to the target
+   // agnostic handling for this directive which will correctly handle this.
+   if (!parseOptionalToken(AsmToken::EndOfStatement))
+     return true;
+
+   // '.align' is target specifically handled to mean 2**2 byte alignment.
+   const MCSection *CurSection = getStreamer().getCurrentSectionOnly();
+   assert(CurSection && "must have section to emit alignment");
+   const bool IsCodeSection = CurSection->UseCodeAlign();
+   if (IsCodeSection)
+     getStreamer().EmitCodeAlignment(4, 0);
+   else
+     getStreamer().EmitValueToAlignment(4, 0, 1, 0);
+   return false;
+ }
+
+/// parseDirectiveThumbSet
+///  ::= .thumb_set name, value
+///
+/// Parses the symbol name, the separating comma and the assigned expression,
+/// then forwards the symbol/value pair to the target streamer. Returns false
+/// on success and true if any parsing step failed.
+bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
+  // Symbol name first ...
+  StringRef SymName;
+  if (check(Parser.parseIdentifier(SymName),
+            "expected identifier after '.thumb_set'"))
+    return true;
+
+  // ... then the comma separating it from the value.
+  if (parseToken(AsmToken::Comma,
+                 "expected comma after name '" + SymName + "'"))
+    return true;
+
+  // The value is parsed as an assignment expression; redefinition is allowed.
+  MCSymbol *Sym;
+  const MCExpr *SymValue;
+  const bool Failed = MCParserUtils::parseAssignmentExpression(
+      SymName, /* allow_redef */ true, Parser, Sym, SymValue);
+  if (Failed)
+    return true;
+
+  getTargetStreamer().emitThumbSet(Sym, SymValue);
+  return false;
+}
+
+/// Force static initialization.
+///
+/// Registers ARMAsmParser for each of the four targets returned by the
+/// getThe{ARM,Thumb}{LE,BE}Target() accessors.
+extern "C" void LLVMInitializeARMAsmParser() {
+  RegisterMCAsmParser<ARMAsmParser> RegARMLE(getTheARMLETarget());
+  RegisterMCAsmParser<ARMAsmParser> RegARMBE(getTheARMBETarget());
+  RegisterMCAsmParser<ARMAsmParser> RegThumbLE(getTheThumbLETarget());
+  RegisterMCAsmParser<ARMAsmParser> RegThumbBE(getTheThumbBETarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "ARMGenAsmMatcher.inc"
+
+// Table consulted by parseDirectiveArchExtension: one row per architecture
+// extension kind accepted by the '.arch_extension' directive.
+//
+// FIXME: This structure should be moved inside ARMTargetParser
+// when we start to table-generate them, and we can use the ARM
+// flags below, that were generated by table-gen.
+static const struct {
+ // Extension kind; compared against the value returned by ARM::parseArchExt.
+ const unsigned Kind;
+ // Feature_* mask that must be fully present in getAvailableFeatures() for
+ // the extension to be allowed with the current base architecture.
+ const uint64_t ArchCheck;
+ // Subtarget features toggled when the extension is enabled or disabled. An
+ // empty set makes the directive fail with "unsupported architectural
+ // extension".
+ const FeatureBitset Features;
+} Extensions[] = {
+ { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} },
+ { ARM::AEK_CRYPTO, Feature_HasV8,
+ {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
+ { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} },
+ { (ARM::AEK_HWDIV | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass,
+ {ARM::FeatureHWDiv, ARM::FeatureHWDivARM} },
+ { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} },
+ { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} },
+ { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} },
+ // FIXME: Only available in A-class, isel not predicated
+ { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} },
+ { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} },
+ { ARM::AEK_RAS, Feature_HasV8, {ARM::FeatureRAS} },
+ // FIXME: Unsupported extensions.
+ { ARM::AEK_OS, Feature_None, {} },
+ { ARM::AEK_IWMMXT, Feature_None, {} },
+ { ARM::AEK_IWMMXT2, Feature_None, {} },
+ { ARM::AEK_MAVERICK, Feature_None, {} },
+ { ARM::AEK_XSCALE, Feature_None, {} },
+};
+
+/// parseDirectiveArchExtension
+///  ::= .arch_extension [no]feature
+///
+/// Looks the named extension up in the Extensions table and, if it is both
+/// supported and allowed for the current base architecture, toggles the
+/// corresponding subtarget features on a fresh copy of the subtarget info.
+/// A leading "no" (matched case-insensitively) disables the extension instead
+/// of enabling it. Returns false on success, true after reporting an error.
+bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
+  if (getLexer().isNot(AsmToken::Identifier))
+    return Error(getLexer().getLoc(), "expected architecture extension name");
+
+  // Capture the token's location and spelling before the lexer advances.
+  MCAsmParser &Parser = getParser();
+  SMLoc ExtLoc = Parser.getTok().getLoc();
+  StringRef Name = Parser.getTok().getString();
+  Lex();
+
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.arch_extension' directive"))
+    return true;
+
+  // Strip the optional "no" prefix and remember which direction to toggle.
+  const bool EnableFeature = !Name.startswith_lower("no");
+  if (!EnableFeature)
+    Name = Name.substr(2);
+
+  unsigned FeatureKind = ARM::parseArchExt(Name);
+  if (FeatureKind == ARM::AEK_INVALID)
+    return Error(ExtLoc, "unknown architectural extension: " + Name);
+
+  for (const auto &Ext : Extensions) {
+    if (Ext.Kind != FeatureKind)
+      continue;
+
+    // Rows with no features describe extensions we know but do not support.
+    if (Ext.Features.none())
+      return Error(ExtLoc, "unsupported architectural extension: " + Name);
+
+    const bool AllowedForArch =
+        (getAvailableFeatures() & Ext.ArchCheck) == Ext.ArchCheck;
+    if (!AllowedForArch)
+      return Error(ExtLoc,
+                   "architectural extension '" + Name +
+                       "' is not allowed for the current base architecture");
+
+    // Only flip the bits that are not already in the requested state:
+    // currently-clear bits when enabling, currently-set bits when disabling.
+    MCSubtargetInfo &STI = copySTI();
+    FeatureBitset ToggleFeatures;
+    if (EnableFeature)
+      ToggleFeatures = ~STI.getFeatureBits() & Ext.Features;
+    else
+      ToggleFeatures = STI.getFeatureBits() & Ext.Features;
+
+    setAvailableFeatures(
+        ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)));
+    return false;
+  }
+
+  // The kind parsed correctly but has no row in the table.
+  return Error(ExtLoc, "unknown architectural extension: " + Name);
+}
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+//
+// Target-specific check of AsmOp against match class Kind. Returns
+// Match_Success for the special cases handled below and
+// Match_InvalidOperand for everything else.
+unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                  unsigned Kind) {
+  ARMOperand &Op = static_cast<ARMOperand &>(AsmOp);
+
+  switch (Kind) {
+  case MCK__35_0: {
+    // The kind is a token for a literal immediate: check whether our asm
+    // operand matches. This is for InstAliases which have a fixed-value
+    // immediate in the syntax.
+    if (!Op.isImm())
+      break;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
+    if (CE && CE->getValue() == 0)
+      return Match_Success;
+    break;
+  }
+  case MCK_ModImm: {
+    if (!Op.isImm())
+      break;
+    // An immediate that cannot be evaluated to an absolute value here is
+    // accepted; one that can is only sanity-checked and then rejected.
+    int64_t Value;
+    if (!Op.getImm()->evaluateAsAbsolute(Value))
+      return Match_Success;
+    assert((Value >= INT32_MIN && Value <= UINT32_MAX) &&
+           "expression value must be representable in 32 bits");
+    break;
+  }
+  case MCK_rGPR: {
+    // With v8 ops, SP is accepted where an rGPR is expected.
+    const bool IsSP = Op.isReg() && Op.getReg() == ARM::SP;
+    if (IsSP && hasV8Ops())
+      return Match_Success;
+    break;
+  }
+  case MCK_GPRPair: {
+    // Accept any register that belongs to the GPR register class.
+    const bool IsGPR =
+        Op.isReg() &&
+        MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg());
+    if (IsGPR)
+      return Match_Success;
+    break;
+  }
+  default:
+    break;
+  }
+
+  return Match_InvalidOperand;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
new file mode 100644
index 000000000000..ac3d8c780af2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -0,0 +1,5313 @@
+//===-- ARMDisassembler.cpp - Disassembler for ARM/Thumb ISA --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+  // Handles the condition code status of instructions in IT blocks.
+  //
+  // The pending condition codes are kept as a stack in ITStates: the code for
+  // the next instruction is always ITStates.back().
+  class ITStatus
+  {
+    public:
+      // Returns the condition code for the current instruction: the top of the
+      // stack when inside an IT block, ARMCC::AL otherwise.
+      unsigned getITCC() {
+        if (!instrInITBlock())
+          return ARMCC::AL;
+        return ITStates.back();
+      }
+
+      // Advances the IT block state to the next T or E.
+      void advanceITState() {
+        ITStates.pop_back();
+      }
+
+      // Returns true if the current instruction is in an IT block.
+      bool instrInITBlock() {
+        return !ITStates.empty();
+      }
+
+      // Returns true if current instruction is the last instruction in an IT
+      // block.
+      bool instrLastInITBlock() {
+        return ITStates.size() == 1;
+      }
+
+      // Called when decoding an IT instruction. Sets the IT state for the
+      // following instructions that form the IT block. Firstcond and Mask
+      // correspond to the fields in the IT instruction encoding.
+      void setITState(char Firstcond, char Mask) {
+        // (3 - the number of trailing zeros) is the number of then / else.
+        const unsigned NumTZ = countTrailingZeros<uint8_t>(Mask);
+        assert(NumTZ <= 3 && "Invalid IT mask!");
+
+        const unsigned CondBit0 = Firstcond & 1;
+        const unsigned char BaseCC =
+            static_cast<unsigned char>(Firstcond & 0xf);
+
+        // Push the condition codes in the order the pops will consume them:
+        // a mask bit equal to Firstcond's low bit means "then" (same
+        // condition), otherwise "else" (condition with its low bit flipped).
+        for (unsigned Pos = NumTZ + 1; Pos <= 3; ++Pos) {
+          const bool IsThen = ((Mask >> Pos) & 1) == CondBit0;
+          ITStates.push_back(IsThen ? BaseCC : BaseCC ^ 1);
+        }
+        // The first instruction of the block always uses Firstcond itself and
+        // is pushed last so that it ends up on top.
+        ITStates.push_back(BaseCC);
+      }
+
+    private:
+      std::vector<unsigned char> ITStates;
+  };
+}
+
+namespace {
+/// ARM disassembler for all ARM platforms.
+class ARMDisassembler : public MCDisassembler {
+public:
+  ARMDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+
+  ~ARMDisassembler() override = default;
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+
+/// Thumb disassembler for all Thumb platforms.
+class ThumbDisassembler : public MCDisassembler {
+public:
+  ThumbDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+
+  ~ThumbDisassembler() override = default;
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+
+private:
+  // IT block tracking. Declared mutable because the const predicate helpers
+  // below advance it.
+  mutable ITStatus ITBlock;
+
+  // Post-decoding fix-ups that derive predicate operands from ITBlock.
+  DecodeStatus AddThumbPredicate(MCInst&) const;
+  void UpdateThumbVFPPredicate(MCInst&) const;
+};
+}
+
+// Folds the status In of one decoding step into the accumulated status Out.
+// Success leaves Out untouched; SoftFail and Fail overwrite it. Returns true
+// when In is Success or SoftFail, and false when it is Fail.
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+  switch (In) {
+  case MCDisassembler::Success:
+    // Out stays the same.
+    return true;
+  case MCDisassembler::SoftFail:
+  case MCDisassembler::Fail:
+    Out = In;
+    return In != MCDisassembler::Fail;
+  }
+  llvm_unreachable("Invalid DecodeStatus!");
+}
+
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode2IdxInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst,unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+// Decoder hook referenced by the auto-generated tables; the definition is
+// further down in this file.
+static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst & Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst,unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+
+static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst,unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst,unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst,unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+#include "ARMGenDisassemblerTables.inc"
+
+// Factory for the ARM-mode disassembler: returns a newly allocated
+// ARMDisassembler built from STI and Ctx. The Target argument is not used.
+static MCDisassembler *createARMDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  MCDisassembler *Dis = new ARMDisassembler(STI, Ctx);
+  return Dis;
+}
+
+// Factory for the Thumb-mode disassembler: returns a newly allocated
+// ThumbDisassembler built from STI and Ctx. The Target argument is not used.
+static MCDisassembler *createThumbDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  MCDisassembler *Dis = new ThumbDisassembler(STI, Ctx);
+  return Dis;
+}
+
+// Post-decoding checks.
+//
+// Re-examines an instruction that was already decoded from Insn and may
+// downgrade Result. Only HVC is inspected; every other opcode returns Result
+// unchanged.
+static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
+                                           uint64_t Address, raw_ostream &OS,
+                                           raw_ostream &CS,
+                                           uint32_t Insn,
+                                           DecodeStatus Result)
+{
+  if (MI.getOpcode() != ARM::HVC)
+    return Result;
+
+  // HVC is undefined if condition = 0xf otherwise unpredictable
+  // if condition != 0xe
+  const uint32_t Cond = (Insn >> 28) & 0xF;
+  if (Cond == 0xF)
+    return MCDisassembler::Fail;
+  return Cond == 0xE ? Result : MCDisassembler::SoftFail;
+}
+
+/// Decodes one 4-byte ARM-mode instruction from Bytes into MI.
+///
+/// On success (or soft failure) Size is set to 4 and the decode status is
+/// returned; if fewer than 4 bytes are available or no decoder table matches,
+/// Size is set to 0 and MCDisassembler::Fail is returned. CS becomes the
+/// comment stream for this instruction.
+DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address, raw_ostream &OS,
+                                             raw_ostream &CS) const {
+  CommentStream = &CS;
+
+  assert(!STI.getFeatureBits()[ARM::ModeThumb] &&
+         "Asked to disassemble an ARM instruction but Subtarget is in Thumb "
+         "mode!");
+
+  // We want to read exactly 4 bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as a little-endian 32-bit word in the stream. Each byte is widened
+  // to uint32_t before shifting: a bare uint8_t would be promoted to (signed)
+  // int, and shifting a value >= 0x80 left by 24 would move a bit into the
+  // sign position.
+  const uint32_t Insn = (static_cast<uint32_t>(Bytes[3]) << 24) |
+                        (static_cast<uint32_t>(Bytes[2]) << 16) |
+                        (static_cast<uint32_t>(Bytes[1]) << 8) |
+                        (static_cast<uint32_t>(Bytes[0]) << 0);
+
+  // Calling the auto-generated decoder function. The core ARM table is the
+  // only one whose result goes through the post-decoding checks.
+  DecodeStatus Result =
+      decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result);
+  }
+
+  // VFP and NEON instructions, similarly, are shared between ARM
+  // and Thumb modes. The tables are tried strictly in the order listed.
+  static const struct {
+    const uint8_t *Table;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    bool AddFakePredicate;
+  } SharedTables[] = {
+      {DecoderTableVFP32, false},
+      {DecoderTableVFPV832, false},
+      {DecoderTableNEONData32, true},
+      {DecoderTableNEONLoadStore32, true},
+      {DecoderTableNEONDup32, true},
+      {DecoderTablev8NEON32, false},
+      {DecoderTablev8Crypto32, false},
+  };
+
+  for (const auto &Entry : SharedTables) {
+    Result = decodeInstruction(Entry.Table, MI, Insn, Address, this, STI);
+    if (Result == MCDisassembler::Fail)
+      continue;
+
+    Size = 4;
+    if (Entry.AddFakePredicate &&
+        !DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return Result;
+  }
+
+  // No table recognised the encoding.
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+namespace llvm {
+extern const MCInstrDesc ARMInsts[];
+}
+
+/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+/// immediate Value in the MCInst. The immediate Value has had any PC
+/// adjustment made by the caller. If the instruction is a branch instruction
+/// then isBranch is true, else false. If the getOpInfo() function was set as
+/// part of the setupForSymbolicDisassembly() call then that function is called
+/// to get any symbolic information at the Address for this instruction. If
+/// that returns non-zero then the symbolic information it returns is used to
+/// create an MCExpr and that is added as an operand to the MCInst. If
+/// getOpInfo() returns zero and isBranch is true then a symbol look up for
+/// Value is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with Value is created. This function returns true if it adds an
+/// operand to the MCInst and false otherwise.
+///
+/// Decoder is the MCDisassembler the request is forwarded to.
+static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
+                                     bool isBranch, uint64_t InstSize,
+                                     MCInst &MI, const void *Decoder) {
+  // FIXME: Does it make sense for value to be negative?
+  const uint32_t RawValue = (uint32_t)Value;
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  const bool Added = Dis->tryAddingSymbolicOperand(
+      MI, RawValue, Address, isBranch, /* Offset */ 0, InstSize);
+  return Added;
+}
+
+/// tryAddingPcLoadReferenceComment - tries to add a comment as to what is
+/// being referenced by a load instruction with the base register that is the
+/// Pc. These can often be values in a literal pool near the Address of the
+/// instruction. The Address of the instruction and its immediate Value are
+/// used as a possible literal pool entry. The SymbolLookUp call back will
+/// return the name of a symbol referenced by the literal pool's entry if
+/// the referenced address is that of a symbol. Or it will return a pointer to
+/// a literal 'C' string if the referenced address of the literal pool's entry
+/// is an address into a section with 'C' string literals.
+///
+/// Decoder is the MCDisassembler the request is forwarded to.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
+                                            const void *Decoder) {
+  static_cast<const MCDisassembler*>(Decoder)
+      ->tryAddingPcLoadReferenceComment(Value, Address);
+}
+
+// Thumb1 instructions don't have explicit S bits. Rather, they
+// implicitly set CPSR. Since it's not represented in the encoding, the
+// auto-generated decoder won't inject the CPSR operand. We need to fix
+// that as a post-pass.
+//
+// The operand inserted is register 0 when InITBlock is true and ARM::CPSR
+// otherwise.
+static void AddThumb1SBit(MCInst &MI, bool InITBlock) {
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  const unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+
+  // Walk the operand descriptions and the decoded operands in lock step. Stop
+  // at the first optional CCR definition that does not directly follow a
+  // predicate operand; otherwise the scan ends at whichever list runs out
+  // first, and the operand is inserted there.
+  MCInst::iterator Pos = MI.begin();
+  for (unsigned Idx = 0; Idx < NumOps && Pos != MI.end(); ++Idx, ++Pos) {
+    const bool IsCCOut = OpInfo[Idx].isOptionalDef() &&
+                         OpInfo[Idx].RegClass == ARM::CCRRegClassID;
+    const bool FollowsPredicate = Idx > 0 && OpInfo[Idx - 1].isPredicate();
+    if (IsCCOut && !FollowsPredicate)
+      break;
+  }
+
+  MI.insert(Pos, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR));
+}
+
+// Most Thumb instructions don't have explicit predicates in the
+// encoding, but rather get their predicates from IT context. We need
+// to fix up the predicate operands using this context information as a
+// post-pass.
+//
+// Returns Success, or SoftFail when the instruction is not permitted at its
+// position relative to the current IT block. Consumes one IT state when
+// inside an IT block, except on the early Success return below.
+MCDisassembler::DecodeStatus
+ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
+  MCDisassembler::DecodeStatus S = Success;
+
+  const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits();
+
+  // A few instructions actually have predicates encoded in them. Don't
+  // try to overwrite it if we're seeing one of those.
+  switch (MI.getOpcode()) {
+  case ARM::tBcc:
+  case ARM::t2Bcc:
+  case ARM::tCBZ:
+  case ARM::tCBNZ:
+  case ARM::tCPS:
+  case ARM::t2CPS3p:
+  case ARM::t2CPS2p:
+  case ARM::t2CPS1p:
+  case ARM::tMOVSr:
+  case ARM::tSETEND:
+    // Some instructions (mostly conditional branches) are not
+    // allowed in IT blocks.
+    if (!ITBlock.instrInITBlock())
+      return Success;
+    S = SoftFail;
+    break;
+  case ARM::t2HINT:
+    if (MI.getOperand(0).getImm() == 0x10 && FeatureBits[ARM::FeatureRAS])
+      S = SoftFail;
+    break;
+  case ARM::tB:
+  case ARM::t2B:
+  case ARM::t2TBB:
+  case ARM::t2TBH:
+    // Some instructions (mostly unconditional branches) can
+    // only appear at the end of, or outside of, an IT.
+    if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock())
+      S = SoftFail;
+    break;
+  default:
+    break;
+  }
+
+  // If we're in an IT block, base the predicate on that. Otherwise,
+  // assume a predicate of AL.
+  unsigned CC = ITBlock.getITCC();
+  if (CC == 0xF)
+    CC = ARMCC::AL;
+  if (ITBlock.instrInITBlock())
+    ITBlock.advanceITState();
+
+  // Find where the predicate belongs: at the first operand slot described as
+  // a predicate, or wherever the scan ends (operand descriptions or decoded
+  // operands exhausted) if there is none.
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  const unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  MCInst::iterator Pos = MI.begin();
+  for (unsigned Idx = 0; Idx < NumOps && Pos != MI.end(); ++Idx, ++Pos) {
+    if (OpInfo[Idx].isPredicate())
+      break;
+  }
+
+  // A predicate is two operands: the condition code immediate followed by
+  // the predicate register (none for AL, CPSR otherwise).
+  Pos = MI.insert(Pos, MCOperand::createImm(CC));
+  ++Pos;
+  MI.insert(Pos, MCOperand::createReg(CC == ARMCC::AL ? 0 : ARM::CPSR));
+
+  return S;
+}
+
+ // Thumb VFP instructions are a special case. Because we share their
+ // encodings between ARM and Thumb modes, and they are predicable in ARM
+ // mode, the auto-generated decoder will give them an (incorrect)
+ // predicate operand. We need to rewrite these operands based on the IT
+ // context as a post-pass.
+ //
+ // The existing predicate operand pair in MI is overwritten in place with the
+ // condition taken from the current IT state, and the IT state is advanced
+ // past this instruction.
+ void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
+   unsigned CC = ITBlock.getITCC();
+   // 0xF is not a valid predicate value (DecodePredicateOperand rejects it),
+   // so treat it as AL exactly as AddThumbPredicate does.
+   if (CC == 0xF)
+     CC = ARMCC::AL;
+   if (ITBlock.instrInITBlock())
+     ITBlock.advanceITState();
+
+   const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+   MCInst::iterator I = MI.begin();
+   unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+   for (unsigned i = 0; i < NumOps; ++i, ++I) {
+     // Never walk past the operands that were actually decoded.
+     if (I == MI.end())
+       break;
+     if (!OpInfo[i].isPredicate())
+       continue;
+
+     // Rewrite the condition code, then the condition register that follows
+     // it: no register for AL, CPSR for every real condition.
+     I->setImm(CC);
+     ++I;
+     if (CC == ARMCC::AL)
+       I->setReg(0);
+     else
+       I->setReg(ARM::CPSR);
+     return;
+   }
+ }
+
+ // Decodes one Thumb instruction from the start of Bytes into MI.
+ //
+ // The 16-bit decoder tables are tried first, then the 32-bit ones, in the
+ // fixed order below; the first table that does not fail wins. On success
+ // Size is set to the number of bytes consumed (2 or 4) and the decoder status
+ // is returned; if nothing matches, or too few bytes are available, Size is
+ // set to 0 and MCDisassembler::Fail is returned. CS is remembered as the
+ // comment stream; OS is not used in this function.
+ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &OS,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+
+ assert(STI.getFeatureBits()[ARM::ModeThumb] &&
+ "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
+
+ // We want to read exactly 2 bytes of data.
+ if (Bytes.size() < 2) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Assemble the halfword with Bytes[0] as the low byte.
+ uint16_t Insn16 = (Bytes[1] << 8) | Bytes[0];
+ DecodeStatus Result =
+ decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 2;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+
+ Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this,
+ STI);
+ // NOTE(review): this test relies on MCDisassembler::Fail converting to
+ // false; every other path compares against MCDisassembler::Fail explicitly.
+ if (Result) {
+ Size = 2;
+ // Sample the IT state first: AddThumbPredicate advances it.
+ bool InITBlock = ITBlock.instrInITBlock();
+ Check(Result, AddThumbPredicate(MI));
+ AddThumb1SBit(MI, InITBlock);
+ return Result;
+ }
+
+ Result =
+ decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 2;
+
+ // Nested IT blocks are UNPREDICTABLE. Must be checked before we add
+ // the Thumb predicate.
+ if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
+ Result = MCDisassembler::SoftFail;
+
+ Check(Result, AddThumbPredicate(MI));
+
+ // If we find an IT instruction, we need to parse its condition
+ // code and mask operands so that we can apply them correctly
+ // to the subsequent instructions.
+ if (MI.getOpcode() == ARM::t2IT) {
+
+ unsigned Firstcond = MI.getOperand(0).getImm();
+ unsigned Mask = MI.getOperand(1).getImm();
+ ITBlock.setITState(Firstcond, Mask);
+ }
+
+ return Result;
+ }
+
+ // We want to read exactly 4 bytes of data.
+ if (Bytes.size() < 4) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Two halfwords, each with its low byte first: Bytes[1..0] form the upper
+ // 16 bits of the word and Bytes[3..2] the lower 16 bits.
+ uint32_t Insn32 =
+ (Bytes[3] << 8) | (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16);
+ Result =
+ decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ // Sample the IT state first: AddThumbPredicate advances it.
+ bool InITBlock = ITBlock.instrInITBlock();
+ Check(Result, AddThumbPredicate(MI));
+ AddThumb1SBit(MI, InITBlock);
+ return Result;
+ }
+
+ Result =
+ decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+
+ // The VFP table is only consulted when bits 31-28 are 0xE; the predicate
+ // operand it produces is then rewritten from the IT state.
+ if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
+ Result =
+ decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ UpdateThumbVFPPredicate(MI);
+ return Result;
+ }
+ }
+
+ // Instructions from this table are returned without any predicate fix-up.
+ Result =
+ decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+
+ if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
+ Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this,
+ STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+ }
+
+ // For the NEON tables below the word is rewritten before lookup --
+ // presumably to turn the Thumb encoding into the equivalent ARM-mode
+ // encoding the tables are keyed on (TODO confirm).
+ if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) {
+ uint32_t NEONLdStInsn = Insn32;
+ // Clear bits 27-24, then set bit 26.
+ NEONLdStInsn &= 0xF0FFFFFF;
+ NEONLdStInsn |= 0x04000000;
+ Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+ }
+
+ if (fieldFromInstruction(Insn32, 24, 4) == 0xF) {
+ uint32_t NEONDataInsn = Insn32;
+ NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
+ NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+ NEONDataInsn |= 0x12000000; // Set bits 28 and 25
+ Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+
+ // Same bit rewrite as for the NEON data table above; instructions from
+ // this table are returned without a predicate fix-up.
+ uint32_t NEONCryptoInsn = Insn32;
+ NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
+ NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+ NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
+ Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+
+ uint32_t NEONv8Insn = Insn32;
+ NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
+ Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+ this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+ }
+
+ // No table recognised the encoding.
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+
+ // Registers the disassembler factories with the target registry: the ARM
+ // factory for the little- and big-endian ARM targets, and the Thumb factory
+ // for the little- and big-endian Thumb targets.
+ extern "C" void LLVMInitializeARMDisassembler() {
+   // ARM-mode targets.
+   auto &ARMLittle = getTheARMLETarget();
+   TargetRegistry::RegisterMCDisassembler(ARMLittle, createARMDisassembler);
+   auto &ARMBig = getTheARMBETarget();
+   TargetRegistry::RegisterMCDisassembler(ARMBig, createARMDisassembler);
+
+   // Thumb-mode targets.
+   auto &ThumbLittle = getTheThumbLETarget();
+   TargetRegistry::RegisterMCDisassembler(ThumbLittle, createThumbDisassembler);
+   auto &ThumbBig = getTheThumbBETarget();
+   TargetRegistry::RegisterMCDisassembler(ThumbBig, createThumbDisassembler);
+ }
+
+ // Register enum for each 4-bit general-purpose register encoding (0-15).
+ static const uint16_t GPRDecoderTable[] = {
+   ARM::R0, ARM::R1, ARM::R2,  ARM::R3,  ARM::R4,  ARM::R5, ARM::R6, ARM::R7,
+   ARM::R8, ARM::R9, ARM::R10, ARM::R11, ARM::R12, ARM::SP, ARM::LR, ARM::PC
+ };
+
+ // Appends the general-purpose register encoded by RegNo to Inst.
+ // Returns Fail for encodings above 15, Success otherwise.
+ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+   if (RegNo >= 16)
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo]));
+   return MCDisassembler::Success;
+ }
+
+ // Decodes a general-purpose register operand for which PC (encoding 15) is
+ // flagged: the register is still appended, but the status starts out as
+ // SoftFail in that case.
+ static DecodeStatus
+ DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+   DecodeStatus Result =
+       (RegNo == 15) ? MCDisassembler::SoftFail : MCDisassembler::Success;
+
+   // Fold the outcome of the plain GPR decode into the status.
+   Check(Result, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+   return Result;
+ }
+
+ // Decodes a register operand in which encoding 15 stands for APSR_NZCV
+ // rather than PC; every other encoding is decoded as a plain GPR.
+ static DecodeStatus
+ DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                uint64_t Address, const void *Decoder) {
+   if (RegNo != 15) {
+     DecodeStatus Result = MCDisassembler::Success;
+     Check(Result, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+     return Result;
+   }
+
+   Inst.addOperand(MCOperand::createReg(ARM::APSR_NZCV));
+   return MCDisassembler::Success;
+ }
+
+ // Decodes a register restricted to encodings 0-7; anything higher is Fail.
+ static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+   return RegNo <= 7 ? DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)
+                     : MCDisassembler::Fail;
+ }
+
+ // Register-pair enum for each even first-register encoding, indexed by
+ // encoding / 2 (seven pairs, R0_R1 through R12_SP).
+ static const uint16_t GPRPairDecoderTable[] = {
+   ARM::R0_R1, ARM::R2_R3,   ARM::R4_R5,  ARM::R6_R7,
+   ARM::R8_R9, ARM::R10_R11, ARM::R12_SP
+ };
+
+ // Appends the GPR pair whose first register is encoded by RegNo to Inst.
+ //
+ // Returns Fail for encodings above 13 (the table has no pair for them),
+ // SoftFail when RegNo is odd (the pair starting at the preceding even
+ // register is appended), and Success otherwise.
+ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+   if (RegNo > 13)
+     return MCDisassembler::Fail;
+
+   // Only the odd-register case can still be flagged here: encoding 0xe was
+   // already rejected by the range check above, so the former
+   // "RegNo == 0xe" test could never be true and has been dropped.
+   DecodeStatus S =
+       (RegNo & 1) ? MCDisassembler::SoftFail : MCDisassembler::Success;
+
+   unsigned RegisterPair = GPRPairDecoderTable[RegNo / 2];
+   Inst.addOperand(MCOperand::createReg(RegisterPair));
+   return S;
+ }
+
+ // Decodes a register from the restricted set {R0, R1, R2, R3, R9, R12}.
+ // Any other encoding is rejected with Fail.
+ static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+   unsigned Reg = 0;
+   if (RegNo == 0)
+     Reg = ARM::R0;
+   else if (RegNo == 1)
+     Reg = ARM::R1;
+   else if (RegNo == 2)
+     Reg = ARM::R2;
+   else if (RegNo == 3)
+     Reg = ARM::R3;
+   else if (RegNo == 9)
+     Reg = ARM::R9;
+   else if (RegNo == 12)
+     Reg = ARM::R12;
+   else
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createReg(Reg));
+   return MCDisassembler::Success;
+ }
+
+ // Decodes a general-purpose register operand that flags PC always, and SP
+ // unless the subtarget has the v8 operations feature. The register is
+ // appended either way; flagged encodings yield SoftFail.
+ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+   const FeatureBitset &Features = static_cast<const MCDisassembler *>(Decoder)
+                                       ->getSubtargetInfo()
+                                       .getFeatureBits();
+
+   const bool FlagSP = RegNo == 13 && !Features[ARM::HasV8Ops];
+   const bool FlagPC = RegNo == 15;
+   DecodeStatus Result = (FlagSP || FlagPC) ? MCDisassembler::SoftFail
+                                            : MCDisassembler::Success;
+
+   Check(Result, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+   return Result;
+ }
+
+ // Register enum for each 5-bit S-register encoding (S0-S31).
+ static const uint16_t SPRDecoderTable[] = {
+   ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,  ARM::S4,  ARM::S5,  ARM::S6,
+   ARM::S7,  ARM::S8,  ARM::S9,  ARM::S10, ARM::S11, ARM::S12, ARM::S13,
+   ARM::S14, ARM::S15, ARM::S16, ARM::S17, ARM::S18, ARM::S19, ARM::S20,
+   ARM::S21, ARM::S22, ARM::S23, ARM::S24, ARM::S25, ARM::S26, ARM::S27,
+   ARM::S28, ARM::S29, ARM::S30, ARM::S31
+ };
+
+ // Appends the S register encoded by RegNo to Inst; Fail above 31.
+ static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+   if (RegNo >= 32)
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createReg(SPRDecoderTable[RegNo]));
+   return MCDisassembler::Success;
+ }
+
+ // Register enum for each 5-bit D-register encoding (D0-D31).
+ static const uint16_t DPRDecoderTable[] = {
+   ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,  ARM::D4,  ARM::D5,  ARM::D6,
+   ARM::D7,  ARM::D8,  ARM::D9,  ARM::D10, ARM::D11, ARM::D12, ARM::D13,
+   ARM::D14, ARM::D15, ARM::D16, ARM::D17, ARM::D18, ARM::D19, ARM::D20,
+   ARM::D21, ARM::D22, ARM::D23, ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+   ARM::D28, ARM::D29, ARM::D30, ARM::D31
+ };
+
+ // Appends the D register encoded by RegNo to Inst. Encodings above 31 are
+ // rejected; when the subtarget has FeatureD16 set, so is anything above 15.
+ static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+   const FeatureBitset &Features = static_cast<const MCDisassembler *>(Decoder)
+                                       ->getSubtargetInfo()
+                                       .getFeatureBits();
+
+   // Highest encoding accepted for this subtarget.
+   const bool OnlyD16 = Features[ARM::FeatureD16];
+   const unsigned HighestReg = OnlyD16 ? 15 : 31;
+   if (RegNo > HighestReg)
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createReg(DPRDecoderTable[RegNo]));
+   return MCDisassembler::Success;
+ }
+
+ // Decodes a D register restricted to encodings 0-7; higher ones are Fail.
+ static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+   return RegNo <= 7 ? DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder)
+                     : MCDisassembler::Fail;
+ }
+
+ // Decodes a D register restricted to encodings 0-15; higher ones are Fail.
+ static DecodeStatus
+ DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo,
+                             uint64_t Address, const void *Decoder) {
+   return RegNo <= 15 ? DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder)
+                      : MCDisassembler::Fail;
+ }
+
+ // Register enum for each Q register (Q0-Q15).
+ static const uint16_t QPRDecoderTable[] = {
+   ARM::Q0, ARM::Q1, ARM::Q2,  ARM::Q3,  ARM::Q4,  ARM::Q5,  ARM::Q6,  ARM::Q7,
+   ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11, ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15
+ };
+
+
+ // Appends the Q register selected by RegNo to Inst. Only even encodings in
+ // 0-31 are accepted; the table index is the encoding halved.
+ static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+   if (RegNo > 31)
+     return MCDisassembler::Fail;
+   if (RegNo % 2 != 0)
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createReg(QPRDecoderTable[RegNo / 2]));
+   return MCDisassembler::Success;
+ }
+
+ // Register enum for the pair of consecutive D registers starting at each
+ // encoding 0-30: even encodings map to a Q register, odd ones to a Dn_Dn+1
+ // pair.
+ static const uint16_t DPairDecoderTable[] = {
+   ARM::Q0,  ARM::D1_D2,
+   ARM::Q1,  ARM::D3_D4,
+   ARM::Q2,  ARM::D5_D6,
+   ARM::Q3,  ARM::D7_D8,
+   ARM::Q4,  ARM::D9_D10,
+   ARM::Q5,  ARM::D11_D12,
+   ARM::Q6,  ARM::D13_D14,
+   ARM::Q7,  ARM::D15_D16,
+   ARM::Q8,  ARM::D17_D18,
+   ARM::Q9,  ARM::D19_D20,
+   ARM::Q10, ARM::D21_D22,
+   ARM::Q11, ARM::D23_D24,
+   ARM::Q12, ARM::D25_D26,
+   ARM::Q13, ARM::D27_D28,
+   ARM::Q14, ARM::D29_D30,
+   ARM::Q15
+ };
+
+ // Appends the D-register pair starting at RegNo to Inst; Fail above 30.
+ static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+   if (RegNo >= 31)
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createReg(DPairDecoderTable[RegNo]));
+   return MCDisassembler::Success;
+ }
+
+ // Register enum for the spaced pair Dn_Dn+2 starting at each encoding 0-29.
+ static const uint16_t DPairSpacedDecoderTable[] = {
+   ARM::D0_D2,   ARM::D1_D3,   ARM::D2_D4,   ARM::D3_D5,   ARM::D4_D6,
+   ARM::D5_D7,   ARM::D6_D8,   ARM::D7_D9,   ARM::D8_D10,  ARM::D9_D11,
+   ARM::D10_D12, ARM::D11_D13, ARM::D12_D14, ARM::D13_D15, ARM::D14_D16,
+   ARM::D15_D17, ARM::D16_D18, ARM::D17_D19, ARM::D18_D20, ARM::D19_D21,
+   ARM::D20_D22, ARM::D21_D23, ARM::D22_D24, ARM::D23_D25, ARM::D24_D26,
+   ARM::D25_D27, ARM::D26_D28, ARM::D27_D29, ARM::D28_D30, ARM::D29_D31
+ };
+
+ // Appends the spaced D-register pair starting at RegNo to Inst; Fail above
+ // 29.
+ static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
+                                                    unsigned RegNo,
+                                                    uint64_t Address,
+                                                    const void *Decoder) {
+   if (RegNo >= 30)
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createReg(DPairSpacedDecoderTable[RegNo]));
+   return MCDisassembler::Success;
+ }
+
+ // Appends a predicate operand pair (condition code, condition register) for
+ // the encoded condition Val. The register is 0 for AL and CPSR otherwise.
+ // Fails for 0xF, and for 0xE on tBcc (AL is not allowed on Thumb1 branches).
+ static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+   const bool IsThumb1CondBranch = Inst.getOpcode() == ARM::tBcc;
+   if (Val == 0xF || (IsThumb1CondBranch && Val == 0xE))
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createImm(Val));
+   Inst.addOperand(MCOperand::createReg(Val == ARMCC::AL ? 0 : ARM::CPSR));
+   return MCDisassembler::Success;
+ }
+
+ // Appends the optional flag-setting operand: CPSR when Val is non-zero,
+ // register 0 otherwise. Always succeeds.
+ static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+   Inst.addOperand(MCOperand::createReg(Val ? ARM::CPSR : 0));
+   return MCDisassembler::Success;
+ }
+
+ // Decodes a register-shifted-by-immediate operand. Val holds Rm in bits 3-0,
+ // the shift type in bits 6-5 and the shift amount in bits 11-7. Two operands
+ // are appended: the register, then an immediate combining the shift kind
+ // with the amount shifted left by 3.
+ static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned Rm = fieldFromInstruction(Val, 0, 4);
+   const unsigned ShiftType = fieldFromInstruction(Val, 5, 2);
+   const unsigned Amount = fieldFromInstruction(Val, 7, 5);
+
+   // Register-immediate form: the shifted register comes first.
+   if (!Check(Status, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   ARM_AM::ShiftOpc Kind = ARM_AM::lsl;
+   if (ShiftType == 1)
+     Kind = ARM_AM::lsr;
+   else if (ShiftType == 2)
+     Kind = ARM_AM::asr;
+   else if (ShiftType == 3)
+     // A rotate by zero is reported as RRX.
+     Kind = (Amount == 0) ? ARM_AM::rrx : ARM_AM::ror;
+
+   const unsigned Packed = Kind | (Amount << 3);
+   Inst.addOperand(MCOperand::createImm(Packed));
+
+   return Status;
+ }
+
+ // Decodes a register-shifted-by-register operand. Val holds Rm in bits 3-0,
+ // the shift type in bits 6-5 and Rs in bits 11-8. Three operands are
+ // appended: Rm, Rs, and the shift kind as an immediate.
+ static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned Rm = fieldFromInstruction(Val, 0, 4);
+   const unsigned ShiftType = fieldFromInstruction(Val, 5, 2);
+   const unsigned Rs = fieldFromInstruction(Val, 8, 4);
+
+   // Register-register form: shifted register first, then the shift register.
+   if (!Check(Status, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+     return MCDisassembler::Fail;
+   if (!Check(Status, DecodeGPRnopcRegisterClass(Inst, Rs, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   ARM_AM::ShiftOpc Kind = ARM_AM::lsl;
+   if (ShiftType == 1)
+     Kind = ARM_AM::lsr;
+   else if (ShiftType == 2)
+     Kind = ARM_AM::asr;
+   else if (ShiftType == 3)
+     Kind = ARM_AM::ror;
+
+   Inst.addOperand(MCOperand::createImm(Kind));
+
+   return Status;
+ }
+
+ // Decodes a 16-bit register-list bitmask: one GPR operand is appended for
+ // every set bit, lowest register first. An empty list is rejected. For the
+ // writeback opcodes listed below, a list that contains the base register
+ // (operand 0) yields SoftFail.
+ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   // For these opcodes the base register must not also be in the list.
+   bool CheckBaseNotInList = false;
+   unsigned BaseReg = 0;
+   switch (Inst.getOpcode()) {
+   case ARM::LDMIA_UPD:
+   case ARM::LDMDB_UPD:
+   case ARM::LDMIB_UPD:
+   case ARM::LDMDA_UPD:
+   case ARM::t2LDMIA_UPD:
+   case ARM::t2LDMDB_UPD:
+   case ARM::t2STMIA_UPD:
+   case ARM::t2STMDB_UPD:
+     CheckBaseNotInList = true;
+     BaseReg = Inst.getOperand(0).getReg();
+     break;
+   default:
+     break;
+   }
+
+   // Empty register lists are not allowed.
+   if (Val == 0)
+     return MCDisassembler::Fail;
+
+   for (unsigned Bit = 0; Bit < 16; ++Bit) {
+     if (((Val >> Bit) & 1) == 0)
+       continue;
+
+     if (!Check(Status, DecodeGPRRegisterClass(Inst, Bit, Address, Decoder)))
+       return MCDisassembler::Fail;
+
+     // Compare the register just appended against the writeback base.
+     if (CheckBaseNotInList && BaseReg == (Inst.end() - 1)->getReg())
+       Check(Status, MCDisassembler::SoftFail);
+   }
+
+   return Status;
+ }
+
+ // Decodes a list of consecutive S registers. Val holds the first register in
+ // bits 12-8 and the register count in bits 7-0. A zero count, or a list that
+ // would run past S31, is clamped to a valid range and reported as SoftFail.
+ static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned First = fieldFromInstruction(Val, 8, 5);
+   unsigned Count = fieldFromInstruction(Val, 0, 8);
+
+   // In case of unpredictable encoding, tweak the operands.
+   if (Count == 0 || First + Count > 32) {
+     if (First + Count > 32)
+       Count = 32 - First;
+     if (Count < 1u)
+       Count = 1u;
+     Status = MCDisassembler::SoftFail;
+   }
+
+   // Append First, First+1, ... for Count registers in total.
+   for (unsigned Reg = First, End = First + Count; Reg != End; ++Reg) {
+     if (!Check(Status, DecodeSPRRegisterClass(Inst, Reg, Address, Decoder)))
+       return MCDisassembler::Fail;
+   }
+
+   return Status;
+ }
+
+ // Decodes a list of consecutive D registers. Val holds the first register in
+ // bits 12-8 and the register count in bits 7-1. A zero count, a count above
+ // 16, or a list that would run past D31 is clamped to a valid range and
+ // reported as SoftFail.
+ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned First = fieldFromInstruction(Val, 8, 5);
+   unsigned Count = fieldFromInstruction(Val, 1, 7);
+
+   // In case of unpredictable encoding, tweak the operands.
+   if (Count == 0 || Count > 16 || First + Count > 32) {
+     if (First + Count > 32)
+       Count = 32 - First;
+     if (Count < 1u)
+       Count = 1u;
+     if (Count > 16u)
+       Count = 16u;
+     Status = MCDisassembler::SoftFail;
+   }
+
+   // Append First, First+1, ... for Count registers in total.
+   for (unsigned Reg = First, End = First + Count; Reg != End; ++Reg) {
+     if (!Check(Status, DecodeDPRRegisterClass(Inst, Reg, Address, Decoder)))
+       return MCDisassembler::Fail;
+   }
+
+   return Status;
+ }
+
+ // Decodes a bitfield mask operand. Val holds the msb in bits 9-5 and the lsb
+ // in bits 4-0; the appended immediate is a 32-bit mask with zeros in bit
+ // positions lsb..msb and ones everywhere else.
+ static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Val,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+   const unsigned MsbPos = fieldFromInstruction(Val, 5, 5);
+   unsigned LsbPos = fieldFromInstruction(Val, 0, 5);
+
+   DecodeStatus Status = MCDisassembler::Success;
+   if (LsbPos > MsbPos) {
+     // Flag the encoding, but never build an operand with lsb > msb: printing
+     // such an MCInst would crash. Clamp lsb to msb instead.
+     Check(Status, MCDisassembler::SoftFail);
+     LsbPos = MsbPos;
+   }
+
+   // Ones in bits msb..0 (all 32 bits when msb is 31, to avoid a 32-bit shift).
+   const uint32_t UpToMsb =
+       (MsbPos != 31) ? ((1U << (MsbPos + 1)) - 1) : 0xFFFFFFFF;
+   // Ones strictly below lsb.
+   const uint32_t BelowLsb = (1U << LsbPos) - 1;
+
+   // XOR leaves ones exactly on [msb, lsb]; invert to get the final mask.
+   const uint32_t FieldBits = UpToMsb ^ BelowLsb;
+   Inst.addOperand(MCOperand::createImm(~FieldBits));
+   return Status;
+ }
+
+ // Decodes a coprocessor load/store instruction (the LDC/STC families listed
+ // in the switches below, ARM and Thumb2 variants).
+ //
+ // Operands are appended in this order: coprocessor number, CRd, the base
+ // register Rn, an immediate whose form depends on the addressing variant,
+ // and -- for the ARM-mode LDC/LDCL/STC/STCL opcodes only -- the predicate
+ // pair. Returns Fail for rejected encodings, otherwise the accumulated
+ // status.
+ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned pred = fieldFromInstruction(Insn, 28, 4);
+ unsigned CRd = fieldFromInstruction(Insn, 12, 4);
+ unsigned coproc = fieldFromInstruction(Insn, 8, 4);
+ unsigned imm = fieldFromInstruction(Insn, 0, 8);
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ unsigned U = fieldFromInstruction(Insn, 23, 1);
+
+ // For these opcodes coprocessor numbers 10 and 11 are rejected --
+ // presumably because those encodings belong to VFP/NEON instructions
+ // (TODO confirm).
+ switch (Inst.getOpcode()) {
+ case ARM::LDC_OFFSET:
+ case ARM::LDC_PRE:
+ case ARM::LDC_POST:
+ case ARM::LDC_OPTION:
+ case ARM::LDCL_OFFSET:
+ case ARM::LDCL_PRE:
+ case ARM::LDCL_POST:
+ case ARM::LDCL_OPTION:
+ case ARM::STC_OFFSET:
+ case ARM::STC_PRE:
+ case ARM::STC_POST:
+ case ARM::STC_OPTION:
+ case ARM::STCL_OFFSET:
+ case ARM::STCL_PRE:
+ case ARM::STCL_POST:
+ case ARM::STCL_OPTION:
+ case ARM::t2LDC_OFFSET:
+ case ARM::t2LDC_PRE:
+ case ARM::t2LDC_POST:
+ case ARM::t2LDC_OPTION:
+ case ARM::t2LDCL_OFFSET:
+ case ARM::t2LDCL_PRE:
+ case ARM::t2LDCL_POST:
+ case ARM::t2LDCL_OPTION:
+ case ARM::t2STC_OFFSET:
+ case ARM::t2STC_PRE:
+ case ARM::t2STC_POST:
+ case ARM::t2STC_OPTION:
+ case ARM::t2STCL_OFFSET:
+ case ARM::t2STCL_PRE:
+ case ARM::t2STCL_POST:
+ case ARM::t2STCL_OPTION:
+ if (coproc == 0xA || coproc == 0xB)
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // With the v8 operations feature, only coprocessor 14 is accepted.
+ const FeatureBitset &featureBits =
+ ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ if (featureBits[ARM::HasV8Ops] && (coproc != 14))
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createImm(coproc));
+ Inst.addOperand(MCOperand::createImm(CRd));
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // The immediate operand is built differently per addressing variant.
+ switch (Inst.getOpcode()) {
+ // Offset and pre-indexed forms: add/sub (from U) and the 8-bit offset are
+ // packed with ARM_AM::getAM5Opc.
+ case ARM::t2LDC2_OFFSET:
+ case ARM::t2LDC2L_OFFSET:
+ case ARM::t2LDC2_PRE:
+ case ARM::t2LDC2L_PRE:
+ case ARM::t2STC2_OFFSET:
+ case ARM::t2STC2L_OFFSET:
+ case ARM::t2STC2_PRE:
+ case ARM::t2STC2L_PRE:
+ case ARM::LDC2_OFFSET:
+ case ARM::LDC2L_OFFSET:
+ case ARM::LDC2_PRE:
+ case ARM::LDC2L_PRE:
+ case ARM::STC2_OFFSET:
+ case ARM::STC2L_OFFSET:
+ case ARM::STC2_PRE:
+ case ARM::STC2L_PRE:
+ case ARM::t2LDC_OFFSET:
+ case ARM::t2LDCL_OFFSET:
+ case ARM::t2LDC_PRE:
+ case ARM::t2LDCL_PRE:
+ case ARM::t2STC_OFFSET:
+ case ARM::t2STCL_OFFSET:
+ case ARM::t2STC_PRE:
+ case ARM::t2STCL_PRE:
+ case ARM::LDC_OFFSET:
+ case ARM::LDCL_OFFSET:
+ case ARM::LDC_PRE:
+ case ARM::LDCL_PRE:
+ case ARM::STC_OFFSET:
+ case ARM::STCL_OFFSET:
+ case ARM::STC_PRE:
+ case ARM::STCL_PRE:
+ imm = ARM_AM::getAM5Opc(U ? ARM_AM::add : ARM_AM::sub, imm);
+ Inst.addOperand(MCOperand::createImm(imm));
+ break;
+ // Post-indexed forms: U is stored in bit 8 above the 8-bit offset, then
+ // control falls through to append the immediate.
+ case ARM::t2LDC2_POST:
+ case ARM::t2LDC2L_POST:
+ case ARM::t2STC2_POST:
+ case ARM::t2STC2L_POST:
+ case ARM::LDC2_POST:
+ case ARM::LDC2L_POST:
+ case ARM::STC2_POST:
+ case ARM::STC2L_POST:
+ case ARM::t2LDC_POST:
+ case ARM::t2LDCL_POST:
+ case ARM::t2STC_POST:
+ case ARM::t2STCL_POST:
+ case ARM::LDC_POST:
+ case ARM::LDCL_POST:
+ case ARM::STC_POST:
+ case ARM::STCL_POST:
+ imm |= U << 8;
+ LLVM_FALLTHROUGH;
+ default:
+ // The 'option' variant doesn't encode 'U' in the immediate since
+ // the immediate is unsigned [0,255].
+ Inst.addOperand(MCOperand::createImm(imm));
+ break;
+ }
+
+ // Only these ARM-mode opcodes get a predicate decoded from bits 31-28; no
+ // predicate operand is appended here for any other opcode.
+ switch (Inst.getOpcode()) {
+ case ARM::LDC_OFFSET:
+ case ARM::LDC_PRE:
+ case ARM::LDC_POST:
+ case ARM::LDC_OPTION:
+ case ARM::LDCL_OFFSET:
+ case ARM::LDCL_PRE:
+ case ARM::LDCL_POST:
+ case ARM::LDCL_OPTION:
+ case ARM::STC_OFFSET:
+ case ARM::STC_PRE:
+ case ARM::STC_POST:
+ case ARM::STC_OPTION:
+ case ARM::STCL_OFFSET:
+ case ARM::STCL_PRE:
+ case ARM::STCL_POST:
+ case ARM::STCL_OPTION:
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ return S;
+ }
+
+ // Decodes an indexed addressing-mode-2 load/store.
+ //
+ // Operands are appended in this order: the writeback register Rn before Rt
+ // for the post-indexed stores listed below, Rt, the writeback register Rn
+ // after Rt for the post-indexed loads listed below, the base register Rn,
+ // the offset (register plus packed shift, or register 0 plus packed
+ // immediate), and finally the predicate pair. Writeback with Rn == 15 or
+ // Rn == Rt is reported as SoftFail.
+ static DecodeStatus
+ DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+ unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+ unsigned imm = fieldFromInstruction(Insn, 0, 12);
+ unsigned pred = fieldFromInstruction(Insn, 28, 4);
+ unsigned reg = fieldFromInstruction(Insn, 25, 1);
+ unsigned P = fieldFromInstruction(Insn, 24, 1);
+ unsigned W = fieldFromInstruction(Insn, 21, 1);
+
+ // On stores, the writeback operand precedes Rt.
+ switch (Inst.getOpcode()) {
+ case ARM::STR_POST_IMM:
+ case ARM::STR_POST_REG:
+ case ARM::STRB_POST_IMM:
+ case ARM::STRB_POST_REG:
+ case ARM::STRT_POST_REG:
+ case ARM::STRT_POST_IMM:
+ case ARM::STRBT_POST_REG:
+ case ARM::STRBT_POST_IMM:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // On loads, the writeback operand comes after Rt.
+ switch (Inst.getOpcode()) {
+ case ARM::LDR_POST_IMM:
+ case ARM::LDR_POST_REG:
+ case ARM::LDRB_POST_IMM:
+ case ARM::LDRB_POST_REG:
+ case ARM::LDRBT_POST_REG:
+ case ARM::LDRBT_POST_IMM:
+ case ARM::LDRT_POST_REG:
+ case ARM::LDRT_POST_IMM:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // The base register operand itself.
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // Bit 23 selects add (set) or subtract (clear) for the offset.
+ ARM_AM::AddrOpc Op = ARM_AM::add;
+ if (!fieldFromInstruction(Insn, 23, 1))
+ Op = ARM_AM::sub;
+
+ // P == 0 is post-indexed (always writes back); P == 1 with W == 1 is
+ // pre-indexed with writeback. Otherwise idx_mode stays 0.
+ bool writeback = (P == 0) || (W == 1);
+ unsigned idx_mode = 0;
+ if (P && writeback)
+ idx_mode = ARMII::IndexModePre;
+ else if (!P && writeback)
+ idx_mode = ARMII::IndexModePost;
+
+ if (writeback && (Rn == 15 || Rn == Rt))
+ S = MCDisassembler::SoftFail; // UNPREDICTABLE
+
+ if (reg) {
+ // Register offset: Rm, then a packed (add/sub, amount, shift, index mode).
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ ARM_AM::ShiftOpc Opc = ARM_AM::lsl;
+ // NOTE(review): the selector is a 2-bit field, so the default case below
+ // looks unreachable -- confirm.
+ switch( fieldFromInstruction(Insn, 5, 2)) {
+ case 0:
+ Opc = ARM_AM::lsl;
+ break;
+ case 1:
+ Opc = ARM_AM::lsr;
+ break;
+ case 2:
+ Opc = ARM_AM::asr;
+ break;
+ case 3:
+ Opc = ARM_AM::ror;
+ break;
+ default:
+ return MCDisassembler::Fail;
+ }
+ unsigned amt = fieldFromInstruction(Insn, 7, 5);
+ // A rotate by zero is reported as RRX.
+ if (Opc == ARM_AM::ror && amt == 0)
+ Opc = ARM_AM::rrx;
+ // Note: this 'imm' shadows the 12-bit immediate extracted at the top.
+ unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode);
+
+ Inst.addOperand(MCOperand::createImm(imm));
+ } else {
+ // Immediate offset: no offset register, then the packed 12-bit immediate.
+ Inst.addOperand(MCOperand::createReg(0));
+ unsigned tmp = ARM_AM::getAM2Opc(Op, imm, ARM_AM::lsl, idx_mode);
+ Inst.addOperand(MCOperand::createImm(tmp));
+ }
+
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+ }
+
+ // Decodes a memory operand of the form base register plus or minus a
+ // shifted index register. Val holds Rn in bits 16-13, the add/subtract flag
+ // in bit 12, the shift amount in bits 11-7, the shift type in bits 6-5 and
+ // Rm in bits 3-0. Three operands are appended: Rn, Rm, and the value packed
+ // by ARM_AM::getAM2Opc.
+ static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned Rn = fieldFromInstruction(Val, 13, 4);
+   const unsigned Rm = fieldFromInstruction(Val, 0, 4);
+   const unsigned ShiftType = fieldFromInstruction(Val, 5, 2);
+   const unsigned Amount = fieldFromInstruction(Val, 7, 5);
+   const unsigned IsAdd = fieldFromInstruction(Val, 12, 1);
+
+   ARM_AM::ShiftOpc Kind = ARM_AM::lsl;
+   if (ShiftType == 1)
+     Kind = ARM_AM::lsr;
+   else if (ShiftType == 2)
+     Kind = ARM_AM::asr;
+   else if (ShiftType == 3)
+     // A rotate by zero is reported as RRX.
+     Kind = (Amount == 0) ? ARM_AM::rrx : ARM_AM::ror;
+
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+     return MCDisassembler::Fail;
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   const unsigned Packed =
+       ARM_AM::getAM2Opc(IsAdd ? ARM_AM::add : ARM_AM::sub, Amount, Kind);
+   Inst.addOperand(MCOperand::createImm(Packed));
+
+   return Status;
+ }
+
+ // Decodes an addressing-mode-3 load/store (the halfword, signed byte and
+ // doubleword opcodes listed in the switches below).
+ //
+ // The first part flags UNPREDICTABLE register combinations as SoftFail per
+ // opcode family. The second part appends operands in this order: the
+ // writeback Rn before Rt for stores, Rt (and Rt+1 for the doubleword
+ // forms), the writeback Rn after Rt for loads, the base Rn, the offset
+ // (register 0 plus packed immediate when 'type' is set, otherwise Rm plus
+ // the packed flags), and finally the predicate pair.
+ static DecodeStatus
+ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+ // Bit 22: set for the immediate-offset form, clear for the register form
+ // (see how the offset operand is built at the end).
+ unsigned type = fieldFromInstruction(Insn, 22, 1);
+ unsigned imm = fieldFromInstruction(Insn, 8, 4);
+ // Inverse of instruction bit 23, placed in bit 8 of the packed operand.
+ unsigned U = ((~fieldFromInstruction(Insn, 23, 1)) & 1) << 8;
+ unsigned pred = fieldFromInstruction(Insn, 28, 4);
+ unsigned W = fieldFromInstruction(Insn, 21, 1);
+ unsigned P = fieldFromInstruction(Insn, 24, 1);
+ unsigned Rt2 = Rt + 1;
+
+ bool writeback = (W == 1) | (P == 0);
+
+ // For {LD,ST}RD, Rt must be even, else undefined.
+ switch (Inst.getOpcode()) {
+ case ARM::STRD:
+ case ARM::STRD_PRE:
+ case ARM::STRD_POST:
+ case ARM::LDRD:
+ case ARM::LDRD_PRE:
+ case ARM::LDRD_POST:
+ if (Rt & 0x1) S = MCDisassembler::SoftFail;
+ break;
+ default:
+ break;
+ }
+ // Per-family checks; each failing condition downgrades S to SoftFail.
+ switch (Inst.getOpcode()) {
+ case ARM::STRD:
+ case ARM::STRD_PRE:
+ case ARM::STRD_POST:
+ if (P == 0 && W == 1)
+ S = MCDisassembler::SoftFail;
+
+ if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2))
+ S = MCDisassembler::SoftFail;
+ if (type && Rm == 15)
+ S = MCDisassembler::SoftFail;
+ if (Rt2 == 15)
+ S = MCDisassembler::SoftFail;
+ if (!type && fieldFromInstruction(Insn, 8, 4))
+ S = MCDisassembler::SoftFail;
+ break;
+ case ARM::STRH:
+ case ARM::STRH_PRE:
+ case ARM::STRH_POST:
+ if (Rt == 15)
+ S = MCDisassembler::SoftFail;
+ if (writeback && (Rn == 15 || Rn == Rt))
+ S = MCDisassembler::SoftFail;
+ if (!type && Rm == 15)
+ S = MCDisassembler::SoftFail;
+ break;
+ case ARM::LDRD:
+ case ARM::LDRD_PRE:
+ case ARM::LDRD_POST:
+ // Immediate form with Rn == 15: only the Rt2 check applies.
+ if (type && Rn == 15){
+ if (Rt2 == 15)
+ S = MCDisassembler::SoftFail;
+ break;
+ }
+ if (P == 0 && W == 1)
+ S = MCDisassembler::SoftFail;
+ if (!type && (Rt2 == 15 || Rm == 15 || Rm == Rt || Rm == Rt2))
+ S = MCDisassembler::SoftFail;
+ if (!type && writeback && Rn == 15)
+ S = MCDisassembler::SoftFail;
+ if (writeback && (Rn == Rt || Rn == Rt2))
+ S = MCDisassembler::SoftFail;
+ break;
+ case ARM::LDRH:
+ case ARM::LDRH_PRE:
+ case ARM::LDRH_POST:
+ // Immediate form with Rn == 15: only the Rt check applies.
+ if (type && Rn == 15){
+ if (Rt == 15)
+ S = MCDisassembler::SoftFail;
+ break;
+ }
+ if (Rt == 15)
+ S = MCDisassembler::SoftFail;
+ if (!type && Rm == 15)
+ S = MCDisassembler::SoftFail;
+ if (!type && writeback && (Rn == 15 || Rn == Rt))
+ S = MCDisassembler::SoftFail;
+ break;
+ case ARM::LDRSH:
+ case ARM::LDRSH_PRE:
+ case ARM::LDRSH_POST:
+ case ARM::LDRSB:
+ case ARM::LDRSB_PRE:
+ case ARM::LDRSB_POST:
+ // Immediate form with Rn == 15: only the Rt check applies.
+ if (type && Rn == 15){
+ if (Rt == 15)
+ S = MCDisassembler::SoftFail;
+ break;
+ }
+ if (type && (Rt == 15 || (writeback && Rn == Rt)))
+ S = MCDisassembler::SoftFail;
+ if (!type && (Rt == 15 || Rm == 15))
+ S = MCDisassembler::SoftFail;
+ if (!type && writeback && (Rn == 15 || Rn == Rt))
+ S = MCDisassembler::SoftFail;
+ break;
+ default:
+ break;
+ }
+
+ if (writeback) { // Writeback
+ // Record the index mode in the bits above bit 8 of the packed operand.
+ if (P)
+ U |= ARMII::IndexModePre << 9;
+ else
+ U |= ARMII::IndexModePost << 9;
+
+ // On stores, the writeback operand precedes Rt.
+ switch (Inst.getOpcode()) {
+ case ARM::STRD:
+ case ARM::STRD_PRE:
+ case ARM::STRD_POST:
+ case ARM::STRH:
+ case ARM::STRH_PRE:
+ case ARM::STRH_POST:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // Doubleword forms also take the second transfer register, Rt+1. With
+ // Rt == 15 this is encoding 16, which DecodeGPRRegisterClass rejects.
+ switch (Inst.getOpcode()) {
+ case ARM::STRD:
+ case ARM::STRD_PRE:
+ case ARM::STRD_POST:
+ case ARM::LDRD:
+ case ARM::LDRD_PRE:
+ case ARM::LDRD_POST:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ if (writeback) {
+ // On loads, the writeback operand comes after Rt.
+ switch (Inst.getOpcode()) {
+ case ARM::LDRD:
+ case ARM::LDRD_PRE:
+ case ARM::LDRD_POST:
+ case ARM::LDRH:
+ case ARM::LDRH_PRE:
+ case ARM::LDRH_POST:
+ case ARM::LDRSH:
+ case ARM::LDRSH_PRE:
+ case ARM::LDRSH_POST:
+ case ARM::LDRSB:
+ case ARM::LDRSB_PRE:
+ case ARM::LDRSB_POST:
+ case ARM::LDRHTr:
+ case ARM::LDRSBTr:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // The base register operand itself.
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (type) {
+ // Immediate form: no offset register; the 8-bit offset is rebuilt from
+ // the high nibble (bits 11-8) and the low nibble (bits 3-0, read as Rm).
+ Inst.addOperand(MCOperand::createReg(0));
+ Inst.addOperand(MCOperand::createImm(U | (imm << 4) | Rm));
+ } else {
+ // Register form: Rm, then just the packed flags.
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(U));
+ }
+
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+ }
+
+/// Decode the operands of an RFE instruction: first the addressing sub-mode,
+/// selected by the two-bit field at Insn[24:23], then the base register Rn
+/// taken from Insn[19:16].
+static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder) {
+  // Sub-mode chosen by each value (0..3) of the two-bit field.
+  static const unsigned SubModes[] = {ARM_AM::da, ARM_AM::ia, ARM_AM::db,
+                                      ARM_AM::ib};
+
+  DecodeStatus Result = MCDisassembler::Success;
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned PU = fieldFromInstruction(Insn, 23, 2);
+
+  Inst.addOperand(MCOperand::createImm(SubModes[PU]));
+  if (!Check(Result, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Result;
+}
+
+/// Decode a QADD-style instruction. Operands are emitted in the order
+/// Rd (Insn[15:12]), Rm (Insn[3:0]), Rn (Insn[19:16]), predicate; none of the
+/// registers may be the PC. A condition field of 0b1111 is handed to the CPS
+/// decoder instead.
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address, const void *Decoder) {
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  if (Cond == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  const unsigned Dst = fieldFromInstruction(Insn, 12, 4);
+  const unsigned SrcM = fieldFromInstruction(Insn, 0, 4);
+  const unsigned SrcN = fieldFromInstruction(Insn, 16, 4);
+
+  DecodeStatus Result = MCDisassembler::Success;
+  // Short-circuit evaluation keeps the operand order and stops at the first
+  // hard failure.
+  if (!Check(Result, DecodeGPRnopcRegisterClass(Inst, Dst, Address, Decoder)) ||
+      !Check(Result, DecodeGPRnopcRegisterClass(Inst, SrcM, Address, Decoder)) ||
+      !Check(Result, DecodeGPRnopcRegisterClass(Inst, SrcN, Address, Decoder)) ||
+      !Check(Result, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Result;
+}
+
+/// Decode a load/store-multiple instruction.
+///
+/// With an ordinary condition field the operands are Rn, Rn again (tied), the
+/// predicate and the register list. With a condition field of 0b1111 the
+/// encoding is ambiguous with RFE and SRS: the opcode is rewritten to the
+/// matching RFE (for LDM*) or SRS (for STM*) opcode and decoded as such.
+static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
+                                                          unsigned Insn,
+                                                          uint64_t Address,
+                                                          const void *Decoder) {
+  DecodeStatus Result = MCDisassembler::Success;
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+
+  if (Cond != 0xF) {
+    const unsigned Base = fieldFromInstruction(Insn, 16, 4);
+    const unsigned Regs = fieldFromInstruction(Insn, 0, 16);
+    // The base register is added twice; the second copy is the tied operand.
+    if (!Check(Result, DecodeGPRRegisterClass(Inst, Base, Address, Decoder)) ||
+        !Check(Result, DecodeGPRRegisterClass(Inst, Base, Address, Decoder)) ||
+        !Check(Result, DecodePredicateOperand(Inst, Cond, Address, Decoder)) ||
+        !Check(Result, DecodeRegListOperand(Inst, Regs, Address, Decoder)))
+      return MCDisassembler::Fail;
+    return Result;
+  }
+
+  // Ambiguous with RFE and SRS: pick the replacement opcode.
+  switch (Inst.getOpcode()) {
+  case ARM::LDMDA:     Inst.setOpcode(ARM::RFEDA);     break;
+  case ARM::LDMDA_UPD: Inst.setOpcode(ARM::RFEDA_UPD); break;
+  case ARM::LDMDB:     Inst.setOpcode(ARM::RFEDB);     break;
+  case ARM::LDMDB_UPD: Inst.setOpcode(ARM::RFEDB_UPD); break;
+  case ARM::LDMIA:     Inst.setOpcode(ARM::RFEIA);     break;
+  case ARM::LDMIA_UPD: Inst.setOpcode(ARM::RFEIA_UPD); break;
+  case ARM::LDMIB:     Inst.setOpcode(ARM::RFEIB);     break;
+  case ARM::LDMIB_UPD: Inst.setOpcode(ARM::RFEIB_UPD); break;
+  case ARM::STMDA:     Inst.setOpcode(ARM::SRSDA);     break;
+  case ARM::STMDA_UPD: Inst.setOpcode(ARM::SRSDA_UPD); break;
+  case ARM::STMDB:     Inst.setOpcode(ARM::SRSDB);     break;
+  case ARM::STMDB_UPD: Inst.setOpcode(ARM::SRSDB_UPD); break;
+  case ARM::STMIA:     Inst.setOpcode(ARM::SRSIA);     break;
+  case ARM::STMIA_UPD: Inst.setOpcode(ARM::SRSIA_UPD); break;
+  case ARM::STMIB:     Inst.setOpcode(ARM::SRSIB);     break;
+  case ARM::STMIB_UPD: Inst.setOpcode(ARM::SRSIB_UPD); break;
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  // Loads (bit 20 set) become RFEs and share the RFE operand decoder.
+  const bool IsLoad = fieldFromInstruction(Insn, 20, 1) != 0;
+  if (IsLoad)
+    return DecodeRFEInstruction(Inst, Insn, Address, Decoder);
+
+  // Stores become SRSs, whose only operand is the mode. The SRS encoding
+  // additionally requires bit 22 to be set.
+  if (fieldFromInstruction(Insn, 22, 1) != 1)
+    return MCDisassembler::Fail;
+
+  // NOTE(review): only Insn[3:0] is used for the mode operand here; confirm
+  // the field width against the SRS encoding.
+  Inst.addOperand(MCOperand::createImm(fieldFromInstruction(Insn, 0, 4)));
+  return Result;
+}
+
+/// Decode a HINT instruction: the 8-bit hint immediate (Insn[7:0]) followed by
+/// the predicate (Insn[31:28]). Also flags the UNPREDICTABLE predicated ESB
+/// encoding.
+static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address, const void *Decoder) {
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  const unsigned Hint = fieldFromInstruction(Insn, 0, 8);
+
+  DecodeStatus Result = MCDisassembler::Success;
+  Inst.addOperand(MCOperand::createImm(Hint));
+  if (!Check(Result, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // Hint 0x10 is ESB, which is unpredictable unless the predicate is AL (0xe).
+  // Without the RAS extension the encoding is just a NOP, so every predicate
+  // is acceptable there.
+  const MCDisassembler *Dis = static_cast<const MCDisassembler *>(Decoder);
+  const FeatureBitset &Features = Dis->getSubtargetInfo().getFeatureBits();
+  const bool PredicatedESB = Hint == 0x10 && Cond != 0xe;
+  if (PredicatedESB && Features[ARM::FeatureRAS])
+    Result = MCDisassembler::SoftFail;
+
+  return Result;
+}
+
+/// Decode an ARM-mode CPS instruction and select the CPS1p / CPS2p / CPS3p
+/// opcode from the imod (Insn[19:18]) and M (Insn[17]) fields.
+///
+/// Returns Fail for encodings that are not CPS at all or that use imod == 01,
+/// and SoftFail for combinations that leave an unused field non-zero.
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder) {
+  // Several callers dispatch here without having validated the whole
+  // encoding, so the fixed bits are verified first.
+  const bool FixedBitsOK = fieldFromInstruction(Insn, 5, 1) == 0 &&
+                           fieldFromInstruction(Insn, 16, 1) == 0 &&
+                           fieldFromInstruction(Insn, 20, 8) == 0x10;
+  if (!FixedBitsOK)
+    return MCDisassembler::Fail;
+
+  const unsigned IMod = fieldFromInstruction(Insn, 18, 2);
+  const unsigned ChangeMode = fieldFromInstruction(Insn, 17, 1);
+  const unsigned IFlags = fieldFromInstruction(Insn, 6, 3);
+  const unsigned Mode = fieldFromInstruction(Insn, 0, 5);
+
+  // imod == '01' is technically UNPREDICTABLE, but that value cannot be
+  // printed, so reporting SoftFail would not be useful; fail outright.
+  if (IMod == 1)
+    return MCDisassembler::Fail;
+
+  DecodeStatus Result = MCDisassembler::Success;
+
+  if (IMod) {
+    // Interrupt flags are being changed; the mode is an operand only when M
+    // is set.
+    Inst.setOpcode(ChangeMode ? ARM::CPS3p : ARM::CPS2p);
+    Inst.addOperand(MCOperand::createImm(IMod));
+    Inst.addOperand(MCOperand::createImm(IFlags));
+    if (ChangeMode)
+      Inst.addOperand(MCOperand::createImm(Mode));
+    else if (Mode)
+      Result = MCDisassembler::SoftFail;
+  } else {
+    // Mode-only form. With M clear as well (imod == '00' && M == '0') the
+    // encoding is UNPREDICTABLE; otherwise stray iflags bits are.
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::createImm(Mode));
+    if (!ChangeMode || IFlags)
+      Result = MCDisassembler::SoftFail;
+  }
+
+  return Result;
+}
+
+/// Decode a Thumb2 CPS instruction, selecting t2CPS1p / t2CPS2p / t2CPS3p from
+/// the imod (Insn[10:9]) and M (Insn[8]) fields. When both are zero the
+/// encoding is a HINT and is decoded as t2HINT instead.
+static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address, const void *Decoder) {
+  const unsigned IMod = fieldFromInstruction(Insn, 9, 2);
+  const unsigned ChangeMode = fieldFromInstruction(Insn, 8, 1);
+  const unsigned IFlags = fieldFromInstruction(Insn, 5, 3);
+  const unsigned Mode = fieldFromInstruction(Insn, 0, 5);
+
+  // imod == '01' is technically UNPREDICTABLE, but that value cannot be
+  // printed, so reporting SoftFail would not be useful; fail outright.
+  if (IMod == 1)
+    return MCDisassembler::Fail;
+
+  DecodeStatus Result = MCDisassembler::Success;
+
+  if (IMod) {
+    Inst.setOpcode(ChangeMode ? ARM::t2CPS3p : ARM::t2CPS2p);
+    Inst.addOperand(MCOperand::createImm(IMod));
+    Inst.addOperand(MCOperand::createImm(IFlags));
+    if (ChangeMode)
+      Inst.addOperand(MCOperand::createImm(Mode));
+    else if (Mode)
+      Result = MCDisassembler::SoftFail;
+    return Result;
+  }
+
+  if (ChangeMode) {
+    Inst.setOpcode(ARM::t2CPS1p);
+    Inst.addOperand(MCOperand::createImm(Mode));
+    if (IFlags)
+      Result = MCDisassembler::SoftFail;
+    return Result;
+  }
+
+  // imod == '00' && M == '0': this is a HINT instruction. Only immediates in
+  // [0..4] are accepted.
+  const int HintImm = fieldFromInstruction(Insn, 0, 8);
+  if (HintImm > 4)
+    return MCDisassembler::Fail;
+  Inst.setOpcode(ARM::t2HINT);
+  Inst.addOperand(MCOperand::createImm(HintImm));
+  return Result;
+}
+
+/// Decode a Thumb2 MOVW/MOVT instruction: the destination register Rd
+/// (Insn[11:8]) followed by the 16-bit immediate, which is scattered over
+/// Insn[7:0], Insn[14:12], Insn[26] and Insn[19:16].
+static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
+                                             uint64_t Address, const void *Decoder) {
+  DecodeStatus Result = MCDisassembler::Success;
+
+  const unsigned DstReg = fieldFromInstruction(Insn, 8, 4);
+  const unsigned Imm16 = fieldFromInstruction(Insn, 0, 8) |
+                         (fieldFromInstruction(Insn, 12, 3) << 8) |
+                         (fieldFromInstruction(Insn, 26, 1) << 11) |
+                         (fieldFromInstruction(Insn, 16, 4) << 12);
+
+  // t2MOVTi16 gets the register operand twice (presumably destination plus
+  // tied source); every other opcode gets it once.
+  const unsigned RegCopies = (Inst.getOpcode() == ARM::t2MOVTi16) ? 2 : 1;
+  for (unsigned I = 0; I != RegCopies; ++I)
+    if (!Check(Result, DecoderGPRRegisterClass(Inst, DstReg, Address, Decoder)))
+      return MCDisassembler::Fail;
+
+  // Prefer a symbolic operand; fall back to the plain immediate.
+  const bool Symbolized =
+      tryAddingSymbolicOperand(Address, Imm16, false, 4, Inst, Decoder);
+  if (!Symbolized)
+    Inst.addOperand(MCOperand::createImm(Imm16));
+
+  return Result;
+}
+
+/// Decode an ARM-mode MOVW/MOVT instruction: the destination register Rd
+/// (Insn[15:12], PC not allowed), the 16-bit immediate assembled from
+/// Insn[11:0] and Insn[19:16], and finally the predicate (Insn[31:28]).
+static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
+                                              uint64_t Address, const void *Decoder) {
+  DecodeStatus Result = MCDisassembler::Success;
+
+  const unsigned DstReg = fieldFromInstruction(Insn, 12, 4);
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  const unsigned Imm16 = fieldFromInstruction(Insn, 0, 12) |
+                         (fieldFromInstruction(Insn, 16, 4) << 12);
+
+  // MOVTi16 gets the register operand twice (presumably destination plus
+  // tied source); every other opcode gets it once.
+  const unsigned RegCopies = (Inst.getOpcode() == ARM::MOVTi16) ? 2 : 1;
+  for (unsigned I = 0; I != RegCopies; ++I)
+    if (!Check(Result,
+               DecodeGPRnopcRegisterClass(Inst, DstReg, Address, Decoder)))
+      return MCDisassembler::Fail;
+
+  // Prefer a symbolic operand; fall back to the plain immediate.
+  const bool Symbolized =
+      tryAddingSymbolicOperand(Address, Imm16, false, 4, Inst, Decoder);
+  if (!Symbolized)
+    Inst.addOperand(MCOperand::createImm(Imm16));
+
+  if (!Check(Result, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Result;
+}
+
+/// Decode an SMLA-style instruction. Operands are emitted in the order
+/// Rd (Insn[19:16]), Rn (Insn[3:0]), Rm (Insn[11:8]), Ra (Insn[15:12]),
+/// predicate; none of the registers may be the PC. A condition field of
+/// 0b1111 is handed to the CPS decoder instead.
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address, const void *Decoder) {
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  if (Cond == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  const unsigned Dst = fieldFromInstruction(Insn, 16, 4);
+  const unsigned SrcN = fieldFromInstruction(Insn, 0, 4);
+  const unsigned SrcM = fieldFromInstruction(Insn, 8, 4);
+  const unsigned Acc = fieldFromInstruction(Insn, 12, 4);
+
+  DecodeStatus Result = MCDisassembler::Success;
+  // Short-circuit evaluation keeps the operand order and stops at the first
+  // hard failure.
+  if (!Check(Result, DecodeGPRnopcRegisterClass(Inst, Dst, Address, Decoder)) ||
+      !Check(Result, DecodeGPRnopcRegisterClass(Inst, SrcN, Address, Decoder)) ||
+      !Check(Result, DecodeGPRnopcRegisterClass(Inst, SrcM, Address, Decoder)) ||
+      !Check(Result, DecodeGPRnopcRegisterClass(Inst, Acc, Address, Decoder)) ||
+      !Check(Result, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Result;
+}
+
+/// Decode a TST-style instruction: Rn (Insn[19:16]), Rm (Insn[3:0]) and the
+/// predicate. A condition field of 0b1111 is handed to the SETPAN decoder
+/// instead.
+static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder) {
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  if (Cond == 0xF)
+    return DecodeSETPANInstruction(Inst, Insn, Address, Decoder);
+
+  const unsigned FirstReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned SecondReg = fieldFromInstruction(Insn, 0, 4);
+
+  DecodeStatus Result = MCDisassembler::Success;
+  if (!Check(Result, DecodeGPRRegisterClass(Inst, FirstReg, Address, Decoder)) ||
+      !Check(Result, DecodeGPRRegisterClass(Inst, SecondReg, Address, Decoder)) ||
+      !Check(Result, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Result;
+}
+
+/// Decode a SETPAN instruction, whose single operand is the one-bit immediate
+/// at Insn[9]. Fails unless both the v8 and v8.1a feature bits are set.
+static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address, const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler *>(Decoder);
+  const FeatureBitset &Features = Dis->getSubtargetInfo().getFeatureBits();
+  if (!(Features[ARM::HasV8_1aOps] && Features[ARM::HasV8Ops]))
+    return MCDisassembler::Fail;
+
+  // DecodeTST can dispatch here without having validated the whole encoding,
+  // so the fixed bits are verified now.
+  const bool FixedBitsOK = fieldFromInstruction(Insn, 20, 12) == 0xf11 &&
+                           fieldFromInstruction(Insn, 4, 4) == 0;
+  if (!FixedBitsOK)
+    return MCDisassembler::Fail;
+
+  // Non-zero bits in Insn[19:10] or Insn[3:0] only downgrade the result.
+  const bool ExtraBitsClear = fieldFromInstruction(Insn, 10, 10) == 0 &&
+                              fieldFromInstruction(Insn, 0, 4) == 0;
+  const DecodeStatus Result =
+      ExtraBitsClear ? MCDisassembler::Success : MCDisassembler::SoftFail;
+
+  Inst.setOpcode(ARM::SETPAN);
+  Inst.addOperand(MCOperand::createImm(fieldFromInstruction(Insn, 9, 1)));
+
+  return Result;
+}
+
+/// Decode an immediate-offset address operand: the base register Rn
+/// (Val[16:13]) followed by a 12-bit offset (Val[11:0]) whose direction is
+/// given by the add bit (Val[12]).
+///
+/// A subtracted zero offset is emitted as the sentinel INT32_MIN rather than
+/// 0. When the base register is the PC, a load-reference comment is also
+/// requested for the address the operand refers to.
+static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
+                                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned add = fieldFromInstruction(Val, 12, 1);
+  unsigned imm = fieldFromInstruction(Val, 0, 12);
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // Real signed displacement, captured before the sentinel substitution below
+  // so that a "-0" offset contributes 0 (not INT32_MIN) to the PC-relative
+  // address.
+  const int64_t Disp =
+      add ? static_cast<int64_t>(imm) : -static_cast<int64_t>(imm);
+
+  if (!add) imm *= -1;
+  if (imm == 0 && !add) imm = INT32_MIN;
+  Inst.addOperand(MCOperand::createImm(imm));
+  if (Rn == 15)
+    // The +8 is applied on top of the instruction address for PC-based loads.
+    tryAddingPcLoadReferenceComment(Address, Address + Disp + 8, Decoder);
+
+  return S;
+}
+
+/// Decode an addressing-mode-5 operand: the base register Rn (Val[12:9])
+/// followed by an immediate built by ARM_AM::getAM5Opc from the 8-bit offset
+/// (Val[7:0]) and the add/subtract direction (Val[8]).
+static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address, const void *Decoder) {
+  const unsigned BaseReg = fieldFromInstruction(Val, 9, 4);
+  // Set when the offset is added, clear when it is subtracted.
+  const bool IsAdd = fieldFromInstruction(Val, 8, 1) != 0;
+  const unsigned Offset = fieldFromInstruction(Val, 0, 8);
+
+  DecodeStatus Result = MCDisassembler::Success;
+  if (!Check(Result, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(
+      ARM_AM::getAM5Opc(IsAdd ? ARM_AM::add : ARM_AM::sub, Offset)));
+
+  return Result;
+}
+
+/// Decode the FP16 variant of an addressing-mode-5 operand: the base register
+/// Rn (Val[12:9]) followed by an immediate built by ARM_AM::getAM5FP16Opc from
+/// the 8-bit offset (Val[7:0]) and the add/subtract direction (Val[8]).
+static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
+                                               uint64_t Address, const void *Decoder) {
+  const unsigned BaseReg = fieldFromInstruction(Val, 9, 4);
+  // Set when the offset is added, clear when it is subtracted.
+  const bool IsAdd = fieldFromInstruction(Val, 8, 1) != 0;
+  const unsigned Offset = fieldFromInstruction(Val, 0, 8);
+
+  DecodeStatus Result = MCDisassembler::Success;
+  if (!Check(Result, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(
+      ARM_AM::getAM5FP16Opc(IsAdd ? ARM_AM::add : ARM_AM::sub, Offset)));
+
+  return Result;
+}
+
+/// Decode an addressing-mode-7 operand, which consists solely of a general
+/// purpose register; the work is delegated to the GPR decoder.
+static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address, const void *Decoder) {
+  const DecodeStatus Result =
+      DecodeGPRRegisterClass(Inst, Val, Address, Decoder);
+  return Result;
+}
+
+/// Decode the branch target of a Thumb2 B instruction. The operand is either
+/// a symbolic reference to Address + imm32 + 4 or the raw signed offset imm32.
+static DecodeStatus
+DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
+                     uint64_t Address, const void *Decoder) {
+  // The encoding carries J1 and J2; the offset is defined in terms of
+  //   I1 = NOT(J1 EOR S), I2 = NOT(J2 EOR S)
+  //   imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32)
+  // For single-bit values NOT(J EOR S) is simply "J equals S".
+  const unsigned Sign = fieldFromInstruction(Insn, 26, 1);
+  const unsigned I1 = (fieldFromInstruction(Insn, 13, 1) == Sign) ? 1u : 0u;
+  const unsigned I2 = (fieldFromInstruction(Insn, 11, 1) == Sign) ? 1u : 0u;
+  const unsigned High10 = fieldFromInstruction(Insn, 16, 10);
+  const unsigned Low11 = fieldFromInstruction(Insn, 0, 11);
+
+  const unsigned Packed =
+      (Sign << 23) | (I1 << 22) | (I2 << 21) | (High10 << 11) | Low11;
+  // Shifting left by one appends the trailing zero bit before sign extension.
+  const int Offset = SignExtend32<25>(Packed << 1);
+
+  const bool Symbolized = tryAddingSymbolicOperand(
+      Address, Address + Offset + 4, true, 4, Inst, Decoder);
+  if (!Symbolized)
+    Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode an ARM-mode immediate branch. The 24-bit immediate (Insn[23:0]) is
+/// scaled by four and sign-extended from 26 bits; the operand is either a
+/// symbolic reference to Address + offset + 8 or the raw offset.
+///
+/// A condition field of 0b1111 turns the instruction into BLXi, which takes
+/// Insn[24] as bit 1 of the offset and has no predicate operand.
+static DecodeStatus
+DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus Result = MCDisassembler::Success;
+
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  const bool IsBLX = Cond == 0xF;
+
+  unsigned RawImm = fieldFromInstruction(Insn, 0, 24) << 2;
+  if (IsBLX) {
+    Inst.setOpcode(ARM::BLXi);
+    RawImm |= fieldFromInstruction(Insn, 24, 1) << 1;
+  }
+
+  const int Offset = SignExtend32<26>(RawImm);
+  const bool Symbolized = tryAddingSymbolicOperand(
+      Address, Address + Offset + 8, true, 4, Inst, Decoder);
+  if (!Symbolized)
+    Inst.addOperand(MCOperand::createImm(Offset));
+
+  // BLXi carries no predicate operand.
+  if (IsBLX)
+    return Result;
+
+  if (!Check(Result, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Result;
+}
+
+
+/// Decode an addressing-mode-6 operand: a base register (Val[3:0]) followed by
+/// an alignment immediate. An alignment field (Val[5:4]) of zero yields 0;
+/// any other value yields 4 << field.
+static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address, const void *Decoder) {
+  const unsigned BaseReg = fieldFromInstruction(Val, 0, 4);
+  const unsigned AlignField = fieldFromInstruction(Val, 4, 2);
+
+  DecodeStatus Result = MCDisassembler::Success;
+  if (!Check(Result, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(AlignField ? (4 << AlignField) : 0));
+
+  return Result;
+}
+
+/// Decode the operands of a NEON VLD1-VLD4 (multiple structures) instruction.
+///
+/// Which operands are emitted depends on the opcode already set on Inst. In
+/// order: the output register(s), the writeback operand (if the opcode has
+/// one), the addrmode6 base (register + alignment) and, where applicable, the
+/// addrmode6 offset register.
+///
+/// Returns Fail as soon as any operand decoder fails; otherwise the
+/// accumulated status.
+static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  // Rd is Insn[15:12] with Insn[22] as its fifth (top) bit.
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned wb = fieldFromInstruction(Insn, 16, 4);
+  // Rn packs the base register (Insn[19:16]) with the alignment field
+  // (Insn[5:4]) in the layout DecodeAddrMode6Operand expects.
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 4, 2) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  // First output register
+  switch (Inst.getOpcode()) {
+  case ARM::VLD1q16: case ARM::VLD1q32: case ARM::VLD1q64: case ARM::VLD1q8:
+  case ARM::VLD1q16wb_fixed: case ARM::VLD1q16wb_register:
+  case ARM::VLD1q32wb_fixed: case ARM::VLD1q32wb_register:
+  case ARM::VLD1q64wb_fixed: case ARM::VLD1q64wb_register:
+  case ARM::VLD1q8wb_fixed: case ARM::VLD1q8wb_register:
+  case ARM::VLD2d16: case ARM::VLD2d32: case ARM::VLD2d8:
+  case ARM::VLD2d16wb_fixed: case ARM::VLD2d16wb_register:
+  case ARM::VLD2d32wb_fixed: case ARM::VLD2d32wb_register:
+  case ARM::VLD2d8wb_fixed: case ARM::VLD2d8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2b16:
+  case ARM::VLD2b32:
+  case ARM::VLD2b8:
+  case ARM::VLD2b16wb_fixed:
+  case ARM::VLD2b16wb_register:
+  case ARM::VLD2b32wb_fixed:
+  case ARM::VLD2b32wb_register:
+  case ARM::VLD2b8wb_fixed:
+  case ARM::VLD2b8wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Second output register. The "d" forms use consecutive registers (Rd+1);
+  // the "q" forms use every other register (Rd+2). Indices wrap modulo 32.
+  switch (Inst.getOpcode()) {
+  case ARM::VLD3d8:
+  case ARM::VLD3d16:
+  case ARM::VLD3d32:
+  case ARM::VLD3d8_UPD:
+  case ARM::VLD3d16_UPD:
+  case ARM::VLD3d32_UPD:
+  case ARM::VLD4d8:
+  case ARM::VLD4d16:
+  case ARM::VLD4d32:
+  case ARM::VLD4d8_UPD:
+  case ARM::VLD4d16_UPD:
+  case ARM::VLD4d32_UPD:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD3q8:
+  case ARM::VLD3q16:
+  case ARM::VLD3q32:
+  case ARM::VLD3q8_UPD:
+  case ARM::VLD3q16_UPD:
+  case ARM::VLD3q32_UPD:
+  case ARM::VLD4q8:
+  case ARM::VLD4q16:
+  case ARM::VLD4q32:
+  case ARM::VLD4q8_UPD:
+  case ARM::VLD4q16_UPD:
+  case ARM::VLD4q32_UPD:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+      return MCDisassembler::Fail;
+    // Explicit break: previously this case fell through into default.
+    break;
+  default:
+    break;
+  }
+
+  // Third output register
+  switch(Inst.getOpcode()) {
+  case ARM::VLD3d8:
+  case ARM::VLD3d16:
+  case ARM::VLD3d32:
+  case ARM::VLD3d8_UPD:
+  case ARM::VLD3d16_UPD:
+  case ARM::VLD3d32_UPD:
+  case ARM::VLD4d8:
+  case ARM::VLD4d16:
+  case ARM::VLD4d32:
+  case ARM::VLD4d8_UPD:
+  case ARM::VLD4d16_UPD:
+  case ARM::VLD4d32_UPD:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD3q8:
+  case ARM::VLD3q16:
+  case ARM::VLD3q32:
+  case ARM::VLD3q8_UPD:
+  case ARM::VLD3q16_UPD:
+  case ARM::VLD3q32_UPD:
+  case ARM::VLD4q8:
+  case ARM::VLD4q16:
+  case ARM::VLD4q32:
+  case ARM::VLD4q8_UPD:
+  case ARM::VLD4q16_UPD:
+  case ARM::VLD4q32_UPD:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    break;
+  }
+
+  // Fourth output register
+  switch (Inst.getOpcode()) {
+  case ARM::VLD4d8:
+  case ARM::VLD4d16:
+  case ARM::VLD4d32:
+  case ARM::VLD4d8_UPD:
+  case ARM::VLD4d16_UPD:
+  case ARM::VLD4d32_UPD:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD4q8:
+  case ARM::VLD4q16:
+  case ARM::VLD4q32:
+  case ARM::VLD4q8_UPD:
+  case ARM::VLD4q16_UPD:
+  case ARM::VLD4q32_UPD:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    break;
+  }
+
+  // Writeback operand. The *wb_fixed / *wb_register opcodes take an immediate
+  // 0 here; the *_UPD opcodes take the base register itself.
+  switch (Inst.getOpcode()) {
+  case ARM::VLD1d8wb_fixed:
+  case ARM::VLD1d16wb_fixed:
+  case ARM::VLD1d32wb_fixed:
+  case ARM::VLD1d64wb_fixed:
+  case ARM::VLD1d8wb_register:
+  case ARM::VLD1d16wb_register:
+  case ARM::VLD1d32wb_register:
+  case ARM::VLD1d64wb_register:
+  case ARM::VLD1q8wb_fixed:
+  case ARM::VLD1q16wb_fixed:
+  case ARM::VLD1q32wb_fixed:
+  case ARM::VLD1q64wb_fixed:
+  case ARM::VLD1q8wb_register:
+  case ARM::VLD1q16wb_register:
+  case ARM::VLD1q32wb_register:
+  case ARM::VLD1q64wb_register:
+  case ARM::VLD1d8Twb_fixed:
+  case ARM::VLD1d8Twb_register:
+  case ARM::VLD1d16Twb_fixed:
+  case ARM::VLD1d16Twb_register:
+  case ARM::VLD1d32Twb_fixed:
+  case ARM::VLD1d32Twb_register:
+  case ARM::VLD1d64Twb_fixed:
+  case ARM::VLD1d64Twb_register:
+  case ARM::VLD1d8Qwb_fixed:
+  case ARM::VLD1d8Qwb_register:
+  case ARM::VLD1d16Qwb_fixed:
+  case ARM::VLD1d16Qwb_register:
+  case ARM::VLD1d32Qwb_fixed:
+  case ARM::VLD1d32Qwb_register:
+  case ARM::VLD1d64Qwb_fixed:
+  case ARM::VLD1d64Qwb_register:
+  case ARM::VLD2d8wb_fixed:
+  case ARM::VLD2d16wb_fixed:
+  case ARM::VLD2d32wb_fixed:
+  case ARM::VLD2q8wb_fixed:
+  case ARM::VLD2q16wb_fixed:
+  case ARM::VLD2q32wb_fixed:
+  case ARM::VLD2d8wb_register:
+  case ARM::VLD2d16wb_register:
+  case ARM::VLD2d32wb_register:
+  case ARM::VLD2q8wb_register:
+  case ARM::VLD2q16wb_register:
+  case ARM::VLD2q32wb_register:
+  case ARM::VLD2b8wb_fixed:
+  case ARM::VLD2b16wb_fixed:
+  case ARM::VLD2b32wb_fixed:
+  case ARM::VLD2b8wb_register:
+  case ARM::VLD2b16wb_register:
+  case ARM::VLD2b32wb_register:
+    Inst.addOperand(MCOperand::createImm(0));
+    break;
+  case ARM::VLD3d8_UPD:
+  case ARM::VLD3d16_UPD:
+  case ARM::VLD3d32_UPD:
+  case ARM::VLD3q8_UPD:
+  case ARM::VLD3q16_UPD:
+  case ARM::VLD3q32_UPD:
+  case ARM::VLD4d8_UPD:
+  case ARM::VLD4d16_UPD:
+  case ARM::VLD4d32_UPD:
+  case ARM::VLD4q8_UPD:
+  case ARM::VLD4q16_UPD:
+  case ARM::VLD4q32_UPD:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    break;
+  }
+
+  // AddrMode6 Base (register+alignment)
+  if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // AddrMode6 Offset (register)
+  switch (Inst.getOpcode()) {
+  default:
+    // The below have been updated to have explicit am6offset split
+    // between fixed and register offset. For those instructions not
+    // yet updated, we need to add an additional reg0 operand for the
+    // fixed variant.
+    //
+    // The fixed offset encodes as Rm == 0xd, so we check for that.
+    if (Rm == 0xd) {
+      Inst.addOperand(MCOperand::createReg(0));
+      break;
+    }
+    // Fall through to handle the register offset variant.
+    LLVM_FALLTHROUGH;
+  case ARM::VLD1d8wb_fixed:
+  case ARM::VLD1d16wb_fixed:
+  case ARM::VLD1d32wb_fixed:
+  case ARM::VLD1d64wb_fixed:
+  case ARM::VLD1d8Twb_fixed:
+  case ARM::VLD1d16Twb_fixed:
+  case ARM::VLD1d32Twb_fixed:
+  case ARM::VLD1d64Twb_fixed:
+  case ARM::VLD1d8Qwb_fixed:
+  case ARM::VLD1d16Qwb_fixed:
+  case ARM::VLD1d32Qwb_fixed:
+  case ARM::VLD1d64Qwb_fixed:
+  case ARM::VLD1d8wb_register:
+  case ARM::VLD1d16wb_register:
+  case ARM::VLD1d32wb_register:
+  case ARM::VLD1d64wb_register:
+  case ARM::VLD1q8wb_fixed:
+  case ARM::VLD1q16wb_fixed:
+  case ARM::VLD1q32wb_fixed:
+  case ARM::VLD1q64wb_fixed:
+  case ARM::VLD1q8wb_register:
+  case ARM::VLD1q16wb_register:
+  case ARM::VLD1q32wb_register:
+  case ARM::VLD1q64wb_register:
+    // The fixed offset post-increment encodes Rm == 0xd. The no-writeback
+    // variant encodes Rm == 0xf. Anything else is a register offset post-
+    // increment and we need to add the register operand to the instruction.
+    if (Rm != 0xD && Rm != 0xF &&
+        !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2d8wb_fixed:
+  case ARM::VLD2d16wb_fixed:
+  case ARM::VLD2d32wb_fixed:
+  case ARM::VLD2b8wb_fixed:
+  case ARM::VLD2b16wb_fixed:
+  case ARM::VLD2b32wb_fixed:
+  case ARM::VLD2q8wb_fixed:
+  case ARM::VLD2q16wb_fixed:
+  case ARM::VLD2q32wb_fixed:
+    // These fixed-offset forms take no offset operand at all.
+    break;
+  }
+
+  return S;
+}
+
+/// Validate the type (Insn[11:8]) and alignment (Insn[5:4]) fields of a
+/// VLD1/VST1 encoding, then dispatch to the load or store decoder according
+/// to Insn[21].
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address, const void *Decoder) {
+  const unsigned TypeField = fieldFromInstruction(Insn, 8, 4);
+  const unsigned AlignField = fieldFromInstruction(Insn, 4, 2);
+
+  // Types 6 and 7 reject alignments with bit 1 set; type 10 rejects 3.
+  const bool BadAlign =
+      ((TypeField == 6 || TypeField == 7) && (AlignField & 2)) ||
+      (TypeField == 10 && AlignField == 3);
+  if (BadAlign)
+    return MCDisassembler::Fail;
+
+  if (fieldFromInstruction(Insn, 21, 1))
+    return DecodeVLDInstruction(Inst, Insn, Address, Decoder);
+  return DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+/// Validate the size (Insn[7:6]), type (Insn[11:8]) and alignment (Insn[5:4])
+/// fields of a VLD2/VST2 encoding, then dispatch to the load or store decoder
+/// according to Insn[21].
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address, const void *Decoder) {
+  // A size field of 3 is rejected.
+  if (fieldFromInstruction(Insn, 6, 2) == 3)
+    return MCDisassembler::Fail;
+
+  const unsigned TypeField = fieldFromInstruction(Insn, 8, 4);
+  const unsigned AlignField = fieldFromInstruction(Insn, 4, 2);
+  // Types 8 and 9 reject an alignment of 3.
+  if ((TypeField == 8 || TypeField == 9) && AlignField == 3)
+    return MCDisassembler::Fail;
+
+  if (fieldFromInstruction(Insn, 21, 1))
+    return DecodeVLDInstruction(Inst, Insn, Address, Decoder);
+  return DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+/// Validate the size (Insn[7:6]) and alignment (Insn[5:4]) fields of a
+/// VLD3/VST3 encoding, then dispatch to the load or store decoder according
+/// to Insn[21].
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address, const void *Decoder) {
+  // A size field of 3 is rejected, as is any alignment with bit 1 set.
+  const bool BadSize = fieldFromInstruction(Insn, 6, 2) == 3;
+  if (BadSize)
+    return MCDisassembler::Fail;
+  const bool BadAlign = (fieldFromInstruction(Insn, 4, 2) & 2) != 0;
+  if (BadAlign)
+    return MCDisassembler::Fail;
+
+  if (fieldFromInstruction(Insn, 21, 1))
+    return DecodeVLDInstruction(Inst, Insn, Address, Decoder);
+  return DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+/// Validate the size field (Insn[7:6]) of a VLD4/VST4 encoding, then dispatch
+/// to the load or store decoder according to Insn[21].
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address, const void *Decoder) {
+  // A size field of 3 is rejected.
+  if (fieldFromInstruction(Insn, 6, 2) == 3)
+    return MCDisassembler::Fail;
+
+  const bool IsLoad = fieldFromInstruction(Insn, 21, 1) != 0;
+  if (IsLoad)
+    return DecodeVLDInstruction(Inst, Insn, Address, Decoder);
+  return DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+ Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+ unsigned wb = fieldFromInstruction(Insn, 16, 4);
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ Rn |= fieldFromInstruction(Insn, 4, 2) << 4;
+ unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+ // Writeback Operand
+ switch (Inst.getOpcode()) {
+ case ARM::VST1d8wb_fixed:
+ case ARM::VST1d16wb_fixed:
+ case ARM::VST1d32wb_fixed:
+ case ARM::VST1d64wb_fixed:
+ case ARM::VST1d8wb_register:
+ case ARM::VST1d16wb_register:
+ case ARM::VST1d32wb_register:
+ case ARM::VST1d64wb_register:
+ case ARM::VST1q8wb_fixed:
+ case ARM::VST1q16wb_fixed:
+ case ARM::VST1q32wb_fixed:
+ case ARM::VST1q64wb_fixed:
+ case ARM::VST1q8wb_register:
+ case ARM::VST1q16wb_register:
+ case ARM::VST1q32wb_register:
+ case ARM::VST1q64wb_register:
+ case ARM::VST1d8Twb_fixed:
+ case ARM::VST1d16Twb_fixed:
+ case ARM::VST1d32Twb_fixed:
+ case ARM::VST1d64Twb_fixed:
+ case ARM::VST1d8Twb_register:
+ case ARM::VST1d16Twb_register:
+ case ARM::VST1d32Twb_register:
+ case ARM::VST1d64Twb_register:
+ case ARM::VST1d8Qwb_fixed:
+ case ARM::VST1d16Qwb_fixed:
+ case ARM::VST1d32Qwb_fixed:
+ case ARM::VST1d64Qwb_fixed:
+ case ARM::VST1d8Qwb_register:
+ case ARM::VST1d16Qwb_register:
+ case ARM::VST1d32Qwb_register:
+ case ARM::VST1d64Qwb_register:
+ case ARM::VST2d8wb_fixed:
+ case ARM::VST2d16wb_fixed:
+ case ARM::VST2d32wb_fixed:
+ case ARM::VST2d8wb_register:
+ case ARM::VST2d16wb_register:
+ case ARM::VST2d32wb_register:
+ case ARM::VST2q8wb_fixed:
+ case ARM::VST2q16wb_fixed:
+ case ARM::VST2q32wb_fixed:
+ case ARM::VST2q8wb_register:
+ case ARM::VST2q16wb_register:
+ case ARM::VST2q32wb_register:
+ case ARM::VST2b8wb_fixed:
+ case ARM::VST2b16wb_fixed:
+ case ARM::VST2b32wb_fixed:
+ case ARM::VST2b8wb_register:
+ case ARM::VST2b16wb_register:
+ case ARM::VST2b32wb_register:
+ if (Rm == 0xF)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(0));
+ break;
+ case ARM::VST3d8_UPD:
+ case ARM::VST3d16_UPD:
+ case ARM::VST3d32_UPD:
+ case ARM::VST3q8_UPD:
+ case ARM::VST3q16_UPD:
+ case ARM::VST3q32_UPD:
+ case ARM::VST4d8_UPD:
+ case ARM::VST4d16_UPD:
+ case ARM::VST4d32_UPD:
+ case ARM::VST4q8_UPD:
+ case ARM::VST4q16_UPD:
+ case ARM::VST4q32_UPD:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // AddrMode6 Base (register+alignment)
+ if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // AddrMode6 Offset (register)
+ switch (Inst.getOpcode()) {
+ default:
+ if (Rm == 0xD)
+ Inst.addOperand(MCOperand::createReg(0));
+ else if (Rm != 0xF) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ break;
+ case ARM::VST1d8wb_fixed:
+ case ARM::VST1d16wb_fixed:
+ case ARM::VST1d32wb_fixed:
+ case ARM::VST1d64wb_fixed:
+ case ARM::VST1q8wb_fixed:
+ case ARM::VST1q16wb_fixed:
+ case ARM::VST1q32wb_fixed:
+ case ARM::VST1q64wb_fixed:
+ case ARM::VST1d8Twb_fixed:
+ case ARM::VST1d16Twb_fixed:
+ case ARM::VST1d32Twb_fixed:
+ case ARM::VST1d64Twb_fixed:
+ case ARM::VST1d8Qwb_fixed:
+ case ARM::VST1d16Qwb_fixed:
+ case ARM::VST1d32Qwb_fixed:
+ case ARM::VST1d64Qwb_fixed:
+ case ARM::VST2d8wb_fixed:
+ case ARM::VST2d16wb_fixed:
+ case ARM::VST2d32wb_fixed:
+ case ARM::VST2q8wb_fixed:
+ case ARM::VST2q16wb_fixed:
+ case ARM::VST2q32wb_fixed:
+ case ARM::VST2b8wb_fixed:
+ case ARM::VST2b16wb_fixed:
+ case ARM::VST2b32wb_fixed:
+ break;
+ }
+
+
+ // First input register
+ switch (Inst.getOpcode()) {
+ case ARM::VST1q16:
+ case ARM::VST1q32:
+ case ARM::VST1q64:
+ case ARM::VST1q8:
+ case ARM::VST1q16wb_fixed:
+ case ARM::VST1q16wb_register:
+ case ARM::VST1q32wb_fixed:
+ case ARM::VST1q32wb_register:
+ case ARM::VST1q64wb_fixed:
+ case ARM::VST1q64wb_register:
+ case ARM::VST1q8wb_fixed:
+ case ARM::VST1q8wb_register:
+ case ARM::VST2d16:
+ case ARM::VST2d32:
+ case ARM::VST2d8:
+ case ARM::VST2d16wb_fixed:
+ case ARM::VST2d16wb_register:
+ case ARM::VST2d32wb_fixed:
+ case ARM::VST2d32wb_register:
+ case ARM::VST2d8wb_fixed:
+ case ARM::VST2d8wb_register:
+ if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VST2b16:
+ case ARM::VST2b32:
+ case ARM::VST2b8:
+ case ARM::VST2b16wb_fixed:
+ case ARM::VST2b16wb_register:
+ case ARM::VST2b32wb_fixed:
+ case ARM::VST2b32wb_register:
+ case ARM::VST2b8wb_fixed:
+ case ARM::VST2b8wb_register:
+ if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ // Second input register
+ switch (Inst.getOpcode()) {
+ case ARM::VST3d8:
+ case ARM::VST3d16:
+ case ARM::VST3d32:
+ case ARM::VST3d8_UPD:
+ case ARM::VST3d16_UPD:
+ case ARM::VST3d32_UPD:
+ case ARM::VST4d8:
+ case ARM::VST4d16:
+ case ARM::VST4d32:
+ case ARM::VST4d8_UPD:
+ case ARM::VST4d16_UPD:
+ case ARM::VST4d32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VST3q8:
+ case ARM::VST3q16:
+ case ARM::VST3q32:
+ case ARM::VST3q8_UPD:
+ case ARM::VST3q16_UPD:
+ case ARM::VST3q32_UPD:
+ case ARM::VST4q8:
+ case ARM::VST4q16:
+ case ARM::VST4q32:
+ case ARM::VST4q8_UPD:
+ case ARM::VST4q16_UPD:
+ case ARM::VST4q32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // Third input register
+ switch (Inst.getOpcode()) {
+ case ARM::VST3d8:
+ case ARM::VST3d16:
+ case ARM::VST3d32:
+ case ARM::VST3d8_UPD:
+ case ARM::VST3d16_UPD:
+ case ARM::VST3d32_UPD:
+ case ARM::VST4d8:
+ case ARM::VST4d16:
+ case ARM::VST4d32:
+ case ARM::VST4d8_UPD:
+ case ARM::VST4d16_UPD:
+ case ARM::VST4d32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VST3q8:
+ case ARM::VST3q16:
+ case ARM::VST3q32:
+ case ARM::VST3q8_UPD:
+ case ARM::VST3q16_UPD:
+ case ARM::VST3q32_UPD:
+ case ARM::VST4q8:
+ case ARM::VST4q16:
+ case ARM::VST4q32:
+ case ARM::VST4q8_UPD:
+ case ARM::VST4q16_UPD:
+ case ARM::VST4q32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // Fourth input register
+ switch (Inst.getOpcode()) {
+ case ARM::VST4d8:
+ case ARM::VST4d16:
+ case ARM::VST4d32:
+ case ARM::VST4d8_UPD:
+ case ARM::VST4d16_UPD:
+ case ARM::VST4d32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VST4q8:
+ case ARM::VST4q16:
+ case ARM::VST4q32:
+ case ARM::VST4q8_UPD:
+ case ARM::VST4q16_UPD:
+ case ARM::VST4q32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ return S;
+}
+
+ // Decoder for the VLD1DUP* opcodes. Operands are appended to Inst in this
+ // order: the destination (a D-register pair for the "q" opcodes listed below,
+ // a single D register otherwise), the base register once when Rm != 0xF, the
+ // base register, the alignment immediate, and the offset register when Rm is
+ // neither 0xD nor 0xF.
+ static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   // The destination index is Insn[15:12] with Insn[22] as its fifth bit.
+   const unsigned DestReg = fieldFromInstruction(Insn, 12, 4) |
+                            (fieldFromInstruction(Insn, 22, 1) << 4);
+   const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+   const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+   const unsigned AlignBit = fieldFromInstruction(Insn, 4, 1);
+   const unsigned SizeField = fieldFromInstruction(Insn, 6, 2);
+
+   // A set align bit together with a size field of 0 is rejected.
+   if (AlignBit == 1 && SizeField == 0)
+     return MCDisassembler::Fail;
+   const unsigned Alignment = AlignBit * (1 << SizeField);
+
+   bool WantsDPair;
+   switch (Inst.getOpcode()) {
+   case ARM::VLD1DUPq16:
+   case ARM::VLD1DUPq32:
+   case ARM::VLD1DUPq8:
+   case ARM::VLD1DUPq16wb_fixed:
+   case ARM::VLD1DUPq16wb_register:
+   case ARM::VLD1DUPq32wb_fixed:
+   case ARM::VLD1DUPq32wb_register:
+   case ARM::VLD1DUPq8wb_fixed:
+   case ARM::VLD1DUPq8wb_register:
+     WantsDPair = true;
+     break;
+   default:
+     WantsDPair = false;
+     break;
+   }
+
+   const DecodeStatus DestStatus =
+       WantsDPair ? DecodeDPairRegisterClass(Inst, DestReg, Address, Decoder)
+                  : DecodeDPRRegisterClass(Inst, DestReg, Address, Decoder);
+   if (!Check(Status, DestStatus))
+     return MCDisassembler::Fail;
+
+   // Every form except Rm == 0xF carries the base register a second time.
+   if (OffsetReg != 0xF &&
+       !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+   Inst.addOperand(MCOperand::createImm(Alignment));
+
+   // Rm == 0xd is the fixed-offset post-increment and Rm == 0xf the
+   // no-writeback variant; only the remaining encodings name a real offset
+   // register that has to be added as an operand.
+   if (OffsetReg != 0xD && OffsetReg != 0xF) {
+     if (!Check(Status,
+                DecodeGPRRegisterClass(Inst, OffsetReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+   }
+
+   return Status;
+ }
+
+ // Decoder for the VLD2DUP* opcodes. The destination is decoded as a D-register
+ // pair, a spaced D-register pair ("x2" opcodes) or a single D register
+ // depending on the opcode. After it come: an immediate 0 when Rm != 0xF, the
+ // base register, the alignment immediate, and the offset register when Rm is
+ // neither 0xD nor 0xF.
+ static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned DestReg = fieldFromInstruction(Insn, 12, 4) |
+                            (fieldFromInstruction(Insn, 22, 1) << 4);
+   const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+   const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+   const unsigned AlignBit = fieldFromInstruction(Insn, 4, 1);
+   const unsigned ElemBytes = 1 << fieldFromInstruction(Insn, 6, 2);
+   // Alignment is either 0 or twice the element size.
+   const unsigned Alignment = AlignBit * (2 * ElemBytes);
+
+   switch (Inst.getOpcode()) {
+   case ARM::VLD2DUPd16:
+   case ARM::VLD2DUPd32:
+   case ARM::VLD2DUPd8:
+   case ARM::VLD2DUPd16wb_fixed:
+   case ARM::VLD2DUPd16wb_register:
+   case ARM::VLD2DUPd32wb_fixed:
+   case ARM::VLD2DUPd32wb_register:
+   case ARM::VLD2DUPd8wb_fixed:
+   case ARM::VLD2DUPd8wb_register:
+     if (!Check(Status,
+                DecodeDPairRegisterClass(Inst, DestReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   case ARM::VLD2DUPd16x2:
+   case ARM::VLD2DUPd32x2:
+   case ARM::VLD2DUPd8x2:
+   case ARM::VLD2DUPd16x2wb_fixed:
+   case ARM::VLD2DUPd16x2wb_register:
+   case ARM::VLD2DUPd32x2wb_fixed:
+   case ARM::VLD2DUPd32x2wb_register:
+   case ARM::VLD2DUPd8x2wb_fixed:
+   case ARM::VLD2DUPd8x2wb_register:
+     if (!Check(Status,
+                DecodeDPairSpacedRegisterClass(Inst, DestReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   default:
+     if (!Check(Status, DecodeDPRRegisterClass(Inst, DestReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   }
+
+   // NOTE(review): presumably a placeholder in the write-back operand slot;
+   // confirm against the instruction definitions.
+   const bool HasExtraOperand = OffsetReg != 0xF;
+   if (HasExtraOperand)
+     Inst.addOperand(MCOperand::createImm(0));
+
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+   Inst.addOperand(MCOperand::createImm(Alignment));
+
+   // Only encodings other than 0xD and 0xF carry a real offset register.
+   if (OffsetReg != 0xD && OffsetReg != 0xF &&
+       !Check(Status, DecodeGPRRegisterClass(Inst, OffsetReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   return Status;
+ }
+
+ // Decoder for VLD3 "dup" instructions. Three D registers are emitted, spaced
+ // by 1 or 2 according to Insn[5] and wrapping modulo 32. They are followed by
+ // the base register once when Rm != 0xF, the base register, an alignment
+ // immediate of 0, and finally register 0 (Rm == 0xD) or the Rm register (any
+ // Rm other than 0xD and 0xF).
+ static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned FirstReg = fieldFromInstruction(Insn, 12, 4) |
+                             (fieldFromInstruction(Insn, 22, 1) << 4);
+   const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+   const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+   const unsigned Stride = fieldFromInstruction(Insn, 5, 1) + 1;
+
+   if (!Check(Status, DecodeDPRRegisterClass(Inst, FirstReg, Address, Decoder)) ||
+       !Check(Status, DecodeDPRRegisterClass(Inst, (FirstReg + Stride) % 32,
+                                             Address, Decoder)) ||
+       !Check(Status, DecodeDPRRegisterClass(Inst, (FirstReg + 2 * Stride) % 32,
+                                             Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   // All encodings except Rm == 0xF carry the base register a second time.
+   if (OffsetReg != 0xF &&
+       !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+   Inst.addOperand(MCOperand::createImm(0));
+
+   switch (OffsetReg) {
+   case 0xD:
+     Inst.addOperand(MCOperand::createReg(0));
+     break;
+   case 0xF:
+     break;
+   default:
+     if (!Check(Status,
+                DecodeGPRRegisterClass(Inst, OffsetReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   }
+
+   return Status;
+ }
+
+ // Decoder for VLD4 "dup" instructions. Four D registers are emitted, spaced by
+ // 1 or 2 according to Insn[5] and wrapping modulo 32. They are followed by the
+ // base register once when Rm != 0xF, the base register, the alignment
+ // immediate, and finally register 0 (Rm == 0xD) or the Rm register (any Rm
+ // other than 0xD and 0xF).
+ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned FirstReg = fieldFromInstruction(Insn, 12, 4) |
+                             (fieldFromInstruction(Insn, 22, 1) << 4);
+   const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+   const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+   const unsigned SizeField = fieldFromInstruction(Insn, 6, 2);
+   const unsigned Stride = fieldFromInstruction(Insn, 5, 1) + 1;
+   const unsigned AlignBit = fieldFromInstruction(Insn, 4, 1);
+
+   // Alignment: size 3 demands the align bit and yields 16; size 2 yields
+   // 0 or 8; sizes 0 and 1 yield 0 or 4 * (1 << size).
+   unsigned Alignment;
+   if (SizeField == 0x3) {
+     if (AlignBit == 0)
+       return MCDisassembler::Fail;
+     Alignment = 16;
+   } else if (SizeField == 2) {
+     Alignment = AlignBit * 8;
+   } else {
+     Alignment = AlignBit * 4 * (1 << SizeField);
+   }
+
+   if (!Check(Status, DecodeDPRRegisterClass(Inst, FirstReg, Address, Decoder)) ||
+       !Check(Status, DecodeDPRRegisterClass(Inst, (FirstReg + Stride) % 32,
+                                             Address, Decoder)) ||
+       !Check(Status, DecodeDPRRegisterClass(Inst, (FirstReg + 2 * Stride) % 32,
+                                             Address, Decoder)) ||
+       !Check(Status, DecodeDPRRegisterClass(Inst, (FirstReg + 3 * Stride) % 32,
+                                             Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   // All encodings except Rm == 0xF carry the base register a second time.
+   if (OffsetReg != 0xF &&
+       !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+   Inst.addOperand(MCOperand::createImm(Alignment));
+
+   switch (OffsetReg) {
+   case 0xD:
+     Inst.addOperand(MCOperand::createReg(0));
+     break;
+   case 0xF:
+     break;
+   default:
+     if (!Check(Status,
+                DecodeGPRRegisterClass(Inst, OffsetReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   }
+
+   return Status;
+ }
+
+ // Decoder for NEON modified-immediate instructions. Emits the destination
+ // register (Q register when Insn[6] is set, D register otherwise), then the
+ // 13-bit immediate assembled from the scattered encoding fields. For the
+ // VORR/VBIC immediate opcodes the destination is emitted once more after it.
+ static DecodeStatus
+ DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned DestReg = fieldFromInstruction(Insn, 12, 4) |
+                            (fieldFromInstruction(Insn, 22, 1) << 4);
+   // Immediate layout: [3:0]=Insn[3:0], [6:4]=Insn[18:16], [7]=Insn[24],
+   // [11:8]=Insn[11:8], [12]=Insn[5].
+   const unsigned Immediate = fieldFromInstruction(Insn, 0, 4) |
+                              (fieldFromInstruction(Insn, 16, 3) << 4) |
+                              (fieldFromInstruction(Insn, 24, 1) << 7) |
+                              (fieldFromInstruction(Insn, 8, 4) << 8) |
+                              (fieldFromInstruction(Insn, 5, 1) << 12);
+   const bool IsQuad = fieldFromInstruction(Insn, 6, 1) != 0;
+
+   const DecodeStatus DestStatus =
+       IsQuad ? DecodeQPRRegisterClass(Inst, DestReg, Address, Decoder)
+              : DecodeDPRRegisterClass(Inst, DestReg, Address, Decoder);
+   if (!Check(Status, DestStatus))
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createImm(Immediate));
+
+   // VORR/VBIC with an immediate take the destination as an extra operand.
+   switch (Inst.getOpcode()) {
+   case ARM::VORRiv4i16: case ARM::VORRiv2i32:
+   case ARM::VBICiv4i16: case ARM::VBICiv2i32:
+     if (!Check(Status, DecodeDPRRegisterClass(Inst, DestReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   case ARM::VORRiv8i16: case ARM::VORRiv4i32:
+   case ARM::VBICiv8i16: case ARM::VBICiv4i32:
+     if (!Check(Status, DecodeQPRRegisterClass(Inst, DestReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   default:
+     break;
+   }
+
+   return Status;
+ }
+
+ // Emits a Q destination register, a D source register and the immediate
+ // (8 << size), where size is Insn[19:18].
+ static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned DestReg = fieldFromInstruction(Insn, 12, 4) |
+                            (fieldFromInstruction(Insn, 22, 1) << 4);
+   const unsigned SrcReg = fieldFromInstruction(Insn, 0, 4) |
+                           (fieldFromInstruction(Insn, 5, 1) << 4);
+   const unsigned SizeField = fieldFromInstruction(Insn, 18, 2);
+
+   if (!Check(Status, DecodeQPRRegisterClass(Inst, DestReg, Address, Decoder)) ||
+       !Check(Status, DecodeDPRRegisterClass(Inst, SrcReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createImm(8 << SizeField));
+   return Status;
+ }
+
+ // Appends the immediate (8 - Val) to Inst; always succeeds.
+ static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+   const unsigned ShiftAmount = 8 - Val;
+   Inst.addOperand(MCOperand::createImm(ShiftAmount));
+   return MCDisassembler::Success;
+ }
+
+ // Appends the immediate (16 - Val) to Inst; always succeeds.
+ static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+   const unsigned ShiftAmount = 16 - Val;
+   Inst.addOperand(MCOperand::createImm(ShiftAmount));
+   return MCDisassembler::Success;
+ }
+
+ // Appends the immediate (32 - Val) to Inst; always succeeds.
+ static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+   const unsigned ShiftAmount = 32 - Val;
+   Inst.addOperand(MCOperand::createImm(ShiftAmount));
+   return MCDisassembler::Success;
+ }
+
+ // Appends the immediate (64 - Val) to Inst; always succeeds.
+ static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+   const unsigned ShiftAmount = 64 - Val;
+   Inst.addOperand(MCOperand::createImm(ShiftAmount));
+   return MCDisassembler::Success;
+ }
+
+ // Decoder for the NEON table lookup instructions (VTBL/VTBX). Emits the
+ // destination D register, the destination again when Insn[6] is set, the table
+ // register (a D pair for VTBL2/VTBX2, a single D register otherwise) and the
+ // index D register.
+ static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned DestReg = fieldFromInstruction(Insn, 12, 4) |
+                            (fieldFromInstruction(Insn, 22, 1) << 4);
+   const unsigned TableReg = fieldFromInstruction(Insn, 16, 4) |
+                             (fieldFromInstruction(Insn, 7, 1) << 4);
+   const unsigned IndexReg = fieldFromInstruction(Insn, 0, 4) |
+                             (fieldFromInstruction(Insn, 5, 1) << 4);
+   const bool HasTiedDest = fieldFromInstruction(Insn, 6, 1) != 0;
+
+   if (!Check(Status, DecodeDPRRegisterClass(Inst, DestReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+   // Writeback
+   if (HasTiedDest &&
+       !Check(Status, DecodeDPRRegisterClass(Inst, DestReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   const unsigned Opc = Inst.getOpcode();
+   const bool UsesPairTable = Opc == ARM::VTBL2 || Opc == ARM::VTBX2;
+   const DecodeStatus TableStatus =
+       UsesPairTable ? DecodeDPairRegisterClass(Inst, TableReg, Address, Decoder)
+                     : DecodeDPRRegisterClass(Inst, TableReg, Address, Decoder);
+   if (!Check(Status, TableStatus))
+     return MCDisassembler::Fail;
+
+   if (!Check(Status, DecodeDPRRegisterClass(Inst, IndexReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   return Status;
+ }
+
+ // Decoder shared by tADR and tADDrSPi: emits the low destination register
+ // (Insn[10:8]), an explicit SP operand for tADDrSPi only, and the 8-bit
+ // immediate. Any other opcode fails.
+ static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
+                                      uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned DestReg = fieldFromInstruction(Insn, 8, 3);
+   const unsigned Immediate = fieldFromInstruction(Insn, 0, 8);
+
+   if (!Check(Status, DecodetGPRRegisterClass(Inst, DestReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   const unsigned Opc = Inst.getOpcode();
+   if (Opc == ARM::tADDrSPi) {
+     Inst.addOperand(MCOperand::createReg(ARM::SP));
+   } else if (Opc != ARM::tADR) {
+     // tADR does not explicitly represent the PC as an operand; anything
+     // other than tADR/tADDrSPi is not handled here.
+     return MCDisassembler::Fail;
+   }
+
+   Inst.addOperand(MCOperand::createImm(Immediate));
+   return Status;
+ }
+
+ // Branch target operand: (Val << 1) sign-extended from 12 bits. The symbolic
+ // form (target = Address + offset + 4) is tried first; the raw offset is added
+ // as an immediate only when that fails.
+ static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+   const int Offset = SignExtend32<12>(Val << 1);
+   const bool AddedSymbol = tryAddingSymbolicOperand(
+       Address, Address + Offset + 4, true, 2, Inst, Decoder);
+   if (!AddedSymbol)
+     Inst.addOperand(MCOperand::createImm(Offset));
+   return MCDisassembler::Success;
+ }
+
+ // Branch target operand: Val sign-extended from 21 bits. The symbolic form
+ // (target = Address + offset + 4) is tried first; the raw offset is added as
+ // an immediate only when that fails.
+ static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+   const int Offset = SignExtend32<21>(Val);
+   const bool AddedSymbol = tryAddingSymbolicOperand(
+       Address, Address + Offset + 4, true, 4, Inst, Decoder);
+   if (!AddedSymbol)
+     Inst.addOperand(MCOperand::createImm(Offset));
+   return MCDisassembler::Success;
+ }
+
+ // Compare-and-branch target operand: the unsigned offset (Val << 1). The
+ // symbolic form (target = Address + offset + 4) is tried first; the raw offset
+ // is added as an immediate only when that fails.
+ static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+   const unsigned Offset = Val << 1;
+   const bool AddedSymbol = tryAddingSymbolicOperand(
+       Address, Address + Offset + 4, true, 2, Inst, Decoder);
+   if (!AddedSymbol)
+     Inst.addOperand(MCOperand::createImm(Offset));
+   return MCDisassembler::Success;
+ }
+
+ // Register+register address: emits the base (Val[2:0]) followed by the offset
+ // register (Val[5:3]), both decoded as tGPR.
+ static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned BaseReg = fieldFromInstruction(Val, 0, 3);
+   const unsigned OffsetReg = fieldFromInstruction(Val, 3, 3);
+
+   if (!Check(Status, DecodetGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+       !Check(Status, DecodetGPRRegisterClass(Inst, OffsetReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   return Status;
+ }
+
+ // Register+immediate address: emits the tGPR base (Val[2:0]) followed by the
+ // 5-bit immediate (Val[7:3]).
+ static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned BaseReg = fieldFromInstruction(Val, 0, 3);
+   const unsigned Offset = fieldFromInstruction(Val, 3, 5);
+
+   if (!Check(Status, DecodetGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createImm(Offset));
+   return Status;
+ }
+
+ // PC-relative address: emits (Val << 2) as an immediate and asks for a
+ // PC-load reference comment for the computed target
+ // (Address & ~2u) + imm + 4.
+ static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+   const unsigned Offset = Val << 2;
+   Inst.addOperand(MCOperand::createImm(Offset));
+
+   const uint64_t Target = (Address & ~2u) + Offset + 4;
+   tryAddingPcLoadReferenceComment(Address, Target, Decoder);
+
+   return MCDisassembler::Success;
+ }
+
+ // SP-relative address: emits an explicit SP register operand followed by Val
+ // as the immediate offset. Always succeeds.
+ static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+   const MCOperand Base = MCOperand::createReg(ARM::SP);
+   const MCOperand Offset = MCOperand::createImm(Val);
+   Inst.addOperand(Base);
+   Inst.addOperand(Offset);
+
+   return MCDisassembler::Success;
+ }
+
+ // Shifted-register address: Val packs Rn in bits [9:6], Rm in bits [5:2] and
+ // the shift immediate in bits [1:0]. Emits Rn (GPR), Rm (rGPR) and the
+ // immediate. The register-offset store opcodes reject Rn == 15.
+ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned BaseReg = fieldFromInstruction(Val, 6, 4);
+   const unsigned IndexReg = fieldFromInstruction(Val, 2, 4);
+   const unsigned ShiftImm = fieldFromInstruction(Val, 0, 2);
+
+   // Thumb stores cannot use PC as dest register.
+   const unsigned Opc = Inst.getOpcode();
+   const bool IsStore =
+       Opc == ARM::t2STRHs || Opc == ARM::t2STRBs || Opc == ARM::t2STRs;
+   if (IsStore && BaseReg == 15)
+     return MCDisassembler::Fail;
+
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+       !Check(Status, DecoderGPRRegisterClass(Inst, IndexReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createImm(ShiftImm));
+   return Status;
+ }
+
+ // Decodes the register-offset ("s") Thumb2 load and preload opcodes handled
+ // in the switches below.
+ //  - Rn == 15: the opcode is replaced by its "pci" counterpart and decoding
+ //    is delegated to DecodeT2LoadLabel.
+ //  - Rt == 15: t2LDRSHs is rejected, t2LDRHs becomes t2PLDWs and t2LDRSBs
+ //    becomes t2PLIs.
+ //  - The preload opcodes get no Rt operand. t2PLIs requires HasV7Ops and
+ //    t2PLDWs requires both HasV7Ops and FeatureMP.
+ // Returns Fail on any rejected encoding, otherwise the accumulated status.
+ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+
+ // Decoder is the MCDisassembler; its subtarget feature bits gate the preload
+ // opcodes below.
+ const FeatureBitset &featureBits =
+ ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+ bool hasMP = featureBits[ARM::FeatureMP];
+ bool hasV7Ops = featureBits[ARM::HasV7Ops];
+
+ // Base register 15: switch to the matching "pci" opcode and hand off.
+ if (Rn == 15) {
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDRBs:
+ Inst.setOpcode(ARM::t2LDRBpci);
+ break;
+ case ARM::t2LDRHs:
+ Inst.setOpcode(ARM::t2LDRHpci);
+ break;
+ case ARM::t2LDRSHs:
+ Inst.setOpcode(ARM::t2LDRSHpci);
+ break;
+ case ARM::t2LDRSBs:
+ Inst.setOpcode(ARM::t2LDRSBpci);
+ break;
+ case ARM::t2LDRs:
+ Inst.setOpcode(ARM::t2LDRpci);
+ break;
+ case ARM::t2PLDs:
+ Inst.setOpcode(ARM::t2PLDpci);
+ break;
+ case ARM::t2PLIs:
+ Inst.setOpcode(ARM::t2PLIpci);
+ break;
+ default:
+ return MCDisassembler::Fail;
+ }
+
+ return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+ }
+
+ // Target register 15: some load opcodes are re-mapped to preloads.
+ if (Rt == 15) {
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDRSHs:
+ return MCDisassembler::Fail;
+ case ARM::t2LDRHs:
+ Inst.setOpcode(ARM::t2PLDWs);
+ break;
+ case ARM::t2LDRSBs:
+ Inst.setOpcode(ARM::t2PLIs);
+ // No break here: control falls into default, which only breaks.
+ default:
+ break;
+ }
+ }
+
+ // Preload opcodes have no Rt operand; every other opcode emits Rt first.
+ switch (Inst.getOpcode()) {
+ case ARM::t2PLDs:
+ break;
+ case ARM::t2PLIs:
+ if (!hasV7Ops)
+ return MCDisassembler::Fail;
+ break;
+ case ARM::t2PLDWs:
+ if (!hasV7Ops || !hasMP)
+ return MCDisassembler::Fail;
+ break;
+ default:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ // Repack into the layout DecodeT2AddrModeSOReg unpacks: Insn[5:4] goes to
+ // bits [1:0], Insn[3:0] to bits [5:2] and Insn[19:16] to bits [9:6].
+ unsigned addrmode = fieldFromInstruction(Insn, 4, 2);
+ addrmode |= fieldFromInstruction(Insn, 0, 4) << 2;
+ addrmode |= fieldFromInstruction(Insn, 16, 4) << 6;
+ if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+ }
+
+ // Decodes the 8-bit-immediate ("i8") Thumb2 load and preload opcodes handled
+ // in the switches below.
+ //  - Rn == 15: the opcode is replaced by its "pci" counterpart and decoding
+ //    is delegated to DecodeT2LoadLabel.
+ //  - Rt == 15: t2LDRSHi8 is rejected, t2LDRHi8 becomes t2PLDWi8 only when
+ //    Insn bit 9 is clear, and t2LDRSBi8 becomes t2PLIi8.
+ //  - The preload opcodes get no Rt operand. t2PLIi8 requires HasV7Ops and
+ //    t2PLDWi8 requires both HasV7Ops and FeatureMP.
+ // Returns Fail on any rejected encoding, otherwise the accumulated status.
+ static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void* Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+ unsigned U = fieldFromInstruction(Insn, 9, 1);
+ // Pack imm8 | U << 8 | Rn << 9, the layout DecodeT2AddrModeImm8 unpacks.
+ unsigned imm = fieldFromInstruction(Insn, 0, 8);
+ imm |= (U << 8);
+ imm |= (Rn << 9);
+ // Same bit as U (Insn bit 9), read again under a second name.
+ unsigned add = fieldFromInstruction(Insn, 9, 1);
+
+ // Decoder is the MCDisassembler; its subtarget feature bits gate the preload
+ // opcodes below.
+ const FeatureBitset &featureBits =
+ ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+ bool hasMP = featureBits[ARM::FeatureMP];
+ bool hasV7Ops = featureBits[ARM::HasV7Ops];
+
+ // Base register 15: switch to the matching "pci" opcode and hand off.
+ if (Rn == 15) {
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDRi8:
+ Inst.setOpcode(ARM::t2LDRpci);
+ break;
+ case ARM::t2LDRBi8:
+ Inst.setOpcode(ARM::t2LDRBpci);
+ break;
+ case ARM::t2LDRSBi8:
+ Inst.setOpcode(ARM::t2LDRSBpci);
+ break;
+ case ARM::t2LDRHi8:
+ Inst.setOpcode(ARM::t2LDRHpci);
+ break;
+ case ARM::t2LDRSHi8:
+ Inst.setOpcode(ARM::t2LDRSHpci);
+ break;
+ case ARM::t2PLDi8:
+ Inst.setOpcode(ARM::t2PLDpci);
+ break;
+ case ARM::t2PLIi8:
+ Inst.setOpcode(ARM::t2PLIpci);
+ break;
+ default:
+ return MCDisassembler::Fail;
+ }
+ return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+ }
+
+ // Target register 15: some load opcodes are re-mapped to preloads.
+ if (Rt == 15) {
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDRSHi8:
+ return MCDisassembler::Fail;
+ case ARM::t2LDRHi8:
+ // Re-mapped only when bit 9 is clear; otherwise the opcode is kept.
+ if (!add)
+ Inst.setOpcode(ARM::t2PLDWi8);
+ break;
+ case ARM::t2LDRSBi8:
+ Inst.setOpcode(ARM::t2PLIi8);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Preload opcodes have no Rt operand; every other opcode emits Rt first.
+ switch (Inst.getOpcode()) {
+ case ARM::t2PLDi8:
+ break;
+ case ARM::t2PLIi8:
+ if (!hasV7Ops)
+ return MCDisassembler::Fail;
+ break;
+ case ARM::t2PLDWi8:
+ if (!hasV7Ops || !hasMP)
+ return MCDisassembler::Fail;
+ break;
+ default:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ return S;
+ }
+
+ // Decodes the 12-bit-immediate ("i12") Thumb2 load and preload opcodes
+ // handled in the switches below.
+ //  - Rn == 15: the opcode is replaced by its "pci" counterpart and decoding
+ //    is delegated to DecodeT2LoadLabel.
+ //  - Rt == 15: t2LDRSHi12 is rejected, t2LDRHi12 becomes t2PLDWi12 and
+ //    t2LDRSBi12 becomes t2PLIi12.
+ //  - The preload opcodes get no Rt operand. t2PLIi12 requires HasV7Ops and
+ //    t2PLDWi12 requires both HasV7Ops and FeatureMP.
+ // Returns Fail on any rejected encoding, otherwise the accumulated status.
+ static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void* Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+ // Pack imm12 in bits [11:0] and Rn in bits [16:13], the layout
+ // DecodeT2AddrModeImm12 unpacks. Bit 12 is left clear.
+ unsigned imm = fieldFromInstruction(Insn, 0, 12);
+ imm |= (Rn << 13);
+
+ // Decoder is the MCDisassembler; its subtarget feature bits gate the preload
+ // opcodes below.
+ const FeatureBitset &featureBits =
+ ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+ bool hasMP = featureBits[ARM::FeatureMP];
+ bool hasV7Ops = featureBits[ARM::HasV7Ops];
+
+ // Base register 15: switch to the matching "pci" opcode and hand off.
+ if (Rn == 15) {
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDRi12:
+ Inst.setOpcode(ARM::t2LDRpci);
+ break;
+ case ARM::t2LDRHi12:
+ Inst.setOpcode(ARM::t2LDRHpci);
+ break;
+ case ARM::t2LDRSHi12:
+ Inst.setOpcode(ARM::t2LDRSHpci);
+ break;
+ case ARM::t2LDRBi12:
+ Inst.setOpcode(ARM::t2LDRBpci);
+ break;
+ case ARM::t2LDRSBi12:
+ Inst.setOpcode(ARM::t2LDRSBpci);
+ break;
+ case ARM::t2PLDi12:
+ Inst.setOpcode(ARM::t2PLDpci);
+ break;
+ case ARM::t2PLIi12:
+ Inst.setOpcode(ARM::t2PLIpci);
+ break;
+ default:
+ return MCDisassembler::Fail;
+ }
+ return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+ }
+
+ // Target register 15: some load opcodes are re-mapped to preloads.
+ if (Rt == 15) {
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDRSHi12:
+ return MCDisassembler::Fail;
+ case ARM::t2LDRHi12:
+ Inst.setOpcode(ARM::t2PLDWi12);
+ break;
+ case ARM::t2LDRSBi12:
+ Inst.setOpcode(ARM::t2PLIi12);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Preload opcodes have no Rt operand; every other opcode emits Rt first.
+ switch (Inst.getOpcode()) {
+ case ARM::t2PLDi12:
+ break;
+ case ARM::t2PLIi12:
+ if (!hasV7Ops)
+ return MCDisassembler::Fail;
+ break;
+ case ARM::t2PLDWi12:
+ if (!hasV7Ops || !hasMP)
+ return MCDisassembler::Fail;
+ break;
+ default:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!Check(S, DecodeT2AddrModeImm12(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ return S;
+ }
+
+ // Decoder for the t2LDR*T opcodes. With Rn == 15 the opcode is replaced by the
+ // matching "pci" opcode and decoding is delegated to DecodeT2LoadLabel.
+ // Otherwise Rt (rGPR) is emitted, followed by the imm8 address operands
+ // produced by DecodeT2AddrModeImm8.
+ static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void* Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+   const unsigned TargetReg = fieldFromInstruction(Insn, 12, 4);
+   // imm8 in bits [7:0], Rn in bits [12:9]; bit 8 is left clear here.
+   const unsigned AddrMode = fieldFromInstruction(Insn, 0, 8) | (BaseReg << 9);
+
+   if (BaseReg == 15) {
+     unsigned LiteralOpc;
+     switch (Inst.getOpcode()) {
+     case ARM::t2LDRT:   LiteralOpc = ARM::t2LDRpci;   break;
+     case ARM::t2LDRBT:  LiteralOpc = ARM::t2LDRBpci;  break;
+     case ARM::t2LDRHT:  LiteralOpc = ARM::t2LDRHpci;  break;
+     case ARM::t2LDRSBT: LiteralOpc = ARM::t2LDRSBpci; break;
+     case ARM::t2LDRSHT: LiteralOpc = ARM::t2LDRSHpci; break;
+     default:
+       return MCDisassembler::Fail;
+     }
+     Inst.setOpcode(LiteralOpc);
+     return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+   }
+
+   if (!Check(Status, DecoderGPRRegisterClass(Inst, TargetReg, Address, Decoder)) ||
+       !Check(Status, DecodeT2AddrModeImm8(Inst, AddrMode, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   return Status;
+ }
+
+ // Decoder for the Thumb2 "pci" load/preload opcodes. With Rt == 15,
+ // t2LDRBpci/t2LDRHpci become t2PLDpci, t2LDRSBpci becomes t2PLIpci and
+ // t2LDRSHpci is rejected. The preload opcodes get no Rt operand, and t2PLIpci
+ // requires HasV7Ops. The 12-bit immediate is negated when Insn bit 23 is
+ // clear, with INT32_MIN standing in for "#-0".
+ static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void* Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned TargetReg = fieldFromInstruction(Insn, 12, 4);
+   const bool AddOffset = fieldFromInstruction(Insn, 23, 1) != 0;
+   int Offset = fieldFromInstruction(Insn, 0, 12);
+
+   const MCDisassembler *Dis = static_cast<const MCDisassembler *>(Decoder);
+   const FeatureBitset &Features = Dis->getSubtargetInfo().getFeatureBits();
+   const bool HasV7 = Features[ARM::HasV7Ops];
+
+   if (TargetReg == 15) {
+     const unsigned Opc = Inst.getOpcode();
+     if (Opc == ARM::t2LDRSHpci)
+       return MCDisassembler::Fail;
+     if (Opc == ARM::t2LDRBpci || Opc == ARM::t2LDRHpci)
+       Inst.setOpcode(ARM::t2PLDpci);
+     else if (Opc == ARM::t2LDRSBpci)
+       Inst.setOpcode(ARM::t2PLIpci);
+   }
+
+   // Preloads carry no target register operand.
+   switch (Inst.getOpcode()) {
+   case ARM::t2PLDpci:
+     break;
+   case ARM::t2PLIpci:
+     if (!HasV7)
+       return MCDisassembler::Fail;
+     break;
+   default:
+     if (!Check(Status,
+                DecodeGPRRegisterClass(Inst, TargetReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   }
+
+   // Special case for #-0.
+   if (!AddOffset)
+     Offset = (Offset == 0) ? INT32_MIN : -Offset;
+   Inst.addOperand(MCOperand::createImm(Offset));
+
+   return Status;
+ }
+
+ // Signed, scaled-by-4 imm8 operand. Val[7:0] is the magnitude and Val bit 8
+ // selects addition; a Val of 0 is emitted as INT32_MIN. Always succeeds.
+ static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
+                            uint64_t Address, const void *Decoder) {
+   int Offset;
+   if (Val == 0) {
+     Offset = INT32_MIN;
+   } else {
+     int Magnitude = Val & 0xFF;
+     const bool Subtract = (Val & 0x100) == 0;
+     if (Subtract)
+       Magnitude = -Magnitude;
+     Offset = Magnitude * 4;
+   }
+   Inst.addOperand(MCOperand::createImm(Offset));
+
+   return MCDisassembler::Success;
+ }
+
+ // Address with a scaled imm8 offset: Val packs Rn in bits [12:9] and the
+ // 9-bit sign+magnitude field in bits [8:0], which is handed to
+ // DecodeT2Imm8S4.
+ static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
+                                    uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned BaseReg = fieldFromInstruction(Val, 9, 4);
+   const unsigned OffsetBits = fieldFromInstruction(Val, 0, 9);
+
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+       !Check(Status, DecodeT2Imm8S4(Inst, OffsetBits, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   return Status;
+ }
+
+ // Address with an unsigned imm8 offset: Val packs Rn in bits [11:8] and the
+ // immediate in bits [7:0]. Rn is decoded as GPRnopc and the immediate is
+ // emitted unscaled.
+ static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
+                                    uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned BaseReg = fieldFromInstruction(Val, 8, 4);
+   const unsigned Offset = fieldFromInstruction(Val, 0, 8);
+
+   const DecodeStatus BaseStatus =
+       DecodeGPRnopcRegisterClass(Inst, BaseReg, Address, Decoder);
+   if (!Check(Status, BaseStatus))
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createImm(Offset));
+   return Status;
+ }
+
+ // Signed imm8 operand. Val[7:0] is the magnitude and Val bit 8 selects
+ // addition; a Val of 0 is emitted as INT32_MIN. Always succeeds.
+ static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
+                          uint64_t Address, const void *Decoder) {
+   const int Magnitude = Val & 0xFF;
+   const bool AddBit = (Val & 0x100) != 0;
+
+   int Offset;
+   if (Val == 0)
+     Offset = INT32_MIN;
+   else if (AddBit)
+     Offset = Magnitude;
+   else
+     Offset = -Magnitude;
+
+   Inst.addOperand(MCOperand::createImm(Offset));
+   return MCDisassembler::Success;
+ }
+
+
+ // Address with a signed imm8 offset: Val packs Rn in bits [12:9] and the
+ // 9-bit sign+magnitude field in bits [8:0]. The store opcodes listed below
+ // reject Rn == 15, and the "T" load/store opcodes force the add bit (0x100)
+ // before the offset is handed to DecodeT2Imm8.
+ static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned BaseReg = fieldFromInstruction(Val, 9, 4);
+   unsigned OffsetBits = fieldFromInstruction(Val, 0, 9);
+   const unsigned Opc = Inst.getOpcode();
+
+   // Thumb stores cannot use PC as dest register.
+   const bool IsStore = Opc == ARM::t2STRT || Opc == ARM::t2STRBT ||
+                        Opc == ARM::t2STRHT || Opc == ARM::t2STRi8 ||
+                        Opc == ARM::t2STRHi8 || Opc == ARM::t2STRBi8;
+   if (IsStore && BaseReg == 15)
+     return MCDisassembler::Fail;
+
+   // Some instructions always use an additive offset.
+   const bool AlwaysAdds = Opc == ARM::t2LDRT || Opc == ARM::t2LDRBT ||
+                           Opc == ARM::t2LDRHT || Opc == ARM::t2LDRSBT ||
+                           Opc == ARM::t2LDRSHT || Opc == ARM::t2STRT ||
+                           Opc == ARM::t2STRBT || Opc == ARM::t2STRHT;
+   if (AlwaysAdds)
+     OffsetBits |= 0x100;
+
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+       !Check(Status, DecodeT2Imm8(Inst, OffsetBits, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   return Status;
+ }
+
+ // Decoder for the Thumb2 pre/post-indexed loads and stores. With Rn == 15 the
+ // opcode is replaced by a "pci" opcode (t2PLIpci for LDRSB with Rt == 15) and
+ // decoding is delegated to DecodeT2LoadLabel. Otherwise loads (Insn bit 20
+ // set) emit Rt then Rn and stores emit Rn then Rt, followed by the imm8
+ // address operands.
+ static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned TargetReg = fieldFromInstruction(Insn, 12, 4);
+   const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+   // imm8 in bits [7:0], Insn bit 9 in bit 8, Rn in bits [12:9].
+   const unsigned AddrMode = fieldFromInstruction(Insn, 0, 8) |
+                             (fieldFromInstruction(Insn, 9, 1) << 8) |
+                             (BaseReg << 9);
+   const bool IsLoad = fieldFromInstruction(Insn, 20, 1) != 0;
+
+   if (BaseReg == 15) {
+     unsigned LiteralOpc;
+     switch (Inst.getOpcode()) {
+     case ARM::t2LDR_PRE:
+     case ARM::t2LDR_POST:
+       LiteralOpc = ARM::t2LDRpci;
+       break;
+     case ARM::t2LDRB_PRE:
+     case ARM::t2LDRB_POST:
+       LiteralOpc = ARM::t2LDRBpci;
+       break;
+     case ARM::t2LDRH_PRE:
+     case ARM::t2LDRH_POST:
+       LiteralOpc = ARM::t2LDRHpci;
+       break;
+     case ARM::t2LDRSB_PRE:
+     case ARM::t2LDRSB_POST:
+       LiteralOpc = (TargetReg == 15) ? ARM::t2PLIpci : ARM::t2LDRSBpci;
+       break;
+     case ARM::t2LDRSH_PRE:
+     case ARM::t2LDRSH_POST:
+       LiteralOpc = ARM::t2LDRSHpci;
+       break;
+     default:
+       return MCDisassembler::Fail;
+     }
+     Inst.setOpcode(LiteralOpc);
+     return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+   }
+
+   // The two register operands swap places between loads and stores.
+   const unsigned FirstReg = IsLoad ? TargetReg : BaseReg;
+   const unsigned SecondReg = IsLoad ? BaseReg : TargetReg;
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, FirstReg, Address, Decoder)) ||
+       !Check(Status, DecodeGPRRegisterClass(Inst, SecondReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   if (!Check(Status, DecodeT2AddrModeImm8(Inst, AddrMode, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   return Status;
+ }
+
+ // Address with an unsigned imm12 offset: Val packs Rn in bits [16:13] and the
+ // immediate in bits [11:0]. The i12 store opcodes reject Rn == 15.
+ static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned BaseReg = fieldFromInstruction(Val, 13, 4);
+   const unsigned Offset = fieldFromInstruction(Val, 0, 12);
+
+   // Thumb stores cannot use PC as dest register.
+   const unsigned Opc = Inst.getOpcode();
+   const bool IsStore = Opc == ARM::t2STRi12 || Opc == ARM::t2STRBi12 ||
+                        Opc == ARM::t2STRHi12;
+   if (IsStore && BaseReg == 15)
+     return MCDisassembler::Fail;
+
+   if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createImm(Offset));
+   return Status;
+ }
+
+
+ // Emits SP twice (two register operands) followed by the 7-bit immediate
+ // taken from Insn[6:0]. Always succeeds.
+ static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn,
+                                 uint64_t Address, const void *Decoder) {
+   const unsigned Immediate = fieldFromInstruction(Insn, 0, 7);
+
+   for (unsigned Slot = 0; Slot < 2; ++Slot)
+     Inst.addOperand(MCOperand::createReg(ARM::SP));
+   Inst.addOperand(MCOperand::createImm(Immediate));
+
+   return MCDisassembler::Success;
+ }
+
+ // Decoder for the two register forms of "add with SP":
+ //  - tADDrSP: emits Rdm, SP, Rdm, where Rdm is Insn[2:0] with Insn[7] as its
+ //    fourth bit.
+ //  - tADDspr: emits SP, SP, Rm, where Rm is Insn[6:3].
+ // Any other opcode adds no operands and returns Success.
+ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   switch (Inst.getOpcode()) {
+   case ARM::tADDrSP: {
+     const unsigned DestSrcReg = fieldFromInstruction(Insn, 0, 3) |
+                                 (fieldFromInstruction(Insn, 7, 1) << 3);
+
+     if (!Check(Status,
+                DecodeGPRRegisterClass(Inst, DestSrcReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     Inst.addOperand(MCOperand::createReg(ARM::SP));
+     if (!Check(Status,
+                DecodeGPRRegisterClass(Inst, DestSrcReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   }
+   case ARM::tADDspr: {
+     const unsigned SrcReg = fieldFromInstruction(Insn, 3, 4);
+
+     Inst.addOperand(MCOperand::createReg(ARM::SP));
+     Inst.addOperand(MCOperand::createReg(ARM::SP));
+     if (!Check(Status, DecodeGPRRegisterClass(Inst, SrcReg, Address, Decoder)))
+       return MCDisassembler::Fail;
+     break;
+   }
+   default:
+     break;
+   }
+
+   return Status;
+ }
+
+ // Emits two immediates: imod (Insn bit 4 with bit 1 forced on) and the flags
+ // field Insn[2:0]. Always succeeds.
+ static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
+                              uint64_t Address, const void *Decoder) {
+   const unsigned ModeBits = fieldFromInstruction(Insn, 4, 1) | 0x2;
+   const unsigned FlagBits = fieldFromInstruction(Insn, 0, 3);
+
+   const MCOperand ModeOp = MCOperand::createImm(ModeBits);
+   const MCOperand FlagsOp = MCOperand::createImm(FlagBits);
+   Inst.addOperand(ModeOp);
+   Inst.addOperand(FlagsOp);
+
+   return MCDisassembler::Success;
+ }
+
+ // Post-index register operand: emits Rm (Insn[3:0], decoded as GPRnopc)
+ // followed by the add/subtract bit (Insn[4]) as an immediate.
+ static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
+                              uint64_t Address, const void *Decoder) {
+   DecodeStatus Status = MCDisassembler::Success;
+
+   const unsigned IndexReg = fieldFromInstruction(Insn, 0, 4);
+   const unsigned AddBit = fieldFromInstruction(Insn, 4, 1);
+
+   const DecodeStatus RegStatus =
+       DecodeGPRnopcRegisterClass(Inst, IndexReg, Address, Decoder);
+   if (!Check(Status, RegStatus))
+     return MCDisassembler::Fail;
+
+   Inst.addOperand(MCOperand::createImm(AddBit));
+   return Status;
+ }
+
+ // BLX target operand. The symbolic form is tried first, with the target
+ // computed from the 4-byte-aligned address; the raw signed offset is added as
+ // an immediate only when that fails.
+ static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val,
+                                    uint64_t Address, const void *Decoder) {
+   // Val arrives as S:J1:J2:imm10H:imm10L:'0' (one trailing zero, not two),
+   // with J1 and J2 exactly as encoded. They are converted with
+   //   I1 = NOT(J1 EOR S);
+   //   I2 = NOT(J2 EOR S);
+   // and the documented value is rebuilt with two trailing zeros:
+   //   imm32 = SignExtend(S:I1:I2:imm10H:imm10L:'00', 32);
+   const unsigned SignBit = (Val >> 23) & 1;
+   const unsigned J1 = (Val >> 22) & 1;
+   const unsigned J2 = (Val >> 21) & 1;
+   const unsigned I1 = (J1 == SignBit) ? 1u : 0u;
+   const unsigned I2 = (J2 == SignBit) ? 1u : 0u;
+   const unsigned Rebuilt = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+   const int Offset = SignExtend32<25>(Rebuilt << 1);
+
+   const bool AddedSymbol = tryAddingSymbolicOperand(
+       Address, (Address & ~2u) + Offset + 4, true, 4, Inst, Decoder);
+   if (!AddedSymbol)
+     Inst.addOperand(MCOperand::createImm(Offset));
+   return MCDisassembler::Success;
+ }
+
+// Decodes a coprocessor-number operand. Numbers 10 and 11 are always
+// rejected; when the subtarget has HasV8Ops only 14 and 15 are accepted.
+// Accepted values are appended to Inst as an immediate.
+static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder) {
+  switch (Val) {
+  case 0xA:
+  case 0xB:
+    return MCDisassembler::Fail;
+  default:
+    break;
+  }
+
+  const FeatureBitset &Features =
+      static_cast<const MCDisassembler *>(Decoder)
+          ->getSubtargetInfo()
+          .getFeatureBits();
+
+  if (Features[ARM::HasV8Ops] && Val != 14 && Val != 15)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
+// Decodes the two register operands of a Thumb table branch: the register in
+// Insn{19-16} (via DecodeGPRRegisterClass) followed by the one in Insn{3-0}
+// (via DecoderGPRRegisterClass).
+static DecodeStatus
+DecodeThumbTableBranch(MCInst &Inst, unsigned Insn,
+                       uint64_t Address, const void *Decoder) {
+  const unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  const unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  // NOTE(review): Rn is a raw 4-bit encoding field but is compared against
+  // the ARM::SP register enumerator -- confirm this is the intended check.
+  DecodeStatus Status =
+      Rn == ARM::SP ? MCDisassembler::SoftFail : MCDisassembler::Success;
+
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)) ||
+      !Check(Status, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Status;
+}
+
+// Decodes a Thumb2 conditional-branch encoding.
+//
+// When the 4-bit condition field Insn{25-22} is 0xE or 0xF the word is not
+// treated as a branch: Insn{31-4} must match one of three barrier patterns
+// (t2DSB / t2DMB / t2ISB), the opcode on Inst is replaced accordingly and
+// Insn{3-0} is decoded as the barrier option. Any other pattern fails.
+//
+// Otherwise the branch target is reassembled from its scattered fields and
+// emitted, followed by the predicate operand.
+static DecodeStatus
+DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  const unsigned Cond = fieldFromInstruction(Insn, 22, 4);
+
+  if (Cond != 0xE && Cond != 0xF) {
+    DecodeStatus Status = MCDisassembler::Success;
+
+    const unsigned Target = (fieldFromInstruction(Insn, 0, 11) << 1) |
+                            (fieldFromInstruction(Insn, 16, 6) << 12) |
+                            (fieldFromInstruction(Insn, 13, 1) << 18) |
+                            (fieldFromInstruction(Insn, 11, 1) << 19) |
+                            (fieldFromInstruction(Insn, 26, 1) << 20);
+
+    if (!Check(Status, DecodeT2BROperand(Inst, Target, Address, Decoder)) ||
+        !Check(Status, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+      return MCDisassembler::Fail;
+
+    return Status;
+  }
+
+  // Barrier instructions hiding in the conditional-branch encoding space.
+  switch (fieldFromInstruction(Insn, 4, 28)) {
+  case 0xf3bf8f4:
+    Inst.setOpcode(ARM::t2DSB);
+    break;
+  case 0xf3bf8f5:
+    Inst.setOpcode(ARM::t2DMB);
+    break;
+  case 0xf3bf8f6:
+    Inst.setOpcode(ARM::t2ISB);
+    break;
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  const unsigned Option = fieldFromInstruction(Insn, 0, 4);
+  return DecodeMemBarrierOption(Inst, Option, Address, Decoder);
+}
+
+// Decode a Thumb2 "modified immediate" operand. Val holds a 12-bit encoding:
+// when Val{11-10} is zero, Val{9-8} picks one of four byte-splat patterns of
+// the 8-bit value Val{7-0}; otherwise Val{11-7} is a right-rotation amount
+// applied to the 8-bit value 1:Val{6-0}. The expanded 32-bit value is
+// appended to Inst as an immediate.
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  if (fieldFromInstruction(Val, 10, 2) != 0) {
+    // Rotated form: the top bit of the byte is implicit.
+    const unsigned Byte = fieldFromInstruction(Val, 0, 7) | 0x80;
+    const unsigned Amount = fieldFromInstruction(Val, 7, 5);
+    const unsigned Rotated = (Byte >> Amount) | (Byte << ((32 - Amount) & 31));
+    Inst.addOperand(MCOperand::createImm(Rotated));
+    return MCDisassembler::Success;
+  }
+
+  // Splat form.
+  const unsigned Pattern = fieldFromInstruction(Val, 8, 2);
+  const unsigned Byte = fieldFromInstruction(Val, 0, 8);
+  switch (Pattern) {
+  case 0: // 00 00 00 XY
+    Inst.addOperand(MCOperand::createImm(Byte));
+    break;
+  case 1: // 00 XY 00 XY
+    Inst.addOperand(MCOperand::createImm((Byte << 16) | Byte));
+    break;
+  case 2: // XY 00 XY 00
+    Inst.addOperand(MCOperand::createImm((Byte << 24) | (Byte << 8)));
+    break;
+  case 3: // XY XY XY XY
+    Inst.addOperand(MCOperand::createImm((Byte << 24) | (Byte << 16) |
+                                         (Byte << 8) | Byte));
+    break;
+  }
+
+  return MCDisassembler::Success;
+}
+
+// Decodes the target of a 16-bit Thumb conditional branch. The offset is Val
+// shifted left by one and sign-extended from 9 bits; a symbolic operand for
+// Address + offset + 4 is tried first, with the raw offset as the fallback.
+static DecodeStatus
+DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
+                            uint64_t Address, const void *Decoder){
+  const int Offset = SignExtend32<9>(Val << 1);
+
+  const bool AddedSymbol = tryAddingSymbolicOperand(
+      Address, Address + Offset + 4, true, 2, Inst, Decoder);
+  if (!AddedSymbol)
+    Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+// Decodes the branch offset of a Thumb BL.
+//
+// Val arrives as S:J1:J2:imm10:imm11 (no trailing zero after imm11), with
+// J1/J2 taken straight from the encoding. They are converted to
+//   I1 = NOT(J1 EOR S),  I2 = NOT(J2 EOR S)
+// and the final offset is
+//   imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32).
+// A symbolic operand is tried first; the raw immediate is the fallback.
+static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
+                                               uint64_t Address, const void *Decoder){
+  const unsigned SignBit = (Val >> 23) & 1;
+  const unsigned J1 = (Val >> 22) & 1;
+  const unsigned J2 = (Val >> 21) & 1;
+  // For single-bit values NOT(J EOR S) is simply "J equals S".
+  const unsigned I1 = (J1 == SignBit);
+  const unsigned I2 = (J2 == SignBit);
+
+  // Swap the J bits for the I bits, then append the trailing zero.
+  const unsigned Fixed = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+  const int imm32 = SignExtend32<25>(Fixed << 1);
+
+  const bool AddedSymbol = tryAddingSymbolicOperand(
+      Address, Address + imm32 + 4, true, 4, Inst, Decoder);
+  if (!AddedSymbol)
+    Inst.addOperand(MCOperand::createImm(imm32));
+
+  return MCDisassembler::Success;
+}
+
+// Decodes a memory-barrier option operand. Only values that fit in four bits
+// are accepted; the value is appended to Inst unchanged.
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
+                                           uint64_t Address, const void *Decoder) {
+  const bool FitsInNibble = Val <= 0xf;
+  if (!FitsInNibble)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
+// Decodes an instruction-synchronization-barrier option operand. Only values
+// that fit in four bits are accepted; the value is appended to Inst unchanged.
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val,
+                                                uint64_t Address, const void *Decoder) {
+  const bool FitsInNibble = Val <= 0xf;
+  if (!FitsInNibble)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
+// Validates Val as an MSR mask / SYSm operand and appends it, unchanged, to
+// Inst as an immediate.
+//  - Without FeatureMClass (A/R class): only Val == 0 is rejected.
+//  - With FeatureMClass: the low 8 bits of Val (SYSm) must name a register
+//    permitted by the subtarget features, otherwise decoding fails. For the
+//    t2MSR_M opcode the 2-bit mask in Val{11-10} is checked as well and can
+//    downgrade the result to SoftFail.
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  const FeatureBitset &FeatureBits =
+      static_cast<const MCDisassembler *>(Decoder)
+          ->getSubtargetInfo()
+          .getFeatureBits();
+
+  // A/R class
+  if (!FeatureBits[ARM::FeatureMClass]) {
+    if (Val == 0)
+      return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::createImm(Val));
+    return MCDisassembler::Success;
+  }
+
+  DecodeStatus Status = MCDisassembler::Success;
+  const unsigned SysM = Val & 0xff;
+
+  // Validate the SYSm value first.
+  switch (SysM) {
+  case 0:  // apsr
+  case 1:  // iapsr
+  case 2:  // eapsr
+  case 3:  // xpsr
+  case 5:  // ipsr
+  case 6:  // epsr
+  case 7:  // iepsr
+  case 8:  // msp
+  case 9:  // psp
+  case 16: // primask
+  case 20: // control
+    break;
+  case 17: // basepri
+  case 18: // basepri_max
+  case 19: // faultmask
+    // Values basepri, basepri_max and faultmask are only valid for v7m.
+    if (!FeatureBits[ARM::HasV7Ops])
+      return MCDisassembler::Fail;
+    break;
+  case 0x8a: // msplim_ns
+  case 0x8b: // psplim_ns
+  case 0x91: // basepri_ns
+  case 0x92: // basepri_max_ns
+  case 0x93: // faultmask_ns
+    if (!FeatureBits[ARM::HasV8MMainlineOps])
+      return MCDisassembler::Fail;
+    // These also need the security-extension check below.
+    LLVM_FALLTHROUGH;
+  case 10:   // msplim
+  case 11:   // psplim
+  case 0x88: // msp_ns
+  case 0x89: // psp_ns
+  case 0x90: // primask_ns
+  case 0x94: // control_ns
+  case 0x98: // sp_ns
+    if (!FeatureBits[ARM::Feature8MSecExt])
+      return MCDisassembler::Fail;
+    break;
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  if (Inst.getOpcode() == ARM::t2MSR_M) {
+    const unsigned Mask = fieldFromInstruction(Val, 10, 2);
+    bool Unpredictable;
+    if (!FeatureBits[ARM::HasV7Ops]) {
+      // The ARMv6-M MSR bits {11-10} can be only 0b10, other values are
+      // unpredictable.
+      Unpredictable = Mask != 2;
+    } else {
+      // The ARMv7-M architecture stores an additional 2-bit mask value in
+      // MSR bits {11-10}. The mask is used only with apsr, iapsr, eapsr and
+      // xpsr, it has to be 0b10 in other cases. Bit mask{1} indicates if
+      // the NZCVQ bits should be moved by the instruction. Bit mask{0}
+      // indicates the move for the GE{3:0} bits, the mask{0} bit can be set
+      // only if the processor includes the DSP extension.
+      Unpredictable = Mask == 0 || (Mask != 2 && SysM > 3) ||
+                      (!FeatureBits[ARM::FeatureDSP] && (Mask & 1));
+    }
+    if (Unpredictable)
+      Status = MCDisassembler::SoftFail;
+  }
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return Status;
+}
+
+// Decodes a banked-register operand. Val{5} is the R bit and Val{4-0} the
+// SYSm field. Combinations listed below as UNPREDICTABLE return SoftFail
+// without appending an operand; everything else appends Val as an immediate.
+static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Val,
+                                    uint64_t Address, const void *Decoder) {
+  const unsigned R = fieldFromInstruction(Val, 5, 1);
+  const unsigned SysM = fieldFromInstruction(Val, 0, 5);
+
+  // The table of encodings for these banked registers comes from B9.2.3 of the
+  // ARM ARM. There are patterns, but nothing regular enough to make this logic
+  // neater. So by fiat, these values are UNPREDICTABLE:
+  bool Unpredictable;
+  if (R) {
+    // With R set only this short list of SYSm values is accepted.
+    switch (SysM) {
+    case 0xe:
+    case 0x10:
+    case 0x12:
+    case 0x14:
+    case 0x16:
+    case 0x1c:
+    case 0x1e:
+      Unpredictable = false;
+      break;
+    default:
+      Unpredictable = true;
+      break;
+    }
+  } else {
+    // With R clear everything is accepted except these holes.
+    switch (SysM) {
+    case 0x7:
+    case 0xf:
+    case 0x18:
+    case 0x19:
+    case 0x1a:
+    case 0x1b:
+      Unpredictable = true;
+      break;
+    default:
+      Unpredictable = false;
+      break;
+    }
+  }
+
+  if (Unpredictable)
+    return MCDisassembler::SoftFail;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
+// Decodes a load into a register pair. Operands, in order: the GPR pair
+// selected by Insn{15-12}, the base register Insn{19-16}, and the predicate
+// Insn{31-28}. A base register of 0xF is flagged SoftFail.
+static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  const unsigned PairReg = fieldFromInstruction(Insn, 12, 4);
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+
+  DecodeStatus Status =
+      BaseReg == 0xF ? MCDisassembler::SoftFail : MCDisassembler::Success;
+
+  if (!Check(Status,
+             DecodeGPRPairRegisterClass(Inst, PairReg, Address, Decoder)) ||
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+      !Check(Status, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Status;
+}
+
+// Decodes a store of a register pair that also writes a status register.
+// Operands, in order: Rd = Insn{15-12} (GPRnopc), the GPR pair selected by
+// Rt = Insn{3-0}, the base register Rn = Insn{19-16}, and the predicate
+// Insn{31-28}. Rn == 0xF, or Rd matching Rn, Rt or Rt+1, is flagged SoftFail.
+static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder){
+  DecodeStatus Status = MCDisassembler::Success;
+
+  const unsigned StatusReg = fieldFromInstruction(Insn, 12, 4);
+  const unsigned PairReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+
+  if (!Check(Status,
+             DecodeGPRnopcRegisterClass(Inst, StatusReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // The overlap test runs after Rd is decoded, so it overrides whatever
+  // status that decode produced.
+  const bool Overlaps = StatusReg == BaseReg || StatusReg == PairReg ||
+                        StatusReg == PairReg + 1;
+  if (BaseReg == 0xF || Overlaps)
+    Status = MCDisassembler::SoftFail;
+
+  if (!Check(Status,
+             DecodeGPRPairRegisterClass(Inst, PairReg, Address, Decoder)) ||
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+      !Check(Status, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Status;
+}
+
+// Decodes a pre-indexed LDR with immediate offset. Operands, in order:
+// Rt = Insn{15-12}, Rn = Insn{19-16} (writeback), an imm12 address operand
+// packed as Rn:Insn{23}:Insn{11-0}, and the predicate Insn{31-28}.
+// Rn == 0xF or Rn == Rt is flagged SoftFail.
+static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned DestReg = fieldFromInstruction(Insn, 12, 4);
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  const unsigned AddrBits = fieldFromInstruction(Insn, 0, 12) |
+                            (fieldFromInstruction(Insn, 23, 1) << 12) |
+                            (BaseReg << 13);
+
+  DecodeStatus Status = (BaseReg == 0xF || BaseReg == DestReg)
+                            ? MCDisassembler::SoftFail
+                            : MCDisassembler::Success;
+
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, DestReg, Address, Decoder)) ||
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+      !Check(Status,
+             DecodeAddrModeImm12Operand(Inst, AddrBits, Address, Decoder)) ||
+      !Check(Status, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Status;
+}
+
+// Decodes a pre-indexed LDR with (shifted) register offset. Operands, in
+// order: Rt = Insn{15-12}, Rn = Insn{19-16} (writeback), a shifted-register
+// memory operand packed as Rn:Insn{23}:Insn{11-0}, and the predicate
+// Insn{31-28}. Rn == 0xF, Rn == Rt or Rm (Insn{3-0}) == 0xF is flagged
+// SoftFail.
+static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned DestReg = fieldFromInstruction(Insn, 12, 4);
+  const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  const unsigned AddrBits = fieldFromInstruction(Insn, 0, 12) |
+                            (fieldFromInstruction(Insn, 23, 1) << 12) |
+                            (BaseReg << 13);
+
+  DecodeStatus Status = MCDisassembler::Success;
+  if (BaseReg == 0xF || BaseReg == DestReg || OffsetReg == 0xF)
+    Status = MCDisassembler::SoftFail;
+
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, DestReg, Address, Decoder)) ||
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+      !Check(Status, DecodeSORegMemOperand(Inst, AddrBits, Address, Decoder)) ||
+      !Check(Status, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Status;
+}
+
+
+// Decodes a pre-indexed STR with immediate offset. Operands, in order:
+// Rn = Insn{19-16} (writeback), Rt = Insn{15-12}, an imm12 address operand
+// packed as Rn:Insn{23}:Insn{11-0}, and the predicate Insn{31-28}.
+// Rn == 0xF or Rn == Rt is flagged SoftFail.
+static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned SrcReg = fieldFromInstruction(Insn, 12, 4);
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  const unsigned AddrBits = fieldFromInstruction(Insn, 0, 12) |
+                            (fieldFromInstruction(Insn, 23, 1) << 12) |
+                            (BaseReg << 13);
+
+  DecodeStatus Status = (BaseReg == 0xF || BaseReg == SrcReg)
+                            ? MCDisassembler::SoftFail
+                            : MCDisassembler::Success;
+
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+      !Check(Status, DecodeGPRRegisterClass(Inst, SrcReg, Address, Decoder)) ||
+      !Check(Status,
+             DecodeAddrModeImm12Operand(Inst, AddrBits, Address, Decoder)) ||
+      !Check(Status, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Status;
+}
+
+// Decodes a pre-indexed STR with (shifted) register offset. Operands, in
+// order: Rn = Insn{19-16} (writeback), Rt = Insn{15-12}, a shifted-register
+// memory operand packed as Rn:Insn{23}:Insn{11-0}, and the predicate
+// Insn{31-28}. Rn == 0xF or Rn == Rt is flagged SoftFail.
+static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned SrcReg = fieldFromInstruction(Insn, 12, 4);
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  const unsigned AddrBits = fieldFromInstruction(Insn, 0, 12) |
+                            (fieldFromInstruction(Insn, 23, 1) << 12) |
+                            (BaseReg << 13);
+
+  DecodeStatus Status = (BaseReg == 0xF || BaseReg == SrcReg)
+                            ? MCDisassembler::SoftFail
+                            : MCDisassembler::Success;
+
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)) ||
+      !Check(Status, DecodeGPRRegisterClass(Inst, SrcReg, Address, Decoder)) ||
+      !Check(Status, DecodeSORegMemOperand(Inst, AddrBits, Address, Decoder)) ||
+      !Check(Status, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Status;
+}
+
+// Decoder for the VLD1LN (one D register, single lane) load forms.
+//
+// The size field Insn{11-10} determines where the lane index and alignment
+// bits live inside Insn{7-4}; encodings marked UNDEFINED below, and size 3,
+// fail to decode.
+//
+// Operand order: Dd, [Rn writeback], Rn, alignment, [Rm | reg 0], Dd, lane.
+// The bracketed operands exist only when Rm != 0xF; Rm == 0xD is emitted as
+// register 0.
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned VecReg = fieldFromInstruction(Insn, 12, 4) |
+                          (fieldFromInstruction(Insn, 22, 1) << 4);
+  const bool HasWriteback = OffsetReg != 0xF;
+
+  unsigned Alignment = 0;
+  unsigned Lane = 0;
+  switch (fieldFromInstruction(Insn, 10, 2)) {
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 5, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 2;
+    break;
+  case 2: {
+    if (fieldFromInstruction(Insn, 6, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 7, 1);
+
+    // Only 0b00 (no alignment) and 0b11 (alignment 4) are accepted.
+    const unsigned AlignBits = fieldFromInstruction(Insn, 4, 2);
+    if (AlignBits == 3)
+      Alignment = 4;
+    else if (AlignBits != 0)
+      return MCDisassembler::Fail;
+    break;
+  }
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (HasWriteback && // Writeback
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Alignment));
+  if (HasWriteback) {
+    if (OffsetReg == 0xD)
+      Inst.addOperand(MCOperand::createReg(0));
+    else if (!Check(Status, DecodeGPRRegisterClass(Inst, OffsetReg, Address,
+                                                   Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Lane));
+
+  return Status;
+}
+
+// Decoder for the VST1LN (one D register, single lane) store forms.
+//
+// The size field Insn{11-10} determines where the lane index and alignment
+// bits live inside Insn{7-4}; encodings marked UNDEFINED below, and size 3,
+// fail to decode.
+//
+// Operand order: [Rn writeback], Rn, alignment, [Rm | reg 0], Dd, lane.
+// The bracketed operands exist only when Rm != 0xF; Rm == 0xD is emitted as
+// register 0.
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned VecReg = fieldFromInstruction(Insn, 12, 4) |
+                          (fieldFromInstruction(Insn, 22, 1) << 4);
+  const bool HasWriteback = OffsetReg != 0xF;
+
+  unsigned Alignment = 0;
+  unsigned Lane = 0;
+  switch (fieldFromInstruction(Insn, 10, 2)) {
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 5, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 2;
+    break;
+  case 2: {
+    if (fieldFromInstruction(Insn, 6, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 7, 1);
+
+    // Only 0b00 (no alignment) and 0b11 (alignment 4) are accepted.
+    const unsigned AlignBits = fieldFromInstruction(Insn, 4, 2);
+    if (AlignBits == 3)
+      Alignment = 4;
+    else if (AlignBits != 0)
+      return MCDisassembler::Fail;
+    break;
+  }
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  if (HasWriteback && // Writeback
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Alignment));
+  if (HasWriteback) {
+    if (OffsetReg == 0xD)
+      Inst.addOperand(MCOperand::createReg(0));
+    else if (!Check(Status, DecodeGPRRegisterClass(Inst, OffsetReg, Address,
+                                                   Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Lane));
+
+  return Status;
+}
+
+
+// Decoder for the VLD2LN (two D registers, single lane) load forms.
+//
+// The size field Insn{11-10} determines where the lane index, alignment and
+// register-spacing bits live inside Insn{7-4}; the encoding marked UNDEFINED
+// below, and size 3, fail to decode. The two registers are Dd and Dd+Stride,
+// where Stride is 1 or 2.
+//
+// Operand order: Dd, Dd+Stride, [Rn writeback], Rn, alignment,
+// [Rm | reg 0], Dd, Dd+Stride, lane. The bracketed operands exist only when
+// Rm != 0xF; Rm == 0xD is emitted as register 0.
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned VecReg = fieldFromInstruction(Insn, 12, 4) |
+                          (fieldFromInstruction(Insn, 22, 1) << 4);
+  const bool HasWriteback = OffsetReg != 0xF;
+
+  unsigned Alignment = 0;
+  unsigned Lane = 0;
+  unsigned Stride = 1;
+  switch (fieldFromInstruction(Insn, 10, 2)) {
+  case 0:
+    Lane = fieldFromInstruction(Insn, 5, 3);
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 2;
+    break;
+  case 1:
+    Lane = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 4;
+    if (fieldFromInstruction(Insn, 5, 1))
+      Stride = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 5, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 4, 1) != 0)
+      Alignment = 8;
+    if (fieldFromInstruction(Insn, 6, 1))
+      Stride = 2;
+    break;
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  // Destination register list.
+  for (unsigned I = 0; I != 2; ++I)
+    if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg + I * Stride,
+                                              Address, Decoder)))
+      return MCDisassembler::Fail;
+
+  if (HasWriteback && // Writeback
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Alignment));
+  if (HasWriteback) {
+    if (OffsetReg == 0xD)
+      Inst.addOperand(MCOperand::createReg(0));
+    else if (!Check(Status, DecodeGPRRegisterClass(Inst, OffsetReg, Address,
+                                                   Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // The same register list is emitted a second time before the lane index.
+  for (unsigned I = 0; I != 2; ++I)
+    if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg + I * Stride,
+                                              Address, Decoder)))
+      return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Lane));
+
+  return Status;
+}
+
+// Decoder for the VST2LN (two D registers, single lane) store forms.
+//
+// The size field Insn{11-10} determines where the lane index, alignment and
+// register-spacing bits live inside Insn{7-4}; the encoding marked UNDEFINED
+// below, and size 3, fail to decode. The two registers are Dd and Dd+Stride,
+// where Stride is 1 or 2.
+//
+// Operand order: [Rn writeback], Rn, alignment, [Rm | reg 0], Dd, Dd+Stride,
+// lane. The bracketed operands exist only when Rm != 0xF; Rm == 0xD is
+// emitted as register 0.
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned VecReg = fieldFromInstruction(Insn, 12, 4) |
+                          (fieldFromInstruction(Insn, 22, 1) << 4);
+  const bool HasWriteback = OffsetReg != 0xF;
+
+  unsigned Alignment = 0;
+  unsigned Lane = 0;
+  unsigned Stride = 1;
+  switch (fieldFromInstruction(Insn, 10, 2)) {
+  case 0:
+    Lane = fieldFromInstruction(Insn, 5, 3);
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 2;
+    break;
+  case 1:
+    Lane = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 4;
+    if (fieldFromInstruction(Insn, 5, 1))
+      Stride = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 5, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 4, 1) != 0)
+      Alignment = 8;
+    if (fieldFromInstruction(Insn, 6, 1))
+      Stride = 2;
+    break;
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  if (HasWriteback && // Writeback
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Alignment));
+  if (HasWriteback) {
+    if (OffsetReg == 0xD)
+      Inst.addOperand(MCOperand::createReg(0));
+    else if (!Check(Status, DecodeGPRRegisterClass(Inst, OffsetReg, Address,
+                                                   Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Source register list.
+  for (unsigned I = 0; I != 2; ++I)
+    if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg + I * Stride,
+                                              Address, Decoder)))
+      return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Lane));
+
+  return Status;
+}
+
+
+// Decoder for the VLD3LN (three D registers, single lane) load forms.
+//
+// The size field Insn{11-10} determines where the lane index and
+// register-spacing bits live inside Insn{7-4}; encodings marked UNDEFINED
+// below, and size 3, fail to decode. The alignment operand is always 0. The
+// registers are Dd, Dd+Stride and Dd+2*Stride, where Stride is 1 or 2.
+//
+// Operand order: the three D registers, [Rn writeback], Rn, alignment,
+// [Rm | reg 0], the three D registers again, lane. The bracketed operands
+// exist only when Rm != 0xF; Rm == 0xD is emitted as register 0.
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned VecReg = fieldFromInstruction(Insn, 12, 4) |
+                          (fieldFromInstruction(Insn, 22, 1) << 4);
+  const bool HasWriteback = OffsetReg != 0xF;
+
+  const unsigned Alignment = 0;
+  unsigned Lane = 0;
+  unsigned Stride = 1;
+  switch (fieldFromInstruction(Insn, 10, 2)) {
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 5, 1))
+      Stride = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 4, 2))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 6, 1))
+      Stride = 2;
+    break;
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  // Destination register list.
+  for (unsigned I = 0; I != 3; ++I)
+    if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg + I * Stride,
+                                              Address, Decoder)))
+      return MCDisassembler::Fail;
+
+  if (HasWriteback && // Writeback
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Alignment));
+  if (HasWriteback) {
+    if (OffsetReg == 0xD)
+      Inst.addOperand(MCOperand::createReg(0));
+    else if (!Check(Status, DecodeGPRRegisterClass(Inst, OffsetReg, Address,
+                                                   Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // The same register list is emitted a second time before the lane index.
+  for (unsigned I = 0; I != 3; ++I)
+    if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg + I * Stride,
+                                              Address, Decoder)))
+      return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Lane));
+
+  return Status;
+}
+
+// Decoder for the VST3LN (three D registers, single lane) store forms.
+//
+// The size field Insn{11-10} determines where the lane index and
+// register-spacing bits live inside Insn{7-4}; encodings marked UNDEFINED
+// below, and size 3, fail to decode. The alignment operand is always 0. The
+// registers are Dd, Dd+Stride and Dd+2*Stride, where Stride is 1 or 2.
+//
+// Operand order: [Rn writeback], Rn, alignment, [Rm | reg 0], the three D
+// registers, lane. The bracketed operands exist only when Rm != 0xF;
+// Rm == 0xD is emitted as register 0.
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned VecReg = fieldFromInstruction(Insn, 12, 4) |
+                          (fieldFromInstruction(Insn, 22, 1) << 4);
+  const bool HasWriteback = OffsetReg != 0xF;
+
+  const unsigned Alignment = 0;
+  unsigned Lane = 0;
+  unsigned Stride = 1;
+  switch (fieldFromInstruction(Insn, 10, 2)) {
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 5, 1))
+      Stride = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 4, 2))
+      return MCDisassembler::Fail; // UNDEFINED
+    Lane = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 6, 1))
+      Stride = 2;
+    break;
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  if (HasWriteback && // Writeback
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Alignment));
+  if (HasWriteback) {
+    if (OffsetReg == 0xD)
+      Inst.addOperand(MCOperand::createReg(0));
+    else if (!Check(Status, DecodeGPRRegisterClass(Inst, OffsetReg, Address,
+                                                   Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Source register list.
+  for (unsigned I = 0; I != 3; ++I)
+    if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg + I * Stride,
+                                              Address, Decoder)))
+      return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Lane));
+
+  return Status;
+}
+
+
+// Decoder for the VLD4LN (four D registers, single lane) load forms.
+//
+// The size field Insn{11-10} determines where the lane index, alignment and
+// register-spacing bits live inside Insn{7-4}; size 3, and size 2 with
+// alignment bits 0b11, fail to decode. The registers are Dd + k*Stride for
+// k = 0..3, where Stride is 1 or 2.
+//
+// Operand order: the four D registers, [Rn writeback], Rn, alignment,
+// [Rm | reg 0], the four D registers again, lane. The bracketed operands
+// exist only when Rm != 0xF; Rm == 0xD is emitted as register 0.
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned VecReg = fieldFromInstruction(Insn, 12, 4) |
+                          (fieldFromInstruction(Insn, 22, 1) << 4);
+  const bool HasWriteback = OffsetReg != 0xF;
+
+  unsigned Alignment = 0;
+  unsigned Lane = 0;
+  unsigned Stride = 1;
+  switch (fieldFromInstruction(Insn, 10, 2)) {
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 4;
+    Lane = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 8;
+    Lane = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 5, 1))
+      Stride = 2;
+    break;
+  case 2: {
+    // 0b00 -> no alignment, 0b01 -> 8, 0b10 -> 16, 0b11 is rejected.
+    const unsigned AlignBits = fieldFromInstruction(Insn, 4, 2);
+    if (AlignBits == 3)
+      return MCDisassembler::Fail;
+    if (AlignBits != 0)
+      Alignment = 4 << AlignBits;
+
+    Lane = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 6, 1))
+      Stride = 2;
+    break;
+  }
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  // Destination register list.
+  for (unsigned I = 0; I != 4; ++I)
+    if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg + I * Stride,
+                                              Address, Decoder)))
+      return MCDisassembler::Fail;
+
+  if (HasWriteback && // Writeback
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Alignment));
+  if (HasWriteback) {
+    if (OffsetReg == 0xD)
+      Inst.addOperand(MCOperand::createReg(0));
+    else if (!Check(Status, DecodeGPRRegisterClass(Inst, OffsetReg, Address,
+                                                   Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // The same register list is emitted a second time before the lane index.
+  for (unsigned I = 0; I != 4; ++I)
+    if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg + I * Stride,
+                                              Address, Decoder)))
+      return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Lane));
+
+  return Status;
+}
+
+// Decoder for the VST4LN (four D registers, single lane) store forms.
+//
+// The size field Insn{11-10} determines where the lane index, alignment and
+// register-spacing bits live inside Insn{7-4}; size 3, and size 2 with
+// alignment bits 0b11, fail to decode. The registers are Dd + k*Stride for
+// k = 0..3, where Stride is 1 or 2.
+//
+// Operand order: [Rn writeback], Rn, alignment, [Rm | reg 0], the four D
+// registers, lane. The bracketed operands exist only when Rm != 0xF;
+// Rm == 0xD is emitted as register 0.
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  const unsigned BaseReg = fieldFromInstruction(Insn, 16, 4);
+  const unsigned OffsetReg = fieldFromInstruction(Insn, 0, 4);
+  const unsigned VecReg = fieldFromInstruction(Insn, 12, 4) |
+                          (fieldFromInstruction(Insn, 22, 1) << 4);
+  const bool HasWriteback = OffsetReg != 0xF;
+
+  unsigned Alignment = 0;
+  unsigned Lane = 0;
+  unsigned Stride = 1;
+  switch (fieldFromInstruction(Insn, 10, 2)) {
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 4;
+    Lane = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 4, 1))
+      Alignment = 8;
+    Lane = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 5, 1))
+      Stride = 2;
+    break;
+  case 2: {
+    // 0b00 -> no alignment, 0b01 -> 8, 0b10 -> 16, 0b11 is rejected.
+    const unsigned AlignBits = fieldFromInstruction(Insn, 4, 2);
+    if (AlignBits == 3)
+      return MCDisassembler::Fail;
+    if (AlignBits != 0)
+      Alignment = 4 << AlignBits;
+
+    Lane = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 6, 1))
+      Stride = 2;
+    break;
+  }
+  default:
+    return MCDisassembler::Fail;
+  }
+
+  if (HasWriteback && // Writeback
+      !Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, BaseReg, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Alignment));
+  if (HasWriteback) {
+    if (OffsetReg == 0xD)
+      Inst.addOperand(MCOperand::createReg(0));
+    else if (!Check(Status, DecodeGPRRegisterClass(Inst, OffsetReg, Address,
+                                                   Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Source register list.
+  for (unsigned I = 0; I != 4; ++I)
+    if (!Check(Status, DecodeDPRRegisterClass(Inst, VecReg + I * Stride,
+                                              Address, Decoder)))
+      return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Lane));
+
+  return Status;
+}
+
+// Decodes a VMOV from two core registers into two consecutive S registers.
+// Operands, in order: Sm, Sm+1, Rt, Rt2, predicate, where
+// Sm = Insn{3-0}:Insn{5}, Rt = Insn{15-12}, Rt2 = Insn{19-16} and the
+// predicate is Insn{31-28}. Rt or Rt2 equal to 0xF, or Sm equal to 0x1F, is
+// flagged SoftFail.
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  const unsigned CoreLo = fieldFromInstruction(Insn, 12, 4);
+  const unsigned CoreHi = fieldFromInstruction(Insn, 16, 4);
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  const unsigned FirstS = fieldFromInstruction(Insn, 5, 1) |
+                          (fieldFromInstruction(Insn, 0, 4) << 1);
+
+  DecodeStatus Status = MCDisassembler::Success;
+  if (CoreLo == 0xF || CoreHi == 0xF || FirstS == 0x1F)
+    Status = MCDisassembler::SoftFail;
+
+  if (!Check(Status, DecodeSPRRegisterClass(Inst, FirstS, Address, Decoder)) ||
+      !Check(Status,
+             DecodeSPRRegisterClass(Inst, FirstS + 1, Address, Decoder)) ||
+      !Check(Status, DecodeGPRRegisterClass(Inst, CoreLo, Address, Decoder)) ||
+      !Check(Status, DecodeGPRRegisterClass(Inst, CoreHi, Address, Decoder)) ||
+      !Check(Status, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Status;
+}
+
+// Decodes a VMOV from two consecutive S registers into two core registers.
+// Operands, in order: Rt, Rt2, Sm, Sm+1, predicate, where
+// Rt = Insn{15-12}, Rt2 = Insn{19-16}, Sm = Insn{3-0}:Insn{5} and the
+// predicate is Insn{31-28}. Rt or Rt2 equal to 0xF, or Sm equal to 0x1F, is
+// flagged SoftFail.
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  const unsigned CoreLo = fieldFromInstruction(Insn, 12, 4);
+  const unsigned CoreHi = fieldFromInstruction(Insn, 16, 4);
+  const unsigned Cond = fieldFromInstruction(Insn, 28, 4);
+  const unsigned FirstS = fieldFromInstruction(Insn, 5, 1) |
+                          (fieldFromInstruction(Insn, 0, 4) << 1);
+
+  DecodeStatus Status = MCDisassembler::Success;
+  if (CoreLo == 0xF || CoreHi == 0xF || FirstS == 0x1F)
+    Status = MCDisassembler::SoftFail;
+
+  if (!Check(Status, DecodeGPRRegisterClass(Inst, CoreLo, Address, Decoder)) ||
+      !Check(Status, DecodeGPRRegisterClass(Inst, CoreHi, Address, Decoder)) ||
+      !Check(Status, DecodeSPRRegisterClass(Inst, FirstS, Address, Decoder)) ||
+      !Check(Status,
+             DecodeSPRRegisterClass(Inst, FirstS + 1, Address, Decoder)) ||
+      !Check(Status, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return Status;
+}
+
+/// Decoder for the IT instruction: appends the condition (bits [7:4]) and the
+/// mask (bits [3:0]) as two immediate operands.
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  // An all-zero mask is rejected outright, before any operand is added.
+  const unsigned ITMask = fieldFromInstruction(Insn, 0, 4);
+  if (ITMask == 0x0)
+    return MCDisassembler::Fail;
+
+  // Condition 0xF is rewritten to 0xE and reported as a soft failure.
+  unsigned Cond = fieldFromInstruction(Insn, 4, 4);
+  const bool CondWasRewritten = Cond == 0xF;
+  if (CondWasRewritten)
+    Cond = 0xE;
+
+  Inst.addOperand(MCOperand::createImm(Cond));
+  Inst.addOperand(MCOperand::createImm(ITMask));
+  return CondWasRewritten ? MCDisassembler::SoftFail : MCDisassembler::Success;
+}
+
+/// Decoder for the Thumb2 LDRD form handled here. Operands are appended in
+/// the order: Rt, Rt2, writeback register (Rn), address.
+static DecodeStatus
+DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  const unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  const unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  const unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  const unsigned Imm8 = fieldFromInstruction(Insn, 0, 8);
+  const unsigned WBit = fieldFromInstruction(Insn, 21, 1);
+  const unsigned UBit = fieldFromInstruction(Insn, 23, 1);
+  const unsigned PBit = fieldFromInstruction(Insn, 24, 1);
+
+  // Value handed to DecodeT2AddrModeImm8s4: imm8 in bits [7:0], U in bit 8,
+  // Rn from bit 9 upwards.
+  const unsigned PackedAddr = Imm8 | (UBit << 8) | (Rn << 9);
+  const bool HasWriteback = WBit == 1 || PBit == 0;
+
+  DecodeStatus Result = MCDisassembler::Success;
+
+  // A written-back base that is also a destination, or two identical
+  // destinations, are flagged as soft failures.
+  if (HasWriteback && (Rn == Rt || Rn == Rt2))
+    Check(Result, MCDisassembler::SoftFail);
+  if (Rt == Rt2)
+    Check(Result, MCDisassembler::SoftFail);
+
+  // The && chain stops at the first operand that fails to decode.
+  const bool AllDecoded =
+      Check(Result, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)) &&
+      Check(Result, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)) &&
+      Check(Result, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)) &&
+      Check(Result, DecodeT2AddrModeImm8s4(Inst, PackedAddr, Address, Decoder));
+
+  return AllDecoded ? Result : MCDisassembler::Fail;
+}
+
+/// Decoder for the Thumb2 STRD form handled here. Operands are appended in
+/// the order: writeback register (Rn), Rt, Rt2, address.
+static DecodeStatus
+DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  const unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  const unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  const unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  const unsigned Imm8 = fieldFromInstruction(Insn, 0, 8);
+  const unsigned WBit = fieldFromInstruction(Insn, 21, 1);
+  const unsigned UBit = fieldFromInstruction(Insn, 23, 1);
+  const unsigned PBit = fieldFromInstruction(Insn, 24, 1);
+
+  // Value handed to DecodeT2AddrModeImm8s4: imm8 in bits [7:0], U in bit 8,
+  // Rn from bit 9 upwards.
+  const unsigned PackedAddr = Imm8 | (UBit << 8) | (Rn << 9);
+  const bool HasWriteback = WBit == 1 || PBit == 0;
+
+  DecodeStatus Result = MCDisassembler::Success;
+
+  // A written-back base that is also one of the stored registers is flagged
+  // as a soft failure.
+  if (HasWriteback && (Rn == Rt || Rn == Rt2))
+    Check(Result, MCDisassembler::SoftFail);
+
+  // The && chain stops at the first operand that fails to decode.
+  const bool AllDecoded =
+      Check(Result, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)) &&
+      Check(Result, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)) &&
+      Check(Result, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)) &&
+      Check(Result, DecodeT2AddrModeImm8s4(Inst, PackedAddr, Address, Decoder));
+
+  return AllDecoded ? Result : MCDisassembler::Fail;
+}
+
+/// Decoder for the Thumb2 ADR immediate: assembles a 13-bit value from the
+/// instruction fields and appends it, sign-extended, as one immediate operand.
+static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  // Bits 21 and 23 must agree; an encoding where they differ is rejected.
+  const unsigned SignBit = fieldFromInstruction(Insn, 21, 1);
+  if (SignBit != fieldFromInstruction(Insn, 23, 1))
+    return MCDisassembler::Fail;
+
+  // Layout of the assembled value: [7:0] <- insn[7:0], [10:8] <- insn[14:12],
+  // [11] <- insn[26], [12] <- the shared sign bit.
+  const unsigned Packed = fieldFromInstruction(Insn, 0, 8) |
+                          (fieldFromInstruction(Insn, 12, 3) << 8) |
+                          (fieldFromInstruction(Insn, 26, 1) << 11) |
+                          (SignBit << 12);
+  Inst.addOperand(MCOperand::createImm(SignExtend32<13>(Packed)));
+  return MCDisassembler::Success;
+}
+
+/// Appends \p Val as an immediate operand. The operand is added in every
+/// case; the returned status is Fail when \p Val is 0x20.
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(Val));
+
+  // Shift of "asr #32" is not allowed in Thumb2 mode.
+  return Val == 0x20 ? MCDisassembler::Fail : MCDisassembler::Success;
+}
+
+/// Decoder for the swap encoding. Operands are appended in the order:
+/// Rt, Rt2, Rn, predicate.
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  // With all four condition bits set the word is handed to the CPS decoder
+  // instead.
+  const unsigned CondCode = fieldFromInstruction(Insn, 28, 4);
+  if (CondCode == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  const unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  const unsigned Rt2 = fieldFromInstruction(Insn, 0, 4);
+  const unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+
+  // A base register that matches either data register is a soft failure.
+  DecodeStatus Result = (Rt == Rn || Rn == Rt2) ? MCDisassembler::SoftFail
+                                                : MCDisassembler::Success;
+
+  // The && chain stops at the first operand that fails to decode.
+  const bool AllDecoded =
+      Check(Result, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)) &&
+      Check(Result, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)) &&
+      Check(Result, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)) &&
+      Check(Result, DecodePredicateOperand(Inst, CondCode, Address, Decoder));
+
+  return AllDecoded ? Result : MCDisassembler::Fail;
+}
+
+/// Decoder for the D-register VCVT encoding. When the top three bits of the
+/// six-bit immediate are clear the word is instead decoded as a modified
+/// immediate instruction, after possibly overriding the opcode.
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  const FeatureBitset &Features =
+      ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+  const bool HasFullFP16 = Features[ARM::FeatureFullFP16];
+
+  // Register numbers are five bits wide: four low bits plus one high bit.
+  const unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0) |
+                      (fieldFromInstruction(Insn, 22, 1) << 4);
+  const unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0) |
+                      (fieldFromInstruction(Insn, 5, 1) << 4);
+  const unsigned Imm6 = fieldFromInstruction(Insn, 16, 6);
+  const unsigned CMode = fieldFromInstruction(Insn, 8, 4);
+  const unsigned OpBit = fieldFromInstruction(Insn, 5, 1);
+
+  // If the top 3 bits of imm are clear, this is a VMOV (immediate)
+  if ((Imm6 & 0x38) == 0) {
+    switch (CMode) {
+    case 0xF:
+      if (OpBit == 1)
+        return MCDisassembler::Fail;
+      Inst.setOpcode(ARM::VMOVv2f32);
+      break;
+    case 0xE:
+      // The remaining overrides only apply when FullFP16 is available.
+      if (HasFullFP16)
+        Inst.setOpcode(OpBit == 1 ? ARM::VMOVv1i64 : ARM::VMOVv8i8);
+      break;
+    case 0xD:
+    case 0xC:
+      if (HasFullFP16)
+        Inst.setOpcode(OpBit == 1 ? ARM::VMVNv2i32 : ARM::VMOVv2i32);
+      break;
+    default:
+      break;
+    }
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  // Outside the modified-immediate case, bit 5 of the immediate must be set.
+  if ((Imm6 & 0x20) == 0)
+    return MCDisassembler::Fail;
+
+  DecodeStatus Result = MCDisassembler::Success;
+  const bool RegsDecoded =
+      Check(Result, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)) &&
+      Check(Result, DecodeDPRRegisterClass(Inst, Vm, Address, Decoder));
+  if (!RegsDecoded)
+    return MCDisassembler::Fail;
+
+  // The operand stored is 64 minus the encoded immediate.
+  Inst.addOperand(MCOperand::createImm(64 - Imm6));
+  return Result;
+}
+
+/// Decoder for the Q-register VCVT encoding. When the top three bits of the
+/// six-bit immediate are clear the word is instead decoded as a modified
+/// immediate instruction, after possibly overriding the opcode.
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  const FeatureBitset &Features =
+      ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+  const bool HasFullFP16 = Features[ARM::FeatureFullFP16];
+
+  // Register numbers are five bits wide: four low bits plus one high bit.
+  const unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0) |
+                      (fieldFromInstruction(Insn, 22, 1) << 4);
+  const unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0) |
+                      (fieldFromInstruction(Insn, 5, 1) << 4);
+  const unsigned Imm6 = fieldFromInstruction(Insn, 16, 6);
+  const unsigned CMode = fieldFromInstruction(Insn, 8, 4);
+  const unsigned OpBit = fieldFromInstruction(Insn, 5, 1);
+
+  // If the top 3 bits of imm are clear, this is a VMOV (immediate)
+  if ((Imm6 & 0x38) == 0) {
+    switch (CMode) {
+    case 0xF:
+      if (OpBit == 1)
+        return MCDisassembler::Fail;
+      Inst.setOpcode(ARM::VMOVv4f32);
+      break;
+    case 0xE:
+      // The remaining overrides only apply when FullFP16 is available.
+      if (HasFullFP16)
+        Inst.setOpcode(OpBit == 1 ? ARM::VMOVv2i64 : ARM::VMOVv16i8);
+      break;
+    case 0xD:
+    case 0xC:
+      if (HasFullFP16)
+        Inst.setOpcode(OpBit == 1 ? ARM::VMVNv4i32 : ARM::VMOVv4i32);
+      break;
+    default:
+      break;
+    }
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  // Outside the modified-immediate case, bit 5 of the immediate must be set.
+  if ((Imm6 & 0x20) == 0)
+    return MCDisassembler::Fail;
+
+  DecodeStatus Result = MCDisassembler::Success;
+  const bool RegsDecoded =
+      Check(Result, DecodeQPRRegisterClass(Inst, Vd, Address, Decoder)) &&
+      Check(Result, DecodeQPRRegisterClass(Inst, Vm, Address, Decoder));
+  if (!RegsDecoded)
+    return MCDisassembler::Fail;
+
+  // The operand stored is 64 minus the encoded immediate.
+  Inst.addOperand(MCOperand::createImm(64 - Imm6));
+  return Result;
+}
+
+/// Decoder for the LDR form handled here. Operands are appended in the order:
+/// Rt, Rn, address (from Rn), post-index register, predicate.
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+                              uint64_t Address, const void *Decoder) {
+  const unsigned Rn = fieldFromInstruction(Val, 16, 4);
+  const unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  const unsigned CondCode = fieldFromInstruction(Val, 28, 4);
+  // Post-index operand: register number in bits [3:0], instruction bit 23
+  // carried along in bit 4.
+  const unsigned PostIdx =
+      fieldFromInstruction(Val, 0, 4) | (fieldFromInstruction(Val, 23, 1) << 4);
+
+  // Non-zero bits [11:8], or a base equal to the destination, are flagged as
+  // a soft failure.
+  DecodeStatus Result = MCDisassembler::Success;
+  if (fieldFromInstruction(Val, 8, 4) != 0 || Rn == Rt)
+    Result = MCDisassembler::SoftFail;
+
+  // The && chain stops at the first operand that fails to decode.
+  const bool AllDecoded =
+      Check(Result, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)) &&
+      Check(Result, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)) &&
+      Check(Result, DecodeAddrMode7Operand(Inst, Rn, Address, Decoder)) &&
+      Check(Result, DecodePostIdxReg(Inst, PostIdx, Address, Decoder)) &&
+      Check(Result, DecodePredicateOperand(Inst, CondCode, Address, Decoder));
+
+  return AllDecoded ? Result : MCDisassembler::Fail;
+}
+
+/// Shared decoder for MRRC2 and MCRR2. The two opcodes take the same fields
+/// but list their operands in a different order, so the opcode already set on
+/// \p Inst decides where the register pair is inserted.
+static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+                                            uint64_t Address, const void *Decoder) {
+  const unsigned CRm = fieldFromInstruction(Val, 0, 4);
+  const unsigned Opc1 = fieldFromInstruction(Val, 4, 4);
+  const unsigned Coproc = fieldFromInstruction(Val, 8, 4);
+  const unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  const unsigned Rt2 = fieldFromInstruction(Val, 16, 4);
+
+  // Coprocessor numbers 0xa and 0xb are rejected.
+  if ((Coproc & ~0x1) == 0xa)
+    return MCDisassembler::Fail;
+
+  // Identical transfer registers are flagged as a soft failure.
+  DecodeStatus Result =
+      Rt == Rt2 ? MCDisassembler::SoftFail : MCDisassembler::Success;
+
+  // Appends Rt then Rt2; yields false as soon as one of them fails to decode.
+  auto AppendRegPair = [&]() {
+    return Check(Result,
+                 DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)) &&
+           Check(Result,
+                 DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder));
+  };
+
+  // MRRC2 stores to two registers, so its tablegen description has two
+  // outputs and they come first: [Rt, Rt2, cop, opc1, CRm]. MCRR2 does not
+  // store to any register, so all of its operands are inputs and the order is
+  // [cop, opc1, Rt, Rt2, CRm].
+  const unsigned Opcode = Inst.getOpcode();
+
+  if (Opcode == ARM::MRRC2 && !AppendRegPair())
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Coproc));
+  Inst.addOperand(MCOperand::createImm(Opc1));
+
+  if (Opcode == ARM::MCRR2 && !AppendRegPair())
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(CRm));
+
+  return Result;
+}
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
new file mode 100644
index 000000000000..3667952d44c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -0,0 +1,1669 @@
+//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define PRINT_ALIAS_INSTR
+#include "ARMGenAsmWriter.inc"
+
+/// translateShiftImm - Map an encoded shift amount to the value to print.
+///
+/// The encoding is five bits wide and uses 0 to stand for 32; every other
+/// value is printed unchanged.
+static unsigned translateShiftImm(unsigned imm) {
+  // lsr #32 and asr #32 exist, but should be encoded as a 0.
+  assert((imm & ~0x1f) == 0 && "Invalid shift encoding");
+
+  return imm == 0 ? 32 : imm;
+}
+
+/// Emit ", <shift-op> #<amount>" for a register shifted by an immediate.
+/// Nothing is printed for no_shift or for lsl by zero, and rrx is printed
+/// without an amount.
+static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc,
+                             unsigned ShImm, bool UseMarkup) {
+  const bool IsIdentityShift =
+      ShOpc == ARM_AM::no_shift || (ShOpc == ARM_AM::lsl && ShImm == 0);
+  if (IsIdentityShift)
+    return;
+
+  O << ", ";
+
+  assert(!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0");
+  O << getShiftOpcStr(ShOpc);
+
+  // rrx takes no shift amount.
+  if (ShOpc == ARM_AM::rrx)
+    return;
+
+  O << " ";
+  if (UseMarkup)
+    O << "<imm:";
+  O << "#" << translateShiftImm(ShImm);
+  if (UseMarkup)
+    O << ">";
+}
+
+ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {} // Forwards all three arguments to the MCInstPrinter base; the body is empty.
+
+/// Write the name of register \p RegNo to \p OS, wrapped in the "<reg:" ...
+/// ">" markup strings.
+void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << markup("<reg:");
+  OS << getRegisterName(RegNo);
+  OS << markup(">");
+}
+
+/// Print \p MI to \p O, followed by the annotation \p Annot.
+///
+/// A handful of opcodes get a hand-written spelling here: shift mnemonics for
+/// MOVsr/MOVsi, push/pop and vpush/vpop for SP-based multiple and single
+/// transfers, ldm for tLDMIA, and a GPRPair re-packing step for the exclusive
+/// doubleword loads/stores. Every other instruction is offered to the
+/// generated alias printer first and falls back to printInstruction.
+/// The annotation is emitted on every path.
+void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot, const MCSubtargetInfo &STI) {
+  unsigned Opcode = MI->getOpcode();
+
+  switch (Opcode) {
+
+  // Check for MOVs and print canonical forms, instead.
+  case ARM::MOVsr: {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+    const MCOperand &MO3 = MI->getOperand(3);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()));
+    printSBitModifierOperand(MI, 6, STI, O);
+    printPredicateOperand(MI, 4, STI, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    O << ", ";
+    printRegName(O, MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  case ARM::MOVsi: {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm()));
+    printSBitModifierOperand(MI, 5, STI, O);
+    printPredicateOperand(MI, 3, STI, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    // rrx has no shift amount to print.
+    if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) {
+      printAnnotation(O, Annot);
+      return;
+    }
+
+    O << ", " << markup("<imm:") << "#"
+      << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm())) << markup(">");
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // A8.6.123 PUSH
+  case ARM::STMDB_UPD:
+  case ARM::t2STMDB_UPD:
+    // Should only print PUSH if there are at least two registers in the list.
+    if (MI->getOperand(0).getReg() != ARM::SP || MI->getNumOperands() <= 5)
+      break;
+    O << '\t' << "push";
+    printPredicateOperand(MI, 2, STI, O);
+    if (Opcode == ARM::t2STMDB_UPD)
+      O << ".w";
+    O << '\t';
+    printRegisterList(MI, 4, STI, O);
+    printAnnotation(O, Annot);
+    return;
+
+  case ARM::STR_PRE_IMM:
+    // A single-register push: base is SP and the immediate is -4.
+    if (MI->getOperand(2).getReg() != ARM::SP ||
+        MI->getOperand(3).getImm() != -4)
+      break;
+    O << '\t' << "push";
+    printPredicateOperand(MI, 4, STI, O);
+    O << "\t{";
+    printRegName(O, MI->getOperand(1).getReg());
+    O << "}";
+    printAnnotation(O, Annot);
+    return;
+
+  // A8.6.122 POP
+  case ARM::LDMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+    // Should only print POP if there are at least two registers in the list.
+    if (MI->getOperand(0).getReg() != ARM::SP || MI->getNumOperands() <= 5)
+      break;
+    O << '\t' << "pop";
+    printPredicateOperand(MI, 2, STI, O);
+    if (Opcode == ARM::t2LDMIA_UPD)
+      O << ".w";
+    O << '\t';
+    printRegisterList(MI, 4, STI, O);
+    printAnnotation(O, Annot);
+    return;
+
+  case ARM::LDR_POST_IMM:
+    // A single-register pop: base is SP and the immediate is 4.
+    if (MI->getOperand(2).getReg() != ARM::SP ||
+        MI->getOperand(4).getImm() != 4)
+      break;
+    O << '\t' << "pop";
+    printPredicateOperand(MI, 5, STI, O);
+    O << "\t{";
+    printRegName(O, MI->getOperand(0).getReg());
+    O << "}";
+    printAnnotation(O, Annot);
+    return;
+
+  // A8.6.355 VPUSH
+  case ARM::VSTMSDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    if (MI->getOperand(0).getReg() != ARM::SP)
+      break;
+    O << '\t' << "vpush";
+    printPredicateOperand(MI, 2, STI, O);
+    O << '\t';
+    printRegisterList(MI, 4, STI, O);
+    printAnnotation(O, Annot);
+    return;
+
+  // A8.6.354 VPOP
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMDIA_UPD:
+    if (MI->getOperand(0).getReg() != ARM::SP)
+      break;
+    O << '\t' << "vpop";
+    printPredicateOperand(MI, 2, STI, O);
+    O << '\t';
+    printRegisterList(MI, 4, STI, O);
+    printAnnotation(O, Annot);
+    return;
+
+  case ARM::tLDMIA: {
+    // The "!" is printed only when the base register is not also in the
+    // register list (operands 3 onwards).
+    bool Writeback = true;
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg)
+        Writeback = false;
+    }
+
+    O << "\tldm";
+
+    printPredicateOperand(MI, 1, STI, O);
+    O << '\t';
+    printRegName(O, BaseReg);
+    if (Writeback)
+      O << "!";
+    O << ", ";
+    printRegisterList(MI, 3, STI, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Combine 2 GPRs from disassembler into a GPRPair to match with instr def.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when decoding them, the two GPRs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  case ARM::LDREXD:
+  case ARM::STREXD:
+  case ARM::LDAEXD:
+  case ARM::STLEXD: {
+    const MCRegisterClass &MRC = MRI.getRegClass(ARM::GPRRegClassID);
+    bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
+    unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
+    if (MRC.contains(Reg)) {
+      MCInst NewMI;
+      MCOperand NewReg;
+      NewMI.setOpcode(Opcode);
+
+      // Stores keep their leading operand 0 ahead of the pair.
+      if (isStore)
+        NewMI.addOperand(MI->getOperand(0));
+      NewReg = MCOperand::createReg(MRI.getMatchingSuperReg(
+          Reg, ARM::gsub_0, &MRI.getRegClass(ARM::GPRPairRegClassID)));
+      NewMI.addOperand(NewReg);
+
+      // Copy the rest operands into NewMI.
+      for (unsigned i = isStore ? 3 : 2; i < MI->getNumOperands(); ++i)
+        NewMI.addOperand(MI->getOperand(i));
+      printInstruction(&NewMI, STI, O);
+      // Emit the annotation here too, as every other early return does;
+      // without it the annotation would be dropped for these opcodes.
+      printAnnotation(O, Annot);
+      return;
+    }
+    break;
+  }
+  }
+
+  if (!printAliasInstr(MI, STI, O))
+    printInstruction(MI, STI, O);
+
+  printAnnotation(O, Annot);
+}
+
+/// Print operand \p OpNo of \p MI: a register by name, an immediate with a
+/// leading '#', or an expression in a form that depends on its kind.
+void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+    return;
+  }
+
+  if (Op.isImm()) {
+    O << markup("<imm:") << '#' << formatImm(Op.getImm()) << markup(">");
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  const MCExpr *Expr = Op.getExpr();
+  switch (Expr->getKind()) {
+  case MCExpr::Binary:
+    O << '#';
+    Expr->print(O, &MAI);
+    break;
+  case MCExpr::Constant: {
+    // If a symbolic branch target was added as a constant expression then
+    // print that address in hex. And only print 32 unsigned bits for the
+    // address.
+    const MCConstantExpr *Constant = cast<MCConstantExpr>(Expr);
+    int64_t TargetAddress;
+    if (Constant->evaluateAsAbsolute(TargetAddress)) {
+      O << "0x";
+      O.write_hex(static_cast<uint32_t>(TargetAddress));
+    } else {
+      O << '#';
+      Expr->print(O, &MAI);
+    }
+    break;
+  }
+  default:
+    // FIXME: Should we always treat this as if it is a constant literal and
+    // prefix it with '#'?
+    Expr->print(O, &MAI);
+    break;
+  }
+}
+
+/// Print a pc-relative load operand: an expression is printed as-is, an
+/// immediate is printed as "[pc, #<offset>]".
+void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &Label = MI->getOperand(OpNum);
+  if (Label.isExpr()) {
+    Label.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  O << markup("<mem:") << "[pc, ";
+
+  int32_t OffImm = (int32_t)Label.getImm();
+  // The sign is captured before the special value is normalised, so
+  // INT32_MIN prints as "#-0".
+  const bool IsNegative = OffImm < 0;
+
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+
+  O << markup("<imm:") << (IsNegative ? "#-" : "#")
+    << formatImm(IsNegative ? -OffImm : OffImm) << markup(">");
+  O << "]" << markup(">");
+}
+
+// Prints a register-shifted-register operand from the A5.1 "Addressing Mode 1
+// - Data-processing operands" family. Three consecutive MCInst operands are
+// read starting at OpNum: the base register, the shift-amount register and
+// the shift-opcode immediate.
+//   e.g. R5, ROR R3
+// rrx is printed without a shift register.
+void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &BaseOp = MI->getOperand(OpNum);
+  const MCOperand &ShiftRegOp = MI->getOperand(OpNum + 1);
+  const MCOperand &ShiftOpcOp = MI->getOperand(OpNum + 2);
+
+  printRegName(O, BaseOp.getReg());
+
+  // Print the shift opc.
+  const ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(ShiftOpcOp.getImm());
+  O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
+
+  if (ShOpc != ARM_AM::rrx) {
+    O << ' ';
+    printRegName(O, ShiftRegOp.getReg());
+    assert(ARM_AM::getSORegOffset(ShiftOpcOp.getImm()) == 0);
+  }
+}
+
+/// Print a register shifted by an immediate. Two consecutive operands are
+/// read starting at \p OpNum: the register and the packed shift immediate.
+void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  const MCOperand &ShiftOp = MI->getOperand(OpNum + 1);
+
+  printRegName(O, RegOp.getReg());
+
+  // Print the shift opc.
+  const ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(ShiftOp.getImm());
+  const unsigned ShAmount = ARM_AM::getSORegOffset(ShiftOp.getImm());
+  printRegImmShift(O, ShOpc, ShAmount, UseMarkup);
+}
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #2
+//===--------------------------------------------------------------------===//
+
+/// Print an addressing-mode-2 memory operand in bracketed form. Three
+/// operands are read starting at \p Op: base register, offset register
+/// (0 when the offset is an immediate) and the packed AM2 immediate.
+void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  const MCOperand &BaseOp = MI->getOperand(Op);
+  const MCOperand &OffsetRegOp = MI->getOperand(Op + 1);
+  const MCOperand &AM2Op = MI->getOperand(Op + 2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, BaseOp.getReg());
+
+  if (OffsetRegOp.getReg()) {
+    // Register offset, optionally shifted by an immediate.
+    O << ", ";
+    O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(AM2Op.getImm()));
+    printRegName(O, OffsetRegOp.getReg());
+
+    printRegImmShift(O, ARM_AM::getAM2ShiftOpc(AM2Op.getImm()),
+                     ARM_AM::getAM2Offset(AM2Op.getImm()), UseMarkup);
+  } else if (ARM_AM::getAM2Offset(AM2Op.getImm())) { // Don't print +0.
+    O << ", " << markup("<imm:") << "#"
+      << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(AM2Op.getImm()))
+      << ARM_AM::getAM2Offset(AM2Op.getImm()) << markup(">");
+  }
+
+  O << "]" << markup(">");
+}
+
+/// Print the operand pair at \p Op as "[<reg>, <reg>]".
+void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const unsigned BaseReg = MI->getOperand(Op).getReg();
+  const unsigned IndexReg = MI->getOperand(Op + 1).getReg();
+
+  O << markup("<mem:") << "[";
+  printRegName(O, BaseReg);
+  O << ", ";
+  printRegName(O, IndexReg);
+  O << "]" << markup(">");
+}
+
+/// Print the operand pair at \p Op as "[<reg>, <reg>, lsl #1]".
+void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const unsigned BaseReg = MI->getOperand(Op).getReg();
+  const unsigned IndexReg = MI->getOperand(Op + 1).getReg();
+
+  O << markup("<mem:") << "[";
+  printRegName(O, BaseReg);
+  O << ", ";
+  printRegName(O, IndexReg);
+  O << ", lsl " << markup("<imm:") << "#1" << markup(">");
+  O << "]" << markup(">");
+}
+
+/// Print an addressing-mode-2 operand. A non-register first operand is
+/// printed through printOperand; otherwise the pre/offset-index printer is
+/// used.
+void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  // FIXME: This is for CP entries, but isn't right.
+  if (!MI->getOperand(Op).isReg()) {
+    printOperand(MI, Op, STI, O);
+    return;
+  }
+
+  // Post-indexed forms are not expected here.
+  assert(ARM_AM::getAM2IdxMode(MI->getOperand(Op + 2).getImm()) !=
+             ARMII::IndexModePost &&
+         "Should be pre or offset index op");
+
+  printAM2PreOrOffsetIndexOp(MI, Op, STI, O);
+}
+
+/// Print the offset part of an addressing-mode-2 operand: either a signed
+/// register with an optional immediate shift, or a signed immediate.
+void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &OffsetRegOp = MI->getOperand(OpNum);
+  const MCOperand &AM2Op = MI->getOperand(OpNum + 1);
+
+  if (OffsetRegOp.getReg()) {
+    O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(AM2Op.getImm()));
+    printRegName(O, OffsetRegOp.getReg());
+
+    printRegImmShift(O, ARM_AM::getAM2ShiftOpc(AM2Op.getImm()),
+                     ARM_AM::getAM2Offset(AM2Op.getImm()), UseMarkup);
+    return;
+  }
+
+  // No offset register: the offset is an immediate.
+  const unsigned Magnitude = ARM_AM::getAM2Offset(AM2Op.getImm());
+  O << markup("<imm:") << '#'
+    << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(AM2Op.getImm())) << Magnitude
+    << markup(">");
+}
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #3
+//===--------------------------------------------------------------------===//
+
+/// Print an addressing-mode-3 memory operand in bracketed form. Three
+/// operands are read starting at \p Op: base register, offset register
+/// (0 when the offset is an immediate) and the packed AM3 immediate.
+/// \p AlwaysPrintImm0 forces a zero immediate offset to be printed.
+void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                raw_ostream &O,
+                                                bool AlwaysPrintImm0) {
+  const MCOperand &BaseOp = MI->getOperand(Op);
+  const MCOperand &OffsetRegOp = MI->getOperand(Op + 1);
+  const MCOperand &AM3Op = MI->getOperand(Op + 2);
+
+  O << markup("<mem:") << '[';
+  printRegName(O, BaseOp.getReg());
+
+  if (OffsetRegOp.getReg()) {
+    O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(AM3Op.getImm()));
+    printRegName(O, OffsetRegOp.getReg());
+  } else {
+    // If the op is sub we have to print the immediate even if it is 0
+    const unsigned Magnitude = ARM_AM::getAM3Offset(AM3Op.getImm());
+    const ARM_AM::AddrOpc Direction = ARM_AM::getAM3Op(AM3Op.getImm());
+    const bool ShowImm =
+        AlwaysPrintImm0 || Magnitude != 0 || Direction == ARM_AM::sub;
+
+    if (ShowImm) {
+      O << ", " << markup("<imm:") << "#" << ARM_AM::getAddrOpcStr(Direction)
+        << Magnitude << markup(">");
+    }
+  }
+
+  O << ']' << markup(">");
+}
+
+/// Print an addressing-mode-3 operand. A non-register first operand is
+/// printed through printOperand; otherwise the pre/offset-index printer is
+/// used, passing the template flag through.
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  if (MI->getOperand(Op).isReg()) {
+    assert(ARM_AM::getAM3IdxMode(MI->getOperand(Op + 2).getImm()) !=
+               ARMII::IndexModePost &&
+           "unexpected idxmode");
+    printAM3PreOrOffsetIndexOp(MI, Op, O, AlwaysPrintImm0);
+  } else {
+    // For label symbolic references.
+    printOperand(MI, Op, STI, O);
+  }
+}
+
+/// Print the offset part of an addressing-mode-3 operand: either a signed
+/// register or a signed immediate.
+void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &OffsetRegOp = MI->getOperand(OpNum);
+  const MCOperand &AM3Op = MI->getOperand(OpNum + 1);
+
+  if (!OffsetRegOp.getReg()) {
+    // No offset register: the offset is an immediate.
+    const unsigned Magnitude = ARM_AM::getAM3Offset(AM3Op.getImm());
+    O << markup("<imm:") << '#'
+      << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(AM3Op.getImm())) << Magnitude
+      << markup(">");
+    return;
+  }
+
+  O << getAddrOpcStr(ARM_AM::getAM3Op(AM3Op.getImm()));
+  printRegName(O, OffsetRegOp.getReg());
+}
+
+/// Print a post-index immediate: the low eight bits are the magnitude and
+/// bit 8 set means the value is printed without a minus sign.
+void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
+                                             const MCSubtargetInfo &STI,
+                                             raw_ostream &O) {
+  const unsigned Encoded = MI->getOperand(OpNum).getImm();
+  const char *Sign = (Encoded & 256) ? "" : "-";
+  const unsigned Magnitude = Encoded & 0xff;
+
+  O << markup("<imm:") << '#' << Sign << Magnitude << markup(">");
+}
+
+/// Print a post-index register. The operand after the register is a flag:
+/// when it is zero the register is prefixed with '-'.
+void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  const MCOperand &AddFlagOp = MI->getOperand(OpNum + 1);
+
+  const char *Sign = AddFlagOp.getImm() ? "" : "-";
+  O << Sign;
+  printRegName(O, RegOp.getReg());
+}
+
+/// Print a post-index immediate scaled by four: the low eight bits, shifted
+/// left by two, are the magnitude and bit 8 set means no minus sign.
+void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const unsigned Encoded = MI->getOperand(OpNum).getImm();
+  const char *Sign = (Encoded & 256) ? "" : "-";
+  const unsigned Magnitude = (Encoded & 0xff) << 2;
+
+  O << markup("<imm:") << '#' << Sign << Magnitude << markup(">");
+}
+
+/// Print the addressing-mode-4 submode string for the immediate at \p OpNum.
+void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  O << ARM_AM::getAMSubModeStr(
+      ARM_AM::getAM4SubMode(MI->getOperand(OpNum).getImm()));
+}
+
+/// Print an addressing-mode-5 operand as "[<reg>{, #<offset>}]". The encoded
+/// offset is multiplied by four for printing; it is omitted when zero unless
+/// the direction is sub or the template flag asks for it.
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &BaseOp = MI->getOperand(OpNum);
+  const MCOperand &AM5Op = MI->getOperand(OpNum + 1);
+
+  // FIXME: This is for CP entries, but isn't right.
+  if (!BaseOp.isReg()) {
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, BaseOp.getReg());
+
+  const unsigned Magnitude = ARM_AM::getAM5Offset(AM5Op.getImm());
+  const ARM_AM::AddrOpc Direction = ARM_AM::getAM5Op(AM5Op.getImm());
+  const bool ShowImm =
+      AlwaysPrintImm0 || Magnitude != 0 || Direction == ARM_AM::sub;
+  if (ShowImm) {
+    O << ", " << markup("<imm:") << "#" << ARM_AM::getAddrOpcStr(Direction)
+      << Magnitude * 4 << markup(">");
+  }
+
+  O << "]" << markup(">");
+}
+
+/// Print the FP16 variant of an addressing-mode-5 operand as
+/// "[<reg>{, #<offset>}]". The encoded offset is multiplied by two for
+/// printing; it is omitted when zero unless the direction is sub or the
+/// template flag asks for it.
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &BaseOp = MI->getOperand(OpNum);
+  const MCOperand &AM5Op = MI->getOperand(OpNum + 1);
+
+  // FIXME: This is for CP entries, but isn't right.
+  if (!BaseOp.isReg()) {
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, BaseOp.getReg());
+
+  const unsigned Magnitude = ARM_AM::getAM5FP16Offset(AM5Op.getImm());
+  const ARM_AM::AddrOpc Direction = ARM_AM::getAM5FP16Op(AM5Op.getImm());
+  const bool ShowImm =
+      AlwaysPrintImm0 || Magnitude != 0 || Direction == ARM_AM::sub;
+  if (ShowImm) {
+    O << ", " << markup("<imm:") << "#" << ARM_AM::getAddrOpcStr(Direction)
+      << Magnitude * 2 << markup(">");
+  }
+
+  O << "]" << markup(">");
+}
+
+/// Print an addressing-mode-6 operand as "[<reg>{:<n>}]", where <n> is the
+/// second operand's immediate shifted left by three and is omitted when that
+/// immediate is zero.
+void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &BaseOp = MI->getOperand(OpNum);
+  const int64_t AlignImm = MI->getOperand(OpNum + 1).getImm();
+
+  O << markup("<mem:") << "[";
+  printRegName(O, BaseOp.getReg());
+  if (AlignImm != 0)
+    O << ":" << (AlignImm << 3);
+  O << "]" << markup(">");
+}
+
+/// Print the register operand at \p OpNum as "[<reg>]".
+void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const unsigned BaseReg = MI->getOperand(OpNum).getReg();
+
+  O << markup("<mem:") << "[";
+  printRegName(O, BaseReg);
+  O << "]" << markup(">");
+}
+
+/// Print the trailing part of an addressing-mode-6 operand: "!" when the
+/// register operand is 0, otherwise ", <reg>".
+void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const unsigned OffsetReg = MI->getOperand(OpNum).getReg();
+  if (OffsetReg != 0) {
+    O << ", ";
+    printRegName(O, OffsetReg);
+    return;
+  }
+  O << "!";
+}
+
+/// Print an inverted bitfield mask as "#<lsb>, #<width>". The immediate is
+/// complemented; lsb is the number of trailing zero bits of the result and
+/// width is the distance from there to its highest set bit.
+void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  // Check the operand kind before reading its immediate, not afterwards.
+  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
+  uint32_t v = ~MO.getImm();
+  int32_t lsb = countTrailingZeros(v);
+  int32_t width = (32 - countLeadingZeros(v)) - lsb;
+  O << markup("<imm:") << '#' << lsb << markup(">") << ", " << markup("<imm:")
+    << '#' << width << markup(">");
+}
+
+/// Print the memory-barrier option name for the immediate at \p OpNum. The
+/// HasV8Ops feature bit is passed along to the name lookup.
+void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const unsigned BarrierOpt = MI->getOperand(OpNum).getImm();
+  const bool HasV8 = STI.getFeatureBits()[ARM::HasV8Ops];
+  O << ARM_MB::MemBOptToString(BarrierOpt, HasV8);
+}
+
+/// Print the instruction-sync-barrier option name for the immediate at
+/// \p OpNum.
+void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const unsigned BarrierOpt = MI->getOperand(OpNum).getImm();
+  O << ARM_ISB::InstSyncBOptToString(BarrierOpt);
+}
+
+// Prints ", asr #N" or ", lsl #N" from an encoded shift immediate.
+// Bit 5 selects asr; the low five bits are the amount. An asr amount of 0 is
+// printed as 32, and an lsl amount of 0 prints nothing at all.
+void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const unsigned Encoded = MI->getOperand(OpNum).getImm();
+  const unsigned Amount = Encoded & 0x1f;
+
+  if ((Encoded & (1 << 5)) == 0) {
+    if (Amount != 0)
+      O << ", lsl " << markup("<imm:") << "#" << Amount << markup(">");
+    return;
+  }
+
+  const unsigned Shown = (Amount == 0 ? 32 : Amount);
+  O << ", asr " << markup("<imm:") << "#" << Shown << markup(">");
+}
+
+// Prints ", lsl #<imm>" for a non-zero shift immediate; a zero immediate
+// produces no output.
+void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  const unsigned Shift = MI->getOperand(OpNum).getImm();
+  if (Shift != 0) {
+    assert(Shift > 0 && Shift < 32 && "Invalid PKH shift immediate value!");
+    O << ", lsl " << markup("<imm:") << "#" << Shift << markup(">");
+  }
+}
+
+// Prints ", asr #<imm>". An encoded value of 0 stands for a shift of 32.
+void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  const unsigned Encoded = MI->getOperand(OpNum).getImm();
+  const unsigned Shift = (Encoded == 0) ? 32 : Encoded;
+  assert(Shift > 0 && Shift <= 32 && "Invalid PKH shift immediate value!");
+  O << ", asr ";
+  O << markup("<imm:") << "#" << Shift << markup(">");
+}
+
+// Prints every operand from OpNum to the end of the instruction as a
+// brace-enclosed, comma-separated register list. Asserts that those operands
+// are ordered by register encoding value.
+void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  assert(std::is_sorted(MI->begin() + OpNum, MI->end(),
+                        [&](const MCOperand &LHS, const MCOperand &RHS) {
+                          return MRI.getEncodingValue(LHS.getReg()) <
+                                 MRI.getEncodingValue(RHS.getReg());
+                        }));
+
+  O << "{";
+  // The separator is empty for the first register and ", " afterwards.
+  const char *Separator = "";
+  const unsigned NumOps = MI->getNumOperands();
+  for (unsigned Idx = OpNum; Idx != NumOps; ++Idx) {
+    O << Separator;
+    printRegName(O, MI->getOperand(Idx).getReg());
+    Separator = ", ";
+  }
+  O << "}";
+}
+
+// Prints the gsub_0 and gsub_1 sub-registers of the operand's register,
+// separated by ", ".
+void ARMInstPrinter::printGPRPairOperand(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  const unsigned PairReg = MI->getOperand(OpNum).getReg();
+  const unsigned First = MRI.getSubReg(PairReg, ARM::gsub_0);
+  const unsigned Second = MRI.getSubReg(PairReg, ARM::gsub_1);
+  printRegName(O, First);
+  O << ", ";
+  printRegName(O, Second);
+}
+
+// Prints "be" for a non-zero immediate and "le" for zero.
+void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const bool NonZero = MI->getOperand(OpNum).getImm() != 0;
+  O << (NonZero ? "be" : "le");
+}
+
+// Prints the string that ARM_PROC::IModToString yields for the operand's
+// immediate.
+void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  const auto Mode = MI->getOperand(OpNum).getImm();
+  O << ARM_PROC::IModToString(Mode);
+}
+
+// Prints one flag string per set bit among bits 2..0 of the immediate, from
+// the highest bit down, or "none" when the whole immediate is zero.
+void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  const unsigned Flags = MI->getOperand(OpNum).getImm();
+
+  // Walk the single-bit masks 4, 2, 1 in that order.
+  for (unsigned Bit = 1u << 2; Bit != 0; Bit >>= 1) {
+    if ((Flags & Bit) != 0)
+      O << ARM_PROC::IFlagsToString(Bit);
+  }
+
+  if (!Flags)
+    O << "none";
+}
+
+void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) { // Prints the textual name for an encoded special-register / mask immediate.
+ const MCOperand &Op = MI->getOperand(OpNum);
+ unsigned SpecRegRBit = Op.getImm() >> 4; // bits above the low four; non-zero selects "SPSR" below
+ unsigned Mask = Op.getImm() & 0xf; // low four bits; each selects one of the f/s/x/c letters below
+ const FeatureBitset &FeatureBits = STI.getFeatureBits();
+
+ if (FeatureBits[ARM::FeatureMClass]) { // M-class path: every case returns or hits llvm_unreachable
+ unsigned SYSm = Op.getImm(); // here the whole immediate is interpreted as SYSm
+ unsigned Opcode = MI->getOpcode();
+
+ // For writes (t2MSR_M), handle the extended mask bits (values above 8 bits)
+ // if the DSP extension is present. Unlisted values fall through.
+ if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSP]) {
+ switch (SYSm) {
+ case 0x400:
+ O << "apsr_g";
+ return;
+ case 0xc00:
+ O << "apsr_nzcvqg";
+ return;
+ case 0x401:
+ O << "iapsr_g";
+ return;
+ case 0xc01:
+ O << "iapsr_nzcvqg";
+ return;
+ case 0x402:
+ O << "eapsr_g";
+ return;
+ case 0xc02:
+ O << "eapsr_nzcvqg";
+ return;
+ case 0x403:
+ O << "xpsr_g";
+ return;
+ case 0xc03:
+ O << "xpsr_nzcvqg";
+ return;
+ }
+ }
+
+ // Handle the basic 8-bit mask: everything above the low byte is dropped.
+ SYSm &= 0xff;
+
+ if (Opcode == ARM::t2MSR_M && FeatureBits [ARM::HasV7Ops]) {
+ // ARMv7-M deprecates using MSR APSR without a _<bits> qualifier as an
+ // alias for MSR APSR_nzcvq, so the qualified names are printed here.
+ switch (SYSm) {
+ case 0:
+ O << "apsr_nzcvq";
+ return;
+ case 1:
+ O << "iapsr_nzcvq";
+ return;
+ case 2:
+ O << "eapsr_nzcvq";
+ return;
+ case 3:
+ O << "xpsr_nzcvq";
+ return;
+ }
+ }
+
+ switch (SYSm) {
+ default:
+ llvm_unreachable("Unexpected mask value!"); // any SYSm without a case below ends up here
+ case 0:
+ O << "apsr";
+ return;
+ case 1:
+ O << "iapsr";
+ return;
+ case 2:
+ O << "eapsr";
+ return;
+ case 3:
+ O << "xpsr";
+ return;
+ case 5:
+ O << "ipsr";
+ return;
+ case 6:
+ O << "epsr";
+ return;
+ case 7:
+ O << "iepsr";
+ return;
+ case 8:
+ O << "msp";
+ return;
+ case 9:
+ O << "psp";
+ return;
+ case 16:
+ O << "primask";
+ return;
+ case 17:
+ O << "basepri";
+ return;
+ case 18:
+ O << "basepri_max";
+ return;
+ case 19:
+ O << "faultmask";
+ return;
+ case 20:
+ O << "control";
+ return;
+ case 10:
+ O << "msplim";
+ return;
+ case 11:
+ O << "psplim";
+ return;
+ case 0x88:
+ O << "msp_ns";
+ return;
+ case 0x89:
+ O << "psp_ns";
+ return;
+ case 0x8a:
+ O << "msplim_ns";
+ return;
+ case 0x8b:
+ O << "psplim_ns";
+ return;
+ case 0x90:
+ O << "primask_ns";
+ return;
+ case 0x91:
+ O << "basepri_ns";
+ return;
+ case 0x92:
+ O << "basepri_max_ns";
+ return;
+ case 0x93:
+ O << "faultmask_ns";
+ return;
+ case 0x94:
+ O << "control_ns";
+ return;
+ case 0x98:
+ O << "sp_ns";
+ return;
+ }
+ }
+
+ // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as
+ // APSR_nzcvq, APSR_g and APSR_nzcvqg, respectively.
+ if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) {
+ O << "APSR_";
+ switch (Mask) {
+ default:
+ llvm_unreachable("Unexpected mask value!"); // excluded by the enclosing condition
+ case 4:
+ O << "g";
+ return;
+ case 8:
+ O << "nzcvq";
+ return;
+ case 12:
+ O << "nzcvqg";
+ return;
+ }
+ }
+
+ if (SpecRegRBit)
+ O << "SPSR";
+ else
+ O << "CPSR";
+
+ if (Mask) { // a zero mask prints the bare register name with no suffix
+ O << '_';
+ if (Mask & 8)
+ O << 'f';
+ if (Mask & 4)
+ O << 's';
+ if (Mask & 2)
+ O << 'x';
+ if (Mask & 1)
+ O << 'c';
+ }
+}
+
+// Prints the name of a banked register from its encoded immediate.
+//
+// Bit 5 of the immediate (R) selects the "SPSR_<mode>" names; the low five
+// bits (SysM) select the mode or, when R is clear, index the register-name
+// table below.
+void ARMInstPrinter::printBankedRegOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  uint32_t Banked = MI->getOperand(OpNum).getImm();
+  uint32_t R = (Banked & 0x20) >> 5;
+  uint32_t SysM = Banked & 0x1f;
+
+  // Nothing much we can do about this, the encodings are specified in B9.2.3 of
+  // the ARM ARM v7C, and are all over the shop.
+  if (R) {
+    O << "SPSR_";
+
+    switch (SysM) {
+    case 0x0e:
+      O << "fiq";
+      return;
+    case 0x10:
+      O << "irq";
+      return;
+    case 0x12:
+      O << "svc";
+      return;
+    case 0x14:
+      O << "abt";
+      return;
+    case 0x16:
+      O << "und";
+      return;
+    case 0x1c:
+      O << "mon";
+      return;
+    case 0x1e:
+      O << "hyp";
+      return;
+    default:
+      llvm_unreachable("Invalid banked SPSR register");
+    }
+  }
+
+  assert(!R && "should have dealt with SPSR regs");
+  // The table is constant, so keep a single static copy instead of rebuilding
+  // a 32-pointer array on the stack for every printed operand. Empty strings
+  // mark SysM values that have no register name.
+  static const char *const RegNames[] = {
+      "r8_usr", "r9_usr", "r10_usr", "r11_usr", "r12_usr", "sp_usr", "lr_usr",
+      "",       "r8_fiq", "r9_fiq",  "r10_fiq", "r11_fiq", "r12_fiq", "sp_fiq",
+      "lr_fiq", "",       "lr_irq",  "sp_irq",  "lr_svc",  "sp_svc",  "lr_abt",
+      "sp_abt", "lr_und", "sp_und",  "",        "",        "",        "",
+      "lr_mon", "sp_mon", "elr_hyp", "sp_hyp"};
+  // SysM is masked to five bits above, so the table must have one entry for
+  // every possible value or the lookup could read out of bounds.
+  static_assert(sizeof(RegNames) / sizeof(RegNames[0]) == 32,
+                "RegNames must cover every 5-bit SysM value");
+  const char *Name = RegNames[SysM];
+  assert(Name[0] && "invalid banked register operand");
+
+  O << Name;
+}
+
+// Prints a condition-code suffix. The value 15 is printed as "<und>" rather
+// than being passed to the string conversion, and ARMCC::AL prints nothing.
+void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const auto Cond =
+      static_cast<ARMCC::CondCodes>(MI->getOperand(OpNum).getImm());
+
+  // Handle the undefined 15 CC value here for printing so we don't abort().
+  if (static_cast<unsigned>(Cond) == 15) {
+    O << "<und>";
+    return;
+  }
+  if (Cond == ARMCC::AL)
+    return;
+  O << ARMCondCodeToString(Cond);
+}
+
+// Prints the condition-code string for the operand's immediate
+// unconditionally (no value is suppressed).
+void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O) {
+  const MCOperand &CondOp = MI->getOperand(OpNum);
+  const auto Cond = static_cast<ARMCC::CondCodes>(CondOp.getImm());
+  O << ARMCondCodeToString(Cond);
+}
+
+// Prints 's' when the register operand is non-zero (asserting that it is
+// ARM::CPSR); prints nothing for register 0.
+void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  if (!RegOp.getReg())
+    return;
+
+  assert(RegOp.getReg() == ARM::CPSR && "Expect ARM CPSR register!");
+  O << 's';
+}
+
+// Prints the operand's immediate as a bare number, with no '#' prefix and no
+// markup.
+void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const auto Value = MI->getOperand(OpNum).getImm();
+  O << Value;
+}
+
+// Prints the operand's immediate prefixed with the letter 'p'.
+void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const auto Value = MI->getOperand(OpNum).getImm();
+  O << "p";
+  O << Value;
+}
+
+// Prints the operand's immediate prefixed with the letter 'c'.
+void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const auto Value = MI->getOperand(OpNum).getImm();
+  O << "c";
+  O << Value;
+}
+
+// Prints the operand's immediate enclosed in curly braces.
+void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const auto Value = MI->getOperand(OpNum).getImm();
+  O << "{";
+  O << Value;
+  O << "}";
+}
+
+void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ llvm_unreachable("Unhandled PC-relative pseudo-instruction!"); // no printing is implemented; reaching this is treated as a bug
+}
+
+// Prints an ADR-style label operand.
+//
+// Expression operands are printed through MCExpr::print. Immediate operands
+// are truncated to 32 bits, shifted left by the template parameter `scale`,
+// and printed as "#<n>" or "#-<n>"; a scaled value of INT32_MIN is printed as
+// "#-0".
+template <unsigned scale>
+void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  if (MO.isExpr()) {
+    MO.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  // Shift in the unsigned domain: left-shifting a negative signed value is
+  // undefined behaviour before C++20, and negative offsets are expected here.
+  // The result has the same bit pattern the signed shift produced in practice.
+  int32_t OffImm = static_cast<int32_t>(static_cast<uint32_t>(MO.getImm())
+                                        << scale);
+
+  O << markup("<imm:");
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+// Prints the operand's immediate multiplied by 4, as "#<imm>" inside
+// immediate markup.
+void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  const auto Scaled = MI->getOperand(OpNum).getImm() * 4;
+  O << markup("<imm:");
+  O << "#" << formatImm(Scaled);
+  O << markup(">");
+}
+
+// Prints the operand's immediate as "#<imm>", with an encoded 0 shown as 32.
+void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const unsigned Encoded = MI->getOperand(OpNum).getImm();
+  const unsigned Shown = (Encoded == 0 ? 32 : Encoded);
+  O << markup("<imm:");
+  O << "#" << formatImm(Shown);
+  O << markup(">");
+}
+
+// Prints the 't'/'e' letters of an IT mask. The operand just before OpNum
+// supplies the first condition; only its low bit is used.
+void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const unsigned Mask = MI->getOperand(OpNum).getImm();
+  const unsigned Firstcond = MI->getOperand(OpNum - 1).getImm();
+  const unsigned CondBit0 = Firstcond & 1;
+
+  // (3 - the number of trailing zeros) letters are emitted in total.
+  const unsigned NumTZ = countTrailingZeros(Mask);
+  assert(NumTZ <= 3 && "Invalid IT mask!");
+
+  // Walk mask bits from bit 3 down to the bit just above the trailing zeros;
+  // a bit equal to the condition's low bit is 't', otherwise 'e'.
+  for (unsigned Pos = 3; Pos > NumTZ; --Pos) {
+    const bool IsThen = ((Mask >> Pos) & 1) == CondBit0;
+    O << (IsThen ? 't' : 'e');
+  }
+}
+
+// Prints "[<reg>]" or "[<reg>, <reg>]"; the second register is omitted when
+// it is register 0. A non-register first operand is handed to printOperand.
+void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &FirstOp = MI->getOperand(Op);
+  const MCOperand &SecondOp = MI->getOperand(Op + 1);
+
+  // FIXME: This is for CP entries, but isn't right.
+  if (!FirstOp.isReg()) {
+    printOperand(MI, Op, STI, O);
+    return;
+  }
+
+  O << markup("<mem:");
+  O << "[";
+  printRegName(O, FirstOp.getReg());
+  const unsigned SecondReg = SecondOp.getReg();
+  if (SecondReg != 0) {
+    O << ", ";
+    printRegName(O, SecondReg);
+  }
+  O << "]";
+  O << markup(">");
+}
+
+// Prints "[<reg>]" or "[<reg>, #<imm * Scale>]"; the immediate part is
+// omitted when the immediate is 0. A non-register first operand is handed to
+// printOperand.
+void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI,
+                                                    unsigned Op,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O,
+                                                    unsigned Scale) {
+  const MCOperand &RegOp = MI->getOperand(Op);
+  const MCOperand &ImmOp = MI->getOperand(Op + 1);
+
+  // FIXME: This is for CP entries, but isn't right.
+  if (!RegOp.isReg()) {
+    printOperand(MI, Op, STI, O);
+    return;
+  }
+
+  O << markup("<mem:");
+  O << "[";
+  printRegName(O, RegOp.getReg());
+  const unsigned ImmOffs = ImmOp.getImm();
+  if (ImmOffs != 0) {
+    O << ", ";
+    O << markup("<imm:") << "#" << formatImm(ImmOffs * Scale) << markup(">");
+  }
+  O << "]";
+  O << markup(">");
+}
+
+// Forwards to printThumbAddrModeImm5SOperand with a scale of 1.
+void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     const MCSubtargetInfo &STI,
+                                                     raw_ostream &O) {
+  const unsigned Scale = 1;
+  printThumbAddrModeImm5SOperand(MI, Op, STI, O, Scale);
+}
+
+// Forwards to printThumbAddrModeImm5SOperand with a scale of 2.
+void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     const MCSubtargetInfo &STI,
+                                                     raw_ostream &O) {
+  const unsigned Scale = 2;
+  printThumbAddrModeImm5SOperand(MI, Op, STI, O, Scale);
+}
+
+// Forwards to printThumbAddrModeImm5SOperand with a scale of 4.
+void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     const MCSubtargetInfo &STI,
+                                                     raw_ostream &O) {
+  const unsigned Scale = 4;
+  printThumbAddrModeImm5SOperand(MI, Op, STI, O, Scale);
+}
+
+// Forwards to printThumbAddrModeImm5SOperand with a scale of 4, exactly as
+// printThumbAddrModeImm5S4Operand does.
+void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const unsigned Scale = 4;
+  printThumbAddrModeImm5SOperand(MI, Op, STI, O, Scale);
+}
+
+// t2_so_reg is a two-operand unit for the Thumb2 register-with-constant-shift
+// forms: a register followed by an immediate that packs the shift opcode and
+// the shift amount.
+//    REG 0   0           - e.g. R5
+//    REG IMM, SH_OPC     - e.g. R5, LSL #3
+void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  const MCOperand &ShiftOp = MI->getOperand(OpNum + 1);
+
+  printRegName(O, RegOp.getReg());
+
+  // Decode the shift opcode and amount from the immediate and print them.
+  assert(ShiftOp.isImm() && "Not a valid t2_so_reg value!");
+  const auto Encoded = ShiftOp.getImm();
+  printRegImmShift(O, ARM_AM::getSORegShOp(Encoded),
+                   ARM_AM::getSORegOffset(Encoded), UseMarkup);
+}
+
+// Prints "[<reg>]", "[<reg>, #<imm>]" or "[<reg>, #-<imm>]".
+// A non-negative zero offset is printed only when AlwaysPrintImm0 is set.
+// INT32_MIN is a special value that is printed as "#-0".
+// A non-register first operand is handed to printOperand.
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  const MCOperand &ImmOp = MI->getOperand(OpNum + 1);
+
+  // FIXME: This is for CP entries, but isn't right.
+  if (!RegOp.isReg()) {
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  O << markup("<mem:");
+  O << "[";
+  printRegName(O, RegOp.getReg());
+
+  const int32_t Raw = static_cast<int32_t>(ImmOp.getImm());
+  const bool Negative = Raw < 0;
+  // INT32_MIN stands for -0; every other value is taken as is.
+  const int32_t OffImm = (Raw == INT32_MIN) ? 0 : Raw;
+
+  if (Negative)
+    O << ", " << markup("<imm:") << "#-" << formatImm(-OffImm) << markup(">");
+  else if (AlwaysPrintImm0 || OffImm > 0)
+    O << ", " << markup("<imm:") << "#" << formatImm(OffImm) << markup(">");
+
+  O << "]";
+  O << markup(">");
+}
+
+// Prints "[<reg>]", "[<reg>, #<imm>]" or "[<reg>, #-<imm>]".
+// A non-negative zero offset is printed only when AlwaysPrintImm0 is set.
+// INT32_MIN is a special value that is printed as "#-0".
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
+                                                unsigned OpNum,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  const MCOperand &ImmOp = MI->getOperand(OpNum + 1);
+
+  O << markup("<mem:");
+  O << "[";
+  printRegName(O, RegOp.getReg());
+
+  const int32_t Raw = static_cast<int32_t>(ImmOp.getImm());
+  const bool Negative = Raw < 0;
+  // INT32_MIN stands for -0; every other value is taken as is.
+  const int32_t OffImm = (Raw == INT32_MIN) ? 0 : Raw;
+
+  if (Negative)
+    O << ", " << markup("<imm:") << "#-" << -OffImm << markup(">");
+  else if (AlwaysPrintImm0 || OffImm > 0)
+    O << ", " << markup("<imm:") << "#" << OffImm << markup(">");
+
+  O << "]";
+  O << markup(">");
+}
+
+// Prints "[<reg>]", "[<reg>, #<imm>]" or "[<reg>, #-<imm>]" for an offset
+// that is asserted to be a multiple of 4.
+// A non-negative zero offset is printed only when AlwaysPrintImm0 is set.
+// INT32_MIN is a special value that is printed as "#-0".
+// A non-register first operand is handed to printOperand.
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  const MCSubtargetInfo &STI,
+                                                  raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  const MCOperand &ImmOp = MI->getOperand(OpNum + 1);
+
+  // For label symbolic references.
+  if (!RegOp.isReg()) {
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  O << markup("<mem:");
+  O << "[";
+  printRegName(O, RegOp.getReg());
+
+  const int32_t Raw = static_cast<int32_t>(ImmOp.getImm());
+  const bool Negative = Raw < 0;
+
+  assert(((Raw & 0x3) == 0) && "Not a valid immediate!");
+
+  // INT32_MIN stands for -0; every other value is taken as is.
+  const int32_t OffImm = (Raw == INT32_MIN) ? 0 : Raw;
+
+  if (Negative)
+    O << ", " << markup("<imm:") << "#-" << -OffImm << markup(">");
+  else if (AlwaysPrintImm0 || OffImm > 0)
+    O << ", " << markup("<imm:") << "#" << OffImm << markup(">");
+
+  O << "]";
+  O << markup(">");
+}
+
+// Prints "[<reg>]" or "[<reg>, #<imm * 4>]"; the immediate part is omitted
+// when the immediate is 0.
+void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  const MCOperand &ImmOp = MI->getOperand(OpNum + 1);
+
+  O << markup("<mem:");
+  O << "[";
+  printRegName(O, RegOp.getReg());
+  const auto Imm = ImmOp.getImm();
+  if (Imm != 0)
+    O << ", " << markup("<imm:") << "#" << formatImm(Imm * 4) << markup(">");
+  O << "]";
+  O << markup(">");
+}
+
+// Prints ", #<imm>" or ", #-<imm>" for a signed offset immediate. INT32_MIN
+// is printed as "#-0".
+void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  const int32_t OffImm =
+      static_cast<int32_t>(MI->getOperand(OpNum).getImm());
+
+  O << ", ";
+  O << markup("<imm:");
+  if (OffImm >= 0)
+    O << "#" << OffImm;
+  else if (OffImm == INT32_MIN)
+    O << "#-0";
+  else
+    O << "#-" << -OffImm;
+  O << markup(">");
+}
+
+// Prints ", #<imm>" or ", #-<imm>" for a signed offset immediate that is
+// asserted to be a multiple of 4. INT32_MIN is printed as "#-0".
+void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  const int32_t OffImm =
+      static_cast<int32_t>(MI->getOperand(OpNum).getImm());
+
+  assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
+  O << ", ";
+  O << markup("<imm:");
+  if (OffImm >= 0)
+    O << "#" << OffImm;
+  else if (OffImm == INT32_MIN)
+    O << "#-0";
+  else
+    O << "#-" << -OffImm;
+  O << markup(">");
+}
+
+// Prints "[<reg>, <reg>]" or "[<reg>, <reg>, lsl #<n>]" from three
+// consecutive operands: two registers and a shift amount. The shift part is
+// omitted when the amount is 0.
+void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &FirstRegOp = MI->getOperand(OpNum);
+  const MCOperand &SecondRegOp = MI->getOperand(OpNum + 1);
+  const MCOperand &ShiftOp = MI->getOperand(OpNum + 2);
+
+  O << markup("<mem:");
+  O << "[";
+  printRegName(O, FirstRegOp.getReg());
+
+  assert(SecondRegOp.getReg() && "Invalid so_reg load / store address!");
+  O << ", ";
+  printRegName(O, SecondRegOp.getReg());
+
+  const unsigned ShAmt = ShiftOp.getImm();
+  if (ShAmt != 0) {
+    assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
+    O << ", lsl ";
+    O << markup("<imm:") << "#" << ShAmt << markup(">");
+  }
+  O << "]";
+  O << markup(">");
+}
+
+// Prints the value ARM_AM::getFPImmFloat yields for the operand's immediate
+// as "#<value>" inside immediate markup.
+void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  const MCOperand &ImmOp = MI->getOperand(OpNum);
+  O << markup("<imm:");
+  O << '#' << ARM_AM::getFPImmFloat(ImmOp.getImm());
+  O << markup(">");
+}
+
+// Decodes the operand's immediate with ARM_AM::decodeNEONModImm and prints
+// the decoded value in hex as "#0x<value>". The element size the decoder
+// also reports is not used.
+void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  const unsigned Encoded = MI->getOperand(OpNum).getImm();
+  unsigned UnusedEltBits;
+  const uint64_t Decoded = ARM_AM::decodeNEONModImm(Encoded, UnusedEltBits);
+
+  O << markup("<imm:");
+  O << "#0x";
+  O.write_hex(Decoded);
+  O << markup(">");
+}
+
+// Prints the operand's immediate plus one as "#<imm>" inside immediate
+// markup.
+void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  const unsigned Encoded = MI->getOperand(OpNum).getImm();
+  O << markup("<imm:");
+  O << "#" << formatImm(Encoded + 1);
+  O << markup(">");
+}
+
+// Prints ", ror #<8 * imm>" for a non-zero immediate; a zero immediate
+// produces no output.
+void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const unsigned Rot = MI->getOperand(OpNum).getImm();
+  if (Rot != 0) {
+    assert(Rot <= 3 && "illegal ror immediate!");
+    O << ", ror ";
+    O << markup("<imm:") << "#" << 8 * Rot << markup(">");
+  }
+}
+
+// Prints a modified-immediate operand.
+//
+// The low 8 bits of the immediate are the value and bits 8-11 give the
+// rotation, which is taken as twice that field. When re-encoding the rotated
+// value gives back the same immediate, the single rotated value is printed;
+// otherwise the explicit "#<bits>, #<rot>" pair is printed. Expression
+// operands are handed to printOperand.
+void ARMInstPrinter::printModImmOperand(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const MCOperand &ImmOp = MI->getOperand(OpNum);
+
+  // Support for fixups (MCFixup)
+  if (ImmOp.isExpr()) {
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  const unsigned Bits = ImmOp.getImm() & 0xFF;
+  const unsigned Rot = (ImmOp.getImm() & 0xF00) >> 7;
+
+  // Decide whether the rotated value is shown as unsigned.
+  bool PrintUnsigned = false;
+  const unsigned Opcode = MI->getOpcode();
+  if (Opcode == ARM::MOVi) {
+    // Movs to PC should be treated unsigned
+    PrintUnsigned = (MI->getOperand(OpNum - 1).getReg() == ARM::PC);
+  } else if (Opcode == ARM::MSRi) {
+    // Movs to special registers should be treated unsigned
+    PrintUnsigned = true;
+  }
+
+  const int32_t Rotated = ARM_AM::rotr32(Bits, Rot);
+  if (ARM_AM::getSOImmVal(Rotated) != ImmOp.getImm()) {
+    // Explicit #bits, #rot implied
+    O << "#" << markup("<imm:") << Bits << markup(">");
+    O << ", #" << markup("<imm:") << Rot << markup(">");
+    return;
+  }
+
+  // #rot has the least possible value
+  O << "#" << markup("<imm:");
+  if (PrintUnsigned)
+    O << static_cast<uint32_t>(Rotated);
+  else
+    O << Rotated;
+  O << markup(">");
+}
+
+// Prints 16 minus the operand's immediate as "#<n>" inside immediate markup.
+void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  const auto Shown = 16 - MI->getOperand(OpNum).getImm();
+  O << markup("<imm:");
+  O << "#" << Shown;
+  O << markup(">");
+}
+
+// Prints 32 minus the operand's immediate as "#<n>" inside immediate markup.
+void ARMInstPrinter::printFBits32(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  const auto Shown = 32 - MI->getOperand(OpNum).getImm();
+  O << markup("<imm:");
+  O << "#" << Shown;
+  O << markup(">");
+}
+
+// Prints the operand's immediate enclosed in square brackets.
+void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const auto Index = MI->getOperand(OpNum).getImm();
+  O << "[";
+  O << Index;
+  O << "]";
+}
+
+// Prints the operand's register enclosed in curly braces.
+void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  O << "{";
+  printRegName(O, RegOp.getReg());
+  O << "}";
+}
+
+// Prints the dsub_0 and dsub_1 sub-registers of the operand's register as
+// "{<r0>, <r1>}".
+void ARMInstPrinter::printVectorListTwo(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const unsigned ListReg = MI->getOperand(OpNum).getReg();
+  const unsigned SubRegs[] = {MRI.getSubReg(ListReg, ARM::dsub_0),
+                              MRI.getSubReg(ListReg, ARM::dsub_1)};
+  O << "{";
+  printRegName(O, SubRegs[0]);
+  O << ", ";
+  printRegName(O, SubRegs[1]);
+  O << "}";
+}
+
+// Prints the dsub_0 and dsub_2 sub-registers of the operand's register as
+// "{<r0>, <r2>}".
+void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  const unsigned ListReg = MI->getOperand(OpNum).getReg();
+  const unsigned SubRegs[] = {MRI.getSubReg(ListReg, ARM::dsub_0),
+                              MRI.getSubReg(ListReg, ARM::dsub_2)};
+  O << "{";
+  printRegName(O, SubRegs[0]);
+  O << ", ";
+  printRegName(O, SubRegs[1]);
+  O << "}";
+}
+
+// Prints three consecutively numbered registers, starting at the operand's
+// register, as "{<r>, <r+1>, <r+2>}".
+void ARMInstPrinter::printVectorListThree(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  // Adding to a register enum value is not safe in general, but for VFP
+  // registers the sort order is guaranteed because they are all of the form
+  // D<n>.
+  const unsigned FirstReg = MI->getOperand(OpNum).getReg();
+  O << "{";
+  for (unsigned Step = 0; Step != 3; ++Step) {
+    if (Step != 0)
+      O << ", ";
+    printRegName(O, FirstReg + Step);
+  }
+  O << "}";
+}
+
+// Prints four consecutively numbered registers, starting at the operand's
+// register, as "{<r>, <r+1>, <r+2>, <r+3>}".
+void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  // Adding to a register enum value is not safe in general, but for VFP
+  // registers the sort order is guaranteed because they are all of the form
+  // D<n>.
+  const unsigned FirstReg = MI->getOperand(OpNum).getReg();
+  O << "{";
+  for (unsigned Step = 0; Step != 4; ++Step) {
+    if (Step != 0)
+      O << ", ";
+    printRegName(O, FirstReg + Step);
+  }
+  O << "}";
+}
+
+// Prints the operand's register as "{<r>[]}".
+void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI,
+                                                unsigned OpNum,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  const MCOperand &RegOp = MI->getOperand(OpNum);
+  O << "{";
+  printRegName(O, RegOp.getReg());
+  O << "[]";
+  O << "}";
+}
+
+// Prints the dsub_0 and dsub_1 sub-registers of the operand's register as
+// "{<r0>[], <r1>[]}".
+void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI,
+                                                unsigned OpNum,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  const unsigned ListReg = MI->getOperand(OpNum).getReg();
+  const unsigned SubRegs[] = {MRI.getSubReg(ListReg, ARM::dsub_0),
+                              MRI.getSubReg(ListReg, ARM::dsub_1)};
+  O << "{";
+  printRegName(O, SubRegs[0]);
+  O << "[], ";
+  printRegName(O, SubRegs[1]);
+  O << "[]}";
+}
+
+// Prints three consecutively numbered registers, starting at the operand's
+// register, as "{<r>[], <r+1>[], <r+2>[]}".
+void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  const MCSubtargetInfo &STI,
+                                                  raw_ostream &O) {
+  // Adding to a register enum value is not safe in general, but for VFP
+  // registers the sort order is guaranteed because they are all of the form
+  // D<n>.
+  const unsigned FirstReg = MI->getOperand(OpNum).getReg();
+  O << "{";
+  for (unsigned Step = 0; Step != 3; ++Step) {
+    if (Step != 0)
+      O << ", ";
+    printRegName(O, FirstReg + Step);
+    O << "[]";
+  }
+  O << "}";
+}
+
+// Prints four consecutively numbered registers, starting at the operand's
+// register, as "{<r>[], <r+1>[], <r+2>[], <r+3>[]}".
+void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  // Adding to a register enum value is not safe in general, but for VFP
+  // registers the sort order is guaranteed because they are all of the form
+  // D<n>.
+  const unsigned FirstReg = MI->getOperand(OpNum).getReg();
+  O << "{";
+  for (unsigned Step = 0; Step != 4; ++Step) {
+    if (Step != 0)
+      O << ", ";
+    printRegName(O, FirstReg + Step);
+    O << "[]";
+  }
+  O << "}";
+}
+
+// Prints the dsub_0 and dsub_2 sub-registers of the operand's register as
+// "{<r0>[], <r2>[]}".
+void ARMInstPrinter::printVectorListTwoSpacedAllLanes(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  const unsigned ListReg = MI->getOperand(OpNum).getReg();
+  const unsigned SubRegs[] = {MRI.getSubReg(ListReg, ARM::dsub_0),
+                              MRI.getSubReg(ListReg, ARM::dsub_2)};
+  O << "{";
+  printRegName(O, SubRegs[0]);
+  O << "[], ";
+  printRegName(O, SubRegs[1]);
+  O << "[]}";
+}
+
+// Prints three registers spaced two apart, starting at the operand's
+// register, as "{<r>[], <r+2>[], <r+4>[]}".
+void ARMInstPrinter::printVectorListThreeSpacedAllLanes(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  // Adding to a register enum value is not safe in general, but for VFP
+  // registers the sort order is guaranteed because they are all of the form
+  // D<n>.
+  const unsigned FirstReg = MI->getOperand(OpNum).getReg();
+  O << "{";
+  for (unsigned Offset = 0; Offset != 6; Offset += 2) {
+    if (Offset != 0)
+      O << ", ";
+    printRegName(O, FirstReg + Offset);
+    O << "[]";
+  }
+  O << "}";
+}
+
+// Prints four registers spaced two apart, starting at the operand's
+// register, as "{<r>[], <r+2>[], <r+4>[], <r+6>[]}".
+void ARMInstPrinter::printVectorListFourSpacedAllLanes(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  // Adding to a register enum value is not safe in general, but for VFP
+  // registers the sort order is guaranteed because they are all of the form
+  // D<n>.
+  const unsigned FirstReg = MI->getOperand(OpNum).getReg();
+  O << "{";
+  for (unsigned Offset = 0; Offset != 8; Offset += 2) {
+    if (Offset != 0)
+      O << ", ";
+    printRegName(O, FirstReg + Offset);
+    O << "[]";
+  }
+  O << "}";
+}
+
+// Prints three registers spaced two apart, starting at the operand's
+// register, as "{<r>, <r+2>, <r+4>}".
+void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI,
+                                                unsigned OpNum,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  // Adding to a register enum value is not safe in general, but for VFP
+  // registers the sort order is guaranteed because they are all of the form
+  // D<n>.
+  const unsigned FirstReg = MI->getOperand(OpNum).getReg();
+  O << "{";
+  for (unsigned Offset = 0; Offset != 6; Offset += 2) {
+    if (Offset != 0)
+      O << ", ";
+    printRegName(O, FirstReg + Offset);
+  }
+  O << "}";
+}
+
+// Prints four registers spaced two apart, starting at the operand's
+// register, as "{<r>, <r+2>, <r+4>, <r+6>}".
+void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  // Adding to a register enum value is not safe in general, but for VFP
+  // registers the sort order is guaranteed because they are all of the form
+  // D<n>.
+  const unsigned FirstReg = MI->getOperand(OpNum).getReg();
+  O << "{";
+  for (unsigned Offset = 0; Offset != 8; Offset += 2) {
+    if (Offset != 0)
+      O << ", ";
+    printRegName(O, FirstReg + Offset);
+  }
+  O << "}";
+}
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
new file mode 100644
index 000000000000..9d80eed84dc2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -0,0 +1,238 @@
+//===- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
+#define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+/// ARMInstPrinter - Prints an ARM MCInst as assembly text.
+///
+/// The class overrides printInst() and printRegName() from MCInstPrinter,
+/// declares the entry points that are generated by tblgen, and declares a
+/// family of operand-printing helpers. Most helpers share the signature
+/// (MI, operand index, STI, output stream); the few that differ take an extra
+/// flag or scale argument.
+class ARMInstPrinter : public MCInstPrinter {
+public:
+ ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI);
+
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ // Generic operand printer.
+ void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+
+ // Shifter-operand (register shifted by register / by immediate) printers.
+ void printSORegRegOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ // Addressing-mode operand printers.
+ void printAddrModeTBB(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAddrModeTBH(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAddrMode2Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool AlwaysPrintImm0>
+ void printAddrMode3Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O,
+ bool AlwaysPrintImm0);
+ void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool AlwaysPrintImm0>
+ void printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool AlwaysPrintImm0>
+ void printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printMemBOption(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printInstSyncBOption(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printShiftImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ // Thumb operand printers.
+ template <unsigned scale>
+ void printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printThumbSRImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printThumbITMask(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, unsigned Scale);
+ void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ // Thumb-2 operand printers.
+ void printT2SOOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool AlwaysPrintImm0>
+ void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool AlwaysPrintImm0>
+ void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool AlwaysPrintImm0>
+ void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printSetendOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printCPSIMod(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printCPSIFlag(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printBankedRegOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPredicateOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printRegisterList(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNoHashImmediate(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPImmediate(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printCImmediate(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printCoprocOptionImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printFPImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printRotImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printModImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printGPRPairOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printPCLabel(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printFBits16(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printFBits32(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ // Vector index and vector register-list printers.
+ void printVectorIndex(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListOne(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListTwo(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListThree(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListFour(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListOneAllLanes(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListThreeAllLanes(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListFourAllLanes(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printVectorListThreeSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printVectorListFourSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printVectorListThreeSpaced(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVectorListFourSpaced(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/LICENSE.TXT b/contrib/llvm/lib/Target/ARM/LICENSE.TXT
new file mode 100755
index 000000000000..68afea12ed44
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/LICENSE.TXT
@@ -0,0 +1,47 @@
+ARM Limited
+
+Software Grant License Agreement ("Agreement")
+
+Except for the license granted herein to you, ARM Limited ("ARM") reserves all
+right, title, and interest in and to the Software (defined below).
+
+Definition
+
+"Software" means the code and documentation as well as any original work of
+authorship, including any modifications or additions to an existing work, that
+is intentionally submitted by ARM to llvm.org (http://llvm.org) ("LLVM") for
+inclusion in, or documentation of, any of the products owned or managed by LLVM
+(the "Work"). For the purposes of this definition, "submitted" means any form of
+electronic, verbal, or written communication sent to LLVM or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that are
+managed by, or on behalf of, LLVM for the purpose of discussing and improving
+the Work, but excluding communication that is conspicuously marked otherwise.
+
+1. Grant of Copyright License. Subject to the terms and conditions of this
+ Agreement, ARM hereby grants to you and to recipients of the Software
+ distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+ royalty-free, irrevocable copyright license to reproduce, prepare derivative
+ works of, publicly display, publicly perform, sublicense, and distribute the
+ Software and such derivative works.
+
+2. Grant of Patent License. Subject to the terms and conditions of this
+ Agreement, ARM hereby grants you and to recipients of the Software
+ distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+ royalty-free, irrevocable (except as stated in this section) patent license
+ to make, have made, use, offer to sell, sell, import, and otherwise transfer
+ the Work, where such license applies only to those patent claims licensable
+ by ARM that are necessarily infringed by ARM's Software alone or by
+ combination of the Software with the Work to which such Software was
+ submitted. If any entity institutes patent litigation against ARM or any
+ other entity (including a cross-claim or counterclaim in a lawsuit) alleging
+ that ARM's Software, or the Work to which ARM has contributed constitutes
+ direct or contributory patent infringement, then any patent licenses granted
+ to that entity under this Agreement for the Software or Work shall terminate
+ as of the date such litigation is filed.
+
+Unless required by applicable law or agreed to in writing, the software is
+provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied, including, without limitation, any warranties or
+conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE.
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
new file mode 100644
index 000000000000..3959eab966a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -0,0 +1,762 @@
+//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// ARM_AM - ARM Addressing Mode Stuff
+namespace ARM_AM {
+ /// ShiftOpc - Shift operations usable in shifter operands. The numeric
+ /// values are packed into encoded operand fields, so each enumerator is
+ /// spelled out explicitly.
+ enum ShiftOpc {
+   no_shift = 0,
+   asr = 1,
+   lsl = 2,
+   lsr = 3,
+   ror = 4,
+   rrx = 5
+ };
+
+ /// AddrOpc - Whether an addressing-mode offset is subtracted from or added
+ /// to the base.
+ enum AddrOpc {
+   sub = 0,
+   add = 1
+ };
+
+ /// getAddrOpcStr - Return the sign prefix to print for an offset: "-" for
+ /// sub, and the empty string otherwise.
+ static inline const char *getAddrOpcStr(AddrOpc Op) {
+   if (Op == sub)
+     return "-";
+   return "";
+ }
+
+ /// getShiftOpcStr - Return the assembly mnemonic for a shift opcode. Any
+ /// value without a mnemonic (including no_shift) is treated as unreachable.
+ static inline const char *getShiftOpcStr(ShiftOpc Op) {
+   switch (Op) {
+   case ARM_AM::asr:
+     return "asr";
+   case ARM_AM::lsl:
+     return "lsl";
+   case ARM_AM::lsr:
+     return "lsr";
+   case ARM_AM::ror:
+     return "ror";
+   case ARM_AM::rrx:
+     return "rrx";
+   default:
+     break;
+   }
+   llvm_unreachable("Unknown shift opc!");
+ }
+
+ /// getShiftOpcEncoding - Return the 2-bit encoding of a shift opcode
+ /// (lsl = 0, lsr = 1, asr = 2, ror = 3). Any other value, including rrx
+ /// and no_shift, is treated as unreachable.
+ static inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
+   switch (Op) {
+   case ARM_AM::lsl:
+     return 0;
+   case ARM_AM::lsr:
+     return 1;
+   case ARM_AM::asr:
+     return 2;
+   case ARM_AM::ror:
+     return 3;
+   default:
+     break;
+   }
+   llvm_unreachable("Unknown shift opc!");
+ }
+
+ /// AMSubMode - Load/store-multiple sub-modes. Zero is reserved as the
+ /// "bad" marker.
+ enum AMSubMode {
+   bad_am_submode = 0,
+   ia = 1,
+   ib = 2,
+   da = 3,
+   db = 4
+ };
+
+ /// getAMSubModeStr - Return the assembly suffix for a load/store-multiple
+ /// sub-mode. bad_am_submode and unknown values are treated as unreachable.
+ static inline const char *getAMSubModeStr(AMSubMode Mode) {
+   switch (Mode) {
+   case ARM_AM::ia:
+     return "ia";
+   case ARM_AM::ib:
+     return "ib";
+   case ARM_AM::da:
+     return "da";
+   case ARM_AM::db:
+     return "db";
+   default:
+     break;
+   }
+   llvm_unreachable("Unknown addressing sub-mode!");
+ }
+
+ /// rotr32 - Rotate the 32-bit value \p Val right by \p Amt bits.
+ /// \p Amt must be in [0, 31]; this is asserted.
+ static inline unsigned rotr32(unsigned Val, unsigned Amt) {
+   assert(Amt < 32 && "Invalid rotate amount");
+   // Masking the complementary shift with 31 keeps Amt == 0 from shifting by
+   // the full width.
+   const unsigned LowPart = Val >> Amt;
+   const unsigned WrappedPart = Val << ((32 - Amt) & 31);
+   return LowPart | WrappedPart;
+ }
+
+ /// rotl32 - Rotate the 32-bit value \p Val left by \p Amt bits.
+ /// \p Amt must be in [0, 31]; this is asserted.
+ static inline unsigned rotl32(unsigned Val, unsigned Amt) {
+   assert(Amt < 32 && "Invalid rotate amount");
+   // Masking the complementary shift with 31 keeps Amt == 0 from shifting by
+   // the full width.
+   const unsigned HighPart = Val << Amt;
+   const unsigned WrappedPart = Val >> ((32 - Amt) & 31);
+   return HighPart | WrappedPart;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #1: shift_operand with registers
+ //===--------------------------------------------------------------------===//
+ //
+ // This 'addressing mode' is used for arithmetic instructions. It can
+ // represent things like:
+ // reg
+ // reg [asr|lsl|lsr|ror|rrx] reg
+ // reg [asr|lsl|lsr|ror|rrx] imm
+ //
+ // This is stored as three operands [rega, regb, opc]. The first is the base
+ // reg, the second is the shift amount (or reg0 if not present or imm). The
+ // third operand encodes the shift opcode and the imm if a reg isn't present.
+ //
+ /// getSORegOpc - Pack a shift opcode (low 3 bits) and an immediate (bits 3
+ /// and up) into a single shifter-operand opc value.
+ static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
+   const unsigned ImmField = Imm << 3;
+   return ShOp | ImmField;
+ }
+ /// getSORegOffset - Extract the immediate (bits 3 and up) from a packed
+ /// shifter-operand opc value.
+ static inline unsigned getSORegOffset(unsigned Op) {
+   const unsigned ImmField = Op >> 3;
+   return ImmField;
+ }
+ /// getSORegShOp - Extract the shift opcode (low 3 bits) from a packed
+ /// shifter-operand opc value.
+ static inline ShiftOpc getSORegShOp(unsigned Op) {
+   const unsigned ShiftField = Op & 7;
+   return static_cast<ShiftOpc>(ShiftField);
+ }
+
+ /// getSOImmValImm - Return the 8-bit immediate stored in the low byte of an
+ /// encoded reg/imm shifter-operand immediate field.
+ static inline unsigned getSOImmValImm(unsigned Imm) {
+   const unsigned LowByte = Imm & 0xFF;
+   return LowByte;
+ }
+ /// getSOImmValRot - Return the rotate amount of an encoded reg/imm
+ /// shifter-operand immediate field: twice the value stored above the low
+ /// 8 bits.
+ static inline unsigned getSOImmValRot(unsigned Imm) {
+   const unsigned RotField = Imm >> 8;
+   return RotField * 2;
+ }
+
+ /// getSOImmValRotate - Compute the rotate amount to use when expressing
+ /// \p Imm as an immediate shifter operand. If \p Imm cannot be covered by a
+ /// single shifter-op, the result is a rotate amount that peels a maximal
+ /// chunk of bits out of the immediate.
+ static inline unsigned getSOImmValRotate(unsigned Imm) {
+   // Values that fit in 8 bits need no rotation at all.
+   if ((Imm & ~255U) == 0)
+     return 0;
+
+   // Start from the trailing-zero count, rounded down to an even number:
+   // something like 0x200 must be rotated by 8 bits, not 9.
+   const unsigned EvenTZ = countTrailingZeros(Imm) & ~1;
+   // The hardware rotates right, not left, so report the complementary amount.
+   const unsigned EncodedRot = (32 - EvenTZ) & 31;
+
+   if ((rotr32(Imm, EvenTZ) & ~255U) == 0)
+     return EncodedRot;
+
+   // For values like 0xF000000F the payload wraps around the word; ignore the
+   // low 6 bits and retry the hunt.
+   if ((Imm & 63U) != 0) {
+     const unsigned EvenTZ2 = countTrailingZeros(Imm & ~63U) & ~1;
+     if ((rotr32(Imm, EvenTZ2) & ~255U) == 0)
+       return (32 - EvenTZ2) & 31;
+   }
+
+   // No single shifter_op immediate covers this span of bits; return the
+   // rotate for a chunk that is still useful to the caller.
+   return EncodedRot;
+ }
+
+ /// getSOImmVal - If the 32-bit immediate \p Arg fits a shifter_operand
+ /// immediate, return its 12-bit encoding (rotate field above the 8-bit
+ /// payload). Otherwise return -1.
+ static inline int getSOImmVal(unsigned Arg) {
+   // Values that fit in 8 bits encode as themselves with a rotate of zero.
+   if ((Arg & ~255U) == 0)
+     return Arg;
+
+   const unsigned Rot = getSOImmValRotate(Arg);
+
+   // Any bit left outside the rotated 8-bit window means a single shifter_op
+   // cannot represent the value.
+   const unsigned OutsideWindow = rotr32(~255U, Rot) & Arg;
+   if (OutsideWindow != 0)
+     return -1;
+
+   // Payload in the low byte, half the rotate amount in the field above it.
+   const unsigned Payload = rotl32(Arg, Rot);
+   const unsigned RotField = (Rot >> 1) << 8;
+   return Payload | RotField;
+ }
+
+ /// isSOImmTwoPartVal - Return true if \p V can be formed by or'ing together
+ /// exactly two SOImmVal's. Values that need only one return false.
+ static inline bool isSOImmTwoPartVal(unsigned V) {
+   // Strip the first 8-bit chunk. Nothing left means a single shifter_op was
+   // enough, which is not a two-part value.
+   const unsigned AfterFirst = rotr32(~255U, getSOImmValRotate(V)) & V;
+   if (AfterFirst == 0)
+     return false;
+
+   // Strip a second chunk; accept only if that consumes the remainder.
+   const unsigned AfterSecond =
+       rotr32(~255U, getSOImmValRotate(AfterFirst)) & AfterFirst;
+   return AfterSecond == 0;
+ }
+
+ /// getSOImmTwoPartFirst - For a value \p V that satisfies
+ /// isSOImmTwoPartVal, return its first 8-bit chunk (left in place within
+ /// the word).
+ static inline unsigned getSOImmTwoPartFirst(unsigned V) {
+   const unsigned ChunkMask = rotr32(255U, getSOImmValRotate(V));
+   return ChunkMask & V;
+ }
+
+ /// getSOImmTwoPartSecond - For a value \p V that satisfies
+ /// isSOImmTwoPartVal, return its second chunk, i.e. what remains after the
+ /// first chunk is removed.
+ static inline unsigned getSOImmTwoPartSecond(unsigned V) {
+   // Drop the bits covered by the first chunk.
+   const unsigned Rest = rotr32(~255U, getSOImmValRotate(V)) & V;
+
+   // What is left must itself be a single 8-bit chunk.
+   assert(Rest == (rotr32(255U, getSOImmValRotate(Rest)) & Rest));
+   return Rest;
+ }
+
+ /// getThumbImmValShift - Return the left-shift amount to use when
+ /// expressing \p Imm as an 8-bit immediate followed by a left shift.
+ static inline unsigned getThumbImmValShift(unsigned Imm) {
+   // Values that fit in 8 bits need no shift; otherwise shift by the number
+   // of trailing zero bits.
+   const bool FitsIn8Bits = (Imm & ~255U) == 0;
+   return FitsIn8Bits ? 0 : countTrailingZeros(Imm);
+ }
+
+ /// isThumbImmShiftedVal - Return true if \p V can be obtained by left
+ /// shifting an 8-bit immediate.
+ static inline bool isThumbImmShiftedVal(unsigned V) {
+   // After the shift chosen by getThumbImmValShift, no bit may remain above
+   // the 8-bit payload.
+   const unsigned AbovePayload = ~255U << getThumbImmValShift(V);
+   return (AbovePayload & V) == 0;
+ }
+
+ /// getThumbImm16ValShift - Return the left-shift amount to use when
+ /// expressing \p Imm as a 16-bit immediate followed by a left shift.
+ static inline unsigned getThumbImm16ValShift(unsigned Imm) {
+   // Values that fit in 16 bits need no shift; otherwise shift by the number
+   // of trailing zero bits.
+   const bool FitsIn16Bits = (Imm & ~65535U) == 0;
+   return FitsIn16Bits ? 0 : countTrailingZeros(Imm);
+ }
+
+ /// isThumbImm16ShiftedVal - Return true if \p V can be obtained by left
+ /// shifting a 16-bit immediate.
+ static inline bool isThumbImm16ShiftedVal(unsigned V) {
+   // After the shift chosen by getThumbImm16ValShift, no bit may remain above
+   // the 16-bit payload.
+   const unsigned AbovePayload = ~65535U << getThumbImm16ValShift(V);
+   return (AbovePayload & V) == 0;
+ }
+
+ /// getThumbImmNonShiftedVal - For a value \p V that satisfies
+ /// isThumbImmShiftedVal, return the non-shifted value.
+ static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
+   const unsigned Shift = getThumbImmValShift(V);
+   return V >> Shift;
+ }
+
+
+ /// getT2SOImmValSplatVal - Return the 12-bit encoded representation
+ /// (control << 8 | imm8) if \p V is an 8-bit value splatted into every other
+ /// byte or every byte of a 32-bit word, i.e.,
+ ///     00000000 00000000 00000000 abcdefgh control = 0
+ ///     00000000 abcdefgh 00000000 abcdefgh control = 1
+ ///     abcdefgh 00000000 abcdefgh 00000000 control = 2
+ ///     abcdefgh abcdefgh abcdefgh abcdefgh control = 3
+ /// Return -1 if none of the above apply.
+ /// See ARM Reference Manual A6.3.2.
+ static inline int getT2SOImmValSplatVal(unsigned V) {
+   // control = 0: a plain 8-bit value.
+   if ((V & 0xffffff00) == 0)
+     return V;
+
+   // When the low byte is zero, shift it off so that the control = 2 pattern
+   // lines up with the control = 1 pattern.
+   const unsigned Shifted = ((V & 0xff) == 0) ? V >> 8 : V;
+   // Any matching value carries 8 bits of payload in the low byte...
+   const unsigned Payload = Shifted & 0xff;
+   // ...and repeats that payload in the third byte.
+   const unsigned EveryOtherByte = Payload | (Payload << 16);
+
+   // control = 1 or 2, depending on whether the low byte had to be shifted off.
+   if (Shifted == EveryOtherByte) {
+     const unsigned Control = (Shifted == V) ? 1 : 2;
+     return (Control << 8) | Payload;
+   }
+
+   // control = 3: the payload in all four bytes.
+   const unsigned EveryByte = EveryOtherByte | (EveryOtherByte << 8);
+   if (Shifted == EveryByte)
+     return (3 << 8) | Payload;
+
+   return -1;
+ }
+
+ /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if
+ /// \p V is a rotated 8-bit value. Return -1 if no rotation encoding is
+ /// possible.
+ /// See ARM Reference Manual A6.3.2.
+ static inline int getT2SOImmValRotateVal(unsigned V) {
+   const unsigned LeadingZeros = countLeadingZeros(V);
+   // Values with 24 or more leading zeros are rejected here.
+   if (LeadingZeros >= 24)
+     return -1;
+
+   // Every set bit must lie in the 8-bit window that starts at the top set
+   // bit.
+   const unsigned Window = rotr32(0xff000000U, LeadingZeros);
+   if ((Window & V) != V)
+     return -1;
+
+   // Low 7 bits of the payload, with the rotate amount stored from bit 7 up.
+   const unsigned PayloadBits = rotr32(V, 24 - LeadingZeros) & 0x7f;
+   const unsigned RotField = (LeadingZeros + 8) << 7;
+   return PayloadBits | RotField;
+ }
+
+ /// getT2SOImmVal - If the 32-bit immediate \p Arg fits a Thumb-2
+ /// shifter_operand immediate, return its 12-bit encoding. Otherwise return
+ /// -1.
+ /// See ARM Reference Manual A6.3.2.
+ static inline int getT2SOImmVal(unsigned Arg) {
+   // Prefer the splat encoding; fall back to the rotated 8-bit encoding,
+   // which itself yields -1 when it does not apply.
+   const int SplatEnc = getT2SOImmValSplatVal(Arg);
+   return SplatEnc != -1 ? SplatEnc : getT2SOImmValRotateVal(Arg);
+ }
+
+ /// getT2SOImmValRotate - Return the rotate amount derived from the
+ /// trailing-zero count of \p V. Values that fit in 8 bits yield 0.
+ static inline unsigned getT2SOImmValRotate(unsigned V) {
+   if ((V & ~255U) == 0)
+     return 0;
+   // Use CTZ to find where the payload starts, then report the complementary
+   // amount.
+   const unsigned TrailingZeros = countTrailingZeros(V);
+   return (32 - TrailingZeros) & 31;
+ }
+
+ /// isT2SOImmTwoPartVal - Return true if \p Imm can be built from two
+ /// Thumb-2 shifter-operand immediates, in any combination of splat and
+ /// shifter values.
+ static inline bool isT2SOImmTwoPartVal(unsigned Imm) {
+   // A value that is a single splat should be handled directly, not as a
+   // two-part value.
+   if (getT2SOImmValSplatVal(Imm) != -1)
+     return false;
+
+   // Peel off one shifter chunk. Nothing left means a single shifter operand
+   // suffices, which is likewise not a two-part value.
+   const unsigned AfterShifter =
+       rotr32(~255U, getT2SOImmValRotate(Imm)) & Imm;
+   if (AfterShifter == 0)
+     return false;
+
+   // If the remainder is itself an immediate, accept.
+   if (getT2SOImmVal(AfterShifter) != -1)
+     return true;
+
+   // Otherwise try masking out a splat value first.
+   unsigned AfterSplat = Imm;
+   if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
+     AfterSplat &= ~0xff00ff00U;
+   else if (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1)
+     AfterSplat &= ~0x00ff00ffU;
+
+   // Accept only if what is left can be handled as an immediate.
+   return getT2SOImmVal(AfterSplat) != -1;
+ }
+
+ /// getT2SOImmTwoPartFirst - For a value \p Imm that satisfies
+ /// isT2SOImmTwoPartVal (asserted), return the first of the two parts. The
+ /// other part is Imm with these bits removed; see getT2SOImmTwoPartSecond.
+ static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
+   assert(isT2SOImmTwoPartVal(Imm) &&
+          "Immediate cannot be encoded as two part immediate!");
+   // Try a shifter operand as one part.
+   unsigned V = rotr32(~255U, getT2SOImmValRotate(Imm)) & Imm;
+   // If the rest is encodable as an immediate, then return it.
+   if (getT2SOImmVal(V) != -1)
+     return V;
+
+   // Try masking out a splat value first.
+   if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
+     return Imm & 0xff00ff00U;
+
+   // The other splat is all that's left as an option.
+   assert(getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
+   return Imm & 0x00ff00ffU;
+ }
+
+ /// getT2SOImmTwoPartSecond - Return what remains of \p Imm once the part
+ /// chosen by getT2SOImmTwoPartFirst is removed. The remainder is asserted
+ /// to be encodable as a Thumb-2 shifter-operand immediate.
+ static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
+   // Mask out the first hunk.
+   const unsigned Rest = Imm ^ getT2SOImmTwoPartFirst(Imm);
+   assert(getT2SOImmVal(Rest) != -1 &&
+          "Unable to encode second part of T2 two part SO immediate");
+   return Rest;
+ }
+
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #2
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for most simple load/store instructions.
+ //
+ // addrmode2 := reg +/- reg shop imm
+ // addrmode2 := reg +/- imm12
+ //
+ // The first operand is always a Reg. The second operand is a reg if in
+ // reg/reg form, otherwise it's reg#0. The third field encodes the operation
+ // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
+ // fourth operand 16-17 encodes the index mode.
+ //
+ // If this addressing mode is a frame index (before prolog/epilog insertion
+ // and code rewriting), this operand will have the form: FI#, reg0, <offs>
+ // with no shift amount for the frame offset.
+ //
+ /// getAM2Opc - Encode the addrmode2 opc field: the immediate in bits 0-11,
+ /// the sub flag in bit 12, the shift opcode from bit 13 and the index mode
+ /// from bit 16. \p Imm12 must be below 1 << 12 (asserted).
+ static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
+                                  unsigned IdxMode = 0) {
+   assert(Imm12 < (1 << 12) && "Imm too large!");
+   const unsigned SubBit = (Opc == sub) ? (1u << 12) : 0u;
+   return Imm12 | SubBit | (SO << 13) | (IdxMode << 16);
+ }
+ /// getAM2Offset - Extract the 12-bit immediate (bits 0-11) from an
+ /// addrmode2 opc value.
+ static inline unsigned getAM2Offset(unsigned AM2Opc) {
+   const unsigned Imm12Mask = (1 << 12) - 1;
+   return AM2Opc & Imm12Mask;
+ }
+ /// getAM2Op - Decode bit 12 of an addrmode2 opc value: set means sub, clear
+ /// means add.
+ static inline AddrOpc getAM2Op(unsigned AM2Opc) {
+   const bool IsSub = ((AM2Opc >> 12) & 1) != 0;
+   if (IsSub)
+     return sub;
+   return add;
+ }
+ /// getAM2ShiftOpc - Extract the shift opcode (bits 13-15) from an addrmode2
+ /// opc value.
+ static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
+   const unsigned ShiftField = (AM2Opc >> 13) & 7;
+   return static_cast<ShiftOpc>(ShiftField);
+ }
+ /// getAM2IdxMode - Extract the index mode (bits 16 and up) from an
+ /// addrmode2 opc value.
+ static inline unsigned getAM2IdxMode(unsigned AM2Opc) {
+   const unsigned IdxField = AM2Opc >> 16;
+   return IdxField;
+ }
+
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #3
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for sign-extending loads, and load/store-pair instructions.
+ //
+ // addrmode3 := reg +/- reg
+ // addrmode3 := reg +/- imm8
+ //
+ // The first operand is always a Reg. The second operand is a reg if in
+ // reg/reg form, otherwise it's reg#0. The third field encodes the operation
+ // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
+ // index mode.
+
+ /// getAM3Opc - Encode the addrmode3 opc field: the offset in bits 0-7, the
+ /// sub flag in bit 8 and the index mode from bit 9.
+ static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
+                                  unsigned IdxMode = 0) {
+   const unsigned SubBit = (Opc == sub) ? (1u << 8) : 0u;
+   return SubBit | Offset | (IdxMode << 9);
+ }
+ /// getAM3Offset - Extract the 8-bit offset (bits 0-7) from an addrmode3 opc
+ /// value.
+ static inline unsigned char getAM3Offset(unsigned AM3Opc) {
+   const unsigned LowByte = AM3Opc & 0xFF;
+   return LowByte;
+ }
+ /// getAM3Op - Decode bit 8 of an addrmode3 opc value: set means sub, clear
+ /// means add.
+ static inline AddrOpc getAM3Op(unsigned AM3Opc) {
+   const bool IsSub = ((AM3Opc >> 8) & 1) != 0;
+   if (IsSub)
+     return sub;
+   return add;
+ }
+ /// getAM3IdxMode - Extract the index mode (bits 9 and up) from an addrmode3
+ /// opc value.
+ static inline unsigned getAM3IdxMode(unsigned AM3Opc) {
+   const unsigned IdxField = AM3Opc >> 9;
+   return IdxField;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #4
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for load / store multiple instructions.
+ //
+ // addrmode4 := reg, <mode>
+ //
+ // The four modes are:
+ // IA - Increment after
+ // IB - Increment before
+ // DA - Decrement after
+ // DB - Decrement before
+ // For VFP instructions, only the IA and DB modes are valid.
+
+ /// getAM4SubMode - Extract the sub-mode (low 3 bits) from an addrmode4 mode
+ /// value.
+ static inline AMSubMode getAM4SubMode(unsigned Mode) {
+   const unsigned SubModeField = Mode & 0x7;
+   return static_cast<AMSubMode>(SubModeField);
+ }
+
+ /// getAM4ModeImm - Return the numeric value of an addrmode4 sub-mode for
+ /// use as an immediate.
+ static inline unsigned getAM4ModeImm(AMSubMode SubMode) {
+   const int ModeValue = static_cast<int>(SubMode);
+   return ModeValue;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #5
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for coprocessor instructions, such as FP load/stores.
+ //
+ // addrmode5 := reg +/- imm8*4
+ //
+ // The first operand is always a Reg. The second operand encodes the
+ // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
+
+ /// getAM5Opc - Encode the addrmode5 opc field: the offset in bits 0-7 and
+ /// the sub flag in bit 8.
+ static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
+   const unsigned SubBit = (Opc == sub) ? (1u << 8) : 0u;
+   return SubBit | Offset;
+ }
+ /// getAM5Offset - Extract the 8-bit offset (bits 0-7) from an addrmode5 opc
+ /// value.
+ static inline unsigned char getAM5Offset(unsigned AM5Opc) {
+   const unsigned LowByte = AM5Opc & 0xFF;
+   return LowByte;
+ }
+ /// getAM5Op - Decode bit 8 of an addrmode5 opc value: set means sub, clear
+ /// means add.
+ static inline AddrOpc getAM5Op(unsigned AM5Opc) {
+   const bool IsSub = ((AM5Opc >> 8) & 1) != 0;
+   if (IsSub)
+     return sub;
+   return add;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #5 FP16
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for coprocessor instructions, such as 16-bit FP load/stores.
+ //
+ // addrmode5fp16 := reg +/- imm8*2
+ //
+ // The first operand is always a Reg. The second operand encodes the
+ // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
+
+ /// getAM5FP16Opc - Encode the addrmode5fp16 opc field: the offset in bits
+ /// 0-7 and the sub flag in bit 8.
+ static inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) {
+   const unsigned SubBit = (Opc == sub) ? (1u << 8) : 0u;
+   return SubBit | Offset;
+ }
+ /// getAM5FP16Offset - Extract the 8-bit offset (bits 0-7) from an
+ /// addrmode5fp16 opc value.
+ static inline unsigned char getAM5FP16Offset(unsigned AM5Opc) {
+   const unsigned LowByte = AM5Opc & 0xFF;
+   return LowByte;
+ }
+ /// getAM5FP16Op - Decode bit 8 of an addrmode5fp16 opc value: set means
+ /// sub, clear means add.
+ static inline AddrOpc getAM5FP16Op(unsigned AM5Opc) {
+   const bool IsSub = ((AM5Opc >> 8) & 1) != 0;
+   if (IsSub)
+     return sub;
+   return add;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #6
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for NEON load / store instructions.
+ //
+ // addrmode6 := reg with optional alignment
+ //
+ // This is stored in two operands [regaddr, align]. The first is the
+ // address register. The second operand is the value of the alignment
+ // specifier in bytes or zero if no explicit alignment.
+ // Valid alignments depend on the specific instruction.
+
+ //===--------------------------------------------------------------------===//
+ // NEON Modified Immediates
+ //===--------------------------------------------------------------------===//
+ //
+ // Several NEON instructions (e.g., VMOV) take a "modified immediate"
+ // vector operand, where a small immediate encoded in the instruction
+ // specifies a full NEON vector value. These modified immediates are
+ // represented here as encoded integers. The low 8 bits hold the immediate
+ // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
+ // the "Cmode" field of the instruction. The interfaces below treat the
+ // Op and Cmode values as a single 5-bit value.
+
+ static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) { // Pack an Op/Cmode value and an immediate into one encoded integer.
+ return (OpCmode << 8) | Val; // OpCmode goes above bit 7, Val in the low bits; neither argument is masked here.
+ }
+ static inline unsigned getNEONModImmOpCmode(unsigned ModImm) { // Extract the combined 5-bit Op/Cmode value from an encoded modified immediate.
+ return (ModImm >> 8) & 0x1f; // Bits 12-8 of the encoded value.
+ }
+ static inline unsigned getNEONModImmVal(unsigned ModImm) { // Extract the 8-bit immediate from an encoded modified immediate.
+ return ModImm & 0xff; // Bits 7-0 of the encoded value.
+ }
+
+ /// decodeNEONModImm - Decode a NEON modified immediate value into the
+ /// element value (returned) and the element size in bits (written to EltBits as 8, 16, 32 or 64).
+ /// (If the element size is smaller than the vector, it is splatted into all the elements.) Unsupported Op/Cmode values hit llvm_unreachable.
+ static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) {
+ unsigned OpCmode = getNEONModImmOpCmode(ModImm);
+ unsigned Imm8 = getNEONModImmVal(ModImm);
+ uint64_t Val = 0;
+
+ if (OpCmode == 0xe) {
+ // 8-bit vector elements: the immediate is the element value.
+ Val = Imm8;
+ EltBits = 8;
+ } else if ((OpCmode & 0xc) == 0x8) {
+ // 16-bit vector elements: the immediate occupies byte 0 or byte 1.
+ unsigned ByteNum = (OpCmode & 0x6) >> 1;
+ Val = Imm8 << (8 * ByteNum);
+ EltBits = 16;
+ } else if ((OpCmode & 0x8) == 0) {
+ // 32-bit vector elements, zero with one byte set (byte index taken from bits 2-1 of OpCmode).
+ unsigned ByteNum = (OpCmode & 0x6) >> 1;
+ Val = Imm8 << (8 * ByteNum);
+ EltBits = 32;
+ } else if ((OpCmode & 0xe) == 0xc) {
+ // 32-bit vector elements, one byte with low bits set (the immediate in byte 1 or 2, all lower bytes 0xff).
+ unsigned ByteNum = 1 + (OpCmode & 0x1);
+ Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
+ EltBits = 32;
+ } else if (OpCmode == 0x1e) {
+ // 64-bit vector elements: bit i of the immediate selects 0xff or 0x00 for byte i.
+ for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+ if ((ModImm >> ByteNum) & 1)
+ Val |= (uint64_t)0xff << (8 * ByteNum);
+ }
+ EltBits = 64;
+ } else {
+ llvm_unreachable("Unsupported NEON immediate");
+ }
+ return Val;
+ }
+
+ // Generic validation for single-byte immediate (0X00, 00X0, etc): returns true when exactly one of the low Size bytes of Value is non-zero.
+ static inline bool isNEONBytesplat(unsigned Value, unsigned Size) {
+ assert(Size >= 1 && Size <= 4 && "Invalid size");
+ unsigned count = 0; // Number of non-zero bytes seen so far.
+ for (unsigned i = 0; i < Size; ++i) {
+ if (Value & 0xff) count++;
+ Value >>= 8; // Move on to the next byte; bytes at index Size and above are never examined.
+ }
+ return count == 1;
+ }
+
+ /// Checks if Value is a correct 16-bit splat immediate for instructions like VBIC/VORR.
+ static inline bool isNEONi16splat(unsigned Value) {
+ if (Value > 0xffff) // Anything that does not fit in 16 bits is rejected outright.
+ return false;
+ // i16 value with set bits only in one byte X0 or 0X; zero is accepted as well.
+ return Value == 0 || isNEONBytesplat(Value, 2);
+ }
+
+ // Encode NEON 16 bits Splat immediate for instructions like VBIC/VORR. Value must satisfy isNEONi16splat (asserted).
+ static inline unsigned encodeNEONi16splat(unsigned Value) {
+ assert(isNEONi16splat(Value) && "Invalid NEON splat value");
+ if (Value >= 0x100)
+ Value = (Value >> 8) | 0xa00; // Set byte is the high one: move it to bits 7-0 and tag the result with 0xa00.
+ else
+ Value |= 0x800; // Set byte (or zero) is already in bits 7-0: tag the result with 0x800.
+ return Value;
+ }
+
+ /// Checks if Value is a correct 32-bit splat immediate for instructions like VBIC/VORR.
+ static inline bool isNEONi32splat(unsigned Value) {
+ // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X; zero is accepted as well.
+ return Value == 0 || isNEONBytesplat(Value, 4);
+ }
+
+ /// Encode NEON 32 bits Splat immediate for instructions like VBIC/VORR. Value must satisfy isNEONi32splat (asserted).
+ static inline unsigned encodeNEONi32splat(unsigned Value) {
+ assert(isNEONi32splat(Value) && "Invalid NEON splat value");
+ if (Value >= 0x100 && Value <= 0xff00)
+ Value = (Value >> 8) | 0x200; // Set byte is byte 1: move it to bits 7-0 and tag with 0x200.
+ else if (Value > 0xffff && Value <= 0xff0000)
+ Value = (Value >> 16) | 0x400; // Set byte is byte 2: move it to bits 7-0 and tag with 0x400.
+ else if (Value > 0xffffff)
+ Value = (Value >> 24) | 0x600; // Set byte is byte 3: move it to bits 7-0 and tag with 0x600.
+ return Value; // Values up to 0xff fall through unchanged (no tag bits).
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Floating-point Immediates
+ //
+ static inline float getFPImmFloat(unsigned Imm) { // Expand an 8-bit floating-point immediate encoding into the float it denotes.
+ // We expect an 8-bit binary encoding of a floating-point number here.
+ union {
+ uint32_t I;
+ float F;
+ } FPUnion; // The bit pattern is assembled in I and read back through F.
+
+ uint8_t Sign = (Imm >> 7) & 0x1;
+ uint8_t Exp = (Imm >> 4) & 0x7;
+ uint8_t Mantissa = Imm & 0xf;
+
+ // 8-bit FP  ->  IEEE Float Encoding
+ // abcd efgh ->  aBbbbbbc defgh000 00000000 00000000
+ //
+ // where B = NOT(b);
+
+ FPUnion.I = 0;
+ FPUnion.I |= Sign << 31; // a: sign bit.
+ FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; // B: inverse of the top exponent bit b.
+ FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; // bbbbb: b replicated into bits 29-25.
+ FPUnion.I |= (Exp & 0x3) << 23; // cd: the low two exponent bits.
+ FPUnion.I |= Mantissa << 19; // efgh: top four fraction bits; the rest stay zero.
+ return FPUnion.F;
+ }
+
+ /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
+ /// floating-point value. If the value cannot be represented as an 8-bit
+ /// floating-point value, then return -1. Imm is read as a raw bit pattern: sign in bit 15, exponent in bits 14-10, fraction in bits 9-0.
+ static inline int getFP16Imm(const APInt &Imm) {
+ uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
+ int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15
+ int64_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0x3f) // Any of the low six fraction bits set: not representable.
+ return -1;
+ Mantissa >>= 6;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4; // Exp+3 is NOT(b):c:d; flipping bit 2 yields the stored b:c:d.
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+ }
+
+ static inline int getFP16Imm(const APFloat &FPImm) { // APFloat convenience overload.
+ return getFP16Imm(FPImm.bitcastToAPInt()); // Forwards the result of bitcastToAPInt() to the APInt overload.
+ }
+
+ /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+ /// floating-point value. If the value cannot be represented as an 8-bit
+ /// floating-point value, then return -1. Imm is read as a raw bit pattern: sign in bit 31, exponent in bits 30-23, fraction in bits 22-0.
+ static inline int getFP32Imm(const APInt &Imm) {
+ uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+ int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
+ int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0x7ffff) // Any of the low 19 fraction bits set: not representable.
+ return -1;
+ Mantissa >>= 19;
+ if ((Mantissa & 0xf) != Mantissa) // Defensive check that only four bits remain.
+ return -1;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4; // Exp+3 is NOT(b):c:d; flipping bit 2 yields the stored b:c:d.
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+ }
+
+ static inline int getFP32Imm(const APFloat &FPImm) { // APFloat convenience overload.
+ return getFP32Imm(FPImm.bitcastToAPInt()); // Forwards the result of bitcastToAPInt() to the APInt overload.
+ }
+
+ /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+ /// floating-point value. If the value cannot be represented as an 8-bit
+ /// floating-point value, then return -1. Imm is read as a raw bit pattern: sign in bit 63, exponent in bits 62-52, fraction in bits 51-0.
+ static inline int getFP64Imm(const APInt &Imm) {
+ uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+ int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
+ uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0xffffffffffffULL) // Any of the low 48 fraction bits set: not representable.
+ return -1;
+ Mantissa >>= 48;
+ if ((Mantissa & 0xf) != Mantissa) // Defensive check that only four bits remain.
+ return -1;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4; // Exp+3 is NOT(b):c:d; flipping bit 2 yields the stored b:c:d.
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+ }
+
+ static inline int getFP64Imm(const APFloat &FPImm) { // APFloat convenience overload.
+ return getFP64Imm(FPImm.bitcastToAPInt()); // Forwards the result of bitcastToAPInt() to the APInt overload.
+ }
+
+} // end namespace ARM_AM
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
new file mode 100644
index 000000000000..a58d5b34131b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -0,0 +1,1168 @@
+//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMAsmBackend.h"
+#include "MCTargetDesc/ARMAsmBackendDarwin.h"
+#include "MCTargetDesc/ARMAsmBackendELF.h"
+#include "MCTargetDesc/ARMAsmBackendWinCOFF.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace { // File-local helper; not visible outside this translation unit.
+class ARMELFObjectWriter : public MCELFObjectTargetWriter { // ELF target-writer description configured for ARM.
+public:
+ ARMELFObjectWriter(uint8_t OSABI) // OSABI is forwarded unchanged to the base class.
+ : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM,
+ /*HasRelocationAddend*/ false) {}
+};
+} // end anonymous namespace
+
+const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { // Return the name/offset/size/flags record for Kind, using the table that matches this backend's endianness.
+ const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
+ // Little-endian table. It *must* be in the order that the fixup_* kinds
+ // are defined in ARMFixupKinds.h, because it is indexed by Kind - FirstTargetFixupKind.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_ldst_pcrel_12", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_pcrel_10", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_pcrel_9", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_thumb_adr_pcrel_10", 0, 8,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_adr_pcrel_12", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_condbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_uncondbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_uncondbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_condbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_blx", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_blx", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_cp", 0, 8,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel},
+ // movw / movt: 16-bit immediate, but scattered into two chunks in the ARM
+ // encoding: bits 0 - 11 and bits 16 - 19.
+ {"fixup_arm_movt_hi16", 0, 20, 0},
+ {"fixup_arm_movw_lo16", 0, 20, 0},
+ {"fixup_t2_movt_hi16", 0, 20, 0},
+ {"fixup_t2_movw_lo16", 0, 20, 0},
+ {"fixup_arm_mod_imm", 0, 12, 0},
+ };
+ const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
+ // Big-endian table. It *must* be in the order that the fixup_* kinds
+ // are defined in ARMFixupKinds.h, because it is indexed by Kind - FirstTargetFixupKind.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_ldst_pcrel_12", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_pcrel_10", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_pcrel_9", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_thumb_adr_pcrel_10", 8, 8,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_adr_pcrel_12", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_condbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_uncondbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_uncondbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_condbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_blx", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_blx", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_cp", 8, 8,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_thumb_bcc", 8, 8, MCFixupKindInfo::FKF_IsPCRel},
+ // movw / movt: 16-bit immediate, but scattered into two chunks in the ARM
+ // encoding: bits 0 - 11 and bits 16 - 19.
+ {"fixup_arm_movt_hi16", 12, 20, 0},
+ {"fixup_arm_movw_lo16", 12, 20, 0},
+ {"fixup_t2_movt_hi16", 12, 20, 0},
+ {"fixup_t2_movw_lo16", 12, 20, 0},
+ {"fixup_arm_mod_imm", 20, 12, 0},
+ };
+
+ if (Kind < FirstTargetFixupKind) // Generic (non-target) kinds are described by the base class.
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return (IsLittleEndian ? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+}
+
+void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) { // Track ARM/Thumb mode switches signalled by assembler flags.
+ switch (Flag) {
+ default: // Every other flag is ignored by this backend.
+ break;
+ case MCAF_Code16: // 16-bit code flag: switch the backend to Thumb.
+ setIsThumb(true);
+ break;
+ case MCAF_Code32: // 32-bit code flag: switch the backend back to ARM.
+ setIsThumb(false);
+ break;
+ }
+}
+
+unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const { // Map a narrow Thumb opcode to its relaxed replacement; returns Op itself when no relaxation is available.
+ bool HasThumb2 = STI->getFeatureBits()[ARM::FeatureThumb2];
+ bool HasV8MBaselineOps = STI->getFeatureBits()[ARM::HasV8MBaselineOps];
+
+ switch (Op) {
+ default: // Not a relaxable opcode.
+ return Op;
+ case ARM::tBcc: // The wide forms below require Thumb2; without it the opcode is left alone.
+ return HasThumb2 ? (unsigned)ARM::t2Bcc : Op;
+ case ARM::tLDRpci:
+ return HasThumb2 ? (unsigned)ARM::t2LDRpci : Op;
+ case ARM::tADR:
+ return HasThumb2 ? (unsigned)ARM::t2ADR : Op;
+ case ARM::tB: // Unlike the cases above, the wide branch is gated on the v8-M Baseline feature.
+ return HasV8MBaselineOps ? (unsigned)ARM::t2B : Op;
+ case ARM::tCBZ: // CBZ/CBNZ relax to a hint (NOP) regardless of subtarget features.
+ return ARM::tHINT;
+ case ARM::tCBNZ:
+ return ARM::tHINT;
+ }
+}
+
+bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const { // True when Inst's opcode has a relaxed form on this subtarget.
+ if (getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode()) // getRelaxedOpcode returns its argument when nothing applies.
+ return true;
+ return false;
+}
+
+const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
+ uint64_t Value) const { // Returns a diagnostic string when Value does not fit the narrow encoding for Fixup's kind, or nullptr when it fits.
+ switch ((unsigned)Fixup.getKind()) {
+ case ARM::fixup_arm_thumb_br: {
+ // Relaxing tB to t2B. tB has a signed 12-bit displacement with the
+ // low bit being an implied zero. There's an implied +4 offset for the
+ // branch, so we adjust the other way here to determine what's
+ // encodable.
+ //
+ // Relax if the adjusted offset lies outside [-2048, 2046].
+ int64_t Offset = int64_t(Value) - 4;
+ if (Offset > 2046 || Offset < -2048)
+ return "out of range pc-relative fixup value";
+ break;
+ }
+ case ARM::fixup_arm_thumb_bcc: {
+ // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the
+ // low bit being an implied zero. There's an implied +4 offset for the
+ // branch, so we adjust the other way here to determine what's
+ // encodable.
+ //
+ // Relax if the adjusted offset lies outside [-256, 254].
+ int64_t Offset = int64_t(Value) - 4;
+ if (Offset > 254 || Offset < -256)
+ return "out of range pc-relative fixup value";
+ break;
+ }
+ case ARM::fixup_thumb_adr_pcrel_10:
+ case ARM::fixup_arm_thumb_cp: {
+ // If the immediate is negative, greater than 1020, or not a multiple
+ // of four, the wide version of the instruction must be used.
+ int64_t Offset = int64_t(Value) - 4;
+ if (Offset & 3)
+ return "misaligned pc-relative fixup value";
+ else if (Offset > 1020 || Offset < 0)
+ return "out of range pc-relative fixup value";
+ break;
+ }
+ case ARM::fixup_arm_thumb_cb: {
+ // If we have a Thumb CBZ or CBNZ instruction and its target is the next
+ // instruction it is actually out of range for the instruction.
+ // It will be changed to a NOP.
+ int64_t Offset = (Value & ~1);
+ if (Offset == 2)
+ return "will be converted to nop";
+ break;
+ }
+ default:
+ llvm_unreachable("Unexpected fixup kind in reasonForFixupRelaxation()!");
+ }
+ return nullptr;
+}
+
+bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const { // DF and Layout are not used by this implementation.
+ return reasonForFixupRelaxation(Fixup, Value); // A non-null reason string converts to true, i.e. relaxation is needed.
+}
+
+void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCInst &Res) const { // Writes the relaxed form of Inst into Res; the STI parameter is not referenced in this body.
+ unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+ // Sanity check w/ diagnostic if we get here w/ a bogus instruction:
+ // an unchanged opcode means there is nothing to relax to, which is fatal.
+ if (RelaxedOp == Inst.getOpcode()) {
+ SmallString<256> Tmp;
+ raw_svector_ostream OS(Tmp);
+ Inst.dump_pretty(OS);
+ OS << "\n";
+ report_fatal_error("unexpected instruction to relax: " + OS.str());
+ }
+
+ // If we are changing Thumb CBZ or CBNZ instruction to a NOP, aka tHINT, we
+ // have to change the operands too.
+ if ((Inst.getOpcode() == ARM::tCBZ || Inst.getOpcode() == ARM::tCBNZ) &&
+ RelaxedOp == ARM::tHINT) {
+ Res.setOpcode(RelaxedOp);
+ Res.addOperand(MCOperand::createImm(0)); // Hint immediate 0.
+ Res.addOperand(MCOperand::createImm(14)); // NOTE(review): presumably the AL (always) predicate - confirm against ARMCC.
+ Res.addOperand(MCOperand::createReg(0)); // NOTE(review): presumably the empty predicate register - confirm.
+ return;
+ }
+
+ // The rest of instructions we're relaxing have the same operands.
+ // We just need to update to the proper opcode.
+ Res = Inst;
+ Res.setOpcode(RelaxedOp);
+}
+
+bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { // Emit Count bytes of padding through OW; always returns true.
+ const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
+ const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
+ const uint32_t ARMv4_NopEncoding = 0xe1a00000; // using MOV r0,r0
+ const uint32_t ARMv6T2_NopEncoding = 0xe320f000; // NOP
+ if (isThumb()) {
+ const uint16_t nopEncoding =
+ hasNOP() ? Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding;
+ uint64_t NumNops = Count / 2;
+ for (uint64_t i = 0; i != NumNops; ++i)
+ OW->write16(nopEncoding);
+ if (Count & 1) // An odd trailing byte cannot hold a NOP; pad it with zero.
+ OW->write8(0);
+ return true;
+ }
+ // ARM mode
+ const uint32_t nopEncoding =
+ hasNOP() ? ARMv6T2_NopEncoding : ARMv4_NopEncoding;
+ uint64_t NumNops = Count / 4;
+ for (uint64_t i = 0; i != NumNops; ++i)
+ OW->write32(nopEncoding);
+ // FIXME: should this function return false when unable to write exactly
+ // 'Count' bytes with NOP encodings?
+ switch (Count % 4) { // The 1-3 leftover bytes are filled with raw data rather than an instruction.
+ default:
+ break; // No leftover bytes to write
+ case 1:
+ OW->write8(0);
+ break;
+ case 2:
+ OW->write16(0);
+ break;
+ case 3:
+ OW->write16(0);
+ OW->write8(0xa0);
+ break;
+ }
+
+ return true;
+}
+
+static uint32_t swapHalfWords(uint32_t Value, bool IsLittleEndian) { // Exchange the two 16-bit halves of Value on little-endian targets; identity on big-endian.
+ if (IsLittleEndian) {
+ // Note that the halfwords are stored high first and low second in thumb;
+ // so we need to swap the fixup value here to map properly.
+ uint32_t Swapped = (Value & 0xFFFF0000) >> 16; // Old high half becomes the new low half.
+ Swapped |= (Value & 0x0000FFFF) << 16; // Old low half becomes the new high half.
+ return Swapped;
+ } else
+ return Value;
+}
+
+static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf,
+ bool IsLittleEndian) { // Combine two 16-bit halfwords into one 32-bit value; only the low 16 bits of each argument are used.
+ uint32_t Value;
+
+ if (IsLittleEndian) { // Little-endian: FirstHalf lands in bits 15-0, SecondHalf in bits 31-16.
+ Value = (SecondHalf & 0xFFFF) << 16;
+ Value |= (FirstHalf & 0xFFFF);
+ } else { // Big-endian: FirstHalf lands in bits 31-16, SecondHalf in bits 15-0.
+ Value = (SecondHalf & 0xFFFF);
+ Value |= (FirstHalf & 0xFFFF) << 16;
+ }
+
+ return Value;
+}
+
+unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+ bool IsPCRel, MCContext *Ctx,
+ bool IsLittleEndian,
+ bool IsResolved) const { // Convert a resolved fixup Value into the bit pattern for its instruction field; when Ctx is non-null, bad values are reported through it and 0 is returned.
+ unsigned Kind = Fixup.getKind();
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ case FK_Data_1: // Plain data and section-relative fixups are stored unmodified.
+ case FK_Data_2:
+ case FK_Data_4:
+ return Value;
+ case FK_SecRel_2:
+ return Value;
+ case FK_SecRel_4:
+ return Value;
+ case ARM::fixup_arm_movt_hi16:
+ if (!IsPCRel) // movt takes the upper 16 bits; PC-relative values are left unshifted here.
+ Value >>= 16;
+ LLVM_FALLTHROUGH;
+ case ARM::fixup_arm_movw_lo16: {
+ unsigned Hi4 = (Value & 0xF000) >> 12;
+ unsigned Lo12 = Value & 0x0FFF;
+ // inst{19-16} = Hi4;
+ // inst{11-0} = Lo12;
+ Value = (Hi4 << 16) | (Lo12);
+ return Value;
+ }
+ case ARM::fixup_t2_movt_hi16:
+ if (!IsPCRel) // movt takes the upper 16 bits; PC-relative values are left unshifted here.
+ Value >>= 16;
+ LLVM_FALLTHROUGH;
+ case ARM::fixup_t2_movw_lo16: {
+ unsigned Hi4 = (Value & 0xF000) >> 12;
+ unsigned i = (Value & 0x800) >> 11;
+ unsigned Mid3 = (Value & 0x700) >> 8;
+ unsigned Lo8 = Value & 0x0FF;
+ // inst{19-16} = Hi4;
+ // inst{26} = i;
+ // inst{14-12} = Mid3;
+ // inst{7-0} = Lo8;
+ Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
+ return swapHalfWords(Value, IsLittleEndian);
+ }
+ case ARM::fixup_arm_ldst_pcrel_12:
+ // ARM PC-relative values are offset by 8: 4 is subtracted here and 4 more in the shared code below.
+ Value -= 4;
+ LLVM_FALLTHROUGH;
+ case ARM::fixup_t2_ldst_pcrel_12: {
+ // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+ Value -= 4;
+ bool isAdd = true;
+ if ((int64_t)Value < 0) { // Encode the magnitude and record the direction separately.
+ Value = -Value;
+ isAdd = false;
+ }
+ if (Ctx && Value >= 4096) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
+ Value |= isAdd << 23; // Bit 23 is set for an add, clear for a subtract.
+
+ // Same addressing mode as fixup_arm_ldst_pcrel_12,
+ // but with 16-bit halfwords swapped.
+ if (Kind == ARM::fixup_t2_ldst_pcrel_12)
+ return swapHalfWords(Value, IsLittleEndian);
+
+ return Value;
+ }
+ case ARM::fixup_arm_adr_pcrel_12: {
+ // ARM PC-relative values are offset by 8.
+ Value -= 8;
+ unsigned opc = 4; // bits {24-21}. Default to add: 0b0100
+ if ((int64_t)Value < 0) {
+ Value = -Value;
+ opc = 2; // 0b0010
+ }
+ if (Ctx && ARM_AM::getSOImmVal(Value) == -1) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
+ // Encode the immediate and shift the opcode into place.
+ return ARM_AM::getSOImmVal(Value) | (opc << 21);
+ }
+
+ case ARM::fixup_t2_adr_pcrel_12: {
+ Value -= 4; // Thumb PC-relative values are offset by 4.
+ unsigned opc = 0;
+ if ((int64_t)Value < 0) {
+ Value = -Value;
+ opc = 5;
+ }
+
+ uint32_t out = (opc << 21); // NOTE(review): no range diagnostic here; bits of Value above bit 11 are silently dropped by the masks below.
+ out |= (Value & 0x800) << 15;
+ out |= (Value & 0x700) << 4;
+ out |= (Value & 0x0FF);
+
+ return swapHalfWords(out, IsLittleEndian);
+ }
+
+ case ARM::fixup_arm_condbranch:
+ case ARM::fixup_arm_uncondbranch:
+ case ARM::fixup_arm_uncondbl:
+ case ARM::fixup_arm_condbl:
+ case ARM::fixup_arm_blx:
+ // These values don't encode the low two bits since they're always zero.
+ // Offset by 8 just as above.
+ if (const MCSymbolRefExpr *SRE =
+ dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLSCALL) // TLS call references leave the field zero.
+ return 0;
+ return 0xffffff & ((Value - 8) >> 2);
+ case ARM::fixup_t2_uncondbranch: {
+ Value = Value - 4;
+ Value >>= 1; // Low bit is not encoded.
+
+ uint32_t out = 0;
+ bool I = Value & 0x800000;
+ bool J1 = Value & 0x400000;
+ bool J2 = Value & 0x200000;
+ J1 ^= I;
+ J2 ^= I;
+
+ out |= I << 26; // S bit
+ out |= !J1 << 13; // J1 bit
+ out |= !J2 << 11; // J2 bit
+ out |= (Value & 0x1FF800) << 5; // imm6 field
+ out |= (Value & 0x0007FF); // imm11 field
+
+ return swapHalfWords(out, IsLittleEndian);
+ }
+ case ARM::fixup_t2_condbranch: {
+ Value = Value - 4;
+ Value >>= 1; // Low bit is not encoded.
+
+ uint64_t out = 0;
+ out |= (Value & 0x80000) << 7; // S bit
+ out |= (Value & 0x40000) >> 7; // J2 bit
+ out |= (Value & 0x20000) >> 4; // J1 bit
+ out |= (Value & 0x1F800) << 5; // imm6 field
+ out |= (Value & 0x007FF); // imm11 field
+
+ return swapHalfWords(out, IsLittleEndian);
+ }
+ case ARM::fixup_arm_thumb_bl: {
+ // The value doesn't encode the low bit (always zero) and is offset by
+ // four. The 32-bit immediate value is encoded as
+ // imm32 = SignExtend(S:I1:I2:imm10:imm11:0)
+ // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+ // The value is encoded into disjoint bit positions in the destination
+ // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+ // J = either J1 or J2 bit
+ //
+ // BL: xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII
+ //
+ // Note that the halfwords are stored high first, low second; so we need
+ // to transpose the fixup value here to map properly.
+ uint32_t offset = (Value - 4) >> 1;
+ uint32_t signBit = (offset & 0x800000) >> 23;
+ uint32_t I1Bit = (offset & 0x400000) >> 22;
+ uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+ uint32_t I2Bit = (offset & 0x200000) >> 21;
+ uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+ uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
+ uint32_t imm11Bits = (offset & 0x000007FF);
+
+ uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
+ uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+ (uint16_t)imm11Bits);
+ return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+ }
+ case ARM::fixup_arm_thumb_blx: {
+ // The value doesn't encode the low two bits (always zero) and is offset by
+ // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
+ // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
+ // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+ // The value is encoded into disjoint bit positions in the destination
+ // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+ // J = either J1 or J2 bit, 0 = zero.
+ //
+ // BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0
+ //
+ // Note that the halfwords are stored high first, low second; so we need
+ // to transpose the fixup value here to map properly.
+ if (Ctx && Value % 4 != 0) {
+ Ctx->reportError(Fixup.getLoc(), "misaligned ARM call destination");
+ return 0;
+ }
+
+ uint32_t offset = (Value - 4) >> 2;
+ if (const MCSymbolRefExpr *SRE =
+ dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLSCALL) // TLS call references encode a zero offset.
+ offset = 0;
+ uint32_t signBit = (offset & 0x400000) >> 22;
+ uint32_t I1Bit = (offset & 0x200000) >> 21;
+ uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+ uint32_t I2Bit = (offset & 0x100000) >> 20;
+ uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+ uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
+ uint32_t imm10LBits = (offset & 0x3FF);
+
+ uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
+ uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+ ((uint16_t)imm10LBits) << 1);
+ return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+ }
+ case ARM::fixup_thumb_adr_pcrel_10:
+ case ARM::fixup_arm_thumb_cp:
+ // On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we
+ // could have an error on our hands.
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ }
+ // Offset by 4, and don't encode the low two bits.
+ return ((Value - 4) >> 2) & 0xff;
+ case ARM::fixup_arm_thumb_cb: {
+ // CB instructions can only branch to offsets in [4, 126] in multiples of 2
+ // so ensure that the raw value LSB is zero and it lies in [2, 130].
+ // An offset of 2 will be relaxed to a NOP.
+ if (Ctx && ((int64_t)Value < 2 || Value > 0x82 || Value & 1)) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
+ // Offset by 4 and don't encode the lower bit, which is always 0.
+ // FIXME: diagnose if no Thumb2
+ uint32_t Binary = (Value - 4) >> 1;
+ return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3); // Bit 5 of the offset goes to bit 9, bits 4-0 go to bits 7-3.
+ }
+ case ARM::fixup_arm_thumb_br:
+ // Offset by 4 and don't encode the lower bit, which is always 0.
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] &&
+ !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ }
+ return ((Value - 4) >> 1) & 0x7ff;
+ case ARM::fixup_arm_thumb_bcc:
+ // Offset by 4 and don't encode the lower bit, which is always 0.
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) {
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ }
+ return ((Value - 4) >> 1) & 0xff;
+ case ARM::fixup_arm_pcrel_10_unscaled: {
+ Value = Value - 8; // ARM fixups offset by an additional word and don't
+ // need to adjust for the half-word ordering.
+ bool isAdd = true;
+ if ((int64_t)Value < 0) {
+ Value = -Value;
+ isAdd = false;
+ }
+ // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8].
+ if (Ctx && Value >= 256) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
+ Value = (Value & 0xf) | ((Value & 0xf0) << 4);
+ return Value | (isAdd << 23);
+ }
+ case ARM::fixup_arm_pcrel_10:
+ Value = Value - 4; // ARM fixups offset by an additional word and don't
+ // need to adjust for the half-word ordering.
+ LLVM_FALLTHROUGH;
+ case ARM::fixup_t2_pcrel_10: {
+ // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+ Value = Value - 4;
+ bool isAdd = true;
+ if ((int64_t)Value < 0) {
+ Value = -Value;
+ isAdd = false;
+ }
+ // These values don't encode the low two bits since they're always zero.
+ Value >>= 2;
+ if (Ctx && Value >= 256) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
+ Value |= isAdd << 23;
+
+ // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords
+ // swapped.
+ if (Kind == ARM::fixup_t2_pcrel_10)
+ return swapHalfWords(Value, IsLittleEndian);
+
+ return Value;
+ }
+ case ARM::fixup_arm_pcrel_9:
+ Value = Value - 4; // ARM fixups offset by an additional word and don't
+ // need to adjust for the half-word ordering.
+ LLVM_FALLTHROUGH;
+ case ARM::fixup_t2_pcrel_9: {
+ // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+ Value = Value - 4;
+ bool isAdd = true;
+ if ((int64_t)Value < 0) {
+ Value = -Value;
+ isAdd = false;
+ }
+ // These values don't encode the low bit since it's always zero.
+ if (Ctx && (Value & 1)) {
+ Ctx->reportError(Fixup.getLoc(), "invalid value for this fixup");
+ return 0;
+ }
+ Value >>= 1;
+ if (Ctx && Value >= 256) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
+ Value |= isAdd << 23;
+
+ // Same addressing mode as fixup_arm_pcrel_9, but with 16-bit halfwords
+ // swapped.
+ if (Kind == ARM::fixup_t2_pcrel_9)
+ return swapHalfWords(Value, IsLittleEndian);
+
+ return Value;
+ }
+ case ARM::fixup_arm_mod_imm:
+ Value = ARM_AM::getSOImmVal(Value); // A failed encoding is signalled as -1 (see fixup_arm_adr_pcrel_12), which leaves bits above 11 set in the unsigned Value.
+ if (Ctx && Value >> 12) {
+ Ctx->reportError(Fixup.getLoc(), "out of range immediate fixup value");
+ return 0;
+ }
+ return Value;
+ }
+}
+
+ /// Target hook invoked with the computed value of a fixup. It may set the
+ /// Thumb bit in \p Value, may clear \p IsResolved so that a relocation is
+ /// still emitted, and finally runs adjustFixupValue() purely for its
+ /// diagnostics (the adjusted result is discarded here).
+ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFixup &Fixup,
+ const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) {
+ const MCSymbolRefExpr *SymRef = Target.getSymA();
+ const MCSymbol *Sym = SymRef ? &SymRef->getSymbol() : nullptr;
+ const unsigned Kind = (unsigned)Fixup.getKind();
+
+ // MachO (the only user of "Value") tries to make .o files that look vaguely
+ // pre-linked, so for MOVW/MOVT and .word relocations they put the Thumb bit
+ // into the addend if possible. Other relocation types don't want this bit
+ // though (branches couldn't encode it if it *was* present, and no other
+ // relocations exist) and it can interfere with checking valid expressions.
+ const bool CarriesThumbBit = Kind == FK_Data_4 ||
+ Kind == ARM::fixup_arm_movw_lo16 ||
+ Kind == ARM::fixup_arm_movt_hi16 ||
+ Kind == ARM::fixup_t2_movw_lo16 ||
+ Kind == ARM::fixup_t2_movt_hi16;
+ if (CarriesThumbBit && Sym && Asm.isThumbFunc(Sym))
+ Value |= 1;
+
+ if (IsResolved && Kind == ARM::fixup_arm_thumb_bl) {
+ assert(Sym && "How did we resolve this?");
+
+ // If the symbol is external the linker will handle it.
+ // FIXME: Should we handle it as an optimization?
+
+ // If the symbol is out of range, produce a relocation and hope the
+ // linker can handle it. GNU AS produces an error in this case.
+ if (Sym->isExternal() || Value >= 0x400004)
+ IsResolved = false;
+ }
+
+ // We must always generate a relocation for BL/BLX instructions if we have
+ // a symbol to reference, as the linker relies on knowing the destination
+ // symbol's thumb-ness to get interworking right.
+ const bool IsLinkingBranch = Kind == ARM::fixup_arm_thumb_blx ||
+ Kind == ARM::fixup_arm_blx ||
+ Kind == ARM::fixup_arm_uncondbl ||
+ Kind == ARM::fixup_arm_condbl;
+ if (SymRef && IsLinkingBranch)
+ IsResolved = false;
+
+ // Try to get the encoded value for the fixup as-if we're mapping it into
+ // the instruction. This allows adjustFixupValue() to issue a diagnostic
+ // if the value is invalid.
+ (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(),
+ IsLittleEndian, IsResolved);
+ }
+
+ /// getFixupKindNumBytes - The number of bytes the fixup may change.
+ ///
+ /// applyFixup() ORs exactly this many bytes of the adjusted value into the
+ /// fragment data. Any kind not listed below reaches llvm_unreachable.
+ static unsigned getFixupKindNumBytes(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+
+ // Kinds that patch a single byte.
+ case FK_Data_1:
+ case ARM::fixup_arm_thumb_bcc:
+ case ARM::fixup_arm_thumb_cp:
+ case ARM::fixup_thumb_adr_pcrel_10:
+ return 1;
+
+ // Kinds that patch two bytes.
+ case FK_Data_2:
+ case ARM::fixup_arm_thumb_br:
+ case ARM::fixup_arm_thumb_cb:
+ case ARM::fixup_arm_mod_imm:
+ return 2;
+
+ // Kinds that patch three bytes.
+ case ARM::fixup_arm_pcrel_10_unscaled:
+ case ARM::fixup_arm_ldst_pcrel_12:
+ case ARM::fixup_arm_pcrel_10:
+ case ARM::fixup_arm_pcrel_9:
+ case ARM::fixup_arm_adr_pcrel_12:
+ case ARM::fixup_arm_uncondbl:
+ case ARM::fixup_arm_condbl:
+ case ARM::fixup_arm_blx:
+ case ARM::fixup_arm_condbranch:
+ case ARM::fixup_arm_uncondbranch:
+ return 3;
+
+ // Kinds that patch all four bytes.
+ case FK_Data_4:
+ case ARM::fixup_t2_ldst_pcrel_12:
+ case ARM::fixup_t2_condbranch:
+ case ARM::fixup_t2_uncondbranch:
+ case ARM::fixup_t2_pcrel_10:
+ case ARM::fixup_t2_pcrel_9:
+ case ARM::fixup_t2_adr_pcrel_12:
+ case ARM::fixup_arm_thumb_bl:
+ case ARM::fixup_arm_thumb_blx:
+ case ARM::fixup_arm_movt_hi16:
+ case ARM::fixup_arm_movw_lo16:
+ case ARM::fixup_t2_movt_hi16:
+ case ARM::fixup_t2_movw_lo16:
+ return 4;
+
+ // Section-relative data kinds: two and four bytes respectively.
+ case FK_SecRel_2:
+ return 2;
+ case FK_SecRel_4:
+ return 4;
+ }
+ }
+
+ /// getFixupKindContainerSizeBytes - The number of bytes of the
+ /// container involved in big endian.
+ ///
+ /// applyFixup() uses this size, for big-endian targets only, to index the
+ /// bytes of the data or instruction word from its last byte backwards. Every
+ /// instruction fixup kind accepted by getFixupKindNumBytes() needs an entry
+ /// here, otherwise applying it on a big-endian target reaches
+ /// llvm_unreachable.
+ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+
+ case FK_Data_1:
+ return 1;
+ case FK_Data_2:
+ return 2;
+ case FK_Data_4:
+ return 4;
+
+ case ARM::fixup_arm_thumb_bcc:
+ case ARM::fixup_arm_thumb_cp:
+ case ARM::fixup_thumb_adr_pcrel_10:
+ case ARM::fixup_arm_thumb_br:
+ case ARM::fixup_arm_thumb_cb:
+ // Instruction size is 2 bytes.
+ return 2;
+
+ case ARM::fixup_arm_pcrel_10_unscaled:
+ case ARM::fixup_arm_ldst_pcrel_12:
+ case ARM::fixup_arm_pcrel_10:
+ // The pcrel_9 kinds were missing here although getFixupKindNumBytes()
+ // handles them.
+ case ARM::fixup_arm_pcrel_9:
+ case ARM::fixup_arm_adr_pcrel_12:
+ case ARM::fixup_arm_uncondbl:
+ case ARM::fixup_arm_condbl:
+ case ARM::fixup_arm_blx:
+ case ARM::fixup_arm_condbranch:
+ case ARM::fixup_arm_uncondbranch:
+ case ARM::fixup_t2_ldst_pcrel_12:
+ case ARM::fixup_t2_condbranch:
+ case ARM::fixup_t2_uncondbranch:
+ case ARM::fixup_t2_pcrel_10:
+ case ARM::fixup_t2_pcrel_9:
+ case ARM::fixup_t2_adr_pcrel_12:
+ case ARM::fixup_arm_thumb_bl:
+ case ARM::fixup_arm_thumb_blx:
+ case ARM::fixup_arm_movt_hi16:
+ case ARM::fixup_arm_movw_lo16:
+ case ARM::fixup_t2_movt_hi16:
+ case ARM::fixup_t2_movw_lo16:
+ case ARM::fixup_arm_mod_imm:
+ // Instruction size is 4 bytes.
+ return 4;
+ }
+ }
+
+ /// Merge the adjusted value of \p Fixup into \p Data at the fixup's offset.
+ /// The value is first encoded by adjustFixupValue(); an encoded value of
+ /// zero leaves the data untouched. Bytes are OR-ed in, least significant
+ /// first, walking forwards for little endian and backwards from the end of
+ /// the container for big endian.
+ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
+ const unsigned ByteCount = getFixupKindNumBytes(Fixup.getKind());
+ Value =
+ adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true);
+ if (Value == 0)
+ return; // Doesn't change encoding.
+
+ const unsigned Start = Fixup.getOffset();
+ assert(Start + ByteCount <= DataSize && "Invalid fixup offset!");
+
+ // Size of the enclosing word; only computed and used for big endian.
+ unsigned ContainerBytes;
+ if (!IsLittleEndian) {
+ ContainerBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
+ assert((Start + ContainerBytes) <= DataSize && "Invalid fixup size!");
+ assert(ByteCount <= ContainerBytes && "Invalid fixup size!");
+ }
+
+ // The value has already been split into its bitfields by
+ // adjustFixupValue(), so each touched byte only needs the matching eight
+ // bits OR-ed in.
+ for (unsigned Byte = 0; Byte < ByteCount; ++Byte) {
+ const unsigned Pos =
+ IsLittleEndian ? Byte : (ContainerBytes - 1 - Byte);
+ Data[Start + Pos] |= uint8_t((Value >> (Byte * 8)) & 0xff);
+ }
+ }
+
+ namespace CU {
+
+ /// \brief Compact unwind encoding values.
+ ///
+ /// These constants are combined by generateCompactUnwindEncoding() into the
+ /// 32-bit encoding it returns.
+ enum CompactUnwindEncodings {
+ // Mode field and the mode values stored in it.
+ UNWIND_ARM_MODE_MASK = 0x0F000000,
+ UNWIND_ARM_MODE_FRAME = 0x01000000,
+ UNWIND_ARM_MODE_FRAME_D = 0x02000000,
+ UNWIND_ARM_MODE_DWARF = 0x04000000,
+
+ // Field holding the stack adjust (the encoder stores 4, 8 or 12 bytes as
+ // 0x00400000, 0x00800000 or 0x00C00000).
+ UNWIND_ARM_FRAME_STACK_ADJUST_MASK = 0x00C00000,
+
+ // One bit per saved register r4-r6.
+ UNWIND_ARM_FRAME_FIRST_PUSH_R4 = 0x00000001,
+ UNWIND_ARM_FRAME_FIRST_PUSH_R5 = 0x00000002,
+ UNWIND_ARM_FRAME_FIRST_PUSH_R6 = 0x00000004,
+
+ // One bit per saved register r8-r12.
+ UNWIND_ARM_FRAME_SECOND_PUSH_R8 = 0x00000008,
+ UNWIND_ARM_FRAME_SECOND_PUSH_R9 = 0x00000010,
+ UNWIND_ARM_FRAME_SECOND_PUSH_R10 = 0x00000020,
+ UNWIND_ARM_FRAME_SECOND_PUSH_R11 = 0x00000040,
+ UNWIND_ARM_FRAME_SECOND_PUSH_R12 = 0x00000080,
+
+ // Field for the saved D-register count (the encoder stores count - 1,
+ // shifted left by 8).
+ UNWIND_ARM_FRAME_D_REG_COUNT_MASK = 0x00000F00,
+
+ UNWIND_ARM_DWARF_SECTION_OFFSET = 0x00FFFFFF
+ };
+
+ } // end CU namespace
+
+ /// Generate compact unwind encoding for the function based on the CFI
+ /// instructions. If the CFI instructions describe a frame that cannot be
+ /// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which
+ /// tells the runtime to fall back and unwind using dwarf.
+ ///
+ /// Returns 0 when the subtype is not armv7k, when there are no CFI
+ /// instructions, or when the CFA is still SP+0 after processing them.
+ /// Otherwise the frame must use r7 as the CFA register with lr and r7 saved
+ /// directly below the CFA, followed by contiguously saved GPRs and at most
+ /// four contiguously saved D-registers.
+ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const {
+ DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n");
+ // Only armv7k uses CFI based unwinding.
+ if (Subtype != MachO::CPU_SUBTYPE_ARM_V7K)
+ return 0;
+ // No .cfi directives means no frame.
+ if (Instrs.empty())
+ return 0;
+ // Start off assuming CFA is at SP+0.
+ int CFARegister = ARM::SP;
+ int CFARegisterOffset = 0;
+ // Mark savable registers as initially unsaved
+ DenseMap<unsigned, int> RegOffsets;
+ int FloatRegCount = 0;
+ // Process each .cfi directive and build up compact unwind info.
+ for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
+ int Reg;
+ const MCCFIInstruction &Inst = Instrs[i];
+ switch (Inst.getOperation()) {
+ case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa
+ CFARegisterOffset = -Inst.getOffset();
+ CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ break;
+ case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset
+ CFARegisterOffset = -Inst.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register
+ CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ break;
+ case MCCFIInstruction::OpOffset: // DW_CFA_offset
+ Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+ RegOffsets[Reg] = Inst.getOffset();
+ else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) {
+ RegOffsets[Reg] = Inst.getOffset();
+ ++FloatRegCount;
+ } else {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << ".cfi_offset on unknown register="
+ << Inst.getRegister() << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ break;
+ case MCCFIInstruction::OpRelOffset: // DW_CFA_advance_loc
+ // Ignore
+ break;
+ default:
+ // Directive not convertible to compact unwind, bail out.
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs()
+ << "CFI directive not compatiable with comact "
+ "unwind encoding, opcode=" << Inst.getOperation()
+ << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ }
+
+ // If no frame set up, return no unwind info.
+ if ((CFARegister == ARM::SP) && (CFARegisterOffset == 0))
+ return 0;
+
+ // Verify standard frame (lr/r7) was used.
+ if (CFARegister != ARM::R7) {
+ DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is "
+ << CFARegister
+ << " instead of r7\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ int StackAdjust = CFARegisterOffset - 8;
+ if (RegOffsets.lookup(ARM::LR) != (-4 - StackAdjust)) {
+ // Report the offset recorded for LR itself. lookup() is used rather than
+ // operator[] so that printing the diagnostic does not insert a map entry.
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs()
+ << "LR not saved as standard frame, StackAdjust="
+ << StackAdjust
+ << ", CFARegisterOffset=" << CFARegisterOffset
+ << ", lr save at offset=" << RegOffsets.lookup(ARM::LR)
+ << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ if (RegOffsets.lookup(ARM::R7) != (-8 - StackAdjust)) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << "r7 not saved as standard frame\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ uint32_t CompactUnwindEncoding = CU::UNWIND_ARM_MODE_FRAME;
+
+ // If var-args are used, there may be a stack adjust required.
+ switch (StackAdjust) {
+ case 0:
+ break;
+ case 4:
+ CompactUnwindEncoding |= 0x00400000;
+ break;
+ case 8:
+ CompactUnwindEncoding |= 0x00800000;
+ break;
+ case 12:
+ CompactUnwindEncoding |= 0x00C00000;
+ break;
+ default:
+ DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs()
+ << ".cfi_def_cfa stack adjust ("
+ << StackAdjust << ") out of range\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+
+ // If r6 is saved, it must be right below r7.
+ // The table is walked in order; every register that was saved must sit
+ // exactly one word below the previously accepted save slot.
+ static struct {
+ unsigned Reg;
+ unsigned Encoding;
+ } GPRCSRegs[] = {{ARM::R6, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R6},
+ {ARM::R5, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R5},
+ {ARM::R4, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R4},
+ {ARM::R12, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R12},
+ {ARM::R11, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R11},
+ {ARM::R10, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R10},
+ {ARM::R9, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R9},
+ {ARM::R8, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R8}};
+
+ int CurOffset = -8 - StackAdjust;
+ for (auto CSReg : GPRCSRegs) {
+ auto Offset = RegOffsets.find(CSReg.Reg);
+ if (Offset == RegOffsets.end())
+ continue;
+
+ int RegOffset = Offset->second;
+ if (RegOffset != CurOffset - 4) {
+ // Print the slot that was actually required (CurOffset - 4).
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << MRI.getName(CSReg.Reg) << " saved at "
+ << RegOffset << " but only supported at "
+ << CurOffset - 4 << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ CompactUnwindEncoding |= CSReg.Encoding;
+ CurOffset -= 4;
+ }
+
+ // If no floats saved, we are done.
+ if (FloatRegCount == 0)
+ return CompactUnwindEncoding;
+
+ // Switch mode to include D register saving.
+ CompactUnwindEncoding &= ~CU::UNWIND_ARM_MODE_MASK;
+ CompactUnwindEncoding |= CU::UNWIND_ARM_MODE_FRAME_D;
+
+ // FIXME: supporting more than 4 saved D-registers compactly would be trivial,
+ // but needs coordination with the linker and libunwind.
+ if (FloatRegCount > 4) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << "unsupported number of D registers saved ("
+ << FloatRegCount << ")\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+
+ // Floating point registers must either be saved sequentially, or we defer to
+ // DWARF. No gaps allowed here so check that each saved d-register is
+ // precisely where it should be.
+ static unsigned FPRCSRegs[] = { ARM::D8, ARM::D10, ARM::D12, ARM::D14 };
+ for (int Idx = FloatRegCount - 1; Idx >= 0; --Idx) {
+ auto Offset = RegOffsets.find(FPRCSRegs[Idx]);
+ if (Offset == RegOffsets.end()) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << FloatRegCount << " D-regs saved, but "
+ << MRI.getName(FPRCSRegs[Idx])
+ << " not saved\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ } else if (Offset->second != CurOffset - 8) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << FloatRegCount << " D-regs saved, but "
+ << MRI.getName(FPRCSRegs[Idx])
+ << " saved at " << Offset->second
+ << ", expected at " << CurOffset - 8
+ << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ CurOffset -= 8;
+ }
+
+ // The D-register count is stored as (count - 1) starting at bit 8.
+ return CompactUnwindEncoding | ((FloatRegCount - 1) << 8);
+ }
+
+ /// Map an architecture name to the MachO ARM CPU subtype. The name is parsed
+ /// with ARM::parseArch(); any architecture kind without an explicit case
+ /// below maps to CPU_SUBTYPE_ARM_V7.
+ static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) {
+ unsigned AK = ARM::parseArch(Arch);
+ switch (AK) {
+ default:
+ return MachO::CPU_SUBTYPE_ARM_V7;
+ case ARM::AK_ARMV4T:
+ return MachO::CPU_SUBTYPE_ARM_V4T;
+ case ARM::AK_ARMV5T:
+ case ARM::AK_ARMV5TE:
+ case ARM::AK_ARMV5TEJ:
+ return MachO::CPU_SUBTYPE_ARM_V5;
+ case ARM::AK_ARMV6:
+ case ARM::AK_ARMV6K:
+ return MachO::CPU_SUBTYPE_ARM_V6;
+ case ARM::AK_ARMV7A:
+ return MachO::CPU_SUBTYPE_ARM_V7;
+ case ARM::AK_ARMV7S:
+ return MachO::CPU_SUBTYPE_ARM_V7S;
+ case ARM::AK_ARMV7K:
+ return MachO::CPU_SUBTYPE_ARM_V7K;
+ case ARM::AK_ARMV6M:
+ return MachO::CPU_SUBTYPE_ARM_V6M;
+ case ARM::AK_ARMV7M:
+ return MachO::CPU_SUBTYPE_ARM_V7M;
+ case ARM::AK_ARMV7EM:
+ return MachO::CPU_SUBTYPE_ARM_V7EM;
+ }
+ }
+
+ /// Create the assembler backend matching the triple's object format:
+ /// MachO, COFF or ELF. Any other format reaches llvm_unreachable. The
+ /// returned object is allocated with new. \p isLittle is only forwarded to
+ /// the ELF backend; \p CPU and \p Options are not used in this function.
+ MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TheTriple, StringRef CPU,
+ const MCTargetOptions &Options,
+ bool isLittle) {
+ switch (TheTriple.getObjectFormat()) {
+ default:
+ llvm_unreachable("unsupported object format");
+ case Triple::MachO: {
+ // The MachO CPU subtype is derived from the triple's architecture name.
+ MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName());
+ return new ARMAsmBackendDarwin(T, TheTriple, MRI, CS);
+ }
+ case Triple::COFF:
+ assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
+ return new ARMAsmBackendWinCOFF(T, TheTriple);
+ case Triple::ELF:
+ assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target");
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new ARMAsmBackendELF(T, TheTriple, OSABI, isLittle);
+ }
+ }
+
+ /// Little-endian ARM factory: forwards to createARMAsmBackend() with
+ /// isLittle set to true.
+ MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options) {
+ return createARMAsmBackend(T, MRI, TT, CPU, Options, true);
+ }
+
+ /// Big-endian ARM factory: forwards to createARMAsmBackend() with isLittle
+ /// set to false.
+ MCAsmBackend *llvm::createARMBEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options) {
+ return createARMAsmBackend(T, MRI, TT, CPU, Options, false);
+ }
+
+ /// Little-endian Thumb factory: forwards to createARMAsmBackend() with
+ /// isLittle set to true, exactly as createARMLEAsmBackend() does.
+ MCAsmBackend *llvm::createThumbLEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options) {
+ return createARMAsmBackend(T, MRI, TT, CPU, Options, true);
+ }
+
+ /// Big-endian Thumb factory: forwards to createARMAsmBackend() with
+ /// isLittle set to false, exactly as createARMBEAsmBackend() does.
+ MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options) {
+ return createARMAsmBackend(T, MRI, TT, CPU, Options, false);
+ }
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
new file mode 100644
index 000000000000..84caaacc47d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -0,0 +1,80 @@
+//===-- ARMAsmBackend.h - ARM Assembler Backend -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H
+
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+namespace llvm {
+
+ /// Common ARM/Thumb assembler backend. The object-format specific subclasses
+ /// supply createObjectWriter().
+ class ARMAsmBackend : public MCAsmBackend {
+ // Subtarget info created in the constructor and deleted in the destructor.
+ const MCSubtargetInfo *STI;
+ bool isThumbMode; // Currently emitting Thumb code.
+ bool IsLittleEndian; // Big or little endian.
+ public:
+ /// Thumb mode starts enabled when the triple's arch name begins with
+ /// "thumb". The subtarget info is created with empty CPU and feature
+ /// strings.
+ ARMAsmBackend(const Target &T, const Triple &TT, bool IsLittle)
+ : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
+ isThumbMode(TT.getArchName().startswith("thumb")),
+ IsLittleEndian(IsLittle) {}
+
+ ~ARMAsmBackend() override { delete STI; }
+
+ unsigned getNumFixupKinds() const override {
+ return ARM::NumTargetFixupKinds;
+ }
+
+ /// True when the subtarget has the HasV6T2Ops feature bit set.
+ bool hasNOP() const { return STI->getFeatureBits()[ARM::HasV6T2Ops]; }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+ /// processFixupValue - Target hook to process the literal value of a fixup
+ /// if necessary.
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override;
+
+ /// Encode \p Value into the bit layout required by the fixup's kind. When
+ /// \p Ctx is non-null, values that cannot be encoded are reported through
+ /// it.
+ unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel,
+ MCContext *Ctx, bool IsLittleEndian,
+ bool IsResolved) const;
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ unsigned getRelaxedOpcode(unsigned Op) const;
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override;
+
+ const char *reasonForFixupRelaxation(const MCFixup &Fixup,
+ uint64_t Value) const;
+
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override;
+
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override;
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+ void handleAssemblerFlag(MCAssemblerFlag Flag) override;
+
+ // Simple accessors for the pointer size (always 4) and the mode flags.
+ unsigned getPointerSize() const { return 4; }
+ bool isThumb() const { return isThumbMode; }
+ void setIsThumb(bool it) { isThumbMode = it; }
+ bool isLittle() const { return IsLittleEndian; }
+ };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
new file mode 100644
index 000000000000..09dc0173ade6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -0,0 +1,36 @@
+//===-- ARMAsmBackendDarwin.h ARM Asm Backend Darwin ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
+
+#include "ARMAsmBackend.h"
+#include "llvm/Support/MachO.h"
+
+namespace llvm {
+ /// MachO flavour of the ARM assembler backend. It is always constructed as
+ /// little endian and carries the MachO CPU subtype used by the object
+ /// writer and by compact unwind generation.
+ class ARMAsmBackendDarwin : public ARMAsmBackend {
+ // Register info used when generating the compact unwind encoding.
+ const MCRegisterInfo &MRI;
+ public:
+ const MachO::CPUSubTypeARM Subtype;
+ ARMAsmBackendDarwin(const Target &T, const Triple &TT,
+ const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st)
+ : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) {
+ }
+
+ /// Creates a 32-bit MachO object writer for CPU_TYPE_ARM and Subtype.
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
+ Subtype);
+ }
+
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override;
+ };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
new file mode 100644
index 000000000000..748f915be17b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
@@ -0,0 +1,31 @@
+//===-- ARMAsmBackendELF.h ARM Asm Backend ELF -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H
+#define LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H
+
+#include "ARMAsmBackend.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+using namespace llvm;
+
+namespace {
+ /// ELF flavour of the ARM assembler backend. It stores the OS ABI byte and
+ /// passes it, together with the endianness, to the ELF object writer.
+ class ARMAsmBackendELF : public ARMAsmBackend {
+ public:
+ uint8_t OSABI;
+ ARMAsmBackendELF(const Target &T, const Triple &TT, uint8_t OSABI,
+ bool IsLittle)
+ : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createARMELFObjectWriter(OS, OSABI, isLittle());
+ }
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
new file mode 100644
index 000000000000..2a375be49a83
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -0,0 +1,27 @@
+//===-- ARMAsmBackendWinCOFF.h - ARM Asm Backend WinCOFF --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
+
+#include "ARMAsmBackend.h"
+using namespace llvm;
+
+namespace {
+ /// Windows COFF flavour of the ARM assembler backend. It is always
+ /// constructed as little endian and creates a 32-bit COFF object writer.
+ class ARMAsmBackendWinCOFF : public ARMAsmBackend {
+ public:
+ ARMAsmBackendWinCOFF(const Target &T, const Triple &TheTriple)
+ : ARMAsmBackend(T, TheTriple, true) {}
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false);
+ }
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
new file mode 100644
index 000000000000..088b4205ed62
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -0,0 +1,466 @@
+//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the ARM target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMBASEINFO_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMBASEINFO_H
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+ // Enums corresponding to ARM condition codes
+ namespace ARMCC {
+ // The CondCodes constants map directly to the 4-bit encoding of the
+ // condition field for predicated instructions.
+ enum CondCodes { // Meaning (integer) Meaning (floating-point)
+ EQ, // Equal Equal
+ NE, // Not equal Not equal, or unordered
+ HS, // Carry set >, ==, or unordered
+ LO, // Carry clear Less than
+ MI, // Minus, negative Less than
+ PL, // Plus, positive or zero >, ==, or unordered
+ VS, // Overflow Unordered
+ VC, // No overflow Not unordered
+ HI, // Unsigned higher Greater than, or unordered
+ LS, // Unsigned lower or same Less than or equal
+ GE, // Greater than or equal Greater than or equal
+ LT, // Less than Less than, or unordered
+ GT, // Greater than Greater than
+ LE, // Less than or equal <, ==, or unordered
+ AL // Always (unconditional) Always (unconditional)
+ };
+
+ // Returns the paired inverse of CC (EQ<->NE, HS<->LO, MI<->PL, VS<->VC,
+ // HI<->LS, GE<->LT, GT<->LE). AL has no case here, so passing it reaches
+ // llvm_unreachable.
+ inline static CondCodes getOppositeCondition(CondCodes CC) {
+ switch (CC) {
+ default: llvm_unreachable("Unknown condition code");
+ case EQ: return NE;
+ case NE: return EQ;
+ case HS: return LO;
+ case LO: return HS;
+ case MI: return PL;
+ case PL: return MI;
+ case VS: return VC;
+ case VC: return VS;
+ case HI: return LS;
+ case LS: return HI;
+ case GE: return LT;
+ case LT: return GE;
+ case GT: return LE;
+ case LE: return GT;
+ }
+ }
+ } // namespace ARMCC
+
+ /// Returns the lower-case two-letter mnemonic suffix for a condition code
+ /// ("eq", "ne", ..., "al"). A value outside the enumerators falls out of the
+ /// switch and reaches llvm_unreachable.
+ inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
+ switch (CC) {
+ case ARMCC::EQ: return "eq";
+ case ARMCC::NE: return "ne";
+ case ARMCC::HS: return "hs";
+ case ARMCC::LO: return "lo";
+ case ARMCC::MI: return "mi";
+ case ARMCC::PL: return "pl";
+ case ARMCC::VS: return "vs";
+ case ARMCC::VC: return "vc";
+ case ARMCC::HI: return "hi";
+ case ARMCC::LS: return "ls";
+ case ARMCC::GE: return "ge";
+ case ARMCC::LT: return "lt";
+ case ARMCC::GT: return "gt";
+ case ARMCC::LE: return "le";
+ case ARMCC::AL: return "al";
+ }
+ llvm_unreachable("Unknown condition code");
+ }
+
+ // Operand values and printers for the imod and iflags operands.
+ namespace ARM_PROC {
+ // imod operand values; printed as "ie" and "id" by IModToString.
+ enum IMod {
+ IE = 2,
+ ID = 3
+ };
+
+ // iflags operand values, one bit each; printed as "f", "i" and "a".
+ enum IFlags {
+ F = 1,
+ I = 2,
+ A = 4
+ };
+
+ // Maps a single IFlags value to its letter. Any other value, including a
+ // combination of flags, reaches llvm_unreachable.
+ inline static const char *IFlagsToString(unsigned val) {
+ switch (val) {
+ default: llvm_unreachable("Unknown iflags operand");
+ case F: return "f";
+ case I: return "i";
+ case A: return "a";
+ }
+ }
+
+ // Maps an IMod value to its mnemonic; other values reach llvm_unreachable.
+ inline static const char *IModToString(unsigned val) {
+ switch (val) {
+ default: llvm_unreachable("Unknown imod operand");
+ case IE: return "ie";
+ case ID: return "id";
+ }
+ }
+ }
+
+ namespace ARM_MB {
+ // The Memory Barrier Option constants map directly to the 4-bit encoding of
+ // the option field for memory barrier operations.
+ enum MemBOpt {
+ RESERVED_0 = 0,
+ OSHLD = 1,
+ OSHST = 2,
+ OSH = 3,
+ RESERVED_4 = 4,
+ NSHLD = 5,
+ NSHST = 6,
+ NSH = 7,
+ RESERVED_8 = 8,
+ ISHLD = 9,
+ ISHST = 10,
+ ISH = 11,
+ RESERVED_12 = 12,
+ LD = 13,
+ ST = 14,
+ SY = 15
+ };
+
+ // Returns the printed form of a barrier option. Reserved encodings print as
+ // a hex immediate ("#0x0", "#0x4", ...). The *LD options print by name only
+ // when HasV8 is true and as a hex immediate otherwise. Values above 15
+ // reach llvm_unreachable.
+ inline static const char *MemBOptToString(unsigned val, bool HasV8) {
+ switch (val) {
+ default: llvm_unreachable("Unknown memory operation");
+ case SY: return "sy";
+ case ST: return "st";
+ case LD: return HasV8 ? "ld" : "#0xd";
+ case RESERVED_12: return "#0xc";
+ case ISH: return "ish";
+ case ISHST: return "ishst";
+ case ISHLD: return HasV8 ? "ishld" : "#0x9";
+ case RESERVED_8: return "#0x8";
+ case NSH: return "nsh";
+ case NSHST: return "nshst";
+ case NSHLD: return HasV8 ? "nshld" : "#0x5";
+ case RESERVED_4: return "#0x4";
+ case OSH: return "osh";
+ case OSHST: return "oshst";
+ case OSHLD: return HasV8 ? "oshld" : "#0x1";
+ case RESERVED_0: return "#0x0";
+ }
+ }
+ } // namespace ARM_MB
+
+ // Option values for the instruction synchronization barrier operand. Only
+ // value 15 (SY) has a name; values 0-14 are reserved.
+ namespace ARM_ISB {
+ enum InstSyncBOpt {
+ RESERVED_0 = 0,
+ RESERVED_1 = 1,
+ RESERVED_2 = 2,
+ RESERVED_3 = 3,
+ RESERVED_4 = 4,
+ RESERVED_5 = 5,
+ RESERVED_6 = 6,
+ RESERVED_7 = 7,
+ RESERVED_8 = 8,
+ RESERVED_9 = 9,
+ RESERVED_10 = 10,
+ RESERVED_11 = 11,
+ RESERVED_12 = 12,
+ RESERVED_13 = 13,
+ RESERVED_14 = 14,
+ SY = 15
+ };
+
+ // Returns "sy" for SY and a hex immediate ("#0x0" .. "#0xe") for the
+ // reserved values. Values above 15 reach llvm_unreachable.
+ inline static const char *InstSyncBOptToString(unsigned val) {
+ switch (val) {
+ default:
+ llvm_unreachable("Unknown memory operation");
+ case RESERVED_0: return "#0x0";
+ case RESERVED_1: return "#0x1";
+ case RESERVED_2: return "#0x2";
+ case RESERVED_3: return "#0x3";
+ case RESERVED_4: return "#0x4";
+ case RESERVED_5: return "#0x5";
+ case RESERVED_6: return "#0x6";
+ case RESERVED_7: return "#0x7";
+ case RESERVED_8: return "#0x8";
+ case RESERVED_9: return "#0x9";
+ case RESERVED_10: return "#0xa";
+ case RESERVED_11: return "#0xb";
+ case RESERVED_12: return "#0xc";
+ case RESERVED_13: return "#0xd";
+ case RESERVED_14: return "#0xe";
+ case SY: return "sy";
+ }
+ }
+ } // namespace ARM_ISB
+
+ /// isARMLowRegister - Returns true if the register is a low register (r0-r7).
+ ///
+ /// \p Reg is compared against the ARM::R0 .. ARM::R7 enumerators; every
+ /// other value yields false.
+ static inline bool isARMLowRegister(unsigned Reg) {
+ using namespace ARM;
+ switch (Reg) {
+ case R0: case R1: case R2: case R3:
+ case R4: case R5: case R6: case R7:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+
+ /// ARM Index Modes
+ ///
+ /// The values fit in two bits, matching the 3 << IndexModeShift mask
+ /// declared further down in this namespace.
+ enum IndexMode {
+ IndexModeNone = 0,
+ IndexModePre = 1,
+ IndexModePost = 2,
+ IndexModeUpd = 3
+ };
+
+ /// ARM Addressing Modes
+ ///
+ /// Values run from 0 to 16, so five bits are needed to hold them (see
+ /// AddrModeMask = 0x1f further down in this namespace).
+ enum AddrMode {
+ AddrModeNone = 0,
+ AddrMode1 = 1,
+ AddrMode2 = 2,
+ AddrMode3 = 3,
+ AddrMode4 = 4,
+ AddrMode5 = 5,
+ AddrMode6 = 6,
+ AddrModeT1_1 = 7,
+ AddrModeT1_2 = 8,
+ AddrModeT1_4 = 9,
+ AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data
+ AddrModeT2_i12 = 11,
+ AddrModeT2_i8 = 12,
+ AddrModeT2_so = 13,
+ AddrModeT2_pc = 14, // +/- i12 for pc relative data
+ AddrModeT2_i8s4 = 15, // i8 * 4
+ AddrMode_i12 = 16
+ };
+
+ /// Returns the enumerator name of \p addrmode as a string, e.g.
+ /// "AddrModeT2_i8". A value that matches no enumerator falls out of the
+ /// switch and reaches llvm_unreachable instead of running off the end of the
+ /// function without returning.
+ inline static const char *AddrModeToString(AddrMode addrmode) {
+ switch (addrmode) {
+ case AddrModeNone: return "AddrModeNone";
+ case AddrMode1: return "AddrMode1";
+ case AddrMode2: return "AddrMode2";
+ case AddrMode3: return "AddrMode3";
+ case AddrMode4: return "AddrMode4";
+ case AddrMode5: return "AddrMode5";
+ case AddrMode6: return "AddrMode6";
+ case AddrModeT1_1: return "AddrModeT1_1";
+ case AddrModeT1_2: return "AddrModeT1_2";
+ case AddrModeT1_4: return "AddrModeT1_4";
+ case AddrModeT1_s: return "AddrModeT1_s";
+ case AddrModeT2_i12: return "AddrModeT2_i12";
+ case AddrModeT2_i8: return "AddrModeT2_i8";
+ case AddrModeT2_so: return "AddrModeT2_so";
+ case AddrModeT2_pc: return "AddrModeT2_pc";
+ case AddrModeT2_i8s4: return "AddrModeT2_i8s4";
+ case AddrMode_i12: return "AddrMode_i12";
+ }
+ // Same pattern as ARMCondCodeToString: keep the switch default-free so the
+ // compiler can flag newly added enumerators, and trap anything else here.
+ llvm_unreachable("Unknown addressing mode");
+ }
+
+ /// Target Operand Flag enum.
+ ///
+ /// The low five bits (MO_OPTION_MASK) hold MO_NO_FLAG, MO_LO16 or MO_HI16;
+ /// MO_DLLIMPORT, MO_SECREL and MO_NONLAZY are individual bits above that
+ /// mask.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // ARM Specific MachineOperand flags.
+
+ MO_NO_FLAG = 0,
+
+ /// MO_LO16 - On a symbol operand, this represents a relocation containing
+ /// lower 16 bit of the address. Used only via movw instruction.
+ MO_LO16 = 0x1,
+
+ /// MO_HI16 - On a symbol operand, this represents a relocation containing
+ /// higher 16 bit of the address. Used only via movt instruction.
+ MO_HI16 = 0x2,
+
+ /// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
+ /// just that part of the flag set.
+ MO_OPTION_MASK = 0x1f,
+
+ /// MO_DLLIMPORT - On a symbol operand, this represents that the reference
+ /// to the symbol is for an import stub. This is used for DLL import
+ /// storage class indication on Windows.
+ MO_DLLIMPORT = 0x20,
+
+ /// MO_SECREL - On a symbol operand this indicates that the immediate is
+ /// the offset from beginning of section.
+ ///
+ /// This is the TLS offset for the COFF/Windows TLS mechanism.
+ MO_SECREL = 0x40,
+
+ /// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it
+ /// represents a symbol which, if indirect, will get special Darwin mangling
+ /// as a non-lazy-ptr indirect symbol (i.e. "L_FOO$non_lazy_ptr"). Can be
+ /// combined with MO_LO16, MO_HI16 or MO_NO_FLAG (in a constant-pool, for
+ /// example).
+ MO_NONLAZY = 0x80,
+
+ // It's undefined behaviour if an enum overflows the range between its
+ // smallest and largest values, but since these are |ed together, it can
+ // happen. Put a sentinel in (values of this enum are stored as "unsigned
+ // char").
+ MO_UNUSED_MAXIMUM = 0xff
+ };
+
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction Flags.
+
+ //===------------------------------------------------------------------===//
+ // This five-bit field describes the addressing mode used.
+ AddrModeMask = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h
+
+ // IndexMode - Unindexed, pre-indexed, or post-indexed; valid for load
+ // and store ops only. A generic "updating" flag is used for ld/st multiple.
+ // The index mode enums are declared in ARMBaseInfo.h
+ IndexModeShift = 5,
+ IndexModeMask = 3 << IndexModeShift,
+
+ //===------------------------------------------------------------------===//
+ // Instruction encoding formats.
+ //
+ FormShift = 7,
+ FormMask = 0x3f << FormShift,
+
+ // Pseudo instructions
+ Pseudo = 0 << FormShift,
+
+ // Multiply instructions
+ MulFrm = 1 << FormShift,
+
+ // Branch instructions
+ BrFrm = 2 << FormShift,
+ BrMiscFrm = 3 << FormShift,
+
+ // Data Processing instructions
+ DPFrm = 4 << FormShift,
+ DPSoRegFrm = 5 << FormShift,
+
+ // Load and Store
+ LdFrm = 6 << FormShift,
+ StFrm = 7 << FormShift,
+ LdMiscFrm = 8 << FormShift,
+ StMiscFrm = 9 << FormShift,
+ LdStMulFrm = 10 << FormShift,
+
+ LdStExFrm = 11 << FormShift,
+
+ // Miscellaneous arithmetic instructions
+ ArithMiscFrm = 12 << FormShift,
+ SatFrm = 13 << FormShift,
+
+ // Extend instructions
+ ExtFrm = 14 << FormShift,
+
+ // VFP formats
+ VFPUnaryFrm = 15 << FormShift,
+ VFPBinaryFrm = 16 << FormShift,
+ VFPConv1Frm = 17 << FormShift,
+ VFPConv2Frm = 18 << FormShift,
+ VFPConv3Frm = 19 << FormShift,
+ VFPConv4Frm = 20 << FormShift,
+ VFPConv5Frm = 21 << FormShift,
+ VFPLdStFrm = 22 << FormShift,
+ VFPLdStMulFrm = 23 << FormShift,
+ VFPMiscFrm = 24 << FormShift,
+
+ // Thumb format
+ ThumbFrm = 25 << FormShift,
+
+ // Miscellaneous format
+ MiscFrm = 26 << FormShift,
+
+ // NEON formats
+ NGetLnFrm = 27 << FormShift,
+ NSetLnFrm = 28 << FormShift,
+ NDupFrm = 29 << FormShift,
+ NLdStFrm = 30 << FormShift,
+ N1RegModImmFrm= 31 << FormShift,
+ N2RegFrm = 32 << FormShift,
+ NVCVTFrm = 33 << FormShift,
+ NVDupLnFrm = 34 << FormShift,
+ N2RegVShLFrm = 35 << FormShift,
+ N2RegVShRFrm = 36 << FormShift,
+ N3RegFrm = 37 << FormShift,
+ N3RegVShFrm = 38 << FormShift,
+ NVExtFrm = 39 << FormShift,
+ NVMulSLFrm = 40 << FormShift,
+ NVTBLFrm = 41 << FormShift,
+
+ //===------------------------------------------------------------------===//
+ // Misc flags.
+
+ // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+ // it doesn't have a Rn operand.
+ UnaryDP = 1 << 13,
+
+ // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+ // a 16-bit Thumb instruction if certain conditions are met.
+ Xform16Bit = 1 << 14,
+
+ // ThumbArithFlagSetting - The instruction is a 16-bit flag setting Thumb
+ // instruction. Used by the parser to determine whether to require the 'S'
+ // suffix on the mnemonic (when not in an IT block) or preclude it (when
+ // in an IT block).
+ ThumbArithFlagSetting = 1 << 18,
+
+ //===------------------------------------------------------------------===//
+ // Code domain.
+ DomainShift = 15,
+ DomainMask = 7 << DomainShift,
+ DomainGeneral = 0 << DomainShift,
+ DomainVFP = 1 << DomainShift,
+ DomainNEON = 2 << DomainShift,
+ DomainNEONA8 = 4 << DomainShift,
+
+ //===------------------------------------------------------------------===//
+ // Field shifts - such shifts are used to set field while generating
+ // machine instructions.
+ //
+ // FIXME: This list will need adjusting/fixing as the MC code emitter
+ // takes shape and the ARMCodeEmitter.cpp bits go away.
+ ShiftTypeShift = 4,
+
+ M_BitShift = 5,
+ ShiftImmShift = 5,
+ ShiftShift = 7,
+ N_BitShift = 7,
+ ImmHiShift = 8,
+ SoRotImmShift = 8,
+ RegRsShift = 8,
+ ExtRotImmShift = 10,
+ RegRdLoShift = 12,
+ RegRdShift = 12,
+ RegRdHiShift = 16,
+ RegRnShift = 16,
+ S_BitShift = 20,
+ W_BitShift = 21,
+ AM3_I_BitShift = 22,
+ D_BitShift = 22,
+ U_BitShift = 23,
+ P_BitShift = 24,
+ I_BitShift = 25,
+ CondShift = 28
+ };
+
+} // end namespace ARMII
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
new file mode 100644
index 000000000000..6f19754b899e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -0,0 +1,289 @@
+//===-- ARMELFObjectWriter.cpp - ARM ELF Writer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+ class ARMELFObjectWriter : public MCELFObjectTargetWriter {
+ enum { DefaultEABIVersion = 0x05000000U };
+ unsigned GetRelocTypeInner(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const;
+
+
+ public:
+ ARMELFObjectWriter(uint8_t OSABI);
+
+ ~ARMELFObjectWriter() override;
+
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+
+ bool needsRelocateWithSymbol(const MCSymbol &Sym,
+ unsigned Type) const override;
+ };
+}
+
+ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
+ : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI,
+ ELF::EM_ARM,
+ /*HasRelocationAddend*/ false) {}
+
+ARMELFObjectWriter::~ARMELFObjectWriter() {}
+
+bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+                                                 unsigned Type) const {
+  // FIXME: This is extremely conservative. It really ought to be a whitelist
+  // with a clear explanation of why each relocation has to point at the
+  // symbol rather than at the section.
+  //
+  // Only R_ARM_PREL31 and R_ARM_ABS32 are reported as not needing the symbol;
+  // every other relocation type answers "yes".
+  const bool SymbolNotRequired =
+      Type == ELF::R_ARM_PREL31 || Type == ELF::R_ARM_ABS32;
+  return !SymbolNotRequired;
+}
+
+// Need to examine the Fixup when determining whether to
+// emit the relocation as an explicit symbol or as a section relative
+// offset
+unsigned ARMELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ return GetRelocTypeInner(Target, Fixup, IsPCRel);
+}
+
+unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+
+ unsigned Type = 0;
+ if (IsPCRel) {
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ report_fatal_error("unsupported relocation on symbol");
+ return ELF::R_ARM_NONE;
+ case FK_Data_4:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_REL32;
+ break;
+ case MCSymbolRefExpr::VK_TLSGD:
+ llvm_unreachable("unimplemented");
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ Type = ELF::R_ARM_TLS_IE32;
+ break;
+ case MCSymbolRefExpr::VK_ARM_GOT_PREL:
+ Type = ELF::R_ARM_GOT_PREL;
+ break;
+ case MCSymbolRefExpr::VK_ARM_PREL31:
+ Type = ELF::R_ARM_PREL31;
+ break;
+ }
+ break;
+ case ARM::fixup_arm_blx:
+ case ARM::fixup_arm_uncondbl:
+ switch (Modifier) {
+ case MCSymbolRefExpr::VK_PLT:
+ Type = ELF::R_ARM_CALL;
+ break;
+ case MCSymbolRefExpr::VK_TLSCALL:
+ Type = ELF::R_ARM_TLS_CALL;
+ break;
+ default:
+ Type = ELF::R_ARM_CALL;
+ break;
+ }
+ break;
+ case ARM::fixup_arm_condbl:
+ case ARM::fixup_arm_condbranch:
+ case ARM::fixup_arm_uncondbranch:
+ Type = ELF::R_ARM_JUMP24;
+ break;
+ case ARM::fixup_t2_condbranch:
+ Type = ELF::R_ARM_THM_JUMP19;
+ break;
+ case ARM::fixup_t2_uncondbranch:
+ Type = ELF::R_ARM_THM_JUMP24;
+ break;
+ case ARM::fixup_arm_movt_hi16:
+ Type = ELF::R_ARM_MOVT_PREL;
+ break;
+ case ARM::fixup_arm_movw_lo16:
+ Type = ELF::R_ARM_MOVW_PREL_NC;
+ break;
+ case ARM::fixup_t2_movt_hi16:
+ Type = ELF::R_ARM_THM_MOVT_PREL;
+ break;
+ case ARM::fixup_t2_movw_lo16:
+ Type = ELF::R_ARM_THM_MOVW_PREL_NC;
+ break;
+ case ARM::fixup_arm_thumb_br:
+ Type = ELF::R_ARM_THM_JUMP11;
+ break;
+ case ARM::fixup_arm_thumb_bcc:
+ Type = ELF::R_ARM_THM_JUMP8;
+ break;
+ case ARM::fixup_arm_thumb_bl:
+ case ARM::fixup_arm_thumb_blx:
+ switch (Modifier) {
+ case MCSymbolRefExpr::VK_TLSCALL:
+ Type = ELF::R_ARM_THM_TLS_CALL;
+ break;
+ default:
+ Type = ELF::R_ARM_THM_CALL;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ report_fatal_error("unsupported relocation on symbol");
+ return ELF::R_ARM_NONE;
+ case FK_Data_1:
+ switch (Modifier) {
+ default: llvm_unreachable("unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_ABS8;
+ break;
+ }
+ break;
+ case FK_Data_2:
+ switch (Modifier) {
+ default: llvm_unreachable("unsupported modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_ABS16;
+ break;
+ }
+ break;
+ case FK_Data_4:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_ARM_NONE:
+ Type = ELF::R_ARM_NONE;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ Type = ELF::R_ARM_GOT_BREL;
+ break;
+ case MCSymbolRefExpr::VK_TLSGD:
+ Type = ELF::R_ARM_TLS_GD32;
+ break;
+ case MCSymbolRefExpr::VK_TPOFF:
+ Type = ELF::R_ARM_TLS_LE32;
+ break;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ Type = ELF::R_ARM_TLS_IE32;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_ABS32;
+ break;
+ case MCSymbolRefExpr::VK_GOTOFF:
+ Type = ELF::R_ARM_GOTOFF32;
+ break;
+ case MCSymbolRefExpr::VK_ARM_GOT_PREL:
+ Type = ELF::R_ARM_GOT_PREL;
+ break;
+ case MCSymbolRefExpr::VK_ARM_TARGET1:
+ Type = ELF::R_ARM_TARGET1;
+ break;
+ case MCSymbolRefExpr::VK_ARM_TARGET2:
+ Type = ELF::R_ARM_TARGET2;
+ break;
+ case MCSymbolRefExpr::VK_ARM_PREL31:
+ Type = ELF::R_ARM_PREL31;
+ break;
+ case MCSymbolRefExpr::VK_ARM_SBREL:
+ Type = ELF::R_ARM_SBREL32;
+ break;
+ case MCSymbolRefExpr::VK_ARM_TLSLDO:
+ Type = ELF::R_ARM_TLS_LDO32;
+ break;
+ case MCSymbolRefExpr::VK_TLSCALL:
+ Type = ELF::R_ARM_TLS_CALL;
+ break;
+ case MCSymbolRefExpr::VK_TLSDESC:
+ Type = ELF::R_ARM_TLS_GOTDESC;
+ break;
+ case MCSymbolRefExpr::VK_TLSLDM:
+ Type = ELF::R_ARM_TLS_LDM32;
+ break;
+ case MCSymbolRefExpr::VK_ARM_TLSDESCSEQ:
+ Type = ELF::R_ARM_TLS_DESCSEQ;
+ break;
+ }
+ break;
+ case ARM::fixup_arm_ldst_pcrel_12:
+ case ARM::fixup_arm_pcrel_10:
+ case ARM::fixup_arm_adr_pcrel_12:
+ case ARM::fixup_arm_thumb_bl:
+ case ARM::fixup_arm_thumb_cb:
+ case ARM::fixup_arm_thumb_cp:
+ case ARM::fixup_arm_thumb_br:
+ llvm_unreachable("Unimplemented");
+ case ARM::fixup_arm_condbranch:
+ case ARM::fixup_arm_uncondbranch:
+ Type = ELF::R_ARM_JUMP24;
+ break;
+ case ARM::fixup_arm_movt_hi16:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_MOVT_ABS;
+ break;
+ case MCSymbolRefExpr::VK_ARM_SBREL:
+ Type = ELF:: R_ARM_MOVT_BREL;
+ break;
+ }
+ break;
+ case ARM::fixup_arm_movw_lo16:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_MOVW_ABS_NC;
+ break;
+ case MCSymbolRefExpr::VK_ARM_SBREL:
+ Type = ELF:: R_ARM_MOVW_BREL_NC;
+ break;
+ }
+ break;
+ case ARM::fixup_t2_movt_hi16:
+ Type = ELF::R_ARM_THM_MOVT_ABS;
+ break;
+ case ARM::fixup_t2_movw_lo16:
+ Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+ break;
+ }
+ }
+
+ return Type;
+}
+
+/// Create an ARMELFObjectWriter for \p OSABI and hand it, together with the
+/// output stream and the requested endianness, to createELFObjectWriter().
+MCObjectWriter *llvm::createARMELFObjectWriter(raw_pwrite_stream &OS,
+                                               uint8_t OSABI,
+                                               bool IsLittleEndian) {
+  return createELFObjectWriter(new ARMELFObjectWriter(OSABI), OS,
+                               IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
new file mode 100644
index 000000000000..f6bb35d2326b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -0,0 +1,1403 @@
+//===- lib/MC/ARMELFStreamer.cpp - ELF Object Output for ARM --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits ARM ELF .o object files. Different
+// from generic ELF streamer in emitting mapping symbols ($a, $t and $d) to
+// delimit regions of data and code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMRegisterInfo.h"
+#include "ARMUnwindOpAsm.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+using namespace llvm;
+
+/// Build the string "__aeabi_unwind_cpp_pr" followed by the decimal value of
+/// \p Index. The index must be below ARM::EHABI::NUM_PERSONALITY_INDEX.
+static std::string GetAEABIUnwindPersonalityName(unsigned Index) {
+  assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX &&
+         "Invalid personality index");
+  std::string Name = (Twine("__aeabi_unwind_cpp_pr") + Twine(Index)).str();
+  return Name;
+}
+
+namespace {
+
+class ARMELFStreamer;
+
+class ARMTargetAsmStreamer : public ARMTargetStreamer {
+ formatted_raw_ostream &OS;
+ MCInstPrinter &InstPrinter;
+ bool IsVerboseAsm;
+
+ void emitFnStart() override;
+ void emitFnEnd() override;
+ void emitCantUnwind() override;
+ void emitPersonality(const MCSymbol *Personality) override;
+ void emitPersonalityIndex(unsigned Index) override;
+ void emitHandlerData() override;
+ void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override;
+ void emitMovSP(unsigned Reg, int64_t Offset = 0) override;
+ void emitPad(int64_t Offset) override;
+ void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) override;
+ void emitUnwindRaw(int64_t Offset,
+ const SmallVectorImpl<uint8_t> &Opcodes) override;
+
+ void switchVendor(StringRef Vendor) override;
+ void emitAttribute(unsigned Attribute, unsigned Value) override;
+ void emitTextAttribute(unsigned Attribute, StringRef String) override;
+ void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue) override;
+ void emitArch(unsigned Arch) override;
+ void emitArchExtension(unsigned ArchExt) override;
+ void emitObjectArch(unsigned Arch) override;
+ void emitFPU(unsigned FPU) override;
+ void emitInst(uint32_t Inst, char Suffix = '\0') override;
+ void finishAttributeSection() override;
+
+ void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
+ void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override;
+
+public:
+ ARMTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+ MCInstPrinter &InstPrinter, bool VerboseAsm);
+};
+
+ARMTargetAsmStreamer::ARMTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter &InstPrinter,
+ bool VerboseAsm)
+ : ARMTargetStreamer(S), OS(OS), InstPrinter(InstPrinter),
+ IsVerboseAsm(VerboseAsm) {}
+void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
+void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
+void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; }
+void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) {
+ OS << "\t.personality " << Personality->getName() << '\n';
+}
+void ARMTargetAsmStreamer::emitPersonalityIndex(unsigned Index) {
+ OS << "\t.personalityindex " << Index << '\n';
+}
+void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; }
+void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+ int64_t Offset) {
+ OS << "\t.setfp\t";
+ InstPrinter.printRegName(OS, FpReg);
+ OS << ", ";
+ InstPrinter.printRegName(OS, SpReg);
+ if (Offset)
+ OS << ", #" << Offset;
+ OS << '\n';
+}
+void ARMTargetAsmStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+ assert((Reg != ARM::SP && Reg != ARM::PC) &&
+ "the operand of .movsp cannot be either sp or pc");
+
+ OS << "\t.movsp\t";
+ InstPrinter.printRegName(OS, Reg);
+ if (Offset)
+ OS << ", #" << Offset;
+ OS << '\n';
+}
+void ARMTargetAsmStreamer::emitPad(int64_t Offset) {
+ OS << "\t.pad\t#" << Offset << '\n';
+}
+void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                       bool isVector) {
+  assert(RegList.size() && "RegList should not be empty");
+
+  // Print ".vsave" when isVector is set and ".save" otherwise, followed by the
+  // opening brace of the register list.
+  OS << (isVector ? "\t.vsave\t{" : "\t.save\t{");
+
+  // Registers are comma-separated: the separator is written before every
+  // entry except the first one.
+  const char *Separator = "";
+  for (unsigned Reg : RegList) {
+    OS << Separator;
+    InstPrinter.printRegName(OS, Reg);
+    Separator = ", ";
+  }
+
+  OS << "}\n";
+}
+void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {
+}
+void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+ OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value);
+ if (IsVerboseAsm) {
+ StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+ if (!Name.empty())
+ OS << "\t@ " << Name;
+ }
+ OS << "\n";
+}
+void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
+                                             StringRef String) {
+  if (Attribute == ARMBuildAttrs::CPU_name) {
+    // The CPU name has its own directive and is printed in lower case.
+    OS << "\t.cpu\t" << String.lower();
+  } else {
+    // Everything else goes through the generic directive with a quoted value.
+    OS << "\t.eabi_attribute\t" << Attribute << ", \"" << String << "\"";
+    if (IsVerboseAsm) {
+      // Append the tag's symbolic name as a comment when one is known.
+      const StringRef TagName = ARMBuildAttrs::AttrTypeAsString(Attribute);
+      if (!TagName.empty())
+        OS << "\t@ " << TagName;
+    }
+  }
+  OS << "\n";
+}
+void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
+ unsigned IntValue,
+ StringRef StringValue) {
+ switch (Attribute) {
+ default: llvm_unreachable("unsupported multi-value attribute in asm mode");
+ case ARMBuildAttrs::compatibility:
+ OS << "\t.eabi_attribute\t" << Attribute << ", " << IntValue;
+ if (!StringValue.empty())
+ OS << ", \"" << StringValue << "\"";
+ if (IsVerboseAsm)
+ OS << "\t@ " << ARMBuildAttrs::AttrTypeAsString(Attribute);
+ break;
+ }
+ OS << "\n";
+}
+void ARMTargetAsmStreamer::emitArch(unsigned Arch) {
+ OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n";
+}
+void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) {
+ OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n";
+}
+void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) {
+ OS << "\t.object_arch\t" << ARM::getArchName(Arch) << '\n';
+}
+void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
+ OS << "\t.fpu\t" << ARM::getFPUName(FPU) << "\n";
+}
+void ARMTargetAsmStreamer::finishAttributeSection() {
+}
+/// Print the ".tlsdescseq" directive naming the symbol referenced by \p S.
+void
+ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
+  // Terminate the directive with a newline, as every other directive printed
+  // by this streamer does; otherwise whatever is written to OS next ends up
+  // on the same line as the directive.
+  OS << "\t.tlsdescseq\t" << S->getSymbol().getName() << '\n';
+}
+
+void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
+ const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
+
+ OS << "\t.thumb_set\t";
+ Symbol->print(OS, MAI);
+ OS << ", ";
+ Value->print(OS, MAI);
+ OS << '\n';
+}
+
+void ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) {
+ OS << "\t.inst";
+ if (Suffix)
+ OS << "." << Suffix;
+ OS << "\t0x" << Twine::utohexstr(Inst) << "\n";
+}
+
+void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset,
+                                 const SmallVectorImpl<uint8_t> &Opcodes) {
+  // Print the directive and the offset, then each opcode byte as a
+  // comma-separated hexadecimal literal.
+  OS << "\t.unwind_raw " << Offset;
+  for (uint8_t Opcode : Opcodes)
+    OS << ", 0x" << Twine::utohexstr(Opcode);
+  OS << '\n';
+}
+
+class ARMTargetELFStreamer : public ARMTargetStreamer {
+private:
+ // This structure holds all attributes, accounting for
+ // their string/numeric value, so we can later emit them
+ // in declaration order, keeping all in the same vector
+ struct AttributeItem {
+ enum {
+ HiddenAttribute = 0,
+ NumericAttribute,
+ TextAttribute,
+ NumericAndTextAttributes
+ } Type;
+ unsigned Tag;
+ unsigned IntValue;
+ std::string StringValue;
+
+ static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) {
+ // The conformance tag must be emitted first when serialised
+ // into an object file. Specifically, the addenda to the ARM ABI
+ // states that (2.3.7.4):
+ //
+ // "To simplify recognition by consumers in the common case of
+ // claiming conformity for the whole file, this tag should be
+ // emitted first in a file-scope sub-subsection of the first
+ // public subsection of the attributes section."
+ //
+ // So it is special-cased in this comparison predicate when the
+ // attributes are sorted in finishAttributeSection().
+ return (RHS.Tag != ARMBuildAttrs::conformance) &&
+ ((LHS.Tag == ARMBuildAttrs::conformance) || (LHS.Tag < RHS.Tag));
+ }
+ };
+
+ StringRef CurrentVendor;
+ unsigned FPU;
+ unsigned Arch;
+ unsigned EmittedArch;
+ SmallVector<AttributeItem, 64> Contents;
+
+ MCSection *AttributeSection;
+
+ AttributeItem *getAttributeItem(unsigned Attribute) {
+ for (size_t i = 0; i < Contents.size(); ++i)
+ if (Contents[i].Tag == Attribute)
+ return &Contents[i];
+ return nullptr;
+ }
+
+ void setAttributeItem(unsigned Attribute, unsigned Value,
+ bool OverwriteExisting) {
+ // Look for existing attribute item
+ if (AttributeItem *Item = getAttributeItem(Attribute)) {
+ if (!OverwriteExisting)
+ return;
+ Item->Type = AttributeItem::NumericAttribute;
+ Item->IntValue = Value;
+ return;
+ }
+
+ // Create new attribute item
+ AttributeItem Item = {
+ AttributeItem::NumericAttribute,
+ Attribute,
+ Value,
+ StringRef("")
+ };
+ Contents.push_back(Item);
+ }
+
+ void setAttributeItem(unsigned Attribute, StringRef Value,
+ bool OverwriteExisting) {
+ // Look for existing attribute item
+ if (AttributeItem *Item = getAttributeItem(Attribute)) {
+ if (!OverwriteExisting)
+ return;
+ Item->Type = AttributeItem::TextAttribute;
+ Item->StringValue = Value;
+ return;
+ }
+
+ // Create new attribute item
+ AttributeItem Item = {
+ AttributeItem::TextAttribute,
+ Attribute,
+ 0,
+ Value
+ };
+ Contents.push_back(Item);
+ }
+
+ void setAttributeItems(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue, bool OverwriteExisting) {
+ // Look for existing attribute item
+ if (AttributeItem *Item = getAttributeItem(Attribute)) {
+ if (!OverwriteExisting)
+ return;
+ Item->Type = AttributeItem::NumericAndTextAttributes;
+ Item->IntValue = IntValue;
+ Item->StringValue = StringValue;
+ return;
+ }
+
+ // Create new attribute item
+ AttributeItem Item = {
+ AttributeItem::NumericAndTextAttributes,
+ Attribute,
+ IntValue,
+ StringValue
+ };
+ Contents.push_back(Item);
+ }
+
+ void emitArchDefaultAttributes();
+ void emitFPUDefaultAttributes();
+
+ ARMELFStreamer &getStreamer();
+
+ void emitFnStart() override;
+ void emitFnEnd() override;
+ void emitCantUnwind() override;
+ void emitPersonality(const MCSymbol *Personality) override;
+ void emitPersonalityIndex(unsigned Index) override;
+ void emitHandlerData() override;
+ void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override;
+ void emitMovSP(unsigned Reg, int64_t Offset = 0) override;
+ void emitPad(int64_t Offset) override;
+ void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) override;
+ void emitUnwindRaw(int64_t Offset,
+ const SmallVectorImpl<uint8_t> &Opcodes) override;
+
+ void switchVendor(StringRef Vendor) override;
+ void emitAttribute(unsigned Attribute, unsigned Value) override;
+ void emitTextAttribute(unsigned Attribute, StringRef String) override;
+ void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue) override;
+ void emitArch(unsigned Arch) override;
+ void emitObjectArch(unsigned Arch) override;
+ void emitFPU(unsigned FPU) override;
+ void emitInst(uint32_t Inst, char Suffix = '\0') override;
+ void finishAttributeSection() override;
+ void emitLabel(MCSymbol *Symbol) override;
+
+ void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
+ void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override;
+
+ size_t calculateContentSize() const;
+
+ // Reset state between object emissions
+ void reset() override;
+
+public:
+ ARMTargetELFStreamer(MCStreamer &S)
+ : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::FK_INVALID),
+ Arch(ARM::AK_INVALID), EmittedArch(ARM::AK_INVALID),
+ AttributeSection(nullptr) {}
+};
+
+/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
+/// the appropriate points in the object files. These symbols are defined in the
+/// ARM ELF ABI: infocenter.arm.com/help/topic/com.arm.../IHI0044D_aaelf.pdf.
+///
+/// In brief: $a, $t or $d should be emitted at the start of each contiguous
+/// region of ARM code, Thumb code or data in a section. In practice, this
+/// emission does not rely on explicit assembler directives but on inherent
+/// properties of the directives doing the emission (e.g. ".byte" is data, "add
+/// r0, r0, r0" an instruction).
+///
+/// As a result this system is orthogonal to the DataRegion infrastructure used
+/// by MachO. Beware!
+class ARMELFStreamer : public MCELFStreamer {
+public:
+ friend class ARMTargetELFStreamer;
+
+ ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool IsThumb)
+ : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb),
+ MappingSymbolCounter(0), LastEMS(EMS_None) {
+ EHReset();
+ }
+
+ ~ARMELFStreamer() {}
+
+ void FinishImpl() override;
+
+ // ARM exception handling directives
+ void emitFnStart();
+ void emitFnEnd();
+ void emitCantUnwind();
+ void emitPersonality(const MCSymbol *Per);
+ void emitPersonalityIndex(unsigned index);
+ void emitHandlerData();
+ void emitSetFP(unsigned NewFpReg, unsigned NewSpReg, int64_t Offset = 0);
+ void emitMovSP(unsigned Reg, int64_t Offset = 0);
+ void emitPad(int64_t Offset);
+ void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector);
+ void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
+
+ void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
+ // We have to keep track of the mapping symbol state of any sections we
+ // use. Each one should start off as EMS_None, which is the
+ // default-constructed value DenseMap::lookup returns for a missing key.
+ LastMappingSymbols[getPreviousSection().first] = LastEMS;
+ LastEMS = LastMappingSymbols.lookup(Section);
+
+ MCELFStreamer::ChangeSection(Section, Subsection);
+ }
+
+ /// This function is the one used to emit instruction data into the ELF
+ /// streamer. We override it to add the appropriate mapping symbol if
+ /// necessary.
+ void EmitInstruction(const MCInst& Inst,
+ const MCSubtargetInfo &STI) override {
+ if (IsThumb)
+ EmitThumbMappingSymbol();
+ else
+ EmitARMMappingSymbol();
+
+ MCELFStreamer::EmitInstruction(Inst, STI);
+ }
+
+ void emitInst(uint32_t Inst, char Suffix) {
+ unsigned Size;
+ char Buffer[4];
+ const bool LittleEndian = getContext().getAsmInfo()->isLittleEndian();
+
+ switch (Suffix) {
+ case '\0':
+ Size = 4;
+
+ assert(!IsThumb);
+ EmitARMMappingSymbol();
+ for (unsigned II = 0, IE = Size; II != IE; II++) {
+ const unsigned I = LittleEndian ? (Size - II - 1) : II;
+ Buffer[Size - II - 1] = uint8_t(Inst >> I * CHAR_BIT);
+ }
+
+ break;
+ case 'n':
+ case 'w':
+ Size = (Suffix == 'n' ? 2 : 4);
+
+ assert(IsThumb);
+ EmitThumbMappingSymbol();
+ for (unsigned II = 0, IE = Size; II != IE; II = II + 2) {
+ const unsigned I0 = LittleEndian ? II + 0 : (Size - II - 1);
+ const unsigned I1 = LittleEndian ? II + 1 : (Size - II - 2);
+ Buffer[Size - II - 2] = uint8_t(Inst >> I0 * CHAR_BIT);
+ Buffer[Size - II - 1] = uint8_t(Inst >> I1 * CHAR_BIT);
+ }
+
+ break;
+ default:
+ llvm_unreachable("Invalid Suffix");
+ }
+
+ MCELFStreamer::EmitBytes(StringRef(Buffer, Size));
+ }
+
+ /// This is one of the functions used to emit data into an ELF section, so the
+ /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
+ /// necessary.
+ void EmitBytes(StringRef Data) override {
+ EmitDataMappingSymbol();
+ MCELFStreamer::EmitBytes(Data);
+ }
+
+ /// This is one of the functions used to emit data into an ELF section, so the
+ /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
+ /// necessary.
+ void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+ if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) {
+ getContext().reportError(Loc, "relocated expression must be 32-bit");
+ return;
+ }
+
+ EmitDataMappingSymbol();
+ MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+ }
+
+ void EmitAssemblerFlag(MCAssemblerFlag Flag) override {
+ MCELFStreamer::EmitAssemblerFlag(Flag);
+
+ switch (Flag) {
+ case MCAF_SyntaxUnified:
+ return; // no-op here.
+ case MCAF_Code16:
+ IsThumb = true;
+ return; // Change to Thumb mode
+ case MCAF_Code32:
+ IsThumb = false;
+ return; // Change to ARM mode
+ case MCAF_Code64:
+ return;
+ case MCAF_SubsectionsViaSymbols:
+ return;
+ }
+ }
+
+private:
+ enum ElfMappingSymbol {
+ EMS_None,
+ EMS_ARM,
+ EMS_Thumb,
+ EMS_Data
+ };
+
+ void EmitDataMappingSymbol() {
+ if (LastEMS == EMS_Data) return;
+ EmitMappingSymbol("$d");
+ LastEMS = EMS_Data;
+ }
+
+ void EmitThumbMappingSymbol() {
+ if (LastEMS == EMS_Thumb) return;
+ EmitMappingSymbol("$t");
+ LastEMS = EMS_Thumb;
+ }
+
+ void EmitARMMappingSymbol() {
+ if (LastEMS == EMS_ARM) return;
+ EmitMappingSymbol("$a");
+ LastEMS = EMS_ARM;
+ }
+
+ void EmitMappingSymbol(StringRef Name) {
+ auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+ Name + "." + Twine(MappingSymbolCounter++)));
+ EmitLabel(Symbol);
+
+ Symbol->setType(ELF::STT_NOTYPE);
+ Symbol->setBinding(ELF::STB_LOCAL);
+ Symbol->setExternal(false);
+ }
+
+ void EmitThumbFunc(MCSymbol *Func) override {
+ getAssembler().setIsThumbFunc(Func);
+ EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
+ }
+
+ // Helper functions for ARM exception handling directives
+ void EHReset();
+
+ // Reset state between object emissions
+ void reset() override;
+
+ void EmitPersonalityFixup(StringRef Name);
+ void FlushPendingOffset();
+ void FlushUnwindOpcodes(bool NoHandlerData);
+
+ void SwitchToEHSection(StringRef Prefix, unsigned Type, unsigned Flags,
+ SectionKind Kind, const MCSymbol &Fn);
+ void SwitchToExTabSection(const MCSymbol &FnStart);
+ void SwitchToExIdxSection(const MCSymbol &FnStart);
+
+ void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
+
+ bool IsThumb;
+ int64_t MappingSymbolCounter;
+
+ DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
+ ElfMappingSymbol LastEMS;
+
+ // ARM Exception Handling Frame Information
+ MCSymbol *ExTab;
+ MCSymbol *FnStart;
+ const MCSymbol *Personality;
+ unsigned PersonalityIndex;
+ unsigned FPReg; // Frame pointer register
+ int64_t FPOffset; // Offset: (final frame pointer) - (initial $sp)
+ int64_t SPOffset; // Offset: (final $sp) - (initial $sp)
+ int64_t PendingOffset; // Offset: (final $sp) - (emitted $sp)
+ bool UsedFP;
+ bool CantUnwind;
+ SmallVector<uint8_t, 64> Opcodes;
+ UnwindOpcodeAssembler UnwindOpAsm;
+};
+} // end anonymous namespace
+
+ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
+ return static_cast<ARMELFStreamer &>(Streamer);
+}
+
+void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); } // Each function in this run only forwards to the ARMELFStreamer.
+void ARMTargetELFStreamer::emitFnEnd() { getStreamer().emitFnEnd(); }
+void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); }
+void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) {
+ getStreamer().emitPersonality(Personality); // forwards the personality routine symbol
+}
+void ARMTargetELFStreamer::emitPersonalityIndex(unsigned Index) {
+ getStreamer().emitPersonalityIndex(Index); // forwards the personality index
+}
+void ARMTargetELFStreamer::emitHandlerData() {
+ getStreamer().emitHandlerData(); // forwards the .handlerdata notification
+}
+void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+ int64_t Offset) {
+ getStreamer().emitSetFP(FpReg, SpReg, Offset); // forwards .setfp operands unchanged
+}
+void ARMTargetELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+ getStreamer().emitMovSP(Reg, Offset); // forwards .movsp operands unchanged
+}
+void ARMTargetELFStreamer::emitPad(int64_t Offset) {
+ getStreamer().emitPad(Offset); // forwards the .pad amount unchanged
+}
+void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) {
+ getStreamer().emitRegSave(RegList, isVector); // isVector distinguishes .vsave from .save
+}
+void ARMTargetELFStreamer::emitUnwindRaw(int64_t Offset,
+ const SmallVectorImpl<uint8_t> &Opcodes) {
+ getStreamer().emitUnwindRaw(Offset, Opcodes); // forwards raw unwind opcodes unchanged
+}
+// Makes Vendor the current attribute vendor, flushing the previous one first.
+void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
+  assert(!Vendor.empty() && "Vendor cannot be empty.");
+  // Nothing to do when the requested vendor is already the current one.
+  if (CurrentVendor == Vendor)
+    return;
+  // Attributes collected for a previous vendor are written out before we move on.
+  const bool HadVendor = !CurrentVendor.empty();
+  if (HadVendor)
+    finishAttributeSection();
+  assert(Contents.empty() &&
+         ".ARM.attributes should be flushed before changing vendor");
+  CurrentVendor = Vendor;
+}
+void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) { // Records a numeric attribute.
+ setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
+ StringRef Value) { // Records a string attribute.
+ setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitIntTextAttribute(unsigned Attribute,
+ unsigned IntValue,
+ StringRef StringValue) { // Records an attribute carrying both an integer and a string.
+ setAttributeItems(Attribute, IntValue, StringValue,
+ /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitArch(unsigned Value) {
+ Arch = Value; // only remembered here; consumed by emitArchDefaultAttributes()
+}
+void ARMTargetELFStreamer::emitObjectArch(unsigned Value) {
+ EmittedArch = Value; // when valid, takes precedence over Arch for CPU_arch in emitArchDefaultAttributes()
+}
+void ARMTargetELFStreamer::emitArchDefaultAttributes() { // Adds the build attributes implied by Arch.
+ using namespace ARMBuildAttrs;
+ // Every item below passes OverwriteExisting=false (presumably leaving already-recorded attributes untouched).
+ setAttributeItem(CPU_name,
+ ARM::getCPUAttr(Arch),
+ false);
+ // CPU_arch is derived from EmittedArch when one was recorded, otherwise from Arch.
+ if (EmittedArch == ARM::AK_INVALID)
+ setAttributeItem(CPU_arch,
+ ARM::getArchAttr(Arch),
+ false);
+ else
+ setAttributeItem(CPU_arch,
+ ARM::getArchAttr(EmittedArch),
+ false);
+ // Per-architecture ISA/profile defaults; note the switch is on Arch, not EmittedArch.
+ switch (Arch) {
+ case ARM::AK_ARMV2:
+ case ARM::AK_ARMV2A:
+ case ARM::AK_ARMV3:
+ case ARM::AK_ARMV3M:
+ case ARM::AK_ARMV4:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ break;
+
+ case ARM::AK_ARMV4T:
+ case ARM::AK_ARMV5T:
+ case ARM::AK_ARMV5TE:
+ case ARM::AK_ARMV6:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ break;
+
+ case ARM::AK_ARMV6T2:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ break;
+
+ case ARM::AK_ARMV6K:
+ case ARM::AK_ARMV6KZ:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ setAttributeItem(Virtualization_use, AllowTZ, false);
+ break;
+
+ case ARM::AK_ARMV6M:
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ break;
+
+ case ARM::AK_ARMV7A:
+ setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ break;
+
+ case ARM::AK_ARMV7R:
+ setAttributeItem(CPU_arch_profile, RealTimeProfile, false);
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ break;
+
+ case ARM::AK_ARMV7M:
+ setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ break;
+
+ case ARM::AK_ARMV8A:
+ case ARM::AK_ARMV8_1A:
+ case ARM::AK_ARMV8_2A:
+ setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ setAttributeItem(MPextension_use, Allowed, false);
+ setAttributeItem(Virtualization_use, AllowTZVirtualization, false);
+ break;
+
+ case ARM::AK_ARMV8MBaseline:
+ case ARM::AK_ARMV8MMainline:
+ setAttributeItem(THUMB_ISA_use, AllowThumbDerived, false);
+ setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+ break;
+
+ case ARM::AK_IWMMXT:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ setAttributeItem(WMMX_arch, AllowWMMXv1, false);
+ break;
+
+ case ARM::AK_IWMMXT2:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ setAttributeItem(WMMX_arch, AllowWMMXv2, false);
+ break;
+
+ default: // any architecture kind without a case above is treated as a fatal error
+ report_fatal_error("Unknown Arch: " + Twine(Arch));
+ break;
+ }
+}
+void ARMTargetELFStreamer::emitFPU(unsigned Value) {
+ FPU = Value; // only remembered here; consumed by emitFPUDefaultAttributes() from finishAttributeSection()
+}
+void ARMTargetELFStreamer::emitFPUDefaultAttributes() { // Adds the build attributes implied by FPU; never overwrites existing items.
+ switch (FPU) {
+ case ARM::FK_VFP:
+ case ARM::FK_VFPV2:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv2,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_VFPV3:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_VFPV3_FP16:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::FP_HP_extension,
+ ARMBuildAttrs::AllowHPFP,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_VFPV3_D16:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv3B,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_VFPV3_D16_FP16:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv3B,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::FP_HP_extension,
+ ARMBuildAttrs::AllowHPFP,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_VFPV3XD:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv3B,
+ /* OverwriteExisting= */ false);
+ break;
+ case ARM::FK_VFPV3XD_FP16:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv3B,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::FP_HP_extension,
+ ARMBuildAttrs::AllowHPFP,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_VFPV4:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv4A,
+ /* OverwriteExisting= */ false);
+ break;
+
+ // ABI_HardFP_use is handled in ARMAsmPrinter, so _SP_D16 is treated the same
+ // as _D16 here.
+ case ARM::FK_FPV4_SP_D16:
+ case ARM::FK_VFPV4_D16:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv4B,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_FP_ARMV8:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPARMv8A,
+ /* OverwriteExisting= */ false);
+ break;
+
+ // FPV5_D16 is identical to FP_ARMV8 except for the number of D registers, so
+ // uses the FP_ARMV8_D16 build attribute.
+ case ARM::FK_FPV5_SP_D16:
+ case ARM::FK_FPV5_D16:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPARMv8B,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_NEON:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeon,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_NEON_FP16:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeon,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::FP_HP_extension,
+ ARMBuildAttrs::AllowHPFP,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_NEON_VFPV4:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPv4A,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeon2,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FK_NEON_FP_ARMV8:
+ case ARM::FK_CRYPTO_NEON_FP_ARMV8:
+ setAttributeItem(ARMBuildAttrs::FP_arch,
+ ARMBuildAttrs::AllowFPARMv8A,
+ /* OverwriteExisting= */ false);
+ // 'Advanced_SIMD_arch' must be emitted not here, but within
+ // ARMAsmPrinter::emitAttributes(), depending on hasV8Ops() and hasV8_1a()
+ break;
+
+ case ARM::FK_SOFTVFP:
+ case ARM::FK_NONE: // these two FPU kinds add no attributes
+ break;
+
+ default: // any FPU kind without a case above is treated as a fatal error
+ report_fatal_error("Unknown FPU: " + Twine(FPU));
+ break;
+ }
+}
+// Returns the encoded byte size of every attribute buffered in Contents.
+size_t ARMTargetELFStreamer::calculateContentSize() const {
+  size_t Result = 0;
+  for (const AttributeItem &Item : Contents) { // by reference: no per-item copy
+    switch (Item.Type) {
+    case AttributeItem::HiddenAttribute: // contributes no bytes
+      break;
+    case AttributeItem::NumericAttribute: // ULEB128 tag + ULEB128 value
+      Result += getULEB128Size(Item.Tag);
+      Result += getULEB128Size(Item.IntValue);
+      break;
+    case AttributeItem::TextAttribute: // ULEB128 tag + NUL-terminated string
+      Result += getULEB128Size(Item.Tag);
+      Result += Item.StringValue.size() + 1; // string + '\0'
+      break;
+    case AttributeItem::NumericAndTextAttributes: // tag + value + string
+      Result += getULEB128Size(Item.Tag);
+      Result += getULEB128Size(Item.IntValue);
+      Result += Item.StringValue.size() + 1; // string + '\0';
+      break;
+    }
+  }
+  return Result;
+}
+// Writes the attributes buffered in Contents as one vendor subsection of
+// .ARM.attributes, then clears the buffer and forgets the recorded FPU.
+void ARMTargetELFStreamer::finishAttributeSection() {
+  // <format-version>
+  // [ <section-length> "vendor-name"
+  // [ <file-tag> <size> <attribute>*
+  //   | <section-tag> <size> <section-number>* 0 <attribute>*
+  //   | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
+  //   ]+
+  // ]*
+
+  // Fold in the defaults implied by the recorded FPU and architecture.
+  if (FPU != ARM::FK_INVALID)
+    emitFPUDefaultAttributes();
+
+  if (Arch != ARM::AK_INVALID)
+    emitArchDefaultAttributes();
+
+  // Nothing buffered: emit no subsection at all.
+  if (Contents.empty())
+    return;
+
+  std::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
+
+  ARMELFStreamer &Streamer = getStreamer();
+
+  // Switch to .ARM.attributes section
+  if (AttributeSection) {
+    Streamer.SwitchSection(AttributeSection);
+  } else {
+    AttributeSection = Streamer.getContext().getELFSection(
+        ".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
+    Streamer.SwitchSection(AttributeSection);
+
+    // Format version
+    Streamer.EmitIntValue(0x41, 1);
+  }
+
+  // Vendor size + Vendor name + '\0'
+  const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
+  // Tag + Tag Size
+  const size_t TagHeaderSize = 1 + 4;
+  const size_t ContentsSize = calculateContentSize();
+
+  Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
+  Streamer.EmitBytes(CurrentVendor);
+  Streamer.EmitIntValue(0, 1); // '\0'
+
+  Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
+  Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+
+  // Size should have been accounted for already, now
+  // emit each field as its type (ULEB or String)
+  // NOTE(review): HiddenAttribute is sized as zero bytes by
+  // calculateContentSize() but is unreachable here - confirm it never occurs.
+  for (const AttributeItem &Item : Contents) { // by reference: no per-item copy
+    Streamer.EmitULEB128IntValue(Item.Tag);
+    switch (Item.Type) {
+    default: llvm_unreachable("Invalid attribute type");
+    case AttributeItem::NumericAttribute:
+      Streamer.EmitULEB128IntValue(Item.IntValue);
+      break;
+    case AttributeItem::TextAttribute:
+      Streamer.EmitBytes(Item.StringValue);
+      Streamer.EmitIntValue(0, 1); // '\0'
+      break;
+    case AttributeItem::NumericAndTextAttributes:
+      Streamer.EmitULEB128IntValue(Item.IntValue);
+      Streamer.EmitBytes(Item.StringValue);
+      Streamer.EmitIntValue(0, 1); // '\0'
+      break;
+    }
+  }
+
+  Contents.clear();
+  FPU = ARM::FK_INVALID;
+}
+
+// In Thumb mode, registers the label and marks function-typed symbols as Thumb.
+void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
+  ARMELFStreamer &ELFStreamer = getStreamer();
+  if (ELFStreamer.IsThumb) {
+    ELFStreamer.getAssembler().registerSymbol(*Symbol);
+    const unsigned SymType = cast<MCSymbolELF>(Symbol)->getType();
+    if (SymType == ELF::STT_FUNC || SymType == ELF::STT_GNU_IFUNC)
+      ELFStreamer.EmitThumbFunc(Symbol);
+  }
+}
+
+void
+ARMTargetELFStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) { // Attaches S as a fixup at the current position.
+ getStreamer().EmitFixup(S, FK_Data_4); // EmitFixup records a fixup only; it writes no bytes itself
+}
+
+// Assigns Value to Symbol and marks Symbol as a Thumb function, unless Value
+// is a bare reference to a symbol that is not defined yet, in which case only
+// the assignment is emitted.
+void ARMTargetELFStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
+  ARMELFStreamer &ELFStreamer = getStreamer();
+  const auto *TargetRef = dyn_cast<MCSymbolRefExpr>(Value);
+  const bool TargetUndefined =
+      TargetRef && !TargetRef->getSymbol().isDefined();
+  if (!TargetUndefined)
+    ELFStreamer.EmitThumbFunc(Symbol);
+  ELFStreamer.EmitAssignment(Symbol, Value);
+}
+
+void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { // Forwards a raw instruction word to the streamer.
+ getStreamer().emitInst(Inst, Suffix); // Suffix is passed through untouched
+}
+
+void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; } // forget the cached section so finishAttributeSection() creates it again
+
+// Flushes pending build attributes, then runs the generic ELF finish step.
+void ARMELFStreamer::FinishImpl() {
+  auto &ARMTS = static_cast<ARMTargetStreamer &>(*getTargetStreamer());
+  ARMTS.finishAttributeSection();
+
+  MCELFStreamer::FinishImpl();
+}
+
+// Resets per-object state so the streamer can be used for another emission.
+void ARMELFStreamer::reset() {
+  auto &ARMTS = static_cast<ARMTargetStreamer &>(*getTargetStreamer());
+  ARMTS.reset();
+  MappingSymbolCounter = 0;
+  MCELFStreamer::reset();
+  // MCELFStreamer::reset() clears the assembler's e_flags. For ARM the ABI
+  // version is set manually when the streamer is created, so set it again
+  // here.
+  getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+}
+
+inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix, // Switches to the EH section paired with Fn's section.
+ unsigned Type,
+ unsigned Flags,
+ SectionKind Kind, // NOTE(review): not referenced anywhere in this body
+ const MCSymbol &Fn) {
+ const MCSectionELF &FnSection =
+ static_cast<const MCSectionELF &>(Fn.getSection()); // unchecked cast: assumes Fn lives in an ELF section
+
+ // Create the name for new section
+ StringRef FnSecName(FnSection.getSectionName());
+ SmallString<128> EHSecName(Prefix);
+ if (FnSecName != ".text") { // functions in ".text" use the bare prefix; others get the section name appended
+ EHSecName += FnSecName;
+ }
+
+ // Get .ARM.extab or .ARM.exidx section
+ const MCSymbolELF *Group = FnSection.getGroup();
+ if (Group) // the EH section joins the same group as the function's section
+ Flags |= ELF::SHF_GROUP;
+ MCSectionELF *EHSection =
+ getContext().getELFSection(EHSecName, Type, Flags, 0, Group,
+ FnSection.getUniqueID(), nullptr, &FnSection); // last argument ties the EH section to FnSection
+
+ assert(EHSection && "Failed to get the required EH section");
+
+ // Switch to .ARM.extab or .ARM.exidx section
+ SwitchSection(EHSection);
+ EmitCodeAlignment(4);
+}
+
+inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) { // Switches to the .ARM.extab section for FnStart.
+ SwitchToEHSection(".ARM.extab", ELF::SHT_PROGBITS, ELF::SHF_ALLOC,
+ SectionKind::getData(), FnStart);
+}
+
+inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { // Switches to the .ARM.exidx section for FnStart.
+ SwitchToEHSection(".ARM.exidx", ELF::SHT_ARM_EXIDX,
+ ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER, // unlike .ARM.extab, the index section also sets SHF_LINK_ORDER
+ SectionKind::getData(), FnStart);
+}
+void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
+  // Attach the fixup at the current end of the active data fragment.
+  MCDataFragment *DF = getOrCreateDataFragment();
+  DF->getFixups().push_back(MCFixup::create(DF->getContents().size(), Expr, Kind));
+}
+
+void ARMELFStreamer::EHReset() { // Restores all per-function unwind state to its initial values.
+ ExTab = nullptr;
+ FnStart = nullptr; // emitFnStart() asserts this is null before starting a new function
+ Personality = nullptr;
+ PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX; // out-of-range value meaning "no index chosen"
+ FPReg = ARM::SP; // emitMovSP() asserts FPReg is still SP
+ FPOffset = 0;
+ SPOffset = 0;
+ PendingOffset = 0;
+ UsedFP = false;
+ CantUnwind = false;
+ // Drop any opcodes collected for the previous function.
+ Opcodes.clear();
+ UnwindOpAsm.Reset();
+}
+
+void ARMELFStreamer::emitFnStart() { // Begins unwind bookkeeping for a function.
+ assert(FnStart == nullptr); // a previous function must have been closed by emitFnEnd()
+ FnStart = getContext().createTempSymbol();
+ EmitLabel(FnStart); // the label is later referenced from the function's .ARM.exidx entry
+}
+
+// Closes the current function: writes its .ARM.exidx entry, switches back to
+// the function's own section and resets the unwind state.
+void ARMELFStreamer::emitFnEnd() {
+  assert(FnStart && ".fnstart must precedes .fnend");
+  // Emit unwind opcodes if there is no .handlerdata directive
+  if (!ExTab && !CantUnwind)
+    FlushUnwindOpcodes(true);
+  // Emit the exception index table entry
+  SwitchToExIdxSection(*FnStart);
+
+  if (PersonalityIndex < ARM::EHABI::NUM_PERSONALITY_INDEX)
+    EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
+  const MCSymbolRefExpr *FnStartRef =
+    MCSymbolRefExpr::create(FnStart,
+                            MCSymbolRefExpr::VK_ARM_PREL31,
+                            getContext());
+  EmitValue(FnStartRef, 4);
+
+  if (CantUnwind) {
+    EmitIntValue(ARM::EHABI::EXIDX_CANTUNWIND, 4);
+  } else if (ExTab) {
+    // Emit a reference to the unwind opcodes in the ".ARM.extab" section.
+    const MCSymbolRefExpr *ExTabEntryRef =
+      MCSymbolRefExpr::create(ExTab,
+                              MCSymbolRefExpr::VK_ARM_PREL31,
+                              getContext());
+    EmitValue(ExTabEntryRef, 4);
+  } else {
+    // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
+    // the second word of exception index table entry. The size of the unwind
+    // opcodes should always be 4 bytes.
+    assert(PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0 &&
+           "Compact model must use __aeabi_unwind_cpp_pr0 as personality");
+    assert(Opcodes.size() == 4u &&
+           "Unwind opcode size for __aeabi_unwind_cpp_pr0 must be equal to 4");
+    // Widen each byte to uint32_t before shifting: a uint8_t promotes to int,
+    // and shifting it by 24 could reach the sign bit and sign-extend the word.
+    const uint32_t Intval = static_cast<uint32_t>(Opcodes[0]) |
+                            static_cast<uint32_t>(Opcodes[1]) << 8 |
+                            static_cast<uint32_t>(Opcodes[2]) << 16 |
+                            static_cast<uint32_t>(Opcodes[3]) << 24;
+    EmitIntValue(Intval, Opcodes.size());
+  }
+
+  // Switch to the section containing FnStart
+  SwitchSection(&FnStart->getSection());
+
+  // Clean exception handling frame information
+  EHReset();
+}
+
+void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; } // only sets the flag; emitFnEnd() then emits EXIDX_CANTUNWIND
+
+// Add the R_ARM_NONE fixup at the same position: a VK_ARM_NONE reference to
+// the personality routine Name is attached at the current fragment offset.
+void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
+  MCContext &Ctx = getContext();
+  const MCSymbol *Routine = Ctx.getOrCreateSymbol(Name);
+  const MCSymbolRefExpr *RoutineRef =
+      MCSymbolRefExpr::create(Routine, MCSymbolRefExpr::VK_ARM_NONE, Ctx);
+  visitUsedExpr(*RoutineRef);
+  MCDataFragment *Frag = getOrCreateDataFragment();
+  const MCFixupKind Kind = MCFixup::getKindForSize(4, false);
+  Frag->getFixups().push_back(
+      MCFixup::create(Frag->getContents().size(), RoutineRef, Kind));
+}
+
+void ARMELFStreamer::FlushPendingOffset() { // Emits the delayed $sp adjustment, if any.
+  if (PendingOffset == 0)
+    return;
+  UnwindOpAsm.EmitSPOffset(-PendingOffset);
+  PendingOffset = 0;
+}
+
+// Finalizes the collected unwind opcodes and, unless the compact pr0 model
+// applies, writes them to .ARM.extab. NoHandlerData: no .handlerdata was seen.
+void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
+  // Emit the unwind opcode to restore $sp.
+  if (UsedFP) {
+    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+    int64_t LastRegSaveSPOffset = SPOffset - PendingOffset;
+    UnwindOpAsm.EmitSPOffset(LastRegSaveSPOffset - FPOffset);
+    UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg));
+  } else {
+    FlushPendingOffset();
+  }
+  // Finalize the unwind opcode sequence
+  UnwindOpAsm.Finalize(PersonalityIndex, Opcodes);
+  // For compact model 0, we have to emit the unwind opcodes in the .ARM.exidx
+  // section. Thus, we don't have to create an entry in the .ARM.extab
+  // section.
+  if (NoHandlerData && PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0)
+    return;
+
+  // Switch to .ARM.extab section.
+  SwitchToExTabSection(*FnStart);
+
+  // Create .ARM.extab label for offset in .ARM.exidx
+  assert(!ExTab);
+  ExTab = getContext().createTempSymbol();
+  EmitLabel(ExTab);
+
+  // Emit personality
+  if (Personality) {
+    const MCSymbolRefExpr *PersonalityRef =
+      MCSymbolRefExpr::create(Personality,
+                              MCSymbolRefExpr::VK_ARM_PREL31,
+                              getContext());
+    EmitValue(PersonalityRef, 4);
+  }
+  // Emit unwind opcodes
+  assert((Opcodes.size() % 4) == 0 &&
+         "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be multiple of 4");
+  for (unsigned I = 0; I != Opcodes.size(); I += 4) {
+    // Widen each byte to uint32_t before shifting: a uint8_t promotes to int,
+    // and shifting it by 24 could reach the sign bit and sign-extend the word.
+    const uint32_t Intval = static_cast<uint32_t>(Opcodes[I]) |
+                            static_cast<uint32_t>(Opcodes[I + 1]) << 8 |
+                            static_cast<uint32_t>(Opcodes[I + 2]) << 16 |
+                            static_cast<uint32_t>(Opcodes[I + 3]) << 24;
+    EmitIntValue(Intval, 4);
+  }
+
+  // According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or
+  // __aeabi_unwind_cpp_pr2() is used, then the handler data must be emitted
+  // after the unwind opcodes. The handler data consists of several 32-bit
+  // words, and should be terminated by zero.
+  //
+  // In case that the .handlerdata directive is not specified by the
+  // programmer, we should emit zero to terminate the handler data.
+  if (NoHandlerData && !Personality)
+    EmitIntValue(0, 4);
+}
+
+void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); } // false = handler data follows, so no zero terminator is added
+
+void ARMELFStreamer::emitPersonality(const MCSymbol *Per) { // Records the personality routine for the current function.
+ Personality = Per; // referenced from .ARM.extab by FlushUnwindOpcodes()
+ UnwindOpAsm.setPersonality(Per); // the opcode assembler is told as well
+}
+
+void ARMELFStreamer::emitPersonalityIndex(unsigned Index) { // Selects one of the indexed personality routines.
+ assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX && "invalid index");
+ PersonalityIndex = Index; // used by emitFnEnd() and FlushUnwindOpcodes()
+}
+
+// Handles .setfp: records NewFPReg as the frame pointer together with its
+// offset relative to the initial $sp.
+void ARMELFStreamer::emitSetFP(unsigned NewFPReg, unsigned NewSPReg,
+                               int64_t Offset) {
+  assert((NewSPReg == ARM::SP || NewSPReg == FPReg) &&
+         "the operand of .setfp directive should be either $sp or $fp");
+  // Offset is relative to $sp when the source register is $sp, otherwise to
+  // the frame pointer established so far.
+  const int64_t Base = (NewSPReg == ARM::SP) ? SPOffset : FPOffset;
+  FPOffset = Base + Offset;
+  FPReg = NewFPReg;
+  UsedFP = true;
+}
+
+void ARMELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) { // Handles .movsp: Reg becomes the register $sp is restored from.
+ assert((Reg != ARM::SP && Reg != ARM::PC) &&
+ "the operand of .movsp cannot be either sp or pc");
+ assert(FPReg == ARM::SP && "current FP must be SP");
+ // Any delayed $sp adjustment must be encoded before the set-sp opcode.
+ FlushPendingOffset();
+
+ FPReg = Reg;
+ FPOffset = SPOffset + Offset;
+ // Unlike emitSetFP(), the set-sp opcode is emitted immediately and UsedFP is left unchanged.
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+ UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg));
+}
+
+void ARMELFStreamer::emitPad(int64_t Offset) { // Handles .pad: records a stack adjustment without emitting an opcode yet.
+ // Track the change of the $sp offset
+ SPOffset -= Offset;
+
+ // To squash multiple .pad directives, we should delay the unwind opcode
+ // until the .save, .vsave, .handlerdata, or .fnend directives.
+ PendingOffset -= Offset; // emitted later by FlushPendingOffset()
+}
+
+// Handles .save / .vsave: updates the tracked $sp offset for the saved
+// registers and emits the matching register-save unwind opcode.
+void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                 bool IsVector) {
+  const MCRegisterInfo *RegInfo = getContext().getRegisterInfo();
+  // Build a bitmask of register encodings, counting each register only once.
+  uint32_t SavedMask = 0;
+  unsigned NumSaved = 0;
+  for (unsigned LLVMReg : RegList) {
+    const unsigned Encoding = RegInfo->getEncodingValue(LLVMReg);
+    assert(Encoding < (IsVector ? 32U : 16U) && "Register out of range");
+    const unsigned RegBit = 1u << Encoding;
+    if (SavedMask & RegBit)
+      continue;
+    SavedMask |= RegBit;
+    ++NumSaved;
+  }
+
+  // The matching push lowers $sp by 4 bytes per register (.save); the
+  // matching vpush lowers it by 8 bytes per register (.vsave).
+  SPOffset -= NumSaved * (IsVector ? 8 : 4);
+
+  // Emit any delayed .pad adjustment first, then the save opcode itself.
+  FlushPendingOffset();
+  if (IsVector)
+    UnwindOpAsm.EmitVFPRegSave(SavedMask);
+  else
+    UnwindOpAsm.EmitRegSave(SavedMask);
+}
+
+void ARMELFStreamer::emitUnwindRaw(int64_t Offset, // Appends caller-supplied unwind opcodes verbatim.
+ const SmallVectorImpl<uint8_t> &Opcodes) { // note: this parameter shadows the Opcodes member
+ FlushPendingOffset(); // encode any delayed $sp adjustment first
+ SPOffset = SPOffset - Offset; // Offset is the stack adjustment attributed to the raw opcodes
+ UnwindOpAsm.EmitRaw(Opcodes);
+}
+
+namespace llvm {
+// Factory functions for the ARM target streamers; each returns a heap-allocated object.
+MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new ARMTargetAsmStreamer(S, OS, *InstPrint, isVerboseAsm); // InstPrint is dereferenced, so it must be non-null
+}
+
+MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) {
+ return new ARMTargetStreamer(S); // plain base-class target streamer
+}
+
+MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI) {
+ const Triple &TT = STI.getTargetTriple();
+ if (TT.isOSBinFormatELF()) // only ELF gets the ELF-specific target streamer
+ return new ARMTargetELFStreamer(S);
+ return new ARMTargetStreamer(S);
+}
+
+MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IsThumb) {
+ ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb);
+ // FIXME: This should eventually end up somewhere else where more
+ // intelligent flag decisions can be made. For now we are just maintaining
+ // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
+ S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+
+ if (RelaxAll)
+ S->getAssembler().setRelaxAll(true);
+ return S;
+ }
+
+} // end namespace llvm
+
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
new file mode 100644
index 000000000000..3fe2302bdd37
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -0,0 +1,120 @@
+//===-- ARMFixupKinds.h - ARM Specific Fixup Entries ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMFIXUPKINDS_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace ARM {
+enum Fixups {
+ // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol
+ // addresses
+ fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind,
+
+ // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with
+ // the 16-bit halfwords reordered.
+ fixup_t2_ldst_pcrel_12,
+
+ // fixup_arm_pcrel_10_unscaled - 10-bit PC relative relocation for symbol
+ // addresses used in LDRD/LDRH/LDRB/etc. instructions. All bits are encoded.
+ fixup_arm_pcrel_10_unscaled,
+ // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses
+ // used in VFP instructions where the lower 2 bits are not encoded
+ // (so it's encoded as an 8-bit immediate).
+ fixup_arm_pcrel_10,
+ // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
+ // the short-swapped encoding of Thumb2 instructions.
+ fixup_t2_pcrel_10,
+ // fixup_arm_pcrel_9 - 9-bit PC relative relocation for symbol addresses
+ // used in VFP instructions where bit 0 is not encoded (so it's encoded as an
+ // 8-bit immediate).
+ fixup_arm_pcrel_9,
+ // fixup_t2_pcrel_9 - Equivalent to fixup_arm_pcrel_9, accounting for
+ // the short-swapped encoding of Thumb2 instructions.
+ fixup_t2_pcrel_9,
+ // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
+ // addresses where the lower 2 bits are not encoded (so it's encoded as an
+ // 8-bit immediate).
+ fixup_thumb_adr_pcrel_10,
+ // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+ // instruction.
+ fixup_arm_adr_pcrel_12,
+ // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+ // instruction.
+ fixup_t2_adr_pcrel_12,
+ // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch
+ // instructions.
+ fixup_arm_condbranch,
+ // fixup_arm_uncondbranch - 24-bit PC relative relocation for
+ // branch instructions. (unconditional)
+ fixup_arm_uncondbranch,
+ // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct
+ // conditional branch instructions.
+ fixup_t2_condbranch,
+ // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct
+ // unconditional branch instructions.
+ fixup_t2_uncondbranch,
+
+ // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
+ fixup_arm_thumb_br,
+
+ // The following fixups handle the ARM BL instructions. These can be
+ // conditionalised; however, the ARM ELF ABI requires a different relocation
+ // in that case: R_ARM_JUMP24 instead of R_ARM_CALL. The difference is that
+ // R_ARM_CALL is allowed to change the instruction to a BLX inline, which has
+ // no conditional version; R_ARM_JUMP24 would have to insert a veneer.
+ //
+ // MachO does not draw a distinction between the two cases, so it will treat
+ // fixup_arm_uncondbl and fixup_arm_condbl as identical fixups.
+
+ // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions.
+ fixup_arm_uncondbl,
+
+ // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial
+ // conditionalisation.
+ fixup_arm_condbl,
+
+ // fixup_arm_blx - Fixup for ARM BLX instructions.
+ fixup_arm_blx,
+
+ // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
+ fixup_arm_thumb_bl,
+
+ // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
+ fixup_arm_thumb_blx,
+
+ // fixup_arm_thumb_cb - Fixup for Thumb branch instructions.
+ fixup_arm_thumb_cb,
+
+ // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs.
+ fixup_arm_thumb_cp,
+
+ // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions.
+ fixup_arm_thumb_bcc,
+
+ // The next four are for the movt/movw pair (ARM and Thumb2 variants);
+ // the 16-bit imm field is split into imm{15-12} and imm{11-0}
+ fixup_arm_movt_hi16, // :upper16:
+ fixup_arm_movw_lo16, // :lower16:
+ fixup_t2_movt_hi16, // :upper16:
+ fixup_t2_movw_lo16, // :lower16:
+
+ // fixup_arm_mod_imm - Fixup for mod_imm
+ fixup_arm_mod_imm,
+
+ // Marker: one past the last ARM fixup kind; keep new kinds above this line
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
new file mode 100644
index 000000000000..1e062ad45af5
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -0,0 +1,115 @@
+//===-- ARMMCAsmInfo.cpp - ARM asm properties -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void ARMMCAsmInfoDarwin::anchor() { } // empty out-of-line virtual; presumably pins the vtable to this file - TODO confirm
+
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
+  // The armeb/thumbeb architectures are the only ones that clear IsLittleEndian.
+  const auto Arch = TheTriple.getArch();
+  if (Arch == Triple::armeb || Arch == Triple::thumbeb)
+    IsLittleEndian = false;
+  // Assembly syntax settings.
+  CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+  Data64bitsDirective = nullptr;
+  UseDataRegionDirectives = true;
+  SupportsDebugInformation = true;
+  UseIntegratedAssembler = true;
+  // Exceptions handling: SjLj on Darwin except for the watch ABI, DWARF CFI
+  // in every other case.
+  if (TheTriple.isOSDarwin() && !TheTriple.isWatchABI())
+    ExceptionsType = ExceptionHandling::SjLj;
+  else
+    ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+void ARMELFMCAsmInfo::anchor() { } // empty out-of-line virtual; presumably pins the vtable to this file - TODO confirm
+
+// Configures the assembler properties used for ARM ELF targets. TheTriple
+// is consulted for the byte order (architecture) and for the exception
+// handling scheme (operating system).
+ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) {
+  // The armeb/thumbeb architectures are the only ones that clear
+  // IsLittleEndian.
+  const auto Arch = TheTriple.getArch();
+  if (Arch == Triple::armeb || Arch == Triple::thumbeb)
+    IsLittleEndian = false;
+
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  // Assembly syntax settings.
+  CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+  Data64bitsDirective = nullptr;
+
+  SupportsDebugInformation = true;
+  UseIntegratedAssembler = true;
+
+  // foo(plt) instead of foo@plt
+  UseParensForSymbolVariant = true;
+
+  // Exceptions handling: Bitrig and NetBSD use DWARF CFI, every other OS
+  // uses ExceptionHandling::ARM.
+  const auto OS = TheTriple.getOS();
+  const bool UsesDwarfCFI = OS == Triple::Bitrig || OS == Triple::NetBSD;
+  ExceptionsType = UsesDwarfCFI ? ExceptionHandling::DwarfCFI : ExceptionHandling::ARM;
+}
+
+void ARMELFMCAsmInfo::setUseIntegratedAssembler(bool Value) {
+  UseIntegratedAssembler = Value;
+  if (Value)
+    return;
+  // gas doesn't handle VFP register names in cfi directives,
+  // so don't use register names with external assembler.
+  // See https://sourceware.org/bugzilla/show_bug.cgi?id=16694
+  DwarfRegNumForCFI = true;
+}
+
+void ARMCOFFMCAsmInfoMicrosoft::anchor() { } // empty out-of-line virtual; presumably pins the vtable to this file - TODO confirm
+
+ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() { // Assembler properties for the Microsoft COFF flavour.
+ AlignmentIsInBytes = false;
+ // Private symbols use the "$M" prefix and comments start with ';'.
+ PrivateGlobalPrefix = "$M";
+ PrivateLabelPrefix = "$M";
+ CommentString = ";";
+}
+
+void ARMCOFFMCAsmInfoGNU::anchor() { } // empty out-of-line virtual; presumably pins the vtable to this file - TODO confirm
+
+ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() { // Assembler properties for the GNU COFF flavour.
+ AlignmentIsInBytes = false;
+ HasSingleParameterDotFile = true;
+ // Same comment and .code directives as the ELF variant; private symbols use ".L".
+ CommentString = "@";
+ Code16Directive = ".code\t16";
+ Code32Directive = ".code\t32";
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+
+ SupportsDebugInformation = true;
+ ExceptionsType = ExceptionHandling::None; // no exception-handling scheme is selected for this flavour
+ UseParensForSymbolVariant = true;
+ // Unlike the other ARM asm-info classes in this file, the integrated assembler is off by default here.
+ UseIntegratedAssembler = false;
+ DwarfRegNumForCFI = true;
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
new file mode 100644
index 000000000000..5e548162bec6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -0,0 +1,56 @@
+//===-- ARMMCAsmInfo.h - ARM asm properties --------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { // ARM assembler properties layered on MCAsmInfoDarwin.
+ virtual void anchor(); // Declared 'virtual' here rather than 'override', unlike the sibling classes in this header.
+
+public:
+ explicit ARMMCAsmInfoDarwin(const Triple &TheTriple); // Constructed from the target triple.
+};
+
+class ARMELFMCAsmInfo : public MCAsmInfoELF { // ARM assembler properties layered on MCAsmInfoELF.
+ void anchor() override;
+
+public:
+ explicit ARMELFMCAsmInfo(const Triple &TT); // Constructed from the target triple.
+
+ void setUseIntegratedAssembler(bool Value) override; // The definition also sets DwarfRegNumForCFI when Value is false.
+};
+
+class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { // ARM COFF assembler properties layered on MCAsmInfoMicrosoft.
+ void anchor() override; // Defined out of line with an empty body.
+
+public:
+ explicit ARMCOFFMCAsmInfoMicrosoft(); // The definition sets the "$M" private prefixes and the ";" comment string.
+};
+
+class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF { // ARM COFF assembler properties layered on MCAsmInfoGNUCOFF.
+ void anchor() override; // Defined out of line with an empty body.
+
+public:
+ explicit ARMCOFFMCAsmInfoGNU(); // The definition uses "@" comments, ".L" private prefixes and disables the integrated assembler.
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
new file mode 100644
index 000000000000..559a4f8de75f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -0,0 +1,1702 @@
+//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
+
+namespace {
+class ARMMCCodeEmitter : public MCCodeEmitter {
+ ARMMCCodeEmitter(const ARMMCCodeEmitter &) = delete;
+ void operator=(const ARMMCCodeEmitter &) = delete;
+ const MCInstrInfo &MCII;
+ const MCContext &CTX;
+ bool IsLittleEndian;
+
+public:
+ /// Bind the emitter to its instruction info and MC context. \p IsLittle
+ /// is stored and later consulted by EmitConstant to pick the byte order.
+ ARMMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool IsLittle)
+     : MCII(mcii), CTX(ctx), IsLittleEndian(IsLittle) {}
+
+ // Nothing to release: the emitter only holds references and a flag.
+ ~ARMMCCodeEmitter() override = default;
+
+ /// True when the subtarget's feature bits have ARM::ModeThumb set.
+ bool isThumb(const MCSubtargetInfo &STI) const {
+   const bool ThumbMode = STI.getFeatureBits()[ARM::ModeThumb];
+   return ThumbMode;
+ }
+ /// True when the subtarget is in Thumb mode and also has
+ /// ARM::FeatureThumb2 set.
+ bool isThumb2(const MCSubtargetInfo &STI) const {
+   if (!isThumb(STI))
+     return false;
+   return STI.getFeatureBits()[ARM::FeatureThumb2];
+ }
+ /// True when the subtarget's triple reports the Mach-O binary format.
+ bool isTargetMachO(const MCSubtargetInfo &STI) const {
+   return STI.getTargetTriple().isOSBinFormatMachO();
+ }
+
+ unsigned getMachineSoImmOpValue(unsigned SoImm) const;
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMachineOpValue - Return binary encoding of operand. If the machine
+ /// operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of
+ /// the specified operand. This is used for operands with :lower16: and
+ /// :upper16: prefixes.
+ uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx,
+ unsigned &Reg, unsigned &Imm,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate
+ /// BL branch target.
+ uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+ /// BLX branch target.
+ uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+ uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+ uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+ uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getBranchTargetOpValue - Return encoding info for 24-bit immediate
+ /// branch target.
+ uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getThumbBranchTargetOpValue - Return encoding info for 24-bit
+ /// immediate Thumb2 direct branch target.
+ uint32_t getThumbBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
+ /// branch target.
+ uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAdrLabelOpValue - Return encoding info for 12-bit immediate
+ /// ADR label target.
+ uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+
+ /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12'
+ /// operand.
+ uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand.
+ uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2'
+ /// operand.
+ uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2'
+ /// operand.
+ uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2'
+ /// operand.
+ uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+
+ /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
+ /// operand as needed by load/store instructions.
+ uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getLdStmModeOpValue - Map the load/store-multiple addressing sub-mode
+ /// held in operand \p OpIdx to its numeric encoding (da=0, ia=1, db=2,
+ /// ib=3). Any other sub-mode is treated as unreachable.
+ uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const {
+   const auto SubMode =
+       static_cast<ARM_AM::AMSubMode>(MI.getOperand(OpIdx).getImm());
+   switch (SubMode) {
+   case ARM_AM::da:
+     return 0;
+   case ARM_AM::ia:
+     return 1;
+   case ARM_AM::db:
+     return 2;
+   case ARM_AM::ib:
+     return 3;
+   default:
+     llvm_unreachable("Unknown addressing sub-mode!");
+   }
+ }
+ /// getShiftOp - Translate a shift opcode into the value placed in
+ /// bit[6:5] of the immediate encoding. no_shift shares the lsl value and
+ /// rrx shares the ror value.
+ unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const {
+   switch (ShOpc) {
+   case ARM_AM::ror:
+   case ARM_AM::rrx:
+     return 3;
+   case ARM_AM::asr:
+     return 2;
+   case ARM_AM::lsr:
+     return 1;
+   case ARM_AM::lsl:
+   case ARM_AM::no_shift:
+     return 0;
+   }
+   llvm_unreachable("Invalid ShiftOpc!");
+ }
+
+ /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
+ uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getPostIdxRegOpValue - Return encoding for postidx_reg operands.
+ uint32_t getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands.
+ uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAddrMode3OpValue - Return encoding for addrmode3 operands.
+ uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12'
+ /// operand.
+ uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+ uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+ uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAddrMode5OpValue - Return encoding info for 'reg +/- (imm8 << 2)' operand.
+ uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAddrMode5FP16OpValue - Return encoding info for 'reg +/- (imm8 << 1)' operand.
+ uint32_t getAddrMode5FP16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getCCOutOpValue - Return the encoding of the 's' bit: 1 when operand
+ /// \p Op is the CPSR register, 0 otherwise.
+ unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+   // The operand is expected to be either reg0 or CPSR.
+   const bool SetsFlags = MI.getOperand(Op).getReg() == ARM::CPSR;
+   return SetsFlags ? 1 : 0;
+ }
+
+ /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+ /// An expression operand is turned into an FK_Data_4 fixup and encodes as
+ /// zero; an immediate operand is packed as rotate_imm plus immed_8.
+ unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+   const MCOperand &Operand = MI.getOperand(Op);
+
+   if (!Operand.isExpr()) {
+     // Plain immediate: it must be representable as a so_imm.
+     const unsigned Value = Operand.getImm();
+     const int Packed = ARM_AM::getSOImmVal(Value);
+     assert(Packed != -1 && "Not a valid so_imm value!");
+
+     // rotate_imm goes at SoRotImmShift, immed_8 fills the low bits.
+     const unsigned Rotation = ARM_AM::getSOImmValRot((unsigned)Packed) >> 1;
+     return (Rotation << ARMII::SoRotImmShift) |
+            ARM_AM::getSOImmValImm((unsigned)Packed);
+   }
+
+   // Expression operand. In the instruction the value always occupies the
+   // lowest 12 bits, so no adjustment is needed. Due to requirements of
+   // relocatable records FK_Data_4 has to be used; see
+   // ARMELFObjectWriter::ExplicitRelSym and
+   // ARMELFObjectWriter::GetRelocTypeInner for more details.
+   Fixups.push_back(MCFixup::create(0, Operand.getExpr(),
+                                    MCFixupKind(FK_Data_4), MI.getLoc()));
+   return 0;
+ }
+
+ /// getModImmOpValue - Return operand \p Op as a modified-immediate
+ /// encoding. Expressions become a fixup_arm_mod_imm fixup and encode as
+ /// zero; immediates are returned unchanged.
+ unsigned getModImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &ST) const {
+   const MCOperand &Operand = MI.getOperand(Op);
+
+   // An immediate operand is already in its encoded format.
+   if (!Operand.isExpr())
+     return Operand.getImm();
+
+   // Fixups resolve to plain values that still need to be encoded.
+   Fixups.push_back(MCFixup::create(0, Operand.getExpr(),
+                                    MCFixupKind(ARM::fixup_arm_mod_imm),
+                                    MI.getLoc()));
+   return 0;
+ }
+
+ /// getT2SOImmOpValue - Return the Thumb2 shifted-immediate encoding of
+ /// the immediate in operand \p Op, as computed by ARM_AM::getT2SOImmVal.
+ unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const {
+   const unsigned Packed = ARM_AM::getT2SOImmVal(
+       static_cast<unsigned>(MI.getOperand(Op).getImm()));
+   assert(Packed != ~0U && "Not a Thumb2 so_imm value?");
+   return Packed;
+ }
+
+ unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getSORegOpValue - Return an encoded so_reg shifted register value.
+ unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getSORegImmOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getNEONVcvtImm32OpValue - Encode the immediate in operand \p Op as
+ /// 64 minus its value.
+ unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const {
+   const auto Imm = MI.getOperand(Op).getImm();
+   return 64 - Imm;
+ }
+
+ unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+ unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+ unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+ unsigned NEONThumb2V8PostEncoder(const MCInst &MI,
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned VFPThumb2PostEncoder(const MCInst &MI,
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+ /// Write the single byte \p C to \p OS.
+ void EmitByte(unsigned char C, raw_ostream &OS) const {
+   OS << static_cast<char>(C);
+ }
+
+ /// Write the low \p Size bytes of \p Val to \p OS, least significant byte
+ /// first when IsLittleEndian is set and most significant byte first
+ /// otherwise.
+ void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+   for (unsigned i = 0; i != Size; ++i) {
+     // Index of the byte within Val that is emitted at output position i.
+     const unsigned ByteIdx = IsLittleEndian ? i : (Size - 1 - i);
+     EmitByte((Val >> (ByteIdx * 8)) & 0xff, OS);
+   }
+ }
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+};
+
+} // end anonymous namespace
+
+/// Create an ARMMCCodeEmitter that emits little-endian output. \p MRI is not
+/// used by this factory.
+MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  const bool IsLittle = true;
+  return new ARMMCCodeEmitter(MCII, Ctx, IsLittle);
+}
+
+/// Create an ARMMCCodeEmitter that emits big-endian output. \p MRI is not
+/// used by this factory.
+MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  const bool IsLittle = false;
+  return new ARMMCCodeEmitter(MCII, Ctx, IsLittle);
+}
+
+/// NEONThumb2DataIPostEncoder - Post-process an encoded NEON data-processing
+/// instruction. Outside Thumb2 mode the value is returned untouched; in
+/// Thumb2 mode it is rewritten to the Thumb2 form.
+unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                                      unsigned EncodedValue,
+                                                      const MCSubtargetInfo &STI) const {
+  if (!isThumb2(STI))
+    return EncodedValue;
+
+  // NEON Thumb2 data-processing encodings are very simple: bit 24 is moved
+  // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are
+  // set to 1111.
+  const unsigned Bit24 = EncodedValue & 0x01000000;
+  return (EncodedValue & 0xEFFFFFFF) | (Bit24 << 4) | 0x0F000000;
+}
+
+/// NEONThumb2LoadStorePostEncoder - Post-process an encoded NEON load/store
+/// instruction. In Thumb2 mode bits 27-24 are replaced with 1001; otherwise
+/// the value is returned untouched.
+unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                                          unsigned EncodedValue,
+                                                          const MCSubtargetInfo &STI) const {
+  if (!isThumb2(STI))
+    return EncodedValue;
+
+  return (EncodedValue & 0xF0FFFFFF) | 0x09000000;
+}
+
+/// NEONThumb2DupPostEncoder - Post-process an encoded NEON vdup instruction.
+/// In Thumb2 mode the top byte is replaced with 0xEE; otherwise the value is
+/// returned untouched.
+unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
+                                                    unsigned EncodedValue,
+                                                    const MCSubtargetInfo &STI) const {
+  if (!isThumb2(STI))
+    return EncodedValue;
+
+  return (EncodedValue & 0x00FFFFFF) | 0xEE000000;
+}
+
+/// Post-process an encoded NEON v8 instruction: in Thumb2 mode bits 27-26 are
+/// set, otherwise the value is returned untouched.
+unsigned ARMMCCodeEmitter::NEONThumb2V8PostEncoder(const MCInst &MI,
+                                                   unsigned EncodedValue,
+                                                   const MCSubtargetInfo &STI) const {
+  if (!isThumb2(STI))
+    return EncodedValue;
+
+  return EncodedValue | 0xC000000; // Set bits 27-26
+}
+
+/// VFPThumb2PostEncoder - Post-process an encoded VFP instruction. In Thumb2
+/// mode the top nibble is replaced with 0xE; otherwise the value is returned
+/// untouched.
+unsigned ARMMCCodeEmitter::
+VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue,
+                     const MCSubtargetInfo &STI) const {
+  if (!isThumb2(STI))
+    return EncodedValue;
+
+  return (EncodedValue & 0x0FFFFFFF) | 0xE0000000;
+}
+
+/// getMachineOpValue - Return the binary encoding of a register, immediate or
+/// floating-point immediate operand. Any other operand kind is treated as
+/// unreachable.
+unsigned ARMMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    const unsigned Reg = MO.getReg();
+    const unsigned Encoding = CTX.getRegisterInfo()->getEncodingValue(Reg);
+
+    switch (Reg) {
+    // Q registers are encoded as 2x their register number.
+    case ARM::Q0:  case ARM::Q1:  case ARM::Q2:  case ARM::Q3:
+    case ARM::Q4:  case ARM::Q5:  case ARM::Q6:  case ARM::Q7:
+    case ARM::Q8:  case ARM::Q9:  case ARM::Q10: case ARM::Q11:
+    case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15:
+      return 2 * Encoding;
+    default:
+      return Encoding;
+    }
+  }
+
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+
+  if (MO.isFPImm()) {
+    // Take the high 32 bits of the value's bit pattern.
+    const auto Bits = APFloat(MO.getFPImm()).bitcastToAPInt();
+    return static_cast<unsigned>(Bits.getHiBits(32).getLimitedValue());
+  }
+
+  llvm_unreachable("Unable to encode MCOperand!");
+}
+
+/// EncodeAddrModeOpValues - Split a 'reg +/- imm' operand pair starting at
+/// \p OpIdx into the register encoding (\p Reg) and the magnitude of the
+/// offset (\p Imm). Returns true when the offset is added and false when it
+/// is subtracted.
+bool ARMMCCodeEmitter::
+EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
+                       unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  const MCOperand &Base = MI.getOperand(OpIdx);
+  const MCOperand &Offset = MI.getOperand(OpIdx + 1);
+
+  Reg = CTX.getRegisterInfo()->getEncodingValue(Base.getReg());
+
+  const int32_t Signed = Offset.getImm();
+
+  // Special value for #-0
+  if (Signed == INT32_MIN) {
+    Imm = 0;
+    return false;
+  }
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (Signed < 0) {
+    Imm = -Signed;
+    return false;
+  }
+
+  Imm = Signed;
+  return true;
+}
+
+/// getBranchTargetOpValue - Helper that fetches a branch target operand. An
+/// immediate is returned as is; an expression is recorded as a fixup of kind
+/// \p FixupKind and encodes as zero.
+static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                       unsigned FixupKind,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+
+  if (!Target.isImm()) {
+    assert(Target.isExpr() && "Unexpected branch target type!");
+    Fixups.push_back(MCFixup::create(0, Target.getExpr(),
+                                     MCFixupKind(FixupKind), MI.getLoc()));
+    // All of the information is in the fixup.
+    return 0;
+  }
+
+  // An immediate destination needs no further work.
+  return Target.getImm();
+}
+
+// Thumb BL and BLX use an unusual offset encoding: after halving the offset,
+// bits 22 and 21 are each replaced by their complement XOR'ed with bit 23.
+static int32_t encodeThumbBLOffset(int32_t offset) {
+  const int32_t Halved = offset >> 1;
+
+  const uint32_t Sign = (Halved & 0x800000) >> 23;
+  // For a single bit b, (b ^ 1) is its complement.
+  const uint32_t NewBit22 = (((Halved & 0x400000) >> 22) ^ 0x1) ^ Sign;
+  const uint32_t NewBit21 = (((Halved & 0x200000) >> 21) ^ 0x1) ^ Sign;
+
+  // Drop the old bits 22/21 and splice in the recomputed ones.
+  return (Halved & ~0x600000) | (NewBit22 << 22) | (NewBit21 << 21);
+}
+
+/// getThumbBLTargetOpValue - Encode the target of a Thumb BL. Expressions
+/// become a fixup_arm_thumb_bl fixup; immediates are run through
+/// encodeThumbBLOffset.
+uint32_t ARMMCCodeEmitter::
+getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (!Target.isExpr())
+    return encodeThumbBLOffset(Target.getImm());
+
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl,
+                                  Fixups, STI);
+}
+
+/// getThumbBLXTargetOpValue - Encode the target of a Thumb BLX. Expressions
+/// become a fixup_arm_thumb_blx fixup; immediates are run through
+/// encodeThumbBLOffset.
+uint32_t ARMMCCodeEmitter::
+getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (!Target.isExpr())
+    return encodeThumbBLOffset(Target.getImm());
+
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx,
+                                  Fixups, STI);
+}
+
+/// getThumbBRTargetOpValue - Encode a Thumb branch target. Expressions become
+/// a fixup_arm_thumb_br fixup; immediates are shifted right by one.
+uint32_t ARMMCCodeEmitter::
+getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (!Target.isExpr())
+    return Target.getImm() >> 1;
+
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br,
+                                  Fixups, STI);
+}
+
+/// getThumbBCCTargetOpValue - Encode a Thumb conditional branch target.
+/// Expressions become a fixup_arm_thumb_bcc fixup; immediates are shifted
+/// right by one.
+uint32_t ARMMCCodeEmitter::
+getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (!Target.isExpr())
+    return Target.getImm() >> 1;
+
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc,
+                                  Fixups, STI);
+}
+
+/// getThumbCBTargetOpValue - Encode a Thumb compare-and-branch target.
+/// Expressions become a fixup_arm_thumb_cb fixup; immediates are shifted
+/// right by one.
+uint32_t ARMMCCodeEmitter::
+getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (!Target.isExpr())
+    return Target.getImm() >> 1;
+
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb,
+                                  Fixups, STI);
+}
+
+/// Return true if this branch carries a predicate other than "always". A
+/// predicate is recognised as an immediate operand directly followed by a
+/// register operand that is either register 0 or CPSR.
+static bool HasConditionalBranch(const MCInst &MI) {
+  const int NumOp = MI.getNumOperands();
+  for (int i = 0; i + 1 < NumOp; ++i) {
+    const MCOperand &CondOp = MI.getOperand(i);
+    const MCOperand &RegOp = MI.getOperand(i + 1);
+
+    if (!CondOp.isImm() || !RegOp.isReg())
+      continue;
+    if (RegOp.getReg() != 0 && RegOp.getReg() != ARM::CPSR)
+      continue;
+
+    if (ARMCC::CondCodes(CondOp.getImm()) != ARMCC::AL)
+      return true;
+  }
+  return false;
+}
+
+/// getBranchTargetOpValue - Return encoding info for a 24-bit immediate
+/// branch target. Thumb2 subtargets go through the file-local helper with
+/// fixup_t2_condbranch; everything else defers to getARMBranchTargetOpValue.
+uint32_t ARMMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  // FIXME: This really, really shouldn't use TargetMachine. We don't want
+  // coupling between MC and TM anywhere we can help it.
+  if (!isThumb2(STI))
+    return getARMBranchTargetOpValue(MI, OpIdx, Fixups, STI);
+
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch,
+                                  Fixups, STI);
+}
+
+/// getARMBranchTargetOpValue - Return encoding info for a 24-bit immediate
+/// ARM branch target. Immediates are shifted right by two; expressions become
+/// a fixup_arm_condbranch or fixup_arm_uncondbranch fixup depending on
+/// whether the instruction is predicated.
+uint32_t ARMMCCodeEmitter::
+getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (!Target.isExpr())
+    return Target.getImm() >> 2;
+
+  const unsigned Kind = HasConditionalBranch(MI)
+                            ? ARM::fixup_arm_condbranch
+                            : ARM::fixup_arm_uncondbranch;
+  return ::getBranchTargetOpValue(MI, OpIdx, Kind, Fixups, STI);
+}
+
+/// getARMBLTargetOpValue - Return encoding info for an ARM BL target.
+/// Immediates are shifted right by two; expressions become a
+/// fixup_arm_condbl or fixup_arm_uncondbl fixup depending on whether the
+/// instruction is predicated.
+uint32_t ARMMCCodeEmitter::
+getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (!Target.isExpr())
+    return Target.getImm() >> 2;
+
+  const unsigned Kind = HasConditionalBranch(MI) ? ARM::fixup_arm_condbl
+                                                 : ARM::fixup_arm_uncondbl;
+  return ::getBranchTargetOpValue(MI, OpIdx, Kind, Fixups, STI);
+}
+
+/// getARMBLXTargetOpValue - Return encoding info for an ARM BLX target.
+/// Immediates are shifted right by one; expressions become a fixup_arm_blx
+/// fixup.
+uint32_t ARMMCCodeEmitter::
+getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (!Target.isExpr())
+    return Target.getImm() >> 1;
+
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_blx, Fixups, STI);
+}
+
+/// getThumbBranchTargetOpValue - Return encoding info for a 24-bit immediate
+/// Thumb2 direct branch target. Expressions become a fixup_t2_uncondbranch
+/// fixup. Immediates are halved, then bits 22 and 21 are each set when they
+/// match bit 23 and cleared when they differ from it.
+uint32_t ARMMCCodeEmitter::getThumbBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (Target.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch,
+                                    Fixups, STI);
+
+  unsigned Val = Target.getImm() >> 1;
+
+  const bool Bit23 = (Val & 0x800000);
+  const bool Bit22 = (Val & 0x400000);
+  const bool Bit21 = (Val & 0x200000);
+
+  if (Bit23 != Bit22)
+    Val &= ~0x400000;
+  else
+    Val |= 0x400000;
+
+  if (Bit23 != Bit21)
+    Val &= ~0x200000;
+  else
+    Val |= 0x200000;
+
+  return Val;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 12-bit shifted-immediate
+/// ADR label target. Expressions become a fixup_arm_adr_pcrel_12 fixup.
+uint32_t ARMMCCodeEmitter::
+getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
+ Fixups, STI);
+ int64_t offset = MO.getImm();
+ uint32_t Val = 0x2000; // Marker used for a non-negative offset; NOTE(review): presumably selects the add form - confirm against the .td encoding.
+
+ int SoImmVal;
+ if (offset == INT32_MIN) { // INT32_MIN is handled as a zero offset with the 0x1000 marker.
+ Val = 0x1000;
+ SoImmVal = 0;
+ } else if (offset < 0) {
+ Val = 0x1000; // Marker used for a negative offset; NOTE(review): presumably selects the subtract form - confirm.
+ offset *= -1;
+ SoImmVal = ARM_AM::getSOImmVal(offset);
+ if(SoImmVal == -1) { // Magnitude is not a valid so_imm: retry with the sign flipped back and the other marker.
+ Val = 0x2000;
+ offset *= -1;
+ SoImmVal = ARM_AM::getSOImmVal(offset);
+ }
+ } else {
+ SoImmVal = ARM_AM::getSOImmVal(offset);
+ if(SoImmVal == -1) { // Not a valid so_imm as is: retry with the negated value and the 0x1000 marker.
+ Val = 0x1000;
+ offset *= -1;
+ SoImmVal = ARM_AM::getSOImmVal(offset);
+ }
+ }
+
+ assert(SoImmVal != -1 && "Not a valid so_imm value!"); // Both attempts failing is only caught in asserts builds.
+
+ Val |= SoImmVal;
+ return Val;
+}
+
+/// getT2AdrLabelOpValue - Return encoding info for a 12-bit immediate ADR
+/// label target. Expressions become a fixup_t2_adr_pcrel_12 fixup. A negative
+/// immediate is encoded as its magnitude with bit 12 set, and INT32_MIN
+/// encodes as bit 12 alone.
+uint32_t ARMMCCodeEmitter::
+getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (Target.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
+                                    Fixups, STI);
+
+  const int32_t Offset = Target.getImm();
+  if (Offset == INT32_MIN)
+    return 0x1000;
+  if (Offset < 0)
+    return (-Offset) | 0x1000;
+  return Offset;
+}
+
+/// getThumbAdrLabelOpValue - Return encoding info for an 8-bit immediate ADR
+/// label target. Expressions become a fixup_thumb_adr_pcrel_10 fixup;
+/// immediates are returned unchanged.
+uint32_t ARMMCCodeEmitter::
+getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpIdx);
+  if (!Target.isExpr())
+    return Target.getImm();
+
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
+                                  Fixups, STI);
+}
+
+/// getThumbAddrModeRegRegOpValue - Return encoding info for a 'reg + reg'
+/// operand, i.e. [Rn, Rm] packed as:
+///   {5-3} = Rm
+///   {2-0} = Rn
+uint32_t ARMMCCodeEmitter::
+getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &,
+                              const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpIdx);
+  const MCOperand &IndexOp = MI.getOperand(OpIdx + 1);
+
+  const unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(BaseOp.getReg());
+  const unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(IndexOp.getReg());
+
+  return Rn | (Rm << 3);
+}
+
+/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {17-13} = reg
+ // {12} = (U)nsigned (add == '1', sub == '0')
+ // {11-0} = imm12
+ unsigned Reg, Imm12;
+ bool isAdd = true;
+ // If the first operand isn't a register, we have a label reference.
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg()) {
+ Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC.
+ Imm12 = 0;
+
+ if (MO.isExpr()) {
+ const MCExpr *Expr = MO.getExpr();
+ isAdd = false ; // 'U' bit is set as part of the fixup.
+
+ MCFixupKind Kind; // Fixup kind depends on whether the subtarget is Thumb2.
+ if (isThumb2(STI))
+ Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
+ else
+ Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumCPRelocations; // Statistic: one more constant pool relocation created.
+ } else {
+ Reg = ARM::PC; // NOTE(review): this stores the ARM::PC enum value, replacing the encoding value assigned above - confirm this is intended.
+ int32_t Offset = MO.getImm();
+ if (Offset == INT32_MIN) { // INT32_MIN is treated as a subtracted zero offset.
+ Offset = 0;
+ isAdd = false;
+ } else if (Offset < 0) {
+ Offset *= -1;
+ isAdd = false;
+ }
+ Imm12 = Offset;
+ }
+ } else
+ isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups, STI); // Register base: split the reg/imm operand pair.
+
+ uint32_t Binary = Imm12 & 0xfff;
+ // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+ if (isAdd)
+ Binary |= (1 << 12);
+ Binary |= (Reg << 13);
+ return Binary;
+}
+
+/// getT2Imm8s4OpValue - Encode a Thumb2 '+/- imm8<<2' operand.
+///
+/// Result layout:
+///   {8}   = (U)nsigned (add == '1', sub == '0')
+///   {7-0} = imm8 (magnitude of the offset divided by 4)
+uint32_t ARMMCCodeEmitter::
+getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  // FIXME: The immediate operand should already arrive in this form. The
+  // encoder method should only have to merge the register and offset MI
+  // operands into the single complex-operand representation from the .td
+  // file. This is more than a style issue: as things stand the distinct
+  // encoding for #-0 cannot be represented.
+
+  const int32_t Raw = MI.getOperand(OpIdx).getImm();
+  const bool IsAdd = Raw >= 0;
+
+  // The field stores a magnitude; negate through uint32_t so the
+  // negation itself is done in unsigned arithmetic.
+  int32_t Magnitude = Raw;
+  if (!IsAdd)
+    Magnitude = -(uint32_t)Raw;
+
+  // Scaled by 4, then truncated to the 8-bit field.
+  const uint32_t Field = (Magnitude / 4) & 0xff;
+
+  // The 'U' bit selects add versus subtract.
+  return IsAdd ? (Field | (1 << 8)) : Field;
+}
+
+/// getT2AddrModeImm8s4OpValue - Return encoding info for
+/// 'reg +/- imm8<<2' operand. A non-register first operand is treated as a
+/// label reference: PC becomes the base and a fixup_t2_pcrel_10 fixup is
+/// recorded for the expression.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {12-9} = reg
+ // {8} = (U)nsigned (add == '1', sub == '0')
+ // {7-0} = imm8
+ unsigned Reg, Imm8;
+ bool isAdd = true;
+ // If the first operand isn't a register, we have a label reference.
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg()) {
+ Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC.
+ Imm8 = 0;
+ isAdd = false ; // 'U' bit is set as part of the fixup.
+
+ assert(MO.isExpr() && "Unexpected machine operand type!");
+ const MCExpr *Expr = MO.getExpr();
+ MCFixupKind Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumCPRelocations;
+ } else
+ isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+
+ // FIXME: The immediate operand should have already been encoded like this
+ // before ever getting here. The encoder method should just need to combine
+ // the MI operands for the register and the offset into a single
+ // representation for the complex operand in the .td file. This isn't just
+ // style, unfortunately. As-is, we can't represent the distinct encoding
+ // for #-0.
+ uint32_t Binary = (Imm8 >> 2) & 0xff; // Offset is stored divided by 4.
+ // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+ if (isAdd)
+ Binary |= (1 << 8);
+ Binary |= (Reg << 9);
+ return Binary;
+}
+
+/// getT2AddrModeImm0_1020s4OpValue - Encode a Thumb2 'reg + imm8<<2'
+/// operand.
+///
+/// Result layout:
+///   {11-8} = reg
+///   {7-0}  = imm8 (taken from the operand as-is)
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpIdx);
+  const MCOperand &OffsetOp = MI.getOperand(OpIdx + 1);
+  const unsigned BaseEnc =
+      CTX.getRegisterInfo()->getEncodingValue(BaseOp.getReg());
+  const unsigned Offset = OffsetOp.getImm();
+  return Offset | (BaseEnc << 8);
+}
+
+uint32_t
+ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {19-16} = imm{15-12}
+ // {11-0} = imm{11-0}
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ if (MO.isImm())
+ // Hi / lo 16 bits already extracted during earlier passes.
+ return static_cast<unsigned>(MO.getImm());
+
+ // Handle :upper16: and :lower16: assembly prefixes.
+ const MCExpr *E = MO.getExpr();
+ MCFixupKind Kind;
+ if (E->getKind() == MCExpr::Target) {
+ const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
+ E = ARM16Expr->getSubExpr();
+
+ // A constant sub-expression is folded here; no fixup is needed.
+ if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(E)) {
+ const int64_t Value = MCE->getValue();
+ // NOTE(review): this only rejects values above UINT32_MAX; negative
+ // values pass the check and are truncated by the int32_t casts below.
+ if (Value > UINT32_MAX)
+ report_fatal_error("constant value truncated (limited to 32-bit)");
+
+ switch (ARM16Expr->getKind()) {
+ case ARMMCExpr::VK_ARM_HI16:
+ return (int32_t(Value) & 0xffff0000) >> 16;
+ case ARMMCExpr::VK_ARM_LO16:
+ return (int32_t(Value) & 0x0000ffff);
+ default: llvm_unreachable("Unsupported ARMFixup");
+ }
+ }
+
+ // Otherwise pick the movt/movw fixup for the current instruction set.
+ switch (ARM16Expr->getKind()) {
+ default: llvm_unreachable("Unsupported ARMFixup");
+ case ARMMCExpr::VK_ARM_HI16:
+ Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movt_hi16
+ : ARM::fixup_arm_movt_hi16);
+ break;
+ case ARMMCExpr::VK_ARM_LO16:
+ Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movw_lo16
+ : ARM::fixup_arm_movw_lo16);
+ break;
+ }
+
+ Fixups.push_back(MCFixup::create(0, E, Kind, MI.getLoc()));
+ return 0;
+ }
+ // If the expression doesn't have :upper16: or :lower16: on it,
+ // it's just a plain immediate expression. Previously those evaluated to
+ // the lower 16 bits of the expression regardless of whether
+ // we have a movt or a movw, but that led to misleading results.
+ // This is disallowed in the AsmParser in validateInstruction()
+ // so this should never happen.
+ llvm_unreachable("expression without :upper16: or :lower16:");
+}
+
+/// getLdStSORegOpValue - Encode a load/store 'Rn +/- Rm, shift #imm'
+/// operand triple (base register, offset register, packed AM2 immediate).
+///
+/// Result layout:
+///   {16-13} = Rn
+///   {12}    = isAdd
+///   {11-0}  = shifter
+///     {11-7} = imm
+///     {6-5}  = type
+///     {4}    = 0
+///     {3-0}  = Rm
+uint32_t ARMMCCodeEmitter::
+getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpIdx);
+  const MCOperand &OffsetRegOp = MI.getOperand(OpIdx+1);
+  const MCOperand &AM2Op = MI.getOperand(OpIdx+2);
+
+  const unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(BaseOp.getReg());
+  const unsigned Rm =
+      CTX.getRegisterInfo()->getEncodingValue(OffsetRegOp.getReg());
+
+  const unsigned ShImm = ARM_AM::getAM2Offset(AM2Op.getImm());
+  const unsigned AddBit =
+      ARM_AM::getAM2Op(AM2Op.getImm()) == ARM_AM::add ? 1u : 0u;
+  const unsigned SBits = getShiftOp(ARM_AM::getAM2ShiftOpc(AM2Op.getImm()));
+
+  // "lsr #32" and "asr #32" do exist, but they are encoded with a zero shift
+  // amount. Passing 32 here would be an easy mistake, so check for it.
+  assert((ShImm & ~0x1f) == 0 && "Out of range shift amount");
+
+  return Rm | (Rn << 13) | (SBits << 5) | (ShImm << 7) | (AddBit << 12);
+}
+
+/// getAddrMode2OffsetOpValue - Encode an addressing-mode-2 offset, which is
+/// either an immediate or a (possibly shifted) register.
+///
+/// Result layout:
+///   {13}   = 1 for the register form, 0 for the immediate form
+///   {12}   = isAdd
+///   {11-0} = imm12, or shift amount/type/Rm for the register form
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand &RegOp = MI.getOperand(OpIdx);
+  const MCOperand &AM2Op = MI.getOperand(OpIdx+1);
+
+  const unsigned AM2 = AM2Op.getImm();
+  const uint32_t AddBit =
+      ARM_AM::getAM2Op(AM2) == ARM_AM::add ? (1u << 12) : 0u;
+  const unsigned OffsetReg = RegOp.getReg();
+  const uint32_t Amount = ARM_AM::getAM2Offset(AM2);
+
+  // Register number zero means there is no offset register: plain imm12.
+  if (OffsetReg == 0)
+    return Amount | AddBit;
+
+  // reg +/- reg: shift amount in [11:7], shift type in [6:5], Rm in [3:0].
+  const uint32_t ShiftType = getShiftOp(ARM_AM::getAM2ShiftOpc(AM2));
+  const uint32_t RmEnc = CTX.getRegisterInfo()->getEncodingValue(OffsetReg);
+  return (Amount << 7) | (ShiftType << 5) | RmEnc | AddBit | (1u << 13);
+}
+
+/// getPostIdxRegOpValue - Encode a post-indexed register offset.
+///
+/// Result layout:
+///   {4}   = isAdd (set when operand OpIdx + 1 is non-zero)
+///   {3-0} = Rm
+uint32_t ARMMCCodeEmitter::
+getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &RegOp = MI.getOperand(OpIdx);
+  const MCOperand &AddOp = MI.getOperand(OpIdx+1);
+
+  const unsigned AddBit = AddOp.getImm() != 0 ? (1u << 4) : 0u;
+  const unsigned RmEnc =
+      CTX.getRegisterInfo()->getEncodingValue(RegOp.getReg());
+  return RmEnc | AddBit;
+}
+
+/// getAddrMode3OffsetOpValue - Encode an addressing-mode-3 offset, which is
+/// either an 8-bit immediate or a register.
+///
+/// Result layout:
+///   {9}   = 1 == imm8, 0 == Rm
+///   {8}   = isAdd
+///   {7-4} = imm7_4/zero
+///   {3-0} = imm3_0/Rm
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand &RegOp = MI.getOperand(OpIdx);
+  const MCOperand &AM3Op = MI.getOperand(OpIdx+1);
+
+  const unsigned AM3 = AM3Op.getImm();
+  const uint32_t AddBit =
+      ARM_AM::getAM3Op(AM3) == ARM_AM::add ? (1u << 8) : 0u;
+  const unsigned OffsetReg = RegOp.getReg();
+
+  // Register number zero means no offset register: this is reg +/- imm8.
+  if (OffsetReg == 0)
+    return ARM_AM::getAM3Offset(AM3) | AddBit | (1u << 9);
+
+  // reg +/- reg: the low bits carry Rm and the immediate flag stays clear.
+  const uint32_t RmEnc = CTX.getRegisterInfo()->getEncodingValue(OffsetReg);
+  return RmEnc | AddBit;
+}
+
+/// getAddrMode3OpValue - Encode a full addressing-mode-3 operand: base
+/// register plus either an 8-bit immediate or a register offset. A
+/// non-register first operand is treated as a label reference with PC as the
+/// base and a fixup_arm_pcrel_10_unscaled fixup.
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {13} 1 == imm8, 0 == Rm
+ // {12-9} Rn
+ // {8} isAdd
+ // {7-4} imm7_4/zero
+ // {3-0} imm3_0/Rm
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+ const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+
+ // If the first operand isn't a register, we have a label reference.
+ if (!MO.isReg()) {
+ unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC.
+
+ assert(MO.isExpr() && "Unexpected machine operand type!");
+ const MCExpr *Expr = MO.getExpr();
+ MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10_unscaled);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumCPRelocations;
+ // Immediate form with a zero offset; add bit and offset bits are left clear.
+ return (Rn << 9) | (1 << 13);
+ }
+ unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+ unsigned Imm = MO2.getImm();
+ bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+ bool isImm = MO1.getReg() == 0;
+ uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+ // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+ if (!isImm)
+ Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+ return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
+}
+
+/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands, i.e.
+/// '[SP, #imm]'. Only the immediate is encoded:
+///   {7-0} = imm8
+uint32_t ARMMCCodeEmitter::
+getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand &OffsetOp = MI.getOperand(OpIdx + 1);
+  assert(MI.getOperand(OpIdx).getReg() == ARM::SP &&
+         "Unexpected base register!");
+
+  // The immediate already accounts for the implicit low zero bits, so it is
+  // only masked to the field width here.
+  const uint32_t Imm8 = OffsetOp.getImm() & 0xff;
+  return Imm8;
+}
+
+/// getAddrModeISOpValue - Encode the t_addrmode_is# operands, i.e.
+/// '[Rn, #imm]'.
+///
+/// Result layout:
+///   {7-3} = imm5
+///   {2-0} = Rn
+uint32_t ARMMCCodeEmitter::
+getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpIdx);
+  const MCOperand &OffsetOp = MI.getOperand(OpIdx + 1);
+  const unsigned BaseEnc =
+      CTX.getRegisterInfo()->getEncodingValue(BaseOp.getReg());
+  const unsigned Offset = OffsetOp.getImm();
+  const unsigned Imm5Field = (Offset & 0x1f) << 3;
+  return Imm5Field | BaseEnc;
+}
+
+/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands. An
+/// immediate is returned shifted right by two; an expression is delegated to
+/// the branch-target helper with a fixup_arm_thumb_cp fixup.
+uint32_t ARMMCCodeEmitter::
+getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (!MO.isExpr())
+    return (MO.getImm() >> 2);
+
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups,
+                                  STI);
+}
+
+/// getAddrMode5OpValue - Return encoding info for 'reg +/- (imm8 << 2)' operand.
+/// A non-register first operand is treated as a label reference: PC becomes
+/// the base and a PC-relative fixup (Thumb2 or ARM variant) is recorded.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {12-9} = reg
+ // {8} = (U)nsigned (add == '1', sub == '0')
+ // {7-0} = imm8
+ unsigned Reg, Imm8;
+ bool isAdd;
+ // If the first operand isn't a register, we have a label reference.
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg()) {
+ Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC.
+ Imm8 = 0;
+ isAdd = false; // 'U' bit is handled as part of the fixup.
+
+ assert(MO.isExpr() && "Unexpected machine operand type!");
+ const MCExpr *Expr = MO.getExpr();
+ MCFixupKind Kind;
+ if (isThumb2(STI))
+ Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+ else
+ Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumCPRelocations;
+ } else {
+ // The helper's return value is ignored; add/sub is decoded from the
+ // packed AM5 immediate instead.
+ EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+ isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+ }
+
+ uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+ // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+ if (isAdd)
+ Binary |= (1 << 8);
+ Binary |= (Reg << 9);
+ return Binary;
+}
+
+/// getAddrMode5FP16OpValue - Return encoding info for 'reg +/- (imm8 << 1)' operand.
+/// A non-register first operand is treated as a label reference: PC becomes
+/// the base and a 9-bit PC-relative fixup (Thumb2 or ARM variant) is recorded.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5FP16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {12-9} = reg
+ // {8} = (U)nsigned (add == '1', sub == '0')
+ // {7-0} = imm8
+ unsigned Reg, Imm8;
+ bool isAdd;
+ // If the first operand isn't a register, we have a label reference.
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg()) {
+ Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC.
+ Imm8 = 0;
+ isAdd = false; // 'U' bit is handled as part of the fixup.
+
+ assert(MO.isExpr() && "Unexpected machine operand type!");
+ const MCExpr *Expr = MO.getExpr();
+ MCFixupKind Kind;
+ if (isThumb2(STI))
+ Kind = MCFixupKind(ARM::fixup_t2_pcrel_9);
+ else
+ Kind = MCFixupKind(ARM::fixup_arm_pcrel_9);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumCPRelocations;
+ } else {
+ // NOTE(review): this uses the plain AM5 accessors, same as
+ // getAddrMode5OpValue; confirm no FP16-specific accessor is intended.
+ EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+ isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+ }
+
+ uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+ // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+ if (isAdd)
+ Binary |= (1 << 8);
+ Binary |= (Reg << 9);
+ return Binary;
+}
+
+/// getSORegRegOpValue - Encode a shifter operand whose shift amount comes
+/// from a register ('Rm, <shift> Rs').
+unsigned ARMMCCodeEmitter::
+getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be
+ // shifted. The second is Rs, the amount to shift by, and the third specifies
+ // the type of the shift.
+ //
+ // {3-0} = Rm.
+ // {4} = 1
+ // {6-5} = type
+ // {11-8} = Rs
+ // {7} = 0
+
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+ const MCOperand &MO2 = MI.getOperand(OpIdx + 2);
+ ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+ // Encode Rm.
+ unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+ // Encode the shift opcode.
+ unsigned SBits = 0;
+ unsigned Rs = MO1.getReg();
+ // The shift bits are only filled in when a shift register is present.
+ if (Rs) {
+ // Set shift operand (bit[7:4]).
+ // LSL - 0001
+ // LSR - 0011
+ // ASR - 0101
+ // ROR - 0111
+ switch (SOpc) {
+ default: llvm_unreachable("Unknown shift opc!");
+ case ARM_AM::lsl: SBits = 0x1; break;
+ case ARM_AM::lsr: SBits = 0x3; break;
+ case ARM_AM::asr: SBits = 0x5; break;
+ case ARM_AM::ror: SBits = 0x7; break;
+ }
+ }
+
+ Binary |= SBits << 4;
+
+ // Encode the shift operation Rs.
+ // Encode Rs bit[11:8].
+ // A register-shifted operand must not also carry an immediate shift amount.
+ assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+ return Binary | (CTX.getRegisterInfo()->getEncodingValue(Rs) << ARMII::RegRsShift);
+}
+
+/// getSORegImmOpValue - Encode a shifter operand with an immediate shift
+/// amount ('Rm, <shift> #imm'). Sub-operands are [reg, imm]: the register
+/// being shifted and a packed shift opcode/amount.
+///
+/// Result layout:
+///   {3-0}  = Rm
+///   {4}    = 0
+///   {6-5}  = type
+///   {11-7} = imm
+unsigned ARMMCCodeEmitter::
+getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const MCOperand &RegOp = MI.getOperand(OpIdx);
+  const MCOperand &ShiftOp = MI.getOperand(OpIdx + 1);
+  const ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp.getImm());
+
+  const unsigned RmEnc =
+      CTX.getRegisterInfo()->getEncodingValue(RegOp.getReg());
+
+  // Shift type, already positioned at bits [6:4]:
+  //   LSL - 000, LSR - 010, ASR - 100, ROR - 110.
+  // RRX uses the ROR type bits and leaves the shift amount clear, so it
+  // returns without encoding an amount.
+  unsigned TypeBits = 0;
+  switch (Opc) {
+  case ARM_AM::lsl: TypeBits = 0x00; break;
+  case ARM_AM::lsr: TypeBits = 0x20; break;
+  case ARM_AM::asr: TypeBits = 0x40; break;
+  case ARM_AM::ror: TypeBits = 0x60; break;
+  case ARM_AM::rrx: return RmEnc | 0x60;
+  default: llvm_unreachable("Unknown shift opc!");
+  }
+
+  // Shift amount goes in bits [11:7].
+  const unsigned Amount = ARM_AM::getSORegOffset(ShiftOp.getImm());
+  assert(Amount < 32 && "Offset must be in range 0-31!");
+  return RmEnc | TypeBits | (Amount << 7);
+}
+
+
+/// getT2AddrModeSORegOpValue - Encode a Thumb2 '[Rn, Rm, imm]' operand. The
+/// result is Rn shifted above a 4-bit Rm field, which sits above the low two
+/// bits holding the immediate.
+unsigned ARMMCCodeEmitter::
+getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpNum);
+  const MCOperand &IndexOp = MI.getOperand(OpNum+1);
+  const MCOperand &ShiftOp = MI.getOperand(OpNum+2);
+
+  // FIXME: Needs fixup support.
+  const unsigned BaseEnc =
+      CTX.getRegisterInfo()->getEncodingValue(BaseOp.getReg());
+  const unsigned IndexEnc =
+      CTX.getRegisterInfo()->getEncodingValue(IndexOp.getReg());
+  const unsigned ShiftAmt = ShiftOp.getImm();
+
+  return (((BaseEnc << 4) | IndexEnc) << 2) | ShiftAmt;
+}
+
+/// getT2AddrModeImm8OpValue - Encode a Thumb2 'reg +/- imm8' operand.
+///
+/// Result layout:
+///   {12-9} = Rn
+///   {8}    = ADD bit (set for non-negative offsets)
+///   {7-0}  = magnitude of the offset
+///
+/// An offset of INT32_MIN is encoded as a zero magnitude with the ADD bit
+/// clear, matching how the other encoders in this file treat that value.
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+
+  // Even though the immediate is 8 bits long, we need 9 bits in order
+  // to represent the (inverse of the) sign bit.
+  Value <<= 9;
+  int32_t tmp = (int32_t)MO2.getImm();
+  if (tmp == INT32_MIN)
+    tmp = 0; // Negating INT32_MIN would overflow; encode it as minus zero.
+  else if (tmp < 0)
+    tmp = -tmp;
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+/// getT2AddrModeImm8OffsetOpValue - Encode a Thumb2 '+/- imm8' offset.
+///
+/// Result layout:
+///   {8}   = ADD bit (set for non-negative offsets)
+///   {7-0} = magnitude of the offset
+///
+/// An offset of INT32_MIN is encoded as a zero magnitude with the ADD bit
+/// clear, matching how the other encoders in this file treat that value.
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp == INT32_MIN)
+    tmp = 0; // Negating INT32_MIN would overflow; encode it as minus zero.
+  else if (tmp < 0)
+    tmp = -tmp;
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+/// getT2SORegOpValue - Encode a Thumb2 shifter operand with an immediate
+/// shift amount. Sub-operands are [reg, imm]: the register being shifted and
+/// a packed shift opcode/amount.
+///
+/// Result layout:
+///   {3-0}  = Rm
+///   {4}    = 0
+///   {6-5}  = type
+///   {11-7} = imm
+unsigned ARMMCCodeEmitter::
+getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  const MCOperand &RegOp = MI.getOperand(OpIdx);
+  const MCOperand &ShiftOp = MI.getOperand(OpIdx + 1);
+  const ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp.getImm());
+
+  const unsigned RmEnc =
+      CTX.getRegisterInfo()->getEncodingValue(RegOp.getReg());
+
+  // Shift type, already positioned at bits [6:4]:
+  //   LSL - 000, LSR - 010, ASR - 100, ROR - 110.
+  // RRX shares the ROR type bits and carries no shift amount.
+  unsigned TypeBits = 0;
+  switch (Opc) {
+  case ARM_AM::lsl: TypeBits = 0x00; break;
+  case ARM_AM::lsr: TypeBits = 0x20; break;
+  case ARM_AM::asr: TypeBits = 0x40; break;
+  case ARM_AM::ror: TypeBits = 0x60; break;
+  case ARM_AM::rrx: return RmEnc | 0x60;
+  default: llvm_unreachable("Unknown shift opc!");
+  }
+
+  // Shift amount goes in bits [11:7].
+  const unsigned Amount = ARM_AM::getSORegOffset(ShiftOp.getImm());
+  return RmEnc | TypeBits | (Amount << 7);
+}
+
+/// getBitfieldInvertedMaskOpValue - Encode a bitfield mask operand that is
+/// stored inverted: the operand is complemented and the positions of its
+/// lowest and highest set bits are returned.
+unsigned ARMMCCodeEmitter::
+getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // 10 bits. The lower 5 bits are the lsb of the mask, the high five bits are
+ // the msb of the mask.
+ const MCOperand &MO = MI.getOperand(Op);
+ uint32_t v = ~MO.getImm();
+ uint32_t lsb = countTrailingZeros(v);
+ uint32_t msb = (32 - countLeadingZeros (v)) - 1;
+ // The sanity check runs after lsb/msb are computed and is compiled out when
+ // asserts are disabled.
+ assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
+ return lsb | (msb << 5);
+}
+
+/// getRegisterListOpValue - Encode a register list that starts at operand
+/// Op and runs to the last operand of MI. Whether the first register is in
+/// the SPR or DPR class selects the VLDM/VSTM form; any other list is encoded
+/// as a bitfield.
+unsigned ARMMCCodeEmitter::
+getRegisterListOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // VLDM/VSTM:
+ // {12-8} = Vd
+ // {7-0} = Number of registers
+ //
+ // LDM/STM:
+ // {15-0} = Bitfield of GPRs.
+ unsigned Reg = MI.getOperand(Op).getReg();
+ bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg);
+ bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg);
+
+ unsigned Binary = 0;
+
+ if (SPRRegs || DPRRegs) {
+ // VLDM/VSTM
+ unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
+ // Every operand from Op to the end of MI counts as one list entry.
+ unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
+ Binary |= (RegNo & 0x1f) << 8;
+ if (SPRRegs)
+ Binary |= NumRegs;
+ else
+ Binary |= NumRegs * 2; // Each DPR register counts twice in the length field.
+ } else {
+ const MCRegisterInfo &MRI = *CTX.getRegisterInfo();
+ // Debug-only check that the list is ordered by register encoding.
+ assert(std::is_sorted(MI.begin() + Op, MI.end(),
+ [&](const MCOperand &LHS, const MCOperand &RHS) {
+ return MRI.getEncodingValue(LHS.getReg()) <
+ MRI.getEncodingValue(RHS.getReg());
+ }));
+
+ // Set one bit per listed register, indexed by its encoding value.
+ for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
+ unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg());
+ Binary |= 1 << RegNo;
+ }
+ }
+
+ return Binary;
+}
+
+/// getAddrMode6AddressOpValue - Encode an addrmode6 base register together
+/// with its alignment operand. The register encoding occupies the low bits
+/// and a 2-bit alignment code sits at bits {5-4}: alignment values 2, 4 and 8
+/// map to 1, 16 maps to 2, 32 maps to 3 and anything else maps to 0.
+unsigned ARMMCCodeEmitter::
+getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(Op);
+  const MCOperand &AlignOp = MI.getOperand(Op + 1);
+
+  const unsigned BaseEnc =
+      CTX.getRegisterInfo()->getEncodingValue(BaseOp.getReg());
+
+  const int64_t AlignVal = AlignOp.getImm();
+  unsigned AlignCode = 0;
+  if (AlignVal == 2 || AlignVal == 4 || AlignVal == 8)
+    AlignCode = 0x01;
+  else if (AlignVal == 16)
+    AlignCode = 0x02;
+  else if (AlignVal == 32)
+    AlignCode = 0x03;
+
+  return BaseEnc | (AlignCode << 4);
+}
+
+/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 base register
+/// together with its alignment operand, for VST1 and VLD1 with size 32. Only
+/// an alignment value of 4 yields a non-zero alignment code (3, placed at
+/// bits {5-4}).
+unsigned ARMMCCodeEmitter::
+getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(Op);
+  const MCOperand &AlignOp = MI.getOperand(Op + 1);
+
+  const unsigned BaseEnc =
+      CTX.getRegisterInfo()->getEncodingValue(BaseOp.getReg());
+
+  // Alignment values 2, 8, 16 and 32 (the last three being invalid here) and
+  // any unlisted value all fall back to the default code of 0.
+  const unsigned AlignCode = AlignOp.getImm() == 4 ? 0x03 : 0x00;
+
+  return BaseEnc | (AlignCode << 4);
+}
+
+
+/// getAddrMode6DupAddressOpValue - Encode an addrmode6 base register and
+/// alignment operand for VLD-dup instructions. It mirrors
+/// getAddrMode6AddressOpValue apart from the alignment code, which differs
+/// for VLD4-dup: values 2, 4 and 8 map to 1, 16 maps to 3 and anything else
+/// maps to 0.
+unsigned ARMMCCodeEmitter::
+getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(Op);
+  const MCOperand &AlignOp = MI.getOperand(Op + 1);
+
+  const unsigned BaseEnc =
+      CTX.getRegisterInfo()->getEncodingValue(BaseOp.getReg());
+
+  const int64_t AlignVal = AlignOp.getImm();
+  unsigned AlignCode = 0;
+  if (AlignVal == 2 || AlignVal == 4 || AlignVal == 8)
+    AlignCode = 0x01;
+  else if (AlignVal == 16)
+    AlignCode = 0x03;
+
+  return BaseEnc | (AlignCode << 4);
+}
+
+/// getAddrMode6OffsetOpValue - Encode the addrmode6 offset register. A
+/// register number of zero is encoded as the fixed value 0x0D; otherwise the
+/// register's encoding value is returned.
+unsigned ARMMCCodeEmitter::
+getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand &OffsetOp = MI.getOperand(Op);
+  return OffsetOp.getReg() == 0
+             ? 0x0D
+             : CTX.getRegisterInfo()->getEncodingValue(OffsetOp.getReg());
+}
+
+/// getShiftRight8Imm - Encode a right-shift amount as 8 minus the operand's
+/// immediate.
+unsigned ARMMCCodeEmitter::
+getShiftRight8Imm(const MCInst &MI, unsigned Op,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  const int64_t ShiftAmt = MI.getOperand(Op).getImm();
+  return 8 - ShiftAmt;
+}
+
+/// getShiftRight16Imm - Encode a right-shift amount as 16 minus the
+/// operand's immediate.
+unsigned ARMMCCodeEmitter::
+getShiftRight16Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const int64_t ShiftAmt = MI.getOperand(Op).getImm();
+  return 16 - ShiftAmt;
+}
+
+/// getShiftRight32Imm - Encode a right-shift amount as 32 minus the
+/// operand's immediate.
+unsigned ARMMCCodeEmitter::
+getShiftRight32Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const int64_t ShiftAmt = MI.getOperand(Op).getImm();
+  return 32 - ShiftAmt;
+}
+
+/// getShiftRight64Imm - Encode a right-shift amount as 64 minus the
+/// operand's immediate.
+unsigned ARMMCCodeEmitter::
+getShiftRight64Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const int64_t ShiftAmt = MI.getOperand(Op).getImm();
+  return 64 - ShiftAmt;
+}
+
+/// encodeInstruction - Emit the binary encoding of MI to OS, appending any
+/// fixups it needs to Fixups. Pseudo instructions produce no output. Only
+/// 2- and 4-byte instructions are supported.
+void ARMMCCodeEmitter::
+encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+
+  // Pseudo instructions don't get encoded.
+  if ((Desc.TSFlags & ARMII::FormMask) == ARMII::Pseudo)
+    return;
+
+  const int Size = Desc.getSize();
+  if (Size != 2 && Size != 4)
+    llvm_unreachable("Unexpected instruction size!");
+
+  const uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+
+  if (isThumb(STI) && Size == 4) {
+    // Thumb 32-bit wide instructions are written as two halfwords, high
+    // order halfword first.
+    EmitConstant(Binary >> 16, 2, OS);
+    EmitConstant(Binary & 0xffff, 2, OS);
+  } else {
+    EmitConstant(Binary, Size, OS);
+  }
+
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+#include "ARMGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
new file mode 100644
index 000000000000..2063ca6bdf3b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -0,0 +1,41 @@
+//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "armmcexpr"
+
+/// Build an ARMMCExpr of the given kind wrapping Expr. The object is
+/// allocated through Ctx's placement-new overload.
+const ARMMCExpr*
+ARMMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+                  MCContext &Ctx) {
+  const ARMMCExpr *Wrapped = new (Ctx) ARMMCExpr(Kind, Expr);
+  return Wrapped;
+}
+
+/// Print the expression as its ':upper16:' or ':lower16:' prefix followed
+/// by the sub-expression. The sub-expression is parenthesised unless it is a
+/// plain symbol reference.
+void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  switch (Kind) {
+  case VK_ARM_HI16:
+    OS << ":upper16:";
+    break;
+  case VK_ARM_LO16:
+    OS << ":lower16:";
+    break;
+  default:
+    llvm_unreachable("Invalid kind!");
+  }
+
+  const MCExpr *Sub = getSubExpr();
+  const bool NeedsParens = Sub->getKind() != MCExpr::SymbolRef;
+  if (NeedsParens)
+    OS << '(';
+  Sub->print(OS, MAI);
+  if (NeedsParens)
+    OS << ')';
+}
+
+/// Forward the visit to the wrapped sub-expression.
+void ARMMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  const MCExpr &Sub = *getSubExpr();
+  Streamer.visitUsedExpr(Sub);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
new file mode 100644
index 000000000000..75dde8008fca
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -0,0 +1,79 @@
+//===-- ARMMCExpr.h - ARM specific MC expression classes --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+/// ARMMCExpr - Target-specific MC expression that tags a sub-expression with
+/// an ARM ':upper16:' or ':lower16:' modifier.
+class ARMMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_ARM_None,
+ VK_ARM_HI16, // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file)
+ VK_ARM_LO16 // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
+ };
+
+private:
+ const VariantKind Kind; // Which modifier this expression carries.
+ const MCExpr *Expr; // The wrapped sub-expression.
+
+ // Private: instances are built through the static create* functions below.
+ explicit ARMMCExpr(VariantKind Kind, const MCExpr *Expr)
+ : Kind(Kind), Expr(Expr) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const ARMMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
+
+ /// createUpper16 - Wrap Expr with the :upper16: (VK_ARM_HI16) kind.
+ static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx) {
+ return create(VK_ARM_HI16, Expr, Ctx);
+ }
+
+ /// createLower16 - Wrap Expr with the :lower16: (VK_ARM_LO16) kind.
+ static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx) {
+ return create(VK_ARM_LO16, Expr, Ctx);
+ }
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getKind - Get the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// getSubExpr - Get the child of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ /// @}
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ // Always reports failure: this expression is never evaluated as a
+ // relocatable value here.
+ bool evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override {
+ return false;
+ }
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+ // Delegates to the wrapped sub-expression.
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
+ }
+
+ // There are no TLS ARMMCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+ // Any MCExpr of kind Target is accepted by isa/cast/dyn_cast.
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
new file mode 100644
index 000000000000..9e4d202321e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -0,0 +1,331 @@
+//===-- ARMMCTargetDesc.cpp - ARM Target Descriptions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseInfo.h"
+#include "ARMMCAsmInfo.h"
+#include "ARMMCTargetDesc.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_MC_DESC
+#include "ARMGenRegisterInfo.inc"
+
+/// Report whether the instruction \p MI matches one of the CP15 barrier
+/// encodings that are flagged as deprecated when the subtarget has HasV7Ops.
+///
+/// On a match, \p Info receives the diagnostic text and true is returned;
+/// otherwise \p Info is left untouched and false is returned.
+static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+                                  std::string &Info) {
+  // True when operand Idx is an immediate equal to Expected.
+  auto ImmIs = [&MI](unsigned Idx, int64_t Expected) {
+    const auto &Op = MI.getOperand(Idx);
+    return Op.isImm() && Op.getImm() == Expected;
+  };
+
+  if (!STI.getFeatureBits()[llvm::ARM::HasV7Ops])
+    return false;
+
+  // Common prefix of all three encodings: mcr p15, #0, rX, c7, ...
+  if (!ImmIs(0, 15) || !ImmIs(1, 0) || !ImmIs(3, 7))
+    return false;
+
+  if (ImmIs(5, 4)) {
+    // Deprecated CP15ISB encoding: mcr p15, #0, rX, c7, c5, #4
+    if (ImmIs(4, 5)) {
+      Info = "deprecated since v7, use 'isb'";
+      return true;
+    }
+
+    // Deprecated CP15DSB encoding: mcr p15, #0, rX, c7, c10, #4
+    if (ImmIs(4, 10)) {
+      Info = "deprecated since v7, use 'dsb'";
+      return true;
+    }
+  }
+
+  // Deprecated CP15DMB encoding: mcr p15, #0, rX, c7, c10, #5
+  if (ImmIs(4, 10) && ImmIs(5, 5)) {
+    Info = "deprecated since v7, use 'dmb'";
+    return true;
+  }
+
+  return false;
+}
+
+/// Flag an IT instruction as deprecated when the subtarget has HasV8Ops and
+/// operand 1 is an immediate other than 8.
+///
+/// \p Info is written only when true is returned.
+static bool getITDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+                                 std::string &Info) {
+  if (!STI.getFeatureBits()[llvm::ARM::HasV8Ops])
+    return false;
+
+  const auto &Op1 = MI.getOperand(1);
+  if (!Op1.isImm() || Op1.getImm() == 8)
+    return false;
+
+  Info = "applying IT instruction to more than one subsequent instruction is "
+         "deprecated";
+  return true;
+}
+
+/// Scan the register operands of \p MI from index 4 onwards and report a
+/// deprecation when SP or PC appears among them.
+///
+/// \p Info is written only when true is returned.
+static bool getARMStoreDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+                                       std::string &Info) {
+  assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] &&
+         "cannot predicate thumb instructions");
+  assert(MI.getNumOperands() >= 4 && "expected >= 4 arguments");
+
+  const unsigned NumOps = MI.getNumOperands();
+  for (unsigned Idx = 4; Idx < NumOps; ++Idx) {
+    const auto &Op = MI.getOperand(Idx);
+    assert(Op.isReg() && "expected register");
+
+    const unsigned Reg = Op.getReg();
+    if (Reg != ARM::SP && Reg != ARM::PC)
+      continue;
+
+    Info = "use of SP or PC in the list is deprecated";
+    return true;
+  }
+
+  return false;
+}
+
+/// Scan the register operands of \p MI from index 4 onwards and report a
+/// deprecation when SP appears, or when both LR and PC appear.
+///
+/// SP is reported as soon as it is encountered; the LR+PC combination is
+/// reported only after the whole list has been scanned. \p Info is written
+/// only when true is returned.
+static bool getARMLoadDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+                                      std::string &Info) {
+  assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] &&
+         "cannot predicate thumb instructions");
+  assert(MI.getNumOperands() >= 4 && "expected >= 4 arguments");
+
+  bool SawPC = false;
+  bool SawLR = false;
+  const unsigned NumOps = MI.getNumOperands();
+  for (unsigned Idx = 4; Idx < NumOps; ++Idx) {
+    const auto &Op = MI.getOperand(Idx);
+    assert(Op.isReg() && "expected register");
+
+    const unsigned Reg = Op.getReg();
+    if (Reg == ARM::SP) {
+      Info = "use of SP in the list is deprecated";
+      return true;
+    }
+    if (Reg == ARM::LR)
+      SawLR = true;
+    else if (Reg == ARM::PC)
+      SawPC = true;
+  }
+
+  if (!(SawPC && SawLR))
+    return false;
+
+  Info = "use of LR and PC simultaneously in the list is deprecated";
+  return true;
+}
+
+#define GET_INSTRINFO_MC_DESC
+#include "ARMGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "ARMGenSubtargetInfo.inc"
+
+/// Build a comma-separated feature string from the triple \p TT.
+///
+/// The result contains, in this order and only when applicable:
+///   - "+<arch name>" when the triple's arch name parses to a valid arch and
+///     \p CPU is empty or "generic";
+///   - "+thumb-mode" for thumb / thumbeb architectures;
+///   - "+nacl-trap" when the OS is NaCl.
+std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) {
+  std::string Features;
+
+  // Append one feature, inserting the separator only between entries.
+  auto AppendFeature = [&Features](const char *Feature) {
+    if (!Features.empty())
+      Features += ",";
+    Features += Feature;
+  };
+
+  const bool IsThumbArch =
+      TT.getArch() == Triple::thumb || TT.getArch() == Triple::thumbeb;
+
+  unsigned ArchID = ARM::parseArch(TT.getArchName());
+  const bool WantArchFeature =
+      ArchID != ARM::AK_INVALID && (CPU.empty() || CPU == "generic");
+  if (WantArchFeature)
+    Features = (Features + "+" + ARM::getArchName(ArchID)).str();
+
+  if (IsThumbArch)
+    AppendFeature("+thumb-mode");
+
+  if (TT.isOSNaCl())
+    AppendFeature("+nacl-trap");
+
+  return Features;
+}
+
+/// Create the ARM MCSubtargetInfo for \p TT / \p CPU.
+///
+/// The feature string handed to the generated Impl constructor is the
+/// triple-derived features from ParseARMTriple followed by \p FS, joined
+/// with a comma when both parts are non-empty.
+MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT,
+                                                  StringRef CPU, StringRef FS) {
+  std::string Features = ARM_MC::ParseARMTriple(TT, CPU);
+
+  if (FS.empty())
+    return createARMMCSubtargetInfoImpl(TT, CPU, Features);
+
+  if (Features.empty())
+    Features = FS.str();
+  else
+    Features = (Twine(Features) + "," + FS).str();
+
+  return createARMMCSubtargetInfoImpl(TT, CPU, Features);
+}
+
+/// Allocate an MCInstrInfo and fill it in via InitARMMCInstrInfo.
+/// The caller receives the raw, heap-allocated pointer.
+static MCInstrInfo *createARMMCInstrInfo() {
+  MCInstrInfo *InstrInfo = new MCInstrInfo();
+  InitARMMCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+/// Allocate an MCRegisterInfo and fill it in via InitARMMCRegisterInfo,
+/// passing ARM::LR, 0, 0 and ARM::PC after the object itself.
+/// \p Triple is not consulted. The caller receives the raw, heap-allocated
+/// pointer.
+static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  InitARMMCRegisterInfo(RegInfo, ARM::LR, 0, 0, ARM::PC);
+  return RegInfo;
+}
+
+/// Create the MCAsmInfo flavour matching \p TheTriple and seed its initial
+/// frame state.
+///
+/// Selection order: Darwin / Mach-O, then Windows with the MSVC
+/// environment, then any other Windows, and finally ELF as the fallback.
+static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
+                                     const Triple &TheTriple) {
+  MCAsmInfo *Info = nullptr;
+
+  const bool UsesMachO =
+      TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO();
+  if (UsesMachO) {
+    Info = new ARMMCAsmInfoDarwin(TheTriple);
+  } else if (TheTriple.isWindowsMSVCEnvironment()) {
+    Info = new ARMCOFFMCAsmInfoMicrosoft();
+  } else if (TheTriple.isOSWindows()) {
+    Info = new ARMCOFFMCAsmInfoGNU();
+  } else {
+    Info = new ARMELFMCAsmInfo(TheTriple);
+  }
+
+  // The initial frame state is a def-CFA on the DWARF number of SP with
+  // offset 0.
+  unsigned DwarfSP = MRI.getDwarfRegNum(ARM::SP, true);
+  Info->addInitialFrameState(
+      MCCFIInstruction::createDefCfa(nullptr, DwarfSP, 0));
+
+  return Info;
+}
+
+/// Create the ARM ELF streamer for triple \p T.
+///
+/// The last argument tells createARMELFStreamer whether the triple's
+/// architecture is thumb or thumbeb. Note that the incoming \p RelaxAll is
+/// not forwarded: a literal false is passed in the fifth position.
+static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
+                                     MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                     MCCodeEmitter *Emitter, bool RelaxAll) {
+  const bool IsThumbArch =
+      T.getArch() == Triple::thumb || T.getArch() == Triple::thumbeb;
+  return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, IsThumbArch);
+}
+
+/// Create a Mach-O streamer by forwarding to createMachOStreamer.
+///
+/// The incoming \p RelaxAll is not forwarded: a literal false is passed in
+/// the fifth position, followed by \p DWARFMustBeAtTheEnd.
+static MCStreamer *createARMMachOStreamer(MCContext &Ctx, MCAsmBackend &MAB,
+                                          raw_pwrite_stream &OS,
+                                          MCCodeEmitter *Emitter, bool RelaxAll,
+                                          bool DWARFMustBeAtTheEnd) {
+  const bool FifthArg = false;
+  MCStreamer *Streamer =
+      createMachOStreamer(Ctx, MAB, OS, Emitter, FifthArg, DWARFMustBeAtTheEnd);
+  return Streamer;
+}
+
+/// Create the ARM instruction printer.
+///
+/// Only syntax variant 0 is supported; any other value yields nullptr.
+/// \p T is not consulted.
+static MCInstPrinter *createARMMCInstPrinter(const Triple &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI) {
+  if (SyntaxVariant != 0)
+    return nullptr;
+  return new ARMInstPrinter(MAI, MII, MRI);
+}
+
+/// Create relocation info for \p TT: the ARM Mach-O flavour when the triple
+/// uses the Mach-O binary format, otherwise the stock implementation.
+static MCRelocationInfo *createARMMCRelocationInfo(const Triple &TT,
+                                                   MCContext &Ctx) {
+  return TT.isOSBinFormatMachO() ? createARMMachORelocationInfo(Ctx)
+                                 : llvm::createMCRelocationInfo(TT, Ctx);
+}
+
+namespace {
+
+/// ARM refinement of MCInstrAnalysis: a Bcc carrying the ARMCC::AL predicate
+/// is classified as an unconditional branch, and PC-relative branch targets
+/// are computed with the ARM-mode PC offset.
+class ARMMCInstrAnalysis : public MCInstrAnalysis {
+  /// True when \p Inst is a Bcc whose operand 1 holds ARMCC::AL.
+  static bool isAlwaysBcc(const MCInst &Inst) {
+    return Inst.getOpcode() == ARM::Bcc &&
+           Inst.getOperand(1).getImm() == ARMCC::AL;
+  }
+
+public:
+  ARMMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+  bool isUnconditionalBranch(const MCInst &Inst) const override {
+    // BCCs with the "always" predicate are unconditional branches; anything
+    // else is left to the generic classification.
+    return isAlwaysBcc(Inst) || MCInstrAnalysis::isUnconditionalBranch(Inst);
+  }
+
+  bool isConditionalBranch(const MCInst &Inst) const override {
+    // BCCs with the "always" predicate are unconditional branches, so they
+    // are never reported as conditional.
+    return !isAlwaysBcc(Inst) && MCInstrAnalysis::isConditionalBranch(Inst);
+  }
+
+  /// Compute the branch target of \p Inst located at \p Addr.
+  /// Returns false, leaving \p Target untouched, unless operand 0 is
+  /// described as PC-relative. \p Size is not used.
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                      uint64_t Size, uint64_t &Target) const override {
+    // We only handle PCRel branches for now.
+    const auto &Desc = Info->get(Inst.getOpcode());
+    if (Desc.OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
+      return false;
+
+    const int64_t Disp = Inst.getOperand(0).getImm();
+    // FIXME: This is not right for thumb.
+    // In ARM mode the PC is always off by 8 bytes.
+    Target = Addr + Disp + 8;
+    return true;
+  }
+};
+
+}
+
+/// Factory handed to the target registry; the caller receives the raw,
+/// heap-allocated pointer.
+static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
+  return new ARMMCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+//
+// Registers the ARM MC-layer factories with the target registry for the four
+// targets returned by getTheARMLETarget(), getTheARMBETarget(),
+// getTheThumbLETarget() and getTheThumbBETarget(). Factories that do not
+// depend on endianness or instruction set are registered in one loop over all
+// four; code emitters are split by endianness, and asm backends are registered
+// individually per target.
+extern "C" void LLVMInitializeARMTargetMC() {
+ for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
+ &getTheThumbLETarget(), &getTheThumbBETarget()}) {
+ // Register the MC asm info.
+ // (Done by constructing a RegisterMCAsmInfoFn object; X is otherwise
+ // unused.)
+ RegisterMCAsmInfoFn X(*T, createARMMCAsmInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(*T, createARMMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(*T, createARMMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(*T,
+ ARM_MC::createARMMCSubtargetInfo);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis);
+
+ // Register the object streamers for each supported object format.
+ TargetRegistry::RegisterELFStreamer(*T, createELFStreamer);
+ TargetRegistry::RegisterCOFFStreamer(*T, createARMWinCOFFStreamer);
+ TargetRegistry::RegisterMachOStreamer(*T, createARMMachOStreamer);
+
+ // Register the obj target streamer.
+ TargetRegistry::RegisterObjectTargetStreamer(*T,
+ createARMObjectTargetStreamer);
+
+ // Register the asm streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createARMTargetAsmStreamer);
+
+ // Register the null TargetStreamer.
+ TargetRegistry::RegisterNullTargetStreamer(*T, createARMNullTargetStreamer);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(*T, createARMMCInstPrinter);
+
+ // Register the MC relocation info.
+ TargetRegistry::RegisterMCRelocationInfo(*T, createARMMCRelocationInfo);
+ }
+
+ // Register the MC Code Emitter
+ // Little-endian targets get the LE emitter, big-endian targets the BE one.
+ for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()})
+ TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter);
+ for (Target *T : {&getTheARMBETarget(), &getTheThumbBETarget()})
+ TargetRegistry::RegisterMCCodeEmitter(*T, createARMBEMCCodeEmitter);
+
+ // Register the asm backend.
+ // Each of the four targets has its own backend factory.
+ TargetRegistry::RegisterMCAsmBackend(getTheARMLETarget(),
+ createARMLEAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(getTheARMBETarget(),
+ createARMBEAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(getTheThumbLETarget(),
+ createThumbLEAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(getTheThumbBETarget(),
+ createThumbBEAsmBackend);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
new file mode 100644
index 000000000000..ba834201e585
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -0,0 +1,131 @@
+//===-- ARMMCTargetDesc.h - ARM Target Descriptions -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class formatted_raw_ostream;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCInstPrinter;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCStreamer;
+class MCTargetOptions;
+class MCRelocationInfo;
+class MCTargetStreamer;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheARMLETarget();
+Target &getTheThumbLETarget();
+Target &getTheARMBETarget();
+Target &getTheThumbBETarget();
+
+namespace ARM_MC {
+std::string ParseARMTriple(const Triple &TT, StringRef CPU);
+
+/// Create an ARM MCSubtargetInfo instance. This is exposed so Asm parser, etc.
+/// do not need to go through TargetRegistry.
+MCSubtargetInfo *createARMMCSubtargetInfo(const Triple &TT, StringRef CPU,
+ StringRef FS);
+}
+
+MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S);
+MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm);
+MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI);
+
+MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options,
+ bool IsLittleEndian);
+
+MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+MCAsmBackend *createThumbLEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+MCAsmBackend *createThumbBEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+// Construct a PE/COFF machine code streamer which will generate a PE/COFF
+// object file.
+MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible);
+
+/// Construct an ARM ELF object writer.
+MCObjectWriter *createARMELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
+ bool IsLittleEndian);
+
+/// Construct an ARM Mach-O object writer.
+MCObjectWriter *createARMMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
+
+/// Construct an ARM PE/COFF object writer.
+MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit);
+
+/// Construct ARM Mach-O relocation info.
+MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
+} // End llvm namespace
+
+// Defines symbolic names for ARM registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "ARMGenRegisterInfo.inc"
+
+// Defines symbolic names for the ARM instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "ARMGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "ARMGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
new file mode 100644
index 000000000000..482bcf902518
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -0,0 +1,43 @@
+//===-- ARMMachORelocationInfo.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "ARMMCExpr.h"
+#include "llvm-c/Disassembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
+
+using namespace llvm;
+using namespace object;
+
+namespace {
+/// Mach-O relocation info for ARM: maps the C-API :upper16: / :lower16:
+/// variant kinds onto ARMMCExpr nodes and defers every other kind to the
+/// base class.
+class ARMMachORelocationInfo : public MCRelocationInfo {
+public:
+  ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+  /// Wrap \p SubExpr according to \p VariantKind.
+  const MCExpr *createExprForCAPIVariantKind(const MCExpr *SubExpr,
+                                             unsigned VariantKind) override {
+    if (VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
+      return ARMMCExpr::createUpper16(SubExpr, Ctx);
+
+    if (VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
+      return ARMMCExpr::createLower16(SubExpr, Ctx);
+
+    // Not an ARM-specific kind: let the generic implementation decide.
+    return MCRelocationInfo::createExprForCAPIVariantKind(SubExpr,
+                                                          VariantKind);
+  }
+};
+} // End unnamed namespace
+
+/// createARMMachORelocationInfo - Construct an ARM Mach-O RelocationInfo.
+/// The caller receives the raw, heap-allocated pointer.
+MCRelocationInfo *llvm::createARMMachORelocationInfo(MCContext &Ctx) {
+  return new ARMMachORelocationInfo(Ctx);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
new file mode 100644
index 000000000000..b77181f29b2d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -0,0 +1,486 @@
+//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+/// Mach-O target writer for ARM. recordRelocation() is the public entry
+/// point; the private helpers emit the scattered relocation forms and decide
+/// between external and internal relocations.
+class ARMMachObjectWriter : public MCMachObjectTargetWriter {
+ /// Emit a scattered relocation of the given Type / Log2Size for Fixup,
+ /// preceded by a PAIR entry for the section-difference types. FixedValue is
+ /// adjusted in place.
+ void RecordARMScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Type,
+ unsigned Log2Size,
+ uint64_t &FixedValue);
+ /// Emit a scattered ARM_RELOC_HALF / ARM_RELOC_HALF_SECTDIFF relocation for
+ /// a movw/movt fixup. FixedValue is adjusted in place.
+ void RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue);
+
+ /// Decide whether a relocation against symbol S must be external, either
+ /// because the symbol itself requires it or because a BR24 / BR22 branch
+ /// would otherwise be out of range.
+ bool requiresExternRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCFragment &Fragment, unsigned RelocType,
+ const MCSymbol &S, uint64_t FixedValue);
+
+public:
+ ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+ : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
+
+ /// Translate Fixup into zero or more Mach-O relocation entries added to
+ /// Writer, updating FixedValue as needed.
+ void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+ const MCAsmLayout &Layout, const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) override;
+};
+}
+
+/// Map a fixup kind onto the Mach-O relocation type and r_length value used
+/// to encode it.
+///
+/// \p RelocType and \p Log2Size are always written: they start out as
+/// ARM_RELOC_VANILLA and ~0U and are refined for supported kinds.
+/// \returns true when \p Kind has a relocation encoding, false otherwise.
+static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
+                                     unsigned &Log2Size) {
+  RelocType = unsigned(MachO::ARM_RELOC_VANILLA);
+  Log2Size = ~0U;
+
+  switch (Kind) {
+  // Plain data fixups keep the vanilla type; only the size varies.
+  case FK_Data_1:
+    Log2Size = llvm::Log2_32(1);
+    break;
+  case FK_Data_2:
+    Log2Size = llvm::Log2_32(2);
+    break;
+  case FK_Data_4:
+    Log2Size = llvm::Log2_32(4);
+    break;
+  case FK_Data_8:
+    Log2Size = llvm::Log2_32(8);
+    break;
+
+  // Handle 24-bit branch kinds.
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+    RelocType = unsigned(MachO::ARM_RELOC_BR24);
+    // Report as 'long', even though that is not quite accurate.
+    Log2Size = llvm::Log2_32(4);
+    break;
+
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+    RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
+    Log2Size = llvm::Log2_32(4);
+    break;
+
+  // For movw/movt r_type relocations they always have a pair following them
+  // and the r_length bits are used differently. The encoding of the r_length
+  // is as follows:
+  //   low bit of r_length:
+  //     0 - :lower16: for movw instructions
+  //     1 - :upper16: for movt instructions
+  //   high bit of r_length:
+  //     0 - arm instructions
+  //     1 - thumb instructions
+  case ARM::fixup_arm_movw_lo16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 0;
+    break;
+  case ARM::fixup_arm_movt_hi16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 1;
+    break;
+  case ARM::fixup_t2_movw_lo16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 2;
+    break;
+  case ARM::fixup_t2_movt_hi16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 3;
+    break;
+
+  // These fixups are expected to always be resolvable at assembly time and
+  // have no relocations supported.
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_thumb_br:
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+// Emit a scattered relocation for a movw/movt fixup.
+//
+// The relocation type is ARM_RELOC_HALF, or ARM_RELOC_HALF_SECTDIFF when
+// Target carries a subtracted symbol (SymB); only the latter form emits a
+// PAIR entry here. FixedValue is adjusted in place: the section address of
+// A's section is added and, when present, that of B's section is subtracted.
+// If either symbol has no fragment, an error is reported on the context and
+// the function returns without adding any relocation (FixedValue may already
+// have been modified by then for the B case).
+void ARMMachObjectWriter::
+RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Type = MachO::ARM_RELOC_HALF;
+
+ // See <reloc.h>.
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+
+ if (!A->getFragment()) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+ return;
+ }
+
+ // Value / Value2 become r_word1 of the main and PAIR entries respectively.
+ uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+ uint32_t Value2 = 0;
+ uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+ FixedValue += SecAddr;
+
+ if (const MCSymbolRefExpr *B = Target.getSymB()) {
+ const MCSymbol *SB = &B->getSymbol();
+
+ if (!SB->getFragment()) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "symbol '" + B->getSymbol().getName() +
+ "' can not be undefined in a subtraction expression");
+ return;
+ }
+
+ // Select the appropriate difference relocation type.
+ Type = MachO::ARM_RELOC_HALF_SECTDIFF;
+ Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+ FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
+ }
+
+ // Relocations are written out in reverse order, so the PAIR comes first.
+ // ARM_RELOC_HALF and ARM_RELOC_HALF_SECTDIFF abuse the r_length field:
+ //
+ // For these two r_type relocations they always have a pair following them and
+ // the r_length bits are used differently. The encoding of the r_length is as
+ // follows:
+ // low bit of r_length:
+ // 0 - :lower16: for movw instructions
+ // 1 - :upper16: for movt instructions
+ // high bit of r_length:
+ // 0 - arm instructions
+ // 1 - thumb instructions
+ // the other half of the relocated expression is in the following pair
+ // relocation entry in the low 16 bits of r_address field.
+ unsigned ThumbBit = 0;
+ unsigned MovtBit = 0;
+ switch ((unsigned)Fixup.getKind()) {
+ default: break;
+ case ARM::fixup_arm_movt_hi16:
+ MovtBit = 1;
+ // The thumb bit shouldn't be set in the 'other-half' bit of the
+ // relocation, but it will be set in FixedValue if the base symbol
+ // is a thumb function. Clear it out here.
+ if (Asm.isThumbFunc(A))
+ FixedValue &= 0xfffffffe;
+ break;
+ case ARM::fixup_t2_movt_hi16:
+ if (Asm.isThumbFunc(A))
+ FixedValue &= 0xfffffffe;
+ MovtBit = 1;
+ // The thumb movt case shares the ThumbBit assignment with thumb movw.
+ LLVM_FALLTHROUGH;
+ case ARM::fixup_t2_movw_lo16:
+ ThumbBit = 1;
+ break;
+ }
+
+ if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
+ // The PAIR carries the half of FixedValue that is NOT encoded in the
+ // instruction: the low 16 bits for movt, the high 16 bits for movw.
+ uint32_t OtherHalf = MovtBit
+ ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16);
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((OtherHalf << 0) |
+ (MachO::ARM_RELOC_PAIR << 24) |
+ (MovtBit << 28) |
+ (ThumbBit << 29) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value2;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ }
+
+ // Main entry: fixup offset in the low bits, type at bit 24, the movt/thumb
+ // flags at bits 28/29 and the pc-relative flag at bit 30.
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((FixupOffset << 0) |
+ (Type << 24) |
+ (MovtBit << 28) |
+ (ThumbBit << 29) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+}
+
+// Emit a scattered relocation of the given Type and Log2Size for Fixup.
+//
+// When Target carries a subtracted symbol (SymB) the type is switched to
+// ARM_RELOC_SECTDIFF and a PAIR entry holding B's address is emitted as well.
+// FixedValue is adjusted in place: the section address of A's section is
+// added and, when present, that of B's section is subtracted. If either
+// symbol has no fragment, an error is reported on the context and the
+// function returns without adding any relocation.
+void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Type,
+ unsigned Log2Size,
+ uint64_t &FixedValue) {
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+
+ // See <reloc.h>.
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+
+ if (!A->getFragment()) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+ return;
+ }
+
+ // Value / Value2 become r_word1 of the main and PAIR entries respectively.
+ uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+ uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+ FixedValue += SecAddr;
+ uint32_t Value2 = 0;
+
+ if (const MCSymbolRefExpr *B = Target.getSymB()) {
+ assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols");
+ const MCSymbol *SB = &B->getSymbol();
+
+ if (!SB->getFragment()) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "symbol '" + B->getSymbol().getName() +
+ "' can not be undefined in a subtraction expression");
+ return;
+ }
+
+ // Select the appropriate difference relocation type.
+ Type = MachO::ARM_RELOC_SECTDIFF;
+ Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+ FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
+ }
+
+ // Relocations are written out in reverse order, so the PAIR comes first.
+ if (Type == MachO::ARM_RELOC_SECTDIFF ||
+ Type == MachO::ARM_RELOC_LOCAL_SECTDIFF) {
+ // The PAIR entry has a zero address field and B's address in r_word1.
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((0 << 0) |
+ (MachO::ARM_RELOC_PAIR << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value2;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ }
+
+ // Main entry: fixup offset in the low bits, type at bit 24, Log2Size at
+ // bit 28 and the pc-relative flag at bit 30.
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((FixupOffset << 0) |
+ (Type << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+}
+
+/// Decide whether the relocation against \p S has to be external.
+///
+/// That is the case when the symbol itself requires it, or when the
+/// relocation is an ARM_RELOC_BR24 / ARM_THUMB_RELOC_BR22 branch whose
+/// displacement would not fit an internal relocation. Every other relocation
+/// type is internal unless the symbol says otherwise.
+bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
+                                                   const MCAssembler &Asm,
+                                                   const MCFragment &Fragment,
+                                                   unsigned RelocType,
+                                                   const MCSymbol &S,
+                                                   uint64_t FixedValue) {
+  // Most cases can be identified purely from the symbol.
+  if (Writer->doesSymbolRequireExternRelocation(S))
+    return true;
+
+  // The displacement is signed.
+  int64_t Disp = (int64_t)FixedValue;
+  int64_t Limit;
+  switch (RelocType) {
+  case MachO::ARM_RELOC_BR24:
+    // PC pre-adjustment of 8 for these instructions.
+    Disp -= 8;
+    // ARM BL/BLX has a 25-bit offset.
+    Limit = 0x1ffffff;
+    break;
+  case MachO::ARM_THUMB_RELOC_BR22:
+    // PC pre-adjustment of 4 for these instructions.
+    Disp -= 4;
+    // Thumb BL/BLX has a 24-bit offset.
+    Limit = 0xffffff;
+    break;
+  default:
+    return false;
+  }
+
+  // BL/BLX also use external relocations when an internal relocation
+  // would result in the target being out of range. This gives the linker
+  // enough information to generate a branch island.
+  Disp += Writer->getSectionAddress(&S.getSection());
+  Disp -= Writer->getSectionAddress(Fragment.getParent());
+
+  // If the resultant value would be out of range for an internal relocation,
+  // use an external instead.
+  return Disp > Limit || Disp < -(Limit + 1);
+}
+
+// Translate Fixup into Mach-O relocation entries and add them to Writer.
+//
+// Outline:
+// 1. Look up the relocation type and size for the fixup kind; report an
+// error and return if the kind has none.
+// 2. Differences (Target has a SymB) are delegated to the scattered helpers.
+// 3. A non-zero offset against a symbol that does not require an external
+// relocation is also delegated to the scattered helper, except for
+// ARM_RELOC_HALF.
+// 4. Otherwise a plain relocation_info entry is built, external or internal
+// as decided by requiresExternRelocation(), with an extra PAIR entry for
+// ARM_RELOC_HALF.
+// FixedValue is updated in place along the way.
+void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
+ MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Log2Size;
+ unsigned RelocType = MachO::ARM_RELOC_VANILLA;
+ if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) {
+ // If we failed to get fixup kind info, it's because there's no legal
+ // relocation type for the fixup kind. This happens when it's a fixup that's
+ // expected to always be resolvable at assembly time and not have any
+ // relocations needed.
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation on symbol");
+ return;
+ }
+
+ // If this is a difference or a defined symbol plus an offset, then we need a
+ // scattered relocation entry. Differences always require scattered
+ // relocations.
+ if (Target.getSymB()) {
+ if (RelocType == MachO::ARM_RELOC_HALF)
+ return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment,
+ Fixup, Target, FixedValue);
+ return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+ Target, RelocType, Log2Size,
+ FixedValue);
+ }
+
+ // Get the symbol data, if any.
+ const MCSymbol *A = nullptr;
+ if (Target.getSymA())
+ A = &Target.getSymA()->getSymbol();
+
+ // FIXME: For other platforms, we need to use scattered relocations for
+ // internal relocations with offsets. If this is an internal relocation with
+ // an offset, it also needs a scattered relocation entry.
+ //
+ // Is this right for ARM?
+ uint32_t Offset = Target.getConstant();
+ if (IsPCRel && RelocType == MachO::ARM_RELOC_VANILLA)
+ Offset += 1 << Log2Size;
+ if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) &&
+ RelocType != MachO::ARM_RELOC_HALF)
+ return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+ Target, RelocType, Log2Size,
+ FixedValue);
+
+ // See <reloc.h>.
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned Index = 0;
+ unsigned Type = 0;
+ // Stays null for internal relocations; set to A for external ones.
+ const MCSymbol *RelSymbol = nullptr;
+
+ if (Target.isAbsolute()) { // constant
+ // FIXME!
+ report_fatal_error("FIXME: relocations to absolute targets "
+ "not yet implemented");
+ } else {
+ // Resolve constant variables.
+ // If the variable evaluates to an absolute value, that value becomes the
+ // fixed value and no relocation is emitted at all.
+ if (A->isVariable()) {
+ int64_t Res;
+ if (A->getVariableValue()->evaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+ }
+
+ // Check whether we need an external or internal relocation.
+ if (requiresExternRelocation(Writer, Asm, *Fragment, RelocType, *A,
+ FixedValue)) {
+ RelSymbol = A;
+
+ // For external relocations, make sure to offset the fixup value to
+ // compensate for the addend of the symbol address, if it was
+ // undefined. This occurs with weak definitions, for example.
+ if (!A->isUndefined())
+ FixedValue -= Layout.getSymbolOffset(*A);
+ } else {
+ // The index is the section ordinal (1-based).
+ const MCSection &Sec = A->getSection();
+ Index = Sec.getOrdinal() + 1;
+ FixedValue += Writer->getSectionAddress(&Sec);
+ }
+ if (IsPCRel)
+ FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+ // The type is determined by the fixup kind.
+ Type = RelocType;
+ }
+
+ // struct relocation_info (8 bytes)
+ // r_word1 packs the index in the low bits, the pc-relative flag at bit 24,
+ // Log2Size at bit 25 and the type at bit 28.
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+
+ // Even when it's not a scattered relocation, movw/movt always uses
+ // a PAIR relocation.
+ if (Type == MachO::ARM_RELOC_HALF) {
+ // The entire addend is needed to correctly apply a relocation. One half is
+ // extracted from the instruction itself, the other comes from this
+ // PAIR. I.e. it's correct that we insert the high bits of the addend in the
+ // MOVW case here.
+ uint32_t Value = 0;
+ switch ((unsigned)Fixup.getKind()) {
+ default: break;
+ case ARM::fixup_arm_movw_lo16:
+ case ARM::fixup_t2_movw_lo16:
+ Value = (FixedValue >> 16) & 0xffff;
+ break;
+ case ARM::fixup_arm_movt_hi16:
+ case ARM::fixup_t2_movt_hi16:
+ Value = FixedValue & 0xffff;
+ break;
+ }
+ MachO::any_relocation_info MREPair;
+ MREPair.r_word0 = Value;
+ MREPair.r_word1 = ((0xffffff << 0) |
+ (Log2Size << 25) |
+ (MachO::ARM_RELOC_PAIR << 28));
+
+ // The PAIR is added before the main entry.
+ Writer->addRelocation(nullptr, Fragment->getParent(), MREPair);
+ }
+
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+/// Construct an ARM Mach-O object writer that emits to \p OS.
+///
+/// A fresh ARMMachObjectWriter is allocated and handed to
+/// createMachObjectWriter together with the stream; the writer is always
+/// requested as little-endian.
+MCObjectWriter *llvm::createARMMachObjectWriter(raw_pwrite_stream &OS,
+                                                bool Is64Bit, uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  auto *TargetWriter = new ARMMachObjectWriter(Is64Bit, CPUType, CPUSubtype);
+  return createMachObjectWriter(TargetWriter, OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
new file mode 100644
index 000000000000..c0d10c896354
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -0,0 +1,77 @@
+//===- ARMTargetStreamer.cpp - ARMTargetStreamer class --*- C++ -*---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMTargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+//
+// ARMTargetStreamer Implementation
+//
+ARMTargetStreamer::ARMTargetStreamer(MCStreamer &S) // binds this target streamer to S via the base class
+ : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {} // starts with a default-constructed pool object
+
+ARMTargetStreamer::~ARMTargetStreamer() {} // empty body: no explicit cleanup is performed here
+
+// The constant pool handling is shared by all ARMTargetStreamer implementations.
+// addConstantPoolEntry() - hand Expr to the shared pool and return what it yields.
+const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr, SMLoc Loc) {
+ return ConstantPools->addEntry(Streamer, Expr, 4, Loc); // 4 is the size argument (presumably bytes - confirm against addEntry)
+}
+
+void ARMTargetStreamer::emitCurrentConstantPool() { // forwards to the shared pool object
+ ConstantPools->emitForCurrentSection(Streamer); // only the current section's pool, going by the callee's name
+}
+
+// finish() - write out any non-empty assembler constant pools (delegates to emitAll()).
+void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+
+// reset() - Reset any state. This base implementation is empty and touches no members.
+void ARMTargetStreamer::reset() {}
+
+// The remaining callbacks should be handled separately by each
+// streamer. Every definition below has an empty body and ignores its arguments.
+void ARMTargetStreamer::emitFnStart() {}
+void ARMTargetStreamer::emitFnEnd() {}
+void ARMTargetStreamer::emitCantUnwind() {}
+void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {}
+void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {}
+void ARMTargetStreamer::emitHandlerData() {}
+void ARMTargetStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+ int64_t Offset) {}
+void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {}
+void ARMTargetStreamer::emitPad(int64_t Offset) {}
+void ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) {}
+void ARMTargetStreamer::emitUnwindRaw(int64_t StackOffset,
+ const SmallVectorImpl<uint8_t> &Opcodes) {
+}
+void ARMTargetStreamer::switchVendor(StringRef Vendor) {}
+void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {}
+void ARMTargetStreamer::emitTextAttribute(unsigned Attribute,
+ StringRef String) {}
+void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
+ unsigned IntValue,
+ StringRef StringValue) {}
+void ARMTargetStreamer::emitArch(unsigned Arch) {}
+void ARMTargetStreamer::emitArchExtension(unsigned ArchExt) {}
+void ARMTargetStreamer::emitObjectArch(unsigned Arch) {}
+void ARMTargetStreamer::emitFPU(unsigned FPU) {}
+void ARMTargetStreamer::finishAttributeSection() {}
+void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {}
+void
+ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {}
+
+void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
new file mode 100644
index 000000000000..173cc93d44fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -0,0 +1,196 @@
+//===-- ARMUnwindOpAsm.cpp - ARM Unwind Opcodes Assembler -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the unwind opcode assembler for ARM exception handling
+// table.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMUnwindOpAsm.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+
+namespace {
+ /// UnwindOpcodeStreamer - The simple wrapper over SmallVector to emit bytes
+ /// with MSB to LSB per uint32_t ordering. For example, the first byte will
+ /// be placed in Vec[3], and the following bytes will be placed in 2, 1, 0,
+ /// 7, 6, 5, 4, 11, 10, 9, 8, and so on.
+ class UnwindOpcodeStreamer {
+ private:
+ SmallVectorImpl<uint8_t> &Vec; // destination buffer, held by reference (never resized by this class)
+ size_t Pos; // index in Vec of the next byte to write
+
+ public:
+ UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V), Pos(3) { // first byte lands in Vec[3]
+ }
+
+ /// Emit the byte in MSB to LSB per uint32_t order.
+ inline void EmitByte(uint8_t elem) {
+ Vec[Pos] = elem; // plain indexing: Vec must already be large enough
+ Pos = (((Pos ^ 0x3u) + 1) ^ 0x3u); // flip the low two bits, step forward, flip back: 3,2,1,0,7,6,5,4,...
+ }
+
+ /// Emit the size prefix.
+ inline void EmitSize(size_t Size) {
+ size_t SizeInWords = (Size + 3) / 4; // byte count rounded up to whole 4-byte words
+ assert(SizeInWords <= 0x100u &&
+ "Only 256 additional words are allowed for unwind opcodes");
+ EmitByte(static_cast<uint8_t>(SizeInWords - 1)); // stored as (word count - 1) so it fits one byte
+ }
+
+ /// Emit the personality index prefix.
+ inline void EmitPersonalityIndex(unsigned PI) {
+ assert(PI < ARM::EHABI::NUM_PERSONALITY_INDEX &&
+ "Invalid personality prefix");
+ EmitByte(ARM::EHABI::EHT_COMPACT | PI); // one byte: EHT_COMPACT flag bits OR'ed with the index
+ }
+
+ /// Fill the rest of bytes with FINISH opcode.
+ inline void FillFinishOpcode() {
+ while (Pos < Vec.size()) // ends once the flipped index steps past the last word
+ EmitByte(ARM::EHABI::UNWIND_OPCODE_FINISH);
+ }
+ };
+}
+
+void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) { // RegSave: bit N set means register rN is in the list
+ if (RegSave == 0u) // empty mask: emit nothing
+ return;
+
+ // One byte opcode to save register r14 and r11-r4
+ if (RegSave & (1u << 4)) {
+ // The one byte opcode will always save r4, thus we can't use the one byte
+ // opcode when r4 is not in .save directive.
+
+ // Compute the consecutive registers from r4 to r11.
+ uint32_t Mask = RegSave & 0xff0u; // bits 4..11 only
+ uint32_t Range = countTrailingOnes(Mask >> 5); // Exclude r4.
+ // Mask off non-consecutive registers. Keep r4.
+ Mask &= ~(0xffffffe0u << Range); // clears every bit above (4 + Range)
+
+ // Emit this opcode when the mask covers every register.
+ uint32_t UnmaskedReg = RegSave & 0xfff0u & (~Mask); // r4-r15 bits outside the consecutive run
+ if (UnmaskedReg == 0u) {
+ // Pop r[4 : (4 + n)]
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
+ RegSave &= 0x000fu; // r4 and up are handled; only r0-r3 remain for the code below
+ } else if (UnmaskedReg == (1u << 14)) {
+ // Pop r[14] + r[4 : (4 + n)]
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
+ RegSave &= 0x000fu; // same as above: keep only r0-r3
+ }
+ }
+
+ // Two bytes opcode to save register r15-r4
+ if ((RegSave & 0xfff0u) != 0) // still set if neither one-byte form applied
+ EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4)); // shift drops the r0-r3 bits
+
+ // Opcode to save register r3-r0
+ if ((RegSave & 0x000fu) != 0)
+ EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu));
+}
+
+/// Emit unwind opcodes for .vsave directives
+void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
+ // We only have 4 bits to save the offset in the opcode so look at the lower
+ // and upper 16 bits separately.
+ for (uint32_t Regs : {VFPRegSave & 0xffff0000u, VFPRegSave & 0x0000ffffu}) { // upper half is processed first
+ while (Regs) { // one opcode per run of consecutive set bits, highest run first
+ // Now look for a run of set bits. Remember the MSB and LSB of the run.
+ auto RangeMSB = 32 - countLeadingZeros(Regs); // one past the highest set bit
+ auto RangeLen = countLeadingOnes(Regs << (32 - RangeMSB)); // length of the run ending at that bit
+ auto RangeLSB = RangeMSB - RangeLen; // lowest bit index in the run
+
+ int Opcode = RangeLSB >= 16 // runs in the upper half use the _D16 opcode variant
+ ? ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16
+ : ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD;
+
+ EmitInt16(Opcode | ((RangeLSB % 16) << 4) | (RangeLen - 1)); // start index mod 16 in bits 4-7, (length - 1) in bits 0-3
+
+ // Zero out bits we're done with.
+ Regs &= ~(-1u << RangeLSB); // keeps only the bits below the run just emitted
+ }
+ }
+}
+
+/// Emit unwind opcodes to copy address from source register to $sp.
+void UnwindOpcodeAssembler::EmitSetSP(uint16_t Reg) {
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_SET_VSP | Reg); // one byte: opcode OR'ed with Reg (EmitInt8 keeps the low 8 bits)
+}
+
+/// Emit unwind opcodes to add $sp with an offset.
+void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) { // NOTE(review): the >> 2 shifts drop the low two bits; Offset is presumably a multiple of 4 - confirm
+ if (Offset > 0x200) { // too large for the short forms below: use the ULEB128 form
+ uint8_t Buff[16]; // opcode byte followed by the ULEB128 payload
+ Buff[0] = ARM::EHABI::UNWIND_OPCODE_INC_VSP_ULEB128;
+ size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1); // payload is (Offset - 0x204) / 4
+ EmitBytes(Buff, ULEBSize + 1); // +1 for the opcode byte
+ } else if (Offset > 0) {
+ if (Offset > 0x100) { // needs two opcodes: a maximal one (0x3f) that accounts for 0x100, then the rest
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP | 0x3fu);
+ Offset -= 0x100;
+ }
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP |
+ static_cast<uint8_t>((Offset - 4) >> 2)); // operand encodes (Offset - 4) / 4
+ } else if (Offset < 0) {
+ while (Offset < -0x100) { // peel off 0x100 per opcode until the remainder fits in one
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP | 0x3fu);
+ Offset += 0x100;
+ }
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP |
+ static_cast<uint8_t>(((-Offset) - 4) >> 2)); // same encoding as above, applied to the magnitude
+ } // Offset == 0 emits nothing
+}
+
+void UnwindOpcodeAssembler::Finalize(unsigned &PersonalityIndex, // in/out: may be chosen or overwritten below
+ SmallVectorImpl<uint8_t> &Result) { // out: receives the finished, 4-byte-padded opcode table
+
+ UnwindOpcodeStreamer OpStreamer(Result); // writes into Result; Result is resized below before any byte is emitted
+
+ if (HasPersonality) {
+ // User-specified personality routine: [ SIZE , OP1 , OP2 , ... ]
+ PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX;
+ size_t TotalSize = Ops.size() + 1; // +1 for the SIZE byte
+ size_t RoundUpSize = (TotalSize + 3) / 4 * 4; // round up to a multiple of 4 bytes
+ Result.resize(RoundUpSize);
+ OpStreamer.EmitSize(RoundUpSize);
+ } else {
+ // If no personality index is specified, select one
+ if (PersonalityIndex == ARM::EHABI::NUM_PERSONALITY_INDEX)
+ PersonalityIndex = (Ops.size() <= 3) ? ARM::EHABI::AEABI_UNWIND_CPP_PR0 // up to 3 opcode bytes fit the PR0 layout
+ : ARM::EHABI::AEABI_UNWIND_CPP_PR1;
+ if (PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0) {
+ // __aeabi_unwind_cpp_pr0: [ 0x80 , OP1 , OP2 , OP3 ]
+ assert(Ops.size() <= 3 && "too many opcodes for __aeabi_unwind_cpp_pr0");
+ Result.resize(4); // exactly one word
+ OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+ } else {
+ // __aeabi_unwind_cpp_pr{1,2}: [ {0x81,0x82} , SIZE , OP1 , OP2 , ... ]
+ size_t TotalSize = Ops.size() + 2; // +2 for the personality byte and the SIZE byte
+ size_t RoundUpSize = (TotalSize + 3) / 4 * 4; // round up to a multiple of 4 bytes
+ Result.resize(RoundUpSize);
+ OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+ OpStreamer.EmitSize(RoundUpSize);
+ }
+ }
+
+ // Copy the unwind opcodes, last-recorded opcode first; bytes within one opcode keep their order.
+ for (size_t i = OpBegins.size() - 1; i > 0; --i) // OpBegins[i - 1]..OpBegins[i] delimits one recorded opcode
+ for (size_t j = OpBegins[i - 1], end = OpBegins[i]; j < end; ++j)
+ OpStreamer.EmitByte(Ops[j]);
+
+ // Emit the padding finish opcodes if the size is not multiple of 4.
+ OpStreamer.FillFinishOpcode();
+
+ // Reset the assembler state
+ Reset();
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
new file mode 100644
index 000000000000..e0c113ecfaa3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -0,0 +1,93 @@
+//===-- ARMUnwindOpAsm.h - ARM Unwind Opcodes Assembler ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the unwind opcode assembler for ARM exception handling
+// table.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCSymbol;
+
+class UnwindOpcodeAssembler { // Accumulates unwind opcode bytes and the boundaries between opcodes.
+private:
+ llvm::SmallVector<uint8_t, 32> Ops; // all opcode bytes, appended in the order the Emit* calls are made
+ llvm::SmallVector<unsigned, 8> OpBegins; // OpBegins[i] is the offset in Ops where opcode i starts; the last entry equals Ops.size()
+ bool HasPersonality; // set by setPersonality(), cleared by Reset()
+
+public:
+ UnwindOpcodeAssembler()
+ : HasPersonality(0) {
+ OpBegins.push_back(0); // sentinel start offset so OpBegins is never empty
+ }
+
+ /// Reset the unwind opcode assembler.
+ void Reset() {
+ Ops.clear();
+ OpBegins.clear();
+ OpBegins.push_back(0); // restore the same initial state the constructor sets up
+ HasPersonality = 0;
+ }
+
+ /// Set the personality
+ void setPersonality(const MCSymbol *Per) {
+ HasPersonality = 1; // only the flag is recorded; Per itself is not stored here
+ }
+
+ /// Emit unwind opcodes for .save directives
+ void EmitRegSave(uint32_t RegSave);
+
+ /// Emit unwind opcodes for .vsave directives
+ void EmitVFPRegSave(uint32_t VFPRegSave);
+
+ /// Emit unwind opcodes to copy address from source register to $sp.
+ void EmitSetSP(uint16_t Reg);
+
+ /// Emit unwind opcodes to add $sp with an offset.
+ void EmitSPOffset(int64_t Offset);
+
+ /// Emit unwind raw opcodes
+ void EmitRaw(const SmallVectorImpl<uint8_t> &Opcodes) {
+ Ops.insert(Ops.end(), Opcodes.begin(), Opcodes.end());
+ OpBegins.push_back(OpBegins.back() + Opcodes.size()); // the whole raw sequence is recorded as a single opcode
+ }
+
+ /// Finalize the unwind opcode sequence for EmitBytes()
+ void Finalize(unsigned &PersonalityIndex,
+ SmallVectorImpl<uint8_t> &Result);
+
+private:
+ void EmitInt8(unsigned Opcode) { // append one byte (low 8 bits of Opcode) as a one-byte opcode
+ Ops.push_back(Opcode & 0xff);
+ OpBegins.push_back(OpBegins.back() + 1);
+ }
+
+ void EmitInt16(unsigned Opcode) { // append a two-byte opcode, high byte first
+ Ops.push_back((Opcode >> 8) & 0xff);
+ Ops.push_back(Opcode & 0xff);
+ OpBegins.push_back(OpBegins.back() + 2);
+ }
+
+ void EmitBytes(const uint8_t *Opcode, size_t Size) { // append Size bytes verbatim as one opcode
+ Ops.insert(Ops.end(), Opcode, Opcode + Size);
+ OpBegins.push_back(OpBegins.back() + Size);
+ }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
new file mode 100644
index 000000000000..166c04b41a77
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -0,0 +1,91 @@
+//===-- ARMWinCOFFObjectWriter.cpp - ARM Windows COFF Object Writer -- C++ -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+namespace {
+class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { // COFF target writer for 32-bit ARM (ARMNT machine type)
+public:
+ ARMWinCOFFObjectWriter(bool Is64Bit)
+ : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARMNT) { // machine type is fixed regardless of Is64Bit
+ assert(!Is64Bit && "AArch64 support not yet implemented"); // Is64Bit is only checked, never stored
+ }
+ ~ARMWinCOFFObjectWriter() override {}
+
+ unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, // maps a fixup (plus symbol modifier) to an IMAGE_REL_ARM_* value
+ bool IsCrossSection,
+ const MCAsmBackend &MAB) const override;
+
+ bool recordRelocation(const MCFixup &) const override; // false only for the Thumb2 movt_hi16 fixup
+};
+
+unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target, // returns the COFF relocation type for Fixup
+ const MCFixup &Fixup,
+ bool IsCrossSection, // not referenced in this body
+ const MCAsmBackend &MAB) const { // used only to name the fixup in the error message
+ assert(getMachine() == COFF::IMAGE_FILE_MACHINE_ARMNT &&
+ "AArch64 support not yet implemented");
+
+ MCSymbolRefExpr::VariantKind Modifier = // absolute targets have no symbol, so they get VK_None
+ Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+ switch (static_cast<unsigned>(Fixup.getKind())) {
+ default: { // any fixup kind not listed below is a fatal error
+ const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
+ report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
+ }
+ case FK_Data_4: // 4-byte data: the symbol modifier selects the relocation flavour
+ switch (Modifier) {
+ case MCSymbolRefExpr::VK_COFF_IMGREL32:
+ return COFF::IMAGE_REL_ARM_ADDR32NB;
+ case MCSymbolRefExpr::VK_SECREL:
+ return COFF::IMAGE_REL_ARM_SECREL;
+ default:
+ return COFF::IMAGE_REL_ARM_ADDR32;
+ }
+ case FK_SecRel_2:
+ return COFF::IMAGE_REL_ARM_SECTION;
+ case FK_SecRel_4:
+ return COFF::IMAGE_REL_ARM_SECREL;
+ case ARM::fixup_t2_condbranch:
+ return COFF::IMAGE_REL_ARM_BRANCH20T;
+ case ARM::fixup_t2_uncondbranch:
+ return COFF::IMAGE_REL_ARM_BRANCH24T;
+ case ARM::fixup_arm_thumb_bl: // bl and blx share one relocation type
+ case ARM::fixup_arm_thumb_blx:
+ return COFF::IMAGE_REL_ARM_BLX23T;
+ case ARM::fixup_t2_movw_lo16: // both halves of a movw/movt pair map to MOV32T
+ case ARM::fixup_t2_movt_hi16:
+ return COFF::IMAGE_REL_ARM_MOV32T;
+ }
+}
+
+bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { // true for every fixup except Thumb2 movt_hi16
+ return static_cast<unsigned>(Fixup.getKind()) != ARM::fixup_t2_movt_hi16; // NOTE(review): presumably the MOV32T relocation recorded for the movw half covers the pair - confirm
+}
+}
+
+namespace llvm {
+MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS, // Factory for the ARM Windows COFF object writer.
+ bool Is64Bit) {
+ MCWinCOFFObjectTargetWriter *MOTW = new ARMWinCOFFObjectWriter(Is64Bit); // freshly allocated target-specific half
+ return createWinCOFFObjectWriter(MOTW, OS); // handed to the generic COFF writer together with the output stream
+}
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
new file mode 100644
index 000000000000..83fa084e60c7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -0,0 +1,47 @@
+//===-- ARMWinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class ARMWinCOFFStreamer : public MCWinCOFFStreamer { // ARM specialisation of the generic WinCOFF streamer
+public:
+ ARMWinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE,
+ raw_pwrite_stream &OS)
+ : MCWinCOFFStreamer(C, AB, CE, OS) {} // all arguments are forwarded unchanged to the base class
+
+ void EmitAssemblerFlag(MCAssemblerFlag Flag) override; // accepts only MCAF_SyntaxUnified and MCAF_Code16
+ void EmitThumbFunc(MCSymbol *Symbol) override; // marks Symbol as a Thumb function in the assembler
+};
+
+void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { // the two accepted flags are no-ops; anything else is unreachable
+ switch (Flag) {
+ default: llvm_unreachable("not implemented");
+ case MCAF_SyntaxUnified:
+ case MCAF_Code16:
+ break; // nothing to record for these flags
+ }
+}
+
+void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { // records Symbol as a Thumb function
+ getAssembler().setIsThumbFunc(Symbol); // the state lives in the assembler, not in this streamer
+}
+}
+
+MCStreamer *llvm::createARMWinCOFFStreamer( // Factory for the ARM WinCOFF streamer.
+ MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll, bool IncrementalLinkerCompatible) { // RelaxAll is not referenced in this body
+ auto *S = new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS); // Emitter is dereferenced unconditionally, so it must be non-null
+ S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+ return S;
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
new file mode 100644
index 000000000000..744761bcddb8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
@@ -0,0 +1,395 @@
+//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
+// multiply and add / sub instructions) when special VMLx hazards are detected.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mlx-expansion"
+
+static cl::opt<bool>
+ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden); // hidden flag, default false; when set, FindMLxHazard() reports a hazard for every MLx (subject to ExpandLimit)
+static cl::opt<unsigned>
+ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden); // hidden cap compared against NumExpand; default ~0U
+
+STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded"); // incremented once per expansion
+
+namespace {
+ struct MLxExpansion : public MachineFunctionPass { // Pass that splits fp MLA/MLS into a multiply plus add/sub when a hazard is found.
+ static char ID;
+ MLxExpansion() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override {
+ return "ARM MLA / MLS expansion pass";
+ }
+
+ private:
+ const ARMBaseInstrInfo *TII; // the three pointers below are set at the start of runOnMachineFunction
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+ bool isLikeA9; // true for A9-like subtargets and also for Swift (see runOnMachineFunction)
+ bool isSwift;
+ unsigned MIIdx; // next slot to write in LastMIs
+ MachineInstr* LastMIs[4]; // ring buffer of recently visited instructions; nullptr marks an empty slot
+ SmallPtrSet<MachineInstr*, 4> IgnoreStall; // instructions for which FindMLxHazard skips the stall heuristics
+
+ void clearStack();
+ void pushStack(MachineInstr *MI);
+ MachineInstr *getAccDefMI(MachineInstr *MI) const;
+ unsigned getDefReg(MachineInstr *MI) const;
+ bool hasLoopHazard(MachineInstr *MI) const;
+ bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
+ bool FindMLxHazard(MachineInstr *MI);
+ void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+ unsigned MulOpc, unsigned AddSubOpc,
+ bool NegAcc, bool HasLane);
+ bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
+ };
+ char MLxExpansion::ID = 0;
+}
+
+void MLxExpansion::clearStack() { // forget all recently seen instructions
+ std::fill(LastMIs, LastMIs + 4, nullptr); // every slot becomes empty
+ MIIdx = 0; // next push goes to slot 0
+}
+
+void MLxExpansion::pushStack(MachineInstr *MI) { // record MI (may be nullptr) as the most recently seen instruction
+ LastMIs[MIIdx] = MI; // overwrites the oldest entry once the buffer is full
+ if (++MIIdx == 4) // wrap around: the buffer has four slots
+ MIIdx = 0;
+}
+
+MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { // returns nullptr when operand 1 is a physical register
+ // Look past COPY and INSERT_SUBREG instructions to find the
+ // real definition MI. This is important for _sfp instructions.
+ unsigned Reg = MI->getOperand(1).getReg(); // operand 1 is treated as the accumulator input
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return nullptr;
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ while (true) {
+ if (DefMI->getParent() != MBB) // never follow the chain out of MI's block
+ break;
+ if (DefMI->isCopyLike()) {
+ Reg = DefMI->getOperand(1).getReg(); // copy-like: the source is operand 1
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DefMI = MRI->getVRegDef(Reg);
+ continue;
+ }
+ } else if (DefMI->isInsertSubreg()) {
+ Reg = DefMI->getOperand(2).getReg(); // insert_subreg: follow operand 2
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DefMI = MRI->getVRegDef(Reg);
+ continue;
+ }
+ }
+ break; // not copy-like, or the source is not virtual: stop here
+ }
+ return DefMI;
+}
+
+unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { // follows MI's result forward through single-use COPY / INSERT_SUBREG chains
+ unsigned Reg = MI->getOperand(0).getReg(); // operand 0 is the defined register
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ !MRI->hasOneNonDBGUse(Reg)) // only a unique non-debug use can be followed
+ return Reg;
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *UseMI = &*MRI->use_instr_nodbg_begin(Reg);
+ if (UseMI->getParent() != MBB) // stay within MI's block
+ return Reg;
+
+ while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
+ Reg = UseMI->getOperand(0).getReg(); // move on to the register the copy defines
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ !MRI->hasOneNonDBGUse(Reg))
+ return Reg;
+ UseMI = &*MRI->use_instr_nodbg_begin(Reg);
+ if (UseMI->getParent() != MBB)
+ return Reg;
+ }
+
+ return Reg; // last register reached before a non-copy user
+}
+
+/// hasLoopHazard - Check whether an MLx instruction is chained to itself across
+/// a single-MBB loop.
+bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const {
+ unsigned Reg = MI->getOperand(1).getReg(); // operand 1 is treated as the accumulator input
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return false;
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ while (true) {
+outer_continue:
+ if (DefMI->getParent() != MBB) // the chain left MI's block: stop walking
+ break;
+
+ if (DefMI->isPHI()) {
+ for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { // operands are visited as (value, block) pairs from index 1
+ if (DefMI->getOperand(i + 1).getMBB() == MBB) { // incoming value whose block is MBB itself
+ unsigned SrcReg = DefMI->getOperand(i).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ DefMI = MRI->getVRegDef(SrcReg);
+ goto outer_continue; // restart the walk from the new definition
+ }
+ }
+ }
+ } else if (DefMI->isCopyLike()) {
+ Reg = DefMI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DefMI = MRI->getVRegDef(Reg);
+ continue;
+ }
+ } else if (DefMI->isInsertSubreg()) {
+ Reg = DefMI->getOperand(2).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DefMI = MRI->getVRegDef(Reg);
+ continue;
+ }
+ }
+
+ break; // nothing further to follow
+ }
+
+ return DefMI == MI; // hazard iff the walk arrived back at MI itself
+}
+
+bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { // true if MI is a VFP/NEON-domain instruction that reads Reg
+ // FIXME: Detect integer instructions properly.
+ const MCInstrDesc &MCID = MI->getDesc();
+ unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+ if (MI->mayStore()) // instructions that may store are never reported
+ return false;
+ unsigned Opcode = MCID.getOpcode();
+ if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) // these two opcodes are excluded explicitly
+ return false;
+ if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+ return MI->readsRegister(Reg, TRI);
+ return false; // other domains are not considered
+}
+
+static bool isFpMulInstruction(unsigned Opcode) { // true only for the six VMUL opcodes listed below
+ switch (Opcode) {
+ case ARM::VMULS:
+ case ARM::VMULfd:
+ case ARM::VMULfq:
+ case ARM::VMULD:
+ case ARM::VMULslfd:
+ case ARM::VMULslfq:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool MLxExpansion::FindMLxHazard(MachineInstr *MI) { // returns true when MI should be expanded
+ if (NumExpand >= ExpandLimit) // expansion budget exhausted
+ return false;
+
+ if (ForceExapnd) // command-line override: expand everything
+ return true;
+
+ MachineInstr *DefMI = getAccDefMI(MI); // NOTE(review): getAccDefMI returns nullptr for a physical accumulator register, yet DefMI is dereferenced unchecked below - confirm this cannot happen here
+ if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
+ // r0 = vmla
+ // r3 = vmla r0, r1, r2
+ // takes 16 - 17 cycles
+ //
+ // r0 = vmla
+ // r4 = vmul r1, r2
+ // r3 = vadd r0, r4
+ // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
+ IgnoreStall.insert(DefMI); // the defining MLx will skip the stall heuristics when it is visited later
+ return true;
+ }
+
+ // On Swift, we mostly care about hazards from multiplication instructions
+ // writing the accumulator and the pipelining of loop iterations by out-of-
+ // order execution.
+ if (isSwift)
+ return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI);
+
+ if (IgnoreStall.count(MI))
+ return false;
+
+ // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
+ // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
+ // preserves the in-order retirement of the instructions.
+ // Look at the next few instructions, if *most* of them can cause hazards,
+ // then the scheduler can't *fix* this, we'd better break up the VMLA.
+ unsigned Limit1 = isLikeA9 ? 1 : 4; // how far ahead a stall-causing instruction counts
+ unsigned Limit2 = isLikeA9 ? 1 : 4; // how far ahead a RAW reader counts (currently the same value as Limit1)
+ for (unsigned i = 1; i <= 4; ++i) {
+ int Idx = ((int)MIIdx - i + 4) % 4; // i == 1 is the most recent push; the block is scanned bottom-up, so that is the instruction after MI
+ MachineInstr *NextMI = LastMIs[Idx];
+ if (!NextMI) // empty slot
+ continue;
+
+ if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
+ if (i <= Limit1)
+ return true;
+ }
+
+ // Look for VMLx RAW hazard.
+ if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
+ return true;
+ }
+
+ return false;
+}
+
+/// ExpandFPMLxInstruction - Expand a MLA / MLS instruction into a pair
+/// of MUL + ADD / SUB instructions.
+void
+MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+ unsigned MulOpc, unsigned AddSubOpc,
+ bool NegAcc, bool HasLane) {
+ unsigned DstReg = MI->getOperand(0).getReg(); // operand layout read here: dst, acc, src1, src2, [lane], pred, predreg
+ bool DstDead = MI->getOperand(0).isDead();
+ unsigned AccReg = MI->getOperand(1).getReg();
+ unsigned Src1Reg = MI->getOperand(2).getReg();
+ unsigned Src2Reg = MI->getOperand(3).getReg();
+ bool Src1Kill = MI->getOperand(2).isKill();
+ bool Src2Kill = MI->getOperand(3).isKill();
+ unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0; // lane immediate exists only for the by-lane forms
+ unsigned NextOp = HasLane ? 5 : 4; // index of the predicate operand
+ ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
+ unsigned PredReg = MI->getOperand(++NextOp).getReg(); // predicate register follows the condition code
+
+ const MCInstrDesc &MCID1 = TII->get(MulOpc);
+ const MCInstrDesc &MCID2 = TII->get(AddSubOpc);
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ unsigned TmpReg = MRI->createVirtualRegister( // fresh vreg for the product, in the class of the multiply's operand 0
+ TII->getRegClass(MCID1, 0, TRI, MF));
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) // multiply, inserted before MI
+ .addReg(Src1Reg, getKillRegState(Src1Kill))
+ .addReg(Src2Reg, getKillRegState(Src2Kill));
+ if (HasLane)
+ MIB.addImm(LaneImm);
+ MIB.addImm(Pred).addReg(PredReg); // same predicate as the original instruction
+
+ MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID2) // add/sub, also inserted before MI, defines the original destination
+ .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead));
+
+ if (NegAcc) { // product is the first source operand, accumulator the second
+ bool AccKill = MRI->hasOneNonDBGUse(AccReg);
+ MIB.addReg(TmpReg, getKillRegState(true))
+ .addReg(AccReg, getKillRegState(AccKill));
+ } else { // accumulator first (added without a kill flag), then the product
+ MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true));
+ }
+ MIB.addImm(Pred).addReg(PredReg);
+
+ DEBUG({
+ dbgs() << "Expanding: " << *MI;
+ dbgs() << " to:\n";
+ MachineBasicBlock::iterator MII = MI;
+ MII = std::prev(MII);
+ MachineInstr &MI2 = *MII;
+ MII = std::prev(MII);
+ MachineInstr &MI1 = *MII;
+ dbgs() << " " << MI1;
+ dbgs() << " " << MI2;
+ });
+
+ MI->eraseFromParent(); // the original MLx is removed once both replacements exist
+ ++NumExpand;
+}
+
+bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { // returns true if any instruction in MBB was expanded
+ bool Changed = false;
+
+ clearStack(); // per-block state starts empty
+ IgnoreStall.clear();
+
+ unsigned Skip = 0; // count of consecutive general-domain instructions seen
+ MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); // the block is walked bottom-up
+ while (MII != E) {
+ MachineInstr *MI = &*MII++; // advance before MI can be erased by an expansion
+
+ if (MI->isPosition() || MI->isImplicitDef() || MI->isCopy()) // these are ignored entirely
+ continue;
+
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (MI->isBarrier()) { // a barrier discards the recorded history
+ clearStack();
+ Skip = 0;
+ continue;
+ }
+
+ unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+ if (Domain == ARMII::DomainGeneral) {
+ if (++Skip == 2)
+ // Assume dual issues of non-VFP / NEON instructions.
+ pushStack(nullptr); // NOTE(review): Skip is not reset here, so only the second instruction of a general-domain run pushes a slot - confirm intent
+ } else {
+ Skip = 0;
+
+ unsigned MulOpc, AddSubOpc; // filled in by isFpMLxInstruction when it returns true
+ bool NegAcc, HasLane;
+ if (!TII->isFpMLxInstruction(MCID.getOpcode(),
+ MulOpc, AddSubOpc, NegAcc, HasLane) ||
+ !FindMLxHazard(MI))
+ pushStack(MI); // not an MLx, or no hazard: just remember it
+ else {
+ ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane); // MI is erased; the replacements are not pushed
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { // pass entry point; returns true if Fn was modified
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
+ TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo()); // cache per-function target info
+ TRI = Fn.getSubtarget().getRegisterInfo();
+ MRI = &Fn.getRegInfo();
+ const ARMSubtarget *STI = &Fn.getSubtarget<ARMSubtarget>();
+ if (!STI->expandMLx()) // the subtarget does not ask for MLx expansion
+ return false;
+ isLikeA9 = STI->isLikeA9() || STI->isSwift(); // Swift is also treated as A9-like for the stall limits
+ isSwift = STI->isSwift();
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : Fn) // each block is processed independently
+ Modified |= ExpandFPMLxInstructions(MBB);
+
+ return Modified;
+}
+
+FunctionPass *llvm::createMLxExpansionPass() { // factory: returns a newly allocated MLxExpansion pass
+ return new MLxExpansion();
+}
diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
new file mode 100644
index 000000000000..caa69f8d71b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -0,0 +1,41 @@
+//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheARMLETarget() { // accessor for the single little-endian ARM Target object
+ static Target TheARMLETarget; // function-local static: created on first call, same object returned every time
+ return TheARMLETarget;
+}
+Target &llvm::getTheARMBETarget() { // accessor for the single big-endian ARM Target object
+ static Target TheARMBETarget; // function-local static: created on first call, same object returned every time
+ return TheARMBETarget;
+}
+Target &llvm::getTheThumbLETarget() { // accessor for the single little-endian Thumb Target object
+ static Target TheThumbLETarget; // function-local static: created on first call, same object returned every time
+ return TheThumbLETarget;
+}
+Target &llvm::getTheThumbBETarget() { // accessor for the single big-endian Thumb Target object
+ static Target TheThumbBETarget; // function-local static: created on first call, same object returned every time
+ return TheThumbBETarget;
+}
+
+extern "C" void LLVMInitializeARMTargetInfo() { // C-linkage entry point: one RegisterTarget object per ARM/Thumb endianness variant
+ RegisterTarget<Triple::arm, /*HasJIT=*/true> X(getTheARMLETarget(), "arm", // arguments: Target singleton, short name, description
+ "ARM");
+ RegisterTarget<Triple::armeb, /*HasJIT=*/true> Y(getTheARMBETarget(), "armeb",
+ "ARM (big endian)");
+
+ RegisterTarget<Triple::thumb, /*HasJIT=*/true> A(getTheThumbLETarget(), // all four variants are flagged HasJIT
+ "thumb", "Thumb");
+ RegisterTarget<Triple::thumbeb, /*HasJIT=*/true> B(
+ getTheThumbBETarget(), "thumbeb", "Thumb (big endian)");
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
new file mode 100644
index 000000000000..9953c61cd89c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -0,0 +1,884 @@
+//===-- Thumb1FrameLowering.cpp - Thumb1 Frame Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb1 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1FrameLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+Thumb1FrameLowering::Thumb1FrameLowering(const ARMSubtarget &sti)
+ : ARMFrameLowering(sti) {} // Only forwards the subtarget to the ARMFrameLowering base constructor.
+
+/// Decides whether the outgoing call frame is folded into the fixed stack
+/// frame. That is only done when the largest call frame is small and the
+/// function has no variable sized objects.
+bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+  // ARM (especially Thumb) has only small immediate offsets for addressing the
+  // stack frame, so a large reserved call frame can cause poor codegen and may
+  // even make it impossible to scavenge a register. The limit is half of
+  // imm8 * 4.
+  const unsigned ReserveLimit = ((1 << 8) - 1) * 4 / 2;
+  const bool CallFrameIsSmall = FrameInfo.getMaxCallFrameSize() < ReserveLimit;
+
+  return CallFrameIsSmall && !FrameInfo.hasVarSizedObjects();
+}
+
+static void emitSPUpdate(MachineBasicBlock &MBB, // Thin wrapper: requests SP = SP + NumBytes at MBBI (prologue callers pass a negative NumBytes).
+ MachineBasicBlock::iterator &MBBI,
+ const TargetInstrInfo &TII, const DebugLoc &dl,
+ const ThumbRegisterInfo &MRI, int NumBytes,
+ unsigned MIFlags = MachineInstr::NoFlags) {
+ emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, // destination and base register are both SP
+ MRI, MIFlags);
+}
+
+
+/// Removes a call-frame setup/destroy pseudo instruction. When the call frame
+/// is not reserved, the pseudo is first replaced by an explicit SP adjustment
+/// of the (alignment-rounded) amount carried in its first operand.
+///
+/// \returns the iterator produced by erasing \p I from \p MBB.
+MachineBasicBlock::iterator Thumb1FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const Thumb1InstrInfo &TII =
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+
+  // A reserved call frame needs no SP adjustment around the call.
+  if (hasReservedCallFrame(MF))
+    return MBB.erase(I);
+
+  MachineInstr &Pseudo = *I;
+  DebugLoc dl = Pseudo.getDebugLoc();
+  unsigned Amount = Pseudo.getOperand(0).getImm();
+  if (Amount == 0)
+    return MBB.erase(I);
+
+  // Keep the stack properly aligned: round the space needed for the outgoing
+  // arguments up to the next alignment boundary.
+  unsigned Align = getStackAlignment();
+  Amount = (Amount + Align - 1) / Align * Align;
+
+  // ADJCALLSTACKDOWN -> sub sp, sp, amount
+  // ADJCALLSTACKUP   -> add sp, sp, amount
+  unsigned Opc = Pseudo.getOpcode();
+  bool IsStackDown =
+      Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN;
+  if (IsStackDown) {
+    emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
+    return MBB.erase(I);
+  }
+
+  assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+  emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
+  return MBB.erase(I);
+}
+
+/// Emits the Thumb1 prologue at the start of \p MBB.
+///
+/// The prologue allocates the argument-register save area and the local
+/// frame, records the sizes and offsets of the callee-save spill areas in
+/// ARMFunctionInfo, sets up the frame pointer and base pointer when they are
+/// needed, and emits a CFI instruction after every step that changes the CFA
+/// or saves a register.
+void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+  const Thumb1InstrInfo &TII =
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  unsigned NumBytes = MFI.getStackSize();
+  assert(NumBytes >= ArgRegsSaveSize &&
+         "ArgRegsSaveSize is included in NumBytes");
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc dl;
+
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned BasePtr = RegInfo->getBaseRegister();
+  int CFAOffset = 0;
+
+  // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4,
+  // so round the frame size up to a multiple of 4.
+  NumBytes = (NumBytes + 3) & ~3;
+  MFI.setStackSize(NumBytes);
+
+  // Determine the size of each callee-save spill area and record which frame
+  // index holds the spilled frame pointer.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  // Allocate the argument-register save area first.
+  if (ArgRegsSaveSize) {
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
+                 MachineInstr::FrameSetup);
+    CFAOffset -= ArgRegsSaveSize;
+    unsigned CFIIndex = MF.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // Without a stack frame there are no callee saves to account for: allocate
+  // whatever is left in a single SP update and stop.
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes - ArgRegsSaveSize != 0) {
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize),
+                   MachineInstr::FrameSetup);
+      CFAOffset -= NumBytes - ArgRegsSaveSize;
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    return;
+  }
+
+  // Sort every callee-saved register into one of the three spill areas.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (STI.splitFramePushPop(MF)) {
+        GPRCS2Size += 4;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      GPRCS1Size += 4;
+      break;
+    default:
+      DPRCSSize += 8;
+    }
+  }
+
+  // Step over the first push, if the block starts with one.
+  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+    ++MBBI;
+  }
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset =
+      NumBytes - ArgRegsSaveSize - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  bool HasFP = hasFP(MF);
+  if (HasFP)
+    AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) +
+                                NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+  // From here on NumBytes is the part of the frame below the callee saves.
+  NumBytes = DPRCSOffset;
+
+  // Try to fold the remaining SP update into the push preceding MBBI. If that
+  // works, nothing is left to allocate separately.
+  int FramePtrOffsetInBlock = 0;
+  unsigned adjustedGPRCS1Size = GPRCS1Size;
+  if (GPRCS1Size > 0 && GPRCS2Size == 0 &&
+      tryFoldSPUpdateIntoPushPop(STI, MF, &*std::prev(MBBI), NumBytes)) {
+    FramePtrOffsetInBlock = NumBytes;
+    adjustedGPRCS1Size += NumBytes;
+    NumBytes = 0;
+  }
+
+  if (adjustedGPRCS1Size) {
+    CFAOffset -= adjustedGPRCS1Size;
+    unsigned CFIIndex = MF.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // Emit call frame information for the registers saved by the first push.
+  // High registers are skipped here when pushes are split; they are described
+  // further down.
+  for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+                                                    E = CSI.end();
+       I != E; ++I) {
+    unsigned Reg = I->getReg();
+    int FI = I->getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+    case ARM::R12:
+      if (STI.splitFramePushPop(MF))
+        break;
+      LLVM_FALLTHROUGH;
+    case ARM::R0:
+    case ARM::R1:
+    case ARM::R2:
+    case ARM::R3:
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR: {
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    }
+    }
+  }
+
+  // Adjust FP so it points to the stack slot that contains the previous FP.
+  if (HasFP) {
+    FramePtrOffsetInBlock +=
+        MFI.getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize;
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+                       .addReg(ARM::SP)
+                       .addImm(FramePtrOffsetInBlock / 4)
+                       .setMIFlags(MachineInstr::FrameSetup));
+    if (FramePtrOffsetInBlock) {
+      CFAOffset += FramePtrOffsetInBlock;
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+          nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    } else {
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+              nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    if (NumBytes > 508)
+      // If offset is > 508 then sp cannot be adjusted in a single instruction,
+      // try restoring from fp instead.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  // Skip past the spilling of r8-r11, which could consist of multiple tPUSH
+  // and tMOVr instructions. We don't need to add any call frame information
+  // in-between these instructions, because they do not modify the high
+  // registers.
+  while (true) {
+    MachineBasicBlock::iterator OldMBBI = MBBI;
+    // Skip a run of tMOVr instructions
+    while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tMOVr)
+      MBBI++;
+    if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+      MBBI++;
+    } else {
+      // We have reached an instruction which is not a push, so the previous
+      // run of tMOVr instructions (which may have been empty) was not part of
+      // the prologue. Reset MBBI back to the last PUSH of the prologue.
+      MBBI = OldMBBI;
+      break;
+    }
+  }
+
+  // Emit call frame information for the callee-saved high registers.
+  for (auto &I : CSI) {
+    unsigned Reg = I.getReg();
+    int FI = I.getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+    case ARM::R12: {
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
+  if (NumBytes) {
+    // Insert it after all the callee-save spills.
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                 MachineInstr::FrameSetup);
+    // With a frame pointer the CFA is already defined relative to FP, so only
+    // the FP-less case needs a new CFA offset.
+    if (!HasFP) {
+      CFAOffset -= NumBytes;
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (STI.isTargetELF() && HasFP)
+    MFI.setOffsetAdjustment(MFI.getOffsetAdjustment() -
+                            AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // Thumb1 does not currently support dynamic stack realignment. Report a
+  // fatal error rather than silently generate bad code.
+  if (RegInfo->needsStackRealignment(MF))
+    report_fatal_error("Dynamic stack realignment not supported for thumb1.");
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF))
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr)
+                       .addReg(ARM::SP));
+
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI.hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
+
+/// Returns true when \p MI looks like part of the callee-save restore
+/// sequence: a tLDRspi from a frame index into a callee-saved register, any
+/// tPOP, or a tMOVr from a low register (or LR) into a high register.
+static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) {
+  switch (MI.getOpcode()) {
+  case ARM::tLDRspi:
+    return MI.getOperand(1).isFI() &&
+           isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs);
+  case ARM::tPOP:
+    return true;
+  case ARM::tMOVr: {
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = MI.getOperand(1).getReg();
+    bool SrcIsLowOrLR = ARM::tGPRRegClass.contains(SrcReg) || SrcReg == ARM::LR;
+    return SrcIsLowOrLR && ARM::hGPRRegClass.contains(DstReg);
+  }
+  default:
+    return false;
+  }
+}
+
+void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const { // Restores SP ahead of the callee-save restores in MBB, then applies the pop fix-up when needPopSpecialFixUp says so.
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // no terminator: use an unknown location
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ const ThumbRegisterInfo *RegInfo =
+ static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+ const Thumb1InstrInfo &TII =
+ *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+
+ unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ int NumBytes = (int)MFI.getStackSize();
+ assert((unsigned)NumBytes >= ArgRegsSaveSize &&
+ "ArgRegsSaveSize is included in NumBytes");
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+ if (!AFI->hasStackFrame()) { // no callee saves: release everything except the argument-register save area in one SP update
+ if (NumBytes - ArgRegsSaveSize != 0)
+ emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize);
+ } else {
+ // Step MBBI back over the callee-save restores (see isCSRestore) so that it points at the first one.
+ if (MBBI != MBB.begin()) {
+ do
+ --MBBI;
+ while (MBBI != MBB.begin() && isCSRestore(*MBBI, CSRegs));
+ if (!isCSRestore(*MBBI, CSRegs)) // stepped one instruction too far
+ ++MBBI;
+ }
+
+ // Move SP to the start of the callee-save spill areas: what remains in NumBytes is the frame below them.
+ NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+ AFI->getGPRCalleeSavedArea2Size() +
+ AFI->getDPRCalleeSavedAreaSize() +
+ ArgRegsSaveSize);
+
+ if (AFI->shouldRestoreSPFromFP()) {
+ NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+ // Reset SP based on frame pointer only if the stack frame extends beyond
+ // frame pointer stack slot, the target is ELF and the function has FP, or
+ // the target uses var sized objects.
+ if (NumBytes) {
+ assert(!MFI.getPristineRegs(MF).test(ARM::R4) &&
+ "No scratch register to restore SP from FP!");
+ emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, // R4 = FramePtr - NumBytes, used as scratch
+ TII, *RegInfo);
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+ ARM::SP)
+ .addReg(ARM::R4));
+ } else
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), // SP = FramePtr, no offset needed
+ ARM::SP)
+ .addReg(FramePtr));
+ } else {
+ if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET && // return directly preceded by a tPOP: fold into / insert before that pop
+ &MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
+ MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
+ if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes))
+ emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+ } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
+ emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+ }
+ }
+
+ if (needPopSpecialFixUp(MF)) {
+ bool Done = emitPopSpecialFixUp(MBB, /* DoIt */ true);
+ (void)Done; // only read by the assert below
+ assert(Done && "Emission of the special fixup failed!?");
+ }
+}
+
+/// Reports whether \p MBB can host the epilogue. Blocks of functions that need
+/// no pop fix-up always qualify; otherwise the answer comes from calling
+/// emitPopSpecialFixUp with DoIt == false.
+bool Thumb1FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+  const MachineFunction &MF = *MBB.getParent();
+  if (needPopSpecialFixUp(MF)) {
+    // emitPopSpecialFixUp takes a non-const block.
+    MachineBasicBlock &MutableMBB = const_cast<MachineBasicBlock &>(MBB);
+    return emitPopSpecialFixUp(MutableMBB, /* DoIt */ false);
+  }
+  return true;
+}
+
+/// Returns true when the epilogue of \p MF needs the special pop fix-up: the
+/// function has an argument-register save area, or LR is among the
+/// callee-saved registers.
+bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const {
+  ARMFunctionInfo *FuncInfo =
+      const_cast<MachineFunction *>(&MF)->getInfo<ARMFunctionInfo>();
+  bool NeedsFixUp = FuncInfo->getArgRegsSaveSize() != 0;
+
+  // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up.
+  const std::vector<CalleeSavedInfo> &SavedRegs =
+      MF.getFrameInfo().getCalleeSavedInfo();
+  for (auto It = SavedRegs.begin(), End = SavedRegs.end();
+       !NeedsFixUp && It != End; ++It)
+    NeedsFixUp = It->getReg() == ARM::LR;
+
+  return NeedsFixUp;
+}
+
+/// Emits (or, for a dry run, only checks the feasibility of) the sequence that
+/// restores LR and releases the argument-register save area at the end of
+/// \p MBB.
+///
+/// \param MBB  block that receives the epilogue.
+/// \param DoIt when false, nothing is inserted into \p MBB; the function only
+///             reports whether the fix-up could be emitted.
+/// \returns true when the fix-up was emitted (or can be emitted), false when
+///          a dry run finds no register to pop LR into.
+bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
+                                              bool DoIt) const {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+
+  // If MBBI is a return instruction, or is a tPOP followed by a return
+  // instruction in the successor BB, we may be able to directly restore
+  // LR in the PC.
+  // This is only possible with v5T ops (v4T can't change the Thumb bit via
+  // a POP PC instruction), and only if we do not need to emit any SP update.
+  // Otherwise, we need a temporary register to pop the value
+  // and copy that value into LR.
+  auto MBBI = MBB.getFirstTerminator();
+  bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize;
+  if (CanRestoreDirectly) {
+    if (MBBI != MBB.end() && MBBI->getOpcode() != ARM::tB)
+      CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET ||
+                            MBBI->getOpcode() == ARM::tPOP_RET);
+    else {
+      auto MBBI_prev = MBBI;
+      MBBI_prev--;
+      assert(MBBI_prev->getOpcode() == ARM::tPOP);
+      assert(MBB.succ_size() == 1);
+      if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET)
+        MBBI = MBBI_prev; // Replace the final tPOP with a tPOP_RET.
+      else
+        CanRestoreDirectly = false;
+    }
+  }
+
+  if (CanRestoreDirectly) {
+    if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET)
+      return true;
+    MachineInstrBuilder MIB =
+        AddDefaultPred(
+            BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)));
+    // Copy implicit ops and popped registers, if any.
+    for (auto MO: MBBI->operands())
+      if (MO.isReg() && (MO.isImplicit() || MO.isDef()))
+        MIB.addOperand(MO);
+    MIB.addReg(ARM::PC, RegState::Define);
+    // Erase the old instruction (tBX_RET or tPOP).
+    MBB.erase(MBBI);
+    return true;
+  }
+
+  // Look for a temporary register to use.
+  // First, compute the liveness information.
+  LivePhysRegs UsedRegs(STI.getRegisterInfo());
+  UsedRegs.addLiveOuts(MBB);
+  // The semantic of pristines changed recently and now,
+  // the callee-saved registers that are touched in the function
+  // are not part of the pristines set anymore.
+  // Add those callee-saved now.
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    UsedRegs.addReg(CSRegs[i]);
+
+  DebugLoc dl = DebugLoc();
+  if (MBBI != MBB.end()) {
+    dl = MBBI->getDebugLoc();
+    auto InstUpToMBBI = MBB.end();
+    while (InstUpToMBBI != MBBI)
+      // The pre-decrement is on purpose here.
+      // We want to have the liveness right before MBBI.
+      UsedRegs.stepBackward(*--InstUpToMBBI);
+  }
+
+  // Look for a register that can be directly used in the POP.
+  unsigned PopReg = 0;
+  // And some temporary register, just in case.
+  unsigned TemporaryReg = 0;
+  BitVector PopFriendly =
+      TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID));
+  assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
+  // Rebuild the GPRs from the high registers because they are removed
+  // from the GPR reg class for thumb1.
+  BitVector GPRsNoLRSP =
+      TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID));
+  GPRsNoLRSP |= PopFriendly;
+  GPRsNoLRSP.reset(ARM::LR);
+  GPRsNoLRSP.reset(ARM::SP);
+  GPRsNoLRSP.reset(ARM::PC);
+  for (int Register = GPRsNoLRSP.find_first(); Register != -1;
+       Register = GPRsNoLRSP.find_next(Register)) {
+    if (!UsedRegs.contains(Register)) {
+      // Remember the first pop-friendly register and exit.
+      if (PopFriendly.test(Register)) {
+        PopReg = Register;
+        TemporaryReg = 0;
+        break;
+      }
+      // Otherwise, remember that the register will be available to
+      // save a pop-friendly register.
+      TemporaryReg = Register;
+    }
+  }
+
+  // A dry run only answers "is a register available?". It must return here,
+  // before any instruction is inserted: canUseAsEpilogue calls this on a block
+  // it received as const.
+  if (!DoIt)
+    return PopReg || TemporaryReg;
+
+  assert((PopReg || TemporaryReg) && "Cannot get LR");
+
+  // No free pop-friendly register: park the first pop-friendly register in
+  // the free temporary and use it for the pop; it is restored at the end.
+  if (TemporaryReg) {
+    assert(!PopReg && "Unnecessary MOV is about to be inserted");
+    PopReg = PopFriendly.find_first();
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+                       .addReg(TemporaryReg, RegState::Define)
+                       .addReg(PopReg, RegState::Kill));
+  }
+
+  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) {
+    // We couldn't use the direct restoration above, so
+    // perform the opposite conversion: tPOP_RET to tPOP.
+    MachineInstrBuilder MIB =
+        AddDefaultPred(
+            BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP)));
+    bool Popped = false;
+    for (auto MO: MBBI->operands())
+      if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
+          MO.getReg() != ARM::PC) {
+        MIB.addOperand(MO);
+        if (!MO.isImplicit())
+          Popped = true;
+      }
+    // Is there anything left to pop?
+    if (!Popped)
+      MBB.erase(MIB.getInstr());
+    // Erase the old instruction.
+    MBB.erase(MBBI);
+    MBBI = AddDefaultPred(BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET)));
+  }
+
+  // Pop the saved LR value into PopReg, release the argument-register save
+  // area, then move the value into LR.
+  assert(PopReg && "Do not know how to get LR");
+  AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+      .addReg(PopReg, RegState::Define);
+
+  emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+  AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+                     .addReg(ARM::LR, RegState::Define)
+                     .addReg(PopReg, RegState::Kill));
+
+  // Give the borrowed pop-friendly register its original value back.
+  if (TemporaryReg)
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+                       .addReg(PopReg, RegState::Define)
+                       .addReg(TemporaryReg, RegState::Kill));
+
+  return true;
+}
+
+// Scan the ordered register list starting at CurrentReg and return a pointer
+// to the first entry that is a member of EnabledRegs, or OrderEnd if no such
+// entry remains. The scan tests CurrentReg itself before advancing, so
+// CurrentReg is returned when it is already in EnabledRegs.
+template <unsigned SetSize>
+static const unsigned *
+findNextOrderedReg(const unsigned *CurrentReg,
+                   SmallSet<unsigned, SetSize> &EnabledRegs,
+                   const unsigned *OrderEnd) {
+  for (; CurrentReg != OrderEnd; ++CurrentReg)
+    if (EnabledRegs.count(*CurrentReg))
+      break;
+  return CurrentReg;
+}
+
+/// Inserts the callee-save spill code before \p MI in \p MBB.
+///
+/// Low registers and LR are pushed with a single tPUSH. High registers
+/// (r8-r11) are first copied into available low registers with tMOVr and then
+/// pushed, using as many MOV/PUSH groups as needed.
+///
+/// \returns false when \p CSI is empty (nothing was emitted), true otherwise.
+bool Thumb1FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+
+  SmallSet<unsigned, 9> LoRegsToSave; // r0-r7, lr
+  SmallSet<unsigned, 4> HiRegsToSave; // r8-r11
+  SmallSet<unsigned, 9> CopyRegs;     // Registers which can be used after
+                                      // pushing LoRegs for saving HiRegs.
+
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+
+    if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) {
+      LoRegsToSave.insert(Reg);
+    } else if (ARM::hGPRRegClass.contains(Reg) && Reg != ARM::LR) {
+      HiRegsToSave.insert(Reg);
+    } else {
+      llvm_unreachable("callee-saved register of unexpected class");
+    }
+
+    // A saved low register (or LR) that is not live-in and is not the frame
+    // pointer is free to carry a high register once it has been pushed.
+    if ((ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) &&
+        !MF.getRegInfo().isLiveIn(Reg) &&
+        !(hasFP(MF) && Reg == RegInfo->getFrameRegister(MF)))
+      CopyRegs.insert(Reg);
+  }
+
+  // Unused argument registers can be used for the high register saving.
+  for (unsigned ArgReg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3})
+    if (!MF.getRegInfo().isLiveIn(ArgReg))
+      CopyRegs.insert(ArgReg);
+
+  // Push the low registers and lr
+  if (!LoRegsToSave.empty()) {
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
+    AddDefaultPred(MIB);
+    for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) {
+      if (LoRegsToSave.count(Reg)) {
+        bool isKill = !MF.getRegInfo().isLiveIn(Reg);
+        if (isKill)
+          MBB.addLiveIn(Reg);
+
+        MIB.addReg(Reg, getKillRegState(isKill));
+      }
+    }
+    MIB.setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // Push the high registers. There are no store instructions that can access
+  // these registers directly, so we have to move them to low registers, and
+  // push them. This might take multiple pushes, as it is possible for there to
+  // be fewer low registers available than high registers which need saving.
+
+  // These are in reverse order so that in the case where we need to use
+  // multiple PUSH instructions, the order of the registers on the stack still
+  // matches the unwind info. They need to be switched back to ascending order
+  // before adding to the PUSH instruction.
+  static const unsigned AllCopyRegs[] = {ARM::LR, ARM::R7, ARM::R6,
+                                         ARM::R5, ARM::R4, ARM::R3,
+                                         ARM::R2, ARM::R1, ARM::R0};
+  static const unsigned AllHighRegs[] = {ARM::R11, ARM::R10, ARM::R9, ARM::R8};
+
+  const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs);
+  const unsigned *AllHighRegsEnd = std::end(AllHighRegs);
+
+  // Find the first register to save.
+  const unsigned *HiRegToSave = findNextOrderedReg(
+      std::begin(AllHighRegs), HiRegsToSave, AllHighRegsEnd);
+
+  while (HiRegToSave != AllHighRegsEnd) {
+    // Without a copy register the inner loop could not make progress and this
+    // loop would never terminate.
+    assert(!CopyRegs.empty() && "No low register to save high registers");
+    // Find the first low register to use.
+    const unsigned *CopyReg =
+        findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd);
+
+    // Create the PUSH, but don't insert it yet (the MOVs need to come first).
+    MachineInstrBuilder PushMIB = BuildMI(MF, DL, TII.get(ARM::tPUSH));
+    AddDefaultPred(PushMIB);
+
+    SmallVector<unsigned, 4> RegsToPush;
+    while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
+      if (HiRegsToSave.count(*HiRegToSave)) {
+        bool isKill = !MF.getRegInfo().isLiveIn(*HiRegToSave);
+        if (isKill)
+          MBB.addLiveIn(*HiRegToSave);
+
+        // Emit a MOV from the high reg to the low reg.
+        MachineInstrBuilder MIB =
+            BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr));
+        MIB.addReg(*CopyReg, RegState::Define);
+        MIB.addReg(*HiRegToSave, getKillRegState(isKill));
+        AddDefaultPred(MIB);
+
+        // Record the register that must be added to the PUSH.
+        RegsToPush.push_back(*CopyReg);
+
+        CopyReg = findNextOrderedReg(++CopyReg, CopyRegs, AllCopyRegsEnd);
+        HiRegToSave =
+            findNextOrderedReg(++HiRegToSave, HiRegsToSave, AllHighRegsEnd);
+      }
+    }
+
+    // Add the low registers to the PUSH, in ascending order.
+    for (unsigned Reg : reverse(RegsToPush))
+      PushMIB.addReg(Reg, RegState::Kill);
+
+    // Insert the PUSH instruction after the MOVs.
+    MBB.insert(MI, PushMIB);
+  }
+
+  return true;
+}
+
+/// Inserts the callee-save restore code before \p MI in \p MBB.
+///
+/// High registers (r8-r11) are restored first: each group is popped into
+/// available low registers and moved into place with tMOVr. The low registers
+/// are then restored with a final tPOP. When LR can be popped straight into
+/// PC, that pop becomes a tPOP_RET and replaces the instruction at \p MI.
+///
+/// \returns false when \p CSI is empty (nothing was emitted), true otherwise.
+bool Thumb1FrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+
+  bool isVarArg = AFI->getArgRegsSaveSize() > 0;
+  DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+
+  SmallSet<unsigned, 9> LoRegsToRestore;
+  SmallSet<unsigned, 4> HiRegsToRestore;
+  // Low registers (r0-r7) which can be used to restore the high registers.
+  SmallSet<unsigned, 9> CopyRegs;
+
+  for (const CalleeSavedInfo &I : CSI) {
+    unsigned Reg = I.getReg();
+
+    if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) {
+      LoRegsToRestore.insert(Reg);
+    } else if (ARM::hGPRRegClass.contains(Reg) && Reg != ARM::LR) {
+      HiRegsToRestore.insert(Reg);
+    } else {
+      llvm_unreachable("callee-saved register of unexpected class");
+    }
+
+    // If this is a low register not used as the frame pointer, we may want to
+    // use it for restoring the high registers.
+    if ((ARM::tGPRRegClass.contains(Reg)) &&
+        !(hasFP(MF) && Reg == RegInfo->getFrameRegister(MF)))
+      CopyRegs.insert(Reg);
+  }
+
+  // If this is a return block, we may be able to use some unused return value
+  // registers for restoring the high regs.
+  auto Terminator = MBB.getFirstTerminator();
+  if (Terminator != MBB.end() && Terminator->getOpcode() == ARM::tBX_RET) {
+    CopyRegs.insert(ARM::R0);
+    CopyRegs.insert(ARM::R1);
+    CopyRegs.insert(ARM::R2);
+    CopyRegs.insert(ARM::R3);
+    // Registers that appear as implicit operands of the return are in use.
+    for (const MachineOperand &Op : Terminator->implicit_operands()) {
+      if (Op.isReg())
+        CopyRegs.erase(Op.getReg());
+    }
+  }
+
+  static const unsigned AllCopyRegs[] = {ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+                                         ARM::R4, ARM::R5, ARM::R6, ARM::R7};
+  static const unsigned AllHighRegs[] = {ARM::R8, ARM::R9, ARM::R10, ARM::R11};
+
+  const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs);
+  const unsigned *AllHighRegsEnd = std::end(AllHighRegs);
+
+  // Find the first register to restore.
+  auto HiRegToRestore = findNextOrderedReg(std::begin(AllHighRegs),
+                                           HiRegsToRestore, AllHighRegsEnd);
+
+  while (HiRegToRestore != AllHighRegsEnd) {
+    assert(!CopyRegs.empty());
+    // Find the first low register to use.
+    auto CopyReg =
+        findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd);
+
+    // Create the POP instruction.
+    MachineInstrBuilder PopMIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPOP));
+    AddDefaultPred(PopMIB);
+
+    while (HiRegToRestore != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
+      // Add the low register to the POP.
+      PopMIB.addReg(*CopyReg, RegState::Define);
+
+      // Create the MOV from low to high register.
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr));
+      MIB.addReg(*HiRegToRestore, RegState::Define);
+      MIB.addReg(*CopyReg, RegState::Kill);
+      AddDefaultPred(MIB);
+
+      CopyReg = findNextOrderedReg(++CopyReg, CopyRegs, AllCopyRegsEnd);
+      HiRegToRestore =
+          findNextOrderedReg(++HiRegToRestore, HiRegsToRestore, AllHighRegsEnd);
+    }
+  }
+
+  // Build the POP for the low registers, but do not insert it yet: it may turn
+  // out to have no operands.
+  MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
+  AddDefaultPred(MIB);
+
+  bool NeedsPop = false;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+
+    // High registers (excluding lr) have already been dealt with
+    if (!(ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR))
+      continue;
+
+    if (Reg == ARM::LR) {
+      if (MBB.succ_empty()) {
+        // Special epilogue for vararg functions. See emitEpilogue
+        if (isVarArg)
+          continue;
+        // ARMv4T requires BX, see emitEpilogue
+        if (!STI.hasV5TOps())
+          continue;
+        Reg = ARM::PC;
+        (*MIB).setDesc(TII.get(ARM::tPOP_RET));
+        // The POP_RET takes over from the instruction at MI. Only touch MI
+        // when it is a real instruction; the end iterator must not be erased.
+        if (MI != MBB.end()) {
+          MIB.copyImplicitOps(*MI);
+          MI = MBB.erase(MI);
+        }
+      } else
+        // LR may only be popped into PC, as part of return sequence.
+        // If this isn't the return sequence, we'll need emitPopSpecialFixUp
+        // to restore LR the hard way.
+        continue;
+    }
+    MIB.addReg(Reg, getDefRegState(true));
+    NeedsPop = true;
+  }
+
+  // It's illegal to emit pop instruction without operands.
+  if (NeedsPop)
+    MBB.insert(MI, &*MIB);
+  else
+    MF.DeleteMachineInstr(MIB);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
new file mode 100644
index 000000000000..9de1ba1d7009
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -0,0 +1,93 @@
+//===-- Thumb1FrameLowering.h - Thumb1-specific frame info stuff --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Thumb1FrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1FRAMELOWERING_H
+#define LLVM_LIB_TARGET_ARM_THUMB1FRAMELOWERING_H
+
+#include "ARMFrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "ThumbRegisterInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class Thumb1FrameLowering : public ARMFrameLowering { // Thumb1 overrides of the ARMFrameLowering hooks.
+public:
+ explicit Thumb1FrameLowering(const ARMSubtarget &sti);
+
+ /// emitPrologue/emitEpilogue - These methods insert prologue and epilogue
+ /// code into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+ /// Check whether or not the given \p MBB can be used as an epilogue
+ /// for the target.
+ /// The epilogue will be inserted before the first terminator of that block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+ /// Disable shrink wrap as tBfar/BL will be used to adjust for long jumps.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return false; // Unconditional: shrink wrapping is never enabled here.
+ }
+
+private:
+ /// Check if the frame lowering of \p MF needs a special fixup
+ /// code sequence for the epilogue.
+ /// Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+ /// to LR, and we can't pop the value directly to the PC when
+ /// we need to update the SP after popping the value. So instead
+ /// we have to emit:
+ /// POP {r3}
+ /// ADD sp, #offset
+ /// BX r3
+ /// If this would clobber a return value, then generate this sequence instead:
+ /// MOV ip, r3
+ /// POP {r3}
+ /// ADD sp, #offset
+ /// MOV lr, r3
+ /// MOV r3, ip
+ /// BX lr
+ bool needPopSpecialFixUp(const MachineFunction &MF) const;
+
+ /// Emit the special fixup code sequence for the epilogue.
+ /// \see needPopSpecialFixUp for more details.
+ /// \p DoIt tells this method whether or not to actually insert
+ /// the code sequence in \p MBB. I.e., when \p DoIt is false,
+ /// \p MBB is left untouched.
+ /// \returns For \p DoIt == true: True when the emission succeeded,
+ /// false otherwise. For \p DoIt == false: True when the emission
+ /// would have been possible, false otherwise.
+ bool emitPopSpecialFixUp(MachineBasicBlock &MBB, bool DoIt) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
new file mode 100644
index 000000000000..4b4fbaab28d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -0,0 +1,129 @@
+//===-- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "Thumb1InstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+/// Build the Thumb-1 instruction info for subtarget \p STI.  The base class
+/// receives the subtarget; the embedded register info is value-initialized.
+Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
+    : ARMBaseInstrInfo(STI),
+      RI() {
+}
+
+/// getNoopForMachoTarget - Populate \p NopInst with the instruction used as a
+/// no-op: a tMOVr from R8 to R8 carrying the always-true (AL) predicate.
+void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  const MCOperand SelfCopyReg = MCOperand::createReg(ARM::R8);
+
+  NopInst.setOpcode(ARM::tMOVr);
+  NopInst.addOperand(SelfCopyReg);                      // Destination.
+  NopInst.addOperand(SelfCopyReg);                      // Source.
+  NopInst.addOperand(MCOperand::createImm(ARMCC::AL));  // Predicate code.
+  NopInst.addOperand(MCOperand::createReg(0));          // Predicate register.
+}
+
+/// getUnindexedOpcode - This implementation ignores \p Opc and always
+/// answers 0, i.e. it never reports an unindexed counterpart.
+unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  const unsigned NoUnindexedForm = 0;
+  return NoUnindexedForm;
+}
+
+/// copyPhysReg - Insert, before \p I, code that copies GPR \p SrcReg into GPR
+/// \p DestReg.  A single tMOVr is used when the subtarget has v6 ops, when the
+/// source is a high register, or when the destination is not a low register;
+/// otherwise the value is bounced through the stack with tPUSH/tPOP.
+void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  const DebugLoc &DL, unsigned DestReg,
+                                  unsigned SrcReg, bool KillSrc) const {
+  // Need to check the arch.
+  const ARMSubtarget &Subtarget =
+      MBB.getParent()->getSubtarget<ARMSubtarget>();
+
+  assert(ARM::GPRRegClass.contains(DestReg, SrcReg) &&
+         "Thumb1 can only copy GPR registers");
+
+  const bool PlainMovIsOk = Subtarget.hasV6Ops() ||
+                            ARM::hGPRRegClass.contains(SrcReg) ||
+                            !ARM::tGPRRegClass.contains(DestReg);
+  if (PlainMovIsOk) {
+    MachineInstrBuilder Mov = BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg);
+    Mov.addReg(SrcReg, getKillRegState(KillSrc));
+    AddDefaultPred(Mov);
+    return;
+  }
+
+  // FIXME: The performance consequences of this are going to be atrocious.
+  // Some things to try that should be better:
+  //   * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11
+  //   * 'movs $dst, $src' if cpsr isn't live
+  // See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html
+
+  // 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it.
+  // The predicate operands go first, followed by the register list.
+  MachineInstrBuilder Push = BuildMI(MBB, I, DL, get(ARM::tPUSH));
+  AddDefaultPred(Push);
+  Push.addReg(SrcReg, getKillRegState(KillSrc));
+
+  MachineInstrBuilder Pop = BuildMI(MBB, I, DL, get(ARM::tPOP));
+  AddDefaultPred(Pop);
+  Pop.addReg(DestReg, getDefRegState(true));
+}
+
+/// storeRegToStackSlot - Insert, before \p I, a tSTRspi that stores \p SrcReg
+/// to frame index \p FI.  Only the tGPR register class or a physical low
+/// register is handled; anything else asserts and, when assertions are
+/// compiled out, emits nothing.
+void Thumb1InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  assert((RC == &ARM::tGPRRegClass ||
+          (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+           isARMLowRegister(SrcReg))) && "Unknown regclass!");
+
+  const bool IsLowGPR = RC == &ARM::tGPRRegClass ||
+                        (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+                         isARMLowRegister(SrcReg));
+  if (!IsLowGPR)
+    return;
+
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Borrow the debug location of the insertion point when there is one.
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+
+  MachineInstrBuilder Store = BuildMI(MBB, I, DL, get(ARM::tSTRspi));
+  Store.addReg(SrcReg, getKillRegState(isKill));
+  Store.addFrameIndex(FI);
+  Store.addImm(0);
+  Store.addMemOperand(MMO);
+  AddDefaultPred(Store);
+}
+
+/// loadRegFromStackSlot - Insert, before \p I, a tLDRspi that reloads
+/// \p DestReg from frame index \p FI.  Only the tGPR register class or a
+/// physical low register is handled; anything else asserts and, when
+/// assertions are compiled out, emits nothing.
+void Thumb1InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  assert((RC == &ARM::tGPRRegClass ||
+          (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+           isARMLowRegister(DestReg))) && "Unknown regclass!");
+
+  const bool IsLowGPR = RC == &ARM::tGPRRegClass ||
+                        (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+                         isARMLowRegister(DestReg));
+  if (!IsLowGPR)
+    return;
+
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Borrow the debug location of the insertion point when there is one.
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+
+  MachineInstrBuilder Load = BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg);
+  Load.addFrameIndex(FI);
+  Load.addImm(0);
+  Load.addMemOperand(MMO);
+  AddDefaultPred(Load);
+}
+
+/// expandLoadStackGuard - Forward to expandLoadStackGuardBase, selecting the
+/// PC-relative literal-load opcode for position-independent code and the
+/// absolute one otherwise; the second opcode is tLDRi in both cases.
+void Thumb1InstrInfo::expandLoadStackGuard(
+    MachineBasicBlock::iterator MI) const {
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  const bool IsPIC = MF.getTarget().isPositionIndependent();
+
+  const unsigned AddrOpc =
+      IsPIC ? ARM::tLDRLIT_ga_pcrel : ARM::tLDRLIT_ga_abs;
+  expandLoadStackGuardBase(MI, AddrOpc, ARM::tLDRi);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
new file mode 100644
index 000000000000..931914ad2799
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -0,0 +1,61 @@
+//===-- Thumb1InstrInfo.h - Thumb-1 Instruction Information -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "ThumbRegisterInfo.h"
+
+namespace llvm {
+ class ARMSubtarget;
+
+class Thumb1InstrInfo : public ARMBaseInstrInfo { // Thumb-1 specialization of ARMBaseInstrInfo.
+ ThumbRegisterInfo RI; // Register info handed out by getRegisterInfo().
+public:
+ explicit Thumb1InstrInfo(const ARMSubtarget &STI);
+
+ /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+private:
+ void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
new file mode 100644
index 000000000000..d01fc8c40ddf
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -0,0 +1,304 @@
+//===-- Thumb2ITBlockPass.cpp - Insert Thumb-2 IT blocks ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "thumb2-it"
+
+STATISTIC(NumITs, "Number of IT blocks inserted");
+STATISTIC(NumMovedInsts, "Number of predicated instructions moved");
+
+namespace {
+  /// Thumb2ITBlockPass - Machine function pass that places a t2IT instruction
+  /// in front of predicated instructions and bundles each resulting IT block
+  /// (see InsertITInstructions).
+  class Thumb2ITBlockPass : public MachineFunctionPass {
+  public:
+    static char ID;
+    Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
+
+    // Per-function state.  runOnMachineFunction assigns all of these before
+    // any block is processed; the default initializers only guarantee that a
+    // freshly constructed pass never holds indeterminate values.
+    bool restrictIT = false;
+    const Thumb2InstrInfo *TII = nullptr;
+    const TargetRegisterInfo *TRI = nullptr;
+    ARMFunctionInfo *AFI = nullptr;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    /// The pass requires that no virtual registers remain.
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override {
+      return "Thumb IT blocks insertion pass";
+    }
+
+  private:
+    /// Decide whether the copy \p MI may be moved in front of the IT block
+    /// being formed for condition \p CC (opposite condition \p OCC), given
+    /// the registers the block defines (\p Defs) and uses (\p Uses) so far.
+    bool MoveCopyOutOfITBlock(MachineInstr *MI,
+                              ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
+                              SmallSet<unsigned, 4> &Defs,
+                              SmallSet<unsigned, 4> &Uses);
+
+    /// Insert IT instructions into \p MBB; returns true if any was inserted.
+    bool InsertITInstructions(MachineBasicBlock &MBB);
+  };
+  char Thumb2ITBlockPass::ID = 0;
+}
+
+/// TrackDefUses - Tracking what registers are being defined and used by
+/// instructions in the IT block. This also tracks "dependencies", i.e. uses
+/// in the IT block that are defined before the IT instruction.
+///
+/// Every register operand of \p MI other than the null register, ITSTATE and
+/// SP is recorded, together with all of its sub-registers, in \p Uses when
+/// the operand is a use and in \p Defs otherwise.  \p TRI supplies the
+/// sub-register information.
+static void TrackDefUses(MachineInstr *MI,
+                         SmallSet<unsigned, 4> &Defs,
+                         SmallSet<unsigned, 4> &Uses,
+                         const TargetRegisterInfo *TRI) {
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP)
+      continue;
+
+    // Inserting into the sets never touches the operand list, so the operand
+    // can be recorded right away instead of being staged in a temporary.
+    SmallSet<unsigned, 4> &Tracked = MO.isUse() ? Uses : Defs;
+    for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+         Subreg.isValid(); ++Subreg)
+      Tracked.insert(*Subreg);
+  }
+}
+
+/// Drop the kill flag from every register use operand of \p MI whose register
+/// is a member of \p Uses.  This may remove more kill flags than strictly
+/// required, but a missing kill flag is safer than a stale one.
+static void ClearKillFlags(MachineInstr *MI, SmallSet<unsigned, 4> &Uses) {
+  for (MachineOperand &Operand : MI->operands()) {
+    const bool IsKilledUse =
+        Operand.isReg() && !Operand.isDef() && Operand.isKill();
+    if (IsKilledUse && Uses.count(Operand.getReg()))
+      Operand.setIsKill(false);
+  }
+}
+
+/// Return true when \p MI is one of the register-move opcodes this pass
+/// treats as a copy: MOVr, MOVr_TC, tMOVr or t2MOVr.
+static bool isCopy(MachineInstr *MI) {
+  const unsigned Opc = MI->getOpcode();
+  return Opc == ARM::MOVr || Opc == ARM::MOVr_TC ||
+         Opc == ARM::tMOVr || Opc == ARM::t2MOVr;
+}
+
+/// Decide whether the copy \p MI should be hoisted in front of the IT block
+/// that is currently being formed for condition \p CC (opposite condition
+/// \p OCC).  \p Defs and \p Uses hold the registers defined and used by the
+/// block so far.  Returns true only when \p MI is a copy, moving it is safe,
+/// it does not define CPSR, and the next non-debug instruction is predicated
+/// on \p CC or \p OCC.
+bool
+Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
+                                        ARMCC::CondCodes CC,
+                                        ARMCC::CondCodes OCC,
+                                        SmallSet<unsigned, 4> &Defs,
+                                        SmallSet<unsigned, 4> &Uses) {
+  if (!isCopy(MI))
+    return false;
+
+  // llvm models select's as two-address instructions. That means a copy
+  // is inserted before a t2MOVccr, etc. If the copy is scheduled in
+  // between selects we would end up creating multiple IT blocks.
+  assert(MI->getOperand(0).getSubReg() == 0 &&
+         MI->getOperand(1).getSubReg() == 0 &&
+         "Sub-register indices still around?");
+
+  const unsigned CopyDst = MI->getOperand(0).getReg();
+  const unsigned CopySrc = MI->getOperand(1).getReg();
+
+  // Hoisting is only safe when the block neither reads the copy's
+  // destination nor writes the copy's source.
+  const bool SafeToHoist = !Uses.count(CopyDst) && !Defs.count(CopySrc);
+  if (!SafeToHoist)
+    return false;
+
+  // If the CPSR is defined by this copy, then we don't want to move it. E.g.,
+  // if we have:
+  //
+  //   movs  r1, r1
+  //   rsb   r1, 0
+  //   movs  r2, r2
+  //   rsb   r2, 0
+  //
+  // we don't want this to be converted to:
+  //
+  //   movs  r1, r1
+  //   movs  r2, r2
+  //   itt   mi
+  //   rsb   r1, 0
+  //   rsb   r2, 0
+  //
+  if (MI->hasOptionalDef()) {
+    const unsigned LastDescOp = MI->getDesc().getNumOperands() - 1;
+    if (MI->getOperand(LastDescOp).getReg() == ARM::CPSR)
+      return false;
+  }
+
+  // Then peek at the next instruction to see if it's predicated on CC or OCC.
+  // If not, then there is nothing to be gained by moving the copy.
+  MachineBasicBlock::iterator Next = MI;
+  const MachineBasicBlock::iterator BlockEnd = MI->getParent()->end();
+  do {
+    ++Next;
+  } while (Next != BlockEnd && Next->isDebugValue());
+
+  if (Next == BlockEnd)
+    return false;
+
+  unsigned NextPredReg = 0;
+  const ARMCC::CondCodes NextCC = getITInstrPredicate(*Next, NextPredReg);
+  return NextCC == CC || NextCC == OCC;
+}
+
+bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { // Puts a t2IT before each predicated run in MBB and bundles it; returns true if any IT was inserted.
+ bool Modified = false;
+
+ SmallSet<unsigned, 4> Defs;
+ SmallSet<unsigned, 4> Uses;
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineInstr *MI = &*MBBI;
+ DebugLoc dl = MI->getDebugLoc();
+ unsigned PredReg = 0;
+ ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
+ if (CC == ARMCC::AL) { // Not conditionally executed: no IT needed, move on.
+ ++MBBI;
+ continue;
+ }
+
+ Defs.clear(); // Def/use tracking is restarted for every new IT block.
+ Uses.clear();
+ TrackDefUses(MI, Defs, Uses, TRI);
+
+ // Insert an IT instruction.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT))
+ .addImm(CC);
+
+ // Add implicit use of ITSTATE to IT block instructions.
+ MI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+ true/*isImp*/, false/*isKill*/));
+
+ MachineInstr *LastITMI = MI;
+ MachineBasicBlock::iterator InsertPos = MIB.getInstr(); // Hoisted copies are inserted in front of the IT.
+ ++MBBI;
+
+ // Form IT block.
+ ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
+ unsigned Mask = 0, Pos = 3; // Pos counts down from bit 3, so at most three more instructions can join the block.
+
+ // v8 IT blocks are limited to one conditional op unless -arm-no-restrict-it
+ // is set: skip the loop
+ if (!restrictIT) {
+ // Branches, including tricky ones like LDM_RET, need to end an IT
+ // block so check the instruction we just put in the block.
+ for (; MBBI != E && Pos &&
+ (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
+ if (MBBI->isDebugValue())
+ continue;
+
+ MachineInstr *NMI = &*MBBI;
+ MI = NMI;
+
+ unsigned NPredReg = 0;
+ ARMCC::CondCodes NCC = getITInstrPredicate(*NMI, NPredReg);
+ if (NCC == CC || NCC == OCC) {
+ Mask |= (NCC & 1) << Pos; // Record the low bit of this instruction's condition in the current slot.
+ // Add implicit use of ITSTATE.
+ NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+ true/*isImp*/, false/*isKill*/));
+ LastITMI = NMI;
+ } else {
+ if (NCC == ARMCC::AL &&
+ MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
+ --MBBI; // Step back first so the loop's ++MBBI lands on the instruction that followed the moved copy.
+ MBB.remove(NMI);
+ MBB.insert(InsertPos, NMI);
+ ClearKillFlags(MI, Uses);
+ ++NumMovedInsts;
+ continue;
+ }
+ break; // Any other instruction ends the IT block.
+ }
+ TrackDefUses(NMI, Defs, Uses, TRI);
+ --Pos;
+ }
+ }
+
+ // Finalize IT mask: set the terminating bit just below the last recorded slot.
+ Mask |= (1 << Pos);
+ // Tag along (firstcond[0] << 4) with the mask.
+ Mask |= (CC & 1) << 4;
+ MIB.addImm(Mask);
+
+ // Last instruction in IT block kills ITSTATE.
+ LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill();
+
+ // Finalize the bundle.
+ finalizeBundle(MBB, InsertPos.getInstrIterator(),
+ ++LastITMI->getIterator());
+
+ Modified = true;
+ ++NumITs;
+ }
+
+ return Modified;
+}
+
+/// Pass entry point.  Does nothing for non-Thumb2 subtargets and for
+/// functions that are not Thumb functions; otherwise runs
+/// InsertITInstructions over every basic block of \p Fn and marks the
+/// function as containing IT blocks if anything was inserted.  Returns true
+/// when the function was modified.
+bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
+  const ARMSubtarget &Subtarget =
+      static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+  if (!Subtarget.isThumb2())
+    return false;
+
+  // Cache the per-function state used by the helpers.
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = static_cast<const Thumb2InstrInfo *>(Subtarget.getInstrInfo());
+  TRI = Subtarget.getRegisterInfo();
+  restrictIT = Subtarget.restrictIT();
+
+  if (!AFI->isThumbFunction())
+    return false;
+
+  bool InsertedAny = false;
+  for (MachineBasicBlock &MBB : Fn)
+    InsertedAny |= InsertITInstructions(MBB);
+
+  if (!InsertedAny)
+    return false;
+
+  AFI->setHasITBlocks(true);
+  return true;
+}
+
+/// createThumb2ITBlockPass - Factory for the Thumb2 IT blocks insertion pass.
+/// The returned object is allocated with new.
+FunctionPass *llvm::createThumb2ITBlockPass() {
+  FunctionPass *Pass = new Thumb2ITBlockPass();
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
new file mode 100644
index 000000000000..1c731d669eda
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -0,0 +1,643 @@
+//===-- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb2InstrInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden, // Hidden boolean command-line option.
+ cl::desc("Use old-style Thumb2 if-conversion heuristics"),
+ cl::init(false)); // Defaults to off.
+
+/// Build the Thumb-2 instruction info for subtarget \p STI.  The base class
+/// receives the subtarget; the embedded register info is value-initialized.
+Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
+    : ARMBaseInstrInfo(STI),
+      RI() {
+}
+
+/// getNoopForMachoTarget - Populate \p NopInst with the instruction used as a
+/// no-op: a tHINT with immediate 0 carrying the always-true (AL) predicate.
+void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  const MCOperand HintImm = MCOperand::createImm(0);
+  const MCOperand PredCode = MCOperand::createImm(ARMCC::AL);
+  const MCOperand PredReg = MCOperand::createReg(0);
+
+  NopInst.setOpcode(ARM::tHINT);
+  NopInst.addOperand(HintImm);
+  NopInst.addOperand(PredCode);
+  NopInst.addOperand(PredReg);
+}
+
+/// getUnindexedOpcode - Currently ignores \p Opc and always answers 0.
+unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  // FIXME
+  const unsigned NoUnindexedForm = 0;
+  return NoUnindexedForm;
+}
+
+void
+Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, // Replaces Tail onward with a branch to NewDest, then repairs a cut-short IT block.
+ MachineBasicBlock *NewDest) const {
+ MachineBasicBlock *MBB = Tail->getParent();
+ ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
+ if (!AFI->hasITBlocks() || Tail->isBranch()) { // No IT fix-up is attempted in these cases; defer to the generic implementation.
+ TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
+ return;
+ }
+
+ // If the first instruction of Tail is predicated, we may have to update
+ // the IT instruction.
+ unsigned PredReg = 0;
+ ARMCC::CondCodes CC = getInstrPredicate(*Tail, PredReg);
+ MachineBasicBlock::iterator MBBI = Tail;
+ if (CC != ARMCC::AL)
+ // Expecting at least the t2IT instruction before it.
+ --MBBI;
+
+ // Actually replace the tail.
+ TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
+
+ // Fix up IT.
+ if (CC != ARMCC::AL) {
+ MachineBasicBlock::iterator E = MBB->begin();
+ unsigned Count = 4; // At most 4 instructions in an IT block.
+ while (Count && MBBI != E) { // Walk backwards from just before the old tail looking for the t2IT.
+ if (MBBI->isDebugValue()) { // Debug values do not count towards the block length.
+ --MBBI;
+ continue;
+ }
+ if (MBBI->getOpcode() == ARM::t2IT) {
+ unsigned Mask = MBBI->getOperand(1).getImm();
+ if (Count == 4) // The IT sat directly before Tail, so no instruction is left under it.
+ MBBI->eraseFromParent();
+ else {
+ unsigned MaskOn = 1 << Count; // New terminating bit for the shortened block.
+ unsigned MaskOff = ~(MaskOn - 1); // Clears every mask bit below the new terminator.
+ MBBI->getOperand(1).setImm((Mask & MaskOff) | MaskOn);
+ }
+ return;
+ }
+ --MBBI;
+ --Count;
+ }
+
+ // Ctrl flow can reach here if branch folding is run before IT block
+ // formation pass.
+ }
+}
+
+/// Return true when \p MBB may be split at \p MBBI.  Debug values are skipped
+/// first; reaching the end of the block while skipping yields false.
+/// Otherwise the split is legal exactly when the instruction found has the
+/// AL IT predicate.
+bool
+Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI) const {
+  const MachineBasicBlock::iterator BlockEnd = MBB.end();
+  while (MBBI->isDebugValue())
+    if (++MBBI == BlockEnd)
+      return false;
+
+  unsigned UnusedPredReg = 0;
+  const ARMCC::CondCodes Pred = getITInstrPredicate(*MBBI, UnusedPredReg);
+  return Pred == ARMCC::AL;
+}
+
+/// copyPhysReg - Insert, before \p I, a copy of \p SrcReg into \p DestReg.
+/// GPR-to-GPR copies become a single predicated tMOVr; everything else is
+/// delegated to ARMBaseInstrInfo.
+void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  const DebugLoc &DL, unsigned DestReg,
+                                  unsigned SrcReg, bool KillSrc) const {
+  if (ARM::GPRRegClass.contains(DestReg, SrcReg)) {
+    MachineInstrBuilder Mov = BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg);
+    Mov.addReg(SrcReg, getKillRegState(KillSrc));
+    AddDefaultPred(Mov);
+    return;
+  }
+
+  // Handle SPR, DPR, and QPR copies.
+  ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc);
+}
+
+/// storeRegToStackSlot - Insert, before \p I, a store of \p SrcReg to frame
+/// index \p FI.  The listed single-GPR classes use t2STRi12, GPR pairs use
+/// t2STRDi8, and every other register class is delegated to
+/// ARMBaseInstrInfo.
+void Thumb2InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+
+  // Borrow the debug location of the insertion point when there is one.
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  const bool IsSingleGPR =
+      RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
+      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+      RC == &ARM::GPRnopcRegClass;
+  if (IsSingleGPR) {
+    MachineInstrBuilder Store = BuildMI(MBB, I, DL, get(ARM::t2STRi12));
+    Store.addReg(SrcReg, getKillRegState(isKill));
+    Store.addFrameIndex(FI);
+    Store.addImm(0);
+    Store.addMemOperand(MMO);
+    AddDefaultPred(Store);
+    return;
+  }
+
+  if (!ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+    ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI);
+    return;
+  }
+
+  // Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for
+  // gsub_0, but needs an extra constraint for gsub_1 (which could be sp
+  // otherwise).
+  if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+    MF.getRegInfo().constrainRegClass(
+        SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+
+  MachineInstrBuilder PairStore = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
+  AddDReg(PairStore, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+  AddDReg(PairStore, SrcReg, ARM::gsub_1, 0, TRI);
+  PairStore.addFrameIndex(FI);
+  PairStore.addImm(0);
+  PairStore.addMemOperand(MMO);
+  AddDefaultPred(PairStore);
+}
+
+/// loadRegFromStackSlot - Insert, before \p I, a reload of \p DestReg from
+/// frame index \p FI.  The listed single-GPR classes use t2LDRi12, GPR pairs
+/// use t2LDRDi8, and every other register class is delegated to
+/// ARMBaseInstrInfo.
+void Thumb2InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+
+  // Borrow the debug location of the insertion point when there is one.
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  const bool IsSingleGPR =
+      RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
+      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+      RC == &ARM::GPRnopcRegClass;
+  if (IsSingleGPR) {
+    MachineInstrBuilder Load =
+        BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg);
+    Load.addFrameIndex(FI);
+    Load.addImm(0);
+    Load.addMemOperand(MMO);
+    AddDefaultPred(Load);
+    return;
+  }
+
+  if (!ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+    ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
+    return;
+  }
+
+  // Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for
+  // gsub_0, but needs an extra constraint for gsub_1 (which could be sp
+  // otherwise).
+  if (TargetRegisterInfo::isVirtualRegister(DestReg))
+    MF.getRegInfo().constrainRegClass(
+        DestReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+
+  MachineInstrBuilder PairLoad = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
+  AddDReg(PairLoad, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+  AddDReg(PairLoad, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+  PairLoad.addFrameIndex(FI);
+  PairLoad.addImm(0);
+  PairLoad.addMemOperand(MMO);
+  AddDefaultPred(PairLoad);
+
+  // A physical pair additionally gets an implicit def of the whole register.
+  if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+    PairLoad.addReg(DestReg, RegState::ImplicitDefine);
+}
+
+/// expandLoadStackGuard - Forward to expandLoadStackGuardBase, selecting
+/// t2MOV_ga_pcrel for position-independent code and t2MOVi32imm otherwise;
+/// the second opcode is t2LDRi12 in both cases.
+void Thumb2InstrInfo::expandLoadStackGuard(
+    MachineBasicBlock::iterator MI) const {
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  const bool IsPIC = MF.getTarget().isPositionIndependent();
+
+  const unsigned AddrOpc = IsPIC ? ARM::t2MOV_ga_pcrel : ARM::t2MOVi32imm;
+  expandLoadStackGuardBase(MI, AddrOpc, ARM::t2LDRi12);
+}
+
+void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, // Emits, before MBBI, instructions computing DestReg = BaseReg + NumBytes.
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII,
+ unsigned MIFlags) {
+ if (NumBytes == 0 && DestReg != BaseReg) { // Zero offset between distinct registers: a plain move is enough.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+ return;
+ }
+
+ bool isSub = NumBytes < 0; // From here on NumBytes is a magnitude; the sign only selects ADD vs SUB.
+ if (isSub) NumBytes = -NumBytes;
+
+ // If profitable, use a movw or movt to materialize the offset.
+ // FIXME: Use the scavenger to grab a scratch register.
+ if (DestReg != ARM::SP && DestReg != BaseReg &&
+ NumBytes >= 4096 &&
+ ARM_AM::getT2SOImmVal(NumBytes) == -1) {
+ bool Fits = false;
+ if (NumBytes < 65536) {
+ // Use a movw to materialize the 16-bit constant.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg)
+ .addImm(NumBytes)
+ .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+ Fits = true;
+ } else if ((NumBytes & 0xffff) == 0) {
+ // Use a movt to materialize the 32-bit constant. NOTE(review): DestReg is read here without a prior write in this function, so its low half looks unspecified - confirm.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg)
+ .addReg(DestReg)
+ .addImm(NumBytes >> 16)
+ .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+ Fits = true;
+ }
+
+ if (Fits) {
+ if (isSub) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
+ .addReg(BaseReg)
+ .addReg(DestReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+ .setMIFlags(MIFlags);
+ } else {
+ // Here we know that DestReg is not SP but we do not
+ // know anything about BaseReg. t2ADDrr is an invalid
+ // instruction if SP is used as the second argument, but
+ // is fine if SP is the first argument. To be sure we
+ // do not generate invalid encoding, put BaseReg first.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
+ .addReg(BaseReg)
+ .addReg(DestReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+ .setMIFlags(MIFlags);
+ }
+ return;
+ }
+ }
+
+ while (NumBytes) { // Each iteration emits one instruction and clears the part of NumBytes it covered.
+ unsigned ThisVal = NumBytes;
+ unsigned Opc = 0;
+ if (DestReg == ARM::SP && BaseReg != ARM::SP) {
+ // mov sp, rn. Note t2MOVr cannot be used.
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),DestReg)
+ .addReg(BaseReg).setMIFlags(MIFlags));
+ BaseReg = ARM::SP;
+ continue;
+ }
+
+ bool HasCCOut = true;
+ if (BaseReg == ARM::SP) {
+ // sub sp, sp, #imm7
+ if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
+ assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+ Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags));
+ NumBytes = 0;
+ continue;
+ }
+
+ // sub rd, sp, so_imm
+ Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+ if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+ NumBytes = 0;
+ } else {
+ // FIXME: Move this to ARMAddressingModes.h?
+ unsigned RotAmt = countLeadingZeros(ThisVal); // Used to keep only the 8 bits starting at the most significant set bit.
+ ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+ NumBytes &= ~ThisVal;
+ assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+ "Bit extraction didn't work?");
+ }
+ } else {
+ assert(DestReg != ARM::SP && BaseReg != ARM::SP);
+ Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+ if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+ NumBytes = 0;
+ } else if (ThisVal < 4096) {
+ Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+ HasCCOut = false; // The ri12 forms are built without the optional CC operand.
+ NumBytes = 0;
+ } else {
+ // FIXME: Move this to ARMAddressingModes.h?
+ unsigned RotAmt = countLeadingZeros(ThisVal); // Used to keep only the 8 bits starting at the most significant set bit.
+ ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+ NumBytes &= ~ThisVal;
+ assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+ "Bit extraction didn't work?");
+ }
+ }
+
+ // Build the new ADD / SUB.
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm(ThisVal)).setMIFlags(MIFlags);
+ if (HasCCOut)
+ AddDefaultCC(MIB);
+
+ BaseReg = DestReg; // Any remaining chunks are applied on top of the partial result.
+ }
+}
+
+/// Map a Thumb2 load/store/preload opcode to its i8 form.  The i12 opcodes
+/// are translated to their i8 counterparts, opcodes already in i8 form are
+/// returned unchanged, and any other opcode yields 0.
+static unsigned
+negativeOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  // Already an i8 opcode: nothing to translate.
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
+    return opcode;
+
+  // i12 opcode: switch to the matching i8 opcode.
+  case ARM::t2LDRi12:
+    return ARM::t2LDRi8;
+  case ARM::t2LDRHi12:
+    return ARM::t2LDRHi8;
+  case ARM::t2LDRBi12:
+    return ARM::t2LDRBi8;
+  case ARM::t2LDRSHi12:
+    return ARM::t2LDRSHi8;
+  case ARM::t2LDRSBi12:
+    return ARM::t2LDRSBi8;
+  case ARM::t2STRi12:
+    return ARM::t2STRi8;
+  case ARM::t2STRBi12:
+    return ARM::t2STRBi8;
+  case ARM::t2STRHi12:
+    return ARM::t2STRHi8;
+  case ARM::t2PLDi12:
+    return ARM::t2PLDi8;
+
+  // Anything else has no i8 form known to this table.
+  default:
+    return 0;
+  }
+}
+
+/// Map a Thumb2 load / store / preload opcode to the form that takes a
+/// positive immediate offset (the i12 variant). An i8 opcode is translated to
+/// its i12 counterpart, an i12 opcode is returned as is, and 0 is returned for
+/// every other opcode.
+static unsigned
+positiveOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDRi12;
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRHi12:
+    return ARM::t2LDRHi12;
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRBi12:
+    return ARM::t2LDRBi12;
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSHi12:
+    return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBi8:
+  case ARM::t2LDRSBi12:
+    return ARM::t2LDRSBi12;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STRi12;
+  case ARM::t2STRBi8:
+  case ARM::t2STRBi12:
+    return ARM::t2STRBi12;
+  case ARM::t2STRHi8:
+  case ARM::t2STRHi12:
+    return ARM::t2STRHi12;
+  case ARM::t2PLDi8:
+  case ARM::t2PLDi12:
+    return ARM::t2PLDi12;
+  default:
+    // No positive-offset form is known for this opcode.
+    return 0;
+  }
+}
+
+/// Map a Thumb2 load / store / preload opcode to an immediate-offset form.
+/// A register-offset ('s') opcode is translated to the matching i12 opcode,
+/// opcodes that already take an immediate (i12 or i8) are returned as is, and
+/// 0 is returned for every other opcode.
+static unsigned
+immediateOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRs:
+  case ARM::t2LDRi12:
+    return ARM::t2LDRi12;
+  case ARM::t2LDRHs:
+  case ARM::t2LDRHi12:
+    return ARM::t2LDRHi12;
+  case ARM::t2LDRBs:
+  case ARM::t2LDRBi12:
+    return ARM::t2LDRBi12;
+  case ARM::t2LDRSHs:
+  case ARM::t2LDRSHi12:
+    return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRSBi12:
+    return ARM::t2LDRSBi12;
+  case ARM::t2STRs:
+  case ARM::t2STRi12:
+    return ARM::t2STRi12;
+  case ARM::t2STRBs:
+  case ARM::t2STRBi12:
+    return ARM::t2STRBi12;
+  case ARM::t2STRHs:
+  case ARM::t2STRHi12:
+    return ARM::t2STRHi12;
+  case ARM::t2PLDs:
+  case ARM::t2PLDi12:
+    return ARM::t2PLDi12;
+
+  // The i8 forms already take an immediate; keep them unchanged.
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
+    return opcode;
+
+  default:
+    // No immediate-offset form is known for this opcode.
+    return 0;
+  }
+}
+
+/// rewriteT2FrameIndex - Fold as much of \p Offset as possible into the
+/// Thumb2 instruction \p MI, whose frame-index operand is at \p FrameRegIdx
+/// and whose offset operand follows it at FrameRegIdx+1.
+///
+/// \param MI          Instruction to rewrite in place.
+/// \param FrameRegIdx Index of the frame-index operand of \p MI.
+/// \param FrameReg    Register substituted for the frame index.
+/// \param Offset      In: offset to add to the instruction's own immediate.
+///                    Out: the signed part that could not be encoded
+///                    (0 when everything was folded).
+/// \param TII         Used to look up descriptors when the opcode changes.
+/// \returns true if the offset was folded completely, i.e. Offset is 0 on
+///          return.
+bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII) {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MI.getDesc();
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ // Set once Offset has been negated so that the magnitude can be encoded.
+ bool isSub = false;
+
+ // Memory operands in inline assembly always use AddrModeT2_i12.
+ if (Opcode == ARM::INLINEASM)
+ AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
+
+ if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
+ // Combine the requested offset with the immediate already in the ADD.
+ Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+ unsigned PredReg;
+ if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+ // Turn it into a move.
+ MI.setDesc(TII.get(ARM::tMOVr));
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ // Remove offset and remaining explicit predicate operands.
+ do MI.RemoveOperand(FrameRegIdx+1);
+ while (MI.getNumOperands() > FrameRegIdx+1);
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+ AddDefaultPred(MIB);
+ return true;
+ }
+
+ // t2ADDri12 carries no cc_out operand; t2ADDri does.
+ bool HasCCOut = Opcode != ARM::t2ADDri12;
+
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ MI.setDesc(TII.get(ARM::t2SUBri));
+ } else {
+ MI.setDesc(TII.get(ARM::t2ADDri));
+ }
+
+ // Common case: small offset, fits into instruction.
+ if (ARM_AM::getT2SOImmVal(Offset) != -1) {
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+ // Add cc_out operand if the original instruction did not have one.
+ if (!HasCCOut)
+ MI.addOperand(MachineOperand::CreateReg(0, false));
+ Offset = 0;
+ return true;
+ }
+ // Another common case: imm12.
+ // Only usable when no cc_out register is set, because the imm12 forms
+ // have no cc_out operand.
+ if (Offset < 4096 &&
+ (!HasCCOut || MI.getOperand(MI.getNumOperands()-1).getReg() == 0)) {
+ unsigned NewOpc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+ MI.setDesc(TII.get(NewOpc));
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+ // Remove the cc_out operand.
+ if (HasCCOut)
+ MI.RemoveOperand(MI.getNumOperands()-1);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, extract 8 adjacent bits from the immediate into this
+ // t2ADDri/t2SUBri.
+ unsigned RotAmt = countLeadingZeros<unsigned>(Offset);
+ unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt);
+
+ // We will handle these bits from offset, clear them.
+ Offset &= ~ThisImmVal;
+
+ assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 &&
+ "Bit extraction didn't work?");
+ // Note that the frame-index operand itself is left in place on this path;
+ // only the immediate is updated.
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+ // Add cc_out operand if the original instruction did not have one.
+ if (!HasCCOut)
+ MI.addOperand(MachineOperand::CreateReg(0, false));
+
+ } else {
+
+ // AddrMode4 and AddrMode6 cannot handle any offset.
+ if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+ return false;
+
+ // AddrModeT2_so cannot handle any offset. If there is no offset
+ // register then we change to an immediate version.
+ unsigned NewOpc = Opcode;
+ if (AddrMode == ARMII::AddrModeT2_so) {
+ unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
+ if (OffsetReg != 0) {
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ return Offset == 0;
+ }
+
+ // Drop the (empty) offset register and reuse the operand that follows it
+ // as a zero immediate.
+ MI.RemoveOperand(FrameRegIdx+1);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0);
+ NewOpc = immediateOffsetOpcode(Opcode);
+ AddrMode = ARMII::AddrModeT2_i12;
+ }
+
+ // NumBits is the width of the immediate field; Scale is the factor by
+ // which the encoded immediate is multiplied.
+ unsigned NumBits = 0;
+ unsigned Scale = 1;
+ if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) {
+ // i8 supports only negative, and i12 supports only positive, so
+ // based on Offset sign convert Opcode to the appropriate
+ // instruction
+ Offset += MI.getOperand(FrameRegIdx+1).getImm();
+ if (Offset < 0) {
+ NewOpc = negativeOffsetOpcode(Opcode);
+ NumBits = 8;
+ isSub = true;
+ Offset = -Offset;
+ } else {
+ NewOpc = positiveOffsetOpcode(Opcode);
+ NumBits = 12;
+ }
+ } else if (AddrMode == ARMII::AddrMode5) {
+ // VFP address mode.
+ const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+ int InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+ if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 4;
+ Offset += InstrOffs * 4;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ }
+ } else if (AddrMode == ARMII::AddrModeT2_i8s4) {
+ // NOTE(review): unlike the branches above, a negative Offset is not
+ // negated here and isSub stays false - confirm this is intended.
+ Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
+ NumBits = 10; // 8 bits scaled by 4
+ // MCInst operand expects already scaled value.
+ Scale = 1;
+ assert((Offset & 3) == 0 && "Can't encode this offset!");
+ } else {
+ llvm_unreachable("Unsupported addressing mode!");
+ }
+
+ if (NewOpc != Opcode)
+ MI.setDesc(TII.get(NewOpc));
+
+ MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
+
+ // Attempt to fold address computation
+ // Common case: small offset, fits into instruction.
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
+ if ((unsigned)Offset <= Mask * Scale) {
+ // Replace the FrameIndex with fp/sp
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode5)
+ // FIXME: Not consistent.
+ ImmedOffset |= 1 << NumBits;
+ else
+ ImmedOffset = -ImmedOffset;
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, offset doesn't fit. Pull in what we can to simplify
+ ImmedOffset = ImmedOffset & Mask;
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode5)
+ // FIXME: Not consistent.
+ ImmedOffset |= 1 << NumBits;
+ else {
+ ImmedOffset = -ImmedOffset;
+ if (ImmedOffset == 0)
+ // Change the opcode back if the encoded offset is zero.
+ MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc)));
+ }
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ // Remove the bits that were just encoded from the remaining offset.
+ Offset &= ~(Mask*Scale);
+ }
+
+ // Hand the unencoded remainder back to the caller with its original sign.
+ Offset = (isSub) ? -Offset : Offset;
+ return Offset == 0;
+}
+
+/// Return the predicate of \p MI as seen by IT-block handling. Conditional
+/// branches (tBcc / t2Bcc) are reported as ARMCC::AL; every other instruction
+/// is forwarded to getInstrPredicate, which also fills in \p PredReg.
+ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
+                                           unsigned &PredReg) {
+  switch (MI.getOpcode()) {
+  case ARM::tBcc:
+  case ARM::t2Bcc:
+    // PredReg is intentionally left untouched for conditional branches.
+    return ARMCC::AL;
+  default:
+    return getInstrPredicate(MI, PredReg);
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
new file mode 100644
index 000000000000..15d63300b6a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -0,0 +1,74 @@
+//===-- Thumb2InstrInfo.h - Thumb-2 Instruction Information -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB2INSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB2INSTRINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "ThumbRegisterInfo.h"
+
+namespace llvm {
+class ARMSubtarget;
+class ScheduleHazardRecognizer;
+
+/// Thumb2InstrInfo - Thumb-2 specialization of ARMBaseInstrInfo. It owns the
+/// ThumbRegisterInfo instance that getRegisterInfo() hands out.
+class Thumb2InstrInfo : public ARMBaseInstrInfo {
+ ThumbRegisterInfo RI;
+public:
+ explicit Thumb2InstrInfo(const ARMSubtarget &STI);
+
+ /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+ void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+ MachineBasicBlock *NewDest) const override;
+
+ bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of the register info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
+
+private:
+ void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
+};
+
+/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
+/// to llvm::getInstrPredicate except it returns AL for conditional branch
+/// instructions which are "predicated", but are not in IT blocks.
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
new file mode 100644
index 000000000000..8208e7e24770
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -0,0 +1,1106 @@
+//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h" // To access Function attributes
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "t2-reduce-size"
+
+// Counters for the three kinds of reduction performed by this pass.
+STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones");
+STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones");
+STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones");
+
+// Hidden command-line limits, defaulting to -1. ReduceLoadStore treats -1 as
+// "no limit" and otherwise stops reducing once NumLdSts reaches
+// ReduceLimitLdSt; the other two limits are presumably applied the same way
+// to NumNarrows and Num2Addrs - confirm at their use sites.
+static cl::opt<int> ReduceLimit("t2-reduce-limit",
+ cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2",
+ cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
+ cl::init(-1), cl::Hidden);
+
+namespace {
+ /// ReduceEntry - One row of ReduceTable, the static table with information
+ /// on mapping from wide opcodes to narrow ones.
+ struct ReduceEntry {
+ uint16_t WideOpc; // Wide opcode
+ uint16_t NarrowOpc1; // Narrow opcode to transform to
+ uint16_t NarrowOpc2; // Narrow opcode when it's two-address
+ uint8_t Imm1Limit; // Limit of immediate field (bits)
+ uint8_t Imm2Limit; // Limit of immediate field when it's two-address
+ unsigned LowRegs1 : 1; // Only possible if low-registers are used
+ unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr)
+ unsigned PredCC1 : 2; // 0 - Sets CPSR if and only if it is not predicated.
+ // 1 - No cc field.
+ // 2 - Always set CPSR.
+ unsigned PredCC2 : 2; // Same encoding as PredCC1, for the two-address form.
+ unsigned PartFlag : 1; // 16-bit instruction does partial flag update
+ unsigned Special : 1; // Needs to be dealt with specially
+ unsigned AvoidMovs: 1; // Avoid movs with shifter operand (for Swift)
+ };
+
+ // One entry per wide opcode; the Thumb2SizeReduce constructor rejects
+ // duplicates. The columns follow the field order of ReduceEntry: WideOpc,
+ // NarrowOpc1, NarrowOpc2, Imm1Limit, Imm2Limit, LowRegs1, LowRegs2,
+ // PredCC1, PredCC2, PartFlag, Special, AvoidMovs. A narrow opcode of 0
+ // means there is no such form.
+ static const ReduceEntry ReduceTable[] = {
+ // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C,PF,S,AM
+ { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0,0,0 },
+ { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0,1,0 },
+ { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0,0,0 },
+ { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 0,1,0 },
+ { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 0,1,0 },
+ { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 1,0,0 },
+ { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 },
+ { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 1,0,1 },
+ { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0,0 },
+ //FIXME: Disable CMN, as CCodes are backwards from compare expectations
+ //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
+ { ARM::t2CMNzrr, ARM::tCMNz, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
+ { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0,0 },
+ { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1,0 },
+ { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0,0 },
+ // FIXME: adr.n immediate offset must be multiple of 4.
+ //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0,0,0 },
+ { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 1,0,1 },
+ { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0,1 },
+ { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 },
+ { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0,1 },
+ { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,0,0 },
+ { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,1,0 },
+ // FIXME: Do we need the 16-bit 'S' variant?
+ { ARM::t2MOVr,ARM::tMOVr, 0, 0, 0, 0, 0, 1,0, 0,0,0 },
+ { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0,0 },
+ { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0,0,0 },
+ { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 1,0,0 },
+ { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0,0,0 },
+ { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0,0,0 },
+ { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0,0,0 },
+ { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 1,0,0 },
+ { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 0,1,0 },
+ { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0,0,0 },
+ { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0,0,0 },
+ { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0,0 },
+ { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0,0 },
+ { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
+ { ARM::t2SXTB, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
+ { ARM::t2SXTH, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
+ { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
+ { ARM::t2UXTB, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
+ { ARM::t2UXTH, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
+
+ // FIXME: Clean this up after splitting each Thumb load / store opcode
+ // into multiple ones.
+ { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDR_POST,ARM::tLDMIA_UPD,0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2STR_POST,ARM::tSTMIA_UPD,0, 0, 0, 1, 0, 0,0, 0,1,0 },
+
+ { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
+ { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 },
+ { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 },
+ // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent.
+ // tSTMIA_UPD is a change in semantics which can only be used if the base
+ // register is killed. This difference is correctly handled elsewhere.
+ { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
+ { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
+ { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 }
+ };
+
+ /// Thumb2SizeReduce - Machine function pass that rewrites 32-bit Thumb2
+ /// instructions into 16-bit ones, driven by ReduceTable.
+ class Thumb2SizeReduce : public MachineFunctionPass {
+ public:
+ static char ID;
+ /// \p Ftor is stored in PredicateFtor.
+ Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);
+
+ const Thumb2InstrInfo *TII;
+ const ARMSubtarget *STI;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // The pass requires that no virtual registers remain.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "Thumb2 instruction size reduction pass";
+ }
+
+ private:
+ /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
+ DenseMap<unsigned, unsigned> ReduceOpcodeMap;
+
+ /// canAddPseudoFlagDep - Return true if narrowing \p Use to a 16-bit
+ /// instruction that partially updates CPSR would add a false dependency on
+ /// the last CPSR-defining instruction.
+ bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop);
+
+ /// VerifyPredAndCC - Check that the predicate and CPSR-setting state of
+ /// \p MI are compatible with the narrow form described by \p Entry. May
+ /// update \p HasCC and \p CCDead.
+ bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+ bool is2Addr, ARMCC::CondCodes Pred,
+ bool LiveCPSR, bool &HasCC, bool &CCDead);
+
+ /// ReduceLoadStore - Attempt to replace a 32-bit load / store with its
+ /// 16-bit form, return true on success.
+ bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry);
+
+ bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop);
+
+ /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
+ /// instruction.
+ bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry, bool LiveCPSR,
+ bool IsSelfLoop);
+
+ /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
+ /// non-two-address instruction.
+ bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry, bool LiveCPSR,
+ bool IsSelfLoop);
+
+ /// ReduceMI - Attempt to reduce MI, return true on success.
+ bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
+ bool LiveCPSR, bool IsSelfLoop);
+
+ /// ReduceMBB - Reduce width of instructions in the specified basic block.
+ bool ReduceMBB(MachineBasicBlock &MBB);
+
+ // Both are initialised to false by the constructor.
+ bool OptimizeSize;
+ bool MinimizeSize;
+
+ // Last instruction to define CPSR in the current block.
+ MachineInstr *CPSRDef;
+ // Was CPSR last defined by a high latency instruction?
+ // When CPSRDef is null, this refers to CPSR defs in predecessors.
+ bool HighLatencyCPSR;
+
+ // Per-basic-block state recorded by the pass.
+ struct MBBInfo {
+ // The flags leaving this block have high latency.
+ bool HighLatencyCPSR;
+ // Has this block been visited yet?
+ bool Visited;
+
+ MBBInfo() : HighLatencyCPSR(false), Visited(false) {}
+ };
+
+ SmallVector<MBBInfo, 8> BlockInfo;
+
+ // Callable handed to the constructor.
+ std::function<bool(const Function &)> PredicateFtor;
+ };
+ char Thumb2SizeReduce::ID = 0;
+}
+
+/// Construct the pass, remember the caller's predicate functor and index
+/// ReduceTable by wide opcode in ReduceOpcodeMap.
+Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
+    : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+  OptimizeSize = false;
+  MinimizeSize = false;
+
+  unsigned Index = 0;
+  for (const ReduceEntry &Entry : ReduceTable) {
+    const unsigned WideOpc = Entry.WideOpc;
+    const bool Inserted =
+        ReduceOpcodeMap.insert(std::make_pair(WideOpc, Index)).second;
+    // Each wide opcode may appear in the table only once.
+    if (!Inserted)
+      llvm_unreachable("Duplicated entries?");
+    ++Index;
+  }
+}
+
+/// Return true if \p MCID lists CPSR among its implicit register definitions.
+static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) {
+  // getImplicitDefs() yields a null pointer, not an empty zero-terminated
+  // list, when the instruction has no implicit defs, so check the pointer
+  // before walking the list.
+  const MCPhysReg *Regs = MCID.getImplicitDefs();
+  if (!Regs)
+    return false;
+  for (; *Regs; ++Regs)
+    if (*Regs == ARM::CPSR)
+      return true;
+  return false;
+}
+
+// Return true if Def is one of the opcodes treated as a likely high-latency
+// definition of the flags (FMSTAT or tMUL).
+static bool isHighLatencyCPSR(MachineInstr *Def) {
+  const unsigned DefOpc = Def->getOpcode();
+  return DefOpc == ARM::FMSTAT || DefOpc == ARM::tMUL;
+}
+
+/// canAddPseudoFlagDep - Decide whether narrowing \p Use into a 16-bit
+/// flag-setting ('s') instruction would introduce a false dependency on the
+/// last instruction that wrote CPSR.
+///
+/// On A9 (and other out-of-order) implementations the 16-bit 's' instructions
+/// only partially update CPSR, so narrowing can tie the instruction to the
+/// previous CPSR writer and hurt the out-of-order engine's ability to rename
+/// registers. If \p Use already reads a register written by that CPSR-defining
+/// instruction there is a genuine read-after-write dependency: the instruction
+/// cannot be retired before the CPSR writer anyway, so no harm is done.
+///
+/// For the sake of compile time this is not a full dependency analysis; only
+/// direct dependencies are recognised. An indirect chain such as
+///   r0 = muls ...
+///   r1 = add.w r0, ...
+///   ...
+///      = mul.w r1
+/// is not detected, even though narrowing the mul.w to muls would have been
+/// fine because of the indirect RAW dependency.
+///
+/// Returns true when narrowing would add such a false dependency.
+bool
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
+  // The check is disabled for -Oz (aka OptimizeForSizeHarder) and for
+  // subtargets that do not ask to avoid partial CPSR updates.
+  if (MinimizeSize || !STI->avoidCPSRPartialUpdate())
+    return false;
+
+  // No CPSR def has been seen in this block. If the block loops back to
+  // itself, conservatively avoid narrowing the first instruction that does a
+  // partial flag update.
+  if (!CPSRDef)
+    return HighLatencyCPSR || FirstInSelfLoop;
+
+  // Collect the registers, other than CPSR, written by the last CPSR def.
+  SmallSet<unsigned, 2> WrittenRegs;
+  for (const MachineOperand &DefMO : CPSRDef->operands()) {
+    if (DefMO.isReg() && !DefMO.isUndef() && !DefMO.isUse()) {
+      const unsigned DefReg = DefMO.getReg();
+      if (DefReg != 0 && DefReg != ARM::CPSR)
+        WrittenRegs.insert(DefReg);
+    }
+  }
+
+  // A read of any of those registers is a real dependency, so narrowing
+  // adds nothing new.
+  for (const MachineOperand &UseMO : Use->operands()) {
+    if (UseMO.isReg() && !UseMO.isUndef() && !UseMO.isDef() &&
+        WrittenRegs.count(UseMO.getReg()))
+      return false;
+  }
+
+  // If the current CPSR has high latency, try to avoid the false dependency.
+  if (HighLatencyCPSR)
+    return true;
+
+  // tMOVi8 usually doesn't start long dependency chains, and there are a lot
+  // of them, so always shrink them when CPSR doesn't have high latency. For
+  // anything else there is no read-after-write dependency and the narrowing
+  // would add a false one.
+  const unsigned UseOpc = Use->getOpcode();
+  return UseOpc != ARM::t2MOVi && UseOpc != ARM::t2MOVi16;
+}
+
+/// VerifyPredAndCC - Check whether the predicate / CPSR-setting state of
+/// \p MI is compatible with the narrow opcode described by \p Entry.
+///
+/// \param is2Addr  Selects Entry.PredCC2 (two-address form) instead of
+///                 Entry.PredCC1.
+/// \param Pred     Predicate of the instruction.
+/// \param LiveCPSR Whether CPSR is currently live.
+/// \param HasCC    In: whether the original instruction sets CPSR.
+///                 Out: set to true when the narrow form will set CPSR.
+/// \param CCDead   Set to true when the CPSR def introduced by narrowing is
+///                 dead.
+/// \returns true if the reduction is allowed.
+bool
+Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+                                  bool is2Addr, ARMCC::CondCodes Pred,
+                                  bool LiveCPSR, bool &HasCC, bool &CCDead) {
+  const unsigned PredCC = is2Addr ? Entry.PredCC2 : Entry.PredCC1;
+
+  switch (PredCC) {
+  case 0:
+    // Predicated, must not set CPSR.
+    if (Pred != ARMCC::AL)
+      return !HasCC;
+
+    // Not predicated, must set CPSR.
+    if (HasCC)
+      return true;
+    if (LiveCPSR)
+      return false;
+    // Original instruction was not setting CPSR, but CPSR is not currently
+    // live anyway. It's ok to set it. The CPSR def is dead though.
+    HasCC = true;
+    CCDead = true;
+    return true;
+
+  case 2:
+    // Old opcode has an optional def of CPSR.
+    if (HasCC)
+      return true;
+    // If old opcode does not implicitly define CPSR, then it's not ok since
+    // these new opcodes' CPSR def is not meant to be thrown away. e.g. CMP.
+    if (!HasImplicitCPSRDef(MI->getDesc()))
+      return false;
+    HasCC = true;
+    return true;
+
+  default:
+    // 16-bit instruction does not set CPSR.
+    return !HasCC;
+  }
+}
+
+/// Return true if every explicit register operand of \p MI is either a low
+/// register or one of the registers exempted for its opcode: PC for
+/// t2LDMIA_RET / t2LDMIA_UPD, LR for t2STMDB_UPD, SP for any of those three,
+/// and SP as operand 1 of t2LDRi12 / t2STRi12. Null registers and CPSR are
+/// always ignored.
+static bool VerifyLowRegs(MachineInstr *MI) {
+  const unsigned Opc = MI->getOpcode();
+  const bool isPCOk = Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA_UPD;
+  const bool isLROk = Opc == ARM::t2STMDB_UPD;
+  const bool isSPOk = isPCOk || isLROk;
+  // Special case for these ldr / str with sp as base register.
+  const bool isSPBaseOk = Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12;
+
+  for (unsigned Idx = 0, NumOps = MI->getNumOperands(); Idx != NumOps; ++Idx) {
+    const MachineOperand &MO = MI->getOperand(Idx);
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+
+    const unsigned Reg = MO.getReg();
+    const bool Exempt =
+        Reg == 0 || Reg == ARM::CPSR ||
+        (isPCOk && Reg == ARM::PC) ||
+        (isLROk && Reg == ARM::LR) ||
+        (Reg == ARM::SP && (isSPOk || (Idx == 1 && isSPBaseOk)));
+    if (!Exempt && !isARMLowRegister(Reg))
+      return false;
+  }
+  return true;
+}
+
+/// ReduceLoadStore - Try to replace the 32-bit load / store \p MI, described
+/// by \p Entry, with a 16-bit instruction inserted before it in \p MBB.
+///
+/// Every bail-out happens before a new instruction is built, so \p MI is left
+/// untouched when false is returned. On success the new instruction receives
+/// the memory operands and MI flags of \p MI, \p MI is erased, NumLdSts is
+/// incremented and true is returned.
+bool
+Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry) {
+ // Honour the optional cap on the number of load / store reductions.
+ if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt))
+ return false;
+
+ // Description of the addressing form, filled in per opcode below.
+ unsigned Scale = 1;
+ bool HasImmOffset = false;
+ bool HasShift = false;
+ bool HasOffReg = true;
+ bool isLdStMul = false;
+ unsigned Opc = Entry.NarrowOpc1;
+ unsigned OpNum = 3; // First 'rest' of operands.
+ uint8_t ImmLimit = Entry.Imm1Limit;
+
+ switch (Entry.WideOpc) {
+ default:
+ llvm_unreachable("Unexpected Thumb2 load / store opcode!");
+ case ARM::t2LDRi12:
+ case ARM::t2STRi12:
+ // With SP as base, use the second narrow opcode and its immediate limit.
+ if (MI->getOperand(1).getReg() == ARM::SP) {
+ Opc = Entry.NarrowOpc2;
+ ImmLimit = Entry.Imm2Limit;
+ }
+
+ Scale = 4;
+ HasImmOffset = true;
+ HasOffReg = false;
+ break;
+ case ARM::t2LDRBi12:
+ case ARM::t2STRBi12:
+ HasImmOffset = true;
+ HasOffReg = false;
+ break;
+ case ARM::t2LDRHi12:
+ case ARM::t2STRHi12:
+ Scale = 2;
+ HasImmOffset = true;
+ HasOffReg = false;
+ break;
+ case ARM::t2LDRs:
+ case ARM::t2LDRBs:
+ case ARM::t2LDRHs:
+ case ARM::t2LDRSBs:
+ case ARM::t2LDRSHs:
+ case ARM::t2STRs:
+ case ARM::t2STRBs:
+ case ARM::t2STRHs:
+ HasShift = true;
+ OpNum = 4;
+ break;
+ case ARM::t2LDR_POST:
+ case ARM::t2STR_POST: {
+ // Only attempted when the function is optimised for minimum size.
+ if (!MBB.getParent()->getFunction()->optForMinSize())
+ return false;
+
+ // Requires exactly one memory operand with an alignment of at least 4.
+ if (!MI->hasOneMemOperand() ||
+ (*MI->memoperands_begin())->getAlignment() < 4)
+ return false;
+
+ // We're creating a completely different type of load/store - LDM from LDR.
+ // For this reason we can't reuse the logic at the end of this function; we
+ // have to implement the MI building here.
+ bool IsStore = Entry.WideOpc == ARM::t2STR_POST;
+ unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
+ unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
+ unsigned Offset = MI->getOperand(3).getImm();
+ unsigned PredImm = MI->getOperand(4).getImm();
+ unsigned PredReg = MI->getOperand(5).getReg();
+ assert(isARMLowRegister(Rt));
+ assert(isARMLowRegister(Rn));
+
+ // Only a post-increment of exactly 4 is converted.
+ if (Offset != 4)
+ return false;
+
+ // Add the 16-bit load / store instruction.
+ DebugLoc dl = MI->getDebugLoc();
+ auto MIB = BuildMI(MBB, MI, dl, TII->get(Entry.NarrowOpc1))
+ .addReg(Rn, RegState::Define)
+ .addReg(Rn)
+ .addImm(PredImm)
+ .addReg(PredReg)
+ .addReg(Rt, IsStore ? 0 : RegState::Define);
+
+ // Transfer memoperands.
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ // Transfer MI flags.
+ MIB.setMIFlags(MI->getFlags());
+
+ // Kill the old instruction.
+ MI->eraseFromBundle();
+ ++NumLdSts;
+ return true;
+ }
+ case ARM::t2LDMIA: {
+ unsigned BaseReg = MI->getOperand(0).getReg();
+ assert(isARMLowRegister(BaseReg));
+
+ // For the non-writeback version (this one), the base register must be
+ // one of the registers being loaded.
+ bool isOK = false;
+ for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+ if (MI->getOperand(i).getReg() == BaseReg) {
+ isOK = true;
+ break;
+ }
+ }
+
+ if (!isOK)
+ return false;
+
+ OpNum = 0;
+ isLdStMul = true;
+ break;
+ }
+ case ARM::t2STMIA: {
+ // If the base register is killed, we don't care what its value is after the
+ // instruction, so we can use an updating STMIA.
+ if (!MI->getOperand(0).isKill())
+ return false;
+
+ break;
+ }
+ case ARM::t2LDMIA_RET: {
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg != ARM::SP)
+ return false;
+ Opc = Entry.NarrowOpc2; // tPOP_RET
+ OpNum = 2;
+ isLdStMul = true;
+ break;
+ }
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD: {
+ OpNum = 0;
+
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == ARM::SP &&
+ (Entry.WideOpc == ARM::t2LDMIA_UPD ||
+ Entry.WideOpc == ARM::t2STMDB_UPD)) {
+ Opc = Entry.NarrowOpc2; // tPOP or tPUSH
+ OpNum = 2;
+ } else if (!isARMLowRegister(BaseReg) ||
+ (Entry.WideOpc != ARM::t2LDMIA_UPD &&
+ Entry.WideOpc != ARM::t2STMIA_UPD)) {
+ return false;
+ }
+
+ isLdStMul = true;
+ break;
+ }
+ }
+
+ // Register-offset forms: remember the offset register and its flags.
+ unsigned OffsetReg = 0;
+ bool OffsetKill = false;
+ bool OffsetInternal = false;
+ if (HasShift) {
+ OffsetReg = MI->getOperand(2).getReg();
+ OffsetKill = MI->getOperand(2).isKill();
+ OffsetInternal = MI->getOperand(2).isInternalRead();
+
+ if (MI->getOperand(3).getImm())
+ // Thumb1 addressing mode doesn't support shift.
+ return false;
+ }
+
+ // Immediate-offset forms: the offset must be a multiple of Scale and fit in
+ // ImmLimit bits once divided by Scale.
+ unsigned OffsetImm = 0;
+ if (HasImmOffset) {
+ OffsetImm = MI->getOperand(2).getImm();
+ unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale;
+
+ if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset)
+ // Make sure the immediate field fits.
+ return false;
+ }
+
+ // Add the 16-bit load / store instruction.
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));
+
+ // tSTMIA_UPD takes a defining register operand. We've already checked that
+ // the register is killed, so mark it as dead here.
+ if (Entry.WideOpc == ARM::t2STMIA)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead);
+
+ if (!isLdStMul) {
+ // Copy operands 0 and 1, then add the rescaled immediate and/or the
+ // offset register.
+ MIB.addOperand(MI->getOperand(0));
+ MIB.addOperand(MI->getOperand(1));
+
+ if (HasImmOffset)
+ MIB.addImm(OffsetImm / Scale);
+
+ assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
+
+ if (HasOffReg)
+ MIB.addReg(OffsetReg, getKillRegState(OffsetKill) |
+ getInternalReadRegState(OffsetInternal));
+ }
+
+ // Transfer the rest of operands.
+ for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
+ MIB.addOperand(MI->getOperand(OpNum));
+
+ // Transfer memoperands.
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ // Transfer MI flags.
+ MIB.setMIFlags(MI->getFlags());
+
+ DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+ MBB.erase_instr(MI);
+ ++NumLdSts;
+ return true;
+}
+
+bool // Reduces an instruction whose table entry needs opcode-specific handling; returns true only when MI was replaced by a 16-bit instruction and erased.
+Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR, bool IsSelfLoop) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == ARM::t2ADDri) {
+ // If the source register is SP, try to reduce to tADDrSPi; otherwise
+ // it's a normal reduce (two-address form first, then the plain narrow form).
+ if (MI->getOperand(1).getReg() != ARM::SP) {
+ if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+ return true;
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+ }
+ // Try to reduce to tADDrSPi.
+ unsigned Imm = MI->getOperand(2).getImm();
+ // The immediate must be in range, the destination register must be a low
+ // reg, the predicate must be "always" and the condition flags must not
+ // be set by the instruction.
+ if (Imm & 3 || Imm > 1020) // must be a multiple of 4 and at most 1020
+ return false;
+ if (!isARMLowRegister(MI->getOperand(0).getReg()))
+ return false;
+ if (MI->getOperand(3).getImm() != ARMCC::AL)
+ return false;
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (MCID.hasOptionalDef() &&
+ MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR) // optional def is the last declared operand
+ return false;
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
+ TII->get(ARM::tADDrSPi))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1))
+ .addImm(Imm / 4); // The tADDrSPi has an implied scale by four.
+ AddDefaultPred(MIB);
+
+ // Transfer MI flags.
+ MIB.setMIFlags(MI->getFlags());
+
+ DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " <<*MIB);
+
+ MBB.erase_instr(MI);
+ ++NumNarrows;
+ return true;
+ }
+
+ if (Entry.LowRegs1 && !VerifyLowRegs(MI))
+ return false;
+
+ if (MI->mayLoadOrStore()) // memory operations have their own reduction routine
+ return ReduceLoadStore(MBB, MI, Entry);
+
+ switch (Opc) {
+ default: break;
+ case ARM::t2ADDSri:
+ case ARM::t2ADDSrr: {
+ unsigned PredReg = 0;
+ if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) { // only unpredicated forms are reduced here
+ switch (Opc) {
+ default: break;
+ case ARM::t2ADDSri: {
+ if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+ return true;
+ LLVM_FALLTHROUGH; // two-address form failed: try the plain narrow form below
+ }
+ case ARM::t2ADDSrr:
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+ }
+ }
+ break;
+ }
+ case ARM::t2RSBri:
+ case ARM::t2RSBSri:
+ case ARM::t2SXTB:
+ case ARM::t2SXTH:
+ case ARM::t2UXTB:
+ case ARM::t2UXTH:
+ if (MI->getOperand(2).getImm() == 0) // reducible only when the immediate operand is zero
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+ break;
+ case ARM::t2MOVi16:
+ // Can convert only 'pure' immediate operands, not immediates obtained as
+ // globals' addresses.
+ if (MI->getOperand(1).isImm())
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+ break;
+ case ARM::t2CMPrr: {
+ // Try to reduce to the lo-reg only version first. Why there are two
+ // versions of the instruction is a mystery.
+ // It would be nice to just have two entries in the master table that
+ // are prioritized, but the table assumes a unique entry for each
+ // source insn opcode. So for now, we hack a local entry record to use.
+ static const ReduceEntry NarrowEntry =
+ { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 };
+ if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop))
+ return true;
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); // fall back to the table's own entry
+ }
+ }
+ return false;
+}
+
+/// Try to replace \p MI with the 16-bit two-address instruction named by
+/// Entry.NarrowOpc2 (destination tied to one of the sources).
+///
+/// The instruction is commuted in place when that is what makes the
+/// destination match a source. Register class and immediate limits from
+/// \p Entry, the predicate, and the CPSR definition are all validated before
+/// anything is emitted.
+///
+/// \param MBB        Block containing \p MI.
+/// \param MI         The 32-bit instruction to reduce.
+/// \param Entry      Reduction table entry describing the narrow form.
+/// \param LiveCPSR   Whether CPSR is live at \p MI (forwarded to
+///                   VerifyPredAndCC).
+/// \param IsSelfLoop Forwarded to canAddPseudoFlagDep.
+/// \returns true if a narrow instruction was built and \p MI erased; false if
+///          no reduction was performed (note that \p MI may still have been
+///          commuted in that case).
+bool
+Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR, bool IsSelfLoop) {
+
+  if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
+    return false;
+
+  if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand())
+    // Don't issue movs with shifter operand for some CPUs unless we
+    // are optimizing for size.
+    return false;
+
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  // t2MUL is "special". The tied source operand is second, not first.
+  if (MI->getOpcode() == ARM::t2MUL) {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    // Early exit if the regs aren't all low regs.
+    if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
+        || !isARMLowRegister(Reg2))
+      return false;
+    if (Reg0 != Reg2) {
+      // If the other operand also isn't the same as the destination, we
+      // can't reduce.
+      if (Reg1 != Reg0)
+        return false;
+      // Try to commute the operands to make it a 2-address instruction.
+      MachineInstr *CommutedMI = TII->commuteInstruction(*MI);
+      if (!CommutedMI)
+        return false;
+    }
+  } else if (Reg0 != Reg1) {
+    // Try to commute the operands to make it a 2-address instruction.
+    unsigned CommOpIdx1 = 1;
+    unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+    if (!TII->findCommutedOpIndices(*MI, CommOpIdx1, CommOpIdx2) ||
+        MI->getOperand(CommOpIdx2).getReg() != Reg0)
+      return false;
+    MachineInstr *CommutedMI =
+        TII->commuteInstruction(*MI, false, CommOpIdx1, CommOpIdx2);
+    if (!CommutedMI)
+      return false;
+  }
+  if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
+    return false;
+  if (Entry.Imm2Limit) {
+    // Imm2Limit is a bit width: the immediate must fit in that many bits.
+    unsigned Imm = MI->getOperand(2).getImm();
+    unsigned Limit = (1 << Entry.Imm2Limit) - 1;
+    if (Imm > Limit)
+      return false;
+  } else {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
+      return false;
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewMCID.isPredicable())
+      // Can't transfer predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewMCID.isPredicable();
+  }
+
+  bool HasCC = false;
+  bool CCDead = false;
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MCID.hasOptionalDef()) {
+    // The optional def is the last declared operand.
+    unsigned NumOps = MCID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Avoid adding a false dependency on partial flag update by some 16-bit
+  // instructions which have the 's' bit set.
+  if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
+      canAddPseudoFlagDep(MI, IsSelfLoop))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewMCID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of operands.
+  unsigned NumOps = MCID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
+      continue;
+    // MI may carry more operands (implicit ones) than the descriptor
+    // declares, so OpInfo must only be indexed below NumOps. This matches
+    // the guard used by the same loop in ReduceToNarrow.
+    bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate());
+    if (SkipPred && isPred)
+      continue;
+    MIB.addOperand(MI->getOperand(i));
+  }
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++Num2Addrs;
+  return true;
+}
+
+bool // Tries to replace MI with the 16-bit (non two-address) instruction Entry.NarrowOpc1; returns true only when MI was replaced and erased.
+Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR, bool IsSelfLoop) {
+ if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) // -1 means "no limit"
+ return false;
+
+ if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand())
+ // Don't issue movs with shifter operand for some CPUs unless we
+ // are optimizing for size.
+ return false;
+
+ unsigned Limit = ~0U; // no immediate limit unless the entry specifies a bit width
+ if (Entry.Imm1Limit)
+ Limit = (1 << Entry.Imm1Limit) - 1;
+
+ const MCInstrDesc &MCID = MI->getDesc();
+ for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) { // validate every declared operand
+ if (MCID.OpInfo[i].isPredicate())
+ continue;
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (!Reg || Reg == ARM::CPSR)
+ continue;
+ if (Entry.LowRegs1 && !isARMLowRegister(Reg))
+ return false;
+ } else if (MO.isImm() &&
+ !MCID.OpInfo[i].isPredicate()) { // predicate operands were already skipped above
+ if (((unsigned)MO.getImm()) > Limit)
+ return false;
+ }
+ }
+
+ // Check if it's possible / necessary to transfer the predicate.
+ const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1);
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+ bool SkipPred = false;
+ if (Pred != ARMCC::AL) {
+ if (!NewMCID.isPredicable())
+ // Can't transfer predicate, fail.
+ return false;
+ } else {
+ SkipPred = !NewMCID.isPredicable(); // drop the "always" predicate if the narrow form cannot carry one
+ }
+
+ bool HasCC = false;
+ bool CCDead = false;
+ if (MCID.hasOptionalDef()) {
+ unsigned NumOps = MCID.getNumOperands();
+ HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); // optional def is the last declared operand
+ if (HasCC && MI->getOperand(NumOps-1).isDead())
+ CCDead = true;
+ }
+ if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
+ return false;
+
+ // Avoid adding a false dependency on partial flag update by some 16-bit
+ // instructions which have the 's' bit set.
+ if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
+ canAddPseudoFlagDep(MI, IsSelfLoop))
+ return false;
+
+ // Add the 16-bit instruction.
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
+ MIB.addOperand(MI->getOperand(0));
+ if (NewMCID.hasOptionalDef()) {
+ if (HasCC)
+ AddDefaultT1CC(MIB, CCDead);
+ else
+ AddNoT1CC(MIB);
+ }
+
+ // Transfer the rest of operands.
+ unsigned NumOps = MCID.getNumOperands();
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+ if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
+ continue;
+ if ((MCID.getOpcode() == ARM::t2RSBSri ||
+ MCID.getOpcode() == ARM::t2RSBri ||
+ MCID.getOpcode() == ARM::t2SXTB ||
+ MCID.getOpcode() == ARM::t2SXTH ||
+ MCID.getOpcode() == ARM::t2UXTB ||
+ MCID.getOpcode() == ARM::t2UXTH) && i == 2)
+ // Skip the zero immediate operand, it's now implicit.
+ continue;
+ bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate()); // OpInfo is only indexed for declared operands
+ if (SkipPred && isPred)
+ continue;
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR)
+ // Skip implicit def of CPSR. Either it's modeled as an optional
+ // def now or it's already an implicit def on the new instruction.
+ continue;
+ MIB.addOperand(MO);
+ }
+ if (!MCID.isPredicable() && NewMCID.isPredicable()) // the narrow form needs predicate operands the wide form never had
+ AddDefaultPred(MIB);
+
+ // Transfer MI flags.
+ MIB.setMIFlags(MI->getFlags());
+
+ DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+ MBB.erase_instr(MI);
+ ++NumNarrows;
+ return true;
+}
+
+/// Scan \p MI for register operands that define CPSR.
+///
+/// \p DefCPSR is set to true if any such (non-undef) def is found; it is
+/// never cleared here. The return value is the CPSR liveness after \p MI:
+/// true if at least one of those defs is not dead, otherwise the incoming
+/// \p LiveCPSR unchanged.
+static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
+  bool SawLiveDef = false;
+  for (const MachineOperand &Op : MI.operands()) {
+    const bool DefinesCPSR = Op.isReg() && !Op.isUndef() && !Op.isUse() &&
+                             Op.getReg() == ARM::CPSR;
+    if (!DefinesCPSR)
+      continue;
+
+    DefCPSR = true;
+    SawLiveDef |= !Op.isDead();
+  }
+
+  return SawLiveDef || LiveCPSR;
+}
+
+/// Scan \p MI for register operands that read CPSR and return the updated
+/// liveness: false as soon as a use marked as a kill is found, otherwise the
+/// incoming \p LiveCPSR. Any (non-undef) use asserts that CPSR was live.
+static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isReg() && !Op.isUndef() && !Op.isDef() &&
+        Op.getReg() == ARM::CPSR) {
+      assert(LiveCPSR && "CPSR liveness tracking is wrong!");
+      // A killing use ends the live range; nothing after it can matter.
+      if (Op.isKill())
+        return false;
+    }
+  }
+
+  return LiveCPSR;
+}
+
+/// Attempt to shrink a single instruction using the reduction table.
+///
+/// Looks up the opcode of \p MI; instructions without a table entry are left
+/// alone. Entries flagged Special are handed to ReduceSpecial. Otherwise the
+/// two-address narrow form is tried first (when the entry has one), then the
+/// plain narrow form. Returns true if \p MI was reduced.
+bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
+                                bool LiveCPSR, bool IsSelfLoop) {
+  auto TableIt = ReduceOpcodeMap.find(MI->getOpcode());
+  if (TableIt == ReduceOpcodeMap.end())
+    return false;
+  const ReduceEntry &Entry = ReduceTable[TableIt->second];
+
+  // "Special" cases bypass the normal reductions entirely.
+  if (Entry.Special)
+    return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+
+  // First choice: a 16-bit two-address instruction.
+  const bool ReducedTo2Addr =
+      Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+  if (ReducedTo2Addr)
+    return true;
+
+  // Second choice: a 16-bit non-two-address instruction.
+  return Entry.NarrowOpc1 &&
+         ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+}
+
+bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { // Runs ReduceMI over every instruction of MBB while tracking CPSR liveness; returns true if anything was reduced.
+ bool Modified = false;
+
+ // Yes, CPSR could be livein.
+ bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
+ MachineInstr *BundleMI = nullptr; // most recent BUNDLE header instruction seen in this block
+
+ CPSRDef = nullptr;
+ HighLatencyCPSR = false;
+
+ // Check predecessors for the latest CPSRDef.
+ for (auto *Pred : MBB.predecessors()) {
+ const MBBInfo &PInfo = BlockInfo[Pred->getNumber()];
+ if (!PInfo.Visited) {
+ // Since blocks are visited in RPO, this must be a back-edge.
+ continue;
+ }
+ if (PInfo.HighLatencyCPSR) { // one such predecessor is enough
+ HighLatencyCPSR = true;
+ break;
+ }
+ }
+
+ // If this BB loops back to itself, conservatively avoid narrowing the
+ // first instruction that does partial flag update.
+ bool IsSelfLoop = MBB.isSuccessor(&MBB);
+ MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end();
+ MachineBasicBlock::instr_iterator NextMII;
+ for (; MII != E; MII = NextMII) {
+ NextMII = std::next(MII); // computed up front because ReduceMI may erase the current instruction
+
+ MachineInstr *MI = &*MII;
+ if (MI->isBundle()) {
+ BundleMI = MI;
+ continue;
+ }
+ if (MI->isDebugValue())
+ continue;
+
+ LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
+
+ // Does NextMII belong to the same bundle as MI?
+ bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred();
+
+ if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) {
+ Modified = true;
+ MachineBasicBlock::instr_iterator I = std::prev(NextMII); // the instruction now sitting just before NextMII is the replacement
+ MI = &*I;
+ // Removing and reinserting the first instruction in a bundle will break
+ // up the bundle. Fix the bundling if it was broken.
+ if (NextInSameBundle && !NextMII->isBundledWithPred())
+ NextMII->bundleWithPred();
+ }
+
+ if (BundleMI && !NextInSameBundle && MI->isInsideBundle()) {
+ // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill
+ // marker is only on the BUNDLE instruction. Process the BUNDLE
+ // instruction as we finish with the bundled instruction to work around
+ // the inconsistency.
+ if (BundleMI->killsRegister(ARM::CPSR))
+ LiveCPSR = false;
+ MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR);
+ if (MO && !MO->isDead())
+ LiveCPSR = true;
+ MO = BundleMI->findRegisterUseOperand(ARM::CPSR);
+ if (MO && !MO->isKill())
+ LiveCPSR = true;
+ }
+
+ bool DefCPSR = false;
+ LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
+ if (MI->isCall()) {
+ // Calls don't really set CPSR.
+ CPSRDef = nullptr;
+ HighLatencyCPSR = false;
+ IsSelfLoop = false;
+ } else if (DefCPSR) {
+ // This is the last CPSR defining instruction.
+ CPSRDef = MI;
+ HighLatencyCPSR = isHighLatencyCPSR(CPSRDef);
+ IsSelfLoop = false;
+ }
+ }
+
+ MBBInfo &Info = BlockInfo[MBB.getNumber()]; // publish this block's exit state for its successors
+ Info.HighLatencyCPSR = HighLatencyCPSR;
+ Info.Visited = true;
+ return Modified;
+}
+
+/// Pass entry point: reduce every basic block of \p MF.
+///
+/// Does nothing (returns false) when the optional predicate functor rejects
+/// the function, or when the subtarget is Thumb1-only or prefers 32-bit
+/// Thumb encodings. Otherwise returns true if any block was modified.
+bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
+  if (PredicateFtor && !PredicateFtor(*MF.getFunction()))
+    return false;
+
+  STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
+  const bool SkipSubtarget = STI->isThumb1Only() || STI->prefers32BitThumb();
+  if (SkipSubtarget)
+    return false;
+
+  TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo());
+
+  // Record the size-optimization level; minimizing size implies optimizing
+  // for size.
+  OptimizeSize = MF.getFunction()->optForSize();
+  MinimizeSize = MF.getFunction()->optForMinSize();
+
+  // Start from a fresh per-block record for every block ID.
+  BlockInfo.clear();
+  BlockInfo.resize(MF.getNumBlockIDs());
+
+  // Walk the blocks in reverse post-order, so that a block's predecessors
+  // have been processed before it (back-edges excepted).
+  bool Changed = false;
+  ReversePostOrderTraversal<MachineFunction*> Order(&MF);
+  for (MachineBasicBlock *Block : Order)
+    Changed |= ReduceMBB(*Block);
+  return Changed;
+}
+
+/// Factory for the Thumb2 size reduction pass. \p Ftor is moved into the
+/// newly allocated pass object, which is returned to the caller.
+FunctionPass *llvm::createThumb2SizeReductionPass(
+    std::function<bool(const Function &)> Ftor) {
+  auto *Pass = new Thumb2SizeReduce(std::move(Ftor));
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
new file mode 100644
index 000000000000..2efd63b84a2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -0,0 +1,625 @@
+//===-- ThumbRegisterInfo.cpp - Thumb-1 Register Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ThumbRegisterInfo.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+extern cl::opt<bool> ReuseFrameIndexVals; // boolean command-line option, only declared here (extern); its definition is not in this part of the file
+}
+
+using namespace llvm;
+
+ThumbRegisterInfo::ThumbRegisterInfo() : ARMBaseRegisterInfo() {} // default-constructs the ARMBaseRegisterInfo base; no additional state is initialized here
+
+/// On Thumb1-only subtargets, any class that is tGPR or a sub-class of it is
+/// widened to tGPR. Every other case is delegated to ARMBaseRegisterInfo.
+const TargetRegisterClass *
+ThumbRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                             const MachineFunction &MF) const {
+  const bool IsThumb1 = MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  if (IsThumb1 && ARM::tGPRRegClass.hasSubClassEq(RC))
+    return &ARM::tGPRRegClass;
+
+  return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC, MF);
+}
+
+/// Pointers live in tGPR on Thumb1-only subtargets; otherwise the choice is
+/// delegated to ARMBaseRegisterInfo.
+const TargetRegisterClass *
+ThumbRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                      unsigned Kind) const {
+  const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
+  if (Subtarget.isThumb1Only())
+    return &ARM::tGPRRegClass;
+
+  return ARMBaseRegisterInfo::getPointerRegClass(MF, Kind);
+}
+
+/// Insert, before \p MBBI, a tLDRpci that loads the 32-bit constant \p Val
+/// from the function's constant pool into \p DestReg (sub-register index
+/// \p SubIdx), using predicate \p Pred / \p PredReg and flags \p MIFlags.
+static void emitThumb1LoadConstPool(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    const DebugLoc &dl, unsigned DestReg,
+                                    unsigned SubIdx, int Val,
+                                    ARMCC::CondCodes Pred, unsigned PredReg,
+                                    unsigned MIFlags) {
+  MachineFunction &Func = *MBB.getParent();
+  const TargetInstrInfo &InstrInfo =
+      *Func.getSubtarget<ARMSubtarget>().getInstrInfo();
+
+  // Create (or reuse) a 4-byte-aligned i32 constant pool entry for Val.
+  LLVMContext &Ctx = Func.getFunction()->getContext();
+  const Constant *ImmConst = ConstantInt::get(Type::getInt32Ty(Ctx), Val);
+  unsigned PoolIdx = Func.getConstantPool()->getConstantPoolIndex(ImmConst, 4);
+
+  MachineInstrBuilder Load =
+      BuildMI(MBB, MBBI, dl, InstrInfo.get(ARM::tLDRpci));
+  Load.addReg(DestReg, getDefRegState(true), SubIdx);
+  Load.addConstantPoolIndex(PoolIdx);
+  Load.addImm(Pred);
+  Load.addReg(PredReg);
+  Load.setMIFlags(MIFlags);
+}
+
+/// Insert, before \p MBBI, a t2LDRpci that loads the 32-bit constant \p Val
+/// from the function's constant pool into \p DestReg (sub-register index
+/// \p SubIdx) with flags \p MIFlags.
+///
+/// NOTE(review): \p Pred and \p PredReg are accepted but not used; the load
+/// is always emitted with the ARMCC::AL predicate and predicate register 0.
+/// Confirm that this is intentional.
+static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    const DebugLoc &dl, unsigned DestReg,
+                                    unsigned SubIdx, int Val,
+                                    ARMCC::CondCodes Pred, unsigned PredReg,
+                                    unsigned MIFlags) {
+  MachineFunction &Func = *MBB.getParent();
+  const TargetInstrInfo &InstrInfo = *Func.getSubtarget().getInstrInfo();
+
+  // Create (or reuse) a 4-byte-aligned i32 constant pool entry for Val.
+  LLVMContext &Ctx = Func.getFunction()->getContext();
+  const Constant *ImmConst = ConstantInt::get(Type::getInt32Ty(Ctx), Val);
+  unsigned PoolIdx = Func.getConstantPool()->getConstantPoolIndex(ImmConst, 4);
+
+  MachineInstrBuilder Load =
+      BuildMI(MBB, MBBI, dl, InstrInfo.get(ARM::t2LDRpci));
+  Load.addReg(DestReg, getDefRegState(true), SubIdx);
+  Load.addConstantPoolIndex(PoolIdx);
+  Load.addImm((int64_t)ARMCC::AL);
+  Load.addReg(0);
+  Load.setMIFlags(MIFlags);
+}
+
+/// Materialize the immediate \p Val in \p DestReg by loading it from the
+/// constant pool. Dispatches to the Thumb1 or Thumb2 emitter depending on the
+/// subtarget; on Thumb1 the destination must be a low or virtual register.
+void ThumbRegisterInfo::emitLoadConstPool(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+    const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
+    ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
+  const ARMSubtarget &Subtarget =
+      MBB.getParent()->getSubtarget<ARMSubtarget>();
+
+  if (!Subtarget.isThumb1Only()) {
+    emitThumb2LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
+                            PredReg, MIFlags);
+    return;
+  }
+
+  assert((isARMLowRegister(DestReg) || isVirtualRegister(DestReg)) &&
+         "Thumb1 does not have ldr to high register");
+  emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
+                          PredReg, MIFlags);
+}
+
+/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
+/// destreg = basereg + immediate in Thumb code. The immediate is first placed
+/// in a register (mov / mov+rsb, a 32-bit immediate move for execute-only
+/// code, or a constpool load) and then added to or subtracted from the base.
+/// CanChangeCC says whether the emitted sequence may modify the condition flags.
+static void emitThumbRegPlusImmInReg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned BaseReg, int NumBytes,
+ bool CanChangeCC, const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) {
+ MachineFunction &MF = *MBB.getParent();
+ const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
+ bool isHigh = !isARMLowRegister(DestReg) ||
+ (BaseReg != 0 && !isARMLowRegister(BaseReg));
+ bool isSub = false;
+ // Subtract doesn't have high register version. Load the negative value
+ // if either base or dest register is a high register. Also, do not
+ // issue sub as part of the sequence if condition register is to be
+ // preserved.
+ if (NumBytes < 0 && !isHigh && CanChangeCC) {
+ isSub = true;
+ NumBytes = -NumBytes;
+ }
+ unsigned LdReg = DestReg; // register that will hold the materialized immediate
+ if (DestReg == ARM::SP)
+ assert(BaseReg == ARM::SP && "Unexpected!");
+ if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
+ LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); // physical non-low destination: use a fresh tGPR virtual register instead
+
+ if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes)
+ .setMIFlags(MIFlags);
+ } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes) // NOTE(review): NumBytes is still negative on this path; confirm tMOVi8 + tRSB expects the signed value rather than its magnitude
+ .setMIFlags(MIFlags);
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+ .addReg(LdReg, RegState::Kill)
+ .setMIFlags(MIFlags);
+ } else if (ST.genExecuteOnly()) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), LdReg)
+ .addImm(NumBytes).setMIFlags(MIFlags);
+ } else
+ MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes, ARMCC::AL, 0,
+ MIFlags);
+
+ // Emit add / sub.
+ int Opc = (isSub) ? ARM::tSUBrr
+ : ((isHigh || !CanChangeCC) ? ARM::tADDhirr : ARM::tADDrr);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (Opc != ARM::tADDhirr) // tADDhirr is the only opcode here built without the T1 CC operand
+ MIB = AddDefaultT1CC(MIB);
+ if (DestReg == ARM::SP || isSub)
+ MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
+ else
+ MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
+ AddDefaultPred(MIB);
+}
+
+/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
+/// destreg = basereg + immediate in Thumb code. Tries a series of ADDs or
+/// SUBs first, and falls back to emitThumbRegPlusImmInReg if the instruction
+/// sequence would be too long. This is allowed to modify the condition flags.
+void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo &MRI,
+ unsigned MIFlags) {
+ bool isSub = NumBytes < 0;
+ unsigned Bytes = (unsigned)NumBytes; // magnitude of the adjustment still to be emitted
+ if (isSub) Bytes = -NumBytes;
+
+ int CopyOpc = 0; // 0 means "no copy instruction"
+ unsigned CopyBits = 0; // immediate field width of the copy instruction
+ unsigned CopyScale = 1; // multiplier applied to the copy instruction's immediate
+ bool CopyNeedsCC = false;
+ int ExtraOpc = 0; // 0 means "no in-place add/sub available"
+ unsigned ExtraBits = 0;
+ unsigned ExtraScale = 1;
+ bool ExtraNeedsCC = false;
+
+ // Strategy:
+ // We need to select two types of instruction, maximizing the available
+ // immediate range of each. The instructions we use will depend on whether
+ // DestReg and BaseReg are low, high or the stack pointer.
+ // * CopyOpc - DestReg = BaseReg + imm
+ // This will be emitted once if DestReg != BaseReg, and never if
+ // DestReg == BaseReg.
+ // * ExtraOpc - DestReg = DestReg + imm
+ // This will be emitted as many times as necessary to add the
+ // full immediate.
+ // If the immediate ranges of these instructions are not large enough to cover
+ // NumBytes with a reasonable number of instructions, we fall back to using a
+ // value loaded from a constant pool.
+ if (DestReg == ARM::SP) {
+ if (BaseReg == ARM::SP) {
+ // sp -> sp
+ // Already in right reg, no copy needed
+ } else {
+ // low -> sp or high -> sp
+ CopyOpc = ARM::tMOVr;
+ CopyBits = 0;
+ }
+ ExtraOpc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+ ExtraBits = 7;
+ ExtraScale = 4;
+ } else if (isARMLowRegister(DestReg)) {
+ if (BaseReg == ARM::SP) {
+ // sp -> low
+ assert(!isSub && "Thumb1 does not have tSUBrSPi");
+ CopyOpc = ARM::tADDrSPi;
+ CopyBits = 8;
+ CopyScale = 4;
+ } else if (DestReg == BaseReg) {
+ // low -> same low
+ // Already in right reg, no copy needed
+ } else if (isARMLowRegister(BaseReg)) {
+ // low -> different low
+ CopyOpc = isSub ? ARM::tSUBi3 : ARM::tADDi3;
+ CopyBits = 3;
+ CopyNeedsCC = true;
+ } else {
+ // high -> low
+ CopyOpc = ARM::tMOVr;
+ CopyBits = 0;
+ }
+ ExtraOpc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+ ExtraBits = 8;
+ ExtraNeedsCC = true;
+ } else /* DestReg is high */ {
+ if (DestReg == BaseReg) {
+ // high -> same high
+ // Already in right reg, no copy needed
+ } else {
+ // {low,high,sp} -> high
+ CopyOpc = ARM::tMOVr;
+ CopyBits = 0;
+ }
+ ExtraOpc = 0;
+ }
+
+ // We could handle an unaligned immediate with an unaligned copy instruction
+ // and an aligned extra instruction, but this case is not currently needed.
+ assert(((Bytes & 3) == 0 || ExtraScale == 1) &&
+ "Unaligned offset, but all instructions require alignment");
+
+ unsigned CopyRange = ((1 << CopyBits) - 1) * CopyScale; // largest amount the copy instruction can add
+ // If we would emit the copy with an immediate of 0, just use tMOVr.
+ if (CopyOpc && Bytes < CopyScale) {
+ CopyOpc = ARM::tMOVr;
+ CopyScale = 1;
+ CopyNeedsCC = false;
+ CopyRange = 0;
+ }
+ unsigned ExtraRange = ((1 << ExtraBits) - 1) * ExtraScale; // per instruction
+ unsigned RequiredCopyInstrs = CopyOpc ? 1 : 0;
+ unsigned RangeAfterCopy = (CopyRange > Bytes) ? 0 : (Bytes - CopyRange); // amount left for the extra instructions
+
+ // We could handle this case when the copy instruction does not require an
+ // aligned immediate, but we do not currently do this.
+ assert(RangeAfterCopy % ExtraScale == 0 &&
+ "Extra instruction requires immediate to be aligned");
+
+ unsigned RequiredExtraInstrs;
+ if (ExtraRange)
+ RequiredExtraInstrs = alignTo(RangeAfterCopy, ExtraRange) / ExtraRange;
+ else if (RangeAfterCopy > 0)
+ // We need an extra instruction but none is available; use a count large
+ // enough to force the fallback below.
+ RequiredExtraInstrs = 1000000;
+ else
+ RequiredExtraInstrs = 0;
+ unsigned RequiredInstrs = RequiredCopyInstrs + RequiredExtraInstrs;
+ unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2; // SP destinations tolerate one more instruction before falling back
+
+ // Use a constant pool, if the sequence of ADDs/SUBs is too expensive.
+ if (RequiredInstrs > Threshold) {
+ emitThumbRegPlusImmInReg(MBB, MBBI, dl,
+ DestReg, BaseReg, NumBytes, true,
+ TII, MRI, MIFlags);
+ return;
+ }
+
+ // Emit zero or one copy instructions
+ if (CopyOpc) {
+ unsigned CopyImm = std::min(Bytes, CopyRange) / CopyScale;
+ Bytes -= CopyImm * CopyScale;
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(CopyOpc), DestReg);
+ if (CopyNeedsCC)
+ MIB = AddDefaultT1CC(MIB);
+ MIB.addReg(BaseReg, RegState::Kill);
+ if (CopyOpc != ARM::tMOVr) { // tMOVr takes no immediate
+ MIB.addImm(CopyImm);
+ }
+ AddDefaultPred(MIB.setMIFlags(MIFlags));
+
+ BaseReg = DestReg; // later instructions operate on DestReg in place
+ }
+
+ // Emit zero or more in-place add/sub instructions
+ while (Bytes) {
+ unsigned ExtraImm = std::min(Bytes, ExtraRange) / ExtraScale;
+ Bytes -= ExtraImm * ExtraScale;
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg);
+ if (ExtraNeedsCC)
+ MIB = AddDefaultT1CC(MIB);
+ MIB.addReg(BaseReg).addImm(ExtraImm);
+ MIB = AddDefaultPred(MIB);
+ MIB.setMIFlags(MIFlags);
+ }
+}
+
+/// Remove every operand of \p MI from index \p i through the last one.
+/// Operands shift down after each removal, so the same index is removed
+/// repeatedly.
+static void removeOperands(MachineInstr &MI, unsigned i) {
+  const unsigned FirstIdx = i;
+  for (unsigned Remaining = MI.getNumOperands() - FirstIdx; Remaining != 0;
+       --Remaining)
+    MI.RemoveOperand(FirstIdx);
+}
+
+/// Map an SP-relative load/store opcode to its general-register counterpart
+/// (tLDRspi -> tLDRi, tSTRspi -> tSTRi), for use when a frame index is being
+/// replaced with a non-SP register. Any other opcode is returned unchanged.
+static unsigned convertToNonSPOpcode(unsigned Opcode) {
+  if (Opcode == ARM::tLDRspi)
+    return ARM::tLDRi;
+  if (Opcode == ARM::tSTRspi)
+    return ARM::tSTRi;
+  return Opcode;
+}
+
+bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, // Rewrites the frame-index operand at FrameRegIdx to FrameReg + Offset; Offset is in/out and receives whatever could not be folded. Returns true when nothing is left to materialize.
+ unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII) const {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ assert(MBB.getParent()->getSubtarget<ARMSubtarget>().isThumb1Only() &&
+ "This isn't needed for thumb2!");
+ DebugLoc dl = MI.getDebugLoc();
+ MachineInstrBuilder MIB(*MBB.getParent(), &MI);
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MI.getDesc();
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+
+ if (Opcode == ARM::tADDframe) {
+ Offset += MI.getOperand(FrameRegIdx+1).getImm(); // fold in the instruction's own immediate
+ unsigned DestReg = MI.getOperand(0).getReg();
+
+ emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
+ *this);
+ MBB.erase(II); // the pseudo is fully replaced by the emitted sequence
+ return true;
+ } else {
+ if (AddrMode != ARMII::AddrModeT1_s)
+ llvm_unreachable("Unsupported addressing mode!");
+
+ unsigned ImmIdx = FrameRegIdx + 1; // the immediate operand directly follows the frame-index operand
+ int InstrOffs = MI.getOperand(ImmIdx).getImm();
+ unsigned NumBits = (FrameReg == ARM::SP) ? 8 : 5; // immediate field width: 8 bits when based on SP, otherwise 5
+ unsigned Scale = 4;
+
+ Offset += InstrOffs * Scale;
+ assert((Offset & (Scale - 1)) == 0 && "Can't encode this offset!");
+
+ // Common case: small offset, fits into instruction.
+ MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
+
+ if ((unsigned)Offset <= Mask * Scale) { // the unsigned compare also rejects negative offsets
+ // Replace the FrameIndex with the frame register (e.g., sp).
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ ImmOp.ChangeToImmediate(ImmedOffset);
+
+ // If we're using a register where sp was stored, convert the instruction
+ // to the non-SP version.
+ unsigned NewOpc = convertToNonSPOpcode(Opcode);
+ if (NewOpc != Opcode && FrameReg != ARM::SP)
+ MI.setDesc(TII.get(NewOpc));
+
+ return true;
+ }
+
+ NumBits = 5;
+ Mask = (1 << NumBits) - 1;
+
+ // If this is a thumb spill / restore, we will be using a constpool load to
+ // materialize the offset.
+ if (Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
+ ImmOp.ChangeToImmediate(0);
+ } else {
+ // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+ ImmedOffset = ImmedOffset & Mask;
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset &= ~(Mask * Scale); // keep only the part that did not fit in the immediate field
+ }
+ }
+
+ return Offset == 0;
+}
+
+/// Replace the frame-index operand of \p MI with \p BaseReg plus \p Offset.
+///
+/// Non-Thumb1 subtargets are delegated to ARMBaseRegisterInfo. For Thumb1 the
+/// first frame-index operand is located and rewritten via rewriteFrameIndex,
+/// which is asserted to succeed completely.
+void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                          int64_t Offset) const {
+  const ARMSubtarget &Subtarget =
+      MI.getParent()->getParent()->getSubtarget<ARMSubtarget>();
+  if (!Subtarget.isThumb1Only()) {
+    ARMBaseRegisterInfo::resolveFrameIndex(MI, BaseReg, Offset);
+    return;
+  }
+
+  const ARMBaseInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
+  // ARM doesn't need the general 64-bit offsets.
+  int NarrowOffset = static_cast<int>(Offset);
+
+  // Find the first operand that is a frame index.
+  unsigned FIOpIdx = 0;
+  for (;;) {
+    if (MI.getOperand(FIOpIdx).isFI())
+      break;
+    ++FIOpIdx;
+    assert(FIOpIdx < MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  bool Resolved = rewriteFrameIndex(MI, FIOpIdx, BaseReg, NarrowOffset,
+                                    InstrInfo);
+  assert (Resolved && "Unable to resolve frame index!");
+  (void)Resolved;
+}
+
+/// saveScavengerRegister - Save Reg so it can be used by the register
+/// scavenger. On Thumb1 the value is parked in R12 and restored before UseMI (UseMI may be moved earlier). Return true.
+bool ThumbRegisterInfo::saveScavengerRegister(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC,
+ unsigned Reg) const {
+
+ const ARMSubtarget &STI = MBB.getParent()->getSubtarget<ARMSubtarget>();
+ if (!STI.isThumb1Only())
+ return ARMBaseRegisterInfo::saveScavengerRegister(MBB, I, UseMI, RC, Reg);
+
+ // Thumb1 can't use the emergency spill slot on the stack because
+ // ldr/str immediate offsets must be positive, and if we're referencing
+ // off the frame pointer (if, for example, there are alloca() calls in
+ // the function), the offset will be negative. Use R12 instead since that's
+ // a call clobbered register that we know won't be used in Thumb1 mode.
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ DebugLoc DL;
+ AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
+ .addReg(ARM::R12, RegState::Define)
+ .addReg(Reg, RegState::Kill));
+
+ // The UseMI is where we would like to restore the register. If there's
+ // interference with R12 before then, however, we'll need to restore it
+ // before that instead and adjust the UseMI.
+ bool done = false;
+ for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
+ if (II->isDebugValue())
+ continue;
+ // If this instruction affects R12, adjust our restore point.
+ for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = II->getOperand(i);
+ if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) { // a register mask that clobbers R12
+ UseMI = II;
+ done = true;
+ break;
+ }
+ if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ if (MO.getReg() == ARM::R12) { // an explicit physical-register reference to R12
+ UseMI = II;
+ done = true;
+ break;
+ }
+ }
+ }
+ // Restore the register from R12
+ AddDefaultPred(BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr)).
+ addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill));
+
+ return true;
+}
+/// Replace frame-index operand FIOperandNum of *II with a base register plus offset (Thumb1 only; others use the base class).
+void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+ if (!STI.isThumb1Only())
+ return ARMBaseRegisterInfo::eliminateFrameIndex(II, SPAdj, FIOperandNum,
+ RS);
+
+ unsigned VReg = 0; // Only assigned on the store path below.
+ const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ DebugLoc dl = MI.getDebugLoc();
+ MachineInstrBuilder MIB(*MBB.getParent(), &MI); // Used at the end to re-append the default predicate.
+
+ unsigned FrameReg = ARM::SP; // Default base register; replaced below when variable-sized objects exist.
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+ MF.getFrameInfo().getStackSize() + SPAdj;
+
+ if (MF.getFrameInfo().hasVarSizedObjects()) {
+ assert(SPAdj == 0 && STI.getFrameLowering()->hasFP(MF) && "Unexpected");
+ // There are alloca()'s in this function, must reference off the frame
+ // pointer or base pointer instead.
+ if (!hasBasePointer(MF)) {
+ FrameReg = getFrameRegister(MF);
+ Offset -= AFI->getFramePtrSpillOffset();
+ } else
+ FrameReg = BasePtr;
+ }
+
+ // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+ // call frame setup/destroy instructions have already been eliminated. That
+ // means the stack pointer cannot be used to access the emergency spill slot
+ // when !hasReservedCallFrame().
+#ifndef NDEBUG
+ if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
+ assert(STI.getFrameLowering()->hasReservedCallFrame(MF) &&
+ "Cannot use SP to access the emergency spill slot in "
+ "functions without a reserved call frame");
+ assert(!MF.getFrameInfo().hasVarSizedObjects() &&
+ "Cannot use SP to access the emergency spill slot in "
+ "functions with variable sized frame objects");
+ }
+#endif // NDEBUG
+
+ // Special handling of dbg_value instructions: rewrite the operands in place and stop.
+ if (MI.isDebugValue()) {
+ MI.getOperand(FIOperandNum). ChangeToRegister(FrameReg, false /*isDef*/);
+ MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ // Modify MI as necessary to handle as much of 'Offset' as possible
+ assert(AFI->isThumbFunction() &&
+ "This eliminateFrameIndex only supports Thumb1!");
+ if (rewriteFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+ return; // The whole offset was folded into MI.
+
+ // If we get here, the immediate doesn't fit into the instruction. We folded
+ // as much as possible above, handle the rest, providing a register that is
+ // SP+LargeImm.
+ assert(Offset && "This code isn't needed if offset already handled!");
+
+ unsigned Opcode = MI.getOpcode(); // Remember the opcode before setDesc() replaces it below.
+
+ // Remove predicate first.
+ int PIdx = MI.findFirstPredOperandIdx();
+ if (PIdx != -1)
+ removeOperands(MI, PIdx);
+
+ if (MI.mayLoad()) {
+ // Use the destination register to materialize sp + offset.
+ unsigned TmpReg = MI.getOperand(0).getReg();
+ bool UseRR = false; // Set when the [reg, reg] addressing form is needed.
+ if (Opcode == ARM::tLDRspi) {
+ if (FrameReg == ARM::SP || STI.genExecuteOnly())
+ emitThumbRegPlusImmInReg(MBB, II, dl, TmpReg, FrameReg,
+ Offset, false, TII, *this);
+ else {
+ emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset); // TmpReg = Offset; FrameReg is added by the load.
+ UseRR = true;
+ }
+ } else {
+ emitThumbRegPlusImmediate(MBB, II, dl, TmpReg, FrameReg, Offset, TII,
+ *this);
+ }
+
+ MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi));
+ MI.getOperand(FIOperandNum).ChangeToRegister(TmpReg, false, false, true);
+ if (UseRR)
+ // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+ // register. The offset is already handled in the TmpReg value.
+ MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+ false);
+ } else if (MI.mayStore()) {
+ VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); // Stores need a fresh register for the address.
+ bool UseRR = false; // Set when the [reg, reg] addressing form is needed.
+
+ if (Opcode == ARM::tSTRspi) {
+ if (FrameReg == ARM::SP || STI.genExecuteOnly())
+ emitThumbRegPlusImmInReg(MBB, II, dl, VReg, FrameReg,
+ Offset, false, TII, *this);
+ else {
+ emitLoadConstPool(MBB, II, dl, VReg, 0, Offset); // VReg = Offset; FrameReg is added by the store.
+ UseRR = true;
+ }
+ } else
+ emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII,
+ *this);
+ MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi));
+ MI.getOperand(FIOperandNum).ChangeToRegister(VReg, false, false, true);
+ if (UseRR)
+ // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+ // register. The offset is already handled in the vreg value.
+ MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+ false);
+ } else {
+ llvm_unreachable("Unexpected opcode!"); // Only loads and stores are expected to reach this point.
+ }
+
+ // Add predicate back if it's needed.
+ if (MI.isPredicable())
+ AddDefaultPred(MIB);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h
new file mode 100644
index 000000000000..e6b06959e428
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h
@@ -0,0 +1,66 @@
+//===- ThumbRegisterInfo.h - Thumb Register Information Impl -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb implementation of the TargetRegisterInfo
+// class. With the exception of emitLoadConstPool, Thumb2 tracks
+// ARMBaseRegisterInfo; Thumb1 overrides the functions below.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1REGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB1REGISTERINFO_H
+
+#include "ARMBaseRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+ class ARMSubtarget;
+ class ARMBaseInstrInfo;
+/// Register info for Thumb; the Thumb1-specific overrides below defer to ARMBaseRegisterInfo for other subtargets.
+struct ThumbRegisterInfo : public ARMBaseRegisterInfo {
+public:
+ ThumbRegisterInfo();
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const override;
+
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+
+ /// emitLoadConstPool - Emits a load from constpool to materialize the
+ /// specified immediate.
+ void
+ emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ int Val, ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0,
+ unsigned MIFlags = MachineInstr::NoFlags) const override;
+
+ // rewriteFrameIndex - Rewrite MI to access 'Offset' bytes from FrameReg.
+ // Update Offset to be however much remains to be handled. Return 'true'
+ // if no further work is required (i.e. the remaining Offset is zero).
+ bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII) const;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override; // Thumb1: rewrites MI's frame-index operand relative to BaseReg.
+ bool saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const override; // Thumb1: saves Reg in R12 rather than in a stack slot.
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override; // Thumb1: lowers the frame index to base register + offset.
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AVR/AVR.h b/contrib/llvm/lib/Target/AVR/AVR.h
new file mode 100644
index 000000000000..8e5cc5360ad4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVR.h
@@ -0,0 +1,58 @@
+//===-- AVR.h - Top-level interface for AVR representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// AVR back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_H
+#define LLVM_AVR_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+
+class AVRTargetMachine;
+class FunctionPass;
+
+FunctionPass *createAVRISelDag(AVRTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+FunctionPass *createAVRExpandPseudoPass();
+FunctionPass *createAVRFrameAnalyzerPass();
+FunctionPass *createAVRInstrumentFunctionsPass();
+FunctionPass *createAVRRelaxMemPass();
+FunctionPass *createAVRDynAllocaSRPass();
+FunctionPass *createAVRBranchSelectionPass();
+
+void initializeAVRExpandPseudoPass(PassRegistry&);
+void initializeAVRInstrumentFunctionsPass(PassRegistry&);
+void initializeAVRRelaxMemPass(PassRegistry&);
+
+/// Contains the AVR backend.
+namespace AVR {
+/// Address-space numbers: DataMemory is 0 and ProgramMemory is 1.
+enum AddressSpace { DataMemory, ProgramMemory };
+/// Returns true if V, which must have pointer type (cast<> asserts otherwise), points into ProgramMemory.
+template <typename T> bool isProgramMemoryAddress(T *V) {
+ return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory;
+}
+/// Returns true if the IR value behind N's memory operand points into ProgramMemory; false if there is no such value.
+inline bool isProgramMemoryAccess(MemSDNode const *N) {
+ auto V = N->getMemOperand()->getValue();
+
+ return (V != nullptr) ? isProgramMemoryAddress(V) : false;
+}
+
+} // end of namespace AVR
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_H
diff --git a/contrib/llvm/lib/Target/AVR/AVR.td b/contrib/llvm/lib/Target/AVR/AVR.td
new file mode 100644
index 000000000000..d03b983aa70b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVR.td
@@ -0,0 +1,81 @@
+//===-- AVR.td - Describe the AVR Target Machine ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+// This is the top level entry point for the AVR target.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===---------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===---------------------------------------------------------------------===//
+// AVR Device Definitions
+//===---------------------------------------------------------------------===//
+
+include "AVRDevices.td"
+
+//===---------------------------------------------------------------------===//
+// Register File Description
+//===---------------------------------------------------------------------===//
+
+include "AVRRegisterInfo.td"
+
+//===---------------------------------------------------------------------===//
+// Instruction Descriptions
+//===---------------------------------------------------------------------===//
+
+include "AVRInstrInfo.td"
+
+def AVRInstrInfo : InstrInfo;
+
+//===---------------------------------------------------------------------===//
+// Calling Conventions
+//===---------------------------------------------------------------------===//
+
+include "AVRCallingConv.td"
+
+//===---------------------------------------------------------------------===//
+// Assembly Printers
+//===---------------------------------------------------------------------===//
+
+def AVRAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
+}
+
+//===---------------------------------------------------------------------===//
+// Assembly Parsers
+//===---------------------------------------------------------------------===//
+
+def AVRAsmParser : AsmParser {
+ let ShouldEmitMatchRegisterName = 1;
+ let ShouldEmitMatchRegisterAltName = 1;
+}
+
+def AVRAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "$";
+ string TokenizingCharacters = "+";
+}
+
+//===---------------------------------------------------------------------===//
+// Target Declaration
+//===---------------------------------------------------------------------===//
+
+def AVR : Target {
+ let InstructionSet = AVRInstrInfo;
+ let AssemblyWriters = [AVRAsmWriter];
+
+ let AssemblyParsers = [AVRAsmParser];
+ let AssemblyParserVariants = [AVRAsmParserVariant];
+}
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
new file mode 100644
index 000000000000..4afdd3a0ec08
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -0,0 +1,184 @@
+//===-- AVRAsmPrinter.cpp - AVR LLVM assembly writer ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format AVR assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRMCInstLower.h"
+#include "AVRSubtarget.h"
+#include "InstPrinter/AVRInstPrinter.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define DEBUG_TYPE "avr-asm-printer"
+
+namespace llvm {
+
+/// An AVR assembly code printer.
+class AVRAsmPrinter : public AsmPrinter {
+public:
+ AVRAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), MRI(*TM.getMCRegisterInfo()) { }
+
+ StringRef getPassName() const override { return "AVR Assembly Printer"; }
+ /// Prints operand OpNo of MI to O. Modifier is accepted for interface compatibility and is not consulted.
+ void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
+ const char *Modifier = nullptr);
+ /// Inline-asm operand printer; handles the single-letter 'A'..'Z' byte-select codes itself.
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ /// Inline-asm memory operand printer; emits Y or Z, plus '+imm' when a displacement operand follows.
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ /// Lowers MI to an MCInst and emits it to the output streamer.
+ void EmitInstruction(const MachineInstr *MI) override;
+
+private:
+ const MCRegisterInfo &MRI; // Register info obtained from the TargetMachine; used to pretty-print register names.
+};
+/// Print operand OpNo of MI to O according to its operand kind; Modifier is not consulted.
+void AVRAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register: // Registers are printed by their pretty name.
+ O << AVRInstPrinter::getPrettyRegisterName(MO.getReg(), MRI);
+ break;
+ case MachineOperand::MO_Immediate: // Immediates are printed as plain integers.
+ O << MO.getImm();
+ break;
+ case MachineOperand::MO_GlobalAddress: // Globals are printed via their symbol.
+ O << getSymbol(MO.getGlobal());
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ O << *GetExternalSymbolSymbol(MO.getSymbolName());
+ break;
+ case MachineOperand::MO_MachineBasicBlock: // Basic blocks are printed via their label symbol.
+ O << *MO.getMBB()->getSymbol();
+ break;
+ default: // Every other operand kind is unsupported here.
+ llvm_unreachable("Not implemented yet!");
+ }
+}
+/// Print inline-asm operand OpNum of MI to O. Returns true for an unknown modifier, false once the operand is printed.
+bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) {
+ // Default asm printer can only deal with some extra codes,
+ // so try it first. It returns false when it has already printed the operand.
+ bool Error = AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+
+ if (Error && ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0)
+ return true; // Unknown modifier.
+
+ if (ExtraCode[0] >= 'A' && ExtraCode[0] <= 'Z') { // 'A'..'Z' selects byte 0..25 of a multi-byte operand.
+ const MachineOperand &RegOp = MI->getOperand(OpNum);
+
+ assert(RegOp.isReg() && "Operand must be a register when you're "
+ "using 'A'..'Z' operand extracodes.");
+ unsigned Reg = RegOp.getReg();
+
+ unsigned ByteNumber = ExtraCode[0] - 'A';
+
+ unsigned OpFlags = MI->getOperand(OpNum - 1).getImm(); // The flag word precedes the operand's registers.
+ unsigned NumOpRegs = InlineAsm::getNumOperandRegisters(OpFlags);
+ (void)NumOpRegs; // Only referenced by the assert below.
+
+ const AVRSubtarget &STI = MF->getSubtarget<AVRSubtarget>();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+
+ unsigned BytesPerReg = TRI.getMinimalPhysRegClass(Reg)->getSize();
+ assert(BytesPerReg <= 2 && "Only 8 and 16 bit regs are supported.");
+
+ unsigned RegIdx = ByteNumber / BytesPerReg; // Which of the operand's registers holds the byte.
+ assert(RegIdx < NumOpRegs && "Multibyte index out of range.");
+
+ Reg = MI->getOperand(OpNum + RegIdx).getReg();
+
+ if (BytesPerReg == 2) { // For a register pair, narrow to the low or high byte sub-register.
+ Reg = TRI.getSubReg(Reg, ByteNumber % BytesPerReg ? AVR::sub_hi
+ : AVR::sub_lo);
+ }
+
+ O << AVRInstPrinter::getPrettyRegisterName(Reg, MRI);
+ return false;
+ }
+ }
+
+ if (Error) // Fall back to the plain operand only if the default printer emitted nothing; avoids printing it twice.
+ printOperand(MI, OpNum, O);
+ return false;
+}
+/// Print inline-asm memory operand OpNum of MI as Y or Z, followed by '+imm' when a displacement operand is present.
+bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNum, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) { // Modifiers on memory operands are not handled.
+ llvm_unreachable("This branch is not implemented yet");
+ }
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ (void)MO; // Only referenced by the assert below.
+ assert(MO.isReg() && "Unexpected inline asm memory operand");
+
+ // TODO: We can look up the alternative name for the register if it's given.
+ if (MI->getOperand(OpNum).getReg() == AVR::R31R30) {
+ O << "Z";
+ } else {
+ assert(MI->getOperand(OpNum).getReg() == AVR::R29R28 &&
+ "Wrong register class for memory operand.");
+ O << "Y";
+ }
+
+ // If NumOpRegs == 2, then we assume it is product of a FrameIndex expansion
+ // and the second operand is an Imm.
+ unsigned OpFlags = MI->getOperand(OpNum - 1).getImm(); // The flag word precedes the operand's registers.
+ unsigned NumOpRegs = InlineAsm::getNumOperandRegisters(OpFlags);
+
+ if (NumOpRegs == 2) {
+ O << '+' << MI->getOperand(OpNum + 1).getImm();
+ }
+
+ return false; // This function reports no error on any path.
+}
+/// Lower MI to an MCInst with AVRMCInstLower and emit it to the output streamer.
+void AVRAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ AVRMCInstLower MCInstLowering(OutContext, *this);
+
+ MCInst I;
+ MCInstLowering.lowerInstruction(*MI, I); // Fills I from MI.
+ EmitToStreamer(*OutStreamer, I);
+}
+
+} // end of namespace llvm
+/// C-linkage entry point that registers AVRAsmPrinter as the assembly printer for the AVR target.
+extern "C" void LLVMInitializeAVRAsmPrinter() {
+ llvm::RegisterAsmPrinter<llvm::AVRAsmPrinter> X(llvm::getTheAVRTarget());
+}
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRCallingConv.td b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td
new file mode 100644
index 000000000000..68dbce02706f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td
@@ -0,0 +1,58 @@
+//===-- AVRCallingConv.td - Calling Conventions for AVR ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for AVR architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AVR Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+def RetCC_AVR : CallingConv
+<[
+ // i8 is returned in R24.
+ CCIfType<[i8], CCAssignToReg<[R24]>>,
+
+ // i16 are returned in R25:R24, R23:R22, R21:R20 and R19:R18.
+ CCIfType<[i16], CCAssignToReg<[R25R24, R23R22, R21R20, R19R18]>>
+]>;
+
+// Special return value calling convention for runtime functions.
+def RetCC_AVR_BUILTIN : CallingConv
+<[
+ CCIfType<[i8], CCAssignToReg<[R24,R25]>>,
+ CCIfType<[i16], CCAssignToReg<[R23R22, R25R24]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// AVR Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// The calling conventions are implemented in custom C++ code
+
+// Calling convention for variadic functions.
+def ArgCC_AVR_Vararg : CallingConv
+<[
+ // i16 are always passed through the stack with an alignment of 1.
+ CCAssignToStack<2, 1>
+]>;
+
+// Special argument calling convention for
+// division runtime functions.
+def ArgCC_AVR_BUILTIN_DIV : CallingConv
+<[
+ CCIfType<[i8], CCAssignToReg<[R24,R22]>>,
+ CCIfType<[i16], CCAssignToReg<[R25R24, R23R22]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>;
+def CSR_Interrupts : CalleeSavedRegs<(add (sequence "R%u", 31, 0))>;
diff --git a/contrib/llvm/lib/Target/AVR/AVRDevices.td b/contrib/llvm/lib/Target/AVR/AVRDevices.td
new file mode 100644
index 000000000000..9224af613d14
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRDevices.td
@@ -0,0 +1,491 @@
+//===---------------------------------------------------------------------===//
+// AVR Device Definitions
+//===---------------------------------------------------------------------===//
+
+// :TODO: Implement the skip errata, see `gcc/config/avr/avr-arch.h` for details
+// :TODO: We define all devices with SRAM to have all variants of LD/ST/LDD/STD.
+// In reality, avr1 (no SRAM) has one variant each of `LD` and `ST`.
+// avr2 (with SRAM) adds the rest of the variants.
+// :TODO: s/AVRTiny/Tiny
+
+
+// A feature set aggregates features, grouping them. We don't want to create a
+// new member in AVRSubtarget (to store a value) for each set because we do not
+// care if the set is supported, only the subfeatures inside the set. We fix
+// this by simply setting the same dummy member for all feature sets, which is
+// then ignored.
+class FeatureSet<string name, string desc, list<SubtargetFeature> i>
+ : SubtargetFeature<name, "m_FeatureSetDummy", "true", desc, i>;
+
+// A family of microcontrollers, defining a set of supported features.
+class Family<string name, list<SubtargetFeature> i>
+ : FeatureSet<name, !strconcat("The device is a part of the ",
+ name, " family"), i>;
+
+// The device has SRAM, and supports the bare minimum of
+// SRAM-relevant instructions.
+//
+// These are:
+// LD - all 9 variants
+// ST - all 9 variants
+// LDD - two variants for Y and Z
+// STD - two variants for Y and Z
+// `LDS Rd, K`
+// `STS k, Rr`
+// `PUSH`/`POP`
+def FeatureSRAM : SubtargetFeature<"sram", "m_hasSRAM", "true",
+ "The device has random access memory">;
+
+// The device supports the `JMP k` and `CALL k` instructions.
+def FeatureJMPCALL : SubtargetFeature<"jmpcall", "m_hasJMPCALL", "true",
+ "The device supports the `JMP` and "
+ "`CALL` instructions">;
+
+
+// The device supports the indirect branches `IJMP` and `ICALL`.
+def FeatureIJMPCALL : SubtargetFeature<"ijmpcall", "m_hasIJMPCALL",
+ "true",
+ "The device supports `IJMP`/`ICALL` "
+ "instructions">;
+
+// The device supports the extended indirect branches `EIJMP` and `EICALL`.
+def FeatureEIJMPCALL : SubtargetFeature<"eijmpcall", "m_hasEIJMPCALL",
+ "true", "The device supports the "
+ "`EIJMP`/`EICALL` instructions">;
+
+// The device supports the 16-bit `ADIW Rd+1:Rd, K` and `SBIW Rd+1:Rd, K`.
+def FeatureADDSUBIW : SubtargetFeature<"addsubiw", "m_hasADDSUBIW",
+ "true", "Enable 16-bit register-immediate "
+ "addition and subtraction instructions">;
+
+// The device has an 8-bit stack pointer (SP) register.
+def FeatureSmallStack : SubtargetFeature<"smallstack", "m_hasSmallStack",
+ "true", "The device has an 8-bit "
+ "stack pointer">;
+
+// The device supports the 16-bit GPR pair MOVW instruction.
+def FeatureMOVW : SubtargetFeature<"movw", "m_hasMOVW", "true",
+ "The device supports the 16-bit MOVW "
+ "instruction">;
+
+// The device supports the `LPM` instruction, with implied destination being r0.
+def FeatureLPM : SubtargetFeature<"lpm", "m_hasLPM", "true",
+ "The device supports the `LPM` instruction">;
+
+// The device supports the `LPM Rd, Z[+]` instruction.
+def FeatureLPMX : SubtargetFeature<"lpmx", "m_hasLPMX", "true",
+ "The device supports the `LPM Rd, Z[+]` "
+ "instruction">;
+
+// The device supports the `ELPM` instruction.
+def FeatureELPM : SubtargetFeature<"elpm", "m_hasELPM", "true",
+ "The device supports the ELPM instruction">;
+
+// The device supports the `ELPM Rd, Z[+]` instructions.
+def FeatureELPMX : SubtargetFeature<"elpmx", "m_hasELPMX", "true",
+ "The device supports the `ELPM Rd, Z[+]` "
+ "instructions">;
+
+// The device supports the `SPM` instruction.
+def FeatureSPM : SubtargetFeature<"spm", "m_hasSPM", "true",
+ "The device supports the `SPM` instruction">;
+
+// The device supports the `SPM Z+` instruction.
+def FeatureSPMX : SubtargetFeature<"spmx", "m_hasSPMX", "true",
+ "The device supports the `SPM Z+` "
+ "instruction">;
+
+// The device supports the `DES k` instruction.
+def FeatureDES : SubtargetFeature<"des", "m_hasDES", "true",
+ "The device supports the `DES k` encryption "
+ "instruction">;
+
+// The device supports the Read-Write-Modify instructions
+// XCH, LAS, LAC, and LAT.
+def FeatureRMW : SubtargetFeature<"rmw", "m_supportsRMW", "true",
+ "The device supports the read-write-modify "
+ "instructions: XCH, LAS, LAC, LAT">;
+
+// The device supports the `[F]MUL[S][U]` family of instructions.
+def FeatureMultiplication : SubtargetFeature<"mul", "m_supportsMultiplication",
+ "true", "The device supports the "
+ "multiplication instructions">;
+
+// The device supports the `BREAK` instruction.
+def FeatureBREAK : SubtargetFeature<"break", "m_hasBREAK", "true",
+ "The device supports the `BREAK` debugging "
+ "instruction">;
+
+// The device has instruction encodings specific to the Tiny core.
+def FeatureTinyEncoding : SubtargetFeature<"tinyencoding",
+ "m_hasTinyEncoding", "true",
+ "The device has Tiny core specific "
+ "instruction encodings">;
+
+class ELFArch<string name> : SubtargetFeature<"", "ELFArch",
+ !strconcat("ELF::",name), "">;
+
+// ELF e_flags architecture values
+def ELFArchAVR1 : ELFArch<"EF_AVR_ARCH_AVR1">;
+def ELFArchAVR2 : ELFArch<"EF_AVR_ARCH_AVR2">;
+def ELFArchAVR25 : ELFArch<"EF_AVR_ARCH_AVR25">;
+def ELFArchAVR3 : ELFArch<"EF_AVR_ARCH_AVR3">;
+def ELFArchAVR31 : ELFArch<"EF_AVR_ARCH_AVR31">;
+def ELFArchAVR35 : ELFArch<"EF_AVR_ARCH_AVR35">;
+def ELFArchAVR4 : ELFArch<"EF_AVR_ARCH_AVR4">;
+def ELFArchAVR5 : ELFArch<"EF_AVR_ARCH_AVR5">;
+def ELFArchAVR51 : ELFArch<"EF_AVR_ARCH_AVR51">;
+def ELFArchAVR6 : ELFArch<"EF_AVR_ARCH_AVR6">;
+def ELFArchAVRTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">;
+def ELFArchXMEGA1 : ELFArch<"EF_AVR_ARCH_XMEGA1">;
+def ELFArchXMEGA2 : ELFArch<"EF_AVR_ARCH_XMEGA2">;
+def ELFArchXMEGA3 : ELFArch<"EF_AVR_ARCH_XMEGA3">;
+def ELFArchXMEGA4 : ELFArch<"EF_AVR_ARCH_XMEGA4">;
+def ELFArchXMEGA5 : ELFArch<"EF_AVR_ARCH_XMEGA5">;
+def ELFArchXMEGA6 : ELFArch<"EF_AVR_ARCH_XMEGA6">;
+def ELFArchXMEGA7 : ELFArch<"EF_AVR_ARCH_XMEGA7">;
+
+//===---------------------------------------------------------------------===//
+// AVR Families
+//===---------------------------------------------------------------------===//
+
+// The device has at least the bare minimum that **every** single AVR
+// device should have.
+def FamilyAVR0 : Family<"avr0", []>;
+
+def FamilyAVR1 : Family<"avr1", [FamilyAVR0, FeatureLPM]>;
+
+def FamilyAVR2 : Family<"avr2",
+ [FamilyAVR1, FeatureIJMPCALL, FeatureADDSUBIW,
+ FeatureSRAM]>;
+
+def FamilyAVR25 : Family<"avr25",
+ [FamilyAVR2, FeatureMOVW, FeatureLPMX,
+ FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR3 : Family<"avr3",
+ [FamilyAVR2, FeatureJMPCALL]>;
+
+def FamilyAVR31 : Family<"avr31",
+ [FamilyAVR3, FeatureELPM]>;
+
+def FamilyAVR35 : Family<"avr35",
+ [FamilyAVR3, FeatureMOVW, FeatureLPMX,
+ FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR4 : Family<"avr4",
+ [FamilyAVR2, FeatureMultiplication,
+ FeatureMOVW, FeatureLPMX, FeatureSPM,
+ FeatureBREAK]>;
+
+def FamilyAVR5 : Family<"avr5",
+ [FamilyAVR3, FeatureMultiplication,
+ FeatureMOVW, FeatureLPMX, FeatureSPM,
+ FeatureBREAK]>;
+
+def FamilyAVR51 : Family<"avr51",
+ [FamilyAVR5, FeatureELPM, FeatureELPMX]>;
+
+def FamilyAVR6 : Family<"avr6",
+ [FamilyAVR51]>;
+
+def FamilyAVRTiny : Family<"avrtiny",
+ [FamilyAVR0, FeatureBREAK, FeatureSRAM,
+ FeatureTinyEncoding]>;
+
+def FamilyXMEGA : Family<"xmega",
+ [FamilyAVR51, FeatureEIJMPCALL, FeatureSPMX,
+ FeatureDES]>;
+
+def FamilyXMEGAU : Family<"xmegau",
+ [FamilyXMEGA, FeatureRMW]>;
+
+def FeatureSetSpecial : FeatureSet<"special",
+ "Enable use of the entire instruction "
+ "set - used for debugging",
+ [FeatureSRAM, FeatureJMPCALL,
+ FeatureIJMPCALL, FeatureEIJMPCALL,
+ FeatureADDSUBIW, FeatureMOVW,
+ FeatureLPM, FeatureLPMX, FeatureELPM,
+ FeatureELPMX, FeatureSPM, FeatureSPMX,
+ FeatureDES, FeatureRMW,
+ FeatureMultiplication, FeatureBREAK]>;
+
+//===---------------------------------------------------------------------===//
+// AVR microcontrollers supported.
+//===---------------------------------------------------------------------===//
+
+class Device<string Name, Family Fam, ELFArch Arch,
+ list<SubtargetFeature> ExtraFeatures = []>
+ : Processor<Name, NoItineraries, !listconcat([Fam,Arch],ExtraFeatures)>;
+
+// Generic MCUs
+// Note that several versions of GCC have strange ELF architecture
+// settings for backwards compatibility - see `gas/config/tc-avr.c`
+// in AVR binutils. We do not replicate this.
+def : Device<"avr1", FamilyAVR1, ELFArchAVR1>;
+def : Device<"avr2", FamilyAVR2, ELFArchAVR2>;
+def : Device<"avr25", FamilyAVR25, ELFArchAVR25>;
+def : Device<"avr3", FamilyAVR3, ELFArchAVR3>;
+def : Device<"avr31", FamilyAVR31, ELFArchAVR31>;
+def : Device<"avr35", FamilyAVR35, ELFArchAVR35>;
+def : Device<"avr4", FamilyAVR4, ELFArchAVR4>;
+def : Device<"avr5", FamilyAVR5, ELFArchAVR5>;
+def : Device<"avr51", FamilyAVR51, ELFArchAVR51>;
+def : Device<"avr6", FamilyAVR6, ELFArchAVR6>;
+def : Device<"avrxmega1", FamilyXMEGA, ELFArchXMEGA1>; // avrxmega1-7 share FamilyXMEGA
+def : Device<"avrxmega2", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"avrxmega3", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>;
+def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>;
+def : Device<"avrtiny", FamilyAVRTiny, ELFArchAVRTiny>;
+
+// Specific MCUs: device name, feature family, ELF arch, optional extra features.
+def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>;
+def : Device<"attiny11", FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny12", FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny15", FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny28", FamilyAVR1, ELFArchAVR1>;
+def : Device<"at90s2313", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2323", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2333", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2343", FamilyAVR2, ELFArchAVR2>;
+def : Device<"attiny22", FamilyAVR2, ELFArchAVR2>;
+def : Device<"attiny26", FamilyAVR2, ELFArchAVR2, [FeatureLPMX]>;
+def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25,
+ [FeatureMOVW, FeatureLPMX]>;
+def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>;
+def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny13", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny2313a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny24", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny25", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny261", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny43u", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny48", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny88", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny828", FamilyAVR25, ELFArchAVR25>;
+def : Device<"at43usb355", FamilyAVR3, ELFArchAVR3>;
+def : Device<"at76c711", FamilyAVR3, ELFArchAVR3>;
+def : Device<"atmega103", FamilyAVR31, ELFArchAVR31>;
+def : Device<"at43usb320", FamilyAVR31, ELFArchAVR31>;
+def : Device<"attiny167", FamilyAVR35, ELFArchAVR35>;
+def : Device<"at90usb82", FamilyAVR35, ELFArchAVR35>;
+def : Device<"at90usb162", FamilyAVR35, ELFArchAVR35>;
+def : Device<"ata5505", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega8u2", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>;
+def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega8", FamilyAVR4, ELFArchAVR4>; // FIXME: family may be wrong
+def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega8a", FamilyAVR4, ELFArchAVR4>;
+def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>;
+def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48a", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48pa", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48p", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88a", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega8hva", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm1", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm2", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm2b", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm3", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm3b", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm81", FamilyAVR4, ELFArchAVR4>;
+def : Device<"ata5790", FamilyAVR5, ELFArchAVR5>;
+def : Device<"ata5795", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega161", FamilyAVR3, ELFArchAVR5,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega162", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega163", FamilyAVR3, ELFArchAVR5,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega164a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega164p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega164pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega323", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega328", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega328p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega406", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega640", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega645", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega645a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega645p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega649", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega649a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega649p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6450", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6450a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6450p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6490", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6490a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6490p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64rfr2", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644rfr2", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hva", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hva2", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hvb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hvbrevb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32hvb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32hvbrevb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64hve", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90can32", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90can64", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90pwm161", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90pwm216", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90pwm316", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32c1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64c1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16m1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32m1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64m1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16u4", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32u4", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32u6", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90usb646", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90usb647", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90scr100", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at94k", FamilyAVR3, ELFArchAVR5,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX]>;
+def : Device<"m3000", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega128", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128a", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1280", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1281", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284p", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128rfa1", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128rfr2", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284rfr2", FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90can128", FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90usb1286", FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90usb1287", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega2560", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atmega2561", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atmega256rfr2", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atmega2564rfr2", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atxmega16a4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega16a4u", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16c4", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16d4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32a4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32a4u", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega32c4", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega32d4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32e5", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega16e5", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega8e5", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32x1", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega64a3", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64a3u", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64a4u", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64b1", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64b3", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64c3", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64d3", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64d4", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64a1", FamilyXMEGA, ELFArchXMEGA5>;
+def : Device<"atxmega64a1u", FamilyXMEGAU, ELFArchXMEGA5>;
+def : Device<"atxmega128a3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128a3u", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128b1", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128b3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128d4", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega192a3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega192a3u", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega192c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega192d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3u", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256a3b", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3bu", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega384c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega384d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128a1", FamilyXMEGA, ELFArchXMEGA7>;
+def : Device<"atxmega128a1u", FamilyXMEGAU, ELFArchXMEGA7>;
+def : Device<"atxmega128a4u", FamilyXMEGAU, ELFArchXMEGA7>;
+def : Device<"attiny4", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny5", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny9", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny10", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny20", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny40", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny102", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny104", FamilyAVRTiny, ELFArchAVRTiny>;
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
new file mode 100644
index 000000000000..1b2f2cec0bca
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -0,0 +1,1515 @@
+//===-- AVRExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions. This pass should be run after register allocation but before
+// the post-regalloc scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRInstrInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define AVR_EXPAND_PSEUDO_NAME "AVR pseudo instruction expansion pass"
+
+namespace {
+
+/// Machine-function pass that replaces the "placeholder" pseudo
+/// instructions with the actual AVR instructions they stand for.
+class AVRExpandPseudo : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AVRExpandPseudo() : MachineFunctionPass(ID) {
+    initializeAVRExpandPseudoPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Expands the pseudos in every block of \p MF.
+  /// \returns true if any block was modified.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return AVR_EXPAND_PSEUDO_NAME; }
+
+private:
+  using Block = MachineBasicBlock;
+  using BlockIt = Block::iterator;
+
+  const AVRRegisterInfo *TRI;
+  const TargetInstrInfo *TII;
+
+  /// Register used whenever temporary storage is needed.
+  const unsigned SCRATCH_REGISTER = AVR::R0;
+  /// IO address of the status register.
+  const unsigned SREG_ADDR = 0x3f;
+
+  bool expandMBB(Block &MBB);
+  bool expandMI(Block &MBB, BlockIt MBBI);
+  template <unsigned OP> bool expand(Block &MBB, BlockIt MBBI);
+
+  /// Starts building an instruction with opcode \p Opcode at position
+  /// \p MBBI, reusing the debug location of the instruction at \p MBBI.
+  MachineInstrBuilder buildMI(Block &MBB, BlockIt MBBI, unsigned Opcode) {
+    const DebugLoc &DL = MBBI->getDebugLoc();
+    return BuildMI(MBB, MBBI, DL, TII->get(Opcode));
+  }
+
+  /// Same as above, but additionally passes \p DstReg to BuildMI as the
+  /// destination register.
+  MachineInstrBuilder buildMI(Block &MBB, BlockIt MBBI, unsigned Opcode,
+                              unsigned DstReg) {
+    const DebugLoc &DL = MBBI->getDebugLoc();
+    return BuildMI(MBB, MBBI, DL, TII->get(Opcode), DstReg);
+  }
+
+  /// Register info of the function that owns \p MBB.
+  MachineRegisterInfo &getRegInfo(Block &MBB) {
+    MachineFunction *Parent = MBB.getParent();
+    return Parent->getRegInfo();
+  }
+
+  bool expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI);
+  bool expandLogic(unsigned Op, Block &MBB, BlockIt MBBI);
+  bool expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI);
+  bool isLogicImmOpRedundant(unsigned Op, unsigned ImmVal) const;
+
+  template <typename Func>
+  bool expandAtomic(Block &MBB, BlockIt MBBI, Func f);
+
+  template <typename Func>
+  bool expandAtomicBinaryOp(unsigned Opcode, Block &MBB, BlockIt MBBI, Func f);
+
+  bool expandAtomicBinaryOp(unsigned Opcode, Block &MBB, BlockIt MBBI);
+
+  bool expandAtomicArithmeticOp(unsigned MemOpcode, unsigned ArithOpcode,
+                                Block &MBB, BlockIt MBBI);
+};
+
+char AVRExpandPseudo::ID = 0; // Pass ID; the constructor hands it to MachineFunctionPass.
+
+/// Runs expandMI over every instruction of \p MBB once.
+/// \returns true if any call to expandMI reported a change.
+bool AVRExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  const BlockIt End = MBB.end();
+
+  for (BlockIt Cur = MBB.begin(); Cur != End;) {
+    // Grab the successor first: expanding may erase the instruction at Cur.
+    const BlockIt Following = std::next(Cur);
+    Changed |= expandMI(MBB, Cur);
+    Cur = Following;
+  }
+
+  return Changed;
+}
+
+/// Pass entry point: caches the subtarget's register and instruction info,
+/// then re-runs expandMBB on each block until a sweep reports no change.
+/// \returns true if anything in \p MF was modified.
+bool AVRExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+  TRI = STI.getRegisterInfo();
+  TII = STI.getInstrInfo();
+
+  // We need to track liveness in order to use register scavenging.
+  MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+
+  bool Changed = false;
+
+  for (Block &MBB : MF) {
+    unsigned Sweeps = 0;
+
+    // An expansion may itself produce pseudos, so keep sweeping the block
+    // until a whole sweep leaves it untouched.
+    for (;;) {
+      assert(Sweeps < 10 && "pseudo expand limit reached");
+      ++Sweeps;
+
+      if (!expandMBB(MBB))
+        break;
+
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+/// Replaces a register-pair arithmetic pseudo with two instructions:
+/// \p OpLo on the low halves followed by \p OpHi on the high halves, where
+/// the halves come from AVRRegisterInfo::splitReg. The pseudo at \p MBBI is
+/// erased. Always returns true.
+bool AVRExpandPseudo::
+expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  // Capture everything we need from the pseudo before emitting anything.
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned SrcPair = Pseudo.getOperand(2).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+  const unsigned DstUseFlags = getKillRegState(Pseudo.getOperand(1).isKill());
+  const unsigned SrcUseFlags = getKillRegState(Pseudo.getOperand(2).isKill());
+  const bool ImplicitDefIsDead = Pseudo.getOperand(3).isDead();
+
+  unsigned SrcLo, SrcHi, DstLo, DstHi;
+  TRI->splitReg(SrcPair, SrcLo, SrcHi);
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  // Low half.
+  buildMI(MBB, MBBI, OpLo)
+      .addReg(DstLo, DefFlags)
+      .addReg(DstLo, DstUseFlags)
+      .addReg(SrcLo, SrcUseFlags);
+
+  // High half.
+  auto HiInstr = buildMI(MBB, MBBI, OpHi)
+                     .addReg(DstHi, DefFlags)
+                     .addReg(DstHi, DstUseFlags)
+                     .addReg(SrcHi, SrcUseFlags);
+
+  // Carry the pseudo's dead flag over to the high instruction.
+  if (ImplicitDefIsDead)
+    HiInstr->getOperand(3).setIsDead();
+
+  // SREG is always implicitly killed
+  HiInstr->getOperand(4).setIsKill();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Replaces a register-pair logic pseudo with two \p Op instructions, one on
+/// the low halves and one on the high halves (as given by splitReg). The
+/// pseudo at \p MBBI is erased. Always returns true.
+bool AVRExpandPseudo::
+expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  // Capture everything we need from the pseudo before emitting anything.
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned SrcPair = Pseudo.getOperand(2).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+  const unsigned DstUseFlags = getKillRegState(Pseudo.getOperand(1).isKill());
+  const unsigned SrcUseFlags = getKillRegState(Pseudo.getOperand(2).isKill());
+  const bool ImplicitDefIsDead = Pseudo.getOperand(3).isDead();
+
+  unsigned SrcLo, SrcHi, DstLo, DstHi;
+  TRI->splitReg(SrcPair, SrcLo, SrcHi);
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  auto LoInstr = buildMI(MBB, MBBI, Op)
+                     .addReg(DstLo, DefFlags)
+                     .addReg(DstLo, DstUseFlags)
+                     .addReg(SrcLo, SrcUseFlags);
+
+  // SREG is always implicitly dead
+  LoInstr->getOperand(3).setIsDead();
+
+  auto HiInstr = buildMI(MBB, MBBI, Op)
+                     .addReg(DstHi, DefFlags)
+                     .addReg(DstHi, DstUseFlags)
+                     .addReg(SrcHi, SrcUseFlags);
+
+  // Only the high instruction inherits the pseudo's dead flag.
+  if (ImplicitDefIsDead)
+    HiInstr->getOperand(3).setIsDead();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Reports whether applying \p Op with immediate \p ImmVal would leave the
+/// register unchanged, so that the instruction need not be emitted.
+bool AVRExpandPseudo::
+  isLogicImmOpRedundant(unsigned Op, unsigned ImmVal) const {
+  switch (Op) {
+  case AVR::ANDIRdK:
+    // ANDI Rd, 0xff is redundant.
+    return ImmVal == 0xff;
+  case AVR::ORIRdK:
+    // ORI Rd, 0x0 is redundant.
+    return ImmVal == 0x0;
+  default:
+    return false;
+  }
+}
+
+/// Replaces a register-pair logic-with-immediate pseudo with up to two \p Op
+/// instructions: bits 0-7 of the immediate go to the low half and bits 8-15
+/// to the high half. A half whose operation isLogicImmOpRedundant reports as
+/// redundant is not emitted. The pseudo at \p MBBI is erased. Always returns
+/// true.
+bool AVRExpandPseudo::
+expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+  const unsigned UseFlags = getKillRegState(Pseudo.getOperand(1).isKill());
+  const bool ImplicitDefIsDead = Pseudo.getOperand(3).isDead();
+
+  const unsigned Imm = Pseudo.getOperand(2).getImm();
+  const unsigned LoByte = Imm & 0xff;
+  const unsigned HiByte = (Imm >> 8) & 0xff;
+
+  unsigned DstLo, DstHi;
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  if (!isLogicImmOpRedundant(Op, LoByte)) {
+    auto LoInstr = buildMI(MBB, MBBI, Op)
+                       .addReg(DstLo, DefFlags)
+                       .addReg(DstLo, UseFlags)
+                       .addImm(LoByte);
+
+    // SREG is always implicitly dead
+    LoInstr->getOperand(3).setIsDead();
+  }
+
+  if (!isLogicImmOpRedundant(Op, HiByte)) {
+    auto HiInstr = buildMI(MBB, MBBI, Op)
+                       .addReg(DstHi, DefFlags)
+                       .addReg(DstHi, UseFlags)
+                       .addImm(HiByte);
+
+    // Only the high instruction inherits the pseudo's dead flag.
+    if (ImplicitDefIsDead)
+      HiInstr->getOperand(3).setIsDead();
+  }
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// ADDWRdRr: ADDRdRr on the low half, ADCRdRr on the high half.
+template <>
+bool AVRExpandPseudo::expand<AVR::ADDWRdRr>(Block &MBB, BlockIt MBBI) {
+  const unsigned LoOpcode = AVR::ADDRdRr;
+  const unsigned HiOpcode = AVR::ADCRdRr;
+  return expandArith(LoOpcode, HiOpcode, MBB, MBBI);
+}
+
+/// ADCWRdRr: ADCRdRr on both the low and the high half.
+template <>
+bool AVRExpandPseudo::expand<AVR::ADCWRdRr>(Block &MBB, BlockIt MBBI) {
+  const unsigned HalfOpcode = AVR::ADCRdRr;
+  return expandArith(HalfOpcode, HalfOpcode, MBB, MBBI);
+}
+
+/// SUBWRdRr: SUBRdRr on the low half, SBCRdRr on the high half.
+template <>
+bool AVRExpandPseudo::expand<AVR::SUBWRdRr>(Block &MBB, BlockIt MBBI) {
+  const unsigned LoOpcode = AVR::SUBRdRr;
+  const unsigned HiOpcode = AVR::SBCRdRr;
+  return expandArith(LoOpcode, HiOpcode, MBB, MBBI);
+}
+
+/// SUBIWRdK: SUBIRdK on the low half followed by SBCIRdK on the high half.
+/// The third operand may be an immediate (split into bits 0-7 and 8-15) or a
+/// global address (emitted twice, tagged MO_NEG plus MO_LO / MO_HI).
+template <>
+bool AVRExpandPseudo::expand<AVR::SUBIWRdK>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+  const unsigned UseFlags = getKillRegState(Pseudo.getOperand(1).isKill());
+  const bool ImplicitDefIsDead = Pseudo.getOperand(3).isDead();
+
+  unsigned DstLo, DstHi;
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  // Emit both instructions first; the value operand is appended below.
+  auto LoInstr = buildMI(MBB, MBBI, AVR::SUBIRdK)
+                     .addReg(DstLo, DefFlags)
+                     .addReg(DstLo, UseFlags);
+
+  auto HiInstr = buildMI(MBB, MBBI, AVR::SBCIRdK)
+                     .addReg(DstHi, DefFlags)
+                     .addReg(DstHi, UseFlags);
+
+  const MachineOperand &Value = Pseudo.getOperand(2);
+  if (Value.isGlobal()) {
+    const GlobalValue *GV = Value.getGlobal();
+    const int64_t Offset = Value.getOffset();
+    const unsigned Flags = Value.getTargetFlags();
+    LoInstr.addGlobalAddress(GV, Offset, Flags | AVRII::MO_NEG | AVRII::MO_LO);
+    HiInstr.addGlobalAddress(GV, Offset, Flags | AVRII::MO_NEG | AVRII::MO_HI);
+  } else if (Value.isImm()) {
+    const unsigned Imm = Value.getImm();
+    LoInstr.addImm(Imm & 0xff);
+    HiInstr.addImm((Imm >> 8) & 0xff);
+  } else {
+    llvm_unreachable("Unknown operand type!");
+  }
+
+  if (ImplicitDefIsDead)
+    HiInstr->getOperand(3).setIsDead();
+
+  // SREG is always implicitly killed
+  HiInstr->getOperand(4).setIsKill();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// SBCWRdRr: SBCRdRr on both the low and the high half.
+template <>
+bool AVRExpandPseudo::expand<AVR::SBCWRdRr>(Block &MBB, BlockIt MBBI) {
+  const unsigned HalfOpcode = AVR::SBCRdRr;
+  return expandArith(HalfOpcode, HalfOpcode, MBB, MBBI);
+}
+
+/// SBCIWRdK: SBCIRdK on the low half with bits 0-7 of the immediate, then
+/// SBCIRdK on the high half with bits 8-15.
+template <>
+bool AVRExpandPseudo::expand<AVR::SBCIWRdK>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+  const unsigned UseFlags = getKillRegState(Pseudo.getOperand(1).isKill());
+  const bool ImplicitDefIsDead = Pseudo.getOperand(3).isDead();
+
+  const unsigned Imm = Pseudo.getOperand(2).getImm();
+  const unsigned LoByte = Imm & 0xff;
+  const unsigned HiByte = (Imm >> 8) & 0xff;
+
+  unsigned DstLo, DstHi;
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  auto LoInstr = buildMI(MBB, MBBI, AVR::SBCIRdK)
+                     .addReg(DstLo, DefFlags)
+                     .addReg(DstLo, UseFlags)
+                     .addImm(LoByte);
+
+  // SREG is always implicitly killed
+  LoInstr->getOperand(4).setIsKill();
+
+  auto HiInstr = buildMI(MBB, MBBI, AVR::SBCIRdK)
+                     .addReg(DstHi, DefFlags)
+                     .addReg(DstHi, UseFlags)
+                     .addImm(HiByte);
+
+  if (ImplicitDefIsDead)
+    HiInstr->getOperand(3).setIsDead();
+
+  // SREG is always implicitly killed
+  HiInstr->getOperand(4).setIsKill();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// ANDWRdRr: ANDRdRr on each half.
+template <>
+bool AVRExpandPseudo::expand<AVR::ANDWRdRr>(Block &MBB, BlockIt MBBI) {
+  const unsigned HalfOpcode = AVR::ANDRdRr;
+  return expandLogic(HalfOpcode, MBB, MBBI);
+}
+
+/// ANDIWRdK: ANDIRdK on each half (redundant halves are skipped).
+template <>
+bool AVRExpandPseudo::expand<AVR::ANDIWRdK>(Block &MBB, BlockIt MBBI) {
+  const unsigned HalfOpcode = AVR::ANDIRdK;
+  return expandLogicImm(HalfOpcode, MBB, MBBI);
+}
+
+/// ORWRdRr: ORRdRr on each half.
+template <>
+bool AVRExpandPseudo::expand<AVR::ORWRdRr>(Block &MBB, BlockIt MBBI) {
+  const unsigned HalfOpcode = AVR::ORRdRr;
+  return expandLogic(HalfOpcode, MBB, MBBI);
+}
+
+/// ORIWRdK: ORIRdK on each half (redundant halves are skipped).
+template <>
+bool AVRExpandPseudo::expand<AVR::ORIWRdK>(Block &MBB, BlockIt MBBI) {
+  const unsigned HalfOpcode = AVR::ORIRdK;
+  return expandLogicImm(HalfOpcode, MBB, MBBI);
+}
+
+/// EORWRdRr: EORRdRr on each half.
+template <>
+bool AVRExpandPseudo::expand<AVR::EORWRdRr>(Block &MBB, BlockIt MBBI) {
+  const unsigned HalfOpcode = AVR::EORRdRr;
+  return expandLogic(HalfOpcode, MBB, MBBI);
+}
+
+/// COMWRd: COMRd on the low half followed by COMRd on the high half.
+template <>
+bool AVRExpandPseudo::expand<AVR::COMWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+  const unsigned UseFlags = getKillRegState(Pseudo.getOperand(1).isKill());
+  const bool ImplicitDefIsDead = Pseudo.getOperand(2).isDead();
+
+  unsigned DstLo, DstHi;
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  auto LoInstr = buildMI(MBB, MBBI, AVR::COMRd)
+                     .addReg(DstLo, DefFlags)
+                     .addReg(DstLo, UseFlags);
+
+  // SREG is always implicitly dead
+  LoInstr->getOperand(2).setIsDead();
+
+  auto HiInstr = buildMI(MBB, MBBI, AVR::COMRd)
+                     .addReg(DstHi, DefFlags)
+                     .addReg(DstHi, UseFlags);
+
+  // Only the high instruction inherits the pseudo's dead flag.
+  if (ImplicitDefIsDead)
+    HiInstr->getOperand(2).setIsDead();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// CPWRdRr: CPRdRr on the low halves followed by CPCRdRr on the high halves.
+template <>
+bool AVRExpandPseudo::expand<AVR::CPWRdRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned LhsPair = Pseudo.getOperand(0).getReg();
+  const unsigned RhsPair = Pseudo.getOperand(1).getReg();
+  const unsigned LhsFlags = getKillRegState(Pseudo.getOperand(0).isKill());
+  const unsigned RhsFlags = getKillRegState(Pseudo.getOperand(1).isKill());
+  const bool ImplicitDefIsDead = Pseudo.getOperand(2).isDead();
+
+  unsigned RhsLo, RhsHi, LhsLo, LhsHi;
+  TRI->splitReg(RhsPair, RhsLo, RhsHi);
+  TRI->splitReg(LhsPair, LhsLo, LhsHi);
+
+  // Low part
+  buildMI(MBB, MBBI, AVR::CPRdRr)
+      .addReg(LhsLo, LhsFlags)
+      .addReg(RhsLo, RhsFlags);
+
+  // High part
+  auto HiInstr = buildMI(MBB, MBBI, AVR::CPCRdRr)
+                     .addReg(LhsHi, LhsFlags)
+                     .addReg(RhsHi, RhsFlags);
+
+  if (ImplicitDefIsDead)
+    HiInstr->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  HiInstr->getOperand(3).setIsKill();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// CPCWRdRr: CPCRdRr on the low halves followed by CPCRdRr on the high halves.
+template <>
+bool AVRExpandPseudo::expand<AVR::CPCWRdRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned LhsPair = Pseudo.getOperand(0).getReg();
+  const unsigned RhsPair = Pseudo.getOperand(1).getReg();
+  const unsigned LhsFlags = getKillRegState(Pseudo.getOperand(0).isKill());
+  const unsigned RhsFlags = getKillRegState(Pseudo.getOperand(1).isKill());
+  const bool ImplicitDefIsDead = Pseudo.getOperand(2).isDead();
+
+  unsigned RhsLo, RhsHi, LhsLo, LhsHi;
+  TRI->splitReg(RhsPair, RhsLo, RhsHi);
+  TRI->splitReg(LhsPair, LhsLo, LhsHi);
+
+  auto LoInstr = buildMI(MBB, MBBI, AVR::CPCRdRr)
+                     .addReg(LhsLo, LhsFlags)
+                     .addReg(RhsLo, RhsFlags);
+
+  // SREG is always implicitly killed
+  LoInstr->getOperand(3).setIsKill();
+
+  auto HiInstr = buildMI(MBB, MBBI, AVR::CPCRdRr)
+                     .addReg(LhsHi, LhsFlags)
+                     .addReg(RhsHi, RhsFlags);
+
+  if (ImplicitDefIsDead)
+    HiInstr->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  HiInstr->getOperand(3).setIsKill();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// LDIWRdK: one LDIRdK per half. The value operand may be a global address
+/// or block address (emitted twice, tagged MO_LO / MO_HI) or an immediate
+/// (split into bits 0-7 and bits 8-15).
+template <>
+bool AVRExpandPseudo::expand<AVR::LDIWRdK>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+
+  unsigned DstLo, DstHi;
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  // Emit both loads first; the value operand is appended below.
+  auto LoInstr = buildMI(MBB, MBBI, AVR::LDIRdK).addReg(DstLo, DefFlags);
+  auto HiInstr = buildMI(MBB, MBBI, AVR::LDIRdK).addReg(DstHi, DefFlags);
+
+  const MachineOperand &Value = Pseudo.getOperand(1);
+  if (Value.isGlobal()) {
+    const GlobalValue *GV = Value.getGlobal();
+    const int64_t Offset = Value.getOffset();
+    const unsigned Flags = Value.getTargetFlags();
+
+    LoInstr.addGlobalAddress(GV, Offset, Flags | AVRII::MO_LO);
+    HiInstr.addGlobalAddress(GV, Offset, Flags | AVRII::MO_HI);
+  } else if (Value.isBlockAddress()) {
+    const BlockAddress *BA = Value.getBlockAddress();
+    const unsigned Flags = Value.getTargetFlags();
+
+    LoInstr.addOperand(MachineOperand::CreateBA(BA, Flags | AVRII::MO_LO));
+    HiInstr.addOperand(MachineOperand::CreateBA(BA, Flags | AVRII::MO_HI));
+  } else if (Value.isImm()) {
+    const unsigned Imm = Value.getImm();
+
+    LoInstr.addImm(Imm & 0xff);
+    HiInstr.addImm((Imm >> 8) & 0xff);
+  } else {
+    llvm_unreachable("Unknown operand type!");
+  }
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// LDSWRdK: two LDSRdK loads, the low half from the given address and the
+/// high half from that address plus one. The address operand may be a global
+/// address or an immediate. Both loads receive the pseudo's memory operands.
+template <>
+bool AVRExpandPseudo::expand<AVR::LDSWRdK>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+
+  unsigned DstLo, DstHi;
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  // Emit both loads first; the address operand is appended below.
+  auto LoInstr = buildMI(MBB, MBBI, AVR::LDSRdK).addReg(DstLo, DefFlags);
+  auto HiInstr = buildMI(MBB, MBBI, AVR::LDSRdK).addReg(DstHi, DefFlags);
+
+  const MachineOperand &Address = Pseudo.getOperand(1);
+  if (Address.isGlobal()) {
+    const GlobalValue *GV = Address.getGlobal();
+    const int64_t Offset = Address.getOffset();
+    const unsigned Flags = Address.getTargetFlags();
+
+    LoInstr.addGlobalAddress(GV, Offset, Flags);
+    HiInstr.addGlobalAddress(GV, Offset + 1, Flags);
+  } else if (Address.isImm()) {
+    const unsigned Imm = Address.getImm();
+
+    LoInstr.addImm(Imm);
+    HiInstr.addImm(Imm + 1);
+  } else {
+    llvm_unreachable("Unknown operand type!");
+  }
+
+  LoInstr->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+  HiInstr->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// LDWRdPtr: LDRdPtr into the low half, then LDDRdPtrQ with displacement 1
+/// into the high half. The pointer's kill flag is carried by the second load
+/// only. Both loads receive the pseudo's memory operands.
+template <>
+bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned PtrReg = Pseudo.getOperand(1).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+  const unsigned PtrKillFlags = getKillRegState(Pseudo.getOperand(1).isKill());
+
+  unsigned DstLo, DstHi;
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  assert(DstPair != PtrReg && "SrcReg and DstReg cannot be the same");
+
+  auto LoInstr = buildMI(MBB, MBBI, AVR::LDRdPtr)
+                     .addReg(DstLo, DefFlags)
+                     .addReg(PtrReg);
+
+  auto HiInstr = buildMI(MBB, MBBI, AVR::LDDRdPtrQ)
+                     .addReg(DstHi, DefFlags)
+                     .addReg(PtrReg, PtrKillFlags)
+                     .addImm(1);
+
+  LoInstr->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+  HiInstr->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// LDWRdPtrPi: two LDRdPtrPi loads, low half first, then high half. Each
+/// load both defines and kills the pointer register; the pseudo's kill flag
+/// on the pointer becomes the dead flag of the second load's pointer def.
+template <>
+bool AVRExpandPseudo::expand<AVR::LDWRdPtrPi>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+
+  const unsigned DstPair = Pseudo.getOperand(0).getReg();
+  const unsigned PtrReg = Pseudo.getOperand(1).getReg();
+  const unsigned DefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(0).isDead());
+  const unsigned FinalPtrDefFlags =
+      RegState::Define | getDeadRegState(Pseudo.getOperand(1).isKill());
+
+  unsigned DstLo, DstHi;
+  TRI->splitReg(DstPair, DstLo, DstHi);
+
+  assert(DstPair != PtrReg && "SrcReg and DstReg cannot be the same");
+
+  auto LoInstr = buildMI(MBB, MBBI, AVR::LDRdPtrPi)
+                     .addReg(DstLo, DefFlags)
+                     .addReg(PtrReg, RegState::Define)
+                     .addReg(PtrReg, RegState::Kill);
+
+  auto HiInstr = buildMI(MBB, MBBI, AVR::LDRdPtrPi)
+                     .addReg(DstHi, DefFlags)
+                     .addReg(PtrReg, FinalPtrDefFlags)
+                     .addReg(PtrReg, RegState::Kill);
+
+  LoInstr->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+  HiInstr->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the 16-bit pre-decrement load pseudo `LDWRdPtrPd` into two
+/// `LDRdPtrPd` instructions. The high byte is emitted first and the low byte
+/// second, the reverse of the post-increment expansion.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::LDWRdPtrPd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideDst = Pseudo.getOperand(0).getReg();
+  unsigned PtrReg = Pseudo.getOperand(1).getReg();
+  const bool WideDstDead = Pseudo.getOperand(0).isDead();
+  // NOTE(review): the dead state of the final pointer write-back is derived
+  // from the kill flag of operand 1 -- confirm this is the intended flag.
+  const bool PtrDeadAfter = Pseudo.getOperand(1).isKill();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideDst, LoByteReg, HiByteReg);
+
+  assert(WideDst != PtrReg && "SrcReg and DstReg cannot be the same");
+
+  // High byte first.
+  auto HiLoad =
+      buildMI(MBB, MBBI, AVR::LDRdPtrPd)
+          .addReg(HiByteReg, RegState::Define | getDeadRegState(WideDstDead))
+          .addReg(PtrReg, RegState::Define)
+          .addReg(PtrReg, RegState::Kill);
+
+  // Low byte second; this is the last write-back of the pointer.
+  auto LoLoad =
+      buildMI(MBB, MBBI, AVR::LDRdPtrPd)
+          .addReg(LoByteReg, RegState::Define | getDeadRegState(WideDstDead))
+          .addReg(PtrReg, RegState::Define | getDeadRegState(PtrDeadAfter))
+          .addReg(PtrReg, RegState::Kill);
+
+  LoLoad->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+  HiLoad->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Expands the 16-bit displacement load pseudo `LDDWRdPtrQ` into two
+/// `LDDRdPtrQ` byte loads at displacements `Imm` (low) and `Imm + 1` (high).
+///
+/// When the destination pair is the pointer register itself, the low byte is
+/// staged in a scavenged temporary so that the pointer is still intact when
+/// the high byte is loaded; the low half of the pointer is only overwritten
+/// after both loads have been issued.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  unsigned Imm = MI.getOperand(2).getImm();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  OpLo = AVR::LDDRdPtrQ;
+  OpHi = AVR::LDDRdPtrQ;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  // The high byte is loaded from Imm + 1, which must itself stay within the
+  // limit of 63, so the offset of the pseudo may be at most 62.
+  assert(Imm <= 62 && "Offset is out of range");
+
+  MachineInstr *MIBLO, *MIBHI;
+
+  // HACK: We shouldn't have instances of this instruction
+  // where src==dest because the instruction itself is
+  // marked earlyclobber. We do however get this instruction when
+  // loading from stack slots where the earlyclobber isn't useful.
+  //
+  // In this case, just use a temporary register.
+  if (DstReg == SrcReg) {
+    RegScavenger RS;
+
+    RS.enterBasicBlock(MBB);
+    RS.forward(MBBI);
+
+    BitVector Candidates =
+        TRI->getAllocatableSet(*MBB.getParent(), &AVR::GPR8RegClass);
+
+    // Exclude all the registers being used by the instruction.
+    for (MachineOperand &MO : MI.operands()) {
+      if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
+          !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+        Candidates.reset(MO.getReg());
+    }
+
+    BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass);
+    Available &= Candidates;
+
+    signed TmpReg = Available.find_first();
+    assert(TmpReg != -1 && "ran out of registers");
+
+    // Stage the low byte in the temporary. Writing it to DstLoReg right away
+    // would corrupt the low half of the pointer that the next load still
+    // needs.
+    MIBLO = buildMI(MBB, MBBI, OpLo)
+                .addReg(TmpReg, RegState::Define)
+                .addReg(SrcReg)
+                .addImm(Imm);
+
+    // This is the last read of the pointer, so the high byte can be loaded
+    // straight into the high half of the destination.
+    MIBHI = buildMI(MBB, MBBI, OpHi)
+                .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+                .addReg(SrcReg, getKillRegState(SrcIsKill))
+                .addImm(Imm + 1);
+
+    // Only now is it safe to overwrite the low half of the old pointer.
+    buildMI(MBB, MBBI, AVR::MOVRdRr)
+        .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(TmpReg, RegState::Kill);
+  } else {
+    MIBLO = buildMI(MBB, MBBI, OpLo)
+                .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+                .addReg(SrcReg)
+                .addImm(Imm);
+
+    MIBHI = buildMI(MBB, MBBI, OpHi)
+                .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+                .addReg(SrcReg, getKillRegState(SrcIsKill))
+                .addImm(Imm + 1);
+  }
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+/// Placeholder for the wide `LPMWRdZ` pseudo: no expansion exists yet, so
+/// reaching this specialization is reported as unreachable.
+template <>
+bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) {
+  // Not implemented; the parameters are intentionally unused.
+  llvm_unreachable("wide LPM is unimplemented");
+}
+
+/// Placeholder for the wide post-increment `LPMWRdZPi` pseudo: no expansion
+/// exists yet, so reaching this specialization is reported as unreachable.
+template <>
+bool AVRExpandPseudo::expand<AVR::LPMWRdZPi>(Block &MBB, BlockIt MBBI) {
+  // Not implemented; the parameters are intentionally unused.
+  llvm_unreachable("wide LPMPi is unimplemented");
+}
+
+/// Replaces an atomic pseudo with a sequence that saves the status register,
+/// clears status bit 7 (CLI), lets \p f emit the actual operation, and then
+/// restores the status register.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction; new code is inserted in
+///             front of it and the pseudo is erased at the end.
+/// \param f    callable invoked with the pseudo instruction; it is expected to
+///             emit the body of the operation.
+/// \returns true unconditionally.
+template<typename Func>
+bool AVRExpandPseudo::expandAtomic(Block &MBB, BlockIt MBBI, Func f) {
+  MachineInstr &Pseudo = *MBBI;
+
+  // Save SREG into the scratch register.
+  buildMI(MBB, MBBI, AVR::INRdA)
+      .addReg(SCRATCH_REGISTER, RegState::Define)
+      .addImm(SREG_ADDR);
+
+  // Clear status bit 7.
+  buildMI(MBB, MBBI, AVR::BCLRs).addImm(7); // CLI
+
+  // Emit the guarded operation itself.
+  f(Pseudo);
+
+  // Write the saved value back to SREG.
+  buildMI(MBB, MBBI, AVR::OUTARr)
+      .addImm(SREG_ADDR)
+      .addReg(SCRATCH_REGISTER);
+
+  // The pseudo has been fully replaced.
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Expands a two-operand atomic pseudo into a single instruction with opcode
+/// \p Opcode wrapped by `expandAtomic`, then hands the new instruction to
+/// \p f for further adjustment.
+///
+/// \param Opcode opcode of the instruction that performs the operation.
+/// \param MBB    block containing the pseudo.
+/// \param MBBI   iterator at the pseudo instruction.
+/// \param f      callable invoked with the newly built instruction.
+/// \returns the result of `expandAtomic`.
+template<typename Func>
+bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode,
+                                           Block &MBB,
+                                           BlockIt MBBI,
+                                           Func f) {
+  auto EmitOperation = [&](MachineInstr &Pseudo) {
+    // Operands 0 and 1 of the pseudo are forwarded unchanged.
+    auto FirstOp = Pseudo.getOperand(0);
+    auto SecondOp = Pseudo.getOperand(1);
+
+    MachineInstr &Emitted = *buildMI(MBB, MBBI, Opcode)
+                                 .addOperand(FirstOp)
+                                 .addOperand(SecondOp)
+                                 .getInstr();
+    f(Emitted);
+  };
+
+  return expandAtomic(MBB, MBBI, EmitOperation);
+}
+
+/// Convenience overload of `expandAtomicBinaryOp` for callers that do not
+/// need to touch the emitted instruction afterwards.
+bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode,
+                                           Block &MBB,
+                                           BlockIt MBBI) {
+  // No post-processing of the new instruction is required.
+  auto DoNothing = [](MachineInstr &MI) {};
+  return expandAtomicBinaryOp(Opcode, MBB, MBBI, DoNothing);
+}
+
+/// Expands an atomic read-modify-write pseudo into a load, an arithmetic
+/// instruction and a store, all wrapped by `expandAtomic`.
+///
+/// \param Width       operand width in bits; 8 selects the byte load/store
+///                    opcodes, any other value selects the word opcodes.
+/// \param ArithOpcode opcode of the arithmetic instruction to emit.
+/// \param MBB         block containing the pseudo.
+/// \param MBBI        iterator at the pseudo instruction.
+/// \returns the result of `expandAtomic`.
+bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width,
+                                               unsigned ArithOpcode,
+                                               Block &MBB,
+                                               BlockIt MBBI) {
+  auto EmitSequence = [&](MachineInstr &Pseudo) {
+    auto FirstOp = Pseudo.getOperand(0);
+    auto SecondOp = Pseudo.getOperand(1);
+
+    const bool IsByteWide = (Width == 8);
+    const unsigned LoadOpcode = IsByteWide ? AVR::LDRdPtr : AVR::LDWRdPtr;
+    const unsigned StoreOpcode = IsByteWide ? AVR::STPtrRr : AVR::STWPtrRr;
+
+    // Load.
+    buildMI(MBB, MBBI, LoadOpcode).addOperand(FirstOp).addOperand(SecondOp);
+
+    // Arithmetic; the first operand appears twice, followed by the second.
+    buildMI(MBB, MBBI, ArithOpcode)
+        .addOperand(FirstOp)
+        .addOperand(FirstOp)
+        .addOperand(SecondOp);
+
+    // Store, with the operand order swapped relative to the load.
+    buildMI(MBB, MBBI, StoreOpcode).addOperand(SecondOp).addOperand(FirstOp);
+  };
+
+  return expandAtomic(MBB, MBBI, EmitSequence);
+}
+
+/// Expands the 8-bit atomic load pseudo via `expandAtomicBinaryOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoad8>(Block &MBB, BlockIt MBBI) {
+  const unsigned LoadOpcode = AVR::LDRdPtr;
+  return expandAtomicBinaryOp(LoadOpcode, MBB, MBBI);
+}
+
+/// Expands the 16-bit atomic load pseudo via `expandAtomicBinaryOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoad16>(Block &MBB, BlockIt MBBI) {
+  const unsigned LoadOpcode = AVR::LDWRdPtr;
+  return expandAtomicBinaryOp(LoadOpcode, MBB, MBBI);
+}
+
+/// Expands the 8-bit atomic store pseudo via `expandAtomicBinaryOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicStore8>(Block &MBB, BlockIt MBBI) {
+  const unsigned StoreOpcode = AVR::STPtrRr;
+  return expandAtomicBinaryOp(StoreOpcode, MBB, MBBI);
+}
+
+/// Expands the 16-bit atomic store pseudo via `expandAtomicBinaryOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicStore16>(Block &MBB, BlockIt MBBI) {
+  const unsigned StoreOpcode = AVR::STWPtrRr;
+  return expandAtomicBinaryOp(StoreOpcode, MBB, MBBI);
+}
+
+/// Expands the 8-bit atomic add pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadAdd8>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 8;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::ADDRdRr, MBB, MBBI);
+}
+
+/// Expands the 16-bit atomic add pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadAdd16>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 16;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::ADDWRdRr, MBB, MBBI);
+}
+
+/// Expands the 8-bit atomic subtract pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadSub8>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 8;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::SUBRdRr, MBB, MBBI);
+}
+
+/// Expands the 16-bit atomic subtract pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadSub16>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 16;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::SUBWRdRr, MBB, MBBI);
+}
+
+/// Expands the 8-bit atomic AND pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadAnd8>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 8;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::ANDRdRr, MBB, MBBI);
+}
+
+/// Expands the 16-bit atomic AND pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadAnd16>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 16;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::ANDWRdRr, MBB, MBBI);
+}
+
+/// Expands the 8-bit atomic OR pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadOr8>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 8;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::ORRdRr, MBB, MBBI);
+}
+
+/// Expands the 16-bit atomic OR pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadOr16>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 16;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::ORWRdRr, MBB, MBBI);
+}
+
+/// Expands the 8-bit atomic XOR pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadXor8>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 8;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::EORRdRr, MBB, MBBI);
+}
+
+/// Expands the 16-bit atomic XOR pseudo via `expandAtomicArithmeticOp`.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadXor16>(Block &MBB, BlockIt MBBI) {
+  const unsigned WidthInBits = 16;
+  return expandAtomicArithmeticOp(WidthInBits, AVR::EORWRdRr, MBB, MBBI);
+}
+
+/// Expands the atomic fence pseudo by deleting it; no instruction is emitted
+/// in its place.
+///
+/// \returns true unconditionally.
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicFence>(Block &MBB, BlockIt MBBI) {
+  // On AVR, there is only one core and so atomic fences do nothing.
+  MachineInstr &Fence = *MBBI;
+  Fence.eraseFromParent();
+  return true;
+}
+
+/// Lowers the 16-bit direct store pseudo `STSWKRr` into two `STSKRr` byte
+/// stores. The address operand may be a global address or an immediate; the
+/// high byte goes to address + 1 and is emitted before the low byte.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::STSWKRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideSrc = Pseudo.getOperand(1).getReg();
+  const bool SrcKilled = Pseudo.getOperand(1).isKill();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideSrc, LoByteReg, HiByteReg);
+
+  // Write the high byte first in case this address belongs to a special
+  // I/O address with a special temporary register.
+  auto HiStore = buildMI(MBB, MBBI, AVR::STSKRr);
+  auto LoStore = buildMI(MBB, MBBI, AVR::STSKRr);
+
+  // Attach the address operand, which comes in one of two flavours.
+  switch (Pseudo.getOperand(0).getType()) {
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *Global = Pseudo.getOperand(0).getGlobal();
+    int64_t Offset = Pseudo.getOperand(0).getOffset();
+    unsigned TargetFlags = Pseudo.getOperand(0).getTargetFlags();
+
+    LoStore.addGlobalAddress(Global, Offset, TargetFlags);
+    HiStore.addGlobalAddress(Global, Offset + 1, TargetFlags);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    unsigned Address = Pseudo.getOperand(0).getImm();
+
+    LoStore.addImm(Address);
+    HiStore.addImm(Address + 1);
+    break;
+  }
+  default:
+    llvm_unreachable("Unknown operand type!");
+  }
+
+  // The value operand follows the address on both stores.
+  LoStore.addReg(LoByteReg, getKillRegState(SrcKilled));
+  HiStore.addReg(HiByteReg, getKillRegState(SrcKilled));
+
+  LoStore->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+  HiStore->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the 16-bit store-through-pointer pseudo `STWPtrRr` into `STPtrRr`
+/// for the low byte followed by `STDPtrQRr` with displacement 1 for the high
+/// byte.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned PtrReg = Pseudo.getOperand(0).getReg();
+  unsigned WideSrc = Pseudo.getOperand(1).getReg();
+  const bool PtrKilled = Pseudo.getOperand(0).isKill();
+  const bool SrcKilled = Pseudo.getOperand(1).isKill();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideSrc, LoByteReg, HiByteReg);
+
+  //:TODO: need to reverse this order like inw and stsw?
+  // The pointer stays live across the first store.
+  auto LoStore = buildMI(MBB, MBBI, AVR::STPtrRr)
+                     .addReg(PtrReg)
+                     .addReg(LoByteReg, getKillRegState(SrcKilled));
+
+  auto HiStore = buildMI(MBB, MBBI, AVR::STDPtrQRr)
+                     .addReg(PtrReg, getKillRegState(PtrKilled))
+                     .addImm(1)
+                     .addReg(HiByteReg, getKillRegState(SrcKilled));
+
+  LoStore->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+  HiStore->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the 16-bit post-increment store pseudo `STWPtrPiRr` into two
+/// `STPtrPiRr` byte stores, low byte first. The immediate of the pseudo is
+/// copied unchanged onto both stores.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::STWPtrPiRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned PtrReg = Pseudo.getOperand(0).getReg();
+  unsigned WideSrc = Pseudo.getOperand(2).getReg();
+  unsigned Amount = Pseudo.getOperand(3).getImm();
+  const bool PtrDefDead = Pseudo.getOperand(0).isDead();
+  const bool SrcKilled = Pseudo.getOperand(2).isKill();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideSrc, LoByteReg, HiByteReg);
+
+  assert(PtrReg != WideSrc && "SrcReg and DstReg cannot be the same");
+
+  // Each store redefines the pointer and consumes its previous value.
+  auto LoStore = buildMI(MBB, MBBI, AVR::STPtrPiRr)
+                     .addReg(PtrReg, RegState::Define)
+                     .addReg(PtrReg, RegState::Kill)
+                     .addReg(LoByteReg, getKillRegState(SrcKilled))
+                     .addImm(Amount);
+
+  auto HiStore =
+      buildMI(MBB, MBBI, AVR::STPtrPiRr)
+          .addReg(PtrReg, RegState::Define | getDeadRegState(PtrDefDead))
+          .addReg(PtrReg, RegState::Kill)
+          .addReg(HiByteReg, getKillRegState(SrcKilled))
+          .addImm(Amount);
+
+  LoStore->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+  HiStore->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the 16-bit pre-decrement store pseudo `STWPtrPdRr` into two
+/// `STPtrPdRr` byte stores, high byte first. The immediate of the pseudo is
+/// copied unchanged onto both stores.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::STWPtrPdRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned PtrReg = Pseudo.getOperand(0).getReg();
+  unsigned WideSrc = Pseudo.getOperand(2).getReg();
+  unsigned Amount = Pseudo.getOperand(3).getImm();
+  const bool PtrDefDead = Pseudo.getOperand(0).isDead();
+  const bool SrcKilled = Pseudo.getOperand(2).isKill();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideSrc, LoByteReg, HiByteReg);
+
+  assert(PtrReg != WideSrc && "SrcReg and DstReg cannot be the same");
+
+  // High byte first.
+  auto HiStore = buildMI(MBB, MBBI, AVR::STPtrPdRr)
+                     .addReg(PtrReg, RegState::Define)
+                     .addReg(PtrReg, RegState::Kill)
+                     .addReg(HiByteReg, getKillRegState(SrcKilled))
+                     .addImm(Amount);
+
+  // Low byte second; this is the final redefinition of the pointer.
+  auto LoStore =
+      buildMI(MBB, MBBI, AVR::STPtrPdRr)
+          .addReg(PtrReg, RegState::Define | getDeadRegState(PtrDefDead))
+          .addReg(PtrReg, RegState::Kill)
+          .addReg(LoByteReg, getKillRegState(SrcKilled))
+          .addImm(Amount);
+
+  LoStore->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+  HiStore->setMemRefs(Pseudo.memoperands_begin(), Pseudo.memoperands_end());
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Expands the 16-bit displacement store pseudo `STDWPtrQRr` into two
+/// `STDPtrQRr` byte stores at displacements `Imm` (low) and `Imm + 1` (high).
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(2).getReg();
+  unsigned Imm = MI.getOperand(1).getImm();
+  bool DstIsKill = MI.getOperand(0).isKill();
+  bool SrcIsKill = MI.getOperand(2).isKill();
+  OpLo = AVR::STDPtrQRr;
+  OpHi = AVR::STDPtrQRr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  // The high byte is stored at Imm + 1, which must itself stay within the
+  // limit of 63, so the offset of the pseudo may be at most 62.
+  assert(Imm <= 62 && "Offset is out of range");
+
+  // The pointer is read again by the second store, so only that store may
+  // carry the kill flag.
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+                   .addReg(DstReg)
+                   .addImm(Imm)
+                   .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+                   .addReg(DstReg, getKillRegState(DstIsKill))
+                   .addImm(Imm + 1)
+                   .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+/// Expands the 16-bit I/O read pseudo `INWRdA` into two `INRdA` byte reads
+/// from addresses `Imm` (low) and `Imm + 1` (high).
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::INWRdA>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned Imm = MI.getOperand(1).getImm();
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  OpLo = AVR::INRdA;
+  OpHi = AVR::INRdA;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  // The high byte is read from Imm + 1, which must itself stay within the
+  // limit of 63, so the address of the pseudo may be at most 62.
+  assert(Imm <= 62 && "Address is out of range");
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+                   .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+                   .addImm(Imm);
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+                   .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+                   .addImm(Imm + 1);
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+/// Expands the 16-bit I/O write pseudo `OUTWARr` into two `OUTARr` byte
+/// writes. The high byte goes to address `Imm + 1` and is emitted before the
+/// low byte at address `Imm`.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::OUTWARr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+  unsigned Imm = MI.getOperand(0).getImm();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  OpLo = AVR::OUTARr;
+  OpHi = AVR::OUTARr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  // The high byte is written to Imm + 1, which must itself stay within the
+  // limit of 63, so the address of the pseudo may be at most 62.
+  assert(Imm <= 62 && "Address is out of range");
+
+  // 16 bit I/O writes need the high byte first
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+                   .addImm(Imm + 1)
+                   .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+                   .addImm(Imm)
+                   .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+/// Lowers the 16-bit push pseudo `PUSHWRr` into two `PUSHRr` instructions,
+/// low byte first and high byte second. The MI flags of the pseudo are copied
+/// onto both pushes.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::PUSHWRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideSrc = Pseudo.getOperand(0).getReg();
+  const bool SrcKilled = Pseudo.getOperand(0).isKill();
+  const unsigned InheritedFlags = Pseudo.getFlags();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideSrc, LoByteReg, HiByteReg);
+
+  // Low byte is pushed first.
+  buildMI(MBB, MBBI, AVR::PUSHRr)
+      .addReg(LoByteReg, getKillRegState(SrcKilled))
+      .setMIFlags(InheritedFlags);
+
+  // High byte follows.
+  buildMI(MBB, MBBI, AVR::PUSHRr)
+      .addReg(HiByteReg, getKillRegState(SrcKilled))
+      .setMIFlags(InheritedFlags);
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the 16-bit pop pseudo `POPWRd` into two `POPRd` instructions, high
+/// byte first and low byte second. The MI flags of the pseudo are copied onto
+/// both pops.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::POPWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideDst = Pseudo.getOperand(0).getReg();
+  const unsigned InheritedFlags = Pseudo.getFlags();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideDst, LoByteReg, HiByteReg);
+
+  // High byte is popped first.
+  buildMI(MBB, MBBI, AVR::POPRd, HiByteReg).setMIFlags(InheritedFlags);
+  // Low byte follows.
+  buildMI(MBB, MBBI, AVR::POPRd, LoByteReg).setMIFlags(InheritedFlags);
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the 16-bit left shift pseudo `LSLWRd` into `LSLRd` on the low byte
+/// followed by `ROLRd` on the high byte.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::LSLWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideReg = Pseudo.getOperand(0).getReg();
+  const bool DefIsDead = Pseudo.getOperand(0).isDead();
+  const bool UseIsKill = Pseudo.getOperand(1).isKill();
+  const bool ImplicitDefDead = Pseudo.getOperand(2).isDead();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideReg, LoByteReg, HiByteReg);
+
+  // Low byte: plain shift.
+  buildMI(MBB, MBBI, AVR::LSLRd)
+      .addReg(LoByteReg, RegState::Define | getDeadRegState(DefIsDead))
+      .addReg(LoByteReg, getKillRegState(UseIsKill));
+
+  // High byte: rotate.
+  auto HiRotate =
+      buildMI(MBB, MBBI, AVR::ROLRd)
+          .addReg(HiByteReg, RegState::Define | getDeadRegState(DefIsDead))
+          .addReg(HiByteReg, getKillRegState(UseIsKill));
+
+  // Carry the dead flag of the pseudo's operand 2 over to operand 2 of the
+  // rotate.
+  if (ImplicitDefDead)
+    HiRotate->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  HiRotate->getOperand(3).setIsKill();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the 16-bit logical right shift pseudo `LSRWRd` into `LSRRd` on the
+/// high byte followed by `RORRd` on the low byte.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::LSRWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideReg = Pseudo.getOperand(0).getReg();
+  const bool DefIsDead = Pseudo.getOperand(0).isDead();
+  const bool UseIsKill = Pseudo.getOperand(1).isKill();
+  const bool ImplicitDefDead = Pseudo.getOperand(2).isDead();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideReg, LoByteReg, HiByteReg);
+
+  // High byte: plain shift.
+  buildMI(MBB, MBBI, AVR::LSRRd)
+      .addReg(HiByteReg, RegState::Define | getDeadRegState(DefIsDead))
+      .addReg(HiByteReg, getKillRegState(UseIsKill));
+
+  // Low byte: rotate.
+  auto LoRotate =
+      buildMI(MBB, MBBI, AVR::RORRd)
+          .addReg(LoByteReg, RegState::Define | getDeadRegState(DefIsDead))
+          .addReg(LoByteReg, getKillRegState(UseIsKill));
+
+  // Carry the dead flag of the pseudo's operand 2 over to operand 2 of the
+  // rotate.
+  if (ImplicitDefDead)
+    LoRotate->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  LoRotate->getOperand(3).setIsKill();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Placeholder for the 16-bit rotate-right pseudo `RORWRd`: no expansion
+/// exists yet, so reaching this specialization is reported as unreachable.
+template <>
+bool AVRExpandPseudo::expand<AVR::RORWRd>(Block &MBB, BlockIt MBBI) {
+  // Not implemented; the parameters are intentionally unused.
+  llvm_unreachable("RORW unimplemented");
+  return false;
+}
+
+/// Placeholder for the 16-bit rotate-left pseudo `ROLWRd`: no expansion
+/// exists yet, so reaching this specialization is reported as unreachable.
+template <>
+bool AVRExpandPseudo::expand<AVR::ROLWRd>(Block &MBB, BlockIt MBBI) {
+  // Not implemented; the parameters are intentionally unused.
+  llvm_unreachable("ROLW unimplemented");
+  return false;
+}
+
+/// Lowers the 16-bit arithmetic right shift pseudo `ASRWRd` into `ASRRd` on
+/// the high byte followed by `RORRd` on the low byte.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::ASRWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideReg = Pseudo.getOperand(0).getReg();
+  const bool DefIsDead = Pseudo.getOperand(0).isDead();
+  const bool UseIsKill = Pseudo.getOperand(1).isKill();
+  const bool ImplicitDefDead = Pseudo.getOperand(2).isDead();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideReg, LoByteReg, HiByteReg);
+
+  // High byte: arithmetic shift.
+  buildMI(MBB, MBBI, AVR::ASRRd)
+      .addReg(HiByteReg, RegState::Define | getDeadRegState(DefIsDead))
+      .addReg(HiByteReg, getKillRegState(UseIsKill));
+
+  // Low byte: rotate.
+  auto LoRotate =
+      buildMI(MBB, MBBI, AVR::RORRd)
+          .addReg(LoByteReg, RegState::Define | getDeadRegState(DefIsDead))
+          .addReg(LoByteReg, getKillRegState(UseIsKill));
+
+  // Carry the dead flag of the pseudo's operand 2 over to operand 2 of the
+  // rotate.
+  if (ImplicitDefDead)
+    LoRotate->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  LoRotate->getOperand(3).setIsKill();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the `SEXT` pseudo (8-bit source to 16-bit register pair). The
+/// source is copied into whichever halves of the destination do not already
+/// hold it, then `LSLRd` and `SBCRdRr` are applied to the high half.
+///
+/// Resulting sequences, depending on where the source lives:
+///
+///   sext R17:R16, R17        sext R17:R16, R13        sext R17:R16, R16
+///     mov r16, r17             mov r16, r13             mov r17, r16
+///     lsl r17                  mov r17, r13             lsl r17
+///     sbc r17, r17             lsl r17                  sbc r17, r17
+///                              sbc r17, r17
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <> bool AVRExpandPseudo::expand<AVR::SEXT>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideDst = Pseudo.getOperand(0).getReg();
+  unsigned ByteSrc = Pseudo.getOperand(1).getReg();
+  const bool DstDead = Pseudo.getOperand(0).isDead();
+  const bool SrcKilled = Pseudo.getOperand(1).isKill();
+  const bool ImplicitDefDead = Pseudo.getOperand(2).isDead();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideDst, LoByteReg, HiByteReg);
+
+  const bool SrcIsLoHalf = (ByteSrc == LoByteReg);
+  const bool SrcIsHiHalf = (ByteSrc == HiByteReg);
+
+  // Bring the source into the low half unless it already lives there.
+  if (!SrcIsLoHalf) {
+    auto CopyToLo =
+        buildMI(MBB, MBBI, AVR::MOVRdRr)
+            .addReg(LoByteReg, RegState::Define | getDeadRegState(DstDead))
+            .addReg(ByteSrc);
+
+    // When the source is the high half, this copy is marked as its kill.
+    if (SrcIsHiHalf)
+      CopyToLo->getOperand(1).setIsKill();
+  }
+
+  // Bring the source into the high half unless it already lives there.
+  if (!SrcIsHiHalf) {
+    buildMI(MBB, MBBI, AVR::MOVRdRr)
+        .addReg(HiByteReg, RegState::Define)
+        .addReg(ByteSrc, getKillRegState(SrcKilled));
+  }
+
+  buildMI(MBB, MBBI, AVR::LSLRd)
+      .addReg(HiByteReg, RegState::Define)
+      .addReg(HiByteReg, RegState::Kill);
+
+  auto Subtract =
+      buildMI(MBB, MBBI, AVR::SBCRdRr)
+          .addReg(HiByteReg, RegState::Define | getDeadRegState(DstDead))
+          .addReg(HiByteReg, RegState::Kill)
+          .addReg(HiByteReg, RegState::Kill);
+
+  // Carry the dead flag of the pseudo's operand 2 over to operand 3 of the
+  // subtract.
+  if (ImplicitDefDead)
+    Subtract->getOperand(3).setIsDead();
+
+  // SREG is always implicitly killed
+  Subtract->getOperand(4).setIsKill();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the `ZEXT` pseudo (8-bit source to 16-bit register pair). The
+/// source is copied into the low half unless it already lives there, and the
+/// high half is cleared with `EORRdRr` on itself.
+///
+/// Resulting sequences, depending on where the source lives:
+///
+///   zext R25:R24, R20        zext R25:R24, R24        zext R25:R24, R25
+///     mov R24, R20             eor R25, R25             mov R24, R25
+///     eor R25, R25                                      eor R25, R25
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <> bool AVRExpandPseudo::expand<AVR::ZEXT>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideDst = Pseudo.getOperand(0).getReg();
+  unsigned ByteSrc = Pseudo.getOperand(1).getReg();
+  const bool DstDead = Pseudo.getOperand(0).isDead();
+  const bool SrcKilled = Pseudo.getOperand(1).isKill();
+  const bool ImplicitDefDead = Pseudo.getOperand(2).isDead();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideDst, LoByteReg, HiByteReg);
+
+  // Bring the source into the low half unless it already lives there.
+  if (ByteSrc != LoByteReg) {
+    buildMI(MBB, MBBI, AVR::MOVRdRr)
+        .addReg(LoByteReg, RegState::Define | getDeadRegState(DstDead))
+        .addReg(ByteSrc, getKillRegState(SrcKilled));
+  }
+
+  // Clear the high half.
+  auto ClearHi =
+      buildMI(MBB, MBBI, AVR::EORRdRr)
+          .addReg(HiByteReg, RegState::Define | getDeadRegState(DstDead))
+          .addReg(HiByteReg, RegState::Kill)
+          .addReg(HiByteReg, RegState::Kill);
+
+  // Carry the dead flag of the pseudo's operand 2 over to operand 3 of the
+  // exclusive-or.
+  if (ImplicitDefDead)
+    ClearHi->getOperand(3).setIsDead();
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the `SPREAD` pseudo into two `INRdA` reads: I/O address 0x3d into
+/// the low half of the destination and 0x3e into the high half. The MI flags
+/// of the pseudo are copied onto both reads.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::SPREAD>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideDst = Pseudo.getOperand(0).getReg();
+  const bool DstDead = Pseudo.getOperand(0).isDead();
+  const unsigned InheritedFlags = Pseudo.getFlags();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideDst, LoByteReg, HiByteReg);
+
+  // Low half comes from address 0x3d.
+  buildMI(MBB, MBBI, AVR::INRdA)
+      .addReg(LoByteReg, RegState::Define | getDeadRegState(DstDead))
+      .addImm(0x3d)
+      .setMIFlags(InheritedFlags);
+
+  // High half comes from address 0x3e.
+  buildMI(MBB, MBBI, AVR::INRdA)
+      .addReg(HiByteReg, RegState::Define | getDeadRegState(DstDead))
+      .addImm(0x3e)
+      .setMIFlags(InheritedFlags);
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Lowers the `SPWRITE` pseudo into a five-instruction sequence: save SREG in
+/// R0, clear status bit 7, write the high byte to I/O address 0x3e, restore
+/// SREG from R0, and finally write the low byte to I/O address 0x3d. The MI
+/// flags of the pseudo are copied onto every emitted instruction.
+///
+/// \param MBB  block containing the pseudo.
+/// \param MBBI iterator at the pseudo instruction, which is erased.
+/// \returns true unconditionally.
+template <>
+bool AVRExpandPseudo::expand<AVR::SPWRITE>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Pseudo = *MBBI;
+  unsigned WideSrc = Pseudo.getOperand(1).getReg();
+  const bool SrcKilled = Pseudo.getOperand(1).isKill();
+  const unsigned InheritedFlags = Pseudo.getFlags();
+
+  unsigned LoByteReg, HiByteReg;
+  TRI->splitReg(WideSrc, LoByteReg, HiByteReg);
+
+  // Save the status register in R0.
+  buildMI(MBB, MBBI, AVR::INRdA)
+      .addReg(AVR::R0, RegState::Define)
+      .addImm(SREG_ADDR)
+      .setMIFlags(InheritedFlags);
+
+  // Clear status bit 7.
+  buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(InheritedFlags);
+
+  // High byte to 0x3e.
+  buildMI(MBB, MBBI, AVR::OUTARr)
+      .addImm(0x3e)
+      .addReg(HiByteReg, getKillRegState(SrcKilled))
+      .setMIFlags(InheritedFlags);
+
+  // Restore the status register; this is emitted before the low-byte write.
+  buildMI(MBB, MBBI, AVR::OUTARr)
+      .addImm(SREG_ADDR)
+      .addReg(AVR::R0, RegState::Kill)
+      .setMIFlags(InheritedFlags);
+
+  // Low byte to 0x3d.
+  buildMI(MBB, MBBI, AVR::OUTARr)
+      .addImm(0x3d)
+      .addReg(LoByteReg, getKillRegState(SrcKilled))
+      .setMIFlags(InheritedFlags);
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
+/// Dispatches one instruction to the `expand<>` specialization that matches
+/// its opcode.
+///
+/// \param MBB  block containing the instruction.
+/// \param MBBI iterator at the instruction to inspect.
+/// \returns the result of the matching specialization, or false when the
+///          opcode is not one of the pseudos listed below.
+bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) {
+  MachineInstr &Candidate = *MBBI;
+
+// Emits one `case` that forwards to the specialization for opcode Op.
+#define EXPAND(Op)                                                             \
+  case Op:                                                                     \
+    return expand<Op>(MBB, Candidate)
+
+  switch (Candidate.getOpcode()) {
+    // 16-bit arithmetic, logic and compare pseudos.
+    EXPAND(AVR::ADDWRdRr);
+    EXPAND(AVR::ADCWRdRr);
+    EXPAND(AVR::SUBWRdRr);
+    EXPAND(AVR::SUBIWRdK);
+    EXPAND(AVR::SBCWRdRr);
+    EXPAND(AVR::SBCIWRdK);
+    EXPAND(AVR::ANDWRdRr);
+    EXPAND(AVR::ANDIWRdK);
+    EXPAND(AVR::ORWRdRr);
+    EXPAND(AVR::ORIWRdK);
+    EXPAND(AVR::EORWRdRr);
+    EXPAND(AVR::COMWRd);
+    EXPAND(AVR::CPWRdRr);
+    EXPAND(AVR::CPCWRdRr);
+    // 16-bit loads.
+    EXPAND(AVR::LDIWRdK);
+    EXPAND(AVR::LDSWRdK);
+    EXPAND(AVR::LDWRdPtr);
+    EXPAND(AVR::LDWRdPtrPi);
+    EXPAND(AVR::LDWRdPtrPd);
+  case AVR::LDDWRdYQ: //:FIXME: remove this once PR13375 gets fixed
+    EXPAND(AVR::LDDWRdPtrQ);
+    EXPAND(AVR::LPMWRdZ);
+    EXPAND(AVR::LPMWRdZPi);
+    // Atomic pseudos.
+    EXPAND(AVR::AtomicLoad8);
+    EXPAND(AVR::AtomicLoad16);
+    EXPAND(AVR::AtomicStore8);
+    EXPAND(AVR::AtomicStore16);
+    EXPAND(AVR::AtomicLoadAdd8);
+    EXPAND(AVR::AtomicLoadAdd16);
+    EXPAND(AVR::AtomicLoadSub8);
+    EXPAND(AVR::AtomicLoadSub16);
+    EXPAND(AVR::AtomicLoadAnd8);
+    EXPAND(AVR::AtomicLoadAnd16);
+    EXPAND(AVR::AtomicLoadOr8);
+    EXPAND(AVR::AtomicLoadOr16);
+    EXPAND(AVR::AtomicLoadXor8);
+    EXPAND(AVR::AtomicLoadXor16);
+    EXPAND(AVR::AtomicFence);
+    // 16-bit stores, I/O and stack pseudos.
+    EXPAND(AVR::STSWKRr);
+    EXPAND(AVR::STWPtrRr);
+    EXPAND(AVR::STWPtrPiRr);
+    EXPAND(AVR::STWPtrPdRr);
+    EXPAND(AVR::STDWPtrQRr);
+    EXPAND(AVR::INWRdA);
+    EXPAND(AVR::OUTWARr);
+    EXPAND(AVR::PUSHWRr);
+    EXPAND(AVR::POPWRd);
+    // Shifts, rotates and extensions.
+    EXPAND(AVR::LSLWRd);
+    EXPAND(AVR::LSRWRd);
+    EXPAND(AVR::RORWRd);
+    EXPAND(AVR::ROLWRd);
+    EXPAND(AVR::ASRWRd);
+    EXPAND(AVR::SEXT);
+    EXPAND(AVR::ZEXT);
+    // Stack pointer access.
+    EXPAND(AVR::SPREAD);
+    EXPAND(AVR::SPWRITE);
+  default:
+    break;
+  }
+#undef EXPAND
+
+  // Not a pseudo handled by this function.
+  return false;
+}
+
+} // end of anonymous namespace
+
+// Pass registration for AVRExpandPseudo, using the argument string
+// "avr-expand-pseudo" and AVR_EXPAND_PSEUDO_NAME as the descriptive name.
+// NOTE(review): the two trailing `false` arguments are presumably the
+// CFG-only and is-analysis flags of INITIALIZE_PASS -- confirm against
+// llvm/PassSupport.h.
+INITIALIZE_PASS(AVRExpandPseudo, "avr-expand-pseudo",
+ AVR_EXPAND_PSEUDO_NAME, false, false)
+namespace llvm {
+
+/// Factory for the AVR pseudo-instruction expansion pass.
+///
+/// \returns a newly allocated `AVRExpandPseudo` instance.
+FunctionPass *createAVRExpandPseudoPass() {
+  FunctionPass *ExpandPass = new AVRExpandPseudo();
+  return ExpandPass;
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp
new file mode 100644
index 000000000000..b8cb2215ddb4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -0,0 +1,538 @@
+//===-- AVRFrameLowering.cpp - AVR Frame Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRFrameLowering.h"
+
+#include "AVR.h"
+#include "AVRInstrInfo.h"
+#include "AVRMachineFunctionInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+
+#include <vector>
+
+namespace llvm {
+
+/// Configures the generic frame lowering for AVR: the stack grows down, the
+/// stack alignment is 1 and the local area offset is -2.
+AVRFrameLowering::AVRFrameLowering()
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+                          /*StackAl=*/1,
+                          /*LAO=*/-2) {}
+
+/// Call frame pseudo instructions are simplified unconditionally on AVR; the
+/// answer does not depend on hasReservedCallFrame.
+bool AVRFrameLowering::canSimplifyCallFramePseudos(
+    const MachineFunction & /*MF*/) const {
+  return true;
+}
+
+/// The call frame is reserved in the prologue only when both hold:
+///  - the function uses Y as its frame pointer, and
+///  - the function has no variable sized stack objects.
+bool AVRFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  if (!hasFP(MF)) {
+    return false;
+  }
+
+  return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+/// Emits the function prologue at the top of \p MBB.
+///
+/// Interrupt/signal handlers first receive their special entry sequence. If
+/// the function needs a frame pointer, Y (R29:R28) is then loaded from SP
+/// after the callee-saved pushes, decremented by the local frame size and
+/// written back to SP.
+void AVRFrameLowering::emitPrologue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  const AVRSubtarget &Subtarget = MF.getSubtarget<AVRSubtarget>();
+  const AVRInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
+  const CallingConv::ID CC = MF.getFunction()->getCallingConv();
+  const bool IsInterrupt = (CC == CallingConv::AVR_INTR);
+  const bool IsHandler = IsInterrupt || (CC == CallingConv::AVR_SIGNAL);
+
+  MachineBasicBlock::iterator InsertPt = MBB.begin();
+  DebugLoc Loc;
+  if (InsertPt != MBB.end()) {
+    Loc = InsertPt->getDebugLoc();
+  }
+
+  // AVR_INTR handlers turn interrupts back on as their very first action.
+  if (IsInterrupt) {
+    BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::BSETs))
+        .addImm(0x07)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Handlers preserve R1:R0 and SREG ahead of every other register save:
+  // push R1:R0, read I/O address 0x3f into R0, push it, then clear R0.
+  if (IsHandler) {
+    BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::PUSHWRr))
+        .addReg(AVR::R1R0, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::INRdA), AVR::R0)
+        .addImm(0x3f)
+        .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::PUSHRr))
+        .addReg(AVR::R0, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::EORRdRr))
+        .addReg(AVR::R0, RegState::Define)
+        .addReg(AVR::R0, RegState::Kill)
+        .addReg(AVR::R0, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Nothing else to do without a frame pointer.
+  if (!hasFP(MF)) {
+    return;
+  }
+
+  const AVRMachineFunctionInfo *FuncInfo =
+      MF.getInfo<AVRMachineFunctionInfo>();
+  unsigned LocalSize =
+      MF.getFrameInfo().getStackSize() - FuncInfo->getCalleeSavedFrameSize();
+
+  // Step over the callee-saved pushes (frame-setup PUSH/PUSHW instructions).
+  for (; InsertPt != MBB.end(); ++InsertPt) {
+    if (!InsertPt->getFlag(MachineInstr::FrameSetup)) {
+      break;
+    }
+
+    const unsigned PushOpc = InsertPt->getOpcode();
+    if (PushOpc != AVR::PUSHRr && PushOpc != AVR::PUSHWRr) {
+      break;
+    }
+  }
+
+  // Y <- SP.
+  BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::SPREAD), AVR::R29R28)
+      .addReg(AVR::SP)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // Every block but the entry block sees the frame pointer as live-in.
+  MachineFunction::iterator BlockIt = std::next(MF.begin());
+  const MachineFunction::iterator BlockEnd = MF.end();
+  while (BlockIt != BlockEnd) {
+    BlockIt->addLiveIn(AVR::R29R28);
+    ++BlockIt;
+  }
+
+  if (LocalSize == 0) {
+    return;
+  }
+
+  // Allocate the locals: Y -= <size>. Sizes that fit in 6 bits use SBIW.
+  const unsigned SubOpc =
+      isUInt<6>(LocalSize) ? AVR::SBIWRdK : AVR::SUBIWRdK;
+
+  MachineInstr *Sub =
+      BuildMI(MBB, InsertPt, Loc, InstrInfo.get(SubOpc), AVR::R29R28)
+          .addReg(AVR::R29R28, RegState::Kill)
+          .addImm(LocalSize)
+          .setMIFlag(MachineInstr::FrameSetup);
+  // Operand 3 is the implicit SREG def, which nobody reads.
+  Sub->getOperand(3).setIsDead();
+
+  // SP <- Y. SPWRITE temporarily disables interrupts while updating SP.
+  BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::SPWRITE), AVR::SP)
+      .addReg(AVR::R29R28)
+      .setMIFlag(MachineInstr::FrameSetup);
+}
+
+/// Emits the function epilogue in front of the return instruction of \p MBB.
+///
+/// Handlers get their SREG and R1:R0 restore sequence right before the
+/// return. When locals were allocated, Y is incremented by the frame size and
+/// copied back into SP ahead of the callee-saved pops.
+void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  const CallingConv::ID CC = MF.getFunction()->getCallingConv();
+  const bool IsHandler =
+      (CC == CallingConv::AVR_INTR) || (CC == CallingConv::AVR_SIGNAL);
+
+  // Only frame-pointer functions and interrupt/signal handlers need any
+  // epilogue code.
+  if (!(hasFP(MF) || IsHandler)) {
+    return;
+  }
+
+  MachineBasicBlock::iterator InsertPt = MBB.getLastNonDebugInstr();
+  assert(InsertPt->getDesc().isReturn() &&
+         "Can only insert epilog into returning blocks");
+
+  DebugLoc Loc = InsertPt->getDebugLoc();
+  const AVRMachineFunctionInfo *FuncInfo =
+      MF.getInfo<AVRMachineFunctionInfo>();
+  unsigned LocalSize =
+      MF.getFrameInfo().getStackSize() - FuncInfo->getCalleeSavedFrameSize();
+  const AVRSubtarget &Subtarget = MF.getSubtarget<AVRSubtarget>();
+  const AVRInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
+
+  // Handlers restore SREG (I/O address 0x3f) via R0 and then R1:R0, at the
+  // very end of the function just before the return.
+  if (IsHandler) {
+    BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::POPRd), AVR::R0);
+    BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::OUTARr))
+        .addImm(0x3f)
+        .addReg(AVR::R0, RegState::Kill);
+    BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::POPWRd), AVR::R1R0);
+  }
+
+  // No locals were allocated, so the frame pointer needs no adjustment.
+  if (LocalSize == 0) {
+    return;
+  }
+
+  // Walk backwards over the callee-saved pops (and terminators).
+  for (; InsertPt != MBB.begin(); --InsertPt) {
+    MachineBasicBlock::iterator Prev = std::prev(InsertPt);
+    const int PrevOpc = Prev->getOpcode();
+    const bool IsPop = (PrevOpc == AVR::POPRd) || (PrevOpc == AVR::POPWRd);
+
+    if (!IsPop && !Prev->isTerminator()) {
+      break;
+    }
+  }
+
+  // Small sizes fit ADIW; larger ones subtract the negated size instead.
+  unsigned AdjustOpc = AVR::ADIWRdK;
+  if (!isUInt<6>(LocalSize)) {
+    AdjustOpc = AVR::SUBIWRdK;
+    LocalSize = -LocalSize;
+  }
+
+  // Y += <size>.
+  MachineInstr *Adjust =
+      BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AdjustOpc), AVR::R29R28)
+          .addReg(AVR::R29R28, RegState::Kill)
+          .addImm(LocalSize);
+  // Operand 3 is the implicit SREG def, which nobody reads.
+  Adjust->getOperand(3).setIsDead();
+
+  // SP <- Y. SPWRITE temporarily disables interrupts while updating SP.
+  BuildMI(MBB, InsertPt, Loc, InstrInfo.get(AVR::SPWRITE), AVR::SP)
+      .addReg(AVR::R29R28, RegState::Kill);
+}
+
+// Tells whether the function needs a dedicated frame pointer register. That
+// is the case as soon as one of these was recorded for the function:
+// - a register spill
+// - an alloca
+// - arguments passed on the stack
+//
+// Strictly speaking this is not a frame pointer: it holds SP as it is after
+// the frame allocation, not the SP value at function entry.
+bool AVRFrameLowering::hasFP(const MachineFunction &MF) const {
+  const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+
+  if (AFI->getHasSpills()) {
+    return true;
+  }
+
+  if (AFI->getHasAllocas()) {
+    return true;
+  }
+
+  return AFI->getHasStackArgs();
+}
+
+/// Pushes every register of \p CSI, last entry first, in front of \p MI and
+/// records the number of pushed bytes in the function info.
+///
+/// Returns false when \p CSI is empty (nothing was emitted), true otherwise.
+bool AVRFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  if (CSI.empty()) {
+    return false;
+  }
+
+  MachineFunction &MF = *MBB.getParent();
+  const AVRSubtarget &Subtarget = MF.getSubtarget<AVRSubtarget>();
+  const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
+  AVRMachineFunctionInfo *FuncInfo = MF.getInfo<AVRMachineFunctionInfo>();
+  DebugLoc Loc = MBB.findDebugLoc(MI);
+  unsigned PushedBytes = 0;
+
+  for (auto It = CSI.rbegin(), End = CSI.rend(); It != End; ++It) {
+    const unsigned Reg = It->getReg();
+    const bool WasLiveIn = MBB.isLiveIn(Reg);
+
+    assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 &&
+           "Invalid register size");
+
+    // Registers that carry arguments are already live-in; all others have to
+    // be added so the push reads a defined value.
+    if (!WasLiveIn) {
+      MBB.addLiveIn(Reg);
+    }
+
+    // The push kills the register unless it is an input argument.
+    BuildMI(MBB, MI, Loc, InstrInfo.get(AVR::PUSHRr))
+        .addReg(Reg, getKillRegState(!WasLiveIn))
+        .setMIFlag(MachineInstr::FrameSetup);
+    PushedBytes += 1;
+  }
+
+  FuncInfo->setCalleeSavedFrameSize(PushedBytes);
+
+  return true;
+}
+
+/// Pops every register of \p CSI, first entry first, in front of \p MI.
+///
+/// Returns false when \p CSI is empty (nothing was emitted), true otherwise.
+bool AVRFrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  if (CSI.empty()) {
+    return false;
+  }
+
+  const MachineFunction &MF = *MBB.getParent();
+  const AVRSubtarget &Subtarget = MF.getSubtarget<AVRSubtarget>();
+  const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
+  DebugLoc Loc = MBB.findDebugLoc(MI);
+
+  for (std::size_t Idx = 0, Count = CSI.size(); Idx != Count; ++Idx) {
+    const unsigned Reg = CSI[Idx].getReg();
+
+    assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 &&
+           "Invalid register size");
+
+    BuildMI(MBB, MI, Loc, InstrInfo.get(AVR::POPRd), Reg);
+  }
+
+  return true;
+}
+
+/// Rewrites the SP-relative pseudo stores (STDSPQRr / STDWSPQRr) found between
+/// \p MI and the next call instruction (or the end of \p MBB).
+///
+/// With \p insertPushes set every such store becomes one or two PUSH
+/// instructions; otherwise it is turned in place into a regular
+/// displacement store that uses Y as its base pointer.
+static void fixStackStores(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           const TargetInstrInfo &TII, bool insertPushes) {
+  const AVRSubtarget &Subtarget = MBB.getParent()->getSubtarget<AVRSubtarget>();
+  const TargetRegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
+
+  MachineBasicBlock::iterator Cursor = MI;
+  MachineBasicBlock::iterator BlockEnd = MBB.end();
+
+  while (Cursor != BlockEnd && !Cursor->isCall()) {
+    // Step the cursor first: the current instruction may get erased below.
+    MachineBasicBlock::iterator Here = Cursor;
+    ++Cursor;
+
+    MachineInstr &Store = *Here;
+    const unsigned StoreOpc = Store.getOpcode();
+    const bool IsWordStore = (StoreOpc == AVR::STDWSPQRr);
+
+    // Anything but an SP-based pseudo store is left untouched.
+    if (!IsWordStore && StoreOpc != AVR::STDSPQRr) {
+      continue;
+    }
+
+    assert(Store.getOperand(0).getReg() == AVR::SP &&
+           "Invalid register, should be SP!");
+
+    if (!insertPushes) {
+      // Turn the pseudo into a real store based on Y, which is guaranteed to
+      // contain a copy of SP.
+      const unsigned RealOpc = IsWordStore ? AVR::STDWPtrQRr : AVR::STDPtrQRr;
+
+      Store.setDesc(TII.get(RealOpc));
+      Store.getOperand(0).setReg(AVR::R29R28);
+      continue;
+    }
+
+    const unsigned ValueReg = Store.getOperand(2).getReg();
+    const unsigned KillFlag = getKillRegState(Store.getOperand(2).isKill());
+
+    if (IsWordStore) {
+      // PUSHWRr would expand to the two pushes in the opposite order of what
+      // is needed here, so emit the byte pushes directly: high, then low.
+      BuildMI(MBB, Here, Store.getDebugLoc(), TII.get(AVR::PUSHRr))
+          .addReg(RegInfo.getSubReg(ValueReg, AVR::sub_hi), KillFlag);
+      BuildMI(MBB, Here, Store.getDebugLoc(), TII.get(AVR::PUSHRr))
+          .addReg(RegInfo.getSubReg(ValueReg, AVR::sub_lo), KillFlag);
+    } else {
+      BuildMI(MBB, Here, Store.getDebugLoc(), TII.get(AVR::PUSHRr))
+          .addReg(ValueReg, KillFlag);
+    }
+
+    Store.eraseFromParent();
+  }
+}
+
+/// Removes the call frame setup/destroy pseudo \p MI and returns the iterator
+/// that follows it.
+///
+///  - Reserved call frame: the pseudo stores after \p MI become real stores.
+///  - Setup pseudo with a non-zero amount: the pseudo stores become pushes,
+///    which allocate the stack space themselves.
+///  - Destroy pseudo with a non-zero amount: SP is read into Z, incremented by
+///    the amount and written back.
+MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MI) const {
+  const AVRSubtarget &Subtarget = MF.getSubtarget<AVRSubtarget>();
+  const TargetFrameLowering &FrameLowering = *Subtarget.getFrameLowering();
+  const AVRInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
+
+  // The call frame memory was allocated at function entry: nothing has to be
+  // inserted, only the pseudo stores need to become real stores.
+  if (FrameLowering.hasReservedCallFrame(MF)) {
+    fixStackStores(MBB, MI, InstrInfo, false);
+    return MBB.erase(MI);
+  }
+
+  DebugLoc Loc = MI->getDebugLoc();
+  const unsigned int PseudoOpc = MI->getOpcode();
+  int Bytes = MI->getOperand(0).getImm();
+
+  if (Bytes == 0) {
+    return MBB.erase(MI);
+  }
+
+  assert(FrameLowering.getStackAlignment() == 1 &&
+         "Unsupported stack alignment");
+
+  // Frame setup: the pushes replacing the pseudo stores allocate the stack.
+  if (PseudoOpc == InstrInfo.getCallFrameSetupOpcode()) {
+    fixStackStores(MBB, MI, InstrInfo, true);
+    return MBB.erase(MI);
+  }
+
+  assert(PseudoOpc == InstrInfo.getCallFrameDestroyOpcode());
+
+  // Frame destroy: amounts that fit in 6 bits use ADIW, larger ones subtract
+  // the negated amount.
+  unsigned AdjustOpc = AVR::ADIWRdK;
+  if (!isUInt<6>(Bytes)) {
+    AdjustOpc = AVR::SUBIWRdK;
+    Bytes = -Bytes;
+  }
+
+  // Z <- SP; Z += amount; SP <- Z.
+  BuildMI(MBB, MI, Loc, InstrInfo.get(AVR::SPREAD), AVR::R31R30)
+      .addReg(AVR::SP);
+
+  MachineInstr *Adjust =
+      BuildMI(MBB, MI, Loc, InstrInfo.get(AdjustOpc), AVR::R31R30)
+          .addReg(AVR::R31R30, RegState::Kill)
+          .addImm(Bytes);
+  // Operand 3 (the implicit SREG def per the prologue/epilogue code) is dead.
+  Adjust->getOperand(3).setIsDead();
+
+  BuildMI(MBB, MI, Loc, InstrInfo.get(AVR::SPWRITE), AVR::SP)
+      .addReg(AVR::R31R30, RegState::Kill);
+
+  return MBB.erase(MI);
+}
+
+/// Runs the generic callee-save analysis and additionally forces Y (R29:R28
+/// and both of its halves) to be saved when it serves as the frame pointer.
+void AVRFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                            BitVector &SavedRegs,
+                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  if (!hasFP(MF)) {
+    return;
+  }
+
+  // Y is clobbered by the prologue, so it has to be spilled.
+  const unsigned FramePtrRegs[] = {AVR::R29R28, AVR::R29, AVR::R28};
+  for (unsigned Reg : FramePtrRegs) {
+    SavedRegs.set(Reg);
+  }
+}
+/// The frame analyzer pass.
+///
+/// Scans the function for fixed-size allocas and for used arguments that are
+/// passed through the stack, and records both facts in the
+/// AVRMachineFunctionInfo. The machine code itself is never modified.
+struct AVRFrameAnalyzer : public MachineFunctionPass {
+  static char ID;
+  AVRFrameAnalyzer() : MachineFunctionPass(ID) {}
+
+  /// Analyzes \p MF. Always returns false since nothing is changed.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    const MachineFrameInfo &MFI = MF.getFrameInfo();
+    AVRMachineFunctionInfo *FuncInfo = MF.getInfo<AVRMachineFunctionInfo>();
+
+    // If there are frame objects besides the fixed ones at this stage, it
+    // means there are allocas present in the function.
+    if (MFI.getNumObjects() != MFI.getNumFixedObjects()) {
+      // Check for the type of allocas present in the function. We only care
+      // about fixed size allocas so do not give false positives if only
+      // variable sized allocas are present.
+      for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+        // Variable sized objects have size 0.
+        if (MFI.getObjectSize(i)) {
+          FuncInfo->setHasAllocas(true);
+          break;
+        }
+      }
+    }
+
+    // Without fixed frame indexes there cannot be any stack arguments.
+    if (MFI.getNumFixedObjects() == 0) {
+      return false;
+    }
+
+    // Fixed frame indexes are present; scan the function to see if they are
+    // really being used, otherwise we can ignore them.
+    for (const MachineBasicBlock &BB : MF) {
+      for (const MachineInstr &MI : BB) {
+        int Opcode = MI.getOpcode();
+
+        // Only displacement loads and stores are inspected.
+        if ((Opcode != AVR::LDDRdPtrQ) && (Opcode != AVR::LDDWRdPtrQ) &&
+            (Opcode != AVR::STDPtrQRr) && (Opcode != AVR::STDWPtrQRr)) {
+          continue;
+        }
+
+        for (const MachineOperand &MO : MI.operands()) {
+          if (!MO.isFI()) {
+            continue;
+          }
+
+          // One access to a fixed object is enough; stop scanning.
+          if (MFI.isFixedObjectIndex(MO.getIndex())) {
+            FuncInfo->setHasStackArgs(true);
+            return false;
+          }
+        }
+      }
+    }
+
+    return false;
+  }
+
+  StringRef getPassName() const override { return "AVR Frame Analyzer"; }
+};
+
+// Identifier that the AVRFrameAnalyzer constructor hands to
+// MachineFunctionPass.
+char AVRFrameAnalyzer::ID = 0;
+
+/// Returns a newly allocated frame analyzer pass.
+FunctionPass *createAVRFrameAnalyzerPass() {
+  return new AVRFrameAnalyzer();
+}
+
+/// The dynalloca stack pointer save/restore pass.
+///
+/// Inserts a copy of SP before any dynamic stack memory is allocated and
+/// restores it on function exit to bring back the original SP state. This
+/// avoids the need of reserving a register pair for a frame pointer.
+struct AVRDynAllocaSR : public MachineFunctionPass {
+  static char ID;
+  AVRDynAllocaSR() : MachineFunctionPass(ID) {}
+
+  /// Returns true when the save/restore copies were inserted, false when the
+  /// function has no variable sized objects and was left untouched.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // Early exit when there are no variable sized objects in the function.
+    if (!MF.getFrameInfo().hasVarSizedObjects()) {
+      return false;
+    }
+
+    const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+    const TargetInstrInfo &TII = *STI.getInstrInfo();
+    MachineBasicBlock &EntryMBB = MF.front();
+    MachineBasicBlock::iterator MBBI = EntryMBB.begin();
+    DebugLoc DL = EntryMBB.findDebugLoc(MBBI);
+
+    unsigned SPCopy =
+        MF.getRegInfo().createVirtualRegister(&AVR::DREGSRegClass);
+
+    // Create a copy of SP in function entry before any dynallocas are
+    // inserted.
+    BuildMI(EntryMBB, MBBI, DL, TII.get(AVR::COPY), SPCopy).addReg(AVR::SP);
+
+    // Restore SP in all exit basic blocks.
+    for (MachineBasicBlock &MBB : MF) {
+      // If last instruction is a return instruction, add a restore copy.
+      if (!MBB.empty() && MBB.back().isReturn()) {
+        MBBI = MBB.getLastNonDebugInstr();
+        DL = MBBI->getDebugLoc();
+        BuildMI(MBB, MBBI, DL, TII.get(AVR::COPY), AVR::SP)
+            .addReg(SPCopy, RegState::Kill);
+      }
+    }
+
+    return true;
+  }
+
+  StringRef getPassName() const override {
+    return "AVR dynalloca stack pointer save/restore";
+  }
+};
+
+// Identifier that the AVRDynAllocaSR constructor hands to
+// MachineFunctionPass.
+char AVRDynAllocaSR::ID = 0;
+
+/// Returns a newly allocated dynalloca stack pointer save/restore pass.
+FunctionPass *createAVRDynAllocaSRPass() {
+  return new AVRDynAllocaSR();
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRFrameLowering.h b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.h
new file mode 100644
index 000000000000..850a43abebfa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.h
@@ -0,0 +1,46 @@
+//===-- AVRFrameLowering.h - Define frame lowering for AVR ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_FRAME_LOWERING_H
+#define LLVM_AVR_FRAME_LOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// Utilities for creating function call frames.
+///
+/// AVR implementation of the TargetFrameLowering interface; every member
+/// below except the constructor overrides a base class hook.
+class AVRFrameLowering : public TargetFrameLowering {
+public:
+ explicit AVRFrameLowering();
+
+public:
+ /// Emit the prologue / epilogue code of the function into \p MBB.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ /// Whether the function needs a dedicated frame pointer register.
+ bool hasFP(const MachineFunction &MF) const override;
+ /// Emit the saves / restores of the callee-saved registers listed in
+ /// \p CSI at \p MI.
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+ /// Whether the call frame memory is reserved in the function prologue.
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+ /// Compute the set of callee-saved registers that must be spilled.
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+ /// Replace the call frame pseudo instruction \p MI with real code and
+ /// return an iterator to the instruction that follows it.
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_FRAME_LOWERING_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
new file mode 100644
index 000000000000..156a21dfecfe
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -0,0 +1,565 @@
+//===-- AVRISelDAGToDAG.cpp - A dag to dag inst selector for AVR ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the AVR target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "avr-isel"
+
+namespace llvm {
+
+/// Lowers LLVM IR (in DAG form) to AVR MC instructions (in DAG form).
+class AVRDAGToDAGISel : public SelectionDAGISel {
+public:
+ // The subtarget is not known yet at construction time; it is filled in by
+ // runOnMachineFunction.
+ AVRDAGToDAGISel(AVRTargetMachine &TM, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(TM, OptLevel), Subtarget(nullptr) {}
+
+ StringRef getPassName() const override {
+ return "AVR DAG->DAG Instruction Selection";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ /// Matches the address \p N of memory node \p Op as a base plus
+ /// displacement pair; returns true on success.
+ bool SelectAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Disp);
+
+ /// Tries to select a post-increment / pre-decrement data memory load.
+ bool selectIndexedLoad(SDNode *N);
+ /// Returns the post-increment LPM opcode for \p LD, or 0 if none applies.
+ unsigned selectIndexedProgMemLoad(const LoadSDNode *LD, MVT VT);
+
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
+ std::vector<SDValue> &OutOps) override;
+
+// Include the pieces autogenerated from the target description.
+#include "AVRGenDAGISel.inc"
+
+private:
+ void Select(SDNode *N) override;
+ bool trySelect(SDNode *N);
+
+ // Per-opcode custom selection hooks; specialized per node type below.
+ template <unsigned NodeType> bool select(SDNode *N);
+ bool selectMultiplication(SDNode *N);
+
+ // Subtarget of the function currently being selected.
+ const AVRSubtarget *Subtarget;
+};
+
+/// Caches the subtarget of \p MF, then hands the function over to the generic
+/// SelectionDAG instruction selector.
+bool AVRDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+  Subtarget = &STI;
+
+  return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+/// Matches address \p N of memory node \p Op as base + displacement.
+///
+/// Accepted forms are a plain frame index, a frame index plus any constant,
+/// and a register plus a constant that fits in 6 unsigned bits (for i8 and
+/// i16 accesses). On success \p Base and \p Disp are set and true is returned.
+bool AVRDAGToDAGISel::SelectAddr(SDNode *Op, SDValue N, SDValue &Base,
+                                 SDValue &Disp) {
+  SDLoc Loc(Op);
+  const DataLayout &Layout = CurDAG->getDataLayout();
+  MVT PtrVT = getTargetLowering()->getPointerTy(Layout);
+
+  // A bare frame index becomes <TargetFrameIndex, 0>.
+  if (const FrameIndexSDNode *Slot = dyn_cast<FrameIndexSDNode>(N)) {
+    Base = CurDAG->getTargetFrameIndex(Slot->getIndex(), PtrVT);
+    Disp = CurDAG->getTargetConstant(0, Loc, MVT::i8);
+
+    return true;
+  }
+
+  // Everything else has to look like Reg + uimm6.
+  const unsigned AddrOpc = N.getOpcode();
+  const bool IsAddOrSub = (AddrOpc == ISD::ADD) || (AddrOpc == ISD::SUB);
+  if (!IsAddOrSub && !CurDAG->isBaseWithConstantOffset(N)) {
+    return false;
+  }
+
+  const ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!ConstOffset) {
+    return false;
+  }
+
+  // A subtraction contributes the negated constant.
+  int Offset = (int)ConstOffset->getZExtValue();
+  if (AddrOpc == ISD::SUB) {
+    Offset = -Offset;
+  }
+
+  // <#Frame index + const>
+  // Offsets larger than 63 are folded here as well, so that the frame pointer
+  // can be used directly instead of being adjusted and restored around each
+  // access.
+  SDValue BaseOperand = N.getOperand(0);
+  if (BaseOperand.getOpcode() == ISD::FrameIndex) {
+    const int SlotIndex = cast<FrameIndexSDNode>(BaseOperand)->getIndex();
+
+    Base = CurDAG->getTargetFrameIndex(SlotIndex, PtrVT);
+    Disp = CurDAG->getTargetConstant(Offset, Loc, MVT::i16);
+
+    return true;
+  }
+
+  // The type of the memory access decides whether the offset can be folded.
+  MVT MemVT = cast<MemSDNode>(Op)->getMemoryVT().getSimpleVT();
+  const bool IsByteOrWord = (MemVT == MVT::i8) || (MemVT == MVT::i16);
+
+  // Only offsets that fit in 6 bits (unsigned) are accepted.
+  if (!isUInt<6>(Offset) || !IsByteOrWord) {
+    return false;
+  }
+
+  Base = BaseOperand;
+  Disp = CurDAG->getTargetConstant(Offset, Loc, MVT::i8);
+
+  return true;
+}
+
+/// Selects a non-extending post-increment or pre-decrement load of an i8 or
+/// i16 value whose step matches the access size. Returns true when \p N was
+/// replaced by the matching LD/LDW machine node.
+bool AVRDAGToDAGISel::selectIndexedLoad(SDNode *N) {
+  const LoadSDNode *Load = cast<LoadSDNode>(N);
+  const ISD::MemIndexedMode Mode = Load->getAddressingMode();
+  MVT MemVT = Load->getMemoryVT().getSimpleVT();
+  auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
+
+  // Extending loads and all other addressing modes are not handled here.
+  if (Load->getExtensionType() != ISD::NON_EXTLOAD) {
+    return false;
+  }
+  if (Mode != ISD::POST_INC && Mode != ISD::PRE_DEC) {
+    return false;
+  }
+
+  const bool IsPreDec = (Mode == ISD::PRE_DEC);
+  const int Step = cast<ConstantSDNode>(Load->getOffset())->getSExtValue();
+  unsigned Opcode = 0;
+
+  if (MemVT == MVT::i8) {
+    // The pointer has to move by exactly one byte.
+    if (Step != (IsPreDec ? -1 : 1)) {
+      return false;
+    }
+    Opcode = IsPreDec ? AVR::LDRdPtrPd : AVR::LDRdPtrPi;
+  } else if (MemVT == MVT::i16) {
+    // The pointer has to move by exactly two bytes.
+    if (Step != (IsPreDec ? -2 : 2)) {
+      return false;
+    }
+    Opcode = IsPreDec ? AVR::LDWRdPtrPd : AVR::LDWRdPtrPi;
+  } else {
+    return false;
+  }
+
+  SDNode *Selected =
+      CurDAG->getMachineNode(Opcode, SDLoc(N), MemVT, PtrVT, MVT::Other,
+                             Load->getBasePtr(), Load->getChain());
+  ReplaceUses(N, Selected);
+  CurDAG->RemoveDeadNode(N);
+
+  return true;
+}
+
+/// Picks the post-increment LPM opcode for program memory load \p LD of type
+/// \p VT. Returns 0 when the load is extending, is not a post-increment load,
+/// or its step does not equal the access size.
+unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD,
+                                                   MVT VT) {
+  // Progmem indexed loads only work in POSTINC mode.
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD) {
+    return 0;
+  }
+  if (LD->getAddressingMode() != ISD::POST_INC) {
+    return 0;
+  }
+
+  const int Step = cast<ConstantSDNode>(LD->getOffset())->getSExtValue();
+
+  if (VT == MVT::i8) {
+    return (Step == 1) ? static_cast<unsigned>(AVR::LPMRdZPi) : 0u;
+  }
+
+  if (VT == MVT::i16) {
+    return (Step == 2) ? static_cast<unsigned>(AVR::LPMWRdZPi) : 0u;
+  }
+
+  return 0;
+}
+
+/// Selects the address operand \p Op of an inline asm memory constraint and
+/// appends the resulting operand(s) to \p OutOps.
+///
+/// Handled cases, in order:
+///  - a register that already is of the PTRDISPREGS class,
+///  - a frame index (base + displacement via SelectAddr),
+///  - `add/sub (CopyFromReg reg), imm` with imm < 64, emitted as base +
+///    displacement,
+///  - anything else, which is copied into a fresh PTRDISPREGS register.
+///
+/// Returns false on success and true when the operand could not be selected
+/// (only possible on the frame index path).
+bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                   unsigned ConstraintCode,
+                                                   std::vector<SDValue> &OutOps) {
+  assert((ConstraintCode == InlineAsm::Constraint_m ||
+          ConstraintCode == InlineAsm::Constraint_Q) &&
+         "Unexpected asm memory constraint");
+
+  MachineRegisterInfo &RI = MF->getRegInfo();
+  const AVRSubtarget &STI = MF->getSubtarget<AVRSubtarget>();
+  const TargetLowering &TL = *STI.getTargetLowering();
+  SDLoc dl(Op);
+  auto DL = CurDAG->getDataLayout();
+
+  const RegisterSDNode *RegNode = dyn_cast<RegisterSDNode>(Op);
+
+  // If address operand is of PTRDISPREGS class, all is OK, then.
+  if (RegNode &&
+      RI.getRegClass(RegNode->getReg()) == &AVR::PTRDISPREGSRegClass) {
+    OutOps.push_back(Op);
+    return false;
+  }
+
+  if (Op->getOpcode() == ISD::FrameIndex) {
+    SDValue Base, Disp;
+
+    if (SelectAddr(Op.getNode(), Op, Base, Disp)) {
+      OutOps.push_back(Base);
+      OutOps.push_back(Disp);
+
+      return false;
+    }
+
+    return true;
+  }
+
+  // If Op is add 'register, immediate' and
+  // register is either virtual register or register of PTRDISPREGSRegClass
+  if (Op->getOpcode() == ISD::ADD || Op->getOpcode() == ISD::SUB) {
+    SDValue CopyFromRegOp = Op->getOperand(0);
+    SDValue ImmOp = Op->getOperand(1);
+    ConstantSDNode *ImmNode = dyn_cast<ConstantSDNode>(ImmOp);
+
+    unsigned Reg = 0;
+
+    // The second operand must be a constant below 64. The null check has to
+    // short-circuit: ImmNode is null whenever the operand is not a constant.
+    bool CanHandleRegImmOpt =
+        ImmNode && ImmNode->getAPIntValue().getZExtValue() < 64;
+
+    if (CopyFromRegOp->getOpcode() == ISD::CopyFromReg) {
+      RegisterSDNode *RegNode =
+          cast<RegisterSDNode>(CopyFromRegOp->getOperand(1));
+      Reg = RegNode->getReg();
+      CanHandleRegImmOpt &= (TargetRegisterInfo::isVirtualRegister(Reg) ||
+                             AVR::PTRDISPREGSRegClass.contains(Reg));
+    } else {
+      CanHandleRegImmOpt = false;
+    }
+
+    // If we detect proper case - correct virtual register class
+    // if needed and go to another inlineasm operand.
+    if (CanHandleRegImmOpt) {
+      SDValue Base, Disp;
+
+      if (RI.getRegClass(Reg) != &AVR::PTRDISPREGSRegClass) {
+        SDLoc CopyDL(CopyFromRegOp);
+
+        unsigned VReg = RI.createVirtualRegister(&AVR::PTRDISPREGSRegClass);
+
+        SDValue CopyToReg =
+            CurDAG->getCopyToReg(CopyFromRegOp, CopyDL, VReg, CopyFromRegOp);
+
+        SDValue NewCopyFromRegOp = CurDAG->getCopyFromReg(
+            CopyToReg, CopyDL, VReg, TL.getPointerTy(DL));
+
+        Base = NewCopyFromRegOp;
+      } else {
+        Base = CopyFromRegOp;
+      }
+
+      // The displacement operand is always emitted as an i8 constant.
+      if (ImmNode->getValueType(0) != MVT::i8) {
+        Disp = CurDAG->getTargetConstant(
+            ImmNode->getAPIntValue().getZExtValue(), dl, MVT::i8);
+      } else {
+        Disp = ImmOp;
+      }
+
+      OutOps.push_back(Base);
+      OutOps.push_back(Disp);
+
+      return false;
+    }
+  }
+
+  // More generic case.
+  // Create chain that puts Op into pointer register
+  // and return that register.
+  unsigned VReg = RI.createVirtualRegister(&AVR::PTRDISPREGSRegClass);
+
+  SDValue CopyToReg = CurDAG->getCopyToReg(Op, dl, VReg, Op);
+  SDValue CopyFromReg =
+      CurDAG->getCopyFromReg(CopyToReg, dl, VReg, TL.getPointerTy(DL));
+
+  OutOps.push_back(CopyFromReg);
+
+  return false;
+}
+
+/// Turns a frame index node into an FRMIDX node, a temporary instruction that
+/// will hold the effective address of the final stack slot.
+template <> bool AVRDAGToDAGISel::select<ISD::FrameIndex>(SDNode *N) {
+  const DataLayout &Layout = CurDAG->getDataLayout();
+  const int SlotIndex = cast<FrameIndexSDNode>(N)->getIndex();
+
+  SDValue Slot = CurDAG->getTargetFrameIndex(
+      SlotIndex, getTargetLowering()->getPointerTy(Layout));
+  SDValue ZeroOffset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i16);
+
+  CurDAG->SelectNodeTo(N, AVR::FRMIDX,
+                       getTargetLowering()->getPointerTy(Layout), Slot,
+                       ZeroOffset);
+  return true;
+}
+
+/// Selects stores whose address is `SP + constant` to the STD{W}SPQRr pseudo
+/// instructions. These are the stores that pass call arguments through the
+/// stack; the pseudos are expanded further during the PEI phase. Returns
+/// false for every other store.
+template <> bool AVRDAGToDAGISel::select<ISD::STORE>(SDNode *N) {
+  const StoreSDNode *Store = cast<StoreSDNode>(N);
+  SDValue Addr = Store->getBasePtr();
+
+  // Frame index, constant and undef addresses are not handled here.
+  if (isa<FrameIndexSDNode>(Addr)) {
+    return false;
+  }
+  if (isa<ConstantSDNode>(Addr) || Addr.isUndef()) {
+    return false;
+  }
+
+  // The base of the address computation has to be the SP register itself.
+  const RegisterSDNode *BaseReg = dyn_cast<RegisterSDNode>(Addr.getOperand(0));
+  if (!(BaseReg && BaseReg->getReg() == AVR::SP)) {
+    return false;
+  }
+
+  const int Displacement =
+      (int)cast<ConstantSDNode>(Addr.getOperand(1))->getZExtValue();
+  SDValue InChain = Store->getChain();
+  const EVT ValueVT = Store->getValue().getValueType();
+  SDLoc Loc(N);
+  SDValue DispOp = CurDAG->getTargetConstant(Displacement, Loc, MVT::i16);
+  SDValue Operands[] = {Addr.getOperand(0), DispOp, Store->getValue(),
+                        InChain};
+
+  // 16-bit values use the word variant of the pseudo.
+  unsigned PseudoOpc = AVR::STDSPQRr;
+  if (ValueVT == MVT::i16) {
+    PseudoOpc = AVR::STDWSPQRr;
+  }
+
+  SDNode *Selected =
+      CurDAG->getMachineNode(PseudoOpc, Loc, MVT::Other, Operands);
+
+  // Carry the memory operand over to the new node.
+  MachineSDNode::mmo_iterator MemRefs = MF->allocateMemRefsArray(1);
+  MemRefs[0] = Store->getMemOperand();
+  cast<MachineSDNode>(Selected)->setMemRefs(MemRefs, MemRefs + 1);
+
+  ReplaceUses(SDValue(N, 0), SDValue(Selected, 0));
+  CurDAG->RemoveDeadNode(N);
+
+  return true;
+}
+
+/// Custom selection of loads.
+///
+/// Data memory loads are only offered to selectIndexedLoad. Program memory
+/// loads get their pointer copied into Z (R31:R30) and are selected to an LPM
+/// instruction, using the post-increment form when
+/// selectIndexedProgMemLoad finds one.
+template <> bool AVRDAGToDAGISel::select<ISD::LOAD>(SDNode *N) {
+ const LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (!AVR::isProgramMemoryAccess(LD)) {
+ // Check if the opcode can be converted into an indexed load.
+ return selectIndexedLoad(N);
+ }
+
+ assert(Subtarget->hasLPM() && "cannot load from program memory on this mcu");
+
+ // This is a flash memory load, move the pointer into R31R30 and emit
+ // the lpm instruction.
+ MVT VT = LD->getMemoryVT().getSimpleVT();
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ SDNode *ResNode;
+ SDLoc DL(N);
+
+ // The copy into Z and the copy back out are tied together through the
+ // second result of the CopyToReg node.
+ Chain = CurDAG->getCopyToReg(Chain, DL, AVR::R31R30, Ptr, SDValue());
+ Ptr = CurDAG->getCopyFromReg(Chain, DL, AVR::R31R30, MVT::i16,
+ Chain.getValue(1));
+
+ SDValue RegZ = CurDAG->getRegister(AVR::R31R30, MVT::i16);
+
+ // Check if the opcode can be converted into an indexed load.
+ if (unsigned LPMOpc = selectIndexedProgMemLoad(LD, VT)) {
+ // It is legal to fold the load into an indexed load.
+ // The node has three results: the value, an i16 and MVT::Other.
+ ResNode = CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other, Ptr,
+ RegZ);
+ ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+ } else {
+ // Selecting an indexed load is not legal, fallback to a normal load.
+ switch (VT.SimpleTy) {
+ case MVT::i8:
+ ResNode = CurDAG->getMachineNode(AVR::LPMRdZ, DL, MVT::i8, MVT::Other,
+ Ptr, RegZ);
+ break;
+ case MVT::i16:
+ ResNode = CurDAG->getMachineNode(AVR::LPMWRdZ, DL, MVT::i16,
+ MVT::Other, Ptr, RegZ);
+ ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+ break;
+ default:
+ llvm_unreachable("Unsupported VT!");
+ }
+ }
+
+ // Transfer memory operands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = LD->getMemOperand();
+ cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+
+ // Results 0 and 1 of the original node are redirected to the new node. For
+ // the paths that already replaced result 1 above, the second call repeats
+ // that replacement.
+ ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
+ ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+ CurDAG->RemoveDeadNode(N);
+
+ return true;
+}
+
+template <> bool AVRDAGToDAGISel::select<AVRISD::CALL>(SDNode *N) { // Selects non-direct calls as ICALL via R31R30.
+ SDValue InFlag; // Left default-constructed; passed as the glue input of the CopyToReg below.
+ SDValue Chain = N->getOperand(0);
+ SDValue Callee = N->getOperand(1);
+ unsigned LastOpNum = N->getNumOperands() - 1;
+
+ // Direct calls are autogenerated, so report them as not handled here.
+ unsigned Op = Callee.getOpcode();
+ if (Op == ISD::TargetGlobalAddress || Op == ISD::TargetExternalSymbol) {
+ return false;
+ }
+
+ // Skip the incoming flag (a trailing operand of type Glue) if present.
+ if (N->getOperand(LastOpNum).getValueType() == MVT::Glue) {
+ --LastOpNum;
+ }
+
+ SDLoc DL(N);
+ Chain = CurDAG->getCopyToReg(Chain, DL, AVR::R31R30, Callee, InFlag); // Move the callee address into R31R30.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(CurDAG->getRegister(AVR::R31R30, MVT::i16));
+
+ // Map operands 2..LastOpNum of the original node into the new node.
+ for (unsigned i = 2, e = LastOpNum + 1; i != e; ++i) {
+ Ops.push_back(N->getOperand(i));
+ }
+
+ Ops.push_back(Chain); // Result 0 of the CopyToReg.
+ Ops.push_back(Chain.getValue(1)); // Result 1 of the CopyToReg.
+
+ SDNode *ResNode =
+ CurDAG->getMachineNode(AVR::ICALL, DL, MVT::Other, MVT::Glue, Ops);
+
+ ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0)); // Redirect users of both results, then drop N.
+ ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+ CurDAG->RemoveDeadNode(N);
+
+ return true;
+}
+
+template <> bool AVRDAGToDAGISel::select<ISD::BRIND>(SDNode *N) { // Selects an indirect branch as IJMP; always returns true.
+ SDValue Chain = N->getOperand(0);
+ SDValue JmpAddr = N->getOperand(1);
+
+ SDLoc DL(N);
+ // Move the destination address of the indirect branch into R31R30.
+ Chain = CurDAG->getCopyToReg(Chain, DL, AVR::R31R30, JmpAddr);
+ SDNode *ResNode = CurDAG->getMachineNode(AVR::IJMP, DL, MVT::Other, Chain);
+
+ ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0)); // Redirect users of the old node's only replaced result.
+ CurDAG->RemoveDeadNode(N);
+
+ return true;
+}
+
+bool AVRDAGToDAGISel::selectMultiplication(llvm::SDNode *N) { // Selects i8 SMUL_LOHI/UMUL_LOHI as MULS/MUL plus copies from R0/R1.
+ SDLoc DL(N);
+ MVT Type = N->getSimpleValueType(0);
+
+ assert(Type == MVT::i8 && "unexpected value type");
+
+ bool isSigned = N->getOpcode() == ISD::SMUL_LOHI; // Any other opcode is treated as unsigned.
+ unsigned MachineOp = isSigned ? AVR::MULSRdRr : AVR::MULRdRr;
+
+ SDValue Lhs = N->getOperand(0);
+ SDValue Rhs = N->getOperand(1);
+ SDNode *Mul = CurDAG->getMachineNode(MachineOp, DL, MVT::Glue, Lhs, Rhs); // The only result of the multiply node is glue.
+ SDValue InChain = CurDAG->getEntryNode();
+ SDValue InGlue = SDValue(Mul, 0);
+
+ // Copy the low half of the result out of R0, if it is needed.
+ if (N->hasAnyUseOfValue(0)) {
+ SDValue CopyFromLo =
+ CurDAG->getCopyFromReg(InChain, DL, AVR::R0, Type, InGlue);
+
+ ReplaceUses(SDValue(N, 0), CopyFromLo);
+
+ InChain = CopyFromLo.getValue(1); // Thread chain and glue on to the next copy.
+ InGlue = CopyFromLo.getValue(2);
+ }
+
+ // Copy the high half of the result out of R1, if it is needed.
+ if (N->hasAnyUseOfValue(1)) {
+ SDValue CopyFromHi =
+ CurDAG->getCopyFromReg(InChain, DL, AVR::R1, Type, InGlue);
+
+ ReplaceUses(SDValue(N, 1), CopyFromHi);
+
+ InChain = CopyFromHi.getValue(1); // NOTE(review): InChain/InGlue are not read again after this point.
+ InGlue = CopyFromHi.getValue(2);
+ }
+
+ CurDAG->RemoveDeadNode(N);
+
+ // We need to clear R1. This is currently done (dirtily)
+ // using a custom inserter, not by this function.
+
+ return true;
+}
+
+void AVRDAGToDAGISel::Select(SDNode *N) { // Entry point: tries the custom selectors first, then the generated matcher.
+ // Dump information about the node being selected.
+ DEBUG(errs() << "Selecting: "; N->dump(CurDAG); errs() << "\n");
+
+ // A node that already carries a machine opcode needs no further selection.
+ if (N->isMachineOpcode()) {
+ DEBUG(errs() << "== "; N->dump(CurDAG); errs() << "\n");
+ N->setNodeId(-1);
+ return;
+ }
+
+ // See if one of the hand-written selectors in trySelect handles this node.
+ if (trySelect(N))
+ return;
+
+ // Otherwise select the default instruction via SelectCode.
+ SelectCode(N);
+}
+
+bool AVRDAGToDAGISel::trySelect(SDNode *N) { // Dispatches on the opcode; returns false when no custom selector handled N.
+ unsigned Opcode = N->getOpcode();
+ SDLoc DL(N); // NOTE(review): DL is not used in this function.
+
+ switch (Opcode) {
+ // Nodes we fully handle.
+ case ISD::FrameIndex: return select<ISD::FrameIndex>(N);
+ case ISD::BRIND: return select<ISD::BRIND>(N);
+ case ISD::UMUL_LOHI:
+ case ISD::SMUL_LOHI: return selectMultiplication(N);
+
+ // Nodes we handle partially. Other cases are autogenerated.
+ case ISD::STORE: return select<ISD::STORE>(N);
+ case ISD::LOAD: return select<ISD::LOAD>(N);
+ case AVRISD::CALL: return select<AVRISD::CALL>(N);
+ default: return false;
+ }
+}
+
+FunctionPass *createAVRISelDag(AVRTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) { // Factory for the AVR DAG-to-DAG instruction selector.
+ return new AVRDAGToDAGISel(TM, OptLevel); // Heap-allocated; returned as a raw pointer.
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
new file mode 100644
index 000000000000..53668f05b59b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -0,0 +1,1937 @@
+//===-- AVRISelLowering.cpp - AVR DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AVR uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRISelLowering.h"
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#include "AVR.h"
+#include "AVRMachineFunctionInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+namespace llvm {
+
+AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
+ : TargetLowering(tm) { // Configures register classes, operation actions and libcall names for AVR.
+ // Set up the register classes: i8 and i16 are the only types given one.
+ addRegisterClass(MVT::i8, &AVR::GPR8RegClass);
+ addRegisterClass(MVT::i16, &AVR::DREGSRegClass);
+
+ // Compute derived properties from the register classes.
+ computeRegisterProperties(tm.getSubtargetImpl()->getRegisterInfo());
+
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent);
+ setSchedulingPreference(Sched::RegPressure);
+ setStackPointerRegisterToSaveRestore(AVR::SP);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i16, Custom);
+
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
+
+ for (MVT VT : MVT::integer_valuetypes()) { // Extending loads: promote from i1, expand from i8.
+ for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
+ setLoadExtAction(N, VT, MVT::i1, Promote);
+ setLoadExtAction(N, VT, MVT::i8, Expand);
+ }
+ }
+
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ // sub (x, imm) gets canonicalized to add (x, -imm), so for illegal types
+ // revert into a sub since we don't have an add with immediate instruction.
+ setOperationAction(ISD::ADD, MVT::i32, Custom);
+ setOperationAction(ISD::ADD, MVT::i64, Custom);
+
+ // Our shift instructions are only able to shift 1 bit at a time, so handle
+ // this in a custom way.
+ setOperationAction(ISD::SRA, MVT::i8, Custom);
+ setOperationAction(ISD::SHL, MVT::i8, Custom);
+ setOperationAction(ISD::SRL, MVT::i8, Custom);
+ setOperationAction(ISD::SRA, MVT::i16, Custom);
+ setOperationAction(ISD::SHL, MVT::i16, Custom);
+ setOperationAction(ISD::SRL, MVT::i16, Custom);
+ setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
+
+ setOperationAction(ISD::BR_CC, MVT::i8, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i16, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+ setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ setOperationAction(ISD::SETCC, MVT::i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::i16, Expand);
+
+ setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+
+ // Add support for postincrement and predecrement load/stores.
+ setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
+ setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
+ setIndexedLoadAction(ISD::PRE_DEC, MVT::i8, Legal);
+ setIndexedLoadAction(ISD::PRE_DEC, MVT::i16, Legal);
+ setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal);
+ setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal);
+ setIndexedStoreAction(ISD::PRE_DEC, MVT::i8, Legal);
+ setIndexedStoreAction(ISD::PRE_DEC, MVT::i16, Legal);
+
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+
+ // Atomic operations which must be lowered to rtlib calls
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::ATOMIC_SWAP, VT, Expand);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
+ }
+
+ // Division/remainder: the separate operations are expanded for i8 and i16.
+ setOperationAction(ISD::UDIV, MVT::i8, Expand);
+ setOperationAction(ISD::UDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UREM, MVT::i8, Expand);
+ setOperationAction(ISD::UREM, MVT::i16, Expand);
+ setOperationAction(ISD::SDIV, MVT::i8, Expand);
+ setOperationAction(ISD::SDIV, MVT::i16, Expand);
+ setOperationAction(ISD::SREM, MVT::i8, Expand);
+ setOperationAction(ISD::SREM, MVT::i16, Expand);
+
+ // Make the combined division-and-modulus operations custom for all integers.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::UDIVREM, VT, Custom);
+ setOperationAction(ISD::SDIVREM, VT, Custom);
+ }
+
+ // Do not use MUL. The AVR instructions are closer to SMUL_LOHI &co.
+ setOperationAction(ISD::MUL, MVT::i8, Expand);
+ setOperationAction(ISD::MUL, MVT::i16, Expand);
+
+ // Expand 16 bit multiplications.
+ setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
+
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ }
+
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ }
+
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+ // TODO: The generated code is pretty poor. Investigate using the
+ // same "shift and subtract with carry" trick that we do for
+ // extending 8-bit to 16-bit. This may require infrastructure
+ // improvements in how we treat 16-bit "registers" to be feasible.
+ }
+
+ // Division rtlib functions (not supported): their libcall names are cleared.
+ setLibcallName(RTLIB::SDIV_I8, nullptr);
+ setLibcallName(RTLIB::SDIV_I16, nullptr);
+ setLibcallName(RTLIB::SDIV_I32, nullptr);
+ setLibcallName(RTLIB::SDIV_I64, nullptr);
+ setLibcallName(RTLIB::SDIV_I128, nullptr);
+ setLibcallName(RTLIB::UDIV_I8, nullptr);
+ setLibcallName(RTLIB::UDIV_I16, nullptr);
+ setLibcallName(RTLIB::UDIV_I32, nullptr);
+ setLibcallName(RTLIB::UDIV_I64, nullptr);
+ setLibcallName(RTLIB::UDIV_I128, nullptr);
+
+ // Modulus rtlib functions (not supported): their libcall names are cleared.
+ setLibcallName(RTLIB::SREM_I8, nullptr);
+ setLibcallName(RTLIB::SREM_I16, nullptr);
+ setLibcallName(RTLIB::SREM_I32, nullptr);
+ setLibcallName(RTLIB::SREM_I64, nullptr);
+ setLibcallName(RTLIB::SREM_I128, nullptr);
+ setLibcallName(RTLIB::UREM_I8, nullptr);
+ setLibcallName(RTLIB::UREM_I16, nullptr);
+ setLibcallName(RTLIB::UREM_I32, nullptr);
+ setLibcallName(RTLIB::UREM_I64, nullptr);
+ setLibcallName(RTLIB::UREM_I128, nullptr);
+
+ // Combined division and modulus rtlib functions.
+ setLibcallName(RTLIB::SDIVREM_I8, "__divmodqi4");
+ setLibcallName(RTLIB::SDIVREM_I16, "__divmodhi4");
+ setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+ setLibcallName(RTLIB::SDIVREM_I64, "__divmoddi4");
+ setLibcallName(RTLIB::SDIVREM_I128, "__divmodti4");
+ setLibcallName(RTLIB::UDIVREM_I8, "__udivmodqi4");
+ setLibcallName(RTLIB::UDIVREM_I16, "__udivmodhi4");
+ setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+ setLibcallName(RTLIB::UDIVREM_I64, "__udivmoddi4");
+ setLibcallName(RTLIB::UDIVREM_I128, "__udivmodti4");
+
+ // Several of the runtime library functions use a special calling conv;
+ setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::AVR_BUILTIN); // only the 8- and 16-bit divrem calls are set here.
+ setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::AVR_BUILTIN);
+ setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::AVR_BUILTIN);
+ setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::AVR_BUILTIN);
+
+ // Trigonometric rtlib functions
+ setLibcallName(RTLIB::SIN_F32, "sin");
+ setLibcallName(RTLIB::COS_F32, "cos");
+
+ setMinFunctionAlignment(1);
+ setMinimumJumpTableEntries(INT_MAX); // NOTE(review): presumably meant to keep jump tables from being formed - confirm.
+}
+
+const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const { // Maps an AVRISD opcode to its name string.
+#define NODE(name) \
+ case AVRISD::name: \
+ return #name
+
+ switch (Opcode) {
+ default: // Opcodes not listed below have no name here.
+ return nullptr;
+ NODE(RET_FLAG);
+ NODE(RETI_FLAG);
+ NODE(CALL);
+ NODE(WRAPPER);
+ NODE(LSL);
+ NODE(LSR);
+ NODE(ROL);
+ NODE(ROR);
+ NODE(ASR);
+ NODE(LSLLOOP);
+ NODE(LSRLOOP);
+ NODE(ASRLOOP);
+ NODE(BRCOND);
+ NODE(CMP);
+ NODE(CMPC);
+ NODE(TST);
+ NODE(SELECT_CC);
+#undef NODE
+ }
+}
+
+EVT AVRTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const { // The SETCC result type is i8 for every (non-vector) operand type.
+ assert(!VT.isVector() && "No AVR SetCC type for vectors!");
+ return MVT::i8;
+}
+
+SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { // Lowers shifts/rotates to AVRISD shift nodes.
+ // TODO: this function has to be completely rewritten to produce optimal
+ // code, for now it's producing very long but correct code.
+ unsigned Opc8;
+ const SDNode *N = Op.getNode();
+ EVT VT = Op.getValueType();
+ SDLoc dl(N);
+
+ // Expand non-constant shifts to the AVRISD *LOOP nodes.
+ if (!isa<ConstantSDNode>(N->getOperand(1))) {
+ switch (Op.getOpcode()) {
+ default: // NOTE(review): ROTL/ROTR with a non-constant amount also end up here.
+ llvm_unreachable("Invalid shift opcode!");
+ case ISD::SHL:
+ return DAG.getNode(AVRISD::LSLLOOP, dl, VT, N->getOperand(0),
+ N->getOperand(1));
+ case ISD::SRL:
+ return DAG.getNode(AVRISD::LSRLOOP, dl, VT, N->getOperand(0),
+ N->getOperand(1));
+ case ISD::SRA:
+ return DAG.getNode(AVRISD::ASRLOOP, dl, VT, N->getOperand(0),
+ N->getOperand(1));
+ }
+ }
+
+ uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); // Constant amount from here on.
+ SDValue Victim = N->getOperand(0);
+
+ switch (Op.getOpcode()) { // Pick the single-step AVRISD node for this opcode.
+ case ISD::SRA:
+ Opc8 = AVRISD::ASR;
+ break;
+ case ISD::ROTL:
+ Opc8 = AVRISD::ROL;
+ break;
+ case ISD::ROTR:
+ Opc8 = AVRISD::ROR;
+ break;
+ case ISD::SRL:
+ Opc8 = AVRISD::LSR;
+ break;
+ case ISD::SHL:
+ Opc8 = AVRISD::LSL;
+ break;
+ default:
+ llvm_unreachable("Invalid shift opcode");
+ }
+
+ while (ShiftAmount--) { // Chain one single-step node per unit of ShiftAmount.
+ Victim = DAG.getNode(Opc8, dl, VT, Victim);
+ }
+
+ return Victim;
+}
+
+SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { // Lowers SDIVREM/UDIVREM to a libcall.
+ unsigned Opcode = Op->getOpcode();
+ assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
+ "Invalid opcode for Div/Rem lowering");
+ bool isSigned = (Opcode == ISD::SDIVREM);
+ EVT VT = Op->getValueType(0);
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
+ RTLIB::Libcall LC; // Chosen from the result width and signedness below.
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected request for libcall!");
+ case MVT::i8:
+ LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
+ break;
+ case MVT::i16:
+ LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
+ break;
+ case MVT::i32:
+ LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
+ break;
+ case MVT::i64:
+ LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
+ break;
+ }
+
+ SDValue InChain = DAG.getEntryNode();
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (SDValue const &Value : Op->op_values()) { // Every operand of Op becomes one call argument.
+ Entry.Node = Value;
+ Entry.Ty = Value.getValueType().getTypeForEVT(*DAG.getContext());
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy(DAG.getDataLayout()));
+
+ Type *RetTy = (Type *)StructType::get(Ty, Ty, nullptr); // Struct of two Ty members as the call's return type.
+
+ SDLoc dl(Op);
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setInRegister()
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+ return CallInfo.first; // Only the first member of the pair is returned; the second is dropped.
+}
+
+SDValue AVRTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const { // Wraps a TargetGlobalAddress in an AVRISD::WRAPPER node.
+ auto DL = DAG.getDataLayout();
+
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+ // Create the TargetGlobalAddress node, folding in the constant offset.
+ SDValue Result =
+ DAG.getTargetGlobalAddress(GV, SDLoc(Op), getPointerTy(DL), Offset);
+ return DAG.getNode(AVRISD::WRAPPER, SDLoc(Op), getPointerTy(DL), Result);
+}
+
+SDValue AVRTargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const { // Wraps a TargetBlockAddress in an AVRISD::WRAPPER node.
+ auto DL = DAG.getDataLayout();
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+
+ SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(DL));
+
+ return DAG.getNode(AVRISD::WRAPPER, SDLoc(Op), getPointerTy(DL), Result);
+}
+
+/// Convert a DAG integer condition code to an AVR condition code; only the six codes listed below are accepted.
+static AVRCC::CondCodes intCCToAVRCC(ISD::CondCode CC) {
+ switch (CC) {
+ default: // Any other code is treated as a programming error.
+ llvm_unreachable("Unknown condition code!");
+ case ISD::SETEQ:
+ return AVRCC::COND_EQ;
+ case ISD::SETNE:
+ return AVRCC::COND_NE;
+ case ISD::SETGE:
+ return AVRCC::COND_GE;
+ case ISD::SETLT:
+ return AVRCC::COND_LT;
+ case ISD::SETUGE:
+ return AVRCC::COND_SH;
+ case ISD::SETULT:
+ return AVRCC::COND_LO;
+ }
+}
+
+/// Returns appropriate AVR CMP/CMPC (or TST) nodes for the given operands and
+/// stores the corresponding AVR condition code constant in \p AVRcc.
+SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ SDValue &AVRcc, SelectionDAG &DAG,
+ SDLoc DL) const {
+ SDValue Cmp;
+ EVT VT = LHS.getValueType();
+ bool UseTest = false; // Set when a TST on the top byte of LHS replaces the compare.
+
+ switch (CC) { // Rewrite CC into one of the codes intCCToAVRCC accepts.
+ default:
+ break;
+ case ISD::SETLE: {
+ // Swap operands and use the mirrored condition: a <= b becomes b >= a.
+ std::swap(LHS, RHS);
+ CC = ISD::SETGE;
+ break;
+ }
+ case ISD::SETGT: {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+ switch (C->getSExtValue()) {
+ case -1: {
+ // When doing lhs > -1 use a tst instruction on the top part of lhs
+ // and use brpl instead of using a chain of cp/cpc.
+ UseTest = true;
+ AVRcc = DAG.getConstant(AVRCC::COND_PL, DL, MVT::i8);
+ break;
+ }
+ case 0: {
+ // Turn lhs > 0 into 0 < lhs since 0 can be materialized with
+ // __zero_reg__ in lhs.
+ RHS = LHS;
+ LHS = DAG.getConstant(0, DL, VT);
+ CC = ISD::SETLT;
+ break;
+ }
+ default: {
+ // Turn lhs > rhs with rhs constant into lhs >= rhs+1, this allows
+ // us to fold the constant into the cmp instruction.
+ RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); // NOTE(review): rhs+1 wraps if rhs is the maximum of VT - confirm.
+ CC = ISD::SETGE;
+ break;
+ }
+ }
+ break;
+ }
+ // Non-constant rhs: swap operands and use the mirrored condition.
+ std::swap(LHS, RHS);
+ CC = ISD::SETLT;
+ break;
+ }
+ case ISD::SETLT: {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+ switch (C->getSExtValue()) { // Other constants keep the plain SETLT compare.
+ case 1: {
+ // Turn lhs < 1 into 0 >= lhs since 0 can be materialized with
+ // __zero_reg__ in lhs.
+ RHS = LHS;
+ LHS = DAG.getConstant(0, DL, VT);
+ CC = ISD::SETGE;
+ break;
+ }
+ case 0: {
+ // When doing lhs < 0 use a tst instruction on the top part of lhs
+ // and use brmi instead of using a chain of cp/cpc.
+ UseTest = true;
+ AVRcc = DAG.getConstant(AVRCC::COND_MI, DL, MVT::i8);
+ break;
+ }
+ }
+ }
+ break;
+ }
+ case ISD::SETULE: {
+ // Swap operands and use the mirrored condition: a <= b becomes b >= a.
+ std::swap(LHS, RHS);
+ CC = ISD::SETUGE;
+ break;
+ }
+ case ISD::SETUGT: {
+ // Turn lhs > rhs with rhs constant into lhs >= rhs+1, this allows us to
+ // fold the constant into the cmp instruction.
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+ RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); // NOTE(review): rhs+1 wraps if rhs is all-ones - confirm.
+ CC = ISD::SETUGE;
+ break;
+ }
+ // Non-constant rhs: swap operands and use the mirrored condition.
+ std::swap(LHS, RHS);
+ CC = ISD::SETULT;
+ break;
+ }
+ }
+
+ // Expand 32 and 64 bit comparisons with custom CMP and CMPC nodes instead of
+ // using the default and/or/xor expansion code which is much longer.
+ if (VT == MVT::i32) {
+ SDValue LHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue LHShi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS,
+ DAG.getIntPtrConstant(1, DL));
+ SDValue RHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue RHShi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS,
+ DAG.getIntPtrConstant(1, DL));
+
+ if (UseTest) {
+ // When using tst we only care about the highest part.
+ SDValue Top = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHShi,
+ DAG.getIntPtrConstant(1, DL));
+ Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
+ } else {
+ Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHSlo, RHSlo); // Low halves first, then CMPC on the high halves.
+ Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHShi, RHShi, Cmp);
+ }
+ } else if (VT == MVT::i64) {
+ SDValue LHS_0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue LHS_1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS,
+ DAG.getIntPtrConstant(1, DL));
+
+ SDValue LHS0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_0,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue LHS1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_0,
+ DAG.getIntPtrConstant(1, DL));
+ SDValue LHS2 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue LHS3 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_1,
+ DAG.getIntPtrConstant(1, DL));
+
+ SDValue RHS_0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue RHS_1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS,
+ DAG.getIntPtrConstant(1, DL));
+
+ SDValue RHS0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_0,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue RHS1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_0,
+ DAG.getIntPtrConstant(1, DL));
+ SDValue RHS2 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue RHS3 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_1,
+ DAG.getIntPtrConstant(1, DL));
+
+ if (UseTest) {
+ // When using tst we only care about the highest part.
+ SDValue Top = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS3,
+ DAG.getIntPtrConstant(1, DL));
+ Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
+ } else {
+ Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS0, RHS0); // Lowest 16 bits first, then CMPC upwards.
+ Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS1, RHS1, Cmp);
+ Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS2, RHS2, Cmp);
+ Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS3, RHS3, Cmp);
+ }
+ } else if (VT == MVT::i8 || VT == MVT::i16) {
+ if (UseTest) {
+ // When using tst we only care about the highest part.
+ Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue,
+ (VT == MVT::i8)
+ ? LHS
+ : DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8,
+ LHS, DAG.getIntPtrConstant(1, DL)));
+ } else {
+ Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS, RHS);
+ }
+ } else {
+ llvm_unreachable("Invalid comparison size");
+ }
+
+ // When using a test instruction AVRcc is already set.
+ if (!UseTest) {
+ AVRcc = DAG.getConstant(intCCToAVRCC(CC), DL, MVT::i8);
+ }
+
+ return Cmp;
+}
+
+SDValue AVRTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Lowers BR_CC to AVRISD::BRCOND fed by getAVRCmp.
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ SDLoc dl(Op);
+
+ SDValue TargetCC; // Filled in by getAVRCmp.
+ SDValue Cmp = getAVRCmp(LHS, RHS, CC, TargetCC, DAG, dl);
+
+ return DAG.getNode(AVRISD::BRCOND, dl, MVT::Other, Chain, Dest, TargetCC,
+ Cmp);
+}
+
+SDValue AVRTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Lowers SELECT_CC to AVRISD::SELECT_CC.
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue TrueV = Op.getOperand(2);
+ SDValue FalseV = Op.getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDLoc dl(Op);
+
+ SDValue TargetCC; // Filled in by getAVRCmp.
+ SDValue Cmp = getAVRCmp(LHS, RHS, CC, TargetCC, DAG, dl);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); // The new node yields the value plus a glue result.
+ SDValue Ops[] = {TrueV, FalseV, TargetCC, Cmp};
+
+ return DAG.getNode(AVRISD::SELECT_CC, dl, VTs, Ops);
+}
+
+SDValue AVRTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lowers SETCC to an AVRISD::SELECT_CC of 1 and 0.
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDLoc DL(Op);
+
+ SDValue TargetCC; // Filled in by getAVRCmp.
+ SDValue Cmp = getAVRCmp(LHS, RHS, CC, TargetCC, DAG, DL);
+
+ SDValue TrueV = DAG.getConstant(1, DL, Op.getValueType()); // Value selected when the condition holds.
+ SDValue FalseV = DAG.getConstant(0, DL, Op.getValueType()); // Value selected otherwise.
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ SDValue Ops[] = {TrueV, FalseV, TargetCC, Cmp};
+
+ return DAG.getNode(AVRISD::SELECT_CC, DL, VTs, Ops);
+}
+
+SDValue AVRTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { // Lowers VASTART to a single store.
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ auto DL = DAG.getDataLayout();
+ SDLoc dl(Op);
+
+ // Vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument (operand 1 of Op).
+ SDValue FI = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy(DL));
+
+ return DAG.getStore(Op.getOperand(0), dl, FI, Op.getOperand(1),
+ MachinePointerInfo(SV), 0);
+}
+
+SDValue AVRTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { // Dispatches custom lowering by opcode.
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Don't know how to custom lower this!");
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ return LowerShifts(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
+ case ISD::BR_CC:
+ return LowerBR_CC(Op, DAG);
+ case ISD::SELECT_CC:
+ return LowerSELECT_CC(Op, DAG);
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ return LowerDivRem(Op, DAG);
+ }
+
+ return SDValue(); // Not reached: every switch path above returns or is unreachable.
+}
+
+/// Replace a node with an illegal result type
+/// with a new node built out of custom code; replacements go into Results.
+void AVRTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDLoc DL(N);
+
+ switch (N->getOpcode()) {
+ case ISD::ADD: {
+ // Convert add (x, imm) into sub (x, -imm).
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ SDValue Sub = DAG.getNode(
+ ISD::SUB, DL, N->getValueType(0), N->getOperand(0),
+ DAG.getConstant(-C->getAPIntValue(), DL, C->getValueType(0)));
+ Results.push_back(Sub);
+ }
+ break; // With a non-constant operand 1 nothing is added to Results.
+ }
+ default: {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG); // All other opcodes go through LowerOperation.
+
+ for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I) // Forward every result value of the lowered node.
+ Results.push_back(Res.getValue(I));
+
+ break;
+ }
+ }
+}
+
+/// Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool AVRTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
+ int64_t Offs = AM.BaseOffs;
+
+ // Allow absolute addresses: a global with no base register, scale or offset.
+ if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && Offs == 0) {
+ return true;
+ }
+
+ // Flash memory instructions only allow zero offsets.
+ if (isa<PointerType>(Ty) && AS == AVR::ProgramMemory) { // Rejected outright when Ty is a pointer type in program memory.
+ return false;
+ }
+
+ // Allow reg+<6bit> offset; the magnitude is tested, so negatives pass too.
+ if (Offs < 0)
+ Offs = -Offs;
+ if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 0 && isUInt<6>(Offs)) {
+ return true;
+ }
+
+ return false;
+}
+
+/// Returns true, and sets Base, Offset and AM, if the address of load/store N
+/// is an add/sub of a constant that steps back by exactly the access size
+/// (-1 for i8, -2 for i16); the mode reported is always ISD::PRE_DEC.
+bool AVRTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ EVT VT;
+ const SDNode *Op; // Node computing the address of the access.
+ SDLoc DL(N);
+
+ if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ VT = LD->getMemoryVT();
+ Op = LD->getBasePtr().getNode();
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD) // Extending loads are rejected.
+ return false;
+ if (AVR::isProgramMemoryAccess(LD)) { // Program memory loads are rejected.
+ return false;
+ }
+ } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ Op = ST->getBasePtr().getNode();
+ if (AVR::isProgramMemoryAccess(ST)) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+
+ if (VT != MVT::i8 && VT != MVT::i16) {
+ return false;
+ }
+
+ if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) {
+ return false;
+ }
+
+ if (const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+ int RHSC = RHS->getSExtValue();
+ if (Op->getOpcode() == ISD::SUB) // Normalise sub (x, c) to an offset of -c.
+ RHSC = -RHSC;
+
+ if ((VT == MVT::i16 && RHSC != -2) || (VT == MVT::i8 && RHSC != -1)) {
+ return false;
+ }
+
+ Base = Op->getOperand(0);
+ Offset = DAG.getConstant(RHSC, DL, MVT::i8);
+ AM = ISD::PRE_DEC;
+
+ return true;
+ }
+
+ return false;
+}
+
+/// Returns true, and sets Base, Offset and AM, if Op is an add/sub of a
+/// constant that advances by exactly the access size of load/store N
+/// (+1 for i8, +2 for i16); the mode reported is always ISD::POST_INC.
+bool AVRTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+ SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ EVT VT;
+ SDLoc DL(N);
+
+ if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { // Loads are not checked for program memory here.
+ VT = LD->getMemoryVT();
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD) // Extending loads are rejected.
+ return false;
+ } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ if (AVR::isProgramMemoryAccess(ST)) { // Program memory stores are rejected.
+ return false;
+ }
+ } else {
+ return false;
+ }
+
+ if (VT != MVT::i8 && VT != MVT::i16) {
+ return false;
+ }
+
+ if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) {
+ return false;
+ }
+
+ if (const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+ int RHSC = RHS->getSExtValue();
+ if (Op->getOpcode() == ISD::SUB) // Normalise sub (x, c) to an offset of -c.
+ RHSC = -RHSC;
+ if ((VT == MVT::i16 && RHSC != 2) || (VT == MVT::i8 && RHSC != 1)) {
+ return false;
+ }
+
+ Base = Op->getOperand(0);
+ Offset = DAG.getConstant(RHSC, DL, MVT::i8);
+ AM = ISD::POST_INC;
+
+ return true;
+ }
+
+ return false;
+}
+
+bool AVRTargetLowering::isOffsetFoldingLegal(
+ const GlobalAddressSDNode *GA) const { // GA is not inspected.
+ return true; // Folding is reported as legal unconditionally.
+}
+
+//===----------------------------------------------------------------------===//
+// Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "AVRGenCallingConv.inc"
+
+ /// Appends to \p Out one entry per formal argument of \p F: the number of
+ /// two-byte pieces needed to hold the argument, computed from its size in
+ /// bits rounded up to whole bytes and then to whole byte pairs.
+ static void parseFunctionArgs(const Function *F, const DataLayout *TD,
+                               SmallVectorImpl<unsigned> &Out) {
+   for (const Argument &Param : F->args()) {
+     const uint64_t SizeInBits = TD->getTypeSizeInBits(Param.getType());
+     const unsigned SizeInBytes = (SizeInBits + 7) / 8;
+     const unsigned Pieces = (SizeInBytes + 1) / 2;
+     Out.push_back(Pieces);
+   }
+ }
+
+ /// Without a function prototype (e.g. for external symbols) the original
+ /// arguments are recovered from the flattened part list \p In: consecutive
+ /// parts whose PartOffset equals the running sum of the preceding parts'
+ /// store sizes belong to one argument. One part count per argument is
+ /// appended to \p Out.
+ static void parseExternFuncCallArgs(const SmallVectorImpl<ISD::OutputArg> &In,
+                                     SmallVectorImpl<unsigned> &Out) {
+   const unsigned NumParts = In.size();
+   unsigned Idx = 0;
+
+   while (Idx != NumParts) {
+     unsigned Pieces = 0;
+     // Consume parts for as long as they continue the current argument.
+     for (unsigned Expected = 0;
+          Idx != NumParts && In[Idx].PartOffset == Expected; ++Idx, ++Pieces)
+       Expected += In[Idx].VT.getStoreSize();
+     Out.push_back(Pieces);
+   }
+ }
+
+ /// Returns the name of the callee recorded in \p CLI. The callee has to be an
+ /// external-symbol or a global-address node; any other node kind is treated
+ /// as unreachable.
+ static StringRef getFunctionName(TargetLowering::CallLoweringInfo &CLI) {
+   const SDValue &Target = CLI.Callee;
+
+   if (const auto *Sym = dyn_cast<ExternalSymbolSDNode>(Target))
+     return Sym->getSymbol();
+
+   if (const auto *Global = dyn_cast<GlobalAddressSDNode>(Target))
+     return Global->getGlobal()->getName();
+
+   llvm_unreachable("don't know how to get the name for this callee");
+ }
+
+ /// Analyze incoming and outgoing function arguments. We need custom C++ code
+ /// to handle special constraints in the ABI like reversing the order of the
+ /// pieces of split arguments. In addition, all pieces of a certain argument
+ /// have to be passed either using registers or the stack but never mixing both.
+ ///
+ /// \p Outs is read when \p IsCall is true; \p Ins and \p F are read otherwise.
+ /// Locations are recorded through \p CCInfo, and \p ArgLocs is reordered in
+ /// place, so it is expected to be the vector \p CCInfo appends to.
+ /// \p CLI and \p CallConv are not used by this function.
+ static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
+                                      const Function *F, const DataLayout *TD,
+                                      const SmallVectorImpl<ISD::OutputArg> *Outs,
+                                      const SmallVectorImpl<ISD::InputArg> *Ins,
+                                      CallingConv::ID CallConv,
+                                      SmallVectorImpl<CCValAssign> &ArgLocs,
+                                      CCState &CCInfo, bool IsCall, bool IsVarArg) {
+   static const MCPhysReg RegList8[] = {AVR::R24, AVR::R22, AVR::R20,
+                                        AVR::R18, AVR::R16, AVR::R14,
+                                        AVR::R12, AVR::R10, AVR::R8};
+   static const MCPhysReg RegList16[] = {AVR::R25R24, AVR::R23R22, AVR::R21R20,
+                                         AVR::R19R18, AVR::R17R16, AVR::R15R14,
+                                         AVR::R13R12, AVR::R11R10, AVR::R9R8};
+   // Either table is wrapped in an ArrayRef using one shared length below, so
+   // the two tables must stay the same size.
+   static_assert(sizeof(RegList8) == sizeof(RegList16),
+                 "8-bit and 16-bit argument register lists differ in length");
+   const unsigned NumArgRegs = array_lengthof(RegList8);
+
+   if (IsVarArg) {
+     // Variadic functions do not need all the analysis below.
+     if (IsCall) {
+       CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_Vararg);
+     } else {
+       CCInfo.AnalyzeFormalArguments(*Ins, ArgCC_AVR_Vararg);
+     }
+     return;
+   }
+
+   // Fill in the Args array which will contain original argument sizes
+   // (number of pieces per argument).
+   SmallVector<unsigned, 8> Args;
+   if (IsCall) {
+     parseExternFuncCallArgs(*Outs, Args);
+   } else {
+     assert(F != nullptr && "function should not be null");
+     parseFunctionArgs(F, TD, Args);
+   }
+
+   unsigned RegsLeft = NumArgRegs, ValNo = 0;
+   // Becomes true as soon as one argument does not fit into the remaining
+   // registers; from then on every following argument goes to the stack too.
+   bool UsesStack = false;
+   for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
+     unsigned Size = Args[i];
+     MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;
+
+     // If we have plenty of regs to pass the whole argument do it.
+     if (!UsesStack && (Size <= RegsLeft)) {
+       const MCPhysReg *RegList = (LocVT == MVT::i16) ? RegList16 : RegList8;
+
+       for (unsigned j = 0; j != Size; ++j) {
+         unsigned Reg =
+             CCInfo.AllocateReg(ArrayRef<MCPhysReg>(RegList, NumArgRegs));
+         CCInfo.addLoc(
+             CCValAssign::getReg(ValNo++, LocVT, Reg, LocVT, CCValAssign::Full));
+         --RegsLeft;
+       }
+
+       // Reverse the order of the pieces to agree with the "big endian" format
+       // required in the calling convention ABI.
+       std::reverse(ArgLocs.begin() + pos, ArgLocs.begin() + pos + Size);
+     } else {
+       // Pass the rest of arguments using the stack.
+       UsesStack = true;
+       for (unsigned j = 0; j != Size; ++j) {
+         unsigned Offset = CCInfo.AllocateStack(
+             TD->getTypeAllocSize(EVT(LocVT).getTypeForEVT(CCInfo.getContext())),
+             TD->getABITypeAlignment(
+                 EVT(LocVT).getTypeForEVT(CCInfo.getContext())));
+         CCInfo.addLoc(CCValAssign::getMem(ValNo++, LocVT, Offset, LocVT,
+                                           CCValAssign::Full));
+       }
+     }
+     pos += Size;
+   }
+ }
+
+ /// Chooses the argument assignment for a builtin call from the callee name:
+ /// names starting with "__udivmod" or "__divmod" are analysed with
+ /// ArgCC_AVR_BUILTIN_DIV, every other callee goes through
+ /// analyzeStandardArguments().
+ static void analyzeBuiltinArguments(TargetLowering::CallLoweringInfo &CLI,
+                                     const Function *F, const DataLayout *TD,
+                                     const SmallVectorImpl<ISD::OutputArg> *Outs,
+                                     const SmallVectorImpl<ISD::InputArg> *Ins,
+                                     CallingConv::ID CallConv,
+                                     SmallVectorImpl<CCValAssign> &ArgLocs,
+                                     CCState &CCInfo, bool IsCall, bool IsVarArg) {
+   const StringRef Name = getFunctionName(CLI);
+   const bool IsDivMod =
+       Name.startswith("__udivmod") || Name.startswith("__divmod");
+
+   if (!IsDivMod) {
+     analyzeStandardArguments(&CLI, F, TD, Outs, Ins, CallConv, ArgLocs, CCInfo,
+                              IsCall, IsVarArg);
+     return;
+   }
+
+   CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_BUILTIN_DIV);
+ }
+
+ /// Dispatches argument analysis on the calling convention: AVR_BUILTIN is
+ /// routed to analyzeBuiltinArguments() (which dereferences \p CLI), every
+ /// other convention to analyzeStandardArguments().
+ static void analyzeArguments(TargetLowering::CallLoweringInfo *CLI,
+                              const Function *F, const DataLayout *TD,
+                              const SmallVectorImpl<ISD::OutputArg> *Outs,
+                              const SmallVectorImpl<ISD::InputArg> *Ins,
+                              CallingConv::ID CallConv,
+                              SmallVectorImpl<CCValAssign> &ArgLocs,
+                              CCState &CCInfo, bool IsCall, bool IsVarArg) {
+   if (CallConv == CallingConv::AVR_BUILTIN) {
+     analyzeBuiltinArguments(*CLI, F, TD, Outs, Ins, CallConv, ArgLocs, CCInfo,
+                             IsCall, IsVarArg);
+     return;
+   }
+
+   analyzeStandardArguments(CLI, F, TD, Outs, Ins, CallConv, ArgLocs, CCInfo,
+                            IsCall, IsVarArg);
+ }
+
+ /// Lowers the incoming (formal) arguments of the function being compiled.
+ ///
+ /// Each location produced by analyzeArguments() becomes either a copy from a
+ /// live-in physical register or a load from a fixed stack object; the
+ /// resulting value is appended to \p InVals. For variadic functions a fixed
+ /// object at the first free stack offset is additionally recorded in
+ /// AVRMachineFunctionInfo as the varargs frame index.
+ ///
+ /// \returns the incoming \p Chain; it is not modified by this function.
+ SDValue AVRTargetLowering::LowerFormalArguments(
+     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
+     SmallVectorImpl<SDValue> &InVals) const {
+   MachineFunction &MF = DAG.getMachineFunction();
+   MachineFrameInfo &MFI = MF.getFrameInfo();
+   // Bind by reference; a plain `auto` would copy the whole DataLayout.
+   const DataLayout &DL = DAG.getDataLayout();
+
+   // Assign locations to all of the incoming arguments.
+   SmallVector<CCValAssign, 16> ArgLocs;
+   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+   // There is no call-lowering info and there are no outgoing arguments when
+   // analysing formal parameters.
+   analyzeArguments(nullptr, MF.getFunction(), &DL, nullptr, &Ins, CallConv,
+                    ArgLocs, CCInfo, false, isVarArg);
+
+   for (CCValAssign &VA : ArgLocs) {
+
+     // Arguments stored on registers.
+     if (VA.isRegLoc()) {
+       EVT RegVT = VA.getLocVT();
+       const TargetRegisterClass *RC;
+       if (RegVT == MVT::i8) {
+         RC = &AVR::GPR8RegClass;
+       } else if (RegVT == MVT::i16) {
+         RC = &AVR::DREGSRegClass;
+       } else {
+         llvm_unreachable("Unknown argument type!");
+       }
+
+       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+       SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+       // :NOTE: Clang should not promote any i8 into i16 but for safety the
+       // following code will handle zexts or sexts generated by other
+       // front ends. Otherwise:
+       // If this is an 8 bit value, it is really passed promoted
+       // to 16 bits. Insert an assert[sz]ext to capture this, then
+       // truncate to the right size.
+       switch (VA.getLocInfo()) {
+       default:
+         llvm_unreachable("Unknown loc info!");
+       case CCValAssign::Full:
+         break;
+       case CCValAssign::BCvt:
+         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+         break;
+       case CCValAssign::SExt:
+         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                                DAG.getValueType(VA.getValVT()));
+         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+         break;
+       case CCValAssign::ZExt:
+         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                                DAG.getValueType(VA.getValVT()));
+         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+         break;
+       }
+
+       InVals.push_back(ArgValue);
+     } else {
+       // Sanity check.
+       assert(VA.isMemLoc());
+
+       EVT LocVT = VA.getLocVT();
+
+       // Create the frame index object for this incoming parameter.
+       int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
+                                      VA.getLocMemOffset(), true);
+
+       // Create the SelectionDAG nodes corresponding to a load
+       // from this parameter.
+       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DL));
+       InVals.push_back(DAG.getLoad(LocVT, dl, Chain, FIN,
+                                    MachinePointerInfo::getFixedStack(MF, FI),
+                                    0));
+     }
+   }
+
+   // If the function takes variable number of arguments, make a frame index for
+   // the start of the first vararg value... for expansion of llvm.va_start.
+   if (isVarArg) {
+     unsigned StackSize = CCInfo.getNextStackOffset();
+     AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+
+     AFI->setVarArgsFrameIndex(MFI.CreateFixedObject(2, StackSize, true));
+   }
+
+   return Chain;
+ }
+
+//===----------------------------------------------------------------------===//
+// Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+ /// Lowers the outgoing call described by \p CLI.
+ ///
+ /// The sequence built is: CALLSEQ_START, chained stores for stack arguments
+ /// (emitted in reverse order), glued copies of register arguments, the
+ /// AVRISD::CALL node with a call-preserved register mask, CALLSEQ_END, and
+ /// finally the result copies produced by LowerCallResult(), which appends
+ /// the returned values to \p InVals. Tail calls are always disabled.
+ ///
+ /// \returns the chain produced by LowerCallResult().
+ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+   SelectionDAG &DAG = CLI.DAG;
+   SDLoc &DL = CLI.DL;
+   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+   SDValue Chain = CLI.Chain;
+   SDValue Callee = CLI.Callee;
+   bool &isTailCall = CLI.IsTailCall;
+   CallingConv::ID CallConv = CLI.CallConv;
+   bool isVarArg = CLI.IsVarArg;
+
+   MachineFunction &MF = DAG.getMachineFunction();
+
+   // AVR does not yet support tail call optimization.
+   isTailCall = false;
+
+   // Analyze operands of the call, assigning locations to each operand.
+   SmallVector<CCValAssign, 16> ArgLocs;
+   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+   // node so that legalize doesn't hack it.
+   const Function *F = nullptr;
+   if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+     const GlobalValue *GV = G->getGlobal();
+
+     // The global is not guaranteed to be a Function (it may be an alias, for
+     // instance), so use a checked cast instead of an asserting one. F may
+     // stay null: the call path of the argument analysis does not read it.
+     F = dyn_cast<Function>(GV);
+     Callee =
+         DAG.getTargetGlobalAddress(GV, DL, getPointerTy(DAG.getDataLayout()));
+   } else if (const ExternalSymbolSDNode *ES =
+                  dyn_cast<ExternalSymbolSDNode>(Callee)) {
+     Callee = DAG.getTargetExternalSymbol(ES->getSymbol(),
+                                          getPointerTy(DAG.getDataLayout()));
+   }
+
+   // There are no incoming arguments when analysing a call site.
+   analyzeArguments(&CLI, F, &DAG.getDataLayout(), &Outs, nullptr, CallConv,
+                    ArgLocs, CCInfo, true, isVarArg);
+
+   // Get a count of how many bytes are to be pushed on the stack.
+   unsigned NumBytes = CCInfo.getNextStackOffset();
+
+   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+                                DL);
+
+   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+   // First, walk the register assignments, inserting copies.
+   unsigned AI, AE;
+   bool HasStackArgs = false;
+   for (AI = 0, AE = ArgLocs.size(); AI != AE; ++AI) {
+     CCValAssign &VA = ArgLocs[AI];
+     EVT RegVT = VA.getLocVT();
+     SDValue Arg = OutVals[AI];
+
+     // Promote the value if needed. With Clang this should not happen.
+     switch (VA.getLocInfo()) {
+     default:
+       llvm_unreachable("Unknown loc info!");
+     case CCValAssign::Full:
+       break;
+     case CCValAssign::SExt:
+       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, RegVT, Arg);
+       break;
+     case CCValAssign::ZExt:
+       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, RegVT, Arg);
+       break;
+     case CCValAssign::AExt:
+       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, RegVT, Arg);
+       break;
+     case CCValAssign::BCvt:
+       Arg = DAG.getNode(ISD::BITCAST, DL, RegVT, Arg);
+       break;
+     }
+
+     // Stop when we encounter a stack argument, we need to process them
+     // in reverse order in the loop below.
+     if (VA.isMemLoc()) {
+       HasStackArgs = true;
+       break;
+     }
+
+     // Arguments that can be passed on registers must be kept in the RegsToPass
+     // vector.
+     RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+   }
+
+   // Second, stack arguments have to be walked in reverse order by inserting
+   // chained stores, this ensures their order is not changed by the scheduler
+   // and that the push instruction sequence generated is correct, otherwise they
+   // can be freely intermixed.
+   if (HasStackArgs) {
+     // AI is the index of the first stack argument; walk from the last
+     // location back down to it.
+     for (AE = AI, AI = ArgLocs.size(); AI != AE; --AI) {
+       unsigned Loc = AI - 1;
+       CCValAssign &VA = ArgLocs[Loc];
+       SDValue Arg = OutVals[Loc];
+
+       assert(VA.isMemLoc());
+
+       // SP points to one stack slot further so add one to adjust it.
+       SDValue PtrOff = DAG.getNode(
+           ISD::ADD, DL, getPointerTy(DAG.getDataLayout()),
+           DAG.getRegister(AVR::SP, getPointerTy(DAG.getDataLayout())),
+           DAG.getIntPtrConstant(VA.getLocMemOffset() + 1, DL));
+
+       Chain =
+           DAG.getStore(Chain, DL, Arg, PtrOff,
+                        MachinePointerInfo::getStack(MF, VA.getLocMemOffset()),
+                        0);
+     }
+   }
+
+   // Build a sequence of copy-to-reg nodes chained together with token chain and
+   // flag operands which copy the outgoing args into registers. The InFlag is
+   // necessary since all emitted instructions must be stuck together.
+   SDValue InFlag;
+   for (auto Reg : RegsToPass) {
+     Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, InFlag);
+     InFlag = Chain.getValue(1);
+   }
+
+   // Returns a chain & a flag for retval copy to use.
+   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+   SmallVector<SDValue, 8> Ops;
+   Ops.push_back(Chain);
+   Ops.push_back(Callee);
+
+   // Add argument registers to the end of the list so that they are known live
+   // into the call.
+   for (auto Reg : RegsToPass) {
+     Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+   }
+
+   // Add a register mask operand representing the call-preserved registers.
+   const AVRTargetMachine &TM =
+       static_cast<const AVRTargetMachine &>(getTargetMachine());
+   const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
+   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+   assert(Mask && "Missing call preserved mask for calling convention");
+   Ops.push_back(DAG.getRegisterMask(Mask));
+
+   if (InFlag.getNode()) {
+     Ops.push_back(InFlag);
+   }
+
+   Chain = DAG.getNode(AVRISD::CALL, DL, NodeTys, Ops);
+   InFlag = Chain.getValue(1);
+
+   // Create the CALLSEQ_END node.
+   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+                              DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+
+   if (!Ins.empty()) {
+     InFlag = Chain.getValue(1);
+   }
+
+   // Handle result values, copying them out of physregs into vregs that we
+   // return.
+   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, DL, DAG,
+                          InVals);
+ }
+
+ /// Copies the values returned by a call out of their physical registers.
+ /// One CopyFromReg is created per result location; the copied value is
+ /// appended to \p InVals while chain and glue are threaded from one copy to
+ /// the next. Returns the final chain.
+ SDValue AVRTargetLowering::LowerCallResult(
+     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
+     SmallVectorImpl<SDValue> &InVals) const {
+   // Work out where each returned value lives, using the assignment function
+   // that belongs to this calling convention.
+   SmallVector<CCValAssign, 16> ResultLocs;
+   CCState State(CallConv, isVarArg, DAG.getMachineFunction(), ResultLocs,
+                 *DAG.getContext());
+   State.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv));
+
+   // Split return values are reversed to get the "big endian" piece order the
+   // ABI asks for; builtin calls are left untouched.
+   const bool IsBuiltin = (CallConv == CallingConv::AVR_BUILTIN);
+   if (!IsBuiltin && ResultLocs.size() > 1)
+     std::reverse(ResultLocs.begin(), ResultLocs.end());
+
+   for (const CCValAssign &Loc : ResultLocs) {
+     // Result 0 of the copy is the value, 1 the chain and 2 the glue.
+     SDValue Copy = DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(),
+                                       Loc.getValVT(), InFlag);
+     Chain = Copy.getValue(1);
+     InFlag = Copy.getValue(2);
+     InVals.push_back(Copy.getValue(0));
+   }
+
+   return Chain;
+ }
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+ /// Selects the return-value assignment function for calling convention
+ /// \p CC: RetCC_AVR_BUILTIN for AVR_BUILTIN, RetCC_AVR for everything else.
+ CCAssignFn *AVRTargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
+   if (CC == CallingConv::AVR_BUILTIN)
+     return RetCC_AVR_BUILTIN;
+
+   return RetCC_AVR;
+ }
+
+ /// Runs CCState::CheckReturn on \p Outs with the return assignment function
+ /// of \p CallConv and hands back its verdict.
+ bool AVRTargetLowering::CanLowerReturn(
+     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+   SmallVector<CCValAssign, 16> Locs;
+   CCState State(CallConv, isVarArg, MF, Locs, Context);
+
+   return State.CheckReturn(Outs, CCAssignFnForReturn(CallConv));
+ }
+
+ /// Lowers a function return: every returned value is copied into the
+ /// register chosen by the calling convention (copies are glued together) and
+ /// a RET_FLAG node is emitted, or RETI_FLAG for the AVR_INTR and AVR_SIGNAL
+ /// conventions. For functions carrying the naked attribute no return node is
+ /// created and the chain of the copies is returned instead.
+ SDValue AVRTargetLowering::LowerReturn(
+     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+     const SmallVectorImpl<ISD::OutputArg> &Outs,
+     const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+     SelectionDAG &DAG) const {
+   MachineFunction &MF = DAG.getMachineFunction();
+
+   // Locations assigned to the return values, filled in through the CCState.
+   SmallVector<CCValAssign, 16> RVLocs;
+   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
+   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv));
+
+   // Split return values are reversed to get the "big endian" piece order
+   // the calling convention ABI requires.
+   if (RVLocs.size() > 1)
+     std::reverse(RVLocs.begin(), RVLocs.end());
+
+   SDValue Glue;
+   SmallVector<SDValue, 4> RetOps;
+   RetOps.push_back(Chain); // Slot 0 is patched with the final chain below.
+
+   // Copy the result values into the output registers.
+   unsigned Idx = 0;
+   for (CCValAssign &Loc : RVLocs) {
+     assert(Loc.isRegLoc() && "Can only return in registers!");
+
+     Chain = DAG.getCopyToReg(Chain, dl, Loc.getLocReg(), OutVals[Idx], Glue);
+     ++Idx;
+
+     // Keep all emitted copies stuck together through the glue value.
+     Glue = Chain.getValue(1);
+     RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
+   }
+
+   // Naked functions get no ret/reti instruction.
+   const bool IsNaked = MF.getFunction()->getAttributes().hasAttribute(
+       AttributeSet::FunctionIndex, Attribute::Naked);
+   if (IsNaked)
+     return Chain;
+
+   unsigned RetOpc = AVRISD::RET_FLAG;
+   if (CallConv == CallingConv::AVR_INTR || CallConv == CallingConv::AVR_SIGNAL)
+     RetOpc = AVRISD::RETI_FLAG;
+
+   RetOps[0] = Chain;
+   if (Glue.getNode())
+     RetOps.push_back(Glue);
+
+   return DAG.getNode(RetOpc, dl, MVT::Other, RetOps);
+ }
+
+//===----------------------------------------------------------------------===//
+// Custom Inserters
+//===----------------------------------------------------------------------===//
+
+ /// Expands a shift pseudo instruction with a non-constant shift amount into
+ /// a loop that shifts by one bit per iteration.
+ ///
+ /// \p BB is split at \p MI into three blocks laid out back to back:
+ ///   BB     - compares the shift amount with zero and skips the loop if equal
+ ///   LoopBB - shifts once, decrements the counter, loops while non-zero
+ ///   RemBB  - receives the instructions that followed \p MI and the result phi
+ ///
+ /// \p MI is erased. \returns RemBB, the block in which expansion continues.
+ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
+                                                   MachineBasicBlock *BB) const {
+   unsigned Opc;
+   const TargetRegisterClass *RC;
+   MachineFunction *F = BB->getParent();
+   MachineRegisterInfo &RI = F->getRegInfo();
+   const AVRTargetMachine &TM =
+       static_cast<const AVRTargetMachine &>(getTargetMachine());
+   const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+   DebugLoc dl = MI.getDebugLoc();
+
+   // Pick the single-bit shift instruction and the register class matching
+   // the pseudo's width.
+   switch (MI.getOpcode()) {
+   default:
+     llvm_unreachable("Invalid shift opcode!");
+   case AVR::Lsl8:
+     Opc = AVR::LSLRd;
+     RC = &AVR::GPR8RegClass;
+     break;
+   case AVR::Lsl16:
+     Opc = AVR::LSLWRd;
+     RC = &AVR::DREGSRegClass;
+     break;
+   case AVR::Asr8:
+     Opc = AVR::ASRRd;
+     RC = &AVR::GPR8RegClass;
+     break;
+   case AVR::Asr16:
+     Opc = AVR::ASRWRd;
+     RC = &AVR::DREGSRegClass;
+     break;
+   case AVR::Lsr8:
+     Opc = AVR::LSRRd;
+     RC = &AVR::GPR8RegClass;
+     break;
+   case AVR::Lsr16:
+     Opc = AVR::LSRWRd;
+     RC = &AVR::DREGSRegClass;
+     break;
+   }
+
+   const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+   // Create loop block.
+   MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
+   MachineBasicBlock *RemBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+   // The new blocks must directly follow BB: BB falls through into LoopBB,
+   // LoopBB falls through into RemBB, and RemBB inherits whatever fall-through
+   // BB had. Inserting them anywhere else in the function breaks those
+   // fall-through edges.
+   MachineFunction::iterator InsertPt = std::next(BB->getIterator());
+   F->insert(InsertPt, LoopBB);
+   F->insert(InsertPt, RemBB);
+
+   // Update machine-CFG edges by transferring all successors of the current
+   // block to the block containing instructions after shift.
+   RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+                 BB->end());
+   RemBB->transferSuccessorsAndUpdatePHIs(BB);
+
+   // Add edges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB.
+   BB->addSuccessor(LoopBB);
+   BB->addSuccessor(RemBB);
+   LoopBB->addSuccessor(RemBB);
+   LoopBB->addSuccessor(LoopBB);
+
+   unsigned ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass);
+   unsigned ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass);
+   unsigned ShiftReg = RI.createVirtualRegister(RC);
+   unsigned ShiftReg2 = RI.createVirtualRegister(RC);
+   unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg();
+   unsigned SrcReg = MI.getOperand(1).getReg();
+   unsigned DstReg = MI.getOperand(0).getReg();
+
+   // BB:
+   // cp N, r1      (r1 is the zero register; r0 is only a scratch register
+   //                and holds no known value)
+   // breq RemBB
+   BuildMI(BB, dl, TII.get(AVR::CPRdRr)).addReg(ShiftAmtSrcReg).addReg(AVR::R1);
+   BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB);
+
+   // LoopBB:
+   // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+   // ShiftAmt = phi [%N, BB], [%ShiftAmt2, LoopBB]
+   // ShiftReg2 = shift ShiftReg
+   // ShiftAmt2 = ShiftAmt - 1;
+   BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftReg)
+       .addReg(SrcReg)
+       .addMBB(BB)
+       .addReg(ShiftReg2)
+       .addMBB(LoopBB);
+   BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftAmtReg)
+       .addReg(ShiftAmtSrcReg)
+       .addMBB(BB)
+       .addReg(ShiftAmtReg2)
+       .addMBB(LoopBB);
+   BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2).addReg(ShiftReg);
+   BuildMI(LoopBB, dl, TII.get(AVR::SUBIRdK), ShiftAmtReg2)
+       .addReg(ShiftAmtReg)
+       .addImm(1);
+   BuildMI(LoopBB, dl, TII.get(AVR::BRNEk)).addMBB(LoopBB);
+
+   // RemBB:
+   // DestReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+   BuildMI(*RemBB, RemBB->begin(), dl, TII.get(AVR::PHI), DstReg)
+       .addReg(SrcReg)
+       .addMBB(BB)
+       .addReg(ShiftReg2)
+       .addMBB(LoopBB);
+
+   MI.eraseFromParent(); // The pseudo instruction is gone now.
+   return RemBB;
+ }
+
+ /// Tells whether the instruction at \p I is a COPY whose source operand
+ /// (operand 1) is R0 or R1.
+ static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
+   if (I->getOpcode() != AVR::COPY)
+     return false;
+
+   const unsigned Src = I->getOperand(1).getReg();
+   return Src == AVR::R0 || Src == AVR::R1;
+ }
+
+ // The mul instructions wreak havoc on our zero_reg R1. We need to clear it
+ // after the result has been evacuated. This is probably not the best way to do
+ // it, but it works for now.
+ //
+ // An "eor r1, r1" is inserted after \p MI, behind up to two directly
+ // following copies out of R0/R1. \p BB is returned unchanged otherwise.
+ MachineBasicBlock *AVRTargetLowering::insertMul(MachineInstr &MI,
+                                                 MachineBasicBlock *BB) const {
+   const AVRTargetMachine &TM =
+       static_cast<const AVRTargetMachine &>(getTargetMachine());
+   const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+   MachineBasicBlock::iterator I(MI);
+   ++I; // in any case insert *after* the mul instruction
+
+   // Step over at most two result copies. Stop at the end of the block so the
+   // end iterator is never dereferenced when the mul (or its last copy) is
+   // the final instruction.
+   for (unsigned Skipped = 0;
+        Skipped != 2 && I != BB->end() && isCopyMulResult(I); ++Skipped)
+     ++I;
+
+   BuildMI(*BB, I, MI.getDebugLoc(), TII.get(AVR::EORRdRr), AVR::R1)
+       .addReg(AVR::R1)
+       .addReg(AVR::R1);
+   return BB;
+ }
+
+ /// Expands pseudo instructions that need custom insertion.
+ ///
+ /// Shift pseudos are forwarded to insertShift() and MULRdRr/MULSRdRr to
+ /// insertMul(). The only other opcodes accepted are Select8/Select16, which
+ /// are expanded here into a diamond:
+ ///
+ ///   MBB:      br<cc> trueMBB ; rjmp falseMBB
+ ///   falseMBB: rjmp trueMBB
+ ///   trueMBB:  dst = phi [op1, MBB], [op2, falseMBB] ; <rest of MBB>
+ ///
+ /// \p MI is erased. \returns the block in which expansion continues.
+ MachineBasicBlock *
+ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                MachineBasicBlock *MBB) const {
+   int Opc = MI.getOpcode();
+
+   // Pseudo shift instructions with a non constant shift amount are expanded
+   // into a loop.
+   switch (Opc) {
+   case AVR::Lsl8:
+   case AVR::Lsl16:
+   case AVR::Lsr8:
+   case AVR::Lsr16:
+   case AVR::Asr8:
+   case AVR::Asr16:
+     return insertShift(MI, MBB);
+   case AVR::MULRdRr:
+   case AVR::MULSRdRr:
+     return insertMul(MI, MBB);
+   }
+
+   assert((Opc == AVR::Select16 || Opc == AVR::Select8) &&
+          "Unexpected instr type to insert");
+
+   const AVRInstrInfo &TII = static_cast<const AVRInstrInfo &>(
+       *MI.getParent()->getParent()->getSubtarget().getInstrInfo());
+   DebugLoc dl = MI.getDebugLoc();
+
+   // To "insert" a SELECT instruction, we insert the diamond
+   // control-flow pattern. The incoming instruction knows the
+   // destination vreg to set, the condition code register to branch
+   // on, the true/false values to select between, and a branch opcode
+   // to use.
+
+   MachineFunction *MF = MBB->getParent();
+   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+   MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+   MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+   // Lay the new blocks out directly behind MBB as MBB, falseMBB, trueMBB.
+   // trueMBB takes over the tail of MBB, so it must sit where MBB used to end:
+   // any fall-through MBB had into the next block is then preserved. The edges
+   // into and out of falseMBB are all explicit jumps, so it can sit between.
+   MachineFunction::iterator InsertPt = std::next(MBB->getIterator());
+   MF->insert(InsertPt, falseMBB);
+   MF->insert(InsertPt, trueMBB);
+
+   // Transfer remaining instructions and all successors of the current
+   // block to the block which will contain the Phi node for the
+   // select.
+   trueMBB->splice(trueMBB->begin(), MBB,
+                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+   trueMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+   // Operand 3 of the pseudo carries the condition code.
+   AVRCC::CondCodes CC = (AVRCC::CondCodes)MI.getOperand(3).getImm();
+   BuildMI(MBB, dl, TII.getBrCond(CC)).addMBB(trueMBB);
+   BuildMI(MBB, dl, TII.get(AVR::RJMPk)).addMBB(falseMBB);
+   MBB->addSuccessor(falseMBB);
+   MBB->addSuccessor(trueMBB);
+
+   // Unconditionally flow back to the true block
+   BuildMI(falseMBB, dl, TII.get(AVR::RJMPk)).addMBB(trueMBB);
+   falseMBB->addSuccessor(trueMBB);
+
+   // Set up the Phi node to determine where we came from
+   BuildMI(*trueMBB, trueMBB->begin(), dl, TII.get(AVR::PHI),
+           MI.getOperand(0).getReg())
+       .addReg(MI.getOperand(1).getReg())
+       .addMBB(MBB)
+       .addReg(MI.getOperand(2).getReg())
+       .addMBB(falseMBB);
+
+   MI.eraseFromParent(); // The pseudo instruction is gone now.
+   return trueMBB;
+ }
+
+//===----------------------------------------------------------------------===//
+// Inline Asm Support
+//===----------------------------------------------------------------------===//
+
+ /// Classifies a single-letter AVR inline assembly constraint as a register
+ /// class, a specific register, a memory operand or an immediate. Anything
+ /// that is not a recognised single letter is delegated to the generic
+ /// TargetLowering implementation.
+ AVRTargetLowering::ConstraintType
+ AVRTargetLowering::getConstraintType(StringRef Constraint) const {
+   if (Constraint.size() != 1)
+     return TargetLowering::getConstraintType(Constraint);
+
+   // Letters follow http://www.nongnu.org/avr-libc/user-manual/inline_asm.html
+   switch (Constraint[0]) {
+   // Constraints naming a set of registers.
+   case 'a': // simple upper registers
+   case 'b': // base pointer register pairs
+   case 'd': // upper registers
+   case 'l': // lower registers
+   case 'e': // pointer register pairs
+   case 'q': // stack pointer register
+   case 'r': // any register
+   case 'w': // special upper register pairs
+     return C_RegisterClass;
+
+   // Constraints naming exactly one register (pair).
+   case 't': // temporary register
+   case 'x':
+   case 'X': // pointer register pair X
+   case 'y':
+   case 'Y': // pointer register pair Y
+   case 'z':
+   case 'Z': // pointer register pair Z
+     return C_Register;
+
+   // Memory address based on the Y or Z pointer with displacement.
+   case 'Q':
+     return C_Memory;
+
+   // Immediate operands.
+   case 'G': // floating point constant
+   case 'I': // 6-bit positive integer constant
+   case 'J': // 6-bit negative integer constant
+   case 'K': // integer constant 2
+   case 'L': // integer constant 0
+   case 'M': // 8-bit integer constant
+   case 'N': // integer constant -1
+   case 'O': // integer constant 8, 16 or 24
+   case 'P': // integer constant 1
+   case 'R': // integer constant in the range -6 to 5
+     return C_Other;
+
+   default:
+     return TargetLowering::getConstraintType(Constraint);
+   }
+ }
+
+ /// Maps a memory constraint code onto its InlineAsm constraint id. A code
+ /// whose first character is 'Q' yields Constraint_Q; every other code is
+ /// passed on to the generic TargetLowering implementation.
+ unsigned
+ AVRTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const {
+   // Not sure if this is actually the right thing to do, but we got to do
+   // *something* [agnat]
+   if (ConstraintCode[0] == 'Q')
+     return InlineAsm::Constraint_Q;
+
+   return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ /// Rates how well the operand described by \p info matches the constraint
+ /// letter at \p constraint. Register letters yield CW_Register or
+ /// CW_SpecificReg, 'Q' yields CW_Memory, and an immediate letter yields
+ /// CW_Constant when the operand is a constant inside the letter's range and
+ /// CW_Invalid otherwise. Unknown letters go to the generic implementation.
+ AVRTargetLowering::ConstraintWeight
+ AVRTargetLowering::getSingleConstraintMatchWeight(
+     AsmOperandInfo &info, const char *constraint) const {
+   Value *Operand = info.CallOperandVal;
+
+   // If we don't have a value, we can't do a match,
+   // but allow it at the lowest weight.
+   // (this behaviour has been copied from the ARM backend)
+   if (!Operand)
+     return CW_Default;
+
+   // Both are null when the operand is not a constant of that kind.
+   const auto *IntC = dyn_cast<ConstantInt>(Operand);
+   const auto *FloatC = dyn_cast<ConstantFP>(Operand);
+
+   // Turns "the operand satisfies the immediate constraint" into a weight.
+   const auto ConstantIf = [](bool Matches) {
+     return Matches ? CW_Constant : CW_Invalid;
+   };
+
+   switch (*constraint) {
+   default:
+     return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+
+   case 'd':
+   case 'r':
+   case 'l':
+     return CW_Register;
+
+   case 'a':
+   case 'b':
+   case 'e':
+   case 'q':
+   case 't':
+   case 'w':
+   case 'x':
+   case 'X':
+   case 'y':
+   case 'Y':
+   case 'z':
+   case 'Z':
+     return CW_SpecificReg;
+
+   case 'Q':
+     return CW_Memory;
+
+   case 'G': // floating point zero
+     return ConstantIf(FloatC && FloatC->isZero());
+   case 'I': // 0..63
+     return ConstantIf(IntC && isUInt<6>(IntC->getZExtValue()));
+   case 'J': // -63..0
+     return ConstantIf(IntC && (IntC->getSExtValue() >= -63) &&
+                       (IntC->getSExtValue() <= 0));
+   case 'K': // 2
+     return ConstantIf(IntC && IntC->getZExtValue() == 2);
+   case 'L': // 0
+     return ConstantIf(IntC && IntC->getZExtValue() == 0);
+   case 'M': // 0..255
+     return ConstantIf(IntC && isUInt<8>(IntC->getZExtValue()));
+   case 'N': // -1
+     return ConstantIf(IntC && IntC->getSExtValue() == -1);
+   case 'O': // 8, 16 or 24
+     return ConstantIf(IntC && ((IntC->getZExtValue() == 8) ||
+                                (IntC->getZExtValue() == 16) ||
+                                (IntC->getZExtValue() == 24)));
+   case 'P': // 1
+     return ConstantIf(IntC && IntC->getZExtValue() == 1);
+   case 'R': // -6..5
+     return ConstantIf(IntC && (IntC->getSExtValue() >= -6) &&
+                       (IntC->getSExtValue() <= 5));
+   }
+ }
+
+ /// Resolves a single-letter inline assembly register constraint to a
+ /// (register, register class) pair. A register number of 0 means "any
+ /// register of the class". Constraints not handled here are forwarded to the
+ /// generic implementation together with the subtarget's register info.
+ std::pair<unsigned, const TargetRegisterClass *>
+ AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                 StringRef Constraint,
+                                                 MVT VT) const {
+   using RegAndClass = std::pair<unsigned, const TargetRegisterClass *>;
+
+   auto STI = static_cast<const AVRTargetMachine &>(this->getTargetMachine())
+                  .getSubtargetImpl();
+
+   // Any register out of the given class.
+   const auto AnyOf = [](const TargetRegisterClass &RC) {
+     return RegAndClass(0U, &RC);
+   };
+   // One fixed physical register, reported together with a class.
+   const auto Exactly = [](unsigned Reg, const TargetRegisterClass &RC) {
+     return RegAndClass(Reg, &RC);
+   };
+
+   // We only support i8 and i16.
+   //
+   //:FIXME: remove this assert for now since it gets sometimes executed
+   // assert((VT == MVT::i16 || VT == MVT::i8) && "Wrong operand type.");
+
+   if (Constraint.size() == 1) {
+     switch (Constraint[0]) {
+     case 'a': // Simple upper registers r16..r23.
+       return AnyOf(AVR::LD8loRegClass);
+     case 'b': // Base pointer registers: y, z.
+       return AnyOf(AVR::PTRDISPREGSRegClass);
+     case 'd': // Upper registers r16..r31.
+       return AnyOf(AVR::LD8RegClass);
+     case 'l': // Lower registers r0..r15.
+       return AnyOf(AVR::GPR8loRegClass);
+     case 'e': // Pointer register pairs: x, y, z.
+       return AnyOf(AVR::PTRREGSRegClass);
+     case 'q': // Stack pointer register: SPH:SPL.
+       return AnyOf(AVR::GPRSPRegClass);
+     case 'r': // Any register: r0..r31.
+       if (VT == MVT::i8)
+         return AnyOf(AVR::GPR8RegClass);
+
+       assert(VT == MVT::i16 && "inline asm constraint too large");
+       return AnyOf(AVR::DREGSRegClass);
+     case 't': // Temporary register: r0.
+       return Exactly(unsigned(AVR::R0), AVR::GPR8RegClass);
+     case 'w': // Special upper register pairs: r24, r26, r28, r30.
+       return AnyOf(AVR::IWREGSRegClass);
+     case 'x': // Pointer register pair X: r27:r26.
+     case 'X':
+       return Exactly(unsigned(AVR::R27R26), AVR::PTRREGSRegClass);
+     case 'y': // Pointer register pair Y: r29:r28.
+     case 'Y':
+       return Exactly(unsigned(AVR::R29R28), AVR::PTRREGSRegClass);
+     case 'z': // Pointer register pair Z: r31:r30.
+     case 'Z':
+       return Exactly(unsigned(AVR::R31R30), AVR::PTRREGSRegClass);
+     default:
+       break;
+     }
+   }
+
+   return TargetLowering::getRegForInlineAsmConstraint(STI->getRegisterInfo(),
+                                                       Constraint, VT);
+ }
+
+ /// Lowers the inline assembly operand \p Op for an immediate constraint.
+ ///
+ /// For the integer letters I, J, K, L, M, N, O, P, R and the floating point
+ /// letter G, a target constant is appended to \p Ops when \p Op is a constant
+ /// inside the range the letter allows; when it is not, nothing is appended
+ /// and the function returns. Constraints longer than one character are
+ /// ignored. Every other letter is handed to the generic implementation.
+ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                      std::string &Constraint,
+                                                      std::vector<SDValue> &Ops,
+                                                      SelectionDAG &DAG) const {
+   SDValue Result; // Stays null unless one of the cases below matches.
+   SDLoc DL(Op);
+   EVT Ty = Op.getValueType();
+
+   // Currently only support length 1 constraints.
+   if (Constraint.length() != 1) {
+     return;
+   }
+
+   char ConstraintLetter = Constraint[0];
+   switch (ConstraintLetter) {
+   default:
+     break;
+   // Deal with integers first:
+   case 'I':
+   case 'J':
+   case 'K':
+   case 'L':
+   case 'M':
+   case 'N':
+   case 'O':
+   case 'P':
+   case 'R': {
+     const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+     if (!C) {
+       return;
+     }
+
+     int64_t CVal64 = C->getSExtValue();
+     uint64_t CUVal64 = C->getZExtValue();
+     switch (ConstraintLetter) {
+     case 'I': // 0..63
+       if (!isUInt<6>(CUVal64))
+         return;
+       Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+       break;
+     case 'J': // -63..0
+       if (CVal64 < -63 || CVal64 > 0)
+         return;
+       Result = DAG.getTargetConstant(CVal64, DL, Ty);
+       break;
+     case 'K': // 2
+       if (CUVal64 != 2)
+         return;
+       Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+       break;
+     case 'L': // 0
+       if (CUVal64 != 0)
+         return;
+       Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+       break;
+     case 'M': // 0..255
+       if (!isUInt<8>(CUVal64))
+         return;
+       // i8 type may be printed as a negative number,
+       // e.g. 254 would be printed as -2,
+       // so we force it to i16 at least.
+       // Compare the EVT directly: getSimpleVT() asserts on types that are
+       // not simple value types.
+       if (Ty == MVT::i8) {
+         Ty = MVT::i16;
+       }
+       Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+       break;
+     case 'N': // -1
+       if (CVal64 != -1)
+         return;
+       Result = DAG.getTargetConstant(CVal64, DL, Ty);
+       break;
+     case 'O': // 8, 16, 24
+       if (CUVal64 != 8 && CUVal64 != 16 && CUVal64 != 24)
+         return;
+       Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+       break;
+     case 'P': // 1
+       if (CUVal64 != 1)
+         return;
+       Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+       break;
+     case 'R': // -6..5
+       if (CVal64 < -6 || CVal64 > 5)
+         return;
+       Result = DAG.getTargetConstant(CVal64, DL, Ty);
+       break;
+     }
+
+     break;
+   }
+   case 'G': {
+     // Scoped so the declaration cannot be bypassed by a later case label.
+     const ConstantFPSDNode *FC = dyn_cast<ConstantFPSDNode>(Op);
+     if (!FC || !FC->isZero())
+       return;
+     // Soften float to i8 0
+     Result = DAG.getTargetConstant(0, DL, MVT::i8);
+     break;
+   }
+   }
+
+   if (Result.getNode()) {
+     Ops.push_back(Result);
+     return;
+   }
+
+   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+ }
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelLowering.h b/contrib/llvm/lib/Target/AVR/AVRISelLowering.h
new file mode 100644
index 000000000000..17074e1b1eee
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -0,0 +1,163 @@
+//===-- AVRISelLowering.h - AVR DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AVR uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_ISEL_LOWERING_H
+#define LLVM_AVR_ISEL_LOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+namespace AVRISD {
+
+ /// AVR Specific DAG Nodes
+ ///
+ /// The enumerators are numbered consecutively starting at
+ /// ISD::BUILTIN_OP_END, i.e. directly after the target-independent opcodes.
+ enum NodeType {
+ /// Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ /// Return from subroutine.
+ RET_FLAG,
+ /// Return from ISR.
+ RETI_FLAG,
+ /// Represents an abstract call instruction,
+ /// which includes a bunch of information.
+ CALL,
+ /// A wrapper node for TargetConstantPool,
+ /// TargetExternalSymbol, and TargetGlobalAddress.
+ WRAPPER,
+ LSL, ///< Logical shift left.
+ LSR, ///< Logical shift right.
+ ASR, ///< Arithmetic shift right.
+ ROR, ///< Bit rotate right.
+ ROL, ///< Bit rotate left.
+ LSLLOOP, ///< A loop of single logical shift left instructions.
+ LSRLOOP, ///< A loop of single logical shift right instructions.
+ ASRLOOP, ///< A loop of single arithmetic shift right instructions.
+ /// AVR conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch to if the condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+ /// Compare instruction.
+ CMP,
+ /// Compare with carry instruction.
+ CMPC,
+ /// Test for zero or minus instruction.
+ TST,
+ /// Operand 0 and operand 1 are the values to select between, operand 2
+ /// is the condition code and operand 3 is the flag operand.
+ SELECT_CC
+ };
+
+} // end of namespace AVRISD
+
+class AVRTargetMachine;
+
+ /// Performs target lowering for the AVR.
+ ///
+ /// Overrides the TargetLowering hooks for custom operation lowering,
+ /// addressing modes, calling convention handling and inline assembly
+ /// constraints; the definitions live outside this header.
+ class AVRTargetLowering : public TargetLowering {
+ public:
+ explicit AVRTargetLowering(AVRTargetMachine &TM);
+
+ public:
+ /// Shift amounts are always of type i8, whatever the type being shifted.
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT LHSTy) const override {
+ return MVT::i8;
+ }
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ // Addressing mode queries.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+ SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ // Inline assembly constraint handling.
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override;
+
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ private:
+ // Per-operation lowering helpers.
+ // NOTE(review): presumably dispatched from LowerOperation — confirm in the
+ // implementation file.
+ // NOTE(review): getAVRCmp takes `SDLoc dl` by value whereas the hooks below
+ // take `const SDLoc &`.
+ SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
+ SelectionDAG &DAG, SDLoc dl) const;
+ SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+
+ // Calling convention handling: returns, formal arguments and calls.
+ CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const;
+
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+ SelectionDAG &DAG) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ // Not an override: LowerCallResult is a helper specific to this class.
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ private:
+ // Helpers that build machine basic blocks for shift and multiply
+ // instructions.
+ // NOTE(review): presumably called from EmitInstrWithCustomInserter — confirm.
+ MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_ISEL_LOWERING_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrFormats.td b/contrib/llvm/lib/Target/AVR/AVRInstrFormats.td
new file mode 100644
index 000000000000..ce5e606f9787
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrFormats.td
@@ -0,0 +1,579 @@
+ //===-- AVRInstrFormats.td - AVR Instruction Formats -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AVR Instruction Format Definitions.
+//
+//===----------------------------------------------------------------------===//
+
+ // A generic AVR instruction.
+ //
+ // Common base of every format in this file: it places the record in the
+ // "AVR" namespace and forwards the operand lists, assembly string and
+ // selection pattern passed as template arguments.
+ class AVRInst<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction
+ {
+ let Namespace = "AVR";
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+
+ // 32-bit mask, all zeros by default.
+ // NOTE(review): presumably read by the generated disassembler — confirm.
+ field bits<32> SoftFail = 0;
+ }
+
+ /// A 16-bit AVR instruction.
+ /// Declares the 16-bit encoding field `Inst` and sets Size to 2.
+ class AVRInst16<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst<outs, ins, asmstr, pattern>
+ {
+ field bits<16> Inst;
+
+ let Size = 2;
+ }
+
+ /// A 32-bit AVR instruction.
+ /// Declares the 32-bit encoding field `Inst` and sets Size to 4.
+ class AVRInst32<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst<outs, ins, asmstr, pattern>
+ {
+ field bits<32> Inst;
+
+ let Size = 4;
+ }
+
+ // A class for pseudo instructions.
+ // Pseudo instructions are not real AVR instructions. The DAG stores
+ // pseudo instructions which are replaced by real AVR instructions by
+ // AVRExpandPseudoInsts.cpp.
+ //
+ // For example, the ADDW (add wide, as in add 16 bit values) instruction
+ // is defined as a pseudo instruction. In AVRExpandPseudoInsts.cpp,
+ // the instruction is then replaced by two add instructions - one for each byte.
+ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ // Note: AVRInst already assigns Pattern from the same template argument.
+ let Pattern = pattern;
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ }
+
+//===----------------------------------------------------------------------===//
+// Register / register instruction: <|opcode|ffrd|dddd|rrrr|>
+// opcode = 4 bits.
+// f = secondary opcode = 2 bits
+// d = destination = 5 bits
+// r = source = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+ class FRdRr<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<5> rd;
+ bits<5> rr;
+
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = f;
+ // The source register is split: its top bit lands in bit 9 and its low
+ // nibble in bits 3-0, with the destination register in between.
+ let Inst{9} = rr{4};
+ let Inst{8-4} = rd;
+ let Inst{3-0} = rr{3-0};
+ }
+
+ // Single-register variant of the register / register layout
+ // <|opcode|ffrd|dddd|rrrr|>: the one register operand `rd` is encoded in
+ // both the destination and the source positions.
+ class FTST<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<5> rd;
+
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = f;
+ // rd fills the source slots (bit 9 and bits 3-0) as well as bits 8-4.
+ let Inst{9} = rd{4};
+ let Inst{8-4} = rd;
+ let Inst{3-0} = rd{3-0};
+ }
+
+//===----------------------------------------------------------------------===//
+// Instruction of the format `<mnemonic> Z, Rd`
+// <|1001|001r|rrrr|0ttt>
+//===----------------------------------------------------------------------===//
+ class FZRd<bits<3> t, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ // The 5-bit register written as `r` in the layout above.
+ bits<5> rd;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-9} = 0b001;
+ let Inst{8} = rd{4};
+
+ let Inst{7-4} = rd{3-0};
+
+ // Bit 3 is fixed at zero; the 3-bit template argument t fills bits 2-0.
+ let Inst{3} = 0;
+ let Inst{2-0} = t;
+ }
+
+//===----------------------------------------------------------------------===//
+// Register / immediate8 instruction: <|opcode|KKKK|dddd|KKKK|>
+// opcode = 4 bits.
+// K = constant data = 8 bits
+// d = destination = 4 bits
+// (Only accepts r16-r31)
+//===----------------------------------------------------------------------===//
+ class FRdK<bits<4> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<4> rd;
+ bits<8> k;
+
+ let Inst{15-12} = opcode;
+ // The immediate is split around the register: high nibble in bits 11-8,
+ // low nibble in bits 3-0.
+ let Inst{11-8} = k{7-4};
+ let Inst{7-4} = rd{3-0};
+ let Inst{3-0} = k{3-0};
+
+ // Every instruction using this format is flagged as cheap as a move.
+ let isAsCheapAsAMove = 1;
+ }
+
+//===----------------------------------------------------------------------===//
+// Register instruction: <|opcode|fffd|dddd|ffff|>
+// opcode = 4 bits.
+// f = secondary opcode = 7 bits
+// d = destination = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+ class FRd<bits<4> opcode, bits<7> f, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<5> d;
+
+ let Inst{15-12} = opcode;
+ // The 7-bit secondary opcode is split around the register: its upper three
+ // bits go to bits 11-9 and its lower four bits to bits 3-0.
+ let Inst{11-9} = f{6-4};
+ let Inst{8-4} = d;
+ let Inst{3-0} = f{3-0};
+ }
+
+//===----------------------------------------------------------------------===//
+// [STD/LDD] P+q, Rr special encoding: <|10q0|qqtr|rrrr|pqqq>
+// t = type (1 for STD, 0 for LDD)
+// q = displacement (6 bits)
+// r = register (5 bits)
+// p = pointer register (1 bit) [1 for Y, 0 for Z]
+//===----------------------------------------------------------------------===//
+ class FSTDLDD<bit type, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ // Combined memory operand: bit 6 is encoded at the pointer-select position
+ // (p) and bits 5-0 are the displacement (q).
+ bits<7> memri;
+ bits<5> reg; // the GP register
+
+ let Inst{15-14} = 0b10;
+ let Inst{13} = memri{5};
+ let Inst{12} = 0;
+
+ let Inst{11-10} = memri{4-3};
+ let Inst{9} = type;
+ let Inst{8} = reg{4};
+
+ let Inst{7-4} = reg{3-0};
+
+ let Inst{3} = memri{6};
+ let Inst{2-0} = memri{2-0};
+ }
+
+//===---------------------------------------------------------------------===//
+// An ST/LD instruction.
+// <|100i|00tr|rrrr|ppaa|>
+// t = type (1 for store, 0 for load)
+// a = regular/postinc/predec (reg = 0b00, postinc = 0b01, predec = 0b10)
+// p = pointer register
+// r = src/dst register
+//
+// Note that the bit labelled 'i' above does not follow a simple pattern,
+// so there exists a post encoder method to set it manually.
+//===---------------------------------------------------------------------===//
+ class FSTLD<bit type, bits<2> mode, dag outs, dag ins,
+ string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ // 2-bit pointer register selector (p) and 5-bit data register (r).
+ bits<2> ptrreg;
+ bits<5> reg;
+
+ let Inst{15-13} = 0b100;
+ // This bit varies depending on the arguments and the mode.
+ // We have a post encoder method to set this bit manually.
+ let Inst{12} = 0;
+
+ let Inst{11-10} = 0b00;
+ let Inst{9} = type;
+ let Inst{8} = reg{4};
+
+ let Inst{7-4} = reg{3-0};
+
+ let Inst{3-2} = ptrreg{1-0};
+ let Inst{1-0} = mode{1-0};
+
+ // Name of the hook that patches bit 12 after the table-driven encoding.
+ let PostEncoderMethod = "loadStorePostEncoder";
+ }
+
+//===---------------------------------------------------------------------===//
+// Special format for the LPM/ELPM instructions
+// [E]LPM Rd, Z[+]
+// <|1001|000d|dddd|01ep>
+// d = destination register
+// e = is elpm
+// p = is postincrement
+//===---------------------------------------------------------------------===//
+ class FLPMX<bit e, bit p, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ // Destination register (`d` in the layout above).
+ bits<5> reg;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-9} = 0b000;
+ let Inst{8} = reg{4};
+
+ let Inst{7-4} = reg{3-0};
+
+ // Fixed 0b01, followed by the ELPM flag and the post-increment flag.
+ let Inst{3-2} = 0b01;
+ let Inst{1} = e;
+ let Inst{0} = p;
+ }
+
+//===----------------------------------------------------------------------===//
+// MOVWRdRr special encoding: <|0000|0001|dddd|rrrr|>
+// d = destination = 4 bits
+// r = source = 4 bits
+// (Only accepts even registers)
+//===----------------------------------------------------------------------===//
+ class FMOVWRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<5> d;
+ bits<5> r;
+
+ let Inst{15-8} = 0b00000001;
+ // Only bits 4-1 of each register number are encoded; bit 0 is dropped,
+ // which is why only even registers can be expressed.
+ let Inst{7-4} = d{4-1};
+ let Inst{3-0} = r{4-1};
+ }
+
+//===----------------------------------------------------------------------===//
+// MULSrr special encoding: <|0000|0010|dddd|rrrr|>
+// d = multiplicand = 4 bits
+// r = multiplier = 4 bits
+// (Only accepts r16-r31)
+//===----------------------------------------------------------------------===//
+class FMUL2RdRr<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> rd; // accept 5 bits but only encode the lower 4
+ bits<5> rr; // accept 5 bits but only encode the lower 4
+
+ let Inst{15-9} = 0b0000001;
+ let Inst{8} = f;
+ let Inst{7-4} = rd{3-0};
+ let Inst{3-0} = rr{3-0};
+}
+
+ // Special encoding for the FMUL family of instructions.
+ //
+ // <0000|0011|fddd|frrr|>
+ //
+ // ff = 0b01 for FMUL
+ // 0b10 for FMULS
+ // 0b11 for FMULSU
+ //
+ // ddd = destination register
+ // rrr = source register
+ //
+ // The two f bits are not adjacent: f{1} is bit 7 and f{0} is bit 3.
+ class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<3> rd;
+ bits<3> rr;
+
+ let Inst{15-8} = 0b00000011;
+ let Inst{7} = f{1};
+ let Inst{6-4} = rd;
+ let Inst{3} = f{0};
+ let Inst{2-0} = rr;
+ }
+
+
+//===----------------------------------------------------------------------===//
+// Arithmetic word instructions (ADIW / SBIW): <|1001|011f|kkdd|kkkk|>
+// f = secondary opcode = 1 bit
+// k = constant data = 6 bits
+// d = destination = 4 bits
+// (Only accepts r25:24 r27:26 r29:28 r31:30)
+//===----------------------------------------------------------------------===//
+class FWRdK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> dst; // accept 5 bits but only encode bits 1 and 2
+ bits<6> k;
+
+ let Inst{15-9} = 0b1001011;
+ let Inst{8} = f;
+ let Inst{7-6} = k{5-4};
+ let Inst{5-4} = dst{2-1};
+ let Inst{3-0} = k{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// In I/O instruction: <|1011|0AAd|dddd|AAAA|>
+// A = I/O location address = 6 bits
+// d = destination = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+ class FIORdA<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<5> d;
+ bits<6> A;
+
+ let Inst{15-11} = 0b10110;
+ // The 6-bit I/O address is split around the register field.
+ let Inst{10-9} = A{5-4};
+ let Inst{8-4} = d;
+ let Inst{3-0} = A{3-0};
+ }
+
+//===----------------------------------------------------------------------===//
+// Out I/O instruction: <|1011|1AAr|rrrr|AAAA|>
+// A = I/O location address = 6 bits
+// d = destination = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class FIOARr<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<6> A;
+ bits<5> r;
+
+ let Inst{15-11} = 0b10111;
+ let Inst{10-9} = A{5-4};
+ let Inst{8-4} = r;
+ let Inst{3-0} = A{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// I/O bit instruction.
+// <|1001|10tt|AAAA|Abbb>
+// t = type (1 for SBI, 0 for CBI)
+// A = I/O location address (5 bits)
+// b = bit number
+//===----------------------------------------------------------------------===//
+class FIOBIT<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> A;
+ bits<3> b;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-10} = 0b10;
+ let Inst{9-8} = t;
+
+ let Inst{7-4} = A{4-1};
+
+ let Inst{3} = A{0};
+ let Inst{2-0} = b{2-0};
+}
+
+//===----------------------------------------------------------------------===//
+// BST/BLD instruction.
+// <|1111|1ttd|dddd|0bbb>
+// t = type (1 for BST, 0 for BLD)
+// d = destination register
+// b = bit
+//===----------------------------------------------------------------------===//
+class FRdB<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> rd;
+ bits<3> b;
+
+ let Inst{15-12} = 0b1111;
+
+ let Inst{11} = 0b1;
+ let Inst{10-9} = t;
+ let Inst{8} = rd{4};
+
+ let Inst{7-4} = rd{3-0};
+
+ let Inst{3} = 0;
+ let Inst{2-0} = b;
+}
+
+// Special encoding for the `DES K` instruction.
+//
+// <|1001|0100|KKKK|1011>
+//
+// KKKK = 4 bit immediate
+ class FDES<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<4> k;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-8} = 0b0100;
+
+ // The only variable part of the encoding: the 4-bit immediate.
+ let Inst{7-4} = k;
+
+ let Inst{3-0} = 0b1011;
+ }
+
+//===----------------------------------------------------------------------===//
+// Conditional Branching instructions: <|1111|0fkk|kkkk|ksss|>
+// f = secondary opcode = 1 bit
+// k = constant address = 7 bits
+// s = bit in status register = 3 bits
+//===----------------------------------------------------------------------===//
+ class FBRsk<bit f, bits<3> s, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ // Branch target operand; the status bit `s` is a template argument, so it
+ // is fixed per instruction definition rather than being an operand.
+ bits<7> k;
+
+ let Inst{15-11} = 0b11110;
+ let Inst{10} = f;
+ let Inst{9-3} = k;
+ let Inst{2-0} = s;
+ }
+
+//===----------------------------------------------------------------------===//
+// Special, opcode only instructions: <|opcode|>
+//===----------------------------------------------------------------------===//
+
+ // 16-bit instruction whose whole encoding is the fixed `opcode` argument.
+ class F16<bits<16> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ let Inst = opcode;
+ }
+
+ // 32-bit instruction whose whole encoding is the fixed `opcode` argument.
+ class F32<bits<32> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst32<outs, ins, asmstr, pattern>
+ {
+ let Inst = opcode;
+ }
+
+//===----------------------------------------------------------------------===//
+// Branching instructions with immediate12: <|110f|kkkk|kkkk|kkkk|>
+// f = secondary opcode = 1 bit
+// k = constant address = 12 bits
+//===----------------------------------------------------------------------===//
+ class FBRk<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<12> k;
+
+ let Inst{15-13} = 0b110;
+ let Inst{12} = f;
+ // The 12-bit target occupies the low 12 bits unsplit.
+ let Inst{11-0} = k;
+ }
+
+//===----------------------------------------------------------------------===//
+// 32 bits branching instructions: <|1001|010k|kkkk|fffk|kkkk|kkkk|kkkk|kkkk|>
+// f = secondary opcode = 3 bits
+// k = constant address = 22 bits
+//===----------------------------------------------------------------------===//
+ class F32BRk<bits<3> f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst32<outs, ins, asmstr, pattern>
+ {
+ bits<22> k;
+
+ let Inst{31-25} = 0b1001010;
+ // The 22-bit address is split around the secondary opcode: the upper five
+ // bits precede f, the lower seventeen follow it.
+ let Inst{24-20} = k{21-17};
+ let Inst{19-17} = f;
+ let Inst{16-0} = k{16-0};
+ }
+
+//===----------------------------------------------------------------------===//
+// 32 bits direct mem instructions: <|1001|00fd|dddd|0000|kkkk|kkkk|kkkk|kkkk|>
+// f = secondary opcode = 1 bit
+// d = destination = 5 bits
+// k = constant address = 16 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+ class F32DM<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst32<outs, ins, asmstr, pattern>
+ {
+ bits<5> rd;
+ bits<16> k;
+
+ // Upper 16 bits: fixed prefix, f, and the register.
+ let Inst{31-28} = 0b1001;
+
+ let Inst{27-26} = 0b00;
+ let Inst{25} = f;
+ let Inst{24} = rd{4};
+
+ let Inst{23-20} = rd{3-0};
+
+ let Inst{19-16} = 0b0000;
+
+ // Lower 16 bits: the address, unsplit.
+ let Inst{15-0} = k;
+ }
+
+ // Status register bit instruction.
+ // <|1001|0100|bsss|1000>
+ // b = 1-bit selector (per the AVR instruction set: 0 = BSET, 1 = BCLR)
+ // s = bit in status register = 3 bits
+ class FS<bit b, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<3> s;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-8} = 0b0100;
+
+ let Inst{7} = b;
+ let Inst{6-4} = s;
+
+ let Inst{3-0} = 0b1000;
+ }
+
+ // Branch if a bit in the status register is set / cleared.
+ // <BRBS|BRBC> s, k
+ // ---------------------
+ // <|1111|0fkk|kkkk|ksss>
+ // f = 1-bit selector (per the AVR instruction set: 0 = BRBS, 1 = BRBC)
+ // k = branch target = 7 bits
+ // s = bit in status register = 3 bits (an operand here, unlike FBRsk where
+ // it is a template argument)
+ class FSK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+ {
+ bits<7> k;
+ bits<3> s;
+
+ let Inst{15-12} = 0b1111;
+
+ let Inst{11} = 0;
+ let Inst{10} = f;
+ let Inst{9-8} = k{6-5};
+
+ let Inst{7-4} = k{4-1};
+
+ let Inst{3} = k{0};
+ let Inst{2-0} = s;
+ }
+
+ // Pseudo instruction that lists the status register (SREG) as an implicit
+ // definition.
+ class ExtensionPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Pseudo<outs, ins, asmstr, pattern>
+ {
+ let Defs = [SREG];
+ }
+
+ // Pseudo instruction that lists the stack pointer (SP) as an implicit
+ // definition.
+ class StorePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Pseudo<outs, ins, asmstr, pattern>
+ {
+ let Defs = [SP];
+ }
+
+ // Pseudo instruction that is expanded through the custom inserter hook and
+ // implicitly reads the status register (SREG).
+ class SelectPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Pseudo<outs, ins, asmstr, pattern>
+ {
+ let usesCustomInserter = 1;
+
+ let Uses = [SREG];
+ }
+
+ // Pseudo instruction that is expanded through the custom inserter hook and
+ // implicitly defines the status register (SREG).
+ class ShiftPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Pseudo<outs, ins, asmstr, pattern>
+ {
+ let usesCustomInserter = 1;
+
+ let Defs = [SREG];
+ }
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
new file mode 100644
index 000000000000..88f889260cce
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -0,0 +1,498 @@
+//===-- AVRInstrInfo.cpp - AVR Instruction Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRInstrInfo.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include "AVR.h"
+#include "AVRMachineFunctionInfo.h"
+#include "AVRRegisterInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AVRGenInstrInfo.inc"
+
+namespace llvm {
+
+ /// Passes the ADJCALLSTACKDOWN / ADJCALLSTACKUP opcodes to the generated base
+ /// class (presumably as the call frame setup / destroy pseudos — confirm
+ /// against AVRGenInstrInfo) and default-constructs the register info RI.
+ AVRInstrInfo::AVRInstrInfo()
+ : AVRGenInstrInfo(AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI() {}
+
+ /// Inserts before MI the instruction(s) that copy SrcReg into DestReg.
+ ///
+ /// - 16-bit pair to 16-bit pair: one MOVW when the subtarget has it,
+ ///   otherwise one MOV per byte.
+ /// - 8-bit to 8-bit: MOV.
+ /// - SP to a 16-bit pair: SPREAD; a 16-bit pair to SP: SPWRITE.
+ ///
+ /// Any other combination is a programming error (llvm_unreachable).
+ /// KillSrc marks the source operand(s) as killed.
+ void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                const DebugLoc &DL, unsigned DestReg,
+                                unsigned SrcReg, bool KillSrc) const {
+   const AVRSubtarget &Subtarget =
+       MBB.getParent()->getSubtarget<AVRSubtarget>();
+   const AVRRegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
+   const unsigned KillFlag = getKillRegState(KillSrc);
+
+   if (AVR::DREGSRegClass.contains(DestReg, SrcReg)) {
+     // Not all AVR devices support the 16-bit `MOVW` instruction.
+     if (Subtarget.hasMOVW()) {
+       BuildMI(MBB, MI, DL, get(AVR::MOVWRdRr), DestReg)
+           .addReg(SrcReg, KillFlag);
+       return;
+     }
+
+     // No MOVW: copy the low and the high byte with one `MOV` each.
+     unsigned DstLow, DstHigh, SrcLow, SrcHigh;
+     RegInfo.splitReg(DestReg, DstLow, DstHigh);
+     RegInfo.splitReg(SrcReg, SrcLow, SrcHigh);
+
+     BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DstLow).addReg(SrcLow, KillFlag);
+     BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DstHigh).addReg(SrcHigh, KillFlag);
+     return;
+   }
+
+   unsigned CopyOpc;
+   if (AVR::GPR8RegClass.contains(DestReg, SrcReg))
+     CopyOpc = AVR::MOVRdRr;
+   else if (SrcReg == AVR::SP && AVR::DREGSRegClass.contains(DestReg))
+     CopyOpc = AVR::SPREAD;
+   else if (DestReg == AVR::SP && AVR::DREGSRegClass.contains(SrcReg))
+     CopyOpc = AVR::SPWRITE;
+   else
+     llvm_unreachable("Impossible reg-to-reg copy");
+
+   BuildMI(MBB, MI, DL, get(CopyOpc), DestReg).addReg(SrcReg, KillFlag);
+ }
+
+ /// If MI is an LDDRdPtrQ / LDDWRdYQ whose address is a frame index with a
+ /// zero immediate displacement, stores that frame index in FrameIndex and
+ /// returns the loaded register. Returns 0 (leaving FrameIndex untouched)
+ /// otherwise.
+ unsigned AVRInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                            int &FrameIndex) const {
+   const unsigned Opc = MI.getOpcode();
+
+   //:FIXME: remove the LDDWRdYQ case once PR13375 gets fixed
+   if (Opc != AVR::LDDRdPtrQ && Opc != AVR::LDDWRdYQ)
+     return 0;
+
+   const bool IsPlainSlotAccess = MI.getOperand(1).isFI() &&
+                                  MI.getOperand(2).isImm() &&
+                                  MI.getOperand(2).getImm() == 0;
+   if (!IsPlainSlotAccess)
+     return 0;
+
+   FrameIndex = MI.getOperand(1).getIndex();
+   return MI.getOperand(0).getReg();
+ }
+
+ /// If MI is an STDPtrQRr / STDWPtrQRr whose address is a frame index with a
+ /// zero immediate displacement, stores that frame index in FrameIndex and
+ /// returns the stored register. Returns 0 (leaving FrameIndex untouched)
+ /// otherwise.
+ unsigned AVRInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                           int &FrameIndex) const {
+   const unsigned Opc = MI.getOpcode();
+
+   if (Opc != AVR::STDPtrQRr && Opc != AVR::STDWPtrQRr)
+     return 0;
+
+   const bool IsPlainSlotAccess = MI.getOperand(0).isFI() &&
+                                  MI.getOperand(1).isImm() &&
+                                  MI.getOperand(1).getImm() == 0;
+   if (!IsPlainSlotAccess)
+     return 0;
+
+   FrameIndex = MI.getOperand(0).getIndex();
+   return MI.getOperand(2).getReg();
+ }
+
+ /// Inserts before MI a store of SrcReg to stack slot FrameIndex and records
+ /// in the function info that the function has spills.
+ ///
+ /// Register classes holding i8 use STDPtrQRr, those holding i16 use
+ /// STDWPtrQRr; any other class is a programming error (llvm_unreachable).
+ /// The store takes its debug location from MI when MI is not the block end.
+ void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned SrcReg, bool isKill,
+                                        int FrameIndex,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+   MachineFunction &Func = *MBB.getParent();
+   Func.getInfo<AVRMachineFunctionInfo>()->setHasSpills(true);
+
+   DebugLoc Loc;
+   if (MI != MBB.end())
+     Loc = MI->getDebugLoc();
+
+   const MachineFrameInfo &FrameInfo = Func.getFrameInfo();
+   MachineMemOperand *MemOp = Func.getMachineMemOperand(
+       MachinePointerInfo::getFixedStack(Func, FrameIndex),
+       MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
+       FrameInfo.getObjectAlignment(FrameIndex));
+
+   // Pick the store width from the register class.
+   unsigned StoreOpc = 0;
+   if (RC->hasType(MVT::i8))
+     StoreOpc = AVR::STDPtrQRr;
+   else if (RC->hasType(MVT::i16))
+     StoreOpc = AVR::STDWPtrQRr;
+   else
+     llvm_unreachable("Cannot store this register into a stack slot!");
+
+   BuildMI(MBB, MI, Loc, get(StoreOpc))
+       .addFrameIndex(FrameIndex)
+       .addImm(0)
+       .addReg(SrcReg, getKillRegState(isKill))
+       .addMemOperand(MemOp);
+ }
+
+ /// Inserts before MI a load of DestReg from stack slot FrameIndex.
+ ///
+ /// Register classes holding i8 use LDDRdPtrQ, those holding i16 use
+ /// LDDWRdYQ; any other class is a programming error (llvm_unreachable).
+ /// The load takes its debug location from MI when MI is not the block end.
+ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         unsigned DestReg, int FrameIndex,
+                                         const TargetRegisterClass *RC,
+                                         const TargetRegisterInfo *TRI) const {
+   DebugLoc Loc;
+   if (MI != MBB.end())
+     Loc = MI->getDebugLoc();
+
+   MachineFunction &Func = *MBB.getParent();
+   const MachineFrameInfo &FrameInfo = Func.getFrameInfo();
+   MachineMemOperand *MemOp = Func.getMachineMemOperand(
+       MachinePointerInfo::getFixedStack(Func, FrameIndex),
+       MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
+       FrameInfo.getObjectAlignment(FrameIndex));
+
+   // Pick the load width from the register class.
+   unsigned LoadOpc = 0;
+   if (RC->hasType(MVT::i8))
+     LoadOpc = AVR::LDDRdPtrQ;
+   else if (RC->hasType(MVT::i16))
+     // Would be AVR::LDDWRdPtrQ.
+     //:FIXME: remove this once PR13375 gets fixed
+     LoadOpc = AVR::LDDWRdYQ;
+   else
+     llvm_unreachable("Cannot load this register from a stack slot!");
+
+   BuildMI(MBB, MI, Loc, get(LoadOpc), DestReg)
+       .addFrameIndex(FrameIndex)
+       .addImm(0)
+       .addMemOperand(MemOp);
+ }
+
+ /// Returns the descriptor of the conditional branch instruction that tests
+ /// condition code CC. A code outside the eight handled ones is a programming
+ /// error (llvm_unreachable).
+ const MCInstrDesc &AVRInstrInfo::getBrCond(AVRCC::CondCodes CC) const {
+   unsigned BranchOpc;
+
+   switch (CC) {
+   case AVRCC::COND_EQ: BranchOpc = AVR::BREQk; break;
+   case AVRCC::COND_NE: BranchOpc = AVR::BRNEk; break;
+   case AVRCC::COND_GE: BranchOpc = AVR::BRGEk; break;
+   case AVRCC::COND_LT: BranchOpc = AVR::BRLTk; break;
+   case AVRCC::COND_SH: BranchOpc = AVR::BRSHk; break;
+   case AVRCC::COND_LO: BranchOpc = AVR::BRLOk; break;
+   case AVRCC::COND_MI: BranchOpc = AVR::BRMIk; break;
+   case AVRCC::COND_PL: BranchOpc = AVR::BRPLk; break;
+   default:
+     llvm_unreachable("Unknown condition code!");
+   }
+
+   return get(BranchOpc);
+ }
+
+ /// Inverse of getBrCond: returns the condition code tested by conditional
+ /// branch opcode Opc, or AVRCC::COND_INVALID when Opc is not one of the eight
+ /// conditional branches handled here.
+ AVRCC::CondCodes AVRInstrInfo::getCondFromBranchOpc(unsigned Opc) const {
+   AVRCC::CondCodes Code = AVRCC::COND_INVALID;
+
+   switch (Opc) {
+   case AVR::BREQk: Code = AVRCC::COND_EQ; break;
+   case AVR::BRNEk: Code = AVRCC::COND_NE; break;
+   case AVR::BRSHk: Code = AVRCC::COND_SH; break;
+   case AVR::BRLOk: Code = AVRCC::COND_LO; break;
+   case AVR::BRMIk: Code = AVRCC::COND_MI; break;
+   case AVR::BRPLk: Code = AVRCC::COND_PL; break;
+   case AVR::BRGEk: Code = AVRCC::COND_GE; break;
+   case AVR::BRLTk: Code = AVRCC::COND_LT; break;
+   default:
+     break;
+   }
+
+   return Code;
+ }
+
+ /// Returns the logical negation of condition code CC. The codes pair up as
+ /// EQ/NE, SH/LO, GE/LT and MI/PL; anything else is a programming error
+ /// (llvm_unreachable).
+ AVRCC::CondCodes AVRInstrInfo::getOppositeCondition(AVRCC::CondCodes CC) const {
+   switch (CC) {
+   // Equality.
+   case AVRCC::COND_EQ: return AVRCC::COND_NE;
+   case AVRCC::COND_NE: return AVRCC::COND_EQ;
+   // Same-or-higher / lower.
+   case AVRCC::COND_SH: return AVRCC::COND_LO;
+   case AVRCC::COND_LO: return AVRCC::COND_SH;
+   // Greater-or-equal / less-than.
+   case AVRCC::COND_GE: return AVRCC::COND_LT;
+   case AVRCC::COND_LT: return AVRCC::COND_GE;
+   // Minus / plus.
+   case AVRCC::COND_MI: return AVRCC::COND_PL;
+   case AVRCC::COND_PL: return AVRCC::COND_MI;
+   default:
+     llvm_unreachable("Invalid condition!");
+   }
+ }
+
+ /// Analyzes the terminators at the end of MBB.
+ ///
+ /// Walks the block bottom-up over its terminators, skipping debug values.
+ ///
+ /// \param MBB         block to analyze (and to rewrite when AllowModify).
+ /// \param TBB         set to the branch destination: the unconditional
+ ///                    target, or the conditional target once a conditional
+ ///                    branch is seen; set to null when a trailing RJMP to the
+ ///                    layout successor is removed.
+ /// \param FBB         set to the former TBB (the unconditional target, if
+ ///                    any) when a conditional branch is recorded.
+ /// \param Cond        receives a single immediate operand holding the
+ ///                    AVRCC condition code of the conditional branch.
+ /// \param AllowModify when true the block may be simplified: instructions
+ ///                    after an RJMP are erased, an RJMP to the layout
+ ///                    successor is erased, and a `jCC L1; jmp L2; L1:`
+ ///                    sequence is rewritten with the inverted condition.
+ ///
+ /// \returns false when the terminators were understood, true when the block
+ /// cannot be analyzed (non-branch terminator, branch that is neither RJMP
+ /// nor a known conditional branch, or conditional branches that disagree).
+ bool AVRInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                  MachineBasicBlock *&TBB,
+                                  MachineBasicBlock *&FBB,
+                                  SmallVectorImpl<MachineOperand> &Cond,
+                                  bool AllowModify) const {
+   // Start from the bottom of the block and work up, examining the
+   // terminator instructions.
+   MachineBasicBlock::iterator I = MBB.end();
+   MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+
+   while (I != MBB.begin()) {
+     --I;
+     if (I->isDebugValue()) {
+       continue;
+     }
+
+     // Working from the bottom, when we see a non-terminator
+     // instruction, we're done.
+     if (!isUnpredicatedTerminator(*I)) {
+       break;
+     }
+
+     // A terminator that isn't a branch can't easily be handled
+     // by this analysis.
+     if (!I->getDesc().isBranch()) {
+       return true;
+     }
+
+     // Handle unconditional branches.
+     //:TODO: add here jmp
+     if (I->getOpcode() == AVR::RJMPk) {
+       UnCondBrIter = I;
+
+       if (!AllowModify) {
+         TBB = I->getOperand(0).getMBB();
+         continue;
+       }
+
+       // If the block has any instructions after a JMP, delete them.
+       while (std::next(I) != MBB.end()) {
+         std::next(I)->eraseFromParent();
+       }
+
+       Cond.clear();
+       FBB = nullptr;
+
+       // Delete the JMP if it's equivalent to a fall-through.
+       if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+         TBB = nullptr;
+         I->eraseFromParent();
+         // Restart the scan from the (new) end of the block.
+         I = MBB.end();
+         UnCondBrIter = MBB.end();
+         continue;
+       }
+
+       // TBB is used to indicate the unconditional destination.
+       TBB = I->getOperand(0).getMBB();
+       continue;
+     }
+
+     // Handle conditional branches.
+     AVRCC::CondCodes BranchCode = getCondFromBranchOpc(I->getOpcode());
+     if (BranchCode == AVRCC::COND_INVALID) {
+       return true; // Can't handle indirect branch.
+     }
+
+     // Working from the bottom, handle the first conditional branch.
+     if (Cond.empty()) {
+       MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+       if (AllowModify && UnCondBrIter != MBB.end() &&
+           MBB.isLayoutSuccessor(TargetBB)) {
+         // If we can modify the code and it ends in something like:
+         //
+         //   jCC L1
+         //   jmp L2
+         // L1:
+         //   ...
+         // L2:
+         //
+         // Then we can change this to:
+         //
+         //   jnCC L2
+         // L1:
+         //   ...
+         // L2:
+         //
+         // Which is a bit more efficient.
+         // We conditionally jump to the fall-through block.
+         BranchCode = getOppositeCondition(BranchCode);
+         unsigned JNCC = getBrCond(BranchCode).getOpcode();
+         MachineBasicBlock::iterator OldInst = I;
+
+         // Insert the inverted branch to the old jump target, followed by a
+         // jump to the old conditional target, then drop the two originals.
+         // The new jump targets the layout successor, so the restarted scan
+         // below can remove it as a fall-through.
+         BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+             .addMBB(UnCondBrIter->getOperand(0).getMBB());
+         BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(AVR::RJMPk))
+             .addMBB(TargetBB);
+
+         OldInst->eraseFromParent();
+         UnCondBrIter->eraseFromParent();
+
+         // Restart the analysis.
+         UnCondBrIter = MBB.end();
+         I = MBB.end();
+         continue;
+       }
+
+       FBB = TBB;
+       TBB = I->getOperand(0).getMBB();
+       Cond.push_back(MachineOperand::CreateImm(BranchCode));
+       continue;
+     }
+
+     // Handle subsequent conditional branches. Only handle the case where all
+     // conditional branches branch to the same destination.
+     assert(Cond.size() == 1);
+     assert(TBB);
+
+     if (TBB != I->getOperand(0).getMBB()) {
+       return true;
+     }
+
+     AVRCC::CondCodes OldBranchCode = (AVRCC::CondCodes)Cond[0].getImm();
+     // If the conditions are the same, we can leave them alone.
+     if (OldBranchCode == BranchCode) {
+       continue;
+     }
+
+     return true;
+   }
+
+   return false;
+ }
+
+/// Appends branch code to the end of \p MBB.
+///
+/// With an empty \p Cond a single unconditional `RJMPk` to \p TBB is emitted.
+/// Otherwise a conditional branch to \p TBB is emitted, selected by the
+/// condition code stored as an immediate in `Cond[0]`, followed by an `RJMPk`
+/// to \p FBB when \p FBB is non-null.
+///
+/// \param BytesAdded if non-null, receives the summed size in bytes of the
+///        inserted instructions, as reported by getInstSizeInBytes().
+/// \returns the number of instructions inserted (1 or 2).
+unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    ArrayRef<MachineOperand> Cond,
+                                    const DebugLoc &DL,
+                                    int *BytesAdded) const {
+  // Start the size report from zero so callers need not pre-initialise it.
+  if (BytesAdded) {
+    *BytesAdded = 0;
+  }
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "AVR branch conditions have one component!");
+
+  if (Cond.empty()) {
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    MachineInstr *Jump = BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(TBB);
+    if (BytesAdded) {
+      *BytesAdded += getInstSizeInBytes(*Jump);
+    }
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  AVRCC::CondCodes CC = (AVRCC::CondCodes)Cond[0].getImm();
+  MachineInstr *CondBr = BuildMI(&MBB, DL, getBrCond(CC)).addMBB(TBB);
+  if (BytesAdded) {
+    *BytesAdded += getInstSizeInBytes(*CondBr);
+  }
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    MachineInstr *Jump = BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(FBB);
+    if (BytesAdded) {
+      *BytesAdded += getInstSizeInBytes(*Jump);
+    }
+    ++Count;
+  }
+
+  return Count;
+}
+
+/// Erases the branch instructions at the end of \p MBB.
+///
+/// Walks upwards from the end of the block, skipping debug values, and erases
+/// every `RJMPk` and every opcode that getCondFromBranchOpc() recognises as a
+/// conditional branch. The walk stops at the first other instruction.
+///
+/// \param BytesRemoved if non-null, receives the summed size in bytes of the
+///        erased instructions, as reported by getInstSizeInBytes().
+/// \returns the number of instructions erased.
+unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                    int *BytesRemoved) const {
+  // Start the size report from zero so callers need not pre-initialise it.
+  if (BytesRemoved) {
+    *BytesRemoved = 0;
+  }
+
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue()) {
+      continue;
+    }
+    //:TODO: add here the missing jmp instructions once they are implemented
+    // like jmp, {e}ijmp, and other cond branches, ...
+    if (I->getOpcode() != AVR::RJMPk &&
+        getCondFromBranchOpc(I->getOpcode()) == AVRCC::COND_INVALID) {
+      break;
+    }
+
+    // Record the size before the instruction is destroyed.
+    if (BytesRemoved) {
+      *BytesRemoved += getInstSizeInBytes(*I);
+    }
+
+    // Remove the branch, then restart from the (new) end of the block because
+    // the erased iterator is no longer valid.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+/// Replaces the single condition code held in \p Cond with the code returned
+/// by getOppositeCondition() for it. Returns false unconditionally.
+bool AVRInstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid AVR branch condition!");
+
+  // The condition code travels as an immediate operand.
+  MachineOperand &CondOp = Cond[0];
+  const AVRCC::CondCodes Current =
+      static_cast<AVRCC::CondCodes>(CondOp.getImm());
+  const AVRCC::CondCodes Flipped = getOppositeCondition(Current);
+  CondOp.setImm(Flipped);
+
+  return false;
+}
+
+/// Returns the size of \p MI in bytes.
+///
+/// EH_LABEL, IMPLICIT_DEF, KILL and DBG_VALUE are reported as zero bytes.
+/// For INLINEASM the length is obtained from getInlineAsmLength() on the
+/// assembly string in operand 0. Every other opcode reports the size recorded
+/// in its instruction descriptor.
+unsigned AVRInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+
+  // Opcodes that are reported with a size of zero.
+  if (Opc == TargetOpcode::EH_LABEL || Opc == TargetOpcode::IMPLICIT_DEF ||
+      Opc == TargetOpcode::KILL || Opc == TargetOpcode::DBG_VALUE) {
+    return 0;
+  }
+
+  // Inline assembly: measure the assembly text itself.
+  if (Opc == TargetOpcode::INLINEASM) {
+    const MachineFunction &Fn = *MI.getParent()->getParent();
+    const AVRTargetMachine &Machine =
+        static_cast<const AVRTargetMachine &>(Fn.getTarget());
+    const AVRSubtarget &Sub = Fn.getSubtarget<AVRSubtarget>();
+    const TargetInstrInfo &InstrInfo = *Sub.getInstrInfo();
+    const char *AsmText = MI.getOperand(0).getSymbolName();
+
+    return InstrInfo.getInlineAsmLength(AsmText, *Machine.getMCAsmInfo());
+  }
+
+  // A regular instruction.
+  return get(Opc).getSize();
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h
new file mode 100644
index 000000000000..c5105dafe5eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h
@@ -0,0 +1,112 @@
+//===-- AVRInstrInfo.h - AVR Instruction Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_INSTR_INFO_H
+#define LLVM_AVR_INSTR_INFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+
+#include "AVRRegisterInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "AVRGenInstrInfo.inc"
+#undef GET_INSTRINFO_HEADER
+
+namespace llvm {
+
+namespace AVRCC {
+
+/// AVR specific condition codes.
+///
+/// These correspond to the `AVR_COND_*` pattern leaves in `AVRInstrInfo.td`,
+/// which spell out the same enumerators as the `i8` constants 0 through 7 in
+/// this declaration order. The two lists must be kept in sync: inserting or
+/// reordering an enumerator here changes its implicit numeric value.
+enum CondCodes {
+ COND_EQ, //!< Equal (0)
+ COND_NE, //!< Not equal (1)
+ COND_GE, //!< Greater than or equal (2)
+ COND_LT, //!< Less than (3)
+ COND_SH, //!< Unsigned same or higher (4)
+ COND_LO, //!< Unsigned lower (5)
+ COND_MI, //!< Minus (6)
+ COND_PL, //!< Plus (7)
+ COND_INVALID //!< Sentinel: not one of the condition codes above
+};
+
+} // end of namespace AVRCC
+
+namespace AVRII {
+
+/// Specifies a target operand flag.
+///
+/// Apart from MO_NO_FLAG (value 0) each enumerator occupies its own bit.
+/// NOTE(review): the flags start at bit 1; bit 0 is not assigned by this
+/// enum - confirm whether that is intentional before adding new flags.
+enum TOF {
+ /// No flag is set.
+ MO_NO_FLAG,
+
+ /// On a symbol operand, this represents the lo part.
+ MO_LO = (1 << 1),
+
+ /// On a symbol operand, this represents the hi part.
+ MO_HI = (1 << 2),
+
+ /// On a symbol operand, this represents it has to be negated.
+ MO_NEG = (1 << 3)
+};
+
+} // end of namespace AVRII
+
+/// Utilities related to the AVR instruction set.
+///
+/// Extends the TableGen-generated AVRGenInstrInfo with the AVR overrides of
+/// the instruction-info hooks declared below, plus a few condition-code
+/// helpers used by the branch handling code.
+class AVRInstrInfo : public AVRGenInstrInfo {
+public:
+ explicit AVRInstrInfo();
+
+ /// Returns the register info object owned by this instance.
+ const AVRRegisterInfo &getRegisterInfo() const { return RI; }
+ /// Returns the descriptor of the conditional branch instruction used for
+ /// condition code \p CC.
+ const MCInstrDesc &getBrCond(AVRCC::CondCodes CC) const;
+ /// Maps a branch opcode to its condition code; callers treat a result of
+ /// AVRCC::COND_INVALID as "not a conditional branch that can be handled".
+ AVRCC::CondCodes getCondFromBranchOpc(unsigned Opc) const;
+ /// Returns the condition code used to invert a branch on \p CC.
+ AVRCC::CondCodes getOppositeCondition(AVRCC::CondCodes CC) const;
+ /// Returns the size of \p MI in bytes: zero for EH_LABEL, IMPLICIT_DEF, KILL
+ /// and DBG_VALUE, the inline-asm length for INLINEASM, and the descriptor
+ /// size for everything else.
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ // Register copy and stack-slot hooks (overrides of the base-class
+ // interface).
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ // Branch analysis.
+
+ /// Examines the terminators of \p MBB from the bottom up. Returns false when
+ /// they were understood, with \p TBB, \p FBB and \p Cond (a single immediate
+ /// holding an AVRCC::CondCodes value) describing them, and true when the
+ /// block cannot be analyzed. With \p AllowModify set the block may be
+ /// rewritten during the analysis.
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+ /// Appends an unconditional branch (empty \p Cond) or a conditional branch
+ /// plus optional unconditional branch to \p FBB at the end of \p MBB.
+ /// Returns the number of instructions inserted.
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+ /// Erases the trailing `RJMPk` and recognised conditional branches of
+ /// \p MBB, skipping debug values. Returns the number of instructions erased.
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ /// Replaces the single condition code in \p Cond with its opposite and
+ /// returns false.
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+private:
+ /// Register info instance handed out by getRegisterInfo().
+ const AVRRegisterInfo RI;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_INSTR_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
new file mode 100644
index 000000000000..bc66379ab708
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -0,0 +1,2047 @@
+//===-- AVRInstrInfo.td - AVR Instruction defs -------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the AVR instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "AVRInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// AVR Type Profiles
+//===----------------------------------------------------------------------===//
+
+def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def SDT_AVRBrcond : SDTypeProfile<0, 2,
+ [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>;
+def SDT_AVRCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_AVRTst : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_AVRSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+//===----------------------------------------------------------------------===//
+// AVR Specific Node Definitions
+//===----------------------------------------------------------------------===//
+
+def AVRretflag : SDNode<"AVRISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def AVRretiflag : SDNode<"AVRISD::RETI_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def AVRcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AVRCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def AVRcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_AVRCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AVRcall : SDNode<"AVRISD::CALL", SDT_AVRCall,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+
+def AVRWrapper : SDNode<"AVRISD::WRAPPER", SDT_AVRWrapper>;
+
+def AVRbrcond : SDNode<"AVRISD::BRCOND", SDT_AVRBrcond,
+ [SDNPHasChain, SDNPInGlue]>;
+def AVRcmp : SDNode<"AVRISD::CMP", SDT_AVRCmp, [SDNPOutGlue]>;
+def AVRcmpc : SDNode<"AVRISD::CMPC", SDT_AVRCmp, [SDNPInGlue, SDNPOutGlue]>;
+def AVRtst : SDNode<"AVRISD::TST", SDT_AVRTst, [SDNPOutGlue]>;
+def AVRselectcc: SDNode<"AVRISD::SELECT_CC", SDT_AVRSelectCC, [SDNPInGlue]>;
+
+// Shift nodes.
+def AVRlsl : SDNode<"AVRISD::LSL", SDTIntUnaryOp>;
+def AVRlsr : SDNode<"AVRISD::LSR", SDTIntUnaryOp>;
+def AVRrol : SDNode<"AVRISD::ROL", SDTIntUnaryOp>;
+def AVRror : SDNode<"AVRISD::ROR", SDTIntUnaryOp>;
+def AVRasr : SDNode<"AVRISD::ASR", SDTIntUnaryOp>;
+
+// Pseudo shift nodes for non-constant shift amounts.
+def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
+def AVRlsrLoop : SDNode<"AVRISD::LSRLOOP", SDTIntShiftOp>;
+def AVRasrLoop : SDNode<"AVRISD::ASRLOOP", SDTIntShiftOp>;
+
+//===----------------------------------------------------------------------===//
+// AVR Operands, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// Negates an immediate and emits the result as an i8 target constant.
+def imm8_neg_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(-N->getAPIntValue(), SDLoc(N), MVT::i8);
+}]>;
+
+// Negates an immediate and emits the result as an i16 target constant.
+def imm16_neg_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(-N->getAPIntValue(), SDLoc(N), MVT::i16);
+}]>;
+
+// Matches an immediate whose negation lies in the range [0, 63]. On a match
+// the operand is rewritten by imm16_neg_XFORM, i.e. emitted as the negated
+// value in an i16 target constant.
+def imm0_63_neg : PatLeaf<(imm),
+[{
+ int64_t val = -N->getSExtValue();
+ return val >= 0 && val < 64;
+}], imm16_neg_XFORM>;
+
+def uimm6 : PatLeaf<(imm), [{ return isUInt<6>(N->getZExtValue()); }]>;
+
+// Truncates the immediate to 8 bits, subtracts 0x20 and emits the result as
+// an i8 target constant.
+// NOTE(review): presumably this converts a data-space address into the
+// corresponding I/O port number - confirm against the users of the ioaddr*
+// leaves.
+def ioaddr_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()) - 0x20, SDLoc(N), MVT::i8);
+}]>;
+
+// Emits Log2_32 of the low 8 bits of the immediate as an i8 target constant.
+// For the single-bit values accepted by iobitpos8 this is the index of the
+// set bit.
+def iobitpos8_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(Log2_32(uint8_t(N->getZExtValue())),
+ SDLoc(N), MVT::i8);
+}]>;
+
+// Same as iobitpos8_XFORM, but applied to the bitwise complement of the
+// immediate: for the values accepted by iobitposn8 this is the index of the
+// single clear bit in the low byte.
+def iobitposn8_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(Log2_32(uint8_t(~N->getZExtValue())),
+ SDLoc(N), MVT::i8);
+}]>;
+
+// Matches an immediate address in [0x20, 0x60); the operand is rewritten by
+// ioaddr_XFORM (address minus 0x20).
+def ioaddr8 : PatLeaf<(imm),
+[{
+ uint64_t val = N->getZExtValue();
+ return val >= 0x20 && val < 0x60;
+}], ioaddr_XFORM>;
+
+// Matches an immediate address in [0x20, 0x40); the operand is rewritten by
+// ioaddr_XFORM (address minus 0x20).
+def lowioaddr8 : PatLeaf<(imm),
+[{
+ uint64_t val = N->getZExtValue();
+ return val >= 0x20 && val < 0x40;
+}], ioaddr_XFORM>;
+
+// Matches an immediate address in [0x20, 0x5f); the operand is rewritten by
+// ioaddr_XFORM (address minus 0x20).
+// NOTE(review): the upper bound is one lower than ioaddr8's, presumably so
+// that the second byte of a 16-bit access (address + 1) stays inside the same
+// range - confirm.
+def ioaddr16 : PatLeaf<(imm),
+[{
+ uint64_t val = N->getZExtValue();
+ return val >= 0x20 && val < 0x5f;
+}], ioaddr_XFORM>;
+
+// Matches an immediate whose low 8 bits are a power of two (exactly one bit
+// set); the operand is rewritten by iobitpos8_XFORM.
+def iobitpos8 : PatLeaf<(imm),
+[{
+ return isPowerOf2_32(uint8_t(N->getZExtValue()));
+}], iobitpos8_XFORM>;
+
+// Matches an immediate whose complemented low 8 bits are a power of two
+// (exactly one bit clear in the low byte); the operand is rewritten by
+// iobitposn8_XFORM.
+def iobitposn8 : PatLeaf<(imm),
+[{
+ return isPowerOf2_32(uint8_t(~N->getZExtValue()));
+}], iobitposn8_XFORM>;
+
+def MemriAsmOperand : AsmOperandClass {
+ let Name = "Memri";
+ let ParserMethod = "parseMemriOperand";
+}
+
+/// Address operand for `reg+imm` used by STD and LDD.
+def memri : Operand<iPTR>
+{
+ let MIOperandInfo = (ops PTRDISPREGS, i16imm);
+
+ let PrintMethod = "printMemri";
+ let EncoderMethod = "encodeMemri";
+
+ let ParserMatchClass = MemriAsmOperand;
+}
+
+// Address operand for `SP+imm` used by STD{W}SPQRr
+def memspi : Operand<iPTR>
+{
+ let MIOperandInfo = (ops GPRSP, i16imm);
+}
+
+def imm_com8 : Operand<i8>
+{
+ let EncoderMethod = "encodeComplement";
+
+ let MIOperandInfo = (ops i8imm);
+}
+
+def relbrtarget_7 : Operand<OtherVT>
+{
+ let PrintMethod = "printPCRelImm";
+ let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_7_pcrel>";
+}
+
+def brtarget_13 : Operand<OtherVT>
+{
+ let PrintMethod = "printPCRelImm";
+ let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
+}
+
+// The target of a 22 or 16-bit call/jmp instruction.
+def call_target : Operand<iPTR>
+{
+ let EncoderMethod = "encodeCallTarget";
+}
+
+// A 16-bit address (which can lead to an R_AVR_16 relocation).
+def imm16 : Operand<i16>
+{
+ let EncoderMethod = "encodeImm<AVR::fixup_16>";
+}
+
+/// A 6-bit immediate used in the ADIW/SBIW instructions.
+def imm_arith6 : Operand<i16>
+{
+ let EncoderMethod = "encodeImm<AVR::fixup_6_adiw>";
+}
+
+/// An 8-bit immediate inside an instruction with the same format
+/// as the `LDI` instruction (the `FRdK` format).
+def imm_ldi8 : Operand<i8>
+{
+ let EncoderMethod = "encodeImm<AVR::fixup_ldi>";
+}
+
+/// A 5-bit port number used in SBIC and friends (the `FIOBIT` format).
+def imm_port5 : Operand<i8>
+{
+ let EncoderMethod = "encodeImm<AVR::fixup_port5>";
+}
+
+/// A 6-bit port number used in the `IN` instruction and friends (the
+/// `FIORdA` format).
+def imm_port6 : Operand<i8>
+{
+ let EncoderMethod = "encodeImm<AVR::fixup_port6>";
+}
+
+// Addressing mode pattern reg+imm6
+def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], [SDNPWantRoot]>;
+
+// AsmOperand class for a pointer register.
+// Used with the LD/ST family of instructions.
+// See FSTLD in AVRInstrFormats.td
+def PtrRegAsmOperand : AsmOperandClass
+{
+ let Name = "Reg";
+}
+
+// A special operand type for the LD/ST instructions.
+// It converts the pointer register number into a two-bit field used in the
+// instruction.
+def LDSTPtrReg : Operand<i16>
+{
+ let MIOperandInfo = (ops PTRREGS);
+ let EncoderMethod = "encodeLDSTPtrReg";
+
+ let ParserMatchClass = PtrRegAsmOperand;
+}
+
+// A special operand type for the LDD/STD instructions.
+// It behaves identically to the LD/ST version, except restricts
+// the pointer registers to Y and Z.
+def LDDSTDPtrReg : Operand<i16>
+{
+ let MIOperandInfo = (ops PTRDISPREGS);
+ let EncoderMethod = "encodeLDSTPtrReg";
+
+ let ParserMatchClass = PtrRegAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// AVR predicates for subtarget features
+//===----------------------------------------------------------------------===//
+
+def HasSRAM : Predicate<"Subtarget->hasSRAM()">,
+ AssemblerPredicate<"FeatureSRAM">;
+
+def HasJMPCALL : Predicate<"Subtarget->hasJMPCALL()">,
+ AssemblerPredicate<"FeatureJMPCALL">;
+
+def HasIJMPCALL : Predicate<"Subtarget->hasIJMPCALL()">,
+ AssemblerPredicate<"FeatureIJMPCALL">;
+
+def HasEIJMPCALL : Predicate<"Subtarget->hasEIJMPCALL()">,
+ AssemblerPredicate<"FeatureEIJMPCALL">;
+
+def HasADDSUBIW : Predicate<"Subtarget->hasADDSUBIW()">,
+ AssemblerPredicate<"FeatureADDSUBIW">;
+
+// NOTE(review): the accessor is spelled `HasSmallStack()` here, whereas every
+// other predicate in this list calls a lower-camel-case accessor
+// (`hasSRAM()`, `hasMOVW()`, `supportsRMW()`, ...) - confirm that this matches
+// the declaration in AVRSubtarget.
+def HasSmallStack : Predicate<"Subtarget->HasSmallStack()">,
+ AssemblerPredicate<"FeatureSmallStack">;
+
+def HasMOVW : Predicate<"Subtarget->hasMOVW()">,
+ AssemblerPredicate<"FeatureMOVW">;
+
+def HasLPM : Predicate<"Subtarget->hasLPM()">,
+ AssemblerPredicate<"FeatureLPM">;
+
+def HasLPMX : Predicate<"Subtarget->hasLPMX()">,
+ AssemblerPredicate<"FeatureLPMX">;
+
+def HasELPM : Predicate<"Subtarget->hasELPM()">,
+ AssemblerPredicate<"FeatureELPM">;
+
+def HasELPMX : Predicate<"Subtarget->hasELPMX()">,
+ AssemblerPredicate<"FeatureELPMX">;
+
+def HasSPM : Predicate<"Subtarget->hasSPM()">,
+ AssemblerPredicate<"FeatureSPM">;
+
+def HasSPMX : Predicate<"Subtarget->hasSPMX()">,
+ AssemblerPredicate<"FeatureSPMX">;
+
+def HasDES : Predicate<"Subtarget->hasDES()">,
+ AssemblerPredicate<"FeatureDES">;
+
+def SupportsRMW : Predicate<"Subtarget->supportsRMW()">,
+ AssemblerPredicate<"FeatureRMW">;
+
+def SupportsMultiplication : Predicate<"Subtarget->supportsMultiplication()">,
+ AssemblerPredicate<"FeatureMultiplication">;
+
+def HasBREAK : Predicate<"Subtarget->hasBREAK()">,
+ AssemblerPredicate<"FeatureBREAK">;
+
+def HasTinyEncoding : Predicate<"Subtarget->hasTinyEncoding()">,
+ AssemblerPredicate<"FeatureTinyEncoding">;
+
+
+// AVR specific condition codes. These correspond to AVRCC::CondCodes in
+// AVRInstrInfo.h. They must be kept in sync.
+def AVR_COND_EQ : PatLeaf<(i8 0)>;
+def AVR_COND_NE : PatLeaf<(i8 1)>;
+def AVR_COND_GE : PatLeaf<(i8 2)>;
+def AVR_COND_LT : PatLeaf<(i8 3)>;
+def AVR_COND_SH : PatLeaf<(i8 4)>;
+def AVR_COND_LO : PatLeaf<(i8 5)>;
+def AVR_COND_MI : PatLeaf<(i8 6)>;
+def AVR_COND_PL : PatLeaf<(i8 7)>;
+
+
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// AVR Instruction list
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SREG.
+let Defs = [SP, SREG],
+Uses = [SP] in
+{
+ def ADJCALLSTACKDOWN : Pseudo<(outs),
+ (ins i16imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(AVRcallseq_start timm:$amt)]>;
+
+ // R31R30 is used to update SP, since it is a scratch reg and this instruction
+ // is placed after the function call then R31R30 should be always free.
+ //let Defs = [R31R30],
+ //Uses = [R31R30] in
+ //:TODO: if we enable this, the pseudo is killed because it looks dead
+ def ADJCALLSTACKUP : Pseudo<(outs),
+ (ins i16imm:$amt1, i16imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(AVRcallseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+let isCommutable = 1,
+Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ // ADD Rd, Rr
+ // Adds two 8-bit registers.
+ def ADDRdRr : FRdRr<0b0000,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "add\t$rd, $rr",
+ [(set i8:$rd, (add i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // ADDW Rd+1:Rd, Rr+1:Rr
+ // Pseudo instruction to add four 8-bit registers as two 16-bit values.
+ //
+ // Expands to:
+ // add Rd, Rr
+ // adc Rd+1, Rr+1
+ def ADDWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "addw\t$rd, $rr",
+ [(set i16:$rd, (add i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ // ADC Rd, Rr
+ // Adds two 8-bit registers with carry.
+ let Uses = [SREG] in
+ def ADCRdRr : FRdRr<0b0001,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "adc\t$rd, $rr",
+ [(set i8:$rd, (adde i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // ADCW Rd+1:Rd, Rr+1:Rr
+ // Pseudo instruction to add four 8-bit registers as two 16-bit values with
+ // carry.
+ //
+ // Expands to:
+ // adc Rd, Rr
+ // adc Rd+1, Rr+1
+ let Uses = [SREG] in
+ def ADCWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "adcw\t$rd, $rr",
+ [(set i16:$rd, (adde i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ // ADIW Rd, k
+ // Adds an immediate 6-bit value K to Rd, placing the result in Rd.
+ def ADIWRdK : FWRdK<0b0,
+ (outs IWREGS:$rd),
+ (ins IWREGS:$src, imm_arith6:$k),
+ "adiw\t$rd, $k",
+ [(set i16:$rd, (add i16:$src, uimm6:$k)),
+ (implicit SREG)]>,
+ Requires<[HasADDSUBIW]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ // SUB Rd, Rr
+ // Subtracts the 8-bit value of Rr from Rd and places the value in Rd.
+ def SUBRdRr : FRdRr<0b0001,
+ 0b10,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "sub\t$rd, $rr",
+ [(set i8:$rd, (sub i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // SUBW Rd+1:Rd, Rr+1:Rr
+ // Subtracts two 16-bit values and places the result into Rd.
+ //
+ // Expands to:
+ // sub Rd, Rr
+ // sbc Rd+1, Rr+1
+ def SUBWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "subw\t$rd, $rr",
+ [(set i16:$rd, (sub i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ def SUBIRdK : FRdK<0b0101,
+ (outs LD8:$rd),
+ (ins LD8:$src, imm_ldi8:$k),
+ "subi\t$rd, $k",
+ [(set i8:$rd, (sub i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ // SUBIW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // subi Rd, K
+ // sbci Rd+1, K+1
+ def SUBIWRdK : Pseudo<(outs DLDREGS:$rd),
+ (ins DLDREGS:$src, i16imm:$rr),
+ "subiw\t$rd, $rr",
+ [(set i16:$rd, (sub i16:$src, imm:$rr)),
+ (implicit SREG)]>;
+
+ def SBIWRdK : FWRdK<0b1,
+ (outs IWREGS:$rd),
+ (ins IWREGS:$src, imm_arith6:$k),
+ "sbiw\t$rd, $k",
+ [(set i16:$rd, (sub i16:$src, uimm6:$k)),
+ (implicit SREG)]>,
+ Requires<[HasADDSUBIW]>;
+
+ // Subtract with carry operations which must read the carry flag in SREG.
+ let Uses = [SREG] in
+ {
+ def SBCRdRr : FRdRr<0b0000,
+ 0b10,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "sbc\t$rd, $rr",
+ [(set i8:$rd, (sube i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // SBCW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // sbc Rd, Rr
+ // sbc Rd+1, Rr+1
+ def SBCWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "sbcw\t$rd, $rr",
+ [(set i16:$rd, (sube i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ def SBCIRdK : FRdK<0b0100,
+ (outs LD8:$rd),
+ (ins LD8:$src, imm_ldi8:$k),
+ "sbci\t$rd, $k",
+ [(set i8:$rd, (sube i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ // SBCIW Rd+1:Rd, K+1:K
+ // sbci Rd, K
+ // sbci Rd+1, K+1
+ def SBCIWRdK : Pseudo<(outs DLDREGS:$rd),
+ (ins DLDREGS:$src, i16imm:$rr),
+ "sbciw\t$rd, $rr",
+ [(set i16:$rd, (sube i16:$src, imm:$rr)),
+ (implicit SREG)]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Increment and Decrement
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ def INCRd : FRd<0b1001,
+ 0b0100011,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "inc\t$rd",
+ [(set i8:$rd, (add i8:$src, 1)), (implicit SREG)]>;
+
+ def DECRd : FRd<0b1001,
+ 0b0101010,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "dec\t$rd",
+ [(set i8:$rd, (add i8:$src, -1)), (implicit SREG)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1,
+Defs = [R1, R0, SREG] in
+{
+ // MUL Rd, Rr
+ // Multiplies Rd by Rr and places the result into R1:R0.
+ let usesCustomInserter = 1 in {
+ def MULRdRr : FRdRr<0b1001, 0b11,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "mul\t$lhs, $rhs",
+ [/*(set R1, R0, (smullohi i8:$lhs, i8:$rhs))*/]>,
+ Requires<[SupportsMultiplication]>;
+
+ def MULSRdRr : FMUL2RdRr<0,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "muls\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+ }
+
+ def MULSURdRr : FMUL2RdRr<1,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "mulsu\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+
+ def FMUL : FFMULRdRr<0b01,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "fmul\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+
+ def FMULS : FFMULRdRr<0b10,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "fmuls\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+
+ def FMULSU : FFMULRdRr<0b11,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "fmulsu\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+}
+
+let Defs = [R15, R14, R13, R12, R11, R10, R9,
+ R8, R7, R6, R5, R4, R3, R2, R1, R0] in
+def DESK : FDES<(outs),
+ (ins i8imm:$k),
+ "des\t$k",
+ []>,
+ Requires<[HasDES]>;
+
+//===----------------------------------------------------------------------===//
+// Logic
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ // Register-Register logic instructions (which have the
+ // property of commutativity).
+ let isCommutable = 1 in
+ {
+ def ANDRdRr : FRdRr<0b0010,
+ 0b00,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "and\t$rd, $rr",
+ [(set i8:$rd, (and i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // ANDW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // and Rd, Rr
+ // and Rd+1, Rr+1
+ def ANDWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "andw\t$rd, $rr",
+ [(set i16:$rd, (and i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ def ORRdRr : FRdRr<0b0010,
+ 0b10,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "or\t$rd, $rr",
+ [(set i8:$rd, (or i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // ORW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // or Rd, Rr
+ // or Rd+1, Rr+1
+ def ORWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "orw\t$rd, $rr",
+ [(set i16:$rd, (or i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ def EORRdRr : FRdRr<0b0010,
+ 0b01,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "eor\t$rd, $rr",
+ [(set i8:$rd, (xor i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // EORW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // eor Rd, Rr
+ // eor Rd+1, Rr+1
+ def EORWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "eorw\t$rd, $rr",
+ [(set i16:$rd, (xor i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+ }
+
+ def ANDIRdK : FRdK<0b0111,
+ (outs LD8:$rd),
+ (ins LD8:$src, imm_ldi8:$k),
+ "andi\t$rd, $k",
+ [(set i8:$rd, (and i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ // ANDIW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // andi Rd, K
+ // andi Rd+1, K+1
+ def ANDIWRdK : Pseudo<(outs DLDREGS:$rd),
+ (ins DLDREGS:$src, i16imm:$k),
+ "andiw\t$rd, $k",
+ [(set i16:$rd, (and i16:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ def ORIRdK : FRdK<0b0110,
+ (outs LD8:$rd),
+ (ins LD8:$src, imm_ldi8:$k),
+ "ori\t$rd, $k",
+ [(set i8:$rd, (or i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ // ORIW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // ori Rd, K
+ // ori Rd+1, K+1
+ def ORIWRdK : Pseudo<(outs DLDREGS:$rd),
+ (ins DLDREGS:$src, i16imm:$rr),
+ "oriw\t$rd, $rr",
+ [(set i16:$rd, (or i16:$src, imm:$rr)),
+ (implicit SREG)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// One's/Two's Complement
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ def COMRd : FRd<0b1001,
+ 0b0100000,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "com\t$rd",
+ [(set i8:$rd, (not i8:$src)), (implicit SREG)]>;
+
+ // COMW Rd+1:Rd
+ //
+ // Expands to:
+ // com Rd
+ // com Rd+1
+ def COMWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "comw\t$rd",
+ [(set i16:$rd, (not i16:$src)), (implicit SREG)]>;
+
+ //:TODO: optimize NEG for wider types
+ def NEGRd : FRd<0b1001,
+ 0b0100001,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "neg\t$rd",
+ [(set i8:$rd, (ineg i8:$src)), (implicit SREG)]>;
+}
+
+// TST Rd
+// Test for zero or minus.
+// This operation is identical to a `Rd AND Rd`.
+//def : InstAlias<"tst\t$rd", (ANDRdRr GPR8:$rd, GPR8:$rd), 1>;
+
+let Defs = [SREG] in
+def TSTRd : FTST<0b0010,
+ 0b00,
+ (outs),
+ (ins GPR8:$rd),
+ "tst\t$rd",
+ [(AVRtst i8:$rd)]>;
+
+//===----------------------------------------------------------------------===//
+// Jump instructions
+//===----------------------------------------------------------------------===//
+let isBarrier = 1,
+isBranch = 1,
+isTerminator = 1 in
+{
+ def RJMPk : FBRk<0,
+ (outs),
+ (ins brtarget_13:$target),
+ "rjmp\t$target",
+ [(br bb:$target)]>;
+
+ let isIndirectBranch = 1,
+ Uses = [R31R30] in
+ def IJMP : F16<0b1001010000001001,
+ (outs),
+ (ins),
+ "ijmp",
+ []>,
+ Requires<[HasIJMPCALL]>;
+
+ let isIndirectBranch = 1,
+ Uses = [R31R30] in
+ def EIJMP : F16<0b1001010000011001,
+ (outs),
+ (ins),
+ "eijmp",
+ []>,
+ Requires<[HasEIJMPCALL]>;
+
+ def JMPk : F32BRk<0b110,
+ (outs),
+ (ins call_target:$k),
+ "jmp\t$k",
+ []>,
+ Requires<[HasJMPCALL]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Call instructions
+//===----------------------------------------------------------------------===//
+let isCall = 1 in
+{
+ // SP is marked as a use to prevent stack-pointer assignments that appear
+ // immediately before calls from potentially appearing dead.
+ let Uses = [SP] in
+ def RCALLk : FBRk<1,
+ (outs),
+ (ins brtarget_13:$target),
+ "rcall\t$target",
+ []>;
+
+ // SP is marked as a use to prevent stack-pointer assignments that appear
+ // immediately before calls from potentially appearing dead.
+ let Uses = [SP, R31R30] in
+ def ICALL : F16<0b1001010100001001,
+ (outs),
+ (ins variable_ops),
+ "icall",
+ []>,
+ Requires<[HasIJMPCALL]>;
+
+ // SP is marked as a use to prevent stack-pointer assignments that appear
+ // immediately before calls from potentially appearing dead.
+ let Uses = [SP, R31R30] in
+ def EICALL : F16<0b1001010100011001,
+ (outs),
+ (ins variable_ops),
+ "eicall",
+ []>,
+ Requires<[HasEIJMPCALL]>;
+
+ // SP is marked as a use to prevent stack-pointer assignments that appear
+ // immediately before calls from potentially appearing dead.
+ //
+ //:TODO: the imm field can be either 16 or 22 bits in devices with more
+ // than 64k of ROM, fix it once we support the largest devices.
+ let Uses = [SP] in
+ def CALLk : F32BRk<0b111,
+ (outs),
+ (ins call_target:$k),
+ "call\t$k",
+ [(AVRcall imm:$k)]>,
+ Requires<[HasJMPCALL]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Return instructions.
+//===----------------------------------------------------------------------===//
+let isTerminator = 1,
+isReturn = 1,
+isBarrier = 1 in
+{
+ def RET : F16<0b1001010100001000,
+ (outs),
+ (ins),
+ "ret",
+ [(AVRretflag)]>;
+
+ def RETI : F16<0b1001010100011000,
+ (outs),
+ (ins),
+ "reti",
+ [(AVRretiflag)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Compare operations.
+//===----------------------------------------------------------------------===//
+let Defs = [SREG] in
+{
+ // CPSE Rd, Rr
+ // Compare Rd and Rr, skipping the next instruction if they are equal.
+ let isBarrier = 1,
+ isBranch = 1,
+ isTerminator = 1 in
+ def CPSE : FRdRr<0b0001,
+ 0b00,
+ (outs),
+ (ins GPR8:$rd, GPR8:$rr),
+ "cpse\t$rd, $rr",
+ []>;
+
+ def CPRdRr : FRdRr<0b0001,
+ 0b01,
+ (outs),
+ (ins GPR8:$rd, GPR8:$rr),
+ "cp\t$rd, $rr",
+ [(AVRcmp i8:$rd, i8:$rr), (implicit SREG)]>;
+
+ // CPW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // cp Rd, Rr
+ // cpc Rd+1, Rr+1
+ def CPWRdRr : Pseudo<(outs),
+ (ins DREGS:$src, DREGS:$src2),
+ "cpw\t$src, $src2",
+ [(AVRcmp i16:$src, i16:$src2), (implicit SREG)]>;
+
+ let Uses = [SREG] in
+ def CPCRdRr : FRdRr<0b0000,
+ 0b01,
+ (outs),
+ (ins GPR8:$rd, GPR8:$rr),
+ "cpc\t$rd, $rr",
+ [(AVRcmpc i8:$rd, i8:$rr), (implicit SREG)]>;
+
+ // CPCW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // cpc Rd, Rr
+ // cpc Rd+1, Rr+1
+ let Uses = [SREG] in
+ def CPCWRdRr : Pseudo<(outs),
+ (ins DREGS:$src, DREGS:$src2),
+ "cpcw\t$src, $src2",
+ [(AVRcmpc i16:$src, i16:$src2), (implicit SREG)]>;
+
+ // CPI Rd, K
+ // Compares a register with an 8 bit immediate.
+ // CPI only encodes r16-r31, so Rd is LD8 as in LDI/SBR/CBR; SREG is not read.
+ def CPIRdK : FRdK<0b0011,
+ (outs),
+ (ins LD8:$rd, imm_ldi8:$k),
+ "cpi\t$rd, $k",
+ [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register conditional skipping/branching operations.
+//===----------------------------------------------------------------------===//
+let isBranch = 1,
+isTerminator = 1 in
+{
+ // Conditional skipping on GPR register bits, and
+ // conditional skipping on IO register bits.
+ let isBarrier = 1 in
+ {
+ def SBRCRrB : FRdB<0b10,
+ (outs),
+ (ins GPR8:$rr, i8imm:$b),
+ "sbrc\t$rr, $b",
+ []>;
+
+ def SBRSRrB : FRdB<0b11,
+ (outs),
+ (ins GPR8:$rr, i8imm:$b),
+ "sbrs\t$rr, $b",
+ []>;
+
+ def SBICAb : FIOBIT<0b01,
+ (outs),
+ (ins imm_port5:$a, i8imm:$b),
+ "sbic\t$a, $b",
+ []>;
+
+ def SBISAb : FIOBIT<0b11,
+ (outs),
+ (ins imm_port5:$a, i8imm:$b),
+ "sbis\t$a, $b",
+ []>;
+ }
+
+ // Relative branches on status flag bits.
+ let Uses = [SREG] in
+ {
+ // BRBS s, k
+ // Branch if `s` flag in status register is set.
+ def BRBSsk : FSK<0,
+ (outs),
+ (ins i8imm:$s, relbrtarget_7:$k),
+ "brbs\t$s, $k",
+ []>;
+
+ // BRBC s, k
+ // Branch if `s` flag in status register is clear.
+ def BRBCsk : FSK<1,
+ (outs),
+ (ins i8imm:$s, relbrtarget_7:$k),
+ "brbc\t$s, $k",
+ []>;
+ }
+}
+
+
+// BRCS k
+// Branch if carry flag is set
+def : InstAlias<"brcs\t$k", (BRBSsk 0, relbrtarget_7:$k)>;
+
+// BRCC k
+// Branch if carry flag is clear
+def : InstAlias<"brcc\t$k", (BRBCsk 0, relbrtarget_7:$k)>;
+
+// BRHS k
+// Branch if half carry flag is set
+def : InstAlias<"brhs\t$k", (BRBSsk 5, relbrtarget_7:$k)>;
+
+// BRHC k
+// Branch if half carry flag is clear
+def : InstAlias<"brhc\t$k", (BRBCsk 5, relbrtarget_7:$k)>;
+
+// BRTS k
+// Branch if the T flag is set
+def : InstAlias<"brts\t$k", (BRBSsk 6, relbrtarget_7:$k)>;
+
+// BRTC k
+// Branch if the T flag is clear
+def : InstAlias<"brtc\t$k", (BRBCsk 6, relbrtarget_7:$k)>;
+
+// BRVS k
+// Branch if the overflow flag is set
+def : InstAlias<"brvs\t$k", (BRBSsk 3, relbrtarget_7:$k)>;
+
+// BRVC k
+// Branch if the overflow flag is clear
+def : InstAlias<"brvc\t$k", (BRBCsk 3, relbrtarget_7:$k)>;
+
+// BRIE k
+// Branch if the global interrupt flag is enabled
+def : InstAlias<"brie\t$k", (BRBSsk 7, relbrtarget_7:$k)>;
+
+// BRID k
+// Branch if the global interrupt flag is disabled
+def : InstAlias<"brid\t$k", (BRBCsk 7, relbrtarget_7:$k)>;
+
+//===----------------------------------------------------------------------===//
+// PC-relative conditional branches
+//===----------------------------------------------------------------------===//
+// Based on status register. We cannot simplify these into instruction aliases
+// because we also need to be able to specify a pattern to match for ISel.
+let isBranch = 1,
+isTerminator = 1,
+Uses = [SREG] in
+{
+ def BREQk : FBRsk<0,
+ 0b001,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "breq\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_EQ)]>;
+
+ def BRNEk : FBRsk<1,
+ 0b001,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brne\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_NE)]>;
+
+
+ def BRSHk : FBRsk<1,
+ 0b000,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brsh\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_SH)]>;
+
+ def BRLOk : FBRsk<0,
+ 0b000,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brlo\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_LO)]>;
+
+ def BRMIk : FBRsk<0,
+ 0b010,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brmi\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_MI)]>;
+
+ def BRPLk : FBRsk<1,
+ 0b010,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brpl\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_PL)]>;
+
+ def BRGEk : FBRsk<1,
+ 0b100,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brge\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_GE)]>;
+
+ def BRLTk : FBRsk<0,
+ 0b100,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brlt\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_LT)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Data transfer instructions
+//===----------------------------------------------------------------------===//
+// 8 and 16-bit register move instructions.
+let hasSideEffects = 0 in
+{
+ def MOVRdRr : FRdRr<0b0010,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$rr),
+ "mov\t$rd, $rr",
+ []>;
+
+ def MOVWRdRr : FMOVWRdRr<(outs DREGS:$dst),
+ (ins DREGS:$src),
+ "movw\t$dst, $src",
+ []>,
+ Requires<[HasMOVW]>;
+}
+
+// Load immediate values into registers.
+let isReMaterializable = 1 in
+{
+ def LDIRdK : FRdK<0b1110,
+ (outs LD8:$rd),
+ (ins imm_ldi8:$k),
+ "ldi\t$rd, $k",
+ [(set i8:$rd, imm:$k)]>;
+
+ // LDIW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // ldi Rd, K
+ // ldi Rd+1, K+1
+ def LDIWRdK : Pseudo<(outs DLDREGS:$dst),
+ (ins i16imm:$src),
+ "ldiw\t$dst, $src",
+ [(set i16:$dst, imm:$src)]>;
+}
+
+// Load from data space into register.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+ def LDSRdK : F32DM<0b0,
+ (outs GPR8:$rd),
+ (ins imm16:$k),
+ "lds\t$rd, $k",
+ [(set i8:$rd, (load imm:$k))]>,
+ Requires<[HasSRAM]>;
+
+ // LDSW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // lds Rd, (K+1:K)
+ // lds Rd+1, (K+1:K) + 1
+ def LDSWRdK : Pseudo<(outs DREGS:$dst),
+ (ins i16imm:$src),
+ "ldsw\t$dst, $src",
+ [(set i16:$dst, (load imm:$src))]>,
+ Requires<[HasSRAM]>;
+}
+
+// Indirect loads.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+ def LDRdPtr : FSTLD<0,
+ 0b00,
+ (outs GPR8:$reg),
+ (ins LDSTPtrReg:$ptrreg),
+ "ld\t$reg, $ptrreg",
+ [(set GPR8:$reg, (load i16:$ptrreg))]>,
+ Requires<[HasSRAM]>;
+
+ // LDW Rd+1:Rd, P
+ //
+ // Expands to:
+ // ld Rd, P+
+ // ld Rd+1, P+
+ let Constraints = "@earlyclobber $reg" in
+ def LDWRdPtr : Pseudo<(outs DREGS:$reg),
+ (ins PTRDISPREGS:$ptrreg),
+ "ldw\t$reg, $ptrreg",
+ [(set i16:$reg, (load i16:$ptrreg))]>,
+ Requires<[HasSRAM]>;
+}
+
+// Indirect loads (with postincrement or predecrement).
+let mayLoad = 1,
+hasSideEffects = 0,
+Constraints = "$ptrreg = $base_wb,@earlyclobber $reg,@earlyclobber $base_wb" in
+{
+ def LDRdPtrPi : FSTLD<0,
+ 0b01,
+ (outs GPR8:$reg, PTRREGS:$base_wb),
+ (ins LDSTPtrReg:$ptrreg),
+ "ld\t$reg, $ptrreg+",
+ []>,
+ Requires<[HasSRAM]>;
+
+ // LDW Rd+1:Rd, P+
+ // Expands to:
+ // ld Rd, P+
+ // ld Rd+1, P+
+ def LDWRdPtrPi : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg),
+ "ldw\t$reg, $ptrreg+",
+ []>,
+ Requires<[HasSRAM]>;
+
+ def LDRdPtrPd : FSTLD<0,
+ 0b10,
+ (outs GPR8:$reg, PTRREGS:$base_wb),
+ (ins LDSTPtrReg:$ptrreg),
+ "ld\t$reg, -$ptrreg",
+ []>,
+ Requires<[HasSRAM]>;
+
+ // LDW Rd+1:Rd, -P
+ //
+ // Expands to:
+ // ld Rd+1, -P
+ // ld Rd, -P
+ def LDWRdPtrPd : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg),
+ "ldw\t$reg, -$ptrreg",
+ []>,
+ Requires<[HasSRAM]>;
+}
+
+// Load indirect with displacement operations.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+ let Constraints = "@earlyclobber $reg" in
+ def LDDRdPtrQ : FSTDLDD<0,
+ (outs GPR8:$reg),
+ (ins memri:$memri),
+ "ldd\t$reg, $memri",
+ [(set i8:$reg, (load addr:$memri))]>,
+ Requires<[HasSRAM]>;
+
+ // LDDW Rd+1:Rd, P+q
+ //
+ // Expands to:
+ // ldd Rd, P+q
+ // ldd Rd+1, P+q+1
+ let Constraints = "@earlyclobber $dst" in
+ def LDDWRdPtrQ : Pseudo<(outs DREGS:$dst),
+ (ins memri:$memri),
+ "lddw\t$dst, $memri",
+ [(set i16:$dst, (load addr:$memri))]>,
+ Requires<[HasSRAM]>;
+
+ let mayLoad = 1,
+ hasSideEffects = 0,
+ Constraints = "@earlyclobber $dst" in
+ def LDDWRdYQ : Pseudo<(outs DREGS:$dst),
+ (ins memri:$memri),
+ "lddw\t$dst, $memri",
+ []>,
+ Requires<[HasSRAM]>;
+}
+
+class AtomicLoad<PatFrag Op, RegisterClass DRC> : // Pseudo template matching `Op` applied to an i16 pointer.
+ Pseudo<(outs DRC:$rd), (ins PTRREGS:$rr), "atomic_op", // $rd: result in class DRC; $rr: pointer register.
+ [(set DRC:$rd, (Op i16:$rr))]>;
+
+class AtomicStore<PatFrag Op, RegisterClass DRC> : // Pseudo template matching `Op` with an i16 pointer and a DRC value.
+ Pseudo<(outs), (ins PTRDISPREGS:$rd, DRC:$rr), "atomic_op", // No outputs; $rd: pointer register, $rr: value in DRC.
+ [(Op i16:$rd, DRC:$rr)]>;
+
+class AtomicLoadOp<PatFrag Op, RegisterClass DRC> : // Pseudo template matching `Op` with a pointer and one extra DRC operand.
+ Pseudo<(outs DRC:$rd), (ins PTRREGS:$rr, DRC:$operand), // $rd: result; $rr: pointer register; $operand: second input.
+ "atomic_op",
+ [(set DRC:$rd, (Op i16:$rr, DRC:$operand))]>;
+
+def AtomicLoad8 : AtomicLoad<atomic_load_8, GPR8>; // 8-bit variants below use GPR8,
+def AtomicLoad16 : AtomicLoad<atomic_load_16, DREGS>; // 16-bit variants use DREGS.
+
+def AtomicStore8 : AtomicStore<atomic_store_8, GPR8>;
+def AtomicStore16 : AtomicStore<atomic_store_16, DREGS>;
+
+def AtomicLoadAdd8 : AtomicLoadOp<atomic_load_add_8, GPR8>; // AtomicLoadOp instances: one per operation and width.
+def AtomicLoadAdd16 : AtomicLoadOp<atomic_load_add_16, DREGS>;
+def AtomicLoadSub8 : AtomicLoadOp<atomic_load_sub_8, GPR8>;
+def AtomicLoadSub16 : AtomicLoadOp<atomic_load_sub_16, DREGS>;
+def AtomicLoadAnd8 : AtomicLoadOp<atomic_load_and_8, GPR8>;
+def AtomicLoadAnd16 : AtomicLoadOp<atomic_load_and_16, DREGS>;
+def AtomicLoadOr8 : AtomicLoadOp<atomic_load_or_8, GPR8>;
+def AtomicLoadOr16 : AtomicLoadOp<atomic_load_or_16, DREGS>;
+def AtomicLoadXor8 : AtomicLoadOp<atomic_load_xor_8, GPR8>;
+def AtomicLoadXor16 : AtomicLoadOp<atomic_load_xor_16, DREGS>;
+def AtomicFence : Pseudo<(outs), (ins), "atomic_fence", // No register operands; matches atomic_fence with two immediates.
+ [(atomic_fence imm, imm)]>;
+
+// Indirect store from register to data space.
+def STSKRr : F32DM<0b1,
+ (outs),
+ (ins imm16:$k, GPR8:$rd),
+ "sts\t$k, $rd",
+ [(store i8:$rd, imm:$k)]>,
+ Requires<[HasSRAM]>;
+
+// STSW K+1:K, Rr+1:Rr
+//
+// Expands to:
+// sts Rr+1, (K+1:K) + 1
+// sts Rr, (K+1:K)
+def STSWKRr : Pseudo<(outs),
+ (ins i16imm:$dst, DREGS:$src),
+ "stsw\t$dst, $src",
+ [(store i16:$src, imm:$dst)]>,
+ Requires<[HasSRAM]>;
+
+// Indirect stores.
+// ST P, Rr
+// Stores the value of Rr into the location addressed by pointer P.
+def STPtrRr : FSTLD<1,
+ 0b00,
+ (outs),
+ (ins LDSTPtrReg:$ptrreg, GPR8:$reg),
+ "st\t$ptrreg, $reg",
+ [(store GPR8:$reg, i16:$ptrreg)]>,
+ Requires<[HasSRAM]>;
+
+// STW P, Rr+1:Rr
+// Stores the value of Rr into the location addressed by pointer P.
+//
+// Expands to:
+// st P, Rr
+// std P+1, Rr+1
+def STWPtrRr : Pseudo<(outs),
+ (ins PTRDISPREGS:$ptrreg, DREGS:$reg),
+ "stw\t$ptrreg, $reg",
+ [(store i16:$reg, i16:$ptrreg)]>,
+ Requires<[HasSRAM]>;
+
+// Indirect stores (with postincrement or predecrement).
+let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in
+{
+
+ // ST P+, Rr
+ // Stores the value of Rr into the location addressed by pointer P.
+ // Post increments P.
+ def STPtrPiRr : FSTLD<1,
+ 0b01,
+ (outs LDSTPtrReg:$base_wb),
+ (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
+ "st\t$ptrreg+, $reg",
+ [(set i16:$base_wb,
+ (post_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>,
+ Requires<[HasSRAM]>;
+
+ // STW P+, Rr+1:Rr
+ // Stores the value of Rr into the location addressed by pointer P.
+ // Post increments P.
+ //
+ // Expands to:
+ // st P+, Rr
+ // st P+, Rr+1
+ def STWPtrPiRr : Pseudo<(outs PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg, DREGS:$trh, i8imm:$offs),
+ "stw\t$ptrreg+, $trh",
+ [(set PTRREGS:$base_wb,
+ (post_store DREGS:$trh, PTRREGS:$ptrreg, imm:$offs))]>,
+ Requires<[HasSRAM]>;
+
+ // ST -P, Rr
+ // Stores the value of Rr into the location addressed by pointer P.
+ // Pre decrements P.
+ def STPtrPdRr : FSTLD<1,
+ 0b10,
+ (outs LDSTPtrReg:$base_wb),
+ (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
+ "st\t-$ptrreg, $reg",
+ [(set i16:$base_wb,
+ (pre_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>,
+ Requires<[HasSRAM]>;
+
+ // STW -P, Rr+1:Rr
+ // Stores the value of Rr into the location addressed by pointer P.
+ // Pre decrements P.
+ //
+ // Expands to:
+ // st -P, Rr+1
+ // st -P, Rr
+ def STWPtrPdRr : Pseudo<(outs PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg, DREGS:$reg, i8imm:$offs),
+ "stw\t-$ptrreg, $reg",
+ [(set PTRREGS:$base_wb,
+ (pre_store i16:$reg, i16:$ptrreg, imm:$offs))]>,
+ Requires<[HasSRAM]>;
+}
+
+// Store indirect with displacement operations.
+// STD P+q, Rr
+// Stores the value of Rr into the location addressed by pointer P with a
+// displacement of q. Does not modify P.
+def STDPtrQRr : FSTDLDD<1,
+ (outs),
+ (ins memri:$memri, GPR8:$reg),
+ "std\t$memri, $reg",
+ [(store i8:$reg, addr:$memri)]>,
+ Requires<[HasSRAM]>;
+
+// STDW P+q, Rr+1:Rr
+// Stores the value of Rr into the location addressed by pointer P with a
+// displacement of q. Does not modify P.
+//
+// Expands to:
+// std P+q, Rr
+// std P+q+1, Rr+1
+def STDWPtrQRr : Pseudo<(outs),
+ (ins memri:$memri, DREGS:$src),
+ "stdw\t$memri, $src",
+ [(store i16:$src, addr:$memri)]>,
+ Requires<[HasSRAM]>;
+
+
+// Load program memory operations.
+let canFoldAsLoad = 1,
+isReMaterializable = 1,
+hasSideEffects = 0 in
+{
+ let Defs = [R0],
+ Uses = [R31R30] in
+ def LPM : F16<0b1001010111001000,
+ (outs),
+ (ins),
+ "lpm",
+ []>,
+ Requires<[HasLPM]>;
+
+ def LPMRdZ : FLPMX<0,
+ 0,
+ (outs GPR8:$dst),
+ (ins ZREGS:$z),
+ "lpm\t$dst, $z",
+ []>,
+ Requires<[HasLPMX]>;
+
+ def LPMWRdZ : Pseudo<(outs DREGS:$dst),
+ (ins ZREGS:$z),
+ "lpmw\t$dst, $z",
+ []>,
+ Requires<[HasLPMX]>;
+
+ // Load program memory, while postincrementing the Z register.
+ let mayLoad = 1,
+ Defs = [R31R30] in
+ {
+ def LPMRdZPi : FLPMX<0,
+ 1,
+ (outs GPR8:$dst),
+ (ins ZREGS:$z),
+ "lpm\t$dst, $z+",
+ []>,
+ Requires<[HasLPMX]>;
+
+ def LPMWRdZPi : Pseudo<(outs DREGS:$dst),
+ (ins ZREGS:$z),
+ "lpmw\t$dst, $z+",
+ []>,
+ Requires<[HasLPMX]>;
+ }
+}
+
+// Extended load program memory operations.
+let mayLoad = 1,
+hasSideEffects = 0 in
+{
+ let Defs = [R0],
+ Uses = [R31R30] in
+ def ELPM : F16<0b1001010111011000,
+ (outs),
+ (ins),
+ "elpm",
+ []>,
+ Requires<[HasELPM]>;
+
+ def ELPMRdZ : FLPMX<1,
+ 0,
+ (outs GPR8:$dst),
+ (ins ZREGS:$z),
+ "elpm\t$dst, $z",
+ []>,
+ Requires<[HasELPMX]>;
+
+ let Defs = [R31R30] in
+ def ELPMRdZPi : FLPMX<1,
+ 1,
+ (outs GPR8:$dst),
+ (ins ZREGS: $z),
+ "elpm\t$dst, $z+",
+ []>,
+ Requires<[HasELPMX]>;
+}
+
+// Store program memory operations.
+let Uses = [R1, R0] in
+{
+ let Uses = [R31R30, R1, R0] in
+ def SPM : F16<0b1001010111101000,
+ (outs),
+ (ins),
+ "spm",
+ []>,
+ Requires<[HasSPM]>;
+
+ let Defs = [R31R30] in
+ def SPMZPi : F16<0b1001010111111000,
+ (outs),
+ (ins ZREGS:$z),
+ "spm $z+",
+ []>,
+ Requires<[HasSPMX]>;
+}
+
+// Read data from IO location operations.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+ def INRdA : FIORdA<(outs GPR8:$dst),
+ (ins imm_port6:$src),
+ "in\t$dst, $src",
+ [(set i8:$dst, (load ioaddr8:$src))]>;
+
+ def INWRdA : Pseudo<(outs DREGS:$dst),
+ (ins imm_port6:$src),
+ "inw\t$dst, $src",
+ [(set i16:$dst, (load ioaddr16:$src))]>;
+}
+
+// Write data to IO location operations.
+def OUTARr : FIOARr<(outs),
+ (ins imm_port6:$dst, GPR8:$src),
+ "out\t$dst, $src",
+ [(store i8:$src, ioaddr8:$dst)]>;
+
+def OUTWARr : Pseudo<(outs),
+ (ins imm_port6:$dst, DREGS:$src),
+ "outw\t$dst, $src",
+ [(store i16:$src, ioaddr16:$dst)]>;
+
+// Stack push/pop operations.
+let Defs = [SP],
+Uses = [SP],
+hasSideEffects = 0 in
+{
+ // Stack push operations.
+ let mayStore = 1 in
+ {
+ def PUSHRr : FRd<0b1001,
+ 0b0011111,
+ (outs),
+ (ins GPR8:$reg),
+ "push\t$reg",
+ []>,
+ Requires<[HasSRAM]>;
+
+ def PUSHWRr : Pseudo<(outs),
+ (ins DREGS:$reg),
+ "pushw\t$reg",
+ []>,
+ Requires<[HasSRAM]>;
+ }
+
+ // Stack pop operations.
+ let mayLoad = 1 in
+ {
+ def POPRd : FRd<0b1001,
+ 0b0001111,
+ (outs GPR8:$reg),
+ (ins),
+ "pop\t$reg",
+ []>,
+ Requires<[HasSRAM]>;
+
+ def POPWRd : Pseudo<(outs DREGS:$reg),
+ (ins),
+ "popw\t$reg",
+ []>,
+ Requires<[HasSRAM]>;
+ }
+}
+
+// Read-Modify-Write (RMW) instructions.
+def XCHZRd : FZRd<0b100,
+ (outs GPR8:$rd),
+ (ins ZREGS:$z),
+ "xch\t$z, $rd",
+ []>,
+ Requires<[SupportsRMW]>;
+
+def LASZRd : FZRd<0b101,
+ (outs GPR8:$rd),
+ (ins ZREGS:$z),
+ "las\t$z, $rd",
+ []>,
+ Requires<[SupportsRMW]>;
+
+def LACZRd : FZRd<0b110,
+ (outs GPR8:$rd),
+ (ins ZREGS:$z),
+ "lac\t$z, $rd",
+ []>,
+ Requires<[SupportsRMW]>;
+
+def LATZRd : FZRd<0b111,
+ (outs GPR8:$rd),
+ (ins ZREGS:$z),
+ "lat\t$z, $rd",
+ []>,
+ Requires<[SupportsRMW]>;
+
+//===----------------------------------------------------------------------===//
+// Bit and bit-test instructions
+//===----------------------------------------------------------------------===//
+
+// Bit shift/rotate operations.
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ def LSLRd : FRdRr<0b0000,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "lsl\t$rd",
+ [(set i8:$rd, (AVRlsl i8:$src)), (implicit SREG)]>;
+
+ def LSLWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "lslw\t$rd",
+ [(set i16:$rd, (AVRlsl i16:$src)), (implicit SREG)]>;
+
+ def LSRRd : FRd<0b1001,
+ 0b0100110,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "lsr\t$rd",
+ [(set i8:$rd, (AVRlsr i8:$src)), (implicit SREG)]>;
+
+ def LSRWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "lsrw\t$rd",
+ [(set i16:$rd, (AVRlsr i16:$src)), (implicit SREG)]>;
+
+ def ASRRd : FRd<0b1001,
+ 0b0100101,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "asr\t$rd",
+ [(set i8:$rd, (AVRasr i8:$src)), (implicit SREG)]>;
+
+ def ASRWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "asrw\t$rd",
+ [(set i16:$rd, (AVRasr i16:$src)), (implicit SREG)]>;
+
+ // Bit rotate operations.
+ let Uses = [SREG] in
+ {
+ def ROLRd : FRdRr<0b0001,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "rol\t$rd",
+ [(set i8:$rd, (AVRrol i8:$src)), (implicit SREG)]>;
+
+ def ROLWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "rolw\t$rd",
+ [(set i16:$rd, (AVRrol i16:$src)), (implicit SREG)]>;
+
+ def RORRd : FRd<0b1001,
+ 0b0100111,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "ror\t$rd",
+ [(set i8:$rd, (AVRror i8:$src)), (implicit SREG)]>;
+
+ def RORWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "rorw\t$rd",
+ [(set i16:$rd, (AVRror i16:$src)), (implicit SREG)]>;
+ }
+}
+
+// SWAP Rd
+// Swaps the high and low nibbles in a register.
+let Constraints = "$src = $rd" in
+def SWAPRd : FRd<0b1001,
+ 0b0100010,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "swap\t$rd",
+ [(set i8:$rd, (bswap i8:$src))]>;
+
+// IO register bit set/clear operations.
+//:TODO: add patterns when popcount(imm)==2 to be expanded with 2 sbi/cbi
+// instead of in+ori+out which requires one more instr.
+def SBIAb : FIOBIT<0b10,
+ (outs),
+ (ins imm_port5:$addr, i8imm:$bit),
+ "sbi\t$addr, $bit",
+ [(store (or (i8 (load lowioaddr8:$addr)), iobitpos8:$bit),
+ lowioaddr8:$addr)]>;
+
+def CBIAb : FIOBIT<0b00,
+ (outs),
+ (ins imm_port5:$addr, i8imm:$bit),
+ "cbi\t$addr, $bit",
+ [(store (and (i8 (load lowioaddr8:$addr)), iobitposn8:$bit),
+ lowioaddr8:$addr)]>;
+
+// Status register bit load/store operations.
+let Defs = [SREG] in
+def BST : FRdB<0b01,
+ (outs),
+ (ins GPR8:$rd, i8imm:$b),
+ "bst\t$rd, $b",
+ []>;
+
+let Uses = [SREG] in
+def BLD : FRdB<0b00,
+ (outs),
+ (ins GPR8:$rd, i8imm:$b),
+ "bld\t$rd, $b",
+ []>;
+
+// Set/clear bit in register operations.
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ // SBR Rd, K
+ // Alias for ORI Rd, K
+ def SBRRdK : FRdK<0b0110,
+ (outs LD8:$rd),
+ (ins LD8:$src, imm_ldi8:$k),
+ "sbr\t$rd, $k",
+ [(set i8:$rd, (or i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ // CBR Rd, K
+ // Alias for `ANDI Rd, COM(K)` where COM(K) is the complement of K.
+ // FIXME: This uses the 'complement' encoder. We need it to also use the
+ // imm_ldi8 encoder. This will cause no fixups to be created on this instruction.
+ def CBRRdK : FRdK<0b0111,
+ (outs LD8:$rd),
+ (ins LD8:$src, imm_com8:$k),
+ "cbr\t$rd, $k",
+ []>;
+}
+
+// CLR Rd
+// Alias for EOR Rd, Rd
+// -------------
+// Clears all bits in a register.
+def CLR : InstAlias<"clr\t$rd", (EORRdRr GPR8:$rd, GPR8:$rd)>;
+
+// SER Rd
+// Alias for LDI Rd, 0xff
+// ---------
+// Sets all bits in a register.
+def : InstAlias<"ser\t$rd", (LDIRdK LD8:$rd, 0xff), 0>;
+
+let Defs = [SREG] in
+def BSETs : FS<0, // Asm "bset s"; the sec/sez/sen/sev/ses/seh/set/sei aliases below map to BSETs 0..7.
+ (outs),
+ (ins i8imm:$s), // $s: immediate index of the status flag bit.
+ "bset\t$s",
+ []>;
+
+let Defs = [SREG] in
+def BCLRs : FS<1, // Asm "bclr s"; the clc/clz/cln/clv/cls/clh/clt/cli aliases below map to BCLRs 0..7.
+ (outs),
+ (ins i8imm:$s), // $s: immediate index of the status flag bit.
+ "bclr\t$s",
+ []>;
+
+// Set/clear aliases for the carry (C) status flag (bit 0).
+def : InstAlias<"sec", (BSETs 0)>;
+def : InstAlias<"clc", (BCLRs 0)>;
+
+// Set/clear aliases for the zero (Z) status flag (bit 1).
+def : InstAlias<"sez", (BSETs 1)>;
+def : InstAlias<"clz", (BCLRs 1)>;
+
+// Set/clear aliases for the negative (N) status flag (bit 2).
+def : InstAlias<"sen", (BSETs 2)>;
+def : InstAlias<"cln", (BCLRs 2)>;
+
+// Set/clear aliases for the overflow (V) status flag (bit 3).
+def : InstAlias<"sev", (BSETs 3)>;
+def : InstAlias<"clv", (BCLRs 3)>;
+
+// Set/clear aliases for the signed (S) status flag (bit 4).
+def : InstAlias<"ses", (BSETs 4)>;
+def : InstAlias<"cls", (BCLRs 4)>;
+
+// Set/clear aliases for the half-carry (H) status flag (bit 5).
+def : InstAlias<"seh", (BSETs 5)>;
+def : InstAlias<"clh", (BCLRs 5)>;
+
+// Set/clear aliases for the T status flag (bit 6).
+def : InstAlias<"set", (BSETs 6)>;
+def : InstAlias<"clt", (BCLRs 6)>;
+
+// Set/clear aliases for the interrupt (I) status flag (bit 7).
+def : InstAlias<"sei", (BSETs 7)>;
+def : InstAlias<"cli", (BCLRs 7)>;
+
+//===----------------------------------------------------------------------===//
+// Special/Control instructions
+//===----------------------------------------------------------------------===//
+
+// BREAK
+// Breakpoint instruction
+// ---------
+// <|1001|0101|1001|1000>
+def BREAK : F16<0b1001010110011000,
+ (outs),
+ (ins),
+ "break",
+ []>,
+ Requires<[HasBREAK]>;
+
+// NOP
+// No-operation instruction
+// ---------
+// <|0000|0000|0000|0000>
+def NOP : F16<0b0000000000000000,
+ (outs),
+ (ins),
+ "nop",
+ []>;
+
+// SLEEP
+// Sleep instruction
+// ---------
+// <|1001|0101|1000|1000>
+def SLEEP : F16<0b1001010110001000,
+ (outs),
+ (ins),
+ "sleep",
+ []>;
+
+// WDR
+// Watchdog reset
+// ---------
+// <|1001|0101|1010|1000>
+def WDR : F16<0b1001010110101000,
+ (outs),
+ (ins),
+ "wdr",
+ []>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions for later expansion
+//===----------------------------------------------------------------------===//
+
+//:TODO: Optimize this for wider types AND optimize the following code
+// compile int foo(char a, char b, char c, char d) {return d+b;}
+// looks like a missed sext_inreg opportunity.
+def SEXT : ExtensionPseudo<
+ (outs DREGS:$dst),
+ (ins GPR8:$src),
+ "sext\t$dst, $src",
+ [(set i16:$dst, (sext i8:$src)), (implicit SREG)]
+>;
+
+def ZEXT : ExtensionPseudo<
+ (outs DREGS:$dst),
+ (ins GPR8:$src),
+ "zext\t$dst, $src",
+ [(set i16:$dst, (zext i8:$src)), (implicit SREG)]
+>;
+
+// This pseudo gets expanded into a movw+adiw thus it clobbers SREG.
+let Defs = [SREG],
+ hasSideEffects = 0 in
+def FRMIDX : Pseudo<(outs DLDREGS:$dst),
+ (ins DLDREGS:$src, i16imm:$src2),
+ "frmidx\t$dst, $src, $src2",
+ []>;
+
+// This pseudo is either converted to a regular store or a push which clobbers
+// SP.
+def STDSPQRr : StorePseudo<
+ (outs),
+ (ins memspi:$dst, GPR8:$src),
+ "stdstk\t$dst, $src",
+ [(store i8:$src, addr:$dst)]
+>;
+
+// This pseudo is either converted to a regular store or a push which clobbers
+// SP.
+def STDWSPQRr : StorePseudo<
+ (outs),
+ (ins memspi:$dst, DREGS:$src),
+ "stdwstk\t$dst, $src",
+ [(store i16:$src, addr:$dst)]
+>;
+
+// SP read/write pseudos.
+let hasSideEffects = 0 in
+{
+ let Uses = [SP] in
+ def SPREAD : Pseudo<
+ (outs DREGS:$dst),
+ (ins GPRSP:$src),
+ "spread\t$dst, $src",
+ []
+ >;
+
+ let Defs = [SP] in
+ def SPWRITE : Pseudo<
+ (outs GPRSP:$dst),
+ (ins DREGS:$src),
+ "spwrite\t$dst, $src",
+ []>;
+}
+
+def Select8 : SelectPseudo< // 8-bit pseudo selected for AVRselectcc nodes.
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$src2, i8imm:$cc), // $cc is an immediate; presumably the AVR condition code.
+ "# Select8 PSEUDO",
+ [(set i8:$dst, (AVRselectcc i8:$src, i8:$src2, imm:$cc))]
+>;
+
+def Select16 : SelectPseudo< // 16-bit pseudo selected for AVRselectcc nodes.
+ (outs DREGS:$dst),
+ (ins DREGS:$src, DREGS:$src2, i8imm:$cc), // $cc is an immediate; presumably the AVR condition code.
+ "# Select16 PSEUDO",
+ [(set i16:$dst, (AVRselectcc i16:$src, i16:$src2, imm:$cc))]
+>;
+
+def Lsl8 : ShiftPseudo< // 8-bit pseudo selected for AVRlslLoop.
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$cnt), // $cnt is a register operand, not an immediate.
+ "# Lsl8 PSEUDO",
+ [(set i8:$dst, (AVRlslLoop i8:$src, i8:$cnt))]
+>;
+
+def Lsl16 : ShiftPseudo< // 16-bit pseudo selected for AVRlslLoop.
+ (outs DREGS:$dst),
+ (ins DREGS:$src, GPR8:$cnt), // $cnt stays an 8-bit register even for the 16-bit value.
+ "# Lsl16 PSEUDO",
+ [(set i16:$dst, (AVRlslLoop i16:$src, i8:$cnt))]
+>;
+
+def Lsr8 : ShiftPseudo< // 8-bit pseudo selected for AVRlsrLoop.
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$cnt), // $cnt is a register operand, not an immediate.
+ "# Lsr8 PSEUDO",
+ [(set i8:$dst, (AVRlsrLoop i8:$src, i8:$cnt))]
+>;
+
+
+def Lsr16 : ShiftPseudo< // 16-bit pseudo selected for AVRlsrLoop.
+ (outs DREGS:$dst),
+ (ins DREGS:$src, GPR8:$cnt), // $cnt stays an 8-bit register even for the 16-bit value.
+ "# Lsr16 PSEUDO",
+ [(set i16:$dst, (AVRlsrLoop i16:$src, i8:$cnt))]
+>;
+
+def Asr8 : ShiftPseudo< // 8-bit pseudo selected for AVRasrLoop.
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$cnt), // $cnt is a register operand, not an immediate.
+ "# Asr8 PSEUDO",
+ [(set i8:$dst, (AVRasrLoop i8:$src, i8:$cnt))]
+>;
+
+def Asr16 : ShiftPseudo< // 16-bit pseudo selected for AVRasrLoop.
+ (outs DREGS:$dst),
+ (ins DREGS:$src, GPR8:$cnt), // $cnt stays an 8-bit register even for the 16-bit value.
+ "# Asr16 PSEUDO",
+ [(set i16:$dst, (AVRasrLoop i16:$src, i8:$cnt))]
+>;
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+//:TODO: look in x86InstrCompiler.td for odd encoding trick related to
+// add x, 128 -> sub x, -128. Clang is emitting an eor for this (ldi+eor)
+
+// the add instruction always writes the carry flag
+def : Pat<(addc i8:$src, i8:$src2),
+ (ADDRdRr i8:$src, i8:$src2)>;
+def : Pat<(addc DREGS:$src, DREGS:$src2),
+ (ADDWRdRr DREGS:$src, DREGS:$src2)>;
+
+// all sub instruction variants always write the carry flag
+def : Pat<(subc i8:$src, i8:$src2),
+ (SUBRdRr i8:$src, i8:$src2)>;
+def : Pat<(subc i16:$src, i16:$src2),
+ (SUBWRdRr i16:$src, i16:$src2)>;
+def : Pat<(subc i8:$src, imm:$src2),
+ (SUBIRdK i8:$src, imm:$src2)>;
+def : Pat<(subc i16:$src, imm:$src2),
+ (SUBIWRdK i16:$src, imm:$src2)>;
+
+// These patterns convert add (x, -imm) to sub (x, imm) since we don't have
+// any add with imm instructions. Also take care of the adiw/sbiw instructions.
+def : Pat<(add i16:$src1, imm0_63_neg:$src2),
+ (SBIWRdK i16:$src1, (imm0_63_neg:$src2))>;
+def : Pat<(add i16:$src1, imm:$src2),
+ (SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
+def : Pat<(addc i16:$src1, imm:$src2),
+ (SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
+def : Pat<(adde i16:$src1, imm:$src2),
+ (SBCIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
+
+def : Pat<(add i8:$src1, imm:$src2),
+ (SUBIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+def : Pat<(addc i8:$src1, imm:$src2),
+ (SUBIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+def : Pat<(adde i8:$src1, imm:$src2),
+ (SBCIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+
+// Calls.
+def : Pat<(AVRcall (i16 tglobaladdr:$dst)),
+ (CALLk tglobaladdr:$dst)>;
+def : Pat<(AVRcall (i16 texternalsym:$dst)),
+ (CALLk texternalsym:$dst)>;
+
+// `anyext`
+def : Pat<(i16 (anyext i8:$src)),
+ (INSERT_SUBREG (i16 (IMPLICIT_DEF)), i8:$src, sub_lo)>;
+
+// `trunc`
+def : Pat<(i8 (trunc i16:$src)),
+ (EXTRACT_SUBREG i16:$src, sub_lo)>;
+
+// sext_inreg
+def : Pat<(sext_inreg i16:$src, i8),
+ (SEXT (i8 (EXTRACT_SUBREG i16:$src, sub_lo)))>;
+
+// GlobalAddress
+def : Pat<(i16 (AVRWrapper tglobaladdr:$dst)),
+ (LDIWRdK tglobaladdr:$dst)>;
+def : Pat<(add i16:$src, (AVRWrapper tglobaladdr:$src2)),
+ (SUBIWRdK i16:$src, tglobaladdr:$src2)>;
+def : Pat<(i8 (load (AVRWrapper tglobaladdr:$dst))),
+ (LDSRdK tglobaladdr:$dst)>;
+def : Pat<(i16 (load (AVRWrapper tglobaladdr:$dst))),
+ (LDSWRdK tglobaladdr:$dst)>;
+def : Pat<(store i8:$src, (i16 (AVRWrapper tglobaladdr:$dst))),
+ (STSKRr tglobaladdr:$dst, i8:$src)>;
+def : Pat<(store i16:$src, (i16 (AVRWrapper tglobaladdr:$dst))),
+ (STSWKRr tglobaladdr:$dst, i16:$src)>;
+
+// BlockAddress
+def : Pat<(i16 (AVRWrapper tblockaddress:$dst)),
+ (LDIWRdK tblockaddress:$dst)>;
+
+// hi-reg truncation : trunc(int16 >> 8)
+//:FIXME: i think it's better to emit an extract subreg node in the DAG than
+// all this mess once we get optimal shift code
+// lol... I think so, too. [@agnat]
+def : Pat<(i8 (trunc (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr
+ (AVRlsr DREGS:$src)))))))))),
+ (EXTRACT_SUBREG DREGS:$src, sub_hi)>;
+
+// :FIXME: DAGCombiner produces an shl node after legalization from this sequence:
+// BR_JT -> (mul x, 2) -> (shl x, 1)
+def : Pat<(shl i16:$src1, (i8 1)),
+ (LSLWRd i16:$src1)>;
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp b/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp
new file mode 100644
index 000000000000..5553dc2da31b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp
@@ -0,0 +1,222 @@
+//===-- AVRInstrumentFunctions.cpp - Insert instrumentation for testing ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass takes a function and inserts calls to hook functions which are
+// told the name, arguments, and results of function calls.
+//
+// The hooks can do anything with the information given. It is possible to
+// send the data through a serial connection in order to run tests on
+// bare metal.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Module.h>
+
+using namespace llvm;
+
+#define AVR_INSTRUMENT_FUNCTIONS_NAME "AVR function instrumentation pass"
+
+namespace {
+
+// External symbols that we emit calls to.
+namespace symbols {
+
+#define SYMBOL_PREFIX "avr_instrumentation"
+
+ const StringRef PREFIX = SYMBOL_PREFIX; // Shared prefix of every hook symbol name.
+
+ // Hook called before the arguments are reported: void (i8 *functionName, i16 argCount);
+ const StringRef BEGIN_FUNCTION_SIGNATURE = SYMBOL_PREFIX "_begin_signature";
+ // Hook called after the arguments are reported: void (i8 *functionName, i16 argCount);
+ const StringRef END_FUNCTION_SIGNATURE = SYMBOL_PREFIX "_end_signature";
+
+#undef SYMBOL_PREFIX
+} // end of namespace symbols
+
+/// Function pass that surrounds functions with calls to the external
+/// instrumentation hooks.
+class AVRInstrumentFunctions : public FunctionPass {
+public:
+  /// Handed to the FunctionPass base class constructor.
+  static char ID;
+
+  AVRInstrumentFunctions() : FunctionPass(ID) {
+    PassRegistry &Registry = *PassRegistry::getPassRegistry();
+    initializeAVRInstrumentFunctionsPass(Registry);
+  }
+
+  StringRef getPassName() const override {
+    return AVR_INSTRUMENT_FUNCTIONS_NAME;
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+char AVRInstrumentFunctions::ID = 0;
+
+/// Emits a private constant global initialised with \p Str into the module
+/// that owns \p BB and appends to \p BB an inbounds GEP addressing the first
+/// element of that global.
+static Value *CreateStringPtr(BasicBlock &BB, StringRef Str) {
+  LLVMContext &Ctx = BB.getContext();
+  Module &M = *BB.getParent()->getParent();
+
+  Constant *Initializer = ConstantDataArray::getString(Ctx, Str);
+  auto *StrGlobal = new GlobalVariable(M, Initializer->getType(),
+                                       /* is a constant */ true,
+                                       GlobalValue::PrivateLinkage,
+                                       Initializer);
+
+  // Both GEP indices are the i8 constant zero.
+  Constant *Zero = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
+  return GetElementPtrInst::CreateInBounds(StrGlobal, {Zero, Zero}, "", &BB);
+}
+
+/// Returns the type tag used inside hook symbol names: "i<bits>" for integer
+/// types and "f<bits>" for floating point types. Every other type runs into
+/// llvm_unreachable.
+static std::string GetTypeName(Type &Ty) {
+  if (Ty.isFloatingPointTy())
+    return "f" + std::to_string(Ty.getPrimitiveSizeInBits());
+
+  if (auto *IntTy = dyn_cast<IntegerType>(&Ty))
+    return "i" + std::to_string(IntTy->getBitWidth());
+
+  llvm_unreachable("unknown return type");
+}
+
+/// Appends to \p BB a call to the signature hook named \p SymName.
+///
+/// The hook has type `void (i8 *, i16)` and receives the name of \p F and the
+/// number of arguments \p F takes.
+static void BuildSignatureCall(StringRef SymName, BasicBlock &BB, Function &F) {
+  LLVMContext &Ctx = F.getContext();
+  IntegerType *I16 = Type::getInt16Ty(Ctx);
+
+  Type *ParamTys[] = {Type::getInt8PtrTy(Ctx), I16};
+  FunctionType *HookTy =
+      FunctionType::get(Type::getVoidTy(Ctx), ParamTys, /*isVarArg=*/false);
+  Constant *Hook = F.getParent()->getOrInsertFunction(SymName, HookTy);
+
+  Value *NamePtr = CreateStringPtr(BB, F.getName());
+  Value *ArgCount = ConstantInt::get(I16, F.getArgumentList().size());
+
+  Value *HookArgs[] = {NamePtr, ArgCount};
+  CallInst::Create(Hook, HookArgs, "", &BB);
+}
+
+/// Appends to \p BB the call to the external hook that announces the start
+/// of the signature description of \p F.
+static void BuildBeginSignature(BasicBlock &BB, Function &F) {
+  BuildSignatureCall(symbols::BEGIN_FUNCTION_SIGNATURE, BB, F);
+}
+
+/// Appends to \p BB the call to the external hook that announces the end of
+/// the signature description of \p F.
+static void BuildEndSignature(BasicBlock &BB, Function &F) {
+  BuildSignatureCall(symbols::END_FUNCTION_SIGNATURE, BB, F);
+}
+
+/// Returns the name of the external hook that reports \p Arg: the common
+/// prefix, "_argument_", and the type tag of the argument.
+static std::string GetArgumentSymbolName(Argument &Arg) {
+  std::string SymbolName = symbols::PREFIX.str();
+  SymbolName += "_argument_";
+  SymbolName += GetTypeName(*Arg.getType());
+  return SymbolName;
+}
+
+/// Appends to \p BB a call to the hook matching the type of \p Arg.
+///
+/// The hook has type `void (i8 *, i8, <type of Arg>)` and receives the
+/// argument's name, its index and its value.
+static void BuildArgument(BasicBlock &BB, Argument &Arg) {
+  Function &F = *Arg.getParent();
+  LLVMContext &Ctx = F.getContext();
+  Type *I8 = Type::getInt8Ty(Ctx);
+
+  Type *ParamTys[] = {Type::getInt8PtrTy(Ctx), I8, Arg.getType()};
+  FunctionType *HookTy =
+      FunctionType::get(Type::getVoidTy(Ctx), ParamTys, /*isVarArg=*/false);
+  Constant *Hook =
+      F.getParent()->getOrInsertFunction(GetArgumentSymbolName(Arg), HookTy);
+
+  Value *NamePtr = CreateStringPtr(BB, Arg.getName());
+  Value *Index = ConstantInt::get(I8, Arg.getArgNo());
+
+  Value *HookArgs[] = {NamePtr, Index, &Arg};
+  CallInst::Create(Hook, HookArgs, "", &BB);
+}
+
+/// Appends to \p BB the complete signature report for \p F: the begin hook,
+/// one hook call per argument in declaration order, then the end hook.
+static void BuildSignature(BasicBlock &BB, Function &F) {
+  BuildBeginSignature(BB, F);
+
+  for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI)
+    BuildArgument(BB, *AI);
+
+  BuildEndSignature(BB, F);
+}
+
+/// Inserts a block named "instrumentation_entry" in front of the current
+/// entry block of \p F, fills it with the signature hooks and ends it with a
+/// branch to the previous entry block.
+static void BuildEntryBlock(Function &F) {
+  BasicBlock &OldEntry = F.getEntryBlock();
+
+  // Passing the old entry as the insertion point places the new block
+  // directly before it.
+  BasicBlock *HookBB = BasicBlock::Create(F.getContext(),
+                                          "instrumentation_entry", &F,
+                                          &OldEntry);
+  BuildSignature(*HookBB, F);
+
+  // Continue with the code the function originally started with.
+  BranchInst::Create(&OldEntry, HookBB);
+}
+
+/// Returns the name of the external hook that reports the result \p Val: the
+/// common prefix, "_result_", and the type tag of the value.
+static std::string GetReturnSymbolName(Value &Val) {
+  std::string SymbolName = symbols::PREFIX.str();
+  SymbolName += "_result_";
+  SymbolName += GetTypeName(*Val.getType());
+  return SymbolName;
+}
+
+/// If \p I is a return instruction, inserts a call to the result hook for the
+/// returned value directly in front of it. Other instructions are ignored.
+static void BuildExitHook(Instruction &I) {
+  auto *Ret = dyn_cast<ReturnInst>(&I);
+  if (!Ret)
+    return;
+
+  Function &F = *I.getParent()->getParent();
+  LLVMContext &Ctx = F.getContext();
+
+  Value *RetVal = Ret->getReturnValue();
+  assert(RetVal && "should only be instrumenting functions with return values");
+
+  // The hook takes the returned value as its only parameter.
+  Type *ParamTys[] = {RetVal->getType()};
+  FunctionType *HookTy =
+      FunctionType::get(Type::getVoidTy(Ctx), ParamTys, /*isVarArg=*/false);
+  Constant *Hook =
+      F.getParent()->getOrInsertFunction(GetReturnSymbolName(*RetVal), HookTy);
+
+  // Call the result hook just before the return.
+  Value *HookArgs[] = {RetVal};
+  CallInst::Create(Hook, HookArgs, "", &I);
+}
+
+/// Runs return hooks before all returns in a function.
+///
+/// A return instruction can only be the terminator of its block, so only the
+/// terminator of every block is handed to BuildExitHook, which ignores
+/// anything that is not a return.
+static void BuildExitHooks(Function &F) {
+  for (BasicBlock &BB : F) {
+    // getTerminator() yields null for a block without a terminator.
+    if (Instruction *Term = BB.getTerminator())
+      BuildExitHook(*Term);
+  }
+}
+
+/// Returns true when \p F produces a value; void functions have no result
+/// to report and are left alone.
+static bool ShouldInstrument(Function &F) {
+  Type *RetTy = F.getReturnType();
+  return !RetTy->isVoidTy();
+}
+
+/// Instruments \p F if it returns a value.
+///
+/// \returns true if \p F was modified, false if it was left untouched.
+bool AVRInstrumentFunctions::runOnFunction(Function &F) {
+  // Nothing is inserted into functions we do not instrument, so report the
+  // IR as unchanged instead of unconditionally claiming a modification.
+  if (!ShouldInstrument(F))
+    return false;
+
+  BuildEntryBlock(F);
+  BuildExitHooks(F);
+  return true;
+}
+
+} // end of anonymous namespace
+
+INITIALIZE_PASS(AVRInstrumentFunctions, "avr-instrument-functions",
+ AVR_INSTRUMENT_FUNCTIONS_NAME, false, false)
+
+namespace llvm {
+
+/// Returns a newly allocated instance of the AVR function instrumentation
+/// pass.
+FunctionPass *createAVRInstrumentFunctionsPass() {
+  return new AVRInstrumentFunctions();
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp
new file mode 100644
index 000000000000..342fe558813a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp
@@ -0,0 +1,100 @@
+//===-- AVRMCInstLower.cpp - Convert AVR MachineInstr to an MCInst --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower AVR MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRMCInstLower.h"
+
+#include "AVRInstrInfo.h"
+#include "MCTargetDesc/AVRMCExpr.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// Builds the expression operand that refers to \p Sym on behalf of \p MO.
+///
+/// A non-zero operand offset is added to the symbol reference (jump table
+/// operands are never asked for an offset). The MO_LO / MO_HI target flags
+/// wrap the result in a lo8 / hi8 AVR expression, negated when MO_NEG is set.
+/// Any other non-zero flag combination is unreachable.
+MCOperand AVRMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
+                                             MCSymbol *Sym) const {
+  const unsigned char Flags = MO.getTargetFlags();
+  const bool IsNegated = (Flags & AVRII::MO_NEG) != 0;
+
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
+
+  if (!MO.isJTI()) {
+    if (auto Offset = MO.getOffset())
+      Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx),
+                                     Ctx);
+  }
+
+  if (Flags & AVRII::MO_LO)
+    Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_LO8, Expr, IsNegated, Ctx);
+  else if (Flags & AVRII::MO_HI)
+    Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_HI8, Expr, IsNegated, Ctx);
+  else if (Flags != 0)
+    llvm_unreachable("Unknown target flag on symbol operand");
+
+  return MCOperand::createExpr(Expr);
+}
+
+/// Translates \p MI into \p OutMI: the opcode is copied unchanged and every
+/// operand is converted to its MCOperand counterpart. Implicit register
+/// operands and register masks are dropped.
+void AVRMCInstLower::lowerInstruction(const MachineInstr &MI,
+                                      MCInst &OutMI) const {
+  OutMI.setOpcode(MI.getOpcode());
+
+  for (const MachineOperand &MO : MI.operands()) {
+    MCOperand Lowered;
+
+    switch (MO.getType()) {
+    // Operands that never reach the MCInst.
+    case MachineOperand::MO_RegisterMask:
+      continue;
+
+    // Plain values.
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit())
+        continue;
+      Lowered = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      Lowered = MCOperand::createImm(MO.getImm());
+      break;
+
+    // Basic blocks become a bare reference to the block's symbol.
+    case MachineOperand::MO_MachineBasicBlock:
+      Lowered = MCOperand::createExpr(
+          MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+      break;
+
+    // Symbolic operands: look up the symbol and let lowerSymbolOperand apply
+    // the offset and target flags.
+    case MachineOperand::MO_GlobalAddress:
+      Lowered = lowerSymbolOperand(MO, Printer.getSymbol(MO.getGlobal()));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      Lowered = lowerSymbolOperand(
+          MO, Printer.GetExternalSymbolSymbol(MO.getSymbolName()));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      Lowered = lowerSymbolOperand(
+          MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      Lowered = lowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      Lowered = lowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
+      break;
+
+    default:
+      MI.dump();
+      llvm_unreachable("unknown operand type");
+    }
+
+    OutMI.addOperand(Lowered);
+  }
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRMCInstLower.h b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.h
new file mode 100644
index 000000000000..2e2d1014485e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.h
@@ -0,0 +1,43 @@
+//===-- AVRMCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MCINST_LOWER_H
+#define LLVM_AVR_MCINST_LOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class MachineInstr;
+class MachineOperand;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+
+/// Lowers `MachineInstr` objects into `MCInst` objects.
+class AVRMCInstLower {
+  MCContext &Ctx;
+  AsmPrinter &Printer;
+
+public:
+  AVRMCInstLower(MCContext &Ctx, AsmPrinter &Printer)
+      : Ctx(Ctx), Printer(Printer) {}
+
+  /// Builds the `MCOperand` that refers to \p Sym on behalf of operand \p MO.
+  MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  /// Lowers a `MachineInstr` into a `MCInst`.
+  void lowerInstruction(const MachineInstr &MI, MCInst &OutMI) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_MCINST_LOWER_H
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
new file mode 100644
index 000000000000..cf0c73576301
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
@@ -0,0 +1,69 @@
+//===-- AVRMachineFunctionInfo.h - AVR machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AVR-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MACHINE_FUNCTION_INFO_H
+#define LLVM_AVR_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// Contains AVR-specific information for each MachineFunction.
+///
+/// Every field starts out as false / zero, whichever constructor is used.
+class AVRMachineFunctionInfo : public MachineFunctionInfo {
+  /// Indicates if a register has been spilled by the register
+  /// allocator.
+  bool HasSpills = false;
+
+  /// Indicates if there are any fixed size allocas present.
+  /// Note that if there are only variable sized allocas this is set to false.
+  bool HasAllocas = false;
+
+  /// Indicates if arguments passed using the stack are being
+  /// used inside the function.
+  bool HasStackArgs = false;
+
+  /// Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize = 0;
+
+  /// FrameIndex for start of varargs area.
+  int VarArgsFrameIndex = 0;
+
+public:
+  AVRMachineFunctionInfo() = default;
+
+  /// \p MF is accepted but not inspected.
+  explicit AVRMachineFunctionInfo(MachineFunction &MF) {}
+
+  // Spills.
+  bool getHasSpills() const { return HasSpills; }
+  void setHasSpills(bool B) { HasSpills = B; }
+
+  // Fixed size allocas.
+  bool getHasAllocas() const { return HasAllocas; }
+  void setHasAllocas(bool B) { HasAllocas = B; }
+
+  // Stack passed arguments.
+  bool getHasStackArgs() const { return HasStackArgs; }
+  void setHasStackArgs(bool B) { HasStackArgs = B; }
+
+  // Callee-saved area size, in bytes.
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned Bytes) { CalleeSavedFrameSize = Bytes; }
+
+  // Start of the varargs area.
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+};
+
+} // end llvm namespace
+
+#endif // LLVM_AVR_MACHINE_FUNCTION_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
new file mode 100644
index 000000000000..48798bd4a1da
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -0,0 +1,266 @@
+//===-- AVRRegisterInfo.cpp - AVR Register Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRRegisterInfo.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+#include "AVR.h"
+#include "AVRInstrInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "AVRGenRegisterInfo.inc"
+
+namespace llvm {
+
+AVRRegisterInfo::AVRRegisterInfo() : AVRGenRegisterInfo(0) {}
+
+/// Returns the callee-saved register list for the calling convention of
+/// \p MF: the interrupt list for AVR_INTR / AVR_SIGNAL, the normal list
+/// otherwise.
+const uint16_t *
+AVRRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  switch (MF->getFunction()->getCallingConv()) {
+  case CallingConv::AVR_INTR:
+  case CallingConv::AVR_SIGNAL:
+    return CSR_Interrupts_SaveList;
+  default:
+    return CSR_Normal_SaveList;
+  }
+}
+
+/// Returns the register mask of the registers preserved across a call that
+/// uses calling convention \p CC: the interrupt mask for AVR_INTR /
+/// AVR_SIGNAL, the normal mask otherwise.
+const uint32_t *
+AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                      CallingConv::ID CC) const {
+  const bool IsInterruptCC =
+      CC == CallingConv::AVR_INTR || CC == CallingConv::AVR_SIGNAL;
+
+  if (IsInterruptCC)
+    return CSR_Interrupts_RegMask;
+
+  return CSR_Normal_RegMask;
+}
+
+/// Returns the set of registers that must not be allocated in \p MF.
+BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+
+  // Always reserved:
+  //  - r0 and r1 (and the pair r1:r0), the intermediate result registers;
+  //    the result of instructions like 'mul' is always stored there.
+  //  - the stack pointer, as bytes and as a pair.
+  const unsigned AlwaysReserved[] = {AVR::R0,  AVR::R1,  AVR::R1R0,
+                                     AVR::SPL, AVR::SPH, AVR::SP};
+  for (unsigned Reg : AlwaysReserved)
+    Reserved.set(Reg);
+
+  // The frame pointer registers r28 and r29 are only reserved if the
+  // function requires a frame pointer.
+  const AVRTargetMachine &TM =
+      static_cast<const AVRTargetMachine &>(MF.getTarget());
+  const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
+  if (TFI->hasFP(MF)) {
+    const unsigned FramePointerRegs[] = {AVR::R28, AVR::R29, AVR::R29R28};
+    for (unsigned Reg : FramePointerRegs)
+      Reserved.set(Reg);
+  }
+
+  return Reserved;
+}
+
+/// Maps \p RC to the widest class of the same value type: DREGS for classes
+/// holding i16, GPR8 for classes holding i8. Any other class is unreachable.
+const TargetRegisterClass *
+AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                           const MachineFunction &MF) const {
+  const bool Holds16Bit = RC->hasType(MVT::i16);
+  if (Holds16Bit)
+    return &AVR::DREGSRegClass;
+
+  const bool Holds8Bit = RC->hasType(MVT::i8);
+  if (Holds8Bit)
+    return &AVR::GPR8RegClass;
+
+  llvm_unreachable("Invalid register size");
+}
+
+/// Folds the immediate of \p MI into \p Offset and erases \p MI, provided
+/// \p MI is a SUBIWRdK / ADIWRdK whose destination is \p DstReg.
+///
+/// A SUBIWRdK immediate is subtracted from \p Offset, an ADIWRdK immediate is
+/// added. Any other instruction is left untouched.
+static void foldFrameOffset(MachineInstr &MI, int &Offset, unsigned DstReg) {
+  const int Opcode = MI.getOpcode();
+  const bool IsSub = Opcode == AVR::SUBIWRdK;
+  const bool IsAdd = Opcode == AVR::ADIWRdK;
+
+  // Only an add or a sub can carry an offset worth folding.
+  if (!IsSub && !IsAdd)
+    return;
+
+  // An instruction writing a different register is not related to this stack
+  // address computation.
+  if (MI.getOperand(0).getReg() != DstReg)
+    return;
+
+  const auto Imm = MI.getOperand(2).getImm();
+  Offset += IsSub ? -Imm : Imm;
+
+  // The immediate now lives in Offset, so the instruction is redundant.
+  MI.eraseFromParent();
+}
+
+/// Replaces the frame index operand of the instruction at \p II with an
+/// address based on the frame pointer pair r29:r28.
+///
+/// FRMIDX is expanded into a move of r29:r28 into the destination followed by
+/// an add of the offset. For every other instruction the frame index operand
+/// becomes r29:r28 and the operand after it becomes the displacement; when
+/// the displacement exceeds 63, r29:r28 is adjusted before and restored after
+/// the instruction.
+///
+/// \param II           Instruction that holds the frame index operand.
+/// \param SPAdj        Must be zero.
+/// \param FIOperandNum Index of the frame index operand; the operand that
+///                     follows it holds an immediate offset.
+/// \param RS           Unused.
+void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, unsigned FIOperandNum,
+                                          RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected SPAdj value");
+
+  MachineInstr &MI = *II;
+  DebugLoc dl = MI.getDebugLoc();
+  MachineBasicBlock &MBB = *MI.getParent();
+  const MachineFunction &MF = *MBB.getParent();
+  const AVRTargetMachine &TM =
+      static_cast<const AVRTargetMachine &>(MF.getTarget());
+  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  int Offset = MFI.getObjectOffset(FrameIndex);
+
+  // Add one to the offset because SP points to an empty slot.
+  Offset += MFI.getStackSize() - TFI->getOffsetOfLocalArea() + 1;
+  // Fold incoming offset.
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  // This is actually "load effective address" of the stack slot
+  // instruction. We have only two-address instructions, thus we need to
+  // expand it into move + add.
+  if (MI.getOpcode() == AVR::FRMIDX) {
+    MI.setDesc(TII.get(AVR::MOVWRdRr));
+    MI.getOperand(FIOperandNum).ChangeToRegister(AVR::R29R28, false);
+
+    assert(Offset > 0 && "Invalid offset");
+
+    // We need to materialize the offset via an add instruction.
+    unsigned Opcode;
+    unsigned DstReg = MI.getOperand(0).getReg();
+    assert(DstReg != AVR::R29R28 && "Dest reg cannot be the frame pointer");
+
+    // Generally, to load a frame address two add instructions are emitted that
+    // could get folded into a single one:
+    //  movw    r31:r30, r29:r28
+    //  adiw    r31:r30, 29
+    //  adiw    r31:r30, 16
+    // to:
+    //  movw    r31:r30, r29:r28
+    //  adiw    r31:r30, 45
+    //
+    // There is nothing to fold when this is the last instruction of the
+    // block, and the end iterator must not be dereferenced.
+    if (std::next(II) != MBB.end())
+      foldFrameOffset(*std::next(II), Offset, DstReg);
+
+    // Select the best opcode based on DstReg and the offset size.
+    switch (DstReg) {
+    case AVR::R25R24:
+    case AVR::R27R26:
+    case AVR::R31R30: {
+      if (isUInt<6>(Offset)) {
+        Opcode = AVR::ADIWRdK;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    }
+    default: {
+      // This opcode will get expanded into a pair of subi/sbci.
+      Opcode = AVR::SUBIWRdK;
+      Offset = -Offset;
+      break;
+    }
+    }
+
+    // The add goes right after the move.
+    MachineInstr *New = BuildMI(MBB, std::next(II), dl, TII.get(Opcode), DstReg)
+                            .addReg(DstReg, RegState::Kill)
+                            .addImm(Offset);
+    New->getOperand(3).setIsDead();
+
+    return;
+  }
+
+  // If the offset is too big we have to adjust and restore the frame pointer
+  // to materialize a valid load/store with displacement.
+  //:TODO: consider using only one adiw/sbiw chain for more than one frame index
+  if (Offset > 63) {
+    unsigned AddOpc = AVR::ADIWRdK, SubOpc = AVR::SBIWRdK;
+    int AddOffset = Offset - 63 + 1;
+
+    // For huge offsets where adiw/sbiw cannot be used use a pair of subi/sbci.
+    if ((Offset - 63 + 1) > 63) {
+      AddOpc = AVR::SUBIWRdK;
+      SubOpc = AVR::SUBIWRdK;
+      AddOffset = -AddOffset;
+    }
+
+    // It is possible that the spiller places this frame instruction in between
+    // a compare and branch, invalidating the contents of SREG set by the
+    // compare instruction because of the add/sub pairs. Conservatively save and
+    // restore SREG before and after each add/sub pair.
+    BuildMI(MBB, II, dl, TII.get(AVR::INRdA), AVR::R0).addImm(0x3f);
+
+    MachineInstr *New = BuildMI(MBB, II, dl, TII.get(AddOpc), AVR::R29R28)
+                            .addReg(AVR::R29R28, RegState::Kill)
+                            .addImm(AddOffset);
+    New->getOperand(3).setIsDead();
+
+    // Restore SREG.
+    BuildMI(MBB, std::next(II), dl, TII.get(AVR::OUTARr))
+        .addImm(0x3f)
+        .addReg(AVR::R0, RegState::Kill);
+
+    // No need to set SREG as dead here otherwise if the next instruction is a
+    // cond branch it will be using a dead register.
+    // Inserted at std::next(II) as well, so it ends up in front of the SREG
+    // restore above.
+    New = BuildMI(MBB, std::next(II), dl, TII.get(SubOpc), AVR::R29R28)
+              .addReg(AVR::R29R28, RegState::Kill)
+              .addImm(Offset - 63 + 1);
+
+    // The frame pointer has been advanced by Offset - 62 at this point.
+    Offset = 62;
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(AVR::R29R28, false);
+  assert(isUInt<6>(Offset) && "Offset is out of range");
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+/// Returns the register used as the frame base of \p MF: AVR::R28 (the Y
+/// pointer register) when the function has a frame pointer, AVR::SP
+/// otherwise.
+unsigned AVRRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const bool HasFramePointer = TFI->hasFP(MF);
+  return HasFramePointer ? AVR::R28 : AVR::SP;
+}
+
+const TargetRegisterClass *
+AVRRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ // FIXME: Currently we're using avr-gcc as a reference, so we restrict
+ // pointers to the Y and Z registers. avr-gcc's implementation of the memory
+ // constraint is buggy, though, so we could fix it and beat avr-gcc here ;-)
+ return &AVR::PTRDISPREGSRegClass;
+}
+
+/// Splits the 16-bit register \p Reg into its sub_lo register (\p LoReg) and
+/// its sub_hi register (\p HiReg). \p Reg must be a member of DREGS.
+void AVRRegisterInfo::splitReg(unsigned Reg, unsigned &LoReg,
+                               unsigned &HiReg) const {
+  assert(AVR::DREGSRegClass.contains(Reg) && "can only split 16-bit registers");
+
+  const unsigned Lo = getSubReg(Reg, AVR::sub_lo);
+  LoReg = Lo;
+
+  const unsigned Hi = getSubReg(Reg, AVR::sub_hi);
+  HiReg = Hi;
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h
new file mode 100644
index 000000000000..b97e32ea203f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h
@@ -0,0 +1,58 @@
+//===-- AVRRegisterInfo.h - AVR Register Information Impl -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_REGISTER_INFO_H
+#define LLVM_AVR_REGISTER_INFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "AVRGenRegisterInfo.inc"
+
+namespace llvm {
+
+/// Utilities relating to AVR registers.
+class AVRRegisterInfo : public AVRGenRegisterInfo {
+public:
+  AVRRegisterInfo();
+
+  /// Returns the callee-saved register list for the calling convention of
+  /// \p MF.
+  const uint16_t *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+
+  /// Returns the mask of registers preserved across a call using \p CC.
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID CC) const override;
+
+  /// Returns the registers that are not available for allocation in \p MF.
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                            const MachineFunction &MF) const override;
+
+  /// Stack Frame Processing Methods
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+
+  /// Splits a 16-bit `DREGS` register into the lo/hi register pair.
+  /// \param Reg   A 16-bit register to split.
+  /// \param LoReg Receives the `sub_lo` sub-register of \p Reg.
+  /// \param HiReg Receives the `sub_hi` sub-register of \p Reg.
+  void splitReg(unsigned Reg, unsigned &LoReg, unsigned &HiReg) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_REGISTER_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td
new file mode 100644
index 000000000000..32650fc66751
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -0,0 +1,216 @@
+//===-- AVRRegisterInfo.td - AVR Register defs -------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the AVR register file
+//===----------------------------------------------------------------------===//
+
+// Base class for every AVR register defined in this file: the 8-bit registers as well as the 16-bit pairs.
+class AVRReg<bits<16> num,
+ string name,
+ list<Register> subregs = [],
+ list<string> altNames = []>
+ : RegisterWithSubRegs<name, subregs>
+{
+ field bits<16> Num = num;
+
+ let HWEncoding = num; // The register number doubles as the hardware encoding.
+ let Namespace = "AVR";
+ let SubRegs = subregs;
+ let AltNames = altNames;
+}
+
+// Subregister indices.
+let Namespace = "AVR" in
+{
+ def sub_lo : SubRegIndex<8>;
+ def sub_hi : SubRegIndex<8, 8>;
+}
+
+let Namespace = "AVR" in {
+ def ptr : RegAltNameIndex;
+}
+
+
+//===----------------------------------------------------------------------===//
+// 8-bit general purpose registers
+//===----------------------------------------------------------------------===//
+
+def R0 : AVRReg<0, "r0">, DwarfRegNum<[0]>;
+def R1 : AVRReg<1, "r1">, DwarfRegNum<[1]>;
+def R2 : AVRReg<2, "r2">, DwarfRegNum<[2]>;
+def R3 : AVRReg<3, "r3">, DwarfRegNum<[3]>;
+def R4 : AVRReg<4, "r4">, DwarfRegNum<[4]>;
+def R5 : AVRReg<5, "r5">, DwarfRegNum<[5]>;
+def R6 : AVRReg<6, "r6">, DwarfRegNum<[6]>;
+def R7 : AVRReg<7, "r7">, DwarfRegNum<[7]>;
+def R8 : AVRReg<8, "r8">, DwarfRegNum<[8]>;
+def R9 : AVRReg<9, "r9">, DwarfRegNum<[9]>;
+def R10 : AVRReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : AVRReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : AVRReg<12, "r12">, DwarfRegNum<[12]>;
+def R13 : AVRReg<13, "r13">, DwarfRegNum<[13]>;
+def R14 : AVRReg<14, "r14">, DwarfRegNum<[14]>;
+def R15 : AVRReg<15, "r15">, DwarfRegNum<[15]>;
+def R16 : AVRReg<16, "r16">, DwarfRegNum<[16]>;
+def R17 : AVRReg<17, "r17">, DwarfRegNum<[17]>;
+def R18 : AVRReg<18, "r18">, DwarfRegNum<[18]>;
+def R19 : AVRReg<19, "r19">, DwarfRegNum<[19]>;
+def R20 : AVRReg<20, "r20">, DwarfRegNum<[20]>;
+def R21 : AVRReg<21, "r21">, DwarfRegNum<[21]>;
+def R22 : AVRReg<22, "r22">, DwarfRegNum<[22]>;
+def R23 : AVRReg<23, "r23">, DwarfRegNum<[23]>;
+def R24 : AVRReg<24, "r24">, DwarfRegNum<[24]>;
+def R25 : AVRReg<25, "r25">, DwarfRegNum<[25]>;
+def R26 : AVRReg<26, "r26">, DwarfRegNum<[26]>;
+def R27 : AVRReg<27, "r27">, DwarfRegNum<[27]>;
+def R28 : AVRReg<28, "r28">, DwarfRegNum<[28]>;
+def R29 : AVRReg<29, "r29">, DwarfRegNum<[29]>;
+def R30 : AVRReg<30, "r30">, DwarfRegNum<[30]>;
+def R31 : AVRReg<31, "r31">, DwarfRegNum<[31]>;
+def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>;
+def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>;
+
+let SubRegIndices = [sub_lo, sub_hi],
+CoveredBySubRegs = 1 in
+{
+ // 16 bit GPR pairs.
+ def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>;
+
+ // The pointer registers (X,Y,Z) are a special case because they
+ // are printed as a `high:low` pair when a DREG is expected,
+ // but printed using `X`, `Y`, `Z` when a pointer register is expected.
+ let RegAltNameIndices = [ptr] in {
+ def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
+ def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
+ def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
+ }
+ def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
+ def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
+ def R21R20 : AVRReg<20, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>;
+ def R19R18 : AVRReg<18, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>;
+ def R17R16 : AVRReg<16, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>;
+ def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
+ def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
+ def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
+ def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
+ def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
+ def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
+ def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
+ def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+//:TODO: use proper set instructions instead of using always "add"
+
+// Main 8-bit register class.
+def GPR8 : RegisterClass<"AVR", [i8], 8,
+ (
+ // Return value and argument registers.
+ add R24, R25, R18, R19, R20, R21, R22, R23,
+ // Scratch registers.
+ R30, R31, R26, R27,
+ // Callee saved registers.
+ R28, R29, R17, R16, R15, R14, R13, R12, R11, R10,
+ R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+ )>;
+
+// Simple lower registers r0..r15
+def GPR8lo : RegisterClass<"AVR", [i8], 8,
+ (
+ add R15, R14, R13, R12, R11, R10, R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+ )>;
+
+// 8-bit register class for instructions which take immediates.
+def LD8 : RegisterClass<"AVR", [i8], 8,
+ (
+ // Return value and arguments.
+ add R24, R25, R18, R19, R20, R21, R22, R23,
+ // Scratch registers.
+ R30, R31, R26, R27,
+ // Callee saved registers.
+ R28, R29, R17, R16
+ )>;
+
+// Simple lower registers r16..r23
+def LD8lo : RegisterClass<"AVR", [i8], 8,
+ (
+ add R23, R22, R21, R20, R19, R18, R17, R16
+ )>;
+
+// Main 16-bit pair register class.
+def DREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ // Return value and arguments.
+ add R25R24, R19R18, R21R20, R23R22,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28, R17R16, R15R14, R13R12, R11R10,
+ R9R8, R7R6, R5R4, R3R2, R1R0
+ )>;
+
+// 16-bit register class for immediate instructions.
+def DLDREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ // Return value and arguments.
+ add R25R24, R19R18, R21R20, R23R22,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28, R17R16
+ )>;
+
+// 16-bit register class for the adiw/sbiw instructions.
+def IWREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ // Return value and arguments.
+ add R25R24,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28
+ )>;
+
+// 16-bit register class for the ld and st instructions.
+// AKA X,Y, and Z
+def PTRREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ add R27R26, // X
+ R29R28, // Y
+ R31R30 // Z
+ ), ptr>;
+
+// 16-bit register class for the ldd and std instructions.
+// AKA Y and Z.
+def PTRDISPREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ add R31R30, R29R28
+ ), ptr>;
+
+// We have a bunch of instructions with an explicit Z register argument. We
+// model this using a register class containing only the Z register.
+// :TODO: Rename to 'ZREG'.
+def ZREGS : RegisterClass<"AVR", [i16], 8, (add R31R30)>;
+
+// Register class used for the stack read pseudo instruction.
+def GPRSP: RegisterClass<"AVR", [i16], 8, (add SP)>;
+
+//:TODO: if we remove this we get an error in tablegen
+//:TODO: this is just a hack, remove it once add16 works!
+// Status register.
+def SREG : AVRReg<14, "FLAGS">, DwarfRegNum<[88]>;
+def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)>
+{
+ let CopyCost = -1; // Don't allow copying of status registers
+}
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp b/contrib/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
new file mode 100644
index 000000000000..26dbcf77b452
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
@@ -0,0 +1,149 @@
+//===-- AVRRelaxMemOperations.cpp - Relax out of range loads/stores -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass which relaxes out of range memory operations into
+// equivalent operations which handle bigger addresses.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRInstrInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define AVR_RELAX_MEM_OPS_NAME "AVR memory operation relaxation pass"
+
+namespace {
+
+/// Machine function pass that rewrites memory operations whose displacement
+/// is out of range into an equivalent sequence of instructions.
+///
+/// The pass walks every instruction of every basic block and dispatches on
+/// the opcode to a matching relax<> specialisation.
+class AVRRelaxMem : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AVRRelaxMem() : MachineFunctionPass(ID) {
+    initializeAVRRelaxMemPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Runs the relaxation over \p MF; returns true if anything was changed.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return AVR_RELAX_MEM_OPS_NAME; }
+
+private:
+  typedef MachineBasicBlock Block;
+  typedef Block::iterator BlockIt;
+
+  /// Instruction info of the function being processed. It is assigned at the
+  /// start of runOnMachineFunction and is null before the first run.
+  const TargetInstrInfo *TII = nullptr;
+
+  /// Relaxes the instruction at \p MBBI, which must have opcode \p OP.
+  template <unsigned OP> bool relax(Block &MBB, BlockIt MBBI);
+
+  bool runOnBasicBlock(Block &MBB);
+  bool runOnInstruction(Block &MBB, BlockIt MBBI);
+
+  /// Inserts a new instruction with opcode \p Opcode in front of \p MBBI,
+  /// reusing the debug location of the instruction at \p MBBI.
+  MachineInstrBuilder buildMI(Block &MBB, BlockIt MBBI, unsigned Opcode) {
+    return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opcode));
+  }
+};
+
+// Definition of the static pass ID that the constructor hands to
+// MachineFunctionPass.
+char AVRRelaxMem::ID = 0;
+
+/// Entry point of the pass: caches the subtarget's instruction info and then
+/// processes every basic block of \p MF.
+///
+/// \returns true if any block reported a change.
+bool AVRRelaxMem::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget<AVRSubtarget>().getInstrInfo();
+
+  bool Changed = false;
+  for (Block &BB : MF) {
+    if (runOnBasicBlock(BB))
+      Changed = true;
+  }
+
+  return Changed;
+}
+
+/// Visits every instruction of \p MBB in order.
+///
+/// \returns true if any visit reported a change.
+bool AVRRelaxMem::runOnBasicBlock(Block &MBB) {
+  bool Changed = false;
+
+  // The successor is captured before an instruction is visited because the
+  // visit may unlink that instruction from the block.
+  for (BlockIt Cur = MBB.begin(), End = MBB.end(); Cur != End;) {
+    BlockIt Following = std::next(Cur);
+    Changed |= runOnInstruction(MBB, Cur);
+    Cur = Following;
+  }
+
+  return Changed;
+}
+
+/// Relaxes a STDWPtrQRr whose displacement is greater than 63.
+///
+/// Operand 0 is the pointer register, operand 1 the immediate displacement
+/// and operand 2 the source register. When the displacement exceeds 63 the
+/// instruction is replaced by: push pointer, adjust pointer by the
+/// displacement, store without displacement, pop pointer.
+///
+/// \returns true if the instruction was replaced, false if it was left alone.
+template <>
+bool AVRRelaxMem::relax<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+
+  MachineOperand &Ptr = MI.getOperand(0);
+  MachineOperand &Src = MI.getOperand(2);
+  int64_t Imm = MI.getOperand(1).getImm();
+
+  // Displacements up to 63 are left untouched.
+  if (Imm <= 63)
+    return false;
+
+  // We can definitely optimise this better.
+
+  // Push the previous state of the pointer register.
+  // This instruction must preserve the value.
+  buildMI(MBB, MBBI, AVR::PUSHWRr)
+      .addReg(Ptr.getReg());
+
+  // Add the immediate to the pointer register.
+  // NOTE(review): this uses the subtract-with-carry form with a negated
+  // immediate; confirm the carry flag is in a known state at this point.
+  buildMI(MBB, MBBI, AVR::SBCIWRdK)
+      .addReg(Ptr.getReg(), RegState::Define)
+      .addReg(Ptr.getReg())
+      .addImm(-Imm);
+
+  // Store the value in the source register to the address
+  // pointed to by the pointer register.
+  buildMI(MBB, MBBI, AVR::STWPtrRr)
+      .addReg(Ptr.getReg())
+      .addReg(Src.getReg(), getKillRegState(Src.isKill()));
+
+  // Pop the original state of the pointer register.
+  buildMI(MBB, MBBI, AVR::POPWRd)
+      .addReg(Ptr.getReg(), getKillRegState(Ptr.isKill()));
+
+  // Ptr and Src refer to operands of MI, so MI is only deleted once they are
+  // no longer needed. Erasing also releases the instruction instead of
+  // merely unlinking it.
+  MI.eraseFromParent();
+
+  // The block was modified; report that so the pass manager knows.
+  return true;
+}
+
+/// Dispatches the instruction at \p MBBI to the relax<> specialisation for
+/// its opcode.
+///
+/// \returns the specialisation's result, or false for opcodes that have none.
+bool AVRRelaxMem::runOnInstruction(Block &MBB, BlockIt MBBI) {
+  switch (MBBI->getOpcode()) {
+  case AVR::STDWPtrQRr:
+    return relax<AVR::STDWPtrQRr>(MBB, MBBI);
+  default:
+    return false;
+  }
+}
+
+} // end of anonymous namespace
+
+// Registers the pass with the argument string "avr-relax-mem" and the
+// descriptive name AVR_RELAX_MEM_OPS_NAME.
+INITIALIZE_PASS(AVRRelaxMem, "avr-relax-mem",
+ AVR_RELAX_MEM_OPS_NAME, false, false)
+
+namespace llvm {
+
+// Factory: returns a newly allocated AVRRelaxMem pass.
+FunctionPass *createAVRRelaxMemPass() { return new AVRRelaxMem(); }
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h
new file mode 100644
index 000000000000..6474c8779330
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h
@@ -0,0 +1,28 @@
+//===-- AVRSelectionDAGInfo.h - AVR SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AVR subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_SELECTION_DAG_INFO_H
+#define LLVM_AVR_SELECTION_DAG_INFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+/// Holds information about the AVR instruction selection DAG.
+///
+/// The class body is empty: all behaviour is inherited unchanged from
+/// SelectionDAGTargetInfo.
+class AVRSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_SELECTION_DAG_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp b/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp
new file mode 100644
index 000000000000..c228d051d771
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp
@@ -0,0 +1,47 @@
+//===-- AVRSubtarget.cpp - AVR Subtarget Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AVR specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRSubtarget.h"
+
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include "AVR.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#define DEBUG_TYPE "avr-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "AVRGenSubtargetInfo.inc"
+
+namespace llvm {
+
+/// Creates the subtarget for the given triple, CPU and feature string.
+///
+/// Every feature flag starts out false and the ELF architecture starts out
+/// as 0; ParseSubtargetFeatures then applies \p CPU and \p FS.
+AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
+                           const std::string &FS, AVRTargetMachine &TM)
+    : AVRGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(),
+      TLInfo(TM), TSInfo(),
+
+      // Subtarget features
+      m_hasSRAM(false), m_hasJMPCALL(false), m_hasIJMPCALL(false),
+      m_hasEIJMPCALL(false), m_hasADDSUBIW(false), m_hasSmallStack(false),
+      m_hasMOVW(false), m_hasLPM(false), m_hasLPMX(false), m_hasELPM(false),
+      m_hasELPMX(false), m_hasSPM(false), m_hasSPMX(false), m_hasDES(false),
+      m_supportsRMW(false), m_supportsMultiplication(false), m_hasBREAK(false),
+      // ELFArch is an unsigned value, not a flag, so it is initialised with
+      // an integer literal.
+      m_hasTinyEncoding(false), ELFArch(0), m_FeatureSetDummy(false) {
+  // Parse features string.
+  ParseSubtargetFeatures(CPU, FS);
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRSubtarget.h b/contrib/llvm/lib/Target/AVR/AVRSubtarget.h
new file mode 100644
index 000000000000..a37849c3f3f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRSubtarget.h
@@ -0,0 +1,119 @@
+//===-- AVRSubtarget.h - Define Subtarget for the AVR -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AVR specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_SUBTARGET_H
+#define LLVM_AVR_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "AVRFrameLowering.h"
+#include "AVRISelLowering.h"
+#include "AVRInstrInfo.h"
+#include "AVRSelectionDAGInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AVRGenSubtargetInfo.inc"
+
+namespace llvm {
+
+/// A specific AVR target MCU.
+///
+/// Owns the instruction info, frame lowering, target lowering and
+/// selection-DAG info objects, and exposes one boolean getter per subtarget
+/// feature flag.
+class AVRSubtarget : public AVRGenSubtargetInfo {
+public:
+ //! Creates an AVR subtarget.
+ //! \param TT The target triple.
+ //! \param CPU The CPU to target.
+ //! \param FS The feature string.
+ //! \param TM The target machine.
+ AVRSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+ AVRTargetMachine &TM);
+
+ // Accessors for the owned lowering objects. The register info is obtained
+ // from the instruction info rather than stored separately.
+ const AVRInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; }
+ const AVRTargetLowering *getTargetLowering() const override { return &TLInfo; }
+ const AVRSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; }
+ const AVRRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); }
+
+ /// Parses a subtarget feature string, setting appropriate options.
+ /// \note Definition of function is auto generated by `tblgen`.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ // Subtarget feature getters.
+ // See AVR.td for details.
+ bool hasSRAM() const { return m_hasSRAM; }
+ bool hasJMPCALL() const { return m_hasJMPCALL; }
+ bool hasIJMPCALL() const { return m_hasIJMPCALL; }
+ bool hasEIJMPCALL() const { return m_hasEIJMPCALL; }
+ bool hasADDSUBIW() const { return m_hasADDSUBIW; }
+ bool hasSmallStack() const { return m_hasSmallStack; }
+ bool hasMOVW() const { return m_hasMOVW; }
+ bool hasLPM() const { return m_hasLPM; }
+ bool hasLPMX() const { return m_hasLPMX; }
+ bool hasELPM() const { return m_hasELPM; }
+ bool hasELPMX() const { return m_hasELPMX; }
+ bool hasSPM() const { return m_hasSPM; }
+ bool hasSPMX() const { return m_hasSPMX; }
+ bool hasDES() const { return m_hasDES; }
+ bool supportsRMW() const { return m_supportsRMW; }
+ bool supportsMultiplication() const { return m_supportsMultiplication; }
+ bool hasBREAK() const { return m_hasBREAK; }
+ bool hasTinyEncoding() const { return m_hasTinyEncoding; }
+
+ /// Gets the ELF architecture for the e_flags field
+ /// of an ELF object file.
+ unsigned getELFArch() const {
+ // A value of zero is treated as "never assigned" and trips the assertion.
+ assert(ELFArch != 0 &&
+ "every device must have an associate ELF architecture");
+ return ELFArch;
+ }
+
+private:
+ AVRInstrInfo InstrInfo;
+ AVRFrameLowering FrameLowering;
+ AVRTargetLowering TLInfo;
+ AVRSelectionDAGInfo TSInfo;
+
+ // Subtarget feature settings
+ // See AVR.td for details.
+ bool m_hasSRAM;
+ bool m_hasJMPCALL;
+ bool m_hasIJMPCALL;
+ bool m_hasEIJMPCALL;
+ bool m_hasADDSUBIW;
+ bool m_hasSmallStack;
+ bool m_hasMOVW;
+ bool m_hasLPM;
+ bool m_hasLPMX;
+ bool m_hasELPM;
+ bool m_hasELPMX;
+ bool m_hasSPM;
+ bool m_hasSPMX;
+ bool m_hasDES;
+ bool m_supportsRMW;
+ bool m_supportsMultiplication;
+ bool m_hasBREAK;
+ bool m_hasTinyEncoding;
+
+ /// The ELF e_flags architecture.
+ unsigned ELFArch;
+
+ // Dummy member, used by FeatureSet's. We cannot have a SubtargetFeature with
+ // no variable, so we instead bind pseudo features to this variable.
+ bool m_FeatureSetDummy;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
new file mode 100644
index 000000000000..fb3262916b4f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -0,0 +1,118 @@
+//===-- AVRTargetMachine.cpp - Define TargetMachine for AVR ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AVR specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRTargetMachine.h"
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include "AVRTargetObjectFile.h"
+#include "AVR.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+namespace llvm {
+
+// Data layout string handed to the LLVMTargetMachine constructor.
+// NOTE(review): read with the LLVM LangRef data-layout syntax this looks like
+// little-endian ("e"), 16-bit pointers ("p:16:16:16") and 8-bit native
+// integers ("n8") - confirm against the LangRef before editing.
+static const char *AVRDataLayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-n8";
+
+/// Normalises a CPU name: an empty name and "generic" both select "avr2";
+/// any other name is returned unchanged.
+static StringRef getCPU(StringRef CPU) {
+  const bool UseDefault = CPU.empty() || CPU == "generic";
+  return UseDefault ? StringRef("avr2") : CPU;
+}
+
+/// Picks the relocation model: the caller's choice when one was supplied,
+/// Reloc::Static otherwise.
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::Static;
+
+  return *RM;
+}
+
+/// Builds the target machine: forwards the AVR data layout, the normalised
+/// CPU name and the effective relocation model to LLVMTargetMachine, creates
+/// the subtarget, installs the AVR object-file lowering and initialises the
+/// assembly info.
+AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(
+ T, AVRDataLayout, TT,
+ getCPU(CPU), FS, Options, getEffectiveRelocModel(RM), CM, OL),
+ SubTarget(TT, getCPU(CPU), FS, *this) {
+ // TLOF is assigned here in the body, i.e. only after SubTarget has been
+ // constructed.
+ this->TLOF = make_unique<AVRTargetObjectFile>();
+ initAsmInfo();
+}
+
+namespace {
+/// AVR Code Generator Pass Configuration Options.
+///
+/// Overrides three TargetPassConfig hooks to insert AVR passes at
+/// instruction selection, before register allocation and before the second
+/// scheduling pass.
+class AVRPassConfig : public TargetPassConfig {
+public:
+ AVRPassConfig(AVRTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ // Returns the owning target machine typed as AVRTargetMachine.
+ AVRTargetMachine &getAVRTargetMachine() const {
+ return getTM<AVRTargetMachine>();
+ }
+
+ bool addInstSelector() override;
+ void addPreSched2() override;
+ void addPreRegAlloc() override;
+};
+} // namespace
+
+// Returns a newly allocated AVRPassConfig bound to this target machine and
+// the given pass manager.
+TargetPassConfig *AVRTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new AVRPassConfig(this, PM);
+}
+
+/// C entry point that registers the AVR target machine and initialises the
+/// AVR-specific passes with the global pass registry.
+extern "C" void LLVMInitializeAVRTarget() {
+  // Register the target machine for the AVR target.
+  RegisterTargetMachine<AVRTargetMachine> Registration(getTheAVRTarget());
+
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+  initializeAVRExpandPseudoPass(Registry);
+  initializeAVRInstrumentFunctionsPass(Registry);
+  initializeAVRRelaxMemPass(Registry);
+}
+
+// Returns the single subtarget owned by this target machine.
+const AVRSubtarget *AVRTargetMachine::getSubtargetImpl() const {
+ return &SubTarget;
+}
+
+// Per-function variant: the function argument is ignored and the same
+// subtarget is returned for every function.
+const AVRSubtarget *AVRTargetMachine::getSubtargetImpl(const Function &) const {
+ return &SubTarget;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+// Adds the AVR instruction selector followed by the frame analyzer pass.
+// Always returns false.
+bool AVRPassConfig::addInstSelector() {
+ // Install an instruction selector.
+ addPass(createAVRISelDag(getAVRTargetMachine(), getOptLevel()));
+ // Create the frame analyzer pass used by the PEI pass.
+ addPass(createAVRFrameAnalyzerPass());
+
+ return false;
+}
+
+// Adds the passes that must run before register allocation.
+void AVRPassConfig::addPreRegAlloc() {
+ // Create the dynalloc SP save/restore pass to handle variable sized allocas.
+ addPass(createAVRDynAllocaSRPass());
+}
+
+// Adds the passes that run before the second scheduling pass: memory
+// operation relaxation first, then pseudo instruction expansion.
+void AVRPassConfig::addPreSched2() {
+ addPass(createAVRRelaxMemPass());
+ addPass(createAVRExpandPseudoPass());
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h
new file mode 100644
index 000000000000..10345193d14a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h
@@ -0,0 +1,51 @@
+//===-- AVRTargetMachine.h - Define TargetMachine for AVR -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AVR specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_MACHINE_H
+#define LLVM_AVR_TARGET_MACHINE_H
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "AVRFrameLowering.h"
+#include "AVRISelLowering.h"
+#include "AVRInstrInfo.h"
+#include "AVRSelectionDAGInfo.h"
+#include "AVRSubtarget.h"
+
+namespace llvm {
+
+/// A generic AVR implementation.
+///
+/// Owns one AVRSubtarget and one object-file lowering instance.
+class AVRTargetMachine : public LLVMTargetMachine {
+public:
+ AVRTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL);
+
+ const AVRSubtarget *getSubtargetImpl() const;
+ const AVRSubtarget *getSubtargetImpl(const Function &) const override;
+
+ // Returns the object-file lowering held in TLOF; ownership stays here.
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return this->TLOF.get();
+ }
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+private:
+ // TLOF is declared before SubTarget, so it is constructed first.
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ AVRSubtarget SubTarget;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_TARGET_MACHINE_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
new file mode 100644
index 000000000000..af14d9292f27
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
@@ -0,0 +1,41 @@
+//===-- AVRTargetObjectFile.cpp - AVR Object Files ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRTargetObjectFile.h"
+
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+
+#include "AVR.h"
+
+namespace llvm {
+// Runs the base-class initialisation and then creates the ".progmem.data"
+// ELF section (SHT_PROGBITS, SHF_ALLOC) used by SelectSectionForGlobal.
+void AVRTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
+ Base::Initialize(Ctx, TM);
+ ProgmemDataSection =
+ Ctx.getELFSection(".progmem.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+}
+
+/// Chooses the output section for \p GO.
+///
+/// A global that lives in program memory and carries no user-assigned
+/// section goes into the progmem data section; everything else is placed by
+/// the generic ELF logic of the base class.
+MCSection *
+AVRTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO,
+                                            SectionKind Kind,
+                                            const TargetMachine &TM) const {
+  const bool UsesDefaultPlacement =
+      !AVR::isProgramMemoryAddress(GO) || GO->hasSection();
+
+  if (UsesDefaultPlacement)
+    return Base::SelectSectionForGlobal(GO, Kind, TM);
+
+  return ProgmemDataSection;
+}
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h
new file mode 100644
index 000000000000..ba91036fd64c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h
@@ -0,0 +1,33 @@
+//===-- AVRTargetObjectFile.h - AVR Object Info -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_OBJECT_FILE_H
+#define LLVM_AVR_TARGET_OBJECT_FILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+/// Lowering for an AVR ELF32 object file.
+///
+/// Extends the generic ELF lowering with a dedicated section for globals
+/// that live in program memory.
+class AVRTargetObjectFile : public TargetLoweringObjectFileELF {
+ typedef TargetLoweringObjectFileELF Base;
+
+public:
+ void Initialize(MCContext &ctx, const TargetMachine &TM) override;
+
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
+
+private:
+ // Set by Initialize(); not initialised before that call.
+ MCSection *ProgmemDataSection;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_TARGET_OBJECT_FILE_H
diff --git a/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
new file mode 100644
index 000000000000..5b0398c0ca34
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -0,0 +1,631 @@
+//===---- AVRAsmParser.cpp - Parse AVR assembly to MCInst instructions ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRRegisterInfo.h"
+#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include <sstream>
+
+#define DEBUG_TYPE "avr-asm-parser"
+
+namespace llvm {
+
+/// Parses AVR assembly from a stream.
+///
+/// Implements the MCTargetAsmParser hooks and a set of private helpers that
+/// parse registers, register pairs, plain expressions and modifier
+/// expressions into AVROperand objects.
+class AVRAsmParser : public MCTargetAsmParser {
+ const MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+ const MCRegisterInfo *MRI;
+
+ // Declarations produced by TableGen for the instruction matcher.
+#define GET_ASSEMBLER_HEADER
+#include "AVRGenAsmMatcher.inc"
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+
+ bool ParseDirective(AsmToken directiveID) override;
+
+ OperandMatchResultTy parseMemriOperand(OperandVector &Operands);
+
+ // Operand parsing helpers. The bool-returning ones return false on success
+ // and true on failure; the int-returning ones return a register number or
+ // AVR::NoRegister.
+ bool parseOperand(OperandVector &Operands);
+ int parseRegisterName(unsigned (*matchFn)(StringRef));
+ int parseRegisterName();
+ int parseRegister();
+ bool tryParseRegisterOperand(OperandVector &Operands);
+ bool tryParseExpression(OperandVector &Operands);
+ bool tryParseRelocExpression(OperandVector &Operands);
+ void eatComma();
+
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
+
+ // Maps Reg to the DREGS super-register that contains it at sub-register
+ // index From (sub_lo by default).
+ unsigned toDREG(unsigned Reg, unsigned From = AVR::sub_lo) {
+ MCRegisterClass const *Class = &AVRMCRegisterClasses[AVR::DREGSRegClassID];
+ return MRI->getMatchingSuperReg(Reg, From, Class);
+ }
+
+ // Helpers used by MatchAndEmitInstruction for the individual match results.
+ bool emit(MCInst &Instruction, SMLoc const &Loc, MCStreamer &Out) const;
+ bool invalidOperand(SMLoc const &Loc, OperandVector const &Operands,
+ uint64_t const &ErrorInfo);
+ bool missingFeature(SMLoc const &Loc, uint64_t const &ErrorInfo);
+
+public:
+ AVRAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, STI), STI(STI), Parser(Parser) {
+ MCAsmParserExtension::Initialize(Parser);
+ MRI = getContext().getRegisterInfo();
+
+ // Derive the matcher's available-feature mask from the subtarget features.
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+
+ MCAsmParser &getParser() const { return Parser; }
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+};
+
+/// A parsed AVR assembly operand.
+///
+/// An operand is exactly one of: a token, a register, an immediate
+/// expression, or a "memri" register-plus-immediate pair. Kind records which
+/// one; the payload lives in the Tok / RegImm union.
+class AVROperand : public MCParsedAsmOperand {
+  typedef MCParsedAsmOperand Base;
+  enum KindTy { k_Immediate, k_Register, k_Token, k_Memri } Kind;
+
+public:
+  /// Token operand; start and end location are both \p S.
+  AVROperand(StringRef Tok, SMLoc const &S)
+      : Base(), Kind(k_Token), Tok(Tok), Start(S), End(S) {}
+  /// Register operand.
+  AVROperand(unsigned Reg, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Register), RegImm({Reg, nullptr}), Start(S), End(E) {}
+  /// Immediate operand.
+  AVROperand(MCExpr const *Imm, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Immediate), RegImm({0, Imm}), Start(S), End(E) {}
+  /// Register-plus-immediate operand.
+  AVROperand(unsigned Reg, MCExpr const *Imm, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Memri), RegImm({Reg, Imm}), Start(S), End(E) {}
+
+  struct RegisterImmediate {
+    unsigned Reg;
+    MCExpr const *Imm;
+  };
+  // Tok is active for k_Token, RegImm for every other kind.
+  union {
+    StringRef Tok;
+    RegisterImmediate RegImm;
+  };
+
+  SMLoc Start, End;
+
+public:
+  /// Appends the register as a single operand of \p Inst.
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Register && "Unexpected operand kind");
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  /// Appends \p Expr to \p Inst: a null expression becomes the immediate 0,
+  /// a constant expression becomes its value, anything else stays symbolic.
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediate when possible
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  /// Appends the immediate as a single operand of \p Inst.
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Immediate && "Unexpected operand kind");
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCExpr *Expr = getImm();
+    addExpr(Inst, Expr);
+  }
+
+  /// Adds the contained reg+imm operand to an instruction.
+  void addMemriOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Memri && "Unexpected operand kind");
+    assert(N == 2 && "Invalid number of operands");
+
+    Inst.addOperand(MCOperand::createReg(getReg()));
+    addExpr(Inst, getImm());
+  }
+
+  bool isReg() const override { return Kind == k_Register; }
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isToken() const override { return Kind == k_Token; }
+  bool isMem() const override { return Kind == k_Memri; }
+  bool isMemri() const { return Kind == k_Memri; }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return Tok;
+  }
+
+  unsigned getReg() const override {
+    assert((Kind == k_Register || Kind == k_Memri) && "Invalid access!");
+
+    return RegImm.Reg;
+  }
+
+  const MCExpr *getImm() const {
+    assert((Kind == k_Immediate || Kind == k_Memri) && "Invalid access!");
+    return RegImm.Imm;
+  }
+
+  static std::unique_ptr<AVROperand> CreateToken(StringRef Str, SMLoc S) {
+    return make_unique<AVROperand>(Str, S);
+  }
+
+  static std::unique_ptr<AVROperand> CreateReg(unsigned RegNum, SMLoc S,
+                                               SMLoc E) {
+    return make_unique<AVROperand>(RegNum, S, E);
+  }
+
+  static std::unique_ptr<AVROperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                               SMLoc E) {
+    return make_unique<AVROperand>(Val, S, E);
+  }
+
+  static std::unique_ptr<AVROperand>
+  CreateMemri(unsigned RegNum, const MCExpr *Val, SMLoc S, SMLoc E) {
+    return make_unique<AVROperand>(RegNum, Val, S, E);
+  }
+
+  // The make* functions change the kind and payload of an existing operand
+  // in place; the source locations are left untouched.
+  void makeToken(StringRef Token) {
+    Kind = k_Token;
+    Tok = Token;
+  }
+
+  void makeReg(unsigned RegNo) {
+    Kind = k_Register;
+    RegImm = {RegNo, nullptr};
+  }
+
+  void makeImm(MCExpr const *Ex) {
+    Kind = k_Immediate;
+    RegImm = {0, Ex};
+  }
+
+  void makeMemri(unsigned RegNo, MCExpr const *Imm) {
+    Kind = k_Memri;
+    RegImm = {RegNo, Imm};
+  }
+
+  SMLoc getStartLoc() const override { return Start; }
+  SMLoc getEndLoc() const override { return End; }
+
+  /// Writes a one-line, newline-terminated description of the operand to
+  /// \p O.
+  void print(raw_ostream &O) const override {
+    switch (Kind) {
+    case k_Token:
+      O << "Token: \"" << getToken() << "\"";
+      break;
+    case k_Register:
+      O << "Register: " << getReg();
+      break;
+    case k_Immediate:
+      O << "Immediate: \"" << *getImm() << "\"";
+      break;
+    case k_Memri: {
+      // The register number and the immediate are always joined by '+'.
+      O << "Memri: \"" << getReg() << '+' << *getImm() << "\"";
+      break;
+    }
+    }
+    O << "\n";
+  }
+};
+
+// Auto-generated Match Functions
+
+/// Maps from the set of all register names to a register number.
+/// \note Generated by TableGen.
+static unsigned MatchRegisterName(StringRef Name);
+
+/// Maps from the set of all alternative registernames to a register number.
+/// \note Generated by TableGen.
+static unsigned MatchRegisterAltName(StringRef Name);
+
+/// Reports an operand-related match failure.
+///
+/// \param Loc       Location of the instruction; used when no better
+///                  location is available.
+/// \param Operands  The parsed operands of the instruction.
+/// \param ErrorInfo Index of the offending operand. An all-ones value is
+///                  treated as "no specific operand".
+/// \returns the result of Error().
+bool AVRAsmParser::invalidOperand(SMLoc const &Loc,
+                                  OperandVector const &Operands,
+                                  uint64_t const &ErrorInfo) {
+  SMLoc ErrorLoc = Loc;
+  char const *Diag = nullptr;
+
+  // ErrorInfo is 64 bits wide, so the sentinel has to be built from a 64-bit
+  // literal. ~0U would be zero-extended to 0x00000000FFFFFFFF and would never
+  // compare equal to an all-ones ErrorInfo, which was then misreported as
+  // "too few operands".
+  if (ErrorInfo != ~0ULL) {
+    if (ErrorInfo >= Operands.size()) {
+      Diag = "too few operands for instruction.";
+    } else {
+      AVROperand const &Op =
+          static_cast<AVROperand const &>(*Operands[ErrorInfo]);
+
+      // TODO: See if we can do a better error than just "invalid ...".
+      if (Op.getStartLoc() != SMLoc()) {
+        ErrorLoc = Op.getStartLoc();
+      }
+    }
+  }
+
+  if (!Diag) {
+    Diag = "invalid operand for instruction";
+  }
+
+  return Error(ErrorLoc, Diag);
+}
+
+// Reports that the matched instruction needs a CPU feature that is not
+// enabled. ErrorInfo is not used.
+bool AVRAsmParser::missingFeature(llvm::SMLoc const &Loc,
+ uint64_t const &ErrorInfo) {
+ return Error(Loc, "instruction requires a CPU feature not currently enabled");
+}
+
+// Stamps Inst with the source location Loc and hands it to the streamer.
+// Always returns false.
+bool AVRAsmParser::emit(MCInst &Inst, SMLoc const &Loc, MCStreamer &Out) const {
+ Inst.setLoc(Loc);
+ Out.EmitInstruction(Inst, STI);
+
+ return false;
+}
+
+/// Matches the parsed operands against the instruction table and either
+/// emits the resulting instruction or reports why matching failed.
+///
+/// \returns false when the instruction was emitted, true otherwise. Match
+/// results without a dedicated case return true without a diagnostic.
+bool AVRAsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  MCInst Matched;
+  const unsigned Result =
+      MatchInstructionImpl(Operands, Matched, ErrorInfo, MatchingInlineAsm);
+
+  switch (Result) {
+  case Match_Success:
+    return emit(Matched, Loc, Out);
+  case Match_MissingFeature:
+    return missingFeature(Loc, ErrorInfo);
+  case Match_InvalidOperand:
+    return invalidOperand(Loc, Operands, ErrorInfo);
+  case Match_MnemonicFail:
+    return Error(Loc, "invalid instruction");
+  default:
+    return true;
+  }
+}
+
+/// Looks up the current token as a register name with \p matchFn.
+///
+/// The token is tried verbatim first, then lower-cased, then upper-cased.
+/// \returns the first match, or AVR::NoRegister when none of them matches.
+int AVRAsmParser::parseRegisterName(unsigned (*matchFn)(StringRef)) {
+  const StringRef Spelling = Parser.getTok().getString();
+
+  int Found = matchFn(Spelling);
+  if (Found != AVR::NoRegister)
+    return Found;
+
+  // GCC accepts register names in any case. The AVR register definitions are
+  // either entirely lower case or entirely upper case, never mixed, and we
+  // keep their original spelling, so both foldings have to be tried.
+  Found = matchFn(Spelling.lower());
+  if (Found != AVR::NoRegister)
+    return Found;
+
+  return matchFn(Spelling.upper());
+}
+
+/// Looks up the current token as a register: primary names are tried first,
+/// alternative names second.
+///
+/// \returns the register number, or AVR::NoRegister if neither table matches.
+int AVRAsmParser::parseRegisterName() {
+  const int Primary = parseRegisterName(&MatchRegisterName);
+  if (Primary != AVR::NoRegister)
+    return Primary;
+
+  return parseRegisterName(&MatchRegisterAltName);
+}
+
+/// Parses either a single register name or a "<high>:<low>" register pair.
+///
+/// For a pair, the high register and the colon are consumed and the low
+/// register is converted to its 16-bit super-register. The final register
+/// token itself is left for the caller to consume.
+///
+/// \returns the register number, or AVR::NoRegister on failure.
+int AVRAsmParser::parseRegister() {
+  if (!Parser.getTok().is(AsmToken::Identifier))
+    return AVR::NoRegister;
+
+  // Plain register: no colon follows the identifier.
+  if (!Parser.getLexer().peekTok().is(AsmToken::Colon))
+    return parseRegisterName();
+
+  // Register pair syntax: eat the high (odd) register and the colon.
+  Parser.Lex();
+  Parser.Lex();
+
+  if (!Parser.getTok().is(AsmToken::Identifier))
+    return AVR::NoRegister;
+
+  // Convert lower (even) register to DREG
+  return toDREG(parseRegisterName());
+}
+
+/// Tries to parse a register (or register pair) and append it to
+/// \p Operands.
+///
+/// \returns false on success, true if no register could be parsed.
+bool AVRAsmParser::tryParseRegisterOperand(OperandVector &Operands) {
+  const int Reg = parseRegister();
+  if (Reg == AVR::NoRegister)
+    return true;
+
+  // The current token is the (last) register name; use its extent as the
+  // operand's source range before consuming it.
+  AsmToken const &Tok = Parser.getTok();
+  Operands.push_back(AVROperand::CreateReg(Reg, Tok.getLoc(), Tok.getEndLoc()));
+  Parser.Lex();
+
+  return false;
+}
+
+// Tries to parse an expression operand and append it to Operands as an
+// immediate. A modifier expression is attempted first; otherwise a generic
+// expression is parsed. Returns false on success and true on failure.
+bool AVRAsmParser::tryParseExpression(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+
+ // On success the modifier parser has already pushed the operand.
+ if (!tryParseRelocExpression(Operands))
+ return false;
+
+ if ((Parser.getTok().getKind() == AsmToken::Plus ||
+ Parser.getTok().getKind() == AsmToken::Minus) &&
+ Parser.getLexer().peekTok().getKind() == AsmToken::Identifier) {
+ // Don't handle this case - it should be split into two
+ // separate tokens.
+ return true;
+ }
+
+ // Parse (potentially inner) expression
+ MCExpr const *Expression;
+ if (getParser().parseExpression(Expression))
+ return true;
+
+ // The operand ends one character before the token that follows it.
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Operands.push_back(AVROperand::CreateImm(Expression, S, E));
+ return false;
+}
+
+/// Tries to parse a modifier expression of the form
+/// `[+|-] modifier ( expression )` and append it to \p Operands as an
+/// immediate operand.
+///
+/// \returns false if an operand was pushed; true if the input is not a
+/// modifier expression or is malformed (a diagnostic is emitted for an
+/// unknown modifier and for a missing closing parenthesis).
+bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
+  bool isNegated = false;
+  AVRMCExpr::VariantKind ModifierKind = AVRMCExpr::VK_AVR_None;
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  // Check for sign
+  AsmToken tokens[2];
+  size_t ReadCount = Parser.getLexer().peekTokens(tokens);
+
+  if (ReadCount == 2 && tokens[0].getKind() == AsmToken::Identifier &&
+      tokens[1].getKind() == AsmToken::LParen) {
+    // The current token is only consumed when it really is a sign. Any other
+    // token is left in place so that the modifier check below rejects the
+    // input, instead of asserting on (or silently eating) user input.
+    AsmToken::TokenKind CurTok = Parser.getLexer().getKind();
+    if (CurTok == AsmToken::Minus || CurTok == AsmToken::Plus) {
+      isNegated = (CurTok == AsmToken::Minus);
+
+      // Eat the sign
+      Parser.Lex();
+    }
+  }
+
+  // Check if we have a target specific modifier (lo8, hi8, &c)
+  if (Parser.getTok().getKind() != AsmToken::Identifier ||
+      Parser.getLexer().peekTok().getKind() != AsmToken::LParen) {
+    // Not a reloc expr
+    return true;
+  }
+  StringRef ModifierName = Parser.getTok().getString();
+  ModifierKind = AVRMCExpr::getKindByName(ModifierName.str().c_str());
+
+  if (ModifierKind != AVRMCExpr::VK_AVR_None) {
+    Parser.Lex();
+    Parser.Lex(); // Eat modifier name and parenthesis
+  } else {
+    return Error(Parser.getTok().getLoc(), "unknown modifier");
+  }
+
+  MCExpr const *InnerExpression;
+  if (getParser().parseExpression(InnerExpression))
+    return true;
+
+  // The modifier's argument list must be closed. This depends on the input
+  // text, so it is reported as a parse error rather than asserted.
+  if (Parser.getTok().getKind() != AsmToken::RParen)
+    return Error(Parser.getTok().getLoc(),
+                 "expected ')' after modifier expression");
+  Parser.Lex(); // Eat closing parenthesis
+
+  // If we have a modifier wrap the inner expression
+  MCExpr const *Expression = AVRMCExpr::create(ModifierKind, InnerExpression,
+                                               isNegated, getContext());
+
+  // The operand ends one character before the token that follows it.
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(AVROperand::CreateImm(Expression, S, E));
+
+  return false;
+}
+
+/// Parses a single operand and appends it to \p Operands, dispatching on the
+/// kind of the current token.
+///
+/// \returns false on success, true on failure.
+bool AVRAsmParser::parseOperand(OperandVector &Operands) {
+ DEBUG(dbgs() << "parseOperand\n");
+
+ switch (getLexer().getKind()) {
+ default:
+ return Error(Parser.getTok().getLoc(), "unexpected token in operand");
+
+ case AsmToken::Identifier:
+ // Try to parse a register; if that fails,
+ // intentionally fall through to the expression cases below.
+ if (!tryParseRegisterOperand(Operands)) {
+ return false;
+ }
+ case AsmToken::LParen:
+ case AsmToken::Integer:
+ case AsmToken::Dot:
+ return tryParseExpression(Operands);
+ case AsmToken::Plus:
+ case AsmToken::Minus: {
+ // If the sign precedes a number, parse the number,
+ // otherwise treat the sign as an independent token.
+ switch (getLexer().peekTok().getKind()) {
+ case AsmToken::Integer:
+ case AsmToken::BigNum:
+ case AsmToken::Identifier:
+ case AsmToken::Real:
+ if (!tryParseExpression(Operands))
+ return false;
+ // A failed expression parse intentionally falls through to 'default'.
+ default:
+ break;
+ }
+ // Treat the token as an independent token.
+ Operands.push_back(AVROperand::CreateToken(Parser.getTok().getString(),
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the token.
+ return false;
+ }
+ }
+
+ // Could not parse operand
+ // (every switch path above returns, so this is only a safety net).
+ return true;
+}
+
+/// Parses a register followed by an expression and appends the pair to
+/// \p Operands as a single "memri" operand.
+///
+/// \returns MatchOperand_Success on success, MatchOperand_ParseFail when
+/// either the register or the expression cannot be parsed.
+OperandMatchResultTy
+AVRAsmParser::parseMemriOperand(OperandVector &Operands) {
+  DEBUG(dbgs() << "parseMemriOperand()\n");
+
+  // The register comes first.
+  const int BaseReg = parseRegister();
+  if (BaseReg == AVR::NoRegister)
+    return MatchOperand_ParseFail;
+
+  const SMLoc StartLoc =
+      SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Parser.Lex(); // Eat register token.
+
+  // Then the immediate expression.
+  const MCExpr *Disp;
+  if (getParser().parseExpression(Disp))
+    return MatchOperand_ParseFail;
+
+  const SMLoc EndLoc =
+      SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  Operands.push_back(AVROperand::CreateMemri(BaseReg, Disp, StartLoc, EndLoc));
+  return MatchOperand_Success;
+}
+
+/// Parses a register and reports it through \p RegNo.
+///
+/// \p StartLoc and \p EndLoc receive the location of the current token
+/// before and after the call to parseRegister() respectively.
+///
+/// \returns true when no register was parsed, false otherwise.
+bool AVRAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                 SMLoc &EndLoc) {
+  StartLoc = Parser.getTok().getLoc();
+  RegNo = parseRegister();
+  EndLoc = Parser.getTok().getLoc();
+
+  const bool Failed = (RegNo == AVR::NoRegister);
+  return Failed;
+}
+
+/// Consumes the current token if it is a comma; does nothing otherwise.
+void AVRAsmParser::eatComma() {
+  // GCC allows commas to be omitted, so a missing comma is not an error.
+  if (!getLexer().is(AsmToken::Comma))
+    return;
+
+  Parser.Lex();
+}
+
+/// Parses the operands of one instruction statement.
+///
+/// The mnemonic is pushed as the first operand token. Each operand is then
+/// offered to the generated custom operand parsers first and, if none of
+/// them matched, to parseOperand().
+///
+/// \returns false on success, true after an error has been reported.
+bool AVRAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                    StringRef Mnemonic, SMLoc NameLoc,
+                                    OperandVector &Operands) {
+  Operands.push_back(AVROperand::CreateToken(Mnemonic, NameLoc));
+
+  bool NeedSeparator = false;
+  while (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Every operand but the first may be preceded by a comma.
+    if (NeedSeparator)
+      eatComma();
+    NeedSeparator = true;
+
+    const auto CustomResult = MatchOperandParserImpl(Operands, Mnemonic);
+    if (CustomResult == MatchOperand_Success)
+      continue;
+
+    // Either the custom parser failed outright, or it did not apply and the
+    // generic operand parser is tried. Both failures discard the rest of the
+    // statement and report at the position where parsing stopped.
+    const bool CustomFailed = (CustomResult == MatchOperand_ParseFail);
+    if (CustomFailed || parseOperand(Operands)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, CustomFailed
+                            ? "failed to parse register and immediate pair"
+                            : "unexpected token in argument list");
+    }
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+/// No directive is handled here; this always returns true.
+bool AVRAsmParser::ParseDirective(llvm::AsmToken DirectiveID) {
+  return true;
+}
+
+/// Registers AVRAsmParser as the assembly parser for the AVR target.
+extern "C" void LLVMInitializeAVRAsmParser() {
+  RegisterMCAsmParser<AVRAsmParser> Registration(getTheAVRTarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AVRGenAsmMatcher.inc"
+
+// Uses enums defined in AVRGenAsmMatcher.inc
+//
+// Applies two GCC compatibility quirks to an operand before giving up:
+//  1. a constant immediate N is retried as the register named "rN";
+//  2. a register is retried as the register returned by toDREG() when a
+//     register pair class is expected.
+// Note that the operand is modified in place while trying these.
+unsigned AVRAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                  unsigned ExpectedKind) {
+  AVROperand &Op = static_cast<AVROperand &>(AsmOp);
+  const MatchClassKind Expected = static_cast<MatchClassKind>(ExpectedKind);
+
+  // If need be, GCC converts bare numbers to register names.
+  // It's ugly, but GCC supports it.
+  if (Op.isImm()) {
+    if (const auto *Const = dyn_cast<MCConstantExpr>(Op.getImm())) {
+      std::ostringstream RegName;
+      RegName << "r" << Const->getValue();
+
+      const unsigned Reg = MatchRegisterName(RegName.str().c_str());
+      if (Reg != AVR::NoRegister) {
+        Op.makeReg(Reg);
+        if (validateOperandClass(Op, Expected) == Match_Success)
+          return Match_Success;
+      }
+      // Otherwise let the quirk below try its magic.
+    }
+  }
+
+  // The remaining quirk only applies to registers used where a register
+  // pair is expected.
+  if (!Op.isReg() || !isSubclass(Expected, MCK_DREGS))
+    return Match_InvalidOperand;
+
+  // Given a single, lower register, perform a "class cast" to the pair.
+  const unsigned PairReg = toDREG(Op.getReg());
+  if (PairReg == AVR::NoRegister)
+    return Match_InvalidOperand;
+
+  Op.makeReg(PairReg);
+  return validateOperandClass(Op, Expected);
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
new file mode 100644
index 000000000000..d2a21fb64635
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -0,0 +1,156 @@
+//===- AVRDisassembler.cpp - Disassembler for AVR ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the AVR Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRRegisterInfo.h"
+#include "AVRSubtarget.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "avr-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// MCDisassembler implementation for the AVR target.
+class AVRDisassembler : public MCDisassembler {
+public:
+  AVRDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+  virtual ~AVRDisassembler() = default;
+
+  /// Decodes one instruction from \p Bytes into \p Instr and reports the
+  /// number of bytes read through \p Size.
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+}
+
+/// Factory callback that allocates a new AVRDisassembler.
+/// The returned object is heap-allocated with new.
+static MCDisassembler *createAVRDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  MCDisassembler *Disassembler = new AVRDisassembler(STI, Ctx);
+  return Disassembler;
+}
+
+
+/// Registers createAVRDisassembler as the disassembler factory for the AVR
+/// target.
+extern "C" void LLVMInitializeAVRDisassembler() {
+  auto &AVRTarget = getTheAVRTarget();
+  TargetRegistry::RegisterMCDisassembler(AVRTarget, createAVRDisassembler);
+}
+
+/// Decoder callback for GPR8 register operands.
+/// NOTE(review): currently a stub - it ignores \p RegNo, adds nothing to
+/// \p Inst and always reports success.
+static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ return MCDisassembler::Success;
+}
+
+/// Decoder callback for LD8 register operands.
+/// NOTE(review): currently a stub - it ignores \p RegNo, adds nothing to
+/// \p Inst and always reports success.
+static DecodeStatus DecodeLD8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ return MCDisassembler::Success;
+}
+
+/// Decoder callback for PTRREGS register operands.
+/// NOTE(review): currently a stub - it ignores \p RegNo, adds nothing to
+/// \p Inst and always reports success.
+static DecodeStatus DecodePTRREGSRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ return MCDisassembler::Success;
+}
+
+#include "AVRGenDisassemblerTables.inc"
+
+/// Reads a 16-bit little-endian instruction word from the start of \p Bytes.
+///
+/// \p Size is set to 2 on success and to 0 when fewer than two bytes are
+/// available, in which case \p Insn is left untouched.
+static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn) {
+  const bool HasEnoughBytes = Bytes.size() >= 2;
+  Size = HasEnoughBytes ? 2 : 0;
+
+  if (!HasEnoughBytes)
+    return MCDisassembler::Fail;
+
+  // Byte 0 is the least significant byte.
+  Insn = (Bytes[1] << 8) | (Bytes[0] << 0);
+  return MCDisassembler::Success;
+}
+
+/// Reads a 32-bit little-endian instruction word from the start of \p Bytes.
+///
+/// \p Size is set to 4 on success and to 0 when fewer than four bytes are
+/// available, in which case \p Insn is left untouched.
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn) {
+
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  Size = 4;
+
+  // Widen each byte to uint32_t before shifting. A plain uint8_t is promoted
+  // to (signed) int, and shifting a value >= 0x80 left by 24 bits would
+  // overflow it.
+  Insn = (static_cast<uint32_t>(Bytes[0]) << 0) |
+         (static_cast<uint32_t>(Bytes[1]) << 8) |
+         (static_cast<uint32_t>(Bytes[2]) << 16) |
+         (static_cast<uint32_t>(Bytes[3]) << 24);
+
+  return MCDisassembler::Success;
+}
+
+/// Selects the generated decoder table for an instruction of \p Size bytes.
+/// Only sizes 2 and 4 are valid.
+static const uint8_t *getDecoderTable(uint64_t Size) {
+  if (Size == 2)
+    return DecoderTable16;
+
+  if (Size == 4)
+    return DecoderTable32;
+
+  llvm_unreachable("instructions must be 16 or 32-bits");
+}
+
+/// Decodes one instruction from \p Bytes into \p Instr.
+///
+/// A 16-bit decode is attempted first; if the decoder tables reject it, a
+/// 32-bit decode is attempted. \p Size holds the size set by the last read
+/// helper that was called.
+DecodeStatus AVRDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address,
+                                             raw_ostream &VStream,
+                                             raw_ostream &CStream) const {
+  uint32_t Insn;
+
+  // First attempt: a 16-bit instruction.
+  if (readInstruction16(Bytes, Address, Size, Insn) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  const DecodeStatus Narrow = decodeInstruction(getDecoderTable(Size), Instr,
+                                                Insn, Address, this, STI);
+  if (Narrow != MCDisassembler::Fail)
+    return Narrow;
+
+  // Second attempt: a 32-bit instruction.
+  if (readInstruction32(Bytes, Address, Size, Insn) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Whatever the 32-bit decode yields (including Fail) is the final answer.
+  return decodeInstruction(getDecoderTable(Size), Instr, Insn, Address, this,
+                           STI);
+}
+
+typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder);
+
diff --git a/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
new file mode 100644
index 000000000000..316b7836df0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
@@ -0,0 +1,171 @@
+//===-- AVRInstPrinter.cpp - Convert AVR MCInst to assembly syntax --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AVR MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRInstPrinter.h"
+
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include <cstring>
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace llvm {
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "AVRGenAsmWriter.inc"
+
+/// Prints \p MI to \p O.
+///
+/// Loads and stores through a pointer register (plain, post-increment and
+/// pre-decrement forms) are printed by hand; every other opcode goes through
+/// the generated alias/instruction printers.
+///
+/// NOTE(review): \p Annot is only printed on the default path; the
+/// hand-written ld/st cases never call printAnnotation() - confirm this is
+/// intended.
+void AVRInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ unsigned Opcode = MI->getOpcode();
+
+ // First handle load and store instructions with postinc or predec
+ // of the form "ld reg, X+".
+ // TODO: We should be able to rewrite this using TableGen data.
+ switch (Opcode) {
+ case AVR::LDRdPtr:
+ case AVR::LDRdPtrPi:
+ case AVR::LDRdPtrPd:
+ O << "\tld\t";
+ printOperand(MI, 0, O);
+ O << ", ";
+
+ // Pre-decrement prints the '-' before the pointer register.
+ if (Opcode == AVR::LDRdPtrPd)
+ O << '-';
+
+ printOperand(MI, 1, O);
+
+ // Post-increment prints the '+' after the pointer register.
+ if (Opcode == AVR::LDRdPtrPi)
+ O << '+';
+ break;
+ case AVR::STPtrRr:
+ O << "\tst\t";
+ printOperand(MI, 0, O);
+ O << ", ";
+ printOperand(MI, 1, O);
+ break;
+ case AVR::STPtrPiRr:
+ case AVR::STPtrPdRr:
+ O << "\tst\t";
+
+ if (Opcode == AVR::STPtrPdRr)
+ O << '-';
+
+ // Unlike STPtrRr, these forms print operands 1 and 2.
+ printOperand(MI, 1, O);
+
+ if (Opcode == AVR::STPtrPiRr)
+ O << '+';
+
+ O << ", ";
+ printOperand(MI, 2, O);
+ break;
+ default:
+ if (!printAliasInstr(MI, O))
+ printInstruction(MI, O);
+
+ printAnnotation(O, Annot);
+ break;
+ }
+}
+
+/// Returns the name used when printing register \p RegNum.
+///
+/// GCC prints register pairs by just printing the lower register, so when
+/// \p RegNum has a sub_lo subregister, that subregister's name is returned
+/// instead.
+const char *AVRInstPrinter::getPrettyRegisterName(unsigned RegNum,
+                                                  MCRegisterInfo const &MRI) {
+  if (MRI.getNumSubRegIndices() > 0) {
+    const unsigned LowReg = MRI.getSubReg(RegNum, AVR::sub_lo);
+    if (LowReg != AVR::NoRegister)
+      RegNum = LowReg;
+  }
+
+  return getRegisterName(RegNum);
+}
+
+/// Prints operand \p OpNo of \p MI to \p O.
+///
+/// Immediates are printed as numbers and expressions via their own printer.
+/// Registers whose operand class is one of the pointer register classes use
+/// the AVR::ptr alternate name; all other registers use
+/// getPrettyRegisterName().
+void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const MCOperandInfo &MOI = this->MII.get(MI->getOpcode()).OpInfo[OpNo];
+
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+
+  if (!Op.isReg()) {
+    assert(Op.isExpr() && "Unknown operand kind in printOperand");
+    O << *Op.getExpr();
+    return;
+  }
+
+  const bool UsesPtrName = MOI.RegClass == AVR::PTRREGSRegClassID ||
+                           MOI.RegClass == AVR::PTRDISPREGSRegClassID ||
+                           MOI.RegClass == AVR::ZREGSRegClassID;
+
+  O << (UsesPtrName ? getRegisterName(Op.getReg(), AVR::ptr)
+                    : getPrettyRegisterName(Op.getReg(), MRI));
+}
+
+/// Prints an operand that ends up being encoded as a pc-relative value.
+///
+/// An immediate is printed as '.' followed by the explicitly signed value
+/// (e.g. ".+4" or ".-2"); an expression is printed as-is.
+void AVRInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+
+  if (!Op.isImm()) {
+    assert(Op.isExpr() && "Unknown pcrel immediate operand");
+    O << *Op.getExpr();
+    return;
+  }
+
+  const int64_t Imm = Op.getImm();
+  O << '.';
+
+  // Negative values carry their own sign; non-negative ones need an
+  // explicit '+'.
+  if (Imm >= 0)
+    O << '+';
+
+  O << Imm;
+}
+
+/// Prints a register-plus-offset operand pair: the register at \p OpNo
+/// immediately followed by the offset at \p OpNo + 1.
+///
+/// An immediate offset is printed with an explicit sign; an expression
+/// offset is printed as-is.
+void AVRInstPrinter::printMemri(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O) {
+  assert(MI->getOperand(OpNo).isReg() && "Expected a register for the first operand");
+
+  const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+
+  // The register part.
+  printOperand(MI, OpNo, O);
+
+  // The {+,-}offset part.
+  if (OffsetOp.isImm()) {
+    const int64_t Offset = OffsetOp.getImm();
+    if (Offset >= 0)
+      O << '+';
+    O << Offset;
+    return;
+  }
+
+  if (OffsetOp.isExpr()) {
+    O << *OffsetOp.getExpr();
+    return;
+  }
+
+  llvm_unreachable("unknown type for offset");
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.h b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.h
new file mode 100644
index 000000000000..c9f65b922745
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.h
@@ -0,0 +1,54 @@
+//===- AVRInstPrinter.h - Convert AVR MCInst to assembly syntax -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AVR MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_INST_PRINTER_H
+#define LLVM_AVR_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+namespace llvm {
+
+/// Prints AVR instructions to a textual stream.
+class AVRInstPrinter : public MCInstPrinter {
+public:
+ AVRInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ /// Returns the printable name of \p RegNo; a register that has a low
+ /// subregister is printed using that subregister's name.
+ static const char *getPrettyRegisterName(unsigned RegNo,
+ MCRegisterInfo const &MRI);
+
+ /// Prints \p MI to \p O.
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+
+private:
+ /// Returns the name of \p RegNo, optionally using alternate name \p AltIdx.
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = AVR::NoRegAltName);
+
+ // Operand printers.
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemri(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+ // Autogenerated by TableGen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_INST_PRINTER_H
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
new file mode 100644
index 000000000000..081d8b5740ef
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -0,0 +1,473 @@
+//===-- AVRAsmBackend.cpp - AVR Asm Backend ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AVRAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AVRAsmBackend.h"
+#include "MCTargetDesc/AVRFixupKinds.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+// FIXME: we should be doing checks to make sure asm operands
+// are not out of bounds.
+
+namespace adjust {
+
+using namespace llvm;
+
+/// Checks that \p Value fits into a signed integer of \p Width bits.
+///
+/// When it does not, an "out of range" diagnostic naming \p Description and
+/// the permitted range is raised: as a fatal error through \p Ctx if one was
+/// given, otherwise through llvm_unreachable.
+void signed_width(unsigned Width, uint64_t Value, std::string Description,
+                  const MCFixup &Fixup, MCContext *Ctx = nullptr) {
+  if (isIntN(Width, Value))
+    return;
+
+  const int64_t Lower = minIntN(Width);
+  const int64_t Upper = maxIntN(Width);
+
+  std::string Diagnostic = "out of range " + Description;
+  Diagnostic += " (expected an integer in the range " + std::to_string(Lower) +
+                " to " + std::to_string(Upper) + ")";
+
+  if (!Ctx) {
+    llvm_unreachable(Diagnostic.c_str());
+  } else {
+    Ctx->reportFatalError(Fixup.getLoc(), Diagnostic);
+  }
+}
+
+/// Checks that \p Value fits into an unsigned integer of \p Width bits.
+///
+/// When it does not, an "out of range" diagnostic naming \p Description and
+/// the permitted range is raised: as a fatal error through \p Ctx if one was
+/// given, otherwise through llvm_unreachable.
+void unsigned_width(unsigned Width, uint64_t Value, std::string Description,
+                    const MCFixup &Fixup, MCContext *Ctx = nullptr) {
+  if (isUIntN(Width, Value))
+    return;
+
+  const int64_t Upper = maxUIntN(Width);
+
+  std::string Diagnostic = "out of range " + Description;
+  Diagnostic += " (expected an integer in the range 0 to " +
+                std::to_string(Upper) + ")";
+
+  if (!Ctx) {
+    llvm_unreachable(Diagnostic.c_str());
+  } else {
+    Ctx->reportFatalError(Fixup.getLoc(), Diagnostic);
+  }
+}
+
+/// Range-checks and adjusts an absolute branch target before the fixup is
+/// applied.
+void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+                  MCContext *Ctx = nullptr) {
+  // The value is right-shifted by one below, which buys one extra bit of
+  // range on top of the fixup's own size.
+  const unsigned EffectiveBits = Size + 1;
+  unsigned_width(EffectiveBits, Value, std::string("branch target"), Fixup,
+                 Ctx);
+
+  // Rightshifts the value by one.
+  AVR::fixups::adjustBranchTarget(Value);
+}
+
+/// Range-checks and adjusts a relative branch target before the fixup is
+/// applied.
+void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+                          MCContext *Ctx = nullptr) {
+  // The value is right-shifted by one below, which buys one extra bit of
+  // range on top of the fixup's own size.
+  const unsigned EffectiveBits = Size + 1;
+  signed_width(EffectiveBits, Value, std::string("branch target"), Fixup, Ctx);
+
+  // Note: the range check above runs before this subtraction.
+  Value = Value - 2;
+
+  // Rightshifts the value by one.
+  AVR::fixups::adjustBranchTarget(Value);
+}
+
+/// 22-bit absolute fixup.
+///
+/// Resolves to:
+/// 1001 kkkk 010k kkkk kkkk kkkk 111k kkkk
+///
+/// Offset of 0 (so the result is left shifted by 3 bits before application).
+///
+/// The value is first range-checked and right-shifted by adjustBranch(), then
+/// split into three bit groups that are shifted into place.
+///
+/// NOTE(review): the masks below do not match their inline comments:
+/// (0xf00000 << 6) selects bits 26-29 and (0x1ffff << 5) selects 17 bits
+/// (bits 5-21), not 13. Verify against the instruction encoding.
+void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
+ adjustBranch(Size, Fixup, Value, Ctx);
+
+ auto top = Value & (0xf00000 << 6); // the top four bits
+ auto middle = Value & (0x1ffff << 5); // the middle 13 bits
+ auto bottom = Value & 0x1f; // end bottom 5 bits
+
+ Value = (top << 6) | (middle << 3) | (bottom << 0);
+}
+
+/// 7-bit PC-relative fixup.
+///
+/// Resolves to:
+/// 0000 00kk kkkk k000
+///
+/// The value is adjusted as a relative branch target and then truncated to
+/// its low 7 bits.
+void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+                   MCContext *Ctx = nullptr) {
+  adjustRelativeBranch(Size, Fixup, Value, Ctx);
+
+  // The value may be negative, so the sign-extension bits above the field
+  // have to be masked out.
+  Value = Value & 0x7f;
+}
+
+/// 12-bit PC-relative fixup.
+/// Yes, the fixup is 12 bits even though the name says otherwise.
+///
+/// Resolves to:
+/// 0000 kkkk kkkk kkkk
+///
+/// The value is adjusted as a relative branch target and then truncated to
+/// its low 12 bits.
+void fixup_13_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+                    MCContext *Ctx = nullptr) {
+  adjustRelativeBranch(Size, Fixup, Value, Ctx);
+
+  // The value may be negative, so the sign-extension bits above the field
+  // have to be masked out.
+  Value = Value & 0xfff;
+}
+
+/// 6-bit fixup for the immediate operand of the ADIW family of
+/// instructions.
+///
+/// Resolves to:
+/// 0000 0000 kk00 kkkk
+void fixup_6_adiw(const MCFixup &Fixup, uint64_t &Value,
+                  MCContext *Ctx = nullptr) {
+  unsigned_width(6, Value, std::string("immediate"), Fixup, Ctx);
+
+  // The two upper bits of the immediate move up to bits 7-6; the four lower
+  // bits stay in place.
+  const uint64_t HighBits = (Value & 0x30) << 2;
+  const uint64_t LowBits = Value & 0x0f;
+  Value = HighBits | LowBits;
+}
+
+/// 5-bit port number fixup on the SBIC family of instructions.
+///
+/// Resolves to:
+/// 0000 0000 AAAA A000
+void fixup_port5(const MCFixup &Fixup, uint64_t &Value,
+                 MCContext *Ctx = nullptr) {
+  unsigned_width(5, Value, std::string("port number"), Fixup, Ctx);
+
+  // Keep the low five bits and move them up to bits 7-3.
+  Value = (Value & 0x1f) << 3;
+}
+
+/// 6-bit port number fixup on the `IN` family of instructions.
+///
+/// Resolves to:
+/// 1011 0AAd dddd AAAA
+void fixup_port6(const MCFixup &Fixup, uint64_t &Value,
+                 MCContext *Ctx = nullptr) {
+  unsigned_width(6, Value, std::string("port number"), Fixup, Ctx);
+
+  // The two upper bits of the port number move up to bits 10-9; the four
+  // lower bits stay in place.
+  const uint64_t HighBits = (Value & 0x30) << 5;
+  const uint64_t LowBits = Value & 0x0f;
+  Value = HighBits | LowBits;
+}
+
+/// Adjusts a program memory address by halving it (a right shift by one).
+void pm(uint64_t &Value) {
+  Value = Value >> 1;
+}
+
+/// Fixups relating to the LDI instruction.
+namespace ldi {
+
+/// Spreads the low byte of \p Value over the immediate field of an
+/// `LDI Rd, K` instruction.
+///
+/// Resolves to:
+/// 0000 KKKK 0000 KKKK
+/// Offset of 0 (so the result isn't left-shifted before application).
+void fixup(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+           MCContext *Ctx = nullptr) {
+  // Bits 7-4 move up to bits 11-8; bits 3-0 stay in place.
+  Value = ((Value & 0xf0) << 4) | (Value & 0x0f);
+}
+
+/// Negates \p Value (two's complement, modulo 2^64).
+void neg(uint64_t &Value) { Value = 0 - Value; }
+
+/// LDI fixup using bits 7-0 of \p Value.
+void lo8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+         MCContext *Ctx = nullptr) {
+  Value = Value & 0xff;
+  ldi::fixup(Size, Fixup, Value, Ctx);
+}
+
+/// LDI fixup using bits 15-8 of \p Value.
+void hi8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+         MCContext *Ctx = nullptr) {
+  Value = (Value >> 8) & 0xff;
+  ldi::fixup(Size, Fixup, Value, Ctx);
+}
+
+/// LDI fixup using bits 23-16 of \p Value.
+void hh8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+         MCContext *Ctx = nullptr) {
+  Value = (Value >> 16) & 0xff;
+  ldi::fixup(Size, Fixup, Value, Ctx);
+}
+
+/// LDI fixup using bits 31-24 of \p Value.
+void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+         MCContext *Ctx = nullptr) {
+  Value = (Value >> 24) & 0xff;
+  ldi::fixup(Size, Fixup, Value, Ctx);
+}
+
+} // end of ldi namespace
+} // end of adjust namespace
+
+namespace llvm {
+
+// Prepare value for the target space for it
+//
+// Dispatches on the fixup kind to the matching helper in the 'adjust'
+// namespace, which rewrites Value in place. Diagnostics raised by the
+// helpers go through Ctx when it is non-null. Kinds without a case below end
+// up in llvm_unreachable.
+void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx) const {
+ // The size of the fixup in bits.
+ uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize;
+
+ unsigned Kind = Fixup.getKind();
+
+ switch (Kind) {
+ default:
+ llvm_unreachable("unhandled fixup");
+ case AVR::fixup_7_pcrel:
+ adjust::fixup_7_pcrel(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_13_pcrel:
+ adjust::fixup_13_pcrel(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_call:
+ adjust::fixup_call(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_ldi:
+ adjust::ldi::fixup(Size, Fixup, Value, Ctx);
+ break;
+ // The *_pm variants share a case with the plain variant and only add the
+ // program memory adjustment (adjust::pm) up front.
+ case AVR::fixup_lo8_ldi:
+ case AVR::fixup_lo8_ldi_pm:
+ if (Kind == AVR::fixup_lo8_ldi_pm) adjust::pm(Value);
+
+ adjust::ldi::lo8(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_hi8_ldi:
+ case AVR::fixup_hi8_ldi_pm:
+ if (Kind == AVR::fixup_hi8_ldi_pm) adjust::pm(Value);
+
+ adjust::ldi::hi8(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_hh8_ldi:
+ case AVR::fixup_hh8_ldi_pm:
+ if (Kind == AVR::fixup_hh8_ldi_pm) adjust::pm(Value);
+
+ adjust::ldi::hh8(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_ms8_ldi:
+ adjust::ldi::ms8(Size, Fixup, Value, Ctx);
+ break;
+
+ // The *_neg variants negate the value before extracting the byte.
+ case AVR::fixup_lo8_ldi_neg:
+ case AVR::fixup_lo8_ldi_pm_neg:
+ if (Kind == AVR::fixup_lo8_ldi_pm_neg) adjust::pm(Value);
+
+ adjust::ldi::neg(Value);
+ adjust::ldi::lo8(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_hi8_ldi_neg:
+ case AVR::fixup_hi8_ldi_pm_neg:
+ if (Kind == AVR::fixup_hi8_ldi_pm_neg) adjust::pm(Value);
+
+ adjust::ldi::neg(Value);
+ adjust::ldi::hi8(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_hh8_ldi_neg:
+ case AVR::fixup_hh8_ldi_pm_neg:
+ if (Kind == AVR::fixup_hh8_ldi_pm_neg) adjust::pm(Value);
+
+ adjust::ldi::neg(Value);
+ adjust::ldi::hh8(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_ms8_ldi_neg:
+ adjust::ldi::neg(Value);
+ adjust::ldi::ms8(Size, Fixup, Value, Ctx);
+ break;
+ case AVR::fixup_16:
+ // NOTE(review): the diagnostic text says "port number" for a generic
+ // 16-bit fixup - possibly a copy/paste leftover; confirm before changing.
+ adjust::unsigned_width(16, Value, std::string("port number"), Fixup, Ctx);
+
+ Value &= 0xffff;
+ break;
+ case AVR::fixup_6_adiw:
+ adjust::fixup_6_adiw(Fixup, Value, Ctx);
+ break;
+
+ case AVR::fixup_port5:
+ adjust::fixup_port5(Fixup, Value, Ctx);
+ break;
+
+ case AVR::fixup_port6:
+ adjust::fixup_port6(Fixup, Value, Ctx);
+ break;
+
+ // Fixups which do not require adjustments.
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ break;
+
+ case FK_GPRel_4:
+ llvm_unreachable("don't know how to adjust this fixup");
+ break;
+ }
+}
+
+/// Creates the ELF object writer for AVR, using the OS ABI that corresponds
+/// to this backend's OS type.
+MCObjectWriter *AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+  const auto OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
+  return createAVRELFObjectWriter(OS, OSABI);
+}
+
+/// ORs the (already adjusted) fixup \p Value into the bytes of \p Data that
+/// the fixup covers, starting at the fixup's offset.
+///
+/// The number of bytes touched is derived from the fixup kind's target size
+/// and target offset. A value of zero is a no-op.
+void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                               unsigned DataSize, uint64_t Value,
+                               bool IsPCRel) const {
+  if (Value == 0)
+    return; // Doesn't change encoding.
+
+  MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+
+  // The number of bits in the fixup mask, rounded up to whole bytes.
+  auto NumBits = Info.TargetSize + Info.TargetOffset;
+  auto NumBytes = (NumBits + 7) / 8;
+
+  // Shift the value into position.
+  Value <<= Info.TargetOffset;
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // Mask in the value one byte at a time, least significant byte first.
+  for (unsigned Idx = 0; Idx != NumBytes; ++Idx, Value >>= 8)
+    Data[Offset + Idx] |= static_cast<uint8_t>(Value & 0xff);
+}
+
+/// Returns the name, bit offset, bit size and flags of fixup kind \p Kind.
+///
+/// Generic kinds (below FirstTargetFixupKind) are delegated to MCAsmBackend;
+/// target kinds are looked up in the static table below.
+MCFixupKindInfo const &AVRAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ // NOTE: Many AVR fixups work on sets of non-contiguous bits. We work around
+ // this by saying that the fixup is the size of the entire instruction.
+ const static MCFixupKindInfo Infos[AVR::NumTargetFixupKinds] = {
+ // This table *must* be in the same order as the fixup_* kinds in
+ // AVRFixupKinds.h.
+ //
+ // name offset bits flags
+ {"fixup_32", 0, 32, 0},
+
+ {"fixup_7_pcrel", 3, 7, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_13_pcrel", 0, 12, MCFixupKindInfo::FKF_IsPCRel},
+
+ {"fixup_16", 0, 16, 0},
+ {"fixup_16_pm", 0, 16, 0},
+
+ {"fixup_ldi", 0, 8, 0},
+
+ {"fixup_lo8_ldi", 0, 8, 0},
+ {"fixup_hi8_ldi", 0, 8, 0},
+ {"fixup_hh8_ldi", 0, 8, 0},
+ {"fixup_ms8_ldi", 0, 8, 0},
+
+ {"fixup_lo8_ldi_neg", 0, 8, 0},
+ {"fixup_hi8_ldi_neg", 0, 8, 0},
+ {"fixup_hh8_ldi_neg", 0, 8, 0},
+ {"fixup_ms8_ldi_neg", 0, 8, 0},
+
+ {"fixup_lo8_ldi_pm", 0, 8, 0},
+ {"fixup_hi8_ldi_pm", 0, 8, 0},
+ {"fixup_hh8_ldi_pm", 0, 8, 0},
+
+ {"fixup_lo8_ldi_pm_neg", 0, 8, 0},
+ {"fixup_hi8_ldi_pm_neg", 0, 8, 0},
+ {"fixup_hh8_ldi_pm_neg", 0, 8, 0},
+
+ {"fixup_call", 0, 22, 0},
+
+ {"fixup_6", 0, 16, 0}, // non-contiguous
+ {"fixup_6_adiw", 0, 6, 0},
+
+ {"fixup_lo8_ldi_gs", 0, 8, 0},
+ {"fixup_hi8_ldi_gs", 0, 8, 0},
+
+ {"fixup_8", 0, 8, 0},
+ {"fixup_8_lo8", 0, 8, 0},
+ {"fixup_8_hi8", 0, 8, 0},
+ {"fixup_8_hlo8", 0, 8, 0},
+
+ {"fixup_sym_diff", 0, 32, 0},
+ {"fixup_16_ldst", 0, 16, 0},
+
+ {"fixup_lds_sts_16", 0, 16, 0},
+
+ {"fixup_port6", 0, 16, 0}, // non-contiguous
+ {"fixup_port5", 3, 5, 0},
+ };
+
+ // Generic (target independent) kinds are described by the base class.
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+
+ // Target kinds are indexed relative to the first target kind.
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
+/// Writes \p Count bytes of padding to \p OW as zero bytes and returns true.
+bool AVRAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ // The padding is expected to be a whole number of 2-byte instructions; an
+ // odd count trips the assertion below in builds with assertions enabled.
+ // NOTE(review): the previous comment said that an unaligned count means
+ // data is being written into the text section and zeros are written
+ // "instead", which contradicts the assert - confirm which is intended.
+ assert((Count % 2) == 0 && "NOP instructions must be 2 bytes");
+
+ OW->WriteZeros(Count);
+ return true;
+}
+
+/// Decides per fixup kind whether a fixup is left to the linker or adjusted
+/// here.
+///
+/// fixup_7_pcrel, fixup_13_pcrel and fixup_call are always marked as
+/// unresolved so that they are recorded as relocations. For every other kind
+/// \p Value is adjusted in place through adjustFixupValue() and
+/// \p IsResolved is left untouched.
+void AVRAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                      const MCAsmLayout &Layout,
+                                      const MCFixup &Fixup,
+                                      const MCFragment *DF,
+                                      const MCValue &Target, uint64_t &Value,
+                                      bool &IsResolved) {
+  switch ((unsigned) Fixup.getKind()) {
+  // Fixups which should always be recorded as relocations.
+  case AVR::fixup_7_pcrel:
+  case AVR::fixup_13_pcrel:
+  case AVR::fixup_call:
+    IsResolved = false;
+    break;
+  default:
+    // Parsed LLVM-generated temporary labels are already
+    // adjusted for instruction size, but normal labels aren't.
+    //
+    // To handle both cases, we simply un-adjust the temporary label
+    // case so it acts like all other labels.
+    //
+    // The target is not guaranteed to have a symbol component, so the
+    // pointer must be checked before it is dereferenced.
+    if (const auto *SymA = Target.getSymA()) {
+      if (SymA->getSymbol().isTemporary())
+        Value += 2;
+    }
+
+    adjustFixupValue(Fixup, Value, &Asm.getContext());
+    break;
+  }
+}
+
+/// Allocates a new AVRAsmBackend for the OS named by triple \p TT.
+/// The remaining parameters are unused.
+MCAsmBackend *createAVRAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  const Triple &TT, StringRef CPU,
+                                  const llvm::MCTargetOptions &TO) {
+  const Triple::OSType OS = TT.getOS();
+  return new AVRAsmBackend(OS);
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
new file mode 100644
index 000000000000..7ff4b8f350f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -0,0 +1,78 @@
+//===-- AVRAsmBackend.h - AVR Asm Backend --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file The AVR assembly backend implementation.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_AVR_ASM_BACKEND_H
+#define LLVM_AVR_ASM_BACKEND_H
+
+#include "MCTargetDesc/AVRFixupKinds.h"
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmBackend.h"
+
+namespace llvm {
+
+class MCAssembler;
+class MCObjectWriter;
+class Target;
+
+struct MCFixupKindInfo;
+
+/// Utilities for manipulating generated AVR machine code.
+class AVRAsmBackend : public MCAsmBackend {
+public:
+ /// Stores \p OSType; nothing else is initialised here.
+ AVRAsmBackend(Triple::OSType OSType)
+ : MCAsmBackend(), OSType(OSType) {}
+ /// Presumably rewrites \p Value in place for \p Fixup; defined out of line.
+ void adjustFixupValue(const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) const;
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+ /// Number of AVR-specific fixup kinds (AVR::NumTargetFixupKinds).
+ unsigned getNumFixupKinds() const override {
+ return AVR::NumTargetFixupKinds;
+ }
+ /// Always false: no instruction is reported as needing relaxation.
+ bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ /// Must never be called; aborts via llvm_unreachable.
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ llvm_unreachable("RelaxInstruction() unimplemented");
+ return false; // not reached; follows llvm_unreachable
+ }
+ /// No-op: \p Res is left untouched.
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override;
+
+private:
+ Triple::OSType OSType; // OS type supplied at construction
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_ASM_BACKEND_H
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
new file mode 100644
index 000000000000..161f305fd014
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -0,0 +1,127 @@
+//===-- AVRELFObjectWriter.cpp - AVR ELF Writer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AVRFixupKinds.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// Writes AVR machine code into an ELF32 object file.
+class AVRELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ AVRELFObjectWriter(uint8_t OSABI); // forwards OSABI to the base class
+
+ virtual ~AVRELFObjectWriter() {}
+ /// Maps the kind of \p Fixup to the matching ELF::R_AVR_* relocation type.
+ unsigned getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const override;
+};
+
+AVRELFObjectWriter::AVRELFObjectWriter(uint8_t OSABI) // machine type is ELF::EM_AVR
+ : MCELFObjectTargetWriter(false, OSABI, ELF::EM_AVR, true, false) {} // NOTE(review): bools presumably Is64Bit, HasRelocationAddend, IsN64 - confirm
+
+unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx, // Ctx is not used
+ const MCValue &Target, // not used
+ const MCFixup &Fixup, // only its kind is inspected
+ bool IsPCRel) const { // not used
+ switch ((unsigned) Fixup.getKind()) {
+ case FK_Data_1: // 1-byte and 4-byte generic data fixups are rejected
+ case FK_Data_4:
+ llvm_unreachable("unsupported relocation type");
+ case FK_Data_2:
+ return ELF::R_AVR_16_PM; // NOTE(review): same relocation as fixup_16_pm, not R_AVR_16 - confirm intended
+ case AVR::fixup_32:
+ return ELF::R_AVR_32;
+ case AVR::fixup_7_pcrel:
+ return ELF::R_AVR_7_PCREL;
+ case AVR::fixup_13_pcrel:
+ return ELF::R_AVR_13_PCREL;
+ case AVR::fixup_16:
+ return ELF::R_AVR_16;
+ case AVR::fixup_16_pm:
+ return ELF::R_AVR_16_PM;
+ case AVR::fixup_lo8_ldi:
+ return ELF::R_AVR_LO8_LDI;
+ case AVR::fixup_hi8_ldi:
+ return ELF::R_AVR_HI8_LDI;
+ case AVR::fixup_hh8_ldi:
+ return ELF::R_AVR_HH8_LDI;
+ case AVR::fixup_lo8_ldi_neg:
+ return ELF::R_AVR_LO8_LDI_NEG;
+ case AVR::fixup_hi8_ldi_neg:
+ return ELF::R_AVR_HI8_LDI_NEG;
+ case AVR::fixup_hh8_ldi_neg:
+ return ELF::R_AVR_HH8_LDI_NEG;
+ case AVR::fixup_lo8_ldi_pm:
+ return ELF::R_AVR_LO8_LDI_PM;
+ case AVR::fixup_hi8_ldi_pm:
+ return ELF::R_AVR_HI8_LDI_PM;
+ case AVR::fixup_hh8_ldi_pm:
+ return ELF::R_AVR_HH8_LDI_PM;
+ case AVR::fixup_lo8_ldi_pm_neg:
+ return ELF::R_AVR_LO8_LDI_PM_NEG;
+ case AVR::fixup_hi8_ldi_pm_neg:
+ return ELF::R_AVR_HI8_LDI_PM_NEG;
+ case AVR::fixup_hh8_ldi_pm_neg:
+ return ELF::R_AVR_HH8_LDI_PM_NEG;
+ case AVR::fixup_call:
+ return ELF::R_AVR_CALL;
+ case AVR::fixup_ldi:
+ return ELF::R_AVR_LDI;
+ case AVR::fixup_6:
+ return ELF::R_AVR_6;
+ case AVR::fixup_6_adiw:
+ return ELF::R_AVR_6_ADIW;
+ case AVR::fixup_ms8_ldi:
+ return ELF::R_AVR_MS8_LDI;
+ case AVR::fixup_ms8_ldi_neg:
+ return ELF::R_AVR_MS8_LDI_NEG;
+ case AVR::fixup_lo8_ldi_gs:
+ return ELF::R_AVR_LO8_LDI_GS;
+ case AVR::fixup_hi8_ldi_gs:
+ return ELF::R_AVR_HI8_LDI_GS;
+ case AVR::fixup_8:
+ return ELF::R_AVR_8;
+ case AVR::fixup_8_lo8:
+ return ELF::R_AVR_8_LO8;
+ case AVR::fixup_8_hi8:
+ return ELF::R_AVR_8_HI8;
+ case AVR::fixup_8_hlo8:
+ return ELF::R_AVR_8_HLO8;
+ case AVR::fixup_sym_diff:
+ return ELF::R_AVR_SYM_DIFF;
+ case AVR::fixup_16_ldst:
+ return ELF::R_AVR_16_LDST;
+ case AVR::fixup_lds_sts_16:
+ return ELF::R_AVR_LDS_STS_16;
+ case AVR::fixup_port6:
+ return ELF::R_AVR_PORT6;
+ case AVR::fixup_port5:
+ return ELF::R_AVR_PORT5;
+ default: // any fixup kind not listed above is a programming error
+ llvm_unreachable("invalid fixup kind!");
+ }
+}
+
+/// Builds an ELF object writer for AVR that emits to \p OS.
+///
+/// A fresh AVRELFObjectWriter for \p OSABI is handed to
+/// createELFObjectWriter() together with the stream and a `true` flag.
+MCObjectWriter *createAVRELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
+  return createELFObjectWriter(new AVRELFObjectWriter(OSABI), OS, true);
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
new file mode 100644
index 000000000000..481de320b22f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -0,0 +1,66 @@
+#include "AVRELFStreamer.h"
+
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include "AVRMCTargetDesc.h"
+
+namespace llvm {
+
+/// Derives the ELF header architecture flag from a subtarget feature set.
+///
+/// The ELFArch* features are probed in a fixed order and the flag belonging
+/// to the first one present is returned. Zero is returned when none is set.
+static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) {
+  // Probe order is significant: only the first matching feature contributes.
+  static const struct {
+    unsigned Feature;
+    unsigned Flag;
+  } ArchFlags[] = {
+      {AVR::ELFArchAVR1, ELF::EF_AVR_ARCH_AVR1},
+      {AVR::ELFArchAVR2, ELF::EF_AVR_ARCH_AVR2},
+      {AVR::ELFArchAVR25, ELF::EF_AVR_ARCH_AVR25},
+      {AVR::ELFArchAVR3, ELF::EF_AVR_ARCH_AVR3},
+      {AVR::ELFArchAVR31, ELF::EF_AVR_ARCH_AVR31},
+      {AVR::ELFArchAVR35, ELF::EF_AVR_ARCH_AVR35},
+      {AVR::ELFArchAVR4, ELF::EF_AVR_ARCH_AVR4},
+      {AVR::ELFArchAVR5, ELF::EF_AVR_ARCH_AVR5},
+      {AVR::ELFArchAVR51, ELF::EF_AVR_ARCH_AVR51},
+      {AVR::ELFArchAVR6, ELF::EF_AVR_ARCH_AVR6},
+      {AVR::ELFArchAVRTiny, ELF::EF_AVR_ARCH_AVRTINY},
+      {AVR::ELFArchXMEGA1, ELF::EF_AVR_ARCH_XMEGA1},
+      {AVR::ELFArchXMEGA2, ELF::EF_AVR_ARCH_XMEGA2},
+      {AVR::ELFArchXMEGA3, ELF::EF_AVR_ARCH_XMEGA3},
+      {AVR::ELFArchXMEGA4, ELF::EF_AVR_ARCH_XMEGA4},
+      {AVR::ELFArchXMEGA5, ELF::EF_AVR_ARCH_XMEGA5},
+      {AVR::ELFArchXMEGA6, ELF::EF_AVR_ARCH_XMEGA6},
+      {AVR::ELFArchXMEGA7, ELF::EF_AVR_ARCH_XMEGA7},
+  };
+
+  for (const auto &Entry : ArchFlags)
+    if (Features[Entry.Feature])
+      return Entry.Flag;
+
+  return 0;
+}
+
+/// Constructs the ELF target streamer and merges the architecture flag
+/// derived from \p STI's feature bits into the assembler's ELF header flags.
+AVRELFStreamer::AVRELFStreamer(MCStreamer &S,
+                               const MCSubtargetInfo &STI)
+    : AVRTargetStreamer(S) {
+  MCAssembler &Assembler = getStreamer().getAssembler();
+
+  // Existing header flags are preserved; the architecture flag is OR-ed in.
+  const unsigned Existing = Assembler.getELFHeaderEFlags();
+  const unsigned ArchFlag = getEFlagsForFeatureSet(STI.getFeatureBits());
+  Assembler.setELFHeaderEFlags(Existing | ArchFlag);
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
new file mode 100644
index 000000000000..e5df6cc34e40
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
@@ -0,0 +1,29 @@
+//===----- AVRELFStreamer.h - AVR Target Streamer --------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_ELF_STREAMER_H
+#define LLVM_AVR_ELF_STREAMER_H
+
+#include "AVRTargetStreamer.h"
+
+namespace llvm {
+
+/// A target streamer for an AVR ELF object file.
+class AVRELFStreamer : public AVRTargetStreamer {
+public:
+ AVRELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+ /// Returns `Streamer` as an MCELFStreamer; the static_cast is unchecked.
+ MCELFStreamer &getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer);
+ }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
new file mode 100644
index 000000000000..d3bd52d343fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
@@ -0,0 +1,149 @@
+//===-- AVRFixupKinds.h - AVR Specific Fixup Entries ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_FIXUP_KINDS_H
+#define LLVM_AVR_FIXUP_KINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace AVR {
+
+/// The set of supported fixups.
+///
+/// Although most of the current fixup types reflect a unique relocation,
+/// one can have multiple fixup types for a given relocation, and thus they
+/// need to be uniquely named.
+///
+/// \note This table *must* be in the same order as
+/// MCFixupKindInfo Infos[AVR::NumTargetFixupKinds]
+/// in `AVRAsmBackend.cpp`.
+enum Fixups {
+ /// A 32-bit AVR fixup.
+ fixup_32 = FirstTargetFixupKind,
+
+ /// A 7-bit PC-relative fixup for the family of conditional
+ /// branches which take 7-bit targets (BRNE,BRGT,etc).
+ fixup_7_pcrel,
+ /// A 12-bit PC-relative fixup for the family of branches
+ /// which take 12-bit targets (RJMP,RCALL,etc).
+ /// \note Although the fixup is labelled as 13 bits, it
+ /// is actually only encoded in 12. The reason for
+ /// the nomenclature is that AVR branch targets are
+ /// right-shifted by 1, because instructions are always
+ /// aligned to 2 bytes, so the 0'th bit is always 0.
+ /// This way there are 13 bits of precision.
+ fixup_13_pcrel,
+
+ /// A 16-bit address.
+ fixup_16,
+ /// A 16-bit program memory address.
+ fixup_16_pm,
+
+ /// Replaces the 8-bit immediate with another value.
+ fixup_ldi,
+
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the lower 8 bits of a 16-bit value (bits 0-7).
+ fixup_lo8_ldi,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a 16-bit value (bits 8-15).
+ fixup_hi8_ldi,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a 24-bit value (bits 16-23).
+ fixup_hh8_ldi,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a 32-bit value (bits 24-31).
+ fixup_ms8_ldi,
+
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the lower 8 bits of a negated 16-bit value (bits 0-7).
+ fixup_lo8_ldi_neg,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a negated 16-bit value (bits 8-15).
+ fixup_hi8_ldi_neg,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a negated 24-bit value (bits 16-23).
+ fixup_hh8_ldi_neg,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a negated 32-bit value (bits 24-31).
+ fixup_ms8_ldi_neg,
+
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the lower 8 bits of a 16-bit program memory address value (bits 0-7).
+ fixup_lo8_ldi_pm,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a 16-bit program memory address value (bits
+ /// 8-15).
+ fixup_hi8_ldi_pm,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a 24-bit program memory address value (bits
+ /// 16-23).
+ fixup_hh8_ldi_pm,
+
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the lower 8 bits of a negated 16-bit program memory address value
+ /// (bits 0-7).
+ fixup_lo8_ldi_pm_neg,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a negated 16-bit program memory address value
+ /// (bits 8-15).
+ fixup_hi8_ldi_pm_neg,
+ /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+ /// with the upper 8 bits of a negated 24-bit program memory address value
+ /// (bits 16-23).
+ fixup_hh8_ldi_pm_neg,
+
+ /// A 22-bit fixup for the target of a `CALL k` or `JMP k` instruction.
+ fixup_call,
+
+ fixup_6, ///< Recorded by the code emitter for symbolic `memri` offsets.
+ /// A symbol+addr fixup for the `LDD <x>+<n>, <r>` family of instructions.
+ fixup_6_adiw,
+
+ fixup_lo8_ldi_gs, ///< Emitted as ELF relocation R_AVR_LO8_LDI_GS.
+ fixup_hi8_ldi_gs, ///< Emitted as ELF relocation R_AVR_HI8_LDI_GS.
+
+ fixup_8, ///< Emitted as ELF relocation R_AVR_8.
+ fixup_8_lo8, ///< Emitted as ELF relocation R_AVR_8_LO8.
+ fixup_8_hi8, ///< Emitted as ELF relocation R_AVR_8_HI8.
+ fixup_8_hlo8, ///< Emitted as ELF relocation R_AVR_8_HLO8.
+
+ /// Fixup to calculate the difference between two symbols.
+ /// Is the only stateful fixup. We do not support it yet.
+ fixup_sym_diff,
+ fixup_16_ldst, ///< Emitted as ELF relocation R_AVR_16_LDST.
+
+ fixup_lds_sts_16, ///< Emitted as ELF relocation R_AVR_LDS_STS_16.
+
+ /// A 6-bit port address.
+ fixup_port6,
+ /// A 5-bit port address.
+ fixup_port5,
+
+ // Marker: one past the last real fixup kind; not a fixup itself.
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+
+namespace fixups {
+
+/// Adjusts the value of a branch target.
+/// All branch targets in AVR are right-shifted by 1 to take advantage
+/// of the fact that all instructions are aligned to addresses of size
+/// 2, so bit 0 of an address is always 0. This gives us another bit
+/// of precision.
+/// \param[in,out] val The target to adjust; shifted right by one in place.
+template <typename T> inline void adjustBranchTarget(T &val) { val >>= 1; }
+
+} // end of namespace fixups
+} // end of namespace AVR
+} // end of namespace llvm
+
+#endif // LLVM_AVR_FIXUP_KINDS_H
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
new file mode 100644
index 000000000000..cca3bcc4968a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -0,0 +1,28 @@
+//===-- AVRMCAsmInfo.cpp - AVR asm properties -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the AVRMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRMCAsmInfo.h"
+
+#include "llvm/ADT/Triple.h"
+
+namespace llvm {
+
+AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT) { // TT is not consulted
+ PointerSize = 2; // presumably in bytes - confirm against MCAsmInfo
+ CalleeSaveStackSlotSize = 2; // same value as PointerSize
+ CommentString = ";";
+ PrivateGlobalPrefix = ".L";
+ UsesELFSectionDirectiveForBSS = true;
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
new file mode 100644
index 000000000000..cc2207a3cfae
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- AVRMCAsmInfo.h - AVR asm properties ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AVRMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_ASM_INFO_H
+#define LLVM_AVR_ASM_INFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+
+class Triple;
+
+/// Specifies the format of AVR assembly files.
+class AVRMCAsmInfo : public MCAsmInfo {
+public:
+ explicit AVRMCAsmInfo(const Triple &TT); // sets the AVR-specific MCAsmInfo fields
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_ASM_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
new file mode 100644
index 000000000000..e6dc8868c705
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -0,0 +1,304 @@
+//===-- AVRMCCodeEmitter.cpp - Convert AVR Code to Machine Code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AVRMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRMCCodeEmitter.h"
+
+#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+#define GET_INSTRMAP_INFO
+#include "AVRGenInstrInfo.inc"
+#undef GET_INSTRMAP_INFO
+
+namespace llvm {
+
+/// Post-encoding fix-up for the `LD`/`ST` family of instructions.
+///
+/// The encodings of these instructions are not uniform with respect to the
+/// pointer register and the addressing mode:
+///
+///   ld Rd, X    `1001 000d dddd 1100`
+///   ld Rd, X+   `1001 000d dddd 1101`
+///   ld Rd, -X   `1001 000d dddd 1110`
+///
+///   ld Rd, Y    `1000 000d dddd 1000`
+///   ld Rd, Y+   `1001 000d dddd 1001`
+///   ld Rd, -Y   `1001 000d dddd 1010`
+///
+///   ld Rd, Z    `1000 000d dddd 0000`
+///   ld Rd, Z+   `1001 000d dddd 0001`
+///   ld Rd, -Z   `1001 000d dddd 0010`
+///
+/// The last bit of the leading nibble (bit 12) is the odd one out. From the
+/// table it follows that
+///
+/// ```
+/// bit12 = is_predec OR is_postinc OR is_reg_x
+/// ```
+///
+/// and this method sets that bit by hand.
+///
+/// \param MI           The instruction; operands 0 and 1 must be registers.
+/// \param EncodedValue The encoding produced so far.
+/// \param STI          Not used.
+/// \returns \p EncodedValue, with bit 12 set when the rule above applies.
+unsigned
+AVRMCCodeEmitter::loadStorePostEncoder(const MCInst &MI, unsigned EncodedValue,
+                                       const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(0).isReg() && MI.getOperand(1).isReg() &&
+         "the load/store operands must be registers");
+
+  const unsigned Opc = MI.getOpcode();
+
+  // Either operand may be the X pointer register pair.
+  const bool UsesX = MI.getOperand(0).getReg() == AVR::R27R26 ||
+                     MI.getOperand(1).getReg() == AVR::R27R26;
+
+  // Pre-decrement and post-increment forms of both load and store.
+  const bool PreDec = Opc == AVR::LDRdPtrPd || Opc == AVR::STPtrPdRr;
+  const bool PostInc = Opc == AVR::LDRdPtrPi || Opc == AVR::STPtrPiRr;
+
+  if (!UsesX && !PreDec && !PostInc)
+    return EncodedValue;
+
+  return EncodedValue | (1 << 12);
+}
+
+/// Encodes the target operand of a relative branch.
+///
+/// An immediate target is halved with AVR::fixups::adjustBranchTarget() and
+/// returned. An expression target is encoded as zero and a fixup of kind
+/// \p Fixup is appended to \p Fixups.
+template <AVR::Fixups Fixup>
+unsigned
+AVRMCCodeEmitter::encodeRelCondBrTarget(const MCInst &MI, unsigned OpNo,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  const MCOperand &Operand = MI.getOperand(OpNo);
+
+  if (!Operand.isExpr()) {
+    assert(Operand.isImm());
+
+    // Immediate targets are stored shifted right by one bit.
+    auto Encoded = Operand.getImm();
+    AVR::fixups::adjustBranchTarget(Encoded);
+    return Encoded;
+  }
+
+  // Symbolic target: leave the field empty and let the fixup fill it in.
+  Fixups.push_back(MCFixup::create(0, Operand.getExpr(),
+                                   MCFixupKind(Fixup), MI.getLoc()));
+  return 0;
+}
+
+/// Encodes the pointer-register operand of a load or store as a 2-bit
+/// selector: X = 0b11, Y = 0b10, Z = 0b00. Any other register is a
+/// programming error.
+unsigned AVRMCCodeEmitter::encodeLDSTPtrReg(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &PtrOp = MI.getOperand(OpNo);
+
+  // The operand should be a pointer register.
+  assert(PtrOp.isReg());
+
+  const unsigned Reg = PtrOp.getReg();
+  if (Reg == AVR::R27R26)
+    return 0x03; // X
+  if (Reg == AVR::R29R28)
+    return 0x02; // Y
+  if (Reg == AVR::R31R30)
+    return 0x00; // Z
+
+  llvm_unreachable("invalid pointer register");
+}
+
+/// Encodes a `memri` operand (pointer register plus offset) into 7 bits.
+///  * Bits 0-5 hold the immediate offset.
+///  * Bit 6 selects the pointer register (Z=0, Y=1).
+///
+/// When the offset is an expression the offset bits are left at zero and an
+/// AVR::fixup_6 fixup is appended to \p Fixups.
+unsigned AVRMCCodeEmitter::encodeMemri(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  const MCOperand &Base = MI.getOperand(OpNo);
+  const MCOperand &Disp = MI.getOperand(OpNo + 1);
+
+  assert(Base.isReg() && "Expected register operand");
+
+  // Pointer register selector.
+  uint8_t PtrSelect = 0;
+  const unsigned BaseReg = Base.getReg();
+  if (BaseReg == AVR::R31R30)
+    PtrSelect = 0; // Z register
+  else if (BaseReg == AVR::R29R28)
+    PtrSelect = 1; // Y register
+  else
+    llvm_unreachable("Expected either Y or Z register");
+
+  // Offset: either a literal, or a placeholder resolved by a fixup.
+  int8_t DispBits = 0;
+  if (Disp.isImm()) {
+    DispBits = Disp.getImm();
+  } else if (Disp.isExpr()) {
+    Fixups.push_back(MCFixup::create(0, Disp.getExpr(),
+                                     MCFixupKind(AVR::fixup_6), MI.getLoc()));
+  } else {
+    llvm_unreachable("invalid value for offset");
+  }
+
+  return (PtrSelect << 6) | DispBits;
+}
+
+/// Encodes an immediate operand as its bitwise complement.
+///
+/// `~Imm` equals `(~0) - Imm` in two's complement arithmetic.
+unsigned AVRMCCodeEmitter::encodeComplement(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &Operand = MI.getOperand(OpNo);
+
+  // The operand should be an immediate.
+  assert(Operand.isImm());
+
+  return ~Operand.getImm();
+}
+
+/// Encodes an immediate operand, falling back to fixup kind \p Fixup when
+/// the operand is an expression.
+///
+/// A plain immediate is returned unchanged. An AVRMCExpr is delegated to
+/// getExprOpValue(). Any other expression is encoded as zero with a fixup
+/// of kind \p Fixup recorded in \p Fixups.
+template <AVR::Fixups Fixup>
+unsigned AVRMCCodeEmitter::encodeImm(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  const MCOperand &Operand = MI.getOperand(OpNo);
+
+  if (!Operand.isExpr()) {
+    assert(Operand.isImm());
+    return Operand.getImm();
+  }
+
+  const MCExpr *E = Operand.getExpr();
+
+  // An expression that is already an AVRMCExpr (e.g. lo8(symbol)) carries
+  // its own fixup kind. Adding another fixup here would instead reference a
+  // symbol literally named 'lo8(symbol)', which is not correct.
+  if (isa<AVRMCExpr>(E))
+    return getExprOpValue(E, Fixups, STI);
+
+  Fixups.push_back(
+      MCFixup::create(0, E, static_cast<MCFixupKind>(Fixup), MI.getLoc()));
+  return 0;
+}
+
+/// Encodes the target operand of a call.
+///
+/// An immediate target is halved with AVR::fixups::adjustBranchTarget().
+/// An expression target is encoded as zero with an AVR::fixup_call fixup
+/// appended to \p Fixups.
+unsigned AVRMCCodeEmitter::encodeCallTarget(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &Operand = MI.getOperand(OpNo);
+
+  if (!Operand.isExpr()) {
+    assert(Operand.isImm());
+
+    auto Encoded = Operand.getImm();
+    AVR::fixups::adjustBranchTarget(Encoded);
+    return Encoded;
+  }
+
+  Fixups.push_back(MCFixup::create(0, Operand.getExpr(),
+                                   static_cast<MCFixupKind>(AVR::fixup_call),
+                                   MI.getLoc()));
+  return 0;
+}
+
+unsigned AVRMCCodeEmitter::getExprOpValue(const MCExpr *Expr,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Returns the value to encode for an expression operand; may append a fixup.
+ MCExpr::ExprKind Kind = Expr->getKind();
+ // For a binary expression only its left-hand side is examined from here on.
+ if (Kind == MCExpr::Binary) {
+ Expr = static_cast<const MCBinaryExpr *>(Expr)->getLHS();
+ Kind = Expr->getKind();
+ }
+ // AVR target expressions: fold to a constant if possible, else add a fixup.
+ if (Kind == MCExpr::Target) {
+ AVRMCExpr const *AVRExpr = cast<AVRMCExpr>(Expr);
+ int64_t Result;
+ if (AVRExpr->evaluateAsConstant(Result)) {
+ return Result;
+ }
+ // Not constant: record a fixup of the expression's own kind and encode 0.
+ MCFixupKind FixupKind = static_cast<MCFixupKind>(AVRExpr->getFixupKind());
+ Fixups.push_back(MCFixup::create(0, AVRExpr, FixupKind));
+ return 0;
+ }
+ // Anything else must be a plain symbol reference; no fixup is added here.
+ assert(Kind == MCExpr::SymbolRef);
+ return 0;
+}
+
+/// Returns the binary encoding of a single machine operand.
+///
+///  * Register: the encoding value from the context's register info.
+///  * Immediate: the value truncated to `unsigned`.
+///  * FP immediate: the top 32 bits of its bit pattern.
+///  * Expression: delegated to getExprOpValue(), which may record a fixup.
+unsigned AVRMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                             const MCOperand &MO,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    const MCRegisterInfo *RegInfo = Ctx.getRegisterInfo();
+    return RegInfo->getEncodingValue(MO.getReg());
+  }
+
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+
+  if (MO.isFPImm()) {
+    const APInt Bits = APFloat(MO.getFPImm()).bitcastToAPInt();
+    return static_cast<unsigned>(Bits.getHiBits(32).getLimitedValue());
+  }
+
+  // MO must be an Expr.
+  assert(MO.isExpr());
+
+  return getExprOpValue(MO.getExpr(), Fixups, STI);
+}
+
+/// Writes the low \p Size bytes of \p Val to \p OS as 16-bit words.
+///
+/// Words are emitted from the most significant one (index Size/2 - 1) down
+/// to word 0, and each word is written low byte first. The output does not
+/// depend on the byte order of the host.
+///
+/// \param Val  The encoded instruction.
+/// \param Size Instruction size in bytes; at most sizeof(Val).
+/// \param STI  Not used.
+/// \param OS   Destination stream.
+void AVRMCCodeEmitter::emitInstruction(uint64_t Val, unsigned Size,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &OS) const {
+  assert(Size <= sizeof(Val) && "instruction wider than the encoded value");
+
+  const size_t WordCount = Size / 2;
+
+  // Words are extracted by shifting rather than by viewing Val's storage as
+  // a uint16_t array: the latter yields the words in reverse order on a
+  // big-endian host and is an aliasing violation. Counting down with an
+  // unsigned index also keeps the loop correct when WordCount is zero.
+  for (size_t i = WordCount; i-- > 0;) {
+    const uint16_t Word = static_cast<uint16_t>(Val >> (i * 16));
+
+    OS << static_cast<uint8_t>(Word & 0x00ff);
+    OS << static_cast<uint8_t>(Word >> 8);
+  }
+}
+
+/// Encodes \p MI and writes the resulting bytes to \p OS.
+///
+/// The byte count comes from the instruction description and the bit
+/// pattern from the TableGen'ed getBinaryCodeForInstr(), which may append
+/// to \p Fixups.
+void AVRMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  // Byte count of the instruction, taken from its description.
+  const unsigned ByteCount = MCII.get(MI.getOpcode()).getSize();
+  assert(ByteCount > 0 && "Instruction size cannot be zero");
+
+  const uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
+  emitInstruction(Encoding, ByteCount, STI, OS);
+}
+
+MCCodeEmitter *createAVRMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI, // not used
+ MCContext &Ctx) {
+ return new AVRMCCodeEmitter(MCII, Ctx); // heap-allocated; returned to the caller
+}
+
+#include "AVRGenMCCodeEmitter.inc"
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
new file mode 100644
index 000000000000..5fa425c296a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
@@ -0,0 +1,115 @@
+//===-- AVRMCCodeEmitter.h - Convert AVR Code to Machine Code -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AVRMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_AVR_CODE_EMITTER_H
+#define LLVM_AVR_CODE_EMITTER_H
+
+#include "AVRFixupKinds.h"
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/DataTypes.h"
+
+#define GET_INSTRINFO_OPERAND_TYPES_ENUM
+#include "AVRGenInstrInfo.inc"
+
+namespace llvm {
+
+class MCContext;
+class MCExpr;
+class MCFixup;
+class MCInst;
+class MCInstrInfo;
+class MCOperand;
+class MCSubtargetInfo;
+class raw_ostream;
+
+/// Writes AVR machine code to a stream.
+class AVRMCCodeEmitter : public MCCodeEmitter {
+public:
+ AVRMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx)
+ : MCII(MCII), Ctx(Ctx) {}
+
+private:
+ /// Finishes up encoding an LD/ST instruction.
+ /// The purpose of this function is to set a bit in the instruction
+ /// which follows no logical pattern. See the implementation for details.
+ unsigned loadStorePostEncoder(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+ /// Gets the encoding for a conditional branch target.
+ template <AVR::Fixups Fixup>
+ unsigned encodeRelCondBrTarget(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Encodes the `PTRREGS` operand to a load or store instruction.
+ unsigned encodeLDSTPtrReg(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Encodes a `register+immediate` operand for `LDD`/`STD`.
+ unsigned encodeMemri(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Takes the complement of a number (~0 - val).
+ unsigned encodeComplement(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Encodes an immediate value with a given fixup.
+ template <AVR::Fixups Fixup>
+ unsigned encodeImm(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Gets the encoding of the target for the `CALL k` instruction.
+ unsigned encodeCallTarget(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// TableGen'ed function to get the binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ /// Gets the value to encode for an expression operand; may add a fixup.
+ unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Returns the binary encoding of operand.
+ ///
+ /// If the machine operand requires relocation, the relocation is recorded
+ /// and zero is returned.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ /// Writes \p Size bytes of the encoded value \p Val to \p OS.
+ void emitInstruction(uint64_t Val, unsigned Size, const MCSubtargetInfo &STI,
+ raw_ostream &OS) const;
+ /// Encodes \p MI and writes it to \p OS, appending any fixups to \p Fixups.
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+ // Copying is disabled.
+ AVRMCCodeEmitter(const AVRMCCodeEmitter &) = delete;
+ void operator=(const AVRMCCodeEmitter &) = delete;
+
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+};
+
+} // end namespace of llvm.
+
+#endif // LLVM_AVR_CODE_EMITTER_H
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
new file mode 100644
index 000000000000..400296b8409b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -0,0 +1,189 @@
+//===-- AVRMCExpr.cpp - AVR specific MC expression classes ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRMCExpr.h"
+
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCAsmLayout.h"
+
+namespace llvm {
+
+namespace {
+
+const struct ModifierEntry {
+ const char * const Spelling;
+ AVRMCExpr::VariantKind VariantKind;
+} ModifierNames[] = {
+ {"lo8", AVRMCExpr::VK_AVR_LO8}, {"hi8", AVRMCExpr::VK_AVR_HI8},
+ {"hh8", AVRMCExpr::VK_AVR_HH8}, // synonym with hlo8
+ {"hlo8", AVRMCExpr::VK_AVR_HH8}, {"hhi8", AVRMCExpr::VK_AVR_HHI8},
+
+ {"pm_lo8", AVRMCExpr::VK_AVR_PM_LO8}, {"pm_hi8", AVRMCExpr::VK_AVR_PM_HI8},
+ {"pm_hh8", AVRMCExpr::VK_AVR_PM_HH8},
+};
+
+} // end of anonymous namespace
+
+const AVRMCExpr *AVRMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+ bool Negated, MCContext &Ctx) {
+ return new (Ctx) AVRMCExpr(Kind, Expr, Negated);
+}
+
+void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+ assert(Kind != VK_AVR_None);
+
+ if (isNegated())
+ OS << '-';
+
+ OS << getName() << '(';
+ getSubExpr()->print(OS, MAI);
+ OS << ')';
+}
+
+bool AVRMCExpr::evaluateAsConstant(int64_t &Result) const {
+ MCValue Value;
+
+ bool isRelocatable =
+ getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr);
+
+ if (!isRelocatable)
+ return false;
+
+ if (Value.isAbsolute()) {
+ Result = evaluateAsInt64(Value.getConstant());
+ return true;
+ }
+
+ return false;
+}
+
+bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ MCValue Value;
+ bool isRelocatable = SubExpr->evaluateAsRelocatable(Value, Layout, Fixup);
+
+ if (!isRelocatable)
+ return false;
+
+ if (Value.isAbsolute()) {
+ Result = MCValue::get(evaluateAsInt64(Value.getConstant()));
+ } else {
+ if (!Layout) return false;
+
+ MCContext &Context = Layout->getAssembler().getContext();
+ const MCSymbolRefExpr *Sym = Value.getSymA();
+ MCSymbolRefExpr::VariantKind Modifier = Sym->getKind();
+ if (Modifier != MCSymbolRefExpr::VK_None)
+ return false;
+
+ Sym = MCSymbolRefExpr::create(&Sym->getSymbol(), Modifier, Context);
+ Result = MCValue::get(Sym, Value.getSymB(), Value.getConstant());
+ }
+
+ return true;
+}
+
+int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
+ if (Negated)
+ Value *= -1;
+
+ switch (Kind) {
+ case AVRMCExpr::VK_AVR_LO8:
+ break;
+ case AVRMCExpr::VK_AVR_HI8:
+ Value >>= 8;
+ break;
+ case AVRMCExpr::VK_AVR_HH8:
+ Value >>= 16;
+ break;
+ case AVRMCExpr::VK_AVR_HHI8:
+ Value >>= 24;
+ break;
+ case AVRMCExpr::VK_AVR_PM_LO8:
+ Value >>= 1;
+ break;
+ case AVRMCExpr::VK_AVR_PM_HI8:
+ Value >>= 9;
+ break;
+ case AVRMCExpr::VK_AVR_PM_HH8:
+ Value >>= 17;
+ break;
+
+ case AVRMCExpr::VK_AVR_None:
+ llvm_unreachable("Uninitialized expression.");
+ }
+ return static_cast<uint64_t>(Value) & 0xff;
+}
+
+AVR::Fixups AVRMCExpr::getFixupKind() const {
+ AVR::Fixups Kind = AVR::Fixups::LastTargetFixupKind;
+
+ switch (getKind()) {
+ case VK_AVR_LO8:
+ Kind = isNegated() ? AVR::fixup_lo8_ldi_neg : AVR::fixup_lo8_ldi;
+ break;
+ case VK_AVR_HI8:
+ Kind = isNegated() ? AVR::fixup_hi8_ldi_neg : AVR::fixup_hi8_ldi;
+ break;
+ case VK_AVR_HH8:
+ Kind = isNegated() ? AVR::fixup_hh8_ldi_neg : AVR::fixup_hh8_ldi;
+ break;
+ case VK_AVR_HHI8:
+ Kind = isNegated() ? AVR::fixup_ms8_ldi_neg : AVR::fixup_ms8_ldi;
+ break;
+
+ case VK_AVR_PM_LO8:
+ Kind = isNegated() ? AVR::fixup_lo8_ldi_pm_neg : AVR::fixup_lo8_ldi_pm;
+ break;
+ case VK_AVR_PM_HI8:
+ Kind = isNegated() ? AVR::fixup_hi8_ldi_pm_neg : AVR::fixup_hi8_ldi_pm;
+ break;
+ case VK_AVR_PM_HH8:
+ Kind = isNegated() ? AVR::fixup_hh8_ldi_pm_neg : AVR::fixup_hh8_ldi_pm;
+ break;
+
+ case VK_AVR_None:
+ llvm_unreachable("Uninitialized expression");
+ }
+
+ return Kind;
+}
+
+void AVRMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
+}
+
+const char *AVRMCExpr::getName() const {
+ const auto &Modifier = std::find_if(
+ std::begin(ModifierNames), std::end(ModifierNames),
+ [this](ModifierEntry const &Mod) { return Mod.VariantKind == Kind; });
+
+ if (Modifier != std::end(ModifierNames)) {
+ return Modifier->Spelling;
+ }
+ return nullptr;
+}
+
+AVRMCExpr::VariantKind AVRMCExpr::getKindByName(StringRef Name) {
+ const auto &Modifier = std::find_if(
+ std::begin(ModifierNames), std::end(ModifierNames),
+ [&Name](ModifierEntry const &Mod) { return Mod.Spelling == Name; });
+
+ if (Modifier != std::end(ModifierNames)) {
+ return Modifier->VariantKind;
+ }
+ return VK_AVR_None;
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
new file mode 100644
index 000000000000..be565a8be340
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
@@ -0,0 +1,88 @@
+//===-- AVRMCExpr.h - AVR specific MC expression classes --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MCEXPR_H
+#define LLVM_AVR_MCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+#include "MCTargetDesc/AVRFixupKinds.h"
+
+namespace llvm {
+
+/// An expression in AVR machine code.
+class AVRMCExpr : public MCTargetExpr {
+public:
+ /// Specifies the type of an expression.
+ enum VariantKind {
+ VK_AVR_None,
+
+ VK_AVR_HI8, ///< Corresponds to `hi8()`.
+ VK_AVR_LO8, ///< Corresponds to `lo8()`.
+ VK_AVR_HH8, ///< Corresponds to `hlo8()` and `hh8()`.
+ VK_AVR_HHI8, ///< Corresponds to `hhi8()`.
+
+ VK_AVR_PM_LO8, ///< Corresponds to `pm_lo8()`.
+ VK_AVR_PM_HI8, ///< Corresponds to `pm_hi8()`.
+ VK_AVR_PM_HH8 ///< Corresponds to `pm_hh8()`.
+ };
+
+public:
+ /// Creates an AVR machine code expression.
+ static const AVRMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+ bool isNegated, MCContext &Ctx);
+
+ /// Gets the type of the expression.
+ VariantKind getKind() const { return Kind; }
+ /// Gets the modifier spelling (e.g. "lo8"), or null if the kind is unknown.
+ const char *getName() const;
+ const MCExpr *getSubExpr() const { return SubExpr; }
+ /// Gets the fixup which corresponds to the expression.
+ AVR::Fixups getFixupKind() const;
+ /// Evaluates to a constant; returns false if the operand is not absolute.
+ bool evaluateAsConstant(int64_t &Result) const;
+
+ bool isNegated() const { return Negated; }
+ void setNegated(bool negated = true) { Negated = negated; }
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+
+ void visitUsedExpr(MCStreamer &streamer) const override;
+
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
+ }
+
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+
+public:
+ static VariantKind getKindByName(StringRef Name);
+
+private:
+ int64_t evaluateAsInt64(int64_t Value) const;
+
+ const VariantKind Kind;
+ const MCExpr *SubExpr;
+ bool Negated;
+
+private:
+ explicit AVRMCExpr(VariantKind Kind, const MCExpr *Expr, bool Negated)
+ : Kind(Kind), SubExpr(Expr), Negated(Negated) {}
+ ~AVRMCExpr() {}
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_MCEXPR_H
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
new file mode 100644
index 000000000000..a4fa5c0a9310
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -0,0 +1,121 @@
+//===-- AVRMCTargetDesc.cpp - AVR Target Descriptions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AVR specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRELFStreamer.h"
+#include "AVRMCAsmInfo.h"
+#include "AVRMCTargetDesc.h"
+#include "AVRTargetStreamer.h"
+#include "InstPrinter/AVRInstPrinter.h"
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "AVRGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AVRGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AVRGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createAVRMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitAVRMCInstrInfo(X);
+
+ return X;
+}
+
+static MCRegisterInfo *createAVRMCRegisterInfo(const Triple &TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitAVRMCRegisterInfo(X, 0);
+
+ return X;
+}
+
+static MCSubtargetInfo *createAVRMCSubtargetInfo(const Triple &TT,
+ StringRef CPU, StringRef FS) {
+ return createAVRMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCInstPrinter *createAVRMCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ if (SyntaxVariant == 0) {
+ return new AVRInstPrinter(MAI, MII, MRI);
+ }
+
+ return nullptr;
+}
+
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+ MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll) {
+ return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+}
+
+static MCTargetStreamer *
+createAVRObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ return new AVRELFStreamer(S, STI);
+}
+
+static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new AVRTargetAsmStreamer(S);
+}
+
+extern "C" void LLVMInitializeAVRTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfo<AVRMCAsmInfo> X(getTheAVRTarget());
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(getTheAVRTarget(), createAVRMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(getTheAVRTarget(), createAVRMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(getTheAVRTarget(),
+ createAVRMCSubtargetInfo);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(getTheAVRTarget(),
+ createAVRMCInstPrinter);
+
+ // Register the MC Code Emitter
+ TargetRegistry::RegisterMCCodeEmitter(getTheAVRTarget(), createAVRMCCodeEmitter);
+
+ // Register the ELF streamer
+ TargetRegistry::RegisterELFStreamer(getTheAVRTarget(), createMCStreamer);
+
+ // Register the obj target streamer.
+ TargetRegistry::RegisterObjectTargetStreamer(getTheAVRTarget(),
+ createAVRObjectTargetStreamer);
+
+ // Register the asm target streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(getTheAVRTarget(),
+ createMCAsmTargetStreamer);
+
+ // Register the asm backend (as little endian).
+ TargetRegistry::RegisterMCAsmBackend(getTheAVRTarget(), createAVRAsmBackend);
+}
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
new file mode 100644
index 000000000000..41a574767910
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
@@ -0,0 +1,59 @@
+//===-- AVRMCTargetDesc.h - AVR Target Descriptions -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AVR specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MCTARGET_DESC_H
+#define LLVM_AVR_MCTARGET_DESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+
+Target &getTheAVRTarget();
+
+/// Creates a machine code emitter for AVR.
+MCCodeEmitter *createAVRMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+/// Creates an assembly backend for AVR.
+MCAsmBackend *createAVRAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const llvm::MCTargetOptions &TO);
+
+/// Creates an ELF object writer for AVR.
+MCObjectWriter *createAVRELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+
+} // end namespace llvm
+
+#define GET_REGINFO_ENUM
+#include "AVRGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "AVRGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AVRGenSubtargetInfo.inc"
+
+#endif // LLVM_AVR_MCTARGET_DESC_H
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
new file mode 100644
index 000000000000..a2d8c16eeb8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
@@ -0,0 +1,24 @@
+//===-- AVRTargetStreamer.cpp - AVR Target Streamer Methods ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AVR specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRTargetStreamer.h"
+
+namespace llvm {
+
+AVRTargetStreamer::AVRTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+AVRTargetAsmStreamer::AVRTargetAsmStreamer(MCStreamer &S)
+ : AVRTargetStreamer(S) {}
+
+} // end namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
new file mode 100644
index 000000000000..99a536699ae9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
@@ -0,0 +1,32 @@
+//===-- AVRTargetStreamer.h - AVR Target Streamer --------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_STREAMER_H
+#define LLVM_AVR_TARGET_STREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MCStreamer;
+
+/// A generic AVR target output stream.
+class AVRTargetStreamer : public MCTargetStreamer {
+public:
+ explicit AVRTargetStreamer(MCStreamer &S);
+};
+
+/// A target streamer for textual AVR assembly code.
+class AVRTargetAsmStreamer : public AVRTargetStreamer {
+public:
+ explicit AVRTargetAsmStreamer(MCStreamer &S);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_TARGET_STREAMER_H
diff --git a/contrib/llvm/lib/Target/AVR/README.md b/contrib/llvm/lib/Target/AVR/README.md
new file mode 100644
index 000000000000..bd8b453aa81e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/README.md
@@ -0,0 +1,8 @@
+# AVR backend
+
+This experimental backend is for the 8-bit Atmel [AVR](https://en.wikipedia.org/wiki/Atmel_AVR) microcontroller.
+
+## Useful links
+
+* [Unresolved bugs](https://llvm.org/bugs/buglist.cgi?product=libraries&component=Backend%3A%20AVR&resolution=---&list_id=109466)
+* [Architecture notes](https://github.com/avr-llvm/architecture)
diff --git a/contrib/llvm/lib/Target/AVR/TODO.md b/contrib/llvm/lib/Target/AVR/TODO.md
new file mode 100644
index 000000000000..3a333355646d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/TODO.md
@@ -0,0 +1,7 @@
+# Write an XFAIL test for this `FIXME` in `AVRInstrInfo.td`
+
+```
+// :FIXME: DAGCombiner produces an shl node after legalization from these seq:
+// BR_JT -> (mul x, 2) -> (shl x, 1)
+```
+
diff --git a/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
new file mode 100644
index 000000000000..36cecaa7ac7a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- AVRTargetInfo.cpp - AVR Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+namespace llvm {
+Target &getTheAVRTarget() {
+ static Target TheAVRTarget;
+ return TheAVRTarget;
+}
+}
+
+extern "C" void LLVMInitializeAVRTargetInfo() {
+ llvm::RegisterTarget<llvm::Triple::avr> X(llvm::getTheAVRTarget(), "avr",
+ "Atmel AVR Microcontroller");
+}
+
diff --git a/contrib/llvm/lib/Target/BPF/BPF.h b/contrib/llvm/lib/Target/BPF/BPF.h
new file mode 100644
index 000000000000..4a0cb20357c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPF.h
@@ -0,0 +1,22 @@
+//===-- BPF.h - Top-level interface for BPF representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPF_H
+#define LLVM_LIB_TARGET_BPF_BPF_H
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class BPFTargetMachine;
+
+FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPF.td b/contrib/llvm/lib/Target/BPF/BPF.td
new file mode 100644
index 000000000000..11abe520c506
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPF.td
@@ -0,0 +1,44 @@
+//===-- BPF.td - Describe the BPF Target Machine -----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "BPFRegisterInfo.td"
+include "BPFCallingConv.td"
+include "BPFInstrInfo.td"
+
+def BPFInstrInfo : InstrInfo;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+
+def BPFInstPrinter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
+}
+
+def BPFAsmParser : AsmParser {
+ bit HasMnemonicFirst = 0;
+}
+
+def BPFAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+ string Name = "BPF";
+ string BreakCharacters = ".";
+ string TokenizingCharacters = "#()[]=:.<>!+*";
+}
+
+def BPF : Target {
+ let InstructionSet = BPFInstrInfo;
+ let AssemblyWriters = [BPFInstPrinter];
+ let AssemblyParsers = [BPFAsmParser];
+ let AssemblyParserVariants = [BPFAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
new file mode 100644
index 000000000000..c5201465e074
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -0,0 +1,61 @@
+//===-- BPFAsmPrinter.cpp - BPF LLVM assembly writer ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the BPF assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFMCInstLower.h"
+#include "BPFTargetMachine.h"
+#include "InstPrinter/BPFInstPrinter.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+class BPFAsmPrinter : public AsmPrinter {
+public:
+ explicit BPFAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+ StringRef getPassName() const override { return "BPF Assembly Printer"; }
+
+ void EmitInstruction(const MachineInstr *MI) override;
+};
+}
+
+void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+
+ BPFMCInstLower MCInstLowering(OutContext, *this);
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeBPFAsmPrinter() {
+ RegisterAsmPrinter<BPFAsmPrinter> X(getTheBPFleTarget());
+ RegisterAsmPrinter<BPFAsmPrinter> Y(getTheBPFbeTarget());
+ RegisterAsmPrinter<BPFAsmPrinter> Z(getTheBPFTarget());
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFCallingConv.td b/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
new file mode 100644
index 000000000000..8cec6fa54698
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
@@ -0,0 +1,29 @@
+//===-- BPFCallingConv.td - Calling Conventions BPF --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the BPF architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// BPF 64-bit C return-value convention.
+def RetCC_BPF64 : CallingConv<[CCIfType<[i64], CCAssignToReg<[R0]>>]>;
+
+// BPF 64-bit C Calling convention.
+def CC_BPF64 : CallingConv<[
+ // Promote i8/i16/i32 args to i64
+ CCIfType<[ i8, i16, i32 ], CCPromoteToType<i64>>,
+
+ // All arguments get passed in integer registers if there is space.
+ CCIfType<[i64], CCAssignToReg<[ R1, R2, R3, R4, R5 ]>>,
+
+ // Could be assigned to the stack in 8-byte aligned units, but unsupported
+ CCAssignToStack<8, 8>
+]>;
+
+def CSR : CalleeSavedRegs<(add R6, R7, R8, R9, R10)>;
diff --git a/contrib/llvm/lib/Target/BPF/BPFFrameLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.cpp
new file mode 100644
index 000000000000..c2806c85f24f
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.cpp
@@ -0,0 +1,40 @@
+//===-- BPFFrameLowering.cpp - BPF Frame Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFFrameLowering.h"
+#include "BPFInstrInfo.h"
+#include "BPFSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+bool BPFFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
+
+void BPFFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {}
+
+void BPFFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {}
+
+void BPFFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+ SavedRegs.reset(BPF::R6);
+ SavedRegs.reset(BPF::R7);
+ SavedRegs.reset(BPF::R8);
+ SavedRegs.reset(BPF::R9);
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h
new file mode 100644
index 000000000000..5db963f518b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h
@@ -0,0 +1,41 @@
+//===-- BPFFrameLowering.h - Define frame lowering for BPF -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements BPF-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
+#define LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class BPFSubtarget;
+
+class BPFFrameLowering : public TargetFrameLowering {
+public:
+ explicit BPFFrameLowering(const BPFSubtarget &sti)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) {}
+
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ bool hasFP(const MachineFunction &MF) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override {
+ return MBB.erase(MI);
+ }
+};
+}
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
new file mode 100644
index 000000000000..12091449cc11
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -0,0 +1,186 @@
+//===-- BPFISelDAGToDAG.cpp - A dag to dag inst selector for BPF ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for BPF,
+// converting from a legalized dag to a BPF dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFRegisterInfo.h"
+#include "BPFSubtarget.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-isel"
+
+// Instruction Selector Implementation
+namespace {
+
+class BPFDAGToDAGISel : public SelectionDAGISel {
+public:
+ explicit BPFDAGToDAGISel(BPFTargetMachine &TM) : SelectionDAGISel(TM) {}
+
+ StringRef getPassName() const override {
+ return "BPF DAG->DAG Pattern Instruction Selection";
+ }
+
+private:
+// Include the pieces autogenerated from the target description.
+#include "BPFGenDAGISel.inc"
+
+ void Select(SDNode *N) override;
+
+ // Complex Pattern for address selection.
+ bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
+ bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
+};
+}
+
+// ComplexPattern for BPF loads/stores: splits Addr into Base + i64 Offset.
+bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
+ // A bare frame index becomes a TargetFrameIndex with a zero offset.
+ SDLoc DL(Addr);
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
+ return true;
+ }
+
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress)
+ return false;
+
+ // Addr+const or Addr|const: operand 1 is known to be a ConstantSDNode.
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Addr.getOperand(1));
+ if (isInt<32>(CN->getSExtValue())) {
+
+ // If the first operand is a FI, get the TargetFI Node
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+ else
+ Base = Addr.getOperand(0);
+
+ Offset = CurDAG->getTargetConstant(CN->getSExtValue(), DL, MVT::i64);
+ return true;
+ }
+ }
+
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
+ return true;
+}
+
+// ComplexPattern for the BPF FI instruction: matches only FrameIndex+const.
+bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
+ SDLoc DL(Addr);
+
+ if (!CurDAG->isBaseWithConstantOffset(Addr))
+ return false;
+
+ // Addr+const or Addr|const: operand 1 is known to be a ConstantSDNode.
+ ConstantSDNode *CN = cast<ConstantSDNode>(Addr.getOperand(1));
+ if (isInt<32>(CN->getSExtValue())) {
+
+ // If the first operand is a FI, get the TargetFI Node
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+ else
+ return false;
+
+ Offset = CurDAG->getTargetConstant(CN->getSExtValue(), DL, MVT::i64);
+ return true;
+ }
+
+ return false;
+}
+
+void BPFDAGToDAGISel::Select(SDNode *Node) {
+ unsigned Opcode = Node->getOpcode();
+
+ // Dump information about the Node being selected
+ DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ return;
+ }
+
+ // tablegen selection should be handled here.
+ switch (Opcode) {
+ default: break;
+ case ISD::SDIV: {
+ DebugLoc Empty;
+ const DebugLoc &DL = Node->getDebugLoc();
+ if (DL != Empty)
+ errs() << "Error at line " << DL.getLine() << ": ";
+ else
+ errs() << "Error: ";
+ errs() << "Unsupport signed division for DAG: ";
+ Node->dump(CurDAG);
+ errs() << "Please convert to unsigned div/mod.\n";
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ case Intrinsic::bpf_load_byte:
+ case Intrinsic::bpf_load_half:
+ case Intrinsic::bpf_load_word: {
+ SDLoc DL(Node);
+ SDValue Chain = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue Skb = Node->getOperand(2);
+ SDValue N3 = Node->getOperand(3);
+
+ SDValue R6Reg = CurDAG->getRegister(BPF::R6, MVT::i64);
+ Chain = CurDAG->getCopyToReg(Chain, DL, R6Reg, Skb, SDValue());
+ Node = CurDAG->UpdateNodeOperands(Node, Chain, N1, R6Reg, N3);
+ break;
+ }
+ }
+ break;
+ }
+
+ case ISD::FrameIndex: {
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+ EVT VT = Node->getValueType(0);
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
+ unsigned Opc = BPF::MOV_rr;
+ if (Node->hasOneUse()) {
+ CurDAG->SelectNodeTo(Node, Opc, VT, TFI);
+ return;
+ }
+ ReplaceNode(Node, CurDAG->getMachineNode(Opc, SDLoc(Node), VT, TFI));
+ return;
+ }
+ }
+
+ // Select the default instruction
+ SelectCode(Node);
+}
+
+FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
+ return new BPFDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
new file mode 100644
index 000000000000..cca3492a1992
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -0,0 +1,596 @@
+//===-- BPFISelLowering.cpp - BPF DAG Lowering Implementation ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that BPF uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFISelLowering.h"
+#include "BPF.h"
+#include "BPFSubtarget.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-lower"
+
+// Emit an "unsupported feature" diagnostic with the text Msg for the function
+// that DAG is currently lowering, attributed to the debug location of DL.
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg) {
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+  LLVMContext &Ctx = *DAG.getContext();
+  // The diagnostic object is created inside the call expression so that every
+  // temporary it refers to stays alive until diagnose() has returned.
+  Ctx.diagnose(DiagnosticInfoUnsupported(Fn, Msg, DL.getDebugLoc()));
+}
+
+// Same as the three-argument fail(), but the diagnostic text is Msg followed
+// by the printed form of the DAG node behind Val.
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg,
+                 SDValue Val) {
+  std::string Text;
+  raw_string_ostream Stream(Text);
+  Stream << Msg;
+  Val->print(Stream);
+
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+  // str() flushes the stream and yields the accumulated text.
+  DAG.getContext()->diagnose(
+      DiagnosticInfoUnsupported(Fn, Stream.str(), DL.getDebugLoc()));
+}
+
+// Configure how SelectionDAG legalizes operations for BPF: i64 held in GPR is
+// the only legal register type, and each operation listed below is marked as
+// Custom (routed through LowerOperation), Expand or Promote.
+BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
+                                     const BPFSubtarget &STI)
+    : TargetLowering(TM) {
+  // Register classes first, then the properties derived from them.
+  addRegisterClass(MVT::i64, &BPF::GPRRegClass);
+  computeRegisterProperties(STI.getRegisterInfo());
+
+  setStackPointerRegisterToSaveRestore(BPF::R11);
+
+  // i64 operations marked Custom.
+  static const unsigned CustomOnI64[] = {
+      ISD::BR_CC,           ISD::SELECT_CC,      ISD::GlobalAddress,
+      ISD::DYNAMIC_STACKALLOC, ISD::CTTZ,        ISD::CTLZ,
+      ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ_ZERO_UNDEF};
+  for (unsigned Opc : CustomOnI64)
+    setOperationAction(Opc, MVT::i64, Custom);
+
+  // Control-flow and stack operations (MVT::Other) marked Expand.
+  static const unsigned ExpandOnOther[] = {ISD::BR_JT, ISD::BRIND, ISD::BRCOND,
+                                           ISD::STACKSAVE, ISD::STACKRESTORE};
+  for (unsigned Opc : ExpandOnOther)
+    setOperationAction(Opc, MVT::Other, Expand);
+
+  // i64 operations marked Expand: compare/select, remainders and combined
+  // div/rem, high multiplies, carry arithmetic, rotates, shift-parts and
+  // population count.
+  static const unsigned ExpandOnI64[] = {
+      ISD::SETCC,     ISD::SELECT,    ISD::SDIVREM,   ISD::UDIVREM,
+      ISD::SREM,      ISD::UREM,      ISD::MULHU,     ISD::MULHS,
+      ISD::UMUL_LOHI, ISD::SMUL_LOHI, ISD::ADDC,      ISD::ADDE,
+      ISD::SUBC,      ISD::SUBE,      ISD::ROTR,      ISD::ROTL,
+      ISD::SHL_PARTS, ISD::SRL_PARTS, ISD::SRA_PARTS, ISD::CTPOP};
+  for (unsigned Opc : ExpandOnI64)
+    setOperationAction(Opc, MVT::i64, Expand);
+
+  // In-register sign extension from every narrower integer width is expanded.
+  static const MVT::SimpleValueType NarrowIntVTs[] = {MVT::i1, MVT::i8,
+                                                      MVT::i16, MVT::i32};
+  for (MVT::SimpleValueType NarrowVT : NarrowIntVTs)
+    setOperationAction(ISD::SIGN_EXTEND_INREG, NarrowVT, Expand);
+
+  // Extending loads: any extension from i1 is promoted, and sign-extending
+  // loads from i8/i16/i32 are expanded, for every integer result type.
+  static const unsigned AllExtKinds[] = {ISD::EXTLOAD, ISD::ZEXTLOAD,
+                                         ISD::SEXTLOAD};
+  static const MVT::SimpleValueType SExtMemVTs[] = {MVT::i8, MVT::i16,
+                                                    MVT::i32};
+  for (MVT VT : MVT::integer_valuetypes()) {
+    for (unsigned ExtKind : AllExtKinds)
+      setLoadExtAction(ExtKind, VT, MVT::i1, Promote);
+    for (MVT::SimpleValueType MemVT : SExtMemVTs)
+      setLoadExtAction(ISD::SEXTLOAD, VT, MemVT, Expand);
+  }
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+
+  // Function alignments (log2)
+  setMinFunctionAlignment(3);
+  setPrefFunctionAlignment(3);
+
+  // inline memcpy() for kernel to see explicit copy
+  const unsigned InlineStoreLimit = 128;
+  MaxStoresPerMemsetOptSize = InlineStoreLimit;
+  MaxStoresPerMemset = InlineStoreLimit;
+  MaxStoresPerMemcpyOptSize = InlineStoreLimit;
+  MaxStoresPerMemcpy = InlineStoreLimit;
+  MaxStoresPerMemmoveOptSize = InlineStoreLimit;
+  MaxStoresPerMemmove = InlineStoreLimit;
+}
+
+// Dispatch an operation that the constructor marked as Custom to its
+// BPF-specific lowering routine and return the replacement value.
+//
+// DYNAMIC_STACKALLOC is marked Custom by the constructor but has no lowering;
+// it can be reached from ordinary input (a variable-sized alloca), so it is
+// reported with report_fatal_error instead of falling into llvm_unreachable.
+// Any other opcode arriving here is a backend bug.
+SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::BR_CC:
+    return LowerBR_CC(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    report_fatal_error("Unsupported dynamic stack allocation");
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+// Calling Convention Implementation
+#include "BPFGenCallingConv.inc"
+
+// Lower the incoming formal arguments of the function being compiled.
+//
+// Each argument that CC_BPF64 places in a register is copied out of that
+// physical register into a new GPR virtual register and appended to InVals;
+// a value that was passed promoted is truncated back to its declared type.
+// An argument without a register location, a variadic function or a
+// struct-return function is reported through fail(), after which lowering
+// continues. The incoming Chain is returned unchanged.
+//
+// Only the C and Fast calling conventions are accepted; any other convention
+// is a fatal error, matching what LowerCall does for outgoing calls.
+SDValue BPFTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  switch (CallConv) {
+  default:
+    // Depends on the input IR, so it must be diagnosed in every build mode
+    // rather than being treated as unreachable.
+    report_fatal_error("Unsupported calling convention");
+  case CallingConv::C:
+  case CallingConv::Fast:
+    break;
+  }
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_BPF64);
+
+  for (auto &VA : ArgLocs) {
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default: {
+        errs() << "LowerFormalArguments Unhandled argument type: "
+               << RegVT.getEVTString() << '\n';
+        llvm_unreachable(nullptr);
+      }
+      case MVT::i64:
+        unsigned VReg = RegInfo.createVirtualRegister(&BPF::GPRRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
+
+        // If this is an 8/16/32-bit value, it is really passed promoted to 64
+        // bits. Insert an assert[sz]ext to capture this, then truncate to the
+        // right size.
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+
+        if (VA.getLocInfo() != CCValAssign::Full)
+          ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+
+        InVals.push_back(ArgValue);
+      }
+    } else {
+      // No register was assigned: diagnose and substitute a zero constant so
+      // that InVals still receives one entry for this location.
+      fail(DL, DAG, "defined with too many args");
+      InVals.push_back(DAG.getConstant(0, DL, VA.getLocVT()));
+    }
+  }
+
+  if (IsVarArg || MF.getFunction()->hasStructRetAttr()) {
+    fail(DL, DAG, "functions with VarArgs or StructRet are not supported");
+  }
+
+  return Chain;
+}
+
+// Out-of-class definition of the static member declared in BPFTargetLowering.
+// LowerCall diagnoses calls that pass more than MaxArgs outgoing arguments
+// and lowers at most the first MaxArgs argument locations.
+const unsigned BPFTargetLowering::MaxArgs = 5;
+
+// Lower an outgoing call into a CALLSEQ_START / BPFISD::CALL / CALLSEQ_END
+// sequence. Outgoing arguments are extended as required by CC_BPF64 and copied
+// into their assigned registers; the call results are then read back through
+// LowerCallResult, which fills InVals and supplies the returned chain.
+// Calls with more than MaxArgs arguments or with by-value arguments are
+// reported through fail(); lowering continues after the diagnostic.
+SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ auto &Outs = CLI.Outs;
+ auto &OutVals = CLI.OutVals;
+ auto &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // BPF target does not support tail call optimization.
+ // IsTailCall is a reference into CLI, so this also clears the caller's flag.
+ IsTailCall = false;
+
+ // Only the C and Fast calling conventions are accepted.
+ switch (CallConv) {
+ default:
+ report_fatal_error("Unsupported calling convention");
+ case CallingConv::Fast:
+ case CallingConv::C:
+ break;
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_BPF64);
+
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ if (Outs.size() > MaxArgs)
+ fail(CLI.DL, DAG, "too many args to ", Callee);
+
+ // Emit one diagnostic per by-value argument.
+ for (auto &Arg : Outs) {
+ ISD::ArgFlagsTy Flags = Arg.Flags;
+ if (!Flags.isByVal())
+ continue;
+
+ fail(CLI.DL, DAG, "pass by value not supported ", Callee);
+ }
+
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ Chain = DAG.getCALLSEQ_START(
+ Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL);
+
+ SmallVector<std::pair<unsigned, SDValue>, MaxArgs> RegsToPass;
+
+ // Walk arg assignments
+ // At most MaxArgs locations are handled; a call with more arguments was
+ // already diagnosed above.
+ for (unsigned i = 0,
+ e = std::min(static_cast<unsigned>(ArgLocs.size()), MaxArgs);
+ i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[i];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, CLI.DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, CLI.DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, CLI.DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ // Push arguments into RegsToPass vector
+ // Only register locations are handled; any other location is treated as a
+ // bug in the argument assignment.
+ if (VA.isRegLoc())
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ else
+ llvm_unreachable("call arg pass bug");
+ }
+
+ SDValue InFlag;
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain and
+ // flag operands which copy the outgoing args into registers. The InFlag is
+ // necessary since all emitted instructions must be stuck together.
+ for (auto &Reg : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, CLI.DL, Reg.first, Reg.second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, PtrVT,
+ G->getOffset(), 0);
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (auto &Reg : RegsToPass)
+ Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+
+ // The glue operand exists only if at least one argument copy was emitted.
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(BPFISD::CALL, CLI.DL, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(
+ Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true),
+ DAG.getConstant(0, CLI.DL, PtrVT, true), InFlag, CLI.DL);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, CLI.DL, DAG,
+ InVals);
+}
+
+// Lower a function return into a BPFISD::RET_FLAG node. Every return value is
+// copied into the register chosen by RetCC_BPF64, and the copies are glued
+// together so they stay adjacent to the return. A function whose IR return
+// type is an aggregate is diagnosed and lowered to a bare return.
+SDValue
+BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                               bool IsVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               const SDLoc &DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Aggregate return types are rejected up front.
+  if (MF.getFunction()->getReturnType()->isAggregateType()) {
+    fail(DL, DAG, "only integer returns supported");
+    return DAG.getNode(BPFISD::RET_FLAG, DL, MVT::Other, Chain);
+  }
+
+  // Let the calling convention assign a location to each return value.
+  SmallVector<CCValAssign, 16> RetLocs;
+  CCState RetInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
+  RetInfo.AnalyzeReturn(Outs, RetCC_BPF64);
+
+  // Operand 0 is reserved for the chain and is filled in after the copies.
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  SDValue Glue;
+
+  for (unsigned Idx = 0, NumLocs = RetLocs.size(); Idx < NumLocs; ++Idx) {
+    CCValAssign &Loc = RetLocs[Idx];
+    assert(Loc.isRegLoc() && "Can only return in registers!");
+
+    // Thread the glue through successive copies to keep them together.
+    Chain = DAG.getCopyToReg(Chain, DL, Loc.getLocReg(), OutVals[Idx], Glue);
+    Glue = Chain.getValue(1);
+
+    RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
+  }
+
+  RetOps[0] = Chain;
+
+  // The glue operand is appended only when at least one copy was emitted.
+  if (Glue.getNode())
+    RetOps.push_back(Glue);
+
+  return DAG.getNode(BPFISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+// Copy the results of a call out of the physical registers assigned by
+// RetCC_BPF64 and append them to InVals; returns the updated chain. A call
+// producing two or more values is diagnosed and zero constants are
+// substituted for its results.
+SDValue BPFTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+  if (Ins.size() >= 2) {
+    fail(DL, DAG, "only small returns supported");
+    for (const ISD::InputArg &In : Ins)
+      InVals.push_back(DAG.getConstant(0, DL, In.VT));
+    // A copy from register 1 is still emitted and its chain result returned.
+    SDValue Placeholder = DAG.getCopyFromReg(Chain, DL, 1, Ins[0].VT, InFlag);
+    return Placeholder.getValue(1);
+  }
+
+  // Assign locations to each value returned by this call.
+  MachineFunction &MF = DAG.getMachineFunction();
+  SmallVector<CCValAssign, 16> ResultLocs;
+  CCState ResultInfo(CallConv, IsVarArg, MF, ResultLocs, *DAG.getContext());
+  ResultInfo.AnalyzeCallResult(Ins, RetCC_BPF64);
+
+  // Each copy yields the value (0), the chain (1) and the glue (2).
+  for (const CCValAssign &Loc : ResultLocs) {
+    SDValue Copy = DAG.getCopyFromReg(Chain, DL, Loc.getLocReg(),
+                                      Loc.getValVT(), InFlag);
+    InVals.push_back(Copy.getValue(0));
+    Chain = Copy.getValue(1);
+    InFlag = Copy.getValue(2);
+  }
+
+  return Chain;
+}
+
+// For the four "less than" style conditions (SETULT, SETULE, SETLT, SETLE),
+// swap the two operands and replace CC by the matching swapped-operand
+// condition. Every other condition code is left untouched.
+static void NegateCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
+  const bool IsLessForm = CC == ISD::SETULT || CC == ISD::SETULE ||
+                          CC == ISD::SETLT || CC == ISD::SETLE;
+  if (!IsLessForm)
+    return;
+
+  CC = ISD::getSetCCSwappedOperands(CC);
+  std::swap(LHS, RHS);
+}
+
+// Custom lowering of ISD::BR_CC: canonicalize the comparison with NegateCC and
+// rebuild the branch as a BPFISD::BR_CC node whose condition code is carried
+// as an i64 constant operand.
+SDValue BPFTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  // ISD::BR_CC operands: chain, condition code, lhs, rhs, destination block.
+  SDValue InChain = Op.getOperand(0);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue CmpLHS = Op.getOperand(2);
+  SDValue CmpRHS = Op.getOperand(3);
+  SDValue Target = Op.getOperand(4);
+
+  NegateCC(CmpLHS, CmpRHS, Cond);
+
+  SDValue CondImm = DAG.getConstant(Cond, DL, MVT::i64);
+  return DAG.getNode(BPFISD::BR_CC, DL, Op.getValueType(), InChain, CmpLHS,
+                     CmpRHS, CondImm, Target);
+}
+
+// Custom lowering of ISD::SELECT_CC: canonicalize the comparison with NegateCC
+// and rebuild the select as a BPFISD::SELECT_CC node (result value plus glue)
+// whose condition code is carried as an i64 constant operand.
+SDValue BPFTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  // ISD::SELECT_CC operands: lhs, rhs, true value, false value, condition.
+  SDValue CmpLHS = Op.getOperand(0);
+  SDValue CmpRHS = Op.getOperand(1);
+  SDValue IfTrue = Op.getOperand(2);
+  SDValue IfFalse = Op.getOperand(3);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  NegateCC(CmpLHS, CmpRHS, Cond);
+
+  SDValue CondImm = DAG.getConstant(Cond, DL, MVT::i64);
+  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue SelectOps[] = {CmpLHS, CmpRHS, CondImm, IfTrue, IfFalse};
+
+  return DAG.getNode(BPFISD::SELECT_CC, DL, ResultTys, SelectOps);
+}
+
+// Return the printable name of a BPF-specific DAG node opcode, or nullptr for
+// FIRST_NUMBER and for any opcode that is not a BPFISD node.
+const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  const char *Name = nullptr;
+  // No default label: the switch lists every BPFISD::NodeType enumerator.
+  switch (static_cast<BPFISD::NodeType>(Opcode)) {
+  case BPFISD::FIRST_NUMBER:
+    break;
+  case BPFISD::RET_FLAG:
+    Name = "BPFISD::RET_FLAG";
+    break;
+  case BPFISD::CALL:
+    Name = "BPFISD::CALL";
+    break;
+  case BPFISD::SELECT_CC:
+    Name = "BPFISD::SELECT_CC";
+    break;
+  case BPFISD::BR_CC:
+    Name = "BPFISD::BR_CC";
+    break;
+  case BPFISD::Wrapper:
+    Name = "BPFISD::Wrapper";
+    break;
+  }
+  return Name;
+}
+
+// Custom lowering of ISD::GlobalAddress: turn the global into an i64
+// TargetGlobalAddress and wrap it in a BPFISD::Wrapper node.
+SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const auto *AddrNode = cast<GlobalAddressSDNode>(Op);
+  SDValue TargetAddr =
+      DAG.getTargetGlobalAddress(AddrNode->getGlobal(), DL, MVT::i64);
+
+  return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, TargetAddr);
+}
+
+// Expand the BPF::Select pseudo instruction into explicit control flow.
+//
+// The block holding MI is split into a diamond:
+//
+//   HeadMBB:   ...
+//              jmp_XX lhs, rhs goto TailMBB
+//              (fallthrough)
+//   FalseMBB:  # falls through to TailMBB
+//   TailMBB:   result = phi [ false value, FalseMBB ], [ true value, HeadMBB ]
+//              ... instructions that followed MI ...
+//
+// MI is erased and the block that now holds the PHI is returned.
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                               MachineBasicBlock *BB) const {
+  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert(MI.getOpcode() == BPF::Select && "Unexpected instr type to insert");
+
+  MachineBasicBlock *HeadMBB = BB;
+  MachineFunction *MF = HeadMBB->getParent();
+  const BasicBlock *IRBlock = HeadMBB->getBasicBlock();
+
+  // Both new blocks are placed right after the current one, in layout order
+  // FalseMBB then TailMBB.
+  MachineFunction::iterator InsertPos = ++HeadMBB->getIterator();
+  MachineBasicBlock *FalseMBB = MF->CreateMachineBasicBlock(IRBlock);
+  MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(IRBlock);
+  MF->insert(InsertPos, FalseMBB);
+  MF->insert(InsertPos, TailMBB);
+
+  // Everything after MI moves to TailMBB, together with the successor edges
+  // of the original block (PHIs in those successors are updated).
+  TailMBB->splice(TailMBB->begin(), HeadMBB,
+                  std::next(MachineBasicBlock::iterator(MI)), HeadMBB->end());
+  TailMBB->transferSuccessorsAndUpdatePHIs(HeadMBB);
+
+  // The head now branches either to the fallthrough block or to the tail.
+  HeadMBB->addSuccessor(FalseMBB);
+  HeadMBB->addSuccessor(TailMBB);
+
+  // Pick the register-register jump matching the condition code operand.
+  unsigned LHS = MI.getOperand(1).getReg();
+  unsigned RHS = MI.getOperand(2).getReg();
+  int CC = MI.getOperand(3).getImm();
+  unsigned JumpOpc;
+  switch (CC) {
+  case ISD::SETGT:
+    JumpOpc = BPF::JSGT_rr;
+    break;
+  case ISD::SETUGT:
+    JumpOpc = BPF::JUGT_rr;
+    break;
+  case ISD::SETGE:
+    JumpOpc = BPF::JSGE_rr;
+    break;
+  case ISD::SETUGE:
+    JumpOpc = BPF::JUGE_rr;
+    break;
+  case ISD::SETEQ:
+    JumpOpc = BPF::JEQ_rr;
+    break;
+  case ISD::SETNE:
+    JumpOpc = BPF::JNE_rr;
+    break;
+  default:
+    report_fatal_error("unimplemented select CondCode " + Twine(CC));
+  }
+
+  // Conditional jump at the end of the head block, taken to the tail.
+  BuildMI(HeadMBB, DL, TII.get(JumpOpc))
+      .addReg(LHS)
+      .addReg(RHS)
+      .addMBB(TailMBB);
+
+  // The false block has no instructions of its own; it only falls through.
+  FalseMBB->addSuccessor(TailMBB);
+
+  // Merge the two incoming values at the top of the tail block: operand 5 of
+  // the pseudo arrives from FalseMBB, operand 4 from the head block.
+  const unsigned ResultReg = MI.getOperand(0).getReg();
+  const unsigned FalseReg = MI.getOperand(5).getReg();
+  const unsigned TrueReg = MI.getOperand(4).getReg();
+  BuildMI(*TailMBB, TailMBB->begin(), DL, TII.get(BPF::PHI), ResultReg)
+      .addReg(FalseReg)
+      .addMBB(FalseMBB)
+      .addReg(TrueReg)
+      .addMBB(HeadMBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return TailMBB;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
new file mode 100644
index 000000000000..3d1726be286e
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -0,0 +1,93 @@
+//===-- BPFISelLowering.h - BPF DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that BPF uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFISELLOWERING_H
+#define LLVM_LIB_TARGET_BPF_BPFISELLOWERING_H
+
+#include "BPF.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+class BPFSubtarget;
+// BPF-specific SelectionDAG node opcodes. Numbering starts at
+// ISD::BUILTIN_OP_END; the enumerator order determines the opcode values.
+namespace BPFISD {
+enum NodeType : unsigned {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ // Function return; built by LowerReturn.
+ RET_FLAG,
+ // Call node; built by LowerCall.
+ CALL,
+ // Select on a comparison; built by LowerSELECT_CC.
+ SELECT_CC,
+ // Conditional branch on a comparison; built by LowerBR_CC.
+ BR_CC,
+ // Wrapper around a target global address; built by LowerGlobalAddress.
+ Wrapper
+};
+}
+
+// TargetLowering implementation for BPF: declares the hooks that lower calls,
+// returns, formal arguments and the operations the target handles itself.
+class BPFTargetLowering : public TargetLowering {
+public:
+ explicit BPFTargetLowering(const TargetMachine &TM, const BPFSubtarget &STI);
+
+ // Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ // This method returns the name of a target specific DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ // Expands pseudo instructions that need custom insertion after instruction
+ // selection; returns the block in which insertion should continue.
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
+
+private:
+ // Per-opcode lowering helpers; each returns the replacement value for Op.
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+
+ // Lower the result values of a call, copying them out of physregs into vregs
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ // Maximum number of arguments to a call
+ static const unsigned MaxArgs;
+
+ // Lower a call into CALLSEQ_START - BPFISD:CALL - CALLSEQ_END chain
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ // Lower incoming arguments, copy physregs into vregs
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ // Lower outgoing return values, copying them into their return registers.
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ // Value type to use for memory operations: i64 when at least 8 bytes are
+ // involved, i32 otherwise. Only Size is consulted; the other parameters are
+ // ignored.
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override {
+ return Size >= 8 ? MVT::i64 : MVT::i32;
+ }
+
+ // Always answers true, regardless of the immediate or its type.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override {
+ return true;
+ }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrFormats.td b/contrib/llvm/lib/Target/BPF/BPFInstrFormats.td
new file mode 100644
index 000000000000..53f3ad623587
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrFormats.td
@@ -0,0 +1,33 @@
+//===-- BPFInstrFormats.td - BPF Instruction Formats -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Common base class of all BPF instructions. Every instruction has a 64-bit
+// encoding (Size = 8 bytes) and lives in the "BPF" namespace; the 3-bit
+// BPFClass field is placed in bits 58-56 of the encoding.
+class InstBPF<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<64> Inst;
+ field bits<64> SoftFail = 0;
+ let Size = 8;
+
+ let Namespace = "BPF";
+ let DecoderNamespace = "BPF";
+
+ // Instruction class selector, encoded in Inst{58-56}.
+ bits<3> BPFClass;
+ let Inst{58-56} = BPFClass;
+
+ // Forward the template arguments to the generic Instruction fields.
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+}
+
+// Pseudo instructions: marked isPseudo, with an all-zero encoding.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstBPF<outs, ins, asmstr, pattern> {
+ let Inst{63-0} = 0;
+ let isPseudo = 1;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
new file mode 100644
index 000000000000..cbe4466164f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -0,0 +1,174 @@
+//===-- BPFInstrInfo.cpp - BPF Instruction Information ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFSubtarget.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "BPFGenInstrInfo.inc"
+
+using namespace llvm;
+
+// Hands the ADJCALLSTACKDOWN / ADJCALLSTACKUP pseudo opcodes to the
+// TableGen-generated base class (presumably as its call-frame setup and
+// destroy opcodes -- confirm against BPFGenInstrInfo).
+BPFInstrInfo::BPFInstrInfo()
+ : BPFGenInstrInfo(BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
+
+// Emit a register-to-register copy before I as a MOV_rr. Both registers must
+// belong to the GPR class; anything else is treated as unreachable.
+void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const DebugLoc &DL, unsigned DestReg,
+                               unsigned SrcReg, bool KillSrc) const {
+  if (!BPF::GPRRegClass.contains(DestReg, SrcReg))
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, I, DL, get(BPF::MOV_rr), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+// Spill SrcReg to stack slot FI with an STD at offset 0, inserted before I.
+// Only the GPR register class is supported. The debug location is taken from
+// the instruction at I when I is not the end of the block.
+void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned SrcReg, bool IsKill, int FI,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  if (RC != &BPF::GPRRegClass)
+    llvm_unreachable("Can't store this register to stack slot");
+
+  DebugLoc DL = (I != MBB.end()) ? I->getDebugLoc() : DebugLoc();
+
+  BuildMI(MBB, I, DL, get(BPF::STD))
+      .addReg(SrcReg, getKillRegState(IsKill))
+      .addFrameIndex(FI)
+      .addImm(0);
+}
+
+// Reload DestReg from stack slot FI with an LDD at offset 0, inserted before
+// I. Only the GPR register class is supported. The debug location is taken
+// from the instruction at I when I is not the end of the block.
+void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned DestReg, int FI,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  if (RC != &BPF::GPRRegClass)
+    llvm_unreachable("Can't load this register from stack slot");
+
+  DebugLoc DL = (I != MBB.end()) ? I->getDebugLoc() : DebugLoc();
+
+  BuildMI(MBB, I, DL, get(BPF::LDD), DestReg)
+      .addFrameIndex(FI)
+      .addImm(0);
+}
+
+// Analyze the terminators at the end of MBB.
+//
+// Returns false when the block ends in a fallthrough or in an unconditional
+// JMP that could be understood; TBB is then set to the jump destination (it is
+// reset to null when a modifiable jump turns out to be a fallthrough and is
+// removed). Returns true when the terminators cannot be analyzed, which
+// includes every conditional branch.
+//
+// With AllowModify set, instructions following an unconditional jump are
+// deleted, as is a jump to the layout successor.
+bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(*I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == BPF::JMP) {
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a J, delete them.
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = nullptr;
+
+      // Delete the J if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = nullptr;
+        I->eraseFromParent();
+        // Restart the scan from the new end of the block.
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditional destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+    // Cannot handle conditional branches
+    return true;
+  }
+
+  return false;
+}
+
+// Append an unconditional JMP to TBB at the end of MBB and return the number
+// of instructions inserted (always 1). Conditional branches (non-empty Cond)
+// are not supported, and byte-size accounting is not implemented.
+unsigned BPFInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    ArrayRef<MachineOperand> Cond,
+                                    const DebugLoc &DL,
+                                    int *BytesAdded) const {
+  assert(!BytesAdded && "code size not handled");
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+
+  if (!Cond.empty())
+    llvm_unreachable("Unexpected conditional branch");
+
+  // Unconditional branch
+  assert(!FBB && "Unconditional branch with multiple successors!");
+  BuildMI(&MBB, DL, get(BPF::JMP)).addMBB(TBB);
+  return 1;
+}
+
+// Erase the unconditional JMP instructions found at the end of MBB, skipping
+// debug values, and return how many were removed. Scanning stops at the first
+// other instruction. Byte-size accounting is not implemented.
+unsigned BPFInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                    int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  unsigned NumRemoved = 0;
+  for (MachineBasicBlock::iterator It = MBB.end(); It != MBB.begin();) {
+    --It;
+    if (It->isDebugValue())
+      continue;
+    if (It->getOpcode() != BPF::JMP)
+      break;
+
+    // Remove the branch, then restart from the new end of the block.
+    It->eraseFromParent();
+    ++NumRemoved;
+    It = MBB.end();
+  }
+
+  return NumRemoved;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
new file mode 100644
index 000000000000..c7048ab979b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
@@ -0,0 +1,61 @@
+//===-- BPFInstrInfo.h - BPF Instruction Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFINSTRINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFINSTRINFO_H
+
+#include "BPFRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "BPFGenInstrInfo.inc"
+
+namespace llvm {
+
+// BPF implementation of the target instruction information: register copies,
+// stack spills/reloads and branch analysis/insertion/removal, layered on the
+// TableGen-generated BPFGenInstrInfo.
+class BPFInstrInfo : public BPFGenInstrInfo {
+ // Register information object owned by this instruction info.
+ const BPFRegisterInfo RI;
+
+public:
+ BPFInstrInfo();
+
+ const BPFRegisterInfo &getRegisterInfo() const { return RI; }
+
+ // Emit a copy from SrcReg to DestReg before I.
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ // Store SrcReg to the stack slot FrameIndex, before MBBI.
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ // Load DestReg from the stack slot FrameIndex, before MBBI.
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ // Inspect the terminators of MBB; returns true when they cannot be analyzed.
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ // Remove trailing unconditional jumps from MBB; returns the number removed.
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ // Append an unconditional jump to TBB; returns the number inserted.
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
new file mode 100644
index 000000000000..a7910dea98de
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -0,0 +1,578 @@
+//===-- BPFInstrInfo.td - Target Description for BPF Target ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the BPF instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "BPFInstrFormats.td"
+
+// Instruction Operands and Patterns
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_BPFCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+def SDT_BPFCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_BPFSetFlag : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>;
+def SDT_BPFSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
+ SDTCisSameAs<0, 4>,
+ SDTCisSameAs<4, 5>]>;
+def SDT_BPFBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
+ SDTCisVT<3, OtherVT>]>;
+def SDT_BPFWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<0>]>;
+
+def BPFcall : SDNode<"BPFISD::CALL", SDT_BPFCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def BPFretflag : SDNode<"BPFISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def BPFcallseq_start: SDNode<"ISD::CALLSEQ_START", SDT_BPFCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def BPFcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_BPFCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue]>;
+
+def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
+def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+
+def brtarget : Operand<OtherVT>;
+def calltarget : Operand<i64>;
+
+def u64imm : Operand<i64> {
+ let PrintMethod = "printImm64Operand";
+}
+
+def i64immSExt32 : PatLeaf<(imm),
+ [{return isInt<32>(N->getSExtValue()); }]>;
+
+// Addressing modes.
+def ADDRri : ComplexPattern<i64, 2, "SelectAddr", [], []>;
+def FIri : ComplexPattern<i64, 2, "SelectFIAddr", [add, or], []>;
+
+// Address operands
+def MEMri : Operand<i64> {
+ let PrintMethod = "printMemOperand";
+ let EncoderMethod = "getMemoryOpValue";
+ let DecoderMethod = "decodeMemoryOpValue";
+ let MIOperandInfo = (ops GPR, i16imm);
+}
+
+// Condition code predicates - used for pattern matching for jump instructions
+def BPF_CC_EQ : PatLeaf<(imm),
+ [{return (N->getZExtValue() == ISD::SETEQ);}]>;
+def BPF_CC_NE : PatLeaf<(imm),
+ [{return (N->getZExtValue() == ISD::SETNE);}]>;
+def BPF_CC_GE : PatLeaf<(imm),
+ [{return (N->getZExtValue() == ISD::SETGE);}]>;
+def BPF_CC_GT : PatLeaf<(imm),
+ [{return (N->getZExtValue() == ISD::SETGT);}]>;
+def BPF_CC_GTU : PatLeaf<(imm),
+ [{return (N->getZExtValue() == ISD::SETUGT);}]>;
+def BPF_CC_GEU : PatLeaf<(imm),
+ [{return (N->getZExtValue() == ISD::SETUGE);}]>;
+
+// jump instructions
+// Register/register compare-and-branch: "if $dst <op> $src goto $BrDst".
+// Selected from BPFbrcc on two i64 registers with condition-code leaf Cond.
+// Encoding: op -> Inst{63-60}, BPFSrc -> Inst{59}, src -> Inst{55-52},
+// dst -> Inst{51-48}, branch target -> Inst{47-32}.
+class JMP_RR<bits<4> Opc, string OpcodeStr, PatLeaf Cond>
+ : InstBPF<(outs), (ins GPR:$dst, GPR:$src, brtarget:$BrDst),
+ "if $dst "#OpcodeStr#" $src goto $BrDst",
+ [(BPFbrcc i64:$dst, i64:$src, Cond, bb:$BrDst)]> {
+ bits<4> op;
+ bits<1> BPFSrc;
+ bits<4> dst;
+ bits<4> src;
+ bits<16> BrDst;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{55-52} = src;
+ let Inst{51-48} = dst;
+ let Inst{47-32} = BrDst;
+
+ let op = Opc;
+ // Source-operand selector bit: 1 for this register form (JMP_RI uses 0).
+ let BPFSrc = 1;
+ let BPFClass = 5; // BPF_JMP
+}
+
+// Register/immediate compare-and-branch: "if $dst <op> $imm goto $BrDst".
+// The pattern only matches immediates accepted by i64immSExt32.
+// Encoding: op -> Inst{63-60}, BPFSrc -> Inst{59}, dst -> Inst{51-48},
+// branch target -> Inst{47-32}, immediate -> Inst{31-0}.
+class JMP_RI<bits<4> Opc, string OpcodeStr, PatLeaf Cond>
+ : InstBPF<(outs), (ins GPR:$dst, i64imm:$imm, brtarget:$BrDst),
+ "if $dst "#OpcodeStr#" $imm goto $BrDst",
+ [(BPFbrcc i64:$dst, i64immSExt32:$imm, Cond, bb:$BrDst)]> {
+ bits<4> op;
+ bits<1> BPFSrc;
+ bits<4> dst;
+ bits<16> BrDst;
+ bits<32> imm;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{51-48} = dst;
+ let Inst{47-32} = BrDst;
+ let Inst{31-0} = imm;
+
+ let op = Opc;
+ // Source-operand selector bit: 0 for this immediate form (JMP_RR uses 1).
+ let BPFSrc = 0;
+ let BPFClass = 5; // BPF_JMP
+}
+
+multiclass J<bits<4> Opc, string OpcodeStr, PatLeaf Cond> {
+ def _rr : JMP_RR<Opc, OpcodeStr, Cond>;
+ def _ri : JMP_RI<Opc, OpcodeStr, Cond>;
+}
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot=0 in {
+// cmp+goto instructions
+defm JEQ : J<0x1, "==", BPF_CC_EQ>;
+defm JUGT : J<0x2, ">", BPF_CC_GTU>;
+defm JUGE : J<0x3, ">=", BPF_CC_GEU>;
+defm JNE : J<0x5, "!=", BPF_CC_NE>;
+defm JSGT : J<0x6, "s>", BPF_CC_GT>;
+defm JSGE : J<0x7, "s>=", BPF_CC_GE>;
+}
+
+// ALU instructions
+// Register/immediate ALU operation printed as "$dst <op> $imm".
+// The pattern applies OpNode to $src2 and an immediate accepted by
+// i64immSExt32; $src2 itself is not encoded in this class.
+// Encoding: op -> Inst{63-60}, BPFSrc -> Inst{59}, dst -> Inst{51-48},
+// immediate -> Inst{31-0}.
+class ALU_RI<bits<4> Opc, string OpcodeStr, SDNode OpNode>
+ : InstBPF<(outs GPR:$dst), (ins GPR:$src2, i64imm:$imm),
+ "$dst "#OpcodeStr#" $imm",
+ [(set GPR:$dst, (OpNode GPR:$src2, i64immSExt32:$imm))]> {
+ bits<4> op;
+ bits<1> BPFSrc;
+ bits<4> dst;
+ bits<32> imm;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{51-48} = dst;
+ let Inst{31-0} = imm;
+
+ let op = Opc;
+ // Source-operand selector bit: 0 for this immediate form (ALU_RR uses 1).
+ let BPFSrc = 0;
+ let BPFClass = 7; // BPF_ALU64
+}
+
+// Register/register ALU operation printed as "$dst <op> $src".
+// The pattern applies OpNode to $src2 and $src; $src2 itself is not encoded
+// in this class.
+// Encoding: op -> Inst{63-60}, BPFSrc -> Inst{59}, src -> Inst{55-52},
+// dst -> Inst{51-48}.
+class ALU_RR<bits<4> Opc, string OpcodeStr, SDNode OpNode>
+ : InstBPF<(outs GPR:$dst), (ins GPR:$src2, GPR:$src),
+ "$dst "#OpcodeStr#" $src",
+ [(set GPR:$dst, (OpNode i64:$src2, i64:$src))]> {
+ bits<4> op;
+ bits<1> BPFSrc;
+ bits<4> dst;
+ bits<4> src;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{55-52} = src;
+ let Inst{51-48} = dst;
+
+ let op = Opc;
+ // Source-operand selector bit: 1 for this register form (ALU_RI uses 0).
+ let BPFSrc = 1;
+ let BPFClass = 7; // BPF_ALU64
+}
+
+multiclass ALU<bits<4> Opc, string OpcodeStr, SDNode OpNode> {
+ def _rr : ALU_RR<Opc, OpcodeStr, OpNode>;
+ def _ri : ALU_RI<Opc, OpcodeStr, OpNode>;
+}
+
+let Constraints = "$dst = $src2" in {
+let isAsCheapAsAMove = 1 in {
+ defm ADD : ALU<0x0, "+=", add>;
+ defm SUB : ALU<0x1, "-=", sub>;
+ defm OR : ALU<0x4, "|=", or>;
+ defm AND : ALU<0x5, "&=", and>;
+ defm SLL : ALU<0x6, "<<=", shl>;
+ defm SRL : ALU<0x7, ">>=", srl>;
+ defm XOR : ALU<0xa, "^=", xor>;
+ defm SRA : ALU<0xc, "s>>=", sra>;
+}
+ defm MUL : ALU<0x2, "*=", mul>;
+ defm DIV : ALU<0x3, "/=", udiv>;
+}
+
+// Register-to-register move printed as "$dst <op> $src".
+// Carries no selection pattern (empty pattern list).
+// Encoding: op -> Inst{63-60}, BPFSrc -> Inst{59}, src -> Inst{55-52},
+// dst -> Inst{51-48}.
+class MOV_RR<string OpcodeStr>
+ : InstBPF<(outs GPR:$dst), (ins GPR:$src),
+ "$dst "#OpcodeStr#" $src",
+ []> {
+ bits<4> op;
+ bits<1> BPFSrc;
+ bits<4> dst;
+ bits<4> src;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{55-52} = src;
+ let Inst{51-48} = dst;
+
+ let op = 0xb; // BPF_MOV
+ let BPFSrc = 1; // BPF_X
+ let BPFClass = 7; // BPF_ALU64
+}
+
+// Immediate-to-register move printed as "$dst <op> $imm".
+// Selected for i64 constants accepted by i64immSExt32.
+// Encoding: op -> Inst{63-60}, BPFSrc -> Inst{59}, dst -> Inst{51-48},
+// immediate -> Inst{31-0}.
+class MOV_RI<string OpcodeStr>
+ : InstBPF<(outs GPR:$dst), (ins i64imm:$imm),
+ "$dst "#OpcodeStr#" $imm",
+ [(set GPR:$dst, (i64 i64immSExt32:$imm))]> {
+ bits<4> op;
+ bits<1> BPFSrc;
+ bits<4> dst;
+ bits<32> imm;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{51-48} = dst;
+ let Inst{31-0} = imm;
+
+ let op = 0xb; // BPF_MOV
+ let BPFSrc = 0; // BPF_K
+ let BPFClass = 7; // BPF_ALU64
+}
+
+// Load of a 64-bit immediate into $dst, printed as "$dst <op> ${imm}ll".
+// Selected for any i64 constant. The Pseudo parameter is placed in
+// Inst{55-52}.
+// Only imm{31-0} is placed in this encoding (Inst{31-0}); the upper half of
+// the immediate is not described by this class.
+// NOTE(review): presumably the upper 32 bits are emitted elsewhere - confirm.
+class LD_IMM64<bits<4> Pseudo, string OpcodeStr>
+ : InstBPF<(outs GPR:$dst), (ins u64imm:$imm),
+ "$dst "#OpcodeStr#" ${imm}ll",
+ [(set GPR:$dst, (i64 imm:$imm))]> {
+
+ bits<3> mode;
+ bits<2> size;
+ bits<4> dst;
+ bits<64> imm;
+
+ let Inst{63-61} = mode;
+ let Inst{60-59} = size;
+ let Inst{51-48} = dst;
+ let Inst{55-52} = Pseudo;
+ let Inst{47-32} = 0;
+ let Inst{31-0} = imm{31-0};
+
+ let mode = 0; // BPF_IMM
+ let size = 3; // BPF_DW
+ let BPFClass = 0; // BPF_LD
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def LD_imm64 : LD_IMM64<0, "=">;
+def MOV_rr : MOV_RR<"=">;
+def MOV_ri : MOV_RI<"=">;
+}
+
+// Frame-index address materialization, printed as "lea $dst, $addr".
+// Selected through the FIri complex pattern. All encoding fields are fixed
+// constants because the instruction is rewritten before emission (see the
+// comment in the body).
+def FI_ri
+ : InstBPF<(outs GPR:$dst), (ins MEMri:$addr),
+ "lea\t$dst, $addr",
+ [(set i64:$dst, FIri:$addr)]> {
+ // This is a tentative instruction, and will be replaced
+ // with MOV_rr and ADD_ri in PEI phase
+ let Inst{63-61} = 0;
+ let Inst{60-59} = 3;
+ let Inst{51-48} = 0;
+ let Inst{55-52} = 2;
+ let Inst{47-32} = 0;
+ let Inst{31-0} = 0;
+ let BPFClass = 0;
+}
+
+
+// Instruction selected for the int_bpf_pseudo intrinsic, printed as
+// "ld_pseudo $dst, $pseudo, $imm". It uses the same field layout as LD_IMM64,
+// except that Inst{55-52} comes from the $pseudo operand instead of a class
+// parameter. Only imm{31-0} is placed in this encoding.
+def LD_pseudo
+ : InstBPF<(outs GPR:$dst), (ins i64imm:$pseudo, u64imm:$imm),
+ "ld_pseudo\t$dst, $pseudo, $imm",
+ [(set GPR:$dst, (int_bpf_pseudo imm:$pseudo, imm:$imm))]> {
+
+ bits<3> mode;
+ bits<2> size;
+ bits<4> dst;
+ bits<64> imm;
+ bits<4> pseudo;
+
+ let Inst{63-61} = mode;
+ let Inst{60-59} = size;
+ let Inst{51-48} = dst;
+ let Inst{55-52} = pseudo;
+ let Inst{47-32} = 0;
+ let Inst{31-0} = imm{31-0};
+
+ let mode = 0; // BPF_IMM
+ let size = 3; // BPF_DW
+ let BPFClass = 0; // BPF_LD
+}
+
+// STORE instructions
+// Store of $src to memory, printed as "*(<type> *)($addr) = $src".
+// SizeOp selects the access width field. The 20-bit addr operand packs the
+// base register in addr{19-16} and the 16-bit offset in addr{15-0}.
+// Encoding: mode -> Inst{63-61}, size -> Inst{60-59}, src -> Inst{55-52},
+// base register -> Inst{51-48}, offset -> Inst{47-32}.
+class STORE<bits<2> SizeOp, string OpcodeStr, list<dag> Pattern>
+ : InstBPF<(outs), (ins GPR:$src, MEMri:$addr),
+ "*("#OpcodeStr#" *)($addr) = $src", Pattern> {
+ bits<3> mode;
+ bits<2> size;
+ bits<4> src;
+ bits<20> addr;
+
+ let Inst{63-61} = mode;
+ let Inst{60-59} = size;
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = src;
+ let Inst{47-32} = addr{15-0}; // offset
+
+ let mode = 3; // BPF_MEM
+ let size = SizeOp;
+ let BPFClass = 3; // BPF_STX
+}
+
+class STOREi64<bits<2> Opc, string OpcodeStr, PatFrag OpNode>
+ : STORE<Opc, OpcodeStr, [(OpNode i64:$src, ADDRri:$addr)]>;
+
+def STW : STOREi64<0x0, "u32", truncstorei32>;
+def STH : STOREi64<0x1, "u16", truncstorei16>;
+def STB : STOREi64<0x2, "u8", truncstorei8>;
+def STD : STOREi64<0x3, "u64", store>;
+
+// LOAD instructions
+// Load from memory into $dst, printed as "$dst = *(<type> *)($addr)".
+// SizeOp selects the access width field. Note the register fields are
+// swapped relative to STORE: here the base register (addr{19-16}) goes to
+// Inst{55-52} and the destination goes to Inst{51-48}.
+// The 16-bit offset (addr{15-0}) goes to Inst{47-32}.
+class LOAD<bits<2> SizeOp, string OpcodeStr, list<dag> Pattern>
+ : InstBPF<(outs GPR:$dst), (ins MEMri:$addr),
+ "$dst = *("#OpcodeStr#" *)($addr)", Pattern> {
+ bits<3> mode;
+ bits<2> size;
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{63-61} = mode;
+ let Inst{60-59} = size;
+ let Inst{51-48} = dst;
+ let Inst{55-52} = addr{19-16};
+ let Inst{47-32} = addr{15-0};
+
+ let mode = 3; // BPF_MEM
+ let size = SizeOp;
+ let BPFClass = 1; // BPF_LDX
+}
+
+class LOADi64<bits<2> SizeOp, string OpcodeStr, PatFrag OpNode>
+ : LOAD<SizeOp, OpcodeStr, [(set i64:$dst, (OpNode ADDRri:$addr))]>;
+
+def LDW : LOADi64<0x0, "u32", zextloadi32>;
+def LDH : LOADi64<0x1, "u16", zextloadi16>;
+def LDB : LOADi64<0x2, "u8", zextloadi8>;
+def LDD : LOADi64<0x3, "u64", load>;
+
+// Branch taking only a target operand, printed as "<OpcodeStr> $BrDst".
+// Encoding: op -> Inst{63-60}, BPFSrc (fixed to 0) -> Inst{59},
+// 16-bit branch target -> Inst{47-32}.
+class BRANCH<bits<4> Opc, string OpcodeStr, list<dag> Pattern>
+ : InstBPF<(outs), (ins brtarget:$BrDst),
+ !strconcat(OpcodeStr, " $BrDst"), Pattern> {
+ bits<4> op;
+ bits<16> BrDst;
+ bits<1> BPFSrc;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{47-32} = BrDst;
+
+ let op = Opc;
+ let BPFSrc = 0;
+ let BPFClass = 5; // BPF_JMP
+}
+
+// Call instruction, printed as "<OpcodeStr> $BrDst". It has no selection
+// pattern of its own (empty pattern list).
+// Unlike BRANCH, the target is a 32-bit field placed in Inst{31-0}.
+// Encoding: op -> Inst{63-60}, BPFSrc (fixed to 0) -> Inst{59}.
+class CALL<string OpcodeStr>
+ : InstBPF<(outs), (ins calltarget:$BrDst),
+ !strconcat(OpcodeStr, " $BrDst"), []> {
+ bits<4> op;
+ bits<32> BrDst;
+ bits<1> BPFSrc;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{31-0} = BrDst;
+
+ let op = 8; // BPF_CALL
+ let BPFSrc = 0;
+ let BPFClass = 5; // BPF_JMP
+}
+
+// Jump always
+let isBranch = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1 in {
+ def JMP : BRANCH<0x0, "goto", [(br bb:$BrDst)]>;
+}
+
+// Jump and link
+let isCall=1, hasDelaySlot=0, Uses = [R11],
+ // Potentially clobbered registers
+ Defs = [R0, R1, R2, R3, R4, R5] in {
+ def JAL : CALL<"call">;
+}
+
+// No-op, encoded as a register move of R0 onto itself (see comment below).
+// The $imm operand appears in the assembly string but is not placed in any
+// encoding field by this class.
+class NOP_I<string OpcodeStr>
+ : InstBPF<(outs), (ins i32imm:$imm),
+ !strconcat(OpcodeStr, "\t$imm"), []> {
+ // mov r0, r0 == nop
+ bits<4> op;
+ bits<1> BPFSrc;
+ bits<4> dst;
+ bits<4> src;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{55-52} = src;
+ let Inst{51-48} = dst;
+
+ let op = 0xb; // BPF_MOV
+ let BPFSrc = 1; // BPF_X
+ let BPFClass = 7; // BPF_ALU64
+ let src = 0; // R0
+ let dst = 0; // R0
+}
+
+let hasSideEffects = 0 in
+ def NOP : NOP_I<"nop">;
+
+// Return instruction selected for BPFretflag; takes no explicit operands and
+// is printed as just OpcodeStr.
+// Encoding: op -> Inst{63-60}; Inst{59} and Inst{31-0} are fixed to zero.
+class RET<string OpcodeStr>
+ : InstBPF<(outs), (ins),
+ !strconcat(OpcodeStr, ""), [(BPFretflag)]> {
+ bits<4> op;
+
+ let Inst{63-60} = op;
+ let Inst{59} = 0;
+ let Inst{31-0} = 0;
+
+ let op = 9; // BPF_EXIT
+ let BPFClass = 5; // BPF_JMP
+}
+
+let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
+ isNotDuplicable = 1 in {
+ def RET : RET<"exit">;
+}
+
+// ADJCALLSTACKDOWN/UP pseudo insns
+let Defs = [R11], Uses = [R11] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
+ "#ADJCALLSTACKDOWN $amt",
+ [(BPFcallseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ "#ADJCALLSTACKUP $amt1 $amt2",
+ [(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+ def Select : Pseudo<(outs GPR:$dst),
+ (ins GPR:$lhs, GPR:$rhs, i64imm:$imm, GPR:$src, GPR:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i64:$dst,
+ (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i64:$src, i64:$src2))]>;
+}
+
+// load 64-bit global addr into register
+def : Pat<(BPFWrapper tglobaladdr:$in), (LD_imm64 tglobaladdr:$in)>;
+
+// 0xffffFFFF doesn't fit into simm32, optimize common case
+def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
+ (SRL_ri (SLL_ri (i64 GPR:$src), 32), 32)>;
+
+// Calls
+def : Pat<(BPFcall tglobaladdr:$dst), (JAL tglobaladdr:$dst)>;
+def : Pat<(BPFcall imm:$dst), (JAL imm:$dst)>;
+
+// Loads
+def : Pat<(extloadi8 ADDRri:$src), (i64 (LDB ADDRri:$src))>;
+def : Pat<(extloadi16 ADDRri:$src), (i64 (LDH ADDRri:$src))>;
+def : Pat<(extloadi32 ADDRri:$src), (i64 (LDW ADDRri:$src))>;
+
+// Atomics
+// Atomic add to memory, printed as "lock *(<type> *)($addr) += $val".
+// Selected for the atomic-load-add fragment passed as OpNode.
+// Encoding follows the STORE layout: base register (addr{19-16}) in
+// Inst{51-48}, offset (addr{15-0}) in Inst{47-32}, and the dst field in
+// Inst{55-52}.
+class XADD<bits<2> SizeOp, string OpcodeStr, PatFrag OpNode>
+ : InstBPF<(outs GPR:$dst), (ins MEMri:$addr, GPR:$val),
+ "lock *("#OpcodeStr#" *)($addr) += $val",
+ [(set GPR:$dst, (OpNode ADDRri:$addr, GPR:$val))]> {
+ bits<3> mode;
+ bits<2> size;
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{63-61} = mode;
+ let Inst{60-59} = size;
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = dst;
+ let Inst{47-32} = addr{15-0}; // offset
+
+ let mode = 6; // BPF_XADD
+ let size = SizeOp;
+ let BPFClass = 3; // BPF_STX
+}
+
+let Constraints = "$dst = $val" in {
+def XADD32 : XADD<0, "u32", atomic_load_add_32>;
+def XADD64 : XADD<3, "u64", atomic_load_add_64>;
+// undefined def XADD16 : XADD<1, "xadd16", atomic_load_add_16>;
+// undefined def XADD8 : XADD<2, "xadd8", atomic_load_add_8>;
+}
+
+// bswap16, bswap32, bswap64
+// Byte-swap instruction printed as "<OpcodeStr> $dst".
+// SizeOp is stored in the 32-bit immediate field (Inst{31-0}); the defs that
+// instantiate this class pass 16, 32 or 64.
+// Encoding: op -> Inst{63-60}, BPFSrc -> Inst{59}, dst -> Inst{51-48}.
+class BSWAP<bits<32> SizeOp, string OpcodeStr, list<dag> Pattern>
+ : InstBPF<(outs GPR:$dst), (ins GPR:$src),
+ !strconcat(OpcodeStr, "\t$dst"),
+ Pattern> {
+ bits<4> op;
+ bits<1> BPFSrc;
+ bits<4> dst;
+ bits<32> imm;
+
+ let Inst{63-60} = op;
+ let Inst{59} = BPFSrc;
+ let Inst{51-48} = dst;
+ let Inst{31-0} = imm;
+
+ let op = 0xd; // BPF_END
+ let BPFSrc = 1; // BPF_TO_BE (TODO: use BPF_TO_LE for big-endian target)
+ let BPFClass = 4; // BPF_ALU
+ let imm = SizeOp;
+}
+
+let Constraints = "$dst = $src" in {
+def BSWAP16 : BSWAP<16, "bswap16", [(set GPR:$dst, (srl (bswap GPR:$src), (i64 48)))]>;
+def BSWAP32 : BSWAP<32, "bswap32", [(set GPR:$dst, (srl (bswap GPR:$src), (i64 32)))]>;
+def BSWAP64 : BSWAP<64, "bswap64", [(set GPR:$dst, (bswap GPR:$src))]>;
+}
+
+// Packet-style loads whose result is written to R0 (see the asm strings and
+// the "set R0" patterns). Both classes are declared as defining R0-R5, using
+// R6, having side effects and possibly loading.
+let Defs = [R0, R1, R2, R3, R4, R5], Uses = [R6], hasSideEffects = 1,
+ hasExtraDefRegAllocReq = 1, hasExtraSrcRegAllocReq = 1, mayLoad = 1 in {
+// Absolute form: the index is an immediate (accepted by i64immSExt32) placed
+// in Inst{31-0}. The $skb operand is not encoded by this class.
+class LOAD_ABS<bits<2> SizeOp, string OpcodeStr, Intrinsic OpNode>
+ : InstBPF<(outs), (ins GPR:$skb, i64imm:$imm),
+ "r0 = *("#OpcodeStr#" *)skb[$imm]",
+ [(set R0, (OpNode GPR:$skb, i64immSExt32:$imm))]> {
+ bits<3> mode;
+ bits<2> size;
+ bits<32> imm;
+
+ let Inst{63-61} = mode;
+ let Inst{60-59} = size;
+ let Inst{31-0} = imm;
+
+ let mode = 1; // BPF_ABS
+ let size = SizeOp;
+ let BPFClass = 0; // BPF_LD
+}
+
+// Indirect form: the index comes from register $val, placed in Inst{55-52}.
+// The $skb operand is not encoded by this class.
+class LOAD_IND<bits<2> SizeOp, string OpcodeStr, Intrinsic OpNode>
+ : InstBPF<(outs), (ins GPR:$skb, GPR:$val),
+ "r0 = *("#OpcodeStr#" *)skb[$val]",
+ [(set R0, (OpNode GPR:$skb, GPR:$val))]> {
+ bits<3> mode;
+ bits<2> size;
+ bits<4> val;
+
+ let Inst{63-61} = mode;
+ let Inst{60-59} = size;
+ let Inst{55-52} = val;
+
+ let mode = 2; // BPF_IND
+ let size = SizeOp;
+ let BPFClass = 0; // BPF_LD
+}
+}
+
+def LD_ABS_B : LOAD_ABS<2, "u8", int_bpf_load_byte>;
+def LD_ABS_H : LOAD_ABS<1, "u16", int_bpf_load_half>;
+def LD_ABS_W : LOAD_ABS<0, "u32", int_bpf_load_word>;
+
+def LD_IND_B : LOAD_IND<2, "u8", int_bpf_load_byte>;
+def LD_IND_H : LOAD_IND<1, "u16", int_bpf_load_half>;
+def LD_IND_W : LOAD_IND<0, "u32", int_bpf_load_word>;
diff --git a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp
new file mode 100644
index 000000000000..f64defecf3cc
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp
@@ -0,0 +1,76 @@
+//=-- BPFMCInstLower.cpp - Convert BPF MachineInstr to an MCInst ------------=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower BPF MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFMCInstLower.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Returns the MCSymbol that the AsmPrinter associates with the global value
+// referenced by MO.
+MCSymbol *
+BPFMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  const auto *Global = MO.getGlobal();
+  return Printer.getSymbol(Global);
+}
+
+// Wraps Sym in a symbol-reference expression operand. A non-jump-table
+// operand that carries a non-zero offset is not supported and is treated as
+// unreachable.
+MCOperand BPFMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                             MCSymbol *Sym) const {
+  // Short-circuit keeps getOffset() from being queried on jump-table operands.
+  const bool HasUnsupportedOffset = !MO.isJTI() && MO.getOffset();
+  if (HasUnsupportedOffset)
+    llvm_unreachable("unknown symbol op");
+
+  return MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx));
+}
+
+// Translates MI into OutMI: copies the opcode unchanged, then converts each
+// machine operand into the matching MCOperand. Implicit register operands and
+// register masks are dropped; any operand kind not listed in the switch is
+// treated as unreachable after dumping MI.
+void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ MCOperand MCOp;
+ switch (MO.getType()) {
+ default:
+ // Dump the offending instruction to aid debugging before aborting.
+ MI->dump();
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ continue;
+ MCOp = MCOperand::createReg(MO.getReg());
+ break;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::createImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ // Basic-block operands become references to the block's symbol.
+ MCOp = MCOperand::createExpr(
+ MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+ break;
+ case MachineOperand::MO_RegisterMask:
+ // Register masks have no MCInst representation here; skip them.
+ continue;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+ break;
+ }
+
+ OutMI.addOperand(MCOp);
+ }
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h
new file mode 100644
index 000000000000..054e89407db2
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h
@@ -0,0 +1,43 @@
+//===-- BPFMCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFMCINSTLOWER_H
+#define LLVM_LIB_TARGET_BPF_BPFMCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+// BPFMCInstLower - This class is used to lower a MachineInstr into an MCInst.
+// It holds references to the MCContext and AsmPrinter supplied at
+// construction and does not own either of them.
+class LLVM_LIBRARY_VISIBILITY BPFMCInstLower {
+ // Context passed to MCSymbolRefExpr::create when building expressions.
+ MCContext &Ctx;
+
+ // Printer used to look up symbols for global values.
+ AsmPrinter &Printer;
+
+public:
+ BPFMCInstLower(MCContext &ctx, AsmPrinter &printer)
+ : Ctx(ctx), Printer(printer) {}
+ // Fills OutMI with the opcode and lowered operands of MI.
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ // Builds an expression operand that references Sym.
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+ // Returns the symbol for the global value referenced by MO.
+ MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
new file mode 100644
index 000000000000..71846e3e92c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -0,0 +1,103 @@
+//===-- BPFRegisterInfo.cpp - BPF Register Information ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFRegisterInfo.h"
+#include "BPFSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "BPFGenRegisterInfo.inc"
+using namespace llvm;
+
+// Constructs the register info, passing BPF::R0 to the generated base class.
+// NOTE(review): presumably this argument is the return-address register
+// expected by the generated constructor - confirm against the .inc file.
+BPFRegisterInfo::BPFRegisterInfo()
+ : BPFGenRegisterInfo(BPF::R0) {}
+
+// Returns the callee-saved register list CSR_SaveList; the MF argument is not
+// consulted, so the same list is returned for every function.
+const MCPhysReg *
+BPFRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ return CSR_SaveList;
+}
+
+// Builds the set of registers that must not be allocated: the frame pointer
+// (R10) and the pseudo stack pointer (R11). MF is not consulted.
+BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector ReservedSet(getNumRegs());
+
+  // R10 is the read-only frame pointer.
+  ReservedSet.set(BPF::R10);
+  // R11 is the pseudo stack pointer.
+  ReservedSet.set(BPF::R11);
+
+  return ReservedSet;
+}
+
+// Replaces the abstract frame-index operand of the instruction at II with the
+// frame register plus a concrete offset.
+//
+// Three shapes are handled:
+//  * MOV_rr: the frame index becomes the frame register and an ADD_ri of the
+//    object offset is inserted after the move.
+//  * FI_ri: the placeholder is replaced by MOV_rr + ADD_ri and then erased.
+//  * anything else: the frame-index operand becomes the frame register and the
+//    following immediate operand becomes the combined offset.
+//
+// SPAdj must be 0. FIOperandNum and RS are not used; the frame-index operand
+// is located by scanning the operand list.
+void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, unsigned FIOperandNum,
+                                          RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Find the operand that holds the frame index.
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  unsigned FrameReg = getFrameRegister(MF);
+  int FrameIndex = MI.getOperand(i).getIndex();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  if (MI.getOpcode() == BPF::MOV_rr) {
+    int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+
+    MI.getOperand(i).ChangeToRegister(FrameReg, false);
+    unsigned reg = MI.getOperand(i - 1).getReg();
+    // Insert the offset adjustment right after the (now register) move.
+    BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg)
+        .addReg(reg)
+        .addImm(Offset);
+    return;
+  }
+
+  // Keep the sum in 64 bits so the range check below is meaningful; storing
+  // it in an int first would truncate it and make isInt<32> always succeed.
+  const int64_t Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+                         MI.getOperand(i + 1).getImm();
+
+  if (!isInt<32>(Offset))
+    llvm_unreachable("bug in frame offset");
+
+  if (MI.getOpcode() == BPF::FI_ri) {
+    // architecture does not really support FI_ri, replace it with
+    //    MOV_rr <target_reg>, frame_reg
+    //    ADD_ri <target_reg>, imm
+    unsigned reg = MI.getOperand(i - 1).getReg();
+
+    BuildMI(MBB, ++II, DL, TII.get(BPF::MOV_rr), reg)
+        .addReg(FrameReg);
+    BuildMI(MBB, II, DL, TII.get(BPF::ADD_ri), reg)
+        .addReg(reg)
+        .addImm(Offset);
+
+    // Remove FI_ri instruction
+    MI.eraseFromParent();
+  } else {
+    MI.getOperand(i).ChangeToRegister(FrameReg, false);
+    MI.getOperand(i + 1).ChangeToImmediate(Offset);
+  }
+}
+
+// Returns the frame register, which is always BPF::R10 regardless of MF.
+unsigned BPFRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return BPF::R10;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
new file mode 100644
index 000000000000..7072dd0bde1a
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
@@ -0,0 +1,40 @@
+//===-- BPFRegisterInfo.h - BPF Register Information Impl -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFREGISTERINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "BPFGenRegisterInfo.inc"
+
+namespace llvm {
+
+// BPF implementation of the TargetRegisterInfo interface, built on the
+// TableGen-generated BPFGenRegisterInfo.
+struct BPFRegisterInfo : public BPFGenRegisterInfo {
+
+ BPFRegisterInfo();
+
+ // Returns the list of callee-saved registers.
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+
+ // Returns the set of registers excluded from allocation.
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ // Rewrites a frame-index operand of MI into frame register + offset form.
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ // Returns the register used as the frame pointer.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.td b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.td
new file mode 100644
index 000000000000..c8e24f810310
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.td
@@ -0,0 +1,41 @@
+//===-- BPFRegisterInfo.td - BPF Register defs -------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the BPF register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 4-bit ID numbers.
+// Ri - 64-bit integer registers
+// Base class for BPF registers: places the register in the "BPF" namespace
+// and records Enc as its hardware encoding. n is the assembly name.
+class Ri<bits<16> Enc, string n> : Register<n> {
+ let Namespace = "BPF";
+ let HWEncoding = Enc;
+}
+
+// Integer registers
+def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
+def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>;
+def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
+def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>;
+def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>;
+def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
+def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>;
+def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>;
+def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>;
+def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
+def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+
+// Register classes.
+// The single general-purpose register class: i64 values, 64-bit alignment.
+// The order of the list below is the order given to the register class;
+// R0, R11 and R10 are listed last.
+def GPR : RegisterClass<"BPF", [i64], 64, (add R1, R2, R3, R4, R5,
+ R6, R7, R8, R9, // callee saved
+ R0, // return value
+ R11, // stack ptr
+ R10 // frame ptr
+ )>;
diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
new file mode 100644
index 000000000000..c3a8b1caa63d
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -0,0 +1,31 @@
+//===-- BPFSubtarget.cpp - BPF Subtarget Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPF specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFSubtarget.h"
+#include "BPF.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "BPFGenSubtargetInfo.inc"
+
+// Out-of-line definition of the virtual anchor() method; intentionally empty.
+void BPFSubtarget::anchor() {}
+
+// Forwards TT, CPU and FS to the generated base class, default-constructs the
+// instruction info, and builds the frame lowering and target lowering objects
+// from this subtarget (the latter also takes TM).
+BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM)
+ : BPFGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
+ TLInfo(TM, *this) {}
diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.h b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
new file mode 100644
index 000000000000..27cc9a262fc3
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
@@ -0,0 +1,64 @@
+//===-- BPFSubtarget.h - Define Subtarget for the BPF -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the BPF specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSUBTARGET_H
+#define LLVM_LIB_TARGET_BPF_BPFSUBTARGET_H
+
+#include "BPFFrameLowering.h"
+#include "BPFISelLowering.h"
+#include "BPFInstrInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "BPFGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+// BPF-specific subtarget. Owns the instruction info, frame lowering, target
+// lowering and SelectionDAG target info objects and hands out pointers to
+// them through the accessor overrides below.
+class BPFSubtarget : public BPFGenSubtargetInfo {
+ virtual void anchor();
+ BPFInstrInfo InstrInfo;
+ BPFFrameLowering FrameLowering;
+ BPFTargetLowering TLInfo;
+ SelectionDAGTargetInfo TSInfo;
+
+public:
+ // This constructor initializes the data members to match that
+ // of the specified triple.
+ BPFSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+ const TargetMachine &TM);
+
+ // ParseSubtargetFeatures - Parses features string setting specified
+ // subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ // Accessors returning pointers to the member objects owned by this class.
+ const BPFInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const BPFFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const BPFTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ // The register info lives inside InstrInfo rather than in this class.
+ const TargetRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
new file mode 100644
index 000000000000..897695633e46
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -0,0 +1,82 @@
+//===-- BPFTargetMachine.cpp - Define TargetMachine for BPF ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about BPF target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+// C-linkage entry point that registers BPFTargetMachine for the three BPF
+// targets: little-endian, big-endian and the generic BPF target.
+extern "C" void LLVMInitializeBPFTarget() {
+ // Register the target.
+ RegisterTargetMachine<BPFTargetMachine> X(getTheBPFleTarget());
+ RegisterTargetMachine<BPFTargetMachine> Y(getTheBPFbeTarget());
+ RegisterTargetMachine<BPFTargetMachine> Z(getTheBPFTarget());
+}
+
+// DataLayout: little or big endian
+static std::string computeDataLayout(const Triple &TT) {
+  // The two layouts differ only in the leading endianness letter:
+  // 'E' for the big-endian triple (bpfeb), 'e' for everything else.
+  const bool IsBigEndian = TT.getArch() == Triple::bpfeb;
+  return IsBigEndian ? "E-m:e-p:64:64-i64:64-n32:64-S128"
+                     : "e-m:e-p:64:64-i64:64-n32:64-S128";
+}
+
+// Returns the requested relocation model, or PIC when none was specified.
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (RM.hasValue())
+    return *RM;
+  return Reloc::PIC_;
+}
+
+// Constructs the target machine: derives the data layout from the triple,
+// substitutes the default relocation model when RM is unset, creates an ELF
+// object-file lowering, builds the subtarget, and finally initializes the
+// asm info.
+BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+ getEffectiveRelocModel(RM), CM, OL),
+ TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ Subtarget(TT, CPU, FS, *this) {
+ initAsmInfo();
+}
+namespace {
+// BPF Code Generator Pass Configuration Options.
+// File-local pass configuration; the only hook it overrides is
+// addInstSelector().
+class BPFPassConfig : public TargetPassConfig {
+public:
+ BPFPassConfig(BPFTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ // Returns the target machine downcast to BPFTargetMachine.
+ BPFTargetMachine &getBPFTargetMachine() const {
+ return getTM<BPFTargetMachine>();
+ }
+
+ bool addInstSelector() override;
+};
+}
+
+// Allocates a new BPFPassConfig bound to this target machine and PM and
+// returns it as a raw pointer.
+TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new BPFPassConfig(this, PM);
+}
+
+// Install an instruction selector pass using
+// the ISelDag to gen BPF code.
+bool BPFPassConfig::addInstSelector() {
+ // Add the pass produced by createBPFISelDag for this target machine.
+ addPass(createBPFISelDag(getBPFTargetMachine()));
+
+ // Always returns false.
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h
new file mode 100644
index 000000000000..644481446883
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h
@@ -0,0 +1,44 @@
+//===-- BPFTargetMachine.h - Define TargetMachine for BPF --- C++ ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the BPF specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFTARGETMACHINE_H
+#define LLVM_LIB_TARGET_BPF_BPFTARGETMACHINE_H
+
+#include "BPFSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+// Target machine for BPF. Holds the object-file lowering (TLOF) and a single
+// BPFSubtarget that is handed out for every query.
+class BPFTargetMachine : public LLVMTargetMachine {
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ BPFSubtarget Subtarget;
+
+public:
+ BPFTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+
+ // Both overloads return the same subtarget; the Function argument is
+ // ignored.
+ const BPFSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ const BPFSubtarget *getSubtargetImpl(const Function &) const override {
+ return &Subtarget;
+ }
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ // Non-owning pointer to the lowering object held by TLOF.
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
new file mode 100644
index 000000000000..b0037fbc16ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -0,0 +1,154 @@
+//===- BPFDisassembler.cpp - Disassembler for BPF ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the BPF Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFRegisterInfo.h"
+#include "BPFSubtarget.h"
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-disassembler"
+
+// Shorthand for the decode result type used throughout this file.
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// A disassembler class for BPF.
+///
+/// File-local MCDisassembler subclass; getInstruction() is defined out of
+/// line further down in this file.
+class BPFDisassembler : public MCDisassembler {
+public:
+ BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+ : MCDisassembler(STI, Ctx) {}
+ virtual ~BPFDisassembler() {}
+
+ /// Decode one instruction from Bytes into Instr and report the number of
+ /// bytes consumed through Size.
+ DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream,
+ raw_ostream &CStream) const override;
+};
+}
+
+// Factory registered with the TargetRegistry: allocates a BPFDisassembler for
+// the given subtarget info and context. The Target argument is not used.
+static MCDisassembler *createBPFDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  MCDisassembler *Disassembler = new BPFDisassembler(STI, Ctx);
+  return Disassembler;
+}
+
+
+// Registers createBPFDisassembler for the plain, little-endian and big-endian
+// BPF targets, in that order.
+extern "C" void LLVMInitializeBPFDisassembler() {
+  Target *const Targets[] = {&getTheBPFTarget(), &getTheBPFleTarget(),
+                             &getTheBPFbeTarget()};
+  for (Target *T : Targets)
+    TargetRegistry::RegisterMCDisassembler(*T, createBPFDisassembler);
+}
+
+// Maps an encoded register number (0-11) to the matching BPF register enum.
+static const unsigned GPRDecoderTable[] = {
+    BPF::R0, BPF::R1, BPF::R2, BPF::R3, BPF::R4,  BPF::R5,
+    BPF::R6, BPF::R7, BPF::R8, BPF::R9, BPF::R10, BPF::R11};
+
+// Append the register operand for RegNo to Inst. Register numbers above 11
+// have no table entry and are rejected with Fail.
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t /*Address*/,
+                                           const void * /*Decoder*/) {
+  if (RegNo <= 11) {
+    Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo]));
+    return MCDisassembler::Success;
+  }
+  return MCDisassembler::Fail;
+}
+
+// Decode a memory operand packed into Insn: bits 19..16 hold the base
+// register number and bits 15..0 hold a 16-bit offset that is sign-extended.
+//
+// On success a register operand followed by an immediate operand is appended
+// to Inst. The 4-bit register field can hold 0-15 but GPRDecoderTable only has
+// entries for 0-11, so larger values are rejected with Fail (and Inst is left
+// untouched) instead of indexing past the end of the table.
+// Address and Decoder are not used.
+static DecodeStatus decodeMemoryOpValue(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  const unsigned Register = (Insn >> 16) & 0xf;
+  if (Register > 11)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+
+  const unsigned Offset = (Insn & 0xffff);
+  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+  return MCDisassembler::Success;
+}
+
+#include "BPFGenDisassemblerTables.inc"
+
+// Assemble one 8-byte instruction word from the front of Bytes.
+//
+// On success Size is set to 8 and Insn receives Make_64(Hi, Lo). When fewer
+// than 8 bytes are available Size is set to 0 and Fail is returned; Insn is
+// not written in that case. Address is not used.
+static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address,
+ uint64_t &Size, uint64_t &Insn) {
+ uint64_t Lo, Hi;
+
+ if (Bytes.size() < 8) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ Size = 8;
+ // Hi: Bytes[0] -> bits 31..24, Bytes[1] -> bits 23..16, and Bytes[2]/Bytes[3]
+ // form the low 16 bits with Bytes[2] as the least significant byte.
+ Hi = (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 0) | (Bytes[3] << 8);
+ // Lo: Bytes[4..7] with Bytes[4] as the least significant byte.
+ Lo = (Bytes[4] << 0) | (Bytes[5] << 8) | (Bytes[6] << 16) | (Bytes[7] << 24);
+ Insn = Make_64(Hi, Lo);
+
+ return MCDisassembler::Success;
+}
+
+// Decode one instruction from Bytes into Instr.
+//
+// The first 8 bytes are read with readInstruction64() and decoded through
+// DecoderTableBPF64; a failure of either step is returned as Fail. Two groups
+// of opcodes are then post-processed (see the switch below). Size is 8 for
+// ordinary instructions, 16 for LD_imm64, and 0 when the input is too short.
+// VStream and CStream are not used.
+DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &VStream,
+ raw_ostream &CStream) const {
+ uint64_t Insn;
+ DecodeStatus Result;
+
+ Result = readInstruction64(Bytes, Address, Size, Insn);
+ if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
+
+ Result = decodeInstruction(DecoderTableBPF64, Instr, Insn,
+ Address, this, STI);
+ if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
+
+ switch (Instr.getOpcode()) {
+ case BPF::LD_imm64: {
+ // LD_imm64 occupies 16 bytes. Bytes 12..15 (least significant byte first)
+ // are passed as the first argument of Make_64 and the immediate already
+ // decoded into operand 1 as the second; the result replaces operand 1.
+ if (Bytes.size() < 16) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+ Size = 16;
+ uint64_t Hi = (Bytes[12] << 0) | (Bytes[13] << 8) | (Bytes[14] << 16) | (Bytes[15] << 24);
+ auto& Op = Instr.getOperand(1);
+ Op.setImm(Make_64(Hi, Op.getImm()));
+ break;
+ }
+ case BPF::LD_ABS_B:
+ case BPF::LD_ABS_H:
+ case BPF::LD_ABS_W:
+ case BPF::LD_IND_B:
+ case BPF::LD_IND_H:
+ case BPF::LD_IND_W: {
+ // Rebuild the operand list as: R6, followed by the operand that was
+ // decoded first. Op is taken by value because clear() drops the operands.
+ auto Op = Instr.getOperand(0);
+ Instr.clear();
+ Instr.addOperand(MCOperand::createReg(BPF::R6));
+ Instr.addOperand(Op);
+ break;
+ }
+ }
+
+ return Result;
+}
+
+// Function-pointer type with the same signature as decodeMemoryOpValue().
+// No use of this alias is visible in this file.
+typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder);
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
new file mode 100644
index 000000000000..ffd29f3ea991
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
@@ -0,0 +1,94 @@
+//===-- BPFInstPrinter.cpp - Convert BPF MCInst to asm syntax -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a BPF MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#include "BPFGenAsmWriter.inc"
+
+// Print one instruction: the generated printInstruction() emits the assembly
+// text, then the annotation string is appended through printAnnotation().
+void BPFInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot, const MCSubtargetInfo &STI) {
+  (void)STI; // Subtarget info is not consulted here.
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+// Stream an expression operand to O. Builds with assertions enabled also
+// check that the expression (or the left-hand side of a binary expression)
+// is a symbol reference whose variant kind is VK_None.
+static void printExpr(const MCExpr *Expr, raw_ostream &O) {
+#ifndef NDEBUG
+  const MCExpr *Candidate = Expr;
+  if (const MCBinaryExpr *Binary = dyn_cast<MCBinaryExpr>(Expr))
+    Candidate = Binary->getLHS();
+
+  const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Candidate);
+  assert(SRE && "Unexpected MCExpr type.");
+
+  MCSymbolRefExpr::VariantKind Kind = SRE->getKind();
+  assert(Kind == MCSymbolRefExpr::VK_None);
+#endif
+  O << *Expr;
+}
+
+// Print operand OpNo of MI: registers by name, immediates as signed 32-bit
+// decimal values, anything else as an expression. Modifier must be null or
+// empty.
+void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O, const char *Modifier) {
+  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+    return;
+  }
+
+  if (Op.isImm()) {
+    // The immediate is narrowed to int32_t before printing.
+    O << (int32_t)Op.getImm();
+    return;
+  }
+
+  assert(Op.isExpr() && "Expected an expression");
+  printExpr(Op.getExpr(), O);
+}
+
+// Print a memory operand as "<reg> + <offset>" or "<reg> - <offset>". The
+// base register is operand OpNo and the offset is operand OpNo + 1. Modifier
+// is not used.
+void BPFInstPrinter::printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                                     const char *Modifier) {
+  const MCOperand &RegOp = MI->getOperand(OpNo);
+  const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+
+  // Base register.
+  assert(RegOp.isReg() && "Register operand not a register");
+  O << getRegisterName(RegOp.getReg());
+
+  // Offset: only immediates are expected here.
+  if (!OffsetOp.isImm()) {
+    assert(0 && "Expected an immediate");
+    return;
+  }
+
+  // The sign is printed as a separate token and the magnitude follows it.
+  auto Imm = OffsetOp.getImm();
+  const bool IsNegative = Imm < 0;
+  O << (IsNegative ? " - " : " + ") << formatDec(IsNegative ? -Imm : Imm);
+}
+
+// Print operand OpNo of MI: an immediate is printed as an unsigned 64-bit
+// value; any other operand kind is streamed as-is.
+void BPFInstPrinter::printImm64Operand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (!Op.isImm()) {
+    O << Op;
+    return;
+  }
+  O << (uint64_t)Op.getImm();
+}
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
new file mode 100644
index 000000000000..4276d0858c2e
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
@@ -0,0 +1,40 @@
+//===-- BPFInstPrinter.h - Convert BPF MCInst to asm syntax -------*- C++ -*--//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a BPF MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H
+#define LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+// MCInstPrinter implementation for BPF. The operand printers below are
+// hand-written; printInstruction() and getRegisterName() are generated.
+class BPFInstPrinter : public MCInstPrinter {
+public:
+ BPFInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ // Prints the instruction followed by its annotation.
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ // Prints a register, immediate or expression operand.
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ const char *Modifier = nullptr);
+ // Prints a register operand followed by the signed offset operand after it.
+ void printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+ const char *Modifier = nullptr);
+ // Prints an immediate operand as an unsigned 64-bit value.
+ void printImm64Operand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
new file mode 100644
index 000000000000..a6cd2002c12c
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -0,0 +1,109 @@
+//===-- BPFAsmBackend.cpp - BPF Assembler Backend -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+// Assembler backend for BPF. One instance serves a single byte order, chosen
+// at construction time through IsLittleEndian.
+class BPFAsmBackend : public MCAsmBackend {
+public:
+ // Byte order used when patching fixups and creating the object writer.
+ bool IsLittleEndian;
+
+ BPFAsmBackend(bool IsLittleEndian)
+ : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
+ ~BPFAsmBackend() override {}
+
+ // Writes a resolved fixup value into the fragment data.
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ // No instruction requires relaxation
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ return false;
+ }
+
+ unsigned getNumFixupKinds() const override { return 1; }
+
+ // Relaxation is never needed, so relaxInstruction() is a no-op.
+ bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
+
+ // Emits Count bytes of padding; fails unless Count is a multiple of 8.
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+// Emit Count bytes of padding as a sequence of 64-bit words with the value
+// 0x15000000. Returns false without writing anything when Count is not a
+// multiple of 8.
+bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  if (Count % 8 != 0)
+    return false;
+
+  const uint64_t NumWords = Count / 8;
+  for (uint64_t Word = 0; Word != NumWords; ++Word)
+    OW->write64(0x15000000);
+
+  return true;
+}
+
+// Patch a resolved fixup value into Data at Fixup.getOffset().
+//
+//  - FK_SecRel_4 / FK_SecRel_8: nothing is written; Value is only asserted
+//    to be zero.
+//  - FK_Data_4 / FK_Data_8: the low 4 or 8 bytes of Value are stored in the
+//    backend's byte order.
+//  - anything else is asserted to be FK_PCRel_2: Value is converted to
+//    (Value - 8) / 8, truncated to 16 bits and stored at byte offsets 2 and 3
+//    from the fixup offset, again in the backend's byte order.
+//
+// DataSize and IsPCRel are not used.
+void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                               unsigned DataSize, uint64_t Value,
+                               bool IsPCRel) const {
+
+  if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
+    assert(Value == 0);
+  } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
+    unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8;
+
+    for (unsigned i = 0; i != Size; ++i) {
+      // Byte i of Value, counted from the least significant end, goes to
+      // slot i for little endian and to slot Size - 1 - i for big endian.
+      // Using Size - i here would skip slot 0 and write one byte past the
+      // end of the field.
+      unsigned Idx = IsLittleEndian ? i : Size - 1 - i;
+      Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8));
+    }
+  } else {
+    assert(Fixup.getKind() == FK_PCRel_2);
+    Value = (uint16_t)((Value - 8) / 8);
+    if (IsLittleEndian) {
+      Data[Fixup.getOffset() + 2] = Value & 0xFF;
+      Data[Fixup.getOffset() + 3] = Value >> 8;
+    } else {
+      Data[Fixup.getOffset() + 2] = Value >> 8;
+      Data[Fixup.getOffset() + 3] = Value & 0xFF;
+    }
+  }
+}
+
+// Create the BPF ELF object writer for OS, using OS ABI value 0 and this
+// backend's byte order.
+MCObjectWriter *BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+  const uint8_t OSABI = 0;
+  return createBPFELFObjectWriter(OS, OSABI, IsLittleEndian);
+}
+}
+
+// Factory for the little-endian BPF assembler backend. None of the arguments
+// are consulted.
+MCAsmBackend *llvm::createBPFAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        const Triple &TT, StringRef CPU,
+                                        const MCTargetOptions&) {
+  const bool IsLittleEndian = true;
+  return new BPFAsmBackend(IsLittleEndian);
+}
+
+// Factory for the big-endian BPF assembler backend. None of the arguments
+// are consulted.
+MCAsmBackend *llvm::createBPFbeAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          const Triple &TT, StringRef CPU,
+                                          const MCTargetOptions&) {
+  const bool IsLittleEndian = false;
+  return new BPFAsmBackend(IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
new file mode 100644
index 000000000000..3d1c0eb55afa
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -0,0 +1,58 @@
+//===-- BPFELFObjectWriter.cpp - BPF ELF Writer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+// ELF target writer for BPF. Its only customisation is the mapping from
+// fixup kinds to ELF relocation types in getRelocType().
+class BPFELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ BPFELFObjectWriter(uint8_t OSABI);
+
+ ~BPFELFObjectWriter() override;
+
+protected:
+ // Returns the ELF relocation type to emit for the given fixup.
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+}
+
+// The base writer is configured as 64-bit, machine type EM_BPF, with
+// relocations that carry no explicit addend.
+BPFELFObjectWriter::BPFELFObjectWriter(uint8_t OSABI)
+    : MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_BPF,
+                              /*HasRelocationAddend=*/false) {}
+
+BPFELFObjectWriter::~BPFELFObjectWriter() = default;
+
+// Map a fixup kind to its ELF relocation type: the 8-byte kinds map to
+// R_BPF_64_64 and the 4-byte kinds to R_BPF_64_32. Any other kind is treated
+// as unreachable. Ctx, Target and IsPCRel are not used.
+unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  const unsigned Kind = (unsigned)Fixup.getKind();
+  switch (Kind) {
+  case FK_SecRel_8:
+  case FK_Data_8:
+    return ELF::R_BPF_64_64;
+  case FK_SecRel_4:
+  case FK_Data_4:
+    return ELF::R_BPF_64_32;
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  }
+}
+
+// Create an ELF object writer for OS that uses a freshly allocated
+// BPFELFObjectWriter as its target writer.
+MCObjectWriter *llvm::createBPFELFObjectWriter(raw_pwrite_stream &OS,
+                                               uint8_t OSABI,
+                                               bool IsLittleEndian) {
+  MCELFObjectTargetWriter *TargetWriter = new BPFELFObjectWriter(OSABI);
+  return createELFObjectWriter(TargetWriter, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
new file mode 100644
index 000000000000..559ac291a79e
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -0,0 +1,50 @@
+//===-- BPFMCAsmInfo.h - BPF asm properties -------------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the BPFMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
+#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class Target;
+
+// Assembly-syntax and object-format properties for BPF. All settings are
+// applied in the constructor.
+class BPFMCAsmInfo : public MCAsmInfo {
+public:
+ explicit BPFMCAsmInfo(const Triple &TT) {
+ // Only the bpfeb architecture switches the byte order; every other triple
+ // keeps whatever value IsLittleEndian already has.
+ if (TT.getArch() == Triple::bpfeb)
+ IsLittleEndian = false;
+
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+
+ UsesELFSectionDirectiveForBSS = true;
+ HasSingleParameterDotFile = false;
+ HasDotTypeDotSizeDirective = false;
+
+ SupportsDebugInformation = true;
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+ MinInstAlignment = 8;
+
+ // The default pointer size is 4 and it only affects DWARF ELF output.
+ // If it is not set correctly, the DWARF data is corrupted in random
+ // places by 4 bytes: the .debug_line section stays parsable, but with
+ // odd offsets, line numbers, etc.
+ PointerSize = 8;
+ }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
new file mode 100644
index 000000000000..47f16512a397
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -0,0 +1,179 @@
+//===-- BPFMCCodeEmitter.cpp - Convert BPF code to machine code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+// Machine-code emitter for BPF. Non-copyable; the byte order of the emitted
+// encoding is fixed at construction through IsLittleEndian.
+class BPFMCCodeEmitter : public MCCodeEmitter {
+ BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
+ void operator=(const BPFMCCodeEmitter &) = delete;
+ const MCInstrInfo &MCII;
+ const MCRegisterInfo &MRI;
+ bool IsLittleEndian;
+
+public:
+ BPFMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
+ bool IsLittleEndian)
+ : MCII(mcii), MRI(mri), IsLittleEndian(IsLittleEndian) {}
+
+ ~BPFMCCodeEmitter() {}
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getMachineOpValue - Return binary encoding of operand. If the machine
+ // operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getMemoryOpValue - Return the packed encoding of a memory operand:
+ // register encoding shifted left by 16, OR'ed with the low 16 bits of the
+ // offset.
+ uint64_t getMemoryOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // encodeInstruction - Write the encoded bytes of MI to OS.
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+private:
+ // Predicate-verification helpers; their definitions are not visible in this
+ // part of the file.
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+};
+}
+
+// Factory for the little-endian BPF code emitter. Ctx is not used.
+MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            MCContext &Ctx) {
+  const bool IsLittleEndian = true;
+  return new BPFMCCodeEmitter(MCII, MRI, IsLittleEndian);
+}
+
+// Factory for the big-endian BPF code emitter. Ctx is not used.
+MCCodeEmitter *llvm::createBPFbeMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  const bool IsLittleEndian = false;
+  return new BPFMCCodeEmitter(MCII, MRI, IsLittleEndian);
+}
+
+// Return the binary encoding of operand MO.
+//
+// Registers yield their encoding value and immediates are truncated to
+// unsigned. Any other operand must be a symbol-reference expression: a fixup
+// at offset 0 is recorded for it and 0 is returned. The fixup kind depends on
+// the opcode: FK_SecRel_4 for JAL (function call name), FK_SecRel_8 for
+// LD_imm64, and FK_PCRel_2 for everything else (basic-block label).
+unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                             const MCOperand &MO,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return MRI.getEncodingValue(MO.getReg());
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+
+  assert(MO.isExpr());
+
+  const MCExpr *Expr = MO.getExpr();
+
+  assert(Expr->getKind() == MCExpr::SymbolRef);
+
+  MCFixupKind Kind;
+  switch (MI.getOpcode()) {
+  case BPF::JAL:
+    Kind = FK_SecRel_4;
+    break;
+  case BPF::LD_imm64:
+    Kind = FK_SecRel_8;
+    break;
+  default:
+    Kind = FK_PCRel_2;
+    break;
+  }
+  Fixups.push_back(MCFixup::create(0, Expr, Kind));
+
+  return 0;
+}
+
+// Exchange the high and low 4-bit halves of a byte (0xAB becomes 0xBA).
+static uint8_t SwapBits(uint8_t Val)
+{
+  const unsigned LowNibble = Val & 0x0F;
+  const unsigned HighNibble = (Val & 0xF0) >> 4;
+  return (LowNibble << 4) | HighNibble;
+}
+
+// Write the encoding of MI to OS.
+//
+// Instruction predicates are verified first. The 64-bit value returned by
+// getBinaryCodeForInstr() is then emitted field by field:
+//   - bits 63..56 as one byte;
+//   - bits 55..48 as one byte, with its two 4-bit halves swapped for big
+//     endian;
+//   - a 16-bit field and a 32-bit field, written in the emitter's byte order.
+// LD_imm64 and LD_pseudo emit 16 bytes instead of 8: the 16-bit field of the
+// first half is zero, and the second half consists of four zero bytes
+// followed by the upper 32 bits of operand 1's immediate (0 when operand 1
+// is not an immediate).
+void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ verifyInstructionPredicates(MI,
+ computeAvailableFeatures(STI.getFeatureBits()));
+
+ unsigned Opcode = MI.getOpcode();
+ // Both writers target the same stream; the one matching the emitter's byte
+ // order is used for the multi-byte fields.
+ support::endian::Writer<support::little> LE(OS);
+ support::endian::Writer<support::big> BE(OS);
+
+ if (Opcode == BPF::LD_imm64 || Opcode == BPF::LD_pseudo) {
+ uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
+ // First 8 bytes: top byte, second byte, zero 16-bit field, low 32 bits.
+ LE.write<uint8_t>(Value >> 56);
+ if (IsLittleEndian)
+ LE.write<uint8_t>((Value >> 48) & 0xff);
+ else
+ LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
+ LE.write<uint16_t>(0);
+ if (IsLittleEndian)
+ LE.write<uint32_t>(Value & 0xffffFFFF);
+ else
+ BE.write<uint32_t>(Value & 0xffffFFFF);
+
+ // Second 8 bytes: four zero bytes, then the upper 32 bits of the
+ // immediate.
+ const MCOperand &MO = MI.getOperand(1);
+ uint64_t Imm = MO.isImm() ? MO.getImm() : 0;
+ LE.write<uint8_t>(0);
+ LE.write<uint8_t>(0);
+ LE.write<uint16_t>(0);
+ if (IsLittleEndian)
+ LE.write<uint32_t>(Imm >> 32);
+ else
+ BE.write<uint32_t>(Imm >> 32);
+ } else {
+ // Get instruction encoding and emit it
+ uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
+ LE.write<uint8_t>(Value >> 56);
+ if (IsLittleEndian) {
+ LE.write<uint8_t>((Value >> 48) & 0xff);
+ LE.write<uint16_t>((Value >> 32) & 0xffff);
+ LE.write<uint32_t>(Value & 0xffffFFFF);
+ } else {
+ LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
+ BE.write<uint16_t>((Value >> 32) & 0xffff);
+ BE.write<uint32_t>(Value & 0xffffFFFF);
+ }
+ }
+}
+
+// Encode a BPF memory operand as (register encoding << 16) | 16-bit offset.
+// The register is always read from operand 1 and the offset from operand 2;
+// the Op argument is not consulted, and no fixups are recorded.
+uint64_t BPFMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &Op1 = MI.getOperand(1);
+  assert(Op1.isReg() && "First operand is not register.");
+  const uint64_t RegField = MRI.getEncodingValue(Op1.getReg());
+
+  const MCOperand &Op2 = MI.getOperand(2);
+  assert(Op2.isImm() && "Second operand is not immediate.");
+  const uint64_t OffsetField = Op2.getImm() & 0xffff;
+
+  return (RegField << 16) | OffsetField;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "BPFGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
new file mode 100644
index 000000000000..55415f97396b
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -0,0 +1,116 @@
+//===-- BPFMCTargetDesc.cpp - BPF Target Descriptions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides BPF specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFMCTargetDesc.h"
+#include "BPFMCAsmInfo.h"
+#include "InstPrinter/BPFInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "BPFGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "BPFGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "BPFGenRegisterInfo.inc"
+
+using namespace llvm;
+
+// Allocate an MCInstrInfo and fill it in through InitBPFMCInstrInfo().
+static MCInstrInfo *createBPFMCInstrInfo() {
+  MCInstrInfo *InstrInfo = new MCInstrInfo();
+  InitBPFMCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+// Allocate an MCRegisterInfo and fill it in through InitBPFMCRegisterInfo().
+// R11 is passed in the return-address register slot. TT is not used.
+static MCRegisterInfo *createBPFMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  InitBPFMCRegisterInfo(RegInfo, BPF::R11 /* RAReg doesn't exist */);
+  return RegInfo;
+}
+
+// Thin wrapper that forwards to createBPFMCSubtargetInfoImpl().
+static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT,
+                                                 StringRef CPU, StringRef FS) {
+  MCSubtargetInfo *SubtargetInfo = createBPFMCSubtargetInfoImpl(TT, CPU, FS);
+  return SubtargetInfo;
+}
+
+// Create the object streamer for BPF by forwarding to createELFStreamer().
+// The triple is not used.
+static MCStreamer *createBPFMCStreamer(const Triple &T,
+                                       MCContext &Ctx, MCAsmBackend &MAB,
+                                       raw_pwrite_stream &OS,
+                                       MCCodeEmitter *Emitter,
+                                       bool RelaxAll) {
+  MCStreamer *Streamer = createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+  return Streamer;
+}
+
+// Create the BPF instruction printer. Only syntax variant 0 is supported;
+// any other variant yields a null pointer. The triple is not used.
+static MCInstPrinter *createBPFMCInstPrinter(const Triple &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0)
+    return new BPFInstPrinter(MAI, MII, MRI);
+  return nullptr;
+}
+
+// Registers the MC-layer components for the three BPF targets.
+extern "C" void LLVMInitializeBPFTargetMC() {
+  // Components that are identical for every BPF target.
+  for (Target *T :
+       {&getTheBPFleTarget(), &getTheBPFbeTarget(), &getTheBPFTarget()}) {
+    // MC asm info.
+    RegisterMCAsmInfo<BPFMCAsmInfo> X(*T);
+
+    // MC instruction, register and subtarget info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createBPFMCInstrInfo);
+    TargetRegistry::RegisterMCRegInfo(*T, createBPFMCRegisterInfo);
+    TargetRegistry::RegisterMCSubtargetInfo(*T, createBPFMCSubtargetInfo);
+
+    // Object streamer and instruction printer.
+    TargetRegistry::RegisterELFStreamer(*T, createBPFMCStreamer);
+    TargetRegistry::RegisterMCInstPrinter(*T, createBPFMCInstPrinter);
+  }
+
+  // Code emitters for the targets with an explicit byte order.
+  TargetRegistry::RegisterMCCodeEmitter(getTheBPFleTarget(),
+                                        createBPFMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(getTheBPFbeTarget(),
+                                        createBPFbeMCCodeEmitter);
+
+  // Asm backends for the targets with an explicit byte order.
+  TargetRegistry::RegisterMCAsmBackend(getTheBPFleTarget(),
+                                       createBPFAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(getTheBPFbeTarget(),
+                                       createBPFbeAsmBackend);
+
+  // The plain BPF target follows the byte order of the host.
+  if (!sys::IsLittleEndianHost) {
+    TargetRegistry::RegisterMCCodeEmitter(getTheBPFTarget(),
+                                          createBPFbeMCCodeEmitter);
+    TargetRegistry::RegisterMCAsmBackend(getTheBPFTarget(),
+                                         createBPFbeAsmBackend);
+    return;
+  }
+
+  TargetRegistry::RegisterMCCodeEmitter(getTheBPFTarget(),
+                                        createBPFMCCodeEmitter);
+  TargetRegistry::RegisterMCAsmBackend(getTheBPFTarget(),
+                                       createBPFAsmBackend);
+}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
new file mode 100644
index 000000000000..3df673eaeb4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -0,0 +1,71 @@
+//===-- BPFMCTargetDesc.h - BPF Target Descriptions -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides BPF specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCTARGETDESC_H
+#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+// Forward declarations, so this header does not need the full MC headers.
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+// Accessors for the little-endian, big-endian and plain BPF Target objects.
+Target &getTheBPFleTarget();
+Target &getTheBPFbeTarget();
+Target &getTheBPFTarget();
+
+// Code emitter factories: the plain name is the little-endian variant, the
+// "be" name the big-endian one.
+MCCodeEmitter *createBPFMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+MCCodeEmitter *createBPFbeMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+// Assembler backend factories, with the same little/big-endian naming.
+MCAsmBackend *createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+// Creates the ELF object writer for BPF with the given OS ABI and byte order.
+MCObjectWriter *createBPFELFObjectWriter(raw_pwrite_stream &OS,
+ uint8_t OSABI, bool IsLittleEndian);
+}
+
+// Defines symbolic names for BPF registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "BPFGenRegisterInfo.inc"
+
+// Defines symbolic names for the BPF instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "BPFGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "BPFGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/contrib/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
new file mode 100644
index 000000000000..265180b99876
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
@@ -0,0 +1,36 @@
+//===-- BPFTargetInfo.cpp - BPF Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+namespace llvm {
+// Each accessor owns one function-local static Target object and returns a
+// reference to it.
+Target &getTheBPFleTarget() {
+  static Target LittleEndianTarget;
+  return LittleEndianTarget;
+}
+
+Target &getTheBPFbeTarget() {
+  static Target BigEndianTarget;
+  return BigEndianTarget;
+}
+
+Target &getTheBPFTarget() {
+  static Target HostEndianTarget;
+  return HostEndianTarget;
+}
+} // namespace llvm
+
+// Registers the three BPF targets under the names "bpf", "bpfel" and "bpfeb".
+extern "C" void LLVMInitializeBPFTargetInfo() {
+  // The "bpf" target is registered with an arch-match callback that rejects
+  // every architecture.
+  auto MatchNoArch = [](Triple::ArchType) { return false; };
+  TargetRegistry::RegisterTarget(getTheBPFTarget(), "bpf", "BPF (host endian)",
+                                 MatchNoArch, true);
+
+  RegisterTarget<Triple::bpfel, /*HasJIT=*/true> LittleEndian(
+      getTheBPFleTarget(), "bpfel", "BPF (little endian)");
+  RegisterTarget<Triple::bpfeb, /*HasJIT=*/true> BigEndian(
+      getTheBPFbeTarget(), "bpfeb", "BPF (big endian)");
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
new file mode 100644
index 000000000000..becc086c81b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -0,0 +1,2154 @@
+//===-- HexagonAsmParser.cpp - Parse Hexagon asm to MCInst instructions----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mcasmparser"
+
+#include "Hexagon.h"
+#include "HexagonTargetStreamer.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCExpr.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonShuffler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableFutureRegs("mfuture-regs", // Command-line flag; no cl::init is given for this one.
+ cl::desc("Enable future registers"));
+
+static cl::opt<bool> WarnMissingParenthesis("mwarn-missing-parenthesis", // Initialised to true.
+cl::desc("Warn for missing parenthesis around predicate registers"),
+cl::init(true));
+static cl::opt<bool> ErrorMissingParenthesis("merror-missing-parenthesis", // Initialised to false.
+cl::desc("Error for missing parenthesis around predicate registers"),
+cl::init(false));
+static cl::opt<bool> WarnSignedMismatch("mwarn-sign-mismatch", // Initialised to true; read by canonicalizeImmediates().
+cl::desc("Warn for mismatching a signed and unsigned value"),
+cl::init(true));
+static cl::opt<bool> WarnNoncontigiousRegister("mwarn-noncontigious-register", // Initialised to true.
+cl::desc("Warn for register names that arent contigious"), // NOTE(review): "arent" lacks the apostrophe used by the merror- variant below.
+cl::init(true));
+static cl::opt<bool> ErrorNoncontigiousRegister("merror-noncontigious-register", // Initialised to false.
+cl::desc("Error for register names that aren't contigious"),
+cl::init(false));
+
+namespace {
+
+struct HexagonOperand;
+
+class HexagonAsmParser : public MCTargetAsmParser { // Hexagon target assembly parser; collects matched instructions into the bundle MCB.
+
+ HexagonTargetStreamer &getTargetStreamer() { // Returns the streamer's target streamer, downcast to the Hexagon type.
+ MCTargetStreamer &TS = *Parser.getStreamer().getTargetStreamer();
+ return static_cast<HexagonTargetStreamer &>(TS);
+ }
+
+ MCAsmParser &Parser; // The generic parser this target parser is attached to.
+ MCAssembler *Assembler; // Stays null when the streamer has raw text support (see the constructor).
+ MCInstrInfo const &MCII;
+ MCInst MCB; // The bundle currently being built; created via HexagonMCInstrInfo::createBundle().
+ bool InBrackets; // True after a "{" has been seen and until the matching "}".
+
+ MCAsmParser &getParser() const { return Parser; }
+ MCAssembler *getAssembler() const { return Assembler; }
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ bool equalIsAsmAssignment() override { return false; }
+ bool isLabel(AsmToken &Token) override;
+
+ void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } // Forwards to the generic parser.
+ bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } // Forwards to the generic parser and returns its result.
+ bool ParseDirectiveFalign(unsigned Size, SMLoc L);
+
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool ParseDirectiveSubsection(SMLoc L);
+ bool ParseDirectiveValue(unsigned Size, SMLoc L);
+ bool ParseDirectiveComm(bool IsLocal, SMLoc L);
+ bool RegisterMatchesArch(unsigned MatchNum) const;
+
+ bool matchBundleOptions(); // Parses the ":option" suffixes that may follow a closing "}".
+ bool handleNoncontigiousRegister(bool Contigious, SMLoc &Loc);
+ bool finishBundle(SMLoc IDLoc, MCStreamer &Out); // Checks MCB and emits it; returns true on error.
+ void canonicalizeImmediates(MCInst &MCI);
+ bool matchOneInstruction(MCInst &MCB, SMLoc IDLoc,
+ OperandVector &InstOperands, uint64_t &ErrorInfo,
+ bool MatchingInlineAsm);
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo, bool MatchingInlineAsm) override;
+
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override;
+ bool OutOfRange(SMLoc IDLoc, long long Val, long long Max);
+ int processInstruction(MCInst &Inst, OperandVector const &Operands,
+ SMLoc IDLoc);
+
+ // Check if we have an assembler and, if so, set the ELF e_header flags.
+ void chksetELFHeaderEFlags(unsigned flags) {
+ if (getAssembler())
+ getAssembler()->setELFHeaderEFlags(flags);
+ }
+
+ unsigned matchRegister(StringRef Name);
+
+/// @name Auto-generated Match Functions
+/// {
+
+#define GET_ASSEMBLER_HEADER
+#include "HexagonGenAsmMatcher.inc"
+
+ /// }
+
+public:
+ HexagonAsmParser(const MCSubtargetInfo &_STI, MCAsmParser &_Parser, // Sets up an empty bundle and, when possible, the assembler pointer.
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, _STI), Parser(_Parser),
+ MCII (MII), MCB(HexagonMCInstrInfo::createBundle()), InBrackets(false) {
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
+ MCAsmParserExtension::Initialize(_Parser);
+
+ Assembler = nullptr;
+ // FIXME: need better way to detect AsmStreamer (upstream removed getKind())
+ if (!Parser.getStreamer().hasRawTextSupport()) { // No raw text support is taken to mean an object streamer.
+ MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer()); // NOTE(review): unchecked downcast; assumes the streamer really is an MCELFStreamer.
+ Assembler = &MES->getAssembler();
+ }
+ }
+
+ bool splitIdentifier(OperandVector &Operands);
+ bool parseOperand(OperandVector &Operands);
+ bool parseInstruction(OperandVector &Operands);
+ bool implicitExpressionLocation(OperandVector &Operands);
+ bool parseExpressionOrOperand(OperandVector &Operands);
+ bool parseExpression(MCExpr const *& Expr);
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // SMLoc overload: marked unreachable; the AsmToken overload below is the one declared for real use.
+ SMLoc NameLoc, OperandVector &Operands) override
+ {
+ llvm_unreachable("Unimplemented");
+ }
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, AsmToken ID,
+ OperandVector &Operands) override;
+
+ bool ParseDirective(AsmToken DirectiveID) override;
+};
+
+/// HexagonOperand - Instances of this class represent a single parsed operand
+/// (token, immediate or register) of a Hexagon machine instruction.
+struct HexagonOperand : public MCParsedAsmOperand {
+ enum KindTy { Token, Immediate, Register } Kind; // Selects which member of the anonymous union below is in use.
+
+ SMLoc StartLoc, EndLoc; // Source range of the operand; returned by getStartLoc()/getEndLoc().
+
+ struct TokTy { // Payload for Kind == Token: a pointer/length pair (the characters are not copied here).
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegTy { // Payload for Kind == Register.
+ unsigned RegNum;
+ };
+
+ struct ImmTy { // Payload for Kind == Immediate.
+ const MCExpr *Val;
+ };
+
+ struct InstTy { // NOTE(review): not a member of the union below and has no matching KindTy value.
+ OperandVector *SubInsts;
+ };
+
+ union { // Exactly one member is meaningful, as indicated by Kind.
+ struct TokTy Tok;
+ struct RegTy Reg;
+ struct ImmTy Imm;
+ };
+
+ HexagonOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} // Sets only Kind; the Create* factories fill in the payload and locations.
+
+public:
+ HexagonOperand(const HexagonOperand &o) : MCParsedAsmOperand() { // Copy constructor: copies kind, locations and the active union member.
+ Kind = o.Kind;
+ StartLoc = o.StartLoc;
+ EndLoc = o.EndLoc;
+ switch (Kind) { // Copy only the union member selected by Kind.
+ case Register:
+ Reg = o.Reg;
+ break;
+ case Immediate:
+ Imm = o.Imm;
+ break;
+ case Token:
+ Tok = o.Tok; // Copies the pointer and length, not the characters.
+ break;
+ }
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ unsigned getReg() const override { // Register number; asserts that this operand is a Register.
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const { // Immediate expression; asserts that this operand is an Immediate.
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ bool isToken() const override { return Kind == Token; }
+ bool isImm() const override { return Kind == Immediate; }
+ bool isMem() const override { llvm_unreachable("No isMem"); } // Marked unreachable: this operand class has no memory kind.
+ bool isReg() const override { return Kind == Register; }
+
+ bool CheckImmRange(int immBits, int zeroBits, bool isSigned, // True if this immediate is acceptable for a field of immBits bits whose low zeroBits bits are zero.
+ bool isRelocatable, bool Extendable) const {
+ if (Kind == Immediate) { // Any non-immediate operand falls through to 'return false'.
+ const MCExpr *myMCExpr = &HexagonMCInstrInfo::getExpr(*getImm());
+ if (HexagonMCInstrInfo::mustExtend(*Imm.Val) && !Extendable) // A must-extend value is rejected for a non-extendable field.
+ return false;
+ int64_t Res;
+ if (myMCExpr->evaluateAsAbsolute(Res)) { // Constant case: range-check the value.
+ int bits = immBits + zeroBits;
+ // The value as written may occupy immBits + zeroBits bits in total,
+ // and its low zeroBits bits must all be 0.
+ if (Res & ((1 << zeroBits) - 1))
+ return false;
+ if (isSigned) {
+ if (Res < (1LL << (bits - 1)) && Res >= -(1LL << (bits - 1))) // [-2^(bits-1), 2^(bits-1))
+ return true;
+ } else {
+ if (bits == 64) // Every 64-bit pattern fits; also avoids shifting by 64 below.
+ return true;
+ if (Res >= 0)
+ return ((uint64_t)Res < (uint64_t)(1ULL << bits));
+ else { // Negative value given for an unsigned field.
+ const int64_t high_bit_set = 1ULL << 63;
+ const uint64_t mask = (high_bit_set >> (63 - bits)); // Assuming an arithmetic shift: bits [bits, 63] set.
+ return (((uint64_t)Res & mask) == mask); // Accept only if all of those upper bits are set in Res.
+ }
+ }
+ } else if (myMCExpr->getKind() == MCExpr::SymbolRef && isRelocatable) // Symbol reference: accepted only for relocatable fields.
+ return true;
+ else if (myMCExpr->getKind() == MCExpr::Binary || // Non-constant binary/unary expressions are accepted without a range check.
+ myMCExpr->getKind() == MCExpr::Unary)
+ return true;
+ }
+ return false;
+ }
+
+ bool isf32Ext() const { return false; } // Never matches.
+ bool iss32_0Imm() const { return CheckImmRange(32, 0, true, true, false); } // Arguments throughout: (immBits, zeroBits, isSigned, isRelocatable, Extendable).
+ bool iss23_2Imm() const { return CheckImmRange(23, 2, true, true, false); }
+ bool iss8_0Imm() const { return CheckImmRange(8, 0, true, false, false); }
+ bool iss8_0Imm64() const { return CheckImmRange(8, 0, true, true, false); } // Same range as iss8_0Imm, but relocatable.
+ bool iss7_0Imm() const { return CheckImmRange(7, 0, true, false, false); }
+ bool iss6_0Imm() const { return CheckImmRange(6, 0, true, false, false); }
+ bool iss4_0Imm() const { return CheckImmRange(4, 0, true, false, false); }
+ bool iss4_1Imm() const { return CheckImmRange(4, 1, true, false, false); }
+ bool iss4_2Imm() const { return CheckImmRange(4, 2, true, false, false); }
+ bool iss4_3Imm() const { return CheckImmRange(4, 3, true, false, false); }
+ bool iss4_6Imm() const { return CheckImmRange(4, 0, true, false, false); } // Checked as 4 bits with no zero bits; adds4_6ImmOperands multiplies the value by 64.
+ bool iss3_6Imm() const { return CheckImmRange(3, 0, true, false, false); } // Checked as 3 bits with no zero bits; adds3_6ImmOperands multiplies the value by 64.
+ bool iss3_0Imm() const { return CheckImmRange(3, 0, true, false, false); }
+
+ bool isu64_0Imm() const { return CheckImmRange(64, 0, false, true, true); } // The only non-Ext predicate here that passes Extendable=true.
+ bool isu32_0Imm() const { return CheckImmRange(32, 0, false, true, false); }
+ bool isu26_6Imm() const { return CheckImmRange(26, 6, false, true, false); }
+ bool isu16_0Imm() const { return CheckImmRange(16, 0, false, true, false); }
+ bool isu16_1Imm() const { return CheckImmRange(16, 1, false, true, false); }
+ bool isu16_2Imm() const { return CheckImmRange(16, 2, false, true, false); }
+ bool isu16_3Imm() const { return CheckImmRange(16, 3, false, true, false); }
+ bool isu11_3Imm() const { return CheckImmRange(11, 3, false, false, false); }
+ bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
+ bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
+ bool isu6_3Imm() const { return CheckImmRange(6, 3, false, false, false); }
+ bool isu10_0Imm() const { return CheckImmRange(10, 0, false, false, false); }
+ bool isu9_0Imm() const { return CheckImmRange(9, 0, false, false, false); }
+ bool isu8_0Imm() const { return CheckImmRange(8, 0, false, false, false); }
+ bool isu7_0Imm() const { return CheckImmRange(7, 0, false, false, false); }
+ bool isu6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
+ bool isu5_0Imm() const { return CheckImmRange(5, 0, false, false, false); }
+ bool isu4_0Imm() const { return CheckImmRange(4, 0, false, false, false); }
+ bool isu3_0Imm() const { return CheckImmRange(3, 0, false, false, false); }
+ bool isu2_0Imm() const { return CheckImmRange(2, 0, false, false, false); }
+ bool isu1_0Imm() const { return CheckImmRange(1, 0, false, false, false); }
+
+ bool ism6_0Imm() const { return CheckImmRange(6, 0, false, false, false); } // Checked exactly like isu6_0Imm.
+ bool isn8_0Imm() const { return CheckImmRange(8, 0, false, false, false); } // Checked exactly like isu8_0Imm.
+ bool isn1Const() const { // True only for an immediate that evaluates to the constant -1.
+ if (!isImm())
+ return false;
+ int64_t Value;
+ if (!getImm()->evaluateAsAbsolute(Value))
+ return false;
+ return Value == -1;
+ }
+
+ bool iss16_0Ext() const { return CheckImmRange(16 + 26, 0, true, true, true); } // *Ext predicates: field widened by 26 bits, relocatable, Extendable=true.
+ bool iss12_0Ext() const { return CheckImmRange(12 + 26, 0, true, true, true); }
+ bool iss10_0Ext() const { return CheckImmRange(10 + 26, 0, true, true, true); }
+ bool iss9_0Ext() const { return CheckImmRange(9 + 26, 0, true, true, true); }
+ bool iss8_0Ext() const { return CheckImmRange(8 + 26, 0, true, true, true); }
+ bool iss7_0Ext() const { return CheckImmRange(7 + 26, 0, true, true, true); }
+ bool iss6_0Ext() const { return CheckImmRange(6 + 26, 0, true, true, true); }
+ bool iss11_0Ext() const {
+ return CheckImmRange(11 + 26, 0, true, true, true);
+ }
+ bool iss11_1Ext() const {
+ return CheckImmRange(11 + 26, 1, true, true, true);
+ }
+ bool iss11_2Ext() const {
+ return CheckImmRange(11 + 26, 2, true, true, true);
+ }
+ bool iss11_3Ext() const {
+ return CheckImmRange(11 + 26, 3, true, true, true);
+ }
+
+ bool isu7_0Ext() const { return CheckImmRange(7 + 26, 0, false, true, true); } // Unsigned counterparts of the *Ext predicates above.
+ bool isu8_0Ext() const { return CheckImmRange(8 + 26, 0, false, true, true); }
+ bool isu9_0Ext() const { return CheckImmRange(9 + 26, 0, false, true, true); }
+ bool isu10_0Ext() const { return CheckImmRange(10 + 26, 0, false, true, true); }
+ bool isu6_0Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); }
+ bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); }
+ bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); }
+ bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); }
+ bool isu32_0MustExt() const { return isImm(); } // Any immediate matches; no range check is done.
+
+ void addRegOperands(MCInst &Inst, unsigned N) const { // Appends this operand's register to Inst.
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const { // Appends this operand's immediate expression to Inst unchanged.
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ }
+
+ void addSignedImmOperands(MCInst &Inst, unsigned N) const { // As addImmOperands, but also flags a sign mismatch on the expression.
+ assert(N == 1 && "Invalid number of operands!");
+ HexagonMCExpr *Expr =
+ const_cast<HexagonMCExpr *>(cast<HexagonMCExpr>(getImm())); // const is cast away so that setSignMismatch() can be called below.
+ int64_t Value;
+ if (!Expr->evaluateAsAbsolute(Value)) { // Not a constant: nothing to check, add as-is.
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ return;
+ }
+ int64_t Extended = SignExtend64(Value, 32);
+ if ((Extended < 0) != (Value < 0)) // Sign differs once the low 32 bits are sign-extended.
+ Expr->setSignMismatch();
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ void addf32ExtOperands(MCInst &Inst, unsigned N) const { // Forwards to addImmOperands.
+ addImmOperands(Inst, N);
+ }
+
+ void adds32_0ImmOperands(MCInst &Inst, unsigned N) const { // Every adds*ImmOperands in this group forwards to addSignedImmOperands.
+ addSignedImmOperands(Inst, N);
+ }
+ void adds23_2ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds8_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds8_0Imm64Operands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds6_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds4_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds4_1ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds4_2ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds4_3ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds3_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+
+ void addu64_0ImmOperands(MCInst &Inst, unsigned N) const { // Every addu*ImmOperands in this group forwards to addImmOperands (no sign check).
+ addImmOperands(Inst, N);
+ }
+ void addu32_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu26_6ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu16_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu16_1ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu16_2ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu16_3ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu11_3ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu10_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu9_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu8_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu7_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_1ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_2ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_3ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu5_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu4_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu3_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu2_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu1_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addm6_0ImmOperands(MCInst &Inst, unsigned N) const { // m6_0 and n8_0 are added like the unsigned immediates above.
+ addImmOperands(Inst, N);
+ }
+ void addn8_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void adds16_0ExtOperands(MCInst &Inst, unsigned N) const { // Every adds*ExtOperands in this group forwards to addSignedImmOperands.
+ addSignedImmOperands(Inst, N);
+ }
+ void adds12_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds10_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds9_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds8_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds6_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds11_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds11_1ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds11_2ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds11_3ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void addn1ConstOperands(MCInst &Inst, unsigned N) const { // The -1 constant is added as a plain immediate, without the sign check.
+ addImmOperands(Inst, N);
+ }
+
+ void addu7_0ExtOperands(MCInst &Inst, unsigned N) const { // Every addu*ExtOperands in this group forwards to addImmOperands.
+ addImmOperands(Inst, N);
+ }
+ void addu8_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu9_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu10_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_1ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_2ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_3ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu32_0MustExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void adds4_6ImmOperands(MCInst &Inst, unsigned N) const { // Adds the constant as a plain immediate, multiplied by 64.
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = // cast<>, not dyn_cast<>: the result was dereferenced unchecked, so assert on a non-constant instead of yielding null.
+ cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
+ Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
+ }
+
+ void adds3_6ImmOperands(MCInst &Inst, unsigned N) const { // Adds the constant as a plain immediate, multiplied by 64.
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = // cast<>, not dyn_cast<>: the result was dereferenced unchecked, so assert on a non-constant instead of yielding null.
+ cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
+ Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
+ }
+
+ StringRef getToken() const { // Token text; asserts that this operand is a Token.
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ void print(raw_ostream &OS) const override;
+
+ static std::unique_ptr<HexagonOperand> CreateToken(StringRef Str, SMLoc S) { // Builds a Token operand located at S.
+ HexagonOperand *Op = new HexagonOperand(Token);
+ Op->Tok.Data = Str.data(); // Stores the pointer only; the characters Str refers to must outlive the operand.
+ Op->Tok.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S; // A token's start and end location are the same.
+ return std::unique_ptr<HexagonOperand>(Op);
+ }
+
+ static std::unique_ptr<HexagonOperand> CreateReg(unsigned RegNum, SMLoc S, // Builds a Register operand spanning [S, E].
+ SMLoc E) {
+ HexagonOperand *Op = new HexagonOperand(Register);
+ Op->Reg.RegNum = RegNum;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return std::unique_ptr<HexagonOperand>(Op);
+ }
+
+ static std::unique_ptr<HexagonOperand> CreateImm(const MCExpr *Val, SMLoc S, // Builds an Immediate operand spanning [S, E].
+ SMLoc E) {
+ HexagonOperand *Op = new HexagonOperand(Immediate);
+ Op->Imm.Val = Val; // The expression pointer is stored as given.
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return std::unique_ptr<HexagonOperand>(Op);
+ }
+};
+
+} // end anonymous namespace
+
+void HexagonOperand::print(raw_ostream &OS) const { // Writes a textual form of the operand to OS, chosen by Kind.
+ switch (Kind) {
+ case Immediate:
+ getImm()->print(OS, nullptr); // The expression prints itself.
+ break;
+ case Register:
+ OS << "<register R";
+ OS << getReg() << ">"; // Prints the numeric register id, not a register name.
+ break;
+ case Token:
+ OS << "'" << getToken() << "'"; // Token text in single quotes.
+ break;
+ }
+}
+
+bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) { // Checks the bundle MCB and emits it to Out; returns true after reporting an error.
+ DEBUG(dbgs() << "Bundle:");
+ DEBUG(MCB.dump_pretty(dbgs()));
+ DEBUG(dbgs() << "--\n");
+
+ // Check the bundle for errors.
+ const MCRegisterInfo *RI = getContext().getRegisterInfo();
+ HexagonMCChecker Check(MCII, getSTI(), MCB, MCB, *RI);
+
+ bool CheckOk = HexagonMCInstrInfo::canonicalizePacket(MCII, getSTI(),
+ getContext(), MCB,
+ &Check);
+
+ while (Check.getNextErrInfo()) { // Walk the checker's findings; the first error returns immediately.
+ unsigned Reg = Check.getErrRegister();
+ Twine R(RI->getName(Reg)); // Name of the register involved, used in the messages below.
+
+ uint64_t Err = Check.getError(); // Tested bit by bit against the CHECK_ERROR_* values.
+ if (Err != HexagonMCErrInfo::CHECK_SUCCESS) {
+ if (HexagonMCErrInfo::CHECK_ERROR_BRANCHES & Err)
+ return Error(
+ IDLoc,
+ "unconditional branch cannot precede another branch in packet");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_NEWP & Err ||
+ HexagonMCErrInfo::CHECK_ERROR_NEWV & Err)
+ return Error(IDLoc, "register `" + R +
+ "' used with `.new' "
+ "but not validly modified in the same packet");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_REGISTERS & Err)
+ return Error(IDLoc, "register `" + R + "' modified more than once");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_READONLY & Err)
+ return Error(IDLoc, "cannot write to read-only register `" + R + "'");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_LOOP & Err)
+ return Error(IDLoc, "loop-setup and some branch instructions "
+ "cannot be in the same packet");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_ENDLOOP & Err) {
+ Twine N(HexagonMCInstrInfo::isInnerLoop(MCB) ? '0' : '1'); // Selects ":endloop0" vs ":endloop1" in the message.
+ return Error(IDLoc,
+ "packet marked with `:endloop" + N + "' " +
+ "cannot contain instructions that modify register " +
+ "`" + R + "'");
+ }
+
+ if (HexagonMCErrInfo::CHECK_ERROR_SOLO & Err)
+ return Error(
+ IDLoc,
+ "instruction cannot appear in packet with other instructions");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_NOSLOTS & Err)
+ return Error(IDLoc, "too many slots used in packet");
+
+ if (Err & HexagonMCErrInfo::CHECK_ERROR_SHUFFLE) { // Shuffle failures carry a separate code, compared by equality.
+ uint64_t Erm = Check.getShuffleError();
+
+ if (HexagonShuffler::SHUFFLE_ERROR_INVALID == Erm)
+ return Error(IDLoc, "invalid instruction packet");
+ else if (HexagonShuffler::SHUFFLE_ERROR_STORES == Erm)
+ return Error(IDLoc, "invalid instruction packet: too many stores");
+ else if (HexagonShuffler::SHUFFLE_ERROR_LOADS == Erm)
+ return Error(IDLoc, "invalid instruction packet: too many loads");
+ else if (HexagonShuffler::SHUFFLE_ERROR_BRANCHES == Erm)
+ return Error(IDLoc, "too many branches in packet");
+ else if (HexagonShuffler::SHUFFLE_ERROR_NOSLOTS == Erm)
+ return Error(IDLoc, "invalid instruction packet: out of slots");
+ else if (HexagonShuffler::SHUFFLE_ERROR_SLOTS == Erm)
+ return Error(IDLoc, "invalid instruction packet: slot error");
+ else if (HexagonShuffler::SHUFFLE_ERROR_ERRATA2 == Erm)
+ return Error(IDLoc, "v60 packet violation");
+ else if (HexagonShuffler::SHUFFLE_ERROR_STORE_LOAD_CONFLICT == Erm)
+ return Error(IDLoc, "slot 0 instruction does not allow slot 1 store");
+ else
+ return Error(IDLoc, "unknown error in instruction packet");
+ }
+ }
+
+ unsigned Warn = Check.getWarning(); // Warnings are reported but do not stop processing.
+ if (Warn != HexagonMCErrInfo::CHECK_SUCCESS) {
+ if (HexagonMCErrInfo::CHECK_WARN_CURRENT & Warn)
+ Warning(IDLoc, "register `" + R + "' used with `.cur' "
+ "but not used in the same packet");
+ else if (HexagonMCErrInfo::CHECK_WARN_TEMPORARY & Warn)
+ Warning(IDLoc, "register `" + R + "' used with `.tmp' "
+ "but not used in the same packet");
+ }
+ }
+
+ if (CheckOk) {
+ MCB.setLoc(IDLoc);
+ if (HexagonMCInstrInfo::bundleSize(MCB) == 0) {
+ assert(!HexagonMCInstrInfo::isInnerLoop(MCB));
+ assert(!HexagonMCInstrInfo::isOuterLoop(MCB));
+ // Empty packets are valid yet aren't emitted
+ return false;
+ }
+ Out.EmitInstruction(MCB, getSTI());
+ } else {
+ // If compounding and duplexing didn't reduce the size to
+ // HEXAGON_PACKET_SIZE or less we have a packet that is too big.
+ if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) {
+ Error(IDLoc, "invalid instruction packet: out of slots");
+ return true; // Error
+ }
+ } // NOTE(review): if CheckOk is false and the size is within the limit, nothing is emitted and no error is reported here.
+
+ return false; // No error
+}
+
+bool HexagonAsmParser::matchBundleOptions() { // Consumes ":option" suffixes and applies them to MCB; returns true on an unknown option.
+ MCAsmParser &Parser = getParser();
+ while (true) {
+ if (!Parser.getTok().is(AsmToken::Colon)) // No further ':' means the option list is finished.
+ return false;
+ Lex(); // Skip the ':'.
+ StringRef Option = Parser.getTok().getString();
+ if (Option.compare_lower("endloop0") == 0) // Option names are compared case-insensitively.
+ HexagonMCInstrInfo::setInnerLoop(MCB);
+ else if (Option.compare_lower("endloop1") == 0)
+ HexagonMCInstrInfo::setOuterLoop(MCB);
+ else if (Option.compare_lower("mem_noshuf") == 0)
+ HexagonMCInstrInfo::setMemReorderDisabled(MCB);
+ else if (Option.compare_lower("mem_shuf") == 0)
+ HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB);
+ else
+ return true; // Unknown option; no diagnostic is issued here.
+ Lex(); // Skip the option name.
+ }
+}
+
+// For instruction aliases, immediates are generated rather than
+// MCConstantExpr. Convert them so that every such operand is a HexagonMCExpr.
+// Also check for signed/unsigned mismatches and warn
+void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) {
+ MCInst NewInst; // Rebuilt copy; only the opcode and operands are carried over.
+ NewInst.setOpcode(MCI.getOpcode());
+ for (MCOperand &I : MCI)
+ if (I.isImm()) { // Raw immediate: wrap it as HexagonMCExpr(MCConstantExpr(value)).
+ int64_t Value (I.getImm());
+ NewInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
+ MCConstantExpr::create(Value, getContext()), getContext())));
+ }
+ else { // Any other operand is copied unchanged.
+ if (I.isExpr() && cast<HexagonMCExpr>(I.getExpr())->signMismatch() &&
+ WarnSignedMismatch) // The warning is controlled by the mwarn-sign-mismatch option.
+ Warning (MCI.getLoc(), "Signed/Unsigned mismatch");
+ NewInst.addOperand(I);
+ }
+ MCI = NewInst; // Replace the caller's instruction with the rebuilt one.
+}
+
+bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc,
+ OperandVector &InstOperands,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ // Match InstOperands with the tablegen-generated matcher, filling in MCI. Returns false on success, true after reporting an error.
+ int result =
+ MatchInstructionImpl(InstOperands, MCI, ErrorInfo, MatchingInlineAsm);
+ if (result == Match_Success) {
+ MCI.setLoc(IDLoc);
+ canonicalizeImmediates(MCI);
+ result = processInstruction(MCI, InstOperands, IDLoc); // May turn the success into another match result.
+
+ DEBUG(dbgs() << "Insn:");
+ DEBUG(MCI.dump_pretty(dbgs()));
+ DEBUG(dbgs() << "\n\n");
+
+ MCI.setLoc(IDLoc); // canonicalizeImmediates() replaced MCI with a fresh MCInst, so set the location again.
+ }
+
+ // TODO: Create an instruction operand for the bundle instruction.
+ // TODO: Break this into a separate function; the code here is hard to read.
+ // TODO: Work out how to report an instruction error at the right location;
+ // for an instruction inside a packet the SMLoc will be that of the "{".
+ switch (result) {
+ default:
+ break;
+ case Match_Success:
+ return false;
+ case Match_MissingFeature:
+ return Error(IDLoc, "invalid instruction");
+ case Match_MnemonicFail:
+ return Error(IDLoc, "unrecognized instruction");
+ case Match_InvalidOperand:
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) { // ErrorInfo is 64-bit: ~0U widens to 0x00000000FFFFFFFF and could never equal an all-ones "no operand" value.
+ if (ErrorInfo >= InstOperands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = (static_cast<HexagonOperand *>(InstOperands[ErrorInfo].get()))
+ ->getStartLoc();
+ if (ErrorLoc == SMLoc()) // Operand has no location: fall back to the instruction's.
+ ErrorLoc = IDLoc;
+ }
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ llvm_unreachable("Implement any new match types added!");
+}
+
+bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // Handles "{", "}" and single instructions; returns true on error.
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ if (!InBrackets) { // Outside a packet every statement starts a fresh bundle.
+ MCB.clear();
+ MCB.addOperand(MCOperand::createImm(0)); // First bundle operand is an immediate 0; instructions are appended after it.
+ }
+ HexagonOperand &FirstOperand = static_cast<HexagonOperand &>(*Operands[0]);
+ if (FirstOperand.isToken() && FirstOperand.getToken() == "{") { // Packet open.
+ assert(Operands.size() == 1 && "Brackets should be by themselves");
+ if (InBrackets) {
+ getParser().Error(IDLoc, "Already in a packet");
+ return true;
+ }
+ InBrackets = true;
+ return false;
+ }
+ if (FirstOperand.isToken() && FirstOperand.getToken() == "}") { // Packet close: parse options, then check and emit.
+ assert(Operands.size() == 1 && "Brackets should be by themselves");
+ if (!InBrackets) {
+ getParser().Error(IDLoc, "Not in a packet");
+ return true;
+ }
+ InBrackets = false;
+ if (matchBundleOptions())
+ return true;
+ return finishBundle(IDLoc, Out);
+ }
+ MCInst *SubInst = new (getParser().getContext()) MCInst; // Placement-new using the MCContext; there is no matching delete in this function.
+ if (matchOneInstruction(*SubInst, IDLoc, Operands, ErrorInfo,
+ MatchingInlineAsm))
+ return true;
+ HexagonMCInstrInfo::extendIfNeeded(
+ getParser().getContext(), MCII, MCB, *SubInst);
+ MCB.addOperand(MCOperand::createInst(SubInst)); // Append the instruction to the current bundle.
+ if (!InBrackets) // An instruction outside braces forms a packet by itself.
+ return finishBundle(IDLoc, Out);
+ return false;
+}
+
+/// ParseDirective parses the Hexagon specific directives. Names are matched
+/// case-insensitively; returns true for a directive that is not listed here.
+bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) {
+ // Lower-case the name once instead of re-allocating it for every comparison.
+ const std::string IDVal = DirectiveID.getIdentifier().lower();
+ if (IDVal == ".word" || IDVal == ".4byte")
+ return ParseDirectiveValue(4, DirectiveID.getLoc());
+ if (IDVal == ".short" || IDVal == ".hword" || IDVal == ".half")
+ return ParseDirectiveValue(2, DirectiveID.getLoc());
+ if (IDVal == ".falign")
+ return ParseDirectiveFalign(256, DirectiveID.getLoc());
+ if (IDVal == ".lcomm" || IDVal == ".lcommon")
+ return ParseDirectiveComm(true, DirectiveID.getLoc());
+ if (IDVal == ".comm" || IDVal == ".common")
+ return ParseDirectiveComm(false, DirectiveID.getLoc());
+ if (IDVal == ".subsection")
+ return ParseDirectiveSubsection(DirectiveID.getLoc());
+ return true; // Not one of the directives handled above.
+}
+bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) {
+ const MCExpr *Subsection = nullptr;
+ int64_t Res;
+
+ assert((getLexer().isNot(AsmToken::EndOfStatement)) &&
+ "Invalid subsection directive");
+ getParser().parseExpression(Subsection);
+
+ if (!Subsection->evaluateAsAbsolute(Res))
+ return Error(L, "Cannot evaluate subsection number");
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ // 0-8192 is the hard-coded range in MCObjectStreamper.cpp, this keeps the
+ // negative subsections together and in the same order but at the opposite
+ // end of the section. Only legacy hexagon-gcc created assembly code
+ // used negative subsections.
+ if ((Res < 0) && (Res > -8193))
+ Subsection = HexagonMCExpr::create(
+ MCConstantExpr::create(8192 + Res, getContext()), getContext());
+
+ getStreamer().SubSection(Subsection);
+ return false;
+}
+
+/// ::= .falign [expression] -- the optional constant replaces MaxBytesToFill.
+bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) {
+ int64_t MaxBytesToFill = 15; // Value used when no argument is given.
+
+ // if there is an argument
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ const MCExpr *Value;
+ SMLoc ExprLoc = L;
+ // parseExpression returns false on success, but it accepts any expression.
+ if (!getParser().parseExpression(Value)) {
+ // Only a constant is usable; dyn_cast yields null otherwise (e.g. a symbol).
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+ if (!MCE)
+ return Error(ExprLoc, "not a valid expression for falign directive");
+ uint64_t IntValue = MCE->getValue();
+ if (!isUIntN(Size, IntValue) && !isIntN(Size, IntValue))
+ return Error(ExprLoc, "literal value out of range (256) for falign");
+ MaxBytesToFill = IntValue;
+ Lex();
+ } else {
+ return Error(ExprLoc, "not a valid expression for falign directive");
+ }
+ }
+
+ getTargetStreamer().emitFAlign(16, MaxBytesToFill);
+ Lex();
+
+ return false;
+}
+
+/// ::= .word [ expression (, expression)* ]
+///
+/// Emit each comma-separated expression of a data directive as a value of
+/// \p Size bytes. Constant operands are range-checked against 8 * Size bits
+/// and emitted as integers; anything else is emitted as a general value.
+///
+/// \param Size width in bytes of every emitted value.
+/// \param L    location used for the out-of-range diagnostic.
+/// \returns true on a parse error or diagnostic, false on success.
+bool HexagonAsmParser::ParseDirectiveValue(unsigned Size, SMLoc L) {
+  // An empty operand list only needs the end-of-statement token consumed.
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Lex();
+    return false;
+  }
+
+  for (;;) {
+    const MCExpr *Expr = nullptr;
+    if (getParser().parseExpression(Expr))
+      return true;
+
+    // Special case constant expressions to match code generator.
+    const MCConstantExpr *Constant = dyn_cast<MCConstantExpr>(Expr);
+    if (!Constant) {
+      getStreamer().EmitValue(Expr, Size);
+    } else {
+      assert(Size <= 8 && "Invalid size");
+      uint64_t Literal = Constant->getValue();
+      const bool Fits =
+          isUIntN(8 * Size, Literal) || isIntN(8 * Size, Literal);
+      if (!Fits)
+        return Error(L, "literal value out of range for directive");
+      getStreamer().EmitIntValue(Literal, Size);
+    }
+
+    if (getLexer().is(AsmToken::EndOfStatement))
+      break;
+
+    // FIXME: Improve diagnostic.
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in directive");
+    Lex();
+  }
+
+  // Consume the end-of-statement token that terminated the list.
+  Lex();
+  return false;
+}
+
+// This is largely a copy of AsmParser's ParseDirectiveComm extended to
+// accept a 3rd argument, AccessAlignment, which indicates the smallest
+// memory access made to the symbol, expressed in bytes. If no
+// AccessAlignment is specified it defaults to the Alignment Value.
+// Hexagon's .lcomm:
+// .lcomm Symbol, Length, Alignment, AccessAlignment
+bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) { // IsLocal: true for .lcomm/.lcommon, false for .comm/.common; Loc is used for the redefinition diagnostic. Returns true on error.
+ // FIXME: need better way to detect if AsmStreamer (upstream removed
+ // getKind())
+ if (getStreamer().hasRawTextSupport())
+ return true; // Only object file output requires special treatment.
+
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
+ return TokError("expected identifier in directive");
+ // Handle the identifier as the key symbol.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+
+ if (getLexer().isNot(AsmToken::Comma)) // the size operand is mandatory
+ return TokError("unexpected token in directive");
+ Lex();
+
+ int64_t Size;
+ SMLoc SizeLoc = getLexer().getLoc(); // remembered for the negative-size diagnostic below
+ if (getParser().parseAbsoluteExpression(Size))
+ return true;
+
+ int64_t ByteAlignment = 1; // used when the optional alignment operand is absent
+ SMLoc ByteAlignmentLoc;
+ if (getLexer().is(AsmToken::Comma)) {
+ Lex();
+ ByteAlignmentLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(ByteAlignment))
+ return true;
+ if (!isPowerOf2_64(ByteAlignment))
+ return Error(ByteAlignmentLoc, "alignment must be a power of 2");
+ }
+
+ int64_t AccessAlignment = 0; // NOTE(review): 0 is forwarded when the operand is omitted; the "defaults to Alignment" rule from the header comment is presumably applied by the streamer - confirm.
+ if (getLexer().is(AsmToken::Comma)) {
+ // The optional access argument specifies the size of the smallest memory
+ // access to be made to the symbol, expressed in bytes.
+ SMLoc AccessAlignmentLoc;
+ Lex();
+ AccessAlignmentLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(AccessAlignment))
+ return true;
+
+ if (!isPowerOf2_64(AccessAlignment))
+ return Error(AccessAlignmentLoc, "access alignment must be a power of 2");
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.comm' or '.lcomm' directive");
+
+ Lex(); // consume the end-of-statement token
+
+ // NOTE: a size of zero for a .comm should create a undefined symbol
+ // but a size of .lcomm creates a bss symbol of size zero.
+ if (Size < 0)
+ return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
+ "be less than zero");
+
+ // NOTE: The alignment in the directive is a power of 2 value, the assembler
+ // may internally end up wanting an alignment in bytes.
+ // FIXME: Diagnose overflow.
+ if (ByteAlignment < 0)
+ return Error(ByteAlignmentLoc, "invalid '.comm' or '.lcomm' directive "
+ "alignment, can't be less than zero");
+
+ if (!Sym->isUndefined())
+ return Error(Loc, "invalid symbol redefinition");
+
+ HexagonMCELFStreamer &HexagonELFStreamer =
+ static_cast<HexagonMCELFStreamer &>(getStreamer()); // unchecked downcast; relies on the raw-text early return above having filtered out other streamers - TODO confirm
+ if (IsLocal) {
+ HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol(Sym, Size, ByteAlignment,
+ AccessAlignment);
+ return false;
+ }
+
+ HexagonELFStreamer.HexagonMCEmitCommonSymbol(Sym, Size, ByteAlignment,
+ AccessAlignment);
+ return false;
+}
+
+// Validate a matched register against the architecture. Currently a stub: MatchNum is ignored and every register is accepted.
+bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const {
+ return true; // no per-architecture filtering is implemented here
+}
+
+// extern "C" void LLVMInitializeHexagonAsmLexer();
+
+/// Force static initialization. Registers HexagonAsmParser as the MC assembly parser for the Hexagon target.
+extern "C" void LLVMInitializeHexagonAsmParser() {
+ RegisterMCAsmParser<HexagonAsmParser> X(getTheHexagonTarget()); // registration happens in the constructor of X
+}
+
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_REGISTER_MATCHER
+#include "HexagonGenAsmMatcher.inc"
+
+/// Case-insensitively compare a previously parsed token operand with
+/// \p String.
+///
+/// \param Operands operands collected so far.
+/// \param Index    distance back from the end of \p Operands (0 is the most
+///                 recently pushed operand).
+/// \param String   spelling to compare against.
+/// \returns true only if that operand exists, is a token, and matches.
+static bool previousEqual(OperandVector &Operands, size_t Index,
+                          StringRef String) {
+  const size_t Count = Operands.size();
+  if (Index < Count) {
+    MCParsedAsmOperand &Candidate = *Operands[Count - Index - 1];
+    if (Candidate.isToken())
+      return static_cast<HexagonOperand &>(Candidate)
+          .getToken()
+          .equals_lower(String);
+  }
+  return false;
+}
+
+/// Report whether the token operand \p Index positions back from the end of
+/// \p Operands is one of the hardware-loop mnemonics listed below.
+static bool previousIsLoop(OperandVector &Operands, size_t Index) {
+  static const char *const LoopMnemonics[] = {"loop0", "loop1", "sp1loop0",
+                                              "sp2loop0", "sp3loop0"};
+  for (const char *Mnemonic : LoopMnemonics)
+    if (previousEqual(Operands, Index, Mnemonic))
+      return true;
+  return false;
+}
+
+/// Consume the current token and push it onto \p Operands as a sequence of
+/// token operands split at every '.': each non-empty piece becomes a token,
+/// and a "." token is pushed whenever text follows the dot.
+///
+/// \returns always false (no error is reported from here).
+bool HexagonAsmParser::splitIdentifier(OperandVector &Operands) {
+  AsmToken const &Current = getParser().getTok();
+  StringRef Remaining = Current.getString();
+  SMLoc Where = Current.getLoc();
+  Lex();
+  for (;;) {
+    const std::pair<StringRef, StringRef> Parts = Remaining.split('.');
+    const StringRef Head = Parts.first;
+    const StringRef Tail = Parts.second;
+    if (!Head.empty())
+      Operands.push_back(HexagonOperand::CreateToken(Head, Where));
+    // The separator itself sits right after Head in the unsplit string.
+    if (!Tail.empty())
+      Operands.push_back(
+          HexagonOperand::CreateToken(Remaining.substr(Head.size(), 1), Where));
+    Remaining = Tail;
+    if (Remaining.empty())
+      break;
+  }
+  return false;
+}
+
+bool HexagonAsmParser::parseOperand(OperandVector &Operands) { // Parse one operand: a register if ParseRegister succeeds, otherwise the current token split at '.' into token operands. Returns true on error.
+ unsigned Register;
+ SMLoc Begin;
+ SMLoc End;
+ MCAsmLexer &Lexer = getLexer();
+ if (!ParseRegister(Register, Begin, End)) { // ParseRegister returns false on success
+ if (!ErrorMissingParenthesis) // when the option is off, tolerate "if p0" / "if !p0" by synthesizing the parentheses
+ switch (Register) {
+ default:
+ break;
+ case Hexagon::P0:
+ case Hexagon::P1:
+ case Hexagon::P2:
+ case Hexagon::P3:
+ if (previousEqual(Operands, 0, "if")) { // "if pN": emit "(" pN [".new" pieces] ")"
+ if (WarnMissingParenthesis)
+ Warning (Begin, "Missing parenthesis around predicate register");
+ static char const *LParen = "(";
+ static char const *RParen = ")";
+ Operands.push_back(HexagonOperand::CreateToken(LParen, Begin));
+ Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End));
+ const AsmToken &MaybeDotNew = Lexer.getTok();
+ if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
+ MaybeDotNew.getString().equals_lower(".new"))
+ splitIdentifier(Operands); // keep a trailing ".new" inside the synthesized parentheses
+ Operands.push_back(HexagonOperand::CreateToken(RParen, Begin));
+ return false;
+ }
+ if (previousEqual(Operands, 0, "!") &&
+ previousEqual(Operands, 1, "if")) { // "if !pN": the "(" must go before the already-pushed "!"
+ if (WarnMissingParenthesis)
+ Warning (Begin, "Missing parenthesis around predicate register");
+ static char const *LParen = "(";
+ static char const *RParen = ")";
+ Operands.insert(Operands.end () - 1,
+ HexagonOperand::CreateToken(LParen, Begin));
+ Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End));
+ const AsmToken &MaybeDotNew = Lexer.getTok();
+ if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
+ MaybeDotNew.getString().equals_lower(".new"))
+ splitIdentifier(Operands);
+ Operands.push_back(HexagonOperand::CreateToken(RParen, Begin));
+ return false;
+ }
+ break;
+ }
+ Operands.push_back(HexagonOperand::CreateReg(
+ Register, Begin, End)); // ordinary register operand
+ return false;
+ }
+ return splitIdentifier(Operands); // not a register: push the token's '.'-separated pieces
+}
+
+bool HexagonAsmParser::isLabel(AsmToken &Token) { // Decide whether Token should be treated as a label; returns false for '{' / '}' and for text that spells a register (e.g. a register pair written with ':').
+ MCAsmLexer &Lexer = getLexer();
+ AsmToken const &Second = Lexer.getTok(); // current lexer token, i.e. the one following Token
+ AsmToken Third = Lexer.peekTok(); // the token after Second
+ StringRef String = Token.getString();
+ if (Token.is(AsmToken::TokenKind::LCurly) ||
+ Token.is(AsmToken::TokenKind::RCurly))
+ return false;
+ if (!Token.is(AsmToken::TokenKind::Identifier))
+ return true;
+ if (!matchRegister(String.lower())) // an identifier that is not a register name is a label
+ return true;
+ (void)Second;
+ assert(Second.is(AsmToken::Colon));
+ StringRef Raw (String.data(), Third.getString().data() - String.data() +
+ Third.getString().size()); // raw source text from the start of Token through the end of Third; assumes both point into the same buffer
+ std::string Collapsed = Raw;
+ Collapsed.erase(llvm::remove_if(Collapsed, isspace), Collapsed.end()); // drop whitespace between the tokens
+ StringRef Whole = Collapsed;
+ std::pair<StringRef, StringRef> DotSplit = Whole.split('.'); // ignore any ".suffix" when matching
+ if (!matchRegister(DotSplit.first.lower()))
+ return true;
+ return false; // the combined text names a register, so this is not a label
+}
+
+/// Diagnose a register name whose pieces were not adjacent in the source.
+///
+/// \param Contigious true when the register name was written contiguously;
+///                   nothing is reported in that case.
+/// \param Loc        location used for the diagnostic.
+/// \returns true only when the condition is reported as an error (the
+///          ErrorNoncontigiousRegister option); a warning still returns false.
+bool HexagonAsmParser::handleNoncontigiousRegister(bool Contigious, SMLoc &Loc) {
+  if (Contigious)
+    return false;
+
+  if (ErrorNoncontigiousRegister) {
+    Error(Loc, "Register name is not contigious");
+    return true;
+  }
+
+  if (WarnNoncontigiousRegister)
+    Warning(Loc, "Register name is not contigious");
+  return false;
+}
+
+/// Try to parse a register name starting at the current token.
+///
+/// Tokens are consumed greedily while they are adjacent in the source (or
+/// joined by ':'), their raw text is concatenated with whitespace removed,
+/// and the result is matched against the register table, first with any
+/// ".suffix" removed and then with any ":suffix" removed.
+///
+/// \param RegNo    receives the matched register on success.
+/// \param StartLoc receives the location of the first token examined.
+/// \param EndLoc   receives the lexer location after the register on success.
+/// \returns false on success; true if no register matched (every consumed
+///          token is pushed back) or a non-contiguous name was rejected.
+bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  MCAsmLexer &Lexer = getLexer();
+  StartLoc = getLexer().getLoc();
+  SmallVector<AsmToken, 5> Lookahead;
+  // Zero-length view anchored at the first token; grown as tokens are eaten.
+  StringRef RawString(Lexer.getTok().getString().data(), 0);
+  bool Again = Lexer.is(AsmToken::Identifier);
+  bool NeededWorkaround = false;
+  while (Again) {
+    AsmToken const &Token = Lexer.getTok();
+    // Extend the raw view so it ends at the end of the current token.
+    RawString = StringRef(RawString.data(),
+                          Token.getString().data() - RawString.data () +
+                          Token.getString().size());
+    Lookahead.push_back(Token);
+    Lexer.Lex();
+    // The next token is contiguous if it starts where the previous one ended.
+    bool Contigious = Lexer.getTok().getString().data() ==
+                      Lookahead.back().getString().data() +
+                      Lookahead.back().getString().size();
+    bool Type = Lexer.is(AsmToken::Identifier) || Lexer.is(AsmToken::Dot) ||
+                Lexer.is(AsmToken::Integer) || Lexer.is(AsmToken::Real) ||
+                Lexer.is(AsmToken::Colon);
+    // Tokens on either side of a ':' are accepted even across whitespace.
+    bool Workaround = Lexer.is(AsmToken::Colon) ||
+                      Lookahead.back().is(AsmToken::Colon);
+    Again = (Contigious && Type) || (Workaround && Type);
+    NeededWorkaround = NeededWorkaround || (Again && !(Contigious && Type));
+  }
+  std::string Collapsed = RawString;
+  Collapsed.erase(llvm::remove_if(Collapsed, isspace), Collapsed.end());
+  StringRef FullString = Collapsed;
+  std::pair<StringRef, StringRef> DotSplit = FullString.split('.');
+  unsigned DotReg = matchRegister(DotSplit.first.lower());
+  if (DotReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) {
+    if (DotSplit.second.empty()) {
+      RegNo = DotReg;
+      EndLoc = Lexer.getLoc();
+      if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
+        return true;
+      return false;
+    } else {
+      RegNo = DotReg;
+      // Push the ".suffix" back as a single identifier token for the caller.
+      size_t First = RawString.find('.');
+      StringRef DotString (RawString.data() + First, RawString.size() - First);
+      Lexer.UnLex(AsmToken(AsmToken::Identifier, DotString));
+      EndLoc = Lexer.getLoc();
+      if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
+        return true;
+      return false;
+    }
+  }
+  std::pair<StringRef, StringRef> ColonSplit = StringRef(FullString).split(':');
+  unsigned ColonReg = matchRegister(ColonSplit.first.lower());
+  // Validate the register that is actually returned from this branch
+  // (ColonReg), not the failed dot-split match.
+  if (ColonReg != Hexagon::NoRegister && RegisterMatchesArch(ColonReg)) {
+    // Give back the last two consumed tokens.
+    Lexer.UnLex(Lookahead.back());
+    Lookahead.pop_back();
+    Lexer.UnLex(Lookahead.back());
+    Lookahead.pop_back();
+    RegNo = ColonReg;
+    EndLoc = Lexer.getLoc();
+    if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
+      return true;
+    return false;
+  }
+  // No register: restore every consumed token, most recent first.
+  while (!Lookahead.empty()) {
+    Lexer.UnLex(Lookahead.back());
+    Lookahead.pop_back();
+  }
+  return true;
+}
+
+/// Report whether the operands parsed so far put the parser at a position
+/// where an expression is expected without a leading '#': after "call",
+/// after "jump" (unless a ':' follows), after "(" following a loop mnemonic,
+/// or after "jump:t" / "jump:nt".
+bool HexagonAsmParser::implicitExpressionLocation(OperandVector &Operands) {
+  return previousEqual(Operands, 0, "call") ||
+         (previousEqual(Operands, 0, "jump") &&
+          !getLexer().getTok().is(AsmToken::Colon)) ||
+         (previousEqual(Operands, 0, "(") && previousIsLoop(Operands, 1)) ||
+         (previousEqual(Operands, 1, ":") &&
+          previousEqual(Operands, 2, "jump") &&
+          (previousEqual(Operands, 0, "nt") ||
+           previousEqual(Operands, 0, "t")));
+}
+
+bool HexagonAsmParser::parseExpression(MCExpr const *& Expr) { // Pre-scan the upcoming tokens, inserting a ',' before a '+' that is directly followed by '#', rewind the lexer, then run the generic expression parser. Returns its result (true on error).
+ SmallVector<AsmToken, 4> Tokens;
+ MCAsmLexer &Lexer = getLexer();
+ bool Done = false;
+ static char const * Comma = ",";
+ do {
+ Tokens.emplace_back (Lexer.getTok());
+ Lex();
+ switch (Tokens.back().getKind())
+ {
+ case AsmToken::TokenKind::Hash:
+ if (Tokens.size () > 1)
+ if ((Tokens.end () - 2)->getKind() == AsmToken::TokenKind::Plus) { // pattern "... + #": the '+' is not part of this expression
+ Tokens.insert(Tokens.end() - 2,
+ AsmToken(AsmToken::TokenKind::Comma, Comma)); // the synthetic ',' lands in front of the '+'
+ Done = true;
+ }
+ break;
+ case AsmToken::TokenKind::RCurly:
+ case AsmToken::TokenKind::EndOfStatement:
+ case AsmToken::TokenKind::Eof:
+ Done = true; // stop scanning at the end of the statement or packet
+ break;
+ default:
+ break;
+ }
+ } while (!Done);
+ while (!Tokens.empty()) { // push everything back in reverse so the lexer replays the tokens in order
+ Lexer.UnLex(Tokens.back());
+ Tokens.pop_back();
+ }
+ return getParser().parseExpression(Expr);
+}
+
+/// Parse either an implicit expression (at positions identified by
+/// implicitExpressionLocation, where an immediate may appear without '#') or
+/// an ordinary operand, appending the result to \p Operands.
+///
+/// \returns true on a parse error, false on success.
+bool HexagonAsmParser::parseExpressionOrOperand(OperandVector &Operands) {
+  if (!implicitExpressionLocation(Operands))
+    return parseOperand(Operands);
+
+  MCAsmParser &Parser = getParser();
+  SMLoc Loc = Parser.getLexer().getLoc();
+  MCExpr const *Expr = nullptr;
+  // On failure Expr may still be null; return before wrapping it so no
+  // HexagonMCExpr is ever created around an invalid expression.
+  if (parseExpression(Expr))
+    return true;
+
+  Expr = HexagonMCExpr::create(Expr, getContext());
+  Operands.push_back(HexagonOperand::CreateImm(Expr, Loc, Loc));
+  return false;
+}
+
+/// Parse an instruction. Collects the tokens, registers and immediates of one statement into Operands; returns true on error, false once the statement (or a lone '{' / '}') has been handled.
+bool HexagonAsmParser::parseInstruction(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ MCAsmLexer &Lexer = getLexer();
+ while (true) {
+ AsmToken const &Token = Parser.getTok();
+ switch (Token.getKind()) {
+ case AsmToken::EndOfStatement: {
+ Lex();
+ return false;
+ }
+ case AsmToken::LCurly: {
+ if (!Operands.empty()) // '{' is only accepted as the first operand of a statement
+ return true;
+ Operands.push_back(
+ HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
+ Lex();
+ return false;
+ }
+ case AsmToken::RCurly: {
+ if (Operands.empty()) { // a lone '}' becomes its own statement; otherwise it is left unconsumed in the lexer
+ Operands.push_back(
+ HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
+ Lex();
+ }
+ return false;
+ }
+ case AsmToken::Comma: {
+ Lex(); // commas are skipped and produce no operand
+ continue;
+ }
+ case AsmToken::EqualEqual:
+ case AsmToken::ExclaimEqual:
+ case AsmToken::GreaterEqual:
+ case AsmToken::GreaterGreater:
+ case AsmToken::LessEqual:
+ case AsmToken::LessLess: {
+ Operands.push_back(HexagonOperand::CreateToken(
+ Token.getString().substr(0, 1), Token.getLoc())); // two-character operators are pushed as two one-character tokens
+ Operands.push_back(HexagonOperand::CreateToken(
+ Token.getString().substr(1, 1), Token.getLoc()));
+ Lex();
+ continue;
+ }
+ case AsmToken::Hash: { // immediate operand: '#' expr or '##' expr, optionally wrapped in hi(...) / lo(...)
+ bool MustNotExtend = false;
+ bool ImplicitExpression = implicitExpressionLocation(Operands);
+ SMLoc ExprLoc = Lexer.getLoc();
+ if (!ImplicitExpression) // at implicit-expression positions the '#' token itself is not recorded
+ Operands.push_back(
+ HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
+ Lex();
+ bool MustExtend = false;
+ bool HiOnly = false;
+ bool LoOnly = false;
+ if (Lexer.is(AsmToken::Hash)) { // a second '#' sets the must-extend flag
+ Lex();
+ MustExtend = true;
+ } else if (ImplicitExpression)
+ MustNotExtend = true;
+ AsmToken const &Token = Parser.getTok(); // intentionally shadows the outer Token with the current one
+ if (Token.is(AsmToken::Identifier)) {
+ StringRef String = Token.getString();
+ if (String.lower() == "hi") {
+ HiOnly = true;
+ } else if (String.lower() == "lo") {
+ LoOnly = true;
+ }
+ if (HiOnly || LoOnly) {
+ AsmToken LParen = Lexer.peekTok();
+ if (!LParen.is(AsmToken::LParen)) { // "hi"/"lo" not followed by '(' is an ordinary identifier
+ HiOnly = false;
+ LoOnly = false;
+ } else {
+ Lex(); // consume "hi"/"lo"; the parenthesized part is parsed as the expression
+ }
+ }
+ }
+ MCExpr const *Expr = nullptr;
+ if (parseExpression(Expr))
+ return true;
+ int64_t Value;
+ MCContext &Context = Parser.getContext();
+ assert(Expr != nullptr);
+ if (Expr->evaluateAsAbsolute(Value)) {
+ if (HiOnly)
+ Expr = MCBinaryExpr::createLShr(
+ Expr, MCConstantExpr::create(16, Context), Context); // hi(): shift right by 16 first
+ if (HiOnly || LoOnly)
+ Expr = MCBinaryExpr::createAnd(Expr,
+ MCConstantExpr::create(0xffff, Context),
+ Context); // then keep the low 16 bits
+ } else {
+ MCValue Value;
+ if (Expr->evaluateAsRelocatable(Value, nullptr, nullptr)) {
+ if (!Value.isAbsolute()) {
+ switch(Value.getAccessVariant()) {
+ case MCSymbolRefExpr::VariantKind::VK_TPREL:
+ case MCSymbolRefExpr::VariantKind::VK_DTPREL:
+ // Don't lazy extend these expression variants
+ MustNotExtend = !MustExtend;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+ Expr = HexagonMCExpr::create(Expr, Context);
+ HexagonMCInstrInfo::setMustNotExtend(*Expr, MustNotExtend);
+ HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+ std::unique_ptr<HexagonOperand> Operand =
+ HexagonOperand::CreateImm(Expr, ExprLoc, ExprLoc);
+ Operands.push_back(std::move(Operand));
+ continue;
+ }
+ default:
+ break;
+ }
+ if (parseExpressionOrOperand(Operands)) // any other token: implicit expression, register, or split identifier
+ return true;
+ }
+}
+
+bool HexagonAsmParser::ParseInstruction(ParseInstructionInfo &Info, // Target hook: Info and Name are unused here.
+ StringRef Name,
+ AsmToken ID,
+ OperandVector &Operands) {
+ getLexer().UnLex(ID); // push the already-consumed ID token back so parseInstruction sees the whole statement
+ return parseInstruction(Operands); // true on error
+}
+
+/// Build a three-operand instruction with opcode \p opCode whose operands
+/// are copies of \p Rdd, \p MO1 and \p MO2, in that order.
+static MCInst makeCombineInst(int opCode, MCOperand &Rdd,
+                              MCOperand &MO1, MCOperand &MO2) {
+  MCInst Combined;
+  Combined.setOpcode(opCode);
+
+  MCOperand *const Sources[] = {&Rdd, &MO1, &MO2};
+  for (MCOperand *Source : Sources)
+    Combined.addOperand(*Source);
+
+  return Combined;
+}
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions. Returns Match_Success or Match_InvalidOperand for AsmOp against match class Kind.
+unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+ unsigned Kind) {
+ HexagonOperand *Op = static_cast<HexagonOperand *>(&AsmOp);
+
+ switch (Kind) {
+ case MCK_0: { // matches only an immediate that evaluates to the constant 0
+ int64_t Value;
+ return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 0
+ ? Match_Success
+ : Match_InvalidOperand;
+ }
+ case MCK_1: { // matches only an immediate that evaluates to the constant 1
+ int64_t Value;
+ return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 1
+ ? Match_Success
+ : Match_InvalidOperand;
+ }
+ }
+ if (Op->Kind == HexagonOperand::Token && Kind != InvalidMatchClass) { // tokens are matched case-insensitively by trying both the lower- and upper-case spelling
+ StringRef myStringRef = StringRef(Op->Tok.Data, Op->Tok.Length);
+ if (matchTokenString(myStringRef.lower()) == (MatchClassKind)Kind)
+ return Match_Success;
+ if (matchTokenString(myStringRef.upper()) == (MatchClassKind)Kind)
+ return Match_Success;
+ }
+
+ DEBUG(dbgs() << "Unmatched Operand:");
+ DEBUG(Op->dump());
+ DEBUG(dbgs() << "\n");
+
+ return Match_InvalidOperand;
+}
+
+// FIXME: Calls to OutOfRange should propagate failure up to parseStatement.
+//
+// Report that Val lies outside the permitted range. A non-negative Max
+// describes the range 0..Max; a negative Max describes Max..(-Max - 1).
+// Returns whatever the parser's printError returns.
+bool HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) {
+  std::string Message;
+  raw_string_ostream Stream(Message);
+
+  Stream << "value " << Val << "(" << format_hex(Val, 0) << ") out of range: ";
+  if (Max < 0)
+    Stream << Max << "-" << (-Max - 1);
+  else
+    Stream << "0-" << Max;
+
+  return Parser.printError(IDLoc, Stream.str());
+}
+
+int HexagonAsmParser::processInstruction(MCInst &Inst,
+ OperandVector const &Operands,
+ SMLoc IDLoc) {
+ MCContext &Context = getParser().getContext();
+ const MCRegisterInfo *RI = getContext().getRegisterInfo();
+ std::string r = "r";
+ std::string v = "v";
+ std::string Colon = ":";
+
+ bool is32bit = false; // used to distinguish between CONST32 and CONST64
+ switch (Inst.getOpcode()) {
+ default:
+ break;
+
+ case Hexagon::A2_iconst: {
+ Inst.setOpcode(Hexagon::A2_addi);
+ MCOperand Reg = Inst.getOperand(0);
+ MCOperand S16 = Inst.getOperand(1);
+ HexagonMCInstrInfo::setMustNotExtend(*S16.getExpr());
+ HexagonMCInstrInfo::setS23_2_reloc(*S16.getExpr());
+ Inst.clear();
+ Inst.addOperand(Reg);
+ Inst.addOperand(MCOperand::createReg(Hexagon::R0));
+ Inst.addOperand(S16);
+ break;
+ }
+ case Hexagon::M4_mpyrr_addr:
+ case Hexagon::S4_addi_asl_ri:
+ case Hexagon::S4_addi_lsr_ri:
+ case Hexagon::S4_andi_asl_ri:
+ case Hexagon::S4_andi_lsr_ri:
+ case Hexagon::S4_ori_asl_ri:
+ case Hexagon::S4_ori_lsr_ri:
+ case Hexagon::S4_or_andix:
+ case Hexagon::S4_subi_asl_ri:
+ case Hexagon::S4_subi_lsr_ri: {
+ MCOperand &Ry = Inst.getOperand(0);
+ MCOperand &src = Inst.getOperand(2);
+ if (RI->getEncodingValue(Ry.getReg()) != RI->getEncodingValue(src.getReg()))
+ return Match_InvalidOperand;
+ break;
+ }
+
+ case Hexagon::C2_cmpgei: {
+ MCOperand &MO = Inst.getOperand(2);
+ MO.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ MO.getExpr(), MCConstantExpr::create(1, Context), Context), Context));
+ Inst.setOpcode(Hexagon::C2_cmpgti);
+ break;
+ }
+
+ case Hexagon::C2_cmpgeui: {
+ MCOperand &MO = Inst.getOperand(2);
+ int64_t Value;
+ bool Success = MO.getExpr()->evaluateAsAbsolute(Value);
+ (void)Success;
+ assert(Success && "Assured by matcher");
+ if (Value == 0) {
+ MCInst TmpInst;
+ MCOperand &Pd = Inst.getOperand(0);
+ MCOperand &Rt = Inst.getOperand(1);
+ TmpInst.setOpcode(Hexagon::C2_cmpeq);
+ TmpInst.addOperand(Pd);
+ TmpInst.addOperand(Rt);
+ TmpInst.addOperand(Rt);
+ Inst = TmpInst;
+ } else {
+ MO.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ MO.getExpr(), MCConstantExpr::create(1, Context), Context), Context));
+ Inst.setOpcode(Hexagon::C2_cmpgtui);
+ }
+ break;
+ }
+
+ // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)"
+ case Hexagon::A2_tfrp: {
+ MCOperand &MO = Inst.getOperand(1);
+ unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+ std::string R1 = r + utostr(RegPairNum + 1);
+ StringRef Reg1(R1);
+ MO.setReg(matchRegister(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + utostr(RegPairNum);
+ StringRef Reg2(R2);
+ Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+ Inst.setOpcode(Hexagon::A2_combinew);
+ break;
+ }
+
+ case Hexagon::A2_tfrpt:
+ case Hexagon::A2_tfrpf: {
+ MCOperand &MO = Inst.getOperand(2);
+ unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+ std::string R1 = r + utostr(RegPairNum + 1);
+ StringRef Reg1(R1);
+ MO.setReg(matchRegister(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + utostr(RegPairNum);
+ StringRef Reg2(R2);
+ Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+ Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt)
+ ? Hexagon::C2_ccombinewt
+ : Hexagon::C2_ccombinewf);
+ break;
+ }
+ case Hexagon::A2_tfrptnew:
+ case Hexagon::A2_tfrpfnew: {
+ MCOperand &MO = Inst.getOperand(2);
+ unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+ std::string R1 = r + utostr(RegPairNum + 1);
+ StringRef Reg1(R1);
+ MO.setReg(matchRegister(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + utostr(RegPairNum);
+ StringRef Reg2(R2);
+ Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+ Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
+ ? Hexagon::C2_ccombinewnewt
+ : Hexagon::C2_ccombinewnewf);
+ break;
+ }
+
+ // Translate a "$Vdd = $Vss" to "$Vdd = vcombine($Vs, $Vt)"
+ case Hexagon::V6_vassignp: {
+ MCOperand &MO = Inst.getOperand(1);
+ unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+ std::string R1 = v + utostr(RegPairNum + 1);
+ MO.setReg(MatchRegisterName(R1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = v + utostr(RegPairNum);
+ Inst.addOperand(MCOperand::createReg(MatchRegisterName(R2)));
+ Inst.setOpcode(Hexagon::V6_vcombine);
+ break;
+ }
+
+ // Translate a "$Rx = CONST32(#imm)" to "$Rx = memw(gp+#LABEL) "
+ case Hexagon::CONST32:
+ is32bit = true;
+ // Translate a "$Rx:y = CONST64(#imm)" to "$Rx:y = memd(gp+#LABEL) "
+ case Hexagon::CONST64:
+ // FIXME: need better way to detect AsmStreamer (upstream removed getKind())
+ if (!Parser.getStreamer().hasRawTextSupport()) {
+ MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer());
+ MCOperand &MO_1 = Inst.getOperand(1);
+ MCOperand &MO_0 = Inst.getOperand(0);
+
+ // push section onto section stack
+ MES->PushSection();
+
+ std::string myCharStr;
+ MCSectionELF *mySection;
+
+ // check if this as an immediate or a symbol
+ int64_t Value;
+ bool Absolute = MO_1.getExpr()->evaluateAsAbsolute(Value);
+ if (Absolute) {
+ // Create a new section - one for each constant
+ // Some or all of the zeros are replaced with the given immediate.
+ if (is32bit) {
+ std::string myImmStr = utohexstr(static_cast<uint32_t>(Value));
+ myCharStr = StringRef(".gnu.linkonce.l4.CONST_00000000")
+ .drop_back(myImmStr.size())
+ .str() +
+ myImmStr;
+ } else {
+ std::string myImmStr = utohexstr(Value);
+ myCharStr = StringRef(".gnu.linkonce.l8.CONST_0000000000000000")
+ .drop_back(myImmStr.size())
+ .str() +
+ myImmStr;
+ }
+
+ mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE);
+ } else if (MO_1.isExpr()) {
+ // .lita - for expressions
+ myCharStr = ".lita";
+ mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE);
+ } else
+ llvm_unreachable("unexpected type of machine operand!");
+
+ MES->SwitchSection(mySection);
+ unsigned byteSize = is32bit ? 4 : 8;
+ getStreamer().EmitCodeAlignment(byteSize, byteSize);
+
+ MCSymbol *Sym;
+
+ // for symbols, get rid of prepended ".gnu.linkonce.lx."
+
+ // emit symbol if needed
+ if (Absolute) {
+ Sym = getContext().getOrCreateSymbol(StringRef(myCharStr.c_str() + 16));
+ if (Sym->isUndefined()) {
+ getStreamer().EmitLabel(Sym);
+ getStreamer().EmitSymbolAttribute(Sym, MCSA_Global);
+ getStreamer().EmitIntValue(Value, byteSize);
+ }
+ } else if (MO_1.isExpr()) {
+ const char *StringStart = nullptr;
+ const char *StringEnd = nullptr;
+ if (*Operands[4]->getStartLoc().getPointer() == '#') {
+ StringStart = Operands[5]->getStartLoc().getPointer();
+ StringEnd = Operands[6]->getStartLoc().getPointer();
+ } else { // no pound
+ StringStart = Operands[4]->getStartLoc().getPointer();
+ StringEnd = Operands[5]->getStartLoc().getPointer();
+ }
+
+ unsigned size = StringEnd - StringStart;
+ std::string DotConst = ".CONST_";
+ Sym = getContext().getOrCreateSymbol(DotConst +
+ StringRef(StringStart, size));
+
+ if (Sym->isUndefined()) {
+ // case where symbol is not yet defined: emit symbol
+ getStreamer().EmitLabel(Sym);
+ getStreamer().EmitSymbolAttribute(Sym, MCSA_Local);
+ getStreamer().EmitValue(MO_1.getExpr(), 4);
+ }
+ } else
+ llvm_unreachable("unexpected type of machine operand!");
+
+ MES->PopSection();
+
+ if (Sym) {
+ MCInst TmpInst;
+ if (is32bit) // 32 bit
+ TmpInst.setOpcode(Hexagon::L2_loadrigp);
+ else // 64 bit
+ TmpInst.setOpcode(Hexagon::L2_loadrdgp);
+
+ TmpInst.addOperand(MO_0);
+ TmpInst.addOperand(
+ MCOperand::createExpr(MCSymbolRefExpr::create(Sym, getContext())));
+ Inst = TmpInst;
+ }
+ }
+ break;
+
+ // Translate a "$Rdd = #-imm" to "$Rdd = combine(#[-1,0], #-imm)"
+ case Hexagon::A2_tfrpi: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &MO = Inst.getOperand(1);
+ int64_t Value;
+ int sVal = (MO.getExpr()->evaluateAsAbsolute(Value) && Value < 0) ? -1 : 0;
+ MCOperand imm(MCOperand::createExpr(
+ HexagonMCExpr::create(MCConstantExpr::create(sVal, Context), Context)));
+ Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, imm, MO);
+ break;
+ }
+
+ // Translate a "$Rdd = [#]#imm" to "$Rdd = combine(#, [#]#imm)"
+ case Hexagon::TFRI64_V4: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &MO = Inst.getOperand(1);
+ int64_t Value;
+ if (MO.getExpr()->evaluateAsAbsolute(Value)) {
+ int s8 = Hi_32(Value);
+ if (!isInt<8>(s8))
+ OutOfRange(IDLoc, s8, -128);
+ MCOperand imm(MCOperand::createExpr(HexagonMCExpr::create(
+ MCConstantExpr::create(s8, Context), Context))); // upper 32
+ auto Expr = HexagonMCExpr::create(
+ MCConstantExpr::create(Lo_32(Value), Context), Context);
+ HexagonMCInstrInfo::setMustExtend(*Expr, HexagonMCInstrInfo::mustExtend(*MO.getExpr()));
+ MCOperand imm2(MCOperand::createExpr(Expr)); // lower 32
+ Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, imm2);
+ } else {
+ MCOperand imm(MCOperand::createExpr(HexagonMCExpr::create(
+ MCConstantExpr::create(0, Context), Context))); // upper 32
+ Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, MO);
+ }
+ break;
+ }
+
+ // Handle $Rdd = combine(##imm, #imm)"
+ case Hexagon::TFRI64_V2_ext: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &MO1 = Inst.getOperand(1);
+ MCOperand &MO2 = Inst.getOperand(2);
+ int64_t Value;
+ if (MO2.getExpr()->evaluateAsAbsolute(Value)) {
+ int s8 = Value;
+ if (s8 < -128 || s8 > 127)
+ OutOfRange(IDLoc, s8, -128);
+ }
+ Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, MO1, MO2);
+ break;
+ }
+
+ // Handle $Rdd = combine(#imm, ##imm)"
+ case Hexagon::A4_combineii: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &MO1 = Inst.getOperand(1);
+ int64_t Value;
+ if (MO1.getExpr()->evaluateAsAbsolute(Value)) {
+ int s8 = Value;
+ if (s8 < -128 || s8 > 127)
+ OutOfRange(IDLoc, s8, -128);
+ }
+ MCOperand &MO2 = Inst.getOperand(2);
+ Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, MO1, MO2);
+ break;
+ }
+
+ case Hexagon::S2_tableidxb_goodsyntax:
+ Inst.setOpcode(Hexagon::S2_tableidxb);
+ break;
+
+ case Hexagon::S2_tableidxh_goodsyntax: {
+ MCInst TmpInst;
+ MCOperand &Rx = Inst.getOperand(0);
+ MCOperand &_dst_ = Inst.getOperand(1);
+ MCOperand &Rs = Inst.getOperand(2);
+ MCOperand &Imm4 = Inst.getOperand(3);
+ MCOperand &Imm6 = Inst.getOperand(4);
+ Imm6.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ Imm6.getExpr(), MCConstantExpr::create(1, Context), Context), Context));
+ TmpInst.setOpcode(Hexagon::S2_tableidxh);
+ TmpInst.addOperand(Rx);
+ TmpInst.addOperand(_dst_);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm4);
+ TmpInst.addOperand(Imm6);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::S2_tableidxw_goodsyntax: {
+ MCInst TmpInst;
+ MCOperand &Rx = Inst.getOperand(0);
+ MCOperand &_dst_ = Inst.getOperand(1);
+ MCOperand &Rs = Inst.getOperand(2);
+ MCOperand &Imm4 = Inst.getOperand(3);
+ MCOperand &Imm6 = Inst.getOperand(4);
+ Imm6.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ Imm6.getExpr(), MCConstantExpr::create(2, Context), Context), Context));
+ TmpInst.setOpcode(Hexagon::S2_tableidxw);
+ TmpInst.addOperand(Rx);
+ TmpInst.addOperand(_dst_);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm4);
+ TmpInst.addOperand(Imm6);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::S2_tableidxd_goodsyntax: {
+ MCInst TmpInst;
+ MCOperand &Rx = Inst.getOperand(0);
+ MCOperand &_dst_ = Inst.getOperand(1);
+ MCOperand &Rs = Inst.getOperand(2);
+ MCOperand &Imm4 = Inst.getOperand(3);
+ MCOperand &Imm6 = Inst.getOperand(4);
+ Imm6.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ Imm6.getExpr(), MCConstantExpr::create(3, Context), Context), Context));
+ TmpInst.setOpcode(Hexagon::S2_tableidxd);
+ TmpInst.addOperand(Rx);
+ TmpInst.addOperand(_dst_);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm4);
+ TmpInst.addOperand(Imm6);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::M2_mpyui:
+ Inst.setOpcode(Hexagon::M2_mpyi);
+ break;
+ case Hexagon::M2_mpysmi: {
+ MCInst TmpInst;
+ MCOperand &Rd = Inst.getOperand(0);
+ MCOperand &Rs = Inst.getOperand(1);
+ MCOperand &Imm = Inst.getOperand(2);
+ int64_t Value;
+ MCExpr const &Expr = *Imm.getExpr();
+ bool Absolute = Expr.evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (!HexagonMCInstrInfo::mustExtend(Expr)) {
+ if (Value < 0 && Value > -256) {
+ Imm.setExpr(HexagonMCExpr::create(
+ MCConstantExpr::create(Value * -1, Context), Context));
+ TmpInst.setOpcode(Hexagon::M2_mpysin);
+ } else if (Value < 256 && Value >= 0)
+ TmpInst.setOpcode(Hexagon::M2_mpysip);
+ else
+ return Match_InvalidOperand;
+ } else {
+ if (Value >= 0)
+ TmpInst.setOpcode(Hexagon::M2_mpysip);
+ else
+ return Match_InvalidOperand;
+ }
+ TmpInst.addOperand(Rd);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::S2_asr_i_r_rnd_goodsyntax: {
+ MCOperand &Imm = Inst.getOperand(2);
+ MCInst TmpInst;
+ int64_t Value;
+ bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (Value == 0) { // convert to $Rd = $Rs
+ TmpInst.setOpcode(Hexagon::A2_tfr);
+ MCOperand &Rd = Inst.getOperand(0);
+ MCOperand &Rs = Inst.getOperand(1);
+ TmpInst.addOperand(Rd);
+ TmpInst.addOperand(Rs);
+ } else {
+ Imm.setExpr(HexagonMCExpr::create(
+ MCBinaryExpr::createSub(Imm.getExpr(),
+ MCConstantExpr::create(1, Context), Context),
+ Context));
+ TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd);
+ MCOperand &Rd = Inst.getOperand(0);
+ MCOperand &Rs = Inst.getOperand(1);
+ TmpInst.addOperand(Rd);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm);
+ }
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::S2_asr_i_p_rnd_goodsyntax: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &Rss = Inst.getOperand(1);
+ MCOperand &Imm = Inst.getOperand(2);
+ int64_t Value;
+ bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (Value == 0) { // convert to $Rdd = combine ($Rs[0], $Rs[1])
+ MCInst TmpInst;
+ unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
+ std::string R1 = r + utostr(RegPairNum + 1);
+ StringRef Reg1(R1);
+ Rss.setReg(matchRegister(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + utostr(RegPairNum);
+ StringRef Reg2(R2);
+ TmpInst.setOpcode(Hexagon::A2_combinew);
+ TmpInst.addOperand(Rdd);
+ TmpInst.addOperand(Rss);
+ TmpInst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+ Inst = TmpInst;
+ } else {
+ Imm.setExpr(HexagonMCExpr::create(
+ MCBinaryExpr::createSub(Imm.getExpr(),
+ MCConstantExpr::create(1, Context), Context),
+ Context));
+ Inst.setOpcode(Hexagon::S2_asr_i_p_rnd);
+ }
+ break;
+ }
+
+ case Hexagon::A4_boundscheck: {
+ MCOperand &Rs = Inst.getOperand(1);
+ unsigned int RegNum = RI->getEncodingValue(Rs.getReg());
+ if (RegNum & 1) { // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2
+ Inst.setOpcode(Hexagon::A4_boundscheck_hi);
+ std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+ StringRef RegPair = Name;
+ Rs.setReg(matchRegister(RegPair));
+ } else { // raw:lo
+ Inst.setOpcode(Hexagon::A4_boundscheck_lo);
+ std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+ StringRef RegPair = Name;
+ Rs.setReg(matchRegister(RegPair));
+ }
+ break;
+ }
+
+ case Hexagon::A2_addsp: {
+ MCOperand &Rs = Inst.getOperand(1);
+ unsigned int RegNum = RI->getEncodingValue(Rs.getReg());
+ if (RegNum & 1) { // Odd mapped to raw:hi
+ Inst.setOpcode(Hexagon::A2_addsph);
+ std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+ StringRef RegPair = Name;
+ Rs.setReg(matchRegister(RegPair));
+ } else { // Even mapped raw:lo
+ Inst.setOpcode(Hexagon::A2_addspl);
+ std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+ StringRef RegPair = Name;
+ Rs.setReg(matchRegister(RegPair));
+ }
+ break;
+ }
+
+ case Hexagon::M2_vrcmpys_s1: {
+ MCOperand &Rt = Inst.getOperand(2);
+ unsigned int RegNum = RI->getEncodingValue(Rt.getReg());
+ if (RegNum & 1) { // Odd mapped to sat:raw:hi
+ Inst.setOpcode(Hexagon::M2_vrcmpys_s1_h);
+ std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+ StringRef RegPair = Name;
+ Rt.setReg(matchRegister(RegPair));
+ } else { // Even mapped sat:raw:lo
+ Inst.setOpcode(Hexagon::M2_vrcmpys_s1_l);
+ std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+ StringRef RegPair = Name;
+ Rt.setReg(matchRegister(RegPair));
+ }
+ break;
+ }
+
+ case Hexagon::M2_vrcmpys_acc_s1: {
+ MCInst TmpInst;
+ MCOperand &Rxx = Inst.getOperand(0);
+ MCOperand &Rss = Inst.getOperand(2);
+ MCOperand &Rt = Inst.getOperand(3);
+ unsigned int RegNum = RI->getEncodingValue(Rt.getReg());
+ if (RegNum & 1) { // Odd mapped to sat:raw:hi
+ TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h);
+ std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+ StringRef RegPair = Name;
+ Rt.setReg(matchRegister(RegPair));
+ } else { // Even mapped sat:raw:lo
+ TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l);
+ std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+ StringRef RegPair = Name;
+ Rt.setReg(matchRegister(RegPair));
+ }
+ // Registers are in different positions
+ TmpInst.addOperand(Rxx);
+ TmpInst.addOperand(Rxx);
+ TmpInst.addOperand(Rss);
+ TmpInst.addOperand(Rt);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::M2_vrcmpys_s1rp: {
+ MCOperand &Rt = Inst.getOperand(2);
+ unsigned int RegNum = RI->getEncodingValue(Rt.getReg());
+ if (RegNum & 1) { // Odd mapped to rnd:sat:raw:hi
+ Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h);
+ std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+ StringRef RegPair = Name;
+ Rt.setReg(matchRegister(RegPair));
+ } else { // Even mapped rnd:sat:raw:lo
+ Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l);
+ std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+ StringRef RegPair = Name;
+ Rt.setReg(matchRegister(RegPair));
+ }
+ break;
+ }
+
+ case Hexagon::S5_asrhub_rnd_sat_goodsyntax: {
+ MCOperand &Imm = Inst.getOperand(2);
+ int64_t Value;
+ bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (Value == 0)
+ Inst.setOpcode(Hexagon::S2_vsathub);
+ else {
+ Imm.setExpr(HexagonMCExpr::create(
+ MCBinaryExpr::createSub(Imm.getExpr(),
+ MCConstantExpr::create(1, Context), Context),
+ Context));
+ Inst.setOpcode(Hexagon::S5_asrhub_rnd_sat);
+ }
+ break;
+ }
+
+ case Hexagon::S5_vasrhrnd_goodsyntax: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &Rss = Inst.getOperand(1);
+ MCOperand &Imm = Inst.getOperand(2);
+ int64_t Value;
+ bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (Value == 0) {
+ MCInst TmpInst;
+ unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
+ std::string R1 = r + utostr(RegPairNum + 1);
+ StringRef Reg1(R1);
+ Rss.setReg(matchRegister(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + utostr(RegPairNum);
+ StringRef Reg2(R2);
+ TmpInst.setOpcode(Hexagon::A2_combinew);
+ TmpInst.addOperand(Rdd);
+ TmpInst.addOperand(Rss);
+ TmpInst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+ Inst = TmpInst;
+ } else {
+ Imm.setExpr(HexagonMCExpr::create(
+ MCBinaryExpr::createSub(Imm.getExpr(),
+ MCConstantExpr::create(1, Context), Context),
+ Context));
+ Inst.setOpcode(Hexagon::S5_vasrhrnd);
+ }
+ break;
+ }
+
+ case Hexagon::A2_not: {
+ MCInst TmpInst;
+ MCOperand &Rd = Inst.getOperand(0);
+ MCOperand &Rs = Inst.getOperand(1);
+ TmpInst.setOpcode(Hexagon::A2_subri);
+ TmpInst.addOperand(Rd);
+ TmpInst.addOperand(MCOperand::createExpr(
+ HexagonMCExpr::create(MCConstantExpr::create(-1, Context), Context)));
+ TmpInst.addOperand(Rs);
+ Inst = TmpInst;
+ break;
+ }
+ } // switch
+
+ return Match_Success;
+}
+
+// Translate a register name into a register number. The primary names are
+// tried first; the alternate names are consulted only when that lookup
+// returns 0.
+unsigned HexagonAsmParser::matchRegister(StringRef Name) {
+  unsigned Primary = MatchRegisterName(Name);
+  return Primary != 0 ? Primary : MatchRegisterAltName(Name);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
new file mode 100644
index 000000000000..c0591c332dea
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
@@ -0,0 +1,1144 @@
+//===--- BitTracker.cpp ---------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// SSA-based bit propagation.
+//
+// The purpose of this code is, for a given virtual register, to provide
+// information about the value of each bit in the register. The values
+// of bits are represented by the class BitValue, and take one of four
+// cases: 0, 1, "ref" and "bottom". The 0 and 1 are rather clear, the
+// "ref" value means that the bit is a copy of another bit (which itself
+// cannot be a copy of yet another bit---such chains are not allowed).
+// A "ref" value is associated with a BitRef structure, which indicates
+// which virtual register, and which bit in that register is the origin
+// of the value. For example, given an instruction
+// vreg2 = ASL vreg1, 1
+// assuming that nothing is known about bits of vreg1, bit 1 of vreg2
+// will be a "ref" to (vreg1, 0). If there is a subsequent instruction
+// vreg3 = ASL vreg2, 2
+// then bit 3 of vreg3 will be a "ref" to (vreg1, 0) as well.
+// The "bottom" case means that the bit's value cannot be determined,
+// and that this virtual register actually defines it. The "bottom" case
+// is discussed in detail in BitTracker.h. In fact, "bottom" is a "ref
+// to self", so for the vreg1 above, the bit 0 of it will be a "ref" to
+// (vreg1, 0), bit 1 will be a "ref" to (vreg1, 1), etc.
+//
+// The tracker implements the Wegman-Zadeck algorithm, originally developed
+// for SSA-based constant propagation. Each register is represented as
+// a sequence of bits, with the convention that bit 0 is the least signi-
+// ficant bit. Each bit is propagated individually. The class RegisterCell
+// implements the register's representation, and is also the subject of
+// the lattice operations in the tracker.
+//
+// The intended usage of the bit tracker is to create a target-specific
+// machine instruction evaluator, pass the evaluator to the BitTracker
+// object, and run the tracker. The tracker will then collect the bit
+// value information for a given machine function. After that, it can be
+// queried for the cells for each virtual register.
+// Sample code:
+// const TargetSpecificEvaluator TSE(TRI, MRI);
+// BitTracker BT(TSE, MF);
+// BT.run();
+// ...
+// unsigned Reg = interestingRegister();
+// RegisterCell RC = BT.get(Reg);
+// if (RC[3].is(1))
+// Reg0bit3 = 1;
+//
+// The code below is intended to be fully target-independent.
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include "BitTracker.h"
+
+using namespace llvm;
+
+typedef BitTracker BT;
+
+namespace {
+  // Thin wrapper around a register number, used to print the register in
+  // a compact form (without the whole "%vreg" business).
+  struct printv {
+    printv(unsigned r) : R(r) {}
+    unsigned R;
+  };
+  // Register 0 is printed as 's'; any other register is printed as 'v'
+  // followed by its virtual register index.
+  raw_ostream &operator<< (raw_ostream &OS, const printv &PV) {
+    if (!PV.R)
+      return OS << 's';
+    return OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R);
+  }
+}
+
+namespace llvm {
+  // Print a single bit value: 'T' for top, '0' and '1' for the constants,
+  // and "<reg>[<pos>]" for a reference to a bit of a register.
+  raw_ostream &operator<<(raw_ostream &OS, const BT::BitValue &BV) {
+    switch (BV.Type) {
+      case BT::BitValue::Top:
+        return OS << 'T';
+      case BT::BitValue::Zero:
+        return OS << '0';
+      case BT::BitValue::One:
+        return OS << '1';
+      case BT::BitValue::Ref:
+        return OS << printv(BV.RefI.Reg) << '[' << BV.RefI.Pos << ']';
+    }
+    return OS;
+  }
+
+  // Print a register cell in the form "{ w:<width> [<bits>]:<value> ... }",
+  // where consecutive bits with related values are grouped into segments.
+  raw_ostream &operator<<(raw_ostream &OS, const BT::RegisterCell &RC) {
+    unsigned n = RC.Bits.size();
+    OS << "{ w:" << n;
+    // A zero-width cell has no bits to describe. Close the brace right away:
+    // the segment printing below always reads RC[Start], which would be out
+    // of bounds for an empty cell.
+    if (n == 0) {
+      OS << " }";
+      return OS;
+    }
+
+    // Instead of printing each bit value individually, try to group them
+    // into logical segments, such as sequences of 0 or 1 bits or references
+    // to consecutive bits (e.g. "bits 3-5 are same as bits 7-9 of reg xyz").
+    // "Start" will be the index of the beginning of the most recent segment.
+    unsigned Start = 0;
+    bool SeqRef = false;    // A sequence of refs to consecutive bits.
+    bool ConstRef = false;  // A sequence of refs to the same bit.
+
+    for (unsigned i = 1; i < n; ++i) {
+      const BT::BitValue &V = RC[i];
+      const BT::BitValue &SV = RC[Start];
+      bool IsRef = (V.Type == BT::BitValue::Ref);
+      // If the current value is the same as Start, skip to the next one.
+      if (!IsRef && V == SV)
+        continue;
+      if (IsRef && SV.Type == BT::BitValue::Ref && V.RefI.Reg == SV.RefI.Reg) {
+        // The kind of the ref sequence is decided by the second bit of the
+        // segment.
+        if (Start+1 == i) {
+          SeqRef = (V.RefI.Pos == SV.RefI.Pos+1);
+          ConstRef = (V.RefI.Pos == SV.RefI.Pos);
+        }
+        if (SeqRef && V.RefI.Pos == SV.RefI.Pos+(i-Start))
+          continue;
+        if (ConstRef && V.RefI.Pos == SV.RefI.Pos)
+          continue;
+      }
+
+      // The current value is different. Print the previous one and reset
+      // the Start.
+      OS << " [" << Start;
+      unsigned Count = i - Start;
+      if (Count == 1) {
+        OS << "]:" << SV;
+      } else {
+        OS << '-' << i-1 << "]:";
+        if (SV.Type == BT::BitValue::Ref && SeqRef)
+          OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+             << SV.RefI.Pos+(Count-1) << ']';
+        else
+          OS << SV;
+      }
+      Start = i;
+      SeqRef = ConstRef = false;
+    }
+
+    // Print the last segment, which the loop above leaves pending.
+    OS << " [" << Start;
+    unsigned Count = n - Start;
+    if (Count == 1) {
+      OS << "]:" << RC[Start];
+    } else {
+      OS << '-' << n-1 << "]:";
+      const BT::BitValue &SV = RC[Start];
+      if (SV.Type == BT::BitValue::Ref && SeqRef)
+        OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+           << SV.RefI.Pos+(Count-1) << ']';
+      else
+        OS << SV;
+    }
+    OS << " }";
+
+    return OS;
+  }
+}
+
+// Print every "register -> cell" entry of the cell map to the stream OS,
+// one entry per line. The output goes to the stream supplied by the caller
+// rather than unconditionally to dbgs().
+void BitTracker::print_cells(raw_ostream &OS) const {
+  for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I)
+    OS << PrintReg(I->first, &ME.TRI) << " -> " << I->second << "\n";
+}
+
+
+// Set up a tracker for the function F that uses the evaluator E. Tracing
+// starts out disabled. The cell map is allocated with "new" here and is
+// deleted by the destructor.
+BitTracker::BitTracker(const MachineEvaluator &E, MachineFunction &F)
+    : Trace(false),
+      ME(E),
+      MF(F),
+      MRI(F.getRegInfo()),
+      Map(*new CellMapType) {}
+
+// Free the cell map, which the constructor allocated with "new".
+BitTracker::~BitTracker() {
+  CellMapType *Owned = &Map;
+  delete Owned;
+}
+
+
+// Meet every bit of this cell with the corresponding bit of RC and report
+// whether any bit changed. SelfR is the register that owns this cell; each
+// bit is given the reference (SelfR, bit index) as its "self".
+//
+// If we were allowed to update a cell for a part of a register, the meet
+// operation would need to be parametrized by the register number and the
+// exact part of the register, so that the computed BitRefs correspond to
+// the actual bits of the "self" register.
+// While this cannot happen in the current implementation, I'm not sure
+// if this should be ruled out in the future.
+bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) {
+  // An example when "meet" can be invoked with SelfR == 0 is a phi node
+  // with a physical register as an operand.
+  assert(SelfR == 0 || TargetRegisterInfo::isVirtualRegister(SelfR));
+  const uint16_t Width = Bits.size();
+  bool Changed = false;
+  for (uint16_t Pos = 0; Pos != Width; ++Pos) {
+    if (Bits[Pos].meet(RC[Pos], BitRef(SelfR, Pos)))
+      Changed = true;
+  }
+  return Changed;
+}
+
+
+// Overwrite the part of this cell selected by the mask M with the whole
+// cell RC. A mask whose first bit is above its last bit selects a range
+// that wraps around from the top of the cell back to bit 0.
+BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC,
+      const BitMask &M) {
+  uint16_t B = M.first(), E = M.last(), W = width();
+  // Sanity: M must be a valid mask for *this.
+  assert(B < W && E < W);
+  // Sanity: the masked part of *this must have the same number of bits
+  // as the source.
+  assert(B > E || E-B+1 == RC.width());      // B <= E  =>  E-B+1 = |RC|.
+  assert(B <= E || E+(W-B)+1 == RC.width()); // E < B   =>  E+(W-B)+1 = |RC|.
+
+  if (B > E) {
+    // Wrapped mask: the low part of RC goes to [B..W-1], the rest to [0..E].
+    const uint16_t HighLen = W-B;
+    for (uint16_t Src = 0; Src < HighLen; ++Src)
+      Bits[B+Src] = RC[Src];
+    for (uint16_t Dst = 0; Dst <= E; ++Dst)
+      Bits[Dst] = RC[HighLen+Dst];
+    return *this;
+  }
+
+  // Contiguous mask: RC goes to [B..E].
+  for (uint16_t Src = 0; Src <= E-B; ++Src)
+    Bits[B+Src] = RC[Src];
+  return *this;
+}
+
+
+// Return a new cell made of the bits of this cell selected by the mask M.
+// A mask whose first bit is above its last bit selects a range that wraps
+// around from the top of the cell back to bit 0.
+BT::RegisterCell BT::RegisterCell::extract(const BitMask &M) const {
+  uint16_t B = M.first(), E = M.last(), W = width();
+  assert(B < W && E < W);
+
+  if (B > E) {
+    // Wrapped mask: [B..W-1] comes first, followed by [0..E].
+    const uint16_t HighLen = W-B;
+    RegisterCell Out(E+HighLen+1);
+    for (uint16_t Off = 0; Off < HighLen; ++Off)
+      Out.Bits[Off] = Bits[B+Off];
+    for (uint16_t Off = 0; Off <= E; ++Off)
+      Out.Bits[HighLen+Off] = Bits[Off];
+    return Out;
+  }
+
+  // Contiguous mask: copy [B..E].
+  RegisterCell Out(E-B+1);
+  for (uint16_t Off = 0; Off <= E-B; ++Off)
+    Out.Bits[Off] = Bits[B+Off];
+  return Out;
+}
+
+
+// Rotate the cell left by Sh bits (i.e. towards increasing bit indices)
+// in place, and return *this. The shift amount is taken modulo the width.
+BT::RegisterCell &BT::RegisterCell::rol(uint16_t Sh) {
+  // Swap the two parts:  [0..W-Sh-1] [W-Sh..W-1]
+  uint16_t W = width();
+  // An empty cell has nothing to rotate. Returning here also keeps the
+  // modulo below from dividing by zero.
+  if (W == 0)
+    return *this;
+  Sh = Sh % W;
+  if (Sh == 0)
+    return *this;
+
+  RegisterCell Tmp(W-Sh);
+  // Tmp = [0..W-Sh-1].
+  for (uint16_t i = 0; i < W-Sh; ++i)
+    Tmp[i] = Bits[i];
+  // Shift [W-Sh..W-1] to [0..Sh-1].
+  for (uint16_t i = 0; i < Sh; ++i)
+    Bits[i] = Bits[W-Sh+i];
+  // Copy Tmp to [Sh..W-1].
+  for (uint16_t i = 0; i < W-Sh; ++i)
+    Bits[i+Sh] = Tmp.Bits[i];
+  return *this;
+}
+
+
+// Set every bit in the half-open range [B, E) to the value V, and return
+// *this.
+BT::RegisterCell &BT::RegisterCell::fill(uint16_t B, uint16_t E,
+      const BitValue &V) {
+  assert(B <= E);
+  for (uint16_t Pos = B; Pos < E; ++Pos)
+    Bits[Pos] = V;
+  return *this;
+}
+
+
+// Append the cell RC above the current top of this cell: bit 0 of RC
+// becomes bit W of the result, where W is the width before the call.
+BT::RegisterCell &BT::RegisterCell::cat(const RegisterCell &RC) {
+  const uint16_t OldW = width();
+  const uint16_t AddW = RC.width();
+  Bits.resize(OldW+AddW);
+  for (uint16_t i = 0; i != AddW; ++i)
+    Bits[OldW+i] = RC.Bits[i];
+  return *this;
+}
+
+
+// Count trailing bits: the number of consecutive bits, starting at bit 0
+// and going up, whose value equals the constant B.
+uint16_t BT::RegisterCell::ct(bool B) const {
+  const uint16_t W = width();
+  const BitValue Wanted = B;
+  uint16_t Count = 0;
+  for (; Count < W; ++Count)
+    if (!(Bits[Count] == Wanted))
+      break;
+  return Count;
+}
+
+
+// Count leading bits: the number of consecutive bits, starting at the top
+// bit and going down, whose value equals the constant B.
+uint16_t BT::RegisterCell::cl(bool B) const {
+  const uint16_t W = width();
+  const BitValue Wanted = B;
+  uint16_t Count = 0;
+  for (; Count < W; ++Count)
+    if (!(Bits[W-(Count+1)] == Wanted))
+      break;
+  return Count;
+}
+
+
+// Two cells are equal when they have the same width and all corresponding
+// bits compare equal.
+bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
+  const uint16_t W = Bits.size();
+  if (W != RC.Bits.size())
+    return false;
+  // Advance past the matching bits; the cells are equal if none differed.
+  uint16_t i = 0;
+  while (i < W && !(Bits[i] != RC[i]))
+    ++i;
+  return i == W;
+}
+
+
+// Return the width, in bits, of the register reference RR (a register with
+// an optional sub-register index).
+uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
+  // The general problem is with finding a register class that corresponds
+  // to a given reference reg:sub. There can be several such classes, and
+  // since we only care about the register size, it does not matter which
+  // such class we would find.
+  // The easiest way to accomplish what we want is to
+  // 1. find a physical register PhysR from the same class as RR.Reg,
+  // 2. find a physical register PhysS that corresponds to PhysR:RR.Sub,
+  // 3. find a register class that contains PhysS.
+  unsigned PhysR = RR.Reg;
+  if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) {
+    // Use the first register of the virtual register's class as PhysR.
+    const TargetRegisterClass *VC = MRI.getRegClass(RR.Reg);
+    assert(VC->begin() != VC->end() && "Empty register class");
+    PhysR = *VC->begin();
+  } else {
+    assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg));
+  }
+
+  const unsigned PhysS = RR.Sub != 0 ? TRI.getSubReg(PhysR, RR.Sub) : PhysR;
+  // getSize() is multiplied by 8 to convert it to a bit count.
+  return TRI.getMinimalPhysRegClass(PhysS)->getSize()*8;
+}
+
+
+// Look up the cell for the register reference RR in the map M. Nothing is
+// ever inserted into the map by this function.
+BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR,
+      const CellMapType &M) const {
+  uint16_t BW = getRegBitWidth(RR);
+
+  // Physical registers are assumed to be present in the map with an unknown
+  // value. Don't actually insert anything in the map, just return the cell.
+  if (TargetRegisterInfo::isPhysicalRegister(RR.Reg))
+    return RegisterCell::self(0, BW);
+
+  assert(TargetRegisterInfo::isVirtualRegister(RR.Reg));
+  // For virtual registers that belong to a class that is not tracked,
+  // generate an "unknown" value as well.
+  if (!track(MRI.getRegClass(RR.Reg)))
+    return RegisterCell::self(0, BW);
+
+  CellMapType::const_iterator F = M.find(RR.Reg);
+  // If not found, create a "top" entry, but do not insert it in the map.
+  if (F == M.end())
+    return RegisterCell::top(BW);
+
+  // Without a sub-register the whole cell is the answer; otherwise only
+  // the bits covered by the sub-register's mask are returned.
+  if (RR.Sub == 0)
+    return F->second;
+  return F->second.extract(mask(RR.Reg, RR.Sub));
+}
+
+
+// Store the cell RC as the value of the register RR.Reg in the map M.
+// Nothing is stored for registers that are not virtual.
+void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
+      CellMapType &M) const {
+  // While updating the cell map can be done in a meaningful way for
+  // a part of a register, it makes little sense to implement it as the
+  // SSA representation would never contain such "partial definitions".
+  if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+    return;
+  assert(RR.Sub == 0 && "Unexpected sub-register in definition");
+  // Eliminate all ref-to-reg-0 bit values: replace them with "self".
+  const unsigned Width = RC.width();
+  for (unsigned Pos = 0; Pos != Width; ++Pos) {
+    BitValue &Bit = RC[Pos];
+    if (Bit.Type == BitValue::Ref && Bit.RefI.Reg == 0)
+      Bit.RefI = BitRef(RR.Reg, Pos);
+  }
+  M[RR.Reg] = RC;
+}
+
+
+// Return true if every bit of the cell is a known constant (0 or 1), i.e.
+// if the cell represents a compile-time integer value.
+bool BT::MachineEvaluator::isInt(const RegisterCell &A) const {
+  for (uint16_t i = 0, W = A.width(); i < W; ++i) {
+    const BitValue &Bit = A[i];
+    if (!(Bit.is(0) || Bit.is(1)))
+      return false;
+  }
+  return true;
+}
+
+
+// Convert a cell to the integer value. The result must fit in uint64_t.
+// Bit 0 of the cell is the least significant bit of the result, which
+// matches the bit order that eIMM uses when it builds a cell.
+uint64_t BT::MachineEvaluator::toInt(const RegisterCell &A) const {
+  assert(isInt(A));
+  uint64_t Val = 0;
+  uint16_t W = A.width();
+  // Walk from the most significant bit down, so that after all the shifts
+  // bit 0 of the cell ends up in bit 0 of Val.
+  for (uint16_t i = W; i > 0; --i) {
+    Val <<= 1;
+    Val |= A[i-1].is(1);
+  }
+  return Val;
+}
+
+
+// Evaluator helper functions. These implement some common operations on
+// register cells that can be used to implement target-specific instructions
+// in a target-specific evaluator.
+
+// Build a cell of width W that holds the immediate V, with the least
+// significant bit of V in bit 0.
+BT::RegisterCell BT::MachineEvaluator::eIMM(int64_t V, uint16_t W) const {
+  RegisterCell Res(W);
+  // For bits beyond the 63rd, this will generate the sign bit of V.
+  for (uint16_t i = 0; i < W; ++i, V >>= 1)
+    Res[i] = BitValue(V & 1);
+  return Res;
+}
+
+
+// Build a cell from the value of the constant CI. The cell is as wide as
+// the constant, and bit i of the cell is bit i of the value.
+BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
+  const APInt &A = CI->getValue();
+  uint16_t BW = A.getBitWidth();
+  // The width is kept in 16 bits; make sure that nothing was truncated.
+  assert((unsigned)BW == A.getBitWidth() && "BitWidth overflow");
+  RegisterCell Res(BW);
+  uint16_t Pos = 0;
+  while (Pos < BW) {
+    Res[Pos] = A[Pos];
+    ++Pos;
+  }
+  return Res;
+}
+
+
+// Add two cells of the same width. The result is computed from bit 0
+// upwards in three phases; everything above the point where the carry
+// becomes unknown is set to "self".
+BT::RegisterCell BT::MachineEvaluator::eADD(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  bool Carry = false;
+  uint16_t I = 0;
+
+  // Phase 1: as long as both inputs are constants, do an ordinary binary
+  // addition with carry.
+  while (I < W && A1[I].num() && A2[I].num()) {
+    unsigned S = bool(A1[I]) + bool(A2[I]) + Carry;
+    Res[I] = BitValue(S & 1);
+    Carry = (S > 1);
+    ++I;
+  }
+
+  // Phase 2: if one of the input bits is same as Carry, the result will be
+  // 0 plus the other bit. The Carry bit will remain unchanged.
+  for (; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    const BitValue *Other = nullptr;
+    if (V1.is(Carry))
+      Other = &V2;
+    else if (V2.is(Carry))
+      Other = &V1;
+    if (!Other)
+      break;
+    Res[I] = BitValue::ref(*Other);
+  }
+
+  // Phase 3: the remaining bits are not known.
+  while (I < W)
+    Res[I++] = BitValue::self();
+  return Res;
+}
+
+
+// Subtract A2 from A1 (both of the same width). The result is computed
+// from bit 0 upwards; everything above the point where the borrow becomes
+// unknown is set to "self".
+BT::RegisterCell BT::MachineEvaluator::eSUB(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  bool Borrow = false;
+  uint16_t I;
+  // As long as both inputs are constants, do an ordinary binary
+  // subtraction with borrow.
+  for (I = 0; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    if (!V1.num() || !V2.num())
+      break;
+    // S is unsigned, so a negative difference wraps around: its low bit is
+    // still the result bit, and any value above 1 indicates a borrow.
+    unsigned S = bool(V1) - bool(V2) - Borrow;
+    Res[I] = BitValue(S & 1);
+    Borrow = (S > 1);
+  }
+  for (; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    if (V1.is(Borrow)) {
+      // V1 and Borrow cancel each other out modulo 2, so the result bit is
+      // the same as V2. The outgoing borrow depends on V2, so no further
+      // bits can be determined. Step past this bit before leaving the loop
+      // so that the loop below does not overwrite it.
+      Res[I] = BitValue::ref(V2);
+      ++I;
+      break;
+    }
+    // If V2 is same as Borrow, the result bit is the same as V1, and the
+    // Borrow remains unchanged.
+    if (V2.is(Borrow))
+      Res[I] = BitValue::ref(V1);
+    else
+      break;
+  }
+  // The remaining bits are not known.
+  for (; I < W; ++I)
+    Res[I] = BitValue::self();
+  return Res;
+}
+
+
+// Multiply A1 by A2. The result is as wide as both inputs together. Only
+// the low bits are determined: the product has as many trailing zeros as
+// the two inputs combined; all other bits are "self".
+BT::RegisterCell BT::MachineEvaluator::eMLS(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  const uint16_t Z = A1.ct(0) + A2.ct(0);
+  const uint16_t W = A1.width() + A2.width();
+  RegisterCell Res(W);
+  Res.fill(0, Z, BitValue::Zero).fill(Z, W, BitValue::self());
+  return Res;
+}
+
+
+// Multiply A1 by A2. The result is as wide as both inputs together. Only
+// the low bits are determined: the product has as many trailing zeros as
+// the two inputs combined; all other bits are "self".
+BT::RegisterCell BT::MachineEvaluator::eMLU(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  const uint16_t Z = A1.ct(0) + A2.ct(0);
+  const uint16_t W = A1.width() + A2.width();
+  RegisterCell Res(W);
+  Res.fill(0, Z, BitValue::Zero).fill(Z, W, BitValue::self());
+  return Res;
+}
+
+
+// Shift A1 left by Sh bits: rotate the bits towards the higher indices,
+// and then replace the low Sh bits with zeros.
+BT::RegisterCell BT::MachineEvaluator::eASL(const RegisterCell &A1,
+      uint16_t Sh) const {
+  assert(Sh <= A1.width());
+  RegisterCell Res = RegisterCell::ref(A1);
+  Res.rol(Sh).fill(0, Sh, BitValue::Zero);
+  return Res;
+}
+
+
+// Logical shift right of A1 by Sh bits: rotate left by W-Sh, and then
+// replace the high Sh bits with zeros.
+BT::RegisterCell BT::MachineEvaluator::eLSR(const RegisterCell &A1,
+      uint16_t Sh) const {
+  uint16_t W = A1.width();
+  assert(Sh <= W);
+  RegisterCell Res = RegisterCell::ref(A1);
+  Res.rol(W-Sh).fill(W-Sh, W, BitValue::Zero);
+  return Res;
+}
+
+
+// Arithmetic shift right of A1 by Sh bits: rotate left by W-Sh, and then
+// replace the high Sh bits with copies of the original top bit.
+BT::RegisterCell BT::MachineEvaluator::eASR(const RegisterCell &A1,
+      uint16_t Sh) const {
+  uint16_t W = A1.width();
+  assert(Sh <= W);
+  RegisterCell Res = RegisterCell::ref(A1);
+  // The sign must be read before the rotation moves the top bit.
+  const BitValue Sign = Res[W-1];
+  Res.rol(W-Sh).fill(W-Sh, W, Sign);
+  return Res;
+}
+
+
+// Bitwise AND of two cells of the same width.
+BT::RegisterCell BT::MachineEvaluator::eAND(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitValue &V1 = A1[i];
+    const BitValue &V2 = A2[i];
+    BitValue &Out = Res[i];
+    if (V1.is(1))                     // 1 & x = x
+      Out = BitValue::ref(V2);
+    else if (V2.is(1))                // x & 1 = x
+      Out = BitValue::ref(V1);
+    else if (V1.is(0) || V2.is(0))    // 0 & x = 0
+      Out = BitValue::Zero;
+    else if (V1 == V2)                // x & x = x
+      Out = V1;
+    else                              // Unknown.
+      Out = BitValue::self();
+  }
+  return Res;
+}
+
+
+// Bitwise OR of two cells of the same width.
+BT::RegisterCell BT::MachineEvaluator::eORL(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitValue &V1 = A1[i];
+    const BitValue &V2 = A2[i];
+    BitValue &Out = Res[i];
+    if (V1.is(1) || V2.is(1))   // 1 | x = 1
+      Out = BitValue::One;
+    else if (V1.is(0))          // 0 | x = x
+      Out = BitValue::ref(V2);
+    else if (V2.is(0))          // x | 0 = x
+      Out = BitValue::ref(V1);
+    else if (V1 == V2)          // x | x = x
+      Out = V1;
+    else                        // Unknown.
+      Out = BitValue::self();
+  }
+  return Res;
+}
+
+
+// Bitwise XOR of two cells of the same width.
+BT::RegisterCell BT::MachineEvaluator::eXOR(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitValue &V1 = A1[i];
+    const BitValue &V2 = A2[i];
+    BitValue &Out = Res[i];
+    if (V1.is(0))         // 0 ^ x = x
+      Out = BitValue::ref(V2);
+    else if (V2.is(0))    // x ^ 0 = x
+      Out = BitValue::ref(V1);
+    else if (V1 == V2)    // x ^ x = 0
+      Out = BitValue::Zero;
+    else                  // Unknown.
+      Out = BitValue::self();
+  }
+  return Res;
+}
+
+
+// Bitwise NOT of a cell: constant bits are inverted, and every other bit
+// becomes "self".
+BT::RegisterCell BT::MachineEvaluator::eNOT(const RegisterCell &A1) const {
+  const uint16_t W = A1.width();
+  RegisterCell Res(W);
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitValue &In = A1[i];
+    BitValue &Out = Res[i];
+    if (In.is(0))
+      Out = BitValue::One;
+    else if (In.is(1))
+      Out = BitValue::Zero;
+    else
+      Out = BitValue::self();
+  }
+  return Res;
+}
+
+
+// Return the cell A1 with the bit BitN forced to 1.
+BT::RegisterCell BT::MachineEvaluator::eSET(const RegisterCell &A1,
+      uint16_t BitN) const {
+  assert(BitN < A1.width());
+  RegisterCell Out = RegisterCell::ref(A1);
+  Out[BitN] = BitValue::One;
+  return Out;
+}
+
+
+// Return the cell A1 with the bit BitN forced to 0.
+BT::RegisterCell BT::MachineEvaluator::eCLR(const RegisterCell &A1,
+      uint16_t BitN) const {
+  assert(BitN < A1.width());
+  RegisterCell Out = RegisterCell::ref(A1);
+  Out[BitN] = BitValue::Zero;
+  return Out;
+}
+
+
+// Count the leading bits of A1 that are equal to B, and return the count
+// as a cell of width W. When the count cannot be determined, the result
+// is a cell of "self" bits.
+BT::RegisterCell BT::MachineEvaluator::eCLB(const RegisterCell &A1, bool B,
+      uint16_t W) const {
+  const uint16_t AW = A1.width();
+  const uint16_t C = A1.cl(B);
+  // All bits are equal to B: the count is exact.
+  if (C == AW)
+    return eIMM(C, W);
+  // If the last leading non-B bit is not a constant, then we don't know
+  // the real count.
+  if (C < AW && A1[AW-1-C].num())
+    return eIMM(C, W);
+  return RegisterCell::self(0, W);
+}
+
+
+// Count the trailing bits of A1 that are equal to B, and return the count
+// as a cell of width W. When the count cannot be determined, the result
+// is a cell of "self" bits.
+BT::RegisterCell BT::MachineEvaluator::eCTB(const RegisterCell &A1, bool B,
+      uint16_t W) const {
+  const uint16_t AW = A1.width();
+  const uint16_t C = A1.ct(B);
+  // All bits are equal to B: the count is exact.
+  if (C == AW)
+    return eIMM(C, W);
+  // If the last trailing non-B bit is not a constant, then we don't know
+  // the real count.
+  if (C < AW && A1[C].num())
+    return eIMM(C, W);
+  return RegisterCell::self(0, W);
+}
+
+
+// Sign-extend "inreg": treat the low FromN bits of A1 as a signed value,
+// and replace all bits from FromN up with copies of the bit FromN-1.
+// FromN must be at least 1, since otherwise there is no sign bit to copy.
+BT::RegisterCell BT::MachineEvaluator::eSXT(const RegisterCell &A1,
+      uint16_t FromN) const {
+  uint16_t W = A1.width();
+  // FromN == 0 would make the sign bit lookup below index out of bounds.
+  assert(FromN > 0 && FromN <= W);
+  RegisterCell Res = RegisterCell::ref(A1);
+  BitValue Sign = Res[FromN-1];
+  // Sign-extend "inreg".
+  Res.fill(FromN, W, Sign);
+  return Res;
+}
+
+
+// Zero-extend "inreg": keep the low FromN bits of A1, and replace all
+// bits from FromN up with zeros.
+BT::RegisterCell BT::MachineEvaluator::eZXT(const RegisterCell &A1,
+      uint16_t FromN) const {
+  uint16_t W = A1.width();
+  assert(FromN <= W);
+  RegisterCell Out = RegisterCell::ref(A1);
+  Out.fill(FromN, W, BitValue::Zero);
+  return Out;
+}
+
+
+// Extract the bits [B, E) of A1 into a new, shorter cell. An empty range
+// (B == E) produces a cell of width 0; E == 0 stands for the end of the
+// cell.
+BT::RegisterCell BT::MachineEvaluator::eXTR(const RegisterCell &A1,
+      uint16_t B, uint16_t E) const {
+  uint16_t W = A1.width();
+  assert(B < W && E <= W);
+  if (B == E)
+    return RegisterCell(0);
+  // The mask takes the index of the last bit, not of the bit past the end.
+  const uint16_t Last = (E == 0) ? W-1 : E-1;
+  return RegisterCell::ref(A1).extract(BT::BitMask(B, Last));
+}
+
+
+// Return a copy of A1 in which the bits starting at the position AtN are
+// replaced with the bits of A2. A2 must fit entirely within A1.
+BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1,
+      const RegisterCell &A2, uint16_t AtN) const {
+  uint16_t W1 = A1.width(), W2 = A2.width();
+  (void)W1;
+  assert(AtN < W1 && AtN+W2 <= W1);
+  // Copy bits from A1, insert A2 at position AtN.
+  RegisterCell Res = RegisterCell::ref(A1);
+  // An empty A2 leaves the copy of A1 untouched.
+  if (W2 == 0)
+    return Res;
+  Res.insert(RegisterCell::ref(A2), BT::BitMask(AtN, AtN+W2-1));
+  return Res;
+}
+
+
+// Generic mask computation: only whole registers (Sub == 0) are handled
+// here, and the mask covers all bits of the register.
+BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
+  assert(Sub == 0 && "Generic BitTracker::mask called for Sub != 0");
+  uint16_t W = getRegBitWidth(Reg);
+  assert(W > 0 && "Cannot generate mask for empty register");
+  const uint16_t Top = W-1;
+  return BitMask(0, Top);
+}
+
+// Evaluate the target-independent instructions REG_SEQUENCE and COPY. The
+// input cells are read from Inputs, and the cell for the defined register
+// is stored in Outputs. Returns false for any other opcode.
+bool BT::MachineEvaluator::evaluate(const MachineInstr &MI,
+                                    const CellMapType &Inputs,
+                                    CellMapType &Outputs) const {
+  const unsigned Opc = MI.getOpcode();
+
+  if (Opc == TargetOpcode::REG_SEQUENCE) {
+    // Operands: def, (source register, sub-register index) x 2.
+    RegisterRef RD = MI.getOperand(0);
+    assert(RD.Sub == 0);
+    RegisterRef RS = MI.getOperand(1);
+    unsigned SS = MI.getOperand(2).getImm();
+    RegisterRef RT = MI.getOperand(3);
+    unsigned ST = MI.getOperand(4).getImm();
+    assert(SS != ST);
+
+    // Place each source at the position given by its sub-register's mask.
+    RegisterCell Res(getRegBitWidth(RD));
+    Res.insert(RegisterCell::ref(getCell(RS, Inputs)), mask(RD.Reg, SS));
+    Res.insert(RegisterCell::ref(getCell(RT, Inputs)), mask(RD.Reg, ST));
+    putCell(RD, Res, Outputs);
+    return true;
+  }
+
+  if (Opc == TargetOpcode::COPY) {
+    // COPY can transfer a smaller register into a wider one.
+    // If that is the case, fill the remaining high bits with 0.
+    RegisterRef RD = MI.getOperand(0);
+    RegisterRef RS = MI.getOperand(1);
+    assert(RD.Sub == 0);
+    uint16_t WD = getRegBitWidth(RD);
+    uint16_t WS = getRegBitWidth(RS);
+    assert(WD >= WS);
+    RegisterCell SrcCell = getCell(RS, Inputs);
+    RegisterCell Res(WD);
+    Res.insert(SrcCell, BitMask(0, WS-1));
+    Res.fill(WS, WD, BitValue::Zero);
+    putCell(RD, Res, Outputs);
+    return true;
+  }
+
+  return false;
+}
+
+
+// Main W-Z implementation.
+
+void BT::visitPHI(const MachineInstr &PI) { // Meet the cells of the PHI's inputs from executable edges into the cell of its def.
+ int ThisN = PI.getParent()->getNumber();
+ if (Trace)
+ dbgs() << "Visit FI(BB#" << ThisN << "): " << PI;
+
+ const MachineOperand &MD = PI.getOperand(0); // Operand 0 is the register defined by the PHI.
+ assert(MD.getSubReg() == 0 && "Unexpected sub-register in definition");
+ RegisterRef DefRR(MD);
+ uint16_t DefBW = ME.getRegBitWidth(DefRR);
+
+ RegisterCell DefC = ME.getCell(DefRR, Map);
+ if (DefC == RegisterCell::self(DefRR.Reg, DefBW)) // XXX slow
+ return; // Every bit already refers to itself: nothing left to do for this PHI.
+
+ bool Changed = false;
+
+ for (unsigned i = 1, n = PI.getNumOperands(); i < n; i += 2) { // Operands are read in pairs: (input register, predecessor block).
+ const MachineBasicBlock *PB = PI.getOperand(i + 1).getMBB();
+ int PredN = PB->getNumber();
+ if (Trace)
+ dbgs() << " edge BB#" << PredN << "->BB#" << ThisN;
+ if (!EdgeExec.count(CFGEdge(PredN, ThisN))) { // Inputs arriving over edges not in EdgeExec are ignored.
+ if (Trace)
+ dbgs() << " not executable\n";
+ continue;
+ }
+
+ RegisterRef RU = PI.getOperand(i);
+ RegisterCell ResC = ME.getCell(RU, Map);
+ if (Trace)
+ dbgs() << " input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub)
+ << " cell: " << ResC << "\n";
+ Changed |= DefC.meet(ResC, DefRR.Reg); // Accumulate the input into the local copy of the def's cell.
+ }
+
+ if (Changed) { // Store the new cell and revisit the uses only if the meet changed something.
+ if (Trace)
+ dbgs() << "Output: " << PrintReg(DefRR.Reg, &ME.TRI, DefRR.Sub)
+ << " cell: " << DefC << "\n";
+ ME.putCell(DefRR, DefC, Map);
+ visitUsesOf(DefRR.Reg);
+ }
+}
+
+void BT::visitNonBranch(const MachineInstr &MI) { // Evaluate a non-branch instruction and update the cells of the virtual registers it defines.
+ if (Trace) {
+ int ThisN = MI.getParent()->getNumber();
+ dbgs() << "Visit MI(BB#" << ThisN << "): " << MI;
+ }
+ if (MI.isDebugValue()) // Debug values are skipped entirely.
+ return;
+ assert(!MI.isBranch() && "Unexpected branch instruction");
+
+ CellMapType ResMap; // Receives the cells computed by the evaluator for this instruction.
+ bool Eval = ME.evaluate(MI, Map, ResMap);
+
+ if (Trace && Eval) { // Tracing only: dump the input cells and the evaluated output cells.
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ RegisterRef RU(MO);
+ dbgs() << " input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub)
+ << " cell: " << ME.getCell(RU, Map) << "\n";
+ }
+ dbgs() << "Outputs:\n";
+ for (CellMapType::iterator I = ResMap.begin(), E = ResMap.end();
+ I != E; ++I) {
+ RegisterRef RD(I->first);
+ dbgs() << " " << PrintReg(I->first, &ME.TRI) << " cell: "
+ << ME.getCell(RD, ResMap) << "\n";
+ }
+ }
+
+ // Iterate over all definitions of the instruction, and update the
+ // cells accordingly.
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ // Visit register defs only.
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ RegisterRef RD(MO);
+ assert(RD.Sub == 0 && "Unexpected sub-register in definition");
+ if (!TargetRegisterInfo::isVirtualRegister(RD.Reg)) // Only virtual registers are recorded in the map.
+ continue;
+
+ bool Changed = false;
+ if (!Eval || ResMap.count(RD.Reg) == 0) { // The evaluator failed or produced no cell for this def.
+ // Set to "ref" (aka "bottom").
+ uint16_t DefBW = ME.getRegBitWidth(RD);
+ RegisterCell RefC = RegisterCell::self(RD.Reg, DefBW);
+ if (RefC != ME.getCell(RD, Map)) {
+ ME.putCell(RD, RefC, Map);
+ Changed = true;
+ }
+ } else {
+ RegisterCell DefC = ME.getCell(RD, Map); // The cell currently recorded for the def.
+ RegisterCell ResC = ME.getCell(RD, ResMap); // The cell that was just evaluated.
+ // This is a non-phi instruction, so the values of the inputs come
+ // from the same registers each time this instruction is evaluated.
+ // During the propagation, the values of the inputs can become lowered
+ // in the sense of the lattice operation, which may cause different
+ // results to be calculated in subsequent evaluations. This should
+ // not cause the bottoming of the result in the map, since the new
+ // result is already reflecting the lowered inputs.
+ for (uint16_t i = 0, w = DefC.width(); i < w; ++i) {
+ BitValue &V = DefC[i];
+ // Bits that are already "bottom" should not be updated.
+ if (V.Type == BitValue::Ref && V.RefI.Reg == RD.Reg)
+ continue;
+ // Same for those that are identical in DefC and ResC.
+ if (V == ResC[i])
+ continue;
+ V = ResC[i]; // Take the newly evaluated bit as is, without a meet.
+ Changed = true;
+ }
+ if (Changed)
+ ME.putCell(RD, DefC, Map);
+ }
+ if (Changed) // Revisit the uses of the register only when its cell changed.
+ visitUsesOf(RD.Reg);
+ }
+}
+// Evaluate the branches starting at BI and queue the CFG edges they can take.
+void BT::visitBranchesFrom(const MachineInstr &BI) {
+ const MachineBasicBlock &B = *BI.getParent();
+ MachineBasicBlock::const_iterator It = BI, End = B.end();
+ BranchTargetList Targets, BTs; // Cumulative targets / targets of one branch.
+ bool FallsThrough = true, DefaultToAll = false;
+ int ThisN = B.getNumber();
+ // Walk consecutive branches for as long as control can fall through.
+ do {
+ BTs.clear();
+ const MachineInstr &MI = *It;
+ if (Trace)
+ dbgs() << "Visit BR(BB#" << ThisN << "): " << MI;
+ assert(MI.isBranch() && "Expecting branch instruction");
+ InstrExec.insert(&MI);
+ bool Eval = ME.evaluate(MI, Map, BTs, FallsThrough);
+ if (!Eval) {
+ // If the evaluation failed, we will add all targets. Keep going in
+ // the loop to mark all executable branches as such.
+ DefaultToAll = true;
+ FallsThrough = true;
+ if (Trace)
+ dbgs() << " failed to evaluate: will add all CFG successors\n";
+ } else if (!DefaultToAll) {
+ // If evaluated successfully add the targets to the cumulative list.
+ if (Trace) {
+ dbgs() << " adding targets:";
+ for (unsigned i = 0, n = BTs.size(); i < n; ++i)
+ dbgs() << " BB#" << BTs[i]->getNumber();
+ if (FallsThrough)
+ dbgs() << "\n falls through\n";
+ else
+ dbgs() << "\n does not fall through\n";
+ }
+ Targets.insert(BTs.begin(), BTs.end());
+ }
+ ++It;
+ } while (FallsThrough && It != End);
+ // Complete Targets with CFG successors that no branch names explicitly.
+ typedef MachineBasicBlock::const_succ_iterator succ_iterator;
+ if (!DefaultToAll) {
+ // Need to add all CFG successors that lead to EH landing pads.
+ // There won't be explicit branches to these blocks, but they must
+ // be processed.
+ for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) {
+ const MachineBasicBlock *SB = *I;
+ if (SB->isEHPad())
+ Targets.insert(SB);
+ }
+ if (FallsThrough) { // The last branch may fall into the next block.
+ MachineFunction::const_iterator BIt = B.getIterator();
+ MachineFunction::const_iterator Next = std::next(BIt);
+ if (Next != MF.end())
+ Targets.insert(&*Next);
+ }
+ } else { // Some branch could not be evaluated: take every CFG successor.
+ for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I)
+ Targets.insert(*I);
+ }
+ // Queue an edge from this block to every collected target.
+ for (unsigned i = 0, n = Targets.size(); i < n; ++i) {
+ int TargetN = Targets[i]->getNumber();
+ FlowQ.push(CFGEdge(ThisN, TargetN));
+ }
+}
+
+// Revisit every executable, non-debug instruction that uses register Reg.
+void BT::visitUsesOf(unsigned Reg) {
+ if (Trace)
+ dbgs() << "visiting uses of " << PrintReg(Reg, &ME.TRI) << "\n";
+
+ typedef MachineRegisterInfo::use_nodbg_iterator use_iterator;
+ use_iterator End = MRI.use_nodbg_end();
+ for (use_iterator I = MRI.use_nodbg_begin(Reg); I != End; ++I) {
+ MachineInstr *UseI = I->getParent();
+ if (!InstrExec.count(UseI)) // Skip instructions not marked executable.
+ continue;
+ if (UseI->isPHI()) // Dispatch on the kind of the using instruction.
+ visitPHI(*UseI);
+ else if (!UseI->isBranch())
+ visitNonBranch(*UseI);
+ else
+ visitBranchesFrom(*UseI);
+ }
+}
+
+// Return the cell that the evaluator reads from the tracker's map for RR.
+BT::RegisterCell BT::get(RegisterRef RR) const {
+ return ME.getCell(RR, Map);
+}
+
+// Store cell RC for RR into the tracker's map, through the evaluator.
+void BT::put(RegisterRef RR, const RegisterCell &RC) {
+ ME.putCell(RR, RC, Map);
+}
+
+
+// Rewrite every "ref" bit in the map that points into the bit range of OldRR
+// so that it points to the corresponding bit of NewRR instead.
+void BT::subst(RegisterRef OldRR, RegisterRef NewRR) {
+ assert(Map.count(OldRR.Reg) > 0 && "OldRR not present in map");
+ BitMask OM = ME.mask(OldRR.Reg, OldRR.Sub);
+ BitMask NM = ME.mask(NewRR.Reg, NewRR.Sub);
+ uint16_t OMB = OM.first(), OME = OM.last(); // Old range: first/last bit.
+ uint16_t NMB = NM.first(), NME = NM.last(); // New range: first/last bit.
+ (void)NME; // NME is only used in the assert below.
+ assert((OME-OMB == NME-NMB) &&
+ "Substituting registers of different lengths");
+ for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I) {
+ RegisterCell &RC = I->second;
+ for (uint16_t i = 0, w = RC.width(); i < w; ++i) {
+ BitValue &V = RC[i];
+ if (V.Type != BitValue::Ref || V.RefI.Reg != OldRR.Reg)
+ continue;
+ if (V.RefI.Pos < OMB || V.RefI.Pos > OME) // Outside of OldRR's bits.
+ continue;
+ V.RefI.Reg = NewRR.Reg;
+ V.RefI.Pos += NMB-OMB; // Shift the position into NewRR's bit range.
+ }
+ }
+}
+
+
+// Check if B was "executed" in propagation (some executable edge ends in it).
+// If not, the block is dead, even though it may still appear to be reachable.
+bool BT::reached(const MachineBasicBlock *B) const {
+ int BN = B->getNumber();
+ assert(BN >= 0);
+ for (EdgeSetType::iterator I = EdgeExec.begin(), E = EdgeExec.end();
+ I != E; ++I) { // Linear scan over all executable edges.
+ if (I->second == BN) // "second" is the number of the edge's target block.
+ return true;
+ }
+ return false;
+}
+
+
+// Visit an individual non-branch instruction. This could be a newly added
+// instruction, or one that has been modified by an optimization.
+void BT::visit(const MachineInstr &MI) {
+ assert(!MI.isBranch() && "Only non-branches are allowed");
+ InstrExec.insert(&MI); // Mark MI as executable before evaluating it.
+ visitNonBranch(MI);
+ // The call to visitNonBranch could propagate the changes until a branch
+ // is actually visited. This could result in adding CFG edges to the flow
+ // queue. Since the queue won't be processed, clear it.
+ while (!FlowQ.empty())
+ FlowQ.pop();
+}
+
+// Discard all propagation results: executable edges, instructions and cells.
+void BT::reset() {
+ EdgeExec.clear();
+ InstrExec.clear();
+ Map.clear();
+}
+
+// Run the propagation over MF from scratch, starting at the entry block.
+void BT::run() {
+ reset();
+ assert(FlowQ.empty());
+
+ typedef GraphTraits<const MachineFunction*> MachineFlowGraphTraits;
+ const MachineBasicBlock *Entry = MachineFlowGraphTraits::getEntryNode(&MF);
+ // Find the largest block number to size the visited-block bit vector.
+ unsigned MaxBN = 0;
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ assert(I->getNumber() >= 0 && "Disconnected block");
+ unsigned BN = I->getNumber();
+ if (BN > MaxBN)
+ MaxBN = BN;
+ }
+
+ // Keep track of visited blocks.
+ BitVector BlockScanned(MaxBN+1);
+
+ int EntryN = Entry->getNumber();
+ // Generate a fake edge to get something to start with.
+ FlowQ.push(CFGEdge(-1, EntryN));
+
+ while (!FlowQ.empty()) {
+ CFGEdge Edge = FlowQ.front();
+ FlowQ.pop();
+ // Each edge is processed at most once.
+ if (EdgeExec.count(Edge))
+ continue;
+ EdgeExec.insert(Edge);
+
+ const MachineBasicBlock &B = *MF.getBlockNumbered(Edge.second);
+ MachineBasicBlock::const_iterator It = B.begin(), End = B.end();
+ // Visit PHI nodes first. They are revisited for every new incoming edge.
+ while (It != End && It->isPHI()) {
+ const MachineInstr &PI = *It++;
+ InstrExec.insert(&PI);
+ visitPHI(PI);
+ }
+
+ // If this block has already been visited through a flow graph edge,
+ // then the instructions have already been processed. Any updates to
+ // the cells would now only happen through visitUsesOf...
+ if (BlockScanned[Edge.second])
+ continue;
+ BlockScanned[Edge.second] = true;
+
+ // Visit non-branch instructions.
+ while (It != End && !It->isBranch()) {
+ const MachineInstr &MI = *It++;
+ InstrExec.insert(&MI);
+ visitNonBranch(MI);
+ }
+ // If block end has been reached, add the fall-through edge to the queue.
+ if (It == End) {
+ MachineFunction::const_iterator BIt = B.getIterator();
+ MachineFunction::const_iterator Next = std::next(BIt);
+ if (Next != MF.end() && B.isSuccessor(&*Next)) {
+ int ThisN = B.getNumber();
+ int NextN = Next->getNumber();
+ FlowQ.push(CFGEdge(ThisN, NextN));
+ }
+ } else {
+ // Handle the remaining sequence of branches. This function will update
+ // the work queue.
+ visitBranchesFrom(*It);
+ }
+ } // while (!FlowQ.empty())
+
+ if (Trace)
+ print_cells(dbgs() << "Cells after propagation:\n");
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.h b/contrib/llvm/lib/Target/Hexagon/BitTracker.h
new file mode 100644
index 000000000000..74cafcd00b60
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.h
@@ -0,0 +1,438 @@
+//===--- BitTracker.h -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BITTRACKER_H
+#define BITTRACKER_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+#include <map>
+#include <queue>
+#include <set>
+
+namespace llvm {
+ class ConstantInt;
+ class MachineRegisterInfo;
+ class MachineBasicBlock;
+ class MachineInstr;
+ class MachineOperand;
+ class raw_ostream;
+// Propagates per-bit register values ("cells") through a MachineFunction.
+struct BitTracker {
+ struct BitRef;
+ struct RegisterRef;
+ struct BitValue;
+ struct BitMask;
+ struct RegisterCell;
+ struct MachineEvaluator;
+
+ typedef SetVector<const MachineBasicBlock *> BranchTargetList;
+ // Cells are keyed by register number.
+ typedef std::map<unsigned, RegisterCell> CellMapType;
+
+ BitTracker(const MachineEvaluator &E, MachineFunction &F);
+ ~BitTracker();
+
+ void run(); // Reset the state and propagate over the function.
+ void trace(bool On = false) { Trace = On; } // NB: trace() disables tracing.
+ bool has(unsigned Reg) const; // Is there a cell for Reg?
+ const RegisterCell &lookup(unsigned Reg) const; // Reg must have a cell.
+ RegisterCell get(RegisterRef RR) const;
+ void put(RegisterRef RR, const RegisterCell &RC);
+ void subst(RegisterRef OldRR, RegisterRef NewRR); // Retarget refs to OldRR.
+ bool reached(const MachineBasicBlock *B) const; // Was B ever "executed"?
+ void visit(const MachineInstr &MI); // Evaluate one non-branch instruction.
+
+ void print_cells(raw_ostream &OS) const;
+
+private:
+ void visitPHI(const MachineInstr &PI);
+ void visitNonBranch(const MachineInstr &MI);
+ void visitBranchesFrom(const MachineInstr &BI);
+ void visitUsesOf(unsigned Reg);
+ void reset();
+ // A CFG edge is a pair of block numbers: (source, target).
+ typedef std::pair<int,int> CFGEdge;
+ typedef std::set<CFGEdge> EdgeSetType;
+ typedef std::set<const MachineInstr *> InstrSetType;
+ typedef std::queue<CFGEdge> EdgeQueueType;
+
+ EdgeSetType EdgeExec; // Executable flow graph edges.
+ InstrSetType InstrExec; // Executable instructions.
+ EdgeQueueType FlowQ; // Work queue of CFG edges.
+ bool Trace; // Enable tracing for debugging.
+
+ const MachineEvaluator &ME; // Target-specific instruction evaluator.
+ MachineFunction &MF; // Function being analyzed.
+ MachineRegisterInfo &MRI;
+ CellMapType &Map; // Register number -> cell.
+};
+
+
+// Abstraction of a reference to bit at position Pos from a register Reg.
+struct BitTracker::BitRef {
+ BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {}
+ bool operator== (const BitRef &BR) const {
+ // If Reg is 0, disregard Pos: all refs to register 0 compare equal.
+ return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos);
+ }
+ unsigned Reg; // Register number; 0 means "no specific register".
+ uint16_t Pos; // Bit position within Reg.
+};
+
+
+// Abstraction of a register reference in MachineOperand. It contains the
+// register number and the subregister index (0 when there is none given).
+struct BitTracker::RegisterRef {
+ RegisterRef(unsigned R = 0, unsigned S = 0)
+ : Reg(R), Sub(S) {}
+ RegisterRef(const MachineOperand &MO) // Implicit conversion from operand.
+ : Reg(MO.getReg()), Sub(MO.getSubReg()) {}
+ unsigned Reg, Sub;
+};
+
+
+// Value that a single bit can take. This is outside of the context of
+// any register, it is more of an abstraction of the two-element set of
+// possible bit values. One extension here is the "Ref" type, which
+// indicates that this bit takes the same value as the bit described by
+// the RefI member.
+struct BitTracker::BitValue {
+ enum ValueType {
+ Top, // Bit not yet defined.
+ Zero, // Bit = 0.
+ One, // Bit = 1.
+ Ref // Bit value same as the one described in RefI.
+ // Conceptually, there is no explicit "bottom" value: the lattice's
+ // bottom will be expressed as a "ref to itself", which, in the context
+ // of registers, could be read as "this value of this bit is defined by
+ // this bit".
+ // The ordering is:
+ // x <= Top,
+ // Self <= x, where "Self" is "ref to itself".
+ // This makes the value lattice different for each virtual register
+ // (even for each bit in the same virtual register), since the "bottom"
+ // for one register will be a simple "ref" for another register.
+ // Since we do not store the "Self" bit and register number, the meet
+ // operation will need to take it as a parameter.
+ //
+ // In practice there is a special case for values that are not associa-
+ // ted with any specific virtual register. An example would be a value
+ // corresponding to a bit of a physical register, or an intermediate
+ // value obtained in some computation (such as instruction evaluation).
+ // Such cases are identical to the usual Ref type, but the register
+ // number is 0. In such case the Pos field of the reference is ignored.
+ //
+ // What is worthy of notice is that in value V (that is a "ref"), as long
+ // as the RefI.Reg is not 0, it may actually be the same register as the
+ // one in which V will be contained. If the RefI.Pos refers to the posi-
+ // tion of V, then V is assumed to be "bottom" (as a "ref to itself"),
+ // otherwise V is taken to be identical to the referenced bit of the
+ // same register.
+ // If RefI.Reg is 0, however, such a reference to the same register is
+ // not possible. Any value V that is a "ref", and whose RefI.Reg is 0
+ // is treated as "bottom".
+ };
+ ValueType Type;
+ BitRef RefI; // Only meaningful when Type == Ref.
+
+ BitValue(ValueType T = Top) : Type(T) {}
+ BitValue(bool B) : Type(B ? One : Zero) {}
+ BitValue(unsigned Reg, uint16_t Pos) : Type(Ref), RefI(Reg, Pos) {}
+
+ bool operator== (const BitValue &V) const {
+ if (Type != V.Type)
+ return false;
+ if (Type == Ref && !(RefI == V.RefI)) // RefI is compared for refs only.
+ return false;
+ return true;
+ }
+ bool operator!= (const BitValue &V) const {
+ return !operator==(V);
+ }
+ bool is(unsigned T) const { // Is this bit the constant T (0 or 1)?
+ assert(T == 0 || T == 1);
+ return T == 0 ? Type == Zero
+ : (T == 1 ? Type == One : false);
+ }
+
+ // The "meet" operation is the "." operation in a semilattice (L, ., T, B):
+ // (1) x.x = x
+ // (2) x.y = y.x
+ // (3) x.(y.z) = (x.y).z
+ // (4) x.T = x (i.e. T = "top")
+ // (5) x.B = B (i.e. B = "bottom")
+ //
+ // This "meet" function will update the value of the "*this" object with
+ // the newly calculated one, and return "true" if the value of *this has
+ // changed, and "false" otherwise.
+ // To prove that it satisfies the conditions (1)-(5), it is sufficient
+ // to show that a relation
+ // x <= y <=> x.y = x
+ // defines a partial order (i.e. that "meet" is same as "infimum").
+ bool meet(const BitValue &V, const BitRef &Self) {
+ // First, check the cases where there is nothing to be done.
+ if (Type == Ref && RefI == Self) // Bottom.meet(V) = Bottom (i.e. This)
+ return false;
+ if (V.Type == Top) // This.meet(Top) = This
+ return false;
+ if (*this == V) // This.meet(This) = This
+ return false;
+
+ // At this point, we know that the value of "this" will change.
+ // If it is Top, it will become the same as V, otherwise it will
+ // become "bottom" (i.e. Self).
+ if (Type == Top) {
+ Type = V.Type;
+ RefI = V.RefI; // This may be irrelevant, but copy anyway.
+ return true;
+ }
+ // Become "bottom".
+ Type = Ref;
+ RefI = Self;
+ return true;
+ }
+
+ // Create a reference to the bit value V.
+ static BitValue ref(const BitValue &V);
+ // Create a "self".
+ static BitValue self(const BitRef &Self = BitRef());
+
+ bool num() const { // Is this bit a known constant (0 or 1)?
+ return Type == Zero || Type == One;
+ }
+ operator bool() const { // Constant value of the bit; requires num().
+ assert(Type == Zero || Type == One);
+ return Type == One;
+ }
+
+ friend raw_ostream &operator<<(raw_ostream &OS, const BitValue &BV);
+};
+
+
+// This operation must be idempotent, i.e. ref(ref(V)) == ref(V).
+inline BitTracker::BitValue
+BitTracker::BitValue::ref(const BitValue &V) {
+ if (V.Type != Ref) // Top/Zero/One: keep the type, drop any RefI.
+ return BitValue(V.Type);
+ if (V.RefI.Reg != 0) // Ref to a real register: refer to the same bit.
+ return BitValue(V.RefI.Reg, V.RefI.Pos);
+ return self(); // Ref to register 0: a default "self" value.
+}
+
+// Create a "ref" bit value that refers to the bit described by Self.
+inline BitTracker::BitValue
+BitTracker::BitValue::self(const BitRef &Self) {
+ return BitValue(Self.Reg, Self.Pos);
+}
+
+
+// A sequence of bits starting from index B up to and including index E.
+// If E < B, the mask represents two sections: [0..E] and [B..W) where
+// W is the width of the register.
+struct BitTracker::BitMask {
+ BitMask() : B(0), E(0) {} // Default mask covers bit 0 only.
+ BitMask(uint16_t b, uint16_t e) : B(b), E(e) {}
+ uint16_t first() const { return B; }
+ uint16_t last() const { return E; } // Inclusive.
+private:
+ uint16_t B, E;
+};
+
+
+// Representation of a register: a list of BitValues, indexed by bit number.
+struct BitTracker::RegisterCell {
+ RegisterCell(uint16_t Width = DefaultBitN) : Bits(Width) {}
+
+ uint16_t width() const { // Number of bits in the cell.
+ return Bits.size();
+ }
+ const BitValue &operator[](uint16_t BitN) const {
+ assert(BitN < Bits.size());
+ return Bits[BitN];
+ }
+ BitValue &operator[](uint16_t BitN) {
+ assert(BitN < Bits.size());
+ return Bits[BitN];
+ }
+ // The operations below are defined out of line (not in this header).
+ bool meet(const RegisterCell &RC, unsigned SelfR);
+ RegisterCell &insert(const RegisterCell &RC, const BitMask &M);
+ RegisterCell extract(const BitMask &M) const; // Returns a new cell.
+ RegisterCell &rol(uint16_t Sh); // Rotate left.
+ RegisterCell &fill(uint16_t B, uint16_t E, const BitValue &V);
+ RegisterCell &cat(const RegisterCell &RC); // Concatenate.
+ uint16_t cl(bool B) const; // Presumably "count leading" B bits - confirm.
+ uint16_t ct(bool B) const; // Presumably "count trailing" B bits - confirm.
+
+ bool operator== (const RegisterCell &RC) const;
+ bool operator!= (const RegisterCell &RC) const {
+ return !operator==(RC);
+ }
+
+ // Generate a "ref" cell for the corresponding register. In the resulting
+ // cell each bit will be described as being the same as the corresponding
+ // bit in register Reg (i.e. the cell is "defined" by register Reg).
+ static RegisterCell self(unsigned Reg, uint16_t Width);
+ // Generate a "top" cell of given size.
+ static RegisterCell top(uint16_t Width);
+ // Generate a cell that is a "ref" to another cell.
+ static RegisterCell ref(const RegisterCell &C);
+
+private:
+ // The DefaultBitN is here only to avoid frequent reallocation of the
+ // memory in the vector. It is also the width of a default-built cell.
+ static const unsigned DefaultBitN = 32;
+ typedef SmallVector<BitValue, DefaultBitN> BitValueList;
+ BitValueList Bits;
+
+ friend raw_ostream &operator<<(raw_ostream &OS, const RegisterCell &RC);
+};
+
+// Check if the cell map contains an entry for register Reg.
+inline bool BitTracker::has(unsigned Reg) const {
+ return Map.find(Reg) != Map.end();
+}
+
+// Return the cell stored for Reg; asserts that the register is in the map.
+inline const BitTracker::RegisterCell&
+BitTracker::lookup(unsigned Reg) const {
+ CellMapType::const_iterator F = Map.find(Reg);
+ assert(F != Map.end());
+ return F->second;
+}
+
+// Build a Width-bit cell in which bit i is a "ref" to bit i of register Reg.
+inline BitTracker::RegisterCell
+BitTracker::RegisterCell::self(unsigned Reg, uint16_t Width) {
+ RegisterCell RC(Width);
+ for (uint16_t i = 0; i < Width; ++i)
+ RC.Bits[i] = BitValue::self(BitRef(Reg, i));
+ return RC;
+}
+
+// Build a Width-bit cell with every bit explicitly set to Top.
+inline BitTracker::RegisterCell
+BitTracker::RegisterCell::top(uint16_t Width) {
+ RegisterCell RC(Width);
+ for (uint16_t i = 0; i < Width; ++i)
+ RC.Bits[i] = BitValue(BitValue::Top);
+ return RC;
+}
+
+// Build a cell of C's width whose bit i is BitValue::ref of bit i of C.
+inline BitTracker::RegisterCell
+BitTracker::RegisterCell::ref(const RegisterCell &C) {
+ uint16_t W = C.width();
+ RegisterCell RC(W);
+ for (unsigned i = 0; i < W; ++i)
+ RC[i] = BitValue::ref(C[i]);
+ return RC;
+}
+
+// A class to evaluate target's instructions and update the cell maps.
+// This is used internally by the bit tracker. A target that wants to
+// utilize this should implement the evaluation functions (noted below)
+// in a subclass of this class.
+struct BitTracker::MachineEvaluator {
+ MachineEvaluator(const TargetRegisterInfo &T, MachineRegisterInfo &M)
+ : TRI(T), MRI(M) {}
+ virtual ~MachineEvaluator() {}
+
+ uint16_t getRegBitWidth(const RegisterRef &RR) const;
+
+ RegisterCell getCell(const RegisterRef &RR, const CellMapType &M) const;
+ void putCell(const RegisterRef &RR, RegisterCell RC, CellMapType &M) const;
+ // A result of any operation should use refs to the source cells, not
+ // the cells directly. This function is a convenience wrapper to quickly
+ // generate a ref for a cell corresponding to a register reference.
+ RegisterCell getRef(const RegisterRef &RR, const CellMapType &M) const {
+ RegisterCell RC = getCell(RR, M);
+ return RegisterCell::ref(RC);
+ }
+
+ // Helper functions.
+ // Check if a cell is an immediate value (i.e. all bits are either 0 or 1).
+ bool isInt(const RegisterCell &A) const;
+ // Convert cell to an immediate value.
+ uint64_t toInt(const RegisterCell &A) const;
+
+ // Generate cell from an immediate value.
+ RegisterCell eIMM(int64_t V, uint16_t W) const;
+ RegisterCell eIMM(const ConstantInt *CI) const;
+
+ // Arithmetic (eMLS/eMLU: presumably signed/unsigned multiply - confirm).
+ RegisterCell eADD(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eSUB(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eMLS(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eMLU(const RegisterCell &A1, const RegisterCell &A2) const;
+
+ // Shifts.
+ RegisterCell eASL(const RegisterCell &A1, uint16_t Sh) const;
+ RegisterCell eLSR(const RegisterCell &A1, uint16_t Sh) const;
+ RegisterCell eASR(const RegisterCell &A1, uint16_t Sh) const;
+
+ // Logical.
+ RegisterCell eAND(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eORL(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eXOR(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eNOT(const RegisterCell &A1) const;
+
+ // Set bit, clear bit.
+ RegisterCell eSET(const RegisterCell &A1, uint16_t BitN) const;
+ RegisterCell eCLR(const RegisterCell &A1, uint16_t BitN) const;
+
+ // Count leading/trailing bits (zeros/ones).
+ RegisterCell eCLB(const RegisterCell &A1, bool B, uint16_t W) const;
+ RegisterCell eCTB(const RegisterCell &A1, bool B, uint16_t W) const;
+
+ // Sign/zero extension.
+ RegisterCell eSXT(const RegisterCell &A1, uint16_t FromN) const;
+ RegisterCell eZXT(const RegisterCell &A1, uint16_t FromN) const;
+
+ // Extract/insert
+ // XTR R,b,e: extract bits from A1 starting at bit b, ending at e-1.
+ // INS R,S,b: take R and replace bits starting from b with S.
+ RegisterCell eXTR(const RegisterCell &A1, uint16_t B, uint16_t E) const;
+ RegisterCell eINS(const RegisterCell &A1, const RegisterCell &A2,
+ uint16_t AtN) const;
+
+ // User-provided functions for individual targets:
+
+ // Return a sub-register mask that indicates which bits in Reg belong
+ // to the subregister Sub. These bits are assumed to be contiguous in
+ // the super-register, and have the same ordering in the sub-register
+ // as in the super-register. It is valid to call this function with
+ // Sub == 0, in this case, the function should return a mask that spans
+ // the entire register Reg (which is what the default implementation
+ // does).
+ virtual BitMask mask(unsigned Reg, unsigned Sub) const;
+ // Indicate whether a given register class should be tracked.
+ virtual bool track(const TargetRegisterClass *RC) const { return true; }
+ // Evaluate a non-branching machine instruction, given the cell map with
+ // the input values. Place the results in the Outputs map. Return "true"
+ // if evaluation succeeded, "false" otherwise.
+ virtual bool evaluate(const MachineInstr &MI, const CellMapType &Inputs,
+ CellMapType &Outputs) const;
+ // Evaluate a branch, given the cell map with the input values. Fill out
+ // a list of all possible branch targets and indicate (through a flag)
+ // whether the branch could fall-through. Return "true" if this information
+ // has been successfully computed, "false" otherwise.
+ virtual bool evaluate(const MachineInstr &BI, const CellMapType &Inputs,
+ BranchTargetList &Targets, bool &FallsThru) const = 0; // Pure virtual.
+
+ const TargetRegisterInfo &TRI;
+ MachineRegisterInfo &MRI;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
new file mode 100644
index 000000000000..c05fbc1d7756
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -0,0 +1,1624 @@
+//===-- HexagonDisassembler.cpp - Disassembler for Hexagon ISA ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-disassembler"
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+using namespace llvm;
+using namespace Hexagon;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// \brief Hexagon disassembler for all Hexagon platforms.
+class HexagonDisassembler : public MCDisassembler {
+public:
+ std::unique_ptr<MCInstrInfo const> const MCII; // Owned instruction info.
+ std::unique_ptr<MCInst *> CurrentBundle; // Slot written by getInstruction.
+ // The slot is heap-allocated, so the const getInstruction can still set it.
+ HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ MCInstrInfo const *MCII)
+ : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *) {}
+ // Decode one 32-bit word of the bundle MCB into Instr.
+ DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream, raw_ostream &CStream,
+ bool &Complete) const;
+ DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream,
+ raw_ostream &CStream) const override;
+
+ void adjustExtendedInstructions(MCInst &MCI, MCInst const &MCB) const;
+ void addSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) const;
+};
+
+} // end anonymous namespace
+
+// Forward declare these because the auto-generated code will reference them.
+// Definitions are further down.
+
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn);
+static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
+ void const *Decoder);
+
+static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
+ raw_ostream &os);
+
+static unsigned getRegFromSubinstEncoding(unsigned encoded_reg);
+
+static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+
+#include "HexagonGenDisassemblerTables.inc"
+// Factory: build a HexagonDisassembler with a new MCInstrInfo from target T.
+static MCDisassembler *createHexagonDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new HexagonDisassembler(STI, Ctx, T.createMCInstrInfo());
+}
+// Register the Hexagon disassembler factory with the target registry.
+extern "C" void LLVMInitializeHexagonDisassembler() {
+ TargetRegistry::RegisterMCDisassembler(getTheHexagonTarget(),
+ createHexagonDisassembler);
+}
+// Decode a whole bundle from Bytes into MI; Size receives the bytes consumed.
+DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &os,
+ raw_ostream &cs) const {
+ DecodeStatus Result = DecodeStatus::Success;
+ bool Complete = false;
+ Size = 0;
+ // Publish the bundle being built, then reset MI to an empty bundle.
+ *CurrentBundle = &MI;
+ MI = HexagonMCInstrInfo::createBundle();
+ while (Result == Success && !Complete) { // One instruction word per turn.
+ if (Bytes.size() < HEXAGON_INSTR_SIZE) // Input ended inside the bundle.
+ return MCDisassembler::Fail;
+ MCInst *Inst = new (getContext()) MCInst; // Allocated via the MCContext.
+ Result = getSingleInstruction(*Inst, MI, Bytes, Address, os, cs, Complete);
+ MI.addOperand(MCOperand::createInst(Inst)); // Added even if decode failed.
+ Size += HEXAGON_INSTR_SIZE;
+ Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
+ }
+ if(Result == MCDisassembler::Fail)
+ return Result;
+ HexagonMCChecker Checker (*MCII, STI, MI, MI, *getContext().getRegisterInfo());
+ if(!Checker.check()) // Reject bundles that the checker does not accept.
+ return MCDisassembler::Fail;
+ return MCDisassembler::Success;
+}
+// View the opaque Decoder pointer as the HexagonDisassembler it points to.
+static HexagonDisassembler const &disassembler(void const *Decoder) {
+ return *static_cast<HexagonDisassembler const *>(Decoder);
+}
+// Get the MCContext of the disassembler behind the opaque Decoder pointer.
+static MCContext &contextFromDecoder(void const *Decoder) {
+ return disassembler(Decoder).getContext();
+}
+
+DecodeStatus HexagonDisassembler::getSingleInstruction(
+ MCInst &MI, MCInst &MCB, ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &os, raw_ostream &cs, bool &Complete) const { /* Decodes the one
+ * 32-bit word at the front of Bytes into MI.
+ *
+ * MCB is the bundle built so far: its size is used to interpret loop-end
+ * parse bits, the inner/outer loop flags are set on it, and it is searched
+ * for an extender that applies to this word. New-value producers are looked
+ * up through **CurrentBundle rather than MCB.
+ * Complete is set to true when the word is a duplex or carries the
+ * packet-end parse bits; it is never reset to false here.
+ * os is only forwarded to GetSubinstOpcode; cs is not used in this function.
+ * Returns Fail on any malformed encoding detected below, otherwise the
+ * status produced by the table/special-case decoders.
+ */
+ assert(Bytes.size() >= HEXAGON_INSTR_SIZE);
+
+ uint32_t Instruction =
+ (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); // assemble the word little-endian
+
+ auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB); // number of words already in the bundle
+ if ((Instruction & HexagonII::INST_PARSE_MASK) ==
+ HexagonII::INST_PARSE_LOOP_END) {
+ if (BundleSize == 0) // loop-end bits on the first word mark an inner loop
+ HexagonMCInstrInfo::setInnerLoop(MCB);
+ else if (BundleSize == 1) // ... and on the second word an outer loop
+ HexagonMCInstrInfo::setOuterLoop(MCB);
+ else // loop-end bits on any later word are rejected
+ return DecodeStatus::Fail;
+ }
+
+ DecodeStatus Result = DecodeStatus::Success;
+ if ((Instruction & HexagonII::INST_PARSE_MASK) ==
+ HexagonII::INST_PARSE_DUPLEX) {
+ // Determine the instruction class of each instruction in the duplex.
+ unsigned duplexIClass, IClassLow, IClassHigh;
+
+ duplexIClass = ((Instruction >> 28) & 0xe) | ((Instruction >> 13) & 0x1); // word bits 31:29 form the upper three bits, bit 13 the lowest
+ switch (duplexIClass) { // values 0-14 are mapped; 15 falls to default and fails
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ IClassLow = HexagonII::HSIG_L1;
+ IClassHigh = HexagonII::HSIG_L1;
+ break;
+ case 1:
+ IClassLow = HexagonII::HSIG_L2;
+ IClassHigh = HexagonII::HSIG_L1;
+ break;
+ case 2:
+ IClassLow = HexagonII::HSIG_L2;
+ IClassHigh = HexagonII::HSIG_L2;
+ break;
+ case 3:
+ IClassLow = HexagonII::HSIG_A;
+ IClassHigh = HexagonII::HSIG_A;
+ break;
+ case 4:
+ IClassLow = HexagonII::HSIG_L1;
+ IClassHigh = HexagonII::HSIG_A;
+ break;
+ case 5:
+ IClassLow = HexagonII::HSIG_L2;
+ IClassHigh = HexagonII::HSIG_A;
+ break;
+ case 6:
+ IClassLow = HexagonII::HSIG_S1;
+ IClassHigh = HexagonII::HSIG_A;
+ break;
+ case 7:
+ IClassLow = HexagonII::HSIG_S2;
+ IClassHigh = HexagonII::HSIG_A;
+ break;
+ case 8:
+ IClassLow = HexagonII::HSIG_S1;
+ IClassHigh = HexagonII::HSIG_L1;
+ break;
+ case 9:
+ IClassLow = HexagonII::HSIG_S1;
+ IClassHigh = HexagonII::HSIG_L2;
+ break;
+ case 10:
+ IClassLow = HexagonII::HSIG_S1;
+ IClassHigh = HexagonII::HSIG_S1;
+ break;
+ case 11:
+ IClassLow = HexagonII::HSIG_S2;
+ IClassHigh = HexagonII::HSIG_S1;
+ break;
+ case 12:
+ IClassLow = HexagonII::HSIG_S2;
+ IClassHigh = HexagonII::HSIG_L1;
+ break;
+ case 13:
+ IClassLow = HexagonII::HSIG_S2;
+ IClassHigh = HexagonII::HSIG_L2;
+ break;
+ case 14:
+ IClassLow = HexagonII::HSIG_S2;
+ IClassHigh = HexagonII::HSIG_S2;
+ break;
+ }
+
+ // Set the MCInst to be a duplex instruction. Which one doesn't matter.
+ MI.setOpcode(Hexagon::DuplexIClass0);
+
+ // Decode each instruction in the duplex.
+ // Create an MCInst for each instruction.
+ unsigned instLow = Instruction & 0x1fff; // word bits 12:0
+ unsigned instHigh = (Instruction >> 16) & 0x1fff; // word bits 28:16
+ unsigned opLow;
+ if (GetSubinstOpcode(IClassLow, instLow, opLow, os) !=
+ MCDisassembler::Success)
+ return MCDisassembler::Fail;
+ unsigned opHigh;
+ if (GetSubinstOpcode(IClassHigh, instHigh, opHigh, os) !=
+ MCDisassembler::Success)
+ return MCDisassembler::Fail;
+ MCInst *MILow = new (getContext()) MCInst;
+ MILow->setOpcode(opLow);
+ MCInst *MIHigh = new (getContext()) MCInst;
+ MIHigh->setOpcode(opHigh);
+ addSubinstOperands(MILow, opLow, instLow);
+ addSubinstOperands(MIHigh, opHigh, instHigh);
+ // see ConvertToSubInst() in
+ // lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+
+ // Add the duplex instruction MCInsts as operands to the passed in MCInst.
+ MCOperand OPLow = MCOperand::createInst(MILow);
+ MCOperand OPHigh = MCOperand::createInst(MIHigh);
+ MI.addOperand(OPLow); // operand 0 is the low sub-instruction
+ MI.addOperand(OPHigh); // operand 1 is the high sub-instruction
+ Complete = true; // a duplex word always finishes the packet here
+ } else {
+ if ((Instruction & HexagonII::INST_PARSE_MASK) ==
+ HexagonII::INST_PARSE_PACKET_END)
+ Complete = true;
+ // Calling the auto-generated decoder function.
+ Result =
+ decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI);
+
+ // If a "standard" insn isn't found, check the special cases.
+ if (MCDisassembler::Success != Result ||
+ MI.getOpcode() == Hexagon::A4_ext) { // A4_ext is re-decoded by decodeImmext even when the table decoder succeeded
+ Result = decodeImmext(MI, Instruction, this);
+ if (MCDisassembler::Success != Result) {
+ Result = decodeSpecial(MI, Instruction);
+ }
+ } else {
+ // If the instruction is a compound instruction, register values will
+ // follow the duplex model, so the register values in the MCInst are
+ // incorrect. If the instruction is a compound, loop through the
+ // operands and change registers appropriately.
+ if (HexagonMCInstrInfo::getType(*MCII, MI) == HexagonII::TypeCOMPOUND) {
+ for (MCInst::iterator i = MI.begin(), last = MI.end(); i < last; ++i) {
+ if (i->isReg()) {
+ unsigned reg = i->getReg() - Hexagon::R0; // index of the register relative to R0
+ i->setReg(getRegFromSubinstEncoding(reg));
+ }
+ }
+ }
+ }
+ }
+
+ switch(MI.getOpcode()) { // these cmp*n1 jump opcodes get an explicit -1 expression inserted as operand 1
+ case Hexagon::J4_cmpeqn1_f_jumpnv_nt:
+ case Hexagon::J4_cmpeqn1_f_jumpnv_t:
+ case Hexagon::J4_cmpeqn1_fp0_jump_nt:
+ case Hexagon::J4_cmpeqn1_fp0_jump_t:
+ case Hexagon::J4_cmpeqn1_fp1_jump_nt:
+ case Hexagon::J4_cmpeqn1_fp1_jump_t:
+ case Hexagon::J4_cmpeqn1_t_jumpnv_nt:
+ case Hexagon::J4_cmpeqn1_t_jumpnv_t:
+ case Hexagon::J4_cmpeqn1_tp0_jump_nt:
+ case Hexagon::J4_cmpeqn1_tp0_jump_t:
+ case Hexagon::J4_cmpeqn1_tp1_jump_nt:
+ case Hexagon::J4_cmpeqn1_tp1_jump_t:
+ case Hexagon::J4_cmpgtn1_f_jumpnv_nt:
+ case Hexagon::J4_cmpgtn1_f_jumpnv_t:
+ case Hexagon::J4_cmpgtn1_fp0_jump_nt:
+ case Hexagon::J4_cmpgtn1_fp0_jump_t:
+ case Hexagon::J4_cmpgtn1_fp1_jump_nt:
+ case Hexagon::J4_cmpgtn1_fp1_jump_t:
+ case Hexagon::J4_cmpgtn1_t_jumpnv_nt:
+ case Hexagon::J4_cmpgtn1_t_jumpnv_t:
+ case Hexagon::J4_cmpgtn1_tp0_jump_nt:
+ case Hexagon::J4_cmpgtn1_tp0_jump_t:
+ case Hexagon::J4_cmpgtn1_tp1_jump_nt:
+ case Hexagon::J4_cmpgtn1_tp1_jump_t:
+ MI.insert(MI.begin() + 1, MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
+ break;
+ default:
+ break;
+ }
+
+ if (HexagonMCInstrInfo::isNewValue(*MCII, MI)) { // replace the encoded new-value operand with the producer's actual register
+ unsigned OpIndex = HexagonMCInstrInfo::getNewValueOp(*MCII, MI);
+ MCOperand &MCO = MI.getOperand(OpIndex);
+ assert(MCO.isReg() && "New value consumers must be registers");
+ unsigned Register =
+ getContext().getRegisterInfo()->getEncodingValue(MCO.getReg());
+ if ((Register & 0x6) == 0)
+ // HexagonPRM 10.11 Bit 1-2 == 0 is reserved
+ return MCDisassembler::Fail;
+ unsigned Lookback = (Register & 0x6) >> 1; // how many instructions back the producer sits (1-3 before adjustment)
+ unsigned Offset = 1;
+ bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI);
+ auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+ auto i = Instructions.end() - 1; // start at the last instruction already in the bundle
+ for (auto n = Instructions.begin() - 1;; --i, ++Offset) { // walk backwards; n is the one-before-begin sentinel
+ if (i == n)
+ // Couldn't find producer
+ return MCDisassembler::Fail;
+ if (Vector && !HexagonMCInstrInfo::isVector(*MCII, *i->getInst()))
+ // Skip scalars when calculating distances for vectors
+ ++Lookback;
+ if (HexagonMCInstrInfo::isImmext(*i->getInst())) // extenders do not count towards the distance either
+ ++Lookback;
+ if (Offset == Lookback)
+ break;
+ }
+ auto const &Inst = *i->getInst(); // candidate producer
+ bool SubregBit = (Register & 0x1) != 0;
+ if (SubregBit && HexagonMCInstrInfo::hasNewValue2(*MCII, Inst)) {
+ // If subreg bit is set we're selecting the second produced newvalue
+ unsigned Producer =
+ HexagonMCInstrInfo::getNewValueOperand2(*MCII, Inst).getReg();
+ assert(Producer != Hexagon::NoRegister);
+ MCO.setReg(Producer);
+ } else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) {
+ unsigned Producer =
+ HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg();
+ if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
+ Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0; // pick V(2n) or V(2n+1) out of pair Wn according to SubregBit
+ else if (SubregBit)
+ // Hexagon PRM 10.11 New-value operands
+ // Nt[0] is reserved and should always be encoded as zero.
+ return MCDisassembler::Fail;
+ assert(Producer != Hexagon::NoRegister);
+ MCO.setReg(Producer);
+ } else // the instruction at that distance produces no new value
+ return MCDisassembler::Fail;
+ }
+
+ adjustExtendedInstructions(MI, MCB);
+ MCInst const *Extender =
+ HexagonMCInstrInfo::extenderForIndex(MCB,
+ HexagonMCInstrInfo::bundleSize(MCB));
+ if(Extender != nullptr) { // an extender applies to this word, so the word must be able to use it
+ MCInst const & Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI) ?
+ *MI.getOperand(1).getInst() : MI; // for a duplex, the high sub-instruction (operand 1) is the one checked
+ if (!HexagonMCInstrInfo::isExtendable(*MCII, Inst) &&
+ !HexagonMCInstrInfo::isExtended(*MCII, Inst))
+ return MCDisassembler::Fail;
+ }
+ return Result;
+}
+
+/// Turn an absolute-addressing load/store into its GP-relative counterpart
+/// when the bundle MCB holds no extender for the slot MCI is about to take.
+///
+/// GP-relative and absolute addressing instructions share the same encoding
+/// bits, but an absolute addressing instruction must follow an immediate
+/// extender. The disassembler always selects the absolute form first; this
+/// routine swaps in the GP-relative opcode when that extender is absent.
+/// Opcodes outside the list below are left untouched.
+void HexagonDisassembler::adjustExtendedInstructions(MCInst &MCI,
+                                                     MCInst const &MCB) const {
+  if (HexagonMCInstrInfo::hasExtenderForIndex(
+          MCB, HexagonMCInstrInfo::bundleSize(MCB)))
+    return;
+
+  switch (MCI.getOpcode()) {
+  // Stores.
+  case Hexagon::PS_storerbabs:
+    return MCI.setOpcode(Hexagon::S2_storerbgp);
+  case Hexagon::PS_storerhabs:
+    return MCI.setOpcode(Hexagon::S2_storerhgp);
+  case Hexagon::PS_storerfabs:
+    return MCI.setOpcode(Hexagon::S2_storerfgp);
+  case Hexagon::PS_storeriabs:
+    return MCI.setOpcode(Hexagon::S2_storerigp);
+  case Hexagon::PS_storerbnewabs:
+    return MCI.setOpcode(Hexagon::S2_storerbnewgp);
+  case Hexagon::PS_storerhnewabs:
+    return MCI.setOpcode(Hexagon::S2_storerhnewgp);
+  case Hexagon::PS_storerinewabs:
+    return MCI.setOpcode(Hexagon::S2_storerinewgp);
+  case Hexagon::PS_storerdabs:
+    return MCI.setOpcode(Hexagon::S2_storerdgp);
+  // Loads.
+  case Hexagon::PS_loadrbabs:
+    return MCI.setOpcode(Hexagon::L2_loadrbgp);
+  case Hexagon::PS_loadrubabs:
+    return MCI.setOpcode(Hexagon::L2_loadrubgp);
+  case Hexagon::PS_loadrhabs:
+    return MCI.setOpcode(Hexagon::L2_loadrhgp);
+  case Hexagon::PS_loadruhabs:
+    return MCI.setOpcode(Hexagon::L2_loadruhgp);
+  case Hexagon::PS_loadriabs:
+    return MCI.setOpcode(Hexagon::L2_loadrigp);
+  case Hexagon::PS_loadrdabs:
+    return MCI.setOpcode(Hexagon::L2_loadrdgp);
+  default:
+    // Keep whatever opcode was decoded.
+    return;
+  }
+}
+
+/// Shared helper for the table-driven register decoders: appends the register
+/// stored at Table[RegNo] to Inst, or fails when RegNo is past the table end.
+static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
+                                        ArrayRef<MCPhysReg> Table) {
+  if (RegNo >= Table.size())
+    return MCDisassembler::Fail;
+
+  MCOperand const Operand = MCOperand::createReg(Table[RegNo]);
+  Inst.addOperand(Operand);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) { // Decode hook for the IntRegsLow8 class; forwards all arguments to the IntRegs decoder.
+ return DecodeIntRegsRegisterClass(Inst, RegNo, Address, Decoder); // NOTE(review): no narrower range check than IntRegs is applied here — confirm that is intended
+}
+
+/// Decode a scalar register field: RegNo 0..31 selects Hexagon::R0..R31 and
+/// is appended to Inst; larger values fail. Address and Decoder are unused.
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  static const MCPhysReg ScalarRegs[] = {
+      Hexagon::R0,  Hexagon::R1,  Hexagon::R2,  Hexagon::R3,
+      Hexagon::R4,  Hexagon::R5,  Hexagon::R6,  Hexagon::R7,
+      Hexagon::R8,  Hexagon::R9,  Hexagon::R10, Hexagon::R11,
+      Hexagon::R12, Hexagon::R13, Hexagon::R14, Hexagon::R15,
+      Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
+      Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23,
+      Hexagon::R24, Hexagon::R25, Hexagon::R26, Hexagon::R27,
+      Hexagon::R28, Hexagon::R29, Hexagon::R30, Hexagon::R31};
+
+  ArrayRef<MCPhysReg> const Table(ScalarRegs);
+  return DecodeRegisterClass(Inst, RegNo, Table);
+}
+
+/// Decode a vector register field: RegNo 0..31 selects Hexagon::V0..V31 and
+/// is appended to Inst; larger values fail. Decoder is unused.
+static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t /*Address*/,
+                                                  const void *Decoder) {
+  static const MCPhysReg VectorRegs[] = {
+      Hexagon::V0,  Hexagon::V1,  Hexagon::V2,  Hexagon::V3,
+      Hexagon::V4,  Hexagon::V5,  Hexagon::V6,  Hexagon::V7,
+      Hexagon::V8,  Hexagon::V9,  Hexagon::V10, Hexagon::V11,
+      Hexagon::V12, Hexagon::V13, Hexagon::V14, Hexagon::V15,
+      Hexagon::V16, Hexagon::V17, Hexagon::V18, Hexagon::V19,
+      Hexagon::V20, Hexagon::V21, Hexagon::V22, Hexagon::V23,
+      Hexagon::V24, Hexagon::V25, Hexagon::V26, Hexagon::V27,
+      Hexagon::V28, Hexagon::V29, Hexagon::V30, Hexagon::V31};
+
+  ArrayRef<MCPhysReg> const Table(VectorRegs);
+  return DecodeRegisterClass(Inst, RegNo, Table);
+}
+
+/// Decode a 64-bit register pair field. The field value is halved (its low
+/// bit is dropped) to index Hexagon::D0..D15; the result is appended to Inst.
+static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t /*Address*/,
+                                                  const void *Decoder) {
+  static const MCPhysReg PairRegs[] = {
+      Hexagon::D0,  Hexagon::D1,  Hexagon::D2,  Hexagon::D3,  Hexagon::D4,
+      Hexagon::D5,  Hexagon::D6,  Hexagon::D7,  Hexagon::D8,  Hexagon::D9,
+      Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::D14,
+      Hexagon::D15};
+
+  unsigned const PairIndex = RegNo / 2;
+  return DecodeRegisterClass(Inst, PairIndex, PairRegs);
+}
+
+/// Decode a vector register pair field. The field value is halved (its low
+/// bit is dropped) to index Hexagon::W0..W15; the result is appended to Inst.
+static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t /*Address*/,
+                                                  const void *Decoder) {
+  static const MCPhysReg VectorPairRegs[] = {
+      Hexagon::W0,  Hexagon::W1,  Hexagon::W2,  Hexagon::W3,  Hexagon::W4,
+      Hexagon::W5,  Hexagon::W6,  Hexagon::W7,  Hexagon::W8,  Hexagon::W9,
+      Hexagon::W10, Hexagon::W11, Hexagon::W12, Hexagon::W13, Hexagon::W14,
+      Hexagon::W15};
+
+  unsigned const PairIndex = RegNo / 2;
+  return DecodeRegisterClass(Inst, PairIndex, VectorPairRegs);
+}
+
+/// Decode a predicate register field: RegNo 0..3 selects Hexagon::P0..P3 and
+/// is appended to Inst; larger values fail.
+static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                uint64_t /*Address*/,
+                                                const void *Decoder) {
+  static const MCPhysReg Predicates[] = {
+      Hexagon::P0,
+      Hexagon::P1,
+      Hexagon::P2,
+      Hexagon::P3,
+  };
+
+  ArrayRef<MCPhysReg> const Table(Predicates);
+  return DecodeRegisterClass(Inst, RegNo, Table);
+}
+
+/// Decode a vector predicate register field: RegNo 0..3 selects
+/// Hexagon::Q0..Q3 and is appended to Inst; larger values fail.
+static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                   uint64_t /*Address*/,
+                                                   const void *Decoder) {
+  static const MCPhysReg VectorPredicates[] = {
+      Hexagon::Q0,
+      Hexagon::Q1,
+      Hexagon::Q2,
+      Hexagon::Q3,
+  };
+
+  ArrayRef<MCPhysReg> const Table(VectorPredicates);
+  return DecodeRegisterClass(Inst, RegNo, Table);
+}
+
+/// Decode a control register field by direct table lookup. RegNo 0..15 maps
+/// to the entry at that position below; an index past the table, or an entry
+/// equal to Hexagon::NoRegister, fails. On success the register is appended
+/// to Inst.
+static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t /*Address*/,
+                                               const void *Decoder) {
+  static const MCPhysReg ControlRegs[] = {
+      Hexagon::SA0,  Hexagon::LC0, Hexagon::SA1,  Hexagon::LC1,
+      Hexagon::P3_0, Hexagon::C5,  Hexagon::C6,   Hexagon::C7,
+      Hexagon::USR,  Hexagon::PC,  Hexagon::UGP,  Hexagon::GP,
+      Hexagon::CS0,  Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC};
+
+  bool const InRange = RegNo < array_lengthof(ControlRegs);
+  if (!InRange || ControlRegs[RegNo] == Hexagon::NoRegister)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(ControlRegs[RegNo]));
+  return MCDisassembler::Success;
+}
+
+/// Decode a 64-bit control register pair field by direct table lookup. Only
+/// the even positions of the table hold a register; odd positions hold
+/// Hexagon::NoRegister and are rejected, as is any index past the table.
+/// On success the register is appended to Inst.
+static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                                 uint64_t /*Address*/,
+                                                 const void *Decoder) {
+  static const MCPhysReg ControlPairs[] = {
+      Hexagon::C1_0,   Hexagon::NoRegister, Hexagon::C3_2, Hexagon::NoRegister,
+      Hexagon::C7_6,   Hexagon::NoRegister, Hexagon::C9_8, Hexagon::NoRegister,
+      Hexagon::C11_10, Hexagon::NoRegister, Hexagon::CS,   Hexagon::NoRegister,
+      Hexagon::UPC,    Hexagon::NoRegister};
+
+  bool const InRange = RegNo < array_lengthof(ControlPairs);
+  if (!InRange || ControlPairs[RegNo] == Hexagon::NoRegister)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(ControlPairs[RegNo]));
+  return MCDisassembler::Success;
+}
+
+/// Decode a modifier register field: 0 selects Hexagon::M0, 1 selects
+/// Hexagon::M1, anything else fails. The register is appended to Inst.
+static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t /*Address*/,
+                                               const void *Decoder) {
+  if (RegNo > 1)
+    return MCDisassembler::Fail;
+
+  unsigned const Modifier = (RegNo == 0) ? Hexagon::M0 : Hexagon::M1;
+  Inst.addOperand(MCOperand::createReg(Modifier));
+  return MCDisassembler::Success;
+}
+
+/// Combine a decoded immediate with a pending constant extender, if any.
+///
+/// Value is returned unchanged (narrowed to 32 bits by the return type) when
+/// the bundle MCB has no extender for the next slot, or when the operand
+/// about to be appended to MI (index MI.size()) is not MI's extendable
+/// operand. Otherwise the result is the extender's constant OR'ed with the
+/// low six bits of Value after shifting Value right by the operand's extent
+/// alignment.
+static uint32_t fullValue(MCInstrInfo const &MCII, MCInst &MCB, MCInst &MI,
+                          int64_t Value) {
+  auto const Slot = HexagonMCInstrInfo::bundleSize(MCB);
+  MCInst const *const Prefix = HexagonMCInstrInfo::extenderForIndex(MCB, Slot);
+  if (Prefix == nullptr)
+    return Value;
+  if (MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI))
+    return Value;
+
+  // The extender's only operand is expected to fold to a constant.
+  int64_t ExtenderBits;
+  bool const Folded =
+      Prefix->getOperand(0).getExpr()->evaluateAsAbsolute(ExtenderBits);
+  assert(Folded);
+  (void)Folded;
+
+  unsigned const Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+  uint32_t const LowSix = static_cast<uint32_t>(Value >> Shift) & 0x3f;
+  return static_cast<uint32_t>(ExtenderBits) | LowSix;
+}
+
+/// Append a signed immediate operand to MI.
+///
+/// tmp is sign-extended from T bits, merged with any pending extender through
+/// fullValue, and the 32-bit result is sign-extended again before being added
+/// as a constant operand.
+template <size_t T>
+static void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) {
+  HexagonDisassembler const &Self = disassembler(Decoder);
+  int64_t const Field = SignExtend64<T>(tmp);
+  uint32_t const Combined =
+      fullValue(*Self.MCII, **Self.CurrentBundle, MI, Field);
+  HexagonMCInstrInfo::addConstant(MI, SignExtend64<32>(Combined),
+                                  Self.getContext());
+}
+
+/// Append an unsigned immediate operand to MI. tmp is merged with any pending
+/// extender through fullValue and added as a constant operand without sign
+/// extension. Always reports Success.
+static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
+                                       uint64_t /*Address*/,
+                                       const void *Decoder) {
+  HexagonDisassembler const &Self = disassembler(Decoder);
+  int64_t const Combined =
+      fullValue(*Self.MCII, **Self.CurrentBundle, MI, tmp);
+  assert(Combined >= 0 && "Negative in unsigned decoder");
+  HexagonMCInstrInfo::addConstant(MI, Combined, Self.getContext());
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<16>(MI, tmp, Decoder); // tmp is read as a 16-bit two's-complement field
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<12>(MI, tmp, Decoder); // tmp is read as a 12-bit two's-complement field
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<11>(MI, tmp, Decoder); // tmp is read as an 11-bit two's-complement field
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ HexagonMCInstrInfo::addConstant(MI, SignExtend64<12>(tmp), contextFromDecoder(Decoder)); // 12-bit sign extension added directly. NOTE(review): unlike the sibling decoders this bypasses signedDecoder/fullValue, so no extender is merged in — confirm that is intended
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<13>(MI, tmp, Decoder); // tmp is read as a 13-bit two's-complement field (11 + 2)
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<14>(MI, tmp, Decoder); // tmp is read as a 14-bit two's-complement field (11 + 3)
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<10>(MI, tmp, Decoder); // tmp is read as a 10-bit two's-complement field
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/,
+ const void *Decoder) {
+ signedDecoder<8>(MI, tmp, Decoder); // tmp is read as an 8-bit two's-complement field
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<6>(MI, tmp, Decoder); // tmp is read as a 6-bit two's-complement field
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<4>(MI, tmp, Decoder); // tmp is read as a 4-bit two's-complement field
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<5>(MI, tmp, Decoder); // tmp is read as a 5-bit two's-complement field (4 + 1)
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<6>(MI, tmp, Decoder); // tmp is read as a 6-bit two's-complement field (4 + 2)
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<7>(MI, tmp, Decoder); // tmp is read as a 7-bit two's-complement field (4 + 3)
+ return MCDisassembler::Success; // this hook never fails
+}
+
+static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<10>(MI, tmp, Decoder); // tmp is read as a 10-bit two's-complement field (4 + 6)
+ return MCDisassembler::Success; // this hook never fails
+}
+
+/// Decode hook for s3_6 immediates: tmp is read as a 9-bit two's-complement
+/// field (3 + 6), merged with any pending extender by signedDecoder, and
+/// appended to MI. Always reports Success.
+static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp,
+                                   uint64_t /*Address*/, const void *Decoder) {
+  // Every sibling sN_M decoder sign-extends from N + M bits (s4_6 -> 10,
+  // s11_3 -> 14). The previous width of 19 left the sign bit of a 9-bit
+  // field untouched, so negative offsets decoded as large positive values.
+  signedDecoder<9>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+/// Custom decoder for the various jump/call immediates.
+///
+/// The field width comes from the instruction's extent bits; an instruction
+/// with no extent bits is taken to be the non-extendable r13_2 form and uses
+/// 15 bits. The sign-extended field is merged with any pending extender,
+/// sign-extended from 32 bits, and made absolute by adding Address. The
+/// target is added symbolically when possible, otherwise as a constant.
+/// Always reports Success.
+static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                    const void *Decoder) {
+  HexagonDisassembler const &Self = disassembler(Decoder);
+
+  unsigned Width = HexagonMCInstrInfo::getExtentBits(*Self.MCII, MI);
+  if (Width == 0)
+    Width = 15; // r13_2 is not extendable, hence no extent bits
+
+  uint32_t const Combined = fullValue(*Self.MCII, **Self.CurrentBundle, MI,
+                                      SignExtend64(tmp, Width));
+  int64_t const Target = SignExtend64<32>(Combined) + Address;
+
+  bool const Symbolized =
+      Self.tryAddingSymbolicOperand(MI, Target, Address, true, 0, 4);
+  if (!Symbolized)
+    HexagonMCInstrInfo::addConstant(MI, Target, Self.getContext());
+  return MCDisassembler::Success;
+}
+
+// Addressing mode dependent load store opcode map.
+// - If an insn is preceded by an extender the address is absolute.
+// - memw(##symbol) = r0
+// - If an insn is not preceded by an extender the address is GP relative.
+// - memw(gp + #symbol) = r0
+// Please note that the instructions must be ordered in the descending order
+// of their opcode.
+// HexagonII::INST_ICLASS_ST
+static const unsigned int StoreConditionalOpcodeData[][2] = { // each row is {LLVM opcode, encoding bits}; decodeSpecial takes the first row whose bits are all set in the word, so rows with more bits set must come before rows they contain
+ {S4_pstorerdfnew_abs, 0xafc02084},
+ {S4_pstorerdtnew_abs, 0xafc02080},
+ {S4_pstorerdf_abs, 0xafc00084},
+ {S4_pstorerdt_abs, 0xafc00080},
+ {S4_pstorerinewfnew_abs, 0xafa03084},
+ {S4_pstorerinewtnew_abs, 0xafa03080},
+ {S4_pstorerhnewfnew_abs, 0xafa02884},
+ {S4_pstorerhnewtnew_abs, 0xafa02880},
+ {S4_pstorerbnewfnew_abs, 0xafa02084},
+ {S4_pstorerbnewtnew_abs, 0xafa02080},
+ {S4_pstorerinewf_abs, 0xafa01084},
+ {S4_pstorerinewt_abs, 0xafa01080},
+ {S4_pstorerhnewf_abs, 0xafa00884},
+ {S4_pstorerhnewt_abs, 0xafa00880},
+ {S4_pstorerbnewf_abs, 0xafa00084},
+ {S4_pstorerbnewt_abs, 0xafa00080},
+ {S4_pstorerifnew_abs, 0xaf802084},
+ {S4_pstoreritnew_abs, 0xaf802080},
+ {S4_pstorerif_abs, 0xaf800084},
+ {S4_pstorerit_abs, 0xaf800080},
+ {S4_pstorerhfnew_abs, 0xaf402084},
+ {S4_pstorerhtnew_abs, 0xaf402080},
+ {S4_pstorerhf_abs, 0xaf400084},
+ {S4_pstorerht_abs, 0xaf400080},
+ {S4_pstorerbfnew_abs, 0xaf002084},
+ {S4_pstorerbtnew_abs, 0xaf002080},
+ {S4_pstorerbf_abs, 0xaf000084},
+ {S4_pstorerbt_abs, 0xaf000080}};
+// HexagonII::INST_ICLASS_LD
+
+// HexagonII::INST_ICLASS_LD_ST_2
+/// Lookup table for absolute-addressing loads and stores in the
+/// INST_ICLASS_LD_ST_2 class. Each row is {LLVM opcode, encoding bits}.
+/// decodeSpecial takes the first row whose bits are all set in the word, so
+/// rows with more bits set must come before rows they contain. The table is
+/// only ever read, so it is const like StoreConditionalOpcodeData.
+static const unsigned int LoadStoreOpcodeData[][2] = {
+    {PS_loadrdabs, 0x49c00000},
+    {PS_loadriabs, 0x49800000},
+    {PS_loadruhabs, 0x49600000},
+    {PS_loadrhabs, 0x49400000},
+    {PS_loadrubabs, 0x49200000},
+    {PS_loadrbabs, 0x49000000},
+    {PS_storerdabs, 0x48c00000},
+    {PS_storerinewabs, 0x48a01000},
+    {PS_storerhnewabs, 0x48a00800},
+    {PS_storerbnewabs, 0x48a00000},
+    {PS_storeriabs, 0x48800000},
+    {PS_storerfabs, 0x48600000},
+    {PS_storerhabs, 0x48400000},
+    {PS_storerbabs, 0x48000000}};
+static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData); // row count of the conditional-store table
+static const size_t NumLS = array_lengthof(LoadStoreOpcodeData); // row count of the load/store table
+
+static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { /* Hand-written
+ * fallback decoder for absolute-addressing loads and stores.
+ *
+ * insn is matched against StoreConditionalOpcodeData (class INST_ICLASS_ST)
+ * or LoadStoreOpcodeData (class INST_ICLASS_LD_ST_2); the first row whose
+ * encoding bits are all set in insn wins. On a match the opcode is set on MI
+ * and its operands are extracted bit-field by bit-field below. Returns Fail
+ * when no row matches or the matched opcode has no case in the switch.
+ * The status returned by the Decode*RegisterClass helpers is ignored here.
+ */
+ unsigned MachineOpcode = 0;
+ unsigned LLVMOpcode = 0;
+
+ if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) {
+ for (size_t i = 0; i < NumCondS; ++i) {
+ if ((insn & StoreConditionalOpcodeData[i][1]) ==
+ StoreConditionalOpcodeData[i][1]) { // every bit of the row's pattern is set in insn
+ MachineOpcode = StoreConditionalOpcodeData[i][1];
+ LLVMOpcode = StoreConditionalOpcodeData[i][0];
+ break;
+ }
+ }
+ }
+ if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) {
+ for (size_t i = 0; i < NumLS; ++i) {
+ if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) {
+ MachineOpcode = LoadStoreOpcodeData[i][1];
+ LLVMOpcode = LoadStoreOpcodeData[i][0];
+ break;
+ }
+ }
+ }
+
+ if (MachineOpcode) { // non-zero only when one of the tables matched
+ unsigned Value = 0;
+ unsigned shift = 0; // left shift applied to the 16-bit address field; bumped by the fall-through cases below
+ MI.setOpcode(LLVMOpcode);
+ // Remove the parse bits from the insn.
+ insn &= ~HexagonII::INST_PARSE_MASK;
+
+ switch (LLVMOpcode) {
+ default:
+ return MCDisassembler::Fail;
+ break;
+
+ case Hexagon::S4_pstorerdf_abs:
+ case Hexagon::S4_pstorerdt_abs:
+ case Hexagon::S4_pstorerdfnew_abs:
+ case Hexagon::S4_pstorerdtnew_abs:
+ // op: Pv
+ Value = insn & UINT64_C(3);
+ DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+ // op: u6
+ Value = (insn >> 12) & UINT64_C(48); // insn bits 17:16 become u6 bits 5:4
+ Value |= (insn >> 3) & UINT64_C(15); // insn bits 6:3 become u6 bits 3:0
+ MI.addOperand(MCOperand::createImm(Value));
+ // op: Rtt
+ Value = (insn >> 8) & UINT64_C(31);
+ DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
+ break;
+
+ case Hexagon::S4_pstorerbnewf_abs:
+ case Hexagon::S4_pstorerbnewt_abs:
+ case Hexagon::S4_pstorerbnewfnew_abs:
+ case Hexagon::S4_pstorerbnewtnew_abs:
+ case Hexagon::S4_pstorerhnewf_abs:
+ case Hexagon::S4_pstorerhnewt_abs:
+ case Hexagon::S4_pstorerhnewfnew_abs:
+ case Hexagon::S4_pstorerhnewtnew_abs:
+ case Hexagon::S4_pstorerinewf_abs:
+ case Hexagon::S4_pstorerinewt_abs:
+ case Hexagon::S4_pstorerinewfnew_abs:
+ case Hexagon::S4_pstorerinewtnew_abs:
+ // op: Pv
+ Value = insn & UINT64_C(3);
+ DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+ // op: u6
+ Value = (insn >> 12) & UINT64_C(48);
+ Value |= (insn >> 3) & UINT64_C(15);
+ MI.addOperand(MCOperand::createImm(Value));
+ // op: Nt
+ Value = (insn >> 8) & UINT64_C(7); // new-value operand: only a 3-bit field is read
+ DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+ break;
+
+ case Hexagon::S4_pstorerbf_abs:
+ case Hexagon::S4_pstorerbt_abs:
+ case Hexagon::S4_pstorerbfnew_abs:
+ case Hexagon::S4_pstorerbtnew_abs:
+ case Hexagon::S4_pstorerhf_abs:
+ case Hexagon::S4_pstorerht_abs:
+ case Hexagon::S4_pstorerhfnew_abs:
+ case Hexagon::S4_pstorerhtnew_abs:
+ case Hexagon::S4_pstorerif_abs:
+ case Hexagon::S4_pstorerit_abs:
+ case Hexagon::S4_pstorerifnew_abs:
+ case Hexagon::S4_pstoreritnew_abs:
+ // op: Pv
+ Value = insn & UINT64_C(3);
+ DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+ // op: u6
+ Value = (insn >> 12) & UINT64_C(48);
+ Value |= (insn >> 3) & UINT64_C(15);
+ MI.addOperand(MCOperand::createImm(Value));
+ // op: Rt
+ Value = (insn >> 8) & UINT64_C(31);
+ DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+ break;
+
+ case Hexagon::L4_ploadrdf_abs: // NOTE(review): no L4_ploadr*_abs opcode appears in either lookup table above, so the two load groups below look unreachable — confirm
+ case Hexagon::L4_ploadrdt_abs:
+ case Hexagon::L4_ploadrdfnew_abs:
+ case Hexagon::L4_ploadrdtnew_abs:
+ // op: Rdd
+ Value = insn & UINT64_C(31);
+ DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
+ // op: Pt
+ Value = ((insn >> 9) & UINT64_C(3));
+ DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+ // op: u6
+ Value = ((insn >> 15) & UINT64_C(62)); // insn bits 20:16 become u6 bits 5:1
+ Value |= ((insn >> 8) & UINT64_C(1)); // insn bit 8 becomes u6 bit 0
+ MI.addOperand(MCOperand::createImm(Value));
+ break;
+
+ case Hexagon::L4_ploadrbf_abs:
+ case Hexagon::L4_ploadrbt_abs:
+ case Hexagon::L4_ploadrbfnew_abs:
+ case Hexagon::L4_ploadrbtnew_abs:
+ case Hexagon::L4_ploadrhf_abs:
+ case Hexagon::L4_ploadrht_abs:
+ case Hexagon::L4_ploadrhfnew_abs:
+ case Hexagon::L4_ploadrhtnew_abs:
+ case Hexagon::L4_ploadrubf_abs:
+ case Hexagon::L4_ploadrubt_abs:
+ case Hexagon::L4_ploadrubfnew_abs:
+ case Hexagon::L4_ploadrubtnew_abs:
+ case Hexagon::L4_ploadruhf_abs:
+ case Hexagon::L4_ploadruht_abs:
+ case Hexagon::L4_ploadruhfnew_abs:
+ case Hexagon::L4_ploadruhtnew_abs:
+ case Hexagon::L4_ploadrif_abs:
+ case Hexagon::L4_ploadrit_abs:
+ case Hexagon::L4_ploadrifnew_abs:
+ case Hexagon::L4_ploadritnew_abs:
+ // op: Rd
+ Value = insn & UINT64_C(31);
+ DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+ // op: Pt
+ Value = (insn >> 9) & UINT64_C(3);
+ DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+ // op: u6
+ Value = (insn >> 15) & UINT64_C(62);
+ Value |= (insn >> 8) & UINT64_C(1);
+ MI.addOperand(MCOperand::createImm(Value));
+ break;
+
+ // op: g16_2
+ case (Hexagon::PS_loadriabs):
+ ++shift; // intentional fall through: word loads end up with shift == 2
+ // op: g16_1
+ case Hexagon::PS_loadrhabs:
+ case Hexagon::PS_loadruhabs:
+ ++shift; // intentional fall through: halfword loads end up with shift == 1
+ // op: g16_0
+ case Hexagon::PS_loadrbabs:
+ case Hexagon::PS_loadrubabs:
+ // op: Rd
+ Value |= insn & UINT64_C(31); // Value is still 0 here, so this is a plain assignment
+ DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+ Value = (insn >> 11) & UINT64_C(49152); // insn bits 26:25 become address bits 15:14
+ Value |= (insn >> 7) & UINT64_C(15872); // insn bits 20:16 become address bits 13:9
+ Value |= (insn >> 5) & UINT64_C(511); // insn bits 13:5 become address bits 8:0
+ MI.addOperand(MCOperand::createImm(Value << shift));
+ break;
+
+ case Hexagon::PS_loadrdabs:
+ Value = insn & UINT64_C(31);
+ DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
+ Value = (insn >> 11) & UINT64_C(49152);
+ Value |= (insn >> 7) & UINT64_C(15872);
+ Value |= (insn >> 5) & UINT64_C(511);
+ MI.addOperand(MCOperand::createImm(Value << 3)); // doubleword access: fixed shift of 3
+ break;
+
+ case Hexagon::PS_storerdabs:
+ // op: g16_3
+ Value = (insn >> 11) & UINT64_C(49152); // insn bits 26:25 become address bits 15:14
+ Value |= (insn >> 7) & UINT64_C(15872); // insn bits 20:16 become address bits 13:9
+ Value |= (insn >> 5) & UINT64_C(256); // insn bit 13 becomes address bit 8
+ Value |= insn & UINT64_C(255); // insn bits 7:0 become address bits 7:0
+ MI.addOperand(MCOperand::createImm(Value << 3));
+ // op: Rtt
+ Value = (insn >> 8) & UINT64_C(31);
+ DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
+ break;
+
+ // op: g16_2
+ case Hexagon::PS_storerinewabs:
+ ++shift; // intentional fall through: word stores end up with shift == 2
+ // op: g16_1
+ case Hexagon::PS_storerhnewabs:
+ ++shift; // intentional fall through: halfword stores end up with shift == 1
+ // op: g16_0
+ case Hexagon::PS_storerbnewabs:
+ Value = (insn >> 11) & UINT64_C(49152);
+ Value |= (insn >> 7) & UINT64_C(15872);
+ Value |= (insn >> 5) & UINT64_C(256);
+ Value |= insn & UINT64_C(255);
+ MI.addOperand(MCOperand::createImm(Value << shift));
+ // op: Nt
+ Value = (insn >> 8) & UINT64_C(7);
+ DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+ break;
+
+ // op: g16_2
+ case Hexagon::PS_storeriabs:
+ ++shift; // intentional fall through: word stores end up with shift == 2
+ // op: g16_1
+ case Hexagon::PS_storerhabs:
+ case Hexagon::PS_storerfabs:
+ ++shift; // intentional fall through: halfword stores end up with shift == 1
+ // op: g16_0
+ case Hexagon::PS_storerbabs:
+ Value = (insn >> 11) & UINT64_C(49152);
+ Value |= (insn >> 7) & UINT64_C(15872);
+ Value |= (insn >> 5) & UINT64_C(256);
+ Value |= insn & UINT64_C(255);
+ MI.addOperand(MCOperand::createImm(Value << shift));
+ // op: Rt
+ Value = (insn >> 8) & UINT64_C(31);
+ DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+ break;
+ }
+ return MCDisassembler::Success;
+ }
+ return MCDisassembler::Fail;
+}
+
+/// Decode a constant extender word into an A4_ext instruction.
+///
+/// A word is treated as an extender when bits 31:28 are all zero. The 26-bit
+/// payload is rebuilt from word bits 27:16 and 13:0 and stored already
+/// shifted left by six, leaving the low six bits of the operand clear.
+/// Any other word fails.
+static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
+                                 void const *Decoder) {
+  // Instruction class of a constant extender: bits 31:28 == 0b0000.
+  if ((insn & 0xf0000000) != 0)
+    return MCDisassembler::Fail;
+
+  unsigned const High12 = (insn & 0x0fff0000) << 4; // word bits 27:16
+  unsigned const Low14 = (insn & 0x3fff) << 6;      // word bits 13:0
+  unsigned const Payload = High12 | Low14;
+
+  MI.setOpcode(Hexagon::A4_ext);
+  HexagonMCInstrInfo::addConstant(MI, Payload, contextFromDecoder(Decoder));
+  return MCDisassembler::Success;
+}
+
+// These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td
+enum subInstBinaryValues { // For sub-instruction X, GetSubinstOpcode accepts a 13-bit encoding when (encoding & X_MASK) == X_BITS.
+ SA1_addi_BITS = 0x0000,
+ SA1_addi_MASK = 0x1800,
+ SA1_addrx_BITS = 0x1800,
+ SA1_addrx_MASK = 0x1f00,
+ SA1_addsp_BITS = 0x0c00,
+ SA1_addsp_MASK = 0x1c00,
+ SA1_and1_BITS = 0x1200,
+ SA1_and1_MASK = 0x1f00,
+ SA1_clrf_BITS = 0x1a70,
+ SA1_clrf_MASK = 0x1e70,
+ SA1_clrfnew_BITS = 0x1a50,
+ SA1_clrfnew_MASK = 0x1e70,
+ SA1_clrt_BITS = 0x1a60,
+ SA1_clrt_MASK = 0x1e70,
+ SA1_clrtnew_BITS = 0x1a40,
+ SA1_clrtnew_MASK = 0x1e70,
+ SA1_cmpeqi_BITS = 0x1900,
+ SA1_cmpeqi_MASK = 0x1f00,
+ SA1_combine0i_BITS = 0x1c00,
+ SA1_combine0i_MASK = 0x1d18,
+ SA1_combine1i_BITS = 0x1c08,
+ SA1_combine1i_MASK = 0x1d18,
+ SA1_combine2i_BITS = 0x1c10,
+ SA1_combine2i_MASK = 0x1d18,
+ SA1_combine3i_BITS = 0x1c18,
+ SA1_combine3i_MASK = 0x1d18,
+ SA1_combinerz_BITS = 0x1d08,
+ SA1_combinerz_MASK = 0x1d08,
+ SA1_combinezr_BITS = 0x1d00,
+ SA1_combinezr_MASK = 0x1d08,
+ SA1_dec_BITS = 0x1300,
+ SA1_dec_MASK = 0x1f00,
+ SA1_inc_BITS = 0x1100,
+ SA1_inc_MASK = 0x1f00,
+ SA1_seti_BITS = 0x0800,
+ SA1_seti_MASK = 0x1c00,
+ SA1_setin1_BITS = 0x1a00,
+ SA1_setin1_MASK = 0x1e40,
+ SA1_sxtb_BITS = 0x1500,
+ SA1_sxtb_MASK = 0x1f00,
+ SA1_sxth_BITS = 0x1400,
+ SA1_sxth_MASK = 0x1f00,
+ SA1_tfr_BITS = 0x1000,
+ SA1_tfr_MASK = 0x1f00,
+ SA1_zxtb_BITS = 0x1700,
+ SA1_zxtb_MASK = 0x1f00,
+ SA1_zxth_BITS = 0x1600,
+ SA1_zxth_MASK = 0x1f00,
+ SL1_loadri_io_BITS = 0x0000,
+ SL1_loadri_io_MASK = 0x1000,
+ SL1_loadrub_io_BITS = 0x1000,
+ SL1_loadrub_io_MASK = 0x1000,
+ SL2_deallocframe_BITS = 0x1f00,
+ SL2_deallocframe_MASK = 0x1fc0,
+ SL2_jumpr31_BITS = 0x1fc0,
+ SL2_jumpr31_MASK = 0x1fc4,
+ SL2_jumpr31_f_BITS = 0x1fc5,
+ SL2_jumpr31_f_MASK = 0x1fc7,
+ SL2_jumpr31_fnew_BITS = 0x1fc7,
+ SL2_jumpr31_fnew_MASK = 0x1fc7,
+ SL2_jumpr31_t_BITS = 0x1fc4,
+ SL2_jumpr31_t_MASK = 0x1fc7,
+ SL2_jumpr31_tnew_BITS = 0x1fc6,
+ SL2_jumpr31_tnew_MASK = 0x1fc7,
+ SL2_loadrb_io_BITS = 0x1000,
+ SL2_loadrb_io_MASK = 0x1800,
+ SL2_loadrd_sp_BITS = 0x1e00,
+ SL2_loadrd_sp_MASK = 0x1f00,
+ SL2_loadrh_io_BITS = 0x0000,
+ SL2_loadrh_io_MASK = 0x1800,
+ SL2_loadri_sp_BITS = 0x1c00,
+ SL2_loadri_sp_MASK = 0x1e00,
+ SL2_loadruh_io_BITS = 0x0800,
+ SL2_loadruh_io_MASK = 0x1800,
+ SL2_return_BITS = 0x1f40,
+ SL2_return_MASK = 0x1fc4,
+ SL2_return_f_BITS = 0x1f45,
+ SL2_return_f_MASK = 0x1fc7,
+ SL2_return_fnew_BITS = 0x1f47,
+ SL2_return_fnew_MASK = 0x1fc7,
+ SL2_return_t_BITS = 0x1f44,
+ SL2_return_t_MASK = 0x1fc7,
+ SL2_return_tnew_BITS = 0x1f46,
+ SL2_return_tnew_MASK = 0x1fc7,
+ SS1_storeb_io_BITS = 0x1000,
+ SS1_storeb_io_MASK = 0x1000,
+ SS1_storew_io_BITS = 0x0000,
+ SS1_storew_io_MASK = 0x1000,
+ SS2_allocframe_BITS = 0x1c00,
+ SS2_allocframe_MASK = 0x1e00,
+ SS2_storebi0_BITS = 0x1200,
+ SS2_storebi0_MASK = 0x1f00,
+ SS2_storebi1_BITS = 0x1300,
+ SS2_storebi1_MASK = 0x1f00,
+ SS2_stored_sp_BITS = 0x0a00,
+ SS2_stored_sp_MASK = 0x1e00,
+ SS2_storeh_io_BITS = 0x0000,
+ SS2_storeh_io_MASK = 0x1800,
+ SS2_storew_sp_BITS = 0x0800,
+ SS2_storew_sp_MASK = 0x1e00,
+ SS2_storewi0_BITS = 0x1000,
+ SS2_storewi0_MASK = 0x1f00,
+ SS2_storewi1_BITS = 0x1100,
+ SS2_storewi1_MASK = 0x1f00
+};
+
+namespace {
+/// One row of a sub-instruction decode table: \c Opcode is selected when
+/// (inst & Mask) == Bits.
+struct SubinstPattern {
+  unsigned Mask;
+  unsigned Bits;
+  unsigned Opcode;
+};
+} // end anonymous namespace
+
+/// Scan \p Table front to back and store the opcode of the first row that
+/// matches \p inst in \p op. Returns false (leaving \p op alone) if no row
+/// matches.
+template <unsigned N>
+static bool matchSubinst(const SubinstPattern (&Table)[N], unsigned inst,
+                         unsigned &op) {
+  for (const SubinstPattern &Row : Table) {
+    if ((inst & Row.Mask) != Row.Bits)
+      continue;
+    op = Row.Opcode;
+    return true;
+  }
+  return false;
+}
+
+/// Find the sub-instruction opcode whose encoding pattern matches \p inst.
+///
+/// Each sub-instruction class has its own table of mask/bits/opcode rows; the
+/// rows are tried in the order they are listed and the first match wins.
+///
+/// \param IClass sub-instruction class, one of the HexagonII::HSIG_* values.
+/// \param inst   raw sub-instruction bits to classify.
+/// \param op     receives the matching opcode; left unmodified on failure.
+/// \param os     receives a short marker string when decoding fails.
+/// \returns MCDisassembler::Success when a row matched, otherwise
+///          MCDisassembler::Fail.
+static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
+                                 raw_ostream &os) {
+// Builds the table row for sub-instruction NAME from its NAME_MASK / NAME_BITS
+// encoding constants and its Hexagon opcode.
+#define SUBINST_PATTERN(NAME) {NAME##_MASK, NAME##_BITS, Hexagon::NAME}
+  bool Matched = false;
+  switch (IClass) {
+  case HexagonII::HSIG_L1: {
+    static const SubinstPattern Table[] = {
+        SUBINST_PATTERN(SL1_loadri_io),
+        SUBINST_PATTERN(SL1_loadrub_io),
+    };
+    Matched = matchSubinst(Table, inst, op);
+    break;
+  }
+  case HexagonII::HSIG_L2: {
+    static const SubinstPattern Table[] = {
+        SUBINST_PATTERN(SL2_deallocframe),
+        SUBINST_PATTERN(SL2_jumpr31),
+        SUBINST_PATTERN(SL2_jumpr31_f),
+        SUBINST_PATTERN(SL2_jumpr31_fnew),
+        SUBINST_PATTERN(SL2_jumpr31_t),
+        SUBINST_PATTERN(SL2_jumpr31_tnew),
+        SUBINST_PATTERN(SL2_loadrb_io),
+        SUBINST_PATTERN(SL2_loadrd_sp),
+        SUBINST_PATTERN(SL2_loadrh_io),
+        SUBINST_PATTERN(SL2_loadri_sp),
+        SUBINST_PATTERN(SL2_loadruh_io),
+        SUBINST_PATTERN(SL2_return),
+        SUBINST_PATTERN(SL2_return_f),
+        SUBINST_PATTERN(SL2_return_fnew),
+        SUBINST_PATTERN(SL2_return_t),
+        SUBINST_PATTERN(SL2_return_tnew),
+    };
+    Matched = matchSubinst(Table, inst, op);
+    break;
+  }
+  case HexagonII::HSIG_A: {
+    static const SubinstPattern Table[] = {
+        SUBINST_PATTERN(SA1_addi),
+        SUBINST_PATTERN(SA1_addrx),
+        SUBINST_PATTERN(SA1_addsp),
+        SUBINST_PATTERN(SA1_and1),
+        SUBINST_PATTERN(SA1_clrf),
+        SUBINST_PATTERN(SA1_clrfnew),
+        SUBINST_PATTERN(SA1_clrt),
+        SUBINST_PATTERN(SA1_clrtnew),
+        SUBINST_PATTERN(SA1_cmpeqi),
+        SUBINST_PATTERN(SA1_combine0i),
+        SUBINST_PATTERN(SA1_combine1i),
+        SUBINST_PATTERN(SA1_combine2i),
+        SUBINST_PATTERN(SA1_combine3i),
+        SUBINST_PATTERN(SA1_combinerz),
+        SUBINST_PATTERN(SA1_combinezr),
+        SUBINST_PATTERN(SA1_dec),
+        SUBINST_PATTERN(SA1_inc),
+        SUBINST_PATTERN(SA1_seti),
+        SUBINST_PATTERN(SA1_setin1),
+        SUBINST_PATTERN(SA1_sxtb),
+        SUBINST_PATTERN(SA1_sxth),
+        SUBINST_PATTERN(SA1_tfr),
+        SUBINST_PATTERN(SA1_zxtb),
+        SUBINST_PATTERN(SA1_zxth),
+    };
+    Matched = matchSubinst(Table, inst, op);
+    break;
+  }
+  case HexagonII::HSIG_S1: {
+    static const SubinstPattern Table[] = {
+        SUBINST_PATTERN(SS1_storeb_io),
+        SUBINST_PATTERN(SS1_storew_io),
+    };
+    Matched = matchSubinst(Table, inst, op);
+    break;
+  }
+  case HexagonII::HSIG_S2: {
+    static const SubinstPattern Table[] = {
+        SUBINST_PATTERN(SS2_allocframe),
+        SUBINST_PATTERN(SS2_storebi0),
+        SUBINST_PATTERN(SS2_storebi1),
+        SUBINST_PATTERN(SS2_stored_sp),
+        SUBINST_PATTERN(SS2_storeh_io),
+        SUBINST_PATTERN(SS2_storew_sp),
+        SUBINST_PATTERN(SS2_storewi0),
+        SUBINST_PATTERN(SS2_storewi1),
+    };
+    Matched = matchSubinst(Table, inst, op);
+    break;
+  }
+  default:
+    // Not a sub-instruction class this decoder knows about.
+    os << "<unknown>";
+    return MCDisassembler::Fail;
+  }
+#undef SUBINST_PATTERN
+
+  if (!Matched) {
+    // Known class, but no encoding pattern of that class fits the bits.
+    os << "<unknown subinstruction>";
+    return MCDisassembler::Fail;
+  }
+  return MCDisassembler::Success;
+}
+
+/// Translate a 4-bit sub-instruction register field into a register number.
+///
+/// Encodings 0-7 yield Hexagon::R0 + encoding; encodings 8-15 skip a further
+/// eight registers (Hexagon::R0 + encoding + 8). Anything larger yields
+/// Hexagon::NoRegister.
+static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) {
+  // patently false value
+  if (encoded_reg >= 16)
+    return Hexagon::NoRegister;
+
+  const unsigned Skipped = encoded_reg < 8 ? 0 : 8;
+  return Hexagon::R0 + encoded_reg + Skipped;
+}
+
+/// Translate a 3-bit sub-instruction register-pair field into a register
+/// number.
+///
+/// Encodings 0-3 yield Hexagon::D0 + encoding; encodings 4-7 skip a further
+/// four pairs (Hexagon::D0 + encoding + 4). Anything larger yields
+/// Hexagon::NoRegister.
+static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) {
+  // patently false value
+  if (encoded_dreg >= 8)
+    return Hexagon::NoRegister;
+
+  const unsigned Skipped = encoded_dreg < 4 ? 0 : 4;
+  return Hexagon::D0 + encoded_dreg + Skipped;
+}
+
+/// Append the operands encoded in sub-instruction \p inst to \p MI.
+///
+/// The per-case comments name every field by its bit range ("hi-lo"). An
+/// annotation such as {5_3} marks a 5-bit immediate that is shifted left by 3
+/// after extraction; "s" immediates are sign-extended, "u" ones are not.
+///
+/// \param MI     instruction that receives the operands, in the order the
+///               matching case adds them.
+/// \param opcode sub-instruction opcode; an opcode without a case below adds
+///               no operands.
+/// \param inst   raw sub-instruction bits the fields are extracted from.
+void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode,
+                                             unsigned inst) const {
+  // Append the general register selected by a 4-bit register field.
+  auto AddReg = [MI](unsigned Encoding) {
+    MI->addOperand(MCOperand::createReg(getRegFromSubinstEncoding(Encoding)));
+  };
+  // Append the register pair selected by a 3-bit register field.
+  auto AddDReg = [MI](unsigned Encoding) {
+    MI->addOperand(MCOperand::createReg(getDRegFromSubinstEncoding(Encoding)));
+  };
+  // Append an immediate operand.
+  auto AddImm = [this, MI](int64_t Value) {
+    HexagonMCInstrInfo::addConstant(*MI, Value, getContext());
+  };
+
+  // Register fields shared by most encodings.
+  const unsigned Bits3_0 = inst & 0xf;
+  const unsigned Bits7_4 = (inst & 0xf0) >> 4;
+  const unsigned Bits2_0 = inst & 0x7;
+
+  switch (opcode) {
+  case Hexagon::SL2_deallocframe:
+  case Hexagon::SL2_jumpr31:
+  case Hexagon::SL2_jumpr31_f:
+  case Hexagon::SL2_jumpr31_fnew:
+  case Hexagon::SL2_jumpr31_t:
+  case Hexagon::SL2_jumpr31_tnew:
+  case Hexagon::SL2_return:
+  case Hexagon::SL2_return_f:
+  case Hexagon::SL2_return_fnew:
+  case Hexagon::SL2_return_t:
+  case Hexagon::SL2_return_tnew:
+    // no operands for these instructions
+    break;
+  case Hexagon::SS2_allocframe:
+    // u 8-4{5_3}
+    AddImm(((inst & 0x1f0) >> 4) << 3);
+    break;
+  case Hexagon::SL1_loadri_io:
+    // Rd 3-0, Rs 7-4, u 11-8{4_2}
+    AddReg(Bits3_0);
+    AddReg(Bits7_4);
+    AddImm((inst & 0xf00) >> 6);
+    break;
+  case Hexagon::SL1_loadrub_io:
+    // Rd 3-0, Rs 7-4, u 11-8
+    AddReg(Bits3_0);
+    AddReg(Bits7_4);
+    AddImm((inst & 0xf00) >> 8);
+    break;
+  case Hexagon::SL2_loadrb_io:
+    // Rd 3-0, Rs 7-4, u 10-8
+    AddReg(Bits3_0);
+    AddReg(Bits7_4);
+    AddImm((inst & 0x700) >> 8);
+    break;
+  case Hexagon::SL2_loadrh_io:
+  case Hexagon::SL2_loadruh_io:
+    // Rd 3-0, Rs 7-4, u 10-8{3_1}
+    AddReg(Bits3_0);
+    AddReg(Bits7_4);
+    AddImm(((inst & 0x700) >> 8) << 1);
+    break;
+  case Hexagon::SL2_loadrd_sp:
+    // Rdd 2-0, u 7-3{5_3}
+    AddDReg(Bits2_0);
+    AddImm(((inst & 0x0f8) >> 3) << 3);
+    break;
+  case Hexagon::SL2_loadri_sp:
+    // Rd 3-0, u 8-4{5_2}
+    AddReg(Bits3_0);
+    AddImm(((inst & 0x1f0) >> 4) << 2);
+    break;
+  case Hexagon::SA1_addi:
+    // Rx 3-0 (x2), s7 10-4
+    AddReg(Bits3_0);
+    AddReg(Bits3_0);
+    AddImm(SignExtend64<7>((inst & 0x7f0) >> 4));
+    break;
+  case Hexagon::SA1_addrx:
+    // Rx 3-0 (x2), Rs 7-4
+    AddReg(Bits3_0);
+    AddReg(Bits3_0);
+    AddReg(Bits7_4);
+    break;
+  case Hexagon::SA1_and1:
+  case Hexagon::SA1_dec:
+  case Hexagon::SA1_inc:
+  case Hexagon::SA1_sxtb:
+  case Hexagon::SA1_sxth:
+  case Hexagon::SA1_tfr:
+  case Hexagon::SA1_zxtb:
+  case Hexagon::SA1_zxth:
+    // Rd 3-0, Rs 7-4
+    AddReg(Bits3_0);
+    AddReg(Bits7_4);
+    break;
+  case Hexagon::SA1_addsp:
+    // Rd 3-0, u 9-4{6_2}
+    AddReg(Bits3_0);
+    AddImm(((inst & 0x3f0) >> 4) << 2);
+    break;
+  case Hexagon::SA1_seti:
+    // Rd 3-0, u 9-4
+    AddReg(Bits3_0);
+    AddImm((inst & 0x3f0) >> 4);
+    break;
+  case Hexagon::SA1_clrf:
+  case Hexagon::SA1_clrfnew:
+  case Hexagon::SA1_clrt:
+  case Hexagon::SA1_clrtnew:
+  case Hexagon::SA1_setin1:
+    // Rd 3-0
+    AddReg(Bits3_0);
+    // Every opcode of this group except SA1_setin1 also takes P0.
+    if (opcode != Hexagon::SA1_setin1)
+      MI->addOperand(MCOperand::createReg(Hexagon::P0));
+    break;
+  case Hexagon::SA1_cmpeqi:
+    // Rs 7-4, u 1-0
+    AddReg(Bits7_4);
+    AddImm(inst & 0x3);
+    break;
+  case Hexagon::SA1_combine0i:
+  case Hexagon::SA1_combine1i:
+  case Hexagon::SA1_combine2i:
+  case Hexagon::SA1_combine3i:
+    // Rdd 2-0, u 6-5
+    AddDReg(Bits2_0);
+    AddImm((inst & 0x060) >> 5);
+    break;
+  case Hexagon::SA1_combinerz:
+  case Hexagon::SA1_combinezr:
+    // Rdd 2-0, Rs 7-4
+    AddDReg(Bits2_0);
+    AddReg(Bits7_4);
+    break;
+  case Hexagon::SS1_storeb_io:
+    // Rs 7-4, u 11-8, Rt 3-0
+    AddReg(Bits7_4);
+    AddImm((inst & 0xf00) >> 8);
+    AddReg(Bits3_0);
+    break;
+  case Hexagon::SS1_storew_io:
+    // Rs 7-4, u 11-8{4_2}, Rt 3-0
+    AddReg(Bits7_4);
+    AddImm(((inst & 0xf00) >> 8) << 2);
+    AddReg(Bits3_0);
+    break;
+  case Hexagon::SS2_storebi0:
+  case Hexagon::SS2_storebi1:
+    // Rs 7-4, u 3-0
+    AddReg(Bits7_4);
+    AddImm(inst & 0xf);
+    break;
+  case Hexagon::SS2_storewi0:
+  case Hexagon::SS2_storewi1:
+    // Rs 7-4, u 3-0{4_2}
+    AddReg(Bits7_4);
+    AddImm((inst & 0xf) << 2);
+    break;
+  case Hexagon::SS2_stored_sp:
+    // s 8-3{6_3}, Rtt 2-0
+    AddImm(SignExtend64<9>(((inst & 0x1f8) >> 3) << 3));
+    AddDReg(Bits2_0);
+    break;
+  case Hexagon::SS2_storeh_io:
+    // Rs 7-4, u 10-8{3_1}, Rt 3-0
+    AddReg(Bits7_4);
+    AddImm(((inst & 0x700) >> 8) << 1);
+    AddReg(Bits3_0);
+    break;
+  case Hexagon::SS2_storew_sp:
+    // u 8-4{5_2}, Rd 3-0
+    AddImm(((inst & 0x1f0) >> 4) << 2);
+    AddReg(Bits3_0);
+    break;
+  default:
+    // don't crash with an invalid subinstruction
+    // llvm_unreachable("Invalid subinstruction in duplex instruction");
+    break;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
new file mode 100644
index 000000000000..ed7d9578902e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
@@ -0,0 +1,56 @@
+//=-- Hexagon.h - Top-level interface for Hexagon representation --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Hexagon back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGON_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGON_H
+
+// Pointer size in bytes; the *_Bits macros below multiply it by 8.
+#define Hexagon_POINTER_SIZE 4
+
+#define Hexagon_PointerSize (Hexagon_POINTER_SIZE)
+#define Hexagon_PointerSize_Bits (Hexagon_POINTER_SIZE * 8)
+// The word size is defined to be identical to the pointer size.
+#define Hexagon_WordSize Hexagon_PointerSize
+#define Hexagon_WordSize_Bits Hexagon_PointerSize_Bits
+
+// allocframe saves LR and FP on stack before allocating
+// a new stack frame. This takes 8 bytes.
+#define HEXAGON_LRFP_SIZE 8
+
+// Normal instruction size (in bytes).
+#define HEXAGON_INSTR_SIZE 4
+
+// Maximum number of words and instructions in a packet.
+#define HEXAGON_PACKET_SIZE 4
+// Maximum packet size in bytes (instructions per packet times bytes per
+// instruction).
+#define HEXAGON_MAX_PACKET_SIZE (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE)
+// Minimum number of instructions in an end-loop packet.
+#define HEXAGON_PACKET_INNER_SIZE 2
+#define HEXAGON_PACKET_OUTER_SIZE 3
+// Maximum number of instructions in a packet before shuffling,
+// including a compound one or a duplex or an extender.
+#define HEXAGON_PRESHUFFLE_PACKET_SIZE (HEXAGON_PACKET_SIZE + 3)
+
+// Name of the global offset table as defined by the Hexagon ABI
+#define HEXAGON_GOT_SYM_NAME "_GLOBAL_OFFSET_TABLE_"
+
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ // Forward declaration: this header only needs a pointer to the type.
+ class HexagonTargetMachine;
+
+ /// \brief Creates a Hexagon-specific Target Transformation Info pass.
+ /// \param TM the Hexagon target machine handed to the created pass.
+ ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM);
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.td b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
new file mode 100644
index 000000000000..0b2b46387b6a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
@@ -0,0 +1,295 @@
+//===-- Hexagon.td - Describe the Hexagon Target Machine --*- tablegen -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the Hexagon target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Hexagon Subtarget features.
+//===----------------------------------------------------------------------===//
+
+// Hexagon Architectures
+// Each architecture feature sets the subtarget attribute HexagonArchVersion
+// to the listed value (V4, V5, V55 or V60).
+def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Hexagon V4">;
+def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Hexagon V5">;
+def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Hexagon V55">;
+def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Hexagon V60">;
+
+// Optional features; each one sets the named subtarget attribute to "true".
+def FeatureHVX: SubtargetFeature<"hvx", "UseHVXOps", "true",
+ "Hexagon HVX instructions">;
+def FeatureHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", "true",
+ "Hexagon HVX Double instructions">;
+def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
+ "Use constant-extended calls">;
+
+//===----------------------------------------------------------------------===//
+// Hexagon Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+// Each Predicate wraps a C++ condition evaluated on HST (presumably the
+// HexagonSubtarget pointer available to instruction selection -- confirm).
+// Predicates that also derive from AssemblerPredicate name the subtarget
+// feature the assembler checks for the same instructions.
+def HasV5T : Predicate<"HST->hasV5TOps()">;
+def NoV5T : Predicate<"!HST->hasV5TOps()">;
+def HasV55T : Predicate<"HST->hasV55TOps()">,
+ AssemblerPredicate<"ArchV55">;
+def HasV60T : Predicate<"HST->hasV60TOps()">,
+ AssemblerPredicate<"ArchV60">;
+def UseMEMOP : Predicate<"HST->useMemOps()">;
+def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">;
+def UseHVXDbl : Predicate<"HST->useHVXDblOps()">,
+ AssemblerPredicate<"FeatureHVXDbl">;
+def UseHVXSgl : Predicate<"HST->useHVXSglOps()">;
+// True when either the single or the double HVX mode is enabled.
+def UseHVX : Predicate<"HST->useHVXSglOps() ||HST->useHVXDblOps()">,
+ AssemblerPredicate<"FeatureHVX">;
+
+//===----------------------------------------------------------------------===//
+// Classes used for relation maps.
+//===----------------------------------------------------------------------===//
+
+// ImmRegShl - Filter class used by the getBaseWithLongOffset and
+// getRegShlForm mappings below.
+class ImmRegShl;
+// PredRel - Filter class used to relate non-predicated instructions with their
+// predicated forms.
+class PredRel;
+// PredNewRel - Filter class used to relate predicated instructions with their
+// predicate-new forms.
+class PredNewRel: PredRel;
+// ImmRegRel - Filter class used to relate instructions having reg-reg form
+// with their reg-imm counterparts.
+class ImmRegRel;
+// NewValueRel - Filter class used to relate regular store instructions with
+// their new-value store form.
+class NewValueRel: PredNewRel;
+// AddrModeRel - Filter class used to relate load/store instructions having
+// different addressing modes with each other.
+class AddrModeRel: NewValueRel;
+// IntrinsicsRel - Filter class used by the getRealHWInstr mapping below.
+class IntrinsicsRel;
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate non-predicate instructions with their
+// predicated formats - true and false.
+//
+
+def getPredOpcode : InstrMapping {
+ let FilterClass = "PredRel";
+ // Instructions with the same BaseOpcode, isNVStore, PNewValue and isNT
+ // values form a row.
+ let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isNT"];
+ // Instructions with the same predicate sense form a column.
+ let ColFields = ["PredSense"];
+ // The key column is the unpredicated instructions.
+ let KeyCol = [""];
+ // Value columns are PredSense=true and PredSense=false
+ let ValueCols = [["true"], ["false"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicate-true instructions with their
+// predicate-false forms
+//
+def getFalsePredOpcode : InstrMapping {
+ let FilterClass = "PredRel";
+ // Instructions agreeing on all of these fields form a row.
+ let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"];
+ let ColFields = ["PredSense"];
+ // Key: the PredSense=true instruction of the row.
+ let KeyCol = ["true"];
+ // Value: the PredSense=false instruction of the same row.
+ let ValueCols = [["false"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicate-false instructions with their
+// predicate-true forms
+//
+def getTruePredOpcode : InstrMapping {
+ let FilterClass = "PredRel";
+ // Instructions agreeing on all of these fields form a row.
+ let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"];
+ let ColFields = ["PredSense"];
+ // Key: the PredSense=false instruction of the row.
+ let KeyCol = ["false"];
+ // Value: the PredSense=true instruction of the same row.
+ let ValueCols = [["true"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicated instructions with their .new
+// format.
+//
+def getPredNewOpcode : InstrMapping {
+ let FilterClass = "PredNewRel";
+ // Instructions agreeing on all of these fields form a row.
+ let RowFields = ["BaseOpcode", "PredSense", "isNVStore", "isBrTaken"];
+ let ColFields = ["PNewValue"];
+ // Key: the instruction whose PNewValue is empty.
+ let KeyCol = [""];
+ // Value: the PNewValue="new" instruction of the same row.
+ let ValueCols = [["new"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate .new predicated instructions with their old
+// format.
+//
+def getPredOldOpcode : InstrMapping {
+ let FilterClass = "PredNewRel";
+ // Instructions agreeing on all of these fields form a row. Unlike
+ // getPredNewOpcode, isBrTaken is not part of the row key here.
+ let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+ let ColFields = ["PNewValue"];
+ // Key: the PNewValue="new" instruction of the row.
+ let KeyCol = ["new"];
+ // Value: the instruction of the same row whose PNewValue is empty.
+ let ValueCols = [[""]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate store instructions with their new-value
+// format.
+//
+def getNewValueOpcode : InstrMapping {
+ let FilterClass = "NewValueRel";
+ // Instructions agreeing on all of these fields form a row.
+ let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"];
+ let ColFields = ["NValueST"];
+ // Key: the NValueST=false instruction of the row.
+ let KeyCol = ["false"];
+ // Value: the NValueST=true instruction of the same row.
+ let ValueCols = [["true"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate new-value store instructions with their old
+// format.
+//
+def getNonNVStore : InstrMapping {
+ let FilterClass = "NewValueRel";
+ // Instructions agreeing on all of these fields form a row.
+ let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"];
+ let ColFields = ["NValueST"];
+ // Key: the NValueST=true instruction of the row.
+ let KeyCol = ["true"];
+ // Value: the NValueST=false instruction of the same row.
+ let ValueCols = [["false"]];
+}
+
+// Generate mapping table to relate instructions whose addrMode is Absolute
+// with the addrMode=BaseImmOffset instruction of the same row.
+def getBaseWithImmOffset : InstrMapping {
+ let FilterClass = "AddrModeRel";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore",
+ "isFloat"];
+ let ColFields = ["addrMode"];
+ let KeyCol = ["Absolute"];
+ let ValueCols = [["BaseImmOffset"]];
+}
+
+// Generate mapping table to relate instructions whose addrMode is
+// BaseImmOffset with the addrMode=Absolute instruction of the same row
+// (the reverse of getBaseWithImmOffset).
+def getAbsoluteForm : InstrMapping {
+ let FilterClass = "AddrModeRel";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore",
+ "isFloat"];
+ let ColFields = ["addrMode"];
+ let KeyCol = ["BaseImmOffset"];
+ let ValueCols = [["Absolute"]];
+}
+
+// Generate mapping table to relate instructions whose addrMode is
+// BaseImmOffset with the addrMode=BaseRegOffset instruction of the same row.
+def getBaseWithRegOffset : InstrMapping {
+ let FilterClass = "AddrModeRel";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+ let ColFields = ["addrMode"];
+ let KeyCol = ["BaseImmOffset"];
+ let ValueCols = [["BaseRegOffset"]];
+}
+
+// Generate mapping table to relate instructions whose addrMode is
+// BaseRegOffset with the addrMode=BaseImmOffset instruction of the same row
+// (the reverse of getBaseWithRegOffset).
+def xformRegToImmOffset : InstrMapping {
+ let FilterClass = "AddrModeRel";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+ let ColFields = ["addrMode"];
+ let KeyCol = ["BaseRegOffset"];
+ let ValueCols = [["BaseImmOffset"]];
+}
+
+// Generate mapping table to relate ImmRegShl instructions whose addrMode is
+// BaseRegOffset with the addrMode=BaseLongOffset instruction of the same row.
+def getBaseWithLongOffset : InstrMapping {
+ let FilterClass = "ImmRegShl";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+ let ColFields = ["addrMode"];
+ let KeyCol = ["BaseRegOffset"];
+ let ValueCols = [["BaseLongOffset"]];
+}
+
+// Generate mapping table to relate ImmRegRel instructions whose InputType is
+// imm with the InputType=reg instruction of the same row.
+def getRegForm : InstrMapping {
+ let FilterClass = "ImmRegRel";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue"];
+ let ColFields = ["InputType"];
+ let KeyCol = ["imm"];
+ let ValueCols = [["reg"]];
+}
+
+// Generate mapping table to relate ImmRegShl instructions whose InputType is
+// imm with the InputType=reg instruction of the same row. Same columns as
+// getRegForm, but a different filter class and isNVStore in the row key.
+def getRegShlForm : InstrMapping {
+ let FilterClass = "ImmRegShl";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+ let ColFields = ["InputType"];
+ let KeyCol = ["imm"];
+ let ValueCols = [["reg"]];
+}
+
+// Generate mapping table to relate instructions whose isBrTaken is true with
+// the isBrTaken=false instruction of the same row.
+def notTakenBranchPrediction : InstrMapping {
+ let FilterClass = "PredRel";
+ let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"];
+ let ColFields = ["isBrTaken"];
+ let KeyCol = ["true"];
+ let ValueCols = [["false"]];
+}
+
+// Generate mapping table to relate instructions whose isBrTaken is false with
+// the isBrTaken=true instruction of the same row (the reverse of
+// notTakenBranchPrediction).
+def takenBranchPrediction : InstrMapping {
+ let FilterClass = "PredRel";
+ let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"];
+ let ColFields = ["isBrTaken"];
+ let KeyCol = ["false"];
+ let ValueCols = [["true"]];
+}
+
+// Generate mapping table keyed on IntrinsicsRel instructions whose InstrType
+// is Pseudo. Rows are formed by BaseOpcode alone, and there are two value
+// columns (Pseudo and Real), so the generated lookup presumably selects the
+// wanted column through an extra argument -- confirm against the generated
+// table.
+def getRealHWInstr : InstrMapping {
+ let FilterClass = "IntrinsicsRel";
+ let RowFields = ["BaseOpcode"];
+ let ColFields = ["InstrType"];
+ let KeyCol = ["Pseudo"];
+ let ValueCols = [["Pseudo"], ["Real"]];
+}
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+include "HexagonSchedule.td"
+include "HexagonRegisterInfo.td"
+include "HexagonCallingConv.td"
+include "HexagonInstrInfo.td"
+include "HexagonPatterns.td"
+include "HexagonIntrinsics.td"
+include "HexagonIntrinsicsDerived.td"
+
+// Instruction-set record; the Hexagon target definition at the end of this
+// file refers to it through its InstructionSet field.
+def HexagonInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Hexagon processors supported.
+//===----------------------------------------------------------------------===//
+
+// Proc - Thin wrapper that forwards its name, scheduling model and feature
+// list unchanged to ProcessorModel.
+class Proc<string Name, SchedMachineModel Model,
+ list<SubtargetFeature> Features>
+ : ProcessorModel<Name, Model, Features>;
+
+// Each processor lists its own architecture feature together with those of
+// all earlier architectures. Note that hexagonv5 uses the V4 scheduling
+// model, and that hexagonv60 is the only processor that enables FeatureHVX.
+def : Proc<"hexagonv4", HexagonModelV4,
+ [ArchV4]>;
+def : Proc<"hexagonv5", HexagonModelV4,
+ [ArchV4, ArchV5]>;
+def : Proc<"hexagonv55", HexagonModelV55,
+ [ArchV4, ArchV5, ArchV55]>;
+def : Proc<"hexagonv60", HexagonModelV60,
+ [ArchV4, ArchV5, ArchV55, ArchV60, FeatureHVX]>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+// Assembly parser settings for Hexagon.
+def HexagonAsmParser : AsmParser {
+ // Presumably requests generation of the alternate-register-name matcher
+ // (MatchRegisterAltName) -- confirm against Target.td.
+ let ShouldEmitMatchRegisterAltName = 1;
+ // Cleared, so presumably a statement need not begin with its mnemonic --
+ // confirm against Target.td.
+ bit HasMnemonicFirst = 0;
+}
+
+// The single assembly syntax variant (number 0) of the Hexagon parser.
+def HexagonAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+ // Presumably the characters at which assembly strings are split into
+ // separate tokens -- confirm against Target.td.
+ string TokenizingCharacters = "#()=:.<>!+*-|^&";
+}
+
+// Top-level target record tying together the instruction set, the assembly
+// parser and the parser variant defined in this file.
+def Hexagon : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = HexagonInstrInfo;
+ let AssemblyParsers = [HexagonAsmParser];
+ let AssemblyParserVariants = [HexagonAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
new file mode 100644
index 000000000000..54db5ad4374b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -0,0 +1,604 @@
+//===-- HexagonAsmPrinter.cpp - Print machine instrs to Hexagon assembly --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Hexagon assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonAsmPrinter.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "MCTargetDesc/HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+ // Defined outside this file. EmitInstruction() below calls it once per
+ // machine instruction, passing the MC bundle MCB it is building.
+ void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+ MCInst &MCB, HexagonAsmPrinter &AP);
+}
+
+#define DEBUG_TYPE "asm-printer"
+
+// Hidden command-line switch, on by default.
+// NOTE(review): nothing else in this file reads AlignCalls - confirm whether
+// it is still needed.
+static cl::opt<bool> AlignCalls(
+ "hexagon-align-calls", cl::Hidden, cl::init(true),
+ cl::desc("Insert falign after call instruction for Hexagon target"));
+
+// Map a scalar register (a member of IntRegs) to the register pair (a member
+// of DoubleRegs) reported as its first super-register.
+inline static unsigned getHexagonRegisterPair(unsigned Reg,
+                                              const MCRegisterInfo *RI) {
+  assert(Hexagon::IntRegsRegClass.contains(Reg));
+  // Only the first entry of the super-register list is consulted.
+  const unsigned PairReg = *MCSuperRegIterator(Reg, RI, false);
+  assert(Hexagon::DoubleRegsRegClass.contains(PairReg));
+  return PairReg;
+}
+
+// Construct the printer on top of the generic AsmPrinter. Subtarget starts
+// out null; it is assigned per function in runOnMachineFunction().
+HexagonAsmPrinter::HexagonAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr) {}
+
+/// Write operand number \p OpNo of \p MI to \p O.
+///
+/// Registers, immediates, basic blocks, constant-pool entries and global
+/// addresses are supported; any other operand kind is treated as unreachable.
+void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MachineOperand &Op = MI->getOperand(OpNo);
+
+  switch (Op.getType()) {
+  case MachineOperand::MO_GlobalAddress:
+    // The address of the global is being computed; this is not a call.
+    getSymbol(Op.getGlobal())->print(O, MAI);
+    printOffset(Op.getOffset(), O);
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    GetCPISymbol(Op.getIndex())->print(O, MAI);
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    Op.getMBB()->getSymbol()->print(O, MAI);
+    break;
+  case MachineOperand::MO_Immediate:
+    O << Op.getImm();
+    break;
+  case MachineOperand::MO_Register:
+    O << HexagonInstPrinter::getRegisterName(Op.getReg());
+    break;
+  default:
+    llvm_unreachable ("<unknown operand type>");
+  }
+}
+
+//
+// isBlockOnlyReachableByFallthrough - The generic AsmPrinter emits no label
+// for a basic block that can only be entered by falling through from its
+// predecessor. That is wrong for a block that is also the target of an
+// indirect branch through a jump table: the table would then refer to a
+// label that was never defined. Blocks whose address is taken are therefore
+// never reported as fallthrough-only.
+//
+bool HexagonAsmPrinter::
+isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+  return !MBB->hasAddressTaken() &&
+         AsmPrinter::isBlockOnlyReachableByFallthrough(MBB);
+}
+
+
+/// PrintAsmOperand - Print operand \p OpNo of an inline asm expression,
+/// honouring an optional single-letter modifier in \p ExtraCode.
+///
+/// Returns true when the modifier is not understood (or cannot be applied)
+/// and false once the operand has been printed.
+bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                        unsigned AsmVariant,
+                                        const char *ExtraCode,
+                                        raw_ostream &OS) {
+  // Without a modifier the operand is printed as is.
+  const bool HasModifier = ExtraCode && ExtraCode[0];
+  if (!HasModifier) {
+    printOperand(MI, OpNo, OS);
+    return false;
+  }
+
+  // Only single-letter modifiers are recognised.
+  if (ExtraCode[1] != 0)
+    return true; // Unknown modifier.
+
+  const char Modifier = ExtraCode[0];
+
+  if (Modifier == 'c') {
+    // Don't print "$" before a global var name or constant.
+    // Hexagon never has a prefix.
+    printOperand(MI, OpNo, OS);
+    return false;
+  }
+
+  if (Modifier == 'L' || Modifier == 'H') {
+    // 'L' selects the low and 'H' the high sub-register of a register pair.
+    const MachineOperand &MO = MI->getOperand(OpNo);
+    const MachineFunction &MF = *MI->getParent()->getParent();
+    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+    if (!MO.isReg())
+      return true;
+    unsigned RegNumber = MO.getReg();
+    // This should be an assert in the frontend.
+    if (Hexagon::DoubleRegsRegClass.contains(RegNumber)) {
+      const unsigned SubIdx =
+          Modifier == 'L' ? Hexagon::isub_lo : Hexagon::isub_hi;
+      RegNumber = TRI->getSubReg(RegNumber, SubIdx);
+    }
+    OS << HexagonInstPrinter::getRegisterName(RegNumber);
+    return false;
+  }
+
+  if (Modifier == 'I') {
+    // Write 'i' if an integer constant, otherwise nothing. Used to print
+    // addi vs add, etc.
+    if (MI->getOperand(OpNo).isImm())
+      OS << "i";
+    return false;
+  }
+
+  // Anything else may still be a generic modifier handled by the base class.
+  return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+}
+
+/// Print an inline asm memory operand made of a base register at \p OpNo and
+/// an immediate offset at \p OpNo + 1. A zero offset is omitted. No modifier
+/// is accepted: any non-empty \p ExtraCode makes the function return true.
+bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                              unsigned OpNo,
+                                              unsigned AsmVariant,
+                                              const char *ExtraCode,
+                                              raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  const MachineOperand &BaseOp = MI->getOperand(OpNo);
+  const MachineOperand &OffsetOp = MI->getOperand(OpNo+1);
+
+  // Only "register base" addressing is implemented.
+  if (!BaseOp.isReg())
+    llvm_unreachable("Unimplemented");
+  printOperand(MI, OpNo, O);
+
+  // Only immediate offsets are implemented.
+  if (!OffsetOp.isImm())
+    llvm_unreachable("Unimplemented");
+  if (OffsetOp.getImm())
+    O << " + #" << OffsetOp.getImm();
+
+  return false;
+}
+
+/// Place the constant described by \p Imm into a data section and return the
+/// symbol that labels it.
+///
+/// - If \p Imm evaluates to an absolute value, the value is emitted into a
+///   ".gnu.linkonce.l4"/".gnu.linkonce.l8" section (chosen by \p AlignSize)
+///   under a global symbol named ".CONST_" followed by the zero-padded
+///   hexadecimal value.
+/// - Otherwise operand 1 of \p MI must be a global, constant-pool or
+///   jump-table reference; the expression is emitted into ".lita" under a
+///   local symbol named ".CONST_<symbol name>".
+///
+/// In both cases the data is emitted only if the symbol is still undefined.
+/// The function leaves \p OutStreamer switched to the data section; callers
+/// are responsible for switching back.
+///
+/// \param AlignSize size in bytes of the emitted value (8 selects the 64-bit
+///        naming scheme, anything else the 32-bit one).
+static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
+                           MCStreamer &OutStreamer, const MCOperand &Imm,
+                           int AlignSize) {
+  // Both paths dereference Imm.getExpr(), so validate the operand kind before
+  // the first use rather than after the expression has already been read.
+  assert(Imm.isExpr() && "Expected expression and found none");
+
+  MCSymbol *Sym;
+  int64_t Value;
+  if (Imm.getExpr()->evaluateAsAbsolute(Value)) {
+    StringRef sectionPrefix;
+    std::string ImmString;
+    StringRef Name;
+    if (AlignSize == 8) {
+      Name = ".CONST_0000000000000000";
+      sectionPrefix = ".gnu.linkonce.l8";
+      ImmString = utohexstr(Value);
+    } else {
+      Name = ".CONST_00000000";
+      sectionPrefix = ".gnu.linkonce.l4";
+      ImmString = utohexstr(static_cast<uint32_t>(Value));
+    }
+
+    // Replace the trailing zeros of the template name with the hex digits,
+    // which keeps the leading zeros and a fixed-width symbol name.
+    std::string symbolName =   // Yes, leading zeros are kept.
+      Name.drop_back(ImmString.size()).str() + ImmString;
+    std::string sectionName = sectionPrefix.str() + symbolName;
+
+    MCSectionELF *Section = OutStreamer.getContext().getELFSection(
+        sectionName, ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+    OutStreamer.SwitchSection(Section);
+
+    Sym = AP.OutContext.getOrCreateSymbol(Twine(symbolName));
+    if (Sym->isUndefined()) {
+      OutStreamer.EmitLabel(Sym);
+      OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
+      OutStreamer.EmitIntValue(Value, AlignSize);
+      OutStreamer.EmitCodeAlignment(AlignSize);
+    }
+  } else {
+    const MachineOperand &MO = MI.getOperand(1);
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+    MCSymbol *MOSymbol = nullptr;
+    if (MO.isGlobal())
+      MOSymbol = AP.getSymbol(MO.getGlobal());
+    else if (MO.isCPI())
+      MOSymbol = AP.GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = AP.GetJTISymbol(MO.getIndex());
+    else
+      llvm_unreachable("Unknown operand type!");
+
+    StringRef SymbolName = MOSymbol->getName();
+    std::string LitaName = ".CONST_" + SymbolName.str();
+
+    MCSectionELF *Section = OutStreamer.getContext().getELFSection(
+        ".lita", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+
+    OutStreamer.SwitchSection(Section);
+    Sym = AP.OutContext.getOrCreateSymbol(Twine(LitaName));
+    if (Sym->isUndefined()) {
+      OutStreamer.EmitLabel(Sym);
+      OutStreamer.EmitSymbolAttribute(Sym, MCSA_Local);
+      OutStreamer.EmitValue(Imm.getExpr(), AlignSize);
+      OutStreamer.EmitCodeAlignment(AlignSize);
+    }
+  }
+  return Sym;
+}
+
+// Rewrite Inst in place when its opcode is one of the mapped/pseudo opcodes
+// handled below: the opcode is replaced by the real instruction's opcode and
+// the operand list is adjusted to match. Instructions with any other opcode
+// are left untouched. MI is the machine instruction Inst was lowered from; it
+// is only consulted for CONST32/CONST64, where it is passed to smallData().
+void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
+ const MachineInstr &MI) {
+ // MappedInst is an alias of Inst, so updates through either name modify the
+ // same instruction.
+ MCInst &MappedInst = static_cast <MCInst &>(Inst);
+ const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo();
+
+ switch (Inst.getOpcode()) {
+ default: return;
+
+ // A2_iconst becomes A2_addi with R0 inserted as the source register.
+ case Hexagon::A2_iconst: {
+ Inst.setOpcode(Hexagon::A2_addi);
+ MCOperand Reg = Inst.getOperand(0);
+ MCOperand S16 = Inst.getOperand(1);
+ HexagonMCInstrInfo::setMustNotExtend(*S16.getExpr());
+ HexagonMCInstrInfo::setS23_2_reloc(*S16.getExpr());
+ Inst.clear();
+ Inst.addOperand(Reg);
+ Inst.addOperand(MCOperand::createReg(Hexagon::R0));
+ Inst.addOperand(S16);
+ break;
+ }
+
+ // "$dst = CONST64(#$src1)",
+ // Only rewritten when the streamer has no raw text support: the constant is
+ // emitted via smallData() and replaced by an L2_loadrdgp of its symbol.
+ case Hexagon::CONST64:
+ if (!OutStreamer->hasRawTextSupport()) {
+ const MCOperand &Imm = MappedInst.getOperand(1);
+ MCSectionSubPair Current = OutStreamer->getCurrentSection();
+
+ MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 8);
+
+ // smallData() switched sections; go back to where we were.
+ OutStreamer->SwitchSection(Current.first, Current.second);
+ MCInst TmpInst;
+ MCOperand &Reg = MappedInst.getOperand(0);
+ TmpInst.setOpcode(Hexagon::L2_loadrdgp);
+ TmpInst.addOperand(Reg);
+ TmpInst.addOperand(MCOperand::createExpr(
+ MCSymbolRefExpr::create(Sym, OutContext)));
+ MappedInst = TmpInst;
+
+ }
+ break;
+ // Same scheme as CONST64, with a 4-byte constant and L2_loadrigp.
+ case Hexagon::CONST32:
+ if (!OutStreamer->hasRawTextSupport()) {
+ MCOperand &Imm = MappedInst.getOperand(1);
+ MCSectionSubPair Current = OutStreamer->getCurrentSection();
+ MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 4);
+ OutStreamer->SwitchSection(Current.first, Current.second);
+ MCInst TmpInst;
+ MCOperand &Reg = MappedInst.getOperand(0);
+ TmpInst.setOpcode(Hexagon::L2_loadrigp);
+ TmpInst.addOperand(Reg);
+ TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
+ MCSymbolRefExpr::create(Sym, OutContext), OutContext)));
+ MappedInst = TmpInst;
+ }
+ break;
+
+ // C2_pxfer_map maps to C2_or instruction. Though, it's possible to use
+ // C2_or during instruction selection itself but it results
+ // into suboptimal code.
+ case Hexagon::C2_pxfer_map: {
+ MCOperand &Ps = Inst.getOperand(1);
+ MappedInst.setOpcode(Hexagon::C2_or);
+ // The source predicate is appended a second time as C2_or's other input.
+ MappedInst.addOperand(Ps);
+ return;
+ }
+
+ // Vector reduce complex multiply by scalar, Rt & 1 map to :hi else :lo
+ // The insn is mapped from the 4 operand to the 3 operand raw form taking
+ // 3 register pairs.
+ case Hexagon::M2_vrcmpys_acc_s1: {
+ MCOperand &Rt = Inst.getOperand(3);
+ assert (Rt.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rt.getReg());
+ if (Reg & 1)
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h);
+ else
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l);
+ Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+ return;
+ }
+ case Hexagon::M2_vrcmpys_s1: {
+ MCOperand &Rt = Inst.getOperand(2);
+ assert (Rt.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rt.getReg());
+ if (Reg & 1)
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_h);
+ else
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_l);
+ Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+ return;
+ }
+
+ case Hexagon::M2_vrcmpys_s1rp: {
+ MCOperand &Rt = Inst.getOperand(2);
+ assert (Rt.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rt.getReg());
+ if (Reg & 1)
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h);
+ else
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l);
+ Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+ return;
+ }
+
+ case Hexagon::A4_boundscheck: {
+ MCOperand &Rs = Inst.getOperand(1);
+ assert (Rs.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rs.getReg());
+ if (Reg & 1) // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2
+ MappedInst.setOpcode(Hexagon::A4_boundscheck_hi);
+ else // raw:lo
+ MappedInst.setOpcode(Hexagon::A4_boundscheck_lo);
+ Rs.setReg(getHexagonRegisterPair(Rs.getReg(), RI));
+ return;
+ }
+ // Immediate 0 maps to S2_vsathub; otherwise to S5_asrhub_rnd_sat with the
+ // immediate reduced by one.
+ case Hexagon::S5_asrhub_rnd_sat_goodsyntax: {
+ MCOperand &MO = MappedInst.getOperand(2);
+ int64_t Imm;
+ MCExpr const *Expr = MO.getExpr();
+ bool Success = Expr->evaluateAsAbsolute(Imm);
+ assert (Success && "Expected immediate and none was found");
+ (void)Success;
+ MCInst TmpInst;
+ if (Imm == 0) {
+ TmpInst.setOpcode(Hexagon::S2_vsathub);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ MappedInst = TmpInst;
+ return;
+ }
+ TmpInst.setOpcode(Hexagon::S5_asrhub_rnd_sat);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ const MCExpr *One = MCConstantExpr::create(1, OutContext);
+ const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
+ TmpInst.addOperand(
+ MCOperand::createExpr(HexagonMCExpr::create(Sub, OutContext)));
+ MappedInst = TmpInst;
+ return;
+ }
+ // Immediate 0 maps to A2_combinew of the source pair's two halves;
+ // otherwise to the non-goodsyntax opcode with the immediate reduced by one.
+ case Hexagon::S5_vasrhrnd_goodsyntax:
+ case Hexagon::S2_asr_i_p_rnd_goodsyntax: {
+ MCOperand &MO2 = MappedInst.getOperand(2);
+ MCExpr const *Expr = MO2.getExpr();
+ int64_t Imm;
+ bool Success = Expr->evaluateAsAbsolute(Imm);
+ assert (Success && "Expected immediate and none was found");
+ (void)Success;
+ MCInst TmpInst;
+ if (Imm == 0) {
+ TmpInst.setOpcode(Hexagon::A2_combinew);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ MCOperand &MO1 = MappedInst.getOperand(1);
+ unsigned High = RI->getSubReg(MO1.getReg(), Hexagon::isub_hi);
+ unsigned Low = RI->getSubReg(MO1.getReg(), Hexagon::isub_lo);
+ // Add a new operand for the second register in the pair.
+ TmpInst.addOperand(MCOperand::createReg(High));
+ TmpInst.addOperand(MCOperand::createReg(Low));
+ MappedInst = TmpInst;
+ return;
+ }
+
+ if (Inst.getOpcode() == Hexagon::S2_asr_i_p_rnd_goodsyntax)
+ TmpInst.setOpcode(Hexagon::S2_asr_i_p_rnd);
+ else
+ TmpInst.setOpcode(Hexagon::S5_vasrhrnd);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ const MCExpr *One = MCConstantExpr::create(1, OutContext);
+ const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
+ TmpInst.addOperand(
+ MCOperand::createExpr(HexagonMCExpr::create(Sub, OutContext)));
+ MappedInst = TmpInst;
+ return;
+ }
+ // if ("#u5==0") Assembler mapped to: "Rd=Rs"; else Rd=asr(Rs,#u5-1):rnd
+ case Hexagon::S2_asr_i_r_rnd_goodsyntax: {
+ MCOperand &MO = Inst.getOperand(2);
+ MCExpr const *Expr = MO.getExpr();
+ int64_t Imm;
+ bool Success = Expr->evaluateAsAbsolute(Imm);
+ assert (Success && "Expected immediate and none was found");
+ (void)Success;
+ MCInst TmpInst;
+ if (Imm == 0) {
+ TmpInst.setOpcode(Hexagon::A2_tfr);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ MappedInst = TmpInst;
+ return;
+ }
+ TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ const MCExpr *One = MCConstantExpr::create(1, OutContext);
+ const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
+ TmpInst.addOperand(
+ MCOperand::createExpr(HexagonMCExpr::create(Sub, OutContext)));
+ MappedInst = TmpInst;
+ return;
+ }
+
+ // Translate a "$Rdd = #imm" to "$Rdd = combine(#[-1,0], #imm)"
+ case Hexagon::A2_tfrpi: {
+ MCInst TmpInst;
+ MCOperand &Rdd = MappedInst.getOperand(0);
+ MCOperand &MO = MappedInst.getOperand(1);
+
+ TmpInst.setOpcode(Hexagon::A2_combineii);
+ TmpInst.addOperand(Rdd);
+ int64_t Imm;
+ bool Success = MO.getExpr()->evaluateAsAbsolute(Imm);
+ // The first immediate is -1 for a known-negative value, and 0 otherwise
+ // (including when the value cannot be evaluated here).
+ if (Success && Imm < 0) {
+ const MCExpr *MOne = MCConstantExpr::create(-1, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(MOne, OutContext)));
+ } else {
+ const MCExpr *Zero = MCConstantExpr::create(0, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(Zero, OutContext)));
+ }
+ TmpInst.addOperand(MO);
+ MappedInst = TmpInst;
+ return;
+ }
+ // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)"
+ case Hexagon::A2_tfrp: {
+ MCOperand &MO = MappedInst.getOperand(1);
+ unsigned High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
+ unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
+ MO.setReg(High);
+ // Add a new operand for the second register in the pair.
+ MappedInst.addOperand(MCOperand::createReg(Low));
+ MappedInst.setOpcode(Hexagon::A2_combinew);
+ return;
+ }
+
+ // Same pair-splitting as A2_tfrp, for the forms whose source pair is
+ // operand 2; they map to C2_ccombinewt / C2_ccombinewf.
+ case Hexagon::A2_tfrpt:
+ case Hexagon::A2_tfrpf: {
+ MCOperand &MO = MappedInst.getOperand(2);
+ unsigned High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
+ unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
+ MO.setReg(High);
+ // Add a new operand for the second register in the pair.
+ MappedInst.addOperand(MCOperand::createReg(Low));
+ MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt)
+ ? Hexagon::C2_ccombinewt
+ : Hexagon::C2_ccombinewf);
+ return;
+ }
+ case Hexagon::A2_tfrptnew:
+ case Hexagon::A2_tfrpfnew: {
+ MCOperand &MO = MappedInst.getOperand(2);
+ unsigned High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
+ unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
+ MO.setReg(High);
+ // Add a new operand for the second register in the pair.
+ MappedInst.addOperand(MCOperand::createReg(Low));
+ MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
+ ? Hexagon::C2_ccombinewnewt
+ : Hexagon::C2_ccombinewnewf);
+ return;
+ }
+
+ // Values in (-256, 0) select M2_mpysin with the immediate negated; every
+ // other value selects M2_mpysip.
+ case Hexagon::M2_mpysmi: {
+ MCOperand &Imm = MappedInst.getOperand(2);
+ MCExpr const *Expr = Imm.getExpr();
+ int64_t Value;
+ bool Success = Expr->evaluateAsAbsolute(Value);
+ assert(Success);
+ (void)Success;
+ if (Value < 0 && Value > -256) {
+ MappedInst.setOpcode(Hexagon::M2_mpysin);
+ Imm.setExpr(HexagonMCExpr::create(
+ MCUnaryExpr::createMinus(Expr, OutContext), OutContext));
+ } else
+ MappedInst.setOpcode(Hexagon::M2_mpysip);
+ return;
+ }
+
+ case Hexagon::A2_addsp: {
+ MCOperand &Rt = Inst.getOperand(1);
+ assert (Rt.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rt.getReg());
+ if (Reg & 1)
+ MappedInst.setOpcode(Hexagon::A2_addsph);
+ else
+ MappedInst.setOpcode(Hexagon::A2_addspl);
+ Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+ return;
+ }
+ // V6_vd0 is emitted as V6_vxor with the destination register used for all
+ // three operands.
+ case Hexagon::V6_vd0:
+ case Hexagon::V6_vd0_128B: {
+ MCInst TmpInst;
+ assert (Inst.getOperand(0).isReg() &&
+ "Expected register and none was found");
+
+ TmpInst.setOpcode(Hexagon::V6_vxor);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(0));
+ MappedInst = TmpInst;
+ return;
+ }
+
+ }
+}
+
+
+/// EmitInstruction - Lower \p MI into a Hexagon MC bundle and emit it.
+///
+/// When \p MI is a bundle header, every instruction inside the bundle is
+/// lowered into the same MC bundle, except DBG_VALUE and IMPLICIT_DEF, which
+/// are skipped. Otherwise \p MI alone is lowered. The packet is then
+/// canonicalized and handed to the streamer; nothing is emitted if the
+/// resulting bundle is empty.
+///
+void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MCInst MCB = HexagonMCInstrInfo::createBundle();
+  const MCInstrInfo &MCII = *Subtarget->getInstrInfo();
+
+  if (MI->isBundle()) {
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_instr_iterator MII = MI->getIterator();
+
+    // Step past the bundle header, then visit the bundled instructions.
+    for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII) {
+      if (MII->getOpcode() == TargetOpcode::DBG_VALUE ||
+          MII->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+        continue;
+      HexagonLowerToMC(MCII, &*MII, MCB, *this);
+    }
+  } else {
+    HexagonLowerToMC(MCII, MI, MCB, *this);
+  }
+
+  bool Ok = HexagonMCInstrInfo::canonicalizePacket(
+      MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr);
+  assert(Ok);
+  (void)Ok;
+  if (HexagonMCInstrInfo::bundleSize(MCB) == 0)
+    return;
+  OutStreamer->EmitInstruction(MCB, getSubtargetInfo());
+}
+
+// C-linkage entry point that registers HexagonAsmPrinter as the assembly
+// printer for the Hexagon target.
+extern "C" void LLVMInitializeHexagonAsmPrinter() {
+ RegisterAsmPrinter<HexagonAsmPrinter> X(getTheHexagonTarget());
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
new file mode 100644
index 000000000000..775da03e0f8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -0,0 +1,62 @@
+//===-- HexagonAsmPrinter.h - Print machine code to a Hexagon .s file -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hexagon Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+ /// Assembly printer for the Hexagon target: lowers machine instructions to
+ /// MC instructions and prints inline asm operands.
+ class HexagonAsmPrinter : public AsmPrinter {
+   /// Subtarget of the function currently being printed. Null until
+   /// runOnMachineFunction() has been called.
+   const HexagonSubtarget *Subtarget;
+
+ public:
+   explicit HexagonAsmPrinter(TargetMachine &TM,
+                              std::unique_ptr<MCStreamer> Streamer);
+
+   /// Record the function's subtarget, then run the generic printer.
+   bool runOnMachineFunction(MachineFunction &Fn) override {
+     Subtarget = &Fn.getSubtarget<HexagonSubtarget>();
+     return AsmPrinter::runOnMachineFunction(Fn);
+   }
+
+   StringRef getPassName() const override {
+     return "Hexagon Assembly Printer";
+   }
+
+   bool isBlockOnlyReachableByFallthrough(
+       const MachineBasicBlock *MBB) const override;
+
+   void EmitInstruction(const MachineInstr *MI) override;
+
+   /// Rewrite \p Inst in place if its opcode is a mapped/pseudo opcode.
+   /// \p MI is the machine instruction that \p Inst was lowered from.
+   void HexagonProcessInstruction(MCInst &Inst,
+                                  const MachineInstr &MI);
+
+
+   void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+   /// Inline asm hooks. Both return true on failure (e.g. an unknown
+   /// modifier in \p ExtraCode) and false on success.
+   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                        unsigned AsmVariant, const char *ExtraCode,
+                        raw_ostream &OS) override;
+   bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                              unsigned AsmVariant, const char *ExtraCode,
+                              raw_ostream &OS) override;
+
+   static const char *getRegisterName(unsigned RegNo);
+ };
+
+} // end of llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
new file mode 100644
index 000000000000..fe7278fde1b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -0,0 +1,2881 @@
+//===--- HexagonBitSimplify.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexbit"
+
+#include "HexagonBitTracker.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+// Hidden command-line switch "hexbit-keep-tied", on by default.
+// NOTE(review): its readers are outside the portion of the file reviewed
+// here - presumably hasTiedUse(); confirm.
+static cl::opt<bool> PreserveTiedOps("hexbit-keep-tied", cl::Hidden,
+ cl::init(true), cl::desc("Preserve subregisters in tied operands"));
+
+namespace llvm {
+
+ // Pass registration hook (called from the pass constructor below) and the
+ // factory function for the pass.
+ void initializeHexagonBitSimplifyPass(PassRegistry& Registry);
+ FunctionPass *createHexagonBitSimplify();
+
+} // end namespace llvm
+
+namespace {
+
+ // A set of virtual registers backed by a BitVector. Bit positions are
+ // virtual register indices; the public interface speaks register numbers.
+ // find_first/find_next return 0 to signal "no (more) registers".
+ struct RegisterSet : private BitVector {
+   RegisterSet() = default;
+   explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {}
+   RegisterSet(const RegisterSet &RS) = default;
+
+   using BitVector::clear;
+   using BitVector::count;
+
+   unsigned find_first() const {
+     const int Pos = BitVector::find_first();
+     return Pos < 0 ? 0 : x2v(Pos);
+   }
+
+   unsigned find_next(unsigned Prev) const {
+     const int Pos = BitVector::find_next(v2x(Prev));
+     return Pos < 0 ? 0 : x2v(Pos);
+   }
+
+   // Add a single register, growing the underlying vector on demand.
+   RegisterSet &insert(unsigned R) {
+     const unsigned Pos = v2x(R);
+     ensure(Pos);
+     BitVector::set(Pos);
+     return *this;
+   }
+   // Drop a single register; registers beyond the current size are absent
+   // by definition, so nothing needs to happen for them.
+   RegisterSet &remove(unsigned R) {
+     const unsigned Pos = v2x(R);
+     if (Pos < size())
+       BitVector::reset(Pos);
+     return *this;
+   }
+
+   // Set union and set difference.
+   RegisterSet &insert(const RegisterSet &Rs) {
+     BitVector::operator|=(Rs);
+     return *this;
+   }
+   RegisterSet &remove(const RegisterSet &Rs) {
+     BitVector::reset(Rs);
+     return *this;
+   }
+
+   // Mutable access grows the set as needed; const access requires the
+   // register to be within the current size.
+   reference operator[](unsigned R) {
+     const unsigned Pos = v2x(R);
+     ensure(Pos);
+     return BitVector::operator[](Pos);
+   }
+   bool operator[](unsigned R) const {
+     const unsigned Pos = v2x(R);
+     assert(Pos < size());
+     return BitVector::operator[](Pos);
+   }
+   // Membership test that tolerates registers beyond the current size.
+   bool has(unsigned R) const {
+     const unsigned Pos = v2x(R);
+     return Pos < size() && BitVector::test(Pos);
+   }
+
+   bool empty() const {
+     return !BitVector::any();
+   }
+   bool includes(const RegisterSet &Rs) const {
+     // A.BitVector::test(B) <=> A-B != {}
+     // so *this includes Rs exactly when Rs - *this is empty.
+     return !Rs.BitVector::test(*this);
+   }
+   bool intersects(const RegisterSet &Rs) const {
+     return BitVector::anyCommon(Rs);
+   }
+
+ private:
+   // Make sure bit Idx exists; the vector never shrinks below 32 bits here.
+   void ensure(unsigned Idx) {
+     if (Idx >= size())
+       resize(std::max(Idx+1, 32U));
+   }
+
+   // Virtual register number <-> bit index conversions.
+   static inline unsigned v2x(unsigned v) {
+     return TargetRegisterInfo::virtReg2Index(v);
+   }
+
+   static inline unsigned x2v(unsigned x) {
+     return TargetRegisterInfo::index2VirtReg(x);
+   }
+ };
+
+ // Stream adaptor: wraps a RegisterSet together with the register info that
+ // the operator<< declared below passes to PrintReg. It holds a reference to
+ // the set, so the set must outlive the adaptor.
+ struct PrintRegSet {
+ PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI)
+ : RS(S), TRI(RI) {}
+
+ friend raw_ostream &operator<< (raw_ostream &OS,
+ const PrintRegSet &P);
+
+ private:
+ const RegisterSet &RS;
+ const TargetRegisterInfo *TRI;
+ };
+
+ // Print the set as "{ reg reg ... }". The separate declaration exists only
+ // to attach LLVM_ATTRIBUTE_UNUSED to the function.
+ raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P)
+     LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) {
+   OS << '{';
+   // find_first/find_next yield 0 once the set is exhausted.
+   unsigned Reg = P.RS.find_first();
+   while (Reg) {
+     OS << ' ' << PrintReg(Reg, P.TRI);
+     Reg = P.RS.find_next(Reg);
+   }
+   OS << " }";
+   return OS;
+ }
+
+ class Transformation;
+
+ // Machine function pass "Hexagon bit simplification". It requires and
+ // preserves the machine dominator tree. Apart from the pass entry points
+ // the class is a collection of static helper routines.
+ class HexagonBitSimplify : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ HexagonBitSimplify() : MachineFunctionPass(ID), MDT(nullptr) {
+ initializeHexagonBitSimplifyPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Hexagon bit simplification";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // Add the virtual registers defined (resp. used) by MI to the given set.
+ static void getInstrDefs(const MachineInstr &MI, RegisterSet &Defs);
+ static void getInstrUses(const MachineInstr &MI, RegisterSet &Uses);
+ // Compare W bits of RC1 starting at B1 with W bits of RC2 starting at B2.
+ static bool isEqual(const BitTracker::RegisterCell &RC1, uint16_t B1,
+ const BitTracker::RegisterCell &RC2, uint16_t B2, uint16_t W);
+ // Check that the W bits of RC starting at B are all 0.
+ static bool isZero(const BitTracker::RegisterCell &RC, uint16_t B,
+ uint16_t W);
+ // If the W bits of RC starting at B are all constant, store their value
+ // in U and return true.
+ static bool getConst(const BitTracker::RegisterCell &RC, uint16_t B,
+ uint16_t W, uint64_t &U);
+ // Redirect the uses of OldR to NewR (optionally with subregister NewSR).
+ // Only virtual registers are handled; the return value tells whether any
+ // use was found.
+ static bool replaceReg(unsigned OldR, unsigned NewR,
+ MachineRegisterInfo &MRI);
+ static bool getSubregMask(const BitTracker::RegisterRef &RR,
+ unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI);
+ static bool replaceRegWithSub(unsigned OldR, unsigned NewR,
+ unsigned NewSR, MachineRegisterInfo &MRI);
+ static bool replaceSubWithSub(unsigned OldR, unsigned OldSR,
+ unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI);
+ static bool parseRegSequence(const MachineInstr &I,
+ BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH,
+ const MachineRegisterInfo &MRI);
+
+ static bool getUsedBitsInStore(unsigned Opc, BitVector &Bits,
+ uint16_t Begin);
+ static bool getUsedBits(unsigned Opc, unsigned OpN, BitVector &Bits,
+ uint16_t Begin, const HexagonInstrInfo &HII);
+
+ static const TargetRegisterClass *getFinalVRegClass(
+ const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI);
+ static bool isTransparentCopy(const BitTracker::RegisterRef &RD,
+ const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI);
+
+ private:
+ // Dominator tree used by visitBlock; initialized to null by the
+ // constructor.
+ MachineDominatorTree *MDT;
+
+ // Apply T to B and, recursively, to the blocks B immediately dominates.
+ bool visitBlock(MachineBasicBlock &B, Transformation &T, RegisterSet &AVs);
+ static bool hasTiedUse(unsigned Reg, MachineRegisterInfo &MRI,
+ unsigned NewSub = Hexagon::NoSubRegister);
+ };
+
+ // Pass identification and a short alias for the pass class.
+ char HexagonBitSimplify::ID = 0;
+ using HBS = HexagonBitSimplify;
+
+ // The purpose of this class is to provide a common facility to traverse
+ // the function top-down or bottom-up via the dominator tree, and keep
+ // track of the available registers.
+ //
+ // Subclasses implement processBlock(); its return value is accumulated by
+ // visitBlock() as the "changed" flag. TopDown selects whether a block is
+ // processed before (true) or after (false) the blocks it dominates.
+ class Transformation {
+ public:
+ bool TopDown;
+
+ Transformation(bool TD) : TopDown(TD) {}
+ virtual ~Transformation() = default;
+
+ // B is the block to transform; AVs is the set of registers available on
+ // entry to B as computed by the traversal.
+ virtual bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) = 0;
+ };
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexbit",
+ "Hexagon bit simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit",
+ "Hexagon bit simplification", false, false)
+
+// Apply transformation T to block B and to every block in the dominator
+// subtree below it. AVs holds the registers available on entry to B; each
+// child is visited with AVs extended by the virtual registers defined in B.
+// Returns true if T reported a change for any visited block.
+bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
+      RegisterSet &AVs) {
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  MachineDomTreeNode *Node = MDT->getNode(&B);
+
+  // A top-down transformation sees the block before its children do.
+  bool Changed = false;
+  if (T.TopDown)
+    Changed = T.processBlock(B, AVs);
+
+  // Whatever B defines is available in all blocks that B dominates.
+  RegisterSet BlockDefs;
+  for (auto &MI : B)
+    getInstrDefs(MI, BlockDefs);
+  RegisterSet ChildAVs = AVs;
+  ChildAVs.insert(BlockDefs);
+
+  auto Child = GTN::child_begin(Node);
+  const auto ChildEnd = GTN::child_end(Node);
+  while (Child != ChildEnd) {
+    Changed |= visitBlock(*(*Child)->getBlock(), T, ChildAVs);
+    ++Child;
+  }
+
+  // A bottom-up transformation sees the block after its children.
+  if (!T.TopDown)
+    Changed |= T.processBlock(B, AVs);
+
+  return Changed;
+}
+
+//
+// Utility functions:
+//
+
+// Add every virtual register defined by MI to Defs.
+void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI,
+      RegisterSet &Defs) {
+  for (const MachineOperand &MO : MI.operands()) {
+    const bool IsVirtDef = MO.isReg() && MO.isDef() &&
+        TargetRegisterInfo::isVirtualRegister(MO.getReg());
+    if (IsVirtDef)
+      Defs.insert(MO.getReg());
+  }
+}
+
+// Add every virtual register used by MI to Uses.
+void HexagonBitSimplify::getInstrUses(const MachineInstr &MI,
+      RegisterSet &Uses) {
+  for (const MachineOperand &MO : MI.operands()) {
+    const bool IsVirtUse = MO.isReg() && MO.isUse() &&
+        TargetRegisterInfo::isVirtualRegister(MO.getReg());
+    if (IsVirtUse)
+      Uses.insert(MO.getReg());
+  }
+}
+
+// Check whether the W bits of RC1 starting at bit B1 are equal, position by
+// position, to the W bits of RC2 starting at bit B2.
+bool HexagonBitSimplify::isEqual(const BitTracker::RegisterCell &RC1,
+      uint16_t B1, const BitTracker::RegisterCell &RC2, uint16_t B2,
+      uint16_t W) {
+  // A "bottom" bit (a reference to register 0) cannot be proven equal to
+  // anything, not even to another "bottom" bit.
+  auto IsBottom = [] (const BitTracker::BitValue &V) -> bool {
+    return V.Type == BitTracker::BitValue::Ref && V.RefI.Reg == 0;
+  };
+
+  for (uint16_t Off = 0; Off != W; ++Off) {
+    if (IsBottom(RC1[B1+Off]) || IsBottom(RC2[B2+Off]) ||
+        RC1[B1+Off] != RC2[B2+Off])
+      return false;
+  }
+  return true;
+}
+
+// Return true if each of the W bits of RC starting at position B is known
+// to be 0. The range must lie within RC (asserted).
+bool HexagonBitSimplify::isZero(const BitTracker::RegisterCell &RC,
+      uint16_t B, uint16_t W) {
+  assert(B < RC.width() && B+W <= RC.width());
+  const int End = B+W;
+  uint16_t Pos = B;
+  while (Pos < End) {
+    if (!RC[Pos].is(0))
+      return false;
+    ++Pos;
+  }
+  return true;
+}
+
+// If each of the W bits of RC starting at position B is a known constant
+// (0 or 1), assemble them into an integer with RC[B] as bit 0, store the
+// result in U and return true. Otherwise return false and leave U
+// unchanged. The range must lie within RC (asserted).
+bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC,
+      uint16_t B, uint16_t W, uint64_t &U) {
+  assert(B < RC.width() && B+W <= RC.width());
+  // Accumulate in an unsigned value: the bits are shifted in from the most
+  // significant end, and left-shifting a signed integer into or past its
+  // sign bit is not portable.
+  uint64_t T = 0;
+  for (uint16_t i = B+W; i > B; --i) {
+    const BitTracker::BitValue &BV = RC[i-1];
+    T <<= 1;
+    if (BV.is(1))
+      T |= 1;
+    else if (!BV.is(0))
+      return false;   // Neither 0 nor 1: the bit is not a known constant.
+  }
+  U = T;
+  return true;
+}
+
+// Rewrite every use of the virtual register OldR to refer to NewR instead.
+// Returns false without changing anything if either register is not
+// virtual; otherwise returns true iff OldR had at least one use.
+bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR,
+      MachineRegisterInfo &MRI) {
+  const bool BothVirtual = TargetRegisterInfo::isVirtualRegister(OldR) &&
+                           TargetRegisterInfo::isVirtualRegister(NewR);
+  if (!BothVirtual)
+    return false;
+
+  auto UseI = MRI.use_begin(OldR), UseE = MRI.use_end();
+  const bool HadUses = UseI != UseE;
+  while (UseI != UseE) {
+    // Fetch the successor before setReg() modifies the current operand.
+    auto Cur = UseI;
+    UseI = std::next(Cur);
+    Cur->setReg(NewR);
+  }
+  return HadUses;
+}
+
+// Rewrite every use of the virtual register OldR to refer to NewR:NewSR.
+// Nothing is changed (and false is returned) if either register is not
+// virtual, or if hasTiedUse() reports a conflicting tied use of OldR.
+// Otherwise returns true iff OldR had at least one use.
+bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR,
+      unsigned NewSR, MachineRegisterInfo &MRI) {
+  const bool BothVirtual = TargetRegisterInfo::isVirtualRegister(OldR) &&
+                           TargetRegisterInfo::isVirtualRegister(NewR);
+  if (!BothVirtual || hasTiedUse(OldR, MRI, NewSR))
+    return false;
+
+  auto UseI = MRI.use_begin(OldR), UseE = MRI.use_end();
+  const bool HadUses = UseI != UseE;
+  while (UseI != UseE) {
+    // Fetch the successor before setReg() modifies the current operand.
+    auto Cur = UseI;
+    UseI = std::next(Cur);
+    Cur->setReg(NewR);
+    Cur->setSubReg(NewSR);
+  }
+  return HadUses;
+}
+
+// In every use of OldR that carries the subregister index OldSR, replace
+// OldR:OldSR with NewR:NewSR. Uses of OldR with a different subregister
+// index are left alone. Nothing is changed (and false is returned) if
+// either register is not virtual, or if OldSR != NewSR and hasTiedUse()
+// reports a conflicting tied use of OldR.
+// NOTE: the result is true whenever OldR had any use at all, including the
+// case where none of the uses matched OldSR.
+bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR,
+      unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) {
+  const bool BothVirtual = TargetRegisterInfo::isVirtualRegister(OldR) &&
+                           TargetRegisterInfo::isVirtualRegister(NewR);
+  if (!BothVirtual)
+    return false;
+  if (OldSR != NewSR && hasTiedUse(OldR, MRI, NewSR))
+    return false;
+
+  auto UseI = MRI.use_begin(OldR), UseE = MRI.use_end();
+  const bool HadUses = UseI != UseE;
+  while (UseI != UseE) {
+    // Fetch the successor before setReg() modifies the current operand.
+    auto Cur = UseI;
+    UseI = std::next(Cur);
+    if (Cur->getSubReg() == OldSR) {
+      Cur->setReg(NewR);
+      Cur->setSubReg(NewSR);
+    }
+  }
+  return HadUses;
+}
+
+// For a register reference RR (a Reg:Sub pair), set Begin to the position
+// within Reg of the least significant bit of Sub, and Width to the size of
+// Sub in bits. With no subregister, the whole register is described.
+// Subregisters are only handled for the DoubleRegs, VecDblRegs and
+// VecDblRegs128B classes, where each half is half the size of the class
+// and the "hi" halves (isub_hi, vsub_hi) start at bit Width. For any other
+// class with a subregister the function returns false (Begin is then 0
+// and Width is left untouched).
+bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR,
+      unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI) {
+  const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg);
+  Begin = 0;
+
+  if (RR.Sub == 0) {
+    Width = RC->getSize()*8;
+    return true;
+  }
+
+  const unsigned ClassID = RC->getID();
+  const bool IsPairClass = ClassID == Hexagon::DoubleRegsRegClassID ||
+                           ClassID == Hexagon::VecDblRegsRegClassID ||
+                           ClassID == Hexagon::VecDblRegs128BRegClassID;
+  if (!IsPairClass)
+    return false;
+
+  Width = RC->getSize()*8 / 2;
+  const bool IsHigh = RR.Sub == Hexagon::isub_hi || RR.Sub == Hexagon::vsub_hi;
+  if (IsHigh)
+    Begin = Width;
+  return true;
+}
+
+
+// Decompose the REG_SEQUENCE instruction I into its low part SL and its
+// high part SH. The two subregister indices of I (operands 2 and 4) must
+// be the low and the high subregister index of the destination register
+// class, in either order (asserted). Returns true when SL and SH have
+// been set, false if the indices do not form such a pair.
+bool HexagonBitSimplify::parseRegSequence(const MachineInstr &I,
+      BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH,
+      const MachineRegisterInfo &MRI) {
+  assert(I.getOpcode() == TargetOpcode::REG_SEQUENCE);
+  const unsigned Sub1 = I.getOperand(2).getImm();
+  const unsigned Sub2 = I.getOperand(4).getImm();
+
+  auto *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
+  auto &HRI = static_cast<const HexagonRegisterInfo&>(
+                  *MRI.getTargetRegisterInfo());
+  const unsigned SubLo = HRI.getHexagonSubRegIndex(DstRC, Hexagon::ps_sub_lo);
+  const unsigned SubHi = HRI.getHexagonSubRegIndex(DstRC, Hexagon::ps_sub_hi);
+
+  const bool LoThenHi = Sub1 == SubLo && Sub2 == SubHi;
+  const bool HiThenLo = Sub1 == SubHi && Sub2 == SubLo;
+  assert(LoThenHi || HiThenLo);
+  if (!LoThenHi && !HiThenLo)
+    return false;
+
+  // Operand 1 is the register paired with Sub1, operand 3 the one paired
+  // with Sub2.
+  SL = I.getOperand(LoThenHi ? 1 : 3);
+  SH = I.getOperand(LoThenHi ? 3 : 1);
+  return true;
+}
+
+// All stores (except 64-bit stores) take a 32-bit register as the source
+// of the value to be stored. If the instruction stores into a location
+// that is shorter than 32 bits, some bits of the source register are not
+// used. For each store instruction, calculate the set of used bits in
+// the source register, and set appropriate bits in Bits. Return true if
+// the bits are calculated, false otherwise. Begin is the index in Bits that corresponds to bit 0 of the source register; bits already set in Bits are never cleared.
+bool HexagonBitSimplify::getUsedBitsInStore(unsigned Opc, BitVector &Bits,
+ uint16_t Begin) {
+ using namespace Hexagon; // the opcode enumerators below are written without the Hexagon:: prefix
+
+ switch (Opc) {
+ // Store byte
+ case S2_storerb_io: // memb(Rs32+#s11:0)=Rt32
+ case S2_storerbnew_io: // memb(Rs32+#s11:0)=Nt8.new
+ case S2_pstorerbt_io: // if (Pv4) memb(Rs32+#u6:0)=Rt32
+ case S2_pstorerbf_io: // if (!Pv4) memb(Rs32+#u6:0)=Rt32
+ case S4_pstorerbtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Rt32
+ case S4_pstorerbfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Rt32
+ case S2_pstorerbnewt_io: // if (Pv4) memb(Rs32+#u6:0)=Nt8.new
+ case S2_pstorerbnewf_io: // if (!Pv4) memb(Rs32+#u6:0)=Nt8.new
+ case S4_pstorerbnewtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Nt8.new
+ case S4_pstorerbnewfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Nt8.new
+ case S2_storerb_pi: // memb(Rx32++#s4:0)=Rt32
+ case S2_storerbnew_pi: // memb(Rx32++#s4:0)=Nt8.new
+ case S2_pstorerbt_pi: // if (Pv4) memb(Rx32++#s4:0)=Rt32
+ case S2_pstorerbf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Rt32
+ case S2_pstorerbtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Rt32
+ case S2_pstorerbfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Rt32
+ case S2_pstorerbnewt_pi: // if (Pv4) memb(Rx32++#s4:0)=Nt8.new
+ case S2_pstorerbnewf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Nt8.new
+ case S2_pstorerbnewtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Nt8.new
+ case S2_pstorerbnewfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Nt8.new
+ case S4_storerb_ap: // memb(Re32=#U6)=Rt32
+ case S4_storerbnew_ap: // memb(Re32=#U6)=Nt8.new
+ case S2_storerb_pr: // memb(Rx32++Mu2)=Rt32
+ case S2_storerbnew_pr: // memb(Rx32++Mu2)=Nt8.new
+ case S4_storerb_ur: // memb(Ru32<<#u2+#U6)=Rt32
+ case S4_storerbnew_ur: // memb(Ru32<<#u2+#U6)=Nt8.new
+ case S2_storerb_pbr: // memb(Rx32++Mu2:brev)=Rt32
+ case S2_storerbnew_pbr: // memb(Rx32++Mu2:brev)=Nt8.new
+ case S2_storerb_pci: // memb(Rx32++#s4:0:circ(Mu2))=Rt32
+ case S2_storerbnew_pci: // memb(Rx32++#s4:0:circ(Mu2))=Nt8.new
+ case S2_storerb_pcr: // memb(Rx32++I:circ(Mu2))=Rt32
+ case S2_storerbnew_pcr: // memb(Rx32++I:circ(Mu2))=Nt8.new
+ case S4_storerb_rr: // memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_storerbnew_rr: // memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerbt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerbf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerbtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerbfnew_rr: // if (!Pv4.new) memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerbnewt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerbnewf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerbnewtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerbnewfnew_rr: // if (!Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S2_storerbgp: // memb(gp+#u16:0)=Rt32
+ case S2_storerbnewgp: // memb(gp+#u16:0)=Nt8.new
+ case S4_pstorerbt_abs: // if (Pv4) memb(#u6)=Rt32
+ case S4_pstorerbf_abs: // if (!Pv4) memb(#u6)=Rt32
+ case S4_pstorerbtnew_abs: // if (Pv4.new) memb(#u6)=Rt32
+ case S4_pstorerbfnew_abs: // if (!Pv4.new) memb(#u6)=Rt32
+ case S4_pstorerbnewt_abs: // if (Pv4) memb(#u6)=Nt8.new
+ case S4_pstorerbnewf_abs: // if (!Pv4) memb(#u6)=Nt8.new
+ case S4_pstorerbnewtnew_abs: // if (Pv4.new) memb(#u6)=Nt8.new
+ case S4_pstorerbnewfnew_abs: // if (!Pv4.new) memb(#u6)=Nt8.new
+ Bits.set(Begin, Begin+8); // byte stores: mark source bits [0, 8) as used
+ return true;
+
+ // Store low half
+ case S2_storerh_io: // memh(Rs32+#s11:1)=Rt32
+ case S2_storerhnew_io: // memh(Rs32+#s11:1)=Nt8.new
+ case S2_pstorerht_io: // if (Pv4) memh(Rs32+#u6:1)=Rt32
+ case S2_pstorerhf_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt32
+ case S4_pstorerhtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt32
+ case S4_pstorerhfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt32
+ case S2_pstorerhnewt_io: // if (Pv4) memh(Rs32+#u6:1)=Nt8.new
+ case S2_pstorerhnewf_io: // if (!Pv4) memh(Rs32+#u6:1)=Nt8.new
+ case S4_pstorerhnewtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Nt8.new
+ case S4_pstorerhnewfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Nt8.new
+ case S2_storerh_pi: // memh(Rx32++#s4:1)=Rt32
+ case S2_storerhnew_pi: // memh(Rx32++#s4:1)=Nt8.new
+ case S2_pstorerht_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt32
+ case S2_pstorerhf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt32
+ case S2_pstorerhtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt32
+ case S2_pstorerhfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt32
+ case S2_pstorerhnewt_pi: // if (Pv4) memh(Rx32++#s4:1)=Nt8.new
+ case S2_pstorerhnewf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Nt8.new
+ case S2_pstorerhnewtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Nt8.new
+ case S2_pstorerhnewfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Nt8.new
+ case S4_storerh_ap: // memh(Re32=#U6)=Rt32
+ case S4_storerhnew_ap: // memh(Re32=#U6)=Nt8.new
+ case S2_storerh_pr: // memh(Rx32++Mu2)=Rt32
+ case S2_storerhnew_pr: // memh(Rx32++Mu2)=Nt8.new
+ case S4_storerh_ur: // memh(Ru32<<#u2+#U6)=Rt32
+ case S4_storerhnew_ur: // memh(Ru32<<#u2+#U6)=Nt8.new
+ case S2_storerh_pbr: // memh(Rx32++Mu2:brev)=Rt32
+ case S2_storerhnew_pbr: // memh(Rx32++Mu2:brev)=Nt8.new
+ case S2_storerh_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt32
+ case S2_storerhnew_pci: // memh(Rx32++#s4:1:circ(Mu2))=Nt8.new
+ case S2_storerh_pcr: // memh(Rx32++I:circ(Mu2))=Rt32
+ case S2_storerhnew_pcr: // memh(Rx32++I:circ(Mu2))=Nt8.new
+ case S4_storerh_rr: // memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerht_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerhf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerhtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerhfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_storerhnew_rr: // memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerhnewt_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerhnewf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerhnewtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerhnewfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S2_storerhgp: // memh(gp+#u16:1)=Rt32
+ case S2_storerhnewgp: // memh(gp+#u16:1)=Nt8.new
+ case S4_pstorerht_abs: // if (Pv4) memh(#u6)=Rt32
+ case S4_pstorerhf_abs: // if (!Pv4) memh(#u6)=Rt32
+ case S4_pstorerhtnew_abs: // if (Pv4.new) memh(#u6)=Rt32
+ case S4_pstorerhfnew_abs: // if (!Pv4.new) memh(#u6)=Rt32
+ case S4_pstorerhnewt_abs: // if (Pv4) memh(#u6)=Nt8.new
+ case S4_pstorerhnewf_abs: // if (!Pv4) memh(#u6)=Nt8.new
+ case S4_pstorerhnewtnew_abs: // if (Pv4.new) memh(#u6)=Nt8.new
+ case S4_pstorerhnewfnew_abs: // if (!Pv4.new) memh(#u6)=Nt8.new
+ Bits.set(Begin, Begin+16); // low-halfword stores: mark source bits [0, 16) as used
+ return true;
+
+ // Store high half
+ case S2_storerf_io: // memh(Rs32+#s11:1)=Rt.H32
+ case S2_pstorerft_io: // if (Pv4) memh(Rs32+#u6:1)=Rt.H32
+ case S2_pstorerff_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt.H32
+ case S4_pstorerftnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt.H32
+ case S4_pstorerffnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt.H32
+ case S2_storerf_pi: // memh(Rx32++#s4:1)=Rt.H32
+ case S2_pstorerft_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt.H32
+ case S2_pstorerff_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt.H32
+ case S2_pstorerftnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt.H32
+ case S2_pstorerffnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt.H32
+ case S4_storerf_ap: // memh(Re32=#U6)=Rt.H32
+ case S2_storerf_pr: // memh(Rx32++Mu2)=Rt.H32
+ case S4_storerf_ur: // memh(Ru32<<#u2+#U6)=Rt.H32
+ case S2_storerf_pbr: // memh(Rx32++Mu2:brev)=Rt.H32
+ case S2_storerf_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt.H32
+ case S2_storerf_pcr: // memh(Rx32++I:circ(Mu2))=Rt.H32
+ case S4_storerf_rr: // memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S4_pstorerft_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S4_pstorerff_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S4_pstorerftnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S4_pstorerffnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S2_storerfgp: // memh(gp+#u16:1)=Rt.H32
+ case S4_pstorerft_abs: // if (Pv4) memh(#u6)=Rt.H32
+ case S4_pstorerff_abs: // if (!Pv4) memh(#u6)=Rt.H32
+ case S4_pstorerftnew_abs: // if (Pv4.new) memh(#u6)=Rt.H32
+ case S4_pstorerffnew_abs: // if (!Pv4.new) memh(#u6)=Rt.H32
+ Bits.set(Begin+16, Begin+32); // high-halfword stores: mark source bits [16, 32) as used
+ return true;
+ }
+
+ return false; // opcode not listed above: Bits is left untouched
+}
+
+// For an instruction with opcode Opc, calculate the set of bits that it
+// uses in a register in operand OpN. This only calculates the set of used
+// bits for cases where it does not depend on any operands (as is the case
+// in shifts, for example). For concrete instructions from a program, the
+// operand may be a subregister of a larger register, while Bits would
+// correspond to the larger register in its entirety. Because of that,
+// the parameter Begin can be used to indicate which bit of Bits should be
+// considered the LSB of the operand.
+//
+// Returns true if the used bits were determined, in which case they have
+// been set in Bits (bits already set in Bits are never cleared). Returns
+// false if nothing is known about this opcode/operand combination; Bits
+// is then left untouched.
+//
+// For opcodes that may store, only the last operand is examined, and the
+// answer is delegated to getUsedBitsInStore.
+bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN,
+      BitVector &Bits, uint16_t Begin, const HexagonInstrInfo &HII) {
+  using namespace Hexagon;
+
+  const MCInstrDesc &D = HII.get(Opc);
+  if (D.mayStore()) {
+    if (OpN == D.getNumOperands()-1)
+      return getUsedBitsInStore(Opc, Bits, Begin);
+    return false;
+  }
+
+  switch (Opc) {
+    // One register source. Used bits: R1[0-7].
+    case A2_sxtb:
+    case A2_zxtb:
+    case A4_cmpbeqi:
+    case A4_cmpbgti:
+    case A4_cmpbgtui:
+      if (OpN == 1) {
+        Bits.set(Begin, Begin+8);
+        return true;
+      }
+      break;
+
+    // One register source. Used bits: R1[0-15].
+    case A2_aslh:
+    case A2_sxth:
+    case A2_zxth:
+    case A4_cmpheqi:
+    case A4_cmphgti:
+    case A4_cmphgtui:
+      if (OpN == 1) {
+        Bits.set(Begin, Begin+16);
+        return true;
+      }
+      break;
+
+    // One register source. Used bits: R1[16-31].
+    case A2_asrh:
+      if (OpN == 1) {
+        Bits.set(Begin+16, Begin+32);
+        return true;
+      }
+      break;
+
+    // Two register sources. Used bits: R1[0-7], R2[0-7].
+    // Both sources are handled, the same way the halfword compares below
+    // handle both of theirs.
+    case A4_cmpbeq:
+    case A4_cmpbgt:
+    case A4_cmpbgtu:
+      if (OpN == 1 || OpN == 2) {
+        Bits.set(Begin, Begin+8);
+        return true;
+      }
+      break;
+
+    // Two register sources. Used bits: R1[0-15], R2[0-15].
+    case A4_cmpheq:
+    case A4_cmphgt:
+    case A4_cmphgtu:
+    case A2_addh_h16_ll:
+    case A2_addh_h16_sat_ll:
+    case A2_addh_l16_ll:
+    case A2_addh_l16_sat_ll:
+    case A2_combine_ll:
+    case A2_subh_h16_ll:
+    case A2_subh_h16_sat_ll:
+    case A2_subh_l16_ll:
+    case A2_subh_l16_sat_ll:
+    case M2_mpy_acc_ll_s0:
+    case M2_mpy_acc_ll_s1:
+    case M2_mpy_acc_sat_ll_s0:
+    case M2_mpy_acc_sat_ll_s1:
+    case M2_mpy_ll_s0:
+    case M2_mpy_ll_s1:
+    case M2_mpy_nac_ll_s0:
+    case M2_mpy_nac_ll_s1:
+    case M2_mpy_nac_sat_ll_s0:
+    case M2_mpy_nac_sat_ll_s1:
+    case M2_mpy_rnd_ll_s0:
+    case M2_mpy_rnd_ll_s1:
+    case M2_mpy_sat_ll_s0:
+    case M2_mpy_sat_ll_s1:
+    case M2_mpy_sat_rnd_ll_s0:
+    case M2_mpy_sat_rnd_ll_s1:
+    case M2_mpyd_acc_ll_s0:
+    case M2_mpyd_acc_ll_s1:
+    case M2_mpyd_ll_s0:
+    case M2_mpyd_ll_s1:
+    case M2_mpyd_nac_ll_s0:
+    case M2_mpyd_nac_ll_s1:
+    case M2_mpyd_rnd_ll_s0:
+    case M2_mpyd_rnd_ll_s1:
+    case M2_mpyu_acc_ll_s0:
+    case M2_mpyu_acc_ll_s1:
+    case M2_mpyu_ll_s0:
+    case M2_mpyu_ll_s1:
+    case M2_mpyu_nac_ll_s0:
+    case M2_mpyu_nac_ll_s1:
+    case M2_mpyud_acc_ll_s0:
+    case M2_mpyud_acc_ll_s1:
+    case M2_mpyud_ll_s0:
+    case M2_mpyud_ll_s1:
+    case M2_mpyud_nac_ll_s0:
+    case M2_mpyud_nac_ll_s1:
+      if (OpN == 1 || OpN == 2) {
+        Bits.set(Begin, Begin+16);
+        return true;
+      }
+      break;
+
+    // Two register sources. Used bits: R1[0-15], R2[16-31].
+    case A2_addh_h16_lh:
+    case A2_addh_h16_sat_lh:
+    case A2_combine_lh:
+    case A2_subh_h16_lh:
+    case A2_subh_h16_sat_lh:
+    case M2_mpy_acc_lh_s0:
+    case M2_mpy_acc_lh_s1:
+    case M2_mpy_acc_sat_lh_s0:
+    case M2_mpy_acc_sat_lh_s1:
+    case M2_mpy_lh_s0:
+    case M2_mpy_lh_s1:
+    case M2_mpy_nac_lh_s0:
+    case M2_mpy_nac_lh_s1:
+    case M2_mpy_nac_sat_lh_s0:
+    case M2_mpy_nac_sat_lh_s1:
+    case M2_mpy_rnd_lh_s0:
+    case M2_mpy_rnd_lh_s1:
+    case M2_mpy_sat_lh_s0:
+    case M2_mpy_sat_lh_s1:
+    case M2_mpy_sat_rnd_lh_s0:
+    case M2_mpy_sat_rnd_lh_s1:
+    case M2_mpyd_acc_lh_s0:
+    case M2_mpyd_acc_lh_s1:
+    case M2_mpyd_lh_s0:
+    case M2_mpyd_lh_s1:
+    case M2_mpyd_nac_lh_s0:
+    case M2_mpyd_nac_lh_s1:
+    case M2_mpyd_rnd_lh_s0:
+    case M2_mpyd_rnd_lh_s1:
+    case M2_mpyu_acc_lh_s0:
+    case M2_mpyu_acc_lh_s1:
+    case M2_mpyu_lh_s0:
+    case M2_mpyu_lh_s1:
+    case M2_mpyu_nac_lh_s0:
+    case M2_mpyu_nac_lh_s1:
+    case M2_mpyud_acc_lh_s0:
+    case M2_mpyud_acc_lh_s1:
+    case M2_mpyud_lh_s0:
+    case M2_mpyud_lh_s1:
+    case M2_mpyud_nac_lh_s0:
+    case M2_mpyud_nac_lh_s1:
+    // These four are actually LH.
+    case A2_addh_l16_hl:
+    case A2_addh_l16_sat_hl:
+    case A2_subh_l16_hl:
+    case A2_subh_l16_sat_hl:
+      if (OpN == 1) {
+        Bits.set(Begin, Begin+16);
+        return true;
+      }
+      if (OpN == 2) {
+        Bits.set(Begin+16, Begin+32);
+        return true;
+      }
+      break;
+
+    // Two register sources, used bits: R1[16-31], R2[0-15].
+    case A2_addh_h16_hl:
+    case A2_addh_h16_sat_hl:
+    case A2_combine_hl:
+    case A2_subh_h16_hl:
+    case A2_subh_h16_sat_hl:
+    case M2_mpy_acc_hl_s0:
+    case M2_mpy_acc_hl_s1:
+    case M2_mpy_acc_sat_hl_s0:
+    case M2_mpy_acc_sat_hl_s1:
+    case M2_mpy_hl_s0:
+    case M2_mpy_hl_s1:
+    case M2_mpy_nac_hl_s0:
+    case M2_mpy_nac_hl_s1:
+    case M2_mpy_nac_sat_hl_s0:
+    case M2_mpy_nac_sat_hl_s1:
+    case M2_mpy_rnd_hl_s0:
+    case M2_mpy_rnd_hl_s1:
+    case M2_mpy_sat_hl_s0:
+    case M2_mpy_sat_hl_s1:
+    case M2_mpy_sat_rnd_hl_s0:
+    case M2_mpy_sat_rnd_hl_s1:
+    case M2_mpyd_acc_hl_s0:
+    case M2_mpyd_acc_hl_s1:
+    case M2_mpyd_hl_s0:
+    case M2_mpyd_hl_s1:
+    case M2_mpyd_nac_hl_s0:
+    case M2_mpyd_nac_hl_s1:
+    case M2_mpyd_rnd_hl_s0:
+    case M2_mpyd_rnd_hl_s1:
+    case M2_mpyu_acc_hl_s0:
+    case M2_mpyu_acc_hl_s1:
+    case M2_mpyu_hl_s0:
+    case M2_mpyu_hl_s1:
+    case M2_mpyu_nac_hl_s0:
+    case M2_mpyu_nac_hl_s1:
+    case M2_mpyud_acc_hl_s0:
+    case M2_mpyud_acc_hl_s1:
+    case M2_mpyud_hl_s0:
+    case M2_mpyud_hl_s1:
+    case M2_mpyud_nac_hl_s0:
+    case M2_mpyud_nac_hl_s1:
+      if (OpN == 1) {
+        Bits.set(Begin+16, Begin+32);
+        return true;
+      }
+      if (OpN == 2) {
+        Bits.set(Begin, Begin+16);
+        return true;
+      }
+      break;
+
+    // Two register sources, used bits: R1[16-31], R2[16-31].
+    case A2_addh_h16_hh:
+    case A2_addh_h16_sat_hh:
+    case A2_combine_hh:
+    case A2_subh_h16_hh:
+    case A2_subh_h16_sat_hh:
+    case M2_mpy_acc_hh_s0:
+    case M2_mpy_acc_hh_s1:
+    case M2_mpy_acc_sat_hh_s0:
+    case M2_mpy_acc_sat_hh_s1:
+    case M2_mpy_hh_s0:
+    case M2_mpy_hh_s1:
+    case M2_mpy_nac_hh_s0:
+    case M2_mpy_nac_hh_s1:
+    case M2_mpy_nac_sat_hh_s0:
+    case M2_mpy_nac_sat_hh_s1:
+    case M2_mpy_rnd_hh_s0:
+    case M2_mpy_rnd_hh_s1:
+    case M2_mpy_sat_hh_s0:
+    case M2_mpy_sat_hh_s1:
+    case M2_mpy_sat_rnd_hh_s0:
+    case M2_mpy_sat_rnd_hh_s1:
+    case M2_mpyd_acc_hh_s0:
+    case M2_mpyd_acc_hh_s1:
+    case M2_mpyd_hh_s0:
+    case M2_mpyd_hh_s1:
+    case M2_mpyd_nac_hh_s0:
+    case M2_mpyd_nac_hh_s1:
+    case M2_mpyd_rnd_hh_s0:
+    case M2_mpyd_rnd_hh_s1:
+    case M2_mpyu_acc_hh_s0:
+    case M2_mpyu_acc_hh_s1:
+    case M2_mpyu_hh_s0:
+    case M2_mpyu_hh_s1:
+    case M2_mpyu_nac_hh_s0:
+    case M2_mpyu_nac_hh_s1:
+    case M2_mpyud_acc_hh_s0:
+    case M2_mpyud_acc_hh_s1:
+    case M2_mpyud_hh_s0:
+    case M2_mpyud_hh_s1:
+    case M2_mpyud_nac_hh_s0:
+    case M2_mpyud_nac_hh_s1:
+      if (OpN == 1 || OpN == 2) {
+        Bits.set(Begin+16, Begin+32);
+        return true;
+      }
+      break;
+  }
+
+  // Unknown opcode, or an operand index not covered above.
+  return false;
+}
+
+// Return the register class that corresponds to the reference RR, i.e. to
+// Reg:Sub. Without a subregister this is the class of Reg itself. With
+// one, Reg must belong to one of the three register-pair classes handled
+// below, and the result is the class of one half of such a pair (for a
+// double register vreg1, vreg1:isub_hi yields the "int" class). Returns
+// nullptr if Reg is not a virtual register, or if it has a subregister
+// and its class is not one of the handled ones.
+const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
+      const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) {
+  if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+    return nullptr;
+  const TargetRegisterClass *RegRC = MRI.getRegClass(RR.Reg);
+  if (RR.Sub == 0)
+    return RegRC;
+
+  const TargetRegisterClass *HalfRC = nullptr;
+  switch (RegRC->getID()) {
+    case Hexagon::DoubleRegsRegClassID:
+      HalfRC = &Hexagon::IntRegsRegClass;
+      break;
+    case Hexagon::VecDblRegsRegClassID:
+      HalfRC = &Hexagon::VectorRegsRegClass;
+      break;
+    case Hexagon::VecDblRegs128BRegClassID:
+      HalfRC = &Hexagon::VectorRegs128BRegClass;
+      break;
+    default:
+      return nullptr;
+  }
+
+  // For the handled classes, the subregister must be the low or the high
+  // half of the pair.
+  auto &HRI = static_cast<const HexagonRegisterInfo&>(
+                  *MRI.getTargetRegisterInfo());
+  (void)HRI;
+  assert(RR.Sub == HRI.getHexagonSubRegIndex(RegRC, Hexagon::ps_sub_lo) ||
+         RR.Sub == HRI.getHexagonSubRegIndex(RegRC, Hexagon::ps_sub_hi));
+  return HalfRC;
+}
+
+// Check whether RD could be replaced with RS at any possible use of RD.
+// This is decided by comparing the register classes that getFinalVRegClass
+// computes for the two references: for example, a predicate register
+// cannot be replaced with an integer register, but a subregister of a
+// 64-bit register can stand in for a 32-bit register. Returns false if
+// either register is not virtual, or if no class can be determined for RD.
+bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD,
+      const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) {
+  const bool BothVirtual = TargetRegisterInfo::isVirtualRegister(RD.Reg) &&
+                           TargetRegisterInfo::isVirtualRegister(RS.Reg);
+  if (!BothVirtual)
+    return false;
+
+  const TargetRegisterClass *DstRC = getFinalVRegClass(RD, MRI);
+  if (DstRC == nullptr)
+    return false;
+  // A null class for RS cannot compare equal to the non-null DstRC.
+  const TargetRegisterClass *SrcRC = getFinalVRegClass(RS, MRI);
+  return SrcRC == DstRC;
+}
+
+// Return true if Reg has a tied use operand whose subregister index
+// differs from NewSub. Always returns false when PreserveTiedOps is off.
+bool HexagonBitSimplify::hasTiedUse(unsigned Reg, MachineRegisterInfo &MRI,
+      unsigned NewSub) {
+  if (!PreserveTiedOps)
+    return false;
+  for (const MachineOperand &Op : MRI.use_operands(Reg)) {
+    if (Op.getSubReg() == NewSub)
+      continue;
+    if (Op.isTied())
+      return true;
+  }
+  return false;
+}
+
+namespace {
+
+  // Dead code elimination over a machine function. run() starts at the
+  // root of the dominator tree; runOnNode() handles one tree node, and
+  // isDead() tells whether a register has any use that keeps it alive.
+  class DeadCodeElimination {
+    // The declaration order matters: the constructor initializes HII
+    // through the already-initialized MF.
+    MachineFunction &MF;
+    const HexagonInstrInfo &HII;
+    MachineDominatorTree &MDT;
+    MachineRegisterInfo &MRI;
+
+    bool isDead(unsigned R) const;
+    bool runOnNode(MachineDomTreeNode *N);
+
+  public:
+    DeadCodeElimination(MachineFunction &mf, MachineDominatorTree &mdt)
+      : MF(mf), HII(*MF.getSubtarget<HexagonSubtarget>().getInstrInfo()),
+        MDT(mdt), MRI(mf.getRegInfo()) {}
+
+    // Process the whole function. Returns true if anything was removed.
+    bool run() {
+      MachineDomTreeNode *Root = MDT.getRootNode();
+      return runOnNode(Root);
+    }
+  };
+
+} // end anonymous namespace
+
+// Return true if register R has no use that keeps it alive. Uses in debug
+// values do not count, and neither does a use in a PHI that itself
+// defines R.
+bool DeadCodeElimination::isDead(unsigned R) const {
+  for (auto UI = MRI.use_begin(R), UE = MRI.use_end(); UI != UE; ++UI) {
+    MachineInstr *User = UI->getParent();
+    if (User->isDebugValue())
+      continue;
+    // The only other use that is tolerated is a PHI feeding R back into
+    // itself.
+    if (!User->isPHI())
+      return false;
+    assert(!User->getOperand(0).getSubReg());
+    if (User->getOperand(0).getReg() != R)
+      return false;
+  }
+  return true;
+}
+
+// Run dead code elimination on the dominator subtree rooted at N: first on
+// the children of N, then on N's own block, visiting its instructions from
+// the last one to the first. An instruction is erased when it is a PHI or
+// is safe to move, and every register it defines is a virtual register for
+// which isDead() holds. Lifetime markers and inline asm are never touched.
+// Returns true if at least one instruction was erased.
+bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  bool Changed = false;
+  for (auto CI = GTN::child_begin(N), CE = GTN::child_end(N); CI != CE; ++CI)
+    if (runOnNode(*CI))
+      Changed = true;
+
+  // Take a snapshot of the block (in reverse order) up front, so that
+  // erasing instructions below does not interfere with the traversal.
+  MachineBasicBlock *B = N->getBlock();
+  std::vector<MachineInstr*> Worklist;
+  for (auto RI = B->rbegin(), RE = B->rend(); RI != RE; ++RI)
+    Worklist.push_back(&*RI);
+
+  for (MachineInstr *MI : Worklist) {
+    // Do not touch lifetime markers. This is why the target-independent DCE
+    // cannot be used.
+    const unsigned Opc = MI->getOpcode();
+    const bool IsLifetime = Opc == TargetOpcode::LIFETIME_START ||
+                            Opc == TargetOpcode::LIFETIME_END;
+    if (IsLifetime || MI->isInlineAsm())
+      continue;
+
+    // PHIs are always candidates for deletion; anything else has to be
+    // safe to move.
+    if (!MI->isPHI()) {
+      bool Store = false;
+      if (!MI->isSafeToMove(nullptr, Store))
+        continue;
+    }
+
+    // Collect the defined registers, giving up on the first one that is
+    // not virtual or is still live.
+    SmallVector<unsigned,2> DeadDefs;
+    bool Removable = true;
+    for (auto &MO : MI->operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      const unsigned DefR = MO.getReg();
+      Removable = TargetRegisterInfo::isVirtualRegister(DefR) && isDead(DefR);
+      if (!Removable)
+        break;
+      DeadDefs.push_back(DefR);
+    }
+    if (!Removable)
+      continue;
+
+    B->erase(MI);
+    for (unsigned DefR : DeadDefs)
+      MRI.markUsesInDebugValueAsUndef(DefR);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+namespace {
+
+  // Eliminate redundant instructions.
+  //
+  // An instruction is redundant when the register it defines is the same
+  // as one of its input registers. Only instructions that define a single
+  // register are considered (which excludes post-increment loads, for
+  // example). The comparison is refined by first calculating which bits of
+  // the output are actually used, and then comparing only those bits with
+  // the input.
+  // When the output matches an input, a COPY from that input is created
+  // and the uses of the output are redirected to it. The copies are left
+  // for another transformation to remove.
+  class RedundantInstrElimination : public Transformation {
+    const HexagonInstrInfo &HII;
+    MachineRegisterInfo &MRI;
+    BitTracker &BT;
+
+    // Lossy shifts: the shifted input is operand OpN of MI, and
+    // [LostB, LostE) receives the range of bit indices that are lost.
+    bool isLossyShiftLeft(const MachineInstr &MI, unsigned OpN,
+          unsigned &LostB, unsigned &LostE);
+    bool isLossyShiftRight(const MachineInstr &MI, unsigned OpN,
+          unsigned &LostB, unsigned &LostE);
+
+    // Used-bit calculation: for all uses of a register, and for a single
+    // operand of a single instruction.
+    bool computeUsedBits(unsigned Reg, BitVector &Bits);
+    bool computeUsedBits(const MachineInstr &MI, unsigned OpN, BitVector &Bits,
+          uint16_t Begin);
+    bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS);
+
+  public:
+    RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii,
+          MachineRegisterInfo &mri)
+      : Transformation(true), HII(hii), MRI(mri), BT(bt) {}
+
+    bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+  };
+
+} // end anonymous namespace
+
+// Check if MI is a shift left by a non-zero immediate whose shifted input
+// is operand OpN. If so, return true and set [LostB, LostE) to the range
+// of bit indices of that input which are shifted out, i.e. the topmost S
+// bits for a shift amount S. Returns false for any other opcode, for any
+// other operand, and for a shift amount of 0.
+bool RedundantInstrElimination::isLossyShiftLeft(const MachineInstr &MI,
+      unsigned OpN, unsigned &LostB, unsigned &LostE) {
+  using namespace Hexagon;
+
+  // ShiftedN is the index of the operand being shifted; in every form
+  // listed here the immediate shift amount is the operand right after it.
+  // Width is the width of the shifted value in bits.
+  unsigned ShiftedN, Width;
+  switch (MI.getOpcode()) {
+    case S2_asl_i_p:
+      ShiftedN = 1;
+      Width = 64;
+      break;
+    case S2_asl_i_p_acc:
+    case S2_asl_i_p_and:
+    case S2_asl_i_p_nac:
+    case S2_asl_i_p_or:
+    case S2_asl_i_p_xacc:
+      ShiftedN = 2;
+      Width = 64;
+      break;
+    case S2_asl_i_r:
+      ShiftedN = 1;
+      Width = 32;
+      break;
+    case S2_addasl_rrri:
+    case S4_andi_asl_ri:
+    case S4_ori_asl_ri:
+    case S4_addi_asl_ri:
+    case S4_subi_asl_ri:
+    case S2_asl_i_r_acc:
+    case S2_asl_i_r_and:
+    case S2_asl_i_r_nac:
+    case S2_asl_i_r_or:
+    case S2_asl_i_r_sat:
+    case S2_asl_i_r_xacc:
+      ShiftedN = 2;
+      Width = 32;
+      break;
+    default:
+      return false;
+  }
+
+  if (OpN != ShiftedN)
+    return false;
+
+  const MachineOperand &AmountOp = MI.getOperand(ShiftedN+1);
+  assert(AmountOp.isImm());
+  const unsigned S = AmountOp.getImm();
+  if (S == 0)
+    return false;
+  LostB = Width-S;
+  LostE = Width;
+  return true;
+}
+
+// Check if MI is a shift right by an immediate whose shifted input is
+// operand OpN. If so, return true and set [LostB, LostE) to the range of
+// bit indices of that input which are shifted out, i.e. [0, S) for a shift
+// amount S. Unlike for the shift left, a shift amount of 0 is reported as
+// well, with an empty range. Returns false for any other opcode or
+// operand.
+bool RedundantInstrElimination::isLossyShiftRight(const MachineInstr &MI,
+      unsigned OpN, unsigned &LostB, unsigned &LostE) {
+  using namespace Hexagon;
+
+  // ShiftedN is the index of the operand being shifted; in every form
+  // listed here the immediate shift amount is the operand right after it.
+  unsigned ShiftedN;
+  switch (MI.getOpcode()) {
+    // Plain shifts: the shifted register is the first input.
+    case S2_asr_i_p:
+    case S2_lsr_i_p:
+    case S2_asr_i_r:
+    case S2_lsr_i_r:
+      ShiftedN = 1;
+      break;
+
+    // Shifts combined with another operation: the shifted register is the
+    // second input.
+    case S2_asr_i_p_acc:
+    case S2_asr_i_p_and:
+    case S2_asr_i_p_nac:
+    case S2_asr_i_p_or:
+    case S2_lsr_i_p_acc:
+    case S2_lsr_i_p_and:
+    case S2_lsr_i_p_nac:
+    case S2_lsr_i_p_or:
+    case S2_lsr_i_p_xacc:
+    case S4_andi_lsr_ri:
+    case S4_ori_lsr_ri:
+    case S4_addi_lsr_ri:
+    case S4_subi_lsr_ri:
+    case S2_asr_i_r_acc:
+    case S2_asr_i_r_and:
+    case S2_asr_i_r_nac:
+    case S2_asr_i_r_or:
+    case S2_lsr_i_r_acc:
+    case S2_lsr_i_r_and:
+    case S2_lsr_i_r_nac:
+    case S2_lsr_i_r_or:
+    case S2_lsr_i_r_xacc:
+      ShiftedN = 2;
+      break;
+
+    default:
+      return false;
+  }
+
+  if (OpN != ShiftedN)
+    return false;
+
+  const MachineOperand &AmountOp = MI.getOperand(ShiftedN+1);
+  assert(AmountOp.isImm());
+  const unsigned S = AmountOp.getImm();
+  LostB = 0;
+  LostE = S;
+  return true;
+}
+
+// Calculate the set of bits of register Reg that are used anywhere. Bits
+// is expected to have as many bits as Reg. Uses in PHIs and COPYs are
+// looked through: the register such an instruction defines is examined in
+// turn (each register only once). For every other user the per-instruction
+// overload decides which bits it reads. If any step fails (an unsupported
+// subregister, a PHI/COPY defining a non-virtual register, or a user whose
+// used bits are unknown) the function returns false and leaves Bits
+// untouched. Otherwise the used bits are or'ed into Bits and true is
+// returned.
+bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) {
+  BitVector Used(Bits.size());
+  RegisterSet Seen;
+  std::vector<unsigned> Worklist(1, Reg);
+
+  // The worklist grows while it is being walked, hence the index loop.
+  for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+    const unsigned R = Worklist[Idx];
+    if (Seen.has(R))
+      continue;
+    Seen.insert(R);
+
+    for (auto UI = MRI.use_begin(R), UE = MRI.use_end(); UI != UE; ++UI) {
+      BitTracker::RegisterRef UR = *UI;
+      unsigned SubBegin, SubWidth;
+      if (!HBS::getSubregMask(UR, SubBegin, SubWidth, MRI))
+        return false;
+
+      MachineInstr &User = *UI->getParent();
+      if (!User.isPHI() && !User.isCopy()) {
+        if (!computeUsedBits(User, UI.getOperandNo(), Used, SubBegin))
+          return false;
+        continue;
+      }
+
+      // PHI or COPY: whatever its result is used for counts as a use of R.
+      const unsigned DefR = User.getOperand(0).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(DefR))
+        return false;
+      Worklist.push_back(DefR);
+    }
+  }
+
+  Bits |= Used;
+  return true;
+}
+
+// Calculate the bits used by instruction MI in a register in operand OpN.
+// Return true/false if the calculation succeeds/fails. If it succeeds, set
+// used bits in Bits. This function does not reset any bits in Bits, so
+// subsequent calls over different instructions will result in the union
+// of the used bits in all these instructions.
+// The register in question may be used with a sub-register, whereas Bits
+// holds the bits for the entire register. To keep track of that, the
+// argument Begin indicates where in Bits is the least-significant bit
+// of the register used in operand OpN. For example, in instruction:
+//   vreg1 = S2_lsr_i_r vreg2:isub_hi, 10
+// the operand 1 is a 32-bit register, which happens to be a subregister
+// of the 64-bit register vreg2, and that subregister starts at position 32.
+// In this case Begin=32, since Bits[32] would be the least-significant bit
+// of vreg2:isub_hi.
+bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI,
+      unsigned OpN, BitVector &Bits, uint16_t Begin) {
+  unsigned Opc = MI.getOpcode();
+  // Work on a scratch vector, so that Bits is only modified on success.
+  BitVector T(Bits.size());
+  bool GotBits = HBS::getUsedBits(Opc, OpN, T, Begin, HII);
+  // Even if we don't have bits yet, we could still provide some information
+  // if the instruction is a lossy shift: the lost bits will be marked as
+  // not used.
+  unsigned LB, LE;
+  if (isLossyShiftLeft(MI, OpN, LB, LE) || isLossyShiftRight(MI, OpN, LB, LE)) {
+    assert(MI.getOperand(OpN).isReg());
+    BitTracker::RegisterRef RR = MI.getOperand(OpN);
+    // getFinalVRegClass returns nullptr for registers it cannot classify.
+    // Without the class the width of the operand is unknown, so the shift
+    // information cannot be applied; keep whatever getUsedBits produced.
+    if (const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI)) {
+      uint16_t Width = RC->getSize()*8;
+
+      // With no other information, start from "all bits of the operand
+      // are used" and then take out the ones that are shifted away.
+      if (!GotBits)
+        T.set(Begin, Begin+Width);
+      assert(LB <= LE && LB < Width && LE <= Width);
+      T.reset(Begin+LB, Begin+LE);
+      GotBits = true;
+    }
+  }
+  if (GotBits)
+    Bits |= T;
+  return GotBits;
+}
+
+// Calculate which bits of RD ("defined register") are used, and check
+// whether these bits are identical in RD and in RS ("used register").
+// Returns false if the subregister of either reference is not supported,
+// if the two references differ in width, or if the used bits of RD cannot
+// be determined.
+bool RedundantInstrElimination::usedBitsEqual(BitTracker::RegisterRef RD,
+      BitTracker::RegisterRef RS) {
+  const BitTracker::RegisterCell &DstCell = BT.lookup(RD.Reg);
+  const BitTracker::RegisterCell &SrcCell = BT.lookup(RS.Reg);
+
+  unsigned DstBegin, DstWidth, SrcBegin, SrcWidth;
+  if (!HBS::getSubregMask(RD, DstBegin, DstWidth, MRI) ||
+      !HBS::getSubregMask(RS, SrcBegin, SrcWidth, MRI))
+    return false;
+  if (DstWidth != SrcWidth)
+    return false;
+
+  BitVector Used(DstCell.width());
+  if (!computeUsedBits(RD.Reg, Used))
+    return false;
+
+  // Bits of RD that nobody reads are free to differ.
+  for (unsigned Off = 0; Off != DstWidth; ++Off) {
+    if (!Used[DstBegin+Off])
+      continue;
+    if (DstCell[DstBegin+Off] != SrcCell[SrcBegin+Off])
+      return false;
+  }
+  return true;
+}
+
+// Look for instructions in B whose result is equal to one of their
+// register inputs, either in all bits or at least in all the bits of the
+// result that are used. For each such instruction a COPY of the matching
+// input into a new register is inserted (in front of the instruction, or
+// after the PHIs if the instruction is a PHI), and the uses of the result
+// are redirected to the new register. The instruction itself is left in
+// place. The register set argument is not used. Returns true if any COPY
+// was created.
+bool RedundantInstrElimination::processBlock(MachineBasicBlock &B,
+      const RegisterSet&) {
+  if (!BT.reached(&B))
+    return false;
+  bool Changed = false;
+
+  // Instructions are only ever inserted in this loop, never erased.
+  for (auto It = B.begin(), End = B.end(); It != End; ++It) {
+    MachineInstr *MI = &*It;
+
+    if (MI->getOpcode() == TargetOpcode::COPY)
+      continue;
+    if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm())
+      continue;
+    // Only instructions with exactly one def are handled.
+    if (MI->getDesc().getNumDefs() != 1)
+      continue;
+
+    BitTracker::RegisterRef RD = MI->getOperand(0);
+    if (!BT.has(RD.Reg))
+      continue;
+    const BitTracker::RegisterCell &DstCell = BT.lookup(RD.Reg);
+    auto InsertPt = MI->isPHI() ? B.getFirstNonPHI()
+                                : MachineBasicBlock::iterator(MI);
+
+    // Find a source operand that is equal to the result.
+    for (auto &SrcOp : MI->uses()) {
+      if (!SrcOp.isReg())
+        continue;
+      BitTracker::RegisterRef RS = SrcOp;
+      if (!BT.has(RS.Reg) || !HBS::isTransparentCopy(RD, RS, MRI))
+        continue;
+
+      unsigned SrcBegin, SrcWidth;
+      if (!HBS::getSubregMask(RS, SrcBegin, SrcWidth, MRI))
+        continue;
+
+      const BitTracker::RegisterCell &SrcCell = BT.lookup(RS.Reg);
+      const bool Matches = usedBitsEqual(RD, RS) ||
+                           HBS::isEqual(DstCell, 0, SrcCell, SrcBegin, SrcWidth);
+      if (!Matches)
+        continue;
+
+      // If found, copy the source into a new register and make the users
+      // of the result read that register instead.
+      const DebugLoc &DL = MI->getDebugLoc();
+      const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
+      unsigned NewR = MRI.createVirtualRegister(FRC);
+      MachineInstr *CopyI =
+          BuildMI(B, InsertPt, DL, HII.get(TargetOpcode::COPY), NewR)
+            .addReg(RS.Reg, 0, RS.Sub);
+      HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+      // This pass can create copies between registers that don't have the
+      // exact same values. Updating the tracker has to involve updating
+      // all dependent cells. Example:
+      //   vreg1 = inst vreg2     ; vreg1 != vreg2, but used bits are equal
+      //
+      //   vreg3 = copy vreg2     ; <- inserted
+      //   ... = vreg3            ; <- replaced from vreg2
+      // Indirectly, we can create a "copy" between vreg1 and vreg2 even
+      // though their exact values do not match.
+      BT.visit(*CopyI);
+      Changed = true;
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+namespace {
+
+// Recognize instructions that produce constant values known at compile-time.
+// Replace them with register definitions that load these constants directly.
+ class ConstGeneration : public Transformation {
+ public:
+ ConstGeneration(BitTracker &bt, const HexagonInstrInfo &hii,
+ MachineRegisterInfo &mri)
+ : Transformation(true), HII(hii), MRI(mri), BT(bt) {}
+
+ bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; // Returns true if any register was replaced in B.
+ static bool isTfrConst(const MachineInstr &MI); // True for opcodes that already materialize a constant; also used by CopyGeneration.
+
+ private:
+ unsigned genTfrConst(const TargetRegisterClass *RC, int64_t C, // Returns the new register holding C,
+ MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL); // or 0 if no instruction was generated.
+
+ const HexagonInstrInfo &HII;
+ MachineRegisterInfo &MRI;
+ BitTracker &BT;
+ };
+
+} // end anonymous namespace
+
+// Returns true if MI is one of the opcodes that this pass treats as a
+// constant transfer (combine-immediates, transfer-immediate, predicate
+// true/false, CONST32/CONST64); false for every other opcode.
+bool ConstGeneration::isTfrConst(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+    case Hexagon::A2_combineii:
+    case Hexagon::A4_combineii:
+    case Hexagon::A2_tfrsi:
+    case Hexagon::A2_tfrpi:
+    case Hexagon::PS_true:
+    case Hexagon::PS_false:
+    case Hexagon::CONST32:
+    case Hexagon::CONST64:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Emit, before At in block B, a transfer-immediate instruction that loads
+// the constant C into a fresh virtual register of class RC. The opcode is
+// picked from the register class and the value of C. Returns the new
+// register, or 0 when nothing was emitted (unsupported register class, or
+// a predicate value that is neither all-zeros nor all-ones in its low byte).
+unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C,
+      MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) {
+  // The register is created up front, as before, even on the paths that
+  // end up returning 0.
+  const unsigned Reg = MRI.createVirtualRegister(RC);
+
+  if (RC == &Hexagon::IntRegsRegClass) {
+    BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg)
+        .addImm(int32_t(C));
+    return Reg;
+  }
+
+  if (RC == &Hexagon::DoubleRegsRegClass) {
+    // Whole 64-bit value fits in a signed 8-bit immediate.
+    if (isInt<8>(C)) {
+      BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrpi), Reg)
+          .addImm(C);
+      return Reg;
+    }
+
+    // Otherwise try a combine of the two 32-bit halves when at least one
+    // of them passes the 8-bit check.
+    const unsigned Lo = Lo_32(C), Hi = Hi_32(C);
+    const bool LoFits = isInt<8>(Lo);
+    if (LoFits || isInt<8>(Hi)) {
+      const unsigned Opc = LoFits ? Hexagon::A2_combineii
+                                  : Hexagon::A4_combineii;
+      BuildMI(B, At, DL, HII.get(Opc), Reg)
+          .addImm(int32_t(Hi))
+          .addImm(int32_t(Lo));
+      return Reg;
+    }
+
+    // Fall back to a full 64-bit constant.
+    BuildMI(B, At, DL, HII.get(Hexagon::CONST64), Reg)
+        .addImm(C);
+    return Reg;
+  }
+
+  if (RC == &Hexagon::PredRegsRegClass) {
+    const bool AllZero = (C == 0);
+    const bool LowByteOnes = ((C & 0xFF) == 0xFF);
+    if (!AllZero && !LowByteOnes)
+      return 0;
+    const unsigned Opc = AllZero ? Hexagon::PS_false : Hexagon::PS_true;
+    BuildMI(B, At, DL, HII.get(Opc), Reg);
+    return Reg;
+  }
+
+  return 0;
+}
+
+// For every instruction in B that defines exactly one virtual register
+// whose tracked value is a known constant, emit a transfer-immediate of
+// that constant and redirect the register to it. Instructions that are
+// already constant transfers are left alone. Returns true if any register
+// was replaced.
+bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) {
+  if (!BT.reached(&B))
+    return false;
+
+  bool Changed = false;
+  RegisterSet Defs;
+
+  for (auto I = B.begin(), E = B.end(); I != E; ++I) {
+    if (isTfrConst(*I))
+      continue;
+
+    Defs.clear();
+    HBS::getInstrDefs(*I, Defs);
+    if (Defs.count() != 1)
+      continue;
+
+    const unsigned DefReg = Defs.find_first();
+    if (!TargetRegisterInfo::isVirtualRegister(DefReg))
+      continue;
+
+    const BitTracker::RegisterCell &DefCell = BT.lookup(DefReg);
+    uint64_t Bits;
+    if (!HBS::getConst(DefCell, 0, DefCell.width(), Bits))
+      continue;
+
+    const int64_t C = Bits;
+    DebugLoc DL = I->getDebugLoc();
+    // Code replacing a PHI must go after the PHI group.
+    auto At = I->isPHI() ? B.getFirstNonPHI() : I;
+    const unsigned ImmReg = genTfrConst(MRI.getRegClass(DefReg), C, B, At, DL);
+    if (ImmReg == 0)
+      continue;
+
+    HBS::replaceReg(DefReg, ImmReg, MRI);
+    // The new register has the same tracked value as the one it replaces.
+    BT.put(ImmReg, DefCell);
+    Changed = true;
+  }
+  return Changed;
+}
+
+namespace {
+
+// Identify pairs of available registers which hold identical values.
+// In such cases, only one of them needs to be calculated, the other one
+// will be defined as a copy of the first.
+ class CopyGeneration : public Transformation {
+ public:
+ CopyGeneration(BitTracker &bt, const HexagonInstrInfo &hii,
+ const HexagonRegisterInfo &hri, MachineRegisterInfo &mri)
+ : Transformation(true), HII(hii), HRI(hri), MRI(mri), BT(bt) {}
+
+ bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+
+ private:
+ bool findMatch(const BitTracker::RegisterRef &Inp, // Searches AVs for a register (or half of a double register)
+ BitTracker::RegisterRef &Out, const RegisterSet &AVs); // whose tracked bits equal Inp; the result is returned in Out.
+
+ const HexagonInstrInfo &HII;
+ const HexagonRegisterInfo &HRI;
+ MachineRegisterInfo &MRI;
+ BitTracker &BT;
+ RegisterSet Forbidden; // Registers that have been replaced by this pass; findMatch never returns them.
+ };
+
+// Eliminate register copies RD = RS, by replacing the uses of RD
+// with uses of RS.
+ class CopyPropagation : public Transformation {
+ public:
+ CopyPropagation(const HexagonRegisterInfo &hri, MachineRegisterInfo &mri)
+ : Transformation(false), HRI(hri), MRI(mri) {}
+
+ bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+
+ static bool isCopyReg(unsigned Opc, bool NoConv); // Also called by CopyGeneration (with NoConv == false).
+
+ private:
+ bool propagateRegCopy(MachineInstr &MI); // Rewrites uses of MI's result in terms of MI's source(s).
+
+ const HexagonRegisterInfo &HRI;
+ MachineRegisterInfo &MRI;
+ };
+
+} // end anonymous namespace
+
+/// Check if there is a register in AVs that is identical to Inp. If so,
+/// set Out to the found register. The output may be a pair Reg:Sub.
+bool CopyGeneration::findMatch(const BitTracker::RegisterRef &Inp,
+ BitTracker::RegisterRef &Out, const RegisterSet &AVs) {
+ if (!BT.has(Inp.Reg))
+ return false;
+ const BitTracker::RegisterCell &InpRC = BT.lookup(Inp.Reg);
+ auto *FRC = HBS::getFinalVRegClass(Inp, MRI);
+ unsigned B, W; // Starting bit and width of Inp within its register cell.
+ if (!HBS::getSubregMask(Inp, B, W, MRI))
+ return false;
+
+ for (unsigned R = AVs.find_first(); R; R = AVs.find_next(R)) {
+ if (!BT.has(R) || Forbidden[R]) // Skip untracked registers and ones this pass has already replaced.
+ continue;
+ const BitTracker::RegisterCell &RC = BT.lookup(R);
+ unsigned RW = RC.width();
+ if (W == RW) { // Same width: candidate for a whole-register match.
+ if (FRC != MRI.getRegClass(R))
+ continue;
+ if (!HBS::isTransparentCopy(R, Inp, MRI))
+ continue;
+ if (!HBS::isEqual(InpRC, B, RC, 0, W))
+ continue;
+ Out.Reg = R;
+ Out.Sub = 0;
+ return true;
+ }
+ // Check if there is a super-register, whose part (with a subregister)
+ // is equal to the input.
+ // Only do double registers for now.
+ if (W*2 != RW)
+ continue;
+ if (MRI.getRegClass(R) != &Hexagon::DoubleRegsRegClass)
+ continue;
+
+ if (HBS::isEqual(InpRC, B, RC, 0, W)) // Input equals bits [0, W) of R.
+ Out.Sub = Hexagon::isub_lo;
+ else if (HBS::isEqual(InpRC, B, RC, W, W)) // Input equals bits [W, 2W) of R.
+ Out.Sub = Hexagon::isub_hi;
+ else
+ continue;
+ Out.Reg = R;
+ if (HBS::isTransparentCopy(Out, Inp, MRI))
+ return true; // NOTE(review): if this check fails the loop continues with Out already modified, so Out may be clobbered even when false is returned.
+ }
+ return false;
+}
+
+// For each register defined in B by an instruction that is neither a
+// register copy nor a constant transfer, look among the registers that are
+// available at that point (AVs plus everything defined earlier in B) for
+// one with an identical tracked value. If found, define a new register as a
+// COPY of the match and redirect the original register to it. For
+// double/vector-pair classes, additionally try to match the low and high
+// halves separately and build a REG_SEQUENCE from the two matches.
+// Returns true if any instruction was inserted.
+bool CopyGeneration::processBlock(MachineBasicBlock &B,
+      const RegisterSet &AVs) {
+  if (!BT.reached(&B))
+    return false;
+  RegisterSet AVB(AVs);
+  bool Changed = false;
+  RegisterSet Defs;
+
+  // The defs of each instruction become available only after it has been
+  // processed, hence the insertion in the loop increment (which also runs
+  // on "continue").
+  for (auto I = B.begin(), E = B.end(); I != E; ++I, AVB.insert(Defs)) {
+    Defs.clear();
+    HBS::getInstrDefs(*I, Defs);
+
+    unsigned Opc = I->getOpcode();
+    if (CopyPropagation::isCopyReg(Opc, false) ||
+        ConstGeneration::isTfrConst(*I))
+      continue;
+
+    DebugLoc DL = I->getDebugLoc();
+    // Code replacing a PHI must be placed after the PHI group.
+    auto At = I->isPHI() ? B.getFirstNonPHI() : I;
+
+    for (unsigned R = Defs.find_first(); R; R = Defs.find_next(R)) {
+      BitTracker::RegisterRef MR;
+      auto *FRC = HBS::getFinalVRegClass(R, MRI);
+
+      if (findMatch(R, MR, AVB)) {
+        unsigned NewR = MRI.createVirtualRegister(FRC);
+        BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+          .addReg(MR.Reg, 0, MR.Sub);
+        BT.put(BitTracker::RegisterRef(NewR), BT.get(MR));
+        HBS::replaceReg(R, NewR, MRI);
+        Forbidden.insert(R);
+        // An instruction was inserted, so the function has been modified
+        // regardless of whether R had any uses to rewrite.
+        Changed = true;
+        continue;
+      }
+
+      if (FRC == &Hexagon::DoubleRegsRegClass ||
+          FRC == &Hexagon::VecDblRegsRegClass ||
+          FRC == &Hexagon::VecDblRegs128BRegClass) {
+        // Try to generate REG_SEQUENCE.
+        unsigned SubLo = HRI.getHexagonSubRegIndex(FRC, Hexagon::ps_sub_lo);
+        unsigned SubHi = HRI.getHexagonSubRegIndex(FRC, Hexagon::ps_sub_hi);
+        BitTracker::RegisterRef TL = { R, SubLo };
+        BitTracker::RegisterRef TH = { R, SubHi };
+        BitTracker::RegisterRef ML, MH;
+        if (findMatch(TL, ML, AVB) && findMatch(TH, MH, AVB)) {
+          unsigned NewR = MRI.createVirtualRegister(FRC);
+          BuildMI(B, At, DL, HII.get(TargetOpcode::REG_SEQUENCE), NewR)
+            .addReg(ML.Reg, 0, ML.Sub)
+            .addImm(SubLo)
+            .addReg(MH.Reg, 0, MH.Sub)
+            .addImm(SubHi);
+          BT.put(BitTracker::RegisterRef(NewR), BT.get(R));
+          HBS::replaceReg(R, NewR, MRI);
+          Forbidden.insert(R);
+          Changed = true;
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
+// Classifies Opc as a register-copying opcode. COPY, REG_SEQUENCE and the
+// register/immediate combines always qualify. The plain transfers and the
+// register-register combines qualify only when NoConv is true.
+bool CopyPropagation::isCopyReg(unsigned Opc, bool NoConv) {
+  const bool AlwaysCopy = Opc == TargetOpcode::COPY ||
+                          Opc == TargetOpcode::REG_SEQUENCE ||
+                          Opc == Hexagon::A4_combineir ||
+                          Opc == Hexagon::A4_combineri;
+  if (AlwaysCopy)
+    return true;
+
+  const bool CopyWhenNoConv = Opc == Hexagon::A2_tfr ||
+                              Opc == Hexagon::A2_tfrp ||
+                              Opc == Hexagon::A2_combinew ||
+                              Opc == Hexagon::V6_vcombine ||
+                              Opc == Hexagon::V6_vcombine_128B;
+  return CopyWhenNoConv ? NoConv : false;
+}
+
+bool CopyPropagation::propagateRegCopy(MachineInstr &MI) { // Rewrites uses of MI's result (operand 0) to refer to MI's source register(s); returns true if a replacement reported a change.
+ bool Changed = false;
+ unsigned Opc = MI.getOpcode();
+ BitTracker::RegisterRef RD = MI.getOperand(0);
+ assert(MI.getOperand(0).getSubReg() == 0); // The destination is expected to be a full register.
+
+ switch (Opc) {
+ case TargetOpcode::COPY:
+ case Hexagon::A2_tfr:
+ case Hexagon::A2_tfrp: {
+ BitTracker::RegisterRef RS = MI.getOperand(1);
+ if (!HBS::isTransparentCopy(RD, RS, MRI))
+ break;
+ if (RS.Sub != 0) // Source is a subregister: uses of RD become uses of RS.Reg:RS.Sub.
+ Changed = HBS::replaceRegWithSub(RD.Reg, RS.Reg, RS.Sub, MRI);
+ else
+ Changed = HBS::replaceReg(RD.Reg, RS.Reg, MRI);
+ break;
+ }
+ case TargetOpcode::REG_SEQUENCE: {
+ BitTracker::RegisterRef SL, SH; // Sources of the low and high parts.
+ if (HBS::parseRegSequence(MI, SL, SH, MRI)) {
+ const TargetRegisterClass *RC = MRI.getRegClass(RD.Reg);
+ unsigned SubLo = HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo);
+ unsigned SubHi = HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_hi);
+ Changed = HBS::replaceSubWithSub(RD.Reg, SubLo, SL.Reg, SL.Sub, MRI); // Uses of RD:lo -> SL.
+ Changed |= HBS::replaceSubWithSub(RD.Reg, SubHi, SH.Reg, SH.Sub, MRI); // Uses of RD:hi -> SH.
+ }
+ break;
+ }
+ case Hexagon::A2_combinew:
+ case Hexagon::V6_vcombine:
+ case Hexagon::V6_vcombine_128B: {
+ const TargetRegisterClass *RC = MRI.getRegClass(RD.Reg);
+ unsigned SubLo = HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo);
+ unsigned SubHi = HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_hi);
+ BitTracker::RegisterRef RH = MI.getOperand(1), RL = MI.getOperand(2); // Operand 1 supplies the high part, operand 2 the low part.
+ Changed = HBS::replaceSubWithSub(RD.Reg, SubLo, RL.Reg, RL.Sub, MRI);
+ Changed |= HBS::replaceSubWithSub(RD.Reg, SubHi, RH.Reg, RH.Sub, MRI);
+ break;
+ }
+ case Hexagon::A4_combineir:
+ case Hexagon::A4_combineri: {
+ unsigned SrcX = (Opc == Hexagon::A4_combineir) ? 2 : 1; // Index of the register operand (the other source is an immediate).
+ unsigned Sub = (Opc == Hexagon::A4_combineir) ? Hexagon::isub_lo
+ : Hexagon::isub_hi; // Half of RD that the register operand ends up in.
+ BitTracker::RegisterRef RS = MI.getOperand(SrcX);
+ Changed = HBS::replaceSubWithSub(RD.Reg, Sub, RS.Reg, RS.Sub, MRI);
+ break;
+ }
+ }
+ return Changed;
+}
+
+// Propagates every copy-like instruction in B. The instruction pointers
+// are collected first, in reverse block order, and then processed from
+// that snapshot. Returns true if any propagation reported a change.
+bool CopyPropagation::processBlock(MachineBasicBlock &B, const RegisterSet&) {
+  std::vector<MachineInstr*> Worklist;
+  for (auto RI = B.rbegin(), RE = B.rend(); RI != RE; ++RI)
+    Worklist.push_back(&*RI);
+
+  bool Changed = false;
+  for (MachineInstr *MI : Worklist) {
+    if (CopyPropagation::isCopyReg(MI->getOpcode(), true))
+      Changed |= propagateRegCopy(*MI);
+  }
+
+  return Changed;
+}
+
+namespace {
+
+// Recognize patterns that can be simplified and replace them with the
+// simpler forms.
+// This is by no means complete.
+ class BitSimplification : public Transformation {
+ public:
+ BitSimplification(BitTracker &bt, const HexagonInstrInfo &hii,
+ const HexagonRegisterInfo &hri, MachineRegisterInfo &mri,
+ MachineFunction &mf)
+ : Transformation(true), HII(hii), HRI(hri), MRI(mri), MF(mf), BT(bt) {}
+
+ bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+
+ private:
+ struct RegHalf : public BitTracker::RegisterRef { // A register (with optional subregister) plus a halfword selector.
+ bool Low; // Low/High halfword.
+ };
+
+ bool matchHalf(unsigned SelfR, const BitTracker::RegisterCell &RC, // Matches bits [B, B+16) of RC against a halfword
+ unsigned B, RegHalf &RH); // of some register other than SelfR.
+ bool validateReg(BitTracker::RegisterRef R, unsigned Opc, unsigned OpNum); // Is R's class acceptable for operand OpNum of Opc?
+
+ bool matchPackhl(unsigned SelfR, const BitTracker::RegisterCell &RC,
+ BitTracker::RegisterRef &Rs, BitTracker::RegisterRef &Rt);
+ unsigned getCombineOpcode(bool HLow, bool LLow); // Picks A2_combine_{ll,lh,hl,hh}.
+
+ bool genStoreUpperHalf(MachineInstr *MI); // The gen*/simplify* helpers return true if they changed the code.
+ bool genStoreImmediate(MachineInstr *MI);
+ bool genPackhl(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+ bool genExtractHalf(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+ bool genCombineHalf(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+ bool genExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+ bool simplifyTstbit(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+
+ const HexagonInstrInfo &HII;
+ const HexagonRegisterInfo &HRI;
+ MachineRegisterInfo &MRI;
+ MachineFunction &MF;
+ BitTracker &BT;
+ };
+
+} // end anonymous namespace
+
+// Check if the bits [B..B+16) in register cell RC form a valid halfword,
+// i.e. [0..16), [16..32), etc. of some register. If so, return true and
+// set the information about the found register in RH.
+bool BitSimplification::matchHalf(unsigned SelfR,
+ const BitTracker::RegisterCell &RC, unsigned B, RegHalf &RH) {
+ // XXX This could be searching in the set of available registers, in case
+ // the match is not exact.
+
+ // Match 16-bit chunks, where the RC[B..B+15] references exactly one
+ // register and all the bits B..B+15 match between RC and the register.
+ // This is meant to match "v1[0-15]", where v1 = { [0]:0 [1-15]:v1... },
+ // and RC = { [0]:0 [1-15]:v1[1-15]... }.
+ bool Low = false;
+ unsigned I = B;
+ while (I < B+16 && RC[I].num()) // Skip leading bits for which num() holds (presumably known 0/1 constants — TODO confirm).
+ I++;
+ if (I == B+16) // Every bit in the chunk was skipped, so there is no reference to anchor the match on.
+ return false;
+
+ unsigned Reg = RC[I].RefI.Reg;
+ unsigned P = RC[I].RefI.Pos; // The RefI.Pos will be advanced by I-B.
+ if (P < I-B) // The referenced position is too small to back up to the start of the chunk.
+ return false;
+ unsigned Pos = P - (I-B); // Position in Reg that corresponds to bit B of RC.
+
+ if (Reg == 0 || Reg == SelfR) // Don't match "self".
+ return false;
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return false;
+ if (!BT.has(Reg))
+ return false;
+
+ const BitTracker::RegisterCell &SC = BT.lookup(Reg);
+ if (Pos+16 > SC.width()) // The halfword would run past the end of the source register.
+ return false;
+
+ for (unsigned i = 0; i < 16; ++i) {
+ const BitTracker::BitValue &RV = RC[i+B];
+ if (RV.Type == BitTracker::BitValue::Ref) { // A reference bit must point at exactly Reg[Pos+i].
+ if (RV.RefI.Reg != Reg)
+ return false;
+ if (RV.RefI.Pos != i+Pos)
+ return false;
+ continue;
+ }
+ if (RC[i+B] != SC[i+Pos]) // Any other bit must equal the source register's tracked bit.
+ return false;
+ }
+
+ unsigned Sub = 0;
+ switch (Pos) { // Only halfword-aligned positions within 64 bits are accepted.
+ case 0:
+ Sub = Hexagon::isub_lo;
+ Low = true;
+ break;
+ case 16:
+ Sub = Hexagon::isub_lo;
+ Low = false;
+ break;
+ case 32:
+ Sub = Hexagon::isub_hi;
+ Low = true;
+ break;
+ case 48:
+ Sub = Hexagon::isub_hi;
+ Low = false;
+ break;
+ default:
+ return false;
+ }
+
+ RH.Reg = Reg;
+ RH.Sub = Sub;
+ RH.Low = Low;
+ // If the subregister is not valid with the register, set it to 0.
+ if (!HBS::getFinalVRegClass(RH, MRI))
+ RH.Sub = 0;
+
+ return true;
+}
+
+// Returns true if the final register class of R is the same as, or a
+// subclass of, the register class required by operand OpNum of opcode Opc.
+bool BitSimplification::validateReg(BitTracker::RegisterRef R, unsigned Opc,
+      unsigned OpNum) {
+  const auto &Desc = HII.get(Opc);
+  auto *RequiredRC = HII.getRegClass(Desc, OpNum, &HRI, MF);
+  auto *ActualRC = HBS::getFinalVRegClass(R, MRI);
+  return RequiredRC->hasSubClassEq(ActualRC);
+}
+
+// Check whether the 64-bit cell RC has the shape produced by S2_packhl:
+// its four halfwords, from bit 0 upwards, are Rt.L, Rs.L, Rt.H, Rs.H.
+// On success, return true and set the inputs Rs and Rt.
+bool BitSimplification::matchPackhl(unsigned SelfR,
+      const BitTracker::RegisterCell &RC, BitTracker::RegisterRef &Rs,
+      BitTracker::RegisterRef &Rt) {
+  RegHalf L1, H1, L2, H2;
+
+  // Halfwords are matched in ascending bit order; stop at the first miss.
+  const bool AllHalvesMatch = matchHalf(SelfR, RC, 0, L2) &&
+                              matchHalf(SelfR, RC, 16, L1) &&
+                              matchHalf(SelfR, RC, 32, H2) &&
+                              matchHalf(SelfR, RC, 48, H1);
+  if (!AllHalvesMatch)
+    return false;
+
+  // A (high, low) pair forms a whole word when both halves come from the
+  // same register/subregister, Hi is its high half and Lo its low half.
+  auto FormsWord = [] (const RegHalf &Hi, const RegHalf &Lo) -> bool {
+    return Hi.Reg == Lo.Reg && Hi.Sub == Lo.Sub && !Hi.Low && Lo.Low;
+  };
+
+  // Rs = H1.L1, Rt = H2.L2
+  if (!FormsWord(H1, L1))
+    return false;
+  if (!FormsWord(H2, L2))
+    return false;
+
+  Rs = H1;
+  Rt = H2;
+  return true;
+}
+
+// Select the A2_combine_* opcode from two flags. HLow chooses the first
+// letter of the suffix ('l' when true, 'h' when false) and LLow chooses
+// the second letter in the same way.
+unsigned BitSimplification::getCombineOpcode(bool HLow, bool LLow) {
+  if (HLow) {
+    if (LLow)
+      return Hexagon::A2_combine_ll;
+    return Hexagon::A2_combine_lh;
+  }
+  if (LLow)
+    return Hexagon::A2_combine_hl;
+  return Hexagon::A2_combine_hh;
+}
+
+// If MI is a halfword store (S2_storerh_io) whose stored value is really
+// the upper halfword of some other register (for instance produced by a
+// shift or an extract), turn it into S2_storerf_io storing directly from
+// that register. The code that computed the extracted value may then
+// become dead. Returns true if MI was rewritten.
+bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) {
+  if (MI->getOpcode() != Hexagon::S2_storerh_io)
+    return false;
+
+  // Operand 2 is the value being stored.
+  MachineOperand &StoredVal = MI->getOperand(2);
+  BitTracker::RegisterRef Src = StoredVal;
+  if (!BT.has(Src.Reg))
+    return false;
+
+  const BitTracker::RegisterCell &SrcCell = BT.lookup(Src.Reg);
+  RegHalf Half;
+  const bool Matched = matchHalf(0, SrcCell, 0, Half);
+  // Only an upper halfword is of interest here.
+  if (!Matched || Half.Low)
+    return false;
+
+  MI->setDesc(HII.get(Hexagon::S2_storerf_io));
+  StoredVal.setReg(Half.Reg);
+  StoredVal.setSubReg(Half.Sub);
+  return true;
+}
+
+// If MI is a byte/halfword/word store of a register whose value is known
+// at compile time, and both the offset and the value are small enough to
+// avoid constant-extenders, rewrite MI in place as the corresponding
+// store-immediate. Returns true if MI was rewritten.
+bool BitSimplification::genStoreImmediate(MachineInstr *MI) {
+  const unsigned Opc = MI->getOpcode();
+  unsigned Align;   // log2 of the access size in bytes.
+  unsigned ImmOpc;  // Store-immediate opcode matching Opc.
+  switch (Opc) {
+    case Hexagon::S2_storerb_io:
+      Align = 0;
+      ImmOpc = Hexagon::S4_storeirb_io;
+      break;
+    case Hexagon::S2_storerh_io:
+      Align = 1;
+      ImmOpc = Hexagon::S4_storeirh_io;
+      break;
+    case Hexagon::S2_storeri_io:
+      Align = 2;
+      ImmOpc = Hexagon::S4_storeiri_io;
+      break;
+    default:
+      return false;
+  }
+
+  // Avoid stores to frame-indices (due to an unknown offset).
+  if (!MI->getOperand(0).isReg())
+    return false;
+  MachineOperand &OffOp = MI->getOperand(1);
+  if (!OffOp.isImm())
+    return false;
+
+  int64_t Off = OffOp.getImm();
+  // Offset is u6:a. Sadly, there is no isShiftedUInt(n,x).
+  const bool OffInRange = isUIntN(6+Align, Off);
+  if (!OffInRange || (Off & ((1<<Align)-1)))
+    return false;
+
+  // Source register:
+  BitTracker::RegisterRef RS = MI->getOperand(2);
+  if (!BT.has(RS.Reg))
+    return false;
+  const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg);
+  uint64_t U;
+  if (!HBS::getConst(RC, 0, RC.width(), U))
+    return false;
+
+  // Only consider 8-bit values to avoid constant-extenders. The constant
+  // is first truncated and sign-extended according to the access size.
+  int V;
+  if (Align == 0)
+    V = int8_t(U);
+  else if (Align == 1)
+    V = int16_t(U);
+  else
+    V = int32_t(U);
+  if (!isInt<8>(V))
+    return false;
+
+  // Swap the register operand for the immediate.
+  MI->RemoveOperand(2);
+  MI->setDesc(HII.get(ImmOpc));
+  MI->addOperand(MachineOperand::CreateImm(V));
+  return true;
+}
+
+// If MI is equivalent to S2_packhl, generate the S2_packhl. MI could be the
+// last instruction in a sequence that results in something equivalent to
+// the pack-halfwords. The intent is to cause the entire sequence to become
+// dead.
+bool BitSimplification::genPackhl(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == Hexagon::S2_packhl) // Already the desired instruction.
+ return false;
+ BitTracker::RegisterRef Rs, Rt;
+ if (!matchPackhl(RD.Reg, RC, Rs, Rt))
+ return false;
+ if (!validateReg(Rs, Hexagon::S2_packhl, 1) || // Both inputs must be legal as S2_packhl operands.
+ !validateReg(Rt, Hexagon::S2_packhl, 2))
+ return false;
+
+ MachineBasicBlock &B = *MI->getParent();
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+ DebugLoc DL = MI->getDebugLoc();
+ auto At = MI->isPHI() ? B.getFirstNonPHI() // Code replacing a PHI goes after the PHI group.
+ : MachineBasicBlock::iterator(MI);
+ BuildMI(B, At, DL, HII.get(Hexagon::S2_packhl), NewR)
+ .addReg(Rs.Reg, 0, Rs.Sub)
+ .addReg(Rt.Reg, 0, Rt.Sub);
+ HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); // Redirect users of RD to the new register; MI is left in place.
+ BT.put(BitTracker::RegisterRef(NewR), RC); // The new register has the same tracked value as RD.
+ return true;
+}
+
+// If MI produces a halfword of some other register in the low half of its
+// output, with the upper 16 bits known to be zero, replace it with a
+// zero-extend of that register (when the halfword is the low one) or a
+// logical shift right by 16 (when it is the high one). Users of RD are
+// redirected to the new register. Returns true if a replacement was made.
+bool BitSimplification::genExtractHalf(MachineInstr *MI,
+      BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+  RegHalf L;
+  // Check for halfword in low 16 bits, zeros elsewhere.
+  if (!matchHalf(RD.Reg, RC, 0, L) || !HBS::isZero(RC, 16, 16))
+    return false;
+
+  unsigned Opc = MI->getOpcode();
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Prefer zxth, since zxth can go in any slot, while extractu only in
+  // slots 2 and 3.
+  unsigned NewR = 0;
+  // Code replacing a PHI must be placed after the PHI group.
+  auto At = MI->isPHI() ? B.getFirstNonPHI()
+                        : MachineBasicBlock::iterator(MI);
+  if (L.Low && Opc != Hexagon::A2_zxth) {
+    if (validateReg(L, Hexagon::A2_zxth, 1)) {
+      NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+      BuildMI(B, At, DL, HII.get(Hexagon::A2_zxth), NewR)
+          .addReg(L.Reg, 0, L.Sub);
+    }
+  } else if (!L.Low && Opc != Hexagon::S2_lsr_i_r) {
+    if (validateReg(L, Hexagon::S2_lsr_i_r, 1)) {
+      NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+      // Insert at At rather than at MI, so that a shift replacing a PHI is
+      // never placed in the middle of the PHI group.
+      BuildMI(B, At, DL, HII.get(Hexagon::S2_lsr_i_r), NewR)
+          .addReg(L.Reg, 0, L.Sub)
+          .addImm(16);
+    }
+  }
+  if (NewR == 0)
+    return false;
+  HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+  // The new register has the same tracked value as RD.
+  BT.put(BitTracker::RegisterRef(NewR), RC);
+  return true;
+}
+
+// If MI is equivalent to a combine(.L/.H, .L/.H), replace it with the
+// combine.
+bool BitSimplification::genCombineHalf(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+ RegHalf L, H; // Sources of bits [0,16) and [16,32) of the result.
+ // Check for combine h/l
+ if (!matchHalf(RD.Reg, RC, 0, L) || !matchHalf(RD.Reg, RC, 16, H))
+ return false;
+ // Do nothing if this is just a reg copy.
+ if (L.Reg == H.Reg && L.Sub == H.Sub && !H.Low && L.Low)
+ return false;
+
+ unsigned Opc = MI->getOpcode();
+ unsigned COpc = getCombineOpcode(H.Low, L.Low);
+ if (COpc == Opc) // MI is already the combine we would generate.
+ return false;
+ if (!validateReg(H, COpc, 1) || !validateReg(L, COpc, 2))
+ return false;
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ auto At = MI->isPHI() ? B.getFirstNonPHI() // Code replacing a PHI goes after the PHI group.
+ : MachineBasicBlock::iterator(MI);
+ BuildMI(B, At, DL, HII.get(COpc), NewR)
+ .addReg(H.Reg, 0, H.Sub)
+ .addReg(L.Reg, 0, L.Sub);
+ HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); // Redirect users of RD to the new register; MI is left in place.
+ BT.put(BitTracker::RegisterRef(NewR), RC); // The new register has the same tracked value as RD.
+ return true;
+}
+
+// If MI resets high bits of a register and keeps the lower ones, replace it
+// with zero-extend byte/half, and-immediate, or extractu, as appropriate.
+bool BitSimplification::genExtractLow(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) { // MI is already one of the target forms.
+ case Hexagon::A2_zxtb:
+ case Hexagon::A2_zxth:
+ case Hexagon::S2_extractu:
+ return false;
+ }
+ if (Opc == Hexagon::A2_andir && MI->getOperand(2).isImm()) {
+ int32_t Imm = MI->getOperand(2).getImm();
+ if (isInt<10>(Imm)) // An and-immediate whose mask fits in 10 signed bits is left alone.
+ return false;
+ }
+
+ if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm())
+ return false;
+ unsigned W = RC.width();
+ while (W > 0 && RC[W-1].is(0)) // Strip known-zero bits from the top; W becomes the number of low bits kept.
+ W--;
+ if (W == 0 || W == RC.width()) // All bits are zero, or no high bit is known to be zero.
+ return false;
+ unsigned NewOpc = (W == 8) ? Hexagon::A2_zxtb
+ : (W == 16) ? Hexagon::A2_zxth
+ : (W < 10) ? Hexagon::A2_andir
+ : Hexagon::S2_extractu;
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ for (auto &Op : MI->uses()) { // Look for a source operand whose low W bits equal the result's.
+ if (!Op.isReg())
+ continue;
+ BitTracker::RegisterRef RS = Op;
+ if (!BT.has(RS.Reg))
+ continue;
+ const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+ unsigned BN, BW; // Starting bit and width of the source (sub)register within its cell.
+ if (!HBS::getSubregMask(RS, BN, BW, MRI))
+ continue;
+ if (BW < W || !HBS::isEqual(RC, 0, SC, BN, W))
+ continue;
+ if (!validateReg(RS, NewOpc, 1))
+ continue;
+
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ auto At = MI->isPHI() ? B.getFirstNonPHI() // Code replacing a PHI goes after the PHI group.
+ : MachineBasicBlock::iterator(MI);
+ auto MIB = BuildMI(B, At, DL, HII.get(NewOpc), NewR)
+ .addReg(RS.Reg, 0, RS.Sub);
+ if (NewOpc == Hexagon::A2_andir)
+ MIB.addImm((1 << W) - 1); // Mask with the low W bits set (W < 10 on this path).
+ else if (NewOpc == Hexagon::S2_extractu)
+ MIB.addImm(W).addImm(0); // Width W, starting at bit 0.
+ HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); // Redirect users of RD to the new register; MI is left in place.
+ BT.put(BitTracker::RegisterRef(NewR), RC);
+ return true;
+ }
+ return false;
+}
+
+// Check for tstbit simplification opportunity, where the bit being checked
+// can be tracked back to another register. For example:
+// vreg2 = S2_lsr_i_r vreg1, 5
+// vreg3 = S2_tstbit_i vreg2, 0
+// =>
+// vreg3 = S2_tstbit_i vreg1, 5
+bool BitSimplification::simplifyTstbit(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc != Hexagon::S2_tstbit_i)
+ return false;
+
+ unsigned BN = MI->getOperand(2).getImm(); // Index of the bit being tested.
+ BitTracker::RegisterRef RS = MI->getOperand(1); // Register being tested.
+ unsigned F, W; // Starting bit and width of RS within its register cell.
+ DebugLoc DL = MI->getDebugLoc();
+ if (!BT.has(RS.Reg) || !HBS::getSubregMask(RS, F, W, MRI))
+ return false;
+ MachineBasicBlock &B = *MI->getParent();
+ auto At = MI->isPHI() ? B.getFirstNonPHI()
+ : MachineBasicBlock::iterator(MI);
+
+ const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+ const BitTracker::BitValue &V = SC[F+BN]; // Tracked value of the tested bit. NOTE(review): assumes F+BN is within SC — confirm.
+ if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != RS.Reg) { // The bit is a copy of a bit in a different register.
+ const TargetRegisterClass *TC = MRI.getRegClass(V.RefI.Reg);
+ // Need to map V.RefI.Reg to a 32-bit register, i.e. if it is
+ // a double register, need to use a subregister and adjust bit
+ // number.
+ unsigned P = std::numeric_limits<unsigned>::max(); // Sentinel: no usable bit position found.
+ BitTracker::RegisterRef RR(V.RefI.Reg, 0);
+ if (TC == &Hexagon::DoubleRegsRegClass) {
+ P = V.RefI.Pos;
+ RR.Sub = Hexagon::isub_lo;
+ if (P >= 32) {
+ P -= 32;
+ RR.Sub = Hexagon::isub_hi;
+ }
+ } else if (TC == &Hexagon::IntRegsRegClass) {
+ P = V.RefI.Pos;
+ }
+ if (P != std::numeric_limits<unsigned>::max()) {
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ BuildMI(B, At, DL, HII.get(Hexagon::S2_tstbit_i), NewR)
+ .addReg(RR.Reg, 0, RR.Sub)
+ .addImm(P);
+ HBS::replaceReg(RD.Reg, NewR, MRI);
+ BT.put(NewR, RC);
+ return true;
+ }
+ } else if (V.is(0) || V.is(1)) { // The tested bit is a known constant: the result is a constant predicate.
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ unsigned NewOpc = V.is(0) ? Hexagon::PS_false : Hexagon::PS_true;
+ BuildMI(B, At, DL, HII.get(NewOpc), NewR);
+ HBS::replaceReg(RD.Reg, NewR, MRI); // NOTE(review): unlike the branch above, NewR is not recorded in the bit tracker here.
+ return true;
+ }
+
+ return false;
+}
+
+// Apply the bit-level simplifications to every instruction in B. Stores
+// are tried as store-upper-half and then store-immediate. Other
+// instructions with exactly one def are dispatched on the final register
+// class of that def: double registers to genPackhl, integer registers to
+// the extract/combine generators (first success wins), and predicate
+// registers to simplifyTstbit. Returns true if anything changed.
+bool BitSimplification::processBlock(MachineBasicBlock &B,
+      const RegisterSet &AVs) {
+  if (!BT.reached(&B))
+    return false;
+
+  bool Changed = false;
+  RegisterSet AVB = AVs;
+  RegisterSet Defs;
+
+  // Defs of the current instruction are added to AVB in the loop increment,
+  // so this happens on every "continue" as well.
+  for (auto I = B.begin(), E = B.end(); I != E; ++I, AVB.insert(Defs)) {
+    MachineInstr *MI = &*I;
+    Defs.clear();
+    HBS::getInstrDefs(*MI, Defs);
+
+    const unsigned Opc = MI->getOpcode();
+    const bool IsPlainCopy = Opc == TargetOpcode::COPY ||
+                             Opc == TargetOpcode::REG_SEQUENCE;
+    if (IsPlainCopy)
+      continue;
+
+    if (MI->mayStore()) {
+      Changed |= (genStoreUpperHalf(MI) || genStoreImmediate(MI));
+      continue;
+    }
+
+    if (Defs.count() != 1)
+      continue;
+    const MachineOperand &DefOp = MI->getOperand(0);
+    if (!(DefOp.isReg() && DefOp.isDef()))
+      continue;
+    BitTracker::RegisterRef RD = DefOp;
+    if (!BT.has(RD.Reg))
+      continue;
+    const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
+    const BitTracker::RegisterCell &RC = BT.lookup(RD.Reg);
+
+    const unsigned ClassID = FRC->getID();
+    if (ClassID == Hexagon::DoubleRegsRegClassID) {
+      Changed |= genPackhl(MI, RD, RC);
+    } else if (ClassID == Hexagon::IntRegsRegClassID) {
+      Changed |= (genExtractHalf(MI, RD, RC) ||
+                  genCombineHalf(MI, RD, RC) ||
+                  genExtractLow(MI, RD, RC));
+    } else if (ClassID == Hexagon::PredRegsRegClassID) {
+      Changed |= simplifyTstbit(MI, RD, RC);
+    }
+  }
+  return Changed;
+}
+
+bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) { // Pass driver: runs the individual transformations in a fixed order, with dead code elimination in between; returns true if MF was changed.
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &HRI = *HST.getRegisterInfo();
+ auto &HII = *HST.getInstrInfo();
+
+ MDT = &getAnalysis<MachineDominatorTree>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool Changed;
+
+ Changed = DeadCodeElimination(MF, *MDT).run(); // Start from a function without dead code.
+
+ const HexagonEvaluator HE(HRI, MRI, HII, MF);
+ BitTracker BT(HE, MF);
+ DEBUG(BT.trace(true));
+ BT.run(); // Compute the tracked bit values used by the transformations below.
+
+ MachineBasicBlock &Entry = MF.front();
+
+ RegisterSet AIG; // Available registers for IG.
+ ConstGeneration ImmG(BT, HII, MRI);
+ Changed |= visitBlock(Entry, ImmG, AIG);
+
+ RegisterSet ARE; // Available registers for RIE.
+ RedundantInstrElimination RIE(BT, HII, MRI);
+ bool Ried = visitBlock(Entry, RIE, ARE);
+ if (Ried) {
+ Changed = true;
+ BT.run(); // Re-run the tracker after redundant-instruction elimination made changes.
+ }
+
+ RegisterSet ACG; // Available registers for CG.
+ CopyGeneration CopyG(BT, HII, HRI, MRI);
+ Changed |= visitBlock(Entry, CopyG, ACG);
+
+ RegisterSet ACP; // Available registers for CP.
+ CopyPropagation CopyP(HRI, MRI);
+ Changed |= visitBlock(Entry, CopyP, ACP);
+
+ Changed = DeadCodeElimination(MF, *MDT).run() || Changed; // DCE is on the left of || so it always runs.
+
+ BT.run(); // Refresh the tracked values before bit simplification.
+ RegisterSet ABS; // Available registers for BS.
+ BitSimplification BitS(BT, HII, HRI, MRI, MF);
+ Changed |= visitBlock(Entry, BitS, ABS);
+
+ Changed = DeadCodeElimination(MF, *MDT).run() || Changed;
+
+ if (Changed) {
+ for (auto &B : MF)
+ for (auto &I : B)
+ I.clearKillInfo(); // Drop kill flags on every instruction after the rewrites.
+ DeadCodeElimination(MF, *MDT).run(); // Final cleanup once kill info has been cleared.
+ }
+ return Changed;
+}
+
+// Recognize loops where the code at the end of the loop matches the code
+// before the entry of the loop, and the matching code is such that it can
+// be simplified. This pass relies on the bit simplification above and only
+// prepares code in a way that can be handled by the bit simplification.
+//
+// This is the motivating testcase (and explanation):
+//
+// {
+// loop0(.LBB0_2, r1) // %for.body.preheader
+// r5:4 = memd(r0++#8)
+// }
+// {
+// r3 = lsr(r4, #16)
+// r7:6 = combine(r5, r5)
+// }
+// {
+// r3 = insert(r5, #16, #16)
+// r7:6 = vlsrw(r7:6, #16)
+// }
+// .LBB0_2:
+// {
+// memh(r2+#4) = r5
+// memh(r2+#6) = r6 # R6 is really R5.H
+// }
+// {
+// r2 = add(r2, #8)
+// memh(r2+#0) = r4
+// memh(r2+#2) = r3 # R3 is really R4.H
+// }
+// {
+// r5:4 = memd(r0++#8)
+// }
+// { # "Shuffling" code that sets up R3 and R6
+// r3 = lsr(r4, #16) # so that their halves can be stored in the
+// r7:6 = combine(r5, r5) # next iteration. This could be folded into
+// } # the stores if the code was at the beginning
+// { # of the loop iteration. Since the same code
+// r3 = insert(r5, #16, #16) # precedes the loop, it can actually be moved
+// r7:6 = vlsrw(r7:6, #16) # there.
+// }:endloop0
+//
+//
+// The outcome:
+//
+// {
+// loop0(.LBB0_2, r1)
+// r5:4 = memd(r0++#8)
+// }
+// .LBB0_2:
+// {
+// memh(r2+#4) = r5
+// memh(r2+#6) = r5.h
+// }
+// {
+// r2 = add(r2, #8)
+// memh(r2+#0) = r4
+// memh(r2+#2) = r4.h
+// }
+// {
+// r5:4 = memd(r0++#8)
+// }:endloop0
+
+namespace llvm {
+
+ FunctionPass *createHexagonLoopRescheduling(); // Factory for the loop rescheduling pass declared below.
+ void initializeHexagonLoopReschedulingPass(PassRegistry&); // Called from the pass constructor.
+
+} // end namespace llvm
+
+namespace {
+
+ class HexagonLoopRescheduling : public MachineFunctionPass { // Machine function pass implementing the loop rescheduling described in the comment above.
+ public:
+ static char ID;
+
+ HexagonLoopRescheduling() : MachineFunctionPass(ID),
+ HII(nullptr), HRI(nullptr), MRI(nullptr), BTP(nullptr) {
+ initializeHexagonLoopReschedulingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ const HexagonInstrInfo *HII; // The pointer members start as nullptr (see the constructor).
+ const HexagonRegisterInfo *HRI;
+ MachineRegisterInfo *MRI;
+ BitTracker *BTP;
+
+ struct LoopCand { // A candidate loop, described by three blocks.
+ LoopCand(MachineBasicBlock *lb, MachineBasicBlock *pb,
+ MachineBasicBlock *eb) : LB(lb), PB(pb), EB(eb) {}
+ MachineBasicBlock *LB, *PB, *EB; // LB/PB: loop and preheader block (as in PhiInfo); EB: presumably the exit block — TODO confirm.
+ };
+ typedef std::vector<MachineInstr*> InstrList;
+ struct InstrGroup { // A list of instructions together with an input and an output register.
+ BitTracker::RegisterRef Inp, Out;
+ InstrList Ins;
+ };
+ struct PhiInfo { // Decomposition of a PHI into its loop and preheader inputs.
+ PhiInfo(MachineInstr &P, MachineBasicBlock &B);
+ unsigned DefR; // Register defined by the PHI.
+ BitTracker::RegisterRef LR, PR; // Loop Register, Preheader Register
+ MachineBasicBlock *LB, *PB; // Loop Block, Preheader Block
+ };
+
+ static unsigned getDefReg(const MachineInstr *MI);
+ bool isConst(unsigned Reg) const;
+ bool isBitShuffle(const MachineInstr *MI, unsigned DefR) const;
+ bool isStoreInput(const MachineInstr *MI, unsigned DefR) const;
+ bool isShuffleOf(unsigned OutR, unsigned InpR) const;
+ bool isSameShuffle(unsigned OutR1, unsigned InpR1, unsigned OutR2,
+ unsigned &InpR2) const;
+ void moveGroup(InstrGroup &G, MachineBasicBlock &LB, MachineBasicBlock &PB,
+ MachineBasicBlock::iterator At, unsigned OldPhiR, unsigned NewPredR);
+ bool processLoop(LoopCand &C);
+ };
+
+} // end anonymous namespace
+
+char HexagonLoopRescheduling::ID = 0; // Definition of the static pass ID handed to MachineFunctionPass by the constructor.
+
+INITIALIZE_PASS(HexagonLoopRescheduling, "hexagon-loop-resched",
+ "Hexagon Loop Rescheduling", false, false) // NOTE(review): the two flags are presumably "CFG only" and "is analysis" - confirm against the macro definition.
+
+HexagonLoopRescheduling::PhiInfo::PhiInfo(MachineInstr &P,
+ MachineBasicBlock &B) { // Splits phi P (located in loop block B) into its loop-carried and preheader inputs.
+ DefR = HexagonLoopRescheduling::getDefReg(&P);
+ LB = &B;
+ PB = nullptr; // Stays null if every incoming block is B itself.
+ for (unsigned i = 1, n = P.getNumOperands(); i < n; i += 2) { // Operands are read as (value, block) pairs starting at index 1.
+ const MachineOperand &OpB = P.getOperand(i+1);
+ if (OpB.getMBB() == &B) { // Incoming value along the back edge.
+ LR = P.getOperand(i);
+ continue;
+ }
+ PB = OpB.getMBB(); // Any other incoming block is taken as the preheader; the last such pair wins.
+ PR = P.getOperand(i);
+ }
+}
+
+unsigned HexagonLoopRescheduling::getDefReg(const MachineInstr *MI) { // Returns the one register MI defines, or 0 if it defines none or several.
+ RegisterSet Defs;
+ HBS::getInstrDefs(*MI, Defs);
+ if (Defs.count() != 1) // Zero or multiple defs: no unique answer.
+ return 0;
+ return Defs.find_first();
+}
+
+bool HexagonLoopRescheduling::isConst(unsigned Reg) const { // True if the bit tracker knows every bit of Reg to be 0 or 1.
+ if (!BTP->has(Reg)) // No tracked cell for Reg: treat as non-constant.
+ return false;
+ const BitTracker::RegisterCell &RC = BTP->lookup(Reg);
+ for (unsigned i = 0, w = RC.width(); i < w; ++i) {
+ const BitTracker::BitValue &V = RC[i];
+ if (!V.is(0) && !V.is(1)) // A bit that is neither 0 nor 1 makes the register non-constant.
+ return false;
+ }
+ return true;
+}
+
+bool HexagonLoopRescheduling::isBitShuffle(const MachineInstr *MI,
+ unsigned DefR) const { // Decides purely by opcode whether MI counts as a bit shuffle; DefR is not consulted.
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case TargetOpcode::COPY:
+ case Hexagon::S2_lsr_i_r: // Shifts by an immediate (32- and 64-bit forms).
+ case Hexagon::S2_asr_i_r:
+ case Hexagon::S2_asl_i_r:
+ case Hexagon::S2_lsr_i_p:
+ case Hexagon::S2_asr_i_p:
+ case Hexagon::S2_asl_i_p:
+ case Hexagon::S2_insert:
+ case Hexagon::A2_or: // Bitwise or/and.
+ case Hexagon::A2_orp:
+ case Hexagon::A2_and:
+ case Hexagon::A2_andp:
+ case Hexagon::A2_combinew: // The combine family.
+ case Hexagon::A4_combineri:
+ case Hexagon::A4_combineir:
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineii:
+ case Hexagon::A2_combine_ll:
+ case Hexagon::A2_combine_lh:
+ case Hexagon::A2_combine_hl:
+ case Hexagon::A2_combine_hh:
+ return true;
+ }
+ return false; // Every other opcode is not treated as a shuffle.
+}
+
+bool HexagonLoopRescheduling::isStoreInput(const MachineInstr *MI,
+ unsigned InpR) const { // True if the first operand of MI that names InpR is also MI's last operand.
+ for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+ const MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg())
+ continue;
+ if (Op.getReg() == InpR)
+ return i == n-1; // NOTE(review): nothing here checks that MI is a store; presumably the stored value is expected in the last operand - confirm.
+ }
+ return false; // InpR is not an operand of MI.
+}
+
+bool HexagonLoopRescheduling::isShuffleOf(unsigned OutR, unsigned InpR) const { // True if every Ref bit of OutR's cell refers to InpR.
+ if (!BTP->has(OutR) || !BTP->has(InpR)) // Both registers must be tracked.
+ return false;
+ const BitTracker::RegisterCell &OutC = BTP->lookup(OutR);
+ for (unsigned i = 0, w = OutC.width(); i < w; ++i) {
+ const BitTracker::BitValue &V = OutC[i];
+ if (V.Type != BitTracker::BitValue::Ref) // Non-Ref bits place no constraint on the result.
+ continue;
+ if (V.RefI.Reg != InpR) // A bit taken from some other register.
+ return false;
+ }
+ return true; // Also reached when OutR has no Ref bits at all.
+}
+
+bool HexagonLoopRescheduling::isSameShuffle(unsigned OutR1, unsigned InpR1,
+ unsigned OutR2, unsigned &InpR2) const { // Checks that OutR2 is built from one register the same way OutR1 is built from InpR1; that register is returned in InpR2.
+ if (!BTP->has(OutR1) || !BTP->has(InpR1) || !BTP->has(OutR2))
+ return false;
+ const BitTracker::RegisterCell &OutC1 = BTP->lookup(OutR1);
+ const BitTracker::RegisterCell &OutC2 = BTP->lookup(OutR2);
+ unsigned W = OutC1.width();
+ unsigned MatchR = 0; // Source register seen so far for OutR2; 0 means none yet.
+ if (W != OutC2.width()) // Cells of different width cannot match.
+ return false;
+ for (unsigned i = 0; i < W; ++i) {
+ const BitTracker::BitValue &V1 = OutC1[i], &V2 = OutC2[i];
+ if (V1.Type != V2.Type || V1.Type == BitTracker::BitValue::One) // Bit kinds must agree, and a constant-one bit rejects the match.
+ return false;
+ if (V1.Type != BitTracker::BitValue::Ref) // Remaining non-Ref bits are accepted as equal.
+ continue;
+ if (V1.RefI.Pos != V2.RefI.Pos) // Both must take the same bit position of their source.
+ return false;
+ if (V1.RefI.Reg != InpR1) // OutR1 must draw this bit from InpR1.
+ return false;
+ if (V2.RefI.Reg == 0 || V2.RefI.Reg == OutR2) // OutR2 must draw it from a real register other than itself.
+ return false;
+ if (!MatchR)
+ MatchR = V2.RefI.Reg;
+ else if (V2.RefI.Reg != MatchR) // All Ref bits of OutR2 must come from a single register.
+ return false;
+ }
+ InpR2 = MatchR; // Still 0 if no Ref bits were encountered.
+ return true;
+}
+
+void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB,
+ MachineBasicBlock &PB, MachineBasicBlock::iterator At, unsigned OldPhiR,
+ unsigned NewPredR) { // Emits a copy of group G before At in LB, fed by a new phi, then replaces OldPhiR with the copy's output.
+ DenseMap<unsigned,unsigned> RegMap; // Original register -> replacement register in the emitted copy.
+
+ const TargetRegisterClass *PhiRC = MRI->getRegClass(NewPredR);
+ unsigned PhiR = MRI->createVirtualRegister(PhiRC);
+ BuildMI(LB, At, At->getDebugLoc(), HII->get(TargetOpcode::PHI), PhiR) // New phi: NewPredR arriving from PB, G.Inp.Reg arriving from LB.
+ .addReg(NewPredR)
+ .addMBB(&PB)
+ .addReg(G.Inp.Reg)
+ .addMBB(&LB);
+ RegMap.insert(std::make_pair(G.Inp.Reg, PhiR)); // Inside the copy, the group's input is read from the new phi.
+
+ for (unsigned i = G.Ins.size(); i > 0; --i) { // Walk G.Ins back to front: processLoop collected it while scanning the block in reverse.
+ const MachineInstr *SI = G.Ins[i-1];
+ unsigned DR = getDefReg(SI);
+ const TargetRegisterClass *RC = MRI->getRegClass(DR);
+ unsigned NewDR = MRI->createVirtualRegister(RC);
+ DebugLoc DL = SI->getDebugLoc();
+
+ auto MIB = BuildMI(LB, At, DL, HII->get(SI->getOpcode()), NewDR); // Same opcode, fresh destination register.
+ for (unsigned j = 0, m = SI->getNumOperands(); j < m; ++j) {
+ const MachineOperand &Op = SI->getOperand(j);
+ if (!Op.isReg()) { // Non-register operands are carried over unchanged.
+ MIB.addOperand(Op);
+ continue;
+ }
+ if (!Op.isUse()) // Defs are skipped; NewDR was already supplied to BuildMI.
+ continue;
+ unsigned UseR = RegMap[Op.getReg()]; // A register with no mapping yields 0 here (DenseMap default-inserts).
+ MIB.addReg(UseR, 0, Op.getSubReg());
+ }
+ RegMap.insert(std::make_pair(DR, NewDR));
+ }
+
+ HBS::replaceReg(OldPhiR, RegMap[G.Out.Reg], *MRI); // Presumably rewrites uses of OldPhiR to the copied group's output - see HBS::replaceReg.
+}
+
+bool HexagonLoopRescheduling::processLoop(LoopCand &C) { // Tries to move shuffle groups of loop C.LB to the loop start; returns true if any group was moved.
+ DEBUG(dbgs() << "Processing loop in BB#" << C.LB->getNumber() << "\n");
+ std::vector<PhiInfo> Phis;
+ for (auto &I : *C.LB) {
+ if (!I.isPHI()) // Only the phis at the head of the block are examined.
+ break;
+ unsigned PR = getDefReg(&I);
+ if (isConst(PR)) // Phis whose value is fully known are of no interest.
+ continue;
+ bool BadUse = false, GoodUse = false;
+ for (auto UI = MRI->use_begin(PR), UE = MRI->use_end(); UI != UE; ++UI) {
+ MachineInstr *UseI = UI->getParent();
+ if (UseI->getParent() != C.LB) { // A use outside the loop block disqualifies the phi.
+ BadUse = true;
+ break;
+ }
+ if (isBitShuffle(UseI, PR) || isStoreInput(UseI, PR))
+ GoodUse = true;
+ }
+ if (BadUse || !GoodUse) // Keep phis used only in the loop, with at least one shuffle or store-input use.
+ continue;
+
+ Phis.push_back(PhiInfo(I, *C.LB));
+ }
+
+ DEBUG({
+ dbgs() << "Phis: {";
+ for (auto &I : Phis) {
+ dbgs() << ' ' << PrintReg(I.DefR, HRI) << "=phi("
+ << PrintReg(I.PR.Reg, HRI, I.PR.Sub) << ":b" << I.PB->getNumber()
+ << ',' << PrintReg(I.LR.Reg, HRI, I.LR.Sub) << ":b"
+ << I.LB->getNumber() << ')';
+ }
+ dbgs() << " }\n";
+ });
+
+ if (Phis.empty())
+ return false;
+
+ bool Changed = false;
+ InstrList ShufIns; // Movable shuffle instructions, in reverse program order.
+
+ // Go backwards in the block: for each bit shuffling instruction, check
+ // if that instruction could potentially be moved to the front of the loop:
+ // the output of the loop cannot be used in a non-shuffling instruction
+ // in this loop.
+ for (auto I = C.LB->rbegin(), E = C.LB->rend(); I != E; ++I) {
+ if (I->isTerminator())
+ continue;
+ if (I->isPHI()) // Reached the phis at the top of the block: done.
+ break;
+
+ RegisterSet Defs;
+ HBS::getInstrDefs(*I, Defs);
+ if (Defs.count() != 1)
+ continue;
+ unsigned DefR = Defs.find_first();
+ if (!TargetRegisterInfo::isVirtualRegister(DefR)) // Only virtual-register results are considered.
+ continue;
+ if (!isBitShuffle(&*I, DefR))
+ continue;
+
+ bool BadUse = false;
+ for (auto UI = MRI->use_begin(DefR), UE = MRI->use_end(); UI != UE; ++UI) {
+ MachineInstr *UseI = UI->getParent();
+ if (UseI->getParent() == C.LB) {
+ if (UseI->isPHI()) {
+ // If the use is in a phi node in this loop, then it should be
+ // the value corresponding to the back edge.
+ unsigned Idx = UI.getOperandNo();
+ if (UseI->getOperand(Idx+1).getMBB() != C.LB)
+ BadUse = true;
+ } else {
+ auto F = find(ShufIns, UseI); // Other users in the loop must be shuffles already collected, i.e. later in the block.
+ if (F == ShufIns.end())
+ BadUse = true;
+ }
+ } else {
+ // There is a use outside of the loop, but there is no epilog block
+ // suitable for a copy-out.
+ if (C.EB == nullptr)
+ BadUse = true;
+ }
+ if (BadUse)
+ break;
+ }
+
+ if (BadUse)
+ continue;
+ ShufIns.push_back(&*I);
+ }
+
+ // Partition the list of shuffling instructions into instruction groups,
+ // where each group has to be moved as a whole (i.e. a group is a chain of
+ // dependent instructions). A group produces a single live output register,
+ // which is meant to be the input of the loop phi node (although this is
+ // not checked here yet). It also uses a single register as its input,
+ // which is some value produced in the loop body. After moving the group
+ // to the beginning of the loop, that input register would need to be
+ // the loop-carried register (through a phi node) instead of the (currently
+ // loop-carried) output register.
+ typedef std::vector<InstrGroup> InstrGroupList;
+ InstrGroupList Groups;
+
+ for (unsigned i = 0, n = ShufIns.size(); i < n; ++i) {
+ MachineInstr *SI = ShufIns[i];
+ if (SI == nullptr) // Already absorbed into an earlier group.
+ continue;
+
+ InstrGroup G;
+ G.Ins.push_back(SI);
+ G.Out.Reg = getDefReg(SI); // The group's output is the def of its first (latest in the block) member.
+ RegisterSet Inputs; // Registers the group still needs from outside itself.
+ HBS::getInstrUses(*SI, Inputs);
+
+ for (unsigned j = i+1; j < n; ++j) {
+ MachineInstr *MI = ShufIns[j];
+ if (MI == nullptr)
+ continue;
+ RegisterSet Defs;
+ HBS::getInstrDefs(*MI, Defs);
+ // If this instruction does not define any pending inputs, skip it.
+ if (!Defs.intersects(Inputs))
+ continue;
+ // Otherwise, add it to the current group and remove the inputs that
+ // are defined by MI.
+ G.Ins.push_back(MI);
+ Inputs.remove(Defs);
+ // Then add all registers used by MI.
+ HBS::getInstrUses(*MI, Inputs);
+ ShufIns[j] = nullptr; // Mark as consumed so it neither starts nor joins another group.
+ }
+
+ // Only add a group if it requires at most one register.
+ if (Inputs.count() > 1)
+ continue;
+ auto LoopInpEq = [G] (const PhiInfo &P) -> bool { // Does phi P take the group's output along the back edge?
+ return G.Out.Reg == P.LR.Reg;
+ };
+ if (llvm::find_if(Phis, LoopInpEq) == Phis.end())
+ continue;
+
+ G.Inp.Reg = Inputs.find_first();
+ Groups.push_back(G);
+ }
+
+ DEBUG({
+ for (unsigned i = 0, n = Groups.size(); i < n; ++i) {
+ InstrGroup &G = Groups[i];
+ dbgs() << "Group[" << i << "] inp: "
+ << PrintReg(G.Inp.Reg, HRI, G.Inp.Sub)
+ << " out: " << PrintReg(G.Out.Reg, HRI, G.Out.Sub) << "\n";
+ for (unsigned j = 0, m = G.Ins.size(); j < m; ++j)
+ dbgs() << " " << *G.Ins[j];
+ }
+ });
+
+ for (unsigned i = 0, n = Groups.size(); i < n; ++i) {
+ InstrGroup &G = Groups[i];
+ if (!isShuffleOf(G.Out.Reg, G.Inp.Reg)) // The bit tracker must confirm Out is made only of Inp's bits.
+ continue;
+ auto LoopInpEq = [G] (const PhiInfo &P) -> bool {
+ return G.Out.Reg == P.LR.Reg;
+ };
+ auto F = llvm::find_if(Phis, LoopInpEq);
+ if (F == Phis.end())
+ continue;
+ unsigned PrehR = 0; // Register that will feed the new phi from the preheader side.
+ if (!isSameShuffle(G.Out.Reg, G.Inp.Reg, F->PR.Reg, PrehR)) {
+ const MachineInstr *DefPrehR = MRI->getVRegDef(F->PR.Reg);
+ unsigned Opc = DefPrehR->getOpcode();
+ if (Opc != Hexagon::A2_tfrsi && Opc != Hexagon::A2_tfrpi) // Fallback: only a preheader value set by a transfer-immediate of 0 is accepted.
+ continue;
+ if (!DefPrehR->getOperand(1).isImm())
+ continue;
+ if (DefPrehR->getOperand(1).getImm() != 0)
+ continue;
+ const TargetRegisterClass *RC = MRI->getRegClass(G.Inp.Reg);
+ if (RC != MRI->getRegClass(F->PR.Reg)) { // Class mismatch: create a fresh zero of G.Inp's class in the preheader.
+ PrehR = MRI->createVirtualRegister(RC);
+ unsigned TfrI = (RC == &Hexagon::IntRegsRegClass) ? Hexagon::A2_tfrsi
+ : Hexagon::A2_tfrpi;
+ auto T = C.PB->getFirstTerminator();
+ DebugLoc DL = (T != C.PB->end()) ? T->getDebugLoc() : DebugLoc();
+ BuildMI(*C.PB, T, DL, HII->get(TfrI), PrehR)
+ .addImm(0);
+ } else {
+ PrehR = F->PR.Reg; // Same class: the existing zero can be used directly.
+ }
+ }
+ // isSameShuffle could match with PrehR being of a wider class than
+ // G.Inp.Reg, for example if G shuffles the low 32 bits of its input,
+ // it would match for the input being a 32-bit register, and PrehR
+ // being a 64-bit register (where the low 32 bits match). This could
+ // be handled, but for now skip these cases.
+ if (MRI->getRegClass(PrehR) != MRI->getRegClass(G.Inp.Reg))
+ continue;
+ moveGroup(G, *F->LB, *F->PB, F->LB->getFirstNonPHI(), F->DefR, PrehR); // Emit the group right after the phis of the loop block.
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) { // Collects single-block loop candidates in MF and runs processLoop on each.
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ HII = HST.getInstrInfo();
+ HRI = HST.getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ const HexagonEvaluator HE(*HRI, *MRI, *HII, MF);
+ BitTracker BT(HE, MF);
+ DEBUG(BT.trace(true));
+ BT.run();
+ BTP = &BT; // BT is a local object, so BTP is only meaningful until this function returns.
+
+ std::vector<LoopCand> Cand;
+
+ for (auto &B : MF) {
+ if (B.pred_size() != 2 || B.succ_size() != 2) // Candidates have exactly two predecessors and two successors.
+ continue;
+ MachineBasicBlock *PB = nullptr;
+ bool IsLoop = false;
+ for (auto PI = B.pred_begin(), PE = B.pred_end(); PI != PE; ++PI) {
+ if (*PI != &B)
+ PB = *PI; // The predecessor that is not B itself.
+ else
+ IsLoop = true; // B is its own predecessor: a single-block loop.
+ }
+ if (!IsLoop)
+ continue;
+
+ MachineBasicBlock *EB = nullptr;
+ for (auto SI = B.succ_begin(), SE = B.succ_end(); SI != SE; ++SI) {
+ if (*SI == &B)
+ continue;
+ // Set EB to the epilog block, if it has only 1 predecessor (i.e. the
+ // edge from B to EB is non-critical).
+ if ((*SI)->pred_size() == 1)
+ EB = *SI;
+ break; // Only the first successor other than B is inspected.
+ }
+
+ Cand.push_back(LoopCand(&B, PB, EB));
+ }
+
+ bool Changed = false;
+ for (auto &C : Cand)
+ Changed |= processLoop(C);
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonLoopRescheduling() { // Returns a newly allocated HexagonLoopRescheduling pass.
+ return new HexagonLoopRescheduling();
+}
+
+FunctionPass *llvm::createHexagonBitSimplify() { // Returns a newly allocated HexagonBitSimplify pass.
+ return new HexagonBitSimplify();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
new file mode 100644
index 000000000000..b78c4126e0b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -0,0 +1,1191 @@
+//===--- HexagonBitTracker.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonBitTracker.h"
+
+using namespace llvm;
+
+typedef BitTracker BT;
+
+HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
+ MachineRegisterInfo &mri,
+ const HexagonInstrInfo &tii,
+ MachineFunction &mf)
+ : MachineEvaluator(tri, mri), MF(mf), MFI(mf.getFrameInfo()), TII(tii) {
+ // Populate the VRX map (VR to extension-type).
+ // Go over all the formal parameters of the function. If a given parameter
+ // P is sign- or zero-extended, locate the virtual register holding that
+ // parameter and create an entry in the VRX map indicating the type of ex-
+ // tension (and the source type).
+ // This is a bit complicated to do accurately, since the memory layout in-
+ // formation is necessary to precisely determine whether an aggregate para-
+ // meter will be passed in a register or in memory. What is given in MRI
+ // is the association between the physical register that is live-in (i.e.
+ // holds an argument), and the virtual register that this value will be
+ // copied into. This, by itself, is not sufficient to map back the virtual
+ // register to a formal parameter from Function (since consecutive live-ins
+ // from MRI may not correspond to consecutive formal parameters from Func-
+ // tion). To avoid the complications with in-memory arguments, only consi-
+ // der the initial sequence of formal parameters that are known to be
+ // passed via registers.
+ unsigned AttrIdx = 0; // Incremented before use, so the first argument is queried at index 1.
+ unsigned InVirtReg, InPhysReg = 0;
+ const Function &F = *MF.getFunction();
+ typedef Function::const_arg_iterator arg_iterator;
+ for (arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+ AttrIdx++;
+ const Argument &Arg = *I;
+ Type *ATy = Arg.getType();
+ unsigned Width = 0; // Stays 0 for anything that is neither an integer nor a pointer.
+ if (ATy->isIntegerTy())
+ Width = ATy->getIntegerBitWidth();
+ else if (ATy->isPointerTy())
+ Width = 32;
+ // If pointer size is not set through target data, it will default to
+ // Module::AnyPointerSize.
+ if (Width == 0 || Width > 64) // Stop at the first argument that is not an integer/pointer of at most 64 bits.
+ break;
+ AttributeSet Attrs = F.getAttributes();
+ if (Attrs.hasAttribute(AttrIdx, Attribute::ByVal)) // ByVal arguments are skipped without advancing InPhysReg.
+ continue;
+ InPhysReg = getNextPhysReg(InPhysReg, Width);
+ if (!InPhysReg) // getNextPhysReg returned 0: end the scan.
+ break;
+ InVirtReg = getVirtRegFor(InPhysReg);
+ if (!InVirtReg) // No virtual register found for this physical register.
+ continue;
+ if (Attrs.hasAttribute(AttrIdx, Attribute::SExt))
+ VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::SExt, Width)));
+ else if (Attrs.hasAttribute(AttrIdx, Attribute::ZExt))
+ VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::ZExt, Width)));
+ }
+}
+
+
+BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { // Returns the bit range of Reg selected by subregister index Sub.
+ if (Sub == 0) // No subregister: defer to the base-class implementation.
+ return MachineEvaluator::mask(Reg, 0);
+ using namespace Hexagon;
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ unsigned ID = RC->getID();
+ uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub)); // Width of the subregister itself.
+ auto &HRI = static_cast<const HexagonRegisterInfo&>(TRI);
+ bool IsSubLo = (Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo));
+ switch (ID) {
+ case DoubleRegsRegClassID:
+ case VecDblRegsRegClassID:
+ case VecDblRegs128BRegClassID:
+ return IsSubLo ? BT::BitMask(0, RW-1) // Low half: bits [0, RW-1]; otherwise bits [RW, 2*RW-1].
+ : BT::BitMask(RW, 2*RW-1);
+ default:
+ break;
+ }
+#ifndef NDEBUG
+ dbgs() << PrintReg(Reg, &TRI, Sub) << '\n';
+#endif
+ llvm_unreachable("Unexpected register/subregister"); // Any other register class with a subregister is not expected here.
+}
+
+namespace {
+class RegisterRefs { // Per-operand table of RegisterRefs for one MachineInstr.
+ std::vector<BT::RegisterRef> Vector; // One entry per operand of the instruction.
+
+public:
+ RegisterRefs(const MachineInstr &MI) : Vector(MI.getNumOperands()) {
+ for (unsigned i = 0, n = Vector.size(); i < n; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg())
+ Vector[i] = BT::RegisterRef(MO);
+ // For indices that don't correspond to registers, the entry will
+ // remain constructed via the default constructor.
+ }
+ }
+
+ size_t size() const { return Vector.size(); } // Equals the operand count of the instruction.
+ const BT::RegisterRef &operator[](unsigned n) const {
+ // The main purpose of this operator is to assert with bad argument.
+ assert(n < Vector.size());
+ return Vector[n];
+ }
+};
+}
+
+bool HexagonEvaluator::evaluate(const MachineInstr &MI,
+ const CellMapType &Inputs,
+ CellMapType &Outputs) const {
+ unsigned NumDefs = 0;
+
+ // Sanity verification: there should not be any defs with subregisters.
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ NumDefs++;
+ assert(MO.getSubReg() == 0);
+ }
+
+ if (NumDefs == 0)
+ return false;
+
+ using namespace Hexagon;
+ unsigned Opc = MI.getOpcode();
+
+ if (MI.mayLoad()) {
+ switch (Opc) {
+ // These instructions may be marked as mayLoad, but they are generating
+ // immediate values, so skip them.
+ case CONST32:
+ case CONST64:
+ break;
+ default:
+ return evaluateLoad(MI, Inputs, Outputs);
+ }
+ }
+
+ // Check COPY instructions that copy formal parameters into virtual
+ // registers. Such parameters can be sign- or zero-extended at the
+ // call site, and we should take advantage of this knowledge. The MRI
+ // keeps a list of pairs of live-in physical and virtual registers,
+ // which provides information about which virtual registers will hold
+ // the argument values. The function will still contain instructions
+ // defining those virtual registers, and in practice those are COPY
+ // instructions from a physical to a virtual register. In such cases,
+ // applying the argument extension to the virtual register can be seen
+ // as simply mirroring the extension that had already been applied to
+ // the physical register at the call site. If the defining instruction
+ // was not a COPY, it would not be clear how to mirror that extension
+ // on the callee's side. For that reason, only check COPY instructions
+ // for potential extensions.
+ if (MI.isCopy()) {
+ if (evaluateFormalCopy(MI, Inputs, Outputs))
+ return true;
+ }
+
+ // Beyond this point, if any operand is a global, skip that instruction.
+ // The reason is that certain instructions that can take an immediate
+ // operand can also have a global symbol in that operand. To avoid
+ // checking what kind of operand a given instruction has individually
+ // for each instruction, do it here. Global symbols as operands gene-
+ // rally do not provide any useful information.
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isGlobal() || MO.isBlockAddress() || MO.isSymbol() || MO.isJTI() ||
+ MO.isCPI())
+ return false;
+ }
+
+ RegisterRefs Reg(MI);
+#define op(i) MI.getOperand(i)
+#define rc(i) RegisterCell::ref(getCell(Reg[i], Inputs))
+#define im(i) MI.getOperand(i).getImm()
+
+ // If the instruction has no register operands, skip it.
+ if (Reg.size() == 0)
+ return false;
+
+ // Record result for register in operand 0.
+ auto rr0 = [this,Reg] (const BT::RegisterCell &Val, CellMapType &Outputs)
+ -> bool {
+ putCell(Reg[0], Val, Outputs);
+ return true;
+ };
+ // Get the cell corresponding to the N-th operand.
+ auto cop = [this, &Reg, &MI, &Inputs](unsigned N,
+ uint16_t W) -> BT::RegisterCell {
+ const MachineOperand &Op = MI.getOperand(N);
+ if (Op.isImm())
+ return eIMM(Op.getImm(), W);
+ if (!Op.isReg())
+ return RegisterCell::self(0, W);
+ assert(getRegBitWidth(Reg[N]) == W && "Register width mismatch");
+ return rc(N);
+ };
+ // Extract RW low bits of the cell.
+ auto lo = [this] (const BT::RegisterCell &RC, uint16_t RW)
+ -> BT::RegisterCell {
+ assert(RW <= RC.width());
+ return eXTR(RC, 0, RW);
+ };
+ // Extract RW high bits of the cell.
+ auto hi = [this] (const BT::RegisterCell &RC, uint16_t RW)
+ -> BT::RegisterCell {
+ uint16_t W = RC.width();
+ assert(RW <= W);
+ return eXTR(RC, W-RW, W);
+ };
+ // Extract N-th halfword (counting from the least significant position).
+ auto half = [this] (const BT::RegisterCell &RC, unsigned N)
+ -> BT::RegisterCell {
+ assert(N*16+16 <= RC.width());
+ return eXTR(RC, N*16, N*16+16);
+ };
+ // Shuffle bits (pick even/odd from cells and merge into result).
+ auto shuffle = [this] (const BT::RegisterCell &Rs, const BT::RegisterCell &Rt,
+ uint16_t BW, bool Odd) -> BT::RegisterCell {
+ uint16_t I = Odd, Ws = Rs.width();
+ assert(Ws == Rt.width());
+ RegisterCell RC = eXTR(Rt, I*BW, I*BW+BW).cat(eXTR(Rs, I*BW, I*BW+BW));
+ I += 2;
+ while (I*BW < Ws) {
+ RC.cat(eXTR(Rt, I*BW, I*BW+BW)).cat(eXTR(Rs, I*BW, I*BW+BW));
+ I += 2;
+ }
+ return RC;
+ };
+
+ // The bitwidth of the 0th operand. In most (if not all) of the
+ // instructions below, the 0th operand is the defined register.
+ // Pre-compute the bitwidth here, because it is needed in many of the
+ // cases below.
+ uint16_t W0 = (Reg[0].Reg != 0) ? getRegBitWidth(Reg[0]) : 0;
+
+ switch (Opc) {
+ // Transfer immediate:
+
+ case A2_tfrsi:
+ case A2_tfrpi:
+ case CONST32:
+ case CONST64:
+ return rr0(eIMM(im(1), W0), Outputs);
+ case PS_false:
+ return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::Zero), Outputs);
+ case PS_true:
+ return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::One), Outputs);
+ case PS_fi: {
+ int FI = op(1).getIndex();
+ int Off = op(2).getImm();
+ unsigned A = MFI.getObjectAlignment(FI) + std::abs(Off);
+ unsigned L = Log2_32(A);
+ RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0);
+ RC.fill(0, L, BT::BitValue::Zero);
+ return rr0(RC, Outputs);
+ }
+
+ // Transfer register:
+
+ case A2_tfr:
+ case A2_tfrp:
+ case C2_pxfer_map:
+ return rr0(rc(1), Outputs);
+ case C2_tfrpr: {
+ uint16_t RW = W0;
+ uint16_t PW = 8; // XXX Pred size: getRegBitWidth(Reg[1]);
+ assert(PW <= RW);
+ RegisterCell PC = eXTR(rc(1), 0, PW);
+ RegisterCell RC = RegisterCell(RW).insert(PC, BT::BitMask(0, PW-1));
+ RC.fill(PW, RW, BT::BitValue::Zero);
+ return rr0(RC, Outputs);
+ }
+ case C2_tfrrp: {
+ RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0);
+ W0 = 8; // XXX Pred size
+ return rr0(eINS(RC, eXTR(rc(1), 0, W0), 0), Outputs);
+ }
+
+ // Arithmetic:
+
+ case A2_abs:
+ case A2_absp:
+ // TODO
+ break;
+
+ case A2_addsp: {
+ uint16_t W1 = getRegBitWidth(Reg[1]);
+ assert(W0 == 64 && W1 == 32);
+ RegisterCell CW = RegisterCell(W0).insert(rc(1), BT::BitMask(0, W1-1));
+ RegisterCell RC = eADD(eSXT(CW, W1), rc(2));
+ return rr0(RC, Outputs);
+ }
+ case A2_add:
+ case A2_addp:
+ return rr0(eADD(rc(1), rc(2)), Outputs);
+ case A2_addi:
+ return rr0(eADD(rc(1), eIMM(im(2), W0)), Outputs);
+ case S4_addi_asl_ri: {
+ RegisterCell RC = eADD(eIMM(im(1), W0), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_addi_lsr_ri: {
+ RegisterCell RC = eADD(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_addaddi: {
+ RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyri_addi: {
+ RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+ RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyrr_addi: {
+ RegisterCell M = eMLS(rc(2), rc(3));
+ RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyri_addr_u2: {
+ RegisterCell M = eMLS(eIMM(im(2), W0), rc(3));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyri_addr: {
+ RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyrr_addr: {
+ RegisterCell M = eMLS(rc(2), rc(3));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case S4_subaddi: {
+ RegisterCell RC = eADD(rc(1), eSUB(eIMM(im(2), W0), rc(3)));
+ return rr0(RC, Outputs);
+ }
+ case M2_accii: {
+ RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case M2_acci: {
+ RegisterCell RC = eADD(rc(1), eADD(rc(2), rc(3)));
+ return rr0(RC, Outputs);
+ }
+ case M2_subacc: {
+ RegisterCell RC = eADD(rc(1), eSUB(rc(2), rc(3)));
+ return rr0(RC, Outputs);
+ }
+ case S2_addasl_rrri: {
+ RegisterCell RC = eADD(rc(1), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case C4_addipc: {
+ RegisterCell RPC = RegisterCell::self(Reg[0].Reg, W0);
+ RPC.fill(0, 2, BT::BitValue::Zero);
+ return rr0(eADD(RPC, eIMM(im(2), W0)), Outputs);
+ }
+ case A2_sub:
+ case A2_subp:
+ return rr0(eSUB(rc(1), rc(2)), Outputs);
+ case A2_subri:
+ return rr0(eSUB(eIMM(im(1), W0), rc(2)), Outputs);
+ case S4_subi_asl_ri: {
+ RegisterCell RC = eSUB(eIMM(im(1), W0), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_subi_lsr_ri: {
+ RegisterCell RC = eSUB(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case M2_naccii: {
+ RegisterCell RC = eSUB(rc(1), eADD(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case M2_nacci: {
+ RegisterCell RC = eSUB(rc(1), eADD(rc(2), rc(3)));
+ return rr0(RC, Outputs);
+ }
+ // 32-bit negation is done by "Rd = A2_subri 0, Rs"
+ case A2_negp:
+ return rr0(eSUB(eIMM(0, W0), rc(1)), Outputs);
+
+ case M2_mpy_up: {
+ RegisterCell M = eMLS(rc(1), rc(2));
+ return rr0(hi(M, W0), Outputs);
+ }
+ case M2_dpmpyss_s0:
+ return rr0(eMLS(rc(1), rc(2)), Outputs);
+ case M2_dpmpyss_acc_s0:
+ return rr0(eADD(rc(1), eMLS(rc(2), rc(3))), Outputs);
+ case M2_dpmpyss_nac_s0:
+ return rr0(eSUB(rc(1), eMLS(rc(2), rc(3))), Outputs);
+ case M2_mpyi: {
+ RegisterCell M = eMLS(rc(1), rc(2));
+ return rr0(lo(M, W0), Outputs);
+ }
+ case M2_macsip: {
+ RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M2_macsin: {
+ RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+ RegisterCell RC = eSUB(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M2_maci: {
+ RegisterCell M = eMLS(rc(2), rc(3));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M2_mpysmi: {
+ RegisterCell M = eMLS(rc(1), eIMM(im(2), W0));
+ return rr0(lo(M, 32), Outputs);
+ }
+ case M2_mpysin: {
+ RegisterCell M = eMLS(rc(1), eIMM(-im(2), W0));
+ return rr0(lo(M, 32), Outputs);
+ }
+ case M2_mpysip: {
+ RegisterCell M = eMLS(rc(1), eIMM(im(2), W0));
+ return rr0(lo(M, 32), Outputs);
+ }
+ case M2_mpyu_up: {
+ RegisterCell M = eMLU(rc(1), rc(2));
+ return rr0(hi(M, W0), Outputs);
+ }
+ case M2_dpmpyuu_s0:
+ return rr0(eMLU(rc(1), rc(2)), Outputs);
+ case M2_dpmpyuu_acc_s0:
+ return rr0(eADD(rc(1), eMLU(rc(2), rc(3))), Outputs);
+ case M2_dpmpyuu_nac_s0:
+ return rr0(eSUB(rc(1), eMLU(rc(2), rc(3))), Outputs);
+ //case M2_mpysu_up:
+
+ // Logical/bitwise:
+
+ case A2_andir:
+ return rr0(eAND(rc(1), eIMM(im(2), W0)), Outputs);
+ case A2_and:
+ case A2_andp:
+ return rr0(eAND(rc(1), rc(2)), Outputs);
+ case A4_andn:
+ case A4_andnp:
+ return rr0(eAND(rc(1), eNOT(rc(2))), Outputs);
+ case S4_andi_asl_ri: {
+ RegisterCell RC = eAND(eIMM(im(1), W0), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_andi_lsr_ri: {
+ RegisterCell RC = eAND(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case M4_and_and:
+ return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case M4_and_andn:
+ return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case M4_and_or:
+ return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case M4_and_xor:
+ return rr0(eAND(rc(1), eXOR(rc(2), rc(3))), Outputs);
+ case A2_orir:
+ return rr0(eORL(rc(1), eIMM(im(2), W0)), Outputs);
+ case A2_or:
+ case A2_orp:
+ return rr0(eORL(rc(1), rc(2)), Outputs);
+ case A4_orn:
+ case A4_ornp:
+ return rr0(eORL(rc(1), eNOT(rc(2))), Outputs);
+ case S4_ori_asl_ri: {
+ RegisterCell RC = eORL(eIMM(im(1), W0), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_ori_lsr_ri: {
+ RegisterCell RC = eORL(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case M4_or_and:
+ return rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case M4_or_andn:
+ return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case S4_or_andi:
+ case S4_or_andix: {
+ RegisterCell RC = eORL(rc(1), eAND(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case S4_or_ori: {
+ RegisterCell RC = eORL(rc(1), eORL(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case M4_or_or:
+ return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case M4_or_xor:
+ return rr0(eORL(rc(1), eXOR(rc(2), rc(3))), Outputs);
+ case A2_xor:
+ case A2_xorp:
+ return rr0(eXOR(rc(1), rc(2)), Outputs);
+ case M4_xor_and:
+ return rr0(eXOR(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case M4_xor_andn:
+ return rr0(eXOR(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case M4_xor_or:
+ return rr0(eXOR(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case M4_xor_xacc:
+ return rr0(eXOR(rc(1), eXOR(rc(2), rc(3))), Outputs);
+ case A2_not:
+ case A2_notp:
+ return rr0(eNOT(rc(1)), Outputs);
+
+ case S2_asl_i_r:
+ case S2_asl_i_p:
+ return rr0(eASL(rc(1), im(2)), Outputs);
+ case A2_aslh:
+ return rr0(eASL(rc(1), 16), Outputs);
+ case S2_asl_i_r_acc:
+ case S2_asl_i_p_acc:
+ return rr0(eADD(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_r_nac:
+ case S2_asl_i_p_nac:
+ return rr0(eSUB(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_r_and:
+ case S2_asl_i_p_and:
+ return rr0(eAND(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_r_or:
+ case S2_asl_i_p_or:
+ return rr0(eORL(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_r_xacc:
+ case S2_asl_i_p_xacc:
+ return rr0(eXOR(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_vh:
+ case S2_asl_i_vw:
+ // TODO
+ break;
+
+ case S2_asr_i_r:
+ case S2_asr_i_p:
+ return rr0(eASR(rc(1), im(2)), Outputs);
+ case A2_asrh:
+ return rr0(eASR(rc(1), 16), Outputs);
+ case S2_asr_i_r_acc:
+ case S2_asr_i_p_acc:
+ return rr0(eADD(rc(1), eASR(rc(2), im(3))), Outputs);
+ case S2_asr_i_r_nac:
+ case S2_asr_i_p_nac:
+ return rr0(eSUB(rc(1), eASR(rc(2), im(3))), Outputs);
+ case S2_asr_i_r_and:
+ case S2_asr_i_p_and:
+ return rr0(eAND(rc(1), eASR(rc(2), im(3))), Outputs);
+ case S2_asr_i_r_or:
+ case S2_asr_i_p_or:
+ return rr0(eORL(rc(1), eASR(rc(2), im(3))), Outputs);
+ case S2_asr_i_r_rnd: {
+ // The input is first sign-extended to 64 bits, then the output
+ // is truncated back to 32 bits.
+ assert(W0 == 32);
+ RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0);
+ RegisterCell RC = eASR(eADD(eASR(XC, im(2)), eIMM(1, 2*W0)), 1);
+ return rr0(eXTR(RC, 0, W0), Outputs);
+ }
+ case S2_asr_i_r_rnd_goodsyntax: {
+ int64_t S = im(2);
+ if (S == 0)
+ return rr0(rc(1), Outputs);
+ // Result: S2_asr_i_r_rnd Rs, u5-1
+ RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0);
+ RegisterCell RC = eLSR(eADD(eASR(XC, S-1), eIMM(1, 2*W0)), 1);
+ return rr0(eXTR(RC, 0, W0), Outputs);
+ }
+ case S2_asr_r_vh:
+ case S2_asr_i_vw:
+ case S2_asr_i_svw_trun:
+ // TODO
+ break;
+
+ case S2_lsr_i_r:
+ case S2_lsr_i_p:
+ return rr0(eLSR(rc(1), im(2)), Outputs);
+ case S2_lsr_i_r_acc:
+ case S2_lsr_i_p_acc:
+ return rr0(eADD(rc(1), eLSR(rc(2), im(3))), Outputs);
+ case S2_lsr_i_r_nac:
+ case S2_lsr_i_p_nac:
+ return rr0(eSUB(rc(1), eLSR(rc(2), im(3))), Outputs);
+ case S2_lsr_i_r_and:
+ case S2_lsr_i_p_and:
+ return rr0(eAND(rc(1), eLSR(rc(2), im(3))), Outputs);
+ case S2_lsr_i_r_or:
+ case S2_lsr_i_p_or:
+ return rr0(eORL(rc(1), eLSR(rc(2), im(3))), Outputs);
+ case S2_lsr_i_r_xacc:
+ case S2_lsr_i_p_xacc:
+ return rr0(eXOR(rc(1), eLSR(rc(2), im(3))), Outputs);
+
+ case S2_clrbit_i: {
+ RegisterCell RC = rc(1);
+ RC[im(2)] = BT::BitValue::Zero;
+ return rr0(RC, Outputs);
+ }
+ case S2_setbit_i: {
+ RegisterCell RC = rc(1);
+ RC[im(2)] = BT::BitValue::One;
+ return rr0(RC, Outputs);
+ }
+ case S2_togglebit_i: {
+ RegisterCell RC = rc(1);
+ uint16_t BX = im(2);
+ RC[BX] = RC[BX].is(0) ? BT::BitValue::One
+ : RC[BX].is(1) ? BT::BitValue::Zero
+ : BT::BitValue::self();
+ return rr0(RC, Outputs);
+ }
+
+ case A4_bitspliti: {
+ uint16_t W1 = getRegBitWidth(Reg[1]);
+ uint16_t BX = im(2);
+ // Res.uw[1] = Rs[bx+1:], Res.uw[0] = Rs[0:bx]
+ const BT::BitValue Zero = BT::BitValue::Zero;
+ RegisterCell RZ = RegisterCell(W0).fill(BX, W1, Zero)
+ .fill(W1+(W1-BX), W0, Zero);
+ RegisterCell BF1 = eXTR(rc(1), 0, BX), BF2 = eXTR(rc(1), BX, W1);
+ RegisterCell RC = eINS(eINS(RZ, BF1, 0), BF2, W1);
+ return rr0(RC, Outputs);
+ }
+ case S4_extract:
+ case S4_extractp:
+ case S2_extractu:
+ case S2_extractup: {
+ uint16_t Wd = im(2), Of = im(3);
+ assert(Wd <= W0);
+ if (Wd == 0)
+ return rr0(eIMM(0, W0), Outputs);
+ // If the width extends beyond the register size, pad the register
+ // with 0 bits.
+ RegisterCell Pad = (Wd+Of > W0) ? rc(1).cat(eIMM(0, Wd+Of-W0)) : rc(1);
+ RegisterCell Ext = eXTR(Pad, Of, Wd+Of);
+ // Ext is short, need to extend it with 0s or sign bit.
+ RegisterCell RC = RegisterCell(W0).insert(Ext, BT::BitMask(0, Wd-1));
+ if (Opc == S2_extractu || Opc == S2_extractup)
+ return rr0(eZXT(RC, Wd), Outputs);
+ return rr0(eSXT(RC, Wd), Outputs);
+ }
+ case S2_insert:
+ case S2_insertp: {
+ uint16_t Wd = im(3), Of = im(4);
+ assert(Wd < W0 && Of < W0);
+ // If Wd+Of exceeds W0, the inserted bits are truncated.
+ if (Wd+Of > W0)
+ Wd = W0-Of;
+ if (Wd == 0)
+ return rr0(rc(1), Outputs);
+ return rr0(eINS(rc(1), eXTR(rc(2), 0, Wd), Of), Outputs);
+ }
+
+ // Bit permutations:
+
+ case A2_combineii:
+ case A4_combineii:
+ case A4_combineir:
+ case A4_combineri:
+ case A2_combinew:
+ case V6_vcombine:
+ case V6_vcombine_128B:
+ assert(W0 % 2 == 0);
+ return rr0(cop(2, W0/2).cat(cop(1, W0/2)), Outputs);
+ case A2_combine_ll:
+ case A2_combine_lh:
+ case A2_combine_hl:
+ case A2_combine_hh: {
+ assert(W0 == 32);
+ assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32);
+ // Low half in the output is 0 for _ll and _hl, 1 otherwise:
+ unsigned LoH = !(Opc == A2_combine_ll || Opc == A2_combine_hl);
+ // High half in the output is 0 for _ll and _lh, 1 otherwise:
+ unsigned HiH = !(Opc == A2_combine_ll || Opc == A2_combine_lh);
+ RegisterCell R1 = rc(1);
+ RegisterCell R2 = rc(2);
+ RegisterCell RC = half(R2, LoH).cat(half(R1, HiH));
+ return rr0(RC, Outputs);
+ }
+ case S2_packhl: {
+ assert(W0 == 64);
+ assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32);
+ RegisterCell R1 = rc(1);
+ RegisterCell R2 = rc(2);
+ RegisterCell RC = half(R2, 0).cat(half(R1, 0)).cat(half(R2, 1))
+ .cat(half(R1, 1));
+ return rr0(RC, Outputs);
+ }
+ case S2_shuffeb: {
+ RegisterCell RC = shuffle(rc(1), rc(2), 8, false);
+ return rr0(RC, Outputs);
+ }
+ case S2_shuffeh: {
+ RegisterCell RC = shuffle(rc(1), rc(2), 16, false);
+ return rr0(RC, Outputs);
+ }
+ case S2_shuffob: {
+ RegisterCell RC = shuffle(rc(1), rc(2), 8, true);
+ return rr0(RC, Outputs);
+ }
+ case S2_shuffoh: {
+ RegisterCell RC = shuffle(rc(1), rc(2), 16, true);
+ return rr0(RC, Outputs);
+ }
+ case C2_mask: {
+ uint16_t WR = W0;
+ uint16_t WP = 8; // XXX Pred size: getRegBitWidth(Reg[1]);
+ assert(WR == 64 && WP == 8);
+ RegisterCell R1 = rc(1);
+ RegisterCell RC(WR);
+ for (uint16_t i = 0; i < WP; ++i) {
+ const BT::BitValue &V = R1[i];
+ BT::BitValue F = (V.is(0) || V.is(1)) ? V : BT::BitValue::self();
+ RC.fill(i*8, i*8+8, F);
+ }
+ return rr0(RC, Outputs);
+ }
+
+ // Mux:
+
+ case C2_muxii:
+ case C2_muxir:
+ case C2_muxri:
+ case C2_mux: {
+ BT::BitValue PC0 = rc(1)[0];
+ RegisterCell R2 = cop(2, W0);
+ RegisterCell R3 = cop(3, W0);
+ if (PC0.is(0) || PC0.is(1))
+ return rr0(RegisterCell::ref(PC0 ? R2 : R3), Outputs);
+ R2.meet(R3, Reg[0].Reg);
+ return rr0(R2, Outputs);
+ }
+ case C2_vmux:
+ // TODO
+ break;
+
+ // Sign- and zero-extension:
+
+ case A2_sxtb:
+ return rr0(eSXT(rc(1), 8), Outputs);
+ case A2_sxth:
+ return rr0(eSXT(rc(1), 16), Outputs);
+ case A2_sxtw: {
+ uint16_t W1 = getRegBitWidth(Reg[1]);
+ assert(W0 == 64 && W1 == 32);
+ RegisterCell RC = eSXT(rc(1).cat(eIMM(0, W1)), W1);
+ return rr0(RC, Outputs);
+ }
+ case A2_zxtb:
+ return rr0(eZXT(rc(1), 8), Outputs);
+ case A2_zxth:
+ return rr0(eZXT(rc(1), 16), Outputs);
+
+ // Bit count:
+
+ case S2_cl0:
+ case S2_cl0p:
+ // Always produce a 32-bit result.
+ return rr0(eCLB(rc(1), 0/*bit*/, 32), Outputs);
+ case S2_cl1:
+ case S2_cl1p:
+ return rr0(eCLB(rc(1), 1/*bit*/, 32), Outputs);
+ case S2_clb:
+ case S2_clbp: {
+ uint16_t W1 = getRegBitWidth(Reg[1]);
+ RegisterCell R1 = rc(1);
+ BT::BitValue TV = R1[W1-1];
+ if (TV.is(0) || TV.is(1))
+ return rr0(eCLB(R1, TV, 32), Outputs);
+ break;
+ }
+ case S2_ct0:
+ case S2_ct0p:
+ return rr0(eCTB(rc(1), 0/*bit*/, 32), Outputs);
+ case S2_ct1:
+ case S2_ct1p:
+ return rr0(eCTB(rc(1), 1/*bit*/, 32), Outputs);
+ case S5_popcountp:
+ // TODO
+ break;
+
+ case C2_all8: {
+ RegisterCell P1 = rc(1);
+ bool Has0 = false, All1 = true;
+ for (uint16_t i = 0; i < 8/*XXX*/; ++i) {
+ if (!P1[i].is(1))
+ All1 = false;
+ if (!P1[i].is(0))
+ continue;
+ Has0 = true;
+ break;
+ }
+ if (!Has0 && !All1)
+ break;
+ RegisterCell RC(W0);
+ RC.fill(0, W0, (All1 ? BT::BitValue::One : BT::BitValue::Zero));
+ return rr0(RC, Outputs);
+ }
+ case C2_any8: {
+ RegisterCell P1 = rc(1);
+ bool Has1 = false, All0 = true;
+ for (uint16_t i = 0; i < 8/*XXX*/; ++i) {
+ if (!P1[i].is(0))
+ All0 = false;
+ if (!P1[i].is(1))
+ continue;
+ Has1 = true;
+ break;
+ }
+ if (!Has1 && !All0)
+ break;
+ RegisterCell RC(W0);
+ RC.fill(0, W0, (Has1 ? BT::BitValue::One : BT::BitValue::Zero));
+ return rr0(RC, Outputs);
+ }
+ case C2_and:
+ return rr0(eAND(rc(1), rc(2)), Outputs);
+ case C2_andn:
+ return rr0(eAND(rc(1), eNOT(rc(2))), Outputs);
+ case C2_not:
+ return rr0(eNOT(rc(1)), Outputs);
+ case C2_or:
+ return rr0(eORL(rc(1), rc(2)), Outputs);
+ case C2_orn:
+ return rr0(eORL(rc(1), eNOT(rc(2))), Outputs);
+ case C2_xor:
+ return rr0(eXOR(rc(1), rc(2)), Outputs);
+ case C4_and_and:
+ return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case C4_and_andn:
+ return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case C4_and_or:
+ return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case C4_and_orn:
+ return rr0(eAND(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs);
+ case C4_or_and:
+ return rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case C4_or_andn:
+ return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case C4_or_or:
+ return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case C4_or_orn:
+ return rr0(eORL(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs);
+ case C2_bitsclr:
+ case C2_bitsclri:
+ case C2_bitsset:
+ case C4_nbitsclr:
+ case C4_nbitsclri:
+ case C4_nbitsset:
+ // TODO
+ break;
+ case S2_tstbit_i:
+ case S4_ntstbit_i: {
+ BT::BitValue V = rc(1)[im(2)];
+ if (V.is(0) || V.is(1)) {
+ // If instruction is S2_tstbit_i, test for 1, otherwise test for 0.
+ bool TV = (Opc == S2_tstbit_i);
+ BT::BitValue F = V.is(TV) ? BT::BitValue::One : BT::BitValue::Zero;
+ return rr0(RegisterCell(W0).fill(0, W0, F), Outputs);
+ }
+ break;
+ }
+
+ default:
+ return MachineEvaluator::evaluate(MI, Inputs, Outputs);
+ }
+ #undef im
+ #undef rc
+ #undef op
+ return false;
+}
+// Evaluate a single branch BI: returns true (with Targets/FallsThru filled in) only when its outcome is known from Inputs.
+bool HexagonEvaluator::evaluate(const MachineInstr &BI,
+ const CellMapType &Inputs,
+ BranchTargetList &Targets,
+ bool &FallsThru) const {
+ // We need to evaluate one branch at a time. TII::analyzeBranch checks
+ // all the branches in a basic block at once, so we cannot use it.
+ unsigned Opc = BI.getOpcode();
+ bool SimpleBranch = false;
+ bool Negated = false;
+ switch (Opc) {
+ case Hexagon::J2_jumpf:
+ case Hexagon::J2_jumpfpt:
+ case Hexagon::J2_jumpfnew:
+ case Hexagon::J2_jumpfnewpt:
+ Negated = true; // Intentional fall-through: shares the J2_jumpt* handling below.
+ case Hexagon::J2_jumpt:
+ case Hexagon::J2_jumptpt:
+ case Hexagon::J2_jumptnew:
+ case Hexagon::J2_jumptnewpt:
+ // Simple branch: if([!]Pn) jump ...
+ // i.e. Op0 = predicate, Op1 = branch target.
+ SimpleBranch = true;
+ break;
+ case Hexagon::J2_jump:
+ Targets.insert(BI.getOperand(0).getMBB()); // Unconditional jump: Op0 is the target block.
+ FallsThru = false;
+ return true;
+ default:
+ // If the branch is of unknown type, assume that all successors are
+ // executable.
+ return false;
+ }
+ // Every path that leaves the switch has set SimpleBranch, so this check is purely defensive.
+ if (!SimpleBranch)
+ return false;
+
+ // BI is a conditional branch if we got here.
+ RegisterRef PR = BI.getOperand(0);
+ RegisterCell PC = getCell(PR, Inputs);
+ const BT::BitValue &Test = PC[0]; // Only bit 0 of the predicate cell is examined.
+
+ // If the condition is neither true nor false, then it's unknown.
+ if (!Test.is(0) && !Test.is(1))
+ return false;
+
+ // "Test.is(!Negated)" means "branch condition is true".
+ if (!Test.is(!Negated)) {
+ // Condition known to be false.
+ FallsThru = true;
+ return true;
+ }
+ // Condition known to be true: record the target (Op1) and report no fall-through.
+ Targets.insert(BI.getOperand(1).getMBB());
+ FallsThru = false;
+ return true;
+}
+// Model the result of load MI: the low BitNum bits of the defined register become "self" bits, the rest a sign/zero extension.
+bool HexagonEvaluator::evaluateLoad(const MachineInstr &MI,
+ const CellMapType &Inputs,
+ CellMapType &Outputs) const {
+ if (TII.isPredicated(MI)) // Predicated loads are left unevaluated.
+ return false;
+ assert(MI.mayLoad() && "A load that mayn't?");
+ unsigned Opc = MI.getOpcode();
+ // BitNum: width in bits of the loaded value; SignEx: sign-extend (true) or zero-extend (false) the bits above it.
+ uint16_t BitNum;
+ bool SignEx;
+ using namespace Hexagon;
+ // Classify the opcode; anything not listed below is reported as "not evaluated".
+ switch (Opc) {
+ default:
+ return false;
+ // The opcodes inside "#if 0" are compiled out, so they currently take the default path.
+#if 0
+ // memb_fifo
+ case L2_loadalignb_pbr:
+ case L2_loadalignb_pcr:
+ case L2_loadalignb_pi:
+ // memh_fifo
+ case L2_loadalignh_pbr:
+ case L2_loadalignh_pcr:
+ case L2_loadalignh_pi:
+ // membh
+ case L2_loadbsw2_pbr:
+ case L2_loadbsw2_pci:
+ case L2_loadbsw2_pcr:
+ case L2_loadbsw2_pi:
+ case L2_loadbsw4_pbr:
+ case L2_loadbsw4_pci:
+ case L2_loadbsw4_pcr:
+ case L2_loadbsw4_pi:
+ // memubh
+ case L2_loadbzw2_pbr:
+ case L2_loadbzw2_pci:
+ case L2_loadbzw2_pcr:
+ case L2_loadbzw2_pi:
+ case L2_loadbzw4_pbr:
+ case L2_loadbzw4_pci:
+ case L2_loadbzw4_pcr:
+ case L2_loadbzw4_pi:
+#endif
+ // 8-bit loads, sign-extended.
+ case L2_loadrbgp:
+ case L2_loadrb_io:
+ case L2_loadrb_pbr:
+ case L2_loadrb_pci:
+ case L2_loadrb_pcr:
+ case L2_loadrb_pi:
+ case PS_loadrbabs:
+ case L4_loadrb_ap:
+ case L4_loadrb_rr:
+ case L4_loadrb_ur:
+ BitNum = 8;
+ SignEx = true;
+ break;
+ // 8-bit loads, zero-extended.
+ case L2_loadrubgp:
+ case L2_loadrub_io:
+ case L2_loadrub_pbr:
+ case L2_loadrub_pci:
+ case L2_loadrub_pcr:
+ case L2_loadrub_pi:
+ case PS_loadrubabs:
+ case L4_loadrub_ap:
+ case L4_loadrub_rr:
+ case L4_loadrub_ur:
+ BitNum = 8;
+ SignEx = false;
+ break;
+ // 16-bit loads, sign-extended.
+ case L2_loadrhgp:
+ case L2_loadrh_io:
+ case L2_loadrh_pbr:
+ case L2_loadrh_pci:
+ case L2_loadrh_pcr:
+ case L2_loadrh_pi:
+ case PS_loadrhabs:
+ case L4_loadrh_ap:
+ case L4_loadrh_rr:
+ case L4_loadrh_ur:
+ BitNum = 16;
+ SignEx = true;
+ break;
+ // 16-bit loads, zero-extended.
+ case L2_loadruhgp:
+ case L2_loadruh_io:
+ case L2_loadruh_pbr:
+ case L2_loadruh_pci:
+ case L2_loadruh_pcr:
+ case L2_loadruh_pi:
+ case L4_loadruh_rr:
+ case PS_loadruhabs:
+ case L4_loadruh_ap:
+ case L4_loadruh_ur:
+ BitNum = 16;
+ SignEx = false;
+ break;
+ // 32-bit loads; SignEx has no effect when the destination is exactly 32 bits wide (the extension loop is empty).
+ case L2_loadrigp:
+ case L2_loadri_io:
+ case L2_loadri_pbr:
+ case L2_loadri_pci:
+ case L2_loadri_pcr:
+ case L2_loadri_pi:
+ case L2_loadw_locked:
+ case PS_loadriabs:
+ case L4_loadri_ap:
+ case L4_loadri_rr:
+ case L4_loadri_ur:
+ case LDriw_pred:
+ BitNum = 32;
+ SignEx = true;
+ break;
+ // 64-bit loads; as above, SignEx only matters if the destination is wider than BitNum.
+ case L2_loadrdgp:
+ case L2_loadrd_io:
+ case L2_loadrd_pbr:
+ case L2_loadrd_pci:
+ case L2_loadrd_pcr:
+ case L2_loadrd_pi:
+ case L4_loadd_locked:
+ case PS_loadrdabs:
+ case L4_loadrd_ap:
+ case L4_loadrd_rr:
+ case L4_loadrd_ur:
+ BitNum = 64;
+ SignEx = true;
+ break;
+ }
+ // Operand 0 is expected to be the register defined by the load.
+ const MachineOperand &MD = MI.getOperand(0);
+ assert(MD.isReg() && MD.isDef());
+ RegisterRef RD = MD;
+
+ uint16_t W = getRegBitWidth(RD);
+ assert(W >= BitNum && BitNum > 0);
+ RegisterCell Res(W);
+ // Low BitNum bits: each becomes a "self" value referring to the same bit position of RD.
+ for (uint16_t i = 0; i < BitNum; ++i)
+ Res[i] = BT::BitValue::self(BT::BitRef(RD.Reg, i));
+ // Remaining high bits: references to the topmost loaded bit (sign extension), or constant zero.
+ if (SignEx) {
+ const BT::BitValue &Sign = Res[BitNum-1];
+ for (uint16_t i = BitNum; i < W; ++i)
+ Res[i] = BT::BitValue::ref(Sign);
+ } else {
+ for (uint16_t i = BitNum; i < W; ++i)
+ Res[i] = BT::BitValue::Zero;
+ }
+
+ putCell(RD, Res, Outputs);
+ return true;
+}
+// Evaluate copy MI from a physical register into a register recorded in VRX, applying the recorded sign-/zero-extension.
+bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr &MI,
+ const CellMapType &Inputs,
+ CellMapType &Outputs) const {
+ // If MI defines a formal parameter, but is not a copy (loads are handled
+ // in evaluateLoad), then it's not clear what to do.
+ assert(MI.isCopy());
+ // Only a physical source and a destination present in VRX are handled; otherwise return false.
+ RegisterRef RD = MI.getOperand(0);
+ RegisterRef RS = MI.getOperand(1);
+ assert(RD.Sub == 0);
+ if (!TargetRegisterInfo::isPhysicalRegister(RS.Reg))
+ return false;
+ RegExtMap::const_iterator F = VRX.find(RD.Reg);
+ if (F == VRX.end())
+ return false;
+
+ uint16_t EW = F->second.Width; // Width the value is extended from.
+ // Store RD's cell into the map. This will associate the cell with a virtual
+ // register, and make zero-/sign-extends possible (otherwise we would be ex-
+ // tending "self" bit values, which will have no effect, since "self" values
+ // cannot be references to anything).
+ putCell(RD, getCell(RS, Inputs), Outputs);
+
+ RegisterCell Res; // NOTE(review): stays default-constructed if Type is neither SExt nor ZExt.
+ // Read RD's cell from the outputs instead of RS's cell from the inputs:
+ if (F->second.Type == ExtType::SExt)
+ Res = eSXT(getCell(RD, Outputs), EW);
+ else if (F->second.Type == ExtType::ZExt)
+ Res = eZXT(getCell(RD, Outputs), EW);
+ // Overwrite the provisional cell stored above with the extended one.
+ putCell(RD, Res, Outputs);
+ return true;
+}
+// Return the register of the given Width that follows PReg in the tables R0..R5 (Width <= 32) and D0..D2 (wider),
+// or the first register of that width when PReg is 0; returns 0 when the table is exhausted.
+unsigned HexagonEvaluator::getNextPhysReg(unsigned PReg, unsigned Width) const {
+ using namespace Hexagon;
+ bool Is64 = DoubleRegsRegClass.contains(PReg);
+ assert(PReg == 0 || Is64 || IntRegsRegClass.contains(PReg));
+
+ static const unsigned Phys32[] = { R0, R1, R2, R3, R4, R5 };
+ static const unsigned Phys64[] = { D0, D1, D2 };
+ const unsigned Num32 = sizeof(Phys32)/sizeof(unsigned);
+ const unsigned Num64 = sizeof(Phys64)/sizeof(unsigned);
+
+ // Return the first parameter register of the required width.
+ if (PReg == 0)
+ return (Width <= 32) ? Phys32[0] : Phys64[0];
+
+ // Set Idx32, Idx64 in such a way that Idx+1 would give the index of the
+ // next register.
+ unsigned Idx32 = 0, Idx64 = 0;
+ if (!Is64) {
+ while (Idx32 < Num32) { // Linear search; Idx32 == Num32 afterwards if PReg is not in the table.
+ if (Phys32[Idx32] == PReg)
+ break;
+ Idx32++;
+ }
+ Idx64 = Idx32/2; // Presumably because Dn overlaps the pair R(2n+1):R(2n) - TODO confirm.
+ } else {
+ while (Idx64 < Num64) { // Linear search; Idx64 == Num64 afterwards if PReg is not in the table.
+ if (Phys64[Idx64] == PReg)
+ break;
+ Idx64++;
+ }
+ Idx32 = Idx64*2+1; // Index of the higher-numbered 32-bit register associated with Phys64[Idx64].
+ }
+ // Pick the successor from the table that matches the requested width; 0 means none is left.
+ if (Width <= 32)
+ return (Idx32+1 < Num32) ? Phys32[Idx32+1] : 0;
+ return (Idx64+1 < Num64) ? Phys64[Idx64+1] : 0;
+}
+
+
+unsigned HexagonEvaluator::getVirtRegFor(unsigned PReg) const {
+ // Scan the function's live-in pairs for the first one whose first member is PReg.
+ auto LiveIn = MRI.livein_begin();
+ const auto LiveInEnd = MRI.livein_end();
+ while (LiveIn != LiveInEnd && LiveIn->first != PReg)
+ ++LiveIn;
+ return (LiveIn != LiveInEnd) ? LiveIn->second : 0; // 0: PReg is not a live-in.
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h
new file mode 100644
index 000000000000..9e7b1dbe298f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h
@@ -0,0 +1,64 @@
+//===--- HexagonBitTracker.h ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONBITTRACKER_H
+#define HEXAGONBITTRACKER_H
+
+#include "BitTracker.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+ class HexagonInstrInfo;
+ class HexagonRegisterInfo;
+// Hexagon-specific evaluator plugged into BitTracker (derives from BitTracker::MachineEvaluator).
+struct HexagonEvaluator : public BitTracker::MachineEvaluator {
+ typedef BitTracker::CellMapType CellMapType;
+ typedef BitTracker::RegisterRef RegisterRef;
+ typedef BitTracker::RegisterCell RegisterCell;
+ typedef BitTracker::BranchTargetList BranchTargetList;
+ // Constructor; defined in HexagonBitTracker.cpp.
+ HexagonEvaluator(const HexagonRegisterInfo &tri, MachineRegisterInfo &mri,
+ const HexagonInstrInfo &tii, MachineFunction &mf);
+ // First overload: evaluate instruction MI into Outputs. Second overload: evaluate branch BI into Targets/FallsThru.
+ bool evaluate(const MachineInstr &MI, const CellMapType &Inputs,
+ CellMapType &Outputs) const override;
+ bool evaluate(const MachineInstr &BI, const CellMapType &Inputs,
+ BranchTargetList &Targets, bool &FallsThru) const override;
+ // Presumably the range of bits that subregister Sub occupies within Reg - see the definition to confirm.
+ BitTracker::BitMask mask(unsigned Reg, unsigned Sub) const override;
+ // Public references to the function being analyzed, its frame info and the Hexagon instruction info.
+ MachineFunction &MF;
+ MachineFrameInfo &MFI;
+ const HexagonInstrInfo &TII;
+ // Helpers used by the evaluate() implementations.
+private:
+ bool evaluateLoad(const MachineInstr &MI, const CellMapType &Inputs,
+ CellMapType &Outputs) const;
+ bool evaluateFormalCopy(const MachineInstr &MI, const CellMapType &Inputs,
+ CellMapType &Outputs) const;
+ // Walk the parameter-register tables / look up the virtual register paired with a live-in physical register.
+ unsigned getNextPhysReg(unsigned PReg, unsigned Width) const;
+ unsigned getVirtRegFor(unsigned PReg) const;
+
+ // Type of formal parameter extension.
+ struct ExtType {
+ enum { SExt, ZExt };
+ char Type; // One of SExt/ZExt; note the default constructor's 0 equals SExt.
+ uint16_t Width; // Width passed to eSXT/eZXT when the extension is applied.
+ ExtType() : Type(0), Width(0) {}
+ ExtType(char t, uint16_t w) : Type(t), Width(w) {}
+ };
+ // Map VR -> extension type.
+ typedef DenseMap<unsigned, ExtType> RegExtMap;
+ RegExtMap VRX;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
new file mode 100644
index 000000000000..adc213c3d438
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -0,0 +1,483 @@
+//===--- HexagonBlockRanges.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hbr"
+
+#include "HexagonBlockRanges.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <map>
+
+using namespace llvm;
+// True if this range and A share an index; meeting at an end point counts only when the earlier range has a tied end.
+bool HexagonBlockRanges::IndexRange::overlaps(const IndexRange &A) const {
+ // If A contains start(), or "this" contains A.start(), then overlap.
+ IndexType S = start(), E = end(), AS = A.start(), AE = A.end();
+ if (AS == S) // Ranges with equal starts always overlap.
+ return true;
+ bool SbAE = (S < AE) || (S == AE && A.TiedEnd); // S-before-AE.
+ bool ASbE = (AS < E) || (AS == E && TiedEnd); // AS-before-E.
+ if ((AS < S && SbAE) || (S < AS && ASbE)) // The later start must fall inside the earlier range.
+ return true;
+ // Otherwise no overlap.
+ return false;
+}
+// True if A starts no earlier than this range and ends no later than it does.
+bool HexagonBlockRanges::IndexRange::contains(const IndexRange &A) const {
+ if (start() <= A.start()) {
+ // Treat "None" in the range end as equal to the range start.
+ IndexType E = (end() != IndexType::None) ? end() : start();
+ IndexType AE = (A.end() != IndexType::None) ? A.end() : A.start();
+ if (AE <= E)
+ return true;
+ }
+ return false; // A starts before this range, or extends past its end.
+}
+// Grow this range so that it also covers A; A must overlap this range or start exactly where this range ends.
+void HexagonBlockRanges::IndexRange::merge(const IndexRange &A) {
+ // Allow merging adjacent ranges.
+ assert(end() == A.start() || overlaps(A));
+ IndexType AS = A.start(), AE = A.end();
+ if (AS < start() || start() == IndexType::None) // Take the earlier start (or any start if ours is unset).
+ setStart(AS);
+ if (end() < AE || end() == IndexType::None) { // A ends later: adopt its end together with its TiedEnd flag.
+ setEnd(AE);
+ TiedEnd = A.TiedEnd;
+ } else {
+ if (end() == AE) // Same end: the merged end is tied if either one was.
+ TiedEnd |= A.TiedEnd;
+ }
+ if (A.Fixed) // Fixed is sticky: the result is fixed if either input was.
+ Fixed = true;
+}
+// Append to this list every range of RL that is not already an element of it.
+void HexagonBlockRanges::RangeList::include(const RangeList &RL) {
+ for (auto &R : RL)
+ if (!is_contained(*this, R)) // Membership is tested by element equality, not by overlap.
+ push_back(R);
+}
+
+// Sort the list, then merge all overlapping ranges (and, if MergeAdjacent is set, ranges where one ends
+// exactly where the next starts), so that all that remains is a list of disjoint ranges.
+void HexagonBlockRanges::RangeList::unionize(bool MergeAdjacent) {
+ if (empty())
+ return;
+
+ std::sort(begin(), end());
+ iterator Iter = begin();
+ // end() is re-evaluated on every pass because erase() shortens the list.
+ while (Iter != end()-1) {
+ iterator Next = std::next(Iter);
+ // If MergeAdjacent is true, merge ranges A and B, where A.end == B.start.
+ // This allows merging dead ranges, but is not valid for live ranges.
+ bool Merge = MergeAdjacent && (Iter->end() == Next->start());
+ if (Merge || Iter->overlaps(*Next)) {
+ Iter->merge(*Next);
+ erase(Next);
+ continue; // Stay on Iter: the new neighbour may need merging as well.
+ }
+ ++Iter;
+ }
+}
+
+// Compute the range difference A-B and add the resulting piece(s) to the list (nothing is added if B covers A).
+void HexagonBlockRanges::RangeList::addsub(const IndexRange &A,
+ const IndexRange &B) {
+ // Exclusion of non-overlapping ranges makes some checks simpler
+ // later in this function.
+ if (!A.overlaps(B)) {
+ // A - B = A.
+ add(A);
+ return;
+ }
+
+ IndexType AS = A.start(), AE = A.end();
+ IndexType BS = B.start(), BE = B.end();
+
+ // If AE is None, then A is included in B, since A and B overlap.
+ // The result of subtraction is empty, so just return.
+ if (AE == IndexType::None)
+ return;
+ // Piece of A to the left of B, if any.
+ if (AS < BS) {
+ // A starts before B.
+ // AE cannot be None since A and B overlap.
+ assert(AE != IndexType::None);
+ // Add the part of A that extends on the "less" side of B.
+ add(AS, BS, A.Fixed, false);
+ }
+ // Piece of A to the right of B, if any.
+ if (BE < AE) {
+ // BE cannot be Exit here.
+ if (BE == IndexType::None)
+ add(BS, AE, A.Fixed, false);
+ else
+ add(BE, AE, A.Fixed, false);
+ }
+}
+
+// Subtract a given range from each element in the list.
+void HexagonBlockRanges::RangeList::subtract(const IndexRange &Range) {
+ // Cannot assume that the list is unionized (i.e. contains only non-
+ // overlapping ranges).
+ RangeList T; // Collects what remains of each overlapping element.
+ for (iterator Next, I = begin(); I != end(); I = Next) {
+ IndexRange &Rg = *I;
+ if (Rg.overlaps(Range)) {
+ T.addsub(Rg, Range); // Must run before erase(), which invalidates Rg.
+ Next = this->erase(I);
+ } else {
+ Next = std::next(I);
+ }
+ }
+ include(T); // Put the remainders back into this list.
+}
+// Number the non-debug instructions of B consecutively, starting at IndexType::First, and record First and Last.
+HexagonBlockRanges::InstrIndexMap::InstrIndexMap(MachineBasicBlock &B)
+ : Block(B) {
+ IndexType Idx = IndexType::First;
+ First = Idx;
+ for (auto &In : B) {
+ if (In.isDebugValue()) // Debug values get no index.
+ continue;
+ assert(getIndex(&In) == IndexType::None && "Instruction already in map");
+ Map.insert(std::make_pair(Idx, &In));
+ ++Idx;
+ }
+ Last = B.empty() ? IndexType::None : unsigned(Idx)-1; // NOTE(review): a non-empty block with only debug values yields First-1 - confirm intended.
+}
+
+MachineInstr *HexagonBlockRanges::InstrIndexMap::getInstr(IndexType Idx) const {
+ const auto Found = Map.find(Idx); // An index with no map entry yields a null pointer.
+ return (Found == Map.end()) ? nullptr : Found->second;
+}
+// Reverse lookup: return the index recorded for MI, or IndexType::None if MI is not in the map.
+HexagonBlockRanges::IndexType HexagonBlockRanges::InstrIndexMap::getIndex(
+ MachineInstr *MI) const {
+ for (auto &I : Map) // Linear scan over all map entries.
+ if (I.second == MI)
+ return I.first;
+ return IndexType::None;
+}
+// Index that precedes Idx in the sequence Entry, First..Last, Exit; IndexType::None when Idx is Entry.
+HexagonBlockRanges::IndexType HexagonBlockRanges::InstrIndexMap::getPrevIndex(
+ IndexType Idx) const {
+ assert (Idx != IndexType::None);
+ if (Idx == IndexType::Entry)
+ return IndexType::None;
+ if (Idx == IndexType::Exit)
+ return Last; // Whatever the constructor stored (None for an empty block).
+ if (Idx == First)
+ return IndexType::Entry;
+ return unsigned(Idx)-1;
+}
+// Index that follows Idx; yields IndexType::None after the last instruction and after Exit (Exit itself is never returned).
+HexagonBlockRanges::IndexType HexagonBlockRanges::InstrIndexMap::getNextIndex(
+ IndexType Idx) const {
+ assert (Idx != IndexType::None);
+ if (Idx == IndexType::Entry)
+ return IndexType::First;
+ if (Idx == IndexType::Exit || Idx == Last)
+ return IndexType::None;
+ return unsigned(Idx)+1;
+}
+// Make the entry that holds OldMI refer to NewMI, or erase that entry if NewMI is null; no effect if OldMI is not mapped.
+void HexagonBlockRanges::InstrIndexMap::replaceInstr(MachineInstr *OldMI,
+ MachineInstr *NewMI) {
+ for (auto &I : Map) {
+ if (I.second != OldMI)
+ continue;
+ if (NewMI != nullptr)
+ I.second = NewMI;
+ else
+ Map.erase(I.first);
+ break; // Only the first match is handled; the loop never advances after an erase.
+ }
+}
+// Cache the subtarget, instruction and register info of mf, and build the set of registers treated as reserved.
+HexagonBlockRanges::HexagonBlockRanges(MachineFunction &mf)
+ : MF(mf), HST(mf.getSubtarget<HexagonSubtarget>()),
+ TII(*HST.getInstrInfo()), TRI(*HST.getRegisterInfo()),
+ Reserved(TRI.getReservedRegs(mf)) {
+ // Consider all non-allocatable registers as reserved.
+ for (auto I = TRI.regclass_begin(), E = TRI.regclass_end(); I != E; ++I) {
+ auto *RC = *I;
+ if (RC->isAllocatable())
+ continue;
+ for (unsigned R : *RC) // Every member of a non-allocatable class is marked.
+ Reserved[R] = true;
+ }
+}
+// Collect the non-reserved registers, together with their subregister expansions, that are live on entry to block B.
+HexagonBlockRanges::RegisterSet HexagonBlockRanges::getLiveIns(
+ const MachineBasicBlock &B, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
+ RegisterSet LiveIns;
+ RegisterSet Tmp; // Live-ins before filtering and expansion.
+ for (auto I : B.liveins()) {
+ if (I.LaneMask.all()) { // All lanes live: take the register as a whole.
+ Tmp.insert({I.PhysReg,0});
+ continue;
+ }
+ for (MCSubRegIndexIterator S(I.PhysReg, &TRI); S.isValid(); ++S) { // Otherwise take only subregisters with a live lane.
+ LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
+ if ((M & I.LaneMask).any())
+ Tmp.insert({S.getSubReg(), 0});
+ }
+ }
+ // Drop reserved registers; add each register itself and every register of its subregister expansion.
+ for (auto R : Tmp) {
+ if (!Reserved[R.Reg])
+ LiveIns.insert(R);
+ for (auto S : expandToSubRegs(R, MRI, TRI))
+ if (!Reserved[S.Reg])
+ LiveIns.insert(S);
+ }
+ return LiveIns;
+}
+// Expand R into its subregisters: {SubReg,0} entries for a physical register, {R.Reg,SubIdx} entries for a virtual one.
+HexagonBlockRanges::RegisterSet HexagonBlockRanges::expandToSubRegs(
+ RegisterRef R, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
+ RegisterSet SRs;
+ // A reference that already names a subregister is returned unchanged.
+ if (R.Sub != 0) {
+ SRs.insert(R);
+ return SRs;
+ }
+ // A register without subregisters expands to itself in both branches below.
+ if (TargetRegisterInfo::isPhysicalRegister(R.Reg)) {
+ MCSubRegIterator I(R.Reg, &TRI);
+ if (!I.isValid())
+ SRs.insert({R.Reg, 0});
+ for (; I.isValid(); ++I)
+ SRs.insert({*I, 0});
+ } else {
+ assert(TargetRegisterInfo::isVirtualRegister(R.Reg));
+ auto &RC = *MRI.getRegClass(R.Reg);
+ unsigned PReg = *RC.begin(); // First register of the class, used only to enumerate subregister indices.
+ MCSubRegIndexIterator I(PReg, &TRI);
+ if (!I.isValid())
+ SRs.insert({R.Reg, 0});
+ for (; I.isValid(); ++I)
+ SRs.insert({R.Reg, I.getSubRegIndex()});
+ }
+ return SRs;
+}
+// Fill LiveMap with per-register live ranges for the block of IndexMap, from its live-ins, operands and successors' live-ins.
+void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
+ RegToRangeMap &LiveMap) {
+ std::map<RegisterRef,IndexType> LastDef, LastUse; // Most recent def/use index of each register still "open".
+ RegisterSet LiveOnEntry;
+ MachineBasicBlock &B = IndexMap.getBlock();
+ MachineRegisterInfo &MRI = B.getParent()->getRegInfo();
+
+ for (auto R : getLiveIns(B, MRI, TRI))
+ LiveOnEntry.insert(R);
+ // Live-in registers are treated as defined at block entry.
+ for (auto R : LiveOnEntry)
+ LastDef[R] = IndexType::Entry;
+ // closeRange(R): record [LastDef:LastUse] for R (missing def -> Entry, missing use -> Exit), then reset both markers.
+ auto closeRange = [&LastUse,&LastDef,&LiveMap] (RegisterRef R) -> void {
+ auto LD = LastDef[R], LU = LastUse[R];
+ if (LD == IndexType::None)
+ LD = IndexType::Entry;
+ if (LU == IndexType::None)
+ LU = IndexType::Exit;
+ LiveMap[R].add(LD, LU, false, false);
+ LastUse[R] = LastDef[R] = IndexType::None;
+ };
+
+ for (auto &In : B) {
+ if (In.isDebugValue())
+ continue;
+ IndexType Index = IndexMap.getIndex(&In);
+ // Process uses first.
+ for (auto &Op : In.operands()) {
+ if (!Op.isReg() || !Op.isUse() || Op.isUndef())
+ continue;
+ RegisterRef R = { Op.getReg(), Op.getSubReg() };
+ if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
+ continue;
+ bool IsKill = Op.isKill();
+ for (auto S : expandToSubRegs(R, MRI, TRI)) {
+ LastUse[S] = Index;
+ if (IsKill) // A kill ends the current range at this instruction.
+ closeRange(S);
+ }
+ }
+ // Process defs.
+ for (auto &Op : In.operands()) {
+ if (!Op.isReg() || !Op.isDef() || Op.isUndef())
+ continue;
+ RegisterRef R = { Op.getReg(), Op.getSubReg() };
+ if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
+ continue;
+ for (auto S : expandToSubRegs(R, MRI, TRI)) {
+ if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None) // A new def closes any range still open.
+ closeRange(S);
+ LastDef[S] = Index;
+ }
+ }
+ }
+
+ // Collect live-on-exit.
+ RegisterSet LiveOnExit;
+ for (auto *SB : B.successors())
+ for (auto R : getLiveIns(*SB, MRI, TRI))
+ LiveOnExit.insert(R);
+ // Registers live into a successor are treated as used at block exit.
+ for (auto R : LiveOnExit)
+ LastUse[R] = IndexType::Exit;
+
+ // Process remaining registers.
+ RegisterSet Left;
+ for (auto &I : LastUse)
+ if (I.second != IndexType::None)
+ Left.insert(I.first);
+ for (auto &I : LastDef)
+ if (I.second != IndexType::None)
+ Left.insert(I.first);
+ for (auto R : Left)
+ closeRange(R);
+
+ // Finalize the live ranges.
+ for (auto &P : LiveMap)
+ P.second.unionize();
+}
+// Compute and return the live ranges for the block described by IndexMap, dumping the input and result in debug output.
+HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeLiveMap(
+ InstrIndexMap &IndexMap) {
+ RegToRangeMap LiveMap;
+ DEBUG(dbgs() << __func__ << ": index map\n" << IndexMap << '\n');
+ computeInitialLiveRanges(IndexMap, LiveMap);
+ DEBUG(dbgs() << __func__ << ": live map\n"
+ << PrintRangeMap(LiveMap, TRI) << '\n');
+ return LiveMap;
+}
+// Derive from LiveMap the ranges in which each register is not live: the gaps before, between and after its live ranges.
+HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeDeadMap(
+ InstrIndexMap &IndexMap, RegToRangeMap &LiveMap) {
+ RegToRangeMap DeadMap;
+ // addDeadRanges(R): add the gaps around R's live ranges to DeadMap[R]; with no live range, R is dead from Entry to Exit.
+ auto addDeadRanges = [&IndexMap,&LiveMap,&DeadMap] (RegisterRef R) -> void {
+ auto F = LiveMap.find(R);
+ if (F == LiveMap.end() || F->second.empty()) {
+ DeadMap[R].add(IndexType::Entry, IndexType::Exit, false, false);
+ return;
+ }
+
+ RangeList &RL = F->second;
+ RangeList::iterator A = RL.begin(), Z = RL.end()-1; // First and last live range (RL is non-empty here).
+
+ // Try to create the initial range.
+ if (A->start() != IndexType::Entry) {
+ IndexType DE = IndexMap.getPrevIndex(A->start());
+ if (DE != IndexType::Entry)
+ DeadMap[R].add(IndexType::Entry, DE, false, false);
+ }
+
+ while (A != Z) {
+ // Creating a dead range that follows A. Pay attention to empty
+ // ranges (i.e. those ending with "None").
+ IndexType AE = (A->end() == IndexType::None) ? A->start() : A->end();
+ IndexType DS = IndexMap.getNextIndex(AE);
+ ++A;
+ IndexType DE = IndexMap.getPrevIndex(A->start());
+ if (DS < DE) // Only a non-empty gap becomes a dead range.
+ DeadMap[R].add(DS, DE, false, false);
+ }
+
+ // Try to create the final range.
+ if (Z->end() != IndexType::Exit) {
+ IndexType ZE = (Z->end() == IndexType::None) ? Z->start() : Z->end();
+ IndexType DS = IndexMap.getNextIndex(ZE);
+ if (DS < IndexType::Exit)
+ DeadMap[R].add(DS, IndexType::Exit, false, false);
+ }
+ };
+ // Physical registers: visit each non-reserved register once, through its expansion into subregisters.
+ MachineFunction &MF = *IndexMap.getBlock().getParent();
+ auto &MRI = MF.getRegInfo();
+ unsigned NumRegs = TRI.getNumRegs();
+ BitVector Visited(NumRegs);
+ for (unsigned R = 1; R < NumRegs; ++R) {
+ for (auto S : expandToSubRegs({R,0}, MRI, TRI)) {
+ if (Reserved[S.Reg] || Visited[S.Reg])
+ continue;
+ addDeadRanges(S);
+ Visited[S.Reg] = true;
+ }
+ }
+ for (auto &P : LiveMap) // Virtual registers are taken from the keys of LiveMap.
+ if (TargetRegisterInfo::isVirtualRegister(P.first.Reg))
+ addDeadRanges(P.first);
+
+ DEBUG(dbgs() << __func__ << ": dead map\n"
+ << PrintRangeMap(DeadMap, TRI) << '\n');
+ return DeadMap;
+}
+// Print Idx: '-' for None, 'n' for Entry, 'x' for Exit, otherwise the 1-based position (index minus First plus 1).
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ HexagonBlockRanges::IndexType Idx) {
+ if (Idx == HexagonBlockRanges::IndexType::None)
+ return OS << '-';
+ if (Idx == HexagonBlockRanges::IndexType::Entry)
+ return OS << 'n';
+ if (Idx == HexagonBlockRanges::IndexType::Exit)
+ return OS << 'x';
+ return OS << unsigned(Idx)-HexagonBlockRanges::IndexType::First+1;
+}
+
+// Print IR as "[start:end]", closing with '}' instead of ']' when the end is tied, followed by '!' if the range is fixed.
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const HexagonBlockRanges::IndexRange &IR) {
+ OS << '[' << IR.start() << ':' << IR.end() << (IR.TiedEnd ? '}' : ']');
+ if (IR.Fixed)
+ OS << '!';
+ return OS;
+}
+// Print every range of RL in list order, each followed by a single space.
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const HexagonBlockRanges::RangeList &RL) {
+ for (auto &R : RL)
+ OS << R << " ";
+ return OS;
+}
+// Print each instruction of the block preceded by its index; the index equal to M.Last is followed by ". ".
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const HexagonBlockRanges::InstrIndexMap &M) {
+ for (auto &In : M.Block) {
+ HexagonBlockRanges::IndexType Idx = M.getIndex(&In); // None (printed as '-') for instructions not in the map.
+ OS << Idx << (Idx == M.Last ? ". " : " ") << In;
+ }
+ return OS;
+}
+// Print one "register -> ranges" line for every entry of the wrapped map.
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const HexagonBlockRanges::PrintRangeMap &P) {
+ for (auto &I : P.Map) {
+ const HexagonBlockRanges::RangeList &RL = I.second;
+ OS << PrintReg(I.first.Reg, &P.TRI, I.first.Sub) << " -> " << RL << "\n";
+ }
+ return OS;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
new file mode 100644
index 000000000000..717480314d16
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
@@ -0,0 +1,244 @@
+//===--- HexagonBlockRanges.h -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef HEXAGON_BLOCK_RANGES_H
+#define HEXAGON_BLOCK_RANGES_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <cassert>
+#include <map>
+#include <set>
+#include <vector>
+#include <utility>
+
+namespace llvm {
+
+class HexagonSubtarget;
+class MachineBasicBlock;
+class MachineFunction;
+class MachineInstr;
+class raw_ostream;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+
+struct HexagonBlockRanges {
+ HexagonBlockRanges(MachineFunction &MF);
+
+ struct RegisterRef {
+ unsigned Reg, Sub;
+ bool operator<(RegisterRef R) const {
+ return Reg < R.Reg || (Reg == R.Reg && Sub < R.Sub);
+ }
+ };
+ typedef std::set<RegisterRef> RegisterSet;
+
+ // This is to represent an "index", which is an abstraction of a position
+ // of an instruction within a basic block.
+ class IndexType {
+ public:
+ enum : unsigned {
+ None = 0,
+ Entry = 1,
+ Exit = 2,
+ First = 11 // 10th + 1st
+ };
+
+ IndexType() : Index(None) {}
+ IndexType(unsigned Idx) : Index(Idx) {}
+
+ static bool isInstr(IndexType X) { return X.Index >= First; }
+
+ operator unsigned() const;
+ bool operator== (unsigned x) const;
+ bool operator== (IndexType Idx) const;
+ bool operator!= (unsigned x) const;
+ bool operator!= (IndexType Idx) const;
+ IndexType operator++ ();
+ bool operator< (unsigned Idx) const;
+ bool operator< (IndexType Idx) const;
+ bool operator<= (IndexType Idx) const;
+
+ private:
+ bool operator> (IndexType Idx) const;
+ bool operator>= (IndexType Idx) const;
+
+ unsigned Index;
+ };
+
+ // A range of indices, essentially a representation of a live range.
+ // This is also used to represent "dead ranges", i.e. ranges where a
+ // register is dead.
+ class IndexRange : public std::pair<IndexType,IndexType> {
+ public:
+ IndexRange() = default;
+ IndexRange(IndexType Start, IndexType End, bool F = false, bool T = false)
+ : std::pair<IndexType,IndexType>(Start, End), Fixed(F), TiedEnd(T) {}
+
+ IndexType start() const { return first; }
+ IndexType end() const { return second; }
+
+ bool operator< (const IndexRange &A) const {
+ return start() < A.start();
+ }
+
+ bool overlaps(const IndexRange &A) const;
+ bool contains(const IndexRange &A) const;
+ void merge(const IndexRange &A);
+
+ bool Fixed = false; // Can be renamed? "Fixed" means "no".
+ bool TiedEnd = false; // The end is not a use, but a dead def tied to a use.
+
+ private:
+ void setStart(const IndexType &S) { first = S; }
+ void setEnd(const IndexType &E) { second = E; }
+ };
+
+ // A list of index ranges. This represents liveness of a register
+ // in a basic block.
+ class RangeList : public std::vector<IndexRange> {
+ public:
+ void add(IndexType Start, IndexType End, bool Fixed, bool TiedEnd) {
+ push_back(IndexRange(Start, End, Fixed, TiedEnd));
+ }
+ void add(const IndexRange &Range) {
+ push_back(Range);
+ }
+
+ void include(const RangeList &RL);
+ void unionize(bool MergeAdjacent = false);
+ void subtract(const IndexRange &Range);
+
+ private:
+ void addsub(const IndexRange &A, const IndexRange &B);
+ };
+
+ class InstrIndexMap {
+ public:
+ InstrIndexMap(MachineBasicBlock &B);
+
+ MachineInstr *getInstr(IndexType Idx) const;
+ IndexType getIndex(MachineInstr *MI) const;
+ MachineBasicBlock &getBlock() const { return Block; }
+ IndexType getPrevIndex(IndexType Idx) const;
+ IndexType getNextIndex(IndexType Idx) const;
+ void replaceInstr(MachineInstr *OldMI, MachineInstr *NewMI);
+
+ friend raw_ostream &operator<< (raw_ostream &OS, const InstrIndexMap &Map);
+
+ IndexType First, Last;
+
+ private:
+ MachineBasicBlock &Block;
+ std::map<IndexType,MachineInstr*> Map;
+ };
+
+ typedef std::map<RegisterRef,RangeList> RegToRangeMap;
+ RegToRangeMap computeLiveMap(InstrIndexMap &IndexMap);
+ RegToRangeMap computeDeadMap(InstrIndexMap &IndexMap, RegToRangeMap &LiveMap);
+ static RegisterSet expandToSubRegs(RegisterRef R,
+ const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI);
+
+ struct PrintRangeMap {
+ PrintRangeMap(const RegToRangeMap &M, const TargetRegisterInfo &I)
+ : Map(M), TRI(I) {}
+
+ friend raw_ostream &operator<< (raw_ostream &OS, const PrintRangeMap &P);
+
+ private:
+ const RegToRangeMap &Map;
+ const TargetRegisterInfo &TRI;
+ };
+
+private:
+ RegisterSet getLiveIns(const MachineBasicBlock &B,
+ const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI);
+
+ void computeInitialLiveRanges(InstrIndexMap &IndexMap,
+ RegToRangeMap &LiveMap);
+
+ MachineFunction &MF;
+ const HexagonSubtarget &HST;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ BitVector Reserved;
+};
+
+inline HexagonBlockRanges::IndexType::operator unsigned() const {
+ assert(Index >= First);
+ return Index;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator== (unsigned x) const {
+ return Index == x;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator== (IndexType Idx) const {
+ return Index == Idx.Index;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator!= (unsigned x) const {
+ return Index != x;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator!= (IndexType Idx) const {
+ return Index != Idx.Index;
+}
+
+inline
+HexagonBlockRanges::IndexType HexagonBlockRanges::IndexType::operator++ () {
+  // Neither None nor Exit may be incremented.
+  assert(Index != None);
+  assert(Index != Exit);
+  // Incrementing Entry jumps to the first instruction slot; any other index
+  // simply advances by one.
+  Index = (Index == Entry) ? unsigned(First) : Index + 1;
+  return *this;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator< (unsigned Idx) const {
+ return operator< (IndexType(Idx));
+}
+
+inline bool HexagonBlockRanges::IndexType::operator< (IndexType Idx) const {
+  const unsigned Lhs = Index;
+  const unsigned Rhs = Idx.Index;
+
+  // Irreflexive, and None never compares less-than in either direction.
+  if (Lhs == Rhs || Lhs == None || Rhs == None)
+    return false;
+  // Nothing follows Exit and nothing precedes Entry.
+  if (Lhs == Exit || Rhs == Entry)
+    return false;
+  // Entry precedes everything else; everything else precedes Exit.
+  if (Lhs == Entry || Rhs == Exit)
+    return true;
+
+  // Both sides are plain positions: compare numerically.
+  return Lhs < Rhs;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator<= (IndexType Idx) const {
+ return operator==(Idx) || operator<(Idx);
+}
+
+raw_ostream &operator<< (raw_ostream &OS, HexagonBlockRanges::IndexType Idx);
+raw_ostream &operator<< (raw_ostream &OS,
+ const HexagonBlockRanges::IndexRange &IR);
+raw_ostream &operator<< (raw_ostream &OS,
+ const HexagonBlockRanges::RangeList &RL);
+raw_ostream &operator<< (raw_ostream &OS,
+ const HexagonBlockRanges::InstrIndexMap &M);
+raw_ostream &operator<< (raw_ostream &OS,
+ const HexagonBlockRanges::PrintRangeMap &P);
+
+} // end namespace llvm
+
+#endif // HEXAGON_BLOCK_RANGES_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
new file mode 100644
index 000000000000..84af4b14b9f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
@@ -0,0 +1,219 @@
+//===--- HexagonBranchRelaxation.cpp - Identify and relax long jumps ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-brelax"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+
+using namespace llvm;
+
+// Since we have no exact knowledge of code layout, allow some safety buffer
+// for jump target. This is measured in bytes.
+static cl::opt<uint32_t> BranchRelaxSafetyBuffer("branch-relax-safety-buffer",
+ cl::init(200), cl::Hidden, cl::ZeroOrMore, cl::desc("safety buffer size"));
+
+namespace llvm {
+
+ FunctionPass *createHexagonBranchRelaxation();
+ void initializeHexagonBranchRelaxationPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+ struct HexagonBranchRelaxation : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ HexagonBranchRelaxation() : MachineFunctionPass(ID) {
+ initializeHexagonBranchRelaxationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "Hexagon Branch Relaxation";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
+
+ bool relaxBranches(MachineFunction &MF);
+ void computeOffset(MachineFunction &MF,
+ DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset);
+ bool reGenerateBranch(MachineFunction &MF,
+ DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset);
+ bool isJumpOutOfRange(MachineInstr &MI,
+ DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset);
+ };
+
+ char HexagonBranchRelaxation::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(HexagonBranchRelaxation, "hexagon-brelax",
+ "Hexagon Branch Relaxation", false, false)
+
+FunctionPass *llvm::createHexagonBranchRelaxation() {
+ return new HexagonBranchRelaxation();
+}
+
+bool HexagonBranchRelaxation::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "****** Hexagon Branch Relaxation ******\n");
+
+  // Cache the subtarget's instruction and register info for the helpers.
+  const HexagonSubtarget &Subtarget = MF.getSubtarget<HexagonSubtarget>();
+  HII = Subtarget.getInstrInfo();
+  HRI = Subtarget.getRegisterInfo();
+
+  // The pass result is whatever relaxBranches reports.
+  return relaxBranches(MF);
+}
+
+void HexagonBranchRelaxation::computeOffset(MachineFunction &MF,
+      DenseMap<MachineBasicBlock*, unsigned> &OffsetMap) {
+  // Running byte offset from the start of the function.
+  unsigned Offset = 0;
+  for (MachineBasicBlock &MBB : MF) {
+    // The final layout is not known here, so alignment padding is only
+    // approximated: round the running offset up to the block's alignment.
+    // The alignment value is used as a shift amount.
+    if (unsigned AlignShift = MBB.getAlignment()) {
+      const unsigned Mask = (1u << AlignShift) - 1;
+      Offset = (Offset + Mask) & ~Mask;
+    }
+    // Record where this block starts, then step over its instructions.
+    OffsetMap[&MBB] = Offset;
+    for (MachineInstr &MI : MBB.instrs())
+      Offset += HII->getSize(MI);
+  }
+}
+
+/// relaxBranches - For Hexagon, if the jump target/loop label is too far from
+/// the jump/loop instruction then, we need to make sure that we have constant
+/// extenders set for jumps and loops.
+
+/// Block offsets are estimated first; out-of-range branches are then marked as constant-extended.
+bool HexagonBranchRelaxation::relaxBranches(MachineFunction &MF) {
+  // Step 1: estimate the starting byte offset of every basic block.
+  DenseMap<MachineBasicBlock*, unsigned> BlockStart;
+  computeOffset(MF, BlockStart);
+
+  // Step 2: revisit the branches using those offsets.
+  const bool Changed = reGenerateBranch(MF, BlockStart);
+  return Changed;
+}
+
+/// Check if a given instruction is:
+/// - a jump to a distant target
+/// - that exceeds its immediate range
+/// If both conditions are true, it requires constant extension.
+bool HexagonBranchRelaxation::isJumpOutOfRange(MachineInstr &MI,
+      DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  auto Term1 = MBB.getFirstInstrTerminator();
+  // No terminator in the block: nothing to check.
+  if (Term1 == MBB.instr_end())
+    return false;
+
+  // To save time, the branch is assumed to sit at the end of its block:
+  // block start plus (non-debug instruction count * typical instruction size).
+  const unsigned BranchPos =
+      BlockToInstOffset[&MBB] + HII->nonDbgBBSize(&MBB) * HEXAGON_INSTR_SIZE;
+
+  // Absolute distance from the estimated branch position to the start of
+  // Target, padded by the safety buffer.
+  auto PaddedDistanceTo = [&](MachineBasicBlock *Target) -> unsigned {
+    return std::abs((long long)BranchPos - BlockToInstOffset[Target])
+           + BranchRelaxSafetyBuffer;
+  };
+
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+
+  // If the branch cannot be analyzed, still recognize a new-value jump and
+  // take its target from the extendable operand.
+  const bool Unanalyzable = HII->analyzeBranch(MBB, TBB, FBB, Cond, false);
+  if (Unanalyzable && HII->isNewValueJump(*Term1))
+    TBB = Term1->getOperand(HII->getCExtOpNum(*Term1)).getMBB();
+
+  // MI is the first terminator: measure against the taken target.
+  if (TBB && &MI == &*Term1)
+    return !HII->isJumpWithinBranchRange(*Term1, PaddedDistanceTo(TBB));
+
+  if (!FBB)
+    return false;
+
+  // Otherwise MI can only be the second terminator, measured against FBB.
+  auto Term2 = std::next(Term1);
+  assert(Term2 != MBB.instr_end() &&
+         (Term2->isBranch() || Term2->isCall()) &&
+         "Bad second terminator");
+  if (&MI != &*Term2)
+    return false;
+  return !HII->isJumpWithinBranchRange(*Term2, PaddedDistanceTo(FBB));
+}
+
+bool HexagonBranchRelaxation::reGenerateBranch(MachineFunction &MF,
+      DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset) {
+  bool AnyMarked = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      // Only branches whose estimated distance is out of range matter.
+      if (!MI.isBranch())
+        continue;
+      if (!isJumpOutOfRange(MI, BlockToInstOffset))
+        continue;
+      DEBUG(dbgs() << "Long distance jump. isExtendable("
+                   << HII->isExtendable(MI) << ") isConstExtended("
+                   << HII->isConstExtended(MI) << ") " << MI);
+
+      // HW loop relaxation has not been merged into this code (yet), so a
+      // branch that is neither extendable nor extended is only reported.
+      if (!(HII->isExtendable(MI) || HII->isExtended(MI))) {
+        DEBUG(dbgs() << "\tUnderimplemented relax branch instruction.\n");
+        continue;
+      }
+
+      // Locate the extendable operand. So far every branch is assumed to
+      // have a basic-block address there; revisit if that changes.
+      int ExtIdx = HII->getCExtOpNum(MI);
+      MachineOperand &Target = MI.getOperand(ExtIdx);
+      assert(Target.isMBB() && "Branch with unknown expandable field type");
+      // Flag the operand as constant-extended.
+      Target.addTargetFlag(HexagonII::HMOTF_ConstExtended);
+      AnyMarked = true;
+    }
+  }
+  return AnyMarked;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
new file mode 100644
index 000000000000..2f8fe6e087f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -0,0 +1,253 @@
+//===-- HexagonCFGOptimizer.cpp - CFG optimizations -----------------------===//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon_cfg"
+
+namespace llvm {
+ FunctionPass *createHexagonCFGOptimizer();
+ void initializeHexagonCFGOptimizerPass(PassRegistry&);
+}
+
+
+namespace {
+
+class HexagonCFGOptimizer : public MachineFunctionPass {
+
+private:
+ void InvertAndChangeJumpTarget(MachineInstr &, MachineBasicBlock *);
+
+public:
+ static char ID;
+ HexagonCFGOptimizer() : MachineFunctionPass(ID) {
+ initializeHexagonCFGOptimizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "Hexagon CFG Optimizer"; }
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+};
+
+
+char HexagonCFGOptimizer::ID = 0;
+
+// Returns true for the predicated J2 jump opcodes listed below.
+static bool IsConditionalBranch(int Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case Hexagon::J2_jumpt:
+  case Hexagon::J2_jumptpt:
+  case Hexagon::J2_jumpf:
+  case Hexagon::J2_jumpfpt:
+  case Hexagon::J2_jumptnew:
+  case Hexagon::J2_jumpfnew:
+  case Hexagon::J2_jumptnewpt:
+  case Hexagon::J2_jumpfnewpt:
+    return true;
+  }
+}
+
+
+static bool IsUnconditionalJump(int Opc) {
+ return (Opc == Hexagon::J2_jump);
+}
+
+void HexagonCFGOptimizer::InvertAndChangeJumpTarget(
+    MachineInstr &MI, MachineBasicBlock *NewTarget) {
+  // Pick the opcode with the opposite predicate sense. Only the four
+  // opcodes below are supported; anything else is a programming error.
+  int Inverted = 0;
+  switch (MI.getOpcode()) {
+  case Hexagon::J2_jumpt:      Inverted = Hexagon::J2_jumpf;      break;
+  case Hexagon::J2_jumpf:      Inverted = Hexagon::J2_jumpt;      break;
+  case Hexagon::J2_jumptnewpt: Inverted = Hexagon::J2_jumpfnewpt; break;
+  case Hexagon::J2_jumpfnewpt: Inverted = Hexagon::J2_jumptnewpt; break;
+  default:
+    llvm_unreachable("Cannot handle this case");
+  }
+
+  // Swap in the inverted opcode and point operand 1 (the branch target)
+  // at the new destination block.
+  const TargetInstrInfo *InstrInfo =
+      MI.getParent()->getParent()->getSubtarget().getInstrInfo();
+  MI.setDesc(InstrInfo->get(Inverted));
+  MI.getOperand(1).setMBB(NewTarget);
+}
+
+
+/// Fold a "conditional jump over an unconditional jump" into one inverted
+/// conditional jump.
+///
+/// \returns true only if at least one block was rewritten.
+bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  bool Changed = false;
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock *MBB = &*MBBb;
+
+    // Only blocks whose first terminator is a conditional branch qualify.
+    MachineBasicBlock::iterator MII = MBB->getFirstTerminator();
+    if (MII == MBB->end())
+      continue;
+    MachineInstr &MI = *MII;
+    if (!IsConditionalBranch(MI.getOpcode()))
+      continue;
+
+    //
+    // (Case 1) Transform the code if the following condition occurs:
+    //   BB1: if (p0) jump BB3
+    //   ...falls-through to BB2 ...
+    //   BB2: jump BB4
+    //   ...next block in layout is BB3...
+    //   BB3: ...
+    //
+    //  Transform this to:
+    //  BB1: if (!p0) jump BB4
+    //  Remove BB2
+    //  BB3: ...
+    //
+    // (Case 2) A variation occurs when BB3 contains a JMP to BB4:
+    //   BB1: if (p0) jump BB3
+    //   ...falls-through to BB2 ...
+    //   BB2: jump BB4
+    //   ...other basic blocks ...
+    //   BB4:
+    //   ...not a fall-thru
+    //   BB3: ...
+    //     jump BB4
+    //
+    // Transform this to:
+    //   BB1: if (!p0) jump BB4
+    //   Remove BB2
+    //   BB3: ...
+    //   BB4: ...
+    //
+
+    // The pattern needs exactly two successors. Check this before touching
+    // the successor list: advancing and dereferencing the iterator on a
+    // block with fewer than two successors would read past its end.
+    if (MBB->succ_size() != 2)
+      continue;
+
+    MachineBasicBlock::succ_iterator SI = MBB->succ_begin();
+    MachineBasicBlock* FirstSucc = *SI;
+    MachineBasicBlock* SecondSucc = *(++SI);
+    MachineBasicBlock* LayoutSucc = nullptr;
+    MachineBasicBlock* JumpAroundTarget = nullptr;
+
+    if (MBB->isLayoutSuccessor(FirstSucc)) {
+      LayoutSucc = FirstSucc;
+      JumpAroundTarget = SecondSucc;
+    } else if (MBB->isLayoutSuccessor(SecondSucc)) {
+      LayoutSucc = SecondSucc;
+      JumpAroundTarget = FirstSucc;
+    } else {
+      // Odd case...cannot handle.
+      continue;
+    }
+
+    // The target of the unconditional branch must be JumpAroundTarget.
+    // TODO: If not, we should not invert the unconditional branch.
+    // Only J2_jumpt/J2_jumpf yield a target here; every other conditional
+    // opcode leaves it null and is therefore skipped by the check below.
+    MachineBasicBlock* CondBranchTarget = nullptr;
+    if (MI.getOpcode() == Hexagon::J2_jumpt ||
+        MI.getOpcode() == Hexagon::J2_jumpf) {
+      CondBranchTarget = MI.getOperand(1).getMBB();
+    }
+    if (CondBranchTarget != JumpAroundTarget)
+      continue;
+
+    // BB2 must be reachable only from BB1 ...
+    if (LayoutSucc->pred_size() != 1)
+      continue;
+    // ... and consist of exactly one instruction, an unconditional jump.
+    if (LayoutSucc->size() != 1 ||
+        !IsUnconditionalJump(LayoutSucc->front().getOpcode()))
+      continue;
+
+    assert(JumpAroundTarget && "jump target is needed to process second basic block");
+    MachineBasicBlock* UncondTarget =
+        LayoutSucc->front().getOperand(0).getMBB();
+    // Check if the layout successor of BB2 is BB3.
+    bool case1 = LayoutSucc->isLayoutSuccessor(JumpAroundTarget);
+    bool case2 = JumpAroundTarget->isSuccessor(UncondTarget) &&
+                 JumpAroundTarget->size() >= 1 &&
+                 IsUnconditionalJump(JumpAroundTarget->back().getOpcode()) &&
+                 JumpAroundTarget->pred_size() == 1 &&
+                 JumpAroundTarget->succ_size() == 1;
+    if (!case1 && !case2)
+      continue;
+
+    InvertAndChangeJumpTarget(MI, UncondTarget);
+    MBB->replaceSuccessor(JumpAroundTarget, UncondTarget);
+
+    // Remove the unconditional branch in LayoutSucc.
+    LayoutSucc->erase(LayoutSucc->begin());
+    LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget);
+
+    // This code performs the conversion for case 2, which moves
+    // the block to the fall-thru case (BB3 in the code above).
+    if (case2 && !case1) {
+      JumpAroundTarget->moveAfter(LayoutSucc);
+      // only move a block if it doesn't have a fall-thru. otherwise
+      // the CFG will be incorrect.
+      if (!UncondTarget->canFallThrough()) {
+        UncondTarget->moveAfter(JumpAroundTarget);
+      }
+    }
+
+    //
+    // Correct live-in information. Is used by post-RA scheduler
+    // The live-in to LayoutSucc is now all values live-in to
+    // JumpAroundTarget.
+    //
+    // Both lists are copied first because the loops below modify
+    // LayoutSucc's live-in list.
+    std::vector<MachineBasicBlock::RegisterMaskPair> OrigLiveIn(
+        LayoutSucc->livein_begin(), LayoutSucc->livein_end());
+    std::vector<MachineBasicBlock::RegisterMaskPair> NewLiveIn(
+        JumpAroundTarget->livein_begin(),
+        JumpAroundTarget->livein_end());
+    for (const auto &OrigLI : OrigLiveIn)
+      LayoutSucc->removeLiveIn(OrigLI.PhysReg);
+    for (const auto &NewLI : NewLiveIn)
+      LayoutSucc->addLiveIn(NewLI);
+
+    Changed = true;
+  }
+  return Changed;
+}
+}
+
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+INITIALIZE_PASS(HexagonCFGOptimizer, "hexagon-cfg", "Hexagon CFG Optimizer",
+ false, false)
+
+FunctionPass *llvm::createHexagonCFGOptimizer() {
+ return new HexagonCFGOptimizer();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
new file mode 100644
index 000000000000..e61b2a7a58ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -0,0 +1,35 @@
+//===- HexagonCallingConv.td - Calling Conventions Hexagon -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Hexagon architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Hexagon 32-bit C return-value convention.
+def RetCC_Hexagon32 : CallingConv<[
+ CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+ CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>,
+
+ // Alternatively, they are assigned to the stack in 4-byte aligned units.
+ CCAssignToStack<4, 4>
+]>;
+
+// Hexagon 32-bit C Calling convention.
+def CC_Hexagon32 : CallingConv<[
+ // All arguments get passed in integer registers if there is space.
+ CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+ CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>,
+
+ // Alternatively, they are assigned to the stack in 4-byte aligned units.
+ CCAssignToStack<4, 4>
+]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
new file mode 100644
index 000000000000..489da6be923d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -0,0 +1,1304 @@
+//===--- HexagonCommonGEP.cpp ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "commgep"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<bool> OptSpeculate("commgep-speculate", cl::init(true),
+ cl::Hidden, cl::ZeroOrMore);
+
+static cl::opt<bool> OptEnableInv("commgep-inv", cl::init(true), cl::Hidden,
+ cl::ZeroOrMore);
+
+static cl::opt<bool> OptEnableConst("commgep-const", cl::init(true),
+ cl::Hidden, cl::ZeroOrMore);
+
+namespace llvm {
+
+ void initializeHexagonCommonGEPPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+ struct GepNode;
+ typedef std::set<GepNode*> NodeSet;
+ typedef std::map<GepNode*,Value*> NodeToValueMap;
+ typedef std::vector<GepNode*> NodeVect;
+ typedef std::map<GepNode*,NodeVect> NodeChildrenMap;
+ typedef std::set<Use*> UseSet;
+ typedef std::map<GepNode*,UseSet> NodeToUsesMap;
+
+ // Numbering map for gep nodes. Used to keep track of ordering for
+ // gep nodes.
+ struct NodeOrdering {
+ NodeOrdering() = default;
+
+ void insert(const GepNode *N) { Map.insert(std::make_pair(N, ++LastNum)); }
+ void clear() { Map.clear(); }
+
+ bool operator()(const GepNode *N1, const GepNode *N2) const {
+ auto F1 = Map.find(N1), F2 = Map.find(N2);
+ assert(F1 != Map.end() && F2 != Map.end());
+ return F1->second < F2->second;
+ }
+
+ private:
+ std::map<const GepNode *, unsigned> Map;
+ unsigned LastNum = 0;
+ };
+
+ class HexagonCommonGEP : public FunctionPass {
+ public:
+ static char ID;
+
+ HexagonCommonGEP() : FunctionPass(ID) {
+ initializeHexagonCommonGEPPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+ StringRef getPassName() const override { return "Hexagon Common GEP"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ typedef std::map<Value*,GepNode*> ValueToNodeMap;
+ typedef std::vector<Value*> ValueVect;
+ typedef std::map<GepNode*,ValueVect> NodeToValuesMap;
+
+ void getBlockTraversalOrder(BasicBlock *Root, ValueVect &Order);
+ bool isHandledGepForm(GetElementPtrInst *GepI);
+ void processGepInst(GetElementPtrInst *GepI, ValueToNodeMap &NM);
+ void collect();
+ void common();
+
+ BasicBlock *recalculatePlacement(GepNode *Node, NodeChildrenMap &NCM,
+ NodeToValueMap &Loc);
+ BasicBlock *recalculatePlacementRec(GepNode *Node, NodeChildrenMap &NCM,
+ NodeToValueMap &Loc);
+ bool isInvariantIn(Value *Val, Loop *L);
+ bool isInvariantIn(GepNode *Node, Loop *L);
+ bool isInMainPath(BasicBlock *B, Loop *L);
+ BasicBlock *adjustForInvariance(GepNode *Node, NodeChildrenMap &NCM,
+ NodeToValueMap &Loc);
+ void separateChainForNode(GepNode *Node, Use *U, NodeToValueMap &Loc);
+ void separateConstantChains(GepNode *Node, NodeChildrenMap &NCM,
+ NodeToValueMap &Loc);
+ void computeNodePlacement(NodeToValueMap &Loc);
+
+ Value *fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
+ BasicBlock *LocB);
+ void getAllUsersForNode(GepNode *Node, ValueVect &Values,
+ NodeChildrenMap &NCM);
+ void materialize(NodeToValueMap &Loc);
+
+ void removeDeadCode();
+
+ NodeVect Nodes;
+ NodeToUsesMap Uses;
+ NodeOrdering NodeOrder; // Node ordering, for deterministic behavior.
+ SpecificBumpPtrAllocator<GepNode> *Mem;
+ LLVMContext *Ctx;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ PostDominatorTree *PDT;
+ Function *Fn;
+ };
+
+} // end anonymous namespace
+
+char HexagonCommonGEP::ID = 0;
+INITIALIZE_PASS_BEGIN(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP",
+ false, false)
+
+namespace {
+
+ struct GepNode {
+ enum {
+ None = 0,
+ Root = 0x01,
+ Internal = 0x02,
+ Used = 0x04
+ };
+
+ uint32_t Flags;
+ union {
+ GepNode *Parent;
+ Value *BaseVal;
+ };
+ Value *Idx;
+ Type *PTy; // Type of the pointer operand.
+
+ GepNode() : Flags(0), Parent(nullptr), Idx(nullptr), PTy(nullptr) {}
+ GepNode(const GepNode *N) : Flags(N->Flags), Idx(N->Idx), PTy(N->PTy) {
+ if (Flags & Root)
+ BaseVal = N->BaseVal;
+ else
+ Parent = N->Parent;
+ }
+
+ friend raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN);
+ };
+
+  // Given the type being indexed and the index value, return the type that
+  // the next index of a GEP applies to.
+  Type *next_type(Type *Ty, Value *Idx) {
+    // A pointer steps to its pointee.
+    if (auto *PtrTy = dyn_cast<PointerType>(Ty))
+      return PtrTy->getElementType();
+
+    // A struct steps to the field selected by the (constant) index.
+    if (Ty->isStructTy()) {
+      ConstantInt *FieldIdx = dyn_cast<ConstantInt>(Idx);
+      assert(FieldIdx && "Struct type with non-constant index");
+      int64_t FieldNo = FieldIdx->getValue().getSExtValue();
+      return cast<StructType>(Ty)->getElementType(FieldNo);
+    }
+
+    // Anything else must be a sequential type: step to its element type.
+    return cast<SequentialType>(Ty)->getElementType();
+  }
+
+  // Debug printer for a GEP node:
+  //   { {flags} BaseVal:...|Parent:... Idx:... PTy:... }
+  raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN) {
+    OS << "{ {";
+    // Print the set flags in fixed order, comma-separated.
+    const char *Sep = "";
+    auto PrintFlag = [&](uint32_t Bit, const char *Name) {
+      if (!(GN.Flags & Bit))
+        return;
+      OS << Sep << Name;
+      Sep = ",";
+    };
+    PrintFlag(GepNode::Root, "root");
+    PrintFlag(GepNode::Internal, "internal");
+    PrintFlag(GepNode::Used, "used");
+    OS << "} ";
+
+    // The union member in use depends on the Root flag.
+    const bool IsRoot = (GN.Flags & GepNode::Root) != 0;
+    if (IsRoot)
+      OS << "BaseVal:" << GN.BaseVal->getName() << '(' << GN.BaseVal << ')';
+    else
+      OS << "Parent:" << GN.Parent;
+
+    // Index: constant value, else name, else the whole value.
+    OS << " Idx:";
+    if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(GN.Idx)) {
+      OS << ConstIdx->getValue().getSExtValue();
+    } else if (GN.Idx->hasName()) {
+      OS << GN.Idx->getName();
+    } else {
+      OS << "<anon> =" << *GN.Idx;
+    }
+
+    // Pointer operand type: named structs print by name only.
+    OS << " PTy:";
+    if (!GN.PTy->isStructTy()) {
+      OS << *GN.PTy;
+    } else {
+      StructType *StructTy = cast<StructType>(GN.PTy);
+      if (StructTy->isLiteral())
+        OS << "<anon-struct>:" << *StructTy;
+      else
+        OS << GN.PTy->getStructName();
+    }
+    OS << " }";
+    return OS;
+  }
+
+  // Print every node of a container of GepNode pointers, one per line, as
+  // "<pointer> <node contents>".
+  template <typename NodeContainer>
+  void dump_node_container(raw_ostream &OS, const NodeContainer &S) {
+    for (const auto &NodePtr : S)
+      OS << NodePtr << ' ' << *NodePtr << '\n';
+  }
+
+ raw_ostream &operator<< (raw_ostream &OS,
+ const NodeVect &S) LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<< (raw_ostream &OS, const NodeVect &S) {
+ dump_node_container(OS, S); // One line per node: its address, then its contents.
+ return OS;
+ }
+
+  raw_ostream &operator<< (raw_ostream &OS,
+                           const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED;
+  // Print, for each node in M, the number of recorded uses followed by the
+  // users themselves (by name when they have one, in full otherwise).
+  raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){
+    for (const auto &Entry : M) {
+      const UseSet &NodeUses = Entry.second;
+      OS << Entry.first << " -> #" << NodeUses.size() << '{';
+      for (Use *U : NodeUses) {
+        User *Usr = U->getUser();
+        if (Usr->hasName())
+          OS << ' ' << Usr->getName();
+        else
+          OS << " <?>(" << *Usr << ')';
+      }
+      OS << " }\n";
+    }
+    return OS;
+  }
+
+  // Predicate object: true for nodes that are members of the given set.
+  // Only a reference to the set is kept.
+  struct in_set {
+    in_set(const NodeSet &S) : Members(S) {}
+    bool operator() (GepNode *N) const {
+      return Members.count(N) != 0;
+    }
+
+  private:
+    const NodeSet &Members;
+  };
+
+} // end anonymous namespace
+
+inline void *operator new(size_t, SpecificBumpPtrAllocator<GepNode> &A) {
+ return A.Allocate(); // Storage for "new (*Mem) GepNode" comes from the allocator.
+}
+
+void HexagonCommonGEP::getBlockTraversalOrder(BasicBlock *Root,
+      ValueVect &Order) {
+  // Pre-order walk of the dominator tree below Root: a block is appended
+  // to Order before any of the blocks it immediately dominates, so every
+  // block comes after all of its dominators.
+  Order.push_back(Root);
+
+  typedef GraphTraits<DomTreeNode*> Traits;
+  DomTreeNode *RootNode = DT->getNode(Root);
+  auto Child = Traits::child_begin(RootNode);
+  const auto ChildEnd = Traits::child_end(RootNode);
+  while (Child != ChildEnd) {
+    getBlockTraversalOrder((*Child)->getBlock(), Order);
+    ++Child;
+  }
+}
+
+bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
+  // Handled GEPs produce a plain pointer (i.e. no vector GEPs) and have at
+  // least one index operand.
+  return GepI->getType()->isPointerTy() &&
+         GepI->idx_begin() != GepI->idx_end();
+}
+
+void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
+      ValueToNodeMap &NM) {
+  DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n');
+  Value *Base = GepI->getPointerOperand();
+
+  // The head node corresponds to the first index. If the pointer operand
+  // already has a node in NM (it was a GEP processed earlier, and the entry
+  // is the last node of its chain), link to that node. Otherwise the head
+  // becomes a root with the pointer operand as its base value.
+  GepNode *Head = new (*Mem) GepNode;
+  ValueToNodeMap::iterator Known = NM.find(Base);
+  if (Known != NM.end()) {
+    Head->Parent = Known->second;
+  } else {
+    Head->BaseVal = Base;
+    Head->Flags |= GepNode::Root;
+  }
+  Head->PTy = Base->getType();
+  Head->Idx = *GepI->idx_begin();
+
+  // Gather the uses of this GEP, leaving out uses by GEPs that will be
+  // processed themselves. The uses are attached to the last node of the
+  // chain once it exists.
+  UseSet ExternalUses;
+  for (Use &U : GepI->uses()) {
+    auto *UserGep = dyn_cast<GetElementPtrInst>(U.getUser());
+    if (UserGep && isHandledGepForm(UserGep))
+      continue;
+    ExternalUses.insert(&U);
+  }
+  Nodes.push_back(Head);
+  NodeOrder.insert(Head);
+
+  // Create one node for each remaining index. The first index (handled
+  // above) dereferences the pointer operand, so the type walk starts at
+  // the pointee type.
+  GepNode *Tail = Head;
+  Type *CurTy = cast<PointerType>(Base->getType())->getElementType();
+  for (User::op_iterator OI = std::next(GepI->idx_begin()),
+                         OE = GepI->idx_end();
+       OI != OE; ++OI) {
+    Value *Index = *OI;
+    GepNode *Link = new (*Mem) GepNode;
+    Link->Parent = Tail;  // Chain the new node to the previous one.
+    Link->Flags |= GepNode::Internal;
+    Link->PTy = CurTy;
+    Link->Idx = Index;
+    Nodes.push_back(Link);
+    NodeOrder.insert(Link);
+    Tail = Link;
+
+    CurTy = next_type(CurTy, Index);
+  }
+
+  // The last node of the chain carries the use information.
+  if (!ExternalUses.empty()) {
+    Tail->Flags |= GepNode::Used;
+    Uses[Tail].insert(ExternalUses.begin(), ExternalUses.end());
+  }
+
+  // Remember the last node for this GEP, so that GEPs using it as their
+  // pointer operand can link to it.
+  NM.insert(std::make_pair(GepI, Tail));
+}
+
+void HexagonCommonGEP::collect() {
+  // Visit the blocks in a dominator-tree order starting at the entry block.
+  ValueVect BlockOrder;
+  getBlockTraversalOrder(&Fn->front(), BlockOrder);
+
+  // Gep nodes must be created in that order: when a GEP uses another GEP
+  // as its base pointer, the node for the base should already exist.
+  ValueToNodeMap NM;
+  for (Value *V : BlockOrder) {
+    BasicBlock *B = cast<BasicBlock>(V);
+    for (Instruction &I : *B) {
+      auto *GepI = dyn_cast<GetElementPtrInst>(&I);
+      if (GepI && isHandledGepForm(GepI))
+        processGepInst(GepI, NM);
+    }
+  }
+
+  DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes);
+}
+
+// Split Nodes into root nodes (appended to Roots) and non-root nodes, each
+// of which is appended to the child list of its parent in NCM.
+static void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM,
+                              NodeVect &Roots) {
+  for (GepNode *N : Nodes) {
+    if (N->Flags & GepNode::Root)
+      Roots.push_back(N);
+    else
+      NCM[N->Parent].push_back(N);
+  }
+}
+
+// Collect into Nodes the node Root together with every node reachable from
+// it through the child lists in NCM.
+static void nodes_for_root(GepNode *Root, NodeChildrenMap &NCM,
+                           NodeSet &Nodes) {
+  NodeVect Work;
+  Work.push_back(Root);
+  Nodes.insert(Root);
+
+  // Walk the worklist with a cursor instead of erasing its first element:
+  // erasing from the front shifts the remaining elements on every step,
+  // which makes the traversal quadratic. Indices stay valid when the
+  // worklist grows, and the resulting set does not depend on the order.
+  for (size_t Next = 0; Next != Work.size(); ++Next) {
+    NodeChildrenMap::iterator CF = NCM.find(Work[Next]);
+    if (CF == NCM.end())
+      continue;
+    Work.insert(Work.end(), CF->second.begin(), CF->second.end());
+    Nodes.insert(CF->second.begin(), CF->second.end());
+  }
+}
+
+namespace {
+
+ typedef std::set<NodeSet> NodeSymRel;
+ typedef std::pair<GepNode*,GepNode*> NodePair;
+ typedef std::set<NodePair> NodePairSet;
+
+} // end anonymous namespace
+
+// Return the class of Rel that contains N, or nullptr if no class does.
+static const NodeSet *node_class(GepNode *N, NodeSymRel &Rel) {
+  for (const NodeSet &Class : Rel)
+    if (Class.count(N))
+      return &Class;
+  return nullptr;
+}
+
+// Build a pair of GepNode pointers in a canonical order (lower address
+// first). The pairs are used when determining equality; ordering them only
+// serves to avoid storing both (A,B) and (B,A), since equality and
+// non-equality are commutative.
+static NodePair node_pair(GepNode *N1, GepNode *N2) {
+  const bool InOrder = uintptr_t(N1) <= uintptr_t(N2);
+  return InOrder ? std::make_pair(N1, N2) : std::make_pair(N2, N1);
+}
+
+static unsigned node_hash(GepNode *N) {
+ // Hash only the index and the type, i.e. everything except flags and parent.
+ FoldingSetNodeID ID;
+ ID.AddPointer(N->Idx);
+ ID.AddPointer(N->PTy);
+ return ID.ComputeHash();
+}
+
+// Decide whether two gep nodes are equal: they must have the same hash
+// (index and type), and either both be roots with the same base value, or
+// both be non-roots with equal parents. Eq and Ne cache the pairs already
+// found equal and not equal, respectively.
+static bool node_eq(GepNode *N1, GepNode *N2, NodePairSet &Eq,
+                    NodePairSet &Ne) {
+  // Don't cache the result for nodes with different hashes. The hash
+  // comparison is fast enough.
+  if (node_hash(N1) != node_hash(N2))
+    return false;
+
+  // The canonical pair is computed once and used both for the cache lookups
+  // and for recording the verdict.
+  NodePair NP = node_pair(N1, N2);
+  if (Eq.count(NP))
+    return true;
+  if (Ne.count(NP))
+    return false;
+
+  // Not previously compared.
+  bool Root1 = N1->Flags & GepNode::Root;
+  bool Root2 = N2->Flags & GepNode::Root;
+  // If the Root flag has different values, the nodes are different.
+  // If both nodes are root nodes, but their base pointers differ,
+  // they are different.
+  if (Root1 != Root2 || (Root1 && N1->BaseVal != N2->BaseVal)) {
+    Ne.insert(NP);
+    return false;
+  }
+  // Here the root flags are identical, and for root nodes the
+  // base pointers are equal, so the root nodes are equal.
+  // For non-root nodes, compare their parent nodes.
+  if (Root1 || node_eq(N1->Parent, N2->Parent, Eq, Ne)) {
+    Eq.insert(NP);
+    return true;
+  }
+  // The parents differ. Record that too, so that a repeated query for this
+  // pair does not have to walk the parent chain again.
+  Ne.insert(NP);
+  return false;
+}
+
+void HexagonCommonGEP::common() {
+ // The essence of this commoning is finding gep nodes that are equal.
+ // To do this we need to compare all pairs of nodes. To save time,
+ // first, partition the set of all nodes into sets of potentially equal
+ // nodes, and then compare pairs from within each partition.
+ typedef std::map<unsigned,NodeSet> NodeSetMap;
+ NodeSetMap MaybeEq;
+
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+ GepNode *N = *I;
+ unsigned H = node_hash(N);
+ MaybeEq[H].insert(N); // Nodes with different hashes are never equal (see node_eq).
+ }
+
+ // Compute the equivalence relation for the gep nodes. Use two caches,
+ // one for equality and the other for non-equality.
+ NodeSymRel EqRel; // Equality relation (as set of equivalence classes).
+ NodePairSet Eq, Ne; // Caches.
+ for (NodeSetMap::iterator I = MaybeEq.begin(), E = MaybeEq.end();
+ I != E; ++I) {
+ NodeSet &S = I->second;
+ for (NodeSet::iterator NI = S.begin(), NE = S.end(); NI != NE; ++NI) {
+ GepNode *N = *NI;
+ // If node already has a class, then the class must have been created
+ // in a prior iteration of this loop. Since equality is transitive,
+ // nothing more will be added to that class, so skip it.
+ if (node_class(N, EqRel))
+ continue;
+
+ // Create a new class candidate now.
+ NodeSet C;
+ for (NodeSet::iterator NJ = std::next(NI); NJ != NE; ++NJ)
+ if (node_eq(N, *NJ, Eq, Ne))
+ C.insert(*NJ);
+ // If C is empty, N would be the only element in its class. Don't bother
+ // creating a class for it then.
+ if (!C.empty()) {
+ C.insert(N); // Finalize the set before adding it to the relation.
+ std::pair<NodeSymRel::iterator, bool> Ins = EqRel.insert(C);
+ (void)Ins;
+ assert(Ins.second && "Cannot add a class");
+ }
+ }
+ }
+
+ DEBUG({
+ dbgs() << "Gep node equality:\n";
+ for (NodePairSet::iterator I = Eq.begin(), E = Eq.end(); I != E; ++I)
+ dbgs() << "{ " << I->first << ", " << I->second << " }\n";
+
+ dbgs() << "Gep equivalence classes:\n";
+ for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+ dbgs() << '{';
+ const NodeSet &S = *I;
+ for (NodeSet::const_iterator J = S.begin(), F = S.end(); J != F; ++J) {
+ if (J != S.begin())
+ dbgs() << ',';
+ dbgs() << ' ' << *J;
+ }
+ dbgs() << " }\n";
+ }
+ });
+
+ // Create a projection from a NodeSet to the minimal element in it.
+ typedef std::map<const NodeSet*,GepNode*> ProjMap;
+ ProjMap PM;
+ for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+ const NodeSet &S = *I;
+ GepNode *Min = *std::min_element(S.begin(), S.end(), NodeOrder); // Minimal with respect to NodeOrder.
+ std::pair<ProjMap::iterator,bool> Ins = PM.insert(std::make_pair(&S, Min));
+ (void)Ins;
+ assert(Ins.second && "Cannot add minimal element");
+
+ // Update the min element's flags, and user list.
+ uint32_t Flags = 0;
+ UseSet &MinUs = Uses[Min];
+ for (NodeSet::iterator J = S.begin(), F = S.end(); J != F; ++J) {
+ GepNode *N = *J;
+ uint32_t NF = N->Flags;
+ // If N is used, append all original values of N to the list of
+ // original values of Min.
+ if (NF & GepNode::Used)
+ MinUs.insert(Uses[N].begin(), Uses[N].end());
+ Flags |= NF;
+ }
+ if (MinUs.empty())
+ Uses.erase(Min); // Uses[Min] above may have created an empty entry.
+
+ // The collected flags should include all the flags from the min element.
+ assert((Min->Flags & Flags) == Min->Flags);
+ Min->Flags = Flags;
+ }
+
+ // Commoning: for each non-root gep node, replace "Parent" with the
+ // selected (minimum) node from the corresponding equivalence class.
+ // If a given parent does not have an equivalence class, leave it
+ // unchanged (it means that it's the only element in its class).
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+ GepNode *N = *I;
+ if (N->Flags & GepNode::Root)
+ continue;
+ const NodeSet *PC = node_class(N->Parent, EqRel);
+ if (!PC)
+ continue;
+ ProjMap::iterator F = PM.find(PC);
+ if (F == PM.end())
+ continue;
+ // Found a replacement, use it.
+ GepNode *Rep = F->second;
+ N->Parent = Rep;
+ }
+
+ DEBUG(dbgs() << "Gep nodes after commoning:\n" << Nodes);
+
+ // Finally, erase the nodes that are no longer used.
+ NodeSet Erase;
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+ GepNode *N = *I;
+ const NodeSet *PC = node_class(N, EqRel);
+ if (!PC)
+ continue;
+ ProjMap::iterator F = PM.find(PC);
+ if (F == PM.end())
+ continue;
+ if (N == F->second)
+ continue; // The class representative stays.
+ // Node for removal.
+ Erase.insert(*I);
+ }
+ NodeVect::iterator NewE = remove_if(Nodes, in_set(Erase));
+ Nodes.resize(std::distance(Nodes.begin(), NewE)); // Drop the tail left behind by remove_if.
+
+ DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes);
+}
+
+// Fold DominatorTree::findNearestCommonDominator over all blocks in Blocks.
+// Returns nullptr when Blocks is empty, when it contains a null entry that
+// is reached during the fold, or when no common dominator is found.
+template <typename T>
+static BasicBlock *nearest_common_dominator(DominatorTree *DT, T &Blocks) {
+  DEBUG({
+    dbgs() << "NCD of {";
+    for (auto *V : Blocks) {
+      if (V)
+        dbgs() << ' ' << cast<BasicBlock>(V)->getName();
+    }
+    dbgs() << " }\n";
+  });
+
+  // Null basic blocks are allowed in Blocks; they make the result null.
+  auto It = Blocks.begin();
+  const auto End = Blocks.end();
+  if (It == End || !*It)
+    return nullptr;
+
+  BasicBlock *Dom = cast<BasicBlock>(*It);
+  for (++It; It != End; ++It) {
+    BasicBlock *B = cast_or_null<BasicBlock>(*It);
+    if (!B)
+      return nullptr;
+    Dom = DT->findNearestCommonDominator(Dom, B);
+    if (!Dom)
+      return nullptr;
+  }
+  DEBUG(dbgs() << "computed:" << Dom->getName() << '\n');
+  return Dom;
+}
+
+// Find the block among the non-null entries of Blocks that is dominated by
+// all the others. This relies on the fact that if two blocks, A and B,
+// dominate a block C, then A dominates B, or B dominates A. Returns nullptr
+// when two entries are unrelated by dominance, and the root of the
+// dominator tree when Blocks has no non-null entry.
+template <typename T>
+static BasicBlock *nearest_common_dominatee(DominatorTree *DT, T &Blocks) {
+  BasicBlock *Lowest = nullptr;
+  for (auto *V : Blocks) {
+    if (!V)
+      continue;
+    BasicBlock *B = cast<BasicBlock>(V);
+    // The first non-null block is the initial candidate.
+    if (!Lowest) {
+      Lowest = B;
+      continue;
+    }
+    if (DT->dominates(B, Lowest))
+      continue;
+    if (!DT->dominates(Lowest, B))
+      return nullptr;
+    Lowest = B;
+  }
+  return Lowest ? Lowest : DT->getRoot();
+}
+
+// Return the position of the earliest instruction in B that is one of the
+// values in Values, or B->end() if there is none.
+template <typename T>
+static BasicBlock::iterator first_use_of_in_block(T &Values, BasicBlock *B) {
+  const BasicBlock::iterator BEnd = B->end();
+  BasicBlock::iterator Earliest = BEnd;
+  for (auto *V : Values) {
+    // Only non-PHI instructions located in B are of interest. A use in a
+    // PHI node belongs to the incoming block, where it would be considered
+    // as being at the end, so it cannot influence the position of the first
+    // use (which is assumed to be at the end to start with).
+    Instruction *In = dyn_cast<Instruction>(V);
+    if (!In || isa<PHINode>(In) || In->getParent() != B)
+      continue;
+    // The candidate that is farther from the end of the block is earlier.
+    BasicBlock::iterator Pos = In->getIterator();
+    if (std::distance(Earliest, BEnd) < std::distance(Pos, BEnd))
+      Earliest = Pos;
+  }
+  return Earliest;
+}
+
+// A block counts as empty when it has no instructions at all, or when its
+// first instruction is its terminator.
+static bool is_empty(const BasicBlock *B) {
+  if (B->empty())
+    return true;
+  return &B->front() == B->getTerminator();
+}
+
+BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node,
+      NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+  DEBUG(dbgs() << "Loc for node:" << Node << '\n');
+  // Compute the placement for Node on the assumption that the locations of
+  // its children recorded in Loc are valid. The result is nullptr when no
+  // valid placement exists, e.g. when the index value is not available at
+  // the location that would dominate all users and children.
+
+  // Blocks that the placement has to dominate: the blocks of all users (if
+  // the node is used), and the locations of all children.
+  ValueVect Targets;
+  if (Node->Flags & GepNode::Used) {
+    NodeToUsesMap::iterator UF = Uses.find(Node);
+    assert(UF != Uses.end() && "Used node with no use information");
+    for (Use *U : UF->second) {
+      User *R = U->getUser();
+      if (!isa<Instruction>(R))
+        continue;
+      // A use in a PHI node is attributed to the incoming block.
+      BasicBlock *UseB;
+      if (PHINode *Phi = dyn_cast<PHINode>(R))
+        UseB = Phi->getIncomingBlock(*U);
+      else
+        UseB = cast<Instruction>(R)->getParent();
+      Targets.push_back(UseB);
+    }
+  }
+  NodeChildrenMap::iterator CF = NCM.find(Node);
+  if (CF != NCM.end()) {
+    for (GepNode *Child : CF->second) {
+      // A child that is only used in GEP instructions (i.e. not in non-GEP
+      // instructions) may have had a null dominator computed for it, in
+      // which case it has no location available. Skip such children.
+      NodeToValueMap::iterator LF = Loc.find(Child);
+      if (LF != Loc.end())
+        Targets.push_back(LF->second);
+    }
+  }
+
+  BasicBlock *DomB = nearest_common_dominator(DT, Targets);
+  if (!DomB)
+    return nullptr;
+  // The block defining the index must dominate the chosen block.
+  Instruction *IdxI = dyn_cast<Instruction>(Node->Idx);
+  if (IdxI && !DT->dominates(IdxI->getParent(), DomB))
+    return nullptr;
+
+  // Avoid putting nodes into empty blocks: climb to the immediate dominator
+  // for as long as the block is empty and has one.
+  while (is_empty(DomB)) {
+    DomTreeNode *IDom = (*DT)[DomB]->getIDom();
+    if (!IDom)
+      break;
+    DomB = IDom->getBlock();
+  }
+
+  // DomB is fine: record it in the location map.
+  Loc[Node] = DomB;
+  return DomB;
+}
+
+BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node,
+      NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+  DEBUG(dbgs() << "LocRec begin for node:" << Node << '\n');
+  // Children first: the placement of Node depends on the locations that
+  // were computed for them.
+  NodeChildrenMap::iterator CF = NCM.find(Node);
+  if (CF != NCM.end())
+    for (GepNode *Child : CF->second)
+      recalculatePlacementRec(Child, NCM, Loc);
+
+  BasicBlock *Placement = recalculatePlacement(Node, NCM, Loc);
+  DEBUG(dbgs() << "LocRec end for node:" << Node << '\n');
+  return Placement;
+}
+
+bool HexagonCommonGEP::isInvariantIn(Value *Val, Loop *L) {
+  // Constants and arguments are always treated as invariant.
+  if (isa<Constant>(Val) || isa<Argument>(Val))
+    return true;
+  // An instruction is invariant if its block properly dominates the loop
+  // header. Any other kind of value is not considered invariant.
+  if (Instruction *In = dyn_cast<Instruction>(Val))
+    return DT->properlyDominates(In->getParent(), L->getHeader());
+  return false;
+}
+
+bool HexagonCommonGEP::isInvariantIn(GepNode *Node, Loop *L) {
+  // For a root node the base value has to be invariant as well; the index
+  // has to be invariant for every node.
+  const bool IsRoot = Node->Flags & GepNode::Root;
+  if (IsRoot && !isInvariantIn(Node->BaseVal, L))
+    return false;
+  return isInvariantIn(Node->Idx, L);
+}
+
+bool HexagonCommonGEP::isInMainPath(BasicBlock *B, Loop *L) {
+  // B is in the main path if it post-dominates the loop header, or if the
+  // loop has a latch and B dominates it.
+  if (PDT->dominates(B, L->getHeader()))
+    return true;
+  BasicBlock *Latch = L->getLoopLatch();
+  return Latch && DT->dominates(B, Latch);
+}
+
+// Return the preheader of L if it has one. Otherwise, when OptSpeculate is
+// set, return the block of the immediate dominator of the loop header;
+// in all remaining cases return nullptr.
+static BasicBlock *preheader(DominatorTree *DT, Loop *L) {
+  BasicBlock *PH = L->getLoopPreheader();
+  // Done if there is a real preheader, or if no substitute is allowed (PH
+  // is null in the latter case).
+  if (PH || !OptSpeculate)
+    return PH;
+  DomTreeNode *HeaderNode = DT->getNode(L->getHeader());
+  return HeaderNode ? HeaderNode->getIDom()->getBlock() : nullptr;
+}
+
+BasicBlock *HexagonCommonGEP::adjustForInvariance(GepNode *Node,
+      NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+  // Determine the "topmost" location for Node: it must be dominated both
+  // by its parent (or by the base value, if it is a root node) and by the
+  // index value.
+  ValueVect Defs;
+  if (!(Node->Flags & GepNode::Root)) {
+    Defs.push_back(Loc[Node->Parent]);
+  } else if (Instruction *BaseI = dyn_cast<Instruction>(Node->BaseVal)) {
+    Defs.push_back(BaseI->getParent());
+  }
+  if (Instruction *IdxI = dyn_cast<Instruction>(Node->Idx))
+    Defs.push_back(IdxI->getParent());
+  BasicBlock *TopB = nearest_common_dominatee(DT, Defs);
+
+  // Walk up the loop nest from the current location. Stop at the first
+  // loop in which Node is not invariant, when the upper limit of Node's
+  // placement (TopB) would be exceeded, or when no suitable "preheader" is
+  // found for the loop. The "preheader" may actually be a regular block
+  // outside of the loop (i.e. not guarded), in which case Node will be
+  // speculated. Nodes that are not in the main path of the containing loop
+  // (i.e. are not executed in each iteration) are not moved out of it.
+  BasicBlock *LocB = cast_or_null<BasicBlock>(Loc[Node]);
+  if (LocB) {
+    for (Loop *Lp = LI->getLoopFor(LocB); Lp; Lp = Lp->getParentLoop()) {
+      if (!isInvariantIn(Node, Lp) || !isInMainPath(LocB, Lp))
+        break;
+      BasicBlock *NewLoc = preheader(DT, Lp);
+      if (!NewLoc || !DT->dominates(TopB, NewLoc))
+        break;
+      LocB = NewLoc;
+    }
+  }
+  Loc[Node] = LocB;
+
+  // Continue recursively with all children of Node.
+  NodeChildrenMap::iterator CF = NCM.find(Node);
+  if (CF != NCM.end())
+    for (GepNode *Child : CF->second)
+      adjustForInvariance(Child, NCM, Loc);
+  return LocB;
+}
+
+namespace {
+
+  // Wrapper around a node-to-location map, used to print the locations as
+  // basic blocks. It only keeps a reference to the map.
+  struct LocationAsBlock {
+    LocationAsBlock(const NodeToValueMap &L) : Map(L) {}
+
+    const NodeToValueMap &Map;
+  };
+
+  raw_ostream &operator<< (raw_ostream &OS,
+                           const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED ;
+  // Print one "node -> block-name(block-address)" line for each map entry.
+  // A location may be null (the placement code stores the result of
+  // cast_or_null into the map, and operator[] creates null entries), so
+  // such entries are printed as "<null-block>" instead of being passed to
+  // cast<>, which would assert.
+  raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) {
+    for (NodeToValueMap::const_iterator I = Loc.Map.begin(), E = Loc.Map.end();
+         I != E; ++I) {
+      OS << I->first << " -> ";
+      if (BasicBlock *B = cast_or_null<BasicBlock>(I->second))
+        OS << B->getName() << '(' << B << ')';
+      else
+        OS << "<null-block>";
+      OS << '\n';
+    }
+    return OS;
+  }
+
+  // True if the index of node N is a constant integer.
+  inline bool is_constant(GepNode *N) {
+    return isa<ConstantInt>(N->Idx);
+  }
+
+} // end anonymous namespace
+
+void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
+      NodeToValueMap &Loc) {
+  User *R = U->getUser();
+  DEBUG(dbgs() << "Separating chain for node (" << Node << ") user: "
+               << *R << '\n');
+  BasicBlock *UserB = cast<Instruction>(R)->getParent();
+
+  // Starting at Node and following the parent links, clone every node for
+  // as long as the nodes have constant indices and are not roots. The
+  // clones are placed in the block of the user and are linked to each other
+  // the same way the originals are.
+  // XXX if (single-use) dont-replicate;
+  GepNode *NewNode = nullptr;    // The clone of Node itself.
+  GepNode *PrevClone = nullptr;  // The clone created in the previous step.
+  for (GepNode *N = Node;
+       is_constant(N) && !(N->Flags & GepNode::Root); N = N->Parent) {
+    GepNode *Clone = new (*Mem) GepNode(N);
+    Nodes.push_back(Clone);
+    Loc[Clone] = UserB;
+    Clone->Flags &= ~GepNode::Used;
+
+    if (N == Node)
+      NewNode = Clone;
+    if (PrevClone)
+      PrevClone->Parent = Clone;
+    PrevClone = Clone;
+  }
+  // Nothing was cloned: there is no chain to separate.
+  if (!NewNode)
+    return;
+
+  // Transfer from Node to NewNode all uses that have the same user as U.
+  NodeToUsesMap::iterator UF = Uses.find(Node);
+  assert(UF != Uses.end());
+  UseSet &OldUses = UF->second;
+  UseSet MovedUses;
+  for (UseSet::iterator I = OldUses.begin(); I != OldUses.end(); ) {
+    // Advance first, since the current element may get erased.
+    UseSet::iterator Cur = I++;
+    if ((*Cur)->getUser() == R) {
+      MovedUses.insert(*Cur);
+      OldUses.erase(Cur);
+    }
+  }
+  if (OldUses.empty()) {
+    Node->Flags &= ~GepNode::Used;
+    Uses.erase(UF);
+  }
+
+  // MovedUses should contain at least U.
+  NewNode->Flags |= GepNode::Used;
+  DEBUG(dbgs() << "new node: " << NewNode << "  " << *NewNode << '\n');
+  assert(!MovedUses.empty());
+  Uses[NewNode] = MovedUses;
+}
+
+void HexagonCommonGEP::separateConstantChains(GepNode *Node,
+      NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+  // First approximation: extract all chains.
+  NodeSet Tree;
+  nodes_for_root(Node, NCM, Tree);
+
+  DEBUG(dbgs() << "Separating constant chains for node: " << Node << '\n');
+  // Find the used nodes that have loads or stores among their users, with
+  // the node providing the address: the GEP node could be folded into such
+  // a load/store instruction.
+  NodeToUsesMap Foldable;
+  for (GepNode *N : Tree) {
+    if (!(N->Flags & GepNode::Used))
+      continue;
+    NodeToUsesMap::iterator UF = Uses.find(N);
+    assert(UF != Uses.end());
+
+    // Uses of N as the pointer operand of a load or a store.
+    UseSet AddrUses;
+    for (Use *U : UF->second) {
+      User *R = U->getUser();
+      // Only uses that provide the address are of interest. The value may
+      // also reach a memory instruction via a GEP, but such cases are not
+      // handled here for now.
+      const Use *PtrUse = nullptr;
+      if (LoadInst *Ld = dyn_cast<LoadInst>(R))
+        PtrUse = &Ld->getOperandUse(LoadInst::getPointerOperandIndex());
+      else if (StoreInst *St = dyn_cast<StoreInst>(R))
+        PtrUse = &St->getOperandUse(StoreInst::getPointerOperandIndex());
+      if (PtrUse == U)
+        AddrUses.insert(U);
+    }
+    // Separating the chain may be beneficial even with a single use, since
+    // the constant chain may be longer than the GEP alone would be (e.g. if
+    // the parent node has a constant index and also has other children).
+    if (!AddrUses.empty())
+      Foldable.insert(std::make_pair(N, AddrUses));
+  }
+
+  DEBUG(dbgs() << "Nodes with foldable users:\n" << Foldable);
+
+  for (auto &Entry : Foldable)
+    for (Use *U : Entry.second)
+      separateChainForNode(Entry.first, U, Loc);
+}
+
+void HexagonCommonGEP::computeNodePlacement(NodeToValueMap &Loc) {
+  // Invert the Node.Parent links into child lists, and collect the roots.
+  NodeChildrenMap NCM;
+  NodeVect Roots;
+  invert_find_roots(Nodes, NCM, Roots);
+
+  // Initial placement, determined by the locations of the users and of the
+  // child nodes.
+  for (GepNode *Root : Roots)
+    recalculatePlacementRec(Root, NCM, Loc);
+
+  DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc));
+
+  // Optional step: adjust the locations of loop-invariant nodes.
+  if (OptEnableInv) {
+    for (GepNode *Root : Roots)
+      adjustForInvariance(Root, NCM, Loc);
+
+    DEBUG(dbgs() << "Node placement after adjustment for invariance:\n"
+                 << LocationAsBlock(Loc));
+  }
+  // Optional step: separate chains of constant-index nodes.
+  if (OptEnableConst) {
+    for (GepNode *Root : Roots)
+      separateConstantChains(Root, NCM, Loc);
+  }
+  DEBUG(dbgs() << "Node use information:\n" << Uses);
+
+  // At the moment, there is no further refinement of the initial placement.
+  // Such a refinement could include splitting the nodes if they are placed
+  // too far from some of its users.
+
+  DEBUG(dbgs() << "Final node placement:\n" << LocationAsBlock(Loc));
+}
+
+Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
+ BasicBlock *LocB) { // Builds GEP instruction(s) for the node chain NA, passing At to the creation call; returns the last one created.
+ DEBUG(dbgs() << "Fabricating GEP in " << LocB->getName()
+ << " for nodes:\n" << NA);
+ unsigned Num = NA.size();
+ GepNode *RN = NA[0];
+ assert((RN->Flags & GepNode::Root) && "Creating GEP for non-root");
+
+ Value *NewInst = nullptr;
+ Value *Input = RN->BaseVal;
+ Value **IdxList = new Value*[Num+1]; // Room for Num node indices plus one artificial leading zero.
+ unsigned nax = 0; // Number of nodes of NA consumed so far.
+ do {
+ unsigned IdxC = 0; // Number of indices collected for the GEP being built.
+ // If the type of the input of the first node is not a pointer,
+ // we need to add an artificial i32 0 to the indices (because the
+ // actual input in the IR will be a pointer).
+ if (!NA[nax]->PTy->isPointerTy()) {
+ Type *Int32Ty = Type::getInt32Ty(*Ctx);
+ IdxList[IdxC++] = ConstantInt::get(Int32Ty, 0);
+ }
+
+ // Keep adding indices from NA until we have to stop and generate
+ // an "intermediate" GEP.
+ while (++nax <= Num) {
+ GepNode *N = NA[nax-1];
+ IdxList[IdxC++] = N->Idx;
+ if (nax < Num) {
+ // We have to stop, if the expected type of the output of this node
+ // is not the same as the input type of the next node.
+ Type *NextTy = next_type(N->PTy, N->Idx);
+ if (NextTy != NA[nax]->PTy)
+ break;
+ }
+ }
+ ArrayRef<Value*> A(IdxList, IdxC);
+ Type *InpTy = Input->getType();
+ Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType();
+ NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At);
+ DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
+ Input = NewInst; // The next (intermediate) GEP continues from this one.
+ } while (nax <= Num); // nax exceeds Num once the inner loop has consumed every node.
+
+ delete[] IdxList;
+ return NewInst;
+}
+
+// Append to Values the users recorded in Uses for Node and for every node
+// reachable from it through the child lists in NCM.
+void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values,
+      NodeChildrenMap &NCM) {
+  NodeVect Work;
+  Work.push_back(Node);
+
+  // Walk the worklist with a cursor instead of erasing its first element:
+  // erasing from the front shifts the remaining elements on every step,
+  // which makes the traversal quadratic. The nodes are still visited in
+  // the order in which they were added, so the order of Values is the same.
+  for (size_t Next = 0; Next != Work.size(); ++Next) {
+    GepNode *N = Work[Next];
+    if (N->Flags & GepNode::Used) {
+      NodeToUsesMap::iterator UF = Uses.find(N);
+      assert(UF != Uses.end() && "No use information for used node");
+      UseSet &Us = UF->second;
+      for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I)
+        Values.push_back((*I)->getUser());
+    }
+    NodeChildrenMap::iterator CF = NCM.find(N);
+    if (CF != NCM.end()) {
+      NodeVect &Cs = CF->second;
+      Work.insert(Work.end(), Cs.begin(), Cs.end());
+    }
+  }
+}
+
+void HexagonCommonGEP::materialize(NodeToValueMap &Loc) {
+ DEBUG(dbgs() << "Nodes before materialization:\n" << Nodes << '\n');
+ NodeChildrenMap NCM;
+ NodeVect Roots;
+ // Compute the inversion again, since computing placement could alter
+ // "parent" relation between nodes.
+ invert_find_roots(Nodes, NCM, Roots);
+
+ while (!Roots.empty()) { // Roots is a worklist: children of generated chains are appended below.
+ NodeVect::iterator First = Roots.begin();
+ GepNode *Root = *First, *Last = *First;
+ Roots.erase(First);
+
+ NodeVect NA; // Nodes to assemble.
+ // Append to NA all child nodes up to (and including) the first child
+ // that:
+ // (1) has more than 1 child, or
+ // (2) is used, or
+ // (3) has a child located in a different block.
+ bool LastUsed = false;
+ unsigned LastCN = 0;
+ // The location may be null if the computation failed (it can legitimately
+ // happen for nodes created from dead GEPs).
+ Value *LocV = Loc[Last];
+ if (!LocV)
+ continue;
+ BasicBlock *LastB = cast<BasicBlock>(LocV);
+ do {
+ NA.push_back(Last);
+ LastUsed = (Last->Flags & GepNode::Used);
+ if (LastUsed)
+ break;
+ NodeChildrenMap::iterator CF = NCM.find(Last);
+ LastCN = (CF != NCM.end()) ? CF->second.size() : 0;
+ if (LastCN != 1)
+ break;
+ GepNode *Child = CF->second.front();
+ BasicBlock *ChildB = cast_or_null<BasicBlock>(Loc[Child]);
+ if (ChildB != nullptr && LastB != ChildB)
+ break;
+ Last = Child;
+ } while (true);
+
+ BasicBlock::iterator InsertAt = LastB->getTerminator()->getIterator(); // Default position: the terminator of LastB.
+ if (LastUsed || LastCN > 0) {
+ ValueVect Urs;
+ getAllUsersForNode(Root, Urs, NCM);
+ BasicBlock::iterator FirstUse = first_use_of_in_block(Urs, LastB);
+ if (FirstUse != LastB->end())
+ InsertAt = FirstUse; // A user was found in LastB: use its position instead.
+ }
+
+ // Generate a new instruction for NA.
+ Value *NewInst = fabricateGEP(NA, InsertAt, LastB);
+
+ // Convert all the children of Last node into roots, and append them
+ // to the Roots list.
+ if (LastCN > 0) {
+ NodeVect &Cs = NCM[Last];
+ for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) {
+ GepNode *CN = *I;
+ CN->Flags &= ~GepNode::Internal;
+ CN->Flags |= GepNode::Root;
+ CN->BaseVal = NewInst; // The new GEP becomes the base value of the child.
+ Roots.push_back(CN);
+ }
+ }
+
+ // Lastly, if the Last node was used, replace all uses with the new GEP.
+ // The uses reference the original GEP values.
+ if (LastUsed) {
+ NodeToUsesMap::iterator UF = Uses.find(Last);
+ assert(UF != Uses.end() && "No use information found");
+ UseSet &Us = UF->second;
+ for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) {
+ Use *U = *I;
+ U->set(NewInst);
+ }
+ }
+ }
+}
+
+void HexagonCommonGEP::removeDeadCode() {
+  // Breadth-first walk of the dominator tree, starting at the entry block.
+  ValueVect Blocks;
+  Blocks.push_back(&Fn->front());
+  for (unsigned Idx = 0; Idx < Blocks.size(); ++Idx) {
+    DomTreeNode *DTN = DT->getNode(cast<BasicBlock>(Blocks[Idx]));
+    typedef GraphTraits<DomTreeNode*> GTN;
+    for (auto C = GTN::child_begin(DTN), CE = GTN::child_end(DTN);
+         C != CE; ++C)
+      Blocks.push_back((*C)->getBlock());
+  }
+
+  // Process the blocks in the reverse of that order, and the instructions
+  // of each block from the last one to the first.
+  for (unsigned Left = Blocks.size(); Left != 0; --Left) {
+    BasicBlock *B = cast<BasicBlock>(Blocks[Left-1]);
+    BasicBlock::InstListType &IL = B->getInstList();
+    // The list of instructions is recorded before anything gets erased.
+    ValueVect Snapshot;
+    for (auto RI = IL.rbegin(), RE = IL.rend(); RI != RE; ++RI)
+      Snapshot.push_back(&*RI);
+    for (Value *V : Snapshot) {
+      Instruction *In = cast<Instruction>(V);
+      if (isInstructionTriviallyDead(In))
+        In->eraseFromParent();
+    }
+  }
+}
+
+bool HexagonCommonGEP::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  // For now, leave alone any function that uses C++ exception handling,
+  // i.e. one that contains an invoke or a landing pad.
+  for (BasicBlock &BB : F)
+    for (Instruction &I : BB)
+      if (isa<InvokeInst>(I) || isa<LandingPadInst>(I))
+        return false;
+
+  // Set up the per-function state.
+  Fn = &F;
+  Ctx = &F.getContext();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+  Nodes.clear();
+  Uses.clear();
+  NodeOrder.clear();
+
+  // All gep nodes created for this function come from this allocator.
+  SpecificBumpPtrAllocator<GepNode> NodeAlloc;
+  Mem = &NodeAlloc;
+
+  collect();
+  common();
+
+  NodeToValueMap Placement;
+  computeNodePlacement(Placement);
+  materialize(Placement);
+  removeDeadCode();
+
+#ifdef EXPENSIVE_CHECKS
+  // Run this only when expensive checks are enabled.
+  verifyFunction(F);
+#endif
+  return true;
+}
+
+namespace llvm {
+
+ FunctionPass *createHexagonCommonGEP() {
+ return new HexagonCommonGEP(); // Each call returns a newly allocated pass object.
+ }
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
new file mode 100644
index 000000000000..783b916e04b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -0,0 +1,3149 @@
+//===--- HexagonConstPropagation.cpp --------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hcp"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <map>
+#include <queue>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+ // Properties of a value that are tracked by the propagation.
+ // A property that is marked as present (i.e. bit is set) denotes that the
+ // value is known (proven) to have this property. Not all combinations
+ // of bits make sense, for example Zero and NonZero are mutually exclusive,
+ // but on the other hand, Zero implies Finite. In this case, whenever
+ // the Zero property is present, Finite should also be present.
+ class ConstantProperties {
+ public:
+ // Bit flags. The numeric flags occupy the low byte, the sign flags the
+ // next one; NumericProperties, SignProperties and Everything are masks
+ // built from the individual flags.
+ enum {
+ Unknown = 0x0000,
+ Zero = 0x0001,
+ NonZero = 0x0002,
+ Finite = 0x0004,
+ Infinity = 0x0008,
+ NaN = 0x0010,
+ SignedZero = 0x0020,
+ NumericProperties = (Zero|NonZero|Finite|Infinity|NaN|SignedZero),
+ PosOrZero = 0x0100,
+ NegOrZero = 0x0200,
+ SignProperties = (PosOrZero|NegOrZero),
+ Everything = (NumericProperties|SignProperties)
+ };
+
+ // For a given constant, deduce the set of trackable properties that this
+ // constant has. Returns a combination of the flags above (Unknown when
+ // nothing can be said about the constant).
+ static uint32_t deduce(const Constant *C);
+ };
+
+ // A register in the form in which it can appear in a MachineOperand:
+ // a register number together with a sub-register index (0 meaning "no
+ // sub-register" by default).
+ struct Register {
+   unsigned Reg;
+   unsigned SubReg;
+
+   explicit Register(unsigned R, unsigned SR = 0) : Reg(R), SubReg(SR) {}
+   // Take both components from a register operand.
+   explicit Register(const MachineOperand &MO)
+     : Reg(MO.getReg()), SubReg(MO.getSubReg()) {}
+
+   // Two registers are equal when both the register and the sub-register
+   // index match.
+   bool operator== (const Register &R) const {
+     return !(Reg != R.Reg || SubReg != R.SubReg);
+   }
+
+   // Print the register to the debug stream.
+   void print(const TargetRegisterInfo *TRI = nullptr) const {
+     dbgs() << PrintReg(Reg, TRI, SubReg);
+   }
+ };
+
+ // Lattice cell, based on that was described in the W-Z paper on constant
+ // propagation.
+ // Lattice cell will be allowed to hold multiple constant values. While
+ // multiple values would normally indicate "bottom", we can still derive
+ // some useful information from them. For example, comparison X > 0
+ // could be folded if all the values in the cell associated with X are
+ // positive.
+ class LatticeCell {
+ private:
+ // Cell states. A cell becomes Normal once a constant is stored in it
+ // (see add) or once it starts tracking properties (see setProperty).
+ enum { Normal, Top, Bottom };
+
+ // Maximum number of distinct constants that a cell can hold.
+ static const unsigned MaxCellSize = 4;
+
+ // Kind is one of Normal/Top/Bottom, Size is the number of valid entries
+ // in Values, IsSpecial is set when the cell tracks properties instead of
+ // values. The unnamed zero-width bit-field closes the bit-field group.
+ unsigned Kind:2;
+ unsigned Size:3;
+ unsigned IsSpecial:1;
+ unsigned :0;
+
+ public:
+ // Payload: Properties is meaningful when IsSpecial is set, otherwise the
+ // first Size entries of Values are. All members share the same storage.
+ union {
+ uint32_t Properties;
+ const Constant *Value;
+ const Constant *Values[MaxCellSize];
+ };
+
+ // A default-constructed cell is "top" with all value slots cleared.
+ LatticeCell() : Kind(Top), Size(0), IsSpecial(false) {
+ for (unsigned i = 0; i < MaxCellSize; ++i)
+ Values[i] = nullptr;
+ }
+
+ // Lattice operations; each returns true if this cell has changed.
+ bool meet(const LatticeCell &L);
+ bool add(const Constant *C);
+ bool add(uint32_t Property);
+ uint32_t properties() const;
+ unsigned size() const { return Size; }
+
+ // Copies only the part of the payload that is in use in L (the property
+ // word, or the first L.Size value pointers), then the state bits.
+ LatticeCell &operator= (const LatticeCell &L) {
+ if (this != &L) {
+ // This memcpy also copies Properties (when L.Size == 0).
+ uint32_t N = L.IsSpecial ? sizeof L.Properties
+ : L.Size*sizeof(const Constant*);
+ memcpy(Values, L.Values, N);
+ Kind = L.Kind;
+ Size = L.Size;
+ IsSpecial = L.IsSpecial;
+ }
+ return *this;
+ }
+
+ bool isSingle() const { return size() == 1; }
+ bool isProperty() const { return IsSpecial; }
+ bool isTop() const { return Kind == Top; }
+ bool isBottom() const { return Kind == Bottom; }
+
+ // Force the cell to "bottom". Returns true if it was not bottom before.
+ bool setBottom() {
+ bool Changed = (Kind != Bottom);
+ Kind = Bottom;
+ Size = 0;
+ IsSpecial = false;
+ return Changed;
+ }
+
+ void print(raw_ostream &os) const;
+
+ private:
+ // Mark the cell as property-tracking. The caller is responsible for
+ // storing the property bits in Properties.
+ void setProperty() {
+ IsSpecial = true;
+ Size = 0;
+ Kind = Normal;
+ }
+
+ bool convertToProperty();
+ };
+
+ // Stream a lattice cell: forwards to LatticeCell::print and returns the
+ // stream so that insertions can be chained.
+ raw_ostream &operator<< (raw_ostream &os, const LatticeCell &L) {
+ L.print(os);
+ return os;
+ }
+
+ class MachineConstEvaluator;
+
+ // Driver of the constant propagation. It owns the lattice cells, the sets
+ // of executable CFG edges and instructions and the flow work queue, and
+ // delegates the evaluation and the rewriting of individual instructions
+ // to the MachineConstEvaluator given to the constructor.
+ class MachineConstPropagator {
+ public:
+   MachineConstPropagator(MachineConstEvaluator &E) : MCE(E) {
+     Bottom.setBottom();
+   }
+
+   // Mapping: vreg -> cell
+   // The keys are registers _without_ subregisters. This won't allow
+   // definitions in the form of "vreg:subreg<def> = ...". Such definitions
+   // would be questionable from the point of view of SSA, since the "vreg"
+   // could not be initialized in its entirety (specifically, an instruction
+   // defining the "other part" of "vreg" would also count as a definition
+   // of "vreg", which would violate the SSA).
+   // If a value of a pair vreg:subreg needs to be obtained, the cell for
+   // "vreg" needs to be looked up, and then the value of subregister "subreg"
+   // needs to be evaluated.
+   class CellMap {
+   public:
+     CellMap() {
+       assert(Top.isTop());
+       Bottom.setBottom();
+     }
+
+     void clear() { Map.clear(); }
+
+     // Returns true if there is a cell for R. Non-virtual registers always
+     // "have" a cell.
+     bool has(unsigned R) const {
+       // All non-virtual registers are considered "bottom".
+       if (!TargetRegisterInfo::isVirtualRegister(R))
+         return true;
+       MapType::const_iterator F = Map.find(R);
+       return F != Map.end();
+     }
+
+     // Returns the cell for R: the shared "bottom" cell for non-virtual
+     // registers, the shared "top" cell for virtual registers that have no
+     // entry in the map.
+     const LatticeCell &get(unsigned R) const {
+       if (!TargetRegisterInfo::isVirtualRegister(R))
+         return Bottom;
+       MapType::const_iterator F = Map.find(R);
+       if (F != Map.end())
+         return F->second;
+       return Top;
+     }
+
+     // Invalidates any const references.
+     void update(unsigned R, const LatticeCell &L) {
+       Map[R] = L;
+     }
+
+     void print(raw_ostream &os, const TargetRegisterInfo &TRI) const;
+
+   private:
+     typedef std::map<unsigned,LatticeCell> MapType;
+     MapType Map;
+     // To avoid creating "top" entries, return a const reference to
+     // this cell in "get". Also, have a "Bottom" cell to return from
+     // get when a value of a physical register is requested.
+     LatticeCell Top, Bottom;
+
+   public:
+     typedef MapType::const_iterator const_iterator;
+     const_iterator begin() const { return Map.begin(); }
+     const_iterator end() const { return Map.end(); }
+   };
+
+   // Run the propagation and then the rewriting on MF. Returns true if
+   // the function was changed.
+   bool run(MachineFunction &MF);
+
+ private:
+   void visitPHI(const MachineInstr &PN);
+   void visitNonBranch(const MachineInstr &MI);
+   void visitBranchesFrom(const MachineInstr &BrI);
+   void visitUsesOf(unsigned R);
+   bool computeBlockSuccessors(const MachineBasicBlock *MB,
+         SetVector<const MachineBasicBlock*> &Targets);
+   void removeCFGEdge(MachineBasicBlock *From, MachineBasicBlock *To);
+
+   void propagate(MachineFunction &MF);
+   bool rewrite(MachineFunction &MF);
+
+   // Assigned in run(). Initialized to null so that a use before run()
+   // fails deterministically instead of reading an indeterminate pointer.
+   MachineRegisterInfo *MRI = nullptr;
+   MachineConstEvaluator &MCE;
+
+   // A CFG edge is a pair of block numbers (from, to).
+   typedef std::pair<unsigned,unsigned> CFGEdge;
+   typedef std::set<CFGEdge> SetOfCFGEdge;
+   typedef std::set<const MachineInstr*> SetOfInstr;
+   typedef std::queue<CFGEdge> QueueOfCFGEdge;
+
+   LatticeCell Bottom;      // Shared "bottom" cell (set in the constructor).
+   CellMap Cells;           // Current vreg -> cell mapping.
+   SetOfCFGEdge EdgeExec;   // CFG edges known to be executable.
+   SetOfInstr InstrExec;    // Instructions known to be executable.
+   QueueOfCFGEdge FlowQ;    // Work queue of CFG edges to process.
+ };
+
+ // The "evaluator/rewriter" of machine instructions. This is an abstract
+ // base class that provides the interface that the propagator will use,
+ // as well as some helper functions that are target-independent.
+ class MachineConstEvaluator {
+ public:
+ // Caches the register info, the function and the LLVM context of Fn.
+ MachineConstEvaluator(MachineFunction &Fn)
+ : TRI(*Fn.getSubtarget().getRegisterInfo()),
+ MF(Fn), CX(Fn.getFunction()->getContext()) {}
+ virtual ~MachineConstEvaluator() = default;
+
+ // The required interface:
+ // - A set of three "evaluate" functions. Each returns "true" if the
+ // computation succeeded, "false" otherwise.
+ // (1) Given an instruction MI, and the map with input values "Inputs",
+ // compute the set of output values "Outputs". An example of when
+ // the computation can "fail" is if MI is not an instruction that
+ // is recognized by the evaluator.
+ // (2) Given a register R (as reg:subreg), compute the cell that
+ // corresponds to the "subreg" part of the given register.
+ // (3) Given a branch instruction BrI, compute the set of target blocks.
+ // If the branch can fall-through, add null (0) to the list of
+ // possible targets.
+ // NOTE(review): the signature below reports fall-through via the
+ // "CanFallThru" output parameter; confirm whether the "add null"
+ // wording above is still accurate.
+ // - A function "rewrite", that given the cell map after propagation,
+ // could rewrite instruction MI in a more beneficial form. Return
+ // "true" if a change has been made, "false" otherwise.
+ typedef MachineConstPropagator::CellMap CellMap;
+ virtual bool evaluate(const MachineInstr &MI, const CellMap &Inputs,
+ CellMap &Outputs) = 0;
+ virtual bool evaluate(const Register &R, const LatticeCell &SrcC,
+ LatticeCell &Result) = 0;
+ virtual bool evaluate(const MachineInstr &BrI, const CellMap &Inputs,
+ SetVector<const MachineBasicBlock*> &Targets,
+ bool &CanFallThru) = 0;
+ virtual bool rewrite(MachineInstr &MI, const CellMap &Inputs) = 0;
+
+ const TargetRegisterInfo &TRI;
+
+ protected:
+ MachineFunction &MF;
+ LLVMContext &CX;
+
+ // Encoding of comparison kinds as bit flags: EQ/NE, the L(ess) and
+ // G(reater) bits optionally combined with EQ, and the U(nsigned) bit.
+ struct Comparison {
+ enum {
+ Unk = 0x00,
+ EQ = 0x01,
+ NE = 0x02,
+ L = 0x04, // Less-than property.
+ G = 0x08, // Greater-than property.
+ U = 0x40, // Unsigned property.
+ LTs = L,
+ LEs = L | EQ,
+ GTs = G,
+ GEs = G | EQ,
+ LTu = L | U,
+ LEu = L | EQ | U,
+ GTu = G | U,
+ GEu = G | EQ | U
+ };
+
+ // Maps EQ to NE and NE to EQ. For the ordered comparisons it toggles
+ // the L and G bits and leaves the EQ and U bits untouched.
+ // NOTE(review): this turns LTs into GTs (not GEs), i.e. for ordered
+ // comparisons it behaves like swapping the operands rather than like a
+ // logical negation -- confirm that this is what the callers expect.
+ static uint32_t negate(uint32_t Cmp) {
+ if (Cmp == EQ)
+ return NE;
+ if (Cmp == NE)
+ return EQ;
+ assert((Cmp & (L|G)) != (L|G));
+ return Cmp ^ (L|G);
+ }
+ };
+
+ // Helper functions.
+
+ // In the helpers below the suffix letters name the operand kinds:
+ // "r" takes a Register, "i" an APInt, "p" a set of properties.
+ bool getCell(const Register &R, const CellMap &Inputs, LatticeCell &RC);
+ bool constToInt(const Constant *C, APInt &Val) const;
+ bool constToFloat(const Constant *C, APFloat &Val) const;
+ const ConstantInt *intToConst(const APInt &Val) const;
+
+ // Compares.
+ bool evaluateCMPrr(uint32_t Cmp, const Register &R1, const Register &R2,
+ const CellMap &Inputs, bool &Result);
+ bool evaluateCMPri(uint32_t Cmp, const Register &R1, const APInt &A2,
+ const CellMap &Inputs, bool &Result);
+ // NOTE(review): Props2 is uint64_t here while every other property
+ // parameter is uint32_t -- presumably unintentional; verify.
+ bool evaluateCMPrp(uint32_t Cmp, const Register &R1, uint64_t Props2,
+ const CellMap &Inputs, bool &Result);
+ bool evaluateCMPii(uint32_t Cmp, const APInt &A1, const APInt &A2,
+ bool &Result);
+ bool evaluateCMPpi(uint32_t Cmp, uint32_t Props, const APInt &A2,
+ bool &Result);
+ bool evaluateCMPpp(uint32_t Cmp, uint32_t Props1, uint32_t Props2,
+ bool &Result);
+
+ bool evaluateCOPY(const Register &R1, const CellMap &Inputs,
+ LatticeCell &Result);
+
+ // Logical operations.
+ bool evaluateANDrr(const Register &R1, const Register &R2,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateANDri(const Register &R1, const APInt &A2,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateANDii(const APInt &A1, const APInt &A2, APInt &Result);
+ bool evaluateORrr(const Register &R1, const Register &R2,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateORri(const Register &R1, const APInt &A2,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateORii(const APInt &A1, const APInt &A2, APInt &Result);
+ bool evaluateXORrr(const Register &R1, const Register &R2,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateXORri(const Register &R1, const APInt &A2,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateXORii(const APInt &A1, const APInt &A2, APInt &Result);
+
+ // Extensions.
+ bool evaluateZEXTr(const Register &R1, unsigned Width, unsigned Bits,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateZEXTi(const APInt &A1, unsigned Width, unsigned Bits,
+ APInt &Result);
+ bool evaluateSEXTr(const Register &R1, unsigned Width, unsigned Bits,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateSEXTi(const APInt &A1, unsigned Width, unsigned Bits,
+ APInt &Result);
+
+ // Leading/trailing bits.
+ bool evaluateCLBr(const Register &R1, bool Zeros, bool Ones,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateCLBi(const APInt &A1, bool Zeros, bool Ones, APInt &Result);
+ bool evaluateCTBr(const Register &R1, bool Zeros, bool Ones,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateCTBi(const APInt &A1, bool Zeros, bool Ones, APInt &Result);
+
+ // Bitfield extract.
+ bool evaluateEXTRACTr(const Register &R1, unsigned Width, unsigned Bits,
+ unsigned Offset, bool Signed, const CellMap &Inputs,
+ LatticeCell &Result);
+ bool evaluateEXTRACTi(const APInt &A1, unsigned Bits, unsigned Offset,
+ bool Signed, APInt &Result);
+ // Vector operations.
+ bool evaluateSplatr(const Register &R1, unsigned Bits, unsigned Count,
+ const CellMap &Inputs, LatticeCell &Result);
+ bool evaluateSplati(const APInt &A1, unsigned Bits, unsigned Count,
+ APInt &Result);
+ };
+
+} // end anonymous namespace
+
+// Compute the property bits that hold for the constant C.
+//  - Integer zero has Zero, Finite and both sign properties; any other
+//    integer has NonZero, Finite and the sign property matching its sign.
+//  - A floating-point constant gets the sign property matching its sign
+//    bit, combined with Zero|Finite, NaN, Infinity or NonZero|Finite.
+//  - Any other kind of constant yields Unknown.
+uint32_t ConstantProperties::deduce(const Constant *C) {
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+    if (CI->isZero())
+      return Zero | PosOrZero | NegOrZero | Finite;
+    const uint32_t Sign = CI->isNegative() ? NegOrZero : PosOrZero;
+    return NonZero | Finite | Sign;
+  }
+
+  if (const ConstantFP *CF = dyn_cast<ConstantFP>(C)) {
+    // The sign property is kept in every case below, including NaN.
+    const uint32_t Sign = CF->isNegative() ? NegOrZero : PosOrZero;
+    if (CF->isZero())
+      return Sign | Zero | Finite;
+    if (CF->isNaN())
+      return Sign | NaN;
+    if (CF->getValueAPF().isInfinity())
+      return Sign | Infinity;
+    return Sign | NonZero | Finite;
+  }
+
+  return Unknown;
+}
+
+// Switch the cell from holding specific values to tracking properties.
+// Returns false if the cell already tracks properties, true otherwise.
+// A top cell starts out with every property; any other cell starts with the
+// properties common to its values, or becomes bottom if there are none.
+bool LatticeCell::convertToProperty() {
+  if (isProperty())
+    return false;
+
+  // Corner case: converting a fresh (top) cell to "special".
+  // This can happen, when adding a property to a top cell.
+  uint32_t Props;
+  if (isTop())
+    Props = ConstantProperties::Everything;
+  else
+    Props = properties();
+
+  if (Props == ConstantProperties::Unknown) {
+    setBottom();
+    return true;
+  }
+
+  // Store the bits only after they have been computed: Properties shares
+  // storage with the values that properties() reads.
+  Properties = Props;
+  setProperty();
+  return true;
+}
+
+// Print the cell to os: the names of the tracked properties for a property
+// cell, otherwise "bottom", "top", or the comma-separated list of constants.
+void LatticeCell::print(raw_ostream &os) const {
+  if (isProperty()) {
+    // Names are printed in this order, each followed by a space.
+    static const struct {
+      uint32_t Bit;
+      const char *Name;
+    } Names[] = {
+      { ConstantProperties::Zero,      "zero " },
+      { ConstantProperties::NonZero,   "nonzero " },
+      { ConstantProperties::Finite,    "finite " },
+      { ConstantProperties::Infinity,  "infinity " },
+      { ConstantProperties::NaN,       "nan " },
+      { ConstantProperties::PosOrZero, "poz " },
+      { ConstantProperties::NegOrZero, "nez " }
+    };
+
+    os << "{ ";
+    const uint32_t Ps = properties();
+    for (const auto &N : Names)
+      if (Ps & N.Bit)
+        os << N.Name;
+    os << '}';
+    return;
+  }
+
+  os << "{ ";
+  if (isBottom()) {
+    os << "bottom";
+  } else if (isTop()) {
+    os << "top";
+  } else {
+    const char *Sep = "";
+    for (unsigned i = 0; i < size(); ++i) {
+      os << Sep;
+      Values[i]->print(os);
+      Sep = ", ";
+    }
+  }
+  os << " }";
+}
+
+// "Meet" of this cell with L, stored in this cell. This is the key operation
+// of the propagation algorithm. Returns true if this cell has changed.
+bool LatticeCell::meet(const LatticeCell &L) {
+  // Meeting with bottom yields bottom.
+  if (L.isBottom())
+    return setBottom();
+  // Bottom stays bottom, and meeting with top changes nothing.
+  if (isBottom() || L.isTop())
+    return false;
+  if (isTop()) {
+    // L can be neither Top nor Bottom, so *this must change.
+    *this = L;
+    return true;
+  }
+
+  // Top/bottom cases covered. Need to integrate L's contents into ours.
+  if (L.isProperty())
+    return add(L.properties());
+
+  bool AnyChange = false;
+  for (unsigned Idx = 0; Idx < L.size(); ++Idx)
+    if (add(L.Values[Idx]))
+      AnyChange = true;
+  return AnyChange;
+}
+
+// Add the constant LC to the cell; this is where a cell actually gets
+// updated. While the cell holds values and has room, LC is simply stored
+// (unless it is already there). Otherwise the cell is converted to a
+// property-tracking cell and its properties are intersected with those of
+// LC; an empty intersection turns the cell into bottom.
+// Returns true if the cell has changed.
+bool LatticeCell::add(const Constant *LC) {
+  assert(LC);
+  if (isBottom())
+    return false;
+
+  if (!isProperty()) {
+    // Cell is not special. If the constant is already here, no change is
+    // needed.
+    for (unsigned Slot = 0; Slot < Size; ++Slot)
+      if (Values[Slot] == LC)
+        return false;
+    // Not present: store it in the first free slot, if there is one.
+    if (Size < MaxCellSize) {
+      Values[Size] = LC;
+      Kind = Normal;
+      Size++;
+      return true;
+    }
+  }
+
+  // This cell is special, or is not special, but is full. After this
+  // it will be special.
+  const bool Converted = convertToProperty();
+  const uint32_t OldPs = properties();
+  const uint32_t NewPs = OldPs & ConstantProperties::deduce(LC);
+  if (NewPs == ConstantProperties::Unknown) {
+    setBottom();
+    return true;
+  }
+  if (NewPs == OldPs)
+    return Converted;
+  Properties = NewPs;
+  return true;
+}
+
+// Add a property to the cell. This will force the cell to become a property-
+// tracking cell whose properties are the intersection of the current ones
+// with Property. Mirrors add(const Constant*) in two respects:
+//  - a bottom cell is left alone and reports no change;
+//  - an empty intersection turns the cell into bottom rather than leaving
+//    a property cell that tracks no property at all.
+// Returns true if the cell has changed.
+bool LatticeCell::add(uint32_t Property) {
+  if (isBottom())
+    return false;
+
+  bool Changed = convertToProperty();
+  // If the conversion ended in bottom, Ps is Unknown and the function
+  // returns right below with the change reported by the conversion.
+  uint32_t Ps = properties();
+  uint32_t NewPs = Ps & Property;
+  if (NewPs == Ps)
+    return Changed;
+  if (NewPs == ConstantProperties::Unknown) {
+    setBottom();
+    return true;
+  }
+  Properties = NewPs;
+  return true;
+}
+
+// Return the properties shared by everything in the cell, without altering
+// the cell: the stored bits for a property cell, Unknown for bottom, and the
+// intersection of the properties of all stored constants otherwise.
+// Must not be called on a top cell.
+uint32_t LatticeCell::properties() const {
+  if (isProperty())
+    return Properties;
+  assert(!isTop() && "Should not call this for a top cell");
+  if (isBottom())
+    return ConstantProperties::Unknown;
+
+  assert(size() > 0 && "Empty cell");
+  uint32_t Common = ConstantProperties::deduce(Values[0]);
+  // Stop early once nothing is left in the intersection.
+  for (unsigned Idx = 1;
+       Idx < size() && Common != ConstantProperties::Unknown; ++Idx)
+    Common &= ConstantProperties::deduce(Values[Idx]);
+  return Common;
+}
+
+// Print every (register -> cell) entry of the map to os, one per line.
+// The output goes to the stream passed by the caller (it used to be sent
+// to dbgs() regardless of the argument).
+void MachineConstPropagator::CellMap::print(raw_ostream &os,
+      const TargetRegisterInfo &TRI) const {
+  for (const auto &Entry : Map)
+    os << " " << PrintReg(Entry.first, &TRI) << " -> " << Entry.second << '\n';
+}
+
+// Recompute the cell of the register defined by the PHI node PN as the meet
+// of the cells of its inputs, taking into account only the inputs that come
+// in over executable CFG edges. If the cell changes, the uses of the defined
+// register are revisited.
+void MachineConstPropagator::visitPHI(const MachineInstr &PN) {
+ const MachineBasicBlock *MB = PN.getParent();
+ unsigned MBN = MB->getNumber();
+ DEBUG(dbgs() << "Visiting FI(BB#" << MBN << "): " << PN);
+
+ const MachineOperand &MD = PN.getOperand(0);
+ Register DefR(MD);
+ assert(TargetRegisterInfo::isVirtualRegister(DefR.Reg));
+
+ bool Changed = false;
+
+ // If the def has a sub-register, set the corresponding cell to "bottom".
+ // The label below is also the target of the "goto" in the operand loop,
+ // which is taken when an input is not a virtual register.
+ if (DefR.SubReg) {
+Bottomize:
+ const LatticeCell &T = Cells.get(DefR.Reg);
+ // Read the state through T before the update: update() invalidates
+ // references obtained from get().
+ Changed = !T.isBottom();
+ Cells.update(DefR.Reg, Bottom);
+ if (Changed)
+ visitUsesOf(DefR.Reg);
+ return;
+ }
+
+ // Work on a copy of the current cell; it is written back after each meet.
+ LatticeCell DefC = Cells.get(DefR.Reg);
+
+ // PHI operands come in (value, predecessor block) pairs starting at
+ // operand 1.
+ for (unsigned i = 1, n = PN.getNumOperands(); i < n; i += 2) {
+ const MachineBasicBlock *PB = PN.getOperand(i+1).getMBB();
+ unsigned PBN = PB->getNumber();
+ if (!EdgeExec.count(CFGEdge(PBN, MBN))) {
+ DEBUG(dbgs() << " edge BB#" << PBN << "->BB#" << MBN
+ << " not executable\n");
+ continue;
+ }
+ const MachineOperand &SO = PN.getOperand(i);
+ Register UseR(SO);
+ // If the input is not a virtual register, we don't really know what
+ // value it holds.
+ if (!TargetRegisterInfo::isVirtualRegister(UseR.Reg))
+ goto Bottomize;
+ // If there is no cell for an input register, it means top.
+ if (!Cells.has(UseR.Reg))
+ continue;
+
+ // Evaluate the (sub)register value of the input; a failed evaluation
+ // forces the result to bottom.
+ LatticeCell SrcC;
+ bool Eval = MCE.evaluate(UseR, Cells.get(UseR.Reg), SrcC);
+ DEBUG(dbgs() << " edge from BB#" << PBN << ": "
+ << PrintReg(UseR.Reg, &MCE.TRI, UseR.SubReg)
+ << SrcC << '\n');
+ Changed |= Eval ? DefC.meet(SrcC)
+ : DefC.setBottom();
+ Cells.update(DefR.Reg, DefC);
+ // Once bottom is reached, the remaining inputs cannot change the result.
+ if (DefC.isBottom())
+ break;
+ }
+ if (Changed)
+ visitUsesOf(DefR.Reg);
+}
+
+// Evaluate the non-branch instruction MI and fold the results into the cells
+// of the virtual registers that it defines. If the evaluation fails, all
+// those cells are set to bottom. The uses of every register whose cell has
+// changed are revisited.
+void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) {
+  DEBUG(dbgs() << "Visiting MI(BB#" << MI.getParent()->getNumber()
+               << "): " << MI);
+  CellMap Outputs;
+  const bool Evaluated = MCE.evaluate(MI, Cells, Outputs);
+  DEBUG({
+    if (Evaluated) {
+      dbgs() << " outputs:";
+      for (auto &I : Outputs)
+        dbgs() << ' ' << I.second;
+      dbgs() << '\n';
+    }
+  });
+
+  for (const MachineOperand &Op : MI.operands()) {
+    // Only definitions of virtual registers are tracked.
+    const bool IsVRegDef = Op.isReg() && Op.isDef() &&
+                           TargetRegisterInfo::isVirtualRegister(Op.getReg());
+    if (!IsVRegDef)
+      continue;
+
+    const unsigned DefReg = Op.getReg();
+    bool Changed;
+    if (Evaluated) {
+      // Take the computed output for this def. If there is none, go on
+      // to the next def.
+      if (!Outputs.has(DefReg))
+        continue;
+      LatticeCell NewC = Cells.get(DefReg);
+      Changed = NewC.meet(Outputs.get(DefReg));
+      Cells.update(DefReg, NewC);
+    } else {
+      // The value was not computed: the def becomes bottom.
+      Changed = !Cells.get(DefReg).isBottom();
+      Cells.update(DefReg, Bottom);
+    }
+    if (Changed)
+      visitUsesOf(DefReg);
+  }
+}
+
+// \brief Starting at a given branch, visit remaining branches in the block.
+// Walk over the subsequent branches for as long as the preceding one can
+// fall through, marking each visited instruction as executable. Then push
+// an edge to every possible target onto the flow work queue, including the
+// potential fall-through to the layout-successor block.
+void MachineConstPropagator::visitBranchesFrom(const MachineInstr &BrI) {
+  const MachineBasicBlock &B = *BrI.getParent();
+  const unsigned MBN = B.getNumber();
+
+  SetVector<const MachineBasicBlock*> Targets;
+  bool EvalOk = true;
+  bool FallsThru = true;
+  for (MachineBasicBlock::const_iterator It = BrI.getIterator(), End = B.end();
+       It != End; ++It) {
+    const MachineInstr &MI = *It;
+    InstrExec.insert(&MI);
+    DEBUG(dbgs() << "Visiting " << (EvalOk ? "BR" : "br") << "(BB#"
+                 << MBN << "): " << MI);
+    // Do not evaluate subsequent branches if the evaluation of any of the
+    // previous branches failed. Keep iterating over the branches only
+    // to mark them as executable.
+    if (EvalOk)
+      EvalOk = MCE.evaluate(MI, Cells, Targets, FallsThru);
+    if (!EvalOk)
+      FallsThru = true;
+    else if (!FallsThru)
+      break;
+  }
+
+  if (!EvalOk) {
+    // If the evaluation of the branches failed, make "Targets" to be the
+    // set of all successors of the block from the CFG.
+    Targets.clear();
+    DEBUG(dbgs() << " failed to evaluate a branch...adding all CFG "
+                    "successors\n");
+    for (const MachineBasicBlock *SB : B.successors())
+      Targets.insert(SB);
+  } else {
+    // Need to add all CFG successors that lead to EH landing pads.
+    // There won't be explicit branches to these blocks, but they must
+    // be processed.
+    for (const MachineBasicBlock *SB : B.successors())
+      if (SB->isEHPad())
+        Targets.insert(SB);
+    // The evaluation succeeded for all visited branches: if the last one
+    // set "FallsThru", add the layout successor (if any) to the targets.
+    if (FallsThru) {
+      MachineFunction::const_iterator Next = std::next(B.getIterator());
+      if (Next != B.getParent()->end())
+        Targets.insert(&*Next);
+    }
+  }
+
+  for (const MachineBasicBlock *TB : Targets) {
+    const unsigned TBN = TB->getNumber();
+    DEBUG(dbgs() << " pushing edge BB#" << MBN << " -> BB#" << TBN << "\n");
+    FlowQ.push(CFGEdge(MBN, TBN));
+  }
+}
+
+// Revisit every executable non-debug instruction that uses Reg, dispatching
+// on the kind of instruction (PHI, branch, anything else).
+void MachineConstPropagator::visitUsesOf(unsigned Reg) {
+  DEBUG(dbgs() << "Visiting uses of " << PrintReg(Reg, &MCE.TRI)
+               << Cells.get(Reg) << '\n');
+  for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
+    // Do not process non-executable instructions. They can become executable
+    // later (via a flow-edge in the work queue). In such case, the
+    // instruction will be visited at that time.
+    if (InstrExec.count(&UseMI) == 0)
+      continue;
+
+    if (UseMI.isPHI())
+      visitPHI(UseMI);
+    else if (UseMI.isBranch())
+      visitBranchesFrom(UseMI);
+    else
+      visitNonBranch(UseMI);
+  }
+}
+
+// Compute into Targets the set of blocks that MB can transfer control to,
+// based on the evaluation of its executable branches with the current cells.
+// Returns false if any of those branches could not be evaluated; Targets
+// should not be relied upon in that case.
+bool MachineConstPropagator::computeBlockSuccessors(const MachineBasicBlock *MB,
+      SetVector<const MachineBasicBlock*> &Targets) {
+  Targets.clear();
+
+  // Locate the first non-debug branch in the block (End if there is none).
+  const MachineBasicBlock::const_iterator End = MB->end();
+  MachineBasicBlock::const_iterator FirstBr = MB->begin();
+  while (FirstBr != End && (FirstBr->isDebugValue() || !FirstBr->isBranch()))
+    ++FirstBr;
+
+  bool FallsThru = true;
+  for (MachineBasicBlock::const_iterator I = FirstBr; I != End; ++I) {
+    const MachineInstr &MI = *I;
+    // Can there be debug instructions between branches?
+    // Skip them, as well as the instructions that are not executable.
+    if (MI.isDebugValue() || !InstrExec.count(&MI))
+      continue;
+    if (!MCE.evaluate(MI, Cells, Targets, FallsThru))
+      return false;
+    if (!FallsThru)
+      break;
+  }
+
+  // If the last branch could fall-through, add block's layout successor.
+  if (FallsThru) {
+    MachineFunction::const_iterator Next = std::next(MB->getIterator());
+    if (Next != MB->getParent()->end())
+      Targets.insert(&*Next);
+  }
+
+  // Add all the EH landing pads.
+  for (const MachineBasicBlock *Succ : MB->successors())
+    if (Succ->isEHPad())
+      Targets.insert(Succ);
+
+  return true;
+}
+
+// Remove the CFG edge From->To: drop To from the successors of From, and
+// remove the PHI operands in To that correspond to From.
+void MachineConstPropagator::removeCFGEdge(MachineBasicBlock *From,
+      MachineBasicBlock *To) {
+  // First, remove the CFG successor/predecessor information.
+  From->removeSuccessor(To);
+
+  // Then remove all corresponding PHI operands in the To block.
+  for (auto I = To->begin(), E = To->getFirstNonPHI(); I != E; ++I) {
+    MachineInstr &PN = *I;
+    // reg0 = PHI reg1, bb2, reg3, bb4, ...
+    // Go over the (value, block) pairs from the last one to the first, so
+    // that removing a pair does not shift the ones still to be examined.
+    for (int Idx = PN.getNumOperands()-2; Idx > 0; Idx -= 2) {
+      if (PN.getOperand(Idx+1).getMBB() != From)
+        continue;
+      PN.RemoveOperand(Idx+1);
+      PN.RemoveOperand(Idx);
+    }
+  }
+}
+
+// The propagation phase: process CFG edges from the flow work queue until
+// it is empty. For each edge, the PHI nodes of the destination block are
+// (re)visited; the remaining instructions of the block are visited only the
+// first time the block is reached. Fills in Cells, EdgeExec and InstrExec.
+void MachineConstPropagator::propagate(MachineFunction &MF) {
+ MachineBasicBlock *Entry = GraphTraits<MachineFunction*>::getEntryNode(&MF);
+ unsigned EntryNum = Entry->getNumber();
+
+ // Start with a fake edge, just to process the entry node.
+ FlowQ.push(CFGEdge(EntryNum, EntryNum));
+
+ while (!FlowQ.empty()) {
+ CFGEdge Edge = FlowQ.front();
+ FlowQ.pop();
+
+ DEBUG(dbgs() << "Picked edge BB#" << Edge.first << "->BB#"
+ << Edge.second << '\n');
+ // Skip edges that were already processed. Edges that start at the entry
+ // block (including the fake one) are exempt from this check.
+ if (Edge.first != EntryNum)
+ if (EdgeExec.count(Edge))
+ continue;
+ EdgeExec.insert(Edge);
+ MachineBasicBlock *SB = MF.getBlockNumbered(Edge.second);
+
+ // Process the block in three stages:
+ // - visit all PHI nodes,
+ // - visit all non-branch instructions,
+ // - visit block branches.
+ MachineBasicBlock::const_iterator It = SB->begin(), End = SB->end();
+
+ // Visit PHI nodes in the successor block.
+ while (It != End && It->isPHI()) {
+ InstrExec.insert(&*It);
+ visitPHI(*It);
+ ++It;
+ }
+
+ // If the successor block just became executable, visit all instructions.
+ // To see if this is the first time we're visiting it, check the first
+ // non-debug instruction to see if it is executable.
+ while (It != End && It->isDebugValue())
+ ++It;
+ assert(It == End || !It->isPHI());
+ // If this block has been visited, go on to the next one.
+ if (It != End && InstrExec.count(&*It))
+ continue;
+ // For now, scan all non-branch instructions. Branches require different
+ // processing.
+ while (It != End && !It->isBranch()) {
+ if (!It->isDebugValue()) {
+ InstrExec.insert(&*It);
+ visitNonBranch(*It);
+ }
+ ++It;
+ }
+
+ // Time to process the end of the block. This is different from
+ // processing regular (non-branch) instructions, because there can
+ // be multiple branches in a block, and they can cause the block to
+ // terminate early.
+ if (It != End) {
+ visitBranchesFrom(*It);
+ } else {
+ // If the block didn't have a branch, add all successor edges to the
+ // work queue. (There should really be only one successor in such case.)
+ unsigned SBN = SB->getNumber();
+ for (const MachineBasicBlock *SSB : SB->successors())
+ FlowQ.push(CFGEdge(SBN, SSB->getNumber()));
+ }
+ } // while (FlowQ)
+
+ // Debug-only dump of the final cells and of the CFG edges that were never
+ // found to be executable.
+ DEBUG({
+ dbgs() << "Cells after propagation:\n";
+ Cells.print(dbgs(), MCE.TRI);
+ dbgs() << "Dead CFG edges:\n";
+ for (const MachineBasicBlock &B : MF) {
+ unsigned BN = B.getNumber();
+ for (const MachineBasicBlock *SB : B.successors()) {
+ unsigned SN = SB->getNumber();
+ if (!EdgeExec.count(CFGEdge(BN, SN)))
+ dbgs() << " BB#" << BN << " -> BB#" << SN << '\n';
+ }
+ }
+ });
+}
+
+// The rewriting phase: let the evaluator rewrite every executable
+// instruction based on the computed cells, remove the CFG edges that were
+// proven dead, and erase the branches that were never found executable.
+// Returns true if the function was modified in any way, including the case
+// where the only modification was the removal of a CFG edge or of a
+// non-executable branch.
+bool MachineConstPropagator::rewrite(MachineFunction &MF) {
+  bool Changed = false;
+  // Rewrite all instructions based on the collected cell information.
+  //
+  // Traverse the instructions in a post-order, so that rewriting an
+  // instruction can make changes "downstream" in terms of control-flow
+  // without affecting the rewriting process. (We should not change
+  // instructions that have not yet been visited by the rewriter.)
+  // The reason for this is that the rewriter can introduce new vregs,
+  // and replace uses of old vregs (which had corresponding cells
+  // computed during propagation) with these new vregs (which at this
+  // point would not have any cells, and would appear to be "top").
+  // If an attempt was made to evaluate an instruction with a fresh
+  // "top" vreg, it would cause an error (abend) in the evaluator.
+
+  // Collect the post-order-traversal block ordering. The subsequent
+  // traversal/rewrite will update block successors, so it's safer
+  // if the visiting order is computed ahead of time.
+  std::vector<MachineBasicBlock*> POT;
+  for (MachineBasicBlock *B : post_order(&MF))
+    if (!B->empty())
+      POT.push_back(B);
+
+  for (MachineBasicBlock *B : POT) {
+    // Walk the block backwards (which usually begin with the branches).
+    // If any branch is rewritten, we may need to update the successor
+    // information for this block. Unless the block's successors can be
+    // precisely determined (which may not be the case for indirect
+    // branches), we cannot modify any branch.
+
+    // Compute the successor information.
+    SetVector<const MachineBasicBlock*> Targets;
+    bool HaveTargets = computeBlockSuccessors(B, Targets);
+    // Rewrite the executable instructions. Skip branches if we don't
+    // have block successor information.
+    for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) {
+      MachineInstr &MI = *I;
+      if (InstrExec.count(&MI)) {
+        if (MI.isBranch() && !HaveTargets)
+          continue;
+        Changed |= MCE.rewrite(MI, Cells);
+      }
+    }
+    // The rewriting could rewrite PHI nodes to non-PHI nodes, causing
+    // regular instructions to appear in between PHI nodes. Bring all
+    // the PHI nodes to the beginning of the block.
+    for (auto I = B->begin(), E = B->end(); I != E; ++I) {
+      if (I->isPHI())
+        continue;
+      // I is not PHI. Find the next PHI node P.
+      auto P = I;
+      while (++P != E)
+        if (P->isPHI())
+          break;
+      // Not found.
+      if (P == E)
+        break;
+      // Splice P right before I.
+      B->splice(I, B, P);
+      // Reset I to point at the just spliced PHI node.
+      --I;
+    }
+    // Update the block successor information: remove unnecessary successors.
+    if (HaveTargets) {
+      // Collect first, remove later: removing a successor while iterating
+      // over the successor list would invalidate the iteration.
+      SmallVector<MachineBasicBlock*,2> ToRemove;
+      for (MachineBasicBlock *SB : B->successors()) {
+        if (!Targets.count(SB))
+          ToRemove.push_back(SB);
+        Targets.remove(SB);
+      }
+      for (MachineBasicBlock *SB : ToRemove)
+        removeCFGEdge(B, SB);
+      // Removing a CFG edge (and the matching PHI operands) is a change to
+      // the function even if no instruction was rewritten.
+      if (!ToRemove.empty())
+        Changed = true;
+      // If there are any blocks left in the computed targets, it means that
+      // we think that the block could go somewhere, but the CFG does not.
+      // This could legitimately happen in blocks that have non-returning
+      // calls---we would think that the execution can continue, but the
+      // CFG will not have a successor edge.
+    }
+  }
+  // Need to do some final post-processing.
+  // If a branch was not executable, it will not get rewritten, but should
+  // be removed (or replaced with something equivalent to a A2_nop). We can't
+  // erase instructions during rewriting, so this needs to be delayed until
+  // now.
+  for (MachineBasicBlock &B : MF) {
+    MachineBasicBlock::iterator I = B.begin(), E = B.end();
+    while (I != E) {
+      // Fetch the successor first: erasing invalidates I.
+      auto Next = std::next(I);
+      if (I->isBranch() && !InstrExec.count(&*I)) {
+        B.erase(I);
+        Changed = true;
+      }
+      I = Next;
+    }
+  }
+  return Changed;
+}
+
+ // Driver of the propagator. The algorithm is the constant propagation
+ // scheme described by Wegman and Zadeck; most of the terminology used in
+ // this code is taken from that description.
+ //
+ // Resets the per-function state, runs the propagation phase followed by
+ // the rewriting phase, and returns true if rewriting changed MF.
+ bool MachineConstPropagator::run(MachineFunction &MF) {
+   DEBUG(MF.print(dbgs() << "Starting MachineConstPropagator\n", 0));
+
+   MRI = &MF.getRegInfo();
+
+   // Drop whatever was recorded for a previously processed function. The
+   // flow work queue is expected to be empty already.
+   InstrExec.clear();
+   EdgeExec.clear();
+   Cells.clear();
+   assert(FlowQ.empty());
+
+   propagate(MF);
+   const bool Modified = rewrite(MF);
+
+   DEBUG({
+     dbgs() << "End of MachineConstPropagator (Changed=" << Modified << ")\n";
+     if (Modified)
+       MF.print(dbgs(), 0);
+   });
+   return Modified;
+ }
+
+// --------------------------------------------------------------------
+// Machine const evaluator.
+
+ // Look up the lattice cell describing register R in Inputs and store it
+ // in RC. When R names a subregister, the cell of the whole register is
+ // narrowed through the target's evaluate() hook. Returns true only if R
+ // is a virtual register and the resulting cell is not bottom.
+ bool MachineConstEvaluator::getCell(const Register &R, const CellMap &Inputs,
+       LatticeCell &RC) {
+   if (!TargetRegisterInfo::isVirtualRegister(R.Reg))
+     return false;
+   const LatticeCell &Whole = Inputs.get(R.Reg);
+   if (R.SubReg) {
+     // Subregister use: let the target extract the relevant part.
+     if (!evaluate(R, Whole, RC))
+       return false;
+   } else {
+     RC = Whole;
+   }
+   return !RC.isBottom();
+ }
+
+ // If C is a ConstantInt, copy its value into Val and return true.
+ // Any other kind of constant is rejected with false, leaving Val untouched.
+ bool MachineConstEvaluator::constToInt(const Constant *C,
+       APInt &Val) const {
+   if (const ConstantInt *IntC = dyn_cast<ConstantInt>(C)) {
+     Val = IntC->getValue();
+     return true;
+   }
+   return false;
+ }
+
+ // Wrap the integer Val in a ConstantInt created in the context CX.
+ const ConstantInt *MachineConstEvaluator::intToConst(const APInt &Val) const {
+   const ConstantInt *Wrapped = ConstantInt::get(CX, Val);
+   return Wrapped;
+ }
+
+ // Evaluate the comparison "R1 Cmp R2" using the cells recorded in Inputs.
+ // Returns true if the comparison was proven to be always true or always
+ // false; the proven value is stored in Result. Returns false if nothing
+ // could be concluded.
+ bool MachineConstEvaluator::evaluateCMPrr(uint32_t Cmp, const Register &R1,
+       const Register &R2, const CellMap &Inputs, bool &Result) {
+   assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+   LatticeCell Lhs, Rhs;
+   if (!getCell(R1, Inputs, Lhs))
+     return false;
+   if (!getCell(R2, Inputs, Rhs))
+     return false;
+
+   // Dispatch on which of the operands are described by properties only.
+   const bool LhsIsProp = Lhs.isProperty();
+   const bool RhsIsProp = Rhs.isProperty();
+   if (LhsIsProp && RhsIsProp)
+     return evaluateCMPpp(Cmp, Lhs.properties(), Rhs.properties(), Result);
+   if (LhsIsProp) {
+     // The register/property helper expects the register first, so the
+     // operands are exchanged and the comparison is adjusted via negate().
+     const uint32_t LhsProps = Lhs.properties();
+     return evaluateCMPrp(Comparison::negate(Cmp), R2, LhsProps, Inputs,
+                          Result);
+   }
+   if (RhsIsProp) {
+     const uint32_t RhsProps = Rhs.properties();
+     return evaluateCMPrp(Cmp, R1, RhsProps, Inputs, Result);
+   }
+
+   // Both cells hold concrete values: compare R1 against each value of R2.
+   bool AllTrue = true, AllFalse = true;
+   APInt Imm;
+   for (unsigned Idx = 0, Num = Rhs.size(); Idx != Num; ++Idx) {
+     bool Res;
+     if (!constToInt(Rhs.Values[Idx], Imm) ||
+         !evaluateCMPri(Cmp, R1, Imm, Inputs, Res))
+       return false;
+     AllTrue = AllTrue && Res;
+     AllFalse = AllFalse && !Res;
+   }
+   assert(!AllTrue || !AllFalse);
+   // The logical value of the comparison is the same as AllTrue.
+   Result = AllTrue;
+   // Succeed only if the outcome was the same for every value.
+   return AllTrue || AllFalse;
+ }
+
+ // Evaluate the comparison "R1 Cmp A2" where A2 is a known integer.
+ // Returns true if the comparison has the same outcome for everything that
+ // is known about R1; that outcome is stored in Result.
+ bool MachineConstEvaluator::evaluateCMPri(uint32_t Cmp, const Register &R1,
+       const APInt &A2, const CellMap &Inputs, bool &Result) {
+   assert(Inputs.has(R1.Reg));
+   LatticeCell Lhs;
+   if (!getCell(R1, Inputs, Lhs))
+     return false;
+   if (Lhs.isProperty())
+     return evaluateCMPpi(Cmp, Lhs.properties(), A2, Result);
+
+   // R1 has a set of concrete values: compare each one against A2.
+   bool AllTrue = true, AllFalse = true;
+   APInt Val;
+   for (unsigned Idx = 0, Num = Lhs.size(); Idx != Num; ++Idx) {
+     bool Res;
+     if (!constToInt(Lhs.Values[Idx], Val) ||
+         !evaluateCMPii(Cmp, Val, A2, Res))
+       return false;
+     AllTrue = AllTrue && Res;
+     AllFalse = AllFalse && !Res;
+   }
+   assert(!AllTrue || !AllFalse);
+   // The logical value of the comparison is the same as AllTrue.
+   Result = AllTrue;
+   // Succeed only if the outcome was the same for every value.
+   return AllTrue || AllFalse;
+ }
+
+ // Evaluate the comparison "R1 Cmp X" where all that is known about X is
+ // the property set Props2. Returns true if the comparison was proven one
+ // way or the other, with the proven value stored in Result.
+ bool MachineConstEvaluator::evaluateCMPrp(uint32_t Cmp, const Register &R1,
+       uint64_t Props2, const CellMap &Inputs, bool &Result) {
+   assert(Inputs.has(R1.Reg));
+   LatticeCell Lhs;
+   if (!getCell(R1, Inputs, Lhs))
+     return false;
+   if (Lhs.isProperty())
+     return evaluateCMPpp(Cmp, Lhs.properties(), Props2, Result);
+
+   // R1 has concrete values. The property/immediate helper takes the
+   // property operand first, so it is called with the comparison adjusted
+   // via negate().
+   const uint32_t SwappedCmp = Comparison::negate(Cmp);
+   bool AllTrue = true, AllFalse = true;
+   APInt Val;
+   for (unsigned Idx = 0, Num = Lhs.size(); Idx != Num; ++Idx) {
+     bool Res;
+     if (!constToInt(Lhs.Values[Idx], Val) ||
+         !evaluateCMPpi(SwappedCmp, Props2, Val, Res))
+       return false;
+     AllTrue = AllTrue && Res;
+     AllFalse = AllFalse && !Res;
+   }
+   assert(!AllTrue || !AllFalse);
+   Result = AllTrue;
+   return AllTrue || AllFalse;
+ }
+
+ // Compare two known integers A1 and A2 according to Cmp and store the
+ // outcome in Result. The operands may have different bit widths; for the
+ // ordered comparisons they are extended to the wider of the two widths
+ // (zero-extended for unsigned, sign-extended for signed comparisons).
+ // Always returns true.
+ bool MachineConstEvaluator::evaluateCMPii(uint32_t Cmp, const APInt &A1,
+       const APInt &A2, bool &Result) {
+   const bool Same = APInt::isSameValue(A1, A2);
+   // EQ and NE are decided by value equality alone. NE is a special kind
+   // of comparison (not composed of smaller properties).
+   if (Cmp == Comparison::NE || Cmp == Comparison::EQ) {
+     Result = (Cmp == Comparison::EQ) ? Same : !Same;
+     return true;
+   }
+   // "Or equal" variants are satisfied by equal operands.
+   if ((Cmp & Comparison::EQ) && Same) {
+     Result = true;
+     return true;
+   }
+   assert((Cmp & (Comparison::L | Comparison::G)) && "Malformed comparison");
+
+   const bool WantLess = (Cmp & Comparison::L) != 0;
+   const bool WantGreater = (Cmp & Comparison::G) != 0;
+   const bool IsUnsigned = (Cmp & Comparison::U) != 0;
+
+   const unsigned Width1 = A1.getBitWidth();
+   const unsigned Width2 = A2.getBitWidth();
+   const unsigned Wide = (Width1 < Width2) ? Width2 : Width1;
+   const APInt Ext1 = IsUnsigned ? A1.zextOrSelf(Wide) : A1.sextOrSelf(Wide);
+   const APInt Ext2 = IsUnsigned ? A2.zextOrSelf(Wide) : A2.sextOrSelf(Wide);
+
+   if (WantLess)
+     Result = IsUnsigned ? Ext1.ult(Ext2) : Ext1.slt(Ext2);
+   else if (WantGreater)
+     Result = IsUnsigned ? Ext2.ult(Ext1) : Ext2.slt(Ext1);
+   else
+     Result = false;
+   return true;
+ }
+
+ // Evaluate the comparison "X Cmp A2", where X is a number about which only
+ // the properties Props are known and A2 is a known integer. Returns true
+ // if the outcome could be determined, storing it in Result.
+ bool MachineConstEvaluator::evaluateCMPpi(uint32_t Cmp, uint32_t Props,
+       const APInt &A2, bool &Result) {
+   // Nothing can be said without any properties. NaN should never be seen
+   // here, but is rejected for completeness. Infinity could theoretically
+   // be compared to a number, but its presence here would be very
+   // suspicious, so bail out unless X is known to be finite.
+   if (Props == ConstantProperties::Unknown ||
+       (Props & ConstantProperties::NaN) ||
+       !(Props & ConstantProperties::Finite))
+     return false;
+
+   const bool IsNE = (Cmp == Comparison::NE);
+   const bool HasEQ = (Cmp & Comparison::EQ) != 0;
+   const bool HasL = (Cmp & Comparison::L) != 0;
+   const bool HasG = (Cmp & Comparison::G) != 0;
+   const bool XIsZero = (Props & ConstantProperties::Zero) != 0;
+   const bool ImmIsZero = (A2 == 0);
+
+   if (Cmp & Comparison::U) {
+     // Unsigned comparisons are only decidable when one side is zero.
+     if (XIsZero) {
+       // 0 vs 0 holds for the "or equal" forms; 0 vs non-zero means X is
+       // below A2.
+       Result = ImmIsZero ? HasEQ : (HasL || IsNE);
+       return true;
+     }
+     if (ImmIsZero && (Props & ConstantProperties::NonZero)) {
+       // Any X != 0 is considered > 0 in an unsigned comparison.
+       Result = HasG || IsNE;
+       return true;
+     }
+     return false;
+   }
+
+   // Signed comparisons.
+   const bool ImmIsNeg = A2.isNegative();
+   if (XIsZero) {
+     if (ImmIsZero)
+       Result = HasEQ;
+     else
+       Result = IsNE || (HasL && !ImmIsNeg) || (HasG && ImmIsNeg);
+     return true;
+   }
+   if (Props & ConstantProperties::PosOrZero) {
+     // X >= 0 and A2 >= 0: cannot compare.
+     if (!ImmIsNeg)
+       return false;
+     // X >= 0 and A2 < 0.
+     Result = HasG || IsNE;
+     return true;
+   }
+   if (Props & ConstantProperties::NegOrZero) {
+     // X <= 0 and A2 <= 0: cannot compare.
+     if (ImmIsZero || ImmIsNeg)
+       return false;
+     // X <= 0 and A2 > 0.
+     Result = HasL || IsNE;
+     return true;
+   }
+
+   return false;
+ }
+
+ // Evaluate the comparison "X Cmp Y" where only the property sets Props1
+ // (for X) and Props2 (for Y) are known. Returns true if the outcome could
+ // be determined, storing it in Result.
+ bool MachineConstEvaluator::evaluateCMPpp(uint32_t Cmp, uint32_t Props1,
+       uint32_t Props2, bool &Result) {
+   using P = ConstantProperties;
+   const bool BothNaN = (Props1 & P::NaN) && (Props2 & P::NaN);
+   const bool BothFinite = (Props1 & P::Finite) && (Props2 & P::Finite);
+   if (BothNaN || !BothFinite)
+     return false;
+
+   const bool Zero1 = (Props1 & P::Zero) != 0;
+   const bool Zero2 = (Props2 & P::Zero) != 0;
+   const bool NonZero1 = (Props1 & P::NonZero) != 0;
+   const bool NonZero2 = (Props2 & P::NonZero) != 0;
+   const bool HasEQ = (Cmp & Comparison::EQ) != 0;
+   const bool HasL = (Cmp & Comparison::L) != 0;
+   const bool HasG = (Cmp & Comparison::G) != 0;
+
+   // Two zeros compare equal.
+   if (Zero1 && Zero2) {
+     Result = HasEQ;
+     return true;
+   }
+
+   const bool ZeroVsNonZero = Zero1 && NonZero2;
+   const bool NonZeroVsZero = NonZero1 && Zero2;
+   if (Cmp == Comparison::NE) {
+     // Inequality is only provable for zero against non-zero.
+     if (!ZeroVsNonZero && !NonZeroVsZero)
+       return false;
+     Result = true;
+     return true;
+   }
+
+   if (Cmp & Comparison::U) {
+     // In unsigned comparisons, we can only compare against a known zero,
+     // or a known non-zero.
+     if (ZeroVsNonZero) {
+       Result = HasL;
+       return true;
+     }
+     if (NonZeroVsZero) {
+       Result = HasG;
+       return true;
+     }
+     return false;
+   }
+
+   // Signed comparison. The comparison is not NE.
+   const bool Poz1 = (Props1 & P::PosOrZero) != 0;
+   const bool Poz2 = (Props2 & P::PosOrZero) != 0;
+   const bool Nez1 = (Props1 & P::NegOrZero) != 0;
+   const bool Nez2 = (Props2 & P::NegOrZero) != 0;
+   const bool EitherNonZero = NonZero1 || NonZero2;
+   if (Nez1 && Poz2) {
+     if (EitherNonZero) {
+       Result = HasL;
+       return true;
+     }
+     // Either (or both) could be zero. Can only say that X <= Y.
+     if (HasEQ && HasL) {
+       Result = true;
+       return true;
+     }
+   }
+   if (Poz1 && Nez2) {
+     if (EitherNonZero) {
+       Result = HasG;
+       return true;
+     }
+     // Either (or both) could be zero. Can only say that X >= Y.
+     if (HasEQ && HasG) {
+       Result = true;
+       return true;
+     }
+   }
+
+   return false;
+ }
+
+ // A copy yields exactly what is known about its source: fetch the cell
+ // of R1 into Result. Returns whatever getCell reports.
+ bool MachineConstEvaluator::evaluateCOPY(const Register &R1,
+       const CellMap &Inputs, LatticeCell &Result) {
+   const bool Known = getCell(R1, Inputs, Result);
+   return Known;
+ }
+
+ // Evaluate "R1 & R2" from the cells in Inputs and merge the outcome into
+ // Result. Returns true if the final Result is not bottom.
+ //
+ // The second operand is expanded into its known values and each one is
+ // handed to evaluateANDri as the immediate. If R2 is bottom but R1 is not,
+ // the operands are swapped so that the known operand becomes the
+ // immediate; this catches cases of ANDing with 0.
+ bool MachineConstEvaluator::evaluateANDrr(const Register &R1,
+       const Register &R2, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+   // Fix: L1 must describe R1. It used to be read from R2.Reg, which made
+   // L1 and L2 identical and the operand swap below unreachable.
+   const LatticeCell &L1 = Inputs.get(R1.Reg);
+   const LatticeCell &L2 = Inputs.get(R2.Reg);
+   // If both sources are bottom, exit. Otherwise try to evaluate ANDri
+   // with the non-bottom argument passed as the immediate.
+   if (L2.isBottom()) {
+     if (L1.isBottom())
+       return false;
+     // After the swap the second operand is non-bottom, so this recursion
+     // is at most one level deep.
+     return evaluateANDrr(R2, R1, Inputs, Result);
+   }
+   LatticeCell LS2;
+   if (!evaluate(R2, L2, LS2))
+     return false;
+   if (LS2.isBottom() || LS2.isProperty())
+     return false;
+
+   APInt A;
+   for (unsigned i = 0; i < LS2.size(); ++i) {
+     LatticeCell RC;
+     bool Eval = constToInt(LS2.Values[i], A) &&
+                 evaluateANDri(R1, A, Inputs, RC);
+     if (!Eval)
+       return false;
+     Result.meet(RC);
+   }
+   return !Result.isBottom();
+ }
+
+ // Evaluate "R1 & A2" where A2 is a known integer.
+ //  - An all-ones A2 leaves R1 unchanged, so Result becomes the cell of R1.
+ //  - A zero A2 gives zero regardless of R1; Result is overwritten with it.
+ //  - Otherwise every known value of R1 is ANDed with A2 and added to
+ //    Result.
+ // Returns true if a non-bottom result was produced.
+ bool MachineConstEvaluator::evaluateANDri(const Register &R1,
+       const APInt &A2, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg));
+   // Fix: "A2 == -1" compared against a 64-bit all-ones value and therefore
+   // never matched an all-ones mask narrower than 64 bits. Test the bit
+   // pattern of A2 itself instead.
+   if (A2.isAllOnesValue())
+     return getCell(R1, Inputs, Result);
+   if (A2 == 0) {
+     LatticeCell RC;
+     RC.add(intToConst(A2));
+     // Overwrite Result.
+     Result = RC;
+     return true;
+   }
+   LatticeCell LS1;
+   if (!getCell(R1, Inputs, LS1))
+     return false;
+   if (LS1.isBottom() || LS1.isProperty())
+     return false;
+
+   APInt A, ResA;
+   for (unsigned i = 0; i < LS1.size(); ++i) {
+     bool Eval = constToInt(LS1.Values[i], A) &&
+                 evaluateANDii(A, A2, ResA);
+     if (!Eval)
+       return false;
+     const Constant *C = intToConst(ResA);
+     Result.add(C);
+   }
+   return !Result.isBottom();
+ }
+
+ // Bitwise AND of two known integers. Always succeeds.
+ bool MachineConstEvaluator::evaluateANDii(const APInt &A1,
+       const APInt &A2, APInt &Result) {
+   APInt Anded = A1 & A2;
+   Result = Anded;
+   return true;
+ }
+
+ // Evaluate "R1 | R2" from the cells in Inputs and merge the outcome into
+ // Result. Returns true if the final Result is not bottom.
+ //
+ // The second operand is expanded into its known values and each one is
+ // handed to evaluateORri as the immediate. If R2 is bottom but R1 is not,
+ // the operands are swapped so that the known operand becomes the
+ // immediate; this catches cases of ORing with -1.
+ bool MachineConstEvaluator::evaluateORrr(const Register &R1,
+       const Register &R2, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+   // Fix: L1 must describe R1. It used to be read from R2.Reg, which made
+   // L1 and L2 identical and the operand swap below unreachable.
+   const LatticeCell &L1 = Inputs.get(R1.Reg);
+   const LatticeCell &L2 = Inputs.get(R2.Reg);
+   // If both sources are bottom, exit. Otherwise try to evaluate ORri
+   // with the non-bottom argument passed as the immediate.
+   if (L2.isBottom()) {
+     if (L1.isBottom())
+       return false;
+     // After the swap the second operand is non-bottom, so this recursion
+     // is at most one level deep.
+     return evaluateORrr(R2, R1, Inputs, Result);
+   }
+   LatticeCell LS2;
+   if (!evaluate(R2, L2, LS2))
+     return false;
+   if (LS2.isBottom() || LS2.isProperty())
+     return false;
+
+   APInt A;
+   for (unsigned i = 0; i < LS2.size(); ++i) {
+     LatticeCell RC;
+     bool Eval = constToInt(LS2.Values[i], A) &&
+                 evaluateORri(R1, A, Inputs, RC);
+     if (!Eval)
+       return false;
+     Result.meet(RC);
+   }
+   return !Result.isBottom();
+ }
+
+ // Evaluate "R1 | A2" where A2 is a known integer.
+ //  - A zero A2 leaves R1 unchanged, so Result becomes the cell of R1.
+ //  - An all-ones A2 gives all-ones regardless of R1; Result is overwritten
+ //    with it.
+ //  - Otherwise every known value of R1 is ORed with A2 and added to
+ //    Result.
+ // Returns true if a non-bottom result was produced.
+ bool MachineConstEvaluator::evaluateORri(const Register &R1,
+       const APInt &A2, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg));
+   if (A2 == 0)
+     return getCell(R1, Inputs, Result);
+   // Fix: "A2 == -1" compared against a 64-bit all-ones value and therefore
+   // never matched an all-ones operand narrower than 64 bits, so the
+   // shortcut was missed for 32-bit values. Test the bit pattern of A2.
+   if (A2.isAllOnesValue()) {
+     LatticeCell RC;
+     RC.add(intToConst(A2));
+     // Overwrite Result.
+     Result = RC;
+     return true;
+   }
+   LatticeCell LS1;
+   if (!getCell(R1, Inputs, LS1))
+     return false;
+   if (LS1.isBottom() || LS1.isProperty())
+     return false;
+
+   APInt A, ResA;
+   for (unsigned i = 0; i < LS1.size(); ++i) {
+     bool Eval = constToInt(LS1.Values[i], A) &&
+                 evaluateORii(A, A2, ResA);
+     if (!Eval)
+       return false;
+     const Constant *C = intToConst(ResA);
+     Result.add(C);
+   }
+   return !Result.isBottom();
+ }
+
+ // Bitwise OR of two known integers. Always succeeds.
+ bool MachineConstEvaluator::evaluateORii(const APInt &A1,
+       const APInt &A2, APInt &Result) {
+   APInt Ored = A1 | A2;
+   Result = Ored;
+   return true;
+ }
+
+ // Evaluate "R1 ^ R2" from the cells in Inputs. If one operand is only
+ // known through properties, the operation can be evaluated just when that
+ // operand is known to be zero, in which case the result is the cell of the
+ // other operand. Otherwise R1 is XORed with each known value of R2 and
+ // the outcomes are merged into Result. Returns true if Result ends up
+ // non-bottom.
+ bool MachineConstEvaluator::evaluateXORrr(const Register &R1,
+       const Register &R2, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+   LatticeCell Lhs, Rhs;
+   if (!getCell(R1, Inputs, Lhs))
+     return false;
+   if (!getCell(R2, Inputs, Rhs))
+     return false;
+
+   if (Lhs.isProperty()) {
+     if (!(Lhs.properties() & ConstantProperties::Zero))
+       return false;
+     // 0 ^ R2 == R2.
+     Result = Rhs;
+     return !Result.isBottom();
+   }
+   if (Rhs.isProperty()) {
+     if (!(Rhs.properties() & ConstantProperties::Zero))
+       return false;
+     // R1 ^ 0 == R1.
+     Result = Lhs;
+     return !Result.isBottom();
+   }
+
+   APInt Imm;
+   for (unsigned Idx = 0, Num = Rhs.size(); Idx != Num; ++Idx) {
+     LatticeCell Partial;
+     if (!constToInt(Rhs.Values[Idx], Imm))
+       return false;
+     if (!evaluateXORri(R1, Imm, Inputs, Partial))
+       return false;
+     Result.meet(Partial);
+   }
+   return !Result.isBottom();
+ }
+
+ // Evaluate "R1 ^ A2" where A2 is a known integer. When R1 is only known
+ // through properties, the result is A2 if R1 is known to be zero, and
+ // cannot be determined otherwise. With concrete values, each value of R1
+ // is XORed with A2 and added to Result. Returns true if Result ends up
+ // non-bottom.
+ bool MachineConstEvaluator::evaluateXORri(const Register &R1,
+       const APInt &A2, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg));
+   LatticeCell Src;
+   if (!getCell(R1, Inputs, Src))
+     return false;
+
+   if (Src.isProperty()) {
+     if (!(Src.properties() & ConstantProperties::Zero))
+       return false;
+     // 0 ^ A2 == A2.
+     Result.add(intToConst(A2));
+     return !Result.isBottom();
+   }
+
+   APInt In, Out;
+   for (unsigned Idx = 0, Num = Src.size(); Idx != Num; ++Idx) {
+     if (!constToInt(Src.Values[Idx], In))
+       return false;
+     if (!evaluateXORii(In, A2, Out))
+       return false;
+     Result.add(intToConst(Out));
+   }
+   return !Result.isBottom();
+ }
+
+ // Bitwise XOR of two known integers. Always succeeds.
+ bool MachineConstEvaluator::evaluateXORii(const APInt &A1,
+       const APInt &A2, APInt &Result) {
+   APInt Xored = A1 ^ A2;
+   Result = Xored;
+   return true;
+ }
+
+ // Zero-extend the low Bits bits of every known value of R1 to Width bits
+ // and add each outcome to Result. Fails if R1 has no usable cell, is
+ // described by properties only, or holds a non-integer constant.
+ bool MachineConstEvaluator::evaluateZEXTr(const Register &R1, unsigned Width,
+       unsigned Bits, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg));
+   LatticeCell Src;
+   if (!getCell(R1, Inputs, Src))
+     return false;
+   if (Src.isProperty())
+     return false;
+
+   APInt In, Out;
+   for (unsigned Idx = 0, Num = Src.size(); Idx != Num; ++Idx) {
+     if (!constToInt(Src.Values[Idx], In))
+       return false;
+     if (!evaluateZEXTi(In, Width, Bits, Out))
+       return false;
+     Result.add(intToConst(Out));
+   }
+   return true;
+ }
+
+ // Zero-extend the low Bits bits of A1 into a Width-bit value: A1 is
+ // resized to Width bits and everything above the low Bits bits is
+ // cleared. Always succeeds.
+ bool MachineConstEvaluator::evaluateZEXTi(const APInt &A1, unsigned Width,
+       unsigned Bits, APInt &Result) {
+   const unsigned SrcWidth = A1.getBitWidth();
+   (void)SrcWidth;
+   assert(Width >= Bits && SrcWidth >= Bits);
+   const APInt Resized = A1.zextOrTrunc(Width);
+   const APInt LowMask = APInt::getLowBitsSet(Width, Bits);
+   Result = Resized & LowMask;
+   return true;
+ }
+
+ // Sign-extend the low Bits bits of every known value of R1 to Width bits
+ // and add each outcome to Result. Fails if R1 has no usable cell, is
+ // bottom or described by properties only, or holds a non-integer constant.
+ bool MachineConstEvaluator::evaluateSEXTr(const Register &R1, unsigned Width,
+       unsigned Bits, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg));
+   LatticeCell Src;
+   if (!getCell(R1, Inputs, Src))
+     return false;
+   if (Src.isBottom())
+     return false;
+   if (Src.isProperty())
+     return false;
+
+   APInt In, Out;
+   for (unsigned Idx = 0, Num = Src.size(); Idx != Num; ++Idx) {
+     if (!constToInt(Src.Values[Idx], In))
+       return false;
+     if (!evaluateSEXTi(In, Width, Bits, Out))
+       return false;
+     Result.add(intToConst(Out));
+   }
+   return true;
+ }
+
+ // Sign-extend the low Bits bits of A1 into a Width-bit value stored in
+ // Result. Requires Bits <= Width and Bits <= bit width of A1. Always
+ // succeeds.
+ bool MachineConstEvaluator::evaluateSEXTi(const APInt &A1, unsigned Width,
+       unsigned Bits, APInt &Result) {
+   unsigned BW = A1.getBitWidth();
+   assert(Width >= Bits && BW >= Bits);
+   // Sign extension of 0 bits generates 0 as a result. This is consistent
+   // with what the HW does.
+   if (Bits == 0) {
+     Result = APInt(Width, 0);
+     return true;
+   }
+   // Fast path for sources that fit in 64 bits. Here 1 <= Bits <= 64, so
+   // the shift amount below is in the range [0, 63].
+   if (BW <= 64) {
+     // Shift left to lose all bits except the lower "Bits" bits, then shift
+     // the value back, replicating what became the sign bit. The left shift
+     // is done on an unsigned value: left-shifting a negative signed
+     // integer (as the previous code did) is undefined behavior.
+     const unsigned Sh = 64 - Bits;
+     const uint64_t Raw = A1.getZExtValue();
+     const int64_t V = static_cast<int64_t>(Raw << Sh) >> Sh;
+     // V is a 64-bit sign-extended value. Convert it to APInt of desired
+     // width.
+     Result = APInt(Width, V, true);
+     return true;
+   }
+   // Slow case: the value doesn't fit in int64_t.
+   if (Bits < BW)
+     Result = A1.trunc(Bits).sext(Width);
+   else // Bits == BW
+     Result = A1.sext(Width);
+   return true;
+ }
+
+ // Count leading bits (zeros and/or ones, as selected by the flags) in
+ // every known value of R1 and add each count to Result. Fails if R1 has
+ // no usable cell, is bottom or described by properties only, or holds a
+ // non-integer constant.
+ bool MachineConstEvaluator::evaluateCLBr(const Register &R1, bool Zeros,
+       bool Ones, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg));
+   LatticeCell Src;
+   if (!getCell(R1, Inputs, Src))
+     return false;
+   if (Src.isBottom())
+     return false;
+   if (Src.isProperty())
+     return false;
+
+   APInt In, Out;
+   for (unsigned Idx = 0, Num = Src.size(); Idx != Num; ++Idx) {
+     if (!constToInt(Src.Values[Idx], In))
+       return false;
+     if (!evaluateCLBi(In, Zeros, Ones, Out))
+       return false;
+     Result.add(intToConst(Out));
+   }
+   return true;
+ }
+
+ // Count the leading bits of A1. With Zeros set, leading zeros are
+ // counted; with Ones set, leading ones are counted whenever no zeros were
+ // counted. The count is returned in Result with the same bit width as A1.
+ // Fails if neither flag is set.
+ bool MachineConstEvaluator::evaluateCLBi(const APInt &A1, bool Zeros,
+       bool Ones, APInt &Result) {
+   if (!Zeros && !Ones)
+     return false;
+   unsigned Leading = Zeros ? A1.countLeadingZeros() : 0;
+   if (Ones && Leading == 0)
+     Leading = A1.countLeadingOnes();
+   const unsigned SrcWidth = A1.getBitWidth();
+   Result = APInt(SrcWidth, static_cast<uint64_t>(Leading), false);
+   return true;
+ }
+
+ // Count trailing bits (zeros and/or ones, as selected by the flags) in
+ // every known value of R1 and add each count to Result. Fails if R1 has
+ // no usable cell, is bottom or described by properties only, or holds a
+ // non-integer constant.
+ bool MachineConstEvaluator::evaluateCTBr(const Register &R1, bool Zeros,
+       bool Ones, const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg));
+   LatticeCell Src;
+   if (!getCell(R1, Inputs, Src))
+     return false;
+   if (Src.isBottom())
+     return false;
+   if (Src.isProperty())
+     return false;
+
+   APInt In, Out;
+   for (unsigned Idx = 0, Num = Src.size(); Idx != Num; ++Idx) {
+     if (!constToInt(Src.Values[Idx], In))
+       return false;
+     if (!evaluateCTBi(In, Zeros, Ones, Out))
+       return false;
+     Result.add(intToConst(Out));
+   }
+   return true;
+ }
+
+ // Count the trailing bits of A1. With Zeros set, trailing zeros are
+ // counted; with Ones set, trailing ones are counted whenever no zeros
+ // were counted. The count is returned in Result with the same bit width
+ // as A1. Fails if neither flag is set.
+ bool MachineConstEvaluator::evaluateCTBi(const APInt &A1, bool Zeros,
+       bool Ones, APInt &Result) {
+   if (!Zeros && !Ones)
+     return false;
+   unsigned Trailing = Zeros ? A1.countTrailingZeros() : 0;
+   if (Ones && Trailing == 0)
+     Trailing = A1.countTrailingOnes();
+   const unsigned SrcWidth = A1.getBitWidth();
+   Result = APInt(SrcWidth, static_cast<uint64_t>(Trailing), false);
+   return true;
+ }
+
+ // Extract the bitfield of Bits bits starting at bit Offset from every
+ // known value of R1 (sign-extending it if Signed) and add each outcome to
+ // Result. If R1 is only known through properties, the result is a
+ // Width-bit zero when R1 is known to be zero, and undetermined otherwise.
+ bool MachineConstEvaluator::evaluateEXTRACTr(const Register &R1,
+       unsigned Width, unsigned Bits, unsigned Offset, bool Signed,
+       const CellMap &Inputs, LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg));
+   assert(Bits+Offset <= Width);
+   LatticeCell Src;
+   if (!getCell(R1, Inputs, Src))
+     return false;
+   if (Src.isBottom())
+     return false;
+
+   if (Src.isProperty()) {
+     const uint32_t Known = Src.properties();
+     if (!(Known & ConstantProperties::Zero))
+       return false;
+     // Any field extracted from zero is zero.
+     Result.add(intToConst(APInt(Width, 0, false)));
+     return true;
+   }
+
+   APInt In, Out;
+   for (unsigned Idx = 0, Num = Src.size(); Idx != Num; ++Idx) {
+     if (!constToInt(Src.Values[Idx], In))
+       return false;
+     if (!evaluateEXTRACTi(In, Bits, Offset, Signed, Out))
+       return false;
+     Result.add(intToConst(Out));
+   }
+   return true;
+ }
+
+ // Extract the bitfield of Bits bits starting at bit Offset from A1. The
+ // field is sign-extended if Signed is set and zero-extended otherwise.
+ // Result has the same bit width as A1. Requires Bits+Offset to be no
+ // greater than that width. Always succeeds.
+ bool MachineConstEvaluator::evaluateEXTRACTi(const APInt &A1, unsigned Bits,
+       unsigned Offset, bool Signed, APInt &Result) {
+   unsigned BW = A1.getBitWidth();
+   assert(Bits+Offset <= BW);
+   // Extracting 0 bits generates 0 as a result (as indicated by the HW people).
+   if (Bits == 0) {
+     Result = APInt(BW, 0);
+     return true;
+   }
+   if (BW <= 64) {
+     // Move the field to the top of a 64-bit word, then shift it back down
+     // with the requested kind of extension. The left shift is done on an
+     // unsigned value: the previous code shifted an int64_t, which is
+     // undefined behavior when bits are shifted into or past the sign bit.
+     // Both shift amounts are in [0, 63] since 1 <= Bits and
+     // Bits+Offset <= 64.
+     const uint64_t Top = A1.getZExtValue() << (64-Bits-Offset);
+     uint64_t V;
+     if (Signed)
+       V = static_cast<uint64_t>(static_cast<int64_t>(Top) >> (64-Bits));
+     else
+       V = Top >> (64-Bits);
+     Result = APInt(BW, V, Signed);
+     return true;
+   }
+   // Wide values: do the same with APInt shifts.
+   if (Signed)
+     Result = A1.shl(BW-Bits-Offset).ashr(BW-Bits);
+   else
+     Result = A1.shl(BW-Bits-Offset).lshr(BW-Bits);
+   return true;
+ }
+
+ // For every known value of R1, build the value made of Count copies of
+ // its low Bits bits and add it to Result. Fails if R1 has no usable cell,
+ // is bottom or described by properties only, or holds a non-integer
+ // constant.
+ bool MachineConstEvaluator::evaluateSplatr(const Register &R1,
+       unsigned Bits, unsigned Count, const CellMap &Inputs,
+       LatticeCell &Result) {
+   assert(Inputs.has(R1.Reg));
+   LatticeCell Src;
+   if (!getCell(R1, Inputs, Src))
+     return false;
+   if (Src.isBottom())
+     return false;
+   if (Src.isProperty())
+     return false;
+
+   APInt In, Out;
+   for (unsigned Idx = 0, Num = Src.size(); Idx != Num; ++Idx) {
+     if (!constToInt(Src.Values[Idx], In))
+       return false;
+     if (!evaluateSplati(In, Bits, Count, Out))
+       return false;
+     Result.add(intToConst(Out));
+   }
+   return true;
+ }
+
+ // Build a value of Count*Bits bits consisting of Count copies of the low
+ // Bits bits of A1 and store it in Result. Always succeeds.
+ bool MachineConstEvaluator::evaluateSplati(const APInt &A1, unsigned Bits,
+       unsigned Count, APInt &Result) {
+   assert(Count > 0);
+   const unsigned SrcWidth = A1.getBitWidth();
+   const unsigned SplatWidth = Count*Bits;
+
+   // The element to replicate: the low Bits bits of A1, widened to the
+   // width of the final value when more than one copy is requested.
+   APInt Elem;
+   if (Bits < SrcWidth)
+     Elem = A1.trunc(Bits);
+   else
+     Elem = A1.zextOrSelf(Bits);
+   if (Count > 1)
+     Elem = Elem.zext(SplatWidth);
+
+   // Shift the accumulated value up by one element and insert a new copy,
+   // Count times.
+   APInt Splat(SplatWidth, 0, false);
+   for (unsigned Copy = 0; Copy != Count; ++Copy) {
+     Splat <<= Bits;
+     Splat |= Elem;
+   }
+   Result = Splat;
+   return true;
+ }
+
+// ----------------------------------------------------------------------
+// Hexagon-specific code.
+
+ // Forward declarations of the entry points of this pass.
+ namespace llvm {
+
+ // Returns a FunctionPass; presumably a new instance of the constant
+ // propagation pass declared below (definition not in view; confirm).
+ FunctionPass *createHexagonConstPropagationPass();
+ // Called from the constructor of HexagonConstPropagation with the global
+ // pass registry.
+ void initializeHexagonConstPropagationPass(PassRegistry &Registry);
+
+ } // end namespace llvm
+
+namespace {
+
+ // Hexagon-specific implementation of the MachineConstEvaluator hooks:
+ // evaluation of instructions, subregisters and branches, and rewriting of
+ // instructions based on the computed lattice cells.
+ class HexagonConstEvaluator : public MachineConstEvaluator {
+ public:
+ HexagonConstEvaluator(MachineFunction &Fn);
+
+ // Evaluate the instruction MI given the cells in Inputs, recording the
+ // cells of the registers it defines in Outputs. Returns false if the
+ // instruction could not be evaluated.
+ bool evaluate(const MachineInstr &MI, const CellMap &Inputs,
+ CellMap &Outputs) override;
+ // Compute the cell of register R (which may name a subregister) from
+ // the cell SrcC of the whole register.
+ bool evaluate(const Register &R, const LatticeCell &SrcC,
+ LatticeCell &Result) override;
+ // Evaluate the branch BrI, collecting its possible targets in Targets
+ // and reporting through FallsThru whether it can fall through.
+ bool evaluate(const MachineInstr &BrI, const CellMap &Inputs,
+ SetVector<const MachineBasicBlock*> &Targets, bool &FallsThru)
+ override;
+ // Rewrite MI using the cells in Inputs; the propagator ORs the returned
+ // value into its "changed" flag.
+ bool rewrite(MachineInstr &MI, const CellMap &Inputs) override;
+
+ private:
+ // Bit width of register Reg (used by the evaluators to pick 32- or
+ // 64-bit constants).
+ unsigned getRegBitWidth(unsigned Reg) const;
+
+ static uint32_t getCmp(unsigned Opc);
+ static APInt getCmpImm(unsigned Opc, unsigned OpX,
+ const MachineOperand &MO);
+ void replaceWithNop(MachineInstr &MI);
+
+ // Helpers used by evaluate(MI, ...) for individual groups of opcodes.
+ bool evaluateHexRSEQ32(Register RL, Register RH, const CellMap &Inputs,
+ LatticeCell &Result);
+ bool evaluateHexCompare(const MachineInstr &MI, const CellMap &Inputs,
+ CellMap &Outputs);
+ // This is suitable to be called for compare-and-jump instructions.
+ bool evaluateHexCompare2(uint32_t Cmp, const MachineOperand &Src1,
+ const MachineOperand &Src2, const CellMap &Inputs, bool &Result);
+ bool evaluateHexLogical(const MachineInstr &MI, const CellMap &Inputs,
+ CellMap &Outputs);
+ bool evaluateHexCondMove(const MachineInstr &MI, const CellMap &Inputs,
+ CellMap &Outputs);
+ bool evaluateHexExt(const MachineInstr &MI, const CellMap &Inputs,
+ CellMap &Outputs);
+ bool evaluateHexVector1(const MachineInstr &MI, const CellMap &Inputs,
+ CellMap &Outputs);
+ bool evaluateHexVector2(const MachineInstr &MI, const CellMap &Inputs,
+ CellMap &Outputs);
+
+ // Rewriting helpers (definitions not in view).
+ void replaceAllRegUsesWith(unsigned FromReg, unsigned ToReg);
+ bool rewriteHexBranch(MachineInstr &BrI, const CellMap &Inputs);
+ bool rewriteHexConstDefs(MachineInstr &MI, const CellMap &Inputs,
+ bool &AllDefs);
+ bool rewriteHexConstUses(MachineInstr &MI, const CellMap &Inputs);
+
+ // Register info of the function being processed, and the Hexagon
+ // instruction and register descriptions; all set up by the constructor.
+ MachineRegisterInfo *MRI;
+ const HexagonInstrInfo &HII;
+ const HexagonRegisterInfo &HRI;
+ };
+
+ // Machine function pass that runs the generic constant propagator with
+ // the Hexagon-specific evaluator.
+ class HexagonConstPropagation : public MachineFunctionPass {
+ public:
+   static char ID;
+
+   HexagonConstPropagation() : MachineFunctionPass(ID) {
+     initializeHexagonConstPropagationPass(*PassRegistry::getPassRegistry());
+   }
+
+   StringRef getPassName() const override {
+     return "Hexagon Constant Propagation";
+   }
+
+   // Returns true if the function was changed. Nothing is done when MF has
+   // no associated IR function or when skipFunction() asks to skip it.
+   bool runOnMachineFunction(MachineFunction &MF) override {
+     const Function *Fn = MF.getFunction();
+     if (!Fn || skipFunction(*Fn))
+       return false;
+
+     HexagonConstEvaluator Evaluator(MF);
+     return MachineConstPropagator(Evaluator).run(MF);
+   }
+ };
+
+ char HexagonConstPropagation::ID = 0;
+
+} // end anonymous namespace
+
+ // Pass registration: associates HexagonConstPropagation with the argument
+ // string "hcp" and the name "Hexagon Constant Propagation".
+ // NOTE(review): the two trailing flags are presumably "CFG only" and
+ // "is analysis" -- confirm against the INITIALIZE_PASS macro definition.
+ INITIALIZE_PASS(HexagonConstPropagation, "hcp", "Hexagon Constant Propagation",
+ false, false)
+
+ // Cache the register info of Fn together with the Hexagon instruction
+ // and register descriptions obtained from the function's subtarget.
+ HexagonConstEvaluator::HexagonConstEvaluator(MachineFunction &Fn)
+   : MachineConstEvaluator(Fn),
+     MRI(&Fn.getRegInfo()),
+     HII(*Fn.getSubtarget<HexagonSubtarget>().getInstrInfo()),
+     HRI(*Fn.getSubtarget<HexagonSubtarget>().getRegisterInfo()) {
+ }
+
+ // Evaluate the instruction MI using the register cells in Inputs and
+ // record the cell of the register it defines in Outputs.
+ //
+ // Only instructions whose first operand is a definition of a virtual
+ // register (without a subregister) are handled; calls are never
+ // evaluated. Returns false if the instruction is not handled or if its
+ // result could not be determined.
+ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
+       const CellMap &Inputs, CellMap &Outputs) {
+   if (MI.isCall())
+     return false;
+   if (MI.getNumOperands() == 0 || !MI.getOperand(0).isReg())
+     return false;
+   const MachineOperand &MD = MI.getOperand(0);
+   if (!MD.isDef())
+     return false;
+
+   unsigned Opc = MI.getOpcode();
+   Register DefR(MD);
+   assert(!DefR.SubReg);
+   if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg))
+     return false;
+
+   if (MI.isCopy()) {
+     LatticeCell RC;
+     Register SrcR(MI.getOperand(1));
+     bool Eval = evaluateCOPY(SrcR, Inputs, RC);
+     if (!Eval)
+       return false;
+     Outputs.update(DefR.Reg, RC);
+     return true;
+   }
+   if (MI.isRegSequence()) {
+     // Only sequences made of a low and a high subregister are handled.
+     unsigned Sub1 = MI.getOperand(2).getImm();
+     unsigned Sub2 = MI.getOperand(4).getImm();
+     const TargetRegisterClass *DefRC = MRI->getRegClass(DefR.Reg);
+     unsigned SubLo = HRI.getHexagonSubRegIndex(DefRC, Hexagon::ps_sub_lo);
+     unsigned SubHi = HRI.getHexagonSubRegIndex(DefRC, Hexagon::ps_sub_hi);
+     if (Sub1 != SubLo && Sub1 != SubHi)
+       return false;
+     if (Sub2 != SubLo && Sub2 != SubHi)
+       return false;
+     assert(Sub1 != Sub2);
+     bool LoIs1 = (Sub1 == SubLo);
+     const MachineOperand &OpLo = LoIs1 ? MI.getOperand(1) : MI.getOperand(3);
+     const MachineOperand &OpHi = LoIs1 ? MI.getOperand(3) : MI.getOperand(1);
+     LatticeCell RC;
+     Register SrcRL(OpLo), SrcRH(OpHi);
+     bool Eval = evaluateHexRSEQ32(SrcRL, SrcRH, Inputs, RC);
+     if (!Eval)
+       return false;
+     Outputs.update(DefR.Reg, RC);
+     return true;
+   }
+   if (MI.isCompare()) {
+     bool Eval = evaluateHexCompare(MI, Inputs, Outputs);
+     return Eval;
+   }
+
+   switch (Opc) {
+     default:
+       return false;
+     case Hexagon::A2_tfrsi:
+     case Hexagon::A2_tfrpi:
+     case Hexagon::CONST32:
+     case Hexagon::CONST64:
+     {
+       const MachineOperand &VO = MI.getOperand(1);
+       // The operand of CONST32 can be a blockaddress, e.g.
+       //   %vreg0<def> = CONST32 <blockaddress(@eat, %L)>
+       // Do this check for all instructions for safety.
+       if (!VO.isImm())
+         return false;
+       int64_t V = VO.getImm();
+       unsigned W = getRegBitWidth(DefR.Reg);
+       if (W != 32 && W != 64)
+         return false;
+       IntegerType *Ty = (W == 32) ? Type::getInt32Ty(CX)
+                                   : Type::getInt64Ty(CX);
+       const ConstantInt *CI = ConstantInt::get(Ty, V, true);
+       LatticeCell RC = Outputs.get(DefR.Reg);
+       RC.add(CI);
+       Outputs.update(DefR.Reg, RC);
+       break;
+     }
+
+     case Hexagon::PS_true:
+     case Hexagon::PS_false:
+     {
+       // The predicate value is only recorded as a property.
+       LatticeCell RC = Outputs.get(DefR.Reg);
+       bool NonZero = (Opc == Hexagon::PS_true);
+       uint32_t P = NonZero ? ConstantProperties::NonZero
+                            : ConstantProperties::Zero;
+       RC.add(P);
+       Outputs.update(DefR.Reg, RC);
+       break;
+     }
+
+     case Hexagon::A2_and:
+     case Hexagon::A2_andir:
+     case Hexagon::A2_andp:
+     case Hexagon::A2_or:
+     case Hexagon::A2_orir:
+     case Hexagon::A2_orp:
+     case Hexagon::A2_xor:
+     case Hexagon::A2_xorp:
+     {
+       bool Eval = evaluateHexLogical(MI, Inputs, Outputs);
+       if (!Eval)
+         return false;
+       break;
+     }
+
+     case Hexagon::A2_combineii:  // combine(#s8Ext, #s8)
+     case Hexagon::A4_combineii:  // combine(#s8, #u6Ext)
+     {
+       // The first immediate forms the high word, the second the low word.
+       uint64_t Hi = MI.getOperand(1).getImm();
+       uint64_t Lo = MI.getOperand(2).getImm();
+       uint64_t Res = (Hi << 32) | (Lo & 0xFFFFFFFF);
+       IntegerType *Ty = Type::getInt64Ty(CX);
+       const ConstantInt *CI = ConstantInt::get(Ty, Res, false);
+       LatticeCell RC = Outputs.get(DefR.Reg);
+       RC.add(CI);
+       Outputs.update(DefR.Reg, RC);
+       break;
+     }
+
+     case Hexagon::S2_setbit_i:
+     {
+       // Setting bit B is evaluated as an OR with (1 << B).
+       int64_t B = MI.getOperand(2).getImm();
+       assert(B >=0 && B < 32);
+       APInt A(32, (1ull << B), false);
+       Register R(MI.getOperand(1));
+       LatticeCell RC = Outputs.get(DefR.Reg);
+       bool Eval = evaluateORri(R, A, Inputs, RC);
+       if (!Eval)
+         return false;
+       Outputs.update(DefR.Reg, RC);
+       break;
+     }
+
+     case Hexagon::C2_mux:
+     case Hexagon::C2_muxir:
+     case Hexagon::C2_muxri:
+     case Hexagon::C2_muxii:
+     {
+       bool Eval = evaluateHexCondMove(MI, Inputs, Outputs);
+       if (!Eval)
+         return false;
+       break;
+     }
+
+     case Hexagon::A2_sxtb:
+     case Hexagon::A2_sxth:
+     case Hexagon::A2_sxtw:
+     case Hexagon::A2_zxtb:
+     case Hexagon::A2_zxth:
+     {
+       bool Eval = evaluateHexExt(MI, Inputs, Outputs);
+       if (!Eval)
+         return false;
+       break;
+     }
+
+     case Hexagon::S2_ct0:
+     case Hexagon::S2_ct0p:
+     case Hexagon::S2_ct1:
+     case Hexagon::S2_ct1p:
+     {
+       using namespace Hexagon;
+
+       bool Ones = (Opc == S2_ct1) || (Opc == S2_ct1p);
+       Register R1(MI.getOperand(1));
+       assert(Inputs.has(R1.Reg));
+       LatticeCell T;
+       bool Eval = evaluateCTBr(R1, !Ones, Ones, Inputs, T);
+       if (!Eval)
+         return false;
+       // All of these instructions return a 32-bit value. The evaluate
+       // will generate the same type as the operand, so truncate the
+       // result if necessary.
+       APInt C;
+       LatticeCell RC = Outputs.get(DefR.Reg);
+       for (unsigned i = 0; i < T.size(); ++i) {
+         const Constant *CI = T.Values[i];
+         if (constToInt(CI, C) && C.getBitWidth() > 32)
+           CI = intToConst(C.trunc(32));
+         RC.add(CI);
+       }
+       Outputs.update(DefR.Reg, RC);
+       break;
+     }
+
+     case Hexagon::S2_cl0:
+     case Hexagon::S2_cl0p:
+     case Hexagon::S2_cl1:
+     case Hexagon::S2_cl1p:
+     case Hexagon::S2_clb:
+     case Hexagon::S2_clbp:
+     {
+       using namespace Hexagon;
+
+       // cl0 counts zeros only, cl1 counts ones only, clb counts either.
+       bool OnlyZeros = (Opc == S2_cl0) || (Opc == S2_cl0p);
+       bool OnlyOnes =  (Opc == S2_cl1) || (Opc == S2_cl1p);
+       Register R1(MI.getOperand(1));
+       assert(Inputs.has(R1.Reg));
+       LatticeCell T;
+       bool Eval = evaluateCLBr(R1, !OnlyOnes, !OnlyZeros, Inputs, T);
+       if (!Eval)
+         return false;
+       // All of these instructions return a 32-bit value. The evaluate
+       // will generate the same type as the operand, so truncate the
+       // result if necessary.
+       APInt C;
+       LatticeCell RC = Outputs.get(DefR.Reg);
+       for (unsigned i = 0; i < T.size(); ++i) {
+         const Constant *CI = T.Values[i];
+         if (constToInt(CI, C) && C.getBitWidth() > 32)
+           CI = intToConst(C.trunc(32));
+         RC.add(CI);
+       }
+       Outputs.update(DefR.Reg, RC);
+       break;
+     }
+
+     case Hexagon::S4_extract:
+     case Hexagon::S4_extractp:
+     case Hexagon::S2_extractu:
+     case Hexagon::S2_extractup:
+     {
+       bool Signed = (Opc == Hexagon::S4_extract) ||
+                     (Opc == Hexagon::S4_extractp);
+       Register R1(MI.getOperand(1));
+       unsigned BW = getRegBitWidth(R1.Reg);
+       unsigned Bits = MI.getOperand(2).getImm();
+       unsigned Offset = MI.getOperand(3).getImm();
+       LatticeCell RC = Outputs.get(DefR.Reg);
+       if (Offset >= BW) {
+         // The requested field starts beyond the register: record 0.
+         APInt Zero(BW, 0, false);
+         RC.add(intToConst(Zero));
+         // Fix: the cell used to be built and then dropped, because the
+         // code left the switch without storing it in Outputs.
+         Outputs.update(DefR.Reg, RC);
+         break;
+       }
+       if (Offset+Bits > BW) {
+         // If the requested bitfield extends beyond the most significant bit,
+         // the extra bits are treated as 0s. To emulate this behavior, reduce
+         // the number of requested bits, and make the extract unsigned.
+         Bits = BW-Offset;
+         Signed = false;
+       }
+       bool Eval = evaluateEXTRACTr(R1, BW, Bits, Offset, Signed, Inputs, RC);
+       if (!Eval)
+         return false;
+       Outputs.update(DefR.Reg, RC);
+       break;
+     }
+
+     case Hexagon::S2_vsplatrb:
+     case Hexagon::S2_vsplatrh:
+     // vabsh, vabsh:sat
+     // vabsw, vabsw:sat
+     // vconj:sat
+     // vrndwh, vrndwh:sat
+     // vsathb, vsathub, vsatwuh
+     // vsxtbh, vsxthw
+     // vtrunehb, vtrunohb
+     // vzxtbh, vzxthw
+     {
+       bool Eval = evaluateHexVector1(MI, Inputs, Outputs);
+       if (!Eval)
+         return false;
+       break;
+     }
+
+     // TODO:
+     // A2_vaddh
+     // A2_vaddhs
+     // A2_vaddw
+     // A2_vaddws
+   }
+
+   return true;
+ }
+
+/// Compute the lattice cell seen through the register reference \p R, given
+/// the cell \p Input of the whole register R.Reg. Without a subregister the
+/// input cell is copied to \p Result unchanged. Otherwise only the
+/// isub_lo/isub_hi halves of a DoubleRegs register are handled: either the
+/// applicable properties or the selected 32-bit word of every known value
+/// is added to \p Result. Returns false when nothing can be determined.
+bool HexagonConstEvaluator::evaluate(const Register &R,
+      const LatticeCell &Input, LatticeCell &Result) {
+  // A reference to the full register simply inherits the cell.
+  if (R.SubReg == 0) {
+    Result = Input;
+    return true;
+  }
+
+  const TargetRegisterClass *RegClass = MRI->getRegClass(R.Reg);
+  const bool IsLo = (R.SubReg == Hexagon::isub_lo);
+  const bool IsHi = (R.SubReg == Hexagon::isub_hi);
+  if (RegClass != &Hexagon::DoubleRegsRegClass || (!IsLo && !IsHi))
+    return false;
+
+  assert(!Input.isTop());
+  if (Input.isBottom())
+    return false;
+
+  if (Input.isProperty()) {
+    typedef ConstantProperties P;
+    const uint32_t Props = Input.properties();
+    uint32_t Kept;
+    if (Props & (P::Zero|P::NaN))
+      // With Zero or NaN set, forward those bits together with the sign
+      // properties, regardless of which half is requested.
+      Kept = Props & (P::Zero|P::NaN|P::SignProperties);
+    else if (IsHi)
+      // Otherwise only the high word keeps the sign properties.
+      Kept = Props & P::SignProperties;
+    else
+      return false;
+    Result.add(Kept);
+    return true;
+  }
+
+  // The cell holds concrete values: extract the word that corresponds to
+  // the subregister from each of them.
+  APInt Wide;
+  for (unsigned Idx = 0, Num = Input.size(); Idx != Num; ++Idx) {
+    if (!constToInt(Input.Values[Idx], Wide) || !Wide.isIntN(64))
+      return false;
+    uint64_t Bits = Wide.getZExtValue();
+    if (IsHi)
+      Bits >>= 32;
+    const uint32_t Word = Lo_32(Bits & 0xFFFFFFFFULL);
+    // Reinterpret the word as signed without relying on an
+    // implementation-defined integer conversion.
+    int32_t SWord;
+    memcpy(&SWord, &Word, sizeof SWord);
+    IntegerType *Int32Ty = Type::getInt32Ty(CX);
+    const ConstantInt *WordC =
+        ConstantInt::get(Int32Ty, static_cast<int64_t>(SWord));
+    Result.add(WordC);
+  }
+  return true;
+}
+
+/// Evaluate the branch \p BrI using the cells in \p Inputs. On success the
+/// block the branch is known to jump to (if any) is added to \p Targets and
+/// \p FallsThru tells whether execution continues past the branch. When the
+/// branch cannot be resolved, false is returned and \p FallsThru is set for
+/// every branch that is not unconditional.
+bool HexagonConstEvaluator::evaluate(const MachineInstr &BrI,
+      const CellMap &Inputs, SetVector<const MachineBasicBlock*> &Targets,
+      bool &FallsThru) {
+  // Branches are evaluated one at a time. TII::analyzeBranch checks all
+  // the branches in a basic block at once, so it cannot be used here.
+
+  // Common exit for a branch of unknown type or with an unknown condition:
+  // assume that all successors are executable.
+  auto Undetermined = [&BrI, &FallsThru]() -> bool {
+    FallsThru = !BrI.isUnconditionalBranch();
+    return false;
+  };
+
+  bool Negated = false;
+  switch (BrI.getOpcode()) {
+    case Hexagon::J2_jump:
+      Targets.insert(BrI.getOperand(0).getMBB());
+      FallsThru = false;
+      return true;
+    case Hexagon::J2_jumpf:
+    case Hexagon::J2_jumpfnew:
+    case Hexagon::J2_jumpfnewpt:
+      Negated = true;
+      break;
+    case Hexagon::J2_jumpt:
+    case Hexagon::J2_jumptnew:
+    case Hexagon::J2_jumptnewpt:
+      break;
+    default:
+      return Undetermined();
+  }
+
+  // Simple branch: if([!]Pn) jump ...
+  // i.e. Op0 = predicate, Op1 = branch target.
+  Register PR(BrI.getOperand(0));
+  // A condition operand with a subregister is not something we currently
+  // recognize.
+  if (PR.SubReg)
+    return Undetermined();
+  assert(Inputs.has(PR.Reg));
+  const LatticeCell &PredC = Inputs.get(PR.Reg);
+  if (PredC.isBottom())
+    return Undetermined();
+
+  // Zero takes precedence over NonZero, so at most one of these is set.
+  const uint32_t Props = PredC.properties();
+  const bool IsZero = (Props & ConstantProperties::Zero) != 0;
+  const bool IsNonZero =
+      !IsZero && (Props & ConstantProperties::NonZero) != 0;
+  // If the condition is not known to be either, bail out.
+  if (!IsZero && !IsNonZero)
+    return Undetermined();
+
+  const MachineBasicBlock *BranchTarget = BrI.getOperand(1).getMBB();
+
+  // The jump is taken when the predicate is non-zero for the plain forms,
+  // and when it is zero for the negated ones.
+  const bool Taken = (IsNonZero != Negated);
+  FallsThru = !Taken;
+  if (Taken)
+    Targets.insert(BranchTarget);
+  return true;
+}
+
+/// Rewrite \p MI using the constant information in \p Inputs. Branches are
+/// handed to rewriteHexBranch. For other instructions the constant defs
+/// are rewritten first, and if some def was left alone, the uses are tried
+/// as well. Returns true if anything was changed.
+bool HexagonConstEvaluator::rewrite(MachineInstr &MI, const CellMap &Inputs) {
+  if (MI.isBranch())
+    return rewriteHexBranch(MI, Inputs);
+
+  // These opcodes are never rewritten.
+  switch (MI.getOpcode()) {
+    case Hexagon::A2_tfrsi:
+    case Hexagon::A2_tfrpi:
+    case Hexagon::CONST32:
+    case Hexagon::CONST64:
+    case Hexagon::PS_true:
+    case Hexagon::PS_false:
+      return false;
+    default:
+      break;
+  }
+
+  if (MI.getNumOperands() == 0)
+    return false;
+
+  bool AllDefs;
+  bool Changed = rewriteHexConstDefs(MI, Inputs, AllDefs);
+  // If not all defs have been rewritten (i.e. the instruction defines
+  // a register that is not compile-time constant), then try to rewrite
+  // register operands that are known to be constant with immediates.
+  if (!AllDefs && rewriteHexConstUses(MI, Inputs))
+    Changed = true;
+
+  return Changed;
+}
+
+/// Return the width in bits of register \p Reg, derived from its register
+/// class: 32 for IntRegs, 64 for DoubleRegs and 8 for PredRegs (each
+/// including its sub-classes). Any other class is treated as unreachable.
+unsigned HexagonConstEvaluator::getRegBitWidth(unsigned Reg) const {
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+  struct ClassWidth {
+    const TargetRegisterClass *Class;
+    unsigned Bits;
+  };
+  // Checked in this order; the first class that contains RC wins.
+  const ClassWidth Known[] = {
+    { &Hexagon::IntRegsRegClass,    32 },
+    { &Hexagon::DoubleRegsRegClass, 64 },
+    { &Hexagon::PredRegsRegClass,    8 },
+  };
+  for (const ClassWidth &K : Known)
+    if (K.Class->hasSubClassEq(RC))
+      return K.Bits;
+
+  llvm_unreachable("Invalid register");
+  return 0;
+}
+
+// Map a compare (or compare-and-jump) opcode to the Comparison kind it is
+// evaluated as. As the table below shows, the "_f_" jump forms are mapped
+// to the opposite relation of their "_t_" counterparts (e.g. cmpeq_f -> NE,
+// cmpgt_f -> LEs, cmpltu_f -> GEu). Opcodes that are not listed yield
+// Comparison::Unk.
+uint32_t HexagonConstEvaluator::getCmp(unsigned Opc) {
+ switch (Opc) {
+ // Equality.
+ case Hexagon::C2_cmpeq:
+ case Hexagon::C2_cmpeqp:
+ case Hexagon::A4_cmpbeq:
+ case Hexagon::A4_cmpheq:
+ case Hexagon::A4_cmpbeqi:
+ case Hexagon::A4_cmpheqi:
+ case Hexagon::C2_cmpeqi:
+ case Hexagon::J4_cmpeqn1_t_jumpnv_nt:
+ case Hexagon::J4_cmpeqn1_t_jumpnv_t:
+ case Hexagon::J4_cmpeqi_t_jumpnv_nt:
+ case Hexagon::J4_cmpeqi_t_jumpnv_t:
+ case Hexagon::J4_cmpeq_t_jumpnv_nt:
+ case Hexagon::J4_cmpeq_t_jumpnv_t:
+ return Comparison::EQ;
+
+ // Inequality.
+ case Hexagon::C4_cmpneq:
+ case Hexagon::C4_cmpneqi:
+ case Hexagon::J4_cmpeqn1_f_jumpnv_nt:
+ case Hexagon::J4_cmpeqn1_f_jumpnv_t:
+ case Hexagon::J4_cmpeqi_f_jumpnv_nt:
+ case Hexagon::J4_cmpeqi_f_jumpnv_t:
+ case Hexagon::J4_cmpeq_f_jumpnv_nt:
+ case Hexagon::J4_cmpeq_f_jumpnv_t:
+ return Comparison::NE;
+
+ // Signed greater-than.
+ case Hexagon::C2_cmpgt:
+ case Hexagon::C2_cmpgtp:
+ case Hexagon::A4_cmpbgt:
+ case Hexagon::A4_cmphgt:
+ case Hexagon::A4_cmpbgti:
+ case Hexagon::A4_cmphgti:
+ case Hexagon::C2_cmpgti:
+ case Hexagon::J4_cmpgtn1_t_jumpnv_nt:
+ case Hexagon::J4_cmpgtn1_t_jumpnv_t:
+ case Hexagon::J4_cmpgti_t_jumpnv_nt:
+ case Hexagon::J4_cmpgti_t_jumpnv_t:
+ case Hexagon::J4_cmpgt_t_jumpnv_nt:
+ case Hexagon::J4_cmpgt_t_jumpnv_t:
+ return Comparison::GTs;
+
+ // Signed less-than-or-equal.
+ case Hexagon::C4_cmplte:
+ case Hexagon::C4_cmpltei:
+ case Hexagon::J4_cmpgtn1_f_jumpnv_nt:
+ case Hexagon::J4_cmpgtn1_f_jumpnv_t:
+ case Hexagon::J4_cmpgti_f_jumpnv_nt:
+ case Hexagon::J4_cmpgti_f_jumpnv_t:
+ case Hexagon::J4_cmpgt_f_jumpnv_nt:
+ case Hexagon::J4_cmpgt_f_jumpnv_t:
+ return Comparison::LEs;
+
+ // Unsigned greater-than.
+ case Hexagon::C2_cmpgtu:
+ case Hexagon::C2_cmpgtup:
+ case Hexagon::A4_cmpbgtu:
+ case Hexagon::A4_cmpbgtui:
+ case Hexagon::A4_cmphgtu:
+ case Hexagon::A4_cmphgtui:
+ case Hexagon::C2_cmpgtui:
+ case Hexagon::J4_cmpgtui_t_jumpnv_nt:
+ case Hexagon::J4_cmpgtui_t_jumpnv_t:
+ case Hexagon::J4_cmpgtu_t_jumpnv_nt:
+ case Hexagon::J4_cmpgtu_t_jumpnv_t:
+ return Comparison::GTu;
+
+ // Unsigned greater-than-or-equal.
+ case Hexagon::J4_cmpltu_f_jumpnv_nt:
+ case Hexagon::J4_cmpltu_f_jumpnv_t:
+ return Comparison::GEu;
+
+ // Unsigned less-than.
+ case Hexagon::J4_cmpltu_t_jumpnv_nt:
+ case Hexagon::J4_cmpltu_t_jumpnv_t:
+ return Comparison::LTu;
+
+ // Signed greater-than-or-equal.
+ case Hexagon::J4_cmplt_f_jumpnv_nt:
+ case Hexagon::J4_cmplt_f_jumpnv_t:
+ return Comparison::GEs;
+
+ // Unsigned less-than-or-equal.
+ case Hexagon::C4_cmplteu:
+ case Hexagon::C4_cmplteui:
+ case Hexagon::J4_cmpgtui_f_jumpnv_nt:
+ case Hexagon::J4_cmpgtui_f_jumpnv_t:
+ case Hexagon::J4_cmpgtu_f_jumpnv_nt:
+ case Hexagon::J4_cmpgtu_f_jumpnv_t:
+ return Comparison::LEu;
+
+ // Signed less-than.
+ case Hexagon::J4_cmplt_t_jumpnv_nt:
+ case Hexagon::J4_cmplt_t_jumpnv_t:
+ return Comparison::LTs;
+
+ default:
+ break;
+ }
+ // Not a comparison this table knows about.
+ return Comparison::Unk;
+}
+
+// Return the immediate operand MO of the compare opcode Opc as a 32-bit
+// APInt. The switch only decides whether the value is passed to the APInt
+// constructor as signed; the per-case comments (u5, s8, ...) presumably
+// name the immediate field of the instruction -- confirm against the
+// instruction definitions. OpX is not used in this body. Opcodes that are
+// not listed are treated as unreachable.
+APInt HexagonConstEvaluator::getCmpImm(unsigned Opc, unsigned OpX,
+ const MachineOperand &MO) {
+ bool Signed = false;
+ switch (Opc) {
+ case Hexagon::A4_cmpbgtui: // u7
+ case Hexagon::A4_cmphgtui: // u7
+ break;
+ case Hexagon::A4_cmpheqi: // s8
+ case Hexagon::C4_cmpneqi: // s8
+ Signed = true;
+ // Falls through to the "break" of the next case; only the two
+ // opcodes above are marked as signed.
+ case Hexagon::A4_cmpbeqi: // u8
+ break;
+ case Hexagon::C2_cmpgtui: // u9
+ case Hexagon::C4_cmplteui: // u9
+ break;
+ case Hexagon::C2_cmpeqi: // s10
+ case Hexagon::C2_cmpgti: // s10
+ case Hexagon::C4_cmpltei: // s10
+ Signed = true;
+ break;
+ case Hexagon::J4_cmpeqi_f_jumpnv_nt: // u5
+ case Hexagon::J4_cmpeqi_f_jumpnv_t: // u5
+ case Hexagon::J4_cmpeqi_t_jumpnv_nt: // u5
+ case Hexagon::J4_cmpeqi_t_jumpnv_t: // u5
+ case Hexagon::J4_cmpgti_f_jumpnv_nt: // u5
+ case Hexagon::J4_cmpgti_f_jumpnv_t: // u5
+ case Hexagon::J4_cmpgti_t_jumpnv_nt: // u5
+ case Hexagon::J4_cmpgti_t_jumpnv_t: // u5
+ case Hexagon::J4_cmpgtui_f_jumpnv_nt: // u5
+ case Hexagon::J4_cmpgtui_f_jumpnv_t: // u5
+ case Hexagon::J4_cmpgtui_t_jumpnv_nt: // u5
+ case Hexagon::J4_cmpgtui_t_jumpnv_t: // u5
+ break;
+ default:
+ llvm_unreachable("Unhandled instruction");
+ break;
+ }
+
+ // The immediate is always materialized as a 32-bit value.
+ uint64_t Val = MO.getImm();
+ return APInt(32, Val, Signed);
+}
+
+/// Turn \p MI into an A2_nop in place: switch its descriptor and strip
+/// every operand it currently has.
+void HexagonConstEvaluator::replaceWithNop(MachineInstr &MI) {
+  MI.setDesc(HII.get(Hexagon::A2_nop));
+  // Each call drops the operand at index 0, so one call per operand
+  // empties the list.
+  for (unsigned Left = MI.getNumOperands(); Left > 0; --Left)
+    MI.RemoveOperand(0);
+}
+
+/// Build the 64-bit values formed from the 32-bit values of \p RH (upper
+/// word) and \p RL (lower word). Every combination of a known high value
+/// with a known low value is added to \p Result. Returns false if either
+/// input cell is unavailable, is a property cell, holds a non-integer
+/// constant, or if \p Result ends up being bottom.
+bool HexagonConstEvaluator::evaluateHexRSEQ32(Register RL, Register RH,
+      const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(RL.Reg) && Inputs.has(RH.Reg));
+  LatticeCell LoCell, HiCell;
+  if (!getCell(RL, Inputs, LoCell) || !getCell(RH, Inputs, HiCell))
+    return false;
+  if (LoCell.isProperty() || HiCell.isProperty())
+    return false;
+
+  // Convert all low-word values first, then all high-word values.
+  const unsigned NumLo = LoCell.size(), NumHi = HiCell.size();
+  SmallVector<APInt,4> LoWords(NumLo), HiWords(NumHi);
+  for (unsigned Idx = 0; Idx != NumLo; ++Idx) {
+    if (!constToInt(LoCell.Values[Idx], LoWords[Idx]))
+      return false;
+    assert(LoWords[Idx].getBitWidth() == 32);
+  }
+  for (unsigned Idx = 0; Idx != NumHi; ++Idx) {
+    if (!constToInt(HiCell.Values[Idx], HiWords[Idx]))
+      return false;
+    assert(HiWords[Idx].getBitWidth() == 32);
+  }
+
+  // Combine: (Hi << 32) | Lo, both zero-extended to 64 bits.
+  for (const APInt &Hi : HiWords) {
+    const APInt Upper = Hi.zextOrSelf(64) << 32;
+    for (const APInt &Lo : LoWords) {
+      const APInt Lower = Lo.zextOrSelf(64);
+      const Constant *Combined = intToConst(Upper | Lower);
+      Result.add(Combined);
+      // Once the cell is bottom there is no point in adding more.
+      if (Result.isBottom())
+        return false;
+    }
+  }
+  return !Result.isBottom();
+}
+
+/// Evaluate a "classic" compare, Dst0 = CMP Src1, Src2. If the outcome can
+/// be computed from \p Inputs, the cell of the defined register in
+/// \p Outputs gets the Zero or NonZero property and true is returned.
+/// All other compare opcodes are rejected.
+bool HexagonConstEvaluator::evaluateHexCompare(const MachineInstr &MI,
+      const CellMap &Inputs, CellMap &Outputs) {
+  const unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgtui:
+      // Classic compare: Dst0 = CMP Src1, Src2
+      break;
+    default:
+      // Not handling other compare instructions now.
+      return false;
+  }
+
+  const MachineOperand &Src1 = MI.getOperand(1);
+  const MachineOperand &Src2 = MI.getOperand(2);
+
+  bool Result;
+  if (!evaluateHexCompare2(Opc, Src1, Src2, Inputs, Result))
+    return false;
+
+  // Only create a zero/non-zero cell. At this time there isn't really
+  // much need for specific values.
+  Register DefR(MI.getOperand(0));
+  LatticeCell DefCell = Outputs.get(DefR.Reg);
+  uint32_t Prop = ConstantProperties::Zero;
+  if (Result)
+    Prop = ConstantProperties::NonZero;
+  DefCell.add(Prop);
+  Outputs.update(DefR.Reg, DefCell);
+  return true;
+}
+
+/// Evaluate the comparison performed by opcode \p Opc on the operands
+/// \p Src1 and \p Src2, dispatching on whether each of them is a register
+/// or an immediate. On success the outcome is stored in \p Result.
+/// Returns false for any other operand combination, or when the selected
+/// evaluator fails.
+bool HexagonConstEvaluator::evaluateHexCompare2(unsigned Opc,
+      const MachineOperand &Src1, const MachineOperand &Src2,
+      const CellMap &Inputs, bool &Result) {
+  const uint32_t Cmp = getCmp(Opc);
+  const bool SecondIsReg = Src2.isReg();
+  const bool SecondIsImm = Src2.isImm();
+
+  if (Src1.isReg()) {
+    Register R1(Src1);
+    if (SecondIsReg) {
+      Register R2(Src2);
+      return evaluateCMPrr(Cmp, R1, R2, Inputs, Result);
+    }
+    if (SecondIsImm) {
+      APInt A2 = getCmpImm(Opc, 2, Src2);
+      return evaluateCMPri(Cmp, R1, A2, Inputs, Result);
+    }
+    // Unknown kind of comparison.
+    return false;
+  }
+
+  if (!Src1.isImm())
+    // Unknown kind of comparison.
+    return false;
+
+  APInt A1 = getCmpImm(Opc, 1, Src1);
+  if (SecondIsReg) {
+    // The register goes first in evaluateCMPri, so the operands are
+    // exchanged here.
+    // NOTE(review): this relies on Comparison::negate returning the
+    // comparison that is right for exchanged operands -- confirm against
+    // its definition.
+    Register R2(Src2);
+    uint32_t NegCmp = Comparison::negate(Cmp);
+    return evaluateCMPri(NegCmp, R2, A1, Inputs, Result);
+  }
+  if (SecondIsImm) {
+    APInt A2 = getCmpImm(Opc, 2, Src2);
+    return evaluateCMPii(Cmp, A1, A2, Result);
+  }
+  // Unknown kind of comparison.
+  return false;
+}
+
+// Evaluate a three-operand logical instruction (and/or/xor, in the
+// register-register forms and the and/or register-immediate forms) using
+// the cells in Inputs. On success the computed cell is stored in Outputs
+// for the defined register. Returns false for instructions that do not
+// have exactly three operands, for other opcodes, or when the selected
+// evaluator fails.
+bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
+ const CellMap &Inputs, CellMap &Outputs) {
+ unsigned Opc = MI.getOpcode();
+ if (MI.getNumOperands() != 3)
+ return false;
+ const MachineOperand &Src1 = MI.getOperand(1);
+ const MachineOperand &Src2 = MI.getOperand(2);
+ // The first source is treated as a register for every handled opcode.
+ Register R1(Src1);
+ bool Eval = false;
+ // Note: the result starts from a fresh cell, not from the cell that
+ // Outputs currently holds for the defined register.
+ LatticeCell RC;
+ switch (Opc) {
+ default:
+ return false;
+ case Hexagon::A2_and:
+ case Hexagon::A2_andp:
+ Eval = evaluateANDrr(R1, Register(Src2), Inputs, RC);
+ break;
+ case Hexagon::A2_andir: {
+ // The immediate is taken as a signed 32-bit value.
+ APInt A(32, Src2.getImm(), true);
+ Eval = evaluateANDri(R1, A, Inputs, RC);
+ break;
+ }
+ case Hexagon::A2_or:
+ case Hexagon::A2_orp:
+ Eval = evaluateORrr(R1, Register(Src2), Inputs, RC);
+ break;
+ case Hexagon::A2_orir: {
+ // The immediate is taken as a signed 32-bit value.
+ APInt A(32, Src2.getImm(), true);
+ Eval = evaluateORri(R1, A, Inputs, RC);
+ break;
+ }
+ case Hexagon::A2_xor:
+ case Hexagon::A2_xorp:
+ Eval = evaluateXORrr(R1, Register(Src2), Inputs, RC);
+ break;
+ }
+ // Publish the result only if the evaluation succeeded.
+ if (Eval) {
+ Register DefR(MI.getOperand(0));
+ Outputs.update(DefR.Reg, RC);
+ }
+ return Eval;
+}
+
+// Evaluate a conditional move, Dst0 = Cond1 ? Src2 : Src3. When the
+// condition cell is known to be zero or non-zero, the selected source
+// (immediate or register) is folded into the cell of the defined register
+// in Outputs. Returns false when the condition is unknown or the selected
+// operand is neither an immediate nor an evaluable register.
+bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI,
+ const CellMap &Inputs, CellMap &Outputs) {
+ // Dst0 = Cond1 ? Src2 : Src3
+ Register CR(MI.getOperand(1));
+ assert(Inputs.has(CR.Reg));
+ LatticeCell LS;
+ if (!getCell(CR, Inputs, LS))
+ return false;
+ uint32_t Ps = LS.properties();
+ // Index of the operand that is selected by the condition.
+ unsigned TakeOp;
+ if (Ps & ConstantProperties::Zero)
+ TakeOp = 3;
+ else if (Ps & ConstantProperties::NonZero)
+ TakeOp = 2;
+ else
+ return false;
+
+ const MachineOperand &ValOp = MI.getOperand(TakeOp);
+ Register DefR(MI.getOperand(0));
+ LatticeCell RC = Outputs.get(DefR.Reg);
+
+ if (ValOp.isImm()) {
+ // The immediate is sign-extended to the width of the defined register.
+ int64_t V = ValOp.getImm();
+ unsigned W = getRegBitWidth(DefR.Reg);
+ APInt A(W, V, true);
+ const Constant *C = intToConst(A);
+ RC.add(C);
+ Outputs.update(DefR.Reg, RC);
+ return true;
+ }
+ if (ValOp.isReg()) {
+ Register R(ValOp);
+ // NOTE(review): unlike the condition register, R.Reg is looked up
+ // without a preceding Inputs.has() check -- confirm that Inputs.get()
+ // tolerates a missing entry.
+ const LatticeCell &LR = Inputs.get(R.Reg);
+ LatticeCell LSR;
+ if (!evaluate(R, LR, LSR))
+ return false;
+ // The whole source cell is merged into the destination cell.
+ RC.meet(LSR);
+ Outputs.update(DefR.Reg, RC);
+ return true;
+ }
+ return false;
+}
+
+/// Evaluate a sign- or zero-extension, Dst0 = ext R1, for the opcodes
+/// A2_sxtb, A2_zxtb (8 bits), A2_sxth, A2_zxth (16 bits) and A2_sxtw
+/// (32 bits). The source is extended from that many bits to the width of
+/// the defined register and the result is recorded in \p Outputs.
+/// Returns false if the opcode is not one of the above or if the
+/// extension cannot be evaluated.
+bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI,
+      const CellMap &Inputs, CellMap &Outputs) {
+  // Dst0 = ext R1
+  Register R1(MI.getOperand(1));
+  assert(Inputs.has(R1.Reg));
+
+  // Number of low bits of the source that are extended, and whether the
+  // extension is signed. Both are set together so that no opcode can leave
+  // the bit count undefined.
+  unsigned Bits = 0;
+  bool Signed = false;
+  switch (MI.getOpcode()) {
+    case Hexagon::A2_sxtb:
+      Bits = 8;
+      Signed = true;
+      break;
+    case Hexagon::A2_zxtb:
+      Bits = 8;
+      break;
+    case Hexagon::A2_sxth:
+      Bits = 16;
+      Signed = true;
+      break;
+    case Hexagon::A2_zxth:
+      Bits = 16;
+      break;
+    case Hexagon::A2_sxtw:
+      Bits = 32;
+      Signed = true;
+      break;
+    default:
+      // Not an extension handled here. Report "cannot evaluate" instead
+      // of continuing with an undefined bit count.
+      return false;
+  }
+
+  Register DefR(MI.getOperand(0));
+  unsigned BW = getRegBitWidth(DefR.Reg);
+  LatticeCell RC = Outputs.get(DefR.Reg);
+  bool Eval = Signed ? evaluateSEXTr(R1, BW, Bits, Inputs, RC)
+                     : evaluateZEXTr(R1, BW, Bits, Inputs, RC);
+  if (!Eval)
+    return false;
+  Outputs.update(DefR.Reg, RC);
+  return true;
+}
+
+/// Evaluate a single-source vector instruction, DefR = op R1. Only the
+/// splat opcodes S2_vsplatrb and S2_vsplatrh are handled; the result is
+/// recorded in \p Outputs. Returns false for any other opcode or when the
+/// splat cannot be evaluated.
+bool HexagonConstEvaluator::evaluateHexVector1(const MachineInstr &MI,
+      const CellMap &Inputs, CellMap &Outputs) {
+  // DefR = op R1
+  Register DefR(MI.getOperand(0));
+  Register R1(MI.getOperand(1));
+  assert(Inputs.has(R1.Reg));
+  LatticeCell RC = Outputs.get(DefR.Reg);
+
+  // Width of the replicated element; both opcodes replicate it 4 times.
+  unsigned ElemBits;
+  switch (MI.getOpcode()) {
+    case Hexagon::S2_vsplatrb:
+      // Rd = 4 times Rs:0..7
+      ElemBits = 8;
+      break;
+    case Hexagon::S2_vsplatrh:
+      // Rdd = 4 times Rs:0..15
+      ElemBits = 16;
+      break;
+    default:
+      return false;
+  }
+
+  if (!evaluateSplatr(R1, ElemBits, 4, Inputs, RC))
+    return false;
+  Outputs.update(DefR.Reg, RC);
+  return true;
+}
+
+// For every virtual register defined by MI whose cell in Inputs is a known
+// constant, build an instruction at MI's position that defines a new
+// register with that constant, and redirect all uses of the old register
+// to the new one. MI itself is not removed here. COPY instructions are
+// left alone.
+//
+// AllDefs is set to true when every collected def was rewritten (and to
+// false otherwise, including the early return for copies). The return
+// value is true if at least one def was rewritten.
+bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
+ const CellMap &Inputs, bool &AllDefs) {
+ AllDefs = false;
+
+ // Some diagnostics.
+ // DEBUG({...}) gets confused with all this code as an argument.
+#ifndef NDEBUG
+ bool Debugging = DebugFlag && isCurrentDebugType(DEBUG_TYPE);
+ if (Debugging) {
+ // Const: all explicit virtual-register uses seen so far have
+ // single-value cells. HasUse: there is at least one such use.
+ bool Const = true, HasUse = false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isUse() || MO.isImplicit())
+ continue;
+ Register R(MO);
+ if (!TargetRegisterInfo::isVirtualRegister(R.Reg))
+ continue;
+ HasUse = true;
+ // PHIs can legitimately have "top" cells after propagation.
+ if (!MI.isPHI() && !Inputs.has(R.Reg)) {
+ dbgs() << "Top " << PrintReg(R.Reg, &HRI, R.SubReg)
+ << " in MI: " << MI;
+ continue;
+ }
+ const LatticeCell &L = Inputs.get(R.Reg);
+ Const &= L.isSingle();
+ if (!Const)
+ break;
+ }
+ // Print instructions (other than copies) whose inputs are all constant.
+ if (HasUse && Const) {
+ if (!MI.isCopy()) {
+ dbgs() << "CONST: " << MI;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isUse() || MO.isImplicit())
+ continue;
+ unsigned R = MO.getReg();
+ dbgs() << PrintReg(R, &TRI) << ": " << Inputs.get(R) << "\n";
+ }
+ }
+ }
+ }
+#endif
+
+ // Avoid generating TFRIs for register transfers---this will keep the
+ // coalescing opportunities.
+ if (MI.isCopy())
+ return false;
+
+ // Collect all virtual register-def operands.
+ SmallVector<unsigned,2> DefRegs;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned R = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ continue;
+ // Defs are expected to be full registers with a cell in Inputs.
+ assert(!MO.getSubReg());
+ assert(Inputs.has(R));
+ DefRegs.push_back(R);
+ }
+
+ MachineBasicBlock &B = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ // Number of defs that were replaced with a constant.
+ unsigned ChangedNum = 0;
+#ifndef NDEBUG
+ // Newly built instructions, kept only for the debug output at the end.
+ SmallVector<const MachineInstr*,4> NewInstrs;
+#endif
+
+ // For each defined register, if it is a constant, create an instruction
+ // NewR = const
+ // and replace all uses of the defined register with NewR.
+ for (unsigned i = 0, n = DefRegs.size(); i < n; ++i) {
+ unsigned R = DefRegs[i];
+ const LatticeCell &L = Inputs.get(R);
+ if (L.isBottom())
+ continue;
+ const TargetRegisterClass *RC = MRI->getRegClass(R);
+ MachineBasicBlock::iterator At = MI.getIterator();
+
+ if (!L.isSingle()) {
+ // If this a zero/non-zero cell, we can fold a definition
+ // of a predicate register.
+ typedef ConstantProperties P;
+ uint64_t Ps = L.properties();
+ if (!(Ps & (P::Zero|P::NonZero)))
+ continue;
+ const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+ if (RC != PredRC)
+ continue;
+ // Zero becomes PS_false, non-zero becomes PS_true.
+ const MCInstrDesc *NewD = (Ps & P::Zero) ?
+ &HII.get(Hexagon::PS_false) :
+ &HII.get(Hexagon::PS_true);
+ unsigned NewR = MRI->createVirtualRegister(PredRC);
+ const MachineInstrBuilder &MIB = BuildMI(B, At, DL, *NewD, NewR);
+ (void)MIB;
+#ifndef NDEBUG
+ NewInstrs.push_back(&*MIB);
+#endif
+ replaceAllRegUsesWith(R, NewR);
+ } else {
+ // This cell has a single value.
+ APInt A;
+ if (!constToInt(L.Value, A) || !A.isSignedIntN(64))
+ continue;
+ const TargetRegisterClass *NewRC;
+ const MCInstrDesc *NewD;
+
+ // Only 32- and 64-bit registers are expected to get here.
+ unsigned W = getRegBitWidth(R);
+ int64_t V = A.getSExtValue();
+ assert(W == 32 || W == 64);
+ if (W == 32)
+ NewRC = &Hexagon::IntRegsRegClass;
+ else
+ NewRC = &Hexagon::DoubleRegsRegClass;
+ unsigned NewR = MRI->createVirtualRegister(NewRC);
+ const MachineInstr *NewMI;
+
+ if (W == 32) {
+ NewD = &HII.get(Hexagon::A2_tfrsi);
+ NewMI = BuildMI(B, At, DL, *NewD, NewR)
+ .addImm(V);
+ } else {
+ // 64-bit value: use A2_tfrpi if the value fits in 8 signed bits,
+ // A2_combineii if both 32-bit halves do, and CONST64 otherwise.
+ if (A.isSignedIntN(8)) {
+ NewD = &HII.get(Hexagon::A2_tfrpi);
+ NewMI = BuildMI(B, At, DL, *NewD, NewR)
+ .addImm(V);
+ } else {
+ int32_t Hi = V >> 32;
+ int32_t Lo = V & 0xFFFFFFFFLL;
+ if (isInt<8>(Hi) && isInt<8>(Lo)) {
+ NewD = &HII.get(Hexagon::A2_combineii);
+ NewMI = BuildMI(B, At, DL, *NewD, NewR)
+ .addImm(Hi)
+ .addImm(Lo);
+ } else {
+ NewD = &HII.get(Hexagon::CONST64);
+ NewMI = BuildMI(B, At, DL, *NewD, NewR)
+ .addImm(V);
+ }
+ }
+ }
+ (void)NewMI;
+#ifndef NDEBUG
+ NewInstrs.push_back(NewMI);
+#endif
+ replaceAllRegUsesWith(R, NewR);
+ }
+ ChangedNum++;
+ }
+
+ DEBUG({
+ if (!NewInstrs.empty()) {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ dbgs() << "In function: " << MF.getFunction()->getName() << "\n";
+ dbgs() << "Rewrite: for " << MI << " created " << *NewInstrs[0];
+ for (unsigned i = 1; i < NewInstrs.size(); ++i)
+ dbgs() << " " << *NewInstrs[i];
+ }
+ });
+
+ // All defs were handled only if each collected register was replaced.
+ AllDefs = (ChangedNum == DefRegs.size());
+ return ChangedNum > 0;
+}
+
+// Simplify MI based on constant source operands. Three opcodes are handled:
+//  - M2_maci: dropped in favor of the accumulator when a factor is zero, or
+//    replaced with the immediate forms M2_macsip/M2_macsin when a factor is
+//    a constant that fits in 8 signed bits;
+//  - A2_and:  replaced with the other operand when one operand is all-ones;
+//  - A2_or:   replaced with the other operand when one operand is zero.
+// The replacement is done by redirecting all uses of MI's result; MI itself
+// is not removed here. Returns true if anything was changed.
+bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
+ const CellMap &Inputs) {
+ bool Changed = false;
+ unsigned Opc = MI.getOpcode();
+ MachineBasicBlock &B = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator At = MI.getIterator();
+ // Instruction built by this function, if any.
+ MachineInstr *NewMI = nullptr;
+
+ switch (Opc) {
+ case Hexagon::M2_maci:
+ // Convert DefR += mpyi(R2, R3)
+ // to DefR += mpyi(R, #imm),
+ // or DefR -= mpyi(R, #imm).
+ {
+ Register DefR(MI.getOperand(0));
+ assert(!DefR.SubReg);
+ Register R2(MI.getOperand(2));
+ Register R3(MI.getOperand(3));
+ assert(Inputs.has(R2.Reg) && Inputs.has(R3.Reg));
+ LatticeCell LS2, LS3;
+ // It is enough to get one of the input cells, since we will only try
+ // to replace one argument---whichever happens to be a single constant.
+ bool HasC2 = getCell(R2, Inputs, LS2), HasC3 = getCell(R3, Inputs, LS3);
+ if (!HasC2 && !HasC3)
+ return false;
+ bool Zero = ((HasC2 && (LS2.properties() & ConstantProperties::Zero)) ||
+ (HasC3 && (LS3.properties() & ConstantProperties::Zero)));
+ // If one of the operands is zero, eliminate the multiplication.
+ if (Zero) {
+ // DefR == R1 (tied operands).
+ MachineOperand &Acc = MI.getOperand(1);
+ Register R1(Acc);
+ unsigned NewR = R1.Reg;
+ if (R1.SubReg) {
+ // Generate COPY. FIXME: Replace with the register:subregister.
+ const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
+ NewR = MRI->createVirtualRegister(RC);
+ NewMI = BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+ .addReg(R1.Reg, getRegState(Acc), R1.SubReg);
+ }
+ replaceAllRegUsesWith(DefR.Reg, NewR);
+ // NewR has gained uses, so existing kill flags on it are presumably
+ // no longer valid.
+ MRI->clearKillFlags(NewR);
+ Changed = true;
+ break;
+ }
+
+ // Prefer operand 3 as the constant; if it is not a single value, fall
+ // back to operand 2 (Swap) and keep operand 3 as the register.
+ bool Swap = false;
+ if (!LS3.isSingle()) {
+ if (!LS2.isSingle())
+ return false;
+ Swap = true;
+ }
+ // LI is the cell of the constant factor, OpR2 is the operand of the
+ // factor that remains a register.
+ const LatticeCell &LI = Swap ? LS2 : LS3;
+ const MachineOperand &OpR2 = Swap ? MI.getOperand(3)
+ : MI.getOperand(2);
+ // LI is single here.
+ APInt A;
+ if (!constToInt(LI.Value, A) || !A.isSignedIntN(8))
+ return false;
+ int64_t V = A.getSExtValue();
+ // M2_macsip is used for non-negative values, M2_macsin for negative
+ // ones; in both cases the magnitude becomes the immediate.
+ const MCInstrDesc &D = (V >= 0) ? HII.get(Hexagon::M2_macsip)
+ : HII.get(Hexagon::M2_macsin);
+ if (V < 0)
+ V = -V;
+ const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
+ unsigned NewR = MRI->createVirtualRegister(RC);
+ const MachineOperand &Src1 = MI.getOperand(1);
+ NewMI = BuildMI(B, At, DL, D, NewR)
+ .addReg(Src1.getReg(), getRegState(Src1), Src1.getSubReg())
+ .addReg(OpR2.getReg(), getRegState(OpR2), OpR2.getSubReg())
+ .addImm(V);
+ replaceAllRegUsesWith(DefR.Reg, NewR);
+ Changed = true;
+ break;
+ }
+
+ case Hexagon::A2_and:
+ {
+ Register R1(MI.getOperand(1));
+ Register R2(MI.getOperand(2));
+ assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+ LatticeCell LS1, LS2;
+ // Index of the operand that the result is a copy of (0 = none).
+ unsigned CopyOf = 0;
+ // Check if any of the operands is -1 (i.e. all bits set).
+ // NOTE(review): because of the else-if, R2 is not examined when R1 is
+ // a single constant other than -1 -- possibly a missed opportunity.
+ if (getCell(R1, Inputs, LS1) && LS1.isSingle()) {
+ APInt M1;
+ if (constToInt(LS1.Value, M1) && !~M1)
+ CopyOf = 2;
+ }
+ else if (getCell(R2, Inputs, LS2) && LS2.isSingle()) {
+ APInt M1;
+ if (constToInt(LS2.Value, M1) && !~M1)
+ CopyOf = 1;
+ }
+ if (!CopyOf)
+ return false;
+ MachineOperand &SO = MI.getOperand(CopyOf);
+ Register SR(SO);
+ Register DefR(MI.getOperand(0));
+ unsigned NewR = SR.Reg;
+ // A source with a subregister is first copied into a new register of
+ // the result's class.
+ if (SR.SubReg) {
+ const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
+ NewR = MRI->createVirtualRegister(RC);
+ NewMI = BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+ .addReg(SR.Reg, getRegState(SO), SR.SubReg);
+ }
+ replaceAllRegUsesWith(DefR.Reg, NewR);
+ MRI->clearKillFlags(NewR);
+ Changed = true;
+ }
+ break;
+
+ case Hexagon::A2_or:
+ {
+ Register R1(MI.getOperand(1));
+ Register R2(MI.getOperand(2));
+ assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+ LatticeCell LS1, LS2;
+ // Index of the operand that the result is a copy of (0 = none).
+ unsigned CopyOf = 0;
+ typedef ConstantProperties P;
+ // If one operand is zero, the result is the other operand.
+ if (getCell(R1, Inputs, LS1) && (LS1.properties() & P::Zero))
+ CopyOf = 2;
+ else if (getCell(R2, Inputs, LS2) && (LS2.properties() & P::Zero))
+ CopyOf = 1;
+ if (!CopyOf)
+ return false;
+ MachineOperand &SO = MI.getOperand(CopyOf);
+ Register SR(SO);
+ Register DefR(MI.getOperand(0));
+ unsigned NewR = SR.Reg;
+ // A source with a subregister is first copied into a new register of
+ // the result's class.
+ if (SR.SubReg) {
+ const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
+ NewR = MRI->createVirtualRegister(RC);
+ NewMI = BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+ .addReg(SR.Reg, getRegState(SO), SR.SubReg);
+ }
+ replaceAllRegUsesWith(DefR.Reg, NewR);
+ MRI->clearKillFlags(NewR);
+ Changed = true;
+ }
+ break;
+ }
+
+ if (NewMI) {
+ // clear all the kill flags of this new instruction.
+ for (MachineOperand &MO : NewMI->operands())
+ if (MO.isReg() && MO.isUse())
+ MO.setIsKill(false);
+ }
+
+ DEBUG({
+ if (NewMI) {
+ dbgs() << "Rewrite: for " << MI;
+ if (NewMI != &MI)
+ dbgs() << " created " << *NewMI;
+ else
+ dbgs() << " modified the instruction itself and created:" << *NewMI;
+ }
+ });
+
+ return Changed;
+}
+
+/// Make every use operand of the virtual register \p FromReg refer to the
+/// virtual register \p ToReg instead.
+void HexagonConstEvaluator::replaceAllRegUsesWith(unsigned FromReg,
+      unsigned ToReg) {
+  assert(TargetRegisterInfo::isVirtualRegister(FromReg));
+  assert(TargetRegisterInfo::isVirtualRegister(ToReg));
+  auto UseIt = MRI->use_begin(FromReg);
+  const auto UseEnd = MRI->use_end();
+  while (UseIt != UseEnd) {
+    MachineOperand &Use = *UseIt;
+    // Step to the next use before touching this one: changing the register
+    // presumably takes the operand off FromReg's use list.
+    ++UseIt;
+    Use.setReg(ToReg);
+  }
+}
+
+// Rewrite the branch BrI when its outcome is fully determined by Inputs:
+//  - if it always goes to a single target that is not the layout successor,
+//    BrI is turned into an unconditional J2_jump to that target;
+//  - otherwise (it always falls through, or the target is the layout
+//    successor) BrI is turned into a nop.
+// Branches that cannot be evaluated, and existing J2_jump instructions, are
+// left unchanged. Returns true if BrI was modified.
+bool HexagonConstEvaluator::rewriteHexBranch(MachineInstr &BrI,
+ const CellMap &Inputs) {
+ MachineBasicBlock &B = *BrI.getParent();
+ unsigned NumOp = BrI.getNumOperands();
+ if (!NumOp)
+ return false;
+
+ bool FallsThru;
+ SetVector<const MachineBasicBlock*> Targets;
+ bool Eval = evaluate(BrI, Inputs, Targets, FallsThru);
+ unsigned NumTargets = Targets.size();
+ // Give up unless the branch has exactly one possible outcome.
+ if (!Eval || NumTargets > 1 || (NumTargets == 1 && FallsThru))
+ return false;
+ // An unconditional jump is already in its final form.
+ if (BrI.getOpcode() == Hexagon::J2_jump)
+ return false;
+
+ DEBUG(dbgs() << "Rewrite(BB#" << B.getNumber() << "):" << BrI);
+ bool Rewritten = false;
+ if (NumTargets > 0) {
+ assert(!FallsThru && "This should have been checked before");
+ // MIB.addMBB needs non-const pointer.
+ MachineBasicBlock *TargetB = const_cast<MachineBasicBlock*>(Targets[0]);
+ // A jump to the block that follows in layout is not needed.
+ bool Moot = B.isLayoutSuccessor(TargetB);
+ if (!Moot) {
+ // If we build a branch here, we must make sure that it won't be
+ // erased as "non-executable". We can't mark any new instructions
+ // as executable here, so we need to overwrite the BrI, which we
+ // know is executable.
+ const MCInstrDesc &JD = HII.get(Hexagon::J2_jump);
+ // NI is a temporary instruction that only serves as the source of
+ // the operand list; it is erased again below.
+ auto NI = BuildMI(B, BrI.getIterator(), BrI.getDebugLoc(), JD)
+ .addMBB(TargetB);
+ BrI.setDesc(JD);
+ while (BrI.getNumOperands() > 0)
+ BrI.RemoveOperand(0);
+ // This ensures that all implicit operands (e.g. %R31<imp-def>, etc)
+ // are present in the rewritten branch.
+ for (auto &Op : NI->operands())
+ BrI.addOperand(Op);
+ NI->eraseFromParent();
+ Rewritten = true;
+ }
+ }
+
+ // Do not erase instructions. A newly created instruction could get
+ // the same address as an instruction marked as executable during the
+ // propagation.
+ if (!Rewritten)
+ replaceWithNop(BrI);
+ return true;
+}
+
+/// Create a new instance of the HexagonConstPropagation pass. The object
+/// is allocated with new and handed to the caller.
+FunctionPass *llvm::createHexagonConstPropagationPass() {
+  FunctionPass *Pass = new HexagonConstPropagation();
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
new file mode 100644
index 000000000000..36080997ec6b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -0,0 +1,887 @@
+//===------- HexagonCopyToCombine.cpp - Hexagon Copy-To-Combine Pass ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass replaces transfer instructions by combine instructions.
+// We walk along a basic block and look for two combinable instructions and try
+// to move them together. If we can move them next to each other we do so and
+// replace them with a combine instruction.
+//===----------------------------------------------------------------------===//
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-copy-combine"
+
+// Hidden option: when set, runOnMachineFunction returns immediately without
+// modifying the function.
+static
+cl::opt<bool> IsCombinesDisabled("disable-merge-into-combines",
+ cl::Hidden, cl::ZeroOrMore,
+ cl::init(false),
+ cl::desc("Disable merging into combines"));
+// Hidden option: when set, combine() never emits CONST64 and uses the
+// immediate/immediate combine form instead.
+static
+cl::opt<bool> IsConst64Disabled("disable-const64",
+ cl::Hidden, cl::ZeroOrMore,
+ cl::init(false),
+ cl::desc("Disable generation of const64"));
+// Hidden option (default 4): upper bound on the number of non-debug
+// instructions counted from a defining transfer up to a potential new-value
+// store for the transfer to be recorded as potentially newifiable.
+static
+cl::opt<unsigned>
+MaxNumOfInstsBetweenNewValueStoreAndTFR("max-num-inst-between-tfr-and-nv-store",
+ cl::Hidden, cl::init(4),
+ cl::desc("Maximum distance between a tfr feeding a store we "
+ "consider the store still to be newifiable"));
+
+// Forward declarations of the pass factory (defined at the end of this file)
+// and of the pass initializer called from the pass constructor.
+namespace llvm {
+ FunctionPass *createHexagonCopyToCombine();
+ void initializeHexagonCopyToCombinePass(PassRegistry&);
+}
+
+
+namespace {
+
+/// Machine function pass that looks, inside each basic block, for two
+/// combinable transfer instructions writing adjacent registers and replaces
+/// them with a single combine (or CONST64) instruction.
+class HexagonCopyToCombine : public MachineFunctionPass {
+ // Target hooks; all three are (re)initialized at the start of
+ // runOnMachineFunction.
+ const HexagonInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const HexagonSubtarget *ST;
+ // Set per function in runOnMachineFunction from the target's optimization
+ // level.
+ bool ShouldCombineAggressively;
+
+ // Transfers recorded by findPotentialNewifiableTFRs for the basic block
+ // currently being processed; cleared before each block.
+ DenseSet<MachineInstr *> PotentiallyNewifiableTFR;
+ // Debug-value instructions collected by isSafeToMoveTogether that combine()
+ // relocates when I1 is moved towards I2; cleared before each pairing attempt.
+ SmallVector<MachineInstr *, 8> DbgMItoMove;
+
+public:
+ static char ID;
+
+ HexagonCopyToCombine() : MachineFunctionPass(ID) {
+ initializeHexagonCopyToCombinePass(*PassRegistry::getPassRegistry());
+ }
+
+ // No additional analyses are requested beyond the base class defaults.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "Hexagon Copy-To-Combine Pass";
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ // The pass declares that it requires the NoVRegs property.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ // Searches forward from I1 for an instruction that can be paired with it.
+ MachineInstr *findPairable(MachineInstr &I1, bool &DoInsertAtI1,
+ bool AllowC64);
+
+ // Fills PotentiallyNewifiableTFR for one basic block.
+ void findPotentialNewifiableTFRs(MachineBasicBlock &);
+
+ // Emits the replacement instruction for the pair and erases I1 and I2.
+ void combine(MachineInstr &I1, MachineInstr &I2,
+ MachineBasicBlock::iterator &MI, bool DoInsertAtI1,
+ bool OptForSize);
+
+ // Checks whether one of the two instructions can be moved next to the other.
+ bool isSafeToMoveTogether(MachineInstr &I1, MachineInstr &I2,
+ unsigned I1DestReg, unsigned I2DestReg,
+ bool &DoInsertAtI1);
+
+ // Emitters for the individual operand-kind combinations
+ // (R = register operand, I = immediate or symbolic operand).
+ void emitCombineRR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+ void emitCombineRI(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+ void emitCombineIR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+ void emitCombineII(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+ // Emits a single CONST64 built from two immediate operands.
+ void emitConst64(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
+};
+
+} // End anonymous namespace.
+
+// Storage for the pass identifier; the constructor passes ID to
+// MachineFunctionPass.
+char HexagonCopyToCombine::ID = 0;
+
+// Registers the pass under the command-line name "hexagon-copy-combine".
+INITIALIZE_PASS(HexagonCopyToCombine, "hexagon-copy-combine",
+ "Hexagon Copy-To-Combine Pass", false, false)
+
+/// Tells whether \p MI is one of the transfer forms this pass may fold into a
+/// combine: a vector assign, a 32-bit register transfer between IntRegs, or a
+/// transfer-immediate into an IntReg. Outside aggressive mode a
+/// transfer-immediate is only accepted when its source is a true immediate
+/// that fits in a signed 8-bit field. \p TII is not consulted.
+static bool isCombinableInstType(MachineInstr &MI, const HexagonInstrInfo *TII,
+                                 bool ShouldCombineAggressively) {
+  const unsigned Opc = MI.getOpcode();
+
+  // Vector register assignments are always candidates.
+  if (Opc == Hexagon::V6_vassign || Opc == Hexagon::V6_vassign_128B)
+    return true;
+
+  if (Opc == Hexagon::A2_tfr) {
+    // A COPY instruction can be combined if its arguments are IntRegs (32bit).
+    const MachineOperand &Dst = MI.getOperand(0);
+    const MachineOperand &Src = MI.getOperand(1);
+    assert(Dst.isReg() && Src.isReg());
+
+    if (!Hexagon::IntRegsRegClass.contains(Dst.getReg()))
+      return false;
+    return Hexagon::IntRegsRegClass.contains(Src.getReg());
+  }
+
+  if (Opc == Hexagon::A2_tfrsi) {
+    const MachineOperand &Dst = MI.getOperand(0);
+    const MachineOperand &Val = MI.getOperand(1);
+    assert(Dst.isReg());
+
+    // Ensure that TargetFlags are MO_NO_FLAG for a global. This is a
+    // workaround for an ABI bug that prevents GOT relocations on combine
+    // instructions.
+    if (!Val.isImm() && Val.getTargetFlags() != HexagonII::MO_NO_FLAG)
+      return false;
+
+    if (!Hexagon::IntRegsRegClass.contains(Dst.getReg()))
+      return false;
+
+    // Only combine constant extended A2_tfrsi if we are in aggressive mode.
+    if (ShouldCombineAggressively)
+      return true;
+    return Val.isImm() && isInt<8>(Val.getImm());
+  }
+
+  // Every other opcode is left alone.
+  return false;
+}
+
+/// For a TFRI64_V4 or A2_tfrsi instruction, returns true when operand 1 is
+/// either not a plain immediate or an immediate that does not fit in a signed
+/// N-bit field. Returns false for every other opcode.
+template <unsigned N> static bool isGreaterThanNBitTFRI(const MachineInstr &I) {
+  const unsigned Opc = I.getOpcode();
+  if (Opc != Hexagon::TFRI64_V4 && Opc != Hexagon::A2_tfrsi)
+    return false;
+
+  const MachineOperand &Src = I.getOperand(1);
+  if (!Src.isImm())
+    return true;
+  return !isInt<N>(Src.getImm());
+}
+
+/// areCombinableOperations - Returns true if the two instructions can be
+/// merged into a combine (ignoring register constraints).
+///
+/// \p HighRegInst defines the high half of the pair and \p LowRegInst the low
+/// half. Both must have an opcode accepted by isCombinableInstType. When
+/// \p AllowC64 is false, a pair whose high value needs more than 8 bits and
+/// whose low value needs more than 6 bits is rejected up front. \p TRI is not
+/// consulted.
+static bool areCombinableOperations(const TargetRegisterInfo *TRI,
+                                    MachineInstr &HighRegInst,
+                                    MachineInstr &LowRegInst, bool AllowC64) {
+  unsigned HiOpc = HighRegInst.getOpcode();
+  unsigned LoOpc = LowRegInst.getOpcode();
+
+  auto isVectorAssign = [](unsigned Opc) -> bool {
+    return Opc == Hexagon::V6_vassign || Opc == Hexagon::V6_vassign_128B;
+  };
+
+  // Keep this list in sync with isCombinableInstType: it also accepts the
+  // 128-byte vector assign, which previously fell into llvm_unreachable here.
+  auto verifyOpc = [](unsigned Opc) -> void {
+    switch (Opc) {
+      case Hexagon::A2_tfr:
+      case Hexagon::A2_tfrsi:
+      case Hexagon::V6_vassign:
+      case Hexagon::V6_vassign_128B:
+        break;
+      default:
+        llvm_unreachable("Unexpected opcode");
+    }
+  };
+  verifyOpc(HiOpc);
+  verifyOpc(LoOpc);
+
+  // A vector assign can only be paired with a vector assign of the same
+  // flavor; it never mixes with scalar transfers.
+  if (isVectorAssign(HiOpc) || isVectorAssign(LoOpc))
+    return HiOpc == LoOpc;
+
+  if (!AllowC64) {
+    // There is no combine of two constant extended values.
+    if (isGreaterThanNBitTFRI<8>(HighRegInst) &&
+        isGreaterThanNBitTFRI<6>(LowRegInst))
+      return false;
+  }
+
+  // There is a combine of two constant extended values into CONST64,
+  // provided both constants are true immediates.
+  if (isGreaterThanNBitTFRI<16>(HighRegInst) &&
+      isGreaterThanNBitTFRI<16>(LowRegInst))
+    return (HighRegInst.getOperand(1).isImm() &&
+            LowRegInst.getOperand(1).isImm());
+
+  // There is no combine of two constant extended values, unless handled above.
+  // Make both 8-bit size checks to allow both combine (#,##) and combine(##,#).
+  if (isGreaterThanNBitTFRI<8>(HighRegInst) &&
+      isGreaterThanNBitTFRI<8>(LowRegInst))
+    return false;
+
+  return true;
+}
+
+/// Returns true when the physical register \p Reg sits at an even offset from
+/// the first register of its bank (R0 for IntRegs, V0 for the vector register
+/// classes). Any other register class is treated as unreachable.
+static bool isEvenReg(unsigned Reg) {
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+
+  const bool IsScalar = Hexagon::IntRegsRegClass.contains(Reg);
+  if (IsScalar)
+    return (Reg - Hexagon::R0) % 2 == 0;
+
+  const bool IsVector = Hexagon::VectorRegsRegClass.contains(Reg) ||
+                        Hexagon::VectorRegs128BRegClass.contains(Reg);
+  if (IsVector)
+    return (Reg - Hexagon::V0) % 2 == 0;
+
+  llvm_unreachable("Invalid register");
+}
+
+/// Clears the kill flag on every operand of \p MI that is exactly the
+/// register \p RegNotKilled and currently marked as a kill.
+static void removeKillInfo(MachineInstr &MI, unsigned RegNotKilled) {
+  for (MachineOperand &MO : MI.operands()) {
+    if (MO.isReg() && MO.getReg() == RegNotKilled && MO.isKill())
+      MO.setIsKill(false);
+  }
+}
+
+/// Returns true if it is unsafe to move a copy instruction from \p UseReg to
+/// \p DestReg over the instruction \p MI. A \p UseReg of 0 means the copy has
+/// no register source, so only \p DestReg and the side-effect checks apply.
+static bool isUnsafeToMoveAcross(MachineInstr &MI, unsigned UseReg,
+                                 unsigned DestReg,
+                                 const TargetRegisterInfo *TRI) {
+  // The source of the copy must not be redefined by MI.
+  if (UseReg && MI.modifiesRegister(UseReg, TRI))
+    return true;
+
+  // MI must neither write nor read the destination of the copy.
+  if (MI.modifiesRegister(DestReg, TRI) || MI.readsRegister(DestReg, TRI))
+    return true;
+
+  // Never move across instructions with unmodeled effects, inline asm or
+  // debug values.
+  return MI.hasUnmodeledSideEffects() || MI.isInlineAsm() || MI.isDebugValue();
+}
+
+/// Returns the register held by \p MO, or 0 when \p MO is not a register
+/// operand.
+static unsigned UseReg(const MachineOperand& MO) {
+  if (!MO.isReg())
+    return 0;
+  return MO.getReg();
+}
+
+/// isSafeToMoveTogether - Returns true if it is safe to move I1 next to I2 such
+/// that the two instructions can be paired in a combine.
+///
+/// \p I1 precedes \p I2 in the block; \p I1DestReg and \p I2DestReg are their
+/// destination registers. On success \p DoInsertAtI1 is set to true when I2
+/// can be moved up to I1, and to false when I1 has to be moved down to I2.
+/// On success kill flags on I1, I2 or an intervening instruction may be
+/// updated. While scanning for the second option, debug values that read
+/// \p I1DestReg are appended to DbgMItoMove (even if the scan later fails).
+bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
+ MachineInstr &I2,
+ unsigned I1DestReg,
+ unsigned I2DestReg,
+ bool &DoInsertAtI1) {
+ unsigned I2UseReg = UseReg(I2.getOperand(1));
+
+ // It is not safe to move I1 and I2 into one combine if I2 has a true
+ // dependence on I1.
+ if (I2UseReg && I1.modifiesRegister(I2UseReg, TRI))
+ return false;
+
+ bool isSafe = true;
+
+ // First try to move I2 towards I1.
+ {
+ // A reverse_iterator instantiated like below starts before I2, and I1
+ // respectively.
+ // Look at instructions I in between I2 and (excluding) I1.
+ MachineBasicBlock::reverse_iterator I(I2),
+ End = --(MachineBasicBlock::reverse_iterator(I1));
+ // At O3 we got better results (dhrystone!) by being more conservative.
+ if (!ShouldCombineAggressively)
+ End = MachineBasicBlock::reverse_iterator(I1);
+ // If I2 kills its operand and we move I2 over an instruction that also
+ // uses I2's use reg we need to modify that (first) instruction to now kill
+ // this reg.
+ unsigned KilledOperand = 0;
+ if (I2.killsRegister(I2UseReg))
+ KilledOperand = I2UseReg;
+ MachineInstr *KillingInstr = nullptr;
+
+ for (; I != End; ++I) {
+ // If the intervening instruction I:
+ // * modifies I2's use reg
+ // * modifies I2's def reg
+ // * reads I2's def reg
+ // * or has unmodelled side effects
+ // we can't move I2 across it.
+ // Debug values are skipped and never block the move.
+ if (I->isDebugValue())
+ continue;
+
+ if (isUnsafeToMoveAcross(*I, I2UseReg, I2DestReg, TRI)) {
+ isSafe = false;
+ break;
+ }
+
+ // Update first use of the killed operand.
+ if (!KillingInstr && KilledOperand &&
+ I->readsRegister(KilledOperand, TRI))
+ KillingInstr = &*I;
+ }
+ if (isSafe) {
+ // Update the intermediate instruction with the kill flag.
+ if (KillingInstr) {
+ bool Added = KillingInstr->addRegisterKilled(KilledOperand, TRI, true);
+ (void)Added; // suppress compiler warning
+ assert(Added && "Must successfully update kill flag");
+ removeKillInfo(I2, KilledOperand);
+ }
+ DoInsertAtI1 = true;
+ return true;
+ }
+ }
+
+ // Try to move I1 towards I2.
+ {
+ // Look at instructions I in between I1 and (excluding) I2.
+ MachineBasicBlock::iterator I(I1), End(I2);
+ // At O3 we got better results (dhrystone) by being more conservative here.
+ if (!ShouldCombineAggressively)
+ End = std::next(MachineBasicBlock::iterator(I2));
+ unsigned I1UseReg = UseReg(I1.getOperand(1));
+ // Track killed operands. If we move across an instruction that kills our
+ // operand, we need to update the kill information on the moved I1. It kills
+ // the operand now.
+ MachineInstr *KillingInstr = nullptr;
+ unsigned KilledOperand = 0;
+
+ while(++I != End) {
+ MachineInstr &MI = *I;
+ // We can't move I1 across the intervening instruction MI if it:
+ // * modifies I1's use reg
+ // * modifies I1's def reg
+ // * reads I1's def reg
+ // * or has unmodelled side effects
+ // * or has a killed aliased register use of I1's use reg
+ // The last case is special: llvm has no api to remove a kill flag for
+ // a register (a removeRegisterKilled() analogous to addRegisterKilled)
+ // that handles aliased registers correctly. Example:
+ // %D4<def> = A2_tfrpi 16
+ // %R6<def> = A2_tfr %R9
+ // %R8<def> = KILL %R8, %D4<imp-use,kill>
+ // If we want to move R6 = across the KILL instruction we would have
+ // to remove the %D4<imp-use,kill> operand. For now, we are
+ // conservative and disallow the move.
+ if (MI.isDebugValue()) {
+ if (MI.readsRegister(I1DestReg, TRI)) // Move this instruction after I2.
+ DbgMItoMove.push_back(&MI);
+ continue;
+ }
+
+ if (isUnsafeToMoveAcross(MI, I1UseReg, I1DestReg, TRI) ||
+ // Check for an aliased register kill. Bail out if we see one.
+ (!MI.killsRegister(I1UseReg) && MI.killsRegister(I1UseReg, TRI)))
+ return false;
+
+ // Check for an exact kill (registers match).
+ if (I1UseReg && MI.killsRegister(I1UseReg)) {
+ assert(!KillingInstr && "Should only see one killing instruction");
+ KilledOperand = I1UseReg;
+ KillingInstr = &MI;
+ }
+ }
+ if (KillingInstr) {
+ removeKillInfo(*KillingInstr, KilledOperand);
+ // Update I1 to set the kill flag. This flag will later be picked up by
+ // the new COMBINE instruction.
+ bool Added = I1.addRegisterKilled(KilledOperand, TRI);
+ (void)Added; // suppress compiler warning
+ assert(Added && "Must successfully update kill flag");
+ }
+ DoInsertAtI1 = false;
+ }
+
+ return true;
+}
+
+/// findPotentialNewifiableTFRs - Finds transfers that feed stores that could be
+/// newified. (A use of a 64 bit register define can not be newified)
+///
+/// Walks \p BB once, remembering for each integer register (and each
+/// sub-register of a double register) the instruction that last defined it.
+/// For every instruction that TII reports as a possible new-value store, the
+/// last definer of each register it uses is inserted into
+/// PotentiallyNewifiableTFR, provided that definer is a combinable transfer
+/// and lies close enough to the store.
+void
+HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
+ // Maps a register to the instruction that most recently defined it.
+ DenseMap<unsigned, MachineInstr *> LastDef;
+ for (MachineInstr &MI : BB) {
+ if (MI.isDebugValue())
+ continue;
+
+ // Mark TFRs that feed a potential new value store as such.
+ if (TII->mayBeNewStore(MI)) {
+ // Look for uses of TFR instructions.
+ for (unsigned OpdIdx = 0, OpdE = MI.getNumOperands(); OpdIdx != OpdE;
+ ++OpdIdx) {
+ MachineOperand &Op = MI.getOperand(OpdIdx);
+
+ // Skip over anything except register uses.
+ if (!Op.isReg() || !Op.isUse() || !Op.getReg())
+ continue;
+
+ // Look for the defining instruction.
+ unsigned Reg = Op.getReg();
+ MachineInstr *DefInst = LastDef[Reg];
+ if (!DefInst)
+ continue;
+ if (!isCombinableInstType(*DefInst, TII, ShouldCombineAggressively))
+ continue;
+
+ // Only close newifiable stores should influence the decision.
+ // Ignore the debug instructions in between.
+ // The count starts at DefInst itself and stops just before the store.
+ MachineBasicBlock::iterator It(DefInst);
+ unsigned NumInstsToDef = 0;
+ while (&*It != &MI) {
+ if (!It->isDebugValue())
+ ++NumInstsToDef;
+ ++It;
+ }
+
+ if (NumInstsToDef > MaxNumOfInstsBetweenNewValueStoreAndTFR)
+ continue;
+
+ PotentiallyNewifiableTFR.insert(DefInst);
+ }
+ // Skip to next instruction.
+ // Note that register definitions made by the store itself are therefore
+ // not recorded in LastDef.
+ continue;
+ }
+
+ // Put instructions that last defined integer or double registers into the
+ // map.
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg() || !Op.isDef() || !Op.getReg())
+ continue;
+ unsigned Reg = Op.getReg();
+ if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
+ // A double register definition is recorded for each of its
+ // sub-registers.
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ LastDef[*SubRegs] = &MI;
+ }
+ } else if (Hexagon::IntRegsRegClass.contains(Reg))
+ LastDef[Reg] = &MI;
+ }
+ }
+}
+
+/// Pass entry point. For every basic block of \p MF, computes the set of
+/// potentially newifiable transfers and then tries to pair each combinable
+/// instruction with a later one, replacing each pair with a combine.
+/// Returns true if at least one pair was combined; returns false immediately
+/// when the pass is disabled on the command line.
+bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
+
+ if (IsCombinesDisabled) return false;
+
+ bool HasChanged = false;
+
+ // Get target info.
+ ST = &MF.getSubtarget<HexagonSubtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = ST->getInstrInfo();
+
+ const Function *F = MF.getFunction();
+ bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
+
+ // Combine aggressively (for code size)
+ // The flag is set whenever the optimization level is at most
+ // CodeGenOpt::Default.
+ ShouldCombineAggressively =
+ MF.getTarget().getOptLevel() <= CodeGenOpt::Default;
+
+ // Traverse basic blocks.
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+ ++BI) {
+ PotentiallyNewifiableTFR.clear();
+ findPotentialNewifiableTFRs(*BI);
+
+ // Traverse instructions in basic block.
+ // MI is advanced past I1 before any rewriting; combine() advances it
+ // further if it would otherwise point at an instruction it removes or moves.
+ for(MachineBasicBlock::iterator MI = BI->begin(), End = BI->end();
+ MI != End;) {
+ MachineInstr &I1 = *MI++;
+
+ if (I1.isDebugValue())
+ continue;
+
+ // Don't combine a TFR whose user could be newified (instructions that
+ // define double registers can not be newified - Programmer's Ref Manual
+ // 5.4.2 New-value stores).
+ if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(&I1))
+ continue;
+
+ // Ignore instructions that are not combinable.
+ if (!isCombinableInstType(I1, TII, ShouldCombineAggressively))
+ continue;
+
+ // Find a second instruction that can be merged into a combine
+ // instruction. In addition, also find all the debug instructions that
+ // need to be moved along with it.
+ // OptForSize is passed as findPairable's AllowC64 argument.
+ bool DoInsertAtI1 = false;
+ DbgMItoMove.clear();
+ MachineInstr *I2 = findPairable(I1, DoInsertAtI1, OptForSize);
+ if (I2) {
+ HasChanged = true;
+ combine(I1, *I2, MI, DoInsertAtI1, OptForSize);
+ }
+ }
+ }
+
+ return HasChanged;
+}
+
+/// findPairable - Returns an instruction that can be merged with \p I1 into a
+/// COMBINE instruction or nullptr if no such instruction can be found. Returns
+/// true in \p DoInsertAtI1 if the combine must be inserted at instruction
+/// \p I1, false if the combine must be inserted at the returned instruction.
+/// \p AllowC64 is forwarded to areCombinableOperations.
+MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
+ bool &DoInsertAtI1,
+ bool AllowC64) {
+ // Start at the first non-debug instruction after I1.
+ MachineBasicBlock::iterator I2 = std::next(MachineBasicBlock::iterator(I1));
+ while (I2 != I1.getParent()->end() && I2->isDebugValue())
+ ++I2;
+
+ unsigned I1DestReg = I1.getOperand(0).getReg();
+
+ for (MachineBasicBlock::iterator End = I1.getParent()->end(); I2 != End;
+ ++I2) {
+ // Bail out early if we see a second definition of I1DestReg.
+ if (I2->modifiesRegister(I1DestReg, TRI))
+ break;
+
+ // Ignore non-combinable instructions.
+ if (!isCombinableInstType(*I2, TII, ShouldCombineAggressively))
+ continue;
+
+ // Don't combine a TFR whose user could be newified.
+ if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(&*I2))
+ continue;
+
+ unsigned I2DestReg = I2->getOperand(0).getReg();
+
+ // Check that registers are adjacent and that the first destination register
+ // is even.
+ bool IsI1LowReg = (I2DestReg - I1DestReg) == 1;
+ bool IsI2LowReg = (I1DestReg - I2DestReg) == 1;
+ unsigned FirstRegIndex = IsI1LowReg ? I1DestReg : I2DestReg;
+ if ((!IsI1LowReg && !IsI2LowReg) || !isEvenReg(FirstRegIndex))
+ continue;
+
+ // Check that the two instructions are combinable. V4 allows more
+ // instructions to be merged into a combine.
+ // The order matters because in an A2_tfrsi we may be able to encode an int8
+ // as the hi reg operand but only a uint6 as the low reg operand.
+ if ((IsI2LowReg && !areCombinableOperations(TRI, I1, *I2, AllowC64)) ||
+ (IsI1LowReg && !areCombinableOperations(TRI, *I2, I1, AllowC64)))
+ break;
+
+ if (isSafeToMoveTogether(I1, *I2, I1DestReg, I2DestReg, DoInsertAtI1))
+ return &*I2;
+
+ // Not safe. Stop searching.
+ break;
+ }
+ return nullptr;
+}
+
+/// Replaces the pair \p I1 / \p I2 with a single instruction that writes the
+/// register pair covering both destinations, then erases \p I1 and \p I2.
+/// The new instruction is inserted before \p I1 when \p DoInsertAtI1 is true
+/// and before \p I2 otherwise. \p MI is the caller's iteration cursor; it is
+/// advanced when it points at I2 or at a debug value that gets relocated.
+/// \p OptForSize is one of the conditions for emitting CONST64.
+void HexagonCopyToCombine::combine(MachineInstr &I1, MachineInstr &I2,
+ MachineBasicBlock::iterator &MI,
+ bool DoInsertAtI1, bool OptForSize) {
+ // We are going to delete I2. If MI points to I2 advance it to the next
+ // instruction.
+ if (MI == I2.getIterator())
+ ++MI;
+
+ // Figure out whether I1 or I2 goes into the lowreg part.
+ unsigned I1DestReg = I1.getOperand(0).getReg();
+ unsigned I2DestReg = I2.getOperand(0).getReg();
+ bool IsI1Loreg = (I2DestReg - I1DestReg) == 1;
+ unsigned LoRegDef = IsI1Loreg ? I1DestReg : I2DestReg;
+ unsigned SubLo;
+
+ // Pick the register class of the pair and the sub-register index of its low
+ // half from the class of the low destination register.
+ const TargetRegisterClass *SuperRC = nullptr;
+ if (Hexagon::IntRegsRegClass.contains(LoRegDef)) {
+ SuperRC = &Hexagon::DoubleRegsRegClass;
+ SubLo = Hexagon::isub_lo;
+ } else if (Hexagon::VectorRegsRegClass.contains(LoRegDef)) {
+ assert(ST->useHVXOps());
+ if (ST->useHVXSglOps())
+ SuperRC = &Hexagon::VecDblRegsRegClass;
+ else
+ SuperRC = &Hexagon::VecDblRegs128BRegClass;
+ SubLo = Hexagon::vsub_lo;
+ } else
+ llvm_unreachable("Unexpected register class");
+
+ // Get the double word register.
+ unsigned DoubleRegDest = TRI->getMatchingSuperReg(LoRegDef, SubLo, SuperRC);
+ assert(DoubleRegDest != 0 && "Expect a valid register");
+
+ // Setup source operands.
+ MachineOperand &LoOperand = IsI1Loreg ? I1.getOperand(1) : I2.getOperand(1);
+ MachineOperand &HiOperand = IsI1Loreg ? I2.getOperand(1) : I1.getOperand(1);
+
+ // Figure out which source is a register and which a constant.
+ bool IsHiReg = HiOperand.isReg();
+ bool IsLoReg = LoOperand.isReg();
+
+ // There is a combine of two constant extended values into CONST64.
+ bool IsC64 = OptForSize && LoOperand.isImm() && HiOperand.isImm() &&
+ isGreaterThanNBitTFRI<16>(I1) && isGreaterThanNBitTFRI<16>(I2);
+
+ MachineBasicBlock::iterator InsertPt(DoInsertAtI1 ? I1 : I2);
+ // Emit combine.
+ if (IsHiReg && IsLoReg)
+ emitCombineRR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+ else if (IsHiReg)
+ emitCombineRI(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+ else if (IsLoReg)
+ emitCombineIR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+ else if (IsC64 && !IsConst64Disabled)
+ emitConst64(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+ else
+ emitCombineII(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+
+ // Move debug instructions along with I1 if it's being
+ // moved towards I2.
+ if (!DoInsertAtI1 && DbgMItoMove.size() != 0) {
+ // Insert debug instructions at the new location before I2.
+ MachineBasicBlock *BB = InsertPt->getParent();
+ for (auto NewMI : DbgMItoMove) {
+ // If iterator MI is pointing to DEBUG_VAL, make sure
+ // MI now points to next relevant instruction.
+ if (NewMI == MI)
+ ++MI;
+ BB->splice(InsertPt, BB, NewMI);
+ }
+ }
+
+ // The original transfers are no longer needed.
+ I1.eraseFromParent();
+ I2.eraseFromParent();
+}
+
+/// Emits "DoubleDestReg = CONST64 #V" before \p InsertPt, where the upper 32
+/// bits of V come from \p HiOperand and the lower 32 bits from \p LoOperand.
+/// Both operands must be plain immediates. The debug location is taken from
+/// the instruction at \p InsertPt.
+void HexagonCopyToCombine::emitConst64(MachineBasicBlock::iterator &InsertPt,
+                                       unsigned DoubleDestReg,
+                                       MachineOperand &HiOperand,
+                                       MachineOperand &LoOperand) {
+  DEBUG(dbgs() << "Found a CONST64\n");
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+  assert(LoOperand.isImm() && HiOperand.isImm() &&
+         "Both operands must be immediate");
+
+  // Compose the value in unsigned arithmetic: left-shifting a negative signed
+  // value (e.g. a high word of -1) is undefined behavior before C++20.
+  const uint64_t HiBits = static_cast<uint64_t>(HiOperand.getImm()) << 32;
+  const uint64_t LoBits =
+      static_cast<uint64_t>(LoOperand.getImm()) & 0x0ffffffffULL;
+  const int64_t V = static_cast<int64_t>(HiBits | LoBits);
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::CONST64), DoubleDestReg)
+    .addImm(V);
+}
+
+/// Emits a combine of two non-register operands into \p DoubleDestReg before
+/// \p InsertPt. When one operand is symbolic (global, block address, jump
+/// table index or constant pool index), A2_combineii is used if the symbolic
+/// operand is the high one and A4_combineii if it is the low one; the other
+/// operand is read as an immediate. For two plain immediates the opcode is
+/// chosen from which of them fits in a signed 8-bit field.
+void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
+ unsigned DoubleDestReg,
+ MachineOperand &HiOperand,
+ MachineOperand &LoOperand) {
+ DebugLoc DL = InsertPt->getDebugLoc();
+ MachineBasicBlock *BB = InsertPt->getParent();
+
+ // Handle globals.
+ if (HiOperand.isGlobal()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+ .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+ HiOperand.getTargetFlags())
+ .addImm(LoOperand.getImm());
+ return;
+ }
+ if (LoOperand.isGlobal()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+ LoOperand.getTargetFlags());
+ return;
+ }
+
+ // Handle block addresses.
+ if (HiOperand.isBlockAddress()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+ .addBlockAddress(HiOperand.getBlockAddress(), HiOperand.getOffset(),
+ HiOperand.getTargetFlags())
+ .addImm(LoOperand.getImm());
+ return;
+ }
+ if (LoOperand.isBlockAddress()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addBlockAddress(LoOperand.getBlockAddress(), LoOperand.getOffset(),
+ LoOperand.getTargetFlags());
+ return;
+ }
+
+ // Handle jump tables.
+ if (HiOperand.isJTI()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+ .addJumpTableIndex(HiOperand.getIndex(), HiOperand.getTargetFlags())
+ .addImm(LoOperand.getImm());
+ return;
+ }
+ if (LoOperand.isJTI()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addJumpTableIndex(LoOperand.getIndex(), LoOperand.getTargetFlags());
+ return;
+ }
+
+ // Handle constant pools.
+ if (HiOperand.isCPI()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+ .addConstantPoolIndex(HiOperand.getIndex(), HiOperand.getOffset(),
+ HiOperand.getTargetFlags())
+ .addImm(LoOperand.getImm());
+ return;
+ }
+ if (LoOperand.isCPI()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addConstantPoolIndex(LoOperand.getIndex(), LoOperand.getOffset(),
+ LoOperand.getTargetFlags());
+ return;
+ }
+
+ // From here on both operands are read as plain immediates.
+ // First preference should be given to Hexagon::A2_combineii instruction
+ // as it can include U6 (in Hexagon::A4_combineii) as well.
+ // In this instruction, HiOperand is const extended, if required.
+ if (isInt<8>(LoOperand.getImm())) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addImm(LoOperand.getImm());
+ return;
+ }
+
+ // In this instruction, LoOperand is const extended, if required.
+ if (isInt<8>(HiOperand.getImm())) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addImm(LoOperand.getImm());
+ return;
+ }
+
+ // Neither immediate fits in 8 bits: fall back to A2_combineii.
+ // Insert new combine instruction.
+ // DoubleRegDest = combine #HiImm, #LoImm
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addImm(LoOperand.getImm());
+}
+
+/// Emits "DoubleDestReg = combine(#Hi, LoReg)" (A4_combineir) before
+/// \p InsertPt. \p HiOperand may be a global, block address, jump table
+/// index, constant pool index or plain immediate; \p LoOperand must be a
+/// register, whose kill state is carried over to the new instruction.
+void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  const unsigned LowReg = LoOperand.getReg();
+  const unsigned LowKill = getKillRegState(LoOperand.isKill());
+
+  DebugLoc Loc = InsertPt->getDebugLoc();
+  MachineBasicBlock *Parent = InsertPt->getParent();
+
+  // Create the instruction with its destination, then append the high
+  // operand in whichever form it comes.
+  MachineInstrBuilder MIB = BuildMI(
+      *Parent, InsertPt, Loc, TII->get(Hexagon::A4_combineir), DoubleDestReg);
+
+  if (HiOperand.isGlobal())
+    MIB.addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+                         HiOperand.getTargetFlags());
+  else if (HiOperand.isBlockAddress())
+    MIB.addBlockAddress(HiOperand.getBlockAddress(), HiOperand.getOffset(),
+                        HiOperand.getTargetFlags());
+  else if (HiOperand.isJTI())
+    MIB.addJumpTableIndex(HiOperand.getIndex(), HiOperand.getTargetFlags());
+  else if (HiOperand.isCPI())
+    MIB.addConstantPoolIndex(HiOperand.getIndex(), HiOperand.getOffset(),
+                             HiOperand.getTargetFlags());
+  else
+    MIB.addImm(HiOperand.getImm());
+
+  // The low half always comes from the register.
+  MIB.addReg(LowReg, LowKill);
+}
+
+/// Emits "DoubleDestReg = combine(HiReg, #Lo)" (A4_combineri) before
+/// \p InsertPt. \p HiOperand must be a register, whose kill state is carried
+/// over; \p LoOperand may be a global, block address, jump table index,
+/// constant pool index or plain immediate.
+void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  const unsigned HighKill = getKillRegState(HiOperand.isKill());
+  const unsigned HighReg = HiOperand.getReg();
+
+  DebugLoc Loc = InsertPt->getDebugLoc();
+  MachineBasicBlock *Parent = InsertPt->getParent();
+
+  // The high half always comes from the register and is added first.
+  MachineInstrBuilder MIB = BuildMI(
+      *Parent, InsertPt, Loc, TII->get(Hexagon::A4_combineri), DoubleDestReg);
+  MIB.addReg(HighReg, HighKill);
+
+  // Append the low operand in whichever form it comes.
+  if (LoOperand.isGlobal())
+    MIB.addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+                         LoOperand.getTargetFlags());
+  else if (LoOperand.isBlockAddress())
+    MIB.addBlockAddress(LoOperand.getBlockAddress(), LoOperand.getOffset(),
+                        LoOperand.getTargetFlags());
+  else if (LoOperand.isJTI())
+    MIB.addJumpTableIndex(LoOperand.getIndex(), LoOperand.getTargetFlags());
+  else if (LoOperand.isCPI())
+    MIB.addConstantPoolIndex(LoOperand.getIndex(), LoOperand.getOffset(),
+                             LoOperand.getTargetFlags());
+  else
+    MIB.addImm(LoOperand.getImm());
+}
+
+/// Emits "DoubleDestReg = combine(HiReg, LoReg)" before \p InsertPt. The
+/// opcode is A2_combinew for a DoubleRegs destination and one of the two
+/// V6_vcombine flavors for a VecDblRegs destination; kill flags of both source
+/// operands are carried over.
+void HexagonCopyToCombine::emitCombineRR(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  const unsigned LowKill = getKillRegState(LoOperand.isKill());
+  const unsigned HighKill = getKillRegState(HiOperand.isKill());
+  const unsigned LowReg = LoOperand.getReg();
+  const unsigned HighReg = HiOperand.getReg();
+
+  DebugLoc Loc = InsertPt->getDebugLoc();
+  MachineBasicBlock *Parent = InsertPt->getParent();
+
+  // Select the opcode from the register class of the destination pair.
+  unsigned CombineOpc;
+  if (Hexagon::DoubleRegsRegClass.contains(DoubleDestReg)) {
+    CombineOpc = Hexagon::A2_combinew;
+  } else if (Hexagon::VecDblRegsRegClass.contains(DoubleDestReg)) {
+    assert(ST->useHVXOps());
+    CombineOpc = ST->useHVXSglOps() ? Hexagon::V6_vcombine
+                                    : Hexagon::V6_vcombine_128B;
+  } else {
+    llvm_unreachable("Unexpected register");
+  }
+
+  // DoubleRegDest = combine HiReg, LoReg
+  MachineInstrBuilder MIB =
+      BuildMI(*Parent, InsertPt, Loc, TII->get(CombineOpc), DoubleDestReg);
+  MIB.addReg(HighReg, HighKill);
+  MIB.addReg(LowReg, LowKill);
+}
+
+/// Factory for the copy-to-combine pass: allocates a fresh
+/// HexagonCopyToCombine object and returns it as a FunctionPass pointer.
+FunctionPass *llvm::createHexagonCopyToCombine() {
+  FunctionPass *NewPass = new HexagonCopyToCombine();
+  return NewPass;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
new file mode 100644
index 000000000000..a5351cd08da5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -0,0 +1,1034 @@
+//===--- HexagonEarlyIfConv.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a Hexagon-specific if-conversion pass that runs on the
+// SSA form.
+// In SSA it is not straightforward to represent instructions that condi-
+// tionally define registers, since a conditionally-defined register may
+// only be used under the same condition on which the definition was based.
+// To avoid complications of this nature, this pass will only generate
+// predicated stores, and speculate other instructions from the "if-conver-
+// ted" block.
+// The code will recognize CFG patterns where a block with a conditional
+// branch "splits" into a "true block" and a "false block". Either of these
+// could be omitted (in case of a triangle, for example).
+// If after conversion of the side block(s) the CFG allows it, the resul-
+// ting blocks may be merged. If the "join" block contained PHI nodes, they
+// will be replaced with MUX (or MUX-like) instructions to maintain the
+// semantics of the PHI.
+//
+// Example:
+//
+// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1
+// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0
+// J2_jumpt %vreg41<kill>, <BB#5>, %PC<imp-def,dead>
+// J2_jump <BB#4>, %PC<imp-def,dead>
+// Successors according to CFG: BB#4(62) BB#5(62)
+//
+// BB#4: derived from LLVM BB %if.then
+// Predecessors according to CFG: BB#3
+// %vreg11<def> = A2_addp %vreg6, %vreg10
+// S2_storerd_io %vreg32, 16, %vreg11
+// Successors according to CFG: BB#5
+//
+// BB#5: derived from LLVM BB %if.end
+// Predecessors according to CFG: BB#3 BB#4
+// %vreg12<def> = PHI %vreg6, <BB#3>, %vreg11, <BB#4>
+// %vreg13<def> = A2_addp %vreg7, %vreg12
+// %vreg42<def> = C2_cmpeqi %vreg9, 10
+// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead>
+// J2_jump <BB#6>, %PC<imp-def,dead>
+// Successors according to CFG: BB#6(4) BB#3(124)
+//
+// would become:
+//
+// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1
+// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0
+// spec-> %vreg11<def> = A2_addp %vreg6, %vreg10
+// pred-> S2_pstorerdf_io %vreg41, %vreg32, 16, %vreg11
+// %vreg46<def> = PS_pselect %vreg41, %vreg6, %vreg11
+// %vreg13<def> = A2_addp %vreg7, %vreg46
+// %vreg42<def> = C2_cmpeqi %vreg9, 10
+// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead>
+// J2_jump <BB#6>, %PC<imp-def,dead>
+// Successors according to CFG: BB#6 BB#3
+
+#define DEBUG_TYPE "hexagon-eif"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <iterator>
+
+using namespace llvm;
+
+namespace llvm {
+
+ // Factory function for the pass; defined at the end of this file.
+ FunctionPass *createHexagonEarlyIfConversion();
+ // Pass initialization hook; invoked from the pass constructor below.
+ void initializeHexagonEarlyIfConversionPass(PassRegistry& Registry);
+
+} // end namespace llvm
+
+namespace {
+
+ // Hidden flag (off by default). When set, runOnMachineFunction fetches
+ // MachineBranchProbabilityInfo and isProfitable uses it to reject
+ // one-sided branches.
+ cl::opt<bool> EnableHexagonBP("enable-hexagon-br-prob", cl::Hidden,
+ cl::init(false), cl::desc("Enable branch probability info"));
+ // Hidden size threshold (default 6) that isProfitable compares against
+ // the number of instructions to be predicated/speculated.
+ cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden,
+ cl::desc("Size limit in Hexagon early if-conversion"));
+
+ // Stream adaptor that prints a (possibly null) basic block as "#<number>",
+ // or as "<none>" when the pointer is null.
+ struct PrintMB {
+   const MachineBasicBlock *MB;
+
+   PrintMB(const MachineBasicBlock *B) : MB(B) {}
+ };
+ raw_ostream &operator<< (raw_ostream &OS, const PrintMB &P) {
+   if (P.MB) {
+     OS << '#' << P.MB->getNumber();
+     return OS;
+   }
+   OS << "<none>";
+   return OS;
+ }
+
+ // Description of a recognized CFG shape: a block ending in a conditional
+ // branch on PredR (SplitB), the blocks taken when PredR is true/false
+ // (either may be null), and the block where the two paths meet (may be
+ // null).
+ struct FlowPattern {
+   MachineBasicBlock *SplitB = nullptr;
+   MachineBasicBlock *TrueB = nullptr;
+   MachineBasicBlock *FalseB = nullptr;
+   MachineBasicBlock *JoinB = nullptr;
+   unsigned PredR = 0;
+
+   FlowPattern() = default;
+   FlowPattern(MachineBasicBlock *B, unsigned PR, MachineBasicBlock *TB,
+               MachineBasicBlock *FB, MachineBasicBlock *JB)
+     : SplitB(B), TrueB(TB), FalseB(FB), JoinB(JB), PredR(PR) {}
+ };
+
+ // Stream adaptor that prints all fields of a FlowPattern. The register
+ // info is needed to print the predicate register.
+ struct PrintFP {
+   const FlowPattern &FP;
+   const TargetRegisterInfo &TRI;
+
+   PrintFP(const FlowPattern &P, const TargetRegisterInfo &T)
+     : FP(P), TRI(T) {}
+
+   friend raw_ostream &operator<< (raw_ostream &OS, const PrintFP &P);
+ };
+ // The printer is only referenced from DEBUG output, so mark it as
+ // possibly unused.
+ raw_ostream &operator<<(raw_ostream &OS,
+                         const PrintFP &P) LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) {
+   const FlowPattern &Pat = P.FP;
+   OS << "{ SplitB:" << PrintMB(Pat.SplitB);
+   OS << ", PredR:" << PrintReg(Pat.PredR, &P.TRI);
+   OS << ", TrueB:" << PrintMB(Pat.TrueB);
+   OS << ", FalseB:" << PrintMB(Pat.FalseB);
+   OS << ", JoinB:" << PrintMB(Pat.JoinB) << " }";
+   return OS;
+ }
+
+ // Machine function pass that if-converts small "triangle"/"diamond" CFG
+ // shapes while the function is still in SSA form: stores from the side
+ // blocks are predicated, all other instructions are speculated, and PHIs in
+ // the join block are replaced with mux-like instructions.
+ class HexagonEarlyIfConversion : public MachineFunctionPass {
+ public:
+   static char ID;
+
+   HexagonEarlyIfConversion() : MachineFunctionPass(ID) {
+     initializeHexagonEarlyIfConversionPass(*PassRegistry::getPassRegistry());
+   }
+
+   StringRef getPassName() const override {
+     return "Hexagon early if conversion";
+   }
+
+   // Branch probabilities, the dominator tree and loop info are required;
+   // the dominator tree is kept up to date and declared preserved.
+   void getAnalysisUsage(AnalysisUsage &AU) const override {
+     AU.addRequired<MachineBranchProbabilityInfo>();
+     AU.addRequired<MachineDominatorTree>();
+     AU.addPreserved<MachineDominatorTree>();
+     AU.addRequired<MachineLoopInfo>();
+     MachineFunctionPass::getAnalysisUsage(AU);
+   }
+
+   bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+   using BlockSetType = DenseSet<MachineBasicBlock*>;
+
+   // Pattern recognition and traversal.
+   bool isPreheader(const MachineBasicBlock *B) const;
+   bool matchFlowPattern(MachineBasicBlock *B, MachineLoop *L,
+         FlowPattern &FP);
+   bool visitBlock(MachineBasicBlock *B, MachineLoop *L);
+   bool visitLoop(MachineLoop *L);
+
+   // Legality and profitability checks.
+   bool hasEHLabel(const MachineBasicBlock *B) const;
+   bool hasUncondBranch(const MachineBasicBlock *B) const;
+   bool isValidCandidate(const MachineBasicBlock *B) const;
+   bool usesUndefVReg(const MachineInstr *MI) const;
+   bool isValid(const FlowPattern &FP) const;
+   unsigned countPredicateDefs(const MachineBasicBlock *B) const;
+   unsigned computePhiCost(MachineBasicBlock *B) const;
+   bool isProfitable(const FlowPattern &FP) const;
+   bool isPredicableStore(const MachineInstr *MI) const;
+   bool isSafeToSpeculate(const MachineInstr *MI) const;
+
+   // Predication of individual instructions and whole blocks.
+   unsigned getCondStoreOpcode(unsigned Opc, bool IfTrue) const;
+   void predicateInstr(MachineBasicBlock *ToB, MachineBasicBlock::iterator At,
+         MachineInstr *MI, unsigned PredR, bool IfTrue);
+   void predicateBlockNB(MachineBasicBlock *ToB,
+         MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
+         unsigned PredR, bool IfTrue);
+
+   void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP);
+   void convert(const FlowPattern &FP);
+
+   // CFG cleanup after conversion.
+   void removeBlock(MachineBasicBlock *B);
+   void eliminatePhis(MachineBasicBlock *B);
+   void replacePhiEdges(MachineBasicBlock *OldB, MachineBasicBlock *NewB);
+   void mergeBlocks(MachineBasicBlock *PredB, MachineBasicBlock *SuccB);
+   void simplifyFlowGraph(const FlowPattern &FP);
+
+   // Per-function state, (re)assigned at the start of runOnMachineFunction.
+   // Every pointer has an in-class initializer so that none of them
+   // (MBPI in particular) is ever left indeterminate.
+   const HexagonInstrInfo *HII = nullptr;
+   const TargetRegisterInfo *TRI = nullptr;
+   MachineFunction *MFN = nullptr;
+   MachineRegisterInfo *MRI = nullptr;
+   MachineDominatorTree *MDT = nullptr;
+   MachineLoopInfo *MLI = nullptr;
+   // Blocks erased so far in the current function; consulted by visitBlock.
+   BlockSetType Deleted;
+   // Null unless -enable-hexagon-br-prob is given.
+   const MachineBranchProbabilityInfo *MBPI = nullptr;
+ };
+
+ // Storage for the static pass ID that the constructor hands to
+ // MachineFunctionPass.
+ char HexagonEarlyIfConversion::ID = 0;
+
+} // end anonymous namespace
+
+// Pass registration: argument name "hexagon-eif", human-readable name
+// "Hexagon early if conversion". NOTE(review): the two trailing flags are
+// presumably "CFG-only" and "is-analysis" (both false) -- confirm against
+// the INITIALIZE_PASS macro definition.
+INITIALIZE_PASS(HexagonEarlyIfConversion, "hexagon-eif",
+ "Hexagon early if conversion", false, false)
+
+// Return true if B has exactly one successor and that successor is the
+// header of the loop it belongs to.
+bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const {
+  if (B->succ_size() == 1) {
+    MachineBasicBlock *OnlySucc = *B->succ_begin();
+    if (MachineLoop *SuccLoop = MLI->getLoopFor(OnlySucc))
+      return SuccLoop->getHeader() == OnlySucc;
+  }
+  return false;
+}
+
+// Try to recognize an if-convertible CFG shape rooted at block B, where all
+// involved blocks belong to loop L (L may be null for code outside loops).
+// On success fills FP and returns true. In the resulting pattern every
+// non-null TrueB/FalseB has B as its only predecessor and exactly one
+// successor.
+bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
+      MachineLoop *L, FlowPattern &FP) {
+  DEBUG(dbgs() << "Checking flow pattern at BB#" << B->getNumber() << "\n");
+
+  // Interested only in conditional branches, no .new, no new-value, etc.
+  // Check the terminators directly, it's easier than handling all responses
+  // from AnalyzeBranch.
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  MachineBasicBlock::const_iterator T1I = B->getFirstTerminator();
+  if (T1I == B->end())
+    return false;
+  unsigned Opc = T1I->getOpcode();
+  if (Opc != Hexagon::J2_jumpt && Opc != Hexagon::J2_jumpf)
+    return false;
+  unsigned PredR = T1I->getOperand(0).getReg();
+
+  // Get the layout successor, or 0 if B does not have one.
+  MachineFunction::iterator NextBI = std::next(MachineFunction::iterator(B));
+  MachineBasicBlock *NextB = (NextBI != MFN->end()) ? &*NextBI : nullptr;
+
+  MachineBasicBlock *T1B = T1I->getOperand(1).getMBB();
+  MachineBasicBlock::const_iterator T2I = std::next(T1I);
+  // The second terminator should be an unconditional branch.
+  assert(T2I == B->end() || T2I->getOpcode() == Hexagon::J2_jump);
+  MachineBasicBlock *T2B = (T2I == B->end()) ? NextB
+                                             : T2I->getOperand(0).getMBB();
+  if (T1B == T2B) {
+    // XXX merge if T1B == NextB, or convert branch to unconditional.
+    // mark as diamond with both sides equal?
+    return false;
+  }
+  // Loop could be null for both.
+  if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L)
+    return false;
+
+  // Record the true/false blocks in such a way that "true" means "if (PredR)",
+  // and "false" means "if (!PredR)".
+  if (Opc == Hexagon::J2_jumpt)
+    TB = T1B, FB = T2B;
+  else
+    TB = T2B, FB = T1B;
+
+  if (!MDT->properlyDominates(B, TB) || !MDT->properlyDominates(B, FB))
+    return false;
+
+  // Detect triangle first. In case of a triangle, one of the blocks TB/FB
+  // can fall through into the other, in other words, it will be executed
+  // in both cases. We only want to predicate the block that is executed
+  // conditionally.
+  unsigned TNP = TB->pred_size(), FNP = FB->pred_size();
+  unsigned TNS = TB->succ_size(), FNS = FB->succ_size();
+
+  // A block is predicable if it has one predecessor (it must be B), and
+  // it has a single successor. In fact, the block has to end either with
+  // an unconditional branch (which can be predicated), or with a fall-
+  // through.
+  bool TOk = (TNP == 1) && (TNS == 1);
+  bool FOk = (FNP == 1) && (FNS == 1);
+
+  // If neither is predicable, there is nothing interesting.
+  if (!TOk && !FOk)
+    return false;
+
+  MachineBasicBlock *TSB = (TNS > 0) ? *TB->succ_begin() : nullptr;
+  MachineBasicBlock *FSB = (FNS > 0) ? *FB->succ_begin() : nullptr;
+  MachineBasicBlock *JB = nullptr;
+
+  if (TOk) {
+    if (FOk) {
+      // Diamond: "if (P) then TB; else FB;".
+      if (TSB == FSB)
+        JB = TSB;
+    } else {
+      // TOk && !FOk: only TB may be predicated. The only shape handled
+      // here is the triangle in which TB flows into FB. Anything else
+      // would leave a non-predicable FB in the pattern, where it could
+      // later be predicated and deleted despite having other predecessors
+      // or successors, so reject it.
+      if (TSB != FB)
+        return false;
+      JB = FB;
+      FB = nullptr;
+    }
+  } else {
+    // !TOk && FOk (at least one must be true by now). Mirror image of the
+    // case above: accept only the triangle in which FB flows into TB.
+    if (FSB != TB)
+      return false;
+    JB = TB;
+    TB = nullptr;
+  }
+  // Don't try to predicate loop preheaders.
+  if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) {
+    DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB)
+                 << " is a loop preheader. Skipping.\n");
+    return false;
+  }
+
+  FP = FlowPattern(B, PredR, TB, FB, JB);
+  DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n");
+  return true;
+}
+
+// KLUDGE: HexagonInstrInfo::AnalyzeBranch won't work on a block that
+// contains EH_LABEL, so callers use this to detect such blocks.
+// Returns true if any instruction in B is an EH label.
+bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const {
+  MachineBasicBlock::const_iterator It = B->begin(), End = B->end();
+  for (; It != End; ++It) {
+    if (It->isEHLabel())
+      return true;
+  }
+  return false;
+}
+
+// KLUDGE: HexagonInstrInfo::AnalyzeBranch may be unable to recognize
+// that a block can never fall-through.
+// Returns true if any terminator of B is a barrier.
+bool HexagonEarlyIfConversion::hasUncondBranch(const MachineBasicBlock *B)
+      const {
+  for (MachineBasicBlock::const_iterator Term = B->getFirstTerminator(),
+       End = B->end(); Term != End; ++Term) {
+    if (Term->isBarrier())
+      return true;
+  }
+  return false;
+}
+
+// Check whether every instruction of B can be moved into the split block,
+// either predicated or speculated. A null block is trivially valid.
+bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
+      const {
+  if (!B)
+    return true;
+  if (B->isEHPad() || B->hasAddressTaken() || B->succ_size() == 0)
+    return false;
+
+  for (const MachineInstr &Inst : *B) {
+    if (Inst.isDebugValue())
+      continue;
+    if (Inst.isConditionalBranch())
+      return false;
+    // Each instruction must be a predicable store, an unconditional jump,
+    // or something that is safe to execute speculatively.
+    const bool Convertible = isPredicableStore(&Inst) ||
+                             Inst.getOpcode() == Hexagon::J2_jump ||
+                             isSafeToSpeculate(&Inst);
+    if (!Convertible)
+      return false;
+
+    // Look for predicate registers defined by this instruction. It's ok
+    // to speculate such an instruction, but the predicate register cannot
+    // be used outside of this block (or else it won't be possible to
+    // update the use of it after predication). PHI uses will be updated
+    // to use a result of a MUX, and a MUX cannot be created for predicate
+    // registers.
+    for (const MachineOperand &Op : Inst.operands()) {
+      const bool IsVRegDef = Op.isReg() && Op.isDef() &&
+          TargetRegisterInfo::isVirtualRegister(Op.getReg());
+      if (!IsVRegDef)
+        continue;
+      unsigned DefR = Op.getReg();
+      if (MRI->getRegClass(DefR) != &Hexagon::PredRegsRegClass)
+        continue;
+      for (auto Use = MRI->use_begin(DefR), UseEnd = MRI->use_end();
+           Use != UseEnd; ++Use) {
+        if (Use->getParent()->isPHI())
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Return true if MI uses a virtual register whose definition is an
+// IMPLICIT_DEF, i.e. an "undefined" value.
+bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const {
+  for (const MachineOperand &Op : MI->operands()) {
+    const bool IsVRegUse = Op.isReg() && Op.isUse() &&
+        TargetRegisterInfo::isVirtualRegister(Op.getReg());
+    if (!IsVRegUse)
+      continue;
+    // "Undefined" virtual registers are actually defined via IMPLICIT_DEF.
+    const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
+    assert(Def && "Expecting a reaching def in MRI");
+    if (Def->isImplicitDef())
+      return true;
+  }
+  return false;
+}
+
+// Legality check for a matched pattern: the split block must not contain
+// EH labels, the side blocks must be convertible, and the PHIs in the join
+// block must be expressible as muxes.
+bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const {
+  if (hasEHLabel(FP.SplitB))  // KLUDGE: see function definition
+    return false;
+  if (FP.TrueB && !isValidCandidate(FP.TrueB))
+    return false;
+  if (FP.FalseB && !isValidCandidate(FP.FalseB))
+    return false;
+  if (!FP.JoinB)
+    return true;
+
+  // Check the PHIs in the join block. If any of them use a register
+  // that is defined as IMPLICIT_DEF, do not convert this. This can
+  // legitimately happen if one side of the split never executes, but
+  // the compiler is unable to prove it. That side may then seem to
+  // provide an "undef" value to the join block, however it will never
+  // execute at run-time. If we convert this case, the "undef" will
+  // be used in a MUX instruction, and that may seem like actually
+  // using an undefined value to other optimizations. This could lead
+  // to trouble further down the optimization stream, cause assertions
+  // to fail, etc.
+  // PHIs that define a predicate register are rejected as well.
+  for (const MachineInstr &Phi : *FP.JoinB) {
+    if (!Phi.isPHI())
+      break;
+    if (usesUndefVReg(&Phi))
+      return false;
+    unsigned PhiDefR = Phi.getOperand(0).getReg();
+    if (MRI->getRegClass(PhiDefR) == &Hexagon::PredRegsRegClass)
+      return false;
+  }
+  return true;
+}
+
+// Estimate how many PHIs in B will have to become mux instructions. B must
+// have at most two predecessors; with fewer than two the cost is zero.
+unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const {
+  assert(B->pred_size() <= 2);
+  if (B->pred_size() < 2)
+    return 0;
+
+  unsigned MuxCount = 0;
+  const MachineBasicBlock::const_iterator PhiEnd = B->getFirstNonPHI();
+  for (MachineBasicBlock::const_iterator Phi = B->begin(); Phi != PhiEnd;
+       ++Phi) {
+    // Operands 1 and 3 are the two incoming registers.
+    const MachineOperand &In1 = Phi->getOperand(1);
+    const MachineOperand &In3 = Phi->getOperand(3);
+    assert(In1.isReg() && In3.isReg());
+
+    // Must have a MUX if the phi uses a subregister.
+    bool NeedsMux = In1.getSubReg() != 0 || In3.getSubReg() != 0;
+    if (!NeedsMux) {
+      // Otherwise a mux is counted unless both defining instructions are
+      // predicable.
+      MachineInstr *Def1 = MRI->getVRegDef(In1.getReg());
+      MachineInstr *Def3 = MRI->getVRegDef(In3.getReg());
+      NeedsMux = !HII->isPredicable(*Def1) || !HII->isPredicable(*Def3);
+    }
+    if (NeedsMux)
+      ++MuxCount;
+  }
+  return MuxCount;
+}
+
+// Count the operands in B that define a virtual register of the predicate
+// register class.
+unsigned HexagonEarlyIfConversion::countPredicateDefs(
+      const MachineBasicBlock *B) const {
+  unsigned Count = 0;
+  for (const MachineInstr &Inst : *B) {
+    for (const MachineOperand &Op : Inst.operands()) {
+      const bool IsVRegDef = Op.isReg() && Op.isDef() &&
+          TargetRegisterInfo::isVirtualRegister(Op.getReg());
+      if (IsVRegDef &&
+          MRI->getRegClass(Op.getReg()) == &Hexagon::PredRegsRegClass)
+        ++Count;
+    }
+  }
+  return Count;
+}
+
+// Heuristic profitability check for a legal pattern. Rejects the conversion
+// when the branch is heavily one-sided (only if branch probabilities are
+// enabled), when a diamond does not rejoin in a block with exactly two
+// predecessors, when too many instructions would be predicated/speculated,
+// or when too many predicate registers are defined around the split.
+bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
+  if (FP.TrueB && FP.FalseB) {
+
+    // Do not if-convert if the branch is one sided.
+    if (MBPI) {
+      BranchProbability Prob(9, 10);
+      if (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob)
+        return false;
+      if (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob)
+        return false;
+    }
+
+    // If both sides are predicable, convert them if they join, and the
+    // join block has no other predecessors.
+    MachineBasicBlock *TSB = *FP.TrueB->succ_begin();
+    MachineBasicBlock *FSB = *FP.FalseB->succ_begin();
+    if (TSB != FSB)
+      return false;
+    if (TSB->pred_size() != 2)
+      return false;
+  }
+
+  // Calculate the total size of the predicated blocks.
+  // Assume instruction counts without branches to be the approximation of
+  // the code size. If the predicated blocks are smaller than a packet size,
+  // approximate the spare room in the packet that could be filled with the
+  // predicated/speculated instructions.
+  unsigned TS = 0, FS = 0, Spare = 0;
+  if (FP.TrueB) {
+    TS = std::distance(FP.TrueB->begin(), FP.TrueB->getFirstTerminator());
+    if (TS < HEXAGON_PACKET_SIZE)
+      Spare += HEXAGON_PACKET_SIZE-TS;
+  }
+  if (FP.FalseB) {
+    FS = std::distance(FP.FalseB->begin(), FP.FalseB->getFirstTerminator());
+    // The spare room of the false block depends on its own size (FS).
+    // Using TS here would be wrong and could wrap around as unsigned when
+    // TS exceeds the packet size.
+    if (FS < HEXAGON_PACKET_SIZE)
+      Spare += HEXAGON_PACKET_SIZE-FS;
+  }
+  unsigned TotalIn = TS+FS;
+  DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
+               << TotalIn << ", spare room: " << Spare << "\n");
+  if (TotalIn >= SizeLimit+Spare)
+    return false;
+
+  // Count the number of PHI nodes that will need to be updated (converted
+  // to MUX). Those can be later converted to predicated instructions, so
+  // they aren't always adding extra cost.
+  // KLUDGE: Also, count the number of predicate register definitions in
+  // each block. The scheduler may increase the pressure of these and cause
+  // expensive spills (e.g. bitmnp01).
+  unsigned TotalPh = 0;
+  unsigned PredDefs = countPredicateDefs(FP.SplitB);
+  if (FP.JoinB) {
+    TotalPh = computePhiCost(FP.JoinB);
+    PredDefs += countPredicateDefs(FP.JoinB);
+  } else {
+    if (FP.TrueB && FP.TrueB->succ_size() > 0) {
+      MachineBasicBlock *SB = *FP.TrueB->succ_begin();
+      TotalPh += computePhiCost(SB);
+      PredDefs += countPredicateDefs(SB);
+    }
+    if (FP.FalseB && FP.FalseB->succ_size() > 0) {
+      MachineBasicBlock *SB = *FP.FalseB->succ_begin();
+      TotalPh += computePhiCost(SB);
+      PredDefs += countPredicateDefs(SB);
+    }
+  }
+  DEBUG(dbgs() << "Total number of extra muxes from converted phis: "
+               << TotalPh << "\n");
+  if (TotalIn+TotalPh >= SizeLimit+Spare)
+    return false;
+
+  DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs << "\n");
+  if (PredDefs > 4)
+    return false;
+
+  return true;
+}
+
+// Post-order walk over the dominator subtree rooted at B: all dominated
+// blocks are visited first, then B itself is matched, checked and (if
+// valid and profitable) converted. Returns true if anything changed.
+bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
+      MachineLoop *L) {
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  bool Changed = false;
+
+  // We will change CFG/DT during this traversal, so take precautions to
+  // avoid problems related to invalidated iterators. In fact, processing
+  // a child C of B cannot cause another child to be removed, but it can
+  // cause a new child to be added (which was a child of C before C itself
+  // was removed). This new child, however, would have been processed
+  // prior to processing B, so there is no need to process it again.
+  // Simply keep a list of children of B, and traverse that list.
+  MachineDomTreeNode *Node = MDT->getNode(B);
+  SmallVector<MachineDomTreeNode*,4> Children(GTN::child_begin(Node),
+                                              GTN::child_end(Node));
+  for (MachineDomTreeNode *Child : Children) {
+    MachineBasicBlock *ChildB = Child->getBlock();
+    if (Deleted.count(ChildB))
+      continue;
+    Changed |= visitBlock(ChildB, L);
+  }
+
+  // When walking down the dominator tree, we want to traverse through
+  // blocks from nested (other) loops, because they can dominate blocks
+  // that are in L. Skip the non-L blocks only after the tree traversal.
+  if (MLI->getLoopFor(B) != L)
+    return Changed;
+
+  FlowPattern FP;
+  if (!matchFlowPattern(B, L, FP))
+    return Changed;
+
+  if (!isValid(FP)) {
+    DEBUG(dbgs() << "Conversion is not valid\n");
+    return Changed;
+  }
+  if (!isProfitable(FP)) {
+    DEBUG(dbgs() << "Conversion is not profitable\n");
+    return Changed;
+  }
+
+  convert(FP);
+  simplifyFlowGraph(FP);
+  return true;
+}
+
+// Process loop L, innermost loops first. A null L stands for the code that
+// is not in any loop; in that case the walk starts at the function entry.
+bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) {
+  MachineBasicBlock *HB = L ? L->getHeader() : nullptr;
+  DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB)
+           : dbgs() << "Visiting function") << "\n");
+
+  bool Changed = false;
+  // Recurse into the sub-loops before handling the blocks of L itself.
+  if (L) {
+    for (MachineLoop *SubLoop : *L)
+      Changed |= visitLoop(SubLoop);
+  }
+
+  MachineBasicBlock *StartB =
+      L ? HB : GraphTraits<MachineFunction*>::getEntryNode(MFN);
+  Changed |= visitBlock(StartB, L);
+  return Changed;
+}
+
+// Return true if MI is a store that this pass is able to predicate.
+bool HexagonEarlyIfConversion::isPredicableStore(const MachineInstr *MI)
+      const {
+  // HexagonInstrInfo::isPredicable will consider these stores to be
+  // non-predicable if the offset would become constant-extended after
+  // predication, so accept them explicitly.
+  switch (MI->getOpcode()) {
+    case Hexagon::S2_storerb_io:
+    case Hexagon::S2_storerbnew_io:
+    case Hexagon::S2_storerh_io:
+    case Hexagon::S2_storerhnew_io:
+    case Hexagon::S2_storeri_io:
+    case Hexagon::S2_storerinew_io:
+    case Hexagon::S2_storerd_io:
+    case Hexagon::S4_storeirb_io:
+    case Hexagon::S4_storeirh_io:
+    case Hexagon::S4_storeiri_io:
+      return true;
+    default:
+      break;
+  }
+
+  // Any other store qualifies only if the target reports it as predicable.
+  // TargetInstrInfo::isPredicable takes a non-const reference.
+  if (!MI->mayStore())
+    return false;
+  return HII->isPredicable(const_cast<MachineInstr&>(*MI));
+}
+
+// An instruction may be executed speculatively if it does not access
+// memory, is not a call, barrier or branch, and has no unmodeled side
+// effects.
+bool HexagonEarlyIfConversion::isSafeToSpeculate(const MachineInstr *MI)
+      const {
+  const bool TouchesMemory = MI->mayLoad() || MI->mayStore();
+  const bool AltersControlFlow =
+      MI->isCall() || MI->isBarrier() || MI->isBranch();
+  return !TouchesMemory && !AltersControlFlow &&
+         !MI->hasUnmodeledSideEffects();
+}
+
+// Map store opcode Opc to its predicated form for the given sense of the
+// predicate. Note the inversion: IfTrue is negated before being passed on.
+// NOTE(review): the second argument of HexagonInstrInfo::getCondOpcode is
+// presumably an "invert predicate" flag -- confirm against its declaration.
+unsigned HexagonEarlyIfConversion::getCondStoreOpcode(unsigned Opc,
+ bool IfTrue) const {
+ return HII->getCondOpcode(Opc, !IfTrue);
+}
+
+// Create a predicated copy of MI in block ToB before position At, guarded
+// by PredR (sense given by IfTrue), and erase the original MI. Only
+// predicable stores and J2_jump are supported; anything else aborts.
+void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB,
+ MachineBasicBlock::iterator At, MachineInstr *MI,
+ unsigned PredR, bool IfTrue) {
+ // Borrow a debug location from the insertion point, or from the last
+ // instruction of ToB when inserting at the end of a non-empty block.
+ DebugLoc DL;
+ if (At != ToB->end())
+ DL = At->getDebugLoc();
+ else if (!ToB->empty())
+ DL = ToB->back().getDebugLoc();
+
+ unsigned Opc = MI->getOpcode();
+
+ if (isPredicableStore(MI)) {
+ unsigned COpc = getCondStoreOpcode(Opc, IfTrue);
+ assert(COpc);
+ MachineInstrBuilder MIB = BuildMI(*ToB, At, DL, HII->get(COpc));
+ MachineInstr::mop_iterator MOI = MI->operands_begin();
+ // For post-increment stores the first operand is copied ahead of the
+ // predicate register.
+ if (HII->isPostIncrement(*MI)) {
+ MIB.addOperand(*MOI);
+ ++MOI;
+ }
+ // The predicate comes next, followed by the remaining original operands.
+ MIB.addReg(PredR);
+ for (const MachineOperand &MO : make_range(MOI, MI->operands_end()))
+ MIB.addOperand(MO);
+
+ // Set memory references.
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ MI->eraseFromParent();
+ return;
+ }
+
+ // An unconditional jump becomes a conditional jump on PredR to the same
+ // target.
+ if (Opc == Hexagon::J2_jump) {
+ MachineBasicBlock *TB = MI->getOperand(0).getMBB();
+ const MCInstrDesc &D = HII->get(IfTrue ? Hexagon::J2_jumpt
+ : Hexagon::J2_jumpf);
+ BuildMI(*ToB, At, DL, D)
+ .addReg(PredR)
+ .addMBB(TB);
+ MI->eraseFromParent();
+ return;
+ }
+
+ // Print the offending instruction unconditionally as we are about to
+ // abort.
+ dbgs() << *MI;
+ llvm_unreachable("Unexpected instruction");
+}
+
+// Move every non-terminator instruction of FromB into ToB in front of At.
+// Instructions that are safe to speculate are spliced over unchanged; all
+// others are replaced by predicated copies guarded by PredR/IfTrue.
+// Branches are left alone, they will be handled later. At this point FromB
+// should have at most one branch, and it should be unconditional.
+void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB,
+      MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
+      unsigned PredR, bool IfTrue) {
+  DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n");
+  const MachineBasicBlock::iterator Stop = FromB->getFirstTerminator();
+
+  MachineBasicBlock::iterator Cur = FromB->begin();
+  while (Cur != Stop) {
+    assert(!Cur->isPHI());
+    // Remember the successor first: the current instruction leaves FromB
+    // in either branch below.
+    MachineBasicBlock::iterator Following = std::next(Cur);
+    if (isSafeToSpeculate(&*Cur))
+      ToB->splice(At, FromB, Cur);
+    else
+      predicateInstr(ToB, At, &*Cur, PredR, IfTrue);
+    Cur = Following;
+  }
+}
+
+// Rewrite the PHIs of WhereB after the blocks of FP have been folded into
+// FP.SplitB: the incoming values from SplitB/TrueB/FalseB are removed from
+// each PHI, combined by a mux-like instruction placed in SplitB, and the
+// mux result is added back as the single incoming value from SplitB.
+void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
+ const FlowPattern &FP) {
+ // Visit all PHI nodes in the WhereB block and generate MUX instructions
+ // in the split block. Update the PHI nodes with the values of the MUX.
+ auto NonPHI = WhereB->getFirstNonPHI();
+ for (auto I = WhereB->begin(); I != NonPHI; ++I) {
+ MachineInstr *PN = &*I;
+ // Registers and subregisters corresponding to TrueB, FalseB and SplitB.
+ unsigned TR = 0, TSR = 0, FR = 0, FSR = 0, SR = 0, SSR = 0;
+ // PHI operands after the def come in (register, block) pairs. Walk them
+ // from the back so that removing a pair does not disturb the indices
+ // still to be visited.
+ for (int i = PN->getNumOperands()-2; i > 0; i -= 2) {
+ const MachineOperand &RO = PN->getOperand(i), &BO = PN->getOperand(i+1);
+ if (BO.getMBB() == FP.SplitB)
+ SR = RO.getReg(), SSR = RO.getSubReg();
+ else if (BO.getMBB() == FP.TrueB)
+ TR = RO.getReg(), TSR = RO.getSubReg();
+ else if (BO.getMBB() == FP.FalseB)
+ FR = RO.getReg(), FSR = RO.getSubReg();
+ else
+ continue;
+ PN->RemoveOperand(i+1);
+ PN->RemoveOperand(i);
+ }
+ // If one side had no entry of its own, its value is the one that used to
+ // arrive directly from the split block.
+ if (TR == 0)
+ TR = SR, TSR = SSR;
+ else if (FR == 0)
+ FR = SR, FSR = SSR;
+ assert(TR && FR);
+
+ using namespace Hexagon;
+
+ // Pick the select opcode from the register class of the PHI result.
+ unsigned DR = PN->getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DR);
+ unsigned Opc = 0;
+ if (RC == &IntRegsRegClass)
+ Opc = C2_mux;
+ else if (RC == &DoubleRegsRegClass)
+ Opc = PS_pselect;
+ else if (RC == &VectorRegsRegClass)
+ Opc = PS_vselect;
+ else if (RC == &VecDblRegsRegClass)
+ Opc = PS_wselect;
+ else if (RC == &VectorRegs128BRegClass)
+ Opc = PS_vselect_128B;
+ else if (RC == &VecDblRegs128BRegClass)
+ Opc = PS_wselect_128B;
+ else
+ llvm_unreachable("unexpected register type");
+ const MCInstrDesc &D = HII->get(Opc);
+
+ // The mux goes in front of the terminators of the split block:
+ // MuxR = select PredR, <true value>, <false value>
+ MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator();
+ DebugLoc DL;
+ if (MuxAt != FP.SplitB->end())
+ DL = MuxAt->getDebugLoc();
+ unsigned MuxR = MRI->createVirtualRegister(RC);
+ BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR)
+ .addReg(FP.PredR)
+ .addReg(TR, 0, TSR)
+ .addReg(FR, 0, FSR);
+
+ // Re-attach the combined value as coming from the split block.
+ PN->addOperand(MachineOperand::CreateReg(MuxR, false));
+ PN->addOperand(MachineOperand::CreateMBB(FP.SplitB));
+ }
+}
+
+// Perform the if-conversion described by FP: move the contents of the
+// true/false blocks into the split block (predicated or speculated),
+// rebuild the terminators and successor list of the split block, and fix
+// up the PHI nodes in the blocks that follow.
+void HexagonEarlyIfConversion::convert(const FlowPattern &FP) {
+ MachineBasicBlock *TSB = nullptr, *FSB = nullptr;
+ MachineBasicBlock::iterator OldTI = FP.SplitB->getFirstTerminator();
+ assert(OldTI != FP.SplitB->end());
+ DebugLoc DL = OldTI->getDebugLoc();
+
+ // Record the successor of each side block, then move its non-terminator
+ // instructions in front of the split block's terminators.
+ if (FP.TrueB) {
+ TSB = *FP.TrueB->succ_begin();
+ predicateBlockNB(FP.SplitB, OldTI, FP.TrueB, FP.PredR, true);
+ }
+ if (FP.FalseB) {
+ FSB = *FP.FalseB->succ_begin();
+ MachineBasicBlock::iterator At = FP.SplitB->getFirstTerminator();
+ predicateBlockNB(FP.SplitB, At, FP.FalseB, FP.PredR, false);
+ }
+
+ // Regenerate new terminators in the split block and update the successors.
+ // First, remember any information that may be needed later and remove the
+ // existing terminators/successors from the split block.
+ MachineBasicBlock *SSB = nullptr;
+ FP.SplitB->erase(OldTI, FP.SplitB->end());
+ while (FP.SplitB->succ_size() > 0) {
+ MachineBasicBlock *T = *FP.SplitB->succ_begin();
+ // It's possible that the split block had a successor that is not a pre-
+ // dicated block. This could only happen if there was only one block to
+ // be predicated. Example:
+ // split_b:
+ // if (p) jump true_b
+ // jump unrelated2_b
+ // unrelated1_b:
+ // ...
+ // unrelated2_b: ; can have other predecessors, so it's not "false_b"
+ // jump other_b
+ // true_b: ; only reachable from split_b, can be predicated
+ // ...
+ //
+ // Find this successor (SSB) if it exists.
+ if (T != FP.TrueB && T != FP.FalseB) {
+ assert(!SSB);
+ SSB = T;
+ }
+ FP.SplitB->removeSuccessor(FP.SplitB->succ_begin());
+ }
+
+ // Insert new branches and update the successors of the split block. This
+ // may create unconditional branches to the layout successor, etc., but
+ // that will be cleaned up later. For now, make sure that correct code is
+ // generated.
+ if (FP.JoinB) {
+ // With a join block, control always continues there.
+ assert(!SSB || SSB == FP.JoinB);
+ BuildMI(*FP.SplitB, FP.SplitB->end(), DL, HII->get(Hexagon::J2_jump))
+ .addMBB(FP.JoinB);
+ FP.SplitB->addSuccessor(FP.JoinB);
+ } else {
+ bool HasBranch = false;
+ if (TSB) {
+ BuildMI(*FP.SplitB, FP.SplitB->end(), DL, HII->get(Hexagon::J2_jumpt))
+ .addReg(FP.PredR)
+ .addMBB(TSB);
+ FP.SplitB->addSuccessor(TSB);
+ HasBranch = true;
+ }
+ if (FSB) {
+ // If a conditional branch to TSB was already emitted, the branch to FSB
+ // is unconditional; otherwise it is a jump-if-false on PredR.
+ const MCInstrDesc &D = HasBranch ? HII->get(Hexagon::J2_jump)
+ : HII->get(Hexagon::J2_jumpf);
+ MachineInstrBuilder MIB = BuildMI(*FP.SplitB, FP.SplitB->end(), DL, D);
+ if (!HasBranch)
+ MIB.addReg(FP.PredR);
+ MIB.addMBB(FSB);
+ FP.SplitB->addSuccessor(FSB);
+ }
+ if (SSB) {
+ // This cannot happen if both TSB and FSB are set. [TF]SB are the
+ // successor blocks of the TrueB and FalseB (or null if the TrueB
+ // or FalseB block is null). SSB is the potential successor block
+ // of the SplitB that is neither TrueB nor FalseB.
+ BuildMI(*FP.SplitB, FP.SplitB->end(), DL, HII->get(Hexagon::J2_jump))
+ .addMBB(SSB);
+ FP.SplitB->addSuccessor(SSB);
+ }
+ }
+
+ // What is left to do is to update the PHI nodes that could have entries
+ // referring to predicated blocks.
+ if (FP.JoinB) {
+ updatePhiNodes(FP.JoinB, FP);
+ } else {
+ if (TSB)
+ updatePhiNodes(TSB, FP);
+ if (FSB)
+ updatePhiNodes(FSB, FP);
+ // Nothing to update in SSB, since SSB's predecessors haven't changed.
+ }
+}
+
+// Detach block B from the CFG and the dominator tree, record it in the
+// Deleted set, and erase it from the function.
+void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
+  DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n");
+
+  // Transfer the immediate dominator information from B to its descendants.
+  MachineDomTreeNode *N = MDT->getNode(B);
+  MachineDomTreeNode *IDN = N->getIDom();
+  if (IDN) {
+    MachineBasicBlock *IDB = IDN->getBlock();
+    typedef GraphTraits<MachineDomTreeNode*> GTN;
+    typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType;
+    // Work on a copy of the child list, since re-parenting a child
+    // modifies the list owned by N.
+    DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
+    for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
+      MachineBasicBlock *SB = (*I)->getBlock();
+      MDT->changeImmediateDominator(SB, IDB);
+    }
+  }
+
+  while (B->succ_size() > 0)
+    B->removeSuccessor(B->succ_begin());
+
+  // Removing B from a predecessor's successor list also erases that
+  // predecessor from B's own predecessor list. Iterating over the list
+  // while this happens would use invalidated iterators, so keep taking
+  // the first remaining predecessor until none is left.
+  while (B->pred_size() > 0)
+    (*B->pred_begin())->removeSuccessor(B, true);
+
+  Deleted.insert(B);
+  MDT->eraseNode(B);
+  MFN->erase(B->getIterator());
+}
+
+// Remove all PHI nodes from B. Each PHI must have a single incoming value
+// (three operands: def, register, block); all uses of its result are
+// redirected to that value and the PHI is erased.
+void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
+ DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n");
+ MachineBasicBlock::iterator I, NextI, NonPHI = B->getFirstNonPHI();
+ for (I = B->begin(); I != NonPHI; I = NextI) {
+ // Fetch the next position before the current PHI is erased.
+ NextI = std::next(I);
+ MachineInstr *PN = &*I;
+ assert(PN->getNumOperands() == 3 && "Invalid phi node");
+ MachineOperand &UO = PN->getOperand(1);
+ unsigned UseR = UO.getReg(), UseSR = UO.getSubReg();
+ unsigned DefR = PN->getOperand(0).getReg();
+ unsigned NewR = UseR;
+ if (UseSR) {
+ // MRI.replaceVregUsesWith does not allow to update the subregister,
+ // so instead of doing the use-iteration here, create a copy into a
+ // "non-subregistered" register.
+ const DebugLoc &DL = PN->getDebugLoc();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefR);
+ NewR = MRI->createVirtualRegister(RC);
+ // The COPY is placed in front of the first non-PHI, and NonPHI is moved
+ // back to the COPY so that it still marks the end of the PHI range.
+ NonPHI = BuildMI(*B, NonPHI, DL, HII->get(TargetOpcode::COPY), NewR)
+ .addReg(UseR, 0, UseSR);
+ }
+ MRI->replaceRegWith(DefR, NewR);
+ B->erase(I);
+ }
+}
+
+// In the PHI nodes of every successor of OldB, replace each block operand
+// that refers to OldB with NewB.
+void HexagonEarlyIfConversion::replacePhiEdges(MachineBasicBlock *OldB,
+      MachineBasicBlock *NewB) {
+  MachineBasicBlock::succ_iterator SI = OldB->succ_begin();
+  const MachineBasicBlock::succ_iterator SE = OldB->succ_end();
+  for (; SI != SE; ++SI) {
+    MachineBasicBlock *Succ = *SI;
+    const MachineBasicBlock::iterator PhiEnd = Succ->getFirstNonPHI();
+    for (MachineBasicBlock::iterator Phi = Succ->begin(); Phi != PhiEnd;
+         ++Phi) {
+      for (MachineOperand &Op : Phi->operands()) {
+        if (!Op.isMBB() || Op.getMBB() != OldB)
+          continue;
+        Op.setMBB(NewB);
+      }
+    }
+  }
+}
+
+// Append the contents of SuccB to PredB, transfer SuccB's successors to
+// PredB, and remove SuccB.
+void HexagonEarlyIfConversion::mergeBlocks(MachineBasicBlock *PredB,
+ MachineBasicBlock *SuccB) {
+ DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and "
+ << PrintMB(SuccB) << "\n");
+ // Query the terminators of SuccB now, before its instructions are moved.
+ bool TermOk = hasUncondBranch(SuccB);
+ eliminatePhis(SuccB);
+ HII->removeBranch(*PredB);
+ PredB->removeSuccessor(SuccB);
+ PredB->splice(PredB->end(), SuccB, SuccB->begin(), SuccB->end());
+ MachineBasicBlock::succ_iterator I, E = SuccB->succ_end();
+ for (I = SuccB->succ_begin(); I != E; ++I)
+ PredB->addSuccessor(*I);
+ PredB->normalizeSuccProbs();
+ // PHIs in the successors must now name PredB as the incoming block.
+ replacePhiEdges(SuccB, PredB);
+ removeBlock(SuccB);
+ // Terminators only need fixing up if the merged code did not already end
+ // in a barrier.
+ if (!TermOk)
+ PredB->updateTerminator();
+}
+
+// Clean up after convert(): delete the side blocks of FP, and merge the
+// split block with its successor when each is the other's only neighbor.
+void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) {
+ if (FP.TrueB)
+ removeBlock(FP.TrueB);
+ if (FP.FalseB)
+ removeBlock(FP.FalseB);
+
+ FP.SplitB->updateTerminator();
+ if (FP.SplitB->succ_size() != 1)
+ return;
+
+ MachineBasicBlock *SB = *FP.SplitB->succ_begin();
+ if (SB->pred_size() != 1)
+ return;
+
+ // By now, the split block has only one successor (SB), and SB has only
+ // one predecessor. We can try to merge them. We will need to update ter-
+ // minators in FP.Split+SB, and that requires working AnalyzeBranch, which
+ // fails on Hexagon for blocks that have EH_LABELs. However, if SB ends
+ // with an unconditional branch, we won't need to touch the terminators.
+ if (!hasEHLabel(SB) || hasUncondBranch(SB))
+ mergeBlocks(FP.SplitB, SB);
+}
+
+// Pass entry point: cache the per-function analyses and target hooks, then
+// process every top-level loop followed by the code outside of any loop.
+// Returns true if the function was modified.
+bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const auto &Subtarget = MF.getSubtarget<HexagonSubtarget>();
+  HII = Subtarget.getInstrInfo();
+  TRI = Subtarget.getRegisterInfo();
+  MFN = &MF;
+  MRI = &MF.getRegInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  MLI = &getAnalysis<MachineLoopInfo>();
+  // Branch probabilities are consulted only when explicitly enabled.
+  MBPI = nullptr;
+  if (EnableHexagonBP)
+    MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+  Deleted.clear();
+
+  bool Changed = false;
+  for (MachineLoop *TopLoop : *MLI)
+    Changed |= visitLoop(TopLoop);
+  // A null loop stands for the blocks that are not part of any loop.
+  Changed |= visitLoop(nullptr);
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+// Factory for the early if-conversion pass: returns a pointer to a newly
+// allocated HexagonEarlyIfConversion object.
+FunctionPass *llvm::createHexagonEarlyIfConversion() {
+ return new HexagonEarlyIfConversion();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
new file mode 100644
index 000000000000..8f070d842b8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -0,0 +1,1283 @@
+//===--- HexagonExpandCondsets.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Replace mux instructions with the corresponding legal instructions.
+// It is meant to work post-SSA, but still on virtual registers. It was
+// originally placed between register coalescing and machine instruction
+// scheduler.
+// In this place in the optimization sequence, live interval analysis had
+// been performed, and the live intervals should be preserved. A large part
+// of the code deals with preserving the liveness information.
+//
+// Liveness tracking aside, the main functionality of this pass is divided
+// into two steps. The first step is to replace an instruction
+// vreg0 = C2_mux vreg1, vreg2, vreg3
+// with a pair of conditional transfers
+// vreg0 = A2_tfrt vreg1, vreg2
+// vreg0 = A2_tfrf vreg1, vreg3
+// It is the intention that the execution of this pass could be terminated
+// after this step, and the code generated would be functionally correct.
+//
+// If the uses of the source values vreg2 and vreg3 are kills, and their
+// definitions are predicable, then in the second step, the conditional
+// transfers will be rewritten as predicated instructions. E.g.
+// vreg0 = A2_or vreg1, vreg2
+// vreg3 = A2_tfrt vreg99, vreg0<kill>
+// will be rewritten as
+// vreg3 = A2_port vreg99, vreg1, vreg2
+//
+// This replacement has two variants: "up" and "down". Consider this case:
+// vreg0 = A2_or vreg1, vreg2
+// ... [intervening instructions] ...
+// vreg3 = A2_tfrt vreg99, vreg0<kill>
+// variant "up":
+// vreg3 = A2_port vreg99, vreg1, vreg2
+// ... [intervening instructions, vreg0->vreg3] ...
+// [deleted]
+// variant "down":
+// [deleted]
+// ... [intervening instructions] ...
+// vreg3 = A2_port vreg99, vreg1, vreg2
+//
+// Both, one or none of these variants may be valid, and checks are made
+// to rule out inapplicable variants.
+//
+// As an additional optimization, before either of the two steps above is
+// executed, the pass attempts to coalesce the target register with one of
+// the source registers, e.g. given an instruction
+// vreg3 = C2_mux vreg0, vreg1, vreg2
+// vreg3 will be coalesced with either vreg1 or vreg2. If this succeeds,
+// the instruction would then be (for example)
+// vreg3 = C2_mux vreg0, vreg3, vreg2
+// and, under certain circumstances, this could result in only one predicated
+// instruction:
+// vreg3 = A2_tfrf vreg0, vreg2
+//
+
+// Splitting a definition of a register into two predicated transfers
+// creates a complication in liveness tracking. Live interval computation
+// will see both instructions as actual definitions, and will mark the
+// first one as dead. The definition is not actually dead, and this
+// situation will need to be fixed. For example:
+// vreg1<def,dead> = A2_tfrt ... ; marked as dead
+// vreg1<def> = A2_tfrf ...
+//
+// Since any of the individual predicated transfers may end up getting
+// removed (in case it is an identity copy), some pre-existing def may
+// be marked as dead after live interval recomputation:
+// vreg1<def,dead> = ... ; marked as dead
+// ...
+// vreg1<def> = A2_tfrf ... ; if A2_tfrt is removed
+// This case happens if vreg1 was used as a source in A2_tfrt, which means
+// that it is actually live at the A2_tfrf, and so the now dead definition
+// of vreg1 will need to be updated to non-dead at some point.
+//
+// This issue could be remedied by adding implicit uses to the predicated
+// transfers, but this will create a problem with subsequent predication,
+// since the transfers will no longer be possible to reorder. To avoid
+// that, the initial splitting will not add any implicit uses. These
+// implicit uses will be added later, after predication. The extra price,
+// however, is that finding the locations where the implicit uses need
+// to be added, and updating the live ranges will be more involved.
+
+#define DEBUG_TYPE "expand-condsets"
+
+#include "HexagonInstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <iterator>
+#include <set>
+#include <utility>
+
+using namespace llvm;
+
+static cl::opt<unsigned> OptTfrLimit("expand-condsets-tfr-limit",
+ cl::init(~0U), cl::Hidden, cl::desc("Max number of mux expansions"));
+static cl::opt<unsigned> OptCoaLimit("expand-condsets-coa-limit",
+ cl::init(~0U), cl::Hidden, cl::desc("Max number of segment coalescings"));
+
+namespace llvm {
+
+ void initializeHexagonExpandCondsetsPass(PassRegistry&);
+ FunctionPass *createHexagonExpandCondsets();
+
+} // end namespace llvm
+
+namespace {
+
+ class HexagonExpandCondsets : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ HexagonExpandCondsets() :
+ MachineFunctionPass(ID), HII(nullptr), TRI(nullptr), MRI(nullptr),
+ LIS(nullptr), CoaLimitActive(false),
+ TfrLimitActive(false), CoaCounter(0), TfrCounter(0) {
+ if (OptCoaLimit.getPosition())
+ CoaLimitActive = true, CoaLimit = OptCoaLimit;
+ if (OptTfrLimit.getPosition())
+ TfrLimitActive = true, TfrLimit = OptTfrLimit;
+ initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "Hexagon Expand Condsets"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ const HexagonInstrInfo *HII;
+ const TargetRegisterInfo *TRI;
+ MachineDominatorTree *MDT;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+
+ bool CoaLimitActive, TfrLimitActive;
+ unsigned CoaLimit, TfrLimit, CoaCounter, TfrCounter;
+
+ struct RegisterRef {
+ RegisterRef(const MachineOperand &Op) : Reg(Op.getReg()),
+ Sub(Op.getSubReg()) {}
+ RegisterRef(unsigned R = 0, unsigned S = 0) : Reg(R), Sub(S) {}
+
+ bool operator== (RegisterRef RR) const {
+ return Reg == RR.Reg && Sub == RR.Sub;
+ }
+ bool operator!= (RegisterRef RR) const { return !operator==(RR); }
+ bool operator< (RegisterRef RR) const {
+ return Reg < RR.Reg || (Reg == RR.Reg && Sub < RR.Sub);
+ }
+
+ unsigned Reg, Sub;
+ };
+
+ typedef DenseMap<unsigned,unsigned> ReferenceMap;
+ enum { Sub_Low = 0x1, Sub_High = 0x2, Sub_None = (Sub_Low | Sub_High) };
+ enum { Exec_Then = 0x10, Exec_Else = 0x20 };
+ unsigned getMaskForSub(unsigned Sub);
+ bool isCondset(const MachineInstr &MI);
+ LaneBitmask getLaneMask(unsigned Reg, unsigned Sub);
+
+ void addRefToMap(RegisterRef RR, ReferenceMap &Map, unsigned Exec);
+ bool isRefInMap(RegisterRef, ReferenceMap &Map, unsigned Exec);
+
+ void updateDeadsInRange(unsigned Reg, LaneBitmask LM, LiveRange &Range);
+ void updateKillFlags(unsigned Reg);
+ void updateDeadFlags(unsigned Reg);
+ void recalculateLiveInterval(unsigned Reg);
+ void removeInstr(MachineInstr &MI);
+ void updateLiveness(std::set<unsigned> &RegSet, bool Recalc,
+ bool UpdateKills, bool UpdateDeads);
+
+ unsigned getCondTfrOpcode(const MachineOperand &SO, bool Cond);
+ MachineInstr *genCondTfrFor(MachineOperand &SrcOp,
+ MachineBasicBlock::iterator At, unsigned DstR,
+ unsigned DstSR, const MachineOperand &PredOp, bool PredSense,
+ bool ReadUndef, bool ImpUse);
+ bool split(MachineInstr &MI, std::set<unsigned> &UpdRegs);
+
+ bool isPredicable(MachineInstr *MI);
+ MachineInstr *getReachingDefForPred(RegisterRef RD,
+ MachineBasicBlock::iterator UseIt, unsigned PredR, bool Cond);
+ bool canMoveOver(MachineInstr &MI, ReferenceMap &Defs, ReferenceMap &Uses);
+ bool canMoveMemTo(MachineInstr &MI, MachineInstr &ToI, bool IsDown);
+ void predicateAt(const MachineOperand &DefOp, MachineInstr &MI,
+ MachineBasicBlock::iterator Where,
+ const MachineOperand &PredOp, bool Cond,
+ std::set<unsigned> &UpdRegs);
+ void renameInRange(RegisterRef RO, RegisterRef RN, unsigned PredR,
+ bool Cond, MachineBasicBlock::iterator First,
+ MachineBasicBlock::iterator Last);
+ bool predicate(MachineInstr &TfrI, bool Cond, std::set<unsigned> &UpdRegs);
+ bool predicateInBlock(MachineBasicBlock &B,
+ std::set<unsigned> &UpdRegs);
+
+ bool isIntReg(RegisterRef RR, unsigned &BW);
+ bool isIntraBlocks(LiveInterval &LI);
+ bool coalesceRegisters(RegisterRef R1, RegisterRef R2);
+ bool coalesceSegments(const SmallVectorImpl<MachineInstr*> &Condsets,
+ std::set<unsigned> &UpdRegs);
+ };
+
+} // end anonymous namespace
+
+char HexagonExpandCondsets::ID = 0;
+
+namespace llvm {
+
+ char &HexagonExpandCondsetsID = HexagonExpandCondsets::ID;
+
+} // end namespace llvm
+
+INITIALIZE_PASS_BEGIN(HexagonExpandCondsets, "expand-condsets",
+ "Hexagon Expand Condsets", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(HexagonExpandCondsets, "expand-condsets",
+ "Hexagon Expand Condsets", false, false)
+
+unsigned HexagonExpandCondsets::getMaskForSub(unsigned Sub) {
+ switch (Sub) {
+ case Hexagon::isub_lo:
+ case Hexagon::vsub_lo:
+ return Sub_Low;
+ case Hexagon::isub_hi:
+ case Hexagon::vsub_hi:
+ return Sub_High;
+ case Hexagon::NoSubRegister:
+ return Sub_None;
+ }
+ llvm_unreachable("Invalid subregister");
+}
+
+bool HexagonExpandCondsets::isCondset(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ case Hexagon::C2_mux:
+ case Hexagon::C2_muxii:
+ case Hexagon::C2_muxir:
+ case Hexagon::C2_muxri:
+ case Hexagon::PS_pselect:
+ return true;
+ break;
+ }
+ return false;
+}
+
+LaneBitmask HexagonExpandCondsets::getLaneMask(unsigned Reg, unsigned Sub) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ return Sub != 0 ? TRI->getSubRegIndexLaneMask(Sub)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+}
+
+void HexagonExpandCondsets::addRefToMap(RegisterRef RR, ReferenceMap &Map,
+ unsigned Exec) {
+ unsigned Mask = getMaskForSub(RR.Sub) | Exec;
+ ReferenceMap::iterator F = Map.find(RR.Reg);
+ if (F == Map.end())
+ Map.insert(std::make_pair(RR.Reg, Mask));
+ else
+ F->second |= Mask;
+}
+
+bool HexagonExpandCondsets::isRefInMap(RegisterRef RR, ReferenceMap &Map,
+ unsigned Exec) {
+ ReferenceMap::iterator F = Map.find(RR.Reg);
+ if (F == Map.end())
+ return false;
+ unsigned Mask = getMaskForSub(RR.Sub) | Exec;
+ if (Mask & F->second)
+ return true;
+ return false;
+}
+
+void HexagonExpandCondsets::updateKillFlags(unsigned Reg) {
+ auto KillAt = [this,Reg] (SlotIndex K, LaneBitmask LM) -> void {
+ // Set the <kill> flag on a use of Reg whose lane mask is contained in LM.
+ MachineInstr *MI = LIS->getInstructionFromIndex(K);
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg() || !Op.isUse() || Op.getReg() != Reg)
+ continue;
+ LaneBitmask SLM = getLaneMask(Reg, Op.getSubReg());
+ if ((SLM & LM) == SLM) {
+ // Only set the kill flag on the first encountered use of Reg in this
+ // instruction.
+ Op.setIsKill(true);
+ break;
+ }
+ }
+ };
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+ for (auto I = LI.begin(), E = LI.end(); I != E; ++I) {
+ if (!I->end.isRegister())
+ continue;
+ // Do not mark the end of the segment as <kill>, if the next segment
+ // starts with a predicated instruction.
+ auto NextI = std::next(I);
+ if (NextI != E && NextI->start.isRegister()) {
+ MachineInstr *DefI = LIS->getInstructionFromIndex(NextI->start);
+ if (HII->isPredicated(*DefI))
+ continue;
+ }
+ bool WholeReg = true;
+ if (LI.hasSubRanges()) {
+ auto EndsAtI = [I] (LiveInterval::SubRange &S) -> bool {
+ LiveRange::iterator F = S.find(I->end);
+ return F != S.end() && I->end == F->end;
+ };
+ // Check if all subranges end at I->end. If so, make sure to kill
+ // the whole register.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if (EndsAtI(S))
+ KillAt(I->end, S.LaneMask);
+ else
+ WholeReg = false;
+ }
+ }
+ if (WholeReg)
+ KillAt(I->end, MRI->getMaxLaneMaskForVReg(Reg));
+ }
+}
+
+void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
+ LiveRange &Range) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ if (Range.empty())
+ return;
+
+ auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> bool {
+ if (!Op.isReg() || !Op.isDef())
+ return false;
+ unsigned DR = Op.getReg(), DSR = Op.getSubReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DR) || DR != Reg)
+ return false;
+ LaneBitmask SLM = getLaneMask(DR, DSR);
+ return (SLM & LM).any();
+ };
+
+ // The splitting step will create pairs of predicated definitions without
+ // any implicit uses (since implicit uses would interfere with predication).
+ // This can cause the reaching defs to become dead after live range
+ // recomputation, even though they are not really dead.
+ // We need to identify predicated defs that need implicit uses, and
+ // dead defs that are not really dead, and correct both problems.
+
+ auto Dominate = [this] (SetVector<MachineBasicBlock*> &Defs,
+ MachineBasicBlock *Dest) -> bool {
+ for (MachineBasicBlock *D : Defs)
+ if (D != Dest && MDT->dominates(D, Dest))
+ return true;
+
+ MachineBasicBlock *Entry = &Dest->getParent()->front();
+ SetVector<MachineBasicBlock*> Work(Dest->pred_begin(), Dest->pred_end());
+ for (unsigned i = 0; i < Work.size(); ++i) {
+ MachineBasicBlock *B = Work[i];
+ if (Defs.count(B))
+ continue;
+ if (B == Entry)
+ return false;
+ for (auto *P : B->predecessors())
+ Work.insert(P);
+ }
+ return true;
+ };
+
+ // First, try to extend live range within individual basic blocks. This
+ // will leave us only with dead defs that do not reach any predicated
+ // defs in the same block.
+ SetVector<MachineBasicBlock*> Defs;
+ SmallVector<SlotIndex,4> PredDefs;
+ for (auto &Seg : Range) {
+ if (!Seg.start.isRegister())
+ continue;
+ MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
+ Defs.insert(DefI->getParent());
+ if (HII->isPredicated(*DefI))
+ PredDefs.push_back(Seg.start);
+ }
+
+ SmallVector<SlotIndex,8> Undefs;
+ LiveInterval &LI = LIS->getInterval(Reg);
+ LI.computeSubRangeUndefs(Undefs, LM, *MRI, *LIS->getSlotIndexes());
+
+ for (auto &SI : PredDefs) {
+ MachineBasicBlock *BB = LIS->getMBBFromIndex(SI);
+ auto P = Range.extendInBlock(Undefs, LIS->getMBBStartIdx(BB), SI);
+ if (P.first != nullptr || P.second)
+ SI = SlotIndex();
+ }
+
+ // Calculate reachability for those predicated defs that were not handled
+ // by the in-block extension.
+ SmallVector<SlotIndex,4> ExtTo;
+ for (auto &SI : PredDefs) {
+ if (!SI.isValid())
+ continue;
+ MachineBasicBlock *BB = LIS->getMBBFromIndex(SI);
+ if (BB->pred_empty())
+ continue;
+ // If the defs from this range reach SI via all predecessors, it is live.
+ // It can happen that SI is reached by the defs through some paths, but
+ // not all. In the IR coming into this optimization, SI would not be
+ // considered live, since the defs would then not jointly dominate SI.
+ // That means that SI is an overwriting def, and no implicit use is
+ // needed at this point. Do not add SI to the extension points, since
+ // extendToIndices will abort if there is no joint dominance.
+ // If the abort was avoided by adding extra undefs to Undefs,
+ // extendToIndices could actually indicate that SI is live, contrary
+ // to the original IR.
+ if (Dominate(Defs, BB))
+ ExtTo.push_back(SI);
+ }
+
+ if (!ExtTo.empty())
+ LIS->extendToIndices(Range, ExtTo, Undefs);
+
+ // Remove <dead> flags from all defs that are not dead after live range
+ // extension, and collect all def operands. They will be used to generate
+ // the necessary implicit uses.
+ std::set<RegisterRef> DefRegs;
+ for (auto &Seg : Range) {
+ if (!Seg.start.isRegister())
+ continue;
+ MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
+ for (auto &Op : DefI->operands()) {
+ if (Seg.start.isDead() || !IsRegDef(Op))
+ continue;
+ DefRegs.insert(Op);
+ Op.setIsDead(false);
+ }
+ }
+
+ // Finally, add implicit uses to each predicated def that is reached
+ // by other defs.
+ for (auto &Seg : Range) {
+ if (!Seg.start.isRegister() || !Range.liveAt(Seg.start.getPrevSlot()))
+ continue;
+ MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
+ if (!HII->isPredicated(*DefI))
+ continue;
+ // Construct the set of all necessary implicit uses, based on the def
+ // operands in the instruction.
+ std::set<RegisterRef> ImpUses;
+ for (auto &Op : DefI->operands())
+ if (Op.isReg() && Op.isDef() && DefRegs.count(Op))
+ ImpUses.insert(Op);
+ if (ImpUses.empty())
+ continue;
+ MachineFunction &MF = *DefI->getParent()->getParent();
+ for (RegisterRef R : ImpUses)
+ MachineInstrBuilder(MF, DefI).addReg(R.Reg, RegState::Implicit, R.Sub);
+ }
+}
+
+void HexagonExpandCondsets::updateDeadFlags(unsigned Reg) {
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ updateDeadsInRange(Reg, S.LaneMask, S);
+ LIS->shrinkToUses(S, Reg);
+ }
+ LI.clear();
+ LIS->constructMainRangeFromSubranges(LI);
+ } else {
+ updateDeadsInRange(Reg, MRI->getMaxLaneMaskForVReg(Reg), LI);
+ }
+}
+
+void HexagonExpandCondsets::recalculateLiveInterval(unsigned Reg) {
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+}
+
+void HexagonExpandCondsets::removeInstr(MachineInstr &MI) {
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+}
+
+void HexagonExpandCondsets::updateLiveness(std::set<unsigned> &RegSet,
+ bool Recalc, bool UpdateKills, bool UpdateDeads) {
+ UpdateKills |= UpdateDeads;
+ for (auto R : RegSet) {
+ if (Recalc)
+ recalculateLiveInterval(R);
+ if (UpdateKills)
+ MRI->clearKillFlags(R);
+ if (UpdateDeads)
+ updateDeadFlags(R);
+ // Fixing <dead> flags may extend live ranges, so reset <kill> flags
+ // after that.
+ if (UpdateKills)
+ updateKillFlags(R);
+ LIS->getInterval(R).verify();
+ }
+}
+
+/// Get the opcode for a conditional transfer of the value in SO (source
+/// operand). The sense of the condition (true/false) is given in IfTrue.
+unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO,
+ bool IfTrue) {
+ using namespace Hexagon;
+
+ if (SO.isReg()) {
+ unsigned PhysR;
+ RegisterRef RS = SO;
+ if (TargetRegisterInfo::isVirtualRegister(RS.Reg)) {
+ const TargetRegisterClass *VC = MRI->getRegClass(RS.Reg);
+ assert(VC->begin() != VC->end() && "Empty register class");
+ PhysR = *VC->begin();
+ } else {
+ assert(TargetRegisterInfo::isPhysicalRegister(RS.Reg));
+ PhysR = RS.Reg;
+ }
+ unsigned PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS);
+ switch (RC->getSize()) {
+ case 4:
+ return IfTrue ? A2_tfrt : A2_tfrf;
+ case 8:
+ return IfTrue ? A2_tfrpt : A2_tfrpf;
+ }
+ llvm_unreachable("Invalid register operand");
+ }
+ if (SO.isImm() || SO.isFPImm())
+ return IfTrue ? C2_cmoveit : C2_cmoveif;
+ llvm_unreachable("Unexpected source operand");
+}
+
+/// Generate a conditional transfer, copying the value SrcOp to the
+/// destination register DstR:DstSR, and using the predicate register from
+/// PredOp. PredSense selects if(PredOp) (true) or if(!PredOp) (false).
+/// ReadUndef marks the def <undef>; ImpUse is not used in this function.
+MachineInstr *HexagonExpandCondsets::genCondTfrFor(MachineOperand &SrcOp,
+ MachineBasicBlock::iterator At,
+ unsigned DstR, unsigned DstSR, const MachineOperand &PredOp,
+ bool PredSense, bool ReadUndef, bool ImpUse) {
+ MachineInstr *MI = SrcOp.getParent();
+ MachineBasicBlock &B = *At->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ // Don't avoid identity copies here (i.e. if the source and the destination
+ // are the same registers). It is actually better to generate them here,
+ // since this would cause the copy to potentially be predicated in the next
+ // step. The predication will remove such a copy if it is unable to
+ // predicate it.
+
+ unsigned Opc = getCondTfrOpcode(SrcOp, PredSense);
+ unsigned DstState = RegState::Define | (ReadUndef ? RegState::Undef : 0);
+ unsigned PredState = getRegState(PredOp) & ~RegState::Kill;
+ MachineInstrBuilder MIB;
+
+ if (SrcOp.isReg()) {
+ unsigned SrcState = getRegState(SrcOp);
+ if (RegisterRef(SrcOp) == RegisterRef(DstR, DstSR))
+ SrcState &= ~RegState::Kill;
+ MIB = BuildMI(B, At, DL, HII->get(Opc))
+ .addReg(DstR, DstState, DstSR)
+ .addReg(PredOp.getReg(), PredState, PredOp.getSubReg())
+ .addReg(SrcOp.getReg(), SrcState, SrcOp.getSubReg());
+ } else {
+ MIB = BuildMI(B, At, DL, HII->get(Opc))
+ .addReg(DstR, DstState, DstSR)
+ .addReg(PredOp.getReg(), PredState, PredOp.getSubReg())
+ .addOperand(SrcOp);
+ }
+
+ DEBUG(dbgs() << "created an initial copy: " << *MIB);
+ return &*MIB;
+}
+
+/// Replace a MUX instruction MI with a pair A2_tfrt/A2_tfrf. This function
+/// performs all necessary changes to complete the replacement.
+bool HexagonExpandCondsets::split(MachineInstr &MI,
+ std::set<unsigned> &UpdRegs) {
+ if (TfrLimitActive) {
+ if (TfrCounter >= TfrLimit)
+ return false;
+ TfrCounter++;
+ }
+ DEBUG(dbgs() << "\nsplitting BB#" << MI.getParent()->getNumber() << ": "
+ << MI);
+ MachineOperand &MD = MI.getOperand(0); // Definition
+ MachineOperand &MP = MI.getOperand(1); // Predicate register
+ assert(MD.isDef());
+ unsigned DR = MD.getReg(), DSR = MD.getSubReg();
+ bool ReadUndef = MD.isUndef();
+ MachineBasicBlock::iterator At = MI;
+
+ // If this is a mux of the same register, just replace it with COPY.
+ // Ideally, this would happen earlier, so that register coalescing would
+ // see it.
+ MachineOperand &ST = MI.getOperand(2);
+ MachineOperand &SF = MI.getOperand(3);
+ if (ST.isReg() && SF.isReg()) {
+ RegisterRef RT(ST);
+ if (RT == RegisterRef(SF)) {
+ MI.setDesc(HII->get(TargetOpcode::COPY));
+ unsigned S = getRegState(ST);
+ while (MI.getNumOperands() > 1)
+ MI.RemoveOperand(MI.getNumOperands()-1);
+ MachineFunction &MF = *MI.getParent()->getParent();
+ MachineInstrBuilder(MF, MI).addReg(RT.Reg, S, RT.Sub);
+ return true;
+ }
+ }
+
+ // First, create the two individual conditional transfers, and add each
+ // of them to the live intervals information. Do that first and then remove
+ // the old instruction from live intervals.
+ MachineInstr *TfrT =
+ genCondTfrFor(ST, At, DR, DSR, MP, true, ReadUndef, false);
+ MachineInstr *TfrF =
+ genCondTfrFor(SF, At, DR, DSR, MP, false, ReadUndef, true);
+ LIS->InsertMachineInstrInMaps(*TfrT);
+ LIS->InsertMachineInstrInMaps(*TfrF);
+
+ // Will need to recalculate live intervals for all registers in MI.
+ for (auto &Op : MI.operands())
+ if (Op.isReg())
+ UpdRegs.insert(Op.getReg());
+
+ removeInstr(MI);
+ return true;
+}
+
+bool HexagonExpandCondsets::isPredicable(MachineInstr *MI) {
+ if (HII->isPredicated(*MI) || !HII->isPredicable(*MI))
+ return false;
+ if (MI->hasUnmodeledSideEffects() || MI->mayStore())
+ return false;
+ // Reject instructions with multiple defs (e.g. post-increment loads).
+ bool HasDef = false;
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg() || !Op.isDef())
+ continue;
+ if (HasDef)
+ return false;
+ HasDef = true;
+ }
+ for (auto &Mo : MI->memoperands())
+ if (Mo->isVolatile())
+ return false;
+ return true;
+}
+
+/// Find the reaching definition for a predicated use of RD. The RD is used
+/// under the conditions given by PredR and Cond, and this function will ignore
+/// definitions that set RD under the opposite conditions.
+MachineInstr *HexagonExpandCondsets::getReachingDefForPred(RegisterRef RD,
+ MachineBasicBlock::iterator UseIt, unsigned PredR, bool Cond) {
+ MachineBasicBlock &B = *UseIt->getParent();
+ MachineBasicBlock::iterator I = UseIt, S = B.begin();
+ if (I == S)
+ return nullptr;
+
+ bool PredValid = true;
+ do {
+ --I;
+ MachineInstr *MI = &*I;
+ // Check if this instruction can be ignored, i.e. if it is predicated
+ // on the complementary condition.
+ if (PredValid && HII->isPredicated(*MI)) {
+ if (MI->readsRegister(PredR) && (Cond != HII->isPredicatedTrue(*MI)))
+ continue;
+ }
+
+ // Check the defs. If the PredR is defined, invalidate it. If RD is
+ // defined, return the instruction or nullptr, depending on the subregister.
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg() || !Op.isDef())
+ continue;
+ RegisterRef RR = Op;
+ if (RR.Reg == PredR) {
+ PredValid = false;
+ continue;
+ }
+ if (RR.Reg != RD.Reg)
+ continue;
+ // If the "Reg" part agrees, there is still the subregister to check.
+ // If we are looking for vreg1:loreg, we can skip vreg1:hireg, but
+ // not vreg1 (w/o subregisters).
+ if (RR.Sub == RD.Sub)
+ return MI;
+ if (RR.Sub == 0 || RD.Sub == 0)
+ return nullptr;
+ // We have different subregisters, so we can continue looking.
+ }
+ } while (I != S);
+
+ return nullptr;
+}
+
+/// Check if the instruction MI can be safely moved over a set of instructions
+/// whose side-effects (in terms of register defs and uses) are expressed in
+/// the maps Defs and Uses. These maps reflect the conditional defs and uses
+/// that depend on the same predicate register to allow moving instructions
+/// over instructions predicated on the opposite condition.
+bool HexagonExpandCondsets::canMoveOver(MachineInstr &MI, ReferenceMap &Defs,
+ ReferenceMap &Uses) {
+ // In order to be able to safely move MI over instructions that define
+ // "Defs" and use "Uses", no def operand from MI can be defined or used
+ // and no use operand can be defined.
+ for (auto &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+ RegisterRef RR = Op;
+ // For physical registers we would need to check register aliases, etc.
+ // and we don't want to bother with that. It would be of little value
+ // before the actual register rewriting (from virtual to physical).
+ if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+ return false;
+ // No redefs for any operand.
+ if (isRefInMap(RR, Defs, Exec_Then))
+ return false;
+ // For defs, there cannot be uses.
+ if (Op.isDef() && isRefInMap(RR, Uses, Exec_Then))
+ return false;
+ }
+ return true;
+}
+
+/// Check if the instruction accessing memory (TheI) can be moved to the
+/// location ToI.
+bool HexagonExpandCondsets::canMoveMemTo(MachineInstr &TheI, MachineInstr &ToI,
+ bool IsDown) {
+ bool IsLoad = TheI.mayLoad(), IsStore = TheI.mayStore();
+ if (!IsLoad && !IsStore)
+ return true;
+ if (HII->areMemAccessesTriviallyDisjoint(TheI, ToI))
+ return true;
+ if (TheI.hasUnmodeledSideEffects())
+ return false;
+
+ MachineBasicBlock::iterator StartI = IsDown ? TheI : ToI;
+ MachineBasicBlock::iterator EndI = IsDown ? ToI : TheI;
+ bool Ordered = TheI.hasOrderedMemoryRef();
+
+ // Search for aliased memory reference in (StartI, EndI).
+ for (MachineBasicBlock::iterator I = std::next(StartI); I != EndI; ++I) {
+ MachineInstr *MI = &*I;
+ if (MI->hasUnmodeledSideEffects())
+ return false;
+ bool L = MI->mayLoad(), S = MI->mayStore();
+ if (!L && !S)
+ continue;
+ if (Ordered && MI->hasOrderedMemoryRef())
+ return false;
+
+ bool Conflict = (L && IsStore) || S;
+ if (Conflict)
+ return false;
+ }
+ return true;
+}
+
+/// Generate a predicated version of MI (where the condition is given via
+/// PredR and Cond) at the point indicated by Where.
+void HexagonExpandCondsets::predicateAt(const MachineOperand &DefOp,
+ MachineInstr &MI,
+ MachineBasicBlock::iterator Where,
+ const MachineOperand &PredOp, bool Cond,
+ std::set<unsigned> &UpdRegs) {
+ // The problem with updating live intervals is that we can move one def
+ // past another def. In particular, this can happen when moving an A2_tfrt
+ // over an A2_tfrf defining the same register. From the point of view of
+ // live intervals, these two instructions are two separate definitions,
+ // and each one starts another live segment. LiveIntervals's "handleMove"
+ // does not allow such moves, so we need to handle it ourselves. To avoid
+ // invalidating liveness data while we are using it, the move will be
+ // implemented in 4 steps: (1) add a clone of the instruction MI at the
+ // target location, (2) update liveness, (3) delete the old instruction,
+ // and (4) update liveness again.
+
+ MachineBasicBlock &B = *MI.getParent();
+ DebugLoc DL = Where->getDebugLoc(); // "Where" points to an instruction.
+ unsigned Opc = MI.getOpcode();
+ unsigned PredOpc = HII->getCondOpcode(Opc, !Cond);
+ MachineInstrBuilder MB = BuildMI(B, Where, DL, HII->get(PredOpc));
+ unsigned Ox = 0, NP = MI.getNumOperands();
+ // Skip all defs from MI first.
+ while (Ox < NP) {
+ MachineOperand &MO = MI.getOperand(Ox);
+ if (!MO.isReg() || !MO.isDef())
+ break;
+ Ox++;
+ }
+ // Add the new def, then the predicate register, then the rest of the
+ // operands.
+ MB.addReg(DefOp.getReg(), getRegState(DefOp), DefOp.getSubReg());
+ MB.addReg(PredOp.getReg(), PredOp.isUndef() ? RegState::Undef : 0,
+ PredOp.getSubReg());
+ while (Ox < NP) {
+ MachineOperand &MO = MI.getOperand(Ox);
+ if (!MO.isReg() || !MO.isImplicit())
+ MB.addOperand(MO);
+ Ox++;
+ }
+
+ MachineFunction &MF = *B.getParent();
+ MachineInstr::mmo_iterator I = MI.memoperands_begin();
+ unsigned NR = std::distance(I, MI.memoperands_end());
+ MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(NR);
+ for (unsigned i = 0; i < NR; ++i)
+ MemRefs[i] = *I++;
+ MB.setMemRefs(MemRefs, MemRefs+NR);
+
+ MachineInstr *NewI = MB;
+ NewI->clearKillInfo();
+ LIS->InsertMachineInstrInMaps(*NewI);
+
+ for (auto &Op : NewI->operands())
+ if (Op.isReg())
+ UpdRegs.insert(Op.getReg());
+}
+
+/// In the range [First, Last], rename all references to the "old" register RO
+/// to the "new" register RN, but only in instructions predicated on the given
+/// condition.
+void HexagonExpandCondsets::renameInRange(RegisterRef RO, RegisterRef RN,
+ unsigned PredR, bool Cond, MachineBasicBlock::iterator First,
+ MachineBasicBlock::iterator Last) {
+ MachineBasicBlock::iterator End = std::next(Last);
+ for (MachineBasicBlock::iterator I = First; I != End; ++I) {
+ MachineInstr *MI = &*I;
+ // Do not touch instructions that are not predicated, or are predicated
+ // on the opposite condition.
+ if (!HII->isPredicated(*MI))
+ continue;
+ if (!MI->readsRegister(PredR) || (Cond != HII->isPredicatedTrue(*MI)))
+ continue;
+
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg() || RO != RegisterRef(Op))
+ continue;
+ Op.setReg(RN.Reg);
+ Op.setSubReg(RN.Sub);
+ // In practice, this isn't supposed to see any defs.
+ assert(!Op.isDef() && "Not expecting a def");
+ }
+ }
+}
+
+/// For a given conditional copy, predicate the definition of the source of
+/// the copy under the given condition (using the same predicate register as
+/// the copy).
+///
+/// \param TfrI    The conditional transfer (A2_tfrt or A2_tfrf) to eliminate.
+/// \param Cond    The predicate sense under which TfrI executes.
+/// \param UpdRegs Registers affected by the transformation are added here.
+/// \returns true if the source definition was predicated and both TfrI and
+/// the original definition were removed; false if nothing was changed.
+bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
+ std::set<unsigned> &UpdRegs) {
+ // TfrI - A2_tfr[tf] Instruction (not A2_tfrsi).
+ unsigned Opc = TfrI.getOpcode();
+ (void)Opc;
+ assert(Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf);
+ DEBUG(dbgs() << "\nattempt to predicate if-" << (Cond ? "true" : "false")
+ << ": " << TfrI);
+
+ // Operand layout of the transfer: 0 = destination, 1 = predicate,
+ // 2 = source.
+ MachineOperand &MD = TfrI.getOperand(0);
+ MachineOperand &MP = TfrI.getOperand(1);
+ MachineOperand &MS = TfrI.getOperand(2);
+ // The source operand should be a <kill>. This is not strictly necessary,
+ // but it makes things a lot simpler. Otherwise, we would need to rename
+ // some registers, which would complicate the transformation considerably.
+ if (!MS.isKill())
+ return false;
+ // Avoid predicating instructions that define a subregister if subregister
+ // liveness tracking is not enabled.
+ if (MD.getSubReg() && !MRI->shouldTrackSubRegLiveness(MD.getReg()))
+ return false;
+
+ // RT is the "register in the middle": defined by DefI, consumed by TfrI.
+ RegisterRef RT(MS);
+ unsigned PredR = MP.getReg();
+ MachineInstr *DefI = getReachingDefForPred(RT, TfrI, PredR, Cond);
+ if (!DefI || !isPredicable(DefI))
+ return false;
+
+ DEBUG(dbgs() << "Source def: " << *DefI);
+
+ // Collect the information about registers defined and used between the
+ // DefI and the TfrI.
+ // Map: reg -> bitmask of subregs
+ ReferenceMap Uses, Defs;
+ MachineBasicBlock::iterator DefIt = DefI, TfrIt = TfrI;
+
+ // Check if the predicate register is valid between DefI and TfrI.
+ // If it is, we can then ignore instructions predicated on the negated
+ // conditions when collecting def and use information.
+ bool PredValid = true;
+ for (MachineBasicBlock::iterator I = std::next(DefIt); I != TfrIt; ++I) {
+ if (!I->modifiesRegister(PredR, nullptr))
+ continue;
+ PredValid = false;
+ break;
+ }
+
+ for (MachineBasicBlock::iterator I = std::next(DefIt); I != TfrIt; ++I) {
+ MachineInstr *MI = &*I;
+ // If this instruction is predicated on the same register, it could
+ // potentially be ignored.
+ // By default assume that the instruction executes on the same condition
+ // as TfrI (Exec_Then), and also on the opposite one (Exec_Else).
+ unsigned Exec = Exec_Then | Exec_Else;
+ if (PredValid && HII->isPredicated(*MI) && MI->readsRegister(PredR))
+ Exec = (Cond == HII->isPredicatedTrue(*MI)) ? Exec_Then : Exec_Else;
+
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg())
+ continue;
+ // We don't want to deal with physical registers. The reason is that
+ // they can be aliased with other physical registers. Aliased virtual
+ // registers must share the same register number, and can only differ
+ // in the subregisters, which we are keeping track of. Physical
+ // registers no longer have subregisters---their super- and
+ // subregisters are other physical registers, and we are not checking
+ // that.
+ RegisterRef RR = Op;
+ if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+ return false;
+
+ ReferenceMap &Map = Op.isDef() ? Defs : Uses;
+ if (Op.isDef() && Op.isUndef()) {
+ assert(RR.Sub && "Expecting a subregister on <def,read-undef>");
+ // If this is a <def,read-undef>, then it invalidates the non-written
+ // part of the register. For the purpose of checking the validity of
+ // the move, assume that it modifies the whole register.
+ RR.Sub = 0;
+ }
+ addRefToMap(RR, Map, Exec);
+ }
+ }
+
+ // The situation:
+ // RT = DefI
+ // ...
+ // RD = TfrI ..., RT
+
+ // If the register-in-the-middle (RT) is used or redefined between
+ // DefI and TfrI, we may not be able to proceed with this transformation.
+ // We can ignore a def that will not execute together with TfrI, and a
+ // use that will. If there is such a use (that does execute together with
+ // TfrI), we will not be able to move DefI down. If there is a use that
+ // executes if TfrI's condition is false, then RT must be available
+ // unconditionally (cannot be predicated).
+ // Essentially, we need to be able to rename RT to RD in this segment.
+ if (isRefInMap(RT, Defs, Exec_Then) || isRefInMap(RT, Uses, Exec_Else))
+ return false;
+ RegisterRef RD = MD;
+ // If the predicate register is defined between DefI and TfrI, the only
+ // potential thing to do would be to move the DefI down to TfrI, and then
+ // predicate. The reaching def (DefI) must be movable down to the location
+ // of the TfrI.
+ // If the target register of the TfrI (RD) is not used or defined between
+ // DefI and TfrI, consider moving TfrI up to DefI.
+ bool CanUp = canMoveOver(TfrI, Defs, Uses);
+ bool CanDown = canMoveOver(*DefI, Defs, Uses);
+ // The TfrI does not access memory, but DefI could. Check if it's safe
+ // to move DefI down to TfrI.
+ if (DefI->mayLoad() || DefI->mayStore())
+ if (!canMoveMemTo(*DefI, TfrI, true))
+ CanDown = false;
+
+ DEBUG(dbgs() << "Can move up: " << (CanUp ? "yes" : "no")
+ << ", can move down: " << (CanDown ? "yes\n" : "no\n"));
+ // Moving "up" is preferred: the position passed to predicateAt is right
+ // after DefI in that case, and the position of TfrI when moving "down".
+ MachineBasicBlock::iterator PastDefIt = std::next(DefIt);
+ if (CanUp)
+ predicateAt(MD, *DefI, PastDefIt, MP, Cond, UpdRegs);
+ else if (CanDown)
+ predicateAt(MD, *DefI, TfrIt, MP, Cond, UpdRegs);
+ else
+ return false;
+
+ // References to RT in matching predicated instructions between DefI and
+ // TfrI are rewritten to RD.
+ if (RT != RD) {
+ renameInRange(RT, RD, PredR, Cond, PastDefIt, TfrIt);
+ UpdRegs.insert(RT.Reg);
+ }
+
+ // Both the conditional transfer and the original definition are removed.
+ removeInstr(TfrI);
+ removeInstr(*DefI);
+ return true;
+}
+
+/// Visit every conditional register transfer (A2_tfrt / A2_tfrf) in block B
+/// and try to predicate the instruction defining its source. A transfer that
+/// could not be predicated but copies a register onto itself is removed, and
+/// its register operands are recorded in UpdRegs. Returns true if at least
+/// one transfer was predicated.
+bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B,
+      std::set<unsigned> &UpdRegs) {
+  bool Changed = false;
+  MachineBasicBlock::iterator It = B.begin(), End = B.end();
+  while (It != End) {
+    // Advance first: the current instruction may get removed below.
+    MachineBasicBlock::iterator Cur = It;
+    It = std::next(It);
+
+    unsigned Opc = Cur->getOpcode();
+    bool IfTrue = (Opc == Hexagon::A2_tfrt);
+    if (!IfTrue && Opc != Hexagon::A2_tfrf)
+      continue;
+
+    if (predicate(*Cur, IfTrue, UpdRegs)) {
+      Changed = true;
+      continue;
+    }
+
+    // The transfer was not predicated. It may still need to be removed in
+    // case it is an "identity" copy, e.g. vreg1 = A2_tfrt vreg2, vreg1.
+    if (RegisterRef(Cur->getOperand(0)) != RegisterRef(Cur->getOperand(2)))
+      continue;
+    for (auto &MO : Cur->operands()) {
+      if (MO.isReg())
+        UpdRegs.insert(MO.getReg());
+    }
+    removeInstr(*Cur);
+  }
+  return Changed;
+}
+
+/// Check whether RR refers to a virtual register in the IntRegs or
+/// DoubleRegs register class. On success, BW receives the bit width of the
+/// reference: 32 for IntRegs or for a subregister of DoubleRegs, 64 for a
+/// whole DoubleRegs register. BW is left unmodified when false is returned.
+bool HexagonExpandCondsets::isIntReg(RegisterRef RR, unsigned &BW) {
+  if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+    return false;
+
+  const TargetRegisterClass *Cls = MRI->getRegClass(RR.Reg);
+  const bool IsSingle = (Cls == &Hexagon::IntRegsRegClass);
+  const bool IsDouble = (Cls == &Hexagon::DoubleRegsRegClass);
+  if (!IsSingle && !IsDouble)
+    return false;
+
+  // A subregister of a 64-bit register is 32 bits wide.
+  BW = (IsSingle || RR.Sub != 0) ? 32 : 64;
+  return true;
+}
+
+/// Return true if every segment of LI starts at a register slot and ends
+/// either at a register slot or at a dead slot.
+bool HexagonExpandCondsets::isIntraBlocks(LiveInterval &LI) {
+  for (LiveRange::Segment &Seg : LI) {
+    // Range must start at a register...
+    const bool StartsOk = Seg.start.isRegister();
+    if (!StartsOk)
+      return false;
+    // ...and end in a register or in a dead slot.
+    const bool EndsOk = Seg.end.isRegister() || Seg.end.isDead();
+    if (!EndsOk)
+      return false;
+  }
+  return true;
+}
+
+/// Try to coalesce register R2 into R1: replace all references to R2 with R1
+/// and merge R2's live segments into R1's live interval.
+///
+/// The attempt is abandoned (returning false, with no changes to the code)
+/// unless both references are integer registers of equal bit width, neither
+/// register is a live-in, R2's interval is non-empty, neither interval has
+/// subranges, neither reference uses a subregister, the intervals do not
+/// overlap, and at least one of them passes isIntraBlocks.
+/// Returns true if the registers were coalesced.
+bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
+ // When the limit is active, every attempt that gets past this point is
+ // counted, whether or not it ends up coalescing anything.
+ if (CoaLimitActive) {
+ if (CoaCounter >= CoaLimit)
+ return false;
+ CoaCounter++;
+ }
+ unsigned BW1, BW2;
+ if (!isIntReg(R1, BW1) || !isIntReg(R2, BW2) || BW1 != BW2)
+ return false;
+ if (MRI->isLiveIn(R1.Reg))
+ return false;
+ if (MRI->isLiveIn(R2.Reg))
+ return false;
+
+ LiveInterval &L1 = LIS->getInterval(R1.Reg);
+ LiveInterval &L2 = LIS->getInterval(R2.Reg);
+ if (L2.empty())
+ return false;
+ if (L1.hasSubRanges() || L2.hasSubRanges())
+ return false;
+ bool Overlap = L1.overlaps(L2);
+
+ DEBUG(dbgs() << "compatible registers: ("
+ << (Overlap ? "overlap" : "disjoint") << ")\n "
+ << PrintReg(R1.Reg, TRI, R1.Sub) << " " << L1 << "\n "
+ << PrintReg(R2.Reg, TRI, R2.Sub) << " " << L2 << "\n");
+ // Only whole, non-overlapping registers are coalesced.
+ if (R1.Sub || R2.Sub)
+ return false;
+ if (Overlap)
+ return false;
+
+ // Coalescing could have a negative impact on scheduling, so try to limit
+ // to some reasonable extent. Only consider coalescing segments, when one
+ // of them does not cross basic block boundaries.
+ if (!isIntraBlocks(L1) && !isIntraBlocks(L2))
+ return false;
+
+ MRI->replaceRegWith(R2.Reg, R1.Reg);
+
+ // Move all live segments from L2 to L1.
+ // VM maps each value number of L2 to the value number created for it in
+ // L1, so that segments sharing a value in L2 also share one in L1.
+ typedef DenseMap<VNInfo*,VNInfo*> ValueInfoMap;
+ ValueInfoMap VM;
+ for (LiveInterval::iterator I = L2.begin(), E = L2.end(); I != E; ++I) {
+ VNInfo *NewVN, *OldVN = I->valno;
+ ValueInfoMap::iterator F = VM.find(OldVN);
+ if (F == VM.end()) {
+ NewVN = L1.getNextValue(I->valno->def, LIS->getVNInfoAllocator());
+ VM.insert(std::make_pair(OldVN, NewVN));
+ } else {
+ NewVN = F->second;
+ }
+ L1.addSegment(LiveRange::Segment(I->start, I->end, NewVN));
+ }
+ // Empty L2 before dropping its interval from the live interval analysis.
+ while (L2.begin() != L2.end())
+ L2.removeSegment(*L2.begin());
+ LIS->removeInterval(R2.Reg);
+
+ updateKillFlags(R1.Reg);
+ DEBUG(dbgs() << "coalesced: " << L1 << "\n");
+ L1.verify();
+
+ return true;
+}
+
+/// Attempt to coalesce one of the source registers to a MUX instruction with
+/// the destination register. This could lead to having only one predicated
+/// instruction in the end instead of two.
+///
+/// \param Condsets The mux-like instructions to consider.
+/// \param UpdRegs  Receives the registers involved in each successful
+///                 coalescing.
+/// \returns true if any pair of registers was coalesced.
+bool HexagonExpandCondsets::coalesceSegments(
+ const SmallVectorImpl<MachineInstr*> &Condsets,
+ std::set<unsigned> &UpdRegs) {
+ // Keep the instructions that have at least one register source operand
+ // (operands 2 and 3 are the two sources).
+ SmallVector<MachineInstr*,16> TwoRegs;
+ for (MachineInstr *MI : Condsets) {
+ MachineOperand &S1 = MI->getOperand(2), &S2 = MI->getOperand(3);
+ if (!S1.isReg() && !S2.isReg())
+ continue;
+ TwoRegs.push_back(MI);
+ }
+
+ bool Changed = false;
+ for (MachineInstr *CI : TwoRegs) {
+ RegisterRef RD = CI->getOperand(0);
+ RegisterRef RP = CI->getOperand(1);
+ MachineOperand &S1 = CI->getOperand(2), &S2 = CI->getOperand(3);
+ bool Done = false;
+ // Consider this case:
+ // vreg1 = instr1 ...
+ // vreg2 = instr2 ...
+ // vreg0 = C2_mux ..., vreg1, vreg2
+ // If vreg0 was coalesced with vreg1, we could end up with the following
+ // code:
+ // vreg0 = instr1 ...
+ // vreg2 = instr2 ...
+ // vreg0 = A2_tfrf ..., vreg2
+ // which will later become:
+ // vreg0 = instr1 ...
+ // vreg0 = instr2_cNotPt ...
+ // i.e. there will be an unconditional definition (instr1) of vreg0
+ // followed by a conditional one. The output dependency was there before
+ // and it is unavoidable, but if instr1 is predicable, we will no longer
+ // be able to predicate it here.
+ // To avoid this scenario, don't coalesce the destination register with
+ // a source register that is defined by a predicable instruction.
+ // The first source is the one selected when the predicate is true.
+ if (S1.isReg()) {
+ RegisterRef RS = S1;
+ MachineInstr *RDef = getReachingDefForPred(RS, CI, RP.Reg, true);
+ if (!RDef || !HII->isPredicable(*RDef)) {
+ Done = coalesceRegisters(RD, RegisterRef(S1));
+ if (Done) {
+ UpdRegs.insert(RD.Reg);
+ UpdRegs.insert(S1.getReg());
+ }
+ }
+ }
+ // The second source is only tried if the first one was not coalesced.
+ if (!Done && S2.isReg()) {
+ RegisterRef RS = S2;
+ MachineInstr *RDef = getReachingDefForPred(RS, CI, RP.Reg, false);
+ if (!RDef || !HII->isPredicable(*RDef)) {
+ Done = coalesceRegisters(RD, RegisterRef(S2));
+ if (Done) {
+ UpdRegs.insert(RD.Reg);
+ UpdRegs.insert(S2.getReg());
+ }
+ }
+ }
+ Changed |= Done;
+ }
+ return Changed;
+}
+
+/// Pass entry point. Runs the phases in order: collect the condset
+/// instructions, coalesce mux sources with destinations, refresh kill flags,
+/// split each condset, predicate the instructions feeding the resulting
+/// conditional transfers, and finally update liveness for all registers that
+/// were touched. Returns true if the function was modified.
+bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ // Cache the target hooks and analyses used by the helper methods.
+ HII = static_cast<const HexagonInstrInfo*>(MF.getSubtarget().getInstrInfo());
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ LIS = &getAnalysis<LiveIntervals>();
+ MRI = &MF.getRegInfo();
+
+ DEBUG(LIS->print(dbgs() << "Before expand-condsets\n",
+ MF.getFunction()->getParent()));
+
+ bool Changed = false;
+ // Registers touched by coalescing and by splitting/predication.
+ std::set<unsigned> CoalUpd, PredUpd;
+
+ SmallVector<MachineInstr*,16> Condsets;
+ for (auto &B : MF)
+ for (auto &I : B)
+ if (isCondset(I))
+ Condsets.push_back(&I);
+
+ // Try to coalesce the target of a mux with one of its sources.
+ // This could eliminate a register copy in some circumstances.
+ Changed |= coalesceSegments(Condsets, CoalUpd);
+
+ // Update kill flags on all source operands. This is done here because
+ // at this moment (when expand-condsets runs), there are no kill flags
+ // in the IR (they have been removed by live range analysis).
+ // Updating them right before we split is the easiest, because splitting
+ // adds definitions which would interfere with updating kills afterwards.
+ // Registers already recorded by coalescing are left out of this set.
+ std::set<unsigned> KillUpd;
+ for (MachineInstr *MI : Condsets)
+ for (MachineOperand &Op : MI->operands())
+ if (Op.isReg() && Op.isUse())
+ if (!CoalUpd.count(Op.getReg()))
+ KillUpd.insert(Op.getReg());
+ updateLiveness(KillUpd, false, true, false);
+ DEBUG(LIS->print(dbgs() << "After coalescing\n",
+ MF.getFunction()->getParent()));
+
+ // First, simply split all muxes into a pair of conditional transfers
+ // and update the live intervals to reflect the new arrangement. The
+ // goal is to update the kill flags, since predication will rely on
+ // them.
+ for (MachineInstr *MI : Condsets)
+ Changed |= split(*MI, PredUpd);
+ Condsets.clear(); // The contents of Condsets are invalid here anyway.
+
+ // Do not update live ranges after splitting. Recalculation of live
+ // intervals removes kill flags, which were preserved by splitting on
+ // the source operands of condsets. These kill flags are needed by
+ // predication, and after splitting they are difficult to recalculate
+ // (because of predicated defs), so make sure they are left untouched.
+ // Predication does not use live intervals.
+ DEBUG(LIS->print(dbgs() << "After splitting\n",
+ MF.getFunction()->getParent()));
+
+ // Traverse all blocks and collapse predicable instructions feeding
+ // conditional transfers into predicated instructions.
+ // Walk over all the instructions again, so we may catch pre-existing
+ // cases that were not created in the previous step.
+ for (auto &B : MF)
+ Changed |= predicateInBlock(B, PredUpd);
+ DEBUG(LIS->print(dbgs() << "After predicating\n",
+ MF.getFunction()->getParent()));
+
+ // Liveness is brought up to date once, at the very end, for every register
+ // touched by coalescing, splitting, or predication.
+ PredUpd.insert(CoalUpd.begin(), CoalUpd.end());
+ updateLiveness(PredUpd, true, true, true);
+
+ DEBUG({
+ if (Changed)
+ LIS->print(dbgs() << "After expand-condsets\n",
+ MF.getFunction()->getParent());
+ });
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+/// Create a new instance of the HexagonExpandCondsets pass. The returned
+/// object is allocated with new.
+FunctionPass *llvm::createHexagonExpandCondsets() {
+ return new HexagonExpandCondsets();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
new file mode 100644
index 000000000000..dfd1f1d4f886
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -0,0 +1,194 @@
+//===---- HexagonFixupHwLoops.cpp - Fixup HW loops too far from LOOPn. ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// The loop start address in the LOOPn instruction is encoded as a distance
+// from the LOOPn instruction itself. If the start address is too far from
+// the LOOPn instruction, the instruction needs to use a constant extender.
+// This pass will identify and convert such LOOPn instructions to a proper
+// form.
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/ADT/DenseMap.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+// Hidden option "hexagon-loop-range" (default 200): the largest estimated
+// distance between a loop instruction and its target block for which the
+// loop instruction is left as is. The distance is computed from the values
+// returned by HexagonInstrInfo::getSize (presumably bytes).
+static cl::opt<unsigned> MaxLoopRange(
+ "hexagon-loop-range", cl::Hidden, cl::init(200),
+ cl::desc("Restrict range of loopN instructions (testing only)"));
+
+// Declarations of the factory and the initializer of the pass defined in
+// this file.
+namespace llvm {
+ FunctionPass *createHexagonFixupHwLoops();
+ void initializeHexagonFixupHwLoopsPass(PassRegistry&);
+}
+
+namespace {
+ /// Machine function pass that replaces hardware loop instructions whose
+ /// target block is estimated to be too far away with their constant
+ /// extended variants.
+ struct HexagonFixupHwLoops : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ /// Construct the pass and make sure it is registered.
+ HexagonFixupHwLoops() : MachineFunctionPass(ID) {
+ initializeHexagonFixupHwLoopsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ /// The pass requires that no virtual registers remain in the function.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "Hexagon Hardware Loop Fixup";
+ }
+
+ /// The pass declares that it preserves the CFG.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ /// \brief Check the offset between each loop instruction and
+ /// the loop basic block to determine if we can use the LOOP instruction
+ /// or if we need to set the LC/SA registers explicitly.
+ bool fixupLoopInstrs(MachineFunction &MF);
+
+ /// \brief Replace loop instruction with the constant extended
+ /// version if the loop label is too far from the loop instruction.
+ void useExtLoopInstr(MachineFunction &MF,
+ MachineBasicBlock::iterator &MII);
+ };
+
+ char HexagonFixupHwLoops::ID = 0;
+}
+
+// Pass registration; "hwloopsfixup" is the name given to the pass here.
+INITIALIZE_PASS(HexagonFixupHwLoops, "hwloopsfixup",
+ "Hexagon Hardware Loops Fixup", false, false)
+
+/// Create a new instance of the HexagonFixupHwLoops pass. The returned
+/// object is allocated with new.
+FunctionPass *llvm::createHexagonFixupHwLoops() {
+ return new HexagonFixupHwLoops();
+}
+
+/// \brief Tell whether MI is one of the hardware loop setup instructions
+/// (J2_loop0r, J2_loop0i, J2_loop1r or J2_loop1i).
+static bool isHardwareLoop(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case Hexagon::J2_loop0r:
+  case Hexagon::J2_loop0i:
+  case Hexagon::J2_loop1r:
+  case Hexagon::J2_loop1i:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Pass entry point: unless the function is to be skipped, fix up its
+/// hardware loop instructions. Returns true if the function was modified.
+bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) {
+  const bool Skip = skipFunction(*MF.getFunction());
+  return Skip ? false : fixupLoopInstrs(MF);
+}
+
+/// \brief For Hexagon, if the loop label is too far from the
+/// loop instruction then we need to set the LC0 and SA0 registers
+/// explicitly instead of using LOOP(start,count). This function
+/// checks the distance, and generates register assignments if needed.
+///
+/// This function makes two passes over the basic blocks. The first
+/// pass computes the offset of the basic block from the start.
+/// The second pass checks all the loop instructions.
+///
+/// The distance is measured from the start of the loop instruction to the
+/// start of the target block, since the loop start address is encoded
+/// relative to the loop instruction itself.
+///
+/// \returns true if any loop instruction was replaced.
+bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
+
+  // Offset of the current instruction from the start.
+  unsigned InstOffset = 0;
+  // Map from each basic block to the offset of its first instruction.
+  DenseMap<const MachineBasicBlock *, unsigned> BlockToInstOffset;
+
+  const HexagonInstrInfo *HII =
+      static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  // First pass - compute the offset of each basic block.
+  for (const MachineBasicBlock &MBB : MF) {
+    if (MBB.getAlignment()) {
+      // Although we don't know the exact layout of the final code, we need
+      // to account for alignment padding somehow. This heuristic pads each
+      // aligned basic block according to the alignment value.
+      unsigned ByteAlign = (1u << MBB.getAlignment()) - 1;
+      InstOffset = (InstOffset + ByteAlign) & ~ByteAlign;
+    }
+
+    BlockToInstOffset[&MBB] = InstOffset;
+    for (const MachineInstr &MI : MBB)
+      InstOffset += HII->getSize(MI);
+  }
+
+  // Second pass - check each loop instruction to see if it needs to be
+  // converted.
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    InstOffset = BlockToInstOffset[&MBB];
+
+    // Loop over all the instructions.
+    MachineBasicBlock::iterator MII = MBB.begin();
+    MachineBasicBlock::iterator MIE = MBB.end();
+    while (MII != MIE) {
+      // At this point InstOffset is the offset of *MII itself. It is only
+      // advanced past the instruction after the distance has been computed.
+      const unsigned InstSize = HII->getSize(*MII);
+      bool TooFar = false;
+      if (!MII->isDebugValue() && isHardwareLoop(*MII)) {
+        assert(MII->getOperand(0).isMBB() &&
+               "Expect a basic block as loop operand");
+        const unsigned TargetOffset =
+            BlockToInstOffset[MII->getOperand(0).getMBB()];
+        // Absolute difference computed entirely in unsigned arithmetic.
+        const unsigned Distance = InstOffset > TargetOffset
+                                      ? InstOffset - TargetOffset
+                                      : TargetOffset - InstOffset;
+        TooFar = Distance > MaxLoopRange;
+      }
+      // The offsets computed in the first pass are not adjusted for a
+      // replacement; the size of the original instruction is used.
+      InstOffset += InstSize;
+
+      if (TooFar) {
+        useExtLoopInstr(MF, MII);
+        MII = MBB.erase(MII);
+        Changed = true;
+      } else {
+        ++MII;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+/// \brief Insert, immediately before MII, the constant extended counterpart
+/// of the hardware loop instruction at MII, with all operands carried over.
+/// The instruction at MII itself is not removed here.
+void HexagonFixupHwLoops::useExtLoopInstr(MachineFunction &MF,
+                                          MachineBasicBlock::iterator &MII) {
+  // Map the loop opcode to its constant extended form.
+  unsigned ExtOpc;
+  switch (MII->getOpcode()) {
+  case Hexagon::J2_loop0r:
+    ExtOpc = Hexagon::J2_loop0rext;
+    break;
+  case Hexagon::J2_loop0i:
+    ExtOpc = Hexagon::J2_loop0iext;
+    break;
+  case Hexagon::J2_loop1r:
+    ExtOpc = Hexagon::J2_loop1rext;
+    break;
+  case Hexagon::J2_loop1i:
+    ExtOpc = Hexagon::J2_loop1iext;
+    break;
+  default:
+    llvm_unreachable("Invalid Hardware Loop Instruction.");
+  }
+
+  const TargetInstrInfo *InstrInfo = MF.getSubtarget().getInstrInfo();
+  MachineBasicBlock *Parent = MII->getParent();
+  DebugLoc Loc = MII->getDebugLoc();
+  MachineInstrBuilder Builder =
+      BuildMI(*Parent, MII, Loc, InstrInfo->get(ExtOpc));
+
+  // Carry over every operand of the original instruction, in order.
+  for (const MachineOperand &MO : MII->operands())
+    Builder.addOperand(MO);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
new file mode 100644
index 000000000000..a3f6273f9f67
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -0,0 +1,2441 @@
+//===-- HexagonFrameLowering.cpp - Define frame lowering ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-pei"
+
+#include "HexagonBlockRanges.h"
+#include "HexagonFrameLowering.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <new>
+#include <utility>
+#include <vector>
+
+// Hexagon stack frame layout as defined by the ABI:
+//
+// Incoming arguments
+// passed via stack
+// |
+// |
+// SP during function's FP during function's |
+// +-- runtime (top of stack) runtime (bottom) --+ |
+// | | |
+// --++---------------------+------------------+-----------------++-+-------
+// | parameter area for | variable-size | fixed-size |LR| arg
+// | called functions | local objects | local objects |FP|
+// --+----------------------+------------------+-----------------+--+-------
+// <- size known -> <- size unknown -> <- size known ->
+//
+// Low address High address
+//
+// <--- stack growth
+//
+//
+// - In any circumstances, the outgoing function arguments are always accessi-
+// ble using the SP, and the incoming arguments are accessible using the FP.
+// - If the local objects are not aligned, they can always be accessed using
+// the FP.
+// - If there are no variable-sized objects, the local objects can always be
+// accessed using the SP, regardless of whether they are aligned or not.
+// (The alignment padding will be at the bottom of the stack (highest
+// address), and so the offset with respect to the SP will be known at
+// compile time.)
+//
+// The only complication occurs if there are both, local aligned objects, and
+// dynamically allocated (variable-sized) objects. The alignment pad will be
+// placed between the FP and the local objects, thus preventing the use of the
+// FP to access the local objects. At the same time, the variable-sized objects
+// will be between the SP and the local objects, thus introducing an unknown
+// distance from the SP to the locals.
+//
+// To avoid this problem, a new register is created that holds the aligned
+// address of the bottom of the stack, referred to in the sources as AP
+// (aligned pointer). The AP will be equal to "FP-p", where "p" is the
+// smallest pad that aligns AP to the required boundary (a maximum of the
+// alignments of all stack objects, fixed- and variable-sized). All local
+// objects[1] will then use AP as the base pointer.
+// [1] The exception is with "fixed" stack objects. "Fixed" stack objects get
+// their name from being allocated at fixed locations on the stack, relative
+// to the FP. In the presence of dynamic allocation and local alignment, such
+// objects can only be accessed through the FP.
+//
+// Illustration of the AP:
+// FP --+
+// |
+// ---------------+---------------------+-----+-----------------------++-+--
+// Rest of the | Local stack objects | Pad | Fixed stack objects |LR|
+// stack frame | (aligned) | | (CSR, spills, etc.) |FP|
+// ---------------+---------------------+-----+-----------------+-----+--+--
+// |<-- Multiple of the -->|
+// stack alignment +-- AP
+//
+// The AP is set up at the beginning of the function. Since it is not a dedi-
+// cated (reserved) register, it needs to be kept live throughout the function
+// to be available as the base register for local object accesses.
+// Normally, the address of a stack object is obtained by a pseudo-instruction
+// PS_fi. To access local objects with the AP register present, a different
+// pseudo-instruction needs to be used: PS_fia. The PS_fia takes one extra
+// argument compared to PS_fi: the first input register is the AP register.
+// This keeps the register live between its definition and its uses.
+
+// The AP register is originally set up using pseudo-instruction PS_aligna:
+// AP = PS_aligna A
+// where
+// A - required stack alignment
+// The alignment value must be the maximum of all alignments required by
+// any stack object.
+
+// The dynamic allocation uses a pseudo-instruction PS_alloca:
+// Rd = PS_alloca Rs, A
+// where
+// Rd - address of the allocated space
+// Rs - minimum size (the actual allocated can be larger to accommodate
+// alignment)
+// A - required alignment
+
+using namespace llvm;
+
+// Command-line options controlling Hexagon frame lowering. All of them are
+// hidden options; the cl::desc string of each one states its purpose.
+static cl::opt<bool> DisableDeallocRet("disable-hexagon-dealloc-ret",
+ cl::Hidden, cl::desc("Disable Dealloc Return for Hexagon target"));
+
+static cl::opt<unsigned> NumberScavengerSlots("number-scavenger-slots",
+ cl::Hidden, cl::desc("Set the number of scavenger slots"), cl::init(2),
+ cl::ZeroOrMore);
+
+static cl::opt<int> SpillFuncThreshold("spill-func-threshold",
+ cl::Hidden, cl::desc("Specify O2(not Os) spill func threshold"),
+ cl::init(6), cl::ZeroOrMore);
+
+static cl::opt<int> SpillFuncThresholdOs("spill-func-threshold-Os",
+ cl::Hidden, cl::desc("Specify Os spill func threshold"),
+ cl::init(1), cl::ZeroOrMore);
+
+static cl::opt<bool> EnableStackOVFSanitizer("enable-stackovf-sanitizer",
+ cl::Hidden, cl::desc("Enable runtime checks for stack overflow."),
+ cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> EnableShrinkWrapping("hexagon-shrink-frame",
+ cl::init(true), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Enable stack frame shrink wrapping"));
+
+// Defaults to the largest unsigned value.
+static cl::opt<unsigned> ShrinkLimit("shrink-frame-limit",
+ cl::init(std::numeric_limits<unsigned>::max()), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max count of stack frame shrink-wraps"));
+
+static cl::opt<bool> EnableSaveRestoreLong("enable-save-restore-long",
+ cl::Hidden, cl::desc("Enable long calls for save-restore stubs."),
+ cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true),
+ cl::Hidden, cl::desc("Use allocframe more conservatively"));
+
+static cl::opt<bool> OptimizeSpillSlots("hexagon-opt-spill", cl::Hidden,
+ cl::init(true), cl::desc("Optimize spill slots"));
+
+// The following option and counter exist only when NDEBUG is not defined.
+#ifndef NDEBUG
+static cl::opt<unsigned> SpillOptMax("spill-opt-max", cl::Hidden,
+ cl::init(std::numeric_limits<unsigned>::max()));
+static unsigned SpillOptCount = 0;
+#endif
+
+// Declarations of the initializer and the factory of the
+// HexagonCallFrameInformation pass defined in this file.
+namespace llvm {
+
+ void initializeHexagonCallFrameInformationPass(PassRegistry&);
+ FunctionPass *createHexagonCallFrameInformation();
+
+} // end namespace llvm
+
+namespace {
+
+ /// Machine function pass whose runOnMachineFunction asks the Hexagon frame
+ /// lowering to insert CFI instructions into the function.
+ class HexagonCallFrameInformation : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ /// Construct the pass and make sure it is registered.
+ HexagonCallFrameInformation() : MachineFunctionPass(ID) {
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeHexagonCallFrameInformationPass(PR);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ /// The pass requires that no virtual registers remain in the function.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+ };
+
+ char HexagonCallFrameInformation::ID = 0;
+
+} // end anonymous namespace
+
+/// Insert CFI instructions into MF when the module has debug info or the
+/// function needs an unwind table entry. Returns true if the insertion was
+/// requested, false if the function was left alone.
+bool HexagonCallFrameInformation::runOnMachineFunction(MachineFunction &MF) {
+  auto &FrameLowering = *MF.getSubtarget<HexagonSubtarget>().getFrameLowering();
+
+  if (MF.getMMI().hasDebugInfo() ||
+      MF.getFunction()->needsUnwindTableEntry()) {
+    FrameLowering.insertCFIInstructions(MF);
+    return true;
+  }
+  return false;
+}
+
+// Pass registration; "hexagon-cfi" is the name given to the pass here.
+INITIALIZE_PASS(HexagonCallFrameInformation, "hexagon-cfi",
+ "Hexagon call frame information", false, false)
+
+/// Create a new instance of the HexagonCallFrameInformation pass. The
+/// returned object is allocated with new.
+FunctionPass *llvm::createHexagonCallFrameInformation() {
+ return new HexagonCallFrameInformation();
+}
+
+/// Map a register pair Reg (D0..D15) to one of its subregisters: the one
+/// with the greatest register number when hireg is true (e.g. D3, aka R7:6,
+/// maps to R7), or the one with the smallest number when hireg is false.
+/// Any register outside of the D0..D15 range is returned unchanged.
+static unsigned getMax32BitSubRegister(unsigned Reg,
+                                       const TargetRegisterInfo &TRI,
+                                       bool hireg = true) {
+  const bool IsPair = Reg >= Hexagon::D0 && Reg <= Hexagon::D15;
+  if (!IsPair)
+    return Reg;
+
+  // Zero means that no subregister has been picked yet.
+  unsigned Picked = 0;
+  for (MCSubRegIterator Sub(Reg, &TRI); Sub.isValid(); ++Sub) {
+    const unsigned Cand = *Sub;
+    const bool Better = hireg ? (Cand > Picked)
+                              : (Picked == 0 || Cand < Picked);
+    if (Better)
+      Picked = Cand;
+  }
+  return Picked;
+}
+
+/// Find the largest register id among the callee saved registers in CSI,
+/// with each register pair represented by its highest-numbered subregister.
+/// Returns 0 if CSI is empty.
+static unsigned getMaxCalleeSavedReg(const std::vector<CalleeSavedInfo> &CSI,
+                                     const TargetRegisterInfo &TRI) {
+  // Zero can serve as the starting value, since it is below any register id.
+  static_assert(Hexagon::R1 > 0,
+                "Assume physical registers are encoded as positive integers");
+  unsigned Largest = 0;
+  for (const CalleeSavedInfo &Info : CSI) {
+    const unsigned Cand = getMax32BitSubRegister(Info.getReg(), TRI);
+    if (Cand > Largest)
+      Largest = Cand;
+  }
+  return Largest;
+}
+
+/// Tell whether MBB contains an instruction that needs the stack frame to be
+/// already in place: a call, a PS_alloca or PS_aligna, or an instruction with
+/// a frame index operand, a virtual register operand, or a register operand
+/// that is, or has a subregister that is, marked in CSR.
+static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
+                            const HexagonRegisterInfo &HRI) {
+  for (const MachineInstr &MI : MBB) {
+    if (MI.isCall())
+      return true;
+
+    const unsigned Opc = MI.getOpcode();
+    if (Opc == Hexagon::PS_alloca || Opc == Hexagon::PS_aligna)
+      return true;
+
+    // Check individual operands.
+    for (const MachineOperand &MO : MI.operands()) {
+      // While the presence of a frame index does not prove that a stack
+      // frame will be required, all frame indexes should be within alloc-
+      // frame/deallocframe. Otherwise, the code that translates a frame
+      // index into an offset would have to be aware of the placement of
+      // the frame creation/destruction instructions.
+      if (MO.isFI())
+        return true;
+      if (!MO.isReg())
+        continue;
+
+      const unsigned R = MO.getReg();
+      // Virtual registers will need scavenging, which then may require
+      // a stack slot.
+      if (TargetRegisterInfo::isVirtualRegister(R))
+        return true;
+      // The iteration includes R itself in addition to its subregisters.
+      for (MCSubRegIterator Sub(R, &HRI, true); Sub.isValid(); ++Sub) {
+        if (CSR[*Sub])
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// Returns true if the last non-debug instruction of MBB is a tail call
+/// (PS_tailcall_i or PS_tailcall_r). Returns false for a block that has no
+/// non-debug instruction at all.
+static bool hasTailCall(const MachineBasicBlock &MBB) {
+  MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr();
+  // getLastNonDebugInstr() yields end() when the block is empty or holds
+  // only debug instructions; that iterator must not be dereferenced.
+  if (I == MBB.end())
+    return false;
+  unsigned RetOpc = I->getOpcode();
+  return RetOpc == Hexagon::PS_tailcall_i || RetOpc == Hexagon::PS_tailcall_r;
+}
+
+/// Returns true if one of the terminators of MBB is a return instruction.
+static bool hasReturn(const MachineBasicBlock &MBB) {
+  MachineBasicBlock::const_iterator It = MBB.getFirstTerminator();
+  const MachineBasicBlock::const_iterator End = MBB.end();
+  // Only the terminator sequence is scanned.
+  while (It != End) {
+    if (It->isReturn())
+      return true;
+    ++It;
+  }
+  return false;
+}
+
+/// Finds the first return instruction in MBB. Returns a pointer to it, or
+/// nullptr when the block has none.
+static MachineInstr *getReturn(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::iterator It = MBB.begin(), End = MBB.end();
+       It != End; ++It) {
+    if (It->isReturn())
+      return &*It;
+  }
+  return nullptr;
+}
+
+/// Returns true if Opc is one of the RESTORE_DEALLOC_* pseudo opcodes, i.e.
+/// either the "return-jump" or the "before tail call" flavor, in any of the
+/// plain/PIC/EXT/EXT_PIC variants.
+static bool isRestoreCall(unsigned Opc) {
+  switch (Opc) {
+    // Restore, deallocate the frame, and return.
+    case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+    case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC:
+    case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT:
+    case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC:
+    // Restore and deallocate the frame ahead of a tail call.
+    case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4:
+    case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC:
+    case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT:
+    case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/// Returns true if MF must not be optimized: either the function carries
+/// the OptimizeNone attribute, or the target's optimization level is None.
+static inline bool isOptNone(const MachineFunction &MF) {
+  if (MF.getFunction()->hasFnAttribute(Attribute::OptimizeNone))
+    return true;
+  return MF.getTarget().getOptLevel() == CodeGenOpt::None;
+}
+
+/// Returns true if MF is optimized for size but not for minimum size.
+static inline bool isOptSize(const MachineFunction &MF) {
+  const Function *Fn = MF.getFunction();
+  // Minimum-size functions are reported by isMinSize, not here.
+  if (Fn->optForMinSize())
+    return false;
+  return Fn->optForSize();
+}
+
+/// Returns true if MF is optimized for minimum size.
+static inline bool isMinSize(const MachineFunction &MF) {
+  const Function &Fn = *MF.getFunction();
+  return Fn.optForMinSize();
+}
+
+/// Implements shrink-wrapping of the stack frame. By default the frame is
+/// set up in the function entry block and torn down in every returning
+/// block. This function looks for a single alternative pair of blocks: one
+/// for the frame setup (prolog) and one for the cleanup (epilog).
+///
+/// PrologB and EpilogB are only written when a suitable pair is found; on
+/// every early exit they keep the values supplied by the caller.
+void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
+      MachineBasicBlock *&PrologB, MachineBasicBlock *&EpilogB) const {
+  // Counts the calls that got past the limit check below. Presumably
+  // ShrinkLimit.getPosition() is non-zero only when the option was given
+  // explicitly -- confirm against the option's definition.
+  static unsigned ShrinkCounter = 0;
+
+  if (ShrinkLimit.getPosition()) {
+    if (ShrinkCounter >= ShrinkLimit)
+      return;
+    ShrinkCounter++;
+  }
+
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
+  MachineDominatorTree DomTree;
+  DomTree.runOnMachineFunction(MF);
+  MachinePostDominatorTree PostDomTree;
+  PostDomTree.runOnMachineFunction(MF);
+
+  // Number the blocks in reverse post-order, keyed by block number.
+  DenseMap<unsigned,unsigned> RPONumber;
+  ReversePostOrderTraversal<const MachineFunction*> RPOT(&MF);
+  unsigned NextRPO = 0;
+  for (const MachineBasicBlock *B : RPOT)
+    RPONumber[B->getNumber()] = NextRPO++;
+
+  // Functions with loops are not handled, at least for now: placing the
+  // prolog and epilog would have to take the loop structure into account.
+  // A successor whose RPO number is not greater than the block's own is
+  // treated as a back-edge.
+  for (MachineBasicBlock &B : MF) {
+    unsigned BN = RPONumber[B.getNumber()];
+    for (const MachineBasicBlock *Succ : B.successors())
+      if (RPONumber[Succ->getNumber()] <= BN)
+        return;
+  }
+
+  // Build the set of callee-saved registers, including all sub-registers.
+  BitVector CSR(Hexagon::NUM_TARGET_REGS);
+  for (const MCPhysReg *P = HRI.getCalleeSavedRegs(&MF); *P; ++P)
+    for (MCSubRegIterator Sub(*P, &HRI, true); Sub.isValid(); ++Sub)
+      CSR[*Sub] = true;
+
+  // Gather the blocks that cannot execute without a stack frame (uses or
+  // defs of callee-saved registers, calls, etc.).
+  SmallVector<MachineBasicBlock*,16> SFBlocks;
+  for (MachineBasicBlock &B : MF)
+    if (needsStackFrame(B, CSR, HRI))
+      SFBlocks.push_back(&B);
+
+  DEBUG({
+    dbgs() << "Blocks needing SF: {";
+    for (auto &B : SFBlocks)
+      dbgs() << " BB#" << B->getNumber();
+    dbgs() << " }\n";
+  });
+  // Nothing needs a frame.
+  if (SFBlocks.empty())
+    return;
+
+  // Fold the frame-needing blocks into their nearest common dominator...
+  MachineBasicBlock *DomB = SFBlocks[0];
+  for (unsigned Idx = 1, Num = SFBlocks.size(); Idx < Num; ++Idx) {
+    DomB = DomTree.findNearestCommonDominator(DomB, SFBlocks[Idx]);
+    if (!DomB)
+      break;
+  }
+  // ...and into their nearest common post-dominator.
+  MachineBasicBlock *PDomB = SFBlocks[0];
+  for (unsigned Idx = 1, Num = SFBlocks.size(); Idx < Num; ++Idx) {
+    PDomB = PostDomTree.findNearestCommonDominator(PDomB, SFBlocks[Idx]);
+    if (!PDomB)
+      break;
+  }
+  DEBUG({
+    dbgs() << "Computed dom block: BB#";
+    if (DomB) dbgs() << DomB->getNumber();
+    else dbgs() << "<null>";
+    dbgs() << ", computed pdom block: BB#";
+    if (PDomB) dbgs() << PDomB->getNumber();
+    else dbgs() << "<null>";
+    dbgs() << "\n";
+  });
+  if (!DomB || !PDomB)
+    return;
+
+  // The pair is usable only if DomB dominates PDomB and PDomB
+  // post-dominates DomB.
+  if (!DomTree.dominates(DomB, PDomB)) {
+    DEBUG(dbgs() << "Dom block does not dominate pdom block\n");
+    return;
+  }
+  if (!PostDomTree.dominates(PDomB, DomB)) {
+    DEBUG(dbgs() << "PDom block does not post-dominate dom block\n");
+    return;
+  }
+
+  // All checks passed: report the shrunk prolog/epilog blocks.
+  PrologB = DomB;
+  EpilogB = PDomB;
+}
+
+/// Performs most of the PEI work in one place:
+///  - saving/restoring of the callee-saved registers,
+///  - stack frame creation and destruction.
+/// Normally this work is spread over several functions, but doing it here
+/// makes shrink-wrapping of the stack frame possible. The MBB argument is
+/// not used; the prolog block is the entry block unless shrink-wrapping
+/// picks another one.
+void HexagonFrameLowering::emitPrologue(MachineFunction &MF,
+      MachineBasicBlock &MBB) const {
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  const std::vector<CalleeSavedInfo> &CSI =
+      MF.getFrameInfo().getCalleeSavedInfo();
+
+  // Defaults: prolog in the entry block, epilog in every return block.
+  MachineBasicBlock *PrologB = &MF.front();
+  MachineBasicBlock *EpilogB = nullptr;
+  if (EnableShrinkWrapping)
+    findShrunkPrologEpilog(MF, PrologB, EpilogB);
+
+  bool PrologueStubs = false;
+  insertCSRSpillsInBlock(*PrologB, CSI, HRI, PrologueStubs);
+  insertPrologueInBlock(*PrologB, PrologueStubs);
+  updateEntryPaths(MF, *PrologB);
+
+  if (!EpilogB) {
+    // No dedicated epilog block: restore and deallocate in each return block.
+    for (MachineBasicBlock &B : MF)
+      if (B.isReturnBlock())
+        insertCSRRestoresInBlock(B, CSI, HRI);
+
+    for (MachineBasicBlock &B : MF)
+      if (B.isReturnBlock())
+        insertEpilogueInBlock(B);
+
+    // Attach the callee-saved registers as implicit uses to every return
+    // that is not itself a restore call.
+    for (MachineBasicBlock &B : MF) {
+      if (B.empty())
+        continue;
+      MachineInstr *RetI = getReturn(B);
+      if (!RetI || isRestoreCall(RetI->getOpcode()))
+        continue;
+      for (const CalleeSavedInfo &R : CSI)
+        RetI->addOperand(MachineOperand::CreateReg(R.getReg(), false, true));
+    }
+    return;
+  }
+
+  insertCSRRestoresInBlock(*EpilogB, CSI, HRI);
+  insertEpilogueInBlock(*EpilogB);
+
+  // The epilog block may not contain a return instruction. In that case the
+  // callee-saved registers must be added as live-ins to all blocks on all
+  // paths from the epilog to any return block.
+  unsigned MaxBN = MF.getNumBlockIDs();
+  BitVector DoneT(MaxBN+1), DoneF(MaxBN+1), Path(MaxBN+1);
+  updateExitPaths(*EpilogB, *EpilogB, DoneT, DoneF, Path);
+}
+
+/// Finalizes the frame size in the frame info, expands all PS_alloca
+/// pseudos in the function, and, if the function has a frame pointer, emits
+/// the frame-allocating code at the top of MBB. PrologueStubs tells whether
+/// the callee-saved registers were spilled via a library stub; if not, and
+/// stack-overflow checking is enabled, the checker is called directly.
+void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
+      bool PrologueStubs) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto &HRI = *HST.getRegisterInfo();
+  DebugLoc dl;
+
+  unsigned MaxAlign = std::max(MFI.getMaxAlignment(), getStackAlignment());
+
+  // Total frame size = max call frame size + local area, each rounded up
+  // to the maximum alignment. Both values are written back to the frame
+  // info.
+  unsigned LocalSize = MFI.getStackSize();
+  unsigned MaxCFA = alignTo(MFI.getMaxCallFrameSize(), MaxAlign);
+  MFI.setMaxCallFrameSize(MaxCFA);
+
+  unsigned FrameSize = MaxCFA + alignTo(LocalSize, MaxAlign);
+  MFI.setStackSize(FrameSize);
+
+  // Extra realignment is needed only beyond the default stack alignment.
+  bool AlignStack = (MaxAlign > getStackAlignment());
+
+  // Re-read the values that were just stored.
+  unsigned NumBytes = MFI.getStackSize();
+  unsigned SP = HRI.getStackRegister();
+  unsigned MaxCF = MFI.getMaxCallFrameSize();
+  MachineBasicBlock::iterator InsertPt = MBB.begin();
+
+  // Collect the alloca pseudos first, then expand and erase them, so that
+  // the instruction lists are not modified while being traversed.
+  SmallVector<MachineInstr *, 4> Allocas;
+  for (MachineBasicBlock &B : MF)
+    for (MachineInstr &I : B)
+      if (I.getOpcode() == Hexagon::PS_alloca)
+        Allocas.push_back(&I);
+
+  for (MachineInstr *AI : Allocas) {
+    assert((AI->getOpcode() == Hexagon::PS_alloca) && "Expected alloca");
+    expandAlloca(AI, HII, SP, MaxCF);
+    AI->eraseFromParent();
+  }
+
+  // Without a frame pointer there is no allocframe to emit.
+  if (!hasFP(MF))
+    return;
+
+  // Frame sizes at or above this bound are not passed to allocframe
+  // directly.
+  // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used?
+  const unsigned int ALLOCFRAME_MAX = 16384;
+
+  // A dummy memory operand keeps allocframe from being treated as a
+  // volatile memory reference.
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
+                            4, 4);
+
+  if (NumBytes < ALLOCFRAME_MAX) {
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
+      .addImm(NumBytes)
+      .addMemOperand(MMO);
+  } else {
+    // Too large for the immediate: emit allocframe(#0) and adjust SP by
+    // hand.
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
+      .addImm(0)
+      .addMemOperand(MMO);
+
+    // Materialize the size in a caller-saved non-parameter register and
+    // subtract it from SP.
+    unsigned ScratchReg = HRI.getFirstCallerSavedNonParamReg();
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::CONST32),
+            ScratchReg).addImm(NumBytes);
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_sub), SP)
+      .addReg(SP)
+      .addReg(ScratchReg);
+  }
+
+  if (AlignStack) {
+    // Realign SP by masking off the low bits.
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP)
+        .addReg(SP)
+        .addImm(-int64_t(MaxAlign));
+  }
+
+  // If stack checking is enabled and the callee-saved registers were
+  // spilled inline (i.e. no spill function was used), call the stack
+  // checker directly.
+  if (EnableStackOVFSanitizer && !PrologueStubs)
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk))
+      .addExternalSymbol("__runtime_stack_check");
+}
+
+/// Emits the frame-destroying code in MBB, before its first terminator.
+/// Does nothing if the function has no frame pointer. Depending on what the
+/// block ends with, this either adds a deallocframe, replaces PS_jmpret
+/// with a combined dealloc-return, or leaves the block alone because a
+/// restore routine already deallocates the frame.
+void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
+  MachineFunction &MF = *MBB.getParent();
+  if (!hasFP(MF))
+    return;
+
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto &HRI = *HST.getRegisterInfo();
+  unsigned SP = HRI.getStackRegister();
+
+  // RetOpc is 0 when the block has no return instruction.
+  MachineInstr *RetI = getReturn(MBB);
+  unsigned RetOpc = RetI ? RetI->getOpcode() : 0;
+
+  // Take the debug location from the first terminator, or from the last
+  // instruction of a block that has no terminator.
+  MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
+  DebugLoc DL;
+  if (InsertPt != MBB.end())
+    DL = InsertPt->getDebugLoc();
+  else if (!MBB.empty())
+    DL = std::prev(MBB.end())->getDebugLoc();
+
+  switch (RetOpc) {
+    case Hexagon::EH_RETURN_JMPR:
+      // EH_RETURN: deallocate the frame, then add R28 to SP.
+      BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe));
+      BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::A2_add), SP)
+          .addReg(SP)
+          .addReg(Hexagon::R28);
+      return;
+
+    case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+    case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC:
+    case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT:
+    case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC: {
+      // RESTORE_DEALLOC_RET* already deallocates the frame, so no extra
+      // deallocframe is emitted. Everything that follows the RESTORE is
+      // deleted, except labels.
+      MachineBasicBlock::iterator It = RetI;
+      ++It;
+      while (It != MBB.end()) {
+        if (It->isLabel())
+          ++It;
+        else
+          It = MBB.erase(It);
+      }
+      return;
+    }
+
+    default:
+      break;
+  }
+
+  // The restoring code may be a call to a library function. All of the
+  // restore* functions include "deallocframe", so an extra one must not be
+  // added when the instruction just before the insertion point is such a
+  // call (or a PS_call_nr/PS_callr_nr).
+  if (!MBB.empty() && InsertPt != MBB.begin()) {
+    switch (std::prev(InsertPt)->getOpcode()) {
+      case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4:
+      case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC:
+      case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT:
+      case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC:
+      case Hexagon::PS_call_nr:
+      case Hexagon::PS_callr_nr:
+        return;
+      default:
+        break;
+    }
+  }
+
+  // If the returning instruction is PS_jmpret, replace it with
+  // dealloc_return; otherwise just add deallocframe. The function could be
+  // returning via a tail call.
+  if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) {
+    BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe));
+    return;
+  }
+  unsigned NewOpc = Hexagon::L4_return;
+  MachineInstr *DeallocRet = BuildMI(MBB, RetI, DL, HII.get(NewOpc));
+  // Carry the function live-out registers over from the old return.
+  DeallocRet->copyImplicitOps(MF, *RetI);
+  MBB.erase(RetI);
+}
+
+/// Adds every callee-saved register as a live-in to each block reached from
+/// the function entry by following successors, without walking past SaveB
+/// (SaveB itself is still updated).
+void HexagonFrameLowering::updateEntryPaths(MachineFunction &MF,
+      MachineBasicBlock &SaveB) const {
+  auto &CSI = MF.getFrameInfo().getCalleeSavedInfo();
+  const unsigned SaveN = SaveB.getNumber();
+
+  // Worklist of block numbers, seeded with the entry block.
+  SetVector<unsigned> Pending;
+  Pending.insert(MF.front().getNumber());
+
+  // Iterate by index, because the worklist grows while it is being walked.
+  for (unsigned Idx = 0; Idx < Pending.size(); ++Idx) {
+    const unsigned BN = Pending[Idx];
+    MachineBasicBlock &B = *MF.getBlockNumbered(BN);
+    for (auto &R : CSI) {
+      if (B.isLiveIn(R.getReg()))
+        continue;
+      B.addLiveIn(R.getReg());
+    }
+    // Do not continue past the block that holds the saves.
+    if (BN == SaveN)
+      continue;
+    for (MachineBasicBlock *Succ : B.successors())
+      Pending.insert(Succ->getNumber());
+  }
+}
+
+/// Depth-first walk over the successors of MBB that marks the callee-saved
+/// registers as live-in in every block (other than RestoreB) from which a
+/// return is reachable, and adds them as implicit uses to the returns that
+/// are found. Returns true if a return is reachable from MBB.
+///
+/// DoneT/DoneF remember blocks already classified as reaching/not reaching
+/// an exit; Path holds the blocks on the current DFS path.
+bool HexagonFrameLowering::updateExitPaths(MachineBasicBlock &MBB,
+      MachineBasicBlock &RestoreB, BitVector &DoneT, BitVector &DoneF,
+      BitVector &Path) const {
+  assert(MBB.getNumber() >= 0);
+  const unsigned BN = MBB.getNumber();
+  // A block on the current path, or one known not to reach an exit,
+  // contributes nothing.
+  if (Path[BN] || DoneF[BN])
+    return false;
+  if (DoneT[BN])
+    return true;
+
+  auto &CSI = MBB.getParent()->getFrameInfo().getCalleeSavedInfo();
+
+  Path[BN] = true;
+  bool FoundExit = false;
+  // Every successor is visited, even after an exit has been found.
+  for (MachineBasicBlock *Succ : MBB.successors())
+    FoundExit |= updateExitPaths(*Succ, RestoreB, DoneT, DoneF, Path);
+
+  if (!MBB.empty() && MBB.back().isReturn()) {
+    // Add implicit uses of all callee-saved registers to the reached
+    // return instruction. This keeps the anti-dependency breaker from
+    // renaming these registers.
+    MachineInstr &RetI = MBB.back();
+    if (!isRestoreCall(RetI.getOpcode())) {
+      for (auto &R : CSI)
+        RetI.addOperand(MachineOperand::CreateReg(R.getReg(), false, true));
+    }
+    FoundExit = true;
+  }
+
+  if (!FoundExit) {
+    DoneF[BN] = true;
+  } else if (&MBB != &RestoreB) {
+    // The restore block gets no live-ins: the callee-saved registers are
+    // defined in it, so its entry cannot lie on a path from those
+    // definitions to an exit.
+    for (auto &R : CSI) {
+      if (MBB.isLiveIn(R.getReg()))
+        continue;
+      MBB.addLiveIn(R.getReg());
+    }
+    DoneT[BN] = true;
+  }
+
+  Path[BN] = false;
+  return FoundExit;
+}
+
+/// Finds the place in B where the CFI instructions belong, or None if B
+/// has no allocframe.
+///
+/// CFI instructions go right after allocframe. The exception is an
+/// allocframe bundled with a call: then they go before the packet holding
+/// allocframe+call, in case the call throws an exception.
+static Optional<MachineBasicBlock::iterator>
+findCFILocation(MachineBasicBlock &B) {
+  const auto InstrEnd = B.instr_end();
+
+  for (auto It = B.begin(), E = B.end(); It != E; ++It) {
+    if (It->isBundle()) {
+      // Scan the instructions inside the bundle.
+      bool SawCall = false, SawAllocFrame = false;
+      for (auto In = std::next(It.getInstrIterator());
+           In != InstrEnd && In->isBundled(); ++In) {
+        if (In->getOpcode() == Hexagon::S2_allocframe)
+          SawAllocFrame = true;
+        else if (In->isCall())
+          SawCall = true;
+      }
+      if (SawAllocFrame)
+        return SawCall ? It : std::next(It);
+      continue;
+    }
+    // Stand-alone instruction.
+    if (It->getOpcode() == Hexagon::S2_allocframe)
+      return std::next(It);
+  }
+  return None;
+}
+
+/// Inserts CFI instructions into every block of MF for which
+/// findCFILocation reports an insertion point.
+void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const {
+  for (MachineBasicBlock &B : MF) {
+    if (auto At = findCFILocation(B))
+      insertCFIInstructionsAt(B, *At);
+  }
+}
+
+/// Emits the CFI instructions for the function containing MBB in front of
+/// the position At: the CFA definition and the FP/LR save locations (when
+/// the function has a frame pointer), followed by one .cfi_offset for each
+/// register from a fixed list that appears in the callee-saved info.
+void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator At) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto &HRI = *HST.getRegisterInfo();
+
+  // The debug location is left empty on purpose. If CFI instructions
+  // carry debug information, the final assembly generation places
+  // prolog_end in the wrong location.
+  DebugLoc DL;
+  const MCInstrDesc &CFID = HII.get(TargetOpcode::CFI_INSTRUCTION);
+
+  MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+  bool HasFP = hasFP(MF);
+
+  // Emits a single ".cfi_offset DwReg, Off" in front of At.
+  auto EmitCFIOffset = [&] (unsigned DwReg, int64_t Off) {
+    auto Inst = MCCFIInstruction::createOffset(FrameLabel, DwReg, Off);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MF.addFrameInst(Inst));
+  };
+
+  if (HasFP) {
+    unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true);
+    unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true);
+
+    // Define CFA via an offset from the value of FP.
+    //
+    //  -8   -4    0 (SP)
+    // --+----+----+---------------------
+    //   | FP | LR |          increasing addresses -->
+    // --+----+----+---------------------
+    //   |         +-- Old SP (before allocframe)
+    //   +-- New FP (after allocframe)
+    //
+    // MCCFIInstruction::createDefCfa subtracts the offset from the register.
+    // MCCFIInstruction::createOffset takes the offset without sign change.
+    auto DefCfa = MCCFIInstruction::createDefCfa(FrameLabel, DwFPReg, -8);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MF.addFrameInst(DefCfa));
+    // R31 (return addr) = CFA - 4
+    EmitCFIOffset(DwRAReg, -4);
+    // R30 (frame ptr) = CFA - 8
+    EmitCFIOffset(DwFPReg, -8);
+  }
+
+  // Registers to describe, in emission order; NoRegister ends the list.
+  static unsigned int RegsToMove[] = {
+    Hexagon::R1,  Hexagon::R0,  Hexagon::R3,  Hexagon::R2,
+    Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18,
+    Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22,
+    Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26,
+    Hexagon::D0,  Hexagon::D1,  Hexagon::D8,  Hexagon::D9,
+    Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13,
+    Hexagon::NoRegister
+  };
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  for (const unsigned *Next = RegsToMove; *Next != Hexagon::NoRegister;
+       ++Next) {
+    const unsigned Reg = *Next;
+    // Registers that were not saved get no CFI entry.
+    auto F = find_if(CSI, [Reg] (const CalleeSavedInfo &C) -> bool {
+      return C.getReg() == Reg;
+    });
+    if (F == CSI.end())
+      continue;
+
+    int64_t Offset;
+    if (HasFP) {
+      // With a frame pointer (i.e. an allocframe), the CFA has been
+      // defined in terms of FP, so the offsets in the CFI instructions
+      // below have to be relative to FP, which points to the bottom of
+      // the stack frame. getFrameIndexReference can still choose SP for
+      // the offset calculation, so it cannot simply be called here.
+      // Take the FP-relative offset directly instead.
+      Offset = MFI.getObjectOffset(F->getFrameIdx());
+    } else {
+      unsigned FrameReg;
+      Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg);
+    }
+    // Subtract 8 to make room for R30 and R31, which are added above.
+    Offset -= 8;
+
+    const bool IsDoubleReg = Reg >= Hexagon::D0 && Reg <= Hexagon::D15;
+    if (!IsDoubleReg) {
+      EmitCFIOffset(HRI.getDwarfRegNum(Reg, true), Offset);
+      continue;
+    }
+
+    // Split the double register into its halves and emit a cfi_offset for
+    // each. The only reason for splitting is that llvm-mc does not
+    // understand paired registers in cfi_offset,
+    // e.g. .cfi_offset r1:0, -64
+    unsigned HiReg = HRI.getSubReg(Reg, Hexagon::isub_hi);
+    unsigned LoReg = HRI.getSubReg(Reg, Hexagon::isub_lo);
+    unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true);
+    unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true);
+    EmitCFIOffset(HiDwarfReg, Offset+4);
+    EmitCFIOffset(LoDwarfReg, Offset);
+  }
+}
+
+/// Returns true if MF needs a frame pointer (i.e. an allocframe).
+bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
+  // Insert ALLOCFRAME at -O0 for the debugger. This should not be
+  // required, but gcc does it and gdb cannot break at the start of the
+  // function without it. To be removed if this turns out to be a gdb bug.
+  if (MF.getTarget().getOptLevel() == CodeGenOpt::None)
+    return true;
+
+  auto &MFI = MF.getFrameInfo();
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
+  const bool HasFixed = MFI.getNumFixedObjects() != 0;
+  const bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI)
+                              .getLocalFrameObjectCount() != 0;
+  const bool HasExtraAlign = HRI.needsStackRealignment(MF);
+  const bool HasAlloca = MFI.hasVarSizedObjects();
+
+  // SP is the default base, since it is always there; FP requires some
+  // setup (i.e. ALLOCFRAME). Fixed and preallocated objects need FP when
+  // their distance to SP is unknown, as is the case with alloca or aligna.
+  if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign))
+    return true;
+
+  // A non-empty frame gets an allocframe when stack checking or the
+  // UseAllocframe flag asks for it.
+  if (MFI.getStackSize() > 0 && (EnableStackOVFSanitizer || UseAllocframe))
+    return true;
+
+  // Functions that make calls or otherwise clobber LR need it as well.
+  return MFI.hasCalls() ||
+         MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR();
+}
+
+enum SpillKind { // Selects the family of library routine names returned by getSpillFunctionFor.
+ SK_ToMem, // "__save_r16_through_rN" routines (optionally the "_stkchk" variants).
+ SK_FromMem, // "__restore_r16_through_rN_and_deallocframe" routines.
+ SK_FromMemTailcall // "__restore_r16_through_rN_and_deallocframe_before_tailcall" routines.
+};
+
+/// Returns the name of the library routine that saves or restores the
+/// callee-saved registers r16 through MaxReg.
+///
+/// MaxReg must be one of R17, R19, R21, R23, R25 or R27. SpillType selects
+/// between the save, restore, and restore-before-tailcall routines. Stkchk
+/// selects the stack-checking variant and only matters for SK_ToMem.
+static const char *getSpillFunctionFor(unsigned MaxReg, SpillKind SpillType,
+      bool Stkchk = false) {
+  // Each table is indexed by the position of MaxReg in the sequence
+  // R17, R19, ..., R27.
+  const char * SaveNames[] = {
+    "__save_r16_through_r17",
+    "__save_r16_through_r19",
+    "__save_r16_through_r21",
+    "__save_r16_through_r23",
+    "__save_r16_through_r25",
+    "__save_r16_through_r27" };
+
+  const char * SaveStkchkNames[] = {
+    "__save_r16_through_r17_stkchk",
+    "__save_r16_through_r19_stkchk",
+    "__save_r16_through_r21_stkchk",
+    "__save_r16_through_r23_stkchk",
+    "__save_r16_through_r25_stkchk",
+    "__save_r16_through_r27_stkchk" };
+
+  const char * RestoreNames[] = {
+    "__restore_r16_through_r17_and_deallocframe",
+    "__restore_r16_through_r19_and_deallocframe",
+    "__restore_r16_through_r21_and_deallocframe",
+    "__restore_r16_through_r23_and_deallocframe",
+    "__restore_r16_through_r25_and_deallocframe",
+    "__restore_r16_through_r27_and_deallocframe" };
+
+  const char * RestoreTailcallNames[] = {
+    "__restore_r16_through_r17_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r19_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r21_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r23_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r25_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r27_and_deallocframe_before_tailcall"
+  };
+
+  // Pick the table for the requested kind.
+  const char **Table = nullptr;
+  switch (SpillType) {
+    case SK_ToMem:
+      Table = Stkchk ? SaveStkchkNames : SaveNames;
+      break;
+    case SK_FromMem:
+      Table = RestoreNames;
+      break;
+    case SK_FromMemTailcall:
+      Table = RestoreTailcallNames;
+      break;
+  }
+  assert(Table && "Unknown spill kind");
+
+  // The routines cover all callee-saved registers up to the highest
+  // register used.
+  unsigned Idx;
+  switch (MaxReg) {
+    case Hexagon::R17: Idx = 0; break;
+    case Hexagon::R19: Idx = 1; break;
+    case Hexagon::R21: Idx = 2; break;
+    case Hexagon::R23: Idx = 3; break;
+    case Hexagon::R25: Idx = 4; break;
+    case Hexagon::R27: Idx = 5; break;
+    default:
+      llvm_unreachable("Unhandled maximum callee save register");
+  }
+  return Table[Idx];
+}
+
+/// Computes how the stack object FI is addressed: stores the base register
+/// (SP, FP, or the aligned-stack base AP) in FrameReg and returns the
+/// offset from that register.
+int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+      int FI, unsigned &FrameReg) const {
+  auto &MFI = MF.getFrameInfo();
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+
+  int Offset = MFI.getObjectOffset(FI);
+  const bool HasAlloca = MFI.hasVarSizedObjects();
+  const bool HasExtraAlign = HRI.needsStackRealignment(MF);
+  const bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None;
+
+  unsigned SP = HRI.getStackRegister();
+  unsigned FP = HRI.getFrameRegister();
+  unsigned AP = HMFI.getStackAlignBasePhysReg();
+  unsigned FrameSize = MFI.getStackSize();
+
+  // The default base is SP. At -O0 it is FP, unless there are objects with
+  // extra alignment: the additional alignment may cause a pad to be
+  // inserted, which makes it impossible to reach objects located past the
+  // pad through FP.
+  bool UseFP = NoOpt && !HasExtraAlign;
+  bool UseAP = false;
+  if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) {
+    // Fixed and preallocated objects are located before any padding, so
+    // they must be accessed through FP.
+    if (HasAlloca || HasExtraAlign)
+      UseFP = true;
+  } else if (HasAlloca) {
+    // Other objects in a function with alloca: AP if the stack is also
+    // realigned, FP otherwise.
+    if (HasExtraAlign)
+      UseAP = true;
+    else
+      UseFP = true;
+  }
+
+  // If FP was picked, then there had better be FP.
+  const bool HasFP = hasFP(MF);
+  assert((HasFP || !UseFP) && "This function must have frame pointer");
+
+  // Having FP implies allocframe, which stores an extra 8 bytes: FP/LR. If
+  // the base register is used to reach an object across these 8 bytes, the
+  // offset has to be adjusted by 8.
+  //
+  // After allocframe:
+  //                                    HexagonISelLowering adds 8 to ---+
+  //                                    the offsets of all stack-based   |
+  //                                    arguments (*)                    |
+  //                                                                     |
+  //   getObjectOffset < 0      0     8  getObjectOffset >= 8 <----------+
+  // ------------------------+-----+------------------------> increasing
+  //     <local objects>     |FP/LR|    <input arguments>     addresses
+  // -----------------+------+-----+------------------------>
+  //                  |      |
+  //    SP/AP point --+      +-- FP points here (**)
+  //    somewhere on
+  //    this side of FP/LR
+  //
+  // (*)  See LowerFormalArguments. The FP/LR is assumed to be present.
+  // (**) *FP == old-FP. FP+0..7 are the bytes of FP/LR.
+
+  // The lowering assumes that FP/LR is present, so the offsets of the
+  // formal arguments start at 8. Without FP/LR the offset has to be
+  // reduced by 8.
+  if (Offset > 0 && !HasFP)
+    Offset -= 8;
+
+  // FP takes precedence over AP, and AP over SP.
+  FrameReg = UseFP ? FP : (UseAP ? AP : SP);
+
+  // Compute the offset that goes into the instruction. Without FP (in
+  // other words, without allocframe) SP is not adjusted (there is no
+  // SP -= FrameSize), so the frame size is added only for SP-relative
+  // accesses in functions that do have FP.
+  if (UseFP || UseAP || !HasFP)
+    return Offset;
+  return FrameSize+Offset;
+}
+
+/// Inserts the code that saves the callee-saved registers in CSI at the
+/// top of MBB: either one call to a save library routine, or an individual
+/// store for each register. PrologueStubs is set to true when the library
+/// routine was used (it is left untouched if CSI is empty). Always returns
+/// true.
+bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB,
+      const CSIVect &CSI, const HexagonRegisterInfo &HRI,
+      bool &PrologueStubs) const {
+  if (CSI.empty())
+    return true;
+
+  MachineBasicBlock::iterator InsertPt = MBB.begin();
+  PrologueStubs = false;
+  MachineFunction &MF = *MBB.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+
+  if (!useSpillFunction(MF, CSI)) {
+    // Spill each register inline.
+    for (const CalleeSavedInfo &Info : CSI) {
+      unsigned Reg = Info.getReg();
+      // The eh_return callee-saved registers r0 - r3 are treated
+      // specially. They are not really callee-saved registers, as they are
+      // not supposed to be killed; they are not added as live-ins either.
+      bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg);
+      int FI = Info.getFrameIdx();
+      const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg);
+      HII.storeRegToStackSlot(MBB, InsertPt, Reg, IsKill, FI, RC, &HRI);
+      if (IsKill)
+        MBB.addLiveIn(Reg);
+    }
+    return true;
+  }
+
+  // Use a library routine to save the registers.
+  PrologueStubs = true;
+  unsigned MaxReg = getMaxCalleeSavedReg(CSI, HRI);
+  bool StkOvrFlowEnabled = EnableStackOVFSanitizer;
+  const char *SpillFun = getSpillFunctionFor(MaxReg, SK_ToMem,
+                                             StkOvrFlowEnabled);
+  auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget());
+  bool IsPIC = HTM.isPositionIndependent();
+  bool LongCalls = HST.useLongCalls() || EnableSaveRestoreLong;
+
+  DebugLoc DL = InsertPt != MBB.end() ? InsertPt->getDebugLoc() : DebugLoc();
+
+  // Choose the call pseudo by stack checking, call range, and PIC.
+  unsigned SpillOpc;
+  if (StkOvrFlowEnabled) {
+    SpillOpc = LongCalls
+        ? (IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4STK_EXT_PIC
+                 : Hexagon::SAVE_REGISTERS_CALL_V4STK_EXT)
+        : (IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4STK_PIC
+                 : Hexagon::SAVE_REGISTERS_CALL_V4STK);
+  } else {
+    SpillOpc = LongCalls
+        ? (IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4_EXT_PIC
+                 : Hexagon::SAVE_REGISTERS_CALL_V4_EXT)
+        : (IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4_PIC
+                 : Hexagon::SAVE_REGISTERS_CALL_V4);
+  }
+
+  MachineInstr *SaveRegsCall =
+      BuildMI(MBB, InsertPt, DL, HII.get(SpillOpc))
+        .addExternalSymbol(SpillFun);
+
+  // The call uses the callee-saved registers...
+  addCalleeSaveRegistersAsImpOperand(SaveRegsCall, CSI, false, true);
+  // ...so they have to be live on entry to the block.
+  for (const CalleeSavedInfo &Info : CSI)
+    MBB.addLiveIn(Info.getReg());
+  return true;
+}
+
+/// Inserts the code that restores the callee-saved registers in CSI before
+/// the first terminator of MBB: either one call to a restore library
+/// routine (which also deallocates the frame), or an individual load for
+/// each register. Returns false if CSI is empty, true otherwise.
+bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
+      const CSIVect &CSI, const HexagonRegisterInfo &HRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineBasicBlock::iterator MI = MBB.getFirstTerminator();
+  MachineFunction &MF = *MBB.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+
+  if (useRestoreFunction(MF, CSI)) {
+    // A block without a return is handled like a tail-call block: the
+    // restore routine must come back to the caller instead of returning
+    // for it.
+    bool HasTC = hasTailCall(MBB) || !hasReturn(MBB);
+    unsigned MaxR = getMaxCalleeSavedReg(CSI, HRI);
+    SpillKind Kind = HasTC ? SK_FromMemTailcall : SK_FromMem;
+    const char *RestoreFn = getSpillFunctionFor(MaxR, Kind);
+    auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget());
+    bool IsPIC = HTM.isPositionIndependent();
+    bool LongCalls = HST.useLongCalls() || EnableSaveRestoreLong;
+
+    // Take the debug location from the first terminator, or else from the
+    // last non-debug instruction. The latter may not exist (empty or
+    // debug-only block); the location is then left empty rather than
+    // dereferencing the end iterator.
+    DebugLoc DL;
+    if (MI != MBB.end()) {
+      DL = MI->getDebugLoc();
+    } else {
+      MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr();
+      if (Last != MBB.end())
+        DL = Last->getDebugLoc();
+    }
+    MachineInstr *DeallocCall = nullptr;
+
+    if (HasTC) {
+      unsigned RetOpc;
+      if (LongCalls)
+        RetOpc = IsPIC ? Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC
+                       : Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT;
+      else
+        RetOpc = IsPIC ? Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC
+                       : Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4;
+      DeallocCall = BuildMI(MBB, MI, DL, HII.get(RetOpc))
+          .addExternalSymbol(RestoreFn);
+    } else {
+      // The block has a return, and it is expected to be the only
+      // terminator.
+      MachineBasicBlock::iterator It = MBB.getFirstTerminator();
+      assert(It->isReturn() && std::next(It) == MBB.end());
+      unsigned RetOpc;
+      if (LongCalls)
+        RetOpc = IsPIC ? Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC
+                       : Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT;
+      else
+        RetOpc = IsPIC ? Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC
+                       : Hexagon::RESTORE_DEALLOC_RET_JMP_V4;
+      DeallocCall = BuildMI(MBB, It, DL, HII.get(RetOpc))
+          .addExternalSymbol(RestoreFn);
+      // Transfer the function live-out registers.
+      DeallocCall->copyImplicitOps(MF, *It);
+    }
+    addCalleeSaveRegistersAsImpOperand(DeallocCall, CSI, true, false);
+    return true;
+  }
+
+  // Restore each register inline.
+  for (unsigned i = 0; i < CSI.size(); ++i) {
+    unsigned Reg = CSI[i].getReg();
+    const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg);
+    int FI = CSI[i].getFrameIdx();
+    HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI);
+  }
+
+  return true;
+}
+
+/// Removes the call-frame pseudo instruction at I (ADJCALLSTACKDOWN or
+/// ADJCALLSTACKUP) from MBB and returns the iterator produced by the erase.
+MachineBasicBlock::iterator HexagonFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const unsigned Opc = I->getOpcode();
+  (void)Opc; // Only used by the assertion below.
+  assert((Opc == Hexagon::ADJCALLSTACKDOWN || Opc == Hexagon::ADJCALLSTACKUP) &&
+         "Cannot handle this call frame pseudo instruction");
+  return MBB.erase(I);
+}
+
+/// For functions that have both variable-sized stack objects and a stack
+/// alignment above the default, maps every live spill slot to a fixed
+/// position in the local frame block and records the aligned-stack base
+/// register. Does nothing for other functions. RS is not used.
+void HexagonFrameLowering::processFunctionBeforeFrameFinalized(
+    MachineFunction &MF, RegScavenger *RS) const {
+  // If this function uses an aligned stack and also has variable-sized
+  // stack objects, then all spill slots need to be mapped to fixed
+  // positions, so that they can be accessed through FP. Otherwise they
+  // would have to be accessed via AP, which may not be available at the
+  // particular place in the program.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  bool HasAlloca = MFI.hasVarSizedObjects();
+  bool NeedsAlign = (MFI.getMaxAlignment() > getStackAlignment());
+
+  if (!HasAlloca || !NeedsAlign)
+    return;
+
+  unsigned LFS = MFI.getLocalFrameSize();
+  for (int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+    if (!MFI.isSpillSlotObjectIndex(i) || MFI.isDeadObjectIndex(i))
+      continue;
+    unsigned S = MFI.getObjectSize(i);
+    // The slot's recorded alignment is reduced to 8, which will require
+    // unaligned vector stores if they happen here. The slot's position is
+    // still computed with its original alignment (but no less than 8).
+    unsigned A = std::max(MFI.getObjectAlignment(i), 8U);
+    MFI.setObjectAlignment(i, 8);
+    LFS = alignTo(LFS+S, A);
+    // LFS is unsigned, so "-LFS" would wrap around to a large positive
+    // value before being passed on. Convert to a signed type first so that
+    // the mapped offset is really negative.
+    MFI.mapLocalFrameObject(i, -static_cast<int64_t>(LFS));
+  }
+
+  MFI.setLocalFrameSize(LFS);
+  unsigned A = MFI.getLocalFrameMaxAlign();
+  assert(A <= 8 && "Unexpected local frame alignment");
+  if (A == 0)
+    MFI.setLocalFrameMaxAlign(8);
+  MFI.setUseLocalStackAllocationBlock(true);
+
+  // Set the physical aligned-stack base address register (0 if the
+  // function has no aligna instruction).
+  unsigned AP = 0;
+  if (const MachineInstr *AI = getAlignaInstr(MF))
+    AP = AI->getOperand(0).getReg();
+  auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+  HMFI.setStackAlignBasePhysReg(AP);
+}
+
+/// Returns true if there are no caller-saved registers available in class RC,
+/// i.e. every caller-saved register of RC (or one of its aliases) is already
+/// used in MF.
+static bool needToReserveScavengingSpillSlots(MachineFunction &MF,
+      const HexagonRegisterInfo &HRI, const TargetRegisterClass *RC) {
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Walk the zero-terminated list of caller-saved registers in RC looking
+  // for one that is still free. Callee-saved registers have become pristine
+  // by now.
+  for (const MCPhysReg *CSR = HRI.getCallerSavedRegs(&MF, RC); *CSR; ++CSR) {
+    // A register is considered used when any register it aliases is used.
+    bool AnyAliasUsed = false;
+    for (MCRegAliasIterator Alias(*CSR, &HRI, true); Alias.isValid();
+         ++Alias) {
+      if (!RegInfo.isPhysRegUsed(*Alias))
+        continue;
+      AnyAliasUsed = true;
+      break;
+    }
+    if (!AnyAliasUsed)
+      return false;
+  }
+
+  // Nothing was free: all caller-saved registers are used.
+  return true;
+}
+
+#ifndef NDEBUG
+/// Print the registers whose bits are set in Regs to dbgs() as a
+/// brace-enclosed, space-separated list.
+static void dump_registers(BitVector &Regs, const TargetRegisterInfo &TRI) {
+  dbgs() << '{';
+  int Bit = Regs.find_first();
+  while (Bit >= 0) {
+    unsigned RegNo = Bit;
+    dbgs() << ' ' << PrintReg(RegNo, &TRI);
+    Bit = Regs.find_next(Bit);
+  }
+  dbgs() << " }";
+}
+#endif
+
+bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
+      const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const {
+  DEBUG(dbgs() << __func__ << " on "
+               << MF.getFunction()->getName() << '\n');
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  BitVector SRegs(Hexagon::NUM_TARGET_REGS);
+
+  // Compute SRegs, the set of registers that will actually be spilled. The
+  // goal is a set of callee-saved registers in which every member is maximal
+  // with respect to the sub-/super-register relation: for each R in SRegs, no
+  // proper super-register of R is in SRegs as well.
+
+  // (1) Start from every callee-saved register together with all of its
+  //     sub-registers.
+  DEBUG(dbgs() << "Initial CS registers: {");
+  for (CalleeSavedInfo &Info : CSI) {
+    unsigned CSReg = Info.getReg();
+    DEBUG(dbgs() << ' ' << PrintReg(CSReg, TRI));
+    for (MCSubRegIterator Sub(CSReg, TRI, true); Sub.isValid(); ++Sub)
+      SRegs.set(*Sub);
+  }
+  DEBUG(dbgs() << " }\n");
+  DEBUG(dbgs() << "SRegs.1: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+
+  // (2) Take out every reserved register, along with each of its
+  //     super-registers.
+  BitVector Reserved = TRI->getReservedRegs(MF);
+  for (int Bit = Reserved.find_first(); Bit >= 0;
+       Bit = Reserved.find_next(Bit)) {
+    unsigned ResReg = Bit;
+    for (MCSuperRegIterator Sup(ResReg, TRI, true); Sup.isValid(); ++Sup)
+      SRegs.reset(*Sup);
+  }
+  DEBUG(dbgs() << "Res:     "; dump_registers(Reserved, *TRI); dbgs() << "\n");
+  DEBUG(dbgs() << "SRegs.2: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+
+  // (3) Find the registers that have at least one sub-register in SRegs and
+  //     no reserved sub-register. These are the candidates for being saved
+  //     as a whole instead of piecewise. (Saving R17:16 instead of R16 is
+  //     fine, but only if R17 was not reserved.)
+  BitVector TmpSup(Hexagon::NUM_TARGET_REGS);
+  for (int Bit = SRegs.find_first(); Bit >= 0; Bit = SRegs.find_next(Bit)) {
+    unsigned SubReg = Bit;
+    for (MCSuperRegIterator Sup(SubReg, TRI); Sup.isValid(); ++Sup)
+      TmpSup.set(*Sup);
+  }
+  for (int Bit = TmpSup.find_first(); Bit >= 0; Bit = TmpSup.find_next(Bit)) {
+    unsigned SupReg = Bit;
+    for (MCSubRegIterator Sub(SupReg, TRI, true); Sub.isValid(); ++Sub) {
+      if (Reserved.test(*Sub)) {
+        TmpSup.reset(SupReg);
+        break;
+      }
+    }
+  }
+  DEBUG(dbgs() << "TmpSup:  "; dump_registers(TmpSup, *TRI); dbgs() << "\n");
+
+  // (4) Merge the candidate super-registers from (3) into SRegs.
+  SRegs |= TmpSup;
+  DEBUG(dbgs() << "SRegs.4: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+
+  // (5) Drop every register that has a super-register in SRegs, so that only
+  //     the maximal registers remain.
+  for (int Bit = SRegs.find_first(); Bit >= 0; Bit = SRegs.find_next(Bit)) {
+    unsigned SubReg = Bit;
+    for (MCSuperRegIterator Sup(SubReg, TRI); Sup.isValid(); ++Sup) {
+      if (SRegs.test(*Sup)) {
+        SRegs.reset(SubReg);
+        break;
+      }
+    }
+  }
+  DEBUG(dbgs() << "SRegs.5: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+
+  // The incoming list is rebuilt from scratch. First handle the registers
+  // that have a fixed stack slot: create the stack object for each of them.
+  CSI.clear();
+
+  using SpillSlot = TargetFrameLowering::SpillSlot;
+  unsigned NumFixed;
+  int MinOffset = 0;  // CS offsets are negative.
+  const SpillSlot *FixedSlots = getCalleeSavedSpillSlots(NumFixed);
+  for (unsigned Idx = 0; Idx != NumFixed; ++Idx) {
+    const SpillSlot &Slot = FixedSlots[Idx];
+    if (!SRegs.test(Slot.Reg))
+      continue;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Slot.Reg);
+    int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), Slot.Offset);
+    MinOffset = std::min(MinOffset, Slot.Offset);
+    CSI.push_back(CalleeSavedInfo(Slot.Reg, FI));
+    SRegs.reset(Slot.Reg);
+  }
+
+  // Whatever is still in SRegs has no fixed slot. (For example, R0-R3 need
+  // to be stored in functions with exception handling.) Each such register
+  // gets a stack object placed below the lowest offset assigned so far.
+  for (int Bit = SRegs.find_first(); Bit >= 0; Bit = SRegs.find_next(Bit)) {
+    unsigned Reg = Bit;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    int Off = MinOffset - RC->getSize();
+    unsigned Align = std::min(RC->getAlignment(), getStackAlignment());
+    assert(isPowerOf2_32(Align));
+    // Round the (negative) offset down to a multiple of Align.
+    Off &= -Align;
+    int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), Off);
+    MinOffset = std::min(MinOffset, Off);
+    CSI.push_back(CalleeSavedInfo(Reg, FI));
+    SRegs.reset(Reg);
+  }
+
+  DEBUG({
+    dbgs() << "CS information: {";
+    for (CalleeSavedInfo &Info : CSI) {
+      int FI = Info.getFrameIdx();
+      int Off = MFI.getObjectOffset(FI);
+      dbgs() << ' ' << PrintReg(Info.getReg(), TRI) << ":fi#" << FI << ":sp";
+      if (Off >= 0)
+        dbgs() << '+';
+      dbgs() << Off;
+    }
+    dbgs() << " }\n";
+  });
+
+#ifndef NDEBUG
+  // Sanity check: every register must have been given a slot by now.
+  bool MissedReg = false;
+  int Left = SRegs.find_first();
+  while (Left >= 0) {
+    unsigned Reg = Left;
+    dbgs() << PrintReg(Reg, TRI) << ' ';
+    MissedReg = true;
+    Left = SRegs.find_next(Left);
+  }
+  if (MissedReg)
+    llvm_unreachable("...there are unhandled callee-saved registers!");
+#endif
+
+  return true;
+}
+
+bool HexagonFrameLowering::expandCopy(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  // Rewrite a COPY between two modifier registers as a pair of COPYs that go
+  // through a fresh integer virtual register. Returns false, leaving the
+  // instruction untouched, when either register is not a modifier register.
+  MachineInstr &Copy = *It;
+  unsigned DstR = Copy.getOperand(0).getReg();
+  unsigned SrcR = Copy.getOperand(1).getReg();
+  bool ModToMod = Hexagon::ModRegsRegClass.contains(DstR) &&
+                  Hexagon::ModRegsRegClass.contains(SrcR);
+  if (!ModToMod)
+    return false;
+
+  DebugLoc DL = Copy.getDebugLoc();
+  unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+
+  // TmpR = COPY SrcR
+  BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR)
+    .addOperand(Copy.getOperand(1));
+  // DstR = COPY TmpR
+  BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), DstR)
+    .addReg(TmpR, RegState::Kill);
+
+  // Report the new virtual register to the caller and drop the old COPY.
+  NewRegs.push_back(TmpR);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandStoreInt(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  // Expand a predicate/modifier register spill pseudo into a transfer to an
+  // integer register followed by a word store. Only the frame-index form of
+  // the pseudo is handled.
+  MachineInstr &Spill = *It;
+  const MachineOperand &SlotOp = Spill.getOperand(0);
+  if (!SlotOp.isFI())
+    return false;
+
+  const MachineOperand &ValOp = Spill.getOperand(2);
+  DebugLoc DL = Spill.getDebugLoc();
+  unsigned SrcR = ValOp.getReg();
+  bool IsKill = ValOp.isKill();
+  int FI = SlotOp.getIndex();
+
+  // TmpR = C2_tfrpr SrcR   if SrcR is a predicate register
+  // TmpR = A2_tfrcrr SrcR  if SrcR is a modifier register
+  unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  unsigned TfrOpc;
+  if (Spill.getOpcode() == Hexagon::STriw_pred)
+    TfrOpc = Hexagon::C2_tfrpr;
+  else
+    TfrOpc = Hexagon::A2_tfrcrr;
+  BuildMI(B, It, DL, HII.get(TfrOpc), TmpR)
+    .addReg(SrcR, getKillRegState(IsKill));
+
+  // S2_storeri_io FI, 0, TmpR
+  // The memory operands of the pseudo are carried over to the real store.
+  BuildMI(B, It, DL, HII.get(Hexagon::S2_storeri_io))
+    .addFrameIndex(FI)
+    .addImm(0)
+    .addReg(TmpR, RegState::Kill)
+    .setMemRefs(Spill.memoperands_begin(), Spill.memoperands_end());
+
+  NewRegs.push_back(TmpR);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandLoadInt(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  // Expand a predicate/modifier register reload pseudo into a word load into
+  // an integer register followed by a transfer to the destination. Only the
+  // frame-index form of the pseudo is handled.
+  MachineInstr &Reload = *It;
+  const MachineOperand &SlotOp = Reload.getOperand(1);
+  if (!SlotOp.isFI())
+    return false;
+
+  DebugLoc DL = Reload.getDebugLoc();
+  unsigned DstR = Reload.getOperand(0).getReg();
+  int FI = SlotOp.getIndex();
+
+  // TmpR = L2_loadri_io FI, 0
+  // The memory operands of the pseudo are carried over to the real load.
+  unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  BuildMI(B, It, DL, HII.get(Hexagon::L2_loadri_io), TmpR)
+    .addFrameIndex(FI)
+    .addImm(0)
+    .setMemRefs(Reload.memoperands_begin(), Reload.memoperands_end());
+
+  // DstR = C2_tfrrp TmpR   if DstR is a predicate register
+  // DstR = A2_tfrrcr TmpR  if DstR is a modifier register
+  unsigned TfrOpc;
+  if (Reload.getOpcode() == Hexagon::LDriw_pred)
+    TfrOpc = Hexagon::C2_tfrrp;
+  else
+    TfrOpc = Hexagon::A2_tfrrcr;
+  BuildMI(B, It, DL, HII.get(TfrOpc), DstR)
+    .addReg(TmpR, RegState::Kill);
+
+  NewRegs.push_back(TmpR);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  // Expand a vector predicate spill pseudo: move the predicate into a
+  // general vector register and store that register instead. Only the
+  // frame-index form of the pseudo is handled.
+  auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>();
+  MachineInstr &Spill = *It;
+  const MachineOperand &SlotOp = Spill.getOperand(0);
+  if (!SlotOp.isFI())
+    return false;
+
+  const MachineOperand &ValOp = Spill.getOperand(2);
+  DebugLoc DL = Spill.getDebugLoc();
+  unsigned SrcR = ValOp.getReg();
+  bool IsKill = ValOp.isKill();
+  int FI = SlotOp.getIndex();
+
+  // Pick the vector register class and the "and" opcode by HVX mode.
+  const TargetRegisterClass *RC;
+  unsigned VandOpc;
+  if (HST.useHVXDblOps()) {
+    RC = &Hexagon::VectorRegs128BRegClass;
+    VandOpc = Hexagon::V6_vandqrt_128B;
+  } else {
+    RC = &Hexagon::VectorRegsRegClass;
+    VandOpc = Hexagon::V6_vandqrt;
+  }
+
+  // Insert transfer to general vector register.
+  //   TmpR0 = A2_tfrsi 0x01010101
+  //   TmpR1 = V6_vandqrt Qx, TmpR0
+  //   store FI, 0, TmpR1
+  unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  unsigned TmpR1 = MRI.createVirtualRegister(RC);
+
+  BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0)
+    .addImm(0x01010101);
+
+  BuildMI(B, It, DL, HII.get(VandOpc), TmpR1)
+    .addReg(SrcR, getKillRegState(IsKill))
+    .addReg(TmpR0, RegState::Kill);
+
+  // Emit the vector store right before the pseudo, then expand that store
+  // (the instruction immediately preceding It) in turn.
+  auto *HRI = HST.getRegisterInfo();
+  HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, HRI);
+  expandStoreVec(B, std::prev(It), MRI, HII, NewRegs);
+
+  NewRegs.push_back(TmpR0);
+  NewRegs.push_back(TmpR1);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  // Expand a vector predicate reload pseudo: load the slot into a general
+  // vector register and convert that back into the predicate. Only the
+  // frame-index form of the pseudo is handled.
+  auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>();
+  MachineInstr &Reload = *It;
+  const MachineOperand &SlotOp = Reload.getOperand(1);
+  if (!SlotOp.isFI())
+    return false;
+
+  DebugLoc DL = Reload.getDebugLoc();
+  unsigned DstR = Reload.getOperand(0).getReg();
+  int FI = SlotOp.getIndex();
+
+  // Pick the vector register class and the "and" opcode by HVX mode.
+  const TargetRegisterClass *RC;
+  unsigned VandOpc;
+  if (HST.useHVXDblOps()) {
+    RC = &Hexagon::VectorRegs128BRegClass;
+    VandOpc = Hexagon::V6_vandvrt_128B;
+  } else {
+    RC = &Hexagon::VectorRegsRegClass;
+    VandOpc = Hexagon::V6_vandvrt;
+  }
+
+  //   TmpR0 = A2_tfrsi 0x01010101
+  //   TmpR1 = load FI, 0
+  //   DstR = V6_vandvrt TmpR1, TmpR0
+  unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  unsigned TmpR1 = MRI.createVirtualRegister(RC);
+
+  BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0)
+    .addImm(0x01010101);
+
+  // Emit the vector load right before the pseudo, then expand that load
+  // (the instruction immediately preceding It) in turn.
+  auto *HRI = HST.getRegisterInfo();
+  HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, HRI);
+  expandLoadVec(B, std::prev(It), MRI, HII, NewRegs);
+
+  BuildMI(B, It, DL, HII.get(VandOpc), DstR)
+    .addReg(TmpR1, RegState::Kill)
+    .addReg(TmpR0, RegState::Kill);
+
+  NewRegs.push_back(TmpR0);
+  NewRegs.push_back(TmpR1);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  // Split a double-vector spill pseudo into separate stores of its low and
+  // high halves. Only the frame-index form of the pseudo is handled.
+  MachineFunction &MF = *B.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &MFI = MF.getFrameInfo();
+  auto &HRI = *HST.getRegisterInfo();
+  MachineInstr &Spill = *It;
+  const MachineOperand &SlotOp = Spill.getOperand(0);
+  if (!SlotOp.isFI())
+    return false;
+
+  // It is possible that the double vector being stored is only partially
+  // defined. From the point of view of the liveness tracking, it is ok to
+  // store it as a whole, but if we break it up we may end up storing a
+  // register that is entirely undefined. To avoid that, compute which
+  // physical registers are live just before the spill.
+  LivePhysRegs LPR(&HRI);
+  LPR.addLiveIns(B);
+  SmallVector<std::pair<unsigned, const MachineOperand*>,2> Clobbers;
+  for (auto Pos = B.begin(); Pos != It; ++Pos)
+    LPR.stepForward(*Pos, Clobbers);
+
+  const MachineOperand &ValOp = Spill.getOperand(2);
+  DebugLoc DL = Spill.getDebugLoc();
+  unsigned SrcR = ValOp.getReg();
+  unsigned SrcLo = HRI.getSubReg(SrcR, Hexagon::vsub_lo);
+  unsigned SrcHi = HRI.getSubReg(SrcR, Hexagon::vsub_hi);
+  bool IsKill = ValOp.isKill();
+  int FI = SlotOp.getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  const TargetRegisterClass *RC = Is128B ? &Hexagon::VectorRegs128BRegClass
+                                         : &Hexagon::VectorRegsRegClass;
+  unsigned Size = RC->getSize();
+  unsigned NeedAlign = RC->getAlignment();
+  unsigned HasAlign = MFI.getObjectAlignment(FI);
+
+  // Use the aligned store when the address is known to be aligned enough,
+  // and the unaligned store otherwise.
+  auto pickStoreOpc = [Is128B, NeedAlign] (unsigned KnownAlign) -> unsigned {
+    if (NeedAlign <= KnownAlign)
+      return Is128B ? Hexagon::V6_vS32b_ai_128B : Hexagon::V6_vS32b_ai;
+    return Is128B ? Hexagon::V6_vS32Ub_ai_128B : Hexagon::V6_vS32Ub_ai;
+  };
+
+  // Store low part (at offset 0), but only if it is live.
+  if (LPR.contains(SrcLo)) {
+    BuildMI(B, It, DL, HII.get(pickStoreOpc(HasAlign)))
+      .addFrameIndex(FI)
+      .addImm(0)
+      .addReg(SrcLo, getKillRegState(IsKill))
+      .setMemRefs(Spill.memoperands_begin(), Spill.memoperands_end());
+  }
+
+  // Store high part (at offset Size), but only if it is live.
+  if (LPR.contains(SrcHi)) {
+    BuildMI(B, It, DL, HII.get(pickStoreOpc(MinAlign(HasAlign, Size))))
+      .addFrameIndex(FI)
+      .addImm(Size)
+      .addReg(SrcHi, getKillRegState(IsKill))
+      .setMemRefs(Spill.memoperands_begin(), Spill.memoperands_end());
+  }
+
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  // Split a double-vector reload pseudo into separate loads of its low and
+  // high halves. Only the frame-index form of the pseudo is handled.
+  MachineFunction &MF = *B.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &MFI = MF.getFrameInfo();
+  auto &HRI = *HST.getRegisterInfo();
+  MachineInstr &Reload = *It;
+  const MachineOperand &SlotOp = Reload.getOperand(1);
+  if (!SlotOp.isFI())
+    return false;
+
+  DebugLoc DL = Reload.getDebugLoc();
+  unsigned DstR = Reload.getOperand(0).getReg();
+  unsigned DstHi = HRI.getSubReg(DstR, Hexagon::vsub_hi);
+  unsigned DstLo = HRI.getSubReg(DstR, Hexagon::vsub_lo);
+  int FI = SlotOp.getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  const TargetRegisterClass *RC = Is128B ? &Hexagon::VectorRegs128BRegClass
+                                         : &Hexagon::VectorRegsRegClass;
+  unsigned Size = RC->getSize();
+  unsigned NeedAlign = RC->getAlignment();
+  unsigned HasAlign = MFI.getObjectAlignment(FI);
+
+  // Use the aligned load when the address is known to be aligned enough,
+  // and the unaligned load otherwise.
+  auto pickLoadOpc = [Is128B, NeedAlign] (unsigned KnownAlign) -> unsigned {
+    if (NeedAlign <= KnownAlign)
+      return Is128B ? Hexagon::V6_vL32b_ai_128B : Hexagon::V6_vL32b_ai;
+    return Is128B ? Hexagon::V6_vL32Ub_ai_128B : Hexagon::V6_vL32Ub_ai;
+  };
+
+  // Load low part (from offset 0).
+  BuildMI(B, It, DL, HII.get(pickLoadOpc(HasAlign)), DstLo)
+    .addFrameIndex(FI)
+    .addImm(0)
+    .setMemRefs(Reload.memoperands_begin(), Reload.memoperands_end());
+
+  // Load high part (from offset Size).
+  BuildMI(B, It, DL, HII.get(pickLoadOpc(MinAlign(HasAlign, Size))), DstHi)
+    .addFrameIndex(FI)
+    .addImm(Size)
+    .setMemRefs(Reload.memoperands_begin(), Reload.memoperands_end());
+
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  // Replace a single-vector store to a frame index with the aligned or the
+  // unaligned store instruction, depending on the alignment of the slot.
+  // Only the frame-index form is handled.
+  MachineFunction &MF = *B.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &MFI = MF.getFrameInfo();
+  MachineInstr &Store = *It;
+  const MachineOperand &SlotOp = Store.getOperand(0);
+  if (!SlotOp.isFI())
+    return false;
+
+  const MachineOperand &ValOp = Store.getOperand(2);
+  DebugLoc DL = Store.getDebugLoc();
+  unsigned SrcR = ValOp.getReg();
+  bool IsKill = ValOp.isKill();
+  int FI = SlotOp.getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  const TargetRegisterClass *RC = Is128B ? &Hexagon::VectorRegs128BRegClass
+                                         : &Hexagon::VectorRegsRegClass;
+
+  // The aligned form is usable only if the slot is at least as aligned as
+  // the register class requires.
+  bool SlotIsAligned = RC->getAlignment() <= MFI.getObjectAlignment(FI);
+  unsigned StoreOpc;
+  if (SlotIsAligned)
+    StoreOpc = Is128B ? Hexagon::V6_vS32b_ai_128B : Hexagon::V6_vS32b_ai;
+  else
+    StoreOpc = Is128B ? Hexagon::V6_vS32Ub_ai_128B : Hexagon::V6_vS32Ub_ai;
+
+  BuildMI(B, It, DL, HII.get(StoreOpc))
+    .addFrameIndex(FI)
+    .addImm(0)
+    .addReg(SrcR, getKillRegState(IsKill))
+    .setMemRefs(Store.memoperands_begin(), Store.memoperands_end());
+
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  // Replace a single-vector load from a frame index with the aligned or the
+  // unaligned load instruction, depending on the alignment of the slot.
+  // Only the frame-index form is handled.
+  MachineFunction &MF = *B.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &MFI = MF.getFrameInfo();
+  MachineInstr &Load = *It;
+  const MachineOperand &SlotOp = Load.getOperand(1);
+  if (!SlotOp.isFI())
+    return false;
+
+  DebugLoc DL = Load.getDebugLoc();
+  unsigned DstR = Load.getOperand(0).getReg();
+  int FI = SlotOp.getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  const TargetRegisterClass *RC = Is128B ? &Hexagon::VectorRegs128BRegClass
+                                         : &Hexagon::VectorRegsRegClass;
+
+  // The aligned form is usable only if the slot is at least as aligned as
+  // the register class requires.
+  bool SlotIsAligned = RC->getAlignment() <= MFI.getObjectAlignment(FI);
+  unsigned LoadOpc;
+  if (SlotIsAligned)
+    LoadOpc = Is128B ? Hexagon::V6_vL32b_ai_128B : Hexagon::V6_vL32b_ai;
+  else
+    LoadOpc = Is128B ? Hexagon::V6_vL32Ub_ai_128B : Hexagon::V6_vL32Ub_ai;
+
+  BuildMI(B, It, DL, HII.get(LoadOpc), DstR)
+    .addFrameIndex(FI)
+    .addImm(0)
+    .setMemRefs(Load.memoperands_begin(), Load.memoperands_end());
+
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandSpillMacros(MachineFunction &MF,
+      SmallVectorImpl<unsigned> &NewRegs) const {
+  // Visit every instruction in the function and hand the spill/reload
+  // pseudos (and COPYs) to the matching expand* routine. Virtual registers
+  // created on the way are appended to NewRegs. Returns true if anything
+  // was expanded.
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool Changed = false;
+
+  for (MachineBasicBlock &B : MF) {
+    MachineBasicBlock::iterator I = B.begin(), E = B.end();
+    while (I != E) {
+      // An expansion erases the current instruction, so remember the
+      // successor before dispatching.
+      MachineBasicBlock::iterator Next = std::next(I);
+
+      switch (I->getOpcode()) {
+        case TargetOpcode::COPY:
+          Changed |= expandCopy(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::STriw_pred:
+        case Hexagon::STriw_mod:
+          Changed |= expandStoreInt(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::LDriw_pred:
+        case Hexagon::LDriw_mod:
+          Changed |= expandLoadInt(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::PS_vstorerq_ai:
+        case Hexagon::PS_vstorerq_ai_128B:
+          Changed |= expandStoreVecPred(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::PS_vloadrq_ai:
+        case Hexagon::PS_vloadrq_ai_128B:
+          Changed |= expandLoadVecPred(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::PS_vloadrw_ai:
+        case Hexagon::PS_vloadrwu_ai:
+        case Hexagon::PS_vloadrw_ai_128B:
+        case Hexagon::PS_vloadrwu_ai_128B:
+          Changed |= expandLoadVec2(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::PS_vstorerw_ai:
+        case Hexagon::PS_vstorerwu_ai:
+        case Hexagon::PS_vstorerw_ai_128B:
+        case Hexagon::PS_vstorerwu_ai_128B:
+          Changed |= expandStoreVec2(B, I, MRI, HII, NewRegs);
+          break;
+      }
+
+      I = Next;
+    }
+  }
+
+  return Changed;
+}
+
+void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                                BitVector &SavedRegs,
+                                                RegScavenger *RS) const {
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HRI = *HST.getRegisterInfo();
+
+  SavedRegs.resize(HRI.getNumRegs());
+
+  // A function containing __builtin_eh_return has to spill and restore all
+  // callee saved registers, so pretend that every one of them is used.
+  const bool HasEHReturn =
+      MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn();
+  if (HasEHReturn) {
+    for (const MCPhysReg *CSR = HRI.getCalleeSavedRegs(&MF); *CSR; ++CSR)
+      SavedRegs.set(*CSR);
+  }
+
+  // Replace predicate register pseudo spill code, collecting the virtual
+  // registers created in the process.
+  SmallVector<unsigned,8> NewRegs;
+  expandSpillMacros(MF, NewRegs);
+  if (OptimizeSpillSlots && !isOptNone(MF))
+    optimizeSpillSlots(MF, NewRegs);
+
+  // We need to reserve a spill slot if scavenging could potentially require
+  // spilling a scavenged register.
+  const bool MayScavenge = !NewRegs.empty() || mayOverflowFrameOffset(MF);
+  if (MayScavenge) {
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    SetVector<const TargetRegisterClass*> SpillRCs;
+    // Reserve an int register in any case, because it could be used to hold
+    // the stack offset in case it does not fit into a spill instruction.
+    SpillRCs.insert(&Hexagon::IntRegsRegClass);
+
+    for (unsigned VReg : NewRegs)
+      SpillRCs.insert(MRI.getRegClass(VReg));
+
+    for (const TargetRegisterClass *RC : SpillRCs) {
+      if (!needToReserveScavengingSpillSlots(MF, HRI, RC))
+        continue;
+      // The int class gets NumberScavengerSlots slots, any other class one.
+      unsigned NumSlots = 1;
+      if (RC == &Hexagon::IntRegsRegClass)
+        NumSlots = NumberScavengerSlots;
+      unsigned SlotSize = RC->getSize();
+      unsigned SlotAlign = RC->getAlignment();
+      for (unsigned Slot = 0; Slot < NumSlots; Slot++) {
+        int NewFI = MFI.CreateSpillStackObject(SlotSize, SlotAlign);
+        RS->addScavengingFrameIndex(NewFI);
+      }
+    }
+  }
+
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+}
+
+unsigned HexagonFrameLowering::findPhysReg(MachineFunction &MF,
+      HexagonBlockRanges::IndexRange &FIR,
+      HexagonBlockRanges::InstrIndexMap &IndexMap,
+      HexagonBlockRanges::RegToRangeMap &DeadMap,
+      const TargetRegisterClass *RC) const {
+  // Search RC, in raw allocation order, for a register all of whose
+  // sub-registers are dead over the whole range FIR. Returns the first such
+  // register, or 0 when there is none.
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  auto &MRI = MF.getRegInfo();
+
+  // True if Reg has a dead range in DeadMap that contains FIR.
+  auto deadAcrossRange = [&FIR,&DeadMap] (unsigned Reg) -> bool {
+    auto Found = DeadMap.find({Reg,0});
+    if (Found != DeadMap.end()) {
+      for (auto &DeadRange : Found->second)
+        if (DeadRange.contains(FIR))
+          return true;
+    }
+    return false;
+  };
+
+  for (unsigned Cand : RC->getRawAllocationOrder(MF)) {
+    bool AllDead = true;
+    for (auto Part : HexagonBlockRanges::expandToSubRegs({Cand,0}, MRI, HRI)) {
+      if (!deadAcrossRange(Part.Reg)) {
+        AllDead = false;
+        break;
+      }
+    }
+    if (AllDead)
+      return Cand;
+  }
+  return 0;
+}
+
+/// Try to replace spill slot traffic with register copies. For every stack
+/// slot whose accesses are all simple (non-predicated, base+immediate-offset,
+/// offset 0, consistent size and register class, not volatile), each
+/// store-to-load range inside a basic block is examined; if a physical
+/// register of the right class is dead over the whole range, the store is
+/// replaced by (or rewritten to use) a copy into that register and the loads
+/// are replaced by copies out of it.
+void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
+      SmallVectorImpl<unsigned> &VRegs) const {
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto &HRI = *HST.getRegisterInfo();
+  auto &MRI = MF.getRegInfo();
+  HexagonBlockRanges HBR(MF);
+
+  typedef std::map<MachineBasicBlock*,HexagonBlockRanges::InstrIndexMap>
+      BlockIndexMap;
+  typedef std::map<MachineBasicBlock*,HexagonBlockRanges::RangeList>
+      BlockRangeMap;
+  typedef HexagonBlockRanges::IndexType IndexType;
+
+  // Per-slot summary: the store/load ranges in each block, the access size,
+  // and the common register class of all values stored/loaded.
+  struct SlotInfo {
+    BlockRangeMap Map;
+    unsigned Size = 0;
+    const TargetRegisterClass *RC = nullptr;
+
+    SlotInfo() = default;
+  };
+
+  BlockIndexMap BlockIndexes;
+  // Frame indexes that were found unsuitable for this optimization.
+  SmallSet<int,4> BadFIs;
+  std::map<int,SlotInfo> FIRangeMap;
+
+  // Accumulate register classes: get a common class for a pre-existing
+  // class HaveRC and a new class NewRC. Return nullptr if a common class
+  // cannot be found, otherwise return the resulting class. If HaveRC is
+  // nullptr, assume that it is still unset.
+  auto getCommonRC = [&HRI] (const TargetRegisterClass *HaveRC,
+                             const TargetRegisterClass *NewRC)
+        -> const TargetRegisterClass* {
+    if (HaveRC == nullptr || HaveRC == NewRC)
+      return NewRC;
+    // Different classes, both non-null. Pick the more general one.
+    if (HaveRC->hasSubClassEq(NewRC))
+      return HaveRC;
+    if (NewRC->hasSubClassEq(HaveRC))
+      return NewRC;
+    return nullptr;
+  };
+
+  // Scan all blocks in the function. Check all occurrences of frame indexes,
+  // and collect relevant information.
+  for (auto &B : MF) {
+    std::map<int,IndexType> LastStore, LastLoad;
+    // Emplace appears not to be supported in gcc 4.7.2-4.
+    //auto P = BlockIndexes.emplace(&B, HexagonBlockRanges::InstrIndexMap(B));
+    auto P = BlockIndexes.insert(
+                std::make_pair(&B, HexagonBlockRanges::InstrIndexMap(B)));
+    auto &IndexMap = P.first->second;
+    DEBUG(dbgs() << "Index map for BB#" << B.getNumber() << "\n"
+                 << IndexMap << '\n');
+
+    for (auto &In : B) {
+      int LFI, SFI;
+      bool Load = HII.isLoadFromStackSlot(In, LFI) && !HII.isPredicated(In);
+      bool Store = HII.isStoreToStackSlot(In, SFI) && !HII.isPredicated(In);
+      if (Load && Store) {
+        // If it's both a load and a store, then we won't handle it.
+        BadFIs.insert(LFI);
+        BadFIs.insert(SFI);
+        continue;
+      }
+      // Check for register classes of the register used as the source for
+      // the store, and the register used as the destination for the load.
+      // Also, only accept base+imm_offset addressing modes. Other addressing
+      // modes can have side-effects (post-increments, etc.). For stack
+      // slots they are very unlikely, so there is not much loss due to
+      // this restriction.
+      if (Load || Store) {
+        int TFI = Load ? LFI : SFI;
+        unsigned AM = HII.getAddrMode(In);
+        SlotInfo &SI = FIRangeMap[TFI];
+        bool Bad = (AM != HexagonII::BaseImmOffset);
+        if (!Bad) {
+          // If the addressing mode is ok, check the register class.
+          unsigned OpNum = Load ? 0 : 2;
+          auto *RC = HII.getRegClass(In.getDesc(), OpNum, &HRI, MF);
+          RC = getCommonRC(SI.RC, RC);
+          if (RC == nullptr)
+            Bad = true;
+          else
+            SI.RC = RC;
+        }
+        if (!Bad) {
+          // Check sizes: all accesses to the slot must have the same size.
+          unsigned S = (1U << (HII.getMemAccessSize(In) - 1));
+          if (SI.Size != 0 && SI.Size != S)
+            Bad = true;
+          else
+            SI.Size = S;
+        }
+        if (!Bad) {
+          // Reject the slot if any memory operand is volatile.
+          for (auto *Mo : In.memoperands()) {
+            if (!Mo->isVolatile())
+              continue;
+            Bad = true;
+            break;
+          }
+        }
+        if (Bad)
+          BadFIs.insert(TFI);
+      }
+
+      // Locate uses of frame indices.
+      for (unsigned i = 0, n = In.getNumOperands(); i < n; ++i) {
+        const MachineOperand &Op = In.getOperand(i);
+        if (!Op.isFI())
+          continue;
+        int FI = Op.getIndex();
+        // Make sure that the following operand is an immediate and that
+        // it is 0. This is the offset in the stack object.
+        if (i+1 >= n || !In.getOperand(i+1).isImm() ||
+            In.getOperand(i+1).getImm() != 0)
+          BadFIs.insert(FI);
+        if (BadFIs.count(FI))
+          continue;
+
+        IndexType Index = IndexMap.getIndex(&In);
+        if (Load) {
+          // A load with no preceding store in this block reads the value
+          // that was in the slot on entry to the block.
+          if (LastStore[FI] == IndexType::None)
+            LastStore[FI] = IndexType::Entry;
+          LastLoad[FI] = Index;
+        } else if (Store) {
+          // A new store closes the range opened by the previous store (or
+          // by the block entry) and starts a new one.
+          HexagonBlockRanges::RangeList &RL = FIRangeMap[FI].Map[&B];
+          if (LastStore[FI] != IndexType::None)
+            RL.add(LastStore[FI], LastLoad[FI], false, false);
+          else if (LastLoad[FI] != IndexType::None)
+            RL.add(IndexType::Entry, LastLoad[FI], false, false);
+          LastLoad[FI] = IndexType::None;
+          LastStore[FI] = Index;
+        } else {
+          // Any other use of the frame index disqualifies the slot.
+          BadFIs.insert(FI);
+        }
+      }
+    }
+
+    // Close the ranges that are still open at the end of the block.
+    for (auto &I : LastLoad) {
+      IndexType LL = I.second;
+      if (LL == IndexType::None)
+        continue;
+      auto &RL = FIRangeMap[I.first].Map[&B];
+      IndexType &LS = LastStore[I.first];
+      if (LS != IndexType::None)
+        RL.add(LS, LL, false, false);
+      else
+        RL.add(IndexType::Entry, LL, false, false);
+      LS = IndexType::None;
+    }
+    for (auto &I : LastStore) {
+      IndexType LS = I.second;
+      if (LS == IndexType::None)
+        continue;
+      auto &RL = FIRangeMap[I.first].Map[&B];
+      RL.add(LS, IndexType::None, false, false);
+    }
+  }
+
+  DEBUG({
+    for (auto &P : FIRangeMap) {
+      dbgs() << "fi#" << P.first;
+      if (BadFIs.count(P.first))
+        dbgs() << " (bad)";
+      dbgs() << "  RC: ";
+      if (P.second.RC != nullptr)
+        dbgs() << HRI.getRegClassName(P.second.RC) << '\n';
+      else
+        dbgs() << "<null>\n";
+      for (auto &R : P.second.Map)
+        dbgs() << "  BB#" << R.first->getNumber() << " { " << R.second << "}\n";
+    }
+  });
+
+  // When a slot is loaded from in a block without being stored to in the
+  // same block, it is live-on-entry to this block. To avoid CFG analysis,
+  // consider this slot to be live-on-exit from all blocks.
+  SmallSet<int,4> LoxFIs;
+
+  // For each block, the (non-bad) frame indexes that have ranges in it.
+  std::map<MachineBasicBlock*,std::vector<int>> BlockFIMap;
+
+  for (auto &P : FIRangeMap) {
+    // P = pair(FI, map: BB->RangeList)
+    if (BadFIs.count(P.first))
+      continue;
+    for (auto &B : MF) {
+      auto F = P.second.Map.find(&B);
+      // F = pair(BB, RangeList)
+      if (F == P.second.Map.end() || F->second.empty())
+        continue;
+      HexagonBlockRanges::IndexRange &IR = F->second.front();
+      if (IR.start() == IndexType::Entry)
+        LoxFIs.insert(P.first);
+      BlockFIMap[&B].push_back(P.first);
+    }
+  }
+
+  DEBUG({
+    dbgs() << "Block-to-FI map (* -- live-on-exit):\n";
+    for (auto &P : BlockFIMap) {
+      auto &FIs = P.second;
+      if (FIs.empty())
+        continue;
+      dbgs() << "  BB#" << P.first->getNumber() << ": {";
+      for (auto I : FIs) {
+        dbgs() << " fi#" << I;
+        if (LoxFIs.count(I))
+          dbgs() << '*';
+      }
+      dbgs() << " }\n";
+    }
+  });
+
+#ifndef NDEBUG
+  bool HasOptLimit = SpillOptMax.getPosition();
+#endif
+
+  // eliminate loads, when all loads eliminated, eliminate all stores.
+  for (auto &B : MF) {
+    auto F = BlockIndexes.find(&B);
+    assert(F != BlockIndexes.end());
+    HexagonBlockRanges::InstrIndexMap &IM = F->second;
+    HexagonBlockRanges::RegToRangeMap LM = HBR.computeLiveMap(IM);
+    HexagonBlockRanges::RegToRangeMap DM = HBR.computeDeadMap(IM, LM);
+    DEBUG(dbgs() << "BB#" << B.getNumber() << " dead map\n"
+                 << HexagonBlockRanges::PrintRangeMap(DM, HRI));
+
+    for (auto FI : BlockFIMap[&B]) {
+      if (BadFIs.count(FI))
+        continue;
+      DEBUG(dbgs() << "Working on fi#" << FI << '\n');
+      HexagonBlockRanges::RangeList &RL = FIRangeMap[FI].Map[&B];
+      for (auto &Range : RL) {
+        DEBUG(dbgs() << "--Examining range:" << RL << '\n');
+        // Only ranges that begin and end at actual instructions (a store
+        // and a load in this block) can be rewritten.
+        if (!IndexType::isInstr(Range.start()) ||
+            !IndexType::isInstr(Range.end()))
+          continue;
+        MachineInstr &SI = *IM.getInstr(Range.start());
+        MachineInstr &EI = *IM.getInstr(Range.end());
+        assert(SI.mayStore() && "Unexpected start instruction");
+        assert(EI.mayLoad() && "Unexpected end instruction");
+        MachineOperand &SrcOp = SI.getOperand(2);
+
+        HexagonBlockRanges::RegisterRef SrcRR = { SrcOp.getReg(),
+                                                  SrcOp.getSubReg() };
+        auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI, MF);
+        // The this-> is needed to unconfuse MSVC.
+        unsigned FoundR = this->findPhysReg(MF, Range, IM, DM, RC);
+        DEBUG(dbgs() << "Replacement reg:" << PrintReg(FoundR, &HRI) << '\n');
+        if (FoundR == 0)
+          continue;
+#ifndef NDEBUG
+        if (HasOptLimit) {
+          if (SpillOptCount >= SpillOptMax)
+            return;
+          SpillOptCount++;
+        }
+#endif
+
+        // Generate the copy-in: "FoundR = COPY SrcR" at the store location.
+        MachineBasicBlock::iterator StartIt = SI.getIterator(), NextIt;
+        MachineInstr *CopyIn = nullptr;
+        if (SrcRR.Reg != FoundR || SrcRR.Sub != 0) {
+          const DebugLoc &DL = SI.getDebugLoc();
+          CopyIn = BuildMI(B, StartIt, DL, HII.get(TargetOpcode::COPY), FoundR)
+                    .addOperand(SrcOp);
+        }
+
+        ++StartIt;
+        // Check if this is a last store and the FI is live-on-exit.
+        if (LoxFIs.count(FI) && (&Range == &RL.back())) {
+          // Update store's source register.
+          if (unsigned SR = SrcOp.getSubReg())
+            SrcOp.setReg(HRI.getSubReg(FoundR, SR));
+          else
+            SrcOp.setReg(FoundR);
+          SrcOp.setSubReg(0);
+          // We are keeping this register live.
+          SrcOp.setIsKill(false);
+        } else {
+          // Update the index map while the store still exists, and only
+          // then erase it, so that no pointer to a destroyed instruction
+          // is ever handed to the map.
+          IM.replaceInstr(&SI, CopyIn);
+          B.erase(&SI);
+        }
+
+        // Replace every load of this slot inside the range with a copy
+        // (or an extension) from the replacement register.
+        auto EndIt = std::next(EI.getIterator());
+        for (auto It = StartIt; It != EndIt; It = NextIt) {
+          MachineInstr &MI = *It;
+          NextIt = std::next(It);
+          int TFI;
+          if (!HII.isLoadFromStackSlot(MI, TFI) || TFI != FI)
+            continue;
+          unsigned DstR = MI.getOperand(0).getReg();
+          assert(MI.getOperand(0).getSubReg() == 0);
+          MachineInstr *CopyOut = nullptr;
+          if (DstR != FoundR) {
+            DebugLoc DL = MI.getDebugLoc();
+            unsigned MemSize = (1U << (HII.getMemAccessSize(MI) - 1));
+            assert(HII.getAddrMode(MI) == HexagonII::BaseImmOffset);
+            // Extending loads become the corresponding byte/halfword
+            // sign- or zero-extension; everything else is a plain COPY.
+            unsigned CopyOpc = TargetOpcode::COPY;
+            if (HII.isSignExtendingLoad(MI))
+              CopyOpc = (MemSize == 1) ? Hexagon::A2_sxtb : Hexagon::A2_sxth;
+            else if (HII.isZeroExtendingLoad(MI))
+              CopyOpc = (MemSize == 1) ? Hexagon::A2_zxtb : Hexagon::A2_zxth;
+            // The replacement register is killed at the last load of the
+            // range.
+            CopyOut = BuildMI(B, It, DL, HII.get(CopyOpc), DstR)
+                        .addReg(FoundR, getKillRegState(&MI == &EI));
+          }
+          IM.replaceInstr(&MI, CopyOut);
+          B.erase(It);
+        }
+
+        // Update the dead map: the replacement register (and its
+        // sub-registers) is no longer dead over this range.
+        HexagonBlockRanges::RegisterRef FoundRR = { FoundR, 0 };
+        for (auto RR : HexagonBlockRanges::expandToSubRegs(FoundRR, MRI, HRI))
+          DM[RR].subtract(Range);
+      } // for Range in range list
+    }
+  }
+}
+
+/// Expand the pseudo instruction "Rd = alloca Rs, #A" (pointed to by AI)
+/// into real instructions inserted in front of AI. SP is the stack pointer
+/// register and CF is the constant that is finally added to Rd.
+///
+/// When Rs and Rd are different registers the emitted sequence is:
+///   Rd  = sub(r29, Rs)
+///   r29 = sub(r29, Rs)
+///   Rd  = and(Rd, #-A)    ; if necessary
+///   r29 = and(r29, #-A)   ; if necessary
+///   Rd  = add(Rd, #CF)    ; CF size aligned to at most A
+/// otherwise it is:
+///   Rd  = sub(r29, Rs)
+///   Rd  = and(Rd, #-A)    ; if necessary
+///   r29 = Rd
+///   Rd  = add(Rd, #CF)    ; CF size aligned to at most A
+void HexagonFrameLowering::expandAlloca(MachineInstr *AI,
+      const HexagonInstrInfo &HII, unsigned SP, unsigned CF) const {
+  MachineBasicBlock &Block = *AI->getParent();
+  DebugLoc Loc = AI->getDebugLoc();
+
+  const unsigned AlignImm = AI->getOperand(2).getImm();
+  const unsigned DstReg = AI->getOperand(0).getReg();
+  const unsigned SizeReg = AI->getOperand(1).getReg();
+  const bool Distinct = (SizeReg != DstReg);
+  // The masking step is only emitted for alignments above 8.
+  const bool NeedsMask = (AlignImm > 8);
+
+  // Rd = sub(r29, Rs)
+  BuildMI(Block, AI, Loc, HII.get(Hexagon::A2_sub), DstReg)
+      .addReg(SP)
+      .addReg(SizeReg);
+
+  // r29 = sub(r29, Rs)
+  if (Distinct)
+    BuildMI(Block, AI, Loc, HII.get(Hexagon::A2_sub), SP)
+        .addReg(SP)
+        .addReg(SizeReg);
+
+  if (NeedsMask) {
+    // Rd = and(Rd, #-A)
+    BuildMI(Block, AI, Loc, HII.get(Hexagon::A2_andir), DstReg)
+        .addReg(DstReg)
+        .addImm(-int64_t(AlignImm));
+    // r29 = and(r29, #-A)
+    if (Distinct)
+      BuildMI(Block, AI, Loc, HII.get(Hexagon::A2_andir), SP)
+          .addReg(SP)
+          .addImm(-int64_t(AlignImm));
+  }
+
+  // r29 = Rd
+  if (!Distinct)
+    BuildMI(Block, AI, Loc, HII.get(TargetOpcode::COPY), SP)
+        .addReg(DstReg);
+
+  // Rd = add(Rd, #CF)
+  if (CF > 0)
+    BuildMI(Block, AI, Loc, HII.get(Hexagon::A2_addi), DstReg)
+        .addReg(DstReg)
+        .addImm(CF);
+}
+
+/// Returns true when the function has variable-sized stack objects and
+/// its maximum object alignment is greater than the stack alignment.
+bool HexagonFrameLowering::needsAligna(const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  return FrameInfo.hasVarSizedObjects() &&
+         FrameInfo.getMaxAlignment() > getStackAlignment();
+}
+
+/// Walks all blocks of the function in order and returns the first
+/// PS_aligna instruction encountered, or nullptr if there is none.
+const MachineInstr *HexagonFrameLowering::getAlignaInstr(
+      const MachineFunction &MF) const {
+  for (const MachineBasicBlock &Block : MF) {
+    for (const MachineInstr &Instr : Block) {
+      if (Instr.getOpcode() != Hexagon::PS_aligna)
+        continue;
+      return &Instr;
+    }
+  }
+  return nullptr;
+}
+
+/// Adds all callee-saved registers as implicit uses or defs to the
+/// instruction.
+void HexagonFrameLowering::addCalleeSaveRegistersAsImpOperand(MachineInstr *MI,
+      const CSIVect &CSI, bool IsDef, bool IsKill) const {
+  // Append one implicit register operand per entry of CSI. Whether the
+  // operand is a def or a use, and whether it is a kill, is chosen by the
+  // caller through IsDef and IsKill.
+  for (const CalleeSavedInfo &Info : CSI) {
+    MachineOperand ImpOp = MachineOperand::CreateReg(Info.getReg(), IsDef,
+                                                     /*isImp=*/true, IsKill);
+    MI->addOperand(ImpOp);
+  }
+}
+
+/// Determine whether the callee-saved register saves and restores should
+/// be generated via inline code. If this function returns "true", inline
+/// code will be generated. If this function returns "false", additional
+/// checks are performed, which may still lead to the inline code.
+bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF,
+      const CSIVect &CSI) const {
+  if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn())
+    return true;
+
+  // When not optimizing for size, anything above the default optimization
+  // level gets inline code.
+  const bool ForSize = isOptSize(MF) || isMinSize(MF);
+  if (!ForSize && MF.getTarget().getOptLevel() > CodeGenOpt::Default)
+    return true;
+
+  // Check if CSI only has double registers, and if the registers form
+  // a contiguous block starting from D8.
+  BitVector Saved(Hexagon::NUM_TARGET_REGS);
+  for (const CalleeSavedInfo &Info : CSI) {
+    unsigned Reg = Info.getReg();
+    if (!Hexagon::DoubleRegsRegClass.contains(Reg))
+      return true;
+    Saved.set(Reg);
+  }
+
+  int Prev = Saved.find_first();
+  if (Prev != Hexagon::D8)
+    return true;
+  // Every following set bit must be the immediate successor of the
+  // previous one; any gap means the block is not contiguous.
+  for (int Next = Saved.find_next(Prev); Next >= 0;
+       Next = Saved.find_next(Prev)) {
+    if (Next != Prev+1)
+      return true;
+    Prev = Next;
+  }
+
+  return false;
+}
+
+/// Returns true when inline code is not mandated by shouldInlineCSR, there
+/// is more than one callee-saved register, and their number exceeds the
+/// applicable threshold (SpillFuncThresholdOs when optimizing for size,
+/// SpillFuncThreshold otherwise).
+bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF,
+      const CSIVect &CSI) const {
+  if (shouldInlineCSR(MF, CSI))
+    return false;
+
+  const unsigned Count = CSI.size();
+  if (Count < 2)
+    return false;
+
+  unsigned Limit = isOptSize(MF) ? SpillFuncThresholdOs
+                                 : SpillFuncThreshold;
+  return Count > Limit;
+}
+
+/// Counterpart of useSpillFunction for restores. Returns false whenever
+/// shouldInlineCSR demands inline code, true unconditionally at minimum
+/// size, and otherwise compares the number of callee-saved registers with
+/// the applicable threshold.
+bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF,
+      const CSIVect &CSI) const {
+  if (shouldInlineCSR(MF, CSI))
+    return false;
+
+  // The restore functions do a bit more than just restoring registers.
+  // The non-returning versions will go back directly to the caller's
+  // caller, others will clean up the stack frame in preparation for
+  // a tail call. Using them can still save code size even if only one
+  // register is getting restored. Make the decision based on -Oz:
+  // using -Os will use inline restore for a single register.
+  if (isMinSize(MF))
+    return true;
+
+  const unsigned Count = CSI.size();
+  if (Count < 2)
+    return false;
+
+  unsigned Limit = isOptSize(MF) ? SpillFuncThresholdOs-1
+                                 : SpillFuncThreshold;
+  return Count > Limit;
+}
+
+/// Heuristic: returns true only when HVX operations are in use and the
+/// estimated stack size is larger than 256.
+bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const {
+  const unsigned EstimatedSize = MF.getFrameInfo().estimateStackSize(MF);
+  auto &Subtarget = MF.getSubtarget<HexagonSubtarget>();
+  // A fairly simplistic guess as to whether a potential load/store to a
+  // stack location could require an extra register. It does not account
+  // for store-immediate instructions.
+  return Subtarget.useHVXOps() && EstimatedSize > 256;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
new file mode 100644
index 000000000000..529a61d4a5b5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -0,0 +1,159 @@
+//=- HexagonFrameLowering.h - Define frame lowering for Hexagon --*- C++ -*--=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONFRAMELOWERING_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONFRAMELOWERING_H
+
+#include "Hexagon.h"
+#include "HexagonBlockRanges.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include <vector>
+
+namespace llvm {
+
+class HexagonInstrInfo;
+class HexagonRegisterInfo;
+
+class HexagonFrameLowering : public TargetFrameLowering { // Hexagon implementation of the TargetFrameLowering hooks.
+public:
+ explicit HexagonFrameLowering()
+ : TargetFrameLowering(StackGrowsDown, 8, 0, 1, true) {} // NOTE(review): 8 is presumably the stack alignment; confirm against the TargetFrameLowering constructor.
+
+ // All of the prolog/epilog functionality, including saving and restoring
+ // callee-saved registers, is handled in emitPrologue. This is to have the
+ // logic for shrink-wrapping in one place.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const
+ override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+ override {} // Intentionally empty: the epilogue is produced by emitPrologue.
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override {
+ return true; // No code is emitted by this hook; it only returns true.
+ }
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override {
+ return true; // No code is emitted by this hook; it only returns true.
+ }
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS = nullptr) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
+
+ bool targetHandlesStackFrameRounding() const override {
+ return true;
+ }
+
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+ bool hasFP(const MachineFunction &MF) const override;
+
+ const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries)
+ const override {
+ static const SpillSlot Offsets[] = { // Each row: odd register, even register, and the double register that shares the even register's offset.
+ { Hexagon::R17, -4 }, { Hexagon::R16, -8 }, { Hexagon::D8, -8 },
+ { Hexagon::R19, -12 }, { Hexagon::R18, -16 }, { Hexagon::D9, -16 },
+ { Hexagon::R21, -20 }, { Hexagon::R20, -24 }, { Hexagon::D10, -24 },
+ { Hexagon::R23, -28 }, { Hexagon::R22, -32 }, { Hexagon::D11, -32 },
+ { Hexagon::R25, -36 }, { Hexagon::R24, -40 }, { Hexagon::D12, -40 },
+ { Hexagon::R27, -44 }, { Hexagon::R26, -48 }, { Hexagon::D13, -48 }
+ };
+ NumEntries = array_lengthof(Offsets); // Out-parameter: number of rows in the table.
+ return Offsets;
+ }
+
+ bool assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI)
+ const override;
+
+ bool needsAligna(const MachineFunction &MF) const; // True if MF has variable-sized objects and its max alignment exceeds the stack alignment.
+ const MachineInstr *getAlignaInstr(const MachineFunction &MF) const; // First PS_aligna instruction in MF, or nullptr.
+
+ void insertCFIInstructions(MachineFunction &MF) const;
+
+private:
+ typedef std::vector<CalleeSavedInfo> CSIVect; // Shorthand used by the helpers below.
+
+ void expandAlloca(MachineInstr *AI, const HexagonInstrInfo &TII,
+ unsigned SP, unsigned CF) const;
+ void insertPrologueInBlock(MachineBasicBlock &MBB, bool PrologueStubs) const;
+ void insertEpilogueInBlock(MachineBasicBlock &MBB) const;
+ bool insertCSRSpillsInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
+ const HexagonRegisterInfo &HRI, bool &PrologueStubs) const;
+ bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
+ const HexagonRegisterInfo &HRI) const;
+ void updateEntryPaths(MachineFunction &MF, MachineBasicBlock &SaveB) const;
+ bool updateExitPaths(MachineBasicBlock &MBB, MachineBasicBlock &RestoreB,
+ BitVector &DoneT, BitVector &DoneF, BitVector &Path) const;
+ void insertCFIInstructionsAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator At) const;
+
+ void adjustForCalleeSavedRegsSpillCall(MachineFunction &MF) const;
+
+ bool expandCopy(MachineBasicBlock &B, MachineBasicBlock::iterator It, // The expand* helpers below all share this parameter list.
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandStoreInt(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandLoadInt(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandStoreVecPred(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandLoadVecPred(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandStoreVec2(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandLoadVec2(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandStoreVec(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandLoadVec(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandSpillMacros(MachineFunction &MF,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+
+ unsigned findPhysReg(MachineFunction &MF, HexagonBlockRanges::IndexRange &FIR,
+ HexagonBlockRanges::InstrIndexMap &IndexMap,
+ HexagonBlockRanges::RegToRangeMap &DeadMap,
+ const TargetRegisterClass *RC) const;
+ void optimizeSpillSlots(MachineFunction &MF,
+ SmallVectorImpl<unsigned> &VRegs) const;
+
+ void findShrunkPrologEpilog(MachineFunction &MF, MachineBasicBlock *&PrologB,
+ MachineBasicBlock *&EpilogB) const;
+
+ void addCalleeSaveRegistersAsImpOperand(MachineInstr *MI, const CSIVect &CSI,
+ bool IsDef, bool IsKill) const; // Adds every register in CSI to MI as an implicit use or def.
+ bool shouldInlineCSR(MachineFunction &MF, const CSIVect &CSI) const;
+ bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const;
+ bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const;
+ bool mayOverflowFrameOffset(MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
new file mode 100644
index 000000000000..bb5e379ce014
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -0,0 +1,271 @@
+//===--- HexagonGenExtract.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+
+using namespace llvm;
+
+static cl::opt<unsigned> ExtractCutoff("extract-cutoff", cl::init(~0U),
+ cl::Hidden, cl::desc("Cutoff for generating \"extract\""
+ " instructions"));
+
+// This prevents generating extract instructions that have the offset of 0.
+// One of the reasons for "extract" is to put a sequence of bits in a
+// register, starting at offset 0 (so that these bits can then be used by an
+// "insert"). If the bits are already at offset 0, it is better not to
+// generate "extract", since logical bit operations can be merged into
+// compound instructions (as opposed to "extract").
+static cl::opt<bool> NoSR0("extract-nosr0", cl::init(true), cl::Hidden,
+ cl::desc("No extract instruction with offset 0"));
+
+static cl::opt<bool> NeedAnd("extract-needand", cl::init(true), cl::Hidden,
+ cl::desc("Require & in extract patterns"));
+
+namespace llvm { // Forward declarations of this pass's entry points.
+
+ void initializeHexagonGenExtractPass(PassRegistry&); // Called from the pass constructor.
+ FunctionPass *createHexagonGenExtract(); // Factory defined at the end of this file.
+
+} // end namespace llvm
+
+namespace {
+
+  // Function pass that rewrites shift/mask expressions into calls to the
+  // Hexagon "extract" intrinsics. Blocks are visited through the dominator
+  // tree, which is why DominatorTreeWrapperPass is required (and preserved).
+  class HexagonGenExtract : public FunctionPass {
+  public:
+    static char ID;
+
+    // DT is set at the start of every runOnFunction; initialize it here as
+    // well so the object never carries an indeterminate pointer.
+    HexagonGenExtract() : FunctionPass(ID), ExtractCount(0), DT(nullptr) {
+      initializeHexagonGenExtractPass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon generate \"extract\" instructions";
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      FunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    // Processes B and, recursively, the blocks it dominates.
+    bool visitBlock(BasicBlock *B);
+    // Attempts to replace a single instruction; returns true on success.
+    bool convert(Instruction *In);
+
+    // Number of conversions counted so far (compared against the
+    // -extract-cutoff limit).
+    unsigned ExtractCount;
+    // Dominator tree of the function currently being processed.
+    DominatorTree *DT;
+  };
+
+  char HexagonGenExtract::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(HexagonGenExtract, "hextract", "Hexagon generate "
+ "\"extract\" instructions", false, false) // Pass registration begins here.
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) // Matches the addRequired<> in getAnalysisUsage.
+INITIALIZE_PASS_END(HexagonGenExtract, "hextract", "Hexagon generate "
+ "\"extract\" instructions", false, false) // Arguments repeat those of INITIALIZE_PASS_BEGIN.
+
+bool HexagonGenExtract::convert(Instruction *In) { // Tries to rewrite In as a call to an "extractu" intrinsic; returns true if In's uses were replaced.
+ using namespace PatternMatch;
+
+ Value *BF = nullptr;
+ ConstantInt *CSL = nullptr, *CSR = nullptr, *CM = nullptr; // Shift-left amount, shift-right amount, and mask.
+ BasicBlock *BB = In->getParent();
+ LLVMContext &Ctx = BB->getContext();
+ bool LogicalSR; // True when the shift right is logical (or absent), false when it is arithmetic.
+
+ // (and (shl (lshr x, #sr), #sl), #m)
+ LogicalSR = true;
+ bool Match = match(In, m_And(m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CSL)),
+ m_ConstantInt(CM)));
+
+ if (!Match) {
+ // (and (shl (ashr x, #sr), #sl), #m)
+ LogicalSR = false;
+ Match = match(In, m_And(m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CSL)),
+ m_ConstantInt(CM)));
+ }
+ if (!Match) {
+ // (and (shl x, #sl), #m)
+ LogicalSR = true;
+ CSR = ConstantInt::get(Type::getInt32Ty(Ctx), 0); // No shift right in this pattern.
+ Match = match(In, m_And(m_Shl(m_Value(BF), m_ConstantInt(CSL)),
+ m_ConstantInt(CM)));
+ if (Match && NoSR0)
+ return false; // Offset-0 extracts are suppressed by -extract-nosr0.
+ }
+ if (!Match) {
+ // (and (lshr x, #sr), #m)
+ LogicalSR = true;
+ CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0); // No shift left in this pattern.
+ Match = match(In, m_And(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CM)));
+ }
+ if (!Match) {
+ // (and (ashr x, #sr), #m)
+ LogicalSR = false;
+ CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0); // No shift left in this pattern.
+ Match = match(In, m_And(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CM)));
+ }
+ if (!Match) {
+ CM = nullptr; // The remaining patterns have no explicit mask.
+ // (shl (lshr x, #sr), #sl)
+ LogicalSR = true;
+ Match = match(In, m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CSL)));
+ }
+ if (!Match) {
+ CM = nullptr;
+ // (shl (ashr x, #sr), #sl)
+ LogicalSR = false;
+ Match = match(In, m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CSL)));
+ }
+ if (!Match)
+ return false;
+
+ Type *Ty = BF->getType();
+ if (!Ty->isIntegerTy())
+ return false;
+ unsigned BW = Ty->getPrimitiveSizeInBits();
+ if (BW != 32 && BW != 64) // Only 32- and 64-bit values are handled.
+ return false;
+
+ uint32_t SR = CSR->getZExtValue();
+ uint32_t SL = CSL->getZExtValue();
+
+ if (!CM) {
+ // If there was no and, and the shift left did not remove all potential
+ // sign bits created by the shift right, then extractu cannot reproduce
+ // this value.
+ if (!LogicalSR && (SR > SL))
+ return false;
+ APInt A = APInt(BW, ~0ULL).lshr(SR).shl(SL); // Synthesize the mask implied by the two shifts.
+ CM = ConstantInt::get(Ctx, A);
+ }
+
+ // CM is the shifted-left mask. Shift it back right to remove the zero
+ // bits on least-significant positions.
+ APInt M = CM->getValue().lshr(SL);
+ uint32_t T = M.countTrailingOnes();
+
+ // During the shifts some of the bits will be lost. Calculate how many
+ // of the original value will remain after shift right and then left.
+ uint32_t U = BW - std::max(SL, SR);
+ // The width of the extracted field is the minimum of the original bits
+ // that remain after the shifts and the number of contiguous 1s in the mask.
+ uint32_t W = std::min(U, T);
+ if (W == 0)
+ return false;
+
+ // Check if the extracted bits are contained within the mask that it is
+ // and-ed with. The extract operation will copy these bits, and so the
+ // mask cannot have any holes in it that would clear any of the bits of
+ // the extracted field.
+ if (!LogicalSR) {
+ // If the shift right was arithmetic, it could have included some 1 bits.
+ // It is still ok to generate extract, but only if the mask eliminates
+ // those bits (i.e. M does not have any bits set beyond U).
+ APInt C = APInt::getHighBitsSet(BW, BW-U);
+ if (M.intersects(C) || !APIntOps::isMask(W, M))
+ return false;
+ } else {
+ // Check if M starts with a contiguous sequence of W times 1 bits. Get
+ // the low U bits of M (which eliminates the 0 bits shifted in on the
+ // left), and check if the result is APInt's "mask":
+ if (!APIntOps::isMask(W, M.getLoBits(U)))
+ return false;
+ }
+
+ IRBuilder<> IRB(In); // New instructions are inserted before In.
+ Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu
+ : Intrinsic::hexagon_S2_extractup;
+ Module *Mod = BB->getParent()->getParent();
+ Value *ExtF = Intrinsic::getDeclaration(Mod, IntId);
+ Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)});
+ if (SL != 0)
+ NewIn = IRB.CreateShl(NewIn, SL, CSL->getName()); // Re-apply the shift left on the extracted field.
+ In->replaceAllUsesWith(NewIn); // In itself is not erased here.
+ return true;
+}
+
+/// Processes block B and every block below it in the dominator tree.
+/// Children are handled first; afterwards the instructions of B are
+/// visited from last to first and offered to convert().
+///
+/// Returns true if any instruction was converted, in B or in any of the
+/// dominated blocks. Processing stops early once the -extract-cutoff limit
+/// (if given on the command line) has been reached.
+bool HexagonGenExtract::visitBlock(BasicBlock *B) {
+  bool Changed = false;
+
+  // Depth-first, bottom-up traversal. The result of each recursive call
+  // must be accumulated: otherwise conversions made in dominated blocks
+  // would not be reported to the caller.
+  DomTreeNode *DTN = DT->getNode(B);
+  typedef GraphTraits<DomTreeNode*> GTN;
+  typedef GTN::ChildIteratorType Iter;
+  for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
+    Changed |= visitBlock((*I)->getBlock());
+
+  // Allow limiting the number of generated extracts for debugging purposes.
+  bool HasCutoff = ExtractCutoff.getPosition();
+  unsigned Cutoff = ExtractCutoff;
+
+  BasicBlock::iterator I = std::prev(B->end()), NextI, Begin = B->begin();
+  while (true) {
+    if (HasCutoff && (ExtractCount >= Cutoff))
+      return Changed;
+    bool Last = (I == Begin);
+    // Remember the predecessor before converting: convert() inserts new
+    // instructions in front of the current one, and those must be skipped.
+    if (!Last)
+      NextI = std::prev(I);
+    Instruction *In = &*I;
+    bool Done = convert(In);
+    if (HasCutoff && Done)
+      ExtractCount++;
+    Changed |= Done;
+    if (Last)
+      break;
+    I = NextI;
+  }
+  return Changed;
+}
+
+/// Pass entry point: fetches the dominator tree for F and processes the
+/// function starting at its entry block. Returns whatever visitBlock
+/// reports for the entry block, or false if the function is skipped.
+bool HexagonGenExtract::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  // Traverse the function bottom-up, to see super-expressions before their
+  // sub-expressions.
+  return visitBlock(GraphTraits<Function*>::getEntryNode(&F));
+}
+
+/// Factory: returns a newly allocated HexagonGenExtract pass.
+FunctionPass *llvm::createHexagonGenExtract() {
+  FunctionPass *Pass = new HexagonGenExtract();
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
new file mode 100644
index 000000000000..5a8e392d1275
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -0,0 +1,1601 @@
+//===--- HexagonGenInsert.cpp ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexinsert"
+
+#include "BitTracker.h"
+#include "HexagonBitTracker.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<unsigned> VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg# cutoff for insert generation."));
+// The distance cutoff is selected based on the precheckin-perf results:
+// cutoffs 20, 25, 35, and 40 are worse than 30.
+static cl::opt<unsigned> VRegDistCutoff("insert-dist-cutoff", cl::init(30U),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg distance cutoff for insert "
+ "generation."));
+
+static cl::opt<bool> OptTiming("insert-timing", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore, cl::desc("Enable timing of insert generation"));
+static cl::opt<bool> OptTimingDetail("insert-timing-detail", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Enable detailed timing of insert "
+ "generation"));
+
+static cl::opt<bool> OptSelectAll0("insert-all0", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore); // NOTE(review): no cl::desc is given; the option's purpose is not documented here.
+static cl::opt<bool> OptSelectHas0("insert-has0", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore); // NOTE(review): no cl::desc is given; the option's purpose is not documented here.
+// Whether to construct constant values via "insert". Could eliminate constant
+// extenders, but is often not practical.
+static cl::opt<bool> OptConst("insert-const", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore);
+
+// The preprocessor gets confused when the DEBUG macro is passed larger
+// chunks of code. Use this function to detect debugging instead: it
+// returns true only in builds without NDEBUG, and only when debug output
+// is enabled for this file's DEBUG_TYPE.
+inline static bool isDebug() {
+  bool Enabled = false;
+#ifndef NDEBUG
+  Enabled = DebugFlag && isCurrentDebugType(DEBUG_TYPE);
+#endif
+  return Enabled;
+}
+
+namespace {
+
+ // Set of virtual registers, based on BitVector. Registers are mapped to
+ // bit positions through their virtual-register index.
+ struct RegisterSet : private BitVector {
+ RegisterSet() = default;
+ explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {}
+
+ using BitVector::clear;
+
+ unsigned find_first() const { // Returns 0 when the set is empty.
+ int First = BitVector::find_first();
+ if (First < 0)
+ return 0;
+ return x2v(First);
+ }
+
+ unsigned find_next(unsigned Prev) const { // Returns 0 when there is no element after Prev.
+ int Next = BitVector::find_next(v2x(Prev));
+ if (Next < 0)
+ return 0;
+ return x2v(Next);
+ }
+
+ RegisterSet &insert(unsigned R) {
+ unsigned Idx = v2x(R);
+ ensure(Idx); // Grow the bit vector first if R does not fit yet.
+ return static_cast<RegisterSet&>(BitVector::set(Idx));
+ }
+ RegisterSet &remove(unsigned R) {
+ unsigned Idx = v2x(R);
+ if (Idx >= size()) // R lies beyond the current size, so it cannot be a member.
+ return *this;
+ return static_cast<RegisterSet&>(BitVector::reset(Idx));
+ }
+
+ RegisterSet &insert(const RegisterSet &Rs) { // Set union.
+ return static_cast<RegisterSet&>(BitVector::operator|=(Rs));
+ }
+ RegisterSet &remove(const RegisterSet &Rs) { // Set difference.
+ return static_cast<RegisterSet&>(BitVector::reset(Rs));
+ }
+
+ reference operator[](unsigned R) { // Non-const access grows the set on demand.
+ unsigned Idx = v2x(R);
+ ensure(Idx);
+ return BitVector::operator[](Idx);
+ }
+ bool operator[](unsigned R) const { // Const access asserts that R is within the current size.
+ unsigned Idx = v2x(R);
+ assert(Idx < size());
+ return BitVector::operator[](Idx);
+ }
+ bool has(unsigned R) const { // Like the const operator[], but returns false for out-of-range R.
+ unsigned Idx = v2x(R);
+ if (Idx >= size())
+ return false;
+ return BitVector::test(Idx);
+ }
+
+ bool empty() const {
+ return !BitVector::any();
+ }
+ bool includes(const RegisterSet &Rs) const {
+ // A.BitVector::test(B) <=> A-B != {}
+ return !Rs.BitVector::test(*this);
+ }
+ bool intersects(const RegisterSet &Rs) const {
+ return BitVector::anyCommon(Rs);
+ }
+
+ private:
+ void ensure(unsigned Idx) { // Makes position Idx valid; the size never ends up below 32 after a resize.
+ if (size() <= Idx)
+ resize(std::max(Idx+1, 32U));
+ }
+
+ static inline unsigned v2x(unsigned v) { // Virtual register -> bit index.
+ return TargetRegisterInfo::virtReg2Index(v);
+ }
+
+ static inline unsigned x2v(unsigned x) { // Bit index -> virtual register.
+ return TargetRegisterInfo::index2VirtReg(x);
+ }
+ };
+
+ struct PrintRegSet { // Helper for streaming a RegisterSet to a raw_ostream via operator<<.
+ PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI)
+ : RS(S), TRI(RI) {}
+
+ friend raw_ostream &operator<< (raw_ostream &OS,
+ const PrintRegSet &P);
+
+ private:
+ const RegisterSet &RS; // Held by reference: the set must outlive this object.
+ const TargetRegisterInfo *TRI;
+ };
+
+  // Prints the set as "{ <reg> <reg> ... }", each register being formatted
+  // with PrintReg. Iteration ends when find_next returns 0.
+  raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) {
+    OS << '{';
+    unsigned Reg = P.RS.find_first();
+    while (Reg != 0) {
+      OS << ' ' << PrintReg(Reg, P.TRI);
+      Reg = P.RS.find_next(Reg);
+    }
+    return OS << " }";
+  }
+
+ // A convenience class to associate unsigned numbers (such as virtual
+ // registers) with unsigned numbers. It is a thin wrapper over DenseMap.
+ struct UnsignedMap : public DenseMap<unsigned,unsigned> {
+ UnsignedMap() = default;
+
+ private:
+ typedef DenseMap<unsigned,unsigned> BaseType; // Not referenced within this struct.
+ };
+
+  // A utility to establish an ordering between virtual registers:
+  // VRegA < VRegB  <=>  RegisterOrdering[VRegA] < RegisterOrdering[VRegB]
+  // It serves as a cache for an ordering that comes from a potentially
+  // expensive comparison function, or from a procedure that should not be
+  // repeated each time two registers are compared.
+  struct RegisterOrdering : public UnsignedMap {
+    RegisterOrdering() = default;
+
+    // Read-only lookup; the register must already be present in the map.
+    unsigned operator[](unsigned VR) const {
+      const_iterator Pos = find(VR);
+      assert(Pos != end());
+      return Pos->second;
+    }
+
+    // Call operator, so that objects of this class can be used as
+    // comparators in std::sort et al.
+    bool operator() (unsigned VR1, unsigned VR2) const {
+      const unsigned Ord1 = (*this)[VR1];
+      const unsigned Ord2 = (*this)[VR2];
+      return Ord1 < Ord2;
+    }
+  };
+
+ // Ordering of bit values. This class does not have operator[], but
+ // it supplies a comparison operator() for use in std:: algorithms.
+ // The order is as follows:
+ // - 0 < 1 < ref
+ // - ref1 < ref2, if ord(ref1.Reg) < ord(ref2.Reg),
+ // or ord(ref1.Reg) == ord(ref2.Reg), and ref1.Pos < ref2.Pos.
+ struct BitValueOrdering {
+ BitValueOrdering(const RegisterOrdering &RB) : BaseOrd(RB) {}
+
+ bool operator() (const BitTracker::BitValue &V1,
+ const BitTracker::BitValue &V2) const;
+
+ const RegisterOrdering &BaseOrd; // Register ordering used to compare refs; held by reference.
+ };
+
+} // end anonymous namespace
+
+// Strict "less than" on bit values: 0 comes first, then 1, then refs.
+// Refs are ordered by the base ordering of their registers, and by bit
+// position when the registers have the same order index.
+bool BitValueOrdering::operator() (const BitTracker::BitValue &V1,
+      const BitTracker::BitValue &V2) const {
+  if (V1 == V2)
+    return false;
+
+  // A constant 0 is smaller than anything else.
+  if (V1.is(0))
+    return true;
+  if (V2.is(0))
+    return false;
+
+  // Neither value is 0, and they differ: a constant 1 is smaller than
+  // any ref.
+  if (V2.is(1))
+    return false;
+  if (V1.is(1))
+    return true;
+
+  // Both values are refs.
+  const unsigned Ord1 = BaseOrd[V1.RefI.Reg];
+  const unsigned Ord2 = BaseOrd[V2.RefI.Reg];
+  if (Ord1 != Ord2)
+    return Ord1 < Ord2;
+
+  // Same register order: the positions must differ, since V1 != V2.
+  assert(V1.RefI.Pos != V2.RefI.Pos && "Bit values should be different");
+  return V1.RefI.Pos < V2.RefI.Pos;
+}
+
+namespace {
+
+  // Cache for the BitTracker's cell map. The result of BT.lookup is
+  // memoized per virtual register (indexed by virtual-register index), so
+  // that repeated lookups of the same cell do not go through the map again.
+  struct CellMapShadow {
+    CellMapShadow(const BitTracker &T) : BT(T) {}
+
+    const BitTracker::RegisterCell &lookup(unsigned VR) {
+      const unsigned Slot = TargetRegisterInfo::virtReg2Index(VR);
+      // Grow the vector as needed, to no fewer than 32 elements.
+      if (CVect.size() <= Slot) {
+        const unsigned NewSize = std::max(Slot+16, 32U);
+        CVect.resize(NewSize, nullptr);
+      }
+      // A null entry means the cell has not been looked up yet.
+      const BitTracker::RegisterCell *&Cached = CVect[Slot];
+      if (!Cached)
+        Cached = &BT.lookup(VR);
+      return *Cached;
+    }
+
+    const BitTracker &BT;
+
+  private:
+    typedef std::vector<const BitTracker::RegisterCell*> CellVectType;
+    CellVectType CVect;
+  };
+
+ // Comparator class for lexicographic ordering of virtual registers
+ // according to the corresponding BitTracker::RegisterCell objects.
+ struct RegisterCellLexCompare {
+ RegisterCellLexCompare(const BitValueOrdering &BO, CellMapShadow &M)
+ : BitOrd(BO), CM(M) {}
+
+ bool operator() (unsigned VR1, unsigned VR2) const;
+
+ private:
+ const BitValueOrdering &BitOrd; // Ordering applied to individual bit values.
+ CellMapShadow &CM; // Non-const: lookups populate the cache.
+ };
+
+ // Comparator class for lexicographic ordering of virtual registers
+ // according to the specified bits of the corresponding BitTracker::
+ // RegisterCell objects.
+ // Specifically, this class will be used to compare bit B of a register
+ // cell for a selected virtual register R with bit N of any register
+ // other than R.
+ struct RegisterCellBitCompareSel {
+ RegisterCellBitCompareSel(unsigned R, unsigned B, unsigned N,
+ const BitValueOrdering &BO, CellMapShadow &M)
+ : SelR(R), SelB(B), BitN(N), BitOrd(BO), CM(M) {}
+
+ bool operator() (unsigned VR1, unsigned VR2) const;
+
+ private:
+ const unsigned SelR, SelB; // The selected register R and its bit B.
+ const unsigned BitN; // Bit N, used for every register other than SelR.
+ const BitValueOrdering &BitOrd;
+ CellMapShadow &CM;
+ };
+
+} // end anonymous namespace
+
+bool RegisterCellLexCompare::operator() (unsigned VR1, unsigned VR2) const {
+  // The ordering combines the ordering of register cells with the base
+  // ordering of the registers themselves. R1 < R2 when
+  //  - cell(R1) < cell(R2), or
+  //  - cell(R1) == cell(R2) and base-index(R1) < base-index(R2).
+  // Cells are compared bit by bit starting at index 0 (the most significant
+  // position for this purpose); if one cell is a prefix of the other, the
+  // narrower one is the smaller.
+  if (VR1 == VR2)
+    return false;
+
+  const BitTracker::RegisterCell &CellA = CM.lookup(VR1);
+  const BitTracker::RegisterCell &CellB = CM.lookup(VR2);
+  const uint16_t WidthA = CellA.width(), WidthB = CellB.width();
+  const uint16_t Common = std::min(WidthA, WidthB);
+
+  // The first differing bit within the common width decides.
+  for (uint16_t Bit = 0; Bit != Common; ++Bit) {
+    const BitTracker::BitValue &BA = CellA[Bit], &BB = CellB[Bit];
+    if (BA != BB)
+      return BitOrd(BA, BB);
+  }
+
+  // No difference in the common part: fall back to the widths, and for
+  // equal widths to the base ordering of the two registers.
+  if (WidthA != WidthB)
+    return WidthA < WidthB;
+  return BitOrd.BaseOrd[VR1] < BitOrd.BaseOrd[VR2];
+}
+
+bool RegisterCellBitCompareSel::operator() (unsigned VR1, unsigned VR2) const {
+  // Compare one bit of VR1 with one bit of VR2. The bit taken from a
+  // register is SelB if the register is SelR, and BitN otherwise.
+  // Returns true when the bit of VR1 orders before the bit of VR2.
+  if (VR1 == VR2)
+    return false;
+  const BitTracker::RegisterCell &RC1 = CM.lookup(VR1);
+  const BitTracker::RegisterCell &RC2 = CM.lookup(VR2);
+  uint16_t W1 = RC1.width(), W2 = RC2.width();
+  uint16_t Bit1 = (VR1 == SelR) ? SelB : BitN;
+  uint16_t Bit2 = (VR2 == SelR) ? SelB : BitN;
+  // If Bit1 exceeds the width of VR1, then:
+  // - return false, if at the same time Bit2 exceeds VR2, or
+  // - return true, otherwise.
+  // (I.e. "a bit value that does not exist is less than any bit value
+  // that does exist".)
+  if (W1 <= Bit1)
+    return Bit2 < W2;
+  // If Bit1 is within VR1, but Bit2 is not within VR2, return false.
+  if (W2 <= Bit2)
+    return false;
+
+  // Bind both bit values by reference. (Without the second '&' the
+  // declarator list would make V2 a copy of the bit value.)
+  const BitTracker::BitValue &V1 = RC1[Bit1], &V2 = RC2[Bit2];
+  if (V1 != V2)
+    return BitOrd(V1, V2);
+  // Equal bit values: neither register is less than the other.
+  return false;
+}
+
+namespace {
+
+ // Sequence of virtual registers that insert() and remove() position with
+ // a binary search using the register ordering supplied at construction.
+ class OrderedRegisterList {
+   using ListType = std::vector<unsigned>;
+
+ public:
+   using iterator = ListType::iterator;
+   using const_iterator = ListType::const_iterator;
+
+   OrderedRegisterList(const RegisterOrdering &RO) : Ord(RO) {}
+
+   void insert(unsigned VR);
+   void remove(unsigned VR);
+
+   // Number of registers currently on the list.
+   unsigned size() const {
+     return Seq.size();
+   }
+
+   // Register stored at position Idx; Idx must be less than size().
+   unsigned operator[](unsigned Idx) const {
+     assert(Idx < Seq.size());
+     return Seq[Idx];
+   }
+
+   iterator begin() { return Seq.begin(); }
+   iterator end() { return Seq.end(); }
+   const_iterator begin() const { return Seq.begin(); }
+   const_iterator end() const { return Seq.end(); }
+
+   // Position of the element designated by It, counted from the front.
+   unsigned idx(iterator It) const { return It-begin(); }
+
+ private:
+   ListType Seq;
+   const RegisterOrdering &Ord;
+ };
+
+ // Stream adaptor: writing a PrintORL to a raw_ostream prints the registers
+ // of the wrapped list, in list order, as "(r0, r1, ...)".
+ struct PrintORL {
+   PrintORL(const OrderedRegisterList &L, const TargetRegisterInfo *RI)
+     : RL(L), TRI(RI) {}
+
+ private:
+   friend raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P);
+
+   const OrderedRegisterList &RL;
+   const TargetRegisterInfo *TRI;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P) {
+   OS << '(';
+   bool First = true;
+   for (unsigned R : P.RL) {
+     // Every element except the first is preceded by a separator.
+     if (!First)
+       OS << ", ";
+     First = false;
+     OS << PrintReg(R, P.TRI);
+   }
+   OS << ')';
+   return OS;
+ }
+
+} // end anonymous namespace
+
+// Place VR in front of the first element that does not order before it
+// according to Ord. When there is no such element the insertion point is
+// end(), i.e. VR is appended.
+void OrderedRegisterList::insert(unsigned VR) {
+  iterator Where = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
+  Seq.insert(Where, VR);
+}
+
+// Remove VR from the list. VR is located with a binary search using Ord,
+// the same ordering that insert() uses.
+// The assertion documents the expectation that VR is on the list. Because
+// assertions are compiled out in release builds, the erase is additionally
+// guarded: erasing end() would be undefined behavior, and erasing an
+// element other than VR would silently drop an unrelated register.
+void OrderedRegisterList::remove(unsigned VR) {
+  iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
+  assert(L != Seq.end());
+  if (L != Seq.end() && *L == VR)
+    Seq.erase(L);
+}
+
+namespace {
+
+ // Description of one "insert" form. The members mirror the operands of
+ // the instruction
+ //   ... = insert(SrcR, InsR, #Wdh, #Off)
+ // Every constructor argument defaults to 0.
+ struct IFRecord {
+   unsigned SrcR, InsR;
+   uint16_t Wdh, Off;
+
+   IFRecord(unsigned SR = 0, unsigned IR = 0, uint16_t W = 0, uint16_t O = 0)
+     : SrcR(SR), InsR(IR), Wdh(W), Off(O) {}
+ };
+
+ // Stream adaptor: writing a PrintIFR to a raw_ostream prints the wrapped
+ // insert-form record as "(SrcR,InsR,#Wdh,#Off)".
+ struct PrintIFR {
+   friend raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P);
+
+   PrintIFR(const IFRecord &R, const TargetRegisterInfo *RI)
+     : IFR(R), TRI(RI) {}
+
+ private:
+   const IFRecord &IFR;
+   const TargetRegisterInfo *TRI;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P) {
+   const IFRecord &Rec = P.IFR;
+   OS << '(' << PrintReg(Rec.SrcR, P.TRI);
+   OS << ',' << PrintReg(Rec.InsR, P.TRI);
+   OS << ",#" << Rec.Wdh;
+   OS << ",#" << Rec.Off << ')';
+   return OS;
+ }
+
+ typedef std::pair<IFRecord,RegisterSet> IFRecordWithRegSet;
+
+} // end anonymous namespace
+
+namespace llvm {
+
+ void initializeHexagonGenInsertPass(PassRegistry&); // Called from the HexagonGenInsert constructor with the global pass registry.
+ FunctionPass *createHexagonGenInsert(); // Presumably creates an instance of the pass; the definition is outside this excerpt.
+
+} // end namespace llvm
+
+namespace {
+
+ // Machine function pass that reports its name as: Hexagon generate
+ // "insert" instructions. It requires and preserves the machine dominator
+ // tree.
+ class HexagonGenInsert : public MachineFunctionPass {
+ public:
+   static char ID;
+
+   // Every cached pointer starts out null, so that none of them can be
+   // read while holding an indeterminate value.
+   HexagonGenInsert()
+     : MachineFunctionPass(ID), HII(nullptr), HRI(nullptr), MFN(nullptr),
+       MRI(nullptr), MDT(nullptr), CMS(nullptr) {
+     initializeHexagonGenInsertPass(*PassRegistry::getPassRegistry());
+   }
+
+   StringRef getPassName() const override {
+     return "Hexagon generate \"insert\" instructions";
+   }
+
+   void getAnalysisUsage(AnalysisUsage &AU) const override {
+     AU.addRequired<MachineDominatorTree>();
+     AU.addPreserved<MachineDominatorTree>();
+     MachineFunctionPass::getAnalysisUsage(AU);
+   }
+
+   bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+   // Memoization map for block-to-block distances, keyed by the pair
+   // (from-block number, to-block number).
+   typedef DenseMap<std::pair<unsigned,unsigned>,unsigned> PairMapType;
+
+   // Construction of the register orderings.
+   void buildOrderingMF(RegisterOrdering &RO) const;
+   void buildOrderingBT(RegisterOrdering &RB, RegisterOrdering &RO) const;
+
+   // Predicates on register classes, registers and candidate forms.
+   bool isIntClass(const TargetRegisterClass *RC) const;
+   bool isConstant(unsigned VR) const;
+   bool isSmallConstant(unsigned VR) const;
+   bool isValidInsertForm(unsigned DstR, unsigned SrcR, unsigned InsR,
+         uint16_t L, uint16_t S) const;
+   bool findSelfReference(unsigned VR) const;
+   bool findNonSelfReference(unsigned VR) const;
+
+   // Virtual registers defined/used by a single instruction.
+   void getInstrDefs(const MachineInstr *MI, RegisterSet &Defs) const;
+   void getInstrUses(const MachineInstr *MI, RegisterSet &Uses) const;
+
+   // Distance (in instructions) between two blocks or two instructions.
+   unsigned distance(const MachineBasicBlock *FromB,
+         const MachineBasicBlock *ToB, const UnsignedMap &RPO,
+         PairMapType &M) const;
+   unsigned distance(MachineBasicBlock::const_iterator FromI,
+         MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO,
+         PairMapType &M) const;
+
+   // Collection of candidate insert forms.
+   bool findRecordInsertForms(unsigned VR, OrderedRegisterList &AVs);
+   void collectInBlock(MachineBasicBlock *B, OrderedRegisterList &AVs);
+   void findRemovableRegisters(unsigned VR, IFRecord IF,
+         RegisterSet &RMs) const;
+   void computeRemovableRegisters();
+
+   // Pruning and selection of the candidates, and code generation.
+   void pruneEmptyLists();
+   void pruneCoveredSets(unsigned VR);
+   void pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO, PairMapType &M);
+   void pruneRegCopies(unsigned VR);
+   void pruneCandidates();
+   void selectCandidates();
+   bool generateInserts();
+
+   bool removeDeadCode(MachineDomTreeNode *N);
+
+   // IFRecord coupled with a set of potentially removable registers:
+   typedef std::vector<IFRecordWithRegSet> IFListType;
+   typedef DenseMap<unsigned,IFListType> IFMapType;  // vreg -> IFListType
+
+   // Debug helper: prints the contents of IFMap to dbgs().
+   void dump_map() const;
+
+   const HexagonInstrInfo *HII;
+   const HexagonRegisterInfo *HRI;
+
+   MachineFunction *MFN;
+   MachineRegisterInfo *MRI;
+   MachineDominatorTree *MDT;
+   CellMapShadow *CMS;
+
+   RegisterOrdering BaseOrd;
+   RegisterOrdering CellOrd;
+   IFMapType IFMap;
+ };
+
+ char HexagonGenInsert::ID = 0;
+
+} // end anonymous namespace
+
+// Debug helper: for every register in IFMap print the register, followed
+// by one line per candidate with the insert form and its set of
+// potentially removable registers.
+void HexagonGenInsert::dump_map() const {
+  for (const auto &Entry : IFMap) {
+    dbgs() << " " << PrintReg(Entry.first, HRI) << ":\n";
+    for (const IFRecordWithRegSet &Cand : Entry.second) {
+      dbgs() << " " << PrintIFR(Cand.first, HRI) << ", "
+             << PrintRegSet(Cand.second, HRI) << '\n';
+    }
+  }
+}
+
+// Number the virtual registers in the order in which their definitions
+// are encountered when walking the function's blocks and instructions.
+// Blocks that the bit tracker did not reach are ignored.
+// A virtual register without any definition left in the code (e.g. because
+// its def and uses were eliminated) is never visited and therefore does
+// not get an entry in RO.
+void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const {
+  unsigned NextIndex = 0;
+  for (const MachineBasicBlock &B : *MFN) {
+    if (!CMS->BT.reached(&B))
+      continue;
+    for (const MachineInstr &MI : B) {
+      const unsigned NumOps = MI.getNumOperands();
+      for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
+        const MachineOperand &MO = MI.getOperand(OpIdx);
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+        unsigned R = MO.getReg();
+        assert(MO.getSubReg() == 0 && "Unexpected subregister in definition");
+        if (!TargetRegisterInfo::isVirtualRegister(R))
+          continue;
+        RO.insert(std::make_pair(R, NextIndex++));
+      }
+    }
+  }
+}
+
+// Derive the ordering RO from the base ordering RB: gather every register
+// that has an entry in RB, sort the registers with the register-cell
+// comparator, and assign each one its position in the sorted sequence.
+void HexagonGenInsert::buildOrderingBT(RegisterOrdering &RB,
+      RegisterOrdering &RO) const {
+  using SortableVectorType = std::vector<unsigned>;
+
+  BitValueOrdering BVO(RB);
+  RegisterCellLexCompare LexCmp(BVO, *CMS);
+
+  SortableVectorType Sorted;
+  for (auto &Entry : RB)
+    Sorted.push_back(Entry.first);
+  std::sort(Sorted.begin(), Sorted.end(), LexCmp);
+
+  // Hand the result over to the outgoing ordering.
+  unsigned Position = 0;
+  for (unsigned VR : Sorted)
+    RO.insert(std::make_pair(VR, Position++));
+}
+
+// True for exactly two register classes: Hexagon::IntRegs and
+// Hexagon::DoubleRegs.
+inline bool HexagonGenInsert::isIntClass(const TargetRegisterClass *RC) const {
+  if (RC == &Hexagon::IntRegsRegClass)
+    return true;
+  return RC == &Hexagon::DoubleRegsRegClass;
+}
+
+// True if every bit of VR's register cell is a known 0 or a known 1.
+bool HexagonGenInsert::isConstant(unsigned VR) const {
+  const BitTracker::RegisterCell &Cell = CMS->lookup(VR);
+  for (uint16_t Bit = 0, Width = Cell.width(); Bit != Width; ++Bit) {
+    const BitTracker::BitValue &BV = Cell[Bit];
+    // Any bit that is neither 0 nor 1 makes the register non-constant.
+    if (!BV.is(0) && !BV.is(1))
+      return false;
+  }
+  return true;
+}
+
+// True if VR is a compile-time constant whose value fits the signed
+// immediate forms named below (Rd = #s16 for 32-bit registers, otherwise
+// Rdd = #s8 / Rdd = combine(#s8,#s8)).
+// Registers wider than 64 bits, and registers with any bit that is not a
+// known 0 or 1, are never small constants.
+bool HexagonGenInsert::isSmallConstant(unsigned VR) const {
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  uint16_t W = RC.width();
+  if (W > 64)
+    return false;
+  // Assemble the value from the individual bits; bit 0 of the cell is the
+  // least significant bit of V.
+  uint64_t V = 0, B = 1;
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitTracker::BitValue &BV = RC[i];
+    if (BV.is(1))
+      V |= B;
+    else if (!BV.is(0))
+      return false;
+    B <<= 1;
+  }
+
+  // V holds the raw bit pattern, zero-extended to 64 bits. The immediates
+  // are signed, so the pattern has to be sign-extended from its own width
+  // before the range check; otherwise no negative value (e.g. -1, stored
+  // as 0xFFFFFFFF) would ever be recognized.
+
+  // For 32-bit registers, consider: Rd = #s16.
+  if (W == 32)
+    return isInt<16>(SignExtend64<32>(V));
+
+  // For 64-bit registers, it's Rdd = #s8 or Rdd = combine(#s8,#s8)
+  return isInt<8>(static_cast<int32_t>(Lo_32(V))) &&
+         isInt<8>(static_cast<int32_t>(Hi_32(V)));
+}
+
+// Check whether "DstR = insert(SrcR, InsR, #L, #S)" is acceptable as far
+// as the register classes of the three registers are concerned.
+bool HexagonGenInsert::isValidInsertForm(unsigned DstR, unsigned SrcR,
+      unsigned InsR, uint16_t L, uint16_t S) const {
+  const TargetRegisterClass *DstRC = MRI->getRegClass(DstR);
+  const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcR);
+  const TargetRegisterClass *InsRC = MRI->getRegClass(InsR);
+
+  // Only integer (32-/64-bit) register classes.
+  const bool AllInt = isIntClass(DstRC) && isIntClass(SrcRC) &&
+                      isIntClass(InsRC);
+  if (!AllInt)
+    return false;
+  // The "source" register must be of the same class as DstR.
+  if (DstRC != SrcRC)
+    return false;
+  // All three classes agree: nothing else to check.
+  if (DstRC == InsRC)
+    return true;
+  // A 64-bit register can only be generated from other 64-bit registers.
+  if (DstRC == &Hexagon::DoubleRegsRegClass)
+    return false;
+  // Otherwise, the range [S, S+L) must not cross the 32-bit word boundary.
+  const bool CrossesWord = S < 32 && S+L > 32;
+  return !CrossesWord;
+}
+
+// True if at least one bit of VR's register cell is a reference to a bit
+// of VR itself.
+bool HexagonGenInsert::findSelfReference(unsigned VR) const {
+  const BitTracker::RegisterCell &Cell = CMS->lookup(VR);
+  const uint16_t Width = Cell.width();
+  for (uint16_t Bit = 0; Bit != Width; ++Bit) {
+    const BitTracker::BitValue &BV = Cell[Bit];
+    if (BV.Type != BitTracker::BitValue::Ref)
+      continue;
+    if (BV.RefI.Reg == VR)
+      return true;
+  }
+  return false;
+}
+
+// True if at least one bit of VR's register cell is a reference to a bit
+// of a register other than VR.
+bool HexagonGenInsert::findNonSelfReference(unsigned VR) const {
+  // Bind the cell by reference, as findSelfReference does; the cell is only
+  // read here, so copying the whole RegisterCell would be wasted work.
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  for (uint16_t i = 0, w = RC.width(); i < w; ++i) {
+    const BitTracker::BitValue &V = RC[i];
+    if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != VR)
+      return true;
+  }
+  return false;
+}
+
+// Add to Defs every virtual register that appears as a def operand of MI.
+// Defs is not cleared first.
+void HexagonGenInsert::getInstrDefs(const MachineInstr *MI,
+      RegisterSet &Defs) const {
+  const unsigned NumOps = MI->getNumOperands();
+  for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
+    const MachineOperand &Op = MI->getOperand(OpIdx);
+    if (Op.isReg() && Op.isDef()) {
+      const unsigned Reg = Op.getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        Defs.insert(Reg);
+    }
+  }
+}
+
+// Add to Uses every virtual register that appears as a use operand of MI.
+// Uses is not cleared first.
+void HexagonGenInsert::getInstrUses(const MachineInstr *MI,
+      RegisterSet &Uses) const {
+  const unsigned NumOps = MI->getNumOperands();
+  for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
+    const MachineOperand &Op = MI->getOperand(OpIdx);
+    if (Op.isReg() && Op.isUse()) {
+      const unsigned Reg = Op.getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        Uses.insert(Reg);
+    }
+  }
+}
+
+// Distance from the end of block FromB to the beginning of block ToB: the
+// largest total size of the blocks lying between the two, taken over the
+// predecessor chains of ToB that do not use back edges. Results are
+// memoized in M under the key (number of FromB, number of ToB).
+// A forward distance from the end of a block to its own beginning makes no
+// sense, so this must not be called with FromB == ToB.
+unsigned HexagonGenInsert::distance(const MachineBasicBlock *FromB,
+      const MachineBasicBlock *ToB, const UnsignedMap &RPO,
+      PairMapType &M) const {
+  assert(FromB != ToB);
+
+  const unsigned FromN = FromB->getNumber(), ToN = ToB->getNumber();
+  const std::pair<unsigned,unsigned> Key = std::make_pair(FromN, ToN);
+
+  // Reuse an earlier result if there is one.
+  PairMapType::iterator Cached = M.find(Key);
+  if (Cached != M.end())
+    return Cached->second;
+
+  const unsigned ToRPO = RPO.lookup(ToN);
+  unsigned Longest = 0;
+  typedef MachineBasicBlock::const_pred_iterator pred_iterator;
+  for (pred_iterator P = ToB->pred_begin(), PE = ToB->pred_end(); P != PE;
+       ++P) {
+    const MachineBasicBlock *PB = *P;
+    // A direct edge from FromB contributes a distance of 0, so there is
+    // nothing to compute for it.
+    if (PB == FromB)
+      continue;
+    // Skip back edges.
+    if (RPO.lookup(PB->getNumber()) >= ToRPO)
+      continue;
+    const unsigned Through = PB->size() + distance(FromB, PB, RPO, M);
+    Longest = std::max(Longest, Through);
+  }
+
+  // Remember the result for later queries.
+  M.insert(std::make_pair(Key, Longest));
+  return Longest;
+}
+
+// Distance, counted in instructions, from FromI to ToI. Within a single
+// block this is the iterator distance. Across blocks it is the sum of
+//  - the instructions from FromI to the end of its block,
+//  - the block-to-block distance, and
+//  - the instructions from the start of ToI's block up to ToI.
+unsigned HexagonGenInsert::distance(MachineBasicBlock::const_iterator FromI,
+      MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO,
+      PairMapType &M) const {
+  const MachineBasicBlock *FromBlock = FromI->getParent();
+  const MachineBasicBlock *ToBlock = ToI->getParent();
+  if (FromBlock == ToBlock)
+    return std::distance(FromI, ToI);
+
+  const unsigned Head = std::distance(ToBlock->begin(), ToI);
+  const unsigned Between = distance(FromBlock, ToBlock, RPO, M);
+  const unsigned Tail = std::distance(FromI, FromBlock->end());
+  return Head+Between+Tail;
+}
+
+bool HexagonGenInsert::findRecordInsertForms(unsigned VR,
+ OrderedRegisterList &AVs) { // Searches AVs for (SrcR, InsR) pairs from which VR could be formed by "insert"; appends each form found to IFMap[VR].
+ if (isDebug()) {
+ dbgs() << __func__ << ": " << PrintReg(VR, HRI)
+ << " AVs: " << PrintORL(AVs, HRI) << "\n";
+ }
+ if (AVs.size() == 0) // No available registers: nothing can be recorded.
+ return false;
+
+ typedef OrderedRegisterList::iterator iterator;
+ BitValueOrdering BVO(BaseOrd);
+ const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+ uint16_t W = RC.width();
+
+ typedef std::pair<unsigned,uint16_t> RSRecord; // (reg,shift)
+ typedef std::vector<RSRecord> RSListType;
+ // Have a map, with key being the matching prefix length, and the value
+ // being the list of pairs (R,S), where R's prefix matches VR at S.
+ // (DenseMap<uint16_t,RSListType> fails to instantiate.)
+ typedef DenseMap<unsigned,RSListType> LRSMapType;
+ LRSMapType LM;
+
+ // Step 1: fill LM.
+ // Conceptually, rotate the cell RC right (i.e. towards the LSB) by S,
+ // and find matching prefixes from AVs with the rotated RC. Such a prefix
+ // would match a string of bits (of length L) in RC starting at S.
+ for (uint16_t S = 0; S < W; ++S) {
+ iterator B = AVs.begin(), E = AVs.end();
+ // The registers in AVs are ordered according to the lexical order of
+ // the corresponding register cells. This means that the range of regis-
+ // ters in AVs that match a prefix of length L+1 will be contained in
+ // the range that matches a prefix of length L. This means that we can
+ // keep narrowing the search space as the prefix length goes up. This
+ // helps reduce the overall complexity of the search.
+ uint16_t L;
+ for (L = 0; L < W-S; ++L) {
+ // Compare against VR's bits starting at S, which emulates rotation
+ // of VR by S.
+ RegisterCellBitCompareSel RCB(VR, S+L, L, BVO, *CMS);
+ iterator NewB = std::lower_bound(B, E, VR, RCB);
+ iterator NewE = std::upper_bound(NewB, E, VR, RCB);
+ // For the registers that are eliminated from the next range, L is
+ // the longest prefix matching VR at position S (their prefixes
+ // differ from VR at S+L). If L>0, record this information for later
+ // use.
+ if (L > 0) {
+ for (iterator I = B; I != NewB; ++I)
+ LM[L].push_back(std::make_pair(*I, S));
+ for (iterator I = NewE; I != E; ++I)
+ LM[L].push_back(std::make_pair(*I, S));
+ }
+ B = NewB, E = NewE;
+ if (B == E) // Nothing matches a prefix of length L+1; stop extending.
+ break;
+ }
+ // Record the final register range. If this range is non-empty, then
+ // L=W-S.
+ assert(B == E || L == W-S);
+ if (B != E) {
+ for (iterator I = B; I != E; ++I)
+ LM[L].push_back(std::make_pair(*I, S));
+ // If B!=E, then we found a range of registers whose prefixes cover the
+ // rest of VR from position S. There is no need to further advance S.
+ break;
+ }
+ }
+
+ if (isDebug()) {
+ dbgs() << "Prefixes matching register " << PrintReg(VR, HRI) << "\n";
+ for (LRSMapType::iterator I = LM.begin(), E = LM.end(); I != E; ++I) {
+ dbgs() << " L=" << I->first << ':';
+ const RSListType &LL = I->second;
+ for (unsigned i = 0, n = LL.size(); i < n; ++i)
+ dbgs() << " (" << PrintReg(LL[i].first, HRI) << ",@"
+ << LL[i].second << ')';
+ dbgs() << '\n';
+ }
+ }
+
+ bool Recorded = false; // Becomes true once any form has been added to IFMap[VR].
+
+ // Step 2: pair every available register (as SrcR) with the entries of LM.
+ for (iterator I = AVs.begin(), E = AVs.end(); I != E; ++I) {
+ unsigned SrcR = *I;
+ int FDi = -1, LDi = -1; // First/last different bit.
+ const BitTracker::RegisterCell &AC = CMS->lookup(SrcR);
+ uint16_t AW = AC.width();
+ for (uint16_t i = 0, w = std::min(W, AW); i < w; ++i) { // Only the common width is compared.
+ if (RC[i] == AC[i])
+ continue;
+ if (FDi == -1)
+ FDi = i;
+ LDi = i;
+ }
+ if (FDi == -1)
+ continue; // TODO (future): Record identical registers.
+ // Look for a register whose prefix could patch the range [FD..LD]
+ // where VR and SrcR differ.
+ uint16_t FD = FDi, LD = LDi; // Switch to unsigned type.
+ uint16_t MinL = LD-FD+1;
+ for (uint16_t L = MinL; L < W; ++L) { // Lengths equal to the full width W are not considered.
+ LRSMapType::iterator F = LM.find(L);
+ if (F == LM.end())
+ continue;
+ RSListType &LL = F->second;
+ for (unsigned i = 0, n = LL.size(); i < n; ++i) {
+ uint16_t S = LL[i].second;
+ // MinL is the minimum length of the prefix. Any length above MinL
+ // allows some flexibility as to where the prefix can start:
+ // given the extra length EL=L-MinL, the prefix must start between
+ // max(0,FD-EL) and FD.
+ if (S > FD) // Starts too late.
+ continue;
+ uint16_t EL = L-MinL;
+ uint16_t LowS = (EL < FD) ? FD-EL : 0;
+ if (S < LowS) // Starts too early.
+ continue;
+ unsigned InsR = LL[i].first;
+ if (!isValidInsertForm(VR, SrcR, InsR, L, S))
+ continue;
+ if (isDebug()) {
+ dbgs() << PrintReg(VR, HRI) << " = insert(" << PrintReg(SrcR, HRI)
+ << ',' << PrintReg(InsR, HRI) << ",#" << L << ",#"
+ << S << ")\n";
+ }
+ IFRecordWithRegSet RR(IFRecord(SrcR, InsR, L, S), RegisterSet()); // The removable-register set starts out empty.
+ IFMap[VR].push_back(RR);
+ Recorded = true;
+ }
+ }
+ }
+
+ return Recorded;
+}
+
+void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
+ OrderedRegisterList &AVs) { // Collects insert forms for the registers defined in B, then recurses into B's children in the dominator tree.
+ if (isDebug())
+ dbgs() << "visiting block BB#" << B->getNumber() << "\n";
+
+ // First, check if this block is reachable at all. If not, the bit tracker
+ // will not have any information about registers in it.
+ if (!CMS->BT.reached(B))
+ return;
+
+ bool DoConst = OptConst; // Local copy of OptConst, which is declared outside this excerpt.
+ // Keep a separate set of registers defined in this block, so that we
+ // can remove them from the list of available registers once all DT
+ // successors have been processed.
+ RegisterSet BlockDefs, InsDefs;
+ for (MachineBasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) {
+ MachineInstr *MI = &*I;
+ InsDefs.clear();
+ getInstrDefs(MI, InsDefs);
+ // Leave those alone. They are more transparent than "insert".
+ bool Skip = MI->isCopy() || MI->isRegSequence();
+
+ if (!Skip) {
+ // Visit all defined registers, and attempt to find the corresponding
+ // "insert" representations.
+ for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR)) { // The loop ends when find_first/find_next yields 0.
+ // Do not collect registers that are known to be compile-time cons-
+ // tants, unless requested.
+ if (!DoConst && isConstant(VR))
+ continue;
+ // If VR's cell contains a reference to VR, then VR cannot be defined
+ // via "insert". If VR is a constant that can be generated in a single
+ // instruction (without constant extenders), generating it via insert
+ // makes no sense.
+ if (findSelfReference(VR) || isSmallConstant(VR))
+ continue;
+
+ findRecordInsertForms(VR, AVs); // The result (whether anything was recorded) is not needed here.
+ }
+ }
+
+ // Insert the defined registers into the list of available registers
+ // after they have been processed. This is done for skipped instructions
+ // as well.
+ for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR))
+ AVs.insert(VR);
+ BlockDefs.insert(InsDefs);
+ }
+
+ MachineDomTreeNode *N = MDT->getNode(B);
+ typedef GraphTraits<MachineDomTreeNode*> GTN;
+ typedef GTN::ChildIteratorType ChildIter;
+ for (ChildIter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
+ MachineBasicBlock *SB = (*I)->getBlock();
+ collectInBlock(SB, AVs); // Children see the registers defined in B as available.
+ }
+
+ for (unsigned VR = BlockDefs.find_first(); VR; VR = BlockDefs.find_next(VR)) // Restore AVs to its state on entry.
+ AVs.remove(VR);
+}
+
+void HexagonGenInsert::findRemovableRegisters(unsigned VR, IFRecord IF,
+ RegisterSet &RMs) const { // Results are added to RMs; RMs is not cleared first.
+ // For a given register VR and an insert form, find the registers that are
+ // used by the current definition of VR, and which would no longer be
+ // needed for it after the definition of VR is replaced with the insert
+ // form. These are the registers that could potentially become dead.
+ RegisterSet Regs[2]; // Two work sets: the current level and the next level of the search.
+
+ unsigned S = 0; // Register set selector.
+ Regs[S].insert(VR);
+
+ while (!Regs[S].empty()) {
+ // Breadth-first search.
+ unsigned OtherS = 1-S;
+ Regs[OtherS].clear();
+ for (unsigned R = Regs[S].find_first(); R; R = Regs[S].find_next(R)) {
+ Regs[S].remove(R); // R is taken off the current level before it is examined.
+ if (R == IF.SrcR || R == IF.InsR) // The operands of the insert form remain in use.
+ continue;
+ // Check if a given register has bits that are references to any other
+ // registers. This is to detect situations where the instruction that
+ // defines register R takes register Q as an operand, but R itself does
+ // not contain any bits from Q. Loads are examples of how this could
+ // happen:
+ // R = load Q
+ // In this case (assuming we do not have any knowledge about the loaded
+ // value), we must not treat R as a "conveyance" of the bits from Q.
+ // (The information in BT about R's bits would have them as constants,
+ // in case of zero-extending loads, or refs to R.)
+ if (!findNonSelfReference(R))
+ continue;
+ RMs.insert(R);
+ const MachineInstr *DefI = MRI->getVRegDef(R);
+ assert(DefI);
+ // Do not iterate past PHI nodes to avoid infinite loops. This can
+ // make the final set a bit less accurate, but the removable register
+ // sets are an approximation anyway.
+ if (DefI->isPHI())
+ continue;
+ getInstrUses(DefI, Regs[OtherS]); // The operands of R's definition form the next level.
+ }
+ S = OtherS;
+ }
+ // The register VR is added to the list as a side-effect of the algorithm,
+ // but it is not "potentially removable". A potentially removable register
+ // is one that may become unused (dead) after conversion to the insert form
+ // IF, and obviously VR (or its replacement) will not become dead by apply-
+ // ing IF.
+ RMs.remove(VR);
+}
+
+// For every candidate insert form of every register in IFMap, fill in the
+// candidate's set of potentially removable registers.
+void HexagonGenInsert::computeRemovableRegisters() {
+  for (auto &Entry : IFMap) {
+    for (IFRecordWithRegSet &Cand : Entry.second)
+      findRemovableRegisters(Entry.first, Cand.first, Cand.second);
+  }
+}
+
+// Drop from IFMap every register whose list of insert forms is empty.
+void HexagonGenInsert::pruneEmptyLists() {
+  // Collect first and erase afterwards, so that the map is not modified
+  // while it is being traversed.
+  SmallVector<unsigned,16> Dead;
+  for (auto &Entry : IFMap) {
+    if (Entry.second.empty())
+      Dead.push_back(Entry.first);
+  }
+  for (unsigned VR : Dead)
+    IFMap.erase(VR);
+}
+
+void HexagonGenInsert::pruneCoveredSets(unsigned VR) { // Thins out the candidate list of VR; VR must have an entry in IFMap.
+ IFMapType::iterator F = IFMap.find(VR);
+ assert(F != IFMap.end());
+ IFListType &LL = F->second;
+
+ // First, examine the IF candidates for register VR whose removable-regis-
+ // ter sets are empty. This means that a given candidate will not help eli-
+ // minate any registers, but since "insert" is not a constant-extendable
+ // instruction, using such a candidate may reduce code size if the defini-
+ // tion of VR is constant-extended.
+ // If there exists a candidate with a non-empty set, the ones with empty
+ // sets will not be used and can be removed.
+ MachineInstr *DefVR = MRI->getVRegDef(VR);
+ bool DefEx = HII->isConstExtended(*DefVR);
+ bool HasNE = false; // Set when some candidate has a non-empty removable-register set.
+ for (unsigned i = 0, n = LL.size(); i < n; ++i) {
+ if (LL[i].second.empty())
+ continue;
+ HasNE = true;
+ break;
+ }
+ if (!DefEx || HasNE) {
+ // The definition of VR is not constant-extended, or there is a candidate
+ // with a non-empty set. Remove all candidates with empty sets.
+ auto IsEmpty = [] (const IFRecordWithRegSet &IR) -> bool {
+ return IR.second.empty();
+ };
+ auto End = llvm::remove_if(LL, IsEmpty);
+ if (End != LL.end())
+ LL.erase(End, LL.end());
+ } else {
+ // The definition of VR is constant-extended, and all candidates have
+ // empty removable-register sets. Pick the maximum candidate, and remove
+ // all others. The "maximum" does not have any special meaning here, it
+ // is only so that the candidate that will remain on the list is selec-
+ // ted deterministically.
+ IFRecord MaxIF = LL[0].first; // NOTE(review): reads LL[0], so LL is assumed to be non-empty here - confirm against the caller.
+ for (unsigned i = 1, n = LL.size(); i < n; ++i) {
+ // If MaxIF < LL[i], then MaxIF = LL[i]. The comparison is lexicographic
+ // on (BaseOrd[SrcR], BaseOrd[InsR], Wdh, Off).
+ const IFRecord &IF = LL[i].first;
+ unsigned M0 = BaseOrd[MaxIF.SrcR], M1 = BaseOrd[MaxIF.InsR];
+ unsigned R0 = BaseOrd[IF.SrcR], R1 = BaseOrd[IF.InsR];
+ if (M0 > R0)
+ continue;
+ if (M0 == R0) {
+ if (M1 > R1)
+ continue;
+ if (M1 == R1) {
+ if (MaxIF.Wdh > IF.Wdh)
+ continue;
+ if (MaxIF.Wdh == IF.Wdh && MaxIF.Off >= IF.Off)
+ continue;
+ }
+ }
+ // MaxIF < IF.
+ MaxIF = IF;
+ }
+ // Remove everything except the maximum candidate. All register sets
+ // are empty, so no need to preserve anything.
+ LL.clear();
+ LL.push_back(std::make_pair(MaxIF, RegisterSet()));
+ }
+
+ // Now, remove those whose sets of potentially removable registers are
+ // contained in another IF candidate for VR. For example, given these
+ // candidates for vreg45,
+ // %vreg45:
+ // (%vreg44,%vreg41,#9,#8), { %vreg42 }
+ // (%vreg43,%vreg41,#9,#8), { %vreg42 %vreg44 }
+ // remove the first one, since it is contained in the second one.
+ for (unsigned i = 0, n = LL.size(); i < n; ) { // i advances only when LL[i] is kept.
+ const RegisterSet &RMi = LL[i].second;
+ unsigned j = 0;
+ while (j < n) {
+ if (j != i && LL[j].second.includes(RMi))
+ break;
+ j++;
+ }
+ if (j == n) { // RMi not contained in anything else.
+ i++;
+ continue;
+ }
+ LL.erase(LL.begin()+i); // The next candidate moves into slot i, so i stays put.
+ n = LL.size();
+ }
+}
+
+// Remove from VR's candidate list every insert form for which the
+// definition of SrcR or of InsR is at a distance of VRegDistCutoff or more
+// from the definition of VR. VR must have an entry in IFMap.
+void HexagonGenInsert::pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO,
+      PairMapType &M) {
+  IFMapType::iterator Found = IFMap.find(VR);
+  assert(Found != IFMap.end());
+  IFListType &Cands = Found->second;
+
+  const unsigned Cutoff = VRegDistCutoff;
+  const MachineInstr *DefV = MRI->getVRegDef(VR);
+
+  // Walk the list from the back, so that erasing an element does not
+  // disturb the positions that are still to be visited.
+  for (unsigned Pos = Cands.size(); Pos > 0; --Pos) {
+    const unsigned SR = Cands[Pos-1].first.SrcR;
+    const unsigned IR = Cands[Pos-1].first.InsR;
+    const MachineInstr *DefS = MRI->getVRegDef(SR);
+    const MachineInstr *DefI = MRI->getVRegDef(IR);
+    // The distance for InsR is only computed when SrcR is close enough.
+    const bool CloseEnough = distance(DefS, DefV, RPO, M) < Cutoff &&
+                             distance(DefI, DefV, RPO, M) < Cutoff;
+    if (!CloseEnough)
+      Cands.erase(Cands.begin()+(Pos-1));
+  }
+}
+
+// Remove from VR's candidate list the insert forms of width 32 that start
+// at offset 0 or at offset 32. VR must have an entry in IFMap.
+void HexagonGenInsert::pruneRegCopies(unsigned VR) {
+  IFMapType::iterator Found = IFMap.find(VR);
+  assert(Found != IFMap.end());
+  IFListType &Cands = Found->second;
+
+  auto IsCopy = [] (const IFRecordWithRegSet &IR) -> bool {
+    const IFRecord &Rec = IR.first;
+    if (Rec.Wdh != 32)
+      return false;
+    return Rec.Off == 0 || Rec.Off == 32;
+  };
+  // Erasing an empty range is a no-op, so no check is needed for the case
+  // where nothing matched.
+  Cands.erase(llvm::remove_if(Cands, IsCopy), Cands.end());
+}
+
+// Discard the candidates that are not beneficial no matter how the final
+// selection is made. The steps are, in order: covered sets, uses that are
+// too far away, registers left without candidates, register copies.
+void HexagonGenInsert::pruneCandidates() {
+  // Step 1: candidates whose potentially removable set is a subset of
+  // another candidate's set.
+  for (auto &Entry : IFMap)
+    pruneCoveredSets(Entry.first);
+
+  // Number the blocks in reverse post-order; the distance calculation
+  // needs the numbering.
+  using RPOTType = ReversePostOrderTraversal<const MachineFunction*>;
+  UnsignedMap RPO;
+  RPOTType RPOT(MFN);
+  unsigned NextRPO = 0;
+  for (RPOTType::rpo_iterator BI = RPOT.begin(), BE = RPOT.end(); BI != BE;
+       ++BI)
+    RPO[(*BI)->getNumber()] = NextRPO++;
+
+  // Step 2: candidates that would use registers defined too far away.
+  PairMapType Memo;  // Memoization map for distance calculation.
+  for (auto &Entry : IFMap)
+    pruneUsesTooFar(Entry.first, RPO, Memo);
+
+  // Step 3: registers that have no candidates left.
+  pruneEmptyLists();
+
+  // Step 4: candidates that amount to register copies.
+  for (auto &Entry : IFMap)
+    pruneRegCopies(Entry.first);
+}
+
+namespace {
+
+ // "Less than" functor on IF candidates, for registers that have more than
+ // one of them. The smaller a candidate is in this ordering, the better.
+ // The criteria, in order of priority:
+ //  1. The number of zeros in the associated potentially removable
+ //     register set. "Zero" indicates that the register is very likely to
+ //     become dead after this transformation. More zeros is better.
+ //  2. The "average", i.e. use-count per set size. The lower wins.
+ //  3. Beyond that it does not really matter; the tie is resolved in a
+ //     deterministic way using the IF records themselves.
+ struct IFOrdering {
+ private:
+   const UnsignedMap &UseC;
+   const RegisterOrdering &BaseOrd;
+
+   void stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero,
+         unsigned &Sum) const;
+
+ public:
+   IFOrdering(const UnsignedMap &UC, const RegisterOrdering &BO)
+     : UseC(UC), BaseOrd(BO) {}
+
+   bool operator() (const IFRecordWithRegSet &A,
+         const IFRecordWithRegSet &B) const;
+ };
+
+} // end anonymous namespace
+
+// Returns true if candidate A is better than (orders before) candidate B.
+bool IFOrdering::operator() (const IFRecordWithRegSet &A,
+      const IFRecordWithRegSet &B) const {
+  unsigned SizeA = 0, ZeroA = 0, SumA = 0;
+  unsigned SizeB = 0, ZeroB = 0, SumB = 0;
+  stats(A.second, SizeA, ZeroA, SumA);
+  stats(B.second, SizeB, ZeroB, SumB);
+
+  // We will pick the minimum element. The more zeros, the better.
+  if (ZeroA != ZeroB)
+    return ZeroA > ZeroB;
+  // Compare SumA/SizeA with SumB/SizeB, lower is better. The comparison is
+  // done by cross-multiplication to avoid the division. One operand of each
+  // product is widened first, so that the multiplication itself is carried
+  // out in 64 bits and cannot wrap around in "unsigned".
+  uint64_t AvgA = static_cast<uint64_t>(SumA)*SizeB;
+  uint64_t AvgB = static_cast<uint64_t>(SumB)*SizeA;
+  if (AvgA != AvgB)
+    return AvgA < AvgB;
+
+  // The sets compare identical so far. Resort to comparing the IF records.
+  // The actual values don't matter, this is only for determinism.
+  unsigned OSA = BaseOrd[A.first.SrcR], OSB = BaseOrd[B.first.SrcR];
+  if (OSA != OSB)
+    return OSA < OSB;
+  unsigned OIA = BaseOrd[A.first.InsR], OIB = BaseOrd[B.first.InsR];
+  if (OIA != OIB)
+    return OIA < OIB;
+  if (A.first.Wdh != B.first.Wdh)
+    return A.first.Wdh < B.first.Wdh;
+  return A.first.Off < B.first.Off;
+}
+
+// Accumulate statistics about the registers in Rs into the three output
+// arguments (none of which is reset here):
+//   Size - incremented once per register,
+//   Zero - incremented once per register whose use count is 0,
+//   Sum  - increased by the use count of each register.
+// Every register in Rs must have an entry in UseC.
+void IFOrdering::stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero,
+      unsigned &Sum) const {
+  unsigned R = Rs.find_first();
+  while (R) {
+    UnsignedMap::const_iterator Found = UseC.find(R);
+    assert(Found != UseC.end());
+    const unsigned Uses = Found->second;
+    if (Uses == 0)
+      ++Zero;
+    Sum += Uses;
+    ++Size;
+    R = Rs.find_next(R);
+  }
+}
+/* Reduce every IFMap entry to at most one candidate and drop the entries
+ * that end up with none. In order: compute the per-register measure UseC,
+ * keep only the IFOrdering-minimal candidate of each list (and only if it
+ * passes the all-zero/has-zero acceptance test), clear the candidates whose
+ * SrcR or InsR is in the register set of some kept candidate, and finally
+ * call pruneEmptyLists().
+ */
+void HexagonGenInsert::selectCandidates() {
+ // Some registers may have multiple valid candidates. Pick the best one
+ // (or decide not to use any).
+
+ // Compute the "removability" measure of R:
+ // For each potentially removable register R, record the number of regis-
+ // ters with IF candidates, where R appears in at least one set.
+ RegisterSet AllRMs;
+ UnsignedMap UseC, RemC;
+ IFMapType::iterator End = IFMap.end();
+
+ for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+ const IFListType &LL = I->second;
+ RegisterSet TT; // Union of the register sets of all candidates in LL.
+ for (unsigned i = 0, n = LL.size(); i < n; ++i)
+ TT.insert(LL[i].second);
+ for (unsigned R = TT.find_first(); R; R = TT.find_next(R))
+ RemC[R]++; // Counted once per IFMap entry, not once per candidate.
+ AllRMs.insert(TT);
+ }
+
+ for (unsigned R = AllRMs.find_first(); R; R = AllRMs.find_next(R)) {
+ typedef MachineRegisterInfo::use_nodbg_iterator use_iterator;
+ typedef SmallSet<const MachineInstr*,16> InstrSet;
+ InstrSet UIs;
+ // Count as the number of instructions in which R is used, not the
+ // number of operands.
+ use_iterator E = MRI->use_nodbg_end();
+ for (use_iterator I = MRI->use_nodbg_begin(R); I != E; ++I)
+ UIs.insert(I->getParent());
+ unsigned C = UIs.size();
+ // Calculate a measure, which is the number of instructions using R,
+ // minus the "removability" count computed earlier.
+ unsigned D = RemC[R];
+ UseC[R] = (C > D) ? C-D : 0; // doz: difference, clamped at zero
+ }
+
+ bool SelectAll0 = OptSelectAll0, SelectHas0 = OptSelectHas0;
+ if (!SelectAll0 && !SelectHas0)
+ SelectAll0 = true; // Neither option set: use the all-zero criterion.
+
+ // The smaller the number UseC for a given register R, the "less used"
+ // R is aside from the opportunities for removal offered by generating
+ // "insert" instructions.
+ // Iterate over the IF map, and for those registers that have multiple
+ // candidates, pick the minimum one according to IFOrdering.
+ IFOrdering IFO(UseC, BaseOrd);
+ for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+ IFListType &LL = I->second;
+ if (LL.empty())
+ continue;
+ // Get the minimum element, remember it and clear the list. If the
+ // element found is adequate, we will put it back on the list, other-
+ // wise the list will remain empty, and the entry for this register
+ // will be removed (i.e. this register will not be replaced by insert).
+ IFListType::iterator MinI = std::min_element(LL.begin(), LL.end(), IFO);
+ assert(MinI != LL.end());
+ IFRecordWithRegSet M = *MinI; // Copy: the list is cleared right below.
+ LL.clear();
+
+ // We want to make sure that this replacement will have a chance to be
+ // beneficial, and that means that we want to have indication that some
+ // register will be removed. The most likely registers to be eliminated
+ // are the use operands in the definition of I->first. Accept/reject a
+ // candidate based on how many of its uses it can potentially eliminate.
+
+ RegisterSet Us;
+ const MachineInstr *DefI = MRI->getVRegDef(I->first);
+ getInstrUses(DefI, Us);
+ bool Accept = false;
+
+ if (SelectAll0) {
+ bool All0 = true; // Stays true when Us is empty.
+ for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) {
+ if (UseC[R] == 0) // operator[] adds a 0 entry for registers not in UseC.
+ continue;
+ All0 = false;
+ break;
+ }
+ Accept = All0;
+ } else if (SelectHas0) {
+ bool Has0 = false;
+ for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) {
+ if (UseC[R] != 0)
+ continue;
+ Has0 = true;
+ break;
+ }
+ Accept = Has0;
+ }
+ if (Accept)
+ LL.push_back(M);
+ }
+
+ // Remove candidates that add uses of removable registers, unless the
+ // removable registers are among replacement candidates.
+ // Recompute the removable registers, since some candidates may have
+ // been eliminated.
+ AllRMs.clear();
+ for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+ const IFListType &LL = I->second;
+ if (!LL.empty())
+ AllRMs.insert(LL[0].second); // At most one candidate is left per list.
+ }
+ for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+ IFListType &LL = I->second;
+ if (LL.empty())
+ continue;
+ unsigned SR = LL[0].first.SrcR, IR = LL[0].first.InsR;
+ if (AllRMs[SR] || AllRMs[IR]) // Operand is in a kept candidate's set.
+ LL.clear();
+ }
+
+ pruneEmptyLists();
+}
+
+// Emit an "insert" instruction for every register recorded in IFMap, then
+// redirect all uses of the old register to the newly defined one and erase
+// the old definition. Element 0 of each candidate list is used without a
+// check, so every IFMap entry must hold a candidate. Always returns true.
+bool HexagonGenInsert::generateInserts() {
+  // Step 1: pair each register to be replaced with a fresh virtual register
+  // of the same register class.
+  UnsignedMap RegMap;
+  for (auto &Entry : IFMap) {
+    const unsigned OldR = Entry.first;
+    const TargetRegisterClass *OldRC = MRI->getRegClass(OldR);
+    unsigned FreshR = MRI->createVirtualRegister(OldRC);
+    RegMap[OldR] = FreshR;
+  }
+
+  // Step 2: build the "insert" instructions. SrcR and InsR of a record may
+  // themselves be registers that are about to be replaced; that is fine,
+  // since the wholesale replacement in step 3 rewrites those uses as well.
+  for (auto &Entry : IFMap) {
+    MachineInstr *OldDef = MRI->getVRegDef(Entry.first);
+    MachineBasicBlock &MBB = *OldDef->getParent();
+    DebugLoc Loc = OldDef->getDebugLoc();
+    unsigned DstR = RegMap[Entry.first];
+    const bool Is32 = MRI->getRegClass(DstR) == &Hexagon::IntRegsRegClass;
+    const MCInstrDesc &Desc = Is32 ? HII->get(Hexagon::S2_insert)
+                                   : HII->get(Hexagon::S2_insertp);
+    IFRecord Rec = Entry.second[0].first;
+    unsigned Width = Rec.Wdh, Offset = Rec.Off;
+
+    // A 32-bit insert that takes its bits from a 64-bit register reads one
+    // half of it: the high half for offsets of 32 and up (with the offset
+    // rebased to that half), the low half otherwise.
+    unsigned InsSub = 0;
+    if (Is32 && MRI->getRegClass(Rec.InsR) == &Hexagon::DoubleRegsRegClass) {
+      if (Offset >= 32) {
+        InsSub = Hexagon::isub_hi;
+        Offset -= 32;
+      } else {
+        InsSub = Hexagon::isub_lo;
+      }
+    }
+
+    // The new instruction goes right before the old definition, except that
+    // nothing may be placed among PHIs: skip past them (possibly up to
+    // MBB.end()).
+    MachineBasicBlock::iterator InsertPt = OldDef;
+    if (OldDef->isPHI())
+      InsertPt = MBB.getFirstNonPHI();
+
+    BuildMI(MBB, InsertPt, Loc, Desc, DstR)
+      .addReg(Rec.SrcR)
+      .addReg(Rec.InsR, 0, InsSub)
+      .addImm(Width)
+      .addImm(Offset);
+
+    // The operands gained a use, so existing kill flags may be stale.
+    MRI->clearKillFlags(Rec.SrcR);
+    MRI->clearKillFlags(Rec.InsR);
+  }
+
+  // Step 3: switch every use over to the new registers and delete the old
+  // defining instructions.
+  for (auto &Entry : IFMap) {
+    const unsigned OldR = Entry.first;
+    MachineInstr *OldDef = MRI->getVRegDef(OldR);
+    MRI->replaceRegWith(OldR, RegMap[OldR]);
+    OldDef->eraseFromParent();
+  }
+
+  return true;
+}
+
+// Dead code elimination over the dominator subtree rooted at N; the children
+// are processed before the block of N itself. Lifetime markers are never
+// removed, which is why the target-independent DCE cannot be used instead.
+// Returns true if at least one instruction was erased.
+bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  bool Changed = false;
+  for (auto CI = GTN::child_begin(N), CE = GTN::child_end(N); CI != CE; ++CI)
+    Changed |= removeDeadCode(*CI);
+
+  MachineBasicBlock *B = N->getBlock();
+
+  // Take a bottom-up snapshot of the block first, so that erasing
+  // instructions below does not interfere with the traversal.
+  std::vector<MachineInstr*> Worklist;
+  for (auto RI = B->rbegin(), RE = B->rend(); RI != RE; ++RI)
+    Worklist.push_back(&*RI);
+
+  for (MachineInstr *MI : Worklist) {
+    const unsigned Opc = MI->getOpcode();
+    const bool IsLifetimeMarker = Opc == TargetOpcode::LIFETIME_START ||
+                                  Opc == TargetOpcode::LIFETIME_END;
+    if (IsLifetimeMarker)
+      continue;
+    bool Store = false;
+    if (MI->isInlineAsm() || !MI->isSafeToMove(nullptr, Store))
+      continue;
+
+    // The instruction can go only if each register it defines is a virtual
+    // register without any non-debug use. Stop at the first def that fails
+    // this test.
+    SmallVector<unsigned,2> DeadRegs;
+    bool HasLiveDef = false;
+    for (const MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned R = MO.getReg();
+      const bool IsDead = TargetRegisterInfo::isVirtualRegister(R) &&
+                          MRI->use_nodbg_empty(R);
+      if (!IsDead) {
+        HasLiveDef = true;
+        break;
+      }
+      DeadRegs.push_back(R);
+    }
+    if (HasLiveDef)
+      continue;
+
+    B->erase(MI);
+    // Only debug uses can be left at this point; mark them as undefined.
+    for (unsigned R : DeadRegs)
+      MRI->markUsesInDebugValueAsUndef(R);
+    Changed = true;
+  }
+
+  return Changed;
+}
+/* Pass entry point. Runs the custom dead code removal and the bit tracker,
+ * builds the register orderings, then collects, prunes and selects the
+ * insert candidates and generates the "insert" instructions for whatever
+ * is left in IFMap. Returns true if the function was changed.
+ */
+bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ bool Timing = OptTiming, TimingDetail = Timing && OptTimingDetail;
+ bool Changed = false;
+
+ // Sanity check: one, but not both.
+ assert(!OptSelectAll0 || !OptSelectHas0);
+
+ IFMap.clear(); // Reset the state left over from the previous function.
+ BaseOrd.clear();
+ CellOrd.clear();
+
+ const auto &ST = MF.getSubtarget<HexagonSubtarget>();
+ HII = ST.getInstrInfo();
+ HRI = ST.getRegisterInfo();
+ MFN = &MF;
+ MRI = &MF.getRegInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+
+ // Clean up before any further processing, so that dead code does not
+ // get used in a newly generated "insert" instruction. Have a custom
+ // version of DCE that preserves lifetime markers. Without it, merging
+ // of stack objects can fail to recognize and merge disjoint objects
+ // leading to unnecessary stack growth.
+ Changed = removeDeadCode(MDT->getRootNode());
+
+ const HexagonEvaluator HE(*HRI, *MRI, *HII, MF);
+ BitTracker BTLoc(HE, MF);
+ BTLoc.trace(isDebug());
+ BTLoc.run();
+ CellMapShadow MS(BTLoc);
+ CMS = &MS; // MS is a local: CMS dangles once this function returns.
+
+ buildOrderingMF(BaseOrd);
+ buildOrderingBT(BaseOrd, CellOrd);
+
+ if (isDebug()) {
+ dbgs() << "Cell ordering:\n";
+ for (RegisterOrdering::iterator I = CellOrd.begin(), E = CellOrd.end();
+ I != E; ++I) {
+ unsigned VR = I->first, Pos = I->second;
+ dbgs() << PrintReg(VR, HRI) << " -> " << Pos << "\n";
+ }
+ }
+
+ // Collect candidates for conversion into the insert forms.
+ MachineBasicBlock *RootB = MDT->getRoot();
+ OrderedRegisterList AvailR(CellOrd);
+
+ const char *const TGName = "hexinsert";
+ const char *const TGDesc = "Generate Insert Instructions";
+
+ {
+ NamedRegionTimer _T("collection", "collection", TGName, TGDesc,
+ TimingDetail);
+ collectInBlock(RootB, AvailR);
+ // Complete the information gathered in IFMap.
+ computeRemovableRegisters();
+ }
+
+ if (isDebug()) {
+ dbgs() << "Candidates after collection:\n";
+ dump_map();
+ }
+
+ if (IFMap.empty())
+ return Changed; // No candidates: report only what the DCE did.
+
+ {
+ NamedRegionTimer _T("pruning", "pruning", TGName, TGDesc, TimingDetail);
+ pruneCandidates();
+ }
+
+ if (isDebug()) {
+ dbgs() << "Candidates after pruning:\n";
+ dump_map();
+ }
+
+ if (IFMap.empty())
+ return Changed;
+
+ {
+ NamedRegionTimer _T("selection", "selection", TGName, TGDesc, TimingDetail);
+ selectCandidates();
+ }
+
+ if (isDebug()) {
+ dbgs() << "Candidates after selection:\n";
+ dump_map();
+ }
+
+ // Filter out vregs beyond the cutoff.
+ if (VRegIndexCutoff.getPosition()) { // NOTE(review): presumably "option given".
+ unsigned Cutoff = VRegIndexCutoff;
+ typedef SmallVector<IFMapType::iterator,16> IterListType;
+ IterListType Out; // Collected first, erased afterwards.
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+ unsigned Idx = TargetRegisterInfo::virtReg2Index(I->first);
+ if (Idx >= Cutoff)
+ Out.push_back(I);
+ }
+ for (unsigned i = 0, n = Out.size(); i < n; ++i)
+ IFMap.erase(Out[i]);
+ }
+ if (IFMap.empty())
+ return Changed;
+
+ {
+ NamedRegionTimer _T("generation", "generation", TGName, TGDesc,
+ TimingDetail);
+ generateInserts();
+ }
+
+ return true; // IFMap was not empty, so instructions were generated.
+}
+
+// Factory: returns a newly allocated HexagonGenInsert pass; the caller
+// receives the raw pointer.
+FunctionPass *llvm::createHexagonGenInsert() {
+  FunctionPass *Pass = new HexagonGenInsert();
+  return Pass;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass registration (the public constructor function is defined above)
+//===----------------------------------------------------------------------===//
+
+INITIALIZE_PASS_BEGIN(HexagonGenInsert, "hexinsert",
+ "Hexagon generate \"insert\" instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) // Fetched via getAnalysis.
+INITIALIZE_PASS_END(HexagonGenInsert, "hexinsert",
+ "Hexagon generate \"insert\" instructions", false, false)
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
new file mode 100644
index 000000000000..a718df9c70ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -0,0 +1,351 @@
+//===--- HexagonGenMux.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// During instruction selection, MUX instructions are generated for
+// conditional assignments. Since such assignments often present an
+// opportunity to predicate instructions, HexagonExpandCondsets
+// expands MUXes into pairs of conditional transfers, and then proceeds
+// with predication of the producers/consumers of the registers involved.
+// This happens after exiting from the SSA form, but before the machine
+// instruction scheduler. After the scheduler and after the register
+// allocation there can be cases of pairs of conditional transfers
+// resulting from a MUX where neither of them was further predicated. If
+// these transfers are now placed far enough from the instruction defining
+// the predicate register, they cannot use the .new form. In such cases it
+// is better to collapse them back to a single MUX instruction.
+
+#define DEBUG_TYPE "hexmux"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <limits>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+namespace llvm {
+
+ FunctionPass *createHexagonGenMux(); // Factory, defined at the end of the file.
+ void initializeHexagonGenMuxPass(PassRegistry& Registry); // Called by the ctor.
+
+} // end namespace llvm
+
+namespace {
+
+  // Machine function pass that, block by block, looks for pairs of
+  // complementary conditional transfers into the same register and replaces
+  // such a pair with a single MUX instruction. The pass requires that no
+  // virtual registers remain (see getRequiredProperties).
+  class HexagonGenMux : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonGenMux() : MachineFunctionPass(ID) {
+      initializeHexagonGenMuxPass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon generate mux instructions";
+    }
+
+    // No analyses are requested beyond what the base class asks for.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      MachineFunctionProperties Props;
+      Props.set(MachineFunctionProperties::Property::NoVRegs);
+      return Props;
+    }
+
+  private:
+    // Both are set at the start of runOnMachineFunction.
+    const HexagonInstrInfo *HII = nullptr;
+    const HexagonRegisterInfo *HRI = nullptr;
+
+    // What is known so far about the conditional definitions of a single
+    // register: the predicate register, and the in-block indices of the
+    // if-true and if-false transfers. The maximum unsigned value stands for
+    // "not seen yet".
+    struct CondsetInfo {
+      unsigned PredR = 0;
+      unsigned TrueX = std::numeric_limits<unsigned>::max();
+      unsigned FalseX = std::numeric_limits<unsigned>::max();
+
+      CondsetInfo() = default;
+    };
+
+    // Registers defined and used by a single instruction, as bit vectors.
+    struct DefUseInfo {
+      BitVector Defs, Uses;
+
+      DefUseInfo() = default;
+      DefUseInfo(const BitVector &D, const BitVector &U) : Defs(D), Uses(U) {}
+    };
+
+    // A planned MUX: the insertion point, the defined and the predicate
+    // register, the if-true and if-false source operands, and the two
+    // conditional transfers that the MUX replaces.
+    struct MuxInfo {
+      MachineBasicBlock::iterator At;
+      unsigned DefR, PredR;
+      MachineOperand *SrcT, *SrcF;
+      MachineInstr *Def1, *Def2;
+
+      MuxInfo(MachineBasicBlock::iterator Where, unsigned Dst, unsigned Pred,
+              MachineOperand *TrueOp, MachineOperand *FalseOp,
+              MachineInstr &First, MachineInstr &Second)
+        : At(Where), DefR(Dst), PredR(Pred), SrcT(TrueOp), SrcF(FalseOp),
+          Def1(&First), Def2(&Second) {}
+    };
+
+    using InstrIndexMap = DenseMap<MachineInstr*,unsigned>;
+    using DefUseInfoMap = DenseMap<unsigned,DefUseInfo>;
+    using MuxInfoList = SmallVector<MuxInfo,4>;
+
+    // True if Reg belongs to the DoubleRegs register class.
+    bool isRegPair(unsigned Reg) const {
+      return Hexagon::DoubleRegsRegClass.contains(Reg);
+    }
+
+    void getSubRegs(unsigned Reg, BitVector &SRs) const;
+    void expandReg(unsigned Reg, BitVector &Set) const;
+    void getDefsUses(const MachineInstr *MI, BitVector &Defs,
+          BitVector &Uses) const;
+    void buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X,
+          DefUseInfoMap &DUM);
+    bool isCondTransfer(unsigned Opc) const;
+    unsigned getMuxOpcode(const MachineOperand &Src1,
+          const MachineOperand &Src2) const;
+    bool genMuxInBlock(MachineBasicBlock &B);
+  };
+
+  char HexagonGenMux::ID = 0;
+
+} // end anonymous namespace
+// Pass registration. The name used here differs from DEBUG_TYPE ("hexmux").
+INITIALIZE_PASS(HexagonGenMux, "hexagon-mux",
+ "Hexagon generate mux instructions", false, false)
+
+// Set the bit in SRs for every register that MCSubRegIterator enumerates
+// for Reg.
+void HexagonGenMux::getSubRegs(unsigned Reg, BitVector &SRs) const {
+  MCSubRegIterator SubI(Reg, HRI);
+  while (SubI.isValid()) {
+    SRs[*SubI] = true;
+    ++SubI;
+  }
+}
+
+// Record Reg in Set. A register pair is represented by its sub-registers;
+// any other register is represented by itself.
+void HexagonGenMux::expandReg(unsigned Reg, BitVector &Set) const {
+  if (!isRegPair(Reg)) {
+    Set[Reg] = true;
+    return;
+  }
+  getSubRegs(Reg, Set);
+}
+
+// Add the registers defined by MI to Defs and the registers used by MI to
+// Uses. Both bit vectors are only added to, never cleared here.
+void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs,
+      BitVector &Uses) const {
+  // The implicit registers come from the instruction descriptor. Either
+  // list may be absent; a present list ends at the first zero entry.
+  const MCInstrDesc &Desc = HII->get(MI->getOpcode());
+  for (const MCPhysReg *P = Desc.ImplicitDefs; P && *P; ++P)
+    expandReg(*P, Defs);
+  for (const MCPhysReg *P = Desc.ImplicitUses; P && *P; ++P)
+    expandReg(*P, Uses);
+
+  // The explicit register operands come from the instruction itself.
+  // Implicit operands are skipped here.
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    if (MO.isDef())
+      expandReg(MO.getReg(), Defs);
+    else
+      expandReg(MO.getReg(), Uses);
+  }
+}
+
+// Number the instructions of B consecutively from 0. For each instruction,
+// store its number in I2X and its def/use register sets, keyed by that
+// number, in DUM.
+void HexagonGenMux::buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X,
+      DefUseInfoMap &DUM) {
+  const unsigned NumRegs = HRI->getNumRegs();
+  // Scratch vectors, cleared and refilled for every instruction.
+  BitVector Defs(NumRegs), Uses(NumRegs);
+
+  unsigned Pos = 0;
+  for (MachineInstr &MI : B) {
+    I2X.insert(std::make_pair(&MI, Pos));
+    Defs.reset();
+    Uses.reset();
+    getDefsUses(&MI, Defs, Uses);
+    DUM.insert(std::make_pair(Pos, DefUseInfo(Defs, Uses)));
+    ++Pos;
+  }
+}
+
+// True for the four conditional transfer opcodes this pass can merge:
+// A2_tfrt, A2_tfrf, C2_cmoveit and C2_cmoveif.
+bool HexagonGenMux::isCondTransfer(unsigned Opc) const {
+  return Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf ||
+         Opc == Hexagon::C2_cmoveit || Opc == Hexagon::C2_cmoveif;
+}
+
+// Pick the MUX opcode for the given pair of source operands, based on which
+// of them are registers. Returns 0 if no MUX form is available.
+unsigned HexagonGenMux::getMuxOpcode(const MachineOperand &Src1,
+      const MachineOperand &Src2) const {
+  const bool FirstIsReg = Src1.isReg();
+  const bool SecondIsReg = Src2.isReg();
+
+  if (FirstIsReg && SecondIsReg)
+    return Hexagon::C2_mux;
+  if (FirstIsReg)
+    return Hexagon::C2_muxir;
+  if (SecondIsReg)
+    return Hexagon::C2_muxri;
+
+  // No registers at all. The first source is extendable, the second one is
+  // not: it has to be an immediate that fits in 8 signed bits (s8).
+  if (!Src2.isImm() || !isInt<8>(Src2.getImm()))
+    return 0;
+  return Hexagon::C2_muxii;
+}
+
+// Replace pairs of complementary conditional transfers in B with MUX
+// instructions. Returns true if B was changed.
+//
+// The block is scanned once. For each destination register, the most recent
+// if-true and if-false transfer guarded by the same predicate register are
+// remembered. When both are known, the pair is queued for replacement,
+// unless
+//  - the predicate register is defined within the 4 instructions preceding
+//    the later transfer,
+//  - the predicate or the destination register is defined, or the
+//    destination is used, between the two transfers,
+//  - the MUX can be placed at neither of the two transfers, or
+//  - there is no MUX opcode for the given source operands.
+// The queued pairs are rewritten after the scan is complete.
+//
+// Every transfer takes part in at most one queued pair: the record for a
+// register is dropped as soon as its pair is queued. Otherwise a third
+// transfer to the same register would be paired with a transfer that is
+// already scheduled for deletion, and the rewriting loop would then use
+// operands of (and erase again) an instruction that no longer exists.
+bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
+  bool Changed = false;
+  InstrIndexMap I2X;
+  DefUseInfoMap DUM;
+  buildMaps(B, I2X, DUM);
+
+  typedef DenseMap<unsigned,CondsetInfo> CondsetMap;
+  CondsetMap CM;
+  MuxInfoList ML;
+
+  MachineBasicBlock::iterator NextI, End = B.end();
+  for (MachineBasicBlock::iterator I = B.begin(); I != End; I = NextI) {
+    MachineInstr *MI = &*I;
+    NextI = std::next(I);
+    unsigned Opc = MI->getOpcode();
+    if (!isCondTransfer(Opc))
+      continue;
+    unsigned DR = MI->getOperand(0).getReg();
+    if (isRegPair(DR))
+      continue;
+
+    unsigned PR = MI->getOperand(1).getReg();
+    unsigned Idx = I2X.lookup(MI);
+    CondsetMap::iterator F = CM.find(DR);
+    bool IfTrue = HII->isPredicatedTrue(Opc);
+
+    // If there is no record of a conditional transfer for this register,
+    // or the predicate register differs, create a new record for it.
+    if (F != CM.end() && F->second.PredR != PR) {
+      CM.erase(F);
+      F = CM.end();
+    }
+    if (F == CM.end()) {
+      auto It = CM.insert(std::make_pair(DR, CondsetInfo()));
+      F = It.first;
+      F->second.PredR = PR;
+    }
+    CondsetInfo &CI = F->second;
+    if (IfTrue)
+      CI.TrueX = Idx;
+    else
+      CI.FalseX = Idx;
+    if (CI.TrueX == std::numeric_limits<unsigned>::max() ||
+        CI.FalseX == std::numeric_limits<unsigned>::max())
+      continue;
+
+    // There is now a complete definition of DR, i.e. we have the predicate
+    // register, the definition if-true, and definition if-false.
+
+    // First, check if both definitions are far enough from the definition
+    // of the predicate register: look at the (up to) 4 instructions that
+    // precede the later of the two transfers.
+    unsigned MinX = std::min(CI.TrueX, CI.FalseX);
+    unsigned MaxX = std::max(CI.TrueX, CI.FalseX);
+    unsigned SearchX = (MaxX > 4) ? MaxX-4 : 0;
+    bool NearDef = false;
+    for (unsigned X = SearchX; X < MaxX; ++X) {
+      const DefUseInfo &DU = DUM.lookup(X);
+      if (!DU.Defs[PR])
+        continue;
+      NearDef = true;
+      break;
+    }
+    if (NearDef)
+      continue;
+
+    // The predicate register is not defined in the last few instructions.
+    // Check if the conversion to MUX is possible (either "up", i.e. at the
+    // place of the earlier partial definition, or "down", where the later
+    // definition is located). Examine all defs and uses between these two
+    // definitions.
+    // SR1, SR2 - source registers from the first and the second definition
+    // (0 if the source is not a register).
+    MachineBasicBlock::iterator It1 = B.begin(), It2 = B.begin();
+    std::advance(It1, MinX);
+    std::advance(It2, MaxX);
+    MachineInstr &Def1 = *It1, &Def2 = *It2;
+    MachineOperand *Src1 = &Def1.getOperand(2), *Src2 = &Def2.getOperand(2);
+    unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0;
+    unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0;
+    bool Failure = false, CanUp = true, CanDown = true;
+    for (unsigned X = MinX+1; X < MaxX; X++) {
+      const DefUseInfo &DU = DUM.lookup(X);
+      if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) {
+        Failure = true;
+        break;
+      }
+      // Moving the first source down past its redefinition, or the second
+      // source up above its definition, would read the wrong value.
+      if (CanDown && DU.Defs[SR1])
+        CanDown = false;
+      if (CanUp && DU.Defs[SR2])
+        CanUp = false;
+    }
+    if (Failure || (!CanUp && !CanDown))
+      continue;
+
+    MachineOperand *SrcT = (MinX == CI.TrueX) ? Src1 : Src2;
+    MachineOperand *SrcF = (MinX == CI.FalseX) ? Src1 : Src2;
+    // Without a MUX opcode for these operands nothing would be generated
+    // for this pair. Leave the record alone in that case, so that a later
+    // transfer can still be paired with one of these two.
+    if (!getMuxOpcode(*SrcT, *SrcF))
+      continue;
+
+    // Prefer "down", since this will move the MUX farther away from the
+    // predicate definition.
+    MachineBasicBlock::iterator At = CanDown ? Def2 : Def1;
+    ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2));
+
+    // Both transfers are now going to be deleted. Forget about them, so
+    // that neither can become a part of another pair. CI refers into the
+    // erased entry and must not be used past this point.
+    CM.erase(F);
+  }
+
+  for (MuxInfo &MX : ML) {
+    MachineBasicBlock &MBB = *MX.At->getParent();
+    DebugLoc DL = MX.At->getDebugLoc();
+    unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF);
+    if (!MxOpc)
+      continue;
+    BuildMI(MBB, MX.At, DL, HII->get(MxOpc), MX.DefR)
+      .addReg(MX.PredR)
+      .addOperand(*MX.SrcT)
+      .addOperand(*MX.SrcF);
+    MBB.erase(MX.Def1);
+    MBB.erase(MX.Def2);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+// Pass entry point: unless the function is to be skipped, cache the Hexagon
+// instruction and register info and process each basic block in turn.
+// Returns true if any block was changed.
+bool HexagonGenMux::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const auto &ST = MF.getSubtarget<HexagonSubtarget>();
+  HII = ST.getInstrInfo();
+  HRI = ST.getRegisterInfo();
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    if (genMuxInBlock(MBB))
+      Changed = true;
+  }
+  return Changed;
+}
+
+// Factory: returns a newly allocated HexagonGenMux pass; the caller receives
+// the raw pointer.
+FunctionPass *llvm::createHexagonGenMux() {
+  FunctionPass *Pass = new HexagonGenMux();
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
new file mode 100644
index 000000000000..f14c733dcf51
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -0,0 +1,538 @@
+//===--- HexagonGenPredicate.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gen-pred"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <iterator>
+#include <map>
+#include <queue>
+#include <set>
+#include <utility>
+
+using namespace llvm;
+
+namespace llvm {
+
+ void initializeHexagonGenPredicatePass(PassRegistry& Registry); // Used by ctor.
+ FunctionPass *createHexagonGenPredicate(); // Factory function for the pass.
+
+} // end namespace llvm
+
+namespace {
+
+  // A register number together with a subregister index. It converts
+  // implicitly from a machine operand, and it is ordered so that it can be
+  // used as a key in std::set and std::map.
+  struct Register {
+    unsigned R, S;
+
+    Register(unsigned r = 0, unsigned s = 0) : R(r), S(s) {}
+    Register(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {}
+
+    bool operator== (const Register &Reg) const {
+      if (R != Reg.R)
+        return false;
+      return S == Reg.S;
+    }
+
+    // Lexicographic order: by register first, then by subregister index.
+    bool operator< (const Register &Reg) const {
+      if (R != Reg.R)
+        return R < Reg.R;
+      return S < Reg.S;
+    }
+  };
+
+  // Helper for printing a Register to a raw_ostream via PrintReg.
+  struct PrintRegister {
+    friend raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR);
+
+    PrintRegister(Register R, const TargetRegisterInfo &I) : Reg(R), TRI(I) {}
+
+  private:
+    Register Reg;
+    const TargetRegisterInfo &TRI;
+  };
+
+  // Declared separately from the definition so that the "unused" attribute
+  // can be attached to it.
+  raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR)
+    LLVM_ATTRIBUTE_UNUSED;
+  raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) {
+    return OS << PrintReg(PR.Reg.R, &PR.TRI, PR.Reg.S);
+  }
+
+  // Machine function pass declaring the helpers used to convert operations
+  // on general purpose registers that hold predicate values into operations
+  // on predicate registers. It requires and preserves the machine dominator
+  // tree.
+  class HexagonGenPredicate : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonGenPredicate() : MachineFunctionPass(ID) {
+      initializeHexagonGenPredicatePass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon generate predicate operations";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  private:
+    using VectOfInst = SetVector<MachineInstr*>;
+    using SetOfReg = std::set<Register>;
+    using RegToRegMap = std::map<Register,Register>;
+
+    const HexagonInstrInfo *TII = nullptr;
+    const HexagonRegisterInfo *TRI = nullptr;
+    MachineRegisterInfo *MRI = nullptr;
+    // Virtual registers defined by a copy/transfer from a predicate register
+    // (filled in by collectPredicateGPR).
+    SetOfReg PredGPRs;
+    // Convertible instructions that use such registers (filled in by
+    // processPredicateGPR).
+    VectOfInst PUsers;
+    // Register -> predicate register associations made by getPredRegFor.
+    RegToRegMap G2P;
+
+    bool isPredReg(unsigned R);
+    void collectPredicateGPR(MachineFunction &MF);
+    void processPredicateGPR(const Register &Reg);
+    unsigned getPredForm(unsigned Opc);
+    bool isConvertibleToPredForm(const MachineInstr *MI);
+    bool isScalarCmp(unsigned Opc);
+    bool isScalarPred(Register PredReg);
+    Register getPredRegFor(const Register &Reg);
+    bool convertToPredForm(MachineInstr *MI);
+    bool eliminatePredCopies(MachineFunction &MF);
+  };
+
+  char HexagonGenPredicate::ID = 0;
+
+} // end anonymous namespace
+// Pass registration; the dependency matches what getAnalysisUsage requires.
+INITIALIZE_PASS_BEGIN(HexagonGenPredicate, "hexagon-gen-pred",
+ "Hexagon generate predicate operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred",
+ "Hexagon generate predicate operations", false, false)
+
+// True if R is a virtual register whose register class is exactly PredRegs.
+bool HexagonGenPredicate::isPredReg(unsigned R) {
+  return TargetRegisterInfo::isVirtualRegister(R) &&
+         MRI->getRegClass(R) == &Hexagon::PredRegsRegClass;
+}
+
+// Map the opcode of a logical operation (or of a transfer into a predicate
+// register) to the opcode of its predicate-register counterpart. Returns 0
+// if Opc has no such counterpart.
+unsigned HexagonGenPredicate::getPredForm(unsigned Opc) {
+  using namespace Hexagon;
+
+  // The opcode corresponding to 0 is TargetOpcode::PHI. We can use 0 here
+  // to denote "none", but we need to make sure that none of the valid opcodes
+  // that we return will ever be 0.
+  static_assert(PHI == 0, "Use different value for <none>");
+
+  struct FormEntry {
+    unsigned GPROpc;   // Opcode to convert.
+    unsigned PredOpc;  // Its predicate form.
+  };
+  static const FormEntry Forms[] = {
+    { A2_and,      C2_and      },
+    { A2_andp,     C2_and      },
+    { A4_andn,     C2_andn     },
+    { A4_andnp,    C2_andn     },
+    { M4_and_and,  C4_and_and  },
+    { M4_and_andn, C4_and_andn },
+    { M4_and_or,   C4_and_or   },
+
+    { A2_or,       C2_or       },
+    { A2_orp,      C2_or       },
+    { A4_orn,      C2_orn      },
+    { A4_ornp,     C2_orn      },
+    { M4_or_and,   C4_or_and   },
+    { M4_or_andn,  C4_or_andn  },
+    { M4_or_or,    C4_or_or    },
+
+    { A2_xor,      C2_xor      },
+    { A2_xorp,     C2_xor      },
+
+    { C2_tfrrp,    COPY        },
+  };
+
+  for (const FormEntry &F : Forms)
+    if (F.GPROpc == Opc)
+      return F.PredOpc;
+  return 0;
+}
+
+// True if MI can be rewritten to work on predicate registers: either its
+// opcode has a predicate form (see getPredForm), or it is a C2_cmpeqi or
+// C4_cmpneqi whose operand 2 is the immediate 0.
+bool HexagonGenPredicate::isConvertibleToPredForm(const MachineInstr *MI) {
+  const unsigned Opc = MI->getOpcode();
+  if (getPredForm(Opc) != 0)
+    return true;
+
+  // Comparisons against 0 are also convertible. This does not apply to
+  // A4_rcmpeqi or A4_rcmpneqi, since they produce values 0 or 1, which
+  // may not match the value that the predicate register would have if
+  // it was converted to a predicate form.
+  if (Opc != Hexagon::C2_cmpeqi && Opc != Hexagon::C4_cmpneqi)
+    return false;
+  const MachineOperand &Rhs = MI->getOperand(2);
+  return Rhs.isImm() && Rhs.getImm() == 0;
+}
+
+// Walk all instructions of MF and add to PredGPRs every virtual register
+// that is the destination of a C2_tfrpr or a COPY whose source (operand 1)
+// is a predicate register.
+void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) {
+  for (MachineBasicBlock &B : MF) {
+    for (MachineInstr &MI : B) {
+      const unsigned Opc = MI.getOpcode();
+      if (Opc != Hexagon::C2_tfrpr && Opc != TargetOpcode::COPY)
+        continue;
+      if (!isPredReg(MI.getOperand(1).getReg()))
+        continue;
+      Register Dst = MI.getOperand(0);
+      if (TargetRegisterInfo::isVirtualRegister(Dst.R))
+        PredGPRs.insert(Dst);
+    }
+  }
+}
+
+// Examine the uses of Reg.R. If there are none, erase the instruction that
+// defines the register. Otherwise add every using instruction that can be
+// converted to a predicate form to PUsers.
+void HexagonGenPredicate::processPredicateGPR(const Register &Reg) {
+  DEBUG(dbgs() << __func__ << ": "
+               << PrintReg(Reg.R, TRI, Reg.S) << "\n");
+  typedef MachineRegisterInfo::use_iterator use_iterator;
+  use_iterator UseI = MRI->use_begin(Reg.R);
+  const use_iterator UseE = MRI->use_end();
+
+  if (UseI == UseE) {
+    DEBUG(dbgs() << "Dead reg: " << PrintReg(Reg.R, TRI, Reg.S) << '\n');
+    MRI->getVRegDef(Reg.R)->eraseFromParent();
+    return;
+  }
+
+  while (UseI != UseE) {
+    MachineInstr *User = UseI->getParent();
+    if (isConvertibleToPredForm(User))
+      PUsers.insert(User);
+    ++UseI;
+  }
+}
+
+// Return the predicate register associated with the virtual register Reg,
+// creating the association (recorded in G2P) on first request:
+//  - if Reg is defined by a C2_tfrpr or a COPY, the source of that
+//    instruction is used directly;
+//  - if Reg is defined by an instruction convertible to a predicate form,
+//    a new predicate register is created and a COPY from Reg into it is
+//    inserted right after the defining instruction.
+// Any other kind of definition is not a valid argument.
+Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
+  assert(TargetRegisterInfo::isVirtualRegister(Reg.R));
+  RegToRegMap::iterator Known = G2P.find(Reg);
+  if (Known != G2P.end())
+    return Known->second;
+
+  DEBUG(dbgs() << __func__ << ": " << PrintRegister(Reg, *TRI));
+  MachineInstr *DefI = MRI->getVRegDef(Reg.R);
+  assert(DefI);
+  const unsigned DefOpc = DefI->getOpcode();
+  const bool IsTransfer = DefOpc == Hexagon::C2_tfrpr ||
+                          DefOpc == TargetOpcode::COPY;
+  if (IsTransfer) {
+    assert(DefI->getOperand(0).isDef() && DefI->getOperand(1).isUse());
+    Register SrcPR = DefI->getOperand(1);
+    G2P.insert(std::make_pair(Reg, SrcPR));
+    DEBUG(dbgs() << " -> " << PrintRegister(SrcPR, *TRI) << '\n');
+    return SrcPR;
+  }
+
+  MachineBasicBlock &B = *DefI->getParent();
+  DebugLoc DL = DefI->getDebugLoc();
+  const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+  unsigned NewPR = MRI->createVirtualRegister(PredRC);
+
+  if (!isConvertibleToPredForm(DefI))
+    llvm_unreachable("Invalid argument");
+
+  // The convertible instruction itself is left untouched, so that it can
+  // still be converted later. Only a copy from Reg to NewPR is added.
+  MachineBasicBlock::iterator DefPos = DefI;
+  BuildMI(B, std::next(DefPos), DL, TII->get(TargetOpcode::COPY), NewPR)
+    .addReg(Reg.R, 0, Reg.S);
+  G2P.insert(std::make_pair(Reg, Register(NewPR)));
+  DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI) << '\n');
+  return Register(NewPR);
+}
+
+// True if Opc is one of the compare opcodes listed below (the callers treat
+// these as the scalar, i.e. non-vector, compares).
+bool HexagonGenPredicate::isScalarCmp(unsigned Opc) {
+  static const unsigned ScalarCmps[] = {
+    Hexagon::C2_cmpeq,    Hexagon::C2_cmpgt,    Hexagon::C2_cmpgtu,
+    Hexagon::C2_cmpeqp,   Hexagon::C2_cmpgtp,   Hexagon::C2_cmpgtup,
+    Hexagon::C2_cmpeqi,   Hexagon::C2_cmpgti,   Hexagon::C2_cmpgtui,
+    Hexagon::C2_cmpgei,   Hexagon::C2_cmpgeui,  Hexagon::C4_cmpneqi,
+    Hexagon::C4_cmpltei,  Hexagon::C4_cmplteui, Hexagon::C4_cmpneq,
+    Hexagon::C4_cmplte,   Hexagon::C4_cmplteu,  Hexagon::A4_cmpbeq,
+    Hexagon::A4_cmpbeqi,  Hexagon::A4_cmpbgtu,  Hexagon::A4_cmpbgtui,
+    Hexagon::A4_cmpbgt,   Hexagon::A4_cmpbgti,  Hexagon::A4_cmpheq,
+    Hexagon::A4_cmphgt,   Hexagon::A4_cmphgtu,  Hexagon::A4_cmpheqi,
+    Hexagon::A4_cmphgti,  Hexagon::A4_cmphgtui,
+  };
+
+  for (unsigned Cmp : ScalarCmps)
+    if (Cmp == Opc)
+      return true;
+  return false;
+}
+
+// Return true if PredReg is a "scalar" predicate, that is, if its value is
+// built exclusively from scalar compares, possibly combined through
+// predicate logical operations and copies between predicate registers.
+//
+// The definitions are walked with a work queue, starting at PredReg. The
+// answer is "false" as soon as a register without a definition, a COPY into
+// a non-predicate register, or a definition that is neither one of the
+// accepted logical operations nor a scalar compare is found. The answer is
+// "true" only after the whole queue has been processed: finding one scalar
+// compare says nothing about the operands that are still waiting in the
+// queue, so it must not end the search.
+bool HexagonGenPredicate::isScalarPred(Register PredReg) {
+  std::queue<Register> WorkQ;
+  WorkQ.push(PredReg);
+
+  while (!WorkQ.empty()) {
+    Register PR = WorkQ.front();
+    WorkQ.pop();
+    const MachineInstr *DefI = MRI->getVRegDef(PR.R);
+    if (!DefI)
+      return false;
+    unsigned DefOpc = DefI->getOpcode();
+    switch (DefOpc) {
+      case TargetOpcode::COPY: {
+        const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+        if (MRI->getRegClass(PR.R) != PredRC)
+          return false;
+        // If it is a copy between two predicate registers, fall through.
+      }
+      case Hexagon::C2_and:
+      case Hexagon::C2_andn:
+      case Hexagon::C4_and_and:
+      case Hexagon::C4_and_andn:
+      case Hexagon::C4_and_or:
+      case Hexagon::C2_or:
+      case Hexagon::C2_orn:
+      case Hexagon::C4_or_and:
+      case Hexagon::C4_or_andn:
+      case Hexagon::C4_or_or:
+      case Hexagon::C4_or_orn:
+      case Hexagon::C2_xor:
+        // Add operands to the queue.
+        for (const MachineOperand &MO : DefI->operands())
+          if (MO.isReg() && MO.isUse())
+            WorkQ.push(Register(MO.getReg()));
+        break;
+
+      // All non-vector compares are ok, everything else is bad. A good
+      // compare only settles this one input; keep checking the others.
+      default:
+        if (!isScalarCmp(DefOpc))
+          return false;
+        break;
+    }
+  }
+
+  return true;
+}
+
+/// Rewrite \p MI, which must satisfy isConvertibleToPredForm, into its
+/// predicate-register form.
+///
+/// The conversion is abandoned (false is returned and \p MI is left in
+/// place) when a register use of \p MI carries a subregister other than
+/// isub_lo or is not in PredGPRs, when no predicate opcode is available, or
+/// when the compare-against-zero special case meets a predicate that
+/// isScalarPred rejects.
+///
+/// On success a predicate-form instruction defining a fresh PredRegs
+/// virtual register is built in front of \p MI, its value is copied into a
+/// new register of the old result's class, every use of the old result is
+/// redirected to that copy, \p MI is erased, and true is returned.
+bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
+  DEBUG(dbgs() << __func__ << ": " << MI << " " << *MI);
+
+  unsigned OrigOpc = MI->getOpcode();
+  assert(isConvertibleToPredForm(MI));
+
+  // Every register use has to be a GPR known to hold a predicate value,
+  // accessed either as a whole or through isub_lo.
+  unsigned NumOps = MI->getNumOperands();
+  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
+    MachineOperand &Opnd = MI->getOperand(Idx);
+    if (Opnd.isReg() && Opnd.isUse()) {
+      Register UseR(Opnd);
+      if (UseR.S && UseR.S != Hexagon::isub_lo)
+        return false;
+      if (!PredGPRs.count(UseR))
+        return false;
+    }
+  }
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  unsigned PredOpc = getPredForm(OrigOpc);
+  if (PredOpc == 0) {
+    // No direct predicate form: only the comparisons against 0 are handled.
+    if (OrigOpc == Hexagon::C2_cmpeqi)
+      PredOpc = Hexagon::C2_not;
+    else if (OrigOpc == Hexagon::C4_cmpneqi)
+      PredOpc = TargetOpcode::COPY;
+    else
+      return false;
+
+    // All bits of a scalar predicate register are the same. For anything
+    // else, telling "all bits are 0" from "some bit is set" would require
+    // any8, so give up.
+    Register SrcPred = getPredRegFor(MI->getOperand(1));
+    if (!isScalarPred(SrcPred))
+      return false;
+    // Limit the operand copy loop below to operand #1, which leaves out the
+    // immediate argument.
+    NumOps = 2;
+  }
+
+  // Sanity check: the definition is expected in operand #0.
+  MachineOperand &DefOpnd = MI->getOperand(0);
+  assert(DefOpnd.isDef());
+  Register OutR(DefOpnd);
+
+  // The result register is created directly rather than via getPredRegFor,
+  // since the latter would create an association between its argument and
+  // the new predicate register (i.e. it would insert a copy).
+  const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+  Register NewPR = MRI->createVirtualRegister(PredRC);
+  MachineInstrBuilder MIB = BuildMI(B, MI, DL, TII->get(PredOpc), NewPR.R);
+
+  // Feed the new instruction the predicate counterparts of the GPR inputs.
+  for (unsigned Idx = 1; Idx < NumOps; ++Idx) {
+    Register InGPR = MI->getOperand(Idx);
+    Register InPred = getPredRegFor(InGPR);
+    MIB.addReg(InPred.R, 0, InPred.S);
+  }
+  DEBUG(dbgs() << "generated: " << *MIB);
+
+  // Copy-out: NewOutR = NewPR, then make every user of the old result read
+  // NewOutR instead.
+  const TargetRegisterClass *OutRC = MRI->getRegClass(OutR.R);
+  unsigned NewOutR = MRI->createVirtualRegister(OutRC);
+  BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), NewOutR)
+      .addReg(NewPR.R, 0, NewPR.S);
+  MRI->replaceRegWith(OutR.R, NewOutR);
+  MI->eraseFromParent();
+
+  // If the processed instruction was C2_tfrrp (i.e. Rn = Pm; Pk = Rn), the
+  // output is itself a predicate register, and its users are not visited.
+  if (isPredReg(NewOutR))
+    return true;
+
+  Register OutGPR(NewOutR);
+  PredGPRs.insert(OutGPR);
+  processPredicateGPR(OutGPR);
+  return true;
+}
+
+/// Remove COPY instructions whose source and destination are both virtual
+/// registers of the PredRegs class.
+///
+/// Sequences of the form
+///     IntR   = PredR1
+///     PredR2 = IntR
+/// get rewritten into "PredR2 = PredR1" when a copy-into-pred reads a gpr
+/// that held the result of a convertible instruction: once that instruction
+/// is converted, its predicate result is copied back into the original gpr.
+/// For each such predicate-to-predicate copy, all references to the
+/// destination are replaced with the source and the copy is erased.
+///
+/// \returns true if at least one copy was eliminated.
+bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
+  DEBUG(dbgs() << __func__ << "\n");
+  const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+  VectOfInst Erase;
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != TargetOpcode::COPY)
+        continue;
+      Register DR = MI.getOperand(0);
+      Register SR = MI.getOperand(1);
+
+      // Register classes are only queried once both sides are known to be
+      // virtual registers.
+      bool BothVirtual = TargetRegisterInfo::isVirtualRegister(DR.R) &&
+                         TargetRegisterInfo::isVirtualRegister(SR.R);
+      if (!BothVirtual)
+        continue;
+      bool BothPred = MRI->getRegClass(DR.R) == PredRC &&
+                      MRI->getRegClass(SR.R) == PredRC;
+      if (!BothPred)
+        continue;
+
+      assert(!DR.S && !SR.S && "Unexpected subregister");
+      MRI->replaceRegWith(DR.R, SR.R);
+      // Erasing is postponed until the instruction walk has finished.
+      Erase.insert(&MI);
+      Changed = true;
+    }
+  }
+
+  for (MachineInstr *DeadCopy : Erase)
+    DeadCopy->eraseFromParent();
+
+  return Changed;
+}
+
+/// Pass entry point.
+///
+/// Resets the per-function state, collects the predicate-holding GPRs and
+/// processes each of them, then repeatedly tries to convert every
+/// instruction in PUsers to predicate form until a full round converts
+/// nothing. Finally the predicate-to-predicate copies are eliminated.
+///
+/// \returns true if any instruction was converted or any copy was removed;
+/// false as well when skipFunction asks for the function to be skipped.
+bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+  TRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  PredGPRs.clear();
+  PUsers.clear();
+  G2P.clear();
+
+  bool Changed = false;
+  collectPredicateGPR(MF);
+  for (SetOfReg::iterator I = PredGPRs.begin(), E = PredGPRs.end(); I != E; ++I)
+    processPredicateGPR(*I);
+
+  bool Again;
+  do {
+    Again = false;
+    VectOfInst Processed, Copy;
+
+    // Iterate over a snapshot: the loop below walks Copy, not PUsers
+    // itself.
+    typedef VectOfInst::iterator iterator;
+    Copy = PUsers;
+    for (iterator I = Copy.begin(), E = Copy.end(); I != E; ++I) {
+      MachineInstr *MI = *I;
+      bool Done = convertToPredForm(MI);
+      if (Done) {
+        Processed.insert(MI);
+        Again = true;
+      }
+    }
+    Changed |= Again;
+
+    // Capture Processed by reference: the predicate is only used within
+    // this iteration, so copying the whole set into the closure is wasted
+    // work.
+    auto Done = [&Processed] (MachineInstr *MI) -> bool {
+      return Processed.count(MI);
+    };
+    PUsers.remove_if(Done);
+  } while (Again);
+
+  Changed |= eliminatePredCopies(MF);
+  return Changed;
+}
+
+FunctionPass *llvm::createHexagonGenPredicate() { // Factory for the HexagonGenPredicate pass.
+ return new HexagonGenPredicate(); // Returned as a raw pointer; nothing here retains it.
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
new file mode 100644
index 000000000000..e477dcc0f64a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -0,0 +1,1980 @@
+//===-- HexagonHardwareLoops.cpp - Identify and generate hardware loops ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the Hexagon hardware
+// loop instruction. The hardware loop can perform loop branches with a
+// zero-cycle overhead.
+//
+// The pattern that defines the induction variable can change depending on
+// prior optimizations. For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+// The pattern detected by this phase is due to running Strength Reduction.
+//
+// Criteria for hardware loops:
+// - Countable loops (w/ ind. var for a trip count)
+// - Assumes loops are normalized by IndVarSimplify
+// - Try inner-most loops first
+// - No function calls in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hwloops"
+
+#ifndef NDEBUG // The two options in this guard exist only when NDEBUG is not defined.
+static cl::opt<int> HWLoopLimit("hexagon-max-hwloop", cl::Hidden, cl::init(-1));
+
+// Debug option: create a preheader only for the function with this name (empty by default).
+static cl::opt<std::string> PHFn("hexagon-hwloop-phfn", cl::Hidden,
+ cl::init(""));
+#endif
+
+// Create a preheader for a hardware loop that does not have one (on by default).
+static cl::opt<bool> HWCreatePreheader("hexagon-hwloop-preheader",
+ cl::Hidden, cl::init(true),
+ cl::desc("Add a preheader to a hardware loop if one doesn't exist"));
+
+// Speculation of preheader instructions; off by default. If a preheader block is not created here, the
+// software pipeliner may be unable to find a block suitable to serve as
+// a preheader. In that case SWP will not run.
+static cl::opt<bool> SpecPreheader("hwloop-spec-preheader", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Allow speculation of preheader "
+ "instructions"));
+
+STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
+
+namespace llvm {
+
+ FunctionPass *createHexagonHardwareLoops();
+ void initializeHexagonHardwareLoopsPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+ class CountValue;
+
+ struct HexagonHardwareLoops : public MachineFunctionPass {
+ MachineLoopInfo *MLI; // MLI, MRI, MDT and TII are assigned in runOnMachineFunction.
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *MDT;
+ const HexagonInstrInfo *TII;
+#ifndef NDEBUG
+ static int Counter;
+#endif
+
+ public:
+ static char ID;
+
+ HexagonHardwareLoops() : MachineFunctionPass(ID) {
+ initializeHexagonHardwareLoopsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "Hexagon Hardware Loops"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ typedef std::map<unsigned, MachineInstr *> LoopFeederMap;
+
+ /// Kinds of comparisons in the compare instructions. Each kind is a bit mask built from the flags below.
+ struct Comparison {
+ enum Kind {
+ EQ = 0x01,
+ NE = 0x02,
+ L = 0x04,
+ G = 0x08,
+ U = 0x40, // Unsigned flag; OR-ed into the *u kinds below.
+ LTs = L,
+ LEs = L | EQ,
+ GTs = G,
+ GEs = G | EQ,
+ LTu = L | U,
+ LEu = L | EQ | U,
+ GTu = G | U,
+ GEu = G | EQ | U
+ };
+
+ static Kind getSwappedComparison(Kind Cmp) { // Operands swapped: flips L <-> G, all other bits are kept.
+ assert ((!((Cmp & L) && (Cmp & G))) && "Malformed comparison operator");
+ if ((Cmp & L) || (Cmp & G))
+ return (Kind)(Cmp ^ (L|G));
+ return Cmp;
+ }
+
+ static Kind getNegatedComparison(Kind Cmp) { // Logical negation, e.g. LTs -> GEs, EQ -> NE.
+ if ((Cmp & L) || (Cmp & G))
+ return (Kind)((Cmp ^ (L | G)) ^ EQ);
+ if ((Cmp & NE) || (Cmp & EQ))
+ return (Kind)(Cmp ^ (EQ | NE));
+ return (Kind)0; // None of EQ, NE, L, G is set.
+ }
+
+ static bool isSigned(Kind Cmp) {
+ return (Cmp & (L | G) && !(Cmp & U));
+ }
+
+ static bool isUnsigned(Kind Cmp) {
+ return (Cmp & U);
+ }
+ };
+
+ /// \brief Find the register that contains the loop controlling
+ /// induction variable.
+ /// If successful, it will return true and set the \p Reg, \p IVBump
+ /// and \p IVOp arguments. Otherwise it will return false.
+ /// The returned induction register is the register R that follows the
+ /// following induction pattern:
+ /// loop:
+ /// R = phi ..., [ R.next, LatchBlock ]
+ /// R.next = R + #bump
+ /// if (R.next < #N) goto loop
+ /// IVBump is the immediate value added to R, and IVOp is the instruction
+ /// "R.next = R + #bump".
+ bool findInductionRegister(MachineLoop *L, unsigned &Reg,
+ int64_t &IVBump, MachineInstr *&IVOp) const;
+
+ /// \brief Return the comparison kind for the specified opcode, or 0 for an opcode that is not handled.
+ Comparison::Kind getComparisonKind(unsigned CondOpc,
+ MachineOperand *InitialValue,
+ const MachineOperand *Endvalue,
+ int64_t IVBump) const;
+
+ /// \brief Analyze the statements in a loop to determine if the loop
+ /// has a computable trip count and, if so, return a value that represents
+ /// the trip count expression.
+ CountValue *getLoopTripCount(MachineLoop *L,
+ SmallVectorImpl<MachineInstr *> &OldInsts);
+
+ /// \brief Return the expression that represents the number of times
+ /// a loop iterates. The function takes the operands that represent the
+ /// loop start value, loop end value, and induction value. Based upon
+ /// these operands, the function attempts to compute the trip count.
+ /// If the trip count is not directly available (as an immediate value,
+ /// or a register), the function will attempt to insert computation of it
+ /// to the loop's preheader.
+ CountValue *computeCount(MachineLoop *Loop, const MachineOperand *Start,
+ const MachineOperand *End, unsigned IVReg,
+ int64_t IVBump, Comparison::Kind Cmp) const;
+
+ /// \brief Return true if the instruction is not valid within a hardware
+ /// loop.
+ bool isInvalidLoopOperation(const MachineInstr *MI,
+ bool IsInnerHWLoop) const;
+
+ /// \brief Return true if the loop contains an instruction that inhibits
+ /// using the hardware loop.
+ bool containsInvalidInstruction(MachineLoop *L, bool IsInnerHWLoop) const;
+
+ /// \brief Given a loop, check if we can convert it to a hardware loop.
+ /// If so, then perform the conversion and return true.
+ bool convertToHardwareLoop(MachineLoop *L, bool &L0used, bool &L1used);
+
+ /// \brief Return true if the instruction is now dead.
+ bool isDead(const MachineInstr *MI,
+ SmallVectorImpl<MachineInstr *> &DeadPhis) const;
+
+ /// \brief Remove the instruction if it is now dead.
+ void removeIfDead(MachineInstr *MI);
+
+ /// \brief Make sure that the "bump" instruction executes before the
+ /// compare. We need that for the IV fixup, so that the compare
+ /// instruction would not use a bumped value that has not yet been
+ /// defined. If the instructions are out of order, try to reorder them.
+ bool orderBumpCompare(MachineInstr *BumpI, MachineInstr *CmpI);
+
+ /// \brief Return true if the MO and MI pair is visited only once. If visited
+ /// more than once, this indicates there is recursion. In such a case,
+ /// return false.
+ bool isLoopFeeder(MachineLoop *L, MachineBasicBlock *A, MachineInstr *MI,
+ const MachineOperand *MO,
+ LoopFeederMap &LoopFeederPhi) const;
+
+ /// \brief Return true if the Phi may generate a value that may underflow,
+ /// or may wrap.
+ bool phiMayWrapOrUnderflow(MachineInstr *Phi, const MachineOperand *EndVal,
+ MachineBasicBlock *MBB, MachineLoop *L,
+ LoopFeederMap &LoopFeederPhi) const;
+
+ /// \brief Return true if the induction variable may underflow an unsigned
+ /// value in the first iteration.
+ bool loopCountMayWrapOrUnderFlow(const MachineOperand *InitVal,
+ const MachineOperand *EndVal,
+ MachineBasicBlock *MBB, MachineLoop *L,
+ LoopFeederMap &LoopFeederPhi) const;
+
+ /// \brief Check if the given operand has a compile-time known constant
+ /// value. Return true if yes, and false otherwise. When returning true, set
+ /// Val to the corresponding constant value.
+ bool checkForImmediate(const MachineOperand &MO, int64_t &Val) const;
+
+ /// \brief Check if the operand has a compile-time known constant value.
+ bool isImmediate(const MachineOperand &MO) const {
+ int64_t V;
+ return checkForImmediate(MO, V);
+ }
+
+ /// \brief Return the immediate for the specified operand. The operand must pass checkForImmediate.
+ int64_t getImmediate(const MachineOperand &MO) const {
+ int64_t V;
+ if (!checkForImmediate(MO, V))
+ llvm_unreachable("Invalid operand");
+ return V;
+ }
+
+ /// \brief Reset the given machine operand to now refer to a new immediate
+ /// value. Assumes that the operand was already referencing an immediate
+ /// value, either directly, or via a register.
+ void setImmediate(MachineOperand &MO, int64_t Val);
+
+ /// \brief Fix the data flow of the induction variable.
+ /// The desired flow is: phi ---> bump -+-> comparison-in-latch.
+ /// |
+ /// +-> back to phi
+ /// where "bump" is the increment of the induction variable:
+ /// iv = iv + #const.
+ /// Due to some prior code transformations, the actual flow may look
+ /// like this:
+ /// phi -+-> bump ---> back to phi
+ /// |
+ /// +-> comparison-in-latch (against upper_bound-bump),
+ /// i.e. the comparison that controls the loop execution may be using
+ /// the value of the induction variable from before the increment.
+ ///
+ /// Return true if the loop's flow is the desired one (i.e. it's
+ /// either been fixed, or no fixing was necessary).
+ /// Otherwise, return false. This can happen if the induction variable
+ /// couldn't be identified, or if the value in the latch's comparison
+ /// cannot be adjusted to reflect the post-bump value.
+ bool fixupInductionVariable(MachineLoop *L);
+
+ /// \brief Given a loop, if it does not have a preheader, create one.
+ /// Return the block that is the preheader.
+ MachineBasicBlock *createPreheaderForLoop(MachineLoop *L);
+ };
+
+ char HexagonHardwareLoops::ID = 0; // Handed to the MachineFunctionPass constructor.
+#ifndef NDEBUG
+ int HexagonHardwareLoops::Counter = 0; // Declared under the same NDEBUG guard in the struct.
+#endif
+
+ /// \brief Trip count of a loop, held either as a register (with a
+ /// subregister index) or as an unsigned immediate. It is a smaller
+ /// version of the MachineOperand class without the concerns of changing
+ /// the operand representation.
+ class CountValue {
+ public:
+   /// Selects which member of the payload is valid.
+   enum CountValueType { CV_Register, CV_Immediate };
+
+ private:
+   CountValueType Kind;
+   union Values {
+     struct {
+       unsigned Reg;
+       unsigned Sub;
+     } R;
+     unsigned ImmVal;
+   } Contents;
+
+ public:
+   /// Build a count of kind \p t. For CV_Register, \p v is the register
+   /// and \p u the subregister; for any other kind, \p v is stored as the
+   /// immediate and \p u is ignored.
+   explicit CountValue(CountValueType t, unsigned v, unsigned u = 0)
+       : Kind(t) {
+     if (Kind != CV_Register) {
+       Contents.ImmVal = v;
+       return;
+     }
+     Contents.R.Reg = v;
+     Contents.R.Sub = u;
+   }
+
+   bool isReg() const { return Kind == CV_Register; }
+   bool isImm() const { return Kind == CV_Immediate; }
+
+   /// Register holding the count; asserts that this is a register count.
+   unsigned getReg() const {
+     assert(isReg() && "Wrong CountValue accessor");
+     return Contents.R.Reg;
+   }
+
+   /// Subregister index of the count register; asserts a register count.
+   unsigned getSubReg() const {
+     assert(isReg() && "Wrong CountValue accessor");
+     return Contents.R.Sub;
+   }
+
+   /// Immediate count; asserts that this is an immediate count.
+   unsigned getImm() const {
+     assert(isImm() && "Wrong CountValue accessor");
+     return Contents.ImmVal;
+   }
+
+   /// Write the register (via PrintReg) or the immediate to \p OS.
+   void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const {
+     if (isReg())
+       OS << PrintReg(Contents.R.Reg, TRI, Contents.R.Sub);
+     else if (isImm())
+       OS << Contents.ImmVal;
+   }
+ };
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(HexagonHardwareLoops, "hwloops",
+ "Hexagon Hardware Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) // Mirrors addRequired<MachineDominatorTree>() in getAnalysisUsage.
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) // Mirrors addRequired<MachineLoopInfo>() in getAnalysisUsage.
+INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops",
+ "Hexagon Hardware Loops", false, false)
+
+FunctionPass *llvm::createHexagonHardwareLoops() { // Factory for the HexagonHardwareLoops pass.
+ return new HexagonHardwareLoops(); // Returned as a raw pointer; nothing here retains it.
+}
+
+/// Pass entry point: caches the analyses and target hooks used by the
+/// helpers, then attempts the hardware loop conversion on every loop in
+/// MachineLoopInfo's list that has no parent loop.
+///
+/// \returns true if convertToHardwareLoop reported a change for any loop;
+/// false as well when skipFunction asks for the function to be skipped.
+bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n");
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  MLI = &getAnalysis<MachineLoopInfo>();
+  MRI = &MF.getRegInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+
+  bool AnyConverted = false;
+  for (auto &TopLoop : *MLI) {
+    if (TopLoop->getParentLoop())
+      continue;
+    // Each outermost loop starts with both flags cleared.
+    bool L0Used = false, L1Used = false;
+    if (convertToHardwareLoop(TopLoop, L0Used, L1Used))
+      AnyConverted = true;
+  }
+
+  return AnyConverted;
+}
+
+/// Locate the induction register that controls the exit of loop \p L.
+///
+/// The PHIs at the top of the header are scanned for the pattern
+///     R      = phi ..., [ R.next, Latch ]
+///     R.next = R + bump
+/// where bump is a value accepted by checkForImmediate. Every match is
+/// recorded as R.next -> (R, bump). The compare that defines the predicate
+/// of the loop-control block's branch must then read exactly one recorded
+/// R.next.
+///
+/// On success \p Reg receives R, \p IVBump the bump, \p IVOp the
+/// instruction defining R.next, and true is returned. On failure the three
+/// outputs are not written and false is returned.
+bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
+                                                 unsigned &Reg,
+                                                 int64_t &IVBump,
+                                                 MachineInstr *&IVOp
+                                                 ) const {
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Preheader = MLI->findLoopPreheader(L, SpecPreheader);
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+  if (!Header || !Preheader || !Latch || !ExitingBlock)
+    return false;
+
+  // (R, bump): an induction register and the immediate added to it in each
+  // loop iteration.
+  using RegisterBump = std::pair<unsigned, int64_t>;
+  // R.next -> (R, bump), derived from an operation "R.next = R + bump".
+  using InductionMap = std::map<unsigned, RegisterBump>;
+
+  InductionMap IndMap;
+
+  using instr_iterator = MachineBasicBlock::instr_iterator;
+  for (instr_iterator It = Header->instr_begin(), End = Header->instr_end();
+       It != End && It->isPHI(); ++It) {
+    MachineInstr *Phi = &*It;
+
+    // PHI operands come in (value, block) pairs after the result. Pick the
+    // value arriving from the latch and check whether it is produced by an
+    // addition "reg+imm" whose "reg" is defined by this very PHI.
+    for (unsigned Op = 1, NumOps = Phi->getNumOperands(); Op < NumOps;
+         Op += 2) {
+      if (Phi->getOperand(Op + 1).getMBB() != Latch)
+        continue;
+
+      unsigned LatchValReg = Phi->getOperand(Op).getReg();
+      MachineInstr *BumpI = MRI->getVRegDef(LatchValReg);
+      if (!BumpI->getDesc().isAdd())
+        continue;
+
+      unsigned IndReg = BumpI->getOperand(1).getReg();
+      MachineOperand &BumpOp = BumpI->getOperand(2);
+      int64_t BumpV;
+      if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(BumpOp, BumpV)) {
+        unsigned UpdReg = BumpI->getOperand(0).getReg();
+        IndMap.insert(std::make_pair(UpdReg, RegisterBump(IndReg, BumpV)));
+      }
+    }
+  }
+
+  SmallVector<MachineOperand,2> Cond;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  if (TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false))
+    return false;
+
+  unsigned PredR, PredPos, PredRegFlags;
+  if (!TII->getPredReg(Cond, PredR, PredPos, PredRegFlags))
+    return false;
+
+  MachineInstr *PredI = MRI->getVRegDef(PredR);
+  if (!PredI->isCompare())
+    return false;
+
+  // Only a compare that cannot be analyzed is rejected here. The mask is
+  // not checked, since the individual compare opcodes (including A4_cmpb*)
+  // are handled later on.
+  unsigned CmpReg1 = 0, CmpReg2 = 0;
+  int CmpImm = 0, CmpMask = 0;
+  if (!TII->analyzeCompare(*PredI, CmpReg1, CmpReg2, CmpMask, CmpImm))
+    return false;
+
+  // Exactly one of the compare's input registers has to be a recorded
+  // R.next; both matching, or neither, is a failure.
+  InductionMap::iterator NotFound = IndMap.end();
+  InductionMap::iterator Hit1 =
+      CmpReg1 != 0 ? IndMap.find(CmpReg1) : NotFound;
+  InductionMap::iterator Hit2 =
+      CmpReg2 != 0 ? IndMap.find(CmpReg2) : NotFound;
+  bool Has1 = Hit1 != NotFound;
+  bool Has2 = Hit2 != NotFound;
+  if (Has1 == Has2)
+    return false;
+  InductionMap::iterator Found = Has1 ? Hit1 : Hit2;
+
+  Reg = Found->second.first;
+  IVBump = Found->second.second;
+  IVOp = MRI->getVRegDef(Found->first);
+  return true;
+}
+
+// Map a compare opcode to the Comparison::Kind it tests. Opcodes that are
+// not listed map to 0 (no kind). Only CondOpc is consulted; the remaining
+// parameters are not read.
+HexagonHardwareLoops::Comparison::Kind
+HexagonHardwareLoops::getComparisonKind(unsigned CondOpc,
+                                        MachineOperand *InitialValue,
+                                        const MachineOperand *EndValue,
+                                        int64_t IVBump) const {
+  switch (CondOpc) {
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpeqp:
+    return Comparison::EQ;
+
+  case Hexagon::C4_cmpneq:
+  case Hexagon::C4_cmpneqi:
+    return Comparison::NE;
+
+  case Hexagon::C4_cmplte:
+    return Comparison::LEs;
+
+  case Hexagon::C4_cmplteu:
+    return Comparison::LEu;
+
+  case Hexagon::C2_cmpgtui:
+  case Hexagon::C2_cmpgtu:
+  case Hexagon::C2_cmpgtup:
+    return Comparison::GTu;
+
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtp:
+    return Comparison::GTs;
+
+  default:
+    break;
+  }
+  // Not a compare opcode handled above.
+  return (Comparison::Kind)0;
+}
+
+/// \brief Analyze the statements in a loop to determine if the loop has
+/// a computable trip count and, if so, return a value that represents
+/// the trip count expression.
+///
+/// This function iterates over the phi nodes in the loop to check for
+/// induction variable patterns that are used in the calculation for
+/// the number of times the loop is executed. Returns nullptr when no trip count can be determined.
+CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
+ SmallVectorImpl<MachineInstr *> &OldInsts) {
+ MachineBasicBlock *TopMBB = L->getTopBlock();
+ MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
+ assert(PI != TopMBB->pred_end() &&
+ "Loop must have more than one incoming edge!");
+ MachineBasicBlock *Backedge = *PI++;
+ if (PI == TopMBB->pred_end()) // dead loop?
+ return nullptr;
+ MachineBasicBlock *Incoming = *PI++;
+ if (PI != TopMBB->pred_end()) // multiple backedges?
+ return nullptr;
+
+ // Make sure there is one incoming and one backedge and determine which
+ // is which.
+ if (L->contains(Incoming)) {
+ if (L->contains(Backedge))
+ return nullptr;
+ std::swap(Incoming, Backedge);
+ } else if (!L->contains(Backedge))
+ return nullptr;
+
+ // Look for the cmp instruction to determine if we can get a useful trip
+ // count. The trip count can be either a register or an immediate. The
+ // location of the value depends upon the type (reg or imm).
+ MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+ if (!ExitingBlock)
+ return nullptr;
+
+ unsigned IVReg = 0;
+ int64_t IVBump = 0;
+ MachineInstr *IVOp; // Assigned by findInductionRegister when it returns true.
+ bool FoundIV = findInductionRegister(L, IVReg, IVBump, IVOp);
+ if (!FoundIV)
+ return nullptr;
+
+ MachineBasicBlock *Preheader = MLI->findLoopPreheader(L, SpecPreheader);
+
+ MachineOperand *InitialValue = nullptr;
+ MachineInstr *IV_Phi = MRI->getVRegDef(IVReg);
+ MachineBasicBlock *Latch = L->getLoopLatch();
+ for (unsigned i = 1, n = IV_Phi->getNumOperands(); i < n; i += 2) {
+ MachineBasicBlock *MBB = IV_Phi->getOperand(i+1).getMBB();
+ if (MBB == Preheader)
+ InitialValue = &IV_Phi->getOperand(i);
+ else if (MBB == Latch)
+ IVReg = IV_Phi->getOperand(i).getReg(); // Want IV reg after bump.
+ }
+ if (!InitialValue)
+ return nullptr;
+
+ SmallVector<MachineOperand,2> Cond;
+ MachineBasicBlock *TB = nullptr, *FB = nullptr;
+ bool NotAnalyzed = TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false);
+ if (NotAnalyzed)
+ return nullptr;
+
+ MachineBasicBlock *Header = L->getHeader();
+ // TB must be non-null. If FB is also non-null, one of them must be
+ // the header. Otherwise, branch to TB could be exiting the loop, and
+ // the fall through can go to the header.
+ assert (TB && "Exit block without a branch?");
+ if (ExitingBlock != Latch && (TB == Latch || FB == Latch)) { // The exiting block branches to the latch: look through the latch's own branch.
+ MachineBasicBlock *LTB = nullptr, *LFB = nullptr;
+ SmallVector<MachineOperand,2> LCond;
+ bool NotAnalyzed = TII->analyzeBranch(*Latch, LTB, LFB, LCond, false);
+ if (NotAnalyzed)
+ return nullptr;
+ if (TB == Latch)
+ TB = (LTB == Header) ? LTB : LFB;
+ else
+ FB = (LTB == Header) ? LTB: LFB;
+ }
+ assert ((!FB || TB == Header || FB == Header) && "Branches not to header?");
+ if (!TB || (FB && TB != Header && FB != Header))
+ return nullptr;
+
+ // Branches of form "if (!P) ..." cause HexagonInstrInfo::analyzeBranch
+ // to put imm(0), followed by P in the vector Cond.
+ // If TB is not the header, it means that the "not-taken" path must lead
+ // to the header.
+ bool Negated = TII->predOpcodeHasNot(Cond) ^ (TB != Header);
+ unsigned PredReg, PredPos, PredRegFlags;
+ if (!TII->getPredReg(Cond, PredReg, PredPos, PredRegFlags))
+ return nullptr;
+ MachineInstr *CondI = MRI->getVRegDef(PredReg);
+ unsigned CondOpc = CondI->getOpcode();
+
+ unsigned CmpReg1 = 0, CmpReg2 = 0;
+ int Mask = 0, ImmValue = 0;
+ bool AnalyzedCmp =
+ TII->analyzeCompare(*CondI, CmpReg1, CmpReg2, Mask, ImmValue);
+ if (!AnalyzedCmp)
+ return nullptr;
+
+ // The comparison operator type determines how we compute the loop
+ // trip count. The compare and the IV bump instruction are recorded in OldInsts.
+ OldInsts.push_back(CondI);
+ OldInsts.push_back(IVOp);
+
+ // Sadly, the following code gets information based on the position
+ // of the operands in the compare instruction. This has to be done
+ // this way, because the comparisons check for a specific relationship
+ // between the operands (e.g. is-less-than), rather than to find out
+ // what relationship the operands are in (as on PPC).
+ Comparison::Kind Cmp;
+ bool isSwapped = false;
+ const MachineOperand &Op1 = CondI->getOperand(1);
+ const MachineOperand &Op2 = CondI->getOperand(2);
+ const MachineOperand *EndValue = nullptr;
+
+ if (Op1.isReg()) {
+ if (Op2.isImm() || Op1.getReg() == IVReg)
+ EndValue = &Op2;
+ else {
+ EndValue = &Op1;
+ isSwapped = true;
+ }
+ }
+
+ if (!EndValue)
+ return nullptr;
+
+ Cmp = getComparisonKind(CondOpc, InitialValue, EndValue, IVBump);
+ if (!Cmp)
+ return nullptr;
+ if (Negated)
+ Cmp = Comparison::getNegatedComparison(Cmp);
+ if (isSwapped)
+ Cmp = Comparison::getSwappedComparison(Cmp);
+
+ if (InitialValue->isReg()) {
+ unsigned R = InitialValue->getReg();
+ MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
+ if (!MDT->properlyDominates(DefBB, Header)) // The defining block must properly dominate the header.
+ return nullptr;
+ OldInsts.push_back(MRI->getVRegDef(R));
+ }
+ if (EndValue->isReg()) {
+ unsigned R = EndValue->getReg();
+ MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
+ if (!MDT->properlyDominates(DefBB, Header)) // Same requirement as for the initial value.
+ return nullptr;
+ OldInsts.push_back(MRI->getVRegDef(R));
+ }
+
+ return computeCount(L, InitialValue, EndValue, IVReg, IVBump, Cmp);
+}
+
+/// \brief Helper function that returns the expression that represents the
+/// number of times a loop iterates. The function takes the operands that
+/// represent the loop start value, loop end value, and induction value.
+/// Based upon these operands, the function attempts to compute the trip count.
+CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
+ const MachineOperand *Start,
+ const MachineOperand *End,
+ unsigned IVReg,
+ int64_t IVBump,
+ Comparison::Kind Cmp) const {
+ // Cannot handle comparison EQ, i.e. while (A == B).
+ if (Cmp == Comparison::EQ)
+ return nullptr;
+
+ // Check if either the start or end values are an assignment of an immediate.
+ // If so, use the immediate value rather than the register.
+ if (Start->isReg()) {
+ const MachineInstr *StartValInstr = MRI->getVRegDef(Start->getReg());
+ if (StartValInstr && (StartValInstr->getOpcode() == Hexagon::A2_tfrsi ||
+ StartValInstr->getOpcode() == Hexagon::A2_tfrpi))
+ Start = &StartValInstr->getOperand(1);
+ }
+ if (End->isReg()) {
+ const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg());
+ if (EndValInstr && (EndValInstr->getOpcode() == Hexagon::A2_tfrsi ||
+ EndValInstr->getOpcode() == Hexagon::A2_tfrpi))
+ End = &EndValInstr->getOperand(1);
+ }
+
+ if (!Start->isReg() && !Start->isImm())
+ return nullptr;
+ if (!End->isReg() && !End->isImm())
+ return nullptr;
+
+ bool CmpLess = Cmp & Comparison::L;
+ bool CmpGreater = Cmp & Comparison::G;
+ bool CmpHasEqual = Cmp & Comparison::EQ;
+
+ // Avoid certain wrap-arounds. This doesn't detect all wrap-arounds.
+ if (CmpLess && IVBump < 0)
+ // Loop going while iv is "less" with the iv value going down. Must wrap.
+ return nullptr;
+
+ if (CmpGreater && IVBump > 0)
+ // Loop going while iv is "greater" with the iv value going up. Must wrap.
+ return nullptr;
+
+ // Phis that may feed into the loop.
+ LoopFeederMap LoopFeederPhi;
+
+ // Check if the initial value may be zero and can be decremented in the first
+ // iteration. If the value is zero, the endloop instruction will not decrement
+ // the loop counter, so we shouldn't generate a hardware loop in this case.
+ if (loopCountMayWrapOrUnderFlow(Start, End, Loop->getLoopPreheader(), Loop,
+ LoopFeederPhi))
+ return nullptr;
+
+ if (Start->isImm() && End->isImm()) {
+ // Both, start and end are immediates.
+ int64_t StartV = Start->getImm();
+ int64_t EndV = End->getImm();
+ int64_t Dist = EndV - StartV;
+ if (Dist == 0)
+ return nullptr;
+
+ bool Exact = (Dist % IVBump) == 0;
+
+ if (Cmp == Comparison::NE) {
+ if (!Exact)
+ return nullptr;
+ if ((Dist < 0) ^ (IVBump < 0))
+ return nullptr;
+ }
+
+ // For comparisons that include the final value (i.e. include equality
+ // with the final value), we need to increase the distance by 1.
+ if (CmpHasEqual)
+ Dist = Dist > 0 ? Dist+1 : Dist-1;
+
+ // For the loop to iterate, CmpLess should imply Dist > 0. Similarly,
+ // CmpGreater should imply Dist < 0. These conditions could actually
+ // fail, for example, in unreachable code (which may still appear to be
+ // reachable in the CFG).
+ if ((CmpLess && Dist < 0) || (CmpGreater && Dist > 0))
+ return nullptr;
+
+ // "Normalized" distance, i.e. with the bump set to +-1.
+ int64_t Dist1 = (IVBump > 0) ? (Dist + (IVBump - 1)) / IVBump
+ : (-Dist + (-IVBump - 1)) / (-IVBump);
+ assert (Dist1 > 0 && "Fishy thing. Both operands have the same sign.");
+
+ uint64_t Count = Dist1;
+
+ if (Count > 0xFFFFFFFFULL)
+ return nullptr;
+
+ return new CountValue(CountValue::CV_Immediate, Count);
+ }
+
+ // A general case: Start and End are some values, but the actual
+ // iteration count may not be available. If it is not, insert
+ // a computation of it into the preheader.
+
+ // If the induction variable bump is not a power of 2, quit.
+ // Otherwise we'd need a general integer division.
+ if (!isPowerOf2_64(std::abs(IVBump)))
+ return nullptr;
+
+ MachineBasicBlock *PH = MLI->findLoopPreheader(Loop, SpecPreheader);
+ assert (PH && "Should have a preheader by now");
+ MachineBasicBlock::iterator InsertPos = PH->getFirstTerminator();
+ DebugLoc DL;
+ if (InsertPos != PH->end())
+ DL = InsertPos->getDebugLoc();
+
+ // If Start is an immediate and End is a register, the trip count
+ // will be "reg - imm". Hexagon's "subtract immediate" instruction
+ // is actually "reg + -imm".
+
+ // If the loop IV is going downwards, i.e. if the bump is negative,
+ // then the iteration count (computed as End-Start) will need to be
+ // negated. To avoid the negation, just swap Start and End.
+ if (IVBump < 0) {
+ std::swap(Start, End);
+ IVBump = -IVBump;
+ }
+ // Cmp may now have a wrong direction, e.g. LEs may now be GEs.
+ // Signedness, and "including equality" are preserved.
+
+ bool RegToImm = Start->isReg() && End->isImm(); // for (reg..imm)
+ bool RegToReg = Start->isReg() && End->isReg(); // for (reg..reg)
+
+ int64_t StartV = 0, EndV = 0;
+ if (Start->isImm())
+ StartV = Start->getImm();
+ if (End->isImm())
+ EndV = End->getImm();
+
+ int64_t AdjV = 0;
+ // To compute the iteration count, we would need this computation:
+ // Count = (End - Start + (IVBump-1)) / IVBump
+ // or, when CmpHasEqual:
+ // Count = (End - Start + (IVBump-1)+1) / IVBump
+ // The "IVBump-1" part is the adjustment (AdjV). We can avoid
+ // generating an instruction specifically to add it if we can adjust
+ // the immediate values for Start or End.
+
+ if (CmpHasEqual) {
+ // Need to add 1 to the total iteration count.
+ if (Start->isImm())
+ StartV--;
+ else if (End->isImm())
+ EndV++;
+ else
+ AdjV += 1;
+ }
+
+ if (Cmp != Comparison::NE) {
+ if (Start->isImm())
+ StartV -= (IVBump-1);
+ else if (End->isImm())
+ EndV += (IVBump-1);
+ else
+ AdjV += (IVBump-1);
+ }
+
+ unsigned R = 0, SR = 0;
+ if (Start->isReg()) {
+ R = Start->getReg();
+ SR = Start->getSubReg();
+ } else {
+ R = End->getReg();
+ SR = End->getSubReg();
+ }
+ const TargetRegisterClass *RC = MRI->getRegClass(R);
+ // Hardware loops cannot handle 64-bit registers. If it's a double
+ // register, it has to have a subregister.
+ if (!SR && RC == &Hexagon::DoubleRegsRegClass)
+ return nullptr;
+ const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
+
+ // Compute DistR (register with the distance between Start and End).
+ unsigned DistR, DistSR;
+
+ // Avoid special case, where the start value is an imm(0).
+ if (Start->isImm() && StartV == 0) {
+ DistR = End->getReg();
+ DistSR = End->getSubReg();
+ } else {
+ const MCInstrDesc &SubD = RegToReg ? TII->get(Hexagon::A2_sub) :
+ (RegToImm ? TII->get(Hexagon::A2_subri) :
+ TII->get(Hexagon::A2_addi));
+ if (RegToReg || RegToImm) {
+ unsigned SubR = MRI->createVirtualRegister(IntRC);
+ MachineInstrBuilder SubIB =
+ BuildMI(*PH, InsertPos, DL, SubD, SubR);
+
+ if (RegToReg)
+ SubIB.addReg(End->getReg(), 0, End->getSubReg())
+ .addReg(Start->getReg(), 0, Start->getSubReg());
+ else
+ SubIB.addImm(EndV)
+ .addReg(Start->getReg(), 0, Start->getSubReg());
+ DistR = SubR;
+ } else {
+ // If the loop has been unrolled, we should use the original loop count
+ // instead of recalculating the value. This will avoid additional
+ // 'Add' instruction.
+ const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg());
+ if (EndValInstr->getOpcode() == Hexagon::A2_addi &&
+ EndValInstr->getOperand(2).getImm() == StartV) {
+ DistR = EndValInstr->getOperand(1).getReg();
+ } else {
+ unsigned SubR = MRI->createVirtualRegister(IntRC);
+ MachineInstrBuilder SubIB =
+ BuildMI(*PH, InsertPos, DL, SubD, SubR);
+ SubIB.addReg(End->getReg(), 0, End->getSubReg())
+ .addImm(-StartV);
+ DistR = SubR;
+ }
+ }
+ DistSR = 0;
+ }
+
+ // From DistR, compute AdjR (register with the adjusted distance).
+ unsigned AdjR, AdjSR;
+
+ if (AdjV == 0) {
+ AdjR = DistR;
+ AdjSR = DistSR;
+ } else {
+ // Generate CountR = ADD DistR, AdjVal
+ unsigned AddR = MRI->createVirtualRegister(IntRC);
+ MCInstrDesc const &AddD = TII->get(Hexagon::A2_addi);
+ BuildMI(*PH, InsertPos, DL, AddD, AddR)
+ .addReg(DistR, 0, DistSR)
+ .addImm(AdjV);
+
+ AdjR = AddR;
+ AdjSR = 0;
+ }
+
+ // From AdjR, compute CountR (register with the final count).
+ unsigned CountR, CountSR;
+
+ if (IVBump == 1) {
+ CountR = AdjR;
+ CountSR = AdjSR;
+ } else {
+ // The IV bump is a power of two. Log_2(IV bump) is the shift amount.
+ unsigned Shift = Log2_32(IVBump);
+
+ // Generate NormR = LSR DistR, Shift.
+ unsigned LsrR = MRI->createVirtualRegister(IntRC);
+ const MCInstrDesc &LsrD = TII->get(Hexagon::S2_lsr_i_r);
+ BuildMI(*PH, InsertPos, DL, LsrD, LsrR)
+ .addReg(AdjR, 0, AdjSR)
+ .addImm(Shift);
+
+ CountR = LsrR;
+ CountSR = 0;
+ }
+
+ return new CountValue(CountValue::CV_Register, CountR, CountSR);
+}
+
+/// \brief Tell whether \p MI may not appear inside a hardware loop.
+///
+/// \param MI            Instruction to inspect.
+/// \param IsInnerHWLoop True when the loop being formed would use the inner
+///                      hardware loop (loop0); false when it would use loop1.
+/// \returns true if \p MI is a call that can return, or if it defines one of
+///          the hardware loop registers that must stay untouched.
+bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
+                                                  bool IsInnerHWLoop) const {
+  // A callee may itself use a hardware loop, so calls are rejected. The only
+  // exception is a call that never returns.
+  if (MI->getDesc().isCall()) {
+    if (TII->doesNotReturn(*MI))
+      return false;
+    return true;
+  }
+
+  // Look for definitions of the hardware loop registers.
+  for (const MachineOperand &Op : MI->operands()) {
+    if (!Op.isReg() || !Op.isDef())
+      continue;
+
+    const unsigned DefReg = Op.getReg();
+    const bool DefsLoop0 = DefReg == Hexagon::LC0 || DefReg == Hexagon::SA0;
+    const bool DefsLoop1 = DefReg == Hexagon::LC1 || DefReg == Hexagon::SA1;
+
+    // Writing LC1/SA1 is never allowed; writing LC0/SA0 is only a problem
+    // when the inner hardware loop is the one being generated.
+    if (DefsLoop1 || (IsInnerHWLoop && DefsLoop0))
+      return true;
+  }
+  return false;
+}
+
+/// \brief Scan every instruction of every block in \p L and report whether
+/// any of them inhibits the use of the hardware loop instruction.
+///
+/// \param L             Loop whose blocks are scanned.
+/// \param IsInnerHWLoop Forwarded to isInvalidLoopOperation.
+/// \returns true as soon as one offending instruction is found.
+bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L,
+                                                      bool IsInnerHWLoop) const {
+  const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
+  DEBUG(dbgs() << "\nhw_loop head, BB#" << Blocks[0]->getNumber(););
+  for (MachineBasicBlock *LoopBB : Blocks) {
+    for (const MachineInstr &Inst : *LoopBB) {
+      if (!isInvalidLoopOperation(&Inst, IsInnerHWLoop))
+        continue;
+      DEBUG(dbgs()<< "\nCannot convert to hw_loop due to:"; Inst.dump(););
+      return true;
+    }
+  }
+  return false;
+}
+
+/// \brief Decide whether \p MI is dead. Modeled on
+/// DeadMachineInstructionElim::isDead, minus the special handling of inline
+/// asm, physical registers and instructions with side effects.
+///
+/// \param MI       Instruction to examine.
+/// \param DeadPhis Receives every PHI whose only purpose was to feed \p MI
+///                 back into itself; those become removable with \p MI.
+///                 PHIs may have been appended even when false is returned.
+/// \returns true if no register defined by \p MI has a meaningful user.
+bool HexagonHardwareLoops::isDead(const MachineInstr *MI,
+                              SmallVectorImpl<MachineInstr *> &DeadPhis) const {
+  for (const MachineOperand &DefMO : MI->operands()) {
+    if (!DefMO.isReg() || !DefMO.isDef())
+      continue;
+
+    const unsigned DefReg = DefMO.getReg();
+    // A definition without non-debug users cannot keep MI alive.
+    if (MRI->use_nodbg_empty(DefReg))
+      continue;
+
+    // The definition is used. The only tolerated shape is a single use, and
+    // that use must sit in a PHI whose own result is consumed by MI alone;
+    // in that case MI and the PHI only keep each other alive.
+    if (!MRI->hasOneNonDBGUse(DefReg))
+      return false;
+    MachineInstr *OnePhi = MRI->use_nodbg_begin(DefReg)->getParent();
+    if (!OnePhi->isPHI())
+      return false;
+
+    for (const MachineOperand &PhiMO : OnePhi->operands()) {
+      if (!PhiMO.isReg() || !PhiMO.isDef())
+        continue;
+
+      // Any user of the PHI other than MI makes the PHI (and MI) live.
+      for (MachineOperand &PhiUse : MRI->use_nodbg_operands(PhiMO.getReg()))
+        if (PhiUse.getParent() != MI)
+          return false;
+    }
+    DeadPhis.push_back(OnePhi);
+  }
+
+  // No definition of MI has a user that matters.
+  return true;
+}
+
+/// Erase \p MI, together with any PHIs that only served it, when isDead
+/// reports it as dead. The procedure follows DeadMachineInstructionElim.
+void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
+  SmallVector<MachineInstr*, 1> DeadPhis;
+  if (!isDead(MI, DeadPhis))
+    return;
+
+  DEBUG(dbgs() << "HW looping will remove: " << *MI);
+
+  // DBG_VALUE instructions may still refer to registers defined by MI.
+  // Such references are turned into undef; the DBG_VALUE itself stays.
+  for (const MachineOperand &DefMO : MI->operands()) {
+    if (!DefMO.isReg() || !DefMO.isDef())
+      continue;
+
+    const unsigned DefReg = DefMO.getReg();
+    MachineRegisterInfo::use_iterator UI = MRI->use_begin(DefReg);
+    const MachineRegisterInfo::use_iterator UE = MRI->use_end();
+    while (UI != UE) {
+      MachineOperand &Use = *UI;
+      // Step forward first: setReg below invalidates the current position.
+      ++UI;
+      MachineInstr *UseMI = Use.getParent();
+      if (UseMI == MI)
+        continue;
+      if (Use.isDebug())
+        UseMI->getOperand(0).setReg(0U);
+    }
+  }
+
+  MI->eraseFromParent();
+  for (MachineInstr *Phi : DeadPhis)
+    Phi->eraseFromParent();
+}
+
+/// \brief Check if the loop is a candidate for converting to a hardware
+/// loop. If so, then perform the transformation.
+///
+/// This function works on innermost loops first. A loop can be converted
+/// if it is a counting loop; either a register value or an immediate.
+///
+/// The code makes several assumptions about the representation of the loop
+/// in llvm.
+///
+/// \param L         Loop to convert; its subloops are processed first.
+/// \param RecL0used In/out flag, set to true when a loop is converted using
+///                  the loop0 instructions.
+/// \param RecL1used In/out flag, set to true when a loop is converted using
+///                  the loop1 instructions.
+/// \returns true if \p L itself was converted. When nested loops already
+///          occupy both hardware loops, the accumulated result of the nested
+///          conversions is returned instead.
+bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L,
+                                                 bool &RecL0used,
+                                                 bool &RecL1used) {
+  // This is just for sanity.
+  assert(L->getHeader() && "Loop without a header?");
+
+  bool Changed = false;
+  bool L0Used = false;
+  bool L1Used = false;
+
+  // Process nested loops first.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+    Changed |= convertToHardwareLoop(*I, RecL0used, RecL1used);
+    L0Used |= RecL0used;
+    L1Used |= RecL1used;
+  }
+
+  // If nested loops already use both hardware loops (loop0 and loop1), there
+  // is none left for this loop.
+  if (Changed && L0Used && L1Used)
+    return Changed;
+
+  unsigned LOOP_i;
+  unsigned LOOP_r;
+  unsigned ENDLOOP;
+
+  // Flag used to track loopN instruction:
+  // true  - Hardware loop is being generated for the inner most loop.
+  // false - Hardware loop is being generated for the outer loop.
+  bool IsInnerHWLoop = true;
+
+  if (L0Used) {
+    LOOP_i = Hexagon::J2_loop1i;
+    LOOP_r = Hexagon::J2_loop1r;
+    ENDLOOP = Hexagon::ENDLOOP1;
+    IsInnerHWLoop = false;
+  } else {
+    LOOP_i = Hexagon::J2_loop0i;
+    LOOP_r = Hexagon::J2_loop0r;
+    ENDLOOP = Hexagon::ENDLOOP0;
+  }
+
+#ifndef NDEBUG
+  // Stop trying after reaching the limit (if any).
+  int Limit = HWLoopLimit;
+  if (Limit >= 0) {
+    if (Counter >= HWLoopLimit)
+      return false;
+    Counter++;
+  }
+#endif
+
+  // Does the loop contain any invalid instructions?
+  if (containsInvalidInstruction(L, IsInnerHWLoop))
+    return false;
+
+  MachineBasicBlock *LastMBB = L->findLoopControlBlock();
+  // Don't generate hw loop if the loop has more than one exit.
+  if (!LastMBB)
+    return false;
+
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+  if (LastI == LastMBB->end())
+    return false;
+
+  // Is the induction variable bump feeding the latch condition?
+  if (!fixupInductionVariable(L))
+    return false;
+
+  // Ensure the loop has a preheader: the loop instruction will be
+  // placed there.
+  MachineBasicBlock *Preheader = MLI->findLoopPreheader(L, SpecPreheader);
+  if (!Preheader) {
+    Preheader = createPreheaderForLoop(L);
+    if (!Preheader)
+      return false;
+  }
+
+  MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+  SmallVector<MachineInstr*, 2> OldInsts;
+  // Are we able to determine the trip count for the loop?
+  // The returned CountValue is heap-allocated and owned by this function;
+  // every exit path below has to release it.
+  CountValue *TripCount = getLoopTripCount(L, OldInsts);
+  if (!TripCount)
+    return false;
+
+  // Is the trip count available in the preheader?
+  if (TripCount->isReg()) {
+    // There will be a use of the register inserted into the preheader,
+    // so make sure that the register is actually defined at that point.
+    MachineInstr *TCDef = MRI->getVRegDef(TripCount->getReg());
+    MachineBasicBlock *BBDef = TCDef->getParent();
+    if (!MDT->dominates(BBDef, Preheader)) {
+      delete TripCount;
+      return false;
+    }
+  }
+
+  // Determine the loop start.
+  MachineBasicBlock *TopBlock = L->getTopBlock();
+  MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+  MachineBasicBlock *LoopStart = nullptr;
+  if (ExitingBlock != L->getLoopLatch()) {
+    MachineBasicBlock *TB = nullptr, *FB = nullptr;
+    SmallVector<MachineOperand, 2> Cond;
+
+    if (TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false)) {
+      delete TripCount;
+      return false;
+    }
+
+    if (L->contains(TB))
+      LoopStart = TB;
+    else if (L->contains(FB))
+      LoopStart = FB;
+    else {
+      delete TripCount;
+      return false;
+    }
+  }
+  else
+    LoopStart = TopBlock;
+
+  // Convert the loop to a hardware loop.
+  DEBUG(dbgs() << "Change to hardware loop at "; L->dump());
+  DebugLoc DL;
+  if (InsertPos != Preheader->end())
+    DL = InsertPos->getDebugLoc();
+
+  if (TripCount->isReg()) {
+    // Create a copy of the loop count register.
+    unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+    BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg)
+      .addReg(TripCount->getReg(), 0, TripCount->getSubReg());
+    // Add the Loop instruction to the beginning of the loop.
+    BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_r)).addMBB(LoopStart)
+      .addReg(CountReg);
+  } else {
+    assert(TripCount->isImm() && "Expecting immediate value for trip count");
+    // Add the Loop immediate instruction to the beginning of the loop,
+    // if the immediate fits in the instructions.  Otherwise, we need to
+    // create a new virtual register.
+    int64_t CountImm = TripCount->getImm();
+    if (!TII->isValidOffset(LOOP_i, CountImm)) {
+      unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::A2_tfrsi), CountReg)
+        .addImm(CountImm);
+      BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_r))
+        .addMBB(LoopStart).addReg(CountReg);
+    } else
+      BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_i))
+        .addMBB(LoopStart).addImm(CountImm);
+  }
+
+  // Make sure the loop start always has a reference in the CFG.  We need
+  // to create a BlockAddress operand to get this mechanism to work both the
+  // MachineBasicBlock and BasicBlock objects need the flag set.
+  LoopStart->setHasAddressTaken();
+  // This line is needed to set the hasAddressTaken flag on the BasicBlock
+  // object.
+  BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
+
+  // Replace the loop branch with an endloop instruction.
+  DebugLoc LastIDL = LastI->getDebugLoc();
+  BuildMI(*LastMBB, LastI, LastIDL, TII->get(ENDLOOP)).addMBB(LoopStart);
+
+  // The loop ends with either:
+  //  - a conditional branch followed by an unconditional branch, or
+  //  - a conditional branch to the loop start.
+  if (LastI->getOpcode() == Hexagon::J2_jumpt ||
+      LastI->getOpcode() == Hexagon::J2_jumpf) {
+    // Delete one and change/add an uncond. branch to out of the loop.
+    MachineBasicBlock *BranchTarget = LastI->getOperand(1).getMBB();
+    LastI = LastMBB->erase(LastI);
+    if (!L->contains(BranchTarget)) {
+      if (LastI != LastMBB->end())
+        LastI = LastMBB->erase(LastI);
+      SmallVector<MachineOperand, 0> Cond;
+      TII->insertBranch(*LastMBB, BranchTarget, nullptr, Cond, LastIDL);
+    }
+  } else {
+    // Conditional branch to loop start; just delete it.
+    LastMBB->erase(LastI);
+  }
+  delete TripCount;
+
+  // The induction operation and the comparison may now be
+  // unneeded. If these are unneeded, then remove them.
+  for (unsigned i = 0; i < OldInsts.size(); ++i)
+    removeIfDead(OldInsts[i]);
+
+  ++NumHWLoops;
+
+  // Set RecL1used and RecL0used only after hardware loop has been
+  // successfully generated. Doing it earlier can cause wrong loop instruction
+  // to be used.
+  if (L0Used) // Loop0 was already used. So, the correct loop must be loop1.
+    RecL1used = true;
+  else
+    RecL0used = true;
+
+  return true;
+}
+
+/// Make sure the compare \p CmpI is placed after the induction variable bump
+/// \p BumpI inside their common basic block, moving the compare if needed.
+///
+/// \returns true if the compare already follows the bump or was moved right
+///          behind it; false if the two instructions are in different blocks
+///          or if the predicate defined by the compare is used between the
+///          compare and the bump (moving it would then be unsafe).
+bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI,
+                                            MachineInstr *CmpI) {
+  assert (BumpI != CmpI && "Bump and compare in the same instruction?");
+
+  MachineBasicBlock *Parent = BumpI->getParent();
+  if (Parent != CmpI->getParent())
+    return false;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  const instr_iterator BlockEnd = Parent->instr_end();
+
+  // Nothing to do when the compare is found at or after the bump.
+  for (instr_iterator It = BumpI->getIterator(); It != BlockEnd; ++It)
+    if (&*It == CmpI)
+      return true;
+
+  // The compare comes first. Walk from the instruction after the compare up
+  // to the bump and make sure the predicate register is not read on the way.
+  const unsigned PredR = CmpI->getOperand(0).getReg();
+  bool FoundBump = false;
+  for (instr_iterator It = std::next(CmpI->getIterator()); It != BlockEnd;
+       ++It) {
+    MachineInstr *Cur = &*It;
+    for (const MachineOperand &Op : Cur->operands())
+      if (Op.isReg() && Op.isUse() && Op.getReg() == PredR)
+        return false; // Found an intervening use of PredR.
+
+    if (Cur != BumpI)
+      continue;
+
+    // Reached the bump: relocate the compare to the slot right after it.
+    Parent->splice(std::next(BumpI->getIterator()), Parent,
+                   CmpI->getIterator());
+    FoundBump = true;
+    break;
+  }
+  assert (FoundBump && "Cannot determine instruction order");
+  return FoundBump;
+}
+
+/// Guard against endless recursion while following PHI operands. Every
+/// register is recorded in \p LoopFeederPhi the first time it is accepted,
+/// so a later attempt to visit the same register is reported as "not a
+/// feeder" and the traversal stops there.
+///
+/// \returns true if the register of \p MO has not been visited yet and
+///          block \p A is not one of the blocks of loop \p L; false
+///          otherwise. On success the register and its defining instruction
+///          are added to \p LoopFeederPhi.
+bool HexagonHardwareLoops::isLoopFeeder(MachineLoop *L, MachineBasicBlock *A,
+                                        MachineInstr *MI,
+                                        const MachineOperand *MO,
+                                        LoopFeederMap &LoopFeederPhi) const {
+  const unsigned FeedReg = MO->getReg();
+
+  // Already visited node.
+  if (LoopFeederPhi.find(FeedReg) != LoopFeederPhi.end())
+    return false;
+
+  const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
+  DEBUG(dbgs() << "\nhw_loop head, BB#" << Blocks[0]->getNumber(););
+  // Ignore all BBs that form Loop.
+  for (MachineBasicBlock *LoopBB : Blocks)
+    if (LoopBB == A)
+      return false;
+
+  MachineInstr *FeedDef = MRI->getVRegDef(FeedReg);
+  LoopFeederPhi.insert(std::make_pair(FeedReg, FeedDef));
+  return true;
+}
+
+/// Report whether any incoming value of \p Phi may produce a value that can
+/// underflow. Each incoming value accepted by isLoopFeeder is handed to
+/// loopCountMayWrapOrUnderFlow; the first one flagged there makes the whole
+/// PHI count as unsafe.
+bool HexagonHardwareLoops::phiMayWrapOrUnderflow(
+    MachineInstr *Phi, const MachineOperand *EndVal, MachineBasicBlock *MBB,
+    MachineLoop *L, LoopFeederMap &LoopFeederPhi) const {
+  assert(Phi->isPHI() && "Expecting a Phi.");
+  // Incoming values sit at the odd operand indices. Values rejected by
+  // isLoopFeeder (already visited, or coming from inside the loop) are
+  // skipped, which also keeps recursive PHIs from looping forever.
+  const int NumOps = Phi->getNumOperands();
+  for (int Idx = 1; Idx < NumOps; Idx += 2) {
+    const MachineOperand *Incoming = &(Phi->getOperand(Idx));
+    if (isLoopFeeder(L, MBB, Phi, Incoming, LoopFeederPhi) &&
+        loopCountMayWrapOrUnderFlow(Incoming, EndVal, Phi->getParent(), L,
+                                    LoopFeederPhi))
+      return true;
+  }
+  return false;
+}
+
+/// Return true if the induction variable can underflow in the first iteration.
+/// An example is an initial unsigned value that is 0 and is decremented in the
+/// first iteration of a do-while loop. In this case, we cannot generate a
+/// hardware loop because the endloop instruction does not decrement the loop
+/// counter if it is <= 1. We only need to perform this analysis if the
+/// initial value is a register and the end value is an immediate.
+///
+/// This function assumes the initial value may underflow unless proven
+/// otherwise. If the type is signed, then we don't care because signed
+/// underflow is undefined. We attempt to prove the initial value is not
+/// zero by performing a crude analysis of the loop counter. This function
+/// checks if the initial value is used in any comparison prior to the loop
+/// and, if so, assumes the comparison is a range check. This is inexact,
+/// but will catch the simple cases.
+bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow(
+ const MachineOperand *InitVal, const MachineOperand *EndVal,
+ MachineBasicBlock *MBB, MachineLoop *L,
+ LoopFeederMap &LoopFeederPhi) const {
+ // Only register start values need checking; an immediate start is known.
+ if (!InitVal->isReg())
+ return false;
+
+ if (!EndVal->isImm()) // Only an immediate end value is analyzed here.
+ return false;
+
+ // A register that is assigned an immediate has a known value: report a
+ // possible wrap only when that value is equal to the end immediate.
+ int64_t Imm;
+ if (checkForImmediate(*InitVal, Imm))
+ return (EndVal->getImm() == Imm);
+
+ unsigned Reg = InitVal->getReg();
+
+ // We don't know the value of a physical register, so be pessimistic.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return true;
+
+ MachineInstr *Def = MRI->getVRegDef(Reg);
+ if (!Def) // No defining instruction to reason about: be pessimistic.
+ return true;
+
+ // If the initial value is a Phi or copy and none of its operands may
+ // underflow, then the defined value cannot underflow either.
+ if (Def->isPHI() && !phiMayWrapOrUnderflow(Def, EndVal, Def->getParent(),
+ L, LoopFeederPhi))
+ return false;
+ if (Def->isCopy() && !loopCountMayWrapOrUnderFlow(&(Def->getOperand(1)),
+ EndVal, Def->getParent(),
+ L, LoopFeederPhi))
+ return false;
+
+ // Iterate over the uses of the initial value. If the initial value is used
+ // in a compare, then we assume this is a range check that ensures the loop
+ // doesn't underflow. This is not an exact test and should be improved.
+ for (MachineRegisterInfo::use_instr_nodbg_iterator I = MRI->use_instr_nodbg_begin(Reg),
+ E = MRI->use_instr_nodbg_end(); I != E; ++I) {
+ MachineInstr *MI = &*I;
+ unsigned CmpReg1 = 0, CmpReg2 = 0;
+ int CmpMask = 0, CmpValue = 0;
+
+ if (!TII->analyzeCompare(*MI, CmpReg1, CmpReg2, CmpMask, CmpValue))
+ continue; // Not a compare that the target can analyze.
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 2> Cond;
+ if (TII->analyzeBranch(*MI->getParent(), TBB, FBB, Cond, false))
+ continue; // The branch of the compare's block cannot be analyzed.
+
+ Comparison::Kind Cmp =
+ getComparisonKind(MI->getOpcode(), nullptr, nullptr, 0);
+ if (Cmp == 0)
+ continue;
+ if (TII->predOpcodeHasNot(Cond) ^ (TBB != MBB))
+ Cmp = Comparison::getNegatedComparison(Cmp);
+ if (CmpReg2 != 0 && CmpReg2 == Reg) // Reg is the second compare operand.
+ Cmp = Comparison::getSwappedComparison(Cmp);
+
+ // Signed underflow is undefined, so nothing has to be guarded against.
+ if (Comparison::isSigned(Cmp))
+ return false;
+
+ // Check if there is a comparison of the initial value. If the initial value
+ // is greater than or not equal to another value, then assume this is a
+ // range check that protects the loop.
+ if ((Cmp & Comparison::G) || Cmp == Comparison::NE)
+ return false;
+ }
+
+ // OK - this is a hack that needs to be improved. We really need to analyze
+ // the instructions performed on the initial value. For now any definition
+ // other than a copy or a Phi is optimistically treated as safe.
+ if (!Def->isCopy() && !Def->isPHI())
+ return false;
+
+ return true;
+}
+
+/// Try to evaluate \p MO as an integer constant.
+///
+/// An immediate operand yields its value directly. A virtual register is
+/// traced through its defining instruction: copies and transfer-immediate /
+/// constant instructions are followed recursively, and 64-bit values built
+/// by the combine instructions or by a two-input REG_SEQUENCE are assembled
+/// from their 32-bit halves. A subregister index on \p MO then selects the
+/// matching (zero-extended) half of the result.
+///
+/// \param MO  Operand to evaluate.
+/// \param Val Receives the constant; only written when true is returned.
+/// \returns true if a constant value could be determined.
+bool HexagonHardwareLoops::checkForImmediate(const MachineOperand &MO,
+                                             int64_t &Val) const {
+  if (MO.isImm()) {
+    Val = MO.getImm();
+    return true;
+  }
+  if (!MO.isReg())
+    return false;
+
+  // Join two 32-bit halves into one 64-bit value. The work is done on
+  // unsigned values: shifting a negative signed value left is undefined
+  // behavior, and a negative (sign-extended) low half must not spill its
+  // sign bits into the high half.
+  auto MakeInt64 = [](int64_t Hi, int64_t Lo) -> int64_t {
+    const uint64_t Bits = (static_cast<uint64_t>(Hi) << 32) |
+                          (static_cast<uint64_t>(Lo) & 0xFFFFFFFFULL);
+    return static_cast<int64_t>(Bits);
+  };
+
+  // MO is a register. Check whether it is defined as an immediate value,
+  // and if so, get the value of it in TV. That value will then need to be
+  // processed to handle potential subregisters in MO.
+  int64_t TV;
+
+  unsigned R = MO.getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(R))
+    return false;
+  MachineInstr *DI = MRI->getVRegDef(R);
+  // Without a defining instruction there is nothing to evaluate.
+  if (!DI)
+    return false;
+  unsigned DOpc = DI->getOpcode();
+  switch (DOpc) {
+    case TargetOpcode::COPY:
+    case Hexagon::A2_tfrsi:
+    case Hexagon::A2_tfrpi:
+    case Hexagon::CONST32:
+    case Hexagon::CONST64: {
+      // Call recursively to avoid an extra check whether operand(1) is
+      // indeed an immediate (it could be a global address, for example),
+      // plus we can handle COPY at the same time.
+      if (!checkForImmediate(DI->getOperand(1), TV))
+        return false;
+      break;
+    }
+    case Hexagon::A2_combineii:
+    case Hexagon::A4_combineir:
+    case Hexagon::A4_combineii:
+    case Hexagon::A4_combineri:
+    case Hexagon::A2_combinew: {
+      // Operand 1 supplies the high word, operand 2 the low word.
+      const MachineOperand &S1 = DI->getOperand(1);
+      const MachineOperand &S2 = DI->getOperand(2);
+      int64_t V1, V2;
+      if (!checkForImmediate(S1, V1) || !checkForImmediate(S2, V2))
+        return false;
+      TV = MakeInt64(V1, V2);
+      break;
+    }
+    case TargetOpcode::REG_SEQUENCE: {
+      // Operands come as (value, subregister index) pairs.
+      const MachineOperand &S1 = DI->getOperand(1);
+      const MachineOperand &S3 = DI->getOperand(3);
+      int64_t V1, V3;
+      if (!checkForImmediate(S1, V1) || !checkForImmediate(S3, V3))
+        return false;
+      unsigned Sub2 = DI->getOperand(2).getImm();
+      unsigned Sub4 = DI->getOperand(4).getImm();
+      if (Sub2 == Hexagon::isub_lo && Sub4 == Hexagon::isub_hi)
+        TV = MakeInt64(V3, V1);
+      else if (Sub2 == Hexagon::isub_hi && Sub4 == Hexagon::isub_lo)
+        TV = MakeInt64(V1, V3);
+      else
+        llvm_unreachable("Unexpected form of REG_SEQUENCE");
+      break;
+    }
+
+    default:
+      return false;
+  }
+
+  // By now, we should have successfully obtained the immediate value defining
+  // the register referenced in MO. Handle a potential use of a subregister.
+  switch (MO.getSubReg()) {
+    case Hexagon::isub_lo:
+      Val = TV & 0xFFFFFFFFULL;
+      break;
+    case Hexagon::isub_hi:
+      Val = (TV >> 32) & 0xFFFFFFFFULL;
+      break;
+    default:
+      Val = TV;
+      break;
+  }
+  return true;
+}
+
+/// Give \p MO the constant value \p Val. An immediate operand is updated in
+/// place. A register operand is redirected to a fresh virtual register that
+/// is defined, right before the old defining instruction, by a new
+/// instruction with the same opcode and \p Val as its immediate.
+void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) {
+  if (!MO.isImm()) {
+    assert(MO.isReg());
+    const unsigned OldReg = MO.getReg();
+    MachineInstr *OldDef = MRI->getVRegDef(OldReg);
+
+    // The replacement register gets the same register class as the old one.
+    const unsigned NewReg =
+        MRI->createVirtualRegister(MRI->getRegClass(OldReg));
+    MachineBasicBlock &DefBlock = *OldDef->getParent();
+    DebugLoc Loc = OldDef->getDebugLoc();
+    BuildMI(DefBlock, OldDef, Loc, TII->get(OldDef->getOpcode()), NewReg)
+      .addImm(Val);
+    MO.setReg(NewReg);
+    return;
+  }
+
+  MO.setImm(Val);
+}
+
+/// Check whether \p Imm can be encoded as the immediate operand of the
+/// compare instruction \p CmpOpc.
+static bool isImmValidForOpcode(unsigned CmpOpc, int64_t Imm) {
+  switch (CmpOpc) {
+  // These two instructions are not extendable, so the value has to fit
+  // into 8 bits (unsigned for cmpbeqi, signed for cmpbgti).
+  case Hexagon::A4_cmpbeqi:
+    return isUInt<8>(Imm);
+  case Hexagon::A4_cmpbgti:
+    return isInt<8>(Imm);
+  // The rest of the comparison-with-immediate instructions are extendable.
+  default:
+    return true;
+  }
+}
+
+bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
+ MachineBasicBlock *Header = L->getHeader();
+ MachineBasicBlock *Latch = L->getLoopLatch();
+ MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+
+ if (!(Header && Latch && ExitingBlock))
+ return false;
+
+ // These data structures follow the same concept as the corresponding
+ // ones in findInductionRegister (where some comments are).
+ typedef std::pair<unsigned,int64_t> RegisterBump;
+ typedef std::pair<unsigned,RegisterBump> RegisterInduction;
+ typedef std::set<RegisterInduction> RegisterInductionSet;
+
+ // Register candidates for induction variables, with their associated bumps.
+ RegisterInductionSet IndRegs;
+
+ // Look for induction patterns:
+ // vreg1 = PHI ..., [ latch, vreg2 ]
+ // vreg2 = ADD vreg1, imm
+ typedef MachineBasicBlock::instr_iterator instr_iterator;
+ for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+ I != E && I->isPHI(); ++I) {
+ MachineInstr *Phi = &*I;
+
+ // Have a PHI instruction.
+ for (unsigned i = 1, n = Phi->getNumOperands(); i < n; i += 2) {
+ if (Phi->getOperand(i+1).getMBB() != Latch)
+ continue;
+
+ unsigned PhiReg = Phi->getOperand(i).getReg();
+ MachineInstr *DI = MRI->getVRegDef(PhiReg);
+
+ if (DI->getDesc().isAdd()) {
+ // If the register operand to the add/sub is the PHI we are looking
+ // at, this meets the induction pattern.
+ unsigned IndReg = DI->getOperand(1).getReg();
+ MachineOperand &Opnd2 = DI->getOperand(2);
+ int64_t V;
+ if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(Opnd2, V)) {
+ unsigned UpdReg = DI->getOperand(0).getReg();
+ IndRegs.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V)));
+ }
+ }
+ } // for (i)
+ } // for (instr)
+
+ if (IndRegs.empty())
+ return false;
+
+ MachineBasicBlock *TB = nullptr, *FB = nullptr;
+ SmallVector<MachineOperand,2> Cond;
+ // AnalyzeBranch returns true if it fails to analyze branch.
+ bool NotAnalyzed = TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false);
+ if (NotAnalyzed || Cond.empty())
+ return false;
+
+ if (ExitingBlock != Latch && (TB == Latch || FB == Latch)) {
+ MachineBasicBlock *LTB = nullptr, *LFB = nullptr;
+ SmallVector<MachineOperand,2> LCond;
+ bool NotAnalyzed = TII->analyzeBranch(*Latch, LTB, LFB, LCond, false);
+ if (NotAnalyzed)
+ return false;
+
+ // Since latch is not the exiting block, the latch branch should be an
+ // unconditional branch to the loop header.
+ if (TB == Latch)
+ TB = (LTB == Header) ? LTB : LFB;
+ else
+ FB = (LTB == Header) ? LTB : LFB;
+ }
+ if (TB != Header) {
+ if (FB != Header) {
+ // The latch/exit block does not go back to the header.
+ return false;
+ }
+ // FB is the header (i.e., uncond. jump to branch header)
+ // In this case, the LoopBody -> TB should not be a back edge otherwise
+ // it could result in an infinite loop after conversion to hw_loop.
+ // This case can happen when the Latch has two jumps like this:
+ // Jmp_c OuterLoopHeader <-- TB
+ // Jmp InnerLoopHeader <-- FB
+ if (MDT->dominates(TB, FB))
+ return false;
+ }
+
+ // Expecting a predicate register as a condition. It won't be a hardware
+ // predicate register at this point yet, just a vreg.
+ // HexagonInstrInfo::AnalyzeBranch for negated branches inserts imm(0)
+ // into Cond, followed by the predicate register. For non-negated branches
+ // it's just the register.
+ unsigned CSz = Cond.size();
+ if (CSz != 1 && CSz != 2)
+ return false;
+
+ if (!Cond[CSz-1].isReg())
+ return false;
+
+ unsigned P = Cond[CSz-1].getReg();
+ MachineInstr *PredDef = MRI->getVRegDef(P);
+
+ if (!PredDef->isCompare())
+ return false;
+
+ SmallSet<unsigned,2> CmpRegs;
+ MachineOperand *CmpImmOp = nullptr;
+
+ // Go over all operands to the compare and look for immediate and register
+ // operands. Assume that if the compare has a single register use and a
+ // single immediate operand, then the register is being compared with the
+ // immediate value.
+ for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) {
+ MachineOperand &MO = PredDef->getOperand(i);
+ if (MO.isReg()) {
+ // Skip all implicit references. In one case there was:
+ // %vreg140<def> = FCMPUGT32_rr %vreg138, %vreg139, %USR<imp-use>
+ if (MO.isImplicit())
+ continue;
+ if (MO.isUse()) {
+ if (!isImmediate(MO)) {
+ CmpRegs.insert(MO.getReg());
+ continue;
+ }
+ // Consider the register to be the "immediate" operand.
+ if (CmpImmOp)
+ return false;
+ CmpImmOp = &MO;
+ }
+ } else if (MO.isImm()) {
+ if (CmpImmOp) // A second immediate argument? Confusing. Bail out.
+ return false;
+ CmpImmOp = &MO;
+ }
+ }
+
+ if (CmpRegs.empty())
+ return false;
+
+ // Check if the compared register follows the order we want. Fix if needed.
+ for (RegisterInductionSet::iterator I = IndRegs.begin(), E = IndRegs.end();
+ I != E; ++I) {
+ // This is a success. If the register used in the comparison is one that
+ // we have identified as a bumped (updated) induction register, there is
+ // nothing to do.
+ if (CmpRegs.count(I->first))
+ return true;
+
+ // Otherwise, if the register being compared comes out of a PHI node,
+ // and has been recognized as following the induction pattern, and is
+ // compared against an immediate, we can fix it.
+ const RegisterBump &RB = I->second;
+ if (CmpRegs.count(RB.first)) {
+ if (!CmpImmOp) {
+ // If both operands to the compare instruction are registers, see if
+ // it can be changed to use induction register as one of the operands.
+ MachineInstr *IndI = nullptr;
+ MachineInstr *nonIndI = nullptr;
+ MachineOperand *IndMO = nullptr;
+ MachineOperand *nonIndMO = nullptr;
+
+ for (unsigned i = 1, n = PredDef->getNumOperands(); i < n; ++i) {
+ MachineOperand &MO = PredDef->getOperand(i);
+ if (MO.isReg() && MO.getReg() == RB.first) {
+ DEBUG(dbgs() << "\n DefMI(" << i << ") = "
+ << *(MRI->getVRegDef(I->first)));
+ if (IndI)
+ return false;
+
+ IndI = MRI->getVRegDef(I->first);
+ IndMO = &MO;
+ } else if (MO.isReg()) {
+ DEBUG(dbgs() << "\n DefMI(" << i << ") = "
+ << *(MRI->getVRegDef(MO.getReg())));
+ if (nonIndI)
+ return false;
+
+ nonIndI = MRI->getVRegDef(MO.getReg());
+ nonIndMO = &MO;
+ }
+ }
+ if (IndI && nonIndI &&
+ nonIndI->getOpcode() == Hexagon::A2_addi &&
+ nonIndI->getOperand(2).isImm() &&
+ nonIndI->getOperand(2).getImm() == - RB.second) {
+ bool Order = orderBumpCompare(IndI, PredDef);
+ if (Order) {
+ IndMO->setReg(I->first);
+ nonIndMO->setReg(nonIndI->getOperand(1).getReg());
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // It is not valid to do this transformation on an unsigned comparison
+ // because it may underflow.
+ Comparison::Kind Cmp =
+ getComparisonKind(PredDef->getOpcode(), nullptr, nullptr, 0);
+ if (!Cmp || Comparison::isUnsigned(Cmp))
+ return false;
+
+ // If the register is being compared against an immediate, try changing
+ // the compare instruction to use induction register and adjust the
+ // immediate operand.
+ int64_t CmpImm = getImmediate(*CmpImmOp);
+ int64_t V = RB.second;
+ // Handle Overflow (64-bit).
+ if (((V > 0) && (CmpImm > INT64_MAX - V)) ||
+ ((V < 0) && (CmpImm < INT64_MIN - V)))
+ return false;
+ CmpImm += V;
+ // Most comparisons of register against an immediate value allow
+ // the immediate to be constant-extended. There are some exceptions
+ // though. Make sure the new combination will work.
+ if (CmpImmOp->isImm())
+ if (!isImmValidForOpcode(PredDef->getOpcode(), CmpImm))
+ return false;
+
+ // Make sure that the compare happens after the bump. Otherwise,
+ // after the fixup, the compare would use a yet-undefined register.
+ MachineInstr *BumpI = MRI->getVRegDef(I->first);
+ bool Order = orderBumpCompare(BumpI, PredDef);
+ if (!Order)
+ return false;
+
+ // Finally, fix the compare instruction.
+ setImmediate(*CmpImmOp, CmpImm);
+ for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) {
+ MachineOperand &MO = PredDef->getOperand(i);
+ if (MO.isReg() && MO.getReg() == RB.first) {
+ MO.setReg(I->first);
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/// createPreheaderForLoop - Return the preheader of loop L, creating one if needed; returns nullptr if HWCreatePreheader is off or the CFG cannot be handled.
+MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
+ MachineLoop *L) {
+ if (MachineBasicBlock *TmpPH = MLI->findLoopPreheader(L, SpecPreheader))
+ return TmpPH;
+ if (!HWCreatePreheader)
+ return nullptr;
+
+ MachineBasicBlock *Header = L->getHeader();
+ MachineBasicBlock *Latch = L->getLoopLatch();
+ MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+ MachineFunction *MF = Header->getParent();
+ DebugLoc DL;
+
+#ifndef NDEBUG
+ if ((PHFn != "") && (PHFn != MF->getName()))
+ return nullptr;
+#endif
+
+ if (!Latch || !ExitingBlock || Header->hasAddressTaken())
+ return nullptr;
+
+ typedef MachineBasicBlock::instr_iterator instr_iterator;
+
+ // Verify that all existing predecessors have analyzable branches
+ // (or no branches at all).
+ typedef std::vector<MachineBasicBlock*> MBBVector;
+ MBBVector Preds(Header->pred_begin(), Header->pred_end());
+ SmallVector<MachineOperand,2> Tmp1;
+ MachineBasicBlock *TB = nullptr, *FB = nullptr;
+
+ if (TII->analyzeBranch(*ExitingBlock, TB, FB, Tmp1, false))
+ return nullptr;
+
+ for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
+ MachineBasicBlock *PB = *I;
+ bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp1, false);
+ if (NotAnalyzed)
+ return nullptr;
+ }
+
+ MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock();
+ MF->insert(Header->getIterator(), NewPH);
+
+ if (Header->pred_size() > 2) {
+ // Ensure that the header has only two predecessors: the preheader and
+ // the loop latch. Any additional predecessors of the header should
+ // join at the newly created preheader. Inspect all PHI nodes from the
+ // header and create appropriate corresponding PHI nodes in the preheader.
+
+ for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+ I != E && I->isPHI(); ++I) {
+ MachineInstr *PN = &*I;
+
+ const MCInstrDesc &PD = TII->get(TargetOpcode::PHI);
+ MachineInstr *NewPN = MF->CreateMachineInstr(PD, DL);
+ NewPH->insert(NewPH->end(), NewPN);
+
+ unsigned PR = PN->getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(PR);
+ unsigned NewPR = MRI->createVirtualRegister(RC);
+ NewPN->addOperand(MachineOperand::CreateReg(NewPR, true));
+
+ // Copy all non-latch operands of a header's PHI node to the newly
+ // created PHI node in the preheader.
+ for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) {
+ unsigned PredR = PN->getOperand(i).getReg();
+ unsigned PredRSub = PN->getOperand(i).getSubReg();
+ MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB();
+ if (PredB == Latch)
+ continue;
+
+ MachineOperand MO = MachineOperand::CreateReg(PredR, false);
+ MO.setSubReg(PredRSub);
+ NewPN->addOperand(MO);
+ NewPN->addOperand(MachineOperand::CreateMBB(PredB));
+ }
+
+ // Remove copied operands from the old PHI node and add the value
+ // coming from the preheader's PHI.
+ for (int i = PN->getNumOperands()-2; i > 0; i -= 2) {
+ MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB();
+ if (PredB != Latch) {
+ PN->RemoveOperand(i+1);
+ PN->RemoveOperand(i);
+ }
+ }
+ PN->addOperand(MachineOperand::CreateReg(NewPR, false));
+ PN->addOperand(MachineOperand::CreateMBB(NewPH));
+ }
+ } else {
+ assert(Header->pred_size() == 2);
+
+ // The header has only two predecessors, but the non-latch predecessor
+ // is not a preheader (e.g. it has other successors, etc.)
+ // In such a case we don't need any extra PHI nodes in the new preheader,
+ // all we need is to adjust existing PHIs in the header to now refer to
+ // the new preheader.
+ for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+ I != E && I->isPHI(); ++I) {
+ MachineInstr *PN = &*I;
+ for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) {
+ MachineOperand &MO = PN->getOperand(i+1);
+ if (MO.getMBB() != Latch)
+ MO.setMBB(NewPH);
+ }
+ }
+ }
+
+ // "Reroute" the CFG edges to link in the new preheader.
+ // If any of the predecessors falls through to the header, insert a branch
+ // to the new preheader in that place.
+ SmallVector<MachineOperand,1> Tmp2;
+ SmallVector<MachineOperand,1> EmptyCond;
+
+ TB = FB = nullptr;
+
+ for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
+ MachineBasicBlock *PB = *I;
+ if (PB != Latch) {
+ Tmp2.clear();
+ bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp2, false);
+ (void)NotAnalyzed; // suppress compiler warning
+ assert (!NotAnalyzed && "Should be analyzable!");
+ if (TB != Header && (Tmp2.empty() || FB != Header))
+ TII->insertBranch(*PB, NewPH, nullptr, EmptyCond, DL);
+ PB->ReplaceUsesOfBlockWith(Header, NewPH);
+ }
+ }
+
+ // NewPH now sits directly before the header in the block layout, so a latch
+ // that used to fall through into the header needs an explicit branch to it.
+ TB = FB = nullptr;
+ bool LatchNotAnalyzed = TII->analyzeBranch(*Latch, TB, FB, Tmp2, false);
+ (void)LatchNotAnalyzed; // suppress compiler warning
+ assert (!LatchNotAnalyzed && "Should be analyzable!");
+ if (!TB && !FB)
+ TII->insertBranch(*Latch, Header, nullptr, EmptyCond, DL);
+
+ // Finally, the branch from the preheader to the header.
+ TII->insertBranch(*NewPH, Header, nullptr, EmptyCond, DL);
+ NewPH->addSuccessor(Header);
+
+ MachineLoop *ParentLoop = L->getParentLoop();
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewPH, MLI->getBase());
+
+ // Update the dominator information with the new preheader.
+ if (MDT) {
+ if (MachineDomTreeNode *HN = MDT->getNode(Header)) {
+ if (MachineDomTreeNode *DHN = HN->getIDom()) {
+ MDT->addNewBlock(NewPH, DHN->getBlock());
+ MDT->changeImmediateDominator(Header, NewPH);
+ }
+ }
+ }
+
+ return NewPH;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
new file mode 100644
index 000000000000..036b18678709
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
@@ -0,0 +1,140 @@
+//===-- HexagonHazardRecognizer.cpp - Hexagon Post RA Hazard Recognizer ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the hazard recognizer for scheduling on Hexagon.
+// Use a DFA based hazard recognizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonHazardRecognizer.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-sched"
+
+void HexagonHazardRecognizer::Reset() { // Clear reserved resources, packet count, .cur tracking and packet defs.
+ DEBUG(dbgs() << "Reset hazard recognizer\n");
+ Resources->clearResources();
+ PacketNum = 0;
+ UsesDotCur = nullptr;
+ DotCurPNum = -1;
+ RegDefs.clear();
+}
+
+ScheduleHazardRecognizer::HazardType
+HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) {
+ MachineInstr *MI = SU->getInstr();
+ if (!MI || TII->isZeroCost(MI->getOpcode()))
+ return NoHazard;
+
+ if (!Resources->canReserveResources(*MI)) {
+ DEBUG(dbgs() << "*** Hazard in cycle " << PacketNum << ", " << *MI);
+ HazardType RetVal = Hazard;
+ if (TII->mayBeNewStore(*MI)) {
+ // A .new store is only possible if the stored register (the last operand)
+ // is defined by an instruction already in this packet.
+ MachineOperand &MO = MI->getOperand(MI->getNumOperands() - 1);
+ if (!MO.isReg() || RegDefs.count(MO.getReg()) == 0)
+ return Hazard;
+ // The .new store version uses different resources, so probe them with a
+ // temporary .new instruction (deleted again below).
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineInstr *NewMI =
+ MF->CreateMachineInstr(TII->get(TII->getDotNewOp(*MI)),
+ MI->getDebugLoc());
+ if (Resources->canReserveResources(*NewMI))
+ RetVal = NoHazard;
+ DEBUG(dbgs() << "*** Try .new version? " << (RetVal == NoHazard) << "\n");
+ MF->DeleteMachineInstr(NewMI);
+ }
+ return RetVal;
+ }
+
+ if (SU == UsesDotCur && DotCurPNum != (int)PacketNum) { // .cur user outside the packet of its load.
+ DEBUG(dbgs() << "*** .cur Hazard in cycle " << PacketNum << ", " << *MI);
+ return Hazard;
+ }
+
+ return NoHazard;
+}
+
+void HexagonHazardRecognizer::AdvanceCycle() { // Close the current packet and start the next one.
+ DEBUG(dbgs() << "Advance cycle, clear state\n");
+ Resources->clearResources();
+ if (DotCurPNum != -1 && DotCurPNum != (int)PacketNum) { // .cur was recorded for a packet other than the one ending now.
+ UsesDotCur = nullptr;
+ DotCurPNum = -1;
+ }
+ PacketNum++;
+ RegDefs.clear();
+}
+
+/// While a dot cur load has a pending user: in the packet that holds the load,
+/// prefer another candidate unless SU is that user; in any other packet,
+/// prefer another candidate only if SU is the user. The XOR below encodes
+/// exactly these two cases; with no pending user the result is false.
+bool HexagonHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+ return UsesDotCur && ((SU == UsesDotCur) ^ (DotCurPNum == (int)PacketNum));
+}
+
+void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) {
+ MachineInstr *MI = SU->getInstr();
+ if (!MI)
+ return;
+
+ // Record the explicit register definitions of this instruction in the
+ // per-packet set, which is used to determine if a .new can be used.
+ for (const MachineOperand &MO : MI->operands())
+ if (MO.isReg() && MO.isDef() && !MO.isImplicit())
+ RegDefs.insert(MO.getReg());
+
+ if (TII->isZeroCost(MI->getOpcode()))
+ return;
+
+ if (!Resources->canReserveResources(*MI)) {
+ // It must be a .new store since other instructions must be able to be
+ // reserved at this point.
+ assert(TII->mayBeNewStore(*MI) && "Expecting .new store");
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineInstr *NewMI =
+ MF->CreateMachineInstr(TII->get(TII->getDotNewOp(*MI)),
+ MI->getDebugLoc());
+ assert(Resources->canReserveResources(*NewMI));
+ Resources->reserveResources(*NewMI);
+ MF->DeleteMachineInstr(NewMI);
+ }
+ else
+ Resources->reserveResources(*MI);
+ DEBUG(dbgs() << " Add instruction " << *MI);
+
+ // When scheduling a dot cur instruction, check if there is an instruction
+ // that can use the dot cur in the same packet. If so, we'll attempt to
+ // schedule it before other instructions. We only do this if the use has
+ // the same height as the dot cur. Otherwise, we may miss scheduling an
+ // instruction with a greater height, which is more important.
+ if (TII->mayBeCurLoad(*MI))
+ for (auto &S : SU->Succs)
+ if (S.isAssignedRegDep() && S.getLatency() == 0 &&
+ SU->getHeight() == S.getSUnit()->getHeight()) {
+ UsesDotCur = S.getSUnit();
+ DotCurPNum = PacketNum;
+ break;
+ }
+ if (SU == UsesDotCur) { // The awaited dot cur user has now been emitted.
+ UsesDotCur = nullptr;
+ DotCurPNum = -1;
+ }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
new file mode 100644
index 000000000000..70efcb7a9f76
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
@@ -0,0 +1,78 @@
+//===--- HexagonHazardRecognizer.h - Hexagon Post RA Hazard Recognizer ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file defines the hazard recognizer for scheduling on Hexagon.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONPROFITRECOGNIZER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONPROFITRECOGNIZER_H
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+
+namespace llvm {
+
+class HexagonHazardRecognizer : public ScheduleHazardRecognizer {
+ DFAPacketizer *Resources;
+ const HexagonInstrInfo *TII;
+ unsigned PacketNum;
+ // The node expected to use a potential dot cur load emitted in the current
+ // packet (null if none). Used for the scheduling priority function.
+ SUnit *UsesDotCur;
+ // The packet number when a dot cur is emitted (-1 if none). If its use is
+ // not generated in the same packet, try to wait another cycle before emitting.
+ int DotCurPNum;
+ // The set of registers defined by instructions in the current packet.
+ SmallSet<unsigned, 8> RegDefs;
+
+public:
+ HexagonHazardRecognizer(const InstrItineraryData *II,
+ const HexagonInstrInfo *HII,
+ const HexagonSubtarget &ST)
+ : Resources(ST.createDFAPacketizer(II)), TII(HII), PacketNum(0),
+ UsesDotCur(nullptr), DotCurPNum(-1) { }
+
+ ~HexagonHazardRecognizer() override {
+ if (Resources)
+ delete Resources;
+ }
+
+ /// This callback is invoked when a new block of instructions is about to be
+ /// scheduled. The hazard state is set to an initialized state.
+ void Reset() override;
+
+ /// Return the hazard type of emitting this node. This recognizer reports
+ /// one of two results:
+ /// * NoHazard: it is legal to issue this instruction on this cycle.
+ /// * Hazard: issuing this instruction would stall the machine. If some
+ /// other instruction is available, issue it first.
+ HazardType getHazardType(SUnit *SU, int stalls) override;
+
+ /// This callback is invoked when an instruction is emitted to be scheduled,
+ /// to advance the hazard state.
+ void EmitInstruction(SUnit *) override;
+
+ /// This callback may be invoked if getHazardType returns NoHazard. If, even
+ /// though there is no hazard, it would be better to schedule another
+ /// available instruction, this callback should return true.
+ bool ShouldPreferAnother(SUnit *) override;
+
+ /// This callback is invoked whenever the next top-down instruction to be
+ /// scheduled cannot issue in the current cycle, either because of latency
+ /// or resource conflicts. This should increment the internal state of the
+ /// hazard recognizer so that previously "Hazard" instructions will now not
+ /// be hazards.
+ void AdvanceCycle() override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONPROFITRECOGNIZER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
new file mode 100644
index 000000000000..f6012d29d422
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -0,0 +1,2002 @@
+//===-- HexagonISelDAGToDAG.cpp - A dag to dag inst selector for Hexagon --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Hexagon target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-isel"
+
+static
+cl::opt<bool>
+EnableAddressRebalancing("isel-rebalance-addr", cl::Hidden, cl::init(true),
+ cl::desc("Rebalance address calculation trees to improve "
+ "instruction selection"));
+
+// When set, rebalance only if doing so enables an optimization, e.g.
+// combining a GA with an offset or factoring out a shift. Off by default.
+static
+cl::opt<bool>
+RebalanceOnlyForOptimizations("rebalance-only-opt", cl::Hidden, cl::init(false),
+ cl::desc("Rebalance address tree only if this allows optimizations"));
+
+static
+cl::opt<bool>
+RebalanceOnlyImbalancedTrees("rebalance-only-imbal", cl::Hidden,
+ cl::init(false), cl::desc("Rebalance address tree only if it is imbalanced"));
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===--------------------------------------------------------------------===//
+/// HexagonDAGToDAGISel - Hexagon specific code to select Hexagon machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class HexagonDAGToDAGISel : public SelectionDAGISel {
+ const HexagonSubtarget *HST;
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
+public:
+ explicit HexagonDAGToDAGISel(HexagonTargetMachine &tm,
+ CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), HST(nullptr), HII(nullptr),
+ HRI(nullptr) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Refresh the cached subtarget, instr-info and reg-info for this function.
+ HST = &MF.getSubtarget<HexagonSubtarget>();
+ HII = HST->getInstrInfo();
+ HRI = HST->getRegisterInfo();
+ SelectionDAGISel::runOnMachineFunction(MF);
+ return true;
+ }
+
+ void PreprocessISelDAG() override;
+ void EmitFunctionEntryCode() override;
+
+ void Select(SDNode *N) override;
+
+ // Complex Pattern Selectors.
+ inline bool SelectAddrGA(SDValue &N, SDValue &R);
+ inline bool SelectAddrGP(SDValue &N, SDValue &R);
+ bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP);
+ bool SelectAddrFI(SDValue &N, SDValue &R);
+
+ StringRef getPassName() const override {
+ return "Hexagon DAG->DAG Pattern Instruction Selection";
+ }
+
+ // Generate a machine instruction node corresponding to the circ/brev
+ // load intrinsic.
+ MachineSDNode *LoadInstrForLoadIntrinsic(SDNode *IntN);
+ // Given the circ/brev load intrinsic and the already generated machine
+ // instruction, generate the appropriate store (that is a part of the
+ // intrinsic's functionality).
+ SDNode *StoreInstrForLoadIntrinsic(MachineSDNode *LoadN, SDNode *IntN);
+
+ void SelectFrameIndex(SDNode *N);
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+ bool tryLoadOfLoadIntrinsic(LoadSDNode *N);
+ void SelectLoad(SDNode *N);
+ void SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl);
+ void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl);
+ void SelectStore(SDNode *N);
+ void SelectSHL(SDNode *N);
+ void SelectMul(SDNode *N);
+ void SelectZeroExtend(SDNode *N);
+ void SelectIntrinsicWChain(SDNode *N);
+ void SelectIntrinsicWOChain(SDNode *N);
+ void SelectConstant(SDNode *N);
+ void SelectConstantFP(SDNode *N);
+ void SelectBitcast(SDNode *N);
+
+ // Include the pieces autogenerated from the target description.
+ #include "HexagonGenDAGISel.inc"
+
+private:
+ bool isValueExtension(const SDValue &Val, unsigned FromBits, SDValue &Src);
+ bool isOrEquivalentToAdd(const SDNode *N) const;
+ bool isAlignedMemNode(const MemSDNode *N) const;
+ bool isPositiveHalfWord(const SDNode *N) const;
+
+ SmallDenseMap<SDNode *,int> RootWeights;
+ SmallDenseMap<SDNode *,int> RootHeights;
+ SmallDenseMap<const Value *,int> GAUsesInFunction;
+ int getWeight(SDNode *N);
+ int getHeight(SDNode *N);
+ SDValue getMultiplierForSHL(SDNode *N);
+ SDValue factorOutPowerOf2(SDValue V, unsigned Power);
+ unsigned getUsesInFunction(const Value *V);
+ SDValue balanceSubTree(SDNode *N, bool Factorize = false);
+ void rebalanceAddressTrees();
+}; // end HexagonDAGToDAGISel
+} // end anonymous namespace
+
+
+/// createHexagonISelDag - Create the Hexagon instruction selection pass, which
+/// converts a legalized DAG into a Hexagon-specific DAG, ready for instruction
+/// scheduling. Returns a newly allocated HexagonDAGToDAGISel.
+namespace llvm {
+FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new HexagonDAGToDAGISel(TM, OptLevel);
+}
+}
+
+// Returns true if ID is one of the Hexagon intrinsics listed below, i.e. the intrinsics that return a predicate.
+static bool doesIntrinsicReturnPredicate(unsigned ID) {
+ switch (ID) {
+ default:
+ return false;
+ case Intrinsic::hexagon_C2_cmpeq:
+ case Intrinsic::hexagon_C2_cmpgt:
+ case Intrinsic::hexagon_C2_cmpgtu:
+ case Intrinsic::hexagon_C2_cmpgtup:
+ case Intrinsic::hexagon_C2_cmpgtp:
+ case Intrinsic::hexagon_C2_cmpeqp:
+ case Intrinsic::hexagon_C2_bitsset:
+ case Intrinsic::hexagon_C2_bitsclr:
+ case Intrinsic::hexagon_C2_cmpeqi:
+ case Intrinsic::hexagon_C2_cmpgti:
+ case Intrinsic::hexagon_C2_cmpgtui:
+ case Intrinsic::hexagon_C2_cmpgei:
+ case Intrinsic::hexagon_C2_cmpgeui:
+ case Intrinsic::hexagon_C2_cmplt:
+ case Intrinsic::hexagon_C2_cmpltu:
+ case Intrinsic::hexagon_C2_bitsclri:
+ case Intrinsic::hexagon_C2_and:
+ case Intrinsic::hexagon_C2_or:
+ case Intrinsic::hexagon_C2_xor:
+ case Intrinsic::hexagon_C2_andn:
+ case Intrinsic::hexagon_C2_not:
+ case Intrinsic::hexagon_C2_orn:
+ case Intrinsic::hexagon_C2_pxfer_map:
+ case Intrinsic::hexagon_C2_any8:
+ case Intrinsic::hexagon_C2_all8:
+ case Intrinsic::hexagon_A2_vcmpbeq:
+ case Intrinsic::hexagon_A2_vcmpbgtu:
+ case Intrinsic::hexagon_A2_vcmpheq:
+ case Intrinsic::hexagon_A2_vcmphgt:
+ case Intrinsic::hexagon_A2_vcmphgtu:
+ case Intrinsic::hexagon_A2_vcmpweq:
+ case Intrinsic::hexagon_A2_vcmpwgt:
+ case Intrinsic::hexagon_A2_vcmpwgtu:
+ case Intrinsic::hexagon_C2_tfrrp:
+ case Intrinsic::hexagon_S2_tstbit_i:
+ case Intrinsic::hexagon_S2_tstbit_r:
+ return true;
+ }
+}
+
+void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Offset = LD->getOffset();
+ int32_t Inc = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+ EVT LoadedVT = LD->getMemoryVT();
+ unsigned Opcode = 0;
+
+ // Check for zero extended loads. Treat any-extend loads as zero extended
+ // loads.
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ bool IsZeroExt = (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD);
+ bool IsValidInc = HII->isValidAutoIncImm(LoadedVT, Inc);
+
+ assert(LoadedVT.isSimple());
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ if (IsZeroExt)
+ Opcode = IsValidInc ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrub_io;
+ else
+ Opcode = IsValidInc ? Hexagon::L2_loadrb_pi : Hexagon::L2_loadrb_io;
+ break;
+ case MVT::i16:
+ if (IsZeroExt)
+ Opcode = IsValidInc ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadruh_io;
+ else
+ Opcode = IsValidInc ? Hexagon::L2_loadrh_pi : Hexagon::L2_loadrh_io;
+ break;
+ case MVT::i32:
+ Opcode = IsValidInc ? Hexagon::L2_loadri_pi : Hexagon::L2_loadri_io;
+ break;
+ case MVT::i64:
+ Opcode = IsValidInc ? Hexagon::L2_loadrd_pi : Hexagon::L2_loadrd_io;
+ break;
+ // 64B
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (isAlignedMemNode(LD))
+ Opcode = IsValidInc ? Hexagon::V6_vL32b_pi : Hexagon::V6_vL32b_ai;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi : Hexagon::V6_vL32Ub_ai;
+ break;
+ // 128B
+ case MVT::v128i8:
+ case MVT::v64i16:
+ case MVT::v32i32:
+ case MVT::v16i64:
+ if (isAlignedMemNode(LD))
+ Opcode = IsValidInc ? Hexagon::V6_vL32b_pi_128B
+ : Hexagon::V6_vL32b_ai_128B;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi_128B
+ : Hexagon::V6_vL32Ub_ai_128B;
+ break;
+ default:
+ llvm_unreachable("Unexpected memory type in indexed load");
+ }
+
+ SDValue IncV = CurDAG->getTargetConstant(Inc, dl, MVT::i32);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = LD->getMemOperand();
+
+ auto getExt64 = [this,ExtType] (MachineSDNode *N, const SDLoc &dl)
+ -> MachineSDNode* {
+ if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD) {
+ SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ return CurDAG->getMachineNode(Hexagon::A4_combineir, dl, MVT::i64,
+ Zero, SDValue(N, 0));
+ }
+ if (ExtType == ISD::SEXTLOAD)
+ return CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, MVT::i64,
+ SDValue(N, 0));
+ return N;
+ };
+
+ // Loaded value Next address Chain
+ SDValue From[3] = { SDValue(LD,0), SDValue(LD,1), SDValue(LD,2) };
+ SDValue To[3];
+
+ EVT ValueVT = LD->getValueType(0);
+ if (ValueVT == MVT::i64 && ExtType != ISD::NON_EXTLOAD) {
+ // A load extending to i64 will actually produce i32, which will then
+ // need to be extended to i64.
+ assert(LoadedVT.getSizeInBits() <= 32);
+ ValueVT = MVT::i32;
+ }
+
+ if (IsValidInc) {
+ MachineSDNode *L = CurDAG->getMachineNode(Opcode, dl, ValueVT,
+ MVT::i32, MVT::Other, Base,
+ IncV, Chain);
+ L->setMemRefs(MemOp, MemOp+1);
+ To[1] = SDValue(L, 1); // Next address.
+ To[2] = SDValue(L, 2); // Chain.
+ // Handle special case for extension to i64.
+ if (LD->getValueType(0) == MVT::i64)
+ L = getExt64(L, dl);
+ To[0] = SDValue(L, 0); // Loaded (extended) value.
+ } else { // Increment is not a valid auto-increment: load at offset 0, then add separately.
+ SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ MachineSDNode *L = CurDAG->getMachineNode(Opcode, dl, ValueVT, MVT::Other,
+ Base, Zero, Chain);
+ L->setMemRefs(MemOp, MemOp+1);
+ To[2] = SDValue(L, 1); // Chain.
+ MachineSDNode *A = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
+ Base, IncV);
+ To[1] = SDValue(A, 0); // Next address.
+ // Handle special case for extension to i64.
+ if (LD->getValueType(0) == MVT::i64)
+ L = getExt64(L, dl);
+ To[0] = SDValue(L, 0); // Loaded (extended) value.
+ }
+ ReplaceUses(From, To, 3);
+ CurDAG->RemoveDeadNode(LD);
+}
+
+
+MachineSDNode *HexagonDAGToDAGISel::LoadInstrForLoadIntrinsic(SDNode *IntN) {
+ if (IntN->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return nullptr;
+
+ SDLoc dl(IntN);
+ unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+
+ static std::map<unsigned,unsigned> LoadPciMap = {
+ { Intrinsic::hexagon_circ_ldb, Hexagon::L2_loadrb_pci },
+ { Intrinsic::hexagon_circ_ldub, Hexagon::L2_loadrub_pci },
+ { Intrinsic::hexagon_circ_ldh, Hexagon::L2_loadrh_pci },
+ { Intrinsic::hexagon_circ_lduh, Hexagon::L2_loadruh_pci },
+ { Intrinsic::hexagon_circ_ldw, Hexagon::L2_loadri_pci },
+ { Intrinsic::hexagon_circ_ldd, Hexagon::L2_loadrd_pci },
+ };
+ auto FLC = LoadPciMap.find(IntNo);
+ if (FLC != LoadPciMap.end()) {
+ SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
+ IntN->getOperand(4));
+ EVT ValTy = (IntNo == Intrinsic::hexagon_circ_ldd) ? MVT::i64 : MVT::i32;
+ EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+ // Operands: { Base, Increment, Modifier, Chain }
+ auto Inc = cast<ConstantSDNode>(IntN->getOperand(5));
+ SDValue I = CurDAG->getTargetConstant(Inc->getSExtValue(), dl, MVT::i32);
+ MachineSDNode *Res = CurDAG->getMachineNode(FLC->second, dl, RTys,
+ { IntN->getOperand(2), I, SDValue(Mod,0), IntN->getOperand(0) });
+ return Res;
+ }
+
+ static std::map<unsigned,unsigned> LoadPbrMap = {
+ { Intrinsic::hexagon_brev_ldb, Hexagon::L2_loadrb_pbr },
+ { Intrinsic::hexagon_brev_ldub, Hexagon::L2_loadrub_pbr },
+ { Intrinsic::hexagon_brev_ldh, Hexagon::L2_loadrh_pbr },
+ { Intrinsic::hexagon_brev_lduh, Hexagon::L2_loadruh_pbr },
+ { Intrinsic::hexagon_brev_ldw, Hexagon::L2_loadri_pbr },
+ { Intrinsic::hexagon_brev_ldd, Hexagon::L2_loadrd_pbr },
+ };
+ auto FLB = LoadPbrMap.find(IntNo);
+ if (FLB != LoadPbrMap.end()) {
+ SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
+ IntN->getOperand(4));
+ EVT ValTy = (IntNo == Intrinsic::hexagon_brev_ldd) ? MVT::i64 : MVT::i32;
+ EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+ // Operands: { Base, Modifier, Chain }
+ MachineSDNode *Res = CurDAG->getMachineNode(FLB->second, dl, RTys,
+ { IntN->getOperand(2), SDValue(Mod,0), IntN->getOperand(0) });
+ return Res;
+ }
+
+ return nullptr; // Neither a circ nor a brev load intrinsic.
+}
+
+SDNode *HexagonDAGToDAGISel::StoreInstrForLoadIntrinsic(MachineSDNode *LoadN,
+ SDNode *IntN) {
+ // The "LoadN" is just a machine load instruction. The intrinsic also
+ // involves storing the loaded value. Generate an appropriate store to the
+ // location given in the intrinsic's operand(3), and return the selected store.
+ uint64_t F = HII->get(LoadN->getMachineOpcode()).TSFlags;
+ unsigned SizeBits = (F >> HexagonII::MemAccessSizePos) &
+ HexagonII::MemAccesSizeMask;
+ unsigned Size = 1U << (SizeBits-1); // Access size in bytes (see Size * 8 below).
+
+ SDLoc dl(IntN);
+ MachinePointerInfo PI;
+ SDValue TS;
+ SDValue Loc = IntN->getOperand(3);
+
+ if (Size >= 4)
+ TS = CurDAG->getStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc, PI,
+ Size);
+ else
+ TS = CurDAG->getTruncStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc,
+ PI, MVT::getIntegerVT(Size * 8), Size);
+
+ SDNode *StoreN;
+ {
+ HandleSDNode Handle(TS);
+ SelectStore(TS.getNode());
+ StoreN = Handle.getValue().getNode();
+ }
+
+ // Load's results are { Loaded value, Updated pointer, Chain }
+ ReplaceUses(SDValue(IntN, 0), SDValue(LoadN, 1)); // Intrinsic result 0 becomes the updated pointer.
+ ReplaceUses(SDValue(IntN, 1), SDValue(StoreN, 0)); // Intrinsic result 1 becomes the store's result 0.
+ return StoreN;
+}
+
+bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
+ // The intrinsics for load circ/brev perform two operations:
+ // 1. Load a value V from the specified location, using the addressing
+ // mode corresponding to the intrinsic.
+ // 2. Store V into a specified location. This location is typically a
+ // local, temporary object.
+ // In many cases, the program using these intrinsics will immediately
+ // load V again from the local object. In those cases, when certain
+ // conditions are met, the last load can be removed.
+ // This function identifies and optimizes this pattern. If the pattern
+ // cannot be optimized, it returns false, which will cause the load
+ // to be selected separately from the intrinsic (which will be handled
+ // in SelectIntrinsicWChain).
+
+ SDValue Ch = N->getOperand(0);
+ SDValue Loc = N->getOperand(1);
+
+ // Assume that the load and the intrinsic are connected directly with a
+ // chain:
+ // t1: i32,ch = int.load ..., ..., ..., Loc, ... // <-- C
+ // t2: i32,ch = load t1:1, Loc, ...
+ SDNode *C = Ch.getNode();
+
+ if (C->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ // The second load can only be eliminated if its extension type matches
+ // that of the load instruction corresponding to the intrinsic. The user
+ // can provide an address of an unsigned variable to store the result of
+ // a sign-extending intrinsic into (or the other way around).
+ ISD::LoadExtType IntExt;
+ switch (cast<ConstantSDNode>(C->getOperand(1))->getZExtValue()) {
+ case Intrinsic::hexagon_brev_ldub:
+ case Intrinsic::hexagon_brev_lduh:
+ case Intrinsic::hexagon_circ_ldub:
+ case Intrinsic::hexagon_circ_lduh:
+ IntExt = ISD::ZEXTLOAD;
+ break;
+ case Intrinsic::hexagon_brev_ldw:
+ case Intrinsic::hexagon_brev_ldd:
+ case Intrinsic::hexagon_circ_ldw:
+ case Intrinsic::hexagon_circ_ldd:
+ IntExt = ISD::NON_EXTLOAD;
+ break;
+ default:
+ IntExt = ISD::SEXTLOAD;
+ break;
+ }
+ if (N->getExtensionType() != IntExt)
+ return false;
+
+ // Make sure the target location for the loaded value in the load intrinsic
+ // is the location from which N is loading.
+ if (C->getNumOperands() < 4 || Loc.getNode() != C->getOperand(3).getNode())
+ return false;
+
+ if (MachineSDNode *L = LoadInstrForLoadIntrinsic(C)) {
+ SDNode *S = StoreInstrForLoadIntrinsic(L, C);
+ SDValue F[] = { SDValue(N,0), SDValue(N,1), SDValue(C,0), SDValue(C,1) };
+ SDValue T[] = { SDValue(L,0), SDValue(S,0), SDValue(L,1), SDValue(S,0) };
+ ReplaceUses(F, T, array_lengthof(T));
+ // This transformation will leave the intrinsic dead. If it remains in
+ // the DAG, the selection code will see it again, but without the load,
+ // and it will generate a store that is normally required for it.
+ CurDAG->RemoveDeadNode(C);
+ return true;
+ }
+
+ return false;
+}
+
+// Select a LOAD node: indexed loads and reloads of circ/brev intrinsic
+// results get special treatment, everything else goes to the generated
+// matcher.
+void HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
+  SDLoc Loc(N);
+  LoadSDNode *Load = cast<LoadSDNode>(N);
+
+  // Indexed loads have their own selection routine.
+  if (Load->getAddressingMode() != ISD::UNINDEXED) {
+    SelectIndexedLoad(Load, Loc);
+    return;
+  }
+
+  // Try the circ/brev load-intrinsic pattern first; fall back to the
+  // generated matcher if it does not apply.
+  if (!tryLoadOfLoadIntrinsic(Load))
+    SelectCode(Load);
+}
+
+// Select an indexed store (one whose addressing mode is not UNINDEXED; see
+// SelectStore). The store node has two results that get replaced here:
+// result 0 (the updated address) and result 1 (the chain).
+//
+// If the increment is a valid auto-increment immediate for the stored type,
+// a single "_pi" store instruction produces both results. Otherwise a plain
+// store at offset 0 from the base is emitted, and the updated address is
+// computed separately with A2_addi.
+void HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl) {
+ SDValue Chain = ST->getChain();
+ SDValue Base = ST->getBasePtr();
+ SDValue Offset = ST->getOffset();
+ SDValue Value = ST->getValue();
+ // Get the constant value.
+ // The cast requires the offset operand to be a ConstantSDNode.
+ int32_t Inc = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+ EVT StoredVT = ST->getMemoryVT();
+ EVT ValueVT = Value.getValueType();
+
+ bool IsValidInc = HII->isValidAutoIncImm(StoredVT, Inc);
+ unsigned Opcode = 0;
+
+ // Pick the store opcode from the memory type: the "_pi" form when the
+ // increment can be encoded, the "_io"/"_ai" form otherwise.
+ assert(StoredVT.isSimple());
+ switch (StoredVT.getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ Opcode = IsValidInc ? Hexagon::S2_storerb_pi : Hexagon::S2_storerb_io;
+ break;
+ case MVT::i16:
+ Opcode = IsValidInc ? Hexagon::S2_storerh_pi : Hexagon::S2_storerh_io;
+ break;
+ case MVT::i32:
+ Opcode = IsValidInc ? Hexagon::S2_storeri_pi : Hexagon::S2_storeri_io;
+ break;
+ case MVT::i64:
+ Opcode = IsValidInc ? Hexagon::S2_storerd_pi : Hexagon::S2_storerd_io;
+ break;
+ // 64B
+ // Vector stores use the unaligned ("Ub") opcode unless the memory node is
+ // known to be aligned to its store size (see isAlignedMemNode).
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (isAlignedMemNode(ST))
+ Opcode = IsValidInc ? Hexagon::V6_vS32b_pi : Hexagon::V6_vS32b_ai;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi : Hexagon::V6_vS32Ub_ai;
+ break;
+ // 128B
+ case MVT::v128i8:
+ case MVT::v64i16:
+ case MVT::v32i32:
+ case MVT::v16i64:
+ if (isAlignedMemNode(ST))
+ Opcode = IsValidInc ? Hexagon::V6_vS32b_pi_128B
+ : Hexagon::V6_vS32b_ai_128B;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi_128B
+ : Hexagon::V6_vS32Ub_ai_128B;
+ break;
+ default:
+ llvm_unreachable("Unexpected memory type in indexed store");
+ }
+
+ // A truncating store of a 64-bit value only needs the low 32 bits of the
+ // value, so store the isub_lo subregister instead.
+ if (ST->isTruncatingStore() && ValueVT.getSizeInBits() == 64) {
+ assert(StoredVT.getSizeInBits() < 64 && "Not a truncating store");
+ Value = CurDAG->getTargetExtractSubreg(Hexagon::isub_lo,
+ dl, MVT::i32, Value);
+ }
+
+ SDValue IncV = CurDAG->getTargetConstant(Inc, dl, MVT::i32);
+ // Carry the original store's memory operand over to the machine node.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = ST->getMemOperand();
+
+ // Next address Chain
+ // From[0]/To[0] is the updated address, From[1]/To[1] is the chain.
+ SDValue From[2] = { SDValue(ST,0), SDValue(ST,1) };
+ SDValue To[2];
+
+ if (IsValidInc) {
+ // Build post increment store.
+ // The machine node yields { updated address (i32), chain }.
+ SDValue Ops[] = { Base, IncV, Value, Chain };
+ MachineSDNode *S = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Other,
+ Ops);
+ S->setMemRefs(MemOp, MemOp + 1);
+ To[0] = SDValue(S, 0);
+ To[1] = SDValue(S, 1);
+ } else {
+ // The increment cannot be encoded: store to Base+0 (the node yields only
+ // a chain) and compute the updated address as Base+Inc with A2_addi.
+ SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Ops[] = { Base, Zero, Value, Chain };
+ MachineSDNode *S = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ S->setMemRefs(MemOp, MemOp + 1);
+ To[1] = SDValue(S, 0);
+ MachineSDNode *A = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
+ Base, IncV);
+ To[0] = SDValue(A, 0);
+ }
+
+ // Redirect all users of the original store and delete it.
+ ReplaceUses(From, To, 2);
+ CurDAG->RemoveDeadNode(ST);
+}
+
+// Select a STORE node: indexed stores are selected by hand, all other
+// stores go to the generated matcher.
+void HexagonDAGToDAGISel::SelectStore(SDNode *N) {
+  SDLoc Loc(N);
+  StoreSDNode *Store = cast<StoreSDNode>(N);
+
+  if (Store->getAddressingMode() == ISD::UNINDEXED) {
+    SelectCode(Store);
+    return;
+  }
+
+  // Any other addressing mode is an indexed store.
+  SelectIndexedStore(Store, Loc);
+}
+
+// Select a MUL node. An i64 multiply whose operands are both sign-extended
+// i32 values (either an explicit sign_extend or a sign-extending i32 load)
+// is turned into a single M2_dpmpyss_s0. Every other multiply is handed to
+// the generated matcher.
+void HexagonDAGToDAGISel::SelectMul(SDNode *N) {
+ SDLoc dl(N);
+
+ // %conv.i = sext i32 %tmp1 to i64
+ // %conv2.i = sext i32 %add to i64
+ // %mul.i = mul nsw i64 %conv2.i, %conv.i
+ //
+ // --- match with the following ---
+ //
+ // %mul.i = mpy (%tmp1, %add)
+ //
+
+ if (N->getValueType(0) == MVT::i64) {
+ // Shifting a i64 signed multiply.
+ SDValue MulOp0 = N->getOperand(0);
+ SDValue MulOp1 = N->getOperand(1);
+
+ // OP0/OP1 receive the 32-bit values that feed the multiply instruction.
+ SDValue OP0;
+ SDValue OP1;
+
+ // Handle sign_extend and sextload.
+ if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
+ SDValue Sext0 = MulOp0.getOperand(0);
+ // Only a sign extension from i32 qualifies.
+ if (Sext0.getNode()->getValueType(0) != MVT::i32) {
+ SelectCode(N);
+ return;
+ }
+ OP0 = Sext0;
+ } else if (MulOp0.getOpcode() == ISD::LOAD) {
+ LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode());
+ // Only an unindexed, sign-extending load of an i32 qualifies.
+ if (LD->getMemoryVT() != MVT::i32 ||
+ LD->getExtensionType() != ISD::SEXTLOAD ||
+ LD->getAddressingMode() != ISD::UNINDEXED) {
+ SelectCode(N);
+ return;
+ }
+ // Replace the extending load with a plain 32-bit load from the same
+ // base pointer at offset 0.
+ // NOTE(review): only result 0 of the new load is used here; its chain
+ // result is not substituted for the original load's chain and no memory
+ // operand is attached - confirm this is intended.
+ SDValue Chain = LD->getChain();
+ SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
+ MVT::Other,
+ LD->getBasePtr(), TargetConst0,
+ Chain), 0);
+ } else {
+ SelectCode(N);
+ return;
+ }
+
+ // Same goes for the second operand.
+ if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) {
+ SDValue Sext1 = MulOp1.getOperand(0);
+ if (Sext1.getNode()->getValueType(0) != MVT::i32) {
+ SelectCode(N);
+ return;
+ }
+ OP1 = Sext1;
+ } else if (MulOp1.getOpcode() == ISD::LOAD) {
+ LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode());
+ if (LD->getMemoryVT() != MVT::i32 ||
+ LD->getExtensionType() != ISD::SEXTLOAD ||
+ LD->getAddressingMode() != ISD::UNINDEXED) {
+ SelectCode(N);
+ return;
+ }
+ // NOTE(review): same remark as for the first operand regarding the
+ // chain result and memory operand of the replacement load.
+ SDValue Chain = LD->getChain();
+ SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
+ MVT::Other,
+ LD->getBasePtr(), TargetConst0,
+ Chain), 0);
+ } else {
+ SelectCode(N);
+ return;
+ }
+
+ // Generate a mpy instruction.
+ // It takes the two i32 operands and produces the i64 result.
+ SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_dpmpyss_s0, dl,
+ MVT::i64, OP0, OP1);
+ ReplaceNode(N, Result);
+ return;
+ }
+
+ // Not an i64 multiply: use the generated matcher.
+ SelectCode(N);
+}
+
+// Select an i32 SHL by a constant amount. Two shapes are folded into one
+// multiply by a small immediate (M2_mpysmi):
+//
+//   (shl (mul x, C), s)            ->  (mpysmi x, C << s)
+//   (shl (sub 0, (shl x, c)), s)   ->  (mpysmi x, -(1 << (s + c)))
+//
+// The fold is done only when the combined constant fits in a signed 9-bit
+// immediate. Everything else is left to the generated matcher.
+void HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
+  SDLoc dl(N);
+  SDValue Shl_0 = N->getOperand(0);
+  SDValue Shl_1 = N->getOperand(1);
+
+  auto Default = [this,N] () -> void { SelectCode(N); };
+
+  if (N->getValueType(0) != MVT::i32 || Shl_1.getOpcode() != ISD::Constant)
+    return Default();
+
+  // RHS is const. A shift amount outside [0, 31] has no defined result for
+  // an i32 shift, and using it in the host-side computations below would be
+  // undefined behavior in the compiler itself, so do not try to fold it.
+  int64_t ShlConst = cast<ConstantSDNode>(Shl_1)->getSExtValue();
+  if (ShlConst < 0 || ShlConst > 31)
+    return Default();
+
+  if (Shl_0.getOpcode() == ISD::MUL) {
+    SDValue Mul_0 = Shl_0.getOperand(0); // Val
+    SDValue Mul_1 = Shl_0.getOperand(1); // Const
+    // RHS of mul is const.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mul_1)) {
+      // Do the scaling on an unsigned value: left-shifting a negative signed
+      // value is undefined before C++20. Keeping only the low 32 bits
+      // matches the wrap-around semantics of the i32 multiply and shift.
+      uint64_t Wide = static_cast<uint64_t>(C->getSExtValue()) << ShlConst;
+      int32_t ValConst = static_cast<int32_t>(static_cast<uint32_t>(Wide));
+      if (isInt<9>(ValConst)) {
+        SDValue Val = CurDAG->getTargetConstant(ValConst, dl, MVT::i32);
+        SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl,
+                                                MVT::i32, Mul_0, Val);
+        ReplaceNode(N, Result);
+        return;
+      }
+    }
+    return Default();
+  }
+
+  if (Shl_0.getOpcode() == ISD::SUB) {
+    SDValue Sub_0 = Shl_0.getOperand(0); // Const 0
+    SDValue Sub_1 = Shl_0.getOperand(1); // Val
+    if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Sub_0)) {
+      if (C1->getSExtValue() != 0 || Sub_1.getOpcode() != ISD::SHL)
+        return Default();
+      SDValue Shl2_0 = Sub_1.getOperand(0); // Val
+      SDValue Shl2_1 = Sub_1.getOperand(1); // Const
+      if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(Shl2_1)) {
+        int64_t InnerShl = C2->getSExtValue();
+        if (InnerShl < 0 || InnerShl > 31)
+          return Default();
+        // -(1 << Total) fits in a signed 9-bit immediate exactly when
+        // Total <= 8 (the smallest such value is -256). Checking the sum
+        // first also keeps the shift below free of overflow.
+        int64_t Total = ShlConst + InnerShl;
+        if (Total <= 8) {
+          int32_t NegPow2 = -(int32_t(1) << Total);
+          SDValue Val = CurDAG->getTargetConstant(NegPow2, dl, MVT::i32);
+          SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl,
+                                                  MVT::i32, Shl2_0, Val);
+          ReplaceNode(N, Result);
+          return;
+        }
+      }
+    }
+  }
+
+  return Default();
+}
+
+
+//
+// If a zero_extend is applied to the result of an intrinsic that returns a
+// predicate, convert the zero_extend into a transfer instruction.
+//
+// Zero extend -> transfer is lowered here. Otherwise, zero_extend would be
+// converted into a MUX, because predicate registers are defined as 1 bit in
+// the compiler, while the architecture defines them as 8-bit registers.
+// We want to preserve all of the lower 8 bits, not just the least
+// significant bit.
+//
+// Zero extension of a vector of booleans (at most 64 bits wide) is also
+// handled here. Everything else is left to the generated matcher.
+//
+void HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
+ SDLoc dl(N);
+
+ SDValue Op0 = N->getOperand(0);
+ EVT OpVT = Op0.getValueType();
+ unsigned OpBW = OpVT.getSizeInBits();
+
+ // Special handling for zero-extending a vector of booleans.
+ if (OpVT.isVector() && OpVT.getVectorElementType() == MVT::i1 && OpBW <= 64) {
+ // Move the predicate into a 64-bit register with C2_mask.
+ SDNode *Mask = CurDAG->getMachineNode(Hexagon::C2_mask, dl, MVT::i64, Op0);
+ unsigned NE = OpVT.getVectorNumElements();
+ EVT ExVT = N->getValueType(0);
+ unsigned ES = ExVT.getScalarSizeInBits();
+ // Build a constant with the lowest bit of each of the NE result elements
+ // set (elements are ES bits wide). ANDing it with the mask leaves 0 or 1
+ // in every element.
+ uint64_t MV = 0, Bit = 1;
+ for (unsigned i = 0; i < NE; ++i) {
+ MV |= Bit;
+ Bit <<= ES;
+ }
+ SDValue Ones = CurDAG->getTargetConstant(MV, dl, MVT::i64);
+ SDNode *OnesReg = CurDAG->getMachineNode(Hexagon::CONST64, dl,
+ MVT::i64, Ones);
+ if (ExVT.getSizeInBits() == 32) {
+ // 32-bit result: do the AND on 64 bits and extract the low subregister.
+ SDNode *And = CurDAG->getMachineNode(Hexagon::A2_andp, dl, MVT::i64,
+ SDValue(Mask,0), SDValue(OnesReg,0));
+ SDValue SubR = CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32);
+ ReplaceNode(N, CurDAG->getMachineNode(Hexagon::EXTRACT_SUBREG, dl, ExVT,
+ SDValue(And, 0), SubR));
+ return;
+ }
+ // Otherwise the AND produces the result type directly.
+ ReplaceNode(N,
+ CurDAG->getMachineNode(Hexagon::A2_andp, dl, ExVT,
+ SDValue(Mask, 0), SDValue(OnesReg, 0)));
+ return;
+ }
+
+ // Scalar case: only a zero_extend of a predicate-returning intrinsic is
+ // handled specially.
+ SDNode *Int = N->getOperand(0).getNode();
+ if ((Int->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
+ // Operand 0 of an INTRINSIC_WO_CHAIN node is read as the intrinsic id.
+ unsigned ID = cast<ConstantSDNode>(Int->getOperand(0))->getZExtValue();
+ if (doesIntrinsicReturnPredicate(ID)) {
+ // Now we need to differentiate target data types.
+ if (N->getValueType(0) == MVT::i64) {
+ // Convert the zero_extend to Rs = Pd followed by A2_combinew(0,Rs).
+ SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
+ MVT::i32, SDValue(Int, 0));
+ SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl,
+ MVT::i32, TargetConst0);
+ // NOTE(review): the combine is created with an extra MVT::Other result
+ // although it takes no chain operand - confirm this is intended.
+ SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::A2_combinew, dl,
+ MVT::i64, MVT::Other,
+ SDValue(Result_2, 0),
+ SDValue(Result_1, 0));
+ ReplaceNode(N, Result_3);
+ return;
+ }
+ if (N->getValueType(0) == MVT::i32) {
+ // Convert the zero_extend to Rs = Pd
+ SDNode* RsPd = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
+ MVT::i32, SDValue(Int, 0));
+ ReplaceNode(N, RsPd);
+ return;
+ }
+ llvm_unreachable("Unexpected value type");
+ }
+ }
+ SelectCode(N);
+}
+
+
+//
+// Select intrinsics with a chain: circular and bit-reverse loads are
+// selected by hand, everything else goes to the generated matcher.
+//
+void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
+  MachineSDNode *MachLoad = LoadInstrForLoadIntrinsic(N);
+  if (!MachLoad) {
+    // Not one of the load intrinsics handled above.
+    SelectCode(N);
+    return;
+  }
+
+  // Emit the accompanying store for the load intrinsic, then drop the
+  // intrinsic node.
+  StoreInstrForLoadIntrinsic(MachLoad, N);
+  CurDAG->RemoveDeadNode(N);
+}
+
+// Select intrinsics without a chain. For S2_vsplatrb/S2_vsplatrh only the
+// low 8/16 bits of the operand matter, so an operand that merely extends
+// such a value is replaced by the value being extended before the node is
+// passed to the generated matcher.
+void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
+  const unsigned IntrID =
+      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+
+  // Number of low operand bits consumed by the intrinsic.
+  unsigned UsedBits;
+  if (IntrID == Intrinsic::hexagon_S2_vsplatrb) {
+    UsedBits = 8;
+  } else if (IntrID == Intrinsic::hexagon_S2_vsplatrh) {
+    UsedBits = 16;
+  } else {
+    SelectCode(N);
+    return;
+  }
+
+  SDValue Operand = N->getOperand(1);
+  SDValue Narrow;
+  if (!isValueExtension(Operand, UsedBits, Narrow)) {
+    SelectCode(N);
+    return;
+  }
+
+  // Rebuild the intrinsic on the un-extended value and select that instead.
+  SDValue Rebuilt = CurDAG->getNode(N->getOpcode(), SDLoc(N),
+                                    N->getValueType(0), N->getOperand(0),
+                                    Narrow);
+  ReplaceNode(N, Rebuilt.getNode());
+  SelectCode(Rebuilt.getNode());
+}
+
+//
+// Map floating point constant values: an f32 constant becomes A2_tfrsi and
+// an f64 constant becomes CONST64, each taking the constant's bit pattern
+// as an integer immediate. Other types go to the generated matcher.
+//
+void HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
+  SDLoc dl(N);
+  // N is always a ConstantFPSDNode here (Select dispatches on
+  // ISD::ConstantFP), so use cast<> rather than an unchecked dyn_cast<>,
+  // whose null result would be dereferenced on the next line.
+  ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+  APInt A = CN->getValueAPF().bitcastToAPInt();
+  if (N->getValueType(0) == MVT::f32) {
+    SDValue V = CurDAG->getTargetConstant(A.getZExtValue(), dl, MVT::i32);
+    ReplaceNode(N, CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::f32, V));
+    return;
+  }
+  if (N->getValueType(0) == MVT::f64) {
+    SDValue V = CurDAG->getTargetConstant(A.getZExtValue(), dl, MVT::i64);
+    ReplaceNode(N, CurDAG->getMachineNode(Hexagon::CONST64, dl, MVT::f64, V));
+    return;
+  }
+
+  SelectCode(N);
+}
+
+//
+// Map boolean values: an i1 constant becomes PS_true or PS_false. All other
+// constants go to the generated matcher.
+//
+void HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
+  if (N->getValueType(0) != MVT::i1) {
+    SelectCode(N);
+    return;
+  }
+
+  // An i1 constant must be 0 or 1.
+  assert(!(cast<ConstantSDNode>(N)->getZExtValue() >> 1));
+  unsigned PredOpc = Hexagon::PS_false;
+  if (cast<ConstantSDNode>(N)->getSExtValue() != 0)
+    PredOpc = Hexagon::PS_true;
+  ReplaceNode(N, CurDAG->getMachineNode(PredOpc, SDLoc(N), MVT::i1));
+}
+
+
+// Select a FrameIndex node into PS_fi, or into PS_fia when the address has
+// to be formed relative to the stack-alignment base register.
+void HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  const int Idx = cast<FrameIndexSDNode>(N)->getIndex();
+  const unsigned StackAlign = HST->getFrameLowering()->getStackAlignment();
+  const unsigned MaxAlign = MFI.getMaxAlignment();
+
+  SDValue FI = CurDAG->getTargetFrameIndex(Idx, MVT::i32);
+  SDLoc DL(N);
+  SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+  // PS_fi is used when at least one of these holds:
+  //   - the object is fixed (negative frame index),
+  //   - no object has higher-than-default alignment,
+  //   - there are no dynamically allocated objects.
+  // Only when none of them holds is PS_fia required.
+  const bool UsePlainFI =
+      Idx < 0 || MaxAlign <= StackAlign || !MFI.hasVarSizedObjects();
+
+  SDNode *Result = nullptr;
+  if (UsePlainFI) {
+    Result = CurDAG->getMachineNode(Hexagon::PS_fi, DL, MVT::i32, FI, Zero);
+  } else {
+    // PS_fia takes the virtual register recorded as the stack-alignment
+    // base as an extra first operand.
+    auto &FuncInfoHex = *MF->getInfo<HexagonMachineFunctionInfo>();
+    unsigned AlignBaseReg = FuncInfoHex.getStackAlignBaseVReg();
+    SDValue Entry = CurDAG->getEntryNode();
+    SDValue Ops[] = {
+      CurDAG->getCopyFromReg(Entry, DL, AlignBaseReg, MVT::i32), FI, Zero
+    };
+    Result = CurDAG->getMachineNode(Hexagon::PS_fia, DL, MVT::i32, Ops);
+  }
+
+  ReplaceNode(N, Result);
+}
+
+
+// Select a BITCAST node. A cast between two vector types of equal size,
+// neither of which has i1 elements, is removed by forwarding the operand to
+// all users. Every other bitcast goes to the generated matcher.
+void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
+  const EVT SrcVT = N->getOperand(0).getValueType();
+  const EVT DstVT = N->getValueType(0);
+
+  // The element-type queries are only evaluated once both types are known
+  // to be vectors.
+  const bool Removable =
+      SrcVT.isVector() && DstVT.isVector() &&
+      SrcVT.getVectorElementType() != MVT::i1 &&
+      DstVT.getVectorElementType() != MVT::i1 &&
+      SrcVT.getSizeInBits() == DstVT.getSizeInBits();
+
+  if (Removable) {
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0));
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
+
+  SelectCode(N);
+}
+
+
+// Main selection entry point: dispatch on the node's opcode to the
+// hand-written selection routines, falling back to the generated matcher
+// for every opcode without one.
+void HexagonDAGToDAGISel::Select(SDNode *N) {
+  // A machine node is already selected; only its node id is reset.
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return;
+  }
+
+  switch (N->getOpcode()) {
+  case ISD::Constant:           return SelectConstant(N);
+  case ISD::ConstantFP:         return SelectConstantFP(N);
+  case ISD::FrameIndex:         return SelectFrameIndex(N);
+  case ISD::BITCAST:            return SelectBitcast(N);
+  case ISD::SHL:                return SelectSHL(N);
+  case ISD::LOAD:               return SelectLoad(N);
+  case ISD::STORE:              return SelectStore(N);
+  case ISD::MUL:                return SelectMul(N);
+  case ISD::ZERO_EXTEND:        return SelectZeroExtend(N);
+  case ISD::INTRINSIC_W_CHAIN:  return SelectIntrinsicWChain(N);
+  case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N);
+  default:
+    break;
+  }
+
+  SelectCode(N);
+}
+
+// Select the address of an inline-asm memory operand. For the constraints
+// handled here, two operands are appended to OutOps: the address (as a
+// target frame index when SelectAddrFI accepts it, unchanged otherwise) and
+// a zero i32 target constant. Returns false in that case, and true for any
+// other constraint, for which OutOps is left untouched.
+bool HexagonDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  const bool Handled =
+      ConstraintID == InlineAsm::Constraint_i ||
+      ConstraintID == InlineAsm::Constraint_o ||    // Offsetable.
+      ConstraintID == InlineAsm::Constraint_v ||    // Not offsetable.
+      ConstraintID == InlineAsm::Constraint_m;      // Memory.
+  if (!Handled)
+    return true;
+
+  SDValue Addr = Op;
+  SDValue FrameAddr;
+  const bool IsFrameAddr = SelectAddrFI(Addr, FrameAddr);
+  OutOps.push_back(IsFrameAddr ? FrameAddr : Addr);
+  OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+  return false;
+}
+
+
+// Target-specific DAG rewrites performed by this selector's preprocessing
+// hook:
+//   1. Push an OR into a SELECT that has a zero arm.
+//   2. Re-associate address arithmetic feeding stores so that it can match
+//      the shifted addressing mode mem(x+y<<#c).
+//   3. Optionally (EnableAddressRebalancing) rebalance address trees.
+void HexagonDAGToDAGISel::PreprocessISelDAG() {
+ SelectionDAG &DAG = *CurDAG;
+ // Take a snapshot of the nodes first: the rewrites below create new nodes,
+ // and only the nodes present at this point are examined.
+ std::vector<SDNode*> Nodes;
+ for (SDNode &Node : DAG.allnodes())
+ Nodes.push_back(&Node);
+
+ // Simplify: (or (select c x 0) z) -> (select c (or x z) z)
+ // (or (select c 0 y) z) -> (select c z (or y z))
+ // This may not be the right thing for all targets, so do it here.
+ for (auto I : Nodes) {
+ if (I->getOpcode() != ISD::OR)
+ continue;
+
+ // True if V is a constant with value zero.
+ auto IsZero = [] (const SDValue &V) -> bool {
+ if (ConstantSDNode *SC = dyn_cast<ConstantSDNode>(V.getNode()))
+ return SC->isNullValue();
+ return false;
+ };
+ // True if Op is a SELECT with at least one zero arm.
+ auto IsSelect0 = [IsZero] (const SDValue &Op) -> bool {
+ if (Op.getOpcode() != ISD::SELECT)
+ return false;
+ return IsZero(Op.getOperand(1)) || IsZero(Op.getOperand(2));
+ };
+
+ SDValue N0 = I->getOperand(0), N1 = I->getOperand(1);
+ EVT VT = I->getValueType(0);
+ // SOp is the candidate select operand, VOp the other operand of the OR.
+ // If operand 0 does not qualify, operand 1 is taken as the candidate and
+ // is validated by the checks below.
+ bool SelN0 = IsSelect0(N0);
+ SDValue SOp = SelN0 ? N0 : N1;
+ SDValue VOp = SelN0 ? N1 : N0;
+
+ // Only rewrite when the select has no other users.
+ if (SOp.getOpcode() == ISD::SELECT && SOp.getNode()->hasOneUse()) {
+ SDValue SC = SOp.getOperand(0);
+ SDValue SX = SOp.getOperand(1);
+ SDValue SY = SOp.getOperand(2);
+ SDLoc DLS = SOp;
+ if (IsZero(SY)) {
+ // (or (select c x 0) z) -> (select c (or x z) z)
+ SDValue NewOr = DAG.getNode(ISD::OR, DLS, VT, SX, VOp);
+ SDValue NewSel = DAG.getNode(ISD::SELECT, DLS, VT, SC, NewOr, VOp);
+ DAG.ReplaceAllUsesWith(I, NewSel.getNode());
+ } else if (IsZero(SX)) {
+ // (or (select c 0 y) z) -> (select c z (or y z))
+ SDValue NewOr = DAG.getNode(ISD::OR, DLS, VT, SY, VOp);
+ SDValue NewSel = DAG.getNode(ISD::SELECT, DLS, VT, SC, VOp, NewOr);
+ DAG.ReplaceAllUsesWith(I, NewSel.getNode());
+ }
+ }
+ }
+
+ // Transform: (store ch addr (add x (add (shl y c) e)))
+ // to: (store ch addr (add x (shl (add y d) c))),
+ // where e = (shl d c) for some integer d.
+ // The purpose of this is to enable generation of loads/stores with
+ // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
+ // value c must be 0, 1 or 2.
+ for (auto I : Nodes) {
+ if (I->getOpcode() != ISD::STORE)
+ continue;
+
+ // I matched: (store ch addr Off)
+ // NOTE(review): the value examined is operand 2 of the store node; the
+ // pattern comments in this loop refer to it as Off.
+ SDValue Off = I->getOperand(2);
+ // Off needs to match: (add x (add (shl y c) (shl d c))))
+ if (Off.getOpcode() != ISD::ADD)
+ continue;
+ // Off matched: (add x T0)
+ SDValue T0 = Off.getOperand(1);
+ // T0 needs to match: (add T1 T2):
+ if (T0.getOpcode() != ISD::ADD)
+ continue;
+ // T0 matched: (add T1 T2)
+ SDValue T1 = T0.getOperand(0);
+ SDValue T2 = T0.getOperand(1);
+ // T1 needs to match: (shl y c)
+ if (T1.getOpcode() != ISD::SHL)
+ continue;
+ SDValue C = T1.getOperand(1);
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(C.getNode());
+ if (CN == nullptr)
+ continue;
+ unsigned CV = CN->getZExtValue();
+ if (CV > 2)
+ continue;
+ // T2 needs to match e, where e = (shl d c) for some d.
+ ConstantSDNode *EN = dyn_cast<ConstantSDNode>(T2.getNode());
+ if (EN == nullptr)
+ continue;
+ // e must be a multiple of 2^c; d is the quotient.
+ unsigned EV = EN->getZExtValue();
+ if (EV % (1 << CV) != 0)
+ continue;
+ unsigned DV = EV / (1 << CV);
+
+ // Replace T0 with: (shl (add y d) c)
+ SDLoc DL = SDLoc(I);
+ EVT VT = T0.getValueType();
+ SDValue D = DAG.getConstant(DV, DL, VT);
+ // NewAdd = (add y d)
+ SDValue NewAdd = DAG.getNode(ISD::ADD, DL, VT, T1.getOperand(0), D);
+ // NewShl = (shl NewAdd c)
+ SDValue NewShl = DAG.getNode(ISD::SHL, DL, VT, NewAdd, C);
+ ReplaceNode(T0.getNode(), NewShl.getNode());
+ }
+
+ // Optional rebalancing of address calculation trees, followed by a debug
+ // dump of the resulting DAG.
+ if (EnableAddressRebalancing) {
+ rebalanceAddressTrees();
+
+ DEBUG(
+ dbgs() << "************* SelectionDAG after preprocessing: ***********\n";
+ CurDAG->dump();
+ dbgs() << "************* End SelectionDAG after preprocessing ********\n";
+ );
+ }
+}
+
+// When frame lowering reports that the function needs PS_aligna, emit it
+// into the entry block with the maximum frame alignment as its immediate,
+// and record the virtual register it defines as the stack-alignment base.
+void HexagonDAGToDAGISel::EmitFunctionEntryCode() {
+  const auto &Subtarget =
+      static_cast<const HexagonSubtarget&>(MF->getSubtarget());
+  const auto &FrameLowering = *Subtarget.getFrameLowering();
+
+  if (FrameLowering.needsAligna(*MF)) {
+    MachineFrameInfo &MFI = MF->getFrameInfo();
+    MachineBasicBlock *Entry = &MF->front();
+    unsigned AlignBaseReg = FuncInfo->CreateReg(MVT::i32);
+    unsigned MaxAlign = MFI.getMaxAlignment();
+    BuildMI(Entry, DebugLoc(), HII->get(Hexagon::PS_aligna), AlignBaseReg)
+        .addImm(MaxAlign);
+    MF->getInfo<HexagonMachineFunctionInfo>()
+        ->setStackAlignBaseVReg(AlignBaseReg);
+  }
+}
+
+// Match a frame index that can be used directly in an addressing mode. On
+// success R is set to the corresponding target frame index. A non-fixed
+// object is rejected when the function needs PS_aligna.
+bool HexagonDAGToDAGISel::SelectAddrFI(SDValue& N, SDValue &R) {
+  if (N.getOpcode() != ISD::FrameIndex)
+    return false;
+
+  const int Idx = cast<FrameIndexSDNode>(N)->getIndex();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  auto &FrameLowering = *HST->getFrameLowering();
+
+  const bool IsFixed = MFI.isFixedObjectIndex(Idx);
+  if (!IsFixed && FrameLowering.needsAligna(*MF))
+    return false;
+
+  R = CurDAG->getTargetFrameIndex(Idx, MVT::i32);
+  return true;
+}
+
+// Match a global address in its non-GP form (see SelectGlobalAddress).
+inline bool HexagonDAGToDAGISel::SelectAddrGA(SDValue &N, SDValue &R) {
+  const bool UseGP = false;
+  return SelectGlobalAddress(N, R, UseGP);
+}
+
+// Match a global address in its GP form (see SelectGlobalAddress).
+inline bool HexagonDAGToDAGISel::SelectAddrGP(SDValue &N, SDValue &R) {
+  const bool UseGP = true;
+  return SelectGlobalAddress(N, R, UseGP);
+}
+
+// Match a global address, optionally with a constant offset added to it.
+// UseGP selects which wrapper node is accepted: CONST32_GP when true,
+// CONST32 when false. On success R is set to the target global address
+// (with the constant folded into its offset for the ADD form) and true is
+// returned. R is left untouched on failure.
+bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R,
+                                              bool UseGP) {
+  const unsigned WantedOpc = UseGP ? HexagonISD::CONST32_GP
+                                   : HexagonISD::CONST32;
+  const unsigned Opc = N.getOpcode();
+
+  // A bare wrapper node: its operand 0 is the TargetGlobalAddress, which is
+  // exactly what the instruction needs.
+  if (Opc == HexagonISD::CONST32 || Opc == HexagonISD::CONST32_GP) {
+    if (Opc != WantedOpc)
+      return false;
+    R = N.getOperand(0);
+    return true;
+  }
+
+  if (Opc != ISD::ADD)
+    return false;
+
+  // (add (wrapper TargetGlobalAddress), constant)
+  SDValue Wrapper = N.getOperand(0);
+  SDValue OffsetOp = N.getOperand(1);
+  if (Wrapper.getOpcode() != WantedOpc)
+    return false;
+
+  ConstantSDNode *OffsetC = dyn_cast<ConstantSDNode>(OffsetOp);
+  if (!OffsetC)
+    return false;
+
+  SDValue Wrapped = Wrapper.getOperand(0);
+  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Wrapped);
+  if (!GA || GA->getOpcode() != ISD::TargetGlobalAddress)
+    return false;
+
+  // Fold the added constant into the offset of the global address.
+  uint64_t NewOff = GA->getOffset() + (uint64_t)OffsetC->getSExtValue();
+  R = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(OffsetC),
+                                     N.getValueType(), NewOff);
+  return true;
+}
+
+// Check whether Val is computed from some value Src in a way that leaves
+// the low FromBits bits of Src unchanged, so that a user which only reads
+// those bits may use Src directly. Recognized forms:
+//   - sign/zero/any extension of an integer that is FromBits wide,
+//   - SIGN_EXTEND_INREG / AssertSext / AssertZext whose type operand is
+//     FromBits wide,
+//   - AND with a constant equal to the low-FromBits mask,
+//   - OR / XOR with a constant whose low FromBits bits are all zero.
+// On success Src is set and true is returned; Src is untouched otherwise.
+bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val,
+      unsigned FromBits, SDValue &Src) {
+  // Mask of the low FromBits bits. It is computed in 64 bits: a plain
+  // "1 << FromBits" is an int shift, which is undefined for FromBits >= 31
+  // (and a 64-bit shift by 64 or more would be undefined as well).
+  const uint64_t FromMask = FromBits >= 64 ? ~uint64_t(0)
+                                           : (uint64_t(1) << FromBits) - 1;
+
+  unsigned Opc = Val.getOpcode();
+  switch (Opc) {
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: {
+    SDValue const &Op0 = Val.getOperand(0);
+    EVT T = Op0.getValueType();
+    if (T.isInteger() && T.getSizeInBits() == FromBits) {
+      Src = Op0;
+      return true;
+    }
+    break;
+  }
+  case ISD::SIGN_EXTEND_INREG:
+  case ISD::AssertSext:
+  case ISD::AssertZext:
+    if (Val.getOperand(0).getValueType().isInteger()) {
+      VTSDNode *T = cast<VTSDNode>(Val.getOperand(1));
+      if (T->getVT().getSizeInBits() == FromBits) {
+        Src = Val.getOperand(0);
+        return true;
+      }
+    }
+    break;
+  case ISD::AND: {
+    // Check if this is an AND with "FromBits" of lower bits set to 1.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
+      if (C->getZExtValue() == FromMask) {
+        Src = Val.getOperand(1);
+        return true;
+      }
+    }
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
+      if (C->getZExtValue() == FromMask) {
+        Src = Val.getOperand(0);
+        return true;
+      }
+    }
+    break;
+  }
+  case ISD::OR:
+  case ISD::XOR: {
+    // OR/XOR with the lower "FromBits" bits set to 0.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
+      if ((C->getZExtValue() & FromMask) == 0) {
+        Src = Val.getOperand(1);
+        return true;
+      }
+    }
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
+      if ((C->getZExtValue() & FromMask) == 0) {
+        Src = Val.getOperand(0);
+        return true;
+      }
+    }
+    break;
+  }
+  default:
+    break;
+  }
+  return false;
+}
+
+
+// Return true when the given OR (whose second operand must be a constant)
+// computes the same value as an ADD of its operands. Only the case of a
+// constant OR-ed into the address of a stack object is recognized.
+bool HexagonDAGToDAGISel::isOrEquivalentToAdd(const SDNode *N) const {
+  assert(N->getOpcode() == ISD::OR);
+  auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  assert(C);
+
+  // Detect when "or" is used to add an offset to a stack object.
+  auto *Frame = dyn_cast<FrameIndexSDNode>(N->getOperand(0));
+  if (!Frame)
+    return false;
+
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  unsigned A = MFI.getObjectAlignment(Frame->getIndex());
+  assert(isPowerOf2_32(A));
+
+  const int32_t Offset = C->getSExtValue();
+  if (Offset < 0)
+    return false;
+
+  // If the alleged offset fits entirely in the low bits that the object's
+  // alignment guarantees to be zero, then this or is really an add.
+  const unsigned LowZeroBits = A - 1;
+  return (LowZeroBits & Offset) == unsigned(Offset);
+}
+
+// Return true when the memory access is aligned to at least the store size
+// of its memory type.
+bool HexagonDAGToDAGISel::isAlignedMemNode(const MemSDNode *N) const {
+  const auto StoreBytes = N->getMemoryVT().getStoreSize();
+  const auto Alignment = N->getAlignment();
+  return Alignment >= StoreBytes;
+}
+
+// Return true when the given node is accepted as a half-word operand:
+//   - a constant that is strictly positive and fits in a signed 16-bit
+//     value, or
+//   - a SIGN_EXTEND_INREG from a type that is at most 16 bits wide.
+bool HexagonDAGToDAGISel::isPositiveHalfWord(const SDNode *N) const {
+  if (const ConstantSDNode *CN = dyn_cast<const ConstantSDNode>(N)) {
+    int64_t V = CN->getSExtValue();
+    return V > 0 && isInt<16>(V);
+  }
+  if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    // Operand 1 of SIGN_EXTEND_INREG is the VTSDNode naming the type being
+    // extended from. Use cast<> rather than dyn_cast<>: the result is
+    // dereferenced unconditionally, and cast<> asserts on a type mismatch.
+    const VTSDNode *VN = cast<const VTSDNode>(N->getOperand(1));
+    return VN->getVT().getSizeInBits() <= 16;
+  }
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Rebalancing of address calculation trees
+
+// Return true for the node kinds that the address rebalancing code treats
+// as interior tree nodes: ADD, MUL, and SHL by a constant.
+static bool isOpcodeHandled(const SDNode *N) {
+  const unsigned Opc = N->getOpcode();
+  if (Opc == ISD::ADD || Opc == ISD::MUL)
+    return true;
+  if (Opc != ISD::SHL)
+    return false;
+  // Only constant shifts are handled, because they can easily be flattened
+  // into multiplications by 2^Op1.
+  return isa<ConstantSDNode>(N->getOperand(1).getNode());
+}
+
+/// \brief Return the weight of an SDNode: 1 for any node that is not a
+/// handled tree operation, and the weight recorded in RootWeights otherwise.
+int HexagonDAGToDAGISel::getWeight(SDNode *N) {
+  if (isOpcodeHandled(N)) {
+    // -1 and -2 are the markers the messages below call unvisited and
+    // RAUW'd; neither has a weight that can be queried.
+    assert(RootWeights.count(N) && "Cannot get weight of unseen root!");
+    assert(RootWeights[N] != -1 && "Cannot get weight of unvisited root!");
+    assert(RootWeights[N] != -2 && "Cannot get weight of RAWU'd root!");
+    return RootWeights[N];
+  }
+  return 1;
+}
+
+// Return the height of an SDNode: 0 for any node that is not a handled tree
+// operation, and the height recorded in RootHeights otherwise.
+int HexagonDAGToDAGISel::getHeight(SDNode *N) {
+  if (isOpcodeHandled(N)) {
+    assert(RootWeights.count(N) && RootWeights[N] >= 0 &&
+           "Cannot query height of unvisited/RAUW'd node!");
+    return RootHeights[N];
+  }
+  return 0;
+}
+
+namespace {
+/// A leaf of an address tree together with its weight and the order in which
+/// it was inserted (used to break ties between leaves of equal weight).
+struct WeightedLeaf {
+  SDValue Value;
+  int Weight;
+  int InsertionOrder;
+
+  /// Create an empty leaf (null Value). Weight and InsertionOrder are
+  /// zero-initialized so that copying or returning an empty leaf never reads
+  /// indeterminate values.
+  WeightedLeaf() : Value(SDValue()), Weight(0), InsertionOrder(0) { }
+
+  WeightedLeaf(SDValue Value, int Weight, int InsertionOrder) :
+    Value(Value), Weight(Weight), InsertionOrder(InsertionOrder) {
+    assert(Weight >= 0 && "Weight must be >= 0");
+  }
+
+  /// Heap ordering: with this comparison the standard heap functions keep
+  /// the leaf of lowest weight at the front; among leaves of equal weight
+  /// the one inserted first comes first.
+  static bool Compare(const WeightedLeaf &A, const WeightedLeaf &B) {
+    assert(A.Value.getNode() && B.Value.getNode());
+    return A.Weight == B.Weight ?
+            (A.InsertionOrder > B.InsertionOrder) :
+            (A.Weight > B.Weight);
+  }
+};
+
+/// A specialized priority queue for WeightedLeaves. At most one constant
+/// leaf is kept apart from the heap and is always returned first; constants
+/// that are the identity of the queue's operation (1 for MUL, 0 for ADD) are
+/// dropped on insertion. Non-top elements can be removed while the priority
+/// order is maintained.
+class LeafPrioQueue {
+  SmallVector<WeightedLeaf, 8> Q;
+  bool HaveConst;
+  WeightedLeaf ConstElt;
+  unsigned Opcode;
+
+public:
+  bool empty() {
+    return (!HaveConst && Q.empty());
+  }
+
+  /// Number of queued leaves, counting the separately held constant.
+  size_t size() {
+    return Q.size() + HaveConst;
+  }
+
+  bool hasConst() {
+    return HaveConst;
+  }
+
+  /// The next leaf to be popped: the separate constant if there is one,
+  /// otherwise the front of the heap.
+  const WeightedLeaf &top() {
+    if (HaveConst)
+      return ConstElt;
+    return Q.front();
+  }
+
+  WeightedLeaf pop() {
+    if (HaveConst) {
+      HaveConst = false;
+      return ConstElt;
+    }
+    std::pop_heap(Q.begin(), Q.end(), WeightedLeaf::Compare);
+    return Q.pop_back_val();
+  }
+
+  /// Insert L. If SeparateConst is set, L is a constant, and no constant is
+  /// held yet, L becomes the separately held constant (or is dropped when it
+  /// is the identity element of Opcode). Otherwise L goes into the heap.
+  void push(WeightedLeaf L, bool SeparateConst=true) {
+    if (!HaveConst && SeparateConst && isa<ConstantSDNode>(L.Value)) {
+      if (Opcode == ISD::MUL &&
+          cast<ConstantSDNode>(L.Value)->getSExtValue() == 1)
+        return;
+      if (Opcode == ISD::ADD &&
+          cast<ConstantSDNode>(L.Value)->getSExtValue() == 0)
+        return;
+
+      HaveConst = true;
+      ConstElt = L;
+    } else {
+      Q.push_back(L);
+      std::push_heap(Q.begin(), Q.end(), WeightedLeaf::Compare);
+    }
+  }
+
+  /// Push L to the bottom of the queue regardless of its weight (its weight
+  /// is overwritten with a large fixed value). If L is constant, it will not
+  /// be folded with other constants in the queue.
+  void pushToBottom(WeightedLeaf L) {
+    L.Weight = 1000;
+    push(L, false);
+  }
+
+  /// Search for a SHL(x, [<=MaxAmount]) subtree in the queue, return the one of
+  /// lowest weight and remove it from the queue.
+  WeightedLeaf findSHL(uint64_t MaxAmount);
+
+  /// Search for a MUL by a constant in the queue, return the one of lowest
+  /// weight and remove it from the queue.
+  WeightedLeaf findMULbyConst();
+
+  LeafPrioQueue(unsigned Opcode) :
+    HaveConst(false), Opcode(Opcode) { }
+};
+} // end anonymous namespace
+
+// Find, among the leaves in the heap, the SHL by a constant of at most
+// MaxAmount that has the lowest weight (earliest insertion on ties). The
+// leaf is removed from the queue and returned. If there is no such leaf,
+// the queue is unchanged and an empty leaf (null Value) is returned.
+WeightedLeaf LeafPrioQueue::findSHL(uint64_t MaxAmount) {
+  WeightedLeaf Best;
+  int BestIdx = -1;
+
+  const int NumLeaves = Q.size();
+  for (int Idx = 0; Idx != NumLeaves; ++Idx) {
+    const WeightedLeaf &Cand = Q[Idx];
+    const SDValue &V = Cand.Value;
+
+    const bool IsSmallShl = V.getOpcode() == ISD::SHL &&
+                            isa<ConstantSDNode>(V.getOperand(1)) &&
+                            V.getConstantOperandVal(1) <= MaxAmount;
+    if (!IsSmallShl)
+      continue;
+
+    // The first candidate always wins; later ones must be strictly better.
+    const bool Better =
+        BestIdx < 0 || Cand.Weight < Best.Weight ||
+        (Cand.Weight == Best.Weight &&
+         Cand.InsertionOrder < Best.InsertionOrder);
+    if (Better) {
+      Best = Cand;
+      BestIdx = Idx;
+    }
+  }
+
+  if (BestIdx >= 0) {
+    // Removing an arbitrary element breaks the heap property; rebuild it.
+    Q.erase(Q.begin() + BestIdx);
+    std::make_heap(Q.begin(), Q.end(), WeightedLeaf::Compare);
+  }
+
+  return Best;
+}
+
+// Find, among the leaves in the heap, the MUL whose second operand is a
+// constant of at most 127 that has the lowest weight (earliest insertion on
+// ties). The leaf is removed from the queue and returned. If there is no
+// such leaf, the queue is unchanged and an empty leaf (null Value) is
+// returned.
+WeightedLeaf LeafPrioQueue::findMULbyConst() {
+  WeightedLeaf Best;
+  int BestIdx = -1;
+
+  const int NumLeaves = Q.size();
+  for (int Idx = 0; Idx != NumLeaves; ++Idx) {
+    const WeightedLeaf &Cand = Q[Idx];
+    const SDValue &V = Cand.Value;
+
+    const bool IsSmallMul = V.getOpcode() == ISD::MUL &&
+                            isa<ConstantSDNode>(V.getOperand(1)) &&
+                            V.getConstantOperandVal(1) <= 127;
+    if (!IsSmallMul)
+      continue;
+
+    // The first candidate always wins; later ones must be strictly better.
+    const bool Better =
+        BestIdx < 0 || Cand.Weight < Best.Weight ||
+        (Cand.Weight == Best.Weight &&
+         Cand.InsertionOrder < Best.InsertionOrder);
+    if (Better) {
+      Best = Cand;
+      BestIdx = Idx;
+    }
+  }
+
+  if (BestIdx >= 0) {
+    // Removing an arbitrary element breaks the heap property; rebuild it.
+    Q.erase(Q.begin() + BestIdx);
+    std::make_heap(Q.begin(), Q.end(), WeightedLeaf::Compare);
+  }
+
+  return Best;
+}
+
+// For a SHL by a constant, return the constant 2^amount, i.e. the factor of
+// the equivalent multiplication. The constant is created with the value
+// type of the shift-amount operand.
+SDValue HexagonDAGToDAGISel::getMultiplierForSHL(SDNode *N) {
+  const uint64_t ShiftAmt = N->getConstantOperandVal(1);
+  const uint64_t Factor = 1ull << ShiftAmt;
+  const EVT AmtVT = N->getOperand(1).getValueType();
+  return CurDAG->getConstant(Factor, SDLoc(N), AmtVT);
+}
+
+/// @returns a value x for which 2^x is a factor of Val:
+///   - for a MUL, the number of trailing zero bits of its non-zero constant
+///     operand (if both operands are non-zero constants, the second one
+///     determines the result);
+///   - for a SHL by a constant, the shift amount;
+///   - 0 in every other case.
+static unsigned getPowerOf2Factor(SDValue Val) {
+  switch (Val.getOpcode()) {
+  case ISD::MUL: {
+    unsigned Factor = 0;
+    for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
+      if (auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(OpIdx))) {
+        const APInt &Bits = C->getAPIntValue();
+        // A zero constant has no meaningful trailing-zero count; skip it.
+        if (Bits.getBoolValue())
+          Factor = Bits.countTrailingZeros();
+      }
+    }
+    return Factor;
+  }
+  case ISD::SHL:
+    if (isa<ConstantSDNode>(Val.getOperand(1).getNode()))
+      return (unsigned) Val.getConstantOperandVal(1);
+    return 0;
+  default:
+    return 0;
+  }
+}
+
+/// @returns true if V>>Amount will eliminate V's operation on its child,
+/// i.e. if V is a multiplication by exactly 2^Amount or a shift left by
+/// exactly Amount. For a MUL, only the first constant operand that is a
+/// multiple of 2^Amount is considered.
+static bool willShiftRightEliminate(SDValue V, unsigned Amount) {
+  const unsigned Opc = V.getOpcode();
+
+  if (Opc == ISD::SHL)
+    return (Amount == V.getConstantOperandVal(1));
+
+  if (Opc != ISD::MUL)
+    return false;
+
+  SDValue Operands[] = { V.getOperand(0), V.getOperand(1) };
+  for (int Idx = 0; Idx < 2; ++Idx) {
+    if (!isa<ConstantSDNode>(Operands[Idx].getNode()))
+      continue;
+    const uint64_t Factor = V.getConstantOperandVal(Idx);
+    if (Factor % (1ULL << Amount) != 0)
+      continue;
+    // The multiplication disappears only if nothing is left of the factor.
+    return (Factor >> Amount) == 1;
+  }
+
+  return false;
+}
+
+// Return a value equivalent to V divided by 2^Power, for V being a MUL by a
+// constant or a SHL by a constant:
+//   - MUL: the first constant operand that is a multiple of 2^Power is
+//     divided by 2^Power; if that leaves 1, the other operand is returned.
+//   - SHL: the shift amount is reduced by Power; if that leaves 0, the
+//     shifted value is returned.
+// In all remaining cases a node with V's opcode is built from the (possibly
+// updated) operands. V must have at least two operands.
+SDValue HexagonDAGToDAGISel::factorOutPowerOf2(SDValue V, unsigned Power) {
+  SDValue NewOps[] = { V.getOperand(0), V.getOperand(1) };
+  const unsigned Opc = V.getOpcode();
+
+  if (Opc == ISD::MUL) {
+    for (int Idx = 0; Idx < 2; ++Idx) {
+      if (!isa<ConstantSDNode>(NewOps[Idx].getNode()))
+        continue;
+      const uint64_t Factor = V.getConstantOperandVal(Idx);
+      if (Factor % ((uint64_t)1 << Power) != 0)
+        continue;
+
+      const uint64_t Reduced = Factor >> Power;
+      if (Reduced == 1)
+        return NewOps[1 - Idx];
+      NewOps[Idx] = CurDAG->getConstant(Reduced,
+                                        SDLoc(V), V.getValueType());
+      break;
+    }
+  } else if (Opc == ISD::SHL) {
+    const uint64_t ShAmt = V.getConstantOperandVal(1);
+    if (ShAmt == Power)
+      return NewOps[0];
+    NewOps[1] = CurDAG->getConstant(ShAmt - Power,
+                                    SDLoc(V), V.getValueType());
+  }
+
+  return CurDAG->getNode(Opc, SDLoc(V), V.getValueType(), NewOps);
+}
+
+ /// Tell whether \p V has opcode HexagonISD::CONST32 or HexagonISD::CONST32_GP.
+ static bool isTargetConstant(const SDValue &V) {
+   switch (V.getOpcode()) {
+   case HexagonISD::CONST32:
+   case HexagonISD::CONST32_GP:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// Count the users of \p V that are instructions inside the function that is
+ /// currently being selected. Results are memoized in GAUsesInFunction.
+ unsigned HexagonDAGToDAGISel::getUsesInFunction(const Value *V) {
+   auto Cached = GAUsesInFunction.find(V);
+   if (Cached != GAUsesInFunction.end())
+     return Cached->second;
+
+   const Function *ThisFn = CurDAG->getMachineFunction().getFunction();
+   unsigned NumUses = 0;
+   for (const User *U : V->users()) {
+     // Non-instruction users are ignored.
+     if (const Instruction *Inst = dyn_cast<Instruction>(U))
+       if (Inst->getParent()->getParent() == ThisFn)
+         ++NumUses;
+   }
+
+   GAUsesInFunction[V] = NumUses;
+   return NumUses;
+ }
+
+ /// Rebalance the tree of same-opcode nodes rooted at N and return its (possibly new) root.
+ /// Note - After calling this, N may be dead. It may have been replaced by a new node, so always use the returned value in place of N.
+ /// RootWeights[N] encodes the state of a root: -1 = not balanced yet, -2 = was RAUW'd, anything else = weight already computed. TopLevel requires N to be an ISD::ADD (asserted).
+ /// @returns The SDValue taking the place of N (which could be N if it is
+ /// unchanged)
+ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
+ assert(RootWeights.count(N) && "Cannot balance non-root node.");
+ assert(RootWeights[N] != -2 && "This node was RAUW'd!");
+ assert(!TopLevel || N->getOpcode() == ISD::ADD);
+
+ // Return early if this node was already visited (-1 marks a root that has not been balanced yet)
+ if (RootWeights[N] != -1)
+ return SDValue(N, 0);
+
+ assert(isOpcodeHandled(N));
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // Return early if the operands will remain unchanged or are all roots; only the weight and height of N are recorded in that case
+ if ((!isOpcodeHandled(Op0.getNode()) || RootWeights.count(Op0.getNode())) &&
+ (!isOpcodeHandled(Op1.getNode()) || RootWeights.count(Op1.getNode()))) {
+ SDNode *Op0N = Op0.getNode();
+ int Weight;
+ if (isOpcodeHandled(Op0N) && RootWeights[Op0N] == -1) {
+ Weight = getWeight(balanceSubTree(Op0N).getNode());
+ // Weight = calculateWeight(Op0N);
+ } else
+ Weight = getWeight(Op0N);
+
+ SDNode *Op1N = N->getOperand(1).getNode(); // Op1 may have been RAUWd
+ if (isOpcodeHandled(Op1N) && RootWeights[Op1N] == -1) {
+ Weight += getWeight(balanceSubTree(Op1N).getNode());
+ // Weight += calculateWeight(Op1N);
+ } else
+ Weight += getWeight(Op1N);
+
+ RootWeights[N] = Weight;
+ RootHeights[N] = std::max(getHeight(N->getOperand(0).getNode()),
+ getHeight(N->getOperand(1).getNode())) + 1;
+
+ DEBUG(dbgs() << "--> No need to balance root (Weight=" << Weight
+ << " Height=" << RootHeights[N] << "): ");
+ DEBUG(N->dump());
+
+ return SDValue(N, 0);
+ }
+
+ DEBUG(dbgs() << "** Balancing root node: ");
+ DEBUG(N->dump());
+
+ unsigned NOpcode = N->getOpcode();
+
+ LeafPrioQueue Leaves(NOpcode);
+ SmallVector<SDValue, 4> Worklist;
+ Worklist.push_back(SDValue(N, 0));
+
+ // SHL nodes will be converted to MUL nodes (a SHL is restored when the tree is rebuilt further down)
+ if (NOpcode == ISD::SHL)
+ NOpcode = ISD::MUL;
+
+ bool CanFactorize = false;
+ WeightedLeaf Mul1, Mul2; // candidate MUL/SHL children for factoring out a common shift
+ unsigned MaxPowerOf2 = 0;
+ WeightedLeaf GA; // first CONST32/CONST32_GP leaf encountered, kept out of the queue
+
+ // Do not try to factor out a shift if there is already a shift at the tip of
+ // the tree.
+ bool HaveTopLevelShift = false;
+ if (TopLevel &&
+ ((isOpcodeHandled(Op0.getNode()) && Op0.getOpcode() == ISD::SHL &&
+ Op0.getConstantOperandVal(1) < 4) ||
+ (isOpcodeHandled(Op1.getNode()) && Op1.getOpcode() == ISD::SHL &&
+ Op1.getConstantOperandVal(1) < 4)))
+ HaveTopLevelShift = true;
+
+ // Flatten the subtree into an ordered list of leaves; at the same time
+ // determine whether the tree is already balanced.
+ int InsertionOrder = 0;
+ SmallDenseMap<SDValue, int> NodeHeights;
+ bool Imbalanced = false;
+ int CurrentWeight = 0;
+ while (!Worklist.empty()) {
+ SDValue Child = Worklist.pop_back_val();
+
+ if (Child.getNode() != N && RootWeights.count(Child.getNode())) {
+ // CASE 1: Child is a root node
+
+ int Weight = RootWeights[Child.getNode()];
+ if (Weight == -1) {
+ Child = balanceSubTree(Child.getNode());
+ // calculateWeight(Child.getNode());
+ Weight = getWeight(Child.getNode());
+ } else if (Weight == -2) {
+ // Whoops, this node was RAUWd by one of the balanceSubTree calls we
+ // made. Our worklist isn't up to date anymore.
+ // Restart the whole process.
+ DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
+ return balanceSubTree(N, TopLevel);
+ }
+
+ NodeHeights[Child] = 1;
+ CurrentWeight += Weight;
+
+ unsigned PowerOf2;
+ if (TopLevel && !CanFactorize && !HaveTopLevelShift &&
+ (Child.getOpcode() == ISD::MUL || Child.getOpcode() == ISD::SHL) &&
+ Child.hasOneUse() && (PowerOf2 = getPowerOf2Factor(Child))) {
+ // Try to identify two factorizable MUL/SHL children greedily. Leave
+ // them out of the priority queue for now so we can deal with them
+ // after.
+ if (!Mul1.Value.getNode()) {
+ Mul1 = WeightedLeaf(Child, Weight, InsertionOrder++);
+ MaxPowerOf2 = PowerOf2;
+ } else {
+ Mul2 = WeightedLeaf(Child, Weight, InsertionOrder++);
+ MaxPowerOf2 = std::min(MaxPowerOf2, PowerOf2);
+
+ // Our addressing modes can only shift by a maximum of 3
+ if (MaxPowerOf2 > 3)
+ MaxPowerOf2 = 3;
+
+ CanFactorize = true;
+ }
+ } else
+ Leaves.push(WeightedLeaf(Child, Weight, InsertionOrder++));
+ } else if (!isOpcodeHandled(Child.getNode())) {
+ // CASE 2: Child is an unhandled kind of node (e.g. constant)
+ int Weight = getWeight(Child.getNode());
+
+ NodeHeights[Child] = getHeight(Child.getNode());
+ CurrentWeight += Weight;
+
+ if (isTargetConstant(Child) && !GA.Value.getNode())
+ GA = WeightedLeaf(Child, Weight, InsertionOrder++);
+ else
+ Leaves.push(WeightedLeaf(Child, Weight, InsertionOrder++));
+ } else {
+ // CASE 3: Child is a subtree of same opcode
+ // Visit children first, then flatten.
+ unsigned ChildOpcode = Child.getOpcode();
+ assert(ChildOpcode == NOpcode ||
+ (NOpcode == ISD::MUL && ChildOpcode == ISD::SHL));
+
+ // Convert SHL to MUL
+ SDValue Op1;
+ if (ChildOpcode == ISD::SHL)
+ Op1 = getMultiplierForSHL(Child.getNode());
+ else
+ Op1 = Child->getOperand(1);
+
+ if (!NodeHeights.count(Op1) || !NodeHeights.count(Child->getOperand(0))) {
+ assert(!NodeHeights.count(Child) && "Parent visited before children?");
+ // Visit children first, then re-visit this node
+ Worklist.push_back(Child);
+ Worklist.push_back(Op1);
+ Worklist.push_back(Child->getOperand(0));
+ } else {
+ // Back at this node after visiting the children
+ if (std::abs(NodeHeights[Op1] - NodeHeights[Child->getOperand(0)]) > 1)
+ Imbalanced = true;
+
+ NodeHeights[Child] = std::max(NodeHeights[Op1],
+ NodeHeights[Child->getOperand(0)]) + 1;
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "--> Current height=" << NodeHeights[SDValue(N, 0)]
+ << " weight=" << CurrentWeight << " imbalanced="
+ << Imbalanced << "\n");
+
+ // Transform MUL(x, C * 2^Y) + SHL(z, Y) -> SHL(ADD(MUL(x, C), z), Y)
+ // This factors out a shift in order to match memw(a<<Y+b).
+ if (CanFactorize && (willShiftRightEliminate(Mul1.Value, MaxPowerOf2) ||
+ willShiftRightEliminate(Mul2.Value, MaxPowerOf2))) {
+ DEBUG(dbgs() << "--> Found common factor for two MUL children!\n");
+ int Weight = Mul1.Weight + Mul2.Weight;
+ int Height = std::max(NodeHeights[Mul1.Value], NodeHeights[Mul2.Value]) + 1;
+ SDValue Mul1Factored = factorOutPowerOf2(Mul1.Value, MaxPowerOf2);
+ SDValue Mul2Factored = factorOutPowerOf2(Mul2.Value, MaxPowerOf2);
+ SDValue Sum = CurDAG->getNode(ISD::ADD, SDLoc(N), Mul1.Value.getValueType(),
+ Mul1Factored, Mul2Factored);
+ SDValue Const = CurDAG->getConstant(MaxPowerOf2, SDLoc(N),
+ Mul1.Value.getValueType());
+ SDValue New = CurDAG->getNode(ISD::SHL, SDLoc(N), Mul1.Value.getValueType(),
+ Sum, Const);
+ NodeHeights[New] = Height;
+ Leaves.push(WeightedLeaf(New, Weight, Mul1.InsertionOrder));
+ } else if (Mul1.Value.getNode()) {
+ // We failed to factorize two MULs, so now the Muls are left outside the
+ // queue... add them back.
+ Leaves.push(Mul1);
+ if (Mul2.Value.getNode())
+ Leaves.push(Mul2);
+ CanFactorize = false;
+ }
+
+ // Combine GA + Constant -> GA+Offset, but only if GA is not used elsewhere
+ // and the root node itself is not used more than twice. This reduces the
+ // amount of additional constant extenders introduced by this optimization.
+ bool CombinedGA = false;
+ if (NOpcode == ISD::ADD && GA.Value.getNode() && Leaves.hasConst() &&
+ GA.Value.hasOneUse() && N->use_size() < 3) {
+ GlobalAddressSDNode *GANode =
+ cast<GlobalAddressSDNode>(GA.Value.getOperand(0));
+ ConstantSDNode *Offset = cast<ConstantSDNode>(Leaves.top().Value);
+
+ if (getUsesInFunction(GANode->getGlobal()) == 1 && Offset->hasOneUse() &&
+ getTargetLowering()->isOffsetFoldingLegal(GANode)) {
+ DEBUG(dbgs() << "--> Combining GA and offset (" << Offset->getSExtValue()
+ << "): ");
+ DEBUG(GANode->dump());
+
+ SDValue NewTGA =
+ CurDAG->getTargetGlobalAddress(GANode->getGlobal(), SDLoc(GA.Value),
+ GANode->getValueType(0),
+ GANode->getOffset() + (uint64_t)Offset->getSExtValue());
+ GA.Value = CurDAG->getNode(GA.Value.getOpcode(), SDLoc(GA.Value),
+ GA.Value.getValueType(), NewTGA);
+ GA.Weight += Leaves.top().Weight;
+
+ NodeHeights[GA.Value] = getHeight(GA.Value.getNode());
+ CombinedGA = true;
+
+ Leaves.pop(); // Remove the offset constant from the queue
+ }
+ }
+
+ if ((RebalanceOnlyForOptimizations && !CanFactorize && !CombinedGA) ||
+ (RebalanceOnlyImbalancedTrees && !Imbalanced)) {
+ RootWeights[N] = CurrentWeight; // leave the tree as it is; just record its weight and height
+ RootHeights[N] = NodeHeights[SDValue(N, 0)];
+
+ return SDValue(N, 0);
+ }
+
+ // Combine GA + SHL(x, C<=31) so we will match Rx=add(#u8,asl(Rx,#U5))
+ if (NOpcode == ISD::ADD && GA.Value.getNode()) {
+ WeightedLeaf SHL = Leaves.findSHL(31);
+ if (SHL.Value.getNode()) {
+ int Height = std::max(NodeHeights[GA.Value], NodeHeights[SHL.Value]) + 1;
+ GA.Value = CurDAG->getNode(ISD::ADD, SDLoc(GA.Value),
+ GA.Value.getValueType(),
+ GA.Value, SHL.Value);
+ GA.Weight = SHL.Weight; // Specifically ignore the GA weight here
+ NodeHeights[GA.Value] = Height;
+ }
+ }
+
+ if (GA.Value.getNode())
+ Leaves.push(GA);
+
+ // If this is the top level and we haven't factored out a shift, we should try
+ // to move a constant to the bottom to match addressing modes like memw(rX+C)
+ if (TopLevel && !CanFactorize && Leaves.hasConst()) {
+ DEBUG(dbgs() << "--> Pushing constant to tip of tree.");
+ Leaves.pushToBottom(Leaves.pop());
+ }
+
+ const DataLayout &DL = CurDAG->getDataLayout();
+ const TargetLowering &TLI = *getTargetLowering();
+
+ // Rebuild the tree using Huffman's algorithm: repeatedly take two leaves off the queue and push their combination back
+ while (Leaves.size() > 1) {
+ WeightedLeaf L0 = Leaves.pop();
+
+ // See whether we can grab a MUL to form an add(Rx,mpyi(Ry,#u6)),
+ // otherwise just get the next leaf
+ WeightedLeaf L1 = Leaves.findMULbyConst();
+ if (!L1.Value.getNode())
+ L1 = Leaves.pop();
+
+ assert(L0.Weight <= L1.Weight && "Priority queue is broken!");
+
+ SDValue V0 = L0.Value;
+ int V0Weight = L0.Weight;
+ SDValue V1 = L1.Value;
+ int V1Weight = L1.Weight;
+
+ // Make sure that none of these nodes have been RAUW'd
+ if ((RootWeights.count(V0.getNode()) && RootWeights[V0.getNode()] == -2) ||
+ (RootWeights.count(V1.getNode()) && RootWeights[V1.getNode()] == -2)) {
+ DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
+ return balanceSubTree(N, TopLevel);
+ }
+
+ ConstantSDNode *V0C = dyn_cast<ConstantSDNode>(V0);
+ ConstantSDNode *V1C = dyn_cast<ConstantSDNode>(V1);
+ EVT VT = N->getValueType(0);
+ SDValue NewNode;
+
+ if (V0C && !V1C) { // keep a lone constant as the second operand
+ std::swap(V0, V1);
+ std::swap(V0C, V1C);
+ }
+
+ // Calculate height of this node
+ assert(NodeHeights.count(V0) && NodeHeights.count(V1) &&
+ "Children must have been visited before re-combining them!");
+ int Height = std::max(NodeHeights[V0], NodeHeights[V1]) + 1;
+
+ // Rebuild this node (and restore SHL from MUL if needed)
+ if (V1C && NOpcode == ISD::MUL && V1C->getAPIntValue().isPowerOf2())
+ NewNode = CurDAG->getNode(
+ ISD::SHL, SDLoc(V0), VT, V0,
+ CurDAG->getConstant(
+ V1C->getAPIntValue().logBase2(), SDLoc(N),
+ TLI.getScalarShiftAmountTy(DL, V0.getValueType())));
+ else
+ NewNode = CurDAG->getNode(NOpcode, SDLoc(N), VT, V0, V1);
+
+ NodeHeights[NewNode] = Height;
+
+ int Weight = V0Weight + V1Weight;
+ Leaves.push(WeightedLeaf(NewNode, Weight, L0.InsertionOrder));
+
+ DEBUG(dbgs() << "--> Built new node (Weight=" << Weight << ",Height="
+ << Height << "):\n");
+ DEBUG(NewNode.dump());
+ }
+
+ assert(Leaves.size() == 1);
+ SDValue NewRoot = Leaves.top().Value;
+
+ assert(NodeHeights.count(NewRoot));
+ int Height = NodeHeights[NewRoot];
+
+ // Restore SHL if we earlier converted it to a MUL
+ if (NewRoot.getOpcode() == ISD::MUL) {
+ ConstantSDNode *V1C = dyn_cast<ConstantSDNode>(NewRoot.getOperand(1));
+ if (V1C && V1C->getAPIntValue().isPowerOf2()) {
+ EVT VT = NewRoot.getValueType();
+ SDValue V0 = NewRoot.getOperand(0);
+ NewRoot = CurDAG->getNode(
+ ISD::SHL, SDLoc(NewRoot), VT, V0,
+ CurDAG->getConstant(
+ V1C->getAPIntValue().logBase2(), SDLoc(NewRoot),
+ TLI.getScalarShiftAmountTy(DL, V0.getValueType())));
+ }
+ }
+
+ if (N != NewRoot.getNode()) {
+ DEBUG(dbgs() << "--> Root is now: ");
+ DEBUG(NewRoot.dump());
+
+ // Replace all uses of old root by new root
+ CurDAG->ReplaceAllUsesWith(N, NewRoot.getNode());
+ // Mark that we have RAUW'd N
+ RootWeights[N] = -2;
+ } else {
+ DEBUG(dbgs() << "--> Root unchanged.\n");
+ }
+
+ RootWeights[NewRoot.getNode()] = Leaves.top().Weight;
+ RootHeights[NewRoot.getNode()] = Height;
+
+ return NewRoot;
+ }
+
+ /// Rebalance the address computation of every LOAD and STORE in the current
+ /// DAG whose base pointer is an ADD.
+ ///
+ /// For each such memory node, the operands below the base pointer are scanned
+ /// to register the roots of sub-trees in RootWeights (with the "not balanced
+ /// yet" marker -1). The base pointer is then balanced as a top-level tree and
+ /// the memory node is updated to use the result. Dead nodes are removed and
+ /// the per-run caches are cleared at the end.
+ void HexagonDAGToDAGISel::rebalanceAddressTrees() {
+   auto It = CurDAG->allnodes_begin();
+   const auto End = CurDAG->allnodes_end();
+
+   while (It != End) {
+     // Advance before touching the node.
+     SDNode *MemN = &*It++;
+     if (MemN->getOpcode() != ISD::LOAD && MemN->getOpcode() != ISD::STORE)
+       continue;
+
+     SDValue BasePtr = cast<MemSDNode>(MemN)->getBasePtr();
+     if (BasePtr.getOpcode() != ISD::ADD)
+       continue;
+
+     // A base pointer that is already registered has been handled before.
+     if (RootWeights.count(BasePtr.getNode()))
+       continue;
+
+     DEBUG(dbgs() << "** Rebalancing address calculation in node: ");
+     DEBUG(MemN->dump());
+
+     // Discover the roots reachable from the base pointer's operands.
+     SmallVector<SDNode *, 4> Pending;
+     Pending.push_back(BasePtr.getOperand(0).getNode());
+     Pending.push_back(BasePtr.getOperand(1).getNode());
+
+     while (!Pending.empty()) {
+       SDNode *Cand = Pending.pop_back_val();
+       unsigned CandOpc = Cand->getOpcode();
+
+       if (!isOpcodeHandled(Cand))
+         continue;
+
+       Pending.push_back(Cand->getOperand(0).getNode());
+       Pending.push_back(Cand->getOperand(1).getNode());
+
+       // A node with a single user of the same opcode belongs to that user's
+       // tree and is therefore not a root.
+       const bool MergesIntoUser =
+           Cand->hasOneUse() && CandOpc == Cand->use_begin()->getOpcode();
+
+       // Skip non-roots as well as roots that were registered earlier.
+       if (MergesIntoUser || RootWeights.count(Cand))
+         continue;
+
+       RootWeights[Cand] = -1;
+     }
+
+     // Finally balance the base pointer itself.
+     RootWeights[BasePtr.getNode()] = -1;
+     SDValue NewBasePtr = balanceSubTree(BasePtr.getNode(), /*TopLevel=*/ true);
+
+     // The base pointer is operand 1 of a LOAD and operand 2 of a STORE.
+     if (MemN->getOpcode() == ISD::LOAD) {
+       MemN = CurDAG->UpdateNodeOperands(MemN, MemN->getOperand(0),
+                                         NewBasePtr, MemN->getOperand(2));
+     } else {
+       MemN = CurDAG->UpdateNodeOperands(MemN, MemN->getOperand(0),
+                                         MemN->getOperand(1),
+                                         NewBasePtr, MemN->getOperand(3));
+     }
+
+     DEBUG(dbgs() << "--> Final node: ");
+     DEBUG(MemN->dump());
+   }
+
+   CurDAG->RemoveDeadNodes();
+   GAUsesInFunction.clear();
+   RootHeights.clear();
+   RootWeights.clear();
+ }
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
new file mode 100644
index 000000000000..e87e1e6a7e0f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -0,0 +1,3323 @@
+//===-- HexagonISelLowering.cpp - Hexagon DAG Lowering Implementation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Hexagon uses to lower LLVM code
+// into a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetCallingConv.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-lowering"
+
+ static cl::opt<bool> EmitJumpTables("hexagon-emit-jump-tables", // hidden boolean flag, on by default
+ cl::init(true), cl::Hidden,
+ cl::desc("Control jump table emission on Hexagon target"));
+ // Hidden boolean flag, off by default. cl::ZeroOrMore allows it to be given more than once.
+ static cl::opt<bool> EnableHexSDNodeSched("enable-hexagon-sdnode-sched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable Hexagon SDNode scheduling"));
+ // Hidden boolean flag, off by default, registered under the name "ffast-math".
+ static cl::opt<bool> EnableFastMath("ffast-math",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable Fast Math processing"));
+ // Hidden integer option, default 5. Its consumer is outside this chunk.
+ static cl::opt<int> MinimumJumpTables("minimum-jump-tables",
+ cl::Hidden, cl::ZeroOrMore, cl::init(5),
+ cl::desc("Set minimum jump tables"));
+ // Inline-expansion store limits for memcpy: default 6, and 4 for the "-Os" variant.
+ static cl::opt<int> MaxStoresPerMemcpyCL("max-store-memcpy",
+ cl::Hidden, cl::ZeroOrMore, cl::init(6),
+ cl::desc("Max #stores to inline memcpy"));
+ // NOTE(review): the description string is the same as for "max-store-memcpy".
+ static cl::opt<int> MaxStoresPerMemcpyOptSizeCL("max-store-memcpy-Os",
+ cl::Hidden, cl::ZeroOrMore, cl::init(4),
+ cl::desc("Max #stores to inline memcpy"));
+ // Inline-expansion store limits for memmove: default 6, and 4 for the "-Os" variant.
+ static cl::opt<int> MaxStoresPerMemmoveCL("max-store-memmove",
+ cl::Hidden, cl::ZeroOrMore, cl::init(6),
+ cl::desc("Max #stores to inline memmove"));
+ // NOTE(review): the description string is the same as for "max-store-memmove".
+ static cl::opt<int> MaxStoresPerMemmoveOptSizeCL("max-store-memmove-Os",
+ cl::Hidden, cl::ZeroOrMore, cl::init(4),
+ cl::desc("Max #stores to inline memmove"));
+ // Inline-expansion store limits for memset: default 8, and 4 for the "-Os" variant.
+ static cl::opt<int> MaxStoresPerMemsetCL("max-store-memset",
+ cl::Hidden, cl::ZeroOrMore, cl::init(8),
+ cl::desc("Max #stores to inline memset"));
+ // NOTE(review): the description string is the same as for "max-store-memset".
+ static cl::opt<int> MaxStoresPerMemsetOptSizeCL("max-store-memset-Os",
+ cl::Hidden, cl::ZeroOrMore, cl::init(4),
+ cl::desc("Max #stores to inline memset"));
+
+
+ namespace {
+
+   /// CCState that additionally stores the number of named parameters of a
+   /// variadic function, so that CC_Hexagon_VarArg can tell named arguments
+   /// from unnamed ones by their value number.
+   class HexagonCCState : public CCState {
+   public:
+     /// Forwards everything but the last argument to CCState and records
+     /// \p NumNamedVarArgParams.
+     HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+                    SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+                    int NumNamedVarArgParams)
+         : CCState(CC, isVarArg, MF, locs, C),
+           NumNamedVarArgParams(NumNamedVarArgParams) {}
+
+     /// Number of named parameters given at construction time.
+     unsigned getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
+
+   private:
+     unsigned NumNamedVarArgParams;
+   };
+
+   /// Classification of a strided load pattern.
+   /// NOTE(review): the users of this enum are outside this chunk.
+   enum StridedLoadKind {
+     Even = 0,
+     Odd = 1,
+     NoPattern = 2
+   };
+
+ } // end anonymous namespace
+
+ // Implement calling convention for Hexagon.
+ //
+ // The assignment routines are declared up front because the dispatchers
+ // (CC_Hexagon, RetCC_Hexagon) call helpers that are defined after them.
+
+ // Predicate on a machine value type; defined later in this translation unit.
+ static bool isHvxVectorType(MVT ty);
+
+ // Argument assignment.
+ static bool CC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT,
+                        CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                        CCState &State);
+ static bool CC_Hexagon32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                          CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags, CCState &State);
+ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT, MVT LocVT,
+                          CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags, CCState &State);
+ static bool CC_HexagonVector(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+ // Return-value assignment.
+ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT,
+                           CCValAssign::LocInfo LocInfo,
+                           ISD::ArgFlagsTy ArgFlags, CCState &State);
+ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                             CCValAssign::LocInfo LocInfo,
+                             ISD::ArgFlagsTy ArgFlags, CCState &State);
+ static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT, MVT LocVT,
+                             CCValAssign::LocInfo LocInfo,
+                             ISD::ArgFlagsTy ArgFlags, CCState &State);
+ static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                 CCValAssign::LocInfo LocInfo,
+                                 ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+ /// Assign a location to one argument of a variadic call.
+ ///
+ /// Arguments whose value number is below the named-parameter count stored in
+ /// the HexagonCCState are delegated to CC_Hexagon. Every other argument is
+ /// placed on the stack: by-value arguments with their own size and alignment,
+ /// all others in a slot whose size and alignment depend on the (promoted)
+ /// location type. An unexpected type is treated as unreachable.
+ ///
+ /// \p State must actually be a HexagonCCState.
+ /// \returns false once a location has been recorded.
+ static bool
+ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
+                    MVT LocVT, CCValAssign::LocInfo LocInfo,
+                    ISD::ArgFlagsTy ArgFlags, CCState &State) {
+   HexagonCCState &HState = static_cast<HexagonCCState &>(State);
+
+   // Named arguments follow the regular convention.
+   if (ValNo < HState.getNumNamedVarArgParams())
+     return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
+
+   // Unnamed arguments: reserve a stack slot and record it. The captured
+   // types are read at call time, i.e. after any promotion below.
+   auto PlaceOnStack = [&](unsigned Size, unsigned Align) {
+     unsigned Offset = State.AllocateStack(Size, Align);
+     State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+     return false;
+   };
+
+   // If pass-by-value, the size allocated on stack is decided
+   // by ArgFlags.getByValSize(), not by the size of LocVT.
+   if (ArgFlags.isByVal())
+     return PlaceOnStack(ArgFlags.getByValSize(), ArgFlags.getByValAlign());
+
+   // Sub-word integers are widened to i32 with the requested extension.
+   if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+     LocVT = MVT::i32;
+     ValVT = MVT::i32;
+     LocInfo = ArgFlags.isSExt() ? CCValAssign::SExt
+             : ArgFlags.isZExt() ? CCValAssign::ZExt
+                                 : CCValAssign::AExt;
+   }
+
+   // Slot size (which is also the alignment) per location type.
+   unsigned SlotBytes;
+   switch (LocVT.SimpleTy) {
+   case MVT::i32:
+   case MVT::f32:
+     SlotBytes = 4;
+     break;
+   case MVT::i64:
+   case MVT::f64:
+     SlotBytes = 8;
+     break;
+   case MVT::v2i64:
+   case MVT::v4i32:
+   case MVT::v8i16:
+   case MVT::v16i8:
+     SlotBytes = 16;
+     break;
+   case MVT::v4i64:
+   case MVT::v8i32:
+   case MVT::v16i16:
+   case MVT::v32i8:
+     SlotBytes = 32;
+     break;
+   case MVT::v8i64:
+   case MVT::v16i32:
+   case MVT::v32i16:
+   case MVT::v64i8:
+   case MVT::v512i1:
+     SlotBytes = 64;
+     break;
+   case MVT::v16i64:
+   case MVT::v32i32:
+   case MVT::v64i16:
+   case MVT::v128i8:
+   case MVT::v1024i1:
+     SlotBytes = 128;
+     break;
+   case MVT::v32i64:
+   case MVT::v64i32:
+   case MVT::v128i16:
+   case MVT::v256i8:
+     SlotBytes = 256;
+     break;
+   default:
+     llvm_unreachable(nullptr);
+   }
+
+   return PlaceOnStack(SlotBytes, SlotBytes);
+ }
+
+ /// Assign a location to one argument under the Hexagon calling convention.
+ ///
+ /// By-value arguments go on the stack. Sub-word integers are widened to i32,
+ /// 32-bit and 64-bit short vectors are passed as i32/i64 bit patterns, and the
+ /// result is dispatched to CC_Hexagon32, CC_Hexagon64 or CC_HexagonVector.
+ /// The 256-bit vector types v8i32/v16i16/v32i8 are always passed on the stack.
+ ///
+ /// \returns false once a location has been recorded, true if no rule matched.
+ static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT,
+     CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) {
+   if (ArgFlags.isByVal()) {
+     // Passed on stack.
+     unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(),
+                                           ArgFlags.getByValAlign());
+     State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+     return false;
+   }
+
+   if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+     LocVT = MVT::i32;
+     ValVT = MVT::i32;
+     if (ArgFlags.isSExt())
+       LocInfo = CCValAssign::SExt;
+     else if (ArgFlags.isZExt())
+       LocInfo = CCValAssign::ZExt;
+     else
+       LocInfo = CCValAssign::AExt;
+   } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
+     LocVT = MVT::i32;
+     LocInfo = CCValAssign::BCvt;
+   } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
+     LocVT = MVT::i64;
+     LocInfo = CCValAssign::BCvt;
+   }
+
+   if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+     if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+       return false;
+   }
+
+   if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+     if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+       return false;
+   }
+
+   if (LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || LocVT == MVT::v32i8) {
+     // By-value arguments returned at the top of this function, so the
+     // by-value size in ArgFlags does not describe this argument. Reserve the
+     // 32 bytes these vector types occupy, as CC_Hexagon_VarArg does.
+     unsigned Offset = State.AllocateStack(32, 32);
+     State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+     return false;
+   }
+
+   if (isHvxVectorType(LocVT)) {
+     if (!CC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+       return false;
+   }
+
+   return true; // CC didn't match.
+ }
+
+
+ /// Assign a 32-bit argument: the first free register of R0..R5 if there is
+ /// one, otherwise a 4-byte, 4-byte-aligned stack slot.
+ ///
+ /// \returns false in all cases (a location is always recorded).
+ static bool CC_Hexagon32(unsigned ValNo, MVT ValVT,
+                          MVT LocVT, CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
+   static const MCPhysReg ArgRegs[] = {
+     Hexagon::R0, Hexagon::R1, Hexagon::R2,
+     Hexagon::R3, Hexagon::R4, Hexagon::R5
+   };
+
+   const unsigned Reg = State.AllocateReg(ArgRegs);
+   if (Reg) {
+     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+   } else {
+     // No register left: spill to the stack.
+     const unsigned StackOff = State.AllocateStack(4, 4);
+     State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOff, LocVT, LocInfo));
+   }
+   return false;
+ }
+
+ /// Assign a 64-bit argument.
+ ///
+ /// D0 is tried first, then D1 and D2 (passing R1 and R3 as the corresponding
+ /// shadow registers). If no register is available, an 8-byte, 8-byte-aligned
+ /// stack slot is allocated with D2 passed as shadow register.
+ ///
+ /// \returns false in all cases (a location is always recorded).
+ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
+                          MVT LocVT, CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
+   static const MCPhysReg PairRegs[] = {
+     Hexagon::D1, Hexagon::D2
+   };
+   static const MCPhysReg ShadowRegs[] = {
+     Hexagon::R1, Hexagon::R3
+   };
+
+   unsigned Reg = State.AllocateReg(Hexagon::D0);
+   if (!Reg)
+     Reg = State.AllocateReg(PairRegs, ShadowRegs);
+
+   if (Reg) {
+     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+     return false;
+   }
+
+   const unsigned StackOff = State.AllocateStack(8, 8, Hexagon::D2);
+   State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOff, LocVT, LocInfo));
+   return false;
+ }
+
+ /// Assign an HVX vector argument to a vector register or, failing that, to a
+ /// stack slot whose size equals its alignment.
+ ///
+ /// Which types are accepted depends on the subtarget:
+ ///  - HVX without double mode: 512-bit types (and v512i1) use V0..V15 or a
+ ///    64-byte slot; 1024-bit types use W0..W7 or a 128-byte slot.
+ ///  - HVX with double mode: 2048-bit types use W0..W7 or a 256-byte slot;
+ ///    1024-bit types (and v1024i1) use V0..V15 or a 128-byte slot.
+ ///
+ /// \returns false once a location has been recorded, true if nothing matched.
+ static bool CC_HexagonVector(unsigned ValNo, MVT ValVT,
+                              MVT LocVT, CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
+   static const MCPhysReg SingleRegs[] = {
+     Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4,
+     Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9,
+     Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14,
+     Hexagon::V15
+   };
+   static const MCPhysReg PairRegs[] = {
+     Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, Hexagon::W4,
+     Hexagon::W5, Hexagon::W6, Hexagon::W7
+   };
+
+   auto &MF = State.getMachineFunction();
+   auto &HST = MF.getSubtarget<HexagonSubtarget>();
+   bool UseHVX = HST.useHVXOps();
+   bool UseHVXDbl = HST.useHVXDblOps();
+
+   // Without HVX none of the rules below can apply.
+   if (!UseHVX)
+     return true;
+
+   // Take the first free register from Regs, else a stack slot of Bytes.
+   auto Assign = [&](ArrayRef<MCPhysReg> Regs, unsigned Bytes) {
+     if (unsigned Reg = State.AllocateReg(Regs)) {
+       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+       return false;
+     }
+     unsigned Offset = State.AllocateStack(Bytes, Bytes);
+     State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+     return false;
+   };
+
+   if (!UseHVXDbl) {
+     if (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
+         LocVT == MVT::v64i8 || LocVT == MVT::v512i1)
+       return Assign(SingleRegs, 64);
+     if (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 ||
+         LocVT == MVT::v64i16 || LocVT == MVT::v128i8)
+       return Assign(PairRegs, 128);
+     return true;
+   }
+
+   // 128B Mode
+   if (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 ||
+       LocVT == MVT::v256i8)
+     return Assign(PairRegs, 256);
+   if (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
+       LocVT == MVT::v128i8 || LocVT == MVT::v1024i1)
+     return Assign(SingleRegs, 128);
+   return true;
+ }
+
+ /// Assign a location to one return value under the Hexagon convention.
+ ///
+ /// The location type is first normalized (i1/i8/i16 to i32, short vectors to
+ /// i32/i64 bit patterns, HVX vectors to v16i32/v32i32/v64i32) and then handed
+ /// to RetCC_Hexagon32, RetCC_Hexagon64 or RetCC_HexagonVector.
+ ///
+ /// \returns false once a location has been recorded, true if no rule matched.
+ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
+                           MVT LocVT, CCValAssign::LocInfo LocInfo,
+                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
+   auto &MF = State.getMachineFunction();
+   auto &HST = MF.getSubtarget<HexagonSubtarget>();
+   bool UseHVX = HST.useHVXOps();
+   bool UseHVXDbl = HST.useHVXDblOps();
+
+   // Represent an HVX value by the given i32-element vector type.
+   auto ReturnAs = [&](MVT VecTy) {
+     LocVT = VecTy;
+     ValVT = VecTy;
+     LocInfo = CCValAssign::Full;
+   };
+
+   switch (LocVT.SimpleTy) {
+   case MVT::i1:
+     // Return values of type MVT::i1 still need to be assigned to R0, but
+     // the value type needs to remain i1. LowerCallResult will deal with it,
+     // but it needs to recognize i1 as the value type.
+     LocVT = MVT::i32;
+     break;
+   case MVT::i8:
+   case MVT::i16:
+     LocVT = MVT::i32;
+     ValVT = MVT::i32;
+     LocInfo = ArgFlags.isSExt() ? CCValAssign::SExt
+             : ArgFlags.isZExt() ? CCValAssign::ZExt
+                                 : CCValAssign::AExt;
+     break;
+   case MVT::v4i8:
+   case MVT::v2i16:
+     LocVT = MVT::i32;
+     LocInfo = CCValAssign::BCvt;
+     break;
+   case MVT::v8i8:
+   case MVT::v4i16:
+   case MVT::v2i32:
+     LocVT = MVT::i64;
+     LocInfo = CCValAssign::BCvt;
+     break;
+   case MVT::v64i8:
+   case MVT::v32i16:
+   case MVT::v16i32:
+   case MVT::v8i64:
+   case MVT::v512i1:
+     ReturnAs(MVT::v16i32);
+     break;
+   case MVT::v1024i1:
+     // Only normalized when HVX double mode is in use; otherwise untouched.
+     if (UseHVX && UseHVXDbl)
+       ReturnAs(MVT::v32i32);
+     break;
+   case MVT::v128i8:
+   case MVT::v64i16:
+   case MVT::v32i32:
+   case MVT::v16i64:
+     ReturnAs(MVT::v32i32);
+     break;
+   case MVT::v256i8:
+   case MVT::v128i16:
+   case MVT::v64i32:
+   case MVT::v32i64:
+     ReturnAs(MVT::v64i32);
+     break;
+   default:
+     break;
+   }
+
+   if ((LocVT == MVT::i32 || LocVT == MVT::f32) &&
+       !RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+     return false;
+
+   if ((LocVT == MVT::i64 || LocVT == MVT::f64) &&
+       !RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+     return false;
+
+   if ((LocVT == MVT::v16i32 || LocVT == MVT::v32i32 ||
+        LocVT == MVT::v64i32) &&
+       !RetCC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+     return false;
+
+   return true; // CC didn't match.
+ }
+
+ /// Assign a 32-bit return value.
+ ///
+ /// For i32/f32 the first free register of R0..R5 is used. If none is free, or
+ /// if the location type is something else, a 4-byte, 4-byte-aligned stack
+ /// slot is recorded instead.
+ ///
+ /// \returns false in all cases (a location is always recorded).
+ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
+                             MVT LocVT, CCValAssign::LocInfo LocInfo,
+                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
+   if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+     // Note that use of registers beyond R1 is not ABI compliant. However there
+     // are (experimental) IR passes which generate internal functions that
+     // return structs using these additional registers.
+     // The table uses MCPhysReg, like the one in CC_Hexagon32, rather than a
+     // raw integer type.
+     static const MCPhysReg RegList[] = { Hexagon::R0, Hexagon::R1,
+                                          Hexagon::R2, Hexagon::R3,
+                                          Hexagon::R4, Hexagon::R5 };
+     if (unsigned Reg = State.AllocateReg(RegList)) {
+       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+       return false;
+     }
+   }
+
+   unsigned Offset = State.AllocateStack(4, 4);
+   State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+   return false;
+ }
+
+ /// Assign a 64-bit return value (i64 or f64) to register D0 when it is still
+ /// free; otherwise (or for any other type) allocate an 8-byte stack slot with
+ /// 8-byte alignment.
+ ///
+ /// A location is always recorded in \p State and the function always
+ /// returns false.
+ static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
+                             MVT LocVT, CCValAssign::LocInfo LocInfo,
+                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
+   const bool Is64BitScalar = (LocVT == MVT::i64) || (LocVT == MVT::f64);
+   if (Is64BitScalar) {
+     const unsigned Reg = State.AllocateReg(Hexagon::D0);
+     if (Reg != 0) {
+       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+       return false;
+     }
+   }
+
+   // D0 is taken (or the type did not match): use the stack.
+   const unsigned StackOffset = State.AllocateStack(8, 8);
+   State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
+   return false;
+ }
+
+ /// Assign a vector return value to a vector register, or to the stack when
+ /// that register is no longer free.
+ ///
+ ///   v16i32 -> V0, stack slot of 64 bytes
+ ///   v32i32 -> V0 when both HVX and HVX-double are enabled, else W0;
+ ///             stack slot of 128 bytes
+ ///   v64i32 -> W0, stack slot of 256 bytes
+ ///
+ /// Any other type goes straight to a 64-byte stack slot. Stack slots are
+ /// aligned to their own size. A location is always recorded in \p State and
+ /// the function always returns false.
+ static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
+                                 MVT LocVT, CCValAssign::LocInfo LocInfo,
+                                 ISD::ArgFlagsTy ArgFlags, CCState &State) {
+   auto &MF = State.getMachineFunction();
+   auto &HST = MF.getSubtarget<HexagonSubtarget>();
+   const bool UseHVX = HST.useHVXOps();
+   const bool UseHVXDbl = HST.useHVXDblOps();
+
+   // First decide which register (if any) to try and how large the fallback
+   // stack slot has to be; the actual allocation happens afterwards.
+   bool HasCandidate = false;
+   unsigned CandidateReg = 0;
+   unsigned SlotBytes = 64;
+
+   if (LocVT == MVT::v16i32) {
+     HasCandidate = true;
+     CandidateReg = Hexagon::V0;
+   } else if (LocVT == MVT::v32i32) {
+     HasCandidate = true;
+     CandidateReg = (UseHVX && UseHVXDbl) ? Hexagon::V0 : Hexagon::W0;
+     SlotBytes = 128;
+   } else if (LocVT == MVT::v64i32) {
+     HasCandidate = true;
+     CandidateReg = Hexagon::W0;
+     SlotBytes = 256;
+   }
+
+   if (HasCandidate) {
+     const unsigned Reg = State.AllocateReg(CandidateReg);
+     if (Reg != 0) {
+       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+       return false;
+     }
+   }
+
+   const unsigned StackOffset = State.AllocateStack(SlotBytes, SlotBytes);
+   State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
+   return false;
+ }
+
+ /// Mark loads and stores of \p VT as "Promote" and register
+ /// \p PromotedLdStVT as the type they are promoted to. Nothing is done when
+ /// both types are the same.
+ void HexagonTargetLowering::promoteLdStType(MVT VT, MVT PromotedLdStVT) {
+   if (VT == PromotedLdStVT)
+     return;
+
+   const unsigned MemOpcodes[] = { ISD::LOAD, ISD::STORE };
+   for (unsigned Opc : MemOpcodes) {
+     setOperationAction(Opc, VT, Promote);
+     AddPromotedToType(Opc, VT, PromotedLdStVT);
+   }
+ }
+
+ /// Custom-lowering hook for chainless intrinsics. No intrinsic is handled
+ /// here: a default-constructed (null) SDValue is returned for every input.
+ SDValue
+ HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG)
+     const {
+   SDValue NoResult;
+   return NoResult;
+ }
+
+ /// CreateCopyOfByValArgument - Emit a memcpy that copies a by-value aggregate
+ /// from address "Src" to address "Dst". The number of bytes and the alignment
+ /// are both taken from the byval information stored in "Flags". The copy will
+ /// be passed as a byval function parameter. Sometimes what is copied is only
+ /// the end of a larger object, the part that does not fit in registers.
+ ///
+ /// Returns the node produced by SelectionDAG::getMemcpy.
+ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+                                          SDValue Chain, ISD::ArgFlagsTy Flags,
+                                          SelectionDAG &DAG, const SDLoc &dl) {
+   const unsigned ByteCount = Flags.getByValSize();
+   const unsigned ByteAlign = Flags.getByValAlign();
+   SDValue ByteCountNode = DAG.getConstant(ByteCount, dl, MVT::i32);
+
+   // No pointer provenance is attached to either side of the copy.
+   return DAG.getMemcpy(Chain, dl, Dst, Src, ByteCountNode, ByteAlign,
+                        /*isVolatile=*/false, /*AlwaysInline=*/false,
+                        /*isTailCall=*/false,
+                        MachinePointerInfo(), MachinePointerInfo());
+ }
+
+ /// Return true if \p Ty is one of the vector types listed below: the 512-,
+ /// 1024- and 2048-bit integer vectors with i8/i16/i32/i64 elements, plus the
+ /// i1 vectors v512i1 and v1024i1. Every other type yields false.
+ static bool isHvxVectorType(MVT Ty) {
+   const MVT::SimpleValueType Kind = Ty.SimpleTy;
+
+   const bool Is512Bit = Kind == MVT::v8i64 || Kind == MVT::v16i32 ||
+                         Kind == MVT::v32i16 || Kind == MVT::v64i8;
+   const bool Is1024Bit = Kind == MVT::v16i64 || Kind == MVT::v32i32 ||
+                          Kind == MVT::v64i16 || Kind == MVT::v128i8;
+   const bool Is2048Bit = Kind == MVT::v32i64 || Kind == MVT::v64i32 ||
+                          Kind == MVT::v128i16 || Kind == MVT::v256i8;
+   const bool IsBoolVector = Kind == MVT::v512i1 || Kind == MVT::v1024i1;
+
+   return Is512Bit || Is1024Bit || Is2048Bit || IsBoolVector;
+ }
+
+ // LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is
+ // passed by value, the function prototype is modified to return void and
+ // the value is stored in memory pointed by a pointer passed by caller.
+ //
+ // The return values are analyzed with RetCC_Hexagon, each one is copied into
+ // the register chosen for it, and a HexagonISD::RET_FLAG node is built whose
+ // operands are the chain, the return registers and (if any copy was emitted)
+ // the glue of the last copy.
+ SDValue
+ HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SDLoc &dl, SelectionDAG &DAG) const {
+   // Locations assigned to the individual return values.
+   SmallVector<CCValAssign, 16> RetLocs;
+
+   // Bookkeeping for registers and stack slots.
+   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RetLocs,
+                  *DAG.getContext());
+   CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
+
+   // Operand 0 is a placeholder for the chain; it is overwritten below once
+   // all copies have been emitted.
+   SmallVector<SDValue, 4> RetOps;
+   RetOps.push_back(Chain);
+
+   SDValue Glue;
+   for (unsigned Idx = 0, NumLocs = RetLocs.size(); Idx != NumLocs; ++Idx) {
+     const CCValAssign &Loc = RetLocs[Idx];
+     const unsigned Reg = Loc.getLocReg();
+
+     Chain = DAG.getCopyToReg(Chain, dl, Reg, OutVals[Idx], Glue);
+
+     // Glue consecutive copies so that they stay together.
+     Glue = Chain.getValue(1);
+     RetOps.push_back(DAG.getRegister(Reg, Loc.getLocVT()));
+   }
+
+   RetOps[0] = Chain; // Final chain after all copies.
+
+   // Only append the glue when at least one copy produced it.
+   if (Glue.getNode())
+     RetOps.push_back(Glue);
+
+   return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, RetOps);
+ }
+
+ /// Return true when \p CI is marked as a tail call and the function that
+ /// contains it does not carry the attribute "disable-tail-calls" with the
+ /// value "true".
+ bool HexagonTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+   auto *Caller = CI->getParent()->getParent();
+   auto DisableAttr = Caller->getFnAttribute("disable-tail-calls");
+   const bool TailCallsDisabled = DisableAttr.getValueAsString() == "true";
+
+   return CI->isTailCall() && !TailCallsDisabled;
+ }
+
+ /// LowerCallResult - Copy the values returned by a call out of the physical
+ /// registers assigned by RetCC_Hexagon. Chain/InFlag are the chain and glue
+ /// coming out of the call sequence. One value per return location is
+ /// appended to InVals, and the chain of the last copy is returned.
+ SDValue HexagonTargetLowering::LowerCallResult(
+     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+     const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const {
+   // Locations of the individual returned values.
+   SmallVector<CCValAssign, 16> RetLocs;
+   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RetLocs,
+                  *DAG.getContext());
+   CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
+
+   for (unsigned Idx = 0, NumLocs = RetLocs.size(); Idx != NumLocs; ++Idx) {
+     const CCValAssign &Loc = RetLocs[Idx];
+     SDValue Result;
+
+     if (Loc.getValVT() != MVT::i1) {
+       // Ordinary case: read the value straight from its physical register.
+       Result = DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(), Loc.getValVT(),
+                                   InFlag);
+     } else {
+       // Return values of type MVT::i1 require special handling. The reason
+       // is that MVT::i1 is associated with the PredRegs register class, but
+       // values of that type are still returned in R0. Generate an explicit
+       // copy into a predicate register from R0, and treat the value of the
+       // predicate register as the call result.
+       auto &MRI = DAG.getMachineFunction().getRegInfo();
+
+       // FromR0 = (Value, Chain, Glue)
+       SDValue FromR0 =
+           DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(), MVT::i32, InFlag);
+
+       // ToPred = (Chain, Glue)
+       unsigned PredReg = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+       SDValue ToPred = DAG.getCopyToReg(FromR0.getValue(1), dl, PredReg,
+                                         FromR0.getValue(0), FromR0.getValue(2));
+
+       Result = DAG.getCopyFromReg(ToPred.getValue(0), dl, PredReg, MVT::i1,
+                                   ToPred.getValue(1));
+     }
+
+     // Thread chain and glue through to the next copy.
+     InVals.push_back(Result.getValue(0));
+     Chain = Result.getValue(1);
+     InFlag = Result.getValue(2);
+   }
+
+   return Chain;
+ }
+
+ /// LowerCall - Functions arguments are copied from virtual regs to
+ /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+ ///
+ /// Steps performed, in order:
+ ///  1. Assign a location to every outgoing argument (CC_Hexagon or
+ ///     CC_Hexagon_VarArg).
+ ///  2. Decide whether the call can still be a tail call.
+ ///  3. Store stack arguments, collect register arguments.
+ ///  4. Emit CALLSEQ_START (non-tail calls only) and the copy-to-reg chain.
+ ///  5. Emit TC_RETURN for tail calls, otherwise CALL/CALLnr followed by
+ ///     CALLSEQ_END and LowerCallResult.
+ ///
+ /// CLI.IsTailCall is updated through a reference and therefore reflects the
+ /// final decision when this function returns.
+ SDValue
+ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ bool DoesNotReturn = CLI.DoesNotReturn;
+
+ // The call returns a struct via sret when the first outgoing argument
+ // carries the sret flag.
+ bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+
+ // Check for varargs.
+ // NumNamedVarArgParams stays at -1U unless the callee is a known Function
+ // that is variadic and has at least one named parameter.
+ unsigned NumNamedVarArgParams = -1U;
+ if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = GAN->getGlobal();
+ Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
+ if (const Function* F = dyn_cast<Function>(GV)) {
+ // If a function has zero args and is a vararg function, that's
+ // disallowed so it must be an undeclared function. Do not assume
+ // varargs if the callee is undefined.
+ if (F->isVarArg() && F->getFunctionType()->getNumParams() != 0)
+ NumNamedVarArgParams = F->getFunctionType()->getNumParams();
+ }
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ HexagonCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext(), NumNamedVarArgParams);
+
+ if (IsVarArg)
+ CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
+ else
+ CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon);
+
+ // The caller can veto tail calls with the "disable-tail-calls" attribute.
+ auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+ if (Attr.getValueAsString() == "true")
+ IsTailCall = false;
+
+ if (IsTailCall) {
+ bool StructAttrFlag = MF.getFunction()->hasStructRetAttr();
+ IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ IsVarArg, IsStructRet,
+ StructAttrFlag,
+ Outs, OutVals, Ins, DAG);
+ // Any argument that lives in memory rules out the tail call.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isMemLoc()) {
+ IsTailCall = false;
+ break;
+ }
+ }
+ // NOTE(review): the "not eligible" text below is printed whenever
+ // IsTailCall ended up false, including the case where
+ // IsEligibleForTailCallOptimization rejected the call and no argument is
+ // on the stack.
+ DEBUG(dbgs() << (IsTailCall ? "Eligible for Tail Call\n"
+ : "Argument must be passed on stack. "
+ "Not eligible for Tail Call\n"));
+ }
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+ SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // Stack arguments are addressed relative to the current stack pointer.
+ auto &HRI = *Subtarget.getRegisterInfo();
+ SDValue StackPtr =
+ DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT);
+
+ bool NeedsArgAlign = false;
+ unsigned LargestAlignSeen = 0;
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Record if we need > 8 byte alignment on an argument.
+ bool ArgAlign = isHvxVectorType(VA.getValVT());
+ NeedsArgAlign |= ArgAlign;
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default:
+ // Loc info must be one of BCvt, Full, SExt, ZExt, or AExt.
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::BCvt:
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isMemLoc()) {
+ // Address of the outgoing slot: stack pointer + assigned offset.
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue MemAddr = DAG.getConstant(LocMemOffset, dl,
+ StackPtr.getValueType());
+ MemAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, MemAddr);
+ // For the vector types recognized above, track the largest store size
+ // (in bytes) seen so far; it is used as an alignment further down.
+ if (ArgAlign)
+ LargestAlignSeen = std::max(LargestAlignSeen,
+ VA.getLocVT().getStoreSizeInBits() >> 3);
+ if (Flags.isByVal()) {
+ // The argument is a struct passed by value. According to LLVM, "Arg"
+ // is a pointer.
+ MemOpChains.push_back(CreateCopyOfByValArgument(Arg, MemAddr, Chain,
+ Flags, DAG, dl));
+ } else {
+ MachinePointerInfo LocPI = MachinePointerInfo::getStack(
+ DAG.getMachineFunction(), LocMemOffset);
+ SDValue S = DAG.getStore(Chain, dl, Arg, MemAddr, LocPI);
+ MemOpChains.push_back(S);
+ }
+ continue;
+ }
+
+ // Arguments that can be passed on register must be kept at RegsToPass
+ // vector.
+ if (VA.isRegLoc())
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ }
+
+ if (NeedsArgAlign && Subtarget.hasV60TOps()) {
+ DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ // V6 vectors passed by value have 64 or 128 byte alignment depending
+ // on whether we are 64 byte vector mode or 128 byte.
+ bool UseHVXDbl = Subtarget.useHVXDblOps();
+ assert(Subtarget.useHVXOps());
+ const unsigned ObjAlign = UseHVXDbl ? 128 : 64;
+ LargestAlignSeen = std::max(LargestAlignSeen, ObjAlign);
+ MFI.ensureMaxAlignment(LargestAlignSeen);
+ }
+ // Transform all store nodes into one single node because all store
+ // nodes are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ // CALLSEQ_START is only emitted for regular (non-tail) calls.
+ if (!IsTailCall) {
+ SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);
+ Chain = DAG.getCALLSEQ_START(Chain, C, dl);
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token
+ // chain and flag operands which copy the outgoing args into registers.
+ // The Glue is necessary since all emitted instructions must be
+ // stuck together.
+ // Both branches below emit the same copy-to-reg chain; the tail-call branch
+ // additionally resets Glue afterwards so that it is not added to the call
+ // operands.
+ SDValue Glue;
+ if (!IsTailCall) {
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, Glue);
+ Glue = Chain.getValue(1);
+ }
+ } else {
+ // For tail calls lower the arguments to the 'real' stack slot.
+ //
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+ //
+ // Do not flag preceding copytoreg stuff together with the following stuff.
+ Glue = SDValue();
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, Glue);
+ Glue = Chain.getValue(1);
+ }
+ Glue = SDValue();
+ }
+
+ // With long calls enabled the callee address is tagged as constant-extended.
+ bool LongCalls = MF.getSubtarget<HexagonSubtarget>().useLongCalls();
+ unsigned Flags = LongCalls ? HexagonII::HMOTF_ConstExtended : 0;
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+ // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, PtrVT, 0, Flags);
+ } else if (ExternalSymbolSDNode *S =
+ dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, Flags);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+ }
+
+ if (Glue.getNode())
+ Ops.push_back(Glue);
+
+ // Tail calls end here: no CALLSEQ_END and no result copies are emitted.
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops);
+ }
+
+ // Calls marked as not returning use the CALLnr opcode.
+ unsigned OpCode = DoesNotReturn ? HexagonISD::CALLnr : HexagonISD::CALL;
+ Chain = DAG.getNode(OpCode, dl, NodeTys, Ops);
+ Glue = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), Glue, dl);
+ Glue = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, Glue, CallConv, IsVarArg, Ins, dl, DAG,
+ InVals, OutVals, Callee);
+ }
+
+ /// Try to split the address computation \p Ptr into a base and a constant
+ /// offset for an indexed access of type \p VT.
+ ///
+ /// Only ISD::ADD nodes are considered. The type must be i8/i16/i32/i64, or
+ /// one of the HVX vector types that matches the active HVX mode. When the
+ /// type is accepted, \p Base, \p Offset and \p IsInc are written and the
+ /// result tells whether \p Offset is a constant. In every other case the
+ /// output parameters are left untouched and false is returned.
+ static bool getIndexedAddressParts(SDNode *Ptr, EVT VT,
+                                    SDValue &Base, SDValue &Offset,
+                                    bool &IsInc, SelectionDAG &DAG) {
+   if (Ptr->getOpcode() != ISD::ADD)
+     return false;
+
+   auto &HST = static_cast<const HexagonSubtarget&>(DAG.getSubtarget());
+   const bool UseHVX = HST.useHVXOps();
+   const bool UseHVXDbl = HST.useHVXDblOps();
+
+   const bool IsDblVecVT = VT == MVT::v32i32 || VT == MVT::v16i64 ||
+                           VT == MVT::v64i16 || VT == MVT::v128i8;
+   const bool IsSglVecVT = VT == MVT::v16i32 || VT == MVT::v8i64 ||
+                           VT == MVT::v32i16 || VT == MVT::v64i8;
+   const bool IsScalarVT = VT == MVT::i64 || VT == MVT::i32 ||
+                           VT == MVT::i16 || VT == MVT::i8;
+
+   const bool ValidHVXDblType = UseHVX && UseHVXDbl && IsDblVecVT;
+   const bool ValidHVXType = UseHVX && !UseHVXDbl && IsSglVecVT;
+
+   if (!ValidHVXDblType && !ValidHVXType && !IsScalarVT)
+     return false;
+
+   // The opcode was checked to be ISD::ADD above, so this is an increment.
+   IsInc = true;
+   Base = Ptr->getOperand(0);
+   Offset = Ptr->getOperand(1);
+   // Ensure that Offset is a constant.
+   return isa<ConstantSDNode>(Offset);
+ }
+
+ /// getPostIndexedAddressParts - returns true by value, base pointer and
+ /// offset pointer and addressing mode by reference if this node can be
+ /// combined with a load / store to form a post-indexed load / store.
+ ///
+ /// \p N must be a load or a store; truncating stores of an i64 value are
+ /// rejected. \p Op is the address computation that is split into \p Base and
+ /// \p Offset. The constant offset must additionally be accepted by
+ /// isValidAutoIncImm for the memory type.
+ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                        SDValue &Base,
+                                                        SDValue &Offset,
+                                                        ISD::MemIndexedMode &AM,
+                                                        SelectionDAG &DAG) const
+ {
+   EVT MemVT;
+
+   if (LoadSDNode *Load = dyn_cast<LoadSDNode>(N)) {
+     MemVT = Load->getMemoryVT();
+   } else if (StoreSDNode *Store = dyn_cast<StoreSDNode>(N)) {
+     MemVT = Store->getMemoryVT();
+     const bool StoresI64 = Store->getValue().getValueType() == MVT::i64;
+     if (StoresI64 && Store->isTruncatingStore())
+       return false;
+   } else {
+     return false;
+   }
+
+   bool IsInc = false;
+   if (!getIndexedAddressParts(Op, MemVT, Base, Offset, IsInc, DAG))
+     return false;
+
+   // getIndexedAddressParts only succeeds when Offset is a constant.
+   auto &HII = *Subtarget.getInstrInfo();
+   int32_t OffsetVal = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+   if (!HII.isValidAutoIncImm(MemVT, OffsetVal))
+     return false;
+
+   AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+   return true;
+ }
+
+ /// Inspect an inline-asm node and record in the machine function info
+ /// whether one of its early-clobber register definitions is the return
+ /// address register. The node itself is returned unchanged.
+ ///
+ /// The scan stops as soon as the function info reports that LR is clobbered.
+ SDValue
+ HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *AsmNode = Op.getNode();
+   MachineFunction &MF = DAG.getMachineFunction();
+   auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
+
+   if (AsmNode->getOpcode() != ISD::INLINEASM)
+     return Op;
+
+   unsigned OperandEnd = AsmNode->getNumOperands();
+   if (AsmNode->getOperand(OperandEnd - 1).getValueType() == MVT::Glue)
+     --OperandEnd; // Ignore the flag operand.
+
+   // Operands come in groups: one flag word followed by the values it
+   // describes.
+   unsigned Idx = InlineAsm::Op_FirstOperand;
+   while (Idx != OperandEnd) {
+     if (FuncInfo.hasClobberLR())
+       break;
+
+     const unsigned GroupFlags =
+         cast<ConstantSDNode>(AsmNode->getOperand(Idx))->getZExtValue();
+     unsigned NumVals = InlineAsm::getNumOperandRegisters(GroupFlags);
+     ++Idx; // Skip the ID value.
+
+     switch (InlineAsm::getKind(GroupFlags)) {
+     default:
+       llvm_unreachable("Bad flags!");
+     case InlineAsm::Kind_RegDef:
+     case InlineAsm::Kind_RegUse:
+     case InlineAsm::Kind_Imm:
+     case InlineAsm::Kind_Clobber:
+     case InlineAsm::Kind_Mem:
+       // Nothing to inspect; step over the values of this group.
+       Idx += NumVals;
+       break;
+     case InlineAsm::Kind_RegDefEarlyClobber:
+       for (; NumVals; --NumVals, ++Idx) {
+         const unsigned Reg =
+             cast<RegisterSDNode>(AsmNode->getOperand(Idx))->getReg();
+
+         // Check it to be lr
+         const HexagonRegisterInfo *QRI = Subtarget.getRegisterInfo();
+         if (Reg == QRI->getRARegister()) {
+           FuncInfo.setHasClobberLR(true);
+           break;
+         }
+       }
+       break;
+     }
+   }
+
+   return Op;
+ }
+
+ // Need to transform ISD::PREFETCH into something that doesn't inherit
+ // all of the properties of ISD::PREFETCH, specifically SDNPMayLoad and
+ // SDNPMayStore.
+ //
+ // The node is replaced by HexagonISD::DCFETCH(chain, address, 0), built from
+ // operands 0 and 1 of the original node.
+ SDValue HexagonTargetLowering::LowerPREFETCH(SDValue Op,
+                                              SelectionDAG &DAG) const {
+   SDValue InChain = Op.getOperand(0);
+   SDValue Address = Op.getOperand(1);
+   SDLoc DL(Op);
+
+   // Lower it to DCFETCH($reg, #0). A "pat" will try to merge the offset in,
+   // if the "reg" is fed by an "add".
+   SDValue ZeroOffset = DAG.getConstant(0, DL, MVT::i32);
+   return DAG.getNode(HexagonISD::DCFETCH, DL, MVT::Other, InChain, Address,
+                      ZeroOffset);
+ }
+
+ /// Custom lowering for intrinsics without a result. Only
+ /// Intrinsic::hexagon_prefetch is handled: it becomes
+ /// HexagonISD::DCFETCH(chain, address, 0). For every other intrinsic a null
+ /// SDValue is returned.
+ SDValue HexagonTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+   SDValue InChain = Op.getOperand(0);
+   // Operand 1 holds the intrinsic ID as a constant.
+   const unsigned IntrinsicID =
+       cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+   if (IntrinsicID != Intrinsic::hexagon_prefetch)
+     return SDValue();
+
+   // Lower the hexagon_prefetch builtin to DCFETCH with a zero offset.
+   SDValue Address = Op.getOperand(2);
+   SDLoc DL(Op);
+   SDValue ZeroOffset = DAG.getConstant(0, DL, MVT::i32);
+   return DAG.getNode(HexagonISD::DCFETCH, DL, MVT::Other, InChain, Address,
+                      ZeroOffset);
+ }
+
+ /// Lower a dynamic stack allocation to HexagonISD::ALLOCA(chain, size,
+ /// alignment). The alignment operand must be a constant; a value of zero is
+ /// replaced by the stack alignment reported by the frame lowering. All uses
+ /// of the original node are redirected to the new node, which is also
+ /// returned.
+ SDValue
+ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                SelectionDAG &DAG) const {
+   SDValue InChain = Op.getOperand(0);
+   SDValue Size = Op.getOperand(1);
+   SDValue AlignOp = Op.getOperand(2);
+   SDLoc dl(Op);
+
+   ConstantSDNode *AlignConst = dyn_cast<ConstantSDNode>(AlignOp);
+   assert(AlignConst && "Non-constant Align in LowerDYNAMIC_STACKALLOC");
+
+   const unsigned RequestedAlign = AlignConst->getSExtValue();
+   auto &HFI = *Subtarget.getFrameLowering();
+   // "Zero" means natural stack alignment.
+   const unsigned A =
+       RequestedAlign != 0 ? RequestedAlign : HFI.getStackAlignment();
+
+   DEBUG({
+     dbgs() << __func__ << " Align: " << A << " Size: ";
+     Size.getNode()->dump(&DAG);
+     dbgs() << "\n";
+   });
+
+   // The ALLOCA node produces an i32 value and a chain.
+   SDValue AlignNode = DAG.getConstant(A, dl, MVT::i32);
+   SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
+   SDValue AllocaNode = DAG.getNode(HexagonISD::ALLOCA, dl, ResultTys, InChain,
+                                    Size, AlignNode);
+
+   DAG.ReplaceAllUsesOfValueWith(Op, AllocaNode);
+   return AllocaNode;
+ }
+
+ /// Lower the incoming (formal) arguments of the current function.
+ ///
+ /// Every argument gets a location from CC_Hexagon. Register arguments are
+ /// made live-in through a fresh virtual register of the class that matches
+ /// their type and are read with a copy-from-reg. Stack arguments get a fixed
+ /// frame object: byval arguments are represented by the frame index itself,
+ /// all others by a load from it. For variadic functions an additional fixed
+ /// object marking the first stack slot after the named arguments is created
+ /// and stored in the function info.
+ ///
+ /// One value per argument is appended to \p InVals. Returns the chain.
+ SDValue HexagonTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
+
+ // For LLVM, in the case when returning a struct by value (>8byte),
+ // the first argument is a pointer that points to the location on caller's
+ // stack where the return value will be stored. For Hexagon, the location on
+ // caller's stack is passed only when the struct size is smaller than (and
+ // equal to) 8 bytes. If not, no address will be passed into callee and
+ // callee return the result directly through R0/R1.
+ // NOTE(review): the size condition in the paragraph above reads as if it
+ // were inverted compared to the comment on LowerReturn ("larger than 8
+ // bytes ... stored in memory") - confirm which one is intended.
+
+ // NOTE(review): nothing in this function appends to MemOps, so the
+ // TokenFactor built from it further down never triggers here.
+ SmallVector<SDValue, 8> MemOps;
+ bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps();
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ // These three are only assigned and used on the stack-argument path.
+ unsigned ObjSize;
+ unsigned StackLocation;
+ int FI;
+
+ if ( (VA.isRegLoc() && !Flags.isByVal())
+ || (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() > 8)) {
+ // Arguments passed in registers
+ // 1. int, long long, ptr args that get allocated in register.
+ // 2. Large struct that gets a register to put its address in.
+ EVT RegVT = VA.getLocVT();
+ if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
+ RegVT == MVT::i32 || RegVT == MVT::f32) {
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ } else if (RegVT == MVT::i64 || RegVT == MVT::f64) {
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+
+ // Single Vector
+ } else if ((RegVT == MVT::v8i64 || RegVT == MVT::v16i32 ||
+ RegVT == MVT::v32i16 || RegVT == MVT::v64i8)) {
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VectorRegsRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ } else if (UseHVX && UseHVXDbl &&
+ ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 ||
+ RegVT == MVT::v64i16 || RegVT == MVT::v128i8))) {
+ // Same types as the "Double Vector" case below, but checked first so
+ // that the 128B single-vector class is used when HVX double mode is on.
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VectorRegs128BRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+
+ // Double Vector
+ } else if ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 ||
+ RegVT == MVT::v64i16 || RegVT == MVT::v128i8)) {
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VecDblRegsRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ } else if (UseHVX && UseHVXDbl &&
+ ((RegVT == MVT::v32i64 || RegVT == MVT::v64i32 ||
+ RegVT == MVT::v128i16 || RegVT == MVT::v256i8))) {
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VecDblRegs128BRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ } else if (RegVT == MVT::v512i1 || RegVT == MVT::v1024i1) {
+ // NOTE(review): the assert always fires when assertions are enabled;
+ // the statements after it are only reached when asserts are compiled out.
+ assert(0 && "need to support VecPred regs");
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VecPredRegsRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ } else {
+ // NOTE(review): with asserts compiled out this path adds nothing to
+ // InVals for the argument.
+ assert (0);
+ }
+ } else if (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() <= 8) {
+ assert (0 && "ByValSize must be bigger than 8 bytes");
+ } else {
+ // Sanity check.
+ assert(VA.isMemLoc());
+
+ if (Flags.isByVal()) {
+ // If it's a byval parameter, then we need to compute the
+ // "real" size, not the size of the pointer.
+ ObjSize = Flags.getByValSize();
+ } else {
+ ObjSize = VA.getLocVT().getStoreSizeInBits() >> 3;
+ }
+
+ // The assigned offset is shifted by HEXAGON_LRFP_SIZE.
+ StackLocation = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
+ // Create the frame index object for this incoming parameter...
+ FI = MFI.CreateFixedObject(ObjSize, StackLocation, true);
+
+ // Create the SelectionDAG nodes corresponding to a load
+ // from this parameter.
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+
+ if (Flags.isByVal()) {
+ // If it's a pass-by-value aggregate, then do not dereference the stack
+ // location. Instead, we should generate a reference to the stack
+ // location.
+ InVals.push_back(FIN);
+ } else {
+ InVals.push_back(
+ DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, MachinePointerInfo()));
+ }
+ }
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+ if (isVarArg) {
+ // This will point to the next argument passed via stack.
+ int FrameIndex = MFI.CreateFixedObject(Hexagon_PointerSize,
+ HEXAGON_LRFP_SIZE +
+ CCInfo.getNextStackOffset(),
+ true);
+ FuncInfo.setVarArgsFrameIndex(FrameIndex);
+ }
+
+ return Chain;
+ }
+
+ /// Lower VASTART: store the address of the VarArgsFrameIndex slot into the
+ /// memory location given by operand 1. Operand 0 is the incoming chain and
+ /// operand 2 carries the source value used for the store's pointer info.
+ SDValue
+ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+   MachineFunction &MF = DAG.getMachineFunction();
+   auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
+
+   SDValue InChain = Op.getOperand(0);
+   SDValue DestPtr = Op.getOperand(1);
+   const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+   // Value being stored: the address of the first variadic stack slot.
+   SDValue VarArgsAddr =
+       DAG.getFrameIndex(FuncInfo.getVarArgsFrameIndex(), MVT::i32);
+
+   return DAG.getStore(InChain, SDLoc(Op), VarArgsAddr, DestPtr,
+                       MachinePointerInfo(SrcVal));
+ }
+
+ // Creates a SPLAT instruction for a constant value VAL.
+ // v4i8 yields a VSPLATB node and v4i16 a VSPLATH node; for any other type a
+ // null SDValue is returned.
+ static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
+                            SDValue Val) {
+   switch (VT.getSimpleVT().SimpleTy) {
+   case MVT::v4i8:
+     return DAG.getNode(HexagonISD::VSPLATB, dl, VT, Val);
+   case MVT::v4i16:
+     return DAG.getNode(HexagonISD::VSPLATH, dl, VT, Val);
+   default:
+     return SDValue();
+   }
+ }
+
+ /// Return true when sign-extending \p N is considered free: \p N is either a
+ /// truncate whose operand is an AssertSext node, or a load.
+ static bool isSExtFree(SDValue N) {
+   switch (N.getOpcode()) {
+   case ISD::TRUNCATE:
+     // A sign-extend of a truncate of a sign-extend is free.
+     return N.getOperand(0).getOpcode() == ISD::AssertSext;
+   case ISD::LOAD:
+     // We have sign-extended loads.
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// Lower a population count with an i64 result. A constant operand is folded
+ /// into an i64 target constant. Otherwise a HexagonISD::POPCOUNT node
+ /// producing i32 is emitted and zero-extended to i64.
+ SDValue HexagonTargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+   SDLoc dl(Op);
+   SDValue Input = Op.getOperand(0);
+
+   if (ConstantSDNode *ConstInput = dyn_cast<ConstantSDNode>(Input)) {
+     const uint64_t Bits = ConstInput->getZExtValue();
+     return DAG.getTargetConstant(countPopulation(Bits), dl, MVT::i64);
+   }
+
+   SDValue Count32 = DAG.getNode(HexagonISD::POPCOUNT, dl, MVT::i32, Input);
+   return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Count32);
+ }
+
+ /// Custom lowering for SETCC.
+ ///
+ ///  * v2i16 operands are extended to v2i32 (sign- or zero-extended depending
+ ///    on the signedness of the condition code) and compared there, producing
+ ///    v2i1.
+ ///  * Any other vector result type is returned unchanged.
+ ///  * An i8/i16 equality or inequality comparison is redone on sign-extended
+ ///    i32 operands when the right-hand side is a negative constant or when
+ ///    isSExtFree holds for either operand.
+ ///
+ /// In all remaining cases a null SDValue is returned.
+ SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+   SDLoc dl(Op);
+
+   SDValue LHS = Op.getOperand(0);
+   SDValue RHS = Op.getOperand(1);
+   SDValue CondOp = Op.getOperand(2);
+   const ISD::CondCode CC = cast<CondCodeSDNode>(CondOp)->get();
+
+   const EVT ResVT = Op.getValueType();
+   const EVT LHSVT = LHS.getValueType();
+   const EVT RHSVT = RHS.getValueType();
+
+   if (LHSVT == MVT::v2i16) {
+     assert(ISD::isSignedIntSetCC(CC) || ISD::isUnsignedIntSetCC(CC));
+     unsigned ExtOpc = ISD::ZERO_EXTEND;
+     if (ISD::isSignedIntSetCC(CC))
+       ExtOpc = ISD::SIGN_EXTEND;
+     SDValue WideLHS = DAG.getNode(ExtOpc, dl, MVT::v2i32, LHS);
+     SDValue WideRHS = DAG.getNode(ExtOpc, dl, MVT::v2i32, RHS);
+     return DAG.getNode(ISD::SETCC, dl, MVT::v2i1, WideLHS, WideRHS, CondOp);
+   }
+
+   // Treat all other vector types as legal.
+   if (ResVT.isVector())
+     return Op;
+
+   // Equals and not equals should use sign-extend, not zero-extend, since
+   // we can represent small negative values in the compare instructions.
+   // The LLVM default is to use zero-extend arbitrarily in these cases.
+   const bool IsEqualityCC = (CC == ISD::SETEQ) || (CC == ISD::SETNE);
+   const bool RHSIsNarrow = (RHSVT == MVT::i8) || (RHSVT == MVT::i16);
+   const bool LHSIsNarrow = (LHSVT == MVT::i8) || (LHSVT == MVT::i16);
+   if (!(IsEqualityCC && RHSIsNarrow && LHSIsNarrow))
+     return SDValue();
+
+   ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+   const bool RHSIsNegativeConst =
+       RHSConst && RHSConst->getAPIntValue().isNegative();
+   if (!RHSIsNegativeConst && !isSExtFree(LHS) && !isSExtFree(RHS))
+     return SDValue();
+
+   // Compare the sign-extended operands as i32.
+   LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
+   RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
+   return DAG.getNode(ISD::SETCC, dl, ResVT, LHS, RHS, CondOp);
+ }
+
+ /// Custom lowering for VSELECT. Only v2i16 is handled: both value operands
+ /// are zero-extended to v2i32, the select is performed on the wide type and
+ /// the result is truncated back to v2i16. For every other type a null
+ /// SDValue is returned.
+ SDValue
+ HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+   SDValue Pred = Op.getOperand(0);
+   SDValue TrueVal = Op.getOperand(1);
+   SDValue FalseVal = Op.getOperand(2);
+   SDLoc DL(Op);
+
+   if (TrueVal.getValueType() != MVT::v2i16)
+     return SDValue();
+
+   SDValue WideTrue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, TrueVal);
+   SDValue WideFalse = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, FalseVal);
+   SDValue WideSel =
+       DAG.getNode(ISD::VSELECT, DL, MVT::v2i32, Pred, WideTrue, WideFalse);
+   return DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i16, WideSel);
+ }
+
+ // Handle only specific vector loads.
+ //
+ // Only v4i16 is supported; any other type hits llvm_unreachable. A load that
+ // is only 2-byte aligned is split into four halfword loads that are
+ // reassembled into an i64. Any other alignment uses a single i64 load. The
+ // i64 is bitcast back to the requested type and returned together with the
+ // chain of the emitted load(s).
+ //
+ // Each halfword load is given pointer info offset by its byte position
+ // (+0, +2, +4, +6), so every memory operand describes the bytes that the
+ // load really reads.
+ SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+   EVT VT = Op.getValueType();
+   SDLoc DL(Op);
+   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+
+   if (VT != MVT::v4i16)
+     llvm_unreachable("Custom lowering unsupported load");
+
+   SDValue Chain = LoadNode->getChain();
+   SDValue Base = LoadNode->getBasePtr();
+   unsigned Alignment = LoadNode->getAlignment();
+
+   // A plain load is treated as zero-extending for the halfword pieces.
+   ISD::LoadExtType Ext = LoadNode->getExtensionType();
+   if (Ext == ISD::NON_EXTLOAD)
+     Ext = ISD::ZEXTLOAD;
+
+   SDValue Result;
+   SDValue LoadChain;
+
+   if (Alignment == 2) {
+     // Load the halfword at Base + ByteOff, extended to i32.
+     auto LoadHalf = [&](unsigned ByteOff) -> SDValue {
+       SDValue Addr = Base;
+       if (ByteOff != 0) {
+         SDValue Increment = DAG.getConstant(ByteOff, DL, MVT::i32);
+         Addr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
+       }
+       return DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Addr,
+                             LoadNode->getPointerInfo().getWithOffset(ByteOff),
+                             MVT::i16, Alignment,
+                             LoadNode->getMemOperand()->getFlags());
+     };
+
+     SDValue Loads[4];
+     for (unsigned I = 0; I != 4; ++I)
+       Loads[I] = LoadHalf(2 * I);
+
+     // SHL 16, then OR base and base+2.
+     SDValue ShiftAmount = DAG.getConstant(16, DL, MVT::i32);
+     SDValue Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount);
+     SDValue LowWord = DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Loads[0]);
+     // SHL 16, then OR base+4 and base+6.
+     Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount);
+     SDValue HighWord = DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Loads[2]);
+     // Combine to i64. This could be optimised out later if we can
+     // affect reg allocation of this code.
+     Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, HighWord, LowWord);
+     LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             Loads[0].getValue(1), Loads[1].getValue(1),
+                             Loads[2].getValue(1), Loads[3].getValue(1));
+   } else {
+     // Perform default type expansion.
+     Result = DAG.getLoad(MVT::i64, DL, Chain, Base, LoadNode->getPointerInfo(),
+                          LoadNode->getAlignment(),
+                          LoadNode->getMemOperand()->getFlags());
+     LoadChain = Result.getValue(1);
+   }
+
+   Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
+   // Since we pretend to lower a load, we need the original chain
+   // info attached to the result.
+   SDValue Ops[] = { Result, LoadChain };
+
+   return DAG.getMergeValues(Ops, DL);
+ }
+
+/// Lower a ConstantPool node to a target constant-pool node wrapped in either
+/// HexagonISD::AT_PCREL (position-independent code, using the MO_PCREL target
+/// flag) or HexagonISD::CP (otherwise, with no target flag).
+SDValue
+HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  ConstantPoolSDNode *CPN = cast<ConstantPoolSDNode>(Op);
+  EVT ValTy = Op.getValueType();
+  const unsigned Align = CPN->getAlignment();
+  const unsigned Offset = 0;
+
+  const bool IsPIC = isPositionIndependent();
+  const unsigned char Flags = IsPIC ? HexagonII::MO_PCREL : 0;
+
+  // Machine constant-pool entries and plain constants are stored separately
+  // in the node, so pick the matching accessor.
+  SDValue TargetCP =
+      CPN->isMachineConstantPoolEntry()
+          ? DAG.getTargetConstantPool(CPN->getMachineCPVal(), ValTy, Align,
+                                      Offset, Flags)
+          : DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, Offset,
+                                      Flags);
+
+  assert(cast<ConstantPoolSDNode>(TargetCP)->getTargetFlags() == Flags &&
+         "Inconsistent target flag encountered");
+
+  const unsigned WrapperOpc = IsPIC ? HexagonISD::AT_PCREL : HexagonISD::CP;
+  return DAG.getNode(WrapperOpc, SDLoc(Op), ValTy, TargetCP);
+}
+
+/// Lower a JumpTable node to a target jump-table node. Without position
+/// independence it is wrapped in HexagonISD::JT; otherwise it carries the
+/// MO_PCREL flag and is wrapped in HexagonISD::AT_PCREL.
+SDValue
+HexagonTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  const int TableIdx = cast<JumpTableSDNode>(Op)->getIndex();
+  EVT VT = Op.getValueType();
+
+  if (!isPositionIndependent()) {
+    SDValue Table = DAG.getTargetJumpTable(TableIdx, VT);
+    return DAG.getNode(HexagonISD::JT, SDLoc(Op), VT, Table);
+  }
+
+  SDValue Table = DAG.getTargetJumpTable(TableIdx, VT, HexagonII::MO_PCREL);
+  return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), VT, Table);
+}
+
+/// Lower RETURNADDR. The frame info is marked as having its return address
+/// taken. If verifyReturnAddressArgumentIsConstant reports a problem, an empty
+/// SDValue is returned. For depth 0 the return-address register is read (and
+/// registered as a live-in); for a non-zero depth the value is loaded from
+/// offset 4 of the frame address computed by LowerFRAMEADDR.
+SDValue
+HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getFrameInfo().setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  const unsigned Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  if (Depth == 0) {
+    // Return LR, which contains the return address. Mark it an implicit
+    // live-in.
+    const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
+    unsigned Reg = MF.addLiveIn(HRI.getRARegister(), getRegClassFor(MVT::i32));
+    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+  }
+
+  // Load the return address from offset 4 of the requested frame.
+  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+  SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
+  SDValue SlotAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset);
+  return DAG.getLoad(VT, dl, DAG.getEntryNode(), SlotAddr,
+                     MachinePointerInfo());
+}
+
+/// Lower FRAMEADDR. The frame info is marked as having its frame address
+/// taken. The result starts as a copy of the frame register; for each level
+/// of the requested depth, the current address is replaced by the value
+/// loaded from it.
+SDValue
+HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
+
+  const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  const unsigned Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  SDValue Addr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                    HRI.getFrameRegister(), VT);
+  // Follow the chain of frame addresses Depth times.
+  for (unsigned Level = 0; Level != Depth; ++Level)
+    Addr = DAG.getLoad(VT, dl, DAG.getEntryNode(), Addr,
+                       MachinePointerInfo());
+  return Addr;
+}
+
+/// Lower ATOMIC_FENCE to a HexagonISD::BARRIER node that takes the fence's
+/// first operand as its only operand.
+SDValue
+HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const {
+  SDValue InChain = Op.getOperand(0);
+  return DAG.getNode(HexagonISD::BARRIER, SDLoc(Op), MVT::Other, InChain);
+}
+
+/// Lower a GlobalAddress node. Three forms are produced:
+///  - static relocation model: CONST32_GP when the global's base object is in
+///    a small section, CONST32 otherwise;
+///  - global assumed DSO-local: AT_PCREL on a MO_PCREL target address;
+///  - everything else: AT_GOT built from the GOT node, a MO_GOT target address
+///    with zero offset, and the offset passed as a separate constant.
+SDValue
+HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  auto *GAN = cast<GlobalAddressSDNode>(Op);
+  auto *GV = GAN->getGlobal();
+  const int64_t Offset = GAN->getOffset();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  if (HTM.getRelocationModel() == Reloc::Static) {
+    auto &HLOF = *HTM.getObjFileLowering();
+    SDValue Addr = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
+    const GlobalObject *GO = GV->getBaseObject();
+    const bool InSmallSection = GO && HLOF.isGlobalInSmallSection(GO, HTM);
+    if (InSmallSection)
+      return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, Addr);
+    return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, Addr);
+  }
+
+  if (getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) {
+    SDValue Addr = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset,
+                                              HexagonII::MO_PCREL);
+    return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, Addr);
+  }
+
+  // Use GOT index.
+  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+  SDValue Addr = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                            HexagonII::MO_GOT);
+  SDValue OffsetVal = DAG.getConstant(Offset, dl, MVT::i32);
+  return DAG.getNode(HexagonISD::AT_GOT, dl, PtrVT, GOT, Addr, OffsetVal);
+}
+
+// Lower a BlockAddress node. With the static relocation model the target
+// block address is wrapped in CONST32_GP; otherwise a MO_PCREL target block
+// address is wrapped in AT_PCREL.
+SDValue
+HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+
+  const bool IsStatic = HTM.getRelocationModel() == Reloc::Static;
+  if (!IsStatic) {
+    SDValue Addr = DAG.getTargetBlockAddress(BA, PtrVT, 0,
+                                             HexagonII::MO_PCREL);
+    return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, Addr);
+  }
+
+  SDValue Addr = DAG.getTargetBlockAddress(BA, PtrVT);
+  return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, Addr);
+}
+
+/// Produce the GOT address as an AT_PCREL node over the external symbol
+/// HEXAGON_GOT_SYM_NAME (flagged MO_PCREL). Op is only used for its debug
+/// location.
+SDValue
+HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG)
+      const {
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Sym = DAG.getTargetExternalSymbol(HEXAGON_GOT_SYM_NAME, PtrVT,
+                                            HexagonII::MO_PCREL);
+  SDLoc dl(Op);
+  return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, Sym);
+}
+
+/// Emit a HexagonISD::CALL whose callee is the global referenced by GA
+/// (tagged with OperandFlags) and return a copy of ReturnReg taken after the
+/// call, glued to it.
+///
+/// \param Chain        incoming chain for the call.
+/// \param GA           global address node providing the callee and offset.
+/// \param InFlag       optional incoming glue; may be null.
+/// \param PtrVT        type used for the R0 operand and for the result.
+/// \param ReturnReg    register the result is copied from.
+/// \param OperandFlags target flags placed on the callee address.
+SDValue
+HexagonTargetLowering::GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
+      GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT, unsigned ReturnReg,
+      unsigned char OperandFlags) const {
+  SDLoc dl(GA);
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                              GA->getValueType(0),
+                                              GA->getOffset(),
+                                              OperandFlags);
+  // The call operands must appear in this order:
+  //   1. The chain.
+  //   2. The callee, which here is the global address value.
+  //   3. Registers live into the call. Only R0 is listed, as there is just
+  //      one argument to pass.
+  //   4. The incoming glue, if there is one.
+  SDValue ArgReg = DAG.getRegister(Hexagon::R0, PtrVT);
+
+  if (!InFlag) {
+    SDValue CallOps[] = { Chain, Callee, ArgReg };
+    Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, CallOps);
+  } else {
+    SDValue CallOps[] = { Chain, Callee, ArgReg, *InFlag };
+    Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, CallOps);
+  }
+
+  // Inform MFI that function has calls.
+  MFI.setAdjustsStack(true);
+
+  // Result 1 of the call node is its glue output.
+  SDValue OutGlue = Chain.getValue(1);
+  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, OutGlue);
+}
+
+//
+// Lower using the initial executable model for TLS addresses
+//
+// The variable's thread-pointer-relative offset is loaded from memory and
+// added to the thread pointer (UGP). In position-independent code the load
+// address is the GOT pointer plus a MO_IEGOT symbol; otherwise it is a MO_IE
+// symbol.
+SDValue
+HexagonTargetLowering::LowerToTLSInitialExecModel(GlobalAddressSDNode *GA,
+      SelectionDAG &DAG) const {
+  SDLoc dl(GA);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  const int64_t Offset = GA->getOffset();
+
+  // Get the thread pointer.
+  SDValue ThreadPtr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), dl, Hexagon::UGP, PtrVT);
+
+  const bool IsPIC = isPositionIndependent();
+  const unsigned char Flags = IsPIC ? HexagonII::MO_IEGOT : HexagonII::MO_IE;
+
+  // First generate the TLS symbol address.
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, PtrVT, Offset,
+                                           Flags);
+  SDValue OffsetAddr = DAG.getNode(HexagonISD::CONST32, dl, PtrVT, TGA);
+
+  if (IsPIC) {
+    // Generate the GOT pointer in case of position independent code, and add
+    // the TLS symbol address to it. This gives a GOT-relative relocation for
+    // the symbol.
+    SDValue GOT = LowerGLOBAL_OFFSET_TABLE(OffsetAddr, DAG);
+    OffsetAddr = DAG.getNode(ISD::ADD, dl, PtrVT, GOT, OffsetAddr);
+  }
+
+  // Load the offset value for the TLS symbol. This offset is relative to the
+  // thread pointer.
+  SDValue TPOffset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), OffsetAddr,
+                                 MachinePointerInfo());
+
+  // Address of the thread local variable is the add of thread
+  // pointer and the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPtr, TPOffset);
+}
+
+//
+// Lower using the local executable model for TLS addresses
+//
+// The result is the thread pointer (UGP) plus a CONST32 of the symbol tagged
+// with MO_TPREL.
+SDValue
+HexagonTargetLowering::LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
+      SelectionDAG &DAG) const {
+  SDLoc dl(GA);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  const int64_t Offset = GA->getOffset();
+
+  // Get the thread pointer.
+  SDValue ThreadPtr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), dl, Hexagon::UGP, PtrVT);
+
+  // Generate the TLS symbol address.
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, PtrVT, Offset,
+                                           HexagonII::MO_TPREL);
+  SDValue TPOffset = DAG.getNode(HexagonISD::CONST32, dl, PtrVT, TGA);
+
+  // Address of the thread local variable is the add of thread
+  // pointer and the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPtr, TPOffset);
+}
+
+//
+// Lower using the general dynamic model for TLS addresses
+//
+// The sum of the GOT pointer and the MO_GDGOT symbol is copied into R0, and
+// GetDynamicTLSAddr then emits the call (MO_GDPLT) and reads the result back
+// from R0.
+SDValue
+HexagonTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+      SelectionDAG &DAG) const {
+  SDLoc dl(GA);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  const int64_t Offset = GA->getOffset();
+
+  // First generate the TLS symbol address.
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, PtrVT, Offset,
+                                           HexagonII::MO_GDGOT);
+
+  // Then, generate the GOT pointer.
+  SDValue GOT = LowerGLOBAL_OFFSET_TABLE(TGA, DAG);
+
+  // Add the TLS symbol and the GOT pointer.
+  SDValue Sym = DAG.getNode(HexagonISD::CONST32, dl, PtrVT, TGA);
+  SDValue CallArg = DAG.getNode(ISD::ADD, dl, PtrVT, GOT, Sym);
+
+  // Copy over the argument to R0. An empty glue value is passed in; the glue
+  // produced by the copy (result 1) is handed on to the call.
+  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, Hexagon::R0,
+                                   CallArg, SDValue());
+  SDValue InFlag = Chain.getValue(1);
+
+  return GetDynamicTLSAddr(DAG, Chain, GA, &InFlag, PtrVT,
+                           Hexagon::R0, HexagonII::MO_GDPLT);
+}
+
+//
+// Lower TLS addresses.
+//
+// Dispatches on the TLS model chosen for the global. For now, among the
+// dynamic models only general dynamic is supported, so local dynamic is
+// lowered the same way.
+//
+SDValue
+HexagonTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+      SelectionDAG &DAG) const {
+  auto *GA = cast<GlobalAddressSDNode>(Op);
+  const TLSModel::Model Model = HTM.getTLSModel(GA->getGlobal());
+
+  switch (Model) {
+    case TLSModel::LocalExec:
+      return LowerToTLSLocalExecModel(GA, DAG);
+    case TLSModel::InitialExec:
+      return LowerToTLSInitialExecModel(GA, DAG);
+    case TLSModel::LocalDynamic:
+    case TLSModel::GeneralDynamic:
+      return LowerToTLSGeneralDynamicModel(GA, DAG);
+  }
+  llvm_unreachable("Bogus TLS model");
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
+ const HexagonSubtarget &ST)
+ : TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)),
+ Subtarget(ST) {
+ bool IsV4 = !Subtarget.hasV5TOps();
+ auto &HRI = *Subtarget.getRegisterInfo();
+ bool UseHVX = Subtarget.useHVXOps();
+ bool UseHVXSgl = Subtarget.useHVXSglOps();
+ bool UseHVXDbl = Subtarget.useHVXDblOps();
+
+ setPrefLoopAlignment(4);
+ setPrefFunctionAlignment(4);
+ setMinFunctionAlignment(2);
+ setStackPointerRegisterToSaveRestore(HRI.getStackRegister());
+
+ setMaxAtomicSizeInBitsSupported(64);
+ setMinCmpXchgSizeInBits(32);
+
+ if (EnableHexSDNodeSched)
+ setSchedulingPreference(Sched::VLIW);
+ else
+ setSchedulingPreference(Sched::Source);
+
+ // Limits for inline expansion of memcpy/memmove
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyCL;
+ MaxStoresPerMemcpyOptSize = MaxStoresPerMemcpyOptSizeCL;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveCL;
+ MaxStoresPerMemmoveOptSize = MaxStoresPerMemmoveOptSizeCL;
+ MaxStoresPerMemset = MaxStoresPerMemsetCL;
+ MaxStoresPerMemsetOptSize = MaxStoresPerMemsetOptSizeCL;
+
+ //
+ // Set up register classes.
+ //
+
+ addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass);
+ addRegisterClass(MVT::v2i1, &Hexagon::PredRegsRegClass); // bbbbaaaa
+ addRegisterClass(MVT::v4i1, &Hexagon::PredRegsRegClass); // ddccbbaa
+ addRegisterClass(MVT::v8i1, &Hexagon::PredRegsRegClass); // hgfedcba
+ addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
+ addRegisterClass(MVT::v4i8, &Hexagon::IntRegsRegClass);
+ addRegisterClass(MVT::v2i16, &Hexagon::IntRegsRegClass);
+ addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
+ addRegisterClass(MVT::v8i8, &Hexagon::DoubleRegsRegClass);
+ addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
+ addRegisterClass(MVT::v2i32, &Hexagon::DoubleRegsRegClass);
+
+ if (Subtarget.hasV5TOps()) {
+ addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
+ addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
+ }
+
+ if (Subtarget.hasV60TOps()) {
+ if (Subtarget.useHVXSglOps()) {
+ addRegisterClass(MVT::v64i8, &Hexagon::VectorRegsRegClass);
+ addRegisterClass(MVT::v32i16, &Hexagon::VectorRegsRegClass);
+ addRegisterClass(MVT::v16i32, &Hexagon::VectorRegsRegClass);
+ addRegisterClass(MVT::v8i64, &Hexagon::VectorRegsRegClass);
+ addRegisterClass(MVT::v128i8, &Hexagon::VecDblRegsRegClass);
+ addRegisterClass(MVT::v64i16, &Hexagon::VecDblRegsRegClass);
+ addRegisterClass(MVT::v32i32, &Hexagon::VecDblRegsRegClass);
+ addRegisterClass(MVT::v16i64, &Hexagon::VecDblRegsRegClass);
+ addRegisterClass(MVT::v512i1, &Hexagon::VecPredRegsRegClass);
+ } else if (Subtarget.useHVXDblOps()) {
+ addRegisterClass(MVT::v128i8, &Hexagon::VectorRegs128BRegClass);
+ addRegisterClass(MVT::v64i16, &Hexagon::VectorRegs128BRegClass);
+ addRegisterClass(MVT::v32i32, &Hexagon::VectorRegs128BRegClass);
+ addRegisterClass(MVT::v16i64, &Hexagon::VectorRegs128BRegClass);
+ addRegisterClass(MVT::v256i8, &Hexagon::VecDblRegs128BRegClass);
+ addRegisterClass(MVT::v128i16, &Hexagon::VecDblRegs128BRegClass);
+ addRegisterClass(MVT::v64i32, &Hexagon::VecDblRegs128BRegClass);
+ addRegisterClass(MVT::v32i64, &Hexagon::VecDblRegs128BRegClass);
+ addRegisterClass(MVT::v1024i1, &Hexagon::VecPredRegs128BRegClass);
+ }
+ }
+
+ //
+ // Handling of scalar operations.
+ //
+ // All operations default to "legal", except:
+ // - indexed loads and stores (pre-/post-incremented),
+ // - ANY_EXTEND_VECTOR_INREG, ATOMIC_CMP_SWAP_WITH_SUCCESS, CONCAT_VECTORS,
+ // ConstantFP, DEBUGTRAP, FCEIL, FCOPYSIGN, FEXP, FEXP2, FFLOOR, FGETSIGN,
+ // FLOG, FLOG2, FLOG10, FMAXNUM, FMINNUM, FNEARBYINT, FRINT, FROUND, TRAP,
+ // FTRUNC, PREFETCH, SIGN_EXTEND_VECTOR_INREG, ZERO_EXTEND_VECTOR_INREG,
+ // which default to "expand" for at least one type.
+
+ // Misc operations.
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal); // Default: expand
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal); // Default: expand
+
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+ setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+ // Custom legalize GlobalAddress nodes into CONST32.
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i8, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+
+ // Hexagon needs to optimize cases with negative constants.
+ setOperationAction(ISD::SETCC, MVT::i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::i16, Custom);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
+ if (EmitJumpTables)
+ setMinimumJumpTableEntries(MinimumJumpTables);
+ else
+ setMinimumJumpTableEntries(std::numeric_limits<int>::max());
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+ // Hexagon has instructions for add/sub with carry. The problem with
+ // modeling these instructions is that they produce 2 results: Rdd and Px.
+ // To model the update of Px, we will have to use Defs[p0..p3] which will
+ // cause any predicate live range to spill. So, we pretend we don't have
+ // these instructions.
+ setOperationAction(ISD::ADDE, MVT::i8, Expand);
+ setOperationAction(ISD::ADDE, MVT::i16, Expand);
+ setOperationAction(ISD::ADDE, MVT::i32, Expand);
+ setOperationAction(ISD::ADDE, MVT::i64, Expand);
+ setOperationAction(ISD::SUBE, MVT::i8, Expand);
+ setOperationAction(ISD::SUBE, MVT::i16, Expand);
+ setOperationAction(ISD::SUBE, MVT::i32, Expand);
+ setOperationAction(ISD::SUBE, MVT::i64, Expand);
+ setOperationAction(ISD::ADDC, MVT::i8, Expand);
+ setOperationAction(ISD::ADDC, MVT::i16, Expand);
+ setOperationAction(ISD::ADDC, MVT::i32, Expand);
+ setOperationAction(ISD::ADDC, MVT::i64, Expand);
+ setOperationAction(ISD::SUBC, MVT::i8, Expand);
+ setOperationAction(ISD::SUBC, MVT::i16, Expand);
+ setOperationAction(ISD::SUBC, MVT::i32, Expand);
+ setOperationAction(ISD::SUBC, MVT::i64, Expand);
+
+ // Only add and sub that detect overflow are the saturating ones.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::UADDO, VT, Expand);
+ setOperationAction(ISD::SADDO, VT, Expand);
+ setOperationAction(ISD::USUBO, VT, Expand);
+ setOperationAction(ISD::SSUBO, VT, Expand);
+ }
+
+ setOperationAction(ISD::CTLZ, MVT::i8, Promote);
+ setOperationAction(ISD::CTLZ, MVT::i16, Promote);
+ setOperationAction(ISD::CTTZ, MVT::i8, Promote);
+ setOperationAction(ISD::CTTZ, MVT::i16, Promote);
+
+ // In V5, popcount can count # of 1s in i64 but returns i32.
+ // On V4 it will be expanded (set later).
+ setOperationAction(ISD::CTPOP, MVT::i8, Promote);
+ setOperationAction(ISD::CTPOP, MVT::i16, Promote);
+ setOperationAction(ISD::CTPOP, MVT::i32, Promote);
+ setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+
+ // We custom lower i64 to i64 mul, so that it is not considered as a legal
+ // operation. There is a pattern that will match i64 mul and transform it
+ // to a series of instructions.
+ setOperationAction(ISD::MUL, MVT::i64, Expand);
+
+ for (unsigned IntExpOp :
+ { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
+ ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
+ ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
+ ISD::SMUL_LOHI, ISD::UMUL_LOHI }) {
+ setOperationAction(IntExpOp, MVT::i32, Expand);
+ setOperationAction(IntExpOp, MVT::i64, Expand);
+ }
+
+ for (unsigned FPExpOp :
+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FSINCOS,
+ ISD::FPOW, ISD::FCOPYSIGN}) {
+ setOperationAction(FPExpOp, MVT::f32, Expand);
+ setOperationAction(FPExpOp, MVT::f64, Expand);
+ }
+
+ // No extending loads from i32.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+ }
+ // Turn FP truncstore into trunc + store.
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ // Turn FP extload into load/fpextend.
+ for (MVT VT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+
+ // Expand BR_CC and SELECT_CC for all integer and fp types.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::BR_CC, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ }
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setOperationAction(ISD::BR_CC, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ }
+ setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+
+ //
+ // Handling of vector operations.
+ //
+
+ // Custom lower v4i16 load only. Let v4i16 store to be
+ // promoted for now.
+ promoteLdStType(MVT::v4i8, MVT::i32);
+ promoteLdStType(MVT::v2i16, MVT::i32);
+ promoteLdStType(MVT::v8i8, MVT::i64);
+ promoteLdStType(MVT::v2i32, MVT::i64);
+
+ setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::i64);
+ AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64);
+
+ // Set the action for vector operations to "expand", then override it with
+ // either "custom" or "legal" for specific cases.
+ static const unsigned VectExpOps[] = {
+ // Integer arithmetic:
+ ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV,
+ ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::ADDC,
+ ISD::SUBC, ISD::SADDO, ISD::UADDO, ISD::SSUBO, ISD::USUBO,
+ ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ // Logical/bit:
+ ISD::AND, ISD::OR, ISD::XOR, ISD::ROTL, ISD::ROTR,
+ ISD::CTPOP, ISD::CTLZ, ISD::CTTZ,
+ // Floating point arithmetic/math functions:
+ ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FMA, ISD::FDIV,
+ ISD::FREM, ISD::FNEG, ISD::FABS, ISD::FSQRT, ISD::FSIN,
+ ISD::FCOS, ISD::FPOWI, ISD::FPOW, ISD::FLOG, ISD::FLOG2,
+ ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FCEIL, ISD::FTRUNC,
+ ISD::FRINT, ISD::FNEARBYINT, ISD::FROUND, ISD::FFLOOR,
+ ISD::FMINNUM, ISD::FMAXNUM, ISD::FSINCOS,
+ // Misc:
+ ISD::BR_CC, ISD::SELECT_CC, ISD::ConstantPool,
+ // Vector:
+ ISD::BUILD_VECTOR, ISD::SCALAR_TO_VECTOR,
+ ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT,
+ ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
+ ISD::CONCAT_VECTORS, ISD::VECTOR_SHUFFLE
+ };
+
+ for (MVT VT : MVT::vector_valuetypes()) {
+ for (unsigned VectExpOp : VectExpOps)
+ setOperationAction(VectExpOp, VT, Expand);
+
+ // Expand all extending loads and truncating stores:
+ for (MVT TargetVT : MVT::vector_valuetypes()) {
+ if (TargetVT == VT)
+ continue;
+ setLoadExtAction(ISD::EXTLOAD, TargetVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, TargetVT, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, TargetVT, VT, Expand);
+ setTruncStoreAction(VT, TargetVT, Expand);
+ }
+
+ // Normalize all inputs to SELECT to be vectors of i32.
+ if (VT.getVectorElementType() != MVT::i32) {
+ MVT VT32 = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType(ISD::SELECT, VT, VT32);
+ }
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ }
+
+ // Types natively supported:
+ for (MVT NativeVT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v32i1, MVT::v64i1,
+ MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v1i32,
+ MVT::v2i32, MVT::v1i64}) {
+ setOperationAction(ISD::BUILD_VECTOR, NativeVT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, NativeVT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, NativeVT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, NativeVT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, NativeVT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, NativeVT, Custom);
+
+ setOperationAction(ISD::ADD, NativeVT, Legal);
+ setOperationAction(ISD::SUB, NativeVT, Legal);
+ setOperationAction(ISD::MUL, NativeVT, Legal);
+ setOperationAction(ISD::AND, NativeVT, Legal);
+ setOperationAction(ISD::OR, NativeVT, Legal);
+ setOperationAction(ISD::XOR, NativeVT, Legal);
+ }
+
+ setOperationAction(ISD::SETCC, MVT::v2i16, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+
+ if (UseHVX) {
+ if (UseHVXSgl) {
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64, Custom);
+ // We try to generate the vpack{e/o} instructions. If we fail
+ // we fall back upon ExpandOp.
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
+ } else if (UseHVXDbl) {
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64, Custom);
+ // We try to generate the vpack{e/o} instructions. If we fail
+ // we fall back upon ExpandOp.
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v128i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v128i8, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64i16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
+ } else {
+ llvm_unreachable("Unrecognized HVX mode");
+ }
+ }
+ // Subtarget-specific operation actions.
+ //
+ if (Subtarget.hasV5TOps()) {
+ setOperationAction(ISD::FMA, MVT::f64, Expand);
+ setOperationAction(ISD::FADD, MVT::f64, Expand);
+ setOperationAction(ISD::FSUB, MVT::f64, Expand);
+ setOperationAction(ISD::FMUL, MVT::f64, Expand);
+
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+ } else { // V4
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+ setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+
+ setOperationAction(ISD::CTPOP, MVT::i8, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i16, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ // Expand these operations for both f32 and f64:
+ for (unsigned FPExpOpV4 :
+ {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FABS, ISD::FNEG, ISD::FMA}) {
+ setOperationAction(FPExpOpV4, MVT::f32, Expand);
+ setOperationAction(FPExpOpV4, MVT::f64, Expand);
+ }
+
+ for (ISD::CondCode FPExpCCV4 :
+ {ISD::SETOEQ, ISD::SETOGT, ISD::SETOLT, ISD::SETOGE, ISD::SETOLE,
+ ISD::SETUO, ISD::SETO}) {
+ setCondCodeAction(FPExpCCV4, MVT::f32, Expand);
+ setCondCodeAction(FPExpCCV4, MVT::f64, Expand);
+ }
+ }
+
+ // Handling of indexed loads/stores: default is "expand".
+ //
+ for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
+ setIndexedLoadAction(ISD::POST_INC, VT, Legal);
+ setIndexedStoreAction(ISD::POST_INC, VT, Legal);
+ }
+
+ if (UseHVXSgl) {
+ for (MVT VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
+ MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64}) {
+ setIndexedLoadAction(ISD::POST_INC, VT, Legal);
+ setIndexedStoreAction(ISD::POST_INC, VT, Legal);
+ }
+ } else if (UseHVXDbl) {
+ for (MVT VT : {MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64,
+ MVT::v256i8, MVT::v128i16, MVT::v64i32, MVT::v32i64}) {
+ setIndexedLoadAction(ISD::POST_INC, VT, Legal);
+ setIndexedStoreAction(ISD::POST_INC, VT, Legal);
+ }
+ }
+
+ computeRegisterProperties(&HRI);
+
+ //
+ // Library calls for unsupported operations
+ //
+ bool FastMath = EnableFastMath;
+
+ setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3");
+ setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3");
+ setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3");
+ setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3");
+ setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3");
+ setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3");
+ setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3");
+ setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3");
+
+ setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf");
+ setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf");
+ setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti");
+ setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti");
+ setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti");
+ setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti");
+
+ if (IsV4) {
+ // Handle single-precision floating point operations on V4.
+ if (FastMath) {
+ setLibcallName(RTLIB::ADD_F32, "__hexagon_fast_addsf3");
+ setLibcallName(RTLIB::SUB_F32, "__hexagon_fast_subsf3");
+ setLibcallName(RTLIB::MUL_F32, "__hexagon_fast_mulsf3");
+ setLibcallName(RTLIB::OGT_F32, "__hexagon_fast_gtsf2");
+ setLibcallName(RTLIB::OLT_F32, "__hexagon_fast_ltsf2");
+ // Double-precision compares.
+ setLibcallName(RTLIB::OGT_F64, "__hexagon_fast_gtdf2");
+ setLibcallName(RTLIB::OLT_F64, "__hexagon_fast_ltdf2");
+ } else {
+ setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
+ setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
+ setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
+ setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
+ setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
+ // Double-precision compares.
+ setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
+ setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
+ }
+ }
+
+ // This is the only fast library function for sqrtd.
+ if (FastMath)
+ setLibcallName(RTLIB::SQRT_F64, "__hexagon_fast2_sqrtdf2");
+
+ // Prefix is: nothing for "slow-math",
+ // "fast2_" for V4 fast-math and V5+ fast-math double-precision
+ // (actually, keep fast-math and fast-math2 separate for now)
+ if (FastMath) {
+ setLibcallName(RTLIB::ADD_F64, "__hexagon_fast_adddf3");
+ setLibcallName(RTLIB::SUB_F64, "__hexagon_fast_subdf3");
+ setLibcallName(RTLIB::MUL_F64, "__hexagon_fast_muldf3");
+ setLibcallName(RTLIB::DIV_F64, "__hexagon_fast_divdf3");
+ // Calling __hexagon_fast2_divsf3 with fast-math on V5 (ok).
+ setLibcallName(RTLIB::DIV_F32, "__hexagon_fast_divsf3");
+ } else {
+ setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
+ setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3");
+ setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
+ setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3");
+ setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
+ }
+
+ if (Subtarget.hasV5TOps()) {
+ if (FastMath)
+ setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf");
+ else
+ setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf");
+ } else {
+ // V4
+ setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
+ setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
+ setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
+ setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
+ setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
+ setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
+ setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
+ setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
+ setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
+ setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
+ setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
+ setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
+ setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
+ setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
+ setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
+ setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
+ setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2");
+ setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2");
+ setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
+ setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
+ setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
+ setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
+ setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
+ setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
+ setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
+ setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
+ setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2");
+ setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2");
+ setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2");
+ setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2");
+ }
+
+ // These cause problems when the shift amount is non-constant.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+}
+
+/// Returns the printable name of a Hexagon-specific SelectionDAG opcode.
+///
+/// \param Opcode  the node opcode, interpreted as a HexagonISD::NodeType.
+/// \returns the "HexagonISD::<NAME>" string for the opcodes listed below, or
+///          nullptr for OP_END and for any opcode that has no case here.
+const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((HexagonISD::NodeType)Opcode) {
+  case HexagonISD::ALLOCA: return "HexagonISD::ALLOCA";
+  case HexagonISD::AT_GOT: return "HexagonISD::AT_GOT";
+  case HexagonISD::AT_PCREL: return "HexagonISD::AT_PCREL";
+  case HexagonISD::BARRIER: return "HexagonISD::BARRIER";
+  case HexagonISD::CALL: return "HexagonISD::CALL";
+  case HexagonISD::CALLnr: return "HexagonISD::CALLnr";
+  case HexagonISD::CALLR: return "HexagonISD::CALLR";
+  case HexagonISD::COMBINE: return "HexagonISD::COMBINE";
+  case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP";
+  case HexagonISD::CONST32: return "HexagonISD::CONST32";
+  case HexagonISD::CP: return "HexagonISD::CP";
+  case HexagonISD::DCFETCH: return "HexagonISD::DCFETCH";
+  case HexagonISD::EH_RETURN: return "HexagonISD::EH_RETURN";
+  case HexagonISD::EXTRACTU: return "HexagonISD::EXTRACTU";
+  case HexagonISD::EXTRACTURP: return "HexagonISD::EXTRACTURP";
+  case HexagonISD::INSERT: return "HexagonISD::INSERT";
+  case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP";
+  case HexagonISD::JT: return "HexagonISD::JT";
+  case HexagonISD::PACKHL: return "HexagonISD::PACKHL";
+  case HexagonISD::POPCOUNT: return "HexagonISD::POPCOUNT";
+  case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG";
+  case HexagonISD::SHUFFEB: return "HexagonISD::SHUFFEB";
+  case HexagonISD::SHUFFEH: return "HexagonISD::SHUFFEH";
+  case HexagonISD::SHUFFOB: return "HexagonISD::SHUFFOB";
+  case HexagonISD::SHUFFOH: return "HexagonISD::SHUFFOH";
+  case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN";
+  case HexagonISD::VCMPBEQ: return "HexagonISD::VCMPBEQ";
+  case HexagonISD::VCMPBGT: return "HexagonISD::VCMPBGT";
+  case HexagonISD::VCMPBGTU: return "HexagonISD::VCMPBGTU";
+  case HexagonISD::VCMPHEQ: return "HexagonISD::VCMPHEQ";
+  case HexagonISD::VCMPHGT: return "HexagonISD::VCMPHGT";
+  case HexagonISD::VCMPHGTU: return "HexagonISD::VCMPHGTU";
+  case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ";
+  case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT";
+  case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU";
+  case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE";
+  case HexagonISD::VPACK: return "HexagonISD::VPACK";
+  case HexagonISD::VSHLH: return "HexagonISD::VSHLH";
+  case HexagonISD::VSHLW: return "HexagonISD::VSHLW";
+  // The string must spell the enumerator exactly (it previously read
+  // "VSPLTB"), otherwise DAG dumps show a name that cannot be grepped for.
+  case HexagonISD::VSPLATB: return "HexagonISD::VSPLATB";
+  case HexagonISD::VSPLATH: return "HexagonISD::VSPLATH";
+  case HexagonISD::VSRAH: return "HexagonISD::VSRAH";
+  case HexagonISD::VSRAW: return "HexagonISD::VSRAW";
+  case HexagonISD::VSRLH: return "HexagonISD::VSRLH";
+  case HexagonISD::VSRLW: return "HexagonISD::VSRLW";
+  case HexagonISD::VSXTBH: return "HexagonISD::VSXTBH";
+  case HexagonISD::VSXTBW: return "HexagonISD::VSXTBW";
+  case HexagonISD::OP_END: break;
+  }
+  return nullptr;
+}
+
+/// IR-type flavour of the query: a truncate from Ty1 to Ty2 is reported as
+/// free only when it narrows i64 to i32. Types that do not map onto a simple
+/// value type are never reported as free.
+bool HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  const EVT SrcVT = EVT::getEVT(Ty1);
+  const EVT DstVT = EVT::getEVT(Ty2);
+  if (SrcVT.isSimple() && DstVT.isSimple())
+    return SrcVT.getSimpleVT() == MVT::i64 && DstVT.getSimpleVT() == MVT::i32;
+  return false;
+}
+
+/// Value-type flavour of the query: a truncate from VT1 to VT2 is reported as
+/// free only for the simple types i64 -> i32.
+bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  const bool BothSimple = VT1.isSimple() && VT2.isSimple();
+  return BothSimple && VT1.getSimpleVT() == MVT::i64 &&
+         VT2.getSimpleVT() == MVT::i32;
+}
+
+/// Reports a fused multiply-add as preferable exactly when ISD::FMA is legal
+/// or custom-lowered for VT.
+bool HexagonTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  const bool HasUsableFMA = isOperationLegalOrCustom(ISD::FMA, VT);
+  return HasUsableFMA;
+}
+
+// Decide whether a BUILD_VECTOR should be expanded using shuffles.
+// Hexagon vector shuffles operate on byte or halfword elements only, so any
+// other element width is rejected before deferring to the generic heuristic.
+bool HexagonTargetLowering::shouldExpandBuildVectorWithShuffles(EVT VT,
+      unsigned DefinedValues) const {
+  const unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
+  const bool ShuffleableElt = (EltWidth == 8) || (EltWidth == 16);
+  if (!ShuffleableElt)
+    return false;
+
+  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
+}
+
+// Classify a shuffle mask as selecting every even element (0, 2, 4, ...),
+// every odd element (1, 3, 5, ...), or neither. The whole mask has to follow
+// the stride-2 sequence for a pattern to be reported.
+static StridedLoadKind isStridedLoad(const ArrayRef<int> &Mask) {
+  // Follow the mask from its first entry for as long as each index is exactly
+  // two more than the previous one, starting at First. Returns the last index
+  // that matched, or First - 2 when not even the first entry matched.
+  auto LastOfStride2Run = [&Mask](int First) {
+    int Last = First - 2;
+    for (int Idx : Mask) {
+      if ((Idx - Last) != 2)
+        break;
+      Last = Idx;
+    }
+    return Last;
+  };
+
+  // One past the largest index a full-length stride-2 run can reach.
+  const int Span = (int)(Mask.size() * 2);
+
+  if (LastOfStride2Run(0) == Span - 2)
+    return StridedLoadKind::Even;
+  if (LastOfStride2Run(1) == Span - 1)
+    return StridedLoadKind::Odd;
+
+  return StridedLoadKind::NoPattern;
+}
+
+/// Without HVX every shuffle mask is accepted; with HVX only masks that
+/// isStridedLoad recognises as an even or odd stride pattern are legal.
+bool HexagonTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                                               EVT VT) const {
+  if (!Subtarget.useHVXOps())
+    return true;
+  const StridedLoadKind Kind = isStridedLoad(Mask);
+  return Kind != StridedLoadKind::NoPattern;
+}
+
+// Lower a vector shuffle (V1, V2, V3). V1 and V2 are the two vectors
+// to select data from, V3 is the permutation.
+//
+// Two shapes are handled here:
+//  * splat shuffles, which become a createSplat of a scalar, and
+//  * (HVX only) full-width even/odd stride masks, which become a VPACK node.
+// Everything else returns an empty SDValue so that the generic expansion
+// takes over.
+SDValue
+HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
+ const {
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ bool UseHVX = Subtarget.useHVXOps();
+
+ // NOTE(review): V2 is not read again after this assignment; the HVX path
+ // below re-reads the operands directly from Op.
+ if (V2.isUndef())
+ V2 = V1;
+
+ if (SVN->isSplat()) {
+ int Lane = SVN->getSplatIndex();
+ // A splat index of -1 is treated as lane 0.
+ if (Lane == -1) Lane = 0;
+
+ // Test if V1 is a SCALAR_TO_VECTOR.
+ if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return createSplat(DAG, dl, VT, V1.getOperand(0));
+
+ // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
+ // (and probably will turn into a SCALAR_TO_VECTOR once legalization
+ // reaches it).
+ if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
+ !isa<ConstantSDNode>(V1.getOperand(0))) {
+ // Equivalent means: every operand after the first one is undef.
+ bool IsScalarToVector = true;
+ for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) {
+ if (!V1.getOperand(i).isUndef()) {
+ IsScalarToVector = false;
+ break;
+ }
+ }
+ if (IsScalarToVector)
+ return createSplat(DAG, dl, VT, V1.getOperand(0));
+ }
+ // NOTE(review): this splats the lane *index* as an i32 constant, whereas
+ // the branches above splat the scalar held in the lane. Confirm that this
+ // is what is intended for the remaining splat cases.
+ return createSplat(DAG, dl, VT, DAG.getConstant(Lane, dl, MVT::i32));
+ }
+
+ if (UseHVX) {
+ ArrayRef<int> Mask = SVN->getMask();
+ size_t MaskLen = Mask.size();
+ int ElemSizeInBits = VT.getScalarSizeInBits();
+ // Only results that are exactly 64 bytes (single mode) or 128 bytes
+ // (double mode) wide are turned into VPACK.
+ if ((Subtarget.useHVXSglOps() && (ElemSizeInBits * MaskLen) == 64 * 8) ||
+ (Subtarget.useHVXDblOps() && (ElemSizeInBits * MaskLen) == 128 * 8)) {
+ // Classify the mask as an even or odd stride pattern. The enumerator value
+ // is passed on as the third VPACK operand (the original note says 1 for
+ // odd and 2 for even - TODO confirm against StridedLoadKind).
+ StridedLoadKind Pattern = isStridedLoad(Mask);
+
+ if (Pattern == StridedLoadKind::NoPattern)
+ return SDValue();
+
+ SDValue Vec0 = Op.getOperand(0);
+ SDValue Vec1 = Op.getOperand(1);
+ SDValue StridePattern = DAG.getConstant(Pattern, dl, MVT::i32);
+ // The sources are passed in swapped order: Vec1 first, then Vec0.
+ SDValue Ops[] = { Vec1, Vec0, StridePattern };
+ return DAG.getNode(HexagonISD::VPACK, dl, VT, Ops);
+ }
+ // We used to assert in the "else" part here, but that is bad for Halide
+ // Halide creates intermediate double registers by interleaving two
+ // concatenated vector registers. The interleaving requires vector_shuffle
+ // nodes and we shouldn't barf on a double register result of a
+ // vector_shuffle because it is most likely an intermediate result.
+ }
+ // FIXME: We need to support more general vector shuffles. See
+ // below the comment from the ARM backend that deals in the general
+ // case with the vector shuffles. For now, let expand handle these.
+ return SDValue();
+
+ // (Leftover from the ARM backend, kept as a pointer for future work; no
+ // code follows it.)
+ // If the shuffle is not directly supported and it has 4 elements, use
+ // the PerfectShuffle-generated table to synthesize it from other shuffles.
+}
+
+// Report true when every operand of the BUILD_VECTOR is the same SDValue as
+// its first operand, i.e. one base element is repeated throughout.
+static bool isCommonSplatElement(BuildVectorSDNode *BVN) {
+  const unsigned Count = BVN->getNumOperands();
+  const SDValue First = BVN->getOperand(0);
+
+  // Advance past the leading run of operands equal to the first one; the
+  // vector is a common splat exactly when that run covers all operands.
+  unsigned Pos = 1;
+  while (Pos != Count && BVN->getOperand(Pos) == First)
+    ++Pos;
+  return Pos == Count;
+}
+
+// Lower a vector shift. Try to convert
+// <VT> = SHL/SRA/SRL <VT> by <VT> to Hexagon specific
+// <VT> = SHL/SRA/SRL <VT> by <IT/i32>.
+//
+// The conversion applies only when the shift *amount* (operand 1) is a
+// BUILD_VECTOR repeating a single element, and only for v4i16 and v2i32.
+// In every other case an empty SDValue is returned so the shift is handled
+// by the default legalization.
+SDValue
+HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Vec = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  // Shifts are not commutative, so only the amount operand may be replaced by
+  // its splatted scalar. A splat in operand 0 (the value being shifted) must
+  // not be swapped into the amount position: that would compute
+  // "Amt shifted by splat" instead of "splat shifted by Amt".
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Amt.getNode());
+  if (!BVN || !isCommonSplatElement(BVN))
+    return SDValue();
+
+  SDValue CommonSplat = BVN->getOperand(0);
+  SDValue Result;
+
+  if (VT.getSimpleVT() == MVT::v4i16) {
+    // Halfword element shifts.
+    switch (Op.getOpcode()) {
+    case ISD::SRA:
+      Result = DAG.getNode(HexagonISD::VSRAH, dl, VT, Vec, CommonSplat);
+      break;
+    case ISD::SHL:
+      Result = DAG.getNode(HexagonISD::VSHLH, dl, VT, Vec, CommonSplat);
+      break;
+    case ISD::SRL:
+      Result = DAG.getNode(HexagonISD::VSRLH, dl, VT, Vec, CommonSplat);
+      break;
+    default:
+      return SDValue();
+    }
+  } else if (VT.getSimpleVT() == MVT::v2i32) {
+    // Word element shifts.
+    switch (Op.getOpcode()) {
+    case ISD::SRA:
+      Result = DAG.getNode(HexagonISD::VSRAW, dl, VT, Vec, CommonSplat);
+      break;
+    case ISD::SHL:
+      Result = DAG.getNode(HexagonISD::VSHLW, dl, VT, Vec, CommonSplat);
+      break;
+    case ISD::SRL:
+      Result = DAG.getNode(HexagonISD::VSRLW, dl, VT, Vec, CommonSplat);
+      break;
+    default:
+      return SDValue();
+    }
+  } else {
+    return SDValue();
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+}
+
+/// Custom lowering of BUILD_VECTOR for vectors of at most 64 bits.
+///
+/// The strategies are tried in this order:
+///  1. a splat for constant-splat v4i8 / v4i16 vectors,
+///  2. COMBINE for v2i32,
+///  3. PACKHL for v2i16 with at least one non-constant element,
+///  4. otherwise the constant elements are packed into one i32/i64 constant
+///     and every non-constant element is patched in with INSERTRP.
+///
+/// \returns the lowered value, or an empty SDValue for vectors wider than
+///          64 bits.
+SDValue
+HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  unsigned Size = VT.getSizeInBits();
+
+  // Only handle vectors of 64 bits or shorter.
+  if (Size > 64)
+    return SDValue();
+
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  unsigned NElts = BVN->getNumOperands();
+
+  // Try to generate a SPLAT instruction.
+  if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) &&
+      (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                            HasAnyUndefs, 0, true) && SplatBitSize <= 16)) {
+    unsigned SplatBits = APSplatBits.getZExtValue();
+    // Sign-extend the SplatBitSize-bit pattern to 32 bits.
+    int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >>
+                       (32 - SplatBitSize));
+    return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, dl, MVT::i32));
+  }
+
+  // Try to generate COMBINE to build v2i32 vectors.
+  if (VT.getSimpleVT() == MVT::v2i32) {
+    SDValue V0 = BVN->getOperand(0);
+    SDValue V1 = BVN->getOperand(1);
+
+    // Undef halves are materialized as zero.
+    if (V0.isUndef())
+      V0 = DAG.getConstant(0, dl, MVT::i32);
+    if (V1.isUndef())
+      V1 = DAG.getConstant(0, dl, MVT::i32);
+
+    ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(V0);
+    ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(V1);
+    // If the element isn't a constant, it is in a register:
+    // generate a COMBINE Register Register instruction.
+    if (!C0 || !C1)
+      return DAG.getNode(HexagonISD::COMBINE, dl, VT, V1, V0);
+
+    // If one of the operands is an 8 bit integer constant, generate
+    // a COMBINE Immediate Immediate instruction.
+    if (isInt<8>(C0->getSExtValue()) ||
+        isInt<8>(C1->getSExtValue()))
+      return DAG.getNode(HexagonISD::COMBINE, dl, VT, V1, V0);
+  }
+
+  // Try to generate a S2_packhl to build v2i16 vectors.
+  if (VT.getSimpleVT() == MVT::v2i16) {
+    for (unsigned i = 0, e = NElts; i != e; ++i) {
+      if (BVN->getOperand(i).isUndef())
+        continue;
+      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(BVN->getOperand(i));
+      // If the element isn't a constant, it is in a register:
+      // generate a S2_packhl instruction.
+      if (!Cst) {
+        SDValue pack = DAG.getNode(HexagonISD::PACKHL, dl, MVT::v4i16,
+                                   BVN->getOperand(1), BVN->getOperand(0));
+
+        return DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::v2i16,
+                                          pack);
+      }
+    }
+  }
+
+  // In the general case, generate a CONST32 or a CONST64 for constant vectors,
+  // and insert_vector_elt for all the other cases.
+  uint64_t Res = 0;
+  unsigned EltSize = Size / NElts;
+  SDValue ConstVal;
+  uint64_t Mask = ~uint64_t(0ULL) >> (64 - EltSize);
+  bool HasNonConstantElements = false;
+
+  for (unsigned i = 0, e = NElts; i != e; ++i) {
+    // LLVM's BUILD_VECTOR operands are in Little Endian mode, whereas Hexagon's
+    // combine, const64, etc. are Big Endian.
+    unsigned OpIdx = NElts - i - 1;
+    SDValue Operand = BVN->getOperand(OpIdx);
+
+    // Undef and non-constant lanes contribute zero bits, but they must still
+    // occupy their lane: the accumulator is shifted for every element.
+    // Skipping the shift for an undef lane would move all higher elements
+    // one lane down.
+    int64_t Val = 0;
+    if (!Operand.isUndef()) {
+      if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Operand))
+        Val = Cst->getSExtValue();
+      else
+        HasNonConstantElements = true;
+    }
+
+    Val &= Mask;
+    Res = (Res << EltSize) | Val;
+  }
+
+  // Size <= 64 is guaranteed by the early return at the top.
+  if (Size == 64)
+    ConstVal = DAG.getConstant(Res, dl, MVT::i64);
+  else
+    ConstVal = DAG.getConstant(Res, dl, MVT::i32);
+
+  // When there are non constant operands, add them with INSERT_VECTOR_ELT to
+  // ConstVal, the constant part of the vector.
+  if (HasNonConstantElements) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue Width = DAG.getConstant(EltVT.getSizeInBits(), dl, MVT::i64);
+    // The INSERTRP control word is (width << 32) | bit-offset.
+    SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                  DAG.getConstant(32, dl, MVT::i64));
+
+    for (unsigned i = 0, e = NElts; i != e; ++i) {
+      // LLVM's BUILD_VECTOR operands are in Little Endian mode, whereas Hexagon
+      // is Big Endian.
+      unsigned OpIdx = NElts - i - 1;
+      SDValue Operand = BVN->getOperand(OpIdx);
+      if (isa<ConstantSDNode>(Operand))
+        // This operand is already in ConstVal.
+        continue;
+      if (Operand.isUndef())
+        // Undef lanes keep the zero bits already present in ConstVal.
+        continue;
+
+      if (VT.getSizeInBits() == 64 &&
+          Operand.getValueSizeInBits() == 32) {
+        // Widen a 32-bit element to 64 bits with a zero upper half.
+        SDValue C = DAG.getConstant(0, dl, MVT::i32);
+        Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand);
+      }
+
+      SDValue Idx = DAG.getConstant(OpIdx, dl, MVT::i64);
+      SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width);
+      SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
+      const SDValue Ops[] = {ConstVal, Operand, Combined};
+
+      if (VT.getSizeInBits() == 32)
+        ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops);
+      else
+        ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops);
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal);
+}
+
+// Custom lowering of CONCAT_VECTORS.
+//
+// Two-operand concatenations of v2i16 or v4i8 become a COMBINE; with HVX,
+// two-operand concatenations become a VCOMBINE on v16i32/v32i32 bitcasts.
+// Any other concatenation whose result is 32 or 64 bits wide is assembled by
+// inserting each operand into a zero value with INSERTRP. All remaining cases
+// return an empty SDValue.
+SDValue
+HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ bool UseHVX = Subtarget.useHVXOps();
+ EVT VT = Op.getValueType();
+ unsigned NElts = Op.getNumOperands();
+ SDValue Vec0 = Op.getOperand(0);
+ EVT VecVT = Vec0.getValueType();
+ // Width in bits of one input vector (taken from the first operand).
+ unsigned Width = VecVT.getSizeInBits();
+
+ if (NElts == 2) {
+ MVT ST = VecVT.getSimpleVT();
+ // We are trying to concat two v2i16 to a single v4i16, or two v4i8
+ // into a single v8i8.
+ if (ST == MVT::v2i16 || ST == MVT::v4i8)
+ return DAG.getNode(HexagonISD::COMBINE, dl, VT, Op.getOperand(1), Vec0);
+
+ if (UseHVX) {
+ // Each input must be exactly one HVX register wide for the active mode.
+ assert((Width == 64*8 && Subtarget.useHVXSglOps()) ||
+ (Width == 128*8 && Subtarget.useHVXDblOps()));
+ SDValue Vec1 = Op.getOperand(1);
+ // Operate on i32-element views of the inputs and of the double-width
+ // result, then cast back to the requested type.
+ MVT OpTy = Subtarget.useHVXSglOps() ? MVT::v16i32 : MVT::v32i32;
+ MVT ReTy = Subtarget.useHVXSglOps() ? MVT::v32i32 : MVT::v64i32;
+ SDValue B0 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec0);
+ SDValue B1 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec1);
+ SDValue VC = DAG.getNode(HexagonISD::VCOMBINE, dl, ReTy, B1, B0);
+ return DAG.getNode(ISD::BITCAST, dl, VT, VC);
+ }
+ }
+
+ // The generic path below only produces 32-bit and 64-bit results.
+ if (VT.getSizeInBits() != 32 && VT.getSizeInBits() != 64)
+ return SDValue();
+
+ SDValue C0 = DAG.getConstant(0, dl, MVT::i64);
+ SDValue C32 = DAG.getConstant(32, dl, MVT::i64);
+ SDValue W = DAG.getConstant(Width, dl, MVT::i64);
+ // Create the "width" part of the argument to insert_rp/insertp_rp.
+ SDValue S = DAG.getNode(ISD::SHL, dl, MVT::i64, W, C32);
+ // Accumulator for the result; starts out as an i64 zero.
+ SDValue V = C0;
+
+ // Insert the operands one by one, from the last operand down to the first.
+ for (unsigned i = 0, e = NElts; i != e; ++i) {
+ unsigned N = NElts-i-1;
+ SDValue OpN = Op.getOperand(N);
+
+ if (VT.getSizeInBits() == 64 && OpN.getValueSizeInBits() == 32) {
+ // Widen a 32-bit operand to 64 bits with a zero upper half.
+ SDValue C = DAG.getConstant(0, dl, MVT::i32);
+ OpN = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, OpN);
+ }
+ // Control word for INSERTRP: (width << 32) | (operand index * width).
+ SDValue Idx = DAG.getConstant(N, dl, MVT::i64);
+ SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, W);
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, S, Offset);
+ if (VT.getSizeInBits() == 32)
+ V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, {V, OpN, Or});
+ else if (VT.getSizeInBits() == 64)
+ V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, {V, OpN, Or});
+ else
+ // Not reachable: other sizes were rejected before the loop.
+ return SDValue();
+ }
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+// Lower an EXTRACT_SUBVECTOR that takes one HVX-register-sized half out of a
+// double-register-sized vector. A constant start index of 0 selects vsub_lo,
+// a start index equal to the result's element count selects vsub_hi. Any
+// other shape yields an empty SDValue.
+SDValue
+HexagonTargetLowering::LowerEXTRACT_SUBVECTOR_HVX(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDValue SrcVec = Op.getOperand(0);
+  EVT SrcVT = SrcVec.getValueType();
+  SDLoc dl(Op);
+
+  // Just in case...
+  if (!SrcVT.isVector() || !Subtarget.useHVXOps())
+    return SDValue();
+
+  EVT ResVT = Op.getValueType();
+  const unsigned HvxBits = Subtarget.useHVXSglOps() ? (64 * 8) : (128 * 8);
+
+  // We deal only with cases where the result is the vector size
+  // and the vector operand is a double register.
+  const bool ResIsSingle =
+      ResVT.isByteSized() && ResVT.getSizeInBits() == HvxBits;
+  const bool SrcIsDouble =
+      SrcVT.isByteSized() && SrcVT.getSizeInBits() == 2 * HvxBits;
+  if (!ResIsSingle || !SrcIsDouble)
+    return SDValue();
+
+  ConstantSDNode *StartC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!StartC)
+    return SDValue();
+  const unsigned Start = StartC->getZExtValue();
+
+  // These two will get lowered to an appropriate EXTRACT_SUBREG in ISel.
+  if (Start == 0)
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, ResVT, SrcVec);
+  if (ResVT.getVectorNumElements() == Start)
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, ResVT, SrcVec);
+
+  return SDValue();
+}
+
+// Custom lowering shared by EXTRACT_VECTOR_ELT and EXTRACT_SUBVECTOR.
+//
+// HVX subvector extracts are forwarded to LowerEXTRACT_SUBVECTOR_HVX. For the
+// rest, the extracted field is described by a width (element size for an
+// element extract, result size for a subvector extract) and a bit offset
+// (index * element size):
+//  * constant index: a 32-bit field becomes an EXTRACT_SUBREG, anything else
+//    an EXTRACTU on the 32-bit or 64-bit source;
+//  * variable index: an EXTRACTURP with the control word
+//    (width << 32) | offset.
+SDValue
+HexagonTargetLowering::LowerEXTRACT_VECTOR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  // If we are dealing with EXTRACT_SUBVECTOR on a HVX type, we may
+  // be able to simplify it to an EXTRACT_SUBREG.
+  if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && Subtarget.useHVXOps() &&
+      isHvxVectorType(Op.getValueType().getSimpleVT()))
+    return LowerEXTRACT_SUBVECTOR_HVX(Op, DAG);
+
+  EVT VT = Op.getValueType();
+  int VTN = VT.isVector() ? VT.getVectorNumElements() : 1;
+  SDLoc dl(Op);
+  SDValue Idx = Op.getOperand(1);
+  SDValue Vec = Op.getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  int EltSize = EltVT.getSizeInBits();
+  SDValue Width = DAG.getConstant(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT ?
+                                  EltSize : VTN * EltSize, dl, MVT::i64);
+
+  // Constant element number.
+  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Idx)) {
+    uint64_t X = CI->getZExtValue();
+    SDValue Offset = DAG.getConstant(X * EltSize, dl, MVT::i32);
+    const SDValue Ops[] = {Vec, Width, Offset};
+
+    ConstantSDNode *CW = dyn_cast<ConstantSDNode>(Width);
+    assert(CW && "Non constant width in LowerEXTRACT_VECTOR");
+
+    SDValue N;
+    MVT SVT = VecVT.getSimpleVT();
+    uint64_t W = CW->getZExtValue();
+
+    if (W == 32) {
+      // Translate this node into EXTRACT_SUBREG.
+      unsigned Subreg = 0;
+
+      // Index 0 is always the low half; the high half starts at the element
+      // index that corresponds to bit 32 of the 64-bit source.
+      if (X == 0)
+        Subreg = Hexagon::isub_lo;
+      else if (SVT == MVT::v2i32 && X == 1)
+        Subreg = Hexagon::isub_hi;
+      else if (SVT == MVT::v4i16 && X == 2)
+        Subreg = Hexagon::isub_hi;
+      else if (SVT == MVT::v8i8 && X == 4)
+        Subreg = Hexagon::isub_hi;
+      else
+        llvm_unreachable("Bad offset");
+      N = DAG.getTargetExtractSubreg(Subreg, dl, MVT::i32, Vec);
+
+    } else if (SVT.getSizeInBits() == 32) {
+      N = DAG.getNode(HexagonISD::EXTRACTU, dl, MVT::i32, Ops);
+    } else if (SVT.getSizeInBits() == 64) {
+      N = DAG.getNode(HexagonISD::EXTRACTU, dl, MVT::i64, Ops);
+      if (VT.getSizeInBits() == 32)
+        N = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, N);
+    } else
+      return SDValue();
+
+    return DAG.getNode(ISD::BITCAST, dl, VT, N);
+  }
+
+  // Variable element number.
+  SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i32, Idx,
+                               DAG.getConstant(EltSize, dl, MVT::i32));
+  // The control word is built in i64, so the i32 bit offset has to be widened
+  // first: both operands of an ISD::OR must have the result type.
+  SDValue Offset64 = DAG.getZExtOrTrunc(Offset, dl, MVT::i64);
+  SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                DAG.getConstant(32, dl, MVT::i64));
+  SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset64);
+
+  const SDValue Ops[] = {Vec, Combined};
+
+  SDValue N;
+  if (VecVT.getSizeInBits() == 32) {
+    N = DAG.getNode(HexagonISD::EXTRACTURP, dl, MVT::i32, Ops);
+  } else {
+    N = DAG.getNode(HexagonISD::EXTRACTURP, dl, MVT::i64, Ops);
+    if (VT.getSizeInBits() == 32)
+      N = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, N);
+  }
+  return DAG.getNode(ISD::BITCAST, dl, VT, N);
+}
+
+// Custom lowering shared by INSERT_VECTOR_ELT and INSERT_SUBVECTOR.
+//
+// The inserted field is described by a width (element size for an element
+// insert, result size for a subvector insert) and a bit offset
+// (index * element size):
+//  * constant index: an INSERT node with separate width and offset operands;
+//  * variable index: an INSERTRP node with the control word
+//    (width << 32) | offset.
+// Only 32-bit and 64-bit result vectors are handled; other sizes return an
+// empty SDValue.
+SDValue
+HexagonTargetLowering::LowerINSERT_VECTOR(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  int VTN = VT.isVector() ? VT.getVectorNumElements() : 1;
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Val = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  int EltSize = EltVT.getSizeInBits();
+  SDValue Width = DAG.getConstant(Op.getOpcode() == ISD::INSERT_VECTOR_ELT ?
+                                  EltSize : VTN * EltSize, dl, MVT::i64);
+
+  // Constant element number. This has to be dyn_cast: cast<> never returns
+  // null and asserts on a non-constant index, which would make the
+  // variable-index path below unreachable.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Idx)) {
+    SDValue Offset = DAG.getConstant(C->getSExtValue() * EltSize, dl, MVT::i32);
+    const SDValue Ops[] = {Vec, Val, Width, Offset};
+
+    SDValue N;
+    if (VT.getSizeInBits() == 32)
+      N = DAG.getNode(HexagonISD::INSERT, dl, MVT::i32, Ops);
+    else if (VT.getSizeInBits() == 64)
+      N = DAG.getNode(HexagonISD::INSERT, dl, MVT::i64, Ops);
+    else
+      return SDValue();
+
+    return DAG.getNode(ISD::BITCAST, dl, VT, N);
+  }
+
+  // Variable element number.
+  SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i32, Idx,
+                               DAG.getConstant(EltSize, dl, MVT::i32));
+  // The control word is built in i64, so the i32 bit offset has to be widened
+  // first: both operands of an ISD::OR must have the result type.
+  SDValue Offset64 = DAG.getZExtOrTrunc(Offset, dl, MVT::i64);
+  SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                DAG.getConstant(32, dl, MVT::i64));
+  SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset64);
+
+  if (VT.getSizeInBits() == 64 && Val.getValueSizeInBits() == 32) {
+    // Widen a 32-bit value to 64 bits with a zero upper half.
+    SDValue C = DAG.getConstant(0, dl, MVT::i32);
+    Val = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Val);
+  }
+
+  const SDValue Ops[] = {Vec, Val, Combined};
+
+  SDValue N;
+  if (VT.getSizeInBits() == 32)
+    N = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops);
+  else if (VT.getSizeInBits() == 64)
+    N = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops);
+  else
+    return SDValue();
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, N);
+}
+
+/// Tail calls may truncate the returned value only between integer types, and
+/// only when the first type is at most 32 bits wide.
+bool
+HexagonTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  // Assuming the caller does not have either a signext or zeroext modifier, and
+  // only one value is accepted, any reasonable truncation is allowed.
+  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
+
+  // FIXME: in principle up to 64-bit could be made safe, but it would be very
+  // fragile at the moment: any support for multiple value returns would be
+  // liable to disallow tail calls involving i64 -> iN truncation in many cases.
+  return BothIntegers && Ty1->getPrimitiveSizeInBits() <= 32;
+}
+
+// Lower EH_RETURN: store the handler address at R30 + 4, copy the offset into
+// R28, and emit the HexagonISD::EH_RETURN node on the resulting chain.
+SDValue
+HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue StackAdjust = Op.getOperand(1);
+  SDValue HandlerAddr = Op.getOperand(2);
+  const auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Mark function as containing a call to EH_RETURN.
+  DAG.getMachineFunction()
+      .getInfo<HexagonMachineFunctionInfo>()
+      ->setHasEHReturn();
+
+  // The handler goes into the slot at R30 + 4.
+  SDValue SlotAddr =
+      DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getRegister(Hexagon::R30, PtrVT),
+                  DAG.getIntPtrConstant(4, dl));
+  Chain = DAG.getStore(Chain, dl, HandlerAddr, SlotAddr, MachinePointerInfo());
+
+  // The offset travels in R28. No live-out marking is needed, since the
+  // register is already used as an explicit input to EH_RETURN.
+  const unsigned OffsetReg = Hexagon::R28;
+  Chain = DAG.getCopyToReg(Chain, dl, OffsetReg, StackAdjust);
+
+  return DAG.getNode(HexagonISD::EH_RETURN, dl, MVT::Other, Chain);
+}
+
+// Dispatch a node that was marked for custom lowering to its handler.
+// Reaching the default case means a node was marked Custom without a handler
+// here; debug builds dump the node before aborting.
+SDValue
+HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  const unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  // Vector construction, insertion and extraction.
+  case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::CONCAT_VECTORS:
+    return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:
+    return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::INSERT_SUBVECTOR:
+  case ISD::INSERT_VECTOR_ELT:
+    return LowerINSERT_VECTOR(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:
+  case ISD::EXTRACT_VECTOR_ELT:
+    return LowerEXTRACT_VECTOR(Op, DAG);
+
+  // Vector shifts.
+  case ISD::SRA:
+  case ISD::SHL:
+  case ISD::SRL:
+    return LowerVECTOR_SHIFT(Op, DAG);
+
+  // Addresses and tables.
+  case ISD::ConstantPool:
+    return LowerConstantPool(Op, DAG);
+  case ISD::JumpTable:
+    return LowerJumpTable(Op, DAG);
+  case ISD::GlobalTLSAddress:
+    return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGLOBALADDRESS(Op, DAG);
+  case ISD::BlockAddress:
+    return LowerBlockAddress(Op, DAG);
+  case ISD::GLOBAL_OFFSET_TABLE:
+    return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+
+  // Frame, return address and exception return.
+  case ISD::EH_RETURN:
+    return LowerEH_RETURN(Op, DAG);
+  case ISD::RETURNADDR:
+    return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:
+    return LowerFRAMEADDR(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
+
+  // Custom lower some vector loads.
+  case ISD::LOAD:
+    return LowerLOAD(Op, DAG);
+
+  // Everything else.
+  case ISD::ATOMIC_FENCE:
+    return LowerATOMIC_FENCE(Op, DAG);
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG);
+  case ISD::VSELECT:
+    return LowerVSELECT(Op, DAG);
+  case ISD::CTPOP:
+    return LowerCTPOP(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+    return LowerINTRINSIC_VOID(Op, DAG);
+  case ISD::INLINEASM:
+    return LowerINLINEASM(Op, DAG);
+  case ISD::PREFETCH:
+    return LowerPREFETCH(Op, DAG);
+
+  default:
+#ifndef NDEBUG
+    Op.getNode()->dumpr(&DAG);
+    if (Opc > HexagonISD::OP_BEGIN && Opc < HexagonISD::OP_END)
+      errs() << "Check for a non-legal type in this operation\n";
+#endif
+    llvm_unreachable("Should not custom lower this!");
+  }
+}
+
+/// Returns the relocation base for the given PIC jump table: the table's
+/// PC-relative target jump-table node wrapped in an AT_PCREL node.
+SDValue
+HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                SelectionDAG &DAG) const {
+  const int JTIndex = cast<JumpTableSDNode>(Table)->getIndex();
+  const EVT TableVT = Table.getValueType();
+  SDValue PCRelTable =
+      DAG.getTargetJumpTable(JTIndex, TableVT, HexagonII::MO_PCREL);
+  return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Table), TableVT, PCRelTable);
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// Classify an inline-asm constraint. With HVX enabled, the single-letter
+/// constraints 'q' and 'v' are register constraints; everything else is left
+/// to the generic implementation.
+TargetLowering::ConstraintType
+HexagonTargetLowering::getConstraintType(StringRef Constraint) const {
+  const bool IsHvxLetter =
+      Constraint.size() == 1 && (Constraint[0] == 'q' || Constraint[0] == 'v');
+  if (IsHvxLetter && Subtarget.useHVXOps())
+    return C_Register;
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+// Map a single-letter inline-asm constraint plus value type to a register
+// class. The register number in the returned pair is always 0 here, i.e. only
+// a class is selected. Constraints longer than one character are forwarded to
+// the generic implementation.
+//
+// NOTE: an unsupported value type for 'r'/'q'/'v', and any other
+// single-letter constraint, ends in llvm_unreachable rather than in the
+// generic fallback.
+std::pair<unsigned, const TargetRegisterClass*>
+HexagonTargetLowering::getRegForInlineAsmConstraint(
+ const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+ bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps();
+
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r': // R0-R31
+ // Values of up to 32 bits use IntRegs; 64-bit values use DoubleRegs.
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::f32:
+ return std::make_pair(0U, &Hexagon::IntRegsRegClass);
+ case MVT::i64:
+ case MVT::f64:
+ return std::make_pair(0U, &Hexagon::DoubleRegsRegClass);
+ }
+ case 'q': // q0-q3
+ // Every accepted type maps to the vector predicate register class.
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+ case MVT::v1024i1:
+ case MVT::v512i1:
+ case MVT::v32i16:
+ case MVT::v16i32:
+ case MVT::v64i8:
+ case MVT::v8i64:
+ return std::make_pair(0U, &Hexagon::VecPredRegsRegClass);
+ }
+ case 'v': // V0-V31
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+ // 512-bit types: a single vector register.
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8:
+ case MVT::v8i64:
+ return std::make_pair(0U, &Hexagon::VectorRegsRegClass);
+ // 1024-bit types: a single 128-byte register when V60 HVX double mode is
+ // active, otherwise a pair of vector registers.
+ case MVT::v32i32:
+ case MVT::v64i16:
+ case MVT::v16i64:
+ case MVT::v128i8:
+ if (Subtarget.hasV60TOps() && UseHVX && UseHVXDbl)
+ return std::make_pair(0U, &Hexagon::VectorRegs128BRegClass);
+ return std::make_pair(0U, &Hexagon::VecDblRegsRegClass);
+ // 2048-bit types: a pair of 128-byte registers.
+ case MVT::v256i8:
+ case MVT::v128i16:
+ case MVT::v64i32:
+ case MVT::v32i64:
+ return std::make_pair(0U, &Hexagon::VecDblRegs128BRegClass);
+ }
+
+ default:
+ llvm_unreachable("Unknown asm register class");
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+/// The answer depends only on whether the subtarget has V5 operations; the
+/// immediate and its type are not inspected.
+bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  const bool HasV5 = Subtarget.hasV5TOps();
+  return HasV5;
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented by
+/// AM is legal for this target, for a load/store of the specified type.
+/// Accepted modes have no global base, no scaled register, and (for sized
+/// types) an offset that is a multiple of the ABI alignment and fits in 11
+/// signed bits once shifted down by that alignment.
+bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                  const AddrMode &AM, Type *Ty,
+                                                  unsigned AS) const {
+  // When LSR detects uses of the same base address to access different
+  // types (e.g. unions), it will assume a conservative type for these
+  // uses:
+  //   LSR Use: Kind=Address of void in addrspace(4294967295), ...
+  // The type Ty passed here would then be "void". Skip the alignment
+  // checks, but do not return false right away, since that confuses
+  // LSR into crashing.
+  if (Ty->isSized()) {
+    const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
+    // The base offset must be a multiple of the alignment.
+    const bool OffsetAligned = (AM.BaseOffs % ABIAlign) == 0;
+    // The shifted offset must fit in 11 bits.
+    if (!OffsetAligned || !isInt<11>(AM.BaseOffs >> Log2_32(ABIAlign)))
+      return false;
+  }
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // Only "r+i", "r", or just "i" are supported: there is no scaled
+  // addressing mode, so any non-zero scale is rejected.
+  const int Scale = AM.Scale;
+  return Scale == 0;
+}
+
+/// Tell whether a constant offset may be folded into the given GlobalAddress.
+/// Folding is frequently not legal in PIC relocation models, so it is allowed
+/// only when the target machine uses the static relocation model.
+bool HexagonTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA)
+      const {
+  // The node itself is not inspected; only the relocation model matters.
+  const bool IsStaticReloc = (HTM.getRelocationModel() == Reloc::Static);
+  return IsStaticReloc;
+}
+
+/// Report whether Imm is a legal icmp immediate, i.e. whether the target has
+/// icmp instructions that can compare a register against the immediate
+/// without first materializing it into a register. The accepted range is
+/// [-512, 511].
+bool HexagonTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  const bool OutOfRange = (Imm < -512) || (Imm > 511);
+  return !OutOfRange;
+}
+
+/// Check whether a call is eligible for tail call optimization.
+///
+/// Only the obviously safe cases that require no ABI changes are accepted.
+/// The call is rejected when any of the following holds:
+///   - the callee is neither a global address nor an external symbol
+///     (i.e. the call goes through a function pointer);
+///   - caller and callee calling conventions differ and at least one of
+///     them is something other than C or Fast;
+///   - the call is variadic;
+///   - the caller or the callee uses struct-return semantics.
+bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
+                                 SDValue Callee,
+                                 CallingConv::ID CalleeCC,
+                                 bool isVarArg,
+                                 bool isCalleeStructRet,
+                                 bool isCallerStructRet,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 const SmallVectorImpl<ISD::InputArg> &Ins,
+                                 SelectionDAG& DAG) const {
+  const Function *Caller = DAG.getMachineFunction().getFunction();
+  const CallingConv::ID CallerCC = Caller->getCallingConv();
+
+  // A tail call via a function pointer is never optimized.
+  const bool IsDirectCallee = isa<GlobalAddressSDNode>(Callee) ||
+                              isa<ExternalSymbolSDNode>(Callee);
+  if (!IsDirectCallee)
+    return false;
+
+  // Mismatched calling conventions are tolerated only when both sides use
+  // either C or Fast.
+  if (CallerCC != CalleeCC) {
+    auto IsCOrFast = [](CallingConv::ID CC) -> bool {
+      return CC == CallingConv::C || CC == CallingConv::Fast;
+    };
+    if (!IsCOrFast(CallerCC) || !IsCOrFast(CalleeCC))
+      return false;
+  }
+
+  // Vararg calls are not tail call optimized.
+  if (isVarArg)
+    return false;
+
+  // Neither side may use struct-return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // In addition to the cases above, tail call optimization is also disabled
+  // when the calling convention code determines that at least one outgoing
+  // argument needs to go on the stack. That cannot be checked here because
+  // the information is not yet available at this point.
+  return true;
+}
+
+/// Pick the target-specific optimal type for the loads and stores produced
+/// when lowering memset, memcpy, and memmove.
+///
+/// A DstAlign of zero means the destination alignment can satisfy any
+/// constraint. Similarly, a SrcAlign of zero means there is no need to check
+/// it against an alignment requirement, probably because the source does not
+/// need to be loaded. 'IsMemset' is true when expanding a memset, and
+/// 'ZeroMemset' when it is a memset of zero. 'MemcpyStrSrc' indicates whether
+/// the memcpy source is constant so it does not need to be loaded.
+///
+/// The widest of i64, i32, i16 is chosen such that Size is at least that
+/// many bytes and the destination (and, unless IsMemset, the source)
+/// alignment is a multiple of it. MVT::Other is returned when none fits,
+/// meaning the type should be determined by generic target-independent
+/// logic.
+EVT HexagonTargetLowering::getOptimalMemOpType(uint64_t Size,
+      unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset,
+      bool MemcpyStrSrc, MachineFunction &MF) const {
+
+  auto IsMultipleOf = [](unsigned GivenA, unsigned MinA) -> bool {
+    return (GivenA % MinA) == 0;
+  };
+
+  // True when an access of the given byte width is usable for this operation.
+  // A zero alignment is a multiple of everything and therefore always passes.
+  auto WidthFits = [&](unsigned Bytes) -> bool {
+    if (Size < Bytes)
+      return false;
+    if (!IsMultipleOf(DstAlign, Bytes))
+      return false;
+    return IsMemset || IsMultipleOf(SrcAlign, Bytes);
+  };
+
+  // Try the candidate widths from widest to narrowest.
+  if (WidthFits(8))
+    return MVT::i64;
+  if (WidthFits(4))
+    return MVT::i32;
+  if (WidthFits(2))
+    return MVT::i16;
+
+  return MVT::Other;
+}
+
+/// Report whether a misaligned memory access of type VT is allowed.
+///
+/// Only the vector types listed in the switch below are accepted; every
+/// other type, including any extended (non-simple) value type, is rejected.
+/// AS and Align are not consulted.
+///
+/// \param Fast  if non-null, always set to false: no misaligned access is
+///              reported as fast.
+/// \returns true if VT is one of the listed vector types.
+bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+      unsigned AS, unsigned Align, bool *Fast) const {
+  if (Fast)
+    *Fast = false;
+
+  // EVT::getSimpleVT() asserts when VT is an extended type, so reject those
+  // up front: none of them is in the accepted list anyway.
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::v64i8:
+  case MVT::v128i8:
+  case MVT::v256i8:
+  case MVT::v32i16:
+  case MVT::v64i16:
+  case MVT::v128i16:
+  case MVT::v16i32:
+  case MVT::v32i32:
+  case MVT::v64i32:
+  case MVT::v8i64:
+  case MVT::v16i64:
+  case MVT::v32i64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return the representative register class for VT together with its cost.
+///
+/// The vector types handled below are mapped to Hexagon vector register
+/// classes with a cost of 1. For the v128i8/v64i16/v32i32/v16i64 group the
+/// class depends on the subtarget: VectorRegs128B when V60, HVX and
+/// double-HVX operations are all enabled, VecDblRegs otherwise. Every other
+/// type is delegated to TargetLowering::findRepresentativeClass.
+std::pair<const TargetRegisterClass*, uint8_t>
+HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+      MVT VT) const {
+  const uint8_t RepCost = 1;
+  const TargetRegisterClass *RepClass = nullptr;
+
+  switch (VT.SimpleTy) {
+  case MVT::v64i8:
+  case MVT::v32i16:
+  case MVT::v16i32:
+  case MVT::v8i64:
+    RepClass = &Hexagon::VectorRegsRegClass;
+    break;
+  case MVT::v128i8:
+  case MVT::v64i16:
+  case MVT::v32i32:
+  case MVT::v16i64: {
+    const bool Use128B = Subtarget.hasV60TOps() && Subtarget.useHVXOps() &&
+                         Subtarget.useHVXDblOps();
+    if (!Use128B)
+      RepClass = &Hexagon::VecDblRegsRegClass;
+    else
+      RepClass = &Hexagon::VectorRegs128BRegClass;
+    break;
+  }
+  case MVT::v256i8:
+  case MVT::v128i16:
+  case MVT::v64i32:
+  case MVT::v32i64:
+    RepClass = &Hexagon::VecDblRegs128BRegClass;
+    break;
+  default:
+    // Not one of the vector types handled here: use the generic answer.
+    return TargetLowering::findRepresentativeClass(TRI, VT);
+  }
+
+  return std::make_pair(RepClass, RepCost);
+}
+
+/// Emit a load-locked from Addr at the builder's insertion point and return
+/// the resulting call. The pointee of Addr must be 32 or 64 bits wide, which
+/// selects between the word and doubleword locked-load intrinsics. Ord is
+/// not consulted.
+Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+      AtomicOrdering Ord) const {
+  BasicBlock *InsertBB = Builder.GetInsertBlock();
+  Module *Mod = InsertBB->getParent()->getParent();
+  Type *PointeeTy = cast<PointerType>(Addr->getType())->getElementType();
+  const unsigned Bits = PointeeTy->getPrimitiveSizeInBits();
+  assert((Bits == 32 || Bits == 64) && "Only 32/64-bit atomic loads supported");
+
+  // Default to the 64-bit variant; switch to the 32-bit one for words.
+  Intrinsic::ID LockedLoadID = Intrinsic::hexagon_L4_loadd_locked;
+  if (Bits == 32)
+    LockedLoadID = Intrinsic::hexagon_L2_loadw_locked;
+
+  Value *LockedLoadFn = Intrinsic::getDeclaration(Mod, LockedLoadID);
+  return Builder.CreateCall(LockedLoadFn, Addr, "larx");
+}
+
+/// Emit a store-conditional of Val to Addr and return its status as an i32.
+/// The status is meant to be 0 if the store succeeded and non-zero
+/// otherwise: the locked-store intrinsic's result is compared for equality
+/// with zero and that comparison is zero-extended to 32 bits. Val must be 32
+/// or 64 bits wide. Ord is not consulted.
+Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+      Value *Val, Value *Addr, AtomicOrdering Ord) const {
+  BasicBlock *InsertBB = Builder.GetInsertBlock();
+  Module *Mod = InsertBB->getParent()->getParent();
+  Type *ValTy = Val->getType();
+  const unsigned Bits = ValTy->getPrimitiveSizeInBits();
+  assert((Bits == 32 || Bits == 64) && "Only 32/64-bit atomic stores supported");
+
+  // Default to the 64-bit variant; switch to the 32-bit one for words.
+  Intrinsic::ID LockedStoreID = Intrinsic::hexagon_S4_stored_locked;
+  if (Bits == 32)
+    LockedStoreID = Intrinsic::hexagon_S2_storew_locked;
+
+  Value *LockedStoreFn = Intrinsic::getDeclaration(Mod, LockedStoreID);
+  Value *StoreResult = Builder.CreateCall(LockedStoreFn, {Addr, Val}, "stcx");
+  // Invert the intrinsic's result: a zero result becomes status 1.
+  Value *ResultIsZero =
+      Builder.CreateICmpEQ(StoreResult, Builder.getInt32(0), "");
+  return Builder.CreateZExt(ResultIsZero, Type::getInt32Ty(Mod->getContext()));
+}
+
+/// Choose how an atomic load is expanded in IR: loads wider than 64 bits are
+/// expanded with AtomicExpansionKind::LLOnly, everything else is left alone.
+TargetLowering::AtomicExpansionKind
+HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  const unsigned LoadedBits = LI->getType()->getPrimitiveSizeInBits();
+  // Loads that do not exceed 64 bits are not expanded.
+  if (LoadedBits <= 64)
+    return AtomicExpansionKind::None;
+  return AtomicExpansionKind::LLOnly;
+}
+
+/// Report whether an atomic store must be expanded in IR. Only stores whose
+/// value operand is wider than 64 bits are expanded.
+bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  Type *StoredTy = SI->getValueOperand()->getType();
+  const bool FitsIn64Bits = StoredTy->getPrimitiveSizeInBits() <= 64;
+  return !FitsIn64Bits;
+}
+
+/// Report whether an atomic cmpxchg should be expanded in IR. That is the
+/// case when the store size of the compare operand's type is between 4 and
+/// 8 bytes inclusive.
+bool HexagonTargetLowering::shouldExpandAtomicCmpXchgInIR(
+      AtomicCmpXchgInst *AI) const {
+  const DataLayout &Layout = AI->getModule()->getDataLayout();
+  Type *CmpTy = AI->getCompareOperand()->getType();
+  const unsigned StoreBytes = Layout.getTypeStoreSize(CmpTy);
+  if (StoreBytes < 4)
+    return false;
+  return StoreBytes <= 8;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
new file mode 100644
index 000000000000..a8ed29e585d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -0,0 +1,294 @@
+//===-- HexagonISelLowering.h - Hexagon DAG Lowering Interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Hexagon uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H
+
+#include "Hexagon.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Target/TargetLowering.h"
+#include <cstdint>
+#include <utility>
+
+namespace llvm {
+
+namespace HexagonISD {
+ // Hexagon-specific node opcodes; numbering starts at ISD::BUILTIN_OP_END.
+ enum NodeType : unsigned {
+ OP_BEGIN = ISD::BUILTIN_OP_END,
+ // CONST32 has the same value as OP_BEGIN; later entries count up from it.
+ CONST32 = OP_BEGIN,
+ CONST32_GP, // For marking data present in GP.
+ ALLOCA,
+
+ AT_GOT, // Index in GOT.
+ AT_PCREL, // Offset relative to PC.
+
+ CALL, // Function call.
+ CALLnr, // Function call that does not return.
+ CALLR,
+
+ RET_FLAG, // Return with a flag operand.
+ BARRIER, // Memory barrier.
+ JT, // Jump table.
+ CP, // Constant pool.
+
+ POPCOUNT,
+ COMBINE,
+ PACKHL,
+ VSPLATB,
+ VSPLATH,
+ SHUFFEB,
+ SHUFFEH,
+ SHUFFOB,
+ SHUFFOH,
+ VSXTBH,
+ VSXTBW,
+ VSRAW,
+ VSRAH,
+ VSRLW,
+ VSRLH,
+ VSHLW,
+ VSHLH,
+ VCMPBEQ,
+ VCMPBGT,
+ VCMPBGTU,
+ VCMPHEQ,
+ VCMPHGT,
+ VCMPHGTU,
+ VCMPWEQ,
+ VCMPWGT,
+ VCMPWGTU,
+
+ INSERT,
+ INSERTRP,
+ EXTRACTU,
+ EXTRACTURP,
+ VCOMBINE,
+ VPACK,
+ TC_RETURN,
+ EH_RETURN,
+ DCFETCH,
+ // OP_END is the final enumerator, one past DCFETCH.
+ OP_END
+ };
+
+} // end namespace HexagonISD
+
+ class HexagonSubtarget;
+
+ class HexagonTargetLowering : public TargetLowering { // Hexagon's overrides of the TargetLowering hooks.
+ int VarArgsFrameOffset; // Frame offset to start of varargs area.
+ const HexagonTargetMachine &HTM; // Target machine; queried for the relocation model.
+ const HexagonSubtarget &Subtarget; // Subtarget; queried for feature checks such as hasV5TOps().
+
+ bool CanReturnSmallStruct(const Function* CalleeFn, unsigned& RetSize)
+ const;
+ void promoteLdStType(MVT VT, MVT PromotedLdStVT);
+
+ public:
+ explicit HexagonTargetLowering(const TargetMachine &TM,
+ const HexagonSubtarget &ST);
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization. Targets which want to do tail call
+ /// optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet,
+ bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const;
+
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+ /// Return true if an FMA operation is faster than a pair of mul and add
+ /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+ /// method returns true (and FMAs are legal), otherwise fmuladd is
+ /// expanded to mul + add.
+ bool isFMAFasterThanFMulAndFAdd(EVT) const override;
+
+ // Should we expand the build vector with shuffles?
+ bool shouldExpandBuildVectorWithShuffles(EVT VT,
+ unsigned DefinedValues) const override;
+
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT)
+ const override;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ const char *getTargetNodeName(unsigned Opcode) const override;
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_SUBVECTOR_HVX(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue LowerToTLSInitialExecModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
+ GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT,
+ unsigned ReturnReg, unsigned char OperandFlags) const;
+ SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDValue Callee) const;
+
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad. Always R0; PersonalityFn is unused.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return Hexagon::R0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad. Always R1; PersonalityFn is unused.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return Hexagon::R1;
+ }
+
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ // SetCC result type: i1 for scalars, a vector of i1 with VT's element count for vectors.
+ EVT getSetCCResultType(const DataLayout &, LLVMContext &C,
+ EVT VT) const override {
+ if (!VT.isVector())
+ return MVT::i1;
+ else
+ return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
+ }
+
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+ SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+ // Maps the "o" memory constraint to InlineAsm::Constraint_o; any other code is deferred to TargetLowering.
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ // Intrinsics
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ /// The type may be VoidTy, in which case only return true if the addressing
+ /// mode is legal for a load/store of any legal type.
+ /// TODO: Handle pre/postinc as well.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
+ /// Return true if folding a constant offset with the given GlobalAddress
+ /// is legal. It is frequently not legal in PIC relocation models.
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ /// isLegalICmpImmediate - Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+ unsigned Align, bool *Fast) const override;
+
+ /// Returns relocation base for the given PIC jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
+ const override;
+
+ // Handling of atomic RMW instructions.
+ Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) const override;
+ Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+ Value *Addr, AtomicOrdering Ord) const override;
+ AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+ // Atomic RMW instructions always report AtomicExpansionKind::LLSC.
+ AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override {
+ return AtomicExpansionKind::LLSC;
+ }
+
+ protected:
+ std::pair<const TargetRegisterClass*, uint8_t>
+ findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
+ const override;
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td
new file mode 100644
index 000000000000..7283d94ee759
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td
@@ -0,0 +1,652 @@
+//==- HexagonInstrAlias.td - Hexagon Instruction Aliases ---*- tablegen -*--==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Hexagon Instruction Mappings
+//===----------------------------------------------------------------------===//
+
+// GP-relative stores: "memX({GP}+#$addr) = $Nt" maps to the S2_storer*gp forms.
+def : InstAlias<"memb({GP}+#$addr) = $Nt.new",
+ (S2_storerbnewgp u16_0Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memh({GP}+#$addr) = $Nt.new",
+ (S2_storerhnewgp u16_1Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memw({GP}+#$addr) = $Nt.new",
+ (S2_storerinewgp u16_2Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memb({GP}+#$addr) = $Nt",
+ (S2_storerbgp u16_0Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memh({GP}+#$addr) = $Nt",
+ (S2_storerhgp u16_1Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memh({GP}+#$addr) = $Nt.h",
+ (S2_storerfgp u16_1Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memw({GP}+#$addr) = $Nt",
+ (S2_storerigp u16_2Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memd({GP}+#$addr) = $Nt",
+ (S2_storerdgp u16_3Imm:$addr, DoubleRegs:$Nt)>;
+// GP-relative loads: "$Nt = memX({GP}+#$addr)" maps to the L2_loadr*gp forms.
+def : InstAlias<"$Nt = memb({GP}+#$addr)",
+ (L2_loadrbgp IntRegs:$Nt, u16_0Imm:$addr)>;
+def : InstAlias<"$Nt = memub({GP}+#$addr)",
+ (L2_loadrubgp IntRegs:$Nt, u16_0Imm:$addr)>;
+def : InstAlias<"$Nt = memh({GP}+#$addr)",
+ (L2_loadrhgp IntRegs:$Nt, u16_1Imm:$addr)>;
+def : InstAlias<"$Nt = memuh({GP}+#$addr)",
+ (L2_loadruhgp IntRegs:$Nt, u16_1Imm:$addr)>;
+def : InstAlias<"$Nt = memw({GP}+#$addr)",
+ (L2_loadrigp IntRegs:$Nt, u16_2Imm:$addr)>;
+def : InstAlias<"$Nt = memd({GP}+#$addr)",
+ (L2_loadrdgp DoubleRegs:$Nt, u16_3Imm:$addr)>;
+
+// Alias of: memXX($Rs+#XX) = $Rt to memXX($Rs) = $Rt
+def : InstAlias<"memb($Rs) = $Rt",
+ (S2_storerb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memh($Rs) = $Rt",
+ (S2_storerh_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memh($Rs) = $Rt.h",
+ (S2_storerf_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memw($Rs) = $Rt",
+ (S2_storeri_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memb($Rs) = $Rt.new",
+ (S2_storerbnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memh($Rs) = $Rt.new",
+ (S2_storerhnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memw($Rs) = $Rt.new",
+ (S2_storerinew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memb($Rs) = #$S8",
+ (S4_storeirb_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
+
+def : InstAlias<"memh($Rs) = #$S8",
+ (S4_storeirh_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
+
+def : InstAlias<"memw($Rs) = #$S8",
+ (S4_storeiri_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
+
+def : InstAlias<"memd($Rs) = $Rtt",
+ (S2_storerd_io IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"memb($Rs) = setbit(#$U5)",
+ (L4_ior_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memh($Rs) = setbit(#$U5)",
+ (L4_ior_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memw($Rs) = setbit(#$U5)",
+ (L4_ior_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memb($Rs) = clrbit(#$U5)",
+ (L4_iand_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memh($Rs) = clrbit(#$U5)",
+ (L4_iand_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memw($Rs) = clrbit(#$U5)",
+ (L4_iand_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+// Alias of: $Rd = memXX($Rs+#XX) to $Rd = memXX($Rs)
+def : InstAlias<"$Rd = memb($Rs)",
+ (L2_loadrb_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memub($Rs)",
+ (L2_loadrub_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memh($Rs)",
+ (L2_loadrh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memuh($Rs)",
+ (L2_loadruh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memw($Rs)",
+ (L2_loadri_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memd($Rs)",
+ (L2_loadrd_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memubh($Rs)",
+ (L2_loadbzw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memubh($Rs)",
+ (L2_loadbzw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = membh($Rs)",
+ (L2_loadbsw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = membh($Rs)",
+ (L2_loadbsw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memb_fifo($Rs)",
+ (L2_loadalignb_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memh_fifo($Rs)",
+ (L2_loadalignh_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+// Alias of: if ($Pt) $Rd = memXX($Rs + #$u6_X)
+// to: if ($Pt) $Rd = memXX($Rs)
+def : InstAlias<"if ($Pt) $Rd = memb($Rs)",
+ (L2_ploadrbt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memub($Rs)",
+ (L2_ploadrubt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memh($Rs)",
+ (L2_ploadrht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memuh($Rs)",
+ (L2_ploadruht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memw($Rs)",
+ (L2_ploadrit_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rdd = memd($Rs)",
+ (L2_ploadrdt_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+// Alias of: if ($Pt) memXX($Rs + #$u6_X) = $Rt
+// to: if ($Pt) memXX($Rs) = $Rt
+def : InstAlias<"if ($Pt) memb($Rs) = $Rt",
+ (S2_pstorerbt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = $Rt",
+ (S2_pstorerht_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = $Rt.h",
+ (S2_pstorerft_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memw($Rs) = $Rt",
+ (S2_pstorerit_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memd($Rs) = $Rtt",
+ (S2_pstorerdt_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"if ($Pt) memb($Rs) = $Rt.new",
+ (S2_pstorerbnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = $Rt.new",
+ (S2_pstorerhnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memw($Rs) = $Rt.new",
+ (S2_pstorerinewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt.new) memb($Rs) = $Rt.new",
+ (S4_pstorerbnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt.new) memh($Rs) = $Rt.new",
+ (S4_pstorerhnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt.new) memw($Rs) = $Rt.new",
+ (S4_pstorerinewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+
+// Alias of: if (!$Pt) $Rd = memXX($Rs + #$u6_X)
+// to: if (!$Pt) $Rd = memXX($Rs)
+def : InstAlias<"if (!$Pt) $Rd = memb($Rs)",
+ (L2_ploadrbf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memub($Rs)",
+ (L2_ploadrubf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memh($Rs)",
+ (L2_ploadrhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memuh($Rs)",
+ (L2_ploadruhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memw($Rs)",
+ (L2_ploadrif_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rdd = memd($Rs)",
+ (L2_ploadrdf_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+// Alias of: if (!$Pt) memXX($Rs + #$u6_X) = $Rt
+// to: if (!$Pt) memXX($Rs) = $Rt
+def : InstAlias<"if (!$Pt) memb($Rs) = $Rt",
+ (S2_pstorerbf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = $Rt",
+ (S2_pstorerhf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.h",
+ (S2_pstorerff_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memw($Rs) = $Rt",
+ (S2_pstorerif_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memd($Rs) = $Rtt",
+ (S2_pstorerdf_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"if (!$Pt) memb($Rs) = $Rt.new",
+ (S2_pstorerbnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.new",
+ (S2_pstorerhnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memw($Rs) = $Rt.new",
+ (S2_pstorerinewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt.new) memb($Rs) = $Rt.new",
+ (S4_pstorerbnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt.new) memh($Rs) = $Rt.new",
+ (S4_pstorerhnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt.new) memw($Rs) = $Rt.new",
+ (S4_pstorerinewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memb($Rs) = #$S6",
+ (S4_storeirbt_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = #$S6",
+ (S4_storeirht_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt) memw($Rs) = #$S6",
+ (S4_storeirit_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt.new) memb($Rs) = #$S6",
+ (S4_storeirbtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt.new) memh($Rs) = #$S6",
+ (S4_storeirhtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt.new) memw($Rs) = #$S6",
+ (S4_storeiritnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt) memb($Rs) = #$S6",
+ (S4_storeirbf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = #$S6",
+ (S4_storeirhf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt) memw($Rs) = #$S6",
+ (S4_storeirif_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt.new) memb($Rs) = #$S6",
+ (S4_storeirbfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt.new) memh($Rs) = #$S6",
+ (S4_storeirhfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt.new) memw($Rs) = #$S6",
+ (S4_storeirifnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+// Alias of: memXX($Rs + #$u6_X) |= $Rt, also &=, +=, -= (and += / -= with a #$U5 immediate)
+// to: memXX($Rs) |= $Rt
+def : InstAlias<"memb($Rs) &= $Rt",
+ (L4_and_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) |= $Rt",
+ (L4_or_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) += $Rt",
+ (L4_add_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) -= $Rt",
+ (L4_sub_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) += #$U5",
+ (L4_iadd_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) -= #$U5",
+ (L4_isub_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) &= $Rt",
+ (L4_and_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) |= $Rt",
+ (L4_or_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) += $Rt",
+ (L4_add_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) -= $Rt",
+ (L4_sub_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) += #$U5",
+ (L4_iadd_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) -= #$U5",
+ (L4_isub_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) &= $Rt",
+ (L4_and_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) |= $Rt",
+ (L4_or_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) += $Rt",
+ (L4_add_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) -= $Rt",
+ (L4_sub_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) += #$U5",
+ (L4_iadd_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) -= #$U5",
+ (L4_isub_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+//
+// Alias of: if ($Pv.new) memX($Rs) = $Rt (and the if (!$Pv.new) forms)
+// to: if ($Pv.new) memX($Rs + #0) = $Rt
+def : InstAlias<"if ($Pv.new) memb($Rs) = $Rt",
+ (S4_pstorerbtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt",
+ (S4_pstorerhtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt.h",
+ (S4_pstorerftnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memw($Rs) = $Rt",
+ (S4_pstoreritnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memd($Rs) = $Rtt",
+ (S4_pstorerdtnew_io
+ PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memb($Rs) = $Rt",
+ (S4_pstorerbfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt",
+ (S4_pstorerhfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt.h",
+ (S4_pstorerffnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memw($Rs) = $Rt",
+ (S4_pstorerifnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memd($Rs) = $Rtt",
+ (S4_pstorerdfnew_io
+ PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+//
+// Alias of: if ($Pt.new) $Rd = memub($Rs) -- and if (!$Pt.new) ..., likewise memb/memh/memuh/memw/memd
+// to: if ($Pt.new) $Rd = memub($Rs + #0)
+def : InstAlias<"if ($Pt.new) $Rd = memub($Rs)",
+ (L2_ploadrubtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memb($Rs)",
+ (L2_ploadrbtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memh($Rs)",
+ (L2_ploadrhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memuh($Rs)",
+ (L2_ploadruhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memw($Rs)",
+ (L2_ploadritnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rdd = memd($Rs)",
+ (L2_ploadrdtnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memub($Rs)",
+ (L2_ploadrubfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memb($Rs)",
+ (L2_ploadrbfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memh($Rs)",
+ (L2_ploadrhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memuh($Rs)",
+ (L2_ploadruhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memw($Rs)",
+ (L2_ploadrifnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rdd = memd($Rs)",
+ (L2_ploadrdfnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"dcfetch($Rs)",
+ (Y2_dcfetchbo IntRegs:$Rs, 0), 0>;
+
+// Alias of some insn mappings, others must be handled by the parser
+def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
+ (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
+ (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+
+// Rd=neg(Rs) is aliased to Rd=sub(#0,Rs)
+def : InstAlias<"$Rd = neg($Rs)",
+ (A2_subri IntRegs:$Rd, 0, IntRegs:$Rs), 0>;
+
+def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
+def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
+def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
+def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
+
+def : InstAlias<"$Pd = $Ps",
+ (C2_or PredRegs:$Pd, PredRegs:$Ps, PredRegs:$Ps), 0>;
+
+def : InstAlias<"$Rdd = vaddb($Rss, $Rtt)",
+ (A2_vaddub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 1>;
+
+def : InstAlias<"$Rdd = vsubb($Rss,$Rtt)",
+ (A2_vsubub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"$Rd = mpyui($Rs,$Rt)",
+ (M2_mpyi IntRegs:$Rd, IntRegs:$Rs, IntRegs:$Rt), 0>;
+
+// Assembler mapped insns: cmp.lt(a,b) -> cmp.gt(b,a). NOTE(review): these two aliases repeat the cmp.lt/cmp.ltu aliases defined earlier in this file.
+def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
+ (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
+ (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+
+// maps if (!Pu) jumpr Rs -> if (!Pu) jumpr:nt Rs
+def : InstAlias<"if (!$Pu) jumpr $Rs",
+ (J2_jumprf PredRegs:$Pu, IntRegs:$Rs)>,
+ Requires<[HasV60T]>;
+
+// maps if (Pu) jumpr Rs -> if (Pu) jumpr:nt Rs
+def : InstAlias<"if ($Pu) jumpr $Rs",
+ (J2_jumprt PredRegs:$Pu, IntRegs:$Rs)>,
+ Requires<[HasV60T]>;
+
+// maps if (!Pu) jump $r15_2 -> if (!Pu) jump:nt $r15_2
+def : InstAlias<"if (!$Pu) jump $r15_2",
+ (J2_jumpf PredRegs:$Pu, brtarget:$r15_2)>,
+ Requires<[HasV60T]>;
+
+// maps if (Pu) jump $r15_2 -> if (Pu) jump:nt $r15_2
+def : InstAlias<"if ($Pu) jump $r15_2",
+ (J2_jumpt PredRegs:$Pu, brtarget:$r15_2)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($src) jump $r15_2",
+ (J2_jumpt PredRegs:$src, brtarget:$r15_2), 0>;
+
+def : InstAlias<"if (!$src) jump $r15_2",
+ (J2_jumpf PredRegs:$src, brtarget:$r15_2), 0>;
+
+def : InstAlias<"if ($src1) jumpr $src2",
+ (J2_jumprt PredRegs:$src1, IntRegs:$src2), 0>;
+
+def : InstAlias<"if (!$src1) jumpr $src2",
+ (J2_jumprf PredRegs:$src1, IntRegs:$src2), 0>;
+
+// maps Vdd = Vss to Vdd = V6_vassignp(Vss)
+def : InstAlias<"$Vdd = $Vss",
+ (V6_vassignp VecDblRegs:$Vdd, VecDblRegs:$Vss)>,
+ Requires<[HasV60T]>;
+
+// maps Vd = #0 to Vd = vxor(Vd, Vd)
+def : InstAlias<"$Vd = #0",
+ (V6_vxor VectorRegs:$Vd, VectorRegs:$Vd, VectorRegs:$Vd)>,
+ Requires<[HasV60T]>;
+
+// maps Vdd = #0 to Vdd = vsub(Vdd, Vdd)
+def : InstAlias<"$Vdd = #0",
+ (V6_vsubw_dv VecDblRegs:$Vdd, VecDblRegs:$Vdd, VecDblRegs:$Vdd)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd = vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd = vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd = vcmp.eq($Vu.uh, $Vv.uh)",
+ (V6_veqh VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd &= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd &= vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd &= vcmp.eq($Vu.uh, $Vv.uh)",
+ (V6_veqh_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd |= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd |= vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd |= vcmp.eq($Vu.uh, $Vv.uh)",
+ (V6_veqh_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd ^= vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)",
+ (V6_veqh_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd = vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd = vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd = vcmp.eq($Vu.uw, $Vv.uw)",
+ (V6_veqw VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd &= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd &= vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd &= vcmp.eq($Vu.uw, $Vv.uw)",
+ (V6_veqw_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd |= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd |= vcmp.eq($Vu.w, $Vv.w)" (word compare, hence V6_veqw_or, not the halfword V6_veqh_or)
+def : InstAlias<"$Qd |= vcmp.eq($Vu.uw, $Vv.uw)",
+ (V6_veqw_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd ^= vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)",
+ (V6_veqw_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd = vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd = vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd = vcmp.eq($Vu.ub, $Vv.ub)",
+ (V6_veqb VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd &= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd &= vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd &= vcmp.eq($Vu.ub, $Vv.ub)",
+ (V6_veqb_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd |= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd |= vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd |= vcmp.eq($Vu.ub, $Vv.ub)",
+ (V6_veqb_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd ^= vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)",
+ (V6_veqb_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Rd.w = vextract($Vu, $Rs)" -> "$Rd = vextract($Vu, $Rs)"
+def : InstAlias<"$Rd.w = vextract($Vu, $Rs)",
+ (V6_extractw IntRegs:$Rd, VectorRegs:$Vu, IntRegs:$Rs)>,
+ Requires<[HasV60T]>;
+
+// Mapping from vtrans2x2(Vy32,Vx32,Rt32) to vshuff(Vy32,Vx32,Rt32)
+def : InstAlias<"vtrans2x2($Vy, $Vx, $Rt)",
+ (V6_vshuff VectorRegs:$Vy, VectorRegs:$Vx, IntRegs:$Rt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"$Vt=vmem($Rs)",
+ (V6_vL32b_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"$Vt=vmem($Rs):nt",
+ (V6_vL32b_nt_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs)=$Vt",
+ (V6_vS32b_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs)=$Vt.new",
+ (V6_vS32b_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs):nt=$Vt.new",
+ (V6_vS32b_nt_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Qv) vmem($Rs)=$Vt",
+ (V6_vS32b_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Qv) vmem($Rs)=$Vt",
+ (V6_vS32b_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Qv) vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Qv) vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Pv) vmem($Rs)=$Vt",
+ (V6_vS32b_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Pv) vmem($Rs)=$Vt",
+ (V6_vS32b_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Pv) vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Pv) vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"$Vt=vmemu($Rs)",
+ (V6_vL32Ub_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmemu($Rs)=$Vt",
+ (V6_vS32Ub_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Pv) vmemu($Rs)=$Vt",
+ (V6_vS32Ub_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Pv) vmemu($Rs)=$Vt",
+ (V6_vS32Ub_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td
new file mode 100644
index 000000000000..280832fd167f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td
@@ -0,0 +1,1019 @@
+class Enc_COPROC_VX_3op_v<bits<15> opc> : OpcodeHexagon {
+ bits<5> dst; // placed in Inst{4-0}
+ bits<5> src1; // placed in Inst{12-8}
+ bits<5> src2; // placed in Inst{20-16}
+
+ let Inst{31-16} = { opc{14-4}, src2{4-0} }; // 11 opcode bits, then src2
+ let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} }; // remaining opcode bits interleaved with src1/dst
+}
+
+class V6_vtmpyb_enc : Enc_COPROC_VX_3op_v<0b000110010000000>;
+class V6_vtmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000001>;
+class V6_vdmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110010000010>;
+class V6_vrmpyub_enc : Enc_COPROC_VX_3op_v<0b000110010000011>;
+class V6_vrmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000100>;
+class V6_vdsaduh_enc : Enc_COPROC_VX_3op_v<0b000110010000101>;
+class V6_vdmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000110>;
+class V6_vdmpybus_dv_enc : Enc_COPROC_VX_3op_v<0b000110010000111>;
+class V6_vtmpyb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001000>;
+class V6_vtmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001001>;
+class V6_vtmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001010>;
+class V6_vdmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001011>;
+class V6_vrmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001100>;
+class V6_vrmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001101>;
+class V6_vdmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001110>;
+class V6_vdmpybus_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001111>;
+class V6_vdmpyhsusat_enc : Enc_COPROC_VX_3op_v<0b000110010010000>;
+class V6_vdmpyhsuisat_enc : Enc_COPROC_VX_3op_v<0b000110010010001>;
+class V6_vdmpyhsat_enc : Enc_COPROC_VX_3op_v<0b000110010010010>;
+class V6_vdmpyhisat_enc : Enc_COPROC_VX_3op_v<0b000110010010011>;
+class V6_vdmpyhb_dv_enc : Enc_COPROC_VX_3op_v<0b000110010010100>;
+class V6_vmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010010101>;
+class V6_vmpabus_enc : Enc_COPROC_VX_3op_v<0b000110010010110>;
+class V6_vmpahb_enc : Enc_COPROC_VX_3op_v<0b000110010010111>;
+class V6_vdmpyhsusat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011000>;
+class V6_vdmpyhsuisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011001>;
+class V6_vdmpyhisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011010>;
+class V6_vdmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011011>;
+class V6_vdmpyhb_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011100>;
+class V6_vmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011101>;
+class V6_vmpabus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011110>;
+class V6_vmpahb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011111>;
+class V6_vmpyh_enc : Enc_COPROC_VX_3op_v<0b000110010100000>;
+class V6_vmpyhss_enc : Enc_COPROC_VX_3op_v<0b000110010100001>;
+class V6_vmpyhsrs_enc : Enc_COPROC_VX_3op_v<0b000110010100010>;
+class V6_vmpyuh_enc : Enc_COPROC_VX_3op_v<0b000110010100011>;
+class V6_vmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101000>;
+class V6_vmpyuh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101001>;
+class V6_vmpyiwb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101010>;
+class V6_vmpyiwh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101011>;
+class V6_vmpyihb_enc : Enc_COPROC_VX_3op_v<0b000110010110000>;
+class V6_vror_enc : Enc_COPROC_VX_3op_v<0b000110010110001>;
+class V6_vasrw_enc : Enc_COPROC_VX_3op_v<0b000110010110101>;
+class V6_vasrh_enc : Enc_COPROC_VX_3op_v<0b000110010110110>;
+class V6_vaslw_enc : Enc_COPROC_VX_3op_v<0b000110010110111>;
+class V6_vdsaduh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111000>;
+class V6_vmpyihb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111001>;
+class V6_vaslw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111010>;
+class V6_vasrw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111101>;
+class V6_vaslh_enc : Enc_COPROC_VX_3op_v<0b000110011000000>;
+class V6_vlsrw_enc : Enc_COPROC_VX_3op_v<0b000110011000001>;
+class V6_vlsrh_enc : Enc_COPROC_VX_3op_v<0b000110011000010>;
+class V6_vmpyiwh_enc : Enc_COPROC_VX_3op_v<0b000110011000111>;
+class V6_vmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110011001000>;
+class V6_vmpyiwb_enc : Enc_COPROC_VX_3op_v<0b000110011010000>;
+class V6_vtmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110011010100>;
+class V6_vmpyub_enc : Enc_COPROC_VX_3op_v<0b000110011100000>;
+class V6_vrmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000000>;
+class V6_vrmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000001>;
+class V6_vrmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000010>;
+class V6_vdmpyhvsat_enc : Enc_COPROC_VX_3op_v<0b000111000000011>;
+class V6_vmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000100>;
+class V6_vmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000101>;
+class V6_vmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000110>;
+class V6_vmpyhv_enc : Enc_COPROC_VX_3op_v<0b000111000000111>;
+class V6_vrmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001000>;
+class V6_vrmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001001>;
+class V6_vrmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001010>;
+class V6_vdmpyhvsat_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001011>;
+class V6_vmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001100>;
+class V6_vmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001101>;
+class V6_vmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001110>;
+class V6_vmpyhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001111>;
+class V6_vmpyuhv_enc : Enc_COPROC_VX_3op_v<0b000111000010000>;
+class V6_vmpyhvsrs_enc : Enc_COPROC_VX_3op_v<0b000111000010001>;
+class V6_vmpyhus_enc : Enc_COPROC_VX_3op_v<0b000111000010010>;
+class V6_vmpabusv_enc : Enc_COPROC_VX_3op_v<0b000111000010011>;
+class V6_vmpyih_enc : Enc_COPROC_VX_3op_v<0b000111000010100>;
+class V6_vand_enc : Enc_COPROC_VX_3op_v<0b000111000010101>;
+class V6_vor_enc : Enc_COPROC_VX_3op_v<0b000111000010110>;
+class V6_vxor_enc : Enc_COPROC_VX_3op_v<0b000111000010111>;
+class V6_vmpyuhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011000>;
+class V6_vmpyhus_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011001>;
+class V6_vmpyih_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011100>;
+class V6_vmpyiewuh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011101>;
+class V6_vmpyowh_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011110>;
+class V6_vmpyowh_rnd_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011111>;
+class V6_vaddw_enc : Enc_COPROC_VX_3op_v<0b000111000100000>;
+class V6_vaddubsat_enc : Enc_COPROC_VX_3op_v<0b000111000100001>;
+class V6_vadduhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100010>;
+class V6_vaddhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100011>;
+class V6_vaddwsat_enc : Enc_COPROC_VX_3op_v<0b000111000100100>;
+class V6_vsubb_enc : Enc_COPROC_VX_3op_v<0b000111000100101>;
+class V6_vsubh_enc : Enc_COPROC_VX_3op_v<0b000111000100110>;
+class V6_vsubw_enc : Enc_COPROC_VX_3op_v<0b000111000100111>;
+class V6_vmpyiewh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000101000>;
+class V6_vsububsat_enc : Enc_COPROC_VX_3op_v<0b000111000110000>;
+class V6_vsubuhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110001>;
+class V6_vsubhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110010>;
+class V6_vsubwsat_enc : Enc_COPROC_VX_3op_v<0b000111000110011>;
+class V6_vaddb_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110100>;
+class V6_vaddh_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110101>;
+class V6_vaddw_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110110>;
+class V6_vaddubsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110111>;
+class V6_vadduhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000000>;
+class V6_vaddhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000001>;
+class V6_vaddwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000010>;
+class V6_vsubb_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000011>;
+class V6_vsubh_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000100>;
+class V6_vsubw_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000101>;
+class V6_vsububsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000110>;
+class V6_vsubuhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000111>;
+class V6_vsubhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010000>;
+class V6_vsubwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010001>;
+class V6_vaddubh_enc : Enc_COPROC_VX_3op_v<0b000111001010010>;
+class V6_vadduhw_enc : Enc_COPROC_VX_3op_v<0b000111001010011>;
+class V6_vaddhw_enc : Enc_COPROC_VX_3op_v<0b000111001010100>;
+class V6_vsububh_enc : Enc_COPROC_VX_3op_v<0b000111001010101>;
+class V6_vsubuhw_enc : Enc_COPROC_VX_3op_v<0b000111001010110>;
+class V6_vsubhw_enc : Enc_COPROC_VX_3op_v<0b000111001010111>;
+class V6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b000111001100000>;
+class V6_vabsdiffh_enc : Enc_COPROC_VX_3op_v<0b000111001100001>;
+class V6_vabsdiffuh_enc : Enc_COPROC_VX_3op_v<0b000111001100010>;
+class V6_vabsdiffw_enc : Enc_COPROC_VX_3op_v<0b000111001100011>;
+class V6_vavgub_enc : Enc_COPROC_VX_3op_v<0b000111001100100>;
+class V6_vavguh_enc : Enc_COPROC_VX_3op_v<0b000111001100101>;
+class V6_vavgh_enc : Enc_COPROC_VX_3op_v<0b000111001100110>;
+class V6_vavgw_enc : Enc_COPROC_VX_3op_v<0b000111001100111>;
+class V6_vnavgub_enc : Enc_COPROC_VX_3op_v<0b000111001110000>;
+class V6_vnavgh_enc : Enc_COPROC_VX_3op_v<0b000111001110001>;
+class V6_vnavgw_enc : Enc_COPROC_VX_3op_v<0b000111001110010>;
+class V6_vavgubrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110011>;
+class V6_vavguhrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110100>;
+class V6_vavghrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110101>;
+class V6_vavgwrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110110>;
+class V6_vmpabuuv_enc : Enc_COPROC_VX_3op_v<0b000111001110111>;
+class V6_vminub_enc : Enc_COPROC_VX_3op_v<0b000111110000001>;
+class V6_vminuh_enc : Enc_COPROC_VX_3op_v<0b000111110000010>;
+class V6_vminh_enc : Enc_COPROC_VX_3op_v<0b000111110000011>;
+class V6_vminw_enc : Enc_COPROC_VX_3op_v<0b000111110000100>;
+class V6_vmaxub_enc : Enc_COPROC_VX_3op_v<0b000111110000101>;
+class V6_vmaxuh_enc : Enc_COPROC_VX_3op_v<0b000111110000110>;
+class V6_vmaxh_enc : Enc_COPROC_VX_3op_v<0b000111110000111>;
+class V6_vmaxw_enc : Enc_COPROC_VX_3op_v<0b000111110010000>;
+class V6_vdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010001>;
+class V6_vrdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010011>;
+class V6_vdealb4w_enc : Enc_COPROC_VX_3op_v<0b000111110010111>;
+class V6_vmpyowh_rnd_enc : Enc_COPROC_VX_3op_v<0b000111110100000>;
+class V6_vshuffeb_enc : Enc_COPROC_VX_3op_v<0b000111110100001>;
+class V6_vshuffob_enc : Enc_COPROC_VX_3op_v<0b000111110100010>;
+class V6_vshufeh_enc : Enc_COPROC_VX_3op_v<0b000111110100011>;
+class V6_vshufoh_enc : Enc_COPROC_VX_3op_v<0b000111110100100>;
+class V6_vshufoeh_enc : Enc_COPROC_VX_3op_v<0b000111110100101>;
+class V6_vshufoeb_enc : Enc_COPROC_VX_3op_v<0b000111110100110>;
+class V6_vcombine_enc : Enc_COPROC_VX_3op_v<0b000111110100111>;
+class V6_vmpyieoh_enc : Enc_COPROC_VX_3op_v<0b000111110110000>;
+class V6_vsathub_enc : Enc_COPROC_VX_3op_v<0b000111110110010>;
+class V6_vsatwh_enc : Enc_COPROC_VX_3op_v<0b000111110110011>;
+class V6_vroundwh_enc : Enc_COPROC_VX_3op_v<0b000111110110100>;
+class V6_vroundwuh_enc : Enc_COPROC_VX_3op_v<0b000111110110101>;
+class V6_vroundhb_enc : Enc_COPROC_VX_3op_v<0b000111110110110>;
+class V6_vroundhub_enc : Enc_COPROC_VX_3op_v<0b000111110110111>;
+class V6_vasrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010000>;
+class V6_vlsrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010001>;
+class V6_vlsrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010010>;
+class V6_vasrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010011>;
+class V6_vaslwv_enc : Enc_COPROC_VX_3op_v<0b000111111010100>;
+class V6_vaslhv_enc : Enc_COPROC_VX_3op_v<0b000111111010101>;
+class V6_vaddb_enc : Enc_COPROC_VX_3op_v<0b000111111010110>;
+class V6_vaddh_enc : Enc_COPROC_VX_3op_v<0b000111111010111>;
+class V6_vmpyiewuh_enc : Enc_COPROC_VX_3op_v<0b000111111100000>;
+class V6_vmpyiowh_enc : Enc_COPROC_VX_3op_v<0b000111111100001>;
+class V6_vpackeb_enc : Enc_COPROC_VX_3op_v<0b000111111100010>;
+class V6_vpackeh_enc : Enc_COPROC_VX_3op_v<0b000111111100011>;
+class V6_vpackhub_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100101>;
+class V6_vpackhb_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100110>;
+class V6_vpackwuh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100111>;
+class V6_vpackwh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111110000>;
+class V6_vpackob_enc : Enc_COPROC_VX_3op_v<0b000111111110001>;
+class V6_vpackoh_enc : Enc_COPROC_VX_3op_v<0b000111111110010>;
+class V6_vmpyewuh_enc : Enc_COPROC_VX_3op_v<0b000111111110101>;
+class V6_vmpyowh_enc : Enc_COPROC_VX_3op_v<0b000111111110111>;
+class V6_extractw_enc : Enc_COPROC_VX_3op_v<0b100100100000001>;
+class M6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b111010001010000>;
+class M6_vabsdiffb_enc : Enc_COPROC_VX_3op_v<0b111010001110000>;
+
+class Enc_COPROC_VX_cmp<bits<13> opc> : OpcodeHexagon {
+ bits<2> dst;
+ bits<5> src1;
+ bits<5> src2;
+
+ let Inst{31-16} = { 0b00011, opc{12-7}, src2{4-0} };
+ let Inst{13-0} = { opc{6}, src1{4-0}, opc{5-0}, dst{1-0} };
+}
+
+class V6_vandvrt_acc_enc : Enc_COPROC_VX_cmp<0b0010111100000>;
+class V6_vandvrt_enc : Enc_COPROC_VX_cmp<0b0011010010010>;
+class V6_veqb_and_enc : Enc_COPROC_VX_cmp<0b1001001000000>;
+class V6_veqh_and_enc : Enc_COPROC_VX_cmp<0b1001001000001>;
+class V6_veqw_and_enc : Enc_COPROC_VX_cmp<0b1001001000010>;
+class V6_vgtb_and_enc : Enc_COPROC_VX_cmp<0b1001001000100>;
+class V6_vgth_and_enc : Enc_COPROC_VX_cmp<0b1001001000101>;
+class V6_vgtw_and_enc : Enc_COPROC_VX_cmp<0b1001001000110>;
+class V6_vgtub_and_enc : Enc_COPROC_VX_cmp<0b1001001001000>;
+class V6_vgtuh_and_enc : Enc_COPROC_VX_cmp<0b1001001001001>;
+class V6_vgtuw_and_enc : Enc_COPROC_VX_cmp<0b1001001001010>;
+class V6_veqb_or_enc : Enc_COPROC_VX_cmp<0b1001001010000>;
+class V6_veqh_or_enc : Enc_COPROC_VX_cmp<0b1001001010001>;
+class V6_veqw_or_enc : Enc_COPROC_VX_cmp<0b1001001010010>;
+class V6_vgtb_or_enc : Enc_COPROC_VX_cmp<0b1001001010100>;
+class V6_vgth_or_enc : Enc_COPROC_VX_cmp<0b1001001010101>;
+class V6_vgtw_or_enc : Enc_COPROC_VX_cmp<0b1001001010110>;
+class V6_vgtub_or_enc : Enc_COPROC_VX_cmp<0b1001001011000>;
+class V6_vgtuh_or_enc : Enc_COPROC_VX_cmp<0b1001001011001>;
+class V6_vgtuw_or_enc : Enc_COPROC_VX_cmp<0b1001001011010>;
+class V6_veqb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100000>;
+class V6_veqh_xor_enc : Enc_COPROC_VX_cmp<0b1001001100001>;
+class V6_veqw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100010>;
+class V6_vgtb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100100>;
+class V6_vgth_xor_enc : Enc_COPROC_VX_cmp<0b1001001100101>;
+class V6_vgtw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100110>;
+class V6_vgtub_xor_enc : Enc_COPROC_VX_cmp<0b1001001101000>;
+class V6_vgtuh_xor_enc : Enc_COPROC_VX_cmp<0b1001001101001>;
+class V6_vgtuw_xor_enc : Enc_COPROC_VX_cmp<0b1001001101010>;
+class V6_veqb_enc : Enc_COPROC_VX_cmp<0b1111000000000>;
+class V6_veqh_enc : Enc_COPROC_VX_cmp<0b1111000000001>;
+class V6_veqw_enc : Enc_COPROC_VX_cmp<0b1111000000010>;
+class V6_vgtb_enc : Enc_COPROC_VX_cmp<0b1111000000100>;
+class V6_vgth_enc : Enc_COPROC_VX_cmp<0b1111000000101>;
+class V6_vgtw_enc : Enc_COPROC_VX_cmp<0b1111000000110>;
+class V6_vgtub_enc : Enc_COPROC_VX_cmp<0b1111000001000>;
+class V6_vgtuh_enc : Enc_COPROC_VX_cmp<0b1111000001001>;
+class V6_vgtuw_enc : Enc_COPROC_VX_cmp<0b1111000001010>;
+
+class Enc_COPROC_VX_p2op<bits<5> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> dst;
+ bits<5> src2;
+
+ let Inst{31-16} = { 0b00011110, src1{1-0}, 0b0000, opc{4-3} };
+ let Inst{13-0} = { 1, src2{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vaddbq_enc : Enc_COPROC_VX_p2op<0b01000>;
+class V6_vaddhq_enc : Enc_COPROC_VX_p2op<0b01001>;
+class V6_vaddwq_enc : Enc_COPROC_VX_p2op<0b01010>;
+class V6_vaddbnq_enc : Enc_COPROC_VX_p2op<0b01011>;
+class V6_vaddhnq_enc : Enc_COPROC_VX_p2op<0b01100>;
+class V6_vaddwnq_enc : Enc_COPROC_VX_p2op<0b01101>;
+class V6_vsubbq_enc : Enc_COPROC_VX_p2op<0b01110>;
+class V6_vsubhq_enc : Enc_COPROC_VX_p2op<0b01111>;
+class V6_vsubwq_enc : Enc_COPROC_VX_p2op<0b10000>;
+class V6_vsubbnq_enc : Enc_COPROC_VX_p2op<0b10001>;
+class V6_vsubhnq_enc : Enc_COPROC_VX_p2op<0b10010>;
+class V6_vsubwnq_enc : Enc_COPROC_VX_p2op<0b10011>;
+
+class Enc_COPROC_VX_2op<bits<6> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+
+ let Inst{31-16} = { 0b00011110000000, opc{5-4} };
+ let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vabsh_enc : Enc_COPROC_VX_2op<0b000000>;
+class V6_vabsh_sat_enc : Enc_COPROC_VX_2op<0b000001>;
+class V6_vabsw_enc : Enc_COPROC_VX_2op<0b000010>;
+class V6_vabsw_sat_enc : Enc_COPROC_VX_2op<0b000011>;
+class V6_vnot_enc : Enc_COPROC_VX_2op<0b000100>;
+class V6_vdealh_enc : Enc_COPROC_VX_2op<0b000110>;
+class V6_vdealb_enc : Enc_COPROC_VX_2op<0b000111>;
+class V6_vunpackob_enc : Enc_COPROC_VX_2op<0b001000>;
+class V6_vunpackoh_enc : Enc_COPROC_VX_2op<0b001001>;
+class V6_vunpackub_enc : Enc_COPROC_VX_2op<0b010000>;
+class V6_vunpackuh_enc : Enc_COPROC_VX_2op<0b010001>;
+class V6_vunpackb_enc : Enc_COPROC_VX_2op<0b010010>;
+class V6_vunpackh_enc : Enc_COPROC_VX_2op<0b010011>;
+class V6_vshuffh_enc : Enc_COPROC_VX_2op<0b010111>;
+class V6_vshuffb_enc : Enc_COPROC_VX_2op<0b100000>;
+class V6_vzb_enc : Enc_COPROC_VX_2op<0b100001>;
+class V6_vzh_enc : Enc_COPROC_VX_2op<0b100010>;
+class V6_vsb_enc : Enc_COPROC_VX_2op<0b100011>;
+class V6_vsh_enc : Enc_COPROC_VX_2op<0b100100>;
+class V6_vcl0w_enc : Enc_COPROC_VX_2op<0b100101>;
+class V6_vpopcounth_enc : Enc_COPROC_VX_2op<0b100110>;
+class V6_vcl0h_enc : Enc_COPROC_VX_2op<0b100111>;
+class V6_vnormamtw_enc : Enc_COPROC_VX_2op<0b110100>;
+class V6_vnormamth_enc : Enc_COPROC_VX_2op<0b110101>;
+class V6_vassign_enc : Enc_COPROC_VX_2op<0b111111>;
+
+class Enc_COPROC_VMEM_vL32_b_ai<bits<4> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<10> src2;
+ bits<4> src2_vector;
+
+ let src2_vector = src2{9-6};
+ let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0000>;
+class V6_vL32b_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0001>;
+class V6_vL32b_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0010>;
+class V6_vL32Ub_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0111>;
+class V6_vL32b_nt_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1000>;
+class V6_vL32b_nt_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1001>;
+class V6_vL32b_nt_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1010>;
+
+class Enc_COPROC_VMEM_vL32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<11> src2;
+ bits<4> src2_vector;
+
+ let src2_vector = src2{10-7};
+ let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0000>;
+class V6_vL32b_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0001>;
+class V6_vL32b_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0010>;
+class V6_vL32Ub_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0111>;
+class V6_vL32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1000>;
+class V6_vL32b_nt_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1001>;
+class V6_vL32b_nt_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1010>;
+
+class Enc_COPROC_VMEM_vS32_b_ai_64B<bits<4> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<10> src2;
+ bits<4> src2_vector;
+ bits<5> src3;
+
+ let src2_vector = src2{9-6};
+ let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class Enc_COPROC_VMEM_vS32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<11> src2;
+ bits<4> src2_vector;
+ bits<5> src3;
+
+ let src2_vector = src2{10-7};
+ let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class V6_vS32b_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0000>;
+class V6_vS32Ub_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0111>;
+class V6_vS32b_nt_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b1000>;
+
+class V6_vS32b_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0000>;
+class V6_vS32Ub_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0111>;
+class V6_vS32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b1000>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<bits<1> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<10> src2;
+ bits<4> src2_vector;
+ bits<3> src3;
+
+ let src2_vector = src2{9-6};
+ let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<0>;
+class V6_vS32b_nt_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<1>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<bits<1> opc> : OpcodeHexagon { // Same layout as the _64B class with an 11-bit src2.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<11> src2; // Only src2{10-7} reaches the encoding.
+ bits<4> src2_vector; // Helper field holding the encoded part of src2.
+ bits<3> src3; // Encoded in Inst{2-0}.
+
+ let src2_vector = src2{10-7}; // Drops src2{6-0}.
+ let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} }; // Inst{22} = opc, Inst{21} = 1.
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} }; // Inst{7-3} is the constant 0b00100.
+}
+
+class V6_vS32b_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<0>; // opc = 0.
+class V6_vS32b_nt_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<1>; // opc = 1 (the nt variant).
+
+class Enc_COPROC_VMEM_vS32_b_pred_ai<bits<5> opc> : OpcodeHexagon { // Shared encoding with an extra 2-bit src1 in Inst{12-11}.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<10> src3; // Only src3{9-6} reaches the encoding.
+ bits<4> src3_vector; // Helper field holding the encoded part of src3.
+ bits<5> src4; // Encoded in Inst{4-0}.
+
+ let src3_vector = src3{9-6}; // Drops src3{5-0}.
+ let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} }; // opc{4-3} -> Inst{22-21}.
+ let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; // src3_vector is split across Inst{13} and Inst{10-8}; opc{2-0} -> Inst{7-5}.
+}
+
+class Enc_COPROC_VMEM_vS32_b_pred_ai_128B<bits<5> opc> : OpcodeHexagon { // Same layout as Enc_COPROC_VMEM_vS32_b_pred_ai with an 11-bit src3.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<11> src3; // Only src3{10-7} reaches the encoding.
+ bits<4> src3_vector; // Helper field holding the encoded part of src3.
+ bits<5> src4; // Encoded in Inst{4-0}.
+
+ let src3_vector = src3{10-7}; // Drops src3{6-0}.
+ let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} }; // opc{4-3} -> Inst{22-21}.
+ let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; // src3_vector is split across Inst{13} and Inst{10-8}; opc{2-0} -> Inst{7-5}.
+}
+
+class V6_vS32b_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00000>; // opc{4-3} -> Inst{22-21}, opc{2-0} -> Inst{7-5} in the base class.
+class V6_vS32b_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00001>;
+class V6_vS32b_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01000>;
+class V6_vS32b_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01001>;
+class V6_vS32Ub_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01110>;
+class V6_vS32Ub_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01111>;
+class V6_vS32b_nt_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10000>; // The nt_* entries repeat the first four opcs with opc{4} set.
+class V6_vS32b_nt_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10001>;
+class V6_vS32b_nt_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11000>;
+class V6_vS32b_nt_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11001>;
+
+class V6_vS32b_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00000>; // Same opc values as the non-128B run, applied to the _128B base class.
+class V6_vS32b_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00001>;
+class V6_vS32b_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01000>;
+class V6_vS32b_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01001>;
+class V6_vS32Ub_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01110>;
+class V6_vS32Ub_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01111>;
+class V6_vS32b_nt_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10000>; // The nt_* entries repeat the first four opcs with opc{4} set.
+class V6_vS32b_nt_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10001>;
+class V6_vS32b_nt_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11000>;
+class V6_vS32b_nt_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11001>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<bits<4> opc> : OpcodeHexagon { // Shared encoding with a 2-bit src1 and a 3-bit src4.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<10> src3; // Only src3{9-6} reaches the encoding.
+ bits<4> src3_vector; // Helper field holding the encoded part of src3.
+ bits<3> src4; // Encoded in Inst{2-0}.
+
+ let src3_vector = src3{9-6}; // Drops src3{5-0}.
+ let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} }; // Inst{22} = opc{3}, Inst{21} = 1.
+ let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; // Inst{7-6} = 0b01; opc{2-0} -> Inst{5-3}.
+}
+
+class V6_vS32b_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0000>; // Four opc values are used: 0b0000, 0b0101, 0b1010, 0b1111.
+class V6_vS32b_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0101>;
+class V6_vS32b_nt_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1010>;
+class V6_vS32b_nt_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1111>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<bits<4> opc> : OpcodeHexagon { // Same layout as Enc_COPROC_VMEM_vS32b_n_ew_pred_ai with an 11-bit src3.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<11> src3; // Only src3{10-7} reaches the encoding.
+ bits<4> src3_vector; // Helper field holding the encoded part of src3.
+ bits<3> src4; // Encoded in Inst{2-0}.
+
+ let src3_vector = src3{10-7}; // Drops src3{6-0}.
+ let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} }; // Inst{22} = opc{3}, Inst{21} = 1.
+ let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; // Inst{7-6} = 0b01; opc{2-0} -> Inst{5-3}.
+}
+
+class V6_vS32b_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0000>; // Four opc values are used: 0b0000, 0b0101, 0b1010, 0b1111.
+class V6_vS32b_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0101>;
+class V6_vS32b_nt_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1010>;
+class V6_vS32b_nt_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1111>;
+
+// TODO: Change script to generate dst, src1, src2 instead of
+// dst, dst2, src1.
+class Enc_COPROC_VMEM_vL32_b_pi<bits<4> opc> : OpcodeHexagon { // Shared encoding with a dst field and a 3-bit encoded immediate.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<9> src2; // Only src2{8-6} reaches the encoding.
+ bits<3> src2_vector; // Helper field holding the encoded part of src2.
+
+ let src2_vector = src2{8-6}; // Drops src2{5-0}.
+ let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} }; // Inst{22} = opc{3}, Inst{21} = 0.
+ let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} }; // Inst{13-11} = 0; src2_vector -> Inst{10-8}; opc{2-0} -> Inst{7-5}.
+}
+
+class V6_vL32b_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0000>; // One class per instruction; the argument is opc.
+class V6_vL32b_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0001>;
+class V6_vL32b_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0010>;
+class V6_vL32Ub_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0111>;
+class V6_vL32b_nt_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1000>;
+class V6_vL32b_nt_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1001>;
+class V6_vL32b_nt_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1010>;
+
+class Enc_COPROC_VMEM_vL32_b_pi_128B<bits<4> opc> : OpcodeHexagon { // Same layout as Enc_COPROC_VMEM_vL32_b_pi with a 10-bit src2.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<10> src2; // Only src2{9-7} reaches the encoding.
+ bits<3> src2_vector; // Helper field holding the encoded part of src2.
+
+ let src2_vector = src2{9-7}; // Drops src2{6-0}.
+ let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} }; // Inst{22} = opc{3}, Inst{21} = 0.
+ let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} }; // Inst{13-11} = 0; src2_vector -> Inst{10-8}; opc{2-0} -> Inst{7-5}.
+}
+
+class V6_vL32b_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0000>; // One class per instruction; the argument is opc.
+class V6_vL32b_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0001>;
+class V6_vL32b_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0010>;
+class V6_vL32Ub_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0111>;
+class V6_vL32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1000>;
+class V6_vL32b_nt_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1001>;
+class V6_vL32b_nt_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1010>;
+
+
+// TODO: Change script to generate src1, src2 and src3 instead of
+// dst, src1, src2.
+class Enc_COPROC_VMEM_vS32_b_pi<bits<4> opc> : OpcodeHexagon { // Shared encoding; note that only Inst{10-0} of the low half is assigned.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<9> src2; // Only src2{8-6} reaches the encoding.
+ bits<3> src2_vector; // Helper field holding the encoded part of src2.
+ bits<5> src3; // Encoded in Inst{4-0}.
+
+ let src2_vector = src2{8-6}; // Drops src2{5-0}.
+ let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} }; // Inst{22} = opc{3}, Inst{21} = 1.
+ let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} }; // Inst{13-11} are left unassigned here, unlike Enc_COPROC_VMEM_vL32_b_pi which zeroes them. NOTE(review): confirm this is intended.
+}
+
+class V6_vS32b_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0000>; // One class per instruction; the argument is opc.
+class V6_vS32Ub_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0111>;
+class V6_vS32b_nt_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b1000>;
+
+class Enc_COPROC_VMEM_vS32_b_pi_128B<bits<4> opc> : OpcodeHexagon { // Same layout as Enc_COPROC_VMEM_vS32_b_pi with a 10-bit src2.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<10> src2; // Only src2{9-7} reaches the encoding.
+ bits<3> src2_vector; // Helper field holding the encoded part of src2.
+ bits<5> src3; // Encoded in Inst{4-0}.
+
+ let src2_vector = src2{9-7}; // Drops src2{6-0}.
+ let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} }; // Inst{22} = opc{3}, Inst{21} = 1.
+ let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} }; // Inst{13-11} are left unassigned here. NOTE(review): confirm this is intended.
+}
+
+class V6_vS32b_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0000>; // One class per instruction; the argument is opc.
+class V6_vS32Ub_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0111>;
+class V6_vS32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b1000>;
+
+// TODO: Change script to generate src1, src2 and src3 instead of
+// dst, src1, src2.
+class Enc_COPROC_VMEM_vS32b_n_ew_pi<bits<1> opc> : OpcodeHexagon { // Shared encoding with a single opc bit and a 3-bit src3.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<9> src2; // Only src2{8-6} reaches the encoding.
+ bits<3> src2_vector; // Helper field holding the encoded part of src2.
+ bits<3> src3; // Encoded in Inst{2-0}.
+
+ let src2_vector = src2{8-6}; // Drops src2{5-0}.
+ let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} }; // Inst{22} = opc, Inst{21} = 1.
+ let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} }; // Inst{13-11} = 0; Inst{7-3} is the constant 0b00100.
+}
+
+class V6_vS32b_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<0>; // opc = 0.
+class V6_vS32b_nt_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<1>; // opc = 1 (the nt variant).
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<bits<1> opc> : OpcodeHexagon { // Same layout as Enc_COPROC_VMEM_vS32b_n_ew_pi with a 10-bit src2.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<10> src2; // Only src2{9-7} reaches the encoding.
+ bits<3> src2_vector; // Helper field holding the encoded part of src2.
+ bits<3> src3; // Encoded in Inst{2-0}.
+
+ let src2_vector = src2{9-7}; // Drops src2{6-0}.
+ let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} }; // Inst{22} = opc, Inst{21} = 1.
+ let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} }; // Inst{13-11} = 0; Inst{7-3} is the constant 0b00100.
+}
+
+class V6_vS32b_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<0>; // opc = 0.
+class V6_vS32b_nt_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<1>; // opc = 1 (the nt variant).
+
+// TODO: Change script to generate src1, src2,src3 and src4 instead of
+// dst, src1, src2, src3.
+class Enc_COPROC_VMEM_vS32_b_pred_pi<bits<5> opc> : OpcodeHexagon { // Shared encoding with a 2-bit src1 in Inst{12-11}.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<9> src3; // Only src3{8-6} reaches the encoding.
+ bits<3> src3_vector; // Helper field holding the encoded part of src3.
+ bits<5> src4; // Encoded in Inst{4-0}.
+
+ let src3_vector = src3{8-6}; // Drops src3{5-0}.
+ let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} }; // opc{4-3} -> Inst{22-21}.
+ let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; // Inst{13} = 0; src3_vector -> Inst{10-8}; opc{2-0} -> Inst{7-5}.
+}
+
+class V6_vS32b_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00000>; // One class per instruction; the argument is opc.
+class V6_vS32b_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00001>;
+class V6_vS32b_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01000>;
+class V6_vS32b_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01001>;
+class V6_vS32Ub_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01110>;
+class V6_vS32Ub_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01111>;
+class V6_vS32b_nt_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10000>; // The nt_* entries repeat the first four opcs with opc{4} set.
+class V6_vS32b_nt_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10001>;
+class V6_vS32b_nt_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11000>;
+class V6_vS32b_nt_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11001>;
+
+// TODO: Change script to generate src1, src2,src3 and src4 instead of
+// dst, src1, src2, src3.
+class Enc_COPROC_VMEM_vS32_b_pred_pi_128B<bits<5> opc> : OpcodeHexagon { // Same layout as Enc_COPROC_VMEM_vS32_b_pred_pi with a 10-bit src3.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<10> src3; // Only src3{9-7} reaches the encoding.
+ bits<3> src3_vector; // Helper field holding the encoded part of src3.
+ bits<5> src4; // Encoded in Inst{4-0}.
+
+ let src3_vector = src3{9-7}; // Drops src3{6-0}.
+ let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} }; // opc{4-3} -> Inst{22-21}.
+ let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; // Inst{13} = 0; src3_vector -> Inst{10-8}; opc{2-0} -> Inst{7-5}.
+}
+
+class V6_vS32b_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00000>; // One class per instruction; the argument is opc.
+class V6_vS32b_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00001>;
+class V6_vS32b_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01000>;
+class V6_vS32b_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01001>;
+class V6_vS32Ub_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01110>;
+class V6_vS32Ub_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01111>;
+class V6_vS32b_nt_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10000>; // The nt_* entries repeat the first four opcs with opc{4} set.
+class V6_vS32b_nt_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10001>;
+class V6_vS32b_nt_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11000>;
+class V6_vS32b_nt_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11001>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<bits<4> opc> : OpcodeHexagon { // Shared encoding with a 2-bit src1 and a 3-bit src4.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<9> src3; // Only src3{8-6} reaches the encoding.
+ bits<3> src3_vector; // Helper field holding the encoded part of src3.
+ bits<3> src4; // Encoded in Inst{2-0}.
+
+ let src3_vector = src3{8-6}; // Drops src3{5-0}.
+ let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} }; // Inst{22} = opc{3}, Inst{21} = 1.
+ let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; // Inst{13} = 0; Inst{7-6} = 0b01; opc{2-0} -> Inst{5-3}.
+}
+
+class V6_vS32b_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0000>; // Four opc values are used: 0b0000, 0b0101, 0b1010, 0b1111.
+class V6_vS32b_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0101>;
+class V6_vS32b_nt_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1010>;
+class V6_vS32b_nt_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1111>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<bits<4> opc> : OpcodeHexagon { // Same layout as Enc_COPROC_VMEM_vS32b_n_ew_pred_pi with a 10-bit src3.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<10> src3; // Only src3{9-7} reaches the encoding.
+ bits<3> src3_vector; // Helper field holding the encoded part of src3.
+ bits<3> src4; // Encoded in Inst{2-0}.
+
+ let src3_vector = src3{9-7}; // Drops src3{6-0}.
+ let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} }; // Inst{22} = opc{3}, Inst{21} = 1.
+ let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; // Inst{13} = 0; Inst{7-6} = 0b01; opc{2-0} -> Inst{5-3}.
+}
+
+class V6_vS32b_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0000>; // Four opc values are used: 0b0000, 0b0101, 0b1010, 0b1111.
+class V6_vS32b_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0101>;
+class V6_vS32b_nt_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1010>;
+class V6_vS32b_nt_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1111>;
+
+class Enc_LD_load_m<bits<13> opc> : OpcodeHexagon { // Shared encoding whose 13-bit opc is scattered over both halves of Inst.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<1> src2; // Encoded in Inst{13}.
+
+ let Inst{31-16} = { opc{12}, 0, opc{11-10}, 1, opc{9-4}, src1{4-0} }; // opc{12} -> Inst{31}, opc{11-10} -> Inst{29-28}, opc{9-4} -> Inst{26-21}; Inst{30} = 0, Inst{27} = 1.
+ let Inst{13-0} = { src2{0}, 0b000, opc{3}, 0, opc{2-0}, dst{4-0} }; // opc{3} -> Inst{9}, opc{2-0} -> Inst{7-5}; Inst{12-10} and Inst{8} are 0.
+}
+
+class V6_vL32b_ppu_enc : Enc_LD_load_m<0b0100110000000>; // One class per instruction; the argument is the 13-bit opc.
+class V6_vL32b_cur_ppu_enc : Enc_LD_load_m<0b0100110000001>;
+class V6_vL32b_tmp_ppu_enc : Enc_LD_load_m<0b0100110000010>;
+class V6_vL32Ub_ppu_enc : Enc_LD_load_m<0b0100110000111>;
+class V6_vL32b_nt_ppu_enc : Enc_LD_load_m<0b0100110100000>; // The nt_* entries differ from the plain ones only in opc{5}.
+class V6_vL32b_nt_cur_ppu_enc : Enc_LD_load_m<0b0100110100001>;
+class V6_vL32b_nt_tmp_ppu_enc : Enc_LD_load_m<0b0100110100010>;
+
+class Enc_COPROC_VMEM_vS32_b_ppu<bits<4> opc> : OpcodeHexagon { // Shared encoding with a 1-bit src2 in Inst{13}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<1> src2; // Encoded in Inst{13}.
+ bits<5> src3; // Encoded in Inst{4-0}.
+
+ let Inst{31-16} = { 0b001010110, opc{3}, 1, src1{4-0} }; // Inst{22} = opc{3}, Inst{21} = 1.
+ let Inst{13-0} = { src2{0}, 0b00000, opc{2-0}, src3{4-0} }; // Inst{12-8} = 0; opc{2-0} -> Inst{7-5}.
+}
+
+class V6_vS32b_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0000>; // One class per instruction; the argument is opc.
+class V6_vS32Ub_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0111>;
+class V6_vS32b_nt_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b1000>;
+
+class Enc_COPROC_VMEM_vS32b_new_ppu<bits<1> opc> : OpcodeHexagon { // Shared encoding with a single opc bit and a 3-bit src3.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<1> src2; // Encoded in Inst{13}.
+ bits<3> src3; // Encoded in Inst{2-0}.
+
+ let Inst{31-16} = { 0b001010110, opc{0}, 1, src1{4-0} }; // Inst{22} = opc, Inst{21} = 1.
+ let Inst{13-0} = { src2{0}, 0b0000000100, src3{2-0} }; // Inst{12-3} is the constant 0b0000000100.
+}
+
+class V6_vS32b_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<0>; // opc = 0.
+class V6_vS32b_nt_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<1>; // opc = 1 (the nt variant).
+
+class Enc_COPROC_VMEM_vS32_b_pred_ppu<bits<5> opc> : OpcodeHexagon { // Shared encoding with a 2-bit src1 and a 1-bit src3.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<1> src3; // Encoded in Inst{13}.
+ bits<5> src4; // Encoded in Inst{4-0}.
+
+ let Inst{31-16} = { 0b001010111, opc{4-3}, src2{4-0} }; // opc{4-3} -> Inst{22-21}.
+ let Inst{13-0} = { src3{0}, src1{1-0}, 0b000, opc{2-0}, src4{4-0} }; // Inst{10-8} = 0; opc{2-0} -> Inst{7-5}.
+}
+
+class V6_vS32b_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00000>; // One class per instruction; the argument is opc.
+class V6_vS32b_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00001>;
+class V6_vS32b_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01000>;
+class V6_vS32b_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01001>;
+class V6_vS32Ub_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01110>;
+class V6_vS32Ub_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01111>;
+class V6_vS32b_nt_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10000>; // The nt_* entries repeat the first four opcs with opc{4} set.
+class V6_vS32b_nt_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10001>;
+class V6_vS32b_nt_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11000>;
+class V6_vS32b_nt_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11001>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<bits<4> opc> : OpcodeHexagon { // Shared encoding with a 2-bit src1, 1-bit src3 and 3-bit src4.
+ bits<2> src1; // Encoded in Inst{12-11}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<1> src3; // Encoded in Inst{13}.
+ bits<3> src4; // Encoded in Inst{2-0}.
+
+ let Inst{31-16} = { 0b001010111, opc{3}, 1, src2{4-0} }; // Inst{22} = opc{3}, Inst{21} = 1.
+ let Inst{13-0} = { src3{0}, src1{1-0}, 0b00001, opc{2-0}, src4{2-0} }; // Inst{10-6} = 0b00001; opc{2-0} -> Inst{5-3}.
+}
+
+class V6_vS32b_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0000>; // Four opc values are used: 0b0000, 0b0101, 0b1010, 0b1111.
+class V6_vS32b_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0101>;
+class V6_vS32b_nt_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1010>;
+class V6_vS32b_nt_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1111>;
+
+
+class Enc_COPROC_VX_4op_i<bits<5> opc> : OpcodeHexagon { // Shared encoding: dst, two 5-bit sources and a 1-bit src3.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{12-8}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<1> src3; // Encoded in Inst{5}.
+
+ let Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} }; // opc{4-2} -> Inst{23-21}.
+ let Inst{13-0} = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} }; // opc{1} -> Inst{13}, Inst{7} = 1, opc{0} -> Inst{6}.
+}
+
+class V6_vrmpybusi_enc : Enc_COPROC_VX_4op_i<0b01000>; // One class per instruction; the argument is opc.
+class V6_vrsadubi_enc : Enc_COPROC_VX_4op_i<0b01001>;
+class V6_vrmpybusi_acc_enc : Enc_COPROC_VX_4op_i<0b01010>;
+class V6_vrsadubi_acc_enc : Enc_COPROC_VX_4op_i<0b01011>;
+class V6_vrmpyubi_acc_enc : Enc_COPROC_VX_4op_i<0b01111>;
+class V6_vrmpyubi_enc : Enc_COPROC_VX_4op_i<0b10101>;
+
+class Enc_COPROC_VX_vandqrt<bits<5> opc> : OpcodeHexagon { // Shared encoding: dst, a 2-bit src1 and a 5-bit src2.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<2> src1; // Encoded in Inst{9-8}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+
+ let Inst{31-16} = { 0b00011001, opc{4-3}, 1, src2{4-0} }; // opc{4-3} -> Inst{23-22}, Inst{21} = 1.
+ let Inst{13-0} = { opc{2}, 0b000, src1{1-0}, opc{1-0}, 1, dst{4-0} }; // opc{2} -> Inst{13}, opc{1-0} -> Inst{7-6}, Inst{5} = 1.
+}
+
+class V6_vandqrt_acc_enc : Enc_COPROC_VX_vandqrt<0b01101>; // opc for the _acc form.
+class V6_vandqrt_enc : Enc_COPROC_VX_vandqrt<0b10010>; // opc for the plain form.
+
+class Enc_COPROC_VX_cards<bits<2> opc> : OpcodeHexagon { // Shared encoding: three 5-bit sources and no dst field.
+ bits<5> src1; // Encoded in Inst{12-8}.
+ bits<5> src2; // Encoded in Inst{4-0}.
+ bits<5> src3; // Encoded in Inst{20-16}.
+
+ let Inst{31-16} = { 0b00011001111, src3{4-0} }; // Inst{31-21} is a fixed prefix.
+ let Inst{13-0} = { 1, src1{4-0}, 0, opc{1-0}, src2{4-0} }; // Inst{13} = 1, Inst{7} = 0, opc -> Inst{6-5}.
+}
+
+class V6_vshuff_enc : Enc_COPROC_VX_cards<0b01>; // opc = 0b01.
+class V6_vdeal_enc : Enc_COPROC_VX_cards<0b10>; // opc = 0b10.
+
+
+class Enc_COPROC_VX_v_cmov<bits<1> opc> : OpcodeHexagon { // Shared encoding: a 2-bit src1, dst and one 5-bit source.
+ bits<2> src1; // Encoded in Inst{6-5}.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src2; // Encoded in Inst{12-8}.
+
+ let Inst{31-16} = { 0b0001101000, opc{0}, 0b00000 }; // opc -> Inst{21}; Inst{20-16} is fixed at 0.
+ let Inst{13-0} = { 0, src2{4-0}, 0, src1{1-0}, dst{4-0} }; // Inst{13} and Inst{7} are 0.
+}
+
+class V6_vcmov_enc : Enc_COPROC_VX_v_cmov<0>; // opc = 0.
+class V6_vncmov_enc : Enc_COPROC_VX_v_cmov<1>; // opc = 1.
+
+class Enc_X_p3op<bits<8> opc> : OpcodeHexagon { // Shared encoding: a 2-bit src1, dst and two 5-bit sources.
+ bits<2> src1; // Encoded in Inst{6-5}.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src2; // Encoded in Inst{12-8}.
+ bits<5> src3; // Encoded in Inst{20-16}.
+
+ let Inst{31-16} = { opc{7-5}, 0b1101, opc{4}, 0, opc{3-2}, src3{4-0} }; // opc{7-5} -> Inst{31-29}, opc{4} -> Inst{24}, opc{3-2} -> Inst{22-21}.
+ let Inst{13-0} = { opc{1}, src2{4-0}, opc{0}, src1{1-0}, dst{4-0} }; // opc{1} -> Inst{13}, opc{0} -> Inst{7}.
+}
+
+class V6_vnccombine_enc : Enc_X_p3op<0b00001000>; // The two opcs differ only in opc{2}.
+class V6_vccombine_enc : Enc_X_p3op<0b00001100>;
+
+class Enc_COPROC_VX_4op_r<bits<4> opc> : OpcodeHexagon { // Shared encoding: dst, two 5-bit sources and a 3-bit src3.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{12-8}.
+ bits<5> src2; // Encoded in Inst{23-19}.
+ bits<3> src3; // Encoded in Inst{18-16}.
+
+ let Inst{31-16} = { 0b00011011, src2{4-0}, src3{2-0} }; // No opc bits in the upper half.
+ let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} }; // opc{3} -> Inst{13}, opc{2-0} -> Inst{7-5}.
+}
+
+class V6_valignb_enc : Enc_COPROC_VX_4op_r<0b0000>; // One class per instruction; the argument is opc.
+class V6_vlalignb_enc : Enc_COPROC_VX_4op_r<0b0001>;
+class V6_vasrwh_enc : Enc_COPROC_VX_4op_r<0b0010>;
+class V6_vasrwhsat_enc : Enc_COPROC_VX_4op_r<0b0011>;
+class V6_vasrwhrndsat_enc : Enc_COPROC_VX_4op_r<0b0100>;
+class V6_vasrwuhsat_enc : Enc_COPROC_VX_4op_r<0b0101>;
+class V6_vasrhubsat_enc : Enc_COPROC_VX_4op_r<0b0110>;
+class V6_vasrhubrndsat_enc : Enc_COPROC_VX_4op_r<0b0111>;
+class V6_vasrhbrndsat_enc : Enc_COPROC_VX_4op_r<0b1000>;
+class V6_vlutvvb_enc : Enc_COPROC_VX_4op_r<0b1001>;
+class V6_vshuffvdd_enc : Enc_COPROC_VX_4op_r<0b1011>; // Note: opc 0b1010 is not used by any class in this list.
+class V6_vdealvdd_enc : Enc_COPROC_VX_4op_r<0b1100>;
+class V6_vlutvvb_oracc_enc : Enc_COPROC_VX_4op_r<0b1101>;
+class V6_vlutvwh_enc : Enc_COPROC_VX_4op_r<0b1110>;
+class V6_vlutvwh_oracc_enc : Enc_COPROC_VX_4op_r<0b1111>;
+
+class Enc_S_3op_valign_i<bits<9> opc> : OpcodeHexagon { // Shared encoding: dst, two 5-bit sources and a 3-bit src3.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{12-8}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+ bits<3> src3; // Encoded in Inst{7-5}.
+
+ let Inst{31-16} = { opc{8-7}, 0, opc{6-3}, 0b00, opc{2-1}, src2{4-0} }; // opc{8-7} -> Inst{31-30}, opc{6-3} -> Inst{28-25}, opc{2-1} -> Inst{22-21}.
+ let Inst{13-0} = { opc{0}, src1{4-0}, src3{2-0}, dst{4-0} }; // opc{0} -> Inst{13}.
+}
+
+class V6_vlutb_enc : Enc_S_3op_valign_i<0b001100000>; // One class per instruction; the argument is the 9-bit opc.
+class V6_vlutb_dv_enc : Enc_S_3op_valign_i<0b001100010>;
+class V6_vlutb_acc_enc : Enc_S_3op_valign_i<0b001100100>;
+class V6_vlutb_dv_acc_enc : Enc_S_3op_valign_i<0b001100110>;
+class V6_valignbi_enc : Enc_S_3op_valign_i<0b001111011>;
+class V6_vlalignbi_enc : Enc_S_3op_valign_i<0b001111111>;
+class S2_valignib_enc : Enc_S_3op_valign_i<0b110000000>; // The S2_* users set opc{8-7} = 0b11, the V6_* users 0b00.
+class S2_addasl_rrri_enc : Enc_S_3op_valign_i<0b110010000>;
+
+class Enc_COPROC_VX_3op_q<bits<3> opc> : OpcodeHexagon { // Shared encoding in which dst and both sources are 2-bit fields.
+ bits<2> dst; // Encoded in Inst{1-0}.
+ bits<2> src1; // Encoded in Inst{9-8}.
+ bits<2> src2; // Encoded in Inst{23-22}.
+
+ let Inst{31-16} = { 0b00011110, src2{1-0}, 0b000011 }; // Inst{21-16} is the constant 0b000011.
+ let Inst{13-0} = { 0b0000, src1{1-0}, 0b000, opc{2-0}, dst{1-0} }; // opc -> Inst{4-2}.
+}
+
+class V6_pred_and_enc : Enc_COPROC_VX_3op_q<0b000>; // opc 0b010 is not in this list; V6_pred_not_enc's fixed bits correspond to it.
+class V6_pred_or_enc : Enc_COPROC_VX_3op_q<0b001>;
+class V6_pred_xor_enc : Enc_COPROC_VX_3op_q<0b011>;
+class V6_pred_or_n_enc : Enc_COPROC_VX_3op_q<0b100>;
+class V6_pred_and_n_enc : Enc_COPROC_VX_3op_q<0b101>;
+
+class V6_pred_not_enc : OpcodeHexagon { // Stand-alone encoding with a 2-bit dst and a single 2-bit source.
+ bits<2> dst; // Encoded in Inst{1-0}.
+ bits<2> src1; // Encoded in Inst{9-8}.
+
+ let Inst{31-16} = { 0b0001111000000011 }; // Fully constant; equals the Enc_COPROC_VX_3op_q upper half with src2 = 0.
+ let Inst{13-0} = { 0b0000, src1{1-0}, 0b000010, dst{1-0} }; // Inst{7-2} = 0b000010, i.e. Inst{4-2} = 0b010.
+}
+
+class Enc_COPROC_VX_4op_q<bits<1> opc> : OpcodeHexagon { // Shared encoding: dst, a 2-bit src1 and two 5-bit sources.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<2> src1; // Encoded in Inst{6-5}.
+ bits<5> src2; // Encoded in Inst{12-8}.
+ bits<5> src3; // Encoded in Inst{20-16}.
+
+ let Inst{31-16} = { 0b000111101, opc{0}, 1, src3{4-0} }; // opc -> Inst{22}, Inst{21} = 1.
+ let Inst{13-0} = { 1, src2{4-0}, 0, src1{1-0}, dst{4-0} }; // Inst{13} = 1, Inst{7} = 0.
+}
+
+class V6_vswap_enc : Enc_COPROC_VX_4op_q<0>; // opc = 0.
+class V6_vmux_enc : Enc_COPROC_VX_4op_q<1>; // opc = 1.
+
+class Enc_X_2op<bits<16> opc> : OpcodeHexagon { // Shared encoding: dst plus one 5-bit source; most bits come from opc.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+
+ let Inst{31-16} = { opc{15-5}, src1{4-0} }; // opc{15-5} -> Inst{31-21}.
+ let Inst{13-0} = { opc{4-3}, 0b0000, opc{2-0}, dst{4-0} }; // opc{4-3} -> Inst{13-12}, opc{2-0} -> Inst{7-5}; Inst{11-8} = 0.
+}
+
+class V6_lvsplatw_enc : Enc_X_2op<0b0001100110100001>; // One class per instruction; the argument is the 16-bit opc.
+class V6_vinsertwr_enc : Enc_X_2op<0b0001100110110001>;
+class S6_vsplatrbp_enc : Enc_X_2op<0b1000010001000100>;
+
+
+class Enc_CR_2op_r<bits<12> opc> : OpcodeHexagon { // Shared encoding: a 2-bit dst and one 5-bit source.
+ bits<2> dst; // Encoded in Inst{1-0}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+
+ let Inst{31-16} = { opc{11}, 0, opc{10-7}, 0, opc{6-3}, src1{4-0} }; // opc{11} -> Inst{31}, opc{10-7} -> Inst{29-26}, opc{6-3} -> Inst{24-21}.
+ let Inst{13-0} = { opc{2}, 0b000000, opc{1}, 0b000, opc{0}, dst{1-0} }; // opc{2} -> Inst{13}, opc{1} -> Inst{6}, opc{0} -> Inst{2}.
+}
+
+class V6_pred_scalar2_enc : Enc_CR_2op_r<0b001101101011>; // One class per instruction; the argument is the 12-bit opc.
+class Y5_l2locka_enc : Enc_CR_2op_r<0b110000111100>;
+
+class Enc_S_3op_i6<bits<9> opc> : OpcodeHexagon { // Shared encoding: dst, one 5-bit source and a 6-bit src2.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<6> src2; // Encoded in Inst{13-8}.
+
+ let Inst{31-16} = { 0b1000, opc{8-6}, 0, opc{5-3}, src1{4-0} }; // opc{8-6} -> Inst{27-25}, opc{5-3} -> Inst{23-21}; Inst{24} = 0.
+ let Inst{13-0} = { src2{5-0}, opc{2-0}, dst{4-0} }; // opc{2-0} -> Inst{7-5}.
+}
+
+class S6_rol_i_p_enc : Enc_S_3op_i6<0b000000011>; // One class per instruction; the argument is the 9-bit opc.
+class S6_rol_i_p_nac_enc : Enc_S_3op_i6<0b001000011>;
+class S6_rol_i_p_acc_enc : Enc_S_3op_i6<0b001000111>;
+class S6_rol_i_p_and_enc : Enc_S_3op_i6<0b001010011>;
+class S6_rol_i_p_or_enc : Enc_S_3op_i6<0b001010111>;
+class S6_rol_i_p_xacc_enc : Enc_S_3op_i6<0b001100011>;
+
+class Enc_X_3op_r<bits<15> opc> : OpcodeHexagon { // Shared encoding: dst plus two 5-bit sources.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<5> src2; // Encoded in Inst{12-8}.
+
+ let Inst{31-16} = { opc{14-4}, src1{4-0} }; // opc{14-4} -> Inst{31-21}.
+ let Inst{13-0} = { opc{3}, src2{4-0}, opc{2-0}, dst{4-0} }; // opc{3} -> Inst{13}, opc{2-0} -> Inst{7-5}.
+}
+
+class S6_rol_i_r_enc : Enc_X_3op_r<0b100011000000011>; // One class per instruction; the argument is the 15-bit opc.
+class S6_rol_i_r_nac_enc : Enc_X_3op_r<0b100011100000011>;
+class S6_rol_i_r_acc_enc : Enc_X_3op_r<0b100011100000111>;
+class S6_rol_i_r_and_enc : Enc_X_3op_r<0b100011100100011>;
+class S6_rol_i_r_or_enc : Enc_X_3op_r<0b100011100100111>;
+class S6_rol_i_r_xacc_enc : Enc_X_3op_r<0b100011101000011>;
+class S6_vtrunehb_ppp_enc : Enc_X_3op_r<0b110000011000011>;
+class S6_vtrunohb_ppp_enc : Enc_X_3op_r<0b110000011000101>;
+
+class Enc_no_operands<bits<25> opc> : OpcodeHexagon { // Encoding with no operand fields; every assigned bit comes from opc or a constant.
+
+ let Inst{31-16} = { opc{24-10}, 0 }; // opc{24-10} -> Inst{31-17}; Inst{16} = 0.
+ let Inst{13-0} = { opc{9-7}, 0b000, opc{6-0}, 0 }; // opc{9-7} -> Inst{13-11}, opc{6-0} -> Inst{7-1}; Inst{10-8} and Inst{0} are 0.
+}
+
+class Y5_l2gunlock_enc : Enc_no_operands<0b1010100000100000010000000>; // One class per instruction; the argument is the 25-bit opc.
+class Y5_l2gclean_enc : Enc_no_operands<0b1010100000100000100000000>;
+class Y5_l2gcleaninv_enc : Enc_no_operands<0b1010100000100000110000000>;
+class V6_vhist_enc : Enc_no_operands<0b0001111000000001001000000>;
+
+class Enc_J_jumpr<bits<13> opc> : OpcodeHexagon { // Shared encoding with a single 5-bit source and no dst.
+ bits<5> src1; // Encoded in Inst{20-16}.
+
+ let Inst{31-16} = { opc{12-6}, 0, opc{5-3}, src1{4-0} }; // opc{12-6} -> Inst{31-25}, opc{5-3} -> Inst{23-21}; Inst{24} = 0.
+ let Inst{13-0} = { 0b00, opc{2}, 0b0000, opc{1-0}, 0b00000 }; // opc{2} -> Inst{11}, opc{1-0} -> Inst{6-5}; all other low bits are 0.
+}
+
+class Y5_l2unlocka_enc : Enc_J_jumpr<0b1010011011000>; // One class per instruction; the argument is the 13-bit opc.
+class Y2_l2cleaninvidx_enc : Enc_J_jumpr<0b1010100011000>;
+
+class Enc_ST_l2gclean_pa<bits<2> opc> : OpcodeHexagon { // Shared encoding whose only operand sits in the low half.
+ bits<5> src1; // Encoded in Inst{12-8}.
+
+ let Inst{31-16} = { 0b101001101, opc{1-0}, 0b00000 }; // opc -> Inst{22-21}; Inst{20-16} is fixed at 0.
+ let Inst{13-0} = { 0, src1{4-0}, 0b00000000 }; // Inst{13} and Inst{7-0} are 0.
+}
+
+class Y6_l2gcleanpa_enc : Enc_ST_l2gclean_pa<0b01>; // opc = 0b01.
+class Y6_l2gcleaninvpa_enc : Enc_ST_l2gclean_pa<0b10>; // opc = 0b10.
+
+class A5_ACS_enc : OpcodeHexagon { // Stand-alone encoding with two destination fields and two 5-bit sources.
+ bits<5> dst1; // Encoded in Inst{4-0}.
+ bits<2> dst2; // Encoded in Inst{6-5}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<5> src2; // Encoded in Inst{12-8}.
+
+ let Inst{31-16} = { 0b11101010101, src1{4-0} }; // Inst{31-21} is a fixed prefix.
+ let Inst{13-0} = { 0, src2{4-0}, 0, dst2{1-0}, dst1{4-0} }; // Inst{13} and Inst{7} are 0.
+}
+
+class Enc_X_4op_r<bits<8> opc> : OpcodeHexagon { // Shared encoding: dst, two 5-bit sources and a 2-bit src3.
+ bits<5> dst; // Encoded in Inst{4-0}.
+ bits<5> src1; // Encoded in Inst{20-16}.
+ bits<5> src2; // Encoded in Inst{12-8}.
+ bits<2> src3; // Encoded in Inst{6-5}.
+
+ let Inst{31-16} = { 0b11, opc{7}, 0, opc{6-5}, 1, opc{4-1}, src1{4-0} }; // opc{7} -> Inst{29}, opc{6-5} -> Inst{27-26}, opc{4-1} -> Inst{24-21}; Inst{28} = 0, Inst{25} = 1.
+ let Inst{13-0} = { 0, src2{4-0}, opc{0}, src3{1-0}, dst{4-0} }; // Inst{13} = 0; opc{0} -> Inst{7}.
+}
+
+class S2_vsplicerb_enc : Enc_X_4op_r<0b00001000>; // One class per instruction; the argument is the 8-bit opc.
+class S2_cabacencbin_enc : Enc_X_4op_r<0b00001010>;
+class F2_sffma_sc_enc : Enc_X_4op_r<0b11110111>;
+
+class V6_vhistq_enc : OpcodeHexagon { // Stand-alone encoding whose only operand is a 2-bit src1.
+ bits<2> src1; // Encoded in Inst{23-22}.
+
+ let Inst{31-16} = { 0b00011110, src1{1-0}, 0b000010 }; // Inst{21-16} is the constant 0b000010.
+ let Inst{13-0} = { 0b10000010000000 }; // The low half is entirely constant.
+}
+
+// TODO: Change script to generate dst1 instead of dst.
+class A6_vminub_RdP_enc : OpcodeHexagon { // Stand-alone encoding with two destination fields and two 5-bit sources.
+ bits<5> dst1; // Encoded in Inst{4-0}.
+ bits<2> dst2; // Encoded in Inst{6-5}.
+ bits<5> src1; // Encoded in Inst{12-8}.
+ bits<5> src2; // Encoded in Inst{20-16}.
+
+ let Inst{31-16} = { 0b11101010111, src2{4-0} }; // Note: src2 goes in the upper half here, whereas A5_ACS_enc puts src1 there.
+ let Inst{13-0} = { 0, src1{4-0}, 0, dst2{1-0}, dst1{4-0} }; // Inst{13} and Inst{7} are 0.
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
new file mode 100644
index 000000000000..fa3cccbd0879
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -0,0 +1,445 @@
+//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Hexagon Instruction Flags +
+//
+// *** Must match HexagonBaseInfo.h ***
+//===----------------------------------------------------------------------===//
+
+class IType<bits<5> t> { // Wraps a 5-bit instruction-type code; InstHexagon copies Value into TSFlags{4-0}.
+ bits<5> Value = t;
+}
+def TypePSEUDO : IType<0>;
+def TypeALU32 : IType<1>;
+def TypeCR : IType<2>;
+def TypeJR : IType<3>;
+def TypeJ : IType<4>;
+def TypeLD : IType<5>;
+def TypeST : IType<6>;
+def TypeSYSTEM : IType<7>;
+def TypeXTYPE : IType<8>; // Codes 9..30 are not defined in this list.
+def TypeENDLOOP: IType<31>; // Largest value that fits in the 5-bit field.
+
+// Maintain list of valid subtargets for each instruction.
+class SubTarget<bits<6> value> { // Wraps a 6-bit mask; InstHexagon copies Value into TSFlags{39-34}.
+ bits<6> Value = value;
+}
+
+def HasAnySubT : SubTarget<0x3f>; // 111111
+def HasV5SubT : SubTarget<0x3e>; // 111110
+def HasV55SubT : SubTarget<0x3c>; // 111100
+def HasV60SubT : SubTarget<0x38>; // 111000 - each later entry clears one more low bit.
+
+// Addressing modes for load/store instructions
+class AddrModeType<bits<3> value> { // Wraps a 3-bit addressing-mode code; InstHexagon copies Value into TSFlags{42-40}.
+ bits<3> Value = value;
+}
+
+def NoAddrMode : AddrModeType<0>; // No addressing mode
+def Absolute : AddrModeType<1>; // Absolute addressing mode
+def AbsoluteSet : AddrModeType<2>; // Absolute set addressing mode
+def BaseImmOffset : AddrModeType<3>; // Indirect with offset
+def BaseLongOffset : AddrModeType<4>; // Indirect with long offset
+def BaseRegOffset : AddrModeType<5>; // Indirect with register offset
+def PostInc : AddrModeType<6>; // Post increment addressing mode (code 7 is not defined here)
+
+class MemAccessSize<bits<4> value> { // Wraps a 4-bit access-size code; InstHexagon copies Value into TSFlags{46-43}.
+ bits<4> Value = value;
+}
+
+def NoMemAccess : MemAccessSize<0>;// Not a memory access instruction.
+def ByteAccess : MemAccessSize<1>;// Byte access instruction (memb).
+def HalfWordAccess : MemAccessSize<2>;// Half word access instruction (memh).
+def WordAccess : MemAccessSize<3>;// Word access instruction (memw).
+def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd).
+def Vector64Access : MemAccessSize<7>;// Vector access instruction (memv); codes 5 and 6 are not defined here.
+def Vector128Access : MemAccessSize<8>;// Vector access instruction (memv); presumably the 128-byte counterpart of Vector64Access - TODO confirm.
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Declaration +
+//===----------------------------------------------------------------------===//
+
+class OpcodeHexagon { // Base record that carries the 32-bit instruction encoding.
+ field bits<32> Inst = ?; // Default to an invalid insn.
+ bits<4> IClass = 0; // ICLASS
+
+ let Inst{31-28} = IClass; // The top four encoding bits default to IClass.
+
+ bits<1> zero = 0; // Constant zero bit; not referenced inside this class.
+}
+
+class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
+ string cstr, InstrItinClass itin, IType type>
+ : Instruction { // Common base for Hexagon instructions; packs per-instruction properties into TSFlags.
+ let Namespace = "Hexagon";
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Constraints = cstr;
+ let Itinerary = itin;
+ let Size = 4; // Size of 4 matches the 32-bit Inst field declared in OpcodeHexagon.
+
+ // SoftFail is a field the disassembler can use to provide a way for
+ // instructions to not match without killing the whole decode process. It is
+ // mainly used for ARM, but Tablegen expects this field to exist or it fails
+ // to build the decode table.
+ field bits<32> SoftFail = 0;
+
+ // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+
+ // Instruction type according to the ISA.
+ IType Type = type;
+ let TSFlags{4-0} = Type.Value;
+
+ // Solo instructions, i.e., those that cannot be in a packet with others.
+ bits<1> isSolo = 0;
+ let TSFlags{5} = isSolo;
+ // Packed only with A or X-type instructions.
+ bits<1> isSoloAX = 0;
+ let TSFlags{6} = isSoloAX;
+ // Only A-type instruction in first slot or nothing.
+ bits<1> isSoloAin1 = 0;
+ let TSFlags{7} = isSoloAin1;
+
+ // Predicated instructions.
+ bits<1> isPredicated = 0; // Also drives PredSense below.
+ let TSFlags{8} = isPredicated;
+ bits<1> isPredicatedFalse = 0; // Selects "false" vs "true" in PredSense when isPredicated is set.
+ let TSFlags{9} = isPredicatedFalse;
+ bits<1> isPredicatedNew = 0; // Also drives PNewValue below.
+ let TSFlags{10} = isPredicatedNew;
+ bits<1> isPredicateLate = 0;
+ let TSFlags{11} = isPredicateLate; // Late predicate producer insn.
+
+ // New-value insn helper fields.
+ bits<1> isNewValue = 0;
+ let TSFlags{12} = isNewValue; // New-value consumer insn.
+ bits<1> hasNewValue = 0;
+ let TSFlags{13} = hasNewValue; // New-value producer insn.
+ bits<3> opNewValue = 0;
+ let TSFlags{16-14} = opNewValue; // New-value produced operand.
+ bits<1> isNVStorable = 0;
+ let TSFlags{17} = isNVStorable; // Store that can become new-value store.
+ bits<1> isNVStore = 0; // Also drives NValueST below.
+ let TSFlags{18} = isNVStore; // New-value store insn.
+ bits<1> isCVLoadable = 0;
+ let TSFlags{19} = isCVLoadable; // Load that can become cur-value load.
+ bits<1> isCVLoad = 0;
+ let TSFlags{20} = isCVLoad; // Cur-value load insn.
+
+ // Immediate extender helper fields.
+ bits<1> isExtendable = 0;
+ let TSFlags{21} = isExtendable; // Insn may be extended.
+ bits<1> isExtended = 0;
+ let TSFlags{22} = isExtended; // Insn must be extended.
+ bits<3> opExtendable = 0;
+ let TSFlags{25-23} = opExtendable; // Which operand may be extended.
+ bits<1> isExtentSigned = 0;
+ let TSFlags{26} = isExtentSigned; // Signed or unsigned range.
+ bits<5> opExtentBits = 0;
+ let TSFlags{31-27} = opExtentBits; // Number of bits of range before extending.
+ bits<2> opExtentAlign = 0;
+ let TSFlags{33-32} = opExtentAlign; // Alignment exponent before extending.
+
+ // If an instruction is valid on a subtarget, set the corresponding
+ // bit from validSubTargets.
+ // By default, instruction is valid on all subtargets.
+ SubTarget validSubTargets = HasAnySubT;
+ let TSFlags{39-34} = validSubTargets.Value;
+
+ // Addressing mode for load/store instructions.
+ AddrModeType addrMode = NoAddrMode;
+ let TSFlags{42-40} = addrMode.Value;
+
+ // Memory access size for mem access instructions (load/store)
+ MemAccessSize accessSize = NoMemAccess;
+ let TSFlags{46-43} = accessSize.Value;
+
+ bits<1> isTaken = 0; // Also drives isBrTaken below.
+ let TSFlags {47} = isTaken; // Branch prediction.
+
+ bits<1> isFP = 0;
+ let TSFlags {48} = isFP; // Floating-point.
+
+ bits<1> hasNewValue2 = 0; // Note: TSFlags{49} is not assigned anywhere in this class.
+ let TSFlags{50} = hasNewValue2; // Second New-value producer insn.
+ bits<3> opNewValue2 = 0;
+ let TSFlags{53-51} = opNewValue2; // Second New-value produced operand.
+
+ bits<1> isAccumulator = 0;
+ let TSFlags{54} = isAccumulator;
+
+ bit cofMax1 = 0; // Note: TSFlags{59-55} are not assigned anywhere in this class.
+ let TSFlags{60} = cofMax1;
+
+ // Fields used for relation models.
+ bit isNonTemporal = 0; // Drives isNT below; not written to TSFlags in this class.
+ string isNT = ""; // set to "true" for non-temporal vector stores.
+ string BaseOpcode = "";
+ string CextOpcode = "";
+ string PredSense = "";
+ string PNewValue = "";
+ string NValueST = ""; // Set to "true" for new-value stores.
+ string InputType = ""; // Input is "imm" or "reg" type.
+ string isFloat = "false"; // Set to "true" for the floating-point load/store.
+ string isBrTaken = !if(isTaken, "true", "false"); // Set to "true"/"false" for jump instructions
+
+ let PredSense = !if(isPredicated, !if(isPredicatedFalse, "false", "true"),
+ ""); // Empty string when the instruction is not predicated.
+ let PNewValue = !if(isPredicatedNew, "new", "");
+ let NValueST = !if(isNVStore, "true", "false");
+ let isNT = !if(isNonTemporal, "true", "false"); // Overrides the "" default declared above.
+
+ // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Classes Definitions +
+//===----------------------------------------------------------------------===//
+
+// LD Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+let mayLoad = 1 in
+class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon; // Load base: TypeLD, default itinerary LD_tc_ld_SLOT01, carries an encoding.
+
+let mayLoad = 1 in
+class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "">
+ : LDInst<outs, ins, asmstr, pattern, cstr>; // LDInst without an itinerary parameter, so LDInst's default is always used.
+
+class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "">
+ : LDInst<outs, ins, asmstr, pattern, cstr>; // Forwards to LDInst unchanged; adds no fields of its own.
+
+// LD Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "">
+ : LDInst<outs, ins, asmstr, pattern, cstr>; // Forwards to LDInst with its default itinerary.
+
+let mayLoad = 1 in // Load class whose default itinerary is LD_tc_ld_SLOT0 (LDInst uses LD_tc_ld_SLOT01).
+class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
+
+let mayLoad = 1 in // Same as LD0Inst but doesn't derive from OpcodeHexagon.
+class LD1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>;
+
+// ST Instruction Class in V2/V3 can take SLOT0 only.
+// ST Instruction Class in V4 can take SLOT0 & SLOT1.
+// Definition of the instruction class CHANGED from V2/V3 to V4.
+let mayStore = 1 in // Sets mayStore on STInst; classes deriving from it inherit the value.
+class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon;
+
+class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "">
+ : STInst<outs, ins, asmstr, pattern, cstr>; // No itin parameter: always STInst's default.
+
+let mayStore = 1 in // NOTE(review): default itinerary below is ST_tc_ld_SLOT0 (ST1Inst uses ST_tc_st_SLOT0) - confirm.
+class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon;
+
+// Like ST0Inst but doesn't derive from OpcodeHexagon and defaults to ST_tc_st_SLOT0.
+let mayStore = 1 in
+class ST1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
+
+// ST Instruction Class in V2/V3 can take SLOT0 only.
+// ST Instruction Class in V4 can take SLOT0 & SLOT1.
+// Definition of the instruction class CHANGED from V2/V3 to V4.
+class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+ : STInst<outs, ins, asmstr, pattern, cstr, itin>; // Forwards every argument, including itin, to STInst.
+
+// SYSTEM Instruction Class in V4 can take SLOT0 only
+// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1.
+class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ST_tc_3stall_SLOT0>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeSYSTEM>,
+ OpcodeHexagon; // Typed TypeSYSTEM even though the default itinerary has an ST_ name.
+
+// ALU32 Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123> // Base of ALU32_rr/_ir/_ri/_ii.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU32>, OpcodeHexagon;
+
+// ALU64 Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
+class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+ OpcodeHexagon; // Typed TypeXTYPE; there is no separate ALU64 IType used here.
+
+class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+ : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>; // Forwards to ALU64Inst; same default itinerary.
+
+
+// M Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
+class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+ OpcodeHexagon; // Typed TypeXTYPE, as are ALU64Inst and SInst.
+
+// Same as MInst but doesn't derive from OpcodeHexagon.
+class MInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+
+// M Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
+class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = M_tc_2_SLOT23> // MInst itself defaults to M_tc_3x_SLOT23.
+ : MInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// S Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
+class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+ OpcodeHexagon; // Typed TypeXTYPE, as are ALU64Inst and MInst.
+
+class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; // Same as SInst but without OpcodeHexagon.
+
+// S Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
+class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = S_3op_tc_1_SLOT23> // SInst itself defaults to S_2op_tc_1_SLOT23.
+ : SInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// J Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class JInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = J_tc_2early_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon; // TypeJ base class.
+
+class JInst_CJUMP_UCJUMP<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT> // Only difference from JInst.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon;
+
+// JR Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = J_tc_2early_SLOT2>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJR>, OpcodeHexagon; // TypeJR, distinct from JInst's TypeJ.
+
+// CR Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CR_tc_2early_SLOT3>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCR>, OpcodeHexagon; // TypeCR base class.
+
+let isCodeGenOnly = 1, isPseudo = 1 in // Same flags as Pseudo/PseudoM, but typed TypeENDLOOP.
+class Endloop<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = J_tc_2early_SLOT0123>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeENDLOOP>,
+ OpcodeHexagon;
+
+let isCodeGenOnly = 1, isPseudo = 1 in // Pseudo has no itin parameter; the itinerary is fixed to PSEUDO.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "">
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDO, TypePSEUDO>,
+ OpcodeHexagon;
+
+let isCodeGenOnly = 1, isPseudo = 1 in // Same as Pseudo except the itinerary is fixed to PSEUDOM.
+class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr="">
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDOM, TypePSEUDO>,
+ OpcodeHexagon;
+
+//===----------------------------------------------------------------------===//
+// Instruction Classes Definitions -
+//===----------------------------------------------------------------------===//
+
+
+//
+// ALU32 patterns.
+//
+class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+ : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>; // Forwards to ALU32Inst; adds no fields of its own.
+
+class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+ : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>; // Forwards to ALU32Inst; adds no fields of its own.
+
+class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+ : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>; // Forwards to ALU32Inst; adds no fields of its own.
+
+class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+ : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>; // Forwards to ALU32Inst; adds no fields of its own.
+
+//
+// ALU64 patterns.
+//
+class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23> // ALU64Inst itself defaults to ALU64_tc_2_SLOT23.
+ : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23> // ALU64Inst itself defaults to ALU64_tc_2_SLOT23.
+ : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// Post increment ST Instruction.
+class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "">
+ : STInst<outs, ins, asmstr, pattern, cstr>; // No itin parameter: always STInst's default.
+
+// Post increment LD Instruction.
+class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "">
+ : LDInst<outs, ins, asmstr, pattern, cstr>; // No itin parameter: always LDInst's default.
+
+//===----------------------------------------------------------------------===//
+// V4 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormatsV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Instruction Format Definitions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V60 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormatsV60.td"
+
+//===----------------------------------------------------------------------===//
+// V60 Instruction Format Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
new file mode 100644
index 000000000000..493d04703da9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -0,0 +1,160 @@
+//=- HexagonInstrFormatsV4.td - Hexagon Instruction Formats -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V4 instruction classes in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//----------------------------------------------------------------------------//
+// Hexagon Instruction Flags
+//
+// *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeV4LDST : IType<9>;
+def TypeNV : IType<10>;
+def TypeDUPLEX : IType<11>;
+def TypeCOMPOUND : IType<12>;
+def TypePREFIX : IType<30>; // 13-28 are taken by the TypeCVI_* defs in HexagonInstrFormatsV60.td.
+
+// Duplex Instruction Class Declaration
+//===----------------------------------------------------------------------===//
+
+class OpcodeDuplex {
+ field bits<32> Inst = ?; // Default to an invalid insn.
+ bits<4> IClass = 0; // ICLASS, split across Inst{31-29} and Inst{13}.
+ bits<13> ISubHi = 0; // High sub-insn, placed in Inst{28-16}.
+ bits<13> ISubLo = 0; // Low sub-insn, placed in Inst{12-0}.
+
+ let Inst{31-29} = IClass{3-1};
+ let Inst{13} = IClass{0};
+ let Inst{15-14} = 0;
+ let Inst{28-16} = ISubHi;
+ let Inst{12-0} = ISubLo;
+}
+
+class InstDuplex<bits<4> iClass, list<dag> pattern = [],
+ string cstr = ""> // NOTE(review): 'pattern' is accepted but never referenced in this body.
+ : Instruction, OpcodeDuplex {
+ let Namespace = "Hexagon";
+ IType Type = TypeDUPLEX; // uses slot 0,1
+ let isCodeGenOnly = 1;
+ let hasSideEffects = 0;
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins);
+ let IClass = iClass;
+ let Constraints = cstr;
+ let Itinerary = DUPLEX;
+ let Size = 4;
+
+ // SoftFail is a field the disassembler can use to provide a way for
+ // instructions to not match without killing the whole decode process. It is
+ // mainly used for ARM, but Tablegen expects this field to exist or it fails
+ // to build the decode table.
+ field bits<32> SoftFail = 0;
+
+ // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+
+ let TSFlags{4-0} = Type.Value;
+
+ // Predicated instructions. (TSFlags bit 5 is not assigned in this class.)
+ bits<1> isPredicated = 0;
+ let TSFlags{6} = isPredicated;
+ bits<1> isPredicatedFalse = 0;
+ let TSFlags{7} = isPredicatedFalse;
+ bits<1> isPredicatedNew = 0;
+ let TSFlags{8} = isPredicatedNew;
+
+ // New-value insn helper fields.
+ bits<1> isNewValue = 0;
+ let TSFlags{9} = isNewValue; // New-value consumer insn.
+ bits<1> hasNewValue = 0;
+ let TSFlags{10} = hasNewValue; // New-value producer insn.
+ bits<3> opNewValue = 0;
+ let TSFlags{13-11} = opNewValue; // New-value produced operand.
+ bits<1> isNVStorable = 0;
+ let TSFlags{14} = isNVStorable; // Store that can become new-value store.
+ bits<1> isNVStore = 0;
+ let TSFlags{15} = isNVStore; // New-value store insn.
+
+ // Immediate extender helper fields.
+ bits<1> isExtendable = 0;
+ let TSFlags{16} = isExtendable; // Insn may be extended.
+ bits<1> isExtended = 0;
+ let TSFlags{17} = isExtended; // Insn must be extended.
+ bits<3> opExtendable = 0;
+ let TSFlags{20-18} = opExtendable; // Which operand may be extended.
+ bits<1> isExtentSigned = 0;
+ let TSFlags{21} = isExtentSigned; // Signed or unsigned range.
+ bits<5> opExtentBits = 0;
+ let TSFlags{26-22} = opExtentBits; // Number of bits of range before extending.
+ bits<2> opExtentAlign = 0;
+ let TSFlags{28-27} = opExtentAlign; // Alignment exponent before extending.
+}
+
+//----------------------------------------------------------------------------//
+// Instruction Classes Definitions
+//----------------------------------------------------------------------------//
+
+//
+// NV type instructions.
+//
+class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNV>, OpcodeHexagon; // Base of the TypeNV classes below.
+
+class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
+ : NVInst<outs, ins, asmstr, pattern, cstr, itin>; // Forwards to NVInst; same default itinerary.
+
+// Definition of Post increment new value store.
+class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0> // NOTE(review): no mayStore here, unlike NVInstPI_V4 - confirm.
+ : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// Post increment ST Instruction derived from NVInst (TypeNV); sets mayStore.
+let mayStore = 1 in
+class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+ : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// New-value conditional branch.
+class NCJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "">
+ : NVInst<outs, ins, asmstr, pattern, cstr>; // No itin parameter: always NVInst's default (NCJ_tc_3or4stall_SLOT0).
+
+let mayLoad = 1, mayStore = 1 in // Sets both flags; the class is typed TypeV4LDST.
+class MEMInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeV4LDST>,
+ OpcodeHexagon;
+
+class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+ : MEMInst<outs, ins, asmstr, pattern, cstr, itin>; // Forwards to MEMInst; same default itinerary.
+
+class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
+ : InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
+ TypePREFIX>, OpcodeHexagon; // No cstr/itin parameters: constraint is "" and the itinerary is fixed.
+
+class SUBInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = ""> // NOTE(review): cstr is not forwarded; "" is passed to InstHexagon below - confirm intended.
+ : InstHexagon<outs, ins, asmstr, pattern, "", PREFIX, TypeDUPLEX>,
+ OpcodeHexagon;
+
+class CJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = ""> // Itinerary is fixed to COMPOUND_CJ_ARCHDEPSLOT (no itin parameter).
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>,
+ OpcodeHexagon;
+
+class CJInst_JMPSET<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = ""> // Same as CJInst except the itinerary is fixed to COMPOUND.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>,
+ OpcodeHexagon;
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
new file mode 100644
index 000000000000..b9f4373a0b79
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
@@ -0,0 +1,238 @@
+//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 instruction classes in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//----------------------------------------------------------------------------//
+// Hexagon Instruction Flags +
+//
+// *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeCVI_VA : IType<13>; // Numbering continues after TypeCOMPOUND (12) in HexagonInstrFormatsV4.td.
+def TypeCVI_VA_DV : IType<14>;
+def TypeCVI_VX : IType<15>;
+def TypeCVI_VX_DV : IType<16>;
+def TypeCVI_VP : IType<17>;
+def TypeCVI_VP_VS : IType<18>;
+def TypeCVI_VS : IType<19>;
+def TypeCVI_VINLANESAT : IType<20>;
+def TypeCVI_VM_LD : IType<21>;
+def TypeCVI_VM_TMP_LD : IType<22>;
+def TypeCVI_VM_CUR_LD : IType<23>;
+def TypeCVI_VM_VP_LDU : IType<24>;
+def TypeCVI_VM_ST : IType<25>;
+def TypeCVI_VM_NEW_ST : IType<26>;
+def TypeCVI_VM_STU : IType<27>;
+def TypeCVI_HIST : IType<28>;
+//----------------------------------------------------------------------------//
+// Instruction Classes Definitions +
+//----------------------------------------------------------------------------//
+
+let validSubTargets = HasV60SubT in // Every class in this group also carries Requires<[HasV60T, UseHVX]>.
+{
+class CVI_VA_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VA>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VA_DV_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VA_DV>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA_DV>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_LONG>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_late<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_LATE>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+ Requires<[HasV60T, UseHVX]>; // NOTE(review): only class in this group without OpcodeHexagon - confirm.
+
+class CVI_VX_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_DV>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Slot2_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_DV_SLOT2>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_DV_LONG>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VP_LONG>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_VS_Resource_early<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VP_VS_EARLY>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_VS_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VP_VS_LONG>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_VS_Resource_long_early<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VP_VS_LONG_EARLY>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VS_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VS>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VINLANESAT_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VINLANESAT>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VINLANESAT>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VS_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VS> // Same default itinerary and type as CVI_VS_Resource.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_LD_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_LD_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_LD> // Same default itinerary and type as CVI_VM_LD_Resource.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_TMP_LD_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_TMP_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_TMP_LD_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_TMP_LD> // Same default itinerary and type as CVI_VM_TMP_LD_Resource.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_CUR_LD_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_CUR_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_CUR_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_VP_LDU_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_VP_LDU>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_VP_LDU_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_VP_LDU> // Same default itinerary and type as CVI_VM_VP_LDU_Resource.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_ST_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_ST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_ST_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_ST> // Same default itinerary and type as CVI_VM_ST_Resource.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_NEW_ST_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_NEW_ST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_NEW_ST_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_NEW_ST> // Same default itinerary and type as CVI_VM_NEW_ST_Resource.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_STU_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_STU>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_STU_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_STU> // Same default itinerary and type as CVI_VM_STU_Resource.
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_HIST_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_HIST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+}
+
+let validSubTargets = HasV60SubT in // The "1" classes below do not derive from OpcodeHexagon.
+{
+class CVI_VA_Resource1<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VA>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
+ Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource1<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_DV>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+ Requires<[HasV60T, UseHVX]>;
+
+class CVI_HIST_Resource1<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_HIST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>,
+ Requires<[HasV60T, UseHVX]>;
+}
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
new file mode 100644
index 000000000000..34ce3e652995
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -0,0 +1,4283 @@
+//===-- HexagonInstrInfo.cpp - Hexagon Instruction Information ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonHazardRecognizer.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-instrinfo"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#include "HexagonGenInstrInfo.inc"
+#include "HexagonGenDFAPacketizer.inc"
+
+// Command-line options defined by this file. All of them are hidden boolean
+// flags; the default of each one is the value passed to cl::init.
+
+// NOTE(review): this is the only option here that is not declared static, so
+// it has external linkage -- presumably it is referenced from another
+// translation unit; confirm before changing it.
+cl::opt<bool> ScheduleInlineAsm("hexagon-sched-inline-asm", cl::Hidden,
+ cl::init(false), cl::desc("Do not consider inline-asm a scheduling/"
+ "packetization boundary."));
+
+// Defaults to enabled.
+static cl::opt<bool> EnableBranchPrediction("hexagon-enable-branch-prediction",
+ cl::Hidden, cl::init(true), cl::desc("Enable branch prediction"));
+
+// Defaults to off, i.e. the schedule adjustment is performed by default.
+static cl::opt<bool> DisableNVSchedule("disable-hexagon-nv-schedule",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable schedule adjustment for new value stores."));
+
+// Defaults to off.
+static cl::opt<bool> EnableTimingClassLatency(
+ "enable-timing-class-latency", cl::Hidden, cl::init(false),
+ cl::desc("Enable timing class latency"));
+
+// Defaults to enabled.
+static cl::opt<bool> EnableALUForwarding(
+ "enable-alu-forwarding", cl::Hidden, cl::init(true),
+ cl::desc("Enable vec alu forwarding"));
+
+// Defaults to enabled.
+static cl::opt<bool> EnableACCForwarding(
+ "enable-acc-forwarding", cl::Hidden, cl::init(true),
+ cl::desc("Enable vec acc forwarding"));
+
+// Defaults to enabled.
+static cl::opt<bool> BranchRelaxAsmLarge("branch-relax-asm-large",
+ cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("branch relax asm"));
+
+// Defaults to enabled.
+static cl::opt<bool> UseDFAHazardRec("dfa-hazard-rec",
+ cl::init(true), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Use the DFA based hazard recognizer."));
+
+///
+/// Constants for Hexagon instructions.
+///
+/// Each MIN/MAX pair gives the bounds of one immediate field. The *_OFFSET_*
+/// pairs are named after an access kind (MEMB/MEMH/MEMW/MEMD/MEMV) or ADDI,
+/// the *_AUTOINC_* pairs after the auto-increment forms, and the *_128B
+/// variants hold the values for the 128-byte vector mode (exactly twice the
+/// plain MEMV values). The code consuming them is outside this excerpt.
+///
+const int Hexagon_MEMV_OFFSET_MAX_128B = 896; // #s4: -8*128...7*128
+const int Hexagon_MEMV_OFFSET_MIN_128B = -1024; // #s4
+const int Hexagon_MEMV_OFFSET_MAX = 448; // #s4: -8*64...7*64
+const int Hexagon_MEMV_OFFSET_MIN = -512; // #s4
+const int Hexagon_MEMW_OFFSET_MAX = 4095;
+const int Hexagon_MEMW_OFFSET_MIN = -4096;
+const int Hexagon_MEMD_OFFSET_MAX = 8191;
+const int Hexagon_MEMD_OFFSET_MIN = -8192;
+const int Hexagon_MEMH_OFFSET_MAX = 2047;
+const int Hexagon_MEMH_OFFSET_MIN = -2048;
+const int Hexagon_MEMB_OFFSET_MAX = 1023;
+const int Hexagon_MEMB_OFFSET_MIN = -1024;
+const int Hexagon_ADDI_OFFSET_MAX = 32767;
+const int Hexagon_ADDI_OFFSET_MIN = -32768;
+const int Hexagon_MEMD_AUTOINC_MAX = 56;
+const int Hexagon_MEMD_AUTOINC_MIN = -64;
+const int Hexagon_MEMW_AUTOINC_MAX = 28;
+const int Hexagon_MEMW_AUTOINC_MIN = -32;
+const int Hexagon_MEMH_AUTOINC_MAX = 14;
+const int Hexagon_MEMH_AUTOINC_MIN = -16;
+const int Hexagon_MEMB_AUTOINC_MAX = 7;
+const int Hexagon_MEMB_AUTOINC_MIN = -8;
+const int Hexagon_MEMV_AUTOINC_MAX = 192; // #s3
+const int Hexagon_MEMV_AUTOINC_MIN = -256; // #s3
+const int Hexagon_MEMV_AUTOINC_MAX_128B = 384; // #s3
+const int Hexagon_MEMV_AUTOINC_MIN_128B = -512; // #s3
+
+// Pin the vtable to this file: anchor() is deliberately empty and exists only
+// so that the class has an out-of-line member defined in this file.
+void HexagonInstrInfo::anchor() {}
+
+/// Construct the instruction info, passing the ADJCALLSTACKDOWN and
+/// ADJCALLSTACKUP opcodes to the generated base class (HexagonGenInstrInfo)
+/// and default-constructing the register info member RI.
+/// Note that \p ST is not used by this constructor.
+HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
+ : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
+ RI() {}
+
+
+/// Return true when \p Reg falls into one of the two integer register ranges
+/// accepted here: R0..R7 or R16..R23 (both inclusive).
+static bool isIntRegForSubInst(unsigned Reg) {
+  if (Reg >= Hexagon::R0 && Reg <= Hexagon::R7)
+    return true;
+  return Reg >= Hexagon::R16 && Reg <= Hexagon::R23;
+}
+
+
+/// Return true when both halves of the register pair \p Reg (its isub_lo and
+/// isub_hi sub-registers, looked up through \p HRI) satisfy
+/// isIntRegForSubInst. The high half is only inspected if the low half passes.
+static bool isDblRegForSubInst(unsigned Reg, const HexagonRegisterInfo &HRI) {
+  const unsigned LoHalf = HRI.getSubReg(Reg, Hexagon::isub_lo);
+  if (!isIntRegForSubInst(LoHalf))
+    return false;
+  const unsigned HiHalf = HRI.getSubReg(Reg, Hexagon::isub_hi);
+  return isIntRegForSubInst(HiHalf);
+}
+
+
+/// Count the instructions in the half-open range [MIB, MIE) for which
+/// isDebugValue() is false.
+static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
+      MachineBasicBlock::const_instr_iterator MIE) {
+  unsigned NumReal = 0;
+  while (MIB != MIE) {
+    if (!MIB->isDebugValue())
+      ++NumReal;
+    ++MIB;
+  }
+  return NumReal;
+}
+
+
+/// Locate the hardware-loop set-up instruction that belongs to the loop ending
+/// in \p EndLoopOp. For ENDLOOP0 the set-up opcodes searched for are
+/// J2_loop0i/J2_loop0r; for any other value (expected: ENDLOOP1) they are
+/// J2_loop1i/J2_loop1r.
+///
+/// The search walks the predecessors of \p BB depth-first, scanning each
+/// predecessor's instructions from the bottom up. \p Visited records the
+/// blocks already entered so that every block is scanned at most once.
+/// Returns the set-up instruction, or nullptr if none is found or if another
+/// \p EndLoopOp instruction is met first.
+static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
+      SmallPtrSet<MachineBasicBlock *, 8> &Visited) {
+  const bool IsLoop0 = EndLoopOp == Hexagon::ENDLOOP0;
+  const int SetupImmOpc = IsLoop0 ? Hexagon::J2_loop0i : Hexagon::J2_loop1i;
+  const int SetupRegOpc = IsLoop0 ? Hexagon::J2_loop0r : Hexagon::J2_loop1r;
+
+  // The loop set-up instruction lives in some (transitive) predecessor.
+  MachineBasicBlock::pred_iterator PI = BB->pred_begin();
+  const MachineBasicBlock::pred_iterator PEnd = BB->pred_end();
+  for (; PI != PEnd; ++PI) {
+    MachineBasicBlock *Pred = *PI;
+    // Mark the predecessor as visited first; skip it if it had been seen
+    // before or if it is BB itself (a self-loop edge).
+    if (!Visited.insert(Pred).second || Pred == BB)
+      continue;
+
+    for (auto RI = Pred->instr_rbegin(), RE = Pred->instr_rend(); RI != RE;
+         ++RI) {
+      const int Opc = RI->getOpcode();
+      if (Opc == SetupImmOpc || Opc == SetupRegOpc)
+        return &*RI;
+      // Hitting an ENDLOOP of the same kind means we walked into a different
+      // loop: the set-up instruction we are after has been removed.
+      if (Opc == EndLoopOp)
+        return nullptr;
+    }
+
+    // Nothing in this block; continue the search through its predecessors.
+    if (MachineInstr *Found = findLoopInstr(Pred, EndLoopOp, Visited))
+      return Found;
+  }
+  return nullptr;
+}
+
+
+/// Collect the registers defined and used by \p MI.
+/// \p Defs and \p Uses are cleared first and then filled in operand order.
+/// Every register operand flagged as a def is recorded, so a possible
+/// (predicated) def is conservatively treated as one that really happens.
+/// Non-register operands and the null register (0) are ignored.
+static inline void parseOperands(const MachineInstr &MI,
+      SmallVector<unsigned, 4> &Defs, SmallVector<unsigned, 8> &Uses) {
+  Defs.clear();
+  Uses.clear();
+
+  const unsigned NumOps = MI.getNumOperands();
+  for (unsigned Idx = 0; Idx < NumOps; ++Idx) {
+    const MachineOperand &Op = MI.getOperand(Idx);
+    // getReg() is only queried once the operand is known to be a register.
+    if (!Op.isReg() || !Op.getReg())
+      continue;
+
+    const unsigned R = Op.getReg();
+    if (Op.isUse())
+      Uses.push_back(R);
+    if (Op.isDef())
+      Defs.push_back(R);
+  }
+}
+
+
+// Decide whether sub-instruction group Ga may be paired with group Gb.
+// The relation is position dependent (not symmetric), so callers have to
+// check both (Ga, Gb) and (Gb, Ga) if a swap is acceptable.
+// HSIG_None and any unlisted group never match.
+static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) {
+  const bool GbIsL1 = Gb == HexagonII::HSIG_L1;
+  const bool GbIsL2 = Gb == HexagonII::HSIG_L2;
+  const bool GbIsS1 = Gb == HexagonII::HSIG_S1;
+  const bool GbIsS2 = Gb == HexagonII::HSIG_S2;
+  const bool GbIsA = Gb == HexagonII::HSIG_A;
+
+  switch (Ga) {
+  case HexagonII::HSIG_L1:
+    return GbIsL1 || GbIsA;
+  case HexagonII::HSIG_L2:
+    return GbIsL1 || GbIsL2 || GbIsA;
+  case HexagonII::HSIG_S1:
+    return GbIsL1 || GbIsL2 || GbIsS1 || GbIsA;
+  case HexagonII::HSIG_S2:
+    return GbIsL1 || GbIsL2 || GbIsS1 || GbIsS2 || GbIsA;
+  case HexagonII::HSIG_A:
+    return GbIsA;
+  case HexagonII::HSIG_Compound:
+    return Gb == HexagonII::HSIG_Compound;
+  default:
+    break;
+  }
+  return false;
+}
+
+
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+///
+/// Only the opcodes listed below are recognized, and only when the address
+/// operand is a frame index whose immediate offset is exactly 0.
+/// \p FrameIndex is written only when a non-zero register is about to be
+/// returned; on every "return 0" path it is left untouched.
+unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                               int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+    default:
+      break;
+    case Hexagon::L2_loadri_io:
+    case Hexagon::L2_loadrd_io:
+    case Hexagon::V6_vL32b_ai:
+    case Hexagon::V6_vL32b_ai_128B:
+    case Hexagon::V6_vL32Ub_ai:
+    case Hexagon::V6_vL32Ub_ai_128B:
+    case Hexagon::LDriw_pred:
+    case Hexagon::LDriw_mod:
+    case Hexagon::PS_vloadrq_ai:
+    case Hexagon::PS_vloadrw_ai:
+    case Hexagon::PS_vloadrq_ai_128B:
+    case Hexagon::PS_vloadrw_ai_128B: {
+      // Operands as accessed here: 0 = destination, 1 = base, 2 = offset.
+      // Bind by reference: the operands are only inspected, never modified,
+      // so there is no reason to copy them.
+      const MachineOperand &OpFI = MI.getOperand(1);
+      if (!OpFI.isFI())
+        return 0;
+      const MachineOperand &OpOff = MI.getOperand(2);
+      if (!OpOff.isImm() || OpOff.getImm() != 0)
+        return 0;
+      FrameIndex = OpFI.getIndex();
+      return MI.getOperand(0).getReg();
+    }
+
+    case Hexagon::L2_ploadrit_io:
+    case Hexagon::L2_ploadrif_io:
+    case Hexagon::L2_ploadrdt_io:
+    case Hexagon::L2_ploadrdf_io: {
+      // Predicated forms: base and offset are shifted by one position
+      // (operand 1 is presumably the predicate register).
+      const MachineOperand &OpFI = MI.getOperand(2);
+      if (!OpFI.isFI())
+        return 0;
+      const MachineOperand &OpOff = MI.getOperand(3);
+      if (!OpOff.isImm() || OpOff.getImm() != 0)
+        return 0;
+      FrameIndex = OpFI.getIndex();
+      return MI.getOperand(0).getReg();
+    }
+  }
+
+  return 0;
+}
+
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the stored (source) register and set \p FrameIndex to the stack slot that
+/// is written. Otherwise return 0 and leave \p FrameIndex untouched. This
+/// predicate must return 0 if the instruction has any side effects other than
+/// storing to the stack slot.
+///
+/// A match requires one of the opcodes below, a frame-index base operand and
+/// an immediate offset of exactly 0.
+unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                              int &FrameIndex) const {
+  // Index of the base (frame index) operand. The offset follows it directly
+  // and the stored register comes right after the offset.
+  unsigned BaseIdx = 0;
+
+  switch (MI.getOpcode()) {
+  default:
+    return 0;
+
+  case Hexagon::S2_storerb_io:
+  case Hexagon::S2_storerh_io:
+  case Hexagon::S2_storeri_io:
+  case Hexagon::S2_storerd_io:
+  case Hexagon::V6_vS32b_ai:
+  case Hexagon::V6_vS32b_ai_128B:
+  case Hexagon::V6_vS32Ub_ai:
+  case Hexagon::V6_vS32Ub_ai_128B:
+  case Hexagon::STriw_pred:
+  case Hexagon::STriw_mod:
+  case Hexagon::PS_vstorerq_ai:
+  case Hexagon::PS_vstorerw_ai:
+  case Hexagon::PS_vstorerq_ai_128B:
+  case Hexagon::PS_vstorerw_ai_128B:
+    BaseIdx = 0;
+    break;
+
+  // Predicated stores: everything is shifted by one operand.
+  case Hexagon::S2_pstorerbt_io:
+  case Hexagon::S2_pstorerbf_io:
+  case Hexagon::S2_pstorerht_io:
+  case Hexagon::S2_pstorerhf_io:
+  case Hexagon::S2_pstorerit_io:
+  case Hexagon::S2_pstorerif_io:
+  case Hexagon::S2_pstorerdt_io:
+  case Hexagon::S2_pstorerdf_io:
+    BaseIdx = 1;
+    break;
+  }
+
+  const MachineOperand &Base = MI.getOperand(BaseIdx);
+  if (!Base.isFI())
+    return 0;
+
+  const MachineOperand &Offset = MI.getOperand(BaseIdx + 1);
+  const bool ZeroOffset = Offset.isImm() && Offset.getImm() == 0;
+  if (!ZeroOffset)
+    return 0;
+
+  FrameIndex = Base.getIndex();
+  return MI.getOperand(BaseIdx + 2).getReg();
+}
+
+
+/// This function can analyze one/two way branching only and should (mostly) be
+/// called by target independent side.
+/// First entry is always the opcode of the branching instruction, except when
+/// the Cond vector is supposed to be empty, e.g., when AnalyzeBranch fails, a
+/// BB with only unconditional jump. Subsequent entries depend upon the opcode,
+/// e.g. Jump_c p will have
+/// Cond[0] = Jump_c
+/// Cond[1] = p
+/// HW-loop ENDLOOP:
+/// Cond[0] = ENDLOOP
+/// Cond[1] = MBB
+/// New value jump:
+/// Cond[0] = Hexagon::CMPEQri_f_Jumpnv_t_V4 -- specific opcode
+/// Cond[1] = R
+/// Cond[2] = Imm
+///
+/// Returns false when the block's terminators were understood; TBB, FBB and
+/// Cond then describe them (all three stay empty for a pure fall-through).
+/// Returns true when the block cannot be analyzed.
+/// When \p AllowModify is set, a J2_jump to the layout successor and the
+/// second of two back-to-back J2_jumps are erased from the block.
+bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ TBB = nullptr;
+ FBB = nullptr;
+ Cond.clear();
+
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::instr_iterator I = MBB.instr_end();
+ if (I == MBB.instr_begin())
+ return false;
+
+ // A basic block may look like this:
+ //
+ // [ insn
+ // EH_LABEL
+ // insn
+ // insn
+ // insn
+ // EH_LABEL
+ // insn ]
+ //
+ // It has two succs but does not have a terminator
+ // Don't know how to handle it.
+ // The whole block is scanned, so an EH_LABEL anywhere in it gives up.
+ do {
+ --I;
+ if (I->isEHLabel())
+ // Don't analyze EH branches.
+ return true;
+ } while (I != MBB.instr_begin());
+
+ // Restart from the last instruction and skip trailing debug values.
+ I = MBB.instr_end();
+ --I;
+
+ while (I->isDebugValue()) {
+ if (I == MBB.instr_begin())
+ return false;
+ --I;
+ }
+
+ bool JumpToBlock = I->getOpcode() == Hexagon::J2_jump &&
+ I->getOperand(0).isMBB();
+ // Delete the J2_jump if it's equivalent to a fall-through.
+ if (AllowModify && JumpToBlock &&
+ MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ DEBUG(dbgs()<< "\nErasing the jump to successor block\n";);
+ I->eraseFromParent();
+ I = MBB.instr_end();
+ if (I == MBB.instr_begin())
+ return false;
+ --I;
+ }
+ // A block whose last instruction is not an unpredicated terminator is
+ // reported as a fall-through.
+ if (!isUnpredicatedTerminator(*I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = &*I;
+ MachineInstr *SecondLastInst = nullptr;
+ // Find one more terminator if present.
+ // This walks all the way back to the start of the block; bundle headers are
+ // not counted as terminators.
+ for (;;) {
+ if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(*I)) {
+ if (!SecondLastInst)
+ SecondLastInst = &*I;
+ else
+ // This is a third branch.
+ return true;
+ }
+ if (I == MBB.instr_begin())
+ break;
+ --I;
+ }
+
+ int LastOpcode = LastInst->getOpcode();
+ int SecLastOpcode = SecondLastInst ? SecondLastInst->getOpcode() : 0;
+ // If the branch target is not a basic block, it could be a tail call.
+ // (It is, if the target is a function.)
+ if (LastOpcode == Hexagon::J2_jump && !LastInst->getOperand(0).isMBB())
+ return true;
+ if (SecLastOpcode == Hexagon::J2_jump &&
+ !SecondLastInst->getOperand(0).isMBB())
+ return true;
+
+ bool LastOpcodeHasJMP_c = PredOpcodeHasJMP_c(LastOpcode);
+ bool LastOpcodeHasNVJump = isNewValueJump(*LastInst);
+
+ // A conditional jump whose target operand is not a block is not analyzable.
+ if (LastOpcodeHasJMP_c && !LastInst->getOperand(1).isMBB())
+ return true;
+
+ // If there is only one terminator instruction, process it.
+ if (LastInst && !SecondLastInst) {
+ // Unconditional jump: target only, Cond stays empty.
+ if (LastOpcode == Hexagon::J2_jump) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ // Hardware-loop end: Cond = { opcode, target block }.
+ if (isEndLoopN(LastOpcode)) {
+ TBB = LastInst->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+ Cond.push_back(LastInst->getOperand(0));
+ return false;
+ }
+ // Conditional jump: Cond = { opcode, operand 0 }, target is operand 1.
+ if (LastOpcodeHasJMP_c) {
+ TBB = LastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+ Cond.push_back(LastInst->getOperand(0));
+ return false;
+ }
+ // Only supporting rr/ri versions of new-value jumps.
+ // Cond = { opcode, operand 0, operand 1 }, target is operand 2.
+ if (LastOpcodeHasNVJump && (LastInst->getNumExplicitOperands() == 3)) {
+ TBB = LastInst->getOperand(2).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+ Cond.push_back(LastInst->getOperand(0));
+ Cond.push_back(LastInst->getOperand(1));
+ return false;
+ }
+ DEBUG(dbgs() << "\nCant analyze BB#" << MBB.getNumber()
+ << " with one jump\n";);
+ // Otherwise, don't know what this is.
+ return true;
+ }
+
+ // From here on there are exactly two terminators, so SecondLastInst is
+ // non-null.
+ bool SecLastOpcodeHasJMP_c = PredOpcodeHasJMP_c(SecLastOpcode);
+ bool SecLastOpcodeHasNVJump = isNewValueJump(*SecondLastInst);
+ // Conditional jump followed by an unconditional jump.
+ if (SecLastOpcodeHasJMP_c && (LastOpcode == Hexagon::J2_jump)) {
+ if (!SecondLastInst->getOperand(1).isMBB())
+ return true;
+ TBB = SecondLastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
+ Cond.push_back(SecondLastInst->getOperand(0));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // Only supporting rr/ri versions of new-value jumps.
+ if (SecLastOpcodeHasNVJump &&
+ (SecondLastInst->getNumExplicitOperands() == 3) &&
+ (LastOpcode == Hexagon::J2_jump)) {
+ TBB = SecondLastInst->getOperand(2).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
+ Cond.push_back(SecondLastInst->getOperand(0));
+ Cond.push_back(SecondLastInst->getOperand(1));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two Hexagon:JMPs, handle it. The second one is not
+ // executed, so remove it.
+ // (It is only actually erased when AllowModify is set.)
+ if (SecLastOpcode == Hexagon::J2_jump && LastOpcode == Hexagon::J2_jump) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst->getIterator();
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // If the block ends with an ENDLOOP, and J2_jump, handle it.
+ if (isEndLoopN(SecLastOpcode) && LastOpcode == Hexagon::J2_jump) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
+ Cond.push_back(SecondLastInst->getOperand(0));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ DEBUG(dbgs() << "\nCant analyze BB#" << MBB.getNumber()
+ << " with two jumps";);
+ // Otherwise, can't handle this.
+ return true;
+}
+
+
+/// Remove the branch instructions at the end of \p MBB and return how many
+/// were erased. Debug values are skipped (and kept); the walk stops at the
+/// first other instruction that is not a branch.
+/// \p BytesRemoved must be null: code size accounting is not implemented.
+unsigned HexagonInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                        int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  DEBUG(dbgs() << "\nRemoving branches out of BB#" << MBB.getNumber());
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    // Only removing branches from end of MBB.
+    if (!I->isBranch())
+      return Count;
+    if (Count && (I->getOpcode() == Hexagon::J2_jump))
+      llvm_unreachable("Malformed basic block: unconditional branch not last");
+    // Erase the branch that was just examined. Erasing MBB.back() instead
+    // would remove (and count) a trailing debug value whenever one follows
+    // the branch, and the branch itself would then be seen a second time.
+    MBB.erase(I);
+    // Restart from the end: the iterator is invalid after the erase.
+    I = MBB.end();
+    ++Count;
+  }
+  return Count;
+}
+
+/// Append branch code to the end of \p MBB and return the number of branch
+/// instructions that were added (1 or 2).
+///
+/// \param TBB  Taken target; must not be null.
+/// \param FBB  Target of the trailing unconditional J2_jump, or null if only
+///             one branch is wanted.
+/// \param Cond Condition in the layout produced by analyzeBranch: empty for
+///             an unconditional jump, otherwise Cond[0] is the branch opcode
+///             followed by its operands. When Cond[0] is an immediate it
+///             replaces the default conditional opcode (J2_jumpt).
+/// \param BytesAdded Must be null: code size accounting is not implemented.
+unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                        MachineBasicBlock *TBB,
+                                        MachineBasicBlock *FBB,
+                                        ArrayRef<MachineOperand> Cond,
+                                        const DebugLoc &DL,
+                                        int *BytesAdded) const {
+  unsigned BOpc = Hexagon::J2_jump;
+  unsigned BccOpc = Hexagon::J2_jumpt;
+  assert(validateBranchCond(Cond) && "Invalid branching condition");
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert(!BytesAdded && "code size not handled");
+
+  // Check if reverseBranchCondition has asked to reverse this branch
+  // If we want to reverse the branch an odd number of times, we want
+  // J2_jumpf.
+  if (!Cond.empty() && Cond[0].isImm())
+    BccOpc = Cond[0].getImm();
+
+  if (!FBB) {
+    if (Cond.empty()) {
+      // Due to a bug in TailMerging/CFG Optimization, we need to add a
+      // special case handling of a predicated jump followed by an
+      // unconditional jump. If not, Tail Merging and CFG Optimization go
+      // into an infinite loop.
+      MachineBasicBlock *NewTBB, *NewFBB;
+      // Condition of the branch already present in MBB. Deliberately given
+      // its own name so that it does not shadow the (empty) Cond parameter.
+      SmallVector<MachineOperand, 4> ExistingCond;
+      auto Term = MBB.getFirstTerminator();
+      if (Term != MBB.end() && isPredicated(*Term) &&
+          !analyzeBranch(MBB, NewTBB, NewFBB, ExistingCond, false) &&
+          MachineFunction::iterator(NewTBB) == ++MBB.getIterator()) {
+        // The existing predicated jump targets the layout successor:
+        // reverse it and retarget it to TBB instead of adding a second jump.
+        reverseBranchCondition(ExistingCond);
+        removeBranch(MBB);
+        return insertBranch(MBB, TBB, nullptr, ExistingCond, DL);
+      }
+      BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+    } else if (isEndLoopN(Cond[0].getImm())) {
+      int EndLoopOp = Cond[0].getImm();
+      assert(Cond[1].isMBB());
+      // Since we're adding an ENDLOOP, there better be a LOOP instruction.
+      // Check for it, and change the BB target if needed.
+      SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+      MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+      assert(Loop != nullptr && "Inserting an ENDLOOP without a LOOP");
+      Loop->getOperand(0).setMBB(TBB);
+      // Add the ENDLOOP after finding the LOOP instruction.
+      BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB);
+    } else if (isNewValueJump(Cond[0].getImm())) {
+      assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump");
+      // New value jump
+      // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset)
+      // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset)
+      unsigned Flags1 = getUndefRegState(Cond[1].isUndef());
+      DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber(););
+      if (Cond[2].isReg()) {
+        unsigned Flags2 = getUndefRegState(Cond[2].isUndef());
+        BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
+          addReg(Cond[2].getReg(), Flags2).addMBB(TBB);
+      } else if(Cond[2].isImm()) {
+        BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
+          addImm(Cond[2].getImm()).addMBB(TBB);
+      } else
+        llvm_unreachable("Invalid condition for branching");
+    } else {
+      assert((Cond.size() == 2) && "Malformed cond vector");
+      const MachineOperand &RO = Cond[1];
+      unsigned Flags = getUndefRegState(RO.isUndef());
+      BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB);
+    }
+    return 1;
+  }
+  assert((!Cond.empty()) &&
+         "Cond. cannot be empty when multiple branchings are required");
+  assert((!isNewValueJump(Cond[0].getImm())) &&
+         "NV-jump cannot be inserted with another branch");
+  // Special case for hardware loops. The condition is a basic block.
+  if (isEndLoopN(Cond[0].getImm())) {
+    int EndLoopOp = Cond[0].getImm();
+    assert(Cond[1].isMBB());
+    // Since we're adding an ENDLOOP, there better be a LOOP instruction.
+    // Check for it, and change the BB target if needed.
+    SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+    MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+    assert(Loop != nullptr && "Inserting an ENDLOOP without a LOOP");
+    Loop->getOperand(0).setMBB(TBB);
+    // Add the ENDLOOP after finding the LOOP instruction.
+    BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB);
+  } else {
+    const MachineOperand &RO = Cond[1];
+    unsigned Flags = getUndefRegState(RO.isUndef());
+    BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB);
+  }
+  // The conditional part is always followed by the jump to the false block.
+  BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+
+  return 2;
+}
+
+/// Try to identify the induction variable and the compare instruction that
+/// control loop \p L. Only hardware loops are recognized at the moment: if the
+/// first terminator of the loop's bottom block is an ENDLOOP instruction,
+/// \p CmpInst is set to it, \p IndVarInst is set to null and false is
+/// returned. In every other case the outputs are left untouched and true is
+/// returned, meaning the loop could not be analyzed.
+bool HexagonInstrInfo::analyzeLoop(MachineLoop &L,
+                                   MachineInstr *&IndVarInst,
+                                   MachineInstr *&CmpInst) const {
+  MachineBasicBlock *Bottom = L.getBottomBlock();
+  MachineBasicBlock::iterator Term = Bottom->getFirstTerminator();
+
+  // Anything that is not a hardware loop is reported as not analyzable.
+  if (Term == Bottom->end() || !isEndLoopN(Term->getOpcode()))
+    return true;
+
+  IndVarInst = nullptr;
+  CmpInst = &*Term;
+  return false;
+}
+
+/// Generate code to reduce the loop iteration by one and check if the loop is
+/// finished. Return the value/register of the new loop count. This function
+/// assumes the nth iteration is peeled first.
+///
+/// Return value, as implemented below:
+///  - 0 if no loop set-up instruction can be found;
+///  - for an immediate trip count (J2_loop0i/J2_loop1i): the old count minus
+///    one, converted to unsigned;
+///  - for a register trip count: the new virtual register holding the
+///    decremented count. In this case \p Cond receives { J2_jumpf, result of
+///    the new compare } and \p PrevInsts is replaced by the two instructions
+///    created here.
+unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
+ MachineInstr *IndVar, MachineInstr &Cmp,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &PrevInsts,
+ unsigned Iter, unsigned MaxIter) const {
+ // We expect a hardware loop currently. This means that IndVar is set
+ // to null, and the compare is the ENDLOOP instruction.
+ assert((!IndVar) && isEndLoopN(Cmp.getOpcode())
+ && "Expecting a hardware loop");
+ MachineFunction *MF = MBB.getParent();
+ DebugLoc DL = Cmp.getDebugLoc();
+ SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+ MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(), VisitedBBs);
+ if (!Loop)
+ return 0;
+ // If the loop trip count is a compile-time value, then just change the
+ // value.
+ if (Loop->getOpcode() == Hexagon::J2_loop0i ||
+ Loop->getOpcode() == Hexagon::J2_loop1i) {
+ int64_t Offset = Loop->getOperand(1).getImm();
+ // A count of 1 (or less) leaves nothing to iterate: drop the set-up
+ // instruction instead of rewriting its immediate.
+ if (Offset <= 1)
+ Loop->eraseFromParent();
+ else
+ Loop->getOperand(1).setImm(Offset - 1);
+ return Offset - 1;
+ }
+ // The loop trip count is a run-time value. We generate code to subtract
+ // one from the trip count, and update the loop instruction.
+ // NOTE(review): findLoopInstr can also hand back J2_loop1r (for ENDLOOP1),
+ // which this assert rejects and which the J2_loop0r re-emitted below would
+ // not match -- confirm that register-count LOOP1 loops never reach here.
+ assert(Loop->getOpcode() == Hexagon::J2_loop0r && "Unexpected instruction");
+ unsigned LoopCount = Loop->getOperand(1).getReg();
+ // Check if we're done with the loop.
+ unsigned LoopEnd = createVR(MF, MVT::i1);
+ MachineInstr *NewCmp = BuildMI(&MBB, DL, get(Hexagon::C2_cmpgtui), LoopEnd).
+ addReg(LoopCount).addImm(1);
+ unsigned NewLoopCount = createVR(MF, MVT::i32);
+ MachineInstr *NewAdd = BuildMI(&MBB, DL, get(Hexagon::A2_addi), NewLoopCount).
+ addReg(LoopCount).addImm(-1);
+ // Update the previously generated instructions with the new loop counter.
+ for (SmallVectorImpl<MachineInstr *>::iterator I = PrevInsts.begin(),
+ E = PrevInsts.end(); I != E; ++I)
+ (*I)->substituteRegister(LoopCount, NewLoopCount, 0, getRegisterInfo());
+ PrevInsts.clear();
+ PrevInsts.push_back(NewCmp);
+ PrevInsts.push_back(NewAdd);
+ // Insert the new loop instruction if this is the last time the loop is
+ // decremented.
+ // Loop is still alive here; it is only erased further down.
+ if (Iter == MaxIter)
+ BuildMI(&MBB, DL, get(Hexagon::J2_loop0r)).
+ addMBB(Loop->getOperand(0).getMBB()).addReg(NewLoopCount);
+ // Delete the old loop instruction.
+ if (Iter == 0)
+ Loop->eraseFromParent();
+ // Branch condition for the caller: J2_jumpf on the result of the new compare.
+ Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf));
+ Cond.push_back(NewCmp->getOperand(0));
+ return NewLoopCount;
+}
+
+/// If-conversion of a single block is considered profitable when the block
+/// holds at most three non-debug instructions. The cycle counts and the
+/// branch probability are not consulted.
+bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
+      unsigned NumCycles, unsigned ExtraPredCycles,
+      BranchProbability Probability) const {
+  const unsigned MaxBlockSize = 3;
+  return nonDbgBBSize(&MBB) <= MaxBlockSize;
+}
+
+
+/// If-conversion of a true/false block pair is considered profitable when
+/// each of the two blocks holds at most three non-debug instructions. The
+/// cycle counts and the branch probability are not consulted.
+bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+      unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB,
+      unsigned NumFCycles, unsigned ExtraFCycles, BranchProbability Probability)
+      const {
+  const unsigned MaxBlockSize = 3;
+  // The false block is only measured if the true block is small enough.
+  if (nonDbgBBSize(&TMBB) > MaxBlockSize)
+    return false;
+  return nonDbgBBSize(&FMBB) <= MaxBlockSize;
+}
+
+
+/// Duplicating a block for if-conversion is considered profitable when it
+/// involves at most four instructions. The block itself and the branch
+/// probability are not consulted.
+bool HexagonInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+      unsigned NumInstrs, BranchProbability Probability) const {
+  const unsigned MaxDupInstrs = 4;
+  return NumInstrs <= MaxDupInstrs;
+}
+
+/// Emit, before \p I in \p MBB, the instruction(s) that copy physical
+/// register \p SrcReg into \p DestReg. The opcode is chosen from the register
+/// classes of the two registers; the first matching case below wins.
+/// \p KillSrc marks the (last) use of the source as a kill.
+/// Register class combinations that are not handled end in llvm_unreachable.
+void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   const DebugLoc &DL, unsigned DestReg,
+                                   unsigned SrcReg, bool KillSrc) const {
+  auto &HRI = getRegisterInfo();
+  unsigned KillFlag = getKillRegState(KillSrc);
+
+  if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::DoubleRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrp), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::PredRegsRegClass.contains(SrcReg, DestReg)) {
+    // Map Pd = Ps to Pd = or(Ps, Ps).
+    BuildMI(MBB, I, DL, get(Hexagon::C2_or), DestReg)
+      .addReg(SrcReg).addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::CtrRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrrcr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::IntRegsRegClass.contains(DestReg) &&
+      Hexagon::CtrRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrcrr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  // Modifier registers are written with the same opcode as control registers.
+  if (Hexagon::ModRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrrcr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::PredRegsRegClass.contains(SrcReg) &&
+      Hexagon::IntRegsRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+      Hexagon::PredRegsRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::C2_tfrrp), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::VectorRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::V6_vassign), DestReg).
+      addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::VecDblRegsRegClass.contains(SrcReg, DestReg)) {
+    // Copy a vector pair by recombining its two halves.
+    unsigned LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+    unsigned HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+    BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg)
+      .addReg(HiSrc, KillFlag)
+      .addReg(LoSrc, KillFlag);
+    return;
+  }
+  if (Hexagon::VecPredRegsRegClass.contains(SrcReg, DestReg)) {
+    // Map Qd = Qs to Qd = and(Qs, Qs).
+    BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DestReg)
+      .addReg(SrcReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::VecPredRegsRegClass.contains(SrcReg) &&
+      Hexagon::VectorRegsRegClass.contains(DestReg)) {
+    llvm_unreachable("Unimplemented pred to vec");
+    return;
+  }
+  if (Hexagon::VecPredRegsRegClass.contains(DestReg) &&
+      Hexagon::VectorRegsRegClass.contains(SrcReg)) {
+    llvm_unreachable("Unimplemented vec to pred");
+    return;
+  }
+  if (Hexagon::VecPredRegs128BRegClass.contains(SrcReg, DestReg)) {
+    // NOTE(review): V6_pred_and is given a single source operand here,
+    // whereas the VecPredRegs copy above passes two -- confirm the operand
+    // count (and the use of vsub_hi/vsub_lo on a predicate register).
+    unsigned HiDst = HRI.getSubReg(DestReg, Hexagon::vsub_hi);
+    unsigned LoDst = HRI.getSubReg(DestReg, Hexagon::vsub_lo);
+    unsigned HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+    unsigned LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+    BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), HiDst)
+      .addReg(HiSrc, KillFlag);
+    BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), LoDst)
+      .addReg(LoSrc, KillFlag);
+    return;
+  }
+
+#ifndef NDEBUG
+  // Show the invalid registers to ease debugging.
+  dbgs() << "Invalid registers for copy in BB#" << MBB.getNumber()
+         << ": " << PrintReg(DestReg, &HRI)
+         << " = " << PrintReg(SrcReg, &HRI) << '\n';
+#endif
+  llvm_unreachable("Unimplemented");
+}
+
+
+/// Emit, before \p I in \p MBB, a store of \p SrcReg to the stack slot \p FI.
+/// The store opcode is picked from the register class \p RC (first match in
+/// the chain below); for the vector classes the unaligned variant is used
+/// when the slot's alignment is below the vector size checked for that class.
+/// Every store is built the same way: frame index, offset 0, source register
+/// (killed if \p isKill) and a memory operand describing the slot.
+/// \p TRI is not used. Unsupported register classes end in llvm_unreachable.
+void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI,
+      const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(I);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+  unsigned KillFlag = getKillRegState(isKill);
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+      MFI.getObjectSize(FI), Align);
+
+  // Step 1: choose the store opcode for this register class.
+  unsigned StoreOpc = 0;
+  if (Hexagon::IntRegsRegClass.hasSubClassEq(RC))
+    StoreOpc = Hexagon::S2_storeri_io;
+  else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC))
+    StoreOpc = Hexagon::S2_storerd_io;
+  else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC))
+    StoreOpc = Hexagon::STriw_pred;
+  else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC))
+    StoreOpc = Hexagon::STriw_mod;
+  else if (Hexagon::VecPredRegs128BRegClass.hasSubClassEq(RC))
+    StoreOpc = Hexagon::PS_vstorerq_ai_128B;
+  else if (Hexagon::VecPredRegsRegClass.hasSubClassEq(RC))
+    StoreOpc = Hexagon::PS_vstorerq_ai;
+  else if (Hexagon::VectorRegs128BRegClass.hasSubClassEq(RC))
+    StoreOpc = Align < 128 ? Hexagon::V6_vS32Ub_ai_128B
+                           : Hexagon::V6_vS32b_ai_128B;
+  else if (Hexagon::VectorRegsRegClass.hasSubClassEq(RC))
+    StoreOpc = Align < 64 ? Hexagon::V6_vS32Ub_ai
+                          : Hexagon::V6_vS32b_ai;
+  else if (Hexagon::VecDblRegsRegClass.hasSubClassEq(RC))
+    StoreOpc = Align < 64 ? Hexagon::PS_vstorerwu_ai
+                          : Hexagon::PS_vstorerw_ai;
+  else if (Hexagon::VecDblRegs128BRegClass.hasSubClassEq(RC))
+    StoreOpc = Align < 128 ? Hexagon::PS_vstorerwu_ai_128B
+                           : Hexagon::PS_vstorerw_ai_128B;
+  else
+    llvm_unreachable("Unimplemented");
+
+  // Step 2: all stores share the same operand layout.
+  BuildMI(MBB, I, DL, get(StoreOpc))
+    .addFrameIndex(FI).addImm(0)
+    .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+}
+
+/// Emit, before \p I in \p MBB, a load that reloads \p DestReg from the stack
+/// slot \p FI.
+///
+/// The load opcode is chosen from the register class \p RC. For the vector
+/// register classes the unaligned opcode is used when the slot's alignment
+/// is below 64 (or below 128 for the 128B classes). The emitted instruction
+/// addresses the frame index with an immediate offset of 0 and carries a
+/// memory operand describing the fixed-stack access.
+///
+/// \p TRI is not used by this implementation. A register class without a
+/// matching load opcode is a programming error (llvm_unreachable).
+void HexagonInstrInfo::loadRegFromStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg,
+    int FI, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(I);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+      MFI.getObjectSize(FI), Align);
+
+  // Every register class is reloaded with the same operand layout; only the
+  // opcode differs, so select it first and build the instruction once.
+  unsigned Opc;
+  if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
+    Opc = Hexagon::L2_loadri_io;
+  } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) {
+    Opc = Hexagon::L2_loadrd_io;
+  } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) {
+    Opc = Hexagon::LDriw_pred;
+  } else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC)) {
+    Opc = Hexagon::LDriw_mod;
+  } else if (Hexagon::VecPredRegs128BRegClass.hasSubClassEq(RC)) {
+    Opc = Hexagon::PS_vloadrq_ai_128B;
+  } else if (Hexagon::VecPredRegsRegClass.hasSubClassEq(RC)) {
+    Opc = Hexagon::PS_vloadrq_ai;
+  } else if (Hexagon::VecDblRegs128BRegClass.hasSubClassEq(RC)) {
+    Opc = Align < 128 ? Hexagon::PS_vloadrwu_ai_128B
+                      : Hexagon::PS_vloadrw_ai_128B;
+  } else if (Hexagon::VectorRegs128BRegClass.hasSubClassEq(RC)) {
+    Opc = Align < 128 ? Hexagon::V6_vL32Ub_ai_128B
+                      : Hexagon::V6_vL32b_ai_128B;
+  } else if (Hexagon::VectorRegsRegClass.hasSubClassEq(RC)) {
+    Opc = Align < 64 ? Hexagon::V6_vL32Ub_ai
+                     : Hexagon::V6_vL32b_ai;
+  } else if (Hexagon::VecDblRegsRegClass.hasSubClassEq(RC)) {
+    Opc = Align < 64 ? Hexagon::PS_vloadrwu_ai
+                     : Hexagon::PS_vloadrw_ai;
+  } else {
+    // This is the reload path, so report a load failure (the message used to
+    // talk about storing, which was misleading when diagnosing a crash).
+    llvm_unreachable("Can't load this register from stack slot");
+  }
+
+  BuildMI(MBB, I, DL, get(Opc), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+}
+
+
+/// Accumulate into \p Regs the physical-register liveness obtained by
+/// starting from the live-outs of the block containing \p MI and stepping
+/// backward over every instruction from the end of the block up to and
+/// including \p MI.
+static void getLiveRegsAt(LivePhysRegs &Regs, const MachineInstr &MI) {
+  const MachineBasicBlock &Block = *MI.getParent();
+  Regs.addLiveOuts(Block);
+
+  // Reverse iterator one position past MI, so that MI itself is the last
+  // instruction visited by the walk below.
+  MachineBasicBlock::const_iterator AtMI(MI.getIterator());
+  auto Stop = AtMI.getReverse();
+  ++Stop;
+
+  auto RI = Block.rbegin();
+  while (RI != Stop) {
+    Regs.stepBackward(*RI);
+    ++RI;
+  }
+}
+
+/// expandPostRAPseudo - This function is called for all pseudo instructions
+/// that remain after register allocation. Many pseudo instructions are
+/// created to help register allocation. This is the place to convert them
+/// into real instructions. The target can edit MI in place, or it can insert
+/// new instructions and erase MI. The function should return true if
+/// anything was changed.
+///
+/// Opcodes handled here: COPY, PS_aligna, V6_vassignp, V6_lo/V6_hi, the
+/// double-vector spill/reload pseudos (PS_vstorerw*/PS_vloadrw*),
+/// PS_true/PS_false, PS_vmulw/PS_vmulw_acc, the select pseudos
+/// (PS_pselect/PS_vselect/PS_wselect) and the tail-call/return jump pseudos.
+/// Any other opcode is left untouched and false is returned.
+bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  const HexagonRegisterInfo &HRI = getRegisterInfo();
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Opc = MI.getOpcode();
+  const unsigned VecOffset = 1;
+
+  switch (Opc) {
+    case TargetOpcode::COPY: {
+      MachineOperand &MD = MI.getOperand(0);
+      MachineOperand &MS = MI.getOperand(1);
+      MachineBasicBlock::iterator MBBI = MI.getIterator();
+      // Identity copies and copies from an undef source need no code; the
+      // COPY is simply removed.
+      if (MD.getReg() != MS.getReg() && !MS.isUndef()) {
+        copyPhysReg(MBB, MI, DL, MD.getReg(), MS.getReg(), MS.isKill());
+        // Carry the implicit operands of the COPY over to the instruction
+        // that was just inserted in front of it.
+        std::prev(MBBI)->copyImplicitOps(*MBB.getParent(), MI);
+      }
+      MBB.erase(MBBI);
+      return true;
+    }
+    case Hexagon::PS_aligna:
+      // Rd = and(FrameReg, #-Imm)
+      BuildMI(MBB, MI, DL, get(Hexagon::A2_andir), MI.getOperand(0).getReg())
+          .addReg(HRI.getFrameRegister())
+          .addImm(-MI.getOperand(1).getImm());
+      MBB.erase(MI);
+      return true;
+    case Hexagon::V6_vassignp_128B:
+    case Hexagon::V6_vassignp: {
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned Kill = getKillRegState(MI.getOperand(1).isKill());
+      BuildMI(MBB, MI, DL, get(Hexagon::V6_vcombine), DstReg)
+          .addReg(HRI.getSubReg(SrcReg, Hexagon::vsub_hi), Kill)
+          .addReg(HRI.getSubReg(SrcReg, Hexagon::vsub_lo), Kill);
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::V6_lo_128B:
+    case Hexagon::V6_lo: {
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+      copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI.getOperand(1).isKill());
+      MBB.erase(MI);
+      MRI.clearKillFlags(SrcSubLo);
+      return true;
+    }
+    case Hexagon::V6_hi_128B:
+    case Hexagon::V6_hi: {
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+      copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI.getOperand(1).isKill());
+      MBB.erase(MI);
+      MRI.clearKillFlags(SrcSubHi);
+      return true;
+    }
+    case Hexagon::PS_vstorerw_ai:
+    case Hexagon::PS_vstorerwu_ai:
+    case Hexagon::PS_vstorerw_ai_128B:
+    case Hexagon::PS_vstorerwu_ai_128B: {
+      // Split the double-vector store into two single-vector stores: the low
+      // half at the original offset, the high half one vector further.
+      bool Is128B = (Opc == Hexagon::PS_vstorerw_ai_128B ||
+                     Opc == Hexagon::PS_vstorerwu_ai_128B);
+      bool Aligned = (Opc == Hexagon::PS_vstorerw_ai ||
+                      Opc == Hexagon::PS_vstorerw_ai_128B);
+      unsigned SrcReg = MI.getOperand(2).getReg();
+      unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+      unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+      unsigned NewOpc;
+      if (Aligned)
+        NewOpc = Is128B ? Hexagon::V6_vS32b_ai_128B
+                        : Hexagon::V6_vS32b_ai;
+      else
+        NewOpc = Is128B ? Hexagon::V6_vS32Ub_ai_128B
+                        : Hexagon::V6_vS32Ub_ai;
+
+      unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
+      MachineInstr *MI1New =
+          BuildMI(MBB, MI, DL, get(NewOpc))
+              .addOperand(MI.getOperand(0))
+              .addImm(MI.getOperand(1).getImm())
+              .addReg(SrcSubLo)
+              .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      // The base operand is used again by the second store, so it must not
+      // be marked as killed on the first one.
+      MI1New->getOperand(0).setIsKill(false);
+      BuildMI(MBB, MI, DL, get(NewOpc))
+          .addOperand(MI.getOperand(0))
+          // The Vectors are indexed in multiples of vector size.
+          .addImm(MI.getOperand(1).getImm() + Offset)
+          .addReg(SrcSubHi)
+          .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_vloadrw_ai:
+    case Hexagon::PS_vloadrwu_ai:
+    case Hexagon::PS_vloadrw_ai_128B:
+    case Hexagon::PS_vloadrwu_ai_128B: {
+      // Split the double-vector load into two single-vector loads: the low
+      // half from the original offset, the high half one vector further.
+      bool Is128B = (Opc == Hexagon::PS_vloadrw_ai_128B ||
+                     Opc == Hexagon::PS_vloadrwu_ai_128B);
+      bool Aligned = (Opc == Hexagon::PS_vloadrw_ai ||
+                      Opc == Hexagon::PS_vloadrw_ai_128B);
+      unsigned NewOpc;
+      if (Aligned)
+        NewOpc = Is128B ? Hexagon::V6_vL32b_ai_128B
+                        : Hexagon::V6_vL32b_ai;
+      else
+        NewOpc = Is128B ? Hexagon::V6_vL32Ub_ai_128B
+                        : Hexagon::V6_vL32Ub_ai;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
+      // Attach the memory operands to BOTH halves, exactly as the store
+      // expansion above does. Previously the low-half load was emitted
+      // without any memory operand.
+      MachineInstr *MI1New =
+          BuildMI(MBB, MI, DL, get(NewOpc),
+                  HRI.getSubReg(DstReg, Hexagon::vsub_lo))
+              .addOperand(MI.getOperand(1))
+              .addImm(MI.getOperand(2).getImm())
+              .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      // The base operand is used again by the second load, so it must not
+      // be marked as killed on the first one.
+      MI1New->getOperand(1).setIsKill(false);
+      BuildMI(MBB, MI, DL, get(NewOpc),
+              HRI.getSubReg(DstReg, Hexagon::vsub_hi))
+          .addOperand(MI.getOperand(1))
+          // The Vectors are indexed in multiples of vector size.
+          .addImm(MI.getOperand(2).getImm() + Offset)
+          .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_true: {
+      // Reg = orn(Reg, Reg); both inputs are marked undef.
+      unsigned Reg = MI.getOperand(0).getReg();
+      BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg)
+          .addReg(Reg, RegState::Undef)
+          .addReg(Reg, RegState::Undef);
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_false: {
+      // Reg = andn(Reg, Reg); both inputs are marked undef.
+      unsigned Reg = MI.getOperand(0).getReg();
+      BuildMI(MBB, MI, DL, get(Hexagon::C2_andn), Reg)
+          .addReg(Reg, RegState::Undef)
+          .addReg(Reg, RegState::Undef);
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_vmulw: {
+      // Expand a 64-bit vector multiply into 2 32-bit scalar multiplies.
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned Src1Reg = MI.getOperand(1).getReg();
+      unsigned Src2Reg = MI.getOperand(2).getReg();
+      unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi);
+      unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo);
+      unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi);
+      unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo);
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_mpyi),
+              HRI.getSubReg(DstReg, Hexagon::isub_hi))
+          .addReg(Src1SubHi)
+          .addReg(Src2SubHi);
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_mpyi),
+              HRI.getSubReg(DstReg, Hexagon::isub_lo))
+          .addReg(Src1SubLo)
+          .addReg(Src2SubLo);
+      MBB.erase(MI);
+      MRI.clearKillFlags(Src1SubHi);
+      MRI.clearKillFlags(Src1SubLo);
+      MRI.clearKillFlags(Src2SubHi);
+      MRI.clearKillFlags(Src2SubLo);
+      return true;
+    }
+    case Hexagon::PS_vmulw_acc: {
+      // Expand 64-bit vector multiply with addition into 2 scalar multiplies.
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned Src1Reg = MI.getOperand(1).getReg();
+      unsigned Src2Reg = MI.getOperand(2).getReg();
+      unsigned Src3Reg = MI.getOperand(3).getReg();
+      unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi);
+      unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo);
+      unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi);
+      unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo);
+      unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::isub_hi);
+      unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::isub_lo);
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_maci),
+              HRI.getSubReg(DstReg, Hexagon::isub_hi))
+          .addReg(Src1SubHi)
+          .addReg(Src2SubHi)
+          .addReg(Src3SubHi);
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_maci),
+              HRI.getSubReg(DstReg, Hexagon::isub_lo))
+          .addReg(Src1SubLo)
+          .addReg(Src2SubLo)
+          .addReg(Src3SubLo);
+      MBB.erase(MI);
+      MRI.clearKillFlags(Src1SubHi);
+      MRI.clearKillFlags(Src1SubLo);
+      MRI.clearKillFlags(Src2SubHi);
+      MRI.clearKillFlags(Src2SubLo);
+      MRI.clearKillFlags(Src3SubHi);
+      MRI.clearKillFlags(Src3SubLo);
+      return true;
+    }
+    case Hexagon::PS_pselect: {
+      // Rd = Pu ? Rs : Rt, expanded into up to two conditional transfers.
+      // A transfer is omitted when its source already is the destination.
+      const MachineOperand &Op0 = MI.getOperand(0);
+      const MachineOperand &Op1 = MI.getOperand(1);
+      const MachineOperand &Op2 = MI.getOperand(2);
+      const MachineOperand &Op3 = MI.getOperand(3);
+      unsigned Rd = Op0.getReg();
+      unsigned Pu = Op1.getReg();
+      unsigned Rs = Op2.getReg();
+      unsigned Rt = Op3.getReg();
+      unsigned K1 = getKillRegState(Op1.isKill());
+      unsigned K2 = getKillRegState(Op2.isKill());
+      unsigned K3 = getKillRegState(Op3.isKill());
+      if (Rd != Rs)
+        // The predicate may only be killed here if no second transfer
+        // (which would read it again) is going to be emitted.
+        BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpt), Rd)
+            .addReg(Pu, (Rd == Rt) ? K1 : 0)
+            .addReg(Rs, K2);
+      if (Rd != Rt)
+        BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpf), Rd)
+            .addReg(Pu, K1)
+            .addReg(Rt, K3);
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_vselect:
+    case Hexagon::PS_vselect_128B: {
+      const MachineOperand &Op0 = MI.getOperand(0);
+      const MachineOperand &Op1 = MI.getOperand(1);
+      const MachineOperand &Op2 = MI.getOperand(2);
+      const MachineOperand &Op3 = MI.getOperand(3);
+      LivePhysRegs LiveAtMI(&HRI);
+      getLiveRegsAt(LiveAtMI, MI);
+      bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+      if (Op0.getReg() != Op2.getReg()) {
+        auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vcmov))
+                     .addOperand(Op0)
+                     .addOperand(Op1)
+                     .addOperand(Op2);
+        // A conditional move only partially defines the destination; when
+        // the destination is live, add it as an implicit use.
+        if (IsDestLive)
+          T.addReg(Op0.getReg(), RegState::Implicit);
+        // After the first conditional move the destination is live for the
+        // second one.
+        IsDestLive = true;
+      }
+      if (Op0.getReg() != Op3.getReg()) {
+        auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vncmov))
+                     .addOperand(Op0)
+                     .addOperand(Op1)
+                     .addOperand(Op3);
+        if (IsDestLive)
+          T.addReg(Op0.getReg(), RegState::Implicit);
+      }
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_wselect:
+    case Hexagon::PS_wselect_128B: {
+      MachineOperand &Op0 = MI.getOperand(0);
+      MachineOperand &Op1 = MI.getOperand(1);
+      MachineOperand &Op2 = MI.getOperand(2);
+      MachineOperand &Op3 = MI.getOperand(3);
+      LivePhysRegs LiveAtMI(&HRI);
+      getLiveRegsAt(LiveAtMI, MI);
+      bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+
+      if (Op0.getReg() != Op2.getReg()) {
+        unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo);
+        unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi);
+        auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine))
+                     .addOperand(Op0)
+                     .addOperand(Op1)
+                     .addReg(SrcHi)
+                     .addReg(SrcLo);
+        // Same implicit-use handling as for PS_vselect above.
+        if (IsDestLive)
+          T.addReg(Op0.getReg(), RegState::Implicit);
+        IsDestLive = true;
+      }
+      if (Op0.getReg() != Op3.getReg()) {
+        unsigned SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo);
+        unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi);
+        auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine))
+                     .addOperand(Op0)
+                     .addOperand(Op1)
+                     .addReg(SrcHi)
+                     .addReg(SrcLo);
+        if (IsDestLive)
+          T.addReg(Op0.getReg(), RegState::Implicit);
+      }
+      MBB.erase(MI);
+      return true;
+    }
+    // The remaining pseudos are rewritten in place by swapping the opcode
+    // descriptor; the operands are kept as they are.
+    case Hexagon::PS_tailcall_i:
+      MI.setDesc(get(Hexagon::J2_jump));
+      return true;
+    case Hexagon::PS_tailcall_r:
+    case Hexagon::PS_jmpret:
+      MI.setDesc(get(Hexagon::J2_jumpr));
+      return true;
+    case Hexagon::PS_jmprett:
+      MI.setDesc(get(Hexagon::J2_jumprt));
+      return true;
+    case Hexagon::PS_jmpretf:
+      MI.setDesc(get(Hexagon::J2_jumprf));
+      return true;
+    case Hexagon::PS_jmprettnewpt:
+      MI.setDesc(get(Hexagon::J2_jumprtnewpt));
+      return true;
+    case Hexagon::PS_jmpretfnewpt:
+      MI.setDesc(get(Hexagon::J2_jumprfnewpt));
+      return true;
+    case Hexagon::PS_jmprettnew:
+      MI.setDesc(get(Hexagon::J2_jumprtnew));
+      return true;
+    case Hexagon::PS_jmpretfnew:
+      MI.setDesc(get(Hexagon::J2_jumprfnew));
+      return true;
+  }
+
+  return false;
+}
+
+
+// Reverse the branch condition held in Cond by replacing the branch opcode
+// stored in Cond[0] with its inverted predicated counterpart.
+// Returns true when the condition cannot be reversed (empty condition or an
+// end-loop opcode), false when Cond[0] has been updated.
+bool HexagonInstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  if (Cond.empty())
+    return true;
+
+  MachineOperand &OpcOperand = Cond[0];
+  assert(OpcOperand.isImm() && "First entry in the cond vector not imm-val");
+  const unsigned BranchOpc = OpcOperand.getImm();
+  assert(get(BranchOpc).isBranch() && "Should be a branching condition.");
+
+  if (isEndLoopN(BranchOpc))
+    return true;
+
+  OpcOperand.setImm(getInvertedPredicatedOpcode(BranchOpc));
+  return false;
+}
+
+
+/// Insert an A2_nop, carrying an empty debug location, before \p MI in
+/// \p MBB.
+void HexagonInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI) const {
+  BuildMI(MBB, MI, DebugLoc(), get(Hexagon::A2_nop));
+}
+
+
+/// Return true if the addressing mode of \p MI is HexagonII::PostInc.
+bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const {
+  const bool UsesPostInc = (HexagonII::PostInc == getAddrMode(MI));
+  return UsesPostInc;
+}
+
+
+// Tell whether an instruction is predicated, regardless of the sense of the
+// predicate. Each of the following forms yields true:
+// if (p0) R1 = add(R2, R3)
+// if (!p0) R1 = add(R2, R3)
+// if (p0.new) R1 = add(R2, R3)
+// if (!p0.new) R1 = add(R2, R3)
+// Note: New-value stores are not included here as in the current
+// implementation, we don't need to check their predicate sense.
+bool HexagonInstrInfo::isPredicated(const MachineInstr &MI) const {
+  // The answer is the "Predicated" field of the target-specific flags.
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const uint64_t PredicatedField =
+      (Flags >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask;
+  return PredicatedField;
+}
+
+
+/// Convert \p MI in place into its predicated form, guarded by the predicate
+/// described by \p Cond.
+///
+/// Returns false (leaving \p MI untouched) when \p Cond is empty or when
+/// Cond[0] names a new-value jump or an end-loop opcode; returns true after
+/// \p MI has been rewritten. \p MI is expected to be predicable (asserted).
+bool HexagonInstrInfo::PredicateInstruction(
+ MachineInstr &MI, ArrayRef<MachineOperand> Cond) const {
+ if (Cond.empty() || isNewValueJump(Cond[0].getImm()) ||
+ isEndLoopN(Cond[0].getImm())) {
+ DEBUG(dbgs() << "\nCannot predicate:"; MI.dump(););
+ return false;
+ }
+ int Opc = MI.getOpcode();
+ assert (isPredicable(MI) && "Expected predicable instruction");
+ // Whether the condition's opcode carries a "not", i.e. whether the
+ // inverted-sense predicated opcode has to be selected.
+ bool invertJump = predOpcodeHasNot(Cond);
+
+ // We have to predicate MI "in place", i.e. after this function returns,
+ // MI will need to be transformed into a predicated form. To avoid com-
+ // plicated manipulations with the operands (handling tied operands,
+ // etc.), build a new temporary instruction, then overwrite MI with it.
+
+ MachineBasicBlock &B = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned PredOpc = getCondOpcode(Opc, invertJump);
+ MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc));
+ unsigned NOp = 0, NumOps = MI.getNumOperands();
+ // Copy the leading explicit register defs first; the predicate register
+ // is inserted right after them.
+ while (NOp < NumOps) {
+ MachineOperand &Op = MI.getOperand(NOp);
+ if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+ break;
+ T.addOperand(Op);
+ NOp++;
+ }
+
+ unsigned PredReg, PredRegPos, PredRegFlags;
+ bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags);
+ (void)GotPredReg;
+ assert(GotPredReg);
+ T.addReg(PredReg, PredRegFlags);
+ // Append all remaining operands of the original instruction.
+ while (NOp < NumOps)
+ T.addOperand(MI.getOperand(NOp++));
+
+ // Overwrite MI: switch its descriptor, drop all of its operands (from the
+ // last to the first), then copy over the operands of the temporary.
+ MI.setDesc(get(PredOpc));
+ while (unsigned n = MI.getNumOperands())
+ MI.RemoveOperand(n-1);
+ for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i)
+ MI.addOperand(T->getOperand(i));
+
+ // The temporary instruction has served its purpose; remove it again.
+ MachineBasicBlock::instr_iterator TI = T->getIterator();
+ B.erase(TI);
+
+ // MI now reads PredReg as well, so clear the kill flags recorded for it.
+ MachineRegisterInfo &MRI = B.getParent()->getRegInfo();
+ MRI.clearKillFlags(PredReg);
+ return true;
+}
+
+
+/// Placeholder implementation: no subsumption analysis is performed and the
+/// answer is always false, whatever \p Pred1 and \p Pred2 contain.
+bool HexagonInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+                                         ArrayRef<MachineOperand> Pred2) const {
+  // TODO: Fix this
+  const bool Subsumes = false;
+  return Subsumes;
+}
+
+
+/// Look for the first register-def operand of \p MI whose minimal physical
+/// register class is PredRegs. When one is found it is appended to \p Pred
+/// and true is returned; otherwise \p Pred is left alone and false is
+/// returned.
+bool HexagonInstrInfo::DefinesPredicate(
+    MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
+  auto &HRI = getRegisterInfo();
+  const unsigned NumOperands = MI.getNumOperands();
+  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
+    const MachineOperand &Operand = MI.getOperand(Idx);
+    // Only register definitions are of interest.
+    if (!Operand.isReg() || !Operand.isDef())
+      continue;
+    const TargetRegisterClass *RegClass =
+        HRI.getMinimalPhysRegClass(Operand.getReg());
+    if (RegClass != &Hexagon::PredRegsRegClass)
+      continue;
+    Pred.push_back(Operand);
+    return true;
+  }
+  return false;
+}
+
+
+/// Return the "predicable" property recorded in the descriptor of \p MI.
+bool HexagonInstrInfo::isPredicable(MachineInstr &MI) const {
+  const auto &Desc = MI.getDesc();
+  return Desc.isPredicable();
+}
+
+/// Decide whether \p MI, located in \p MBB, is an instruction that
+/// scheduling must not move other instructions across.
+///
+/// Boundaries are: calls that do not return, calls in a block that has an
+/// EH-pad successor, terminators, position markers (labels), and inline asm
+/// unless ScheduleInlineAsm is set. Debug values are never boundaries.
+/// \p MF is not used by this implementation.
+bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  // Debug info must not influence scheduling: a dbg_value is never treated
+  // as a boundary, so the result is the same with and without debug info.
+  if (MI.isDebugValue())
+    return false;
+
+  if (MI.isCall()) {
+    // Don't mess around with no return calls.
+    if (doesNotReturn(MI))
+      return true;
+    // A call in a block with a landing-pad successor could be a throwing
+    // call, and a throwing call is a boundary.
+    for (const MachineBasicBlock *Succ : MBB->successors())
+      if (Succ->isEHPad())
+        return true;
+  }
+
+  // Terminators and labels can't be scheduled around.
+  const bool TerminatorOrLabel =
+      MI.getDesc().isTerminator() || MI.isPosition();
+  if (TerminatorOrLabel)
+    return true;
+
+  // Inline asm is a boundary unless scheduling of it was explicitly enabled.
+  return MI.isInlineAsm() && !ScheduleInlineAsm;
+}
+
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// Hexagon counts the number of ##'s and adjusts for that many
+/// constant extenders.
+///
+/// \param Str NUL-terminated inline asm text.
+/// \param MAI supplies the separator string, the comment string and the
+///            maximum instruction length used for the estimate.
+/// \returns (number of instructions * MAI.getMaxInstLength()) plus 4 for
+///          every occurrence of "##" in \p Str.
+unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str,
+                                              const MCAsmInfo &MAI) const {
+  StringRef AStr(Str);
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0) {
+      atInsnStart = true;
+    } else if (strncmp(Str, MAI.getCommentString().data(),
+                       MAI.getCommentString().size()) == 0) {
+      // A comment runs until the next separator or newline; nothing in it
+      // may be counted. This test has to come BEFORE the instruction-start
+      // test below: the first character of a comment is not whitespace, so
+      // with the opposite order the comment itself was counted as an
+      // instruction and this branch could never take effect.
+      atInsnStart = false;
+    }
+
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+  }
+
+  // Add to size number of constant extenders seen * 4.
+  StringRef Occ("##");
+  Length += AStr.count(Occ)*4;
+  return Length;
+}
+
+
+/// Create the post-RA hazard recognizer: a HexagonHazardRecognizer when
+/// UseDFAHazardRec is set, otherwise whatever the TargetInstrInfo base
+/// implementation provides.
+ScheduleHazardRecognizer*
+HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
+    const InstrItineraryData *II, const ScheduleDAG *DAG) const {
+  if (!UseDFAHazardRec)
+    return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+
+  const auto &Subtarget = DAG->MF.getSubtarget<HexagonSubtarget>();
+  return new HexagonHazardRecognizer(II, this, Subtarget);
+}
+
+
+/// \brief Analyze a comparison instruction.
+///
+/// For a recognized compare opcode, \p SrcReg receives the register in
+/// operand 1 and \p Mask the mask of the compared bits (~0 for the C2/C4
+/// compares, 0xFF for the A4_cmpb* compares, 0xFFFF for the A4_cmph*
+/// compares). Register-register forms store operand 2 in \p SrcReg2;
+/// register-immediate forms set \p SrcReg2 to 0 and store the immediate of
+/// operand 2 in \p Value. Returns true if the comparison instruction can be
+/// analyzed; for any other opcode no output is written and false is
+/// returned.
+bool HexagonInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                      unsigned &SrcReg2, int &Mask,
+                                      int &Value) const {
+  const unsigned Opc = MI.getOpcode();
+
+  // Step 1: the mask depends only on the width of the comparison.
+  int CmpMask;
+  switch (Opc) {
+    default:
+      // Not a compare we know how to analyze.
+      return false;
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::C4_cmpneq:
+    case Hexagon::C4_cmplte:
+    case Hexagon::C4_cmplteu:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C4_cmpneqi:
+    case Hexagon::C4_cmplteui:
+    case Hexagon::C4_cmpltei:
+      CmpMask = ~0;
+      break;
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbgti:
+    case Hexagon::A4_cmpbgtui:
+      CmpMask = 0xFF;
+      break;
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgt:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmphgti:
+    case Hexagon::A4_cmphgtui:
+      CmpMask = 0xFFFF;
+      break;
+  }
+  // Every recognized compare keeps its first source register in operand 1.
+  SrcReg = MI.getOperand(1).getReg();
+  Mask = CmpMask;
+
+  // Step 2: operand 2 is either a second register or an immediate.
+  switch (Opc) {
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C4_cmpneqi:
+    case Hexagon::C4_cmplteui:
+    case Hexagon::C4_cmpltei:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbgti:
+    case Hexagon::A4_cmpbgtui:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmphgti:
+    case Hexagon::A4_cmphgtui:
+      SrcReg2 = 0;
+      Value = MI.getOperand(2).getImm();
+      return true;
+
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgt:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::C4_cmpneq:
+    case Hexagon::C4_cmplte:
+    case Hexagon::C4_cmplteu:
+      SrcReg2 = MI.getOperand(2).getReg();
+      return true;
+  }
+
+  return false;
+}
+
+/// Return the latency of \p MI as reported by getInstrTimingClassLatency for
+/// the itinerary \p ItinData. \p PredCost is neither read nor written.
+unsigned HexagonInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                           const MachineInstr &MI,
+                                           unsigned *PredCost) const {
+  const unsigned Latency = getInstrTimingClassLatency(ItinData, MI);
+  return Latency;
+}
+
+
+/// Create a DFA packetizer for the subtarget \p STI (which is treated as a
+/// HexagonSubtarget), based on the subtarget's instruction itineraries.
+DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
+    const TargetSubtargetInfo &STI) const {
+  const auto &HST = static_cast<const HexagonSubtarget &>(STI);
+  return HST.createDFAPacketizer(STI.getInstrItineraryData());
+}
+
+
+// Decide, without alias analysis (AA is not consulted), whether the memory
+// accesses of MIa and MIb provably do not overlap.
+// Inspired by this pair:
+// %R13<def> = L2_loadri_io %R29, 136; mem:LD4[FixedStack0]
+// S2_storeri_io %R29, 132, %R1<kill>; flags: mem:ST4[FixedStack1]
+// Currently AA considers the addresses in these instructions to be aliasing.
+bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
+    MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
+  // Nothing can be said about instructions with unmodeled side effects or
+  // ordered memory references.
+  const bool SideEffects =
+      MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects();
+  if (SideEffects || MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+    return false;
+
+  // Instructions that are pure loads, not loads and stores like memops are not
+  // dependent.
+  const bool PureLoadA = MIa.mayLoad() && !isMemOp(MIa);
+  if (PureLoadA && MIb.mayLoad() && !isMemOp(MIb))
+    return true;
+
+  // Base register, offset and access size of the first access.
+  int OffsetA = 0;
+  unsigned SizeA = 0;
+  const unsigned BaseRegA = getBaseAndOffset(MIa, OffsetA, SizeA);
+  if (!BaseRegA || !SizeA)
+    return false;
+
+  // Base register, offset and access size of the second access.
+  int OffsetB = 0;
+  unsigned SizeB = 0;
+  const unsigned BaseRegB = getBaseAndOffset(MIb, OffsetB, SizeB);
+  if (!BaseRegB || !SizeB)
+    return false;
+
+  // Different base registers (or the very same offset): cannot tell.
+  if (BaseRegA != BaseRegB || OffsetA == OffsetB)
+    return false;
+
+  // Same base register, distinct known offsets: the accesses are disjoint
+  // when the access at the lower offset ends at or before the higher offset.
+  const bool AIsHigher = OffsetA > OffsetB;
+  const int64_t HighOff = AIsHigher ? OffsetA : OffsetB;
+  const int64_t LowOff = AIsHigher ? OffsetB : OffsetA;
+  const unsigned LowSize = AIsHigher ? SizeB : SizeA;
+  const uint64_t Gap = (uint64_t)(HighOff - LowOff);
+  return LowSize <= Gap;
+}
+
+
+/// If the instruction is an increment of a constant value, return the amount.
+///
+/// For a post-increment instruction \p Value is filled in by
+/// getBaseAndOffset and the result is that call's return value converted to
+/// bool. For A2_addi, \p Value is the immediate in operand 2 and the result
+/// is true. For everything else the result is false.
+bool HexagonInstrInfo::getIncrementValue(const MachineInstr &MI,
+                                         int &Value) const {
+  if (!isPostIncrement(MI)) {
+    if (MI.getOpcode() != Hexagon::A2_addi)
+      return false;
+    Value = MI.getOperand(2).getImm();
+    return true;
+  }
+
+  // The access size is required by the helper's signature but not needed
+  // here.
+  unsigned UnusedAccessSize;
+  return getBaseAndOffset(MI, Value, UnusedAccessSize);
+}
+
+
+/// Create a new virtual register in \p MF whose register class matches the
+/// value type \p VT: PredRegs for i1, IntRegs for i32/f32 and DoubleRegs for
+/// i64/f64. Any other type is a programming error (llvm_unreachable).
+unsigned HexagonInstrInfo::createVR(MachineFunction *MF, MVT VT) const {
+  const TargetRegisterClass *RegClass = nullptr;
+  if (VT == MVT::i1)
+    RegClass = &Hexagon::PredRegsRegClass;
+  else if (VT == MVT::i32 || VT == MVT::f32)
+    RegClass = &Hexagon::IntRegsRegClass;
+  else if (VT == MVT::i64 || VT == MVT::f64)
+    RegClass = &Hexagon::DoubleRegsRegClass;
+  else
+    llvm_unreachable("Cannot handle this register class");
+
+  return MF->getRegInfo().createVirtualRegister(RegClass);
+}
+
+
+/// Return true if the addressing mode of \p MI is HexagonII::AbsoluteSet.
+bool HexagonInstrInfo::isAbsoluteSet(const MachineInstr &MI) const {
+  const bool UsesAbsoluteSet = (HexagonII::AbsoluteSet == getAddrMode(MI));
+  return UsesAbsoluteSet;
+}
+
+
+/// Return true if the "Accumulator" field of the target-specific flags of
+/// \p MI is non-zero.
+bool HexagonInstrInfo::isAccumulator(const MachineInstr &MI) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const uint64_t AccumulatorField =
+      (Flags >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask;
+  return AccumulatorField;
+}
+
+
+/// Return true if \p MI is "complex", i.e. it is none of the following:
+/// a TC1 or TC2-early instruction, a load, a store, S2_allocframe,
+/// L2_deallocframe, a memop, a branch, a return, or a call.
+bool HexagonInstrInfo::isComplex(const MachineInstr &MI) const {
+  const MachineFunction *MF = MI.getParent()->getParent();
+  const auto *QII =
+      static_cast<const HexagonInstrInfo *>(MF->getSubtarget().getInstrInfo());
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  // Each guard below rules out one family of non-complex instructions.
+  if (isTC1(MI) || QII->isTC2Early(MI))
+    return false;
+  if (Desc.mayLoad() || Desc.mayStore())
+    return false;
+  if (Desc.getOpcode() == Hexagon::S2_allocframe ||
+      Desc.getOpcode() == Hexagon::L2_deallocframe)
+    return false;
+  if (QII->isMemOp(MI))
+    return false;
+
+  return !MI.isBranch() && !MI.isReturn() && !MI.isCall();
+}
+
+
+// Return true if the instruction is a compound branch instruction, i.e. its
+// type is HexagonII::TypeCOMPOUND and it is a branch.
+bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const {
+  if (getType(MI) != HexagonII::TypeCOMPOUND)
+    return false;
+  return MI.isBranch();
+}
+
+
+/// Return true if \p MI is a conditional instruction: a predicated branch, a
+/// conditional transfer, a conditional ALU32 operation, a conditional load,
+/// or a predicated store that has no .new on any of its operands.
+bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const {
+  if (MI.isBranch() && isPredicated(MI))
+    return true;
+
+  if (isConditionalTransfer(MI) || isConditionalALU32(MI) ||
+      isConditionalLoad(MI))
+    return true;
+
+  // Predicated stores which don't have a .new on any operands.
+  return MI.mayStore() && isPredicated(MI) && !isNewValueStore(MI) &&
+         !isPredicatedNew(MI);
+}
+
+
+/// Return true if the opcode of \p MI is one of the conditional ALU32
+/// opcodes listed below (predicated add/and/or/sub/xor, the predicated
+/// A4 shift/extend forms, and the conditional combines).
+bool HexagonInstrInfo::isConditionalALU32(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+    // Predicated add (register and immediate forms).
+    case Hexagon::A2_paddf:
+    case Hexagon::A2_paddfnew:
+    case Hexagon::A2_paddif:
+    case Hexagon::A2_paddifnew:
+    case Hexagon::A2_paddit:
+    case Hexagon::A2_padditnew:
+    case Hexagon::A2_paddt:
+    case Hexagon::A2_paddtnew:
+    // Predicated and / or / sub / xor.
+    case Hexagon::A2_pandf:
+    case Hexagon::A2_pandfnew:
+    case Hexagon::A2_pandt:
+    case Hexagon::A2_pandtnew:
+    case Hexagon::A2_porf:
+    case Hexagon::A2_porfnew:
+    case Hexagon::A2_port:
+    case Hexagon::A2_portnew:
+    case Hexagon::A2_psubf:
+    case Hexagon::A2_psubfnew:
+    case Hexagon::A2_psubt:
+    case Hexagon::A2_psubtnew:
+    case Hexagon::A2_pxorf:
+    case Hexagon::A2_pxorfnew:
+    case Hexagon::A2_pxort:
+    case Hexagon::A2_pxortnew:
+    // Predicated A4 forms.
+    case Hexagon::A4_paslhf:
+    case Hexagon::A4_paslhfnew:
+    case Hexagon::A4_paslht:
+    case Hexagon::A4_paslhtnew:
+    case Hexagon::A4_pasrhf:
+    case Hexagon::A4_pasrhfnew:
+    case Hexagon::A4_pasrht:
+    case Hexagon::A4_pasrhtnew:
+    case Hexagon::A4_psxtbf:
+    case Hexagon::A4_psxtbfnew:
+    case Hexagon::A4_psxtbt:
+    case Hexagon::A4_psxtbtnew:
+    case Hexagon::A4_psxthf:
+    case Hexagon::A4_psxthfnew:
+    case Hexagon::A4_psxtht:
+    case Hexagon::A4_psxthtnew:
+    case Hexagon::A4_pzxtbf:
+    case Hexagon::A4_pzxtbfnew:
+    case Hexagon::A4_pzxtbt:
+    case Hexagon::A4_pzxtbtnew:
+    case Hexagon::A4_pzxthf:
+    case Hexagon::A4_pzxthfnew:
+    case Hexagon::A4_pzxtht:
+    case Hexagon::A4_pzxthtnew:
+    // Conditional combines.
+    case Hexagon::C2_ccombinewf:
+    case Hexagon::C2_ccombinewt:
+      return true;
+
+    default:
+      return false;
+  }
+}
+
+
+// FIXME - Function name and its functionality don't match.
+// It should be renamed to hasPredNewOpcode()
+//
+// Returns true for a predicated load for which a predicated-new opcode
+// exists.
+bool HexagonInstrInfo::isConditionalLoad(const MachineInstr &MI) const {
+  const bool IsPredicatedLoad = MI.getDesc().mayLoad() && isPredicated(MI);
+  // Instruction with valid predicated-new opcode can be promoted to .new.
+  return IsPredicatedLoad && Hexagon::getPredNewOpcode(MI.getOpcode()) >= 0;
+}
+
+
+// Returns true if an instruction is a conditional store, i.e. a store that
+// is predicated but not yet promoted to a predicate dot new instruction.
+//
+// Note: It doesn't include conditional new-value stores as they can't be
+// converted to .new predicate. Predicated new value stores
+// (i.e. if (p0) memw(..)=r0.new) are excluded from the "Conditional Store"
+// list, because a predicated new value store would NOT be promoted to a
+// double dot new store.
+bool HexagonInstrInfo::isConditionalStore(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+    // Byte stores.
+    case Hexagon::S4_storeirbt_io:
+    case Hexagon::S4_storeirbf_io:
+    case Hexagon::S4_pstorerbt_rr:
+    case Hexagon::S4_pstorerbf_rr:
+    case Hexagon::S2_pstorerbt_io:
+    case Hexagon::S2_pstorerbf_io:
+    case Hexagon::S2_pstorerbt_pi:
+    case Hexagon::S2_pstorerbf_pi:
+    // Doubleword stores.
+    case Hexagon::S2_pstorerdt_io:
+    case Hexagon::S2_pstorerdf_io:
+    case Hexagon::S4_pstorerdt_rr:
+    case Hexagon::S4_pstorerdf_rr:
+    case Hexagon::S2_pstorerdt_pi:
+    case Hexagon::S2_pstorerdf_pi:
+    // Halfword stores.
+    case Hexagon::S2_pstorerht_io:
+    case Hexagon::S2_pstorerhf_io:
+    case Hexagon::S4_storeirht_io:
+    case Hexagon::S4_storeirhf_io:
+    case Hexagon::S4_pstorerht_rr:
+    case Hexagon::S4_pstorerhf_rr:
+    case Hexagon::S2_pstorerht_pi:
+    case Hexagon::S2_pstorerhf_pi:
+    // Word stores.
+    case Hexagon::S2_pstorerit_io:
+    case Hexagon::S2_pstorerif_io:
+    case Hexagon::S4_storeirit_io:
+    case Hexagon::S4_storeirif_io:
+    case Hexagon::S4_pstorerit_rr:
+    case Hexagon::S4_pstorerif_rr:
+    case Hexagon::S2_pstorerit_pi:
+    case Hexagon::S2_pstorerif_pi:
+    // V4 global address store before promoting to dot new.
+    case Hexagon::S4_pstorerdt_abs:
+    case Hexagon::S4_pstorerdf_abs:
+    case Hexagon::S4_pstorerbt_abs:
+    case Hexagon::S4_pstorerbf_abs:
+    case Hexagon::S4_pstorerht_abs:
+    case Hexagon::S4_pstorerhf_abs:
+    case Hexagon::S4_pstorerit_abs:
+    case Hexagon::S4_pstorerif_abs:
+      return true;
+
+    default:
+      return false;
+  }
+}
+
+
+/// Return true if the opcode of \p MI is one of the conditional transfer
+/// opcodes: A2_tfr{t,f}[new], C2_cmove[new]i{t,f} or A2_tfrp{t,f}.
+bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr &MI) const {
+  bool IsCondTransfer = false;
+  switch (MI.getOpcode()) {
+    case Hexagon::A2_tfrt:
+    case Hexagon::A2_tfrf:
+    case Hexagon::C2_cmoveit:
+    case Hexagon::C2_cmoveif:
+    case Hexagon::A2_tfrtnew:
+    case Hexagon::A2_tfrfnew:
+    case Hexagon::C2_cmovenewit:
+    case Hexagon::C2_cmovenewif:
+    case Hexagon::A2_tfrpt:
+    case Hexagon::A2_tfrpf:
+      IsCondTransfer = true;
+      break;
+
+    default:
+      break;
+  }
+  return IsCondTransfer;
+}
+
+
+// TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle
+// isFPImm and later getFPImm as well.
+//
+// Returns true if MI is constant-extended. The decision is made in this
+// order:
+//  1. The opcode's TSFlags mark it as always extended        -> true.
+//  2. The opcode is not extendable at all, or MI is a call   -> false.
+//  3. The extendable operand carries HMOTF_ConstExtended      -> true.
+//  4. The extendable operand is a basic block (not flagged)   -> false.
+//  5. The extendable operand is a symbolic/FP operand         -> true.
+//  6. Otherwise it must be an immediate; it is extended iff its value lies
+//     outside [getMinValue(MI), getMaxValue(MI)].
+bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+  if (isExtended) // Instruction must be extended.
+    return true;
+
+  unsigned isExtendable =
+      (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+  if (!isExtendable)
+    return false;
+
+  if (MI.isCall())
+    return false;
+
+  short ExtOpNum = getCExtOpNum(MI);
+  const MachineOperand &MO = MI.getOperand(ExtOpNum);
+  // Use MO operand flags to determine if MO has the HMOTF_ConstExtended flag
+  // set. This has to be a bitwise AND: a logical AND would be true for any
+  // non-zero target-flag value, not just when this particular bit is set.
+  if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+    return true;
+  // If this is a Machine BB address we are talking about, and it is
+  // not marked as extended, say so.
+  if (MO.isMBB())
+    return false;
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a symbolic operand into it (we do this for COMBINE). Global addresses,
+  // external symbols, block addresses, jump-table and constant-pool indices,
+  // and FP immediates are all treated as constant extended.
+  if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() ||
+      MO.isJTI() || MO.isCPI() || MO.isFPImm())
+    return true;
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int MinValue = getMinValue(MI);
+  int MaxValue = getMaxValue(MI);
+  int ImmValue = MO.getImm();
+
+  // An immediate that does not fit the instruction's native range needs an
+  // extender.
+  return (ImmValue < MinValue || ImmValue > MaxValue);
+}
+
+
+// Returns true iff MI's opcode is one of the L4_return variants listed below.
+bool HexagonInstrInfo::isDeallocRet(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::L4_return:
+  case Hexagon::L4_return_t:
+  case Hexagon::L4_return_f:
+  case Hexagon::L4_return_tnew_pnt:
+  case Hexagon::L4_return_fnew_pnt:
+  case Hexagon::L4_return_tnew_pt:
+  case Hexagon::L4_return_fnew_pt:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+// Return true when ConsMI uses a register defined by ProdMI.
+// A match is either the very same register, or a register of
+// DoubleRegsRegClass on one side whose sub-register appears on the other.
+bool HexagonInstrInfo::isDependent(const MachineInstr &ProdMI,
+      const MachineInstr &ConsMI) const {
+  // A producer whose descriptor declares no defs cannot feed anything.
+  if (ProdMI.getDesc().getNumDefs() == 0)
+    return false;
+
+  auto &HRI = getRegisterInfo();
+
+  SmallVector<unsigned, 4> ProdDefs;
+  SmallVector<unsigned, 4> ConsDefs;
+  SmallVector<unsigned, 8> ProdUses;
+  SmallVector<unsigned, 8> ConsUses;
+
+  parseOperands(ProdMI, ProdDefs, ProdUses);
+  parseOperands(ConsMI, ConsDefs, ConsUses);
+
+  // True when Wide belongs to DoubleRegsRegClass and Narrow is one of the
+  // sub-registers enumerated for it.
+  auto IsSubRegOfDouble = [&HRI](unsigned Narrow, unsigned Wide) {
+    if (!Hexagon::DoubleRegsRegClass.contains(Wide))
+      return false;
+    for (MCSubRegIterator Sub(Wide, &HRI); Sub.isValid(); ++Sub)
+      if (Narrow == *Sub)
+        return true;
+    return false;
+  };
+
+  for (unsigned Def : ProdDefs) {
+    for (unsigned Use : ConsUses) {
+      // True data dependency.
+      if (Def == Use)
+        return true;
+      // The use reads half of a defined pair ...
+      if (IsSubRegOfDouble(Use, Def))
+        return true;
+      // ... or the use reads a pair, half of which was defined.
+      if (IsSubRegOfDouble(Def, Use))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+
+// Returns true if the instruction is already a .cur, i.e. its opcode is one
+// of the V6_vL32b_cur_* opcodes listed below.
+bool HexagonInstrInfo::isDotCurInst(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::V6_vL32b_cur_pi:
+  case Hexagon::V6_vL32b_cur_ai:
+  case Hexagon::V6_vL32b_cur_pi_128B:
+  case Hexagon::V6_vL32b_cur_ai_128B:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+// Returns true if MI is a dot-new instruction of either flavour: a new-value
+// instruction (see isNewValueInst), or a predicated instruction whose
+// predicate is .new (see isPredicatedNew).
+bool HexagonInstrInfo::isDotNewInst(const MachineInstr &MI) const {
+  // isPredicatedNew asserts on non-predicated input, so isPredicated must be
+  // tested first.
+  return isNewValueInst(MI) || (isPredicated(MI) && isPredicatedNew(MI));
+}
+
+
+/// Symmetrical. See if these two instructions are fit for duplex pair.
+/// The candidate groups of both instructions are matched in either order.
+bool HexagonInstrInfo::isDuplexPair(const MachineInstr &MIa,
+      const MachineInstr &MIb) const {
+  const HexagonII::SubInstructionGroup GroupA = getDuplexCandidateGroup(MIa);
+  const HexagonII::SubInstructionGroup GroupB = getDuplexCandidateGroup(MIb);
+  if (isDuplexPairMatch(GroupA, GroupB))
+    return true;
+  return isDuplexPairMatch(GroupB, GroupA);
+}
+
+
+// Returns true if MI may load, may store, is a compare, or belongs to the
+// M_tc_3or4x_SLOT23 scheduling class (multiply).
+bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr &MI) const {
+  const bool IsMemOrCompare = MI.mayLoad() || MI.mayStore() || MI.isCompare();
+  if (IsMemOrCompare)
+    return true;
+
+  // Multiply
+  return MI.getDesc().getSchedClass() == Hexagon::Sched::M_tc_3or4x_SLOT23;
+}
+
+
+// Returns true iff Opcode is ENDLOOP0 or ENDLOOP1.
+bool HexagonInstrInfo::isEndLoopN(unsigned Opcode) const {
+  switch (Opcode) {
+  case Hexagon::ENDLOOP0:
+  case Hexagon::ENDLOOP1:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+// Returns true iff OpType is one of the MachineOperand kinds listed below
+// (basic block, global address, external symbol, jump-table index,
+// constant-pool index, block address).
+bool HexagonInstrInfo::isExpr(unsigned OpType) const {
+  switch (OpType) {
+  default:
+    return false;
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_BlockAddress:
+    return true;
+  }
+}
+
+
+// Returns true if the Extendable field of MI's TSFlags is set, or if MI is
+// one of the two special-cased opcodes PS_fi / PS_fia.
+bool HexagonInstrInfo::isExtendable(const MachineInstr &MI) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const uint64_t ExtendableField =
+      (Flags >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+  if (ExtendableField != 0)
+    return true;
+
+  // TODO: This is largely obsolete now. Will need to be removed
+  // in consecutive patches.
+  // PS_fi and PS_fia remain special cases.
+  const unsigned Opc = MI.getOpcode();
+  return Opc == Hexagon::PS_fi || Opc == Hexagon::PS_fia;
+}
+
+
+// This returns true in two cases:
+// - The OP code itself indicates that this is an extended instruction.
+// - One of MOs has been marked with HMOTF_ConstExtended flag.
+bool HexagonInstrInfo::isExtended(const MachineInstr &MI) const {
+  // First check if this is permanently extended op code.
+  const uint64_t F = MI.getDesc().TSFlags;
+  if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask)
+    return true;
+  // Use MO operand flags to determine if one of MI's operands
+  // has HMOTF_ConstExtended flag set. The test must be a bitwise AND: a
+  // logical AND would report any operand with non-zero target flags as
+  // extended, regardless of which flag bits are actually set.
+  for (const MachineOperand &MO : MI.operands()) {
+    if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+      return true;
+  }
+  return false;
+}
+
+
+// Tests the FP field in the TSFlags of MI's opcode descriptor.
+bool HexagonInstrInfo::isFloat(const MachineInstr &MI) const {
+  const uint64_t Flags = get(MI.getOpcode()).TSFlags;
+  const uint64_t FPField = (Flags >> HexagonII::FPPos) & HexagonII::FPMask;
+  return FPField != 0;
+}
+
+
+// No V60 HVX VMEM with A_INDIRECT.
+// Returns true when I is a V60 vector instruction that may load or store and
+// J is an indirect branch, an indirect call, or an L4_return variant.
+bool HexagonInstrInfo::isHVXMemWithAIndirect(const MachineInstr &I,
+      const MachineInstr &J) const {
+  const bool IsVectorMemAccess =
+      isV60VectorInstruction(I) && (I.mayLoad() || I.mayStore());
+  if (!IsVectorMemAccess)
+    return false;
+
+  if (J.isIndirectBranch())
+    return true;
+  return isIndirectCall(J) || isIndirectL4Return(J);
+}
+
+
+// Returns true iff MI's opcode is one of the call opcodes treated as
+// indirect (register-target) calls.
+bool HexagonInstrInfo::isIndirectCall(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::J2_callr:
+  case Hexagon::J2_callrf:
+  case Hexagon::J2_callrt:
+  // The no-return pseudo of the register call was previously missing from
+  // this list even though doesNotReturn() recognizes it as a call opcode.
+  case Hexagon::PS_callr_nr:
+  // NOTE(review): PS_call_nr is grouped with the immediate-target J2_call in
+  // isJumpWithinBranchRange(), so it looks like a direct call. It is kept
+  // here only to preserve the existing (conservative) behavior for callers;
+  // confirm whether it can be dropped.
+  case Hexagon::PS_call_nr:
+    return true;
+  }
+  return false;
+}
+
+
+// Returns true iff MI's opcode is one of the L4_return variants listed below.
+bool HexagonInstrInfo::isIndirectL4Return(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case Hexagon::L4_return:
+  case Hexagon::L4_return_t:
+  case Hexagon::L4_return_f:
+  case Hexagon::L4_return_fnew_pnt:
+  case Hexagon::L4_return_fnew_pt:
+  case Hexagon::L4_return_tnew_pnt:
+  case Hexagon::L4_return_tnew_pt:
+    return true;
+  }
+}
+
+
+// Returns true iff MI's opcode is one of the J2_jumpr* opcodes listed below.
+bool HexagonInstrInfo::isJumpR(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case Hexagon::J2_jumpr:
+  case Hexagon::J2_jumprt:
+  case Hexagon::J2_jumprf:
+  case Hexagon::J2_jumprtnewpt:
+  case Hexagon::J2_jumprfnewpt:
+  case Hexagon::J2_jumprtnew:
+  case Hexagon::J2_jumprfnew:
+    return true;
+  }
+}
+
+
+// Return true if a given MI can accommodate the given offset.
+// Use an absolute-value estimate as opposed to the exact number.
+// Opcodes that are not listed below are reported as not fitting (false).
+// TODO: This will need to be changed to use MC level
+// definition of instruction extendable field size.
+// NOTE(review): offset is unsigned but is tested with isInt<N>, so only the
+// non-negative half of each signed range can match - presumably intentional
+// given the "absolute-value estimate" above; confirm against callers.
+bool HexagonInstrInfo::isJumpWithinBranchRange(const MachineInstr &MI,
+ unsigned offset) const {
+ // This selection of jump instructions matches to that what
+ // AnalyzeBranch can parse, plus NVJ.
+ // New-value jumps are recognized by flags rather than by opcode, so they
+ // are handled before the opcode switch.
+ if (isNewValueJump(MI)) // r9:2
+ return isInt<11>(offset);
+
+ switch (MI.getOpcode()) {
+ // Still missing Jump to address condition on register value.
+ default:
+ return false;
+ case Hexagon::J2_jump: // bits<24> dst; // r22:2
+ case Hexagon::J2_call:
+ case Hexagon::PS_call_nr:
+ return isInt<24>(offset);
+ case Hexagon::J2_jumpt: //bits<17> dst; // r15:2
+ case Hexagon::J2_jumpf:
+ case Hexagon::J2_jumptnew:
+ case Hexagon::J2_jumptnewpt:
+ case Hexagon::J2_jumpfnew:
+ case Hexagon::J2_jumpfnewpt:
+ case Hexagon::J2_callt:
+ case Hexagon::J2_callf:
+ return isInt<17>(offset);
+ // Hardware loop setup instructions.
+ case Hexagon::J2_loop0i:
+ case Hexagon::J2_loop0iext:
+ case Hexagon::J2_loop0r:
+ case Hexagon::J2_loop0rext:
+ case Hexagon::J2_loop1i:
+ case Hexagon::J2_loop1iext:
+ case Hexagon::J2_loop1r:
+ case Hexagon::J2_loop1rext:
+ return isInt<9>(offset);
+ // TODO: Add all the compound branches here. Can we do this in Relation model?
+ case Hexagon::J4_cmpeqi_tp0_jump_nt:
+ case Hexagon::J4_cmpeqi_tp1_jump_nt:
+ return isInt<11>(offset);
+ }
+}
+
+
+// Returns true when LRMI is a late-result instruction (isLateResultInstr) and
+// ESMI is an early-source instruction (isEarlySourceInstr). Both
+// classifications and both instructions are dumped to the debug stream.
+bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI,
+      const MachineInstr &ESMI) const {
+  const bool ProducesLate = isLateResultInstr(LRMI);
+  const bool ConsumesEarly = isEarlySourceInstr(ESMI);
+
+  DEBUG(dbgs() << "V60" << (ProducesLate ? "-LR " : " -- "));
+  DEBUG(LRMI.dump());
+  DEBUG(dbgs() << "V60" << (ConsumesEarly ? "-ES " : " -- "));
+  DEBUG(ESMI.dump());
+
+  if (!ProducesLate || !ConsumesEarly)
+    return false;
+
+  DEBUG(dbgs() << "++Is Late Result feeding Early Source\n");
+  return true;
+}
+
+
+// Returns true unless MI is one of the target-independent opcodes listed in
+// the first switch, or its scheduling class is one of those listed in the
+// second switch. In other words, "late result" is the default answer and the
+// two switches enumerate the exceptions.
+bool HexagonInstrInfo::isLateResultInstr(const MachineInstr &MI) const {
+ // Target-independent opcodes are never reported as late-result.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::PHI:
+ return false;
+ default:
+ break;
+ }
+
+ unsigned SchedClass = MI.getDesc().getSchedClass();
+
+ // The tc_1 classes below are the same ones isTC1() accepts; the V2LDST and
+ // V4LDST load/store classes are excluded as well.
+ switch (SchedClass) {
+ case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123:
+ case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123:
+ case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123:
+ case Hexagon::Sched::ALU64_tc_1_SLOT23:
+ case Hexagon::Sched::EXTENDER_tc_1_SLOT0123:
+ case Hexagon::Sched::S_2op_tc_1_SLOT23:
+ case Hexagon::Sched::S_3op_tc_1_SLOT23:
+ case Hexagon::Sched::V2LDST_tc_ld_SLOT01:
+ case Hexagon::Sched::V2LDST_tc_st_SLOT0:
+ case Hexagon::Sched::V2LDST_tc_st_SLOT01:
+ case Hexagon::Sched::V4LDST_tc_ld_SLOT01:
+ case Hexagon::Sched::V4LDST_tc_st_SLOT0:
+ case Hexagon::Sched::V4LDST_tc_st_SLOT01:
+ return false;
+ }
+ // Every other scheduling class counts as late-result.
+ return true;
+}
+
+
+// Returns true iff MI's scheduling class is CVI_VX_LATE.
+bool HexagonInstrInfo::isLateSourceInstr(const MachineInstr &MI) const {
+  // Instructions with iclass A_CVI_VX and attribute A_CVI_LATE uses a multiply
+  // resource, but all operands can be received late like an ALU instruction.
+  const unsigned SchedClass = MI.getDesc().getSchedClass();
+  return SchedClass == Hexagon::Sched::CVI_VX_LATE;
+}
+
+
+// Returns true iff MI's opcode is one of the eight J2_loop0*/J2_loop1*
+// opcodes listed below.
+bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::J2_loop0i:
+  case Hexagon::J2_loop0r:
+  case Hexagon::J2_loop0iext:
+  case Hexagon::J2_loop0rext:
+  case Hexagon::J2_loop1i:
+  case Hexagon::J2_loop1r:
+  case Hexagon::J2_loop1iext:
+  case Hexagon::J2_loop1rext:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+// Returns true iff MI's opcode is one of the L4_*_memop{w,h,b}_io opcodes
+// enumerated below. Pure opcode lookup; no operands are inspected.
+bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default: return false;
+ // memopw variants.
+ case Hexagon::L4_iadd_memopw_io :
+ case Hexagon::L4_isub_memopw_io :
+ case Hexagon::L4_add_memopw_io :
+ case Hexagon::L4_sub_memopw_io :
+ case Hexagon::L4_and_memopw_io :
+ case Hexagon::L4_or_memopw_io :
+ // memoph variants.
+ case Hexagon::L4_iadd_memoph_io :
+ case Hexagon::L4_isub_memoph_io :
+ case Hexagon::L4_add_memoph_io :
+ case Hexagon::L4_sub_memoph_io :
+ case Hexagon::L4_and_memoph_io :
+ case Hexagon::L4_or_memoph_io :
+ // memopb variants.
+ case Hexagon::L4_iadd_memopb_io :
+ case Hexagon::L4_isub_memopb_io :
+ case Hexagon::L4_add_memopb_io :
+ case Hexagon::L4_sub_memopb_io :
+ case Hexagon::L4_and_memopb_io :
+ case Hexagon::L4_or_memopb_io :
+ // ior/iand variants for all three widths.
+ case Hexagon::L4_ior_memopb_io:
+ case Hexagon::L4_ior_memoph_io:
+ case Hexagon::L4_ior_memopw_io:
+ case Hexagon::L4_iand_memopb_io:
+ case Hexagon::L4_iand_memoph_io:
+ case Hexagon::L4_iand_memopw_io:
+ return true;
+ }
+ // Not reachable: the switch above returns on every path (default included).
+ return false;
+}
+
+
+// Tests the NewValue field in the TSFlags of MI's descriptor.
+bool HexagonInstrInfo::isNewValue(const MachineInstr &MI) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const uint64_t NewValueField =
+      (Flags >> HexagonII::NewValuePos) & HexagonII::NewValueMask;
+  return NewValueField != 0;
+}
+
+
+// Tests the NewValue field in the TSFlags of the descriptor for Opcode.
+bool HexagonInstrInfo::isNewValue(unsigned Opcode) const {
+  const uint64_t Flags = get(Opcode).TSFlags;
+  const uint64_t NewValueField =
+      (Flags >> HexagonII::NewValuePos) & HexagonII::NewValueMask;
+  return NewValueField != 0;
+}
+
+
+// Returns true if MI is either a new-value jump or a new-value store.
+bool HexagonInstrInfo::isNewValueInst(const MachineInstr &MI) const {
+  if (isNewValueJump(MI))
+    return true;
+  return isNewValueStore(MI);
+}
+
+
+// Returns true if MI has the NewValue flag and is a branch.
+bool HexagonInstrInfo::isNewValueJump(const MachineInstr &MI) const {
+  if (!isNewValue(MI))
+    return false;
+  return MI.isBranch();
+}
+
+
+// Opcode-based variant: requires the NewValue flag, a branch descriptor, and
+// additionally that the opcode is predicated.
+bool HexagonInstrInfo::isNewValueJump(unsigned Opcode) const {
+  if (!isNewValue(Opcode))
+    return false;
+  if (!get(Opcode).isBranch())
+    return false;
+  return isPredicated(Opcode);
+}
+
+
+// Tests the NVStore field in the TSFlags of MI's descriptor.
+bool HexagonInstrInfo::isNewValueStore(const MachineInstr &MI) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const uint64_t NVStoreField =
+      (Flags >> HexagonII::NVStorePos) & HexagonII::NVStoreMask;
+  return NVStoreField != 0;
+}
+
+
+// Tests the NVStore field in the TSFlags of the descriptor for Opcode.
+bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
+  const uint64_t Flags = get(Opcode).TSFlags;
+  const uint64_t NVStoreField =
+      (Flags >> HexagonII::NVStorePos) & HexagonII::NVStoreMask;
+  return NVStoreField != 0;
+}
+
+
+// Returns true if a particular operand is extendable for an instruction,
+// i.e. OperandNum equals the ExtendableOp index stored in MI's TSFlags.
+bool HexagonInstrInfo::isOperandExtended(const MachineInstr &MI,
+      unsigned OperandNum) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const uint64_t ExtendableOpIdx =
+      (Flags >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask;
+  return ExtendableOpIdx == OperandNum;
+}
+
+
+// Tests the PredicatedNew field in MI's TSFlags. MI must be predicated
+// (asserted).
+bool HexagonInstrInfo::isPredicatedNew(const MachineInstr &MI) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  assert(isPredicated(MI));
+  const uint64_t PredNewField =
+      (Flags >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask;
+  return PredNewField != 0;
+}
+
+
+// Tests the PredicatedNew field in the TSFlags for Opcode. The opcode must be
+// predicated (asserted).
+bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const {
+  const uint64_t Flags = get(Opcode).TSFlags;
+  assert(isPredicated(Opcode));
+  const uint64_t PredNewField =
+      (Flags >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask;
+  return PredNewField != 0;
+}
+
+
+// Returns true when the PredicatedFalse field of MI's TSFlags is clear.
+// Unlike the opcode-based overload, this one does not assert that MI is
+// predicated.
+bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr &MI) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const uint64_t PredFalseField =
+      (Flags >> HexagonII::PredicatedFalsePos) & HexagonII::PredicatedFalseMask;
+  return PredFalseField == 0;
+}
+
+
+// Returns true when the PredicatedFalse field of the TSFlags for Opcode is
+// clear. The opcode must be predicated (asserted).
+bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const {
+  const uint64_t Flags = get(Opcode).TSFlags;
+  // Make sure that the instruction is predicated.
+  assert((Flags >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+  const uint64_t PredFalseField =
+      (Flags >> HexagonII::PredicatedFalsePos) & HexagonII::PredicatedFalseMask;
+  return PredFalseField == 0;
+}
+
+
+// Tests the Predicated field in the TSFlags of the descriptor for Opcode.
+bool HexagonInstrInfo::isPredicated(unsigned Opcode) const {
+  const uint64_t Flags = get(Opcode).TSFlags;
+  const uint64_t PredicatedField =
+      (Flags >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask;
+  return PredicatedField != 0;
+}
+
+
+// Examines the PredicateLate field in the TSFlags for Opcode.
+// NOTE(review): the shifted flags are bitwise-complemented before masking, so
+// this returns true when the PredicateLate bit(s) are NOT all set - the
+// opposite of what the name suggests. Behavior is deliberately left as-is
+// here; confirm against callers whether the '~' is intended.
+bool HexagonInstrInfo::isPredicateLate(unsigned Opcode) const {
+  const uint64_t Flags = get(Opcode).TSFlags;
+  const uint64_t InvertedShifted = ~(Flags >> HexagonII::PredicateLatePos);
+  return (InvertedShifted & HexagonII::PredicateLateMask) != 0;
+}
+
+
+// Tests the Taken field in the TSFlags for Opcode. The opcode must be a
+// branch that is either predicated-new or new-value (asserted).
+bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const {
+  const uint64_t Flags = get(Opcode).TSFlags;
+  assert(get(Opcode).isBranch() &&
+         (isPredicatedNew(Opcode) || isNewValue(Opcode)));
+  const uint64_t TakenField =
+      (Flags >> HexagonII::TakenPos) & HexagonII::TakenMask;
+  return TakenField != 0;
+}
+
+
+// Returns true iff MI's opcode is one of the four SAVE_REGISTERS_CALL_V4
+// variants (plain, EXT, PIC, EXT_PIC).
+bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::SAVE_REGISTERS_CALL_V4:
+  case Hexagon::SAVE_REGISTERS_CALL_V4_EXT:
+  case Hexagon::SAVE_REGISTERS_CALL_V4_PIC:
+  case Hexagon::SAVE_REGISTERS_CALL_V4_EXT_PIC:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true iff MI's opcode is one of the byte or halfword load opcodes
+// enumerated below; every other opcode yields false. The list mirrors the one
+// in isZeroExtendingLoad(), which enumerates the corresponding "ub"/"uh"/"bz"
+// opcodes. Pure opcode lookup; no operands are inspected.
+bool HexagonInstrInfo::isSignExtendingLoad(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ // Byte
+ case Hexagon::L2_loadrb_io:
+ case Hexagon::L4_loadrb_ur:
+ case Hexagon::L4_loadrb_ap:
+ case Hexagon::L2_loadrb_pr:
+ case Hexagon::L2_loadrb_pbr:
+ case Hexagon::L2_loadrb_pi:
+ case Hexagon::L2_loadrb_pci:
+ case Hexagon::L2_loadrb_pcr:
+ case Hexagon::L2_loadbsw2_io:
+ case Hexagon::L4_loadbsw2_ur:
+ case Hexagon::L4_loadbsw2_ap:
+ case Hexagon::L2_loadbsw2_pr:
+ case Hexagon::L2_loadbsw2_pbr:
+ case Hexagon::L2_loadbsw2_pi:
+ case Hexagon::L2_loadbsw2_pci:
+ case Hexagon::L2_loadbsw2_pcr:
+ case Hexagon::L2_loadbsw4_io:
+ case Hexagon::L4_loadbsw4_ur:
+ case Hexagon::L4_loadbsw4_ap:
+ case Hexagon::L2_loadbsw4_pr:
+ case Hexagon::L2_loadbsw4_pbr:
+ case Hexagon::L2_loadbsw4_pi:
+ case Hexagon::L2_loadbsw4_pci:
+ case Hexagon::L2_loadbsw4_pcr:
+ case Hexagon::L4_loadrb_rr:
+ case Hexagon::L2_ploadrbt_io:
+ case Hexagon::L2_ploadrbt_pi:
+ case Hexagon::L2_ploadrbf_io:
+ case Hexagon::L2_ploadrbf_pi:
+ case Hexagon::L2_ploadrbtnew_io:
+ case Hexagon::L2_ploadrbfnew_io:
+ case Hexagon::L4_ploadrbt_rr:
+ case Hexagon::L4_ploadrbf_rr:
+ case Hexagon::L4_ploadrbtnew_rr:
+ case Hexagon::L4_ploadrbfnew_rr:
+ case Hexagon::L2_ploadrbtnew_pi:
+ case Hexagon::L2_ploadrbfnew_pi:
+ case Hexagon::L4_ploadrbt_abs:
+ case Hexagon::L4_ploadrbf_abs:
+ case Hexagon::L4_ploadrbtnew_abs:
+ case Hexagon::L4_ploadrbfnew_abs:
+ case Hexagon::L2_loadrbgp:
+ // Half
+ case Hexagon::L2_loadrh_io:
+ case Hexagon::L4_loadrh_ur:
+ case Hexagon::L4_loadrh_ap:
+ case Hexagon::L2_loadrh_pr:
+ case Hexagon::L2_loadrh_pbr:
+ case Hexagon::L2_loadrh_pi:
+ case Hexagon::L2_loadrh_pci:
+ case Hexagon::L2_loadrh_pcr:
+ case Hexagon::L4_loadrh_rr:
+ case Hexagon::L2_ploadrht_io:
+ case Hexagon::L2_ploadrht_pi:
+ case Hexagon::L2_ploadrhf_io:
+ case Hexagon::L2_ploadrhf_pi:
+ case Hexagon::L2_ploadrhtnew_io:
+ case Hexagon::L2_ploadrhfnew_io:
+ case Hexagon::L4_ploadrht_rr:
+ case Hexagon::L4_ploadrhf_rr:
+ case Hexagon::L4_ploadrhtnew_rr:
+ case Hexagon::L4_ploadrhfnew_rr:
+ case Hexagon::L2_ploadrhtnew_pi:
+ case Hexagon::L2_ploadrhfnew_pi:
+ case Hexagon::L4_ploadrht_abs:
+ case Hexagon::L4_ploadrhf_abs:
+ case Hexagon::L4_ploadrhtnew_abs:
+ case Hexagon::L4_ploadrhfnew_abs:
+ case Hexagon::L2_loadrhgp:
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+// Tests the Solo field in the TSFlags of MI's descriptor.
+bool HexagonInstrInfo::isSolo(const MachineInstr &MI) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const uint64_t SoloField = (Flags >> HexagonII::SoloPos) & HexagonII::SoloMask;
+  return SoloField != 0;
+}
+
+
+// Returns true iff MI's opcode is STriw_pred or LDriw_pred.
+bool HexagonInstrInfo::isSpillPredRegOp(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  return Opc == Hexagon::STriw_pred || Opc == Hexagon::LDriw_pred;
+}
+
+
+// Returns true if MI is a branch that has at least one global-address or
+// external-symbol operand.
+bool HexagonInstrInfo::isTailCall(const MachineInstr &MI) const {
+  if (!MI.isBranch())
+    return false;
+
+  for (auto OpIt = MI.operands_begin(), OpEnd = MI.operands_end();
+       OpIt != OpEnd; ++OpIt) {
+    if (OpIt->isGlobal())
+      return true;
+    if (OpIt->isSymbol())
+      return true;
+  }
+  return false;
+}
+
+
+// Returns true when MI has a timing class TC1, i.e. its scheduling class is
+// one of the *_tc_1_* classes listed below.
+bool HexagonInstrInfo::isTC1(const MachineInstr &MI) const {
+  switch (MI.getDesc().getSchedClass()) {
+  default:
+    return false;
+
+  case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123:
+  case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123:
+  case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_1_SLOT23:
+  case Hexagon::Sched::EXTENDER_tc_1_SLOT0123:
+  //case Hexagon::Sched::M_tc_1_SLOT23:
+  case Hexagon::Sched::S_2op_tc_1_SLOT23:
+  case Hexagon::Sched::S_3op_tc_1_SLOT23:
+    return true;
+  }
+}
+
+
+// Returns true when MI's scheduling class is one of the *_tc_2_* classes
+// listed below.
+bool HexagonInstrInfo::isTC2(const MachineInstr &MI) const {
+  switch (MI.getDesc().getSchedClass()) {
+  default:
+    return false;
+
+  case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_2_SLOT23:
+  case Hexagon::Sched::CR_tc_2_SLOT3:
+  case Hexagon::Sched::M_tc_2_SLOT23:
+  case Hexagon::Sched::S_2op_tc_2_SLOT23:
+  case Hexagon::Sched::S_3op_tc_2_SLOT23:
+    return true;
+  }
+}
+
+
+// Returns true when MI's scheduling class is one of the *_tc_2early_*
+// classes listed below.
+bool HexagonInstrInfo::isTC2Early(const MachineInstr &MI) const {
+  switch (MI.getDesc().getSchedClass()) {
+  default:
+    return false;
+
+  case Hexagon::Sched::ALU32_2op_tc_2early_SLOT0123:
+  case Hexagon::Sched::ALU32_3op_tc_2early_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_2early_SLOT23:
+  case Hexagon::Sched::CR_tc_2early_SLOT23:
+  case Hexagon::Sched::CR_tc_2early_SLOT3:
+  case Hexagon::Sched::J_tc_2early_SLOT0123:
+  case Hexagon::Sched::J_tc_2early_SLOT2:
+  case Hexagon::Sched::J_tc_2early_SLOT23:
+  case Hexagon::Sched::S_2op_tc_2early_SLOT23:
+  case Hexagon::Sched::S_3op_tc_2early_SLOT23:
+    return true;
+  }
+}
+
+
+// Returns true iff MI's scheduling class is M_tc_3or4x_SLOT23.
+bool HexagonInstrInfo::isTC4x(const MachineInstr &MI) const {
+  return MI.getDesc().getSchedClass() == Hexagon::Sched::M_tc_3or4x_SLOT23;
+}
+
+
+// Schedule this ASAP.
+// Returns true in two situations:
+//  - MI1 may become a .cur load and MI2 has a register operand equal to
+//    MI1's operand 0;
+//  - MI2 may become a new-value store, is V6_vS32b_pi, and its operand 3 is
+//    the same register as MI1's operand 0.
+bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr &MI1,
+      const MachineInstr &MI2) const {
+  if (mayBeCurLoad(MI1)) {
+    // Is the register in MI1's operand 0 mentioned anywhere in MI2?
+    const unsigned CurDst = MI1.getOperand(0).getReg();
+    for (const MachineOperand &Op : MI2.operands())
+      if (Op.isReg() && CurDst == Op.getReg())
+        return true;
+  }
+
+  if (!mayBeNewStore(MI2) || MI2.getOpcode() != Hexagon::V6_vS32b_pi)
+    return false;
+
+  return MI1.getOperand(0).isReg() && MI2.getOperand(3).isReg() &&
+         MI1.getOperand(0).getReg() == MI2.getOperand(3).getReg();
+}
+
+
+// Returns true when MI's instruction type lies in the inclusive range
+// [TypeCVI_FIRST, TypeCVI_LAST].
+bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr &MI) const {
+  const uint64_t Type = getType(MI);
+  if (Type < HexagonII::TypeCVI_FIRST)
+    return false;
+  return Type <= HexagonII::TypeCVI_LAST;
+}
+
+
+// Check if the Offset is a valid auto-inc imm by Load/Store Type.
+//
+// For each supported value type the offset has to lie inside that type's
+// [MIN, MAX] auto-increment window and (except for i8) be a multiple of the
+// access size. Any other value type is a programming error.
+bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, const int Offset) const {
+  // Range check plus "low bits clear" alignment check. A mask of 0 disables
+  // the alignment requirement.
+  auto Fits = [Offset](int Min, int Max, int LowBitsMask) {
+    return Offset >= Min && Offset <= Max && (Offset & LowBitsMask) == 0;
+  };
+
+  const bool IsVec64B = VT == MVT::v16i32 || VT == MVT::v8i64 ||
+                        VT == MVT::v32i16 || VT == MVT::v64i8;
+  if (IsVec64B)
+    return Fits(Hexagon_MEMV_AUTOINC_MIN, Hexagon_MEMV_AUTOINC_MAX, 0x3f);
+
+  // 128B
+  const bool IsVec128B = VT == MVT::v32i32 || VT == MVT::v16i64 ||
+                         VT == MVT::v64i16 || VT == MVT::v128i8;
+  if (IsVec128B)
+    return Fits(Hexagon_MEMV_AUTOINC_MIN_128B, Hexagon_MEMV_AUTOINC_MAX_128B,
+                0x7f);
+
+  if (VT == MVT::i64)
+    return Fits(Hexagon_MEMD_AUTOINC_MIN, Hexagon_MEMD_AUTOINC_MAX, 0x7);
+  if (VT == MVT::i32)
+    return Fits(Hexagon_MEMW_AUTOINC_MIN, Hexagon_MEMW_AUTOINC_MAX, 0x3);
+  if (VT == MVT::i16)
+    return Fits(Hexagon_MEMH_AUTOINC_MIN, Hexagon_MEMH_AUTOINC_MAX, 0x1);
+  if (VT == MVT::i8)
+    return Fits(Hexagon_MEMB_AUTOINC_MIN, Hexagon_MEMB_AUTOINC_MAX, 0x0);
+
+  llvm_unreachable("Not an auto-inc opc!");
+}
+
+
+// Returns true if Offset is within the range supported by Opcode.
+//
+// The check runs in two stages:
+//  1. Opcodes in the first switch are range-checked unconditionally, i.e.
+//     even when Extend is true.
+//  2. For all remaining opcodes, Extend == true short-circuits to true;
+//     otherwise the second switch applies the per-opcode range.
+// An opcode that appears in neither switch hits llvm_unreachable.
+bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
+ bool Extend) const {
+ // This function is to check whether the "Offset" is in the correct range of
+ // the given "Opcode". If "Offset" is not in the correct range, "A2_addi" is
+ // inserted to calculate the final address. Due to this reason, the function
+ // assumes that the "Offset" has correct alignment.
+ // We used to assert if the offset was not properly aligned, however,
+ // there are cases where a misaligned pointer recast can cause this
+ // problem, and we need to allow for it. The front end warns of such
+ // misaligns with respect to load size.
+
+ // Stage 1: opcodes whose range is enforced regardless of Extend.
+ switch (Opcode) {
+ case Hexagon::PS_vstorerq_ai:
+ case Hexagon::PS_vstorerw_ai:
+ case Hexagon::PS_vloadrq_ai:
+ case Hexagon::PS_vloadrw_ai:
+ case Hexagon::V6_vL32b_ai:
+ case Hexagon::V6_vS32b_ai:
+ case Hexagon::V6_vL32Ub_ai:
+ case Hexagon::V6_vS32Ub_ai:
+ return (Offset >= Hexagon_MEMV_OFFSET_MIN) &&
+ (Offset <= Hexagon_MEMV_OFFSET_MAX);
+
+ case Hexagon::PS_vstorerq_ai_128B:
+ case Hexagon::PS_vstorerw_ai_128B:
+ case Hexagon::PS_vloadrq_ai_128B:
+ case Hexagon::PS_vloadrw_ai_128B:
+ case Hexagon::V6_vL32b_ai_128B:
+ case Hexagon::V6_vS32b_ai_128B:
+ case Hexagon::V6_vL32Ub_ai_128B:
+ case Hexagon::V6_vS32Ub_ai_128B:
+ return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) &&
+ (Offset <= Hexagon_MEMV_OFFSET_MAX_128B);
+
+ case Hexagon::J2_loop0i:
+ case Hexagon::J2_loop1i:
+ return isUInt<10>(Offset);
+
+ case Hexagon::S4_storeirb_io:
+ case Hexagon::S4_storeirbt_io:
+ case Hexagon::S4_storeirbf_io:
+ return isUInt<6>(Offset);
+
+ case Hexagon::S4_storeirh_io:
+ case Hexagon::S4_storeirht_io:
+ case Hexagon::S4_storeirhf_io:
+ return isShiftedUInt<6,1>(Offset);
+
+ case Hexagon::S4_storeiri_io:
+ case Hexagon::S4_storeirit_io:
+ case Hexagon::S4_storeirif_io:
+ return isShiftedUInt<6,2>(Offset);
+ }
+
+ // Stage 2: with Extend set, any offset is accepted for the remaining
+ // opcodes (including opcodes the switch below does not list).
+ if (Extend)
+ return true;
+
+ switch (Opcode) {
+ case Hexagon::L2_loadri_io:
+ case Hexagon::S2_storeri_io:
+ return (Offset >= Hexagon_MEMW_OFFSET_MIN) &&
+ (Offset <= Hexagon_MEMW_OFFSET_MAX);
+
+ case Hexagon::L2_loadrd_io:
+ case Hexagon::S2_storerd_io:
+ return (Offset >= Hexagon_MEMD_OFFSET_MIN) &&
+ (Offset <= Hexagon_MEMD_OFFSET_MAX);
+
+ case Hexagon::L2_loadrh_io:
+ case Hexagon::L2_loadruh_io:
+ case Hexagon::S2_storerh_io:
+ return (Offset >= Hexagon_MEMH_OFFSET_MIN) &&
+ (Offset <= Hexagon_MEMH_OFFSET_MAX);
+
+ case Hexagon::L2_loadrb_io:
+ case Hexagon::L2_loadrub_io:
+ case Hexagon::S2_storerb_io:
+ return (Offset >= Hexagon_MEMB_OFFSET_MIN) &&
+ (Offset <= Hexagon_MEMB_OFFSET_MAX);
+
+ case Hexagon::A2_addi:
+ return (Offset >= Hexagon_ADDI_OFFSET_MIN) &&
+ (Offset <= Hexagon_ADDI_OFFSET_MAX);
+
+ // Memory-operation opcodes: non-negative offsets only, with a
+ // width-specific upper bound.
+ case Hexagon::L4_iadd_memopw_io :
+ case Hexagon::L4_isub_memopw_io :
+ case Hexagon::L4_add_memopw_io :
+ case Hexagon::L4_sub_memopw_io :
+ case Hexagon::L4_and_memopw_io :
+ case Hexagon::L4_or_memopw_io :
+ return (0 <= Offset && Offset <= 255);
+
+ case Hexagon::L4_iadd_memoph_io :
+ case Hexagon::L4_isub_memoph_io :
+ case Hexagon::L4_add_memoph_io :
+ case Hexagon::L4_sub_memoph_io :
+ case Hexagon::L4_and_memoph_io :
+ case Hexagon::L4_or_memoph_io :
+ return (0 <= Offset && Offset <= 127);
+
+ case Hexagon::L4_iadd_memopb_io :
+ case Hexagon::L4_isub_memopb_io :
+ case Hexagon::L4_add_memopb_io :
+ case Hexagon::L4_sub_memopb_io :
+ case Hexagon::L4_and_memopb_io :
+ case Hexagon::L4_or_memopb_io :
+ return (0 <= Offset && Offset <= 63);
+
+ // LDriw_xxx and STriw_xxx are pseudo operations, so it has to take offset of
+ // any size. Later pass knows how to handle it.
+ case Hexagon::STriw_pred:
+ case Hexagon::LDriw_pred:
+ case Hexagon::STriw_mod:
+ case Hexagon::LDriw_mod:
+ return true;
+
+ // Any offset is accepted for these as well.
+ case Hexagon::PS_fi:
+ case Hexagon::PS_fia:
+ case Hexagon::INLINEASM:
+ return true;
+
+ // Predicated base+offset loads/stores: unsigned 6-bit field, shifted left by
+ // 0/1/2/3 bits for the b/h/i/d variants respectively.
+ case Hexagon::L2_ploadrbt_io:
+ case Hexagon::L2_ploadrbf_io:
+ case Hexagon::L2_ploadrubt_io:
+ case Hexagon::L2_ploadrubf_io:
+ case Hexagon::S2_pstorerbt_io:
+ case Hexagon::S2_pstorerbf_io:
+ return isUInt<6>(Offset);
+
+ case Hexagon::L2_ploadrht_io:
+ case Hexagon::L2_ploadrhf_io:
+ case Hexagon::L2_ploadruht_io:
+ case Hexagon::L2_ploadruhf_io:
+ case Hexagon::S2_pstorerht_io:
+ case Hexagon::S2_pstorerhf_io:
+ return isShiftedUInt<6,1>(Offset);
+
+ case Hexagon::L2_ploadrit_io:
+ case Hexagon::L2_ploadrif_io:
+ case Hexagon::S2_pstorerit_io:
+ case Hexagon::S2_pstorerif_io:
+ return isShiftedUInt<6,2>(Offset);
+
+ case Hexagon::L2_ploadrdt_io:
+ case Hexagon::L2_ploadrdf_io:
+ case Hexagon::S2_pstorerdt_io:
+ case Hexagon::S2_pstorerdf_io:
+ return isShiftedUInt<6,3>(Offset);
+ } // switch
+
+ // Reached only for a non-extended opcode that neither switch knows about.
+ llvm_unreachable("No offset range is defined for this opcode. "
+ "Please define it in the above switch statement!");
+}
+
+
+// Returns true if MI is a V60 vector instruction that is also an accumulator.
+bool HexagonInstrInfo::isVecAcc(const MachineInstr &MI) const {
+  if (!isV60VectorInstruction(MI))
+    return false;
+  return isAccumulator(MI);
+}
+
+
+// Returns true iff the Type field in the TSFlags of MI's opcode is
+// TypeCVI_VA or TypeCVI_VA_DV.
+bool HexagonInstrInfo::isVecALU(const MachineInstr &MI) const {
+  const uint64_t Flags = get(MI.getOpcode()).TSFlags;
+  const uint64_t Type = (Flags >> HexagonII::TypePos) & HexagonII::TypeMask;
+  if (Type == HexagonII::TypeCVI_VA)
+    return true;
+  return Type == HexagonII::TypeCVI_VA_DV;
+}
+
+
+// Returns true when any of the following holds:
+//  - accumulator forwarding is enabled and both instructions are vector
+//    accumulators;
+//  - ALU forwarding is enabled and ConsMI is a vector ALU or a late-source
+//    instruction;
+//  - ConsMI may become a new-value store.
+bool HexagonInstrInfo::isVecUsableNextPacket(const MachineInstr &ProdMI,
+      const MachineInstr &ConsMI) const {
+  const bool AccForwarded =
+      EnableACCForwarding && isVecAcc(ProdMI) && isVecAcc(ConsMI);
+  if (AccForwarded)
+    return true;
+
+  const bool ALUForwarded =
+      EnableALUForwarding && (isVecALU(ConsMI) || isLateSourceInstr(ConsMI));
+  if (ALUForwarded)
+    return true;
+
+  return mayBeNewStore(ConsMI);
+}
+
+// Returns true iff MI's opcode is one of the "ub"/"uh"/"bz" byte or halfword
+// load opcodes enumerated below; every other opcode yields false. The list
+// mirrors the one in isSignExtendingLoad(). Pure opcode lookup; no operands
+// are inspected.
+bool HexagonInstrInfo::isZeroExtendingLoad(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ // Byte
+ case Hexagon::L2_loadrub_io:
+ case Hexagon::L4_loadrub_ur:
+ case Hexagon::L4_loadrub_ap:
+ case Hexagon::L2_loadrub_pr:
+ case Hexagon::L2_loadrub_pbr:
+ case Hexagon::L2_loadrub_pi:
+ case Hexagon::L2_loadrub_pci:
+ case Hexagon::L2_loadrub_pcr:
+ case Hexagon::L2_loadbzw2_io:
+ case Hexagon::L4_loadbzw2_ur:
+ case Hexagon::L4_loadbzw2_ap:
+ case Hexagon::L2_loadbzw2_pr:
+ case Hexagon::L2_loadbzw2_pbr:
+ case Hexagon::L2_loadbzw2_pi:
+ case Hexagon::L2_loadbzw2_pci:
+ case Hexagon::L2_loadbzw2_pcr:
+ case Hexagon::L2_loadbzw4_io:
+ case Hexagon::L4_loadbzw4_ur:
+ case Hexagon::L4_loadbzw4_ap:
+ case Hexagon::L2_loadbzw4_pr:
+ case Hexagon::L2_loadbzw4_pbr:
+ case Hexagon::L2_loadbzw4_pi:
+ case Hexagon::L2_loadbzw4_pci:
+ case Hexagon::L2_loadbzw4_pcr:
+ case Hexagon::L4_loadrub_rr:
+ case Hexagon::L2_ploadrubt_io:
+ case Hexagon::L2_ploadrubt_pi:
+ case Hexagon::L2_ploadrubf_io:
+ case Hexagon::L2_ploadrubf_pi:
+ case Hexagon::L2_ploadrubtnew_io:
+ case Hexagon::L2_ploadrubfnew_io:
+ case Hexagon::L4_ploadrubt_rr:
+ case Hexagon::L4_ploadrubf_rr:
+ case Hexagon::L4_ploadrubtnew_rr:
+ case Hexagon::L4_ploadrubfnew_rr:
+ case Hexagon::L2_ploadrubtnew_pi:
+ case Hexagon::L2_ploadrubfnew_pi:
+ case Hexagon::L4_ploadrubt_abs:
+ case Hexagon::L4_ploadrubf_abs:
+ case Hexagon::L4_ploadrubtnew_abs:
+ case Hexagon::L4_ploadrubfnew_abs:
+ case Hexagon::L2_loadrubgp:
+ // Half
+ case Hexagon::L2_loadruh_io:
+ case Hexagon::L4_loadruh_ur:
+ case Hexagon::L4_loadruh_ap:
+ case Hexagon::L2_loadruh_pr:
+ case Hexagon::L2_loadruh_pbr:
+ case Hexagon::L2_loadruh_pi:
+ case Hexagon::L2_loadruh_pci:
+ case Hexagon::L2_loadruh_pcr:
+ case Hexagon::L4_loadruh_rr:
+ case Hexagon::L2_ploadruht_io:
+ case Hexagon::L2_ploadruht_pi:
+ case Hexagon::L2_ploadruhf_io:
+ case Hexagon::L2_ploadruhf_pi:
+ case Hexagon::L2_ploadruhtnew_io:
+ case Hexagon::L2_ploadruhfnew_io:
+ case Hexagon::L4_ploadruht_rr:
+ case Hexagon::L4_ploadruhf_rr:
+ case Hexagon::L4_ploadruhtnew_rr:
+ case Hexagon::L4_ploadruhfnew_rr:
+ case Hexagon::L2_ploadruhtnew_pi:
+ case Hexagon::L2_ploadruhfnew_pi:
+ case Hexagon::L4_ploadruht_abs:
+ case Hexagon::L4_ploadruhf_abs:
+ case Hexagon::L4_ploadruhtnew_abs:
+ case Hexagon::L4_ploadruhfnew_abs:
+ case Hexagon::L2_loadruhgp:
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+// Add latency to instruction.
+// Returns true when both instructions are V60 vector instructions and the
+// result of MI1 is not usable by MI2 in the next packet.
+bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
+      const MachineInstr &MI2) const {
+  const bool BothVector =
+      isV60VectorInstruction(MI1) && isV60VectorInstruction(MI2);
+  return BothVector && !isVecUsableNextPacket(MI1, MI2);
+}
+
+
+/// \brief Get the base register and byte offset of a load/store instr.
+///
+/// Thin wrapper around getBaseAndOffset(): the access size it reports is
+/// discarded and the int offset is widened to int64_t. Returns true when a
+/// non-zero base register was found. \p TRI is not used.
+bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
+      unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI)
+      const {
+  unsigned IgnoredAccessSize = 0;
+  int NarrowOffset = 0;
+  BaseReg = getBaseAndOffset(LdSt, NarrowOffset, IgnoredAccessSize);
+  Offset = NarrowOffset;
+  return BaseReg != 0;
+}
+
+
+/// \brief Can these instructions execute at the same time in a bundle.
+///
+/// Returns true only when new-value scheduling is not disabled, \p Second may
+/// become a new-value store, and \p First defines the register found in the
+/// last operand of \p Second.
+bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr &First,
+      const MachineInstr &Second) const {
+  if (DisableNVSchedule)
+    return false;
+  if (!mayBeNewStore(Second))
+    return false;
+
+  // Make sure the definition of the first instruction is the value being
+  // stored.
+  const MachineOperand &StoredVal =
+      Second.getOperand(Second.getNumOperands() - 1);
+  if (!StoredVal.isReg())
+    return false;
+
+  for (const MachineOperand &Candidate : First.operands())
+    if (Candidate.isReg() && Candidate.isDef() &&
+        Candidate.getReg() == StoredVal.getReg())
+      return true;
+
+  return false;
+}
+
+
+ // Returns true when CallMI's opcode is PS_call_nr or PS_callr_nr.
+ bool HexagonInstrInfo::doesNotReturn(const MachineInstr &CallMI) const {
+   switch (CallMI.getOpcode()) {
+   case Hexagon::PS_call_nr:
+   case Hexagon::PS_callr_nr:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+
+ // Returns true when any instruction in block B is an EH label.
+ bool HexagonInstrInfo::hasEHLabel(const MachineBasicBlock *B) const {
+   for (auto It = B->begin(), End = B->end(); It != End; ++It) {
+     if (It->isEHLabel())
+       return true;
+   }
+   return false;
+ }
+
+
+ // Returns true if an instruction can be converted into a non-extended
+ // equivalent instruction.
+ bool HexagonInstrInfo::hasNonExtEquivalent(const MachineInstr &MI) const {
+   // A register form that uses a register in place of the extended operand
+   // counts as the non-extended form.
+   if (Hexagon::getRegForm(MI.getOpcode()) >= 0)
+     return true;
+
+   // Beyond that, only loads and stores have a candidate equivalent.
+   if (!MI.getDesc().mayLoad() && !MI.getDesc().mayStore())
+     return false;
+
+   // Pick the candidate opcode according to the addressing mode.
+   short Equivalent;
+   switch (getAddrMode(MI)) {
+   case HexagonII::Absolute:
+     // Absolute addressing can be converted into base+offset mode.
+     Equivalent = Hexagon::getBaseWithImmOffset(MI.getOpcode());
+     break;
+   case HexagonII::BaseImmOffset:
+     // Base+offset can be converted into base+register offset; the left
+     // shift operand should then be set to 0.
+     Equivalent = Hexagon::getBaseWithRegOffset(MI.getOpcode());
+     break;
+   case HexagonII::BaseLongOffset:
+     Equivalent = Hexagon::getRegShlForm(MI.getOpcode());
+     break;
+   default:
+     return false;
+   }
+   // A negative value means the lookup found no opcode.
+   return Equivalent >= 0;
+ }
+
+
+ // Returns true when Hexagon::getRealHWInstr() yields a non-negative opcode
+ // for MI's opcode with InstrType_Pseudo.
+ bool HexagonInstrInfo::hasPseudoInstrPair(const MachineInstr &MI) const {
+   const auto Mapped =
+       Hexagon::getRealHWInstr(MI.getOpcode(), Hexagon::InstrType_Pseudo);
+   return Mapped >= 0;
+ }
+
+
+ // Returns true when some instruction between B's first terminator and the
+ // end of the block is a barrier.
+ bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B)
+       const {
+   for (MachineBasicBlock::const_iterator It = B->getFirstTerminator(),
+                                          End = B->end();
+        It != End; ++It) {
+     if (It->isBarrier())
+       return true;
+   }
+   return false;
+ }
+
+
+ // Returns true, if a LD insn can be promoted to a cur load.
+ // Both conditions must hold: the mayCVLoad field of the instruction's
+ // TSFlags is non-zero, and the subtarget reports hasV60TOps().
+ bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr &MI) const {
+   auto &Subtarget =
+       MI.getParent()->getParent()->getSubtarget<HexagonSubtarget>();
+   const uint64_t Flags = MI.getDesc().TSFlags;
+   if (((Flags >> HexagonII::mayCVLoadPos) & HexagonII::mayCVLoadMask) == 0)
+     return false;
+   return Subtarget.hasV60TOps();
+ }
+
+
+ // Returns true, if a ST insn can be promoted to a new-value store.
+ // The answer is read from the mayNVStore field of the instruction's TSFlags.
+ bool HexagonInstrInfo::mayBeNewStore(const MachineInstr &MI) const {
+   const uint64_t Flags = MI.getDesc().TSFlags;
+   const uint64_t Field =
+       (Flags >> HexagonII::mayNVStorePos) & HexagonII::mayNVStoreMask;
+   return Field != 0;
+ }
+
+
+ // Returns true when ConsMI stalls on ProdMI. That requires all of:
+ //  - ProdMI is a V60 vector instruction,
+ //  - ProdMI and ConsMI are dependent,
+ //  - isVecUsableNextPacket(ProdMI, ConsMI) is false (with forward
+ //    scheduling there is no stall when they are in consecutive packets).
+ bool HexagonInstrInfo::producesStall(const MachineInstr &ProdMI,
+       const MachineInstr &ConsMI) const {
+   if (!isV60VectorInstruction(ProdMI) || !isDependent(ProdMI, ConsMI))
+     return false;
+   return !isVecUsableNextPacket(ProdMI, ConsMI);
+ }
+
+
+ // Returns true when MI stalls on the instruction, or on any instruction of
+ // the bundle, that BII points at.
+ bool HexagonInstrInfo::producesStall(const MachineInstr &MI,
+       MachineBasicBlock::const_instr_iterator BII) const {
+   // There is no stall when MI is not a V60 vector.
+   if (!isV60VectorInstruction(MI))
+     return false;
+
+   MachineBasicBlock::const_instr_iterator It = BII;
+   const MachineBasicBlock::const_instr_iterator End =
+       It->getParent()->instr_end();
+
+   // BII refers to a single, unbundled instruction: it stalls MI only if it
+   // is a V60 vector whose result is not usable in the next packet.
+   if (!It->isBundle()) {
+     const MachineInstr &Prev = *It;
+     return isV60VectorInstruction(Prev) && !isVecUsableNextPacket(Prev, MI);
+   }
+
+   // BII is a bundle header: test every instruction inside the bundle.
+   ++It;
+   while (It != End && It->isInsideBundle()) {
+     if (producesStall(*It, MI))
+       return true;
+     ++It;
+   }
+   return false;
+ }
+
+
+ // Returns true when the predicate register PredReg produced by MI can be
+ // consumed in its .new form.
+ bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
+       unsigned PredReg) const {
+   // Predicate register must be explicitly defined: an implicit def of
+   // PredReg disqualifies MI.
+   for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) {
+     const MachineOperand &Op = MI.getOperand(Idx);
+     if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
+       continue;
+     if (Op.getReg() == PredReg)
+       return false;
+   }
+
+   // Hexagon Programmer's Reference says that decbin, memw_locked, and
+   // memd_locked cannot be used as .new as well,
+   // but we don't seem to have these instructions defined.
+   const bool IsTlbMatch = MI.getOpcode() == Hexagon::A4_tlbmatch;
+   return !IsTlbMatch;
+ }
+
+
+ // Returns true when Opcode is one of the J2_jump{t,f}[new[pt]] opcodes.
+ bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const {
+   switch (Opcode) {
+   case Hexagon::J2_jumpt:
+   case Hexagon::J2_jumpf:
+   case Hexagon::J2_jumptnew:
+   case Hexagon::J2_jumpfnew:
+   case Hexagon::J2_jumptnewpt:
+   case Hexagon::J2_jumpfnewpt:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+
+ // Returns true when Cond is non-empty and the opcode stored in Cond[0] is
+ // predicated but not predicated-true.
+ bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const {
+   if (Cond.empty())
+     return false;
+   const auto Opc = Cond[0].getImm();
+   return isPredicated(Opc) && !isPredicatedTrue(Opc);
+ }
+
+
+ // Forwards MI's opcode to Hexagon::getAbsoluteForm() and returns the result.
+ short HexagonInstrInfo::getAbsoluteForm(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   return Hexagon::getAbsoluteForm(Opc);
+ }
+
+
+ // Extracts the addressing-mode field from the instruction's TSFlags.
+ unsigned HexagonInstrInfo::getAddrMode(const MachineInstr &MI) const {
+   const uint64_t Flags = MI.getDesc().TSFlags;
+   const uint64_t Mode =
+       (Flags >> HexagonII::AddrModePos) & HexagonII::AddrModeMask;
+   return Mode;
+ }
+
+
+ // Returns the base register in a memory access (load/store). The offset is
+ // returned in Offset and the access size is returned in AccessSize.
+ // Returns 0 when MI is not a base+offset, memop or post-increment access, or
+ // when the base/offset operand positions cannot be determined. On the first
+ // of these paths neither output is written; on the second only AccessSize is.
+ unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI,
+       int &Offset, unsigned &AccessSize) const {
+   // Return if it is not a base+offset type instruction or a MemOp.
+   const unsigned Mode = getAddrMode(MI);
+   const bool IsBaseOffsetMode = Mode == HexagonII::BaseImmOffset ||
+                                 Mode == HexagonII::BaseLongOffset;
+   if (!IsBaseOffsetMode && !isMemOp(MI) && !isPostIncrement(MI))
+     return 0;
+
+   // Since it is a memory access instruction, getMemAccessSize() should never
+   // return 0.
+   const auto SizeCode = getMemAccessSize(MI);
+   assert (SizeCode &&
+           "BaseImmOffset or BaseLongOffset or MemOp without accessSize");
+
+   // The encoded size is 1+log2(N), so N = 1 << (code - 1); N is presumably
+   // the access size in bytes (codes 1..4 give 1, 2, 4, 8) - TODO confirm.
+   AccessSize = (1U << (SizeCode - 1));
+
+   unsigned BaseIdx = 0, OffIdx = 0;
+   if (!getBaseAndOffsetPosition(MI, BaseIdx, OffIdx))
+     return 0;
+
+   // Post increment updates its EA after the mem access,
+   // so we need to treat its offset as zero.
+   Offset = isPostIncrement(MI) ? 0 : MI.getOperand(OffIdx).getImm();
+
+   return MI.getOperand(BaseIdx).getReg();
+ }
+
+
+ /// Return the position of the base and offset operands for this instruction.
+ ///
+ /// Returns false when MI is neither a memop, a store nor a load, or when the
+ /// operands at the computed positions are not a register (base) and an
+ /// immediate (offset). In the latter case BasePos and OffsetPos have already
+ /// been written.
+ bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr &MI,
+       unsigned &BasePos, unsigned &OffsetPos) const {
+   // Memops and stores carry base/offset in operands 0/1; loads in 1/2.
+   if (isMemOp(MI) || MI.mayStore()) {
+     BasePos = 0;
+     OffsetPos = 1;
+   } else if (MI.mayLoad()) {
+     BasePos = 1;
+     OffsetPos = 2;
+   } else {
+     return false;
+   }
+
+   // Predication and post-increment each shift both positions by one.
+   unsigned Skip = 0;
+   if (isPredicated(MI))
+     ++Skip;
+   if (isPostIncrement(MI))
+     ++Skip;
+   BasePos += Skip;
+   OffsetPos += Skip;
+
+   return MI.getOperand(BasePos).isReg() && MI.getOperand(OffsetPos).isImm();
+ }
+
+
+ // Collects up to two unpredicated terminators of MBB, scanning backwards
+ // from the end, so they appear in reverse order of their occurrence.
+ // e.g. jump_t t1 (i1)
+ // jump t2 (i2)
+ // Jumpers = {i2, i1}
+ // The result is empty when the block is empty, contains an EH_LABEL, holds
+ // only debug values, or its last non-debug instruction is not an
+ // unpredicated terminator.
+ SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs(
+ MachineBasicBlock& MBB) const {
+ SmallVector<MachineInstr*, 2> Jumpers;
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::instr_iterator I = MBB.instr_end();
+ if (I == MBB.instr_begin())
+ return Jumpers;
+
+ // A basic block may look like this:
+ //
+ // [ insn
+ // EH_LABEL
+ // insn
+ // insn
+ // insn
+ // EH_LABEL
+ // insn ]
+ //
+ // It has two succs but does not have a terminator
+ // Don't know how to handle it.
+ // Hence: give up (empty result) if any instruction is an EH_LABEL.
+ do {
+ --I;
+ if (I->isEHLabel())
+ return Jumpers;
+ } while (I != MBB.instr_begin());
+
+ // Restart from the last instruction of the block.
+ I = MBB.instr_end();
+ --I;
+
+ // Skip trailing debug values; a block made only of them has no branches.
+ while (I->isDebugValue()) {
+ if (I == MBB.instr_begin())
+ return Jumpers;
+ --I;
+ }
+ if (!isUnpredicatedTerminator(*I))
+ return Jumpers;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = &*I;
+ Jumpers.push_back(LastInst);
+ MachineInstr *SecondLastInst = nullptr;
+ // Find one more terminator if present.
+ // The scan starts at LastInst itself (skipped by the pointer comparison),
+ // ignores bundle headers, and walks back to the start of the block.
+ do {
+ if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(*I)) {
+ if (!SecondLastInst) {
+ SecondLastInst = &*I;
+ Jumpers.push_back(SecondLastInst);
+ } else // This is a third branch.
+ return Jumpers;
+ }
+ if (I == MBB.instr_begin())
+ break;
+ --I;
+ } while (true);
+ return Jumpers;
+ }
+
+
+ // Maps Opcode through Hexagon::getBaseWithLongOffset(); a negative Opcode
+ // is answered with -1 without consulting the table.
+ short HexagonInstrInfo::getBaseWithLongOffset(short Opcode) const {
+   return Opcode < 0 ? -1 : Hexagon::getBaseWithLongOffset(Opcode);
+ }
+
+
+ // Forwards MI's opcode to Hexagon::getBaseWithLongOffset().
+ short HexagonInstrInfo::getBaseWithLongOffset(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   return Hexagon::getBaseWithLongOffset(Opc);
+ }
+
+
+ // Forwards MI's opcode to Hexagon::getBaseWithRegOffset().
+ short HexagonInstrInfo::getBaseWithRegOffset(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   return Hexagon::getBaseWithRegOffset(Opc);
+ }
+
+
+ // Returns Operand Index for the constant extended instruction.
+ // The index is read from the ExtendableOp field of the TSFlags.
+ unsigned HexagonInstrInfo::getCExtOpNum(const MachineInstr &MI) const {
+   const uint64_t Flags = MI.getDesc().TSFlags;
+   const uint64_t OpIdx =
+       (Flags >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask;
+   return OpIdx;
+ }
+
+ // See if instruction could potentially be a compound candidate.
+ // If so, return its group (HCG_A, HCG_B or HCG_C). HCG_None otherwise.
+ HexagonII::CompoundGroup HexagonInstrInfo::getCompoundCandidateGroup(
+ const MachineInstr &MI) const {
+ unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
+
+ switch (MI.getOpcode()) {
+ default:
+ return HexagonII::HCG_None;
+ //
+ // Compound pairs.
+ // "p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2"
+ // "Rd16=#U6 ; jump #r9:2"
+ // "Rd16=Rs16 ; jump #r9:2"
+ //
+ // Register-register compares: destination must be P0 or P1 and both
+ // sources must be sub-instruction integer registers.
+ case Hexagon::C2_cmpeq:
+ case Hexagon::C2_cmpgt:
+ case Hexagon::C2_cmpgtu:
+ DstReg = MI.getOperand(0).getReg();
+ Src1Reg = MI.getOperand(1).getReg();
+ Src2Reg = MI.getOperand(2).getReg();
+ if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+ (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+ isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg))
+ return HexagonII::HCG_A;
+ break;
+ case Hexagon::C2_cmpeqi:
+ case Hexagon::C2_cmpgti:
+ case Hexagon::C2_cmpgtui:
+ // Pd = cmp.xx(Rs,#imm) with Pd being P0 or P1 and #imm either an
+ // unsigned 5-bit value or -1.
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+ (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+ isIntRegForSubInst(SrcReg) && MI.getOperand(2).isImm() &&
+ ((isUInt<5>(MI.getOperand(2).getImm())) ||
+ (MI.getOperand(2).getImm() == -1)))
+ return HexagonII::HCG_A;
+ break;
+ case Hexagon::A2_tfr:
+ // Rd = Rs
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg))
+ return HexagonII::HCG_A;
+ break;
+ case Hexagon::A2_tfrsi:
+ // Rd = #u6
+ // Do not test for #u6 size since the const is getting extended
+ // regardless and compound could be formed.
+ DstReg = MI.getOperand(0).getReg();
+ if (isIntRegForSubInst(DstReg))
+ return HexagonII::HCG_A;
+ break;
+ case Hexagon::S2_tstbit_i:
+ // Pd = tstbit(Rs,#0) with Pd being P0 or P1; only bit 0 is accepted.
+ DstReg = MI.getOperand(0).getReg();
+ Src1Reg = MI.getOperand(1).getReg();
+ if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+ (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+ MI.getOperand(2).isImm() &&
+ isIntRegForSubInst(Src1Reg) && (MI.getOperand(2).getImm() == 0))
+ return HexagonII::HCG_A;
+ break;
+ // The fact that .new form is used pretty much guarantees
+ // that predicate register will match. Nevertheless,
+ // there could be some false positives without additional
+ // checking.
+ case Hexagon::J2_jumptnew:
+ case Hexagon::J2_jumpfnew:
+ case Hexagon::J2_jumptnewpt:
+ case Hexagon::J2_jumpfnewpt:
+ Src1Reg = MI.getOperand(0).getReg();
+ if (Hexagon::PredRegsRegClass.contains(Src1Reg) &&
+ (Hexagon::P0 == Src1Reg || Hexagon::P1 == Src1Reg))
+ return HexagonII::HCG_B;
+ break;
+ // Transfer and jump:
+ // Rd=#U6 ; jump #r9:2
+ // Rd=Rs ; jump #r9:2
+ // Do not test for jump range here.
+ case Hexagon::J2_jump:
+ case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+ case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC:
+ return HexagonII::HCG_C;
+ // Not reached: the return above always leaves the function.
+ break;
+ }
+
+ // A recognized opcode whose operands did not satisfy the constraints.
+ return HexagonII::HCG_None;
+ }
+
+
+ // Returns -1 when there is no opcode found. Because the return type is
+ // unsigned, callers observe that value converted to unsigned.
+ // Only the pair (C2_cmpeqi, J2_jumptnew) on P0 or P1 is mapped, and only
+ // when GB reads the predicate register defined by GA.
+ unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr &GA,
+       const MachineInstr &GB) const {
+   assert(getCompoundCandidateGroup(GA) == HexagonII::HCG_A);
+   assert(getCompoundCandidateGroup(GB) == HexagonII::HCG_B);
+
+   const unsigned NoOpcode = -1;
+   const bool IsCmpEqImm = GA.getOpcode() == Hexagon::C2_cmpeqi;
+   if (!IsCmpEqImm || GB.getOpcode() != Hexagon::J2_jumptnew)
+     return NoOpcode;
+
+   const unsigned PredReg = GA.getOperand(0).getReg();
+   if (!GB.readsRegister(PredReg))
+     return NoOpcode;
+
+   switch (PredReg) {
+   case Hexagon::P0:
+     return Hexagon::J4_cmpeqi_tp0_jump_nt;
+   case Hexagon::P1:
+     return Hexagon::J4_cmpeqi_tp1_jump_nt;
+   default:
+     return NoOpcode;
+   }
+ }
+
+
+ // Returns the predicated form of Opc from Hexagon::getPredOpcode(), using
+ // the false sense when invertPredicate is set and the true sense otherwise.
+ // Reaching the end without a valid (non-negative) opcode is treated as
+ // unreachable.
+ int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const {
+   const enum Hexagon::PredSense Sense =
+       invertPredicate ? Hexagon::PredSense_false : Hexagon::PredSense_true;
+   const int Predicated = Hexagon::getPredOpcode(Opc, Sense);
+   if (Predicated < 0)
+     llvm_unreachable("Unexpected predicable instruction");
+   return Predicated; // Valid Conditional opcode/instruction
+ }
+
+
+ // Return the .cur opcode that corresponds to MI's opcode.
+ // Only the V6_vL32b _pi/_ai opcodes (and their 128B variants) are mapped;
+ // any other opcode is treated as unreachable.
+ int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   switch (Opc) {
+   case Hexagon::V6_vL32b_pi:
+     return Hexagon::V6_vL32b_cur_pi;
+   case Hexagon::V6_vL32b_ai:
+     return Hexagon::V6_vL32b_cur_ai;
+   // 128B
+   case Hexagon::V6_vL32b_pi_128B:
+     return Hexagon::V6_vL32b_cur_pi_128B;
+   case Hexagon::V6_vL32b_ai_128B:
+     return Hexagon::V6_vL32b_cur_ai_128B;
+   default:
+     llvm_unreachable("Unknown .cur type");
+   }
+   return 0;
+ }
+
+
+
+// The diagram below shows the steps involved in the conversion of a predicated
+// store instruction to its .new predicated new-value form.
+//
+// p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ]
+// ^ ^
+// / \ (not OK. it will cause new-value store to be
+// / X conditional on p0.new while R2 producer is
+// / \ on p0)
+// / \.
+// p.new store p.old NV store
+// [if(p0.new)memw(R0+#0)=R2] [if(p0)memw(R0+#0)=R2.new]
+// ^ ^
+// \ /
+// \ /
+// \ /
+// p.old store
+// [if (p0)memw(R0+#0)=R2]
+//
+//
+// The following set of instructions further explains the scenario where
+// conditional new-value store becomes invalid when promoted to .new predicate
+// form.
+//
+// { 1) if (p0) r0 = add(r1, r2)
+// 2) p0 = cmp.eq(r3, #0) }
+//
+// 3) if (p0) memb(r1+#0) = r0 --> this instruction can't be grouped with
+// the first two instructions because in instr 1, r0 is conditional on old value
+// of p0 but its use in instr 3 is conditional on p0 modified by instr 2 which
+// is not valid for new-value stores.
+// Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+// from the "Conditional Store" list. Because a predicated new value store
+// would NOT be promoted to a double dot new store. See diagram below:
+// This function returns yes for those stores that are predicated but not
+// yet promoted to predicate dot new instructions.
+//
+// +---------------------+
+// /-----| if (p0) memw(..)=r0 |---------\~
+// || +---------------------+ ||
+// promote || /\ /\ || promote
+// || /||\ /||\ ||
+// \||/ demote || \||/
+// \/ || || \/
+// +-------------------------+ || +-------------------------+
+// | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new |
+// +-------------------------+ || +-------------------------+
+// || || ||
+// || demote \||/
+// promote || \/ NOT possible
+// || || /\~
+// \||/ || /||\~
+// \/ || ||
+// +-----------------------------+
+// | if (p0.new) memw(..)=r0.new |
+// +-----------------------------+
+// Double Dot New Store
+//
+// Returns the most basic instruction for the .new predicated instructions and
+// new-value stores.
+// For example, all of the following instructions will be converted back to the
+// same instruction:
+// 1) if (p0.new) memw(R0+#0) = R1.new --->
+// 2) if (p0) memw(R0+#0)= R1.new -------> if (p0) memw(R0+#0) = R1
+// 3) if (p0.new) memw(R0+#0) = R1 --->
+//
+// To understand the translation of instruction 1 to its original form, consider
+// a packet with 3 instructions.
+// { p0 = cmp.eq(R0,R1)
+// if (p0.new) R2 = add(R3, R4)
+// R5 = add (R3, R1)
+// }
+// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet
+//
+// This instruction can be part of the previous packet only if both p0 and R2
+// are promoted to .new values. This promotion happens in steps, first
+// predicate register is promoted to .new and in the next iteration R2 is
+// promoted. Therefore, in case of dependence check failure (due to R5) during
+// next iteration, it should be converted back to its most basic form.
+
+
+ // Return the new value instruction for a given store.
+ // Hexagon::getNewValueOpcode() is consulted first; the switch below covers
+ // the opcodes handled explicitly here. Anything else is unreachable.
+ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   const int Mapped = Hexagon::getNewValueOpcode(Opc);
+   if (Mapped >= 0) // Valid new-value store instruction.
+     return Mapped;
+
+   switch (Opc) {
+   case Hexagon::S4_storerb_ur:
+     return Hexagon::S4_storerbnew_ur;
+
+   // NOTE(review): these _pci stores are mapped to themselves rather than to
+   // a distinct new-value opcode - confirm this is intended.
+   case Hexagon::S2_storerb_pci:
+   case Hexagon::S2_storeri_pci:
+   case Hexagon::S2_storerh_pci:
+   case Hexagon::S2_storerd_pci:
+   case Hexagon::S2_storerf_pci:
+     return Opc;
+
+   case Hexagon::V6_vS32b_ai:
+     return Hexagon::V6_vS32b_new_ai;
+   case Hexagon::V6_vS32b_pi:
+     return Hexagon::V6_vS32b_new_pi;
+
+   // 128B
+   case Hexagon::V6_vS32b_ai_128B:
+     return Hexagon::V6_vS32b_new_ai_128B;
+   case Hexagon::V6_vS32b_pi_128B:
+     return Hexagon::V6_vS32b_new_pi_128B;
+
+   default:
+     llvm_unreachable("Unknown .new type");
+   }
+   return 0;
+ }
+
+
+ // Returns the opcode to use when converting MI, which is a conditional jump,
+ // into a conditional instruction which uses the .new value of the predicate.
+ // We also use branch probabilities to add a hint to the jump: the "pt"
+ // variant is chosen when the edge to the branch target (operand 1) has a
+ // probability of at least 1/2. MBPI is dereferenced unconditionally.
+ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
+       const MachineBranchProbabilityInfo *MBPI) const {
+   // We assume that block can have at most two successors.
+   const MachineBasicBlock *From = MI.getParent();
+   const MachineBasicBlock *To = MI.getOperand(1).getMBB();
+   const BranchProbability EdgeProb = MBPI->getEdgeProbability(From, To);
+   const bool LikelyTaken = EdgeProb >= BranchProbability(1,2);
+
+   switch (MI.getOpcode()) {
+   case Hexagon::J2_jumpt:
+     if (LikelyTaken)
+       return Hexagon::J2_jumptnewpt;
+     return Hexagon::J2_jumptnew;
+   case Hexagon::J2_jumpf:
+     if (LikelyTaken)
+       return Hexagon::J2_jumpfnewpt;
+     return Hexagon::J2_jumpfnew;
+   default:
+     llvm_unreachable("Unexpected jump instruction.");
+   }
+ }
+
+
+ // Return .new predicate version for an instruction.
+ // Hexagon::getPredNewOpcode() is tried first; conditional jumps are then
+ // resolved through getDotNewPredJumpOp(). For any other opcode the function
+ // asserts and, when assertions are compiled out, returns 0.
+ int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI,
+       const MachineBranchProbabilityInfo *MBPI) const {
+   const int PredNew = Hexagon::getPredNewOpcode(MI.getOpcode());
+   if (PredNew >= 0) // Valid predicate new instruction
+     return PredNew;
+
+   // Conditional Jumps
+   const unsigned Opc = MI.getOpcode();
+   if (Opc == Hexagon::J2_jumpt || Opc == Hexagon::J2_jumpf)
+     return getDotNewPredJumpOp(MI, MBPI);
+
+   assert(0 && "Unknown .new type");
+   return 0;
+ }
+
+
+ // Converts opc back towards its basic form in two steps: a predicate-.new
+ // opcode is first replaced by its predicate-old form, then a new-value store
+ // is replaced by its non-new-value store. Opcodes that are neither are
+ // returned unchanged.
+ int HexagonInstrInfo::getDotOldOp(const int opc) const {
+   int OldOp = opc;
+
+   // Get predicate old form
+   const bool IsPredNew = isPredicated(OldOp) && isPredicatedNew(OldOp);
+   if (IsPredNew) {
+     OldOp = Hexagon::getPredOldOpcode(OldOp);
+     assert(OldOp >= 0 &&
+            "Couldn't change predicate new instruction to its old form.");
+   }
+
+   // Convert into non-new-value format
+   if (isNewValueStore(OldOp)) {
+     OldOp = Hexagon::getNonNVStore(OldOp);
+     assert(OldOp >= 0 && "Couldn't change new-value store to its old form.");
+   }
+
+   return OldOp;
+ }
+
+
+ // See if instruction could potentially be a duplex candidate.
+ // If so, return its sub-instruction group (HSIG_L1, HSIG_L2, HSIG_S1,
+ // HSIG_S2 or HSIG_A). HSIG_None otherwise, i.e. for unhandled opcodes and
+ // for handled opcodes whose operands do not meet the listed constraints.
+ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
+ const MachineInstr &MI) const {
+ unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
+ auto &HRI = getRegisterInfo();
+
+ switch (MI.getOpcode()) {
+ default:
+ return HexagonII::HSIG_None;
+ //
+ // Group L1:
+ //
+ // Rd = memw(Rs+#u4:2)
+ // Rd = memub(Rs+#u4:0)
+ case Hexagon::L2_loadri_io:
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ // Special case this one from Group L2.
+ // Rd = memw(r29+#u5:2)
+ if (isIntRegForSubInst(DstReg)) {
+ if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+ HRI.getStackRegister() == SrcReg &&
+ MI.getOperand(2).isImm() &&
+ isShiftedUInt<5,2>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_L2;
+ // Rd = memw(Rs+#u4:2)
+ if (isIntRegForSubInst(SrcReg) &&
+ (MI.getOperand(2).isImm() &&
+ isShiftedUInt<4,2>(MI.getOperand(2).getImm())))
+ return HexagonII::HSIG_L1;
+ }
+ break;
+ case Hexagon::L2_loadrub_io:
+ // Rd = memub(Rs+#u4:0)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+ MI.getOperand(2).isImm() && isUInt<4>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_L1;
+ break;
+ //
+ // Group L2:
+ //
+ // Rd = memh/memuh(Rs+#u3:1)
+ // Rd = memb(Rs+#u3:0)
+ // Rd = memw(r29+#u5:2) - Handled above.
+ // Rdd = memd(r29+#u5:3)
+ // deallocframe
+ // [if ([!]p0[.new])] dealloc_return
+ // [if ([!]p0[.new])] jumpr r31
+ case Hexagon::L2_loadrh_io:
+ case Hexagon::L2_loadruh_io:
+ // Rd = memh/memuh(Rs+#u3:1)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+ MI.getOperand(2).isImm() &&
+ isShiftedUInt<3,1>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_L2;
+ break;
+ case Hexagon::L2_loadrb_io:
+ // Rd = memb(Rs+#u3:0)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+ MI.getOperand(2).isImm() &&
+ isUInt<3>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_L2;
+ break;
+ case Hexagon::L2_loadrd_io:
+ // Rdd = memd(r29+#u5:3)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isDblRegForSubInst(DstReg, HRI) &&
+ Hexagon::IntRegsRegClass.contains(SrcReg) &&
+ HRI.getStackRegister() == SrcReg &&
+ MI.getOperand(2).isImm() &&
+ isShiftedUInt<5,3>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_L2;
+ break;
+ // dealloc_return is not documented in Hexagon Manual, but marked
+ // with A_SUBINSN attribute in iset_v4classic.py.
+ case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+ case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC:
+ case Hexagon::L4_return:
+ case Hexagon::L2_deallocframe:
+ return HexagonII::HSIG_L2;
+ case Hexagon::EH_RETURN_JMPR:
+ case Hexagon::PS_jmpret:
+ // jumpr r31
+ // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>.
+ DstReg = MI.getOperand(0).getReg();
+ if (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg))
+ return HexagonII::HSIG_L2;
+ break;
+ case Hexagon::PS_jmprett:
+ case Hexagon::PS_jmpretf:
+ case Hexagon::PS_jmprettnewpt:
+ case Hexagon::PS_jmpretfnewpt:
+ case Hexagon::PS_jmprettnew:
+ case Hexagon::PS_jmpretfnew:
+ // Operand 0 is the predicate, operand 1 the jump target register.
+ DstReg = MI.getOperand(1).getReg();
+ SrcReg = MI.getOperand(0).getReg();
+ // [if ([!]p0[.new])] jumpr r31
+ if ((Hexagon::PredRegsRegClass.contains(SrcReg) &&
+ (Hexagon::P0 == SrcReg)) &&
+ (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg)))
+ return HexagonII::HSIG_L2;
+ break;
+ case Hexagon::L4_return_t :
+ case Hexagon::L4_return_f :
+ case Hexagon::L4_return_tnew_pnt :
+ case Hexagon::L4_return_fnew_pnt :
+ case Hexagon::L4_return_tnew_pt :
+ case Hexagon::L4_return_fnew_pt :
+ // [if ([!]p0[.new])] dealloc_return
+ SrcReg = MI.getOperand(0).getReg();
+ if (Hexagon::PredRegsRegClass.contains(SrcReg) && (Hexagon::P0 == SrcReg))
+ return HexagonII::HSIG_L2;
+ break;
+ //
+ // Group S1:
+ //
+ // memw(Rs+#u4:2) = Rt
+ // memb(Rs+#u4:0) = Rt
+ case Hexagon::S2_storeri_io:
+ // Special case this one from Group S2.
+ // memw(r29+#u5:2) = Rt
+ // For stores, operand 0 is the base, operand 1 the offset and operand 2
+ // the stored register.
+ Src1Reg = MI.getOperand(0).getReg();
+ Src2Reg = MI.getOperand(2).getReg();
+ if (Hexagon::IntRegsRegClass.contains(Src1Reg) &&
+ isIntRegForSubInst(Src2Reg) &&
+ HRI.getStackRegister() == Src1Reg && MI.getOperand(1).isImm() &&
+ isShiftedUInt<5,2>(MI.getOperand(1).getImm()))
+ return HexagonII::HSIG_S2;
+ // memw(Rs+#u4:2) = Rt
+ if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) &&
+ MI.getOperand(1).isImm() &&
+ isShiftedUInt<4,2>(MI.getOperand(1).getImm()))
+ return HexagonII::HSIG_S1;
+ break;
+ case Hexagon::S2_storerb_io:
+ // memb(Rs+#u4:0) = Rt
+ Src1Reg = MI.getOperand(0).getReg();
+ Src2Reg = MI.getOperand(2).getReg();
+ if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) &&
+ MI.getOperand(1).isImm() && isUInt<4>(MI.getOperand(1).getImm()))
+ return HexagonII::HSIG_S1;
+ break;
+ //
+ // Group S2:
+ //
+ // memh(Rs+#u3:1) = Rt
+ // memw(r29+#u5:2) = Rt
+ // memd(r29+#s6:3) = Rtt
+ // memw(Rs+#u4:2) = #U1
+ // memb(Rs+#u4) = #U1
+ // allocframe(#u5:3)
+ case Hexagon::S2_storerh_io:
+ // memh(Rs+#u3:1) = Rt
+ // NOTE(review): this form is listed under Group S2 above, yet HSIG_S1 is
+ // returned - confirm whether that is intended.
+ Src1Reg = MI.getOperand(0).getReg();
+ Src2Reg = MI.getOperand(2).getReg();
+ if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) &&
+ MI.getOperand(1).isImm() &&
+ isShiftedUInt<3,1>(MI.getOperand(1).getImm()))
+ return HexagonII::HSIG_S1;
+ break;
+ case Hexagon::S2_storerd_io:
+ // memd(r29+#s6:3) = Rtt
+ Src1Reg = MI.getOperand(0).getReg();
+ Src2Reg = MI.getOperand(2).getReg();
+ if (isDblRegForSubInst(Src2Reg, HRI) &&
+ Hexagon::IntRegsRegClass.contains(Src1Reg) &&
+ HRI.getStackRegister() == Src1Reg && MI.getOperand(1).isImm() &&
+ isShiftedInt<6,3>(MI.getOperand(1).getImm()))
+ return HexagonII::HSIG_S2;
+ break;
+ case Hexagon::S4_storeiri_io:
+ // memw(Rs+#u4:2) = #U1
+ Src1Reg = MI.getOperand(0).getReg();
+ if (isIntRegForSubInst(Src1Reg) && MI.getOperand(1).isImm() &&
+ isShiftedUInt<4,2>(MI.getOperand(1).getImm()) &&
+ MI.getOperand(2).isImm() && isUInt<1>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_S2;
+ break;
+ case Hexagon::S4_storeirb_io:
+ // memb(Rs+#u4) = #U1
+ Src1Reg = MI.getOperand(0).getReg();
+ if (isIntRegForSubInst(Src1Reg) &&
+ MI.getOperand(1).isImm() && isUInt<4>(MI.getOperand(1).getImm()) &&
+ MI.getOperand(2).isImm() && isUInt<1>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_S2;
+ break;
+ case Hexagon::S2_allocframe:
+ // allocframe(#u5:3)
+ // NOTE(review): listed under Group S2 above, yet HSIG_S1 is returned -
+ // confirm whether that is intended.
+ if (MI.getOperand(0).isImm() &&
+ isShiftedUInt<5,3>(MI.getOperand(0).getImm()))
+ return HexagonII::HSIG_S1;
+ break;
+ //
+ // Group A:
+ //
+ // Rx = add(Rx,#s7)
+ // Rd = Rs
+ // Rd = #u6
+ // Rd = #-1
+ // if ([!]P0[.new]) Rd = #0
+ // Rd = add(r29,#u6:2)
+ // Rx = add(Rx,Rs)
+ // P0 = cmp.eq(Rs,#u2)
+ // Rdd = combine(#0,Rs)
+ // Rdd = combine(Rs,#0)
+ // Rdd = combine(#u2,#U2)
+ // Rd = add(Rs,#1)
+ // Rd = add(Rs,#-1)
+ // Rd = sxth/sxtb/zxtb/zxth(Rs)
+ // Rd = and(Rs,#1)
+ case Hexagon::A2_addi:
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg)) {
+ // Rd = add(r29,#u6:2)
+ if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+ HRI.getStackRegister() == SrcReg && MI.getOperand(2).isImm() &&
+ isShiftedUInt<6,2>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_A;
+ // Rx = add(Rx,#s7)
+ if ((DstReg == SrcReg) && MI.getOperand(2).isImm() &&
+ isInt<7>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_A;
+ // Rd = add(Rs,#1)
+ // Rd = add(Rs,#-1)
+ if (isIntRegForSubInst(SrcReg) && MI.getOperand(2).isImm() &&
+ ((MI.getOperand(2).getImm() == 1) ||
+ (MI.getOperand(2).getImm() == -1)))
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::A2_add:
+ // Rx = add(Rx,Rs)
+ DstReg = MI.getOperand(0).getReg();
+ Src1Reg = MI.getOperand(1).getReg();
+ Src2Reg = MI.getOperand(2).getReg();
+ if (isIntRegForSubInst(DstReg) && (DstReg == Src1Reg) &&
+ isIntRegForSubInst(Src2Reg))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_andir:
+ // Same as zxtb.
+ // Rd16=and(Rs16,#255)
+ // Rd16=and(Rs16,#1)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+ MI.getOperand(2).isImm() &&
+ ((MI.getOperand(2).getImm() == 1) ||
+ (MI.getOperand(2).getImm() == 255)))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_tfr:
+ // Rd = Rs
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_tfrsi:
+ // Rd = #u6
+ // Do not test for #u6 size since the const is getting extended
+ // regardless and compound could be formed.
+ // Rd = #-1
+ DstReg = MI.getOperand(0).getReg();
+ if (isIntRegForSubInst(DstReg))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::C2_cmoveit:
+ case Hexagon::C2_cmovenewit:
+ case Hexagon::C2_cmoveif:
+ case Hexagon::C2_cmovenewif:
+ // if ([!]P0[.new]) Rd = #0
+ // Actual form:
+ // %R16<def> = C2_cmovenewit %P0<internal>, 0, %R16<imp-use,undef>;
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) &&
+ Hexagon::PredRegsRegClass.contains(SrcReg) && Hexagon::P0 == SrcReg &&
+ MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0)
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::C2_cmpeqi:
+ // P0 = cmp.eq(Rs,#u2)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+ Hexagon::P0 == DstReg && isIntRegForSubInst(SrcReg) &&
+ MI.getOperand(2).isImm() && isUInt<2>(MI.getOperand(2).getImm()))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineii:
+ // Rdd = combine(#u2,#U2)
+ // Each source may be an immediate or a global whose offset fits in two
+ // unsigned bits.
+ DstReg = MI.getOperand(0).getReg();
+ if (isDblRegForSubInst(DstReg, HRI) &&
+ ((MI.getOperand(1).isImm() && isUInt<2>(MI.getOperand(1).getImm())) ||
+ (MI.getOperand(1).isGlobal() &&
+ isUInt<2>(MI.getOperand(1).getOffset()))) &&
+ ((MI.getOperand(2).isImm() && isUInt<2>(MI.getOperand(2).getImm())) ||
+ (MI.getOperand(2).isGlobal() &&
+ isUInt<2>(MI.getOperand(2).getOffset()))))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A4_combineri:
+ // Rdd = combine(Rs,#0)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) &&
+ ((MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) ||
+ (MI.getOperand(2).isGlobal() && MI.getOperand(2).getOffset() == 0)))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A4_combineir:
+ // Rdd = combine(#0,Rs)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(2).getReg();
+ if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) &&
+ ((MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) ||
+ (MI.getOperand(1).isGlobal() && MI.getOperand(1).getOffset() == 0)))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_sxtb:
+ case Hexagon::A2_sxth:
+ case Hexagon::A2_zxtb:
+ case Hexagon::A2_zxth:
+ // Rd = sxth/sxtb/zxtb/zxth(Rs)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg))
+ return HexagonII::HSIG_A;
+ break;
+ }
+
+ // A handled opcode whose operands did not satisfy the constraints.
+ return HexagonII::HSIG_None;
+ }
+
+
+ // Forwards MI's opcode to Hexagon::getRealHWInstr() with InstrType_Real and
+ // returns the result.
+ short HexagonInstrInfo::getEquivalentHWInstr(const MachineInstr &MI) const {
+   const unsigned Opc = MI.getOpcode();
+   return Hexagon::getRealHWInstr(Opc, Hexagon::InstrType_Real);
+ }
+
+
+// Return a pointer to the first instruction of BB that is not a debug value,
+// or nullptr when the block holds no such instruction.
+MachineInstr *HexagonInstrInfo::getFirstNonDbgInst(MachineBasicBlock *BB)
+      const {
+  auto It = BB->instr_begin();
+  const auto Last = BB->instr_end();
+  // Step over the leading run of debug values.
+  while (It != Last && It->isDebugValue())
+    ++It;
+  if (It == Last)
+    return nullptr;
+  return &*It;
+}
+
+
+// Latency of MI derived from its itinerary timing class:
+//  - with no itinerary data, defer to getInstrLatency;
+//  - transient instructions have latency 0;
+//  - otherwise take the stage latency of MI's scheduling class, capped at 1
+//    when timing-class latencies are disabled or when the subtarget is using
+//    BSB scheduling.
+unsigned HexagonInstrInfo::getInstrTimingClassLatency(
+      const InstrItineraryData *ItinData, const MachineInstr &MI) const {
+  // Default to one cycle for no itinerary. However, an "empty" itinerary may
+  // still have a MinLatency property, which getStageLatency checks.
+  if (!ItinData)
+    return getInstrLatency(ItinData, MI);
+
+  if (MI.isTransient())
+    return 0;
+
+  // Latency embedded in the itinerary for this scheduling class.
+  const unsigned StageLatency =
+      ItinData->getStageLatency(MI.getDesc().getSchedClass());
+
+  // Restrict the result to either 0 or 1 in the two situations above.
+  const bool CapAtOne =
+      !EnableTimingClassLatency ||
+      MI.getParent()->getParent()->getSubtarget<HexagonSubtarget>()
+          .useBSBScheduling();
+  return (CapAtOne && StageLatency > 1) ? 1 : StageLatency;
+}
+
+
+// Flip the predicate sense recorded in a condition list (p <-> NotP).
+// Cond[0] holds an opcode as an immediate; it is replaced in place by the
+// opcode returned from getInvertedPredicatedOpcode. Returns false, leaving
+// Cond untouched, when Cond is empty; returns true otherwise.
+bool HexagonInstrInfo::getInvertedPredSense(
+      SmallVectorImpl<MachineOperand> &Cond) const {
+  if (!Cond.empty()) {
+    MachineOperand &OpcOperand = Cond[0];
+    const unsigned Inverted = getInvertedPredicatedOpcode(OpcOperand.getImm());
+    OpcOperand.setImm(Inverted);
+    return true;
+  }
+  return false;
+}
+
+
+// Return the opcode with the opposite predicate sense of Opc: the
+// false-predicated form when Opc is predicated-true, the true-predicated form
+// otherwise. A negative lookup result is treated as unreachable.
+unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
+  int Inverted;
+  if (isPredicatedTrue(Opc))
+    Inverted = Hexagon::getFalsePredOpcode(Opc);
+  else
+    Inverted = Hexagon::getTruePredOpcode(Opc);
+
+  // Only a non-negative result names a valid inverted instruction.
+  if (Inverted < 0)
+    llvm_unreachable("Unexpected predicated instruction");
+  return Inverted;
+}
+
+
+// Returns the max value that doesn't need to be extended, computed from the
+// extent bit count and signedness encoded in the instruction's TSFlags.
+int HexagonInstrInfo::getMaxValue(const MachineInstr &MI) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const unsigned Signed =
+      (Flags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  const unsigned NumBits =
+      (Flags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  // A signed field gives up one bit to the sign.
+  const unsigned MagnitudeBits = Signed ? NumBits - 1 : NumBits;
+  return ~(-1U << MagnitudeBits);
+}
+
+
+// Extract the memory-access-size field from the instruction's TSFlags.
+unsigned HexagonInstrInfo::getMemAccessSize(const MachineInstr &MI) const {
+  const uint64_t Shifted = MI.getDesc().TSFlags >> HexagonII::MemAccessSizePos;
+  return Shifted & HexagonII::MemAccesSizeMask;
+}
+
+
+// Returns the min value that doesn't need to be extended, computed from the
+// extent bit count and signedness encoded in the instruction's TSFlags.
+// An unsigned field has a minimum of 0.
+int HexagonInstrInfo::getMinValue(const MachineInstr &MI) const {
+  const uint64_t Flags = MI.getDesc().TSFlags;
+  const unsigned Signed =
+      (Flags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  const unsigned NumBits =
+      (Flags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  if (!Signed)
+    return 0;
+  return -1U << (NumBits - 1);
+}
+
+
+// Returns opcode of the non-extended equivalent instruction, or -1 when no
+// such form is found.
+short HexagonInstrInfo::getNonExtOpcode(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+
+  // Prefer a register form, i.e. one that uses a register in place of the
+  // extended operand.
+  const short RegForm = Hexagon::getRegForm(Opc);
+  if (RegForm >= 0)
+    return RegForm;
+
+  // Beyond that, only loads and stores have a non-extended equivalent.
+  const auto &Desc = MI.getDesc();
+  if (!Desc.mayLoad() && !Desc.mayStore())
+    return -1;
+
+  // Pick the replacement according to the current addressing mode.
+  const unsigned Mode = getAddrMode(MI);
+  if (Mode == HexagonII::Absolute)
+    return Hexagon::getBaseWithImmOffset(Opc);
+  if (Mode == HexagonII::BaseImmOffset)
+    return Hexagon::getBaseWithRegOffset(Opc);
+  if (Mode == HexagonII::BaseLongOffset)
+    return Hexagon::getRegShlForm(Opc);
+  return -1;
+}
+
+
+// Extract the predicate register described by a branch condition list.
+// Returns false when Cond is empty, when Cond[0] names a new-value jump
+// opcode, or when Cond[1] is a basic block. Otherwise returns true with
+// PredReg set to the register in Cond[1], PredRegPos set to 1, and
+// PredRegFlags holding the Implicit/Undef state of that operand.
+bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond,
+      unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const {
+  if (Cond.empty())
+    return false;
+  assert(Cond.size() == 2);
+
+  if (isNewValueJump(Cond[0].getImm()) || Cond[1].isMBB()) {
+    DEBUG(dbgs() << "No predregs for new-value jumps/endloop");
+    return false;
+  }
+
+  const MachineOperand &PredOp = Cond[1];
+  PredReg = PredOp.getReg();
+  PredRegPos = 1;
+
+  // See IfConversion.cpp why we add RegState::Implicit | RegState::Undef
+  unsigned Flags = 0;
+  if (PredOp.isImplicit())
+    Flags = RegState::Implicit;
+  if (PredOp.isUndef())
+    Flags |= RegState::Undef;
+  PredRegFlags = Flags;
+  return true;
+}
+
+
+// Look up MI's opcode in the getRealHWInstr mapping, selecting the
+// InstrType_Pseudo entry, and return whatever opcode the mapping yields.
+short HexagonInstrInfo::getPseudoInstrPair(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  return Hexagon::getRealHWInstr(Opc, Hexagon::InstrType_Pseudo);
+}
+
+
+// MachineInstr convenience wrapper around the opcode-based
+// Hexagon::getRegForm lookup.
+short HexagonInstrInfo::getRegForm(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  return Hexagon::getRegForm(Opc);
+}
+
+
+// Return the number of bytes required to encode the instruction.
+// Hexagon instructions are fixed length, 4 bytes, unless they
+// use a constant extender, which requires another 4 bytes.
+// For debug values and position instructions (e.g. prolog labels), return 0.
+unsigned HexagonInstrInfo::getSize(const MachineInstr &MI) const {
+ if (MI.isDebugValue() || MI.isPosition())
+ return 0;
+
+ unsigned Size = MI.getDesc().getSize();
+ if (!Size)
+ // The descriptor reports no size: assume the default insn size
+ // (HEXAGON_INSTR_SIZE) in case it cannot be determined for whatever reason.
+ Size = HEXAGON_INSTR_SIZE;
+
+ if (isConstExtended(MI) || isExtended(MI))
+ Size += HEXAGON_INSTR_SIZE; // one more slot for the extender
+
+ // With BranchRelaxAsmLarge set, size INLINEASM from its asm text instead.
+ if (BranchRelaxAsmLarge && MI.getOpcode() == Hexagon::INLINEASM) {
+ const MachineBasicBlock &MBB = *MI.getParent();
+ const MachineFunction *MF = MBB.getParent();
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+ // Skip the leading register defs; the asm string is the operand after them.
+ unsigned NumDefs = 0;
+ for (; MI.getOperand(NumDefs).isReg() && MI.getOperand(NumDefs).isDef();
+ ++NumDefs)
+ assert(NumDefs != MI.getNumOperands()-2 && "No asm string?"); // loop body
+
+ assert(MI.getOperand(NumDefs).isSymbol() && "No asm string?");
+ // Approximate the length from the asm text; this overwrites Size.
+ const char *AsmStr = MI.getOperand(NumDefs).getSymbolName();
+ Size = getInlineAsmLength(AsmStr, *MAI);
+ }
+
+ return Size;
+}
+
+
+// Extract the instruction-type field from the instruction's TSFlags.
+uint64_t HexagonInstrInfo::getType(const MachineInstr &MI) const {
+  const uint64_t Shifted = MI.getDesc().TSFlags >> HexagonII::TypePos;
+  return Shifted & HexagonII::TypeMask;
+}
+
+
+// Return the functional-unit mask of the first itinerary stage of MI's
+// scheduling class, using the itinerary data of the subtarget that owns MI.
+unsigned HexagonInstrInfo::getUnits(const MachineInstr &MI) const {
+  const auto *MF = MI.getParent()->getParent();
+  const InstrItineraryData *Itin = MF->getSubtarget().getInstrItineraryData();
+  const unsigned SchedClass = MI.getDesc().getSchedClass();
+  return Itin->beginStage(SchedClass)->getUnits();
+}
+
+
+// Extract the valid-subtarget field from the TSFlags of the given opcode.
+unsigned HexagonInstrInfo::getValidSubTargets(const unsigned Opcode) const {
+  const uint64_t Shifted = get(Opcode).TSFlags >> HexagonII::validSubTargetPos;
+  return Shifted & HexagonII::validSubTargetMask;
+}
+
+
+// Size of the basic block with debug instructions left out: hands the block's
+// full instruction range to nonDbgMICount.
+unsigned HexagonInstrInfo::nonDbgBBSize(const MachineBasicBlock *BB) const {
+  const auto First = BB->instr_begin();
+  const auto Last = BB->instr_end();
+  return nonDbgMICount(First, Last);
+}
+
+
+// Size of the bundle headed by BundleHead as reported by nonDbgMICount. The
+// counted range starts just after the bundle header and ends at
+// getBundleEnd. BundleHead must be a bundle header.
+unsigned HexagonInstrInfo::nonDbgBundleSize(
+      MachineBasicBlock::const_iterator BundleHead) const {
+  assert(BundleHead->isBundle() && "Not a bundle header");
+  const auto HeaderIt = BundleHead.getInstrIterator();
+  // The header itself is not counted.
+  auto FirstInBundle = HeaderIt;
+  ++FirstInBundle;
+  return nonDbgMICount(FirstInBundle, getBundleEnd(HeaderIt));
+}
+
+
+/// immediateExtend - Changes the instruction in place to one using an immediate
+/// extender, by tagging its extendable operand with HMOTF_ConstExtended.
+/// MI must be extendable or already constant-extended, and the extendable
+/// operand must be a basic block or an immediate.
+void HexagonInstrInfo::immediateExtend(MachineInstr &MI) const {
+  assert((isExtendable(MI)||isConstExtended(MI)) &&
+                               "Instruction must be extendable");
+  // Locate the extendable operand.
+  const short ExtIdx = getCExtOpNum(MI);
+  MachineOperand &ExtOperand = MI.getOperand(ExtIdx);
+  // Only operand kinds we understand may be extended.
+  assert((ExtOperand.isMBB() || ExtOperand.isImm()) &&
+         "Branch with unknown extendable field type");
+  // Record the extension on the operand itself.
+  ExtOperand.addTargetFlag(HexagonII::HMOTF_ConstExtended);
+}
+
+// Invert the predicate sense of branch MI and retarget it to NewTarget.
+bool HexagonInstrInfo::invertAndChangeJumpTarget(
+ MachineInstr &MI, MachineBasicBlock *NewTarget) const {
+ DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to BB#"
+ << NewTarget->getNumber(); MI.dump(););
+ assert(MI.isBranch());
+ unsigned NewOpcode = getInvertedPredicatedOpcode(MI.getOpcode());
+ int TargetPos = MI.getNumOperands() - 1;
+ // In general the branch target is the last operand, but implicit defs added
+ // at the end might follow it, so scan backwards for the last MBB operand.
+ while ((TargetPos > -1) && !MI.getOperand(TargetPos).isMBB())
+ --TargetPos;
+ assert((TargetPos >= 0) && MI.getOperand(TargetPos).isMBB());
+ MI.getOperand(TargetPos).setMBB(NewTarget);
+ if (EnableBranchPrediction && isPredicatedNew(MI)) { // MI still has old desc
+ NewOpcode = reversePrediction(NewOpcode);
+ }
+ MI.setDesc(get(NewOpcode)); // switch MI to the new opcode in place
+ return true; // always reports success
+}
+
+// Debug aid: print the scheduling class of every target opcode under DEBUG.
+void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const {
+ /* +++ The code below is used to generate complete set of Hexagon Insn +++ */
+ MachineFunction::iterator A = MF.begin();
+ MachineBasicBlock &B = *A; // first block of MF
+ MachineBasicBlock::iterator I = B.begin();
+ DebugLoc DL = I->getDebugLoc(); // NOTE(review): assumes B is non-empty
+ MachineInstr *NewMI;
+
+ for (unsigned insn = TargetOpcode::GENERIC_OP_END+1;
+ insn < Hexagon::INSTRUCTION_LIST_END; ++insn) {
+ NewMI = BuildMI(B, I, DL, get(insn)); // temporary instruction before I
+ DEBUG(dbgs() << "\n" << getName(NewMI->getOpcode()) <<
+ " Class: " << NewMI->getDesc().getSchedClass());
+ NewMI->eraseFromParent(); // remove it again so the block is left unchanged
+ }
+ /* --- The code above is used to generate complete set of Hexagon Insn --- */
+}
+
+
+// Flip the predicate sense of MI in place (p <-> NotP) by switching its
+// descriptor to the opcode returned from getInvertedPredicatedOpcode.
+// Always returns true.
+bool HexagonInstrInfo::reversePredSense(MachineInstr &MI) const {
+  DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI.dump());
+  const unsigned InvertedOpc = getInvertedPredicatedOpcode(MI.getOpcode());
+  MI.setDesc(get(InvertedOpc));
+  return true;
+}
+
+
+// Reverse the branch prediction: an opcode that is predicted taken maps to its
+// not-taken form, any other opcode maps to its taken form. The mapped opcode
+// is asserted to be positive.
+unsigned HexagonInstrInfo::reversePrediction(unsigned Opcode) const {
+  const int Reversed = isPredictedTaken(Opcode)
+                           ? Hexagon::notTakenBranchPrediction(Opcode)
+                           : Hexagon::takenBranchPrediction(Opcode);
+  assert(Reversed > 0);
+  return Reversed;
+}
+
+
+// Shallow sanity check of a branch condition list: an empty list is valid;
+// otherwise the first entry must be an immediate and the list must not
+// consist of that single entry alone.
+// TODO: Add more rigorous validation.
+bool HexagonInstrInfo::validateBranchCond(const ArrayRef<MachineOperand> &Cond)
+      const {
+  if (Cond.empty())
+    return true;
+  if (!Cond[0].isImm())
+    return false;
+  return Cond.size() != 1;
+}
+
+
+// MachineInstr convenience wrapper around the opcode-based
+// Hexagon::xformRegToImmOffset lookup.
+short HexagonInstrInfo::xformRegToImmOffset(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  return Hexagon::xformRegToImmOffset(Opc);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
new file mode 100644
index 000000000000..2d184d1484e9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -0,0 +1,443 @@
+//===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONINSTRINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONINSTRINFO_H
+
+#include "HexagonRegisterInfo.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "HexagonGenInstrInfo.inc"
+
+namespace llvm {
+
+struct EVT;
+class HexagonSubtarget;
+
+/// Hexagon implementation of the TargetInstrInfo interface, layered on the
+/// TableGen-generated HexagonGenInstrInfo. Owns the target's
+/// HexagonRegisterInfo instance (see getRegisterInfo).
+class HexagonInstrInfo : public HexagonGenInstrInfo {
+ virtual void anchor();
+ const HexagonRegisterInfo RI;
+
+public:
+ explicit HexagonInstrInfo(HexagonSubtarget &ST);
+
+ /// TargetInstrInfo overrides.
+ ///
+
+ /// If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ /// If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source reg along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ /// Analyze the branching code at the end of MBB, returning
+ /// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+ /// implemented for a target). Upon success, this returns false and returns
+ /// with the following information in various cases:
+ ///
+ /// 1. If this block ends with no branches (it just falls through to its succ)
+ /// just return false, leaving TBB/FBB null.
+ /// 2. If this block ends with only an unconditional branch, it sets TBB to be
+ /// the destination block.
+ /// 3. If this block ends with a conditional branch and it falls through to a
+ /// successor block, it sets TBB to be the branch destination block and a
+ /// list of operands that evaluate the condition. These operands can be
+ /// passed to other TargetInstrInfo methods to create new branches.
+ /// 4. If this block ends with a conditional branch followed by an
+ /// unconditional branch, it returns the 'true' destination in TBB, the
+ /// 'false' destination in FBB, and a list of operands that evaluate the
+ /// condition. These operands can be passed to other TargetInstrInfo
+ /// methods to create new branches.
+ ///
+ /// Note that removeBranch and insertBranch must be implemented to support
+ /// cases where this method returns success.
+ ///
+ /// If AllowModify is true, then this routine is allowed to modify the basic
+ /// block (e.g. delete instructions after the unconditional branch).
+ ///
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ /// Remove the branching code at the end of the specific MBB.
+ /// This is only invoked in cases where analyzeBranch returns success. It
+ /// returns the number of instructions that were removed.
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ /// Insert branch code into the end of the specified MachineBasicBlock.
+ /// The operands to this method are the same as those
+ /// returned by analyzeBranch. This is only invoked in cases where
+ /// analyzeBranch returns success. It returns the number of instructions
+ /// inserted.
+ ///
+ /// It is also invoked by tail merging to add unconditional branches in
+ /// cases where analyzeBranch doesn't apply because there was no original
+ /// branch to analyze. At least this much must be implemented, else tail
+ /// merging needs to be disabled.
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+ /// Analyze the loop code, return true if it cannot be understood. Upon
+ /// success, this function returns false and returns information about the
+ /// induction variable and compare instruction used at the end.
+ bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+ MachineInstr *&CmpInst) const override;
+
+ /// Generate code to reduce the loop iteration by one and check if the loop is
+ /// finished. Return the value/register of the new loop count. We need
+ /// this function when peeling off one or more iterations of a loop. This
+ /// function assumes the nth iteration is peeled first.
+ unsigned reduceLoopCount(MachineBasicBlock &MBB,
+ MachineInstr *IndVar, MachineInstr &Cmp,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &PrevInsts,
+ unsigned Iter, unsigned MaxIter) const override;
+
+ /// Return true if it's profitable to predicate
+ /// instructions with accumulated instruction latency of "NumCycles"
+ /// of the specified basic block, where the probability of the instructions
+ /// being executed is given by Probability.
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ unsigned ExtraPredCycles,
+ BranchProbability Probability) const override;
+
+ /// Second variant of isProfitableToIfCvt. This one
+ /// checks for the case where two basic blocks from true and false path
+ /// of a if-then-else (diamond) are predicated on mutually exclusive
+ /// predicates, where the probability of the true path being taken is given
+ /// by Probability.
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumTCycles, unsigned ExtraTCycles,
+ MachineBasicBlock &FMBB,
+ unsigned NumFCycles, unsigned ExtraFCycles,
+ BranchProbability Probability) const override;
+
+ /// Return true if it's profitable for if-converter to duplicate instructions
+ /// of specified accumulated instruction latencies in the specified MBB to
+ /// enable if-conversion.
+ /// The probability of the instructions being executed is given by
+ /// Probability.
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ BranchProbability Probability) const override;
+
+ /// Emit instructions to copy a pair of physical registers.
+ ///
+ /// This function should support copies within any legal register class as
+ /// well as any cross-class copies created during instruction selection.
+ ///
+ /// The source and destination registers may overlap, which may require a
+ /// careful implementation when multiple copy instructions are required for
+ /// large registers. See for example the ARM target.
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ /// Store the specified register of the given register class to the specified
+ /// stack frame index. The store instruction is to be added to the given
+ /// machine basic block before the specified machine instruction. If isKill
+ /// is true, the register operand is the last use and must be marked kill.
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// Load the specified register of the given register class from the specified
+ /// stack frame index. The load instruction is to be added to the given
+ /// machine basic block before the specified machine instruction.
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// This function is called for all pseudo instructions
+ /// that remain after register allocation. Many pseudo instructions are
+ /// created to help register allocation. This is the place to convert them
+ /// into real instructions. The target can edit MI in place, or it can insert
+ /// new instructions and erase MI. The function should return true if
+ /// anything was changed.
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ /// \brief Get the base register and byte offset of a load/store instr.
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// Reverses the branch condition of the specified condition list,
+ /// returning false on success and true if it cannot be reversed.
+ bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
+ const override;
+
+ /// Insert a noop into the instruction stream at the specified point.
+ void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+ /// Returns true if the instruction is already predicated.
+ bool isPredicated(const MachineInstr &MI) const override;
+
+ /// Return true for post-incremented instructions.
+ bool isPostIncrement(const MachineInstr &MI) const override;
+
+ /// Convert the instruction into a predicated instruction.
+ /// It returns true if the operation was successful.
+ bool PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Cond) const override;
+
+ /// Returns true if the first specified predicate
+ /// subsumes the second, e.g. GE subsumes GT.
+ bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+ ArrayRef<MachineOperand> Pred2) const override;
+
+ /// If the specified instruction defines any predicate
+ /// or condition code register(s) used for predication, returns true as well
+ /// as the definition predicate(s) by reference.
+ bool DefinesPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred) const override;
+
+ /// Return true if the specified instruction can be predicated.
+ /// By default, this returns true for every instruction with a
+ /// PredicateOperand.
+ bool isPredicable(MachineInstr &MI) const override;
+
+ /// Test if the given instruction should be considered a scheduling boundary.
+ /// This primarily includes labels and terminators.
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
+ /// Measure the specified inline asm to determine an approximation of its
+ /// length.
+ unsigned getInlineAsmLength(const char *Str,
+ const MCAsmInfo &MAI) const override;
+
+ /// Allocate and return a hazard recognizer to use for this target when
+ /// scheduling the machine instructions after register allocation.
+ ScheduleHazardRecognizer*
+ CreateTargetPostRAHazardRecognizer(const InstrItineraryData*,
+ const ScheduleDAG *DAG) const override;
+
+ /// For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2 if having two register operands, and the value it
+ /// compares against in Value. Return true if the comparison instruction
+ /// can be analyzed.
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask, int &Value) const override;
+
+ /// Compute the instruction latency of a given instruction.
+ /// If the instruction has higher cost when predicated, it's returned via
+ /// PredCost.
+ unsigned getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &MI,
+ unsigned *PredCost = nullptr) const override;
+
+ /// Create machine specific model for scheduling.
+ DFAPacketizer *
+ CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override;
+
+ // Sometimes, it is possible for the target
+ // to tell, even without aliasing information, that two MIs access different
+ // memory addresses. This function returns true if two MIs access different
+ // memory addresses and false otherwise.
+ bool
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ AliasAnalysis *AA = nullptr) const override;
+
+ /// For instructions with a base and offset, return the position of the
+ /// base register and offset operands.
+ bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos,
+ unsigned &OffsetPos) const override;
+
+ /// If the instruction is an increment of a constant value, return the amount.
+ bool getIncrementValue(const MachineInstr &MI, int &Value) const override;
+
+ bool isTailCall(const MachineInstr &MI) const override;
+
+ /// HexagonInstrInfo specifics.
+ ///
+
+ const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
+
+ unsigned createVR(MachineFunction* MF, MVT VT) const;
+
+ bool isAbsoluteSet(const MachineInstr &MI) const;
+ bool isAccumulator(const MachineInstr &MI) const;
+ bool isComplex(const MachineInstr &MI) const;
+ bool isCompoundBranchInstr(const MachineInstr &MI) const;
+ bool isCondInst(const MachineInstr &MI) const;
+ bool isConditionalALU32 (const MachineInstr &MI) const;
+ bool isConditionalLoad(const MachineInstr &MI) const;
+ bool isConditionalStore(const MachineInstr &MI) const;
+ bool isConditionalTransfer(const MachineInstr &MI) const;
+ bool isConstExtended(const MachineInstr &MI) const;
+ bool isDeallocRet(const MachineInstr &MI) const;
+ bool isDependent(const MachineInstr &ProdMI,
+ const MachineInstr &ConsMI) const;
+ bool isDotCurInst(const MachineInstr &MI) const;
+ bool isDotNewInst(const MachineInstr &MI) const;
+ bool isDuplexPair(const MachineInstr &MIa, const MachineInstr &MIb) const;
+ bool isEarlySourceInstr(const MachineInstr &MI) const;
+ bool isEndLoopN(unsigned Opcode) const;
+ bool isExpr(unsigned OpType) const;
+ bool isExtendable(const MachineInstr &MI) const;
+ bool isExtended(const MachineInstr &MI) const;
+ bool isFloat(const MachineInstr &MI) const;
+ bool isHVXMemWithAIndirect(const MachineInstr &I,
+ const MachineInstr &J) const;
+ bool isIndirectCall(const MachineInstr &MI) const;
+ bool isIndirectL4Return(const MachineInstr &MI) const;
+ bool isJumpR(const MachineInstr &MI) const;
+ bool isJumpWithinBranchRange(const MachineInstr &MI, unsigned offset) const;
+ bool isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI,
+ const MachineInstr &ESMI) const;
+ bool isLateResultInstr(const MachineInstr &MI) const;
+ bool isLateSourceInstr(const MachineInstr &MI) const;
+ bool isLoopN(const MachineInstr &MI) const;
+ bool isMemOp(const MachineInstr &MI) const;
+ bool isNewValue(const MachineInstr &MI) const;
+ bool isNewValue(unsigned Opcode) const;
+ bool isNewValueInst(const MachineInstr &MI) const;
+ bool isNewValueJump(const MachineInstr &MI) const;
+ bool isNewValueJump(unsigned Opcode) const;
+ bool isNewValueStore(const MachineInstr &MI) const;
+ bool isNewValueStore(unsigned Opcode) const;
+ bool isOperandExtended(const MachineInstr &MI, unsigned OperandNum) const;
+ bool isPredicatedNew(const MachineInstr &MI) const;
+ bool isPredicatedNew(unsigned Opcode) const;
+ bool isPredicatedTrue(const MachineInstr &MI) const;
+ bool isPredicatedTrue(unsigned Opcode) const;
+ bool isPredicated(unsigned Opcode) const;
+ bool isPredicateLate(unsigned Opcode) const;
+ bool isPredictedTaken(unsigned Opcode) const;
+ bool isSaveCalleeSavedRegsCall(const MachineInstr &MI) const;
+ bool isSignExtendingLoad(const MachineInstr &MI) const;
+ bool isSolo(const MachineInstr &MI) const;
+ bool isSpillPredRegOp(const MachineInstr &MI) const;
+ bool isTC1(const MachineInstr &MI) const;
+ bool isTC2(const MachineInstr &MI) const;
+ bool isTC2Early(const MachineInstr &MI) const;
+ bool isTC4x(const MachineInstr &MI) const;
+ bool isToBeScheduledASAP(const MachineInstr &MI1,
+ const MachineInstr &MI2) const;
+ bool isV60VectorInstruction(const MachineInstr &MI) const;
+ bool isValidAutoIncImm(const EVT VT, const int Offset) const;
+ bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const;
+ bool isVecAcc(const MachineInstr &MI) const;
+ bool isVecALU(const MachineInstr &MI) const;
+ bool isVecUsableNextPacket(const MachineInstr &ProdMI,
+ const MachineInstr &ConsMI) const;
+ bool isZeroExtendingLoad(const MachineInstr &MI) const;
+
+ bool addLatencyToSchedule(const MachineInstr &MI1,
+ const MachineInstr &MI2) const;
+ bool canExecuteInBundle(const MachineInstr &First,
+ const MachineInstr &Second) const;
+ bool doesNotReturn(const MachineInstr &CallMI) const;
+ bool hasEHLabel(const MachineBasicBlock *B) const;
+ bool hasNonExtEquivalent(const MachineInstr &MI) const;
+ bool hasPseudoInstrPair(const MachineInstr &MI) const;
+ bool hasUncondBranch(const MachineBasicBlock *B) const;
+ bool mayBeCurLoad(const MachineInstr &MI) const;
+ bool mayBeNewStore(const MachineInstr &MI) const;
+ bool producesStall(const MachineInstr &ProdMI,
+ const MachineInstr &ConsMI) const;
+ bool producesStall(const MachineInstr &MI,
+ MachineBasicBlock::const_instr_iterator MII) const;
+ bool predCanBeUsedAsDotNew(const MachineInstr &MI, unsigned PredReg) const;
+ bool PredOpcodeHasJMP_c(unsigned Opcode) const;
+ bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
+
+
+ short getAbsoluteForm(const MachineInstr &MI) const;
+ unsigned getAddrMode(const MachineInstr &MI) const;
+ unsigned getBaseAndOffset(const MachineInstr &MI, int &Offset,
+ unsigned &AccessSize) const;
+ short getBaseWithLongOffset(short Opcode) const;
+ short getBaseWithLongOffset(const MachineInstr &MI) const;
+ short getBaseWithRegOffset(const MachineInstr &MI) const;
+ SmallVector<MachineInstr*,2> getBranchingInstrs(MachineBasicBlock& MBB) const;
+ unsigned getCExtOpNum(const MachineInstr &MI) const;
+ HexagonII::CompoundGroup
+ getCompoundCandidateGroup(const MachineInstr &MI) const;
+ unsigned getCompoundOpcode(const MachineInstr &GA,
+ const MachineInstr &GB) const;
+ int getCondOpcode(int Opc, bool sense) const;
+ int getDotCurOp(const MachineInstr &MI) const;
+ int getDotNewOp(const MachineInstr &MI) const;
+ int getDotNewPredJumpOp(const MachineInstr &MI,
+ const MachineBranchProbabilityInfo *MBPI) const;
+ int getDotNewPredOp(const MachineInstr &MI,
+ const MachineBranchProbabilityInfo *MBPI) const;
+ int getDotOldOp(const int opc) const;
+ HexagonII::SubInstructionGroup getDuplexCandidateGroup(const MachineInstr &MI)
+ const;
+ short getEquivalentHWInstr(const MachineInstr &MI) const;
+ /// Return first non-debug instruction in the basic block.
+ MachineInstr *getFirstNonDbgInst(MachineBasicBlock *BB) const;
+ /// getInstrTimingClassLatency - Compute the instruction latency of a given
+ /// instruction using Timing Class information, if available.
+ unsigned getInstrTimingClassLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &MI) const;
+ bool getInvertedPredSense(SmallVectorImpl<MachineOperand> &Cond) const;
+ unsigned getInvertedPredicatedOpcode(const int Opc) const;
+ /// Returns the max value that doesn't need to be extended.
+ int getMaxValue(const MachineInstr &MI) const;
+ unsigned getMemAccessSize(const MachineInstr &MI) const;
+ /// Returns the min value that doesn't need to be extended.
+ int getMinValue(const MachineInstr &MI) const;
+ /// Returns opcode of the non-extended equivalent instruction.
+ short getNonExtOpcode(const MachineInstr &MI) const;
+ bool getPredReg(ArrayRef<MachineOperand> Cond, unsigned &PredReg,
+ unsigned &PredRegPos, unsigned &PredRegFlags) const;
+ short getPseudoInstrPair(const MachineInstr &MI) const;
+ short getRegForm(const MachineInstr &MI) const;
+ /// Return the number of bytes required to encode the instruction.
+ unsigned getSize(const MachineInstr &MI) const;
+ uint64_t getType(const MachineInstr &MI) const;
+ unsigned getUnits(const MachineInstr &MI) const;
+ unsigned getValidSubTargets(const unsigned Opcode) const;
+
+
+ /// Calculate size of the basic block without debug instructions.
+ unsigned nonDbgBBSize(const MachineBasicBlock *BB) const;
+ /// Same count for the bundle headed by BundleHead, excluding the header.
+ unsigned nonDbgBundleSize(MachineBasicBlock::const_iterator BundleHead) const;
+
+
+ /// Changes the instruction in place to one using an immediate extender.
+ void immediateExtend(MachineInstr &MI) const;
+ bool invertAndChangeJumpTarget(MachineInstr &MI,
+ MachineBasicBlock* NewTarget) const;
+ void genAllInsnTimingClasses(MachineFunction &MF) const;
+ bool reversePredSense(MachineInstr &MI) const;
+ /// Reverse the branch prediction.
+ unsigned reversePrediction(unsigned Opcode) const;
+ bool validateBranchCond(const ArrayRef<MachineOperand> &Cond) const;
+ short xformRegToImmOffset(const MachineInstr &MI) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
new file mode 100644
index 000000000000..c5719ad5b6d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -0,0 +1,4799 @@
+//==- HexagonInstrInfo.td - Target Description for Hexagon -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormats.td"
+include "HexagonOperands.td"
+include "HexagonInstrEnc.td"
+
+//===----------------------------------------------------------------------===//
+// Compare: T_CMP is the register/immediate compare writing a predicate register.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isCompare = 1, InputType = "imm", isExtendable = 1,
+ opExtendable = 2 in // operand 2 is $src2 (the out operand $dst is operand 0)
+class T_CMP <string mnemonic, bits<2> MajOp, bit isNot, Operand ImmOp>
+ : ALU32Inst <(outs PredRegs:$dst),
+ (ins IntRegs:$src1, ImmOp:$src2),
+ "$dst = "#!if(isNot, "!","")#mnemonic#"($src1, #$src2)", // isNot prints "!" before the mnemonic
+ [], "",ALU32_2op_tc_2early_SLOT0123 >, ImmRegRel {
+ bits<2> dst;
+ bits<5> src1;
+ bits<10> src2;
+ let CextOpcode = mnemonic;
+ let opExtentBits = !if(!eq(mnemonic, "cmp.gtu"), 9, 10); // 9 bits for cmp.gtu, 10 for the others
+ let isExtentSigned = !if(!eq(mnemonic, "cmp.gtu"), 0, 1); // only cmp.gtu is marked unsigned
+
+ let IClass = 0b0111;
+
+ let Inst{27-24} = 0b0101;
+ let Inst{23-22} = MajOp;
+ let Inst{21} = !if(!eq(mnemonic, "cmp.gtu"), 0, src2{9}); // forced to 0 for cmp.gtu instead of carrying src2{9}
+ let Inst{20-16} = src1;
+ let Inst{13-5} = src2{8-0};
+ let Inst{4} = isNot;
+ let Inst{3-2} = 0b00;
+ let Inst{1-0} = dst;
+ }
+
+def C2_cmpeqi : T_CMP <"cmp.eq", 0b00, 0, s10_0Ext>; // $dst = cmp.eq($src1, #$src2)
+def C2_cmpgti : T_CMP <"cmp.gt", 0b01, 0, s10_0Ext>; // $dst = cmp.gt($src1, #$src2)
+def C2_cmpgtui : T_CMP <"cmp.gtu", 0b10, 0, u9_0Ext>; // $dst = cmp.gtu($src1, #$src2); uses the u9 operand
+
+//===----------------------------------------------------------------------===//
+// ALU32/ALU +
+//===----------------------------------------------------------------------===//
+// Add and the other three-register ALU32 forms printed as "Rd = mnemonic(Rs, Rt)".
+
+let hasSideEffects = 0, hasNewValue = 1, InputType = "reg" in
+class T_ALU32_3op<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit OpsRev,
+ bit IsComm>
+ : ALU32_rr<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = "#mnemonic#"($Rs, $Rt)",
+ [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredRel {
+ let isCommutable = IsComm;
+ let BaseOpcode = mnemonic#_rr;
+ let CextOpcode = mnemonic;
+
+ bits<5> Rs;
+ bits<5> Rt;
+ bits<5> Rd;
+
+ let IClass = 0b1111;
+ let Inst{27} = 0b0; // 0 here; T_ALU32_3op_pred sets this bit to 1
+ let Inst{26-24} = MajOp;
+ let Inst{23-21} = MinOp;
+ let Inst{20-16} = !if(OpsRev,Rt,Rs); // OpsRev swaps which source register lands in bits 20-16 ...
+ let Inst{12-8} = !if(OpsRev,Rs,Rt); // ... and in bits 12-8; the asm string is not affected
+ let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in // predicated counterpart of T_ALU32_3op
+class T_ALU32_3op_pred<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit OpsRev, bit PredNot, bit PredNew>
+ : ALU32_rr<(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
+ "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") "# // "if ([!]Pu[.new]) "
+ "$Rd = "#mnemonic#"($Rs, $Rt)",
+ [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
+ let isPredicated = 1;
+ let isPredicatedFalse = PredNot;
+ let isPredicatedNew = PredNew;
+ let BaseOpcode = mnemonic#_rr;
+ let CextOpcode = mnemonic;
+
+ bits<2> Pu;
+ bits<5> Rs;
+ bits<5> Rt;
+ bits<5> Rd;
+
+ let IClass = 0b1111;
+ let Inst{27} = 0b1; // 1 here; the unpredicated T_ALU32_3op uses 0
+ let Inst{26-24} = MajOp;
+ let Inst{23-21} = MinOp;
+ let Inst{20-16} = !if(OpsRev,Rt,Rs); // same OpsRev register swap as T_ALU32_3op
+ let Inst{13} = PredNew;
+ let Inst{12-8} = !if(OpsRev,Rs,Rt);
+ let Inst{7} = PredNot;
+ let Inst{6-5} = Pu;
+ let Inst{4-0} = Rd;
+}
+
+class T_ALU32_combineh<string Op1, string Op2, bits<3> MajOp, bits<3> MinOp,
+ bit OpsRev>
+ : T_ALU32_3op<"", MajOp, MinOp, OpsRev, 0> { // empty mnemonic, IsComm = 0; AsmString is replaced below
+ let AsmString = "$Rd = combine($Rs"#Op1#", $Rt"#Op2#")"; // Op1/Op2 are appended to the $Rs/$Rt operands
+}
+
+def A2_combine_hh : T_ALU32_combineh<".h", ".h", 0b011, 0b100, 1>; // $Rd = combine($Rs.h, $Rt.h)
+def A2_combine_hl : T_ALU32_combineh<".h", ".l", 0b011, 0b101, 1>; // $Rd = combine($Rs.h, $Rt.l)
+def A2_combine_lh : T_ALU32_combineh<".l", ".h", 0b011, 0b110, 1>; // $Rd = combine($Rs.l, $Rt.h)
+def A2_combine_ll : T_ALU32_combineh<".l", ".l", 0b011, 0b111, 1>; // $Rd = combine($Rs.l, $Rt.l)
+
+class T_ALU32_3op_sfx<string mnemonic, string suffix, bits<3> MajOp,
+ bits<3> MinOp, bit OpsRev, bit IsComm>
+ : T_ALU32_3op<"", MajOp, MinOp, OpsRev, IsComm> { // base class receives an empty mnemonic
+ let AsmString = "$Rd = "#mnemonic#"($Rs, $Rt)"#suffix; // same syntax as T_ALU32_3op plus a trailing suffix
+}
+
+def A2_svaddh : T_ALU32_3op<"vaddh", 0b110, 0b000, 0, 1>; // $Rd = vaddh($Rs, $Rt)
+def A2_svsubh : T_ALU32_3op<"vsubh", 0b110, 0b100, 1, 0>; // $Rd = vsubh($Rs, $Rt), operands reversed in the encoding
+
+let Defs = [USR_OVF], Itinerary = ALU32_3op_tc_2_SLOT0123 in { // every ":sat" form lists USR_OVF in Defs
+ def A2_svaddhs : T_ALU32_3op_sfx<"vaddh", ":sat", 0b110, 0b001, 0, 1>;
+ def A2_addsat : T_ALU32_3op_sfx<"add", ":sat", 0b110, 0b010, 0, 1>;
+ def A2_svadduhs : T_ALU32_3op_sfx<"vadduh", ":sat", 0b110, 0b011, 0, 1>;
+ def A2_svsubhs : T_ALU32_3op_sfx<"vsubh", ":sat", 0b110, 0b101, 1, 0>;
+ def A2_subsat : T_ALU32_3op_sfx<"sub", ":sat", 0b110, 0b110, 1, 0>;
+ def A2_svsubuhs : T_ALU32_3op_sfx<"vsubuh", ":sat", 0b110, 0b111, 1, 0>;
+}
+
+let Itinerary = ALU32_3op_tc_2_SLOT0123 in
+def A2_svavghs : T_ALU32_3op_sfx<"vavgh", ":rnd", 0b111, 0b001, 0, 1>; // note: printed suffix is ":rnd", not ":sat"
+
+def A2_svavgh : T_ALU32_3op<"vavgh", 0b111, 0b000, 0, 1>; // $Rd = vavgh($Rs, $Rt)
+def A2_svnavgh : T_ALU32_3op<"vnavgh", 0b111, 0b011, 1, 0>; // $Rd = vnavgh($Rs, $Rt), operands reversed in the encoding
+
+multiclass T_ALU32_3op_p<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit OpsRev> { // the four predicated variants; last two args are PredNot, PredNew
+ def t : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 0>; // if (Pu)
+ def f : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 0>; // if (!Pu)
+ def tnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 1>; // if (Pu.new)
+ def fnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 1>; // if (!Pu.new)
+}
+
+multiclass T_ALU32_3op_A2<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit OpsRev, bit IsComm> { // one unpredicated def plus the four predicated ones
+ let isPredicable = 1 in
+ def A2_#NAME : T_ALU32_3op <mnemonic, MajOp, MinOp, OpsRev, IsComm>;
+ defm A2_p#NAME : T_ALU32_3op_p<mnemonic, MajOp, MinOp, OpsRev>; // IsComm is not forwarded to the predicated forms
+}
+
+defm add : T_ALU32_3op_A2<"add", 0b011, 0b000, 0, 1>; // expected to yield A2_add and A2_padd{t,f,tnew,fnew}
+defm and : T_ALU32_3op_A2<"and", 0b001, 0b000, 0, 1>;
+defm or : T_ALU32_3op_A2<"or", 0b001, 0b001, 0, 1>;
+defm sub : T_ALU32_3op_A2<"sub", 0b011, 0b001, 1, 0>; // the only one here with OpsRev = 1 and IsComm = 0
+defm xor : T_ALU32_3op_A2<"xor", 0b001, 0b011, 0, 1>;
+
+// A few special cases producing register pairs: the outer let swaps the result to DoubleRegs and clears hasNewValue.
+let OutOperandList = (outs DoubleRegs:$Rd), hasNewValue = 0 in {
+ def S2_packhl : T_ALU32_3op <"packhl", 0b101, 0b100, 0, 0>; // $Rd = packhl($Rs, $Rt)
+
+ let isPredicable = 1 in
+ def A2_combinew : T_ALU32_3op <"combine", 0b101, 0b000, 0, 0>; // $Rd = combine($Rs, $Rt)
+
+ // Conditional combinew uses "newt/f" instead of "t/fnew".
+ def C2_ccombinewt : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 0>; // if ($Pu)
+ def C2_ccombinewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 0>; // if (!$Pu)
+ def C2_ccombinewnewt : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 1>; // if ($Pu.new)
+ def C2_ccombinewnewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 1>; // if (!$Pu.new)
+}
+
+let hasSideEffects = 0, hasNewValue = 1, isCompare = 1, InputType = "reg" in
+class T_ALU32_3op_cmp<string mnemonic, bits<2> MinOp, bit IsNeg, bit IsComm>
+ : ALU32_rr<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Pd = "#mnemonic#"($Rs, $Rt)", // IsNeg is encoded in bit 4 but does not change the printed syntax
+ [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
+ let CextOpcode = mnemonic;
+ let isCommutable = IsComm;
+ bits<5> Rs;
+ bits<5> Rt;
+ bits<2> Pd;
+
+ let IClass = 0b1111;
+ let Inst{27-24} = 0b0010;
+ let Inst{22-21} = MinOp;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{4} = IsNeg;
+ let Inst{3-2} = 0b00;
+ let Inst{1-0} = Pd;
+}
+
+let Itinerary = ALU32_3op_tc_2early_SLOT0123 in { // replaces the ALU32_3op_tc_1_SLOT0123 passed by the class
+ def C2_cmpeq : T_ALU32_3op_cmp< "cmp.eq", 0b00, 0, 1>; // $Pd = cmp.eq($Rs, $Rt); the only commutable one
+ def C2_cmpgt : T_ALU32_3op_cmp< "cmp.gt", 0b10, 0, 0>; // $Pd = cmp.gt($Rs, $Rt)
+ def C2_cmpgtu : T_ALU32_3op_cmp< "cmp.gtu", 0b11, 0, 0>; // $Pd = cmp.gtu($Rs, $Rt)
+}
+
+let CextOpcode = "MUX", InputType = "reg", hasNewValue = 1 in
+def C2_mux: ALU32_rr<(outs IntRegs:$Rd),
+ (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = mux($Pu, $Rs, $Rt)", [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
+ bits<5> Rd;
+ bits<2> Pu;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let CextOpcode = "mux"; // NOTE(review): differs in case from the outer "MUX"; presumably this body value wins - confirm
+ let InputType = "reg"; // repeats the value already set by the outer let
+ let hasSideEffects = 0;
+ let IClass = 0b1111;
+
+ let Inst{27-24} = 0b0100;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{6-5} = Pu;
+ let Inst{4-0} = Rd;
+}
+
+// Combines the two immediates into a double register.
+// Increase complexity to make it greater than any complexity of a combine
+// that involves a register.
+
+let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
+ isExtentSigned = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 1, // operand 1 is $s8, the s8_0Ext one
+ AddedComplexity = 75 in
+def A2_combineii: ALU32Inst <(outs DoubleRegs:$Rdd), (ins s8_0Ext:$s8, s8_0Imm:$S8),
+ "$Rdd = combine(#$s8, #$S8)",
+ []> {
+ bits<5> Rdd;
+ bits<8> s8;
+ bits<8> S8;
+
+ let IClass = 0b0111;
+ let Inst{27-23} = 0b11000;
+ let Inst{22-16} = S8{7-1}; // the second immediate is split: upper 7 bits here ...
+ let Inst{13} = S8{0}; // ... and its lowest bit in bit 13
+ let Inst{12-5} = s8;
+ let Inst{4-0} = Rdd;
+ }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated ADD of a reg and an Immediate value.
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_Addri_Pred <bit PredNot, bit PredNew>
+ : ALU32_ri <(outs IntRegs:$Rd),
+ (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
+ !if(PredNot, "if (!$Pu", "if ($Pu")#!if(PredNew,".new) $Rd = ", // builds "if ([!]$Pu[.new]) $Rd = "
+ ") $Rd = ")#"add($Rs, #$s8)"> {
+ bits<5> Rd;
+ bits<2> Pu;
+ bits<5> Rs;
+ bits<8> s8;
+
+ let isPredicatedNew = PredNew; // isPredicatedFalse is not set in this class
+ let IClass = 0b0111;
+
+ let Inst{27-24} = 0b0100;
+ let Inst{23} = PredNot;
+ let Inst{22-21} = Pu;
+ let Inst{20-16} = Rs;
+ let Inst{13} = PredNew;
+ let Inst{12-5} = s8;
+ let Inst{4-0} = Rd;
+ }
+
+//===----------------------------------------------------------------------===//
+// A2_addi: Add a signed immediate to a register (template class T_Addri).
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_Addri <Operand immOp>
+ : ALU32_ri <(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, immOp:$s16),
+ "$Rd = add($Rs, #$s16)", [], "", ALU32_ADDI_tc_1_SLOT0123> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<16> s16;
+
+ let IClass = 0b1011;
+
+ let Inst{27-21} = s16{15-9}; // the 16-bit immediate is split around Rs: upper 7 bits here ...
+ let Inst{20-16} = Rs;
+ let Inst{13-5} = s16{8-0}; // ... lower 9 bits here
+ let Inst{4-0} = Rd;
+ }
+
+//===----------------------------------------------------------------------===//
+// Multiclass for predicated ADD of a register and an immediate value.
+//===----------------------------------------------------------------------===//
+multiclass Addri_Pred<string mnemonic, bit PredNot> { // mnemonic is not used in the body
+ let isPredicatedFalse = PredNot in {
+ def NAME : T_Addri_Pred<PredNot, 0>;
+ // Predicate new
+ def NAME#new : T_Addri_Pred<PredNot, 1>;
+ }
+}
+
+let isExtendable = 1, isExtentSigned = 1, InputType = "imm" in
+multiclass Addri_base<string mnemonic, SDNode OpNode> { // OpNode is not used in the body
+ let CextOpcode = mnemonic, BaseOpcode = mnemonic#_ri in {
+ let opExtendable = 2, opExtentBits = 16, isPredicable = 1, isAdd = 1 in // unpredicated: 16-bit immediate at operand 2
+ def A2_#NAME : T_Addri<s16_0Ext>;
+
+ let opExtendable = 3, opExtentBits = 8, isPredicated = 1 in { // predicated: 8-bit immediate at operand 3 (after $Pu)
+ defm A2_p#NAME#t : Addri_Pred<mnemonic, 0>;
+ defm A2_p#NAME#f : Addri_Pred<mnemonic, 1>;
+ }
+ }
+}
+
+defm addi : Addri_base<"add", add>, ImmRegRel, PredNewRel; // expected to yield A2_addi and A2_paddi{t,tnew,f,fnew}
+
+let hasNewValue = 1, hasSideEffects = 0, isPseudo = 1 in // pseudo: no Inst bits are assigned
+def A2_iconst
+ : ALU32_ri <(outs IntRegs:$Rd),
+ (ins s23_2Imm:$s23_2),
+ "$Rd = iconst(#$s23_2)"> {}
+
+//===----------------------------------------------------------------------===//
+// Template class used for the following ALU32 instructions.
+// Rd=and(Rs,#s10)
+// Rd=or(Rs,#s10)
+//===----------------------------------------------------------------------===//
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
+InputType = "imm", hasNewValue = 1 in
+class T_ALU32ri_logical <string mnemonic, SDNode OpNode, bits<2> MinOp> // OpNode is unused; the pattern list is empty
+ : ALU32_ri <(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, s10_0Ext:$s10),
+ "$Rd = "#mnemonic#"($Rs, #$s10)" ,
+ []> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<10> s10;
+ let CextOpcode = mnemonic;
+
+ let IClass = 0b0111;
+
+ let Inst{27-24} = 0b0110;
+ let Inst{23-22} = MinOp;
+ let Inst{21} = s10{9}; // top immediate bit; the remaining 9 bits are in 13-5
+ let Inst{20-16} = Rs;
+ let Inst{13-5} = s10{8-0};
+ let Inst{4-0} = Rd;
+ }
+
+def A2_orir : T_ALU32ri_logical<"or", or, 0b10>, ImmRegRel; // $Rd = or($Rs, #$s10)
+def A2_andir : T_ALU32ri_logical<"and", and, 0b00>, ImmRegRel; // $Rd = and($Rs, #$s10)
+
+// Subtract register from immediate
+// Rd32=sub(#s10,Rs32)
+let isExtendable = 1, CextOpcode = "sub", opExtendable = 1, isExtentSigned = 1, // operand 1 is $s10 (immediate comes first)
+ opExtentBits = 10, InputType = "imm", hasNewValue = 1, hasSideEffects = 0 in
+def A2_subri: ALU32_ri <(outs IntRegs:$Rd), (ins s10_0Ext:$s10, IntRegs:$Rs),
+ "$Rd = sub(#$s10, $Rs)", []>, ImmRegRel {
+ bits<5> Rd;
+ bits<10> s10;
+ bits<5> Rs;
+
+ let IClass = 0b0111;
+
+ let Inst{27-22} = 0b011001;
+ let Inst{21} = s10{9}; // same immediate split as T_ALU32ri_logical
+ let Inst{20-16} = Rs;
+ let Inst{13-5} = s10{8-0};
+ let Inst{4-0} = Rd;
+ }
+
+// Nop: no operands; only IClass and bits 27-24 are assigned.
+let hasSideEffects = 0 in
+def A2_nop: ALU32Inst <(outs), (ins), "nop" > {
+ let IClass = 0b0111;
+ let Inst{27-24} = 0b1111;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in // writes a 16-bit immediate into one half of $Rx
+class T_tfr16<bit isHi>
+ : ALU32Inst <(outs IntRegs:$Rx), (ins IntRegs:$src1, u16_0Imm:$u16),
+ "$Rx"#!if(isHi, ".h", ".l")#" = #$u16", // isHi selects the ".h" or ".l" suffix
+ [], "$src1 = $Rx" > { // constraint string ties $src1 to $Rx; $src1 is neither printed nor encoded
+ bits<5> Rx;
+ bits<16> u16;
+
+ let IClass = 0b0111;
+ let Inst{27-26} = 0b00;
+ let Inst{25-24} = !if(isHi, 0b10, 0b01);
+ let Inst{23-22} = u16{15-14}; // top two immediate bits; the low 14 are in 13-0
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rx;
+ let Inst{13-0} = u16{13-0};
+ }
+
+def A2_tfril: T_tfr16<0>; // $Rx.l = #$u16
+def A2_tfrih: T_tfr16<1>; // $Rx.h = #$u16
+
+// Conditional transfer is an alias to conditional "Rd = add(Rs, #0)".
+let isPredicated = 1, hasNewValue = 1, opNewValue = 0 in
+class T_tfr_pred<bit isPredNot, bit isPredNew>
+ : ALU32Inst<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, IntRegs:$src2),
+ "if ("#!if(isPredNot, "!", "")#
+ "$src1"#!if(isPredNew, ".new", "")#
+ ") $dst = $src2"> {
+ bits<5> dst;
+ bits<2> src1;
+ bits<5> src2;
+
+ let isPredicatedFalse = isPredNot;
+ let isPredicatedNew = isPredNew;
+ let IClass = 0b0111;
+
+ let Inst{27-24} = 0b0100; // same IClass and bits 27-24 as T_Addri_Pred
+ let Inst{23} = isPredNot;
+ let Inst{13} = isPredNew;
+ let Inst{12-5} = 0; // the field where T_Addri_Pred places s8, fixed to zero here
+ let Inst{4-0} = dst;
+ let Inst{22-21} = src1;
+ let Inst{20-16} = src2;
+ }
+
+let isPredicable = 1 in // unpredicated register-to-register transfer
+class T_tfr : ALU32Inst<(outs IntRegs:$dst), (ins IntRegs:$src),
+ "$dst = $src"> {
+ bits<5> dst;
+ bits<5> src;
+
+ let IClass = 0b0111;
+
+ let Inst{27-21} = 0b0000011;
+ let Inst{20-16} = src;
+ let Inst{13} = 0b0;
+ let Inst{4-0} = dst;
+ }
+
+let InputType = "reg", hasNewValue = 1, hasSideEffects = 0 in
+multiclass tfr_base<string CextOp> { // CextOp is used for both CextOpcode and BaseOpcode
+ let CextOpcode = CextOp, BaseOpcode = CextOp in {
+ def NAME : T_tfr;
+
+ // Predicate (arguments are isPredNot, isPredNew)
+ def t : T_tfr_pred<0, 0>;
+ def f : T_tfr_pred<1, 0>;
+ // Predicate new
+ def tnew : T_tfr_pred<0, 1>;
+ def fnew : T_tfr_pred<1, 1>;
+ }
+}
+
+// Assembler mapped to C2_ccombinew[t|f|newt|newf].
+// Please don't add bits to this instruction as it'll be converted into
+// 'combine' before object code emission.
+let isPredicated = 1 in
+class T_tfrp_pred<bit PredNot, bit PredNew>
+ : ALU32_rr <(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, DoubleRegs:$src2),
+ "if ("#!if(PredNot, "!", "")#"$src1" // "if ([!]$src1[.new]) $dst = $src2"
+ #!if(PredNew, ".new", "")#") $dst = $src2" > {
+ let isPredicatedFalse = PredNot;
+ let isPredicatedNew = PredNew;
+ }
+
+// Assembler mapped to A2_combinew.
+// Please don't add bits to this instruction as it'll be converted into
+// 'combine' before object code emission.
+class T_tfrp : ALU32Inst <(outs DoubleRegs:$dst), // double-register transfer; no encoding bits defined
+ (ins DoubleRegs:$src),
+ "$dst = $src">;
+
+let hasSideEffects = 0 in
+multiclass TFR64_base<string BaseName> { // double-register counterpart of tfr_base
+ let BaseOpcode = BaseName in {
+ let isPredicable = 1 in
+ def NAME : T_tfrp;
+ // Predicate (arguments are PredNot, PredNew)
+ def t : T_tfrp_pred <0, 0>;
+ def f : T_tfrp_pred <1, 0>;
+ // Predicate new
+ def tnew : T_tfrp_pred <0, 1>;
+ def fnew : T_tfrp_pred <1, 1>;
+ }
+}
+
+let InputType = "imm", isExtendable = 1, isExtentSigned = 1, opExtentBits = 12,
+ isMoveImm = 1, opExtendable = 2, BaseOpcode = "TFRI", CextOpcode = "TFR", // operand 2 is $s12
+ hasSideEffects = 0, isPredicated = 1, hasNewValue = 1 in
+class T_TFRI_Pred<bit PredNot, bit PredNew>
+ : ALU32_ri<(outs IntRegs:$Rd), (ins PredRegs:$Pu, s12_0Ext:$s12),
+ "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") $Rd = #$s12",
+ [], "", ALU32_2op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
+ let isPredicatedFalse = PredNot;
+ let isPredicatedNew = PredNew;
+
+ bits<5> Rd;
+ bits<2> Pu;
+ bits<12> s12;
+
+ let IClass = 0b0111;
+ let Inst{27-24} = 0b1110;
+ let Inst{23} = PredNot;
+ let Inst{22-21} = Pu;
+ let Inst{20} = 0b0;
+ let Inst{19-16,12-5} = s12; // 12-bit immediate spread over two fields (4 + 8 bits), skipping bit 13
+ let Inst{13} = PredNew;
+ let Inst{4-0} = Rd;
+}
+
+def C2_cmoveit : T_TFRI_Pred<0, 0>; // if ($Pu) $Rd = #$s12
+def C2_cmoveif : T_TFRI_Pred<1, 0>; // if (!$Pu) $Rd = #$s12
+def C2_cmovenewit : T_TFRI_Pred<0, 1>; // if ($Pu.new) $Rd = #$s12
+def C2_cmovenewif : T_TFRI_Pred<1, 1>; // if (!$Pu.new) $Rd = #$s12
+
+let InputType = "imm", isExtendable = 1, isExtentSigned = 1,
+ CextOpcode = "TFR", BaseOpcode = "TFRI", hasNewValue = 1, opNewValue = 0, // same Cext/Base opcode strings as T_TFRI_Pred
+ isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16, isMoveImm = 1,
+ isPredicated = 0, isPredicable = 1, isReMaterializable = 1 in
+def A2_tfrsi : ALU32Inst<(outs IntRegs:$Rd), (ins s16_0Ext:$s16), "$Rd = #$s16",
+ [], "", ALU32_2op_tc_1_SLOT0123>,
+ ImmRegRel, PredRel {
+ bits<5> Rd;
+ bits<16> s16;
+
+ let IClass = 0b0111;
+ let Inst{27-24} = 0b1000;
+ let Inst{23-22,20-16,13-5} = s16; // 16-bit immediate spread over three fields (2 + 5 + 9 bits)
+ let Inst{4-0} = Rd;
+}
+
+defm A2_tfr : tfr_base<"TFR">, ImmRegRel, PredNewRel; // expected to yield A2_tfr and A2_tfr{t,f,tnew,fnew}
+let isAsmParserOnly = 1 in // the 64-bit transfers are assembler-only (see the "Assembler mapped" notes on T_tfrp*)
+defm A2_tfrp : TFR64_base<"TFR64">, PredNewRel;
+
+// Assembler mapped: immediate to double-register transfer; no encoding bits defined.
+let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
+ isAsmParserOnly = 1 in
+def A2_tfrpi : ALU64_rr<(outs DoubleRegs:$dst), (ins s8_0Imm64:$src1),
+ "$dst = #$src1",
+ []>;
+
+// TODO: see if these instructions can be deleted.
+let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
+ isAsmParserOnly = 1 in {
+def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u64_0Imm:$src1),
+ "$dst = #$src1">;
+def TFRI64_V2_ext : ALU64_rr<(outs DoubleRegs:$dst),
+ (ins s8_0Ext:$src1, s8_0Imm:$src2),
+ "$dst = combine(##$src1, #$src2)">; // note the "##" printed before $src1
+}
+
+//===----------------------------------------------------------------------===//
+// ALU32/ALU -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+// Scalar mux register immediate. The ins dag and asm string are supplied by each def.
+let hasSideEffects = 0, isExtentSigned = 1, CextOpcode = "MUX",
+ InputType = "imm", hasNewValue = 1, isExtendable = 1, opExtentBits = 8 in
+class T_MUX1 <bit MajOp, dag ins, string AsmStr>
+ : ALU32Inst <(outs IntRegs:$Rd), ins, AsmStr>, ImmRegRel {
+ bits<5> Rd;
+ bits<2> Pu;
+ bits<8> s8;
+ bits<5> Rs;
+
+ let IClass = 0b0111;
+ let Inst{27-24} = 0b0011;
+ let Inst{23} = MajOp; // the only encoding bit that differs between C2_muxri (1) and C2_muxir (0)
+ let Inst{22-21} = Pu;
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b0;
+ let Inst{12-5} = s8;
+ let Inst{4-0} = Rd;
+}
+
+let opExtendable = 2 in // immediate is the second input, directly after $Pu
+def C2_muxri : T_MUX1<0b1, (ins PredRegs:$Pu, s8_0Ext:$s8, IntRegs:$Rs),
+ "$Rd = mux($Pu, #$s8, $Rs)">;
+
+let opExtendable = 3 in // immediate is the last input
+def C2_muxir : T_MUX1<0b0, (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
+ "$Rd = mux($Pu, $Rs, #$s8)">;
+
+// C2_muxii: Scalar mux immediates.
+let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1,
+ opExtentBits = 8, opExtendable = 2 in // operand 2 is $s8; $S8 uses the non-Ext operand type
+def C2_muxii: ALU32Inst <(outs IntRegs:$Rd),
+ (ins PredRegs:$Pu, s8_0Ext:$s8, s8_0Imm:$S8),
+ "$Rd = mux($Pu, #$s8, #$S8)" ,
+ []> {
+ bits<5> Rd;
+ bits<2> Pu;
+ bits<8> s8;
+ bits<8> S8;
+
+ let IClass = 0b0111;
+
+ let Inst{27-25} = 0b101;
+ let Inst{24-23} = Pu;
+ let Inst{22-16} = S8{7-1}; // $S8 is split the same way as in A2_combineii: upper 7 bits here ...
+ let Inst{13} = S8{0}; // ... lowest bit in bit 13
+ let Inst{12-5} = s8;
+ let Inst{4-0} = Rd;
+ }
+
+let isCodeGenOnly = 1, isPseudo = 1 in // pseudo with no encoding; its asm string is an ".error" directive
+def PS_pselect : ALU64_rr<(outs DoubleRegs:$Rd),
+ (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
+ ".error \"should not emit\" ", []>;
+
+
+//===----------------------------------------------------------------------===//
+// template class for non-predicated alu32_2op instructions
+// - aslh, asrh, sxtb, sxth, zxth
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, opNewValue = 0 in
+class T_ALU32_2op <string mnemonic, bits<3> minOp> :
+ ALU32Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
+ "$Rd = "#mnemonic#"($Rs)", [] > {
+ bits<5> Rd;
+ bits<5> Rs;
+
+ let IClass = 0b0111;
+
+ let Inst{27-24} = 0b0000;
+ let Inst{23-21} = minOp;
+ let Inst{13} = 0b0; // 0 here; the predicated T_ALU32_2op_Pred sets this bit to 1
+ let Inst{4-0} = Rd;
+ let Inst{20-16} = Rs;
+}
+
+//===----------------------------------------------------------------------===//
+// template class for predicated alu32_2op instructions
+// - aslh, asrh, sxtb, sxth, zxtb, zxth
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_ALU32_2op_Pred <string mnemonic, bits<3> minOp, bit isPredNot,
+ bit isPredNew > :
+ ALU32Inst <(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs),
+ !if(isPredNot, "if (!$Pu", "if ($Pu") // builds "if ([!]$Pu[.new]) $Rd = mnemonic($Rs)"
+ #!if(isPredNew, ".new) ",") ")#"$Rd = "#mnemonic#"($Rs)"> {
+ bits<5> Rd;
+ bits<2> Pu;
+ bits<5> Rs;
+
+ let IClass = 0b0111;
+
+ let Inst{27-24} = 0b0000;
+ let Inst{23-21} = minOp;
+ let Inst{13} = 0b1; // 1 here; the unpredicated T_ALU32_2op uses 0
+ let Inst{11} = isPredNot;
+ let Inst{10} = isPredNew;
+ let Inst{4-0} = Rd;
+ let Inst{9-8} = Pu;
+ let Inst{20-16} = Rs;
+}
+
+multiclass ALU32_2op_Pred<string mnemonic, bits<3> minOp, bit PredNot> { // one predicate sense: plain and ".new" defs
+ let isPredicatedFalse = PredNot in {
+ def NAME : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 0>;
+
+ // Predicate new
+ let isPredicatedNew = 1 in
+ def NAME#new : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 1>;
+ }
+}
+
+multiclass ALU32_2op_base<string mnemonic, bits<3> minOp> { // unpredicated A2_ def plus predicated A4_p defs
+ let BaseOpcode = mnemonic in {
+ let isPredicable = 1, hasSideEffects = 0 in
+ def A2_#NAME : T_ALU32_2op<mnemonic, minOp>;
+
+ let isPredicated = 1, hasSideEffects = 0 in {
+ defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>; // true sense
+ defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>; // false sense
+ }
+ }
+}
+
+defm aslh : ALU32_2op_base<"aslh", 0b000>, PredNewRel; // expected to yield A2_aslh and A4_paslh{t,tnew,f,fnew}
+defm asrh : ALU32_2op_base<"asrh", 0b001>, PredNewRel;
+defm sxtb : ALU32_2op_base<"sxtb", 0b101>, PredNewRel;
+defm sxth : ALU32_2op_base<"sxth", 0b111>, PredNewRel;
+defm zxth : ALU32_2op_base<"zxth", 0b110>, PredNewRel; // minOp 0b100 is used by zxtb, defined separately via ZXTB_base
+
+// Rd=zxtb(Rs): assembler mapped to Rd=and(Rs,#255).
+// Compiler would want to generate 'zxtb' instead of 'and' because 'zxtb' has
+// predicated forms while 'and' doesn't. Since integrated assembler can't
+// handle 'mapped' instructions, we need to encode 'zxtb' same as 'and' where
+// immediate operand is set to '255'.
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_ZXTB: ALU32Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rs),
+ "$Rd = zxtb($Rs)", [] > { // Rd = and(Rs,255)
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<10> s10 = 255; // constant field, not an operand
+
+ let IClass = 0b0111;
+
+ let Inst{27-22} = 0b011000; // matches T_ALU32ri_logical's 0b0110 followed by MinOp 0b00 (A2_andir)
+ let Inst{4-0} = Rd;
+ let Inst{20-16} = Rs;
+ let Inst{21} = s10{9};
+ let Inst{13-5} = s10{8-0};
+}
+
+// Rd=zxtb(Rs): assembler mapped to "Rd=and(Rs,#255)".
+multiclass ZXTB_base <string mnemonic, bits<3> minOp> { // like ALU32_2op_base, but the unpredicated def is T_ZXTB
+ let BaseOpcode = mnemonic in {
+ let isPredicable = 1, hasSideEffects = 0 in
+ def A2_#NAME : T_ZXTB; // mnemonic/minOp are not used for this def
+
+ let isPredicated = 1, hasSideEffects = 0 in {
+ defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
+ defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
+ }
+ }
+}
+
+defm zxtb : ZXTB_base<"zxtb",0b100>, PredNewRel;
+
+//===----------------------------------------------------------------------===//
+// Template class for the 64-bit vector ALU ops below (add, avg, navg, sub, max, min)
+//===----------------------------------------------------------------------===//
+
+class T_VectALU_64 <string opc, bits<3> majOp, bits<3> minOp,
+ bit isSat, bit isRnd, bit isCrnd, bit SwapOps >
+ : ALU64_rr < (outs DoubleRegs:$Rdd),
+ (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Rdd = "#opc#"($Rss, $Rtt)"#!if(isRnd, ":rnd", "") // suffixes print in the order :rnd, :crnd, :sat
+ #!if(isCrnd,":crnd","")
+ #!if(isSat, ":sat", ""),
+ [], "", ALU64_tc_2_SLOT23 > {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1101;
+
+ let Inst{27-24} = 0b0011;
+ let Inst{23-21} = majOp;
+ let Inst{20-16} = !if (SwapOps, Rtt, Rss); // SwapOps exchanges the two source fields in the encoding only
+ let Inst{12-8} = !if (SwapOps, Rss, Rtt);
+ let Inst{7-5} = minOp;
+ let Inst{4-0} = Rdd;
+ }
+
+// ALU64 - Vector add
+// Rdd=vadd[u][bhw](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in { // replaces the ALU64_tc_2_SLOT23 passed by T_VectALU_64
+ def A2_vaddub : T_VectALU_64 < "vaddub", 0b000, 0b000, 0, 0, 0, 0>;
+ def A2_vaddh : T_VectALU_64 < "vaddh", 0b000, 0b010, 0, 0, 0, 0>;
+ def A2_vaddw : T_VectALU_64 < "vaddw", 0b000, 0b101, 0, 0, 0, 0>;
+}
+
+// Rdd=vadd[u][bhw](Rss,Rtt):sat
+let Defs = [USR_OVF] in { // the ":sat" forms list USR_OVF in Defs
+ def A2_vaddubs : T_VectALU_64 < "vaddub", 0b000, 0b001, 1, 0, 0, 0>;
+ def A2_vaddhs : T_VectALU_64 < "vaddh", 0b000, 0b011, 1, 0, 0, 0>;
+ def A2_vadduhs : T_VectALU_64 < "vadduh", 0b000, 0b100, 1, 0, 0, 0>;
+ def A2_vaddws : T_VectALU_64 < "vaddw", 0b000, 0b110, 1, 0, 0, 0>;
+}
+
+// ALU64 - Vector average
+// Rdd=vavg[u][bhw](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+ def A2_vavgub : T_VectALU_64 < "vavgub", 0b010, 0b000, 0, 0, 0, 0>;
+ def A2_vavgh : T_VectALU_64 < "vavgh", 0b010, 0b010, 0, 0, 0, 0>;
+ def A2_vavguh : T_VectALU_64 < "vavguh", 0b010, 0b101, 0, 0, 0, 0>;
+ def A2_vavgw : T_VectALU_64 < "vavgw", 0b011, 0b000, 0, 0, 0, 0>;
+ def A2_vavguw : T_VectALU_64 < "vavguw", 0b011, 0b011, 0, 0, 0, 0>;
+}
+
+// Rdd=vavg[u][bhw](Rss,Rtt)[:rnd|:crnd] (name suffix "r" = :rnd, "cr" = :crnd)
+def A2_vavgubr : T_VectALU_64 < "vavgub", 0b010, 0b001, 0, 1, 0, 0>;
+def A2_vavghr : T_VectALU_64 < "vavgh", 0b010, 0b011, 0, 1, 0, 0>;
+def A2_vavghcr : T_VectALU_64 < "vavgh", 0b010, 0b100, 0, 0, 1, 0>;
+def A2_vavguhr : T_VectALU_64 < "vavguh", 0b010, 0b110, 0, 1, 0, 0>;
+
+def A2_vavgwr : T_VectALU_64 < "vavgw", 0b011, 0b001, 0, 1, 0, 0>;
+def A2_vavgwcr : T_VectALU_64 < "vavgw", 0b011, 0b010, 0, 0, 1, 0>;
+def A2_vavguwr : T_VectALU_64 < "vavguw", 0b011, 0b100, 0, 1, 0, 0>;
+
+// Rdd=vnavg[hw](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+ def A2_vnavgh : T_VectALU_64 < "vnavgh", 0b100, 0b000, 0, 0, 0, 1>;
+ def A2_vnavgw : T_VectALU_64 < "vnavgw", 0b100, 0b011, 0, 0, 0, 1>;
+}
+
+// Rdd=vnavg[hw](Rss,Rtt)[:rnd|:crnd]:sat
+let Defs = [USR_OVF] in {
+ def A2_vnavghr : T_VectALU_64 < "vnavgh", 0b100, 0b001, 1, 1, 0, 1>; // :rnd:sat
+ def A2_vnavghcr : T_VectALU_64 < "vnavgh", 0b100, 0b010, 1, 0, 1, 1>; // :crnd:sat
+ def A2_vnavgwr : T_VectALU_64 < "vnavgw", 0b100, 0b100, 1, 1, 0, 1>; // :rnd:sat
+ def A2_vnavgwcr : T_VectALU_64 < "vnavgw", 0b100, 0b110, 1, 0, 1, 1>; // :crnd:sat
+}
+
+// Rdd=vsub[u][bhw](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+ def A2_vsubub : T_VectALU_64 < "vsubub", 0b001, 0b000, 0, 0, 0, 1>;
+ def A2_vsubh : T_VectALU_64 < "vsubh", 0b001, 0b010, 0, 0, 0, 1>;
+ def A2_vsubw : T_VectALU_64 < "vsubw", 0b001, 0b101, 0, 0, 0, 1>;
+}
+
+// Rdd=vsub[u][bhw](Rss,Rtt):sat
+let Defs = [USR_OVF] in {
+ def A2_vsububs : T_VectALU_64 < "vsubub", 0b001, 0b001, 1, 0, 0, 1>;
+ def A2_vsubhs : T_VectALU_64 < "vsubh", 0b001, 0b011, 1, 0, 0, 1>;
+ def A2_vsubuhs : T_VectALU_64 < "vsubuh", 0b001, 0b100, 1, 0, 0, 1>;
+ def A2_vsubws : T_VectALU_64 < "vsubw", 0b001, 0b110, 1, 0, 0, 1>;
+}
+
+// Rdd=vmax[u][bhw](Rss,Rtt)
+def A2_vmaxb : T_VectALU_64 < "vmaxb", 0b110, 0b110, 0, 0, 0, 1>;
+def A2_vmaxub : T_VectALU_64 < "vmaxub", 0b110, 0b000, 0, 0, 0, 1>;
+def A2_vmaxh : T_VectALU_64 < "vmaxh", 0b110, 0b001, 0, 0, 0, 1>;
+def A2_vmaxuh : T_VectALU_64 < "vmaxuh", 0b110, 0b010, 0, 0, 0, 1>;
+def A2_vmaxw : T_VectALU_64 < "vmaxw", 0b110, 0b011, 0, 0, 0, 1>;
+def A2_vmaxuw : T_VectALU_64 < "vmaxuw", 0b101, 0b101, 0, 0, 0, 1>; // majOp 0b101 here, unlike the other vmax defs (0b110)
+
+// Rdd=vmin[u][bhw](Rss,Rtt)
+def A2_vminb : T_VectALU_64 < "vminb", 0b110, 0b111, 0, 0, 0, 1>; // majOp 0b110 here, unlike the other vmin defs (0b101)
+def A2_vminub : T_VectALU_64 < "vminub", 0b101, 0b000, 0, 0, 0, 1>;
+def A2_vminh : T_VectALU_64 < "vminh", 0b101, 0b001, 0, 0, 0, 1>;
+def A2_vminuh : T_VectALU_64 < "vminuh", 0b101, 0b010, 0, 0, 0, 1>;
+def A2_vminw : T_VectALU_64 < "vminw", 0b101, 0b011, 0, 0, 0, 1>;
+def A2_vminuw : T_VectALU_64 < "vminuw", 0b101, 0b100, 0, 0, 0, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template class for vector compare (two double registers in, predicate out)
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_vcmp <string Str, bits<4> minOp>
+ : ALU64_rr <(outs PredRegs:$Pd),
+ (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Pd = "#Str#"($Rss, $Rtt)", [],
+ "", ALU64_tc_2early_SLOT23> {
+ bits<2> Pd;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1101;
+
+ let Inst{27-23} = 0b00100;
+ let Inst{13} = minOp{3}; // the 4-bit minOp is split: top bit in bit 13 ...
+ let Inst{7-5} = minOp{2-0}; // ... low three bits in 7-5
+ let Inst{1-0} = Pd;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ }
+
+// Vector compare bytes ($Pd = vcmpb.xx($Rss, $Rtt))
+def A2_vcmpbeq : T_vcmp <"vcmpb.eq", 0b0110>;
+def A2_vcmpbgtu : T_vcmp <"vcmpb.gtu", 0b0111>;
+
+// Vector compare halfwords ($Pd = vcmph.xx($Rss, $Rtt))
+def A2_vcmpheq : T_vcmp <"vcmph.eq", 0b0011>;
+def A2_vcmphgt : T_vcmp <"vcmph.gt", 0b0100>;
+def A2_vcmphgtu : T_vcmp <"vcmph.gtu", 0b0101>;
+
+// Vector compare words ($Pd = vcmpw.xx($Rss, $Rtt))
+def A2_vcmpweq : T_vcmp <"vcmpw.eq", 0b0000>;
+def A2_vcmpwgt : T_vcmp <"vcmpw.gt", 0b0001>;
+def A2_vcmpwgtu : T_vcmp <"vcmpw.gtu", 0b0010>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PRED +
+//===----------------------------------------------------------------------===//
+// No bits needed. If cmp.ge is found, the assembler parser will
+// transform it to cmp.gt, subtracting 1 from the immediate.
+let isPseudo = 1 in {
+def C2_cmpgei: ALU32Inst <
+ (outs PredRegs:$Pd), (ins IntRegs:$Rs, s8_0Ext:$s8),
+ "$Pd = cmp.ge($Rs, #$s8)">;
+def C2_cmpgeui: ALU32Inst <
+ (outs PredRegs:$Pd), (ins IntRegs:$Rs, u8_0Ext:$s8), // operand is named $s8 but has the unsigned u8_0Ext type
+ "$Pd = cmp.geu($Rs, #$s8)">;
+}
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PRED -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU +
+//===----------------------------------------------------------------------===//
+// Add.
+//===----------------------------------------------------------------------===//
+// Template Class
+// Add/Subtract halfword
+// Rd=add(Rt.L,Rs.[HL])[:sat]
+// Rd=sub(Rt.L,Rs.[HL])[:sat]
+// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<<16]
+// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<<16]
+//===----------------------------------------------------------------------===//
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_XTYPE_ADD_SUB <bits<2> LHbits, bit isSat, bit hasShift, bit isSub>
+ : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
+ "$Rd = "#!if(isSub,"sub","add")#"($Rt."
+ #!if(hasShift, !if(LHbits{1},"h","l"),"l") #", $Rs." // without shift Rt is always ".l"; with shift LHbits{1} picks it
+ #!if(hasShift, !if(LHbits{0},"h)","l)"), !if(LHbits{1},"h)","l)")) // Rs half: LHbits{0} with shift, LHbits{1} without
+ #!if(isSat,":sat","")
+ #!if(hasShift,":<<16",""), [], "", ALU64_tc_1_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rt;
+ bits<5> Rs;
+ let IClass = 0b1101;
+
+ let Inst{27-23} = 0b01010;
+ let Inst{22} = hasShift;
+ let Inst{21} = isSub;
+ let Inst{7} = isSat;
+ let Inst{6-5} = LHbits;
+ let Inst{4-0} = Rd;
+ let Inst{12-8} = Rt;
+ let Inst{20-16} = Rs;
+ }
+
+//Rd=sub(Rt.L,Rs.[LH])
+def A2_subh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 1>; // $Rd = sub($Rt.l, $Rs.l)
+def A2_subh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 1>; // $Rd = sub($Rt.l, $Rs.h)
+
+//Rd=add(Rt.L,Rs.[LH])
+def A2_addh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 0>; // $Rd = add($Rt.l, $Rs.l)
+def A2_addh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 0>; // $Rd = add($Rt.l, $Rs.h)
+
+let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
+ //Rd=sub(Rt.L,Rs.[LH]):sat
+ def A2_subh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 1>;
+ def A2_subh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 1>;
+
+ //Rd=add(Rt.L,Rs.[LH]):sat
+ def A2_addh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 0>;
+ def A2_addh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 0>;
+}
+
+//Rd=sub(Rt.[LH],Rs.[LH]):<<16
+def A2_subh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 1>; // $Rd = sub($Rt.l, $Rs.l):<<16
+def A2_subh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 1>; // $Rd = sub($Rt.l, $Rs.h):<<16
+def A2_subh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 1>; // $Rd = sub($Rt.h, $Rs.l):<<16
+def A2_subh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 1>; // $Rd = sub($Rt.h, $Rs.h):<<16
+
+//Rd=add(Rt.[LH],Rs.[LH]):<<16
+def A2_addh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 0>; // $Rd = add($Rt.l, $Rs.l):<<16
+def A2_addh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 0>; // $Rd = add($Rt.l, $Rs.h):<<16
+def A2_addh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 0>; // $Rd = add($Rt.h, $Rs.l):<<16
+def A2_addh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 0>; // $Rd = add($Rt.h, $Rs.h):<<16
+
+let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
+ //Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
+ def A2_subh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 1>;
+ def A2_subh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 1>;
+ def A2_subh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 1>;
+ def A2_subh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 1>;
+
+ //Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
+ def A2_addh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 0>;
+ def A2_addh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 0>;
+ def A2_addh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 0>;
+ def A2_addh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 0>;
+}
+
+// Rd = parity(Rss, Rtt): two DoubleRegs sources, one IntRegs result.
+let hasSideEffects = 0, hasNewValue = 1 in
+def S2_parityp: ALU64Inst<(outs IntRegs:$Rd),
+ (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+ "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+ let Inst{27-24} = 0b0000;
+ let Inst{20-16} = Rs; // first source
+ let Inst{12-8} = Rt; // second source
+ let Inst{4-0} = Rd; // destination
+}
+
+// Rd = min[u](Rt, Rs) / Rd = max[u](Rt, Rs).
+//   isMax      - prints "max" instead of "min", selects Inst{22-21}
+//                (0b10 for max, 0b01 for min) and swaps which source
+//                register lands in which encoding field.
+//   isUnsigned - appends "u" to the mnemonic and sets Inst{7}.
+let hasNewValue = 1, opNewValue = 0, hasSideEffects = 0 in
+class T_XTYPE_MIN_MAX < bit isMax, bit isUnsigned >
+  : ALU64Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
+  "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
+          #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rt;
+  bits<5> Rs;
+
+  let IClass = 0b1101;
+
+  // Encoding fields, most-significant first.
+  let Inst{27-23} = 0b01011;
+  let Inst{22-21} = !if(isMax, 0b10, 0b01);
+  let Inst{20-16} = !if(isMax, Rt, Rs);   // max: Rt here, min: Rs here
+  let Inst{12-8}  = !if(isMax, Rs, Rt);   // max: Rs here, min: Rt here
+  let Inst{7}     = isUnsigned;
+  let Inst{4-0}   = Rd;
+}
+
+// <isMax, isUnsigned>
+def A2_min  : T_XTYPE_MIN_MAX < 0, 0 >;
+def A2_minu : T_XTYPE_MIN_MAX < 0, 1 >;
+def A2_max  : T_XTYPE_MIN_MAX < 1, 0 >;
+def A2_maxu : T_XTYPE_MIN_MAX < 1, 1 >;
+
+// Compare of two DoubleRegs operands writing a predicate register:
+//   Pd = <mnemonic>(Rss, Rtt)
+// mnemonic - compare mnemonic printed in the assembly string.
+// MinOp    - value placed in Inst{7-5}; distinguishes the compares below.
+// IsComm   - forwarded to isCommutable.
+class T_cmp64_rr<string mnemonic, bits<3> MinOp, bit IsComm>
+ : ALU64_rr<(outs PredRegs:$Pd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+ "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", ALU64_tc_2early_SLOT23> {
+ let isCompare = 1;
+ let isCommutable = IsComm;
+ let hasSideEffects = 0;
+
+ bits<2> Pd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+ let Inst{27-21} = 0b0010100;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = MinOp;
+ let Inst{1-0} = Pd;
+}
+
+// Only cmp.eq is marked commutable.
+def C2_cmpeqp : T_cmp64_rr<"cmp.eq", 0b000, 1>;
+def C2_cmpgtp : T_cmp64_rr<"cmp.gt", 0b010, 0>;
+def C2_cmpgtup : T_cmp64_rr<"cmp.gtu", 0b100, 0>;
+
+// Rdd = vmux(Pu, Rss, Rtt): predicate operand plus two DoubleRegs sources,
+// DoubleRegs result.
+def C2_vmux : ALU64_rr<(outs DoubleRegs:$Rd),
+ (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
+ "$Rd = vmux($Pu, $Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
+ let hasSideEffects = 0;
+
+ bits<5> Rd;
+ bits<2> Pu;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+ let Inst{27-24} = 0b0001;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{6-5} = Pu; // predicate operand
+ let Inst{4-0} = Rd;
+}
+
+// Generic two-source DoubleRegs ALU template:
+//   Rdd = <mnemonic>(Rss, <Op2Pfx>Rtt)<suffix>
+// mnemonic - operation name printed in the assembly string.
+// suffix   - text appended after the closing parenthesis (e.g. ":sat").
+// RegType  - value placed in Inst{27-24}.
+// MajOp    - value placed in Inst{23-21}.
+// MinOp    - value placed in Inst{7-5}.
+// OpsRev   - when set, Rt is encoded in Inst{20-16} and Rs in Inst{12-8};
+//            otherwise Rs is in Inst{20-16} and Rt in Inst{12-8}.
+// IsComm   - forwarded to isCommutable.
+// Op2Pfx   - text printed immediately before $Rt (e.g. "~").
+class T_ALU64_rr<string mnemonic, string suffix, bits<4> RegType,
+ bits<3> MajOp, bits<3> MinOp, bit OpsRev, bit IsComm,
+ string Op2Pfx>
+ : ALU64_rr<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+ "$Rd = " #mnemonic# "($Rs, " #Op2Pfx# "$Rt)" #suffix, [],
+ "", ALU64_tc_1_SLOT23> {
+ let hasSideEffects = 0;
+ let isCommutable = IsComm;
+
+ bits<5> Rs;
+ bits<5> Rt;
+ bits<5> Rd;
+
+ let IClass = 0b1101;
+ let Inst{27-24} = RegType;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = !if (OpsRev,Rt,Rs);
+ let Inst{12-8} = !if (OpsRev,Rs,Rt);
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rd;
+}
+
+// Arithmetic specialisation of T_ALU64_rr: RegType is fixed to 0b0011, the
+// suffix is ":sat" when IsSat is set, and no operand prefix is printed.
+class T_ALU64_arith<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit IsSat,
+ bit OpsRev, bit IsComm>
+ : T_ALU64_rr<mnemonic, !if(IsSat,":sat",""), 0b0011, MajOp, MinOp, OpsRev,
+ IsComm, "">;
+
+// The brace-less "let isAdd = 1 in" applies to A2_addp only.
+let isAdd = 1 in
+def A2_addp : T_ALU64_arith<"add", 0b000, 0b111, 0, 0, 1>;
+def A2_subp : T_ALU64_arith<"sub", 0b001, 0b111, 0, 1, 0>; // OpsRev = 1
+
+// Logical specialisation of T_ALU64_rr: RegType = 0b0011, MajOp = 0b111,
+// no suffix.  IsNeg prints "~" in front of the second source operand.
+class T_ALU64_logical<string mnemonic, bits<3> MinOp, bit OpsRev, bit IsComm,
+ bit IsNeg>
+ : T_ALU64_rr<mnemonic, "", 0b0011, 0b111, MinOp, OpsRev, IsComm,
+ !if(IsNeg,"~","")>;
+
+// <mnemonic, MinOp, OpsRev, IsComm, IsNeg>
+def A2_andp : T_ALU64_logical<"and", 0b000, 0, 1, 0>;
+def A2_orp : T_ALU64_logical<"or", 0b010, 0, 1, 0>;
+def A2_xorp : T_ALU64_logical<"xor", 0b100, 0, 1, 0>;
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/BIT +
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+// ALU64/BIT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/PERM +
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+// ALU64/PERM -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CR +
+//===----------------------------------------------------------------------===//
+// Logical reductions on predicates.
+
+// Looping instructions.
+
+// Pipelined looping instructions.
+
+// Logical operations on predicates.
+// Single-operand predicate operation: Pd = <MnOp>(Ps).
+// MnOp   - mnemonic printed in the assembly string.
+// OpBits - value placed in Inst{22-21}.
+let hasSideEffects = 0 in
+class T_LOGICAL_1OP<string MnOp, bits<2> OpBits>
+ : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps),
+ "$Pd = " # MnOp # "($Ps)", [], "", CR_tc_2early_SLOT23> {
+ bits<2> Pd;
+ bits<2> Ps;
+
+ let IClass = 0b0110;
+ let Inst{27-23} = 0b10111;
+ let Inst{22-21} = OpBits;
+ let Inst{20} = 0b0;
+ let Inst{17-16} = Ps;
+ let Inst{13} = 0b0;
+ let Inst{1-0} = Pd;
+}
+
+def C2_any8 : T_LOGICAL_1OP<"any8", 0b00>;
+def C2_all8 : T_LOGICAL_1OP<"all8", 0b01>;
+def C2_not : T_LOGICAL_1OP<"not", 0b10>;
+
+// Two-operand predicate operation: Pd = <MnOp>(Ps, [!]Pt).
+// MnOp   - mnemonic printed in the assembly string.
+// OpBits - value placed in Inst{23-21}.
+// IsNeg  - prints "!" in front of $Pt.
+// Rev    - when set, Pt is encoded in Inst{17-16} and Ps in Inst{9-8};
+//          otherwise Ps is in Inst{17-16} and Pt in Inst{9-8}.
+let hasSideEffects = 0 in
+class T_LOGICAL_2OP<string MnOp, bits<3> OpBits, bit IsNeg, bit Rev>
+ : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps, PredRegs:$Pt),
+ "$Pd = " # MnOp # "($Ps, " # !if (IsNeg,"!","") # "$Pt)",
+ [], "", CR_tc_2early_SLOT23> {
+ bits<2> Pd;
+ bits<2> Ps;
+ bits<2> Pt;
+
+ let IClass = 0b0110;
+ let Inst{27-24} = 0b1011;
+ let Inst{23-21} = OpBits;
+ let Inst{20} = 0b0;
+ let Inst{17-16} = !if(Rev,Pt,Ps); // Ps and Pt swap encoding fields for
+ let Inst{13} = 0b0; // the instructions that pass Rev = 1.
+ let Inst{9-8} = !if(Rev,Ps,Pt);
+ let Inst{1-0} = Pd;
+}
+
+// <MnOp, OpBits, IsNeg, Rev>; only C2_xor keeps the operands unswapped.
+def C2_and : T_LOGICAL_2OP<"and", 0b000, 0, 1>;
+def C2_or : T_LOGICAL_2OP<"or", 0b001, 0, 1>;
+def C2_xor : T_LOGICAL_2OP<"xor", 0b010, 0, 0>;
+def C2_andn : T_LOGICAL_2OP<"and", 0b011, 1, 1>;
+def C2_orn : T_LOGICAL_2OP<"or", 0b111, 1, 1>;
+
+// Rd = vitpack(Ps, Pt): two predicate sources, IntRegs result.
+let hasSideEffects = 0, hasNewValue = 1 in
+def C2_vitpack : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps, PredRegs:$Pt),
+ "$Rd = vitpack($Ps, $Pt)", [], "", S_2op_tc_1_SLOT23> {
+ bits<5> Rd;
+ bits<2> Ps;
+ bits<2> Pt;
+
+ let IClass = 0b1000;
+ let Inst{27-24} = 0b1001;
+ let Inst{22-21} = 0b00;
+ let Inst{17-16} = Ps;
+ let Inst{9-8} = Pt;
+ let Inst{4-0} = Rd;
+}
+
+// Rdd = mask(Pt): one predicate source, DoubleRegs result.
+let hasSideEffects = 0 in
+def C2_mask : SInst<(outs DoubleRegs:$Rd), (ins PredRegs:$Pt),
+ "$Rd = mask($Pt)", [], "", S_2op_tc_1_SLOT23> {
+ bits<5> Rd;
+ bits<2> Pt;
+
+ let IClass = 0b1000;
+ let Inst{27-24} = 0b0110;
+ let Inst{9-8} = Pt;
+ let Inst{4-0} = Rd;
+}
+
+// User control register transfer.
+//===----------------------------------------------------------------------===//
+// CR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// JR +
+//===----------------------------------------------------------------------===//
+
+// String helper: S is "if ([!]<CReg>[.new]) " (note the trailing space).
+// True = 0 inserts "!"; New = 1 appends ".new" to the register name.
+class CondStr<string CReg, bit True, bit New> {
+ string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
+}
+// String helper: S is the mnemonic followed by ":t" (Taken = 1) or ":nt".
+// NOTE(review): the New parameter is not used in this class body.
+class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
+ string S = Mnemonic # !if(Taken, ":t", ":nt");
+}
+
+// Unconditional direct jump: "jump <ExtStr><target>".
+// The target is an extendable signed 24-bit operand (operand 0) with
+// opExtentAlign = 2; dst{23-2} is encoded and dst{1-0} is not.
+let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
+ isPredicable = 1,
+ isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
+ opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
+class T_JMP<string ExtStr>
+ : JInst_CJUMP_UCJUMP<(outs), (ins brtarget:$dst),
+ "jump " # ExtStr # "$dst",
+ [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT> {
+ bits<24> dst;
+ let IClass = 0b0101;
+
+ let Inst{27-25} = 0b100;
+ let Inst{24-16} = dst{23-15}; // upper part of the target
+ let Inst{13-1} = dst{14-2}; // lower part of the target
+}
+
+// Conditional direct jump:
+//   if ([!]Pu[.new]) jump:t|:nt <ExtStr><target>
+//   PredNot   - 1 selects the negated ("!") predicate sense.
+//   isPredNew - 1 prints ".new" after the predicate register.
+//   isTak     - 1 prints ":t", 0 prints ":nt".
+//   ExtStr    - text placed immediately before the target operand.
+// The target is an extendable signed 17-bit operand (operand 1) whose bits
+// are scattered over four encoding fields; dst{1-0} is not encoded.
+let isBranch = 1, Defs = [PC], hasSideEffects = 0, isPredicated = 1,
+    isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
+    opExtentBits = 17, opExtentAlign = 2, InputType = "imm" in
+class T_JMP_c<bit PredNot, bit isPredNew, bit isTak, string ExtStr>
+  : JInst_CJUMP_UCJUMP<(outs), (ins PredRegs:$src, brtarget:$dst),
+        CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
+        JumpOpcStr<"jump", isPredNew, isTak>.S # " " #
+        ExtStr # "$dst",
+        [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>, ImmRegRel {
+  bits<2>  src;
+  bits<17> dst;
+
+  let isTaken           = isTak;
+  let isPredicatedFalse = PredNot;
+  let isPredicatedNew   = isPredNew;
+
+  let IClass = 0b0101;
+
+  // Encoding fields, most-significant first.
+  let Inst{27-24} = 0b1100;
+  let Inst{23-22} = dst{16-15};
+  let Inst{21}    = PredNot;
+  let Inst{20-16} = dst{14-10};
+  let Inst{13}    = dst{9};
+  let Inst{12}    = isTak;
+  let Inst{11}    = isPredNew;
+  let Inst{9-8}   = src;
+  let Inst{7-1}   = dst{8-2};
+}
+
+// Emits the three predicated variants of a direct jump for one predicate
+// sense: NAME (old predicate, :nt), NAME#newpt (.new, :t) and NAME#new
+// (.new, :nt).  Arguments to T_JMP_c are <PredNot, isPredNew, isTak, ExtStr>.
+multiclass JMP_Pred<bit PredNot, string ExtStr> {
+ def NAME : T_JMP_c<PredNot, 0, 0, ExtStr>; // not taken
+ // Predicate new
+ def NAME#newpt : T_JMP_c<PredNot, 1, 1, ExtStr>; // taken
+ def NAME#new : T_JMP_c<PredNot, 1, 0, ExtStr>; // not taken
+}
+
+// Emits the unconditional jump (NAME) plus its true ("t") and false ("f")
+// predicated families, all sharing BaseOpcode = BaseOp.
+multiclass JMP_base<string BaseOp, string ExtStr> {
+ let BaseOpcode = BaseOp in {
+ def NAME : T_JMP<ExtStr>;
+ defm t : JMP_Pred<0, ExtStr>;
+ defm f : JMP_Pred<1, ExtStr>;
+ }
+}
+
+// Jumps to address stored in a register, JUMPR_MISC
+// if ([[!]P[.new]]) jumpr[:t/nt] Rs
+//
+// T_JMPr is the unconditional form: "jumpr Rs", with the register encoded
+// in Inst{20-16}.
+let isBranch = 1, isIndirectBranch = 1, isBarrier = 1, Defs = [PC],
+ isPredicable = 1, hasSideEffects = 0, InputType = "reg" in
+class T_JMPr
+ : JRInst<(outs), (ins IntRegs:$dst),
+ "jumpr $dst", [], "", J_tc_2early_SLOT2> {
+ bits<5> dst;
+
+ let IClass = 0b0101;
+ let Inst{27-21} = 0b0010100;
+ let Inst{20-16} = dst;
+}
+
+// Conditional register jump: if ([!]Pu[.new]) jumpr:t|:nt Rs.
+// PredNot   - 1 selects the negated predicate sense (Inst{21}).
+// isPredNew - 1 prints ".new" after the predicate (Inst{11}).
+// isTak     - 1 prints ":t", 0 prints ":nt" (Inst{12}).
+let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
+ hasSideEffects = 0, InputType = "reg" in
+class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>
+ : JRInst <(outs), (ins PredRegs:$src, IntRegs:$dst),
+ CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
+ JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst", [],
+ "", J_tc_2early_SLOT2> {
+
+ let isTaken = isTak;
+ let isPredicatedFalse = PredNot;
+ let isPredicatedNew = isPredNew;
+ bits<2> src;
+ bits<5> dst;
+
+ let IClass = 0b0101;
+
+ let Inst{27-22} = 0b001101;
+ let Inst{21} = PredNot;
+ let Inst{20-16} = dst; // jump target register
+ let Inst{12} = isTak;
+ let Inst{11} = isPredNew;
+ let Inst{9-8} = src; // predicate register
+}
+
+// Emits the three predicated variants of a register jump for one predicate
+// sense.  Arguments to T_JMPr_c are <PredNot, isPredNew, isTak>.
+multiclass JMPR_Pred<bit PredNot> {
+ def NAME : T_JMPr_c<PredNot, 0, 0>; // not taken
+ // Predicate new
+ def NAME#newpt : T_JMPr_c<PredNot, 1, 1>; // taken
+ def NAME#new : T_JMPr_c<PredNot, 1, 0>; // not taken
+}
+
+// Emits the unconditional register jump (NAME) plus its true ("t") and
+// false ("f") predicated families, all sharing BaseOpcode = BaseOp.
+multiclass JMPR_base<string BaseOp> {
+ let BaseOpcode = BaseOp in {
+ def NAME : T_JMPr;
+ defm t : JMPR_Pred<0>;
+ defm f : JMPR_Pred<1>;
+ }
+}
+
+// Indirect call through a register, optionally predicated:
+//   callr Rs
+//   if ([!]Pu) callr Rs
+//   isPred    - 1 selects a predicated form; the caller must then pass an
+//               InputDag that contains PredRegs:$Pu.
+//   isPredNot - 1 selects the "!" predicate sense (Inst{21}).
+//   InputDag  - input operand list; defaults to the unpredicated
+//               (ins IntRegs:$Rs).
+let isCall = 1, hasSideEffects = 1 in
+class JUMPR_MISC_CALLR<bit isPred, bit isPredNot,
+                       dag InputDag = (ins IntRegs:$Rs)>
+  : JRInst<(outs), InputDag,
+           !if(isPred, !if(isPredNot, "if (!$Pu) callr $Rs",
+                                      "if ($Pu) callr $Rs"),
+                       "callr $Rs"),
+           [], "", J_tc_2early_SLOT2> {
+  bits<5> Rs;
+  bits<2> Pu;
+
+  let isPredicated      = isPred;
+  let isPredicatedFalse = isPredNot;
+
+  let IClass = 0b0101;
+
+  // Encoding fields, most-significant first.
+  let Inst{27-25} = 0b000;
+  let Inst{24-23} = !if (isPred, 0b10, 0b01);
+  let Inst{22}    = 0;
+  let Inst{21}    = isPredNot;
+  let Inst{20-16} = Rs;
+  let Inst{9-8}   = !if (isPred, Pu, 0b00);
+}
+
+// Predicated register calls; both take Defs from VolatileV3.Regs.
+let Defs = VolatileV3.Regs in {
+  def J2_callrt : JUMPR_MISC_CALLR<1, 0, (ins PredRegs:$Pu, IntRegs:$Rs)>;
+  def J2_callrf : JUMPR_MISC_CALLR<1, 1, (ins PredRegs:$Pu, IntRegs:$Rs)>;
+}
+
+// Instantiate the direct and register jump families as terminators.
+let isTerminator = 1, hasSideEffects = 0 in {
+ defm J2_jump : JMP_base<"JMP", "">, PredNewRel;
+
+ defm J2_jumpr : JMPR_base<"JMPr">, PredNewRel;
+
+ // Return pseudo built from the register-jump templates (codegen only).
+ let isReturn = 1, isPseudo = 1, isCodeGenOnly = 1 in
+ defm PS_jmpret : JMPR_base<"JMPret">, PredNewRel;
+}
+
+// Old-predicate jumps with the ":t" hint (isPredNew = 0, isTak = 1),
+// tagged with validSubTargets = HasV60SubT.
+let validSubTargets = HasV60SubT in
+multiclass JMPpt_base<string BaseOp> {
+ let BaseOpcode = BaseOp in {
+ def tpt : T_JMP_c <0, 0, 1, "">; // Predicate true - taken
+ def fpt : T_JMP_c <1, 0, 1, "">; // Predicate false - taken
+ }
+}
+
+// Register-jump counterpart of JMPpt_base.
+let validSubTargets = HasV60SubT in
+multiclass JMPRpt_base<string BaseOp> {
+ let BaseOpcode = BaseOp in {
+ def tpt : T_JMPr_c<0, 0, 1>; // predicate true - taken
+ def fpt : T_JMPr_c<1, 0, 1>; // predicate false - taken
+ }
+}
+
+// Produces J2_jumprtpt/J2_jumprfpt and J2_jumptpt/J2_jumpfpt.
+defm J2_jumpr : JMPRpt_base<"JMPr">;
+defm J2_jump : JMPpt_base<"JMP">;
+
+// A return through builtin_eh_return.
+// Codegen-only register jump marked as a non-predicable return that uses R28.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 0,
+ isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
+def EH_RETURN_JMPR : T_JMPr;
+
+//===----------------------------------------------------------------------===//
+// JR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LD +
+//===----------------------------------------------------------------------===//
+
+// Load - Base with Immediate offset addressing mode
+//   Rd = <mnemonic>(Rs + #offset)
+// mnemonic - memory access mnemonic printed in the assembly string.
+// RC       - register class of the loaded value.
+// MajOp    - value placed in Inst{24-21}.
+// ImmOp    - offset operand; its name (s11_0Ext..s11_3Ext) selects which 11
+//            bits of the byte offset are encoded and the extent width.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, AddedComplexity = 20 in
+class T_load_io <string mnemonic, RegisterClass RC, bits<4> MajOp,
+ Operand ImmOp>
+ : LDInst<(outs RC:$dst), (ins IntRegs:$src1, ImmOp:$offset),
+ "$dst = "#mnemonic#"($src1 + #$offset)", []>, AddrModeRel {
+ bits<4> name; // NOTE(review): not referenced anywhere in this class body.
+ bits<5> dst;
+ bits<5> src1;
+ bits<14> offset;
+ bits<11> offsetBits;
+
+ string ImmOpStr = !cast<string>(ImmOp);
+ // Drop the low 3/2/1/0 offset bits according to the operand's scaling.
+ let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), offset{13-3},
+ !if (!eq(ImmOpStr, "s11_2Ext"), offset{12-2},
+ !if (!eq(ImmOpStr, "s11_1Ext"), offset{11-1},
+ /* s11_0Ext */ offset{10-0})));
+ let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
+ !if (!eq(ImmOpStr, "s11_2Ext"), 13,
+ !if (!eq(ImmOpStr, "s11_1Ext"), 12,
+ /* s11_0Ext */ 11)));
+ // DoubleRegs results are not marked hasNewValue.
+ let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+ let IClass = 0b1001;
+
+ let Inst{27} = 0b0;
+ let Inst{26-25} = offsetBits{10-9};
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = src1;
+ let Inst{13-5} = offsetBits{8-0};
+ let Inst{4-0} = dst;
+ }
+
+// Predicated load, base + unsigned immediate offset:
+//   if ([!]Pt[.new]) Rd = <mnemonic>(Rs + #offset)
+//   mnemonic  - memory access mnemonic printed in the assembly string.
+//   RC        - register class of the loaded value.
+//   MajOp     - value placed in Inst{24-21}.
+//   ImmOp     - offset operand; its name (u6_0Ext..u6_3Ext) selects which 6
+//               bits of the byte offset are encoded and the extent width.
+//   isNot     - 1 selects the negated predicate sense (Inst{26}).
+//   isPredNew - 1 prints ".new" after the predicate (Inst{25}).
+let opExtendable = 3, isExtentSigned = 0, isPredicated = 1 in
+class T_pload_io <string mnemonic, RegisterClass RC, bits<4>MajOp,
+                  Operand ImmOp, bit isNot, bit isPredNew>
+  : LDInst<(outs RC:$dst),
+           (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  "if ("#!if(isNot, "!$src1", "$src1")
+       #!if(isPredNew, ".new", "")
+       #") $dst = "#mnemonic#"($src2 + #$offset)",
+  [],"", V2LDST_tc_ld_SLOT01> , AddrModeRel {
+  bits<5> dst;
+  bits<2> src1;
+  bits<5> src2;
+  bits<9> offset;
+  bits<6> offsetBits;
+  string ImmOpStr = !cast<string>(ImmOp);
+
+  // Drop the low 3/2/1/0 offset bits according to the operand's scaling.
+  let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), offset{8-3},
+                   !if (!eq(ImmOpStr, "u6_2Ext"), offset{7-2},
+                   !if (!eq(ImmOpStr, "u6_1Ext"), offset{6-1},
+                                 /* u6_0Ext */ offset{5-0})));
+  let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
+                     !if (!eq(ImmOpStr, "u6_2Ext"), 8,
+                     !if (!eq(ImmOpStr, "u6_1Ext"), 7,
+                                   /* u6_0Ext */ 6)));
+  // Key hasNewValue on the result register class, as T_load_io does, rather
+  // than on the immediate operand's name: DoubleRegs results are never
+  // marked hasNewValue, whatever offset scaling is used.
+  let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+  let isPredicatedNew = isPredNew;
+  let isPredicatedFalse = isNot;
+
+  let IClass = 0b0100;
+
+  let Inst{27}    = 0b0;
+  let Inst{26}    = isNot;
+  let Inst{25}    = isPredNew;
+  let Inst{24-21} = MajOp;
+  let Inst{20-16} = src2;
+  let Inst{13}    = 0b0;
+  let Inst{12-11} = src1;
+  let Inst{10-5}  = offsetBits;
+  let Inst{4-0}   = dst;
+}
+
+// Emits the base+offset load family for one access type:
+//   L2_<NAME>_io                    - unpredicated (marked isPredicable)
+//   L2_p<NAME>{t,f}_io              - predicated true / false
+//   L2_p<NAME>{tnew,fnew}_io        - predicated on a .new predicate
+// ImmOp is the offset operand of the unpredicated form, predImmOp that of
+// the predicated forms.  CextOp is used for CextOpcode and, with "_indexed"
+// appended, for BaseOpcode.
+let isExtendable = 1, hasSideEffects = 0, addrMode = BaseImmOffset in
+multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+ Operand ImmOp, Operand predImmOp, bits<4>MajOp> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+ let isPredicable = 1 in
+ def L2_#NAME#_io : T_load_io <mnemonic, RC, MajOp, ImmOp>;
+
+ // Predicated
+ def L2_p#NAME#t_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 0>;
+ def L2_p#NAME#f_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 0>;
+
+ // Predicated new
+ def L2_p#NAME#tnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 1>;
+ def L2_p#NAME#fnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 1>;
+ }
+}
+
+// Base+offset load instantiations, grouped by access size.  opExtentAlign
+// matches the scaling suffix of the immediate operand (_1Ext -> 1, ...).
+let accessSize = ByteAccess in {
+ defm loadrb: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext, 0b1000>;
+ defm loadrub: LD_Idxd <"memub", "LDriub", IntRegs, s11_0Ext, u6_0Ext, 0b1001>;
+}
+
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+ defm loadrh: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext, 0b1010>;
+ defm loadruh: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext, 0b1011>;
+}
+
+let accessSize = WordAccess, opExtentAlign = 2 in
+defm loadri: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext, 0b1100>;
+
+let accessSize = DoubleWordAccess, opExtentAlign = 3 in
+defm loadrd: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext, 0b1110>;
+
+// membh/memubh loads: unpredicated base+offset forms only.
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+ def L2_loadbsw2_io: T_load_io<"membh", IntRegs, 0b0001, s11_1Ext>;
+ def L2_loadbzw2_io: T_load_io<"memubh", IntRegs, 0b0011, s11_1Ext>;
+}
+
+let accessSize = WordAccess, opExtentAlign = 2 in {
+ def L2_loadbzw4_io: T_load_io<"memubh", DoubleRegs, 0b0101, s11_2Ext>;
+ def L2_loadbsw4_io: T_load_io<"membh", DoubleRegs, 0b0111, s11_2Ext>;
+}
+
+// FIFO ("align") load, base + immediate offset:
+//   Ryy = <str>(Rs + #offset)
+// The DoubleRegs destination is also an input ($src1), tied via
+// "$src1 = $dst".  ImmOp's name (s11_0Ext or s11_1Ext) selects which 11 bits
+// of the byte offset are encoded.
+let addrMode = BaseImmOffset, isExtendable = 1, hasSideEffects = 0,
+ opExtendable = 3, isExtentSigned = 1 in
+class T_loadalign_io <string str, bits<4> MajOp, Operand ImmOp>
+ : LDInst<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+ "$dst = "#str#"($src2 + #$offset)", [],
+ "$src1 = $dst">, AddrModeRel {
+ bits<4> name; // NOTE(review): not referenced anywhere in this class body.
+ bits<5> dst;
+ bits<5> src2;
+ bits<12> offset;
+ bits<11> offsetBits;
+
+ let offsetBits = !if (!eq(!cast<string>(ImmOp), "s11_1Ext"), offset{11-1},
+ /* s11_0Ext */ offset{10-0});
+ let IClass = 0b1001;
+
+ let Inst{27} = 0b0;
+ let Inst{26-25} = offsetBits{10-9};
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = src2;
+ let Inst{13-5} = offsetBits{8-0};
+ let Inst{4-0} = dst;
+ }
+
+// opExtentBits is supplied here because the class does not derive it.
+let accessSize = HalfWordAccess, opExtentBits = 12, opExtentAlign = 1 in
+def L2_loadalignh_io: T_loadalign_io <"memh_fifo", 0b0010, s11_1Ext>;
+
+let accessSize = ByteAccess, opExtentBits = 11 in
+def L2_loadalignb_io: T_loadalign_io <"memb_fifo", 0b0100, s11_0Ext>;
+
+//===----------------------------------------------------------------------===//
+// Post increment load
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated post increment loads with immediate offset.
+//===----------------------------------------------------------------------===//
+// Non-predicated post-increment load with immediate increment:
+//   Rd = <mnemonic>(Rx++#offset)
+//   mnemonic - memory access mnemonic printed in the assembly string.
+//   RC       - register class of the loaded value ($dst).
+//   ImmOp    - increment operand; its name (s4_0Imm..s4_3Imm) selects which
+//              four bits of the byte offset are encoded.
+//   MajOp    - value placed in Inst{24-21}.
+// $dst2 is the updated base register and is tied to $src1.
+let hasSideEffects = 0, addrMode = PostInc in
+class T_load_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                 bits<4> MajOp >
+  : LDInstPI <(outs RC:$dst, IntRegs:$dst2),
+              (ins IntRegs:$src1, ImmOp:$offset),
+  "$dst = "#mnemonic#"($src1++#$offset)" ,
+  [],
+  "$src1 = $dst2" > ,
+  PredNewRel {
+  bits<5> dst;
+  bits<5> src1;
+  bits<7> offset;
+  bits<4> offsetBits;
+
+  string ImmOpStr = !cast<string>(ImmOp);
+  // Drop the low 3/2/1/0 offset bits according to the operand's scaling.
+  let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                   !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                   !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                 /* s4_0Imm */ offset{3-0})));
+  // Key hasNewValue on the result register class, as T_load_io does.  The
+  // immediate operand's name is not a reliable proxy: this class is also
+  // instantiated with DoubleRegs and s4_2Imm, and DoubleRegs results must
+  // not be marked hasNewValue.
+  let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+  let IClass = 0b1001;
+
+  let Inst{27-25} = 0b101;
+  let Inst{24-21} = MajOp;
+  let Inst{20-16} = src1;
+  let Inst{13-12} = 0b00;
+  let Inst{8-5}   = offsetBits;
+  let Inst{4-0}   = dst;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated post increment loads with immediate offset.
+//===----------------------------------------------------------------------===//
+// Predicated post-increment load with immediate increment:
+//   if ([!]Pt[.new]) Rd = <mnemonic>(Rx++#offset)
+//   mnemonic  - memory access mnemonic printed in the assembly string.
+//   RC        - register class of the loaded value ($dst).
+//   ImmOp     - increment operand; its name (s4_0Imm..s4_3Imm) selects which
+//               four bits of the byte offset are encoded.
+//   MajOp     - value placed in Inst{24-21}.
+//   isPredNot - 1 selects the negated predicate sense (Inst{11}).
+//   isPredNew - 1 prints ".new" after the predicate (Inst{12}).
+// $dst2 is the updated base register and is tied to $src2.
+let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_pload_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                  bits<4> MajOp, bit isPredNot, bit isPredNew >
+  : LDInst <(outs RC:$dst, IntRegs:$dst2),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#"$dst = "#mnemonic#"($src2++#$offset)",
+  [] ,
+  "$src2 = $dst2" > ,
+  PredNewRel {
+  bits<5> dst;
+  bits<2> src1;
+  bits<5> src2;
+  bits<7> offset;
+  bits<4> offsetBits;
+
+  let isPredicatedNew = isPredNew;
+  let isPredicatedFalse = isPredNot;
+
+  string ImmOpStr = !cast<string>(ImmOp);
+  // Drop the low 3/2/1/0 offset bits according to the operand's scaling.
+  let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                   !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                   !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                 /* s4_0Imm */ offset{3-0})));
+  // Key hasNewValue on the result register class, as T_load_io does, rather
+  // than on the immediate operand's name.
+  let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+  let IClass = 0b1001;
+
+  let Inst{27-25} = 0b101;
+  let Inst{24-21} = MajOp;
+  let Inst{20-16} = src2;
+  let Inst{13}    = 0b1;
+  let Inst{12}    = isPredNew;
+  let Inst{11}    = isPredNot;
+  let Inst{10-9}  = src1;
+  let Inst{8-5}   = offsetBits;
+  let Inst{4-0}   = dst;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass for post increment loads with immediate offset.
+//===----------------------------------------------------------------------===//
+
+// Emits the post-increment (immediate) load family for one access type:
+//   L2_<NAME>_pi                    - unpredicated (marked isPredicable)
+//   L2_p<NAME>{t,f}_pi              - predicated true / false
+//   L2_p<NAME>{tnew,fnew}_pi        - predicated on a .new predicate
+// All share BaseOpcode = "POST_" # BaseOp.
+multiclass LD_PostInc <string mnemonic, string BaseOp, RegisterClass RC,
+ Operand ImmOp, bits<4> MajOp> {
+ let BaseOpcode = "POST_"#BaseOp in {
+ let isPredicable = 1 in
+ def L2_#NAME#_pi : T_load_pi < mnemonic, RC, ImmOp, MajOp>;
+
+ // Predicated
+ def L2_p#NAME#t_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 0>;
+ def L2_p#NAME#f_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 0>;
+
+ // Predicated new
+ def L2_p#NAME#tnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 1>;
+ def L2_p#NAME#fnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 1>;
+ }
+}
+
+// Post-increment byte loads with immediate offset.
+let accessSize = ByteAccess in {
+ defm loadrb : LD_PostInc <"memb", "LDrib", IntRegs, s4_0Imm, 0b1000>;
+ defm loadrub : LD_PostInc <"memub", "LDriub", IntRegs, s4_0Imm, 0b1001>;
+}
+
+// Post-increment halfword loads with immediate offset.
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+ defm loadrh : LD_PostInc <"memh", "LDrih", IntRegs, s4_1Imm, 0b1010>;
+ defm loadruh : LD_PostInc <"memuh", "LDriuh", IntRegs, s4_1Imm, 0b1011>;
+}
+
+// Post-increment word loads with immediate offset.
+let accessSize = WordAccess, opExtentAlign = 2 in
+defm loadri : LD_PostInc <"memw", "LDriw", IntRegs, s4_2Imm, 0b1100>;
+
+// Post-increment doubleword loads with immediate offset.
+let accessSize = DoubleWordAccess, opExtentAlign = 3 in
+defm loadrd : LD_PostInc <"memd", "LDrid", DoubleRegs, s4_3Imm, 0b1110>;
+
+// Rd=memb[u]h(Rx++#s4:1)
+// Rdd=memb[u]h(Rx++#s4:2)
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+ def L2_loadbsw2_pi : T_load_pi <"membh", IntRegs, s4_1Imm, 0b0001>;
+ def L2_loadbzw2_pi : T_load_pi <"memubh", IntRegs, s4_1Imm, 0b0011>;
+}
+// hasNewValue is forced to 0 here: these defs pair DoubleRegs with s4_2Imm.
+let accessSize = WordAccess, opExtentAlign = 2, hasNewValue = 0 in {
+ def L2_loadbsw4_pi : T_load_pi <"membh", DoubleRegs, s4_2Imm, 0b0111>;
+ def L2_loadbzw4_pi : T_load_pi <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for post increment fifo loads with immediate offset.
+//===----------------------------------------------------------------------===//
+// Post-increment FIFO ("align") load with immediate increment:
+//   Ryy = <mnemonic>(Rx++#offset)
+// Two ties are declared: the updated base ($dst2) with $src2, and the
+// DoubleRegs destination ($dst) with the $src1 input.
+let hasSideEffects = 0, addrMode = PostInc in
+class T_loadalign_pi <string mnemonic, Operand ImmOp, bits<4> MajOp >
+ : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$dst2),
+ (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+ "$dst = "#mnemonic#"($src2++#$offset)" ,
+ [], "$src2 = $dst2, $src1 = $dst" > ,
+ PredNewRel {
+ bits<5> dst;
+ bits<5> src2;
+ bits<5> offset;
+ bits<4> offsetBits;
+
+ // s4_1Imm drops the low offset bit; any other operand is encoded as-is.
+ let offsetBits = !if (!eq(!cast<string>(ImmOp), "s4_1Imm"), offset{4-1},
+ /* s4_0Imm */ offset{3-0});
+ let IClass = 0b1001;
+
+ let Inst{27-25} = 0b101;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = src2;
+ let Inst{13-12} = 0b00;
+ let Inst{8-5} = offsetBits;
+ let Inst{4-0} = dst;
+ }
+
+// Ryy=memh_fifo(Rx++#s4:1)
+// Ryy=memb_fifo(Rx++#s4:0)
+let accessSize = ByteAccess in
+def L2_loadalignb_pi : T_loadalign_pi <"memb_fifo", s4_0Imm, 0b0100>;
+
+let accessSize = HalfWordAccess, opExtentAlign = 1 in
+def L2_loadalignh_pi : T_loadalign_pi <"memh_fifo", s4_1Imm, 0b0010>;
+
+//===----------------------------------------------------------------------===//
+// Template class for post increment loads with register offset.
+//===----------------------------------------------------------------------===//
+// Post-increment load with a modifier-register increment:
+//   Rd = <mnemonic>(Rx++Mu)
+// $_dst_ is the updated base register, tied to $src1.  The ModRegs operand
+// is encoded in the single bit Inst{13}.  hasNewValue is not set by the
+// class; the instantiations below supply it.
+let hasSideEffects = 0, addrMode = PostInc in
+class T_load_pr <string mnemonic, RegisterClass RC, bits<4> MajOp,
+ MemAccessSize AccessSz>
+ : LDInstPI <(outs RC:$dst, IntRegs:$_dst_),
+ (ins IntRegs:$src1, ModRegs:$src2),
+ "$dst = "#mnemonic#"($src1++$src2)" ,
+ [], "$src1 = $_dst_" > {
+ bits<5> dst;
+ bits<5> src1;
+ bits<1> src2;
+
+ let accessSize = AccessSz;
+ let IClass = 0b1001;
+
+ let Inst{27-25} = 0b110;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = src1;
+ let Inst{13} = src2;
+ let Inst{12} = 0b0;
+ let Inst{7} = 0b0;
+ let Inst{4-0} = dst;
+ }
+
+// IntRegs results are marked hasNewValue.
+let hasNewValue = 1 in {
+ def L2_loadrb_pr : T_load_pr <"memb", IntRegs, 0b1000, ByteAccess>;
+ def L2_loadrub_pr : T_load_pr <"memub", IntRegs, 0b1001, ByteAccess>;
+ def L2_loadrh_pr : T_load_pr <"memh", IntRegs, 0b1010, HalfWordAccess>;
+ def L2_loadruh_pr : T_load_pr <"memuh", IntRegs, 0b1011, HalfWordAccess>;
+ def L2_loadri_pr : T_load_pr <"memw", IntRegs, 0b1100, WordAccess>;
+
+ def L2_loadbzw2_pr : T_load_pr <"memubh", IntRegs, 0b0011, HalfWordAccess>;
+}
+
+// DoubleRegs results: defined outside the hasNewValue = 1 scope.
+def L2_loadrd_pr : T_load_pr <"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
+def L2_loadbzw4_pr : T_load_pr <"memubh", DoubleRegs, 0b0101, WordAccess>;
+
+// Load predicate.
+// Codegen-only pseudo with a PredRegs result; its assembly string is an
+// ".error" directive, so it must be replaced before emission.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_pred : LDInst<(outs PredRegs:$dst),
+ (ins IntRegs:$addr, s11_2Ext:$off),
+ ".error \"should not emit\"", []>;
+// Load modifier.
+// Same shape as LDriw_pred, with a ModRegs result.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_mod : LDInst<(outs ModRegs:$dst),
+ (ins IntRegs:$addr, s11_2Ext:$off),
+ ".error \"should not emit\"", []>;
+
+// deallocframe: no explicit operands; R30 is an implicit use and R29, R30
+// and R31 are implicit defs.  The encoding is fully constant.
+let Defs = [R29, R30, R31], Uses = [R30], hasSideEffects = 0 in
+ def L2_deallocframe : LDInst<(outs), (ins),
+ "deallocframe",
+ []> {
+ let IClass = 0b1001;
+
+ let Inst{27-16} = 0b000000011110;
+ let Inst{13} = 0b0;
+ let Inst{4-0} = 0b11110;
+}
+
+// Load / Post increment circular addressing mode.
+//   Rd = <mnemonic>(Rz ++ I:circ(Mu))
+// $_dst_ is the updated base register, tied to $Rz.  CS is an implicit use.
+// Inst{9} = 1 distinguishes this form from the immediate circular form
+// (T_load_pci), which sets Inst{9} = 0.
+let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
+class T_load_pcr<string mnemonic, RegisterClass RC, bits<4> MajOp>
+ : LDInst <(outs RC:$dst, IntRegs:$_dst_),
+ (ins IntRegs:$Rz, ModRegs:$Mu),
+ "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
+ "$Rz = $_dst_" > {
+ bits<5> dst;
+ bits<5> Rz;
+ bit Mu;
+
+ // DoubleRegs results are not marked hasNewValue.
+ let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+ let IClass = 0b1001;
+
+ let Inst{27-25} = 0b100;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12} = 0b0;
+ let Inst{9} = 0b1;
+ let Inst{7} = 0b0;
+ let Inst{4-0} = dst;
+ }
+
+let accessSize = ByteAccess in {
+ def L2_loadrb_pcr : T_load_pcr <"memb", IntRegs, 0b1000>;
+ def L2_loadrub_pcr : T_load_pcr <"memub", IntRegs, 0b1001>;
+}
+
+let accessSize = HalfWordAccess in {
+ def L2_loadrh_pcr : T_load_pcr <"memh", IntRegs, 0b1010>;
+ def L2_loadruh_pcr : T_load_pcr <"memuh", IntRegs, 0b1011>;
+ def L2_loadbsw2_pcr : T_load_pcr <"membh", IntRegs, 0b0001>;
+ def L2_loadbzw2_pcr : T_load_pcr <"memubh", IntRegs, 0b0011>;
+}
+
+let accessSize = WordAccess in {
+ def L2_loadri_pcr : T_load_pcr <"memw", IntRegs, 0b1100>;
+ // Explicit hasNewValue = 0; the class computes the same for DoubleRegs.
+ let hasNewValue = 0 in {
+ def L2_loadbzw4_pcr : T_load_pcr <"memubh", DoubleRegs, 0b0101>;
+ def L2_loadbsw4_pcr : T_load_pcr <"membh", DoubleRegs, 0b0111>;
+ }
+}
+
+let accessSize = DoubleWordAccess in
+def L2_loadrd_pcr : T_load_pcr <"memd", DoubleRegs, 0b1110>;
+
+// FIFO ("align") load / Post increment circular addressing mode.
+//   Ryy = <mnemonic>(Rz ++ I:circ(Mu))
+// Two ties are declared: the updated base ($_dst_) with $Rz, and the
+// DoubleRegs destination ($dst) with the $_src_ input.
+let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
+class T_loadalign_pcr<string mnemonic, bits<4> MajOp, MemAccessSize AccessSz >
+ : LDInst <(outs DoubleRegs:$dst, IntRegs:$_dst_),
+ (ins DoubleRegs:$_src_, IntRegs:$Rz, ModRegs:$Mu),
+ "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
+ "$Rz = $_dst_, $dst = $_src_" > {
+ bits<5> dst;
+ bits<5> Rz;
+ bit Mu;
+
+ let accessSize = AccessSz;
+ let IClass = 0b1001;
+
+ let Inst{27-25} = 0b100;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12} = 0b0;
+ let Inst{9} = 0b1;
+ let Inst{7} = 0b0;
+ let Inst{4-0} = dst;
+ }
+
+def L2_loadalignb_pcr : T_loadalign_pcr <"memb_fifo", 0b0100, ByteAccess>;
+def L2_loadalignh_pcr : T_loadalign_pcr <"memh_fifo", 0b0010, HalfWordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Circular loads with immediate offset.
+//===----------------------------------------------------------------------===//
+// Circular load with immediate increment:
+//   Rd = <mnemonic>(Rz ++ #offset:circ(Mu))
+// $_dst_ is the updated base register, tied to $Rz.  ImmOp's name
+// (s4_0Imm..s4_3Imm) selects which four bits of the byte offset are encoded.
+let Uses = [CS], mayLoad = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_load_pci <string mnemonic, RegisterClass RC,
+ Operand ImmOp, bits<4> MajOp>
+ : LDInstPI<(outs RC:$dst, IntRegs:$_dst_),
+ (ins IntRegs:$Rz, ImmOp:$offset, ModRegs:$Mu),
+ "$dst = "#mnemonic#"($Rz ++ #$offset:circ($Mu))", [],
+ "$Rz = $_dst_"> {
+ bits<5> dst;
+ bits<5> Rz;
+ bits<1> Mu;
+ bits<7> offset;
+ bits<4> offsetBits;
+
+ string ImmOpStr = !cast<string>(ImmOp);
+ // DoubleRegs results are not marked hasNewValue.
+ let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+ let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+ !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+ !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+ /* s4_0Imm */ offset{3-0})));
+ let IClass = 0b1001;
+ let Inst{27-25} = 0b100;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12} = 0b0;
+ let Inst{9} = 0b0; // 0 here; the I:circ form (T_load_pcr) uses 1
+ let Inst{8-5} = offsetBits;
+ let Inst{4-0} = dst;
+ }
+
+// Byte variants of circ load
+let accessSize = ByteAccess in {
+ def L2_loadrb_pci : T_load_pci <"memb", IntRegs, s4_0Imm, 0b1000>;
+ def L2_loadrub_pci : T_load_pci <"memub", IntRegs, s4_0Imm, 0b1001>;
+}
+
+// Half word variants of circ load
+let accessSize = HalfWordAccess in {
+ def L2_loadrh_pci : T_load_pci <"memh", IntRegs, s4_1Imm, 0b1010>;
+ def L2_loadruh_pci : T_load_pci <"memuh", IntRegs, s4_1Imm, 0b1011>;
+ def L2_loadbzw2_pci : T_load_pci <"memubh", IntRegs, s4_1Imm, 0b0011>;
+ def L2_loadbsw2_pci : T_load_pci <"membh", IntRegs, s4_1Imm, 0b0001>;
+}
+
+// Word variants of circ load
+let accessSize = WordAccess in
+def L2_loadri_pci : T_load_pci <"memw", IntRegs, s4_2Imm, 0b1100>;
+
+// Explicit hasNewValue = 0; T_load_pci computes the same for DoubleRegs.
+let accessSize = WordAccess, hasNewValue = 0 in {
+ def L2_loadbzw4_pci : T_load_pci <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
+ def L2_loadbsw4_pci : T_load_pci <"membh", DoubleRegs, s4_2Imm, 0b0111>;
+}
+
+let accessSize = DoubleWordAccess, hasNewValue = 0 in
+def L2_loadrd_pci : T_load_pci <"memd", DoubleRegs, s4_3Imm, 0b1110>;
+
+
+// TODO: memb_fifo and memh_fifo must take destination register as input.
+// As written, these two defs reuse T_load_pci, which has no DoubleRegs input
+// tied to $dst (the _io, _pi and _pcr fifo loads all declare such a tie).
+// One-off circ loads - not enough in common to break into a class.
+let accessSize = ByteAccess in
+def L2_loadalignb_pci : T_load_pci <"memb_fifo", DoubleRegs, s4_0Imm, 0b0100>;
+
+let accessSize = HalfWordAccess, opExtentAlign = 1 in
+def L2_loadalignh_pci : T_load_pci <"memh_fifo", DoubleRegs, s4_1Imm, 0b0010>;
+
+// L[24]_load[wd]_locked: Load word/double with lock.
+// Template for the assembly form "$dst = <mnemonic>($src)".
+//   mnemonic - instruction name spliced into the assembly string; it also
+//              selects the value of Inst{13-12}.
+//   RC       - register class of the loaded result $dst.
+// Every instance is marked isSoloAX by the enclosing let.
+let isSoloAX = 1 in
+class T_load_locked <string mnemonic, RegisterClass RC>
+ : LD0Inst <(outs RC:$dst),
+ (ins IntRegs:$src),
+ "$dst = "#mnemonic#"($src)"> {
+ bits<5> dst;
+ bits<5> src;
+ let IClass = 0b1001;
+ let Inst{27-21} = 0b0010000;
+ let Inst{20-16} = src;
+ // 0b01 only for the "memd_locked" mnemonic, 0b00 for anything else.
+ let Inst{13-12} = !if (!eq(mnemonic, "memd_locked"), 0b01, 0b00);
+ let Inst{5} = 0;
+ let Inst{4-0} = dst;
+}
+// Word form: IntRegs result, flagged hasNewValue with operand 0 ($dst) as the
+// new-value operand.
+let hasNewValue = 1, accessSize = WordAccess, opNewValue = 0 in
+ def L2_loadw_locked : T_load_locked <"memw_locked", IntRegs>;
+// Doubleword form: DoubleRegs result, no hasNewValue/opNewValue override.
+let accessSize = DoubleWordAccess in
+ def L4_loadd_locked : T_load_locked <"memd_locked", DoubleRegs>;
+
+// S[24]_store[wd]_locked: Store word/double conditionally.
+// Template for the assembly form "<mnemonic>($Rs, $Pd) = $Rt".
+//   mnemonic - instruction name; also selects Inst{22}.
+//   RC       - register class of the stored value $Rt.
+// The only output is the predicate register $Pd.  Every instance is marked
+// isSoloAX and isPredicateLate by the enclosing let.
+let isSoloAX = 1, isPredicateLate = 1 in
+class T_store_locked <string mnemonic, RegisterClass RC>
+ : ST0Inst <(outs PredRegs:$Pd), (ins IntRegs:$Rs, RC:$Rt),
+ mnemonic#"($Rs, $Pd) = $Rt"> {
+ bits<2> Pd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1010;
+ let Inst{27-23} = 0b00001;
+ // 0 for "memw_locked", 1 for any other mnemonic.
+ let Inst{22} = !if (!eq(mnemonic, "memw_locked"), 0b0, 0b1);
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{1-0} = Pd;
+}
+
+// Word form: stores an IntRegs value; Inst{22} is 0 for this mnemonic.
+let accessSize = WordAccess in
+def S2_storew_locked : T_store_locked <"memw_locked", IntRegs>;
+
+// Doubleword form: stores a DoubleRegs value; Inst{22} is 1.
+let accessSize = DoubleWordAccess in
+def S4_stored_locked : T_store_locked <"memd_locked", DoubleRegs>;
+
+//===----------------------------------------------------------------------===//
+// Bit-reversed loads with auto-increment register
+//===----------------------------------------------------------------------===//
+// Template for the assembly form "$dst = <mnemonic>($Rz ++ $Mu:brev)".
+//   mnemonic - instruction name spliced into the assembly string.
+//   RC       - register class of the loaded result $dst.
+//   addrSize - value assigned to accessSize.
+//   majOp    - 4-bit opcode placed in Inst{24-21}.
+// The second output $_dst_ is tied to the $Rz input by the "$Rz = $_dst_"
+// constraint string.
+let hasSideEffects = 0, addrMode = PostInc in
+class T_load_pbr<string mnemonic, RegisterClass RC,
+ MemAccessSize addrSize, bits<4> majOp>
+ : LDInst
+ <(outs RC:$dst, IntRegs:$_dst_),
+ (ins IntRegs:$Rz, ModRegs:$Mu),
+ "$dst = "#mnemonic#"($Rz ++ $Mu:brev)" ,
+ [] , "$Rz = $_dst_" > {
+
+ let accessSize = addrSize;
+
+ bits<5> dst;
+ bits<5> Rz;
+ bits<1> Mu;
+
+ let IClass = 0b1001;
+
+ let Inst{27-25} = 0b111;
+ let Inst{24-21} = majOp;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12} = 0b0;
+ let Inst{7} = 0b0;
+ let Inst{4-0} = dst;
+ }
+
+// Bit-reversed loads with an IntRegs result: flagged hasNewValue, with
+// operand 0 ($dst) as the new-value operand.
+let hasNewValue =1, opNewValue = 0 in {
+ def L2_loadrb_pbr : T_load_pbr <"memb", IntRegs, ByteAccess, 0b1000>;
+ def L2_loadrub_pbr : T_load_pbr <"memub", IntRegs, ByteAccess, 0b1001>;
+ def L2_loadrh_pbr : T_load_pbr <"memh", IntRegs, HalfWordAccess, 0b1010>;
+ def L2_loadruh_pbr : T_load_pbr <"memuh", IntRegs, HalfWordAccess, 0b1011>;
+ def L2_loadbsw2_pbr : T_load_pbr <"membh", IntRegs, HalfWordAccess, 0b0001>;
+ def L2_loadbzw2_pbr : T_load_pbr <"memubh", IntRegs, HalfWordAccess, 0b0011>;
+ def L2_loadri_pbr : T_load_pbr <"memw", IntRegs, WordAccess, 0b1100>;
+}
+
+// DoubleRegs results: defined outside the hasNewValue/opNewValue let above.
+def L2_loadbzw4_pbr : T_load_pbr <"memubh", DoubleRegs, WordAccess, 0b0101>;
+def L2_loadbsw4_pbr : T_load_pbr <"membh", DoubleRegs, WordAccess, 0b0111>;
+def L2_loadrd_pbr : T_load_pbr <"memd", DoubleRegs, DoubleWordAccess, 0b1110>;
+
+// memb_fifo / memh_fifo forms, also with a DoubleRegs result.
+def L2_loadalignb_pbr :T_load_pbr <"memb_fifo", DoubleRegs, ByteAccess, 0b0100>;
+def L2_loadalignh_pbr :T_load_pbr <"memh_fifo", DoubleRegs,
+ HalfWordAccess, 0b0010>;
+
+//===----------------------------------------------------------------------===//
+// LD -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/ALU +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/COMPLEX +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/COMPLEX -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYH +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Template Class
+// MPYS / Multiply signed/unsigned halfwords
+//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
+//
+// Template arguments:
+//   LHbits     - halfword selectors: bit 1 picks ".h"/".l" for $Rs, bit 0
+//                picks ".h"/".l" for $Rt; encoded in Inst{6-5}.
+//   isSat      - appends ":sat";  encoded in Inst{7}.
+//   isRnd      - appends ":rnd";  encoded in Inst{21}.
+//   hasShift   - appends ":<<1";  encoded in Inst{23}.
+//   isUnsigned - selects "mpyu" over "mpy"; encoded in Inst{22}.
+//===----------------------------------------------------------------------===//
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_M2_mpy < bits<2> LHbits, bit isSat, bit isRnd,
+ bit hasShift, bit isUnsigned>
+ : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
+ #", $Rt."#!if(LHbits{0},"h)","l)")
+ #!if(hasShift,":<<1","")
+ #!if(isRnd,":rnd","")
+ #!if(isSat,":sat",""),
+ [], "", M_tc_3x_SLOT23 > {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1100;
+ let Inst{23} = hasShift;
+ let Inst{22} = isUnsigned;
+ let Inst{21} = isRnd;
+ let Inst{7} = isSat;
+ let Inst{6-5} = LHbits;
+ let Inst{4-0} = Rd;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ }
+
+// Argument order for every def below:
+//   T_M2_mpy<LHbits, isSat, isRnd, hasShift, isUnsigned>
+// Name suffixes: ll/lh/hl/hh mirror LHbits 00/01/10/11; _s1 passes
+// hasShift = 1 and _s0 passes hasShift = 0.
+
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpy_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 0>;
+def M2_mpy_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 0>;
+def M2_mpy_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 0>;
+def M2_mpy_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 0>;
+def M2_mpy_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 0>;
+def M2_mpy_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 0>;
+def M2_mpy_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 0>;
+def M2_mpy_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 0>;
+
+//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpyu_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 1>;
+def M2_mpyu_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 1>;
+def M2_mpyu_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 1>;
+def M2_mpyu_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 1>;
+def M2_mpyu_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 1>;
+def M2_mpyu_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 1>;
+def M2_mpyu_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 1>;
+def M2_mpyu_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 1>;
+
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]:rnd
+def M2_mpy_rnd_ll_s1: T_M2_mpy <0b00, 0, 1, 1, 0>;
+def M2_mpy_rnd_ll_s0: T_M2_mpy <0b00, 0, 1, 0, 0>;
+def M2_mpy_rnd_lh_s1: T_M2_mpy <0b01, 0, 1, 1, 0>;
+def M2_mpy_rnd_lh_s0: T_M2_mpy <0b01, 0, 1, 0, 0>;
+def M2_mpy_rnd_hl_s1: T_M2_mpy <0b10, 0, 1, 1, 0>;
+def M2_mpy_rnd_hl_s0: T_M2_mpy <0b10, 0, 1, 0, 0>;
+def M2_mpy_rnd_hh_s1: T_M2_mpy <0b11, 0, 1, 1, 0>;
+def M2_mpy_rnd_hh_s0: T_M2_mpy <0b11, 0, 1, 0, 0>;
+
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
+// All saturating forms (isSat = 1) are declared as defining USR_OVF.
+let Defs = [USR_OVF] in {
+ def M2_mpy_sat_ll_s1: T_M2_mpy <0b00, 1, 0, 1, 0>;
+ def M2_mpy_sat_ll_s0: T_M2_mpy <0b00, 1, 0, 0, 0>;
+ def M2_mpy_sat_lh_s1: T_M2_mpy <0b01, 1, 0, 1, 0>;
+ def M2_mpy_sat_lh_s0: T_M2_mpy <0b01, 1, 0, 0, 0>;
+ def M2_mpy_sat_hl_s1: T_M2_mpy <0b10, 1, 0, 1, 0>;
+ def M2_mpy_sat_hl_s0: T_M2_mpy <0b10, 1, 0, 0, 0>;
+ def M2_mpy_sat_hh_s1: T_M2_mpy <0b11, 1, 0, 1, 0>;
+ def M2_mpy_sat_hh_s0: T_M2_mpy <0b11, 1, 0, 0, 0>;
+
+ def M2_mpy_sat_rnd_ll_s1: T_M2_mpy <0b00, 1, 1, 1, 0>;
+ def M2_mpy_sat_rnd_ll_s0: T_M2_mpy <0b00, 1, 1, 0, 0>;
+ def M2_mpy_sat_rnd_lh_s1: T_M2_mpy <0b01, 1, 1, 1, 0>;
+ def M2_mpy_sat_rnd_lh_s0: T_M2_mpy <0b01, 1, 1, 0, 0>;
+ def M2_mpy_sat_rnd_hl_s1: T_M2_mpy <0b10, 1, 1, 1, 0>;
+ def M2_mpy_sat_rnd_hl_s0: T_M2_mpy <0b10, 1, 1, 0, 0>;
+ def M2_mpy_sat_rnd_hh_s1: T_M2_mpy <0b11, 1, 1, 1, 0>;
+ def M2_mpy_sat_rnd_hh_s0: T_M2_mpy <0b11, 1, 1, 0, 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Template Class
+// MPYS / Multiply signed/unsigned halfwords and add/subtract the
+// result from the accumulator.
+//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//
+// Template arguments:
+//   LHbits     - halfword selectors: bit 1 for $Rs, bit 0 for $Rt
+//                ("h" when set, "l" otherwise); encoded in Inst{6-5}.
+//   isSat      - appends ":sat";  encoded in Inst{7}.
+//   isNac      - prints "-=" instead of "+="; encoded in Inst{21}.
+//   hasShift   - appends ":<<1";  encoded in Inst{23}.
+//   isUnsigned - selects "mpyu" over "mpy"; encoded in Inst{22}.
+// The accumulator input $dst2 is tied to the output by "$dst2 = $Rx".
+//===----------------------------------------------------------------------===//
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_M2_mpy_acc < bits<2> LHbits, bit isSat, bit isNac,
+ bit hasShift, bit isUnsigned >
+ : MInst_acc<(outs IntRegs:$Rx), (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+ "$Rx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
+ #"($Rs."#!if(LHbits{1},"h","l")
+ #", $Rt."#!if(LHbits{0},"h)","l)")
+ #!if(hasShift,":<<1","")
+ #!if(isSat,":sat",""),
+ [], "$dst2 = $Rx", M_tc_3x_SLOT23 > {
+ bits<5> Rx;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1110;
+ let Inst{27-24} = 0b1110;
+ let Inst{23} = hasShift;
+ let Inst{22} = isUnsigned;
+ let Inst{21} = isNac;
+ let Inst{7} = isSat;
+ let Inst{6-5} = LHbits;
+ let Inst{4-0} = Rx;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ }
+
+// Argument order for every def below:
+//   T_M2_mpy_acc <LHbits, isSat, isNac, hasShift, isUnsigned>
+// "_acc_" names pass isNac = 0 (+=), "_nac_" names pass isNac = 1 (-=).
+
+//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpy_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 0>;
+def M2_mpy_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 0>;
+def M2_mpy_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 0>;
+def M2_mpy_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 0>;
+def M2_mpy_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 0>;
+def M2_mpy_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 0>;
+def M2_mpy_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 0>;
+def M2_mpy_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 0>;
+
+//Rx += mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpyu_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 1>;
+def M2_mpyu_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 1>;
+def M2_mpyu_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 1>;
+def M2_mpyu_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 1>;
+def M2_mpyu_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 1>;
+def M2_mpyu_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 1>;
+def M2_mpyu_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 1>;
+def M2_mpyu_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 1>;
+
+//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpy_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 0>;
+def M2_mpy_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 0>;
+def M2_mpy_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 0>;
+def M2_mpy_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 0>;
+def M2_mpy_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 0>;
+def M2_mpy_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 0>;
+def M2_mpy_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 0>;
+def M2_mpy_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 0>;
+
+//Rx -= mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpyu_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 1>;
+def M2_mpyu_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 1>;
+def M2_mpyu_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 1>;
+def M2_mpyu_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 1>;
+def M2_mpyu_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 1>;
+def M2_mpyu_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 1>;
+def M2_mpyu_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 1>;
+def M2_mpyu_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 1>;
+
+// Saturating accumulate forms (isSat = 1).
+// Argument order: T_M2_mpy_acc <LHbits, isSat, isNac, hasShift, isUnsigned>.
+// A saturating operation can set the sticky overflow bit in USR, so these are
+// declared as defining USR_OVF, matching the M2_mpy_sat_* definitions and the
+// T_M2_vmpy_acc_sat class.  Without this the compiler does not see the write
+// to USR_OVF when ordering these instructions.
+let Defs = [USR_OVF] in {
+ //Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
+ def M2_mpy_acc_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 0, 1, 0>;
+ def M2_mpy_acc_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 0, 0, 0>;
+ def M2_mpy_acc_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 0, 1, 0>;
+ def M2_mpy_acc_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 0, 0, 0>;
+ def M2_mpy_acc_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 0, 1, 0>;
+ def M2_mpy_acc_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 0, 0, 0>;
+ def M2_mpy_acc_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 0, 1, 0>;
+ def M2_mpy_acc_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 0, 0, 0>;
+
+ //Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
+ def M2_mpy_nac_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 1, 1, 0>;
+ def M2_mpy_nac_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 1, 0, 0>;
+ def M2_mpy_nac_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 1, 1, 0>;
+ def M2_mpy_nac_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 1, 0, 0>;
+ def M2_mpy_nac_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 1, 1, 0>;
+ def M2_mpy_nac_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 1, 0, 0>;
+ def M2_mpy_nac_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 1, 1, 0>;
+ def M2_mpy_nac_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 1, 0, 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Template Class
+// MPYS / Multiply signed/unsigned halfwords and add/subtract the
+// result from the 64-bit destination register.
+//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1]
+//
+// Template arguments:
+//   LHbits     - halfword selectors: bit 1 for $Rs, bit 0 for $Rt
+//                ("h" when set, "l" otherwise); encoded in Inst{6-5}.
+//   isNac      - prints "-=" instead of "+="; encoded in Inst{21}.
+//   hasShift   - appends ":<<1";  encoded in Inst{23}.
+//   isUnsigned - selects "mpyu" over "mpy"; encoded in Inst{22}.
+// There is no saturating form in this class: no ":sat" suffix is ever
+// emitted and Inst{7} is fixed at 0.
+//===----------------------------------------------------------------------===//
+
+class T_M2_mpyd_acc < bits<2> LHbits, bit isNac, bit hasShift, bit isUnsigned>
+ : MInst_acc<(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+ "$Rxx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
+ #"($Rs."#!if(LHbits{1},"h","l")
+ #", $Rt."#!if(LHbits{0},"h)","l)")
+ #!if(hasShift,":<<1",""),
+ [], "$dst2 = $Rxx", M_tc_3x_SLOT23 > {
+ bits<5> Rxx;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b0110;
+ let Inst{23} = hasShift;
+ let Inst{22} = isUnsigned;
+ let Inst{21} = isNac;
+ let Inst{7} = 0;
+ let Inst{6-5} = LHbits;
+ let Inst{4-0} = Rxx;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ }
+
+// Argument order for every def below:
+//   T_M2_mpyd_acc <LHbits, isNac, hasShift, isUnsigned>
+
+//Rxx += mpy(Rs.[H|L],Rt.[H|L])
+def M2_mpyd_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 0>;
+def M2_mpyd_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 0>;
+def M2_mpyd_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 0>;
+def M2_mpyd_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 0>;
+
+//Rxx += mpy(Rs.[H|L],Rt.[H|L]):<<1
+def M2_mpyd_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 0>;
+def M2_mpyd_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 0>;
+def M2_mpyd_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 0>;
+def M2_mpyd_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 0>;
+
+//Rxx -= mpy(Rs.[H|L],Rt.[H|L])
+def M2_mpyd_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 0>;
+def M2_mpyd_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 0>;
+def M2_mpyd_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 0>;
+def M2_mpyd_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 0>;
+
+//Rxx -= mpy(Rs.[H|L],Rt.[H|L]):<<1
+def M2_mpyd_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 0>;
+def M2_mpyd_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 0>;
+def M2_mpyd_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 0>;
+def M2_mpyd_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 0>;
+
+//Rxx += mpyu(Rs.[H|L],Rt.[H|L])
+def M2_mpyud_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 1>;
+def M2_mpyud_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 1>;
+def M2_mpyud_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 1>;
+def M2_mpyud_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 1>;
+
+//Rxx += mpyu(Rs.[H|L],Rt.[H|L]):<<1
+def M2_mpyud_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 1>;
+def M2_mpyud_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 1>;
+def M2_mpyud_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 1>;
+def M2_mpyud_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 1>;
+
+//Rxx -= mpyu(Rs.[H|L],Rt.[H|L])
+def M2_mpyud_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 1>;
+def M2_mpyud_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 1>;
+def M2_mpyud_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 1>;
+def M2_mpyud_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 1>;
+
+//Rxx -= mpyu(Rs.[H|L],Rt.[H|L]):<<1
+def M2_mpyud_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 1>;
+def M2_mpyud_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 1>;
+def M2_mpyud_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 1>;
+def M2_mpyud_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- Vector Multiply
+// Used for complex multiply real or imaginary, dual multiply and even halfwords
+//
+// Assembly form: "$Rdd = <opc>($Rss, $Rtt)[:<<1][:rnd][:sat]".
+// hasShift, isRnd and isSat only choose the printed suffixes; the encoding is
+// determined entirely by MajOp (Inst{23-21}) and MinOp (Inst{7-5}).
+//===----------------------------------------------------------------------===//
+class T_M2_vmpy < string opc, bits<3> MajOp, bits<3> MinOp, bit hasShift,
+ bit isRnd, bit isSat >
+ : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Rdd = "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
+ #!if(isRnd,":rnd","")
+ #!if(isSat,":sat",""),
+ [] > {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1000;
+ let Inst{23-21} = MajOp;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rdd;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ }
+
+// Argument order for every def below:
+//   T_M2_vmpy <opc, MajOp, MinOp, hasShift, isRnd, isSat>
+// Every def in this group passes isSat = 1 and is declared as defining
+// USR_OVF by the enclosing let.
+
+// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
+let Defs = [USR_OVF] in {
+def M2_vcmpy_s1_sat_i: T_M2_vmpy <"vcmpyi", 0b110, 0b110, 1, 0, 1>;
+def M2_vcmpy_s0_sat_i: T_M2_vmpy <"vcmpyi", 0b010, 0b110, 0, 0, 1>;
+
+// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
+def M2_vcmpy_s1_sat_r: T_M2_vmpy <"vcmpyr", 0b101, 0b110, 1, 0, 1>;
+def M2_vcmpy_s0_sat_r: T_M2_vmpy <"vcmpyr", 0b001, 0b110, 0, 0, 1>;
+
+// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
+def M2_vdmpys_s1: T_M2_vmpy <"vdmpy", 0b100, 0b100, 1, 0, 1>;
+def M2_vdmpys_s0: T_M2_vmpy <"vdmpy", 0b000, 0b100, 0, 0, 1>;
+
+// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
+def M2_vmpy2es_s1: T_M2_vmpy <"vmpyeh", 0b100, 0b110, 1, 0, 1>;
+def M2_vmpy2es_s0: T_M2_vmpy <"vmpyeh", 0b000, 0b110, 0, 0, 1>;
+
+//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyh_s0: T_M2_vmpy <"vmpywoh", 0b000, 0b111, 0, 0, 1>;
+def M2_mmpyh_s1: T_M2_vmpy <"vmpywoh", 0b100, 0b111, 1, 0, 1>;
+def M2_mmpyh_rs0: T_M2_vmpy <"vmpywoh", 0b001, 0b111, 0, 1, 1>;
+def M2_mmpyh_rs1: T_M2_vmpy <"vmpywoh", 0b101, 0b111, 1, 1, 1>;
+
+//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyl_s0: T_M2_vmpy <"vmpyweh", 0b000, 0b101, 0, 0, 1>;
+def M2_mmpyl_s1: T_M2_vmpy <"vmpyweh", 0b100, 0b101, 1, 0, 1>;
+def M2_mmpyl_rs0: T_M2_vmpy <"vmpyweh", 0b001, 0b101, 0, 1, 1>;
+def M2_mmpyl_rs1: T_M2_vmpy <"vmpyweh", 0b101, 0b101, 1, 1, 1>;
+
+//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyuh_s0: T_M2_vmpy <"vmpywouh", 0b010, 0b111, 0, 0, 1>;
+def M2_mmpyuh_s1: T_M2_vmpy <"vmpywouh", 0b110, 0b111, 1, 0, 1>;
+def M2_mmpyuh_rs0: T_M2_vmpy <"vmpywouh", 0b011, 0b111, 0, 1, 1>;
+def M2_mmpyuh_rs1: T_M2_vmpy <"vmpywouh", 0b111, 0b111, 1, 1, 1>;
+
+//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyul_s0: T_M2_vmpy <"vmpyweuh", 0b010, 0b101, 0, 0, 1>;
+def M2_mmpyul_s1: T_M2_vmpy <"vmpyweuh", 0b110, 0b101, 1, 0, 1>;
+def M2_mmpyul_rs0: T_M2_vmpy <"vmpyweuh", 0b011, 0b101, 0, 1, 1>;
+def M2_mmpyul_rs1: T_M2_vmpy <"vmpyweuh", 0b111, 0b101, 1, 1, 1>;
+}
+
+// Generic two-source multiply-type template with an IntRegs result.
+// Assembly form:
+//   "$dst = <mnemonic>($src1, $src2<op2Suffix>)[:<<1][:rnd][:sat][:raw:hi|:raw:lo]"
+// Template arguments:
+//   mnemonic  - instruction name.
+//   RegTyBits - value placed in Inst{27-24}.
+//   RC        - register class of both sources.
+//   MajOp     - Inst{23-21}; its top bit also decides whether ":<<1" is
+//               printed.
+//   MinOp     - Inst{7-5}.
+//   isSat, isRnd     - print ":sat" / ":rnd" only; not encoded separately.
+//   op2Suffix        - text appended directly after $src2.
+//   isRaw, isHi      - when isRaw is set, print ":raw:hi" or ":raw:lo".
+let hasNewValue = 1, opNewValue = 0 in
+class T_MType_mpy <string mnemonic, bits<4> RegTyBits, RegisterClass RC,
+ bits<3> MajOp, bits<3> MinOp, bit isSat = 0, bit isRnd = 0,
+ string op2Suffix = "", bit isRaw = 0, bit isHi = 0 >
+ : MInst <(outs IntRegs:$dst), (ins RC:$src1, RC:$src2),
+ "$dst = "#mnemonic
+ #"($src1, $src2"#op2Suffix#")"
+ #!if(MajOp{2}, ":<<1", "")
+ #!if(isRnd, ":rnd", "")
+ #!if(isSat, ":sat", "")
+ #!if(isRaw, !if(isHi, ":raw:hi", ":raw:lo"), ""), [] > {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = RegTyBits;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = src1;
+ let Inst{13} = 0b0;
+ let Inst{12-8} = src2;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = dst;
+ }
+
+// DoubleRegs sources, RegTyBits 0b1001, always ":rnd:sat" plus a
+// ":raw:hi"/":raw:lo" suffix chosen by isHi.
+class T_MType_vrcmpy <string mnemonic, bits<3> MajOp, bits<3> MinOp, bit isHi>
+ : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, 1, 1, "", 1, isHi>;
+
+// DoubleRegs sources, RegTyBits 0b1001.
+class T_MType_dd <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit isSat = 0, bit isRnd = 0 >
+ : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, isSat, isRnd>;
+
+// IntRegs sources, RegTyBits 0b1101.
+class T_MType_rr1 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit isSat = 0, bit isRnd = 0 >
+ : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd>;
+
+// Same as T_MType_rr1, plus op2str which is printed right after $src2.
+class T_MType_rr2 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit isSat = 0, bit isRnd = 0, string op2str = "" >
+ : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd, op2str>;
+
+// Argument order: <mnemonic, MajOp, MinOp, isSat, isRnd[, op2str]>.
+// A MajOp with its top bit set (0b1xx) makes the class print ":<<1".
+// NOTE(review): the defs below that pass isSat = 1 are not wrapped in
+// "let Defs = [USR_OVF]", unlike the M2_mpy_sat_* definitions in this file
+// -- confirm whether that is intended.
+def M2_vradduh : T_MType_dd <"vradduh", 0b000, 0b001, 0, 0>;
+def M2_vdmpyrs_s0 : T_MType_dd <"vdmpy", 0b000, 0b000, 1, 1>;
+def M2_vdmpyrs_s1 : T_MType_dd <"vdmpy", 0b100, 0b000, 1, 1>;
+
+// Register form of "mpyi"; it shares CextOpcode "mpyi" with M2_mpysmi, whose
+// InputType is "imm".
+let CextOpcode = "mpyi", InputType = "reg" in
+def M2_mpyi : T_MType_rr1 <"mpyi", 0b000, 0b000>, ImmRegRel;
+
+def M2_mpy_up : T_MType_rr1 <"mpy", 0b000, 0b001>;
+def M2_mpyu_up : T_MType_rr1 <"mpyu", 0b010, 0b001>;
+
+def M2_dpmpyss_rnd_s0 : T_MType_rr1 <"mpy", 0b001, 0b001, 0, 1>;
+
+def M2_vmpy2s_s0pack : T_MType_rr1 <"vmpyh", 0b001, 0b111, 1, 1>;
+def M2_vmpy2s_s1pack : T_MType_rr1 <"vmpyh", 0b101, 0b111, 1, 1>;
+
+// op2str ".h"/".l" is printed directly after $src2.
+def M2_hmmpyh_rs1 : T_MType_rr2 <"mpy", 0b101, 0b100, 1, 1, ".h">;
+def M2_hmmpyl_rs1 : T_MType_rr2 <"mpy", 0b111, 0b100, 1, 1, ".l">;
+
+// op2str "*" is printed directly after $src2 in the "cmpyrsc" forms.
+def M2_cmpyrs_s0 : T_MType_rr2 <"cmpy", 0b001, 0b110, 1, 1>;
+def M2_cmpyrs_s1 : T_MType_rr2 <"cmpy", 0b101, 0b110, 1, 1>;
+def M2_cmpyrsc_s0 : T_MType_rr2 <"cmpy", 0b011, 0b110, 1, 1, "*">;
+def M2_cmpyrsc_s1 : T_MType_rr2 <"cmpy", 0b111, 0b110, 1, 1, "*">;
+
+// V4 Instructions
+def M2_vraddh : T_MType_dd <"vraddh", 0b001, 0b111, 0>;
+def M2_mpysu_up : T_MType_rr1 <"mpysu", 0b011, 0b001, 0>;
+def M2_mpy_up_s1 : T_MType_rr1 <"mpy", 0b101, 0b010, 0>;
+def M2_mpy_up_s1_sat : T_MType_rr1 <"mpy", 0b111, 0b000, 1>;
+
+def M2_hmmpyh_s1 : T_MType_rr2 <"mpy", 0b101, 0b000, 1, 0, ".h">;
+def M2_hmmpyl_s1 : T_MType_rr2 <"mpy", 0b101, 0b001, 1, 0, ".l">;
+
+// Multiply by an 8-bit immediate.
+// Assembly form: "$Rd =+ mpyi($Rs, #$u8)" or, when isNeg is set,
+// "$Rd =- mpyi($Rs, #$u8)".
+//   isNeg   - selects the "-" form; encoded in Inst{23}.
+//   ImmOp   - operand class of the immediate $u8 (encoded in Inst{12-5}).
+//   pattern - selection pattern list handed to MInst.
+let hasNewValue = 1, opNewValue = 0 in
+class T_MType_mpy_ri <bit isNeg, Operand ImmOp, list<dag> pattern>
+ : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, ImmOp:$u8),
+ "$Rd ="#!if(isNeg, "- ", "+ ")#"mpyi($Rs, #$u8)" ,
+ pattern, "", M_tc_3x_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<8> u8;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b0000;
+ let Inst{23} = isNeg;
+ let Inst{13} = 0b0;
+ let Inst{4-0} = Rd;
+ let Inst{20-16} = Rs;
+ let Inst{12-5} = u8;
+ }
+
+// Rd=+mpyi(Rs,#u8): marked extendable on operand 2 (u8_0Ext operand class).
+let isExtendable = 1, opExtentBits = 8, opExtendable = 2 in
+def M2_mpysip : T_MType_mpy_ri <0, u8_0Ext, []>;
+
+// Rd=-mpyi(Rs,#u8): plain u8_0Imm operand, not marked extendable.
+def M2_mpysin : T_MType_mpy_ri <1, u8_0Imm, []>;
+
+// Assembler mapped to M2_mpyi
+// Parser-only definition (isAsmParserOnly): it declares operands and an
+// assembly string but sets no encoding bits here.
+let isAsmParserOnly = 1 in
+def M2_mpyui : MInst<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2),
+ "$dst = mpyui($src1, $src2)">;
+
+// Rd=mpyi(Rs,#m9)
+// s9 is NOT the same as m9 - but it works.. so far.
+// Assembler maps to either Rd=+mpyi(Rs,#u8) or Rd=-mpyi(Rs,#u8)
+// depending on the value of m9. See Arch Spec.
+// Parser-only definition: no encoding bits are set here.  It carries the same
+// CextOpcode ("mpyi") as M2_mpyi, with InputType "imm".
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9,
+ CextOpcode = "mpyi", InputType = "imm", hasNewValue = 1,
+ isAsmParserOnly = 1 in
+def M2_mpysmi : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9_0Ext:$src2),
+ "$dst = mpyi($src1, #$src2)", []>, ImmRegRel;
+
+// Accumulating register/immediate template.
+// Assembly form: "$dst <mnemonic>($src2, #$src3)", where mnemonic carries the
+// accumulate operator itself (e.g. "+= mpyi").
+//   mnemonic - operator plus instruction name.
+//   MajOp    - value placed in Inst{25-23}.
+//   ImmOp    - operand class of the 8-bit immediate $src3.
+//   pattern  - selection pattern list (empty by default).
+// $src1 is tied to $dst ("$src1 = $dst"), so it has no encoding field of its
+// own.  Operand 3 ($src3) is marked extendable by the enclosing let.
+let hasNewValue = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 3,
+ InputType = "imm" in
+class T_MType_acc_ri <string mnemonic, bits<3> MajOp, Operand ImmOp,
+ list<dag> pattern = []>
+ : MInst < (outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, ImmOp:$src3),
+ "$dst "#mnemonic#"($src2, #$src3)",
+ pattern, "$src1 = $dst", M_tc_2_SLOT23> {
+ bits<5> dst;
+ bits<5> src2;
+ bits<8> src3;
+
+ let IClass = 0b1110;
+
+ let Inst{27-26} = 0b00;
+ let Inst{25-23} = MajOp;
+ let Inst{20-16} = src2;
+ let Inst{13} = 0b0;
+ let Inst{12-5} = src3;
+ let Inst{4-0} = dst;
+ }
+
+// Accumulating register/register template.
+// Assembly form: "$dst <mnemonic>($src2, [~]$src3)[:<<1][:sat]".
+//   mnemonic - operator plus instruction name (e.g. "+= add").
+//   MajOp    - Inst{23-21};  MinOp - Inst{7-5}.
+//   isSwap   - when set, $src3 is encoded in Inst{20-16} and $src2 in
+//              Inst{12-8}; otherwise the other way round.  The printed
+//              operand order is unaffected.
+//   pattern  - selection pattern list (empty by default).
+//   hasNot   - prints "~" in front of $src3.
+//   isSat    - appends ":sat";  isShift - appends ":<<1".
+// $src1 is tied to $dst ("$src1 = $dst").
+let InputType = "reg", hasNewValue = 1 in
+class T_MType_acc_rr <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit isSwap = 0, list<dag> pattern = [], bit hasNot = 0,
+ bit isSat = 0, bit isShift = 0>
+ : MInst < (outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ "$dst "#mnemonic#"($src2, "#!if(hasNot, "~$src3)","$src3)")
+ #!if(isShift, ":<<1", "")
+ #!if(isSat, ":sat", ""),
+ pattern, "$src1 = $dst", M_tc_2_SLOT23 > {
+ bits<5> dst;
+ bits<5> src2;
+ bits<5> src3;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1111;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = !if(isSwap, src3, src2);
+ let Inst{13} = 0b0;
+ let Inst{12-8} = !if(isSwap, src2, src3);
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = dst;
+ }
+
+// Each CextOpcode group pairs an immediate form (T_MType_acc_ri) with a
+// register form (T_MType_acc_rr); both carry the ImmRegRel mix-in.
+
+// Rx += mpyi(Rs, #u8) / Rx += mpyi(Rs, Rt); itinerary overridden to
+// M_tc_3x_SLOT23.
+let CextOpcode = "MPYI_acc", Itinerary = M_tc_3x_SLOT23 in {
+ def M2_macsip : T_MType_acc_ri <"+= mpyi", 0b010, u8_0Ext, []>, ImmRegRel;
+
+ def M2_maci : T_MType_acc_rr <"+= mpyi", 0b000, 0b000, 0, []>, ImmRegRel;
+}
+
+// Rx += add(Rs, #s8) / Rx += add(Rs, Rt); the immediate form uses a signed
+// extent.
+let CextOpcode = "ADD_acc" in {
+ let isExtentSigned = 1 in
+ def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8_0Ext, []>, ImmRegRel;
+
+ def M2_acci : T_MType_acc_rr <"+= add", 0b000, 0b001, 0, []>, ImmRegRel;
+}
+
+// Rx -= add(Rs, #s8) / Rx -= add(Rs, Rt).
+let CextOpcode = "SUB_acc" in {
+ let isExtentSigned = 1 in
+ def M2_naccii : T_MType_acc_ri <"-= add", 0b101, s8_0Ext>, ImmRegRel;
+
+ def M2_nacci : T_MType_acc_rr <"-= add", 0b100, 0b001, 0>, ImmRegRel;
+}
+
+// Rx -= mpyi(Rs, #u8); no CextOpcode and no ImmRegRel mix-in.
+let Itinerary = M_tc_3x_SLOT23 in
+def M2_macsin : T_MType_acc_ri <"-= mpyi", 0b011, u8_0Ext>;
+
+// Rx ^= xor(Rs, Rt)
+def M2_xor_xacc : T_MType_acc_rr < "^= xor", 0b100, 0b011, 0>;
+// Rx += sub(Rs, Rt): isSwap = 1, so the two sources swap encoding fields.
+def M2_subacc : T_MType_acc_rr <"+= sub", 0b000, 0b011, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- XType Vector Instructions
+//
+// Assembly form: "$Rdd = <opc>($Rss, $Rtt)" or, when isConj is set,
+// "$Rdd = <opc>($Rss, $Rtt*)".  isConj only affects the printed string; the
+// encoding comes from MajOp (Inst{23-21}) and MinOp (Inst{7-5}).
+//===----------------------------------------------------------------------===//
+class T_XTYPE_Vect < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
+ : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Rdd = "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
+ [] > {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1000;
+ let Inst{23-21} = MajOp;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rdd;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ }
+
+// Accumulating counterpart of T_XTYPE_Vect.
+// Assembly form: "$Rdd += <opc>($Rss, $Rtt[*])".  The accumulator input
+// $dst2 is tied to the output by "$dst2 = $Rdd".  Inst{27-24} is 0b1010
+// (0b1000 in T_XTYPE_Vect).
+class T_XTYPE_Vect_acc < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
+ : MInst <(outs DoubleRegs:$Rdd),
+ (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Rdd += "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
+ [], "$dst2 = $Rdd",M_tc_3x_SLOT23 > {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1010;
+ let Inst{23-21} = MajOp;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rdd;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ }
+
+// Vector difference template.
+// Assembly form: "$Rdd = <opc>($Rtt, $Rss)" -- note that $Rtt is printed (and
+// listed in the ins dag) before $Rss, while the encoding still places Rss in
+// Inst{20-16} and Rtt in Inst{12-8}.  Inst{7-5} is fixed at 0b000; MajOp goes
+// to Inst{23-21}.
+class T_XTYPE_Vect_diff < bits<3> MajOp, string opc >
+ : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rtt, DoubleRegs:$Rss),
+ "$Rdd = "#opc#"($Rtt, $Rss)",
+ [], "",M_tc_2_SLOT23 > {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1000;
+ let Inst{23-21} = MajOp;
+ let Inst{7-5} = 0b000;
+ let Inst{4-0} = Rdd;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ }
+
+// Argument order: T_XTYPE_Vect[_acc] <opc, MajOp, MinOp, isConj>;
+//                 T_XTYPE_Vect_diff <MajOp, opc>.
+// Each plain def is paired with an "_acc"/"mac" def that uses the same
+// MajOp/MinOp with the accumulating class.
+
+// Vector reduce add unsigned bytes: Rdd[+]=vraddub(Rss,Rtt)
+def A2_vraddub: T_XTYPE_Vect <"vraddub", 0b010, 0b001, 0>;
+def A2_vraddub_acc: T_XTYPE_Vect_acc <"vraddub", 0b010, 0b001, 0>;
+
+// Vector sum of absolute differences unsigned bytes: Rdd[+]=vrsadub(Rss,Rtt)
+def A2_vrsadub: T_XTYPE_Vect <"vrsadub", 0b010, 0b010, 0>;
+def A2_vrsadub_acc: T_XTYPE_Vect_acc <"vrsadub", 0b010, 0b010, 0>;
+
+// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss)
+def M2_vabsdiffh: T_XTYPE_Vect_diff<0b011, "vabsdiffh">;
+
+// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
+def M2_vabsdiffw: T_XTYPE_Vect_diff<0b001, "vabsdiffw">;
+
+// Vector reduce complex multiply real or imaginary:
+// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
+// The "_s0c" defs pass isConj = 1, which prints the "*" after $Rtt.
+def M2_vrcmpyi_s0: T_XTYPE_Vect <"vrcmpyi", 0b000, 0b000, 0>;
+def M2_vrcmpyi_s0c: T_XTYPE_Vect <"vrcmpyi", 0b010, 0b000, 1>;
+def M2_vrcmaci_s0: T_XTYPE_Vect_acc <"vrcmpyi", 0b000, 0b000, 0>;
+def M2_vrcmaci_s0c: T_XTYPE_Vect_acc <"vrcmpyi", 0b010, 0b000, 1>;
+
+def M2_vrcmpyr_s0: T_XTYPE_Vect <"vrcmpyr", 0b000, 0b001, 0>;
+def M2_vrcmpyr_s0c: T_XTYPE_Vect <"vrcmpyr", 0b011, 0b001, 1>;
+def M2_vrcmacr_s0: T_XTYPE_Vect_acc <"vrcmpyr", 0b000, 0b001, 0>;
+def M2_vrcmacr_s0c: T_XTYPE_Vect_acc <"vrcmpyr", 0b011, 0b001, 1>;
+
+// Vector reduce halfwords:
+// Rdd[+]=vrmpyh(Rss,Rtt)
+def M2_vrmpy_s0: T_XTYPE_Vect <"vrmpyh", 0b000, 0b010, 0>;
+def M2_vrmac_s0: T_XTYPE_Vect_acc <"vrmpyh", 0b000, 0b010, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- Vector Multiply with accumulation.
+// Used for complex multiply real or imaginary, dual multiply and even halfwords
+//
+// Assembly form: "$Rxx += <opc>($Rss, $Rtt)[:<<1][:rnd]:sat" -- ":sat" is
+// always printed.  hasShift and isRnd only choose the printed suffixes; the
+// encoding comes from MajOp (Inst{23-21}) and MinOp (Inst{7-5}).
+// Every instance is declared as defining USR_OVF by the enclosing let, and
+// $dst2 is tied to the output by "$dst2 = $Rxx".
+//===----------------------------------------------------------------------===//
+let Defs = [USR_OVF] in
+class T_M2_vmpy_acc_sat < string opc, bits<3> MajOp, bits<3> MinOp,
+ bit hasShift, bit isRnd >
+ : MInst <(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
+ #!if(isRnd,":rnd","")#":sat",
+ [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
+ bits<5> Rxx;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1010;
+ let Inst{23-21} = MajOp;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rxx;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ }
+
+// Non-saturating counterpart of T_M2_vmpy_acc_sat: same operands and encoding
+// layout, but ":sat" is never printed and USR_OVF is not listed in Defs.
+// Assembly form: "$Rxx += <opc>($Rss, $Rtt)[:<<1][:rnd]".
+class T_M2_vmpy_acc < string opc, bits<3> MajOp, bits<3> MinOp,
+ bit hasShift, bit isRnd >
+ : MInst <(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
+ #!if(isRnd,":rnd",""),
+ [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
+ bits<5> Rxx;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1010;
+ let Inst{23-21} = MajOp;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rxx;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ }
+
+// Argument order for every def below: <opc, MajOp, MinOp, hasShift, isRnd>.
+// Defs built on T_M2_vmpy_acc_sat always print ":sat".
+
+// Vector multiply word by signed half with accumulation
+// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmacls_s1: T_M2_vmpy_acc_sat <"vmpyweh", 0b100, 0b101, 1, 0>;
+def M2_mmacls_s0: T_M2_vmpy_acc_sat <"vmpyweh", 0b000, 0b101, 0, 0>;
+def M2_mmacls_rs1: T_M2_vmpy_acc_sat <"vmpyweh", 0b101, 0b101, 1, 1>;
+def M2_mmacls_rs0: T_M2_vmpy_acc_sat <"vmpyweh", 0b001, 0b101, 0, 1>;
+
+def M2_mmachs_s1: T_M2_vmpy_acc_sat <"vmpywoh", 0b100, 0b111, 1, 0>;
+def M2_mmachs_s0: T_M2_vmpy_acc_sat <"vmpywoh", 0b000, 0b111, 0, 0>;
+def M2_mmachs_rs1: T_M2_vmpy_acc_sat <"vmpywoh", 0b101, 0b111, 1, 1>;
+def M2_mmachs_rs0: T_M2_vmpy_acc_sat <"vmpywoh", 0b001, 0b111, 0, 1>;
+
+// Vector multiply word by unsigned half with accumulation
+// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmaculs_s1: T_M2_vmpy_acc_sat <"vmpyweuh", 0b110, 0b101, 1, 0>;
+def M2_mmaculs_s0: T_M2_vmpy_acc_sat <"vmpyweuh", 0b010, 0b101, 0, 0>;
+def M2_mmaculs_rs1: T_M2_vmpy_acc_sat <"vmpyweuh", 0b111, 0b101, 1, 1>;
+def M2_mmaculs_rs0: T_M2_vmpy_acc_sat <"vmpyweuh", 0b011, 0b101, 0, 1>;
+
+def M2_mmacuhs_s1: T_M2_vmpy_acc_sat <"vmpywouh", 0b110, 0b111, 1, 0>;
+def M2_mmacuhs_s0: T_M2_vmpy_acc_sat <"vmpywouh", 0b010, 0b111, 0, 0>;
+def M2_mmacuhs_rs1: T_M2_vmpy_acc_sat <"vmpywouh", 0b111, 0b111, 1, 1>;
+def M2_mmacuhs_rs0: T_M2_vmpy_acc_sat <"vmpywouh", 0b011, 0b111, 0, 1>;
+
+// Vector multiply even halfwords with accumulation
+// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
+// M2_vmac2es is the only def here built on the non-saturating class.
+def M2_vmac2es: T_M2_vmpy_acc <"vmpyeh", 0b001, 0b010, 0, 0>;
+def M2_vmac2es_s1: T_M2_vmpy_acc_sat <"vmpyeh", 0b100, 0b110, 1, 0>;
+def M2_vmac2es_s0: T_M2_vmpy_acc_sat <"vmpyeh", 0b000, 0b110, 0, 0>;
+
+// Vector dual multiply with accumulation
+// Rxx+=vdmpy(Rss,Rtt)[:<<1]:sat
+def M2_vdmacs_s1: T_M2_vmpy_acc_sat <"vdmpy", 0b100, 0b100, 1, 0>;
+def M2_vdmacs_s0: T_M2_vmpy_acc_sat <"vdmpy", 0b000, 0b100, 0, 0>;
+
+// Vector complex multiply real or imaginary with accumulation
+// Rxx+=vcmpy[ir](Rss,Rtt):sat
+def M2_vcmac_s0_sat_r: T_M2_vmpy_acc_sat <"vcmpyr", 0b001, 0b100, 0, 0>;
+def M2_vcmac_s0_sat_i: T_M2_vmpy_acc_sat <"vcmpyi", 0b010, 0b100, 0, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- Multiply signed/unsigned halfwords into a 64-bit result,
+// with and without rounding.  (This class has no saturating form: no ":sat"
+// suffix is ever emitted.)
+//Rdd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd]
+//
+// Template arguments:
+//   LHbits     - halfword selectors: bit 1 for $Rs, bit 0 for $Rt
+//                ("h" when set, "l" otherwise); encoded in Inst{6-5}.
+//   isRnd      - appends ":rnd";  encoded in Inst{21}.
+//   hasShift   - appends ":<<1";  encoded in Inst{23}.
+//   isUnsigned - selects "mpyu" over "mpy"; encoded in Inst{22}.
+//===----------------------------------------------------------------------===//
+class T_M2_mpyd < bits<2> LHbits, bit isRnd, bit hasShift, bit isUnsigned >
+ : MInst < (outs DoubleRegs:$Rdd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rdd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
+ #", $Rt."#!if(LHbits{0},"h)","l)")
+ #!if(hasShift,":<<1","")
+ #!if(isRnd,":rnd",""),
+ [] > {
+ bits<5> Rdd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b0100;
+ let Inst{23} = hasShift;
+ let Inst{22} = isUnsigned;
+ let Inst{21} = isRnd;
+ let Inst{6-5} = LHbits;
+ let Inst{4-0} = Rdd;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+}
+
+// Argument order for every def below:
+//   T_M2_mpyd<LHbits, isRnd, hasShift, isUnsigned>
+
+//Rdd=mpy(Rs.[HL],Rt.[HL])
+def M2_mpyd_hh_s0: T_M2_mpyd<0b11, 0, 0, 0>;
+def M2_mpyd_hl_s0: T_M2_mpyd<0b10, 0, 0, 0>;
+def M2_mpyd_lh_s0: T_M2_mpyd<0b01, 0, 0, 0>;
+def M2_mpyd_ll_s0: T_M2_mpyd<0b00, 0, 0, 0>;
+
+//Rdd=mpy(Rs.[HL],Rt.[HL]):<<1
+def M2_mpyd_hh_s1: T_M2_mpyd<0b11, 0, 1, 0>;
+def M2_mpyd_hl_s1: T_M2_mpyd<0b10, 0, 1, 0>;
+def M2_mpyd_lh_s1: T_M2_mpyd<0b01, 0, 1, 0>;
+def M2_mpyd_ll_s1: T_M2_mpyd<0b00, 0, 1, 0>;
+
+//Rdd=mpy(Rs.[HL],Rt.[HL]):rnd
+def M2_mpyd_rnd_hh_s0: T_M2_mpyd<0b11, 1, 0, 0>;
+def M2_mpyd_rnd_hl_s0: T_M2_mpyd<0b10, 1, 0, 0>;
+def M2_mpyd_rnd_lh_s0: T_M2_mpyd<0b01, 1, 0, 0>;
+def M2_mpyd_rnd_ll_s0: T_M2_mpyd<0b00, 1, 0, 0>;
+
+//Rdd=mpy(Rs.[HL],Rt.[HL]):<<1:rnd
+def M2_mpyd_rnd_hh_s1: T_M2_mpyd<0b11, 1, 1, 0>;
+def M2_mpyd_rnd_hl_s1: T_M2_mpyd<0b10, 1, 1, 0>;
+def M2_mpyd_rnd_lh_s1: T_M2_mpyd<0b01, 1, 1, 0>;
+def M2_mpyd_rnd_ll_s1: T_M2_mpyd<0b00, 1, 1, 0>;
+
+//Rdd=mpyu(Rs.[HL],Rt.[HL])[:<<1]
+def M2_mpyud_hh_s0: T_M2_mpyd<0b11, 0, 0, 1>;
+def M2_mpyud_hl_s0: T_M2_mpyd<0b10, 0, 0, 1>;
+def M2_mpyud_lh_s0: T_M2_mpyd<0b01, 0, 0, 1>;
+def M2_mpyud_ll_s0: T_M2_mpyd<0b00, 0, 0, 1>;
+
+def M2_mpyud_hh_s1: T_M2_mpyd<0b11, 0, 1, 1>;
+def M2_mpyud_hl_s1: T_M2_mpyd<0b10, 0, 1, 1>;
+def M2_mpyud_lh_s1: T_M2_mpyd<0b01, 0, 1, 1>;
+def M2_mpyud_ll_s1: T_M2_mpyd<0b00, 0, 1, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template Class for xtype mpy:
+// Vector multiply
+// Complex multiply
+// multiply 32X32 and use full result
+//===----------------------------------------------------------------------===//
+// Parameters:
+//   mnemonic - operation name placed in the assembly string.
+//   MajOp    - encoded in Inst{23-21}.
+//   MinOp    - encoded in Inst{7-5}.
+//   isSat    - appends ":sat" to the assembly string.
+//   hasShift - appends ":<<1" to the assembly string.
+//   isConj   - prints the second operand as "$Rt*" instead of "$Rt".
+// isSat/hasShift/isConj only shape the assembly string; the encoding is
+// fully determined by MajOp/MinOp and the register operands.
+let hasSideEffects = 0 in
+class T_XTYPE_mpy64 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit isSat, bit hasShift, bit isConj>
+ : MInst <(outs DoubleRegs:$Rdd),
+ (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rdd = "#mnemonic#"($Rs, $Rt"#!if(isConj,"*)",")")
+ #!if(hasShift,":<<1","")
+ #!if(isSat,":sat",""),
+ [] > {
+ bits<5> Rdd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b0101;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rdd;
+ }
+
+//===----------------------------------------------------------------------===//
+// Template Class for xtype mpy with accumulation into 64-bit:
+// Vector multiply
+// Complex multiply
+// multiply 32X32 and use full result
+//===----------------------------------------------------------------------===//
+// Parameters:
+//   op1      - operation name placed in the assembly string.
+//   op2      - accumulate operator printed before "=" (the defs in this file
+//              pass "+" or "-").
+//   MajOp    - encoded in Inst{23-21}.
+//   MinOp    - encoded in Inst{7-5}.
+//   isSat    - appends ":sat" to the assembly string.
+//   hasShift - appends ":<<1" to the assembly string.
+//   isConj   - prints the second operand as "$Rt*" instead of "$Rt".
+// The accumulator input $dst2 is tied to the output $Rxx by the
+// "$dst2 = $Rxx" constraint, so only Rxx appears in the encoding.
+class T_XTYPE_mpy64_acc <string op1, string op2, bits<3> MajOp, bits<3> MinOp,
+ bit isSat, bit hasShift, bit isConj>
+ : MInst <(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+ "$Rxx "#op2#"= "#op1#"($Rs, $Rt"#!if(isConj,"*)",")")
+ #!if(hasShift,":<<1","")
+ #!if(isSat,":sat",""),
+
+ [] , "$dst2 = $Rxx" > {
+ bits<5> Rxx;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b0111;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rxx;
+ }
+
+// MPY - Multiply and use full result
+// Rdd = mpy[u](Rs,Rt)
+// Trailing template arguments of T_XTYPE_mpy64 / T_XTYPE_mpy64_acc are
+// <..., MajOp, MinOp, isSat, hasShift, isConj>.
+def M2_dpmpyss_s0 : T_XTYPE_mpy64 < "mpy", 0b000, 0b000, 0, 0, 0>;
+def M2_dpmpyuu_s0 : T_XTYPE_mpy64 < "mpyu", 0b010, 0b000, 0, 0, 0>;
+
+// Rxx[+-]= mpy[u](Rs,Rt)
+def M2_dpmpyss_acc_s0 : T_XTYPE_mpy64_acc < "mpy", "+", 0b000, 0b000, 0, 0, 0>;
+def M2_dpmpyss_nac_s0 : T_XTYPE_mpy64_acc < "mpy", "-", 0b001, 0b000, 0, 0, 0>;
+def M2_dpmpyuu_acc_s0 : T_XTYPE_mpy64_acc < "mpyu", "+", 0b010, 0b000, 0, 0, 0>;
+def M2_dpmpyuu_nac_s0 : T_XTYPE_mpy64_acc < "mpyu", "-", 0b011, 0b000, 0, 0, 0>;
+
+// Complex multiply real or imaginary
+// Rdd=cmpy[ir](Rs,Rt)
+def M2_cmpyi_s0 : T_XTYPE_mpy64 < "cmpyi", 0b000, 0b001, 0, 0, 0>;
+def M2_cmpyr_s0 : T_XTYPE_mpy64 < "cmpyr", 0b000, 0b010, 0, 0, 0>;
+
+// Rxx+=cmpy[ir](Rs,Rt)
+def M2_cmaci_s0 : T_XTYPE_mpy64_acc < "cmpyi", "+", 0b000, 0b001, 0, 0, 0>;
+def M2_cmacr_s0 : T_XTYPE_mpy64_acc < "cmpyr", "+", 0b000, 0b010, 0, 0, 0>;
+
+// Complex multiply
+// Rdd=cmpy(Rs,Rt)[:<<1]:sat
+def M2_cmpys_s0 : T_XTYPE_mpy64 < "cmpy", 0b000, 0b110, 1, 0, 0>;
+def M2_cmpys_s1 : T_XTYPE_mpy64 < "cmpy", 0b100, 0b110, 1, 1, 0>;
+
+// Rdd=cmpy(Rs,Rt*)[:<<1]:sat
+def M2_cmpysc_s0 : T_XTYPE_mpy64 < "cmpy", 0b010, 0b110, 1, 0, 1>;
+def M2_cmpysc_s1 : T_XTYPE_mpy64 < "cmpy", 0b110, 0b110, 1, 1, 1>;
+
+// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
+def M2_cmacs_s0 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b000, 0b110, 1, 0, 0>;
+def M2_cnacs_s0 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b000, 0b111, 1, 0, 0>;
+def M2_cmacs_s1 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b100, 0b110, 1, 1, 0>;
+def M2_cnacs_s1 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b100, 0b111, 1, 1, 0>;
+
+// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
+def M2_cmacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b010, 0b110, 1, 0, 1>;
+def M2_cnacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b010, 0b111, 1, 0, 1>;
+def M2_cmacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b110, 0b110, 1, 1, 1>;
+def M2_cnacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b110, 0b111, 1, 1, 1>;
+
+// Vector multiply halfwords
+// Rdd=vmpyh(Rs,Rt)[:<<1]:sat
+// NOTE(review): the USR_OVF Defs wrapper below is commented out, so these two
+// saturating defs do not list USR_OVF - confirm whether that is intentional.
+//let Defs = [USR_OVF] in {
+ def M2_vmpy2s_s1 : T_XTYPE_mpy64 < "vmpyh", 0b100, 0b101, 1, 1, 0>;
+ def M2_vmpy2s_s0 : T_XTYPE_mpy64 < "vmpyh", 0b000, 0b101, 1, 0, 0>;
+//}
+
+// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
+def M2_vmac2 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b001, 0b001, 0, 0, 0>;
+def M2_vmac2s_s1 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b100, 0b101, 1, 1, 0>;
+def M2_vmac2s_s0 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b000, 0b101, 1, 0, 0>;
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VB +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VB -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VH +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ST +
+//===----------------------------------------------------------------------===//
+///
+// Store doubleword.
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated post increment stores with immediate offset
+//===----------------------------------------------------------------------===//
+// Parameters:
+//   mnemonic - store mnemonic printed in the assembly string.
+//   RC       - register class of the stored value ($src2).
+//   ImmOp    - immediate operand type of $offset; its name also selects which
+//              offset bits are encoded.
+//   MajOp    - encoded in Inst{24-21}.
+//   isHalf   - appends ".h" to the stored register in the assembly string.
+// The updated base register is returned in $_dst_, tied to $src1.
+let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+ bits<4> MajOp, bit isHalf >
+ : STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
+ mnemonic#"($src1++#$offset) = $src2"#!if(isHalf, ".h", ""),
+ [], "$src1 = $_dst_" >,
+ AddrModeRel {
+ bits<5> src1;
+ bits<5> src2;
+ bits<7> offset;
+ bits<4> offsetBits;
+
+ // Pick the four offset bits that are encoded, keyed on the operand name:
+ // s4_NImm encodes offset{N+3 .. N}.
+ string ImmOpStr = !cast<string>(ImmOp);
+ let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+ !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+ !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+ /* s4_0Imm */ offset{3-0})));
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
+
+ let IClass = 0b1010;
+
+ let Inst{27-25} = 0b101;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = src1;
+ let Inst{13} = 0b0;
+ let Inst{12-8} = src2;
+ let Inst{7} = 0b0;
+ let Inst{6-3} = offsetBits;
+ let Inst{1} = 0b0;
+ }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated post increment stores with immediate offset
+//===----------------------------------------------------------------------===//
+// Parameters:
+//   mnemonic  - store mnemonic printed in the assembly string.
+//   RC        - register class of the stored value ($src3).
+//   ImmOp     - immediate operand type of $offset; its name also selects
+//               which offset bits are encoded.
+//   MajOp     - encoded in Inst{24-21}.
+//   isHalf    - appends ".h" to the stored register in the assembly string.
+//   isPredNot - prints "if (!$src1" and sets Inst{2}.
+//   isPredNew - prints ".new" after the predicate and sets Inst{7}.
+// The updated base register is returned in $_dst_, tied to $src2.
+let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+ bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew>
+ : STInst <(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
+ !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+ ") ")#mnemonic#"($src2++#$offset) = $src3"#!if(isHalf, ".h", ""),
+ [], "$src2 = $_dst_" >,
+ AddrModeRel {
+ bits<2> src1;
+ bits<5> src2;
+ bits<7> offset;
+ bits<5> src3;
+ bits<4> offsetBits;
+
+ // Pick the four offset bits that are encoded, keyed on the operand name:
+ // s4_NImm encodes offset{N+3 .. N}.
+ string ImmOpStr = !cast<string>(ImmOp);
+ let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+ !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+ !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+ /* s4_0Imm */ offset{3-0})));
+
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
+ let isPredicatedNew = isPredNew;
+ let isPredicatedFalse = isPredNot;
+
+ let IClass = 0b1010;
+
+ let Inst{27-25} = 0b101;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = src2;
+ let Inst{13} = 0b1;
+ let Inst{12-8} = src3;
+ let Inst{7} = isPredNew;
+ let Inst{6-3} = offsetBits;
+ let Inst{2} = isPredNot;
+ let Inst{1-0} = src1;
+ }
+
+// Emits the five post-increment store variants for one store kind:
+// the unpredicated form plus the true/false predicated forms, each with and
+// without a ".new" predicate. All five share BaseOpcode "POST_"#BaseOp.
+multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
+ Operand ImmOp, bits<4> MajOp, bit isHalf = 0 > {
+
+ let BaseOpcode = "POST_"#BaseOp in {
+ def S2_#NAME#_pi : T_store_pi <mnemonic, RC, ImmOp, MajOp, isHalf>;
+
+ // Predicated
+ // Trailing arguments are <..., isPredNot, isPredNew>.
+ def S2_p#NAME#t_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 0, 0>;
+ def S2_p#NAME#f_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 1, 0>;
+
+ // Predicated new
+ def S2_p#NAME#tnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
+ isHalf, 0, 1>;
+ def S2_p#NAME#fnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
+ isHalf, 1, 1>;
+ }
+}
+
+// Post-increment store instantiations, one per access size.
+let accessSize = ByteAccess in
+defm storerb: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm, 0b1000>;
+
+let accessSize = HalfWordAccess in
+defm storerh: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm, 0b1010>;
+
+let accessSize = WordAccess in
+defm storeri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm, 0b1100>;
+
+let accessSize = DoubleWordAccess in
+defm storerd: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm, 0b1110>;
+
+// Halfword store printed with the ".h" register suffix (isHalf = 1).
+let accessSize = HalfWordAccess, isNVStorable = 0 in
+defm storerf: ST_PostInc <"memh", "STrih_H", IntRegs, s4_1Imm, 0b1011, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template class for post increment stores with register offset.
+//===----------------------------------------------------------------------===//
+// Parameters:
+//   mnemonic - store mnemonic printed in the assembly string; "memd" also
+//              clears isNVStorable.
+//   RC       - register class of the stored value ($src3).
+//   MajOp    - encoded in Inst{23-21}.
+//   AccessSz - value assigned to accessSize.
+//   isHalf   - appends ".h" to the stored register and clears isNVStorable.
+// The updated base register is returned in $_dst_, tied to $src1.
+class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp,
+ MemAccessSize AccessSz, bit isHalf = 0>
+ : STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ModRegs:$src2, RC:$src3),
+ mnemonic#"($src1++$src2) = $src3"#!if(isHalf, ".h", ""),
+ [], "$src1 = $_dst_" > {
+ bits<5> src1;
+ bits<1> src2;
+ bits<5> src3;
+ let accessSize = AccessSz;
+
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1));
+
+ let IClass = 0b1010;
+
+ let Inst{27-24} = 0b1101;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = src1;
+ let Inst{13} = src2;
+ let Inst{12-8} = src3;
+ let Inst{7} = 0b0;
+ }
+
+// Post-increment-by-register stores: mem[bhwd]($src1++$src2) = $src3[.h]
+def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>;
+def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>;
+def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>;
+def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>;
+def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>;
+
+// Template class for non-predicated base + immediate offset stores.
+// Parameters:
+//   mnemonic - store mnemonic printed in the assembly string; "memd" also
+//              clears isNVStorable.
+//   RC       - register class of the stored value ($src3).
+//   ImmOp    - immediate operand type of $src2; its name selects
+//              opExtentBits and which offset bits are encoded.
+//   MajOp    - encoded in Inst{23-21}.
+//   isH      - appends ".h" to the stored register and clears isNVStorable.
+// opExtendable = 1 names input operand 1, i.e. the offset $src2.
+let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
+class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp,
+ bits<3> MajOp, bit isH = 0>
+ : STInst <(outs),
+ (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>,
+ AddrModeRel, ImmRegRel {
+ bits<5> src1;
+ bits<14> src2; // Actual address offset
+ bits<5> src3;
+ bits<11> offsetBits; // Represents offset encoding
+
+ string ImmOpStr = !cast<string>(ImmOp);
+
+ // s11_NExt: the extent is 11+N bits wide and src2{N+10 .. N} is encoded.
+ let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
+ !if (!eq(ImmOpStr, "s11_2Ext"), 13,
+ !if (!eq(ImmOpStr, "s11_1Ext"), 12,
+ /* s11_0Ext */ 11)));
+ let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), src2{13-3},
+ !if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2},
+ !if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1},
+ /* s11_0Ext */ src2{10-0})));
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+ let IClass = 0b1010;
+
+ // The 11 encoded offset bits are split across Inst{26-25}, Inst{13} and
+ // Inst{7-0}.
+ let Inst{27} = 0b0;
+ let Inst{26-25} = offsetBits{10-9};
+ let Inst{24} = 0b1;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = src1;
+ let Inst{13} = offsetBits{8};
+ let Inst{12-8} = src3;
+ let Inst{7-0} = offsetBits{7-0};
+ }
+
+// Template class for predicated base + immediate offset stores.
+// Parameters:
+//   mnemonic  - store mnemonic printed in the assembly string; "memd" also
+//               clears isNVStorable.
+//   RC        - register class of the stored value ($src4).
+//   ImmOp     - immediate operand type of $src3; its name selects
+//               opExtentBits and which offset bits are encoded.
+//   MajOp     - encoded in Inst{23-21}.
+//   PredNot   - prints "if (!$src1" and sets Inst{26}.
+//   isPredNew - prints ".new" after the predicate and sets Inst{25}.
+//   isH       - appends ".h" to the stored register and clears isNVStorable.
+// opExtendable = 2 names input operand 2, i.e. the offset $src3.
+let opExtendable = 2, isPredicated = 1 in
+class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp,
+ bits<3>MajOp, bit PredNot, bit isPredNew, bit isH = 0>
+ : STInst <(outs),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+ !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+ ") ")#mnemonic#"($src2+#$src3) = $src4"#!if(isH,".h",""),
+ [],"",V2LDST_tc_st_SLOT01 >,
+ AddrModeRel, ImmRegRel {
+ bits<2> src1;
+ bits<5> src2;
+ bits<9> src3; // Actual address offset
+ bits<5> src4;
+ bits<6> offsetBits; // Represents offset encoding
+
+ let isPredicatedNew = isPredNew;
+ let isPredicatedFalse = PredNot;
+
+ // u6_NExt: the extent is 6+N bits wide and src3{N+5 .. N} is encoded.
+ string ImmOpStr = !cast<string>(ImmOp);
+ let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
+ !if (!eq(ImmOpStr, "u6_2Ext"), 8,
+ !if (!eq(ImmOpStr, "u6_1Ext"), 7,
+ /* u6_0Ext */ 6)));
+ let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), src3{8-3},
+ !if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2},
+ !if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1},
+ /* u6_0Ext */ src3{5-0})));
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+
+ let IClass = 0b0100;
+
+ // The six encoded offset bits are split across Inst{13} and Inst{7-3}.
+ let Inst{27} = 0b0;
+ let Inst{26} = PredNot;
+ let Inst{25} = isPredNew;
+ let Inst{24} = 0b0;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = src2;
+ let Inst{13} = offsetBits{5};
+ let Inst{12-8} = src4;
+ let Inst{7-3} = offsetBits{4-0};
+ let Inst{1-0} = src1;
+ }
+
+// Emits the five base + immediate offset store variants for one store kind:
+// the unpredicated form (using ImmOp) plus the true/false predicated forms
+// (using predImmOp), each with and without a ".new" predicate. The ".new"
+// forms are named with the S4_ prefix, the others with S2_.
+let isExtendable = 1, hasSideEffects = 0 in
+multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+ Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+ def S2_#NAME#_io : T_store_io <mnemonic, RC, ImmOp, MajOp, isH>;
+
+ // Predicated
+ // Trailing arguments are <..., PredNot, isPredNew, isH>.
+ def S2_p#NAME#t_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 0, 0, isH>;
+ def S2_p#NAME#f_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 1, 0, isH>;
+
+ // Predicated new
+ def S4_p#NAME#tnew_io : T_pstore_io <mnemonic, RC, predImmOp,
+ MajOp, 0, 1, isH>;
+ def S4_p#NAME#fnew_io : T_pstore_io <mnemonic, RC, predImmOp,
+ MajOp, 1, 1, isH>;
+ }
+}
+
+// Base + immediate offset store instantiations, one per access size.
+// opExtentAlign is set to 1, 2 and 3 for the halfword, word and doubleword
+// forms; the byte form does not set it.
+let addrMode = BaseImmOffset, InputType = "imm" in {
+ let accessSize = ByteAccess in
+ defm storerb: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext, u6_0Ext, 0b000>;
+
+ let accessSize = HalfWordAccess, opExtentAlign = 1 in
+ defm storerh: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext, u6_1Ext, 0b010>;
+
+ let accessSize = WordAccess, opExtentAlign = 2 in
+ defm storeri: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext, u6_2Ext, 0b100>;
+
+ let accessSize = DoubleWordAccess, isNVStorable = 0, opExtentAlign = 3 in
+ defm storerd: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
+ u6_3Ext, 0b110>;
+
+ // Halfword store printed with the ".h" register suffix (isH = 1).
+ let accessSize = HalfWordAccess, opExtentAlign = 1 in
+ defm storerf: ST_Idxd < "memh", "STrif", IntRegs, s11_1Ext,
+ u6_1Ext, 0b011, 1>;
+}
+
+// Store predicate.
+// Codegen-only pseudo: takes a base register, an s11_2Ext offset and a
+// predicate register. No encoding is assigned, and the assembly string is an
+// ".error" directive.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_pred : STInst<(outs),
+ (ins IntRegs:$addr, s11_2Ext:$off, PredRegs:$src1),
+ ".error \"should not emit\"", []>;
+// Store modifier.
+// Same shape as STriw_pred, with a ModRegs source instead of PredRegs.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_mod : STInst<(outs),
+ (ins IntRegs:$addr, s11_2Ext:$off, ModRegs:$src1),
+ ".error \"should not emit\"", []>;
+
+// S2_allocframe: Allocate stack frame.
+// Declared as defining R29/R30 and using R29/R30/R31, with a doubleword
+// access size.
+let Defs = [R29, R30], Uses = [R29, R31, R30],
+ hasSideEffects = 0, accessSize = DoubleWordAccess in
+def S2_allocframe: ST0Inst <
+ (outs), (ins u11_3Imm:$u11_3),
+ "allocframe(#$u11_3)" > {
+ bits<14> u11_3;
+
+ let IClass = 0b1010;
+ let Inst{27-16} = 0b000010011101;
+ let Inst{13-11} = 0b000;
+ // Only bits 13-3 of the operand are encoded; the low three bits are dropped.
+ let Inst{10-0} = u11_3{13-3};
+ }
+
+// S2_storer[bhwdf]_pci: Store byte/half/word/double.
+// S2_storer[bhwdf]_pci -> S2_storerbnew_pci
+// Template class for circular stores with an immediate increment.
+// Parameters:
+//   mnemonic  - store mnemonic printed in the assembly string; "memd" also
+//               clears isNVStorable.
+//   RC        - register class of the stored value ($Rt).
+//   Imm       - immediate operand type of $offset.
+//   MajOp     - encoded in Inst{24-21}.
+//   AlignSize - value assigned to accessSize; its name also selects which
+//               four offset bits are encoded in Inst{6-3}.
+//   RegSrc    - operand text printed after "$" on the right-hand side
+//               ("Rt" by default; "Rt.h" also clears isNVStorable).
+// The updated base register is returned in $_dst_, tied to $Rz.
+let Uses = [CS], addrMode = PostInc in
+class T_store_pci <string mnemonic, RegisterClass RC,
+ Operand Imm, bits<4>MajOp,
+ MemAccessSize AlignSize, string RegSrc = "Rt">
+ : STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, RC:$Rt),
+ #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $"#RegSrc#"",
+ [] ,
+ "$Rz = $_dst_" > {
+ bits<5> Rz;
+ bits<7> offset;
+ bits<1> Mu;
+ bits<5> Rt;
+ let accessSize = AlignSize;
+ let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
+ !if(!eq(RegSrc,"Rt.h"), 0, 1));
+
+ let IClass = 0b1010;
+ let Inst{27-25} = 0b100;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12-8} = Rt;
+ let Inst{7} = 0b0;
+ // Doubleword, word, halfword and byte forms encode offset{6-3}, {5-2},
+ // {4-1} and {3-0} respectively.
+ let Inst{6-3} =
+ !if (!eq(!cast<string>(AlignSize), "DoubleWordAccess"), offset{6-3},
+ !if (!eq(!cast<string>(AlignSize), "WordAccess"), offset{5-2},
+ !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"), offset{4-1},
+ /* ByteAccess */ offset{3-0})));
+ let Inst{1} = 0b0;
+ }
+
+// mem[bhwd]($Rz ++ #$offset:circ($Mu)) = $Rt[.h]
+def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000,
+ ByteAccess>;
+def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010,
+ HalfWordAccess>;
+def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011,
+ HalfWordAccess, "Rt.h">;
+def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100,
+ WordAccess>;
+def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110,
+ DoubleWordAccess>;
+
+// Template class for circular .new stores with an immediate increment.
+// Parameters:
+//   mnemonic  - store mnemonic printed in the assembly string.
+//   Imm       - immediate operand type of $offset.
+//   MajOp     - encoded in Inst{12-11}.
+//   AlignSize - value assigned to accessSize; its name also selects which
+//               four offset bits are encoded in Inst{6-3}.
+// opNewValue = 4 is the position of $Nt when the output $_dst_ is counted as
+// operand 0. $Nt is encoded in three bits (Inst{10-8}).
+let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4,
+ addrMode = PostInc in
+class T_storenew_pci <string mnemonic, Operand Imm,
+ bits<2>MajOp, MemAccessSize AlignSize>
+ : NVInst < (outs IntRegs:$_dst_),
+ (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, IntRegs:$Nt),
+ #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $Nt.new",
+ [],
+ "$Rz = $_dst_"> {
+ bits<5> Rz;
+ bits<6> offset;
+ bits<1> Mu;
+ bits<3> Nt;
+
+ let accessSize = AlignSize;
+
+ let IClass = 0b1010;
+ let Inst{27-21} = 0b1001101;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12-11} = MajOp;
+ let Inst{10-8} = Nt;
+ let Inst{7} = 0b0;
+ // Word, halfword and byte forms encode offset{5-2}, {4-1} and {3-0}.
+ let Inst{6-3} =
+ !if (!eq(!cast<string>(AlignSize), "WordAccess"), offset{5-2},
+ !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"), offset{4-1},
+ /* ByteAccess */ offset{3-0}));
+ let Inst{1} = 0b0;
+ }
+
+// mem[bhw]($Rz ++ #$offset:circ($Mu)) = $Nt.new
+def S2_storerbnew_pci : T_storenew_pci <"memb", s4_0Imm, 0b00, ByteAccess>;
+def S2_storerhnew_pci : T_storenew_pci <"memh", s4_1Imm, 0b01, HalfWordAccess>;
+def S2_storerinew_pci : T_storenew_pci <"memw", s4_2Imm, 0b10, WordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Circular stores with auto-increment register
+//===----------------------------------------------------------------------===//
+// Parameters:
+//   mnemonic  - store mnemonic printed in the assembly string; "memd" also
+//               clears isNVStorable.
+//   RC        - register class of the stored value ($Rt).
+//   MajOp     - encoded in Inst{24-21}.
+//   AlignSize - value assigned to accessSize.
+//   RegSrc    - operand text printed after "$" on the right-hand side
+//               ("Rt" by default; "Rt.h" also clears isNVStorable).
+// The updated base register is returned in $_dst_, tied to $Rz.
+let Uses = [CS], addrMode = PostInc in
+class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp,
+ MemAccessSize AlignSize, string RegSrc = "Rt">
+ : STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$Rz, ModRegs:$Mu, RC:$Rt),
+ #mnemonic#"($Rz ++ I:circ($Mu)) = $"#RegSrc#"",
+ [],
+ "$Rz = $_dst_" > {
+ bits<5> Rz;
+ bits<1> Mu;
+ bits<5> Rt;
+
+ let accessSize = AlignSize;
+ let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
+ !if(!eq(RegSrc,"Rt.h"), 0, 1));
+
+ let IClass = 0b1010;
+ let Inst{27-25} = 0b100;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12-8} = Rt;
+ let Inst{7} = 0b0;
+ // Inst{1} is 1 here and 0 in T_store_pci, whose Inst{27-25} and Inst{7}
+ // values are otherwise the same.
+ let Inst{1} = 0b1;
+ }
+
+// mem[bhwd]($Rz ++ I:circ($Mu)) = $Rt[.h]
+def S2_storerb_pcr : T_store_pcr<"memb", IntRegs, 0b1000, ByteAccess>;
+def S2_storerh_pcr : T_store_pcr<"memh", IntRegs, 0b1010, HalfWordAccess>;
+def S2_storeri_pcr : T_store_pcr<"memw", IntRegs, 0b1100, WordAccess>;
+def S2_storerd_pcr : T_store_pcr<"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
+def S2_storerf_pcr : T_store_pcr<"memh", IntRegs, 0b1011,
+ HalfWordAccess, "Rt.h">;
+
+//===----------------------------------------------------------------------===//
+// Circular .new stores with auto-increment register
+//===----------------------------------------------------------------------===//
+// Parameters:
+//   mnemonic  - store mnemonic printed in the assembly string.
+//   MajOp     - encoded in Inst{12-11}.
+//   AlignSize - value assigned to accessSize.
+// opNewValue = 3 is the position of $Nt when the output $_dst_ is counted as
+// operand 0. $Nt is encoded in three bits (Inst{10-8}).
+let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
+ addrMode = PostInc in
+class T_storenew_pcr <string mnemonic, bits<2>MajOp,
+ MemAccessSize AlignSize>
+ : NVInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
+ #mnemonic#"($Rz ++ I:circ($Mu)) = $Nt.new" ,
+ [] ,
+ "$Rz = $_dst_"> {
+ bits<5> Rz;
+ bits<1> Mu;
+ bits<3> Nt;
+
+ let accessSize = AlignSize;
+
+ let IClass = 0b1010;
+ let Inst{27-21} = 0b1001101;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12-11} = MajOp;
+ let Inst{10-8} = Nt;
+ let Inst{7} = 0b0;
+ let Inst{1} = 0b1;
+ }
+
+// mem[bhw]($Rz ++ I:circ($Mu)) = $Nt.new
+def S2_storerbnew_pcr : T_storenew_pcr <"memb", 0b00, ByteAccess>;
+def S2_storerhnew_pcr : T_storenew_pcr <"memh", 0b01, HalfWordAccess>;
+def S2_storerinew_pcr : T_storenew_pcr <"memw", 0b10, WordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Bit-reversed stores with auto-increment register
+//===----------------------------------------------------------------------===//
+// Parameters:
+//   mnemonic - store mnemonic printed in the assembly string.
+//   RC       - register class of the stored value ($src).
+//   addrSize - value assigned to accessSize.
+//   majOp    - encoded in Inst{23-21}.
+//   isHalf   - appends ".h" to the stored register in the assembly string.
+// The updated base register is returned in $_dst_, tied to $Rz.
+let hasSideEffects = 0, addrMode = PostInc in
+class T_store_pbr<string mnemonic, RegisterClass RC,
+ MemAccessSize addrSize, bits<3> majOp,
+ bit isHalf = 0>
+ : STInst
+ <(outs IntRegs:$_dst_),
+ (ins IntRegs:$Rz, ModRegs:$Mu, RC:$src),
+ #mnemonic#"($Rz ++ $Mu:brev) = $src"#!if(isHalf, ".h", ""),
+ [], "$Rz = $_dst_" > {
+ bits<5> Rz;
+ bits<1> Mu;
+ bits<5> src;
+
+ let accessSize = addrSize;
+
+ let IClass = 0b1010;
+
+ // Encoding fields, listed from the most significant bit down.
+ let Inst{27-24} = 0b1111;
+ let Inst{23-21} = majOp;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12-8} = src;
+ let Inst{7} = 0b0;
+ }
+
+// mem[bhw]($Rz ++ $Mu:brev) = $src
+// Each def gets its own BaseOpcode string and the NewValueRel mixin; the
+// matching S2_storer[bhi]new_pbr defs use the same BaseOpcode strings.
+let isNVStorable = 1 in {
+ let BaseOpcode = "S2_storerb_pbr" in
+ def S2_storerb_pbr : T_store_pbr<"memb", IntRegs, ByteAccess,
+ 0b000>, NewValueRel;
+ let BaseOpcode = "S2_storerh_pbr" in
+ def S2_storerh_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess,
+ 0b010>, NewValueRel;
+ let BaseOpcode = "S2_storeri_pbr" in
+ def S2_storeri_pbr : T_store_pbr<"memw", IntRegs, WordAccess,
+ 0b100>, NewValueRel;
+}
+
+// Upper-half and doubleword forms are defined outside the isNVStorable = 1
+// scope and without NewValueRel.
+def S2_storerf_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess, 0b011, 1>;
+def S2_storerd_pbr : T_store_pbr<"memd", DoubleRegs, DoubleWordAccess, 0b110>;
+
+//===----------------------------------------------------------------------===//
+// Bit-reversed .new stores with auto-increment register
+//===----------------------------------------------------------------------===//
+// Parameters:
+//   mnemonic - store mnemonic printed in the assembly string.
+//   addrSize - value assigned to accessSize.
+//   majOp    - encoded in Inst{12-11}.
+// opNewValue = 3 is the position of $Nt when the output $_dst_ is counted as
+// operand 0. $Nt is encoded in three bits (Inst{10-8}).
+let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
+ hasSideEffects = 0, addrMode = PostInc in
+class T_storenew_pbr<string mnemonic, MemAccessSize addrSize, bits<2> majOp>
+ : NVInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
+ #mnemonic#"($Rz ++ $Mu:brev) = $Nt.new", [],
+ "$Rz = $_dst_">, NewValueRel {
+ bits<5> Rz;
+ bits<1> Mu;
+ bits<3> Nt;
+
+ let accessSize = addrSize;
+
+ let IClass = 0b1010;
+
+ // Encoding fields, listed from the most significant bit down.
+ let Inst{27-21} = 0b1111101;
+ let Inst{20-16} = Rz;
+ let Inst{13} = Mu;
+ let Inst{12-11} = majOp;
+ let Inst{10-8} = Nt;
+ let Inst{7} = 0b0;
+ }
+
+// mem[bhw]($Rz ++ $Mu:brev) = $Nt.new
+// BaseOpcode repeats the name of the corresponding non-.new store.
+let BaseOpcode = "S2_storerb_pbr" in
+def S2_storerbnew_pbr : T_storenew_pbr<"memb", ByteAccess, 0b00>;
+
+let BaseOpcode = "S2_storerh_pbr" in
+def S2_storerhnew_pbr : T_storenew_pbr<"memh", HalfWordAccess, 0b01>;
+
+let BaseOpcode = "S2_storeri_pbr" in
+def S2_storerinew_pbr : T_storenew_pbr<"memw", WordAccess, 0b10>;
+
+//===----------------------------------------------------------------------===//
+// ST -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Template class for S_2op instructions.
+//===----------------------------------------------------------------------===//
+// Single-source form: "$dst = mnemonic($src)[:sat]".
+// Parameters:
+//   mnemonic  - operation name placed in the assembly string.
+//   RegTyBits - encoded in Inst{27-24}.
+//   RCOut     - register class of $dst.
+//   RCIn      - register class of $src.
+//   MajOp     - encoded in Inst{23-22}.
+//   MinOp     - encoded in Inst{7-5}.
+//   isSat     - appends ":sat" to the assembly string.
+// The itinerary passed to SInst is S_2op_tc_1_SLOT23.
+let hasSideEffects = 0 in
+class T_S2op_1 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
+ RegisterClass RCIn, bits<2> MajOp, bits<3> MinOp, bit isSat>
+ : SInst <(outs RCOut:$dst), (ins RCIn:$src),
+ "$dst = "#mnemonic#"($src)"#!if(isSat, ":sat", ""),
+ [], "", S_2op_tc_1_SLOT23 > {
+ bits<5> dst;
+ bits<5> src;
+
+ let IClass = 0b1000;
+
+ let Inst{27-24} = RegTyBits;
+ let Inst{23-22} = MajOp;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = src;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = dst;
+ }
+
+// Register-class specialisations of T_S2op_1. The suffix names the
+// destination then the source: d = DoubleRegs, i = IntRegs.
+// _di: DoubleRegs <- IntRegs, RegTyBits 0b0100, never saturating.
+class T_S2op_1_di <string mnemonic, bits<2> MajOp, bits<3> MinOp>
+ : T_S2op_1 <mnemonic, 0b0100, DoubleRegs, IntRegs, MajOp, MinOp, 0>;
+
+// _id: IntRegs <- DoubleRegs, RegTyBits 0b1000.
+let hasNewValue = 1 in
+class T_S2op_1_id <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
+ : T_S2op_1 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, isSat>;
+
+// _ii: IntRegs <- IntRegs, RegTyBits 0b1100.
+let hasNewValue = 1 in
+class T_S2op_1_ii <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
+ : T_S2op_1 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp, isSat>;
+
+// Vector sign/zero extend
+// Template arguments of the T_S2op_1_* defs below are
+// <mnemonic, MajOp, MinOp[, isSat]>.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+ def S2_vsxtbh : T_S2op_1_di <"vsxtbh", 0b00, 0b000>;
+ def S2_vsxthw : T_S2op_1_di <"vsxthw", 0b00, 0b100>;
+ def S2_vzxtbh : T_S2op_1_di <"vzxtbh", 0b00, 0b010>;
+ def S2_vzxthw : T_S2op_1_di <"vzxthw", 0b00, 0b110>;
+}
+
+// Vector splat bytes/halfwords
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+ def S2_vsplatrb : T_S2op_1_ii <"vsplatb", 0b01, 0b111>;
+ def S2_vsplatrh : T_S2op_1_di <"vsplath", 0b01, 0b010>;
+}
+
+// Sign extend word to doubleword
+def A2_sxtw : T_S2op_1_di <"sxtw", 0b01, 0b000>;
+
+// Vector saturate and pack
+let Defs = [USR_OVF] in {
+ def S2_svsathb : T_S2op_1_ii <"vsathb", 0b10, 0b000>;
+ def S2_svsathub : T_S2op_1_ii <"vsathub", 0b10, 0b010>;
+ def S2_vsathb : T_S2op_1_id <"vsathb", 0b00, 0b110>;
+ def S2_vsathub : T_S2op_1_id <"vsathub", 0b00, 0b000>;
+ def S2_vsatwh : T_S2op_1_id <"vsatwh", 0b00, 0b010>;
+ def S2_vsatwuh : T_S2op_1_id <"vsatwuh", 0b00, 0b100>;
+}
+
+// Vector truncate
+def S2_vtrunohb : T_S2op_1_id <"vtrunohb", 0b10, 0b000>;
+def S2_vtrunehb : T_S2op_1_id <"vtrunehb", 0b10, 0b010>;
+
+// Swizzle the bytes of a word
+def A2_swiz : T_S2op_1_ii <"swiz", 0b10, 0b111>;
+
+// Saturate
+let Defs = [USR_OVF] in {
+ def A2_sat : T_S2op_1_id <"sat", 0b11, 0b000>;
+ def A2_satb : T_S2op_1_ii <"satb", 0b11, 0b111>;
+ def A2_satub : T_S2op_1_ii <"satub", 0b11, 0b110>;
+ def A2_sath : T_S2op_1_ii <"sath", 0b11, 0b100>;
+ def A2_satuh : T_S2op_1_ii <"satuh", 0b11, 0b101>;
+ // Printed as "$dst = round($src):sat" (isSat = 1).
+ def A2_roundsat : T_S2op_1_id <"round", 0b11, 0b001, 0b1>;
+}
+
+// The defs in this scope override the class-default itinerary
+// (S_2op_tc_1_SLOT23) with S_2op_tc_2_SLOT23.
+let Itinerary = S_2op_tc_2_SLOT23 in {
+ // Vector round and pack
+ def S2_vrndpackwh : T_S2op_1_id <"vrndwh", 0b10, 0b100>;
+
+ let Defs = [USR_OVF] in
+ def S2_vrndpackwhs : T_S2op_1_id <"vrndwh", 0b10, 0b110, 1>;
+
+ // Bit reverse
+ def S2_brev : T_S2op_1_ii <"brev", 0b01, 0b110>;
+
+ // Absolute value word
+ def A2_abs : T_S2op_1_ii <"abs", 0b10, 0b100>;
+
+ let Defs = [USR_OVF] in
+ def A2_abssat : T_S2op_1_ii <"abs", 0b10, 0b101, 1>;
+
+ // Negate with saturation
+ let Defs = [USR_OVF] in
+ def A2_negsat : T_S2op_1_ii <"neg", 0b10, 0b110, 1>;
+}
+
+// Register + 5-bit immediate form: "$dst = mnemonic($src, #$u5)[:sat][:rnd]".
+// Parameters:
+//   mnemonic  - operation name placed in the assembly string.
+//   RegTyBits - encoded in Inst{27-24}.
+//   RCOut     - register class of $dst.
+//   RCIn      - register class of $src.
+//   MajOp     - encoded in Inst{23-21}.
+//   MinOp     - encoded in Inst{7-5}.
+//   isSat     - appends ":sat" to the assembly string.
+//   isRnd     - appends ":rnd" to the assembly string.
+//   pattern   - selection pattern list forwarded to SInst (empty by default).
+// The itinerary passed to SInst is S_2op_tc_2_SLOT23.
+class T_S2op_2 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
+ RegisterClass RCIn, bits<3> MajOp, bits<3> MinOp,
+ bit isSat, bit isRnd, list<dag> pattern = []>
+ : SInst <(outs RCOut:$dst),
+ (ins RCIn:$src, u5_0Imm:$u5),
+ "$dst = "#mnemonic#"($src, #$u5)"#!if(isSat, ":sat", "")
+ #!if(isRnd, ":rnd", ""),
+ pattern, "", S_2op_tc_2_SLOT23> {
+ bits<5> dst;
+ bits<5> src;
+ bits<5> u5;
+
+ let IClass = 0b1000;
+
+ let Inst{27-24} = RegTyBits;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = src;
+ let Inst{13} = 0b0;
+ let Inst{12-8} = u5;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = dst;
+ }
+
+// Register-class specialisations of T_S2op_2. The suffix names the
+// destination then the source: d = DoubleRegs, i = IntRegs.
+// _di: DoubleRegs <- IntRegs, RegTyBits 0b1000.
+class T_S2op_2_di <string mnemonic, bits<3> MajOp, bits<3> MinOp>
+ : T_S2op_2 <mnemonic, 0b1000, DoubleRegs, IntRegs, MajOp, MinOp, 0, 0>;
+
+// _id: IntRegs <- DoubleRegs, RegTyBits 0b1000.
+let hasNewValue = 1 in
+class T_S2op_2_id <string mnemonic, bits<3> MajOp, bits<3> MinOp>
+ : T_S2op_2 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, 0, 0>;
+
+// _ii: IntRegs <- IntRegs, RegTyBits 0b1100.
+let hasNewValue = 1 in
+class T_S2op_2_ii <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit isSat = 0, bit isRnd = 0, list<dag> pattern = []>
+ : T_S2op_2 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp,
+ isSat, isRnd, pattern>;
+
+// Plain shift form of T_S2op_2_ii. OpNd is accepted but not referenced
+// here; an empty pattern list is passed down.
+class T_S2op_shift <string mnemonic, bits<3> MajOp, bits<3> MinOp, SDNode OpNd>
+ : T_S2op_2_ii <mnemonic, MajOp, MinOp, 0, 0, []>;
+
+// Vector arithmetic shift right by immediate with truncate and pack
+// Printed as "$dst = vasrw($src, #$u5)" with a DoubleRegs source.
+def S2_asr_i_svw_trun : T_S2op_2_id <"vasrw", 0b110, 0b010>;
+
+// Arithmetic/logical shift right/left by immediate
+// These override the class-default itinerary with S_2op_tc_1_SLOT23.
+let Itinerary = S_2op_tc_1_SLOT23 in {
+ def S2_asr_i_r : T_S2op_shift <"asr", 0b000, 0b000, sra>;
+ def S2_lsr_i_r : T_S2op_shift <"lsr", 0b000, 0b001, srl>;
+ def S2_asl_i_r : T_S2op_shift <"asl", 0b000, 0b010, shl>;
+}
+
+// Shift left by immediate with saturation
+let Defs = [USR_OVF] in
+def S2_asl_i_r_sat : T_S2op_2_ii <"asl", 0b010, 0b010, 1>;
+
+// Shift right with round
+def S2_asr_i_r_rnd : T_S2op_2_ii <"asr", 0b010, 0b000, 0, 1>;
+
+// Assembler-parser-only "asrrnd" syntax; no encoding bits are assigned in
+// this def.
+let isAsmParserOnly = 1 in
+def S2_asr_i_r_rnd_goodsyntax
+ : SInst <(outs IntRegs:$dst), (ins IntRegs:$src, u5_0Imm:$u5),
+ "$dst = asrrnd($src, #$u5)",
+ [], "", S_2op_tc_1_SLOT23>;
+
+// Assembler-parser-only "not" syntax; no encoding bits are assigned in this
+// def.
+let isAsmParserOnly = 1 in
+def A2_not: ALU32_rr<(outs IntRegs:$dst),(ins IntRegs:$src),
+ "$dst = not($src)">;
+
+// 64-bit single-source form: "$Rdd = opc($Rss)[:sat]".
+// Parameters:
+//   opc   - operation name placed in the assembly string.
+//   MajOp - encoded in Inst{23-22}.
+//   minOp - encoded in Inst{7-5}.
+//   sat   - when 1, appends ":sat" to the assembly string.
+class T_S2op_3<string opc, bits<2>MajOp, bits<3>minOp, bits<1> sat = 0>
+ : SInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
+ "$Rdd = "#opc#"($Rss)"#!if(!eq(sat, 1),":sat","")> {
+ bits<5> Rss;
+ bits<5> Rdd;
+ let IClass = 0b1000;
+ let Inst{27-24} = 0;
+ let Inst{23-22} = MajOp;
+ let Inst{20-16} = Rss;
+ let Inst{7-5} = minOp;
+ let Inst{4-0} = Rdd;
+}
+
+// 64-bit absolute value, negate and bitwise not
+def A2_absp : T_S2op_3 <"abs", 0b10, 0b110>;
+def A2_negp : T_S2op_3 <"neg", 0b10, 0b101>;
+def A2_notp : T_S2op_3 <"not", 0b10, 0b100>;
+
+// Interleave/deinterleave
+def S2_interleave : T_S2op_3 <"interleave", 0b11, 0b101>;
+def S2_deinterleave : T_S2op_3 <"deinterleave", 0b11, 0b100>;
+
+// Vector Complex conjugate
+def A2_vconj : T_S2op_3 <"vconj", 0b10, 0b111, 1>;
+
+// Vector saturate without pack
+def S2_vsathb_nopack : T_S2op_3 <"vsathb", 0b00, 0b111>;
+def S2_vsathub_nopack : T_S2op_3 <"vsathub", 0b00, 0b100>;
+def S2_vsatwh_nopack : T_S2op_3 <"vsatwh", 0b00, 0b110>;
+def S2_vsatwuh_nopack : T_S2op_3 <"vsatwuh", 0b00, 0b101>;
+
+// Vector absolute value halfwords with and without saturation
+// Rdd64=vabsh(Rss64)[:sat]
+def A2_vabsh : T_S2op_3 <"vabsh", 0b01, 0b100>;
+def A2_vabshsat : T_S2op_3 <"vabsh", 0b01, 0b101, 1>;
+
+// Vector absolute value words with and without saturation
+def A2_vabsw : T_S2op_3 <"vabsw", 0b01, 0b110>;
+def A2_vabswsat : T_S2op_3 <"vabsw", 0b01, 0b111, 1>;
+
+//===----------------------------------------------------------------------===//
+// STYPE/BIT +
+//===----------------------------------------------------------------------===//
+// Bit count
+
+// Common template for the bit-count instructions: "$Rd = MnOp($Rs)".
+// Parameters:
+//   MnOp  - operation name placed in the assembly string.
+//   MajOp - encoded in Inst{23-21}.
+//   MinOp - encoded in Inst{7-5}.
+//   Is32  - encoded in Inst{26}.
+//   Out   - output operand dag (must name $Rd).
+//   Inp   - input operand dag (must name $Rs).
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_COUNT_LEADING<string MnOp, bits<3> MajOp, bits<3> MinOp, bit Is32,
+ dag Out, dag Inp>
+ : SInst<Out, Inp, "$Rd = "#MnOp#"($Rs)", [], "", S_2op_tc_1_SLOT23> {
+ bits<5> Rs;
+ bits<5> Rd;
+ let IClass = 0b1000;
+ let Inst{27} = 0b1;
+ let Inst{26} = Is32;
+ let Inst{25-24} = 0b00;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = Rs;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rd;
+}
+
+// IntRegs source; Is32 = 1.
+class T_COUNT_LEADING_32<string MnOp, bits<3> MajOp, bits<3> MinOp>
+ : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b1,
+ (outs IntRegs:$Rd), (ins IntRegs:$Rs)>;
+
+// DoubleRegs source, IntRegs result; Is32 = 0.
+class T_COUNT_LEADING_64<string MnOp, bits<3> MajOp, bits<3> MinOp>
+ : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b0,
+ (outs IntRegs:$Rd), (ins DoubleRegs:$Rs)>;
+
+// Template arguments are <mnemonic, MajOp, MinOp>; the "p"-suffixed defs take
+// a DoubleRegs source via T_COUNT_LEADING_64.
+def S2_cl0 : T_COUNT_LEADING_32<"cl0", 0b000, 0b101>;
+def S2_cl1 : T_COUNT_LEADING_32<"cl1", 0b000, 0b110>;
+def S2_ct0 : T_COUNT_LEADING_32<"ct0", 0b010, 0b100>;
+def S2_ct1 : T_COUNT_LEADING_32<"ct1", 0b010, 0b101>;
+def S2_cl0p : T_COUNT_LEADING_64<"cl0", 0b010, 0b010>;
+def S2_cl1p : T_COUNT_LEADING_64<"cl1", 0b010, 0b100>;
+def S2_clb : T_COUNT_LEADING_32<"clb", 0b000, 0b100>;
+def S2_clbp : T_COUNT_LEADING_64<"clb", 0b010, 0b000>;
+def S2_clbnorm : T_COUNT_LEADING_32<"normamt", 0b000, 0b111>;
+
+// The 64-bit counts leading/trailing are defined in HexagonInstrInfoV4.td.
+
+// Bit set/clear/toggle
+
+let hasSideEffects = 0, hasNewValue = 1 in // applies to the class below
+class T_SCT_BIT_IMM<string MnOp, bits<3> MinOp> // template for "$Rd = MnOp($Rs, #$u5)"
+ : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, u5_0Imm:$u5),
+ "$Rd = "#MnOp#"($Rs, #$u5)", [], "", S_2op_tc_1_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> u5;
+ let IClass = 0b1000;
+ let Inst{27-21} = 0b1100110;
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b0;
+ let Inst{12-8} = u5; // immediate operand
+ let Inst{7-5} = MinOp; // distinguishes the defs built from this class
+ let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in // applies to the class below
+class T_SCT_BIT_REG<string MnOp, bits<2> MinOp> // template for "$Rd = MnOp($Rs, $Rt)"
+ : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = "#MnOp#"($Rs, $Rt)", [], "", S_3op_tc_1_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+ let IClass = 0b1100;
+ let Inst{27-22} = 0b011010;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{7-6} = MinOp; // two-bit selector (the immediate form uses three bits, Inst{7-5})
+ let Inst{4-0} = Rd;
+}
+
+def S2_clrbit_i : T_SCT_BIT_IMM<"clrbit", 0b001>; // _i: immediate bit index (#$u5)
+def S2_setbit_i : T_SCT_BIT_IMM<"setbit", 0b000>;
+def S2_togglebit_i : T_SCT_BIT_IMM<"togglebit", 0b010>;
+def S2_clrbit_r : T_SCT_BIT_REG<"clrbit", 0b01>; // _r: second operand is a register ($Rt)
+def S2_setbit_r : T_SCT_BIT_REG<"setbit", 0b00>;
+def S2_togglebit_r : T_SCT_BIT_REG<"togglebit", 0b10>;
+
+// Bit test
+
+let hasSideEffects = 0 in // applies to the class below
+class T_TEST_BIT_IMM<string MnOp, bits<3> MajOp> // template for "$Pd = MnOp($Rs, #$u5)"
+ : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u5_0Imm:$u5),
+ "$Pd = "#MnOp#"($Rs, #$u5)",
+ [], "", S_2op_tc_2early_SLOT23> {
+ bits<2> Pd; // predicate destination, two-bit register number
+ bits<5> Rs;
+ bits<5> u5;
+ let IClass = 0b1000;
+ let Inst{27-24} = 0b0101;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0;
+ let Inst{12-8} = u5;
+ let Inst{1-0} = Pd;
+}
+
+let hasSideEffects = 0 in // applies to the class below
+class T_TEST_BIT_REG<string MnOp, bit IsNeg> // template for "$Pd = MnOp($Rs, $Rt)"
+ : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Pd = "#MnOp#"($Rs, $Rt)",
+ [], "", S_3op_tc_2early_SLOT23> {
+ bits<2> Pd;
+ bits<5> Rs;
+ bits<5> Rt;
+ let IClass = 0b1100;
+ let Inst{27-22} = 0b011100;
+ let Inst{21} = IsNeg; // only affects the encoding; the asm string does not depend on it
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{1-0} = Pd;
+}
+
+def S2_tstbit_i : T_TEST_BIT_IMM<"tstbit", 0b000>; // immediate form: MajOp = 0b000
+def S2_tstbit_r : T_TEST_BIT_REG<"tstbit", 0>; // register form: IsNeg = 0
+
+let hasSideEffects = 0 in // applies to the class below
+class T_TEST_BITS_IMM<string MnOp, bits<2> MajOp, bit IsNeg> // template for "$Pd = MnOp($Rs, #$u6)"
+ : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u6_0Imm:$u6),
+ "$Pd = "#MnOp#"($Rs, #$u6)",
+ [], "", S_2op_tc_2early_SLOT23> {
+ bits<2> Pd;
+ bits<5> Rs;
+ bits<6> u6;
+ let IClass = 0b1000;
+ let Inst{27-24} = 0b0101;
+ let Inst{23-22} = MajOp;
+ let Inst{21} = IsNeg; // encoding only; not reflected in the asm string
+ let Inst{20-16} = Rs;
+ let Inst{13-8} = u6; // six-bit immediate also occupies Inst{13}
+ let Inst{1-0} = Pd;
+}
+
+let hasSideEffects = 0 in // applies to the class below
+class T_TEST_BITS_REG<string MnOp, bits<2> MajOp, bit IsNeg> // template for "$Pd = MnOp($Rs, $Rt)"
+ : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Pd = "#MnOp#"($Rs, $Rt)",
+ [], "", S_3op_tc_2early_SLOT23> {
+ bits<2> Pd;
+ bits<5> Rs;
+ bits<5> Rt;
+ let IClass = 0b1100;
+ let Inst{27-24} = 0b0111;
+ let Inst{23-22} = MajOp;
+ let Inst{21} = IsNeg; // encoding only; not reflected in the asm string
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{1-0} = Pd;
+}
+
+def C2_bitsclri : T_TEST_BITS_IMM<"bitsclr", 0b10, 0>; // arguments are <mnemonic, MajOp, IsNeg>
+def C2_bitsclr : T_TEST_BITS_REG<"bitsclr", 0b10, 0>;
+def C2_bitsset : T_TEST_BITS_REG<"bitsset", 0b01, 0>; // no immediate form of bitsset is defined in this group
+
+//===----------------------------------------------------------------------===//
+// STYPE/BIT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/COMPLEX +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/COMPLEX -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PERM +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PERM -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/PRED +
+//===----------------------------------------------------------------------===//
+
+// Predicate transfer: copy predicate register $Ps into general register $Rd.
+let hasSideEffects = 0, hasNewValue = 1 in
+def C2_tfrpr : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps),
+ "$Rd = $Ps", [], "", S_2op_tc_1_SLOT23> {
+ bits<5> Rd;
+ bits<2> Ps;
+
+ let IClass = 0b1000;
+ let Inst{27-24} = 0b1001;
+ let Inst{22} = 0b1;
+ let Inst{17-16} = Ps; // two-bit predicate register number
+ let Inst{4-0} = Rd;
+}
+
+// Transfer general register to predicate: "$Pd = $Rs".
+let hasSideEffects = 0 in
+def C2_tfrrp: SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs),
+ "$Pd = $Rs", [], "", S_2op_tc_2early_SLOT23> {
+ bits<2> Pd;
+ bits<5> Rs;
+
+ let IClass = 0b1000;
+ let Inst{27-21} = 0b0101010;
+ let Inst{20-16} = Rs;
+ let Inst{1-0} = Pd; // two-bit predicate register number
+}
+
+let hasSideEffects = 0, isCodeGenOnly = 1 in // predicate-to-predicate copy; no encoding bits are set here
+def C2_pxfer_map: SInst<(outs PredRegs:$dst), (ins PredRegs:$src),
+ "$dst = $src">;
+
+//===----------------------------------------------------------------------===//
+// STYPE/PRED -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/SHIFT +
+//===----------------------------------------------------------------------===//
+class S_2OpInstImm<string Mnemonic, bits<3>MajOp, bits<3>MinOp, // 64-bit "$dst = Mnemonic($src1, #$src2)[:rnd]"
+ Operand Imm, list<dag> pattern = [], bit isRnd = 0>
+ : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, Imm:$src2),
+ "$dst = "#Mnemonic#"($src1, #$src2)"#!if(isRnd, ":rnd", ""),
+ pattern> {
+ bits<5> src1;
+ bits<5> dst;
+ let IClass = 0b1000;
+ let Inst{27-24} = 0;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = src1;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = dst; // the immediate ($src2) is not encoded here; subclasses place it
+}
+
+class S_2OpInstImmI6<string Mnemonic, SDNode OpNode, bits<3>MinOp> // OpNode is not referenced in the body
+ : S_2OpInstImm<Mnemonic, 0b000, MinOp, u6_0Imm, []> {
+ bits<6> src2;
+ let Inst{13-8} = src2; // six-bit immediate
+}
+
+// Shift by immediate (64-bit register pair, u6 shift amount).
+def S2_asr_i_p : S_2OpInstImmI6<"asr", sra, 0b000>;
+def S2_asl_i_p : S_2OpInstImmI6<"asl", shl, 0b010>;
+def S2_lsr_i_p : S_2OpInstImmI6<"lsr", srl, 0b001>;
+
+// Shift left by small amount and add: "$Rd = addasl($Rt, $Rs, #$u3)".
+let AddedComplexity = 100, hasNewValue = 1, hasSideEffects = 0 in
+def S2_addasl_rrri: SInst <(outs IntRegs:$Rd),
+ (ins IntRegs:$Rt, IntRegs:$Rs, u3_0Imm:$u3),
+ "$Rd = addasl($Rt, $Rs, #$u3)" , [],
+ "", S_3op_tc_2_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rt;
+ bits<5> Rs;
+ bits<3> u3;
+
+ let IClass = 0b1100;
+
+ let Inst{27-21} = 0b0100000;
+ let Inst{20-16} = Rs; // note: $Rs is the second asm operand but occupies the higher field
+ let Inst{13} = 0b0;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = u3;
+ let Inst{4-0} = Rd;
+ }
+
+//===----------------------------------------------------------------------===//
+// STYPE/SHIFT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/VH +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/VH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/VW +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/VW -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/SUPER +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/USER +
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 1, isSoloAX = 1 in // operand-less "barrier" instruction
+def Y2_barrier : SYSInst<(outs), (ins), "barrier", [],"",ST_tc_st_SLOT0> {
+ let Inst{31-28} = 0b1010; // set directly on Inst{31-28} rather than through IClass
+ let Inst{27-21} = 0b1000000;
+}
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/SUPER -
+//===----------------------------------------------------------------------===//
+
+// Generate frameindex addresses. The main reason for the offset operand is
+// that every instruction that is allowed to have frame index as an operand
+// will then have that operand followed by an immediate operand (the offset).
+// This simplifies the frame-index elimination code.
+// Both defs are pseudos (isPseudo, isCodeGenOnly) with an empty asm string.
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
+ isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def PS_fi : ALU32_ri<(outs IntRegs:$Rd),
+ (ins IntRegs:$fi, s32_0Imm:$off), "">;
+ def PS_fia : ALU32_ri<(outs IntRegs:$Rd), // like PS_fi with an extra leading $Rs input
+ (ins IntRegs:$Rs, IntRegs:$fi, s32_0Imm:$off), "">;
+}
+
+//===----------------------------------------------------------------------===//
+// CRUSER - Type.
+//===----------------------------------------------------------------------===//
+// HW loop, immediate form: "<mnemonic>($offset, #$src2)"
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+ opExtendable = 0, hasSideEffects = 0 in
+class LOOP_iBase<string mnemonic, Operand brOp, bit mustExtend = 0> // mustExtend is not referenced in the body
+ : CRInst<(outs), (ins brOp:$offset, u10_0Imm:$src2),
+ #mnemonic#"($offset, #$src2)",
+ [], "" , CR_tc_3x_SLOT3> {
+ bits<9> offset;
+ bits<10> src2;
+
+ let IClass = 0b0110;
+
+ let Inst{27-22} = 0b100100;
+ let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1); // 0 for "loop0", 1 for any other mnemonic
+ let Inst{20-16} = src2{9-5}; // src2 is split across three fields
+ let Inst{12-8} = offset{8-4}; // offset{1-0} is not encoded
+ let Inst{7-5} = src2{4-2};
+ let Inst{4-3} = offset{3-2};
+ let Inst{1-0} = src2{1-0};
+}
+
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+ opExtendable = 0, hasSideEffects = 0 in
+class LOOP_rBase<string mnemonic, Operand brOp, bit mustExtend = 0> // register form: "<mnemonic>($offset, $src2)"; mustExtend unused
+ : CRInst<(outs), (ins brOp:$offset, IntRegs:$src2),
+ #mnemonic#"($offset, $src2)",
+ [], "" ,CR_tc_3x_SLOT3> {
+ bits<9> offset;
+ bits<5> src2;
+
+ let IClass = 0b0110;
+
+ let Inst{27-22} = 0b000000;
+ let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1); // 0 for "loop0", 1 for any other mnemonic
+ let Inst{20-16} = src2; // register number
+ let Inst{12-8} = offset{8-4}; // offset{1-0} is not encoded
+ let Inst{4-3} = offset{3-2};
+ }
+
+multiclass LOOP_ri<string mnemonic> { // emits i/r forms plus codegen-only extended iext/rext forms
+ def i : LOOP_iBase<mnemonic, brtarget>;
+ def r : LOOP_rBase<mnemonic, brtarget>;
+
+ let isCodeGenOnly = 1, isExtended = 1, opExtendable = 0 in {
+ def iext: LOOP_iBase<mnemonic, brtargetExt, 1>; // same encodings, brtargetExt operand type
+ def rext: LOOP_rBase<mnemonic, brtargetExt, 1>;
+ }
+}
+
+
+let Defs = [SA0, LC0, USR] in
+defm J2_loop0 : LOOP_ri<"loop0">;
+
+// Interestingly only loop0's appear to set usr.lpcfg
+let Defs = [SA1, LC1] in // unlike loop0, USR is not listed in Defs
+defm J2_loop1 : LOOP_ri<"loop1">;
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+ Defs = [PC, LC0], Uses = [SA0, LC0] in { // reads SA0/LC0, writes PC and LC0
+def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset),
+ ":endloop0",
+ []>;
+}
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+ Defs = [PC, LC1], Uses = [SA1, LC1] in { // same shape as ENDLOOP0, using the SA1/LC1 registers
+def ENDLOOP1 : Endloop<(outs), (ins brtarget:$offset),
+ ":endloop1",
+ []>;
+}
+
+// Pipelined loop instructions, sp[123]loop0 (immediate form; asm names p3 as the result)
+let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
+ isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
+ opExtendable = 0, isPredicateLate = 1 in
+class SPLOOP_iBase<string SP, bits<2> op> // SP is the digit spliced into "sp<SP>loop0"
+ : CRInst <(outs), (ins brtarget:$r7_2, u10_0Imm:$U10),
+ "p3 = sp"#SP#"loop0($r7_2, #$U10)" > {
+ bits<9> r7_2;
+ bits<10> U10;
+
+ let IClass = 0b0110;
+
+ let Inst{22-21} = op;
+ let Inst{27-23} = 0b10011;
+ let Inst{20-16} = U10{9-5}; // U10 is split across three fields
+ let Inst{12-8} = r7_2{8-4}; // r7_2{1-0} is not encoded
+ let Inst{7-5} = U10{4-2};
+ let Inst{4-3} = r7_2{3-2};
+ let Inst{1-0} = U10{1-0};
+ }
+
+let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
+ isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
+ opExtendable = 0, isPredicateLate = 1 in
+class SPLOOP_rBase<string SP, bits<2> op> // register form: "p3 = sp<SP>loop0($r7_2, $Rs)"
+ : CRInst <(outs), (ins brtarget:$r7_2, IntRegs:$Rs),
+ "p3 = sp"#SP#"loop0($r7_2, $Rs)" > {
+ bits<9> r7_2;
+ bits<5> Rs;
+
+ let IClass = 0b0110;
+
+ let Inst{22-21} = op;
+ let Inst{27-23} = 0b00001;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = r7_2{8-4}; // r7_2{1-0} is not encoded
+ let Inst{4-3} = r7_2{3-2};
+ }
+
+multiclass SPLOOP_ri<string mnemonic, bits<2> op> { // emits the immediate (i) and register (r) forms
+ def i : SPLOOP_iBase<mnemonic, op>;
+ def r : SPLOOP_rBase<mnemonic, op>;
+}
+
+defm J2_ploop1s : SPLOOP_ri<"1", 0b01>; // op value equals the digit in the mnemonic
+defm J2_ploop2s : SPLOOP_ri<"2", 0b10>;
+defm J2_ploop3s : SPLOOP_ri<"3", 0b11>;
+
+// if (Rs[!>=<]=#0) jump:[t/nt]
+//
+// Conditional jump that compares $Rs against #0.
+//   compare - comparison text spliced into the asm string (e.g. "!=", ">=").
+//   isTak   - selects the ":t" / ":nt" suffix and is encoded in Inst{12}.
+//   op      - two-bit selector encoded in Inst{23-22}.
+// r13_2 is a 15-bit field of which bits 14, 13 and 12-2 are encoded; the two
+// low bits are not.
+let Defs = [PC], isPredicated = 1, isBranch = 1, hasSideEffects = 0 in
+class J2_jump_0_Base<string compare, bit isTak, bits<2> op>
+ : CRInst <(outs), (ins IntRegs:$Rs, brtarget:$r13_2),
+ "if ($Rs"#compare#"#0) jump"#!if(isTak, ":t", ":nt")#" $r13_2" > {
+ bits<5> Rs;
+ bits<15> r13_2;
+
+ let IClass = 0b0110;
+
+ let Inst{27-24} = 0b0001;
+ let Inst{23-22} = op;
+ let Inst{12} = isTak;
+ let Inst{21} = r13_2{14};
+ let Inst{20-16} = Rs;
+ let Inst{11-1} = r13_2{12-2};
+ let Inst{13} = r13_2{13};
+ }
+
+multiclass J2_jump_compare_0<string compare, bits<2> op> { // emits NAME (isTak = 0) and NAME#pt (isTak = 1)
+ def NAME : J2_jump_0_Base<compare, 0, op>;
+ def NAME#pt : J2_jump_0_Base<compare, 1, op>;
+}
+
+defm J2_jumprz : J2_jump_compare_0<"!=", 0b00>; // note: the "z" name carries the "!=" comparison text
+defm J2_jumprgtez : J2_jump_compare_0<">=", 0b01>;
+defm J2_jumprnz : J2_jump_compare_0<"==", 0b10>; // note: the "nz" name carries the "==" comparison text
+defm J2_jumprltez : J2_jump_compare_0<"<=", 0b11>;
+
+// Transfer to/from Control/GPR Guest/GPR (this class: register class RC -> control class CTRC)
+let hasSideEffects = 0 in
+class TFR_CR_RS_base<RegisterClass CTRC, RegisterClass RC, bit isDouble>
+ : CRInst <(outs CTRC:$dst), (ins RC:$src),
+ "$dst = $src", [], "", CR_tc_3x_SLOT3> {
+ bits<5> dst;
+ bits<5> src;
+
+ let IClass = 0b0110;
+
+ let Inst{27-25} = 0b001;
+ let Inst{24} = isDouble; // the 64-bit def passes 1, the 32-bit def passes 0
+ let Inst{23-21} = 0b001;
+ let Inst{20-16} = src;
+ let Inst{4-0} = dst;
+ }
+
+def A2_tfrrcr : TFR_CR_RS_base<CtrRegs, IntRegs, 0b0>; // 32-bit: IntRegs -> CtrRegs
+def A4_tfrpcp : TFR_CR_RS_base<CtrRegs64, DoubleRegs, 0b1>; // 64-bit: DoubleRegs -> CtrRegs64
+def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>; // "m0" is accepted as an alias that writes C6
+def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>; // "m1" is accepted as an alias that writes C7
+
+let hasSideEffects = 0 in // applies to the class below
+class TFR_RD_CR_base<RegisterClass RC, RegisterClass CTRC, bit isSingle> // control class CTRC -> register class RC
+ : CRInst <(outs RC:$dst), (ins CTRC:$src),
+ "$dst = $src", [], "", CR_tc_3x_SLOT3> {
+ bits<5> dst;
+ bits<5> src;
+
+ let IClass = 0b0110;
+
+ let Inst{27-26} = 0b10;
+ let Inst{25} = isSingle; // the 32-bit def passes 1, the 64-bit def passes 0
+ let Inst{24-21} = 0b0000;
+ let Inst{20-16} = src;
+ let Inst{4-0} = dst;
+ }
+
+let hasNewValue = 1, opNewValue = 0 in // this let covers only A2_tfrcrr, not A4_tfrcpp
+def A2_tfrcrr : TFR_RD_CR_base<IntRegs, CtrRegs, 1>;
+def A4_tfrcpp : TFR_RD_CR_base<DoubleRegs, CtrRegs64, 0>;
+def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>; // "m0" is accepted as an alias that reads C6
+def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>; // "m1" is accepted as an alias that reads C7
+
+// Y4_trace: Send value to etm trace. Asm form: "trace($Rs)"; no outputs.
+let isSoloAX = 1, hasSideEffects = 0 in
+def Y4_trace: CRInst <(outs), (ins IntRegs:$Rs),
+ "trace($Rs)"> {
+ bits<5> Rs;
+
+ let IClass = 0b0110;
+ let Inst{27-21} = 0b0010010;
+ let Inst{20-16} = Rs;
+ }
+
+// HI/LO Instructions: "$dst<RegHalf> = $imm_value" with a 16-bit immediate operand
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+ hasNewValue = 1, opNewValue = 0 in
+class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp>
+ : ALU32_ri<(outs IntRegs:$dst),
+ (ins u16_0Imm:$imm_value),
+ "$dst"#RegHalf#" = $imm_value", []> {
+ bits<5> dst;
+ bits<32> imm_value; // declared 32 bits wide; only bits 15-0 are encoded below
+ let IClass = 0b0111;
+
+ let Inst{27} = Rs; // here Rs is a template bit, not a register operand
+ let Inst{26-24} = MajOp;
+ let Inst{21} = MinOp;
+ let Inst{20-16} = dst;
+ let Inst{23-22} = imm_value{15-14};
+ let Inst{13-0} = imm_value{13-0};
+}
+
+let isAsmParserOnly = 1 in { // ".l" / ".h" register-half immediate forms
+ def LO : REG_IMMED<".l", 0b0, 0b001, 0b1>;
+ def HI : REG_IMMED<".h", 0b0, 0b010, 0b1>; // differs from LO only in RegHalf and MajOp
+}
+
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in { // no encoding bits are set for these two
+ def CONST32 : CONSTLDInst<(outs IntRegs:$Rd), (ins i32imm:$v),
+ "$Rd = CONST32(#$v)", []>;
+ def CONST64 : CONSTLDInst<(outs DoubleRegs:$Rd), (ins i64imm:$v),
+ "$Rd = CONST64(#$v)", []>;
+}
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+ isCodeGenOnly = 1 in // pseudo with a predicate result, no inputs and an empty asm string
+def PS_true : SInst<(outs PredRegs:$dst), (ins), "", []>;
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+ isCodeGenOnly = 1 in // same shape as PS_true
+def PS_false : SInst<(outs PredRegs:$dst), (ins), "", []>;
+
+let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in // asm text is an .error directive
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ ".error \"should not emit\" ", []>;
+
+let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in // takes two immediates, unlike ADJCALLSTACKDOWN
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ ".error \"should not emit\" ", []>;
+
+// Call subroutine indirectly. Defs come from VolatileV3.Regs (declared outside this chunk).
+let Defs = VolatileV3.Regs in
+def J2_callr : JUMPR_MISC_CALLR<0, 1>;
+
+// Indirect tail-call. Pseudo flagged as call, return, barrier and terminator.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+ isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_r : T_JMPr;
+
+// Direct tail-calls. Same flags as PS_tailcall_r, with a calltarget operand and empty asm string.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+ isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_i : JInst<(outs), (ins calltarget:$dst), "", []>;
+
+// The reason for the custom inserter is to record all ALLOCA instructions
+// in MachineFunctionInfo.
+let Defs = [R29], isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 1 in // marked as defining R29
+def PS_alloca: ALU32Inst<(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, u32_0Imm:$A), "", []>;
+
+let isCodeGenOnly = 1, isPseudo = 1, Uses = [R30], hasSideEffects = 0 in // marked as using R30
+def PS_aligna : ALU32Inst<(outs IntRegs:$Rd), (ins u32_0Imm:$A), "", []>;
+
+// XTYPE/SHIFT
+//
+//===----------------------------------------------------------------------===//
+// Template Class
+// Shift by immediate/register and accumulate/logical
+//===----------------------------------------------------------------------===//
+
+// Rx[+-&|]=asr(Rs,#u5)
+// Rx[+-&|^]=lsr(Rs,#u5)
+// Rx[+-&|^]=asl(Rs,#u5)
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_shift_imm_acc_r <string opc1, string opc2, SDNode OpNode1, // opc1: shift mnemonic, opc2: accumulate operator text
+ SDNode OpNode2, bits<3> majOp, bits<2> minOp> // OpNode1/OpNode2 are not referenced in the body
+ : SInst_acc<(outs IntRegs:$Rx),
+ (ins IntRegs:$src1, IntRegs:$Rs, u5_0Imm:$u5),
+ "$Rx "#opc2#opc1#"($Rs, #$u5)", [],
+ "$src1 = $Rx", S_2op_tc_2_SLOT23> { // $src1 is tied to $Rx and has no field of its own
+ bits<5> Rx;
+ bits<5> Rs;
+ bits<5> u5;
+
+ let IClass = 0b1000;
+
+ let Inst{27-24} = 0b1110;
+ let Inst{23-22} = majOp{2-1}; // majOp is split: high two bits here ...
+ let Inst{13} = 0b0;
+ let Inst{7} = majOp{0}; // ... and the low bit here
+ let Inst{6-5} = minOp;
+ let Inst{4-0} = Rx;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = u5;
+ }
+
+// Rx[+-&|]=asr(Rs,Rt)
+// Rx[+-&|^]=lsr(Rs,Rt)
+// Rx[+-&|^]=asl(Rs,Rt)
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_shift_reg_acc_r <string opc1, string opc2, SDNode OpNode1, // opc1: shift mnemonic, opc2: accumulate operator text
+ SDNode OpNode2, bits<2> majOp, bits<2> minOp> // OpNode1/OpNode2 are not referenced in the body
+ : SInst_acc<(outs IntRegs:$Rx),
+ (ins IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt),
+ "$Rx "#opc2#opc1#"($Rs, $Rt)", [],
+ "$src1 = $Rx", S_3op_tc_2_SLOT23 > { // $src1 is tied to $Rx and has no field of its own
+ bits<5> Rx;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1100;
+
+ let Inst{27-24} = 0b1100;
+ let Inst{23-22} = majOp; // two-bit majOp here (the immediate form takes three bits)
+ let Inst{7-6} = minOp;
+ let Inst{4-0} = Rx;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ }
+
+// Rxx[+-&|]=asr(Rss,#u6)
+// Rxx[+-&|^]=lsr(Rss,#u6)
+// Rxx[+-&|^]=asl(Rss,#u6)
+
+class T_shift_imm_acc_p <string opc1, string opc2, SDNode OpNode1, // 64-bit counterpart of T_shift_imm_acc_r
+ SDNode OpNode2, bits<3> majOp, bits<2> minOp> // OpNode1/OpNode2 are not referenced in the body
+ : SInst_acc<(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$src1, DoubleRegs:$Rss, u6_0Imm:$u6),
+ "$Rxx "#opc2#opc1#"($Rss, #$u6)", [],
+ "$src1 = $Rxx", S_2op_tc_2_SLOT23> { // $src1 is tied to $Rxx
+ bits<5> Rxx;
+ bits<5> Rss;
+ bits<6> u6;
+
+ let IClass = 0b1000;
+
+ let Inst{27-24} = 0b0010;
+ let Inst{23-22} = majOp{2-1}; // majOp is split: high two bits here ...
+ let Inst{7} = majOp{0}; // ... and the low bit here
+ let Inst{6-5} = minOp;
+ let Inst{4-0} = Rxx;
+ let Inst{20-16} = Rss;
+ let Inst{13-8} = u6; // six-bit immediate also occupies Inst{13}
+ }
+
+
+// Rxx[+-&|]=asr(Rss,Rt)
+// Rxx[+-&|^]=lsr(Rss,Rt)
+// Rxx[+-&|^]=asl(Rss,Rt)
+// Rxx[+-&|^]=lsl(Rss,Rt)
+
+class T_shift_reg_acc_p <string opc1, string opc2, SDNode OpNode1, // 64-bit accumulate, shift amount in an IntRegs register
+ SDNode OpNode2, bits<3> majOp, bits<2> minOp> // OpNode1/OpNode2 are not referenced in the body
+ : SInst_acc<(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$src1, DoubleRegs:$Rss, IntRegs:$Rt),
+ "$Rxx "#opc2#opc1#"($Rss, $Rt)", [],
+ "$src1 = $Rxx", S_3op_tc_2_SLOT23> { // $src1 is tied to $Rxx
+ bits<5> Rxx;
+ bits<5> Rss;
+ bits<5> Rt;
+
+ let IClass = 0b1100;
+
+ let Inst{27-24} = 0b1011;
+ let Inst{23-21} = majOp; // contiguous three-bit field (not split as in the immediate forms)
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rt;
+ let Inst{7-6} = minOp;
+ let Inst{4-0} = Rxx;
+ }
+
+//===----------------------------------------------------------------------===//
+// Multi-class for the shift instructions with logical/arithmetic operators.
+//===----------------------------------------------------------------------===//
+
+multiclass xtype_imm_base<string OpcStr1, string OpcStr2, SDNode OpNode1, // emits the 32-bit (_i_r) and 64-bit (_i_p) immediate forms
+ SDNode OpNode2, bits<3> majOp, bits<2> minOp > {
+ def _i_r#NAME : T_shift_imm_acc_r< OpcStr1, OpcStr2, OpNode1, // NOTE(review): presumably yields names like S2_asr_i_r_acc -- confirm NAME expansion
+ OpNode2, majOp, minOp >;
+ def _i_p#NAME : T_shift_imm_acc_p< OpcStr1, OpcStr2, OpNode1,
+ OpNode2, majOp, minOp >;
+}
+
+multiclass xtype_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> { // "+=", "-=", "&=", "|=" variants of one shift
+ let AddedComplexity = 100 in // this let covers only the _acc defm
+ defm _acc : xtype_imm_base< opc1, "+= ", OpNode, add, 0b001, minOp>;
+
+ defm _nac : xtype_imm_base< opc1, "-= ", OpNode, sub, 0b000, minOp>;
+ defm _and : xtype_imm_base< opc1, "&= ", OpNode, and, 0b010, minOp>;
+ defm _or : xtype_imm_base< opc1, "|= ", OpNode, or, 0b011, minOp>;
+}
+
+multiclass xtype_xor_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> { // "^=" variant, kept separate so asr can omit it
+let AddedComplexity = 100 in
+ defm _xacc : xtype_imm_base< opc1, "^= ", OpNode, xor, 0b100, minOp>;
+}
+
+defm S2_asr : xtype_imm_acc<"asr", sra, 0b00>; // asr gets no "^=" form
+
+defm S2_lsr : xtype_imm_acc<"lsr", srl, 0b01>, // lsr and asl additionally get the "^=" form
+ xtype_xor_imm_acc<"lsr", srl, 0b01>;
+
+defm S2_asl : xtype_imm_acc<"asl", shl, 0b10>,
+ xtype_xor_imm_acc<"asl", shl, 0b10>;
+
+multiclass xtype_reg_acc_r<string opc1, SDNode OpNode, bits<2>minOp> { // 32-bit shift-by-register accumulate variants
+ let AddedComplexity = 100 in // this let covers only _acc
+ def _acc : T_shift_reg_acc_r <opc1, "+= ", OpNode, add, 0b11, minOp>;
+
+ def _nac : T_shift_reg_acc_r <opc1, "-= ", OpNode, sub, 0b10, minOp>;
+ def _and : T_shift_reg_acc_r <opc1, "&= ", OpNode, and, 0b01, minOp>;
+ def _or : T_shift_reg_acc_r <opc1, "|= ", OpNode, or, 0b00, minOp>; // no "^=" variant in the 32-bit group
+}
+
+multiclass xtype_reg_acc_p<string opc1, SDNode OpNode, bits<2>minOp> { // 64-bit shift-by-register accumulate variants
+ let AddedComplexity = 100 in // this let covers only _acc
+ def _acc : T_shift_reg_acc_p <opc1, "+= ", OpNode, add, 0b110, minOp>;
+
+ def _nac : T_shift_reg_acc_p <opc1, "-= ", OpNode, sub, 0b100, minOp>;
+ def _and : T_shift_reg_acc_p <opc1, "&= ", OpNode, and, 0b010, minOp>;
+ def _or : T_shift_reg_acc_p <opc1, "|= ", OpNode, or, 0b000, minOp>;
+ def _xor : T_shift_reg_acc_p <opc1, "^= ", OpNode, xor, 0b011, minOp>; // "^=" exists only in the 64-bit group
+}
+
+multiclass xtype_reg_acc<string OpcStr, SDNode OpNode, bits<2> minOp > { // combines the 32-bit (_r_r) and 64-bit (_r_p) groups
+ defm _r_r : xtype_reg_acc_r <OpcStr, OpNode, minOp>;
+ defm _r_p : xtype_reg_acc_p <OpcStr, OpNode, minOp>;
+}
+
+defm S2_asl : xtype_reg_acc<"asl", shl, 0b10>;
+defm S2_asr : xtype_reg_acc<"asr", sra, 0b00>;
+defm S2_lsr : xtype_reg_acc<"lsr", srl, 0b01>;
+defm S2_lsl : xtype_reg_acc<"lsl", shl, 0b11>; // lsl passes shl as OpNode, same as asl
+
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in // applies to the class below
+class T_S3op_1 <string mnemonic, RegisterClass RC, bits<2> MajOp, bits<3> MinOp, // "$dst = mnemonic($src1, $src2)[:rnd][:>>1][:sat]"
+ bit SwapOps, bit isSat = 0, bit isRnd = 0, bit hasShift = 0>
+ : SInst <(outs RC:$dst),
+ (ins DoubleRegs:$src1, DoubleRegs:$src2),
+ "$dst = "#mnemonic#"($src1, $src2)"#!if(isRnd, ":rnd", "")
+ #!if(hasShift,":>>1","")
+ #!if(isSat, ":sat", ""),
+ [], "", S_3op_tc_2_SLOT23 > {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+
+ let IClass = 0b1100;
+
+ let Inst{27-24} = 0b0001;
+ let Inst{23-22} = MajOp;
+ let Inst{20-16} = !if (SwapOps, src2, src1); // SwapOps exchanges which source lands in which field
+ let Inst{12-8} = !if (SwapOps, src1, src2);
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = dst;
+ }
+
+class T_S3op_64 <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit SwapOps, // T_S3op_1 with a DoubleRegs result
+ bit isSat = 0, bit isRnd = 0, bit hasShift = 0 >
+ : T_S3op_1 <mnemonic, DoubleRegs, MajOp, MinOp, SwapOps,
+ isSat, isRnd, hasShift>;
+
+let Itinerary = S_3op_tc_1_SLOT23 in { // overrides the S_3op_tc_2_SLOT23 itinerary passed by T_S3op_1
+ def S2_shuffeb : T_S3op_64 < "shuffeb", 0b00, 0b010, 0>;
+ def S2_shuffeh : T_S3op_64 < "shuffeh", 0b00, 0b110, 0>;
+ def S2_shuffob : T_S3op_64 < "shuffob", 0b00, 0b100, 1>; // the two "odd" shuffles set SwapOps
+ def S2_shuffoh : T_S3op_64 < "shuffoh", 0b10, 0b000, 1>;
+
+ def S2_vtrunewh : T_S3op_64 < "vtrunewh", 0b10, 0b010, 0>;
+ def S2_vtrunowh : T_S3op_64 < "vtrunowh", 0b10, 0b100, 0>;
+}
+
+def S2_lfsp : T_S3op_64 < "lfs", 0b10, 0b110, 0>; // outside the let: keeps the class's default itinerary
+
+let hasSideEffects = 0 in // applies to the class below
+class T_S3op_2 <string mnemonic, bits<3> MajOp, bit SwapOps> // "$Rdd = mnemonic($Rss, $Rtt, $Pu)"
+ : SInst < (outs DoubleRegs:$Rdd),
+ (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
+ "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu)",
+ [], "", S_3op_tc_1_SLOT23 > {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rtt;
+ bits<2> Pu;
+
+ let IClass = 0b1100;
+
+ let Inst{27-24} = 0b0010;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = !if (SwapOps, Rtt, Rss); // SwapOps exchanges which source lands in which field
+ let Inst{12-8} = !if (SwapOps, Rss, Rtt);
+ let Inst{6-5} = Pu; // predicate operand
+ let Inst{4-0} = Rdd;
+ }
+
+def S2_valignrb : T_S3op_2 < "valignb", 0b000, 1>; // printed as "valignb"; operands swapped in the encoding
+def S2_vsplicerb : T_S3op_2 < "vspliceb", 0b100, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template class used by vector shift, vector rotate, vector neg,
+// 32-bit shift, 64-bit shifts, etc.
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in // applies to the class below
+class T_S3op_3 <string mnemonic, RegisterClass RC, bits<2> MajOp, // "$dst = mnemonic($src1, $src2)[:sat]"
+ bits<2> MinOp, bit isSat = 0, list<dag> pattern = [] >
+ : SInst <(outs RC:$dst),
+ (ins RC:$src1, IntRegs:$src2), // second source is always IntRegs regardless of RC
+ "$dst = "#mnemonic#"($src1, $src2)"#!if(isSat, ":sat", ""),
+ pattern, "", S_3op_tc_1_SLOT23> {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+
+ let IClass = 0b1100;
+
+ let Inst{27-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b0110, 0b0011); // 0b0110 for IntRegs, 0b0011 for any other class
+ let Inst{23-22} = MajOp;
+ let Inst{20-16} = src1;
+ let Inst{12-8} = src2;
+ let Inst{7-6} = MinOp;
+ let Inst{4-0} = dst;
+ }
+
+let hasNewValue = 1 in // 32-bit shift by register: IntRegs, MajOp = 0b01; OpNode is unused
+class T_S3op_shift32 <string mnemonic, SDNode OpNode, bits<2> MinOp>
+ : T_S3op_3 <mnemonic, IntRegs, 0b01, MinOp, 0, []>;
+
+let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in // saturating 32-bit form: MajOp = 0b00, isSat = 1
+class T_S3op_shift32_Sat <string mnemonic, bits<2> MinOp>
+ : T_S3op_3 <mnemonic, IntRegs, 0b00, MinOp, 1, []>;
+
+
+class T_S3op_shift64 <string mnemonic, SDNode OpNode, bits<2> MinOp> // 64-bit shift: DoubleRegs, MajOp = 0b10; OpNode is unused
+ : T_S3op_3 <mnemonic, DoubleRegs, 0b10, MinOp, 0, []>;
+
+
+class T_S3op_shiftVect <string mnemonic, bits<2> MajOp, bits<2> MinOp> // DoubleRegs form with caller-chosen MajOp
+ : T_S3op_3 <mnemonic, DoubleRegs, MajOp, MinOp, 0, []>;
+
+
+// Shift by register
+// Rdd=[asr|lsr|asl|lsl](Rss,Rt)
+
+def S2_asr_r_p : T_S3op_shift64 < "asr", sra, 0b00>;
+def S2_lsr_r_p : T_S3op_shift64 < "lsr", srl, 0b01>;
+def S2_asl_r_p : T_S3op_shift64 < "asl", shl, 0b10>;
+def S2_lsl_r_p : T_S3op_shift64 < "lsl", shl, 0b11>;
+
+// Rd=[asr|lsr|asl|lsl](Rs,Rt)
+
+def S2_asr_r_r : T_S3op_shift32<"asr", sra, 0b00>; // same MinOp values as the 64-bit defs above
+def S2_lsr_r_r : T_S3op_shift32<"lsr", srl, 0b01>;
+def S2_asl_r_r : T_S3op_shift32<"asl", shl, 0b10>;
+def S2_lsl_r_r : T_S3op_shift32<"lsl", shl, 0b11>;
+
+// Shift by register with saturation
+// Rd=asr(Rs,Rt):sat
+// Rd=asl(Rs,Rt):sat
+
+let Defs = [USR_OVF] in { // both saturating forms are marked as defining USR_OVF
+ def S2_asr_r_r_sat : T_S3op_shift32_Sat<"asr", 0b00>;
+ def S2_asl_r_r_sat : T_S3op_shift32_Sat<"asl", 0b10>;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in // applies to the class below
+class T_S3op_8 <string opc, bits<3> MinOp, bit isSat, bit isRnd, bit hasShift, bit hasSplat = 0> // flags only shape the asm string
+ : SInst < (outs IntRegs:$Rd),
+ (ins DoubleRegs:$Rss, IntRegs:$Rt),
+ "$Rd = "#opc#"($Rss, $Rt"#!if(hasSplat, "*", "")#")"
+ #!if(hasShift, ":<<1", "")
+ #!if(isRnd, ":rnd", "")
+ #!if(isSat, ":sat", ""),
+ [], "", S_3op_tc_1_SLOT23 > {
+ bits<5> Rd;
+ bits<5> Rss;
+ bits<5> Rt;
+
+ let IClass = 0b1100;
+
+ let Inst{27-24} = 0b0101;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = MinOp; // Inst{23-21} are not assigned by this class
+ let Inst{4-0} = Rd;
+ }
+
+def S2_asr_r_svw_trun : T_S3op_8<"vasrw", 0b010, 0, 0, 0>; // printed as "vasrw" with no suffixes
+
+let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in // this let covers only S2_vcrotate
+def S2_vcrotate : T_S3op_shiftVect < "vcrotate", 0b11, 0b00>;
+
+let hasSideEffects = 0 in // applies to the class below
+class T_S3op_7 <string mnemonic, bit MajOp > // "$Rdd = mnemonic($Rss, $Rtt, #$u3)"
+ : SInst <(outs DoubleRegs:$Rdd),
+ (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, u3_0Imm:$u3),
+ "$Rdd = "#mnemonic#"($Rss, $Rtt, #$u3)" ,
+ [], "", S_3op_tc_1_SLOT23 > {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rtt;
+ bits<3> u3;
+
+ let IClass = 0b1100;
+
+ let Inst{27-24} = 0b0000;
+ let Inst{23} = MajOp;
+ let Inst{20-16} = !if(MajOp, Rss, Rtt); // MajOp also decides operand placement: Rtt goes here when it is 0
+ let Inst{12-8} = !if(MajOp, Rtt, Rss);
+ let Inst{7-5} = u3;
+ let Inst{4-0} = Rdd;
+ }
+
+def S2_valignib : T_S3op_7 < "valignb", 0>; // MajOp = 0: Rtt in Inst{20-16}, Rss in Inst{12-8}
+def S2_vspliceib : T_S3op_7 < "vspliceb", 1>; // MajOp = 1: Rss in Inst{20-16}, Rtt in Inst{12-8}
+
+//===----------------------------------------------------------------------===//
+// Template class for 'insert bitfield' instructions
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in // applies to the class below
+class T_S3op_insert <string mnemonic, RegisterClass RC> // "$dst = mnemonic($src2, $src3)" with $src1 tied to $dst
+ : SInst <(outs RC:$dst),
+ (ins RC:$src1, RC:$src2, DoubleRegs:$src3), // $src3 is DoubleRegs for either RC
+ "$dst = "#mnemonic#"($src2, $src3)" ,
+ [], "$src1 = $dst", S_3op_tc_1_SLOT23 > {
+ bits<5> dst;
+ bits<5> src2;
+ bits<5> src3;
+
+ let IClass = 0b1100;
+
+ let Inst{27-26} = 0b10;
+ let Inst{25-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b00, 0b10); // 0b00 for IntRegs, 0b10 for any other class
+ let Inst{23} = 0b0;
+ let Inst{20-16} = src2;
+ let Inst{12-8} = src3;
+ let Inst{4-0} = dst;
+ }
+
+let hasSideEffects = 0 in // applies to the class below
+class T_S2op_insert <bits<4> RegTyBits, RegisterClass RC, Operand ImmOp> // "$dst = insert($src1, #$src2, #$src3)", $dst2 tied to $dst
+ : SInst <(outs RC:$dst), (ins RC:$dst2, RC:$src1, ImmOp:$src2, ImmOp:$src3),
+ "$dst = insert($src1, #$src2, #$src3)",
+ [], "$dst2 = $dst", S_2op_tc_2_SLOT23> {
+ bits<5> dst;
+ bits<5> src1;
+ bits<6> src2; // declared six bits wide for both immediate widths
+ bits<6> src3;
+ bit bit23;
+ bit bit13;
+ string ImmOpStr = !cast<string>(ImmOp);
+
+ let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5}, 0); // top immediate bits are encoded only for u6_0Imm, else 0
+ let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0);
+
+ let IClass = 0b1000;
+
+ let Inst{27-24} = RegTyBits;
+ let Inst{23} = bit23;
+ let Inst{22-21} = src3{4-3}; // src3 is split across Inst{23}, Inst{22-21} and Inst{7-5}
+ let Inst{20-16} = src1;
+ let Inst{13} = bit13;
+ let Inst{12-8} = src2{4-0};
+ let Inst{7-5} = src3{2-0};
+ let Inst{4-0} = dst;
+ }
+
+// Rx=insert(Rs,Rtt)
+// Rx=insert(Rs,#u5,#U5)
+let hasNewValue = 1 in { // only the 32-bit forms get hasNewValue
+ def S2_insert_rp : T_S3op_insert <"insert", IntRegs>;
+ def S2_insert : T_S2op_insert <0b1111, IntRegs, u5_0Imm>;
+}
+
+// Rxx=insert(Rss,Rtt)
+// Rxx=insert(Rss,#u6,#U6)
+def S2_insertp_rp : T_S3op_insert<"insert", DoubleRegs>;
+def S2_insertp : T_S2op_insert <0b0011, DoubleRegs, u6_0Imm>; // u6_0Imm enables the bit23/bit13 immediate bits
+
+
+//===----------------------------------------------------------------------===//
+// Template class for 'extract bitfield' instructions
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, hasSideEffects = 0 in // applies to the class below
+class T_S3op_extract <string mnemonic, bits<2> MinOp> // "$Rd = mnemonic($Rs, $Rtt)", 32-bit result
+ : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
+ "$Rd = "#mnemonic#"($Rs, $Rtt)",
+ [], "", S_3op_tc_2_SLOT23 > {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rtt;
+
+ let IClass = 0b1100;
+
+ let Inst{27-22} = 0b100100;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rtt;
+ let Inst{7-6} = MinOp;
+ let Inst{4-0} = Rd;
+ }
+
+let hasSideEffects = 0 in // applies to the class below
+class T_S2op_extract <string mnemonic, bits<4> RegTyBits, // "$dst = mnemonic($src1, #$src2, #$src3)"
+ RegisterClass RC, Operand ImmOp>
+ : SInst <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2, ImmOp:$src3),
+ "$dst = "#mnemonic#"($src1, #$src2, #$src3)",
+ [], "", S_2op_tc_2_SLOT23> {
+ bits<5> dst;
+ bits<5> src1;
+ bits<6> src2; // declared six bits wide for both immediate widths
+ bits<6> src3;
+ bit bit23;
+ bit bit13;
+ string ImmOpStr = !cast<string>(ImmOp);
+
+ let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5}, // u6: top bit of src3; otherwise 0 for "extractu", 1 for other mnemonics
+ !if (!eq(mnemonic, "extractu"), 0, 1));
+
+ let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0); // top bit of src2 only for u6_0Imm
+
+ let IClass = 0b1000;
+
+ let Inst{27-24} = RegTyBits;
+ let Inst{23} = bit23;
+ let Inst{22-21} = src3{4-3}; // src3 is split across Inst{23}, Inst{22-21} and Inst{7-5}
+ let Inst{20-16} = src1;
+ let Inst{13} = bit13;
+ let Inst{12-8} = src2{4-0};
+ let Inst{7-5} = src3{2-0};
+ let Inst{4-0} = dst;
+ }
+
+// Extract bitfield
+
+// Rdd=extractu(Rss,Rtt)
+// Rdd=extractu(Rss,#u6,#U6)
+def S2_extractup_rp : T_S3op_64 < "extractu", 0b00, 0b000, 0>;
+def S2_extractup : T_S2op_extract <"extractu", 0b0001, DoubleRegs, u6_0Imm>;
+
+// Rd=extractu(Rs,Rtt)
+// Rd=extractu(Rs,#u5,#U5)
+let hasNewValue = 1 in {
+ def S2_extractu_rp : T_S3op_extract<"extractu", 0b00>;
+ def S2_extractu : T_S2op_extract <"extractu", 0b1101, IntRegs, u5_0Imm>;
+}
+
+//===----------------------------------------------------------------------===//
+// :raw form of tableidx[bhwd] insns
+//===----------------------------------------------------------------------===//
+
+// Template for the ":raw" table-index instructions. OpStr is the mnemonic
+// placed in the assembly string and MinOp (Inst{23-22}) selects the variant.
+// The "$Rx = $_dst_" constraint ties the output $Rx to the $_dst_ input.
+let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class tableidxRaw<string OpStr, bits<2>MinOp>
+ : SInst <(outs IntRegs:$Rx),
+ (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, s6_0Imm:$S6),
+ "$Rx = "#OpStr#"($Rs, #$u4, #$S6):raw",
+ [], "$Rx = $_dst_" > {
+ bits<5> Rx;
+ bits<5> Rs;
+ bits<4> u4;
+ bits<6> S6;
+
+ let IClass = 0b1000;
+
+ let Inst{27-24} = 0b0111;
+ let Inst{23-22} = MinOp;
+ // u4 is split: bit 3 goes to Inst{21}, bits 2-0 go to Inst{7-5}.
+ let Inst{21} = u4{3};
+ let Inst{20-16} = Rs;
+ let Inst{13-8} = S6;
+ let Inst{7-5} = u4{2-0};
+ let Inst{4-0} = Rx;
+ }
+
+def S2_tableidxb : tableidxRaw<"tableidxb", 0b00>;
+def S2_tableidxh : tableidxRaw<"tableidxh", 0b01>;
+def S2_tableidxw : tableidxRaw<"tableidxw", 0b10>;
+def S2_tableidxd : tableidxRaw<"tableidxd", 0b11>;
+
+//===----------------------------------------------------------------------===//
+// Template class for 'table index' instructions which are assembler mapped
+// to their :raw format.
+//===----------------------------------------------------------------------===//
+let isPseudo = 1 in
+class tableidx_goodsyntax <string mnemonic>
+ : SInst <(outs IntRegs:$Rx),
+ (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, u5_0Imm:$u5),
+ "$Rx = "#mnemonic#"($Rs, #$u4, #$u5)",
+ [], "$Rx = $_dst_" >;
+
+def S2_tableidxb_goodsyntax : tableidx_goodsyntax<"tableidxb">;
+def S2_tableidxh_goodsyntax : tableidx_goodsyntax<"tableidxh">;
+def S2_tableidxw_goodsyntax : tableidx_goodsyntax<"tableidxw">;
+def S2_tableidxd_goodsyntax : tableidx_goodsyntax<"tableidxd">;
+
+//===----------------------------------------------------------------------===//
+// V3 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV3.td"
+
+//===----------------------------------------------------------------------===//
+// V3 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V4 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV5.td"
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V60 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV60.td"
+
+//===----------------------------------------------------------------------===//
+// V60 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU32/64/Vector +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoVector.td"
+
+include "HexagonInstrAlias.td"
+include "HexagonSystemInst.td"
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
new file mode 100644
index 000000000000..225f94405076
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
@@ -0,0 +1,215 @@
+//=- HexagonInstrInfoV3.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V3 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// J +
+//===----------------------------------------------------------------------===//
+// Call subroutine.
+// Unconditional call template. CSR selects the Defs list (VolatileV3.Regs
+// when set, empty otherwise); ExtStr is inserted into the assembly string
+// between "call " and the target operand.
+let isCall = 1, hasSideEffects = 1, isPredicable = 1,
+ isExtended = 0, isExtendable = 1, opExtendable = 0,
+ isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
+class T_Call<bit CSR, string ExtStr>
+ : JInst<(outs), (ins calltarget:$dst),
+ "call " # ExtStr # "$dst", [], "", J_tc_2early_SLOT23> {
+ let BaseOpcode = "call";
+ bits<24> dst;
+
+ let Defs = !if (CSR, VolatileV3.Regs, []);
+ let IClass = 0b0101;
+ let Inst{27-25} = 0b101;
+ // Only dst{23-2} is encoded (22 bits split over two fields); the two low
+ // bits of dst are not stored in the instruction word.
+ let Inst{24-16,13-1} = dst{23-2};
+ let Inst{0} = 0b0;
+}
+
+// Predicated call template. $Pu is the controlling predicate; IfTrue selects
+// the sense of the predicate (IfTrue = 0 sets isPredicatedFalse and
+// Inst{21}). CSR and ExtStr have the same meaning as in T_Call.
+let isCall = 1, hasSideEffects = 1, isPredicated = 1,
+ isExtended = 0, isExtendable = 1, opExtendable = 1,
+ isExtentSigned = 1, opExtentBits = 17, opExtentAlign = 2 in
+class T_CallPred<bit CSR, bit IfTrue, string ExtStr>
+ : JInst<(outs), (ins PredRegs:$Pu, calltarget:$dst),
+ CondStr<"$Pu", IfTrue, 0>.S # "call " # ExtStr # "$dst",
+ [], "", J_tc_2early_SLOT23> {
+ let BaseOpcode = "call";
+ let isPredicatedFalse = !if(IfTrue,0,1);
+ bits<2> Pu;
+ bits<17> dst;
+
+ let Defs = !if (CSR, VolatileV3.Regs, []);
+ let IClass = 0b0101;
+ let Inst{27-24} = 0b1101;
+ // Only dst{16-2} is encoded (15 bits scattered over four fields); the two
+ // low bits of dst are not stored in the instruction word.
+ let Inst{23-22,20-16,13,7-1} = dst{16-2};
+ let Inst{21} = !if(IfTrue,0,1);
+ let Inst{11} = 0b0;
+ let Inst{9-8} = Pu;
+}
+
+// Defines three records that share CSR and ExtStr: the unconditional call
+// (NAME), the predicated-true call (def t) and the predicated-false call
+// (def f).
+multiclass T_Calls<bit CSR, string ExtStr> {
+ def NAME : T_Call<CSR, ExtStr>;
+ def t : T_CallPred<CSR, 1, ExtStr>;
+ def f : T_CallPred<CSR, 0, ExtStr>;
+}
+
+defm J2_call: T_Calls<1, "">, PredRel;
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
+ Defs = VolatileV3.Regs in
+def PS_call_nr : T_Call<1, "">, PredRel;
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
+ Defs = [PC, R31, R6, R7, P0] in
+def PS_call_stk : T_Call<0, "">, PredRel;
+
+//===----------------------------------------------------------------------===//
+// J -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// JR +
+//===----------------------------------------------------------------------===//
+// Call subroutine from register.
+
+let isCodeGenOnly = 1, Defs = VolatileV3.Regs in {
+ def PS_callr_nr : JUMPR_MISC_CALLR<0, 1>; // Call, no return.
+}
+
+//===----------------------------------------------------------------------===//
+// JR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU +
+//===----------------------------------------------------------------------===//
+
+let Defs = [USR_OVF], Itinerary = ALU64_tc_2_SLOT23 in
+def A2_addpsat : T_ALU64_arith<"add", 0b011, 0b101, 1, 0, 1>;
+
+class T_ALU64_addsp_hl<string suffix, bits<3> MinOp>
+ : T_ALU64_rr<"add", suffix, 0b0011, 0b011, MinOp, 0, 0, "">;
+
+def A2_addspl : T_ALU64_addsp_hl<":raw:lo", 0b110>;
+def A2_addsph : T_ALU64_addsp_hl<":raw:hi", 0b111>;
+
+let hasSideEffects = 0, isAsmParserOnly = 1 in
+def A2_addsp : ALU64_rr<(outs DoubleRegs:$Rd),
+ (ins IntRegs:$Rs, DoubleRegs:$Rt), "$Rd = add($Rs, $Rt)", [],
+ "", ALU64_tc_1_SLOT23>;
+
+
+// Template for the DoubleRegs min/max instructions. isMax picks "max" vs
+// "min" and isUnsigned appends the "u" suffix to the mnemonic.
+let hasSideEffects = 0 in
+class T_XTYPE_MIN_MAX_P<bit isMax, bit isUnsigned>
+ : ALU64Inst<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rt, DoubleRegs:$Rs),
+ "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
+ #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+
+ let Inst{27-23} = 0b00111;
+ let Inst{22-21} = !if(isMax, 0b10, 0b01);
+ // The two source fields are swapped between max and min: max encodes Rt in
+ // Inst{20-16} and Rs in Inst{12-8}; min encodes them the other way round.
+ let Inst{20-16} = !if(isMax, Rt, Rs);
+ let Inst{12-8} = !if(isMax, Rs, Rt);
+ let Inst{7} = 0b1;
+ let Inst{6} = !if(isMax, 0b0, 0b1);
+ let Inst{5} = isUnsigned;
+ let Inst{4-0} = Rd;
+}
+
+def A2_minp : T_XTYPE_MIN_MAX_P<0, 0>;
+def A2_minup : T_XTYPE_MIN_MAX_P<0, 1>;
+def A2_maxp : T_XTYPE_MIN_MAX_P<1, 0>;
+def A2_maxup : T_XTYPE_MIN_MAX_P<1, 1>;
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// :raw form of vrcmpys:hi/lo insns
+//===----------------------------------------------------------------------===//
+// Vector reduce complex multiply by scalar.
+// Template for the ":raw" vrcmpys instructions. HiLo is appended to the
+// assembly string as the final ":hi"/":lo" style suffix and MajOp is written
+// to Inst{23-21}. The instruction is marked as defining USR_OVF.
+let Defs = [USR_OVF], hasSideEffects = 0 in
+class T_vrcmpRaw<string HiLo, bits<3>MajOp>:
+ MInst<(outs DoubleRegs:$Rdd),
+ (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Rdd = vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, []> {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1000;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ let Inst{7-5} = 0b100;
+ let Inst{4-0} = Rdd;
+}
+
+def M2_vrcmpys_s1_h: T_vrcmpRaw<"hi", 0b101>;
+def M2_vrcmpys_s1_l: T_vrcmpRaw<"lo", 0b111>;
+
+// Assembler mapped to M2_vrcmpys_s1_h or M2_vrcmpys_s1_l
+let hasSideEffects = 0, isAsmParserOnly = 1 in
+def M2_vrcmpys_s1
+ : MInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
+ "$Rdd=vrcmpys($Rss,$Rt):<<1:sat">;
+
+// Vector reduce complex multiply by scalar with accumulation.
+// Accumulating ("+=") counterpart of the ":raw" vrcmpys template. The
+// "$Rxx = $_src_" constraint ties the output $Rxx to the $_src_ input.
+// HiLo is appended to the assembly string; MajOp is written to Inst{23-21}.
+let Defs = [USR_OVF], hasSideEffects = 0 in
+class T_vrcmpys_acc<string HiLo, bits<3>MajOp>:
+ MInst <(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$_src_, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Rxx += vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, [],
+ "$Rxx = $_src_"> {
+ bits<5> Rxx;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1010;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ let Inst{7-5} = 0b100;
+ let Inst{4-0} = Rxx;
+ }
+
+def M2_vrcmpys_acc_s1_h: T_vrcmpys_acc<"hi", 0b101>;
+def M2_vrcmpys_acc_s1_l: T_vrcmpys_acc<"lo", 0b111>;
+
+// Assembler mapped to M2_vrcmpys_acc_s1_h or M2_vrcmpys_acc_s1_l
+
+let isAsmParserOnly = 1 in
+def M2_vrcmpys_acc_s1
+ : MInst <(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$dst2, DoubleRegs:$src1, IntRegs:$src2),
+ "$dst += vrcmpys($src1, $src2):<<1:sat", [],
+ "$dst2 = $dst">;
+
+def M2_vrcmpys_s1rp_h : T_MType_vrcmpy <"vrcmpys", 0b101, 0b110, 1>;
+def M2_vrcmpys_s1rp_l : T_MType_vrcmpy <"vrcmpys", 0b101, 0b111, 0>;
+
+// Assembler mapped to M2_vrcmpys_s1rp_h or M2_vrcmpys_s1rp_l
+let isAsmParserOnly = 1 in
+def M2_vrcmpys_s1rp
+ : MInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
+ "$Rd=vrcmpys($Rss,$Rt):<<1:rnd:sat">;
+
+
+// S2_cabacdecbin: Cabac decode bin.
+let Defs = [P0], isPredicateLate = 1, Itinerary = S_3op_tc_1_SLOT23 in
+def S2_cabacdecbin : T_S3op_64 < "decbin", 0b11, 0b110, 0>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
new file mode 100644
index 000000000000..18943a082d28
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -0,0 +1,3301 @@
+//=- HexagonInstrInfoV4.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V4 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+def DuplexIClass0: InstDuplex < 0 >;
+def DuplexIClass1: InstDuplex < 1 >;
+def DuplexIClass2: InstDuplex < 2 >;
+let isExtendable = 1 in {
+ def DuplexIClass3: InstDuplex < 3 >;
+ def DuplexIClass4: InstDuplex < 4 >;
+ def DuplexIClass5: InstDuplex < 5 >;
+ def DuplexIClass6: InstDuplex < 6 >;
+ def DuplexIClass7: InstDuplex < 7 >;
+}
+def DuplexIClass8: InstDuplex < 8 >;
+def DuplexIClass9: InstDuplex < 9 >;
+def DuplexIClassA: InstDuplex < 0xA >;
+def DuplexIClassB: InstDuplex < 0xB >;
+def DuplexIClassC: InstDuplex < 0xC >;
+def DuplexIClassD: InstDuplex < 0xD >;
+def DuplexIClassE: InstDuplex < 0xE >;
+def DuplexIClassF: InstDuplex < 0xF >;
+
+// Template for the "immext(#imm)" extender instruction. ImmType is the
+// operand class of the single immediate input.
+let hasSideEffects = 0 in
+class T_Immext<Operand ImmType>
+ : EXTENDERInst<(outs), (ins ImmType:$imm),
+ "immext(#$imm)", []> {
+ bits<32> imm;
+ let IClass = 0b0000;
+
+ // Only imm{31-6} is encoded, split into two fields; the low six bits of
+ // imm are not stored in this instruction word.
+ let Inst{27-16} = imm{31-20};
+ let Inst{13-0} = imm{19-6};
+ }
+
+def A4_ext : T_Immext<u26_6Imm>;
+let isCodeGenOnly = 1 in {
+ let isBranch = 1 in
+ def A4_ext_b : T_Immext<brtarget>;
+ let isCall = 1 in
+ def A4_ext_c : T_Immext<calltarget>;
+ def A4_ext_g : T_Immext<globaladdress>;
+}
+
+// Hexagon V4 Architecture spec defines the following instruction classes:
+// LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM (SYSTEM is not implemented in the
+// compiler)
+
+// LD Instructions:
+// ========================================
+// Loads (8/16/32/64 bit)
+// Deallocframe
+
+// ST Instructions:
+// ========================================
+// Stores (8/16/32/64 bit)
+// Allocframe
+
+// ALU32 Instructions:
+// ========================================
+// Arithmetic / Logical (32 bit)
+// Vector Halfword
+
+// XTYPE Instructions (32/64 bit):
+// ========================================
+// Arithmetic, Logical, Bit Manipulation
+// Multiply (Integer, Fractional, Complex)
+// Permute / Vector Permute Operations
+// Predicate Operations
+// Shift / Shift with Add/Sub/Logical
+// Vector Byte ALU
+// Vector Halfword (ALU, Shift, Multiply)
+// Vector Word (ALU, Shift)
+
+// J Instructions:
+// ========================================
+// Jump/Call PC-relative
+
+// JR Instructions:
+// ========================================
+// Jump/Call Register
+
+// MEMOP Instructions:
+// ========================================
+// Operation on memory (8/16/32 bit)
+
+// NV Instructions:
+// ========================================
+// New-value Jumps
+// New-value Stores
+
+// CR Instructions:
+// ========================================
+// Control-Register Transfers
+// Hardware Loop Setup
+// Predicate Logicals & Reductions
+
+// SYSTEM Instructions (not implemented in the compiler):
+// ========================================
+// Prefetch
+// Cache Maintenance
+// Bus Operations
+
+
+//===----------------------------------------------------------------------===//
+// ALU32 +
+//===----------------------------------------------------------------------===//
+
+class T_ALU32_3op_not<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ bit OpsRev>
+ : T_ALU32_3op<mnemonic, MajOp, MinOp, OpsRev, 0> {
+ let AsmString = "$Rd = "#mnemonic#"($Rs, ~$Rt)";
+}
+
+let BaseOpcode = "andn_rr", CextOpcode = "andn" in
+def A4_andn : T_ALU32_3op_not<"and", 0b001, 0b100, 1>;
+let BaseOpcode = "orn_rr", CextOpcode = "orn" in
+def A4_orn : T_ALU32_3op_not<"or", 0b001, 0b101, 1>;
+
+let CextOpcode = "rcmp.eq" in
+def A4_rcmpeq : T_ALU32_3op<"cmp.eq", 0b011, 0b010, 0, 1>;
+let CextOpcode = "!rcmp.eq" in
+def A4_rcmpneq : T_ALU32_3op<"!cmp.eq", 0b011, 0b011, 0, 1>;
+
+def C4_cmpneq : T_ALU32_3op_cmp<"!cmp.eq", 0b00, 1, 1>;
+def C4_cmplte : T_ALU32_3op_cmp<"!cmp.gt", 0b10, 1, 0>;
+def C4_cmplteu : T_ALU32_3op_cmp<"!cmp.gtu", 0b11, 1, 0>;
+
+// Register-register compare template producing a predicate $Pd. 'mnemonic'
+// is used both in the assembly string and as CextOpcode; MinOp is written to
+// Inst{7-5}; IsComm sets isCommutable.
+class T_CMP_rrbh<string mnemonic, bits<3> MinOp, bit IsComm>
+ : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", S_3op_tc_2early_SLOT23>,
+ ImmRegRel {
+ let InputType = "reg";
+ let CextOpcode = mnemonic;
+ let isCompare = 1;
+ let isCommutable = IsComm;
+ let hasSideEffects = 0;
+
+ bits<2> Pd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1100;
+ let Inst{27-21} = 0b0111110;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = MinOp;
+ let Inst{1-0} = Pd;
+}
+
+def A4_cmpbeq : T_CMP_rrbh<"cmpb.eq", 0b110, 1>;
+def A4_cmpbgt : T_CMP_rrbh<"cmpb.gt", 0b010, 0>;
+def A4_cmpbgtu : T_CMP_rrbh<"cmpb.gtu", 0b111, 0>;
+def A4_cmpheq : T_CMP_rrbh<"cmph.eq", 0b011, 1>;
+def A4_cmphgt : T_CMP_rrbh<"cmph.gt", 0b100, 0>;
+def A4_cmphgtu : T_CMP_rrbh<"cmph.gtu", 0b101, 0>;
+
+// Register-immediate compare template producing a predicate $Pd.
+// mnemonic - assembly mnemonic, also used as CextOpcode.
+// MajOp - written to Inst{22-21}.
+// IsHalf - written to Inst{3}.
+// IsComm - sets isCommutable.
+// ImmType - operand class of the immediate $Imm.
+// IsImmExt - sets isExtendable; when set, opExtendable is 2 (the $Imm
+// operand), otherwise 0.
+// IsImmSigned, ImmBits - set isExtentSigned and opExtentBits.
+class T_CMP_ribh<string mnemonic, bits<2> MajOp, bit IsHalf, bit IsComm,
+ Operand ImmType, bit IsImmExt, bit IsImmSigned, int ImmBits>
+ : ALU64Inst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, ImmType:$Imm),
+ "$Pd = "#mnemonic#"($Rs, #$Imm)", [], "", ALU64_tc_2early_SLOT23>,
+ ImmRegRel {
+ let InputType = "imm";
+ let CextOpcode = mnemonic;
+ let isCompare = 1;
+ let isCommutable = IsComm;
+ let hasSideEffects = 0;
+ let isExtendable = IsImmExt;
+ let opExtendable = !if (IsImmExt, 2, 0);
+ let isExtentSigned = IsImmSigned;
+ let opExtentBits = ImmBits;
+
+ bits<2> Pd;
+ bits<5> Rs;
+ // The encoding always reserves eight bits (Inst{12-5}) for the immediate.
+ bits<8> Imm;
+
+ let IClass = 0b1101;
+ let Inst{27-24} = 0b1101;
+ let Inst{22-21} = MajOp;
+ let Inst{20-16} = Rs;
+ let Inst{12-5} = Imm;
+ let Inst{4} = 0b0;
+ let Inst{3} = IsHalf;
+ let Inst{1-0} = Pd;
+}
+
+def A4_cmpbeqi : T_CMP_ribh<"cmpb.eq", 0b00, 0, 1, u8_0Imm, 0, 0, 8>;
+def A4_cmpbgti : T_CMP_ribh<"cmpb.gt", 0b01, 0, 0, s8_0Imm, 0, 1, 8>;
+def A4_cmpbgtui : T_CMP_ribh<"cmpb.gtu", 0b10, 0, 0, u7_0Ext, 1, 0, 7>;
+def A4_cmpheqi : T_CMP_ribh<"cmph.eq", 0b00, 1, 1, s8_0Ext, 1, 1, 8>;
+def A4_cmphgti : T_CMP_ribh<"cmph.gt", 0b01, 1, 0, s8_0Ext, 1, 1, 8>;
+def A4_cmphgtui : T_CMP_ribh<"cmph.gtu", 0b10, 1, 0, u7_0Ext, 1, 0, 7>;
+
+// Register-immediate compare template whose result goes to an IntRegs
+// register $Rd rather than a predicate. IsNeg selects the negated form: it
+// picks the CextOpcode ("!rcmp.eq" vs "rcmp.eq") and is written to Inst{21}.
+// The signed 8-bit immediate $s8 (operand 2) is the extendable operand.
+class T_RCMP_EQ_ri<string mnemonic, bit IsNeg>
+ : ALU32_ri<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s8_0Ext:$s8),
+ "$Rd = "#mnemonic#"($Rs, #$s8)", [], "", ALU32_2op_tc_1_SLOT0123>,
+ ImmRegRel {
+ let InputType = "imm";
+ let CextOpcode = !if (IsNeg, "!rcmp.eq", "rcmp.eq");
+ let isExtendable = 1;
+ let opExtendable = 2;
+ let isExtentSigned = 1;
+ let opExtentBits = 8;
+ let hasNewValue = 1;
+
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<8> s8;
+
+ let IClass = 0b0111;
+ let Inst{27-24} = 0b0011;
+ let Inst{22} = 0b1;
+ let Inst{21} = IsNeg;
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b1;
+ let Inst{12-5} = s8;
+ let Inst{4-0} = Rd;
+}
+
+def A4_rcmpeqi : T_RCMP_EQ_ri<"cmp.eq", 0>;
+def A4_rcmpneqi : T_RCMP_EQ_ri<"!cmp.eq", 1>;
+
+//===----------------------------------------------------------------------===//
+// ALU32 -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+
+// Combine a word and an immediate into a register pair.
+// Shared encoding for the register/immediate "combine" forms. The input dag
+// and the assembly string are supplied by each def, so the same template
+// serves both operand orders; MajOp (Inst{22-21}) tells them apart. Each def
+// is expected to set opExtendable itself, since the position of the
+// immediate operand depends on the operand order.
+let hasSideEffects = 0, isExtentSigned = 1, isExtendable = 1,
+ opExtentBits = 8 in
+class T_Combine1 <bits<2> MajOp, dag ins, string AsmStr>
+ : ALU32Inst <(outs DoubleRegs:$Rdd), ins, AsmStr> {
+ bits<5> Rdd;
+ bits<5> Rs;
+ bits<8> s8;
+
+ let IClass = 0b0111;
+ let Inst{27-24} = 0b0011;
+ let Inst{22-21} = MajOp;
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b1;
+ let Inst{12-5} = s8;
+ let Inst{4-0} = Rdd;
+ }
+
+let opExtendable = 2 in
+def A4_combineri : T_Combine1<0b00, (ins IntRegs:$Rs, s8_0Ext:$s8),
+ "$Rdd = combine($Rs, #$s8)">;
+
+let opExtendable = 1 in
+def A4_combineir : T_Combine1<0b01, (ins s8_0Ext:$s8, IntRegs:$Rs),
+ "$Rdd = combine(#$s8, $Rs)">;
+
+// A4_combineii: Set two small immediates.
+let hasSideEffects = 0, isExtendable = 1, opExtentBits = 6, opExtendable = 2 in
+def A4_combineii: ALU32Inst<(outs DoubleRegs:$Rdd), (ins s8_0Imm:$s8, u6_0Ext:$U6),
+ "$Rdd = combine(#$s8, #$U6)"> {
+ bits<5> Rdd;
+ bits<8> s8;
+ bits<6> U6;
+
+ let IClass = 0b0111;
+ let Inst{27-23} = 0b11001;
+ let Inst{20-16} = U6{5-1};
+ let Inst{13} = U6{0};
+ let Inst{12-5} = s8;
+ let Inst{4-0} = Rdd;
+ }
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LD +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Template class for load instructions with Absolute set addressing mode.
+//===----------------------------------------------------------------------===//
+// Load with two outputs: the loaded value $dst1 (register class RC) and an
+// IntRegs register $dst2 that appears in the address expression
+// "($dst2 = #$addr)". MajOp is written to Inst{24-21}. Operand 2 ($addr) is
+// the extended operand.
+let isExtended = 1, opExtendable = 2, opExtentBits = 6, addrMode = AbsoluteSet,
+ hasSideEffects = 0 in
+class T_LD_abs_set<string mnemonic, RegisterClass RC, bits<4>MajOp>:
+ LDInst<(outs RC:$dst1, IntRegs:$dst2),
+ (ins u6_0Ext:$addr),
+ "$dst1 = "#mnemonic#"($dst2 = #$addr)",
+ []> {
+ // NOTE(review): 'name' is not referenced anywhere in this class body;
+ // confirm whether it is still needed.
+ bits<7> name;
+ bits<5> dst1;
+ bits<5> dst2;
+ bits<6> addr;
+
+ let IClass = 0b1001;
+ let Inst{27-25} = 0b101;
+ let Inst{24-21} = MajOp;
+ let Inst{13-12} = 0b01;
+ let Inst{4-0} = dst1;
+ let Inst{20-16} = dst2;
+ // The 6-bit address field is split: addr{5-2} in Inst{11-8} and
+ // addr{1-0} in Inst{6-5}.
+ let Inst{11-8} = addr{5-2};
+ let Inst{6-5} = addr{1-0};
+}
+
+let accessSize = ByteAccess, hasNewValue = 1 in {
+ def L4_loadrb_ap : T_LD_abs_set <"memb", IntRegs, 0b1000>;
+ def L4_loadrub_ap : T_LD_abs_set <"memub", IntRegs, 0b1001>;
+}
+
+let accessSize = HalfWordAccess, hasNewValue = 1 in {
+ def L4_loadrh_ap : T_LD_abs_set <"memh", IntRegs, 0b1010>;
+ def L4_loadruh_ap : T_LD_abs_set <"memuh", IntRegs, 0b1011>;
+ def L4_loadbsw2_ap : T_LD_abs_set <"membh", IntRegs, 0b0001>;
+ def L4_loadbzw2_ap : T_LD_abs_set <"memubh", IntRegs, 0b0011>;
+}
+
+let accessSize = WordAccess, hasNewValue = 1 in
+ def L4_loadri_ap : T_LD_abs_set <"memw", IntRegs, 0b1100>;
+
+let accessSize = WordAccess in {
+ def L4_loadbzw4_ap : T_LD_abs_set <"memubh", DoubleRegs, 0b0101>;
+ def L4_loadbsw4_ap : T_LD_abs_set <"membh", DoubleRegs, 0b0111>;
+}
+
+let accessSize = DoubleWordAccess in
+def L4_loadrd_ap : T_LD_abs_set <"memd", DoubleRegs, 0b1110>;
+
+let accessSize = ByteAccess in
+ def L4_loadalignb_ap : T_LD_abs_set <"memb_fifo", DoubleRegs, 0b0100>;
+
+let accessSize = HalfWordAccess in
+def L4_loadalignh_ap : T_LD_abs_set <"memh_fifo", DoubleRegs, 0b0010>;
+
+// Load - Indirect with long offset
+// Load whose address is written "($src1<<#$src2 + #$src3)": a register
+// shifted by a 2-bit immediate plus an extended 6-bit offset (operand 3).
+// CextOp becomes CextOpcode and MajOp is written to Inst{24-21}.
+let InputType = "imm", addrMode = BaseLongOffset, isExtended = 1,
+opExtentBits = 6, opExtendable = 3 in
+class T_LoadAbsReg <string mnemonic, string CextOp, RegisterClass RC,
+ bits<4> MajOp>
+ : LDInst <(outs RC:$dst), (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3),
+ "$dst = "#mnemonic#"($src1<<#$src2 + #$src3)",
+ [] >, ImmRegShl {
+ bits<5> dst;
+ bits<5> src1;
+ bits<2> src2;
+ bits<6> src3;
+ let CextOpcode = CextOp;
+ // hasNewValue is cleared for DoubleRegs results and set for everything else.
+ let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+ let IClass = 0b1001;
+ let Inst{27-25} = 0b110;
+ let Inst{24-21} = MajOp;
+ let Inst{20-16} = src1;
+ // The shift amount is split across Inst{13} and Inst{7}; the offset is
+ // split across Inst{11-8} and Inst{6-5}.
+ let Inst{13} = src2{1};
+ let Inst{12} = 0b1;
+ let Inst{11-8} = src3{5-2};
+ let Inst{7} = src2{0};
+ let Inst{6-5} = src3{1-0};
+ let Inst{4-0} = dst;
+ }
+
+let accessSize = ByteAccess in {
+ def L4_loadrb_ur : T_LoadAbsReg<"memb", "LDrib", IntRegs, 0b1000>;
+ def L4_loadrub_ur : T_LoadAbsReg<"memub", "LDriub", IntRegs, 0b1001>;
+ def L4_loadalignb_ur : T_LoadAbsReg<"memb_fifo", "LDrib_fifo",
+ DoubleRegs, 0b0100>;
+}
+
+let accessSize = HalfWordAccess in {
+ def L4_loadrh_ur : T_LoadAbsReg<"memh", "LDrih", IntRegs, 0b1010>;
+ def L4_loadruh_ur : T_LoadAbsReg<"memuh", "LDriuh", IntRegs, 0b1011>;
+ def L4_loadbsw2_ur : T_LoadAbsReg<"membh", "LDribh2", IntRegs, 0b0001>;
+ def L4_loadbzw2_ur : T_LoadAbsReg<"memubh", "LDriubh2", IntRegs, 0b0011>;
+ def L4_loadalignh_ur : T_LoadAbsReg<"memh_fifo", "LDrih_fifo",
+ DoubleRegs, 0b0010>;
+}
+
+let accessSize = WordAccess in {
+ def L4_loadri_ur : T_LoadAbsReg<"memw", "LDriw", IntRegs, 0b1100>;
+ def L4_loadbsw4_ur : T_LoadAbsReg<"membh", "LDribh4", DoubleRegs, 0b0111>;
+ def L4_loadbzw4_ur : T_LoadAbsReg<"memubh", "LDriubh4", DoubleRegs, 0b0101>;
+}
+
+let accessSize = DoubleWordAccess in
+def L4_loadrd_ur : T_LoadAbsReg<"memd", "LDrid", DoubleRegs, 0b1110>;
+
+
+//===----------------------------------------------------------------------===//
+// Template classes for the non-predicated load instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+// Non-predicated load with the address written "($src1 + $src2<<#$u2)".
+// RC is the register class of the result and MajOp is written to
+// Inst{23-21}.
+class T_load_rr <string mnemonic, RegisterClass RC, bits<3> MajOp>:
+ LDInst<(outs RC:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2_0Imm:$u2),
+ "$dst = "#mnemonic#"($src1 + $src2<<#$u2)",
+ [], "", V4LDST_tc_ld_SLOT01>, ImmRegShl, AddrModeRel {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<2> u2;
+
+ let IClass = 0b0011;
+
+ let Inst{27-24} = 0b1010;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = src1;
+ let Inst{12-8} = src2;
+ // The 2-bit shift amount is split across Inst{13} and Inst{7}.
+ let Inst{13} = u2{1};
+ let Inst{7} = u2{0};
+ let Inst{4-0} = dst;
+ }
+
+//===----------------------------------------------------------------------===//
+// Template classes for the predicated load instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+// Predicated load with the address written "($src2+$src3<<#$u2)" and $src1
+// as the controlling predicate.
+// isNot - emits "if (!$src1 ..." and sets isPredicatedFalse / Inst{24}.
+// isPredNew - emits the ".new" predicate suffix and sets isPredicatedNew /
+// Inst{25}.
+let isPredicated = 1 in
+class T_pload_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
+ bit isNot, bit isPredNew>:
+ LDInst <(outs RC:$dst),
+ (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2_0Imm:$u2),
+ !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+ ") ")#"$dst = "#mnemonic#"($src2+$src3<<#$u2)",
+ [], "", V4LDST_tc_ld_SLOT01>, AddrModeRel {
+ bits<5> dst;
+ bits<2> src1;
+ bits<5> src2;
+ bits<5> src3;
+ bits<2> u2;
+
+ let isPredicatedFalse = isNot;
+ let isPredicatedNew = isPredNew;
+
+ let IClass = 0b0011;
+
+ let Inst{27-26} = 0b00;
+ let Inst{25} = isPredNew;
+ let Inst{24} = isNot;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = src2;
+ let Inst{12-8} = src3;
+ // The 2-bit shift amount is split across Inst{13} and Inst{7}.
+ let Inst{13} = u2{1};
+ let Inst{7} = u2{0};
+ let Inst{6-5} = src1;
+ let Inst{4-0} = dst;
+ }
+
+//===----------------------------------------------------------------------===//
+// multiclass for load instructions with base + register offset
+// addressing mode
+//===----------------------------------------------------------------------===//
+// For one load mnemonic, defines five records: the plain L4_<NAME>_rr form
+// (marked isPredicable) and the four predicated forms built from T_pload_rr
+// (true/false sense, with and without ".new"). All five share CextOpcode,
+// BaseOpcode (CextOp#_indexed_shl) and InputType = "reg".
+let hasSideEffects = 0, addrMode = BaseRegOffset in
+multiclass ld_idxd_shl <string mnemonic, string CextOp, RegisterClass RC,
+ bits<3> MajOp > {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl,
+ InputType = "reg" in {
+ let isPredicable = 1 in
+ def L4_#NAME#_rr : T_load_rr <mnemonic, RC, MajOp>;
+
+ // Predicated (last two arguments are isNot, isPredNew)
+ def L4_p#NAME#t_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 0>;
+ def L4_p#NAME#f_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 0>;
+
+ // Predicated new
+ def L4_p#NAME#tnew_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 1>;
+ def L4_p#NAME#fnew_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 1>;
+ }
+}
+
+let hasNewValue = 1, accessSize = ByteAccess in {
+ defm loadrb : ld_idxd_shl<"memb", "LDrib", IntRegs, 0b000>;
+ defm loadrub : ld_idxd_shl<"memub", "LDriub", IntRegs, 0b001>;
+}
+
+let hasNewValue = 1, accessSize = HalfWordAccess in {
+ defm loadrh : ld_idxd_shl<"memh", "LDrih", IntRegs, 0b010>;
+ defm loadruh : ld_idxd_shl<"memuh", "LDriuh", IntRegs, 0b011>;
+}
+
+let hasNewValue = 1, accessSize = WordAccess in
+defm loadri : ld_idxd_shl<"memw", "LDriw", IntRegs, 0b100>;
+
+let accessSize = DoubleWordAccess in
+defm loadrd : ld_idxd_shl<"memd", "LDrid", DoubleRegs, 0b110>;
+
+//===----------------------------------------------------------------------===//
+// LD -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ST +
+//===----------------------------------------------------------------------===//
+///
+//===----------------------------------------------------------------------===//
+// Template class for store instructions with Absolute set addressing mode.
+//===----------------------------------------------------------------------===//
+// Store with the assembly form "mnemonic($dst = #$addr) = $src[.h]".
+// BaseOp - prefix of BaseOpcode (suffix "_AbsSet" is appended).
+// RC - register class of the stored value $src.
+// MajOp - written to Inst{23-21}.
+// AccessSz - recorded in accessSize.
+// isHalf - appends ".h" to the source operand in the assembly string.
+// Operand 1 ($addr) is the extended operand.
+let isExtended = 1, opExtendable = 1, opExtentBits = 6,
+ addrMode = AbsoluteSet in
+class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC,
+ bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
+ : STInst<(outs IntRegs:$dst),
+ (ins u6_0Ext:$addr, RC:$src),
+ mnemonic#"($dst = #$addr) = $src"#!if(isHalf, ".h","")>, NewValueRel {
+ bits<5> dst;
+ bits<6> addr;
+ bits<5> src;
+ let accessSize = AccessSz;
+ let BaseOpcode = BaseOp#"_AbsSet";
+
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+
+ let IClass = 0b1010;
+
+ let Inst{27-24} = 0b1011;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = dst;
+ let Inst{13} = 0b0;
+ let Inst{12-8} = src;
+ let Inst{7} = 0b1;
+ let Inst{5-0} = addr;
+ }
+
+def S4_storerb_ap : T_ST_absset <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
+def S4_storerh_ap : T_ST_absset <"memh", "STrih", IntRegs, 0b010,
+ HalfWordAccess>;
+def S4_storeri_ap : T_ST_absset <"memw", "STriw", IntRegs, 0b100, WordAccess>;
+
+let isNVStorable = 0 in {
+ def S4_storerf_ap : T_ST_absset <"memh", "STrif", IntRegs,
+ 0b011, HalfWordAccess, 1>;
+ def S4_storerd_ap : T_ST_absset <"memd", "STrid", DoubleRegs,
+ 0b110, DoubleWordAccess>;
+}
+
+// New-value store with the assembly form
+// "mnemonic($dst = #$addr) = $src.new".
+// BaseOp - prefix of BaseOpcode (suffix "_AbsSet" is appended).
+// MajOp - written to Inst{12-11}.
+// AccessSz - recorded in accessSize.
+// Operand 1 ($addr) is the extended operand; operand 2 ($src) is the
+// new-value operand.
+let opExtendable = 1, isNewValue = 1, isNVStore = 1, opNewValue = 2,
+isExtended = 1, opExtentBits= 6 in
+class T_ST_absset_nv <string mnemonic, string BaseOp, bits<2> MajOp,
+ MemAccessSize AccessSz >
+ : NVInst <(outs IntRegs:$dst),
+ (ins u6_0Ext:$addr, IntRegs:$src),
+ mnemonic#"($dst = #$addr) = $src.new">, NewValueRel {
+ bits<5> dst;
+ bits<6> addr;
+ // The new-value source is encoded in only three bits (Inst{10-8}).
+ bits<3> src;
+ let accessSize = AccessSz;
+ let BaseOpcode = BaseOp#"_AbsSet";
+
+ let IClass = 0b1010;
+
+ let Inst{27-21} = 0b1011101;
+ let Inst{20-16} = dst;
+ // Inst{13} is a fixed zero (as in T_ST_absset); Inst{12-11} hold MajOp.
+ // Written as two disjoint fields so no assignment relies on being
+ // overridden by a later one.
+ let Inst{13} = 0b0;
+ let Inst{12-11} = MajOp;
+ let Inst{10-8} = src;
+ let Inst{7} = 0b1;
+ let Inst{5-0} = addr;
+ }
+
+let mayStore = 1, addrMode = AbsoluteSet in {
+ def S4_storerbnew_ap : T_ST_absset_nv <"memb", "STrib", 0b00, ByteAccess>;
+ def S4_storerhnew_ap : T_ST_absset_nv <"memh", "STrih", 0b01, HalfWordAccess>;
+ def S4_storerinew_ap : T_ST_absset_nv <"memw", "STriw", 0b10, WordAccess>;
+}
+
+// Store with the assembly form
+// "mnemonic($src1<<#$src2 + #$src3) = $src4[.h]".
+// CextOp - becomes CextOpcode and the prefix of BaseOpcode ("_shl").
+// RC - register class of the stored value $src4.
+// MajOp - written to Inst{23-21}.
+// AccessSz - recorded in accessSize.
+// isHalf - appends ".h" to the source operand in the assembly string.
+// Operand 2 ($src3) is the extended operand.
+let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm",
+ addrMode = BaseLongOffset, AddedComplexity = 40 in
+class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC,
+ bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
+ : STInst<(outs),
+ (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, RC:$src4),
+ mnemonic#"($src1<<#$src2 + #$src3) = $src4"#!if(isHalf, ".h",""),
+ []>, ImmRegShl, NewValueRel {
+
+ bits<5> src1;
+ bits<2> src2;
+ bits<6> src3;
+ bits<5> src4;
+
+ let accessSize = AccessSz;
+ let CextOpcode = CextOp;
+ let BaseOpcode = CextOp#"_shl";
+
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+
+ let IClass = 0b1010;
+
+ let Inst{27-24} =0b1101;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = src1;
+ // The 2-bit shift amount is split across Inst{13} and Inst{6}.
+ let Inst{13} = src2{1};
+ let Inst{12-8} = src4;
+ let Inst{7} = 0b1;
+ let Inst{6} = src2{0};
+ let Inst{5-0} = src3;
+}
+
+def S4_storerb_ur : T_StoreAbsReg <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
+def S4_storerh_ur : T_StoreAbsReg <"memh", "STrih", IntRegs, 0b010,
+ HalfWordAccess>;
+def S4_storerf_ur : T_StoreAbsReg <"memh", "STrif", IntRegs, 0b011,
+ HalfWordAccess, 1>;
+def S4_storeri_ur : T_StoreAbsReg <"memw", "STriw", IntRegs, 0b100, WordAccess>;
+def S4_storerd_ur : T_StoreAbsReg <"memd", "STrid", DoubleRegs, 0b110,
+ DoubleWordAccess>;
+
+// New-value store with the assembly form
+// "mnemonic($src1<<#$src2 + #$src3) = $src4.new".
+// CextOp - becomes CextOpcode and the prefix of BaseOpcode ("_shl").
+// MajOp - written to Inst{12-11}.
+// AccessSz - recorded in accessSize.
+// Operand 2 ($src3) is the extended operand; operand 3 ($src4) is the
+// new-value operand.
+let mayStore = 1, isNVStore = 1, isExtended = 1, addrMode = BaseLongOffset,
+ opExtentBits = 6, isNewValue = 1, opNewValue = 3, opExtendable = 2 in
+class T_StoreAbsRegNV <string mnemonic, string CextOp, bits<2> MajOp,
+ MemAccessSize AccessSz>
+ : NVInst <(outs ),
+ (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, IntRegs:$src4),
+ mnemonic#"($src1<<#$src2 + #$src3) = $src4.new">, NewValueRel {
+ bits<5> src1;
+ bits<2> src2;
+ bits<6> src3;
+ // The new-value source is encoded in only three bits (Inst{10-8}).
+ bits<3> src4;
+
+ // Record the access size passed by the def, matching T_StoreAbsReg and
+ // T_ST_absset_nv; previously the AccessSz parameter was accepted but unused.
+ let accessSize = AccessSz;
+ let CextOpcode = CextOp;
+ let BaseOpcode = CextOp#"_shl";
+ let IClass = 0b1010;
+
+ let Inst{27-21} = 0b1101101;
+ let Inst{20-16} = src1;
+ // The 2-bit shift amount is split across Inst{13} and Inst{6}.
+ let Inst{13} = src2{1};
+ let Inst{12-11} = MajOp;
+ let Inst{10-8} = src4;
+ let Inst{7} = 0b1;
+ let Inst{6} = src2{0};
+ let Inst{5-0} = src3;
+ }
+
+// T_StoreAbsRegNV instantiations for byte, halfword and word; the 2-bit
+// argument is the MajOp placed in Inst{12-11}.
+def S4_storerbnew_ur : T_StoreAbsRegNV <"memb", "STrib", 0b00, ByteAccess>;
+def S4_storerhnew_ur : T_StoreAbsRegNV <"memh", "STrih", 0b01, HalfWordAccess>;
+def S4_storerinew_ur : T_StoreAbsRegNV <"memw", "STriw", 0b10, WordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Template classes for the non-predicated store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+// Syntax: mnemonic($Rs + $Ru<<#$u2) = $Rt[.h]
+let isPredicable = 1 in
+class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH>
+ : STInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
+ mnemonic#"($Rs + $Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
+ [],"",V4LDST_tc_st_SLOT01>, ImmRegShl, AddrModeRel {
+
+ // Operand fields: base, index, shift amount, stored value.
+ bits<5> Rs;
+ bits<5> Ru;
+ bits<2> u2;
+ bits<5> Rt;
+
+ // Neither the upper-half store nor the doubleword store can be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+
+ // Fixed opcode bits.
+ let IClass = 0b0011;
+ let Inst{27-24} = 0b1011;
+ let Inst{23-21} = MajOp;
+
+ // Operand bits; the shift amount is split across Inst{13} and Inst{7}.
+ let Inst{20-16} = Rs;
+ let Inst{13} = u2{1};
+ let Inst{12-8} = Ru;
+ let Inst{7} = u2{0};
+ let Inst{4-0} = Rt;
+ }
+
+//===----------------------------------------------------------------------===//
+// Template classes for the predicated store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+// Syntax: if ([!]$Pv[.new]) mnemonic($Rs+$Ru<<#$u2) = $Rt[.h]
+let isPredicated = 1 in
+class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
+ bit isNot, bit isPredNew, bit isH>
+ : STInst <(outs),
+ (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
+ !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
+ ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
+ [], "", V4LDST_tc_st_SLOT01> , AddrModeRel{
+ // Operand fields: predicate, base, index, shift amount, stored value.
+ bits<2> Pv;
+ bits<5> Rs;
+ bits<5> Ru;
+ bits<2> u2;
+ bits<5> Rt;
+
+ // Predicate sense and .new-ness are also mirrored into Inst{24} / Inst{25}.
+ let isPredicatedNew = isPredNew;
+ let isPredicatedFalse = isNot;
+ // Neither the upper-half store nor the doubleword store can be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+
+ // Fixed and template-selected opcode bits.
+ let IClass = 0b0011;
+ let Inst{27-26} = 0b01;
+ let Inst{25} = isPredNew;
+ let Inst{24} = isNot;
+ let Inst{23-21} = MajOp;
+
+ // Operand bits; the shift amount is split across Inst{13} and Inst{7}.
+ let Inst{20-16} = Rs;
+ let Inst{13} = u2{1};
+ let Inst{12-8} = Ru;
+ let Inst{7} = u2{0};
+ let Inst{6-5} = Pv;
+ let Inst{4-0} = Rt;
+ }
+
+//===----------------------------------------------------------------------===//
+// Template classes for the new-value store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+// Syntax: mnemonic($Rs + $Ru<<#$u2) = $Nt.new
+let isNewValue = 1, opNewValue = 3, isPredicable = 1 in
+class T_store_new_rr <string mnemonic, bits<2> MajOp> :
+ NVInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
+ mnemonic#"($Rs + $Ru<<#$u2) = $Nt.new",
+ [],"",V4LDST_tc_st_SLOT0>, ImmRegShl, AddrModeRel {
+
+ // Operand fields; the new-value operand is encoded in three bits.
+ bits<5> Rs;
+ bits<5> Ru;
+ bits<2> u2;
+ bits<3> Nt;
+
+ // Fixed opcode bits.
+ let IClass = 0b0011;
+ let Inst{27-21} = 0b1011101;
+ let Inst{4-3} = MajOp;
+
+ // Operand bits; the shift amount is split across Inst{13} and Inst{7}.
+ let Inst{20-16} = Rs;
+ let Inst{13} = u2{1};
+ let Inst{12-8} = Ru;
+ let Inst{7} = u2{0};
+ let Inst{2-0} = Nt;
+ }
+
+//===----------------------------------------------------------------------===//
+// Template classes for the predicated new-value store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+// Syntax: if ([!]$Pv[.new]) mnemonic($Rs+$Ru<<#$u2) = $Nt.new
+let isNewValue = 1, opNewValue = 4, isPredicated = 1 in
+class T_pstore_new_rr <string mnemonic, bits<2> MajOp, bit isNot, bit isPredNew>
+ : NVInst<(outs),
+ (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
+ !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
+ ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Nt.new",
+ [], "", V4LDST_tc_st_SLOT0>, AddrModeRel {
+ // Operand fields; the new-value operand is encoded in three bits.
+ bits<2> Pv;
+ bits<5> Rs;
+ bits<5> Ru;
+ bits<2> u2;
+ bits<3> Nt;
+
+ let isPredicatedNew = isPredNew;
+ let isPredicatedFalse = isNot;
+
+ // Fixed and template-selected opcode bits.
+ let IClass = 0b0011;
+ let Inst{27-26} = 0b01;
+ let Inst{25} = isPredNew;
+ let Inst{24} = isNot;
+ let Inst{23-21} = 0b101;
+ let Inst{4-3} = MajOp;
+
+ // Operand bits; the shift amount is split across Inst{13} and Inst{7}.
+ let Inst{20-16} = Rs;
+ let Inst{13} = u2{1};
+ let Inst{12-8} = Ru;
+ let Inst{7} = u2{0};
+ let Inst{6-5} = Pv;
+ let Inst{2-0} = Nt;
+ }
+
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with base + register offset addressing
+// mode
+//===----------------------------------------------------------------------===//
+// Emits the plain store plus its four predicated variants (t, f, tnew, fnew)
+// for one mnemonic; all five share CextOpcode and BaseOpcode. The two bit
+// arguments passed to T_pstore_rr before isH are isNot and isPredNew.
+let isNVStorable = 1 in
+multiclass ST_Idxd_shl<string mnemonic, string CextOp, RegisterClass RC,
+ bits<3> MajOp, bit isH = 0> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+ def S4_#NAME#_rr : T_store_rr <mnemonic, RC, MajOp, isH>;
+
+ // Predicated
+ def S4_p#NAME#t_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 0, isH>;
+ def S4_p#NAME#f_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 0, isH>;
+
+ // Predicated new
+ def S4_p#NAME#tnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 1, isH>;
+ def S4_p#NAME#fnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 1, isH>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass for new-value store instructions with base + register offset
+// addressing mode.
+//===----------------------------------------------------------------------===//
+// Emits the new-value store plus its four predicated variants for one
+// mnemonic. RC is accepted but not forwarded: the instantiated classes take
+// only the mnemonic, MajOp and (for predicated forms) isNot / isPredNew.
+let mayStore = 1, isNVStore = 1 in
+multiclass ST_Idxd_shl_nv <string mnemonic, string CextOp, RegisterClass RC,
+ bits<2> MajOp> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+ def S4_#NAME#new_rr : T_store_new_rr<mnemonic, MajOp>;
+
+ // Predicated
+ def S4_p#NAME#newt_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 0>;
+ def S4_p#NAME#newf_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 0>;
+
+ // Predicated new
+ def S4_p#NAME#newtnew_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 1>;
+ def S4_p#NAME#newfnew_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 1>;
+ }
+}
+
+// Base + register offset stores. Byte, halfword and word get both the
+// regular and the new-value families; doubleword and upper-halfword get only
+// the regular family and are marked isNVStorable = 0.
+let addrMode = BaseRegOffset, InputType = "reg", hasSideEffects = 0 in {
+ let accessSize = ByteAccess in
+ defm storerb: ST_Idxd_shl<"memb", "STrib", IntRegs, 0b000>,
+ ST_Idxd_shl_nv<"memb", "STrib", IntRegs, 0b00>;
+
+ let accessSize = HalfWordAccess in
+ defm storerh: ST_Idxd_shl<"memh", "STrih", IntRegs, 0b010>,
+ ST_Idxd_shl_nv<"memh", "STrih", IntRegs, 0b01>;
+
+ let accessSize = WordAccess in
+ defm storeri: ST_Idxd_shl<"memw", "STriw", IntRegs, 0b100>,
+ ST_Idxd_shl_nv<"memw", "STriw", IntRegs, 0b10>;
+
+ let isNVStorable = 0, accessSize = DoubleWordAccess in
+ defm storerd: ST_Idxd_shl<"memd", "STrid", DoubleRegs, 0b110>;
+
+ let isNVStorable = 0, accessSize = HalfWordAccess in
+ defm storerf: ST_Idxd_shl<"memh", "STrif", IntRegs, 0b011, 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// Template classes for stores of an immediate value with base + immediate
+// offset addressing mode
+//===----------------------------------------------------------------------===//
+// Syntax: mnemonic($Rs+#$offset)=#$S8
+// The stored immediate ($S8, operand 2) is the extendable operand.
+let isExtendable = 1, isExtentSigned = 1, opExtendable = 2, opExtentBits = 8,
+ isPredicable = 1 in
+class T_StoreImm <string mnemonic, Operand OffsetOp, bits<2> MajOp >
+ : STInst <(outs ), (ins IntRegs:$Rs, OffsetOp:$offset, s8_0Ext:$S8),
+ mnemonic#"($Rs+#$offset)=#$S8",
+ [], "", V4LDST_tc_st_SLOT01>,
+ ImmRegRel, PredNewRel {
+ bits<5> Rs;
+ bits<8> S8;
+ bits<8> offset;
+ bits<6> offsetBits;
+
+ // Pick the six encoded offset bits from the operand kind: the low bits
+ // implied by the u6_2Imm / u6_1Imm operand kinds are dropped.
+ string OffsetOpStr = !cast<string>(OffsetOp);
+ let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
+ !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
+ /* u6_0Imm */ offset{5-0}));
+
+ // Fixed opcode bits.
+ let IClass = 0b0011;
+ let Inst{27-25} = 0b110;
+ let Inst{22-21} = MajOp;
+
+ // Operand bits; S8 is split across Inst{13} and Inst{6-0}.
+ let Inst{20-16} = Rs;
+ let Inst{13} = S8{7};
+ let Inst{12-7} = offsetBits;
+ let Inst{6-0} = S8{6-0};
+ }
+
+// Syntax: if ([!]$Pv[.new]) mnemonic($Rs+#$offset)=#$S6
+// The stored immediate ($S6, operand 3) is the extendable operand.
+let isExtendable = 1, isExtentSigned = 1, opExtendable = 3, opExtentBits = 6,
+ isPredicated = 1 in
+class T_StoreImm_pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
+ bit isPredNot, bit isPredNew >
+ : STInst <(outs ),
+ (ins PredRegs:$Pv, IntRegs:$Rs, OffsetOp:$offset, s6_0Ext:$S6),
+ !if(isPredNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
+ ") ")#mnemonic#"($Rs+#$offset)=#$S6",
+ [], "", V4LDST_tc_st_SLOT01>,
+ ImmRegRel, PredNewRel {
+ bits<2> Pv;
+ bits<5> Rs;
+ bits<6> S6;
+ bits<8> offset;
+ bits<6> offsetBits;
+
+ // Pick the six encoded offset bits from the operand kind: the low bits
+ // implied by the u6_2Imm / u6_1Imm operand kinds are dropped.
+ string OffsetOpStr = !cast<string>(OffsetOp);
+ let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
+ !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
+ /* u6_0Imm */ offset{5-0}));
+ let isPredicatedFalse = isPredNot;
+ let isPredicatedNew = isPredNew;
+
+ // Fixed and template-selected opcode bits.
+ let IClass = 0b0011;
+ let Inst{27-25} = 0b100;
+ let Inst{24} = isPredNew;
+ let Inst{23} = isPredNot;
+ let Inst{22-21} = MajOp;
+
+ // Operand bits; S6 is split across Inst{13} and Inst{4-0}.
+ let Inst{20-16} = Rs;
+ let Inst{13} = S6{5};
+ let Inst{12-7} = offsetBits;
+ let Inst{6-5} = Pv;
+ let Inst{4-0} = S6{4-0};
+ }
+
+
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with base + immediate offset
+// addressing mode and immediate stored value.
+// mem[bhw](Rs+#u6:[012])=#S8
+// if ([!]Pv[.new]) mem[bhw](Rs+#u6:[012])=#S6
+//===----------------------------------------------------------------------===//
+
+// Emits the predicated store-immediate for one predicate sense (PredNot),
+// in both the plain-predicate (_io) and predicate-new (new_io) forms.
+multiclass ST_Imm_Pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
+ bit PredNot> {
+ def _io : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 0>;
+ // Predicate new
+ def new_io : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 1>;
+}
+
+// Emits the unpredicated store-immediate (_io) plus the true (t) and false
+// (f) predicated families; all share CextOpcode and BaseOpcode.
+multiclass ST_Imm <string mnemonic, string CextOp, Operand OffsetOp,
+ bits<2> MajOp> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_imm in {
+ def _io : T_StoreImm <mnemonic, OffsetOp, MajOp>;
+
+ defm t : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 0>;
+ defm f : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 1>;
+ }
+}
+
+// Store-immediate instructions for byte, halfword and word. The offset
+// operand kind (u6_0Imm / u6_1Imm / u6_2Imm) selects which offset bits the
+// template classes encode.
+let hasSideEffects = 0, addrMode = BaseImmOffset,
+ InputType = "imm" in {
+ let accessSize = ByteAccess in
+ defm S4_storeirb : ST_Imm<"memb", "STrib", u6_0Imm, 0b00>;
+
+ let accessSize = HalfWordAccess in
+ defm S4_storeirh : ST_Imm<"memh", "STrih", u6_1Imm, 0b01>;
+
+ let accessSize = WordAccess in
+ defm S4_storeiri : ST_Imm<"memw", "STriw", u6_2Imm, 0b10>;
+}
+
+//===----------------------------------------------------------------------===//
+// ST -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// NV/ST +
+//===----------------------------------------------------------------------===//
+
+// New-value store with base + immediate offset:
+//   mnemonic($src1+#$src2) = $src3.new
+// The extent width, extent alignment and the slice of $src2 that is encoded
+// are all selected from the mnemonic (memb / memh / memw); any other
+// mnemonic yields 0 for each of them.
+let opNewValue = 2, opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
+class T_store_io_nv <string mnemonic, RegisterClass RC,
+ Operand ImmOp, bits<2>MajOp>
+ : NVInst_V4 <(outs),
+ (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ mnemonic#"($src1+#$src2) = $src3.new",
+ [],"",ST_tc_st_SLOT0> {
+ bits<5> src1;
+ bits<13> src2; // Actual address offset
+ bits<3> src3;
+ bits<11> offsetBits; // Represents offset encoding
+
+ let opExtentBits = !if (!eq(mnemonic, "memb"), 11,
+ !if (!eq(mnemonic, "memh"), 12,
+ !if (!eq(mnemonic, "memw"), 13, 0)));
+
+ let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
+ !if (!eq(mnemonic, "memh"), 1,
+ !if (!eq(mnemonic, "memw"), 2, 0)));
+
+ // Drop the low 0, 1 or 2 bits of the offset for memb, memh, memw.
+ let offsetBits = !if (!eq(mnemonic, "memb"), src2{10-0},
+ !if (!eq(mnemonic, "memh"), src2{11-1},
+ !if (!eq(mnemonic, "memw"), src2{12-2}, 0)));
+
+ let IClass = 0b1010;
+
+ // offsetBits is scattered over Inst{26-25}, Inst{13} and Inst{7-0}.
+ let Inst{27} = 0b0;
+ let Inst{26-25} = offsetBits{10-9};
+ let Inst{24-21} = 0b1101;
+ let Inst{20-16} = src1;
+ let Inst{13} = offsetBits{8};
+ let Inst{12-11} = MajOp;
+ let Inst{10-8} = src3;
+ let Inst{7-0} = offsetBits{7-0};
+ }
+
+// Predicated new-value store with base + immediate offset:
+//   if ([!]$src1[.new]) mnemonic($src2+#$src3) = $src4.new
+// The extent width, extent alignment and the slice of $src3 that is encoded
+// are all selected from the mnemonic (memb / memh / memw); any other
+// mnemonic yields 0 for each of them.
+let opExtendable = 2, opNewValue = 3, isPredicated = 1 in
+class T_pstore_io_nv <string mnemonic, RegisterClass RC, Operand predImmOp,
+ bits<2>MajOp, bit PredNot, bit isPredNew>
+ : NVInst_V4 <(outs),
+ (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC:$src4),
+ !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+ ") ")#mnemonic#"($src2+#$src3) = $src4.new",
+ [],"",V2LDST_tc_st_SLOT0> {
+ bits<2> src1;
+ bits<5> src2;
+ bits<9> src3;
+ bits<3> src4;
+ bits<6> offsetBits; // Represents offset encoding
+
+ let isPredicatedNew = isPredNew;
+ let isPredicatedFalse = PredNot;
+ let opExtentBits = !if (!eq(mnemonic, "memb"), 6,
+ !if (!eq(mnemonic, "memh"), 7,
+ !if (!eq(mnemonic, "memw"), 8, 0)));
+
+ let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
+ !if (!eq(mnemonic, "memh"), 1,
+ !if (!eq(mnemonic, "memw"), 2, 0)));
+
+ // Drop the low 0, 1 or 2 bits of the offset for memb, memh, memw.
+ let offsetBits = !if (!eq(mnemonic, "memb"), src3{5-0},
+ !if (!eq(mnemonic, "memh"), src3{6-1},
+ !if (!eq(mnemonic, "memw"), src3{7-2}, 0)));
+
+ let IClass = 0b0100;
+
+ // offsetBits is split across Inst{13} and Inst{7-3}.
+ let Inst{27} = 0b0;
+ let Inst{26} = PredNot;
+ let Inst{25} = isPredNew;
+ let Inst{24-21} = 0b0101;
+ let Inst{20-16} = src2;
+ let Inst{13} = offsetBits{5};
+ let Inst{12-11} = MajOp;
+ let Inst{10-8} = src4;
+ let Inst{7-3} = offsetBits{4-0};
+ let Inst{2} = 0b0;
+ let Inst{1-0} = src1;
+ }
+
+// multiclass for new-value store instructions with base + immediate offset.
+// Emits the unpredicated form plus the four predicated forms. ImmOp is the
+// offset operand of the unpredicated form; predImmOp is the offset operand
+// of the predicated forms. The two trailing bit arguments of T_pstore_io_nv
+// are PredNot and isPredNew.
+let mayStore = 1, isNVStore = 1, isNewValue = 1, hasSideEffects = 0,
+ isExtendable = 1 in
+multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
+ Operand ImmOp, Operand predImmOp, bits<2> MajOp> {
+
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+ def S2_#NAME#new_io : T_store_io_nv <mnemonic, RC, ImmOp, MajOp>;
+ // Predicated
+ def S2_p#NAME#newt_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 0, 0>;
+ def S2_p#NAME#newf_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 1, 0>;
+ // Predicated new
+ def S4_p#NAME#newtnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
+ MajOp, 0, 1>;
+ def S4_p#NAME#newfnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
+ MajOp, 1, 1>;
+ }
+}
+
+// Base + immediate offset new-value stores for byte, halfword and word.
+// NOTE(review): opExtentAlign is set here for memh/memw and is also assigned
+// inside T_store_io_nv / T_pstore_io_nv from the mnemonic; the values agree.
+let addrMode = BaseImmOffset, InputType = "imm" in {
+ let accessSize = ByteAccess in
+ defm storerb: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
+ u6_0Ext, 0b00>, AddrModeRel;
+
+ let accessSize = HalfWordAccess, opExtentAlign = 1 in
+ defm storerh: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
+ u6_1Ext, 0b01>, AddrModeRel;
+
+ let accessSize = WordAccess, opExtentAlign = 2 in
+ defm storeri: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
+ u6_2Ext, 0b10>, AddrModeRel;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment loads with register offset.
+//===----------------------------------------------------------------------===//
+
+// "membh" post-increment loads with register offset. Only the IntRegs
+// (HalfWordAccess) variant is marked hasNewValue; the DoubleRegs
+// (WordAccess) variant is not.
+let hasNewValue = 1 in
+def L2_loadbsw2_pr : T_load_pr <"membh", IntRegs, 0b0001, HalfWordAccess>;
+
+def L2_loadbsw4_pr : T_load_pr <"membh", DoubleRegs, 0b0111, WordAccess>;
+
+// Post-increment load with a modifier-register increment:
+//   $dst = mnemonic($src2++$src3)
+// $src1 is tied to $dst and $src2 is tied to the updated pointer $_dst_.
+let addrMode = PostInc, hasSideEffects = 0 in
+class T_loadalign_pr <string mnemonic, bits<4> MajOp, MemAccessSize AccessSz>
+ : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$_dst_),
+ (ins DoubleRegs:$src1, IntRegs:$src2, ModRegs:$src3),
+ "$dst = "#mnemonic#"($src2++$src3)", [],
+ "$src1 = $dst, $src2 = $_dst_"> {
+ // Encoded operand fields.
+ bits<5> dst;
+ bits<5> src2;
+ bits<1> src3;
+
+ let accessSize = AccessSz;
+
+ // Fixed opcode bits.
+ let IClass = 0b1001;
+ let Inst{27-25} = 0b110;
+ let Inst{24-21} = MajOp;
+ let Inst{12} = 0b0;
+ let Inst{7} = 0b0;
+
+ // Operand bits.
+ let Inst{20-16} = src2;
+ let Inst{13} = src3;
+ let Inst{4-0} = dst;
+ }
+
+// Byte and halfword "fifo" instantiations of T_loadalign_pr; the 4-bit
+// argument is the MajOp placed in Inst{24-21}.
+def L2_loadalignb_pr : T_loadalign_pr <"memb_fifo", 0b0100, ByteAccess>;
+def L2_loadalignh_pr : T_loadalign_pr <"memh_fifo", 0b0010, HalfWordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated post increment .new stores
+// mem[bhw](Rx++#s4:[012])=Nt.new
+//===----------------------------------------------------------------------===//
+// Syntax: mnemonic($src1++#$offset) = $src2.new
+// $src1 is tied to the updated pointer $_dst_.
+let addrMode = PostInc, hasSideEffects = 0, isNVStore = 1, isNewValue = 1,
+ opNewValue = 3, isPredicable = 1 in
+class T_StorePI_nv <string mnemonic, Operand ImmOp, bits<2> MajOp >
+ : NVInstPI_V4 <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ImmOp:$offset, IntRegs:$src2),
+ mnemonic#"($src1++#$offset) = $src2.new",
+ [], "$src1 = $_dst_">,
+ AddrModeRel {
+ bits<5> src1;
+ bits<3> src2;
+ bits<7> offset;
+ bits<4> offsetBits;
+
+ // Pick the four encoded increment bits from the operand kind: the low bits
+ // implied by the s4_2Imm / s4_1Imm operand kinds are dropped.
+ string ImmOpStr = !cast<string>(ImmOp);
+ let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+ !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+ /* s4_0Imm */ offset{3-0}));
+
+ // Fixed opcode bits.
+ let IClass = 0b1010;
+ let Inst{27-21} = 0b1011101;
+ let Inst{13} = 0b0;
+ let Inst{12-11} = MajOp;
+ let Inst{7} = 0b0;
+ let Inst{1} = 0b0;
+
+ // Operand bits.
+ let Inst{20-16} = src1;
+ let Inst{10-8} = src2;
+ let Inst{6-3} = offsetBits;
+ }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated post increment .new stores
+// if([!]Pv[.new]) mem[bhw](Rx++#s4:[012])=Nt.new
+//===----------------------------------------------------------------------===//
+// Syntax: if ([!]$src1[.new]) mnemonic($src2++#$offset) = $src3.new
+// $src2 is tied to the updated pointer $_dst_.
+let addrMode = PostInc, hasSideEffects = 0, isNVStore = 1, isNewValue = 1,
+ opNewValue = 4, isPredicated = 1 in
+class T_StorePI_nv_pred <string mnemonic, Operand ImmOp,
+ bits<2> MajOp, bit isPredNot, bit isPredNew >
+ : NVInstPI_V4 <(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2,
+ ImmOp:$offset, IntRegs:$src3),
+ !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+ ") ")#mnemonic#"($src2++#$offset) = $src3.new",
+ [], "$src2 = $_dst_">,
+ AddrModeRel {
+ bits<2> src1;
+ bits<5> src2;
+ bits<3> src3;
+ bits<7> offset;
+ bits<4> offsetBits;
+
+ // Pick the four encoded increment bits from the operand kind: the low bits
+ // implied by the s4_2Imm / s4_1Imm operand kinds are dropped.
+ string ImmOpStr = !cast<string>(ImmOp);
+ let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+ !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+ /* s4_0Imm */ offset{3-0}));
+ let isPredicatedFalse = isPredNot;
+ let isPredicatedNew = isPredNew;
+
+ // Fixed and template-selected opcode bits.
+ let IClass = 0b1010;
+ let Inst{27-21} = 0b1011101;
+ let Inst{13} = 0b1;
+ let Inst{12-11} = MajOp;
+ let Inst{7} = isPredNew;
+ let Inst{2} = isPredNot;
+
+ // Operand bits.
+ let Inst{20-16} = src2;
+ let Inst{10-8} = src3;
+ let Inst{6-3} = offsetBits;
+ let Inst{1-0} = src1;
+ }
+
+// Emits the predicated post-increment new-value store for one predicate
+// sense (PredNot), in both the plain-predicate and predicate-new forms.
+multiclass ST_PostInc_Pred_nv<string mnemonic, Operand ImmOp,
+ bits<2> MajOp, bit PredNot> {
+ def _pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 0>;
+
+ // Predicate new
+ def new_pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 1>;
+}
+
+// Emits the unpredicated post-increment new-value store plus the true (t)
+// and false (f) predicated families; all share BaseOpcode "POST_"#BaseOp.
+multiclass ST_PostInc_nv<string mnemonic, string BaseOp, Operand ImmOp,
+ bits<2> MajOp> {
+ let BaseOpcode = "POST_"#BaseOp in {
+ def S2_#NAME#_pi : T_StorePI_nv <mnemonic, ImmOp, MajOp>;
+
+ // Predicated
+ defm S2_p#NAME#t : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 0>;
+ defm S2_p#NAME#f : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 1>;
+ }
+}
+
+// Post-increment new-value stores for byte, halfword and word. The increment
+// operand kind (s4_0Imm / s4_1Imm / s4_2Imm) selects which increment bits
+// the template classes encode.
+let accessSize = ByteAccess in
+defm storerbnew: ST_PostInc_nv <"memb", "STrib", s4_0Imm, 0b00>;
+
+let accessSize = HalfWordAccess in
+defm storerhnew: ST_PostInc_nv <"memh", "STrih", s4_1Imm, 0b01>;
+
+let accessSize = WordAccess in
+defm storerinew: ST_PostInc_nv <"memw", "STriw", s4_2Imm, 0b10>;
+
+//===----------------------------------------------------------------------===//
+// Template class for post increment .new stores with register offset
+//===----------------------------------------------------------------------===//
+// Post-increment new-value store with a modifier-register increment:
+//   mnemonic($src1++$src2) = $src3.new
+// $src1 is tied to the updated pointer $_dst_. MajOp is placed in
+// Inst{12-11}; AccessSz is recorded as the access size.
+let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3 in
+class T_StorePI_RegNV <string mnemonic, bits<2> MajOp, MemAccessSize AccessSz>
+ : NVInstPI_V4 <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ModRegs:$src2, IntRegs:$src3),
+ mnemonic#"($src1++$src2) = $src3.new",
+ [], "$src1 = $_dst_"> {
+ bits<5> src1;
+ bits<1> src2;
+ bits<3> src3; // Only three bits are encoded for the new-value operand.
+ let accessSize = AccessSz;
+
+ let IClass = 0b1010;
+
+ let Inst{27-21} = 0b1101101;
+ let Inst{20-16} = src1;
+ let Inst{13} = src2;
+ let Inst{12-11} = MajOp;
+ let Inst{10-8} = src3;
+ let Inst{7} = 0b0;
+ }
+
+// T_StorePI_RegNV instantiations for byte, halfword and word; the 2-bit
+// argument is the MajOp placed in Inst{12-11}.
+def S2_storerbnew_pr : T_StorePI_RegNV<"memb", 0b00, ByteAccess>;
+def S2_storerhnew_pr : T_StorePI_RegNV<"memh", 0b01, HalfWordAccess>;
+def S2_storerinew_pr : T_StorePI_RegNV<"memw", 0b10, WordAccess>;
+
+// memb(Rx++#s4:0:circ(Mu))=Nt.new
+// memb(Rx++I:circ(Mu))=Nt.new
+// memb(Rx++Mu:brev)=Nt.new
+// memh(Rx++#s4:1:circ(Mu))=Nt.new
+// memh(Rx++I:circ(Mu))=Nt.new
+// memh(Rx++Mu)=Nt.new
+// memh(Rx++Mu:brev)=Nt.new
+
+// memw(Rx++#s4:2:circ(Mu))=Nt.new
+// memw(Rx++I:circ(Mu))=Nt.new
+// memw(Rx++Mu)=Nt.new
+// memw(Rx++Mu:brev)=Nt.new
+
+//===----------------------------------------------------------------------===//
+// NV/ST -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NV/J +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps with the register
+// operands.
+//===----------------------------------------------------------------------===//
+
+// New-value compare-and-jump with two register operands:
+//   if ([!]mnemonic($src1[.new], $src2[.new])) jump:[n]t $offset
+// NvOpNum selects which source is printed with ".new": 0 -> $src1,
+// 1 -> $src2. The new-value register is encoded with its low three bits
+// (Ns); the other register uses all five bits (RegOp).
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
+ opExtentAlign = 2 in
+class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
+ bit isNegCond, bit isTak>
+ : NVInst_V4<(outs),
+ (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
+ "if ("#!if(isNegCond, "!","")#mnemonic#
+ "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
+ "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
+ #!if(isTak, "t","nt")#" $offset", []> {
+
+ bits<5> src1;
+ bits<5> src2;
+ bits<3> Ns; // New-Value Operand
+ bits<5> RegOp; // Non-New-Value Operand
+ bits<11> offset;
+
+ let isTaken = isTak;
+ let isPredicatedFalse = isNegCond;
+ // Only bit 0 of opNewValue is set here, to the index of the .new operand.
+ let opNewValue{0} = NvOpNum;
+
+ let Ns = !if(!eq(NvOpNum, 0), src1{2-0}, src2{2-0});
+ let RegOp = !if(!eq(NvOpNum, 0), src2, src1);
+
+ let IClass = 0b0010;
+ let Inst{27-26} = 0b00;
+ let Inst{25-23} = majOp;
+ let Inst{22} = isNegCond;
+ let Inst{18-16} = Ns;
+ let Inst{13} = isTak;
+ let Inst{12-8} = RegOp;
+ // The two low offset bits are not encoded (opExtentAlign = 2).
+ let Inst{21-20} = offset{10-9};
+ let Inst{7-1} = offset{8-2};
+}
+
+
+// Emits the not-taken (_nt) and taken (_t) hint variants for one condition
+// sense (isNegCond).
+multiclass NVJrr_cond<string mnemonic, bits<3> majOp, bit NvOpNum,
+ bit isNegCond> {
+ // Branch not taken:
+ def _nt: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 0>;
+ // Branch taken:
+ def _t : NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 1>;
+}
+
+// NvOpNum = 0 -> First Operand is a new-value Register
+// NvOpNum = 1 -> Second Operand is a new-value Register
+//
+// Emits the true-condition and false-condition families, each with taken
+// and not-taken variants, under a shared BaseOpcode.
+
+multiclass NVJrr_base<string mnemonic, string BaseOp, bits<3> majOp,
+ bit NvOpNum> {
+ let BaseOpcode = BaseOp#_NVJ in {
+ defm _t_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 0>; // True cond
+ defm _f_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 1>; // False cond
+ }
+}
+
+// if ([!]cmp.eq(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Rt,Ns.new)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Rt,Ns.new)) jump:[n]t #r9:2
+
+// Register-register new-value compare jumps. The "lt"/"ltu" records reuse
+// the cmp.gt/cmp.gtu mnemonics with NvOpNum = 1, i.e. the second operand is
+// the new-value register.
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
+ Defs = [PC], hasSideEffects = 0 in {
+ defm J4_cmpeq : NVJrr_base<"cmp.eq", "CMPEQ", 0b000, 0>, PredRel;
+ defm J4_cmpgt : NVJrr_base<"cmp.gt", "CMPGT", 0b001, 0>, PredRel;
+ defm J4_cmpgtu : NVJrr_base<"cmp.gtu", "CMPGTU", 0b010, 0>, PredRel;
+ defm J4_cmplt : NVJrr_base<"cmp.gt", "CMPLT", 0b011, 1>, PredRel;
+ defm J4_cmpltu : NVJrr_base<"cmp.gtu", "CMPLTU", 0b100, 1>, PredRel;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps instruction
+// with a register and an unsigned immediate (U5) operand.
+//===----------------------------------------------------------------------===//
+
+// New-value compare-and-jump of a .new register against a 5-bit immediate:
+//   if ([!]mnemonic($src1.new, #$src2)) jump:[n]t $offset
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
+ opExtentAlign = 2 in
+class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
+ bit isTak>
+ : NVInst_V4<(outs),
+ (ins IntRegs:$src1, u5_0Imm:$src2, brtarget:$offset),
+ "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
+ #!if(isTak, "t","nt")#" $offset", []> {
+
+ // Branch hint and condition sense; both are also encoded below.
+ let isTaken = isTak;
+ let isPredicatedFalse = isNegCond;
+
+ bits<3> src1; // Only three bits are encoded for the new-value operand.
+ bits<5> src2;
+ bits<11> offset;
+
+ let IClass = 0b0010;
+ let Inst{26} = 0b1;
+ let Inst{25-23} = majOp;
+ let Inst{22} = isNegCond;
+ let Inst{18-16} = src1;
+ let Inst{13} = isTak;
+ let Inst{12-8} = src2;
+ // The two low offset bits are not encoded (opExtentAlign = 2).
+ let Inst{21-20} = offset{10-9};
+ let Inst{7-1} = offset{8-2};
+}
+
+// Emits the not-taken (_nt) and taken (_t) hint variants for one condition
+// sense (isNegCond).
+multiclass NVJri_cond<string mnemonic, bits<3> majOp, bit isNegCond> {
+ // Branch not taken:
+ def _nt: NVJri_template<mnemonic, majOp, isNegCond, 0>;
+ // Branch taken:
+ def _t : NVJri_template<mnemonic, majOp, isNegCond, 1>;
+}
+
+// Emits the true-condition and false-condition families, each with taken
+// and not-taken variants, under a shared BaseOpcode.
+multiclass NVJri_base<string mnemonic, string BaseOp, bits<3> majOp> {
+ let BaseOpcode = BaseOp#_NVJri in {
+ defm _t_jumpnv : NVJri_cond<mnemonic, majOp, 0>; // True Cond
+ defm _f_jumpnv : NVJri_cond<mnemonic, majOp, 1>; // False cond
+ }
+}
+
+// if ([!]cmp.eq(Ns.new,#U5)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,#U5)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Ns.new,#U5)) jump:[n]t #r9:2
+
+// Register-immediate new-value compare jumps (eq, gt, gtu).
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
+ Defs = [PC], hasSideEffects = 0 in {
+ defm J4_cmpeqi : NVJri_base<"cmp.eq", "CMPEQ", 0b000>, PredRel;
+ defm J4_cmpgti : NVJri_base<"cmp.gt", "CMPGT", 0b001>, PredRel;
+ defm J4_cmpgtui : NVJri_base<"cmp.gtu", "CMPGTU", 0b010>, PredRel;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps instruction
+// with a register and a hardcoded 0/-1 immediate value.
+//===----------------------------------------------------------------------===//
+
+// New-value compare-and-jump against a constant immediate:
+//   if ([!]mnemonic($src1.new, #<imm>)) jump:[n]t $offset
+// When ImmVal is "{-1}" the constant is carried as an extra n1Const operand
+// ($n1) and the branch target is operand 2; otherwise ImmVal is printed
+// literally and the branch target is operand 1.
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 11,
+ opExtentAlign = 2 in
+class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
+ bit isNegCond, bit isTak>
+ : NVInst_V4<(outs),
+ !if(!eq(ImmVal, "{-1}"),
+ (ins IntRegs:$src1, n1Const:$n1, brtarget:$offset),
+ (ins IntRegs:$src1, brtarget:$offset)),
+ "if ("#!if(isNegCond, "!","")#mnemonic
+ #"($src1.new, #" # !if(!eq(ImmVal, "{-1}"), "$n1", ImmVal) # ")) jump:"
+ #!if(isTak, "t","nt")#" $offset", []> {
+
+ // Branch hint and condition sense; both are also encoded below.
+ let isTaken = isTak;
+ let isPredicatedFalse = isNegCond;
+ // The extendable operand is the branch target, whose index depends on
+ // whether the $n1 operand is present.
+ let opExtendable = !if(!eq(ImmVal, "{-1}"), 2, 1);
+
+ bits<3> src1; // Only three bits are encoded for the new-value operand.
+ bits<11> offset;
+ let IClass = 0b0010;
+ let Inst{26} = 0b1;
+ let Inst{25-23} = majOp;
+ let Inst{22} = isNegCond;
+ let Inst{18-16} = src1;
+ let Inst{13} = isTak;
+ // The two low offset bits are not encoded (opExtentAlign = 2).
+ let Inst{21-20} = offset{10-9};
+ let Inst{7-1} = offset{8-2};
+}
+
+// Emits the not-taken (_nt) and taken (_t) hint variants for one condition
+// sense (isNegCond).
+multiclass NVJ_ConstImm_cond<string mnemonic, bits<3> majOp, string ImmVal,
+ bit isNegCond> {
+ // Branch not taken:
+ def _nt: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 0>;
+ // Branch taken:
+ def _t : NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 1>;
+}
+
+// Emits the true-condition and false-condition families, each with taken
+// and not-taken variants, under a shared BaseOpcode.
+multiclass NVJ_ConstImm_base<string mnemonic, string BaseOp, bits<3> majOp,
+ string ImmVal> {
+ let BaseOpcode = BaseOp#_NVJ_ConstImm in {
+ defm _t_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 0>; // True
+ defm _f_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 1>; // False
+ }
+}
+
+// if ([!]tstbit(Ns.new,#0)) jump:[n]t #r9:2
+// if ([!]cmp.eq(Ns.new,#-1)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,#-1)) jump:[n]t #r9:2
+
+// Constant-immediate new-value compare jumps. "0" is printed literally in
+// the assembly string; "{-1}" selects the variant with an n1Const operand.
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator=1,
+ Defs = [PC], hasSideEffects = 0 in {
+ defm J4_tstbit0 : NVJ_ConstImm_base<"tstbit", "TSTBIT", 0b011, "0">, PredRel;
+ defm J4_cmpeqn1 : NVJ_ConstImm_base<"cmp.eq", "CMPEQ", 0b100, "{-1}">, PredRel;
+ defm J4_cmpgtn1 : NVJ_ConstImm_base<"cmp.gt", "CMPGT", 0b101, "{-1}">, PredRel;
+}
+
+// J4_hintjumpr: Hint indirect conditional jump.
+// Syntax: hintjr($Rs)
+let hasSideEffects = 0, isBranch = 1, isIndirectBranch = 1 in
+def J4_hintjumpr: JRInst <(outs), (ins IntRegs:$Rs), "hintjr($Rs)"> {
+ // The only operand: the register named in the hint.
+ bits<5> Rs;
+
+ // Fixed opcode bits followed by the register field.
+ let IClass = 0b0101;
+ let Inst{27-21} = 0b0010101;
+ let Inst{20-16} = Rs;
+ }
+
+//===----------------------------------------------------------------------===//
+// NV/J -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CR +
+//===----------------------------------------------------------------------===//
+
+// PC-relative add: $Rd = add(pc, #$u6)
+// The immediate (operand 1) is an unsigned, extendable 6-bit value.
+let Uses = [PC], hasNewValue = 1, hasSideEffects = 0, isExtendable = 1,
+ isExtentSigned = 0, opExtendable = 1, opExtentBits = 6 in
+def C4_addipc : CRInst <(outs IntRegs:$Rd), (ins u6_0Ext:$u6),
+ "$Rd = add(pc, #$u6)", [], "", CR_tc_2_SLOT3 > {
+ // Encoded operand fields.
+ bits<5> Rd;
+ bits<6> u6;
+
+ // Fixed opcode bits.
+ let IClass = 0b0110;
+ let Inst{27-16} = 0b101001001001;
+
+ // Operand bits.
+ let Inst{12-7} = u6;
+ let Inst{4-0} = Rd;
+ }
+
+
+
+// Three-input predicate logic:
+//   $Pd = MnOp1($Ps, MnOp2($Pt, [!]$Pu))
+// OpBits goes into Inst{22-21}; IsNeg goes into Inst{23} and prints "!"
+// before $Pu.
+let hasSideEffects = 0 in
+class T_LOGICAL_3OP<string MnOp1, string MnOp2, bits<2> OpBits, bit IsNeg>
+ : CRInst<(outs PredRegs:$Pd),
+ (ins PredRegs:$Ps, PredRegs:$Pt, PredRegs:$Pu),
+ "$Pd = " # MnOp1 # "($Ps, " # MnOp2 # "($Pt, " #
+ !if (IsNeg,"!","") # "$Pu))",
+ [], "", CR_tc_2early_SLOT23> {
+ // Each predicate register is encoded in two bits.
+ bits<2> Pd;
+ bits<2> Ps;
+ bits<2> Pt;
+ bits<2> Pu;
+
+ // Fixed and template-selected opcode bits.
+ let IClass = 0b0110;
+ let Inst{27-24} = 0b1011;
+ let Inst{23} = IsNeg;
+ let Inst{22-21} = OpBits;
+ let Inst{20} = 0b1;
+ let Inst{13} = 0b0;
+
+ // Operand bits.
+ let Inst{17-16} = Ps;
+ let Inst{9-8} = Pt;
+ let Inst{7-6} = Pu;
+ let Inst{1-0} = Pd;
+}
+
+// All and/or combinations of T_LOGICAL_3OP; the "n" suffixed records pass
+// IsNeg = 1, which negates the last predicate operand in the syntax.
+def C4_and_and : T_LOGICAL_3OP<"and", "and", 0b00, 0>;
+def C4_and_or : T_LOGICAL_3OP<"and", "or", 0b01, 0>;
+def C4_or_and : T_LOGICAL_3OP<"or", "and", 0b10, 0>;
+def C4_or_or : T_LOGICAL_3OP<"or", "or", 0b11, 0>;
+def C4_and_andn : T_LOGICAL_3OP<"and", "and", 0b00, 1>;
+def C4_and_orn : T_LOGICAL_3OP<"and", "or", 0b01, 1>;
+def C4_or_andn : T_LOGICAL_3OP<"or", "and", 0b10, 1>;
+def C4_or_orn : T_LOGICAL_3OP<"or", "or", 0b11, 1>;
+
+//===----------------------------------------------------------------------===//
+// CR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/ALU +
+//===----------------------------------------------------------------------===//
+
+// Logical with-not instructions.
+// NOTE(review): T_ALU64_logical is declared outside this section; the
+// meaning of the trailing 1, 0, 1 arguments should be confirmed there.
+def A4_andnp : T_ALU64_logical<"and", 0b001, 1, 0, 1>;
+def A4_ornp : T_ALU64_logical<"or", 0b011, 1, 0, 1>;
+
+// Syntax: $Rd = parity($Rs, $Rt)
+let hasSideEffects = 0, hasNewValue = 1 in
+def S4_parity: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
+ // Encoded operand fields.
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ // Fixed opcode bits.
+ let IClass = 0b1101;
+ let Inst{27-21} = 0b0101111;
+
+ // Operand bits.
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{4-0} = Rd;
+}
+
+// Add and accumulate.
+// Rd=add(Rs,add(Ru,#s6))
+// The signed 6-bit immediate (operand 3) is the extendable operand.
+let hasNewValue = 1, isExtendable = 1, isExtentSigned = 1, opExtendable = 3,
+ opExtentBits = 6 in
+def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, IntRegs:$Ru, s6_0Ext:$s6),
+ "$Rd = add($Rs, add($Ru, #$s6))" , [],
+ "", ALU64_tc_2_SLOT23> {
+ // Encoded operand fields.
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Ru;
+ bits<6> s6;
+
+ // Fixed opcode bits.
+ let IClass = 0b1101;
+ let Inst{27-23} = 0b10110;
+
+ // Register operands.
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rd;
+ let Inst{4-0} = Ru;
+
+ // The immediate is split across Inst{22-21}, Inst{13} and Inst{7-5}.
+ let Inst{22-21} = s6{5-4};
+ let Inst{13} = s6{3};
+ let Inst{7-5} = s6{2-0};
+ }
+
+// Rd=add(Rs,sub(#s6,Ru))
+// The signed 6-bit immediate (operand 2) is the extendable operand.
+let hasNewValue = 1, hasSideEffects = 0, isExtendable = 1, isExtentSigned = 1,
+ opExtendable = 2, opExtentBits = 6 in
+def S4_subaddi: ALU64Inst <(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, s6_0Ext:$s6, IntRegs:$Ru),
+ "$Rd = add($Rs, sub(#$s6, $Ru))",
+ [], "", ALU64_tc_2_SLOT23> {
+ // Encoded operand fields.
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<6> s6;
+ bits<5> Ru;
+
+ // Fixed opcode bits.
+ let IClass = 0b1101;
+ let Inst{27-23} = 0b10111;
+
+ // Register operands.
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rd;
+ let Inst{4-0} = Ru;
+
+ // The immediate is split across Inst{22-21}, Inst{13} and Inst{7-5}.
+ let Inst{22-21} = s6{5-4};
+ let Inst{13} = s6{3};
+ let Inst{7-5} = s6{2-0};
+ }
+
+def S4_extractp_rp : T_S3op_64 < "extract", 0b11, 0b100, 0>;
+def S4_extractp : T_S2op_extract <"extract", 0b1010, DoubleRegs, u6_0Imm>;
+
+let hasNewValue = 1 in {
+ def S4_extract_rp : T_S3op_extract<"extract", 0b01>;
+ def S4_extract : T_S2op_extract <"extract", 0b1101, IntRegs, u5_0Imm>;
+}
+
+// Complex add/sub halfwords/words
+let Defs = [USR_OVF] in {
+ def S4_vxaddsubh : T_S3op_64 < "vxaddsubh", 0b01, 0b100, 0, 1>;
+ def S4_vxaddsubw : T_S3op_64 < "vxaddsubw", 0b01, 0b000, 0, 1>;
+ def S4_vxsubaddh : T_S3op_64 < "vxsubaddh", 0b01, 0b110, 0, 1>;
+ def S4_vxsubaddw : T_S3op_64 < "vxsubaddw", 0b01, 0b010, 0, 1>;
+}
+
+let Defs = [USR_OVF] in {
+ def S4_vxaddsubhr : T_S3op_64 < "vxaddsubh", 0b11, 0b000, 0, 1, 1, 1>;
+ def S4_vxsubaddhr : T_S3op_64 < "vxsubaddh", 0b11, 0b010, 0, 1, 1, 1>;
+}
+
+let Itinerary = M_tc_3x_SLOT23, Defs = [USR_OVF] in {
+ def M4_mac_up_s1_sat: T_MType_acc_rr<"+= mpy", 0b011, 0b000, 0, [], 0, 1, 1>;
+ def M4_nac_up_s1_sat: T_MType_acc_rr<"-= mpy", 0b011, 0b001, 0, [], 0, 1, 1>;
+}
+
+// Logical xor with xor accumulation.
+// Rxx^=xor(Rss,Rtt)
+let hasSideEffects = 0 in
+def M4_xor_xacc
+ : SInst <(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Rxx ^= xor($Rss, $Rtt)", [],
+ "$dst2 = $Rxx", S_3op_tc_1_SLOT23> {
+ bits<5> Rxx;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1100;
+
+ let Inst{27-22} = 0b101010;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ let Inst{7-5} = 0b000;
+ let Inst{4-0} = Rxx;
+ }
+
+// Rotate and reduce bytes
+// Rdd=vrcrotate(Rss,Rt,#u2)
+let hasSideEffects = 0 in
+def S4_vrcrotate
+ : SInst <(outs DoubleRegs:$Rdd),
+ (ins DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
+ "$Rdd = vrcrotate($Rss, $Rt, #$u2)",
+ [], "", S_3op_tc_3x_SLOT23> {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rt;
+ bits<2> u2;
+
+ let IClass = 0b1100;
+
+ let Inst{27-22} = 0b001111;
+ let Inst{20-16} = Rss;
+ let Inst{13} = u2{1};
+ let Inst{12-8} = Rt;
+ let Inst{7-6} = 0b11;
+ let Inst{5} = u2{0};
+ let Inst{4-0} = Rdd;
+ }
+
+// Rotate and reduce bytes with accumulation
+// Rxx+=vrcrotate(Rss,Rt,#u2)
+let hasSideEffects = 0 in
+def S4_vrcrotate_acc
+ : SInst <(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
+ "$Rxx += vrcrotate($Rss, $Rt, #$u2)", [],
+ "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
+ bits<5> Rxx;
+ bits<5> Rss;
+ bits<5> Rt;
+ bits<2> u2;
+
+ let IClass = 0b1100;
+
+ let Inst{27-21} = 0b1011101;
+ let Inst{20-16} = Rss;
+ let Inst{13} = u2{1};
+ let Inst{12-8} = Rt;
+ let Inst{5} = u2{0};
+ let Inst{4-0} = Rxx;
+ }
+
+// Vector reduce conditional negate halfwords
+let hasSideEffects = 0 in
+def S2_vrcnegh
+ : SInst <(outs DoubleRegs:$Rxx),
+ (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt),
+ "$Rxx += vrcnegh($Rss, $Rt)", [],
+ "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
+ bits<5> Rxx;
+ bits<5> Rss;
+ bits<5> Rt;
+
+ let IClass = 0b1100;
+
+ let Inst{27-21} = 0b1011001;
+ let Inst{20-16} = Rss;
+ let Inst{13} = 0b1;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = 0b111;
+ let Inst{4-0} = Rxx;
+ }
+
+// Split bitfield
+def A4_bitspliti : T_S2op_2_di <"bitsplit", 0b110, 0b100>;
+
+// Arithmetic/Convergent round
+def A4_cround_ri : T_S2op_2_ii <"cround", 0b111, 0b000>;
+
+def A4_round_ri : T_S2op_2_ii <"round", 0b111, 0b100>;
+
+let Defs = [USR_OVF] in
+def A4_round_ri_sat : T_S2op_2_ii <"round", 0b111, 0b110, 1>;
+
+// Logical-logical words.
+// Compound or-and -- Rx=or(Ru,and(Rx,#s10))
+let hasNewValue = 1, isExtendable = 1, opExtendable = 3, opExtentBits = 10,
+ isExtentSigned = 1 in
+def S4_or_andix:
+ ALU64Inst<(outs IntRegs:$Rx),
+ (ins IntRegs:$Ru, IntRegs:$_src_, s10_0Ext:$s10),
+ "$Rx = or($Ru, and($_src_, #$s10))", [],
+ "$_src_ = $Rx", ALU64_tc_2_SLOT23> { // $_src_ is tied to the result $Rx
+ bits<5> Rx;
+ bits<5> Ru;
+ bits<10> s10;
+
+ let IClass = 0b1101;
+ // Encoding fields, most significant first.
+ let Inst{27-22} = 0b101001;
+ let Inst{21} = s10{9}; // top bit of s10
+ let Inst{20-16} = Rx;
+ let Inst{13-5} = s10{8-0}; // low nine bits of s10
+ let Inst{4-0} = Ru;
+ }
+
+// Miscellaneous ALU64 instructions.
+//
+let hasNewValue = 1, hasSideEffects = 0 in
+def A4_modwrapu: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = modwrap($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+ let Inst{27-21} = 0b0011111;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = 0b111;
+ let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0 in
+def A4_bitsplit: ALU64Inst<(outs DoubleRegs:$Rd),
+ (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = bitsplit($Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+ let Inst{27-24} = 0b0100;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0 in
+def dep_S2_packhl: ALU64Inst<(outs DoubleRegs:$Rd),
+ (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = packhl($Rs, $Rt):deprecated", [], "", ALU64_tc_1_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+ let Inst{27-24} = 0b0100;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{4-0} = Rd;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in
+def dep_A2_addsat: ALU64Inst<(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = add($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+ let Inst{27-21} = 0b0101100;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{7} = 0b0;
+ let Inst{4-0} = Rd;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in
+def dep_A2_subsat: ALU64Inst<(outs IntRegs:$Rd), // deprecated saturating sub
+ (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = sub($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+ let Inst{27-21} = 0b0101100; // same value as dep_A2_addsat
+ let Inst{20-16} = Rt; // Rt here and Rs in 12-8: reverse of dep_A2_addsat
+ let Inst{12-8} = Rs;
+ let Inst{7} = 0b1; // dep_A2_addsat sets this bit to 0
+ let Inst{4-0} = Rd;
+}
+
+// Rx[&|]=xor(Rs,Rt)
+def M4_or_xor : T_MType_acc_rr < "|= xor", 0b110, 0b001, 0>;
+def M4_and_xor : T_MType_acc_rr < "&= xor", 0b010, 0b010, 0>;
+
+// Rx[&|^]=or(Rs,Rt)
+def M4_xor_or : T_MType_acc_rr < "^= or", 0b110, 0b011, 0>;
+
+let CextOpcode = "ORr_ORr" in
+def M4_or_or : T_MType_acc_rr < "|= or", 0b110, 0b000, 0>;
+def M4_and_or : T_MType_acc_rr < "&= or", 0b010, 0b001, 0>;
+
+// Rx[&|^]=and(Rs,Rt)
+def M4_xor_and : T_MType_acc_rr < "^= and", 0b110, 0b010, 0>;
+
+let CextOpcode = "ORr_ANDr" in
+def M4_or_and : T_MType_acc_rr < "|= and", 0b010, 0b011, 0>;
+def M4_and_and : T_MType_acc_rr < "&= and", 0b010, 0b000, 0>;
+
+// Rx[&|^]=and(Rs,~Rt)
+def M4_xor_andn : T_MType_acc_rr < "^= and", 0b001, 0b010, 0, [], 1>;
+def M4_or_andn : T_MType_acc_rr < "|= and", 0b001, 0b000, 0, [], 1>;
+def M4_and_andn : T_MType_acc_rr < "&= and", 0b001, 0b001, 0, [], 1>;
+
+// Compound or-and / or-or with immediate: Rx |= <mnemonic>(Rs, #s10).
+let InputType = "imm", hasNewValue = 1, isExtendable = 1, opExtendable = 3,
+ opExtentBits = 10, isExtentSigned = 1 in
+class T_CompOR <string mnemonic, bits<2> MajOp, SDNode OpNode>
+ : MInst_acc <(outs IntRegs:$Rx),
+ (ins IntRegs:$src1, IntRegs:$Rs, s10_0Ext:$s10),
+ "$Rx |= "#mnemonic#"($Rs, #$s10)", [], // OpNode unused: pattern list empty
+ "$src1 = $Rx", ALU64_tc_2_SLOT23>, ImmRegRel {
+ bits<5> Rx;
+ bits<5> Rs;
+ bits<10> s10;
+
+ let IClass = 0b1101;
+ // Encoding fields, most significant first.
+ let Inst{27-24} = 0b1010;
+ let Inst{23-22} = MajOp;
+ let Inst{21} = s10{9}; // top bit of s10
+ let Inst{20-16} = Rs;
+ let Inst{13-5} = s10{8-0}; // low nine bits of s10
+ let Inst{4-0} = Rx;
+ }
+
+let CextOpcode = "ORr_ANDr" in
+def S4_or_andi : T_CompOR <"and", 0b00, and>;
+
+let CextOpcode = "ORr_ORr" in
+def S4_or_ori : T_CompOR <"or", 0b10, or>;
+
+// Modulo wrap
+// Rd=modwrap(Rs,Rt)
+// Round
+// Rd=cround(Rs,#u5)
+// Rd=cround(Rs,Rt)
+// Rd=round(Rs,#u5)[:sat]
+// Rd=round(Rs,Rt)[:sat]
+// Vector reduce add unsigned halfwords
+// Rd=vraddh(Rss,Rtt)
+// Vector add bytes
+// Rdd=vaddb(Rss,Rtt)
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+// Rxx+=vrcnegh(Rss,Rt)
+// Vector maximum bytes
+// Rdd=vmaxb(Rtt,Rss)
+// Vector reduce maximum halfwords
+// Rxx=vrmaxh(Rss,Ru)
+// Rxx=vrmaxuh(Rss,Ru)
+// Vector reduce maximum words
+// Rxx=vrmaxuw(Rss,Ru)
+// Rxx=vrmaxw(Rss,Ru)
+// Vector minimum bytes
+// Rdd=vminb(Rtt,Rss)
+// Vector reduce minimum halfwords
+// Rxx=vrminh(Rss,Ru)
+// Rxx=vrminuh(Rss,Ru)
+// Vector reduce minimum words
+// Rxx=vrminuw(Rss,Ru)
+// Rxx=vrminw(Rss,Ru)
+// Vector subtract bytes
+// Rdd=vsubb(Rss,Rtt)
+
+//===----------------------------------------------------------------------===//
+// XTYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/BIT +
+//===----------------------------------------------------------------------===//
+
+// Bit reverse
+def S2_brevp : T_S2op_3 <"brev", 0b11, 0b110>;
+
+// Bit count
+def S2_ct0p : T_COUNT_LEADING_64<"ct0", 0b111, 0b010>;
+def S2_ct1p : T_COUNT_LEADING_64<"ct1", 0b111, 0b100>;
+def S4_clbpnorm : T_COUNT_LEADING_64<"normamt", 0b011, 0b000>;
+
+let hasSideEffects = 0, hasNewValue = 1 in
+def S4_clbaddi : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s6_0Imm:$s6),
+ "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
+ bits<5> Rs;
+ bits<5> Rd;
+ bits<6> s6;
+ let IClass = 0b1000;
+ let Inst{27-24} = 0b1100;
+ let Inst{23-21} = 0b001;
+ let Inst{20-16} = Rs;
+ let Inst{13-8} = s6;
+ let Inst{7-5} = 0b000;
+ let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in
+def S4_clbpaddi : SInst<(outs IntRegs:$Rd), (ins DoubleRegs:$Rs, s6_0Imm:$s6),
+ "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
+ bits<5> Rs;
+ bits<5> Rd;
+ bits<6> s6;
+ let IClass = 0b1000;
+ let Inst{27-24} = 0b1000;
+ let Inst{23-21} = 0b011;
+ let Inst{20-16} = Rs;
+ let Inst{13-8} = s6;
+ let Inst{7-5} = 0b010;
+ let Inst{4-0} = Rd;
+}
+
+
+// Bit test/set/clear
+def S4_ntstbit_i : T_TEST_BIT_IMM<"!tstbit", 0b001>;
+def S4_ntstbit_r : T_TEST_BIT_REG<"!tstbit", 1>;
+
+def C4_nbitsset : T_TEST_BITS_REG<"!bitsset", 0b01, 1>;
+def C4_nbitsclr : T_TEST_BITS_REG<"!bitsclr", 0b10, 1>;
+def C4_nbitsclri : T_TEST_BITS_IMM<"!bitsclr", 0b10, 1>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/BIT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY +
+//===----------------------------------------------------------------------===//
+
+// Rd=add(#u6,mpyi(Rs,#U6)) -- Multiply by immed and add immed.
+
+let hasNewValue = 1, isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
+def M4_mpyri_addi : MInst<(outs IntRegs:$Rd),
+ (ins u6_0Ext:$u6, IntRegs:$Rs, u6_0Imm:$U6), // only $u6 has the Ext type
+ "$Rd = add(#$u6, mpyi($Rs, #$U6))" , [],"",ALU64_tc_3x_SLOT23> {
+ bits<5> Rd;
+ bits<6> u6;
+ bits<5> Rs;
+ bits<6> U6;
+
+ let IClass = 0b1101;
+
+ let Inst{27-24} = 0b1000;
+ let Inst{23} = U6{5}; // U6 (multiplier) occupies bit 23 and bits 4-0
+ let Inst{22-21} = u6{5-4}; // u6 (addend) occupies bits 22-21, 13 and 7-5
+ let Inst{20-16} = Rs;
+ let Inst{13} = u6{3};
+ let Inst{12-8} = Rd;
+ let Inst{7-5} = u6{2-0};
+ let Inst{4-0} = U6{4-0};
+ }
+
+// Rd=add(#u6,mpyi(Rs,Rt))
+let CextOpcode = "ADD_MPY", InputType = "imm", hasNewValue = 1,
+ isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
+def M4_mpyrr_addi : MInst <(outs IntRegs:$Rd),
+ (ins u6_0Ext:$u6, IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = add(#$u6, mpyi($Rs, $Rt))" , [], "", ALU64_tc_3x_SLOT23>, ImmRegRel {
+ bits<5> Rd;
+ bits<6> u6;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+
+ let Inst{27-23} = 0b01110;
+ let Inst{22-21} = u6{5-4};
+ let Inst{20-16} = Rs;
+ let Inst{13} = u6{3};
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = u6{2-0};
+ let Inst{4-0} = Rd;
+ }
+
+let hasNewValue = 1 in
+class T_AddMpy <bit MajOp, PatLeaf ImmPred, dag ins> // ImmPred is unused below
+ : ALU64Inst <(outs IntRegs:$dst), ins,
+ "$dst = add($src1, mpyi("#!if(MajOp,"$src3, #$src2))", // MajOp=1: reg, imm
+ "#$src2, $src3))"), [], // MajOp=0: imm, reg
+ "", ALU64_tc_3x_SLOT23> {
+ bits<5> dst;
+ bits<5> src1;
+ bits<8> src2;
+ bits<5> src3;
+
+ let IClass = 0b1101;
+
+ bits<6> ImmValue = !if(MajOp, src2{5-0}, src2{7-2}); // MajOp=0 drops 2 LSBs
+
+ let Inst{27-24} = 0b1111;
+ let Inst{23} = MajOp;
+ let Inst{22-21} = ImmValue{5-4}; // ImmValue is split over 22-21, 13 and 7-5
+ let Inst{20-16} = src3;
+ let Inst{13} = ImmValue{3};
+ let Inst{12-8} = dst;
+ let Inst{7-5} = ImmValue{2-0};
+ let Inst{4-0} = src1;
+ }
+
+def M4_mpyri_addr_u2 : T_AddMpy<0b0, u6_2ImmPred,
+ (ins IntRegs:$src1, u6_2Imm:$src2, IntRegs:$src3)>;
+
+let isExtendable = 1, opExtentBits = 6, opExtendable = 3,
+ CextOpcode = "ADD_MPY", InputType = "imm" in
+def M4_mpyri_addr : T_AddMpy<0b1, u32_0ImmPred,
+ (ins IntRegs:$src1, IntRegs:$src3, u6_0Ext:$src2)>, ImmRegRel;
+
+// Multiply-and-add into a tied register: Rx=add(Ru,mpyi(Rx,Rs))
+let hasNewValue = 1, InputType = "reg", CextOpcode = "ADD_MPY" in
+def M4_mpyrr_addr: MInst_acc <(outs IntRegs:$Rx),
+ (ins IntRegs:$Ru, IntRegs:$_src_, IntRegs:$Rs),
+ "$Rx = add($Ru, mpyi($_src_, $Rs))", [],
+ "$_src_ = $Rx", M_tc_3x_SLOT23>, ImmRegRel { // $_src_ is tied to $Rx
+ bits<5> Rx;
+ bits<5> Ru;
+ bits<5> Rs;
+
+ let IClass = 0b1110;
+ // Encoding fields, most significant first.
+ let Inst{27-21} = 0b0011000;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rx;
+ let Inst{4-0} = Ru;
+ }
+
+
+// Vector reduce multiply word by signed half (32x16)
+//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+def M4_vrmpyeh_s0 : T_M2_vmpy<"vrmpyweh", 0b010, 0b100, 0, 0, 0>;
+def M4_vrmpyeh_s1 : T_M2_vmpy<"vrmpyweh", 0b110, 0b100, 1, 0, 0>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def M4_vrmpyoh_s0 : T_M2_vmpy<"vrmpywoh", 0b001, 0b010, 0, 0, 0>;
+def M4_vrmpyoh_s1 : T_M2_vmpy<"vrmpywoh", 0b101, 0b010, 1, 0, 0>;
+
+//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
+def M4_vrmpyeh_acc_s0: T_M2_vmpy_acc<"vrmpyweh", 0b001, 0b110, 0, 0>;
+def M4_vrmpyeh_acc_s1: T_M2_vmpy_acc<"vrmpyweh", 0b101, 0b110, 1, 0>;
+
+//Rdd+=vrmpywoh(Rss,Rtt)[:<<1]
+def M4_vrmpyoh_acc_s0: T_M2_vmpy_acc<"vrmpywoh", 0b011, 0b110, 0, 0>;
+def M4_vrmpyoh_acc_s1: T_M2_vmpy_acc<"vrmpywoh", 0b111, 0b110, 1, 0>;
+
+// Vector multiply halfwords, signed by unsigned
+// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
+def M2_vmpy2su_s0 : T_XTYPE_mpy64 < "vmpyhsu", 0b000, 0b111, 1, 0, 0>;
+def M2_vmpy2su_s1 : T_XTYPE_mpy64 < "vmpyhsu", 0b100, 0b111, 1, 1, 0>;
+
+// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
+def M2_vmac2su_s0 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b011, 0b101, 1, 0, 0>;
+def M2_vmac2su_s1 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b111, 0b101, 1, 1, 0>;
+
+// Vector polynomial multiply halfwords
+// Rdd=vpmpyh(Rs,Rt)
+def M4_vpmpyh : T_XTYPE_mpy64 < "vpmpyh", 0b110, 0b111, 0, 0, 0>;
+
+// Rxx^=vpmpyh(Rs,Rt)
+def M4_vpmpyh_acc : T_XTYPE_mpy64_acc < "vpmpyh", "^", 0b101, 0b111, 0, 0, 0>;
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+def M4_pmpyw : T_XTYPE_mpy64 < "pmpyw", 0b010, 0b111, 0, 0, 0>;
+
+// Rxx^=pmpyw(Rs,Rt)
+def M4_pmpyw_acc : T_XTYPE_mpy64_acc < "pmpyw", "^", 0b001, 0b111, 0, 0, 0>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/Vector compare
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Template class for vector compare
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in
+class T_vcmpImm <string Str, bits<2> cmpOp, bits<2> minOp, Operand ImmOprnd>
+ : ALU64_rr <(outs PredRegs:$Pd), // Pd = Str(Rss, #Imm)
+ (ins DoubleRegs:$Rss, ImmOprnd:$Imm),
+ "$Pd = "#Str#"($Rss, #$Imm)",
+ [], "", ALU64_tc_2early_SLOT23> {
+ bits<2> Pd;
+ bits<5> Rss;
+ bits<32> Imm; // declared 32 bits wide; only bits 7-0 reach the encoding
+ bits<8> ImmBits;
+ let ImmBits{6-0} = Imm{6-0};
+ let ImmBits{7} = !if (!eq(cmpOp,0b10), 0b0, Imm{7}); // 0 for vcmp[bhw].gtu
+
+ let IClass = 0b1101;
+
+ let Inst{27-24} = 0b1100;
+ let Inst{22-21} = cmpOp; // users below pass 0b00 eq, 0b01 gt, 0b10 gtu
+ let Inst{20-16} = Rss;
+ let Inst{12-5} = ImmBits;
+ let Inst{4-3} = minOp; // users below pass 0b00 byte, 0b01 half, 0b10 word
+ let Inst{1-0} = Pd;
+ }
+
+// Vector compare bytes
+def A4_vcmpbgt : T_vcmp <"vcmpb.gt", 0b1010>;
+
+let AsmString = "$Pd = any8(vcmpb.eq($Rss, $Rtt))" in // overrides T_vcmp's string
+def A4_vcmpbeq_any : T_vcmp <"any8(vcmpb.gt", 0b1000>; // NOTE(review): arg says .gt, AsmString says .eq - confirm
+
+def A4_vcmpbeqi : T_vcmpImm <"vcmpb.eq", 0b00, 0b00, u8_0Imm>;
+def A4_vcmpbgti : T_vcmpImm <"vcmpb.gt", 0b01, 0b00, s8_0Imm>;
+def A4_vcmpbgtui : T_vcmpImm <"vcmpb.gtu", 0b10, 0b00, u7_0Imm>;
+
+// Vector compare halfwords
+def A4_vcmpheqi : T_vcmpImm <"vcmph.eq", 0b00, 0b01, s8_0Imm>;
+def A4_vcmphgti : T_vcmpImm <"vcmph.gt", 0b01, 0b01, s8_0Imm>;
+def A4_vcmphgtui : T_vcmpImm <"vcmph.gtu", 0b10, 0b01, u7_0Imm>;
+
+// Vector compare words
+def A4_vcmpweqi : T_vcmpImm <"vcmpw.eq", 0b00, 0b10, s8_0Imm>;
+def A4_vcmpwgti : T_vcmpImm <"vcmpw.gt", 0b01, 0b10, s8_0Imm>;
+def A4_vcmpwgtui : T_vcmpImm <"vcmpw.gtu", 0b10, 0b10, u7_0Imm>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/SHIFT +
+//===----------------------------------------------------------------------===//
+// Shift by immediate and accumulate/logical.
+// Rx=add(#u8,asl(Rx,#U5)) Rx=add(#u8,lsr(Rx,#U5))
+// Rx=sub(#u8,asl(Rx,#U5)) Rx=sub(#u8,lsr(Rx,#U5))
+// Rx=and(#u8,asl(Rx,#U5)) Rx=and(#u8,lsr(Rx,#U5))
+// Rx=or(#u8,asl(Rx,#U5)) Rx=or(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+ hasNewValue = 1, opNewValue = 0 in
+class T_S4_ShiftOperate<string MnOp, string MnSh, bit asl_lsr,
+ bits<2> MajOp, InstrItinClass Itin>
+ : MInst_acc<(outs IntRegs:$Rd), (ins u8_0Ext:$u8, IntRegs:$Rx, u5_0Imm:$U5),
+ "$Rd = "#MnOp#"(#$u8, "#MnSh#"($Rx, #$U5))", // MnOp outer op, MnSh shift
+ [], "$Rd = $Rx", Itin> { // $Rx is tied to $Rd
+
+ bits<5> Rd;
+ bits<8> u8;
+ bits<5> Rx; // not placed in Inst; only Rd is encoded (bits 20-16)
+ bits<5> U5;
+
+ let IClass = 0b1101;
+ let Inst{27-24} = 0b1110;
+ let Inst{23-21} = u8{7-5}; // u8 is split across bits 23-21, 13, 7-5 and 3
+ let Inst{20-16} = Rd;
+ let Inst{13} = u8{4};
+ let Inst{12-8} = U5;
+ let Inst{7-5} = u8{3-1};
+ let Inst{4} = asl_lsr; // T_ShiftOperate passes 0 for asl, 1 for lsr
+ let Inst{3} = u8{0};
+ let Inst{2-1} = MajOp;
+}
+
+multiclass T_ShiftOperate<string mnemonic, bits<2> MajOp, InstrItinClass Itin> {
+ def _asl_ri : T_S4_ShiftOperate<mnemonic, "asl", 0, MajOp, Itin>;
+ def _lsr_ri : T_S4_ShiftOperate<mnemonic, "lsr", 1, MajOp, Itin>;
+}
+
+defm S4_addi : T_ShiftOperate<"add", 0b10, ALU64_tc_2_SLOT23>;
+defm S4_andi : T_ShiftOperate<"and", 0b00, ALU64_tc_2_SLOT23>;
+defm S4_ori : T_ShiftOperate<"or", 0b01, ALU64_tc_1_SLOT23>;
+defm S4_subi : T_ShiftOperate<"sub", 0b11, ALU64_tc_1_SLOT23>;
+
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
+def S2_vcnegh : T_S3op_shiftVect < "vcnegh", 0b11, 0b01>;
+
+// Rd=[cround|round](Rs,Rt)
+let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in {
+ def A4_cround_rr : T_S3op_3 < "cround", IntRegs, 0b11, 0b00>;
+ def A4_round_rr : T_S3op_3 < "round", IntRegs, 0b11, 0b10>;
+}
+
+// Rd=round(Rs,Rt):sat
+let hasNewValue = 1, Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
+def A4_round_rr_sat : T_S3op_3 < "round", IntRegs, 0b11, 0b11, 1>;
+
+// Rd=[cmpyiwh|cmpyrwh](Rss,Rt):<<1:rnd:sat
+let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
+ def M4_cmpyi_wh : T_S3op_8<"cmpyiwh", 0b100, 1, 1, 1>;
+ def M4_cmpyr_wh : T_S3op_8<"cmpyrwh", 0b110, 1, 1, 1>;
+}
+
+// Rdd=[add|sub](Rss,Rtt,Px):carry -- Px is both the carry input and an output.
+let isPredicateLate = 1, hasSideEffects = 0 in
+class T_S3op_carry <string mnemonic, bits<3> MajOp>
+ : SInst < (outs DoubleRegs:$Rdd, PredRegs:$Px),
+ (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
+ "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu):carry",
+ [], "$Px = $Pu", S_3op_tc_1_SLOT23 > { // output $Px is tied to input $Pu
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<5> Rtt;
+ bits<2> Pu; // the only predicate field encoded; Px shares it via the tie
+
+ let IClass = 0b1100;
+
+ let Inst{27-24} = 0b0010;
+ let Inst{23-21} = MajOp; // 0b110 for A4_addp_c, 0b111 for A4_subp_c
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ let Inst{6-5} = Pu;
+ let Inst{4-0} = Rdd;
+ }
+
+def A4_addp_c : T_S3op_carry < "add", 0b110 >;
+def A4_subp_c : T_S3op_carry < "sub", 0b111 >;
+
+let Itinerary = S_3op_tc_3_SLOT23, hasSideEffects = 0 in // no Itin arg to SInst
+class T_S3op_6 <string mnemonic, bits<3> MinOp, bit isUnsigned>
+ : SInst <(outs DoubleRegs:$Rxx), // Rxx = mnemonic(Rss, Ru)
+ (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Ru),
+ "$Rxx = "#mnemonic#"($Rss, $Ru)" ,
+ [] , "$dst2 = $Rxx"> { // $dst2 is tied to the result $Rxx
+ bits<5> Rxx;
+ bits<5> Rss;
+ bits<5> Ru;
+
+ let IClass = 0b1100;
+
+ let Inst{27-21} = 0b1011001;
+ let Inst{20-16} = Rss;
+ let Inst{13} = isUnsigned; // 1 for the vrmax/vrmin "u" variants below
+ let Inst{12-8} = Rxx; // note: Rxx sits in 12-8 and Ru in 4-0
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Ru;
+ }
+
+// Vector reduce maximum halfwords
+// Rxx=vrmax[u]h(Rss,Ru)
+def A4_vrmaxh : T_S3op_6 < "vrmaxh", 0b001, 0>;
+def A4_vrmaxuh : T_S3op_6 < "vrmaxuh", 0b001, 1>;
+
+// Vector reduce maximum words
+// Rxx=vrmax[u]w(Rss,Ru)
+def A4_vrmaxw : T_S3op_6 < "vrmaxw", 0b010, 0>;
+def A4_vrmaxuw : T_S3op_6 < "vrmaxuw", 0b010, 1>;
+
+// Vector reduce minimum halfwords
+// Rxx=vrmin[u]h(Rss,Ru)
+def A4_vrminh : T_S3op_6 < "vrminh", 0b101, 0>;
+def A4_vrminuh : T_S3op_6 < "vrminuh", 0b101, 1>;
+
+// Vector reduce minimum words
+// Rxx=vrmin[u]w(Rss,Ru)
+def A4_vrminw : T_S3op_6 < "vrminw", 0b110, 0>;
+def A4_vrminuw : T_S3op_6 < "vrminuw", 0b110, 1>;
+
+// Rd=lsl(#s6,Rt): shift an immediate left by a register amount.
+let hasSideEffects = 0, hasNewValue = 1 in
+def S4_lsli: SInst <(outs IntRegs:$Rd), (ins s6_0Imm:$s6, IntRegs:$Rt),
+ "$Rd = lsl(#$s6, $Rt)", [], "", S_3op_tc_1_SLOT23> {
+ bits<5> Rd;
+ bits<6> s6;
+ bits<5> Rt;
+
+ let IClass = 0b1100;
+ // Encoding fields, most significant first.
+ let Inst{27-22} = 0b011010;
+ let Inst{20-16} = s6{5-1}; // upper five bits of s6
+ let Inst{12-8} = Rt;
+ let Inst{7-6} = 0b11;
+ let Inst{5} = s6{0}; // lowest bit of s6
+ let Inst{4-0} = Rd;
+ }
+
+//===----------------------------------------------------------------------===//
+// XTYPE/SHIFT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MEMOP
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// Template class for MemOp instructions with the register value.
+//===----------------------------------------------------------------------===//
+class MemOp_rr_base <string opc, bits<2> opcBits, Operand ImmOp,
+ string memOp, bits<2> memOpBits> :
+ MEMInst_V4<(outs),
+ (ins IntRegs:$base, ImmOp:$offset, IntRegs:$delta),
+ opc#"($base+#$offset)"#memOp#"$delta", // e.g. memw($base+#$offset) += $delta
+ []>,
+ Requires<[UseMEMOP]> {
+
+ bits<5> base;
+ bits<5> delta;
+ bits<32> offset;
+ bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
+
+ let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
+ !if (!eq(opcBits, 0b01), offset{6-1},
+ !if (!eq(opcBits, 0b10), offset{7-2},0))); // any other opcBits: 0
+
+ let opExtentAlign = opcBits; // equals the count of low offset bits dropped
+ let IClass = 0b0011;
+ let Inst{27-24} = 0b1110; // MemOp_ri_base (immediate delta) uses 0b1111
+ let Inst{22-21} = opcBits;
+ let Inst{20-16} = base;
+ let Inst{13} = 0b0;
+ let Inst{12-7} = offsetBits;
+ let Inst{6-5} = memOpBits; // MemOp_rr passes 00 add, 01 sub, 10 and, 11 or
+ let Inst{4-0} = delta;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for MemOp instructions with the immediate value.
+//===----------------------------------------------------------------------===//
+class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
+ string memOp, bits<2> memOpBits> :
+ MEMInst_V4 <(outs),
+ (ins IntRegs:$base, ImmOp:$offset, u5_0Imm:$delta),
+ opc#"($base+#$offset)"#memOp#"#$delta"
+ #!if(memOpBits{1},")", ""), // clrbit, setbit - include ')'
+ []>,
+ Requires<[UseMEMOP]> {
+
+ bits<5> base;
+ bits<5> delta; // immediate (u5_0Imm), unlike the register in MemOp_rr_base
+ bits<32> offset;
+ bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
+
+ let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
+ !if (!eq(opcBits, 0b01), offset{6-1},
+ !if (!eq(opcBits, 0b10), offset{7-2},0))); // any other opcBits: 0
+
+ let opExtentAlign = opcBits; // equals the count of low offset bits dropped
+ let IClass = 0b0011;
+ let Inst{27-24} = 0b1111; // MemOp_rr_base (register delta) uses 0b1110
+ let Inst{22-21} = opcBits;
+ let Inst{20-16} = base;
+ let Inst{13} = 0b0;
+ let Inst{12-7} = offsetBits;
+ let Inst{6-5} = memOpBits; // bit 1 set means the clrbit/setbit spelling
+ let Inst{4-0} = delta;
+}
+
+// multiclass to define MemOp instructions with register operand.
+multiclass MemOp_rr<string opc, bits<2> opcBits, Operand ImmOp> {
+ def L4_add#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " += ", 0b00>; // add
+ def L4_sub#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " -= ", 0b01>; // sub
+ def L4_and#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " &= ", 0b10>; // and
+ def L4_or#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " |= ", 0b11>; // or
+}
+
+// multiclass to define MemOp instructions with immediate operand.
+multiclass MemOp_ri<string opc, bits<2> opcBits, Operand ImmOp> {
+ def L4_iadd#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " += ", 0b00 >; // add
+ def L4_isub#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " -= ", 0b01 >; // sub
+ def L4_iand#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = clrbit(", 0b10>; // and, printed as clrbit
+ def L4_ior#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = setbit(", 0b11>; // or, printed as setbit
+}
+
+multiclass MemOp_base <string opc, bits<2> opcBits, Operand ImmOp> {
+ defm _#NAME : MemOp_rr <opc, opcBits, ImmOp>;
+ defm _#NAME : MemOp_ri <opc, opcBits, ImmOp>;
+}
+
+// Define MemOp instructions.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0 in {
+ let opExtentBits = 6, accessSize = ByteAccess in
+ defm memopb_io : MemOp_base <"memb", 0b00, u6_0Ext>;
+
+ let opExtentBits = 7, accessSize = HalfWordAccess in
+ defm memoph_io : MemOp_base <"memh", 0b01, u6_1Ext>;
+
+ let opExtentBits = 8, accessSize = WordAccess in
+ defm memopw_io : MemOp_base <"memw", 0b10, u6_2Ext>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PRED +
+//===----------------------------------------------------------------------===//
+
+// Hexagon V4 only supports these flavors of byte/half compare instructions:
+// EQ/GT/GTU. Other flavors like GE/GEU/LT/LTU/LE/LEU are not supported by
+// the hardware. However, the compiler can still implement them by combining
+// the patterns that are currently implemented.
+// The implemented patterns are: EQ/GT/GTU.
+// Missing patterns are: GE/GEU/LT/LTU/LE/LEU.
+
+// The following instruction is not extended because doing so results in
+// incorrect code for negative numbers.
+// Pd=cmpb.eq(Rs,#u8)
+
+// p=!cmp.eq(r1,#s10)
+def C4_cmpneqi : T_CMP <"cmp.eq", 0b00, 1, s10_0Ext>;
+def C4_cmpltei : T_CMP <"cmp.gt", 0b01, 1, s10_0Ext>;
+def C4_cmplteui : T_CMP <"cmp.gtu", 0b10, 1, u9_0Ext>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PRED -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Class and multiclasses for DeallocReturn
+//===----------------------------------------------------------------------===//
+class L4_RETURN<string mnemonic, bit isNot, bit isPredNew, bit isTak>
+ : LD0Inst<(outs), (ins PredRegs:$src), // predicated form of mnemonic
+ !if(isNot, "if (!$src", "if ($src")#
+ !if(isPredNew, ".new) ", ") ")#mnemonic#
+ !if(isPredNew, #!if(isTak,":t", ":nt"),""), // hint printed only for .new
+ [], "", LD_tc_3or4stall_SLOT0> {
+
+ bits<2> src;
+ let BaseOpcode = "L4_RETURN";
+ let isPredicatedFalse = isNot;
+ let isPredicatedNew = isPredNew;
+ let isTaken = isTak;
+ let IClass = 0b1001;
+
+ let Inst{27-16} = 0b011000011110;
+
+ let Inst{13} = isNot;
+ let Inst{12} = isTak; // encoded even when the hint is not printed
+ let Inst{11} = isPredNew;
+ let Inst{10} = 0b0;
+ let Inst{9-8} = src;
+ let Inst{4-0} = 0b11110;
+ }
+
+// For one predicate sense (PredNot), produce the p, p.new:nt and p.new:t forms.
+multiclass L4_RETURN_PRED<string mnemonic, bit PredNot> {
+ let isPredicated = 1 in {
+ def _#NAME# : L4_RETURN <mnemonic, PredNot, 0, 1>; // plain form; isTak=1, no hint printed
+ def _#NAME#new_pnt : L4_RETURN <mnemonic, PredNot, 1, 0>; // .new, :nt
+ def _#NAME#new_pt : L4_RETURN <mnemonic, PredNot, 1, 1>; // .new, :t
+ }
+}
+
+multiclass LD_MISC_L4_RETURN<string mnemonic> { // unpredicated def plus t/f sets
+ let isBarrier = 1, isPredicable = 1 in
+ def NAME : LD0Inst <(outs), (ins), mnemonic, [], "", // unpredicated form
+ LD_tc_3or4stall_SLOT0> {
+ let BaseOpcode = "L4_RETURN"; // same BaseOpcode as class L4_RETURN
+ let IClass = 0b1001;
+ let Inst{27-16} = 0b011000011110;
+ let Inst{13-10} = 0b0000; // L4_RETURN puts isNot/isTak/isPredNew/0 here
+ let Inst{4-0} = 0b11110;
+ }
+ defm t : L4_RETURN_PRED<mnemonic, 0 >; // PredNot = 0: "if (p)" forms
+ defm f : L4_RETURN_PRED<mnemonic, 1 >; // PredNot = 1: "if (!p)" forms
+}
+
+let isReturn = 1, isTerminator = 1,
+ Defs = [R29, R30, R31, PC], Uses = [R30], hasSideEffects = 0 in
+defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel;
+
+// Restore registers and dealloc return function call.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+ Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
+ def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
+
+ let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_RET_JMP_V4_PIC : T_JMP<"">;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC : T_JMP<"">;
+ }
+}
+
+// Restore registers and dealloc frame before a tail call.
+let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<0, "">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC : T_Call<0, "">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
+ }
+}
+
+// Save registers function call.
+let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
+ def SAVE_REGISTERS_CALL_V4 : T_Call<0, "">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4_EXT : T_Call<0, "">, PredRel;
+
+ let Defs = [P0] in
+ def SAVE_REGISTERS_CALL_V4STK : T_Call<0, "">, PredRel;
+
+ let Defs = [P0], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4STK_EXT : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28] in
+ def SAVE_REGISTERS_CALL_V4_PIC : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28, P0] in
+ def SAVE_REGISTERS_CALL_V4STK_PIC : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28, P0], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<0, "">, PredRel;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for non predicated store instructions with
+// GP-Relative or absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicable = 1 in
+class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
+ bits<2>MajOp, bit isAbs, bit isHalf>
+ : STInst<(outs), (ins ImmOp:$addr, RC:$src),
+ mnemonic # "(#$addr) = $src"#!if(isHalf, ".h",""), // ".h" suffix when isHalf
+ [], "", V2LDST_tc_st_SLOT01> {
+ bits<19> addr; // 16 encoded bits plus up to 3 dropped low bits
+ bits<5> src;
+ bits<16> offsetBits;
+
+ string ImmOpStr = !cast<string>(ImmOp); // operand name selects the scaling
+ let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
+ !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
+ !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
+ /* u16_0Imm */ addr{15-0}))); // also taken for any other name
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+ let Uses = !if (isAbs, [], [GP]); // GP is listed as a use only when !isAbs
+
+ let IClass = 0b0100;
+ let Inst{27} = 1;
+ let Inst{26-25} = offsetBits{15-14}; // offsetBits: 26-25, 20-16, 13 and 7-0
+ let Inst{24} = 0b0;
+ let Inst{23-22} = MajOp;
+ let Inst{21} = isHalf;
+ let Inst{20-16} = offsetBits{13-9};
+ let Inst{13} = offsetBits{8};
+ let Inst{12-8} = src;
+ let Inst{7-0} = offsetBits{7-0};
+ }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated store instructions with
+// GP-Relative or absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicated = 1, opExtentBits = 6, opExtendable = 1 in
+class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp,
+ bit isHalf, bit isNot, bit isNew>
+ : STInst<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, RC: $src2),
+ !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
+ ") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""), // ".h" if isHalf
+ [], "", ST_tc_st_SLOT01>, AddrModeRel {
+ bits<2> src1;
+ bits<6> absaddr; // only six address bits are encoded in this word
+ bits<5> src2;
+
+ let isPredicatedNew = isNew;
+ let isPredicatedFalse = isNot;
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+
+ let IClass = 0b1010;
+
+ let Inst{27-24} = 0b1111;
+ let Inst{23-22} = MajOp;
+ let Inst{21} = isHalf;
+ let Inst{17-16} = absaddr{5-4}; // absaddr is split across bits 17-16 and 6-3
+ let Inst{13} = isNew;
+ let Inst{12-8} = src2;
+ let Inst{7} = 0b1;
+ let Inst{6-3} = absaddr{3-0};
+ let Inst{2} = isNot;
+ let Inst{1-0} = src1;
+ }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated store instructions with absolute addressing.
+//===----------------------------------------------------------------------===//
+class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
+ bits<2> MajOp, bit isHalf>
+ : T_StoreAbsGP <mnemonic, RC, u32_0MustExt, MajOp, 1, isHalf>,
+ AddrModeRel {
+ string ImmOpStr = !cast<string>(ImmOp);
+ let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
+ !if (!eq(ImmOpStr, "u16_2Imm"), 18,
+ !if (!eq(ImmOpStr, "u16_1Imm"), 17,
+ /* u16_0Imm */ 16)));
+
+ let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
+ !if (!eq(ImmOpStr, "u16_2Imm"), 2,
+ !if (!eq(ImmOpStr, "u16_1Imm"), 1,
+ /* u16_0Imm */ 0)));
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass for store instructions with absolute addressing.
+//===----------------------------------------------------------------------===//
+let addrMode = Absolute, isExtended = 1 in
+multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC,
+ Operand ImmOp, bits<2> MajOp, bit isHalf = 0> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+ let opExtendable = 0, isPredicable = 1 in
+ def PS_#NAME#abs : T_StoreAbs <mnemonic, RC, ImmOp, MajOp, isHalf>;
+
+ // Predicated
+ def S4_p#NAME#t_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 0>;
+ def S4_p#NAME#f_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 0>;
+
+ // .new Predicated
+ def S4_p#NAME#tnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 1>;
+ def S4_p#NAME#fnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 1>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for non predicated new-value store instructions with
+// GP-Relative or absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicable = 1, mayStore = 1, isNVStore = 1,
+ isNewValue = 1, opNewValue = 1 in
+class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp>
+ : NVInst_V4<(outs), (ins ImmOp:$addr, IntRegs:$src),
+ mnemonic #"(#$addr) = $src.new",
+ [], "", V2LDST_tc_st_SLOT0> {
+ bits<19> addr;
+ bits<3> src;
+ bits<16> offsetBits;
+
+ string ImmOpStr = !cast<string>(ImmOp);
+ let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
+ !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
+ !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
+ /* u16_0Imm */ addr{15-0})));
+ let IClass = 0b0100;
+
+ let Inst{27} = 1;
+ let Inst{26-25} = offsetBits{15-14};
+ let Inst{24-21} = 0b0101;
+ let Inst{20-16} = offsetBits{13-9};
+ let Inst{13} = offsetBits{8};
+ let Inst{12-11} = MajOp;
+ let Inst{10-8} = src;
+ let Inst{7-0} = offsetBits{7-0};
+ }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated new-value store instructions with
+// absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicated = 1, mayStore = 1, isNVStore = 1,
+ isNewValue = 1, opNewValue = 2, opExtentBits = 6, opExtendable = 1 in
+class T_StoreAbs_NV_Pred <string mnemonic, bits<2> MajOp, bit isNot, bit isNew>
+ : NVInst_V4<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, IntRegs:$src2),
+ !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
+ ") ")#mnemonic#"(#$absaddr) = $src2.new",
+ [], "", ST_tc_st_SLOT0>, AddrModeRel {
+ bits<2> src1;
+ bits<6> absaddr;
+ bits<3> src2;
+
+ let isPredicatedNew = isNew;
+ let isPredicatedFalse = isNot;
+
+ let IClass = 0b1010;
+
+ let Inst{27-24} = 0b1111;
+ let Inst{23-21} = 0b101;
+ let Inst{17-16} = absaddr{5-4};
+ let Inst{13} = isNew;
+ let Inst{12-11} = MajOp;
+ let Inst{10-8} = src2;
+ let Inst{7} = 0b1;
+ let Inst{6-3} = absaddr{3-0};
+ let Inst{2} = isNot;
+ let Inst{1-0} = src1;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated new-value store instructions with
+// absolute addressing.
+//===----------------------------------------------------------------------===//
+class T_StoreAbs_NV <string mnemonic, Operand ImmOp, bits<2> MajOp>
+ : T_StoreAbsGP_NV <mnemonic, u32_0MustExt, MajOp>, AddrModeRel {
+
+ string ImmOpStr = !cast<string>(ImmOp);
+ let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
+ !if (!eq(ImmOpStr, "u16_2Imm"), 18,
+ !if (!eq(ImmOpStr, "u16_1Imm"), 17,
+ /* u16_0Imm */ 16)));
+
+ let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
+ !if (!eq(ImmOpStr, "u16_2Imm"), 2,
+ !if (!eq(ImmOpStr, "u16_1Imm"), 1,
+ /* u16_0Imm */ 0)));
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass for new-value store instructions with absolute addressing.
+//===----------------------------------------------------------------------===//
+let addrMode = Absolute, isExtended = 1 in
+multiclass ST_Abs_NV <string mnemonic, string CextOp, Operand ImmOp,
+ bits<2> MajOp> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+ let opExtendable = 0, isPredicable = 1 in
+ def PS_#NAME#newabs : T_StoreAbs_NV <mnemonic, ImmOp, MajOp>;
+
+ // Predicated
+ def S4_p#NAME#newt_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 0>;
+ def S4_p#NAME#newf_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 0>;
+
+ // .new Predicated
+ def S4_p#NAME#newtnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 1>;
+ def S4_p#NAME#newfnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 1>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Stores with absolute addressing
+//===----------------------------------------------------------------------===//
+let accessSize = ByteAccess in
+defm storerb : ST_Abs <"memb", "STrib", IntRegs, u16_0Imm, 0b00>,
+ ST_Abs_NV <"memb", "STrib", u16_0Imm, 0b00>;
+
+let accessSize = HalfWordAccess in
+defm storerh : ST_Abs <"memh", "STrih", IntRegs, u16_1Imm, 0b01>,
+ ST_Abs_NV <"memh", "STrih", u16_1Imm, 0b01>;
+
+let accessSize = WordAccess in
+defm storeri : ST_Abs <"memw", "STriw", IntRegs, u16_2Imm, 0b10>,
+ ST_Abs_NV <"memw", "STriw", u16_2Imm, 0b10>;
+
+let isNVStorable = 0, accessSize = DoubleWordAccess in
+defm storerd : ST_Abs <"memd", "STrid", DoubleRegs, u16_3Imm, 0b11>;
+
+let isNVStorable = 0, accessSize = HalfWordAccess in
+defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>;
+
+//===----------------------------------------------------------------------===//
+// GP-relative stores.
+// mem[bhwd](#global)=Rt
+// Once predicated, these instructions map to absolute addressing mode.
+// if ([!]Pv[.new]) mem[bhwd](##global)=Rt
+//===----------------------------------------------------------------------===//
+
+let Uses = [GP], isAsmParserOnly = 1 in
+class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC,
+ Operand ImmOp, bits<2> MajOp, bit isHalf = 0>
+ : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> {
+ // Set BaseOpcode same as absolute addressing instructions so that
+ // non-predicated GP-Rel instructions can have relate with predicated
+ // Absolute instruction.
+ let BaseOpcode = BaseOp#_abs;
+ }
+
+let Uses = [GP], isAsmParserOnly = 1 in
+multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp,
+ bits<2> MajOp, bit isHalf = 0> {
+ // Use the same BaseOpcode as the absolute-addressing instructions so that
+ // the non-predicated GP-relative instructions can be related to the
+ // predicated absolute instructions.
+ let BaseOpcode = BaseOp#_abs in {
+ def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp,
+ 0, isHalf>; // isAbs = 0
+ // New-value store
+ def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp> ;
+ }
+}
+
+let accessSize = ByteAccess in // each defm yields a gp and a newgp record
+defm S2_storerb : ST_GP<"memb", "STrib", u16_0Imm, 0b00>, NewValueRel;
+
+let accessSize = HalfWordAccess in
+defm S2_storerh : ST_GP<"memh", "STrih", u16_1Imm, 0b01>, NewValueRel;
+
+let accessSize = WordAccess in
+defm S2_storeri : ST_GP<"memw", "STriw", u16_2Imm, 0b10>, NewValueRel;
+
+let isNVStorable = 0, accessSize = DoubleWordAccess in // single def, no new-value form
+def S2_storerdgp : T_StoreGP <"memd", "STrid", DoubleRegs,
+ u16_3Imm, 0b11>, PredNewRel;
+
+let isNVStorable = 0, accessSize = HalfWordAccess in // single def, no new-value form
+def S2_storerfgp : T_StoreGP <"memh", "STrif", IntRegs,
+ u16_1Imm, 0b01, 1>, PredNewRel; // isHalf = 1
+
+//===----------------------------------------------------------------------===//
+// Template class for non predicated load instructions with
+// absolute addressing mode.
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, hasSideEffects = 0 in
+class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
+ bits<3> MajOp>
+ : LDInst <(outs RC:$dst), (ins ImmOp:$addr),
+ "$dst = "#mnemonic# "(#$addr)",
+ [], "", V2LDST_tc_ld_SLOT01> {
+ bits<5> dst;
+ bits<19> addr;
+ bits<16> offsetBits;
+
+ string ImmOpStr = !cast<string>(ImmOp);
+ let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
+ !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
+ !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
+ /* u16_0Imm */ addr{15-0})));
+
+ let IClass = 0b0100;
+
+ let Inst{27} = 0b1;
+ let Inst{26-25} = offsetBits{15-14};
+ let Inst{24} = 0b1;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = offsetBits{13-9};
+ let Inst{13-5} = offsetBits{8-0};
+ let Inst{4-0} = dst;
+ }
+
+class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
+ bits<3> MajOp>
+ : T_LoadAbsGP <mnemonic, RC, u32_0MustExt, MajOp>, AddrModeRel { // absolute form
+
+ string ImmOpStr = !cast<string>(ImmOp); // ImmOp selects the extent values below
+ let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19, // 16 + 3
+ !if (!eq(ImmOpStr, "u16_2Imm"), 18, // 16 + 2
+ !if (!eq(ImmOpStr, "u16_1Imm"), 17, // 16 + 1
+ /* u16_0Imm */ 16)));
+
+ let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3, // same digit as in the name
+ !if (!eq(ImmOpStr, "u16_2Imm"), 2,
+ !if (!eq(ImmOpStr, "u16_1Imm"), 1,
+ /* u16_0Imm */ 0)));
+ }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated load instructions with
+// absolute addressing mode.
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opExtentBits = 6,
+ opExtendable = 2 in
+class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp,
+ bit isPredNot, bit isPredNew>
+ : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u32_0MustExt:$absaddr),
+ !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+ ") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel {
+ bits<5> dst;
+ bits<2> src1;
+ bits<6> absaddr;
+
+ let isPredicatedNew = isPredNew;
+ let isPredicatedFalse = isPredNot;
+ let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+ let IClass = 0b1001;
+
+ let Inst{27-24} = 0b1111;
+ let Inst{23-21} = MajOp;
+ let Inst{20-16} = absaddr{5-1};
+ let Inst{13} = 0b1;
+ let Inst{12} = isPredNew;
+ let Inst{11} = isPredNot;
+ let Inst{10-9} = src1;
+ let Inst{8} = absaddr{0};
+ let Inst{7} = 0b1;
+ let Inst{4-0} = dst;
+ }
+
+//===----------------------------------------------------------------------===//
+// Multiclass for the load instructions with absolute addressing mode.
+//===----------------------------------------------------------------------===//
+multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bits<3> MajOp,
+ bit PredNot> {
+ def _abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 0>;
+ // Predicate new
+ def new_abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 1>;
+}
+
+let addrMode = Absolute, isExtended = 1 in // applies to every record defined below
+multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC,
+ Operand ImmOp, bits<3> MajOp> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+ let opExtendable = 1, isPredicable = 1 in // applies to the next def only
+ def PS_#NAME#abs: T_LoadAbs <mnemonic, RC, ImmOp, MajOp>;
+
+ // Predicated (last argument is PredNot)
+ defm L4_p#NAME#t : LD_Abs_Pred<mnemonic, RC, MajOp, 0>; // if (p) / if (p.new)
+ defm L4_p#NAME#f : LD_Abs_Pred<mnemonic, RC, MajOp, 1>; // if (!p) / if (!p.new)
+ }
+}
+
+let accessSize = ByteAccess, hasNewValue = 1 in { // MajOp is the last argument
+ defm loadrb : LD_Abs<"memb", "LDrib", IntRegs, u16_0Imm, 0b000>;
+ defm loadrub : LD_Abs<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
+}
+
+let accessSize = HalfWordAccess, hasNewValue = 1 in {
+ defm loadrh : LD_Abs<"memh", "LDrih", IntRegs, u16_1Imm, 0b010>;
+ defm loadruh : LD_Abs<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
+}
+
+let accessSize = WordAccess, hasNewValue = 1 in
+defm loadri : LD_Abs<"memw", "LDriw", IntRegs, u16_2Imm, 0b100>;
+
+let accessSize = DoubleWordAccess in // hasNewValue is not set for the DoubleRegs load
+defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
+
+//===----------------------------------------------------------------------===//
+// multiclass for load instructions with GP-relative addressing mode.
+// Rx=mem[bhwd](##global)
+// Once predicated, these instructions map to absolute addressing mode.
+// if ([!]Pv[.new]) Rx=mem[bhwd](##global)
+//===----------------------------------------------------------------------===//
+
+let isAsmParserOnly = 1, Uses = [GP] in
+class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp,
+ bits<3> MajOp>
+ : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel {
+ let BaseOpcode = BaseOp#_abs;
+ }
+
+let accessSize = ByteAccess, hasNewValue = 1 in { // MajOp is the last argument
+ def L2_loadrbgp : T_LoadGP<"memb", "LDrib", IntRegs, u16_0Imm, 0b000>;
+ def L2_loadrubgp : T_LoadGP<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
+}
+
+let accessSize = HalfWordAccess, hasNewValue = 1 in {
+ def L2_loadrhgp : T_LoadGP<"memh", "LDrih", IntRegs, u16_1Imm, 0b010>;
+ def L2_loadruhgp : T_LoadGP<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
+}
+
+let accessSize = WordAccess, hasNewValue = 1 in
+def L2_loadrigp : T_LoadGP<"memw", "LDriw", IntRegs, u16_2Imm, 0b100>;
+
+let accessSize = DoubleWordAccess in // hasNewValue is not set for the DoubleRegs load
+def L2_loadrdgp : T_LoadGP<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
+
+//===----------------------------------------------------------------------===//
+// :raw form of boundscheck:hi:lo insns
+//===----------------------------------------------------------------------===//
+
+// A4_boundscheck_lo: Detect if a register is within bounds (:raw:lo form).
+let hasSideEffects = 0 in
+def A4_boundscheck_lo: ALU64Inst <
+ (outs PredRegs:$Pd),
+ (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Pd = boundscheck($Rss, $Rtt):raw:lo"> {
+ bits<2> Pd;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1101;
+
+ let Inst{27-23} = 0b00100;
+ let Inst{13} = 0b1;
+ let Inst{7-5} = 0b100; // A4_boundscheck_hi uses 0b101 in this field
+ let Inst{1-0} = Pd;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ }
+
+// A4_boundscheck_hi: Detect if a register is within bounds (:raw:hi form).
+let hasSideEffects = 0 in
+def A4_boundscheck_hi: ALU64Inst <
+ (outs PredRegs:$Pd),
+ (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+ "$Pd = boundscheck($Rss, $Rtt):raw:hi"> {
+ bits<2> Pd;
+ bits<5> Rss;
+ bits<5> Rtt;
+
+ let IClass = 0b1101;
+
+ let Inst{27-23} = 0b00100;
+ let Inst{13} = 0b1;
+ let Inst{7-5} = 0b101; // A4_boundscheck_lo uses 0b100 in this field
+ let Inst{1-0} = Pd;
+ let Inst{20-16} = Rss;
+ let Inst{12-8} = Rtt;
+ }
+
+let hasSideEffects = 0, isAsmParserOnly = 1 in // no encoding bits are assigned here
+def A4_boundscheck : MInst <
+ (outs PredRegs:$Pd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
+ "$Pd=boundscheck($Rs,$Rtt)">;
+
+// A4_tlbmatch: Detect if a VA/ASID matches a TLB entry.
+let isPredicateLate = 1, hasSideEffects = 0 in
+def A4_tlbmatch : ALU64Inst<(outs PredRegs:$Pd),
+ (ins DoubleRegs:$Rs, IntRegs:$Rt),
+ "$Pd = tlbmatch($Rs, $Rt)",
+ [], "", ALU64_tc_2early_SLOT23> {
+ bits<2> Pd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1101;
+ let Inst{27-23} = 0b00100; // same IClass and Inst{27-23} as the boundscheck defs
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b1;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = 0b011;
+ let Inst{1-0} = Pd;
+ }
+
+// Use LD0Inst for dcfetch, but set "mayLoad" to 0 because this doesn't
+// really do a load.
+let hasSideEffects = 1, mayLoad = 0 in
+def Y2_dcfetchbo : LD0Inst<(outs), (ins IntRegs:$Rs, u11_3Imm:$u11_3),
+ "dcfetch($Rs + #$u11_3)",
+ [], "", LD_tc_ld_SLOT0> {
+ bits<5> Rs;
+ bits<14> u11_3; // 11 encoded bits plus 3 low bits that are not encoded
+
+ let IClass = 0b1001;
+ let Inst{27-21} = 0b0100000;
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b0;
+ let Inst{10-0} = u11_3{13-3}; // bits 2-0 of the offset are dropped
+}
+
+
+//===----------------------------------------------------------------------===//
+// Compound instructions
+//===----------------------------------------------------------------------===//
+
+let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
+ isPredicated = 1, isPredicatedNew = 1, isExtendable = 1,
+ opExtentBits = 11, opExtentAlign = 2, opExtendable = 1,
+ isTerminator = 1 in
+class CJInst_tstbit_R0<string px, bit np, string tnt> // px = tstbit($Rs, #0); jump
+ : InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2),
+ ""#px#" = tstbit($Rs, #0); if ("
+ #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+ [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
+ bits<4> Rs; // only 4 bits are encoded for the register (Inst{19-16})
+ bits<11> r9_2; // bits 10-2 are encoded; bits 1-0 are not
+
+ // np: !p[01]
+ let isPredicatedFalse = np;
+ // tnt: Taken/Not Taken
+ let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+ let isTaken = !if (!eq(tnt, "t"), 1, 0);
+
+ let IClass = 0b0001;
+ let Inst{27-26} = 0b00;
+ let Inst{25} = !if (!eq(px, "!p1"), 1, // px: 1 selects p1, 0 otherwise
+ !if (!eq(px, "p1"), 1, 0));
+ let Inst{24-23} = 0b11;
+ let Inst{22} = np;
+ let Inst{21-20} = r9_2{10-9};
+ let Inst{19-16} = Rs;
+ let Inst{13} = !if (!eq(tnt, "t"), 1, 0); // taken hint bit
+ let Inst{9-8} = 0b11;
+ let Inst{7-1} = r9_2{8-2};
+}
+
+let Defs = [PC, P0], Uses = [P0] in { // arguments are px, np, tnt
+ def J4_tstbit0_tp0_jump_nt : CJInst_tstbit_R0<"p0", 0, "nt">;
+ def J4_tstbit0_tp0_jump_t : CJInst_tstbit_R0<"p0", 0, "t">;
+ def J4_tstbit0_fp0_jump_nt : CJInst_tstbit_R0<"p0", 1, "nt">;
+ def J4_tstbit0_fp0_jump_t : CJInst_tstbit_R0<"p0", 1, "t">;
+}
+
+let Defs = [PC, P1], Uses = [P1] in { // same four variants on p1
+ def J4_tstbit0_tp1_jump_nt : CJInst_tstbit_R0<"p1", 0, "nt">;
+ def J4_tstbit0_tp1_jump_t : CJInst_tstbit_R0<"p1", 0, "t">;
+ def J4_tstbit0_fp1_jump_nt : CJInst_tstbit_R0<"p1", 1, "nt">;
+ def J4_tstbit0_fp1_jump_t : CJInst_tstbit_R0<"p1", 1, "t">;
+}
+
+
+let isBranch = 1, hasSideEffects = 0,
+ isExtentSigned = 1, isPredicated = 1, isPredicatedNew = 1,
+ isExtendable = 1, opExtentBits = 11, opExtentAlign = 2,
+ opExtendable = 2, isTerminator = 1 in
+class CJInst_RR<string px, string op, bit np, string tnt> // register-register compare and jump
+ : InstHexagon<(outs), (ins IntRegs:$Rs, IntRegs:$Rt, brtarget:$r9_2),
+ ""#px#" = cmp."#op#"($Rs, $Rt); if ("
+ #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+ [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
+ bits<4> Rs; // only 4 bits are encoded for each register
+ bits<4> Rt;
+ bits<11> r9_2; // bits 10-2 are encoded; bits 1-0 are not
+
+ // np: !p[01]
+ let isPredicatedFalse = np;
+ // tnt: Taken/Not Taken
+ let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+ let isTaken = !if (!eq(tnt, "t"), 1, 0);
+
+ let IClass = 0b0001;
+ let Inst{27-23} = !if (!eq(op, "eq"), 0b01000, // op selects the opcode field
+ !if (!eq(op, "gt"), 0b01001,
+ !if (!eq(op, "gtu"), 0b01010, 0))); // any other op string yields 0
+ let Inst{22} = np;
+ let Inst{21-20} = r9_2{10-9};
+ let Inst{19-16} = Rs;
+ let Inst{13} = !if (!eq(tnt, "t"), 1, 0); // taken hint bit
+ // px: Predicate reg 0/1
+ let Inst{12} = !if (!eq(px, "!p1"), 1,
+ !if (!eq(px, "p1"), 1, 0));
+ let Inst{11-8} = Rt;
+ let Inst{7-1} = r9_2{8-2};
+}
+
+// P[10] taken/not taken: four records per (op, np) pair.
+multiclass T_tnt_CJInst_RR<string op, bit np> {
+ let Defs = [PC, P0], Uses = [P0] in {
+ def NAME#p0_jump_nt : CJInst_RR<"p0", op, np, "nt">;
+ def NAME#p0_jump_t : CJInst_RR<"p0", op, np, "t">;
+ }
+ let Defs = [PC, P1], Uses = [P1] in {
+ def NAME#p1_jump_nt : CJInst_RR<"p1", op, np, "nt">;
+ def NAME#p1_jump_t : CJInst_RR<"p1", op, np, "t">;
+ }
+}
+// Predicate / !Predicate
+multiclass T_pnp_CJInst_RR<string op>{
+ defm J4_cmp#NAME#_t : T_tnt_CJInst_RR<op, 0>; // np = 0
+ defm J4_cmp#NAME#_f : T_tnt_CJInst_RR<op, 1>; // np = 1
+}
+// TypeCJ Instructions compare RR and jump
+defm eq : T_pnp_CJInst_RR<"eq">;
+defm gt : T_pnp_CJInst_RR<"gt">;
+defm gtu : T_pnp_CJInst_RR<"gtu">;
+
+let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
+ isPredicated = 1, isPredicatedNew = 1, isExtendable = 1, opExtentBits = 11,
+ opExtentAlign = 2, opExtendable = 2, isTerminator = 1 in
+class CJInst_RU5<string px, string op, bit np, string tnt> // register-immediate compare and jump
+ : InstHexagon<(outs), (ins IntRegs:$Rs, u5_0Imm:$U5, brtarget:$r9_2),
+ ""#px#" = cmp."#op#"($Rs, #$U5); if ("
+ #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+ [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
+ bits<4> Rs; // only 4 bits are encoded for the register
+ bits<5> U5;
+ bits<11> r9_2; // bits 10-2 are encoded; bits 1-0 are not
+
+ // np: !p[01]
+ let isPredicatedFalse = np;
+ // tnt: Taken/Not Taken
+ let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+ let isTaken = !if (!eq(tnt, "t"), 1, 0);
+
+ let IClass = 0b0001;
+ let Inst{27-26} = 0b00;
+ // px: Predicate reg 0/1
+ let Inst{25} = !if (!eq(px, "!p1"), 1,
+ !if (!eq(px, "p1"), 1, 0));
+ let Inst{24-23} = !if (!eq(op, "eq"), 0b00, // op selects this 2-bit field
+ !if (!eq(op, "gt"), 0b01,
+ !if (!eq(op, "gtu"), 0b10, 0))); // any other op string yields 0
+ let Inst{22} = np;
+ let Inst{21-20} = r9_2{10-9};
+ let Inst{19-16} = Rs;
+ let Inst{13} = !if (!eq(tnt, "t"), 1, 0); // taken hint bit
+ let Inst{12-8} = U5;
+ let Inst{7-1} = r9_2{8-2};
+}
+// P[10] taken/not taken: four records per (op, np) pair.
+multiclass T_tnt_CJInst_RU5<string op, bit np> {
+ let Defs = [PC, P0], Uses = [P0] in {
+ def NAME#p0_jump_nt : CJInst_RU5<"p0", op, np, "nt">;
+ def NAME#p0_jump_t : CJInst_RU5<"p0", op, np, "t">;
+ }
+ let Defs = [PC, P1], Uses = [P1] in {
+ def NAME#p1_jump_nt : CJInst_RU5<"p1", op, np, "nt">;
+ def NAME#p1_jump_t : CJInst_RU5<"p1", op, np, "t">;
+ }
+}
+// Predicate / !Predicate
+multiclass T_pnp_CJInst_RU5<string op>{
+ defm J4_cmp#NAME#i_t : T_tnt_CJInst_RU5<op, 0>; // np = 0
+ defm J4_cmp#NAME#i_f : T_tnt_CJInst_RU5<op, 1>; // np = 1
+}
+// TypeCJ Instructions compare RI and jump
+defm eq : T_pnp_CJInst_RU5<"eq">;
+defm gt : T_pnp_CJInst_RU5<"gt">;
+defm gtu : T_pnp_CJInst_RU5<"gtu">;
+
+let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
+ isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, // NOTE(review): isPredicatedFalse is set again from np in the body; this value looks redundant - confirm
+ isExtendable = 1, opExtentBits = 11, opExtentAlign = 2, opExtendable = 2,
+ isTerminator = 1 in
+class CJInst_Rn1<string px, string op, bit np, string tnt> // compare with the n1Const operand and jump
+ : InstHexagon<(outs), (ins IntRegs:$Rs, n1Const:$n1, brtarget:$r9_2),
+ ""#px#" = cmp."#op#"($Rs,#$n1); if ("
+ #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+ [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
+ bits<4> Rs; // only 4 bits are encoded for the register; $n1 has no encoding field
+ bits<11> r9_2; // bits 10-2 are encoded; bits 1-0 are not
+
+ // np: !p[01]
+ let isPredicatedFalse = np;
+ // tnt: Taken/Not Taken
+ let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+ let isTaken = !if (!eq(tnt, "t"), 1, 0);
+
+ let IClass = 0b0001;
+ let Inst{27-26} = 0b00;
+ let Inst{25} = !if (!eq(px, "!p1"), 1, // px: 1 selects p1, 0 otherwise
+ !if (!eq(px, "p1"), 1, 0));
+
+ let Inst{24-23} = 0b11;
+ let Inst{22} = np;
+ let Inst{21-20} = r9_2{10-9};
+ let Inst{19-16} = Rs;
+ let Inst{13} = !if (!eq(tnt, "t"), 1, 0); // taken hint bit
+ let Inst{9-8} = !if (!eq(op, "eq"), 0b00, // op selects this field (CJInst_tstbit_R0 uses 0b11)
+ !if (!eq(op, "gt"), 0b01, 0));
+ let Inst{7-1} = r9_2{8-2};
+}
+
+// P[10] taken/not taken: four records per (op, np) pair.
+multiclass T_tnt_CJInst_Rn1<string op, bit np> {
+ let Defs = [PC, P0], Uses = [P0] in {
+ def NAME#p0_jump_nt : CJInst_Rn1<"p0", op, np, "nt">;
+ def NAME#p0_jump_t : CJInst_Rn1<"p0", op, np, "t">;
+ }
+ let Defs = [PC, P1], Uses = [P1] in {
+ def NAME#p1_jump_nt : CJInst_Rn1<"p1", op, np, "nt">;
+ def NAME#p1_jump_t : CJInst_Rn1<"p1", op, np, "t">;
+ }
+}
+// Predicate / !Predicate
+multiclass T_pnp_CJInst_Rn1<string op>{
+ defm J4_cmp#NAME#n1_t : T_tnt_CJInst_Rn1<op, 0>; // np = 0
+ defm J4_cmp#NAME#n1_f : T_tnt_CJInst_Rn1<op, 1>; // np = 1
+}
+// TypeCJ Instructions compare -1 and jump (only eq and gt are instantiated)
+defm eq : T_pnp_CJInst_Rn1<"eq">;
+defm gt : T_pnp_CJInst_Rn1<"gt">;
+
+// J4_jumpseti: Direct unconditional jump and set register to immediate.
+let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
+ isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
+ opExtentAlign = 2, opExtendable = 2 in
+def J4_jumpseti: CJInst_JMPSET <
+ (outs IntRegs:$Rd),
+ (ins u6_0Imm:$U6, brtarget:$r9_2),
+ "$Rd = #$U6 ; jump $r9_2"> {
+ bits<4> Rd; // only 4 bits are encoded for the register
+ bits<6> U6;
+ bits<11> r9_2; // bits 10-2 are encoded; bits 1-0 are not
+
+ let IClass = 0b0001;
+ let Inst{27-24} = 0b0110;
+ let Inst{21-20} = r9_2{10-9};
+ let Inst{19-16} = Rd; // Rd sits in Inst{19-16} here, unlike in J4_jumpsetr
+ let Inst{13-8} = U6;
+ let Inst{7-1} = r9_2{8-2};
+ }
+
+// J4_jumpsetr: Direct unconditional jump and transfer register.
+let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
+ isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
+ opExtentAlign = 2, opExtendable = 2 in
+def J4_jumpsetr: CJInst_JMPSET <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, brtarget:$r9_2),
+ "$Rd = $Rs ; jump $r9_2"> {
+ bits<4> Rd; // only 4 bits are encoded for each register
+ bits<4> Rs;
+ bits<11> r9_2; // bits 10-2 are encoded; bits 1-0 are not
+
+ let IClass = 0b0001;
+ let Inst{27-24} = 0b0111;
+ let Inst{21-20} = r9_2{10-9};
+ let Inst{11-8} = Rd; // destination goes in Inst{11-8}; the source takes Inst{19-16}
+ let Inst{19-16} = Rs;
+ let Inst{7-1} = r9_2{8-2};
+ }
+
+// Duplex instructions
+//===----------------------------------------------------------------------===//
+include "HexagonIsetDx.td"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
new file mode 100644
index 000000000000..cd19b6916f21
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
@@ -0,0 +1,497 @@
+//=- HexagonInstrInfoV5.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V5 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY
+//===----------------------------------------------------------------------===//
+
+ //Rdd[+]=vrmpybsu(Rss,Rtt)
+let Predicates = [HasV5T] in {
+ def M5_vrmpybsu: T_XTYPE_Vect<"vrmpybsu", 0b110, 0b001, 0>;
+ def M5_vrmacbsu: T_XTYPE_Vect_acc<"vrmpybsu", 0b110, 0b001, 0>;
+
+ //Rdd[+]=vrmpybu(Rss,Rtt)
+ def M5_vrmpybuu: T_XTYPE_Vect<"vrmpybu", 0b100, 0b001, 0>;
+ def M5_vrmacbuu: T_XTYPE_Vect_acc<"vrmpybu", 0b100, 0b001, 0>;
+
+ def M5_vdmpybsu: T_M2_vmpy<"vdmpybsu", 0b101, 0b001, 0, 0, 1>;
+ def M5_vdmacbsu: T_M2_vmpy_acc_sat <"vdmpybsu", 0b001, 0b001, 0, 0>;
+}
+
+// Vector multiply bytes
+// Rdd=vmpyb[s]u(Rs,Rt)
+let Predicates = [HasV5T] in {
+ def M5_vmpybsu: T_XTYPE_mpy64 <"vmpybsu", 0b010, 0b001, 0, 0, 0>;
+ def M5_vmpybuu: T_XTYPE_mpy64 <"vmpybu", 0b100, 0b001, 0, 0, 0>;
+
+ // Rxx+=vmpyb[s]u(Rs,Rt)
+ def M5_vmacbsu: T_XTYPE_mpy64_acc <"vmpybsu", "+", 0b110, 0b001, 0, 0, 0>;
+ def M5_vmacbuu: T_XTYPE_mpy64_acc <"vmpybu", "+", 0b100, 0b001, 0, 0, 0>;
+
+ // Rd=vaddhub(Rss,Rtt):sat
+ let hasNewValue = 1, opNewValue = 0 in // applies to A5_vaddhubs only
+ def A5_vaddhubs: T_S3op_1 <"vaddhub", IntRegs, 0b01, 0b001, 0, 1>;
+}
+
+def S2_asr_i_p_rnd : S_2OpInstImm<"asr", 0b110, 0b111, u6_0Imm, [], 1>,
+ Requires<[HasV5T]> {
+ bits<6> src2; // 6-bit immediate field
+ let Inst{13-8} = src2;
+}
+
+let isAsmParserOnly = 1 in // parser-only spelling; no encoding bits are assigned here
+def S2_asr_i_p_rnd_goodsyntax
+ : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6_0Imm:$src2),
+ "$dst = asrrnd($src1, #$src2)">;
+
+def C4_fastcorner9 : T_LOGICAL_2OP<"fastcorner9", 0b000, 0, 0>,
+ Requires<[HasV5T]> {
+ let Inst{13,7,4} = 0b111; // sets bits 13, 7 and 4
+}
+
+def C4_fastcorner9_not : T_LOGICAL_2OP<"!fastcorner9", 0b000, 0, 0>,
+ Requires<[HasV5T]> {
+ let Inst{20,13,7,4} = 0b1111; // same three bits plus bit 20
+}
+
+let hasNewValue = 1, validSubTargets = HasV5SubT in
+def S5_popcountp : ALU64_rr<(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
+ "$Rd = popcount($Rss)", [], "", S_2op_tc_2_SLOT23>,
+ Requires<[HasV5T]> {
+ bits<5> Rd; // result is an IntRegs register
+ bits<5> Rss; // source is a DoubleRegs register
+
+ let IClass = 0b1000;
+
+ let Inst{27-21} = 0b1000011;
+ let Inst{7-5} = 0b011;
+ let Inst{4-0} = Rd;
+ let Inst{20-16} = Rss;
+ }
+
+let isFP = 1, hasNewValue = 1, opNewValue = 0 in // template: Rd = mnemonic(Rs, Rt)
+class T_MInstFloat <string mnemonic, bits<3> MajOp, bits<3> MinOp>
+ : MInst<(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd = "#mnemonic#"($Rs, $Rt)", [],
+ "" , M_tc_3or4x_SLOT23 > ,
+ Requires<[HasV5T]> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-24} = 0b1011;
+ let Inst{23-21} = MajOp; // MajOp and MinOp together select the operation
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b0;
+ let Inst{12-8} = Rt;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rd;
+ }
+
+let isCommutable = 1 in { // arguments are mnemonic, MajOp, MinOp
+ def F2_sfadd : T_MInstFloat < "sfadd", 0b000, 0b000>;
+ def F2_sfmpy : T_MInstFloat < "sfmpy", 0b010, 0b000>;
+}
+
+def F2_sfsub : T_MInstFloat < "sfsub", 0b000, 0b001>; // same MajOp as sfadd, different MinOp
+
+let Itinerary = M_tc_3x_SLOT23 in { // replaces the class itinerary M_tc_3or4x_SLOT23
+ def F2_sfmax : T_MInstFloat < "sfmax", 0b100, 0b000>;
+ def F2_sfmin : T_MInstFloat < "sfmin", 0b100, 0b001>;
+}
+
+let Itinerary = M_tc_3or4x_SLOT23 in { // same itinerary T_MInstFloat already passes
+def F2_sffixupn : T_MInstFloat < "sffixupn", 0b110, 0b000>;
+def F2_sffixupd : T_MInstFloat < "sffixupd", 0b110, 0b001>;
+}
+
+// F2_sfrecipa: Reciprocal approximation for division.
+let Uses = [USR], isPredicateLate = 1, isFP = 1,
+ hasSideEffects = 0, hasNewValue = 1, Itinerary = M_tc_3or4x_SLOT23 in
+def F2_sfrecipa: MInst <
+ (outs IntRegs:$Rd, PredRegs:$Pe), // two results: a register and a predicate
+ (ins IntRegs:$Rs, IntRegs:$Rt),
+ "$Rd, $Pe = sfrecipa($Rs, $Rt)">,
+ Requires<[HasV5T]> {
+ bits<5> Rd;
+ bits<2> Pe;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1110;
+ let Inst{27-21} = 0b1011111;
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b0;
+ let Inst{12-8} = Rt;
+ let Inst{7} = 0b1;
+ let Inst{6-5} = Pe; // the predicate result is encoded between bit 7 and Rd
+ let Inst{4-0} = Rd;
+ }
+
+// T_fcmp: Template for floating point compares (used by T_fcmp64 and T_fcmp32).
+let Uses = [USR], isCompare = 1, isFP = 1 in
+class T_fcmp <string mnemonic, RegisterClass RC, bits<3> MinOp,
+ list<dag> pattern = [] >
+ : ALU64Inst <(outs PredRegs:$dst), (ins RC:$src1, RC:$src2),
+ "$dst = "#mnemonic#"($src1, $src2)", pattern,
+ "" , ALU64_tc_2early_SLOT23 > ,
+ Requires<[HasV5T]> {
+ bits<2> dst; // predicate register result
+ bits<5> src1;
+ bits<5> src2;
+
+ let IClass = 0b1101; // T_fcmp32 replaces IClass and Inst{27-21}
+
+ let Inst{27-21} = 0b0010111;
+ let Inst{20-16} = src1;
+ let Inst{12-8} = src2;
+ let Inst{7-5} = MinOp; // selects the comparison
+ let Inst{1-0} = dst;
+ }
+
+class T_fcmp64 <string mnemonic, PatFrag OpNode, bits<3> MinOp> // OpNode is not referenced in this body
+ : T_fcmp <mnemonic, DoubleRegs, MinOp, []> {
+ let IClass = 0b1101; // same values that T_fcmp already sets
+ let Inst{27-21} = 0b0010111;
+}
+
+class T_fcmp32 <string mnemonic, PatFrag OpNode, bits<3> MinOp> // OpNode is not referenced in this body
+ : T_fcmp <mnemonic, IntRegs, MinOp, []> {
+ let IClass = 0b1100; // differs from the T_fcmp values
+ let Inst{27-21} = 0b0111111;
+}
+
+def F2_dfcmpeq : T_fcmp64<"dfcmp.eq", setoeq, 0b000>; // last argument is MinOp (Inst{7-5})
+def F2_dfcmpgt : T_fcmp64<"dfcmp.gt", setogt, 0b001>;
+def F2_dfcmpge : T_fcmp64<"dfcmp.ge", setoge, 0b010>;
+def F2_dfcmpuo : T_fcmp64<"dfcmp.uo", setuo, 0b011>;
+
+def F2_sfcmpge : T_fcmp32<"sfcmp.ge", setoge, 0b000>; // MinOp values differ from the dfcmp ones
+def F2_sfcmpuo : T_fcmp32<"sfcmp.uo", setuo, 0b001>;
+def F2_sfcmpeq : T_fcmp32<"sfcmp.eq", setoeq, 0b011>;
+def F2_sfcmpgt : T_fcmp32<"sfcmp.gt", setogt, 0b100>;
+
+// F2 convert template classes (this one: DoubleRegs source, DoubleRegs result):
+let Uses = [USR], isFP = 1 in
+class F2_RDD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
+ string chop =""> // chop is appended verbatim to the asm string
+ : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
+ "$Rdd = "#mnemonic#"($Rss)"#chop, [], "",
+ S_2op_tc_3or4x_SLOT23> {
+ bits<5> Rdd;
+ bits<5> Rss;
+
+ let IClass = 0b1000;
+
+ let Inst{27-21} = 0b0000111;
+ let Inst{20-16} = Rss;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rdd;
+ }
+
+let Uses = [USR], isFP = 1 in // IntRegs source, DoubleRegs result
+class F2_RDD_RS_CONVERT<string mnemonic, bits<3> MinOp,
+ string chop =""> // chop is appended verbatim to the asm string
+ : SInst <(outs DoubleRegs:$Rdd), (ins IntRegs:$Rs),
+ "$Rdd = "#mnemonic#"($Rs)"#chop, [], "",
+ S_2op_tc_3or4x_SLOT23> {
+ bits<5> Rdd;
+ bits<5> Rs;
+
+ let IClass = 0b1000;
+
+ let Inst{27-21} = 0b0100100;
+ let Inst{20-16} = Rs;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rdd;
+ }
+
+let Uses = [USR], isFP = 1, hasNewValue = 1 in // convert template: DoubleRegs source, IntRegs result
+class F2_RD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
+ string chop =""> // chop: text appended after the operand list, e.g. ":chop"
+ : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
+ "$Rd = "#mnemonic#"($Rss)"#chop, [], "",
+ S_2op_tc_3or4x_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rss;
+
+ let IClass = 0b1000;
+
+ let Inst{27-24} = 0b1000;
+ let Inst{23-21} = MinOp; // unlike the RDD templates, MinOp lands in bits 23-21 here
+ let Inst{20-16} = Rss;
+ let Inst{7-5} = 0b001; // fixed for every instruction built from this class
+ let Inst{4-0} = Rd;
+ }
+
+let Uses = [USR], isFP = 1, hasNewValue = 1 in // convert template: IntRegs source, IntRegs result
+class F2_RD_RS_CONVERT<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+ string chop =""> // chop: text appended after the operand list, e.g. ":chop"
+ : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
+ "$Rd = "#mnemonic#"($Rs)"#chop, [], "",
+ S_2op_tc_3or4x_SLOT23> {
+ bits<5> Rd;
+ bits<5> Rs;
+
+ let IClass = 0b1000;
+
+ let Inst{27-24} = 0b1011;
+ let Inst{23-21} = MajOp; // this template takes two selectors: MajOp and MinOp
+ let Inst{20-16} = Rs;
+ let Inst{7-5} = MinOp;
+ let Inst{4-0} = Rd;
+ }
+
+// Convert single precision to double precision and vice-versa.
+def F2_conv_sf2df : F2_RDD_RS_CONVERT <"convert_sf2df", 0b000>;
+def F2_conv_df2sf : F2_RD_RSS_CONVERT <"convert_df2sf", 0b000>;
+
+// Convert Integer to Floating Point.
+def F2_conv_d2sf : F2_RD_RSS_CONVERT <"convert_d2sf", 0b010>;
+def F2_conv_ud2sf : F2_RD_RSS_CONVERT <"convert_ud2sf", 0b001>;
+def F2_conv_uw2sf : F2_RD_RS_CONVERT <"convert_uw2sf", 0b001, 0b000>; // arguments are MajOp, MinOp
+def F2_conv_w2sf : F2_RD_RS_CONVERT <"convert_w2sf", 0b010, 0b000>;
+def F2_conv_d2df : F2_RDD_RSS_CONVERT <"convert_d2df", 0b011>;
+def F2_conv_ud2df : F2_RDD_RSS_CONVERT <"convert_ud2df", 0b010>;
+def F2_conv_uw2df : F2_RDD_RS_CONVERT <"convert_uw2df", 0b001>;
+def F2_conv_w2df : F2_RDD_RS_CONVERT <"convert_w2df", 0b010>;
+
+// Convert Floating Point to Integer (":chop" forms).
+def F2_conv_df2uw_chop : F2_RD_RSS_CONVERT <"convert_df2uw", 0b101, ":chop">;
+def F2_conv_df2w_chop : F2_RD_RSS_CONVERT <"convert_df2w", 0b111, ":chop">;
+def F2_conv_sf2uw_chop : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b001, // same MajOp as F2_conv_sf2uw, MinOp 0b001
+ ":chop">;
+def F2_conv_sf2w_chop : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b001, // same MajOp as F2_conv_sf2w, MinOp 0b001
+ ":chop">;
+def F2_conv_df2d_chop : F2_RDD_RSS_CONVERT <"convert_df2d", 0b110, ":chop">;
+def F2_conv_df2ud_chop : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b111, ":chop">;
+def F2_conv_sf2d_chop : F2_RDD_RS_CONVERT <"convert_sf2d", 0b110, ":chop">;
+def F2_conv_sf2ud_chop : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b101, ":chop">;
+
+// Convert Floating Point to Integer: non-chopped.
+let AddedComplexity = 20, Predicates = [HasV5T] in { // same mnemonics as the ":chop" defs, no suffix
+ def F2_conv_df2d : F2_RDD_RSS_CONVERT <"convert_df2d", 0b000>;
+ def F2_conv_df2ud : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b001>;
+ def F2_conv_sf2ud : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b011>;
+ def F2_conv_sf2d : F2_RDD_RS_CONVERT <"convert_sf2d", 0b100>;
+ def F2_conv_df2uw : F2_RD_RSS_CONVERT <"convert_df2uw", 0b011>;
+ def F2_conv_df2w : F2_RD_RSS_CONVERT <"convert_df2w", 0b100>;
+ def F2_conv_sf2uw : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b000>;
+ def F2_conv_sf2w : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b000>;
+}
+
+// Fix up radicand: single IntRegs source, IntRegs result.
+let Uses = [USR], isFP = 1, hasNewValue = 1 in
+def F2_sffixupr: SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs),
+ "$Rd = sffixupr($Rs)",
+ [], "" , S_2op_tc_3or4x_SLOT23>, Requires<[HasV5T]> { // empty pattern list, no operand constraint
+ bits<5> Rd;
+ bits<5> Rs;
+
+ let IClass = 0b1000;
+
+ let Inst{27-21} = 0b1011101;
+ let Inst{20-16} = Rs;
+ let Inst{7-5} = 0b000;
+ let Inst{4-0} = Rd;
+ }
+
+// T_sfmpy_acc: template for F2_sffma/F2_sffms (floating-point fused multiply add/sub) and their :lib forms.
+let Uses = [USR], isFP = 1, hasNewValue = 1 in
+class T_sfmpy_acc <bit isSub, bit isLib>
+ : MInst<(outs IntRegs:$Rx),
+ (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+ "$Rx "#!if(isSub, "-=","+=")#" sfmpy($Rs, $Rt)"#!if(isLib, ":lib",""), // isSub picks "-=" over "+="; isLib appends ":lib"
+ [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > , // constraint ties input $dst2 to output $Rx
+ Requires<[HasV5T]> {
+ bits<5> Rx;
+ bits<5> Rs;
+ bits<5> Rt;
+
+ let IClass = 0b1110;
+
+ let Inst{27-21} = 0b1111000;
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b0;
+ let Inst{12-8} = Rt;
+ let Inst{7} = 0b1;
+ let Inst{6} = isLib; // the two template bits are encoded directly
+ let Inst{5} = isSub;
+ let Inst{4-0} = Rx;
+ }
+
+def F2_sffma: T_sfmpy_acc <0, 0>; // arguments are <isSub, isLib>
+def F2_sffms: T_sfmpy_acc <1, 0>;
+def F2_sffma_lib: T_sfmpy_acc <0, 1>;
+def F2_sffms_lib: T_sfmpy_acc <1, 1>;
+
+// Floating-point fused multiply add w/ additional scaling (2**pu).
+let Uses = [USR], isFP = 1, hasNewValue = 1 in
+def F2_sffma_sc: MInst <
+ (outs IntRegs:$Rx),
+ (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt, PredRegs:$Pu), // $Pu: extra predicate-register operand
+ "$Rx += sfmpy($Rs, $Rt, $Pu):scale" ,
+ [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > , // constraint ties input $dst2 to output $Rx
+ Requires<[HasV5T]> {
+ bits<5> Rx;
+ bits<5> Rs;
+ bits<5> Rt;
+ bits<2> Pu;
+
+ let IClass = 0b1110;
+
+ let Inst{27-21} = 0b1111011;
+ let Inst{20-16} = Rs;
+ let Inst{13} = 0b0;
+ let Inst{12-8} = Rt;
+ let Inst{7} = 0b1;
+ let Inst{6-5} = Pu; // T_sfmpy_acc puts isLib/isSub in these two bits
+ let Inst{4-0} = Rx;
+ }
+
+//===----------------------------------------------------------------------===//
+// :natural forms of vasrh and vasrhub insns
+//===----------------------------------------------------------------------===//
+// S5_asrhub_rnd_sat: Vector arithmetic shift right by immediate with round,
+// saturate, and pack. T_ASRHUB is the shared template for the :raw and :sat forms.
+let Defs = [USR_OVF], hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_ASRHUB<bit isSat>
+ : SInst <(outs IntRegs:$Rd),
+ (ins DoubleRegs:$Rss, u4_0Imm:$u4),
+ "$Rd = vasrhub($Rss, #$u4):"#!if(isSat, "sat", "raw"), // isSat=1 prints ":sat", isSat=0 prints ":raw"
+ [], "", S_2op_tc_2_SLOT23>,
+ Requires<[HasV5T]> {
+ bits<5> Rd;
+ bits<5> Rss;
+ bits<4> u4;
+
+ let IClass = 0b1000;
+
+ let Inst{27-21} = 0b1000011;
+ let Inst{20-16} = Rss;
+ let Inst{13-12} = 0b00;
+ let Inst{11-8} = u4;
+ let Inst{7-6} = 0b10;
+ let Inst{5} = isSat; // the only encoding bit that differs between the two forms
+ let Inst{4-0} = Rd;
+ }
+
+def S5_asrhub_rnd_sat : T_ASRHUB <0>; // isSat=0, so the asm string ends in ":raw"
+def S5_asrhub_sat : T_ASRHUB <1>; // isSat=1, so the asm string ends in ":sat"
+
+let isAsmParserOnly = 1 in // ":rnd:sat" spelling; no Inst bits are assigned in this def
+def S5_asrhub_rnd_sat_goodsyntax
+ : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
+ "$Rd = vasrhub($Rss, #$u4):rnd:sat">, Requires<[HasV5T]>;
+
+// S5_vasrhrnd: Vector arithmetic shift right by immediate with round (":raw" asm form).
+let hasSideEffects = 0 in
+def S5_vasrhrnd : SInst <(outs DoubleRegs:$Rdd),
+ (ins DoubleRegs:$Rss, u4_0Imm:$u4),
+ "$Rdd = vasrh($Rss, #$u4):raw">,
+ Requires<[HasV5T]> {
+ bits<5> Rdd;
+ bits<5> Rss;
+ bits<4> u4;
+
+ let IClass = 0b1000;
+
+ let Inst{27-21} = 0b0000001;
+ let Inst{20-16} = Rss;
+ let Inst{13-12} = 0b00;
+ let Inst{11-8} = u4; // 4-bit immediate
+ let Inst{7-5} = 0b000;
+ let Inst{4-0} = Rdd;
+ }
+
+let isAsmParserOnly = 1 in // ":rnd" spelling; no Inst bits are assigned in this def
+def S5_vasrhrnd_goodsyntax
+ : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
+ "$Rdd = vasrh($Rss,#$u4):rnd">, Requires<[HasV5T]>;
+
+// Floating point reciprocal square root approximation (two results: $Rd and predicate $Pe)
+let Uses = [USR], isPredicateLate = 1, isFP = 1,
+ hasSideEffects = 0, hasNewValue = 1, opNewValue = 0,
+ validSubTargets = HasV5SubT in
+def F2_sfinvsqrta: SInst <
+ (outs IntRegs:$Rd, PredRegs:$Pe),
+ (ins IntRegs:$Rs),
+ "$Rd, $Pe = sfinvsqrta($Rs)" > ,
+ Requires<[HasV5T]> {
+ bits<5> Rd;
+ bits<2> Pe;
+ bits<5> Rs;
+
+ let IClass = 0b1000;
+
+ let Inst{27-21} = 0b1011111;
+ let Inst{20-16} = Rs;
+ let Inst{7} = 0b0;
+ let Inst{6-5} = Pe; // predicate result field
+ let Inst{4-0} = Rd;
+ }
+
+// Complex multiply 32x16 (T_S3op_8 is defined outside this chunk; its argument meanings are not visible here)
+let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
+ def M4_cmpyi_whc : T_S3op_8<"cmpyiwh", 0b101, 1, 1, 1, 1>;
+ def M4_cmpyr_whc : T_S3op_8<"cmpyrwh", 0b111, 1, 1, 1, 1>;
+}
+
+// Classify floating-point value
+let Uses = [USR], isFP = 1 in
+def F2_sfclass : T_TEST_BIT_IMM<"sfclass", 0b111>, Requires<[HasV5T]>; // T_TEST_BIT_IMM is defined outside this chunk
+
+let Uses = [USR], isFP = 1 in
+def F2_dfclass: ALU64Inst<(outs PredRegs:$Pd), (ins DoubleRegs:$Rss, u5_0Imm:$u5), // predicate result from a DoubleRegs value and a 5-bit immediate
+ "$Pd = dfclass($Rss, #$u5)",
+ [], "" , ALU64_tc_2early_SLOT23 > , Requires<[HasV5T]> {
+ bits<2> Pd;
+ bits<5> Rss;
+ bits<5> u5;
+
+ let IClass = 0b1101;
+ let Inst{27-21} = 0b1100100;
+ let Inst{20-16} = Rss;
+ let Inst{12-10} = 0b000;
+ let Inst{9-5} = u5;
+ let Inst{4-3} = 0b10;
+ let Inst{1-0} = Pd;
+ }
+
+// Instructions to create floating point constant (template; isNeg picks the ":neg" or ":pos" form)
+class T_fimm <string mnemonic, RegisterClass RC, bits<4> RegType, bit isNeg>
+ : ALU64Inst<(outs RC:$dst), (ins u10_0Imm:$src),
+ "$dst = "#mnemonic#"(#$src)"#!if(isNeg, ":neg", ":pos"),
+ [], "", ALU64_tc_2_SLOT23>, Requires<[HasV5T]> {
+ bits<5> dst;
+ bits<10> src;
+
+ let IClass = 0b1101;
+ let Inst{27-24} = RegType; // passed by the defs; differs between sfmake and dfmake
+ let Inst{23} = 0b0;
+ let Inst{22} = isNeg;
+ let Inst{21} = src{9}; // the 10-bit immediate is split: bit 9 here,
+ let Inst{13-5} = src{8-0}; // bits 8-0 here
+ let Inst{4-0} = dst;
+ }
+
+let hasNewValue = 1, opNewValue = 0 in { // applied to the IntRegs (sfmake) forms only
+ def F2_sfimm_p : T_fimm <"sfmake", IntRegs, 0b0110, 0>; // _p: isNeg=0 (":pos")
+ def F2_sfimm_n : T_fimm <"sfmake", IntRegs, 0b0110, 1>; // _n: isNeg=1 (":neg")
+}
+
+def F2_dfimm_p : T_fimm <"dfmake", DoubleRegs, 0b1001, 0>;
+def F2_dfimm_n : T_fimm <"dfmake", DoubleRegs, 0b1001, 1>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td
new file mode 100644
index 000000000000..c50141b18ead
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td
@@ -0,0 +1,2068 @@
+//=- HexagonInstrInfoV60.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+// Vector load: base class for the V6 vector load instructions below.
+let Predicates = [HasV60T, UseHVX] in
+let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+ class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CVI_VM_LD, // defaults; some def groups override Itinerary/Type via let
+ IType type = TypeCVI_VM_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+// Vector store: base class for the V6 vector store instructions below.
+let Predicates = [HasV60T, UseHVX] in
+let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CVI_VM_ST, // defaults; some classes/defs override Itinerary/Type via let
+ IType type = TypeCVI_VM_ST>
+: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+//===----------------------------------------------------------------------===//
+// Vector loads with base + immediate offset
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, accessSize = Vector64Access in // 64B form: VectorRegs result, s4_6Imm offset
+class T_vload_ai<string asmStr>
+ : V6_LDInst <(outs VectorRegs:$dst), (ins IntRegs:$src1, s4_6Imm:$src2),
+ asmStr>;
+
+let isCodeGenOnly = 1, addrMode = BaseImmOffset, accessSize = Vector128Access in // 128B form: VectorRegs128B result, s4_7Imm offset
+class T_vload_ai_128B<string asmStr>
+ : V6_LDInst <(outs VectorRegs128B:$dst), (ins IntRegs:$src1, s4_7Imm:$src2),
+ asmStr>;
+
+let isCVLoadable = 1, hasNewValue = 1 in { // plain vmem loads; each def also inherits a *_enc class defined outside this chunk
+ def V6_vL32b_ai : T_vload_ai <"$dst = vmem($src1+#$src2)">,
+ V6_vL32b_ai_enc;
+ def V6_vL32b_nt_ai : T_vload_ai <"$dst = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_ai_enc;
+ // 128B
+ def V6_vL32b_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2)">,
+ V6_vL32b_ai_128B_enc;
+ def V6_vL32b_nt_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU, hasNewValue = 1 in { // vmemu loads
+ def V6_vL32Ub_ai : T_vload_ai <"$dst = vmemu($src1+#$src2)">,
+ V6_vL32Ub_ai_enc;
+ def V6_vL32Ub_ai_128B : T_vload_ai_128B <"$dst = vmemu($src1+#$src2)">,
+ V6_vL32Ub_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD, isCVLoad = 1, // ".cur" loads
+ hasNewValue = 1 in {
+ def V6_vL32b_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2)">,
+ V6_vL32b_cur_ai_enc;
+ def V6_vL32b_nt_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_cur_ai_enc;
+ // 128B
+ def V6_vL32b_cur_ai_128B : T_vload_ai_128B
+ <"$dst.cur = vmem($src1+#$src2)">,
+ V6_vL32b_cur_ai_128B_enc;
+ def V6_vL32b_nt_cur_ai_128B : T_vload_ai_128B
+ <"$dst.cur = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_cur_ai_128B_enc;
+}
+
+
+let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in { // ".tmp" loads, base + immediate offset
+ def V6_vL32b_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2)">,
+ V6_vL32b_tmp_ai_enc;
+ def V6_vL32b_nt_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_tmp_ai_enc;
+ // 128B
+ def V6_vL32b_tmp_ai_128B : T_vload_ai_128B
+ <"$dst.tmp = vmem($src1+#$src2)">,
+ V6_vL32b_tmp_ai_128B_enc;
+ def V6_vL32b_nt_tmp_ai_128B : T_vload_ai_128B
+ <"$dst.tmp = vmem($src1+#$src2):nt">, // ":nt" suffix, matching the 64B _nt_ def
+ V6_vL32b_nt_tmp_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - unconditional
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, accessSize = Vector64Access, isPredicable = 1 in
+class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp,
+ RegisterClass RC, bit isNT>
+ : V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ mnemonic#"($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3">, NewValueRel { // isNT appends ":nt" to the address part
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in // 64B wrapper: s4_6Imm offset, VectorRegs data
+class T_vstore_ai_64B <string mnemonic, string baseOp, bit isNT = 0>
+ : T_vstore_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s4_7Imm offset, VectorRegs128B data
+class T_vstore_ai_128B <string mnemonic, string baseOp, bit isNT = 0>
+ : T_vstore_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>; // BaseOpcode gets a "128B" suffix
+
+let isNVStorable = 1 in { // aligned vmem stores
+ def V6_vS32b_ai : T_vstore_ai_64B <"vmem", "vS32b_ai">,
+ V6_vS32b_ai_enc;
+ def V6_vS32b_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai">,
+ V6_vS32b_ai_128B_enc;
+}
+
+let isNVStorable = 1, isNonTemporal = 1 in { // ":nt" stores (isNT argument = 1)
+ def V6_vS32b_nt_ai : T_vstore_ai_64B <"vmem", "vS32b_ai", 1>,
+ V6_vS32b_nt_ai_enc;
+ def V6_vS32b_nt_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai", 1>,
+ V6_vS32b_nt_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { // vmemu stores; isNVStorable is not set for these
+ def V6_vS32Ub_ai : T_vstore_ai_64B <"vmemu", "vS32Ub_ai">,
+ V6_vS32Ub_ai_enc;
+ def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vS32Ub_ai">,
+ V6_vS32Ub_ai_128B_enc;
+}
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - unconditional new
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1, // opNewValue 2: presumably the index of $src3 (no outs here) - confirm
+ isPredicable = 1, Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in
+class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
+ : V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ "vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel { // stored operand is printed as "$src3.new"
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in // 64B wrapper: s4_6Imm offset, VectorRegs data
+class T_vstore_new_ai_64B <string baseOp, bit isNT = 0>
+ : T_vstore_new_ai <baseOp, s4_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s4_7Imm offset, VectorRegs128B data
+class T_vstore_new_ai_128B <string baseOp, bit isNT = 0>
+ : T_vstore_new_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
+
+def V6_vS32b_new_ai : T_vstore_new_ai_64B <"vS32b_ai">, V6_vS32b_new_ai_enc; // same base opcode string as V6_vS32b_ai
+def V6_vS32b_new_ai_128B : T_vstore_new_ai_128B <"vS32b_ai">,
+ V6_vS32b_new_ai_128B_enc;
+
+let isNonTemporal = 1 in { // ":nt" forms (isNT argument = 1)
+ def V6_vS32b_nt_new_ai : T_vstore_new_ai_64B<"vS32b_ai", 1>,
+ V6_vS32b_nt_new_ai_enc;
+ def V6_vS32b_nt_new_ai_128B : T_vstore_new_ai_128B<"vS32b_ai", 1>,
+ V6_vS32b_nt_new_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - conditional
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, isPredicated = 1 in
+class T_vstore_pred_ai <string mnemonic, string baseOp, Operand ImmOp,
+ RegisterClass RC, bit isPredNot = 0, bit isNT = 0>
+ : V6_STInst <(outs),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), // $src1 is the guarding predicate
+ "if ("#!if(isPredNot, "!", "")#"$src1) " // isPredNot prints "!" before the predicate
+ #mnemonic#"($src2+#$src3)"#!if(isNT, ":nt", "")#" = $src4">, NewValueRel {
+ let isPredicatedFalse = isPredNot;
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in // 64B wrapper: s4_6Imm offset, VectorRegs data
+class T_vstore_pred_ai_64B <string mnemonic, string baseOp,
+ bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_pred_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s4_7Imm offset, VectorRegs128B data
+class T_vstore_pred_ai_128B <string mnemonic, string baseOp,
+ bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_pred_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B,
+ isPredNot, isNT>;
+
+let isNVStorable = 1 in { // predicated vmem stores; trailing template args are <isPredNot, isNT>
+ def V6_vS32b_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai">,
+ V6_vS32b_pred_ai_enc;
+ def V6_vS32b_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1>,
+ V6_vS32b_npred_ai_enc;
+ // 128B
+ def V6_vS32b_pred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai">,
+ V6_vS32b_pred_ai_128B_enc;
+ def V6_vS32b_npred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai", 1>,
+ V6_vS32b_npred_ai_128B_enc;
+}
+
+
+let isNVStorable = 1, isNonTemporal = 1 in { // ":nt" predicated stores
+ def V6_vS32b_nt_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 0, 1>,
+ V6_vS32b_nt_pred_ai_enc;
+ def V6_vS32b_nt_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1, 1>,
+ V6_vS32b_nt_npred_ai_enc;
+ // 128B
+ def V6_vS32b_nt_pred_ai_128B : T_vstore_pred_ai_128B
+ <"vmem", "vS32b_ai", 0, 1>,
+ V6_vS32b_nt_pred_ai_128B_enc;
+ def V6_vS32b_nt_npred_ai_128B : T_vstore_pred_ai_128B
+ <"vmem", "vS32b_ai", 1, 1>,
+ V6_vS32b_nt_npred_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { // predicated vmemu stores
+ def V6_vS32Ub_pred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai">,
+ V6_vS32Ub_pred_ai_enc;
+ def V6_vS32Ub_npred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai", 1>,
+ V6_vS32Ub_npred_ai_enc;
+ // 128B
+ def V6_vS32Ub_pred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai">,
+ V6_vS32Ub_pred_ai_128B_enc;
+ def V6_vS32Ub_npred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai", 1>,
+ V6_vS32Ub_npred_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - byte-enabled aligned
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset in // note: isPredicated is not set for this class
+class T_vstore_qpred_ai <Operand ImmOp, RegisterClass RC,
+ bit isPredNot = 0, bit isNT = 0>
+ : V6_STInst <(outs),
+ (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), // guard is a VecPredRegs operand, not PredRegs
+ "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
+ #!if(isNT, ":nt", "")#" = $src4"> {
+ let isPredicatedFalse = isPredNot;
+}
+
+let accessSize = Vector64Access in // 64B wrapper: s4_6Imm offset, VectorRegs data
+class T_vstore_qpred_ai_64B <bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_qpred_ai <s4_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s4_7Imm offset, VectorRegs128B data
+class T_vstore_qpred_ai_128B <bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_qpred_ai <s4_7Imm, VectorRegs128B, isPredNot, isNT>;
+
+def V6_vS32b_qpred_ai : T_vstore_qpred_ai_64B, V6_vS32b_qpred_ai_enc; // template args are <isPredNot, isNT>, both default 0
+def V6_vS32b_nqpred_ai : T_vstore_qpred_ai_64B <1>,
+ V6_vS32b_nqpred_ai_enc;
+def V6_vS32b_nt_qpred_ai : T_vstore_qpred_ai_64B <0, 1>, // NOTE(review): no isNonTemporal let here, unlike the _pi qpred defs - confirm
+ V6_vS32b_nt_qpred_ai_enc;
+def V6_vS32b_nt_nqpred_ai : T_vstore_qpred_ai_64B <1, 1>,
+ V6_vS32b_nt_nqpred_ai_enc;
+// 128B
+def V6_vS32b_qpred_ai_128B : T_vstore_qpred_ai_128B, V6_vS32b_qpred_ai_128B_enc;
+def V6_vS32b_nqpred_ai_128B : T_vstore_qpred_ai_128B<1>,
+ V6_vS32b_nqpred_ai_128B_enc;
+def V6_vS32b_nt_qpred_ai_128B : T_vstore_qpred_ai_128B<0, 1>,
+ V6_vS32b_nt_qpred_ai_128B_enc;
+def V6_vS32b_nt_nqpred_ai_128B : T_vstore_qpred_ai_128B<1, 1>,
+ V6_vS32b_nt_nqpred_ai_128B_enc;
+
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - conditional new
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, isPredicated = 1, isNewValue = 1, opNewValue = 3, // opNewValue 3: presumably the index of $src4 (no outs here) - confirm
+ isNVStore = 1, Type = TypeCVI_VM_NEW_ST, Itinerary = CVI_VM_NEW_ST in
+class T_vstore_new_pred_ai <string baseOp, Operand ImmOp, RegisterClass RC,
+ bit isPredNot, bit isNT>
+ : V6_STInst <(outs),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+ "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)" // asm text starts "if(" here, "if (" in T_vstore_pred_ai
+ #!if(isNT, ":nt", "")#" = $src4.new">, NewValueRel {
+ let isPredicatedFalse = isPredNot;
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in // 64B wrapper: s4_6Imm offset, VectorRegs data
+class T_vstore_new_pred_ai_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_new_pred_ai <baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s4_7Imm offset, VectorRegs128B data
+class T_vstore_new_pred_ai_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_new_pred_ai <baseOp#"128B", s4_7Imm, VectorRegs128B,
+ isPredNot, isNT>;
+
+
+def V6_vS32b_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai">, // args after baseOp are <isPredNot, isNT>
+ V6_vS32b_new_pred_ai_enc;
+def V6_vS32b_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1>,
+ V6_vS32b_new_npred_ai_enc;
+// 128B
+def V6_vS32b_new_pred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai">,
+ V6_vS32b_new_pred_ai_128B_enc;
+def V6_vS32b_new_npred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai", 1>,
+ V6_vS32b_new_npred_ai_128B_enc;
+let isNonTemporal = 1 in { // ":nt" forms
+ def V6_vS32b_nt_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 0, 1>,
+ V6_vS32b_nt_new_pred_ai_enc;
+ def V6_vS32b_nt_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1, 1>,
+ V6_vS32b_nt_new_npred_ai_enc;
+ // 128B
+ def V6_vS32b_nt_new_pred_ai_128B : T_vstore_new_pred_ai_128B
+ <"vS32b_ai", 0, 1>,
+ V6_vS32b_nt_new_pred_ai_128B_enc;
+ def V6_vS32b_nt_new_npred_ai_128B : T_vstore_new_pred_ai_128B
+ <"vS32b_ai", 1, 1>,
+ V6_vS32b_nt_new_npred_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc, hasNewValue = 1 in
+class T_vload_pi<string asmStr, Operand ImmOp, RegisterClass RC>
+ : V6_LDInst <(outs RC:$dst, IntRegs:$_dst_), // second output $_dst_ is tied to the base register $src1
+ (ins IntRegs:$src1, ImmOp:$src2), asmStr, [],
+ "$src1 = $_dst_">;
+
+let accessSize = Vector64Access in // 64B wrapper: s3_6Imm increment, VectorRegs result
+class T_vload_pi_64B <string asmStr>
+ : T_vload_pi <asmStr, s3_6Imm, VectorRegs>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s3_7Imm increment, VectorRegs128B result
+class T_vload_pi_128B <string asmStr>
+ : T_vload_pi <asmStr, s3_7Imm, VectorRegs128B>;
+
+let isCVLoadable = 1 in { // plain vmem post-increment loads
+ def V6_vL32b_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2)">,
+ V6_vL32b_pi_enc;
+ def V6_vL32b_nt_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_pi_enc;
+ // 128B
+ def V6_vL32b_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2)">,
+ V6_vL32b_pi_128B_enc;
+ def V6_vL32b_nt_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_pi_128B_enc;
+}
+
+let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in { // vmemu post-increment loads
+ def V6_vL32Ub_pi : T_vload_pi_64B <"$dst = vmemu($src1++#$src2)">,
+ V6_vL32Ub_pi_enc;
+ // 128B
+ def V6_vL32Ub_pi_128B : T_vload_pi_128B <"$dst = vmemu($src1++#$src2)">,
+ V6_vL32Ub_pi_128B_enc;
+}
+
+let isCVLoad = 1, Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD in { // ".cur" post-increment loads
+ def V6_vL32b_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2)">,
+ V6_vL32b_cur_pi_enc;
+ def V6_vL32b_nt_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_cur_pi_enc;
+ // 128B
+ def V6_vL32b_cur_pi_128B : T_vload_pi_128B
+ <"$dst.cur = vmem($src1++#$src2)">,
+ V6_vL32b_cur_pi_128B_enc;
+ def V6_vL32b_nt_cur_pi_128B : T_vload_pi_128B
+ <"$dst.cur = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_cur_pi_128B_enc;
+}
+
+let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in { // ".tmp" post-increment loads
+ def V6_vL32b_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2)">,
+ V6_vL32b_tmp_pi_enc;
+ def V6_vL32b_nt_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_tmp_pi_enc;
+ //128B
+ def V6_vL32b_tmp_pi_128B : T_vload_pi_128B
+ <"$dst.tmp = vmem($src1++#$src2)">,
+ V6_vL32b_tmp_pi_128B_enc;
+ def V6_vL32b_nt_tmp_pi_128B : T_vload_pi_128B
+ <"$dst.tmp = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_tmp_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with immediate offset.
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc, isPredicable = 1 in
+class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp,
+ RegisterClass RC, bit isNT>
+ : V6_STInst <(outs IntRegs:$_dst_), // output $_dst_ is tied to the base register $src1
+ (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
+ "$src1 = $_dst_">, NewValueRel {
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in // 64B wrapper: s3_6Imm increment, VectorRegs data
+class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0>
+ : T_vstore_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s3_7Imm increment, VectorRegs128B data
+class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0>
+ : T_vstore_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
+
+let isNVStorable = 1 in { // aligned vmem post-increment stores
+ def V6_vS32b_pi : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc;
+ def V6_vS32b_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi">,
+ V6_vS32b_pi_128B_enc;
+}
+
+let isNVStorable = 1 , isNonTemporal = 1 in { // ":nt" forms (isNT argument = 1)
+ def V6_vS32b_nt_pi : T_vstore_pi_64B <"vmem", "vS32b_pi", 1>,
+ V6_vS32b_nt_pi_enc;
+ def V6_vS32b_nt_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi", 1>,
+ V6_vS32b_nt_pi_128B_enc;
+}
+
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { // vmemu post-increment stores
+ def V6_vS32Ub_pi : T_vstore_pi_64B <"vmemu", "vS32Ub_pi">,
+ V6_vS32Ub_pi_enc;
+ def V6_vS32Ub_pi_128B : T_vstore_pi_128B <"vmemu", "vS32Ub_pi">,
+ V6_vS32Ub_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment unconditional .new vector stores with immediate offset.
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc, isNVStore = 1 in // isNVStore is set again by the inner let below (same value)
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
+ isPredicable = 1, opNewValue = 3, isNVStore = 1 in // opNewValue 3: presumably the index of $src3 counting output $_dst_ - confirm
+class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ "vmem($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
+ "$src1 = $_dst_">, NewValueRel {
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in // 64B wrapper: s3_6Imm increment, VectorRegs data
+class T_vstore_new_pi_64B <string baseOp, bit isNT = 0>
+ : T_vstore_new_pi <baseOp, s3_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s3_7Imm increment, VectorRegs128B data
+class T_vstore_new_pi_128B <string baseOp, bit isNT = 0>
+ : T_vstore_new_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
+
+
+def V6_vS32b_new_pi : T_vstore_new_pi_64B <"vS32b_pi">, // same base opcode string as V6_vS32b_pi
+ V6_vS32b_new_pi_enc;
+def V6_vS32b_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi">,
+ V6_vS32b_new_pi_128B_enc;
+
+let isNonTemporal = 1 in { // ":nt" forms (isNT argument = 1)
+ def V6_vS32b_nt_new_pi : T_vstore_new_pi_64B <"vS32b_pi", 1>,
+ V6_vS32b_nt_new_pi_enc;
+ def V6_vS32b_nt_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi", 1>,
+ V6_vS32b_nt_new_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional vector stores with immediate offset
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, addrMode = PostInc in
+class T_vstore_pred_pi <string mnemonic, string baseOp, Operand ImmOp,
+ RegisterClass RC, bit isPredNot, bit isNT>
+ : V6_STInst<(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), // $src1 is the guard; $src2 is the base register
+ "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++#$src3)"
+ #!if(isNT, ":nt", "")#" = $src4", [],
+ "$src2 = $_dst_">, NewValueRel { // output $_dst_ is tied to base register $src2
+ let isPredicatedFalse = isPredNot;
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in // 64B wrapper: s3_6Imm increment, VectorRegs data
+class T_vstore_pred_pi_64B <string mnemonic, string baseOp,
+ bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_pred_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s3_7Imm increment, VectorRegs128B data
+class T_vstore_pred_pi_128B <string mnemonic, string baseOp,
+ bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_pred_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B,
+ isPredNot, isNT>;
+
+let isNVStorable = 1 in { // predicated vmem post-increment stores; trailing args are <isPredNot, isNT>
+ def V6_vS32b_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi">,
+ V6_vS32b_pred_pi_enc;
+ def V6_vS32b_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1>,
+ V6_vS32b_npred_pi_enc;
+ // 128B
+ def V6_vS32b_pred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi">,
+ V6_vS32b_pred_pi_128B_enc;
+ def V6_vS32b_npred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi", 1>,
+ V6_vS32b_npred_pi_128B_enc;
+}
+let isNVStorable = 1, isNonTemporal = 1 in { // ":nt" predicated forms
+ def V6_vS32b_nt_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 0, 1>,
+ V6_vS32b_nt_pred_pi_enc;
+ def V6_vS32b_nt_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1, 1>,
+ V6_vS32b_nt_npred_pi_enc;
+ // 128B
+ def V6_vS32b_nt_pred_pi_128B : T_vstore_pred_pi_128B
+ <"vmem", "vS32b_pi", 0, 1>,
+ V6_vS32b_nt_pred_pi_128B_enc;
+ def V6_vS32b_nt_npred_pi_128B : T_vstore_pred_pi_128B
+ <"vmem", "vS32b_pi", 1, 1>,
+ V6_vS32b_nt_npred_pi_128B_enc;
+}
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { // predicated vmemu post-increment stores
+ def V6_vS32Ub_pred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi">,
+ V6_vS32Ub_pred_pi_enc;
+ def V6_vS32Ub_npred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi", 1>,
+ V6_vS32Ub_npred_pi_enc;
+ // 128B
+ def V6_vS32Ub_pred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi">,
+ V6_vS32Ub_pred_pi_128B_enc;
+ def V6_vS32Ub_npred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi", 1>,
+ V6_vS32Ub_npred_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with immediate offset - byte-enabled aligned
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc in // NOTE(review): unlike T_vstore_qpred_ai, isPredicatedFalse is not set from isPredNot here - confirm intended
+class T_vstore_qpred_pi <Operand ImmOp, RegisterClass RC, bit isPredNot = 0,
+ bit isNT = 0>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), // guard is a VecPredRegs operand
+ "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)" // isPredNot only affects the asm text in this class
+ #!if(isNT, ":nt", "")#" = $src4", [],
+ "$src2 = $_dst_">;
+
+let accessSize = Vector64Access in // 64B wrapper: s3_6Imm increment, VectorRegs data
+class T_vstore_qpred_pi_64B <bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_qpred_pi <s3_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B wrapper: s3_7Imm increment, VectorRegs128B data
+class T_vstore_qpred_pi_128B <bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_qpred_pi <s3_7Imm, VectorRegs128B, isPredNot, isNT>;
+
+def V6_vS32b_qpred_pi : T_vstore_qpred_pi_64B, V6_vS32b_qpred_pi_enc; // template args are <isPredNot, isNT>, both default 0
+def V6_vS32b_nqpred_pi : T_vstore_qpred_pi_64B <1>, V6_vS32b_nqpred_pi_enc;
+// 128B
+def V6_vS32b_qpred_pi_128B : T_vstore_qpred_pi_128B,
+ V6_vS32b_qpred_pi_128B_enc;
+def V6_vS32b_nqpred_pi_128B : T_vstore_qpred_pi_128B<1>,
+ V6_vS32b_nqpred_pi_128B_enc;
+
+let isNonTemporal = 1 in { // ":nt" forms
+ def V6_vS32b_nt_qpred_pi : T_vstore_qpred_pi_64B <0, 1>,
+ V6_vS32b_nt_qpred_pi_enc;
+ def V6_vS32b_nt_nqpred_pi : T_vstore_qpred_pi_64B <1, 1>,
+ V6_vS32b_nt_nqpred_pi_enc;
+ // 128B
+ def V6_vS32b_nt_qpred_pi_128B : T_vstore_qpred_pi_128B<0, 1>,
+ V6_vS32b_nt_qpred_pi_128B_enc;
+ def V6_vS32b_nt_nqpred_pi_128B : T_vstore_qpred_pi_128B<1, 1>,
+ V6_vS32b_nt_nqpred_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional .new vector stores with immediate offset
+//===----------------------------------------------------------------------===//
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
+ isNewValue = 1, opNewValue = 4, addrMode = PostInc, isNVStore = 1 in // opNewValue = 4: presumably the index of $src4 (the ".new" operand) with outs counted first - confirm
+class T_vstore_new_pred_pi <string baseOp, Operand ImmOp, RegisterClass RC,
+ bit isPredNot, bit isNT> // no defaults here; the _64B/_128B wrappers supply them
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+ "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
+ #!if(isNT, ":nt", "")#" = $src4.new", [], // stored value is printed with a ".new" suffix
+ "$src2 = $_dst_"> , NewValueRel { // constraint ties the base register $src2 to $_dst_
+ let isPredicatedFalse = isPredNot; // negated-predicate forms are flagged as predicated-false
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in // 64B flavour: s3_6Imm increment, VectorRegs data, baseOp passed through unchanged
+class T_vstore_new_pred_pi_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_new_pred_pi <baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in // 128B flavour: codegen-only, s3_7Imm increment, VectorRegs128B data
+class T_vstore_new_pred_pi_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_new_pred_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, // BaseOpcode gets a "128B" suffix
+ isPredNot, isNT>;
+
+def V6_vS32b_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi">, // all records in this group use base opcode string "vS32b_pi"
+ V6_vS32b_new_pred_pi_enc;
+def V6_vS32b_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1>, // isPredNot = 1
+ V6_vS32b_new_npred_pi_enc;
+// 128B counterparts of the two definitions above.
+def V6_vS32b_new_pred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi">,
+ V6_vS32b_new_pred_pi_128B_enc;
+def V6_vS32b_new_npred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi", 1>,
+ V6_vS32b_new_npred_pi_128B_enc;
+let isNonTemporal = 1 in { // ":nt" forms: isNT = 1 is passed as the last template argument
+ def V6_vS32b_nt_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 0, 1>,
+ V6_vS32b_nt_new_pred_pi_enc;
+ def V6_vS32b_nt_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1, 1>,
+ V6_vS32b_nt_new_npred_pi_enc;
+ // 128B counterparts of the two non-temporal definitions above.
+ def V6_vS32b_nt_new_pred_pi_128B : T_vstore_new_pred_pi_128B
+ <"vS32b_pi", 0, 1>,
+ V6_vS32b_nt_new_pred_pi_128B_enc;
+ def V6_vS32b_nt_new_npred_pi_128B : T_vstore_new_pred_pi_128B
+ <"vS32b_pi", 1, 1>,
+ V6_vS32b_nt_new_npred_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector loads with register offset
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1 in // template for vector loads whose address is post-incremented by a ModRegs register
+class T_vload_ppu<string asmStr> // asmStr: complete assembly string, supplied by each def
+ : V6_LDInst <(outs VectorRegs:$dst, IntRegs:$_dst_), // two results: the loaded vector and the $_dst_ register
+ (ins IntRegs:$src1, ModRegs:$src2), asmStr, [],
+ "$src1 = $_dst_">, NewValueRel; // constraint ties the base register $src1 to $_dst_
+
+let isCVLoadable = 1 in { // plain vmem loads, with and without ":nt"
+ def V6_vL32b_ppu : T_vload_ppu <"$dst = vmem($src1++$src2)">,
+ V6_vL32b_ppu_enc;
+ def V6_vL32b_nt_ppu : T_vload_ppu <"$dst = vmem($src1++$src2):nt">,
+ V6_vL32b_nt_ppu_enc;
+}
+
+let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in // vmemu form gets its own itinerary/type
+def V6_vL32Ub_ppu : T_vload_ppu <"$dst = vmemu($src1++$src2)">,
+ V6_vL32Ub_ppu_enc;
+
+let isCVLoad = 1, Itinerary = CVI_VM_CUR_LD, Type = TypeCVI_VM_CUR_LD in { // "$dst.cur" forms
+ def V6_vL32b_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2)">,
+ V6_vL32b_cur_ppu_enc;
+ def V6_vL32b_nt_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2):nt">,
+ V6_vL32b_nt_cur_ppu_enc;
+}
+
+let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in { // "$dst.tmp" forms
+ def V6_vL32b_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2)">,
+ V6_vL32b_tmp_ppu_enc;
+ def V6_vL32b_nt_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2):nt">,
+ V6_vL32b_nt_tmp_ppu_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with register offset
+//===----------------------------------------------------------------------===//
+let isPredicable = 1 in // template for unconditional vector stores post-incremented by a ModRegs register
+class T_vstore_ppu <string mnemonic, bit isNT = 0> // mnemonic: text printed before the address (the defs pass "vmem" or "vmemu")
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
+ mnemonic#"($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3", [], // isNT appends ":nt"
+ "$src1 = $_dst_">, NewValueRel; // constraint ties the base register $src1 to $_dst_
+
+let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in { // both records share these two fields
+ def V6_vS32b_ppu : T_vstore_ppu <"vmem">,
+ V6_vS32b_ppu_enc;
+ let isNonTemporal = 1 in // BaseOpcode comes from the enclosing let; only the non-temporal flag is added here
+ def V6_vS32b_nt_ppu : T_vstore_ppu <"vmem", 1>,
+ V6_vS32b_nt_ppu_enc;
+}
+
+let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in // vmemu store: own base opcode and itinerary/type
+def V6_vS32Ub_ppu : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc;
+
+//===----------------------------------------------------------------------===//
+// Post increment .new vector stores with register offset
+//===----------------------------------------------------------------------===//
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
+ isPredicable = 1, opNewValue = 3, isNVStore = 1 in // opNewValue = 3: presumably the index of $src3 (the ".new" operand) with outs counted first - confirm
+class T_vstore_new_ppu <bit isNT = 0> // isNT appends ":nt" to the assembly string
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
+ "vmem($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [], // stored value is printed with a ".new" suffix
+ "$src1 = $_dst_">, NewValueRel; // constraint ties the base register $src1 to $_dst_
+
+let BaseOpcode = "vS32b_ppu" in // same BaseOpcode string as the non-.new V6_vS32b_ppu store
+def V6_vS32b_new_ppu : T_vstore_new_ppu, V6_vS32b_new_ppu_enc;
+
+let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in // ":nt" form (isNT = 1)
+def V6_vS32b_nt_new_ppu : T_vstore_new_ppu<1>, V6_vS32b_nt_new_ppu_enc;
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional vector stores with register offset
+//===----------------------------------------------------------------------===//
+let isPredicated = 1 in // template for scalar-predicated (PredRegs) vector stores with register post-increment
+class T_vstore_pred_ppu <string mnemonic, bit isPredNot = 0, bit isNT = 0> // mnemonic: "vmem" or "vmemu" in the defs that use this class
+ : V6_STInst<(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
+ "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++$src3)"
+ #!if(isNT, ":nt", "")#" = $src4", [], // stored value has no ".new" suffix in this class
+ "$src2 = $_dst_">, NewValueRel { // constraint ties the base register $src2 to $_dst_
+ let isPredicatedFalse = isPredNot; // negated-predicate forms are flagged as predicated-false
+}
+
+let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in { // true / negated predicate forms of the vmem store
+ def V6_vS32b_pred_ppu : T_vstore_pred_ppu<"vmem">, V6_vS32b_pred_ppu_enc;
+ def V6_vS32b_npred_ppu: T_vstore_pred_ppu<"vmem", 1>, V6_vS32b_npred_ppu_enc;
+}
+
+let isNVStorable = 1, BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in { // same pair with ":nt" (isNT = 1)
+ def V6_vS32b_nt_pred_ppu : T_vstore_pred_ppu <"vmem", 0, 1>,
+ V6_vS32b_nt_pred_ppu_enc;
+ def V6_vS32b_nt_npred_ppu : T_vstore_pred_ppu <"vmem", 1, 1>,
+ V6_vS32b_nt_npred_ppu_enc;
+}
+
+let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU,
+ Type = TypeCVI_VM_STU in { // vmemu pair: isNVStorable is not set for these
+ def V6_vS32Ub_pred_ppu : T_vstore_pred_ppu <"vmemu">,
+ V6_vS32Ub_pred_ppu_enc;
+ def V6_vS32Ub_npred_ppu : T_vstore_pred_ppu <"vmemu", 1>,
+ V6_vS32Ub_npred_ppu_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with register offset - byte-enabled aligned
+//===----------------------------------------------------------------------===//
+class T_vstore_qpred_ppu <bit isPredNot = 0, bit isNT = 0> // vector-predicated store with register post-increment
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins VecPredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), // $src1 is a vector predicate register
+ "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
+ #!if(isNT, ":nt", "")#" = $src4", [], // isPredNot prints "!", isNT appends ":nt"
+ "$src2 = $_dst_">, NewValueRel; // constraint ties the base register $src2 to $_dst_
+
+def V6_vS32b_qpred_ppu : T_vstore_qpred_ppu, V6_vS32b_qpred_ppu_enc; // defaults: isPredNot = 0, isNT = 0
+def V6_vS32b_nqpred_ppu : T_vstore_qpred_ppu<1>, V6_vS32b_nqpred_ppu_enc; // negated predicate
+def V6_vS32b_nt_qpred_ppu : T_vstore_qpred_ppu<0, 1>, // ":nt" form; unlike the _pi group there is no enclosing isNonTemporal let here
+ V6_vS32b_nt_qpred_ppu_enc;
+def V6_vS32b_nt_nqpred_ppu : T_vstore_qpred_ppu<1, 1>, // ":nt" form with negated predicate
+ V6_vS32b_nt_nqpred_ppu_enc;
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional .new vector stores with register offset
+//===----------------------------------------------------------------------===//
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
+ isNewValue = 1, opNewValue = 4, isNVStore = 1 in // opNewValue = 4: presumably the index of $src4 (the ".new" operand) with outs counted first - confirm
+class T_vstore_new_pred_ppu <bit isPredNot = 0, bit isNT = 0>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
+ "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
+ #!if(isNT, ":nt", "")#" = $src4.new", [], // stored value is printed with a ".new" suffix
+ "$src2 = $_dst_">, NewValueRel { // constraint ties the base register $src2 to $_dst_
+ let isPredicatedFalse = isPredNot; // negated-predicate forms are flagged as predicated-false
+}
+
+let BaseOpcode = "vS32b_ppu" in { // true / negated predicate .new stores
+ def V6_vS32b_new_pred_ppu : T_vstore_new_pred_ppu,
+ V6_vS32b_new_pred_ppu_enc;
+ def V6_vS32b_new_npred_ppu : T_vstore_new_pred_ppu<1>,
+ V6_vS32b_new_npred_ppu_enc;
+}
+
+let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in { // same pair with ":nt" (isNT = 1)
+def V6_vS32b_nt_new_pred_ppu : T_vstore_new_pred_ppu<0, 1>,
+ V6_vS32b_nt_new_pred_ppu_enc;
+def V6_vS32b_nt_new_npred_ppu : T_vstore_new_pred_ppu<1, 1>,
+ V6_vS32b_nt_new_npred_ppu_enc;
+}
+
+
+// Vector load/store pseudos
+
+let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in // store pseudo: base address, s32_0Imm offset, RC source; empty asm string
+class STrivv_template<RegisterClass RC>
+ : V6_STInst<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src), "", []>;
+
+def PS_vstorerw_ai: STrivv_template<VecDblRegs>, // VecDblRegs forms require UseHVXSgl
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerwu_ai: STrivv_template<VecDblRegs>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerw_ai_128B: STrivv_template<VecDblRegs128B>, // VecDblRegs128B forms require UseHVXDbl
+ Requires<[HasV60T,UseHVXDbl]>;
+def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+
+let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in // load pseudo: RC result from base address plus s32_0Imm offset; empty asm string
+class LDrivv_template<RegisterClass RC>
+ : V6_LDInst<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off), "", []>;
+
+def PS_vloadrw_ai: LDrivv_template<VecDblRegs>, // VecDblRegs forms require UseHVXSgl
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrwu_ai: LDrivv_template<VecDblRegs>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrw_ai_128B: LDrivv_template<VecDblRegs128B>, // VecDblRegs128B forms require UseHVXDbl
+ Requires<[HasV60T,UseHVXDbl]>;
+def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+// Store vector predicate pseudo.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { // opExtendable = 1: presumably $offset (there are no outs) - confirm
+ def PS_vstorerq_ai : STInst<(outs),
+ (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1),
+ ".error \"should not emit\"", []>, // asm string is an .error directive
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_vstorerq_ai_128B : STInst<(outs),
+ (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1), // 128B predicate register class
+ ".error \"should not emit\"", []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+// Load vector predicate pseudo.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { // opExtendable = 2: presumably $offset ($dst is operand 0) - confirm
+ def PS_vloadrq_ai : LDInst<(outs VecPredRegs:$dst),
+ (ins IntRegs:$base, s32_0Imm:$offset),
+ ".error \"should not emit\"", []>, // asm string is an .error directive
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_vloadrq_ai_128B : LDInst<(outs VecPredRegs128B:$dst), // 128B predicate register class
+ (ins IntRegs:$base, s32_0Imm:$offset),
+ ".error \"should not emit\"", []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], // forwards every argument to InstHexagon
+ string cstr = "", InstrItinClass itin = CVI_VA_DV, // default itinerary: CVI_VA_DV
+ IType type = TypeCVI_VA_DV> // default type: TypeCVI_VA_DV
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { // select pseudos: a PredRegs $src1 plus two same-class vector operands; empty asm string
+ def PS_vselect: VSELInst<(outs VectorRegs:$dst),
+ (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), "", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_vselect_128B: VSELInst<(outs VectorRegs128B:$dst),
+ (ins PredRegs:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
+ "", []>, Requires<[HasV60T,UseHVXDbl]>;
+ def PS_wselect: VSELInst<(outs VecDblRegs:$dst), // PS_wselect* operate on the VecDblRegs classes
+ (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), "", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_wselect_128B: VSELInst<(outs VecDblRegs128B:$dst),
+ (ins PredRegs:$src1, VecDblRegs128B:$src2, VecDblRegs128B:$src3),
+ "", []>, Requires<[HasV60T,UseHVXDbl]>;
+}
+
+let hasNewValue = 1 in // vector (RCin) x scalar (IntRegs) instruction template
+class T_vmpy <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2), // $src2 is always a scalar IntRegs operand
+ asmString >;
+
+multiclass T_vmpy <string asmString, RegisterClass RCout, // emits NAME plus a codegen-only NAME_128B twin
+ RegisterClass RCin> {
+ def NAME : T_vmpy <asmString, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_vmpy <asmString, !cast<RegisterClass>(RCout#"128B"), // 128B register classes are looked up by appending "128B" to the class name
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_vmpy_VV <string asmString>: // suffix letters give <output><input>: V = VectorRegs, W = VecDblRegs
+ T_vmpy <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_vmpy_WW <string asmString>: // VecDblRegs result, VecDblRegs input
+ T_vmpy <asmString, VecDblRegs, VecDblRegs>;
+
+multiclass T_vmpy_VW <string asmString>: // VectorRegs result, VecDblRegs input
+ T_vmpy <asmString, VectorRegs, VecDblRegs>;
+
+multiclass T_vmpy_WV <string asmString>: // VecDblRegs result, VectorRegs input
+ T_vmpy <asmString, VecDblRegs, VectorRegs>;
+
+defm V6_vtmpyb :T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">, V6_vtmpyb_enc; // each defm here yields the named record and its _128B twin
+defm V6_vtmpybus :T_vmpy_WW<"$dst.h = vtmpy($src1.ub,$src2.b)">, V6_vtmpybus_enc;
+defm V6_vdsaduh :T_vmpy_WW<"$dst.uw = vdsad($src1.uh,$src2.uh)">, V6_vdsaduh_enc;
+defm V6_vmpybus :T_vmpy_WV<"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybus_enc;
+defm V6_vmpabus :T_vmpy_WW<"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabus_enc;
+defm V6_vmpahb :T_vmpy_WW<"$dst.w = vmpa($src1.h,$src2.b)">, V6_vmpahb_enc;
+defm V6_vmpyh :T_vmpy_WV<"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyh_enc;
+defm V6_vmpyuh :T_vmpy_WV<"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuh_enc;
+defm V6_vmpyiwh :T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_enc;
+defm V6_vtmpyhb :T_vmpy_WW<"$dst.w = vtmpy($src1.h,$src2.b)">, V6_vtmpyhb_enc;
+defm V6_vmpyub :T_vmpy_WV<"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyub_enc;
+
+let Itinerary = CVI_VX_LONG, Type = TypeCVI_VX in // this one record pair gets an explicit itinerary/type
+defm V6_vmpyihb :T_vmpy_VV<"$dst.h = vmpyi($src1.h,$src2.b)">, V6_vmpyihb_enc;
+
+defm V6_vdmpybus_dv : // vector x scalar multiplies built from the T_vmpy_* multiclasses, no itinerary override
+ T_vmpy_WW <"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_dv_enc;
+defm V6_vdmpyhsusat :
+ T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.uh):sat">, V6_vdmpyhsusat_enc;
+defm V6_vdmpyhsuisat :
+ T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.uh,#1):sat">, V6_vdmpyhsuisat_enc;
+defm V6_vdmpyhsat :
+ T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhsat_enc;
+defm V6_vdmpyhisat : // same asm string as V6_vdmpyhsat; differs in taking a VecDblRegs input (T_vmpy_VW)
+ T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhisat_enc;
+defm V6_vdmpyhb_dv :
+ T_vmpy_WW <"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_dv_enc;
+defm V6_vmpyhss :
+ T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:sat">, V6_vmpyhss_enc;
+defm V6_vmpyhsrs :
+ T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhsrs_enc;
+
+let Itinerary = CVI_VP, Type = TypeCVI_VP in // vror reuses the T_vmpy operand shape with the CVI_VP itinerary/type
+defm V6_vror : T_vmpy_VV <"$dst = vror($src1,$src2)">, V6_vror_enc;
+
+let Itinerary = CVI_VX, Type = TypeCVI_VX in { // multiplies given the CVI_VX itinerary/type
+defm V6_vdmpyhb : T_vmpy_VV<"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_enc;
+defm V6_vrmpybus : T_vmpy_VV<"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybus_enc;
+defm V6_vdmpybus : T_vmpy_VV<"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_enc;
+defm V6_vmpyiwb : T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.b)">, V6_vmpyiwb_enc;
+defm V6_vrmpyub : T_vmpy_VV<"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyub_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in { // shifts by a scalar register, CVI_VS itinerary/type
+defm V6_vasrw : T_vmpy_VV <"$dst.w = vasr($src1.w,$src2)">, V6_vasrw_enc;
+defm V6_vasrh : T_vmpy_VV <"$dst.h = vasr($src1.h,$src2)">, V6_vasrh_enc;
+defm V6_vaslw : T_vmpy_VV <"$dst.w = vasl($src1.w,$src2)">, V6_vaslw_enc;
+defm V6_vaslh : T_vmpy_VV <"$dst.h = vasl($src1.h,$src2)">, V6_vaslh_enc;
+defm V6_vlsrw : T_vmpy_VV <"$dst.uw = vlsr($src1.uw,$src2)">, V6_vlsrw_enc;
+defm V6_vlsrh : T_vmpy_VV <"$dst.uh = vlsr($src1.uh,$src2)">, V6_vlsrh_enc;
+}
+
+let hasNewValue = 1 in // two-input vector instruction template; both inputs share the class RCin
+class T_HVX_alu <string asmString, InstrItinClass itin,
+ RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
+ asmString >{
+ let Itinerary = itin;
+ let Type = !cast<IType>("Type"#itin); // Type record is looked up by name: "Type" + itinerary name
+}
+
+multiclass T_HVX_alu <string asmString, RegisterClass RCout, // emits NAME plus a codegen-only NAME_128B twin
+ RegisterClass RCin, InstrItinClass itin> { // note: itin is last here but second in the class of the same name
+ def NAME : T_HVX_alu <asmString, itin, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_alu <asmString, itin,
+ !cast<RegisterClass>(RCout#"128B"), // 128B register classes are looked up by appending "128B"
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_alu_VV <string asmString>: // VectorRegs result and inputs, itinerary CVI_VA
+ T_HVX_alu <asmString, VectorRegs, VectorRegs, CVI_VA>;
+
+multiclass T_HVX_alu_WW <string asmString>: // VecDblRegs result and inputs, itinerary CVI_VA_DV
+ T_HVX_alu <asmString, VecDblRegs, VecDblRegs, CVI_VA_DV>;
+
+multiclass T_HVX_alu_WV <string asmString>: // VecDblRegs result from VectorRegs inputs, itinerary CVI_VX_DV
+ T_HVX_alu <asmString, VecDblRegs, VectorRegs, CVI_VX_DV>;
+
+
+let Itinerary = CVI_VX, Type = TypeCVI_VX in { // enclosing let supplies Itinerary/Type for this group
+defm V6_vrmpyubv :
+ T_HVX_alu_VV <"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyubv_enc;
+defm V6_vrmpybv :
+ T_HVX_alu_VV <"$dst.w = vrmpy($src1.b,$src2.b)">, V6_vrmpybv_enc;
+defm V6_vrmpybusv :
+ T_HVX_alu_VV <"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybusv_enc;
+defm V6_vabsdiffub :
+ T_HVX_alu_VV <"$dst.ub = vabsdiff($src1.ub,$src2.ub)">, V6_vabsdiffub_enc;
+defm V6_vabsdiffh :
+ T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.h,$src2.h)">, V6_vabsdiffh_enc;
+defm V6_vabsdiffuh :
+ T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.uh,$src2.uh)">, V6_vabsdiffuh_enc;
+defm V6_vabsdiffw :
+ T_HVX_alu_VV <"$dst.uw = vabsdiff($src1.w,$src2.w)">, V6_vabsdiffw_enc;
+}
+
+let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { // enclosing let supplies Itinerary/Type for this group
+defm V6_vdmpyhvsat :
+ T_HVX_alu_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhvsat_enc;
+defm V6_vmpyhvsrs :
+ T_HVX_alu_VV<"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhvsrs_enc;
+defm V6_vmpyih :
+ T_HVX_alu_VV <"$dst.h = vmpyi($src1.h,$src2.h)">, V6_vmpyih_enc;
+}
+
+defm V6_vand : // vector/vector ALU records; no enclosing let, so the CVI_VA itinerary passed by T_HVX_alu_VV applies
+ T_HVX_alu_VV <"$dst = vand($src1,$src2)">, V6_vand_enc;
+defm V6_vor :
+ T_HVX_alu_VV <"$dst = vor($src1,$src2)">, V6_vor_enc;
+defm V6_vxor :
+ T_HVX_alu_VV <"$dst = vxor($src1,$src2)">, V6_vxor_enc;
+defm V6_vaddw :
+ T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_enc;
+defm V6_vaddubsat :
+ T_HVX_alu_VV <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_enc;
+defm V6_vadduhsat :
+ T_HVX_alu_VV <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_enc;
+defm V6_vaddhsat :
+ T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_enc;
+defm V6_vaddwsat :
+ T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_enc;
+defm V6_vsubb :
+ T_HVX_alu_VV <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_enc;
+defm V6_vsubh :
+ T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_enc;
+defm V6_vsubw :
+ T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_enc;
+defm V6_vsububsat :
+ T_HVX_alu_VV <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_enc;
+defm V6_vsubuhsat :
+ T_HVX_alu_VV <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_enc;
+defm V6_vsubhsat :
+ T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_enc;
+defm V6_vsubwsat :
+ T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_enc;
+defm V6_vavgub :
+ T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub)">, V6_vavgub_enc;
+defm V6_vavguh :
+ T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh)">, V6_vavguh_enc;
+defm V6_vavgh :
+ T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h)">, V6_vavgh_enc;
+defm V6_vavgw :
+ T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w)">, V6_vavgw_enc;
+defm V6_vnavgub :
+ T_HVX_alu_VV <"$dst.b = vnavg($src1.ub,$src2.ub)">, V6_vnavgub_enc;
+defm V6_vnavgh :
+ T_HVX_alu_VV <"$dst.h = vnavg($src1.h,$src2.h)">, V6_vnavgh_enc;
+defm V6_vnavgw :
+ T_HVX_alu_VV <"$dst.w = vnavg($src1.w,$src2.w)">, V6_vnavgw_enc;
+defm V6_vavgubrnd :
+ T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub):rnd">, V6_vavgubrnd_enc;
+defm V6_vavguhrnd :
+ T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh):rnd">, V6_vavguhrnd_enc;
+defm V6_vavghrnd :
+ T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h):rnd">, V6_vavghrnd_enc;
+defm V6_vavgwrnd :
+ T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w):rnd">, V6_vavgwrnd_enc;
+
+defm V6_vmpybv : // T_HVX_alu_WV records: VecDblRegs result from two VectorRegs inputs
+ T_HVX_alu_WV <"$dst.h = vmpy($src1.b,$src2.b)">, V6_vmpybv_enc;
+defm V6_vmpyubv :
+ T_HVX_alu_WV <"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyubv_enc;
+defm V6_vmpybusv :
+ T_HVX_alu_WV <"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybusv_enc;
+defm V6_vmpyhv :
+ T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyhv_enc;
+defm V6_vmpyuhv :
+ T_HVX_alu_WV <"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuhv_enc;
+defm V6_vmpyhus :
+ T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.uh)">, V6_vmpyhus_enc;
+defm V6_vaddubh :
+ T_HVX_alu_WV <"$dst.h = vadd($src1.ub,$src2.ub)">, V6_vaddubh_enc;
+defm V6_vadduhw :
+ T_HVX_alu_WV <"$dst.w = vadd($src1.uh,$src2.uh)">, V6_vadduhw_enc;
+defm V6_vaddhw :
+ T_HVX_alu_WV <"$dst.w = vadd($src1.h,$src2.h)">, V6_vaddhw_enc;
+defm V6_vsububh :
+ T_HVX_alu_WV <"$dst.h = vsub($src1.ub,$src2.ub)">, V6_vsububh_enc;
+defm V6_vsubuhw :
+ T_HVX_alu_WV <"$dst.w = vsub($src1.uh,$src2.uh)">, V6_vsubuhw_enc;
+defm V6_vsubhw :
+ T_HVX_alu_WV <"$dst.w = vsub($src1.h,$src2.h)">, V6_vsubhw_enc;
+
+defm V6_vaddb_dv : // T_HVX_alu_WW records: VecDblRegs result and inputs ("_dv" names)
+ T_HVX_alu_WW <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_dv_enc;
+defm V6_vaddh_dv :
+ T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_dv_enc;
+defm V6_vaddw_dv :
+ T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_dv_enc;
+defm V6_vaddubsat_dv :
+ T_HVX_alu_WW <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_dv_enc;
+defm V6_vadduhsat_dv :
+ T_HVX_alu_WW <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_dv_enc;
+defm V6_vaddhsat_dv :
+ T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_dv_enc;
+defm V6_vaddwsat_dv :
+ T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_dv_enc;
+defm V6_vsubb_dv :
+ T_HVX_alu_WW <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_dv_enc;
+defm V6_vsubh_dv :
+ T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_dv_enc;
+defm V6_vsubw_dv :
+ T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_dv_enc;
+defm V6_vsububsat_dv :
+ T_HVX_alu_WW <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_dv_enc;
+defm V6_vsubuhsat_dv :
+ T_HVX_alu_WW <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_dv_enc;
+defm V6_vsubhsat_dv :
+ T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_dv_enc;
+defm V6_vsubwsat_dv :
+ T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_dv_enc;
+
+let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV in { // enclosing let supplies Itinerary/Type for the two vmpa records
+defm V6_vmpabusv :
+ T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabusv_enc;
+defm V6_vmpabuuv :
+ T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.ub)">, V6_vmpabuuv_enc;
+}
+
+let isAccumulator = 1, hasNewValue = 1 in // accumulating instruction template
+class T_HVX_vmpyacc <string asmString, InstrItinClass itin, RegisterClass RCout,
+ RegisterClass RCin1, RegisterClass RCin2>
+ : CVI_VA_Resource1 <(outs RCout:$dst),
+ (ins RCout:$_src_, RCin1:$src1, RCin2:$src2), asmString, // $_src_ is the incoming accumulator value
+ [], "$dst = $_src_" > { // constraint ties $dst to $_src_
+ let Itinerary = itin;
+ let Type = !cast<IType>("Type"#itin); // Type record is looked up by name: "Type" + itinerary name
+}
+
+multiclass T_HVX_vmpyacc_both <string asmString, RegisterClass RCout, // emits NAME plus a codegen-only NAME_128B twin
+ RegisterClass RCin1, RegisterClass RCin2, InstrItinClass itin > {
+ def NAME : T_HVX_vmpyacc <asmString, itin, RCout, RCin1, RCin2>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vmpyacc <asmString, itin,
+ !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin1#"128B"),
+ !cast<RegisterClass>(RCin2# // IntRegs stays IntRegs; any other class gets the "128B" suffix
+ !if(!eq (!cast<string>(RCin2), "IntRegs"), "", "128B"))>;
+}
+
+multiclass T_HVX_vmpyacc_VVR <string asmString>: // suffix = <out><in1><in2>: V = VectorRegs, W = VecDblRegs, R = IntRegs; this is the only wrapper passing CVI_VX
+ T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, IntRegs, CVI_VX>;
+
+multiclass T_HVX_vmpyacc_VWR <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VectorRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_WVR <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, IntRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_WWR <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VecDblRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_VVV <string asmString>: // all-vector form: second input is VectorRegs rather than IntRegs
+ T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_WVV <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
+
+
+defm V6_vtmpyb_acc : // accumulating ("+=") forms whose second input is a scalar IntRegs operand (the *R wrappers)
+ T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.b,$src2.b)">,
+ V6_vtmpyb_acc_enc;
+defm V6_vtmpybus_acc :
+ T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.ub,$src2.b)">,
+ V6_vtmpybus_acc_enc;
+defm V6_vtmpyhb_acc :
+ T_HVX_vmpyacc_WWR <"$dst.w += vtmpy($src1.h,$src2.b)">,
+ V6_vtmpyhb_acc_enc;
+defm V6_vdmpyhb_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.b)">,
+ V6_vdmpyhb_acc_enc;
+defm V6_vrmpyub_acc :
+ T_HVX_vmpyacc_VVR <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
+ V6_vrmpyub_acc_enc;
+defm V6_vrmpybus_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vrmpy($src1.ub,$src2.b)">,
+ V6_vrmpybus_acc_enc;
+defm V6_vdmpybus_acc :
+ T_HVX_vmpyacc_VVR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
+ V6_vdmpybus_acc_enc;
+defm V6_vdmpybus_dv_acc :
+ T_HVX_vmpyacc_WWR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
+ V6_vdmpybus_dv_acc_enc;
+defm V6_vdmpyhsuisat_acc :
+ T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.uh,#1):sat">,
+ V6_vdmpyhsuisat_acc_enc;
+defm V6_vdmpyhisat_acc :
+ T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
+ V6_vdmpyhisat_acc_enc;
+defm V6_vdmpyhb_dv_acc :
+ T_HVX_vmpyacc_WWR <"$dst.w += vdmpy($src1.h,$src2.b)">,
+ V6_vdmpyhb_dv_acc_enc;
+defm V6_vmpybus_acc :
+ T_HVX_vmpyacc_WVR <"$dst.h += vmpy($src1.ub,$src2.b)">,
+ V6_vmpybus_acc_enc;
+defm V6_vmpabus_acc :
+ T_HVX_vmpyacc_WWR <"$dst.h += vmpa($src1.ub,$src2.b)">,
+ V6_vmpabus_acc_enc;
+defm V6_vmpahb_acc :
+ T_HVX_vmpyacc_WWR <"$dst.w += vmpa($src1.h,$src2.b)">,
+ V6_vmpahb_acc_enc;
+defm V6_vmpyhsat_acc :
+ T_HVX_vmpyacc_WVR <"$dst.w += vmpy($src1.h,$src2.h):sat">,
+ V6_vmpyhsat_acc_enc;
+defm V6_vmpyuh_acc :
+ T_HVX_vmpyacc_WVR <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
+ V6_vmpyuh_acc_enc;
+defm V6_vmpyiwb_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vmpyi($src1.w,$src2.b)">,
+ V6_vmpyiwb_acc_enc;
+defm V6_vdsaduh_acc :
+ T_HVX_vmpyacc_WWR <"$dst.uw += vdsad($src1.uh,$src2.uh)">,
+ V6_vdsaduh_acc_enc;
+defm V6_vmpyihb_acc :
+ T_HVX_vmpyacc_VVR <"$dst.h += vmpyi($src1.h,$src2.b)">,
+ V6_vmpyihb_acc_enc;
+defm V6_vmpyub_acc :
+ T_HVX_vmpyacc_WVR <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
+ V6_vmpyub_acc_enc;
+
+let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { // VVR records given CVI_VX_DV here rather than the wrapper's CVI_VX
+defm V6_vdmpyhsusat_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.uh):sat">,
+ V6_vdmpyhsusat_acc_enc;
+defm V6_vdmpyhsat_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
+ V6_vdmpyhsat_acc_enc;
+defm V6_vmpyiwh_acc : T_HVX_vmpyacc_VVR
+ <"$dst.w += vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_acc_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in { // accumulating shifts by a scalar register, CVI_VS itinerary/type
+defm V6_vaslw_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vasl($src1.w,$src2)">, V6_vaslw_acc_enc;
+defm V6_vasrw_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vasr($src1.w,$src2)">, V6_vasrw_acc_enc;
+}
+
+defm V6_vdmpyhvsat_acc : // accumulating forms whose inputs are both vector registers (VVV / WVV wrappers)
+ T_HVX_vmpyacc_VVV <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
+ V6_vdmpyhvsat_acc_enc;
+defm V6_vmpybusv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.ub,$src2.b)">,
+ V6_vmpybusv_acc_enc;
+defm V6_vmpybv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.b,$src2.b)">, V6_vmpybv_acc_enc;
+defm V6_vmpyhus_acc :
+ T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.uh)">, V6_vmpyhus_acc_enc;
+defm V6_vmpyhv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.h)">, V6_vmpyhv_acc_enc;
+defm V6_vmpyiewh_acc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.h)">,
+ V6_vmpyiewh_acc_enc;
+defm V6_vmpyiewuh_acc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.uh)">,
+ V6_vmpyiewuh_acc_enc;
+defm V6_vmpyih_acc :
+ T_HVX_vmpyacc_VVV <"$dst.h += vmpyi($src1.h,$src2.h)">, V6_vmpyih_acc_enc;
+defm V6_vmpyowh_rnd_sacc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:rnd:sat:shift">,
+ V6_vmpyowh_rnd_sacc_enc;
+defm V6_vmpyowh_sacc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:sat:shift">,
+ V6_vmpyowh_sacc_enc;
+defm V6_vmpyubv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
+ V6_vmpyubv_acc_enc;
+defm V6_vmpyuhv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
+ V6_vmpyuhv_acc_enc;
+defm V6_vrmpybusv_acc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.ub,$src2.b)">,
+ V6_vrmpybusv_acc_enc;
+defm V6_vrmpybv_acc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.b,$src2.b)">, V6_vrmpybv_acc_enc;
+defm V6_vrmpyubv_acc :
+ T_HVX_vmpyacc_VVV <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
+ V6_vrmpyubv_acc_enc;
+
+
+class T_HVX_vcmp <string asmString, RegisterClass RCout, RegisterClass RCin> // compare template whose result is combined with an existing RCout value
+ : CVI_VA_Resource1 <(outs RCout:$dst),
+ (ins RCout:$_src_, RCin:$src1, RCin:$src2), asmString, // $_src_ is the incoming RCout value
+ [], "$dst = $_src_" > { // constraint ties $dst to $_src_
+ let Itinerary = CVI_VA;
+ let Type = TypeCVI_VA;
+}
+
+multiclass T_HVX_vcmp <string asmString> { // emits NAME (VecPredRegs/VectorRegs) plus a codegen-only NAME_128B twin
+ def NAME : T_HVX_vcmp <asmString, VecPredRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vcmp <asmString, VecPredRegs128B, VectorRegs128B>; // 128B classes are named explicitly here, not via !cast
+}
+
+defm V6_veqb_and : // "&=" group: vcmp.eq / vcmp.gt over b, h, w and unsigned ub, uh, uw elements
+ T_HVX_vcmp <"$dst &= vcmp.eq($src1.b,$src2.b)">, V6_veqb_and_enc;
+defm V6_veqh_and :
+ T_HVX_vcmp <"$dst &= vcmp.eq($src1.h,$src2.h)">, V6_veqh_and_enc;
+defm V6_veqw_and :
+ T_HVX_vcmp <"$dst &= vcmp.eq($src1.w,$src2.w)">, V6_veqw_and_enc;
+defm V6_vgtb_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_and_enc;
+defm V6_vgth_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.h,$src2.h)">, V6_vgth_and_enc;
+defm V6_vgtw_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_and_enc;
+defm V6_vgtub_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_and_enc;
+defm V6_vgtuh_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_and_enc;
+defm V6_vgtuw_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_and_enc;
+defm V6_veqb_or : // "|=" group, same nine comparisons
+ T_HVX_vcmp <"$dst |= vcmp.eq($src1.b,$src2.b)">, V6_veqb_or_enc;
+defm V6_veqh_or :
+ T_HVX_vcmp <"$dst |= vcmp.eq($src1.h,$src2.h)">, V6_veqh_or_enc;
+defm V6_veqw_or :
+ T_HVX_vcmp <"$dst |= vcmp.eq($src1.w,$src2.w)">, V6_veqw_or_enc;
+defm V6_vgtb_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_or_enc;
+defm V6_vgth_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.h,$src2.h)">, V6_vgth_or_enc;
+defm V6_vgtw_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_or_enc;
+defm V6_vgtub_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_or_enc;
+defm V6_vgtuh_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_or_enc;
+defm V6_vgtuw_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_or_enc;
+defm V6_veqb_xor : // "^=" group, same nine comparisons
+ T_HVX_vcmp <"$dst ^= vcmp.eq($src1.b,$src2.b)">, V6_veqb_xor_enc;
+defm V6_veqh_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.eq($src1.h,$src2.h)">, V6_veqh_xor_enc;
+defm V6_veqw_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.eq($src1.w,$src2.w)">, V6_veqw_xor_enc;
+defm V6_vgtb_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_xor_enc;
+defm V6_vgth_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.h,$src2.h)">, V6_vgth_xor_enc;
+defm V6_vgtw_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_xor_enc;
+defm V6_vgtub_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_xor_enc;
+defm V6_vgtuh_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_xor_enc;
+defm V6_vgtuw_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_xor_enc;
+
+defm V6_vminub : // min/max and even/odd shuffle records; no enclosing let, so T_HVX_alu_VV's CVI_VA applies
+ T_HVX_alu_VV <"$dst.ub = vmin($src1.ub,$src2.ub)">, V6_vminub_enc;
+defm V6_vminuh :
+ T_HVX_alu_VV <"$dst.uh = vmin($src1.uh,$src2.uh)">, V6_vminuh_enc;
+defm V6_vminh :
+ T_HVX_alu_VV <"$dst.h = vmin($src1.h,$src2.h)">, V6_vminh_enc;
+defm V6_vminw :
+ T_HVX_alu_VV <"$dst.w = vmin($src1.w,$src2.w)">, V6_vminw_enc;
+defm V6_vmaxub :
+ T_HVX_alu_VV <"$dst.ub = vmax($src1.ub,$src2.ub)">, V6_vmaxub_enc;
+defm V6_vmaxuh :
+ T_HVX_alu_VV <"$dst.uh = vmax($src1.uh,$src2.uh)">, V6_vmaxuh_enc;
+defm V6_vmaxh :
+ T_HVX_alu_VV <"$dst.h = vmax($src1.h,$src2.h)">, V6_vmaxh_enc;
+defm V6_vmaxw :
+ T_HVX_alu_VV <"$dst.w = vmax($src1.w,$src2.w)">, V6_vmaxw_enc;
+defm V6_vshuffeb :
+ T_HVX_alu_VV <"$dst.b = vshuffe($src1.b,$src2.b)">, V6_vshuffeb_enc;
+defm V6_vshuffob :
+ T_HVX_alu_VV <"$dst.b = vshuffo($src1.b,$src2.b)">, V6_vshuffob_enc;
+defm V6_vshufeh : // record name is spelled "vshuf" while the mnemonic is "vshuffe"
+ T_HVX_alu_VV <"$dst.h = vshuffe($src1.h,$src2.h)">, V6_vshufeh_enc;
+defm V6_vshufoh :
+ T_HVX_alu_VV <"$dst.h = vshuffo($src1.h,$src2.h)">, V6_vshufoh_enc;
+
+let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { // enclosing let supplies Itinerary/Type for these word x halfword multiplies
+defm V6_vmpyowh_rnd :
+ T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:rnd:sat">,
+ V6_vmpyowh_rnd_enc;
+defm V6_vmpyiewuh :
+ T_HVX_alu_VV <"$dst.w = vmpyie($src1.w,$src2.uh)">, V6_vmpyiewuh_enc;
+defm V6_vmpyewuh :
+ T_HVX_alu_VV <"$dst.w = vmpye($src1.w,$src2.uh)">, V6_vmpyewuh_enc;
+defm V6_vmpyowh :
+ T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:sat">, V6_vmpyowh_enc;
+defm V6_vmpyiowh :
+ T_HVX_alu_VV <"$dst.w = vmpyio($src1.w,$src2.h)">, V6_vmpyiowh_enc;
+}
+let Itinerary = CVI_VX, Type = TypeCVI_VX in // single record pair placed under CVI_VX
+defm V6_vmpyieoh :
+ T_HVX_alu_VV <"$dst.w = vmpyieo($src1.h,$src2.h)">, V6_vmpyieoh_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in {
+defm V6_vshufoeh :
+ T_HVX_alu_WV <"$dst.h = vshuffoe($src1.h,$src2.h)">, V6_vshufoeh_enc;
+defm V6_vshufoeb :
+ T_HVX_alu_WV <"$dst.b = vshuffoe($src1.b,$src2.b)">, V6_vshufoeb_enc;
+}
+
+let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
+defm V6_vcombine :
+ T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc;
+
+let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in {
+defm V6_vsathub :
+ T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc;
+defm V6_vsatwh :
+ T_HVX_alu_VV <"$dst.h = vsat($src1.w,$src2.w)">, V6_vsatwh_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vroundwh :
+ T_HVX_alu_VV <"$dst.h = vround($src1.w,$src2.w):sat">, V6_vroundwh_enc;
+defm V6_vroundwuh :
+ T_HVX_alu_VV <"$dst.uh = vround($src1.w,$src2.w):sat">, V6_vroundwuh_enc;
+defm V6_vroundhb :
+ T_HVX_alu_VV <"$dst.b = vround($src1.h,$src2.h):sat">, V6_vroundhb_enc;
+defm V6_vroundhub :
+ T_HVX_alu_VV <"$dst.ub = vround($src1.h,$src2.h):sat">, V6_vroundhub_enc;
+defm V6_vasrwv :
+ T_HVX_alu_VV <"$dst.w = vasr($src1.w,$src2.w)">, V6_vasrwv_enc;
+defm V6_vlsrwv :
+ T_HVX_alu_VV <"$dst.w = vlsr($src1.w,$src2.w)">, V6_vlsrwv_enc;
+defm V6_vlsrhv :
+ T_HVX_alu_VV <"$dst.h = vlsr($src1.h,$src2.h)">, V6_vlsrhv_enc;
+defm V6_vasrhv :
+ T_HVX_alu_VV <"$dst.h = vasr($src1.h,$src2.h)">, V6_vasrhv_enc;
+defm V6_vaslwv :
+ T_HVX_alu_VV <"$dst.w = vasl($src1.w,$src2.w)">, V6_vaslwv_enc;
+defm V6_vaslhv :
+ T_HVX_alu_VV <"$dst.h = vasl($src1.h,$src2.h)">, V6_vaslhv_enc;
+}
+
+defm V6_vaddb :
+ T_HVX_alu_VV <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_enc;
+defm V6_vaddh :
+ T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_enc;
+
+let Itinerary = CVI_VP, Type = TypeCVI_VP in {
+defm V6_vdelta :
+ T_HVX_alu_VV <"$dst = vdelta($src1,$src2)">, V6_vdelta_enc;
+defm V6_vrdelta :
+ T_HVX_alu_VV <"$dst = vrdelta($src1,$src2)">, V6_vrdelta_enc;
+defm V6_vdealb4w :
+ T_HVX_alu_VV <"$dst.b = vdeale($src1.b,$src2.b)">, V6_vdealb4w_enc;
+defm V6_vpackeb :
+ T_HVX_alu_VV <"$dst.b = vpacke($src1.h,$src2.h)">, V6_vpackeb_enc;
+defm V6_vpackeh :
+ T_HVX_alu_VV <"$dst.h = vpacke($src1.w,$src2.w)">, V6_vpackeh_enc;
+defm V6_vpackhub_sat :
+ T_HVX_alu_VV <"$dst.ub = vpack($src1.h,$src2.h):sat">, V6_vpackhub_sat_enc;
+defm V6_vpackhb_sat :
+ T_HVX_alu_VV <"$dst.b = vpack($src1.h,$src2.h):sat">, V6_vpackhb_sat_enc;
+defm V6_vpackwuh_sat :
+ T_HVX_alu_VV <"$dst.uh = vpack($src1.w,$src2.w):sat">, V6_vpackwuh_sat_enc;
+defm V6_vpackwh_sat :
+ T_HVX_alu_VV <"$dst.h = vpack($src1.w,$src2.w):sat">, V6_vpackwh_sat_enc;
+defm V6_vpackob :
+ T_HVX_alu_VV <"$dst.b = vpacko($src1.h,$src2.h)">, V6_vpackob_enc;
+defm V6_vpackoh :
+ T_HVX_alu_VV <"$dst.h = vpacko($src1.w,$src2.w)">, V6_vpackoh_enc;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in // Instruction class for the "if ($src1) $dst op= $src2" forms instantiated from it.
+class T_HVX_condALU <string asmString, RegisterClass RC1, RegisterClass RC2> // RC1 types $src1; RC2 types $dst, $_src_ and $src2.
+ : CVI_VA_Resource1 <(outs RC2:$dst),
+ (ins RC1:$src1, RC2:$_src_, RC2:$src2), asmString,
+ [], "$dst = $_src_" > { // Empty pattern list; the string presumably ties $_src_ to $dst -- confirm against CVI_VA_Resource1.
+ let Itinerary = CVI_VA;
+ let Type = TypeCVI_VA;
+}
+
+multiclass T_HVX_condALU <string asmString> { // Emits two records per defm: NAME and NAME_128B.
+ def NAME : T_HVX_condALU <asmString, VecPredRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in // Only the _128B record below is marked codegen-only.
+ def NAME#_128B : T_HVX_condALU <asmString, VecPredRegs128B, VectorRegs128B>;
+}
+
+defm V6_vaddbq : T_HVX_condALU <"if ($src1) $dst.b += $src2.b">,
+ V6_vaddbq_enc;
+defm V6_vaddhq : T_HVX_condALU <"if ($src1) $dst.h += $src2.h">,
+ V6_vaddhq_enc;
+defm V6_vaddwq : T_HVX_condALU <"if ($src1) $dst.w += $src2.w">,
+ V6_vaddwq_enc;
+defm V6_vsubbq : T_HVX_condALU <"if ($src1) $dst.b -= $src2.b">,
+ V6_vsubbq_enc;
+defm V6_vsubhq : T_HVX_condALU <"if ($src1) $dst.h -= $src2.h">,
+ V6_vsubhq_enc;
+defm V6_vsubwq : T_HVX_condALU <"if ($src1) $dst.w -= $src2.w">,
+ V6_vsubwq_enc;
+defm V6_vaddbnq : T_HVX_condALU <"if (!$src1) $dst.b += $src2.b">,
+ V6_vaddbnq_enc;
+defm V6_vaddhnq : T_HVX_condALU <"if (!$src1) $dst.h += $src2.h">,
+ V6_vaddhnq_enc;
+defm V6_vaddwnq : T_HVX_condALU <"if (!$src1) $dst.w += $src2.w">,
+ V6_vaddwnq_enc;
+defm V6_vsubbnq : T_HVX_condALU <"if (!$src1) $dst.b -= $src2.b">,
+ V6_vsubbnq_enc;
+defm V6_vsubhnq : T_HVX_condALU <"if (!$src1) $dst.h -= $src2.h">,
+ V6_vsubhnq_enc;
+defm V6_vsubwnq : T_HVX_condALU <"if (!$src1) $dst.w -= $src2.w">,
+ V6_vsubwnq_enc;
+
+let hasNewValue = 1 in // Instruction class with one input ($src1) and one output ($dst).
+class T_HVX_alu_2op <string asmString, InstrItinClass itin,
+ RegisterClass RCout, RegisterClass RCin> // RCout types $dst, RCin types $src1.
+ : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1),
+ asmString >{
+ let Itinerary = itin;
+ let Type = !cast<IType>("Type"#itin); // Looks up the IType record named "Type" followed by the itinerary's name.
+}
+
+multiclass T_HVX_alu_2op <string asmString, RegisterClass RCout,
+ RegisterClass RCin, InstrItinClass itin> { // Emits NAME and NAME_128B; note itin comes last here but second in the class.
+ def NAME : T_HVX_alu_2op <asmString, itin, RCout, RCin>;
+ let isCodeGenOnly = 1 in // Only the _128B record below is marked codegen-only.
+ def NAME#_128B : T_HVX_alu_2op <asmString, itin,
+ !cast<RegisterClass>(RCout#"128B"), // Register class found by appending "128B" to RCout's name.
+ !cast<RegisterClass>(RCin#"128B")>; // Likewise for RCin.
+}
+
+let hasNewValue = 1 in
+multiclass T_HVX_alu_2op_VV <string asmString>:
+ T_HVX_alu_2op <asmString, VectorRegs, VectorRegs, CVI_VA>;
+
+multiclass T_HVX_alu_2op_WV <string asmString>:
+ T_HVX_alu_2op <asmString, VecDblRegs, VectorRegs, CVI_VA_DV>;
+
+
+defm V6_vabsh : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h)">,
+ V6_vabsh_enc;
+defm V6_vabsw : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w)">,
+ V6_vabsw_enc;
+defm V6_vabsh_sat : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h):sat">,
+ V6_vabsh_sat_enc;
+defm V6_vabsw_sat : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w):sat">,
+ V6_vabsw_sat_enc;
+defm V6_vnot : T_HVX_alu_2op_VV <"$dst = vnot($src1)">,
+ V6_vnot_enc;
+defm V6_vassign : T_HVX_alu_2op_VV <"$dst = $src1">,
+ V6_vassign_enc;
+
+defm V6_vzb : T_HVX_alu_2op_WV <"$dst.uh = vzxt($src1.ub)">,
+ V6_vzb_enc;
+defm V6_vzh : T_HVX_alu_2op_WV <"$dst.uw = vzxt($src1.uh)">,
+ V6_vzh_enc;
+defm V6_vsb : T_HVX_alu_2op_WV <"$dst.h = vsxt($src1.b)">,
+ V6_vsb_enc;
+defm V6_vsh : T_HVX_alu_2op_WV <"$dst.w = vsxt($src1.h)">,
+ V6_vsh_enc;
+
+let Itinerary = CVI_VP, Type = TypeCVI_VP in {
+defm V6_vdealh : T_HVX_alu_2op_VV <"$dst.h = vdeal($src1.h)">,
+ V6_vdealh_enc;
+defm V6_vdealb : T_HVX_alu_2op_VV <"$dst.b = vdeal($src1.b)">,
+ V6_vdealb_enc;
+defm V6_vshuffh : T_HVX_alu_2op_VV <"$dst.h = vshuff($src1.h)">,
+ V6_vshuffh_enc;
+defm V6_vshuffb : T_HVX_alu_2op_VV <"$dst.b = vshuff($src1.b)">,
+ V6_vshuffb_enc;
+}
+
+let Itinerary = CVI_VP_VS, Type = TypeCVI_VP_VS in {
+defm V6_vunpackub : T_HVX_alu_2op_WV <"$dst.uh = vunpack($src1.ub)">,
+ V6_vunpackub_enc;
+defm V6_vunpackuh : T_HVX_alu_2op_WV <"$dst.uw = vunpack($src1.uh)">,
+ V6_vunpackuh_enc;
+defm V6_vunpackb : T_HVX_alu_2op_WV <"$dst.h = vunpack($src1.b)">,
+ V6_vunpackb_enc;
+defm V6_vunpackh : T_HVX_alu_2op_WV <"$dst.w = vunpack($src1.h)">,
+ V6_vunpackh_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vcl0w : T_HVX_alu_2op_VV <"$dst.uw = vcl0($src1.uw)">,
+ V6_vcl0w_enc;
+defm V6_vcl0h : T_HVX_alu_2op_VV <"$dst.uh = vcl0($src1.uh)">,
+ V6_vcl0h_enc;
+defm V6_vnormamtw : T_HVX_alu_2op_VV <"$dst.w = vnormamt($src1.w)">,
+ V6_vnormamtw_enc;
+defm V6_vnormamth : T_HVX_alu_2op_VV <"$dst.h = vnormamt($src1.h)">,
+ V6_vnormamth_enc;
+defm V6_vpopcounth : T_HVX_alu_2op_VV <"$dst.h = vpopcount($src1.h)">,
+ V6_vpopcounth_enc;
+}
+
+let isAccumulator = 1, hasNewValue = 1, Itinerary = CVI_VX_DV_LONG,
+ Type = TypeCVI_VX_DV in
+class T_HVX_vmpyacc2 <string asmString, RegisterClass RC>
+ : CVI_VA_Resource1 <(outs RC:$dst),
+ (ins RC:$_src_, RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
+ asmString, [], "$dst = $_src_" > ;
+
+
+multiclass T_HVX_vmpyacc2 <string asmString> {
+ def NAME : T_HVX_vmpyacc2 <asmString, VecDblRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vmpyacc2 <asmString, VecDblRegs128B>;
+}
+
+defm V6_vrmpybusi_acc :
+ T_HVX_vmpyacc2<"$dst.w += vrmpy($src1.ub,$src2.b,#$src3)">,
+ V6_vrmpybusi_acc_enc;
+defm V6_vrsadubi_acc :
+ T_HVX_vmpyacc2<"$dst.uw += vrsad($src1.ub,$src2.ub,#$src3)">,
+ V6_vrsadubi_acc_enc;
+defm V6_vrmpyubi_acc :
+ T_HVX_vmpyacc2<"$dst.uw += vrmpy($src1.ub,$src2.ub,#$src3)">,
+ V6_vrmpyubi_acc_enc;
+
+
+let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV, hasNewValue = 1 in
+class T_HVX_vmpy2 <string asmString, RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
+ asmString>;
+
+
+multiclass T_HVX_vmpy2 <string asmString> {
+ def NAME : T_HVX_vmpy2 <asmString, VecDblRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vmpy2 <asmString, VecDblRegs128B>;
+}
+
+defm V6_vrmpybusi :
+ T_HVX_vmpy2 <"$dst.w = vrmpy($src1.ub,$src2.b,#$src3)">, V6_vrmpybusi_enc;
+defm V6_vrsadubi :
+ T_HVX_vmpy2 <"$dst.uw = vrsad($src1.ub,$src2.ub,#$src3)">, V6_vrsadubi_enc;
+defm V6_vrmpyubi :
+ T_HVX_vmpy2 <"$dst.uw = vrmpy($src1.ub,$src2.ub,#$src3)">, V6_vrmpyubi_enc;
+
+
+let Itinerary = CVI_VP_VS_LONG_EARLY, Type = TypeCVI_VP_VS,
+ hasSideEffects = 0, hasNewValue2 = 1, opNewValue2 = 1 in
+class T_HVX_perm <string asmString, RegisterClass RC> // Instruction class with two RC outputs and inputs $src1, $src2 (RC) and $src3 (IntRegs).
+ : CVI_VA_Resource1 <(outs RC:$_dst1_, RC:$_dst2_),
+ (ins RC:$src1, RC:$src2, IntRegs:$src3),
+ asmString, [], "$_dst1_ = $src1, $_dst2_ = $src2" >; // Presumably ties each output to the matching input -- confirm against CVI_VA_Resource1.
+
+multiclass T_HVX_perm <string asmString> {
+ def NAME : T_HVX_perm <asmString, VectorRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_perm <asmString, VectorRegs128B>;
+}
+
+let hasNewValue = 1, opNewValue = 0, hasNewValue2 = 1, opNewValue2 = 1 in {
+ defm V6_vshuff : T_HVX_perm <"vshuff($src1,$src2,$src3)">, V6_vshuff_enc;
+ defm V6_vdeal : T_HVX_perm <"vdeal($src1,$src2,$src3)">, V6_vdeal_enc;
+}
+
+// Conditional vector move: "if ($src1) $dst = $src2", or "if (!$src1) ..." when isPredNot is set.
+let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_HVX_cmov <bit isPredNot, RegisterClass RC> // RC types $dst and $src2; $src1 is always PredRegs.
+ : CVI_VA_Resource1 <(outs RC:$dst), (ins PredRegs:$src1, RC:$src2),
+ "if ("#!if(isPredNot, "!", "")#"$src1) $dst = $src2"> { // Inserts "!" before $src1 only for the negated form.
+ let isPredicatedFalse = isPredNot;
+}
+
+multiclass T_HVX_cmov <bit isPredNot = 0> { // Emits NAME and NAME_128B; isPredNot defaults to the non-negated form.
+ def NAME : T_HVX_cmov <isPredNot, VectorRegs>;
+
+ let isCodeGenOnly = 1 in // Only the _128B record below is marked codegen-only.
+ def NAME#_128B : T_HVX_cmov <isPredNot, VectorRegs128B>;
+}
+
+defm V6_vcmov : T_HVX_cmov, V6_vcmov_enc;
+defm V6_vncmov : T_HVX_cmov<1>, V6_vncmov_enc;
+
+// Conditional vector combine.
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, isPredicated = 1,
+ hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_HVX_ccombine <bit isPredNot, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1 < (outs RCout:$dst),
+ (ins PredRegs:$src1, RCin:$src2, RCin:$src3),
+ "if ("#!if(isPredNot, "!", "")#"$src1) $dst = vcombine($src2,$src3)"> {
+ let isPredicatedFalse = isPredNot;
+}
+
+multiclass T_HVX_ccombine <bit isPredNot = 0> {
+ def NAME : T_HVX_ccombine <isPredNot, VecDblRegs, VectorRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_ccombine <isPredNot, VecDblRegs128B, VectorRegs128B>;
+}
+
+defm V6_vccombine : T_HVX_ccombine, V6_vccombine_enc;
+defm V6_vnccombine : T_HVX_ccombine<1>, V6_vnccombine_enc;
+
+let hasNewValue = 1 in
+class T_HVX_shift <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_DV_Resource1<(outs RCout:$dst),
+ (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
+ asmString >;
+
+multiclass T_HVX_shift <string asmString, RegisterClass RCout,
+ RegisterClass RCin> {
+ def NAME : T_HVX_shift <asmString, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_shift <asmString, !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_shift_VV <string asmString>:
+ T_HVX_shift <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_shift_WV <string asmString>:
+ T_HVX_shift <asmString, VecDblRegs, VectorRegs>;
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in {
+defm V6_valignb :
+ T_HVX_shift_VV <"$dst = valign($src1,$src2,$src3)">, V6_valignb_enc;
+defm V6_vlalignb :
+ T_HVX_shift_VV <"$dst = vlalign($src1,$src2,$src3)">, V6_vlalignb_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vasrwh :
+ T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3)">, V6_vasrwh_enc;
+defm V6_vasrwhsat :
+ T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):sat">,
+ V6_vasrwhsat_enc;
+defm V6_vasrwhrndsat :
+ T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):rnd:sat">,
+ V6_vasrwhrndsat_enc;
+defm V6_vasrwuhsat :
+ T_HVX_shift_VV <"$dst.uh = vasr($src1.w,$src2.w,$src3):sat">,
+ V6_vasrwuhsat_enc;
+defm V6_vasrhubsat :
+ T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):sat">,
+ V6_vasrhubsat_enc;
+defm V6_vasrhubrndsat :
+ T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):rnd:sat">,
+ V6_vasrhubrndsat_enc;
+defm V6_vasrhbrndsat :
+ T_HVX_shift_VV <"$dst.b = vasr($src1.h,$src2.h,$src3):rnd:sat">,
+ V6_vasrhbrndsat_enc;
+}
+
+// Assembler mapped -- alias?
+//defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc;
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in {
+defm V6_vshuffvdd :
+ T_HVX_shift_WV <"$dst = vshuff($src1,$src2,$src3)">, V6_vshuffvdd_enc;
+defm V6_vdealvdd :
+ T_HVX_shift_WV <"$dst = vdeal($src1,$src2,$src3)">, V6_vdealvdd_enc;
+}
+
+let hasNewValue = 1, Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in
+class T_HVX_unpack <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCout:$_src_, RCin:$src1),
+ asmString, [], "$dst = $_src_">;
+
+multiclass T_HVX_unpack <string asmString> {
+ def NAME : T_HVX_unpack <asmString, VecDblRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_unpack <asmString, VecDblRegs128B, VectorRegs128B>;
+}
+
+defm V6_vunpackob : T_HVX_unpack <"$dst.h |= vunpacko($src1.b)">, V6_vunpackob_enc;
+defm V6_vunpackoh : T_HVX_unpack <"$dst.w |= vunpacko($src1.h)">, V6_vunpackoh_enc;
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1,
+ hasSideEffects = 0 in
+class T_HVX_valign <string asmString, RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2, u3_0Imm:$src3),
+ asmString>;
+
+multiclass T_HVX_valign <string asmString> {
+ def NAME : T_HVX_valign <asmString, VectorRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_valign <asmString, VectorRegs128B>;
+}
+
+defm V6_valignbi :
+ T_HVX_valign <"$dst = valign($src1,$src2,#$src3)">, V6_valignbi_enc;
+defm V6_vlalignbi :
+ T_HVX_valign <"$dst = vlalign($src1,$src2,#$src3)">, V6_vlalignbi_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
+class T_HVX_predAlu <string asmString, RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2),
+ asmString>;
+
+multiclass T_HVX_predAlu <string asmString> {
+ def NAME : T_HVX_predAlu <asmString, VecPredRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_predAlu <asmString, VecPredRegs128B>;
+}
+
+defm V6_pred_and : T_HVX_predAlu <"$dst = and($src1,$src2)">, V6_pred_and_enc;
+defm V6_pred_or : T_HVX_predAlu <"$dst = or($src1,$src2)">, V6_pred_or_enc;
+defm V6_pred_xor : T_HVX_predAlu <"$dst = xor($src1,$src2)">, V6_pred_xor_enc;
+defm V6_pred_or_n : T_HVX_predAlu <"$dst = or($src1,!$src2)">, V6_pred_or_n_enc;
+defm V6_pred_and_n :
+ T_HVX_predAlu <"$dst = and($src1,!$src2)">, V6_pred_and_n_enc;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA in
+class T_HVX_prednot <RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1),
+ "$dst = not($src1)">, V6_pred_not_enc;
+
+def V6_pred_not : T_HVX_prednot <VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_pred_not_128B : T_HVX_prednot <VecPredRegs128B>;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA in
+class T_HVX_vcmp2 <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
+ asmString >;
+
+multiclass T_HVX_vcmp2 <string asmString> {
+ def NAME : T_HVX_vcmp2 <asmString, VecPredRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vcmp2 <asmString, VecPredRegs128B, VectorRegs128B>;
+}
+
+defm V6_veqb : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.b,$src2.b)">, V6_veqb_enc;
+defm V6_veqh : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.h,$src2.h)">, V6_veqh_enc;
+defm V6_veqw : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.w,$src2.w)">, V6_veqw_enc;
+defm V6_vgtb : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.b,$src2.b)">, V6_vgtb_enc;
+defm V6_vgth : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.h,$src2.h)">, V6_vgth_enc;
+defm V6_vgtw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.w,$src2.w)">, V6_vgtw_enc;
+defm V6_vgtub : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_enc;
+defm V6_vgtuh : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_enc;
+defm V6_vgtuw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_enc;
+
+let isAccumulator = 1, hasNewValue = 1, hasSideEffects = 0 in
+class T_V6_vandqrt_acc <RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_Resource_late<(outs RCout:$dst),
+ (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
+ "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandqrt_acc_enc;
+
+def V6_vandqrt_acc : T_V6_vandqrt_acc <VectorRegs, VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandqrt_acc_128B : T_V6_vandqrt_acc <VectorRegs128B, VecPredRegs128B>;
+
+let isAccumulator = 1 in
+class T_V6_vandvrt_acc <RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_Resource_late<(outs RCout:$dst),
+ (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
+ "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandvrt_acc_enc;
+
+def V6_vandvrt_acc : T_V6_vandvrt_acc <VecPredRegs, VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandvrt_acc_128B : T_V6_vandvrt_acc <VecPredRegs128B, VectorRegs128B>;
+
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_V6_vandqrt <RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_Resource_late<(outs RCout:$dst),
+ (ins RCin:$src1, IntRegs:$src2),
+ "$dst = vand($src1,$src2)" >, V6_vandqrt_enc;
+
+def V6_vandqrt : T_V6_vandqrt <VectorRegs, VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandqrt_128B : T_V6_vandqrt <VectorRegs128B, VecPredRegs128B>;
+
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_V6_lvsplatw <RegisterClass RC>
+ : CVI_VX_Resource_late<(outs RC:$dst), (ins IntRegs:$src1),
+ "$dst = vsplat($src1)" >, V6_lvsplatw_enc;
+
+def V6_lvsplatw : T_V6_lvsplatw <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_lvsplatw_128B : T_V6_lvsplatw <VectorRegs128B>;
+
+
+let hasNewValue = 1 in
+class T_V6_vinsertwr <RegisterClass RC>
+ : CVI_VX_Resource_late<(outs RC:$dst), (ins RC:$_src_, IntRegs:$src1),
+ "$dst.w = vinsert($src1)", [], "$dst = $_src_">,
+ V6_vinsertwr_enc;
+
+def V6_vinsertwr : T_V6_vinsertwr <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_vinsertwr_128B : T_V6_vinsertwr <VectorRegs128B>;
+
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in
+class T_V6_pred_scalar2 <RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins IntRegs:$src1),
+ "$dst = vsetq($src1)">, V6_pred_scalar2_enc;
+
+def V6_pred_scalar2 : T_V6_pred_scalar2 <VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_pred_scalar2_128B : T_V6_pred_scalar2 <VecPredRegs128B>;
+
+class T_V6_vandvrt <RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_Resource_late<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
+ "$dst = vand($src1,$src2)">, V6_vandvrt_enc;
+
+def V6_vandvrt : T_V6_vandvrt <VecPredRegs, VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandvrt_128B : T_V6_vandvrt <VecPredRegs128B, VectorRegs128B>;
+
+let validSubTargets = HasV60SubT in
+class T_HVX_rol <string asmString, RegisterClass RC, Operand ImmOp >
+ : SInst2 <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2), asmString>;
+
+class T_HVX_rol_R <string asmString>
+ : T_HVX_rol <asmString, IntRegs, u5_0Imm>;
+class T_HVX_rol_P <string asmString>
+ : T_HVX_rol <asmString, DoubleRegs, u6_0Imm>;
+
+def S6_rol_i_p : T_HVX_rol_P <"$dst = rol($src1,#$src2)">, S6_rol_i_p_enc;
+let hasNewValue = 1, opNewValue = 0 in
+def S6_rol_i_r : T_HVX_rol_R <"$dst = rol($src1,#$src2)">, S6_rol_i_r_enc;
+
+let validSubTargets = HasV60SubT in
+class T_HVX_rol_acc <string asmString, RegisterClass RC, Operand ImmOp>
+ : SInst2 <(outs RC:$dst), (ins RC:$_src_, RC:$src1, ImmOp:$src2),
+ asmString, [], "$dst = $_src_" >;
+
+class T_HVX_rol_acc_P <string asmString>
+ : T_HVX_rol_acc <asmString, DoubleRegs, u6_0Imm>;
+
+class T_HVX_rol_acc_R <string asmString>
+ : T_HVX_rol_acc <asmString, IntRegs, u5_0Imm>;
+
+def S6_rol_i_p_nac :
+ T_HVX_rol_acc_P <"$dst -= rol($src1,#$src2)">, S6_rol_i_p_nac_enc;
+def S6_rol_i_p_acc :
+ T_HVX_rol_acc_P <"$dst += rol($src1,#$src2)">, S6_rol_i_p_acc_enc;
+def S6_rol_i_p_and :
+ T_HVX_rol_acc_P <"$dst &= rol($src1,#$src2)">, S6_rol_i_p_and_enc;
+def S6_rol_i_p_or :
+ T_HVX_rol_acc_P <"$dst |= rol($src1,#$src2)">, S6_rol_i_p_or_enc;
+def S6_rol_i_p_xacc :
+ T_HVX_rol_acc_P<"$dst ^= rol($src1,#$src2)">, S6_rol_i_p_xacc_enc;
+
+let hasNewValue = 1, opNewValue = 0 in {
+def S6_rol_i_r_nac :
+ T_HVX_rol_acc_R <"$dst -= rol($src1,#$src2)">, S6_rol_i_r_nac_enc;
+def S6_rol_i_r_acc :
+ T_HVX_rol_acc_R <"$dst += rol($src1,#$src2)">, S6_rol_i_r_acc_enc;
+def S6_rol_i_r_and :
+ T_HVX_rol_acc_R <"$dst &= rol($src1,#$src2)">, S6_rol_i_r_and_enc;
+def S6_rol_i_r_or :
+ T_HVX_rol_acc_R <"$dst |= rol($src1,#$src2)">, S6_rol_i_r_or_enc;
+def S6_rol_i_r_xacc :
+ T_HVX_rol_acc_R <"$dst ^= rol($src1,#$src2)">, S6_rol_i_r_xacc_enc;
+}
+
+let isSolo = 1, Itinerary = LD_tc_ld_SLOT0, Type = TypeLD in
+class T_V6_extractw <RegisterClass RC>
+ : LD1Inst <(outs IntRegs:$dst), (ins RC:$src1, IntRegs:$src2),
+ "$dst = vextract($src1,$src2)">, V6_extractw_enc;
+
+def V6_extractw : T_V6_extractw <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_extractw_128B : T_V6_extractw <VectorRegs128B>;
+
+let Itinerary = ST_tc_st_SLOT0, validSubTargets = HasV55SubT in
+class T_sys0op <string asmString>
+ : ST1Inst <(outs), (ins), asmString>;
+
+let isSolo = 1, validSubTargets = HasV55SubT in {
+def Y5_l2gunlock : T_sys0op <"l2gunlock">, Y5_l2gunlock_enc;
+def Y5_l2gclean : T_sys0op <"l2gclean">, Y5_l2gclean_enc;
+def Y5_l2gcleaninv : T_sys0op <"l2gcleaninv">, Y5_l2gcleaninv_enc;
+}
+
+class T_sys1op <string asmString, RegisterClass RC>
+ : ST1Inst <(outs), (ins RC:$src1), asmString>;
+
+class T_sys1op_R <string asmString> : T_sys1op <asmString, IntRegs>;
+class T_sys1op_P <string asmString> : T_sys1op <asmString, DoubleRegs>;
+
+let isSoloAX = 1, validSubTargets = HasV55SubT in
+def Y5_l2unlocka : T_sys1op_R <"l2unlocka($src1)">, Y5_l2unlocka_enc;
+
+let isSolo = 1, validSubTargets = HasV60SubT in {
+def Y6_l2gcleanpa : T_sys1op_P <"l2gclean($src1)">, Y6_l2gcleanpa_enc;
+def Y6_l2gcleaninvpa : T_sys1op_P <"l2gcleaninv($src1)">, Y6_l2gcleaninvpa_enc;
+}
+
+let Itinerary = ST_tc_3stall_SLOT0, isPredicateLate = 1, isSoloAX = 1,
+ validSubTargets = HasV55SubT in
+def Y5_l2locka : ST1Inst <(outs PredRegs:$dst), (ins IntRegs:$src1),
+ "$dst = l2locka($src1)">, Y5_l2locka_enc;
+
+// TODO: S2_cabacencbin is not defined on the etc side; find out why.
+// defm S2_cabacencbin : _VV <"Rdd=encbin(Rss,$src2,Pu)">, S2_cabacencbin_enc;
+
+let Defs = [USR_OVF], Itinerary = M_tc_3stall_SLOT23, isPredicateLate = 1, // USR_OVF is listed as an implicit def.
+ hasSideEffects = 0,
+validSubTargets = HasV55SubT in
+def A5_ACS : MInst2 <(outs DoubleRegs:$dst1, PredRegs:$dst2), // Two results: a DoubleRegs value and a PredRegs value.
+ (ins DoubleRegs:$_src_, DoubleRegs:$src1, DoubleRegs:$src2),
+ "$dst1,$dst2 = vacsh($src1,$src2)", [],
+ "$dst1 = $_src_" >, Requires<[HasV55T]>, A5_ACS_enc; // Presumably ties $_src_ to $dst1 -- confirm against MInst2.
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, hasNewValue = 1,
+ hasSideEffects = 0 in
+class T_HVX_alu2 <string asmString, RegisterClass RCout, RegisterClass RCin1,
+ RegisterClass RCin2>
+ : CVI_VA_Resource1<(outs RCout:$dst),
+ (ins RCin1:$src1, RCin2:$src2, RCin2:$src3), asmString>;
+
+multiclass T_HVX_alu2 <string asmString, RegisterClass RC > {
+ def NAME : T_HVX_alu2 <asmString, RC, VecPredRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_alu2 <asmString, !cast<RegisterClass>(RC#"128B"),
+ VecPredRegs128B, VectorRegs128B>;
+}
+
+multiclass T_HVX_alu2_V <string asmString> :
+ T_HVX_alu2 <asmString, VectorRegs>;
+
+multiclass T_HVX_alu2_W <string asmString> :
+ T_HVX_alu2 <asmString, VecDblRegs>;
+
+defm V6_vswap : T_HVX_alu2_W <"$dst = vswap($src1,$src2,$src3)">, V6_vswap_enc;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA, hasNewValue = 1,
+ hasSideEffects = 0 in
+defm V6_vmux : T_HVX_alu2_V <"$dst = vmux($src1,$src2,$src3)">, V6_vmux_enc;
+
+class T_HVX_vlutb <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1<(outs RCout:$dst),
+ (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), asmString>;
+
+multiclass T_HVX_vlutb <string asmString, RegisterClass RCout,
+ RegisterClass RCin> {
+ def NAME : T_HVX_vlutb <asmString, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vlutb <asmString, !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_V <string asmString> :
+ T_HVX_vlutb <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_W <string asmString> :
+ T_HVX_vlutb <asmString, VecDblRegs, VectorRegs>;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, isAccumulator = 1 in
+class T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+ RegisterClass RCin>
+ : CVI_VA_Resource1<(outs RCout:$dst),
+ (ins RCout:$_src_, RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
+ asmString, [], "$dst = $_src_">;
+
+multiclass T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+ RegisterClass RCin> {
+ def NAME : T_HVX_vlutb_acc <asmString, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vlutb_acc<asmString,
+ !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_acc_V <string asmString> :
+ T_HVX_vlutb_acc <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_acc_W <string asmString> :
+ T_HVX_vlutb_acc <asmString, VecDblRegs, VectorRegs>;
+
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1 in
+defm V6_vlutvvb:
+ T_HVX_vlutb_V <"$dst.b = vlut32($src1.b,$src2.b,$src3)">, V6_vlutvvb_enc;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, hasNewValue = 1 in
+defm V6_vlutvwh:
+ T_HVX_vlutb_W <"$dst.h = vlut16($src1.b,$src2.h,$src3)">, V6_vlutvwh_enc;
+
+let hasNewValue = 1 in {
+ defm V6_vlutvvb_oracc:
+ T_HVX_vlutb_acc_V <"$dst.b |= vlut32($src1.b,$src2.b,$src3)">,
+ V6_vlutvvb_oracc_enc;
+ defm V6_vlutvwh_oracc:
+ T_HVX_vlutb_acc_W <"$dst.h |= vlut16($src1.b,$src2.h,$src3)">,
+ V6_vlutvwh_oracc_enc;
+}
+
+// NOTE: This appears to be a fake instruction; should it be defined at all?
+def S2_cabacencbin
+ : SInst2<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+ "$dst = encbin($src1,$src2,$src3)">, S2_cabacencbin_enc;
+
+// Vhist instructions
+def V6_vhistq
+ : CVI_HIST_Resource1 <(outs), (ins VecPredRegs:$src1),
+ "vhist($src1)">, V6_vhistq_enc;
+
+def V6_vhist
+ : CVI_HIST_Resource1 <(outs), (ins),
+ "vhist" >, V6_vhist_enc;
+
+
+let isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in { // Pseudo, codegen-only records; each has a plain and a _128B form.
+ def V6_vd0: CVI_VA_Resource<(outs VectorRegs:$dst), (ins), "$dst = #0", []>; // No inputs.
+ def V6_vd0_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst), (ins),
+ "$dst = #0", []>;
+
+ def V6_vassignp: CVI_VA_Resource<(outs VecDblRegs:$dst), // VecDblRegs in, VecDblRegs out.
+ (ins VecDblRegs:$src), "", []>; // Empty assembly string and empty pattern list.
+ def V6_vassignp_128B : CVI_VA_Resource<(outs VecDblRegs128B:$dst),
+ (ins VecDblRegs128B:$src), "", []>;
+
+ def V6_lo: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1), // VecDblRegs in, VectorRegs out.
+ "", []>;
+ def V6_lo_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
+ (ins VecDblRegs128B:$src1), "", []>;
+
+ def V6_hi: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1), // Same operand shape as V6_lo.
+ "", []>;
+ def V6_hi_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
+ (ins VecDblRegs128B:$src1), "", []>;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
new file mode 100644
index 000000000000..e3520bd6e515
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
@@ -0,0 +1,69 @@
+//===- HexagonInstrInfoVector.td - Hexagon Vector Patterns -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon Vector instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Vector shift support. Vector shifting in Hexagon is rather different
+// from the internal representation used by LLVM.
+// LLVM assumes all shifts (in vector case) will have the form
+// <VT> = SHL/SRA/SRL <VT> by <VT>
+// while Hexagon has the following format:
+// <VT> = SHL/SRA/SRL <VT> by <IT/i32>
+// As a result, special care is needed to guarantee correctness and
+// performance.
+class vshift_v4i16<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp> // Op is not referenced anywhere in this class.
+ : S_2OpInstImm<Str, MajOp, MinOp, u4_0Imm, []> { // Uses a u4_0Imm operand and an empty pattern list.
+ bits<4> src2;
+ let Inst{11-8} = src2; // The 4-bit src2 field fills encoding bits 11..8.
+}
+
+class vshift_v2i32<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp> // Op is not referenced anywhere in this class.
+ : S_2OpInstImm<Str, MajOp, MinOp, u5_0Imm, []> { // Uses a u5_0Imm operand and an empty pattern list.
+ bits<5> src2;
+ let Inst{12-8} = src2; // The 5-bit src2 field fills encoding bits 12..8.
+}
+
+def S2_asr_i_vw : vshift_v2i32<sra, "vasrw", 0b010, 0b000>;
+def S2_lsr_i_vw : vshift_v2i32<srl, "vlsrw", 0b010, 0b001>;
+def S2_asl_i_vw : vshift_v2i32<shl, "vaslw", 0b010, 0b010>;
+
+def S2_asr_i_vh : vshift_v4i16<sra, "vasrh", 0b100, 0b000>;
+def S2_lsr_i_vh : vshift_v4i16<srl, "vlsrh", 0b100, 0b001>;
+def S2_asl_i_vh : vshift_v4i16<shl, "vaslh", 0b100, 0b010>;
+
+// Vector shift words by register
+def S2_asr_r_vw : T_S3op_shiftVect < "vasrw", 0b00, 0b00>;
+def S2_lsr_r_vw : T_S3op_shiftVect < "vlsrw", 0b00, 0b01>;
+def S2_asl_r_vw : T_S3op_shiftVect < "vaslw", 0b00, 0b10>;
+def S2_lsl_r_vw : T_S3op_shiftVect < "vlslw", 0b00, 0b11>;
+
+// Vector shift halfwords by register
+def S2_asr_r_vh : T_S3op_shiftVect < "vasrh", 0b01, 0b00>;
+def S2_lsr_r_vh : T_S3op_shiftVect < "vlsrh", 0b01, 0b01>;
+def S2_asl_r_vh : T_S3op_shiftVect < "vaslh", 0b01, 0b10>;
+def S2_lsl_r_vh : T_S3op_shiftVect < "vlslh", 0b01, 0b11>;
+
+
+// Hexagon doesn't have a vector multiply with C semantics.
+// Instead, generate a pseudo instruction that gets expanded into two
+// scalar MPYI instructions.
+// This is expanded by ExpandPostRAPseudos.
+let isPseudo = 1 in // Pseudo record: empty assembly string and empty pattern list.
+def PS_vmulw : PseudoM<(outs DoubleRegs:$Rd),
+ (ins DoubleRegs:$Rs, DoubleRegs:$Rt), "", []>; // Two DoubleRegs inputs, one DoubleRegs result.
+
+let isPseudo = 1 in // Pseudo record with an extra $Rx input; the _acc suffix suggests an accumulator.
+def PS_vmulw_acc : PseudoM<(outs DoubleRegs:$Rd),
+ (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt), "", [],
+ "$Rd = $Rx">; // Presumably ties $Rx to $Rd -- confirm against PseudoM.
+
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
new file mode 100644
index 000000000000..d4f303bf6ff0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -0,0 +1,1353 @@
+//===-- HexagonIntrinsics.td - Instruction intrinsics ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is populated based on the following specs:
+// Hexagon V2 Architecture
+// Application-Level Specification
+// 80-V9418-8 Rev. B
+// March 4, 2008
+//===----------------------------------------------------------------------===//
+
+class T_I_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID imm:$Is),
+ (MI imm:$Is)>;
+
+class T_R_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs),
+ (MI I32:$Rs)>;
+
+class T_P_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs),
+ (MI I64:$Rs)>;
+
+class T_II_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
+ : Pat<(IntID Imm1:$Is, Imm2:$It),
+ (MI Imm1:$Is, Imm2:$It)>;
+
+class T_RI_pat <InstHexagon MI, Intrinsic IntID,
+ PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+ : Pat<(IntID I32:$Rs, ImmPred:$It),
+ (MI I32:$Rs, ImmPred:$It)>;
+
+class T_IR_pat <InstHexagon MI, Intrinsic IntID,
+ PatFrag ImmPred = PatLeaf<(i32 imm)>>
+ : Pat<(IntID ImmPred:$Is, I32:$Rt),
+ (MI ImmPred:$Is, I32:$Rt)>;
+
+class T_PI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID I64:$Rs, imm:$It),
+ (MI I64:$Rs, imm:$It)>;
+
+class T_RP_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID I32:$Rs, I64:$Rt),
+ (MI I32:$Rs, I64:$Rt)>;
+
+class T_RR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, I32:$Rt),
+ (MI I32:$Rs, I32:$Rt)>;
+
+class T_PP_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I64:$Rt),
+ (MI I64:$Rs, I64:$Rt)>;
+
+class T_QQ_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, I32:$Rt),
+ (MI (C2_tfrrp I32:$Rs), (C2_tfrrp I32:$Rt))>;
+
+class T_QII_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
+ : Pat <(IntID I32:$Rp, Imm1:$Is, Imm2:$It),
+ (MI (C2_tfrrp I32:$Rp), Imm1:$Is, Imm2:$It)>;
+
+class T_QRR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp, I32:$Rs, I32:$Rt),
+ (MI (C2_tfrrp I32:$Rp), I32:$Rs, I32:$Rt)>;
+
+class T_QRI_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
+ : Pat <(IntID I32:$Rp, I32:$Rs, ImmPred:$Is),
+ (MI (C2_tfrrp I32:$Rp), I32:$Rs, ImmPred:$Is)>;
+
+class T_QIR_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
+ : Pat <(IntID I32:$Rp, ImmPred:$Is, I32:$Rs),
+ (MI (C2_tfrrp I32:$Rp), ImmPred:$Is, I32:$Rs)>;
+
+class T_QPP_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp, I64:$Rs, I64:$Rt),
+ (MI (C2_tfrrp I32:$Rp), I64:$Rs, I64:$Rt)>;
+
+class T_RRI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, I32:$Rt, imm:$Iu),
+ (MI I32:$Rs, I32:$Rt, imm:$Iu)>;
+
+class T_RII_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, imm:$It, imm:$Iu),
+ (MI I32:$Rs, imm:$It, imm:$Iu)>;
+
+class T_IRI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID imm:$It, I32:$Rs, imm:$Iu),
+ (MI imm:$It, I32:$Rs, imm:$Iu)>;
+
+class T_IRR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID imm:$Is, I32:$Rs, I32:$Rt),
+ (MI imm:$Is, I32:$Rs, I32:$Rt)>;
+
+class T_RIR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, imm:$Is, I32:$Rt),
+ (MI I32:$Rs, imm:$Is, I32:$Rt)>;
+
+class T_RRR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, I32:$Rt, I32:$Ru),
+ (MI I32:$Rs, I32:$Rt, I32:$Ru)>;
+
+class T_PPI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I64:$Rt, imm:$Iu),
+ (MI I64:$Rs, I64:$Rt, imm:$Iu)>;
+
+class T_PII_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, imm:$It, imm:$Iu),
+ (MI I64:$Rs, imm:$It, imm:$Iu)>;
+
+class T_PPP_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I64:$Rt, I64:$Ru),
+ (MI I64:$Rs, I64:$Rt, I64:$Ru)>;
+
+class T_PPR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Ru),
+ (MI I64:$Rs, I64:$Rt, I32:$Ru)>;
+
+class T_PRR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I32:$Rt, I32:$Ru),
+ (MI I64:$Rs, I32:$Rt, I32:$Ru)>;
+
+class T_PPQ_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Rp),
+ (MI I64:$Rs, I64:$Rt, (C2_tfrrp I32:$Rp))>;
+
+class T_PR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I32:$Rt),
+ (MI I64:$Rs, I32:$Rt)>;
+
+class T_D_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID (F64:$Rs)),
+ (MI (F64:$Rs))>;
+
+class T_DI_pat <InstHexagon MI, Intrinsic IntID,
+ PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+ : Pat<(IntID F64:$Rs, ImmPred:$It),
+ (MI F64:$Rs, ImmPred:$It)>;
+
+class T_F_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F32:$Rs),
+ (MI F32:$Rs)>;
+
+class T_FI_pat <InstHexagon MI, Intrinsic IntID,
+ PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+ : Pat<(IntID F32:$Rs, ImmPred:$It),
+ (MI F32:$Rs, ImmPred:$It)>;
+
+class T_FF_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F32:$Rs, F32:$Rt),
+ (MI F32:$Rs, F32:$Rt)>;
+
+class T_DD_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F64:$Rs, F64:$Rt),
+ (MI F64:$Rs, F64:$Rt)>;
+
+class T_FFF_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F32:$Rs, F32:$Rt, F32:$Ru),
+ (MI F32:$Rs, F32:$Rt, F32:$Ru)>;
+
+class T_FFFQ_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID F32:$Rs, F32:$Rt, F32:$Ru, I32:$Rp),
+ (MI F32:$Rs, F32:$Rt, F32:$Ru, (C2_tfrrp I32:$Rp))>;
+
+class T_Q_RI_pat <InstHexagon MI, Intrinsic IntID,
+ PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+ : Pat<(IntID I32:$Rs, ImmPred:$It),
+ (C2_tfrpr (MI I32:$Rs, ImmPred:$It))>;
+
+class T_Q_RR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, I32:$Rt),
+ (C2_tfrpr (MI I32:$Rs, I32:$Rt))>;
+
+class T_Q_RP_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, I64:$Rt),
+ (C2_tfrpr (MI I32:$Rs, I64:$Rt))>;
+
+class T_Q_PR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I32:$Rt),
+ (C2_tfrpr (MI I64:$Rs, I32:$Rt))>;
+
+class T_Q_PI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID I64:$Rs, imm:$It),
+ (C2_tfrpr (MI I64:$Rs, imm:$It))>;
+
+class T_Q_PP_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I64:$Rt),
+ (C2_tfrpr (MI I64:$Rs, I64:$Rt))>;
+
+class T_Q_Q_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp),
+ (C2_tfrpr (MI (C2_tfrrp I32:$Rp)))>;
+
+class T_Q_QQ_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp, I32:$Rq),
+ (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq)))>;
+
+class T_Q_FF_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F32:$Rs, F32:$Rt),
+ (C2_tfrpr (MI F32:$Rs, F32:$Rt))>;
+
+class T_Q_DD_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F64:$Rs, F64:$Rt),
+ (C2_tfrpr (MI F64:$Rs, F64:$Rt))>;
+
+class T_Q_FI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F32:$Rs, imm:$It),
+ (C2_tfrpr (MI F32:$Rs, imm:$It))>;
+
+class T_Q_DI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F64:$Rs, imm:$It),
+ (C2_tfrpr (MI F64:$Rs, imm:$It))>;
+
+class T_Q_QQQ_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp, I32:$Rq, I32:$Rs),
+ (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq),
+ (C2_tfrrp I32:$Rs)))>;
+
+//===----------------------------------------------------------------------===//
+// MPYS / Multiply signed/unsigned halfwords
+//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
+//===----------------------------------------------------------------------===//
+
+def : T_RR_pat <M2_mpy_ll_s1, int_hexagon_M2_mpy_ll_s1>;
+def : T_RR_pat <M2_mpy_ll_s0, int_hexagon_M2_mpy_ll_s0>;
+def : T_RR_pat <M2_mpy_lh_s1, int_hexagon_M2_mpy_lh_s1>;
+def : T_RR_pat <M2_mpy_lh_s0, int_hexagon_M2_mpy_lh_s0>;
+def : T_RR_pat <M2_mpy_hl_s1, int_hexagon_M2_mpy_hl_s1>;
+def : T_RR_pat <M2_mpy_hl_s0, int_hexagon_M2_mpy_hl_s0>;
+def : T_RR_pat <M2_mpy_hh_s1, int_hexagon_M2_mpy_hh_s1>;
+def : T_RR_pat <M2_mpy_hh_s0, int_hexagon_M2_mpy_hh_s0>;
+
+def : T_RR_pat <M2_mpyu_ll_s1, int_hexagon_M2_mpyu_ll_s1>;
+def : T_RR_pat <M2_mpyu_ll_s0, int_hexagon_M2_mpyu_ll_s0>;
+def : T_RR_pat <M2_mpyu_lh_s1, int_hexagon_M2_mpyu_lh_s1>;
+def : T_RR_pat <M2_mpyu_lh_s0, int_hexagon_M2_mpyu_lh_s0>;
+def : T_RR_pat <M2_mpyu_hl_s1, int_hexagon_M2_mpyu_hl_s1>;
+def : T_RR_pat <M2_mpyu_hl_s0, int_hexagon_M2_mpyu_hl_s0>;
+def : T_RR_pat <M2_mpyu_hh_s1, int_hexagon_M2_mpyu_hh_s1>;
+def : T_RR_pat <M2_mpyu_hh_s0, int_hexagon_M2_mpyu_hh_s0>;
+
+def : T_RR_pat <M2_mpy_sat_ll_s1, int_hexagon_M2_mpy_sat_ll_s1>;
+def : T_RR_pat <M2_mpy_sat_ll_s0, int_hexagon_M2_mpy_sat_ll_s0>;
+def : T_RR_pat <M2_mpy_sat_lh_s1, int_hexagon_M2_mpy_sat_lh_s1>;
+def : T_RR_pat <M2_mpy_sat_lh_s0, int_hexagon_M2_mpy_sat_lh_s0>;
+def : T_RR_pat <M2_mpy_sat_hl_s1, int_hexagon_M2_mpy_sat_hl_s1>;
+def : T_RR_pat <M2_mpy_sat_hl_s0, int_hexagon_M2_mpy_sat_hl_s0>;
+def : T_RR_pat <M2_mpy_sat_hh_s1, int_hexagon_M2_mpy_sat_hh_s1>;
+def : T_RR_pat <M2_mpy_sat_hh_s0, int_hexagon_M2_mpy_sat_hh_s0>;
+
+def : T_RR_pat <M2_mpy_rnd_ll_s1, int_hexagon_M2_mpy_rnd_ll_s1>;
+def : T_RR_pat <M2_mpy_rnd_ll_s0, int_hexagon_M2_mpy_rnd_ll_s0>;
+def : T_RR_pat <M2_mpy_rnd_lh_s1, int_hexagon_M2_mpy_rnd_lh_s1>;
+def : T_RR_pat <M2_mpy_rnd_lh_s0, int_hexagon_M2_mpy_rnd_lh_s0>;
+def : T_RR_pat <M2_mpy_rnd_hl_s1, int_hexagon_M2_mpy_rnd_hl_s1>;
+def : T_RR_pat <M2_mpy_rnd_hl_s0, int_hexagon_M2_mpy_rnd_hl_s0>;
+def : T_RR_pat <M2_mpy_rnd_hh_s1, int_hexagon_M2_mpy_rnd_hh_s1>;
+def : T_RR_pat <M2_mpy_rnd_hh_s0, int_hexagon_M2_mpy_rnd_hh_s0>;
+
+def : T_RR_pat <M2_mpy_sat_rnd_ll_s1, int_hexagon_M2_mpy_sat_rnd_ll_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_ll_s0, int_hexagon_M2_mpy_sat_rnd_ll_s0>;
+def : T_RR_pat <M2_mpy_sat_rnd_lh_s1, int_hexagon_M2_mpy_sat_rnd_lh_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_lh_s0, int_hexagon_M2_mpy_sat_rnd_lh_s0>;
+def : T_RR_pat <M2_mpy_sat_rnd_hl_s1, int_hexagon_M2_mpy_sat_rnd_hl_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_hl_s0, int_hexagon_M2_mpy_sat_rnd_hl_s0>;
+def : T_RR_pat <M2_mpy_sat_rnd_hh_s1, int_hexagon_M2_mpy_sat_rnd_hh_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_hh_s0, int_hexagon_M2_mpy_sat_rnd_hh_s0>;
+
+
+//===----------------------------------------------------------------------===//
+// MPYS / Multiply signed/unsigned halfwords and add/subtract the
+// result from the accumulator.
+//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
+
+def : T_RRR_pat <M2_mpy_acc_ll_s1, int_hexagon_M2_mpy_acc_ll_s1>;
+def : T_RRR_pat <M2_mpy_acc_ll_s0, int_hexagon_M2_mpy_acc_ll_s0>;
+def : T_RRR_pat <M2_mpy_acc_lh_s1, int_hexagon_M2_mpy_acc_lh_s1>;
+def : T_RRR_pat <M2_mpy_acc_lh_s0, int_hexagon_M2_mpy_acc_lh_s0>;
+def : T_RRR_pat <M2_mpy_acc_hl_s1, int_hexagon_M2_mpy_acc_hl_s1>;
+def : T_RRR_pat <M2_mpy_acc_hl_s0, int_hexagon_M2_mpy_acc_hl_s0>;
+def : T_RRR_pat <M2_mpy_acc_hh_s1, int_hexagon_M2_mpy_acc_hh_s1>;
+def : T_RRR_pat <M2_mpy_acc_hh_s0, int_hexagon_M2_mpy_acc_hh_s0>;
+
+def : T_RRR_pat <M2_mpyu_acc_ll_s1, int_hexagon_M2_mpyu_acc_ll_s1>;
+def : T_RRR_pat <M2_mpyu_acc_ll_s0, int_hexagon_M2_mpyu_acc_ll_s0>;
+def : T_RRR_pat <M2_mpyu_acc_lh_s1, int_hexagon_M2_mpyu_acc_lh_s1>;
+def : T_RRR_pat <M2_mpyu_acc_lh_s0, int_hexagon_M2_mpyu_acc_lh_s0>;
+def : T_RRR_pat <M2_mpyu_acc_hl_s1, int_hexagon_M2_mpyu_acc_hl_s1>;
+def : T_RRR_pat <M2_mpyu_acc_hl_s0, int_hexagon_M2_mpyu_acc_hl_s0>;
+def : T_RRR_pat <M2_mpyu_acc_hh_s1, int_hexagon_M2_mpyu_acc_hh_s1>;
+def : T_RRR_pat <M2_mpyu_acc_hh_s0, int_hexagon_M2_mpyu_acc_hh_s0>;
+
+def : T_RRR_pat <M2_mpy_nac_ll_s1, int_hexagon_M2_mpy_nac_ll_s1>;
+def : T_RRR_pat <M2_mpy_nac_ll_s0, int_hexagon_M2_mpy_nac_ll_s0>;
+def : T_RRR_pat <M2_mpy_nac_lh_s1, int_hexagon_M2_mpy_nac_lh_s1>;
+def : T_RRR_pat <M2_mpy_nac_lh_s0, int_hexagon_M2_mpy_nac_lh_s0>;
+def : T_RRR_pat <M2_mpy_nac_hl_s1, int_hexagon_M2_mpy_nac_hl_s1>;
+def : T_RRR_pat <M2_mpy_nac_hl_s0, int_hexagon_M2_mpy_nac_hl_s0>;
+def : T_RRR_pat <M2_mpy_nac_hh_s1, int_hexagon_M2_mpy_nac_hh_s1>;
+def : T_RRR_pat <M2_mpy_nac_hh_s0, int_hexagon_M2_mpy_nac_hh_s0>;
+
+def : T_RRR_pat <M2_mpyu_nac_ll_s1, int_hexagon_M2_mpyu_nac_ll_s1>;
+def : T_RRR_pat <M2_mpyu_nac_ll_s0, int_hexagon_M2_mpyu_nac_ll_s0>;
+def : T_RRR_pat <M2_mpyu_nac_lh_s1, int_hexagon_M2_mpyu_nac_lh_s1>;
+def : T_RRR_pat <M2_mpyu_nac_lh_s0, int_hexagon_M2_mpyu_nac_lh_s0>;
+def : T_RRR_pat <M2_mpyu_nac_hl_s1, int_hexagon_M2_mpyu_nac_hl_s1>;
+def : T_RRR_pat <M2_mpyu_nac_hl_s0, int_hexagon_M2_mpyu_nac_hl_s0>;
+def : T_RRR_pat <M2_mpyu_nac_hh_s1, int_hexagon_M2_mpyu_nac_hh_s1>;
+def : T_RRR_pat <M2_mpyu_nac_hh_s0, int_hexagon_M2_mpyu_nac_hh_s0>;
+
+def : T_RRR_pat <M2_mpy_acc_sat_ll_s1, int_hexagon_M2_mpy_acc_sat_ll_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_ll_s0, int_hexagon_M2_mpy_acc_sat_ll_s0>;
+def : T_RRR_pat <M2_mpy_acc_sat_lh_s1, int_hexagon_M2_mpy_acc_sat_lh_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_lh_s0, int_hexagon_M2_mpy_acc_sat_lh_s0>;
+def : T_RRR_pat <M2_mpy_acc_sat_hl_s1, int_hexagon_M2_mpy_acc_sat_hl_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_hl_s0, int_hexagon_M2_mpy_acc_sat_hl_s0>;
+def : T_RRR_pat <M2_mpy_acc_sat_hh_s1, int_hexagon_M2_mpy_acc_sat_hh_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_hh_s0, int_hexagon_M2_mpy_acc_sat_hh_s0>;
+
+def : T_RRR_pat <M2_mpy_nac_sat_ll_s1, int_hexagon_M2_mpy_nac_sat_ll_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_ll_s0, int_hexagon_M2_mpy_nac_sat_ll_s0>;
+def : T_RRR_pat <M2_mpy_nac_sat_lh_s1, int_hexagon_M2_mpy_nac_sat_lh_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_lh_s0, int_hexagon_M2_mpy_nac_sat_lh_s0>;
+def : T_RRR_pat <M2_mpy_nac_sat_hl_s1, int_hexagon_M2_mpy_nac_sat_hl_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_hl_s0, int_hexagon_M2_mpy_nac_sat_hl_s0>;
+def : T_RRR_pat <M2_mpy_nac_sat_hh_s1, int_hexagon_M2_mpy_nac_sat_hh_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_hh_s0, int_hexagon_M2_mpy_nac_sat_hh_s0>;
+
+
+//===----------------------------------------------------------------------===//
+// Multiply signed/unsigned halfwords with and without saturation and rounding
+// into a 64-bits destination register.
+//===----------------------------------------------------------------------===//
+
+def : T_RR_pat <M2_mpyd_hh_s0, int_hexagon_M2_mpyd_hh_s0>;
+def : T_RR_pat <M2_mpyd_hl_s0, int_hexagon_M2_mpyd_hl_s0>;
+def : T_RR_pat <M2_mpyd_lh_s0, int_hexagon_M2_mpyd_lh_s0>;
+def : T_RR_pat <M2_mpyd_ll_s0, int_hexagon_M2_mpyd_ll_s0>;
+def : T_RR_pat <M2_mpyd_hh_s1, int_hexagon_M2_mpyd_hh_s1>;
+def : T_RR_pat <M2_mpyd_hl_s1, int_hexagon_M2_mpyd_hl_s1>;
+def : T_RR_pat <M2_mpyd_lh_s1, int_hexagon_M2_mpyd_lh_s1>;
+def : T_RR_pat <M2_mpyd_ll_s1, int_hexagon_M2_mpyd_ll_s1>;
+
+def : T_RR_pat <M2_mpyd_rnd_hh_s0, int_hexagon_M2_mpyd_rnd_hh_s0>;
+def : T_RR_pat <M2_mpyd_rnd_hl_s0, int_hexagon_M2_mpyd_rnd_hl_s0>;
+def : T_RR_pat <M2_mpyd_rnd_lh_s0, int_hexagon_M2_mpyd_rnd_lh_s0>;
+def : T_RR_pat <M2_mpyd_rnd_ll_s0, int_hexagon_M2_mpyd_rnd_ll_s0>;
+def : T_RR_pat <M2_mpyd_rnd_hh_s1, int_hexagon_M2_mpyd_rnd_hh_s1>;
+def : T_RR_pat <M2_mpyd_rnd_hl_s1, int_hexagon_M2_mpyd_rnd_hl_s1>;
+def : T_RR_pat <M2_mpyd_rnd_lh_s1, int_hexagon_M2_mpyd_rnd_lh_s1>;
+def : T_RR_pat <M2_mpyd_rnd_ll_s1, int_hexagon_M2_mpyd_rnd_ll_s1>;
+
+def : T_RR_pat <M2_mpyud_hh_s0, int_hexagon_M2_mpyud_hh_s0>;
+def : T_RR_pat <M2_mpyud_hl_s0, int_hexagon_M2_mpyud_hl_s0>;
+def : T_RR_pat <M2_mpyud_lh_s0, int_hexagon_M2_mpyud_lh_s0>;
+def : T_RR_pat <M2_mpyud_ll_s0, int_hexagon_M2_mpyud_ll_s0>;
+def : T_RR_pat <M2_mpyud_hh_s1, int_hexagon_M2_mpyud_hh_s1>;
+def : T_RR_pat <M2_mpyud_hl_s1, int_hexagon_M2_mpyud_hl_s1>;
+def : T_RR_pat <M2_mpyud_lh_s1, int_hexagon_M2_mpyud_lh_s1>;
+def : T_RR_pat <M2_mpyud_ll_s1, int_hexagon_M2_mpyud_ll_s1>;
+
+//===----------------------------------------------------------------------===//
+// MPYS / Multiply signed/unsigned halfwords and add/subtract the
+// result from the 64-bit destination register.
+//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
+
+def : T_PRR_pat <M2_mpyd_acc_hh_s0, int_hexagon_M2_mpyd_acc_hh_s0>;
+def : T_PRR_pat <M2_mpyd_acc_hl_s0, int_hexagon_M2_mpyd_acc_hl_s0>;
+def : T_PRR_pat <M2_mpyd_acc_lh_s0, int_hexagon_M2_mpyd_acc_lh_s0>;
+def : T_PRR_pat <M2_mpyd_acc_ll_s0, int_hexagon_M2_mpyd_acc_ll_s0>;
+
+def : T_PRR_pat <M2_mpyd_acc_hh_s1, int_hexagon_M2_mpyd_acc_hh_s1>;
+def : T_PRR_pat <M2_mpyd_acc_hl_s1, int_hexagon_M2_mpyd_acc_hl_s1>;
+def : T_PRR_pat <M2_mpyd_acc_lh_s1, int_hexagon_M2_mpyd_acc_lh_s1>;
+def : T_PRR_pat <M2_mpyd_acc_ll_s1, int_hexagon_M2_mpyd_acc_ll_s1>;
+
+def : T_PRR_pat <M2_mpyd_nac_hh_s0, int_hexagon_M2_mpyd_nac_hh_s0>;
+def : T_PRR_pat <M2_mpyd_nac_hl_s0, int_hexagon_M2_mpyd_nac_hl_s0>;
+def : T_PRR_pat <M2_mpyd_nac_lh_s0, int_hexagon_M2_mpyd_nac_lh_s0>;
+def : T_PRR_pat <M2_mpyd_nac_ll_s0, int_hexagon_M2_mpyd_nac_ll_s0>;
+
+def : T_PRR_pat <M2_mpyd_nac_hh_s1, int_hexagon_M2_mpyd_nac_hh_s1>;
+def : T_PRR_pat <M2_mpyd_nac_hl_s1, int_hexagon_M2_mpyd_nac_hl_s1>;
+def : T_PRR_pat <M2_mpyd_nac_lh_s1, int_hexagon_M2_mpyd_nac_lh_s1>;
+def : T_PRR_pat <M2_mpyd_nac_ll_s1, int_hexagon_M2_mpyd_nac_ll_s1>;
+
+def : T_PRR_pat <M2_mpyud_acc_hh_s0, int_hexagon_M2_mpyud_acc_hh_s0>;
+def : T_PRR_pat <M2_mpyud_acc_hl_s0, int_hexagon_M2_mpyud_acc_hl_s0>;
+def : T_PRR_pat <M2_mpyud_acc_lh_s0, int_hexagon_M2_mpyud_acc_lh_s0>;
+def : T_PRR_pat <M2_mpyud_acc_ll_s0, int_hexagon_M2_mpyud_acc_ll_s0>;
+
+def : T_PRR_pat <M2_mpyud_acc_hh_s1, int_hexagon_M2_mpyud_acc_hh_s1>;
+def : T_PRR_pat <M2_mpyud_acc_hl_s1, int_hexagon_M2_mpyud_acc_hl_s1>;
+def : T_PRR_pat <M2_mpyud_acc_lh_s1, int_hexagon_M2_mpyud_acc_lh_s1>;
+def : T_PRR_pat <M2_mpyud_acc_ll_s1, int_hexagon_M2_mpyud_acc_ll_s1>;
+
+def : T_PRR_pat <M2_mpyud_nac_hh_s0, int_hexagon_M2_mpyud_nac_hh_s0>;
+def : T_PRR_pat <M2_mpyud_nac_hl_s0, int_hexagon_M2_mpyud_nac_hl_s0>;
+def : T_PRR_pat <M2_mpyud_nac_lh_s0, int_hexagon_M2_mpyud_nac_lh_s0>;
+def : T_PRR_pat <M2_mpyud_nac_ll_s0, int_hexagon_M2_mpyud_nac_ll_s0>;
+
+def : T_PRR_pat <M2_mpyud_nac_hh_s1, int_hexagon_M2_mpyud_nac_hh_s1>;
+def : T_PRR_pat <M2_mpyud_nac_hl_s1, int_hexagon_M2_mpyud_nac_hl_s1>;
+def : T_PRR_pat <M2_mpyud_nac_lh_s1, int_hexagon_M2_mpyud_nac_lh_s1>;
+def : T_PRR_pat <M2_mpyud_nac_ll_s1, int_hexagon_M2_mpyud_nac_ll_s1>;
+
+// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vcmpy_s1_sat_i, int_hexagon_M2_vcmpy_s1_sat_i>;
+def : T_PP_pat <M2_vcmpy_s0_sat_i, int_hexagon_M2_vcmpy_s0_sat_i>;
+
+// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vcmpy_s1_sat_r, int_hexagon_M2_vcmpy_s1_sat_r>;
+def : T_PP_pat <M2_vcmpy_s0_sat_r, int_hexagon_M2_vcmpy_s0_sat_r>;
+
+// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vdmpys_s1, int_hexagon_M2_vdmpys_s1>;
+def : T_PP_pat <M2_vdmpys_s0, int_hexagon_M2_vdmpys_s0>;
+
+// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vmpy2es_s1, int_hexagon_M2_vmpy2es_s1>;
+def : T_PP_pat <M2_vmpy2es_s0, int_hexagon_M2_vmpy2es_s0>;
+
+//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyh_s0, int_hexagon_M2_mmpyh_s0>;
+def : T_PP_pat <M2_mmpyh_s1, int_hexagon_M2_mmpyh_s1>;
+def : T_PP_pat <M2_mmpyh_rs0, int_hexagon_M2_mmpyh_rs0>;
+def : T_PP_pat <M2_mmpyh_rs1, int_hexagon_M2_mmpyh_rs1>;
+
+//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyl_s0, int_hexagon_M2_mmpyl_s0>;
+def : T_PP_pat <M2_mmpyl_s1, int_hexagon_M2_mmpyl_s1>;
+def : T_PP_pat <M2_mmpyl_rs0, int_hexagon_M2_mmpyl_rs0>;
+def : T_PP_pat <M2_mmpyl_rs1, int_hexagon_M2_mmpyl_rs1>;
+
+//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyuh_s0, int_hexagon_M2_mmpyuh_s0>;
+def : T_PP_pat <M2_mmpyuh_s1, int_hexagon_M2_mmpyuh_s1>;
+def : T_PP_pat <M2_mmpyuh_rs0, int_hexagon_M2_mmpyuh_rs0>;
+def : T_PP_pat <M2_mmpyuh_rs1, int_hexagon_M2_mmpyuh_rs1>;
+
+//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyul_s0, int_hexagon_M2_mmpyul_s0>;
+def : T_PP_pat <M2_mmpyul_s1, int_hexagon_M2_mmpyul_s1>;
+def : T_PP_pat <M2_mmpyul_rs0, int_hexagon_M2_mmpyul_rs0>;
+def : T_PP_pat <M2_mmpyul_rs1, int_hexagon_M2_mmpyul_rs1>;
+
+// Vector reduce add unsigned bytes: Rdd32[+]=vraddub(Rss32,Rtt32)
+def : T_PP_pat <A2_vraddub, int_hexagon_A2_vraddub>;
+def : T_PPP_pat <A2_vraddub_acc, int_hexagon_A2_vraddub_acc>;
+
+// Vector sum of absolute differences unsigned bytes: Rdd=vrsadub(Rss,Rtt)
+def : T_PP_pat <A2_vrsadub, int_hexagon_A2_vrsadub>;
+def : T_PPP_pat <A2_vrsadub_acc, int_hexagon_A2_vrsadub_acc>;
+
+// Vector absolute difference halfwords: Rdd=vabsdiffh(Rtt,Rss)
+def : T_PP_pat <M2_vabsdiffh, int_hexagon_M2_vabsdiffh>;
+
+// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
+def : T_PP_pat <M2_vabsdiffw, int_hexagon_M2_vabsdiffw>;
+
+// Vector reduce complex multiply real or imaginary:
+// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
+def : T_PP_pat <M2_vrcmpyi_s0, int_hexagon_M2_vrcmpyi_s0>;
+def : T_PP_pat <M2_vrcmpyi_s0c, int_hexagon_M2_vrcmpyi_s0c>;
+def : T_PPP_pat <M2_vrcmaci_s0, int_hexagon_M2_vrcmaci_s0>;
+def : T_PPP_pat <M2_vrcmaci_s0c, int_hexagon_M2_vrcmaci_s0c>;
+
+def : T_PP_pat <M2_vrcmpyr_s0, int_hexagon_M2_vrcmpyr_s0>;
+def : T_PP_pat <M2_vrcmpyr_s0c, int_hexagon_M2_vrcmpyr_s0c>;
+def : T_PPP_pat <M2_vrcmacr_s0, int_hexagon_M2_vrcmacr_s0>;
+def : T_PPP_pat <M2_vrcmacr_s0c, int_hexagon_M2_vrcmacr_s0c>;
+
+// Vector reduce halfwords
+// Rdd[+]=vrmpyh(Rss,Rtt)
+def : T_PP_pat <M2_vrmpy_s0, int_hexagon_M2_vrmpy_s0>;
+def : T_PPP_pat <M2_vrmac_s0, int_hexagon_M2_vrmac_s0>;
+
+//===----------------------------------------------------------------------===//
+// Vector Multiply with accumulation
+//===----------------------------------------------------------------------===//
+
+// Vector multiply word by signed half with accumulation
+// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PPP_pat <M2_mmacls_s1, int_hexagon_M2_mmacls_s1>;
+def : T_PPP_pat <M2_mmacls_s0, int_hexagon_M2_mmacls_s0>;
+def : T_PPP_pat <M2_mmacls_rs1, int_hexagon_M2_mmacls_rs1>;
+def : T_PPP_pat <M2_mmacls_rs0, int_hexagon_M2_mmacls_rs0>;
+def : T_PPP_pat <M2_mmachs_s1, int_hexagon_M2_mmachs_s1>;
+def : T_PPP_pat <M2_mmachs_s0, int_hexagon_M2_mmachs_s0>;
+def : T_PPP_pat <M2_mmachs_rs1, int_hexagon_M2_mmachs_rs1>;
+def : T_PPP_pat <M2_mmachs_rs0, int_hexagon_M2_mmachs_rs0>;
+
+// Vector multiply word by unsigned half with accumulation
+// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PPP_pat <M2_mmaculs_s1, int_hexagon_M2_mmaculs_s1>;
+def : T_PPP_pat <M2_mmaculs_s0, int_hexagon_M2_mmaculs_s0>;
+def : T_PPP_pat <M2_mmaculs_rs1, int_hexagon_M2_mmaculs_rs1>;
+def : T_PPP_pat <M2_mmaculs_rs0, int_hexagon_M2_mmaculs_rs0>;
+def : T_PPP_pat <M2_mmacuhs_s1, int_hexagon_M2_mmacuhs_s1>;
+def : T_PPP_pat <M2_mmacuhs_s0, int_hexagon_M2_mmacuhs_s0>;
+def : T_PPP_pat <M2_mmacuhs_rs1, int_hexagon_M2_mmacuhs_rs1>;
+def : T_PPP_pat <M2_mmacuhs_rs0, int_hexagon_M2_mmacuhs_rs0>;
+
+// Vector multiply even halfwords with accumulation
+// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
+def : T_PPP_pat <M2_vmac2es, int_hexagon_M2_vmac2es>;
+def : T_PPP_pat <M2_vmac2es_s1, int_hexagon_M2_vmac2es_s1>;
+def : T_PPP_pat <M2_vmac2es_s0, int_hexagon_M2_vmac2es_s0>;
+
+// Vector dual multiply with accumulation
+// Rxx+=vdmpy(Rss,Rtt)[:<<1]:sat
+def : T_PPP_pat <M2_vdmacs_s1, int_hexagon_M2_vdmacs_s1>;
+def : T_PPP_pat <M2_vdmacs_s0, int_hexagon_M2_vdmacs_s0>;
+
+// Vector complex multiply real or imaginary with accumulation
+// Rxx+=vcmpy[ir](Rss,Rtt):sat
+def : T_PPP_pat <M2_vcmac_s0_sat_r, int_hexagon_M2_vcmac_s0_sat_r>;
+def : T_PPP_pat <M2_vcmac_s0_sat_i, int_hexagon_M2_vcmac_s0_sat_i>;
+
+//===----------------------------------------------------------------------===//
+// Add/Subtract halfword
+// Rd=add(Rt.L,Rs.[HL])[:sat]
+// Rd=sub(Rt.L,Rs.[HL])[:sat]
+// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<<16]
+// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<<16]
+//===----------------------------------------------------------------------===//
+
+//Rd=add(Rt.L,Rs.[LH])
+def : T_RR_pat <A2_addh_l16_ll, int_hexagon_A2_addh_l16_ll>;
+def : T_RR_pat <A2_addh_l16_hl, int_hexagon_A2_addh_l16_hl>;
+
+//Rd=add(Rt.L,Rs.[LH]):sat
+def : T_RR_pat <A2_addh_l16_sat_ll, int_hexagon_A2_addh_l16_sat_ll>;
+def : T_RR_pat <A2_addh_l16_sat_hl, int_hexagon_A2_addh_l16_sat_hl>;
+
+//Rd=sub(Rt.L,Rs.[LH])
+def : T_RR_pat <A2_subh_l16_ll, int_hexagon_A2_subh_l16_ll>;
+def : T_RR_pat <A2_subh_l16_hl, int_hexagon_A2_subh_l16_hl>;
+
+//Rd=sub(Rt.L,Rs.[LH]):sat
+def : T_RR_pat <A2_subh_l16_sat_ll, int_hexagon_A2_subh_l16_sat_ll>;
+def : T_RR_pat <A2_subh_l16_sat_hl, int_hexagon_A2_subh_l16_sat_hl>;
+
+//Rd=add(Rt.[LH],Rs.[LH]):<<16
+def : T_RR_pat <A2_addh_h16_ll, int_hexagon_A2_addh_h16_ll>;
+def : T_RR_pat <A2_addh_h16_lh, int_hexagon_A2_addh_h16_lh>;
+def : T_RR_pat <A2_addh_h16_hl, int_hexagon_A2_addh_h16_hl>;
+def : T_RR_pat <A2_addh_h16_hh, int_hexagon_A2_addh_h16_hh>;
+
+//Rd=sub(Rt.[LH],Rs.[LH]):<<16
+def : T_RR_pat <A2_subh_h16_ll, int_hexagon_A2_subh_h16_ll>;
+def : T_RR_pat <A2_subh_h16_lh, int_hexagon_A2_subh_h16_lh>;
+def : T_RR_pat <A2_subh_h16_hl, int_hexagon_A2_subh_h16_hl>;
+def : T_RR_pat <A2_subh_h16_hh, int_hexagon_A2_subh_h16_hh>;
+
+//Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
+def : T_RR_pat <A2_addh_h16_sat_ll, int_hexagon_A2_addh_h16_sat_ll>;
+def : T_RR_pat <A2_addh_h16_sat_lh, int_hexagon_A2_addh_h16_sat_lh>;
+def : T_RR_pat <A2_addh_h16_sat_hl, int_hexagon_A2_addh_h16_sat_hl>;
+def : T_RR_pat <A2_addh_h16_sat_hh, int_hexagon_A2_addh_h16_sat_hh>;
+
+//Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
+def : T_RR_pat <A2_subh_h16_sat_ll, int_hexagon_A2_subh_h16_sat_ll>;
+def : T_RR_pat <A2_subh_h16_sat_lh, int_hexagon_A2_subh_h16_sat_lh>;
+def : T_RR_pat <A2_subh_h16_sat_hl, int_hexagon_A2_subh_h16_sat_hl>;
+def : T_RR_pat <A2_subh_h16_sat_hh, int_hexagon_A2_subh_h16_sat_hh>;
+
+// ALU64 / ALU / min max
+def : T_RR_pat<A2_max, int_hexagon_A2_max>;
+def : T_RR_pat<A2_min, int_hexagon_A2_min>;
+def : T_RR_pat<A2_maxu, int_hexagon_A2_maxu>;
+def : T_RR_pat<A2_minu, int_hexagon_A2_minu>;
+
+// Shift and accumulate
+def : T_RRI_pat <S2_asr_i_r_nac, int_hexagon_S2_asr_i_r_nac>;
+def : T_RRI_pat <S2_lsr_i_r_nac, int_hexagon_S2_lsr_i_r_nac>;
+def : T_RRI_pat <S2_asl_i_r_nac, int_hexagon_S2_asl_i_r_nac>;
+def : T_RRI_pat <S2_asr_i_r_acc, int_hexagon_S2_asr_i_r_acc>;
+def : T_RRI_pat <S2_lsr_i_r_acc, int_hexagon_S2_lsr_i_r_acc>;
+def : T_RRI_pat <S2_asl_i_r_acc, int_hexagon_S2_asl_i_r_acc>;
+
+def : T_RRI_pat <S2_asr_i_r_and, int_hexagon_S2_asr_i_r_and>;
+def : T_RRI_pat <S2_lsr_i_r_and, int_hexagon_S2_lsr_i_r_and>;
+def : T_RRI_pat <S2_asl_i_r_and, int_hexagon_S2_asl_i_r_and>;
+def : T_RRI_pat <S2_asr_i_r_or, int_hexagon_S2_asr_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_or, int_hexagon_S2_lsr_i_r_or>;
+def : T_RRI_pat <S2_asl_i_r_or, int_hexagon_S2_asl_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_xacc, int_hexagon_S2_lsr_i_r_xacc>;
+def : T_RRI_pat <S2_asl_i_r_xacc, int_hexagon_S2_asl_i_r_xacc>;
+
+def : T_PPI_pat <S2_asr_i_p_nac, int_hexagon_S2_asr_i_p_nac>;
+def : T_PPI_pat <S2_lsr_i_p_nac, int_hexagon_S2_lsr_i_p_nac>;
+def : T_PPI_pat <S2_asl_i_p_nac, int_hexagon_S2_asl_i_p_nac>;
+def : T_PPI_pat <S2_asr_i_p_acc, int_hexagon_S2_asr_i_p_acc>;
+def : T_PPI_pat <S2_lsr_i_p_acc, int_hexagon_S2_lsr_i_p_acc>;
+def : T_PPI_pat <S2_asl_i_p_acc, int_hexagon_S2_asl_i_p_acc>;
+
+def : T_PPI_pat <S2_asr_i_p_and, int_hexagon_S2_asr_i_p_and>;
+def : T_PPI_pat <S2_lsr_i_p_and, int_hexagon_S2_lsr_i_p_and>;
+def : T_PPI_pat <S2_asl_i_p_and, int_hexagon_S2_asl_i_p_and>;
+def : T_PPI_pat <S2_asr_i_p_or, int_hexagon_S2_asr_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_or, int_hexagon_S2_lsr_i_p_or>;
+def : T_PPI_pat <S2_asl_i_p_or, int_hexagon_S2_asl_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_xacc, int_hexagon_S2_lsr_i_p_xacc>;
+def : T_PPI_pat <S2_asl_i_p_xacc, int_hexagon_S2_asl_i_p_xacc>;
+
+def : T_RRR_pat <S2_asr_r_r_nac, int_hexagon_S2_asr_r_r_nac>;
+def : T_RRR_pat <S2_lsr_r_r_nac, int_hexagon_S2_lsr_r_r_nac>;
+def : T_RRR_pat <S2_asl_r_r_nac, int_hexagon_S2_asl_r_r_nac>;
+def : T_RRR_pat <S2_lsl_r_r_nac, int_hexagon_S2_lsl_r_r_nac>;
+def : T_RRR_pat <S2_asr_r_r_acc, int_hexagon_S2_asr_r_r_acc>;
+def : T_RRR_pat <S2_lsr_r_r_acc, int_hexagon_S2_lsr_r_r_acc>;
+def : T_RRR_pat <S2_asl_r_r_acc, int_hexagon_S2_asl_r_r_acc>;
+def : T_RRR_pat <S2_lsl_r_r_acc, int_hexagon_S2_lsl_r_r_acc>;
+
+def : T_RRR_pat <S2_asr_r_r_and, int_hexagon_S2_asr_r_r_and>;
+def : T_RRR_pat <S2_lsr_r_r_and, int_hexagon_S2_lsr_r_r_and>;
+def : T_RRR_pat <S2_asl_r_r_and, int_hexagon_S2_asl_r_r_and>;
+def : T_RRR_pat <S2_lsl_r_r_and, int_hexagon_S2_lsl_r_r_and>;
+def : T_RRR_pat <S2_asr_r_r_or, int_hexagon_S2_asr_r_r_or>;
+def : T_RRR_pat <S2_lsr_r_r_or, int_hexagon_S2_lsr_r_r_or>;
+def : T_RRR_pat <S2_asl_r_r_or, int_hexagon_S2_asl_r_r_or>;
+def : T_RRR_pat <S2_lsl_r_r_or, int_hexagon_S2_lsl_r_r_or>;
+
+def : T_PPR_pat <S2_asr_r_p_nac, int_hexagon_S2_asr_r_p_nac>;
+def : T_PPR_pat <S2_lsr_r_p_nac, int_hexagon_S2_lsr_r_p_nac>;
+def : T_PPR_pat <S2_asl_r_p_nac, int_hexagon_S2_asl_r_p_nac>;
+def : T_PPR_pat <S2_lsl_r_p_nac, int_hexagon_S2_lsl_r_p_nac>;
+def : T_PPR_pat <S2_asr_r_p_acc, int_hexagon_S2_asr_r_p_acc>;
+def : T_PPR_pat <S2_lsr_r_p_acc, int_hexagon_S2_lsr_r_p_acc>;
+def : T_PPR_pat <S2_asl_r_p_acc, int_hexagon_S2_asl_r_p_acc>;
+def : T_PPR_pat <S2_lsl_r_p_acc, int_hexagon_S2_lsl_r_p_acc>;
+
+def : T_PPR_pat <S2_asr_r_p_and, int_hexagon_S2_asr_r_p_and>;
+def : T_PPR_pat <S2_lsr_r_p_and, int_hexagon_S2_lsr_r_p_and>;
+def : T_PPR_pat <S2_asl_r_p_and, int_hexagon_S2_asl_r_p_and>;
+def : T_PPR_pat <S2_lsl_r_p_and, int_hexagon_S2_lsl_r_p_and>;
+def : T_PPR_pat <S2_asr_r_p_or, int_hexagon_S2_asr_r_p_or>;
+def : T_PPR_pat <S2_lsr_r_p_or, int_hexagon_S2_lsr_r_p_or>;
+def : T_PPR_pat <S2_asl_r_p_or, int_hexagon_S2_asl_r_p_or>;
+def : T_PPR_pat <S2_lsl_r_p_or, int_hexagon_S2_lsl_r_p_or>;
+
+def : T_RRI_pat <S2_asr_i_r_nac, int_hexagon_S2_asr_i_r_nac>;
+def : T_RRI_pat <S2_lsr_i_r_nac, int_hexagon_S2_lsr_i_r_nac>;
+def : T_RRI_pat <S2_asl_i_r_nac, int_hexagon_S2_asl_i_r_nac>;
+def : T_RRI_pat <S2_asr_i_r_acc, int_hexagon_S2_asr_i_r_acc>;
+def : T_RRI_pat <S2_lsr_i_r_acc, int_hexagon_S2_lsr_i_r_acc>;
+def : T_RRI_pat <S2_asl_i_r_acc, int_hexagon_S2_asl_i_r_acc>;
+
+def : T_RRI_pat <S2_asr_i_r_and, int_hexagon_S2_asr_i_r_and>;
+def : T_RRI_pat <S2_lsr_i_r_and, int_hexagon_S2_lsr_i_r_and>;
+def : T_RRI_pat <S2_asl_i_r_and, int_hexagon_S2_asl_i_r_and>;
+def : T_RRI_pat <S2_asr_i_r_or, int_hexagon_S2_asr_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_or, int_hexagon_S2_lsr_i_r_or>;
+def : T_RRI_pat <S2_asl_i_r_or, int_hexagon_S2_asl_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_xacc, int_hexagon_S2_lsr_i_r_xacc>;
+def : T_RRI_pat <S2_asl_i_r_xacc, int_hexagon_S2_asl_i_r_xacc>;
+
+def : T_PPI_pat <S2_asr_i_p_nac, int_hexagon_S2_asr_i_p_nac>;
+def : T_PPI_pat <S2_lsr_i_p_nac, int_hexagon_S2_lsr_i_p_nac>;
+def : T_PPI_pat <S2_asl_i_p_nac, int_hexagon_S2_asl_i_p_nac>;
+def : T_PPI_pat <S2_asr_i_p_acc, int_hexagon_S2_asr_i_p_acc>;
+def : T_PPI_pat <S2_lsr_i_p_acc, int_hexagon_S2_lsr_i_p_acc>;
+def : T_PPI_pat <S2_asl_i_p_acc, int_hexagon_S2_asl_i_p_acc>;
+
+def : T_PPI_pat <S2_asr_i_p_and, int_hexagon_S2_asr_i_p_and>;
+def : T_PPI_pat <S2_lsr_i_p_and, int_hexagon_S2_lsr_i_p_and>;
+def : T_PPI_pat <S2_asl_i_p_and, int_hexagon_S2_asl_i_p_and>;
+def : T_PPI_pat <S2_asr_i_p_or, int_hexagon_S2_asr_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_or, int_hexagon_S2_lsr_i_p_or>;
+def : T_PPI_pat <S2_asl_i_p_or, int_hexagon_S2_asl_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_xacc, int_hexagon_S2_lsr_i_p_xacc>;
+def : T_PPI_pat <S2_asl_i_p_xacc, int_hexagon_S2_asl_i_p_xacc>;
+
+def : T_RRR_pat <S2_asr_r_r_nac, int_hexagon_S2_asr_r_r_nac>;
+def : T_RRR_pat <S2_lsr_r_r_nac, int_hexagon_S2_lsr_r_r_nac>;
+def : T_RRR_pat <S2_asl_r_r_nac, int_hexagon_S2_asl_r_r_nac>;
+def : T_RRR_pat <S2_lsl_r_r_nac, int_hexagon_S2_lsl_r_r_nac>;
+def : T_RRR_pat <S2_asr_r_r_acc, int_hexagon_S2_asr_r_r_acc>;
+def : T_RRR_pat <S2_lsr_r_r_acc, int_hexagon_S2_lsr_r_r_acc>;
+def : T_RRR_pat <S2_asl_r_r_acc, int_hexagon_S2_asl_r_r_acc>;
+def : T_RRR_pat <S2_lsl_r_r_acc, int_hexagon_S2_lsl_r_r_acc>;
+
+def : T_RRR_pat <S2_asr_r_r_and, int_hexagon_S2_asr_r_r_and>;
+def : T_RRR_pat <S2_lsr_r_r_and, int_hexagon_S2_lsr_r_r_and>;
+def : T_RRR_pat <S2_asl_r_r_and, int_hexagon_S2_asl_r_r_and>;
+def : T_RRR_pat <S2_lsl_r_r_and, int_hexagon_S2_lsl_r_r_and>;
+def : T_RRR_pat <S2_asr_r_r_or, int_hexagon_S2_asr_r_r_or>;
+def : T_RRR_pat <S2_lsr_r_r_or, int_hexagon_S2_lsr_r_r_or>;
+def : T_RRR_pat <S2_asl_r_r_or, int_hexagon_S2_asl_r_r_or>;
+def : T_RRR_pat <S2_lsl_r_r_or, int_hexagon_S2_lsl_r_r_or>;
+
+def : T_PPR_pat <S2_asr_r_p_nac, int_hexagon_S2_asr_r_p_nac>;
+def : T_PPR_pat <S2_lsr_r_p_nac, int_hexagon_S2_lsr_r_p_nac>;
+def : T_PPR_pat <S2_asl_r_p_nac, int_hexagon_S2_asl_r_p_nac>;
+def : T_PPR_pat <S2_lsl_r_p_nac, int_hexagon_S2_lsl_r_p_nac>;
+def : T_PPR_pat <S2_asr_r_p_acc, int_hexagon_S2_asr_r_p_acc>;
+def : T_PPR_pat <S2_lsr_r_p_acc, int_hexagon_S2_lsr_r_p_acc>;
+def : T_PPR_pat <S2_asl_r_p_acc, int_hexagon_S2_asl_r_p_acc>;
+def : T_PPR_pat <S2_lsl_r_p_acc, int_hexagon_S2_lsl_r_p_acc>;
+
+def : T_PPR_pat <S2_asr_r_p_and, int_hexagon_S2_asr_r_p_and>;
+def : T_PPR_pat <S2_lsr_r_p_and, int_hexagon_S2_lsr_r_p_and>;
+def : T_PPR_pat <S2_asl_r_p_and, int_hexagon_S2_asl_r_p_and>;
+def : T_PPR_pat <S2_lsl_r_p_and, int_hexagon_S2_lsl_r_p_and>;
+def : T_PPR_pat <S2_asr_r_p_or, int_hexagon_S2_asr_r_p_or>;
+def : T_PPR_pat <S2_lsr_r_p_or, int_hexagon_S2_lsr_r_p_or>;
+def : T_PPR_pat <S2_asl_r_p_or, int_hexagon_S2_asl_r_p_or>;
+def : T_PPR_pat <S2_lsl_r_p_or, int_hexagon_S2_lsl_r_p_or>;
+
+//*******************************************************************
+// ALU32/ALU
+//*******************************************************************
+def : T_RR_pat<A2_add, int_hexagon_A2_add>;
+def : T_RI_pat<A2_addi, int_hexagon_A2_addi>;
+def : T_RR_pat<A2_sub, int_hexagon_A2_sub>;
+def : T_IR_pat<A2_subri, int_hexagon_A2_subri>;
+def : T_RR_pat<A2_and, int_hexagon_A2_and>;
+def : T_RI_pat<A2_andir, int_hexagon_A2_andir>;
+def : T_RR_pat<A2_or, int_hexagon_A2_or>;
+def : T_RI_pat<A2_orir, int_hexagon_A2_orir>;
+def : T_RR_pat<A2_xor, int_hexagon_A2_xor>;
+def : T_RR_pat<A2_combinew, int_hexagon_A2_combinew>;
+
+// Assembler mapped from Rd32=not(Rs32) to Rd32=sub(#-1,Rs32)
+def : Pat <(int_hexagon_A2_not I32:$Rs),
+ (A2_subri -1, I32:$Rs)>;
+
+// Assembler mapped from Rd32=neg(Rs32) to Rd32=sub(#0,Rs32)
+def : Pat <(int_hexagon_A2_neg I32:$Rs),
+ (A2_subri 0, I32:$Rs)>;
+
+// Transfer immediate
+def : Pat <(int_hexagon_A2_tfril I32:$Rs, u16_0ImmPred:$Is),
+ (A2_tfril I32:$Rs, u16_0ImmPred:$Is)>;
+def : Pat <(int_hexagon_A2_tfrih I32:$Rs, u16_0ImmPred:$Is),
+ (A2_tfrih I32:$Rs, u16_0ImmPred:$Is)>;
+
+// Transfer Register/immediate.
+def : T_R_pat <A2_tfr, int_hexagon_A2_tfr>;
+def : T_I_pat <A2_tfrsi, int_hexagon_A2_tfrsi>;
+
+def ImmExt64: SDNodeXForm<imm, [{
+ int64_t V = N->getSExtValue(); // sign-extended value of the matched imm
+ return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i64); // re-emit as i64
+}]>;
+
+// A2_tfrpi has an operand of type i64. This is necessary, since it is
+// generated from "(set I64:$Rd, imm)". That pattern would not appear
+// in the DAG, if the immediate was not a 64-bit value.
+// The builtin for A2_tfrpi, on the other hand, takes a 32-bit value,
+// which makes it impossible to simply replace it with the instruction.
+// To connect the builtin with the instruction, the builtin's operand
+// needs to be extended to the right type.
+
+def : Pat<(int_hexagon_A2_tfrpi imm:$Is),
+ (A2_tfrpi (ImmExt64 $Is))>;
+
+// Assembler mapped from Rdd32=Rss32 to Rdd32=combine(Rss.H32,Rss.L32)
+def : Pat<(int_hexagon_A2_tfrp I64:$src),
+ (A2_combinew (HiReg I64:$src), (LoReg I64:$src))>;
+
+//*******************************************************************
+// ALU32/PERM
+//*******************************************************************
+// Combine
+def: T_RR_pat<A2_combine_hh, int_hexagon_A2_combine_hh>;
+def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>;
+def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>;
+def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>;
+
+def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32_0ImmPred, s8_0ImmPred>;
+
+// Mux
+def : T_QRR_pat<C2_mux, int_hexagon_C2_mux>;
+def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32_0ImmPred>;
+def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32_0ImmPred>;
+def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32_0ImmPred, s8_0ImmPred>;
+
+// Shift halfword
+def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>;
+def : T_R_pat<A2_asrh, int_hexagon_A2_asrh>;
+def : T_R_pat<A2_asrh, int_hexagon_SI_to_SXTHI_asrh>;
+
+// Sign/zero extend
+def : T_R_pat<A2_sxth, int_hexagon_A2_sxth>;
+def : T_R_pat<A2_sxtb, int_hexagon_A2_sxtb>;
+def : T_R_pat<A2_zxth, int_hexagon_A2_zxth>;
+def : T_R_pat<A2_zxtb, int_hexagon_A2_zxtb>;
+
+//*******************************************************************
+// ALU32/PRED
+//*******************************************************************
+// Compare
+def : T_Q_RR_pat<C2_cmpeq, int_hexagon_C2_cmpeq>;
+def : T_Q_RR_pat<C2_cmpgt, int_hexagon_C2_cmpgt>;
+def : T_Q_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>;
+
+def : T_Q_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s32_0ImmPred>;
+def : T_Q_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s32_0ImmPred>;
+def : T_Q_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32_0ImmPred>;
+
+def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred:$src2),
+ (C2_tfrpr (C2_cmpgti I32:$src1, (SDEC1 s32_0ImmPred:$src2)))>;
+
+def : Pat <(int_hexagon_C2_cmpgeui I32:$src1, u32_0ImmPred:$src2),
+ (C2_tfrpr (C2_cmpgtui I32:$src1, (UDEC1 u32_0ImmPred:$src2)))>;
+
+def : Pat <(int_hexagon_C2_cmpgeui I32:$src, 0),
+ (C2_tfrpr (C2_cmpeq I32:$src, I32:$src))>;
+def : Pat <(int_hexagon_C2_cmplt I32:$src1, I32:$src2),
+ (C2_tfrpr (C2_cmpgt I32:$src2, I32:$src1))>;
+def : Pat <(int_hexagon_C2_cmpltu I32:$src1, I32:$src2),
+ (C2_tfrpr (C2_cmpgtu I32:$src2, I32:$src1))>;
+
+//*******************************************************************
+// ALU32/VH
+//*******************************************************************
+// Vector add, subtract, average halfwords
+def: T_RR_pat<A2_svaddh, int_hexagon_A2_svaddh>;
+def: T_RR_pat<A2_svaddhs, int_hexagon_A2_svaddhs>;
+def: T_RR_pat<A2_svadduhs, int_hexagon_A2_svadduhs>;
+
+def: T_RR_pat<A2_svsubh, int_hexagon_A2_svsubh>;
+def: T_RR_pat<A2_svsubhs, int_hexagon_A2_svsubhs>;
+def: T_RR_pat<A2_svsubuhs, int_hexagon_A2_svsubuhs>;
+
+def: T_RR_pat<A2_svavgh, int_hexagon_A2_svavgh>;
+def: T_RR_pat<A2_svavghs, int_hexagon_A2_svavghs>;
+def: T_RR_pat<A2_svnavgh, int_hexagon_A2_svnavgh>;
+
+//*******************************************************************
+// ALU64/ALU
+//*******************************************************************
+def: T_RR_pat<A2_addsat, int_hexagon_A2_addsat>;
+def: T_RR_pat<A2_subsat, int_hexagon_A2_subsat>;
+def: T_PP_pat<A2_addp, int_hexagon_A2_addp>;
+def: T_PP_pat<A2_subp, int_hexagon_A2_subp>;
+
+def: T_PP_pat<A2_andp, int_hexagon_A2_andp>;
+def: T_PP_pat<A2_orp, int_hexagon_A2_orp>;
+def: T_PP_pat<A2_xorp, int_hexagon_A2_xorp>;
+
+def: T_Q_PP_pat<C2_cmpeqp, int_hexagon_C2_cmpeqp>;
+def: T_Q_PP_pat<C2_cmpgtp, int_hexagon_C2_cmpgtp>;
+def: T_Q_PP_pat<C2_cmpgtup, int_hexagon_C2_cmpgtup>;
+
+def: T_PP_pat<S2_parityp, int_hexagon_S2_parityp>;
+def: T_RR_pat<S2_packhl, int_hexagon_S2_packhl>;
+
+//*******************************************************************
+// ALU64/VB
+//*******************************************************************
+// ALU64 - Vector add
+def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddub>;
+def : T_PP_pat <A2_vaddubs, int_hexagon_A2_vaddubs>;
+def : T_PP_pat <A2_vaddh, int_hexagon_A2_vaddh>;
+def : T_PP_pat <A2_vaddhs, int_hexagon_A2_vaddhs>;
+def : T_PP_pat <A2_vadduhs, int_hexagon_A2_vadduhs>;
+def : T_PP_pat <A2_vaddw, int_hexagon_A2_vaddw>;
+def : T_PP_pat <A2_vaddws, int_hexagon_A2_vaddws>;
+
+// ALU64 - Vector average
+def : T_PP_pat <A2_vavgub, int_hexagon_A2_vavgub>;
+def : T_PP_pat <A2_vavgubr, int_hexagon_A2_vavgubr>;
+def : T_PP_pat <A2_vavgh, int_hexagon_A2_vavgh>;
+def : T_PP_pat <A2_vavghr, int_hexagon_A2_vavghr>;
+def : T_PP_pat <A2_vavghcr, int_hexagon_A2_vavghcr>;
+def : T_PP_pat <A2_vavguh, int_hexagon_A2_vavguh>;
+def : T_PP_pat <A2_vavguhr, int_hexagon_A2_vavguhr>;
+
+def : T_PP_pat <A2_vavgw, int_hexagon_A2_vavgw>;
+def : T_PP_pat <A2_vavgwr, int_hexagon_A2_vavgwr>;
+def : T_PP_pat <A2_vavgwcr, int_hexagon_A2_vavgwcr>;
+def : T_PP_pat <A2_vavguw, int_hexagon_A2_vavguw>;
+def : T_PP_pat <A2_vavguwr, int_hexagon_A2_vavguwr>;
+
+// ALU64 - Vector negative average
+def : T_PP_pat <A2_vnavgh, int_hexagon_A2_vnavgh>;
+def : T_PP_pat <A2_vnavghr, int_hexagon_A2_vnavghr>;
+def : T_PP_pat <A2_vnavghcr, int_hexagon_A2_vnavghcr>;
+def : T_PP_pat <A2_vnavgw, int_hexagon_A2_vnavgw>;
+def : T_PP_pat <A2_vnavgwr, int_hexagon_A2_vnavgwr>;
+def : T_PP_pat <A2_vnavgwcr, int_hexagon_A2_vnavgwcr>;
+
+// ALU64 - Vector max
+def : T_PP_pat <A2_vmaxh, int_hexagon_A2_vmaxh>;
+def : T_PP_pat <A2_vmaxw, int_hexagon_A2_vmaxw>;
+def : T_PP_pat <A2_vmaxub, int_hexagon_A2_vmaxub>;
+def : T_PP_pat <A2_vmaxuh, int_hexagon_A2_vmaxuh>;
+def : T_PP_pat <A2_vmaxuw, int_hexagon_A2_vmaxuw>;
+
+// ALU64 - Vector min
+def : T_PP_pat <A2_vminh, int_hexagon_A2_vminh>;
+def : T_PP_pat <A2_vminw, int_hexagon_A2_vminw>;
+def : T_PP_pat <A2_vminub, int_hexagon_A2_vminub>;
+def : T_PP_pat <A2_vminuh, int_hexagon_A2_vminuh>;
+def : T_PP_pat <A2_vminuw, int_hexagon_A2_vminuw>;
+
+// ALU64 - Vector sub
+def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubub>;
+def : T_PP_pat <A2_vsububs, int_hexagon_A2_vsububs>;
+def : T_PP_pat <A2_vsubh, int_hexagon_A2_vsubh>;
+def : T_PP_pat <A2_vsubhs, int_hexagon_A2_vsubhs>;
+def : T_PP_pat <A2_vsubuhs, int_hexagon_A2_vsubuhs>;
+def : T_PP_pat <A2_vsubw, int_hexagon_A2_vsubw>;
+def : T_PP_pat <A2_vsubws, int_hexagon_A2_vsubws>;
+
+// ALU64 - Vector compare bytes
+def : T_Q_PP_pat <A2_vcmpbeq, int_hexagon_A2_vcmpbeq>;
+def : T_Q_PP_pat <A4_vcmpbgt, int_hexagon_A4_vcmpbgt>;
+def : T_Q_PP_pat <A2_vcmpbgtu, int_hexagon_A2_vcmpbgtu>;
+
+// ALU64 - Vector compare halfwords
+def : T_Q_PP_pat <A2_vcmpheq, int_hexagon_A2_vcmpheq>;
+def : T_Q_PP_pat <A2_vcmphgt, int_hexagon_A2_vcmphgt>;
+def : T_Q_PP_pat <A2_vcmphgtu, int_hexagon_A2_vcmphgtu>;
+
+// ALU64 - Vector compare words
+def : T_Q_PP_pat <A2_vcmpweq, int_hexagon_A2_vcmpweq>;
+def : T_Q_PP_pat <A2_vcmpwgt, int_hexagon_A2_vcmpwgt>;
+def : T_Q_PP_pat <A2_vcmpwgtu, int_hexagon_A2_vcmpwgtu>;
+
+// ALU64 / VB / Vector mux.
+def : T_QPP_pat <C2_vmux, int_hexagon_C2_vmux>;
+
+// MPY - Multiply and use full result
+// Rdd = mpy[u](Rs, Rt)
+def : T_RR_pat <M2_dpmpyss_s0, int_hexagon_M2_dpmpyss_s0>;
+def : T_RR_pat <M2_dpmpyuu_s0, int_hexagon_M2_dpmpyuu_s0>;
+
+// Complex multiply real or imaginary
+def : T_RR_pat <M2_cmpyi_s0, int_hexagon_M2_cmpyi_s0>;
+def : T_RR_pat <M2_cmpyr_s0, int_hexagon_M2_cmpyr_s0>;
+
+// Complex multiply
+def : T_RR_pat <M2_cmpys_s0, int_hexagon_M2_cmpys_s0>;
+def : T_RR_pat <M2_cmpysc_s0, int_hexagon_M2_cmpysc_s0>;
+def : T_RR_pat <M2_cmpys_s1, int_hexagon_M2_cmpys_s1>;
+def : T_RR_pat <M2_cmpysc_s1, int_hexagon_M2_cmpysc_s1>;
+
+// Vector multiply halfwords
+// Rdd=vmpyh(Rs,Rt)[:<<1]:sat
+def : T_RR_pat <M2_vmpy2s_s0, int_hexagon_M2_vmpy2s_s0>;
+def : T_RR_pat <M2_vmpy2s_s1, int_hexagon_M2_vmpy2s_s1>;
+
+// Rxx[+-]= mpy[u](Rs,Rt)
+def : T_PRR_pat <M2_dpmpyss_acc_s0, int_hexagon_M2_dpmpyss_acc_s0>;
+def : T_PRR_pat <M2_dpmpyss_nac_s0, int_hexagon_M2_dpmpyss_nac_s0>;
+def : T_PRR_pat <M2_dpmpyuu_acc_s0, int_hexagon_M2_dpmpyuu_acc_s0>;
+def : T_PRR_pat <M2_dpmpyuu_nac_s0, int_hexagon_M2_dpmpyuu_nac_s0>;
+
+// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
+def : T_PRR_pat <M2_cmacs_s0, int_hexagon_M2_cmacs_s0>;
+def : T_PRR_pat <M2_cnacs_s0, int_hexagon_M2_cnacs_s0>;
+def : T_PRR_pat <M2_cmacs_s1, int_hexagon_M2_cmacs_s1>;
+def : T_PRR_pat <M2_cnacs_s1, int_hexagon_M2_cnacs_s1>;
+
+// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
+def : T_PRR_pat <M2_cmacsc_s0, int_hexagon_M2_cmacsc_s0>;
+def : T_PRR_pat <M2_cnacsc_s0, int_hexagon_M2_cnacsc_s0>;
+def : T_PRR_pat <M2_cmacsc_s1, int_hexagon_M2_cmacsc_s1>;
+def : T_PRR_pat <M2_cnacsc_s1, int_hexagon_M2_cnacsc_s1>;
+
+// Rxx+=cmpy[ir](Rs,Rt)
+def : T_PRR_pat <M2_cmaci_s0, int_hexagon_M2_cmaci_s0>;
+def : T_PRR_pat <M2_cmacr_s0, int_hexagon_M2_cmacr_s0>;
+
+// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
+def : T_PRR_pat <M2_vmac2, int_hexagon_M2_vmac2>;
+def : T_PRR_pat <M2_vmac2s_s0, int_hexagon_M2_vmac2s_s0>;
+def : T_PRR_pat <M2_vmac2s_s1, int_hexagon_M2_vmac2s_s1>;
+
+//*******************************************************************
+// CR
+//*******************************************************************
+def: T_Q_Q_pat<C2_not, int_hexagon_C2_not>;
+def: T_Q_Q_pat<C2_all8, int_hexagon_C2_all8>;
+def: T_Q_Q_pat<C2_any8, int_hexagon_C2_any8>;
+def: T_Q_Q_pat<C2_pxfer_map, int_hexagon_C2_pxfer_map>;
+
+def: T_Q_QQ_pat<C2_and, int_hexagon_C2_and>;
+def: T_Q_QQ_pat<C2_andn, int_hexagon_C2_andn>;
+def: T_Q_QQ_pat<C2_or, int_hexagon_C2_or>;
+def: T_Q_QQ_pat<C2_orn, int_hexagon_C2_orn>;
+def: T_Q_QQ_pat<C2_xor, int_hexagon_C2_xor>;
+
+// Multiply 32x32 and use lower result
+def : T_RRI_pat <M2_macsip, int_hexagon_M2_macsip>;
+def : T_RRI_pat <M2_macsin, int_hexagon_M2_macsin>;
+def : T_RRR_pat <M2_maci, int_hexagon_M2_maci>;
+
+// Subtract and accumulate
+def : T_RRR_pat <M2_subacc, int_hexagon_M2_subacc>;
+
+// Add and accumulate
+def : T_RRR_pat <M2_acci, int_hexagon_M2_acci>;
+def : T_RRR_pat <M2_nacci, int_hexagon_M2_nacci>;
+def : T_RRI_pat <M2_accii, int_hexagon_M2_accii>;
+def : T_RRI_pat <M2_naccii, int_hexagon_M2_naccii>;
+
+// XOR and XOR with destination
+def : T_RRR_pat <M2_xor_xacc, int_hexagon_M2_xor_xacc>;
+
+// Vector dual multiply with round and pack
+def : T_PP_pat <M2_vdmpyrs_s0, int_hexagon_M2_vdmpyrs_s0>;
+def : T_PP_pat <M2_vdmpyrs_s1, int_hexagon_M2_vdmpyrs_s1>;
+
+// Vector multiply halfwords with round and pack
+def : T_RR_pat <M2_vmpy2s_s0pack, int_hexagon_M2_vmpy2s_s0pack>;
+def : T_RR_pat <M2_vmpy2s_s1pack, int_hexagon_M2_vmpy2s_s1pack>;
+
+// Multiply and use lower result
+def : T_RR_pat <M2_mpyi, int_hexagon_M2_mpyi>;
+def : T_RI_pat <M2_mpysmi, int_hexagon_M2_mpysmi>;
+
+// Assembler mapped from Rd32=mpyui(Rs32,Rt32) to Rd32=mpyi(Rs32,Rt32)
+def : T_RR_pat <M2_mpyi, int_hexagon_M2_mpyui>;
+
+// Multiply and use upper result
+def : T_RR_pat <M2_mpy_up, int_hexagon_M2_mpy_up>;
+def : T_RR_pat <M2_mpyu_up, int_hexagon_M2_mpyu_up>;
+def : T_RR_pat <M2_hmmpyh_rs1, int_hexagon_M2_hmmpyh_rs1>;
+def : T_RR_pat <M2_hmmpyl_rs1, int_hexagon_M2_hmmpyl_rs1>;
+def : T_RR_pat <M2_dpmpyss_rnd_s0, int_hexagon_M2_dpmpyss_rnd_s0>;
+
+// Complex multiply with round and pack
+// Rd32=cmpy(Rs32,Rt32[*])[:<<1]:rnd:sat
+def : T_RR_pat <M2_cmpyrs_s0, int_hexagon_M2_cmpyrs_s0>;
+def : T_RR_pat <M2_cmpyrs_s1, int_hexagon_M2_cmpyrs_s1>;
+def : T_RR_pat <M2_cmpyrsc_s0, int_hexagon_M2_cmpyrsc_s0>;
+def : T_RR_pat <M2_cmpyrsc_s1, int_hexagon_M2_cmpyrsc_s1>;
+
+//*******************************************************************
+// STYPE/ALU
+//*******************************************************************
+def : T_P_pat <A2_absp, int_hexagon_A2_absp>;
+def : T_P_pat <A2_negp, int_hexagon_A2_negp>;
+def : T_P_pat <A2_notp, int_hexagon_A2_notp>;
+
+//*******************************************************************
+// STYPE/BIT
+//*******************************************************************
+
+// Count leading/trailing
+def: T_R_pat<S2_cl0, int_hexagon_S2_cl0>;
+def: T_P_pat<S2_cl0p, int_hexagon_S2_cl0p>;
+def: T_R_pat<S2_cl1, int_hexagon_S2_cl1>;
+def: T_P_pat<S2_cl1p, int_hexagon_S2_cl1p>;
+def: T_R_pat<S2_clb, int_hexagon_S2_clb>;
+def: T_P_pat<S2_clbp, int_hexagon_S2_clbp>;
+def: T_R_pat<S2_clbnorm, int_hexagon_S2_clbnorm>;
+def: T_R_pat<S2_ct0, int_hexagon_S2_ct0>;
+def: T_R_pat<S2_ct1, int_hexagon_S2_ct1>;
+
+// Compare bit mask
+def: T_RR_pat<C2_bitsclr, int_hexagon_C2_bitsclr>;
+def: T_RI_pat<C2_bitsclri, int_hexagon_C2_bitsclri>;
+def: T_RR_pat<C2_bitsset, int_hexagon_C2_bitsset>;
+
+// Vector shuffle
+def : T_PP_pat <S2_shuffeb, int_hexagon_S2_shuffeb>;
+def : T_PP_pat <S2_shuffob, int_hexagon_S2_shuffob>;
+def : T_PP_pat <S2_shuffeh, int_hexagon_S2_shuffeh>;
+def : T_PP_pat <S2_shuffoh, int_hexagon_S2_shuffoh>;
+
+// Vector truncate
+def : T_PP_pat <S2_vtrunewh, int_hexagon_S2_vtrunewh>;
+def : T_PP_pat <S2_vtrunowh, int_hexagon_S2_vtrunowh>;
+
+// Linear feedback-shift Iteration.
+def : T_PP_pat <S2_lfsp, int_hexagon_S2_lfsp>;
+
+// Vector align
+// Need custom lowering
+def : T_PPQ_pat <S2_valignrb, int_hexagon_S2_valignrb>;
+def : T_PPI_pat <S2_valignib, int_hexagon_S2_valignib>;
+
+// Vector splice
+def : T_PPQ_pat <S2_vsplicerb, int_hexagon_S2_vsplicerb>;
+def : T_PPI_pat <S2_vspliceib, int_hexagon_S2_vspliceib>;
+
+// Shift by immediate and add
+def : T_RRI_pat<S2_addasl_rrri, int_hexagon_S2_addasl_rrri>;
+
+// Extract bitfield
+def : T_PII_pat<S2_extractup, int_hexagon_S2_extractup>;
+def : T_RII_pat<S2_extractu, int_hexagon_S2_extractu>;
+def : T_RP_pat <S2_extractu_rp, int_hexagon_S2_extractu_rp>;
+def : T_PP_pat <S2_extractup_rp, int_hexagon_S2_extractup_rp>;
+
+// Insert bitfield
+def : Pat <(int_hexagon_S2_insert_rp I32:$src1, I32:$src2, I64:$src3),
+ (S2_insert_rp I32:$src1, I32:$src2, I64:$src3)>;
+
+def : Pat<(i64 (int_hexagon_S2_insertp_rp I64:$src1, I64:$src2, I64:$src3)),
+ (i64 (S2_insertp_rp I64:$src1, I64:$src2, I64:$src3))>;
+
+def : Pat<(int_hexagon_S2_insert I32:$src1, I32:$src2,
+ u5_0ImmPred:$src3, u5_0ImmPred:$src4),
+ (S2_insert I32:$src1, I32:$src2,
+ u5_0ImmPred:$src3, u5_0ImmPred:$src4)>;
+
+def : Pat<(i64 (int_hexagon_S2_insertp I64:$src1, I64:$src2,
+ u6_0ImmPred:$src3, u6_0ImmPred:$src4)),
+ (i64 (S2_insertp I64:$src1, I64:$src2,
+ u6_0ImmPred:$src3, u6_0ImmPred:$src4))>;
+
+// Interleave/deinterleave
+def : T_P_pat <S2_interleave, int_hexagon_S2_interleave>;
+def : T_P_pat <S2_deinterleave, int_hexagon_S2_deinterleave>;
+
+// Set/Clear/Toggle Bit
+def: T_RI_pat<S2_setbit_i, int_hexagon_S2_setbit_i>;
+def: T_RI_pat<S2_clrbit_i, int_hexagon_S2_clrbit_i>;
+def: T_RI_pat<S2_togglebit_i, int_hexagon_S2_togglebit_i>;
+
+def: T_RR_pat<S2_setbit_r, int_hexagon_S2_setbit_r>;
+def: T_RR_pat<S2_clrbit_r, int_hexagon_S2_clrbit_r>;
+def: T_RR_pat<S2_togglebit_r, int_hexagon_S2_togglebit_r>;
+
+// Test Bit
+def: T_Q_RI_pat<S2_tstbit_i, int_hexagon_S2_tstbit_i>;
+def: T_Q_RR_pat<S2_tstbit_r, int_hexagon_S2_tstbit_r>;
+
+//*******************************************************************
+// STYPE/COMPLEX
+//*******************************************************************
+// Vector Complex conjugate
+def : T_P_pat <A2_vconj, int_hexagon_A2_vconj>;
+
+// Vector Complex rotate
+def : T_PR_pat <S2_vcrotate, int_hexagon_S2_vcrotate>;
+
+//*******************************************************************
+// STYPE/PERM
+//*******************************************************************
+
+// Vector saturate without pack
+def : T_P_pat <S2_vsathb_nopack, int_hexagon_S2_vsathb_nopack>;
+def : T_P_pat <S2_vsathub_nopack, int_hexagon_S2_vsathub_nopack>;
+def : T_P_pat <S2_vsatwh_nopack, int_hexagon_S2_vsatwh_nopack>;
+def : T_P_pat <S2_vsatwuh_nopack, int_hexagon_S2_vsatwuh_nopack>;
+
+//*******************************************************************
+// STYPE/PRED
+//*******************************************************************
+
+// Predicate transfer
+def: Pat<(i32 (int_hexagon_C2_tfrpr I32:$Rs)),
+ (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>;
+def: Pat<(i32 (int_hexagon_C2_tfrrp I32:$Rs)),
+ (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>;
+
+// Mask generate from predicate
+def: Pat<(i64 (int_hexagon_C2_mask I32:$Rs)),
+ (i64 (C2_mask (C2_tfrrp I32:$Rs)))>;
+
+// Viterbi pack even and odd predicate bits
+def: T_QQ_pat<C2_vitpack, int_hexagon_C2_vitpack>;
+
+//*******************************************************************
+// STYPE/SHIFT
+//*******************************************************************
+
+def : T_PI_pat <S2_asr_i_p, int_hexagon_S2_asr_i_p>;
+def : T_PI_pat <S2_lsr_i_p, int_hexagon_S2_lsr_i_p>;
+def : T_PI_pat <S2_asl_i_p, int_hexagon_S2_asl_i_p>;
+
+def : T_PR_pat <S2_asr_r_p, int_hexagon_S2_asr_r_p>;
+def : T_PR_pat <S2_lsr_r_p, int_hexagon_S2_lsr_r_p>;
+def : T_PR_pat <S2_asl_r_p, int_hexagon_S2_asl_r_p>;
+def : T_PR_pat <S2_lsl_r_p, int_hexagon_S2_lsl_r_p>;
+
+def : T_RR_pat <S2_asr_r_r, int_hexagon_S2_asr_r_r>;
+def : T_RR_pat <S2_lsr_r_r, int_hexagon_S2_lsr_r_r>;
+def : T_RR_pat <S2_asl_r_r, int_hexagon_S2_asl_r_r>;
+def : T_RR_pat <S2_lsl_r_r, int_hexagon_S2_lsl_r_r>;
+
+def : T_RR_pat <S2_asr_r_r_sat, int_hexagon_S2_asr_r_r_sat>;
+def : T_RR_pat <S2_asl_r_r_sat, int_hexagon_S2_asl_r_r_sat>;
+
+def : T_R_pat <S2_vsxtbh, int_hexagon_S2_vsxtbh>;
+def : T_R_pat <S2_vzxtbh, int_hexagon_S2_vzxtbh>;
+def : T_R_pat <S2_vsxthw, int_hexagon_S2_vsxthw>;
+def : T_R_pat <S2_vzxthw, int_hexagon_S2_vzxthw>;
+def : T_R_pat <S2_vsplatrh, int_hexagon_S2_vsplatrh>;
+def : T_R_pat <A2_sxtw, int_hexagon_A2_sxtw>;
+
+// Vector saturate and pack
+def : T_R_pat <S2_svsathb, int_hexagon_S2_svsathb>;
+def : T_R_pat <S2_svsathub, int_hexagon_S2_svsathub>;
+def : T_P_pat <S2_vsathub, int_hexagon_S2_vsathub>;
+def : T_P_pat <S2_vsatwh, int_hexagon_S2_vsatwh>;
+def : T_P_pat <S2_vsatwuh, int_hexagon_S2_vsatwuh>;
+def : T_P_pat <S2_vsathb, int_hexagon_S2_vsathb>;
+
+def : T_P_pat <S2_vtrunohb, int_hexagon_S2_vtrunohb>;
+def : T_P_pat <S2_vtrunehb, int_hexagon_S2_vtrunehb>;
+def : T_P_pat <S2_vrndpackwh, int_hexagon_S2_vrndpackwh>;
+def : T_P_pat <S2_vrndpackwhs, int_hexagon_S2_vrndpackwhs>;
+def : T_R_pat <S2_brev, int_hexagon_S2_brev>;
+def : T_R_pat <S2_vsplatrb, int_hexagon_S2_vsplatrb>;
+
+def : T_R_pat <A2_abs, int_hexagon_A2_abs>;
+def : T_R_pat <A2_abssat, int_hexagon_A2_abssat>;
+def : T_R_pat <A2_negsat, int_hexagon_A2_negsat>;
+
+def : T_R_pat <A2_swiz, int_hexagon_A2_swiz>;
+
+def : T_P_pat <A2_sat, int_hexagon_A2_sat>;
+def : T_R_pat <A2_sath, int_hexagon_A2_sath>;
+def : T_R_pat <A2_satuh, int_hexagon_A2_satuh>;
+def : T_R_pat <A2_satub, int_hexagon_A2_satub>;
+def : T_R_pat <A2_satb, int_hexagon_A2_satb>;
+
+// Vector arithmetic shift right by immediate with truncate and pack.
+def : T_PI_pat<S2_asr_i_svw_trun, int_hexagon_S2_asr_i_svw_trun>;
+
+def : T_RI_pat <S2_asr_i_r, int_hexagon_S2_asr_i_r>;
+def : T_RI_pat <S2_lsr_i_r, int_hexagon_S2_lsr_i_r>;
+def : T_RI_pat <S2_asl_i_r, int_hexagon_S2_asl_i_r>;
+def : T_RI_pat <S2_asr_i_r_rnd, int_hexagon_S2_asr_i_r_rnd>;
+def : T_RI_pat <S2_asr_i_r_rnd_goodsyntax,
+ int_hexagon_S2_asr_i_r_rnd_goodsyntax>;
+
+// Shift left by immediate with saturation.
+def : T_RI_pat <S2_asl_i_r_sat, int_hexagon_S2_asl_i_r_sat>;
+
+//===----------------------------------------------------------------------===//
+// Template 'def pat' to map tableidx[bhwd] intrinsics to :raw instructions.
+//===----------------------------------------------------------------------===//
+class S2op_tableidx_pat <Intrinsic IntID, InstHexagon OutputInst,
+ SDNodeXForm XformImm> // XformImm: transform applied to $src4
+ : Pat <(IntID I32:$src1, I32:$src2, u4_0ImmPred:$src3, u5_0ImmPred:$src4),
+ (OutputInst I32:$src1, I32:$src2, u4_0ImmPred:$src3, // passed unchanged
+ (XformImm u5_0ImmPred:$src4))>; // only $src4 is rewritten
+
+def SDEC2 : SDNodeXForm<imm, [{
+ int32_t V = N->getSExtValue(); // immediate value, held as int32_t
+ return CurDAG->getTargetConstant(V-2, SDLoc(N), MVT::i32); // imm - 2
+}]>;
+
+def SDEC3 : SDNodeXForm<imm, [{
+ int32_t V = N->getSExtValue(); // immediate value, held as int32_t
+ return CurDAG->getTargetConstant(V-3, SDLoc(N), MVT::i32); // imm - 3
+}]>;
+
+// Table Index : Extract and insert bits.
+// Map to the real hardware instructions after subtracting appropriate
+// values from the 4th input operand. Please note that subtraction is not
+// needed for int_hexagon_S2_tableidxb_goodsyntax.
+
+def : Pat <(int_hexagon_S2_tableidxb_goodsyntax I32:$src1, I32:$src2,
+ u4_0ImmPred:$src3, u5_0ImmPred:$src4),
+ (S2_tableidxb I32:$src1, I32:$src2,
+ u4_0ImmPred:$src3, u5_0ImmPred:$src4)>;
+
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxh_goodsyntax, S2_tableidxh,
+ SDEC1>;
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxw_goodsyntax, S2_tableidxw,
+ SDEC2>;
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxd_goodsyntax, S2_tableidxd,
+ SDEC3>;
+
+//*******************************************************************
+// STYPE/VH
+//*******************************************************************
+
+// Vector absolute value halfwords with and without saturation
+// Rdd64=vabsh(Rss64)[:sat]
+def : T_P_pat <A2_vabsh, int_hexagon_A2_vabsh>;
+def : T_P_pat <A2_vabshsat, int_hexagon_A2_vabshsat>;
+
+// Vector shift halfwords by immediate
+// Rdd64=[vaslh/vasrh/vlsrh](Rss64,u4)
+def : T_PI_pat <S2_asr_i_vh, int_hexagon_S2_asr_i_vh>;
+def : T_PI_pat <S2_lsr_i_vh, int_hexagon_S2_lsr_i_vh>;
+def : T_PI_pat <S2_asl_i_vh, int_hexagon_S2_asl_i_vh>;
+
+// Vector shift halfwords by register
+// Rdd64=[vaslh/vasrh/vlslh/vlsrh](Rss64,Rt32)
+def : T_PR_pat <S2_asr_r_vh, int_hexagon_S2_asr_r_vh>;
+def : T_PR_pat <S2_lsr_r_vh, int_hexagon_S2_lsr_r_vh>;
+def : T_PR_pat <S2_asl_r_vh, int_hexagon_S2_asl_r_vh>;
+def : T_PR_pat <S2_lsl_r_vh, int_hexagon_S2_lsl_r_vh>;
+
+//*******************************************************************
+// STYPE/VW
+//*******************************************************************
+
+// Vector absolute value words with and without saturation
+def : T_P_pat <A2_vabsw, int_hexagon_A2_vabsw>;
+def : T_P_pat <A2_vabswsat, int_hexagon_A2_vabswsat>;
+
+// Vector shift words by immediate.
+// Rdd64=[vasrw/vlsrw|vaslw](Rss64,u5)
+def : T_PI_pat <S2_asr_i_vw, int_hexagon_S2_asr_i_vw>;
+def : T_PI_pat <S2_lsr_i_vw, int_hexagon_S2_lsr_i_vw>;
+def : T_PI_pat <S2_asl_i_vw, int_hexagon_S2_asl_i_vw>;
+
+// Vector shift words by register.
+// Rdd64=[vasrw/vlsrw|vaslw|vlslw](Rss64,Rt32)
+def : T_PR_pat <S2_asr_r_vw, int_hexagon_S2_asr_r_vw>;
+def : T_PR_pat <S2_lsr_r_vw, int_hexagon_S2_lsr_r_vw>;
+def : T_PR_pat <S2_asl_r_vw, int_hexagon_S2_asl_r_vw>;
+def : T_PR_pat <S2_lsl_r_vw, int_hexagon_S2_lsl_r_vw>;
+
+// Vector shift words with truncate and pack
+def : T_PR_pat <S2_asr_r_svw_trun, int_hexagon_S2_asr_r_svw_trun>;
+
+// Load/store locked.
+def : T_R_pat<L2_loadw_locked, int_hexagon_L2_loadw_locked>;
+def : T_R_pat<L4_loadd_locked, int_hexagon_L4_loadd_locked>;
+
+def : Pat<(int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt),
+ (C2_tfrpr (S2_storew_locked I32:$Rs, I32:$Rt))>;
+def : Pat<(int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt),
+ (C2_tfrpr (S4_stored_locked I32:$Rs, I64:$Rt))>;
+
+//*******************************************************************
+// ST
+//*******************************************************************
+
+class T_stb_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Val>
+ : Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru), // intrinsic operands: Rs, Rt, Ru
+ (MI I32:$Rs, I32:$Ru, Val:$Rt)>; // instruction operands: Rs, Ru, Rt
+
+def : T_stb_pat <S2_storerh_pbr, int_hexagon_brev_sth, I32>;
+def : T_stb_pat <S2_storerb_pbr, int_hexagon_brev_stb, I32>;
+def : T_stb_pat <S2_storeri_pbr, int_hexagon_brev_stw, I32>;
+def : T_stb_pat <S2_storerf_pbr, int_hexagon_brev_sthhi, I32>;
+def : T_stb_pat <S2_storerd_pbr, int_hexagon_brev_std, I64>;
+
+class T_stc_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Imm, PatLeaf Val>
+ : Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s), // intrinsic: Rs, Rt, Ru, s
+ (MI I32:$Rs, Imm:$s, I32:$Ru, Val:$Rt)>; // instruction: Rs, s, Ru, Rt
+
+def: T_stc_pat<S2_storerb_pci, int_hexagon_circ_stb, s4_0ImmPred, I32>;
+def: T_stc_pat<S2_storerh_pci, int_hexagon_circ_sth, s4_1ImmPred, I32>;
+def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw, s4_2ImmPred, I32>;
+def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std, s4_3ImmPred, I64>;
+def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
+
+include "HexagonIntrinsicsV3.td"
+include "HexagonIntrinsicsV4.td"
+include "HexagonIntrinsicsV5.td"
+include "HexagonIntrinsicsV60.td"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
new file mode 100644
index 000000000000..400c17333f73
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
@@ -0,0 +1,40 @@
+//===-- HexagonIntrinsicsDerived.td - Derived intrinsics ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Multiply 64-bit and use lower result
+//
+// Optimized using accumulating multiplies (M2_maci)
+//
+def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2),
+ (i64
+ (A2_combinew // first operand: high word, second: low word
+ (M2_maci // outer accumulate, operands $src2.lo and $src1.hi
+ (M2_maci // inner accumulate, operands $src1.lo and $src2.hi
+ (i32
+ (EXTRACT_SUBREG
+ (i64
+ (M2_dpmpyuu_s0 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+ isub_lo)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+ isub_lo)))),
+ isub_hi)), // high half of M2_dpmpyuu_s0($src1.lo, $src2.lo)
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), isub_lo)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), isub_hi))),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), isub_lo)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), isub_hi))),
+ (i32
+ (EXTRACT_SUBREG
+ (i64
+ (M2_dpmpyuu_s0
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), isub_lo)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+ isub_lo)))), isub_lo))))>; // low half of the same product
+
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td
new file mode 100644
index 000000000000..6152cb098825
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td
@@ -0,0 +1,27 @@
+//=- HexagonIntrinsicsV3.td - Target Description for Hexagon -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V3 Compiler Intrinsics in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Vector reduce complex multiply real or imaginary
+def : T_PR_pat <M2_vrcmpys_s1, int_hexagon_M2_vrcmpys_s1>;
+def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>;
+def : T_PR_pat <M2_vrcmpys_s1rp, int_hexagon_M2_vrcmpys_s1rp>;
+
+// Vector reduce add unsigned halfwords
+def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>;
+
+def: T_RP_pat<A2_addsp, int_hexagon_A2_addsp>;
+def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>;
+def: T_PP_pat<A2_minp, int_hexagon_A2_minp>;
+def: T_PP_pat<A2_minup, int_hexagon_A2_minup>;
+def: T_PP_pat<A2_maxp, int_hexagon_A2_maxp>;
+def: T_PP_pat<A2_maxup, int_hexagon_A2_maxup>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td
new file mode 100644
index 000000000000..2affe531515d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td
@@ -0,0 +1,305 @@
+//===- HexagonIntrinsicsV4.td - V4 Instruction intrinsics --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is populated based on the following specs:
+// Hexagon V4 Architecture Extensions
+// Application-Level Specification
+// 80-V9418-12 Rev. A
+// June 15, 2010
+
+// Vector reduce multiply word by signed half (32x16)
+//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>;
+def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>;
+def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>;
+
+//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>;
+
+//Rdd+=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>;
+
+// Vector multiply halfwords, signed by unsigned
+// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>;
+def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>;
+
+// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>;
+def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>;
+
+// Vector polynomial multiply halfwords
+// Rdd=vpmpyh(Rs,Rt)
+def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>;
+// Rxx^=vpmpyh(Rs,Rt)
+def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>;
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>;
+// Rxx^=pmpyw(Rs,Rt)
+def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>;
+
+//Rxx^=asr(Rss,Rt)
+def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>;
+//Rxx^=asl(Rss,Rt)
+def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>;
+//Rxx^=lsr(Rss,Rt)
+def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
+//Rxx^=lsl(Rss,Rt)
+def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
+
+// Multiply and use upper result
+def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>;
+def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>;
+def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>;
+def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>;
+def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>;
+
+def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>;
+def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>;
+
+// Vector reduce add halfwords
+def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>;
+
+def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>;
+def: T_P_pat<S2_ct0p, int_hexagon_S2_ct0p>;
+def: T_P_pat<S2_ct1p, int_hexagon_S2_ct1p>;
+
+def: T_Q_RR_pat<C4_nbitsset, int_hexagon_C4_nbitsset>;
+def: T_Q_RR_pat<C4_nbitsclr, int_hexagon_C4_nbitsclr>;
+def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
+
+def : T_Q_PI_pat<A4_vcmpbeqi, int_hexagon_A4_vcmpbeqi>;
+def : T_Q_PI_pat<A4_vcmpbgti, int_hexagon_A4_vcmpbgti>;
+def : T_Q_PI_pat<A4_vcmpbgtui, int_hexagon_A4_vcmpbgtui>;
+def : T_Q_PI_pat<A4_vcmpheqi, int_hexagon_A4_vcmpheqi>;
+def : T_Q_PI_pat<A4_vcmphgti, int_hexagon_A4_vcmphgti>;
+def : T_Q_PI_pat<A4_vcmphgtui, int_hexagon_A4_vcmphgtui>;
+def : T_Q_PI_pat<A4_vcmpweqi, int_hexagon_A4_vcmpweqi>;
+def : T_Q_PI_pat<A4_vcmpwgti, int_hexagon_A4_vcmpwgti>;
+def : T_Q_PI_pat<A4_vcmpwgtui, int_hexagon_A4_vcmpwgtui>;
+def : T_Q_PP_pat<A4_vcmpbeq_any, int_hexagon_A4_vcmpbeq_any>;
+
+def : T_Q_RR_pat<A4_cmpbeq, int_hexagon_A4_cmpbeq>;
+def : T_Q_RR_pat<A4_cmpbgt, int_hexagon_A4_cmpbgt>;
+def : T_Q_RR_pat<A4_cmpbgtu, int_hexagon_A4_cmpbgtu>;
+def : T_Q_RR_pat<A4_cmpheq, int_hexagon_A4_cmpheq>;
+def : T_Q_RR_pat<A4_cmphgt, int_hexagon_A4_cmphgt>;
+def : T_Q_RR_pat<A4_cmphgtu, int_hexagon_A4_cmphgtu>;
+
+def : T_Q_RI_pat<A4_cmpbeqi, int_hexagon_A4_cmpbeqi>;
+def : T_Q_RI_pat<A4_cmpbgti, int_hexagon_A4_cmpbgti>;
+def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
+
+def : T_Q_RI_pat<A4_cmpheqi, int_hexagon_A4_cmpheqi>;
+def : T_Q_RI_pat<A4_cmphgti, int_hexagon_A4_cmphgti>;
+def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
+
+def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>;
+def : T_Q_PR_pat<A4_tlbmatch, int_hexagon_A4_tlbmatch>;
+
+def : T_RRR_pat <M4_mpyrr_addr, int_hexagon_M4_mpyrr_addr>;
+def : T_IRR_pat <M4_mpyrr_addi, int_hexagon_M4_mpyrr_addi>;
+def : T_IRI_pat <M4_mpyri_addi, int_hexagon_M4_mpyri_addi>;
+def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
+def : T_RRI_pat <M4_mpyri_addr, int_hexagon_M4_mpyri_addr>;
+def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
+def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
+
+// Complex multiply 32x16
+def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>;
+def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>;
+
+def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>;
+def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>;
+
+def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>;
+def : T_PP_pat<A4_ornp, int_hexagon_A4_ornp>;
+
+// Complex add/sub halfwords/words
+def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>;
+def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>;
+def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>;
+def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>;
+
+def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>;
+def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>;
+
+// Extract bitfield
+def : T_PP_pat <S4_extractp_rp, int_hexagon_S4_extractp_rp>;
+def : T_RP_pat <S4_extract_rp, int_hexagon_S4_extract_rp>;
+def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>;
+def : T_RII_pat <S4_extract, int_hexagon_S4_extract>;
+
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>;
+
+// Shift an immediate left by register amount
+def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>;
+
+// Vector reduce maximum halfwords
+def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>;
+def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>;
+
+// Vector reduce maximum words
+def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>;
+def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>;
+
+// Vector reduce minimum halfwords
+def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>;
+def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>;
+
+// Vector reduce minimum words
+def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>;
+def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>;
+
+// Rotate and reduce bytes
+def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2,
+ u2_0ImmPred:$src3),
+ (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>;
+
+// Rotate and reduce bytes with accumulation
+// Rxx+=vrcrotate(Rss,Rt,#u2)
+def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+ IntRegs:$src3, u2_0ImmPred:$src4),
+ (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+ IntRegs:$src3, u2_0ImmPred:$src4)>;
+
+// Vector reduce conditional negate
+def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>;
+
+// Logical xor with xor accumulation
+def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>;
+
+// ALU64 - Vector min/max byte
+def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>;
+def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>;
+
+// Shift and add/sub/and/or
+def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>;
+def : T_IRI_pat <S4_ori_asl_ri, int_hexagon_S4_ori_asl_ri>;
+def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>;
+def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>;
+def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>;
+def : T_IRI_pat <S4_ori_lsr_ri, int_hexagon_S4_ori_lsr_ri>;
+def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>;
+def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
+
+// Split bitfield
+def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
+def : T_RR_pat <A4_bitsplit, int_hexagon_A4_bitsplit>;
+
+def: T_RR_pat<S4_parity, int_hexagon_S4_parity>;
+
+def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
+def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
+
+def: T_RI_pat<S4_clbaddi, int_hexagon_S4_clbaddi>;
+def: T_PI_pat<S4_clbpaddi, int_hexagon_S4_clbpaddi>;
+def: T_P_pat <S4_clbpnorm, int_hexagon_S4_clbpnorm>;
+
+//*******************************************************************
+// ALU32/ALU
+//*******************************************************************
+
+// ALU32 / ALU / Logical Operations.
+def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
+def: T_RR_pat<A4_orn, int_hexagon_A4_orn>;
+
+//*******************************************************************
+// ALU32/PERM
+//*******************************************************************
+
+// Combine Words Into Doublewords.
+def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32_0ImmPred>;
+def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32_0ImmPred>;
+
+//*******************************************************************
+// ALU32/PRED
+//*******************************************************************
+
+// Compare
+def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32_0ImmPred>;
+def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32_0ImmPred>;
+def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32_0ImmPred>;
+
+// Compare To General Register.
+def: T_Q_RR_pat<C4_cmpneq, int_hexagon_C4_cmpneq>;
+def: T_Q_RR_pat<C4_cmplte, int_hexagon_C4_cmplte>;
+def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>;
+
+def: T_RR_pat<A4_rcmpeq, int_hexagon_A4_rcmpeq>;
+def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
+
+def: T_RI_pat<A4_rcmpeqi, int_hexagon_A4_rcmpeqi>;
+def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
+
+//*******************************************************************
+// CR
+//*******************************************************************
+
+// CR / Logical Operations On Predicates.
+def: T_Q_QQQ_pat<C4_and_and, int_hexagon_C4_and_and>;
+def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>;
+def: T_Q_QQQ_pat<C4_and_or, int_hexagon_C4_and_or>;
+def: T_Q_QQQ_pat<C4_and_orn, int_hexagon_C4_and_orn>;
+def: T_Q_QQQ_pat<C4_or_and, int_hexagon_C4_or_and>;
+def: T_Q_QQQ_pat<C4_or_andn, int_hexagon_C4_or_andn>;
+def: T_Q_QQQ_pat<C4_or_or, int_hexagon_C4_or_or>;
+def: T_Q_QQQ_pat<C4_or_orn, int_hexagon_C4_or_orn>;
+
+//*******************************************************************
+// XTYPE/ALU
+//*******************************************************************
+
+// Add And Accumulate.
+
+def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>;
+def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>;
+
+
+// XTYPE / ALU / Logical-logical Words.
+def : T_RRR_pat <M4_or_xor, int_hexagon_M4_or_xor>;
+def : T_RRR_pat <M4_and_xor, int_hexagon_M4_and_xor>;
+def : T_RRR_pat <M4_or_and, int_hexagon_M4_or_and>;
+def : T_RRR_pat <M4_and_and, int_hexagon_M4_and_and>;
+def : T_RRR_pat <M4_xor_and, int_hexagon_M4_xor_and>;
+def : T_RRR_pat <M4_or_or, int_hexagon_M4_or_or>;
+def : T_RRR_pat <M4_and_or, int_hexagon_M4_and_or>;
+def : T_RRR_pat <M4_xor_or, int_hexagon_M4_xor_or>;
+def : T_RRR_pat <M4_or_andn, int_hexagon_M4_or_andn>;
+def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>;
+def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>;
+
+def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>;
+def : T_RRI_pat <S4_or_andix, int_hexagon_S4_or_andix>;
+def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>;
+
+// Modulo wrap.
+def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>;
+
+// Arithmetic/Convergent round
+// Rd=[cround|round](Rs,Rt)[:sat]
+// Rd=[cround|round](Rs,#u5)[:sat]
+def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>;
+def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>;
+
+def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>;
+def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>;
+
+def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>;
+def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>;
+
+def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
new file mode 100644
index 000000000000..f27a63e20e61
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -0,0 +1,111 @@
+//===- HexagonIntrinsicsV5.td - V5 Instruction intrinsics --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//Rdd[+]=vrmpybsu(Rss,Rtt)
+//Rdd[+]=vrmpybuu(Rss,Rtt)
+let Predicates = [HasV5T] in { // every pattern up to the closing brace carries the HasV5T predicate
+def : T_PP_pat <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>;
+def : T_PP_pat <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>;
+
+def : T_PP_pat <M5_vdmpybsu, int_hexagon_M5_vdmpybsu>;
+
+def : T_PPP_pat <M5_vrmacbsu, int_hexagon_M5_vrmacbsu>;
+def : T_PPP_pat <M5_vrmacbuu, int_hexagon_M5_vrmacbuu>;
+//Rxx+=vdmpybsu(Rss,Rtt):sat
+def : T_PPP_pat <M5_vdmacbsu, int_hexagon_M5_vdmacbsu>;
+
+// Vector multiply bytes
+// Rdd=vmpyb[s]u(Rs,Rt)
+def : T_RR_pat <M5_vmpybsu, int_hexagon_M5_vmpybsu>;
+def : T_RR_pat <M5_vmpybuu, int_hexagon_M5_vmpybuu>;
+
+// Rxx+=vmpyb[s]u(Rs,Rt)
+def : T_PRR_pat <M5_vmacbsu, int_hexagon_M5_vmacbsu>;
+def : T_PRR_pat <M5_vmacbuu, int_hexagon_M5_vmacbuu>;
+
+// Rd=vaddhub(Rss,Rtt):sat
+def : T_PP_pat <A5_vaddhubs, int_hexagon_A5_vaddhubs>;
+} // end let Predicates = [HasV5T]
+
+def : T_FF_pat<F2_sfadd, int_hexagon_F2_sfadd>;
+def : T_FF_pat<F2_sfsub, int_hexagon_F2_sfsub>;
+def : T_FF_pat<F2_sfmpy, int_hexagon_F2_sfmpy>;
+def : T_FF_pat<F2_sfmax, int_hexagon_F2_sfmax>;
+def : T_FF_pat<F2_sfmin, int_hexagon_F2_sfmin>;
+
+def : T_FF_pat<F2_sffixupn, int_hexagon_F2_sffixupn>;
+def : T_FF_pat<F2_sffixupd, int_hexagon_F2_sffixupd>;
+def : T_F_pat <F2_sffixupr, int_hexagon_F2_sffixupr>;
+
+def : T_Q_QQ_pat<C4_fastcorner9, int_hexagon_C4_fastcorner9>;
+def : T_Q_QQ_pat<C4_fastcorner9_not, int_hexagon_C4_fastcorner9_not>;
+
+def : T_P_pat <S5_popcountp, int_hexagon_S5_popcountp>;
+def : T_PI_pat <S5_asrhub_sat, int_hexagon_S5_asrhub_sat>;
+
+def : T_PI_pat <S2_asr_i_p_rnd, int_hexagon_S2_asr_i_p_rnd>;
+def : T_PI_pat <S2_asr_i_p_rnd_goodsyntax,
+ int_hexagon_S2_asr_i_p_rnd_goodsyntax>;
+
+def : T_PI_pat <S5_asrhub_rnd_sat_goodsyntax,
+ int_hexagon_S5_asrhub_rnd_sat_goodsyntax>;
+
+def : T_PI_pat <S5_vasrhrnd_goodsyntax, int_hexagon_S5_vasrhrnd_goodsyntax>;
+
+def : T_FFF_pat <F2_sffma, int_hexagon_F2_sffma>;
+def : T_FFF_pat <F2_sffms, int_hexagon_F2_sffms>;
+def : T_FFF_pat <F2_sffma_lib, int_hexagon_F2_sffma_lib>;
+def : T_FFF_pat <F2_sffms_lib, int_hexagon_F2_sffms_lib>;
+def : T_FFFQ_pat <F2_sffma_sc, int_hexagon_F2_sffma_sc>;
+
+// Compare floating-point value
+def : T_Q_FF_pat <F2_sfcmpge, int_hexagon_F2_sfcmpge>;
+def : T_Q_FF_pat <F2_sfcmpuo, int_hexagon_F2_sfcmpuo>;
+def : T_Q_FF_pat <F2_sfcmpeq, int_hexagon_F2_sfcmpeq>;
+def : T_Q_FF_pat <F2_sfcmpgt, int_hexagon_F2_sfcmpgt>;
+
+def : T_Q_DD_pat <F2_dfcmpeq, int_hexagon_F2_dfcmpeq>;
+def : T_Q_DD_pat <F2_dfcmpgt, int_hexagon_F2_dfcmpgt>;
+def : T_Q_DD_pat <F2_dfcmpge, int_hexagon_F2_dfcmpge>;
+def : T_Q_DD_pat <F2_dfcmpuo, int_hexagon_F2_dfcmpuo>;
+
+// Create floating-point value
+def : T_I_pat <F2_sfimm_p, int_hexagon_F2_sfimm_p>;
+def : T_I_pat <F2_sfimm_n, int_hexagon_F2_sfimm_n>;
+def : T_I_pat <F2_dfimm_p, int_hexagon_F2_dfimm_p>;
+def : T_I_pat <F2_dfimm_n, int_hexagon_F2_dfimm_n>;
+
+def : T_Q_DI_pat <F2_dfclass, int_hexagon_F2_dfclass>;
+def : T_Q_FI_pat <F2_sfclass, int_hexagon_F2_sfclass>;
+def : T_F_pat <F2_conv_sf2df, int_hexagon_F2_conv_sf2df>;
+def : T_D_pat <F2_conv_df2sf, int_hexagon_F2_conv_df2sf>;
+def : T_R_pat <F2_conv_uw2sf, int_hexagon_F2_conv_uw2sf>;
+def : T_R_pat <F2_conv_uw2df, int_hexagon_F2_conv_uw2df>;
+def : T_R_pat <F2_conv_w2sf, int_hexagon_F2_conv_w2sf>;
+def : T_R_pat <F2_conv_w2df, int_hexagon_F2_conv_w2df>;
+def : T_P_pat <F2_conv_ud2sf, int_hexagon_F2_conv_ud2sf>;
+def : T_P_pat <F2_conv_ud2df, int_hexagon_F2_conv_ud2df>;
+def : T_P_pat <F2_conv_d2sf, int_hexagon_F2_conv_d2sf>;
+def : T_P_pat <F2_conv_d2df, int_hexagon_F2_conv_d2df>;
+def : T_F_pat <F2_conv_sf2uw, int_hexagon_F2_conv_sf2uw>;
+def : T_F_pat <F2_conv_sf2w, int_hexagon_F2_conv_sf2w>;
+def : T_F_pat <F2_conv_sf2ud, int_hexagon_F2_conv_sf2ud>;
+def : T_F_pat <F2_conv_sf2d, int_hexagon_F2_conv_sf2d>;
+def : T_D_pat <F2_conv_df2uw, int_hexagon_F2_conv_df2uw>;
+def : T_D_pat <F2_conv_df2w, int_hexagon_F2_conv_df2w>;
+def : T_D_pat <F2_conv_df2ud, int_hexagon_F2_conv_df2ud>;
+def : T_D_pat <F2_conv_df2d, int_hexagon_F2_conv_df2d>;
+def : T_F_pat <F2_conv_sf2uw_chop, int_hexagon_F2_conv_sf2uw_chop>;
+def : T_F_pat <F2_conv_sf2w_chop, int_hexagon_F2_conv_sf2w_chop>;
+def : T_F_pat <F2_conv_sf2ud_chop, int_hexagon_F2_conv_sf2ud_chop>;
+def : T_F_pat <F2_conv_sf2d_chop, int_hexagon_F2_conv_sf2d_chop>;
+def : T_D_pat <F2_conv_df2uw_chop, int_hexagon_F2_conv_df2uw_chop>;
+def : T_D_pat <F2_conv_df2w_chop, int_hexagon_F2_conv_df2w_chop>;
+def : T_D_pat <F2_conv_df2ud_chop, int_hexagon_F2_conv_df2ud_chop>;
+def : T_D_pat <F2_conv_df2d_chop, int_hexagon_F2_conv_df2d_chop>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
new file mode 100644
index 000000000000..a45e1c9d7be4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -0,0 +1,803 @@
+//=- HexagonIntrinsicsV60.td - Target Description for Hexagon -*- tablegen *-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 Compiler Intrinsics in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+
+let AddedComplexity = 100 in { // applies AddedComplexity = 100 to the four lo/hi patterns below
+def : Pat < (v16i32 (int_hexagon_V6_lo (v32i32 VecDblRegs:$src1))),
+ (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), vsub_lo)) >,
+ Requires<[UseHVXSgl]>; // V6_lo on VecDblRegs => EXTRACT_SUBREG ..., vsub_lo
+
+def : Pat < (v16i32 (int_hexagon_V6_hi (v32i32 VecDblRegs:$src1))),
+ (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), vsub_hi)) >,
+ Requires<[UseHVXSgl]>; // V6_hi on VecDblRegs => EXTRACT_SUBREG ..., vsub_hi
+
+def : Pat < (v32i32 (int_hexagon_V6_lo_128B (v64i32 VecDblRegs128B:$src1))),
+ (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), vsub_lo)) >,
+ Requires<[UseHVXDbl]>; // V6_lo_128B on VecDblRegs128B => EXTRACT_SUBREG ..., vsub_lo
+
+def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 VecDblRegs128B:$src1))),
+ (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), vsub_hi)) >,
+ Requires<[UseHVXDbl]>; // V6_hi_128B on VecDblRegs128B => EXTRACT_SUBREG ..., vsub_hi
+} // end let AddedComplexity = 100
+
+def : Pat <(v512i1 (bitconvert (v16i32 VectorRegs:$src1))), // v16i32 -> v512i1
+ (v512i1 (V6_vandvrt(v16i32 VectorRegs:$src1),
+ (A2_tfrsi 0x01010101)))>, // second V6_vandvrt operand is A2_tfrsi 0x01010101
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (bitconvert (v32i16 VectorRegs:$src1))), // v32i16 -> v512i1
+ (v512i1 (V6_vandvrt(v32i16 VectorRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (bitconvert (v64i8 VectorRegs:$src1))), // v64i8 -> v512i1
+ (v512i1 (V6_vandvrt(v64i8 VectorRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (bitconvert (v8i64 VectorRegs:$src1))), // v8i64 -> v512i1
+ (v512i1 (V6_vandvrt(v8i64 VectorRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v16i32 (bitconvert (v512i1 VecPredRegs:$src1))), // v512i1 -> v16i32
+ (v16i32 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101)))>, // second V6_vandqrt operand is A2_tfrsi 0x01010101
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v32i16 (bitconvert (v512i1 VecPredRegs:$src1))), // v512i1 -> v32i16
+ (v32i16 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v64i8 (bitconvert (v512i1 VecPredRegs:$src1))), // v512i1 -> v64i8
+ (v64i8 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v8i64 (bitconvert (v512i1 VecPredRegs:$src1))), // v512i1 -> v8i64
+ (v8i64 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v1024i1 (bitconvert (v32i32 VectorRegs128B:$src1))), // v32i32 -> v1024i1
+ (v1024i1 (V6_vandvrt_128B(v32i32 VectorRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>, // second V6_vandvrt_128B operand is A2_tfrsi 0x01010101
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (bitconvert (v64i16 VectorRegs128B:$src1))), // v64i16 -> v1024i1
+ (v1024i1 (V6_vandvrt_128B(v64i16 VectorRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (bitconvert (v128i8 VectorRegs128B:$src1))), // v128i8 -> v1024i1
+ (v1024i1 (V6_vandvrt_128B(v128i8 VectorRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (bitconvert (v16i64 VectorRegs128B:$src1))), // v16i64 -> v1024i1
+ (v1024i1 (V6_vandvrt_128B(v16i64 VectorRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v32i32 (bitconvert (v1024i1 VecPredRegs128B:$src1))), // v1024i1 -> v32i32
+ (v32i32 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>, // second V6_vandqrt_128B operand is A2_tfrsi 0x01010101
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v64i16 (bitconvert (v1024i1 VecPredRegs128B:$src1))), // v1024i1 -> v64i16
+ (v64i16 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v128i8 (bitconvert (v1024i1 VecPredRegs128B:$src1))), // v1024i1 -> v128i8
+ (v128i8 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v16i64 (bitconvert (v1024i1 VecPredRegs128B:$src1))), // v1024i1 -> v16i64
+ (v16i64 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+let AddedComplexity = 140 in { // applies AddedComplexity = 140 to the predicate store/load patterns below
+def : Pat <(store (v512i1 VecPredRegs:$src1), (i32 IntRegs:$addr)), // store of a v512i1 predicate
+ (V6_vS32b_ai IntRegs:$addr, 0,
+ (v16i32 (V6_vandqrt (v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101))))>, // V6_vandqrt result (v16i32) is the value given to V6_vS32b_ai
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (load (i32 IntRegs:$addr))), // load of a v512i1 predicate
+ (v512i1 (V6_vandvrt
+ (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>, // V6_vL32b_ai result (v16i32) feeds V6_vandvrt
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(store (v1024i1 VecPredRegs128B:$src1), (i32 IntRegs:$addr)), // store of a v1024i1 predicate
+ (V6_vS32b_ai_128B IntRegs:$addr, 0,
+ (v32i32 (V6_vandqrt_128B (v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101))))>, // V6_vandqrt_128B result (v32i32) is the value given to V6_vS32b_ai_128B
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (load (i32 IntRegs:$addr))), // load of a v1024i1 predicate
+ (v1024i1 (V6_vandvrt_128B
+ (v32i32 (V6_vL32b_ai_128B IntRegs:$addr, 0)), // V6_vL32b_ai_128B result (v32i32) feeds V6_vandvrt_128B
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+} // end let AddedComplexity = 140
+
+multiclass T_R_pat <InstHexagon MI, Intrinsic IntID> { // (IntID R) => (MI R); R = IntRegs
+ def: Pat<(IntID IntRegs:$src1), (MI IntRegs:$src1)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
+ (!cast<InstHexagon>(MI#"_128B") IntRegs:$src1)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI, looked up by name via !cast
+}
+
+multiclass T_V_pat <InstHexagon MI, Intrinsic IntID> { // (IntID V) => (MI V); V = VectorRegs
+ def: Pat<(IntID VectorRegs:$src1),
+ (MI VectorRegs:$src1)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VectorRegs128B
+}
+
+multiclass T_W_pat <InstHexagon MI, Intrinsic IntID> { // (IntID W) => (MI W); W = VecDblRegs
+ def: Pat<(IntID VecDblRegs:$src1),
+ (MI VecDblRegs:$src1)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VecDblRegs128B
+}
+
+multiclass T_Q_pat <InstHexagon MI, Intrinsic IntID> { // (IntID Q) => (MI Q); Q = VecPredRegs
+ def: Pat<(IntID VecPredRegs:$src1),
+ (MI VecPredRegs:$src1)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VecPredRegs128B
+}
+
+multiclass T_WR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID W, R) => (MI W, R); W = VecDblRegs, R = IntRegs
+ def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2),
+ (MI VecDblRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")VecDblRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B")VecDblRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VecDblRegs128B
+}
+
+multiclass T_VR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID V, R) => (MI V, R); V = VectorRegs, R = IntRegs
+ def: Pat<(IntID VectorRegs:$src1, IntRegs:$src2),
+ (MI VectorRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")VectorRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B")VectorRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VectorRegs128B
+}
+
+multiclass T_WV_pat <InstHexagon MI, Intrinsic IntID> { // (IntID W, V) => (MI W, V); W = VecDblRegs, V = VectorRegs
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2),
+ (MI VecDblRegs:$src1, VectorRegs:$src2)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on the 128B register classes
+}
+
+multiclass T_WW_pat <InstHexagon MI, Intrinsic IntID> { // (IntID W, W) => (MI W, W); W = VecDblRegs
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VecDblRegs128B
+}
+
+multiclass T_VV_pat <InstHexagon MI, Intrinsic IntID> { // (IntID V, V) => (MI V, V); V = VectorRegs
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2),
+ (MI VectorRegs:$src1, VectorRegs:$src2)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VectorRegs128B
+}
+
+multiclass T_QR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID Q, R) => (MI Q, R); Q = VecPredRegs, R = IntRegs
+ def: Pat<(IntID VecPredRegs:$src1, IntRegs:$src2),
+ (MI VecPredRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VecPredRegs128B
+}
+
+multiclass T_QQ_pat <InstHexagon MI, Intrinsic IntID> { // (IntID Q, Q) => (MI Q, Q); Q = VecPredRegs
+ def: Pat<(IntID VecPredRegs:$src1, VecPredRegs:$src2),
+ (MI VecPredRegs:$src1, VecPredRegs:$src2)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ VecPredRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ VecPredRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VecPredRegs128B
+}
+
+multiclass T_WWR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID W, W, R) => (MI W, W, R); W = VecDblRegs, R = IntRegs
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VecDblRegs128B
+}
+
+multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID V, V, R) => (MI V, V, R); V = VectorRegs, R = IntRegs
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VectorRegs128B
+}
+
+multiclass T_WVR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID W, V, R) => (MI W, V, R); W = VecDblRegs, V = VectorRegs, R = IntRegs
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+ (MI VecDblRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on the 128B register classes
+}
+
+multiclass T_VWR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID V, W, R) => (MI V, W, R); V = VectorRegs, W = VecDblRegs, R = IntRegs
+ def: Pat<(IntID VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on the 128B register classes
+}
+
+multiclass T_VVV_pat <InstHexagon MI, Intrinsic IntID> { // (IntID V, V, V) => (MI V, V, V); V = VectorRegs
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VectorRegs128B
+}
+
+multiclass T_WVV_pat <InstHexagon MI, Intrinsic IntID> { // (IntID W, V, V) => (MI W, V, V); W = VecDblRegs, V = VectorRegs
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on the 128B register classes
+}
+
+multiclass T_QVV_pat <InstHexagon MI, Intrinsic IntID> { // (IntID Q, V, V) => (MI Q, V, V); Q = VecPredRegs, V = VectorRegs
+ def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ (MI VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on the 128B register classes
+}
+
+multiclass T_VQR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID V, Q, R) => (MI V, Q, R); V = VectorRegs, Q = VecPredRegs, R = IntRegs
+ def: Pat<(IntID VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VecPredRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VecPredRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on the 128B register classes
+}
+
+
+multiclass T_QVR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID Q, V, R) => (MI Q, V, R); Q = VecPredRegs, V = VectorRegs, R = IntRegs
+ def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+ (MI VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on the 128B register classes
+}
+
+multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> { // (IntID V, V, I) => (MI V, V, I); V = VectorRegs, I = imm
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, imm:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, imm:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2, imm:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2, imm:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VectorRegs128B
+}
+
+multiclass T_WRI_pat <InstHexagon MI, Intrinsic IntID> { // (IntID W, R, I) => (MI W, R, I); W = VecDblRegs, R = IntRegs, I = imm
+ def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2, imm:$src3),
+ (MI VecDblRegs:$src1, IntRegs:$src2, imm:$src3)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ IntRegs:$src2, imm:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ IntRegs:$src2, imm:$src3)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VecDblRegs128B
+}
+
+multiclass T_WWRI_pat <InstHexagon MI, Intrinsic IntID> { // (IntID W, W, R, I) => (MI W, W, R, I); W = VecDblRegs, R = IntRegs, I = imm
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3, imm:$src4),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3, imm:$src4)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VecDblRegs128B
+}
+
+multiclass T_VVVR_pat <InstHexagon MI, Intrinsic IntID> { // (IntID V, V, V, R) => (MI V, V, V, R); V = VectorRegs, R = IntRegs
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+ IntRegs:$src4),
+ (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+ IntRegs:$src4)>,
+ Requires<[UseHVXSgl]>; // unsuffixed IntID and MI
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3,
+ IntRegs:$src4),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3,
+ IntRegs:$src4)>,
+ Requires<[UseHVXDbl]>; // "_128B"-suffixed IntID and MI (via !cast) on VectorRegs128B
+}
+
+multiclass T_WVVR_pat <InstHexagon MI, Intrinsic IntID> {
+ // IntID(VecDblRegs, VectorRegs x2, IntRegs) -> MI, guarded by UseHVXSgl.
+ def: Pat<(IntID VecDblRegs:$pair, VectorRegs:$vecA,
+ VectorRegs:$vecB, IntRegs:$scalar),
+ (MI VecDblRegs:$pair, VectorRegs:$vecA,
+ VectorRegs:$vecB, IntRegs:$scalar)>,
+ Requires<[UseHVXSgl]>;
+
+ // Same mapping through the "_128B"-suffixed names, guarded by UseHVXDbl.
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")
+ VecDblRegs128B:$pair, VectorRegs128B:$vecA,
+ VectorRegs128B:$vecB, IntRegs:$scalar),
+ (!cast<InstHexagon>(MI#"_128B")
+ VecDblRegs128B:$pair, VectorRegs128B:$vecA,
+ VectorRegs128B:$vecB, IntRegs:$scalar)>,
+ Requires<[UseHVXDbl]>;
+}
+
+defm : T_WR_pat <V6_vtmpyb, int_hexagon_V6_vtmpyb>; // T_WR_pat / T_VR_pat instantiations: each pairs a V6_* instruction with the same-named int_hexagon_V6_* intrinsic
+defm : T_WR_pat <V6_vtmpybus, int_hexagon_V6_vtmpybus>;
+defm : T_VR_pat <V6_vdmpyhb, int_hexagon_V6_vdmpyhb>;
+defm : T_VR_pat <V6_vrmpyub, int_hexagon_V6_vrmpyub>;
+defm : T_VR_pat <V6_vrmpybus, int_hexagon_V6_vrmpybus>;
+defm : T_WR_pat <V6_vdsaduh, int_hexagon_V6_vdsaduh>;
+defm : T_VR_pat <V6_vdmpybus, int_hexagon_V6_vdmpybus>;
+defm : T_WR_pat <V6_vdmpybus_dv, int_hexagon_V6_vdmpybus_dv>;
+defm : T_VR_pat <V6_vdmpyhsusat, int_hexagon_V6_vdmpyhsusat>;
+defm : T_WR_pat <V6_vdmpyhsuisat, int_hexagon_V6_vdmpyhsuisat>;
+defm : T_VR_pat <V6_vdmpyhsat, int_hexagon_V6_vdmpyhsat>;
+defm : T_WR_pat <V6_vdmpyhisat, int_hexagon_V6_vdmpyhisat>;
+defm : T_WR_pat <V6_vdmpyhb_dv, int_hexagon_V6_vdmpyhb_dv>;
+defm : T_VR_pat <V6_vmpybus, int_hexagon_V6_vmpybus>;
+defm : T_WR_pat <V6_vmpabus, int_hexagon_V6_vmpabus>;
+defm : T_WR_pat <V6_vmpahb, int_hexagon_V6_vmpahb>;
+defm : T_VR_pat <V6_vmpyh, int_hexagon_V6_vmpyh>;
+defm : T_VR_pat <V6_vmpyhss, int_hexagon_V6_vmpyhss>;
+defm : T_VR_pat <V6_vmpyhsrs, int_hexagon_V6_vmpyhsrs>;
+defm : T_VR_pat <V6_vmpyuh, int_hexagon_V6_vmpyuh>;
+defm : T_VR_pat <V6_vmpyihb, int_hexagon_V6_vmpyihb>;
+defm : T_VR_pat <V6_vror, int_hexagon_V6_vror>;
+defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>;
+defm : T_VR_pat <V6_vasrh, int_hexagon_V6_vasrh>;
+defm : T_VR_pat <V6_vaslw, int_hexagon_V6_vaslw>;
+defm : T_VR_pat <V6_vaslh, int_hexagon_V6_vaslh>;
+defm : T_VR_pat <V6_vlsrw, int_hexagon_V6_vlsrw>;
+defm : T_VR_pat <V6_vlsrh, int_hexagon_V6_vlsrh>;
+defm : T_VR_pat <V6_vmpyiwh, int_hexagon_V6_vmpyiwh>;
+defm : T_VR_pat <V6_vmpyiwb, int_hexagon_V6_vmpyiwb>;
+defm : T_WR_pat <V6_vtmpyhb, int_hexagon_V6_vtmpyhb>;
+defm : T_VR_pat <V6_vmpyub, int_hexagon_V6_vmpyub>;
+
+defm : T_VV_pat <V6_vrmpyubv, int_hexagon_V6_vrmpyubv>; // T_VV_pat / T_WW_pat instantiations: each pairs a V6_* instruction with the same-named intrinsic
+defm : T_VV_pat <V6_vrmpybv, int_hexagon_V6_vrmpybv>;
+defm : T_VV_pat <V6_vrmpybusv, int_hexagon_V6_vrmpybusv>;
+defm : T_VV_pat <V6_vdmpyhvsat, int_hexagon_V6_vdmpyhvsat>;
+defm : T_VV_pat <V6_vmpybv, int_hexagon_V6_vmpybv>;
+defm : T_VV_pat <V6_vmpyubv, int_hexagon_V6_vmpyubv>;
+defm : T_VV_pat <V6_vmpybusv, int_hexagon_V6_vmpybusv>;
+defm : T_VV_pat <V6_vmpyhv, int_hexagon_V6_vmpyhv>;
+defm : T_VV_pat <V6_vmpyuhv, int_hexagon_V6_vmpyuhv>;
+defm : T_VV_pat <V6_vmpyhvsrs, int_hexagon_V6_vmpyhvsrs>;
+defm : T_VV_pat <V6_vmpyhus, int_hexagon_V6_vmpyhus>;
+defm : T_WW_pat <V6_vmpabusv, int_hexagon_V6_vmpabusv>;
+defm : T_VV_pat <V6_vmpyih, int_hexagon_V6_vmpyih>;
+defm : T_VV_pat <V6_vand, int_hexagon_V6_vand>;
+defm : T_VV_pat <V6_vor, int_hexagon_V6_vor>;
+defm : T_VV_pat <V6_vxor, int_hexagon_V6_vxor>;
+defm : T_VV_pat <V6_vaddw, int_hexagon_V6_vaddw>;
+defm : T_VV_pat <V6_vaddubsat, int_hexagon_V6_vaddubsat>;
+defm : T_VV_pat <V6_vadduhsat, int_hexagon_V6_vadduhsat>;
+defm : T_VV_pat <V6_vaddhsat, int_hexagon_V6_vaddhsat>;
+defm : T_VV_pat <V6_vaddwsat, int_hexagon_V6_vaddwsat>;
+defm : T_VV_pat <V6_vsubb, int_hexagon_V6_vsubb>;
+defm : T_VV_pat <V6_vsubh, int_hexagon_V6_vsubh>;
+defm : T_VV_pat <V6_vsubw, int_hexagon_V6_vsubw>;
+defm : T_VV_pat <V6_vsububsat, int_hexagon_V6_vsububsat>;
+defm : T_VV_pat <V6_vsubuhsat, int_hexagon_V6_vsubuhsat>;
+defm : T_VV_pat <V6_vsubhsat, int_hexagon_V6_vsubhsat>;
+defm : T_VV_pat <V6_vsubwsat, int_hexagon_V6_vsubwsat>;
+defm : T_WW_pat <V6_vaddb_dv, int_hexagon_V6_vaddb_dv>;
+defm : T_WW_pat <V6_vaddh_dv, int_hexagon_V6_vaddh_dv>;
+defm : T_WW_pat <V6_vaddw_dv, int_hexagon_V6_vaddw_dv>;
+defm : T_WW_pat <V6_vaddubsat_dv, int_hexagon_V6_vaddubsat_dv>;
+defm : T_WW_pat <V6_vadduhsat_dv, int_hexagon_V6_vadduhsat_dv>;
+defm : T_WW_pat <V6_vaddhsat_dv, int_hexagon_V6_vaddhsat_dv>;
+defm : T_WW_pat <V6_vaddwsat_dv, int_hexagon_V6_vaddwsat_dv>;
+defm : T_WW_pat <V6_vsubb_dv, int_hexagon_V6_vsubb_dv>;
+defm : T_WW_pat <V6_vsubh_dv, int_hexagon_V6_vsubh_dv>;
+defm : T_WW_pat <V6_vsubw_dv, int_hexagon_V6_vsubw_dv>;
+defm : T_WW_pat <V6_vsububsat_dv, int_hexagon_V6_vsububsat_dv>;
+defm : T_WW_pat <V6_vsubuhsat_dv, int_hexagon_V6_vsubuhsat_dv>;
+defm : T_WW_pat <V6_vsubhsat_dv, int_hexagon_V6_vsubhsat_dv>;
+defm : T_WW_pat <V6_vsubwsat_dv, int_hexagon_V6_vsubwsat_dv>;
+defm : T_VV_pat <V6_vaddubh, int_hexagon_V6_vaddubh>;
+defm : T_VV_pat <V6_vadduhw, int_hexagon_V6_vadduhw>;
+defm : T_VV_pat <V6_vaddhw, int_hexagon_V6_vaddhw>;
+defm : T_VV_pat <V6_vsububh, int_hexagon_V6_vsububh>;
+defm : T_VV_pat <V6_vsubuhw, int_hexagon_V6_vsubuhw>;
+defm : T_VV_pat <V6_vsubhw, int_hexagon_V6_vsubhw>;
+defm : T_VV_pat <V6_vabsdiffub, int_hexagon_V6_vabsdiffub>;
+defm : T_VV_pat <V6_vabsdiffh, int_hexagon_V6_vabsdiffh>;
+defm : T_VV_pat <V6_vabsdiffuh, int_hexagon_V6_vabsdiffuh>;
+defm : T_VV_pat <V6_vabsdiffw, int_hexagon_V6_vabsdiffw>;
+defm : T_VV_pat <V6_vavgub, int_hexagon_V6_vavgub>;
+defm : T_VV_pat <V6_vavguh, int_hexagon_V6_vavguh>;
+defm : T_VV_pat <V6_vavgh, int_hexagon_V6_vavgh>;
+defm : T_VV_pat <V6_vavgw, int_hexagon_V6_vavgw>;
+defm : T_VV_pat <V6_vnavgub, int_hexagon_V6_vnavgub>;
+defm : T_VV_pat <V6_vnavgh, int_hexagon_V6_vnavgh>;
+defm : T_VV_pat <V6_vnavgw, int_hexagon_V6_vnavgw>;
+defm : T_VV_pat <V6_vavgubrnd, int_hexagon_V6_vavgubrnd>;
+defm : T_VV_pat <V6_vavguhrnd, int_hexagon_V6_vavguhrnd>;
+defm : T_VV_pat <V6_vavghrnd, int_hexagon_V6_vavghrnd>;
+defm : T_VV_pat <V6_vavgwrnd, int_hexagon_V6_vavgwrnd>;
+defm : T_WW_pat <V6_vmpabuuv, int_hexagon_V6_vmpabuuv>;
+
+defm : T_VVR_pat <V6_vdmpyhb_acc, int_hexagon_V6_vdmpyhb_acc>; // T_VVR_pat instantiations for instructions whose names end in _acc
+defm : T_VVR_pat <V6_vrmpyub_acc, int_hexagon_V6_vrmpyub_acc>;
+defm : T_VVR_pat <V6_vrmpybus_acc, int_hexagon_V6_vrmpybus_acc>;
+defm : T_VVR_pat <V6_vdmpybus_acc, int_hexagon_V6_vdmpybus_acc>;
+defm : T_VVR_pat <V6_vdmpyhsusat_acc, int_hexagon_V6_vdmpyhsusat_acc>;
+defm : T_VVR_pat <V6_vdmpyhsat_acc, int_hexagon_V6_vdmpyhsat_acc>;
+defm : T_VVR_pat <V6_vmpyiwb_acc, int_hexagon_V6_vmpyiwb_acc>;
+defm : T_VVR_pat <V6_vmpyiwh_acc, int_hexagon_V6_vmpyiwh_acc>;
+defm : T_VVR_pat <V6_vmpyihb_acc, int_hexagon_V6_vmpyihb_acc>;
+defm : T_VVR_pat <V6_vaslw_acc, int_hexagon_V6_vaslw_acc>;
+defm : T_VVR_pat <V6_vasrw_acc, int_hexagon_V6_vasrw_acc>;
+
+defm : T_VWR_pat <V6_vdmpyhsuisat_acc, int_hexagon_V6_vdmpyhsuisat_acc>; // T_VWR_pat instantiations (both names end in _acc)
+defm : T_VWR_pat <V6_vdmpyhisat_acc, int_hexagon_V6_vdmpyhisat_acc>;
+
+defm : T_WVR_pat <V6_vmpybus_acc, int_hexagon_V6_vmpybus_acc>; // T_WVR_pat instantiations (all names end in _acc)
+defm : T_WVR_pat <V6_vmpyhsat_acc, int_hexagon_V6_vmpyhsat_acc>;
+defm : T_WVR_pat <V6_vmpyuh_acc, int_hexagon_V6_vmpyuh_acc>;
+defm : T_WVR_pat <V6_vmpyub_acc, int_hexagon_V6_vmpyub_acc>;
+
+defm : T_WWR_pat <V6_vtmpyb_acc, int_hexagon_V6_vtmpyb_acc>; // T_WWR_pat instantiations (all names end in _acc)
+defm : T_WWR_pat <V6_vtmpybus_acc, int_hexagon_V6_vtmpybus_acc>;
+defm : T_WWR_pat <V6_vtmpyhb_acc, int_hexagon_V6_vtmpyhb_acc>;
+defm : T_WWR_pat <V6_vdmpybus_dv_acc, int_hexagon_V6_vdmpybus_dv_acc>;
+defm : T_WWR_pat <V6_vdmpyhb_dv_acc, int_hexagon_V6_vdmpyhb_dv_acc>;
+defm : T_WWR_pat <V6_vmpabus_acc, int_hexagon_V6_vmpabus_acc>;
+defm : T_WWR_pat <V6_vmpahb_acc, int_hexagon_V6_vmpahb_acc>;
+defm : T_WWR_pat <V6_vdsaduh_acc, int_hexagon_V6_vdsaduh_acc>;
+
+defm : T_VVV_pat <V6_vdmpyhvsat_acc, int_hexagon_V6_vdmpyhvsat_acc>; // T_VVV_pat / T_WVV_pat instantiations for names ending in _acc or _sacc
+defm : T_WVV_pat <V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_acc>;
+defm : T_WVV_pat <V6_vmpybv_acc, int_hexagon_V6_vmpybv_acc>;
+defm : T_WVV_pat <V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_acc>;
+defm : T_WVV_pat <V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_acc>;
+defm : T_VVV_pat <V6_vmpyiewh_acc, int_hexagon_V6_vmpyiewh_acc>;
+defm : T_VVV_pat <V6_vmpyiewuh_acc, int_hexagon_V6_vmpyiewuh_acc>;
+defm : T_VVV_pat <V6_vmpyih_acc, int_hexagon_V6_vmpyih_acc>;
+defm : T_VVV_pat <V6_vmpyowh_rnd_sacc, int_hexagon_V6_vmpyowh_rnd_sacc>;
+defm : T_VVV_pat <V6_vmpyowh_sacc, int_hexagon_V6_vmpyowh_sacc>;
+defm : T_WVV_pat <V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_acc>;
+defm : T_WVV_pat <V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_acc>;
+defm : T_VVV_pat <V6_vrmpybusv_acc, int_hexagon_V6_vrmpybusv_acc>;
+defm : T_VVV_pat <V6_vrmpybv_acc, int_hexagon_V6_vrmpybv_acc>;
+defm : T_VVV_pat <V6_vrmpyubv_acc, int_hexagon_V6_vrmpyubv_acc>;
+
+// Compare instructions: T_QVV_pat instantiations of the veq*/vgt*/vgtu* names ending in _and, _or and _xor.
+defm : T_QVV_pat <V6_veqb_and, int_hexagon_V6_veqb_and>;
+defm : T_QVV_pat <V6_veqh_and, int_hexagon_V6_veqh_and>;
+defm : T_QVV_pat <V6_veqw_and, int_hexagon_V6_veqw_and>;
+defm : T_QVV_pat <V6_vgtb_and, int_hexagon_V6_vgtb_and>;
+defm : T_QVV_pat <V6_vgth_and, int_hexagon_V6_vgth_and>;
+defm : T_QVV_pat <V6_vgtw_and, int_hexagon_V6_vgtw_and>;
+defm : T_QVV_pat <V6_vgtub_and, int_hexagon_V6_vgtub_and>;
+defm : T_QVV_pat <V6_vgtuh_and, int_hexagon_V6_vgtuh_and>;
+defm : T_QVV_pat <V6_vgtuw_and, int_hexagon_V6_vgtuw_and>;
+defm : T_QVV_pat <V6_veqb_or, int_hexagon_V6_veqb_or>;
+defm : T_QVV_pat <V6_veqh_or, int_hexagon_V6_veqh_or>;
+defm : T_QVV_pat <V6_veqw_or, int_hexagon_V6_veqw_or>;
+defm : T_QVV_pat <V6_vgtb_or, int_hexagon_V6_vgtb_or>;
+defm : T_QVV_pat <V6_vgth_or, int_hexagon_V6_vgth_or>;
+defm : T_QVV_pat <V6_vgtw_or, int_hexagon_V6_vgtw_or>;
+defm : T_QVV_pat <V6_vgtub_or, int_hexagon_V6_vgtub_or>;
+defm : T_QVV_pat <V6_vgtuh_or, int_hexagon_V6_vgtuh_or>;
+defm : T_QVV_pat <V6_vgtuw_or, int_hexagon_V6_vgtuw_or>;
+defm : T_QVV_pat <V6_veqb_xor, int_hexagon_V6_veqb_xor>;
+defm : T_QVV_pat <V6_veqh_xor, int_hexagon_V6_veqh_xor>;
+defm : T_QVV_pat <V6_veqw_xor, int_hexagon_V6_veqw_xor>;
+defm : T_QVV_pat <V6_vgtb_xor, int_hexagon_V6_vgtb_xor>;
+defm : T_QVV_pat <V6_vgth_xor, int_hexagon_V6_vgth_xor>;
+defm : T_QVV_pat <V6_vgtw_xor, int_hexagon_V6_vgtw_xor>;
+defm : T_QVV_pat <V6_vgtub_xor, int_hexagon_V6_vgtub_xor>;
+defm : T_QVV_pat <V6_vgtuh_xor, int_hexagon_V6_vgtuh_xor>;
+defm : T_QVV_pat <V6_vgtuw_xor, int_hexagon_V6_vgtuw_xor>;
+
+defm : T_VV_pat <V6_vminub, int_hexagon_V6_vminub>; // further T_VV_pat instantiations: each pairs a V6_* instruction with the same-named intrinsic
+defm : T_VV_pat <V6_vminuh, int_hexagon_V6_vminuh>;
+defm : T_VV_pat <V6_vminh, int_hexagon_V6_vminh>;
+defm : T_VV_pat <V6_vminw, int_hexagon_V6_vminw>;
+defm : T_VV_pat <V6_vmaxub, int_hexagon_V6_vmaxub>;
+defm : T_VV_pat <V6_vmaxuh, int_hexagon_V6_vmaxuh>;
+defm : T_VV_pat <V6_vmaxh, int_hexagon_V6_vmaxh>;
+defm : T_VV_pat <V6_vmaxw, int_hexagon_V6_vmaxw>;
+defm : T_VV_pat <V6_vdelta, int_hexagon_V6_vdelta>;
+defm : T_VV_pat <V6_vrdelta, int_hexagon_V6_vrdelta>;
+defm : T_VV_pat <V6_vdealb4w, int_hexagon_V6_vdealb4w>;
+defm : T_VV_pat <V6_vmpyowh_rnd, int_hexagon_V6_vmpyowh_rnd>;
+defm : T_VV_pat <V6_vshuffeb, int_hexagon_V6_vshuffeb>;
+defm : T_VV_pat <V6_vshuffob, int_hexagon_V6_vshuffob>;
+defm : T_VV_pat <V6_vshufeh, int_hexagon_V6_vshufeh>;
+defm : T_VV_pat <V6_vshufoh, int_hexagon_V6_vshufoh>;
+defm : T_VV_pat <V6_vshufoeh, int_hexagon_V6_vshufoeh>;
+defm : T_VV_pat <V6_vshufoeb, int_hexagon_V6_vshufoeb>;
+defm : T_VV_pat <V6_vcombine, int_hexagon_V6_vcombine>;
+defm : T_VV_pat <V6_vmpyieoh, int_hexagon_V6_vmpyieoh>;
+defm : T_VV_pat <V6_vsathub, int_hexagon_V6_vsathub>;
+defm : T_VV_pat <V6_vsatwh, int_hexagon_V6_vsatwh>;
+defm : T_VV_pat <V6_vroundwh, int_hexagon_V6_vroundwh>;
+defm : T_VV_pat <V6_vroundwuh, int_hexagon_V6_vroundwuh>;
+defm : T_VV_pat <V6_vroundhb, int_hexagon_V6_vroundhb>;
+defm : T_VV_pat <V6_vroundhub, int_hexagon_V6_vroundhub>;
+defm : T_VV_pat <V6_vasrwv, int_hexagon_V6_vasrwv>;
+defm : T_VV_pat <V6_vlsrwv, int_hexagon_V6_vlsrwv>;
+defm : T_VV_pat <V6_vlsrhv, int_hexagon_V6_vlsrhv>;
+defm : T_VV_pat <V6_vasrhv, int_hexagon_V6_vasrhv>;
+defm : T_VV_pat <V6_vaslwv, int_hexagon_V6_vaslwv>;
+defm : T_VV_pat <V6_vaslhv, int_hexagon_V6_vaslhv>;
+defm : T_VV_pat <V6_vaddb, int_hexagon_V6_vaddb>;
+defm : T_VV_pat <V6_vaddh, int_hexagon_V6_vaddh>;
+defm : T_VV_pat <V6_vmpyiewuh, int_hexagon_V6_vmpyiewuh>;
+defm : T_VV_pat <V6_vmpyiowh, int_hexagon_V6_vmpyiowh>;
+defm : T_VV_pat <V6_vpackeb, int_hexagon_V6_vpackeb>;
+defm : T_VV_pat <V6_vpackeh, int_hexagon_V6_vpackeh>;
+defm : T_VV_pat <V6_vpackhub_sat, int_hexagon_V6_vpackhub_sat>;
+defm : T_VV_pat <V6_vpackhb_sat, int_hexagon_V6_vpackhb_sat>;
+defm : T_VV_pat <V6_vpackwuh_sat, int_hexagon_V6_vpackwuh_sat>;
+defm : T_VV_pat <V6_vpackwh_sat, int_hexagon_V6_vpackwh_sat>;
+defm : T_VV_pat <V6_vpackob, int_hexagon_V6_vpackob>;
+defm : T_VV_pat <V6_vpackoh, int_hexagon_V6_vpackoh>;
+defm : T_VV_pat <V6_vmpyewuh, int_hexagon_V6_vmpyewuh>;
+defm : T_VV_pat <V6_vmpyowh, int_hexagon_V6_vmpyowh>;
+
+defm : T_QVV_pat <V6_vaddbq, int_hexagon_V6_vaddbq>; // T_QVV_pat instantiations for the vadd*/vsub* names ending in q or nq
+defm : T_QVV_pat <V6_vaddhq, int_hexagon_V6_vaddhq>;
+defm : T_QVV_pat <V6_vaddwq, int_hexagon_V6_vaddwq>;
+defm : T_QVV_pat <V6_vaddbnq, int_hexagon_V6_vaddbnq>;
+defm : T_QVV_pat <V6_vaddhnq, int_hexagon_V6_vaddhnq>;
+defm : T_QVV_pat <V6_vaddwnq, int_hexagon_V6_vaddwnq>;
+defm : T_QVV_pat <V6_vsubbq, int_hexagon_V6_vsubbq>;
+defm : T_QVV_pat <V6_vsubhq, int_hexagon_V6_vsubhq>;
+defm : T_QVV_pat <V6_vsubwq, int_hexagon_V6_vsubwq>;
+defm : T_QVV_pat <V6_vsubbnq, int_hexagon_V6_vsubbnq>;
+defm : T_QVV_pat <V6_vsubhnq, int_hexagon_V6_vsubhnq>;
+defm : T_QVV_pat <V6_vsubwnq, int_hexagon_V6_vsubwnq>;
+
+defm : T_V_pat <V6_vabsh, int_hexagon_V6_vabsh>; // T_V_pat instantiations: each pairs a V6_* instruction with the same-named intrinsic
+defm : T_V_pat <V6_vabsw, int_hexagon_V6_vabsw>;
+defm : T_V_pat <V6_vabsw_sat, int_hexagon_V6_vabsw_sat>;
+defm : T_V_pat <V6_vabsh_sat, int_hexagon_V6_vabsh_sat>;
+defm : T_V_pat <V6_vnot, int_hexagon_V6_vnot>;
+defm : T_V_pat <V6_vassign, int_hexagon_V6_vassign>;
+defm : T_V_pat <V6_vzb, int_hexagon_V6_vzb>;
+defm : T_V_pat <V6_vzh, int_hexagon_V6_vzh>;
+defm : T_V_pat <V6_vsb, int_hexagon_V6_vsb>;
+defm : T_V_pat <V6_vsh, int_hexagon_V6_vsh>;
+defm : T_V_pat <V6_vdealh, int_hexagon_V6_vdealh>;
+defm : T_V_pat <V6_vdealb, int_hexagon_V6_vdealb>;
+defm : T_V_pat <V6_vunpackub, int_hexagon_V6_vunpackub>;
+defm : T_V_pat <V6_vunpackuh, int_hexagon_V6_vunpackuh>;
+defm : T_V_pat <V6_vunpackb, int_hexagon_V6_vunpackb>;
+defm : T_V_pat <V6_vunpackh, int_hexagon_V6_vunpackh>;
+defm : T_V_pat <V6_vshuffh, int_hexagon_V6_vshuffh>;
+defm : T_V_pat <V6_vshuffb, int_hexagon_V6_vshuffb>;
+defm : T_V_pat <V6_vcl0w, int_hexagon_V6_vcl0w>;
+defm : T_V_pat <V6_vpopcounth, int_hexagon_V6_vpopcounth>;
+defm : T_V_pat <V6_vcl0h, int_hexagon_V6_vcl0h>;
+defm : T_V_pat <V6_vnormamtw, int_hexagon_V6_vnormamtw>;
+defm : T_V_pat <V6_vnormamth, int_hexagon_V6_vnormamth>;
+
+defm : T_W_pat <V6_lo, int_hexagon_V6_lo>; // T_W_pat instantiations for V6_lo, V6_hi and V6_vassignp
+defm : T_W_pat <V6_hi, int_hexagon_V6_hi>;
+defm : T_W_pat <V6_vassignp, int_hexagon_V6_vassignp>;
+
+defm : T_WRI_pat <V6_vrmpybusi, int_hexagon_V6_vrmpybusi>; // T_WRI_pat instantiations (VecDblRegs, IntRegs, imm operands)
+defm : T_WRI_pat <V6_vrsadubi, int_hexagon_V6_vrsadubi>;
+defm : T_WRI_pat <V6_vrmpyubi, int_hexagon_V6_vrmpyubi>;
+
+defm : T_WWRI_pat <V6_vrmpybusi_acc, int_hexagon_V6_vrmpybusi_acc>; // T_WWRI_pat instantiations: the _acc counterparts of the three T_WRI_pat names
+defm : T_WWRI_pat <V6_vrsadubi_acc, int_hexagon_V6_vrsadubi_acc>;
+defm : T_WWRI_pat <V6_vrmpyubi_acc, int_hexagon_V6_vrmpyubi_acc>;
+
+// V6_vtran2x2 is assembler mapped, so its pattern below is left disabled.
+//defm : T_V_pat <V6_vtran2x2, int_hexagon_V6_vtran2x2>;
+// Not present earlier; the corresponding intrinsics may still need to be added (TODO confirm).
+defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignb>;
+defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignb>;
+defm : T_VVR_pat <V6_vasrwh, int_hexagon_V6_vasrwh>;
+defm : T_VVR_pat <V6_vasrwhsat, int_hexagon_V6_vasrwhsat>;
+defm : T_VVR_pat <V6_vasrwhrndsat, int_hexagon_V6_vasrwhrndsat>;
+defm : T_VVR_pat <V6_vasrwuhsat, int_hexagon_V6_vasrwuhsat>;
+defm : T_VVR_pat <V6_vasrhubsat, int_hexagon_V6_vasrhubsat>;
+defm : T_VVR_pat <V6_vasrhubrndsat, int_hexagon_V6_vasrhubrndsat>;
+defm : T_VVR_pat <V6_vasrhbrndsat, int_hexagon_V6_vasrhbrndsat>;
+
+defm : T_VVR_pat <V6_vshuffvdd, int_hexagon_V6_vshuffvdd>; // T_VVR_pat instantiations for V6_vshuffvdd / V6_vdealvdd
+defm : T_VVR_pat <V6_vdealvdd, int_hexagon_V6_vdealvdd>;
+
+defm : T_WV_pat <V6_vunpackob, int_hexagon_V6_vunpackob>; // T_WV_pat instantiations
+defm : T_WV_pat <V6_vunpackoh, int_hexagon_V6_vunpackoh>;
+defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignbi>; // T_VVI_pat instantiations (VectorRegs, VectorRegs, imm operands)
+defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignbi>;
+
+defm : T_QVV_pat <V6_vswap, int_hexagon_V6_vswap>; // T_QVV_pat instantiations for V6_vswap / V6_vmux
+defm : T_QVV_pat <V6_vmux, int_hexagon_V6_vmux>;
+defm : T_QQ_pat <V6_pred_and, int_hexagon_V6_pred_and>; // V6_pred_* instructions: T_QQ_pat, except pred_not which uses T_Q_pat
+defm : T_QQ_pat <V6_pred_or, int_hexagon_V6_pred_or>;
+defm : T_Q_pat <V6_pred_not, int_hexagon_V6_pred_not>;
+defm : T_QQ_pat <V6_pred_xor, int_hexagon_V6_pred_xor>;
+defm : T_QQ_pat <V6_pred_or_n, int_hexagon_V6_pred_or_n>;
+defm : T_QQ_pat <V6_pred_and_n, int_hexagon_V6_pred_and_n>;
+defm : T_VV_pat <V6_veqb, int_hexagon_V6_veqb>; // veq*/vgt*/vgtu* names without an _and/_or/_xor suffix use T_VV_pat
+defm : T_VV_pat <V6_veqh, int_hexagon_V6_veqh>;
+defm : T_VV_pat <V6_veqw, int_hexagon_V6_veqw>;
+defm : T_VV_pat <V6_vgtb, int_hexagon_V6_vgtb>;
+defm : T_VV_pat <V6_vgth, int_hexagon_V6_vgth>;
+defm : T_VV_pat <V6_vgtw, int_hexagon_V6_vgtw>;
+defm : T_VV_pat <V6_vgtub, int_hexagon_V6_vgtub>;
+defm : T_VV_pat <V6_vgtuh, int_hexagon_V6_vgtuh>;
+defm : T_VV_pat <V6_vgtuw, int_hexagon_V6_vgtuw>;
+
+defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>; // vandqrt / vandvrt patterns and their _acc forms, plus two T_R_pat instantiations
+defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
+defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>;
+defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>;
+defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>;
+defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>;
+
+defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>; // V6_vlutv* patterns; the _oracc forms take one extra leading operand (T_VVVR_pat / T_WVVR_pat)
+defm : T_VVR_pat <V6_vlutvwh, int_hexagon_V6_vlutvwh>;
+defm : T_VVVR_pat <V6_vlutvvb_oracc, int_hexagon_V6_vlutvvb_oracc>;
+defm : T_WVVR_pat <V6_vlutvwh_oracc, int_hexagon_V6_vlutvwh_oracc>;
+
+//defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>; // disabled: the identical instantiation already appears earlier in this file
+def : T_PI_pat <S6_rol_i_p, int_hexagon_S6_rol_i_p>; // S6_rol_i_* patterns: each pairs an S6 instruction with the same-named int_hexagon_S6_* intrinsic
+def : T_RI_pat <S6_rol_i_r, int_hexagon_S6_rol_i_r>;
+def : T_PPI_pat <S6_rol_i_p_nac, int_hexagon_S6_rol_i_p_nac>; // the _p_* forms use T_PPI_pat
+def : T_PPI_pat <S6_rol_i_p_acc, int_hexagon_S6_rol_i_p_acc>;
+def : T_PPI_pat <S6_rol_i_p_and, int_hexagon_S6_rol_i_p_and>;
+def : T_PPI_pat <S6_rol_i_p_or, int_hexagon_S6_rol_i_p_or>;
+def : T_PPI_pat <S6_rol_i_p_xacc, int_hexagon_S6_rol_i_p_xacc>;
+def : T_RRI_pat <S6_rol_i_r_nac, int_hexagon_S6_rol_i_r_nac>; // the _r_* forms use T_RRI_pat
+def : T_RRI_pat <S6_rol_i_r_acc, int_hexagon_S6_rol_i_r_acc>;
+def : T_RRI_pat <S6_rol_i_r_and, int_hexagon_S6_rol_i_r_and>;
+def : T_RRI_pat <S6_rol_i_r_or, int_hexagon_S6_rol_i_r_or>;
+def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>;
+
+defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>; // T_VR_pat instantiations for V6_extractw / V6_vinsertwr
+defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>;
+
+def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>; // single T_PPQ_pat record (def, not defm) for S2_cabacencbin
+
+def: Pat<(v64i16 (trunc v64i32:$Vdd)), // selects a generic v64i32 -> v64i16 trunc node (not an intrinsic)
+ (v64i16 (V6_vpackwh_sat_128B // NOTE(review): the chosen instruction is the _sat pack; confirm it agrees with trunc for out-of-range inputs
+ (v32i32 (V6_hi_128B VecDblRegs128B:$Vdd)), // first operand: V6_hi_128B of $Vdd
+ (v32i32 (V6_lo_128B VecDblRegs128B:$Vdd))))>, // second operand: V6_lo_128B of $Vdd
+ Requires<[UseHVXDbl]>; // only a "_128B" pattern is defined; there is no UseHVXSgl counterpart here
+
+def: Pat<(int_hexagon_V6_vd0), (V6_vd0)>; // operand-less intrinsic mapped directly to V6_vd0; no Requires<> guard is attached
+def: Pat<(int_hexagon_V6_vd0_128B), (V6_vd0_128B)>; // same for the "_128B" names
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td b/contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td
new file mode 100644
index 000000000000..ebedf2cbaf17
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td
@@ -0,0 +1,728 @@
+//=- HexagonIsetDx.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon duplex instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// SA1_combine1i: Combines. SUBInst record for "$Rdd = combine(#1, #$u2)".
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combine1i: SUBInst <
+ (outs DoubleRegs:$Rdd),
+ (ins u2_0Imm:$u2),
+ "$Rdd = combine(#1, #$u2)"> {
+ bits<3> Rdd;
+ bits<2> u2;
+
+ let Inst{12-10} = 0b111; // fixed bits, shared by all SA1_combine* records
+ let Inst{8} = 0b0; // 0 for the combineNi records; combinerz/combinezr set it to 1
+ let Inst{4-3} = 0b01; // matches the literal #1; combine0i/2i/3i use 0b00/0b10/0b11
+ let Inst{2-0} = Rdd; // destination field
+ let Inst{6-5} = u2; // 2-bit immediate field
+ }
+
+// SL2_jumpr31_f: Indirect conditional jump if false ("if (!p0) jumpr r31").
+// SL2_jumpr31_f -> SL2_jumpr31_fnew (presumably names the .new counterpart of this record -- confirm)
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31_f: SUBInst <
+ (outs ), // no explicit operands: p0 and r31 appear only in the asm string and in Uses
+ (ins ),
+ "if (!p0) jumpr r31"> {
+ let Inst{12-6} = 0b1111111; // fixed bits, shared by all SL2_jumpr31* records
+ let Inst{2-0} = 0b101; // selects this variant (tnew uses 0b110, fnew 0b111)
+ }
+
+// SL2_deallocframe: Deallocate stack frame. No explicit operands; the registers involved are listed in Defs/Uses.
+let Defs = [R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
+def SL2_deallocframe: SUBInst <
+ (outs ),
+ (ins ),
+ "deallocframe"> {
+ let Inst{12-6} = 0b1111100; // fixed bits (the SL2_return* records use 0b1111101)
+ let Inst{2} = 0b0; // only bit 2 is fixed here; bits 1-0 are left unassigned by this record
+ }
+
+// SL2_return_f: Deallocate stack frame and return, predicated on !p0 ("if (!p0) dealloc_return").
+// SL2_return_f -> SL2_return_fnew (presumably names the .new counterpart of this record -- confirm)
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return_f: SUBInst <
+ (outs ), // no explicit operands: p0 appears only in the asm string and in Uses
+ (ins ),
+ "if (!p0) dealloc_return"> {
+ let Inst{12-6} = 0b1111101; // fixed bits, shared by the SL2_return* records
+ let Inst{2-0} = 0b101; // selects this variant (SL2_return_t uses 0b100)
+ }
+
+// SA1_combine3i: Combines. SUBInst record for "$Rdd = combine(#3, #$u2)".
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combine3i: SUBInst <
+ (outs DoubleRegs:$Rdd),
+ (ins u2_0Imm:$u2),
+ "$Rdd = combine(#3, #$u2)"> {
+ bits<3> Rdd;
+ bits<2> u2;
+
+ let Inst{12-10} = 0b111; // fixed bits, shared by all SA1_combine* records
+ let Inst{8} = 0b0; // 0 for the combineNi records; combinerz/combinezr set it to 1
+ let Inst{4-3} = 0b11; // matches the literal #3; combine0i/1i/2i use 0b00/0b01/0b10
+ let Inst{2-0} = Rdd; // destination field
+ let Inst{6-5} = u2; // 2-bit immediate field
+ }
+
+// SS2_storebi0: Store byte. SUBInst record for "memb($Rs + #$u4_0)=#0" (the stored value #0 is part of the asm string, not an operand).
+let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
+def SS2_storebi0: SUBInst <
+ (outs ),
+ (ins IntRegs:$Rs, u4_0Imm:$u4_0),
+ "memb($Rs + #$u4_0)=#0"> {
+ bits<4> Rs;
+ bits<4> u4_0;
+
+ let Inst{12-8} = 0b10010; // fixed bits; SS2_storebi1 uses 0b10011
+ let Inst{7-4} = Rs; // base register field
+ let Inst{3-0} = u4_0; // all four offset bits are encoded
+ }
+
+// SA1_clrtnew: Clear if true, .new predicate form ("if ($Pu.new) $Rd = #0").
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_clrtnew: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins PredRegs:$Pu), // $Pu is printed in the asm string but has no field in Inst; Uses lists P0
+ "if ($Pu.new) $Rd = #0"> {
+ bits<4> Rd;
+
+ let Inst{12-9} = 0b1101; // fixed bits, shared with SA1_clrt, SA1_clrf and SA1_setin1
+ let Inst{6-4} = 0b100; // selects this variant (SA1_clrt 0b110, SA1_clrf 0b111)
+ let Inst{3-0} = Rd; // destination register field
+ }
+
+// SL2_loadruh_io: Load half. SUBInst record for "$Rd = memuh($Rs + #$u3_1)".
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
+def SL2_loadruh_io: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, u3_1Imm:$u3_1),
+ "$Rd = memuh($Rs + #$u3_1)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+ bits<4> u3_1; // 4-bit field of which only bits 3..1 are encoded below
+
+ let Inst{12-11} = 0b01; // fixed bits (SL2_loadrh_io uses 0b00, SL2_loadrb_io 0b10)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // base register field
+ let Inst{10-8} = u3_1{3-1}; // bit 0 of the offset is not encoded
+ }
+
+// SL2_jumpr31_tnew: Indirect conditional jump if true, .new predicate form ("if (p0.new) jumpr:nt r31").
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31_tnew: SUBInst <
+ (outs ), // no explicit operands: p0 and r31 appear only in the asm string and in Uses
+ (ins ),
+ "if (p0.new) jumpr:nt r31"> {
+ let Inst{12-6} = 0b1111111; // fixed bits, shared by all SL2_jumpr31* records
+ let Inst{2-0} = 0b110; // selects this variant (SL2_jumpr31_f uses 0b101, fnew 0b111)
+ }
+
+// SA1_addi: Add. SUBInst record for "$Rx = add($_src_, #$s7)"; operand 2 (the immediate) is marked extendable with 7 signed bits.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 1, opExtentBits = 7, opExtendable = 2 in
+def SA1_addi: SUBInst <
+ (outs IntRegs:$Rx),
+ (ins IntRegs:$_src_, s7_0Ext:$s7),
+ "$Rx = add($_src_, #$s7)" ,
+ [] , // presumably the (empty) selection-pattern list -- confirm against the SUBInst class
+ "$_src_ = $Rx"> { // presumably the constraint tying input $_src_ to output $Rx -- confirm against SUBInst
+ bits<4> Rx;
+ bits<7> s7;
+
+ let Inst{12-11} = 0b00; // fixed bits
+ let Inst{3-0} = Rx; // single register field; $_src_ has no field of its own
+ let Inst{10-4} = s7; // all seven immediate bits are encoded
+ }
+
+// SL1_loadrub_io: Load byte. SUBInst record for "$Rd = memub($Rs + #$u4_0)".
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
+def SL1_loadrub_io: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, u4_0Imm:$u4_0),
+ "$Rd = memub($Rs + #$u4_0)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+ bits<4> u4_0;
+
+ let Inst{12} = 0b1; // fixed bit (SL1_loadri_io uses 0)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // base register field
+ let Inst{11-8} = u4_0; // all four offset bits are encoded
+ }
+
+// SL1_loadri_io: Load word. SUBInst record for "$Rd = memw($Rs + #$u4_2)".
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
+def SL1_loadri_io: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, u4_2Imm:$u4_2),
+ "$Rd = memw($Rs + #$u4_2)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+ bits<6> u4_2; // 6-bit field of which only bits 5..2 are encoded below
+
+ let Inst{12} = 0b0; // fixed bit (SL1_loadrub_io uses 1)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // base register field
+ let Inst{11-8} = u4_2{5-2}; // bits 1..0 of the offset are not encoded
+ }
+
+// SA1_cmpeqi: Compare immediate. SUBInst record for "p0 = cmp.eq($Rs, #$u2)"; the result register P0 is implicit (Defs).
+let Defs = [P0], isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_cmpeqi: SUBInst <
+ (outs ), // no explicit output: p0 is named in the asm string and listed in Defs
+ (ins IntRegs:$Rs, u2_0Imm:$u2),
+ "p0 = cmp.eq($Rs, #$u2)"> {
+ bits<4> Rs;
+ bits<2> u2;
+
+ let Inst{12-8} = 0b11001; // fixed bits
+ let Inst{7-4} = Rs; // source register field
+ let Inst{1-0} = u2; // 2-bit immediate field
+ }
+
+// SA1_combinerz: Combines. SUBInst record for "$Rdd = combine($Rs, #0)" (register first, zero second).
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combinerz: SUBInst <
+ (outs DoubleRegs:$Rdd),
+ (ins IntRegs:$Rs),
+ "$Rdd = combine($Rs, #0)"> {
+ bits<3> Rdd;
+ bits<4> Rs;
+
+ let Inst{12-10} = 0b111; // fixed bits, shared by all SA1_combine* records
+ let Inst{8} = 0b1; // 1 for the register forms; the combineNi records use 0
+ let Inst{3} = 0b1; // distinguishes this record from SA1_combinezr, which uses 0
+ let Inst{2-0} = Rdd; // destination field
+ let Inst{7-4} = Rs; // source register field
+ }
+
+// SL2_return_t: Deallocate stack frame and return, predicated on p0 ("if (p0) dealloc_return").
+// SL2_return_t -> SL2_return_tnew (presumably names the .new counterpart of this record -- confirm)
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return_t: SUBInst <
+ (outs ), // no explicit operands: p0 appears only in the asm string and in Uses
+ (ins ),
+ "if (p0) dealloc_return"> {
+ let Inst{12-6} = 0b1111101; // fixed bits, shared by the SL2_return* records
+ let Inst{2-0} = 0b100; // selects this variant (SL2_return_f uses 0b101)
+ }
+
+// SS2_allocframe: Allocate stack frame. SUBInst record for "allocframe(#$u5_3)"; registers involved are listed in Defs/Uses.
+let Defs = [R29, R30], Uses = [R30, R31, R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
+def SS2_allocframe: SUBInst <
+ (outs ),
+ (ins u5_3Imm:$u5_3),
+ "allocframe(#$u5_3)"> {
+ bits<8> u5_3; // 8-bit field of which only bits 7..3 are encoded below
+
+ let Inst{12-9} = 0b1110; // fixed bits
+ let Inst{8-4} = u5_3{7-3}; // bits 2..0 of the immediate are not encoded
+ }
+
+// SS2_storeh_io: Store half. SUBInst record for "memh($Rs + #$u3_1) = $Rt".
+let isCodeGenOnly = 1, mayStore = 1, accessSize = HalfWordAccess in
+def SS2_storeh_io: SUBInst <
+ (outs ),
+ (ins IntRegs:$Rs, u3_1Imm:$u3_1, IntRegs:$Rt),
+ "memh($Rs + #$u3_1) = $Rt"> {
+ bits<4> Rs;
+ bits<4> u3_1; // 4-bit field of which only bits 3..1 are encoded below
+ bits<4> Rt;
+
+ let Inst{12-11} = 0b00; // fixed bits
+ let Inst{7-4} = Rs; // base register field
+ let Inst{10-8} = u3_1{3-1}; // bit 0 of the offset is not encoded
+ let Inst{3-0} = Rt; // stored-value register field
+ }
+
+// SS2_storewi0: Store word. SUBInst record for "memw($Rs + #$u4_2)=#0" (the stored value #0 is part of the asm string, not an operand).
+let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def SS2_storewi0: SUBInst <
+ (outs ),
+ (ins IntRegs:$Rs, u4_2Imm:$u4_2),
+ "memw($Rs + #$u4_2)=#0"> {
+ bits<4> Rs;
+ bits<6> u4_2; // 6-bit field of which only bits 5..2 are encoded below
+
+ let Inst{12-8} = 0b10000; // fixed bits; SS2_storewi1 uses 0b10001
+ let Inst{7-4} = Rs; // base register field
+ let Inst{3-0} = u4_2{5-2}; // bits 1..0 of the offset are not encoded
+ }
+
+// SS2_storewi1: Store word. SUBInst record for "memw($Rs + #$u4_2)=#1" (the stored value #1 is part of the asm string, not an operand).
+let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def SS2_storewi1: SUBInst <
+ (outs ),
+ (ins IntRegs:$Rs, u4_2Imm:$u4_2),
+ "memw($Rs + #$u4_2)=#1"> {
+ bits<4> Rs;
+ bits<6> u4_2; // 6-bit field of which only bits 5..2 are encoded below
+
+ let Inst{12-8} = 0b10001; // fixed bits; SS2_storewi0 uses 0b10000
+ let Inst{7-4} = Rs; // base register field
+ let Inst{3-0} = u4_2{5-2}; // bits 1..0 of the offset are not encoded
+ }
+
+// SL2_jumpr31: Indirect jump through r31 ("jumpr r31"); this form is unconditional -- it is not marked isPredicated and does not use P0.
+let Defs = [PC], Uses = [R31], isCodeGenOnly = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31: SUBInst <
+ (outs ), // no explicit operands: r31 appears only in the asm string and in Uses
+ (ins ),
+ "jumpr r31"> {
+ let Inst{12-6} = 0b1111111; // fixed bits, shared by all SL2_jumpr31* records
+ let Inst{2} = 0b0; // the predicated _f/_tnew/_fnew records all have bit 2 set
+ }
+
+// SA1_combinezr: Combines. SUBInst record for "$Rdd = combine(#0, $Rs)" (zero first, register second).
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combinezr: SUBInst <
+ (outs DoubleRegs:$Rdd),
+ (ins IntRegs:$Rs),
+ "$Rdd = combine(#0, $Rs)"> {
+ bits<3> Rdd;
+ bits<4> Rs;
+
+ let Inst{12-10} = 0b111; // fixed bits, shared by all SA1_combine* records
+ let Inst{8} = 0b1; // 1 for the register forms; the combineNi records use 0
+ let Inst{3} = 0b0; // distinguishes this record from SA1_combinerz, which uses 1
+ let Inst{2-0} = Rdd; // destination field
+ let Inst{7-4} = Rs; // source register field
+ }
+
+// SL2_loadrh_io: Load half. SUBInst record for "$Rd = memh($Rs + #$u3_1)".
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
+def SL2_loadrh_io: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, u3_1Imm:$u3_1),
+ "$Rd = memh($Rs + #$u3_1)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+ bits<4> u3_1; // 4-bit field of which only bits 3..1 are encoded below
+
+ let Inst{12-11} = 0b00; // fixed bits (SL2_loadruh_io uses 0b01, SL2_loadrb_io 0b10)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // base register field
+ let Inst{10-8} = u3_1{3-1}; // bit 0 of the offset is not encoded
+ }
+
+// SA1_addrx: Add. SUBInst record for "$Rx = add($_src_, $Rs)".
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_addrx: SUBInst <
+ (outs IntRegs:$Rx),
+ (ins IntRegs:$_src_, IntRegs:$Rs),
+ "$Rx = add($_src_, $Rs)" ,
+ [] , // presumably the (empty) selection-pattern list -- confirm against the SUBInst class
+ "$_src_ = $Rx"> { // presumably the constraint tying input $_src_ to output $Rx -- confirm against SUBInst
+ bits<4> Rx;
+ bits<4> Rs;
+
+ let Inst{12-8} = 0b11000; // fixed bits
+ let Inst{3-0} = Rx; // single field for $Rx; $_src_ has no field of its own
+ let Inst{7-4} = Rs; // second source register field
+ }
+
+// SA1_setin1: Set to -1. SUBInst record for "$Rd = #{-1}"; there are no input operands.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_setin1: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins ),
+ "$Rd = #{-1}"> {
+ bits<4> Rd;
+
+ let Inst{12-9} = 0b1101; // fixed bits, shared with the SA1_clr* records
+ let Inst{6} = 0b0; // the SA1_clr* records all have bit 6 set
+ let Inst{3-0} = Rd; // destination register field
+ }
+
+// SA1_sxth: Sxth. SUBInst record for "$Rd = sxth($Rs)".
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_sxth: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs),
+ "$Rd = sxth($Rs)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+
+ let Inst{12-8} = 0b10100; // fixed bits (SA1_sxtb uses 0b10101)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // source register field
+ }
+
+// SA1_combine0i: Combines. SUBInst record for "$Rdd = combine(#0, #$u2)".
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combine0i: SUBInst <
+ (outs DoubleRegs:$Rdd),
+ (ins u2_0Imm:$u2),
+ "$Rdd = combine(#0, #$u2)"> {
+ bits<3> Rdd;
+ bits<2> u2;
+
+ let Inst{12-10} = 0b111; // fixed bits, shared by all SA1_combine* records
+ let Inst{8} = 0b0; // 0 for the combineNi records; combinerz/combinezr set it to 1
+ let Inst{4-3} = 0b00; // matches the literal #0; combine1i/2i/3i use 0b01/0b10/0b11
+ let Inst{2-0} = Rdd; // destination field
+ let Inst{6-5} = u2; // 2-bit immediate field
+ }
+
+// SA1_combine2i: Combines. SUBInst record for "$Rdd = combine(#2, #$u2)".
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combine2i: SUBInst <
+ (outs DoubleRegs:$Rdd),
+ (ins u2_0Imm:$u2),
+ "$Rdd = combine(#2, #$u2)"> {
+ bits<3> Rdd;
+ bits<2> u2;
+
+ let Inst{12-10} = 0b111; // fixed bits, shared by all SA1_combine* records
+ let Inst{8} = 0b0; // 0 for the combineNi records; combinerz/combinezr set it to 1
+ let Inst{4-3} = 0b10; // matches the literal #2; combine0i/1i/3i use 0b00/0b01/0b11
+ let Inst{2-0} = Rdd; // destination field
+ let Inst{6-5} = u2; // 2-bit immediate field
+ }
+
+// SA1_sxtb: Sxtb. SUBInst record for "$Rd = sxtb($Rs)".
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_sxtb: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs),
+ "$Rd = sxtb($Rs)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+
+ let Inst{12-8} = 0b10101; // fixed bits (SA1_sxth uses 0b10100)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // source register field
+ }
+
+// SA1_clrf: Clear if false ("if (!$Pu) $Rd = #0").
+// SA1_clrf -> SA1_clrfnew (presumably names the .new counterpart of this record -- confirm)
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_clrf: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins PredRegs:$Pu), // $Pu is printed in the asm string but has no field in Inst; Uses lists P0
+ "if (!$Pu) $Rd = #0"> {
+ bits<4> Rd;
+
+ let Inst{12-9} = 0b1101; // fixed bits, shared with SA1_clrt, SA1_clrtnew and SA1_setin1
+ let Inst{6-4} = 0b111; // selects this variant (SA1_clrt 0b110, SA1_clrtnew 0b100)
+ let Inst{3-0} = Rd; // destination register field
+ }
+
+// SL2_loadrb_io: Load byte. SUBInst record for "$Rd = memb($Rs + #$u3_0)".
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
+def SL2_loadrb_io: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, u3_0Imm:$u3_0),
+ "$Rd = memb($Rs + #$u3_0)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+ bits<3> u3_0;
+
+ let Inst{12-11} = 0b10; // fixed bits (SL2_loadrh_io uses 0b00, SL2_loadruh_io 0b01)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // base register field
+ let Inst{10-8} = u3_0; // all three offset bits are encoded
+ }
+
+// SA1_tfr: Tfr. SUBInst record for the register copy "$Rd = $Rs".
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_tfr: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs),
+ "$Rd = $Rs"> {
+ bits<4> Rd;
+ bits<4> Rs;
+
+ let Inst{12-8} = 0b10000; // fixed bits (SA1_inc uses 0b10001, SA1_and1 0b10010)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // source register field
+ }
+
+// SL2_loadrd_sp: Load dword. SUBInst record for "$Rdd = memd(r29 + #$u5_3)"; the base register r29 is implicit (Uses).
+let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
+def SL2_loadrd_sp: SUBInst <
+ (outs DoubleRegs:$Rdd),
+ (ins u5_3Imm:$u5_3),
+ "$Rdd = memd(r29 + #$u5_3)"> {
+ bits<3> Rdd;
+ bits<8> u5_3; // 8-bit field of which only bits 7..3 are encoded below
+
+ let Inst{12-8} = 0b11110; // fixed bits
+ let Inst{2-0} = Rdd; // destination field
+ let Inst{7-3} = u5_3{7-3}; // bits 2..0 of the offset are not encoded
+ }
+
+// SA1_and1: And #1. SUBInst record for "$Rd = and($Rs, #1)" (the constant #1 is part of the asm string, not an operand).
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_and1: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs),
+ "$Rd = and($Rs, #1)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+
+ let Inst{12-8} = 0b10010; // fixed bits (SA1_tfr uses 0b10000, SA1_inc 0b10001)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // source register field
+ }
+
+// SS2_storebi1: Store byte. SUBInst record for "memb($Rs + #$u4_0)=#1" (the stored value #1 is part of the asm string, not an operand).
+let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
+def SS2_storebi1: SUBInst <
+ (outs ),
+ (ins IntRegs:$Rs, u4_0Imm:$u4_0),
+ "memb($Rs + #$u4_0)=#1"> {
+ bits<4> Rs;
+ bits<4> u4_0;
+
+ let Inst{12-8} = 0b10011; // fixed bits; SS2_storebi0 uses 0b10010
+ let Inst{7-4} = Rs; // base register field
+ let Inst{3-0} = u4_0; // all four offset bits are encoded
+ }
+
+// SA1_inc: Inc. SUBInst record for "$Rd = add($Rs, #1)" (the constant #1 is part of the asm string, not an operand).
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_inc: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs),
+ "$Rd = add($Rs, #1)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+
+ let Inst{12-8} = 0b10001; // fixed bits (SA1_tfr uses 0b10000, SA1_and1 0b10010)
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // source register field
+ }
+
+// SS2_stored_sp: Store dword. Stores register pair $Rtt to memd(r29 + #$s6_3); r29 is implicit (Uses).
+let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
+def SS2_stored_sp: SUBInst <
+ (outs ),
+ (ins s6_3Imm:$s6_3, DoubleRegs:$Rtt),
+ "memd(r29 + #$s6_3) = $Rtt"> {
+ bits<9> s6_3;
+ bits<3> Rtt;
+
+ let Inst{12-9} = 0b0101; // fixed encoding bits
+ let Inst{8-3} = s6_3{8-3}; // only offset bits 8-3 are encoded; bits 2-0 are not stored
+ let Inst{2-0} = Rtt; // source register-pair field
+ }
+
+// SS2_storew_sp: Store word. Stores $Rt to memw(r29 + #$u5_2); r29 is implicit (Uses).
+let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def SS2_storew_sp: SUBInst <
+ (outs ),
+ (ins u5_2Imm:$u5_2, IntRegs:$Rt),
+ "memw(r29 + #$u5_2) = $Rt"> {
+ bits<7> u5_2;
+ bits<4> Rt;
+
+ let Inst{12-9} = 0b0100; // fixed encoding bits
+ let Inst{8-4} = u5_2{6-2}; // only offset bits 6-2 are encoded; bits 1-0 are not stored
+ let Inst{3-0} = Rt; // source register field
+ }
+
+// SL2_jumpr31_fnew: Indirect conditional jump if false. No explicit operands; P0 and R31 are listed in Uses.
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31_fnew: SUBInst <
+ (outs ),
+ (ins ),
+ "if (!p0.new) jumpr:nt r31"> {
+ let Inst{12-6} = 0b1111111; // fixed encoding bits
+ let Inst{2-0} = 0b111; // fixed encoding bits; bits 5-3 are not assigned in this def
+ }
+
+// SA1_clrt: Clear if true. Sets $Rd to #0 when predicate $Pu is true.
+// SA1_clrt -> SA1_clrtnew
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_clrt: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins PredRegs:$Pu),
+ "if ($Pu) $Rd = #0"> {
+ bits<4> Rd; // $Pu gets no encoding field in this def; Uses lists P0
+
+ let Inst{12-9} = 0b1101; // fixed encoding bits
+ let Inst{6-4} = 0b110; // fixed encoding bits
+ let Inst{3-0} = Rd; // destination register field
+ }
+
+// SL2_return: Deallocate stack frame and return. No explicit operands; registers touched are listed in Defs/Uses.
+let Defs = [PC, R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return: SUBInst <
+ (outs ),
+ (ins ),
+ "dealloc_return"> {
+ let Inst{12-6} = 0b1111101; // fixed encoding bits
+ let Inst{2} = 0b0; // only bit 2 is fixed here; bits 1-0 are not assigned in this def
+ }
+
+// SA1_dec: Dec. Computes $Rd = add($Rs, #-1).
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_dec: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs),
+ "$Rd = add($Rs,#{-1})"> {
+ bits<4> Rd;
+ bits<4> Rs;
+
+ let Inst{12-8} = 0b10011; // fixed encoding bits
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // source register field
+ }
+
+// SA1_seti: Set immed. Sets $Rd to the immediate #$u6; operand 1 is marked extendable (opExtentBits = 6, unsigned).
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 0, opExtentBits = 6, opExtendable = 1 in
+def SA1_seti: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins u6_0Ext:$u6),
+ "$Rd = #$u6"> {
+ bits<4> Rd;
+ bits<6> u6;
+
+ let Inst{12-10} = 0b010; // fixed encoding bits
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{9-4} = u6; // immediate field, all six bits encoded
+ }
+
+// SL2_jumpr31_t: Indirect conditional jump if true. No explicit operands; P0 and R31 are listed in Uses.
+// SL2_jumpr31_t -> SL2_jumpr31_tnew
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31_t: SUBInst <
+ (outs ),
+ (ins ),
+ "if (p0) jumpr r31"> {
+ let Inst{12-6} = 0b1111111; // fixed encoding bits
+ let Inst{2-0} = 0b100; // fixed encoding bits; bits 5-3 are not assigned in this def
+ }
+
+// SA1_clrfnew: Clear if false. Sets $Rd to #0 when the new value of predicate $Pu is false.
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_clrfnew: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins PredRegs:$Pu),
+ "if (!$Pu.new) $Rd = #0"> {
+ bits<4> Rd; // $Pu gets no encoding field in this def; Uses lists P0
+
+ let Inst{12-9} = 0b1101; // fixed encoding bits
+ let Inst{6-4} = 0b101; // fixed encoding bits
+ let Inst{3-0} = Rd; // destination register field
+ }
+
+// SS1_storew_io: Store word. Stores $Rt to memw($Rs + #$u4_2).
+let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def SS1_storew_io: SUBInst <
+ (outs ),
+ (ins IntRegs:$Rs, u4_2Imm:$u4_2, IntRegs:$Rt),
+ "memw($Rs + #$u4_2) = $Rt"> {
+ bits<4> Rs;
+ bits<6> u4_2;
+ bits<4> Rt;
+
+ let Inst{12} = 0b0; // fixed encoding bit (SS1_storeb_io sets this bit to 1)
+ let Inst{7-4} = Rs; // base register field
+ let Inst{11-8} = u4_2{5-2}; // only offset bits 5-2 are encoded; bits 1-0 are not stored
+ let Inst{3-0} = Rt; // source register field
+ }
+
+// SA1_zxtb: Zxtb. Computes $Rd = and($Rs, #255).
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_zxtb: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs),
+ "$Rd = and($Rs, #255)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+
+ let Inst{12-8} = 0b10111; // fixed encoding bits
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // source register field
+ }
+
+// SA1_addsp: Add. Computes $Rd = add(r29, #$u6_2); r29 is implicit (Uses).
+let Uses = [R29], isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_addsp: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins u6_2Imm:$u6_2),
+ "$Rd = add(r29, #$u6_2)"> {
+ bits<4> Rd;
+ bits<8> u6_2;
+
+ let Inst{12-10} = 0b011; // fixed encoding bits
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{9-4} = u6_2{7-2}; // only immediate bits 7-2 are encoded; bits 1-0 are not stored
+ }
+
+// SL2_loadri_sp: Load word. Loads $Rd from memw(r29 + #$u5_2); r29 is implicit (Uses).
+let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
+def SL2_loadri_sp: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins u5_2Imm:$u5_2),
+ "$Rd = memw(r29 + #$u5_2)"> {
+ bits<4> Rd;
+ bits<7> u5_2;
+
+ let Inst{12-9} = 0b1110; // fixed encoding bits
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{8-4} = u5_2{6-2}; // only offset bits 6-2 are encoded; bits 1-0 are not stored
+ }
+
+// SS1_storeb_io: Store byte. Stores $Rt to memb($Rs + #$u4_0).
+let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
+def SS1_storeb_io: SUBInst <
+ (outs ),
+ (ins IntRegs:$Rs, u4_0Imm:$u4_0, IntRegs:$Rt),
+ "memb($Rs + #$u4_0) = $Rt"> {
+ bits<4> Rs;
+ bits<4> u4_0;
+ bits<4> Rt;
+
+ let Inst{12} = 0b1; // fixed encoding bit (SS1_storew_io sets this bit to 0)
+ let Inst{7-4} = Rs; // base register field
+ let Inst{11-8} = u4_0; // offset field, all four bits encoded
+ let Inst{3-0} = Rt; // source register field
+ }
+
+// SL2_return_tnew: Deallocate stack frame and return, predicated on p0.new being true. No explicit operands.
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return_tnew: SUBInst <
+ (outs ),
+ (ins ),
+ "if (p0.new) dealloc_return:nt"> {
+ let Inst{12-6} = 0b1111101; // fixed encoding bits, same value as SL2_return
+ let Inst{2-0} = 0b110; // fixed encoding bits (SL2_return_fnew uses 0b111)
+ }
+
+// SL2_return_fnew: Deallocate stack frame and return, predicated on p0.new being false. No explicit operands.
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return_fnew: SUBInst <
+ (outs ),
+ (ins ),
+ "if (!p0.new) dealloc_return:nt"> {
+ let Inst{12-6} = 0b1111101; // fixed encoding bits, same value as SL2_return
+ let Inst{2-0} = 0b111; // fixed encoding bits (SL2_return_tnew uses 0b110)
+ }
+
+// SA1_zxth: Zxth. Computes $Rd = zxth($Rs).
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_zxth: SUBInst <
+ (outs IntRegs:$Rd),
+ (ins IntRegs:$Rs),
+ "$Rd = zxth($Rs)"> {
+ bits<4> Rd;
+ bits<4> Rs;
+
+ let Inst{12-8} = 0b10110; // fixed encoding bits
+ let Inst{3-0} = Rd; // destination register field
+ let Inst{7-4} = Rs; // source register field
+ }
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
new file mode 100644
index 000000000000..a5dc002642c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -0,0 +1,170 @@
+//===- HexagonMCInstLower.cpp - Convert Hexagon MachineInstr to an MCInst -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Hexagon MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonAsmPrinter.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+namespace llvm { // Forward declaration only; the definition appears later in this file.
+ void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+ MCInst &MCB, HexagonAsmPrinter &AP);
+} // end namespace llvm
+
+static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+ HexagonAsmPrinter &Printer, bool MustExtend) { // Returns an expression operand for Symbol (plus MO's offset), wrapped in a HexagonMCExpr.
+ MCContext &MC = Printer.OutContext;
+ const MCExpr *ME;
+
+ // Populate the relocation type based on Hexagon target flags
+ // set on an operand
+ MCSymbolRefExpr::VariantKind RelocationType;
+ switch (MO.getTargetFlags()) { // NOTE(review): compares the whole flag word; the caller also tests HMOTF_ConstExtended on it - confirm the values cannot combine
+ default: // any flag value not listed below maps to VK_None
+ RelocationType = MCSymbolRefExpr::VK_None;
+ break;
+ case HexagonII::MO_PCREL:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL;
+ break;
+ case HexagonII::MO_GOT:
+ RelocationType = MCSymbolRefExpr::VK_GOT;
+ break;
+ case HexagonII::MO_LO16:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_LO16;
+ break;
+ case HexagonII::MO_HI16:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_HI16;
+ break;
+ case HexagonII::MO_GPREL:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL;
+ break;
+ case HexagonII::MO_GDGOT:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_GD_GOT;
+ break;
+ case HexagonII::MO_GDPLT:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_GD_PLT;
+ break;
+ case HexagonII::MO_IE:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_IE;
+ break;
+ case HexagonII::MO_IEGOT:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_IE_GOT;
+ break;
+ case HexagonII::MO_TPREL:
+ RelocationType = MCSymbolRefExpr::VK_TPREL;
+ break;
+ }
+
+ ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC);
+
+ if (!MO.isJTI() && MO.getOffset()) // jump-table operands never get an addend; a zero offset needs none
+ ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC),
+ MC);
+
+ ME = HexagonMCExpr::create(ME, MC); // wrap so the MustExtend flag can be recorded on the expression
+ HexagonMCInstrInfo::setMustExtend(*ME, MustExtend);
+ return MCOperand::createExpr(ME);
+}
+
+// Create an MCInst from a MachineInstr and append it to MCB as an instruction operand.
+void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+ MCInst &MCB, HexagonAsmPrinter &AP) {
+ if (MI->getOpcode() == Hexagon::ENDLOOP0) { // ENDLOOP0 only marks MCB; nothing is appended
+ HexagonMCInstrInfo::setInnerLoop(MCB);
+ return;
+ }
+ if (MI->getOpcode() == Hexagon::ENDLOOP1) { // ENDLOOP1 only marks MCB; nothing is appended
+ HexagonMCInstrInfo::setOuterLoop(MCB);
+ return;
+ }
+ MCInst *MCI = new (AP.OutContext) MCInst; // placement-new taking the MCContext as its argument
+ MCI->setOpcode(MI->getOpcode());
+ assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) &&
+ "MCI opcode should have been set on construction");
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) {
+ const MachineOperand &MO = MI->getOperand(i);
+ MCOperand MCO;
+ bool MustExtend = MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended;
+
+ switch (MO.getType()) {
+ default: // operand kinds without a case below are treated as a fatal error
+ MI->dump();
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit()) continue;
+ MCO = MCOperand::createReg(MO.getReg());
+ break;
+ case MachineOperand::MO_FPImmediate: {
+ APFloat Val = MO.getFPImm()->getValueAPF();
+ // FP immediates are used only when setting GPRs, so they may be dealt
+ // with like regular immediates from this point on.
+ auto Expr = HexagonMCExpr::create(
+ MCConstantExpr::create(*Val.bitcastToAPInt().getRawData(), // first word of the value's bit pattern
+ AP.OutContext),
+ AP.OutContext);
+ HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+ MCO = MCOperand::createExpr(Expr);
+ break;
+ }
+ case MachineOperand::MO_Immediate: {
+ auto Expr = HexagonMCExpr::create(
+ MCConstantExpr::create(MO.getImm(), AP.OutContext), AP.OutContext);
+ HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+ MCO = MCOperand::createExpr(Expr);
+ break;
+ }
+ case MachineOperand::MO_MachineBasicBlock: {
+ MCExpr const *Expr = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(),
+ AP.OutContext);
+ Expr = HexagonMCExpr::create(Expr, AP.OutContext);
+ HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+ MCO = MCOperand::createExpr(Expr);
+ break;
+ }
+ case MachineOperand::MO_GlobalAddress: // this and the remaining cases are symbolic and share GetSymbolRef
+ MCO = GetSymbolRef(MO, AP.getSymbol(MO.getGlobal()), AP, MustExtend);
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ MCO = GetSymbolRef(MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()),
+ AP, MustExtend);
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ MCO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, MustExtend);
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, MustExtend);
+ break;
+ case MachineOperand::MO_BlockAddress:
+ MCO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP,
+ MustExtend);
+ break;
+ }
+
+ MCI->addOperand(MCO);
+ }
+ AP.HexagonProcessInstruction(*MCI, *MI); // runs before extendIfNeeded and before MCI is attached to MCB
+ HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI);
+ MCB.addOperand(MCOperand::createInst(MCI));
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..9579c8b6df16
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
@@ -0,0 +1,16 @@
+//= HexagonMachineFunctionInfo.cpp - Hexagon machine function info *- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineFunctionInfo.h"
+
+using namespace llvm;
+
+// Out-of-line, empty definition of the virtual anchor(); it exists to pin the class's vtable to this file.
+void HexagonMachineFunctionInfo::anchor() {}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
new file mode 100644
index 000000000000..371b52108b9b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -0,0 +1,80 @@
+//=- HexagonMachineFunctionInfo.h - Hexagon machine function info -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include <map>
+
+namespace llvm {
+
+ namespace Hexagon { // Bit flags stored per instruction in HexagonMachineFunctionInfo::PacketInfo.
+ const unsigned int StartPacket = 0x1; // OR-ed in by setStartPacket(), tested by isStartPacket()
+ const unsigned int EndPacket = 0x2; // OR-ed in by setEndPacket(), tested by isEndPacket()
+ }
+
+
+/// Hexagon target-specific information for each MachineFunction.
+class HexagonMachineFunctionInfo : public MachineFunctionInfo {
+  // SRetReturnReg - Some subtargets require that sret lowering includes
+  // returning the value of the returned struct in a register. This field
+  // holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg = 0;
+  unsigned StackAlignBaseVReg = 0;    // Aligned-stack base register (virtual)
+  unsigned StackAlignBasePhysReg = 0; //                             (physical)
+  int VarArgsFrameIndex = 0; // was left uninitialized by both constructors
+  bool HasClobberLR = false;
+  bool HasEHReturn = false;
+  std::map<const MachineInstr*, unsigned> PacketInfo; // StartPacket/EndPacket bits
+  virtual void anchor();
+
+public:
+  HexagonMachineFunctionInfo() = default;
+  // MF is accepted to keep the existing constructor signature; it is not used.
+  HexagonMachineFunctionInfo(MachineFunction &MF) {}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  void setVarArgsFrameIndex(int v) { VarArgsFrameIndex = v; }
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+
+  // Packet markers: the setters create the map entry on demand, while the
+  // queries do a single lookup and report false for unknown instructions.
+  void setStartPacket(MachineInstr* MI) {
+    PacketInfo[MI] |= Hexagon::StartPacket;
+  }
+  void setEndPacket(MachineInstr* MI) {
+    PacketInfo[MI] |= Hexagon::EndPacket;
+  }
+  bool isStartPacket(const MachineInstr* MI) const {
+    auto It = PacketInfo.find(MI);
+    return It != PacketInfo.end() && (It->second & Hexagon::StartPacket);
+  }
+  bool isEndPacket(const MachineInstr* MI) const {
+    auto It = PacketInfo.find(MI);
+    return It != PacketInfo.end() && (It->second & Hexagon::EndPacket);
+  }
+
+  void setHasClobberLR(bool v) { HasClobberLR = v; }
+  bool hasClobberLR() const { return HasClobberLR; }
+
+  bool hasEHReturn() const { return HasEHReturn; }
+  void setHasEHReturn(bool H = true) { HasEHReturn = H; }
+
+  void setStackAlignBaseVReg(unsigned R) { StackAlignBaseVReg = R; }
+  unsigned getStackAlignBaseVReg() const { return StackAlignBaseVReg; }
+
+  void setStackAlignBasePhysReg(unsigned R) { StackAlignBasePhysReg = R; }
+  unsigned getStackAlignBasePhysReg() const { return StackAlignBasePhysReg; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
new file mode 100644
index 000000000000..9ff9d93ea0c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -0,0 +1,1031 @@
+//===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineScheduler.h"
+#include "HexagonSubtarget.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/IR/Function.h"
+
+#include <iomanip>
+#include <sstream>
+
+static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false)); // hidden flag, defaults to false
+
+static cl::opt<bool> SchedPredsCloser("sched-preds-closer",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true)); // gates the A2_tfrpi/call binding in HexagonCallMutation::apply
+
+static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level",
+ cl::Hidden, cl::ZeroOrMore, cl::init(1)); // hidden option, defaults to 1
+
+static cl::opt<bool> TopUseShorterTie("top-use-shorter-tie",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false)); // hidden flag, defaults to false
+
+static cl::opt<bool> BotUseShorterTie("bot-use-shorter-tie",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false)); // hidden flag, defaults to false
+
+static cl::opt<bool> DisableTCTie("disable-tc-tie",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false)); // hidden flag, defaults to false
+
+static cl::opt<bool> SchedRetvalOptimization("sched-retval-optimization",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true)); // gates the return-value COPY handling in HexagonCallMutation::apply
+
+// Check if the scheduler should penalize instructions that are available too
+// early due to a zero-latency dependence.
+static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden,
+ cl::ZeroOrMore, cl::init(true));
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+namespace {
+class HexagonCallMutation : public ScheduleDAGMutation { // DAG mutation registered by ConvergingVLIWScheduler::initialize
+public:
+ void apply(ScheduleDAGInstrs *DAG) override; // adds Barrier edges to DAG's SUnits
+private:
+ bool shouldTFRICallBind(const HexagonInstrInfo &HII,
+ const SUnit &Inst1, const SUnit &Inst2) const; // helper used by apply()
+};
+} // end anonymous namespace
+
+// Check if a call and subsequent A2_tfrpi instructions should maintain
+// scheduling affinity. We are looking for the TFRI to be consumed in
+// the next instruction. This should help reduce the instances of
+// double register pairs being allocated and scheduled before a call
+// when not used until after the call. This situation is exacerbated
+// by the fact that we allocate the pair from the callee saves list,
+// leading to excess spills and restores.
+bool HexagonCallMutation::shouldTFRICallBind(const HexagonInstrInfo &HII,
+ const SUnit &Inst1, const SUnit &Inst2) const { // Inst1: candidate A2_tfrpi; Inst2: the SUnit that follows it
+ if (Inst1.getInstr()->getOpcode() != Hexagon::A2_tfrpi)
+ return false;
+
+ // TypeXTYPE are 64 bit operations.
+ if (HII.getType(*Inst2.getInstr()) == HexagonII::TypeXTYPE) // only Inst2's type is checked, not whether it reads Inst1's result
+ return true;
+ return false;
+}
+
+/// Walk DAG->SUnits in order and add Barrier predecessors around calls.
+void HexagonCallMutation::apply(ScheduleDAGInstrs *DAG) {
+  SUnit* LastSequentialCall = nullptr;
+  unsigned VRegHoldingRet = 0;
+  unsigned RetRegister = 0; // only read after the COPY case below has set it
+  SUnit* LastUseOfRet = nullptr;
+  auto &TRI = *DAG->MF.getSubtarget().getRegisterInfo();
+  auto &HII = *DAG->MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+  // Currently we only catch the situation when compare gets scheduled
+  // before preceding call.
+  for (unsigned su = 0, e = DAG->SUnits.size(); su != e; ++su) {
+    // Remember the call.
+    if (DAG->SUnits[su].getInstr()->isCall())
+      LastSequentialCall = &DAG->SUnits[su];
+    // Look for a compare that defines a predicate.
+    else if (DAG->SUnits[su].getInstr()->isCompare() && LastSequentialCall)
+      DAG->SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
+    // Look for call and tfri* instructions.
+    else if (SchedPredsCloser && LastSequentialCall && su > 1 && su < e-1 &&
+             shouldTFRICallBind(HII, DAG->SUnits[su], DAG->SUnits[su+1]))
+      DAG->SUnits[su].addPred(SDep(&DAG->SUnits[su-1], SDep::Barrier));
+    // Prevent redundant register copies between two calls, which are caused by
+    // both the return value and the argument for the next call being in %R0.
+    // Example:
+    //  1: <call1>
+    //  2: %VregX = COPY %R0
+    //  3: <use of %VregX>
+    //  4: %R0 = ...
+    //  5: <call2>
+    // The scheduler would often swap 3 and 4, so an additional register is
+    // needed. This code inserts a Barrier dependence between 3 & 4 to prevent
+    // this. The same applies for %D0 and %V0/%W0, which are also handled.
+    else if (SchedRetvalOptimization) {
+      const MachineInstr *MI = DAG->SUnits[su].getInstr();
+      if (MI->isCopy() && (MI->readsRegister(Hexagon::R0, &TRI) ||
+                           MI->readsRegister(Hexagon::V0, &TRI))) {
+        // %vregX = COPY %R0
+        VRegHoldingRet = MI->getOperand(0).getReg();
+        RetRegister = MI->getOperand(1).getReg();
+        LastUseOfRet = nullptr;
+      } else if (VRegHoldingRet && MI->readsVirtualRegister(VRegHoldingRet))
+        // <use of %vregX>
+        LastUseOfRet = &DAG->SUnits[su];
+      else if (LastUseOfRet && MI->definesRegister(RetRegister, &TRI))
+        // %R0 = ...
+        DAG->SUnits[su].addPred(SDep(LastUseOfRet, SDep::Barrier));
+    }
+  }
+}
+
+
+/// Save the last formed packet by copying Packet into OldPacket.
+void VLIWResourceModel::savePacket() {
+ OldPacket = Packet; // reserveResources() calls this right before each Packet.clear()
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+/// It is _not_ precise (stateful), it is more like
+/// another heuristic. Many corner cases are figured
+/// empirically.
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
+ if (!SU || !SU->getInstr())
+ return false;
+
+ // First see if the pipeline could receive this instruction
+ // in the current cycle.
+ switch (SU->getInstr()->getOpcode()) {
+ default:
+ if (!ResourcesModel->canReserveResources(*SU->getInstr()))
+ return false; // otherwise control intentionally falls through to the break below
+ case TargetOpcode::EXTRACT_SUBREG: // the opcodes listed here skip the canReserveResources() query
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM: // NOTE(review): reserveResources() also exempts KILL, CFI_INSTRUCTION and EH_LABEL - confirm the lists are meant to differ
+ break;
+ }
+
+ MachineFunction &MF = *SU->getInstr()->getParent()->getParent();
+ auto &QII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+
+ // Now see if there are no other dependencies to instructions already
+ // in the packet.
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+ if (Packet[i]->Succs.size() == 0)
+ continue;
+
+ // Enable .cur formation.
+ if (QII.mayBeCurLoad(*Packet[i]->getInstr()))
+ continue;
+
+ for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
+ E = Packet[i]->Succs.end(); I != E; ++I) {
+ // Since we do not add pseudos to packets, might as well
+ // ignore order dependencies.
+ if (I->isCtrl())
+ continue;
+
+ if (I->getSUnit() == SU) // SU is a non-control successor of a packet member: it cannot join this packet
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Keep track of available resources. Adds SU to the current packet; returns true if a new packet was started before or after adding it. A null SU only closes the current packet and returns false.
+bool VLIWResourceModel::reserveResources(SUnit *SU) {
+ bool startNewCycle = false;
+ // Artificially reset state.
+ if (!SU) {
+ ResourcesModel->clearResources();
+ savePacket();
+ Packet.clear();
+ TotalPackets++;
+ return false;
+ }
+ // If this SU does not fit in the packet
+ // start a new one.
+ if (!isResourceAvailable(SU)) {
+ ResourcesModel->clearResources();
+ savePacket();
+ Packet.clear();
+ TotalPackets++;
+ startNewCycle = true;
+ }
+
+ switch (SU->getInstr()->getOpcode()) {
+ default:
+ ResourcesModel->reserveResources(*SU->getInstr());
+ break;
+ case TargetOpcode::EXTRACT_SUBREG: // the opcodes listed here are added to Packet without reserving resources
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM:
+ break;
+ }
+ Packet.push_back(SU);
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+ DEBUG(dbgs() << "\t[" << i << "] SU(");
+ DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+ DEBUG(Packet[i]->getInstr()->dump());
+ }
+#endif
+
+ // If packet is now full, reset the state so in the next cycle
+ // we start fresh.
+ if (Packet.size() >= SchedModel->getIssueWidth()) { // "full" means the packet holds at least getIssueWidth() SUnits
+ ResourcesModel->clearResources();
+ savePacket();
+ Packet.clear();
+ TotalPackets++;
+ startNewCycle = true;
+ }
+
+ return startNewCycle;
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+void VLIWMachineScheduler::schedule() {
+ DEBUG(dbgs()
+ << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber()
+ << " " << BB->getName()
+ << " in_func " << BB->getParent()->getFunction()->getName()
+ << " at loop depth " << MLI->getLoopDepth(BB)
+ << " \n");
+
+ buildDAGWithRegPressure();
+
+ SmallVector<SUnit*, 8> TopRoots, BotRoots;
+ findRootsAndBiasEdges(TopRoots, BotRoots);
+
+ // Initialize the strategy before modifying the DAG.
+ SchedImpl->initialize(this);
+ // Debug output only: maximum SUnit height, maximum depth, then a dump of every SUnit.
+ DEBUG(unsigned maxH = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ if (SUnits[su].getHeight() > maxH)
+ maxH = SUnits[su].getHeight();
+ dbgs() << "Max Height " << maxH << "\n";);
+ DEBUG(unsigned maxD = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ if (SUnits[su].getDepth() > maxD)
+ maxD = SUnits[su].getDepth();
+ dbgs() << "Max Depth " << maxD << "\n";);
+ DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ SUnits[su].dumpAll(this));
+
+ initQueues(TopRoots, BotRoots);
+
+ bool IsTopNode = false;
+ while (true) { // main loop: pick a node, schedule it, update the queues, notify the strategy
+ DEBUG(dbgs() << "** VLIWMachineScheduler::schedule picking next node\n");
+ SUnit *SU = SchedImpl->pickNode(IsTopNode);
+ if (!SU) break; // the strategy returned no node
+
+ if (!checkSchedLimit())
+ break; // stop early when checkSchedLimit() fails
+
+ scheduleMI(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
+
+ // Notify the scheduling strategy after updating the DAG.
+ SchedImpl->schedNode(SU, IsTopNode);
+ }
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ DEBUG({
+ unsigned BBNum = begin()->getParent()->getNumber();
+ dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
+
+void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) { // Rebinds the strategy to dag and rebuilds both boundaries' helper objects.
+ DAG = static_cast<VLIWMachineScheduler*>(dag); // unchecked downcast: callers must pass a VLIWMachineScheduler
+ SchedModel = DAG->getSchedModel();
+
+ Top.init(DAG, SchedModel);
+ Bot.init(DAG, SchedModel);
+
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+ // are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries();
+ const TargetSubtargetInfo &STI = DAG->MF.getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ delete Top.HazardRec; // the recognizers are owned through raw pointers: free the old ones first
+ delete Bot.HazardRec;
+ Top.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG);
+ Bot.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+ delete Top.ResourceModel; // same raw-pointer ownership for the resource models
+ delete Bot.ResourceModel;
+ Top.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
+ Bot.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
+
+ assert((!llvm::ForceTopDown || !llvm::ForceBottomUp) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+ // NOTE(review): every initialize() call appends both mutations again, and VLIWMachineScheduler::schedule() builds the DAG before calling initialize() - confirm this is intended.
+ DAG->addMutation(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+ DAG->addMutation(make_unique<HexagonCallMutation>());
+}
+
+void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) { // Computes SU's TopReadyCycle from its predecessors and hands SU to the Top boundary.
+ if (SU->isScheduled)
+ return;
+
+ for (const SDep &PI : SU->Preds) {
+ unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle;
+ unsigned MinLatency = PI.getLatency();
+#ifndef NDEBUG
+ Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency); // debug-only: largest latency seen, used by pickOnlyChoice()'s assert
+#endif
+ if (SU->TopReadyCycle < PredReadyCycle + MinLatency) // keep the maximum over all predecessors
+ SU->TopReadyCycle = PredReadyCycle + MinLatency;
+ }
+ Top.releaseNode(SU, SU->TopReadyCycle);
+}
+
+void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) { // Computes SU's BotReadyCycle from its successors and hands SU to the Bot boundary.
+ if (SU->isScheduled)
+ return;
+
+ assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+ unsigned MinLatency = I->getLatency();
+#ifndef NDEBUG
+ Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency); // debug-only: largest latency seen, used by pickOnlyChoice()'s assert
+#endif
+ if (SU->BotReadyCycle < SuccReadyCycle + MinLatency) // keep the maximum over all successors
+ SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+ }
+ Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) {
+ if (HazardRec->isEnabled()) // first mode: defer entirely to the hazard recognizer
+ return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+ unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+ if (IssueCount + uops > SchedModel->getIssueWidth()) // second mode: SU's micro-ops would exceed the issue width
+ return true;
+
+ return false;
+}
+
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode(SUnit *SU,
+ unsigned ReadyCycle) { // Queues SU as Available when it can issue now, otherwise as Pending.
+ if (ReadyCycle < MinReadyCycle) // track the earliest ready cycle seen
+ MinReadyCycle = ReadyCycle;
+
+ // Check for interlocks first. For the purpose of other heuristics, an
+ // instruction that cannot issue appears as if it's not in the ReadyQueue.
+ if (ReadyCycle > CurrCycle || checkHazard(SU))
+
+ Pending.push(SU);
+ else
+ Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() {
+ unsigned Width = SchedModel->getIssueWidth();
+ IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width; // carry any excess over the issue width into the next cycle
+
+ assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+ unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); // advance at least one cycle, or jump to the earliest ready cycle
+
+ if (!HazardRec->isEnabled()) {
+ // Bypass HazardRec virtual calls.
+ CurrCycle = NextCycle;
+ } else {
+ // Bypass getHazardType calls in case of long latency.
+ for (; CurrCycle != NextCycle; ++CurrCycle) { // step the recognizer once per skipped cycle
+ if (isTop())
+ HazardRec->AdvanceCycle();
+ else
+ HazardRec->RecedeCycle();
+ }
+ }
+ CheckPending = true; // pickOnlyChoice() will call releasePending() because of this
+
+ DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle "
+ << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) {
+ bool startNewCycle = false;
+
+ // Update the reservation table.
+ if (HazardRec->isEnabled()) {
+ if (!isTop() && SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
+ }
+ HazardRec->EmitInstruction(SU);
+ }
+
+ // Update DFA model.
+ startNewCycle = ResourceModel->reserveResources(SU); // true when the resource model started a new packet
+
+ // Check the instruction group dispatch limit.
+ // TODO: Check if this SU must end a dispatch group.
+ IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
+ if (startNewCycle) {
+ DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+ bumpCycle(); // bumpCycle() also reduces IssueCount by the issue width
+ }
+ else
+ DEBUG(dbgs() << "*** IssueCount " << IssueCount
+ << " at cycle " << CurrCycle << '\n');
+}
+
+/// Release pending ready nodes in to the available queue. This makes them
+/// visible to heuristics.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() {
+ // If the available queue is empty, it is safe to reset MinReadyCycle.
+ if (Available.empty())
+ MinReadyCycle = UINT_MAX;
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+ SUnit *SU = *(Pending.begin()+i);
+ unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+ if (ReadyCycle < MinReadyCycle) // recompute the minimum over all pending nodes
+ MinReadyCycle = ReadyCycle;
+
+ if (ReadyCycle > CurrCycle) // not ready yet: stays pending
+ continue;
+
+ if (checkHazard(SU)) // ready but hazardous: stays pending
+ continue;
+
+ Available.push(SU);
+ Pending.remove(Pending.begin()+i);
+ --i; --e; // revisit slot i and shrink the bound; if i was 0 the unsigned wrap is undone by the loop's ++i
+ }
+ CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary. SU is expected to be in
+/// either the available or the pending queue.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) {
+  if (!Available.isInQueue(SU)) {
+    // Not available, so it has to be pending.
+    assert(Pending.isInQueue(SU) && "bad ready count");
+    Pending.remove(Pending.find(SU));
+    return;
+  }
+  Available.remove(Available.find(SU));
+}
+
+/// Return the sole ready candidate of this boundary, or NULL when several
+/// instructions are ready. As a side effect, the cycle is advanced until at
+/// least one node is available.
+SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
+  if (CheckPending)
+    releasePending();
+
+  // Keep closing empty packets and moving to the next cycle until something
+  // shows up in the available queue.
+  unsigned Attempts = 0;
+  while (Available.empty()) {
+    assert(Attempts <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+           "permanent hazard");
+    (void)Attempts;
+    ResourceModel->reserveResources(nullptr);
+    bumpCycle();
+    releasePending();
+    ++Attempts;
+  }
+
+  return Available.size() == 1 ? *Available.begin() : nullptr;
+}
+
+#ifndef NDEBUG
+/// Debug helper: print one candidate line consisting of the label, the queue
+/// name, the pressure change \p P (when valid), the cost, and the SUnit.
+void ConvergingVLIWScheduler::traceCandidate(const char *Label,
+    const ReadyQueue &Q, SUnit *SU, int Cost, PressureChange P) {
+  raw_ostream &OS = dbgs();
+  OS << Label << " " << Q.getName() << " ";
+  if (!P.isValid())
+    OS << " ";
+  else
+    OS << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":"
+       << P.getUnitInc() << " ";
+  OS << "cost(" << Cost << ")\t";
+  SU->dump(DAG);
+}
+
+// Very detailed queue dump, to be used with higher verbosity levels.
+// For every node in Q this prints its number, the verbose cost breakdown
+// produced by SchedulingCost, and the machine instruction itself.
+void ConvergingVLIWScheduler::readyQueueVerboseDump(
+    const RegPressureTracker &RPTracker, SchedCandidate &Candidate,
+    ReadyQueue &Q) {
+  // getMaxPressureDelta needs a non-const tracker.
+  RegPressureTracker &Tracker = const_cast<RegPressureTracker &>(RPTracker);
+
+  dbgs() << ">>> " << Q.getName() << "\n";
+  for (SUnit *Node : Q) {
+    RegPressureDelta Pressure;
+    Tracker.getMaxPressureDelta(Node->getInstr(), Pressure,
+                                DAG->getRegionCriticalPSets(),
+                                DAG->getRegPressure().MaxSetPressure);
+
+    // Fixed-width node number so the columns line up.
+    std::stringstream Header;
+    Header << "SU(" << std::setw(3) << Node->NodeNum << ")";
+    dbgs() << Header.str();
+
+    SchedulingCost(Q, Node, Candidate, Pressure, true);
+    dbgs() << "\t";
+    Node->getInstr()->dump();
+  }
+  dbgs() << "\n";
+}
+#endif
+
+/// getSingleUnscheduledPred - Return the one predecessor of SU that has not
+/// been scheduled yet. Returns null when there is no such predecessor or when
+/// there are two or more distinct ones.
+static SUnit *getSingleUnscheduledPred(SUnit *SU) {
+  SUnit *Unique = nullptr;
+  for (const SDep &Edge : SU->Preds) {
+    SUnit *Pred = Edge.getSUnit();
+    if (Pred->isScheduled)
+      continue;
+    // Several edges may lead to the same predecessor; only a second,
+    // different unscheduled node makes the answer ambiguous.
+    if (Unique && Unique != Pred)
+      return nullptr;
+    Unique = Pred;
+  }
+  return Unique;
+}
+
+/// getSingleUnscheduledSucc - Return the one successor of SU that has not
+/// been scheduled yet. Returns null when there is no such successor or when
+/// there are two or more distinct ones.
+static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
+  SUnit *Unique = nullptr;
+  for (const SDep &Edge : SU->Succs) {
+    SUnit *Succ = Edge.getSUnit();
+    if (Succ->isScheduled)
+      continue;
+    // Several edges may lead to the same successor; only a second,
+    // different unscheduled node makes the answer ambiguous.
+    if (Unique && Unique != Succ)
+      return nullptr;
+    Unique = Succ;
+  }
+  return Unique;
+}
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+// Note that the numeric suffix is not a strict ranking: PriorityThree (75)
+// weighs more than PriorityTwo (50).
+// PriorityOne: bonus for forced-high nodes; also the per-unit penalty for
+// excess/critical register pressure, stalls and early-availability conflicts.
+static const unsigned PriorityOne = 200;
+// PriorityTwo: per-unit penalty for raising the current max pressure, and the
+// bonus for a .cur load whose resources are available.
+static const unsigned PriorityTwo = 50;
+// PriorityThree: bonus added when resources are available for the node and
+// for each zero-latency dependence on an instruction in the current packet.
+static const unsigned PriorityThree = 75;
+// ScaleTwo: multiplier applied to height/depth and to the blocked-node count.
+static const unsigned ScaleTwo = 10;
+// FactorOne: left-shift amount applied to the cost when resources are
+// available (i.e. the cost is multiplied by 1 << FactorOne).
+static const unsigned FactorOne = 2;
+
+/// Single point to compute overall scheduling cost.
+///
+/// Returns a heuristic score for scheduling \p SU next from queue \p Q;
+/// pickNodeFromQueue keeps the candidate with the highest score. A null or
+/// already scheduled \p SU gets the trivial score of 1. \p Delta supplies the
+/// register pressure change that is charged against the score unless
+/// IgnoreBBRegPressure is set. \p Candidate is not read here. When \p verbose
+/// is set, every heuristic that fires appends a short tag to the debug stream.
+/// TODO: More heuristics will be used soon.
+int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
+                                            SchedCandidate &Candidate,
+                                            RegPressureDelta &Delta,
+                                            bool verbose) {
+  // Initial trivial priority.
+  int ResCount = 1;
+
+  // Do not waste time on a node that is already scheduled.
+  if (!SU || SU->isScheduled)
+    return ResCount;
+
+  MachineInstr &Instr = *SU->getInstr();
+
+  DEBUG(if (verbose) dbgs() << ((Q.getID() == TopQID) ? "(top|" : "(bot|"));
+  // Forced priority is high.
+  if (SU->isScheduleHigh) {
+    ResCount += PriorityOne;
+    // Emit the tag only in verbose mode, like every other heuristic tag.
+    DEBUG(if (verbose) dbgs() << "H|");
+  }
+
+  // Critical path first: height when scheduling top-down, depth otherwise.
+  if (Q.getID() == TopQID) {
+    ResCount += (SU->getHeight() * ScaleTwo);
+
+    DEBUG(if (verbose) {
+      std::stringstream dbgstr;
+      dbgstr << "h" << std::setw(3) << SU->getHeight() << "|";
+      dbgs() << dbgstr.str();
+    });
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Top.ResourceModel->isResourceAvailable(SU)) {
+      ResCount <<= FactorOne;
+      ResCount += PriorityThree;
+      DEBUG(if (verbose) dbgs() << "A|");
+    } else {
+      DEBUG(if (verbose) dbgs() << " |");
+    }
+  } else {
+    ResCount += (SU->getDepth() * ScaleTwo);
+
+    DEBUG(if (verbose) {
+      std::stringstream dbgstr;
+      dbgstr << "d" << std::setw(3) << SU->getDepth() << "|";
+      dbgs() << dbgstr.str();
+    });
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Bot.ResourceModel->isResourceAvailable(SU)) {
+      ResCount <<= FactorOne;
+      ResCount += PriorityThree;
+      DEBUG(if (verbose) dbgs() << "A|");
+    } else {
+      DEBUG(if (verbose) dbgs() << " |");
+    }
+  }
+
+  unsigned NumNodesBlocking = 0;
+  if (Q.getID() == TopQID) {
+    // How many SUs does it block from scheduling?
+    // Look at all of the successors of this node.
+    // Count the number of nodes that
+    // this node is the sole unscheduled node for.
+    for (const SDep &SI : SU->Succs)
+      if (getSingleUnscheduledPred(SI.getSUnit()) == SU)
+        ++NumNodesBlocking;
+  } else {
+    // Bottom-up mirror image: count the predecessors for which this node is
+    // the sole unscheduled successor.
+    for (const SDep &PI : SU->Preds)
+      if (getSingleUnscheduledSucc(PI.getSUnit()) == SU)
+        ++NumNodesBlocking;
+  }
+  ResCount += (NumNodesBlocking * ScaleTwo);
+
+  DEBUG(if (verbose) {
+    std::stringstream dbgstr;
+    dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|";
+    dbgs() << dbgstr.str();
+  });
+
+  // Factor in reg pressure as a heuristic.
+  if (!IgnoreBBRegPressure) {
+    // Decrease priority by the amount that register pressure exceeds the limit.
+    ResCount -= (Delta.Excess.getUnitInc()*PriorityOne);
+    // Decrease priority if register pressure exceeds the limit.
+    ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityOne);
+    // Decrease priority slightly if register pressure would increase over the
+    // current maximum.
+    ResCount -= (Delta.CurrentMax.getUnitInc()*PriorityTwo);
+    DEBUG(if (verbose) {
+      dbgs() << "RP " << Delta.Excess.getUnitInc() << "/"
+             << Delta.CriticalMax.getUnitInc() <<"/"
+             << Delta.CurrentMax.getUnitInc() << ")|";
+    });
+  }
+
+  // Give a little extra priority to a .cur instruction if there is a resource
+  // available for it.
+  auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
+  auto &QII = *QST.getInstrInfo();
+  if (SU->isInstr() && QII.mayBeCurLoad(*SU->getInstr())) {
+    if (Q.getID() == TopQID && Top.ResourceModel->isResourceAvailable(SU)) {
+      ResCount += PriorityTwo;
+      DEBUG(if (verbose) dbgs() << "C|");
+    } else if (Q.getID() == BotQID &&
+               Bot.ResourceModel->isResourceAvailable(SU)) {
+      ResCount += PriorityTwo;
+      DEBUG(if (verbose) dbgs() << "C|");
+    }
+  }
+
+  // Give preference to a zero latency instruction if the dependent
+  // instruction is in the current packet.
+  if (Q.getID() == TopQID) {
+    for (const SDep &PI : SU->Preds) {
+      if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() &&
+          PI.getLatency() == 0 &&
+          Top.ResourceModel->isInPacket(PI.getSUnit())) {
+        ResCount += PriorityThree;
+        DEBUG(if (verbose) dbgs() << "Z|");
+      }
+    }
+  } else {
+    for (const SDep &SI : SU->Succs) {
+      if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() &&
+          SI.getLatency() == 0 &&
+          Bot.ResourceModel->isInPacket(SI.getSUnit())) {
+        ResCount += PriorityThree;
+        DEBUG(if (verbose) dbgs() << "Z|");
+      }
+    }
+  }
+
+  // Give less preference to an instruction that will cause a stall with
+  // an instruction in the previous packet.
+  if (QII.isV60VectorInstruction(Instr)) {
+    // Check for stalls in the previous packet. The argument order of
+    // producesStall is swapped between the two scheduling directions.
+    if (Q.getID() == TopQID) {
+      for (auto J : Top.ResourceModel->OldPacket)
+        if (QII.producesStall(*J->getInstr(), Instr))
+          ResCount -= PriorityOne;
+    } else {
+      for (auto J : Bot.ResourceModel->OldPacket)
+        if (QII.producesStall(Instr, *J->getInstr()))
+          ResCount -= PriorityOne;
+    }
+  }
+
+  // If the instruction has a non-zero latency dependence with an instruction in
+  // the current packet, then it should not be scheduled yet. The case occurs
+  // when the dependent instruction is scheduled in a new packet, so the
+  // scheduler updates the current cycle and pending instructions become
+  // available.
+  if (CheckEarlyAvail) {
+    if (Q.getID() == TopQID) {
+      for (const auto &PI : SU->Preds) {
+        if (PI.getLatency() > 0 &&
+            Top.ResourceModel->isInPacket(PI.getSUnit())) {
+          ResCount -= PriorityOne;
+          DEBUG(if (verbose) dbgs() << "D|");
+        }
+      }
+    } else {
+      for (const auto &SI : SU->Succs) {
+        if (SI.getLatency() > 0 &&
+            Bot.ResourceModel->isInPacket(SI.getSUnit())) {
+          ResCount -= PriorityOne;
+          DEBUG(if (verbose) dbgs() << "D|");
+        }
+      }
+    }
+  }
+
+  DEBUG(if (verbose) {
+    std::stringstream dbgstr;
+    dbgstr << "Total " << std::setw(4) << ResCount << ")";
+    dbgs() << dbgstr.str();
+  });
+
+  return ResCount;
+}
+
+/// Pick the best candidate from the given ready queue.
+///
+/// Every node in \p Q is scored with SchedulingCost and \p Candidate is
+/// updated whenever a node beats it. A strictly higher cost always wins. On
+/// equal cost the timing-class tie breaker (unless DisableTCTie is set) and
+/// then the successor/predecessor-count tie breaker are consulted; otherwise
+/// the earlier node is kept. Returns NoCand when \p Candidate was not
+/// changed, NodeOrder when it was only initialized from the first node, and
+/// BestCost when some node replaced an existing candidate.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+                  SchedCandidate &Candidate) {
+  DEBUG(if (SchedDebugVerboseLevel > 1)
+          readyQueueVerboseDump(RPTracker, Candidate, Q);
+        else Q.dump(););
+
+  // getMaxPressureDelta temporarily modifies the tracker.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  // None of these depend on the node being examined, so look them up once.
+  const bool IsTopQ = Q.getID() == TopQID;
+  const bool IsBotQ = Q.getID() == BotQID;
+  auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
+  auto &QII = *QST.getInstrInfo();
+  const InstrItineraryData *InstrItins = QST.getInstrItineraryData();
+
+  // FoundCandidate stays NoCand if no node beats the best existing candidate.
+  CandResult FoundCandidate = NoCand;
+
+  // Single place that records a new best candidate and why it was chosen.
+  auto Select = [&](SUnit *SU, const RegPressureDelta &RPDelta, int Cost,
+                    CandResult Reason) {
+    Candidate.SU = SU;
+    Candidate.RPDelta = RPDelta;
+    Candidate.SCost = Cost;
+    FoundCandidate = Reason;
+  };
+
+  for (SUnit *SU : Q) {
+    RegPressureDelta RPDelta;
+    TempTracker.getMaxPressureDelta(SU->getInstr(), RPDelta,
+                                    DAG->getRegionCriticalPSets(),
+                                    DAG->getRegPressure().MaxSetPressure);
+
+    const int CurrentCost = SchedulingCost(Q, SU, Candidate, RPDelta, false);
+
+    // Initialize the candidate if needed.
+    if (!Candidate.SU) {
+      DEBUG(traceCandidate("DCAND", Q, SU, CurrentCost));
+      Select(SU, RPDelta, CurrentCost, NodeOrder);
+      continue;
+    }
+
+    // Best cost.
+    if (CurrentCost > Candidate.SCost) {
+      DEBUG(traceCandidate("CCAND", Q, SU, CurrentCost));
+      Select(SU, RPDelta, CurrentCost, BestCost);
+      continue;
+    }
+
+    const bool IsTie = CurrentCost == Candidate.SCost;
+
+    // Tie breaker using Timing Class.
+    if (!DisableTCTie) {
+      const MachineInstr *MI = SU->getInstr();
+      const MachineInstr *CandI = Candidate.SU->getInstr();
+
+      unsigned InstrLatency = QII.getInstrTimingClassLatency(InstrItins, *MI);
+      unsigned CandLatency = QII.getInstrTimingClassLatency(InstrItins, *CandI);
+      DEBUG(dbgs() << "TC Tie Breaker Cand: "
+                   << CandLatency << " Instr:" << InstrLatency << "\n"
+                   << *MI << *CandI << "\n");
+
+      if (IsTie && (IsTopQ || IsBotQ)) {
+        // Each direction has its own option selecting whether the shorter or
+        // the longer timing class wins the tie.
+        const bool UseShorter = IsTopQ ? static_cast<bool>(TopUseShorterTie)
+                                       : static_cast<bool>(BotUseShorterTie);
+        if (InstrLatency < CandLatency && UseShorter) {
+          Select(SU, RPDelta, CurrentCost, BestCost);
+          DEBUG(dbgs() << (IsTopQ ? "Used top shorter tie breaker\n"
+                                  : "Used Bot shorter tie breaker\n"));
+          continue;
+        }
+        if (InstrLatency > CandLatency && !UseShorter) {
+          Select(SU, RPDelta, CurrentCost, BestCost);
+          DEBUG(dbgs() << (IsTopQ ? "Used top longer tie breaker\n"
+                                  : "Used Bot longer tie breaker\n"));
+          continue;
+        }
+      }
+    }
+
+    // Second tie breaker: top-down prefers the node with more successors,
+    // bottom-up the node with fewer predecessors.
+    if (IsTie &&
+        ((IsTopQ && SU->Succs.size() > Candidate.SU->Succs.size()) ||
+         (IsBotQ && SU->Preds.size() < Candidate.SU->Preds.size()))) {
+      DEBUG(traceCandidate("SPCAND", Q, SU, CurrentCost));
+      Select(SU, RPDelta, CurrentCost, BestCost);
+      continue;
+    }
+
+    // Otherwise fall through to original instruction order: the earlier
+    // candidate is kept.
+  }
+  return FoundCandidate;
+}
+
+/// Pick the best candidate node from either the top or bottom queue, and
+/// report through \p IsTopNode which boundary it was taken from.
+SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+  if (SUnit *OnlyBot = Bot.pickOnlyChoice()) {
+    DEBUG(dbgs() << "Picked only Bottom\n");
+    IsTopNode = false;
+    return OnlyBot;
+  }
+  if (SUnit *OnlyTop = Top.pickOnlyChoice()) {
+    DEBUG(dbgs() << "Picked only Top\n");
+    IsTopNode = true;
+    return OnlyTop;
+  }
+
+  // Prefer bottom scheduling when heuristics are silent.
+  SchedCandidate BotCand;
+  const CandResult BotResult =
+      pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+  assert(BotResult != NoCand && "failed to find the first candidate");
+
+  // If either Q has a single candidate that provides the least increase in
+  // Excess pressure, we can immediately schedule from that Q.
+  //
+  // RegionCriticalPSets summarizes the pressure within the scheduled region and
+  // affects picking from either Q. If scheduling in one direction must
+  // increase pressure for one of the excess PSets, then schedule in that
+  // direction first to provide more freedom in the other direction.
+  switch (BotResult) {
+  case SingleExcess:
+  case SingleCritical:
+    DEBUG(dbgs() << "Prefered Bottom Node\n");
+    IsTopNode = false;
+    return BotCand.SU;
+  default:
+    break;
+  }
+
+  // Check if the top Q has a better candidate.
+  SchedCandidate TopCand;
+  const CandResult TopResult =
+      pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+  assert(TopResult != NoCand && "failed to find the first candidate");
+
+  switch (TopResult) {
+  case SingleExcess:
+  case SingleCritical:
+    DEBUG(dbgs() << "Prefered Top Node\n");
+    IsTopNode = true;
+    return TopCand.SU;
+  default:
+    break;
+  }
+
+  // If either Q has a single candidate that minimizes pressure above the
+  // original region's pressure pick it. Bottom is consulted before top.
+  if (BotResult == SingleMax) {
+    DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n");
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  if (TopResult == SingleMax) {
+    DEBUG(dbgs() << "Prefered Top Node SingleMax\n");
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+
+  // Top wins only with a strictly better cost.
+  if (TopCand.SCost > BotCand.SCost) {
+    DEBUG(dbgs() << "Prefered Top Node Cost\n");
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+
+  // Otherwise prefer the bottom candidate in node order.
+  DEBUG(dbgs() << "Prefered Bottom in Node order\n");
+  IsTopNode = false;
+  return BotCand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+/// Returns null once the top and bottom of the region have met; otherwise the
+/// picked node is removed from the ready queues it is in and \p IsTopNode
+/// tells which boundary it is scheduled at.
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return nullptr;
+  }
+
+  SUnit *Picked = nullptr;
+  if (llvm::ForceTopDown) {
+    // Top-down only: take the sole choice, else the best of the top queue.
+    Picked = Top.pickOnlyChoice();
+    if (Picked == nullptr) {
+      SchedCandidate TopCand;
+      CandResult TopResult =
+          pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+      assert(TopResult != NoCand && "failed to find the first candidate");
+      (void)TopResult;
+      Picked = TopCand.SU;
+    }
+    IsTopNode = true;
+  } else if (llvm::ForceBottomUp) {
+    // Bottom-up only: take the sole choice, else the best of the bottom queue.
+    Picked = Bot.pickOnlyChoice();
+    if (Picked == nullptr) {
+      SchedCandidate BotCand;
+      CandResult BotResult =
+          pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+      assert(BotResult != NoCand && "failed to find the first candidate");
+      (void)BotResult;
+      Picked = BotCand.SU;
+    }
+    IsTopNode = false;
+  } else {
+    Picked = pickNodeBidrectional(IsTopNode);
+  }
+
+  // The node may be ready at both boundaries; drop it from each one.
+  if (Picked->isTopReady())
+    Top.removeReady(Picked);
+  if (Picked->isBottomReady())
+    Bot.removeReady(Picked);
+
+  DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+               << " Scheduling Instruction in cycle "
+               << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+        Picked->dump(DAG));
+  return Picked;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, VLIWMachineScheduler needs
+/// to update its state based on the current cycle before MachineSchedStrategy
+/// does.
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+  // Stamp the node with the current cycle of the boundary it was scheduled
+  // at, then let that boundary account for it.
+  VLIWSchedBoundary &Zone = IsTopNode ? Top : Bot;
+  unsigned &ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
+  ReadyCycle = Zone.CurrCycle;
+  Zone.bumpNode(SU);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
new file mode 100644
index 000000000000..dc10028c0424
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -0,0 +1,254 @@
+//===-- HexagonMachineScheduler.h - Custom Hexagon MI scheduler. ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom Hexagon MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H
+
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+//===----------------------------------------------------------------------===//
+// ConvergingVLIWScheduler - Implementation of the standard
+// MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+/// Models the packet currently being formed for one scheduling boundary,
+/// backed by the target's DFA packetizer. The model owns the packetizer it
+/// obtains from the target and releases it on destruction.
+class VLIWResourceModel {
+  /// ResourcesModel - Represents VLIW state.
+  /// Not limited to VLIW targets per se, but assumes
+  /// definition of DFA by a target.
+  DFAPacketizer *ResourcesModel;
+
+  const TargetSchedModel *SchedModel;
+
+  /// Local packet/bundle model. Purely
+  /// internal to the MI scheduler at the time.
+  std::vector<SUnit*> Packet;
+
+  /// Total packets created.
+  unsigned TotalPackets;
+
+public:
+  /// Save the last formed packet.
+  std::vector<SUnit*> OldPacket;
+
+  /// Create the DFA state for \p STI and start with empty packets.
+  /// \p SM provides the issue width used to size the packet storage.
+  VLIWResourceModel(const TargetSubtargetInfo &STI, const TargetSchedModel *SM)
+      : SchedModel(SM), TotalPackets(0) {
+    ResourcesModel = STI.getInstrInfo()->CreateTargetScheduleState(STI);
+
+    // This hard requirement could be relaxed,
+    // but for now do not let it proceed.
+    assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+    // Pre-allocate room for one full packet; both vectors start out empty.
+    Packet.reserve(SchedModel->getIssueWidth());
+    OldPacket.reserve(SchedModel->getIssueWidth());
+    ResourcesModel->clearResources();
+  }
+
+  // The packetizer is deleted in the destructor, so a copy would delete it a
+  // second time. Forbid copying.
+  VLIWResourceModel(const VLIWResourceModel &) = delete;
+  VLIWResourceModel &operator=(const VLIWResourceModel &) = delete;
+
+  ~VLIWResourceModel() {
+    delete ResourcesModel;
+  }
+
+  /// Forget the instructions of the current packet; the DFA is untouched.
+  void resetPacketState() {
+    Packet.clear();
+  }
+
+  /// Clear the DFA resources; the current packet is untouched.
+  void resetDFA() {
+    ResourcesModel->clearResources();
+  }
+
+  /// Clear both the current packet and the DFA resources.
+  void reset() {
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  bool isResourceAvailable(SUnit *SU);
+  bool reserveResources(SUnit *SU);
+  void savePacket();
+  unsigned getTotalPackets() const { return TotalPackets; }
+
+  /// True if \p SU is one of the instructions in the current packet.
+  bool isInPacket(SUnit *SU) const { return is_contained(Packet, SU); }
+};
+
+/// Extend the standard ScheduleDAGMI to provide more context and override the
+/// top-level schedule() driver.
+class VLIWMachineScheduler : public ScheduleDAGMILive {
+public:
+ /// Forwards the scheduling context \p C to ScheduleDAGMILive and hands over
+ /// ownership of the strategy \p S to it.
+ VLIWMachineScheduler(MachineSchedContext *C,
+ std::unique_ptr<MachineSchedStrategy> S)
+ : ScheduleDAGMILive(C, std::move(S)) {}
+
+ /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
+ /// time to do some work.
+ void schedule() override;
+};
+
+/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
+/// to balance the schedule.
+class ConvergingVLIWScheduler : public MachineSchedStrategy {
+
+ /// Store the state used by ConvergingVLIWScheduler heuristics, required
+ /// for the lifetime of one invocation of pickNode().
+ struct SchedCandidate {
+ // The best SUnit candidate.
+ SUnit *SU;
+
+ // Register pressure values for the best candidate.
+ RegPressureDelta RPDelta;
+
+ // Best scheduling cost.
+ int SCost;
+
+ // Starts out with no candidate (null SU) and a cost of zero.
+ SchedCandidate(): SU(nullptr), SCost(0) {}
+ };
+ /// Represent the type of SchedCandidate found within a single queue.
+ enum CandResult {
+ NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
+ BestCost};
+
+ /// Each Scheduling boundary is associated with ready queues. It tracks the
+ /// current cycle in whichever direction it has moved, and maintains the state
+ /// of "hazards" and other interlocks at the current cycle.
+ struct VLIWSchedBoundary {
+ VLIWMachineScheduler *DAG;
+ const TargetSchedModel *SchedModel;
+
+ ReadyQueue Available;
+ ReadyQueue Pending;
+ // Set when the pending queue has to be re-examined by releasePending().
+ bool CheckPending;
+
+ // Both are deleted by the destructor, i.e. owned by this boundary.
+ ScheduleHazardRecognizer *HazardRec;
+ VLIWResourceModel *ResourceModel;
+
+ unsigned CurrCycle;
+ unsigned IssueCount;
+
+ /// MinReadyCycle - Cycle of the soonest available instruction.
+ unsigned MinReadyCycle;
+
+ // Remember the greatest min operand latency.
+ unsigned MaxMinLatency;
+
+ /// Pending queues extend the ready queues with the same ID and the
+ /// PendingFlag set.
+ /// The pending queue's ID is the available queue's ID shifted left by
+ /// LogMaxQID; the queue names are \p Name with ".A" / ".P" appended.
+ VLIWSchedBoundary(unsigned ID, const Twine &Name):
+ DAG(nullptr), SchedModel(nullptr), Available(ID, Name+".A"),
+ Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
+ CheckPending(false), HazardRec(nullptr), ResourceModel(nullptr),
+ CurrCycle(0), IssueCount(0),
+ MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+ ~VLIWSchedBoundary() {
+ delete ResourceModel;
+ delete HazardRec;
+ }
+
+ /// Attach the boundary to a DAG and scheduling model and reset the issue
+ /// count. No other state is touched here.
+ void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) {
+ DAG = dag;
+ SchedModel = smodel;
+ IssueCount = 0;
+ }
+
+ /// True if this is the top boundary, judged by the available queue's ID.
+ bool isTop() const {
+ return Available.getID() == ConvergingVLIWScheduler::TopQID;
+ }
+
+ bool checkHazard(SUnit *SU);
+
+ void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+ void bumpCycle();
+
+ void bumpNode(SUnit *SU);
+
+ void releasePending();
+
+ void removeReady(SUnit *SU);
+
+ SUnit *pickOnlyChoice();
+ };
+
+ VLIWMachineScheduler *DAG;
+ const TargetSchedModel *SchedModel;
+
+ // State of the top and bottom scheduled instruction boundaries.
+ VLIWSchedBoundary Top;
+ VLIWSchedBoundary Bot;
+
+public:
+ /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+ enum {
+ TopQID = 1,
+ BotQID = 2,
+ LogMaxQID = 2
+ };
+
+ ConvergingVLIWScheduler()
+ : DAG(nullptr), SchedModel(nullptr), Top(TopQID, "TopQ"),
+ Bot(BotQID, "BotQ") {}
+
+ void initialize(ScheduleDAGMI *dag) override;
+
+ SUnit *pickNode(bool &IsTopNode) override;
+
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+
+ void releaseTopNode(SUnit *SU) override;
+
+ void releaseBottomNode(SUnit *SU) override;
+
+ /// Total number of packets reported by the top and bottom resource models.
+ /// Dereferences both ResourceModel pointers, so they must have been set.
+ unsigned ReportPackets() {
+ return Top.ResourceModel->getTotalPackets() +
+ Bot.ResourceModel->getTotalPackets();
+ }
+
+protected:
+ SUnit *pickNodeBidrectional(bool &IsTopNode);
+
+ int SchedulingCost(ReadyQueue &Q,
+ SUnit *SU, SchedCandidate &Candidate,
+ RegPressureDelta &Delta, bool verbose);
+
+ CandResult pickNodeFromQueue(ReadyQueue &Q,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate);
+#ifndef NDEBUG
+ // Debug-only tracing helpers.
+ void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+ int Cost, PressureChange P = PressureChange());
+
+ void readyQueueVerboseDump(const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate, ReadyQueue &Q);
+#endif
+};
+
+} // namespace
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
new file mode 100644
index 000000000000..72d8011277e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -0,0 +1,701 @@
+//===----- HexagonNewValueJump.cpp - Hexagon Backend New Value Jump -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements NewValueJump pass in Hexagon.
+// Ideally, we should merge this as a Peephole pass prior to register
+// allocation, but because we have a spill in between the feeder and new value
+// jump instructions, we are forced to write after register allocation.
+// Having said that, we should re-attempt to pull this earlier at some point
+// in future.
+
+// The basic approach looks for a sequence of predicated jump, compare
+// instruction that generates the predicate, and the feeder to the predicate.
+// Once it finds all three, it collapses the compare and jump instructions into
+// a new value jump instruction.
+//
+//
+//===----------------------------------------------------------------------===//
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-nvj"
+
+// Counts every new-value jump created by this pass (incremented in
+// runOnMachineFunction each time a compare/jump pair is collapsed).
+STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
+
+// Debugging aid: upper bound on the number of conversions performed per
+// function. The default of -1 means "no limit"; 0 disables conversion.
+static cl::opt<int>
+DbgNVJCount("nvj-count", cl::init(-1), cl::Hidden, cl::desc(
+ "Maximum number of predicated jumps to be converted to New Value Jump"));
+
+// When set, runOnMachineFunction returns without transforming anything.
+static cl::opt<bool> DisableNewValueJumps("disable-nvjump", cl::Hidden,
+ cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable New Value Jumps"));
+
+// Forward declarations of the pass factory and the pass-registry initializer.
+namespace llvm {
+ FunctionPass *createHexagonNewValueJump();
+ void initializeHexagonNewValueJumpPass(PassRegistry&);
+}
+
+
+namespace {
+  /// Machine function pass that collapses a compare and the predicated jump
+  /// consuming its result into a single Hexagon new-value jump instruction.
+  /// The pass declares the NoVRegs property, i.e. it expects to run on code
+  /// that no longer contains virtual registers.
+  struct HexagonNewValueJump : public MachineFunctionPass {
+    // Target hooks cached at the start of runOnMachineFunction(). They are
+    // null until the pass has been run on a function.
+    const HexagonInstrInfo *QII = nullptr;
+    const HexagonRegisterInfo *QRI = nullptr;
+
+  public:
+    static char ID;
+
+    HexagonNewValueJump() : MachineFunctionPass(ID) {
+      initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// Branch probabilities are required to choose between the taken and
+    /// not-taken forms of the generated jump.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    StringRef getPassName() const override { return "Hexagon NewValueJump"; }
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+  private:
+    /// \brief A handle to the branch probability pass; set in
+    /// runOnMachineFunction(), null before that.
+    const MachineBranchProbabilityInfo *MBPI = nullptr;
+
+    /// Returns true if the compare \p MI has a new-value jump equivalent.
+    bool isNewValueJumpCandidate(const MachineInstr &MI) const;
+  };
+
+} // end of anonymous namespace
+
+// Storage for the pass ID handed to the MachineFunctionPass constructor.
+char HexagonNewValueJump::ID = 0;
+
+// Register the pass under the command-line name "hexagon-nvj" and record its
+// dependency on MachineBranchProbabilityInfo.
+INITIALIZE_PASS_BEGIN(HexagonNewValueJump, "hexagon-nvj",
+ "Hexagon NewValueJump", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(HexagonNewValueJump, "hexagon-nvj",
+ "Hexagon NewValueJump", false, false)
+
+
+// The instruction at II has been identified as a potential feeder of a new
+// value jump; verify that it really can be one.
+//
+// Returns false when II is predicated, when II is a KILL pseudo, or when any
+// register operand of II is read or written by an instruction in the range
+// (II, end), ignoring the instruction at 'skip' (the compare). QII is used
+// for the predication query, TRI so that sub/super-register overlaps are
+// honoured. MF is not used.
+static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
+                                      const TargetRegisterInfo *TRI,
+                                      MachineBasicBlock::iterator II,
+                                      MachineBasicBlock::iterator end,
+                                      MachineBasicBlock::iterator skip,
+                                      MachineFunction &MF) {
+
+  // A predicated instruction can never feed a new value jump.
+  if (QII->isPredicated(*II))
+    return false;
+
+  // Reject feeders that define a paired (double) register. Checking whether
+  // cmpReg1/cmpReg2 is a sub-register of the feeder register at the call site
+  // is not enough, because the code arrives in this shape:
+  //    %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+  //    %R0<def> = KILL %R0, %D0<imp-use,kill>
+  //    %P0<def> = CMPEQri %R0<kill>, 0
+  // so the instruction that appears to define the compare operand is a KILL.
+  if (II->getOpcode() == TargetOpcode::KILL)
+    return false;
+
+  // No register touched by the feeder may be defined or used between the
+  // feeder and the jump (the compare itself is skipped). Example:
+  //    r21=memub(r22+r24<<#0)
+  //    p0 = cmp.eq(r21, #0)
+  //    r4=memub(r3+r21<<#0)
+  //    if (p0.new) jump:t .LBB29_45
+  // Without this check the feeder would be sunk below the r4 load:
+  //    r4=memub(r3+r21<<#0)
+  //    r21=memub(r22+r24<<#0)
+  //    p0 = cmp.eq(r21, #0)
+  //    if (p0.new) jump:t .LBB29_45
+  // which introduces a WAR hazard once converted to a new value jump.
+
+  // First instruction following the feeder; every operand scan starts here.
+  MachineBasicBlock::iterator scanBegin = II;
+  ++scanBegin;
+
+  for (unsigned OpIdx = 0; OpIdx < II->getNumOperands(); ++OpIdx) {
+    const MachineOperand &FeederOp = II->getOperand(OpIdx);
+    if (!FeederOp.isReg())
+      continue;
+    if (!FeederOp.isUse() && !FeederOp.isDef())
+      continue;
+
+    const unsigned Reg = FeederOp.getReg();
+    for (MachineBasicBlock::iterator Cur = scanBegin; Cur != end; ++Cur) {
+      if (Cur == skip)
+        continue;
+      // The TRI-aware queries also catch overlapping sub/super-registers.
+      if (Cur->modifiesRegister(Reg, TRI) || Cur->readsRegister(Reg, TRI))
+        return false;
+    }
+  }
+  return true;
+}
+
+// Checks shared by the two legality questions
+//   1. can the compare instruction be moved down to the jump, and
+//   2. can the feeder of the compare be moved down to the jump.
+// MII is an instruction lying on the path that would be crossed.
+// Despite the name, the function returns true when MII does NOT prohibit the
+// transformation and false when it does.
+static bool commonChecksToProhibitNewValueJump(bool afterRA,
+                                     MachineBasicBlock::iterator MII) {
+
+  // A store or a call on the path blocks the transformation.
+  if (MII->getDesc().mayStore() || MII->isCall())
+    return false;
+
+  // The remaining restrictions apply only when running before register
+  // allocation.
+  if (afterRA)
+    return true;
+
+  switch (MII->getOpcode()) {
+  // Target-independent pseudos that are spurious to new value jump.
+  // KILL sets the kill flag on its operand and also sets up a single
+  // register out of a pair:
+  //    %D0<def> = S2_lsr_r_p %D0<kill>, %R2<kill>
+  //    %R0<def> = KILL %R0, %D0<imp-use,kill>
+  //    %P0<def> = C2_cmpeqi %R0<kill>, 0
+  // PHI can be anything after RA, and COPY can rematerialize things in
+  // between the feeder, the compare and the new value jump.
+  case TargetOpcode::KILL:
+  case TargetOpcode::PHI:
+  case TargetOpcode::COPY:
+  // Hexagon pseudos whose register uses and defs are set up by individual
+  // backend passes; their scope is not known at this point.
+  case Hexagon::LDriw_pred:
+  case Hexagon::STriw_pred:
+    return false;
+  default:
+    return true;
+  }
+}
+
+// Decide whether the compare at II can be folded into a new value jump.
+//   pReg        - predicate register written by the compare.
+//   secondReg   - true if operand 2 of the compare is a register, false if
+//                 it is an immediate.
+//   optLocation - forwarded to commonChecksToProhibitNewValueJump as its
+//                 afterRA argument.
+//   end         - position of the jump; instructions in (II, end) are
+//                 inspected.
+// QII is not used by this function.
+static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
+                                     const TargetRegisterInfo *TRI,
+                                     MachineBasicBlock::iterator II,
+                                     unsigned pReg,
+                                     bool secondReg,
+                                     bool optLocation,
+                                     MachineBasicBlock::iterator end,
+                                     MachineFunction &MF) {
+
+  MachineInstr &Cmp = *II;
+
+  // An immediate second operand must fit the range the architecture allows
+  // for the corresponding new value compare. Opcodes not listed here are
+  // rejected.
+  if (!secondReg) {
+    const int64_t Imm = Cmp.getOperand(2).getImm();
+
+    switch (Cmp.getOpcode()) {
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgti:
+      if (!isUInt<5>(Imm) && Imm != -1)
+        return false;
+      break;
+    case Hexagon::C2_cmpgtui:
+      if (!isUInt<5>(Imm))
+        return false;
+      break;
+    case Hexagon::S2_tstbit_i:
+    case Hexagon::S4_ntstbit_i:
+      if (Imm != 0)
+        return false;
+      break;
+    default:
+      return false;
+    }
+  }
+
+  const unsigned cmpReg1 = Cmp.getOperand(1).getReg();
+  unsigned cmpOp2 = 0; // Only meaningful when secondReg is true.
+
+  if (secondReg) {
+    cmpOp2 = Cmp.getOperand(2).getReg();
+
+    // Only one operand may carry the .new suffix, so a compare of a register
+    // with itself cannot become a new value compare.
+    if (cmpReg1 == cmpOp2)
+      return false;
+
+    // The second register must not come from a COPY. This is not needed at
+    // machine code level, but would be if new value jump generation were
+    // moved before register allocation.
+    if (!TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
+      MachineInstr *def = MF.getRegInfo().getVRegDef(cmpOp2);
+      if (def->getOpcode() == TargetOpcode::COPY)
+        return false;
+    }
+  }
+
+  // Inspect every instruction after the compare (the predicate def) up to
+  // the jump.
+  MachineBasicBlock::iterator Cur = II;
+  for (++Cur; Cur != end; ++Cur) {
+    if (Cur->isDebugValue())
+      continue;
+
+    // Check 1: the common restrictions must hold.
+    if (!commonChecksToProhibitNewValueJump(optLocation, Cur))
+      return false;
+
+    // Check 2: the predicate produced by the compare must be neither
+    // redefined nor read.
+    if (Cur->modifiesRegister(pReg, TRI) || Cur->readsRegister(pReg, TRI))
+      return false;
+
+    // Check 3: no operand of the compare may be redefined, e.g.
+    //    p0 = cmp.eq(r2, r0)
+    //    r2 = r4
+    //    if (p0.new) jump:t .LBB28_3
+    if (Cur->modifiesRegister(cmpReg1, TRI))
+      return false;
+    if (secondReg && Cur->modifiesRegister(cmpOp2, TRI))
+      return false;
+  }
+  return true;
+}
+
+
+// Given a compare instruction, return the matching New Value Jump opcode.
+// Every opcode handled here must also be accepted by isNewValueJumpCandidate.
+//   reg               - second operand of the compare as passed by the
+//                       caller (register number or immediate); for the
+//                       immediate compares a negative value selects the
+//                       "n1" opcode.
+//   secondRegNewified - true if the second register operand is the one that
+//                       receives the new value; selects the "lt"/"ltu"
+//                       opcodes in place of "gt"/"gtu".
+//   jmpTarget, MBPI   - used to pick the _t form when the edge to jmpTarget
+//                       has probability >= 1/2, otherwise the _nt form.
+static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg,
+                                      bool secondRegNewified,
+                                      MachineBasicBlock *jmpTarget,
+                                      const MachineBranchProbabilityInfo
+                                      *MBPI) {
+  const bool taken =
+      MBPI->getEdgeProbability(MI->getParent(), jmpTarget) >=
+      BranchProbability(1, 2);
+
+  // Choose between the two branch-hint variants of an opcode.
+  auto hinted = [taken](unsigned TakenOpc, unsigned NotTakenOpc) {
+    return taken ? TakenOpc : NotTakenOpc;
+  };
+
+  const bool nonNegative = reg >= 0;
+
+  switch (MI->getOpcode()) {
+  case Hexagon::C2_cmpeq:
+    return hinted(Hexagon::J4_cmpeq_t_jumpnv_t,
+                  Hexagon::J4_cmpeq_t_jumpnv_nt);
+
+  case Hexagon::C2_cmpeqi:
+    if (nonNegative)
+      return hinted(Hexagon::J4_cmpeqi_t_jumpnv_t,
+                    Hexagon::J4_cmpeqi_t_jumpnv_nt);
+    return hinted(Hexagon::J4_cmpeqn1_t_jumpnv_t,
+                  Hexagon::J4_cmpeqn1_t_jumpnv_nt);
+
+  case Hexagon::C2_cmpgt:
+    if (secondRegNewified)
+      return hinted(Hexagon::J4_cmplt_t_jumpnv_t,
+                    Hexagon::J4_cmplt_t_jumpnv_nt);
+    return hinted(Hexagon::J4_cmpgt_t_jumpnv_t,
+                  Hexagon::J4_cmpgt_t_jumpnv_nt);
+
+  case Hexagon::C2_cmpgti:
+    if (nonNegative)
+      return hinted(Hexagon::J4_cmpgti_t_jumpnv_t,
+                    Hexagon::J4_cmpgti_t_jumpnv_nt);
+    return hinted(Hexagon::J4_cmpgtn1_t_jumpnv_t,
+                  Hexagon::J4_cmpgtn1_t_jumpnv_nt);
+
+  case Hexagon::C2_cmpgtu:
+    if (secondRegNewified)
+      return hinted(Hexagon::J4_cmpltu_t_jumpnv_t,
+                    Hexagon::J4_cmpltu_t_jumpnv_nt);
+    return hinted(Hexagon::J4_cmpgtu_t_jumpnv_t,
+                  Hexagon::J4_cmpgtu_t_jumpnv_nt);
+
+  case Hexagon::C2_cmpgtui:
+    return hinted(Hexagon::J4_cmpgtui_t_jumpnv_t,
+                  Hexagon::J4_cmpgtui_t_jumpnv_nt);
+
+  case Hexagon::C4_cmpneq:
+    return hinted(Hexagon::J4_cmpeq_f_jumpnv_t,
+                  Hexagon::J4_cmpeq_f_jumpnv_nt);
+
+  case Hexagon::C4_cmplte:
+    if (secondRegNewified)
+      return hinted(Hexagon::J4_cmplt_f_jumpnv_t,
+                    Hexagon::J4_cmplt_f_jumpnv_nt);
+    return hinted(Hexagon::J4_cmpgt_f_jumpnv_t,
+                  Hexagon::J4_cmpgt_f_jumpnv_nt);
+
+  case Hexagon::C4_cmplteu:
+    if (secondRegNewified)
+      return hinted(Hexagon::J4_cmpltu_f_jumpnv_t,
+                    Hexagon::J4_cmpltu_f_jumpnv_nt);
+    return hinted(Hexagon::J4_cmpgtu_f_jumpnv_t,
+                  Hexagon::J4_cmpgtu_f_jumpnv_nt);
+
+  default:
+    llvm_unreachable("Could not find matching New Value Jump instruction.");
+  }
+  // Not reached; kept only so that every path visibly returns a value.
+  return 0;
+}
+
+// Returns true if MI is one of the compare opcodes that has a new value jump
+// counterpart. The list matches the opcodes handled by
+// getNewValueJumpOpcode.
+bool HexagonNewValueJump::isNewValueJumpCandidate(
+    const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+
+  // Register-register and register-immediate "eq"/"gt"/"gtu" compares.
+  const bool isC2Compare =
+      Opc == Hexagon::C2_cmpeq || Opc == Hexagon::C2_cmpeqi ||
+      Opc == Hexagon::C2_cmpgt || Opc == Hexagon::C2_cmpgti ||
+      Opc == Hexagon::C2_cmpgtu || Opc == Hexagon::C2_cmpgtui;
+
+  // The "neq"/"lte"/"lteu" compares.
+  const bool isC4Compare =
+      Opc == Hexagon::C4_cmpneq || Opc == Hexagon::C4_cmplte ||
+      Opc == Hexagon::C4_cmplteu;
+
+  return isC2Compare || isC4Compare;
+}
+
+
+// Pass entry point. Every basic block is scanned bottom-up for the sequence
+//   feeder -> compare -> predicated jump
+// and, when all legality checks pass, the feeder is moved next to the jump
+// and the compare/jump pair is replaced by one new value jump instruction.
+// At most one conversion is attempted per basic block, and the total number
+// per function is bounded by -nvj-count when that option is non-negative.
+bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
+
+ DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
+ << "********** Function: "
+ << MF.getName() << "\n");
+
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ // If we move NewValueJump before register allocation we'll need live variable
+ // analysis here too.
+
+ QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ QRI = static_cast<const HexagonRegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+ if (DisableNewValueJumps) {
+ return false;
+ }
+
+ // Conversion budget (-nvj-count) and the number of conversions done so far
+ // in this function.
+ int nvjCount = DbgNVJCount;
+ int nvjGenerated = 0;
+
+ // Loop through all the bb's of the function
+ for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+ MBBb != MBBe; ++MBBb) {
+ MachineBasicBlock *MBB = &*MBBb;
+
+ DEBUG(dbgs() << "** dumping bb ** "
+ << MBB->getNumber() << "\n");
+ DEBUG(MBB->dump());
+ DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
+ // Per-block search state: the jump is located first, then the compare
+ // that defines its predicate, then the feeder of the compare.
+ bool foundJump = false;
+ bool foundCompare = false;
+ bool invertPredicate = false;
+ unsigned predReg = 0; // predicate reg of the jump.
+ unsigned cmpReg1 = 0;
+ int cmpOp2 = 0;
+ bool MO1IsKill = false;
+ bool MO2IsKill = false;
+ MachineBasicBlock::iterator jmpPos;
+ MachineBasicBlock::iterator cmpPos;
+ MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr;
+ MachineBasicBlock *jmpTarget = nullptr;
+ bool afterRA = false;
+ bool isSecondOpReg = false;
+ bool isSecondOpNewified = false;
+ // Traverse the basic block - bottom up
+ for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
+ MII != E;) {
+ MachineInstr &MI = *--MII;
+ if (MI.isDebugValue()) {
+ continue;
+ }
+
+ // Stop scanning this block once the conversion budget is exhausted.
+ if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
+ break;
+
+ DEBUG(dbgs() << "Instr: "; MI.dump(); dbgs() << "\n");
+
+ if (!foundJump && (MI.getOpcode() == Hexagon::J2_jumpt ||
+ MI.getOpcode() == Hexagon::J2_jumptpt ||
+ MI.getOpcode() == Hexagon::J2_jumpf ||
+ MI.getOpcode() == Hexagon::J2_jumpfpt ||
+ MI.getOpcode() == Hexagon::J2_jumptnewpt ||
+ MI.getOpcode() == Hexagon::J2_jumptnew ||
+ MI.getOpcode() == Hexagon::J2_jumpfnewpt ||
+ MI.getOpcode() == Hexagon::J2_jumpfnew)) {
+ // This is where you would insert your compare and
+ // instr that feeds compare
+ jmpPos = MII;
+ jmpInstr = &MI;
+ predReg = MI.getOperand(0).getReg();
+ afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);
+
+ // If ifconverter had not messed up with the kill flags of the
+ // operands, the following check on the kill flag would suffice.
+ // if(!jmpInstr->getOperand(0).isKill()) break;
+
+ // This predicate register is live out of BB
+ // this would only work if we can actually use Live
+ // variable analysis on phy regs - but LLVM does not
+ // provide LV analysis on phys regs.
+ //if(LVs.isLiveOut(predReg, *MBB)) break;
+
+ // Get all the successors of this block - which will always
+ // be 2. Check if the predicate register is live-in in those
+ // successor. If yes, we can not delete the predicate -
+ // I am doing this only because LLVM does not provide LiveOut
+ // at the BB level.
+ bool predLive = false;
+ for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+ SIE = MBB->succ_end(); SI != SIE; ++SI) {
+ MachineBasicBlock* succMBB = *SI;
+ if (succMBB->isLiveIn(predReg)) {
+ predLive = true;
+ }
+ }
+ if (predLive)
+ break;
+
+ // Only jumps whose target operand is a basic block are handled.
+ if (!MI.getOperand(1).isMBB())
+ continue;
+ jmpTarget = MI.getOperand(1).getMBB();
+ foundJump = true;
+ // Remember that the jump is taken on a false predicate so that the
+ // generated opcode can be inverted below.
+ if (MI.getOpcode() == Hexagon::J2_jumpf ||
+ MI.getOpcode() == Hexagon::J2_jumpfnewpt ||
+ MI.getOpcode() == Hexagon::J2_jumpfnew) {
+ invertPredicate = true;
+ }
+ continue;
+ }
+
+ // No new value jump if there is a barrier. A barrier has to be in its
+ // own packet. A barrier has zero operands. We conservatively bail out
+ // here if we see any instruction with zero operands.
+ if (foundJump && MI.getNumOperands() == 0)
+ break;
+
+ if (foundJump && !foundCompare && MI.getOperand(0).isReg() &&
+ MI.getOperand(0).getReg() == predReg) {
+
+ // Not all compares can be new value compare. Arch Spec: 7.6.1.1
+ if (isNewValueJumpCandidate(MI)) {
+
+ assert(
+ (MI.getDesc().isCompare()) &&
+ "Only compare instruction can be collapsed into New Value Jump");
+ isSecondOpReg = MI.getOperand(2).isReg();
+
+ if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
+ afterRA, jmpPos, MF))
+ break;
+
+ cmpInstr = &MI;
+ cmpPos = MII;
+ foundCompare = true;
+
+ // We need cmpReg1 and cmpOp2(imm or reg) while building
+ // new value jump instruction.
+ cmpReg1 = MI.getOperand(1).getReg();
+ if (MI.getOperand(1).isKill())
+ MO1IsKill = true;
+
+ if (isSecondOpReg) {
+ cmpOp2 = MI.getOperand(2).getReg();
+ if (MI.getOperand(2).isKill())
+ MO2IsKill = true;
+ } else
+ cmpOp2 = MI.getOperand(2).getImm();
+ continue;
+ }
+ }
+
+ if (foundCompare && foundJump) {
+
+ // If "common" checks fail, bail out on this BB.
+ if (!commonChecksToProhibitNewValueJump(afterRA, MII))
+ break;
+
+ bool foundFeeder = false;
+ MachineBasicBlock::iterator feederPos = MII;
+ if (MI.getOperand(0).isReg() && MI.getOperand(0).isDef() &&
+ (MI.getOperand(0).getReg() == cmpReg1 ||
+ (isSecondOpReg &&
+ MI.getOperand(0).getReg() == (unsigned)cmpOp2))) {
+
+ unsigned feederReg = MI.getOperand(0).getReg();
+
+ // First try to see if we can get the feeder from the first operand
+ // of the compare. If we can not, and if secondOpReg is true
+ // (second operand of the compare is also register), try that one.
+ // TODO: Try to come up with some heuristic to figure out which
+ // feeder would benefit.
+
+ if (feederReg == cmpReg1) {
+ if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF)) {
+ if (!isSecondOpReg)
+ break;
+ else
+ continue;
+ } else
+ foundFeeder = true;
+ }
+
+ if (!foundFeeder &&
+ isSecondOpReg &&
+ feederReg == (unsigned) cmpOp2)
+ if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF))
+ break;
+
+ if (isSecondOpReg) {
+ // In case of CMPLT, or CMPLTU, or EQ with the second register
+ // to newify, swap the operands.
+ unsigned COp = cmpInstr->getOpcode();
+ if ((COp == Hexagon::C2_cmpeq || COp == Hexagon::C4_cmpneq) &&
+ (feederReg == (unsigned) cmpOp2)) {
+ unsigned tmp = cmpReg1;
+ bool tmpIsKill = MO1IsKill;
+ cmpReg1 = cmpOp2;
+ MO1IsKill = MO2IsKill;
+ cmpOp2 = tmp;
+ MO2IsKill = tmpIsKill;
+ }
+
+ // Now we have swapped the operands, all we need to check is,
+ // if the second operand (after swap) is the feeder.
+ // And if it is, make a note.
+ if (feederReg == (unsigned)cmpOp2)
+ isSecondOpNewified = true;
+ }
+
+ // Now that we are moving feeder close the jump,
+ // make sure we are respecting the kill values of
+ // the operands of the feeder.
+
+ // Only the first kill found is transferred: both loops below stop
+ // as soon as updatedIsKill is set.
+ bool updatedIsKill = false;
+ for (unsigned i = 0; i < MI.getNumOperands(); i++) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isUse()) {
+ unsigned feederReg = MO.getReg();
+ for (MachineBasicBlock::iterator localII = feederPos,
+ end = jmpPos; localII != end; localII++) {
+ MachineInstr &localMI = *localII;
+ for (unsigned j = 0; j < localMI.getNumOperands(); j++) {
+ MachineOperand &localMO = localMI.getOperand(j);
+ if (localMO.isReg() && localMO.isUse() &&
+ localMO.isKill() && feederReg == localMO.getReg()) {
+ // We found that there is kill of a use register
+ // Set up a kill flag on the register
+ localMO.setIsKill(false);
+ MO.setIsKill();
+ updatedIsKill = true;
+ break;
+ }
+ }
+ if (updatedIsKill) break;
+ }
+ }
+ if (updatedIsKill) break;
+ }
+
+ // Move the feeder and then the compare directly in front of the jump.
+ MBB->splice(jmpPos, MI.getParent(), MI);
+ MBB->splice(jmpPos, MI.getParent(), cmpInstr);
+ DebugLoc dl = MI.getDebugLoc();
+ MachineInstr *NewMI;
+
+ assert((isNewValueJumpCandidate(*cmpInstr)) &&
+ "This compare is not a New Value Jump candidate.");
+ unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
+ isSecondOpNewified,
+ jmpTarget, MBPI);
+ if (invertPredicate)
+ opc = QII->getInvertedPredicatedOpcode(opc);
+
+ if (isSecondOpReg)
+ NewMI = BuildMI(*MBB, jmpPos, dl,
+ QII->get(opc))
+ .addReg(cmpReg1, getKillRegState(MO1IsKill))
+ .addReg(cmpOp2, getKillRegState(MO2IsKill))
+ .addMBB(jmpTarget);
+
+ else
+ NewMI = BuildMI(*MBB, jmpPos, dl,
+ QII->get(opc))
+ .addReg(cmpReg1, getKillRegState(MO1IsKill))
+ .addImm(cmpOp2)
+ .addMBB(jmpTarget);
+
+ assert(NewMI && "New Value Jump Instruction Not created!");
+ (void)NewMI;
+ if (cmpInstr->getOperand(0).isReg() &&
+ cmpInstr->getOperand(0).isKill())
+ cmpInstr->getOperand(0).setIsKill(false);
+ if (cmpInstr->getOperand(1).isReg() &&
+ cmpInstr->getOperand(1).isKill())
+ cmpInstr->getOperand(1).setIsKill(false);
+ // The original compare and jump are now redundant.
+ cmpInstr->eraseFromParent();
+ jmpInstr->eraseFromParent();
+ ++nvjGenerated;
+ ++NumNVJGenerated;
+ break;
+ }
+ }
+ }
+ }
+
+ // NOTE(review): modification is reported unconditionally, even when no
+ // new value jump was generated (nvjGenerated == 0).
+ return true;
+
+}
+
+// Factory for the pass: returns a newly allocated HexagonNewValueJump.
+FunctionPass *llvm::createHexagonNewValueJump() {
+ return new HexagonNewValueJump();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
new file mode 100644
index 000000000000..983310571563
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
@@ -0,0 +1,332 @@
+//===- HexagonOperands.td - Hexagon immediate processing ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Assembler operand classes for the plain immediate operands. Each class only
+// sets its Name; the classes are referenced as ParserMatchClass by the
+// Operand definitions that follow.
+// Naming: <s|u|n><bits>_<shift>; presumably "s" is signed, "u" unsigned and
+// "n" negative, matching the isShiftedInt/isShiftedUInt predicates defined
+// later in this file - confirm against the assembler parser.
+def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; }
+def s23_2ImmOperand : AsmOperandClass { let Name = "s23_2Imm"; }
+def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; }
+def s8_0Imm64Operand : AsmOperandClass { let Name = "s8_0Imm64"; }
+def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; }
+def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; }
+def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; }
+def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; }
+def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; }
+def s4_6ImmOperand : AsmOperandClass { let Name = "s4_6Imm"; }
+def s3_6ImmOperand : AsmOperandClass { let Name = "s3_6Imm"; }
+def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; }
+def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; }
+def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; }
+def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; }
+def u16_1ImmOperand : AsmOperandClass { let Name = "u16_1Imm"; }
+def u16_2ImmOperand : AsmOperandClass { let Name = "u16_2Imm"; }
+def u16_3ImmOperand : AsmOperandClass { let Name = "u16_3Imm"; }
+def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; }
+def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; }
+def u9_0ImmOperand : AsmOperandClass { let Name = "u9_0Imm"; }
+def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; }
+def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; }
+def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; }
+def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; }
+def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; }
+def u6_3ImmOperand : AsmOperandClass { let Name = "u6_3Imm"; }
+def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; }
+def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; }
+def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; }
+def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; }
+def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; }
+def n8_0ImmOperand : AsmOperandClass { let Name = "n8_0Imm"; }
+// Immediate operands.
+// All operands in this group are OPERAND_IMMEDIATE and default to the
+// "unsignedImmDecoder" decoder method; several of the signed operands
+// override DecoderMethod with a dedicated decoder. Operands declared without
+// a body (e.g. s6_3Imm, u5_1Imm) have no ParserMatchClass of their own.
+
+let OperandType = "OPERAND_IMMEDIATE",
+ DecoderMethod = "unsignedImmDecoder" in {
+ def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand;
+ let DecoderMethod = "s32_0ImmDecoder"; }
+ def s23_2Imm : Operand<i32> { let ParserMatchClass = s23_2ImmOperand; }
+ def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand;
+ let DecoderMethod = "s8_0ImmDecoder"; }
+ def s8_0Imm64 : Operand<i64> { let ParserMatchClass = s8_0Imm64Operand;
+ let DecoderMethod = "s8_0ImmDecoder"; }
+ def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand;
+ let DecoderMethod = "s6_0ImmDecoder"; }
+ def s6_3Imm : Operand<i32>;
+ def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand;
+ let DecoderMethod = "s4_0ImmDecoder"; }
+ def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand;
+ let DecoderMethod = "s4_1ImmDecoder"; }
+ def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand;
+ let DecoderMethod = "s4_2ImmDecoder"; }
+ def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand;
+ let DecoderMethod = "s4_3ImmDecoder"; }
+ def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
+ def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; }
+ def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; }
+ def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; }
+ def u16_1Imm : Operand<i32> { let ParserMatchClass = u16_1ImmOperand; }
+ def u16_2Imm : Operand<i32> { let ParserMatchClass = u16_2ImmOperand; }
+ def u16_3Imm : Operand<i32> { let ParserMatchClass = u16_3ImmOperand; }
+ def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; }
+ def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; }
+ def u9_0Imm : Operand<i32> { let ParserMatchClass = u9_0ImmOperand; }
+ def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; }
+ def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; }
+ def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; }
+ def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; }
+ def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; }
+ def u6_3Imm : Operand<i32> { let ParserMatchClass = u6_3ImmOperand; }
+ def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; }
+ def u5_1Imm : Operand<i32>;
+ def u5_2Imm : Operand<i32>;
+ def u5_3Imm : Operand<i32>;
+ def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; }
+ def u4_1Imm : Operand<i32>;
+ def u4_2Imm : Operand<i32>;
+ def u4_3Imm : Operand<i32>;
+ def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; }
+ def u3_1Imm : Operand<i32>;
+ def u3_2Imm : Operand<i32>;
+ def u3_3Imm : Operand<i32>;
+ def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; }
+ def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; }
+ def n8_0Imm : Operand<i32> { let ParserMatchClass = n8_0ImmOperand; }
+}
+
+// Immediates with a shift of 6 or 7 bits. Each has its own PrintMethod.
+// NOTE(review): the _7 variants reuse the _6 decoder methods and have no
+// ParserMatchClass - confirm that this is intentional.
+let OperandType = "OPERAND_IMMEDIATE" in {
+ def s4_6Imm : Operand<i32> { let ParserMatchClass = s4_6ImmOperand;
+ let PrintMethod = "prints4_6ImmOperand";
+ let DecoderMethod = "s4_6ImmDecoder";}
+ def s4_7Imm : Operand<i32> { let PrintMethod = "prints4_7ImmOperand";
+ let DecoderMethod = "s4_6ImmDecoder";}
+ def s3_6Imm : Operand<i32> { let ParserMatchClass = s3_6ImmOperand;
+ let PrintMethod = "prints3_6ImmOperand";
+ let DecoderMethod = "s3_6ImmDecoder";}
+ def s3_7Imm : Operand<i32> { let PrintMethod = "prints3_7ImmOperand";
+ let DecoderMethod = "s3_6ImmDecoder";}
+}
+// Operand with its own assembler match class "n1Const"; presumably the
+// constant -1 - confirm against the assembler parser.
+def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
+def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
+
+//
+// Immediate predicates
+//
+// Each PatLeaf below matches an immediate whose sign-extended value passes
+// the stated range check:
+//   sN_0 / uN_0 : isInt<N> / isUInt<N>
+//   sN_M / uN_M : isShiftedInt<N,M> / isShiftedUInt<N,M>
+// NOTE(review): the unsigned predicates also test the sign-extended value,
+// so a constant that sign-extends to a negative number presumably never
+// satisfies them (relevant for u32_0ImmPred) - confirm that this is intended.
+//
+def s32_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isInt<32>(v);
+}]>;
+
+def s31_1ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedInt<31,1>(v);
+}]>;
+
+def s30_2ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedInt<30,2>(v);
+}]>;
+
+def s29_3ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedInt<29,3>(v);
+}]>;
+
+def s10_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isInt<10>(v);
+}]>;
+
+def s8_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isInt<8>(v);
+}]>;
+
+// Same range as s8_0ImmPred, but matches an i64 immediate.
+def s8_0Imm64Pred : PatLeaf<(i64 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isInt<8>(v);
+}]>;
+
+def s6_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isInt<6>(v);
+}]>;
+
+def s4_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isInt<4>(v);
+}]>;
+
+def s4_1ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedInt<4,1>(v);
+}]>;
+
+def s4_2ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedInt<4,2>(v);
+}]>;
+
+def s4_3ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedInt<4,3>(v);
+}]>;
+
+def u32_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<32>(v);
+}]>;
+
+def u16_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<16>(v);
+}]>;
+
+def u11_3ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedUInt<11,3>(v);
+}]>;
+
+def u9_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<9>(v);
+}]>;
+
+def u8_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<8>(v);
+}]>;
+
+def u6_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<6>(v);
+}]>;
+
+def u6_1ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedUInt<6,1>(v);
+}]>;
+
+def u6_2ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedUInt<6,2>(v);
+}]>;
+
+def u5_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<5>(v);
+}]>;
+
+def u4_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<4>(v);
+}]>;
+
+def u3_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<3>(v);
+}]>;
+
+def u2_0ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<2>(v);
+}]>;
+
+// Extendable immediate operands.
+// Assembler operand classes for the "Ext" operands defined below; each class
+// only sets its Name.
+def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; }
+def s16_0ExtOperand : AsmOperandClass { let Name = "s16_0Ext"; }
+def s12_0ExtOperand : AsmOperandClass { let Name = "s12_0Ext"; }
+def s10_0ExtOperand : AsmOperandClass { let Name = "s10_0Ext"; }
+def s9_0ExtOperand : AsmOperandClass { let Name = "s9_0Ext"; }
+def s8_0ExtOperand : AsmOperandClass { let Name = "s8_0Ext"; }
+def s7_0ExtOperand : AsmOperandClass { let Name = "s7_0Ext"; }
+def s6_0ExtOperand : AsmOperandClass { let Name = "s6_0Ext"; }
+def s11_0ExtOperand : AsmOperandClass { let Name = "s11_0Ext"; }
+def s11_1ExtOperand : AsmOperandClass { let Name = "s11_1Ext"; }
+def s11_2ExtOperand : AsmOperandClass { let Name = "s11_2Ext"; }
+def s11_3ExtOperand : AsmOperandClass { let Name = "s11_3Ext"; }
+def u6_0ExtOperand : AsmOperandClass { let Name = "u6_0Ext"; }
+def u7_0ExtOperand : AsmOperandClass { let Name = "u7_0Ext"; }
+def u8_0ExtOperand : AsmOperandClass { let Name = "u8_0Ext"; }
+def u9_0ExtOperand : AsmOperandClass { let Name = "u9_0Ext"; }
+def u10_0ExtOperand : AsmOperandClass { let Name = "u10_0Ext"; }
+def u6_1ExtOperand : AsmOperandClass { let Name = "u6_1Ext"; }
+def u6_2ExtOperand : AsmOperandClass { let Name = "u6_2Ext"; }
+def u6_3ExtOperand : AsmOperandClass { let Name = "u6_3Ext"; }
+def u32_0MustExtOperand : AsmOperandClass { let Name = "u32_0MustExt"; }
+
+
+
+// Extendable operand definitions. All are printed with "printExtOperand" and
+// default to "unsignedImmDecoder"; most of the signed operands override
+// DecoderMethod with a dedicated decoder (s7_0Ext keeps the default).
+let OperandType = "OPERAND_IMMEDIATE", PrintMethod = "printExtOperand",
+ DecoderMethod = "unsignedImmDecoder" in {
+ def f32Ext : Operand<f32> { let ParserMatchClass = f32ExtOperand; }
+ def s16_0Ext : Operand<i32> { let ParserMatchClass = s16_0ExtOperand;
+ let DecoderMethod = "s16_0ImmDecoder"; }
+ def s12_0Ext : Operand<i32> { let ParserMatchClass = s12_0ExtOperand;
+ let DecoderMethod = "s12_0ImmDecoder"; }
+ def s11_0Ext : Operand<i32> { let ParserMatchClass = s11_0ExtOperand;
+ let DecoderMethod = "s11_0ImmDecoder"; }
+ def s11_1Ext : Operand<i32> { let ParserMatchClass = s11_1ExtOperand;
+ let DecoderMethod = "s11_1ImmDecoder"; }
+ def s11_2Ext : Operand<i32> { let ParserMatchClass = s11_2ExtOperand;
+ let DecoderMethod = "s11_2ImmDecoder"; }
+ def s11_3Ext : Operand<i32> { let ParserMatchClass = s11_3ExtOperand;
+ let DecoderMethod = "s11_3ImmDecoder"; }
+ def s10_0Ext : Operand<i32> { let ParserMatchClass = s10_0ExtOperand;
+ let DecoderMethod = "s10_0ImmDecoder"; }
+ def s9_0Ext : Operand<i32> { let ParserMatchClass = s9_0ExtOperand;
+ let DecoderMethod = "s9_0ImmDecoder"; }
+ def s8_0Ext : Operand<i32> { let ParserMatchClass = s8_0ExtOperand;
+ let DecoderMethod = "s8_0ImmDecoder"; }
+ def s7_0Ext : Operand<i32> { let ParserMatchClass = s7_0ExtOperand; }
+ def s6_0Ext : Operand<i32> { let ParserMatchClass = s6_0ExtOperand;
+ let DecoderMethod = "s6_0ImmDecoder"; }
+ def u7_0Ext : Operand<i32> { let ParserMatchClass = u7_0ExtOperand; }
+ def u8_0Ext : Operand<i32> { let ParserMatchClass = u8_0ExtOperand; }
+ def u9_0Ext : Operand<i32> { let ParserMatchClass = u9_0ExtOperand; }
+ def u10_0Ext : Operand<i32> { let ParserMatchClass = u10_0ExtOperand; }
+ def u6_0Ext : Operand<i32> { let ParserMatchClass = u6_0ExtOperand; }
+ def u6_1Ext : Operand<i32> { let ParserMatchClass = u6_1ExtOperand; }
+ def u6_2Ext : Operand<i32> { let ParserMatchClass = u6_2ExtOperand; }
+ def u6_3Ext : Operand<i32> { let ParserMatchClass = u6_3ExtOperand; }
+ def u32_0MustExt : Operand<i32> { let ParserMatchClass = u32_0MustExtOperand; }
+}
+
+
+// This complex pattern exists only to create a machine instruction operand
+// of type "frame index". There doesn't seem to be a way to do that directly
+// in the patterns. It is rooted at frameindex nodes and names "SelectAddrFI"
+// as its selection routine.
+def AddrFI : ComplexPattern<i32, 1, "SelectAddrFI", [frameindex], []>;
+
+// These complex patterns are not strictly necessary, since global address
+// folding will happen during DAG combining. For distinguishing between GA
+// and GP, pat frags with HexagonCONST32 and HexagonCONST32_GP can be used.
+// Neither pattern lists any root nodes.
+def AddrGA : ComplexPattern<i32, 1, "SelectAddrGA", [], []>;
+def AddrGP : ComplexPattern<i32, 1, "SelectAddrGP", [], []>;
+
+// Address operands.
+
+// Global address operands, printed with "printGlobalOperand".
+let PrintMethod = "printGlobalOperand" in {
+ def globaladdress : Operand<i32>;
+ def globaladdressExt : Operand<i32>;
+}
+
+// Jump table base operand, printed with "printJumpTable".
+let PrintMethod = "printJumpTable" in
+def jumptablebase : Operand<i32>;
+
+// Branch and call target operands. All three share the "brtargetDecoder"
+// decoder and the "printBrtarget" printer; the branch targets use OtherVT
+// while the call target is an i32.
+def brtarget : Operand<OtherVT> {
+ let DecoderMethod = "brtargetDecoder";
+ let PrintMethod = "printBrtarget";
+}
+def brtargetExt : Operand<OtherVT> {
+ let DecoderMethod = "brtargetDecoder";
+ let PrintMethod = "printBrtarget";
+}
+def calltarget : Operand<i32> {
+ let DecoderMethod = "brtargetDecoder";
+ let PrintMethod = "printBrtarget";
+}
+
+// Basic block label operand and the SDNode wrapping ISD::BasicBlock.
+def bblabel : Operand<i32>;
+def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
new file mode 100644
index 000000000000..89db46799cb3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -0,0 +1,678 @@
+//===--- HexagonOptAddrMode.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This implements a Hexagon-specific pass to optimize addressing mode for
+// load/store instructions.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "opt-addr-mode"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <map>
+
+// Hidden command-line option "hexagon-amode-growth-limit" (default 0).
+// processBlock() skips a candidate when the size increase estimated by
+// analyzeUses() is greater than this value.
+static cl::opt<int> CodeGrowthLimit("hexagon-amode-growth-limit",
+ cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode "
+ "optimization"));
+
+using namespace llvm;
+using namespace rdf;
+
+// Forward declarations of the pass factory (defined at the end of this file)
+// and of the pass-registry initializer used by the pass constructor.
+namespace llvm {
+
+ FunctionPass *createHexagonOptAddrMode();
+ void initializeHexagonOptAddrModePass(PassRegistry &);
+
+} // end namespace llvm
+
+namespace {
+
+// Machine function pass that rewrites the addressing mode of loads, stores
+// and S2_addasl_rrri instructions fed by an A2_tfrsi of a global address (see
+// processBlock), using the RDF data-flow graph and liveness information built
+// in runOnMachineFunction.
+class HexagonOptAddrMode : public MachineFunctionPass {
+public:
+ static char ID;
+
+ // Registers the pass with the global PassRegistry on construction.
+ HexagonOptAddrMode()
+ : MachineFunctionPass(ID), HII(nullptr), MDT(nullptr), DFG(nullptr),
+ LV(nullptr) {
+ PassRegistry &R = *PassRegistry::getPassRegistry();
+ initializeHexagonOptAddrModePass(R);
+ }
+
+ StringRef getPassName() const override {
+ return "Optimize addressing mode of load/store";
+ }
+
+ // Requires the dominator tree and dominance frontier (both are passed to the
+ // DataFlowGraph constructor in runOnMachineFunction) and declares all
+ // analyses preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ AU.setPreservesAll();
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ typedef DenseSet<MachineInstr *> MISetType;
+ // Maps a use instruction to "can be replaced", as computed by analyzeUses.
+ typedef DenseMap<MachineInstr *, bool> InstrEvalMap;
+ const HexagonInstrInfo *HII;
+ MachineDominatorTree *MDT;
+ // Points at the graph that lives on runOnMachineFunction's stack.
+ DataFlowGraph *DFG;
+ // Definition stacks maintained by constructDefMap while walking the
+ // dominator tree.
+ DataFlowGraph::DefStackMap DefM;
+ // Per register: instruction node id -> id of the definition on top of that
+ // register's stack when the instruction was visited (see updateMap).
+ std::map<RegisterRef, std::map<NodeId, NodeId>> RDefMap;
+ // Points at the Liveness object on runOnMachineFunction's stack.
+ Liveness *LV;
+ // Instructions scheduled for erasure at the end of runOnMachineFunction.
+ MISetType Deleted;
+
+ bool processBlock(NodeAddr<BlockNode *> BA);
+ bool xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
+ NodeAddr<UseNode *> UseN, unsigned UseMOnum);
+ bool analyzeUses(unsigned DefR, const NodeList &UNodeList,
+ InstrEvalMap &InstrEvalResult, short &SizeInc);
+ bool hasRepForm(MachineInstr &MI, unsigned TfrDefR);
+ bool canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, MachineInstr &MI,
+ const NodeList &UNodeList);
+ void getAllRealUses(NodeAddr<StmtNode *> SN, NodeList &UNodeList);
+ bool allValidCandidates(NodeAddr<StmtNode *> SA, NodeList &UNodeList);
+ short getBaseWithLongOffset(const MachineInstr &MI) const;
+ void updateMap(NodeAddr<InstrNode *> IA);
+ bool constructDefMap(MachineBasicBlock *B);
+ bool changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
+ unsigned ImmOpNum);
+ bool changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum);
+ bool changeAddAsl(NodeAddr<UseNode *> AddAslUN, MachineInstr *AddAslMI,
+ const MachineOperand &ImmOp, unsigned ImmOpNum);
+};
+
+} // end anonymous namespace
+
+// Storage for the pass identifier passed to the MachineFunctionPass base.
+char HexagonOptAddrMode::ID = 0;
+
+// Register the pass under the argument name "opt-amode", declaring
+// MachineDominatorTree and MachineDominanceFrontier as dependencies.
+INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "opt-amode",
+ "Optimize addressing mode", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(HexagonOptAddrMode, "opt-amode", "Optimize addressing mode",
+ false, false)
+
+// Returns true when MI is an unpredicated load or store for which a
+// replacement opcode exists that can absorb the value defined by the tfr
+// (whose destination register is TfrDefR).
+bool HexagonOptAddrMode::hasRepForm(MachineInstr &MI, unsigned TfrDefR) {
+  const MCInstrDesc &Desc = MI.getDesc();
+  const bool IsStore = Desc.mayStore();
+
+  // Only memory accesses are candidates, and predicated ones are rejected.
+  if (!(IsStore || Desc.mayLoad()) || HII->isPredicated(MI))
+    return false;
+
+  // A store whose last operand is the tfr result cannot be rewritten.
+  if (IsStore) {
+    const MachineOperand &LastOp = MI.getOperand(MI.getNumOperands() - 1);
+    if (LastOp.isReg() && LastOp.getReg() == TfrDefR)
+      return false;
+  }
+
+  const auto Mode = HII->getAddrMode(MI);
+
+  // Transform to absolute plus register offset.
+  if (Mode == HexagonII::BaseRegOffset)
+    return HII->getBaseWithLongOffset(MI) >= 0;
+
+  // Transform to absolute addressing mode.
+  if (Mode == HexagonII::BaseImmOffset)
+    return HII->getAbsoluteForm(MI) >= 0;
+
+  return false;
+}
+
+// Check if addasl instruction can be removed. This is possible only
+// if it's feeding to only load/store instructions with base + register
+// offset as these instructions can be transformed to use 'absolute plus
+// shifted register offset'.
+// ex:
+// Rs = ##foo
+// Rx = addasl(Rs, Rt, #2)
+// Rd = memw(Rx + #28)
+// Above three instructions can be replaced with Rd = memw(Rt<<#2 + ##foo+28)
+//
+// AddAslSN is the RDF statement node of the addasl, MI is the addasl itself
+// and UNodeList holds the use nodes reached by its result. Returns true only
+// if every one of those uses passes all the checks below.
+
+bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN,
+ MachineInstr &MI,
+ const NodeList &UNodeList) {
+ // check offset size in addasl. if 'offset > 3' return false
+ const MachineOperand &OffsetOp = MI.getOperand(3);
+ if (!OffsetOp.isImm() || OffsetOp.getImm() > 3)
+ return false;
+
+ // Find the use node of the addasl's operand 2 (the register that gets
+ // shifted) and remember its reaching definition.
+ unsigned OffsetReg = MI.getOperand(2).getReg();
+ RegisterRef OffsetRR;
+ NodeId OffsetRegRD = 0;
+ for (NodeAddr<UseNode *> UA : AddAslSN.Addr->members_if(DFG->IsUse, *DFG)) {
+ RegisterRef RR = UA.Addr->getRegRef(*DFG);
+ if (OffsetReg == RR.Reg) {
+ OffsetRR = RR;
+ OffsetRegRD = UA.Addr->getReachingDef();
+ }
+ }
+
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UA = *I;
+ NodeAddr<InstrNode *> IA = UA.Addr->getOwner(*DFG);
+ // Reject phi uses, and uses for which the definition recorded in RDefMap
+ // for the offset register differs from the one reaching the addasl.
+ // Note: std::map::operator[] inserts a value-initialized entry when the
+ // key is not present yet.
+ if ((UA.Addr->getFlags() & NodeAttrs::PhiRef) ||
+ RDefMap[OffsetRR][IA.Id] != OffsetRegRD)
+ return false;
+
+ MachineInstr &UseMI = *NodeAddr<StmtNode *>(IA).Addr->getCode();
+ NodeAddr<DefNode *> OffsetRegDN = DFG->addr<DefNode *>(OffsetRegRD);
+ // Reaching Def to an offset register can't be a phi.
+ // (As written, a phi definition is rejected only when the use is in a
+ // different basic block than the addasl.)
+ if ((OffsetRegDN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+ MI.getParent() != UseMI.getParent())
+ return false;
+
+ // The use must be a base + immediate offset load/store that has an
+ // 'absolute plus shifted register offset' counterpart.
+ const MCInstrDesc &UseMID = UseMI.getDesc();
+ if ((!UseMID.mayLoad() && !UseMID.mayStore()) ||
+ HII->getAddrMode(UseMI) != HexagonII::BaseImmOffset ||
+ getBaseWithLongOffset(UseMI) < 0)
+ return false;
+
+ // Addasl output can't be a store value.
+ if (UseMID.mayStore() && UseMI.getOperand(2).isReg() &&
+ UseMI.getOperand(2).getReg() == MI.getOperand(0).getReg())
+ return false;
+
+ // Instructions that still carry a frame-index operand are not handled.
+ for (auto &Mo : UseMI.operands())
+ if (Mo.isFI())
+ return false;
+ }
+ return true;
+}
+
+// Returns true when no use in UNodeList has more than one reaching
+// definition (as reported by Liveness::getAllReachingDefsRec). The list is
+// examined from back to front and the scan stops at the first offender.
+bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
+                                            NodeList &UNodeList) {
+  for (auto It = UNodeList.rbegin(), End = UNodeList.rend(); It != End; ++It) {
+    NodeAddr<UseNode *> UseNA = *It;
+    RegisterRef UseRR = UseNA.Addr->getRegRef(*DFG);
+    NodeSet Visited, Defs;
+    const auto &ReachingDefs =
+        LV->getAllReachingDefsRec(UseRR, UseNA, Visited, Defs);
+
+    // Zero or one reaching definition: this use is fine, look at the next.
+    if (ReachingDefs.size() <= 1)
+      continue;
+
+    DEBUG({
+      dbgs() << "*** Multiple Reaching Defs found!!! ***\n";
+      for (auto DefId : ReachingDefs) {
+        NodeAddr<UseNode *> DefNA = DFG->addr<UseNode *>(DefId);
+        NodeAddr<StmtNode *> OwnerNA = DefNA.Addr->getOwner(*DFG);
+        dbgs() << "\t\t[Reaching Def]: "
+               << Print<NodeAddr<InstrNode *>>(OwnerNA, *DFG) << "\n";
+      }
+    });
+    return false;
+  }
+  return true;
+}
+
+// Appends to UNodeList every use reached by the definitions of statement SA.
+// A reached use that belongs to a phi node is not added itself; instead the
+// phi's real uses of the same register (from Liveness::getRealUses) are
+// added.
+void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
+                                        NodeList &UNodeList) {
+  for (NodeAddr<DefNode *> DefNA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
+    DEBUG(dbgs() << "\t\t[DefNode]: " << Print<NodeAddr<DefNode *>>(DefNA, *DFG)
+                 << "\n");
+    RegisterRef DefRR = DFG->normalizeRef(DefNA.Addr->getRegRef(*DFG));
+
+    auto ReachedUses = LV->getAllReachedUses(DefRR, DefNA);
+
+    for (auto UseId : ReachedUses) {
+      NodeAddr<UseNode *> UseNA = DFG->addr<UseNode *>(UseId);
+      DEBUG({
+        NodeAddr<StmtNode *> OwnerNA = UseNA.Addr->getOwner(*DFG);
+        dbgs() << "\t\t\t[Reached Use]: "
+               << Print<NodeAddr<InstrNode *>>(OwnerNA, *DFG) << "\n";
+      });
+
+      // Ordinary (non-phi) use: record it directly.
+      if (!(UseNA.Addr->getFlags() & NodeAttrs::PhiRef)) {
+        UNodeList.push_back(UseNA);
+        continue;
+      }
+
+      // Phi use: look through the phi to its real uses.
+      NodeAddr<PhiNode *> PhiNA = UseNA.Addr->getOwner(*DFG);
+      const Liveness::RefMap &PhiRealUses = LV->getRealUses(PhiNA.Id);
+      DEBUG(dbgs() << "\t\t\t\tphi real Uses"
+                   << Print<Liveness::RefMap>(PhiRealUses, *DFG) << "\n");
+
+      for (auto Entry : PhiRealUses) {
+        // Only the entry for the register defined by DefNA is relevant.
+        if (DefRR.Reg != Entry.first)
+          continue;
+        auto UsesOfReg = Entry.second;
+        for (auto RealUse : UsesOfReg) {
+          NodeAddr<UseNode *> RealUseNA =
+              DFG->addr<UseNode *>(RealUse.first);
+          UNodeList.push_back(RealUseNA);
+        }
+      }
+    }
+  }
+}
+
+// Classify every use in UNodeList of the register tfrDefR.
+// - InstrEvalResult is cleared and then receives, per use instruction,
+// whether it can be rewritten. Loads/stores rejected by hasRepForm get no
+// entry at all.
+// - SizeInc is adjusted (not reset) by the estimated change in size; the
+// caller is expected to initialize it.
+// Returns true if at least one use can be replaced.
+bool HexagonOptAddrMode::analyzeUses(unsigned tfrDefR,
+ const NodeList &UNodeList,
+ InstrEvalMap &InstrEvalResult,
+ short &SizeInc) {
+ bool KeepTfr = false;
+ bool HasRepInstr = false;
+ InstrEvalResult.clear();
+
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ bool CanBeReplaced = false;
+ NodeAddr<UseNode *> UN = *I;
+ NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
+ MachineInstr &MI = *SN.Addr->getCode();
+ const MCInstrDesc &MID = MI.getDesc();
+ if ((MID.mayLoad() || MID.mayStore())) {
+ // A memory access without a replacement form forces the tfr to stay.
+ if (!hasRepForm(MI, tfrDefR)) {
+ KeepTfr = true;
+ continue;
+ }
+ // Each rewritten load/store is counted as one unit of growth.
+ SizeInc++;
+ CanBeReplaced = true;
+ } else if (MI.getOpcode() == Hexagon::S2_addasl_rrri) {
+ NodeList AddaslUseList;
+
+ DEBUG(dbgs() << "\nGetting ReachedUses for === " << MI << "\n");
+ getAllRealUses(SN, AddaslUseList);
+ // Process phi nodes.
+ if (allValidCandidates(SN, AddaslUseList) &&
+ canRemoveAddasl(SN, MI, AddaslUseList)) {
+ SizeInc += AddaslUseList.size();
+ SizeInc -= 1; // Reduce size by 1 as addasl itself can be removed.
+ CanBeReplaced = true;
+ } else
+ SizeInc++;
+ } else
+ // Currently, only load/store and addasl are handled.
+ // Some other instructions to consider -
+ // A2_add -> A2_addi
+ // M4_mpyrr_addr -> M4_mpyrr_addi
+ KeepTfr = true;
+
+ InstrEvalResult[&MI] = CanBeReplaced;
+ HasRepInstr |= CanBeReplaced;
+ }
+
+ // Reduce total size by 2 if original tfr can be deleted.
+ if (!KeepTfr)
+ SizeInc -= 2;
+
+ return HasRepInstr;
+}
+
+// Build a replacement for the load OldMI in which the register operand at
+// index ImmOpNum (the tfr result) is replaced by ImmOp, the tfr's source
+// operand. The new instruction is inserted right after OldMI; OldMI itself is
+// not erased here. Returns true if a replacement was built.
+bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
+ unsigned ImmOpNum) {
+ bool Changed = false;
+ MachineBasicBlock *BB = OldMI->getParent();
+ auto UsePos = MachineBasicBlock::iterator(OldMI);
+ MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
+ ++InsertPt;
+ // Index of the first old operand that is copied over unchanged at the end.
+ unsigned OpStart;
+ unsigned OpEnd = OldMI->getNumOperands();
+ MachineInstrBuilder MIB;
+
+ if (ImmOpNum == 1) {
+ // The tfr result is operand 1 of the load.
+ if (HII->getAddrMode(*OldMI) == HexagonII::BaseRegOffset) {
+ // New operands: old operand 0, old operands 2 and 3, then ImmOp.
+ short NewOpCode = HII->getBaseWithLongOffset(*OldMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+ MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+ MIB.addOperand(OldMI->getOperand(0));
+ MIB.addOperand(OldMI->getOperand(2));
+ MIB.addOperand(OldMI->getOperand(3));
+ MIB.addOperand(ImmOp);
+ OpStart = 4;
+ Changed = true;
+ } else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
+ // Absolute form: the old immediate (operand 2) is folded into the
+ // offset of the global address.
+ short NewOpCode = HII->getAbsoluteForm(*OldMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+ MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode))
+ .addOperand(OldMI->getOperand(0));
+ const GlobalValue *GV = ImmOp.getGlobal();
+ int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(2).getImm();
+
+ MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
+ OpStart = 3;
+ Changed = true;
+ } else
+ Changed = false;
+
+ DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+ } else if (ImmOpNum == 2 && OldMI->getOperand(3).getImm() == 0) {
+ // The tfr result is operand 2 and operand 3 is an immediate zero:
+ // keep operands 0 and 1 and put ImmOp in place of operands 2 and 3.
+ short NewOpCode = HII->xformRegToImmOffset(*OldMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+ MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+ MIB.addOperand(OldMI->getOperand(0));
+ MIB.addOperand(OldMI->getOperand(1));
+ MIB.addOperand(ImmOp);
+ OpStart = 4;
+ Changed = true;
+ DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+ }
+
+ // Append whatever operands follow the ones handled above.
+ if (Changed)
+ for (unsigned i = OpStart; i < OpEnd; ++i)
+ MIB.addOperand(OldMI->getOperand(i));
+
+ return Changed;
+}
+
+// Build a replacement for the store OldMI in which the register operand at
+// index ImmOpNum (the tfr result) is replaced by ImmOp, the tfr's source
+// operand. The new instruction is inserted right after OldMI; OldMI itself is
+// not erased here.
+//
+// Returns true only if a replacement instruction was actually built. In
+// particular, when ImmOpNum is 0 but the addressing mode is neither
+// BaseRegOffset nor BaseImmOffset, nothing is built and false is returned.
+bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
+                                     unsigned ImmOpNum) {
+  bool Changed = false;
+  // Index of the first old operand that is copied over unchanged at the end.
+  unsigned OpStart = 0;
+  unsigned OpEnd = OldMI->getNumOperands();
+  MachineBasicBlock *BB = OldMI->getParent();
+  auto UsePos = MachineBasicBlock::iterator(OldMI);
+  MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
+  ++InsertPt;
+  MachineInstrBuilder MIB;
+
+  if (ImmOpNum == 0) {
+    // The tfr result is the base register (operand 0).
+    if (HII->getAddrMode(*OldMI) == HexagonII::BaseRegOffset) {
+      // New operands: old operands 1 and 2, ImmOp, then old operand 3.
+      short NewOpCode = HII->getBaseWithLongOffset(*OldMI);
+      assert(NewOpCode >= 0 && "Invalid New opcode\n");
+      MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+      MIB.addOperand(OldMI->getOperand(1));
+      MIB.addOperand(OldMI->getOperand(2));
+      MIB.addOperand(ImmOp);
+      MIB.addOperand(OldMI->getOperand(3));
+      OpStart = 4;
+      Changed = true;
+    } else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
+      // Absolute form: the old immediate (operand 1) is folded into the
+      // offset of the global address; operand 2 follows it.
+      short NewOpCode = HII->getAbsoluteForm(*OldMI);
+      assert(NewOpCode >= 0 && "Invalid New opcode\n");
+      MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+      const GlobalValue *GV = ImmOp.getGlobal();
+      int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(1).getImm();
+      MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
+      MIB.addOperand(OldMI->getOperand(2));
+      OpStart = 3;
+      Changed = true;
+    }
+    // Any other addressing mode is left alone: there is no instruction to
+    // append operands to, so Changed must stay false.
+  } else if (ImmOpNum == 1 && OldMI->getOperand(2).isImm() &&
+             OldMI->getOperand(2).getImm() == 0) {
+    // The tfr result is operand 1 and operand 2 is an immediate zero.
+    // ImmOp takes the place of both of them, so the new instruction is:
+    // old operand 0, ImmOp, then old operands 3 and up. (This mirrors the
+    // corresponding case in changeLoad; operand 1 is the register being
+    // replaced and must not be carried over.)
+    short NewOpCode = HII->xformRegToImmOffset(*OldMI);
+    assert(NewOpCode >= 0 && "Invalid New opcode\n");
+    MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+    MIB.addOperand(OldMI->getOperand(0));
+    MIB.addOperand(ImmOp);
+    OpStart = 3;
+    Changed = true;
+  }
+
+  if (Changed) {
+    DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+    DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+    // Append whatever operands follow the ones handled above.
+    for (unsigned i = OpStart; i < OpEnd; ++i)
+      MIB.addOperand(OldMI->getOperand(i));
+  }
+
+  return Changed;
+}
+
+// Returns the opcode of the "base with long offset" form of MI as reported
+// by HexagonInstrInfo. For a base + immediate offset instruction the lookup
+// goes through its base + register offset opcode first.
+short HexagonOptAddrMode::getBaseWithLongOffset(const MachineInstr &MI) const {
+  if (HII->getAddrMode(MI) != HexagonII::BaseImmOffset)
+    return HII->getBaseWithLongOffset(MI);
+
+  short RegOffsetOpc = HII->getBaseWithRegOffset(MI);
+  return HII->getBaseWithLongOffset(RegOffsetOpc);
+}
+
+// Rewrite every load/store reached by the result of the addasl AddAslMI so
+// that it addresses memory as "shifted register + global" directly:
+//   mem(Rx + #imm)  ->  mem(Rt << #shift + ##global+offset)
+// where Rt and #shift are operands 2 and 3 of the addasl and the global comes
+// from ImmOp (the source operand of the feeding tfr).
+//
+// The offset of the new global-address operand is the memory instruction's
+// immediate plus ImmOp's own offset, matching what changeLoad/changeStore do
+// for the absolute form.
+//
+// Each rewritten use is queued in Deleted; the addasl itself is queued by the
+// caller. ImmOpNum is not used. Always returns true.
+bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
+                                      MachineInstr *AddAslMI,
+                                      const MachineOperand &ImmOp,
+                                      unsigned ImmOpNum) {
+  NodeAddr<StmtNode *> SA = AddAslUN.Addr->getOwner(*DFG);
+
+  DEBUG(dbgs() << "Processing addasl :" << *AddAslMI << "\n");
+
+  NodeList UNodeList;
+  getAllRealUses(SA, UNodeList);
+
+  for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+    NodeAddr<UseNode *> UseUN = *I;
+    assert(!(UseUN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+           "Can't transform this 'AddAsl' instruction!");
+
+    NodeAddr<StmtNode *> UseIA = UseUN.Addr->getOwner(*DFG);
+    DEBUG(dbgs() << "[InstrNode]: " << Print<NodeAddr<InstrNode *>>(UseIA, *DFG)
+                 << "\n");
+    MachineInstr *UseMI = UseIA.Addr->getCode();
+    DEBUG(dbgs() << "[MI <BB#" << UseMI->getParent()->getNumber()
+                 << ">]: " << *UseMI << "\n");
+    const MCInstrDesc &UseMID = UseMI->getDesc();
+    assert(HII->getAddrMode(*UseMI) == HexagonII::BaseImmOffset);
+
+    // The replacement is inserted immediately before the use.
+    auto UsePos = MachineBasicBlock::iterator(UseMI);
+    MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
+    short NewOpCode = getBaseWithLongOffset(*UseMI);
+    assert(NewOpCode >= 0 && "Invalid New opcode\n");
+
+    // Index of the first old operand that is copied over unchanged.
+    unsigned OpStart;
+    unsigned OpEnd = UseMI->getNumOperands();
+
+    MachineBasicBlock *BB = UseMI->getParent();
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, InsertPt, UseMI->getDebugLoc(), HII->get(NewOpCode));
+    const GlobalValue *GV = ImmOp.getGlobal();
+    // change mem(Rs + # ) -> mem(Rt << # + ##)
+    if (UseMID.mayLoad()) {
+      // Load: operand 0 is kept, operand 2 is the immediate offset.
+      MIB.addOperand(UseMI->getOperand(0));
+      MIB.addOperand(AddAslMI->getOperand(2));
+      MIB.addOperand(AddAslMI->getOperand(3));
+      int64_t Offset = UseMI->getOperand(2).getImm() + ImmOp.getOffset();
+      MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
+      OpStart = 3;
+    } else if (UseMID.mayStore()) {
+      // Store: operand 1 is the immediate offset, operand 2 follows the
+      // new address operands.
+      MIB.addOperand(AddAslMI->getOperand(2));
+      MIB.addOperand(AddAslMI->getOperand(3));
+      int64_t Offset = UseMI->getOperand(1).getImm() + ImmOp.getOffset();
+      MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
+      MIB.addOperand(UseMI->getOperand(2));
+      OpStart = 3;
+    } else
+      llvm_unreachable("Unhandled instruction");
+
+    for (unsigned i = OpStart; i < OpEnd; ++i)
+      MIB.addOperand(UseMI->getOperand(i));
+
+    Deleted.insert(UseMI);
+  }
+
+  return true;
+}
+
+// Rewrite UseMI, a use of the register defined by TfrMI, so that it takes the
+// tfr's source operand directly. Loads go through changeLoad, stores through
+// changeStore and S2_addasl_rrri through changeAddAsl; anything else is left
+// untouched. UseMOnum is the index of the operand of UseMI that reads the tfr
+// result.
+//
+// On success UseMI is queued in Deleted and true is returned.
+bool HexagonOptAddrMode::xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
+                                    NodeAddr<UseNode *> UseN,
+                                    unsigned UseMOnum) {
+  const MachineOperand ImmOp = TfrMI->getOperand(1);
+  const MCInstrDesc &MID = UseMI->getDesc();
+  // The helpers all return bool, so keep the result as a bool.
+  bool Changed = false;
+  if (MID.mayLoad())
+    Changed = changeLoad(UseMI, ImmOp, UseMOnum);
+  else if (MID.mayStore())
+    Changed = changeStore(UseMI, ImmOp, UseMOnum);
+  else if (UseMI->getOpcode() == Hexagon::S2_addasl_rrri)
+    Changed = changeAddAsl(UseN, UseMI, ImmOp, UseMOnum);
+
+  if (Changed)
+    Deleted.insert(UseMI);
+
+  return Changed;
+}
+
+// Process one block of the data-flow graph. For every A2_tfrsi whose source
+// operand is a global address:
+//   1. collect all real uses of its result (getAllRealUses),
+//   2. give up if any use has several reaching definitions,
+//   3. estimate the size change (analyzeUses) and give up if it exceeds
+//      CodeGrowthLimit or nothing can be replaced,
+//   4. rewrite each replaceable use (xformUseMI).
+// The tfr is queued for deletion only if every one of its uses was actually
+// rewritten. Returns true if any use was rewritten.
+bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
+  bool Changed = false;
+
+  for (auto IA : BA.Addr->members(*DFG)) {
+    if (!DFG->IsCode<NodeAttrs::Stmt>(IA))
+      continue;
+
+    NodeAddr<StmtNode *> SA = IA;
+    MachineInstr *MI = SA.Addr->getCode();
+    if (MI->getOpcode() != Hexagon::A2_tfrsi ||
+        !MI->getOperand(1).isGlobal())
+      continue;
+
+    DEBUG(dbgs() << "[Analyzing A2_tfrsi]: " << *MI << "\n");
+    DEBUG(dbgs() << "\t[InstrNode]: " << Print<NodeAddr<InstrNode *>>(IA, *DFG)
+                 << "\n");
+
+    NodeList UNodeList;
+    getAllRealUses(SA, UNodeList);
+
+    if (!allValidCandidates(SA, UNodeList))
+      continue;
+
+    short SizeInc = 0;
+    unsigned DefR = MI->getOperand(0).getReg();
+    InstrEvalMap InstrEvalResult;
+
+    // Analyze all uses and calculate increase in size. Perform the optimization
+    // only if there is no increase in size.
+    if (!analyzeUses(DefR, UNodeList, InstrEvalResult, SizeInc))
+      continue;
+    if (SizeInc > CodeGrowthLimit)
+      continue;
+
+    bool KeepTfr = false;
+
+    DEBUG(dbgs() << "\t[Total reached uses] : " << UNodeList.size() << "\n");
+    DEBUG(dbgs() << "\t[Processing Reached Uses] ===\n");
+    for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+      NodeAddr<UseNode *> UseN = *I;
+      assert(!(UseN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+             "Found a PhiRef node as a real reached use!!");
+
+      NodeAddr<StmtNode *> OwnerN = UseN.Addr->getOwner(*DFG);
+      MachineInstr *UseMI = OwnerN.Addr->getCode();
+      DEBUG(dbgs() << "\t\t[MI <BB#" << UseMI->getParent()->getNumber()
+                   << ">]: " << *UseMI << "\n");
+
+      // Find the (last) operand that reads DefR. As in the original code the
+      // final operand is not examined; "j + 1 < NumOperands" is the same
+      // bound as "j < NumOperands - 1" without the unsigned wrap-around when
+      // there are no operands.
+      int UseMOnum = -1;
+      unsigned NumOperands = UseMI->getNumOperands();
+      for (unsigned j = 0; j + 1 < NumOperands; ++j) {
+        const MachineOperand &op = UseMI->getOperand(j);
+        if (op.isReg() && op.isUse() && DefR == op.getReg())
+          UseMOnum = j;
+      }
+
+      // Change UseMI if replacement is possible. If the replacement failed,
+      // was not attempted, or the operand could not be located, the use still
+      // reads DefR and the tfr must be kept.
+      bool Xformed = false;
+      if (UseMOnum >= 0 && InstrEvalResult[UseMI])
+        Xformed = xformUseMI(MI, UseMI, UseN, UseMOnum);
+      Changed |= Xformed;
+      KeepTfr |= !Xformed;
+    }
+    if (!KeepTfr)
+      Deleted.insert(MI);
+  }
+  return Changed;
+}
+
+// If instruction IA references any register that has an entry in RDefMap,
+// record for every register in RDefMap the definition currently on top of
+// its stack in DefM, keyed by IA's node id. Registers with no stack, or an
+// empty one, are skipped.
+void HexagonOptAddrMode::updateMap(NodeAddr<InstrNode *> IA) {
+  RegisterSet InstrRefs;
+  for (NodeAddr<RefNode *> Ref : IA.Addr->members(*DFG))
+    InstrRefs.insert(Ref.Addr->getRegRef(*DFG));
+
+  // Stop scanning as soon as one tracked register is found among the refs.
+  bool TouchesTracked = false;
+  for (auto It = RDefMap.begin(), End = RDefMap.end();
+       It != End && !TouchesTracked; ++It)
+    TouchesTracked = InstrRefs.count(It->first) != 0;
+  if (!TouchesTracked)
+    return;
+
+  for (auto &Entry : RDefMap) {
+    auto Stack = DefM.find(Entry.first.Reg);
+    if (Stack != DefM.end() && !Stack->second.empty())
+      Entry.second[IA.Id] = Stack->second.top()->Id;
+  }
+}
+
+// Walk the dominator tree rooted at block B. For each instruction node of a
+// block, updateMap() is called before the instruction's definitions are
+// pushed onto the DefM stacks. The block is marked in DefM on entry and
+// released again after all dominated blocks have been visited.
+// Note: Changed is only ever OR'ed with the results of the recursive calls,
+// so as written this function always returns false.
+bool HexagonOptAddrMode::constructDefMap(MachineBasicBlock *B) {
+ bool Changed = false;
+ auto BA = DFG->getFunc().Addr->findBlock(B, *DFG);
+ DFG->markBlock(BA.Id, DefM);
+
+ for (NodeAddr<InstrNode *> IA : BA.Addr->members(*DFG)) {
+ updateMap(IA);
+ DFG->pushDefs(IA, DefM);
+ }
+
+ // Recurse into the children of B in the dominator tree.
+ MachineDomTreeNode *N = MDT->getNode(B);
+ for (auto I : *N)
+ Changed |= constructDefMap(I->getBlock());
+
+ DFG->releaseBlock(BA.Id, DefM);
+ return Changed;
+}
+
+// Pass entry point. Builds the RDF data-flow graph and liveness information
+// for MF, processes every block, erases the instructions queued in Deleted
+// and, if anything changed, rebuilds the graph and recomputes live-ins and
+// kill flags. Returns true if any instruction was rewritten.
+bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &MRI = MF.getRegInfo();
+ HII = HST.getInstrInfo();
+ const auto &MDF = getAnalysis<MachineDominanceFrontier>();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ const auto &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetOperandInfo TOI(*HII);
+
+ // G and L live on this stack frame; DFG and LV point at them only for the
+ // duration of this call.
+ DataFlowGraph G(MF, *HII, TRI, *MDT, MDF, TOI);
+ G.build();
+ DFG = &G;
+
+ Liveness L(MRI, *DFG);
+ L.computePhiInfo();
+ LV = &L;
+
+ // Start the dominator-tree walk at the function's first block.
+ // NOTE(review): DefM and RDefMap are class members and are not cleared
+ // here, so they carry over between functions - confirm this is intended.
+ constructDefMap(&DFG->getMF().front());
+
+ Deleted.clear();
+ NodeAddr<FuncNode *> FA = DFG->getFunc();
+ DEBUG(dbgs() << "==== [RefMap#]=====:\n "
+ << Print<NodeAddr<FuncNode *>>(FA, *DFG) << "\n");
+
+ for (NodeAddr<BlockNode *> BA : FA.Addr->members(*DFG))
+ Changed |= processBlock(BA);
+
+ // Erasure is deferred until all blocks have been processed.
+ for (auto MI : Deleted)
+ MI->eraseFromParent();
+
+ if (Changed) {
+ G.build();
+ L.computeLiveIns();
+ L.resetLiveIns();
+ L.resetKills();
+ }
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+// Factory: returns a newly allocated HexagonOptAddrMode pass.
+FunctionPass *llvm::createHexagonOptAddrMode() {
+ return new HexagonOptAddrMode();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
new file mode 100644
index 000000000000..101de3d8fbee
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
@@ -0,0 +1,148 @@
+//===- HexagonOptimizeSZextends.cpp - Remove unnecessary argument extends -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pass that removes sign extends for function parameters. These parameters
+// are already sign extended by the caller per Hexagon's ABI
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+
+#include "Hexagon.h"
+
+using namespace llvm;
+
+// Forward declarations of the pass factory (defined at the end of this file)
+// and of the pass-registry initializer used by the pass constructor.
+namespace llvm {
+ FunctionPass *createHexagonOptimizeSZextends();
+ void initializeHexagonOptimizeSZextendsPass(PassRegistry&);
+}
+
+namespace {
+ // IR-level function pass; see runOnFunction for the two rewrites it does.
+ struct HexagonOptimizeSZextends : public FunctionPass {
+ public:
+ static char ID;
+ // Registers the pass with the global PassRegistry on construction.
+ HexagonOptimizeSZextends() : FunctionPass(ID) {
+ initializeHexagonOptimizeSZextendsPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "Remove sign extends"; }
+
+ // Declares StackProtector as preserved, then defers to the base class.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<StackProtector>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ // Returns true for intrinsics whose result is treated as already sign
+ // extended.
+ bool intrinsicAlreadySextended(Intrinsic::ID IntID);
+ };
+}
+
+// Storage for the pass identifier passed to the FunctionPass base.
+char HexagonOptimizeSZextends::ID = 0;
+
+// Register the pass under the argument name "reargs".
+INITIALIZE_PASS(HexagonOptimizeSZextends, "reargs",
+ "Remove Sign and Zero Extends for Args", false, false)
+
+// Returns true if IntID is an intrinsic whose result is treated as already
+// sign extended. Only hexagon_A2_addh_l16_sat_ll is recognized.
+bool HexagonOptimizeSZextends::intrinsicAlreadySextended(Intrinsic::ID IntID) {
+  return IntID == llvm::Intrinsic::hexagon_A2_addh_l16_sat_ll;
+}
+
+// Pass entry point. Performs two independent rewrites on F:
+// 1. for arguments carrying the SExt attribute, hoists matching sign
+// extensions to the start of the entry block;
+// 2. replaces "ashr (shl X, 16), 16" by X when X is the result of an
+// intrinsic accepted by intrinsicAlreadySextended.
+// Returns false if the function is skipped; otherwise it always returns true,
+// whether or not anything was modified.
+bool HexagonOptimizeSZextends::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ // Attribute index of the current argument; starts at 1 and is incremented
+ // once per argument.
+ unsigned Idx = 1;
+ // Try to optimize sign extends in formal parameters. It's relying on
+ // callee already sign extending the values. I'm not sure if our ABI
+ // requires callee to sign extend though.
+ // NOTE(review): the file header says the parameters are sign extended by
+ // the *caller*; the wording above says "callee" - confirm which is meant.
+ for (auto &Arg : F.args()) {
+ if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
+ if (!isa<PointerType>(Arg.getType())) {
+ // UI is advanced before the matched instruction is replaced and erased.
+ for (auto UI = Arg.use_begin(); UI != Arg.use_end();) {
+ if (isa<SExtInst>(*UI)) {
+ Instruction* Use = cast<Instruction>(*UI);
+ SExtInst* SI = new SExtInst(&Arg, Use->getType());
+ assert (EVT::getEVT(SI->getType()) ==
+ (EVT::getEVT(Use->getType())));
+ ++UI;
+ Use->replaceAllUsesWith(SI);
+ // The new sext is placed before the first instruction of the entry
+ // block.
+ Instruction* First = &F.getEntryBlock().front();
+ SI->insertBefore(First);
+ Use->eraseFromParent();
+ } else {
+ ++UI;
+ }
+ }
+ }
+ }
+ ++Idx;
+ }
+
+ // Try to remove redundant sext operations on Hexagon. The hardware
+ // already sign extends many 16 bit intrinsic operations to 32 bits.
+ // For example:
+ // %34 = tail call i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32 %x, i32 %y)
+ // %sext233 = shl i32 %34, 16
+ // %conv52 = ashr exact i32 %sext233, 16
+ for (auto &B : F) {
+ for (auto &I : B) {
+ // Look for arithmetic shift right by 16.
+ BinaryOperator *Ashr = dyn_cast<BinaryOperator>(&I);
+ if (!(Ashr && Ashr->getOpcode() == Instruction::AShr))
+ continue;
+ Value *AshrOp1 = Ashr->getOperand(1);
+ ConstantInt *C = dyn_cast<ConstantInt>(AshrOp1);
+ // Right shifted by 16.
+ if (!(C && C->getSExtValue() == 16))
+ continue;
+
+ // The first operand of Ashr comes from logical shift left.
+ Instruction *Shl = dyn_cast<Instruction>(Ashr->getOperand(0));
+ if (!(Shl && Shl->getOpcode() == Instruction::Shl))
+ continue;
+ Value *Intr = Shl->getOperand(0);
+ Value *ShlOp1 = Shl->getOperand(1);
+ C = dyn_cast<ConstantInt>(ShlOp1);
+ // Left shifted by 16.
+ if (!(C && C->getSExtValue() == 16))
+ continue;
+
+ // The first operand of Shl comes from an intrinsic.
+ // Note: this I shadows the loop variable I of the enclosing for loop.
+ if (IntrinsicInst *I = dyn_cast<IntrinsicInst>(Intr)) {
+ if (!intrinsicAlreadySextended(I->getIntrinsicID()))
+ continue;
+ // All is well. Replace all uses of AShr with I.
+ // NOTE(review): replaceUsesOfWith is called on users of Ashr while
+ // Ashr's user list is being iterated - verify the iterator stays
+ // valid.
+ for (auto UI = Ashr->user_begin(), UE = Ashr->user_end();
+ UI != UE; ++UI) {
+ const Use &TheUse = UI.getUse();
+ if (Instruction *J = dyn_cast<Instruction>(TheUse.getUser())) {
+ J->replaceUsesOfWith(Ashr, I);
+ }
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+
+// Factory: returns a newly allocated HexagonOptimizeSZextends pass.
+FunctionPass *llvm::createHexagonOptimizeSZextends() {
+ return new HexagonOptimizeSZextends();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td b/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
new file mode 100644
index 000000000000..ad81287007e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -0,0 +1,3347 @@
+// Pattern leaves that bundle a value type with a register class so that
+// patterns can name both through a single operand (e.g. I32:$Rs).
+// Each leaf below must use a named register operand ($R); without a name
+// i32 would be assumed regardless of the register class. The name itself
+// is irrelevant.
+def I1 : PatLeaf<(i1 PredRegs:$R)>; // i1 value in PredRegs
+def I32 : PatLeaf<(i32 IntRegs:$R)>; // i32 value in IntRegs
+def I64 : PatLeaf<(i64 DoubleRegs:$R)>; // i64 value in DoubleRegs
+def F32 : PatLeaf<(f32 IntRegs:$R)>; // f32 value in IntRegs
+def F64 : PatLeaf<(f64 DoubleRegs:$R)>; // f64 value in DoubleRegs
+
+// Output-pattern fragments that pull the low (isub_lo) or high (isub_hi)
+// subregister out of an i64 value via EXTRACT_SUBREG.
+def LoReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_lo)>;
+def HiReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_hi)>;
+
+def IsOrAdd: PatFrag<(ops node:$Addr, node:$off),
+ (or node:$Addr, node:$off), [{ return isOrEquivalentToAdd(N); }]>; // matches an "or" only when isOrEquivalentToAdd(N) returns true
+
+def IsPow2_32 : PatLeaf<(i32 imm), [{
+ uint32_t V = N->getZExtValue(); // immediate narrowed to 32 unsigned bits
+ return isPowerOf2_32(V); // accept only if isPowerOf2_32 holds
+}]>;
+
+def IsPow2_64 : PatLeaf<(i64 imm), [{
+ uint64_t V = N->getZExtValue();
+ return isPowerOf2_64(V); // 64-bit counterpart of IsPow2_32
+}]>;
+
+def IsNPow2_32 : PatLeaf<(i32 imm), [{
+ uint32_t NV = ~N->getZExtValue(); // complement of the immediate, narrowed to 32 bits
+ return isPowerOf2_32(NV); // accept if the complement is a power of two
+}]>;
+
+def IsPow2_64L : PatLeaf<(i64 imm), [{
+ uint64_t V = N->getZExtValue();
+ return isPowerOf2_64(V) && Log2_64(V) < 32; // power of two whose log2 is below 32
+}]>;
+
+def IsPow2_64H : PatLeaf<(i64 imm), [{
+ uint64_t V = N->getZExtValue();
+ return isPowerOf2_64(V) && Log2_64(V) >= 32; // power of two whose log2 is 32 or more
+}]>;
+
+def IsNPow2_64L : PatLeaf<(i64 imm), [{
+ uint64_t NV = ~N->getZExtValue();
+ return isPowerOf2_64(NV) && Log2_64(NV) < 32; // complement is a power of two, log2 below 32
+}]>;
+
+def IsNPow2_64H : PatLeaf<(i64 imm), [{
+ uint64_t NV = ~N->getZExtValue();
+ return isPowerOf2_64(NV) && Log2_64(NV) >= 32; // complement is a power of two, log2 of 32 or more
+}]>;
+
+def SDEC1 : SDNodeXForm<imm, [{
+ int32_t V = N->getSExtValue(); // signed view of the immediate
+ return CurDAG->getTargetConstant(V-1, SDLoc(N), MVT::i32); // V-1 as an i32 target constant
+}]>;
+
+def UDEC1 : SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue(); // unsigned view of the immediate
+ assert(V >= 1); // V-1 must not wrap below zero
+ return CurDAG->getTargetConstant(V-1, SDLoc(N), MVT::i32);
+}]>;
+
+def UDEC32 : SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue();
+ assert(V >= 32); // V-32 must not wrap below zero
+ return CurDAG->getTargetConstant(V-32, SDLoc(N), MVT::i32);
+}]>;
+
+def Log2_32 : SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue();
+ return CurDAG->getTargetConstant(Log2_32(V), SDLoc(N), MVT::i32); // Log2_32 of the immediate
+}]>;
+
+def Log2_64 : SDNodeXForm<imm, [{
+ uint64_t V = N->getZExtValue();
+ return CurDAG->getTargetConstant(Log2_64(V), SDLoc(N), MVT::i32); // Log2_64 of the immediate, still emitted as i32
+}]>;
+
+def LogN2_32 : SDNodeXForm<imm, [{
+ uint32_t NV = ~N->getZExtValue(); // complement of the immediate, narrowed to 32 bits
+ return CurDAG->getTargetConstant(Log2_32(NV), SDLoc(N), MVT::i32); // Log2_32 of the complement
+}]>;
+
+def LogN2_64 : SDNodeXForm<imm, [{
+ uint64_t NV = ~N->getZExtValue(); // complement of the 64-bit immediate
+ return CurDAG->getTargetConstant(Log2_64(NV), SDLoc(N), MVT::i32); // Log2_64 of the complement
+}]>;
+
+
+class T_CMP_pat <InstHexagon MI, PatFrag OpNode, PatLeaf ImmPred> // compare of an I32 register with an immediate, i1 result
+ : Pat<(i1 (OpNode I32:$src1, ImmPred:$src2)),
+ (MI IntRegs:$src1, ImmPred:$src2)>;
+
+def : T_CMP_pat <C2_cmpeqi, seteq, s10_0ImmPred>; // seteq
+def : T_CMP_pat <C2_cmpgti, setgt, s10_0ImmPred>; // setgt
+def : T_CMP_pat <C2_cmpgtui, setugt, u9_0ImmPred>; // setugt, uses the u9_0 immediate predicate
+
+def SDTHexagonI64I32I32 : SDTypeProfile<1, 2, // one result, two operands
+ [SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>; // i64 result; first operand i32, second of the same type
+
+def HexagonCOMBINE : SDNode<"HexagonISD::COMBINE", SDTHexagonI64I32I32>; // node with the i64 <- (i32, i32) profile above
+def HexagonPACKHL : SDNode<"HexagonISD::PACKHL", SDTHexagonI64I32I32>; // same profile as HexagonCOMBINE
+
+// Pats for instruction selection: (Op Rs, Rt) on two I32 operands selects MI, with result type ResT.
+class BinOp32_pat<SDNode Op, InstHexagon MI, ValueType ResT>
+ : Pat<(ResT (Op I32:$Rs, I32:$Rt)),
+ (ResT (MI IntRegs:$Rs, IntRegs:$Rt))>;
+
+def: BinOp32_pat<add, A2_add, i32>;
+def: BinOp32_pat<and, A2_and, i32>;
+def: BinOp32_pat<or, A2_or, i32>;
+def: BinOp32_pat<sub, A2_sub, i32>;
+def: BinOp32_pat<xor, A2_xor, i32>;
+
+def: BinOp32_pat<HexagonCOMBINE, A2_combinew, i64>; // two i32 inputs, i64 result
+def: BinOp32_pat<HexagonPACKHL, S2_packhl, i64>; // two i32 inputs, i64 result
+
+// PatFrag wrapper that reuses the fragment of a comparison PatFrag
+// (e.g. setlt) but declares its operands in the reverse order.
+class RevCmp<PatFrag F> : PatFrag<(ops node:$rhs, node:$lhs), F.Fragment>;
+
+// Pats for compares. They use PatFrags as operands, not SDNodes,
+// since seteq/setgt/etc. are defined as PatFrags.
+class T_cmp32_rr_pat<InstHexagon MI, PatFrag Op, ValueType VT>
+ : Pat<(VT (Op I32:$Rs, I32:$Rt)),
+ (MI IntRegs:$Rs, IntRegs:$Rt)>;
+
+def: T_cmp32_rr_pat<C2_cmpeq, seteq, i1>;
+def: T_cmp32_rr_pat<C2_cmpgt, setgt, i1>;
+def: T_cmp32_rr_pat<C2_cmpgtu, setugt, i1>;
+
+def: T_cmp32_rr_pat<C2_cmpgt, RevCmp<setlt>, i1>; // setlt with reversed operands reuses cmpgt
+def: T_cmp32_rr_pat<C2_cmpgtu, RevCmp<setult>, i1>; // setult with reversed operands reuses cmpgtu
+
+def: Pat<(select I1:$Pu, I32:$Rs, I32:$Rt),
+ (C2_mux PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt)>;
+
+def: Pat<(add I32:$Rs, s32_0ImmPred:$s16),
+ (A2_addi I32:$Rs, imm:$s16)>;
+
+def: Pat<(or I32:$Rs, s32_0ImmPred:$s10),
+ (A2_orir IntRegs:$Rs, imm:$s10)>;
+def: Pat<(and I32:$Rs, s32_0ImmPred:$s10),
+ (A2_andir IntRegs:$Rs, imm:$s10)>;
+
+def: Pat<(sub s32_0ImmPred:$s10, IntRegs:$Rs),
+ (A2_subri imm:$s10, IntRegs:$Rs)>;
+
+// Rd = not(Rs) gets mapped to Rd=sub(#-1, Rs).
+def: Pat<(not I32:$src1),
+ (A2_subri -1, IntRegs:$src1)>;
+
+def: Pat<(s32_0ImmPred:$s16), (A2_tfrsi imm:$s16)>;
+def: Pat<(s8_0Imm64Pred:$s8), (A2_tfrpi imm:$s8)>;
+
+def : Pat<(select I1:$Pu, s32_0ImmPred:$s8, I32:$Rs),
+ (C2_muxri I1:$Pu, imm:$s8, I32:$Rs)>;
+
+def : Pat<(select I1:$Pu, I32:$Rs, s32_0ImmPred:$s8),
+ (C2_muxir I1:$Pu, I32:$Rs, imm:$s8)>;
+
+def : Pat<(select I1:$Pu, s32_0ImmPred:$s8, s8_0ImmPred:$S8),
+ (C2_muxii I1:$Pu, imm:$s8, imm:$S8)>;
+
+def: Pat<(shl I32:$src1, (i32 16)), (A2_aslh I32:$src1)>;
+def: Pat<(sra I32:$src1, (i32 16)), (A2_asrh I32:$src1)>;
+def: Pat<(sext_inreg I32:$src1, i8), (A2_sxtb I32:$src1)>;
+def: Pat<(sext_inreg I32:$src1, i16), (A2_sxth I32:$src1)>;
+
+class T_vcmp_pat<InstHexagon MI, PatFrag Op, ValueType T> // compare of two DoubleRegs values of type T, i1 result
+ : Pat<(i1 (Op (T DoubleRegs:$Rss), (T DoubleRegs:$Rtt))),
+ (i1 (MI DoubleRegs:$Rss, DoubleRegs:$Rtt))>;
+
+def: T_vcmp_pat<A2_vcmpbeq, seteq, v8i8>; // v8i8 operands
+def: T_vcmp_pat<A2_vcmpbgtu, setugt, v8i8>;
+def: T_vcmp_pat<A2_vcmpheq, seteq, v4i16>; // v4i16 operands
+def: T_vcmp_pat<A2_vcmphgt, setgt, v4i16>;
+def: T_vcmp_pat<A2_vcmphgtu, setugt, v4i16>;
+def: T_vcmp_pat<A2_vcmpweq, seteq, v2i32>; // v2i32 operands
+def: T_vcmp_pat<A2_vcmpwgt, setgt, v2i32>;
+def: T_vcmp_pat<A2_vcmpwgtu, setugt, v2i32>;
+
+// Add halfword.
+def: Pat<(sext_inreg (add I32:$src1, I32:$src2), i16),
+ (A2_addh_l16_ll I32:$src1, I32:$src2)>;
+
+def: Pat<(sra (add (shl I32:$src1, (i32 16)), I32:$src2), (i32 16)),
+ (A2_addh_l16_hl I32:$src1, I32:$src2)>;
+
+def: Pat<(shl (add I32:$src1, I32:$src2), (i32 16)),
+ (A2_addh_h16_ll I32:$src1, I32:$src2)>;
+
+// Subtract halfword.
+def: Pat<(sext_inreg (sub I32:$src1, I32:$src2), i16),
+ (A2_subh_l16_ll I32:$src1, I32:$src2)>;
+
+def: Pat<(shl (sub I32:$src1, I32:$src2), (i32 16)),
+ (A2_subh_h16_ll I32:$src1, I32:$src2)>;
+
+// Select-of-compare patterns that become a min or a max instruction,
+// depending on which operand the select picks.
+// Example with Op = '>':
+// (a>b)?a:b --> max(a,b): the select keeps the larger value, so the
+// instruction to emit is the one passed in 'Inst'.
+// (a>b)?b:a --> min(a,b): the select keeps the smaller value, so the
+// instruction to emit is the one passed in 'SwapInst'.
+
+multiclass T_MinMax_pats <PatFrag Op, PatLeaf Val,
+ InstHexagon Inst, InstHexagon SwapInst> {
+ def: Pat<(select (i1 (Op Val:$src1, Val:$src2)), Val:$src1, Val:$src2), // operands selected in compare order
+ (Inst Val:$src1, Val:$src2)>;
+ def: Pat<(select (i1 (Op Val:$src1, Val:$src2)), Val:$src2, Val:$src1), // operands selected in swapped order
+ (SwapInst Val:$src1, Val:$src2)>;
+}
+
+def IsPosHalf : PatLeaf<(i32 IntRegs:$a), [{
+ return isPositiveHalfWord(N); // accept only nodes for which isPositiveHalfWord(N) holds
+}]>;
+
+multiclass MinMax_pats <PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> {
+ defm: T_MinMax_pats<Op, I32, Inst, SwapInst>; // plain I32 min/max selects
+
+ def: Pat<(sext_inreg (select (i1 (Op IsPosHalf:$src1, IsPosHalf:$src2)), // i16 sext_inreg around the select:
+ IsPosHalf:$src1, IsPosHalf:$src2),
+ i16),
+ (Inst IntRegs:$src1, IntRegs:$src2)>; // emitted without an extension when both operands satisfy IsPosHalf
+
+ def: Pat<(sext_inreg (select (i1 (Op IsPosHalf:$src1, IsPosHalf:$src2)), // same, with the selected operands swapped
+ IsPosHalf:$src2, IsPosHalf:$src1),
+ i16),
+ (SwapInst IntRegs:$src1, IntRegs:$src2)>;
+}
+
+let AddedComplexity = 200 in { // all min/max patterns below carry AddedComplexity 200
+ defm: MinMax_pats<setge, A2_max, A2_min>;
+ defm: MinMax_pats<setgt, A2_max, A2_min>;
+ defm: MinMax_pats<setle, A2_min, A2_max>;
+ defm: MinMax_pats<setlt, A2_min, A2_max>;
+ defm: MinMax_pats<setuge, A2_maxu, A2_minu>;
+ defm: MinMax_pats<setugt, A2_maxu, A2_minu>;
+ defm: MinMax_pats<setule, A2_minu, A2_maxu>;
+ defm: MinMax_pats<setult, A2_minu, A2_maxu>;
+}
+
+class T_cmp64_rr_pat<InstHexagon MI, PatFrag CmpOp> // compare of two I64 values in DoubleRegs, i1 result
+ : Pat<(i1 (CmpOp I64:$Rs, I64:$Rt)),
+ (i1 (MI DoubleRegs:$Rs, DoubleRegs:$Rt))>;
+
+def: T_cmp64_rr_pat<C2_cmpeqp, seteq>;
+def: T_cmp64_rr_pat<C2_cmpgtp, setgt>;
+def: T_cmp64_rr_pat<C2_cmpgtup, setugt>;
+def: T_cmp64_rr_pat<C2_cmpgtp, RevCmp<setlt>>; // setlt with reversed operands reuses cmpgtp
+def: T_cmp64_rr_pat<C2_cmpgtup, RevCmp<setult>>; // setult with reversed operands reuses cmpgtup
+
+def: Pat<(i64 (add I64:$Rs, I64:$Rt)), (A2_addp I64:$Rs, I64:$Rt)>;
+def: Pat<(i64 (sub I64:$Rs, I64:$Rt)), (A2_subp I64:$Rs, I64:$Rt)>;
+
+def: Pat<(i64 (and I64:$Rs, I64:$Rt)), (A2_andp I64:$Rs, I64:$Rt)>;
+def: Pat<(i64 (or I64:$Rs, I64:$Rt)), (A2_orp I64:$Rs, I64:$Rt)>;
+def: Pat<(i64 (xor I64:$Rs, I64:$Rt)), (A2_xorp I64:$Rs, I64:$Rt)>;
+
+def: Pat<(i1 (not I1:$Ps)), (C2_not PredRegs:$Ps)>;
+
+def: Pat<(i1 (and I1:$Ps, I1:$Pt)), (C2_and I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (or I1:$Ps, I1:$Pt)), (C2_or I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (xor I1:$Ps, I1:$Pt)), (C2_xor I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (and I1:$Ps, (not I1:$Pt))), (C2_andn I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (or I1:$Ps, (not I1:$Pt))), (C2_orn I1:$Ps, I1:$Pt)>;
+
+def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone, // no typed operands (SDTNone)
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; // chained, optional input glue, variadic
+def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>; // chained only
+
+def: Pat<(br bb:$dst), (J2_jump brtarget:$dst)>;
+def: Pat<(brcond I1:$src1, bb:$block), (J2_jumpt PredRegs:$src1, bb:$block)>;
+def: Pat<(brind I32:$dst), (J2_jumpr IntRegs:$dst)>;
+
+def: Pat<(retflag), (PS_jmpret (i32 R31))>;
+def: Pat<(eh_return), (EH_RETURN_JMPR (i32 R31))>;
+
+// Patterns to select load-indexed (i.e. load from base+offset) for each supported address form.
+multiclass Loadx_pat<PatFrag Load, ValueType VT, PatLeaf ImmPred,
+ InstHexagon MI> {
+ def: Pat<(VT (Load AddrFI:$fi)), (VT (MI AddrFI:$fi, 0))>; // frame index, offset 0
+ def: Pat<(VT (Load (add (i32 AddrFI:$fi), ImmPred:$Off))), // frame index + immediate
+ (VT (MI AddrFI:$fi, imm:$Off))>;
+ def: Pat<(VT (Load (IsOrAdd (i32 AddrFI:$fi), ImmPred:$Off))), // frame index "or" immediate, accepted by IsOrAdd
+ (VT (MI AddrFI:$fi, imm:$Off))>;
+ def: Pat<(VT (Load (add I32:$Rs, ImmPred:$Off))), // register + immediate
+ (VT (MI IntRegs:$Rs, imm:$Off))>;
+ def: Pat<(VT (Load I32:$Rs)), (VT (MI IntRegs:$Rs, 0))>; // plain register, offset 0
+}
+
+let AddedComplexity = 20 in {
+ defm: Loadx_pat<load, i32, s30_2ImmPred, L2_loadri_io>;
+ defm: Loadx_pat<load, i64, s29_3ImmPred, L2_loadrd_io>;
+ defm: Loadx_pat<atomic_load_8 , i32, s32_0ImmPred, L2_loadrub_io>; // atomic loads use the same instructions as the plain loads
+ defm: Loadx_pat<atomic_load_16, i32, s31_1ImmPred, L2_loadruh_io>;
+ defm: Loadx_pat<atomic_load_32, i32, s30_2ImmPred, L2_loadri_io>;
+ defm: Loadx_pat<atomic_load_64, i64, s29_3ImmPred, L2_loadrd_io>;
+
+ defm: Loadx_pat<extloadi1, i32, s32_0ImmPred, L2_loadrub_io>; // extload* select the same instructions as zextload*
+ defm: Loadx_pat<extloadi8, i32, s32_0ImmPred, L2_loadrub_io>;
+ defm: Loadx_pat<extloadi16, i32, s31_1ImmPred, L2_loadruh_io>;
+ defm: Loadx_pat<sextloadi8, i32, s32_0ImmPred, L2_loadrb_io>;
+ defm: Loadx_pat<sextloadi16, i32, s31_1ImmPred, L2_loadrh_io>;
+ defm: Loadx_pat<zextloadi1, i32, s32_0ImmPred, L2_loadrub_io>;
+ defm: Loadx_pat<zextloadi8, i32, s32_0ImmPred, L2_loadrub_io>;
+ defm: Loadx_pat<zextloadi16, i32, s31_1ImmPred, L2_loadruh_io>;
+ // No sextloadi1.
+}
+
+// Sign-extending loads of i1 need to replicate the lowest bit throughout
+// the 32-bit value. Since the loaded value can only be 0 or 1, 0-v should
+// do the trick.
+let AddedComplexity = 20 in
+def: Pat<(i32 (sextloadi1 I32:$Rs)),
+ (A2_subri 0, (L2_loadrub_io IntRegs:$Rs, 0))>;
+
+def: Pat<(i32 (mul I32:$src1, I32:$src2)), (M2_mpyi I32:$src1, I32:$src2)>;
+def: Pat<(i32 (mulhs I32:$src1, I32:$src2)), (M2_mpy_up I32:$src1, I32:$src2)>;
+def: Pat<(i32 (mulhu I32:$src1, I32:$src2)), (M2_mpyu_up I32:$src1, I32:$src2)>;
+
+def: Pat<(mul IntRegs:$Rs, u32_0ImmPred:$u8),
+ (M2_mpysip IntRegs:$Rs, imm:$u8)>;
+def: Pat<(ineg (mul IntRegs:$Rs, u8_0ImmPred:$u8)),
+ (M2_mpysin IntRegs:$Rs, imm:$u8)>;
+def: Pat<(mul IntRegs:$src1, s32_0ImmPred:$src2),
+ (M2_mpysmi IntRegs:$src1, imm:$src2)>;
+def: Pat<(add (mul IntRegs:$src2, u32_0ImmPred:$src3), IntRegs:$src1),
+ (M2_macsip IntRegs:$src1, IntRegs:$src2, imm:$src3)>;
+def: Pat<(add (mul I32:$src2, I32:$src3), I32:$src1),
+ (M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(add (add IntRegs:$src2, u32_0ImmPred:$src3), IntRegs:$src1),
+ (M2_accii IntRegs:$src1, IntRegs:$src2, imm:$src3)>;
+def: Pat<(add (add I32:$src2, I32:$src3), I32:$src1),
+ (M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+class T_MType_acc_pat1 <InstHexagon MI, SDNode firstOp, SDNode secOp, // src1 secOp (src2 firstOp #imm)
+ PatLeaf ImmPred>
+ : Pat <(secOp IntRegs:$src1, (firstOp IntRegs:$src2, ImmPred:$src3)),
+ (MI IntRegs:$src1, IntRegs:$src2, ImmPred:$src3)>;
+
+class T_MType_acc_pat2 <InstHexagon MI, SDNode firstOp, SDNode secOp> // src1 secOp (src2 firstOp src3), all registers
+ : Pat <(i32 (secOp IntRegs:$src1, (firstOp IntRegs:$src2, IntRegs:$src3))),
+ (MI IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def : T_MType_acc_pat2 <M2_xor_xacc, xor, xor>;
+def : T_MType_acc_pat1 <M2_macsin, mul, sub, u32_0ImmPred>; // src1 - (src2 * #imm)
+
+def : T_MType_acc_pat1 <M2_naccii, add, sub, s32_0ImmPred>; // src1 - (src2 + #imm)
+def : T_MType_acc_pat2 <M2_nacci, add, sub>; // src1 - (src2 + src3)
+
+def: T_MType_acc_pat2 <M4_or_xor, xor, or>; // template arguments are <MI, inner op, outer op>
+def: T_MType_acc_pat2 <M4_and_xor, xor, and>;
+def: T_MType_acc_pat2 <M4_or_and, and, or>;
+def: T_MType_acc_pat2 <M4_and_and, and, and>;
+def: T_MType_acc_pat2 <M4_xor_and, and, xor>;
+def: T_MType_acc_pat2 <M4_or_or, or, or>;
+def: T_MType_acc_pat2 <M4_and_or, or, and>;
+def: T_MType_acc_pat2 <M4_xor_or, or, xor>;
+
+class T_MType_acc_pat3 <InstHexagon MI, SDNode firstOp, SDNode secOp> // src1 secOp (src2 firstOp ~src3)
+ : Pat <(secOp I32:$src1, (firstOp I32:$src2, (not I32:$src3))),
+ (MI IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; // the "not" is absorbed: MI receives src3 itself
+
+def: T_MType_acc_pat3 <M4_or_andn, and, or>;
+def: T_MType_acc_pat3 <M4_and_andn, and, and>;
+def: T_MType_acc_pat3 <M4_xor_andn, and, xor>;
+
+def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>; // any-extend to i64
+def Sext64: PatFrag<(ops node:$Rs), (i64 (sext node:$Rs))>; // sign-extend to i64
+def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>; // zero-extend to i64
+
+// Return true if the node is a sign-extending load from 32 to 64 bits.
+def Sext64Ld : PatLeaf<(i64 DoubleRegs:$src1), [{
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+ if (!LD) // not a load node at all
+ return false;
+ return LD->getExtensionType() == ISD::SEXTLOAD && // must be a sign-extending load ...
+ LD->getMemoryVT().getScalarType() == MVT::i32; // ... whose in-memory scalar type is i32
+}]>;
+
+def: Pat<(mul (Aext64 I32:$src1), (Aext64 I32:$src2)),
+ (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>;
+
+def: Pat<(mul (Sext64 I32:$src1), (Sext64 I32:$src2)),
+ (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>;
+
+def: Pat<(mul Sext64Ld:$src1, Sext64Ld:$src2),
+ (M2_dpmpyss_s0 (LoReg DoubleRegs:$src1), (LoReg DoubleRegs:$src2))>;
+
+// Multiply and accumulate, use full result.
+// Rxx[+-]=mpy(Rs,Rt)
+
+def: Pat<(add I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))),
+ (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(sub I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))),
+ (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(add I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))),
+ (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(add I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))),
+ (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(sub I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))),
+ (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(sub I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))),
+ (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+class Storepi_pat<PatFrag Store, PatFrag Value, PatFrag Offset, // store taking value, I32 base and immediate offset
+ InstHexagon MI>
+ : Pat<(Store Value:$src1, I32:$src2, Offset:$offset),
+ (MI I32:$src2, imm:$offset, Value:$src1)>; // MI operand order: base, offset, value
+
+def: Storepi_pat<post_truncsti8, I32, s4_0ImmPred, S2_storerb_pi>;
+def: Storepi_pat<post_truncsti16, I32, s4_1ImmPred, S2_storerh_pi>;
+def: Storepi_pat<post_store, I32, s4_2ImmPred, S2_storeri_pi>;
+def: Storepi_pat<post_store, I64, s4_3ImmPred, S2_storerd_pi>;
+
+// Patterns for generating stores, where the address takes different forms:
+// - frameindex,
+// - frameindex + offset,
+// - base + offset,
+// - simple (base address without offset).
+// These would usually be used together (via Storex_pat defined below), but
+// in some cases one may want to apply different properties (such as
+// AddedComplexity) to the individual patterns.
+class Storex_fi_pat<PatFrag Store, PatFrag Value, InstHexagon MI> // address is a bare frame index, offset 0
+ : Pat<(Store Value:$Rs, AddrFI:$fi), (MI AddrFI:$fi, 0, Value:$Rs)>;
+multiclass Storex_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred, // address is frame index plus immediate
+ InstHexagon MI> {
+ def: Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)),
+ (MI AddrFI:$fi, imm:$Off, Value:$Rs)>;
+ def: Pat<(Store Value:$Rs, (IsOrAdd (i32 AddrFI:$fi), ImmPred:$Off)), // same, with the sum written as an "or"
+ (MI AddrFI:$fi, imm:$Off, Value:$Rs)>;
+}
+multiclass Storex_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred, // address is base register plus immediate
+ InstHexagon MI> {
+ def: Pat<(Store Value:$Rt, (add I32:$Rs, ImmPred:$Off)),
+ (MI IntRegs:$Rs, imm:$Off, Value:$Rt)>;
+ def: Pat<(Store Value:$Rt, (IsOrAdd I32:$Rs, ImmPred:$Off)), // same, with the sum written as an "or"
+ (MI IntRegs:$Rs, imm:$Off, Value:$Rt)>;
+}
+class Storex_simple_pat<PatFrag Store, PatFrag Value, InstHexagon MI> // address is a bare base register, offset 0
+ : Pat<(Store Value:$Rt, I32:$Rs),
+ (MI IntRegs:$Rs, 0, Value:$Rt)>;
+
+// Patterns for generating stores, where the address takes different forms,
+// and where the value being stored is first passed through the output
+// fragment ValueMod. The address forms are the same as for Storex_* above.
+class Storexm_fi_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod, // bare frame index, offset 0
+ InstHexagon MI>
+ : Pat<(Store Value:$Rs, AddrFI:$fi),
+ (MI AddrFI:$fi, 0, (ValueMod Value:$Rs))>;
+multiclass Storexm_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred, // frame index plus immediate
+ PatFrag ValueMod, InstHexagon MI> {
+ def: Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)),
+ (MI AddrFI:$fi, imm:$Off, (ValueMod Value:$Rs))>;
+ def: Pat<(Store Value:$Rs, (IsOrAdd (i32 AddrFI:$fi), ImmPred:$Off)), // same, with the sum written as an "or"
+ (MI AddrFI:$fi, imm:$Off, (ValueMod Value:$Rs))>;
+}
+multiclass Storexm_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred, // base register plus immediate
+ PatFrag ValueMod, InstHexagon MI> {
+ def: Pat<(Store Value:$Rt, (add I32:$Rs, ImmPred:$Off)),
+ (MI IntRegs:$Rs, imm:$Off, (ValueMod Value:$Rt))>;
+ def: Pat<(Store Value:$Rt, (IsOrAdd I32:$Rs, ImmPred:$Off)), // same, with the sum written as an "or"
+ (MI IntRegs:$Rs, imm:$Off, (ValueMod Value:$Rt))>;
+}
+class Storexm_simple_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod, // bare base register, offset 0
+ InstHexagon MI>
+ : Pat<(Store Value:$Rt, I32:$Rs),
+ (MI IntRegs:$Rs, 0, (ValueMod Value:$Rt))>;
+
+multiclass Storex_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred, // bundles the three non-simple address forms
+ InstHexagon MI> {
+ def: Storex_fi_pat <Store, Value, MI>;
+ defm: Storex_fi_add_pat <Store, Value, ImmPred, MI>;
+ defm: Storex_add_pat <Store, Value, ImmPred, MI>; // Storex_simple_pat is deliberately not part of the bundle
+}
+
+multiclass Storexm_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred, // same bundle, storing (ValueMod Value)
+ PatFrag ValueMod, InstHexagon MI> {
+ def: Storexm_fi_pat <Store, Value, ValueMod, MI>;
+ defm: Storexm_fi_add_pat <Store, Value, ImmPred, ValueMod, MI>;
+ defm: Storexm_add_pat <Store, Value, ImmPred, ValueMod, MI>; // Storexm_simple_pat is instantiated separately
+}
+
+// Regular stores in the DAG have two operands: value and address.
+// Atomic stores also have two, but in the opposite order: address, value.
+// To use atomic stores with the store patterns, their operands must be
+// swapped. This relies on the knowledge that F.Fragment uses the names
+// "ptr" and "val".
+class SwapSt<PatFrag F>
+ : PatFrag<(ops node:$val, node:$ptr), F.Fragment, F.PredicateCode, // operands redeclared as (val, ptr)
+ F.OperandTransform>; // fragment, predicate and transform are inherited from F
+
+let AddedComplexity = 20 in {
+ defm: Storex_pat<truncstorei8, I32, s32_0ImmPred, S2_storerb_io>;
+ defm: Storex_pat<truncstorei16, I32, s31_1ImmPred, S2_storerh_io>;
+ defm: Storex_pat<store, I32, s30_2ImmPred, S2_storeri_io>;
+ defm: Storex_pat<store, I64, s29_3ImmPred, S2_storerd_io>;
+
+ defm: Storex_pat<SwapSt<atomic_store_8>, I32, s32_0ImmPred, S2_storerb_io>;
+ defm: Storex_pat<SwapSt<atomic_store_16>, I32, s31_1ImmPred, S2_storerh_io>;
+ defm: Storex_pat<SwapSt<atomic_store_32>, I32, s30_2ImmPred, S2_storeri_io>;
+ defm: Storex_pat<SwapSt<atomic_store_64>, I64, s29_3ImmPred, S2_storerd_io>;
+}
+
+// Simple patterns should be tried with the least priority.
+def: Storex_simple_pat<truncstorei8, I32, S2_storerb_io>;
+def: Storex_simple_pat<truncstorei16, I32, S2_storerh_io>;
+def: Storex_simple_pat<store, I32, S2_storeri_io>;
+def: Storex_simple_pat<store, I64, S2_storerd_io>;
+
+def: Storex_simple_pat<SwapSt<atomic_store_8>, I32, S2_storerb_io>;
+def: Storex_simple_pat<SwapSt<atomic_store_16>, I32, S2_storerh_io>;
+def: Storex_simple_pat<SwapSt<atomic_store_32>, I32, S2_storeri_io>;
+def: Storex_simple_pat<SwapSt<atomic_store_64>, I64, S2_storerd_io>;
+
+let AddedComplexity = 20 in {
+ defm: Storexm_pat<truncstorei8, I64, s32_0ImmPred, LoReg, S2_storerb_io>;
+ defm: Storexm_pat<truncstorei16, I64, s31_1ImmPred, LoReg, S2_storerh_io>;
+ defm: Storexm_pat<truncstorei32, I64, s30_2ImmPred, LoReg, S2_storeri_io>;
+}
+
+def: Storexm_simple_pat<truncstorei8, I64, LoReg, S2_storerb_io>;
+def: Storexm_simple_pat<truncstorei16, I64, LoReg, S2_storerh_io>;
+def: Storexm_simple_pat<truncstorei32, I64, LoReg, S2_storeri_io>;
+
+def: Pat <(Sext64 I32:$src), (A2_sxtw I32:$src)>;
+
+def: Pat<(select (i1 (setlt I32:$src, 0)), (sub 0, I32:$src), I32:$src),
+ (A2_abs IntRegs:$src)>;
+
+let AddedComplexity = 50 in
+def: Pat<(xor (add (sra I32:$src, (i32 31)),
+ I32:$src),
+ (sra I32:$src, (i32 31))),
+ (A2_abs IntRegs:$src)>;
+
+def: Pat<(sra I32:$src, u5_0ImmPred:$u5),
+ (S2_asr_i_r IntRegs:$src, imm:$u5)>;
+def: Pat<(srl I32:$src, u5_0ImmPred:$u5),
+ (S2_lsr_i_r IntRegs:$src, imm:$u5)>;
+def: Pat<(shl I32:$src, u5_0ImmPred:$u5),
+ (S2_asl_i_r IntRegs:$src, imm:$u5)>;
+
+def: Pat<(sra (add (sra I32:$src1, u5_0ImmPred:$src2), 1), (i32 1)),
+ (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2)>;
+
+def : Pat<(not I64:$src1),
+ (A2_notp DoubleRegs:$src1)>;
+
+// Count leading zeros.
+def: Pat<(ctlz I32:$Rs), (S2_cl0 I32:$Rs)>;
+def: Pat<(i32 (trunc (ctlz I64:$Rss))), (S2_cl0p I64:$Rss)>;
+
+// Count trailing zeros: 32-bit.
+def: Pat<(cttz I32:$Rs), (S2_ct0 I32:$Rs)>;
+
+// Count leading ones.
+def: Pat<(ctlz (not I32:$Rs)), (S2_cl1 I32:$Rs)>;
+def: Pat<(i32 (trunc (ctlz (not I64:$Rss)))), (S2_cl1p I64:$Rss)>;
+
+// Count trailing ones: 32-bit.
+def: Pat<(cttz (not I32:$Rs)), (S2_ct1 I32:$Rs)>;
+
+let AddedComplexity = 20 in { // Complexity greater than and/or/xor
+ def: Pat<(and I32:$Rs, IsNPow2_32:$V),
+ (S2_clrbit_i IntRegs:$Rs, (LogN2_32 $V))>;
+ def: Pat<(or I32:$Rs, IsPow2_32:$V),
+ (S2_setbit_i IntRegs:$Rs, (Log2_32 $V))>;
+ def: Pat<(xor I32:$Rs, IsPow2_32:$V),
+ (S2_togglebit_i IntRegs:$Rs, (Log2_32 $V))>;
+
+ def: Pat<(and I32:$Rs, (not (shl 1, I32:$Rt))),
+ (S2_clrbit_r IntRegs:$Rs, IntRegs:$Rt)>;
+ def: Pat<(or I32:$Rs, (shl 1, I32:$Rt)),
+ (S2_setbit_r IntRegs:$Rs, IntRegs:$Rt)>;
+ def: Pat<(xor I32:$Rs, (shl 1, I32:$Rt)),
+ (S2_togglebit_r IntRegs:$Rs, IntRegs:$Rt)>;
+}
+
+// Clr/set/toggle bit for 64-bit values with immediate bit index.
+let AddedComplexity = 20 in { // Complexity greater than and/or/xor
+ def: Pat<(and I64:$Rss, IsNPow2_64L:$V),
+ (REG_SEQUENCE DoubleRegs,
+ (i32 (HiReg $Rss)), isub_hi,
+ (S2_clrbit_i (LoReg $Rss), (LogN2_64 $V)), isub_lo)>;
+ def: Pat<(and I64:$Rss, IsNPow2_64H:$V),
+ (REG_SEQUENCE DoubleRegs,
+ (S2_clrbit_i (HiReg $Rss), (UDEC32 (i32 (LogN2_64 $V)))),
+ isub_hi,
+ (i32 (LoReg $Rss)), isub_lo)>;
+
+ def: Pat<(or I64:$Rss, IsPow2_64L:$V),
+ (REG_SEQUENCE DoubleRegs,
+ (i32 (HiReg $Rss)), isub_hi,
+ (S2_setbit_i (LoReg $Rss), (Log2_64 $V)), isub_lo)>;
+ def: Pat<(or I64:$Rss, IsPow2_64H:$V),
+ (REG_SEQUENCE DoubleRegs,
+ (S2_setbit_i (HiReg $Rss), (UDEC32 (i32 (Log2_64 $V)))),
+ isub_hi,
+ (i32 (LoReg $Rss)), isub_lo)>;
+
+ def: Pat<(xor I64:$Rss, IsPow2_64L:$V),
+ (REG_SEQUENCE DoubleRegs,
+ (i32 (HiReg $Rss)), isub_hi,
+ (S2_togglebit_i (LoReg $Rss), (Log2_64 $V)), isub_lo)>;
+ def: Pat<(xor I64:$Rss, IsPow2_64H:$V),
+ (REG_SEQUENCE DoubleRegs,
+ (S2_togglebit_i (HiReg $Rss), (UDEC32 (i32 (Log2_64 $V)))),
+ isub_hi,
+ (i32 (LoReg $Rss)), isub_lo)>;
+}
+
+let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
+ def: Pat<(i1 (setne (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
+ (S2_tstbit_i IntRegs:$Rs, u5_0ImmPred:$u5)>;
+ def: Pat<(i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)),
+ (S2_tstbit_r IntRegs:$Rs, IntRegs:$Rt)>;
+ def: Pat<(i1 (trunc I32:$Rs)),
+ (S2_tstbit_i IntRegs:$Rs, 0)>;
+ def: Pat<(i1 (trunc I64:$Rs)),
+ (S2_tstbit_i (LoReg DoubleRegs:$Rs), 0)>;
+}
+
+let AddedComplexity = 20 in { // Complexity greater than compare reg-imm.
+ def: Pat<(i1 (seteq (and I32:$Rs, u6_0ImmPred:$u6), 0)),
+ (C2_bitsclri IntRegs:$Rs, u6_0ImmPred:$u6)>;
+ def: Pat<(i1 (seteq (and I32:$Rs, I32:$Rt), 0)),
+ (C2_bitsclr IntRegs:$Rs, IntRegs:$Rt)>;
+}
+
+let AddedComplexity = 10 in // Complexity greater than compare reg-reg.
+def: Pat<(i1 (seteq (and I32:$Rs, I32:$Rt), IntRegs:$Rt)),
+ (C2_bitsset IntRegs:$Rs, IntRegs:$Rt)>;
+
+def: Pat<(or (or (shl (or (shl (i32 (extloadi8 (add I32:$b, 3))),
+ (i32 8)),
+ (i32 (zextloadi8 (add I32:$b, 2)))),
+ (i32 16)),
+ (shl (i32 (zextloadi8 (add I32:$b, 1))), (i32 8))),
+ (zextloadi8 I32:$b)),
+ (A2_swiz (L2_loadri_io IntRegs:$b, 0))>;
+
+// Patterns for loads of i1:
+def: Pat<(i1 (load AddrFI:$fi)),
+ (C2_tfrrp (L2_loadrub_io AddrFI:$fi, 0))>;
+def: Pat<(i1 (load (add I32:$Rs, s32_0ImmPred:$Off))),
+ (C2_tfrrp (L2_loadrub_io IntRegs:$Rs, imm:$Off))>;
+def: Pat<(i1 (load I32:$Rs)),
+ (C2_tfrrp (L2_loadrub_io IntRegs:$Rs, 0))>;
+
+def I1toI32: OutPatFrag<(ops node:$Rs), // i1 -> i32: mux yielding 1 or 0
+ (C2_muxii (i1 $Rs), 1, 0)>;
+
+def I32toI1: OutPatFrag<(ops node:$Rs), // i32 -> i1 through C2_tfrrp
+ (i1 (C2_tfrrp (i32 $Rs)))>;
+
+defm: Storexm_pat<store, I1, s32_0ImmPred, I1toI32, S2_storerb_io>; // i1 stores: convert with I1toI32, then store a byte
+def: Storexm_simple_pat<store, I1, I1toI32, S2_storerb_io>;
+
+def: Pat<(sra I64:$src, u6_0ImmPred:$u6),
+ (S2_asr_i_p DoubleRegs:$src, imm:$u6)>;
+def: Pat<(srl I64:$src, u6_0ImmPred:$u6),
+ (S2_lsr_i_p DoubleRegs:$src, imm:$u6)>;
+def: Pat<(shl I64:$src, u6_0ImmPred:$u6),
+ (S2_asl_i_p DoubleRegs:$src, imm:$u6)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$Rt, (shl I32:$Rs, u3_0ImmPred:$u3)),
+ (S2_addasl_rrri IntRegs:$Rt, IntRegs:$Rs, imm:$u3)>;
+
+def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDTNone, [SDNPHasChain]>; // chained node without typed operands
+def: Pat<(HexagonBARRIER), (Y2_barrier)>; // selected to Y2_barrier
+
+def: Pat<(IsOrAdd (i32 AddrFI:$Rs), s32_0ImmPred:$off),
+ (PS_fi (i32 AddrFI:$Rs), s32_0ImmPred:$off)>;
+
+
+// Support for generating global address.
+// Taken from X86InstrInfo.td.
+def SDTHexagonCONST32 : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, // one result, one operand; result is i32
+ SDTCisVT<1, i32>, // operand is i32
+ SDTCisPtrTy<0>]>; // result is additionally constrained to the pointer type
+def HexagonCONST32 : SDNode<"HexagonISD::CONST32", SDTHexagonCONST32>;
+def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>; // shares the CONST32 profile
+
+// Map TLS addresses to A2_tfrsi.
+def: Pat<(HexagonCONST32 tglobaltlsaddr:$addr), (A2_tfrsi s16_0Ext:$addr)>;
+def: Pat<(HexagonCONST32 bbl:$label), (A2_tfrsi s16_0Ext:$label)>;
+
+def: Pat<(i64 imm:$v), (CONST64 imm:$v)>;
+def: Pat<(i1 0), (PS_false)>;
+def: Pat<(i1 1), (PS_true)>;
+
+// Pseudo instructions: type profiles and nodes for the call-sequence markers.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; // single i32 operand
+def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, // two i32 operands
+ SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>; // chained, produces glue
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; // chained, optionally consumes glue, produces glue
+
+def SDT_SPCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; // no result, one i32 operand
+
+// For tail calls, the HexagonTCRet SDNode has 3 SDNode properties: a chain,
+// optional input glue, and variadic arguments.
+// Its single operand has pointer type.
+def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+
+def: Pat<(callseq_start timm:$amt),
+ (ADJCALLSTACKDOWN imm:$amt)>;
+def: Pat<(callseq_end timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKUP imm:$amt1, imm:$amt2)>;
+
+// Tail calls.
+def: Pat<(HexagonTCRet tglobaladdr:$dst),
+ (PS_tailcall_i tglobaladdr:$dst)>;
+def: Pat<(HexagonTCRet texternalsym:$dst),
+ (PS_tailcall_i texternalsym:$dst)>;
+def: Pat<(HexagonTCRet I32:$dst),
+ (PS_tailcall_r I32:$dst)>;
+
+// Map from r0 = and(r1, 65535) to r0 = zxth(r1)
+def: Pat<(and I32:$src1, 65535),
+ (A2_zxth IntRegs:$src1)>;
+
+// Map from r0 = and(r1, 255) to r0 = zxtb(r1).
+def: Pat<(and I32:$src1, 255),
+ (A2_zxtb IntRegs:$src1)>;
+
+// Map Add(p1, true) to p1 = not(p1).
+// Add(p1, false) should never be produced;
+// if it ever is, it has to be mapped to a no-op.
+def: Pat<(add I1:$src1, -1),
+ (C2_not PredRegs:$src1)>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
+def: Pat<(select (not I1:$src1), s8_0ImmPred:$src2, s32_0ImmPred:$src3),
+ (C2_muxii PredRegs:$src1, s32_0ImmPred:$src3, s8_0ImmPred:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
+// => r0 = C2_muxir(p0, r1, #i)
+def: Pat<(select (not I1:$src1), s32_0ImmPred:$src2,
+ I32:$src3),
+ (C2_muxir PredRegs:$src1, IntRegs:$src3, s32_0ImmPred:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
+// => r0 = C2_muxri (p0, #i, r1)
+def: Pat<(select (not I1:$src1), IntRegs:$src2, s32_0ImmPred:$src3),
+ (C2_muxri PredRegs:$src1, s32_0ImmPred:$src3, IntRegs:$src2)>;
+
+// Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump.
+def: Pat<(brcond (not I1:$src1), bb:$offset),
+ (J2_jumpf PredRegs:$src1, bb:$offset)>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = A2_sxtw(Rss.lo).
+def: Pat<(i64 (sext_inreg I64:$src1, i32)),
+ (A2_sxtw (LoReg DoubleRegs:$src1))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = A2_sxtw(A2_sxth(Rss.lo)).
+def: Pat<(i64 (sext_inreg I64:$src1, i16)),
+ (A2_sxtw (A2_sxth (LoReg DoubleRegs:$src1)))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = A2_sxtw(A2_sxtb(Rss.lo)).
+def: Pat<(i64 (sext_inreg I64:$src1, i8)),
+ (A2_sxtw (A2_sxtb (LoReg DoubleRegs:$src1)))>;
+
+// We want to prevent emitting pnot's as much as possible.
+// Map brcond with an unsupported setcc to a J2_jumpf.
+def : Pat <(brcond (i1 (setne I32:$src1, I32:$src2)),
+ bb:$offset),
+ (J2_jumpf (C2_cmpeq I32:$src1, I32:$src2),
+ bb:$offset)>;
+
+def : Pat <(brcond (i1 (setne I32:$src1, s10_0ImmPred:$src2)),
+ bb:$offset),
+ (J2_jumpf (C2_cmpeqi I32:$src1, s10_0ImmPred:$src2), bb:$offset)>;
+
+def: Pat<(brcond (i1 (setne I1:$src1, (i1 -1))), bb:$offset),
+ (J2_jumpf PredRegs:$src1, bb:$offset)>;
+
+def: Pat<(brcond (i1 (setne I1:$src1, (i1 0))), bb:$offset),
+ (J2_jumpt PredRegs:$src1, bb:$offset)>;
+
+// cmp.lt(Rs, Imm) -> !cmp.ge(Rs, Imm) -> !cmp.gt(Rs, Imm-1)
+def: Pat<(brcond (i1 (setlt I32:$src1, s8_0ImmPred:$src2)), bb:$offset),
+ (J2_jumpf (C2_cmpgti IntRegs:$src1, (SDEC1 s8_0ImmPred:$src2)),
+ bb:$offset)>;
+
+// Map from a 64-bit select to an emulated 64-bit mux.
+// Hexagon does not support 64-bit MUXes; so emulate with combines.
+def: Pat<(select I1:$src1, I64:$src2,
+ I64:$src3),
+ (A2_combinew (C2_mux PredRegs:$src1, (HiReg DoubleRegs:$src2),
+ (HiReg DoubleRegs:$src3)),
+ (C2_mux PredRegs:$src1, (LoReg DoubleRegs:$src2),
+ (LoReg DoubleRegs:$src3)))>;
+
+// Map from a 1-bit select to logical ops.
+// From LegalizeDAG.cpp: (B1 ? B2 : B3) <=> (B1 & B2)|(!B1&B3).
+def: Pat<(select I1:$src1, I1:$src2, I1:$src3),
+ (C2_or (C2_and PredRegs:$src1, PredRegs:$src2),
+ (C2_and (C2_not PredRegs:$src1), PredRegs:$src3))>;
+
+// Map for truncating from 64-bit values to 32-bit values (take the low subregister).
+def: Pat<(i32 (trunc I64:$src)),
+ (LoReg DoubleRegs:$src)>;
+
+// Map for truncating from i64 values to i1 values.
+def: Pat<(i1 (trunc I64:$src)),
+ (C2_tfrrp (LoReg DoubleRegs:$src))>;
+
+// rs <= #imm -> !(rs > #imm).
+let AddedComplexity = 30 in
+def: Pat<(i1 (setle I32:$src1, s32_0ImmPred:$src2)),
+ (C2_not (C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2))>;
+
+// rs <= rt -> !(rs > rt).
+def : Pat<(i1 (setle I32:$src1, I32:$src2)),
+ (i1 (C2_not (C2_cmpgt I32:$src1, I32:$src2)))>;
+
+// Rss <= Rtt -> !(Rss > Rtt).
+def: Pat<(i1 (setle I64:$src1, I64:$src2)),
+ (C2_not (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2))>;
+
+// Map cmpne -> cmpeq.
+// Hexagon_TODO: We should improve on this.
+// rs != rt -> !(rs == rt).
+let AddedComplexity = 30 in
+def: Pat<(i1 (setne I32:$src1, s32_0ImmPred:$src2)),
+ (C2_not (C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2))>;
+
+// Convert setne back to xor for hexagon since we compute w/ pred registers.
+def: Pat<(i1 (setne I1:$src1, I1:$src2)),
+ (C2_xor PredRegs:$src1, PredRegs:$src2)>;
+
+// Map cmpne(Rss, Rtt) -> !cmpeq(Rss, Rtt).
+// rs != rt -> !(rs == rt).
+def: Pat<(i1 (setne I64:$src1, I64:$src2)),
+ (C2_not (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2))>;
+
+// Map cmpge(Rs, Rt) -> !cmpgt(Rt, Rs).
+// rs >= rt -> !(rt > rs).
+def : Pat <(i1 (setge I32:$src1, I32:$src2)),
+ (i1 (C2_not (i1 (C2_cmpgt I32:$src2, I32:$src1))))>;
+
+// cmpge(Rs, Imm) -> cmpgt(Rs, Imm-1)
+let AddedComplexity = 30 in
+def: Pat<(i1 (setge I32:$src1, s32_0ImmPred:$src2)),
+ (C2_cmpgti IntRegs:$src1, (SDEC1 s32_0ImmPred:$src2))>;
+
+// Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss).
+// rss >= rtt -> !(rtt > rss).
+def: Pat<(i1 (setge I64:$src1, I64:$src2)),
+ (C2_not (C2_cmpgtp DoubleRegs:$src2, DoubleRegs:$src1))>;
+
+// Map cmplt(Rs, Imm) -> !cmpge(Rs, Imm).
+// !cmpge(Rs, Imm) -> !cmpgt(Rs, Imm-1).
+// rs < rt -> !(rs >= rt).
+let AddedComplexity = 30 in
+def: Pat<(i1 (setlt I32:$src1, s32_0ImmPred:$src2)),
+ (C2_not (C2_cmpgti IntRegs:$src1, (SDEC1 s32_0ImmPred:$src2)))>;
+
+// Generate cmpgeu(Rs, #0) -> cmpeq(Rs, Rs)
+def: Pat<(i1 (setuge I32:$src1, 0)),
+ (C2_cmpeq IntRegs:$src1, IntRegs:$src1)>;
+
+// Generate cmpgeu(Rs, #u8) -> cmpgtu(Rs, #u8 -1)
+def: Pat<(i1 (setuge I32:$src1, u32_0ImmPred:$src2)),
+ (C2_cmpgtui IntRegs:$src1, (UDEC1 u32_0ImmPred:$src2))>;
+
+// Generate cmpgtu(Rs, #u9)
+def: Pat<(i1 (setugt I32:$src1, u32_0ImmPred:$src2)),
+ (C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2)>;
+
+// Map from Rs >= Rt -> !(Rt > Rs).
+// rs >= rt -> !(rt > rs).
+def: Pat<(i1 (setuge I64:$src1, I64:$src2)),
+ (C2_not (C2_cmpgtup DoubleRegs:$src2, DoubleRegs:$src1))>;
+
+// Map from cmpleu(Rss, Rtt) -> !cmpgtu(Rss, Rtt).
+// Map from (Rs <= Rt) -> !(Rs > Rt).
+def: Pat<(i1 (setule I64:$src1, I64:$src2)),
+ (C2_not (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2))>;
+
+// Sign extends.
+// i1 -> i32
+def: Pat<(i32 (sext I1:$src1)),
+ (C2_muxii PredRegs:$src1, -1, 0)>;
+
+// i1 -> i64
+// Build the 32-bit mask (-1 or 0) with a mux and sign-extend it to 64 bits, so
+// that both halves of the result follow the predicate. Hard-wiring the high
+// word to -1 would turn a false predicate into 0xFFFFFFFF00000000 instead of 0.
+def: Pat<(i64 (sext I1:$src1)),
+ (A2_sxtw (C2_muxii PredRegs:$src1, -1, 0))>;
+
+// Zero extends.
+// i1 -> i32
+def: Pat<(i32 (zext I1:$src1)),
+ (C2_muxii PredRegs:$src1, 1, 0)>;
+
+// Map from Rd = Pd to Rd = mux(Pd, #1, #0)
+def: Pat<(i32 (anyext I1:$src1)),
+ (C2_muxii PredRegs:$src1, 1, 0)>;
+
+// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
+def: Pat<(i64 (anyext I1:$src1)),
+ (A2_sxtw (C2_muxii PredRegs:$src1, 1, 0))>;
+
+// Clear the sign bit (bit 31 of the high word) of a 64-bit register pair.
+def ClearSign : OutPatFrag<(ops node:$Rss),
+ (A2_combinew (S2_clrbit_i (HiReg $Rss), 31), (LoReg $Rss))>; // low word is passed through unchanged
+
+def MulHU : OutPatFrag<(ops node:$Rss, node:$Rtt), // upper 64 bits of the unsigned 64x64 product; below Rss = ah:al, Rtt = bh:bl
+ (A2_addp
+ (M2_dpmpyuu_acc_s0
+ (S2_lsr_i_p
+ (A2_addp
+ (M2_dpmpyuu_acc_s0
+ (S2_lsr_i_p (M2_dpmpyuu_s0 (LoReg $Rss), (LoReg $Rtt)), 32), // (al*bl) >> 32
+ (HiReg $Rss),
+ (LoReg $Rtt)), // ... + ah*bl
+ (A2_combinew (A2_tfrsi 0),
+ (LoReg (M2_dpmpyuu_s0 (LoReg $Rss), (HiReg $Rtt))))), // ... + zero-extended low word of al*bh
+ 32), // keep only the carry into the upper 64 bits
+ (HiReg $Rss),
+ (HiReg $Rtt)), // ... + ah*bh
+ (S2_lsr_i_p (M2_dpmpyuu_s0 (LoReg $Rss), (HiReg $Rtt)), 32))>; // ... + (al*bh) >> 32
+
+// Multiply 64-bit unsigned and use upper result.
+def : Pat <(mulhu I64:$Rss, I64:$Rtt), (MulHU $Rss, $Rtt)>;
+
+// Multiply 64-bit signed and use upper result.
+//
+// For two signed 64-bit integers A and B, let A' and B' denote A and B
+// with the sign bit cleared. Then A = -2^63*s(A) + A', where s(A) is the
+// sign bit of A (and identically for B). With this notation, the signed
+// product A*B can be written as:
+// AB = (-2^63 s(A) + A') * (-2^63 s(B) + B')
+// = 2^126 s(A)s(B) - 2^63 [s(A)B'+s(B)A'] + A'B'
+// = 2^126 s(A)s(B) + 2^63 [s(A)B'+s(B)A'] + A'B' - 2*2^63 [s(A)B'+s(B)A']
+// = (unsigned product AB) - 2^64 [s(A)B'+s(B)A']
+
+def : Pat <(mulhs I64:$Rss, I64:$Rtt),
+ (A2_subp
+ (MulHU $Rss, $Rtt),
+ (A2_addp
+ (A2_andp (S2_asr_i_p $Rss, 63), (ClearSign $Rtt)),
+ (A2_andp (S2_asr_i_p $Rtt, 63), (ClearSign $Rss))))>;
+
+// Hexagon specific ISD nodes.
+def SDTHexagonALLOCA : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def HexagonALLOCA : SDNode<"HexagonISD::ALLOCA", SDTHexagonALLOCA,
+ [SDNPHasChain]>;
+
+
+def: Pat<(HexagonALLOCA I32:$Rs, (i32 imm:$A)),
+ (PS_alloca IntRegs:$Rs, imm:$A)>;
+
+def HexagonJT: SDNode<"HexagonISD::JT", SDTIntUnaryOp>;
+def HexagonCP: SDNode<"HexagonISD::CP", SDTIntUnaryOp>;
+
+def: Pat<(HexagonJT tjumptable:$dst), (A2_tfrsi imm:$dst)>;
+def: Pat<(HexagonCP tconstpool:$dst), (A2_tfrsi imm:$dst)>;
+
+let AddedComplexity = 100 in // NOTE(review): brace-less let binds only the next def (add); sub/and/or keep default complexity - confirm intended
+def: Pat<(add I32:$src1, (sra I32:$Rs, u5_0ImmPred:$u5)), (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(sub I32:$src1, (sra I32:$Rs, u5_0ImmPred:$u5)), (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(and I32:$src1, (sra I32:$Rs, u5_0ImmPred:$u5)), (S2_asr_i_r_and IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(or I32:$src1, (sra I32:$Rs, u5_0ImmPred:$u5)), (S2_asr_i_r_or IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (sra I64:$Rs, u6_0ImmPred:$u5)), (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(sub I64:$src1, (sra I64:$Rs, u6_0ImmPred:$u5)), (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(and I64:$src1, (sra I64:$Rs, u6_0ImmPred:$u5)), (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(or I64:$src1, (sra I64:$Rs, u6_0ImmPred:$u5)), (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(sub I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(and I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(or I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+let AddedComplexity = 100 in
+def: Pat<(xor I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(sub I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(and I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(or I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+let AddedComplexity = 100 in
+def: Pat<(xor I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(sub I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(and I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_and IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(or I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_or IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+let AddedComplexity = 100 in
+def: Pat<(xor I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(sub I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(and I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(or I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+let AddedComplexity = 100 in
+def: Pat<(xor I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_asl_r_r_and IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_asl_r_r_or IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(xor I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (sra I32:$Rs, I32:$Rt)), (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I32:$src1, (sra I32:$Rs, I32:$Rt)), (S2_asr_r_r_nac IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I32:$src1, (sra I32:$Rs, I32:$Rt)), (S2_asr_r_r_and IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I32:$src1, (sra I32:$Rs, I32:$Rt)), (S2_asr_r_r_or IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(xor I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (srl I32:$Rs, I32:$Rt)), (S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I32:$src1, (srl I32:$Rs, I32:$Rt)), (S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I32:$src1, (srl I32:$Rs, I32:$Rt)), (S2_lsr_r_r_and IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I32:$src1, (srl I32:$Rs, I32:$Rt)), (S2_lsr_r_r_or IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(xor I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_lsl_r_r_and IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(xor I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+
+def: Pat<(sra I64:$src1, I32:$src2), (S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2)>;
+def: Pat<(srl I64:$src1, I32:$src2), (S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2)>;
+def: Pat<(shl I64:$src1, I32:$src2), (S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2)>;
+def: Pat<(shl I64:$src1, I32:$src2), (S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2)>; // NOTE(review): same source pattern as the S2_asl_r_p def above - confirm which one is meant to be selected
+
+def: Pat<(sra I32:$src1, I32:$src2), (S2_asr_r_r IntRegs:$src1, IntRegs:$src2)>;
+def: Pat<(srl I32:$src1, I32:$src2), (S2_lsr_r_r IntRegs:$src1, IntRegs:$src2)>;
+def: Pat<(shl I32:$src1, I32:$src2), (S2_asl_r_r IntRegs:$src1, IntRegs:$src2)>;
+def: Pat<(shl I32:$src1, I32:$src2), (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>; // NOTE(review): same source pattern as the S2_asl_r_r def above - confirm which one is meant to be selected
+
+def SDTHexagonINSERT:
+ SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def SDTHexagonINSERTRP:
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisVT<3, i64>]>;
+
+def HexagonINSERT : SDNode<"HexagonISD::INSERT", SDTHexagonINSERT>;
+def HexagonINSERTRP : SDNode<"HexagonISD::INSERTRP", SDTHexagonINSERTRP>;
+
+def: Pat<(HexagonINSERT I32:$Rs, I32:$Rt, u5_0ImmPred:$u1, u5_0ImmPred:$u2),
+ (S2_insert I32:$Rs, I32:$Rt, u5_0ImmPred:$u1, u5_0ImmPred:$u2)>;
+def: Pat<(HexagonINSERT I64:$Rs, I64:$Rt, u6_0ImmPred:$u1, u6_0ImmPred:$u2),
+ (S2_insertp I64:$Rs, I64:$Rt, u6_0ImmPred:$u1, u6_0ImmPred:$u2)>;
+def: Pat<(HexagonINSERTRP I32:$Rs, I32:$Rt, I64:$Ru),
+ (S2_insert_rp I32:$Rs, I32:$Rt, I64:$Ru)>;
+def: Pat<(HexagonINSERTRP I64:$Rs, I64:$Rt, I64:$Ru),
+ (S2_insertp_rp I64:$Rs, I64:$Rt, I64:$Ru)>;
+
+let AddedComplexity = 100 in
+def: Pat<(or (or (shl (HexagonINSERT (i32 (zextloadi8 (add I32:$b, 2))),
+ (i32 (extloadi8 (add I32:$b, 3))),
+ 24, 8),
+ (i32 16)),
+ (shl (i32 (zextloadi8 (add I32:$b, 1))), (i32 8))),
+ (zextloadi8 I32:$b)),
+ (A2_swiz (L2_loadri_io I32:$b, 0))>;
+
+def SDTHexagonEXTRACTU:
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTHexagonEXTRACTURP:
+ SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisVT<2, i64>]>;
+
+def HexagonEXTRACTU : SDNode<"HexagonISD::EXTRACTU", SDTHexagonEXTRACTU>;
+def HexagonEXTRACTURP : SDNode<"HexagonISD::EXTRACTURP", SDTHexagonEXTRACTURP>;
+
+def: Pat<(HexagonEXTRACTU I32:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3),
+ (S2_extractu I32:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>;
+def: Pat<(HexagonEXTRACTU I64:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3),
+ (S2_extractup I64:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>;
+def: Pat<(HexagonEXTRACTURP I32:$src1, I64:$src2),
+ (S2_extractu_rp I32:$src1, I64:$src2)>;
+def: Pat<(HexagonEXTRACTURP I64:$src1, I64:$src2),
+ (S2_extractup_rp I64:$src1, I64:$src2)>;
+
+def n8_0ImmPred: PatLeaf<(i32 imm), [{ // i32 immediate leaf restricted by the range check below
+ int64_t V = N->getSExtValue();
+ return -255 <= V && V <= 0; // accept only values in [-255, 0]
+}]>;
+
+// Change the sign of the immediate for Rd=-mpyi(Rs,#u8)
+def: Pat<(mul I32:$src1, (ineg n8_0ImmPred:$src2)),
+ (M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2)>;
+
+multiclass MinMax_pats_p<PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> { // 64-bit wrapper around T_MinMax_pats
+ defm: T_MinMax_pats<Op, I64, Inst, SwapInst>; // fixes the value operand to I64; Op/Inst/SwapInst are forwarded as given
+}
+
+def: Pat<(add (Sext64 I32:$Rs), I64:$Rt),
+ (A2_addsp IntRegs:$Rs, DoubleRegs:$Rt)>;
+
+let AddedComplexity = 200 in {
+ defm: MinMax_pats_p<setge, A2_maxp, A2_minp>;
+ defm: MinMax_pats_p<setgt, A2_maxp, A2_minp>;
+ defm: MinMax_pats_p<setle, A2_minp, A2_maxp>;
+ defm: MinMax_pats_p<setlt, A2_minp, A2_maxp>;
+ defm: MinMax_pats_p<setuge, A2_maxup, A2_minup>;
+ defm: MinMax_pats_p<setugt, A2_maxup, A2_minup>;
+ defm: MinMax_pats_p<setule, A2_minup, A2_maxup>;
+ defm: MinMax_pats_p<setult, A2_minup, A2_maxup>;
+}
+
+def callv3 : SDNode<"HexagonISD::CALL", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+def callv3nr : SDNode<"HexagonISD::CALLnr", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+
+// Map call instruction
+def : Pat<(callv3 I32:$dst),
+ (J2_callr I32:$dst)>;
+def : Pat<(callv3 tglobaladdr:$dst),
+ (J2_call tglobaladdr:$dst)>;
+def : Pat<(callv3 texternalsym:$dst),
+ (J2_call texternalsym:$dst)>;
+def : Pat<(callv3 tglobaltlsaddr:$dst),
+ (J2_call tglobaltlsaddr:$dst)>;
+
+def : Pat<(callv3nr I32:$dst),
+ (PS_callr_nr I32:$dst)>;
+def : Pat<(callv3nr tglobaladdr:$dst),
+ (PS_call_nr tglobaladdr:$dst)>;
+def : Pat<(callv3nr texternalsym:$dst),
+ (PS_call_nr texternalsym:$dst)>;
+
+
+def addrga: PatLeaf<(i32 AddrGA:$Addr)>;
+def addrgp: PatLeaf<(i32 AddrGP:$Addr)>;
+
+
+// Pats for instruction selection.
+
+// A class to embed the usual comparison patfrags within a zext to i32.
+// The seteq/setne frags use "lhs" and "rhs" as operands, so use the same
+// names, or else the frag's "body" won't match the operands.
+class CmpInReg<PatFrag Op>
+ : PatFrag<(ops node:$lhs, node:$rhs),(i32 (zext (i1 Op.Fragment)))>;
+
+def: T_cmp32_rr_pat<A4_rcmpeq, CmpInReg<seteq>, i32>;
+def: T_cmp32_rr_pat<A4_rcmpneq, CmpInReg<setne>, i32>;
+
+def: T_cmp32_rr_pat<C4_cmpneq, setne, i1>;
+def: T_cmp32_rr_pat<C4_cmplte, setle, i1>;
+def: T_cmp32_rr_pat<C4_cmplteu, setule, i1>;
+
+def: T_cmp32_rr_pat<C4_cmplte, RevCmp<setge>, i1>;
+def: T_cmp32_rr_pat<C4_cmplteu, RevCmp<setuge>, i1>;
+
+let AddedComplexity = 100 in {
+ def: Pat<(i1 (seteq (and (xor I32:$Rs, I32:$Rt),
+ 255), 0)),
+ (A4_cmpbeq IntRegs:$Rs, IntRegs:$Rt)>;
+ def: Pat<(i1 (setne (and (xor I32:$Rs, I32:$Rt),
+ 255), 0)),
+ (C2_not (A4_cmpbeq IntRegs:$Rs, IntRegs:$Rt))>;
+ def: Pat<(i1 (seteq (and (xor I32:$Rs, I32:$Rt),
+ 65535), 0)),
+ (A4_cmpheq IntRegs:$Rs, IntRegs:$Rt)>;
+ def: Pat<(i1 (setne (and (xor I32:$Rs, I32:$Rt),
+ 65535), 0)),
+ (C2_not (A4_cmpheq IntRegs:$Rs, IntRegs:$Rt))>;
+}
+
+def: Pat<(i32 (zext (i1 (seteq I32:$Rs, s32_0ImmPred:$s8)))),
+ (A4_rcmpeqi IntRegs:$Rs, s32_0ImmPred:$s8)>;
+def: Pat<(i32 (zext (i1 (setne I32:$Rs, s32_0ImmPred:$s8)))),
+ (A4_rcmpneqi IntRegs:$Rs, s32_0ImmPred:$s8)>;
+
+// Preserve the S2_tstbit_r generation
+def: Pat<(i32 (zext (i1 (setne (i32 (and (i32 (shl 1, I32:$src2)),
+ I32:$src1)), 0)))),
+ (C2_muxii (S2_tstbit_r IntRegs:$src1, IntRegs:$src2), 1, 0)>;
+
+// The complexity of the combines involving immediates should be greater
+// than the complexity of the combine with two registers.
+let AddedComplexity = 50 in {
+def: Pat<(HexagonCOMBINE IntRegs:$r, s32_0ImmPred:$i),
+ (A4_combineri IntRegs:$r, s32_0ImmPred:$i)>;
+
+def: Pat<(HexagonCOMBINE s32_0ImmPred:$i, IntRegs:$r),
+ (A4_combineir s32_0ImmPred:$i, IntRegs:$r)>;
+}
+
+// The complexity of the combine with two immediates should be greater than
+// the complexity of a combine involving a register.
+let AddedComplexity = 75 in {
+def: Pat<(HexagonCOMBINE s8_0ImmPred:$s8, u32_0ImmPred:$u6),
+ (A4_combineii imm:$s8, imm:$u6)>;
+def: Pat<(HexagonCOMBINE s32_0ImmPred:$s8, s8_0ImmPred:$S8),
+ (A2_combineii imm:$s8, imm:$S8)>;
+}
+
+
+def ToZext64: OutPatFrag<(ops node:$Rs),
+ (i64 (A4_combineir 0, (i32 $Rs)))>; // i32 -> i64 by combining the constant 0 (high word) with $Rs (low word)
+def ToSext64: OutPatFrag<(ops node:$Rs),
+ (i64 (A2_sxtw (i32 $Rs)))>; // i32 -> i64 through A2_sxtw (sign-extend word)
+
+// Patterns to generate indexed loads with different forms of the address:
+// - frameindex,
+// - base + offset,
+// - base (without offset).
+multiclass Loadxm_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
+ PatLeaf ImmPred, InstHexagon MI> {
+ def: Pat<(VT (Load AddrFI:$fi)),
+ (VT (ValueMod (MI AddrFI:$fi, 0)))>;
+ def: Pat<(VT (Load (add AddrFI:$fi, ImmPred:$Off))),
+ (VT (ValueMod (MI AddrFI:$fi, imm:$Off)))>;
+ def: Pat<(VT (Load (add IntRegs:$Rs, ImmPred:$Off))),
+ (VT (ValueMod (MI IntRegs:$Rs, imm:$Off)))>;
+ def: Pat<(VT (Load I32:$Rs)),
+ (VT (ValueMod (MI IntRegs:$Rs, 0)))>;
+}
+
+defm: Loadxm_pat<extloadi1, i64, ToZext64, s32_0ImmPred, L2_loadrub_io>;
+defm: Loadxm_pat<extloadi8, i64, ToZext64, s32_0ImmPred, L2_loadrub_io>;
+defm: Loadxm_pat<extloadi16, i64, ToZext64, s31_1ImmPred, L2_loadruh_io>;
+defm: Loadxm_pat<zextloadi1, i64, ToZext64, s32_0ImmPred, L2_loadrub_io>;
+defm: Loadxm_pat<zextloadi8, i64, ToZext64, s32_0ImmPred, L2_loadrub_io>;
+defm: Loadxm_pat<zextloadi16, i64, ToZext64, s31_1ImmPred, L2_loadruh_io>;
+defm: Loadxm_pat<sextloadi8, i64, ToSext64, s32_0ImmPred, L2_loadrb_io>;
+defm: Loadxm_pat<sextloadi16, i64, ToSext64, s31_1ImmPred, L2_loadrh_io>;
+
+// Map Rdd = anyext(Rs) -> Rdd = combine(#0, Rs).
+def: Pat<(Aext64 I32:$src1), (ToZext64 IntRegs:$src1)>;
+
+multiclass T_LoadAbsReg_Pat <PatFrag ldOp, InstHexagon MI, ValueType VT = i32> {
+ def : Pat <(VT (ldOp (add (shl IntRegs:$src1, u2_0ImmPred:$src2),
+ (HexagonCONST32 tglobaladdr:$src3)))),
+ (MI IntRegs:$src1, u2_0ImmPred:$src2, tglobaladdr:$src3)>;
+ def : Pat <(VT (ldOp (add IntRegs:$src1,
+ (HexagonCONST32 tglobaladdr:$src2)))),
+ (MI IntRegs:$src1, 0, tglobaladdr:$src2)>;
+
+ def : Pat <(VT (ldOp (add (shl IntRegs:$src1, u2_0ImmPred:$src2),
+ (HexagonCONST32 tconstpool:$src3)))),
+ (MI IntRegs:$src1, u2_0ImmPred:$src2, tconstpool:$src3)>;
+ def : Pat <(VT (ldOp (add IntRegs:$src1,
+ (HexagonCONST32 tconstpool:$src2)))),
+ (MI IntRegs:$src1, 0, tconstpool:$src2)>;
+
+ def : Pat <(VT (ldOp (add (shl IntRegs:$src1, u2_0ImmPred:$src2),
+ (HexagonCONST32 tjumptable:$src3)))),
+ (MI IntRegs:$src1, u2_0ImmPred:$src2, tjumptable:$src3)>;
+ def : Pat <(VT (ldOp (add IntRegs:$src1,
+ (HexagonCONST32 tjumptable:$src2)))),
+ (MI IntRegs:$src1, 0, tjumptable:$src2)>;
+}
+
+let AddedComplexity = 60 in {
+defm : T_LoadAbsReg_Pat <sextloadi8, L4_loadrb_ur>;
+defm : T_LoadAbsReg_Pat <zextloadi8, L4_loadrub_ur>;
+defm : T_LoadAbsReg_Pat <extloadi8, L4_loadrub_ur>;
+
+defm : T_LoadAbsReg_Pat <sextloadi16, L4_loadrh_ur>;
+defm : T_LoadAbsReg_Pat <zextloadi16, L4_loadruh_ur>;
+defm : T_LoadAbsReg_Pat <extloadi16, L4_loadruh_ur>;
+
+defm : T_LoadAbsReg_Pat <load, L4_loadri_ur>;
+defm : T_LoadAbsReg_Pat <load, L4_loadrd_ur, i64>;
+}
+
+// 'def pats' for load instructions with base + register offset and non-zero
+// immediate value. Immediate value is used to left-shift the second
+// register operand.
+class Loadxs_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+ : Pat<(VT (Load (add I32:$Rs,
+ (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
+ (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>;
+
+let AddedComplexity = 40 in {
+ def: Loadxs_pat<extloadi8, i32, L4_loadrub_rr>;
+ def: Loadxs_pat<zextloadi8, i32, L4_loadrub_rr>;
+ def: Loadxs_pat<sextloadi8, i32, L4_loadrb_rr>;
+ def: Loadxs_pat<extloadi16, i32, L4_loadruh_rr>;
+ def: Loadxs_pat<zextloadi16, i32, L4_loadruh_rr>;
+ def: Loadxs_pat<sextloadi16, i32, L4_loadrh_rr>;
+ def: Loadxs_pat<load, i32, L4_loadri_rr>;
+ def: Loadxs_pat<load, i64, L4_loadrd_rr>;
+}
+
+// 'def pats' for load instruction base + register offset and
+// zero immediate value.
+class Loadxs_simple_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+ : Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
+ (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>;
+
+let AddedComplexity = 20 in {
+ def: Loadxs_simple_pat<extloadi8, i32, L4_loadrub_rr>;
+ def: Loadxs_simple_pat<zextloadi8, i32, L4_loadrub_rr>;
+ def: Loadxs_simple_pat<sextloadi8, i32, L4_loadrb_rr>;
+ def: Loadxs_simple_pat<extloadi16, i32, L4_loadruh_rr>;
+ def: Loadxs_simple_pat<zextloadi16, i32, L4_loadruh_rr>;
+ def: Loadxs_simple_pat<sextloadi16, i32, L4_loadrh_rr>;
+ def: Loadxs_simple_pat<load, i32, L4_loadri_rr>;
+ def: Loadxs_simple_pat<load, i64, L4_loadrd_rr>;
+}
+
+// zext i1->i64
+def: Pat<(i64 (zext I1:$src1)),
+ (ToZext64 (C2_muxii PredRegs:$src1, 1, 0))>;
+
+// zext i32->i64
+def: Pat<(Zext64 I32:$src1),
+ (ToZext64 IntRegs:$src1)>;
+
+let AddedComplexity = 40 in
+multiclass T_StoreAbsReg_Pats <InstHexagon MI, RegisterClass RC, ValueType VT,
+ PatFrag stOp> {
+ def : Pat<(stOp (VT RC:$src4),
+ (add (shl I32:$src1, u2_0ImmPred:$src2),
+ u32_0ImmPred:$src3)),
+ (MI IntRegs:$src1, u2_0ImmPred:$src2, u32_0ImmPred:$src3, RC:$src4)>;
+
+ def : Pat<(stOp (VT RC:$src4),
+ (add (shl IntRegs:$src1, u2_0ImmPred:$src2),
+ (HexagonCONST32 tglobaladdr:$src3))),
+ (MI IntRegs:$src1, u2_0ImmPred:$src2, tglobaladdr:$src3, RC:$src4)>;
+
+ def : Pat<(stOp (VT RC:$src4),
+ (add IntRegs:$src1, (HexagonCONST32 tglobaladdr:$src3))),
+ (MI IntRegs:$src1, 0, tglobaladdr:$src3, RC:$src4)>;
+}
+
+defm : T_StoreAbsReg_Pats <S4_storerd_ur, DoubleRegs, i64, store>;
+defm : T_StoreAbsReg_Pats <S4_storeri_ur, IntRegs, i32, store>;
+defm : T_StoreAbsReg_Pats <S4_storerb_ur, IntRegs, i32, truncstorei8>;
+defm : T_StoreAbsReg_Pats <S4_storerh_ur, IntRegs, i32, truncstorei16>;
+
+class Storexs_pat<PatFrag Store, PatFrag Value, InstHexagon MI> // store of Value to address Rs + (Rt << u2)
+ : Pat<(Store Value:$Ru, (add I32:$Rs,
+ (i32 (shl I32:$Rt, u2_0ImmPred:$u2)))),
+ (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2, Value:$Ru)>; // MI operand order: base, index, shift amount, stored value
+
+let AddedComplexity = 40 in {
+ def: Storexs_pat<truncstorei8, I32, S4_storerb_rr>;
+ def: Storexs_pat<truncstorei16, I32, S4_storerh_rr>;
+ def: Storexs_pat<store, I32, S4_storeri_rr>;
+ def: Storexs_pat<store, I64, S4_storerd_rr>;
+}
+
+def s30_2ProperPred : PatLeaf<(i32 imm), [{ // i32 immediate leaf: fits s30:2 but not s29:3
+ int64_t v = (int64_t)N->getSExtValue();
+ return isShiftedInt<30,2>(v) && !isShiftedInt<29,3>(v); // i.e. a multiple of 4 that is not a multiple of 8
+}]>;
+def RoundTo8 : SDNodeXForm<imm, [{ // immediate transform: clear the low three bits
+ int32_t Imm = N->getSExtValue();
+ return CurDAG->getTargetConstant(Imm & -8, SDLoc(N), MVT::i32); // Imm & -8 rounds down to a multiple of 8
+}]>;
+
+let AddedComplexity = 40 in // i64 store whose offset is a multiple of 4 but not of 8 (s30_2ProperPred)
+def: Pat<(store I64:$Ru, (add I32:$Rs, s30_2ProperPred:$Off)),
+ (S2_storerd_io (A2_addi I32:$Rs, 4), (RoundTo8 $Off), I64:$Ru)>; // (Rs + 4) + (Off & -8) equals Rs + Off for such offsets
+
+class Store_rr_pat<PatFrag Store, PatFrag Value, InstHexagon MI> // store of Value to address Rs + Rt
+ : Pat<(Store Value:$Ru, (add I32:$Rs, I32:$Rt)),
+ (MI IntRegs:$Rs, IntRegs:$Rt, 0, Value:$Ru)>; // same MI operand layout as Storexs_pat, with a shift amount of 0
+
+let AddedComplexity = 20 in {
+ def: Store_rr_pat<truncstorei8, I32, S4_storerb_rr>;
+ def: Store_rr_pat<truncstorei16, I32, S4_storerh_rr>;
+ def: Store_rr_pat<store, I32, S4_storeri_rr>;
+ def: Store_rr_pat<store, I64, S4_storerd_rr>;
+}
+
+
+def IMM_BYTE : SDNodeXForm<imm, [{
+ // -1 etc is represented as 255 etc
+ // assigning to a byte restores our desired signed value.
+ int8_t imm = N->getSExtValue();
+ return CurDAG->getTargetConstant(imm, SDLoc(N), MVT::i32);
+}]>;
+
+def IMM_HALF : SDNodeXForm<imm, [{
+ // -1 etc is represented as 65535 etc
+ // assigning to a short restores our desired signed value.
+ int16_t imm = N->getSExtValue();
+ return CurDAG->getTargetConstant(imm, SDLoc(N), MVT::i32);
+}]>;
+
+def IMM_WORD : SDNodeXForm<imm, [{
+ // -1 etc can be represented as 4294967295 etc
+ // Currently, it's not doing this. But some optimization
+ // might convert -1 to a large +ve number.
+ // assigning to a word restores our desired signed value.
+ int32_t imm = N->getSExtValue();
+ return CurDAG->getTargetConstant(imm, SDLoc(N), MVT::i32);
+}]>;
+
+def ToImmByte : OutPatFrag<(ops node:$R), (IMM_BYTE $R)>;
+def ToImmHalf : OutPatFrag<(ops node:$R), (IMM_HALF $R)>;
+def ToImmWord : OutPatFrag<(ops node:$R), (IMM_WORD $R)>;
+
+// Emit store-immediate, but only when the stored value will not be constant-
+// extended. The reason for that is that there is no pass that can optimize
+// constant extenders in store-immediate instructions. In some cases we can
+// end up with a number of such stores, all of which store the same extended
+// value (e.g. after unrolling a loop that initializes a floating point array).
+
+// Predicates to determine if the 16-bit immediate is expressible as a sign-
+// extended 8-bit immediate. Store-immediate-halfword will ignore any bits
+// beyond 0..15, so we don't care what is in there.
+
+def i16in8ImmPred: PatLeaf<(i32 imm), [{
+ int64_t v = (int16_t)N->getSExtValue();
+ return v == (int64_t)(int8_t)v;
+}]>;
+
+// Predicates to determine if the 32-bit immediate is expressible as a sign-
+// extended 8-bit immediate.
+def i32in8ImmPred: PatLeaf<(i32 imm), [{
+ int64_t v = (int32_t)N->getSExtValue();
+ return v == (int64_t)(int8_t)v;
+}]>;
+
+
+let AddedComplexity = 40 in {
+ // Even though the offset is not extendable in the store-immediate, we
+ // can still generate the fi# in the base address. If the final offset
+ // is not valid for the instruction, we will replace it with a scratch
+ // register.
+// def: Storexm_fi_pat <truncstorei8, s32_0ImmPred, ToImmByte, S4_storeirb_io>;
+// def: Storexm_fi_pat <truncstorei16, i16in8ImmPred, ToImmHalf,
+// S4_storeirh_io>;
+// def: Storexm_fi_pat <store, i32in8ImmPred, ToImmWord, S4_storeiri_io>;
+
+// defm: Storexm_fi_add_pat <truncstorei8, s32_0ImmPred, u6_0ImmPred, ToImmByte,
+// S4_storeirb_io>;
+// defm: Storexm_fi_add_pat <truncstorei16, i16in8ImmPred, u6_1ImmPred,
+// ToImmHalf, S4_storeirh_io>;
+// defm: Storexm_fi_add_pat <store, i32in8ImmPred, u6_2ImmPred, ToImmWord,
+// S4_storeiri_io>;
+
+ defm: Storexm_add_pat<truncstorei8, s32_0ImmPred, u6_0ImmPred, ToImmByte,
+ S4_storeirb_io>;
+ defm: Storexm_add_pat<truncstorei16, i16in8ImmPred, u6_1ImmPred, ToImmHalf,
+ S4_storeirh_io>;
+ defm: Storexm_add_pat<store, i32in8ImmPred, u6_2ImmPred, ToImmWord,
+ S4_storeiri_io>;
+}
+
+def: Storexm_simple_pat<truncstorei8, s32_0ImmPred, ToImmByte, S4_storeirb_io>;
+def: Storexm_simple_pat<truncstorei16, s32_0ImmPred, ToImmHalf, S4_storeirh_io>;
+def: Storexm_simple_pat<store, s32_0ImmPred, ToImmWord, S4_storeiri_io>;
+
+// op(Ps, op(Pt, Pu))
+class LogLog_pat<SDNode Op1, SDNode Op2, InstHexagon MI>
+ : Pat<(i1 (Op1 I1:$Ps, (Op2 I1:$Pt, I1:$Pu))),
+ (MI I1:$Ps, I1:$Pt, I1:$Pu)>;
+
+// op(Ps, op(Pt, ~Pu))
+class LogLogNot_pat<SDNode Op1, SDNode Op2, InstHexagon MI>
+ : Pat<(i1 (Op1 I1:$Ps, (Op2 I1:$Pt, (not I1:$Pu)))),
+ (MI I1:$Ps, I1:$Pt, I1:$Pu)>;
+
+def: LogLog_pat<and, and, C4_and_and>;
+def: LogLog_pat<and, or, C4_and_or>;
+def: LogLog_pat<or, and, C4_or_and>;
+def: LogLog_pat<or, or, C4_or_or>;
+
+def: LogLogNot_pat<and, and, C4_and_andn>;
+def: LogLogNot_pat<and, or, C4_and_orn>;
+def: LogLogNot_pat<or, and, C4_or_andn>;
+def: LogLogNot_pat<or, or, C4_or_orn>;
+
+//===----------------------------------------------------------------------===//
+// PIC: Support for PIC compilations. The patterns and SD nodes defined
+// below are needed to support code generation for PIC
+//===----------------------------------------------------------------------===//
+
+def SDT_HexagonAtGot
+ : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
+def SDT_HexagonAtPcrel
+ : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+// AT_GOT address-of-GOT, address-of-global, offset-in-global
+def HexagonAtGot : SDNode<"HexagonISD::AT_GOT", SDT_HexagonAtGot>;
+// AT_PCREL address-of-global
+def HexagonAtPcrel : SDNode<"HexagonISD::AT_PCREL", SDT_HexagonAtPcrel>;
+
+def: Pat<(HexagonAtGot I32:$got, I32:$addr, (i32 0)),
+ (L2_loadri_io I32:$got, imm:$addr)>;
+def: Pat<(HexagonAtGot I32:$got, I32:$addr, s30_2ImmPred:$off),
+ (A2_addi (L2_loadri_io I32:$got, imm:$addr), imm:$off)>;
+def: Pat<(HexagonAtPcrel I32:$addr),
+ (C4_addipc imm:$addr)>;
+
+// 64-bit and/or with a complemented second operand.
+def: Pat<(i64 (and I64:$Rs, (i64 (not I64:$Rt)))),
+ (A4_andnp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+def: Pat<(i64 (or I64:$Rs, (i64 (not I64:$Rt)))),
+ (A4_ornp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+
+// Rd=add(Rs,add(Ru,#s6))
+def: Pat<(add I32:$Rs, (add I32:$Ru, s32_0ImmPred:$s6)),
+ (S4_addaddi IntRegs:$Rs, IntRegs:$Ru, imm:$s6)>;
+
+// The next three patterns are algebraically equivalent forms of
+// src1 + src2 - src3 and all select S4_subaddi with the same operand order.
+
+// Rd=add(Rs,sub(#s6,Ru))
+def: Pat<(add I32:$src1, (sub s32_0ImmPred:$src2,
+ I32:$src3)),
+ (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>;
+
+// Rd=sub(add(Rs,#s6),Ru)
+def: Pat<(sub (add I32:$src1, s32_0ImmPred:$src2),
+ I32:$src3),
+ (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>;
+
+// Rd=add(sub(Rs,Ru),#s6)
+def: Pat<(add (sub I32:$src1, I32:$src3),
+ (s32_0ImmPred:$src2)),
+ (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>;
+
+// 64-bit xor of an accumulator with the xor of two other values.
+def: Pat<(xor I64:$dst2,
+ (xor I64:$Rss, I64:$Rtt)),
+ (M4_xor_xacc DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt)>;
+// NOTE(review): this pattern and the next match the same DAG shape
+// (or Ra, (and Rb, #imm)) but select different instructions (S4_or_andix vs
+// S4_or_andi); which one is chosen is decided by the generated matcher.
+def: Pat<(or I32:$Ru, (and (i32 IntRegs:$_src_), s32_0ImmPred:$s10)),
+ (S4_or_andix IntRegs:$Ru, IntRegs:$_src_, imm:$s10)>;
+
+def: Pat<(or I32:$src1, (and I32:$Rs, s32_0ImmPred:$s10)),
+ (S4_or_andi IntRegs:$src1, IntRegs:$Rs, imm:$s10)>;
+
+// or(Ra, or(Rb, #imm))
+def: Pat<(or I32:$src1, (or I32:$Rs, s32_0ImmPred:$s10)),
+ (S4_or_ori IntRegs:$src1, IntRegs:$Rs, imm:$s10)>;
+
+
+
+// Count trailing zeros: 64-bit.
+// The i64 cttz result is truncated to i32, so the instruction result is used
+// directly.
+def: Pat<(i32 (trunc (cttz I64:$Rss))), (S2_ct0p I64:$Rss)>;
+
+// Count trailing ones: 64-bit.
+// Trailing ones of Rss are matched as trailing zeros of ~Rss.
+def: Pat<(i32 (trunc (cttz (not I64:$Rss)))), (S2_ct1p I64:$Rss)>;
+
+// Define leading/trailing patterns that require zero-extensions to 64 bits.
+// The result of each counting instruction is wrapped in ToZext64 to produce
+// the i64 value the DAG node expects.
+def: Pat<(i64 (ctlz I64:$Rss)), (ToZext64 (S2_cl0p I64:$Rss))>;
+def: Pat<(i64 (cttz I64:$Rss)), (ToZext64 (S2_ct0p I64:$Rss))>;
+def: Pat<(i64 (ctlz (not I64:$Rss))), (ToZext64 (S2_cl1p I64:$Rss))>;
+def: Pat<(i64 (cttz (not I64:$Rss))), (ToZext64 (S2_ct1p I64:$Rss))>;
+
+
+// Negated bit test: ((1 << bit) & Rs) == 0, with the bit index given either
+// as an immediate or in a register.
+let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
+ def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
+ (S4_ntstbit_i I32:$Rs, u5_0ImmPred:$u5)>;
+ def: Pat<(i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)),
+ (S4_ntstbit_r I32:$Rs, I32:$Rt)>;
+}
+
+// Add extra complexity to prefer these instructions over bitsset/bitsclr.
+// The reason is that tstbit/ntstbit can be folded into a compound instruction:
+// if ([!]tstbit(...)) jump ...
+// The mask operand is matched by IsPow2_32 and converted to a bit index with
+// Log2_32 before being handed to the instruction.
+let AddedComplexity = 100 in
+def: Pat<(i1 (setne (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))),
+ (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5))>;
+
+let AddedComplexity = 100 in
+def: Pat<(i1 (seteq (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))),
+ (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5))>;
+
+// Do not increase complexity of these patterns. In the DAG, "cmp i8" may be
+// represented as a compare against "value & 0xFF", which is an exact match
+// for cmpb (same for cmph). The patterns below do not contain any additional
+// complexity that would make them preferable, and if they were actually used
+// instead of cmpb/cmph, they would result in a compare against register that
+// is loaded with the byte/half mask (i.e. 0xFF or 0xFFFF).
+def: Pat<(i1 (setne (and I32:$Rs, u6_0ImmPred:$u6), 0)),
+ (C4_nbitsclri I32:$Rs, u6_0ImmPred:$u6)>;
+def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), 0)),
+ (C4_nbitsclr I32:$Rs, I32:$Rt)>;
+// (Rs & Rt) != Rt, i.e. not all bits of Rt are set in Rs.
+def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), I32:$Rt)),
+ (C4_nbitsset I32:$Rs, I32:$Rt)>;
+
+
+// Multiply-add with an immediate addend: (Rs * x) + #u6, where x is either
+// an immediate (#U6) or a register (Rt). The addend is the first instruction
+// operand.
+def: Pat<(add (mul I32:$Rs, u6_0ImmPred:$U6), u32_0ImmPred:$u6),
+ (M4_mpyri_addi imm:$u6, IntRegs:$Rs, imm:$U6)>;
+def: Pat<(add (mul I32:$Rs, I32:$Rt), u32_0ImmPred:$u6),
+ (M4_mpyrr_addi imm:$u6, IntRegs:$Rs, IntRegs:$Rt)>;
+
+// Multiply-add with a register addend and an immediate multiplier:
+// src1 + (src3 * #src2). Note that the two instructions take the immediate
+// and the register in different operand positions.
+def: Pat<(add I32:$src1, (mul I32:$src3, u6_2ImmPred:$src2)),
+ (M4_mpyri_addr_u2 IntRegs:$src1, imm:$src2, IntRegs:$src3)>;
+def: Pat<(add I32:$src1, (mul I32:$src3, u32_0ImmPred:$src2)),
+ (M4_mpyri_addr IntRegs:$src1, IntRegs:$src3, imm:$src2)>;
+
+// Multiply-add with all three operands in registers.
+def: Pat<(add I32:$Ru, (mul (i32 IntRegs:$_src_), I32:$Rs)),
+ (M4_mpyrr_addr IntRegs:$Ru, IntRegs:$_src_, IntRegs:$Rs)>;
+
+// Signed greater-than compare on v8i8 vectors.
+def: T_vcmp_pat<A4_vcmpbgt, setgt, v8i8>;
+
+// Op((Rx ShOp #U5), #u8): a shift by immediate combined with an immediate
+// through Op, with the shift as Op's first operand. MI receives the
+// immediate first, then the shifted register and the shift amount.
+class T_Shift_CommOp_pat<InstHexagon MI, SDNode Op, SDNode ShOp>
+ : Pat<(Op (ShOp IntRegs:$Rx, u5_0ImmPred:$U5), u32_0ImmPred:$u8),
+ (MI u32_0ImmPred:$u8, IntRegs:$Rx, u5_0ImmPred:$U5)>;
+
+let AddedComplexity = 200 in {
+ def : T_Shift_CommOp_pat <S4_addi_asl_ri, add, shl>;
+ def : T_Shift_CommOp_pat <S4_addi_lsr_ri, add, srl>;
+ def : T_Shift_CommOp_pat <S4_andi_asl_ri, and, shl>;
+ def : T_Shift_CommOp_pat <S4_andi_lsr_ri, and, srl>;
+}
+
+// The "or" forms get a lower added complexity than the add/and forms above.
+let AddedComplexity = 30 in {
+ def : T_Shift_CommOp_pat <S4_ori_asl_ri, or, shl>;
+ def : T_Shift_CommOp_pat <S4_ori_lsr_ri, or, srl>;
+}
+
+// Op(#u8, (Rx ShOp #U5)): same as T_Shift_CommOp_pat but with the immediate
+// as Op's first operand; used below for sub, where operand order matters.
+class T_Shift_Op_pat<InstHexagon MI, SDNode Op, SDNode ShOp>
+ : Pat<(Op u32_0ImmPred:$u8, (ShOp IntRegs:$Rx, u5_0ImmPred:$U5)),
+ (MI u32_0ImmPred:$u8, IntRegs:$Rx, u5_0ImmPred:$U5)>;
+
+def : T_Shift_Op_pat <S4_subi_asl_ri, sub, shl>;
+def : T_Shift_Op_pat <S4_subi_lsr_ri, sub, srl>;
+
+// The same add/sub-with-shift forms where the non-shifted operand is an
+// address matched by addrga instead of a plain immediate.
+let AddedComplexity = 200 in {
+ def: Pat<(add addrga:$addr, (shl I32:$src2, u5_0ImmPred:$src3)),
+ (S4_addi_asl_ri addrga:$addr, IntRegs:$src2, u5_0ImmPred:$src3)>;
+ def: Pat<(add addrga:$addr, (srl I32:$src2, u5_0ImmPred:$src3)),
+ (S4_addi_lsr_ri addrga:$addr, IntRegs:$src2, u5_0ImmPred:$src3)>;
+ def: Pat<(sub addrga:$addr, (shl I32:$src2, u5_0ImmPred:$src3)),
+ (S4_subi_asl_ri addrga:$addr, IntRegs:$src2, u5_0ImmPred:$src3)>;
+ def: Pat<(sub addrga:$addr, (srl I32:$src2, u5_0ImmPred:$src3)),
+ (S4_subi_lsr_ri addrga:$addr, IntRegs:$src2, u5_0ImmPred:$src3)>;
+}
+
+// Left shift of an immediate by a register amount: #s6 << Rt.
+def: Pat<(shl s6_0ImmPred:$s6, I32:$Rt),
+ (S4_lsli imm:$s6, IntRegs:$Rt)>;
+
+
+//===----------------------------------------------------------------------===//
+// MEMOP
+//===----------------------------------------------------------------------===//
+
+// Accepts an i32 immediate whose value, narrowed to a signed 8-bit integer,
+// lies in [-31, -1].
+def m5_0Imm8Pred : PatLeaf<(i32 imm), [{
+ const int8_t Val = N->getSExtValue();
+ return Val >= -31 && Val < 0;
+}]>;
+
+// Accepts an i32 immediate whose value, narrowed to a signed 16-bit integer,
+// lies in [-31, -1].
+def m5_0Imm16Pred : PatLeaf<(i32 imm), [{
+ const int16_t Val = N->getSExtValue();
+ return Val >= -31 && Val < 0;
+}]>;
+
+// Accepts an i32 immediate whose sign-extended value lies in [-31, -1].
+def m5_0ImmPred : PatLeaf<(i32 imm), [{
+ const int64_t Val = N->getSExtValue();
+ return Val >= -31 && Val < 0;
+}]>;
+
+// Accepts an immediate whose bitwise complement, narrowed to 8 bits, is a
+// power of two.
+def IsNPow2_8 : PatLeaf<(i32 imm), [{
+ return isPowerOf2_32(static_cast<uint8_t>(~N->getZExtValue()));
+}]>;
+
+// Accepts an immediate whose bitwise complement, narrowed to 16 bits, is a
+// power of two.
+def IsNPow2_16 : PatLeaf<(i32 imm), [{
+ return isPowerOf2_32(static_cast<uint16_t>(~N->getZExtValue()));
+}]>;
+
+// Produces Log2_32 of the immediate narrowed to 8 bits, as an i32 target
+// constant.
+def Log2_8 : SDNodeXForm<imm, [{
+ const unsigned BitIdx = Log2_32(static_cast<uint8_t>(N->getZExtValue()));
+ return CurDAG->getTargetConstant(BitIdx, SDLoc(N), MVT::i32);
+}]>;
+
+// Produces Log2_32 of the immediate narrowed to 16 bits, as an i32 target
+// constant.
+def Log2_16 : SDNodeXForm<imm, [{
+ const unsigned BitIdx = Log2_32(static_cast<uint16_t>(N->getZExtValue()));
+ return CurDAG->getTargetConstant(BitIdx, SDLoc(N), MVT::i32);
+}]>;
+
+// Produces Log2_32 of the complemented immediate narrowed to 8 bits, as an
+// i32 target constant.
+def LogN2_8 : SDNodeXForm<imm, [{
+ const unsigned BitIdx = Log2_32(static_cast<uint8_t>(~N->getZExtValue()));
+ return CurDAG->getTargetConstant(BitIdx, SDLoc(N), MVT::i32);
+}]>;
+
+// Produces Log2_32 of the complemented immediate narrowed to 16 bits, as an
+// i32 target constant.
+def LogN2_16 : SDNodeXForm<imm, [{
+ const unsigned BitIdx = Log2_32(static_cast<uint16_t>(~N->getZExtValue()));
+ return CurDAG->getTargetConstant(BitIdx, SDLoc(N), MVT::i32);
+}]>;
+
+// Negates the immediate, narrows the result to a signed 8-bit integer and
+// emits it as an i32 target constant.
+def NegImm8 : SDNodeXForm<imm, [{
+ const int8_t Negated = -N->getSExtValue();
+ return CurDAG->getTargetConstant(Negated, SDLoc(N), MVT::i32);
+}]>;
+
+// Negates the immediate, narrows the result to a signed 16-bit integer and
+// emits it as an i32 target constant.
+def NegImm16 : SDNodeXForm<imm, [{
+ const int16_t Negated = -N->getSExtValue();
+ return CurDAG->getTargetConstant(Negated, SDLoc(N), MVT::i32);
+}]>;
+
+// Negates the immediate, narrows the result to a signed 32-bit integer and
+// emits it as an i32 target constant.
+def NegImm32 : SDNodeXForm<imm, [{
+ const int32_t Negated = -N->getSExtValue();
+ return CurDAG->getTargetConstant(Negated, SDLoc(N), MVT::i32);
+}]>;
+
+// Identity transform: hands the immediate node back unchanged.
+def IdImm : SDNodeXForm<imm, [{
+ return SDValue(N, 0);
+}]>;
+
+// Read-modify-write with a register operand and no address offset:
+//   Store(Oper(Load(Rs), A), Rs)  ->  MI Rs, 0, A
+// The load and the store must use the same address. One pattern is emitted
+// for an i32 register address and one for a frame-index address.
+multiclass Memopxr_simple_pat<PatFrag Load, PatFrag Store, SDNode Oper,
+ InstHexagon MI> {
+ // Addr: i32
+ def: Pat<(Store (Oper (Load I32:$Rs), I32:$A), I32:$Rs),
+ (MI I32:$Rs, 0, I32:$A)>;
+ // Addr: fi
+ def: Pat<(Store (Oper (Load AddrFI:$Rs), I32:$A), AddrFI:$Rs),
+ (MI AddrFI:$Rs, 0, I32:$A)>;
+}
+
+// Read-modify-write with a register operand and a base+offset address:
+//   Store(Oper(Load(Rs + Off), A), Rs + Off)  ->  MI Rs, Off, A
+// Off must satisfy ImmPred. The address may be formed with either "add" or
+// IsOrAdd, and the base may be an i32 register or a frame index.
+multiclass Memopxr_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+ SDNode Oper, InstHexagon MI> {
+ // Addr: i32
+ def: Pat<(Store (Oper (Load (add I32:$Rs, ImmPred:$Off)), I32:$A),
+ (add I32:$Rs, ImmPred:$Off)),
+ (MI I32:$Rs, imm:$Off, I32:$A)>;
+ def: Pat<(Store (Oper (Load (IsOrAdd I32:$Rs, ImmPred:$Off)), I32:$A),
+ (IsOrAdd I32:$Rs, ImmPred:$Off)),
+ (MI I32:$Rs, imm:$Off, I32:$A)>;
+ // Addr: fi
+ def: Pat<(Store (Oper (Load (add AddrFI:$Rs, ImmPred:$Off)), I32:$A),
+ (add AddrFI:$Rs, ImmPred:$Off)),
+ (MI AddrFI:$Rs, imm:$Off, I32:$A)>;
+ def: Pat<(Store (Oper (Load (IsOrAdd AddrFI:$Rs, ImmPred:$Off)), I32:$A),
+ (IsOrAdd AddrFI:$Rs, ImmPred:$Off)),
+ (MI AddrFI:$Rs, imm:$Off, I32:$A)>;
+}
+
+// Convenience wrapper: emits both the zero-offset and the base+offset
+// register-operand memop patterns for one (Load, Store, Oper, MI) combination.
+multiclass Memopxr_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+ SDNode Oper, InstHexagon MI> {
+ defm: Memopxr_simple_pat <Load, Store, Oper, MI>;
+ defm: Memopxr_add_pat <Load, Store, ImmPred, Oper, MI>;
+}
+
+// Register-operand memops (add/sub/and/or) for byte, halfword and word
+// accesses. For the sub-word sizes every load extension kind (anyext, sext,
+// zext) is paired with the matching truncating store and maps to the same
+// instruction. The offset predicate is u6_0 for bytes, u6_1 for halfwords
+// and u6_2 for words.
+let AddedComplexity = 180 in {
+ // add reg
+ defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, add,
+ /*anyext*/ L4_add_memopb_io>;
+ defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, add,
+ /*sext*/ L4_add_memopb_io>;
+ defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, add,
+ /*zext*/ L4_add_memopb_io>;
+ defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, add,
+ /*anyext*/ L4_add_memoph_io>;
+ defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, add,
+ /*sext*/ L4_add_memoph_io>;
+ defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, add,
+ /*zext*/ L4_add_memoph_io>;
+ defm: Memopxr_pat<load, store, u6_2ImmPred, add, L4_add_memopw_io>;
+
+ // sub reg
+ defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, sub,
+ /*anyext*/ L4_sub_memopb_io>;
+ defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, sub,
+ /*sext*/ L4_sub_memopb_io>;
+ defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, sub,
+ /*zext*/ L4_sub_memopb_io>;
+ defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, sub,
+ /*anyext*/ L4_sub_memoph_io>;
+ defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, sub,
+ /*sext*/ L4_sub_memoph_io>;
+ defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, sub,
+ /*zext*/ L4_sub_memoph_io>;
+ defm: Memopxr_pat<load, store, u6_2ImmPred, sub, L4_sub_memopw_io>;
+
+ // and reg
+ defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, and,
+ /*anyext*/ L4_and_memopb_io>;
+ defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, and,
+ /*sext*/ L4_and_memopb_io>;
+ defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, and,
+ /*zext*/ L4_and_memopb_io>;
+ defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, and,
+ /*anyext*/ L4_and_memoph_io>;
+ defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, and,
+ /*sext*/ L4_and_memoph_io>;
+ defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, and,
+ /*zext*/ L4_and_memoph_io>;
+ defm: Memopxr_pat<load, store, u6_2ImmPred, and, L4_and_memopw_io>;
+
+ // or reg
+ defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, or,
+ /*anyext*/ L4_or_memopb_io>;
+ defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, or,
+ /*sext*/ L4_or_memopb_io>;
+ defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, or,
+ /*zext*/ L4_or_memopb_io>;
+ defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, or,
+ /*anyext*/ L4_or_memoph_io>;
+ defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, or,
+ /*sext*/ L4_or_memoph_io>;
+ defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, or,
+ /*zext*/ L4_or_memoph_io>;
+ defm: Memopxr_pat<load, store, u6_2ImmPred, or, L4_or_memopw_io>;
+}
+
+
+// Read-modify-write with an immediate operand and no address offset:
+//   Store(Oper(Load(Rs), #A), Rs)  ->  MI Rs, 0, ArgMod(#A)
+// Arg is the predicate the immediate must satisfy and ArgMod is the transform
+// applied to it before it becomes the instruction operand. One pattern is
+// emitted for an i32 register address and one for a frame-index address.
+multiclass Memopxi_simple_pat<PatFrag Load, PatFrag Store, SDNode Oper,
+ PatFrag Arg, SDNodeXForm ArgMod,
+ InstHexagon MI> {
+ // Addr: i32
+ def: Pat<(Store (Oper (Load I32:$Rs), Arg:$A), I32:$Rs),
+ (MI I32:$Rs, 0, (ArgMod Arg:$A))>;
+ // Addr: fi
+ def: Pat<(Store (Oper (Load AddrFI:$Rs), Arg:$A), AddrFI:$Rs),
+ (MI AddrFI:$Rs, 0, (ArgMod Arg:$A))>;
+}
+
+// Read-modify-write with an immediate operand and a base+offset address:
+//   Store(Oper(Load(Rs + Off), #A), Rs + Off)  ->  MI Rs, Off, ArgMod(#A)
+// Off must satisfy ImmPred. The address may be formed with either "add" or
+// IsOrAdd, and the base may be an i32 register or a frame index.
+multiclass Memopxi_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+ SDNode Oper, PatFrag Arg, SDNodeXForm ArgMod,
+ InstHexagon MI> {
+ // Addr: i32
+ def: Pat<(Store (Oper (Load (add I32:$Rs, ImmPred:$Off)), Arg:$A),
+ (add I32:$Rs, ImmPred:$Off)),
+ (MI I32:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+ def: Pat<(Store (Oper (Load (IsOrAdd I32:$Rs, ImmPred:$Off)), Arg:$A),
+ (IsOrAdd I32:$Rs, ImmPred:$Off)),
+ (MI I32:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+ // Addr: fi
+ def: Pat<(Store (Oper (Load (add AddrFI:$Rs, ImmPred:$Off)), Arg:$A),
+ (add AddrFI:$Rs, ImmPred:$Off)),
+ (MI AddrFI:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+ def: Pat<(Store (Oper (Load (IsOrAdd AddrFI:$Rs, ImmPred:$Off)), Arg:$A),
+ (IsOrAdd AddrFI:$Rs, ImmPred:$Off)),
+ (MI AddrFI:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+}
+
+// Convenience wrapper: emits both the zero-offset and the base+offset
+// immediate-operand memop patterns for one combination of arguments.
+multiclass Memopxi_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+ SDNode Oper, PatFrag Arg, SDNodeXForm ArgMod,
+ InstHexagon MI> {
+ defm: Memopxi_simple_pat <Load, Store, Oper, Arg, ArgMod, MI>;
+ defm: Memopxi_add_pat <Load, Store, ImmPred, Oper, Arg, ArgMod, MI>;
+}
+
+
+// Immediate-operand memops for byte, halfword and word accesses. For the
+// sub-word sizes every load extension kind (anyext, sext, zext) is paired
+// with the matching truncating store and maps to the same instruction.
+//
+// Each add/sub group has two halves:
+//  - the direct form, where the immediate satisfies u5_0ImmPred and is passed
+//    through unchanged (IdImm);
+//  - the mirrored form, where the opposite operation with a small negative
+//    immediate (m5_0Imm*Pred) is turned into this operation with the negated
+//    immediate (NegImm*), e.g. "sub #-N" becomes "add #N".
+// The clrbit/setbit groups convert an and/or mask into a bit index.
+let AddedComplexity = 200 in {
+ // add imm
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, add, u5_0ImmPred,
+ /*anyext*/ IdImm, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, add, u5_0ImmPred,
+ /*sext*/ IdImm, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, add, u5_0ImmPred,
+ /*zext*/ IdImm, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, add, u5_0ImmPred,
+ /*anyext*/ IdImm, L4_iadd_memoph_io>;
+ // The sext and zext entries must use sextloadi16/zextloadi16, like every
+ // other halfword group in this block; repeating extloadi16 here would leave
+ // the sign- and zero-extending halfword loads without an add-immediate
+ // memop pattern.
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, add, u5_0ImmPred,
+ /*sext*/ IdImm, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, add, u5_0ImmPred,
+ /*zext*/ IdImm, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, add, u5_0ImmPred, IdImm,
+ L4_iadd_memopw_io>;
+ // sub of a small negative immediate -> add of its negation.
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, sub, m5_0Imm8Pred,
+ /*anyext*/ NegImm8, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, sub, m5_0Imm8Pred,
+ /*sext*/ NegImm8, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, sub, m5_0Imm8Pred,
+ /*zext*/ NegImm8, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, sub, m5_0Imm16Pred,
+ /*anyext*/ NegImm16, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, sub, m5_0Imm16Pred,
+ /*sext*/ NegImm16, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, sub, m5_0Imm16Pred,
+ /*zext*/ NegImm16, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, sub, m5_0ImmPred, NegImm32,
+ L4_iadd_memopw_io>;
+
+ // sub imm
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, sub, u5_0ImmPred,
+ /*anyext*/ IdImm, L4_isub_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, sub, u5_0ImmPred,
+ /*sext*/ IdImm, L4_isub_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, sub, u5_0ImmPred,
+ /*zext*/ IdImm, L4_isub_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, sub, u5_0ImmPred,
+ /*anyext*/ IdImm, L4_isub_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, sub, u5_0ImmPred,
+ /*sext*/ IdImm, L4_isub_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, sub, u5_0ImmPred,
+ /*zext*/ IdImm, L4_isub_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, sub, u5_0ImmPred, IdImm,
+ L4_isub_memopw_io>;
+ // add of a small negative immediate -> sub of its negation.
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, add, m5_0Imm8Pred,
+ /*anyext*/ NegImm8, L4_isub_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, add, m5_0Imm8Pred,
+ /*sext*/ NegImm8, L4_isub_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, add, m5_0Imm8Pred,
+ /*zext*/ NegImm8, L4_isub_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, add, m5_0Imm16Pred,
+ /*anyext*/ NegImm16, L4_isub_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, add, m5_0Imm16Pred,
+ /*sext*/ NegImm16, L4_isub_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, add, m5_0Imm16Pred,
+ /*zext*/ NegImm16, L4_isub_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, add, m5_0ImmPred, NegImm32,
+ L4_isub_memopw_io>;
+
+ // clrbit imm
+ // "and" with a mask whose complement is a single bit; the instruction
+ // operand is the index of that bit (LogN2_*).
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, and, IsNPow2_8,
+ /*anyext*/ LogN2_8, L4_iand_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, and, IsNPow2_8,
+ /*sext*/ LogN2_8, L4_iand_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, and, IsNPow2_8,
+ /*zext*/ LogN2_8, L4_iand_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, and, IsNPow2_16,
+ /*anyext*/ LogN2_16, L4_iand_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, and, IsNPow2_16,
+ /*sext*/ LogN2_16, L4_iand_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, and, IsNPow2_16,
+ /*zext*/ LogN2_16, L4_iand_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, and, IsNPow2_32,
+ LogN2_32, L4_iand_memopw_io>;
+
+ // setbit imm
+ // "or" with a single-bit mask; the instruction operand is the index of
+ // that bit (Log2_*).
+ // NOTE(review): the byte/halfword forms use the 32-bit predicate IsPow2_32
+ // together with the narrowing transforms Log2_8/Log2_16 -- confirm that
+ // masks with a bit set above the access width cannot reach these patterns.
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, or, IsPow2_32,
+ /*anyext*/ Log2_8, L4_ior_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, or, IsPow2_32,
+ /*sext*/ Log2_8, L4_ior_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, or, IsPow2_32,
+ /*zext*/ Log2_8, L4_ior_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, or, IsPow2_32,
+ /*anyext*/ Log2_16, L4_ior_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, or, IsPow2_32,
+ /*sext*/ Log2_16, L4_ior_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, or, IsPow2_32,
+ /*zext*/ Log2_16, L4_ior_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, or, IsPow2_32,
+ Log2_32, L4_ior_memopw_io>;
+}
+
+// Register/immediate compares: not-equal, signed less-or-equal and unsigned
+// less-or-equal.
+def : T_CMP_pat <C4_cmpneqi, setne, s32_0ImmPred>;
+def : T_CMP_pat <C4_cmpltei, setle, s32_0ImmPred>;
+def : T_CMP_pat <C4_cmplteui, setule, u9_0ImmPred>;
+
+// Map cmplt(Rs, Imm) -> !cmpgt(Rs, Imm-1).
+// Rs < Imm is rewritten as Rs <= Imm-1; SDEC1 supplies the decremented
+// immediate.
+def: Pat<(i1 (setlt I32:$src1, s32_0ImmPred:$src2)),
+ (C4_cmpltei IntRegs:$src1, (SDEC1 s32_0ImmPred:$src2))>;
+
+// Rs != #s -> !(Rs == #s).
+// NOTE(review): this appears to cover the same setne/s32_0ImmPred case as the
+// T_CMP_pat<C4_cmpneqi, ...> definition above -- confirm against T_CMP_pat.
+def: Pat<(i1 (setne I32:$src1, s32_0ImmPred:$src2)),
+ (C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>;
+
+// For the sequence
+// zext( setult ( and(Rs, 255), u8))
+// Use the isdigit transformation below
+
+
+def u7_0PosImmPred : ImmLeaf<i32, [{
+ // True if the immediate fits in a 7-bit unsigned field and
+ // is strictly greater than 0.
+ return Imm > 0 && isUInt<7>(Imm);
+}]>;
+
+
+// Generate code of the form 'C2_muxii(cmpbgtui(Rs, C-1),0,1)'
+// for C code of the form r = ((c>='0') & (c<='9')) ? 1 : 0;.
+// The isdigit transformation relies on two 'clever' aspects:
+// 1) The data type is unsigned which allows us to eliminate a zero test after
+// biasing the expression by 48. We are depending on the representation of
+// the unsigned types, and semantics.
+// 2) The front end has converted <= 9 into < 10 on entry to LLVM
+//
+// For the C code:
+// retval = ((c>='0') & (c<='9')) ? 1 : 0;
+// The code is transformed upstream of llvm into
+// retval = (c-48) < 10 ? 1 : 0;
+//
+// The pattern matches zext((Rs & 255) <u C) and emits an unsigned byte
+// "greater than C-1" compare (UDEC1 supplies C-1), selecting 0 when the
+// compare is true and 1 otherwise.
+
+let AddedComplexity = 139 in
+def: Pat<(i32 (zext (i1 (setult (and I32:$src1, 255), u7_0PosImmPred:$src2)))),
+ (C2_muxii (A4_cmpbgtui IntRegs:$src1, (UDEC1 imm:$src2)), 0, 1)>;
+
+// Load of type VT from an address matched by Addr, selected to MI with the
+// address as its only operand.
+class Loada_pat<PatFrag Load, ValueType VT, PatFrag Addr, InstHexagon MI>
+ : Pat<(VT (Load Addr:$addr)), (MI Addr:$addr)>;
+
+// Like Loada_pat, but the loaded value is post-processed by ValueMod.
+class Loadam_pat<PatFrag Load, ValueType VT, PatFrag Addr, PatFrag ValueMod,
+ InstHexagon MI>
+ : Pat<(VT (Load Addr:$addr)), (ValueMod (MI Addr:$addr))>;
+
+// Store of Value to an address matched by Addr. Note the operand swap: the
+// DAG node takes (value, address) while MI takes (address, value).
+class Storea_pat<PatFrag Store, PatFrag Value, PatFrag Addr, InstHexagon MI>
+ : Pat<(Store Value:$val, Addr:$addr), (MI Addr:$addr, Value:$val)>;
+
+// Like Storea_pat, but the value is pre-processed by ValueMod before it is
+// handed to MI.
+class Stoream_pat<PatFrag Store, PatFrag Value, PatFrag Addr, PatFrag ValueMod,
+ InstHexagon MI>
+ : Pat<(Store Value:$val, Addr:$addr),
+ (MI Addr:$addr, (ValueMod Value:$val))>;
+
+// Stores to an addrga address using the PS_storer*abs instructions.
+let AddedComplexity = 30 in {
+ def: Storea_pat<truncstorei8, I32, addrga, PS_storerbabs>;
+ def: Storea_pat<truncstorei16, I32, addrga, PS_storerhabs>;
+ def: Storea_pat<store, I32, addrga, PS_storeriabs>;
+ def: Storea_pat<store, I64, addrga, PS_storerdabs>;
+
+ // Truncating stores of an i64 value: LoReg is applied to the value and
+ // the result is stored with the 32-bit-source instruction.
+ def: Stoream_pat<truncstorei8, I64, addrga, LoReg, PS_storerbabs>;
+ def: Stoream_pat<truncstorei16, I64, addrga, LoReg, PS_storerhabs>;
+ def: Stoream_pat<truncstorei32, I64, addrga, LoReg, PS_storeriabs>;
+}
+
+// Atomic stores to an addrgp address. SwapSt wraps the atomic store fragment
+// (presumably to put its operands in the order Storea_pat expects -- confirm
+// against the SwapSt definition).
+def: Storea_pat<SwapSt<atomic_store_8>, I32, addrgp, S2_storerbgp>;
+def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, S2_storerhgp>;
+def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, S2_storerigp>;
+def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, S2_storerdgp>;
+
+// Regular stores to an addrgp address using the S2_storer*gp instructions.
+let AddedComplexity = 100 in {
+ def: Storea_pat<truncstorei8, I32, addrgp, S2_storerbgp>;
+ def: Storea_pat<truncstorei16, I32, addrgp, S2_storerhgp>;
+ def: Storea_pat<store, I32, addrgp, S2_storerigp>;
+ def: Storea_pat<store, I64, addrgp, S2_storerdgp>;
+
+ // Map from "i1 = constant<-1>; memw(CONST32(#foo)) = i1"
+ // to "r0 = 1; memw(#foo) = r0"
+ // NOTE(review): the comment above says memw, but the pattern emits
+ // S2_storerbgp (the instruction also used for truncstorei8 above).
+ // The inner "let" repeats the value already set by the enclosing block.
+ let AddedComplexity = 100 in
+ def: Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)),
+ (S2_storerbgp tglobaladdr:$global, (A2_tfrsi 1))>;
+}
+
+// Load of type VT (i32 by default) from a global address wrapped in
+// HexagonCONST32, selected to MI with the global address as its operand.
+class LoadAbs_pats <PatFrag ldOp, InstHexagon MI, ValueType VT = i32>
+ : Pat <(VT (ldOp (HexagonCONST32 tglobaladdr:$absaddr))),
+ (VT (MI tglobaladdr:$absaddr))>;
+
+// Any-extending loads and the zero-extending i1 load use the same
+// instructions as the zero-extending byte/halfword loads.
+let AddedComplexity = 30 in {
+ def: LoadAbs_pats <load, PS_loadriabs>;
+ def: LoadAbs_pats <zextloadi1, PS_loadrubabs>;
+ def: LoadAbs_pats <sextloadi8, PS_loadrbabs>;
+ def: LoadAbs_pats <extloadi8, PS_loadrubabs>;
+ def: LoadAbs_pats <zextloadi8, PS_loadrubabs>;
+ def: LoadAbs_pats <sextloadi16, PS_loadrhabs>;
+ def: LoadAbs_pats <extloadi16, PS_loadruhabs>;
+ def: LoadAbs_pats <zextloadi16, PS_loadruhabs>;
+ def: LoadAbs_pats <load, PS_loadrdabs, i64>;
+}
+
+// Zero-extending i1 load producing an i64: load with PS_loadrubabs, then
+// widen the result with ToZext64.
+let AddedComplexity = 30 in
+def: Pat<(i64 (zextloadi1 (HexagonCONST32 tglobaladdr:$absaddr))),
+ (ToZext64 (PS_loadrubabs tglobaladdr:$absaddr))>;
+
+// Atomic loads from an addrgp address; the 8- and 16-bit forms use the
+// L2_loadrubgp/L2_loadruhgp instructions.
+def: Loada_pat<atomic_load_8, i32, addrgp, L2_loadrubgp>;
+def: Loada_pat<atomic_load_16, i32, addrgp, L2_loadruhgp>;
+def: Loada_pat<atomic_load_32, i32, addrgp, L2_loadrigp>;
+def: Loada_pat<atomic_load_64, i64, addrgp, L2_loadrdgp>;
+
+// i1 loads: load with the unsigned-byte instruction, then apply I32toI1.
+def: Loadam_pat<load, i1, addrga, I32toI1, PS_loadrubabs>;
+def: Loadam_pat<load, i1, addrgp, I32toI1, L2_loadrubgp>;
+
+// i1 stores: apply I1toI32 to the value, then store it with the byte-store
+// instruction.
+def: Stoream_pat<store, I1, addrga, I1toI32, PS_storerbabs>;
+def: Stoream_pat<store, I1, addrgp, I1toI32, S2_storerbgp>;
+
+// Map from load(globaladdress) -> mem[u][bhwd](#foo)
+// Load of type VT (i32 by default) from a global address wrapped in
+// HexagonCONST32_GP, selected to MI with the global address as its operand.
+class LoadGP_pats <PatFrag ldOp, InstHexagon MI, ValueType VT = i32>
+ : Pat <(VT (ldOp (HexagonCONST32_GP tglobaladdr:$global))),
+ (VT (MI tglobaladdr:$global))>;
+
+// Any-extending loads use the same instructions as zero-extending loads.
+let AddedComplexity = 100 in {
+ def: LoadGP_pats <extloadi8, L2_loadrubgp>;
+ def: LoadGP_pats <sextloadi8, L2_loadrbgp>;
+ def: LoadGP_pats <zextloadi8, L2_loadrubgp>;
+ def: LoadGP_pats <extloadi16, L2_loadruhgp>;
+ def: LoadGP_pats <sextloadi16, L2_loadrhgp>;
+ def: LoadGP_pats <zextloadi16, L2_loadruhgp>;
+ def: LoadGP_pats <load, L2_loadrigp>;
+ def: LoadGP_pats <load, L2_loadrdgp, i64>;
+}
+
+// When the Interprocedural Global Variable optimizer realizes that a certain
+// global variable takes only two constant values, it shrinks the global to
+// a boolean. Catch those loads here in the following patterns.
+let AddedComplexity = 100 in {
+ def: LoadGP_pats <extloadi1, L2_loadrubgp>;
+ def: LoadGP_pats <zextloadi1, L2_loadrubgp>;
+}
+
+// Transfer global address into a register
+// Global addresses wrapped in HexagonCONST32, and global or block addresses
+// wrapped in HexagonCONST32_GP, are materialized with A2_tfrsi.
+def: Pat<(HexagonCONST32 tglobaladdr:$Rs), (A2_tfrsi imm:$Rs)>;
+def: Pat<(HexagonCONST32_GP tblockaddress:$Rs), (A2_tfrsi imm:$Rs)>;
+def: Pat<(HexagonCONST32_GP tglobaladdr:$Rs), (A2_tfrsi imm:$Rs)>;
+
+// Stores of a 32-bit register to a constant address matched by u32_0ImmPred.
+let AddedComplexity = 30 in {
+ def: Storea_pat<truncstorei8, I32, u32_0ImmPred, PS_storerbabs>;
+ def: Storea_pat<truncstorei16, I32, u32_0ImmPred, PS_storerhabs>;
+ def: Storea_pat<store, I32, u32_0ImmPred, PS_storeriabs>;
+}
+
+// Loads from a constant address matched by u32_0ImmPred. Only the sign- and
+// zero-extending sub-word forms are listed here (no any-extending forms).
+let AddedComplexity = 30 in {
+ def: Loada_pat<load, i32, u32_0ImmPred, PS_loadriabs>;
+ def: Loada_pat<sextloadi8, i32, u32_0ImmPred, PS_loadrbabs>;
+ def: Loada_pat<zextloadi8, i32, u32_0ImmPred, PS_loadrubabs>;
+ def: Loada_pat<sextloadi16, i32, u32_0ImmPred, PS_loadrhabs>;
+ def: Loada_pat<zextloadi16, i32, u32_0ImmPred, PS_loadruhabs>;
+}
+
+// Indexed store word - global address.
+// memw(Rs+#u6:2)=#S8
+// Word store of an addrga value at a base+offset address, with the offset
+// matched by u6_2ImmPred.
+let AddedComplexity = 100 in
+defm: Storex_add_pat<store, addrga, u6_2ImmPred, S4_storeiri_io>;
+
+// Load from a global address that has only one use in the current basic block.
+// Any-extending loads use the same instructions as zero-extending loads.
+let AddedComplexity = 100 in {
+ def: Loada_pat<extloadi8, i32, addrga, PS_loadrubabs>;
+ def: Loada_pat<sextloadi8, i32, addrga, PS_loadrbabs>;
+ def: Loada_pat<zextloadi8, i32, addrga, PS_loadrubabs>;
+
+ def: Loada_pat<extloadi16, i32, addrga, PS_loadruhabs>;
+ def: Loada_pat<sextloadi16, i32, addrga, PS_loadrhabs>;
+ def: Loada_pat<zextloadi16, i32, addrga, PS_loadruhabs>;
+
+ def: Loada_pat<load, i32, addrga, PS_loadriabs>;
+ def: Loada_pat<load, i64, addrga, PS_loadrdabs>;
+}
+
+// Store to a global address that has only one use in the current basic block.
+// NOTE(review): these repeat the addrga store patterns defined earlier with
+// AddedComplexity = 30, here with a higher complexity.
+let AddedComplexity = 100 in {
+ def: Storea_pat<truncstorei8, I32, addrga, PS_storerbabs>;
+ def: Storea_pat<truncstorei16, I32, addrga, PS_storerhabs>;
+ def: Storea_pat<store, I32, addrga, PS_storeriabs>;
+ def: Storea_pat<store, I64, addrga, PS_storerdabs>;
+
+ def: Stoream_pat<truncstorei32, I64, addrga, LoReg, PS_storeriabs>;
+}
+
+// i8/i16/i32 -> i64 loads
+// We need a complexity of 120 here to override preceding handling of
+// zextload.
+// Sign-extending loads are widened with ToSext64; any- and zero-extending
+// loads are widened with ToZext64.
+let AddedComplexity = 120 in {
+ def: Loadam_pat<extloadi8, i64, addrga, ToZext64, PS_loadrubabs>;
+ def: Loadam_pat<sextloadi8, i64, addrga, ToSext64, PS_loadrbabs>;
+ def: Loadam_pat<zextloadi8, i64, addrga, ToZext64, PS_loadrubabs>;
+
+ def: Loadam_pat<extloadi16, i64, addrga, ToZext64, PS_loadruhabs>;
+ def: Loadam_pat<sextloadi16, i64, addrga, ToSext64, PS_loadrhabs>;
+ def: Loadam_pat<zextloadi16, i64, addrga, ToZext64, PS_loadruhabs>;
+
+ def: Loadam_pat<extloadi32, i64, addrga, ToZext64, PS_loadriabs>;
+ def: Loadam_pat<sextloadi32, i64, addrga, ToSext64, PS_loadriabs>;
+ def: Loadam_pat<zextloadi32, i64, addrga, ToZext64, PS_loadriabs>;
+}
+
+// Loads from an addrgp address selected to the PS_load*abs instructions.
+let AddedComplexity = 100 in {
+ def: Loada_pat<extloadi8, i32, addrgp, PS_loadrubabs>;
+ def: Loada_pat<sextloadi8, i32, addrgp, PS_loadrbabs>;
+ def: Loada_pat<zextloadi8, i32, addrgp, PS_loadrubabs>;
+
+ def: Loada_pat<extloadi16, i32, addrgp, PS_loadruhabs>;
+ def: Loada_pat<sextloadi16, i32, addrgp, PS_loadrhabs>;
+ def: Loada_pat<zextloadi16, i32, addrgp, PS_loadruhabs>;
+
+ def: Loada_pat<load, i32, addrgp, PS_loadriabs>;
+ def: Loada_pat<load, i64, addrgp, PS_loadrdabs>;
+}
+
+// Stores to an addrgp address selected to the PS_storer*abs instructions.
+// NOTE(review): the same (store, addrgp) source patterns are mapped earlier
+// in this file, at the same AddedComplexity, to the S2_storer*gp
+// instructions -- confirm which definition is intended to win.
+let AddedComplexity = 100 in {
+ def: Storea_pat<truncstorei8, I32, addrgp, PS_storerbabs>;
+ def: Storea_pat<truncstorei16, I32, addrgp, PS_storerhabs>;
+ def: Storea_pat<store, I32, addrgp, PS_storeriabs>;
+ def: Storea_pat<store, I64, addrgp, PS_storerdabs>;
+}
+
+// Atomic loads/stores on an addrgp address selected to the PS_*abs
+// instructions.
+// NOTE(review): the same atomic source patterns are mapped earlier in this
+// file to the L2_load*gp / S2_storer*gp instructions -- confirm which
+// definition is intended to win.
+def: Loada_pat<atomic_load_8, i32, addrgp, PS_loadrubabs>;
+def: Loada_pat<atomic_load_16, i32, addrgp, PS_loadruhabs>;
+def: Loada_pat<atomic_load_32, i32, addrgp, PS_loadriabs>;
+def: Loada_pat<atomic_load_64, i64, addrgp, PS_loadrdabs>;
+
+def: Storea_pat<SwapSt<atomic_store_8>, I32, addrgp, PS_storerbabs>;
+def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, PS_storerhabs>;
+def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, PS_storeriabs>;
+def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, PS_storerdabs>;
+
+// Assemble an i64 from the low halfwords of four i32 values:
+//   a in bits 0-15, b shifted left by 16, c by 32 and d by 48.
+// The high word is built from (d, c) and the low word from (b, a) with
+// A2_combine_ll, and the two words are joined with A2_combinew.
+def: Pat<(or (or (or (shl (i64 (zext (and I32:$b, (i32 65535)))), (i32 16)),
+ (i64 (zext (i32 (and I32:$a, (i32 65535)))))),
+ (shl (i64 (anyext (and I32:$c, (i32 65535)))), (i32 32))),
+ (shl (Aext64 I32:$d), (i32 48))),
+ (A2_combinew (A2_combine_ll I32:$d, I32:$c),
+ (A2_combine_ll I32:$b, I32:$a))>;
+
+// We need custom lowering of ISD::PREFETCH into HexagonISD::DCFETCH
+// because the SDNode ISD::PREFETCH has properties MayLoad and MayStore.
+// We don't really want either one here.
+// DCFETCH has no results and two operands (a pointer and an integer) and
+// carries only the chain property.
+def SDTHexagonDCFETCH : SDTypeProfile<0, 2, [SDTCisPtrTy<0>,SDTCisInt<1>]>;
+def HexagonDCFETCH : SDNode<"HexagonISD::DCFETCH", SDTHexagonDCFETCH,
+ [SDNPHasChain]>;
+
+// Base register plus an explicit offset operand matched by u11_3ImmPred.
+def: Pat<(HexagonDCFETCH IntRegs:$Rs, u11_3ImmPred:$u11_3),
+ (Y2_dcfetchbo IntRegs:$Rs, imm:$u11_3)>;
+// Offset folded into the address as (add Rs, #u11_3) with a zero offset
+// operand: the add is absorbed into the instruction's offset field.
+def: Pat<(HexagonDCFETCH (i32 (add IntRegs:$Rs, u11_3ImmPred:$u11_3)), (i32 0)),
+ (Y2_dcfetchbo IntRegs:$Rs, imm:$u11_3)>;
+
+// Leaves matching any f32 / f64 floating-point immediate.
+def f32ImmPred : PatLeaf<(f32 fpimm:$F)>;
+def f64ImmPred : PatLeaf<(f64 fpimm:$F)>;
+
+// Reinterprets a floating-point immediate as an integer target constant with
+// the same bit pattern and the same bit width.
+def ftoi : SDNodeXForm<fpimm, [{
+ APInt I = N->getValueAPF().bitcastToAPInt();
+ return CurDAG->getTargetConstant(I.getZExtValue(), SDLoc(N),
+ MVT::getIntegerVT(I.getBitWidth()));
+}]>;
+
+
+// ((src1 >> #src2) + 1) >> 1 on i64 values (arithmetic shifts), selected to
+// S2_asr_i_p_rnd.
+def: Pat<(sra (i64 (add (sra I64:$src1, u6_0ImmPred:$src2), 1)), (i32 1)),
+ (S2_asr_i_p_rnd I64:$src1, imm:$src2)>;
+
+// POPCOUNT node: takes an i64 operand and produces an i32 result.
+def SDTHexagonI32I64: SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+ SDTCisVT<1, i64>]>;
+def HexagonPOPCOUNT: SDNode<"HexagonISD::POPCOUNT", SDTHexagonI32I64>;
+
+def: Pat<(HexagonPOPCOUNT I64:$Rss), (S5_popcountp I64:$Rss)>;
+
+// Floating-point loads and stores. f32 values use the word (ri) instructions
+// and f64 values use the doubleword (rd) instructions; the groups below differ
+// in addressing form and in AddedComplexity.
+
+// Loads with an immediate offset.
+let AddedComplexity = 20 in {
+ defm: Loadx_pat<load, f32, s30_2ImmPred, L2_loadri_io>;
+ defm: Loadx_pat<load, f64, s29_3ImmPred, L2_loadrd_io>;
+}
+
+let AddedComplexity = 60 in {
+ defm : T_LoadAbsReg_Pat <load, L4_loadri_ur, f32>;
+ defm : T_LoadAbsReg_Pat <load, L4_loadrd_ur, f64>;
+}
+
+let AddedComplexity = 40 in {
+ def: Loadxs_pat<load, f32, L4_loadri_rr>;
+ def: Loadxs_pat<load, f64, L4_loadrd_rr>;
+}
+
+let AddedComplexity = 20 in {
+ def: Loadxs_simple_pat<load, f32, L4_loadri_rr>;
+ def: Loadxs_simple_pat<load, f64, L4_loadrd_rr>;
+}
+
+// Loads from a constant or addrga address.
+// NOTE(review): there is no f64 counterpart for the u32_0ImmPred form here.
+let AddedComplexity = 80 in {
+ def: Loada_pat<load, f32, u32_0ImmPred, PS_loadriabs>;
+ def: Loada_pat<load, f32, addrga, PS_loadriabs>;
+ def: Loada_pat<load, f64, addrga, PS_loadrdabs>;
+}
+
+let AddedComplexity = 100 in {
+ def: LoadGP_pats <load, L2_loadrigp, f32>;
+ def: LoadGP_pats <load, L2_loadrdgp, f64>;
+}
+
+// Stores with an immediate offset.
+let AddedComplexity = 20 in {
+ defm: Storex_pat<store, F32, s30_2ImmPred, S2_storeri_io>;
+ defm: Storex_pat<store, F64, s29_3ImmPred, S2_storerd_io>;
+}
+
+// Simple patterns should be tried with the least priority.
+def: Storex_simple_pat<store, F32, S2_storeri_io>;
+def: Storex_simple_pat<store, F64, S2_storerd_io>;
+
+let AddedComplexity = 60 in {
+ defm : T_StoreAbsReg_Pats <S4_storeri_ur, IntRegs, f32, store>;
+ defm : T_StoreAbsReg_Pats <S4_storerd_ur, DoubleRegs, f64, store>;
+}
+
+let AddedComplexity = 40 in {
+ def: Storexs_pat<store, F32, S4_storeri_rr>;
+ def: Storexs_pat<store, F64, S4_storerd_rr>;
+}
+
+let AddedComplexity = 20 in {
+ def: Store_rr_pat<store, F32, S4_storeri_rr>;
+ def: Store_rr_pat<store, F64, S4_storerd_rr>;
+}
+
+let AddedComplexity = 80 in {
+ def: Storea_pat<store, F32, addrga, PS_storeriabs>;
+ def: Storea_pat<store, F64, addrga, PS_storerdabs>;
+}
+
+let AddedComplexity = 100 in {
+ def: Storea_pat<store, F32, addrgp, S2_storerigp>;
+ def: Storea_pat<store, F64, addrgp, S2_storerdgp>;
+}
+
+// NOTE(review): the four definitions below repeat the Storex_pat and
+// Storex_simple_pat instantiations from earlier in this group (the Storex_pat
+// ones here without AddedComplexity) -- confirm whether the repetition is
+// intentional.
+defm: Storex_pat<store, F32, s30_2ImmPred, S2_storeri_io>;
+defm: Storex_pat<store, F64, s29_3ImmPred, S2_storerd_io>;
+def: Storex_simple_pat<store, F32, S2_storeri_io>;
+def: Storex_simple_pat<store, F64, S2_storerd_io>;
+
+// Single-precision add, subtract and multiply.
+def: Pat<(fadd F32:$src1, F32:$src2),
+ (F2_sfadd F32:$src1, F32:$src2)>;
+
+def: Pat<(fsub F32:$src1, F32:$src2),
+ (F2_sfsub F32:$src1, F32:$src2)>;
+
+def: Pat<(fmul F32:$src1, F32:$src2),
+ (F2_sfmpy F32:$src1, F32:$src2)>;
+
+// Single-precision fminnum/fmaxnum, available under the HasV5T predicate.
+let Predicates = [HasV5T] in {
+ def: Pat<(f32 (fminnum F32:$Rs, F32:$Rt)), (F2_sfmin F32:$Rs, F32:$Rt)>;
+ def: Pat<(f32 (fmaxnum F32:$Rs, F32:$Rt)), (F2_sfmax F32:$Rs, F32:$Rt)>;
+}
+
+// Recognize min/max written as a select on an ordered compare of the same
+// two values.
+let AddedComplexity = 100, Predicates = [HasV5T] in {
+ // select(Cmp(Rs, Rt), Rs, Rt): the selected operands are in compare order.
+ class SfSel12<PatFrag Cmp, InstHexagon MI>
+ : Pat<(select (i1 (Cmp F32:$Rs, F32:$Rt)), F32:$Rs, F32:$Rt),
+ (MI F32:$Rs, F32:$Rt)>;
+ // select(Cmp(Rs, Rt), Rt, Rs): the selected operands are swapped, so each
+ // compare maps to the opposite min/max instruction.
+ class SfSel21<PatFrag Cmp, InstHexagon MI>
+ : Pat<(select (i1 (Cmp F32:$Rs, F32:$Rt)), F32:$Rt, F32:$Rs),
+ (MI F32:$Rs, F32:$Rt)>;
+
+ def: SfSel12<setolt, F2_sfmin>;
+ def: SfSel12<setole, F2_sfmin>;
+ def: SfSel12<setogt, F2_sfmax>;
+ def: SfSel12<setoge, F2_sfmax>;
+ def: SfSel21<setolt, F2_sfmax>;
+ def: SfSel21<setole, F2_sfmax>;
+ def: SfSel21<setogt, F2_sfmin>;
+ def: SfSel21<setoge, F2_sfmin>;
+}
+
+// Direct compare -> instruction mappings that produce an i1 predicate.
+// OpNode is the setcc fragment to match; MI is the compare instruction,
+// which receives the two operands in the same order.
+class T_fcmp32_pat<PatFrag OpNode, InstHexagon MI>
+ : Pat<(i1 (OpNode F32:$Rs, F32:$Rt)), (MI F32:$Rs, F32:$Rt)>;
+class T_fcmp64_pat<PatFrag OpNode, InstHexagon MI>
+ : Pat<(i1 (OpNode F64:$Rss, F64:$Rtt)), (MI F64:$Rss, F64:$Rtt)>;
+
+def: T_fcmp32_pat<setoge, F2_sfcmpge>;
+def: T_fcmp32_pat<setuo, F2_sfcmpuo>;
+def: T_fcmp32_pat<setoeq, F2_sfcmpeq>;
+def: T_fcmp32_pat<setogt, F2_sfcmpgt>;
+
+def: T_fcmp64_pat<setoge, F2_dfcmpge>;
+def: T_fcmp64_pat<setuo, F2_dfcmpuo>;
+def: T_fcmp64_pat<setoeq, F2_dfcmpeq>;
+def: T_fcmp64_pat<setogt, F2_dfcmpgt>;
+
+let Predicates = [HasV5T] in
+multiclass T_fcmp_pats<PatFrag cmpOp, InstHexagon IntMI, InstHexagon DoubleMI> {
+ // IntRegs: f32 compare; IntMI takes the operands in the same order.
+ def: Pat<(i1 (cmpOp F32:$src1, F32:$src2)),
+ (IntMI F32:$src1, F32:$src2)>;
+ // DoubleRegs: f64 compare; DoubleMI takes the operands in the same order.
+ def: Pat<(i1 (cmpOp F64:$src1, F64:$src2)),
+ (DoubleMI F64:$src1, F64:$src2)>;
+}
+
+defm : T_fcmp_pats <seteq, F2_sfcmpeq, F2_dfcmpeq>;
+defm : T_fcmp_pats <setgt, F2_sfcmpgt, F2_dfcmpgt>;
+defm : T_fcmp_pats <setge, F2_sfcmpge, F2_dfcmpge>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for unordered gt, ge, eq operations.
+//===----------------------------------------------------------------------===//
+let Predicates = [HasV5T] in
+multiclass unord_Pats <PatFrag cmpOp, InstHexagon IntMI, InstHexagon DoubleMI> {
+ // IntRegs: f32 result is (F2_sfcmpuo OR IntMI) on the same operands.
+ def: Pat<(i1 (cmpOp F32:$src1, F32:$src2)),
+ (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+ (IntMI F32:$src1, F32:$src2))>;
+
+ // DoubleRegs: f64 result is (F2_dfcmpuo OR DoubleMI) on the same operands.
+ def: Pat<(i1 (cmpOp F64:$src1, F64:$src2)),
+ (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+ (DoubleMI F64:$src1, F64:$src2))>;
+}
+
+defm : unord_Pats <setuge, F2_sfcmpge, F2_dfcmpge>;
+defm : unord_Pats <setugt, F2_sfcmpgt, F2_dfcmpgt>;
+defm : unord_Pats <setueq, F2_sfcmpeq, F2_dfcmpeq>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for the following dags:
+// seteq(setoeq(op1, op2), 0) -> not(setoeq(op1, op2))
+// seteq(setoeq(op1, op2), 1) -> setoeq(op1, op2)
+// setne(setoeq(op1, op2), 0) -> setoeq(op1, op2)
+// setne(setoeq(op1, op2), 1) -> not(setoeq(op1, op2))
+//===----------------------------------------------------------------------===//
+let Predicates = [HasV5T] in
+multiclass eq_ordgePats <PatFrag cmpOp, InstHexagon IntMI,
+ InstHexagon DoubleMI> {
+ // IntRegs
+ def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+ (C2_not (IntMI F32:$src1, F32:$src2))>;
+ def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+ (IntMI F32:$src1, F32:$src2)>;
+ def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+ (IntMI F32:$src1, F32:$src2)>;
+ def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+ (C2_not (IntMI F32:$src1, F32:$src2))>;
+
+ // DoubleRegs
+ def : Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+ (C2_not (DoubleMI F64:$src1, F64:$src2))>;
+ def : Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+ (DoubleMI F64:$src1, F64:$src2)>;
+ def : Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+ (DoubleMI F64:$src1, F64:$src2)>;
+ def : Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+ (C2_not (DoubleMI F64:$src1, F64:$src2))>;
+}
+
+defm : eq_ordgePats<setoeq, F2_sfcmpeq, F2_dfcmpeq>;
+defm : eq_ordgePats<setoge, F2_sfcmpge, F2_dfcmpge>;
+defm : eq_ordgePats<setogt, F2_sfcmpgt, F2_dfcmpgt>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for the following dags:
+// seteq(setolt(op1, op2), 0) -> not(setogt(op2, op1))
+// seteq(setolt(op1, op2), 1) -> setogt(op2, op1)
+// setne(setolt(op1, op2), 0) -> setogt(op2, op1)
+// setne(setolt(op1, op2), 1) -> not(setogt(op2, op1))
+//===----------------------------------------------------------------------===//
+let Predicates = [HasV5T] in
+multiclass eq_ordltPats <PatFrag cmpOp, InstHexagon IntMI,
+ InstHexagon DoubleMI> {
+ // cmpOp is an ordered "less" compare and IntMI/DoubleMI the matching
+ // "greater" instruction, so every result passes the operands swapped
+ // ($src2, $src1). Comparing the i1 compare result with 0 or 1 either keeps
+ // that result or inverts it with C2_not:
+ // seteq(x, 0) -> not x seteq(x, 1) -> x
+ // setne(x, 0) -> x setne(x, 1) -> not x
+
+ // IntRegs
+ def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+ (C2_not (IntMI F32:$src2, F32:$src1))>;
+ def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+ (IntMI F32:$src2, F32:$src1)>;
+ def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+ (IntMI F32:$src2, F32:$src1)>;
+ def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+ (C2_not (IntMI F32:$src2, F32:$src1))>;
+
+ // DoubleRegs
+ def: Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+ (C2_not (DoubleMI F64:$src2, F64:$src1))>;
+ def: Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+ (DoubleMI F64:$src2, F64:$src1)>;
+ def: Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+ (DoubleMI F64:$src2, F64:$src1)>;
+ // The inverted case is the comparison against 1 (it was a second "0" case,
+ // which contradicted the pattern directly above and left "1" unhandled).
+ def: Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+ (C2_not (DoubleMI F64:$src2, F64:$src1))>;
+}
+
+defm : eq_ordltPats<setole, F2_sfcmpge, F2_dfcmpge>;
+defm : eq_ordltPats<setolt, F2_sfcmpgt, F2_dfcmpgt>;
+
+
+// o. seto is the inverse of setuo. See http://llvm.org/docs/LangRef.html#i_fcmp
+let Predicates = [HasV5T] in {
+ def: Pat<(i1 (seto F32:$src1, F32:$src2)),
+ (C2_not (F2_sfcmpuo F32:$src2, F32:$src1))>;
+ def: Pat<(i1 (seto F32:$src1, f32ImmPred:$src2)),
+ (C2_not (F2_sfcmpuo (f32 (A2_tfrsi (ftoi $src2))), F32:$src1))>;
+ def: Pat<(i1 (seto F64:$src1, F64:$src2)),
+ (C2_not (F2_dfcmpuo F64:$src2, F64:$src1))>;
+ def: Pat<(i1 (seto F64:$src1, f64ImmPred:$src2)),
+ (C2_not (F2_dfcmpuo (CONST64 (ftoi $src2)), F64:$src1))>;
+}
+
+// Ordered lt.
+let Predicates = [HasV5T] in {
+ def: Pat<(i1 (setolt F32:$src1, F32:$src2)),
+ (F2_sfcmpgt F32:$src2, F32:$src1)>;
+ def: Pat<(i1 (setolt F32:$src1, f32ImmPred:$src2)),
+ (F2_sfcmpgt (f32 (A2_tfrsi (ftoi $src2))), F32:$src1)>;
+ def: Pat<(i1 (setolt F64:$src1, F64:$src2)),
+ (F2_dfcmpgt F64:$src2, F64:$src1)>;
+ def: Pat<(i1 (setolt F64:$src1, f64ImmPred:$src2)),
+ (F2_dfcmpgt (CONST64 (ftoi $src2)), F64:$src1)>;
+}
+
+// Unordered lt.
+let Predicates = [HasV5T] in {
+ def: Pat<(i1 (setult F32:$src1, F32:$src2)),
+ (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+ (F2_sfcmpgt F32:$src2, F32:$src1))>;
+ def: Pat<(i1 (setult F32:$src1, f32ImmPred:$src2)),
+ (C2_or (F2_sfcmpuo F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))),
+ (F2_sfcmpgt (f32 (A2_tfrsi (ftoi $src2))), F32:$src1))>;
+ def: Pat<(i1 (setult F64:$src1, F64:$src2)),
+ (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+ (F2_dfcmpgt F64:$src2, F64:$src1))>;
+ def: Pat<(i1 (setult F64:$src1, f64ImmPred:$src2)),
+ (C2_or (F2_dfcmpuo F64:$src1, (CONST64 (ftoi $src2))),
+ (F2_dfcmpgt (CONST64 (ftoi $src2)), F64:$src1))>;
+}
+
+// Ordered le.
+let Predicates = [HasV5T] in {
+ // rs <= rt -> rt >= rs.
+ def: Pat<(i1 (setole F32:$src1, F32:$src2)),
+ (F2_sfcmpge F32:$src2, F32:$src1)>;
+ def: Pat<(i1 (setole F32:$src1, f32ImmPred:$src2)),
+ (F2_sfcmpge (f32 (A2_tfrsi (ftoi $src2))), F32:$src1)>;
+
+ // Rss <= Rtt -> Rtt >= Rss.
+ def: Pat<(i1 (setole F64:$src1, F64:$src2)),
+ (F2_dfcmpge F64:$src2, F64:$src1)>;
+ def: Pat<(i1 (setole F64:$src1, f64ImmPred:$src2)),
+ (F2_dfcmpge (CONST64 (ftoi $src2)), F64:$src1)>;
+}
+
+// Unordered le.
+let Predicates = [HasV5T] in {
+// rs <= rt -> rt >= rs.
+ def: Pat<(i1 (setule F32:$src1, F32:$src2)),
+ (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+ (F2_sfcmpge F32:$src2, F32:$src1))>;
+ def: Pat<(i1 (setule F32:$src1, f32ImmPred:$src2)),
+ (C2_or (F2_sfcmpuo F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))),
+ (F2_sfcmpge (f32 (A2_tfrsi (ftoi $src2))), F32:$src1))>;
+ def: Pat<(i1 (setule F64:$src1, F64:$src2)),
+ (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+ (F2_dfcmpge F64:$src2, F64:$src1))>;
+ def: Pat<(i1 (setule F64:$src1, f64ImmPred:$src2)),
+ (C2_or (F2_dfcmpuo F64:$src1, (CONST64 (ftoi $src2))),
+ (F2_dfcmpge (CONST64 (ftoi $src2)), F64:$src1))>;
+}
+
+// Ordered ne.
+let Predicates = [HasV5T] in {
+ def: Pat<(i1 (setone F32:$src1, F32:$src2)),
+ (C2_not (F2_sfcmpeq F32:$src1, F32:$src2))>;
+ def: Pat<(i1 (setone F64:$src1, F64:$src2)),
+ (C2_not (F2_dfcmpeq F64:$src1, F64:$src2))>;
+ def: Pat<(i1 (setone F32:$src1, f32ImmPred:$src2)),
+ (C2_not (F2_sfcmpeq F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))))>;
+ def: Pat<(i1 (setone F64:$src1, f64ImmPred:$src2)),
+ (C2_not (F2_dfcmpeq F64:$src1, (CONST64 (ftoi $src2))))>;
+}
+
+// Unordered ne.
+let Predicates = [HasV5T] in {
+ def: Pat<(i1 (setune F32:$src1, F32:$src2)),
+ (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+ (C2_not (F2_sfcmpeq F32:$src1, F32:$src2)))>;
+ def: Pat<(i1 (setune F64:$src1, F64:$src2)),
+ (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+ (C2_not (F2_dfcmpeq F64:$src1, F64:$src2)))>;
+ def: Pat<(i1 (setune F32:$src1, f32ImmPred:$src2)),
+ (C2_or (F2_sfcmpuo F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))),
+ (C2_not (F2_sfcmpeq F32:$src1,
+ (f32 (A2_tfrsi (ftoi $src2))))))>;
+ def: Pat<(i1 (setune F64:$src1, f64ImmPred:$src2)),
+ (C2_or (F2_dfcmpuo F64:$src1, (CONST64 (ftoi $src2))),
+ (C2_not (F2_dfcmpeq F64:$src1,
+ (CONST64 (ftoi $src2)))))>;
+}
+
+// Besides set[o|u][comparisons], we also need set[comparisons].
+let Predicates = [HasV5T] in {
+ // lt.
+ def: Pat<(i1 (setlt F32:$src1, F32:$src2)),
+ (F2_sfcmpgt F32:$src2, F32:$src1)>;
+ def: Pat<(i1 (setlt F32:$src1, f32ImmPred:$src2)),
+ (F2_sfcmpgt (f32 (A2_tfrsi (ftoi $src2))), F32:$src1)>;
+ def: Pat<(i1 (setlt F64:$src1, F64:$src2)),
+ (F2_dfcmpgt F64:$src2, F64:$src1)>;
+ def: Pat<(i1 (setlt F64:$src1, f64ImmPred:$src2)),
+ (F2_dfcmpgt (CONST64 (ftoi $src2)), F64:$src1)>;
+
+ // le.
+ // rs <= rt -> rt >= rs.
+ def: Pat<(i1 (setle F32:$src1, F32:$src2)),
+ (F2_sfcmpge F32:$src2, F32:$src1)>;
+ def: Pat<(i1 (setle F32:$src1, f32ImmPred:$src2)),
+ (F2_sfcmpge (f32 (A2_tfrsi (ftoi $src2))), F32:$src1)>;
+
+ // Rss <= Rtt -> Rtt >= Rss.
+ def: Pat<(i1 (setle F64:$src1, F64:$src2)),
+ (F2_dfcmpge F64:$src2, F64:$src1)>;
+ def: Pat<(i1 (setle F64:$src1, f64ImmPred:$src2)),
+ (F2_dfcmpge (CONST64 (ftoi $src2)), F64:$src1)>;
+
+ // ne.
+ def: Pat<(i1 (setne F32:$src1, F32:$src2)),
+ (C2_not (F2_sfcmpeq F32:$src1, F32:$src2))>;
+ def: Pat<(i1 (setne F64:$src1, F64:$src2)),
+ (C2_not (F2_dfcmpeq F64:$src1, F64:$src2))>;
+ def: Pat<(i1 (setne F32:$src1, f32ImmPred:$src2)),
+ (C2_not (F2_sfcmpeq F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))))>;
+ def: Pat<(i1 (setne F64:$src1, f64ImmPred:$src2)),
+ (C2_not (F2_dfcmpeq F64:$src1, (CONST64 (ftoi $src2))))>;
+}
+
+
+def: Pat<(f64 (fpextend F32:$Rs)), (F2_conv_sf2df F32:$Rs)>;
+def: Pat<(f32 (fpround F64:$Rs)), (F2_conv_df2sf F64:$Rs)>;
+
+def: Pat<(f32 (sint_to_fp I32:$Rs)), (F2_conv_w2sf I32:$Rs)>;
+def: Pat<(f32 (sint_to_fp I64:$Rs)), (F2_conv_d2sf I64:$Rs)>;
+def: Pat<(f64 (sint_to_fp I32:$Rs)), (F2_conv_w2df I32:$Rs)>;
+def: Pat<(f64 (sint_to_fp I64:$Rs)), (F2_conv_d2df I64:$Rs)>;
+
+def: Pat<(f32 (uint_to_fp I32:$Rs)), (F2_conv_uw2sf I32:$Rs)>;
+def: Pat<(f32 (uint_to_fp I64:$Rs)), (F2_conv_ud2sf I64:$Rs)>;
+def: Pat<(f64 (uint_to_fp I32:$Rs)), (F2_conv_uw2df I32:$Rs)>;
+def: Pat<(f64 (uint_to_fp I64:$Rs)), (F2_conv_ud2df I64:$Rs)>;
+
+def: Pat<(i32 (fp_to_sint F32:$Rs)), (F2_conv_sf2w_chop F32:$Rs)>;
+def: Pat<(i32 (fp_to_sint F64:$Rs)), (F2_conv_df2w_chop F64:$Rs)>;
+def: Pat<(i64 (fp_to_sint F32:$Rs)), (F2_conv_sf2d_chop F32:$Rs)>;
+def: Pat<(i64 (fp_to_sint F64:$Rs)), (F2_conv_df2d_chop F64:$Rs)>;
+
+def: Pat<(i32 (fp_to_uint F32:$Rs)), (F2_conv_sf2uw_chop F32:$Rs)>;
+def: Pat<(i32 (fp_to_uint F64:$Rs)), (F2_conv_df2uw_chop F64:$Rs)>;
+def: Pat<(i64 (fp_to_uint F32:$Rs)), (F2_conv_sf2ud_chop F32:$Rs)>;
+def: Pat<(i64 (fp_to_uint F64:$Rs)), (F2_conv_df2ud_chop F64:$Rs)>;
+
+// Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
+let Predicates = [HasV5T] in {
+ def: Pat <(i32 (bitconvert F32:$src)), (I32:$src)>;
+ def: Pat <(f32 (bitconvert I32:$src)), (F32:$src)>;
+ def: Pat <(i64 (bitconvert F64:$src)), (I64:$src)>;
+ def: Pat <(f64 (bitconvert I64:$src)), (F64:$src)>;
+}
+
+def : Pat <(fma F32:$src2, F32:$src3, F32:$src1),
+ (F2_sffma F32:$src1, F32:$src2, F32:$src3)>;
+
+def : Pat <(fma (fneg F32:$src2), F32:$src3, F32:$src1),
+ (F2_sffms F32:$src1, F32:$src2, F32:$src3)>;
+
+def : Pat <(fma F32:$src2, (fneg F32:$src3), F32:$src1),
+ (F2_sffms F32:$src1, F32:$src2, F32:$src3)>;
+
+def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$imm),
+ (C2_muxir I1:$Pu, F32:$Rs, (ftoi $imm))>,
+ Requires<[HasV5T]>;
+
+def: Pat<(select I1:$Pu, f32ImmPred:$imm, F32:$Rt),
+ (C2_muxri I1:$Pu, (ftoi $imm), F32:$Rt)>,
+ Requires<[HasV5T]>;
+
+def: Pat<(select I1:$src1, F32:$src2, F32:$src3),
+ (C2_mux I1:$src1, F32:$src2, F32:$src3)>,
+ Requires<[HasV5T]>;
+
+// select(setult(a, b), x, y): setult is true when a < b or when the operands
+// are unordered, i.e. exactly when the ordered compare a >= b is false.
+// F2_sfcmpge is the ordered ">=" (it is what setoge selects), so test that and
+// swap the mux arms: a >= b picks y, everything else (a < b or NaN) picks x.
+def: Pat<(select (i1 (setult F32:$src1, F32:$src2)), F32:$src3, F32:$src4),
+ (C2_mux (F2_sfcmpge F32:$src1, F32:$src2), F32:$src4, F32:$src3)>,
+ Requires<[HasV5T]>;
+
+def: Pat<(select I1:$src1, F64:$src2, F64:$src3),
+ (C2_vmux I1:$src1, F64:$src2, F64:$src3)>,
+ Requires<[HasV5T]>;
+
+// select(setult(a, b), x, y) on f64: setult is true when a < b or when the
+// operands are unordered, i.e. exactly when the ordered compare a >= b is
+// false. F2_dfcmpge is the ordered ">=" (it is what setoge selects), so test
+// that and swap the mux arms: a >= b picks y, a < b or NaN picks x.
+def: Pat<(select (i1 (setult F64:$src1, F64:$src2)), F64:$src3, F64:$src4),
+ (C2_vmux (F2_dfcmpge F64:$src1, F64:$src2), F64:$src4, F64:$src3)>,
+ Requires<[HasV5T]>;
+
+// Map from p1 = pnot(p0); r0 = select(p1, #i, r1)
+// => r0 = mux(p0, r1, #i)
+def: Pat<(select (not I1:$src1), f32ImmPred:$src2, F32:$src3),
+ (C2_muxir I1:$src1, F32:$src3, (ftoi $src2))>,
+ Requires<[HasV5T]>;
+
+// Map from p1 = pnot(p0); r0 = select(p1, r1, #i)
+// => r0 = mux(p0, #i, r1)
+def: Pat<(select (not I1:$src1), F32:$src2, f32ImmPred:$src3),
+ (C2_muxri I1:$src1, (ftoi $src3), F32:$src2)>,
+ Requires<[HasV5T]>;
+
+def: Pat<(i32 (fp_to_sint F64:$src1)), // NOTE(review): same source DAG as the
+ (LoReg (F2_conv_df2d_chop F64:$src1))>, // F2_conv_df2w_chop pattern earlier in
+ Requires<[HasV5T]>; // this file; confirm which one should be selected.
+
+def : Pat <(fabs F32:$src1),
+ (S2_clrbit_i F32:$src1, 31)>,
+ Requires<[HasV5T]>;
+
+def : Pat <(fneg F32:$src1),
+ (S2_togglebit_i F32:$src1, 31)>,
+ Requires<[HasV5T]>;
+
+def: Pat<(fabs F64:$Rs),
+ (REG_SEQUENCE DoubleRegs,
+ (S2_clrbit_i (HiReg $Rs), 31), isub_hi,
+ (i32 (LoReg $Rs)), isub_lo)>;
+
+def: Pat<(fneg F64:$Rs),
+ (REG_SEQUENCE DoubleRegs,
+ (S2_togglebit_i (HiReg $Rs), 31), isub_hi,
+ (i32 (LoReg $Rs)), isub_lo)>;
+
+def alignedload : PatFrag<(ops node:$addr), (load $addr), [{
+ return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedload : PatFrag<(ops node:$addr), (load $addr), [{
+ return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def alignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [{
+ return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [{
+ return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+
+// i32 immediates accepted as offsets by the HVX load/store address folds:
+// the sign-extended value must satisfy isShiftedInt<4,6> (s4_6) or
+// isShiftedInt<4,7> (s4_7).
+def s4_6ImmPred: PatLeaf<(i32 imm), [{
+ return isShiftedInt<4,6>(N->getSExtValue());
+}]>;
+
+def s4_7ImmPred: PatLeaf<(i32 imm), [{
+ return isShiftedInt<4,7>(N->getSExtValue());
+}]>;
+
+
+multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
+ // Aligned stores
+ def : Pat<(alignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
+ (V6_vS32b_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+ def : Pat<(unalignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
+ (V6_vS32Ub_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+
+ // 128B Aligned stores
+ def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
+ (V6_vS32b_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+ def : Pat<(unalignedstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
+ (V6_vS32Ub_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+
+ // Fold Add R+OFF into vector store.
+ let AddedComplexity = 10 in {
+ def : Pat<(alignedstore (VTSgl VectorRegs:$src1),
+ (add IntRegs:$src2, s4_6ImmPred:$offset)),
+ (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset,
+ (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+ def : Pat<(unalignedstore (VTSgl VectorRegs:$src1),
+ (add IntRegs:$src2, s4_6ImmPred:$offset)),
+ (V6_vS32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset,
+ (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+
+ // Fold Add R+OFF into vector store 128B.
+ def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1),
+ (add IntRegs:$src2, s4_7ImmPred:$offset)),
+ (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+ (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+ def : Pat<(unalignedstore (VTDbl VectorRegs128B:$src1),
+ (add IntRegs:$src2, s4_7ImmPred:$offset)),
+ (V6_vS32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+ (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+ }
+}
+
+defm : vS32b_ai_pats <v64i8, v128i8>;
+defm : vS32b_ai_pats <v32i16, v64i16>;
+defm : vS32b_ai_pats <v16i32, v32i32>;
+defm : vS32b_ai_pats <v8i64, v16i64>;
+
+
+multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
+ // Aligned loads
+ def : Pat < (VTSgl (alignedload IntRegs:$addr)),
+ (V6_vL32b_ai IntRegs:$addr, 0) >,
+ Requires<[UseHVXSgl]>;
+ def : Pat < (VTSgl (unalignedload IntRegs:$addr)),
+ (V6_vL32Ub_ai IntRegs:$addr, 0) >,
+ Requires<[UseHVXSgl]>;
+
+ // 128B Load
+ def : Pat < (VTDbl (alignedload IntRegs:$addr)),
+ (V6_vL32b_ai_128B IntRegs:$addr, 0) >,
+ Requires<[UseHVXDbl]>;
+ def : Pat < (VTDbl (unalignedload IntRegs:$addr)),
+ (V6_vL32Ub_ai_128B IntRegs:$addr, 0) >,
+ Requires<[UseHVXDbl]>;
+
+ // Fold Add R+OFF into vector load.
+ let AddedComplexity = 10 in {
+ def : Pat<(VTDbl (alignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
+ (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+ Requires<[UseHVXDbl]>;
+ def : Pat<(VTDbl (unalignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
+ (V6_vL32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+ Requires<[UseHVXDbl]>;
+
+ def : Pat<(VTSgl (alignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
+ (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+ Requires<[UseHVXSgl]>;
+ def : Pat<(VTSgl (unalignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
+ (V6_vL32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+ Requires<[UseHVXSgl]>;
+ }
+}
+
+defm : vL32b_ai_pats <v64i8, v128i8>;
+defm : vL32b_ai_pats <v32i16, v64i16>;
+defm : vL32b_ai_pats <v16i32, v32i32>;
+defm : vL32b_ai_pats <v8i64, v16i64>;
+
+multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> {
+ def : Pat<(alignedstore (VTSgl VecDblRegs:$src1), IntRegs:$addr),
+ (PS_vstorerw_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+ def : Pat<(unalignedstore (VTSgl VecDblRegs:$src1), IntRegs:$addr),
+ (PS_vstorerwu_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+
+ def : Pat<(alignedstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr),
+ (PS_vstorerw_ai_128B IntRegs:$addr, 0,
+ (VTDbl VecDblRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+ def : Pat<(unalignedstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr),
+ (PS_vstorerwu_ai_128B IntRegs:$addr, 0,
+ (VTDbl VecDblRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+}
+
+defm : STrivv_pats <v128i8, v256i8>;
+defm : STrivv_pats <v64i16, v128i16>;
+defm : STrivv_pats <v32i32, v64i32>;
+defm : STrivv_pats <v16i64, v32i64>;
+
+multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> {
+ def : Pat<(VTSgl (alignedload I32:$addr)),
+ (PS_vloadrw_ai I32:$addr, 0)>,
+ Requires<[UseHVXSgl]>;
+ def : Pat<(VTSgl (unalignedload I32:$addr)),
+ (PS_vloadrwu_ai I32:$addr, 0)>,
+ Requires<[UseHVXSgl]>;
+
+ def : Pat<(VTDbl (alignedload I32:$addr)),
+ (PS_vloadrw_ai_128B I32:$addr, 0)>,
+ Requires<[UseHVXDbl]>;
+ def : Pat<(VTDbl (unalignedload I32:$addr)),
+ (PS_vloadrwu_ai_128B I32:$addr, 0)>,
+ Requires<[UseHVXDbl]>;
+}
+
+defm : LDrivv_pats <v128i8, v256i8>;
+defm : LDrivv_pats <v64i16, v128i16>;
+defm : LDrivv_pats <v32i32, v64i32>;
+defm : LDrivv_pats <v16i64, v32i64>;
+
+let Predicates = [HasV60T,UseHVXSgl] in {
+ def: Pat<(select I1:$Pu, (v16i32 VectorRegs:$Vs), VectorRegs:$Vt),
+ (PS_vselect I1:$Pu, VectorRegs:$Vs, VectorRegs:$Vt)>;
+ def: Pat<(select I1:$Pu, (v32i32 VecDblRegs:$Vs), VecDblRegs:$Vt),
+ (PS_wselect I1:$Pu, VecDblRegs:$Vs, VecDblRegs:$Vt)>;
+}
+let Predicates = [HasV60T,UseHVXDbl] in {
+ def: Pat<(select I1:$Pu, (v32i32 VectorRegs128B:$Vs), VectorRegs128B:$Vt),
+ (PS_vselect_128B I1:$Pu, VectorRegs128B:$Vs, VectorRegs128B:$Vt)>;
+ def: Pat<(select I1:$Pu, (v64i32 VecDblRegs128B:$Vs), VecDblRegs128B:$Vt),
+ (PS_wselect_128B I1:$Pu, VecDblRegs128B:$Vs, VecDblRegs128B:$Vt)>;
+}
+
+
+def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>,
+ SDTCisSubVecOfVec<1, 0>]>;
+
+def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>;
+
+def: Pat<(v32i32 (HexagonVCOMBINE (v16i32 VectorRegs:$Vs),
+ (v16i32 VectorRegs:$Vt))),
+ (V6_vcombine VectorRegs:$Vs, VectorRegs:$Vt)>,
+ Requires<[UseHVXSgl]>;
+def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs),
+ (v32i32 VecDblRegs:$Vt))),
+ (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+ Requires<[UseHVXDbl]>;
+
+def SDTHexagonVPACK: SDTypeProfile<1, 3, [SDTCisSameAs<1, 2>,
+ SDTCisInt<3>]>;
+
+def HexagonVPACK: SDNode<"HexagonISD::VPACK", SDTHexagonVPACK>;
+
+// 0 as the last argument denotes vpacke. 1 denotes vpacko
+def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs),
+ (v64i8 VectorRegs:$Vt), (i32 0))),
+ (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>,
+ Requires<[UseHVXSgl]>;
+def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs),
+ (v64i8 VectorRegs:$Vt), (i32 1))),
+ (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>,
+ Requires<[UseHVXSgl]>;
+def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs),
+ (v32i16 VectorRegs:$Vt), (i32 0))),
+ (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>,
+ Requires<[UseHVXSgl]>;
+def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs),
+ (v32i16 VectorRegs:$Vt), (i32 1))),
+ (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>,
+ Requires<[UseHVXSgl]>;
+
+def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs),
+ (v128i8 VecDblRegs:$Vt), (i32 0))),
+ (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+ Requires<[UseHVXDbl]>;
+def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs),
+ (v128i8 VecDblRegs:$Vt), (i32 1))),
+ (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+ Requires<[UseHVXDbl]>;
+def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs),
+ (v64i16 VecDblRegs:$Vt), (i32 0))),
+ (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+ Requires<[UseHVXDbl]>;
+def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs),
+ (v64i16 VecDblRegs:$Vt), (i32 1))),
+ (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+ Requires<[UseHVXDbl]>;
+
+def V2I1: PatLeaf<(v2i1 PredRegs:$R)>;
+def V4I1: PatLeaf<(v4i1 PredRegs:$R)>;
+def V8I1: PatLeaf<(v8i1 PredRegs:$R)>;
+def V4I8: PatLeaf<(v4i8 IntRegs:$R)>;
+def V2I16: PatLeaf<(v2i16 IntRegs:$R)>;
+def V8I8: PatLeaf<(v8i8 DoubleRegs:$R)>;
+def V4I16: PatLeaf<(v4i16 DoubleRegs:$R)>;
+def V2I32: PatLeaf<(v2i32 DoubleRegs:$R)>;
+
+
+// A bitcast between two types held in the same register class needs no
+// instruction: each multiclass emits both directions (a -> b and b -> a) as a
+// plain reuse of the source register under the new type.
+multiclass bitconvert_32<ValueType a, ValueType b> {
+ def : Pat <(b (bitconvert (a IntRegs:$Rs))), (b IntRegs:$Rs)>;
+ def : Pat <(a (bitconvert (b IntRegs:$Rs))), (a IntRegs:$Rs)>;
+}
+
+multiclass bitconvert_64<ValueType a, ValueType b> {
+ def : Pat <(b (bitconvert (a DoubleRegs:$Rss))), (b DoubleRegs:$Rss)>;
+ def : Pat <(a (bitconvert (b DoubleRegs:$Rss))), (a DoubleRegs:$Rss)>;
+}
+
+// Bit convert vector types to integers.
+defm : bitconvert_32<v4i8, i32>;
+defm : bitconvert_32<v2i16, i32>;
+defm : bitconvert_64<v8i8, i64>;
+defm : bitconvert_64<v4i16, i64>;
+defm : bitconvert_64<v2i32, i64>;
+
+def: Pat<(sra (v4i16 DoubleRegs:$src1), u4_0ImmPred:$src2),
+ (S2_asr_i_vh DoubleRegs:$src1, imm:$src2)>;
+def: Pat<(srl (v4i16 DoubleRegs:$src1), u4_0ImmPred:$src2),
+ (S2_lsr_i_vh DoubleRegs:$src1, imm:$src2)>;
+def: Pat<(shl (v4i16 DoubleRegs:$src1), u4_0ImmPred:$src2),
+ (S2_asl_i_vh DoubleRegs:$src1, imm:$src2)>;
+
+def: Pat<(sra (v2i32 DoubleRegs:$src1), u5_0ImmPred:$src2),
+ (S2_asr_i_vw DoubleRegs:$src1, imm:$src2)>;
+def: Pat<(srl (v2i32 DoubleRegs:$src1), u5_0ImmPred:$src2),
+ (S2_lsr_i_vw DoubleRegs:$src1, imm:$src2)>;
+def: Pat<(shl (v2i32 DoubleRegs:$src1), u5_0ImmPred:$src2),
+ (S2_asl_i_vw DoubleRegs:$src1, imm:$src2)>;
+
+def : Pat<(v2i16 (add (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))),
+ (A2_svaddh IntRegs:$src1, IntRegs:$src2)>;
+
+def : Pat<(v2i16 (sub (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))),
+ (A2_svsubh IntRegs:$src1, IntRegs:$src2)>;
+
+def HexagonVSPLATB: SDNode<"HexagonISD::VSPLATB", SDTUnaryOp>;
+def HexagonVSPLATH: SDNode<"HexagonISD::VSPLATH", SDTUnaryOp>;
+
+// Replicate the low 8-bits from 32-bits input register into each of the
+// four bytes of 32-bits destination register.
+def: Pat<(v4i8 (HexagonVSPLATB I32:$Rs)), (S2_vsplatrb I32:$Rs)>;
+
+// Replicate the low 16-bits from 32-bits input register into each of the
+// four halfwords of 64-bits destination register.
+def: Pat<(v4i16 (HexagonVSPLATH I32:$Rs)), (S2_vsplatrh I32:$Rs)>;
+
+
+// Two-operand vector arithmetic/logic: (Op a, b) on the typed leaf `Type`
+// selects MI with the operands in the same order.
+class VArith_pat <InstHexagon MI, SDNode Op, PatFrag Type>
+ : Pat <(Op Type:$Va, Type:$Vb), (MI Type:$Va, Type:$Vb)>;
+
+def: VArith_pat <A2_vaddub, add, V8I8>;
+def: VArith_pat <A2_vaddh, add, V4I16>;
+def: VArith_pat <A2_vaddw, add, V2I32>;
+def: VArith_pat <A2_vsubub, sub, V8I8>;
+def: VArith_pat <A2_vsubh, sub, V4I16>;
+def: VArith_pat <A2_vsubw, sub, V2I32>;
+
+def: VArith_pat <A2_and, and, V2I16>;
+def: VArith_pat <A2_xor, xor, V2I16>;
+def: VArith_pat <A2_or, or, V2I16>;
+
+def: VArith_pat <A2_andp, and, V8I8>;
+def: VArith_pat <A2_andp, and, V4I16>;
+def: VArith_pat <A2_andp, and, V2I32>;
+def: VArith_pat <A2_orp, or, V8I8>;
+def: VArith_pat <A2_orp, or, V4I16>;
+def: VArith_pat <A2_orp, or, V2I32>;
+def: VArith_pat <A2_xorp, xor, V8I8>;
+def: VArith_pat <A2_xorp, xor, V4I16>;
+def: VArith_pat <A2_xorp, xor, V2I32>;
+
+def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c),
+ (i32 u5_0ImmPred:$c))))),
+ (S2_asr_i_vw V2I32:$b, imm:$c)>;
+def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c),
+ (i32 u5_0ImmPred:$c))))),
+ (S2_lsr_i_vw V2I32:$b, imm:$c)>;
+def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c),
+ (i32 u5_0ImmPred:$c))))),
+ (S2_asl_i_vw V2I32:$b, imm:$c)>;
+
+def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))),
+ (S2_asr_i_vh V4I16:$b, imm:$c)>;
+def: Pat<(v4i16 (srl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))),
+ (S2_lsr_i_vh V4I16:$b, imm:$c)>;
+def: Pat<(v4i16 (shl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))),
+ (S2_asl_i_vh V4I16:$b, imm:$c)>;
+
+
+def SDTHexagon_v2i32_v2i32_i32 : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>, SDTCisVT<0, v2i32>, SDTCisInt<2>]>;
+def SDTHexagon_v4i16_v4i16_i32 : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>, SDTCisVT<0, v4i16>, SDTCisInt<2>]>;
+
+def HexagonVSRAW: SDNode<"HexagonISD::VSRAW", SDTHexagon_v2i32_v2i32_i32>;
+def HexagonVSRAH: SDNode<"HexagonISD::VSRAH", SDTHexagon_v4i16_v4i16_i32>;
+def HexagonVSRLW: SDNode<"HexagonISD::VSRLW", SDTHexagon_v2i32_v2i32_i32>;
+def HexagonVSRLH: SDNode<"HexagonISD::VSRLH", SDTHexagon_v4i16_v4i16_i32>;
+def HexagonVSHLW: SDNode<"HexagonISD::VSHLW", SDTHexagon_v2i32_v2i32_i32>;
+def HexagonVSHLH: SDNode<"HexagonISD::VSHLH", SDTHexagon_v4i16_v4i16_i32>;
+
+def: Pat<(v2i32 (HexagonVSRAW V2I32:$Rs, u5_0ImmPred:$u5)),
+ (S2_asr_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVSRAH V4I16:$Rs, u4_0ImmPred:$u4)),
+ (S2_asr_i_vh V4I16:$Rs, imm:$u4)>;
+def: Pat<(v2i32 (HexagonVSRLW V2I32:$Rs, u5_0ImmPred:$u5)),
+ (S2_lsr_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVSRLH V4I16:$Rs, u4_0ImmPred:$u4)),
+ (S2_lsr_i_vh V4I16:$Rs, imm:$u4)>;
+def: Pat<(v2i32 (HexagonVSHLW V2I32:$Rs, u5_0ImmPred:$u5)),
+ (S2_asl_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVSHLH V4I16:$Rs, u4_0ImmPred:$u4)),
+ (S2_asl_i_vh V4I16:$Rs, imm:$u4)>;
+
+class vshift_rr_pat<InstHexagon MI, SDNode Op, PatFrag Value>
+ : Pat <(Op Value:$Rs, I32:$Rt),
+ (MI Value:$Rs, I32:$Rt)>;
+
+def: vshift_rr_pat <S2_asr_r_vw, HexagonVSRAW, V2I32>;
+def: vshift_rr_pat <S2_asr_r_vh, HexagonVSRAH, V4I16>;
+def: vshift_rr_pat <S2_lsr_r_vw, HexagonVSRLW, V2I32>;
+def: vshift_rr_pat <S2_lsr_r_vh, HexagonVSRLH, V4I16>;
+def: vshift_rr_pat <S2_asl_r_vw, HexagonVSHLW, V2I32>;
+def: vshift_rr_pat <S2_asl_r_vh, HexagonVSHLH, V4I16>;
+
+
+def SDTHexagonVecCompare_v8i8 : SDTypeProfile<1, 2,
+ [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v8i8>]>;
+def SDTHexagonVecCompare_v4i16 : SDTypeProfile<1, 2,
+ [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v4i16>]>;
+def SDTHexagonVecCompare_v2i32 : SDTypeProfile<1, 2,
+ [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v2i32>]>;
+
+def HexagonVCMPBEQ: SDNode<"HexagonISD::VCMPBEQ", SDTHexagonVecCompare_v8i8>;
+def HexagonVCMPBGT: SDNode<"HexagonISD::VCMPBGT", SDTHexagonVecCompare_v8i8>;
+def HexagonVCMPBGTU: SDNode<"HexagonISD::VCMPBGTU", SDTHexagonVecCompare_v8i8>;
+def HexagonVCMPHEQ: SDNode<"HexagonISD::VCMPHEQ", SDTHexagonVecCompare_v4i16>;
+def HexagonVCMPHGT: SDNode<"HexagonISD::VCMPHGT", SDTHexagonVecCompare_v4i16>;
+def HexagonVCMPHGTU: SDNode<"HexagonISD::VCMPHGTU", SDTHexagonVecCompare_v4i16>;
+def HexagonVCMPWEQ: SDNode<"HexagonISD::VCMPWEQ", SDTHexagonVecCompare_v2i32>;
+def HexagonVCMPWGT: SDNode<"HexagonISD::VCMPWGT", SDTHexagonVecCompare_v2i32>;
+def HexagonVCMPWGTU: SDNode<"HexagonISD::VCMPWGTU", SDTHexagonVecCompare_v2i32>;
+
+
+class vcmp_i1_pat<InstHexagon MI, SDNode Op, PatFrag Value>
+ : Pat <(i1 (Op Value:$Rs, Value:$Rt)),
+ (MI Value:$Rs, Value:$Rt)>;
+
+def: vcmp_i1_pat<A2_vcmpbeq, HexagonVCMPBEQ, V8I8>;
+def: vcmp_i1_pat<A4_vcmpbgt, HexagonVCMPBGT, V8I8>;
+def: vcmp_i1_pat<A2_vcmpbgtu, HexagonVCMPBGTU, V8I8>;
+
+def: vcmp_i1_pat<A2_vcmpheq, HexagonVCMPHEQ, V4I16>;
+def: vcmp_i1_pat<A2_vcmphgt, HexagonVCMPHGT, V4I16>;
+def: vcmp_i1_pat<A2_vcmphgtu, HexagonVCMPHGTU, V4I16>;
+
+def: vcmp_i1_pat<A2_vcmpweq, HexagonVCMPWEQ, V2I32>;
+def: vcmp_i1_pat<A2_vcmpwgt, HexagonVCMPWGT, V2I32>;
+def: vcmp_i1_pat<A2_vcmpwgtu, HexagonVCMPWGTU, V2I32>;
+
+
+class vcmp_vi1_pat<InstHexagon MI, PatFrag Op, PatFrag InVal, ValueType OutTy>
+ : Pat <(OutTy (Op InVal:$Rs, InVal:$Rt)),
+ (MI InVal:$Rs, InVal:$Rt)>;
+
+def: vcmp_vi1_pat<A2_vcmpweq, seteq, V2I32, v2i1>;
+def: vcmp_vi1_pat<A2_vcmpwgt, setgt, V2I32, v2i1>;
+def: vcmp_vi1_pat<A2_vcmpwgtu, setugt, V2I32, v2i1>;
+
+def: vcmp_vi1_pat<A2_vcmpheq, seteq, V4I16, v4i1>;
+def: vcmp_vi1_pat<A2_vcmphgt, setgt, V4I16, v4i1>;
+def: vcmp_vi1_pat<A2_vcmphgtu, setugt, V4I16, v4i1>;
+
+def: Pat<(mul V2I32:$Rs, V2I32:$Rt),
+ (PS_vmulw DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+def: Pat<(add V2I32:$Rx, (mul V2I32:$Rs, V2I32:$Rt)),
+ (PS_vmulw_acc DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+
+
+// Adds two v4i8: Hexagon does not have an insn for this one, so we
+// use the double add v8i8, and use only the low part of the result.
+def: Pat<(v4i8 (add (v4i8 IntRegs:$Rs), (v4i8 IntRegs:$Rt))),
+ (LoReg (A2_vaddub (ToZext64 $Rs), (ToZext64 $Rt)))>;
+
+// Subtract two v4i8: Hexagon does not have an insn for this one, so we
+// use the double sub v8i8, and use only the low part of the result.
+def: Pat<(v4i8 (sub (v4i8 IntRegs:$Rs), (v4i8 IntRegs:$Rt))),
+ (LoReg (A2_vsubub (ToZext64 $Rs), (ToZext64 $Rt)))>;
+
+//
+// No 32 bit vector mux.
+//
+def: Pat<(v4i8 (select I1:$Pu, V4I8:$Rs, V4I8:$Rt)),
+ (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
+def: Pat<(v2i16 (select I1:$Pu, V2I16:$Rs, V2I16:$Rt)),
+ (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
+
+//
+// 64-bit vector mux.
+//
+def: Pat<(v8i8 (vselect V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)),
+ (C2_vmux V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)>;
+def: Pat<(v4i16 (vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt)),
+ (C2_vmux V4I1:$Pu, V4I16:$Rs, V4I16:$Rt)>;
+def: Pat<(v2i32 (vselect V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)),
+ (C2_vmux V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)>;
+
+//
+// No 32 bit vector compare.
+//
+def: Pat<(i1 (seteq V4I8:$Rs, V4I8:$Rt)),
+ (A2_vcmpbeq (ToZext64 $Rs), (ToZext64 $Rt))>;
+def: Pat<(i1 (setgt V4I8:$Rs, V4I8:$Rt)),
+ (A4_vcmpbgt (ToZext64 $Rs), (ToZext64 $Rt))>;
+def: Pat<(i1 (setugt V4I8:$Rs, V4I8:$Rt)),
+ (A2_vcmpbgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
+
+def: Pat<(i1 (seteq V2I16:$Rs, V2I16:$Rt)),
+ (A2_vcmpheq (ToZext64 $Rs), (ToZext64 $Rt))>;
+def: Pat<(i1 (setgt V2I16:$Rs, V2I16:$Rt)),
+ (A2_vcmphgt (ToZext64 $Rs), (ToZext64 $Rt))>;
+def: Pat<(i1 (setugt V2I16:$Rs, V2I16:$Rt)),
+ (A2_vcmphgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
+
+
+// Selects InvMI for CmpOp with the two operands exchanged. CmpTy is the result
+// type of the matched compare.
+class InvertCmp_pat<InstHexagon InvMI, PatFrag CmpOp, PatFrag Value,
+ ValueType CmpTy>
+ : Pat<(CmpTy (CmpOp Value:$Lhs, Value:$Rhs)),
+ (InvMI Value:$Rhs, Value:$Lhs)>;
+
+// Map from a compare operation to the corresponding instruction with the
+// order of operands reversed, e.g. x > y --> cmp.lt(y,x).
+def: InvertCmp_pat<A4_vcmpbgt, setlt, V8I8, i1>;
+def: InvertCmp_pat<A4_vcmpbgt, setlt, V8I8, v8i1>;
+def: InvertCmp_pat<A2_vcmphgt, setlt, V4I16, i1>;
+def: InvertCmp_pat<A2_vcmphgt, setlt, V4I16, v4i1>;
+def: InvertCmp_pat<A2_vcmpwgt, setlt, V2I32, i1>;
+def: InvertCmp_pat<A2_vcmpwgt, setlt, V2I32, v2i1>;
+
+def: InvertCmp_pat<A2_vcmpbgtu, setult, V8I8, i1>;
+def: InvertCmp_pat<A2_vcmpbgtu, setult, V8I8, v8i1>;
+def: InvertCmp_pat<A2_vcmphgtu, setult, V4I16, i1>;
+def: InvertCmp_pat<A2_vcmphgtu, setult, V4I16, v4i1>;
+def: InvertCmp_pat<A2_vcmpwgtu, setult, V2I32, i1>;
+def: InvertCmp_pat<A2_vcmpwgtu, setult, V2I32, v2i1>;
+
+// Map from vcmpne(Rss) -> !vcmpeq(Rss).
+// rs != rt -> !(rs == rt).
+// The operands are vectors of two 32-bit words and the result is a v2i1, so
+// the word compare (A2_vcmpweq) is required. A byte compare would set the
+// predicate per byte, and its negation would flag only the differing bytes
+// of a word rather than the whole word lane.
+def: Pat<(v2i1 (setne V2I32:$Rs, V2I32:$Rt)),
+         (C2_not (v2i1 (A2_vcmpweq V2I32:$Rs, V2I32:$Rt)))>;
+
+
+// Truncate: from vector B copy all 'E'ven 'B'yte elements:
+// A[0] = B[0]; A[1] = B[2]; A[2] = B[4]; A[3] = B[6];
+def: Pat<(v4i8 (trunc V4I16:$Rs)),
+ (S2_vtrunehb V4I16:$Rs)>;
+
+// Truncate: from vector B copy all 'O'dd 'B'yte elements:
+// A[0] = B[1]; A[1] = B[3]; A[2] = B[5]; A[3] = B[7];
+// S2_vtrunohb
+// (No selection pattern is defined here for S2_vtrunohb.)
+
+// Truncate: from vectors B and C copy all 'E'ven 'H'alf-word elements:
+// A[0] = B[0]; A[1] = B[2]; A[2] = C[0]; A[3] = C[2];
+// S2_vtruneh
+// (No selection pattern is defined here for S2_vtruneh.)
+
+// Truncate v2i32 to v2i16: S2_packhl is applied to the high and low words
+// of $Rs and the low register of its result is kept.
+def: Pat<(v2i16 (trunc V2I32:$Rs)),
+ (LoReg (S2_packhl (HiReg $Rs), (LoReg $Rs)))>;
+
+
+// Target-specific unary DAG nodes for vector sign extension.
+def HexagonVSXTBH : SDNode<"HexagonISD::VSXTBH", SDTUnaryOp>;
+def HexagonVSXTBW : SDNode<"HexagonISD::VSXTBW", SDTUnaryOp>;
+
+// NOTE(review): VSXTBW is selected to S2_vsxthw (halfword -> word) although
+// the node name suggests byte -> word; confirm this is intended.
+def: Pat<(i64 (HexagonVSXTBH I32:$Rs)), (S2_vsxtbh I32:$Rs)>;
+def: Pat<(i64 (HexagonVSXTBW I32:$Rs)), (S2_vsxthw I32:$Rs)>;
+
+// Generic vector extensions. anyext is selected to the same instructions
+// as zext.
+def: Pat<(v4i16 (zext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (zext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
+def: Pat<(v4i16 (anyext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (anyext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
+def: Pat<(v4i16 (sext V4I8:$Rs)), (S2_vsxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (sext V2I16:$Rs)), (S2_vsxthw V2I16:$Rs)>;
+
+// Sign extends a v2i8 into a v2i32: each 32-bit half is sign-extended from
+// its low byte separately and the halves are recombined.
+def: Pat<(v2i32 (sext_inreg V2I32:$Rs, v2i8)),
+ (A2_combinew (A2_sxtb (HiReg $Rs)), (A2_sxtb (LoReg $Rs)))>;
+
+// Sign extends a v2i16 into a v2i32: each 32-bit half is sign-extended from
+// its low halfword separately and the halves are recombined.
+def: Pat<(v2i32 (sext_inreg V2I32:$Rs, v2i16)),
+ (A2_combinew (A2_sxth (HiReg $Rs)), (A2_sxth (LoReg $Rs)))>;
+
+
+// Multiplies two v2i16 and returns a v2i32. We are using here the
+// saturating multiply, as Hexagon does not provide a non-saturating
+// vector multiply, and saturation does not impact the result that is
+// in double precision of the operands.
+
+// Multiplies two v2i16 vectors: as Hexagon does not have a multiply
+// with the C semantics for this one, this pattern uses the half word
+// multiply vmpyh that takes two v2i16 and returns a v2i32. This is
+// then truncated to fit this back into a v2i16 and to simulate the
+// wrap around semantics for unsigned in C.
+def vmpyh: OutPatFrag<(ops node:$Rs, node:$Rt),
+ (M2_vmpy2s_s0 (i32 $Rs), (i32 $Rt))>;
+
+// The product is passed as the second operand of S2_vtrunewh, with a zero
+// pair as the first; only the low register of the result is used.
+def: Pat<(v2i16 (mul V2I16:$Rs, V2I16:$Rt)),
+ (LoReg (S2_vtrunewh (v2i32 (A2_combineii 0, 0)),
+ (v2i32 (vmpyh V2I16:$Rs, V2I16:$Rt))))>;
+
+// Multiplies two v4i16 vectors: the high and low halves are multiplied
+// separately and the two v2i32 products are narrowed with S2_vtrunewh.
+def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
+ (S2_vtrunewh (vmpyh (HiReg $Rs), (HiReg $Rt)),
+ (vmpyh (LoReg $Rs), (LoReg $Rt)))>;
+
+// Byte multiply built without M5_vmpybsu: both inputs are sign-extended to
+// halfwords with S2_vsxtbh, multiplied pairwise with vmpyh, and narrowed
+// with S2_vtrunewh.
+def VMPYB_no_V5: OutPatFrag<(ops node:$Rs, node:$Rt),
+ (S2_vtrunewh (vmpyh (HiReg (S2_vsxtbh $Rs)), (HiReg (S2_vsxtbh $Rt))),
+ (vmpyh (LoReg (S2_vsxtbh $Rs)), (LoReg (S2_vsxtbh $Rt))))>;
+
+// Multiplies two v4i8 vectors.
+// This variant uses M5_vmpybsu and is guarded by HasV5T.
+def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
+ (S2_vtrunehb (M5_vmpybsu V4I8:$Rs, V4I8:$Rt))>,
+ Requires<[HasV5T]>;
+
+// Unguarded variant using VMPYB_no_V5.
+// NOTE(review): this matches the same input as the HasV5T pattern above and
+// neither sets a priority; confirm which one is expected to win on V5+.
+def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
+ (S2_vtrunehb (VMPYB_no_V5 V4I8:$Rs, V4I8:$Rt))>;
+
+// Multiplies two v8i8 vectors: each 32-bit half is multiplied as a v4i8
+// and the halves are recombined. This variant is guarded by HasV5T.
+def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
+ (A2_combinew (S2_vtrunehb (M5_vmpybsu (HiReg $Rs), (HiReg $Rt))),
+ (S2_vtrunehb (M5_vmpybsu (LoReg $Rs), (LoReg $Rt))))>,
+ Requires<[HasV5T]>;
+
+// Unguarded v8i8 variant using VMPYB_no_V5.
+def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
+ (A2_combinew (S2_vtrunehb (VMPYB_no_V5 (HiReg $Rs), (HiReg $Rt))),
+ (S2_vtrunehb (VMPYB_no_V5 (LoReg $Rs), (LoReg $Rt))))>;
+
+// Type profile for a binary operation whose result and both operands are i64.
+def SDTHexagonBinOp64 : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>]>;
+
+// Target-specific shuffle nodes (even/odd bytes, even/odd halfwords).
+def HexagonSHUFFEB: SDNode<"HexagonISD::SHUFFEB", SDTHexagonBinOp64>;
+def HexagonSHUFFEH: SDNode<"HexagonISD::SHUFFEH", SDTHexagonBinOp64>;
+def HexagonSHUFFOB: SDNode<"HexagonISD::SHUFFOB", SDTHexagonBinOp64>;
+def HexagonSHUFFOH: SDNode<"HexagonISD::SHUFFOH", SDTHexagonBinOp64>;
+
+// Selects shuffle node Op directly to instruction MI, keeping operand order.
+class ShufflePat<InstHexagon MI, SDNode Op>
+ : Pat<(i64 (Op DoubleRegs:$src1, DoubleRegs:$src2)),
+ (i64 (MI DoubleRegs:$src1, DoubleRegs:$src2))>;
+
+// Shuffles even bytes for i=0..3: A[2*i].b = C[2*i].b; A[2*i+1].b = B[2*i].b
+def: ShufflePat<S2_shuffeb, HexagonSHUFFEB>;
+
+// Shuffles odd bytes for i=0..3: A[2*i].b = C[2*i+1].b; A[2*i+1].b = B[2*i+1].b
+def: ShufflePat<S2_shuffob, HexagonSHUFFOB>;
+
+// Shuffles even half for i=0,1: A[2*i].h = C[2*i].h; A[2*i+1].h = B[2*i].h
+def: ShufflePat<S2_shuffeh, HexagonSHUFFEH>;
+
+// Shuffles odd half for i=0,1: A[2*i].h = C[2*i+1].h; A[2*i+1].h = B[2*i+1].h
+def: ShufflePat<S2_shuffoh, HexagonSHUFFOH>;
+
+
+// Truncated store from v4i16 to v4i8.
+// Matches a truncstore whose memory type is v4i8.
+def truncstorev4i8: PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr),
+ [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i8; }]>;
+
+// Truncated store from v2i32 to v2i16.
+// Matches a truncstore whose memory type is v2i16.
+def truncstorev2i16: PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr),
+ [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i16; }]>;
+
+// Narrow the value the same way the v2i32 -> v2i16 trunc pattern does
+// (S2_packhl, low register), then store it as a word at offset 0.
+def: Pat<(truncstorev2i16 V2I32:$Rs, I32:$Rt),
+ (S2_storeri_io I32:$Rt, 0, (LoReg (S2_packhl (HiReg $Rs),
+ (LoReg $Rs))))>;
+
+// Narrow with S2_vtrunehb, then store the result as a word at offset 0.
+def: Pat<(truncstorev4i8 V4I16:$Rs, I32:$Rt),
+ (S2_storeri_io I32:$Rt, 0, (S2_vtrunehb V4I16:$Rs))>;
+
+
+// Zero and sign extended load from v2i8 into v2i16 or v2i32.
+// These fragments match extending loads whose memory type is v2i8.
+def zextloadv2i8: PatFrag<(ops node:$ptr), (zextload node:$ptr),
+ [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v2i8; }]>;
+
+def sextloadv2i8: PatFrag<(ops node:$ptr), (sextload node:$ptr),
+ [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v2i8; }]>;
+
+// v2i8 -> v2i16: load the two bytes as one halfword from offset 0, extend
+// bytes to halfwords, and keep the low register of the result.
+def: Pat<(v2i16 (zextloadv2i8 I32:$Rs)),
+ (LoReg (v4i16 (S2_vzxtbh (L2_loadruh_io I32:$Rs, 0))))>;
+
+def: Pat<(v2i16 (sextloadv2i8 I32:$Rs)),
+ (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0))))>;
+
+// v2i8 -> v2i32: same as above, followed by a halfword-to-word extension.
+def: Pat<(v2i32 (zextloadv2i8 I32:$Rs)),
+ (S2_vzxthw (LoReg (v4i16 (S2_vzxtbh (L2_loadruh_io I32:$Rs, 0)))))>;
+
+def: Pat<(v2i32 (sextloadv2i8 I32:$Rs)),
+ (S2_vsxthw (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0)))))>;
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
new file mode 100644
index 000000000000..ee3209354688
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -0,0 +1,338 @@
+//===-- HexagonPeephole.cpp - Hexagon Peephole Optimizations --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This peephole pass optimizes in the following cases.
+// 1. Optimizes redundant sign extends for the following case
+// Transform the following pattern
+// %vreg170<def> = SXTW %vreg166
+// ...
+// %vreg176<def> = COPY %vreg170:isub_lo
+//
+// Into
+// %vreg176<def> = COPY %vreg166
+//
+// 2. Optimizes redundant negation of predicates.
+// %vreg15<def> = CMPGTrr %vreg6, %vreg2
+// ...
+// %vreg16<def> = NOT_p %vreg15<kill>
+// ...
+// JMP_c %vreg16<kill>, <BB#1>, %PC<imp-def,dead>
+//
+// Into
+// %vreg15<def> = CMPGTrr %vreg6, %vreg2;
+// ...
+// JMP_cNot %vreg15<kill>, <BB#1>, %PC<imp-def,dead>;
+//
+// Note: The peephole pass makes instructions like
+// %vreg170<def> = SXTW %vreg166 or %vreg16<def> = NOT_p %vreg15<kill>
+// redundant and relies on some form of dead instruction removal, like
+// DCE or DIE, to actually eliminate them.
+
+
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-peephole"
+
+// Hidden command-line switches controlling this pass.
+
+// Turns the whole pass off. Default: pass enabled.
+static cl::opt<bool> DisableHexagonPeephole("disable-hexagon-peephole",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Peephole Optimization"));
+
+// Turns off the rewriting of users of negated predicates. Default: enabled.
+static cl::opt<bool> DisablePNotP("disable-hexagon-pnotp",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Optimization of PNotP"));
+
+// Turns off the sign-extend tracking and the COPY rewriting.
+// Note the default is true, i.e. this optimization is off unless requested.
+static cl::opt<bool> DisableOptSZExt("disable-hexagon-optszext",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true),
+ cl::desc("Disable Optimization of Sign/Zero Extends"));
+
+// Turns off the tracking of A4_combineir extensions to 64 bits.
+// Note the default is true, i.e. this optimization is off unless requested.
+static cl::opt<bool> DisableOptExtTo64("disable-hexagon-opt-ext-to-64",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true),
+ cl::desc("Disable Optimization of extensions to i64."));
+
+// Declarations of the pass factory and of the pass-registration hook.
+namespace llvm {
+ FunctionPass *createHexagonPeephole();
+ void initializeHexagonPeepholePass(PassRegistry&);
+}
+
+namespace {
+ // Machine-function pass implementing the peephole rewrites described in
+ // the file header.
+ struct HexagonPeephole : public MachineFunctionPass {
+ // Per-function target info; assigned at the start of runOnMachineFunction.
+ const HexagonInstrInfo *QII;
+ const HexagonRegisterInfo *QRI;
+ const MachineRegisterInfo *MRI;
+
+ public:
+ static char ID;
+ // Registers the pass with the global PassRegistry on construction.
+ HexagonPeephole() : MachineFunctionPass(ID) {
+ initializeHexagonPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "Hexagon optimize redundant zero and size extends";
+ }
+
+ // No analyses are requested beyond the MachineFunctionPass defaults.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ // Rewrites operand Dst in place so that it holds the value of Src.
+ void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src);
+ };
+}
+
+// Pass identification; the address of ID is what identifies the pass.
+char HexagonPeephole::ID = 0;
+
+// Registers the pass under the command-line name "hexagon-peephole".
+INITIALIZE_PASS(HexagonPeephole, "hexagon-peephole", "Hexagon Peephole",
+ false, false)
+
+/// Runs the peephole rewrites over every basic block of \p MF.
+///
+/// Within each block the pass records, per defined virtual register, the
+/// register it was derived from (A2_sxtw, A4_combineir with a zero immediate,
+/// S2_lsr_i_p by 32, C2_not). It then uses those records to
+///  - forward the original source into COPYs of the isub_lo subregister,
+///  - replace a predicated instruction guarded by a negated predicate with
+///    the inverted-sense opcode guarded by the original predicate,
+///  - undo a negated predicate on C2_mux* by swapping the two data operands.
+/// The maps are cleared at the start of every block, so nothing is forwarded
+/// across block boundaries. Instructions made redundant are not erased here.
+///
+/// \returns true if at least one instruction was rewritten.
+bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  QRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  // DstReg -> SrcReg for sign extends, zero-extending combines and
+  // predicate negations.
+  DenseMap<unsigned, unsigned> PeepholeMap;
+  // DstReg -> (SrcReg, subregister index) for 64-bit shifts by 32.
+  DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap;
+
+  if (DisableHexagonPeephole)
+    return false;
+
+  bool Changed = false;
+
+  // Loop over all of the basic blocks.
+  for (MachineBasicBlock &MBB : MF) {
+    PeepholeMap.clear();
+    PeepholeDoubleRegsMap.clear();
+
+    // Traverse the basic block.
+    for (MachineInstr &MI : MBB) {
+      // Look for sign extends:
+      // %vreg170<def> = SXTW %vreg166
+      if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) {
+        assert(MI.getNumOperands() == 2);
+        unsigned DstReg = MI.getOperand(0).getReg();
+        unsigned SrcReg = MI.getOperand(1).getReg();
+        // Just handle virtual registers.
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Map the following:
+          // %vreg170<def> = SXTW %vreg166
+          // PeepholeMap[170] = vreg166
+          PeepholeMap[DstReg] = SrcReg;
+        }
+      }
+
+      // Look for %vreg170<def> = COMBINE_ir_V4 (0, %vreg169)
+      // %vreg170:DoubleRegs, %vreg169:IntRegs
+      if (!DisableOptExtTo64 && MI.getOpcode() == Hexagon::A4_combineir) {
+        assert(MI.getNumOperands() == 3);
+        MachineOperand &Dst = MI.getOperand(0);
+        MachineOperand &Src1 = MI.getOperand(1);
+        MachineOperand &Src2 = MI.getOperand(2);
+        if (Src1.getImm() != 0)
+          continue;
+        PeepholeMap[Dst.getReg()] = Src2.getReg();
+      }
+
+      // Look for this sequence below
+      // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
+      // %vregIntReg = COPY %vregDoubleReg1:isub_lo.
+      // and convert into
+      // %vregIntReg = COPY %vregDoubleReg0:isub_hi.
+      if (MI.getOpcode() == Hexagon::S2_lsr_i_p) {
+        assert(MI.getNumOperands() == 3);
+        MachineOperand &Dst = MI.getOperand(0);
+        MachineOperand &Src1 = MI.getOperand(1);
+        MachineOperand &Src2 = MI.getOperand(2);
+        if (Src2.getImm() != 32)
+          continue;
+        PeepholeDoubleRegsMap[Dst.getReg()] =
+            std::pair<unsigned, unsigned>(Src1.getReg(), Hexagon::isub_hi);
+      }
+
+      // Look for P=NOT(P).
+      if (!DisablePNotP && MI.getOpcode() == Hexagon::C2_not) {
+        assert(MI.getNumOperands() == 2);
+        unsigned DstReg = MI.getOperand(0).getReg();
+        unsigned SrcReg = MI.getOperand(1).getReg();
+        // Just handle virtual registers.
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Map the following:
+          // %vreg170<def> = NOT_xx %vreg166
+          // PeepholeMap[170] = vreg166
+          PeepholeMap[DstReg] = SrcReg;
+        }
+      }
+
+      // Look for copy:
+      // %vreg176<def> = COPY %vreg170:isub_lo
+      if (!DisableOptSZExt && MI.isCopy()) {
+        assert(MI.getNumOperands() == 2);
+        MachineOperand &Dst = MI.getOperand(0);
+        MachineOperand &Src = MI.getOperand(1);
+
+        // Make sure we are copying the lower 32 bits.
+        if (Src.getSubReg() != Hexagon::isub_lo)
+          continue;
+
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Try to find in the map.
+          if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
+            // Change the 1st operand.
+            MI.RemoveOperand(1);
+            MI.addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
+            // The forwarded register gains a use here; an earlier use may
+            // carry a kill flag that is no longer accurate.
+            MRI->clearKillFlags(PeepholeSrc);
+            Changed = true;
+          } else {
+            DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
+                PeepholeDoubleRegsMap.find(SrcReg);
+            if (DI != PeepholeDoubleRegsMap.end()) {
+              std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
+              MI.RemoveOperand(1);
+              MI.addOperand(MachineOperand::CreateReg(
+                  PeepholeSrc.first, false /*isDef*/, false /*isImp*/,
+                  false /*isKill*/, false /*isDead*/, false /*isUndef*/,
+                  false /*isEarlyClobber*/, PeepholeSrc.second));
+              // Same reasoning as above: drop possibly stale kill flags.
+              MRI->clearKillFlags(PeepholeSrc.first);
+              Changed = true;
+            }
+          }
+        }
+      }
+
+      // Look for Predicated instructions.
+      if (!DisablePNotP) {
+        bool Done = false;
+        if (QII->isPredicated(MI)) {
+          MachineOperand &Op0 = MI.getOperand(0);
+          // Only virtual registers have an entry in the register-class table
+          // and in PeepholeMap, so check that before asking for the class.
+          if (Op0.isReg() &&
+              TargetRegisterInfo::isVirtualRegister(Op0.getReg())) {
+            unsigned Reg0 = Op0.getReg();
+            const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0);
+            // Handle instructions that have a predicate register in op0
+            // (most cases of predicable instructions).
+            if (RC0->getID() == Hexagon::PredRegsRegClassID) {
+              // Try to find in the map.
+              if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) {
+                // Change the 1st operand and flip the opcode.
+                MI.getOperand(0).setReg(PeepholeSrc);
+                MRI->clearKillFlags(PeepholeSrc);
+                int NewOp = QII->getInvertedPredicatedOpcode(MI.getOpcode());
+                MI.setDesc(QII->get(NewOp));
+                Done = true;
+                Changed = true;
+              }
+            }
+          }
+        }
+
+        if (!Done) {
+          // Handle special instructions.
+          unsigned Op = MI.getOpcode();
+          unsigned NewOp = 0;
+          const unsigned PR = 1, S1 = 2, S2 = 3;   // Operand indices.
+
+          switch (Op) {
+            case Hexagon::C2_mux:
+            case Hexagon::C2_muxii:
+              // Both data operands have the same kind: keep the opcode.
+              NewOp = Op;
+              break;
+            case Hexagon::C2_muxri:
+              NewOp = Hexagon::C2_muxir;
+              break;
+            case Hexagon::C2_muxir:
+              NewOp = Hexagon::C2_muxri;
+              break;
+            default:
+              break;
+          }
+          if (NewOp) {
+            unsigned PSrc = MI.getOperand(PR).getReg();
+            if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
+              MI.getOperand(PR).setReg(POrig);
+              MRI->clearKillFlags(POrig);
+              MI.setDesc(QII->get(NewOp));
+              // Swap operands S1 and S2. Work from copies, since the
+              // operands are rewritten in place.
+              MachineOperand Op1 = MI.getOperand(S1);
+              MachineOperand Op2 = MI.getOperand(S2);
+              ChangeOpInto(MI.getOperand(S1), Op2);
+              ChangeOpInto(MI.getOperand(S2), Op1);
+              Changed = true;
+            }
+          } // if (NewOp)
+        } // if (!Done)
+
+      } // if (!DisablePNotP)
+
+    } // Instruction
+  } // Basic Block
+  return Changed;
+}
+
+/// Rewrites \p Dst in place so that it carries the value of \p Src.
+/// Only register and immediate operands are supported on either side; any
+/// other operand kind is treated as unreachable. \p Dst and \p Src must be
+/// distinct operand objects.
+void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) {
+  assert (&Dst != &Src && "Cannot duplicate into itself");
+
+  const bool DstIsReg = Dst.getType() == MachineOperand::MO_Register;
+  const bool DstIsImm = Dst.getType() == MachineOperand::MO_Immediate;
+  if (!DstIsReg && !DstIsImm)
+    llvm_unreachable("Unexpected dst operand type");
+  if (!Src.isReg() && !Src.isImm())
+    llvm_unreachable("Unexpected src operand type");
+
+  if (DstIsReg) {
+    if (Src.isImm()) {
+      // Register slot becomes an immediate.
+      Dst.ChangeToImmediate(Src.getImm());
+      return;
+    }
+    // Register to register: take over register and subregister, and drop
+    // kill flags on the source register.
+    Dst.setReg(Src.getReg());
+    Dst.setSubReg(Src.getSubReg());
+    MRI->clearKillFlags(Src.getReg());
+    return;
+  }
+
+  // Dst is an immediate from here on.
+  if (Src.isImm()) {
+    Dst.setImm(Src.getImm());
+    return;
+  }
+  // Immediate slot becomes a register; the kill flag is deliberately not
+  // carried over (always passed as false).
+  Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(),
+                       false, Src.isDead(), Src.isUndef(),
+                       Src.isDebug());
+  Dst.setSubReg(Src.getSubReg());
+}
+
+// Factory: returns a newly allocated HexagonPeephole pass.
+FunctionPass *llvm::createHexagonPeephole() {
+ return new HexagonPeephole();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
new file mode 100644
index 000000000000..30640e19ebac
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -0,0 +1,330 @@
+//===--- HexagonRDFOpt.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "RDFCopy.h"
+#include "RDFDeadCode.h"
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+using namespace rdf;
+
+// Declarations of the pass-registration hook and of the pass factory.
+namespace llvm {
+ void initializeHexagonRDFOptPass(PassRegistry&);
+ FunctionPass *createHexagonRDFOpt();
+}
+
+namespace {
+ // Number of functions processed so far while -rdf-limit is in effect.
+ unsigned RDFCount = 0;
+ // Upper bound compared against RDFCount in runOnMachineFunction.
+ cl::opt<unsigned> RDFLimit("rdf-limit", cl::init(UINT_MAX));
+ // Enables dumping of the function and graph, and tracing in the sub-passes.
+ cl::opt<bool> RDFDump("rdf-dump", cl::init(false));
+
+ // Machine-function pass that builds a data-flow graph and runs copy
+ // propagation and dead code elimination on it.
+ class HexagonRDFOpt : public MachineFunctionPass {
+ public:
+ HexagonRDFOpt() : MachineFunctionPass(ID) {
+ initializeHexagonRDFOptPass(*PassRegistry::getPassRegistry());
+ }
+ // Requires the dominator tree and dominance frontier, and declares all
+ // analyses preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ StringRef getPassName() const override {
+ return "Hexagon RDF optimizations";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // The pass requires the NoVRegs property on its input.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ static char ID;
+
+ private:
+ // Both are assigned in runOnMachineFunction.
+ MachineDominatorTree *MDT;
+ MachineRegisterInfo *MRI;
+ };
+
+ char HexagonRDFOpt::ID = 0;
+}
+
+// Registers the pass as "rdfopt" together with the two analyses it depends on.
+INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false)
+
+
+namespace {
+// Copy propagation extended with Hexagon instructions that act as copies.
+struct HexagonCP : public CopyPropagation {
+ HexagonCP(DataFlowGraph &G) : CopyPropagation(G) {}
+ bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) override;
+};
+
+
+// Dead code elimination extended with a rewrite of partly dead
+// post-increment loads and stores.
+struct HexagonDCE : public DeadCodeElimination {
+ HexagonDCE(DataFlowGraph &G, MachineRegisterInfo &MRI)
+ : DeadCodeElimination(G, MRI) {}
+ // Tries to rewrite the partly dead instruction IA; nodes that become
+ // removable are added to Remove. Returns true if IA was rewritten.
+ bool rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove);
+ // Removes operand OpNum from IA's machine instruction and re-points the
+ // instruction's reference nodes at the shifted operands.
+ void removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum);
+
+ bool run();
+};
+} // end anonymous namespace
+
+
+/// Records in \p EM the register equalities established by \p MI when it
+/// behaves like a copy:
+///  - A2_combinew: the isub_hi / isub_lo halves of the destination equal the
+///    second and third operands respectively;
+///  - A2_addi with a zero immediate, and A2_tfr: the destination equals the
+///    source.
+/// Any other instruction is deferred to CopyPropagation::interpretAsCopy.
+///
+/// \returns true if \p MI was recognized as a copy.
+bool HexagonCP::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
+  // Only EM is needed by the helper.
+  auto mapRegs = [&EM] (RegisterRef DstR, RegisterRef SrcR) -> void {
+    EM.insert(std::make_pair(DstR, SrcR));
+  };
+
+  DataFlowGraph &DFG = getDFG();
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case Hexagon::A2_combinew: {
+      const MachineOperand &DstOp = MI->getOperand(0);
+      const MachineOperand &HiOp = MI->getOperand(1);
+      const MachineOperand &LoOp = MI->getOperand(2);
+      assert(DstOp.getSubReg() == 0 && "Unexpected subregister");
+      mapRegs(DFG.makeRegRef(DstOp.getReg(), Hexagon::isub_hi),
+              DFG.makeRegRef(HiOp.getReg(),  HiOp.getSubReg()));
+      mapRegs(DFG.makeRegRef(DstOp.getReg(), Hexagon::isub_lo),
+              DFG.makeRegRef(LoOp.getReg(), LoOp.getSubReg()));
+      return true;
+    }
+    case Hexagon::A2_addi: {
+      // An add of immediate 0 is a plain register transfer.
+      const MachineOperand &A = MI->getOperand(2);
+      if (!A.isImm() || A.getImm() != 0)
+        return false;
+      LLVM_FALLTHROUGH;
+    }
+    case Hexagon::A2_tfr: {
+      const MachineOperand &DstOp = MI->getOperand(0);
+      const MachineOperand &SrcOp = MI->getOperand(1);
+      mapRegs(DFG.makeRegRef(DstOp.getReg(), DstOp.getSubReg()),
+              DFG.makeRegRef(SrcOp.getReg(), SrcOp.getSubReg()));
+      return true;
+    }
+  }
+
+  return CopyPropagation::interpretAsCopy(MI, EM);
+}
+
+
+/// Runs dead code elimination: collects dead nodes, tries to rewrite
+/// statements that have a dead def but are not dead as a whole ("partly
+/// dead"), and erases everything marked for removal.
+///
+/// \returns true if anything was rewritten or erased.
+bool HexagonDCE::run() {
+  bool Collected = collect();
+  if (!Collected)
+    return false;
+
+  const SetVector<NodeId> &DeadNodes = getDeadNodes();
+  const SetVector<NodeId> &DeadInstrs = getDeadInstrs();
+
+  SetVector<NodeId> PartlyDead;
+  DataFlowGraph &DFG = getDFG();
+
+  // A statement is partly dead if at least one of its defs is dead while
+  // the statement itself is not in the dead-instruction set.
+  for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
+    for (auto TA : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Stmt>, DFG)) {
+      NodeAddr<StmtNode*> SA = TA;
+      for (NodeAddr<RefNode*> RA : SA.Addr->members(DFG)) {
+        if (DFG.IsDef(RA) && DeadNodes.count(RA.Id))
+          if (!DeadInstrs.count(SA.Id))
+            PartlyDead.insert(SA.Id);
+      }
+    }
+  }
+
+
+  // Nodes to remove.
+  SetVector<NodeId> Remove = DeadInstrs;
+
+  bool Changed = false;
+  for (NodeId N : PartlyDead) {
+    auto SA = DFG.addr<StmtNode*>(N);
+    if (trace())
+      dbgs() << "Partly dead: " << *SA.Addr->getCode();
+    Changed |= rewrite(SA, Remove);
+  }
+
+  return erase(Remove) || Changed;
+}
+
+
+// Removes operand OpNum from the machine instruction behind IA and updates
+// the instruction's reference nodes so that each one points at its operand's
+// new position.
+void HexagonDCE::removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum) {
+ MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
+
+ // Finds the index of Op within MI by comparing operand addresses.
+ auto getOpNum = [MI] (MachineOperand &Op) -> unsigned {
+ for (unsigned i = 0, n = MI->getNumOperands(); i != n; ++i)
+ if (&MI->getOperand(i) == &Op)
+ return i;
+ llvm_unreachable("Invalid operand");
+ };
+ // Record each reference node's operand index before the removal, while
+ // the operand addresses are still the ones the nodes refer to.
+ DenseMap<NodeId,unsigned> OpMap;
+ DataFlowGraph &DFG = getDFG();
+ NodeList Refs = IA.Addr->members(DFG);
+ for (NodeAddr<RefNode*> RA : Refs)
+ OpMap.insert(std::make_pair(RA.Id, getOpNum(RA.Addr->getOp())));
+
+ MI->RemoveOperand(OpNum);
+
+ // Re-point every reference: indices below OpNum are unchanged, indices
+ // above it shift down by one. References to OpNum itself are left alone.
+ for (NodeAddr<RefNode*> RA : Refs) {
+ unsigned N = OpMap[RA.Id];
+ if (N < OpNum)
+ RA.Addr->setRegRef(&MI->getOperand(N), DFG);
+ else if (N > OpNum)
+ RA.Addr->setRegRef(&MI->getOperand(N-1), DFG);
+ }
+}
+
+
+// Tries to turn a partly dead post-increment load or store into its
+// non-incrementing form. Returns true if the instruction was rewritten;
+// the def nodes made obsolete are added to Remove.
+bool HexagonDCE::rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove) {
+ if (!getDFG().IsCode<NodeAttrs::Stmt>(IA))
+ return false;
+ DataFlowGraph &DFG = getDFG();
+ MachineInstr &MI = *NodeAddr<StmtNode*>(IA).Addr->getCode();
+ auto &HII = static_cast<const HexagonInstrInfo&>(DFG.getTII());
+ // Only post-increment addressing is handled.
+ if (HII.getAddrMode(MI) != HexagonII::PostInc)
+ return false;
+ unsigned Opc = MI.getOpcode();
+ // OpNum is the index of the operand that gets removed: 1 for the loads,
+ // 0 for the stores (presumably the updated-address def; confirm against
+ // the instruction definitions).
+ unsigned OpNum, NewOpc;
+ switch (Opc) {
+ case Hexagon::L2_loadri_pi:
+ NewOpc = Hexagon::L2_loadri_io;
+ OpNum = 1;
+ break;
+ case Hexagon::L2_loadrd_pi:
+ NewOpc = Hexagon::L2_loadrd_io;
+ OpNum = 1;
+ break;
+ case Hexagon::V6_vL32b_pi:
+ NewOpc = Hexagon::V6_vL32b_ai;
+ OpNum = 1;
+ break;
+ case Hexagon::S2_storeri_pi:
+ NewOpc = Hexagon::S2_storeri_io;
+ OpNum = 0;
+ break;
+ case Hexagon::S2_storerd_pi:
+ NewOpc = Hexagon::S2_storerd_io;
+ OpNum = 0;
+ break;
+ case Hexagon::V6_vS32b_pi:
+ NewOpc = Hexagon::V6_vS32b_ai;
+ OpNum = 0;
+ break;
+ default:
+ return false;
+ }
+ auto IsDead = [this] (NodeAddr<DefNode*> DA) -> bool {
+ return getDeadNodes().count(DA.Id);
+ };
+ NodeList Defs;
+ MachineOperand &Op = MI.getOperand(OpNum);
+ // Find the def node for operand OpNum; give up unless every related def
+ // is dead.
+ for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) {
+ if (&DA.Addr->getOp() != &Op)
+ continue;
+ Defs = DFG.getRelatedRefs(IA, DA);
+ if (!all_of(Defs, IsDead))
+ return false;
+ break;
+ }
+
+ // Mark all nodes in Defs for removal.
+ for (auto D : Defs)
+ Remove.insert(D.Id);
+
+ if (trace())
+ dbgs() << "Rewriting: " << MI;
+ MI.setDesc(HII.get(NewOpc));
+ // Zero the immediate two positions after OpNum (still counted before the
+ // removal below), then drop operand OpNum.
+ MI.getOperand(OpNum+2).setImm(0);
+ removeOperand(IA, OpNum);
+ if (trace())
+ dbgs() << " to: " << MI;
+
+ return true;
+}
+
+
+// Builds the data-flow graph for MF, runs copy propagation and dead code
+// elimination on it and, if either changed anything, recomputes live-ins
+// and kill flags.
+// NOTE(review): the function returns false even when Changed is true;
+// confirm this is intended.
+bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ // Apply the function-count limit only when getPosition() is nonzero
+ // (presumably: when -rdf-limit was given on the command line).
+ if (RDFLimit.getPosition()) {
+ if (RDFCount >= RDFLimit)
+ return false;
+ RDFCount++;
+ }
+
+ MDT = &getAnalysis<MachineDominatorTree>();
+ const auto &MDF = getAnalysis<MachineDominanceFrontier>();
+ const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ bool Changed;
+
+ if (RDFDump)
+ MF.print(dbgs() << "Before " << getPassName() << "\n", nullptr);
+
+ TargetOperandInfo TOI(HII);
+ DataFlowGraph G(MF, HII, HRI, *MDT, MDF, TOI);
+ // Dead phi nodes are necessary for copy propagation: we can add a use
+ // of a register in a block where it would need a phi node, but which
+ // was dead (and removed) during the graph build time.
+ G.build(BuildOptions::KeepDeadPhis);
+
+ if (RDFDump)
+ dbgs() << "Starting copy propagation on: " << MF.getName() << '\n'
+ << PrintNode<FuncNode*>(G.getFunc(), G) << '\n';
+ HexagonCP CP(G);
+ CP.trace(RDFDump);
+ Changed = CP.run();
+
+ if (RDFDump)
+ dbgs() << "Starting dead code elimination on: " << MF.getName() << '\n'
+ << PrintNode<FuncNode*>(G.getFunc(), G) << '\n';
+ HexagonDCE DCE(G, *MRI);
+ DCE.trace(RDFDump);
+ Changed |= DCE.run();
+
+ // Liveness information is refreshed only if something was modified.
+ if (Changed) {
+ if (RDFDump)
+ dbgs() << "Starting liveness recomputation on: " << MF.getName() << '\n';
+ Liveness LV(*MRI, G);
+ LV.trace(RDFDump);
+ LV.computeLiveIns();
+ LV.resetLiveIns();
+ LV.resetKills();
+ }
+
+ if (RDFDump)
+ MF.print(dbgs() << "After " << getPassName() << "\n", nullptr);
+
+ return false;
+}
+
+
+// Factory: returns a newly allocated HexagonRDFOpt pass.
+FunctionPass *llvm::createHexagonRDFOpt() {
+ return new HexagonRDFOpt();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
new file mode 100644
index 000000000000..d3f230d3f8a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -0,0 +1,272 @@
+//===-- HexagonRegisterInfo.cpp - Hexagon Register Information ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonRegisterInfo.h"
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+// Constructs the register info, passing R31 (the register getRARegister()
+// returns) to the generated base class.
+HexagonRegisterInfo::HexagonRegisterInfo()
+ : HexagonGenRegisterInfo(Hexagon::R31) {}
+
+
+/// Returns true if \p R is one of R0-R3 or D0-D1.
+bool HexagonRegisterInfo::isEHReturnCalleeSaveReg(unsigned R) const {
+  switch (R) {
+    case Hexagon::R0:
+    case Hexagon::R1:
+    case Hexagon::R2:
+    case Hexagon::R3:
+    case Hexagon::D0:
+    case Hexagon::D1:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/// Returns true if \p Reg lies in the inclusive range R16..R27.
+bool HexagonRegisterInfo::isCalleeSaveReg(unsigned Reg) const {
+  if (Reg < Hexagon::R16)
+    return false;
+  return Reg <= Hexagon::R27;
+}
+
+
+// Returns a zero-terminated list of the caller-saved registers in register
+// class RC. Only the classes listed in the switch are supported; any other
+// class is treated as unreachable. MF is not used.
+const MCPhysReg *
+HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF,
+ const TargetRegisterClass *RC) const {
+ using namespace Hexagon;
+
+ static const MCPhysReg Int32[] = {
+ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, 0
+ };
+ static const MCPhysReg Int64[] = {
+ D0, D1, D2, D3, D4, D5, D6, D7, 0
+ };
+ static const MCPhysReg Pred[] = {
+ P0, P1, P2, P3, 0
+ };
+ static const MCPhysReg VecSgl[] = {
+ V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13,
+ V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27,
+ V28, V29, V30, V31, 0
+ };
+ static const MCPhysReg VecDbl[] = {
+ W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, 0
+ };
+
+ // The 128-byte vector classes share the lists of the regular vector classes.
+ switch (RC->getID()) {
+ case IntRegsRegClassID:
+ return Int32;
+ case DoubleRegsRegClassID:
+ return Int64;
+ case PredRegsRegClassID:
+ return Pred;
+ case VectorRegsRegClassID:
+ case VectorRegs128BRegClassID:
+ return VecSgl;
+ case VecDblRegsRegClassID:
+ case VecDblRegs128BRegClassID:
+ return VecDbl;
+ default:
+ break;
+ }
+
+ // Unsupported class: in builds without NDEBUG, print its name first.
+ static const MCPhysReg Empty[] = { 0 };
+#ifndef NDEBUG
+ dbgs() << "Register class: " << getRegClassName(RC) << "\n";
+#endif
+ llvm_unreachable("Unexpected register class");
+ return Empty;
+}
+
+
+/// Returns the zero-terminated list of callee-saved registers for \p MF.
+/// The base list is R16-R27. When the function's HexagonMachineFunctionInfo
+/// reports hasEHReturn(), R0-R3 are included as well. The same lists are
+/// used for every architecture version handled in the switch.
+const MCPhysReg *
+HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const MCPhysReg CalleeSavedRegsV3[] = {
+    Hexagon::R16,   Hexagon::R17,   Hexagon::R18,   Hexagon::R19,
+    Hexagon::R20,   Hexagon::R21,   Hexagon::R22,   Hexagon::R23,
+    Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
+  };
+
+  // Functions that contain a call to __builtin_eh_return also save the first 4
+  // parameter registers.
+  static const MCPhysReg CalleeSavedRegsV3EHReturn[] = {
+    Hexagon::R0,    Hexagon::R1,    Hexagon::R2,    Hexagon::R3,
+    Hexagon::R16,   Hexagon::R17,   Hexagon::R18,   Hexagon::R19,
+    Hexagon::R20,   Hexagon::R21,   Hexagon::R22,   Hexagon::R23,
+    Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
+  };
+
+  const MCPhysReg *SavedList = CalleeSavedRegsV3;
+  if (MF->getInfo<HexagonMachineFunctionInfo>()->hasEHReturn())
+    SavedList = CalleeSavedRegsV3EHReturn;
+
+  switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) {
+    case HexagonSubtarget::V4:
+    case HexagonSubtarget::V5:
+    case HexagonSubtarget::V55:
+    case HexagonSubtarget::V60:
+      return SavedList;
+  }
+
+  llvm_unreachable("Callee saved registers requested for unknown architecture "
+                   "version");
+}
+
+
+/// Returns a bit vector, sized by getNumRegs(), with a bit set for every
+/// reserved register. The set does not depend on \p MF.
+BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
+  const {
+  // Registers that are always reserved.
+  static const unsigned AlwaysReserved[] = {
+    Hexagon::R29, Hexagon::R30, Hexagon::R31, Hexagon::PC,
+    Hexagon::D14, Hexagon::D15,
+    Hexagon::LC0, Hexagon::LC1, Hexagon::SA0, Hexagon::SA1,
+    Hexagon::UGP, Hexagon::GP,
+    Hexagon::CS0, Hexagon::CS1, Hexagon::CS,
+    Hexagon::USR
+  };
+
+  BitVector Reserved(getNumRegs());
+  for (unsigned R : AlwaysReserved)
+    Reserved.set(R);
+  return Reserved;
+}
+
+
+// Replaces the frame-index operand at position FIOp of the instruction at II
+// (and the immediate that follows it) with a base register and a concrete
+// offset. RS is not used.
+void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOp,
+ RegScavenger *RS) const {
+ //
+ // Hexagon_TODO: Do we need to enforce this for Hexagon?
+ assert(SPAdj == 0 && "Unexpected");
+
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MB = *MI.getParent();
+ MachineFunction &MF = *MB.getParent();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &HII = *HST.getInstrInfo();
+ auto &HFI = *HST.getFrameLowering();
+
+ unsigned BP = 0;
+ int FI = MI.getOperand(FIOp).getIndex();
+ // Select the base pointer (BP) and calculate the actual offset from BP
+ // to the beginning of the object at index FI.
+ int Offset = HFI.getFrameIndexReference(MF, FI, BP);
+ // Add the offset from the instruction.
+ int RealOffset = Offset + MI.getOperand(FIOp+1).getImm();
+ bool IsKill = false;
+
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ case Hexagon::PS_fia:
+ // PS_fia becomes an A2_addi: the frame index is replaced by the final
+ // offset and the separate immediate operand is dropped.
+ MI.setDesc(HII.get(Hexagon::A2_addi));
+ MI.getOperand(FIOp).ChangeToImmediate(RealOffset);
+ MI.RemoveOperand(FIOp+1);
+ return;
+ case Hexagon::PS_fi:
+ // Set up the instruction for updating below.
+ MI.setDesc(HII.get(Hexagon::A2_addi));
+ break;
+ }
+
+ // Note: the check uses the opcode read before any setDesc above.
+ if (!HII.isValidOffset(Opc, RealOffset)) {
+ // If the offset is not valid, calculate the address in a temporary
+ // register and use it with offset 0.
+ auto &MRI = MF.getRegInfo();
+ unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), TmpR)
+ .addReg(BP)
+ .addImm(RealOffset);
+ BP = TmpR;
+ RealOffset = 0;
+ // The temporary has no other use, so this use kills it.
+ IsKill = true;
+ }
+
+ MI.getOperand(FIOp).ChangeToRegister(BP, false, false, IsKill);
+ MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset);
+}
+
+
+// Returns the return-address register, R31.
+unsigned HexagonRegisterInfo::getRARegister() const {
+ return Hexagon::R31;
+}
+
+
+/// Returns the register used to address the frame of \p MF: the frame
+/// register if the frame lowering reports a frame pointer for the function,
+/// otherwise the stack register.
+unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction
+                                               &MF) const {
+  const HexagonFrameLowering *TFI = getFrameLowering(MF);
+  return TFI->hasFP(MF) ? getFrameRegister() : getStackRegister();
+}
+
+
+// Returns the frame register, R30.
+unsigned HexagonRegisterInfo::getFrameRegister() const {
+ return Hexagon::R30;
+}
+
+
+// Returns the stack register, R29.
+unsigned HexagonRegisterInfo::getStackRegister() const {
+ return Hexagon::R29;
+}
+
+
+/// Translates the generic index \p GenIdx (ps_sub_lo or ps_sub_hi) into the
+/// concrete subregister index for register class \p RC: isub_lo/isub_hi for
+/// the 64-bit scalar and control classes, vsub_lo/vsub_hi for the vector
+/// pair classes. A class not listed is resolved through its first
+/// superclass; a class with no superclass left is treated as unreachable.
+unsigned HexagonRegisterInfo::getHexagonSubRegIndex(
+      const TargetRegisterClass *RC, unsigned GenIdx) const {
+  assert(GenIdx == Hexagon::ps_sub_lo || GenIdx == Hexagon::ps_sub_hi);
+
+  // Both tables are indexed by GenIdx.
+  static const unsigned ISub[] = { Hexagon::isub_lo, Hexagon::isub_hi };
+  static const unsigned VSub[] = { Hexagon::vsub_lo, Hexagon::vsub_hi };
+
+  // Walk up the chain of first superclasses until a known class is found.
+  const TargetRegisterClass *Cur = RC;
+  while (true) {
+    switch (Cur->getID()) {
+      case Hexagon::CtrRegs64RegClassID:
+      case Hexagon::DoubleRegsRegClassID:
+        return ISub[GenIdx];
+      case Hexagon::VecDblRegsRegClassID:
+      case Hexagon::VecDblRegs128BRegClassID:
+        return VSub[GenIdx];
+    }
+
+    const TargetRegisterClass *SuperRC = *Cur->getSuperClasses();
+    if (!SuperRC)
+      break;
+    Cur = SuperRC;
+  }
+
+  llvm_unreachable("Invalid register class");
+}
+
+/// Returns whatever the Hexagon frame lowering's hasFP() reports for \p MF.
+bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF)
+      const {
+  const auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  const auto *HFI = HST.getFrameLowering();
+  return HFI->hasFP(MF);
+}
+
+
+// Returns R6 as the first caller-saved register that is not a parameter
+// register.
+unsigned HexagonRegisterInfo::getFirstCallerSavedNonParamReg() const {
+ return Hexagon::R6;
+}
+
+
+#define GET_REGINFO_TARGET_DESC
+#include "HexagonGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
new file mode 100644
index 000000000000..1fb295b5bd8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -0,0 +1,84 @@
+//==- HexagonRegisterInfo.h - Hexagon Register Information Impl --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONREGISTERINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONREGISTERINFO_H
+
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "HexagonGenRegisterInfo.inc"
+
+namespace llvm {
+
+namespace Hexagon {
+ // Generic (pseudo) subreg indices for use with getHexagonSubRegIndex.
+ enum { ps_sub_lo = 0, ps_sub_hi = 1 };
+}
+
+class HexagonRegisterInfo : public HexagonGenRegisterInfo {
+public:
+ HexagonRegisterInfo();
+
+ /// Code Generation virtual methods...
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF)
+ const override;
+
+ /// Reserved registers for MF, returned as a BitVector.
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum, RegScavenger *RS = nullptr) const override;
+
+ /// Returns true since we may need scavenging for a temporary register
+ /// when generating hardware loop instructions.
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ /// Returns true. Spill code for predicate registers might need an extra
+ /// register.
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ /// Returns true if the frame pointer is valid.
+ bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ // Debug information queries: RA is R31, frame R30 (R29 if no FP), SP R29.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+ unsigned getFrameRegister() const;
+ unsigned getStackRegister() const;
+
+ unsigned getHexagonSubRegIndex(const TargetRegisterClass *RC,
+ unsigned GenIdx) const;
+
+ const MCPhysReg *getCallerSavedRegs(const MachineFunction *MF,
+ const TargetRegisterClass *RC) const;
+
+ unsigned getFirstCallerSavedNonParamReg() const;
+
+ bool isEHReturnCalleeSaveReg(unsigned Reg) const;
+ bool isCalleeSaveReg(unsigned Reg) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
new file mode 100644
index 000000000000..a75f3514dbd2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -0,0 +1,286 @@
+//===-- HexagonRegisterInfo.td - Hexagon Register defs -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the Hexagon register file.
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Hexagon" in {
+
+ class HexagonReg<bits<5> num, string n, list<string> alt = [],
+ list<Register> alias = []> : Register<n, alt> {
+ field bits<5> Num;
+ let Aliases = alias;
+ let HWEncoding{4-0} = num;
+ }
+
+ class HexagonDoubleReg<bits<5> num, string n, list<Register> subregs,
+ list<string> alt = []> :
+ RegisterWithSubRegs<n, subregs> {
+ field bits<5> Num;
+
+ let AltNames = alt;
+ let HWEncoding{4-0} = num;
+ }
+
+ // Registers are identified with 5-bit ID numbers.
+ // Ri - 32-bit integer registers.
+ class Ri<bits<5> num, string n, list<string> alt = []> :
+ HexagonReg<num, n, alt> {
+ let Num = num;
+ }
+
+ // Rf - 32-bit floating-point registers.
+ class Rf<bits<5> num, string n> : HexagonReg<num, n> {
+ let Num = num;
+ }
+
+
+ // Rd - 64-bit registers.
+ class Rd<bits<5> num, string n, list<Register> subregs,
+ list<string> alt = []> :
+ HexagonDoubleReg<num, n, subregs, alt> {
+ let Num = num;
+ let SubRegs = subregs;
+ }
+
+ // Rp - predicate registers
+ class Rp<bits<5> num, string n> : HexagonReg<num, n> {
+ let Num = num;
+ }
+
+
+ // Rq - vector predicate registers
+ class Rq<bits<3> num, string n> : Register<n, []> {
+ let HWEncoding{2-0} = num;
+ }
+
+ // Rc - control registers
+ class Rc<bits<5> num, string n,
+ list<string> alt = [], list<Register> alias = []> :
+ HexagonReg<num, n, alt, alias> {
+ let Num = num;
+ }
+
+ // Rcc - 64-bit control registers.
+ class Rcc<bits<5> num, string n, list<Register> subregs,
+ list<string> alt = []> :
+ HexagonDoubleReg<num, n, subregs, alt> {
+ let Num = num;
+ let SubRegs = subregs;
+ }
+
+ // Mx - address modifier registers
+ class Mx<bits<1> num, string n> : HexagonReg<{0b0000, num}, n> {
+ let Num = !cast<bits<5>>(num);
+ }
+
+ def isub_lo : SubRegIndex<32>;
+ def isub_hi : SubRegIndex<32, 32>;
+ def vsub_lo : SubRegIndex<512>;
+ def vsub_hi : SubRegIndex<512, 512>;
+ def subreg_overflow : SubRegIndex<1, 0>;
+
+ // Integer registers.
+ foreach i = 0-28 in {
+ def R#i : Ri<i, "r"#i>, DwarfRegNum<[i]>;
+ }
+
+ def R29 : Ri<29, "r29", ["sp"]>, DwarfRegNum<[29]>;
+ def R30 : Ri<30, "r30", ["fp"]>, DwarfRegNum<[30]>;
+ def R31 : Ri<31, "r31", ["lr"]>, DwarfRegNum<[31]>;
+
+ // Aliases of the R* registers used to hold 64-bit int values (doubles).
+ let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
+ def D0 : Rd< 0, "r1:0", [R0, R1]>, DwarfRegNum<[32]>;
+ def D1 : Rd< 2, "r3:2", [R2, R3]>, DwarfRegNum<[34]>;
+ def D2 : Rd< 4, "r5:4", [R4, R5]>, DwarfRegNum<[36]>;
+ def D3 : Rd< 6, "r7:6", [R6, R7]>, DwarfRegNum<[38]>;
+ def D4 : Rd< 8, "r9:8", [R8, R9]>, DwarfRegNum<[40]>;
+ def D5 : Rd<10, "r11:10", [R10, R11]>, DwarfRegNum<[42]>;
+ def D6 : Rd<12, "r13:12", [R12, R13]>, DwarfRegNum<[44]>;
+ def D7 : Rd<14, "r15:14", [R14, R15]>, DwarfRegNum<[46]>;
+ def D8 : Rd<16, "r17:16", [R16, R17]>, DwarfRegNum<[48]>;
+ def D9 : Rd<18, "r19:18", [R18, R19]>, DwarfRegNum<[50]>;
+ def D10 : Rd<20, "r21:20", [R20, R21]>, DwarfRegNum<[52]>;
+ def D11 : Rd<22, "r23:22", [R22, R23]>, DwarfRegNum<[54]>;
+ def D12 : Rd<24, "r25:24", [R24, R25]>, DwarfRegNum<[56]>;
+ def D13 : Rd<26, "r27:26", [R26, R27]>, DwarfRegNum<[58]>;
+ def D14 : Rd<28, "r29:28", [R28, R29]>, DwarfRegNum<[60]>;
+ def D15 : Rd<30, "r31:30", [R30, R31], ["lr:fp"]>, DwarfRegNum<[62]>;
+ }
+
+ // Predicate registers.
+ def P0 : Rp<0, "p0">, DwarfRegNum<[63]>;
+ def P1 : Rp<1, "p1">, DwarfRegNum<[64]>;
+ def P2 : Rp<2, "p2">, DwarfRegNum<[65]>;
+ def P3 : Rp<3, "p3">, DwarfRegNum<[66]>;
+
+ // Modifier registers.
+ // C6 and C7 can also be M0 and M1, but register names must be unique, even
+ // if belonging to different register classes.
+ def M0 : Mx<0, "m0">, DwarfRegNum<[72]>;
+ def M1 : Mx<1, "m1">, DwarfRegNum<[73]>;
+
+ // Fake register to represent USR.OVF bit. Arithmetic/saturating instruc-
+ // tions modify this bit, and multiple such instructions are allowed in the
+ // same packet. We need to ignore output dependencies on this bit, but not
+ // on the entire USR.
+ def USR_OVF : Rc<?, "usr.ovf">;
+
+ def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[75]> {
+ let SubRegIndices = [subreg_overflow];
+ let SubRegs = [USR_OVF];
+ }
+
+ // Control registers.
+ def SA0 : Rc<0, "sa0", ["c0"]>, DwarfRegNum<[67]>;
+ def LC0 : Rc<1, "lc0", ["c1"]>, DwarfRegNum<[68]>;
+ def SA1 : Rc<2, "sa1", ["c2"]>, DwarfRegNum<[69]>;
+ def LC1 : Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>;
+ def P3_0 : Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>,
+ DwarfRegNum<[71]>;
+ def C5 : Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; // future use
+ def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[73]>;
+ def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[74]>;
+ // Define C8 separately and make it aliased with USR.
+ // The problem is that USR has subregisters (e.g. overflow). If USR was
+ // specified as a subregister of C9_8, it would imply that subreg_overflow
+ // and isub_lo can be composed, which leads to all kinds of issues
+ // with lane masks.
+ def C8 : Rc<8, "c8", [], [USR]>, DwarfRegNum<[75]>;
+ def PC : Rc<9, "pc">, DwarfRegNum<[76]>;
+ def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>;
+ def GP : Rc<11, "gp", ["c11"]>, DwarfRegNum<[78]>;
+ def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>;
+ def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>;
+ def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>;
+ def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>;
+}
+
+ // Control registers pairs.
+ let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
+ def C1_0 : Rcc<0, "c1:0", [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
+ def C3_2 : Rcc<2, "c3:2", [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
+ def C5_4 : Rcc<4, "c5:4", [P3_0, C5]>, DwarfRegNum<[71]>;
+ def C7_6 : Rcc<6, "c7:6", [C6, C7], ["m1:0"]>, DwarfRegNum<[72]>;
+ // Use C8 instead of USR as a subregister of C9_8.
+ def C9_8 : Rcc<8, "c9:8", [C8, PC]>, DwarfRegNum<[74]>;
+ def C11_10 : Rcc<10, "c11:10", [UGP, GP]>, DwarfRegNum<[76]>;
+ def CS : Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>, DwarfRegNum<[78]>;
+ def UPC : Rcc<14, "c15:14", [UPCL, UPCH]>, DwarfRegNum<[80]>;
+ }
+
+ foreach i = 0-31 in {
+ def V#i : Ri<i, "v"#i>, DwarfRegNum<[!add(i, 99)]>;
+ }
+
+ // Aliases of the V* registers used to hold double vec values.
+ let SubRegIndices = [vsub_lo, vsub_hi], CoveredBySubRegs = 1 in {
+ def W0 : Rd< 0, "v1:0", [V0, V1]>, DwarfRegNum<[99]>;
+ def W1 : Rd< 2, "v3:2", [V2, V3]>, DwarfRegNum<[101]>;
+ def W2 : Rd< 4, "v5:4", [V4, V5]>, DwarfRegNum<[103]>;
+ def W3 : Rd< 6, "v7:6", [V6, V7]>, DwarfRegNum<[105]>;
+ def W4 : Rd< 8, "v9:8", [V8, V9]>, DwarfRegNum<[107]>;
+ def W5 : Rd<10, "v11:10", [V10, V11]>, DwarfRegNum<[109]>;
+ def W6 : Rd<12, "v13:12", [V12, V13]>, DwarfRegNum<[111]>;
+ def W7 : Rd<14, "v15:14", [V14, V15]>, DwarfRegNum<[113]>;
+ def W8 : Rd<16, "v17:16", [V16, V17]>, DwarfRegNum<[115]>;
+ def W9 : Rd<18, "v19:18", [V18, V19]>, DwarfRegNum<[117]>;
+ def W10 : Rd<20, "v21:20", [V20, V21]>, DwarfRegNum<[119]>;
+ def W11 : Rd<22, "v23:22", [V22, V23]>, DwarfRegNum<[121]>;
+ def W12 : Rd<24, "v25:24", [V24, V25]>, DwarfRegNum<[123]>;
+ def W13 : Rd<26, "v27:26", [V26, V27]>, DwarfRegNum<[125]>;
+ def W14 : Rd<28, "v29:28", [V28, V29]>, DwarfRegNum<[127]>;
+ def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>;
+ }
+
+ // Vector Predicate registers.
+ def Q0 : Rq<0, "q0">, DwarfRegNum<[131]>;
+ def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>;
+ def Q2 : Rq<2, "q2">, DwarfRegNum<[133]>;
+ def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>;
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
+ (add (sequence "R%u", 0, 9),
+ (sequence "R%u", 12, 28),
+ R10, R11, R29, R30, R31)> {
+}
+
+// Registers are listed in reverse order for allocation preference reasons.
+def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
+ (add R7, R6, R5, R4, R3, R2, R1, R0)> ;
+
+def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
+ (add (sequence "D%u", 0, 4),
+ (sequence "D%u", 6, 13), D5, D14, D15)>;
+
+def VectorRegs : RegisterClass<"Hexagon", [v64i8, v32i16, v16i32, v8i64], 512,
+ (add (sequence "V%u", 0, 31))>;
+
+def VecDblRegs : RegisterClass<"Hexagon",
+ [v128i8, v64i16, v32i32, v16i64], 1024,
+ (add (sequence "W%u", 0, 15))>;
+
+def VectorRegs128B : RegisterClass<"Hexagon",
+ [v128i8, v64i16, v32i32, v16i64], 1024,
+ (add (sequence "V%u", 0, 31))>;
+
+def VecDblRegs128B : RegisterClass<"Hexagon",
+ [v256i8,v128i16,v64i32,v32i64], 2048,
+ (add (sequence "W%u", 0, 15))>;
+
+def VecPredRegs : RegisterClass<"Hexagon", [v512i1], 512,
+ (add (sequence "Q%u", 0, 3))>;
+
+def VecPredRegs128B : RegisterClass<"Hexagon", [v1024i1], 1024,
+ (add (sequence "Q%u", 0, 3))>;
+
+def PredRegs : RegisterClass<"Hexagon",
+ [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32,
+ (add (sequence "P%u", 0, 3))>
+{
+ let Size = 32;
+}
+
+let Size = 32 in
+def ModRegs : RegisterClass<"Hexagon", [i32], 32, (add M0, M1)>;
+
+let Size = 32, isAllocatable = 0 in
+def CtrRegs : RegisterClass<"Hexagon", [i32], 32,
+ (add LC0, SA0, LC1, SA1,
+ P3_0, C5,
+ M0, M1, C6, C7, C8, CS0, CS1, UPCL, UPCH,
+ USR, UGP, GP, PC)>;
+
+let isAllocatable = 0 in
+def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>;
+
+let Size = 64, isAllocatable = 0 in
+def CtrRegs64 : RegisterClass<"Hexagon", [i64], 64,
+ (add C1_0, C3_2, C7_6, C9_8, C11_10, CS, UPC)>;
+
+def VolatileV3 {
+ list<Register> Regs = [D0, D1, D2, D3, D4, D5, D6, D7,
+ R28, R31,
+ P0, P1, P2, P3,
+ M0, M1,
+ LC0, LC1, SA0, SA1, USR, USR_OVF, CS0, CS1,
+ V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11,
+ V12, V13, V14, V15, V16, V17, V18, V19, V20, V21,
+ V22, V23, V24, V25, V26, V27, V28, V29, V30, V31,
+ W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11,
+ W12, W13, W14, W15,
+ Q0, Q1, Q2, Q3];
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
new file mode 100644
index 000000000000..6e4987b7e4e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -0,0 +1,24 @@
+//===- HexagonSchedule.td - Hexagon Scheduling Definitions -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V4 Machine Info +
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV4.td"
+
+// V55 Machine Info +
+include "HexagonScheduleV55.td"
+
+//===----------------------------------------------------------------------===//
+// V60 Machine Info -
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV60.td"
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
new file mode 100644
index 000000000000..7416baab392c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -0,0 +1,203 @@
+//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// There are four SLOTS (four parallel pipelines) in Hexagon V4 machine.
+// This file describes that machine information.
+
+//
+// |===========|==================================================|
+// | PIPELINE | Instruction Classes |
+// |===========|==================================================|
+// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM |
+// |-----------|--------------------------------------------------|
+// | SLOT1 | LD ST ALU32 |
+// |-----------|--------------------------------------------------|
+// | SLOT2 | XTYPE ALU32 J JR |
+// |-----------|--------------------------------------------------|
+// | SLOT3 | XTYPE ALU32 J CR |
+// |===========|==================================================|
+
+// Functional Units.
+def SLOT0 : FuncUnit;
+def SLOT1 : FuncUnit;
+def SLOT2 : FuncUnit;
+def SLOT3 : FuncUnit;
+// Endloop is a pseudo instruction that is encoded with 2 bits in a packet
+// rather than taking an execution slot. This special unit is needed
+// to schedule an ENDLOOP with 4 other instructions.
+def SLOT_ENDLOOP: FuncUnit;
+
+// Itinerary classes.
+def PSEUDO : InstrItinClass;
+def PSEUDOM : InstrItinClass;
+// ALU64/M/S Instruction classes of V2 are collectively known as XTYPE in V4.
+def DUPLEX : InstrItinClass;
+def PREFIX : InstrItinClass;
+def COMPOUND_CJ_ARCHDEPSLOT : InstrItinClass;
+def COMPOUND : InstrItinClass;
+
+def ALU32_2op_tc_1_SLOT0123 : InstrItinClass;
+def ALU32_2op_tc_2early_SLOT0123 : InstrItinClass;
+def ALU32_3op_tc_2early_SLOT0123 : InstrItinClass;
+def ALU32_3op_tc_1_SLOT0123 : InstrItinClass;
+def ALU32_3op_tc_2_SLOT0123 : InstrItinClass;
+def ALU32_ADDI_tc_1_SLOT0123 : InstrItinClass;
+def ALU64_tc_1_SLOT23 : InstrItinClass;
+def ALU64_tc_2_SLOT23 : InstrItinClass;
+def ALU64_tc_2early_SLOT23 : InstrItinClass;
+def ALU64_tc_3x_SLOT23 : InstrItinClass;
+def CR_tc_2_SLOT3 : InstrItinClass;
+def CR_tc_2early_SLOT23 : InstrItinClass;
+def CR_tc_2early_SLOT3 : InstrItinClass;
+def CR_tc_3x_SLOT23 : InstrItinClass;
+def CR_tc_3x_SLOT3 : InstrItinClass;
+def J_tc_2early_SLOT23 : InstrItinClass;
+def J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT : InstrItinClass;
+def J_tc_2early_SLOT2 : InstrItinClass;
+def LD_tc_ld_SLOT01 : InstrItinClass;
+def LD_tc_ld_SLOT0 : InstrItinClass;
+def LD_tc_3or4stall_SLOT0 : InstrItinClass;
+def M_tc_2_SLOT23 : InstrItinClass;
+def M_tc_3_SLOT23 : InstrItinClass;
+def M_tc_1_SLOT23 : InstrItinClass;
+def M_tc_3x_SLOT23 : InstrItinClass;
+def M_tc_3or4x_SLOT23 : InstrItinClass;
+def ST_tc_st_SLOT01 : InstrItinClass;
+def ST_tc_st_SLOT0 : InstrItinClass;
+def ST_tc_ld_SLOT0 : InstrItinClass;
+def ST_tc_3stall_SLOT0 : InstrItinClass;
+def S_2op_tc_1_SLOT23 : InstrItinClass;
+def S_2op_tc_2_SLOT23 : InstrItinClass;
+def S_2op_tc_2early_SLOT23 : InstrItinClass;
+def S_2op_tc_3or4x_SLOT23 : InstrItinClass;
+def S_3op_tc_1_SLOT23 : InstrItinClass;
+def S_3op_tc_2_SLOT23 : InstrItinClass;
+def S_3op_tc_2early_SLOT23 : InstrItinClass;
+def S_3op_tc_3_SLOT23 : InstrItinClass;
+def S_3op_tc_3x_SLOT23 : InstrItinClass;
+def NCJ_tc_3or4stall_SLOT0 : InstrItinClass;
+def V2LDST_tc_ld_SLOT01 : InstrItinClass;
+def V2LDST_tc_st_SLOT0 : InstrItinClass;
+def V2LDST_tc_st_SLOT01 : InstrItinClass;
+def V4LDST_tc_ld_SLOT01 : InstrItinClass;
+def V4LDST_tc_st_SLOT0 : InstrItinClass;
+def V4LDST_tc_st_SLOT01 : InstrItinClass;
+def J_tc_2early_SLOT0123 : InstrItinClass;
+def EXTENDER_tc_1_SLOT0123 : InstrItinClass;
+def S_3op_tc_3stall_SLOT23 : InstrItinClass;
+
+def HexagonItinerariesV4 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
+ // ALU32
+ InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // ALU64
+ InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+ // CR -> System
+ InstrItinData<CR_tc_2_SLOT3 , [InstrStage<1, [SLOT3]>]>,
+ InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<1, [SLOT3]>]>,
+ InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<1, [SLOT3]>]>,
+
+ // Jump (conditional/unconditional/return etc)
+ // CR
+ InstrItinData<CR_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ // J
+ InstrItinData<J_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ // JR
+ InstrItinData<J_tc_2early_SLOT2 , [InstrStage<1, [SLOT2]>]>,
+
+ //Load
+ InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<LD_tc_3or4stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+ // M
+ InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+ // Store
+ // ST
+ InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ // ST0
+ InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+ // S
+ InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3stall_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // SYS
+ InstrItinData<ST_tc_3stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+ // New Value Compare Jump
+ InstrItinData<NCJ_tc_3or4stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+ // Mem ops - MEM_V4
+ InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+ InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
+
+ // ENDLOOP
+ InstrItinData<J_tc_2early_SLOT0123 , [InstrStage<1, [SLOT_ENDLOOP]>]>,
+
+ // Extender/PREFIX
+ InstrItinData<EXTENDER_tc_1_SLOT0123,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>]>
+ ]>;
+
+def HexagonModelV4 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV4;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V4 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
new file mode 100644
index 000000000000..b2a75f7200d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -0,0 +1,194 @@
+//=-HexagonScheduleV55.td - HexagonV55 Scheduling Definitions *- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// There are four SLOTS (four parallel pipelines) in Hexagon V55 machine.
+// This file describes that machine information.
+
+//
+// |===========|==================================================|
+// | PIPELINE | Instruction Classes |
+// |===========|==================================================|
+// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM |
+// |-----------|--------------------------------------------------|
+// | SLOT1 | LD ST ALU32 |
+// |-----------|--------------------------------------------------|
+// | SLOT2 | XTYPE ALU32 J JR |
+// |-----------|--------------------------------------------------|
+// | SLOT3 | XTYPE ALU32 J CR |
+// |===========|==================================================|
+
+def CJ_tc_1_SLOT23 : InstrItinClass;
+def CJ_tc_2early_SLOT23 : InstrItinClass;
+def COPROC_VMEM_vtc_long_SLOT01 : InstrItinClass;
+def COPROC_VX_vtc_long_SLOT23 : InstrItinClass;
+def COPROC_VX_vtc_SLOT23 : InstrItinClass;
+def J_tc_3stall_SLOT2 : InstrItinClass;
+def MAPPING_tc_1_SLOT0123 : InstrItinClass;
+def M_tc_3stall_SLOT23 : InstrItinClass;
+
+def HexagonItinerariesV55 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
+ // ALU32
+ InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+ InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+ InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+ InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+ InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+ InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+
+ // ALU64
+ InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+
+ // CR -> System
+ InstrItinData<CR_tc_2_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+ InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+ InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<1, [SLOT3]>], [3, 1, 1]>,
+
+ // Jump (conditional/unconditional/return etc)
+ InstrItinData<CR_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<J_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT,
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 1, 1, 1]>,
+
+ // JR
+ InstrItinData<J_tc_2early_SLOT2 , [InstrStage<1, [SLOT2]>], [2, 1, 1]>,
+ InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<1, [SLOT2]>], [3, 1, 1]>,
+
+ // Extender
+ InstrItinData<EXTENDER_tc_1_SLOT0123,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+
+ // Load
+ InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [2, 1]>,
+ InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1]>,
+ InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [2, 1]>,
+
+ // M
+ InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<M_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<M_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<M_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<M_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+
+ // Store
+ InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1]>,
+ InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
+ InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
+ InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+
+ // S
+ InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+
+ // New Value Compare Jump
+ InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>],
+ [3, 1, 1, 1]>,
+
+ // Mem ops
+ InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1, 1]>,
+
+ // Endloop
+ InstrItinData<J_tc_2early_SLOT0123, [InstrStage<1, [SLOT_ENDLOOP]>],
+ [2]>,
+
+ // Vector
+ InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 1]>,
+ InstrItinData<COPROC_VX_vtc_long_SLOT23 ,
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1, 1]>,
+ InstrItinData<COPROC_VX_vtc_SLOT23 ,
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1, 1]>,
+ InstrItinData<MAPPING_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1, 1]>,
+
+ // Misc
+ InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+ InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>
+ ]>;
+
+def HexagonModelV55 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV55;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V55 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
new file mode 100644
index 000000000000..dc2ce43b0579
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -0,0 +1,301 @@
+//=-HexagonScheduleV60.td - HexagonV60 Scheduling Definitions *- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// CVI pipes from the "Hexagon Multimedia Co-Processor Extensions Arch Spec".
+// These six units are the primitive vector resources of the V60 model.
+def CVI_ST : FuncUnit;
+def CVI_XLANE : FuncUnit;
+def CVI_SHIFT : FuncUnit;
+def CVI_MPY0 : FuncUnit;
+def CVI_MPY1 : FuncUnit;
+def CVI_LD : FuncUnit;
+
+// Combined functional units. Each one stands for a group of the primitive
+// units above; the membership is spelled out in HexagonComboFuncsV60.
+def CVI_XLSHF : FuncUnit;
+def CVI_MPY01 : FuncUnit;
+def CVI_ALL : FuncUnit;
+
+// Combined functional unit data.
+// Maps each combined unit to the primitive CVI units it is made of.
+def HexagonComboFuncsV60 :
+ ComboFuncUnits<[
+ // CVI_XLSHF = cross-lane + shift units.
+ ComboFuncData<CVI_XLSHF , [CVI_XLANE, CVI_SHIFT]>,
+ // CVI_MPY01 = both multiply units.
+ ComboFuncData<CVI_MPY01 , [CVI_MPY0, CVI_MPY1]>,
+ // CVI_ALL = every primitive CVI unit.
+ ComboFuncData<CVI_ALL , [CVI_ST, CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1, CVI_LD]>
+ ]>;
+
+// Note: When adding additional vector scheduling classes, add the
+// corresponding methods to the class HexagonInstrInfo.
+//
+// Itinerary classes for the CVI (vector) instructions. Every class except
+// CVI_VA_EXT has an InstrItinData entry in HexagonItinerariesV60.
+def CVI_VA : InstrItinClass;
+def CVI_VA_DV : InstrItinClass;
+def CVI_VX_LONG : InstrItinClass;
+def CVI_VX_LATE : InstrItinClass;
+def CVI_VX : InstrItinClass;
+def CVI_VX_DV_LONG : InstrItinClass;
+def CVI_VX_DV : InstrItinClass;
+def CVI_VX_DV_SLOT2 : InstrItinClass;
+def CVI_VP : InstrItinClass;
+def CVI_VP_LONG : InstrItinClass;
+def CVI_VP_VS_EARLY : InstrItinClass;
+def CVI_VP_VS_LONG_EARLY : InstrItinClass;
+def CVI_VP_VS_LONG : InstrItinClass;
+def CVI_VP_VS : InstrItinClass;
+def CVI_VP_DV : InstrItinClass;
+def CVI_VS : InstrItinClass;
+def CVI_VINLANESAT : InstrItinClass;
+def CVI_VM_LD : InstrItinClass;
+def CVI_VM_TMP_LD : InstrItinClass;
+def CVI_VM_CUR_LD : InstrItinClass;
+def CVI_VM_VP_LDU : InstrItinClass;
+def CVI_VM_ST : InstrItinClass;
+def CVI_VM_NEW_ST : InstrItinClass;
+def CVI_VM_STU : InstrItinClass;
+def CVI_HIST : InstrItinClass;
+// NOTE(review): no itinerary data is attached to CVI_VA_EXT in the V60
+// table -- confirm whether that is intentional.
+def CVI_VA_EXT : InstrItinClass;
+
+// There are four SLOTS (four parallel pipelines) in Hexagon V60 machine.
+// This file describes that machine information.
+//
+// |===========|==================================================|
+// | PIPELINE | Instruction Classes |
+// |===========|==================================================|
+// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM |
+// |-----------|--------------------------------------------------|
+// | SLOT1 | LD ST ALU32 |
+// |-----------|--------------------------------------------------|
+// | SLOT2 | XTYPE ALU32 J JR |
+// |-----------|--------------------------------------------------|
+// | SLOT3 | XTYPE ALU32 J CR |
+// |===========|==================================================|
+//
+//
+// In addition to using the above SLOTS, there are also six vector pipelines
+// in the CVI co-processor in the Hexagon V60 machine.
+//
+// |=========| |=========| |=========| |=========| |=========| |=========|
+// SLOT | CVI_LD | |CVI_MPY3 | |CVI_MPY2 | |CVI_SHIFT| |CVI_XLANE| | CVI_ST |
+// ==== |=========| |=========| |=========| |=========| |=========| |=========|
+// S0-3 | | | CVI_VA | | CVI_VA | | CVI_VA | | CVI_VA | | |
+// S2-3 | | | CVI_VX | | CVI_VX | | | | | | |
+// S0-3 | | | | | | | | | CVI_VP | | |
+// S0-3 | | | | | | | CVI_VS | | | | |
+// S0-1 |(CVI_LD) | | CVI_LD | | CVI_LD | | CVI_LD | | CVI_LD | | |
+// S0-1 |(C*TMP_LD) | | | | | | | | | |
+// S01 |(C*_LDU) | | | | | | | | C*_LDU | | |
+// S0 | | | CVI_ST | | CVI_ST | | CVI_ST | | CVI_ST | |(CVI_ST) |
+// S0 | | | | | | | | | | |(C*TMP_ST)
+// S01 | | | | | | | | | VSTU | |(C*_STU) |
+// |=========| |=========| |=========| |=========| |=========| |=========|
+// |=====================| |=====================|
+// | CVI_MPY2 & CVI_MPY3 | |CVI_XLANE & CVI_SHIFT|
+// |=====================| |=====================|
+// S0-3 | CVI_VA_DV | | CVI_VA_DV |
+// S0-3 | | | CVI_VP_DV |
+// S2-3 | CVI_VX_DV | | |
+// |=====================| |=====================|
+// |=====================================================================|
+// S0-3 | CVI_HIST Histogram |
+// S0123| CVI_VA_EXT Extract |
+// |=====================================================================|
+
+// Itinerary table for Hexagon V60. The functional-unit list holds the four
+// issue slots, SLOT_ENDLOOP and the CVI units declared in this file. The
+// non-CVI classes reserve only SLOT* units; the CVI_* classes at the end
+// reserve an issue slot together with one or more CVI units.
+def HexagonItinerariesV60 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+ CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL], [], [
+ // ALU32
+ InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+ [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
+ [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+ [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // ALU64
+ InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // CR -> System
+ InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>,
+ InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>,
+ InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>,
+
+ // Jump (conditional/unconditional/return etc)
+ InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+ // JR
+ InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>,
+ InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>,
+
+ // Extender
+ InstrItinData<EXTENDER_tc_1_SLOT0123, [InstrStage<1,
+ [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // Load
+ InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
+ InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
+
+ // M
+ InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // Store
+ InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+ // S
+ InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ // The S_2op_tc_3x_SLOT23 slots are 4 cycles on v60.
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // New Value Compare Jump
+ InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
+
+ // Mem ops
+ InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>,
+ InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+ // Endloop
+ InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>,
+
+ // Vector
+ InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+ [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<COPROC_VX_vtc_long_SLOT23 ,
+ [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<COPROC_VX_vtc_SLOT23 ,
+ [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<MAPPING_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // Duplex and Compound
+ InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ // Misc
+ InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDOM , [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>]>,
+
+ // Latest CVI spec definitions.
+ // NOTE(review): the trailing 0 in InstrStage<1, [...], 0> is presumably the
+ // time increment to the next stage, i.e. the following stage starts in the
+ // same cycle -- confirm against the InstrStage class definition.
+ InstrItinData<CVI_VA,[InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE,CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VA_DV,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF, CVI_MPY01]>]>,
+ InstrItinData<CVI_VX_LONG, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VX_LATE, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VX,[InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VX_DV_LONG,
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>]>,
+ InstrItinData<CVI_VX_DV,
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>]>,
+ InstrItinData<CVI_VX_DV_SLOT2,
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>]>,
+ InstrItinData<CVI_VP, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>]>,
+ InstrItinData<CVI_VP_LONG, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>]>,
+ InstrItinData<CVI_VP_VS_EARLY,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VP_VS_LONG,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VP_VS,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VP_VS_LONG_EARLY,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VP_DV , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VS,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>]>,
+ InstrItinData<CVI_VINLANESAT,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>]>,
+ InstrItinData<CVI_VM_LD , [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VM_TMP_LD,[InstrStage<1,[SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>]>,
+ InstrItinData<CVI_VM_CUR_LD,[InstrStage<1,[SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>]>,
+ // Unaligned vector load: separate stages reserve SLOT0 and SLOT1.
+ InstrItinData<CVI_VM_VP_LDU,[InstrStage<1,[SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>]>,
+ InstrItinData<CVI_VM_ST , [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VM_NEW_ST,[InstrStage<1,[SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>]>,
+ // Unaligned vector store: separate stages reserve SLOT0 and SLOT1.
+ InstrItinData<CVI_VM_STU , [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>]>,
+ // Histogram reserves the combined CVI_ALL unit.
+ InstrItinData<CVI_HIST , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>]>
+ ]>;
+
+// Machine model for Hexagon V60: a 4-wide issue model whose itinerary data
+// comes from HexagonItinerariesV60.
+def HexagonModelV60 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV60;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V60 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..10730536080e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -0,0 +1,63 @@
+//===-- HexagonSelectionDAGInfo.cpp - Hexagon SelectionDAG Info -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the HexagonSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-selectiondag-info"
+
+/// Lower a memcpy to a call to the Hexagon runtime routine
+/// __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes when that is legal.
+///
+/// The special routine is used only when inlining was not demanded, the
+/// alignment is a multiple of 4, and the size is a compile-time constant that
+/// is at least 32 and a multiple of 8. In every other case an empty SDValue
+/// is returned.
+///
+/// \returns the output chain of the emitted call, or an empty SDValue.
+SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  // Reject forced inlining and alignments that are not a multiple of 4.
+  if (AlwaysInline || (Align & 0x3) != 0)
+    return SDValue();
+
+  // The byte count has to be known at compile time.
+  auto *KnownSize = dyn_cast<ConstantSDNode>(Size);
+  if (!KnownSize)
+    return SDValue();
+
+  // Only sizes >= 32 that are a multiple of 8 qualify.
+  const uint64_t NumBytes = KnownSize->getZExtValue();
+  const bool Qualifies = NumBytes >= 32 && (NumBytes % 8) == 0;
+  if (!Qualifies)
+    return SDValue();
+
+  const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
+
+  // Three pointer-sized arguments: destination, source, byte count.
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Arg;
+  Arg.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+  const SDValue Operands[] = {Dst, Src, Size};
+  for (const SDValue &Operand : Operands) {
+    Arg.Node = Operand;
+    Args.push_back(Arg);
+  }
+
+  // With long calls enabled the callee symbol is flagged as const-extended.
+  const HexagonSubtarget &HST =
+      DAG.getMachineFunction().getSubtarget<HexagonSubtarget>();
+  unsigned Flags = 0;
+  if (HST.useLongCalls())
+    Flags = HexagonII::HMOTF_ConstExtended;
+
+  SDValue Callee = DAG.getTargetExternalSymbol(
+      "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes",
+      TLI.getPointerTy(DAG.getDataLayout()), Flags);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl);
+  CLI.setChain(Chain);
+  CLI.setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args));
+  CLI.setDiscardResult();
+
+  // LowerCallTo yields (result, chain); only the chain is of interest.
+  return TLI.LowerCallTo(CLI).second;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
new file mode 100644
index 000000000000..a83a8efb7588
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -0,0 +1,35 @@
+//===-- HexagonSelectionDAGInfo.h - Hexagon SelectionDAG Info ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Hexagon subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+/// Hexagon-specific SelectionDAGTargetInfo. The only hook it overrides is
+/// the memcpy expansion.
+class HexagonSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+ explicit HexagonSelectionDAGInfo() = default;
+
+ /// Target hook for expanding a memcpy; see the definition in
+ /// HexagonSelectionDAGInfo.cpp for the conditions under which a call is
+ /// emitted.
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
new file mode 100644
index 000000000000..68484344fded
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -0,0 +1,115 @@
+//=== HexagonSplitConst32AndConst64.cpp - split CONST32/Const64 into HI/LO ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When the compiler is invoked with no small data, for instance, with the -G0
+// command line option, then all CONST* opcodes should be broken down into
+// appropriate LO and HI instructions. This splitting is done by this pass.
+// The only reason this is not done in the DAG lowering itself is that there
+// is no simple way of getting the register allocator to allot the same hard
+// register to the result of LO and HI instructions. This pass is always
+// scheduled after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xfer"
+
+// Forward declarations: the pass factory defined at the end of this file and
+// the pass-registry initializer that the pass constructor calls.
+namespace llvm {
+ FunctionPass *createHexagonSplitConst32AndConst64();
+ void initializeHexagonSplitConst32AndConst64Pass(PassRegistry&);
+}
+
+namespace {
+  /// Machine function pass that rewrites the CONST32 / CONST64 pseudo
+  /// instructions as immediate transfers. It requires that no virtual
+  /// registers remain in the function.
+  class HexagonSplitConst32AndConst64 : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonSplitConst32AndConst64() : MachineFunctionPass(ID) {
+      initializeHexagonSplitConst32AndConst64Pass(
+          *PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    StringRef getPassName() const override {
+      return "Hexagon Split Const32s and Const64s";
+    }
+
+    /// The pass declares NoVRegs as a required function property.
+    MachineFunctionProperties getRequiredProperties() const override {
+      MachineFunctionProperties Required;
+      Required.set(MachineFunctionProperties::Property::NoVRegs);
+      return Required;
+    }
+  };
+}
+
+// Static pass identifier; the constructor hands it to MachineFunctionPass.
+char HexagonSplitConst32AndConst64::ID = 0;
+
+// Registers the pass under the name "split-const-for-sdata".
+INITIALIZE_PASS(HexagonSplitConst32AndConst64, "split-const-for-sdata",
+ "Hexagon Split Const32s and Const64s", false, false)
+
+/// Replace every CONST32 / CONST64 pseudo instruction in \p Fn with A2_tfrsi
+/// immediate transfers. A CONST64 becomes two transfers, one into each 32-bit
+/// half (isub_lo / isub_hi) of the destination register pair.
+///
+/// Nothing is done when small data is enabled in the target object file.
+///
+/// \returns true if at least one instruction was rewritten, false if the
+///          function was left untouched.
+bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
+  const HexagonTargetObjectFile &TLOF =
+      *static_cast<const HexagonTargetObjectFile *>(
+          Fn.getTarget().getObjFileLowering());
+  // With small data enabled the CONST* opcodes are kept as they are, so the
+  // function is not modified.
+  if (TLOF.isSmallDataEnabled())
+    return false;
+
+  const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+  bool Changed = false;
+
+  // Loop over all of the basic blocks
+  for (MachineBasicBlock &B : Fn) {
+    for (auto I = B.begin(), E = B.end(); I != E; ) {
+      MachineInstr &MI = *I;
+      // Step past MI before it can be erased below.
+      ++I;
+      unsigned Opc = MI.getOpcode();
+
+      if (Opc == Hexagon::CONST32) {
+        unsigned DestReg = MI.getOperand(0).getReg();
+        int64_t ImmValue = MI.getOperand(1).getImm();
+        const DebugLoc &DL = MI.getDebugLoc();
+        BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), DestReg)
+            .addImm(ImmValue);
+        B.erase(&MI);
+        Changed = true;
+      } else if (Opc == Hexagon::CONST64) {
+        unsigned DestReg = MI.getOperand(0).getReg();
+        int64_t ImmValue = MI.getOperand(1).getImm();
+        const DebugLoc &DL = MI.getDebugLoc();
+        unsigned DestLo = TRI->getSubReg(DestReg, Hexagon::isub_lo);
+        unsigned DestHi = TRI->getSubReg(DestReg, Hexagon::isub_hi);
+
+        // Split the 64-bit immediate into its two 32-bit words.
+        int32_t LowWord = (ImmValue & 0xFFFFFFFF);
+        int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF;
+
+        BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), DestLo)
+            .addImm(LowWord);
+        BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), DestHi)
+            .addImm(HighWord);
+        B.erase(&MI);
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+/// Factory: returns a newly allocated HexagonSplitConst32AndConst64 pass.
+FunctionPass *llvm::createHexagonSplitConst32AndConst64() {
+ return new HexagonSplitConst32AndConst64();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
new file mode 100644
index 000000000000..2c937216d463
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -0,0 +1,1205 @@
+//===--- HexagonSplitDouble.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hsdr"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+// Forward declarations of the pass factory and of the pass-registry
+// initializer that the pass constructor calls.
+namespace llvm {
+
+ FunctionPass *createHexagonSplitDoubleRegs();
+ void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+ // Hidden option -max-hsdr (default -1).
+ // NOTE(review): presumably -1 means "no limit"; confirm at the use site.
+ static cl::opt<int> MaxHSDR("max-hsdr", cl::Hidden, cl::init(-1),
+ cl::desc("Maximum number of split partitions"));
+ // Hidden option -hsdr-no-mem (default true). While set, isFixedInstr()
+ // reports every instruction that may load or store as fixed.
+ static cl::opt<bool> MemRefsFixed("hsdr-no-mem", cl::Hidden, cl::init(true),
+ cl::desc("Do not split loads or stores"));
+
+ /// Machine function pass "Hexagon Split Double Registers". It partitions
+ /// the virtual registers of class DoubleRC, estimates per partition whether
+ /// splitting pays off, and splits the profitable partitions.
+ class HexagonSplitDoubleRegs : public MachineFunctionPass {
+ public:
+   static char ID;
+
+   // All cached pointers start out null so that none of them can be read
+   // uninitialized.
+   HexagonSplitDoubleRegs() : MachineFunctionPass(ID), TRI(nullptr),
+       TII(nullptr), MLI(nullptr), MRI(nullptr) {
+     initializeHexagonSplitDoubleRegsPass(*PassRegistry::getPassRegistry());
+   }
+
+   StringRef getPassName() const override {
+     return "Hexagon Split Double Registers";
+   }
+
+   // MachineLoopInfo is both required and preserved.
+   void getAnalysisUsage(AnalysisUsage &AU) const override {
+     AU.addRequired<MachineLoopInfo>();
+     AU.addPreserved<MachineLoopInfo>();
+     MachineFunctionPass::getAnalysisUsage(AU);
+   }
+
+   bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+   // Register class of the registers this pass considers for splitting.
+   static const TargetRegisterClass *const DoubleRC;
+
+   const HexagonRegisterInfo *TRI;
+   const HexagonInstrInfo *TII;
+   const MachineLoopInfo *MLI;
+   MachineRegisterInfo *MRI;
+
+   typedef std::set<unsigned> USet;
+   typedef std::map<unsigned,USet> UUSetMap;
+   typedef std::pair<unsigned,unsigned> UUPair;
+   typedef std::map<unsigned,UUPair> UUPairMap;
+   typedef std::map<const MachineLoop*,USet> LoopRegMap;
+
+   // Analysis: classification of instructions and profitability estimate.
+   bool isInduction(unsigned Reg, LoopRegMap &IRM) const;
+   bool isVolatileInstr(const MachineInstr *MI) const;
+   bool isFixedInstr(const MachineInstr *MI) const;
+   void partitionRegisters(UUSetMap &P2Rs);
+   int32_t profit(const MachineInstr *MI) const;
+   bool isProfitable(const USet &Part, LoopRegMap &IRM) const;
+
+   void collectIndRegsForLoop(const MachineLoop *L, USet &Rs);
+   void collectIndRegs(LoopRegMap &IRM);
+
+   // Transformation: per-opcode splitting helpers.
+   void createHalfInstr(unsigned Opc, MachineInstr *MI,
+                        const UUPairMap &PairMap, unsigned SubR);
+   void splitMemRef(MachineInstr *MI, const UUPairMap &PairMap);
+   void splitImmediate(MachineInstr *MI, const UUPairMap &PairMap);
+   void splitCombine(MachineInstr *MI, const UUPairMap &PairMap);
+   void splitExt(MachineInstr *MI, const UUPairMap &PairMap);
+   void splitShift(MachineInstr *MI, const UUPairMap &PairMap);
+   void splitAslOr(MachineInstr *MI, const UUPairMap &PairMap);
+   bool splitInstr(MachineInstr *MI, const UUPairMap &PairMap);
+   void replaceSubregUses(MachineInstr *MI, const UUPairMap &PairMap);
+   void collapseRegPairs(MachineInstr *MI, const UUPairMap &PairMap);
+   bool splitPartition(const USet &Part);
+
+   static int Counter;
+   static void dump_partition(raw_ostream&, const USet&,
+                              const TargetRegisterInfo&);
+ };
+
+ // Definitions of the static data members of HexagonSplitDoubleRegs.
+ // ID has static storage duration and no initializer, so it starts at zero.
+ char HexagonSplitDoubleRegs::ID;
+ int HexagonSplitDoubleRegs::Counter = 0;
+ // Only registers of this class (Hexagon::DoubleRegsRegClass) are examined
+ // by partitionRegisters().
+ const TargetRegisterClass *const HexagonSplitDoubleRegs::DoubleRC
+ = &Hexagon::DoubleRegsRegClass;
+
+} // end anonymous namespace
+
+// Registers the pass under the name "hexagon-split-double".
+INITIALIZE_PASS(HexagonSplitDoubleRegs, "hexagon-split-double",
+ "Hexagon Split Double Registers", false, false)
+
+/// Print the registers of \p Part to \p os in the form "{ r1 r2 ... }".
+///
+/// \param os   stream that receives the text.
+/// \param Part set of register numbers to print.
+/// \param TRI  register info used by PrintReg to format each register.
+void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os,
+      const USet &Part, const TargetRegisterInfo &TRI) {
+  // Write to the stream the caller asked for instead of hard-wiring dbgs().
+  os << '{';
+  for (unsigned R : Part)
+    os << ' ' << PrintReg(R, &TRI);
+  os << " }";
+}
+
+/// Returns true if \p Reg is a member of the register set recorded in
+/// \p IRM for at least one loop.
+bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const {
+  // Bind by reference: each map entry owns a std::set, and iterating by
+  // value would copy that whole set on every step.
+  for (const auto &LoopRegs : IRM)
+    if (LoopRegs.second.count(Reg))
+      return true;
+  return false;
+}
+
+/// Returns true when at least one memory operand attached to \p MI is marked
+/// volatile.
+bool HexagonSplitDoubleRegs::isVolatileInstr(const MachineInstr *MI) const {
+  return std::any_of(MI->memoperands_begin(), MI->memoperands_end(),
+                     [](const MachineMemOperand *MMO) {
+                       return MMO->isVolatile();
+                     });
+}
+
+/// Decide whether \p MI is "fixed", i.e. must not be split.
+///
+/// An instruction is fixed when
+///  - it may load or store and either memory references are kept fixed
+///    (MemRefsFixed) or the access is volatile, or
+///  - its opcode is not one of the opcodes listed in the switch below, or
+///  - it is a 64-bit load/store with an immediate offset whose address
+///    operand is not a register, or
+///  - any of its register operands is not a virtual register.
+/// Debug values that pass the memory check are never fixed.
+bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const {
+  const bool TouchesMemory = MI->mayLoad() || MI->mayStore();
+  if (TouchesMemory && (MemRefsFixed || isVolatileInstr(MI)))
+    return true;
+  if (MI->isDebugValue())
+    return false;
+
+  switch (MI->getOpcode()) {
+    case TargetOpcode::PHI:
+    case TargetOpcode::COPY:
+      break;
+
+    case Hexagon::L2_loadrd_io:
+      // Not handling stack loads (only reg-based addresses).
+      if (!MI->getOperand(1).isReg())
+        return true;
+      break;
+    case Hexagon::S2_storerd_io:
+      // Not handling stack stores (only reg-based addresses).
+      if (!MI->getOperand(0).isReg())
+        return true;
+      break;
+
+    // Opcodes that are accepted without a further opcode-specific check.
+    case Hexagon::L2_loadrd_pi:
+    case Hexagon::S2_storerd_pi:
+    case Hexagon::A2_tfrpi:
+    case Hexagon::A2_combineii:
+    case Hexagon::A4_combineir:
+    case Hexagon::A4_combineii:
+    case Hexagon::A4_combineri:
+    case Hexagon::A2_combinew:
+    case Hexagon::CONST64:
+    case Hexagon::A2_sxtw:
+    case Hexagon::A2_andp:
+    case Hexagon::A2_orp:
+    case Hexagon::A2_xorp:
+    case Hexagon::S2_asl_i_p_or:
+    case Hexagon::S2_asl_i_p:
+    case Hexagon::S2_asr_i_p:
+    case Hexagon::S2_lsr_i_p:
+      break;
+
+    default:
+      // Any other opcode is treated as fixed.
+      return true;
+  }
+
+  // A single non-virtual register operand makes the instruction fixed.
+  for (const MachineOperand &MO : MI->operands())
+    if (MO.isReg() && !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      return true;
+  return false;
+}
+
+// Group the virtual registers of class DoubleRC into partitions and store
+// them in P2Rs (partition id -> set of registers).
+//
+// Two registers are associated when one of them is used in a non-fixed
+// instruction that also has the other one as a register operand without a
+// subregister index. Registers that are reachable from each other through
+// such associations receive the same partition id. A work-list walk that
+// starts from a "fixed" register uses id 0; walks that start from any other
+// register use fresh ids beginning at 1.
+void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
+ typedef std::map<unsigned,unsigned> UUMap;
+ typedef std::vector<unsigned> UVect;
+
+ // Step 1: collect the indices of all virtual registers of class DoubleRC.
+ unsigned NumRegs = MRI->getNumVirtRegs();
+ BitVector DoubleRegs(NumRegs);
+ for (unsigned i = 0; i < NumRegs; ++i) {
+ unsigned R = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI->getRegClass(R) == DoubleRC)
+ DoubleRegs.set(i);
+ }
+
+ // Step 2: mark registers whose definition is missing or is a fixed
+ // instruction.
+ BitVector FixedRegs(NumRegs);
+ for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
+ unsigned R = TargetRegisterInfo::index2VirtReg(x);
+ MachineInstr *DefI = MRI->getVRegDef(R);
+ // In some cases a register may exist, but never be defined or used.
+ // It should never appear anywhere, but mark it as "fixed", just to be
+ // safe.
+ if (!DefI || isFixedInstr(DefI))
+ FixedRegs.set(x);
+ }
+
+ // Step 3: build the symmetric association map between non-fixed registers.
+ // FixedRegs may still grow inside this loop, so the outcome depends on the
+ // order in which registers are visited.
+ UUSetMap AssocMap;
+ for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
+ if (FixedRegs[x])
+ continue;
+ unsigned R = TargetRegisterInfo::index2VirtReg(x);
+ DEBUG(dbgs() << PrintReg(R, TRI) << " ~~");
+ USet &Asc = AssocMap[R];
+ for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end();
+ U != Z; ++U) {
+ MachineOperand &Op = *U;
+ MachineInstr *UseI = Op.getParent();
+ if (isFixedInstr(UseI))
+ continue;
+ for (unsigned i = 0, n = UseI->getNumOperands(); i < n; ++i) {
+ MachineOperand &MO = UseI->getOperand(i);
+ // Skip non-registers or registers with subregisters.
+ if (&MO == &Op || !MO.isReg() || MO.getSubReg())
+ continue;
+ unsigned T = MO.getReg();
+ // A non-virtual register next to R in the same instruction makes R
+ // itself fixed.
+ if (!TargetRegisterInfo::isVirtualRegister(T)) {
+ FixedRegs.set(x);
+ continue;
+ }
+ if (MRI->getRegClass(T) != DoubleRC)
+ continue;
+ unsigned u = TargetRegisterInfo::virtReg2Index(T);
+ if (FixedRegs[u])
+ continue;
+ DEBUG(dbgs() << ' ' << PrintReg(T, TRI));
+ Asc.insert(T);
+ // Make it symmetric.
+ AssocMap[T].insert(R);
+ }
+ }
+ DEBUG(dbgs() << '\n');
+ }
+
+ // Step 4: assign partition ids by walking the association map with a work
+ // list that is extended while it is being traversed.
+ UUMap R2P;
+ unsigned NextP = 1;
+ USet Visited;
+ for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
+ unsigned R = TargetRegisterInfo::index2VirtReg(x);
+ if (Visited.count(R))
+ continue;
+ // Create a new partition for R.
+ unsigned ThisP = FixedRegs[x] ? 0 : NextP++;
+ UVect WorkQ;
+ WorkQ.push_back(R);
+ for (unsigned i = 0; i < WorkQ.size(); ++i) {
+ unsigned T = WorkQ[i];
+ if (Visited.count(T))
+ continue;
+ R2P[T] = ThisP;
+ Visited.insert(T);
+ // Add all registers associated with T.
+ USet &Asc = AssocMap[T];
+ for (USet::iterator J = Asc.begin(), F = Asc.end(); J != F; ++J)
+ WorkQ.push_back(*J);
+ }
+ }
+
+ // Step 5: invert register -> partition into partition -> registers.
+ for (auto I : R2P)
+ P2Rs[I.second].insert(I.first);
+}
+
+/// Score a 64-bit immediate given as two 32-bit words.
+///
+/// Each word that is all zeros or all ones (0 or 0xFFFFFFFF) is worth 10.
+/// When neither word is of that kind, two equal words are worth 3 and
+/// anything else is worth 0.
+static inline int32_t profitImm(unsigned Lo, unsigned Hi) {
+  const bool LoTrivial = (Lo == 0 || Lo == 0xFFFFFFFF);
+  const bool HiTrivial = (Hi == 0 || Hi == 0xFFFFFFFF);
+
+  if (LoTrivial || HiTrivial)
+    return (LoTrivial ? 10 : 0) + (HiTrivial ? 10 : 0);
+
+  return Lo == Hi ? 3 : 0;
+}
+
+/// Estimate how much is gained (positive) or lost (negative) by splitting the
+/// instruction \p MI into operations on 32-bit halves. Opcodes that are not
+/// listed score 0.
+int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
+  const unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    default:
+      break;
+
+    case TargetOpcode::PHI:
+      // Worth 10 only if every operand carries a subregister index.
+      for (const MachineOperand &Op : MI->operands())
+        if (Op.getSubReg() == 0)
+          return 0;
+      return 10;
+
+    case TargetOpcode::COPY:
+      // A copy from a subregister is worth 10, a plain copy nothing.
+      return MI->getOperand(1).getSubReg() != 0 ? 10 : 0;
+
+    case Hexagon::L2_loadrd_io:
+    case Hexagon::S2_storerd_io:
+      return -1;
+    case Hexagon::L2_loadrd_pi:
+    case Hexagon::S2_storerd_pi:
+      return 2;
+
+    case Hexagon::A2_tfrpi:
+    case Hexagon::CONST64: {
+      // Score the two 32-bit words of the 64-bit immediate.
+      const uint64_t Imm = MI->getOperand(1).getImm();
+      const unsigned Lo = Imm & 0xFFFFFFFFULL;
+      const unsigned Hi = Imm >> 32;
+      return profitImm(Lo, Hi);
+    }
+
+    case Hexagon::A2_combineii:
+    case Hexagon::A4_combineii:
+      return profitImm(MI->getOperand(1).getImm(),
+                       MI->getOperand(2).getImm());
+
+    case Hexagon::A4_combineri:
+    case Hexagon::A4_combineir: {
+      // The immediate is operand 2 of combineri and operand 1 of combineir.
+      const unsigned ImmIdx = (Opc == Hexagon::A4_combineri) ? 2 : 1;
+      const int64_t V = MI->getOperand(ImmIdx).getImm();
+      // Immediates 0 and -1 score 10; any other value scores the same as
+      // A2_combinew.
+      return (V == 0 || V == -1) ? 10 : 2;
+    }
+    case Hexagon::A2_combinew:
+      return 2;
+
+    case Hexagon::A2_sxtw:
+      return 3;
+
+    case Hexagon::A2_andp:
+    case Hexagon::A2_orp:
+    case Hexagon::A2_xorp:
+      return 1;
+
+    case Hexagon::S2_asl_i_p_or: {
+      const unsigned S = MI->getOperand(3).getImm();
+      return (S == 0 || S == 32) ? 10 : -1;
+    }
+
+    case Hexagon::S2_asl_i_p:
+    case Hexagon::S2_asr_i_p:
+    case Hexagon::S2_lsr_i_p: {
+      // The score depends on the shift amount.
+      const unsigned S = MI->getOperand(2).getImm();
+      switch (S) {
+        case 0:
+        case 32:
+          return 10;
+        case 16:
+          return 5;
+        case 48:
+          return 7;
+        default:
+          return -10;
+      }
+    }
+  }
+
+  return 0;
+}
+
+// Decide whether the registers in partition Part should be split. The
+// decision is the sign of an accumulated score: profit() of every defining
+// and every splittable using instruction, minus penalties for induction
+// registers, for subregister operands of fixed users, and for mixing fixed
+// users with loop-header phis.
+bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
+      const {
+  const int Unprofitable = std::numeric_limits<int>::min();
+  unsigned FixedNum = 0, SplitNum = 0, LoopPhiNum = 0;
+  int32_t TotalP = 0;
+
+  for (unsigned DR : Part) {
+    const int32_t DefP = profit(MRI->getVRegDef(DR));
+    if (DefP == Unprofitable)
+      return false;
+    TotalP += DefP;
+
+    // Splitting induction registers is made less attractive.
+    if (isInduction(DR, IRM))
+      TotalP -= 30;
+
+    for (auto U = MRI->use_nodbg_begin(DR), UEnd = MRI->use_nodbg_end();
+         U != UEnd; ++U) {
+      MachineInstr *UseI = U->getParent();
+
+      if (isFixedInstr(UseI)) {
+        ++FixedNum;
+        // Account for the REG_SEQUENCE instructions that would have to be
+        // generated for this user.
+        for (auto &Op : UseI->operands())
+          if (Op.isReg() && Part.count(Op.getReg()) && Op.getSubReg())
+            TotalP -= 2;
+        continue;
+      }
+
+      // Count phis located in the header of their loop. Together with
+      // fixed users they reduce the profit below, since that combination
+      // can confuse the modulo scheduler.
+      if (UseI->isPHI()) {
+        const MachineBasicBlock *PB = UseI->getParent();
+        const MachineLoop *L = MLI->getLoopFor(PB);
+        if (L && L->getHeader() == PB)
+          ++LoopPhiNum;
+      }
+
+      // A user that will itself be split.
+      ++SplitNum;
+      const int32_t UseP = profit(UseI);
+      if (UseP == Unprofitable)
+        return false;
+      TotalP += UseP;
+    }
+  }
+
+  if (FixedNum > 0 && LoopPhiNum > 0)
+    TotalP -= 20*LoopPhiNum;
+
+  DEBUG(dbgs() << "Partition profit: " << TotalP << '\n');
+  return TotalP > 0;
+}
+
+// Collect into Rs the 64-bit registers that look like induction registers
+// of loop L: the double-register inputs of the compare that controls the
+// latch branch, plus every double register defined by a header phi that
+// feeds an A2_addp whose result is one of those compare inputs.
+// Rs is left untouched if the latch branch or the compare cannot be
+// analyzed, or if no double register is involved.
+void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
+      USet &Rs) {
+  const MachineBasicBlock *HB = L->getHeader();
+  const MachineBasicBlock *LB = L->getLoopLatch();
+  if (!HB || !LB)
+    return;
+
+  // Examine the latch branch. Expect it to be a conditional branch to
+  // the header (either "br-cond header" or "br-cond exit; br header").
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  MachineBasicBlock *TmpLB = const_cast<MachineBasicBlock*>(LB);
+  SmallVector<MachineOperand,2> Cond;
+  bool BadLB = TII->analyzeBranch(*TmpLB, TB, FB, Cond, false);
+  // Only analyzable conditional branches. HII::analyzeBranch will put
+  // the branch opcode as the first element of Cond, and the predicate
+  // operand as the second.
+  if (BadLB || Cond.size() != 2)
+    return;
+  // Only simple jump-conditional (with or without negation).
+  if (!TII->PredOpcodeHasJMP_c(Cond[0].getImm()))
+    return;
+  // Must go to the header.
+  if (TB != HB && FB != HB)
+    return;
+  assert(Cond[1].isReg() && "Unexpected Cond vector from analyzeBranch");
+  // Expect a predicate register.
+  unsigned PR = Cond[1].getReg();
+  assert(MRI->getRegClass(PR) == &Hexagon::PredRegsRegClass);
+
+  // Get the registers on which the loop controlling compare instruction
+  // depends. Look through any chain of C2_not to reach the compare.
+  unsigned CmpR1 = 0, CmpR2 = 0;
+  const MachineInstr *CmpI = MRI->getVRegDef(PR);
+  while (CmpI->getOpcode() == Hexagon::C2_not)
+    CmpI = MRI->getVRegDef(CmpI->getOperand(1).getReg());
+
+  int Mask = 0, Val = 0;
+  bool OkCI = TII->analyzeCompare(*CmpI, CmpR1, CmpR2, Mask, Val);
+  if (!OkCI)
+    return;
+  // Eliminate non-double input registers.
+  if (CmpR1 && MRI->getRegClass(CmpR1) != DoubleRC)
+    CmpR1 = 0;
+  if (CmpR2 && MRI->getRegClass(CmpR2) != DoubleRC)
+    CmpR2 = 0;
+  if (!CmpR1 && !CmpR2)
+    return;
+
+  // Now examine the top of the loop: the phi nodes that could poten-
+  // tially define loop induction registers. The registers defined by
+  // such a phi node would be used in a 64-bit add, which then would
+  // be used in the loop compare instruction.
+
+  // Get the set of all double registers defined by phi nodes in the
+  // loop header.
+  typedef std::vector<unsigned> UVect;
+  UVect DP;
+  for (auto &MI : *HB) {
+    if (!MI.isPHI())
+      break;
+    const MachineOperand &MD = MI.getOperand(0);
+    unsigned R = MD.getReg();
+    if (MRI->getRegClass(R) == DoubleRC)
+      DP.push_back(R);
+  }
+  if (DP.empty())
+    return;
+
+  // Predicate: true if R does not feed an add that produces a compare input.
+  auto NoIndOp = [this, CmpR1, CmpR2] (unsigned R) -> bool {
+    for (auto I = MRI->use_nodbg_begin(R), E = MRI->use_nodbg_end();
+         I != E; ++I) {
+      const MachineInstr *UseI = I->getParent();
+      if (UseI->getOpcode() != Hexagon::A2_addp)
+        continue;
+      // Get the output from the add. If it is one of the inputs to the
+      // loop-controlling compare instruction, then R is likely an induc-
+      // tion register.
+      unsigned T = UseI->getOperand(0).getReg();
+      if (T == CmpR1 || T == CmpR2)
+        return false;
+    }
+    return true;
+  };
+  UVect::iterator End = llvm::remove_if(DP, NoIndOp);
+  Rs.insert(DP.begin(), End);
+  // A compare input that was eliminated above is 0; do not record that
+  // placeholder as if it were a register.
+  if (CmpR1)
+    Rs.insert(CmpR1);
+  if (CmpR2)
+    Rs.insert(CmpR2);
+
+  DEBUG({
+    dbgs() << "For loop at BB#" << HB->getNumber() << " ind regs: ";
+    dump_partition(dbgs(), Rs, *TRI);
+    dbgs() << '\n';
+  });
+}
+
+// Populate IRM with, for every loop in the function (nested loops included),
+// the set of registers found by collectIndRegsForLoop. Loops for which the
+// set is empty get no entry.
+void HexagonSplitDoubleRegs::collectIndRegs(LoopRegMap &IRM) {
+  typedef std::vector<MachineLoop*> LoopVector;
+  LoopVector Loops;
+
+  // Start from the top-level loops...
+  for (auto TopL : *MLI)
+    Loops.push_back(TopL);
+  // ...and append the sub-loops of every loop already in the list. The
+  // list grows while it is being walked, hence the index-based loop.
+  for (unsigned Idx = 0; Idx != Loops.size(); ++Idx)
+    for (auto SubL : *Loops[Idx])
+      Loops.push_back(SubL);
+
+  for (MachineLoop *L : Loops) {
+    USet IndRegs;
+    collectIndRegsForLoop(L, IndRegs);
+    if (IndRegs.empty())
+      continue;
+    IRM.insert(std::make_pair(L, IndRegs));
+  }
+}
+
+// Insert, before MI, a new instruction with opcode Opc that mirrors MI's
+// operands for one 32-bit half. Non-register operands are copied as they
+// are. A virtual double-register operand is replaced by the matching member
+// of its pair in PairMap (first for isub_lo, second otherwise) or, if it has
+// no pair, by the same register with subregister SubR; its kill flag is
+// cleared in both cases.
+void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI,
+      const UUPairMap &PairMap, unsigned SubR) {
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc Loc = MI->getDebugLoc();
+  MachineInstr *HalfI = BuildMI(MBB, MI, Loc, TII->get(Opc));
+
+  for (auto &Op : MI->operands()) {
+    if (!Op.isReg()) {
+      HalfI->addOperand(Op);
+      continue;
+    }
+
+    unsigned Reg = Op.getReg();
+    unsigned SubIdx = Op.getSubReg();
+    bool Kill = Op.isKill();
+
+    const bool IsDoubleVReg = TargetRegisterInfo::isVirtualRegister(Reg) &&
+                              MRI->getRegClass(Reg) == DoubleRC;
+    if (IsDoubleVReg) {
+      Kill = false;
+      UUPairMap::const_iterator F = PairMap.find(Reg);
+      if (F != PairMap.end()) {
+        // Mapped register: use the already-created 32-bit half directly.
+        const UUPair &Halves = F->second;
+        Reg = (SubR == Hexagon::isub_lo) ? Halves.first : Halves.second;
+        SubIdx = 0;
+      } else {
+        // Unmapped register: refer to its half via the subregister index.
+        SubIdx = SubR;
+      }
+    }
+
+    HalfI->addOperand(MachineOperand::CreateReg(
+        Reg, Op.isDef(), Op.isImplicit(), Kill, Op.isDead(), Op.isUndef(),
+        Op.isEarlyClobber(), SubIdx, Op.isDebug(), Op.isInternalRead()));
+  }
+}
+
+// Replace a 64-bit load or store (L2_loadrd_io/_pi, S2_storerd_io/_pi) by
+// two 32-bit accesses at offsets Off and Off+4 from the same base register,
+// using the register pair that PairMap holds for the value operand. For the
+// post-increment forms the accesses use offset 0 and the address update is
+// emitted as a separate A2_addi. MI itself is not erased here.
+//
+// Operand positions used below:
+//   load:          0 = value, 1 = base,    2 = offset
+//   load post-inc: 0 = value, 1 = updated, 2 = base,      3 = increment
+//   store:         0 = base,  1 = offset,  2 = value
+//   store post-inc: 0 = updated, 1 = base, 2 = increment, 3 = value
+void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  bool Load = MI->mayLoad();
+  unsigned OrigOpc = MI->getOpcode();
+  bool PostInc = (OrigOpc == Hexagon::L2_loadrd_pi ||
+                  OrigOpc == Hexagon::S2_storerd_pi);
+  MachineInstr *LowI, *HighI;
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Index of the base-address-register operand.
+  unsigned AdrX = PostInc ? (Load ? 2 : 1)
+                          : (Load ? 1 : 0);
+  MachineOperand &AdrOp = MI->getOperand(AdrX);
+  unsigned RSA = getRegState(AdrOp);
+  MachineOperand &ValOp = Load ? MI->getOperand(0)
+                               : (PostInc ? MI->getOperand(3)
+                                          : MI->getOperand(2));
+  UUPairMap::const_iterator F = PairMap.find(ValOp.getReg());
+  assert(F != PairMap.end());
+
+  // The base register is read by both halves (and possibly by the address
+  // update below), so the kill flag is dropped on these reads.
+  if (Load) {
+    const UUPair &P = F->second;
+    int64_t Off = PostInc ? 0 : MI->getOperand(2).getImm();
+    LowI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.first)
+             .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+             .addImm(Off);
+    HighI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.second)
+              .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+              .addImm(Off+4);
+  } else {
+    const UUPair &P = F->second;
+    int64_t Off = PostInc ? 0 : MI->getOperand(1).getImm();
+    LowI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io))
+             .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+             .addImm(Off)
+             .addReg(P.first);
+    HighI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io))
+              .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+              .addImm(Off+4)
+              .addReg(P.second);
+  }
+
+  if (PostInc) {
+    // Create the increment of the address register.
+    int64_t Inc = Load ? MI->getOperand(3).getImm()
+                       : MI->getOperand(2).getImm();
+    MachineOperand &UpdOp = Load ? MI->getOperand(1) : MI->getOperand(0);
+    const TargetRegisterClass *RC = MRI->getRegClass(UpdOp.getReg());
+    unsigned NewR = MRI->createVirtualRegister(RC);
+    assert(!UpdOp.getSubReg() && "Def operand with subreg");
+    BuildMI(B, MI, DL, TII->get(Hexagon::A2_addi), NewR)
+      .addReg(AdrOp.getReg(), RSA)
+      .addImm(Inc);
+    MRI->replaceRegWith(UpdOp.getReg(), NewR);
+    // The original instruction will be deleted later.
+  }
+
+  // Generate a new pair of memory-operands.
+  MachineFunction &MF = *B.getParent();
+  for (auto &MO : MI->memoperands()) {
+    const MachinePointerInfo &Ptr = MO->getPointerInfo();
+    MachineMemOperand::Flags MMOFlags = MO->getFlags();
+    int A = MO->getAlignment();
+
+    auto *Tmp1 = MF.getMachineMemOperand(Ptr, MMOFlags, 4/*size*/, A);
+    LowI->addMemOperand(MF, Tmp1);
+    // The high word lives 4 bytes past the original location, so its
+    // memory operand must describe that address rather than the low word's.
+    auto *Tmp2 = MF.getMachineMemOperand(Ptr.getWithOffset(4), MMOFlags,
+                                         4/*size*/, std::min(A, 4));
+    HighI->addMemOperand(MF, Tmp2);
+  }
+}
+
+// Replace a 64-bit immediate transfer (register operand 0, immediate
+// operand 1) by two A2_tfrsi instructions that load the low and high
+// 32 bits into the register pair mapped for operand 0.
+void HexagonSplitDoubleRegs::splitImmediate(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  MachineOperand &DstOp = MI->getOperand(0);
+  MachineOperand &ImmOp = MI->getOperand(1);
+  assert(DstOp.isReg() && ImmOp.isImm());
+  uint64_t Imm64 = ImmOp.getImm();
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc Loc = MI->getDebugLoc();
+  UUPairMap::const_iterator F = PairMap.find(DstOp.getReg());
+  assert(F != PairMap.end());
+  const UUPair &Halves = F->second;
+
+  // A2_tfrsi takes an operand with only 32 significant bits, but
+  // MachineOperand stores immediates as 64-bit integers. The value -1 could
+  // therefore be held as 64-bit -1 or as 4294967295. Both are truncated to
+  // the same 32 bits in the end, yet the second form may look like a large
+  // unsigned value that needs a constant extender. Going through int32_t
+  // picks the first (sign-extended) form; the same holds for every 32-bit
+  // value.
+  const int32_t LoImm = int32_t(Imm64 & 0xFFFFFFFFULL);
+  const int32_t HiImm = int32_t(Imm64 >> 32);
+
+  BuildMI(MBB, MI, Loc, TII->get(Hexagon::A2_tfrsi), Halves.first)
+    .addImm(LoImm);
+  BuildMI(MBB, MI, Loc, TII->get(Hexagon::A2_tfrsi), Halves.second)
+    .addImm(HiImm);
+}
+
+// Replace a combine instruction by one instruction per half: operand 1
+// provides the second (high) register of the pair mapped for operand 0,
+// operand 2 provides the first (low) one. An immediate source becomes an
+// A2_tfrsi, a register source becomes a COPY.
+void HexagonSplitDoubleRegs::splitCombine(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  MachineOperand &Op0 = MI->getOperand(0);
+  MachineOperand &Op1 = MI->getOperand(1);
+  MachineOperand &Op2 = MI->getOperand(2);
+  assert(Op0.isReg());
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc Loc = MI->getDebugLoc();
+  UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+  assert(F != PairMap.end());
+  const UUPair &Halves = F->second;
+
+  // Emit the instruction that fills DstR from the source operand Src.
+  auto EmitHalf = [&] (const MachineOperand &Src, unsigned DstR) {
+    if (Src.isImm()) {
+      BuildMI(MBB, MI, Loc, TII->get(Hexagon::A2_tfrsi), DstR)
+        .addImm(Src.getImm());
+      return;
+    }
+    if (Src.isReg()) {
+      BuildMI(MBB, MI, Loc, TII->get(TargetOpcode::COPY), DstR)
+        .addReg(Src.getReg(), getRegState(Src), Src.getSubReg());
+      return;
+    }
+    llvm_unreachable("Unexpected operand");
+  };
+
+  EmitHalf(Op1, Halves.second);
+  EmitHalf(Op2, Halves.first);
+}
+
+// Replace a sign-extension to 64 bits (A2_sxtw) by a COPY of the source
+// into the low register and an arithmetic shift right by 31 of the source
+// into the high register of the pair mapped for operand 0.
+void HexagonSplitDoubleRegs::splitExt(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  MachineOperand &DstOp = MI->getOperand(0);
+  MachineOperand &SrcOp = MI->getOperand(1);
+  assert(DstOp.isReg() && SrcOp.isReg());
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc Loc = MI->getDebugLoc();
+  UUPairMap::const_iterator F = PairMap.find(DstOp.getReg());
+  assert(F != PairMap.end());
+  const UUPair &Halves = F->second;
+
+  const unsigned SrcState = getRegState(SrcOp);
+  const unsigned SrcReg = SrcOp.getReg();
+  const unsigned SrcSub = SrcOp.getSubReg();
+
+  // The source is read twice; only the second read keeps the kill flag.
+  BuildMI(MBB, MI, Loc, TII->get(TargetOpcode::COPY), Halves.first)
+    .addReg(SrcReg, SrcState & ~RegState::Kill, SrcSub);
+  BuildMI(MBB, MI, Loc, TII->get(Hexagon::S2_asr_i_r), Halves.second)
+    .addReg(SrcReg, SrcState, SrcSub)
+    .addImm(31);
+}
+
+// Expand a 64-bit shift by an immediate (S2_asl_i_p, S2_asr_i_p or
+// S2_lsr_i_p) into 32-bit instructions inserted before MI. The results are
+// written to the low/high registers that PairMap holds for operand 0; the
+// source halves are read through the isub_lo/isub_hi subregisters of
+// operand 1. The expansion is chosen by the shift amount: 0, 1..31, 32, or
+// 33..63. MI itself is not erased here.
+void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ using namespace Hexagon;
+
+ MachineOperand &Op0 = MI->getOperand(0);
+ MachineOperand &Op1 = MI->getOperand(1);
+ MachineOperand &Op2 = MI->getOperand(2);
+ assert(Op0.isReg() && Op1.isReg() && Op2.isImm());
+ int64_t Sh64 = Op2.getImm();
+ assert(Sh64 >= 0 && Sh64 < 64);
+ unsigned S = Sh64;
+
+ UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+ assert(F != PairMap.end());
+ const UUPair &P = F->second;
+ unsigned LoR = P.first;
+ unsigned HiR = P.second;
+
+ unsigned Opc = MI->getOpcode();
+ bool Right = (Opc == S2_lsr_i_p || Opc == S2_asr_i_p);
+ bool Left = !Right;
+ bool Signed = (Opc == S2_asr_i_p);
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ // Most reads of Op1 below are emitted with RegState::Kill cleared; only
+ // the final read in some of the expansions keeps the original flags.
+ unsigned RS = getRegState(Op1);
+ unsigned ShiftOpc = Left ? S2_asl_i_r
+ : (Signed ? S2_asr_i_r : S2_lsr_i_r);
+ unsigned LoSR = isub_lo;
+ unsigned HiSR = isub_hi;
+
+ if (S == 0) {
+ // No shift, subregister copy.
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), HiR)
+ .addReg(Op1.getReg(), RS, HiSR);
+ } else if (S < 32) {
+ const TargetRegisterClass *IntRC = &IntRegsRegClass;
+ unsigned TmpR = MRI->createVirtualRegister(IntRC);
+ // Expansion:
+ // Shift left: DR = shl R, #s
+ // LoR = shl R.lo, #s
+ // TmpR = extractu R.lo, #s, #32-s
+ // HiR = or (TmpR, asl(R.hi, #s))
+ // Shift right: DR = shr R, #s
+ // HiR = shr R.hi, #s
+ // TmpR = shr R.lo, #s
+ // LoR = insert TmpR, R.hi, #s, #32-s
+
+ // Shift left:
+ // LoR = shl R.lo, #s
+ // Shift right:
+ // TmpR = shr R.lo, #s
+
+ // Make a special case for A2_aslh and A2_asrh (they are predicable as
+ // opposed to S2_asl_i_r/S2_asr_i_r).
+ if (S == 16 && Left)
+ BuildMI(B, MI, DL, TII->get(A2_aslh), LoR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+ else if (S == 16 && Signed)
+ BuildMI(B, MI, DL, TII->get(A2_asrh), TmpR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+ else
+ BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? LoR : TmpR))
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR)
+ .addImm(S);
+
+ if (Left) {
+ // TmpR = extractu R.lo, #s, #32-s
+ BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR)
+ .addImm(S)
+ .addImm(32-S);
+ // HiR = or (TmpR, asl(R.hi, #s))
+ BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR)
+ .addReg(TmpR)
+ .addReg(Op1.getReg(), RS, HiSR)
+ .addImm(S);
+ } else {
+ // HiR = shr R.hi, #s
+ BuildMI(B, MI, DL, TII->get(ShiftOpc), HiR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR)
+ .addImm(S);
+ // LoR = insert TmpR, R.hi, #s, #32-s
+ BuildMI(B, MI, DL, TII->get(S2_insert), LoR)
+ .addReg(TmpR)
+ .addReg(Op1.getReg(), RS, HiSR)
+ .addImm(S)
+ .addImm(32-S);
+ }
+ } else if (S == 32) {
+ // Shift by exactly one word: one source half is copied into the other
+ // result half (lo -> HiR for left shifts, hi -> LoR for right shifts).
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), (Left ? HiR : LoR))
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR));
+ // The remaining result half is set to 0 for S2_asl_i_p/S2_lsr_i_p, or to
+ // R.hi shifted right arithmetically by 31 for S2_asr_i_p.
+ if (!Signed)
+ BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? LoR : HiR))
+ .addImm(0);
+ else // Must be right shift.
+ BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR)
+ .addReg(Op1.getReg(), RS, HiSR)
+ .addImm(31);
+ } else if (S < 64) {
+ // Shift by more than one word: shift a single source half by S-32 into
+ // the opposite result half, then fill the other result half as in the
+ // S == 32 case.
+ S -= 32;
+ if (S == 16 && Left)
+ BuildMI(B, MI, DL, TII->get(A2_aslh), HiR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+ else if (S == 16 && Signed)
+ BuildMI(B, MI, DL, TII->get(A2_asrh), LoR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR);
+ else
+ BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? HiR : LoR))
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR))
+ .addImm(S);
+
+ if (Signed)
+ BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR)
+ .addReg(Op1.getReg(), RS, HiSR)
+ .addImm(31);
+ else
+ BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? LoR : HiR))
+ .addImm(0);
+ }
+}
+
+// Expand S2_asl_i_p_or (a 64-bit "or with left-shifted value") into 32-bit
+// instructions inserted before MI. The results are written to the low/high
+// registers that PairMap holds for operand 0; the halves of operands 1 and 2
+// are read through their isub_lo/isub_hi subregisters. The expansion is
+// chosen by the shift amount: 0, 1..31, 32, or 33..63. MI itself is not
+// erased here.
+void HexagonSplitDoubleRegs::splitAslOr(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ using namespace Hexagon;
+
+ MachineOperand &Op0 = MI->getOperand(0);
+ MachineOperand &Op1 = MI->getOperand(1);
+ MachineOperand &Op2 = MI->getOperand(2);
+ MachineOperand &Op3 = MI->getOperand(3);
+ assert(Op0.isReg() && Op1.isReg() && Op2.isReg() && Op3.isImm());
+ int64_t Sh64 = Op3.getImm();
+ assert(Sh64 >= 0 && Sh64 < 64);
+ unsigned S = Sh64;
+
+ UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+ assert(F != PairMap.end());
+ const UUPair &P = F->second;
+ unsigned LoR = P.first;
+ unsigned HiR = P.second;
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ // In each expansion below, reads of Op1/Op2 that are followed by another
+ // read of the same operand are emitted with RegState::Kill cleared; the
+ // last read keeps the original flags.
+ unsigned RS1 = getRegState(Op1);
+ unsigned RS2 = getRegState(Op2);
+ const TargetRegisterClass *IntRC = &IntRegsRegClass;
+
+ unsigned LoSR = isub_lo;
+ unsigned HiSR = isub_hi;
+
+ // Op0 = S2_asl_i_p_or Op1, Op2, Op3
+ // means: Op0 = or (Op1, asl(Op2, Op3))
+
+ // Expansion of
+ // DR = or (R1, asl(R2, #s))
+ //
+ // LoR = or (R1.lo, asl(R2.lo, #s))
+ // Tmp1 = extractu R2.lo, #s, #32-s
+ // Tmp2 = or R1.hi, Tmp1
+ // HiR = or (Tmp2, asl(R2.hi, #s))
+
+ if (S == 0) {
+ // DR = or (R1, asl(R2, #0))
+ // -> or (R1, R2)
+ // i.e. LoR = or R1.lo, R2.lo
+ // HiR = or R1.hi, R2.hi
+ BuildMI(B, MI, DL, TII->get(A2_or), LoR)
+ .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR)
+ .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR);
+ BuildMI(B, MI, DL, TII->get(A2_or), HiR)
+ .addReg(Op1.getReg(), RS1, HiSR)
+ .addReg(Op2.getReg(), RS2, HiSR);
+ } else if (S < 32) {
+ // General case from the expansion comment above, using two temporaries.
+ BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), LoR)
+ .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR)
+ .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR)
+ .addImm(S);
+ unsigned TmpR1 = MRI->createVirtualRegister(IntRC);
+ BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR1)
+ .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR)
+ .addImm(S)
+ .addImm(32-S);
+ unsigned TmpR2 = MRI->createVirtualRegister(IntRC);
+ BuildMI(B, MI, DL, TII->get(A2_or), TmpR2)
+ .addReg(Op1.getReg(), RS1, HiSR)
+ .addReg(TmpR1);
+ BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR)
+ .addReg(TmpR2)
+ .addReg(Op2.getReg(), RS2, HiSR)
+ .addImm(S);
+ } else if (S == 32) {
+ // DR = or (R1, asl(R2, #32))
+ // -> or R1, R2.lo
+ // LoR = R1.lo
+ // HiR = or R1.hi, R2.lo
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR)
+ .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR);
+ BuildMI(B, MI, DL, TII->get(A2_or), HiR)
+ .addReg(Op1.getReg(), RS1, HiSR)
+ .addReg(Op2.getReg(), RS2, LoSR);
+ } else if (S < 64) {
+ // DR = or (R1, asl(R2, #s))
+ //
+ // LoR = R1:lo
+ // HiR = or (R1:hi, asl(R2:lo, #s-32))
+ S -= 32;
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR)
+ .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR);
+ BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR)
+ .addReg(Op1.getReg(), RS1, HiSR)
+ .addReg(Op2.getReg(), RS2, LoSR)
+ .addImm(S);
+ }
+}
+
+// Emit the 32-bit replacement for MI according to its opcode and report
+// whether a replacement was generated. The only handled case that may
+// report false is a PHI or COPY whose destination is not a double register.
+// Any opcode without an expansion is a programming error.
+bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  using namespace Hexagon;
+
+  DEBUG(dbgs() << "Splitting: " << *MI);
+
+  // Emit one instruction with opcode HalfOpc for each half, low first.
+  auto SplitHalves = [&] (unsigned HalfOpc) {
+    createHalfInstr(HalfOpc, MI, PairMap, isub_lo);
+    createHalfInstr(HalfOpc, MI, PairMap, isub_hi);
+  };
+
+  const unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case TargetOpcode::PHI:
+    case TargetOpcode::COPY:
+      if (MRI->getRegClass(MI->getOperand(0).getReg()) != DoubleRC)
+        return false;
+      SplitHalves(Opc);
+      return true;
+
+    // Bitwise operations: the same 32-bit operation on each half.
+    case A2_andp:
+      SplitHalves(A2_and);
+      return true;
+    case A2_orp:
+      SplitHalves(A2_or);
+      return true;
+    case A2_xorp:
+      SplitHalves(A2_xor);
+      return true;
+
+    case L2_loadrd_io:
+    case L2_loadrd_pi:
+    case S2_storerd_io:
+    case S2_storerd_pi:
+      splitMemRef(MI, PairMap);
+      return true;
+
+    case A2_tfrpi:
+    case CONST64:
+      splitImmediate(MI, PairMap);
+      return true;
+
+    case A2_combineii:
+    case A4_combineir:
+    case A4_combineii:
+    case A4_combineri:
+    case A2_combinew:
+      splitCombine(MI, PairMap);
+      return true;
+
+    case A2_sxtw:
+      splitExt(MI, PairMap);
+      return true;
+
+    case S2_asl_i_p:
+    case S2_asr_i_p:
+    case S2_lsr_i_p:
+      splitShift(MI, PairMap);
+      return true;
+
+    case S2_asl_i_p_or:
+      splitAslOr(MI, PairMap);
+      return true;
+
+    default:
+      llvm_unreachable("Instruction not splitable");
+      return false;
+  }
+}
+
+// Rewrite every use operand of MI that names a subregister of a register
+// found in PairMap: isub_lo becomes the first register of the pair, isub_hi
+// the second, and the subregister index of the operand is cleared.
+void HexagonSplitDoubleRegs::replaceSubregUses(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  for (auto &Op : MI->operands()) {
+    const bool IsSubregUse = Op.isReg() && Op.isUse() && Op.getSubReg();
+    if (!IsSubregUse)
+      continue;
+
+    UUPairMap::const_iterator F = PairMap.find(Op.getReg());
+    if (F == PairMap.end())
+      continue;
+
+    const UUPair &Halves = F->second;
+    const unsigned SubIdx = Op.getSubReg();
+    if (SubIdx == Hexagon::isub_lo)
+      Op.setReg(Halves.first);
+    else if (SubIdx == Hexagon::isub_hi)
+      Op.setReg(Halves.second);
+    Op.setSubReg(0);
+  }
+}
+
+// For every use in MI of a whole (no subregister) virtual double register
+// that has an entry in PairMap, build a new double register from the two
+// halves with a REG_SEQUENCE placed before MI, and make the operand use
+// that new register instead.
+void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc Loc = MI->getDebugLoc();
+
+  for (auto &Op : MI->operands()) {
+    const bool IsRegUse = Op.isReg() && Op.isUse();
+    if (!IsRegUse)
+      continue;
+    const unsigned Reg = Op.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    const bool IsWholeDouble = MRI->getRegClass(Reg) == DoubleRC &&
+                               !Op.getSubReg();
+    if (!IsWholeDouble)
+      continue;
+
+    UUPairMap::const_iterator F = PairMap.find(Reg);
+    if (F == PairMap.end())
+      continue;
+
+    const UUPair &Halves = F->second;
+    const unsigned JoinedR = MRI->createVirtualRegister(DoubleRC);
+    BuildMI(MBB, MI, Loc, TII->get(TargetOpcode::REG_SEQUENCE), JoinedR)
+      .addReg(Halves.first)
+      .addImm(Hexagon::isub_lo)
+      .addReg(Halves.second)
+      .addImm(Hexagon::isub_hi);
+    Op.setReg(JoinedR);
+  }
+}
+
+// Split all double registers in Part. A pair of fresh 32-bit registers is
+// created for each of them, every defining and using instruction is either
+// split (splitInstr) or, if fixed, fed through REG_SEQUENCE
+// (collapseRegPairs), the remaining subregister uses are redirected to the
+// new registers, and the replaced instructions are erased.
+// Returns true if at least one instruction was split.
+bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
+  typedef std::set<MachineInstr*> MISet;
+  const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
+
+  DEBUG(dbgs() << "Splitting partition: "; dump_partition(dbgs(), Part, *TRI);
+        dbgs() << '\n');
+
+  UUPairMap PairMap;
+  MISet Worklist;
+
+  for (unsigned DR : Part) {
+    Worklist.insert(MRI->getVRegDef(DR));
+
+    // Fixed users are collected as well: they are not split, but they have
+    // to be revisited to receive their REG_SEQUENCE instructions.
+    for (auto U = MRI->use_nodbg_begin(DR), UEnd = MRI->use_nodbg_end();
+         U != UEnd; ++U)
+      Worklist.insert(U->getParent());
+
+    unsigned LoR = MRI->createVirtualRegister(IntRC);
+    unsigned HiR = MRI->createVirtualRegister(IntRC);
+    DEBUG(dbgs() << "Created mapping: " << PrintReg(DR, TRI) << " -> "
+                 << PrintReg(HiR, TRI) << ':' << PrintReg(LoR, TRI) << '\n');
+    PairMap.insert(std::make_pair(DR, UUPair(LoR, HiR)));
+  }
+
+  bool Changed = false;
+  MISet Dead;
+  for (MachineInstr *MI : Worklist) {
+    if (isFixedInstr(MI)) {
+      collapseRegPairs(MI, PairMap);
+      continue;
+    }
+    if (splitInstr(MI, PairMap)) {
+      Dead.insert(MI);
+      Changed = true;
+    }
+  }
+
+  for (unsigned DR : Part) {
+    // Before the "double" instructions go away, visit everything that still
+    // uses a double register of this partition and replace its subregister
+    // uses with the corresponding single registers. The users are gathered
+    // first and rewritten afterwards.
+    MISet Users;
+    for (auto U = MRI->use_nodbg_begin(DR), UEnd = MRI->use_nodbg_end();
+         U != UEnd; ++U)
+      Users.insert(U->getParent());
+    for (MachineInstr *UserI : Users)
+      replaceSubregUses(UserI, PairMap);
+  }
+
+  for (MachineInstr *MI : Dead)
+    MI->getParent()->erase(MI);
+
+  return Changed;
+}
+
+// Pass entry point: partition the double registers of MF, then split every
+// partition other than #0 that isProfitable() accepts, until the MaxHSDR
+// limit (if non-negative) on the number of split partitions is reached.
+// Returns true if the function was modified.
+bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "Splitting double registers in function: "
+        << MF.getName() << '\n');
+
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  auto &ST = MF.getSubtarget<HexagonSubtarget>();
+  TRI = ST.getRegisterInfo();
+  TII = ST.getInstrInfo();
+  MRI = &MF.getRegInfo();
+  MLI = &getAnalysis<MachineLoopInfo>();
+
+  LoopRegMap IRM;
+  collectIndRegs(IRM);
+
+  UUSetMap P2Rs;
+  partitionRegisters(P2Rs);
+
+  DEBUG({
+    dbgs() << "Register partitioning: (partition #0 is fixed)\n";
+    for (auto &Entry : P2Rs) {
+      dbgs() << '#' << Entry.first << " -> ";
+      dump_partition(dbgs(), Entry.second, *TRI);
+      dbgs() << '\n';
+    }
+  });
+
+  bool Changed = false;
+  int Limit = MaxHSDR;
+
+  for (auto &Entry : P2Rs) {
+    // Partition #0 is the fixed one and is never split.
+    if (Entry.first == 0)
+      continue;
+    if (Limit >= 0 && Counter >= Limit)
+      break;
+    USet &Part = Entry.second;
+    DEBUG(dbgs() << "Calculating profit for partition #" << Entry.first << '\n');
+    if (!isProfitable(Part, IRM))
+      continue;
+    Counter++;
+    Changed |= splitPartition(Part);
+  }
+
+  return Changed;
+}
+
+// Factory: allocate a new instance of the double-register splitting pass.
+FunctionPass *llvm::createHexagonSplitDoubleRegs() {
+  FunctionPass *Pass = new HexagonSplitDoubleRegs();
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
new file mode 100644
index 000000000000..af1bf48b6320
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -0,0 +1,609 @@
+//===--- HexagonStoreWidening.cpp------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Replace sequences of "narrow" stores to adjacent memory locations with
+// fewer "wide" stores that have the same effect.
+// For example, replace:
+// S4_storeirb_io %vreg100, 0, 0 ; store-immediate-byte
+// S4_storeirb_io %vreg100, 1, 0 ; store-immediate-byte
+// with
+// S4_storeirh_io %vreg100, 0, 0 ; store-immediate-halfword
+// The above is the general idea. The actual cases handled by the code
+// may be a bit more complex.
+// The purpose of this pass is to reduce the number of outstanding stores,
+// or as one could say, "reduce store queue pressure". Also, wide stores
+// mean fewer stores, and since there are only two memory instructions allowed
+// per packet, it also means fewer packets, and ultimately fewer cycles.
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-widen-stores"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+
+  // Registration hook and factory function of the store widening pass.
+  void initializeHexagonStoreWideningPass(PassRegistry&);
+  FunctionPass *createHexagonStoreWidening();
+
+} // end namespace llvm
+
+namespace {
+
+  // Machine function pass that merges groups of narrow stores into wider
+  // ones. It requires and preserves alias analysis results.
+  struct HexagonStoreWidening : public MachineFunctionPass {
+    const HexagonInstrInfo      *TII;
+    const HexagonRegisterInfo   *TRI;
+    const MachineRegisterInfo   *MRI;
+    AliasAnalysis               *AA;
+    MachineFunction             *MF;
+
+  public:
+    static char ID;
+
+    HexagonStoreWidening() : MachineFunctionPass(ID) {
+      initializeHexagonStoreWideningPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    StringRef getPassName() const override {
+      return "Hexagon Store Widening";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addPreserved<AAResultsWrapperPass>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    // True if MI is a store that this pass knows how to widen.
+    static bool handledStoreType(const MachineInstr *MI);
+
+  private:
+    static const int MaxWideSize = 4;
+
+    using InstrGroup = std::vector<MachineInstr*>;
+    using InstrGroupList = std::vector<InstrGroup>;
+
+    // Alias queries against a group of stores.
+    bool instrAliased(InstrGroup &Stores, const MachineMemOperand &MMO);
+    bool instrAliased(InstrGroup &Stores, const MachineInstr *MI);
+
+    // Construction of store groups from a basic block.
+    void createStoreGroup(MachineInstr *BaseStore, InstrGroup::iterator Begin,
+        InstrGroup::iterator End, InstrGroup &Group);
+    void createStoreGroups(MachineBasicBlock &MBB,
+        InstrGroupList &StoreGroups);
+
+    // Processing of the groups.
+    bool processBasicBlock(MachineBasicBlock &MBB);
+    bool processStoreGroup(InstrGroup &Group);
+    bool selectStores(InstrGroup::iterator Begin, InstrGroup::iterator End,
+        InstrGroup &OG, unsigned &TotalSize, unsigned MaxSize);
+    bool createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize);
+    bool replaceStores(InstrGroup &OG, InstrGroup &NG);
+    bool storesAreAdjacent(const MachineInstr *S1, const MachineInstr *S2);
+  };
+
+char HexagonStoreWidening::ID = 0;
+
+} // end anonymous namespace
+
+// Some local helper functions...
+// Return the register held by operand 0 of MI, which the stores handled
+// here use as their base address.
+static unsigned getBaseAddressRegister(const MachineInstr *MI) {
+  const MachineOperand &BaseOp = MI->getOperand(0);
+  assert(BaseOp.isReg() && "Expecting register operand");
+  return BaseOp.getReg();
+}
+
+// Return the immediate offset (operand 1) of a store-immediate instruction.
+// Reaching the end of the function means a handled opcode has no offset
+// computation here; the instruction is printed before aborting.
+static int64_t getStoreOffset(const MachineInstr *MI) {
+  assert(HexagonStoreWidening::handledStoreType(MI) && "Unhandled opcode");
+
+  const unsigned OpC = MI->getOpcode();
+  const bool IsStoreImm = OpC == Hexagon::S4_storeirb_io ||
+                          OpC == Hexagon::S4_storeirh_io ||
+                          OpC == Hexagon::S4_storeiri_io;
+  if (IsStoreImm) {
+    const MachineOperand &OffOp = MI->getOperand(1);
+    assert(OffOp.isImm() && "Expecting immediate offset");
+    return OffOp.getImm();
+  }
+
+  dbgs() << *MI;
+  llvm_unreachable("Store offset calculation missing for a handled opcode");
+  return 0;
+}
+
+// Return the first memory operand attached to MI; MI must have at least one.
+static const MachineMemOperand &getStoreTarget(const MachineInstr *MI) {
+  assert(!MI->memoperands_empty() && "Expecting memory operands");
+  const MachineMemOperand *First = *MI->memoperands_begin();
+  return *First;
+}
+
+// Register the pass under the command-line name "hexagon-widen-stores" and
+// declare its dependency on alias analysis. The description string is the
+// same in the BEGIN and END macros.
+INITIALIZE_PASS_BEGIN(HexagonStoreWidening, "hexagon-widen-stores",
+                "Hexagon Store Widening", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(HexagonStoreWidening, "hexagon-widen-stores",
+                "Hexagon Store Widening", false, false)
+
+// Filtering function: any stores whose opcodes are not "approved" of by
+// this function will not be subjected to widening.
+inline bool HexagonStoreWidening::handledStoreType(const MachineInstr *MI) {
+  // Only stores of immediate values are handled for now, and only when the
+  // base address (operand 0) is a register. A non-register base, such as a
+  // frame index, is rejected. (Implement FI later.)
+  const unsigned Opc = MI->getOpcode();
+  const bool IsStoreImm = Opc == Hexagon::S4_storeirb_io ||
+                          Opc == Hexagon::S4_storeirh_io ||
+                          Opc == Hexagon::S4_storeiri_io;
+  return IsStoreImm && MI->getOperand(0).isReg();
+}
+
+// Check if the machine memory operand MMO is aliased with any of the
+// stores in the store group Stores.
+bool HexagonStoreWidening::instrAliased(InstrGroup &Stores,
+      const MachineMemOperand &MMO) {
+  // Without an IR value there is nothing to query: assume aliasing.
+  if (!MMO.getValue())
+    return true;
+
+  const MemoryLocation Loc(MMO.getValue(), MMO.getSize(), MMO.getAAInfo());
+
+  return std::any_of(Stores.begin(), Stores.end(),
+                     [&] (MachineInstr *SI) -> bool {
+    const MachineMemOperand &SMO = getStoreTarget(SI);
+    if (!SMO.getValue())
+      return true;
+
+    const MemoryLocation StoreLoc(SMO.getValue(), SMO.getSize(),
+                                  SMO.getAAInfo());
+    if (AA->alias(Loc, StoreLoc))
+      return true;
+    return false;
+  });
+}
+
+// Check if the machine instruction MI accesses any storage aliased with
+// any store in the group Stores.
+bool HexagonStoreWidening::instrAliased(InstrGroup &Stores,
+      const MachineInstr *MI) {
+  // MI is aliased if any one of its memory operands is.
+  return std::any_of(MI->memoperands_begin(), MI->memoperands_end(),
+                     [&] (const MachineMemOperand *MMO) {
+                       return instrAliased(Stores, *MMO);
+                     });
+}
+
+// Scan the basic block MBB and append to StoreGroups every store group
+// found in it.
+//
+// A store group is a set of stores off the same base register that may be
+// reordered among themselves without changing program semantics. A group
+// could be widened as a whole if one store instruction had the semantics
+// of the entire group; often a group needs more than one wide store.
+// Groups with fewer than two members are not recorded.
+void HexagonStoreWidening::createStoreGroups(MachineBasicBlock &MBB,
+      InstrGroupList &StoreGroups) {
+  // Work on a snapshot of instruction pointers, so that entries can be
+  // nulled out without touching the basic block itself.
+  InstrGroup Worklist;
+  for (MachineInstr &Instr : MBB)
+    Worklist.push_back(&Instr);
+
+  auto End = Worklist.end();
+  for (auto It = Worklist.begin(); It != End; ++It) {
+    MachineInstr *Candidate = *It;
+    // A null entry was already absorbed into an earlier group.
+    if (Candidate == nullptr)
+      continue;
+    if (!handledStoreType(Candidate))
+      continue;
+
+    // Seed a group with this store and grow it from the instructions
+    // that follow.
+    InstrGroup NewGroup;
+    createStoreGroup(Candidate, It+1, End, NewGroup);
+    if (NewGroup.size() >= 2)
+      StoreGroups.push_back(NewGroup);
+  }
+}
+
+// Create a single store group seeded with BaseStore and grown from [Begin, End).
+// The stores need to be independent between themselves, and also there cannot
+// be other instructions between them that could read or modify storage being stored into.
+void HexagonStoreWidening::createStoreGroup(MachineInstr *BaseStore,
+ InstrGroup::iterator Begin, InstrGroup::iterator End, InstrGroup &Group) {
+ assert(handledStoreType(BaseStore) && "Unexpected instruction");
+ unsigned BaseReg = getBaseAddressRegister(BaseStore);
+ InstrGroup Other; // Memory instructions seen so far that are not in Group.
+
+ Group.push_back(BaseStore);
+
+ for (auto I = Begin; I != End; ++I) {
+ MachineInstr *MI = *I;
+ if (!MI) // Entry was nulled out when it joined an earlier group.
+ continue;
+
+ if (handledStoreType(MI)) {
+ // If this store instruction is aliased with anything already in the
+ // group, terminate the group now.
+ if (instrAliased(Group, getStoreTarget(MI)))
+ return;
+ // If this store is aliased to any of the memory instructions we have
+ // seen so far (that are not a part of this group), terminate the group.
+ if (instrAliased(Other, getStoreTarget(MI)))
+ return;
+
+ unsigned BR = getBaseAddressRegister(MI);
+ if (BR == BaseReg) {
+ Group.push_back(MI);
+ *I = nullptr; // Mark the entry as consumed so it does not seed or join another group.
+ continue;
+ }
+ } // A handled store with a different base register falls through to the checks below.
+
+ // Assume calls are aliased to everything.
+ if (MI->isCall() || MI->hasUnmodeledSideEffects())
+ return;
+
+ if (MI->mayLoad() || MI->mayStore()) {
+ if (MI->hasOrderedMemoryRef() || instrAliased(Group, MI))
+ return;
+ Other.push_back(MI); // Remember it: later group candidates must not alias it.
+ }
+ } // for
+}
+
+// Return true when S2 writes the memory that immediately follows the
+// memory written by S1, i.e. when S1's offset plus S1's access size equals
+// S2's offset. Both instructions must pass handledStoreType(); otherwise
+// the answer is false.
+bool HexagonStoreWidening::storesAreAdjacent(const MachineInstr *S1,
+      const MachineInstr *S2) {
+  if (!(handledStoreType(S1) && handledStoreType(S2)))
+    return false;
+
+  const MachineMemOperand &FirstMMO = getStoreTarget(S1);
+
+  // Only immediate stores are handled for now; their operand 1 is read
+  // as the offset.
+  int FirstOff = S1->getOperand(1).getImm();
+  int SecondOff = S2->getOperand(1).getImm();
+
+  // The comparison is done unsigned for a non-negative first offset and
+  // signed otherwise.
+  if (FirstOff >= 0)
+    return FirstOff+FirstMMO.getSize() == unsigned(SecondOff);
+  return int(FirstOff+FirstMMO.getSize()) == SecondOff;
+}
+
+/// Given a sequence of adjacent stores, and a maximum size of a single wide
+/// store, pick a group of stores that can be replaced by a single store
+/// of size not exceeding MaxSize. The selected sequence will be recorded
+/// in OG ("old group" of instructions).
+/// OG should be empty on entry, and should be left empty if the function
+/// fails. On success, TotalSize is set to the combined size (a power of 2).
+bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
+ InstrGroup::iterator End, InstrGroup &OG, unsigned &TotalSize,
+ unsigned MaxSize) {
+ assert(Begin != End && "No instructions to analyze");
+ assert(OG.empty() && "Old group not empty on entry");
+
+ if (std::distance(Begin, End) <= 1) // A single store cannot be widened.
+ return false;
+
+ MachineInstr *FirstMI = *Begin;
+ assert(!FirstMI->memoperands_empty() && "Expecting some memory operands");
+ const MachineMemOperand &FirstMMO = getStoreTarget(FirstMI);
+ unsigned Alignment = FirstMMO.getAlignment();
+ unsigned SizeAccum = FirstMMO.getSize(); // Running total of bytes covered so far.
+ unsigned FirstOffset = getStoreOffset(FirstMI);
+
+ // The initial value of SizeAccum should always be a power of 2.
+ assert(isPowerOf2_32(SizeAccum) && "First store size not a power of 2");
+
+ // If the size of the first store equals to or exceeds the limit, do nothing.
+ if (SizeAccum >= MaxSize)
+ return false;
+
+ // If the size of the first store is greater than or equal to the alignment
+ // of the address stored to, then the store cannot be made any wider.
+ if (SizeAccum >= Alignment)
+ return false;
+
+ // The offset of a store will put restrictions on how wide the store can be.
+ // Offsets in stores of size 2^n bytes need to have the n lowest bits be 0.
+ // If the first store already exhausts the offset limits, quit. Test this
+ // by checking if the next wider size would exceed the limit.
+ if ((2*SizeAccum-1) & FirstOffset)
+ return false;
+
+ OG.push_back(FirstMI);
+ MachineInstr *S1 = FirstMI, *S2 = *(Begin+1);
+ InstrGroup::iterator I = Begin+1;
+
+ // Pow2Num will be the largest number of elements in OG such that the sum
+ // of sizes of stores 0...Pow2Num-1 will be a power of 2.
+ unsigned Pow2Num = 1;
+ unsigned Pow2Size = SizeAccum;
+
+ // Be greedy: keep accumulating stores as long as they are to adjacent
+ // memory locations, and as long as the total number of bytes stored
+ // does not exceed the limit (MaxSize).
+ // Keep track of when the total size covered is a power of 2, since
+ // this is a size a single store can cover.
+ while (I != End) {
+ S2 = *I;
+ // Stores are sorted, so if S1 and S2 are not adjacent, there won't be
+ // any other store to fill the "hole".
+ if (!storesAreAdjacent(S1, S2))
+ break;
+
+ unsigned S2Size = getStoreTarget(S2).getSize();
+ if (SizeAccum + S2Size > std::min(MaxSize, Alignment)) // Would exceed the size or alignment limit.
+ break;
+
+ OG.push_back(S2);
+ SizeAccum += S2Size;
+ if (isPowerOf2_32(SizeAccum)) {
+ Pow2Num = OG.size();
+ Pow2Size = SizeAccum;
+ }
+ if ((2*Pow2Size-1) & FirstOffset) // The first offset does not permit the next wider size.
+ break;
+
+ S1 = S2;
+ ++I;
+ }
+
+ // The stores don't add up to anything that can be widened. Clean up.
+ if (Pow2Num <= 1) {
+ OG.clear();
+ return false;
+ }
+
+ // Only leave the stores being widened.
+ OG.resize(Pow2Num);
+ TotalSize = Pow2Size;
+ return true;
+}
+
+/// Given an "old group" OG of stores, create a "new group" NG of instructions
+/// to replace them. Ideally, NG would only have a single instruction in it,
+/// but that may only be possible for store-immediate. Returns false if TotalSize > 4.
+bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
+ unsigned TotalSize) {
+ // XXX Current limitations:
+ // - only expect stores of immediate values in OG,
+ // - only handle a TotalSize of up to 4.
+
+ if (TotalSize > 4)
+ return false;
+
+ unsigned Acc = 0; // Value accumulator.
+ unsigned Shift = 0; // Bit position at which the next store's value is placed.
+
+ for (InstrGroup::iterator I = OG.begin(), E = OG.end(); I != E; ++I) {
+ MachineInstr *MI = *I;
+ const MachineMemOperand &MMO = getStoreTarget(MI);
+ MachineOperand &SO = MI->getOperand(2); // Source.
+ assert(SO.isImm() && "Expecting an immediate operand");
+
+ unsigned NBits = MMO.getSize()*8;
+ unsigned Mask = (0xFFFFFFFFU >> (32-NBits)); // Low NBits bits set.
+ unsigned Val = (SO.getImm() & Mask) << Shift;
+ Acc |= Val; // Earlier entries of OG end up in the lower bits.
+ Shift += NBits;
+ }
+
+ MachineInstr *FirstSt = OG.front();
+ DebugLoc DL = OG.back()->getDebugLoc(); // Debug location comes from the last store in OG.
+ const MachineMemOperand &OldM = getStoreTarget(FirstSt);
+ MachineMemOperand *NewM =
+ MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(),
+ TotalSize, OldM.getAlignment(),
+ OldM.getAAInfo());
+
+ if (Acc < 0x10000) { // Combined value fits in 16 bits: a single store-immediate suffices.
+ // Create mem[hw] = #Acc
+ unsigned WOpc = (TotalSize == 2) ? Hexagon::S4_storeirh_io :
+ (TotalSize == 4) ? Hexagon::S4_storeiri_io : 0;
+ assert(WOpc && "Unexpected size");
+
+ int Val = (TotalSize == 2) ? int16_t(Acc) : int(Acc);
+ const MCInstrDesc &StD = TII->get(WOpc);
+ MachineOperand &MR = FirstSt->getOperand(0); // Base register of the first store.
+ int64_t Off = FirstSt->getOperand(1).getImm();
+ MachineInstr *StI = BuildMI(*MF, DL, StD)
+ .addReg(MR.getReg(), getKillRegState(MR.isKill()))
+ .addImm(Off)
+ .addImm(Val);
+ StI->addMemOperand(*MF, NewM);
+ NG.push_back(StI);
+ } else {
+ // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg
+ const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
+ const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF);
+ unsigned VReg = MF->getRegInfo().createVirtualRegister(RC);
+ MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg)
+ .addImm(int(Acc));
+ NG.push_back(TfrI); // The transfer must precede the store that uses VReg.
+
+ unsigned WOpc = (TotalSize == 2) ? Hexagon::S2_storerh_io :
+ (TotalSize == 4) ? Hexagon::S2_storeri_io : 0;
+ assert(WOpc && "Unexpected size");
+
+ const MCInstrDesc &StD = TII->get(WOpc);
+ MachineOperand &MR = FirstSt->getOperand(0);
+ int64_t Off = FirstSt->getOperand(1).getImm();
+ MachineInstr *StI = BuildMI(*MF, DL, StD)
+ .addReg(MR.getReg(), getKillRegState(MR.isKill()))
+ .addImm(Off)
+ .addReg(VReg, RegState::Kill); // VReg has no other use, so this store kills it.
+ StI->addMemOperand(*MF, NewM);
+ NG.push_back(StI);
+ }
+
+ return true;
+}
+
+// Replace instructions from the old group OG with instructions from the
+// new group NG. Conceptually, remove all instructions in OG, and then
+// insert all instructions in NG, starting at where the first instruction
+// from OG was (in the order in which they appeared in the basic block).
+// (The ordering in OG does not have to match the order in the basic block.)
+bool HexagonStoreWidening::replaceStores(InstrGroup &OG, InstrGroup &NG) {
+ DEBUG({
+ dbgs() << "Replacing:\n";
+ for (auto I : OG)
+ dbgs() << " " << *I;
+ dbgs() << "with\n";
+ for (auto I : NG)
+ dbgs() << " " << *I;
+ });
+
+ MachineBasicBlock *MBB = OG.back()->getParent();
+ MachineBasicBlock::iterator InsertAt = MBB->end(); // end() doubles as "not found yet".
+
+ // Need to establish the insertion point. The best one is right before
+ // the first store in the OG, but in the order in which the stores occur
+ // in the program list. Since the ordering in OG does not correspond
+ // to the order in the program list, we need to do some work to find
+ // the insertion point.
+
+ // Create a set of all instructions in OG (for quick lookup).
+ SmallPtrSet<MachineInstr*, 4> InstrSet;
+ for (auto I : OG)
+ InstrSet.insert(I);
+
+ // Traverse the block, until we hit an instruction from OG.
+ for (auto &I : *MBB) {
+ if (InstrSet.count(&I)) {
+ InsertAt = I;
+ break;
+ }
+ }
+
+ assert((InsertAt != MBB->end()) && "Cannot locate any store from the group");
+
+ bool AtBBStart = false;
+
+ // InsertAt points at the first instruction that will be removed. We need
+ // to move it out of the way, so it remains valid after removing all the
+ // old stores, and so we are able to recover it back to the proper insertion
+ // position.
+ if (InsertAt != MBB->begin())
+ --InsertAt; // Park on the predecessor, which is not in OG and so survives the erase.
+ else
+ AtBBStart = true; // No predecessor to park on; re-derive from begin() afterwards.
+
+ for (auto I : OG)
+ I->eraseFromParent();
+
+ if (!AtBBStart)
+ ++InsertAt;
+ else
+ InsertAt = MBB->begin();
+
+ for (auto I : NG)
+ MBB->insert(InsertAt, I); // Inserting before a fixed point keeps NG's order.
+
+ return true; // There is no failure path; the result is always true.
+}
+
+// Walk the store group and carve it into sub-groups that can each be
+// replaced by one wide store. Every sub-group that is successfully
+// selected and widened replaces its original stores in the block.
+// Returns true if at least one replacement took place.
+bool HexagonStoreWidening::processStoreGroup(InstrGroup &Group) {
+  InstrGroup OG, Replacement; // Old stores and their replacement.
+  unsigned WideSize;
+  bool Modified = false;
+
+  InstrGroup::iterator E = Group.end();
+  for (InstrGroup::iterator I = Group.begin(); I != E; ) {
+    OG.clear();
+    Replacement.clear();
+
+    // Step past the candidate up front: on failure the next iteration
+    // simply retries from the following store.
+    InstrGroup::iterator First = I++;
+
+    if (!selectStores(First, E, OG, WideSize, MaxWideSize))
+      continue;
+    if (!createWideStores(OG, Replacement, WideSize))
+      continue;
+    if (!replaceStores(OG, Replacement))
+      continue;
+
+    assert(OG.size() > 1 && "Created invalid group");
+    assert(distance(I, E)+1 >= int(OG.size()) && "Too many elements");
+    // The first store was already skipped above; skip the remaining ones
+    // that were folded into the wide store.
+    I += OG.size()-1;
+
+    Modified = true;
+  }
+
+  return Modified;
+}
+
+// Widen the stores of one basic block: build the store groups, sort each
+// group by store offset, and replace whatever can be replaced by wide
+// stores. Blocks are handled independently of each other, so they may be
+// processed in any order, and stopping after any block only means that the
+// remaining blocks are left unwidened. Returns true if MBB was changed.
+bool HexagonStoreWidening::processBasicBlock(MachineBasicBlock &MBB) {
+  InstrGroupList Groups;
+  createStoreGroups(MBB, Groups);
+
+  // Ascending-offset ordering for the stores within a group.
+  auto ByOffset = [] (const MachineInstr *L, const MachineInstr *R) -> bool {
+    return getStoreOffset(L) < getStoreOffset(R);
+  };
+
+  bool Modified = false;
+  for (auto &G : Groups) {
+    assert(G.size() > 1 && "Store group with fewer than 2 elements");
+    std::sort(G.begin(), G.end(), ByOffset);
+
+    if (processStoreGroup(G))
+      Modified = true;
+  }
+
+  return Modified;
+}
+
+// Pass entry point. Caches the per-function context (function, instruction
+// and register info, alias analysis) in the pass members and then runs the
+// widening over every basic block. Returns true if any block was changed.
+bool HexagonStoreWidening::runOnMachineFunction(MachineFunction &MFn) {
+  if (skipFunction(*MFn.getFunction()))
+    return false;
+
+  const auto &Subtarget = MFn.getSubtarget<HexagonSubtarget>();
+  MF = &MFn;
+  TII = Subtarget.getInstrInfo();
+  TRI = Subtarget.getRegisterInfo();
+  MRI = &MFn.getRegInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  bool Modified = false;
+  for (MachineBasicBlock &Block : MFn)
+    Modified = processBasicBlock(Block) || Modified;
+
+  return Modified;
+}
+
+// Factory for the store widening pass; returns a newly allocated instance.
+FunctionPass *llvm::createHexagonStoreWidening() {
+  FunctionPass *Pass = new HexagonStoreWidening();
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
new file mode 100644
index 000000000000..8c23a2465dd6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -0,0 +1,393 @@
+//===-- HexagonSubtarget.cpp - Hexagon Subtarget Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Hexagon specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonSubtarget.h"
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-subtarget"
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "HexagonGenSubtargetInfo.inc"
+
+static cl::opt<bool> EnableMemOps("enable-hexagon-memops",
+ cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true),
+ cl::desc("Generate V4 MEMOP in code generation for Hexagon target"));
+
+static cl::opt<bool> DisableMemOps("disable-hexagon-memops",
+ cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false),
+ cl::desc("Do not generate V4 MEMOP in code generation for Hexagon target")); // Checked before EnableMemOps in the constructor.
+
+static cl::opt<bool> EnableIEEERndNear("enable-hexagon-ieee-rnd-near",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Generate non-chopped conversion from fp to int."));
+
+static cl::opt<bool> EnableBSBSched("enable-bsb-sched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true)); // Combined with hasV60TOps() to set UseBSBScheduling.
+
+static cl::opt<bool> EnableHexagonHVXDouble("enable-hexagon-hvx-double",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable Hexagon Double Vector eXtensions")); // Overrides UseHVXDblOps only if getPosition() is non-zero.
+
+static cl::opt<bool> EnableHexagonHVX("enable-hexagon-hvx",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable Hexagon Vector eXtensions")); // Overrides UseHVXOps only if getPosition() is non-zero.
+
+static cl::opt<bool> EnableTCLatencySched("enable-tc-latency-sched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false)); // Read by updateLatency() as its last alternative.
+
+static cl::opt<bool> EnableDotCurSched("enable-cur-sched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true),
+ cl::desc("Enable the scheduler to generate .cur"));
+
+static cl::opt<bool> EnableVecFrwdSched("enable-evec-frwd-sched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true)); // Read by updateLatency() as its first alternative.
+
+static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon MI Scheduling")); // Consulted only when it occurs on the command line.
+
+static cl::opt<bool> EnableSubregLiveness("hexagon-subreg-liveness",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true),
+ cl::desc("Enable subregister liveness tracking for Hexagon"));
+
+static cl::opt<bool> OverrideLongCalls("hexagon-long-calls",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("If present, forces/disables the use of long calls")); // Overrides UseLongCalls only if getPosition() is non-zero.
+
+// Clear the flags that the constructor derives from command-line options
+// right after calling this function.
+void HexagonSubtarget::initializeEnvironment() {
+  UseMemOps = ModeIEEERndNear = UseBSBScheduling = false;
+}
+
+// Resolve the CPU name, derive the architecture version from it, and set
+// the HVX / long-call flags from the feature string FS. The command-line
+// options for HVX, double HVX and long calls override the feature string,
+// but only when getPosition() reports a non-zero position for them.
+// Returns *this so that the call can be used inside a member initializer.
+HexagonSubtarget &
+HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+  CPUString = Hexagon_MC::selectHexagonCPU(getTargetTriple(), CPU);
+
+  static std::map<StringRef, HexagonArchEnum> CpuTable {
+    { "hexagonv4", V4 },
+    { "hexagonv5", V5 },
+    { "hexagonv55", V55 },
+    { "hexagonv60", V60 },
+  };
+
+  auto Entry = CpuTable.find(CPUString);
+  if (Entry == CpuTable.end())
+    llvm_unreachable("Unrecognized Hexagon processor version");
+  HexagonArchVersion = Entry->second;
+
+  // Defaults, in effect unless the feature string says otherwise.
+  UseHVXOps = false;
+  UseHVXDblOps = false;
+  UseLongCalls = false;
+  ParseSubtargetFeatures(CPUString, FS);
+
+  if (EnableHexagonHVX.getPosition() != 0)
+    UseHVXOps = EnableHexagonHVX;
+  if (EnableHexagonHVXDouble.getPosition() != 0)
+    UseHVXDblOps = EnableHexagonHVXDouble;
+  if (OverrideLongCalls.getPosition() != 0)
+    UseLongCalls = OverrideLongCalls;
+
+  return *this;
+}
+
+// Construct the subtarget. initializeSubtargetDependencies() runs as part
+// of the InstrInfo initializer; the body then resets and recomputes the
+// flags controlled by command-line options.
+HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
+                                   StringRef FS, const TargetMachine &TM)
+    : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+      FrameLowering() {
+
+  initializeEnvironment();
+
+  // Scheduling itinerary for the selected CPU.
+  InstrItins = getInstrItineraryForCPU(CPUString);
+
+  // Memops are used only when enabled (the default) and not explicitly
+  // disabled; the disable flag wins over the enable flag.
+  UseMemOps = !DisableMemOps && EnableMemOps;
+
+  ModeIEEERndNear = EnableIEEERndNear;
+
+  UseBSBScheduling = hasV60TOps() && EnableBSBSched;
+}
+
+
+void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) {
+ for (auto &SU : DAG->SUnits) { // Pass 1: drop output dependences on USR_OVF.
+ if (!SU.isInstr())
+ continue;
+ SmallVector<SDep, 4> Erase; // Collected first, so Preds is not modified while iterated.
+ for (auto &D : SU.Preds)
+ if (D.getKind() == SDep::Output && D.getReg() == Hexagon::USR_OVF)
+ Erase.push_back(D);
+ for (auto &E : Erase)
+ SU.removePred(E);
+ }
+
+ for (auto &SU : DAG->SUnits) {
+ // Update the latency of chain edges between v60 vector load or store
+ // instructions to be 1. These instructions cannot be scheduled in the
+ // same packet.
+ MachineInstr &MI1 = *SU.getInstr(); // NOTE(review): no isInstr() guard here, unlike the loop above - confirm.
+ auto *QII = static_cast<const HexagonInstrInfo*>(DAG->TII);
+ bool IsStoreMI1 = MI1.mayStore();
+ bool IsLoadMI1 = MI1.mayLoad();
+ if (!QII->isV60VectorInstruction(MI1) || !(IsStoreMI1 || IsLoadMI1))
+ continue;
+ for (auto &SI : SU.Succs) {
+ if (SI.getKind() != SDep::Order || SI.getLatency() != 0) // Only zero-latency order edges are adjusted.
+ continue;
+ MachineInstr &MI2 = *SI.getSUnit()->getInstr();
+ if (!QII->isV60VectorInstruction(MI2))
+ continue;
+ if ((IsStoreMI1 && MI2.mayStore()) || (IsLoadMI1 && MI2.mayLoad())) { // Store-store or load-load pair.
+ SI.setLatency(1);
+ SU.setHeightDirty();
+ // Change the dependence in the opposite direction too.
+ for (auto &PI : SI.getSUnit()->Preds) {
+ if (PI.getSUnit() != &SU || PI.getKind() != SDep::Order)
+ continue;
+ PI.setLatency(1);
+ SI.getSUnit()->setDepthDirty();
+ }
+ }
+ }
+ }
+}
+
+
+// Append the Hexagon DAG mutation to the list of post-RA scheduler
+// mutations.
+void HexagonSubtarget::getPostRAMutations(
+      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+  Mutations.emplace_back(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+}
+
+// Append the Hexagon DAG mutation to the list of SMS mutations.
+void HexagonSubtarget::getSMSMutations(
+      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+  Mutations.emplace_back(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+}
+
+
+// Pin the vtable to this file. The function is intentionally empty.
+void HexagonSubtarget::anchor() {}
+
+// The machine scheduler is on by default. The disable flag is consulted
+// only when it was actually given on the command line, in which case its
+// negated value decides.
+bool HexagonSubtarget::enableMachineScheduler() const {
+  const bool FlagGiven = DisableHexagonMISched.getNumOccurrences() != 0;
+  return FlagGiven ? !DisableHexagonMISched : true;
+}
+
+bool HexagonSubtarget::enableSubRegLiveness() const {
+ return EnableSubregLiveness; // Value of the hexagon-subreg-liveness option (initialized to true).
+}
+
+// Helper that can only increase the latency of Dep, by one cycle at most.
+// Nothing happens before V60. Otherwise the first applicable rule wins:
+// vector forwarding, then BSB scheduling, then TClass latency scheduling.
+void HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
+      MachineInstr &DstInst, SDep &Dep) const {
+  if (!hasV60TOps())
+    return;
+
+  const auto *QII = static_cast<const HexagonInstrInfo*>(getInstrInfo());
+  auto AddCycle = [&Dep] { Dep.setLatency(Dep.getLatency() + 1); };
+
+  // Vec frwd scheduling.
+  if (EnableVecFrwdSched && QII->addLatencyToSchedule(SrcInst, DstInst)) {
+    AddCycle();
+    return;
+  }
+
+  // BSB scheduling.
+  if (useBSBScheduling() &&
+      QII->isLateInstrFeedsEarlyInstr(SrcInst, DstInst)) {
+    AddCycle();
+    return;
+  }
+
+  // TClass latency scheduling.
+  if (!EnableTCLatencySched)
+    return;
+
+  // Check if SrcInst produces in 2C an operand of DstInst taken in stage 2B.
+  const bool SrcIsTC = QII->isTC1(SrcInst) || QII->isTC2(SrcInst);
+  if (!SrcIsTC)
+    return;
+  if (QII->isTC1(DstInst) || QII->isTC2(DstInst))
+    return;
+  AddCycle();
+}
+
+/// Return the SUnit at the other end of the first edge in Deps that is an
+/// assigned-register dependence with zero latency and whose instruction is
+/// not a pseudo. Returns nullptr if there is no such edge. N is not used.
+static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) {
+  for (auto &Edge : Deps) {
+    if (!Edge.isAssignedRegDep())
+      continue;
+    if (Edge.getLatency() != 0)
+      continue;
+    SUnit *Other = Edge.getSUnit();
+    if (Other->getInstr()->isPseudo())
+      continue;
+    return Other;
+  }
+  return nullptr;
+}
+
+/// Change the latency between the two SUnits: every edge in Deps that leads to Dst is set to Lat.
+void HexagonSubtarget::changeLatency(SUnit *Src, SmallVector<SDep, 4> &Deps,
+ SUnit *Dst, unsigned Lat) const {
+ MachineInstr &SrcI = *Src->getInstr();
+ for (auto &I : Deps) {
+ if (I.getSUnit() != Dst) // Only edges that lead to Dst are touched.
+ continue;
+ I.setLatency(Lat);
+ SUnit *UpdateDst = I.getSUnit(); // Same node as Dst, given the check above.
+ updateLatency(SrcI, *UpdateDst->getInstr(), I); // May raise the latency above Lat.
+ // Update the latency of opposite edge too.
+ for (auto &PI : UpdateDst->Preds) {
+ if (PI.getSUnit() != Src || !PI.isAssignedRegDep())
+ continue;
+ PI.setLatency(Lat);
+ updateLatency(SrcI, *UpdateDst->getInstr(), PI);
+ }
+ }
+}
+
+// Return true if these are the best two instructions to schedule
+// together with a zero latency. Only one dependence should have a zero
+// latency. If there are multiple choices, choose the best, and change
+// the others, if needed.
+//
+// Src and Dst are the endpoints of the dependence under consideration;
+// TII is not used by this function. The result is false when Dst is a
+// boundary node, when either instruction is a PHI, or when an existing
+// zero-latency edge is preferred (judged by NodeNum). When a new pairing
+// is accepted, the previously preferred edges are set to latency 1
+// through changeLatency().
+bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
+      const HexagonInstrInfo *TII) const {
+  // Ignore Boundary SU nodes as these have null instructions. This must be
+  // tested before the instruction pointer is dereferenced below.
+  if (Dst->isBoundaryNode())
+    return false;
+
+  MachineInstr &SrcInst = *Src->getInstr();
+  MachineInstr &DstInst = *Dst->getInstr();
+
+  if (SrcInst.isPHI() || DstInst.isPHI())
+    return false;
+
+  // Check if the Dst instruction is the best candidate first.
+  SUnit *Best = nullptr;
+  SUnit *DstBest = nullptr;
+  SUnit *SrcBest = getZeroLatency(Dst, Dst->Preds);
+  if (SrcBest == nullptr || Src->NodeNum >= SrcBest->NodeNum) {
+    // Check that Src doesn't have a better candidate.
+    DstBest = getZeroLatency(Src, Src->Succs);
+    if (DstBest == nullptr || Dst->NodeNum <= DstBest->NodeNum)
+      Best = Dst;
+  }
+  if (Best != Dst)
+    return false;
+
+  // The caller frequently adds the same dependence twice. If so, then
+  // return true for this case too.
+  if (Src == SrcBest && Dst == DstBest)
+    return true;
+
+  // Reassign the latency for the previous bests, which requires setting
+  // the dependence edge in both directions.
+  if (SrcBest != nullptr)
+    changeLatency(SrcBest, SrcBest->Succs, Dst, 1);
+  if (DstBest != nullptr)
+    changeLatency(Src, Src->Succs, DstBest, 1);
+  // If there is an edge from SrcBest to DstBest, then try to change that
+  // to 0 now.
+  if (SrcBest && DstBest)
+    changeLatency(SrcBest, SrcBest->Succs, DstBest, 0);
+
+  return true;
+}
+
+// Raise the latency of a dependence that originates at a Phi when the Phi
+// bridges two instructions that require a multi-cycle latency. Dep is set
+// to 2 only if SrcInst is a PHI, Dep currently has latency 0, and Dst has
+// at least one predecessor edge with latency 0.
+void HexagonSubtarget::changePhiLatency(MachineInstr &SrcInst, SUnit *Dst,
+      SDep &Dep) const {
+  const bool Applicable =
+      SrcInst.isPHI() && Dst->NumPreds != 0 && Dep.getLatency() == 0;
+  if (!Applicable)
+    return;
+
+  for (const SDep &Pred : Dst->Preds)
+    if (Pred.getLatency() == 0) {
+      Dep.setLatency(2);
+      return;
+    }
+}
+
+/// \brief Perform target specific adjustments to the latency of a schedule
+/// dependency. Dep is the edge from Src to Dst; only its latency is modified.
+void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
+ SDep &Dep) const {
+ MachineInstr *SrcInst = Src->getInstr();
+ MachineInstr *DstInst = Dst->getInstr();
+ if (!Src->isInstr() || !Dst->isInstr()) // Both ends must be real instructions.
+ return;
+
+ const HexagonInstrInfo *QII = static_cast<const HexagonInstrInfo *>(getInstrInfo());
+
+ // Instructions with .new operands have zero latency.
+ if (QII->canExecuteInBundle(*SrcInst, *DstInst) &&
+ isBestZeroLatency(Src, Dst, QII)) {
+ Dep.setLatency(0);
+ return;
+ }
+
+ if (!hasV60TOps()) // The remaining adjustments apply from V60 on.
+ return;
+
+ // Don't adjust the latency of post-increment part of the instruction.
+ if (QII->isPostIncrement(*SrcInst) && Dep.isAssignedRegDep()) {
+ if (SrcInst->mayStore())
+ return;
+ if (Dep.getReg() != SrcInst->getOperand(0).getReg())
+ return;
+ } else if (QII->isPostIncrement(*DstInst) && Dep.getKind() == SDep::Anti) {
+ if (DstInst->mayStore())
+ return;
+ if (Dep.getReg() != DstInst->getOperand(0).getReg())
+ return;
+ } else if (QII->isPostIncrement(*DstInst) && DstInst->mayStore() &&
+ Dep.isAssignedRegDep()) {
+ MachineOperand &Op = DstInst->getOperand(DstInst->getNumOperands() - 1); // Last operand of the store.
+ if (Op.isReg() && Dep.getReg() != Op.getReg())
+ return;
+ }
+
+ // Check if we need to change any of the latency values when Phis are added.
+ if (useBSBScheduling() && SrcInst->isPHI()) {
+ changePhiLatency(*SrcInst, Dst, Dep);
+ return;
+ }
+
+ // If it's a REG_SEQUENCE, use its destination instruction to determine
+ // the correct latency.
+ if (DstInst->isRegSequence() && Dst->NumSuccs == 1)
+ DstInst = Dst->Succs[0].getSUnit()->getInstr(); // NOTE(review): presumably never a null-instruction boundary node - confirm.
+
+ // Try to schedule uses near definitions to generate .cur.
+ if (EnableDotCurSched && QII->isToBeScheduledASAP(*SrcInst, *DstInst) &&
+ isBestZeroLatency(Src, Dst, QII)) {
+ Dep.setLatency(0);
+ return;
+ }
+
+ updateLatency(*SrcInst, *DstInst, Dep); // Last resort: may add one cycle to Dep.
+}
+
+unsigned HexagonSubtarget::getL1CacheLineSize() const {
+ return 32; // Fixed value; does not depend on the selected CPU.
+}
+
+unsigned HexagonSubtarget::getL1PrefetchDistance() const {
+ return 32; // Fixed value; does not depend on the selected CPU.
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
new file mode 100644
index 000000000000..f2b9cdaad1ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -0,0 +1,157 @@
+//===-- HexagonSubtarget.h - Define Subtarget for the Hexagon ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
+
+#include "HexagonFrameLowering.h"
+#include "HexagonISelLowering.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSelectionDAGInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "HexagonGenSubtargetInfo.inc"
+
+#define Hexagon_SMALL_DATA_THRESHOLD 8
+#define Hexagon_SLOTS 4
+
+namespace llvm {
+
+class HexagonSubtarget : public HexagonGenSubtargetInfo {
+ virtual void anchor();
+
+ bool UseMemOps, UseHVXOps, UseHVXDblOps;
+ bool UseLongCalls;
+ bool ModeIEEERndNear;
+
+public:
+ enum HexagonArchEnum {
+ V4, V5, V55, V60 // Ascending order: the hasVxxTOps() predicates below compare with >=.
+ };
+
+ HexagonArchEnum HexagonArchVersion;
+ /// True if the target should use Back-Skip-Back scheduling. This is the
+ /// default for V60.
+ bool UseBSBScheduling;
+
+ class HexagonDAGMutation : public ScheduleDAGMutation {
+ public:
+ void apply(ScheduleDAGInstrs *DAG) override;
+ };
+
+private:
+ std::string CPUString;
+ HexagonInstrInfo InstrInfo;
+ HexagonTargetLowering TLInfo;
+ HexagonSelectionDAGInfo TSInfo;
+ HexagonFrameLowering FrameLowering;
+ InstrItineraryData InstrItins;
+ void initializeEnvironment();
+
+public:
+ HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ /// getInstrItineraryData - Return the instruction itineraries based on
+ /// subtarget selection.
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+ const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const HexagonRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo(); // The register info is owned by InstrInfo.
+ }
+ const HexagonTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const HexagonFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const HexagonSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ HexagonSubtarget &initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS);
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ bool useMemOps() const { return UseMemOps; }
+ bool hasV5TOps() const { return getHexagonArchVersion() >= V5; }
+ bool hasV5TOpsOnly() const { return getHexagonArchVersion() == V5; }
+ bool hasV55TOps() const { return getHexagonArchVersion() >= V55; }
+ bool hasV55TOpsOnly() const { return getHexagonArchVersion() == V55; }
+ bool hasV60TOps() const { return getHexagonArchVersion() >= V60; }
+ bool hasV60TOpsOnly() const { return getHexagonArchVersion() == V60; }
+ bool modeIEEERndNear() const { return ModeIEEERndNear; }
+ bool useHVXOps() const { return UseHVXOps; }
+ bool useHVXDblOps() const { return UseHVXOps && UseHVXDblOps; } // Double mode requires HVX itself.
+ bool useHVXSglOps() const { return UseHVXOps && !UseHVXDblOps; } // HVX enabled, double mode off.
+ bool useLongCalls() const { return UseLongCalls; }
+
+ bool useBSBScheduling() const { return UseBSBScheduling; }
+ bool enableMachineScheduler() const override;
+ // Always use the TargetLowering default scheduler.
+ // FIXME: This will use the vliw scheduler which is probably just hurting
+ // compiler time and will be removed eventually anyway.
+ bool enableMachineSchedDefaultSched() const override { return false; }
+
+ AntiDepBreakMode getAntiDepBreakMode() const override { return ANTIDEP_ALL; }
+ bool enablePostRAScheduler() const override { return true; }
+
+ bool enableSubRegLiveness() const override;
+
+ const std::string &getCPUString () const { return CPUString; }
+
+ // Threshold for small data section
+ unsigned getSmallDataThreshold() const {
+ return Hexagon_SMALL_DATA_THRESHOLD;
+ }
+ const HexagonArchEnum &getHexagonArchVersion() const {
+ return HexagonArchVersion;
+ }
+
+ void getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
+ const override;
+
+ void getSMSMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
+ const override;
+
+ /// \brief Perform target specific adjustments to the latency of a schedule
+ /// dependency.
+ void adjustSchedDependency(SUnit *def, SUnit *use, SDep& dep) const override;
+
+ unsigned getL1CacheLineSize() const;
+ unsigned getL1PrefetchDistance() const;
+
+private:
+ // Helper function responsible for increasing the latency only.
+ void updateLatency(MachineInstr &SrcInst, MachineInstr &DstInst, SDep &Dep)
+ const;
+ void changeLatency(SUnit *Src, SmallVector<SDep, 4> &Deps, SUnit *Dst,
+ unsigned Lat) const; // Sets the latency of the Src-to-Dst edges found in Deps.
+ bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII)
+ const;
+ void changePhiLatency(MachineInstr &SrcInst, SUnit *Dst, SDep &Dep) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td b/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td
new file mode 100644
index 000000000000..629a98749ee9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td
@@ -0,0 +1,134 @@
+//==- HexagonSystemInst.td - System Instructions for Hexagon -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon system instructions (cache manipulation,
+// memory synchronization, traps and pause) in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Cache manipulation instructions.
+//===----------------------------------------------------------------------===//
+// Common encoding for the cache-maintenance instructions defined below.
+// The class derives from ST0Inst (with ST_tc_ld_SLOT0) and is marked mayStore.
+// Encoding layout:
+//   Inst{31-28} fixed to 0b1010
+//   Inst{27-25} amode, Inst{24-22} type, Inst{21} un  (select the operation)
+//   Inst{20-16} Rs, Inst{12-8} Rt, Inst{4-0} Rd       (5-bit register fields)
+let mayStore = 1 in
+class ST_MISC_CACHEOP<dag outs, dag ins,
+ string asmstr, list<dag> pattern = [],
+ bits<3> amode, bits<3> type, bits<1> un>
+ : ST0Inst<outs, ins, asmstr, pattern, "", ST_tc_ld_SLOT0> {
+
+ bits<5> Rs;
+ bits<5> Rt;
+ bits<5> Rd;
+ let Inst{31-28} = 0b1010;
+ let Inst{27-25} = amode;
+ let Inst{24-22} = type;
+ let Inst{21} = un;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{4-0} = Rd;
+}
+
+// Same bit layout as ST_MISC_CACHEOP (0b1010 in Inst{31-28}; amode, type and
+// un in Inst{27-21}; Rs/Rt/Rd in Inst{20-16}/{12-8}/{4-0}), but derived from
+// SYSInst instead of ST0Inst.
+let mayStore = 1 in
+class ST_MISC_CACHEOP_SYS<dag outs, dag ins,
+ string asmstr, list<dag> pattern = [],
+ bits<3> amode, bits<3> type, bits<1> un>
+ : SYSInst<outs, ins, asmstr, pattern, ""> {
+
+ bits<5> Rs;
+ bits<5> Rt;
+ bits<5> Rd;
+ let Inst{31-28} = 0b1010;
+ let Inst{27-25} = amode;
+ let Inst{24-22} = type;
+ let Inst{21} = un;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{4-0} = Rd;
+}
+
+
+// "syncht" takes no operands, so all three register fields are encoded as 0.
+// It is marked isSolo.
+let isSolo = 1, Rs = 0, Rt = 0, Rd = 0 in {
+def Y2_syncht: ST_MISC_CACHEOP <(outs), (ins),
+ "syncht" , [], 0b100, 0b001, 0b0>;
+}
+
+// Data-cache operations that take a single register operand $Rs; the unused
+// Rt and Rd fields are encoded as 0. All three share amode 0b000 and are
+// distinguished by the (type, un) pair.
+let Rt = 0, Rd = 0 in {
+let isSoloAin1 = 1 in {
+ def Y2_dccleana: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
+ "dccleana($Rs)", [], 0b000, 0b000, 0b0>;
+ def Y2_dcinva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
+ "dcinva($Rs)", [], 0b000, 0b000, 0b1>;
+ def Y2_dccleaninva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
+ "dccleaninva($Rs)", [], 0b000, 0b001, 0b0>;
+ }
+}
+
+// l2fetch in two forms that differ in the register class of $Rt (IntRegs for
+// Y4, DoubleRegs for Y5) and in the type field (0b000 vs 0b010). Both are
+// marked as having side effects and encode Rd as 0.
+let isSoloAX = 1, hasSideEffects = 1, Rd = 0 in {
+ def Y4_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "l2fetch($Rs, $Rt)", [], 0b011, 0b000, 0b0>;
+ def Y5_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, DoubleRegs:$Rt),
+ "l2fetch($Rs, $Rt)", [], 0b011, 0b010, 0b0>;
+}
+
+// Single-register cache-invalidate instruction of the form "<mnemonic>($Rs)".
+// Encoding: IClass 0b0101, Inst{27-21} = 0b0110110, Rs in Inst{20-16},
+// Inst{13-12} = 0b00 and the MajOp bit in Inst{11}.
+let hasSideEffects = 0, isSolo = 1 in
+class Y2_INVALIDATE_CACHE<string mnemonic, bit MajOp>
+ : JRInst <
+ (outs), (ins IntRegs:$Rs),
+ #mnemonic#"($Rs)" > {
+ bits<5> Rs;
+
+ let IClass = 0b0101;
+ let Inst{27-21} = 0b0110110;
+ let Inst{20-16} = Rs;
+ let Inst{13-12} = 0b00;
+ let Inst{11} = MajOp;
+ }
+// Instruction cache invalidate: "icinva($Rs)", encoded with MajOp = 0.
+def Y2_icinva : Y2_INVALIDATE_CACHE<"icinva", 0b0>;
+
+// Zero an aligned 32-byte cacheline.
+// Encoding: IClass 0b1010, Inst{27-21} = 0b0000110, Rs in Inst{20-16} and
+// Inst{13} = 0.
+let isSoloAin1 = 1 in
+def Y2_dczeroa: ST0Inst <(outs), (ins IntRegs:$Rs),
+ "dczeroa($Rs)"> {
+ bits<5> Rs;
+ let IClass = 0b1010;
+ let Inst{27-21} = 0b0000110;
+ let Inst{13} = 0b0;
+ let Inst{20-16} = Rs;
+ }
+
+// Memory synchronization.
+// "isync" has no operands; its encoding consists only of fixed bits
+// (IClass 0b0101, Inst{27-16}, Inst{13} and Inst{9-0}).
+let hasSideEffects = 0, isSolo = 1 in
+def Y2_isync: JRInst <(outs), (ins),
+ "isync"> {
+ let IClass = 0b0101;
+ let Inst{27-16} = 0b011111000000;
+ let Inst{13} = 0b0;
+ let Inst{9-0} = 0b0000000010;
+ }
+
+//===----------------------------------------------------------------------===//
+// System/User instructions.
+//===----------------------------------------------------------------------===//
+// traps and pause
+// Shared format "<mnemonic>(#$u8)" taking one 8-bit immediate. MajOp in
+// Inst{23-22} selects the instruction. The immediate is split across two
+// fields: its upper five bits go to Inst{12-8} and its lower three bits to
+// Inst{4-2}.
+let hasSideEffects = 0, isSolo = 1 in
+class J2_MISC_TRAP_PAUSE<string mnemonic, bits<2> MajOp>
+ : JRInst
+ <(outs), (ins u8_0Imm:$u8),
+ #mnemonic#"(#$u8)"> {
+ bits<8> u8;
+
+ let IClass = 0b0101;
+ let Inst{27-24} = 0b0100;
+ let Inst{23-22} = MajOp;
+ let Inst{12-8} = u8{7-3};
+ let Inst{4-2} = u8{2-0};
+ }
+// Concrete instructions; they differ only in the mnemonic and the MajOp value.
+def J2_trap0 : J2_MISC_TRAP_PAUSE<"trap0", 0b00>;
+def J2_trap1 : J2_MISC_TRAP_PAUSE<"trap1", 0b10>;
+def J2_pause : J2_MISC_TRAP_PAUSE<"pause", 0b01>;
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
new file mode 100644
index 000000000000..132d12a66d46
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -0,0 +1,345 @@
+//===-- HexagonTargetMachine.cpp - Define TargetMachine for Hexagon -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Hexagon target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetMachine.h"
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonMachineScheduler.h"
+#include "HexagonTargetObjectFile.h"
+#include "HexagonTargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+// Hidden command-line options for the Hexagon backend. Most of them gate an
+// individual pass that HexagonPassConfig adds to the codegen pipeline; the
+// value given to cl::init is the default.
+static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore,
+ cl::init(true), cl::desc("Enable RDF-based optimizations"));
+
+static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops",
+ cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target"));
+
+static cl::opt<bool> DisableAModeOpt("disable-hexagon-amodeopt",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon Addressing Mode Optimization"));
+
+static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon CFG Optimization"));
+
+static cl::opt<bool> DisableHCP("disable-hcp", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore, cl::desc("Disable Hexagon constant propagation"));
+
+static cl::opt<bool> DisableStoreWidening("disable-store-widen",
+ cl::Hidden, cl::init(false), cl::desc("Disable store widening"));
+
+static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets",
+ cl::init(true), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Early expansion of MUX"));
+
+static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden,
+ cl::ZeroOrMore, cl::desc("Enable early if-conversion"));
+
+static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true),
+ cl::Hidden, cl::desc("Generate \"insert\" instructions"));
+
+static cl::opt<bool> EnableCommGEP("hexagon-commgep", cl::init(true),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Enable commoning of GEP instructions"));
+
+static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true),
+ cl::Hidden, cl::desc("Generate \"extract\" instructions"));
+
+static cl::opt<bool> EnableGenMux("hexagon-mux", cl::init(true), cl::Hidden,
+ cl::desc("Enable converting conditional transfers into MUX instructions"));
+
+static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true),
+ cl::Hidden, cl::desc("Enable conversion of arithmetic operations to "
+ "predicate instructions"));
+
+static cl::opt<bool> EnableLoopPrefetch("hexagon-loop-prefetch",
+ cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Enable loop data prefetch on Hexagon"));
+
+static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
+ cl::desc("Disable splitting double registers"));
+
+static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true),
+ cl::Hidden, cl::desc("Bit simplification"));
+
+static cl::opt<bool> EnableLoopResched("hexagon-loop-resched", cl::init(true),
+ cl::Hidden, cl::desc("Loop rescheduling"));
+
+// Unlike the per-pass switches above, this one is read by the
+// HexagonTargetMachine constructor, which then builds the target machine with
+// CodeGenOpt::None instead of the requested optimization level.
+static cl::opt<bool> HexagonNoOpt("hexagon-noopt", cl::init(false),
+ cl::Hidden, cl::desc("Disable backend optimizations"));
+
+static cl::opt<bool> EnableVectorPrint("enable-hexagon-vector-print",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable Hexagon Vector print instr pass"));
+
+/// HexagonTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+/// The symbol has C linkage and is defined here with the value 0.
+extern "C" int HexagonTargetMachineModule;
+int HexagonTargetMachineModule = 0;
+
+/// C-linkage initialization hook: registers HexagonTargetMachine as the
+/// target machine for the target returned by getTheHexagonTarget().
+extern "C" void LLVMInitializeHexagonTarget() {
+ // Register the target.
+ RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
+}
+
+/// Factory for the Hexagon machine scheduler: a VLIWMachineScheduler driven
+/// by a ConvergingVLIWScheduler strategy. The result is heap-allocated with
+/// `new` and returned as a raw pointer.
+static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
+ return new VLIWMachineScheduler(C, make_unique<ConvergingVLIWScheduler>());
+}
+
+// Registers the factory above with the machine scheduler registry under the
+// name "hexagon".
+static MachineSchedRegistry
+SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
+ createVLIWMachineSched);
+
+// Forward declarations of the Hexagon pass factories (and the ExpandCondsets
+// pass ID/initializer) used by this file when building the pass pipeline.
+// Their definitions are not part of this file.
+namespace llvm {
+ extern char &HexagonExpandCondsetsID;
+ void initializeHexagonExpandCondsetsPass(PassRegistry&);
+
+ FunctionPass *createHexagonBitSimplify();
+ FunctionPass *createHexagonBranchRelaxation();
+ FunctionPass *createHexagonCallFrameInformation();
+ FunctionPass *createHexagonCFGOptimizer();
+ FunctionPass *createHexagonCommonGEP();
+ FunctionPass *createHexagonConstPropagationPass();
+ FunctionPass *createHexagonCopyToCombine();
+ FunctionPass *createHexagonEarlyIfConversion();
+ FunctionPass *createHexagonFixupHwLoops();
+ FunctionPass *createHexagonGenExtract();
+ FunctionPass *createHexagonGenInsert();
+ FunctionPass *createHexagonGenMux();
+ FunctionPass *createHexagonGenPredicate();
+ FunctionPass *createHexagonHardwareLoops();
+ FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+ FunctionPass *createHexagonLoopRescheduling();
+ FunctionPass *createHexagonNewValueJump();
+ FunctionPass *createHexagonOptimizeSZextends();
+ FunctionPass *createHexagonOptAddrMode();
+ FunctionPass *createHexagonPacketizer();
+ FunctionPass *createHexagonPeephole();
+ FunctionPass *createHexagonRDFOpt();
+ FunctionPass *createHexagonSplitConst32AndConst64();
+ FunctionPass *createHexagonSplitDoubleRegs();
+ FunctionPass *createHexagonStoreWidening();
+ FunctionPass *createHexagonVectorPrint();
+} // end namespace llvm;
+
+/// Resolve the relocation model for the target machine: use the caller's
+/// choice when one was supplied, otherwise fall back to Reloc::Static.
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ return RM.hasValue() ? *RM : Reloc::Static;
+}
+
+/// Construct the Hexagon target machine.
+///
+/// The data layout string is fixed here. A missing relocation model is
+/// resolved through getEffectiveRelocModel(), and the -hexagon-noopt option
+/// overrides the requested optimization level \p OL with CodeGenOpt::None.
+/// The constructor also creates the HexagonTargetObjectFile, initializes the
+/// ExpandCondsets pass in the global pass registry and calls initAsmInfo().
+HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ // Specify the vector alignment explicitly. For v512x1, the calculated
+ // alignment would be 512*alignment(i1), which is 512 bytes, instead of
+ // the required minimum of 64 bytes.
+ : LLVMTargetMachine(
+ T, "e-m:e-p:32:32:32-a:0-n16:32-"
+ "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-"
+ "v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048",
+ TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM,
+ (HexagonNoOpt ? CodeGenOpt::None : OL)),
+ TLOF(make_unique<HexagonTargetObjectFile>()) {
+ initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+ initAsmInfo();
+}
+
+/// Return the subtarget to use for \p F, creating and caching it on first use.
+///
+/// The CPU and feature strings come from the function's "target-cpu" and
+/// "target-features" attributes when present, and from the target machine's
+/// own TargetCPU/TargetFS otherwise. Subtargets are cached in SubtargetMap
+/// under the concatenation of the two strings.
+const HexagonSubtarget *
+HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
+ AttributeSet Attrs = F.getAttributes();
+ Attribute CPUAttr =
+ Attrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+ Attribute FSAttr =
+ Attrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+ // Start from the target machine defaults and let function-level attributes
+ // override them.
+ std::string CPU = TargetCPU;
+ if (!CPUAttr.hasAttribute(Attribute::None))
+ CPU = CPUAttr.getValueAsString().str();
+ std::string FS = TargetFS;
+ if (!FSAttr.hasAttribute(Attribute::None))
+ FS = FSAttr.getValueAsString().str();
+
+ std::unique_ptr<HexagonSubtarget> &Slot = SubtargetMap[CPU + FS];
+ if (Slot)
+ return Slot.get();
+
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ Slot = llvm::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this);
+ return Slot.get();
+}
+
+/// Build the TargetIRAnalysis for this target: for each function it yields a
+/// TargetTransformInfo wrapping a HexagonTTIImpl. The callback captures
+/// `this`, so it must not be invoked after this target machine is destroyed.
+TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(HexagonTTIImpl(this, F));
+ });
+}
+
+
+// Defined out of line; members clean up after themselves.
+HexagonTargetMachine::~HexagonTargetMachine() = default;
+
+namespace {
+/// Hexagon Code Generator Pass Configuration Options.
+///
+/// Overrides the TargetPassConfig hooks to insert the Hexagon-specific passes
+/// and to select the VLIW machine scheduler.
+class HexagonPassConfig : public TargetPassConfig {
+public:
+ HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ /// The target machine, as its concrete Hexagon type.
+ HexagonTargetMachine &getHexagonTargetMachine() const {
+ return getTM<HexagonTargetMachine>();
+ }
+
+ /// Always schedule with the Hexagon VLIW machine scheduler.
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ return createVLIWMachineSched(C);
+ }
+
+ // Pipeline hooks; each one is defined later in this file.
+ void addIRPasses() override;
+ bool addInstSelector() override;
+ void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+/// Create the Hexagon pass configuration for \p PM. The object is allocated
+/// with `new` and returned as a raw pointer.
+TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new HexagonPassConfig(this, PM);
+}
+
+/// Add the IR-level passes: the generic ones, then atomic expansion, and
+/// (only when optimizing) loop data prefetch, GEP commoning and extract
+/// generation, each subject to its command-line switch.
+void HexagonPassConfig::addIRPasses() {
+ TargetPassConfig::addIRPasses();
+ addPass(createAtomicExpandPass(TM));
+
+ // Everything below is optimization only.
+ if (getOptLevel() == CodeGenOpt::None)
+ return;
+
+ if (EnableLoopPrefetch)
+ addPass(createLoopDataPrefetchPass());
+ if (EnableCommGEP)
+ addPass(createHexagonCommonGEP());
+ // Replace certain combinations of shifts and ands with extracts.
+ if (EnableGenExtract)
+ addPass(createHexagonGenExtract());
+}
+
+/// Add instruction selection and, when optimizing, the machine-level cleanup
+/// passes that run right after it. Always returns false.
+bool HexagonPassConfig::addInstSelector() {
+ HexagonTargetMachine &HTM = getHexagonTargetMachine();
+ const CodeGenOpt::Level Level = getOptLevel();
+ const bool Optimize = (Level != CodeGenOpt::None);
+
+ if (Optimize)
+ addPass(createHexagonOptimizeSZextends());
+
+ addPass(createHexagonISelDag(HTM, Level));
+
+ // Nothing else is added when optimizations are off.
+ if (!Optimize)
+ return false;
+
+ // Create logical operations on predicate registers.
+ if (EnableGenPred)
+ addPass(createHexagonGenPredicate(), false);
+ // Rotate loops to expose bit-simplification opportunities.
+ if (EnableLoopResched)
+ addPass(createHexagonLoopRescheduling(), false);
+ // Split double registers.
+ if (!DisableHSDR)
+ addPass(createHexagonSplitDoubleRegs());
+ // Bit simplification.
+ if (EnableBitSimplify)
+ addPass(createHexagonBitSimplify(), false);
+ addPass(createHexagonPeephole());
+ printAndVerify("After hexagon peephole pass");
+ // Constant propagation, followed by removal of blocks it made unreachable.
+ if (!DisableHCP) {
+ addPass(createHexagonConstPropagationPass(), false);
+ addPass(&UnreachableMachineBlockElimID, false);
+ }
+ if (EnableGenInsert)
+ addPass(createHexagonGenInsert(), false);
+ if (EnableEarlyIf)
+ addPass(createHexagonEarlyIfConversion(), false);
+
+ return false;
+}
+
+/// Add the passes that run before register allocation. The first three are
+/// added only when optimizing and subject to their command-line switches; the
+/// machine pipeliner is added at CodeGenOpt::Default and above.
+void HexagonPassConfig::addPreRegAlloc() {
+ const bool Optimize = (getOptLevel() != CodeGenOpt::None);
+
+ // Condset expansion is scheduled relative to the register coalescer rather
+ // than appended at this point.
+ if (Optimize && EnableExpandCondsets)
+ insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID);
+ if (Optimize && !DisableStoreWidening)
+ addPass(createHexagonStoreWidening(), false);
+ if (Optimize && !DisableHardwareLoops)
+ addPass(createHexagonHardwareLoops(), false);
+
+ if (TM->getOptLevel() >= CodeGenOpt::Default)
+ addPass(&MachinePipelinerID);
+}
+
+/// Add the passes that run after register allocation. All of them are
+/// optimizations, so nothing is added at CodeGenOpt::None.
+void HexagonPassConfig::addPostRegAlloc() {
+ if (getOptLevel() == CodeGenOpt::None)
+ return;
+
+ if (EnableRDFOpt)
+ addPass(createHexagonRDFOpt());
+ if (!DisableHexagonCFGOpt)
+ addPass(createHexagonCFGOptimizer(), false);
+ if (!DisableAModeOpt)
+ addPass(createHexagonOptAddrMode(), false);
+}
+
+/// Add the passes that run before the second scheduling pass. CopyToCombine
+/// and the CONST32/CONST64 splitting are always added; the if-converter is
+/// added only when optimizing.
+void HexagonPassConfig::addPreSched2() {
+ addPass(createHexagonCopyToCombine(), false);
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(&IfConverterID, false);
+ addPass(createHexagonSplitConst32AndConst64());
+}
+
+/// Add the final passes before code emission. Branch relaxation and CFI
+/// generation are always added; new-value jumps, hardware-loop fixup, MUX
+/// generation and packetization are added only when optimizing.
+void HexagonPassConfig::addPreEmitPass() {
+ const bool Optimize = (getOptLevel() != CodeGenOpt::None);
+
+ if (Optimize)
+ addPass(createHexagonNewValueJump(), false);
+
+ addPass(createHexagonBranchRelaxation(), false);
+
+ if (Optimize) {
+ if (!DisableHardwareLoops)
+ addPass(createHexagonFixupHwLoops(), false);
+ // Generate MUX from pairs of conditional transfers.
+ if (EnableGenMux)
+ addPass(createHexagonGenMux(), false);
+
+ // Create Packets.
+ addPass(createHexagonPacketizer(), false);
+ }
+
+ if (EnableVectorPrint)
+ addPass(createHexagonVectorPrint(), false);
+
+ // Add CFI instructions if necessary.
+ addPass(createHexagonCallFrameInformation(), false);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
new file mode 100644
index 000000000000..70835c0d4ac5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -0,0 +1,50 @@
+//=-- HexagonTargetMachine.h - Define TargetMachine for Hexagon ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETMACHINE_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETMACHINE_H
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class Module;
+
+/// Hexagon-specific target machine. Owns the object-file lowering and a
+/// cache of subtargets.
+class HexagonTargetMachine : public LLVMTargetMachine {
+ // Object-file lowering; getObjFileLowering() downcasts it to
+ // HexagonTargetObjectFile.
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ // Cache of subtargets keyed by string. Declared mutable so the const
+ // getSubtargetImpl() can populate it.
+ mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap;
+
+public:
+ HexagonTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+ ~HexagonTargetMachine() override;
+ const HexagonSubtarget *getSubtargetImpl(const Function &F) const override;
+
+ // NOTE(review): no definition of this function is visible in
+ // HexagonTargetMachine.cpp; confirm whether the declaration is still needed.
+ static unsigned getModuleMatchQuality(const Module &M);
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+ /// Returns the object-file lowering as its concrete Hexagon type.
+ HexagonTargetObjectFile *getObjFileLowering() const override {
+ return static_cast<HexagonTargetObjectFile*>(TLOF.get());
+ }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
new file mode 100644
index 000000000000..e902f600e881
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -0,0 +1,382 @@
+//===-- HexagonTargetObjectFile.cpp ---------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of HexagonTargetObjectFile: section
+// selection for globals, including placement into the small data sections.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon-sdata"
+
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+// Hidden command-line options controlling small-data placement.
+
+// Largest object size, in bytes, that is still placed in small data
+// (default 8). A value of 0 makes isSmallDataEnabled() return false.
+static cl::opt<unsigned> SmallDataThreshold("hexagon-small-data-threshold",
+ cl::init(8), cl::Hidden,
+ cl::desc("The maximum size of an object in the sdata section"));
+
+// When set, small objects go to the plain .sdata/.sbss sections instead of
+// the size-suffixed ones.
+static cl::opt<bool> NoSmallDataSorting("mno-sort-sda", cl::init(false),
+ cl::Hidden, cl::desc("Disable small data sections sorting"));
+
+// Variables with local linkage are kept out of small data unless this is set.
+static cl::opt<bool> StaticsInSData("hexagon-statics-in-small-data",
+ cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Allow static variables in .sdata"));
+
+// Makes the TRACE() messages in this file print to errs() in every build.
+static cl::opt<bool> TraceGVPlacement("trace-gv-placement",
+ cl::Hidden, cl::init(false),
+ cl::desc("Trace global value placement"));
+
+// TraceGVPlacement controls messages for all builds. For builds with assertions
+// (debug or release), messages are also controlled by the usual debug flags
+// (e.g. -debug and -debug-only=hexagon-sdata, this file's DEBUG_TYPE).
+//
+// TRACE(X) streams the expression chain X to errs() when -trace-gv-placement
+// is given. Without NDEBUG it otherwise falls back to DEBUG(), which streams
+// to dbgs(); with NDEBUG it prints nothing in that case.
+#define TRACE_TO(s, X) s << X
+#ifdef NDEBUG
+#define TRACE(X) do { if (TraceGVPlacement) { TRACE_TO(errs(), X); } } while (0)
+#else
+#define TRACE(X) \
+ do { \
+ if (TraceGVPlacement) { TRACE_TO(errs(), X); } \
+ else { DEBUG( TRACE_TO(dbgs(), X) ); } \
+ } while (0)
+#endif
+
+// Returns true if a symbol with the section name Sec belongs in a small data
+// section. That is the case when the name is exactly ".sdata", ".sbss" or
+// ".scommon", or when it contains ".sdata.", ".sbss." or ".scommon." anywhere.
+// Requiring the trailing dot in the substring form keeps names such as
+// ".sdatafoo" out of small data.
+static bool isSmallDataSection(StringRef Sec) {
+ // Exact section names.
+ if (Sec == ".sdata" || Sec == ".sbss" || Sec == ".scommon")
+ return true;
+
+ // Sub-sections: the dotted prefix may appear anywhere in the name.
+ static const char *const DottedNames[] = {".sdata.", ".sbss.", ".scommon."};
+ for (const char *Dotted : DottedNames)
+ if (Sec.find(Dotted) != StringRef::npos)
+ return true;
+ return false;
+}
+
+
+// Maps an element size in bytes to the suffix appended to a small-data section
+// name: ".1", ".2", ".4" or ".8". Any other size gets no suffix.
+static const char *getSectionSuffixForSize(unsigned Size) {
+ if (Size == 1)
+ return ".1";
+ if (Size == 2)
+ return ".2";
+ if (Size == 4)
+ return ".4";
+ if (Size == 8)
+ return ".8";
+ return "";
+}
+
+/// Run the generic ELF initialization, then create the default ".sdata"
+/// (SHT_PROGBITS) and ".sbss" (SHT_NOBITS) sections. Both are writable,
+/// allocated and flagged SHF_HEX_GPREL.
+void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+
+ SmallDataSection =
+ getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC |
+ ELF::SHF_HEX_GPREL);
+ SmallBSSSection =
+ getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC |
+ ELF::SHF_HEX_GPREL);
+}
+
+/// Choose the section for a global without an explicit section.
+///
+/// Objects that qualify for small data are routed to
+/// selectSmallSectionForGlobal(). Remaining commons get BSSSection, and
+/// everything else is handled by the generic ELF implementation. The TRACE
+/// calls only emit diagnostics and do not affect the result.
+MCSection *HexagonTargetObjectFile::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ TRACE("[SelectSectionForGlobal] GO(" << GO->getName() << ") ");
+ TRACE("input section(" << GO->getSection() << ") ");
+
+ TRACE((GO->hasPrivateLinkage() ? "private_linkage " : "")
+ << (GO->hasLocalLinkage() ? "local_linkage " : "")
+ << (GO->hasInternalLinkage() ? "internal " : "")
+ << (GO->hasExternalLinkage() ? "external " : "")
+ << (GO->hasCommonLinkage() ? "common_linkage " : "")
+ << (GO->hasCommonLinkage() ? "common " : "" )
+ << (Kind.isCommon() ? "kind_common " : "" )
+ << (Kind.isBSS() ? "kind_bss " : "" )
+ << (Kind.isBSSLocal() ? "kind_bss_local " : "" ));
+
+ if (isGlobalInSmallSection(GO, TM))
+ return selectSmallSectionForGlobal(GO, Kind, TM);
+
+ if (Kind.isCommon()) {
+ // This is purely for LTO+Linker Script because commons don't really have a
+ // section. However, the BitcodeSectionWriter pass will query for the
+ // sections of commons (and the linker expects us to know their section) so
+ // we'll return one here.
+ return BSSSection;
+ }
+
+ TRACE("default_ELF_section\n");
+ // Otherwise, we work the same as ELF.
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+/// Choose the section for a global, honoring an explicit section if it has
+/// one.
+///
+/// Section names containing ".access.text.group" or ".access.data.group" get
+/// dedicated SHT_PROGBITS sections (executable and writable, respectively).
+/// Otherwise small-data candidates go through selectSmallSectionForGlobal(),
+/// and everything else is handled by the generic ELF implementation.
+MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ TRACE("[getExplicitSectionGlobal] GO(" << GO->getName() << ") from("
+ << GO->getSection() << ") ");
+ TRACE((GO->hasPrivateLinkage() ? "private_linkage " : "")
+ << (GO->hasLocalLinkage() ? "local_linkage " : "")
+ << (GO->hasInternalLinkage() ? "internal " : "")
+ << (GO->hasExternalLinkage() ? "external " : "")
+ << (GO->hasCommonLinkage() ? "common_linkage " : "")
+ << (GO->hasCommonLinkage() ? "common " : "" )
+ << (Kind.isCommon() ? "kind_common " : "" )
+ << (Kind.isBSS() ? "kind_bss " : "" )
+ << (Kind.isBSSLocal() ? "kind_bss_local " : "" ));
+
+ if (GO->hasSection()) {
+ StringRef Section = GO->getSection();
+ // The group name may appear anywhere in the section name.
+ if (Section.find(".access.text.group") != StringRef::npos)
+ return getContext().getELFSection(GO->getSection(), ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_EXECINSTR);
+ if (Section.find(".access.data.group") != StringRef::npos)
+ return getContext().getELFSection(GO->getSection(), ELF::SHT_PROGBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ }
+
+ if (isGlobalInSmallSection(GO, TM))
+ return selectSmallSectionForGlobal(GO, Kind, TM);
+
+ // Otherwise, we work the same as ELF.
+ TRACE("default_ELF_section\n");
+ return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, Kind, TM);
+}
+
+
+/// Return true if this global value should be placed into small data/bss
+/// section.
+///
+/// Only global variables qualify. A variable with an explicit section is
+/// classified purely by that section's name. Otherwise constants,
+/// local-linkage variables (unless -hexagon-statics-in-small-data is given),
+/// arrays, opaque structs and zero-sized objects are rejected, and the rest
+/// qualify when their allocation size does not exceed
+/// -hexagon-small-data-threshold.
+bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
+ const TargetMachine &TM) const {
+ DEBUG(dbgs() << "Checking if value is in small-data, -G"
+ << SmallDataThreshold << ": \"" << GO->getName() << "\": ");
+ // Only global variables, not functions.
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GO);
+ if (!GVar) {
+ DEBUG(dbgs() << "no, not a global variable\n");
+ return false;
+ }
+
+ // Globals with external linkage that have an original section set must be
+ // emitted to that section, regardless of whether we would put them into
+ // small data or not. This is how we can support mixing -G0/-G8 in LTO.
+ if (GVar->hasSection()) {
+ bool IsSmall = isSmallDataSection(GVar->getSection());
+ DEBUG(dbgs() << (IsSmall ? "yes" : "no") << ", has section: "
+ << GVar->getSection() << '\n');
+ return IsSmall;
+ }
+
+ if (GVar->isConstant()) {
+ DEBUG(dbgs() << "no, is a constant\n");
+ return false;
+ }
+
+ bool IsLocal = GVar->hasLocalLinkage();
+ if (!StaticsInSData && IsLocal) {
+ DEBUG(dbgs() << "no, is static\n");
+ return false;
+ }
+
+ // Look through the pointer type of the global to the type of the object.
+ Type *GType = GVar->getType();
+ if (PointerType *PT = dyn_cast<PointerType>(GType))
+ GType = PT->getElementType();
+
+ if (isa<ArrayType>(GType)) {
+ DEBUG(dbgs() << "no, is an array\n");
+ return false;
+ }
+
+ // If the type is a struct with no body provided, treat is conservatively.
+ // There cannot be actual definitions of object of such a type in this CU
+ // (only references), so assuming that they are not in sdata is safe. If
+ // these objects end up in the sdata, the references will still be valid.
+ if (StructType *ST = dyn_cast<StructType>(GType)) {
+ if (ST->isOpaque()) {
+ DEBUG(dbgs() << "no, has opaque type\n");
+ return false;
+ }
+ }
+
+ // Keep the full 64-bit allocation size. Narrowing it to 32 bits would let
+ // an object larger than 4GB (for example a struct wrapping a huge array)
+ // wrap around to a small value and be misclassified as small data.
+ uint64_t Size = GVar->getParent()->getDataLayout().getTypeAllocSize(GType);
+ if (Size == 0) {
+ DEBUG(dbgs() << "no, has size 0\n");
+ return false;
+ }
+ if (Size > SmallDataThreshold) {
+ DEBUG(dbgs() << "no, size exceeds sdata threshold: " << Size << '\n');
+ return false;
+ }
+
+ DEBUG(dbgs() << "yes\n");
+ return true;
+}
+
+
+/// Small data is enabled whenever -hexagon-small-data-threshold is non-zero.
+bool HexagonTargetObjectFile::isSmallDataEnabled() const {
+ return SmallDataThreshold > 0;
+}
+
+
+/// Returns the current value of -hexagon-small-data-threshold.
+unsigned HexagonTargetObjectFile::getSmallDataSize() const {
+ return SmallDataThreshold;
+}
+
+
+/// Descends any type down to "elementary" components,
+/// discovering the smallest addressable one.
+/// If zero is returned, declaration will not be modified.
+///
+/// \param Ty type to inspect; a null pointer yields 0.
+/// \param GV global whose module supplies the DataLayout for scalar sizes.
+/// \param TM only forwarded to the recursive calls.
+/// \returns the allocation size of the smallest scalar component, or 0 for a
+/// struct with no elements and for the type kinds listed at the end of the
+/// switch.
+unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
+ const GlobalValue *GV, const TargetMachine &TM) const {
+ // Assign the smallest element access size to the highest
+ // value which assembler can handle.
+ unsigned SmallestElement = 8;
+
+ if (!Ty)
+ return 0;
+ switch (Ty->getTypeID()) {
+ case Type::StructTyID: {
+ // Minimum over all members, capped at the initial value of 8. A member
+ // that itself yields 0 makes the whole struct yield 0.
+ const StructType *STy = cast<const StructType>(Ty);
+ for (auto &E : STy->elements()) {
+ unsigned AtomicSize = getSmallestAddressableSize(E, GV, TM);
+ if (AtomicSize < SmallestElement)
+ SmallestElement = AtomicSize;
+ }
+ return (STy->getNumElements() == 0) ? 0 : SmallestElement;
+ }
+ case Type::ArrayTyID: {
+ // Arrays and vectors are as addressable as their element type.
+ const ArrayType *ATy = cast<const ArrayType>(Ty);
+ return getSmallestAddressableSize(ATy->getElementType(), GV, TM);
+ }
+ case Type::VectorTyID: {
+ const VectorType *PTy = cast<const VectorType>(Ty);
+ return getSmallestAddressableSize(PTy->getElementType(), GV, TM);
+ }
+ case Type::PointerTyID:
+ case Type::HalfTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::IntegerTyID: {
+ // Scalars: report their allocation size. This is not capped at 8 here.
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ // It is unfortunate that DL's function take non-const Type*.
+ return DL.getTypeAllocSize(const_cast<Type*>(Ty));
+ }
+ case Type::FunctionTyID:
+ case Type::VoidTyID:
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ case Type::LabelTyID:
+ case Type::MetadataTyID:
+ case Type::X86_MMXTyID:
+ case Type::TokenTyID:
+ return 0;
+ }
+
+ return 0;
+}
+
+/// Pick the small-data section for a global that has already been classified
+/// as small data.
+///
+/// BSS objects go to ".sbss", commons to ".scommon" and data to ".sdata".
+/// Unless -mno-sort-sda is given, the section name gets a suffix for the
+/// smallest addressable element of the object's type (".1", ".2", ".4" or
+/// ".8"). For .sbss and .sdata, the object's name is appended as well when
+/// data sections are enabled. Any other kind falls back to the generic ELF
+/// selection.
+MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ const Type *GTy = GO->getType()->getElementType();
+ unsigned Size = getSmallestAddressableSize(GTy, GO, TM);
+
+ // If we have -ffunction-section or -fdata-section then we should emit the
+ // global value to a unique section specifically for it... even for sdata.
+ bool EmitUniquedSection = TM.getDataSections();
+
+ TRACE("Small data. Size(" << Size << ")");
+ // Handle Small Section classification here.
+ if (Kind.isBSS() || Kind.isBSSLocal()) {
+ // If -mno-sort-sda is not set, find out smallest accessible entity in
+ // declaration and add it to the section name string.
+ // Note. It does not track the actual usage of the value, only its de-
+ // claration. Also, compiler adds explicit pad fields to some struct
+ // declarations - they are currently counted towards smallest addres-
+ // sable entity.
+ if (NoSmallDataSorting) {
+ TRACE(" default sbss\n");
+ return SmallBSSSection;
+ }
+
+ StringRef Prefix(".sbss");
+ SmallString<128> Name(Prefix);
+ Name.append(getSectionSuffixForSize(Size));
+
+ if (EmitUniquedSection) {
+ Name.append(".");
+ Name.append(GO->getName());
+ }
+ TRACE(" unique sbss(" << Name << ")\n");
+ return getContext().getELFSection(Name.str(), ELF::SHT_NOBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_HEX_GPREL);
+ }
+
+ if (Kind.isCommon()) {
+ // This is purely for LTO+Linker Script because commons don't really have a
+ // section. However, the BitcodeSectionWriter pass will query for the
+ // sections of commons (and the linker expects us to know their section) so
+ // we'll return one here.
+ if (NoSmallDataSorting)
+ return BSSSection;
+
+ // Build the name in owned storage, as the .sbss/.sdata branches do. A Twine
+ // only refers to its operands and is not meant to be kept in a local.
+ StringRef Prefix(".scommon");
+ SmallString<128> Name(Prefix);
+ Name.append(getSectionSuffixForSize(Size));
+ TRACE(" small COMMON (" << Name << ")\n");
+
+ return getContext().getELFSection(Name.str(), ELF::SHT_NOBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC |
+ ELF::SHF_HEX_GPREL);
+ }
+
+ // We could have changed sdata object to a constant... in this
+ // case the Kind could be wrong for it.
+ if (Kind.isMergeableConst()) {
+ TRACE(" const_object_as_data ");
+ // Query the section through GO itself. The previous code went through an
+ // unchecked dyn_cast<GlobalVariable>, whose result would have been
+ // dereferenced even if it were null.
+ if (GO->hasSection() && isSmallDataSection(GO->getSection()))
+ Kind = SectionKind::getData();
+ }
+
+ if (Kind.isData()) {
+ if (NoSmallDataSorting) {
+ TRACE(" default sdata\n");
+ return SmallDataSection;
+ }
+
+ StringRef Prefix(".sdata");
+ SmallString<128> Name(Prefix);
+ Name.append(getSectionSuffixForSize(Size));
+
+ if (EmitUniquedSection) {
+ Name.append(".");
+ Name.append(GO->getName());
+ }
+ TRACE(" unique sdata(" << Name << ")\n");
+ return getContext().getELFSection(Name.str(), ELF::SHT_PROGBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_HEX_GPREL);
+ }
+
+ TRACE("default ELF section\n");
+ // Otherwise, we work the same as ELF.
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
new file mode 100644
index 000000000000..58dff2b95e19
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -0,0 +1,50 @@
+//===-- HexagonTargetObjectFile.h -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+
+ class HexagonTargetObjectFile : public TargetLoweringObjectFileELF {
+ public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+ // Section selection, overriding the generic ELF behaviour.
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
+
+ MCSection *getExplicitSectionGlobal(const GlobalObject *GO,
+ SectionKind Kind,
+ const TargetMachine &TM) const override;
+
+ bool isGlobalInSmallSection(const GlobalObject *GO,
+ const TargetMachine &TM) const;
+
+ bool isSmallDataEnabled() const;
+
+ unsigned getSmallDataSize() const;
+
+ private:
+ MCSectionELF *SmallDataSection; // default section for small data objects
+ MCSectionELF *SmallBSSSection; // NOTE(review): presumably the .sbss twin
+
+ unsigned getSmallestAddressableSize(const Type *Ty, const GlobalValue *GV,
+ const TargetMachine &TM) const;
+ // Picks a .sbss/.scommon/.sdata section for GO or defers to generic ELF.
+ MCSection *selectSmallSectionForGlobal(const GlobalObject *GO,
+ SectionKind Kind,
+ const TargetMachine &TM) const;
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
new file mode 100644
index 000000000000..e19c404450e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
@@ -0,0 +1,31 @@
+//===-- HexagonTargetStreamer.h - Hexagon Target Streamer ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONTARGETSTREAMER_H
+#define HEXAGONTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class HexagonTargetStreamer : public MCTargetStreamer {
+public: // All four hooks below have empty bodies, i.e. do nothing by default.
+ HexagonTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+ virtual void EmitCodeAlignment(unsigned ByteAlignment,
+ unsigned MaxBytesToEmit = 0) {}
+ virtual void emitFAlign(unsigned Size, unsigned MaxBytesToEmit) {}
+ virtual void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment,
+ unsigned AccessGranularity) {}
+ virtual void EmitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlign,
+ unsigned AccessGranularity) {}
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
new file mode 100644
index 000000000000..d578bfab3658
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -0,0 +1,71 @@
+//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// Hexagon target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetTransformInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagontti"
+
+TargetTransformInfo::PopcntSupportKind
+HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
+ // Report fast hardware support for every width (IntTyWidthInBit is not
+ // consulted): any input narrower than 64 bits is promoted to 64 bits.
+ return TargetTransformInfo::PSK_FastHardware;
+}
+
+// The Hexagon target can unroll loops with run-time trip counts.
+void HexagonTTIImpl::getUnrollingPreferences(Loop *L,
+ TTI::UnrollingPreferences &UP) {
+ UP.Runtime = UP.Partial = true; // same answer for every loop; L is unused
+}
+
+unsigned HexagonTTIImpl::getNumberOfRegisters(bool vector) const {
+ return vector ? 0 : 32; // no vector registers reported, 32 scalar ones
+}
+
+unsigned HexagonTTIImpl::getPrefetchDistance() const {
+ return getST()->getL1PrefetchDistance(); // forwarded from the subtarget
+}
+
+unsigned HexagonTTIImpl::getCacheLineSize() const {
+ return getST()->getL1CacheLineSize(); // forwarded from the subtarget
+}
+
+int HexagonTTIImpl::getUserCost(const User *U) {
+ // An integer extension that folds into the load feeding it is free; every
+ // other user is costed by the generic BaseT implementation.
+ auto isCastFoldedIntoLoad = [] (const CastInst *CI) -> bool {
+ if (!CI->isIntegerCast())
+ return false;
+ const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
+ // Loads with several identical extension users could be accepted too,
+ // but requiring a single use should be sufficient for most cases.
+ if (!LI || !LI->hasOneUse())
+ return false;
+ // isIntegerCast() also accepts vector zext/sext/trunc, for which
+ // getIntegerBitWidth() is not valid, so insist on scalar integers. Only
+ // extensions from an integer type shorter than 32-bit to i32 can fold.
+ if (!CI->getSrcTy()->isIntegerTy() || !CI->getDestTy()->isIntegerTy(32))
+ return false;
+ return CI->getSrcTy()->getIntegerBitWidth() < 32;
+ };
+ if (const CastInst *CI = dyn_cast<const CastInst>(U))
+ if (isCastFoldedIntoLoad(CI))
+ return TargetTransformInfo::TCC_Free;
+ return BaseT::getUserCost(U);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
new file mode 100644
index 000000000000..8414bfc4e197
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -0,0 +1,69 @@
+//===-- HexagonTargetTransformInfo.h - Hexagon specific TTI pass ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// This file declares a TargetTransformInfo analysis pass specific to the
+/// Hexagon target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
+ typedef BasicTTIImplBase<HexagonTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+ // ST must stay declared before TLI: the constructor derives TLI from ST.
+ const HexagonSubtarget *ST;
+ const HexagonTargetLowering *TLI;
+
+ const HexagonSubtarget *getST() const { return ST; }
+ const HexagonTargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ /// \name Scalar TTI Implementations
+ /// @{
+
+ TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
+
+ // The Hexagon target can unroll loops with run-time trip counts.
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+ // L1 cache prefetch distance and line size, as reported by the subtarget.
+ unsigned getPrefetchDistance() const;
+ unsigned getCacheLineSize() const;
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ unsigned getNumberOfRegisters(bool vector) const;
+
+ /// @}
+ // Cost of a user; an extension folded into its load is reported as free.
+ int getUserCost(const User *U);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
new file mode 100644
index 000000000000..7b1247d815a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -0,0 +1,1683 @@
+//===-- HexagonVLIWPacketizer.cpp - vliw packetizer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a simple VLIW packetizer using DFA. The packetizer works on
+// machine basic blocks. For each instruction I in BB, the packetizer consults
+// the DFA to see if machine resources are available to execute I. If so, the
+// packetizer checks if I depends on any instruction J in the current packet.
+// If no dependency is found, I is added to current packet and machine resource
+// is marked as taken. If any dependency is found, a target API call is made to
+// prune the dependence.
+//
+//===----------------------------------------------------------------------===//
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonVLIWPacketizer.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "packets"
+
+static cl::opt<bool> DisablePacketizer("disable-packetizer", cl::Hidden,
+ cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon packetizer pass"));
+
+static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles",
+ cl::ZeroOrMore, cl::Hidden, cl::init(true),
+ cl::desc("Allow non-solo packetization of volatile memory references"));
+// When set, runOnMachineFunction first calls genAllInsnTimingClasses().
+static cl::opt<bool> EnableGenAllInsnClass("enable-gen-insn", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Generate all instruction with TC"));
+
+static cl::opt<bool> DisableVecDblNVStores("disable-vecdbl-nv-stores",
+ cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Disable vector double new-value-stores"));
+// Only declared here; the option object is defined in another file.
+extern cl::opt<bool> ScheduleInlineAsm;
+
+namespace llvm {
+ FunctionPass *createHexagonPacketizer();
+ void initializeHexagonPacketizerPass(PassRegistry&); // used by the pass ctor
+}
+
+
+namespace { // file-local pass that drives HexagonPacketizerList
+ class HexagonPacketizer : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonPacketizer() : MachineFunctionPass(ID) {
+ initializeHexagonPacketizerPass(*PassRegistry::getPassRegistry());
+ }
+ // Declares the analyses the pass reads and the ones it keeps valid.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ StringRef getPassName() const override { return "Hexagon Packetizer"; }
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+ // Target hooks; null until runOnMachineFunction assigns them.
+ private:
+ const HexagonInstrInfo *HII = nullptr;
+ const HexagonRegisterInfo *HRI = nullptr;
+ };
+
+ char HexagonPacketizer::ID = 0;
+}
+
+INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) // same 4 as addRequired<>
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer",
+ false, false)
+
+HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF,
+ MachineLoopInfo &MLI, AliasAnalysis *AA,
+ const MachineBranchProbabilityInfo *MBPI)
+ : VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI) {
+ const auto &HST = MF.getSubtarget<HexagonSubtarget>(); // looked up once
+ HII = HST.getInstrInfo();
+ HRI = HST.getRegisterInfo();
+ addMutation(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+}
+
+// Returns true when at least one register defined by FirstI is read by
+// SecondI.
+static bool hasWriteToReadDep(const MachineInstr &FirstI,
+ const MachineInstr &SecondI,
+ const TargetRegisterInfo *TRI) {
+ for (const MachineOperand &Op : FirstI.operands()) {
+ // Only register definitions of FirstI are of interest.
+ const bool IsRegDef = Op.isReg() && Op.isDef();
+ if (IsRegDef && SecondI.readsRegister(Op.getReg(), TRI))
+ return true;
+ }
+ return false;
+}
+
+// Moves MI out of the bundle headed by BundleIt, to just before or after it.
+static MachineBasicBlock::iterator moveInstrOut(MachineInstr &MI,
+ MachineBasicBlock::iterator BundleIt, bool Before) {
+ MachineBasicBlock::instr_iterator InsertPt;
+ if (Before)
+ InsertPt = BundleIt.getInstrIterator();
+ else
+ InsertPt = std::next(BundleIt).getInstrIterator();
+
+ MachineBasicBlock &B = *MI.getParent();
+ // The instruction should at least be bundled with the preceding instruction
+ // (there will always be one, i.e. BUNDLE, if nothing else).
+ assert(MI.isBundledWithPred());
+ if (MI.isBundledWithSucc()) {
+ MI.clearFlag(MachineInstr::BundledSucc); // MI has neighbours on both
+ MI.clearFlag(MachineInstr::BundledPred); // sides: drop only its own flags
+ } else {
+ // If it's not bundled with the successor (i.e. it is the last one
+ // in the bundle), then we can simply unbundle it from the predecessor,
+ // which will take care of updating the predecessor's flag.
+ MI.unbundleFromPred();
+ }
+ B.splice(InsertPt, &B, MI.getIterator());
+
+ // Get the size of the bundle without asserting.
+ MachineBasicBlock::const_instr_iterator I = BundleIt.getInstrIterator();
+ MachineBasicBlock::const_instr_iterator E = B.instr_end();
+ unsigned Size = 0;
+ for (++I; I != E && I->isBundledWithPred(); ++I)
+ ++Size;
+
+ // If two or more instructions remain, the bundle is kept and its
+ // iterator is handed back unchanged.
+ if (Size > 1)
+ return BundleIt;
+
+ // Otherwise, extract the single instruction out and delete the bundle.
+ MachineBasicBlock::iterator NextIt = std::next(BundleIt);
+ MachineInstr &SingleI = *BundleIt->getNextNode();
+ SingleI.unbundleFromPred();
+ assert(!SingleI.isBundledWithSucc());
+ BundleIt->eraseFromParent();
+ return NextIt; // computed before the bundle header was erased
+}
+
+
+bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
+ if (DisablePacketizer || skipFunction(*MF.getFunction()))
+ return false;
+ // Cache the target hooks and fetch the analyses listed in getAnalysisUsage.
+ HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ auto &MLI = getAnalysis<MachineLoopInfo>();
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+ if (EnableGenAllInsnClass)
+ HII->genAllInsnTimingClasses(MF);
+
+ // Instantiate the packetizer.
+ HexagonPacketizerList Packetizer(MF, MLI, AA, MBPI);
+
+ // DFA state table should not be empty.
+ assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+ //
+ // Loop over all basic blocks and remove KILL pseudo-instructions.
+ // These instructions confuse the dependence analysis. Consider:
+ // D0 = ... (Insn 0)
+ // R0 = KILL R0, D0 (Insn 1)
+ // R0 = ... (Insn 2)
+ // Here, Insn 1 will result in the dependence graph not emitting an output
+ // dependence between Insn 0 and Insn 2. This can lead to incorrect
+ // packetization.
+ //
+ for (auto &MB : MF) {
+ auto End = MB.end();
+ auto MI = MB.begin();
+ while (MI != End) {
+ auto NextI = std::next(MI);
+ if (MI->isKill()) {
+ MB.erase(MI);
+ End = MB.end();
+ }
+ MI = NextI;
+ }
+ }
+
+ // Loop over all of the basic blocks.
+ for (auto &MB : MF) {
+ auto Begin = MB.begin(), End = MB.end();
+ while (Begin != End) {
+ // Find the first non-boundary starting from the end of the last
+ // scheduling region.
+ MachineBasicBlock::iterator RB = Begin;
+ while (RB != End && HII->isSchedulingBoundary(*RB, &MB, MF))
+ ++RB;
+ // Find the first boundary starting from the beginning of the new
+ // region.
+ MachineBasicBlock::iterator RE = RB;
+ while (RE != End && !HII->isSchedulingBoundary(*RE, &MB, MF))
+ ++RE;
+ // Add the scheduling boundary if it's not block end.
+ if (RE != End)
+ ++RE;
+ // If RB == End, then RE == End.
+ if (RB != End)
+ Packetizer.PacketizeMIs(&MB, RB, RE);
+
+ Begin = RE;
+ }
+ }
+
+ Packetizer.unpacketizeSoloInstrs(MF);
+ return true; // past the early exit, a change is always reported
+}
+
+
+// Reserve resources for a constant extender. A reservation that fails is
+// treated as unreachable (llvm_unreachable), not reported to the caller.
+void HexagonPacketizerList::reserveResourcesForConstExt() {
+ if (!tryAllocateResourcesForConstExt(true))
+ llvm_unreachable("Resources not available");
+}
+
+bool HexagonPacketizerList::canReserveResourcesForConstExt() {
+ return tryAllocateResourcesForConstExt(false); // query only, no reservation
+}
+
+// Probe the resource tracker with a temporary A4_ext (constant extender, i.e.
+// 4 bytes) and reserve it if Reserve is set. Returns true if it fits.
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(bool Reserve) {
+ auto *ExtMI = MF.CreateMachineInstr(HII->get(Hexagon::A4_ext), DebugLoc());
+ bool Avail = ResourceTracker->canReserveResources(*ExtMI);
+ if (Reserve && Avail)
+ ResourceTracker->reserveResources(*ExtMI);
+ MF.DeleteMachineInstr(ExtMI); // the probe instruction is never inserted
+ return Avail;
+}
+
+
+bool HexagonPacketizerList::isCallDependent(const MachineInstr &MI,
+ SDep::Kind DepType, unsigned DepReg) {
+ // Returns true when DepReg is LR, FP/SP of a dealloc-return, any predicate
+ // register, or the register of an indirect call (data deps only) or JumpR.
+ if (DepReg == HRI->getRARegister())
+ return true;
+ if (HII->isDeallocRet(MI))
+ if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister())
+ return true;
+
+ // Check if this is a predicate dependence.
+ const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg);
+ if (RC == &Hexagon::PredRegsRegClass)
+ return true;
+
+ // Assumes that the first operand of the CALLr is the function address.
+ if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) {
+ const MachineOperand &MO = MI.getOperand(0); // bind, do not copy
+ if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg))
+ return true;
+ }
+ // JumpR: check operand 1 when predicated, operand 0 otherwise.
+ if (HII->isJumpR(MI)) {
+ const MachineOperand &MO = HII->isPredicated(MI) ? MI.getOperand(1)
+ : MI.getOperand(0);
+ assert(MO.isReg() && MO.isUse());
+ if (MO.getReg() == DepReg)
+ return true;
+ }
+ return false;
+}
+
+static bool isRegDependence(const SDep::Kind DepType) {
+ return DepType == SDep::Data || DepType == SDep::Anti ||
+ DepType == SDep::Output; // data, anti or output dependence
+}
+
+static bool isDirectJump(const MachineInstr &MI) {
+ return MI.getOpcode() == Hexagon::J2_jump; // only this opcode qualifies
+}
+
+static bool isSchedBarrier(const MachineInstr &MI) {
+ // Y2_barrier is the only opcode recognised as a barrier here.
+ const unsigned Opc = MI.getOpcode();
+ if (Opc == Hexagon::Y2_barrier)
+ return true;
+ return false;
+}
+
+static bool isControlFlow(const MachineInstr &MI) {
+ return MI.getDesc().isTerminator() || MI.getDesc().isCall(); // either flag
+}
+
+
+/// Tells whether \p MI modifies any register in the callee-saved list that
+/// \p TRI reports for the function containing MI.
+static bool doesModifyCalleeSavedReg(const MachineInstr &MI,
+ const TargetRegisterInfo *TRI) {
+ auto *Reg = TRI->getCalleeSavedRegs(MI.getParent()->getParent());
+ while (Reg && *Reg && !MI.modifiesRegister(*Reg, TRI))
+ ++Reg; // the walk stops at a zero entry or at the first hit
+ return Reg && *Reg;
+}
+
+// Tells whether MI is a candidate for promotion to a .new predicate form or
+// to a new-value store; NewRC is the register class being considered.
+bool HexagonPacketizerList::isNewifiable(const MachineInstr &MI,
+ const TargetRegisterClass *NewRC) {
+ // A vector store may be predicated and may be a new-value store, but it
+ // cannot be predicated on a .new predicate value.
+ const bool IsPredClass = NewRC == &Hexagon::PredRegsRegClass;
+ if (IsPredClass && HII->isV60VectorInstruction(MI) && MI.mayStore())
+ return false;
+ return HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn() ||
+ HII->mayBeNewStore(MI);
+}
+
+// Rewrite MI in place to its .cur opcode.
+// The caller must already have checked canPromoteToDotCur; only data
+// dependences are expected. MII and RC are not used.
+bool HexagonPacketizerList::promoteToDotCur(MachineInstr &MI,
+ SDep::Kind DepType, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC) {
+ assert(DepType == SDep::Data);
+ // Only the instruction descriptor changes; success is always reported.
+ MI.setDesc(HII->get(HII->getDotCurOp(MI)));
+ return true;
+}
+
+void HexagonPacketizerList::cleanUpDotCur() {
+ MachineInstr *MI = nullptr; // most recent V6_vL32b_cur_ai seen so far
+ for (auto BI : CurrentPacketMIs) {
+ DEBUG(dbgs() << "Cleanup packet has "; BI->dump(););
+ if (BI->getOpcode() == Hexagon::V6_vL32b_cur_ai) {
+ MI = BI;
+ continue;
+ }
+ if (MI) {
+ for (auto &MO : BI->operands())
+ if (MO.isReg() && MO.getReg() == MI->getOperand(0).getReg())
+ return; // a later packet member mentions the .cur result: keep it
+ }
+ }
+ if (!MI)
+ return; // the packet holds no .cur load
+ // We did not find a use of the CUR, so de-cur it.
+ MI->setDesc(HII->get(Hexagon::V6_vL32b_ai));
+ DEBUG(dbgs() << "Demoted CUR "; MI->dump(););
+}
+
+// Check whether the load MI can become a .cur load feeding the instruction at MII.
+bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI,
+ const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass *RC) {
+ if (!HII->isV60VectorInstruction(MI))
+ return false;
+ if (!HII->isV60VectorInstruction(*MII))
+ return false;
+
+ // Already a dot cur instruction.
+ if (HII->isDotCurInst(MI) && !HII->mayBeCurLoad(MI))
+ return false;
+
+ if (!HII->mayBeCurLoad(MI))
+ return false;
+
+ // The "cur value" cannot come from inline asm.
+ if (PacketSU->getInstr()->isInlineAsm())
+ return false;
+
+ // Make sure the candidate consumer (*MII) mentions MI's result register.
+ DEBUG(dbgs() << "Can we DOT Cur Vector MI\n";
+ MI.dump();
+ dbgs() << "in packet\n";);
+ MachineInstr &MJ = *MII;
+ DEBUG({
+ dbgs() << "Checking CUR against ";
+ MJ.dump();
+ });
+ unsigned DestReg = MI.getOperand(0).getReg();
+ bool FoundMatch = false;
+ for (auto &MO : MJ.operands())
+ if (MO.isReg() && MO.getReg() == DestReg)
+ FoundMatch = true;
+ if (!FoundMatch)
+ return false;
+
+ // Check for existing uses of a vector register within the packet which
+ // would be affected by converting a vector load into .cur format.
+ for (auto BI : CurrentPacketMIs) {
+ DEBUG(dbgs() << "packet has "; BI->dump(););
+ if (BI->readsRegister(DepReg, MF.getSubtarget().getRegisterInfo()))
+ return false;
+ }
+
+ DEBUG(dbgs() << "Can Dot CUR MI\n"; MI.dump(););
+ // We can convert the opcode into a .cur.
+ return true;
+}
+
+// Rewrite MI in place to its .new opcode. The caller is expected to have
+// established through canPromoteToDotNew that the promotion is legal, so no
+// further checks are made here and the result is always true.
+bool HexagonPacketizerList::promoteToDotNew(MachineInstr &MI,
+ SDep::Kind DepType, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC) {
+ assert (DepType == SDep::Data);
+ // The predicate register class uses the .new predicate opcode lookup (MBPI
+ // is forwarded to it); every other class uses the plain .new lookup.
+ const bool IsPred = RC == &Hexagon::PredRegsRegClass;
+ const int NewOpcode =
+ IsPred ? HII->getDotNewPredOp(MI, MBPI) : HII->getDotNewOp(MI);
+ MI.setDesc(HII->get(NewOpcode));
+ return true;
+}
+
+bool HexagonPacketizerList::demoteToDotOld(MachineInstr &MI) {
+ // Switch MI to the opcode getDotOldOp reports for it; always returns true.
+ MI.setDesc(HII->get(HII->getDotOldOp(MI.getOpcode())));
+ return true;
+}
+
+bool HexagonPacketizerList::useCallersSP(MachineInstr &MI) {
+ // Only the four S2_storer*_io store opcodes are expected here; anything
+ // else is treated as unreachable.
+ const unsigned Opc = MI.getOpcode();
+ const bool IsStoreIO =
+ Opc == Hexagon::S2_storerd_io || Opc == Hexagon::S2_storeri_io ||
+ Opc == Hexagon::S2_storerh_io || Opc == Hexagon::S2_storerb_io;
+ if (!IsStoreIO)
+ llvm_unreachable("Unexpected instruction");
+ // Lower the immediate in operand 1 by the stack size plus
+ // HEXAGON_LRFP_SIZE, but commit the change only when the result is still
+ // a valid offset for this opcode.
+ unsigned FrameSize = MF.getFrameInfo().getStackSize();
+ MachineOperand &Off = MI.getOperand(1);
+ int64_t NewOff = Off.getImm() - (FrameSize + HEXAGON_LRFP_SIZE);
+ if (!HII->isValidOffset(Opc, NewOff))
+ return false; // MI is left untouched
+ Off.setImm(NewOff);
+ return true;
+}
+
+void HexagonPacketizerList::useCalleesSP(MachineInstr &MI) {
+ // Only the four S2_storer*_io store opcodes are expected here.
+ const unsigned Opc = MI.getOpcode();
+ const bool IsStoreIO =
+ Opc == Hexagon::S2_storerd_io || Opc == Hexagon::S2_storeri_io ||
+ Opc == Hexagon::S2_storerh_io || Opc == Hexagon::S2_storerb_io;
+ if (!IsStoreIO)
+ llvm_unreachable("Unexpected instruction");
+ // Raise the immediate in operand 1 by the stack size plus
+ // HEXAGON_LRFP_SIZE (the opposite adjustment to useCallersSP); no
+ // validity check is made on the result.
+ unsigned FrameSize = MF.getFrameInfo().getStackSize();
+ MachineOperand &Off = MI.getOperand(1);
+ Off.setImm(Off.getImm() + FrameSize + HEXAGON_LRFP_SIZE);
+}
+
+enum PredicateKind {
+ PK_False, // predicated, but not predicated-true
+ PK_True, // predicated-true
+ PK_Unknown // not predicated at all
+};
+
+/// Classifies MI's predicate: PK_Unknown when MI is not predicated, PK_True
+/// when HII reports it as predicated-true, PK_False otherwise.
+static PredicateKind getPredicateSense(const MachineInstr &MI,
+ const HexagonInstrInfo *HII) {
+ // Unpredicated instructions have no sense at all.
+ if (!HII->isPredicated(MI))
+ return PK_Unknown;
+ // Predicated: report whether the true or the negated form is used.
+ return HII->isPredicatedTrue(MI) ? PK_True : PK_False;
+}
+
+static const MachineOperand &getPostIncrementOperand(const MachineInstr &MI,
+ const HexagonInstrInfo *HII) {
+ assert(HII->isPostIncrement(MI) && "Not a post increment operation.");
+#ifndef NDEBUG
+ // Assert-enabled builds: the post-increment register shows up both as a def
+ // and as a use, so collect the defs and return the first matching use.
+ // Caution: DenseSet starts with at least 64 buckets for at most 5 operands.
+ DenseSet<unsigned> DefRegsSet;
+ for (auto &MO : MI.operands())
+ if (MO.isReg() && MO.isDef())
+ DefRegsSet.insert(MO.getReg());
+
+ for (auto &MO : MI.operands())
+ if (MO.isReg() && MO.isUse() && DefRegsSet.count(MO.getReg()))
+ return MO;
+#else // NDEBUG builds rely on fixed operand positions instead
+ if (MI.mayLoad()) {
+ const MachineOperand &Op1 = MI.getOperand(1);
+ // The 2nd operand is always the post increment operand in load.
+ assert(Op1.isReg() && "Post increment operand has be to a register.");
+ return Op1;
+ }
+ if (MI.getDesc().mayStore()) {
+ const MachineOperand &Op0 = MI.getOperand(0);
+ // The 1st operand is always the post increment operand in store.
+ assert(Op0.isReg() && "Post increment operand has be to a register.");
+ return Op0;
+ }
+#endif
+ // We should never get here.
+ llvm_unreachable("mayLoad or mayStore not set for Post Increment operation");
+}
+
+// Returns the operand holding the value that MI stores.
+static const MachineOperand& getStoreValueOperand(const MachineInstr &MI) {
+ // The value being stored is always the last operand.
+ return MI.getOperand(MI.getNumOperands()-1);
+}
+
+static bool isLoadAbsSet(const MachineInstr &MI) {
+ // True exactly for the six L4_load*_ap opcodes listed below.
+ switch (MI.getOpcode()) {
+ case Hexagon::L4_loadrd_ap:
+ case Hexagon::L4_loadrb_ap:
+ case Hexagon::L4_loadrh_ap:
+ case Hexagon::L4_loadrub_ap:
+ case Hexagon::L4_loadruh_ap:
+ case Hexagon::L4_loadri_ap:
+ return true;
+ default: return false;
+ }
+}
+
+static const MachineOperand &getAbsSetOperand(const MachineInstr &MI) {
+ assert(isLoadAbsSet(MI));
+ return MI.getOperand(1); // operand 1 of a load accepted by isLoadAbsSet
+}
+
+
+// Can be new value store?
+// The following restrictions must be respected when converting a store into
+// a new value store.
+// 1. If an instruction uses auto-increment, its address register cannot
+// be a new-value register. Arch Spec 5.4.2.1
+// 2. If an instruction uses absolute-set addressing mode, its address
+// register cannot be a new-value register. Arch Spec 5.4.2.1.
+// 3. If an instruction produces a 64-bit result, its registers cannot be used
+// as new-value registers. Arch Spec 5.4.2.2.
+// 4. If the instruction that sets the new-value register is conditional, then
+// the instruction that uses the new-value register must also be conditional,
+// and both must always have their predicates evaluate identically.
+// Arch Spec 5.4.2.3.
+// 5. There is an implied restriction that a packet cannot have another store,
+// if there is a new value store in the packet. Corollary: if there is
+// already a store in a packet, there can not be a new value store.
+// Arch Spec: 3.4.4.2
+bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI,
+ const MachineInstr &PacketMI, unsigned DepReg) {
+ // Make sure we are looking at a store that can be promoted.
+ if (!HII->mayBeNewStore(MI))
+ return false;
+
+ // Make sure the stored value, when it is a register, is DepReg.
+ const MachineOperand &Val = getStoreValueOperand(MI);
+ if (Val.isReg() && Val.getReg() != DepReg)
+ return false;
+
+ const MCInstrDesc& MCID = PacketMI.getDesc();
+
+ // First operand is always the result.
+ const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI, MF);
+ // Double regs can not feed into new value store: PRM section: 5.4.2.2.
+ if (PacketRC == &Hexagon::DoubleRegsRegClass)
+ return false;
+
+ // New-value stores are of class NV (slot 0), dual stores require class ST
+ // in slot 0 (PRM 5.5).
+ for (auto I : CurrentPacketMIs) {
+ SUnit *PacketSU = MIToSUnit.find(I)->second;
+ if (PacketSU->getInstr()->mayStore())
+ return false;
+ }
+
+ // Make sure it's NOT the post increment register that we are going to
+ // new value.
+ if (HII->isPostIncrement(MI) &&
+ getPostIncrementOperand(MI, HII).getReg() == DepReg) {
+ return false;
+ }
+
+ if (HII->isPostIncrement(PacketMI) && PacketMI.mayLoad() &&
+ getPostIncrementOperand(PacketMI, HII).getReg() == DepReg) {
+ // If source is post_inc, or absolute-set addressing, it can not feed
+ // into new value store
+ // r3 = memw(r2++#4)
+ // memw(r30 + #-1404) = r2.new -> can not be new value store
+ // arch spec section: 5.4.2.1.
+ return false;
+ }
+
+ if (isLoadAbsSet(PacketMI) && getAbsSetOperand(PacketMI).getReg() == DepReg)
+ return false;
+
+ // If the source that feeds the store is predicated, new value store must
+ // also be predicated.
+ if (HII->isPredicated(PacketMI)) {
+ if (!HII->isPredicated(MI))
+ return false;
+
+ // Check to make sure that they both will have their predicates
+ // evaluate identically.
+ unsigned predRegNumSrc = 0;
+ unsigned predRegNumDst = 0;
+ const TargetRegisterClass* predRegClass = nullptr;
+
+ // Get predicate register used in the source instruction.
+ for (auto &MO : PacketMI.operands()) {
+ if (!MO.isReg())
+ continue;
+ predRegNumSrc = MO.getReg();
+ predRegClass = HRI->getMinimalPhysRegClass(predRegNumSrc);
+ if (predRegClass == &Hexagon::PredRegsRegClass)
+ break;
+ }
+ assert((predRegClass == &Hexagon::PredRegsRegClass) &&
+ "predicate register not found in a predicated PacketMI instruction");
+
+ // Get predicate register used in new-value store instruction.
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ predRegNumDst = MO.getReg();
+ predRegClass = HRI->getMinimalPhysRegClass(predRegNumDst);
+ if (predRegClass == &Hexagon::PredRegsRegClass)
+ break;
+ }
+ assert((predRegClass == &Hexagon::PredRegsRegClass) &&
+ "predicate register not found in a predicated MI instruction");
+
+ // New-value register producer and user (store) need to satisfy these
+ // constraints:
+ // 1) Both instructions should be predicated on the same register.
+ // 2) If producer of the new-value register is .new predicated then store
+ // should also be .new predicated and if producer is not .new predicated
+ // then store should not be .new predicated.
+ // 3) Both new-value register producer and user should have same predicate
+ // sense, i.e, either both should be negated or both should be non-negated.
+ if (predRegNumDst != predRegNumSrc ||
+ HII->isDotNewInst(PacketMI) != HII->isDotNewInst(MI) ||
+ getPredicateSense(MI, HII) != getPredicateSense(PacketMI, HII))
+ return false;
+ }
+
+ // Make sure that other than the new-value register no other store instruction
+ // register has been modified in the same packet. Predicate registers can be
+ // modified, but they should not be modified between the producer and the
+ // store instruction as it will make them both conditional on different values.
+ // We already know this to be true for all the instructions before and
+ // including PacketMI. However, we need to perform the check for the
+ // remaining instructions in the packet.
+
+ unsigned StartCheck = 0;
+
+ for (auto I : CurrentPacketMIs) {
+ SUnit *TempSU = MIToSUnit.find(I)->second;
+ MachineInstr &TempMI = *TempSU->getInstr();
+
+ // Following condition is true for all the instructions until PacketMI is
+ // reached (StartCheck is set to 0 before the for loop).
+ // StartCheck flag is 1 for all the instructions after PacketMI.
+ if (&TempMI != &PacketMI && !StartCheck) // Start processing only after
+ continue; // encountering PacketMI.
+
+ StartCheck = 1;
+ if (&TempMI == &PacketMI) // We don't want to check PacketMI for dependence.
+ continue;
+
+ for (auto &MO : MI.operands())
+ if (MO.isReg() && TempSU->getInstr()->modifiesRegister(MO.getReg(), HRI))
+ return false;
+ }
+
+ // Make sure that for non-POST_INC stores:
+ // 1. The only use of reg is DepReg and no other registers.
+ // This handles V4 base+index registers.
+ // The following store can not be dot new.
+ // Eg. r0 = add(r0, #3)
+ // memw(r1+r0<<#2) = r0
+ if (!HII->isPostIncrement(MI)) {
+ for (unsigned opNum = 0; opNum < MI.getNumOperands()-1; opNum++) {
+ const MachineOperand &MO = MI.getOperand(opNum);
+ if (MO.isReg() && MO.getReg() == DepReg)
+ return false;
+ }
+ }
+
+ // If data definition is because of implicit definition of the register,
+ // do not newify the store. Eg.
+ // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
+ // S2_storerh_io %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
+ for (auto &MO : PacketMI.operands()) {
+ if (!MO.isReg() || !MO.isDef() || !MO.isImplicit())
+ continue;
+ unsigned R = MO.getReg();
+ if (R == DepReg || HRI->isSuperRegister(DepReg, R))
+ return false;
+ }
+
+ // Handle imp-use of super reg case. There is a target independent side
+ // change that should prevent this situation, but it is handled here
+ // just in case. For example, we cannot newify R2 in the following case:
+ // %R3<def> = A2_tfrsi 0;
+ // S2_storeri_io %R0<kill>, 0, %R2<kill>, %D1<imp-use,kill>;
+ for (auto &MO : MI.operands()) {
+ if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == DepReg)
+ return false;
+ }
+
+ // Can be dot new store.
+ return true;
+}
+
+// Can this MI be promoted to either a new-value store or a new-value jump?
+bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr &MI,
+ const SUnit *PacketSU, unsigned DepReg,
+ MachineBasicBlock::iterator &MII) {
+ if (!HII->mayBeNewStore(MI))
+ return false;
+
+ // Check whether the store can be turned into a new-value store.
+ MachineInstr &PacketMI = *PacketSU->getInstr();
+ if (canPromoteToNewValueStore(MI, PacketMI, DepReg))
+ return true;
+
+ // Checking whether a compare/jump can be turned into a new-value jump
+ // is done as a pass on its own. Don't need to check it here.
+ return false;
+}
+
+static bool isImplicitDependency(const MachineInstr &I, unsigned DepReg) {
+ for (auto &MO : I.operands()) // Look for an implicit def operand of DepReg.
+ if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit())
+ return true;
+ return false;
+}
+
+// Check to see if an instruction can be dot new.
+// There are three kinds.
+// 1. dot new on predicate - V2/V3/V4
+// 2. dot new on stores NV/ST - V4
+// 3. dot new on jump NV/J - V4 -- This is generated in a pass.
+bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
+ const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC) {
+ // Already a dot new instruction (and not a potential new-value store).
+ if (HII->isDotNewInst(MI) && !HII->mayBeNewStore(MI))
+ return false;
+
+ if (!isNewifiable(MI, RC))
+ return false;
+
+ const MachineInstr &PI = *PacketSU->getInstr();
+
+ // The "new value" cannot come from inline asm.
+ if (PI.isInlineAsm())
+ return false;
+
+ // IMPLICIT_DEFs won't materialize as real instructions, so .new makes no
+ // sense.
+ if (PI.isImplicitDef())
+ return false;
+
+ // If the dependency is through an implicitly defined register, we should
+ // not newify the use.
+ if (isImplicitDependency(PI, DepReg))
+ return false;
+
+ const MCInstrDesc& MCID = PI.getDesc();
+ const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI, MF);
+ if (DisableVecDblNVStores && VecRC == &Hexagon::VecDblRegsRegClass)
+ return false;
+
+ // Predicate .new: the answer comes entirely from predCanBeUsedAsDotNew.
+ if (RC == &Hexagon::PredRegsRegClass)
+ if (HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn())
+ return HII->predCanBeUsedAsDotNew(PI, DepReg);
+
+ if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI))
+ return false;
+
+ // Create a temporary dot new machine instruction to see if resources can
+ // be allocated for it. If not, bail out now.
+ int NewOpcode = HII->getDotNewOp(MI);
+ const MCInstrDesc &D = HII->get(NewOpcode);
+ MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc());
+ bool ResourcesAvailable = ResourceTracker->canReserveResources(*NewMI);
+ MF.DeleteMachineInstr(NewMI);
+ if (!ResourcesAvailable)
+ return false;
+
+ // New Value Store only. New Value Jump is generated in a separate pass.
+ if (!canPromoteToNewValue(MI, PacketSU, DepReg, MII))
+ return false;
+
+ return true;
+}
+
+// Go through the packet instructions and search for an anti dependency between
+// them and DepReg from MI. Consider this case:
+// Trying to add
+// a) %R1<def> = TFRI_cdNotPt %P3, 2
+// to this packet:
+// {
+// b) %P0<def> = C2_or %P3<kill>, %P0<kill>
+// c) %P3<def> = C2_tfrrp %R23
+// d) %R1<def> = C2_cmovenewit %P3, 4
+// }
+// The P3 from a) and d) will be complements after
+// a)'s P3 is converted to .new form.
+// Anti-dep between c) and b) is irrelevant for this case.
+bool HexagonPacketizerList::restrictingDepExistInPacket(MachineInstr &MI,
+ unsigned DepReg) {
+ SUnit *PacketSUDep = MIToSUnit.find(&MI)->second;
+
+ for (auto I : CurrentPacketMIs) {
+ // We only care about dependencies from predicated instructions.
+ if (!HII->isPredicated(*I))
+ continue;
+
+ // Scheduling Unit for current insn in the packet.
+ SUnit *PacketSU = MIToSUnit.find(I)->second;
+
+ // Look at dependencies between current members of the packet and
+ // predicate defining instruction MI. Make sure that dependency is
+ // on the exact register we care about.
+ if (PacketSU->isSucc(PacketSUDep)) {
+ for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+ auto &Dep = PacketSU->Succs[i];
+ if (Dep.getSUnit() == PacketSUDep && Dep.getKind() == SDep::Anti &&
+ Dep.getReg() == DepReg)
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+
+/// Gets the predicate register of a predicated instruction.
+static unsigned getPredicatedRegister(MachineInstr &MI,
+ const HexagonInstrInfo *QII) {
+ /// We use the following rule: the first operand that is a use of a register
+ /// in PredRegsRegClass is the predicate register of the instruction.
+ assert(QII->isPredicated(MI) && "Must be predicated instruction");
+
+ for (auto &Op : MI.operands()) {
+ if (Op.isReg() && Op.getReg() && Op.isUse() &&
+ Hexagon::PredRegsRegClass.contains(Op.getReg()))
+ return Op.getReg();
+ }
+
+ llvm_unreachable("Unknown instruction operand layout");
+ return 0;
+}
+
+// Given two predicated instructions, this function detects whether
+// the predicates are complements.
+bool HexagonPacketizerList::arePredicatesComplements(MachineInstr &MI1,
+ MachineInstr &MI2) {
+ // If we don't know the predicate sense of the instructions bail out early, we
+ // need it later.
+ if (getPredicateSense(MI1, HII) == PK_Unknown ||
+ getPredicateSense(MI2, HII) == PK_Unknown)
+ return false;
+
+ // Scheduling unit for candidate.
+ SUnit *SU = MIToSUnit[&MI1];
+
+ // One corner case deals with the following scenario:
+ // Trying to add
+ // a) %R24<def> = A2_tfrt %P0, %R25
+ // to this packet:
+ // {
+ // b) %R25<def> = A2_tfrf %P0, %R24
+ // c) %P0<def> = C2_cmpeqi %R26, 1
+ // }
+ //
+ // On general check a) and b) are complements, but presence of c) will
+ // convert a) to .new form, and then it is not a complement.
+ // We attempt to detect it by analyzing existing dependencies in the packet.
+
+ // Analyze relationships between all existing members of the packet.
+ // Look for an Anti dependency on the same predicate reg as used in the
+ // candidate.
+ for (auto I : CurrentPacketMIs) {
+ // Scheduling Unit for current insn in the packet.
+ SUnit *PacketSU = MIToSUnit.find(I)->second;
+
+ // If this instruction in the packet is succeeded by the candidate...
+ if (PacketSU->isSucc(SU)) {
+ for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+ auto Dep = PacketSU->Succs[i];
+ // The corner case exists when there is a true data dependency between
+ // the candidate and one of the current packet members, this dep is on
+ // a predicate reg, and there already exists an anti dep on the same
+ // pred in the packet.
+ if (Dep.getSUnit() == SU && Dep.getKind() == SDep::Data &&
+ Hexagon::PredRegsRegClass.contains(Dep.getReg())) {
+ // Here I know that I is predicate setting instruction with true
+ // data dep to candidate on the register we care about - c) in the
+ // above example. Now I need to see if there is an anti dependency
+ // from c) to any other instruction in the same packet on the pred
+ // reg of interest.
+ if (restrictingDepExistInPacket(*I, Dep.getReg()))
+ return false;
+ }
+ }
+ }
+ }
+
+ // If the above case does not apply, check regular complement condition.
+ // Check that the predicate register is the same and that the predicate
+ // sense is different. We also need to differentiate .old vs. .new: !p0
+ // is not complementary to p0.new.
+ unsigned PReg1 = getPredicatedRegister(MI1, HII);
+ unsigned PReg2 = getPredicatedRegister(MI2, HII);
+ return PReg1 == PReg2 &&
+ Hexagon::PredRegsRegClass.contains(PReg1) &&
+ Hexagon::PredRegsRegClass.contains(PReg2) &&
+ getPredicateSense(MI1, HII) != getPredicateSense(MI2, HII) &&
+ HII->isDotNewInst(MI1) == HII->isDotNewInst(MI2);
+}
+
+// Initialize packetizer flags: every state flag below is reset to false.
+void HexagonPacketizerList::initPacketizerState() {
+ Dependence = false;
+ PromotedToDotNew = false;
+ GlueToNewValueJump = false;
+ GlueAllocframeStore = false;
+ FoundSequentialDependence = false;
+}
+
+// Ignore bundling of pseudo instructions. Returns true if MI is to be ignored.
+bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr &MI,
+ const MachineBasicBlock *) {
+ if (MI.isDebugValue())
+ return true;
+
+ if (MI.isCFIInstruction())
+ return false;
+
+ // We must print out inline assembly.
+ if (MI.isInlineAsm())
+ return false;
+
+ if (MI.isImplicitDef())
+ return false;
+
+ // We check if the first itinerary stage of MI's scheduling class has any
+ // functional units mapped to it. If it doesn't, we ignore the instruction.
+ const MCInstrDesc& TID = MI.getDesc();
+ auto *IS = ResourceTracker->getInstrItins()->beginStage(TID.getSchedClass());
+ unsigned FuncUnits = IS->getUnits();
+ return !FuncUnits;
+}
+
+bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) {
+ if (MI.isEHLabel() || MI.isCFIInstruction())
+ return true;
+
+ // Inline asm is treated as solo only when ScheduleInlineAsm is off.
+ // Otherwise inline asm will be put in a packet temporarily, but then it will
+ // be removed, and placed outside of the packet (before or after, depending
+ // on dependencies). This is to reduce the impact of inline asm as a
+ // "packet splitting" instruction.
+ if (MI.isInlineAsm() && !ScheduleInlineAsm)
+ return true;
+
+ // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints:
+ // trap, pause, barrier, icinva, isync, and syncht are solo instructions.
+ // They must not be grouped with other instructions in a packet.
+ if (isSchedBarrier(MI))
+ return true;
+
+ if (HII->isSolo(MI))
+ return true;
+
+ if (MI.getOpcode() == Hexagon::A2_nop)
+ return true;
+
+ return false;
+}
+
+
+// Quick check if instructions MI and MJ cannot coexist in the same packet.
+// Limit the tests to be "one-way", e.g. "if MI->isBranch and MJ->isInlineAsm",
+// but not the symmetric case: "if MJ->isBranch and MI->isInlineAsm".
+// For full test call this function twice:
+// cannotCoexistAsymm(MI, MJ) || cannotCoexistAsymm(MJ, MI)
+// Doing the test only one way reduces the amount of code in this function,
+// since every test would need to be repeated with the MI and MJ reversed.
+static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
+ const HexagonInstrInfo &HII) {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ if (MF->getSubtarget<HexagonSubtarget>().hasV60TOpsOnly() &&
+ HII.isHVXMemWithAIndirect(MI, MJ))
+ return true;
+
+ // An inline asm cannot be together with a branch, because we may not be
+ // able to move the asm out after packetizing (i.e. if the asm must be
+ // moved past the bundle). Similarly, two asms cannot be together to avoid
+ // complications when determining their relative order outside of a bundle.
+ if (MI.isInlineAsm())
+ return MJ.isInlineAsm() || MJ.isBranch() || MJ.isBarrier() ||
+ MJ.isCall() || MJ.isTerminator();
+
+ switch (MI.getOpcode()) {
+ case (Hexagon::S2_storew_locked):
+ case (Hexagon::S4_stored_locked):
+ case (Hexagon::L2_loadw_locked):
+ case (Hexagon::L4_loadd_locked):
+ case (Hexagon::Y4_l2fetch): {
+ // These instructions can only be grouped with ALU32 or non-floating-point
+ // XTYPE instructions. Since there is no convenient way of identifying fp
+ // XTYPE instructions, only allow grouping with ALU32 for now.
+ unsigned TJ = HII.getType(MJ);
+ if (TJ != HexagonII::TypeALU32)
+ return true;
+ break;
+ }
+ default:
+ break;
+ }
+
+ // "False" really means that the quick check failed to determine if
+ // MI and MJ cannot coexist.
+ return false;
+}
+
+
+// Full, symmetric check: runs cannotCoexistAsymm in both directions.
+bool HexagonPacketizerList::cannotCoexist(const MachineInstr &MI,
+ const MachineInstr &MJ) {
+ return cannotCoexistAsymm(MI, MJ, *HII) || cannotCoexistAsymm(MJ, MI, *HII);
+}
+
+void HexagonPacketizerList::unpacketizeSoloInstrs(MachineFunction &MF) {
+ for (auto &B : MF) {
+ MachineBasicBlock::iterator BundleIt;
+ MachineBasicBlock::instr_iterator NextI;
+ for (auto I = B.instr_begin(), E = B.instr_end(); I != E; I = NextI) {
+ NextI = std::next(I); // Taken up front: MI may be moved out below.
+ MachineInstr &MI = *I;
+ if (MI.isBundle())
+ BundleIt = I; // Remember the most recent bundle header seen.
+ if (!MI.isInsideBundle())
+ continue;
+
+ // Decide on where to insert the instruction that we are pulling out.
+ // Debug instructions always go before the bundle, but the placement of
+ // INLINE_ASM depends on potential dependencies. By default, try to
+ // put it before the bundle, but if the asm writes to a register that
+ // other instructions in the bundle read, then we need to place it
+ // after the bundle (to preserve the bundle semantics).
+ bool InsertBeforeBundle;
+ if (MI.isInlineAsm())
+ InsertBeforeBundle = !hasWriteToReadDep(MI, *BundleIt, HRI);
+ else if (MI.isDebugValue())
+ InsertBeforeBundle = true;
+ else
+ continue; // Any other bundled instruction stays where it is.
+
+ BundleIt = moveInstrOut(MI, BundleIt, InsertBeforeBundle);
+ }
+ }
+}
+
+// Check if a given instruction is of class "system". Only the Y2_barrier and
+static bool isSystemInstr(const MachineInstr &MI) { // Y2_dcfetchbo opcodes.
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ case Hexagon::Y2_barrier:
+ case Hexagon::Y2_dcfetchbo:
+ return true;
+ }
+ return false;
+}
+
+bool HexagonPacketizerList::hasDeadDependence(const MachineInstr &I,
+ const MachineInstr &J) {
+ // The dependence graph may not include edges between dead definitions,
+ // so without extra checks, we could end up packetizing two instructions
+ // defining the same (dead) register.
+ if (I.isCall() || J.isCall())
+ return false;
+ if (HII->isPredicated(I) || HII->isPredicated(J))
+ return false;
+
+ BitVector DeadDefs(Hexagon::NUM_TARGET_REGS);
+ for (auto &MO : I.operands()) { // Collect the registers I defines as dead.
+ if (!MO.isReg() || !MO.isDef() || !MO.isDead())
+ continue;
+ DeadDefs[MO.getReg()] = true;
+ }
+
+ for (auto &MO : J.operands()) { // Look for a dead def in J that I also has.
+ if (!MO.isReg() || !MO.isDef() || !MO.isDead())
+ continue;
+ unsigned R = MO.getReg();
+ if (R != Hexagon::USR_OVF && DeadDefs[R]) // USR_OVF is exempt.
+ return true;
+ }
+ return false;
+}
+
+bool HexagonPacketizerList::hasControlDependence(const MachineInstr &I,
+ const MachineInstr &J) {
+ // A save callee-save register function call can only be in a packet
+ // with instructions that don't write to the callee-save registers.
+ if ((HII->isSaveCalleeSavedRegsCall(I) &&
+ doesModifyCalleeSavedReg(J, HRI)) ||
+ (HII->isSaveCalleeSavedRegsCall(J) &&
+ doesModifyCalleeSavedReg(I, HRI)))
+ return true;
+
+ // Two control flow instructions cannot go in the same packet.
+ if (isControlFlow(I) && isControlFlow(J))
+ return true;
+
+ // \ref-manual (7.3.4) A loop setup packet in loopN or spNloop0 cannot
+ // contain a speculative indirect jump,
+ // a new-value compare jump or a dealloc_return. Calls are rejected too.
+ auto isBadForLoopN = [this] (const MachineInstr &MI) -> bool {
+ if (MI.isCall() || HII->isDeallocRet(MI) || HII->isNewValueJump(MI))
+ return true;
+ if (HII->isPredicated(MI) && HII->isPredicatedNew(MI) && HII->isJumpR(MI))
+ return true;
+ return false;
+ };
+
+ if (HII->isLoopN(I) && isBadForLoopN(J))
+ return true;
+ if (HII->isLoopN(J) && isBadForLoopN(I))
+ return true;
+
+ // dealloc_return cannot appear in the same packet as a conditional or
+ // unconditional jump. Only I is tested for being the dealloc_return here.
+ return HII->isDeallocRet(I) &&
+ (J.isBranch() || J.isCall() || J.isBarrier());
+}
+
+bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr &I,
+ const MachineInstr &J) {
+ bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J);
+ bool StoreI = I.mayStore(), StoreJ = J.mayStore();
+ if ((SysI && StoreJ) || (SysJ && StoreI)) // System instr next to a store.
+ return true;
+
+ if (StoreI && StoreJ) {
+ if (HII->isNewValueInst(J) || HII->isMemOp(J) || HII->isMemOp(I))
+ return true;
+ } else {
+ // A memop cannot be in the same packet with another memop or a store.
+ // Two stores can be together, but here I and J cannot both be stores.
+ bool MopStI = HII->isMemOp(I) || StoreI;
+ bool MopStJ = HII->isMemOp(J) || StoreJ;
+ if (MopStI && MopStJ)
+ return true;
+ }
+
+ return (StoreJ && HII->isDeallocRet(I)) || (StoreI && HII->isDeallocRet(J));
+}
+
+// SUI is the current instruction that is outside of the current packet.
+// SUJ is the current instruction inside the current packet against which that
+// SUI will be packetized.
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+ assert(SUI->getInstr() && SUJ->getInstr());
+ MachineInstr &I = *SUI->getInstr();
+ MachineInstr &J = *SUJ->getInstr();
+
+ // Clear IgnoreDepMIs when a packet starts.
+ if (CurrentPacketMIs.size() == 1)
+ IgnoreDepMIs.clear();
+
+ MachineBasicBlock::iterator II = I.getIterator();
+
+ // Solo instructions cannot go in the packet.
+ assert(!isSoloInstruction(I) && "Unexpected solo instr!");
+
+ if (cannotCoexist(I, J))
+ return false;
+
+ Dependence = hasDeadDependence(I, J) || hasControlDependence(I, J);
+ if (Dependence)
+ return false;
+
+ // V4 allows dual stores. It does not allow second store, if the first
+ // store is not in SLOT0. New value store, new value jump, dealloc_return
+ // and memop always take SLOT0. Arch spec 3.4.4.2.
+ Dependence = hasV4SpecificDependence(I, J);
+ if (Dependence)
+ return false;
+
+ // If an instruction feeds new value jump, glue it.
+ MachineBasicBlock::iterator NextMII = I.getIterator();
+ ++NextMII;
+ if (NextMII != I.getParent()->end() && HII->isNewValueJump(*NextMII)) {
+ MachineInstr &NextMI = *NextMII;
+
+ bool secondRegMatch = false;
+ const MachineOperand &NOp0 = NextMI.getOperand(0);
+ const MachineOperand &NOp1 = NextMI.getOperand(1);
+
+ if (NOp1.isReg() && I.getOperand(0).getReg() == NOp1.getReg())
+ secondRegMatch = true;
+
+ for (auto T : CurrentPacketMIs) {
+ SUnit *PacketSU = MIToSUnit.find(T)->second;
+ MachineInstr &PI = *PacketSU->getInstr();
+ // NVJ can not be part of the dual jump - Arch Spec: section 7.8.
+ if (PI.isCall()) {
+ Dependence = true;
+ break;
+ }
+ // Validate:
+ // 1. Packet does not have a store in it.
+ // 2. If the first operand of the nvj is newified, and the second
+ // operand is also a reg, it (second reg) is not defined in
+ // the same packet.
+ // 3. If the second operand of the nvj is newified, (which means
+ // first operand is also a reg), first reg is not defined in
+ // the same packet.
+ if (PI.getOpcode() == Hexagon::S2_allocframe || PI.mayStore() ||
+ HII->isLoopN(PI)) {
+ Dependence = true;
+ break;
+ }
+ // Check #2/#3.
+ const MachineOperand &OpR = secondRegMatch ? NOp0 : NOp1;
+ if (OpR.isReg() && PI.modifiesRegister(OpR.getReg(), HRI)) {
+ Dependence = true;
+ break;
+ }
+ }
+
+ if (Dependence)
+ return false;
+ GlueToNewValueJump = true;
+ }
+
+ // There is no dependency between a prolog instruction and its successor.
+ if (!SUJ->isSucc(SUI))
+ return true;
+
+ for (unsigned i = 0; i < SUJ->Succs.size(); ++i) {
+ if (FoundSequentialDependence)
+ break;
+
+ if (SUJ->Succs[i].getSUnit() != SUI)
+ continue;
+
+ SDep::Kind DepType = SUJ->Succs[i].getKind();
+ // For direct calls:
+ // Ignore register dependences for call instructions for packetization
+ // purposes except for those due to r31 and predicate registers.
+ //
+ // For indirect calls:
+ // Same as direct calls + check for true dependences to the register
+ // used in the indirect call.
+ //
+ // We completely ignore Order dependences for call instructions.
+ //
+ // For returns:
+ // Ignore register dependences for return instructions like jumpr,
+ // dealloc return unless we have dependencies on the explicit uses
+ // of the registers used by jumpr (like r31) or dealloc return
+ // (like r29 or r30).
+ unsigned DepReg = 0;
+ const TargetRegisterClass *RC = nullptr;
+ if (DepType == SDep::Data) {
+ DepReg = SUJ->Succs[i].getReg();
+ RC = HRI->getMinimalPhysRegClass(DepReg);
+ }
+
+ if (I.isCall() || HII->isJumpR(I) || I.isReturn() || HII->isTailCall(I)) {
+ if (!isRegDependence(DepType))
+ continue;
+ if (!isCallDependent(I, DepType, SUJ->Succs[i].getReg()))
+ continue;
+ }
+
+ if (DepType == SDep::Data) {
+ if (canPromoteToDotCur(J, SUJ, DepReg, II, RC))
+ if (promoteToDotCur(J, DepType, II, RC))
+ continue;
+ }
+
+ // Data dependence is OK if we have load.cur.
+ if (DepType == SDep::Data && HII->isDotCurInst(J)) {
+ if (HII->isV60VectorInstruction(I))
+ continue;
+ }
+
+ // For instructions that can be promoted to dot-new, try to promote.
+ if (DepType == SDep::Data) {
+ if (canPromoteToDotNew(I, SUJ, DepReg, II, RC)) {
+ if (promoteToDotNew(I, DepType, II, RC)) {
+ PromotedToDotNew = true;
+ continue;
+ }
+ }
+ if (HII->isNewValueJump(I))
+ continue;
+ }
+
+ // For predicated instructions, if the predicates are complements then
+ // there can be no dependence.
+ if (HII->isPredicated(I) && HII->isPredicated(J) &&
+ arePredicatesComplements(I, J)) {
+ // Not always safe to do this translation.
+ // DAG Builder attempts to reduce dependence edges using transitive
+ // nature of dependencies. Here is an example:
+ //
+ // r0 = tfr_pt ... (1)
+ // r0 = tfr_pf ... (2)
+ // r0 = tfr_pt ... (3)
+ //
+ // There will be an output dependence between (1)->(2) and (2)->(3).
+ // However, there is no dependence edge between (1)->(3). This results
+ // in all 3 instructions going in the same packet. We ignore the
+ // dependence only once to avoid this situation.
+ auto Itr = find(IgnoreDepMIs, &J);
+ if (Itr != IgnoreDepMIs.end()) {
+ Dependence = true;
+ return false;
+ }
+ IgnoreDepMIs.push_back(&I);
+ continue;
+ }
+
+ // Ignore Order dependences between unconditional direct branches
+ // and non-control-flow instructions.
+ if (isDirectJump(I) && !J.isBranch() && !J.isCall() &&
+ DepType == SDep::Order)
+ continue;
+
+ // Ignore all dependences for jumps except for true and output
+ // dependences.
+ if (I.isConditionalBranch() && DepType != SDep::Data &&
+ DepType != SDep::Output)
+ continue;
+
+ // Ignore output dependences due to superregs. We can write to two
+ // different subregisters of R1:0 for instance in the same cycle.
+
+ // If neither I nor J defines DepReg, then this is a superfluous output
+ // dependence. The dependence must be of the form:
+ // R0 = ...
+ // R1 = ...
+ // and there is an output dependence between the two instructions with
+ // DepReg = D0.
+ // We want to ignore these dependences. Ideally, the dependence
+ // constructor should annotate such dependences. We can then avoid this
+ // relatively expensive check.
+ //
+ if (DepType == SDep::Output) {
+ // DepReg is the register that's responsible for the dependence.
+ unsigned DepReg = SUJ->Succs[i].getReg();
+
+ // Check if I and J really define DepReg.
+ if (!I.definesRegister(DepReg) && !J.definesRegister(DepReg))
+ continue;
+ FoundSequentialDependence = true;
+ break;
+ }
+
+ // For Order dependences:
+ // 1. On V4 or later, volatile loads/stores can be packetized together,
+ // unless other rules prevent it.
+ // 2. Store followed by a load is not allowed.
+ // 3. Store followed by a store is only valid on V4 or later.
+ // 4. Load followed by any memory operation is allowed.
+ if (DepType == SDep::Order) {
+ if (!PacketizeVolatiles) {
+ bool OrdRefs = I.hasOrderedMemoryRef() || J.hasOrderedMemoryRef();
+ if (OrdRefs) {
+ FoundSequentialDependence = true;
+ break;
+ }
+ }
+ // J is first, I is second.
+ bool LoadJ = J.mayLoad(), StoreJ = J.mayStore();
+ bool LoadI = I.mayLoad(), StoreI = I.mayStore();
+ if (StoreJ) {
+ // Two stores are only allowed on V4+. Load following store is never
+ // allowed.
+ if (LoadI) {
+ FoundSequentialDependence = true;
+ break;
+ }
+ } else if (!LoadJ || (!LoadI && !StoreI)) {
+ // If J is neither load nor store, assume a dependency.
+ // If J is a load, but I is neither, also assume a dependency.
+ FoundSequentialDependence = true;
+ break;
+ }
+ // Store followed by store: not OK on V2.
+ // Store followed by load: not OK on all.
+ // Load followed by store: OK on all.
+ // Load followed by load: OK on all.
+ continue;
+ }
+
+ // For V4, special case ALLOCFRAME. Even though there is dependency
+ // between ALLOCFRAME and subsequent store, allow it to be packetized
+ // in the same packet. This implies that the store is using the caller's
+ // SP. Hence, offset needs to be updated accordingly.
+ if (DepType == SDep::Data && J.getOpcode() == Hexagon::S2_allocframe) {
+ unsigned Opc = I.getOpcode();
+ switch (Opc) {
+ case Hexagon::S2_storerd_io:
+ case Hexagon::S2_storeri_io:
+ case Hexagon::S2_storerh_io:
+ case Hexagon::S2_storerb_io:
+ if (I.getOperand(0).getReg() == HRI->getStackRegister()) {
+ // Since this store is to be glued with allocframe in the same
+ // packet, it will use SP of the previous stack frame, i.e.
+ // caller's SP. Therefore, we need to recalculate offset
+ // according to this change.
+ GlueAllocframeStore = useCallersSP(I);
+ if (GlueAllocframeStore)
+ continue;
+ }
+ default:
+ break;
+ }
+ }
+
+ // There are certain anti-dependencies that cannot be ignored.
+ // Specifically:
+ // J2_call ... %R0<imp-def> ; SUJ
+ // R0 = ... ; SUI
+ // Those cannot be packetized together, since the call will observe
+ // the effect of the assignment to R0.
+ if (DepType == SDep::Anti && J.isCall()) {
+ // Check if I defines any volatile register. We should also check
+ // registers that the call may read, but these happen to be a
+ // subset of the volatile register set.
+ for (const MCPhysReg *P = J.getDesc().ImplicitDefs; P && *P; ++P) {
+ if (!I.modifiesRegister(*P, HRI))
+ continue;
+ FoundSequentialDependence = true;
+ break;
+ }
+ }
+
+ // Skip over remaining anti-dependences. Two instructions that are
+ // anti-dependent can share a packet, since in most such cases all
+ // operands are read before any modifications take place.
+ // The exceptions are branch and call instructions, since they are
+ // executed after all other instructions have completed (at least
+ // conceptually).
+ if (DepType != SDep::Anti) {
+ FoundSequentialDependence = true;
+ break;
+ }
+ }
+
+ if (FoundSequentialDependence) {
+ Dependence = true;
+ return false;
+ }
+
+ return true;
+}
+
+bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
+ assert(SUI->getInstr() && SUJ->getInstr());
+ MachineInstr &I = *SUI->getInstr();
+ MachineInstr &J = *SUJ->getInstr();
+
+ if (cannotCoexist(I, J))
+ return false;
+
+ if (!Dependence) // No dependence was recorded, so pruning is allowed.
+ return true;
+
+ // Check if the instruction was promoted to a dot-new. If so, demote it
+ // back into a dot-old.
+ if (PromotedToDotNew)
+ demoteToDotOld(I);
+
+ cleanUpDotCur();
+ // Check if the instruction (must be a store) was glued with an allocframe
+ // instruction. If so, restore its offset to its original value, i.e. use
+ // current SP instead of caller's SP.
+ if (GlueAllocframeStore) {
+ useCalleesSP(I);
+ GlueAllocframeStore = false;
+ }
+ return false;
+}
+
+MachineBasicBlock::iterator
+HexagonPacketizerList::addToPacket(MachineInstr &MI) {
+ MachineBasicBlock::iterator MII = MI.getIterator();
+ MachineBasicBlock *MBB = MI.getParent();
+ if (MI.isImplicitDef()) { // Not added to CurrentPacketMIs; no resources used.
+ unsigned R = MI.getOperand(0).getReg();
+ if (Hexagon::IntRegsRegClass.contains(R)) {
+ MCSuperRegIterator S(R, HRI, false);
+ MI.addOperand(MachineOperand::CreateReg(*S, true, true));
+ }
+ return MII;
+ }
+ assert(ResourceTracker->canReserveResources(MI));
+
+ bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI);
+ bool Good = true;
+
+ if (GlueToNewValueJump) {
+ MachineInstr &NvjMI = *++MII;
+ // We need to put both instructions in the same packet: MI and NvjMI.
+ // Either of them can require a constant extender. Try to add both to
+ // the current packet, and if that fails, end the packet and start a
+ // new one.
+ ResourceTracker->reserveResources(MI);
+ if (ExtMI)
+ Good = tryAllocateResourcesForConstExt(true);
+
+ bool ExtNvjMI = HII->isExtended(NvjMI) || HII->isConstExtended(NvjMI);
+ if (Good) {
+ if (ResourceTracker->canReserveResources(NvjMI))
+ ResourceTracker->reserveResources(NvjMI);
+ else
+ Good = false;
+ }
+ if (Good && ExtNvjMI)
+ Good = tryAllocateResourcesForConstExt(true);
+
+ if (!Good) { // End the current packet and reserve everything again.
+ endPacket(MBB, MI);
+ assert(ResourceTracker->canReserveResources(MI));
+ ResourceTracker->reserveResources(MI);
+ if (ExtMI) {
+ assert(canReserveResourcesForConstExt());
+ tryAllocateResourcesForConstExt(true);
+ }
+ assert(ResourceTracker->canReserveResources(NvjMI));
+ ResourceTracker->reserveResources(NvjMI);
+ if (ExtNvjMI) {
+ assert(canReserveResourcesForConstExt());
+ reserveResourcesForConstExt();
+ }
+ }
+ CurrentPacketMIs.push_back(&MI);
+ CurrentPacketMIs.push_back(&NvjMI);
+ return MII; // MII was advanced above and now refers to NvjMI.
+ }
+
+ ResourceTracker->reserveResources(MI);
+ if (ExtMI && !tryAllocateResourcesForConstExt(true)) {
+ endPacket(MBB, MI); // No room for the extender: start a new packet.
+ if (PromotedToDotNew)
+ demoteToDotOld(MI);
+ if (GlueAllocframeStore) {
+ useCalleesSP(MI);
+ GlueAllocframeStore = false;
+ }
+ ResourceTracker->reserveResources(MI);
+ reserveResourcesForConstExt();
+ }
+
+ CurrentPacketMIs.push_back(&MI);
+ return MII;
+}
+
+void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MI) {
+ OldPacketMIs = CurrentPacketMIs; // Keep a copy; producesStall reads it.
+ VLIWPacketizerList::endPacket(MBB, MI);
+}
+
+bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) {
+ return !producesStall(MI); // Add MI only if producesStall reports no stall.
+}
+
+
+// Return true when ConsMI uses the register in operand 0 of ProdMI.
+static bool isDependent(const MachineInstr &ProdMI,
+ const MachineInstr &ConsMI) {
+ if (!ProdMI.getOperand(0).isReg())
+ return false;
+ unsigned DstReg = ProdMI.getOperand(0).getReg();
+
+ for (auto &Op : ConsMI.operands())
+ if (Op.isReg() && Op.isUse() && Op.getReg() == DstReg)
+ // The MIs depend on each other.
+ return true;
+
+ return false;
+}
+
+// V60 forward scheduling: would I stall on the previous packet (OldPacketMIs)?
+bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
+ // Check whether the previous packet is in a different loop. If this is the
+ // case, there is little point in trying to avoid a stall because that would
+ // favor the rare case (loop entry) over the common case (loop iteration).
+ //
+ // TODO: We should really be able to check all the incoming edges if this is
+ // the first packet in a basic block, so we can avoid stalls from the loop
+ // backedge.
+ if (!OldPacketMIs.empty()) {
+ auto *OldBB = OldPacketMIs.front()->getParent();
+ auto *ThisBB = I.getParent();
+ if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB))
+ return false;
+ }
+
+ // Check for stall between two vector instructions.
+ if (HII->isV60VectorInstruction(I)) {
+ for (auto J : OldPacketMIs) {
+ if (!HII->isV60VectorInstruction(*J))
+ continue;
+ if (isDependent(*J, I) && !HII->isVecUsableNextPacket(*J, I))
+ return true;
+ }
+ return false;
+ }
+
+ // Check for stall between two scalar instructions. First, check that
+ // there is no definition of a use in the current packet, because it
+ // may be a candidate for .new.
+ for (auto J : CurrentPacketMIs)
+ if (!HII->isV60VectorInstruction(*J) && isDependent(*J, I))
+ return false;
+
+ // Check for stall between I and instructions in the previous packet.
+ if (MF.getSubtarget<HexagonSubtarget>().useBSBScheduling()) {
+ for (auto J : OldPacketMIs) {
+ if (HII->isV60VectorInstruction(*J))
+ continue;
+ if (!HII->isLateInstrFeedsEarlyInstr(*J, I))
+ continue;
+ if (isDependent(*J, I) && !HII->canExecuteInBundle(*J, I))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Function for the Hexagon packetizer pass
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonPacketizer() {
+ return new HexagonPacketizer();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
new file mode 100644
index 000000000000..b28b926ec300
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -0,0 +1,117 @@
+#ifndef HEXAGONVLIWPACKETIZER_H
+#define HEXAGONVLIWPACKETIZER_H
+
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+namespace llvm {
+// Hexagon specialization of VLIWPacketizerList. It overrides the packetizer
+// hooks declared below and keeps per-packet bookkeeping flags plus handles to
+// the analyses and target info it consults.
+class HexagonPacketizerList : public VLIWPacketizerList {
+ // Vector of instructions assigned to the packet that has just been created.
+ std::vector<MachineInstr*> OldPacketMIs;
+
+ // Has the instruction been promoted to a dot-new instruction.
+ bool PromotedToDotNew;
+
+ // Has the instruction been glued to allocframe.
+ bool GlueAllocframeStore;
+
+ // Has the feeder instruction been glued to new value jump.
+ bool GlueToNewValueJump;
+
+ // Check if there is a dependence between some instruction already in this
+ // packet and this instruction.
+ bool Dependence;
+
+ // Only check for dependence if there are resources available to
+ // schedule this instruction.
+ bool FoundSequentialDependence;
+
+ // Track MIs with ignored dependence.
+ std::vector<MachineInstr*> IgnoreDepMIs;
+
+protected:
+ /// \brief A handle to the branch probability pass.
+ const MachineBranchProbabilityInfo *MBPI;
+ // Loop information; a MachineLoopInfo reference is taken by the constructor.
+ const MachineLoopInfo *MLI;
+
+private:
+ // Hexagon instruction and register info handles.
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
+
+public:
+ // Ctor.
+ HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+ AliasAnalysis *AA,
+ const MachineBranchProbabilityInfo *MBPI);
+
+ // initPacketizerState - initialize some internal flags.
+ void initPacketizerState() override;
+
+ // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+ bool ignorePseudoInstruction(const MachineInstr &MI,
+ const MachineBasicBlock *MBB) override;
+
+ // isSoloInstruction - return true if instruction MI can not be packetized
+ // with any other instruction, which means that MI itself is a packet.
+ bool isSoloInstruction(const MachineInstr &MI) override;
+
+ // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+ // together.
+ bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override;
+
+ // isLegalToPruneDependencies - Is it legal to prune dependence between SUI
+ // and SUJ.
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override;
+
+ // Remaining VLIWPacketizerList hooks overridden for Hexagon.
+ MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override;
+ void endPacket(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MI) override;
+ bool shouldAddToPacket(const MachineInstr &MI) override;
+
+ void unpacketizeSoloInstrs(MachineFunction &MF);
+
+protected:
+ // Helpers for .cur promotion.
+ bool isCallDependent(const MachineInstr &MI, SDep::Kind DepType,
+ unsigned DepReg);
+ bool promoteToDotCur(MachineInstr &MI, SDep::Kind DepType,
+ MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass *RC);
+ bool canPromoteToDotCur(const MachineInstr &MI, const SUnit *PacketSU,
+ unsigned DepReg, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass *RC);
+ void cleanUpDotCur();
+
+ // Helpers for .new / new-value promotion and demotion.
+ bool promoteToDotNew(MachineInstr &MI, SDep::Kind DepType,
+ MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass *RC);
+ bool canPromoteToDotNew(const MachineInstr &MI, const SUnit *PacketSU,
+ unsigned DepReg, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass *RC);
+ bool canPromoteToNewValue(const MachineInstr &MI, const SUnit *PacketSU,
+ unsigned DepReg, MachineBasicBlock::iterator &MII);
+ bool canPromoteToNewValueStore(const MachineInstr &MI,
+ const MachineInstr &PacketMI, unsigned DepReg);
+ bool demoteToDotOld(MachineInstr &MI);
+ bool useCallersSP(MachineInstr &MI);
+ void useCalleesSP(MachineInstr &MI);
+ bool arePredicatesComplements(MachineInstr &MI1, MachineInstr &MI2);
+ bool restrictingDepExistInPacket(MachineInstr&, unsigned);
+ bool isNewifiable(const MachineInstr &MI, const TargetRegisterClass *NewRC);
+ bool isCurifiable(MachineInstr &MI);
+ bool cannotCoexist(const MachineInstr &MI, const MachineInstr &MJ);
+ // Accessor for the PromotedToDotNew flag.
+ inline bool isPromotedToDotNew() const {
+ return PromotedToDotNew;
+ }
+ // Resource reservation for constant extenders.
+ bool tryAllocateResourcesForConstExt(bool Reserve);
+ bool canReserveResourcesForConstExt();
+ void reserveResourcesForConstExt();
+ // Dependence and stall queries between two instructions.
+ bool hasDeadDependence(const MachineInstr &I, const MachineInstr &J);
+ bool hasControlDependence(const MachineInstr &I, const MachineInstr &J);
+ bool hasV4SpecificDependence(const MachineInstr &I, const MachineInstr &J);
+ bool producesStall(const MachineInstr &MI);
+};
+} // namespace llvm
+#endif // HEXAGONVLIWPACKETIZER_H
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
new file mode 100644
index 000000000000..085d4645df06
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
@@ -0,0 +1,209 @@
+//===-- HexagonVectorPrint.cpp - Generate vector printing instructions -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass adds the capability to generate pseudo vector/predicate register
+// printing instructions. These pseudo instructions should be used with the
+// simulator, NEVER on hardware.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-vector-print"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+// Hidden command-line flag, off by default. When set, getInstrVecReg()
+// answers false for instructions whose operand 0 defines a vector register.
+static cl::opt<bool> TraceHexVectorStoresOnly("trace-hex-vector-stores-only",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enables tracing of vector stores"));
+
+// Forward declarations of this pass's factory and registration entry points.
+namespace llvm {
+
+ // Defined at the bottom of this file.
+ FunctionPass *createHexagonVectorPrint();
+ // Called from the HexagonVectorPrint constructor.
+ void initializeHexagonVectorPrintPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+/// Machine-function pass that generates the pseudo vector/predicate register
+/// printing instructions described in the file header.
+class HexagonVectorPrint : public MachineFunctionPass {
+  // Target handles; assigned at the top of runOnMachineFunction().
+  const HexagonSubtarget *QST = nullptr;
+  const HexagonInstrInfo *QII = nullptr;
+  const HexagonRegisterInfo *QRI = nullptr;
+
+public:
+  static char ID;
+
+  /// Registers the pass with the global PassRegistry on construction.
+  HexagonVectorPrint() : MachineFunctionPass(ID) {
+    initializeHexagonVectorPrintPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "Hexagon VectorPrint pass"; }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+
+char HexagonVectorPrint::ID = 0;
+
+} // end anonymous namespace
+
+/// Returns true when \p Reg falls in one of the register ranges V0..V31,
+/// W0..W15 or Q0..Q3.
+static bool isVecReg(unsigned Reg) {
+  if (Reg >= Hexagon::V0 && Reg <= Hexagon::V31)
+    return true;
+  if (Reg >= Hexagon::W0 && Reg <= Hexagon::W15)
+    return true;
+  return Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3;
+}
+
+/// Maps a register to the two lowercase hex digits appended to the marker
+/// word built by addAsmInstr(): V0..V31 give "20".."3f", Q0..Q3 give
+/// "00".."03". Any other register is a programming error.
+static std::string getStringReg(unsigned R) {
+  unsigned Code;
+  if (R >= Hexagon::V0 && R <= Hexagon::V31)
+    Code = 0x20 + (R - Hexagon::V0);
+  else if (R >= Hexagon::Q0 && R <= Hexagon::Q3)
+    Code = R - Hexagon::Q0;
+  else
+    llvm_unreachable("valid vreg");
+
+  static const char HexDigits[] = "0123456789abcdef";
+  std::string Text;
+  Text += HexDigits[(Code >> 4) & 0xf];
+  Text += HexDigits[Code & 0xf];
+  return Text;
+}
+
+/// Inserts, before \p I in \p MBB, an INLINEASM instruction whose text is
+/// ".long 0x1dffe0" followed by the two hex digits for \p Reg. The
+/// instruction is flagged as having side effects.
+static void addAsmInstr(MachineBasicBlock *MBB, unsigned Reg,
+                        MachineBasicBlock::instr_iterator I,
+                        const DebugLoc &DL, const HexagonInstrInfo *QII,
+                        MachineFunction &Fn) {
+  std::string AsmText(".long 0x1dffe0");
+  AsmText += getStringReg(Reg);
+
+  // The function owns the storage of the symbol-name copy.
+  const char *AsmSym = Fn.createExternalSymbolName(AsmText);
+  const unsigned Flags = InlineAsm::Extra_HasSideEffects;
+
+  BuildMI(*MBB, I, DL, QII->get(TargetOpcode::INLINEASM))
+      .addExternalSymbol(AsmSym)
+      .addImm(Flags);
+}
+
+/// Decides whether \p MI should get a vector print and reports the register
+/// in \p Reg.
+///
+/// Operand 0 is checked first: if it is a register def in a tracked range,
+/// the answer is true unless -trace-hex-vector-stores-only is set. Otherwise,
+/// for instructions that may store, operands 2 and then 3 are checked.
+/// \p Reg may be overwritten even when the function returns false.
+static bool getInstrVecReg(const MachineInstr &MI, unsigned &Reg) {
+  const unsigned NumOps = MI.getNumOperands();
+  if (NumOps == 0)
+    return false;
+
+  // Vec load or compute.
+  const MachineOperand &Dst = MI.getOperand(0);
+  if (Dst.isReg() && Dst.isDef()) {
+    Reg = Dst.getReg();
+    if (isVecReg(Reg))
+      return !TraceHexVectorStoresOnly;
+  }
+
+  if (!MI.mayStore())
+    return false;
+
+  // Vec store (operand 2), then vec store post increment (operand 3).
+  for (unsigned Idx = 2; Idx <= 3 && Idx < NumOps; ++Idx) {
+    const MachineOperand &Src = MI.getOperand(Idx);
+    if (!Src.isReg())
+      continue;
+    Reg = Src.getReg();
+    if (isVecReg(Reg))
+      return true;
+  }
+  return false;
+}
+
+// Pass entry point. First collects every instruction (including those inside
+// bundles) for which getInstrVecReg() succeeds, then inserts an inline-asm
+// marker after each collected instruction, or after its bundle. Returns true
+// whenever at least one candidate was collected.
+bool HexagonVectorPrint::runOnMachineFunction(MachineFunction &Fn) {
+ bool Changed = false;
+ QST = &Fn.getSubtarget<HexagonSubtarget>();
+ QRI = QST->getRegisterInfo();
+ QII = QST->getInstrInfo();
+ // Phase 1: gather candidates without modifying the function.
+ std::vector<MachineInstr *> VecPrintList;
+ for (auto &MBB : Fn)
+ for (auto &MI : MBB) {
+ if (MI.isBundle()) {
+ // Walk the instructions that belong to this bundle.
+ MachineBasicBlock::instr_iterator MII = MI.getIterator();
+ for (++MII; MII != MBB.instr_end() && MII->isInsideBundle(); ++MII) {
+ if (MII->getNumOperands() < 1)
+ continue;
+ unsigned Reg = 0;
+ if (getInstrVecReg(*MII, Reg)) {
+ VecPrintList.push_back((&*MII));
+ DEBUG(dbgs() << "Found vector reg inside bundle \n"; MII->dump());
+ }
+ }
+ } else {
+ unsigned Reg = 0;
+ if (getInstrVecReg(MI, Reg)) {
+ VecPrintList.push_back(&MI);
+ DEBUG(dbgs() << "Found vector reg \n"; MI.dump());
+ }
+ }
+ }
+
+ // Note: Changed reflects that candidates exist, even if every one of them
+ // is later skipped by the end-of-block check below.
+ Changed = !VecPrintList.empty();
+ if (!Changed)
+ return Changed;
+
+ // Phase 2: insert the markers.
+ for (auto *I : VecPrintList) {
+ DebugLoc DL = I->getDebugLoc();
+ MachineBasicBlock *MBB = I->getParent();
+ DEBUG(dbgs() << "Evaluating V MI\n"; I->dump());
+ unsigned Reg = 0;
+ if (!getInstrVecReg(*I, Reg))
+ llvm_unreachable("Need a vector reg");
+ // Find the insertion point: past the whole bundle for bundled
+ // instructions, otherwise directly after the instruction.
+ MachineBasicBlock::instr_iterator MII = I->getIterator();
+ if (I->isInsideBundle()) {
+ DEBUG(dbgs() << "add to end of bundle\n"; I->dump());
+ while (MBB->instr_end() != MII && MII->isInsideBundle())
+ MII++;
+ } else {
+ DEBUG(dbgs() << "add after instruction\n"; I->dump());
+ MII++;
+ }
+ // Nothing is inserted when the insertion point is the end of the block.
+ if (MBB->instr_end() == MII)
+ continue;
+
+ if (Reg >= Hexagon::V0 && Reg <= Hexagon::V31) {
+ DEBUG(dbgs() << "adding dump for V" << Reg-Hexagon::V0 << '\n');
+ addAsmInstr(MBB, Reg, MII, DL, QII, Fn);
+ } else if (Reg >= Hexagon::W0 && Reg <= Hexagon::W15) {
+ // A W register is printed as two V registers: index 2n+1 is emitted
+ // first, then index 2n.
+ DEBUG(dbgs() << "adding dump for W" << Reg-Hexagon::W0 << '\n');
+ addAsmInstr(MBB, Hexagon::V0 + (Reg - Hexagon::W0) * 2 + 1,
+ MII, DL, QII, Fn);
+ addAsmInstr(MBB, Hexagon::V0 + (Reg - Hexagon::W0) * 2,
+ MII, DL, QII, Fn);
+ } else if (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3) {
+ DEBUG(dbgs() << "adding dump for Q" << Reg-Hexagon::Q0 << '\n');
+ addAsmInstr(MBB, Reg, MII, DL, QII, Fn);
+ } else
+ llvm_unreachable("Bad Vector reg");
+ }
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+INITIALIZE_PASS(HexagonVectorPrint, "hexagon-vector-print",
+ "Hexagon VectorPrint pass", false, false)
+
+/// Allocates a new HexagonVectorPrint pass and returns it through the generic
+/// FunctionPass interface. Nothing in this function frees the object.
+FunctionPass *llvm::createHexagonVectorPrint() {
+  FunctionPass *Printer = new HexagonVectorPrint();
+  return Printer;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
new file mode 100644
index 000000000000..c140bd1d7ee2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -0,0 +1,753 @@
+//===-- HexagonAsmBackend.cpp - Hexagon Assembler Backend -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonFixupKinds.h"
+#include "HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCCodeEmitter.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include <sstream>
+
+using namespace llvm;
+using namespace Hexagon;
+
+#define DEBUG_TYPE "hexagon-asm-backend"
+
+// -mno-fixup: when set, processFixupValue() marks the PC-relative branch
+// fixups it handles (all except B22_PCREL) as unresolved.
+static cl::opt<bool> DisableFixup
+ ("mno-fixup", cl::desc("Disable fixing up resolved relocations for Hexagon"));
+
+namespace {
+
+class HexagonAsmBackend : public MCAsmBackend {
+ uint8_t OSABI;
+ StringRef CPU;
+ mutable uint64_t relaxedCnt;
+ std::unique_ptr <MCInstrInfo> MCII;
+ std::unique_ptr <MCInst *> RelaxTarget;
+ MCInst * Extender;
+
+ /// Re-encodes the bundle \p HMB with \p E and installs the instruction, its
+ /// encoded bytes and its fixups into the relaxable fragment \p RF.
+ void ReplaceInstruction(MCCodeEmitter &E, MCRelaxableFragment &RF,
+                         MCInst &HMB) const {
+   // Encode into scratch buffers first.
+   SmallString<256> Encoded;
+   SmallVector<MCFixup, 4> NewFixups;
+   raw_svector_ostream EncodedOS(Encoded);
+   E.encodeInstruction(HMB, EncodedOS, NewFixups, RF.getSubtargetInfo());
+
+   // Update the fragment.
+   RF.setInst(HMB);
+   RF.getContents() = Encoded;
+   RF.getFixups() = NewFixups;
+ }
+public:
+ /// Builds the backend for target \p T.
+ ///
+ /// \param OSABI ELF OS/ABI byte forwarded to the object writer.
+ /// \param CPU   CPU name forwarded to the object writer. It is stored as a
+ ///              StringRef, so the caller's string must outlive this object
+ ///              (NOTE(review): confirm at the call site).
+ ///
+ /// Every member is initialised here, in declaration order: CPU is copied
+ /// from the parameter, relaxedCnt starts at zero, and the pointer slot
+ /// behind RelaxTarget starts out null.
+ HexagonAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) :
+     OSABI(OSABI), CPU(CPU), relaxedCnt(0), MCII(T.createMCInstrInfo()),
+     RelaxTarget(new MCInst *(nullptr)), Extender(nullptr) {}
+
+ /// Creates the Hexagon ELF object writer for \p OS, forwarding the OSABI
+ /// and CPU members of this backend.
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createHexagonELFObjectWriter(OS, OSABI, CPU);
+ }
+
+ /// Ensures an extender instruction is available: if none is pending, a new
+ /// MCInst is allocated from \p Context. The method is const, so the member
+ /// is updated through a const_cast.
+ void setExtender(MCContext &Context) const {
+   if (Extender != nullptr)
+     return;
+   auto *Self = const_cast<HexagonAsmBackend *>(this);
+   Self->Extender = new (Context) MCInst;
+ }
+
+ /// Returns the pending extender instruction and clears the member so the
+ /// next setExtender() allocates a new one. Asserts that one is pending.
+ MCInst *takeExtender() const {
+   assert(Extender != nullptr);
+   MCInst *&Slot = const_cast<HexagonAsmBackend *>(this)->Extender;
+   MCInst *Taken = Slot;
+   Slot = nullptr;
+   return Taken;
+ }
+
+ /// Number of Hexagon-specific fixup kinds; also the size of the table in
+ /// getFixupKindInfo().
+ unsigned getNumFixupKinds() const override {
+ return Hexagon::NumTargetFixupKinds;
+ }
+
+ /// Returns the descriptor (name, bit offset, bit width, flags) for \p Kind.
+ /// Generic kinds below FirstTargetFixupKind are delegated to MCAsmBackend;
+ /// target kinds index the static table below.
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+ const static MCFixupKindInfo Infos[Hexagon::NumTargetFixupKinds] = {
+ // This table *must* be in the same order as the fixup_* kinds in
+ // HexagonFixupKinds.h.
+ //
+ // NOTE(review): processFixupValue() also handles fixup_Hexagon_23_REG,
+ // which has no row here; confirm against HexagonFixupKinds.h whether an
+ // entry is missing (unlisted trailing entries are zero-initialized).
+ //
+ // name offset bits flags
+ { "fixup_Hexagon_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B15_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B7_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_32", 0, 32, 0 },
+ { "fixup_Hexagon_16", 0, 32, 0 },
+ { "fixup_Hexagon_8", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_0", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_1", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_2", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_3", 0, 32, 0 },
+ { "fixup_Hexagon_HL16", 0, 32, 0 },
+ { "fixup_Hexagon_B13_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B9_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B32_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_B22_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B15_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B13_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B9_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B7_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_12_X", 0, 32, 0 },
+ { "fixup_Hexagon_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_10_X", 0, 32, 0 },
+ { "fixup_Hexagon_9_X", 0, 32, 0 },
+ { "fixup_Hexagon_8_X", 0, 32, 0 },
+ { "fixup_Hexagon_7_X", 0, 32, 0 },
+ { "fixup_Hexagon_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_32_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_COPY", 0, 32, 0 },
+ { "fixup_Hexagon_GLOB_DAT", 0, 32, 0 },
+ { "fixup_Hexagon_JMP_SLOT", 0, 32, 0 },
+ { "fixup_Hexagon_RELATIVE", 0, 32, 0 },
+ { "fixup_Hexagon_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GOTREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPMOD_32", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GD_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_32", 0, 32, 0 },
+ { "fixup_Hexagon_IE_16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_16", 0, 32, 0 },
+ { "fixup_Hexagon_6_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GOTREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_11_X", 0, 32, 0 }
+ };
+
+ // Generic (target-independent) kinds are described by the base class.
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+ }
+
+ /// processFixupValue - Target hook to adjust the literal value of a fixup
+ /// if necessary. This override never modifies \p Value; it only decides
+ /// whether \p IsResolved is cleared:
+ ///  - kinds in the first case group always get IsResolved = false;
+ ///  - fixup_Hexagon_B22_PCREL is left untouched;
+ ///  - the other PC-relative branch kinds are cleared only under -mno-fixup;
+ ///  - FK_Data_1/2/4, FK_PCRel_4 and fixup_Hexagon_32 are left untouched;
+ ///  - any other kind hits llvm_unreachable.
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override {
+ MCFixupKind Kind = Fixup.getKind();
+
+ switch((unsigned)Kind) {
+ default:
+ llvm_unreachable("Unknown Fixup Kind!");
+
+ case fixup_Hexagon_LO16:
+ case fixup_Hexagon_HI16:
+ case fixup_Hexagon_16:
+ case fixup_Hexagon_8:
+ case fixup_Hexagon_GPREL16_0:
+ case fixup_Hexagon_GPREL16_1:
+ case fixup_Hexagon_GPREL16_2:
+ case fixup_Hexagon_GPREL16_3:
+ case fixup_Hexagon_HL16:
+ case fixup_Hexagon_32_6_X:
+ case fixup_Hexagon_16_X:
+ case fixup_Hexagon_12_X:
+ case fixup_Hexagon_11_X:
+ case fixup_Hexagon_10_X:
+ case fixup_Hexagon_9_X:
+ case fixup_Hexagon_8_X:
+ case fixup_Hexagon_7_X:
+ case fixup_Hexagon_6_X:
+ case fixup_Hexagon_COPY:
+ case fixup_Hexagon_GLOB_DAT:
+ case fixup_Hexagon_JMP_SLOT:
+ case fixup_Hexagon_RELATIVE:
+ case fixup_Hexagon_PLT_B22_PCREL:
+ case fixup_Hexagon_GOTREL_LO16:
+ case fixup_Hexagon_GOTREL_HI16:
+ case fixup_Hexagon_GOTREL_32:
+ case fixup_Hexagon_GOT_LO16:
+ case fixup_Hexagon_GOT_HI16:
+ case fixup_Hexagon_GOT_32:
+ case fixup_Hexagon_GOT_16:
+ case fixup_Hexagon_DTPMOD_32:
+ case fixup_Hexagon_DTPREL_LO16:
+ case fixup_Hexagon_DTPREL_HI16:
+ case fixup_Hexagon_DTPREL_32:
+ case fixup_Hexagon_DTPREL_16:
+ case fixup_Hexagon_GD_PLT_B22_PCREL:
+ case fixup_Hexagon_LD_PLT_B22_PCREL:
+ case fixup_Hexagon_GD_GOT_LO16:
+ case fixup_Hexagon_GD_GOT_HI16:
+ case fixup_Hexagon_GD_GOT_32:
+ case fixup_Hexagon_GD_GOT_16:
+ case fixup_Hexagon_LD_GOT_LO16:
+ case fixup_Hexagon_LD_GOT_HI16:
+ case fixup_Hexagon_LD_GOT_32:
+ case fixup_Hexagon_LD_GOT_16:
+ case fixup_Hexagon_IE_LO16:
+ case fixup_Hexagon_IE_HI16:
+ case fixup_Hexagon_IE_32:
+ case fixup_Hexagon_IE_16:
+ case fixup_Hexagon_IE_GOT_LO16:
+ case fixup_Hexagon_IE_GOT_HI16:
+ case fixup_Hexagon_IE_GOT_32:
+ case fixup_Hexagon_IE_GOT_16:
+ case fixup_Hexagon_TPREL_LO16:
+ case fixup_Hexagon_TPREL_HI16:
+ case fixup_Hexagon_TPREL_32:
+ case fixup_Hexagon_TPREL_16:
+ case fixup_Hexagon_GOTREL_32_6_X:
+ case fixup_Hexagon_GOTREL_16_X:
+ case fixup_Hexagon_GOTREL_11_X:
+ case fixup_Hexagon_GOT_32_6_X:
+ case fixup_Hexagon_GOT_16_X:
+ case fixup_Hexagon_GOT_11_X:
+ case fixup_Hexagon_DTPREL_32_6_X:
+ case fixup_Hexagon_DTPREL_16_X:
+ case fixup_Hexagon_DTPREL_11_X:
+ case fixup_Hexagon_GD_GOT_32_6_X:
+ case fixup_Hexagon_GD_GOT_16_X:
+ case fixup_Hexagon_GD_GOT_11_X:
+ case fixup_Hexagon_LD_GOT_32_6_X:
+ case fixup_Hexagon_LD_GOT_16_X:
+ case fixup_Hexagon_LD_GOT_11_X:
+ case fixup_Hexagon_IE_32_6_X:
+ case fixup_Hexagon_IE_16_X:
+ case fixup_Hexagon_IE_GOT_32_6_X:
+ case fixup_Hexagon_IE_GOT_16_X:
+ case fixup_Hexagon_IE_GOT_11_X:
+ case fixup_Hexagon_TPREL_32_6_X:
+ case fixup_Hexagon_TPREL_16_X:
+ case fixup_Hexagon_TPREL_11_X:
+ case fixup_Hexagon_32_PCREL:
+ case fixup_Hexagon_6_PCREL_X:
+ case fixup_Hexagon_23_REG:
+ // These relocations should always have a relocation recorded
+ IsResolved = false;
+ return;
+
+ // B22 is deliberately left as the caller resolved it; the assignment
+ // below is disabled.
+ case fixup_Hexagon_B22_PCREL:
+ //IsResolved = false;
+ break;
+
+ // Remaining PC-relative branch kinds: forced unresolved only when the
+ // -mno-fixup option (DisableFixup) is set.
+ case fixup_Hexagon_B13_PCREL:
+ case fixup_Hexagon_B13_PCREL_X:
+ case fixup_Hexagon_B32_PCREL_X:
+ case fixup_Hexagon_B22_PCREL_X:
+ case fixup_Hexagon_B15_PCREL:
+ case fixup_Hexagon_B15_PCREL_X:
+ case fixup_Hexagon_B9_PCREL:
+ case fixup_Hexagon_B9_PCREL_X:
+ case fixup_Hexagon_B7_PCREL:
+ case fixup_Hexagon_B7_PCREL_X:
+ if (DisableFixup)
+ IsResolved = false;
+ break;
+
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_PCRel_4:
+ case fixup_Hexagon_32:
+ // Leave these relocations alone as they are used for EH.
+ return;
+ }
+ }
+
+ /// getFixupKindNumBytes - How many bytes a fixup of \p Kind may rewrite:
+ /// 1 for FK_Data_1, 2 for FK_Data_2, 4 for the 32-bit data kinds and the
+ /// PC-relative branch kinds listed below, and 0 for everything else.
+ static unsigned getFixupKindNumBytes(unsigned Kind) {
+   unsigned NumBytes = 0;
+   switch (Kind) {
+   case FK_Data_1:
+     NumBytes = 1;
+     break;
+   case FK_Data_2:
+     NumBytes = 2;
+     break;
+   case FK_Data_4:  // this later gets mapped to R_HEX_32
+   case FK_PCRel_4: // this later gets mapped to R_HEX_32_PCREL
+   case fixup_Hexagon_32:
+   case fixup_Hexagon_B32_PCREL_X:
+   case fixup_Hexagon_B22_PCREL:
+   case fixup_Hexagon_B22_PCREL_X:
+   case fixup_Hexagon_B15_PCREL:
+   case fixup_Hexagon_B15_PCREL_X:
+   case fixup_Hexagon_B13_PCREL:
+   case fixup_Hexagon_B13_PCREL_X:
+   case fixup_Hexagon_B9_PCREL:
+   case fixup_Hexagon_B9_PCREL_X:
+   case fixup_Hexagon_B7_PCREL:
+   case fixup_Hexagon_B7_PCREL_X:
+     NumBytes = 4;
+     break;
+   default:
+     break;
+   }
+   return NumBytes;
+ }
+
+ // Converts a raw fixup value into the field value that gets encoded:
+ //  - plain PC-relative branches: drop the two low bits (Value >> 2);
+ //  - extended (_X) branches: keep only the low six bits;
+ //  - B32_PCREL_X (the extender word): drop the low six bits;
+ //  - every other kind: returned unchanged.
+ static uint64_t adjustFixupValue(MCFixupKind Kind, uint64_t Value) {
+   switch ((unsigned)Kind) {
+   case fixup_Hexagon_B7_PCREL:
+   case fixup_Hexagon_B9_PCREL:
+   case fixup_Hexagon_B13_PCREL:
+   case fixup_Hexagon_B15_PCREL:
+   case fixup_Hexagon_B22_PCREL:
+     return Value >> 2;
+
+   case fixup_Hexagon_B7_PCREL_X:
+   case fixup_Hexagon_B9_PCREL_X:
+   case fixup_Hexagon_B13_PCREL_X:
+   case fixup_Hexagon_B15_PCREL_X:
+   case fixup_Hexagon_B22_PCREL_X:
+     return Value & 0x3f;
+
+   case fixup_Hexagon_B32_PCREL_X:
+     return Value >> 6;
+
+   default:
+     return Value;
+   }
+ }
+
+ /// Reports an out-of-range fixup and terminates.
+ ///
+ /// \param bits        width of the encoded field.
+ /// \param align_bits  low bits dropped by alignment; the reported range is
+ ///                    that of a signed (bits + align_bits)-bit integer.
+ /// \param FixupValue  the offending value as received by applyFixup().
+ /// \param fixupStr    short name of the fixup kind for the message.
+ ///
+ /// An out-of-range branch can be produced by ordinary input, so this is
+ /// reported with report_fatal_error rather than llvm_unreachable: the latter
+ /// is undefined behaviour in builds without assertions and would let
+ /// applyFixup() go on to encode a truncated offset.
+ void HandleFixupError(const int bits, const int align_bits,
+                       const int64_t FixupValue, const char *fixupStr) const {
+   // Error: value 1124 out of range: -1024-1023 when resolving
+   // symbol in file xprtsock.S
+   const APInt IntMin = APInt::getSignedMinValue(bits+align_bits);
+   const APInt IntMax = APInt::getSignedMaxValue(bits+align_bits);
+   std::stringstream errStr;
+   errStr << "\nError: value " <<
+     FixupValue <<
+     " out of range: " <<
+     IntMin.getSExtValue() <<
+     "-" <<
+     IntMax.getSExtValue() <<
+     " when resolving " <<
+     fixupStr <<
+     " fixup\n";
+   report_fatal_error(errStr.str());
+ }
+
+ /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided
+ /// data fragment, at the offset specified by the fixup and following the
+ /// fixup kind as appropriate.
+ ///
+ /// Nothing is written when \p FixupValue is zero, when the adjusted value is
+ /// zero, or when the kind is not one of the cases handled below. For the
+ /// non-extended PC-relative branch kinds the adjusted value is range-checked
+ /// first and HandleFixupError() is called when it does not fit.
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                 uint64_t FixupValue, bool IsPCRel) const override {
+
+   // When FixupValue is 0 the relocation is external and there
+   // is nothing for us to do.
+   if (!FixupValue) return;
+
+   MCFixupKind Kind = Fixup.getKind();
+   uint64_t Value;
+   uint32_t InstMask;
+   uint32_t Reloc;
+
+   // LLVM gives us an encoded value, we have to convert it back
+   // to a real offset before we can use it.
+   uint32_t Offset = Fixup.getOffset();
+   unsigned NumBytes = getFixupKindNumBytes(Kind);
+   assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+   char *InstAddr = Data + Offset;
+
+   Value = adjustFixupValue(Kind, FixupValue);
+   if(!Value)
+     return;
+   // Truncated signed view of the adjusted value, used for the range checks.
+   int sValue = (int)Value;
+
+   // Each range-checked case deliberately falls through to its _X twin,
+   // which shares the same bit layout.
+   switch((unsigned)Kind) {
+     default:
+       return;
+
+     case fixup_Hexagon_B7_PCREL:
+       if (!(isIntN(7, sValue)))
+         HandleFixupError(7, 2, (int64_t)FixupValue, "B7_PCREL");
+       LLVM_FALLTHROUGH;
+     case fixup_Hexagon_B7_PCREL_X:
+       InstMask = 0x00001f18;  // Word32_B7
+       Reloc = (((Value >> 2) & 0x1f) << 8) |    // Value 6-2 = Target 12-8
+               ((Value & 0x3) << 3);             // Value 1-0 = Target 4-3
+       break;
+
+     case fixup_Hexagon_B9_PCREL:
+       if (!(isIntN(9, sValue)))
+         HandleFixupError(9, 2, (int64_t)FixupValue, "B9_PCREL");
+       LLVM_FALLTHROUGH;
+     case fixup_Hexagon_B9_PCREL_X:
+       InstMask = 0x003000fe;  // Word32_B9
+       Reloc = (((Value >> 7) & 0x3) << 20) |    // Value 8-7 = Target 21-20
+               ((Value & 0x7f) << 1);            // Value 6-0 = Target 7-1
+       break;
+
+     // Since the existing branches that use this relocation cannot be
+     // extended, they should only be fixed up if the target is within range.
+     case fixup_Hexagon_B13_PCREL:
+       if (!(isIntN(13, sValue)))
+         HandleFixupError(13, 2, (int64_t)FixupValue, "B13_PCREL");
+       LLVM_FALLTHROUGH;
+     case fixup_Hexagon_B13_PCREL_X:
+       InstMask = 0x00202ffe;  // Word32_B13
+       Reloc = (((Value >> 12) & 0x1) << 21) |   // Value 12 = Target 21
+               (((Value >> 11) & 0x1) << 13) |   // Value 11 = Target 13
+               ((Value & 0x7ff) << 1);           // Value 10-0 = Target 11-1
+       break;
+
+     case fixup_Hexagon_B15_PCREL:
+       if (!(isIntN(15, sValue)))
+         HandleFixupError(15, 2, (int64_t)FixupValue, "B15_PCREL");
+       LLVM_FALLTHROUGH;
+     case fixup_Hexagon_B15_PCREL_X:
+       InstMask = 0x00df20fe;  // Word32_B15
+       Reloc = (((Value >> 13) & 0x3) << 22) |   // Value 14-13 = Target 23-22
+               (((Value >> 8) & 0x1f) << 16) |   // Value 12-8 = Target 20-16
+               (((Value >> 7) & 0x1) << 13) |    // Value 7 = Target 13
+               ((Value & 0x7f) << 1);            // Value 6-0 = Target 7-1
+       break;
+
+     case fixup_Hexagon_B22_PCREL:
+       if (!(isIntN(22, sValue)))
+         HandleFixupError(22, 2, (int64_t)FixupValue, "B22_PCREL");
+       LLVM_FALLTHROUGH;
+     case fixup_Hexagon_B22_PCREL_X:
+       InstMask = 0x01ff3ffe;  // Word32_B22
+       Reloc = (((Value >> 13) & 0x1ff) << 16) | // Value 21-13 = Target 24-16
+               ((Value & 0x1fff) << 1);          // Value 12-0 = Target 13-1
+       break;
+
+     case fixup_Hexagon_B32_PCREL_X:
+       InstMask = 0x0fff3fff;  // Word32_X26
+       Reloc = (((Value >> 14) & 0xfff) << 16) | // Value 25-14 = Target 27-16
+               (Value & 0x3fff);                 // Value 13-0 = Target 13-0
+       break;
+
+     case FK_Data_1:
+     case FK_Data_2:
+     case FK_Data_4:
+     case fixup_Hexagon_32:
+       InstMask = 0xffffffff;  // Word32
+       Reloc = Value;
+       break;
+   }
+
+   DEBUG(dbgs() << "Name=" << getFixupKindInfo(Kind).Name << "(" <<
+         (unsigned)Kind << ")\n");
+   // The bytes are widened through uint8_t so that a negative char is never
+   // left-shifted when the old word is reassembled for the dump.
+   DEBUG(uint32_t OldData = 0;
+         for (unsigned i = 0; i < NumBytes; i++)
+           OldData |= uint32_t(uint8_t(InstAddr[i])) << (i * 8);
+         dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) <<
+           ": AValue=0x"; dbgs().write_hex(FixupValue) <<
+           ": Offset=" << Offset <<
+           ": Size=" << DataSize <<
+           ": OInst=0x"; dbgs().write_hex(OldData) <<
+           ": Reloc=0x"; dbgs().write_hex(Reloc););
+
+   // For each byte of the fragment that the fixup touches, mask in the
+   // bits from the fixup value. The Value has been "split up" into the
+   // appropriate bitfields above.
+   for (unsigned i = 0; i < NumBytes; i++){
+     InstAddr[i] &= uint8_t(~InstMask >> (i * 8)) & 0xff; // Clear reloc bits
+     InstAddr[i] |= uint8_t(Reloc >> (i * 8)) & 0xff;     // Apply new reloc
+   }
+
+   DEBUG(uint32_t NewData = 0;
+         for (unsigned i = 0; i < NumBytes; i++)
+           NewData |= uint32_t(uint8_t(InstAddr[i])) << (i * 8);
+         dbgs() << ": NInst=0x"; dbgs().write_hex(NewData) << "\n";);
+ }
+
+ /// Returns true when \p HMI may be relaxed by adding an extender.
+ ///
+ /// The instruction must be of type J, a branch of type COMPOUND or NV, or
+ /// of type CR other than C4_addipc. It must also be extendable, and its
+ /// extendable operand's expression must not be marked "must not extend".
+ bool isInstRelaxable(MCInst const &HMI) const {
+   const MCInstrDesc &Desc = HexagonMCInstrInfo::getDesc(*MCII, HMI);
+   const auto Type = llvm::HexagonMCInstrInfo::getType(*MCII, HMI);
+
+   // Branches and loop-setup insns are handled as necessary by relaxation.
+   const bool IsCandidate =
+       Type == HexagonII::TypeJ ||
+       (Type == HexagonII::TypeCOMPOUND && Desc.isBranch()) ||
+       (Type == HexagonII::TypeNV && Desc.isBranch()) ||
+       (Type == HexagonII::TypeCR && HMI.getOpcode() != Hexagon::C4_addipc);
+   if (!IsCandidate)
+     return false;
+
+   if (!HexagonMCInstrInfo::isExtendable(*MCII, HMI))
+     return false;
+
+   MCOperand const &ExtOp =
+       HMI.getOperand(HexagonMCInstrInfo::getExtendableOp(*MCII, HMI));
+   return !HexagonMCInstrInfo::mustNotExtend(*ExtOp.getExpr());
+ }
+
+ /// MayNeedRelaxation - Check whether the given instruction may need
+ /// relaxation. This implementation answers true for every instruction and
+ /// does not inspect \p Inst.
+ ///
+ /// \param Inst - The instruction to test.
+ bool mayNeedRelaxation(MCInst const &Inst) const override {
+ return true;
+ }
+
+ /// fixupNeedsRelaxation - Target specific predicate for whether a given
+ /// fixup requires the associated instruction to be relaxed.
+ ///
+ /// On a true result this also records the instruction to extend in
+ /// *RelaxTarget, bumps relaxedCnt and makes sure an extender MCInst is
+ /// allocated, so that relaxInstruction() can act on it. Relaxation is only
+ /// reported when the bundle has fewer than HEXAGON_PACKET_SIZE instructions.
+ bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ MCInst const &MCB = DF->getInst();
+ assert(HexagonMCInstrInfo::isBundle(MCB));
+
+ // Start from a clean state; the target is set again only on success.
+ *RelaxTarget = nullptr;
+ // The fixup offset selects the instruction within the bundle.
+ MCInst &MCI = const_cast<MCInst &>(HexagonMCInstrInfo::instruction(
+ MCB, Fixup.getOffset() / HEXAGON_INSTR_SIZE));
+ bool Relaxable = isInstRelaxable(MCI);
+ if (Relaxable == false)
+ return false;
+ // If we cannot resolve the fixup value, it requires relaxation.
+ // Every path through this switch returns.
+ if (!Resolved) {
+ switch ((unsigned)Fixup.getKind()) {
+ case fixup_Hexagon_B22_PCREL:
+ // GetFixupCount assumes B22 won't relax
+ LLVM_FALLTHROUGH;
+ default:
+ return false;
+ break;
+ case fixup_Hexagon_B13_PCREL:
+ case fixup_Hexagon_B15_PCREL:
+ case fixup_Hexagon_B9_PCREL:
+ case fixup_Hexagon_B7_PCREL: {
+ if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
+ ++relaxedCnt;
+ *RelaxTarget = &MCI;
+ setExtender(Layout.getAssembler().getContext());
+ return true;
+ } else {
+ return false;
+ }
+ break;
+ }
+ }
+ }
+
+ // Resolved fixup: relax only if the value does not fit the kind's range.
+ MCFixupKind Kind = Fixup.getKind();
+ int64_t sValue = Value;
+ int64_t maxValue;
+
+ // maxValue is the exclusive upper bound; the accepted range is
+ // [-maxValue, maxValue - 1]. Kinds not listed (including B13_PCREL) get
+ // INT64_MAX and therefore are never considered far away here.
+ switch ((unsigned)Kind) {
+ case fixup_Hexagon_B7_PCREL:
+ maxValue = 1 << 8;
+ break;
+ case fixup_Hexagon_B9_PCREL:
+ maxValue = 1 << 10;
+ break;
+ case fixup_Hexagon_B15_PCREL:
+ maxValue = 1 << 16;
+ break;
+ case fixup_Hexagon_B22_PCREL:
+ maxValue = 1 << 23;
+ break;
+ default:
+ maxValue = INT64_MAX;
+ break;
+ }
+
+ bool isFarAway = -maxValue > sValue || sValue > maxValue - 1;
+
+ if (isFarAway) {
+ if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
+ ++relaxedCnt;
+ *RelaxTarget = &MCI;
+ setExtender(Layout.getAssembler().getContext());
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+  /// Override of the simple relaxation predicate (the one for targets where
+  /// !Resolved implies requiring relaxation).
+  ///
+  /// It is never expected to run: the decision is made in
+  /// fixupNeedsRelaxationAdvanced(), so reaching this body is treated as
+  /// unreachable.
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
+  }
+
+  /// Rebuild the bundle \p Inst into \p Res, placing a constant extender
+  /// directly in front of the instruction recorded in RelaxTarget.
+  ///
+  /// \p Res is first replaced by a fresh bundle. Every instruction of \p Inst
+  /// is then appended in order; the extender is obtained from takeExtender()
+  /// and filled in from the instruction's extendable operand. RelaxTarget is
+  /// cleared once the extender has been added.
+  ///
+  /// \param Inst - The bundle to relax.
+  /// \param STI - Unused.
+  /// \param Res - Receives the relaxed bundle.
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {
+    assert(HexagonMCInstrInfo::isBundle(Inst) &&
+           "Hexagon relaxInstruction only works on bundles");
+
+    Res = HexagonMCInstrInfo::createBundle();
+    bool SawTarget = false;
+    for (auto &Entry : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+      MCInst &Current = const_cast<MCInst &>(*Entry.getInst());
+
+      // The relaxation target gets its immediate extender emitted first.
+      if (*RelaxTarget == &Current) {
+        SawTarget = true;
+        assert((HexagonMCInstrInfo::bundleSize(Res) < HEXAGON_PACKET_SIZE) &&
+               "No room to insert extender for relaxation");
+
+        MCInst *Extender = takeExtender();
+        *Extender = HexagonMCInstrInfo::deriveExtender(
+            *MCII, Current,
+            HexagonMCInstrInfo::getExtendableOperand(*MCII, Current));
+        Res.addOperand(MCOperand::createInst(Extender));
+        *RelaxTarget = nullptr;
+      }
+
+      // The original instruction always follows, extended or not.
+      Res.addOperand(MCOperand::createInst(Entry.getInst()));
+    }
+    (void)SawTarget;
+    assert(SawTarget && "Didn't find relaxation target");
+  }
+
+  /// Emit \p Count bytes of padding through \p OW.
+  ///
+  /// Zero bytes are written until the remaining count is a multiple of
+  /// HEXAGON_INSTR_SIZE; the rest is filled with 32-bit nop words. A nop
+  /// carries the end-of-packet parse bits when the byte count still to be
+  /// written after it is a multiple of a full packet
+  /// (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE), and the in-packet parse bits
+  /// otherwise.
+  ///
+  /// \param Count - Number of padding bytes to produce.
+  /// \param OW - Object writer receiving the bytes.
+  /// \returns Always true.
+  bool writeNopData(uint64_t Count,
+                    MCObjectWriter * OW) const override {
+    static const uint32_t Nopcode = 0x7f000000;  // Hard-coded NOP.
+    static const uint32_t ParseIn = 0x00004000;  // In packet parse-bits.
+    static const uint32_t ParseEnd = 0x0000c000; // End of packet parse-bits.
+
+    // Byte-wise zero fill until whole instruction words remain.
+    for (; Count % HEXAGON_INSTR_SIZE != 0; --Count) {
+      DEBUG(dbgs() << "Alignment not a multiple of the instruction size:" <<
+            Count % HEXAGON_INSTR_SIZE << "/" << HEXAGON_INSTR_SIZE << "\n");
+      OW->write8(0);
+    }
+
+    while (Count != 0) {
+      Count -= HEXAGON_INSTR_SIZE;
+      // Close the packet whenever a multiple of the maximum packet size
+      // remains.
+      bool const AtPacketBoundary =
+          Count % (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE) == 0;
+      OW->write32(Nopcode | (AtPacketBoundary ? ParseEnd : ParseIn));
+    }
+    return true;
+  }
+
+ void finishLayout(MCAssembler const &Asm,
+ MCAsmLayout &Layout) const override { // For each alignment fragment, tries to move its padding into the preceding packet as nops.
+ for (auto I : Layout.getSectionOrder()) { // every section, in layout order
+ auto &Fragments = I->getFragmentList();
+ for (auto &J : Fragments) {
+ switch (J.getKind()) {
+ default:
+ break; // only alignment fragments are of interest
+ case MCFragment::FT_Align: {
+ auto Size = Asm.computeFragmentSize(Layout, J); // size currently computed for this alignment fragment
+ for (auto K = J.getIterator();
+ K != Fragments.begin() && Size >= HEXAGON_PACKET_SIZE;) { // walk backwards; NOTE(review): Size is compared against a packet size here - confirm the intended unit
+ --K;
+ switch (K->getKind()) {
+ default:
+ break; // other fragment kinds are stepped over and the backward walk continues
+ case MCFragment::FT_Align: {
+ // Don't pad before other alignments
+ Size = 0;
+ break;
+ }
+ case MCFragment::FT_Relaxable: {
+ auto &RF = cast<MCRelaxableFragment>(*K);
+ auto &Inst = const_cast<MCInst &>(RF.getInst()); // the bundle is edited in place
+ while (Size > 0 && HexagonMCInstrInfo::bundleSize(Inst) < 4) { // add nops while padding remains and the bundle has fewer than 4 instructions
+ MCInst *Nop = new (Asm.getContext()) MCInst;
+ Nop->setOpcode(Hexagon::A2_nop);
+ Inst.addOperand(MCOperand::createInst(Nop));
+ Size -= 4; // NOTE(review): if Size is unsigned and not a multiple of 4 this wraps around - confirm
+ if (!HexagonMCChecker(
+ *MCII, RF.getSubtargetInfo(), Inst, Inst,
+ *Asm.getContext().getRegisterInfo()).check()) {
+ Inst.erase(Inst.end() - 1); // checker rejected the bundle: drop the nop just added
+ Size = 0; // and stop padding
+ }
+ }
+ bool Error = HexagonMCShuffle(*MCII, RF.getSubtargetInfo(), Inst); // result is deliberately ignored (see disabled assert below)
+ //assert(!Error);
+ (void)Error;
+ ReplaceInstruction(Asm.getEmitter(), RF, Inst);
+ Layout.invalidateFragmentsFrom(&RF); // layout from this fragment onwards must be recomputed
+ Size = 0; // Only look back one instruction
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+};
+} // end anonymous namespace
+
+namespace llvm {
+/// Factory for the Hexagon assembler backend.
+///
+/// The OS ABI byte handed to the backend is derived from the OS of \p TT.
+/// The register-info argument and \p Options are not used.
+///
+/// \returns A newly allocated HexagonAsmBackend.
+MCAsmBackend *createHexagonAsmBackend(Target const &T,
+                                      MCRegisterInfo const & /*MRI*/,
+                                      const Triple &TT, StringRef CPU,
+                                      const MCTargetOptions &Options) {
+  const uint8_t AbiByte = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+  return new HexagonAsmBackend(T, AbiByte, CPU);
+}
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
new file mode 100644
index 000000000000..4292f6b3faa4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -0,0 +1,308 @@
+//===-- HexagonBaseInfo.h - Top level definitions for Hexagon --*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Hexagon target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
+
+#include "HexagonMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <stdint.h>
+
+namespace llvm {
+
+/// HexagonII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace HexagonII {
+ // *** The code below must match HexagonInstrFormat*.td *** //
+
+ // Insn types. The 5-bit TSFlags field at TypePos (TypeMask = 0x1f) holds one of these, so values must stay within 0..31.
+ // *** Must match HexagonInstrFormat*.td ***
+ enum Type {
+ TypePSEUDO = 0,
+ TypeALU32 = 1,
+ TypeCR = 2,
+ TypeJR = 3,
+ TypeJ = 4,
+ TypeLD = 5,
+ TypeST = 6,
+ TypeSYSTEM = 7,
+ TypeXTYPE = 8,
+ TypeV4LDST = 9,
+ TypeNV = 10,
+ TypeDUPLEX = 11,
+ TypeCOMPOUND = 12,
+ TypeCVI_FIRST = 13, // first value of the contiguous TypeCVI_* range
+ TypeCVI_VA = TypeCVI_FIRST,
+ TypeCVI_VA_DV = 14,
+ TypeCVI_VX = 15,
+ TypeCVI_VX_DV = 16,
+ TypeCVI_VP = 17,
+ TypeCVI_VP_VS = 18,
+ TypeCVI_VS = 19,
+ TypeCVI_VINLANESAT= 20,
+ TypeCVI_VM_LD = 21,
+ TypeCVI_VM_TMP_LD = 22,
+ TypeCVI_VM_CUR_LD = 23,
+ TypeCVI_VM_VP_LDU = 24,
+ TypeCVI_VM_ST = 25,
+ TypeCVI_VM_NEW_ST = 26,
+ TypeCVI_VM_STU = 27,
+ TypeCVI_HIST = 28,
+ TypeCVI_LAST = TypeCVI_HIST, // last value of the TypeCVI_* range; 29 is not assigned
+ TypePREFIX = 30, // Such as extenders.
+ TypeENDLOOP = 31 // Such as end of a HW loop.
+ };
+
+ enum SubTarget { // NOTE(review): values look like 4-bit masks with one bit per architecture version (bit 0 = V2 ... bit 3 = V5) - confirm against HexagonInstrFormat*.td
+ HasV2SubT = 0xf,
+ HasV2SubTOnly = 0x1,
+ NoV2SubT = 0x0,
+ HasV3SubT = 0xe,
+ HasV3SubTOnly = 0x2,
+ NoV3SubT = 0x1, // same value as HasV2SubTOnly
+ HasV4SubT = 0xc,
+ NoV4SubT = 0x3,
+ HasV5SubT = 0x8,
+ NoV5SubT = 0x7
+ };
+
+ enum AddrMode { // Values fit the 3-bit TSFlags field at AddrModePos (AddrModeMask = 0x7); 7 is not assigned.
+ NoAddrMode = 0, // No addressing mode
+ Absolute = 1, // Absolute addressing mode
+ AbsoluteSet = 2, // Absolute set addressing mode
+ BaseImmOffset = 3, // Indirect with offset
+ BaseLongOffset = 4, // Indirect with long offset
+ BaseRegOffset = 5, // Indirect with register offset
+ PostInc = 6 // Post increment addressing mode
+ };
+
+ // MemAccessSize is represented as 1+log2(N) where N is size in bytes (e.g. 1 byte -> 1, 8 bytes -> 4, 64 bytes -> 7).
+ enum class MemAccessSize {
+ NoMemAccess = 0, // Not a memory access instruction.
+ ByteAccess = 1, // Byte access instruction (memb).
+ HalfWordAccess = 2, // Half word access instruction (memh).
+ WordAccess = 3, // Word access instruction (memw).
+ DoubleWordAccess = 4, // Double word access instruction (memd)
+ // 5, // We do not have a 16 byte vector access.
+ Vector64Access = 7, // 64 Byte vector access instruction (vmem).
+ Vector128Access = 8 // 128 Byte vector access instruction (vmem).
+ };
+
+ // MCInstrDesc TSFlags layout. Each field has a *Pos (bit offset of its lowest bit) and a *Mask (unshifted mask giving the field width).
+ // *** Must match HexagonInstrFormat*.td ***
+ enum {
+ // This 5-bit field describes the insn type.
+ TypePos = 0,
+ TypeMask = 0x1f,
+
+ // Solo instructions.
+ SoloPos = 5,
+ SoloMask = 0x1,
+ // Packed only with A or X-type instructions.
+ SoloAXPos = 6,
+ SoloAXMask = 0x1,
+ // Only A-type instruction in first slot or nothing.
+ SoloAin1Pos = 7,
+ SoloAin1Mask = 0x1,
+
+ // Predicated instructions.
+ PredicatedPos = 8,
+ PredicatedMask = 0x1,
+ PredicatedFalsePos = 9,
+ PredicatedFalseMask = 0x1,
+ PredicatedNewPos = 10,
+ PredicatedNewMask = 0x1,
+ PredicateLatePos = 11,
+ PredicateLateMask = 0x1,
+
+ // New-Value consumer instructions.
+ NewValuePos = 12,
+ NewValueMask = 0x1,
+ // New-Value producer instructions.
+ hasNewValuePos = 13,
+ hasNewValueMask = 0x1,
+ // Which operand consumes or produces a new value.
+ NewValueOpPos = 14,
+ NewValueOpMask = 0x7,
+ // Stores that can become new-value stores.
+ mayNVStorePos = 17,
+ mayNVStoreMask = 0x1,
+ // New-value store instructions.
+ NVStorePos = 18,
+ NVStoreMask = 0x1,
+ // Loads that can become current-value loads.
+ mayCVLoadPos = 19,
+ mayCVLoadMask = 0x1,
+ // Current-value load instructions.
+ CVLoadPos = 20,
+ CVLoadMask = 0x1,
+
+ // Extendable insns.
+ ExtendablePos = 21,
+ ExtendableMask = 0x1,
+ // Insns must be extended.
+ ExtendedPos = 22,
+ ExtendedMask = 0x1,
+ // Which operand may be extended.
+ ExtendableOpPos = 23,
+ ExtendableOpMask = 0x7,
+ // Signed or unsigned range.
+ ExtentSignedPos = 26,
+ ExtentSignedMask = 0x1,
+ // Number of bits of range before extending operand.
+ ExtentBitsPos = 27,
+ ExtentBitsMask = 0x1f,
+ // Alignment power-of-two before extending operand.
+ ExtentAlignPos = 32, // from here on the fields lie above bit 31, so the flags word must be wider than 32 bits
+ ExtentAlignMask = 0x3,
+
+ // Valid subtargets
+ validSubTargetPos = 34,
+ validSubTargetMask = 0xf,
+
+ // Addressing mode for load/store instructions.
+ AddrModePos = 40, // bits 38-39 are not assigned in this enum
+ AddrModeMask = 0x7,
+ // Access size for load/store instructions.
+ MemAccessSizePos = 43,
+ MemAccesSizeMask = 0xf, // NOTE: identifier is spelled "Acces" (single 's')
+
+ // Branch predicted taken.
+ TakenPos = 47,
+ TakenMask = 0x1,
+
+ // Floating-point instructions.
+ FPPos = 48,
+ FPMask = 0x1,
+
+ // New-Value producer-2 instructions.
+ hasNewValuePos2 = 50, // bit 49 is not assigned in this enum
+ hasNewValueMask2 = 0x1,
+
+ // Which operand consumes or produces a new value.
+ NewValueOpPos2 = 51,
+ NewValueOpMask2 = 0x7,
+
+ // Accumulator instructions.
+ AccumulatorPos = 54,
+ AccumulatorMask = 0x1,
+
+ // Complex XU, prevent xu competition by preferring slot3
+ PrefersSlot3Pos = 55,
+ PrefersSlot3Mask = 0x1,
+
+ CofMax1Pos = 60, // bits 56-59 are not assigned in this enum
+ CofMax1Mask = 0x1
+ };
+
+ // *** The code above must match HexagonInstrFormat*.td *** //
+
+ // Hexagon specific MO operand flag mask.
+ enum HexagonMOTargetFlagVal {
+ //===------------------------------------------------------------------===//
+ // Hexagon Specific MachineOperand flags.
+ MO_NO_FLAG, // = 0
+
+ HMOTF_ConstExtended = 1,
+
+ /// MO_PCREL - On a symbol operand, indicates a PC-relative relocation
+ /// Used for computing a global address for PIC compilations
+ MO_PCREL, // = 2; the remaining enumerators count up from here
+
+ /// MO_GOT - Indicates a GOT-relative relocation
+ MO_GOT,
+
+ // Low or high part of a symbol.
+ MO_LO16, MO_HI16,
+
+ // Offset from the base of the SDA.
+ MO_GPREL,
+
+ // MO_GDGOT - indicates GOT relative relocation for TLS
+ // GeneralDynamic method
+ MO_GDGOT,
+
+ // MO_GDPLT - indicates PLT relative relocation for TLS
+ // GeneralDynamic method
+ MO_GDPLT,
+
+ // MO_IE - indicates non PIC relocation for TLS
+ // Initial Executable method
+ MO_IE,
+
+ // MO_IEGOT - indicates PIC relocation for TLS
+ // Initial Executable method
+ MO_IEGOT,
+
+ // MO_TPREL - indicates relocation for TLS
+ // Local Executable method
+ MO_TPREL
+ };
+
+  // Hexagon sub-instruction classes, numbered consecutively from 0.
+  enum SubInstructionGroup {
+    HSIG_None = 0,
+    HSIG_L1 = 1,
+    HSIG_L2 = 2,
+    HSIG_S1 = 3,
+    HSIG_S2 = 4,
+    HSIG_A = 5,
+    HSIG_Compound = 6
+  };
+
+  // Hexagon compound classes, numbered consecutively from 0.
+  enum CompoundGroup {
+    HCG_None = 0,
+    HCG_A = 1,
+    HCG_B = 2,
+    HCG_C = 3
+  };
+
+ enum InstParseBits { // Values of the two parse bits (bits 14-15) of an instruction word.
+ INST_PARSE_MASK = 0x0000c000, // selects the parse-bit field
+ INST_PARSE_PACKET_END = 0x0000c000,
+ INST_PARSE_LOOP_END = 0x00008000,
+ INST_PARSE_NOT_END = 0x00004000,
+ INST_PARSE_DUPLEX = 0x00000000, // shares the value 0 with INST_PARSE_EXTENDER
+ INST_PARSE_EXTENDER = 0x00000000
+ };
+
+ enum InstIClassBits : unsigned { // Values of the top four bits (28-31) of an instruction word; all 16 encodings are listed.
+ INST_ICLASS_MASK = 0xf0000000, // selects the field; numerically equal to INST_ICLASS_ALU32_3
+ INST_ICLASS_EXTENDER = 0x00000000,
+ INST_ICLASS_J_1 = 0x10000000,
+ INST_ICLASS_J_2 = 0x20000000,
+ INST_ICLASS_LD_ST_1 = 0x30000000,
+ INST_ICLASS_LD_ST_2 = 0x40000000,
+ INST_ICLASS_J_3 = 0x50000000,
+ INST_ICLASS_CR = 0x60000000,
+ INST_ICLASS_ALU32_1 = 0x70000000,
+ INST_ICLASS_XTYPE_1 = 0x80000000,
+ INST_ICLASS_LD = 0x90000000,
+ INST_ICLASS_ST = 0xa0000000,
+ INST_ICLASS_ALU32_2 = 0xb0000000,
+ INST_ICLASS_XTYPE_2 = 0xc0000000,
+ INST_ICLASS_XTYPE_3 = 0xd0000000,
+ INST_ICLASS_XTYPE_4 = 0xe0000000,
+ INST_ICLASS_ALU32_3 = 0xf0000000
+ };
+
+} // End namespace HexagonII.
+
+} // End namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
new file mode 100644
index 000000000000..944e235e72f2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -0,0 +1,295 @@
+//===-- HexagonELFObjectWriter.cpp - Hexagon Target Descriptions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonFixupKinds.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "hexagon-elf-writer"
+
+using namespace llvm;
+using namespace Hexagon;
+
+namespace {
+
+/// ELF target writer for Hexagon: translates fixups into ELF relocation
+/// types.
+class HexagonELFObjectWriter : public MCELFObjectTargetWriter {
+  /// CPU name supplied at construction.
+  StringRef CPU;
+
+public:
+  /// \param OSABI - OS ABI byte forwarded to the base class.
+  /// \param C - CPU name to remember.
+  HexagonELFObjectWriter(uint8_t OSABI, StringRef C);
+
+  /// Map \p Fixup (and, for plain data fixups, the access variant of
+  /// \p Target) to an ELF relocation type.
+  unsigned getRelocType(MCContext &Ctx, MCValue const &Target,
+                        MCFixup const &Fixup, bool IsPCRel) const override;
+};
+} // end anonymous namespace
+
+/// Set up the generic ELF target writer for Hexagon: not 64-bit, machine
+/// type EM_HEXAGON, relocations carry an addend. The CPU name is stored.
+HexagonELFObjectWriter::HexagonELFObjectWriter(uint8_t OSABI, StringRef C)
+    : MCELFObjectTargetWriter(
+          /*Is64bit*/ false, OSABI, ELF::EM_HEXAGON,
+          /*HasRelocationAddend*/ true),
+      CPU(C) {
+}
+
+unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx,
+ MCValue const &Target,
+ MCFixup const &Fixup,
+ bool IsPCRel) const { // Maps a fixup kind to its ELF relocation type; Ctx is not used.
+ MCSymbolRefExpr::VariantKind Variant = Target.getAccessVariant(); // only consulted for FK_Data_4 and FK_Data_2
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ report_fatal_error("Unrecognized relocation type"); // reached for kinds with no case below, e.g. fixup_Hexagon_IE_16
+ break;
+ case FK_Data_4:
+ switch(Variant) {
+ case MCSymbolRefExpr::VariantKind::VK_DTPREL:
+ return ELF::R_HEX_DTPREL_32;
+ case MCSymbolRefExpr::VariantKind::VK_GOT:
+ return ELF::R_HEX_GOT_32;
+ case MCSymbolRefExpr::VariantKind::VK_GOTREL:
+ return ELF::R_HEX_GOTREL_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_GD_GOT:
+ return ELF::R_HEX_GD_GOT_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_IE:
+ return ELF::R_HEX_IE_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_IE_GOT:
+ return ELF::R_HEX_IE_GOT_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_LD_GOT:
+ return ELF::R_HEX_LD_GOT_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_PCREL:
+ return ELF::R_HEX_32_PCREL;
+ case MCSymbolRefExpr::VariantKind::VK_TPREL:
+ return ELF::R_HEX_TPREL_32;
+ case MCSymbolRefExpr::VariantKind::VK_None:
+ return IsPCRel ? ELF::R_HEX_32_PCREL : ELF::R_HEX_32; // the only place IsPCRel is consulted
+ default:
+ report_fatal_error("Unrecognized variant type");
+ }; // no break: every inner case returns or reports a fatal error (assuming report_fatal_error does not return)
+ case FK_PCRel_4:
+ return ELF::R_HEX_32_PCREL;
+ case FK_Data_2:
+ switch(Variant) {
+ case MCSymbolRefExpr::VariantKind::VK_DTPREL:
+ return ELF::R_HEX_DTPREL_16;
+ case MCSymbolRefExpr::VariantKind::VK_GOT:
+ return ELF::R_HEX_GOT_16;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_GD_GOT:
+ return ELF::R_HEX_GD_GOT_16;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_IE_GOT:
+ return ELF::R_HEX_IE_GOT_16;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_LD_GOT:
+ return ELF::R_HEX_LD_GOT_16;
+ case MCSymbolRefExpr::VariantKind::VK_TPREL:
+ return ELF::R_HEX_TPREL_16;
+ case MCSymbolRefExpr::VariantKind::VK_None:
+ return ELF::R_HEX_16;
+ default:
+ report_fatal_error("Unrecognized variant type");
+ }; // no break: same reasoning as for FK_Data_4 above
+ case FK_Data_1:
+ return ELF::R_HEX_8;
+ case fixup_Hexagon_B22_PCREL: // from here on: one-to-one mapping fixup_Hexagon_<X> -> R_HEX_<X>
+ return ELF::R_HEX_B22_PCREL;
+ case fixup_Hexagon_B15_PCREL:
+ return ELF::R_HEX_B15_PCREL;
+ case fixup_Hexagon_B7_PCREL:
+ return ELF::R_HEX_B7_PCREL;
+ case fixup_Hexagon_LO16:
+ return ELF::R_HEX_LO16;
+ case fixup_Hexagon_HI16:
+ return ELF::R_HEX_HI16;
+ case fixup_Hexagon_32:
+ return ELF::R_HEX_32;
+ case fixup_Hexagon_16:
+ return ELF::R_HEX_16;
+ case fixup_Hexagon_8:
+ return ELF::R_HEX_8;
+ case fixup_Hexagon_GPREL16_0:
+ return ELF::R_HEX_GPREL16_0;
+ case fixup_Hexagon_GPREL16_1:
+ return ELF::R_HEX_GPREL16_1;
+ case fixup_Hexagon_GPREL16_2:
+ return ELF::R_HEX_GPREL16_2;
+ case fixup_Hexagon_GPREL16_3:
+ return ELF::R_HEX_GPREL16_3;
+ case fixup_Hexagon_HL16:
+ return ELF::R_HEX_HL16;
+ case fixup_Hexagon_B13_PCREL:
+ return ELF::R_HEX_B13_PCREL;
+ case fixup_Hexagon_B9_PCREL:
+ return ELF::R_HEX_B9_PCREL;
+ case fixup_Hexagon_B32_PCREL_X:
+ return ELF::R_HEX_B32_PCREL_X;
+ case fixup_Hexagon_32_6_X:
+ return ELF::R_HEX_32_6_X;
+ case fixup_Hexagon_B22_PCREL_X:
+ return ELF::R_HEX_B22_PCREL_X;
+ case fixup_Hexagon_B15_PCREL_X:
+ return ELF::R_HEX_B15_PCREL_X;
+ case fixup_Hexagon_B13_PCREL_X:
+ return ELF::R_HEX_B13_PCREL_X;
+ case fixup_Hexagon_B9_PCREL_X:
+ return ELF::R_HEX_B9_PCREL_X;
+ case fixup_Hexagon_B7_PCREL_X:
+ return ELF::R_HEX_B7_PCREL_X;
+ case fixup_Hexagon_16_X:
+ return ELF::R_HEX_16_X;
+ case fixup_Hexagon_12_X:
+ return ELF::R_HEX_12_X;
+ case fixup_Hexagon_11_X:
+ return ELF::R_HEX_11_X;
+ case fixup_Hexagon_10_X:
+ return ELF::R_HEX_10_X;
+ case fixup_Hexagon_9_X:
+ return ELF::R_HEX_9_X;
+ case fixup_Hexagon_8_X:
+ return ELF::R_HEX_8_X;
+ case fixup_Hexagon_7_X:
+ return ELF::R_HEX_7_X;
+ case fixup_Hexagon_6_X:
+ return ELF::R_HEX_6_X;
+ case fixup_Hexagon_32_PCREL:
+ return ELF::R_HEX_32_PCREL;
+ case fixup_Hexagon_COPY:
+ return ELF::R_HEX_COPY;
+ case fixup_Hexagon_GLOB_DAT:
+ return ELF::R_HEX_GLOB_DAT;
+ case fixup_Hexagon_JMP_SLOT:
+ return ELF::R_HEX_JMP_SLOT;
+ case fixup_Hexagon_RELATIVE:
+ return ELF::R_HEX_RELATIVE;
+ case fixup_Hexagon_PLT_B22_PCREL:
+ return ELF::R_HEX_PLT_B22_PCREL;
+ case fixup_Hexagon_GOTREL_LO16:
+ return ELF::R_HEX_GOTREL_LO16;
+ case fixup_Hexagon_GOTREL_HI16:
+ return ELF::R_HEX_GOTREL_HI16;
+ case fixup_Hexagon_GOTREL_32:
+ return ELF::R_HEX_GOTREL_32;
+ case fixup_Hexagon_GOT_LO16:
+ return ELF::R_HEX_GOT_LO16;
+ case fixup_Hexagon_GOT_HI16:
+ return ELF::R_HEX_GOT_HI16;
+ case fixup_Hexagon_GOT_32:
+ return ELF::R_HEX_GOT_32;
+ case fixup_Hexagon_GOT_16:
+ return ELF::R_HEX_GOT_16;
+ case fixup_Hexagon_DTPMOD_32:
+ return ELF::R_HEX_DTPMOD_32;
+ case fixup_Hexagon_DTPREL_LO16:
+ return ELF::R_HEX_DTPREL_LO16;
+ case fixup_Hexagon_DTPREL_HI16:
+ return ELF::R_HEX_DTPREL_HI16;
+ case fixup_Hexagon_DTPREL_32:
+ return ELF::R_HEX_DTPREL_32;
+ case fixup_Hexagon_DTPREL_16:
+ return ELF::R_HEX_DTPREL_16;
+ case fixup_Hexagon_GD_PLT_B22_PCREL:
+ return ELF::R_HEX_GD_PLT_B22_PCREL;
+ case fixup_Hexagon_LD_PLT_B22_PCREL:
+ return ELF::R_HEX_LD_PLT_B22_PCREL;
+ case fixup_Hexagon_GD_GOT_LO16:
+ return ELF::R_HEX_GD_GOT_LO16;
+ case fixup_Hexagon_GD_GOT_HI16:
+ return ELF::R_HEX_GD_GOT_HI16;
+ case fixup_Hexagon_GD_GOT_32:
+ return ELF::R_HEX_GD_GOT_32;
+ case fixup_Hexagon_GD_GOT_16:
+ return ELF::R_HEX_GD_GOT_16;
+ case fixup_Hexagon_LD_GOT_LO16:
+ return ELF::R_HEX_LD_GOT_LO16;
+ case fixup_Hexagon_LD_GOT_HI16:
+ return ELF::R_HEX_LD_GOT_HI16;
+ case fixup_Hexagon_LD_GOT_32:
+ return ELF::R_HEX_LD_GOT_32;
+ case fixup_Hexagon_LD_GOT_16:
+ return ELF::R_HEX_LD_GOT_16;
+ case fixup_Hexagon_IE_LO16:
+ return ELF::R_HEX_IE_LO16;
+ case fixup_Hexagon_IE_HI16:
+ return ELF::R_HEX_IE_HI16;
+ case fixup_Hexagon_IE_32:
+ return ELF::R_HEX_IE_32;
+ case fixup_Hexagon_IE_GOT_LO16:
+ return ELF::R_HEX_IE_GOT_LO16;
+ case fixup_Hexagon_IE_GOT_HI16:
+ return ELF::R_HEX_IE_GOT_HI16;
+ case fixup_Hexagon_IE_GOT_32:
+ return ELF::R_HEX_IE_GOT_32;
+ case fixup_Hexagon_IE_GOT_16:
+ return ELF::R_HEX_IE_GOT_16;
+ case fixup_Hexagon_TPREL_LO16:
+ return ELF::R_HEX_TPREL_LO16;
+ case fixup_Hexagon_TPREL_HI16:
+ return ELF::R_HEX_TPREL_HI16;
+ case fixup_Hexagon_TPREL_32:
+ return ELF::R_HEX_TPREL_32;
+ case fixup_Hexagon_TPREL_16:
+ return ELF::R_HEX_TPREL_16;
+ case fixup_Hexagon_6_PCREL_X:
+ return ELF::R_HEX_6_PCREL_X;
+ case fixup_Hexagon_GOTREL_32_6_X:
+ return ELF::R_HEX_GOTREL_32_6_X;
+ case fixup_Hexagon_GOTREL_16_X:
+ return ELF::R_HEX_GOTREL_16_X;
+ case fixup_Hexagon_GOTREL_11_X:
+ return ELF::R_HEX_GOTREL_11_X;
+ case fixup_Hexagon_GOT_32_6_X:
+ return ELF::R_HEX_GOT_32_6_X;
+ case fixup_Hexagon_GOT_16_X:
+ return ELF::R_HEX_GOT_16_X;
+ case fixup_Hexagon_GOT_11_X:
+ return ELF::R_HEX_GOT_11_X;
+ case fixup_Hexagon_DTPREL_32_6_X:
+ return ELF::R_HEX_DTPREL_32_6_X;
+ case fixup_Hexagon_DTPREL_16_X:
+ return ELF::R_HEX_DTPREL_16_X;
+ case fixup_Hexagon_DTPREL_11_X:
+ return ELF::R_HEX_DTPREL_11_X;
+ case fixup_Hexagon_GD_GOT_32_6_X:
+ return ELF::R_HEX_GD_GOT_32_6_X;
+ case fixup_Hexagon_GD_GOT_16_X:
+ return ELF::R_HEX_GD_GOT_16_X;
+ case fixup_Hexagon_GD_GOT_11_X:
+ return ELF::R_HEX_GD_GOT_11_X;
+ case fixup_Hexagon_LD_GOT_32_6_X:
+ return ELF::R_HEX_LD_GOT_32_6_X;
+ case fixup_Hexagon_LD_GOT_16_X:
+ return ELF::R_HEX_LD_GOT_16_X;
+ case fixup_Hexagon_LD_GOT_11_X:
+ return ELF::R_HEX_LD_GOT_11_X;
+ case fixup_Hexagon_IE_32_6_X:
+ return ELF::R_HEX_IE_32_6_X;
+ case fixup_Hexagon_IE_16_X:
+ return ELF::R_HEX_IE_16_X;
+ case fixup_Hexagon_IE_GOT_32_6_X:
+ return ELF::R_HEX_IE_GOT_32_6_X;
+ case fixup_Hexagon_IE_GOT_16_X:
+ return ELF::R_HEX_IE_GOT_16_X;
+ case fixup_Hexagon_IE_GOT_11_X:
+ return ELF::R_HEX_IE_GOT_11_X;
+ case fixup_Hexagon_TPREL_32_6_X:
+ return ELF::R_HEX_TPREL_32_6_X;
+ case fixup_Hexagon_TPREL_16_X:
+ return ELF::R_HEX_TPREL_16_X;
+ case fixup_Hexagon_TPREL_11_X:
+ return ELF::R_HEX_TPREL_11_X;
+ case fixup_Hexagon_23_REG:
+ return ELF::R_HEX_23_REG;
+ }
+}
+
+/// Create a little-endian ELF object writer on \p OS that uses a newly
+/// allocated HexagonELFObjectWriter (built from \p OSABI and \p CPU) as its
+/// target writer.
+MCObjectWriter *llvm::createHexagonELFObjectWriter(raw_pwrite_stream &OS,
+                                                   uint8_t OSABI,
+                                                   StringRef CPU) {
+  MCELFObjectTargetWriter *TargetWriter =
+      new HexagonELFObjectWriter(OSABI, CPU);
+  return createELFObjectWriter(TargetWriter, OS, /*IsLittleEndian*/ true);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
new file mode 100644
index 000000000000..4c97ebbdd346
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
@@ -0,0 +1,138 @@
+//===-- HexagonFixupKinds.h - Hexagon Specific Fixup Entries --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_HEXAGON_HEXAGONFIXUPKINDS_H
+#define LLVM_HEXAGON_HEXAGONFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Hexagon {
+enum Fixups {
+ // Target fixup kinds, numbered from FirstTargetFixupKind. The first three are the branch fixups for R_HEX_B{22,15,7}_PCREL.
+ fixup_Hexagon_B22_PCREL = FirstTargetFixupKind,
+ fixup_Hexagon_B15_PCREL,
+ fixup_Hexagon_B7_PCREL,
+ fixup_Hexagon_LO16,
+ fixup_Hexagon_HI16,
+ fixup_Hexagon_32,
+ fixup_Hexagon_16,
+ fixup_Hexagon_8,
+ fixup_Hexagon_GPREL16_0,
+ fixup_Hexagon_GPREL16_1,
+ fixup_Hexagon_GPREL16_2,
+ fixup_Hexagon_GPREL16_3,
+ fixup_Hexagon_HL16,
+ fixup_Hexagon_B13_PCREL,
+ fixup_Hexagon_B9_PCREL,
+ fixup_Hexagon_B32_PCREL_X,
+ fixup_Hexagon_32_6_X,
+ fixup_Hexagon_B22_PCREL_X,
+ fixup_Hexagon_B15_PCREL_X,
+ fixup_Hexagon_B13_PCREL_X,
+ fixup_Hexagon_B9_PCREL_X,
+ fixup_Hexagon_B7_PCREL_X,
+ fixup_Hexagon_16_X,
+ fixup_Hexagon_12_X,
+ fixup_Hexagon_11_X,
+ fixup_Hexagon_10_X,
+ fixup_Hexagon_9_X,
+ fixup_Hexagon_8_X,
+ fixup_Hexagon_7_X,
+ fixup_Hexagon_6_X,
+ fixup_Hexagon_32_PCREL,
+ fixup_Hexagon_COPY,
+ fixup_Hexagon_GLOB_DAT,
+ fixup_Hexagon_JMP_SLOT,
+ fixup_Hexagon_RELATIVE,
+ fixup_Hexagon_PLT_B22_PCREL,
+ fixup_Hexagon_GOTREL_LO16,
+ fixup_Hexagon_GOTREL_HI16,
+ fixup_Hexagon_GOTREL_32,
+ fixup_Hexagon_GOT_LO16,
+ fixup_Hexagon_GOT_HI16,
+ fixup_Hexagon_GOT_32,
+ fixup_Hexagon_GOT_16,
+ fixup_Hexagon_DTPMOD_32,
+ fixup_Hexagon_DTPREL_LO16,
+ fixup_Hexagon_DTPREL_HI16,
+ fixup_Hexagon_DTPREL_32,
+ fixup_Hexagon_DTPREL_16,
+ fixup_Hexagon_GD_PLT_B22_PCREL,
+ fixup_Hexagon_LD_PLT_B22_PCREL,
+ fixup_Hexagon_GD_GOT_LO16,
+ fixup_Hexagon_GD_GOT_HI16,
+ fixup_Hexagon_GD_GOT_32,
+ fixup_Hexagon_GD_GOT_16,
+ fixup_Hexagon_LD_GOT_LO16,
+ fixup_Hexagon_LD_GOT_HI16,
+ fixup_Hexagon_LD_GOT_32,
+ fixup_Hexagon_LD_GOT_16,
+ fixup_Hexagon_IE_LO16,
+ fixup_Hexagon_IE_HI16,
+ fixup_Hexagon_IE_32,
+ fixup_Hexagon_IE_16,
+ fixup_Hexagon_IE_GOT_LO16,
+ fixup_Hexagon_IE_GOT_HI16,
+ fixup_Hexagon_IE_GOT_32,
+ fixup_Hexagon_IE_GOT_16,
+ fixup_Hexagon_TPREL_LO16,
+ fixup_Hexagon_TPREL_HI16,
+ fixup_Hexagon_TPREL_32,
+ fixup_Hexagon_TPREL_16,
+ fixup_Hexagon_6_PCREL_X,
+ fixup_Hexagon_GOTREL_32_6_X,
+ fixup_Hexagon_GOTREL_16_X,
+ fixup_Hexagon_GOTREL_11_X,
+ fixup_Hexagon_GOT_32_6_X,
+ fixup_Hexagon_GOT_16_X,
+ fixup_Hexagon_GOT_11_X,
+ fixup_Hexagon_DTPREL_32_6_X,
+ fixup_Hexagon_DTPREL_16_X,
+ fixup_Hexagon_DTPREL_11_X,
+ fixup_Hexagon_GD_GOT_32_6_X,
+ fixup_Hexagon_GD_GOT_16_X,
+ fixup_Hexagon_GD_GOT_11_X,
+ fixup_Hexagon_LD_GOT_32_6_X,
+ fixup_Hexagon_LD_GOT_16_X,
+ fixup_Hexagon_LD_GOT_11_X,
+ fixup_Hexagon_IE_32_6_X,
+ fixup_Hexagon_IE_16_X,
+ fixup_Hexagon_IE_GOT_32_6_X,
+ fixup_Hexagon_IE_GOT_16_X,
+ fixup_Hexagon_IE_GOT_11_X,
+ fixup_Hexagon_TPREL_32_6_X,
+ fixup_Hexagon_TPREL_16_X,
+ fixup_Hexagon_TPREL_11_X,
+ fixup_Hexagon_23_REG,
+
+ LastTargetFixupKind, // one past the last real fixup kind
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind // count of the fixup kinds listed above
+};
+enum FixupBitmaps : unsigned { // NOTE(review): presumably masks of the bits that hold each fixup's value in the encoded word - confirm against the users of these constants
+ Word8 = 0xff,
+ Word16 = 0xffff,
+ Word32 = 0xffffffff,
+ Word32_LO = 0x00c03fff,
+ Word32_HL = 0x0, // Not Implemented
+ Word32_GP = 0x0, // Not Implemented
+ Word32_B7 = 0x00001f18,
+ Word32_B9 = 0x003000fe,
+ Word32_B13 = 0x00202ffe,
+ Word32_B15 = 0x00df20fe,
+ Word32_B22 = 0x01ff3ffe,
+ Word32_R6 = 0x000007e0,
+ Word32_U6 = 0x0, // Not Implemented
+ Word32_U16 = 0x0, // Not Implemented
+ Word32_X26 = 0x0fff3fff
+};
+} // namespace Hexagon
+} // namespace llvm
+
+#endif // LLVM_HEXAGON_HEXAGONFIXUPKINDS_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
new file mode 100644
index 000000000000..42fcc5a6aa89
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -0,0 +1,227 @@
+//===- HexagonInstPrinter.cpp - Convert Hexagon MCInst to assembly syntax -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Hexagon MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonAsmPrinter.h"
+#include "HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define GET_INSTRUCTION_NAME
+#include "HexagonGenAsmWriter.inc"
+
+/// Forward \p MAI, \p MII and \p MRI to MCInstPrinter, keep a reference to
+/// the instruction info, and start with no pending extender.
+HexagonInstPrinter::HexagonInstPrinter(MCAsmInfo const &MAI,
+                                       MCInstrInfo const &MII,
+                                       MCRegisterInfo const &MRI)
+    : MCInstPrinter(MAI, MII, MRI),
+      MII(MII),
+      HasExtender(false) {}
+
+/// Return the name the instruction info records for \p Opcode.
+StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const {
+  StringRef Name = MII.getName(Opcode);
+  return Name;
+}
+
+/// Write the name of register \p RegNo, as given by getRegName(), to \p O.
+void HexagonInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+  StringRef Name = getRegName(RegNo);
+  O << Name;
+}
+
+/// Return the result of getRegisterName() for \p RegNo as a StringRef.
+StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
+  StringRef Name = getRegisterName(RegNo);
+  return Name;
+}
+
+/// Record in HasExtender whether \p MCI is an immediate extender, so that the
+/// next instruction printed knows whether one preceded it.
+void HexagonInstPrinter::setExtender(MCInst const &MCI) {
+  bool const IsImmext = HexagonMCInstrInfo::isImmext(MCI);
+  HasExtender = IsImmext;
+}
+
+/// Print the bundle \p MI to \p OS, one instruction per line.
+///
+/// The two halves of a duplex are printed on the same line, operand 1 first,
+/// separated by a vertical tab. After the instructions, the end-of-loop
+/// markers for an inner and/or outer loop are printed, separated by a single
+/// space when both are present. \p Annot and \p STI are not used.
+void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                   StringRef Annot, const MCSubtargetInfo &STI) {
+  assert(HexagonMCInstrInfo::isBundle(*MI));
+  assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
+  assert(HexagonMCInstrInfo::bundleSize(*MI) > 0);
+
+  HasExtender = false;
+  for (auto const &Entry : HexagonMCInstrInfo::bundleInstructions(*MI)) {
+    MCInst const &Sub = *Entry.getInst();
+    if (!HexagonMCInstrInfo::isDuplex(MII, Sub)) {
+      printInstruction(&Sub, OS);
+    } else {
+      printInstruction(Sub.getOperand(1).getInst(), OS);
+      OS << '\v';
+      // An extender seen earlier does not carry over to the second half.
+      HasExtender = false;
+      printInstruction(Sub.getOperand(0).getInst(), OS);
+    }
+    // Remember whether this was an extender for the next instruction.
+    setExtender(Sub);
+    OS << "\n";
+  }
+
+  bool const InnerLoop = HexagonMCInstrInfo::isInnerLoop(*MI);
+  if (InnerLoop) {
+    MCInst EndLoop0;
+    EndLoop0.setOpcode(Hexagon::ENDLOOP0);
+    printInstruction(&EndLoop0, OS);
+  }
+  if (HexagonMCInstrInfo::isOuterLoop(*MI)) {
+    // Separate the two markers only when the inner one was printed.
+    if (InnerLoop)
+      OS << " ";
+    MCInst EndLoop1;
+    EndLoop1.setOpcode(Hexagon::ENDLOOP1);
+    printInstruction(&EndLoop1, OS);
+  }
+}
+
+/// Print operand \p OpNo of \p MI to \p O.
+///
+/// A '#' is emitted first when the operand is the instruction's extendable
+/// operand and either an extender preceded the instruction or the
+/// instruction is constant-extended. Registers are printed by name;
+/// expressions are printed as a formatted immediate when they evaluate to a
+/// constant and symbolically otherwise. Any other operand kind is treated as
+/// unreachable.
+void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo,
+                                      raw_ostream &O) const {
+  bool const IsExtendableOp =
+      HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo;
+  if (IsExtendableOp &&
+      (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI)))
+    O << "#";
+
+  MCOperand const &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+    return;
+  }
+  if (!Op.isExpr())
+    llvm_unreachable("Unknown operand");
+
+  int64_t Constant;
+  if (Op.getExpr()->evaluateAsAbsolute(Constant))
+    O << formatImm(Constant);
+  else
+    O << *Op.getExpr();
+}
+
+/// Print operand \p OpNo of \p MI; identical to printOperand().
+void HexagonInstPrinter::printExtOperand(MCInst const *MI, unsigned OpNo,
+                                         raw_ostream &O) const {
+  // No special handling: defer to the generic operand printer.
+  printOperand(MI, OpNo, O);
+}
+
+/// Print the immediate value of operand \p OpNo of \p MI to \p O, exactly as
+/// returned by getImm().
+void HexagonInstPrinter::printUnsignedImmOperand(MCInst const *MI,
+                                                 unsigned OpNo,
+                                                 raw_ostream &O) const {
+  auto const Imm = MI->getOperand(OpNo).getImm();
+  O << Imm;
+}
+
+/// Print the negated immediate value of operand \p OpNo of \p MI to \p O.
+void HexagonInstPrinter::printNegImmOperand(MCInst const *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  auto const Imm = MI->getOperand(OpNo).getImm();
+  O << -Imm;
+}
+
+/// Print the constant -1 to \p O; \p MI and \p OpNo are not consulted.
+void HexagonInstPrinter::printNOneImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  int const MinusOne = -1;
+  O << MinusOne;
+}
+
+/// Print operand \p OpNo of \p MI as an immediate scaled down by 64.
+///
+/// The operand must be an expression that evaluates to a constant. The value
+/// is sign-extended from 9 bits, its low 6 bits must be zero, and the
+/// quotient by 64 is printed.
+void HexagonInstPrinter::prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  // Start from a defined value so a failed evaluation never reads garbage.
+  int64_t Imm = 0;
+  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+  // Check the evaluation before the value is used.
+  assert(Success && "Operand must evaluate to a constant");
+  (void)Success;
+  Imm = SignExtend64<9>(Imm);
+  assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
+  O << formatImm(Imm/64);
+}
+
+/// Print operand \p OpNo of \p MI as an immediate scaled down by 128.
+///
+/// The operand must be an expression that evaluates to a constant. The value
+/// is sign-extended from 10 bits, its low 7 bits must be zero, and the
+/// quotient by 128 is printed.
+void HexagonInstPrinter::prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  // Start from a defined value so a failed evaluation never reads garbage.
+  int64_t Imm = 0;
+  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+  // Check the evaluation before the value is used.
+  assert(Success && "Operand must evaluate to a constant");
+  (void)Success;
+  Imm = SignExtend64<10>(Imm);
+  assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
+  O << formatImm(Imm/128);
+}
+
+/// Print operand \p OpNo of \p MI as an immediate scaled down by 64.
+///
+/// The operand must be an expression that evaluates to a constant. The value
+/// is sign-extended from 10 bits, its low 6 bits must be zero, and the
+/// quotient by 64 is printed.
+void HexagonInstPrinter::prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  // Start from a defined value so a failed evaluation never reads garbage.
+  int64_t Imm = 0;
+  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+  // Check the evaluation before the value is used.
+  assert(Success && "Operand must evaluate to a constant");
+  (void)Success;
+  Imm = SignExtend64<10>(Imm);
+  assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
+  O << formatImm(Imm/64);
+}
+
+/// Print operand \p OpNo of \p MI as an immediate scaled down by 128.
+///
+/// The operand must be an expression that evaluates to a constant. The value
+/// is sign-extended from 11 bits, its low 7 bits must be zero, and the
+/// quotient by 128 is printed.
+void HexagonInstPrinter::prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  // Start from a defined value so a failed evaluation never reads garbage.
+  int64_t Imm = 0;
+  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+  // Check the evaluation before the value is used.
+  assert(Success && "Operand must evaluate to a constant");
+  (void)Success;
+  Imm = SignExtend64<11>(Imm);
+  assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
+  O << formatImm(Imm/128);
+}
+
+/// Print operand \p OpNo of \p MI; identical to printOperand().
+void HexagonInstPrinter::printGlobalOperand(MCInst const *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  // No special handling: defer to the generic operand printer.
+  printOperand(MI, OpNo, O);
+}
+
+/// Print operand \p OpNo of \p MI via printOperand(); the operand is required
+/// to be an expression.
+void HexagonInstPrinter::printJumpTable(MCInst const *MI, unsigned OpNo,
+                                        raw_ostream &O) const {
+  MCOperand const &Op = MI->getOperand(OpNo);
+  assert(Op.isExpr() && "Expecting expression");
+  (void)Op;
+
+  printOperand(MI, OpNo, O);
+}
+
+/// Print operand \p OpNo of \p MI via printOperand(); the operand is required
+/// to be an expression.
+void HexagonInstPrinter::printConstantPool(MCInst const *MI, unsigned OpNo,
+                                           raw_ostream &O) const {
+  MCOperand const &Op = MI->getOperand(OpNo);
+  assert(Op.isExpr() && "Expecting expression");
+  (void)Op;
+
+  printOperand(MI, OpNo, O);
+}
+
+/// Placeholder for printing a branch operand; calling it is treated as
+/// unreachable.
+void HexagonInstPrinter::printBranchOperand(MCInst const *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  // Branches can take an immediate operand (the branch selection pass uses
+  // this to print $+8, an eight byte displacement from the PC), but no such
+  // printing is implemented here.
+  llvm_unreachable("Unknown branch operand.");
+}
+
+/// Intentionally prints nothing.
+void HexagonInstPrinter::printCallOperand(MCInst const *MI, unsigned OpNo,
+                                          raw_ostream &O) const {
+}
+
+/// Intentionally prints nothing.
+void HexagonInstPrinter::printAbsAddrOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+}
+
+/// Intentionally prints nothing.
+void HexagonInstPrinter::printPredicateOperand(MCInst const *MI, unsigned OpNo,
+                                               raw_ostream &O) const {
+}
+
+/// Print operand \p OpNo of \p MI wrapped as "#HI(#...)" or "#LO(#...)",
+/// chosen by \p hi.
+///
+/// NOTE(review): the operand is asserted to be an immediate, yet the inner
+/// value is printed through printOperand(), which only handles register and
+/// expression operands - confirm this path is actually reachable.
+void HexagonInstPrinter::printSymbol(MCInst const *MI, unsigned OpNo,
+                                     raw_ostream &O, bool hi) const {
+  assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand");
+
+  char const *const Half = hi ? "HI" : "LO";
+  O << '#' << Half << '(' << '#';
+  printOperand(MI, OpNo, O);
+  O << ')';
+}
+
+/// Print the branch target held in operand \p OpNo of \p MI.
+///
+/// The operand must be an expression. A target that evaluates to a constant
+/// is printed in hexadecimal with a "0x" prefix. Otherwise the expression is
+/// printed symbolically, preceded by "##" when the instruction is extended
+/// (an extender preceded it or it is constant-extended) and \p OpNo is its
+/// extendable operand.
+void HexagonInstPrinter::printBrtarget(MCInst const *MI, unsigned OpNo,
+                                       raw_ostream &O) const {
+  MCOperand const &Op = MI->getOperand(OpNo);
+  assert (Op.isExpr());
+  MCExpr const &TargetExpr = *Op.getExpr();
+
+  int64_t Address;
+  if (TargetExpr.evaluateAsAbsolute(Address)) {
+    O << format("0x%" PRIx64, Address);
+    return;
+  }
+
+  bool const Extended =
+      HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI);
+  if (Extended && HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo)
+    O << "##";
+  O << TargetExpr;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
new file mode 100644
index 000000000000..5f421184b20a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -0,0 +1,92 @@
+//===-- HexagonInstPrinter.h - Convert Hexagon MCInst to assembly syntax --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares HexagonInstPrinter, which prints Hexagon MCInsts as asm.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H
+#define LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+/// Prints bundles as a newline-separated list of individual instructions.
+/// Duplexes are separated by a vertical tab (\v) character.
+/// A trailing line includes bundle properties such as endloop0/1.
+///
+/// r0 = add(r1, r2)
+/// r0 = #0 \v jump 0x0
+/// :endloop0 :endloop1
+class HexagonInstPrinter : public MCInstPrinter {
+public:
+ explicit HexagonInstPrinter(MCAsmInfo const &MAI, MCInstrInfo const &MII,
+ MCRegisterInfo const &MRI);
+ void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ virtual StringRef getOpcodeName(unsigned Opcode) const;
+ void printInstruction(MCInst const *MI, raw_ostream &O);
+
+ StringRef getRegName(unsigned RegNo) const;
+ static char const *getRegisterName(unsigned RegNo);
+ void printRegName(raw_ostream &O, unsigned RegNo) const override;
+
+ // Per-operand printing hooks. Each one prints operand OpNo of MI to O.
+ void printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+ void printExtOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+ void printUnsignedImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printNegImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printNOneImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ // Always ends in llvm_unreachable; never prints.
+ void printBranchOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ // The next three hooks have empty definitions and print nothing.
+ void printCallOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+ void printAbsAddrOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printPredicateOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printGlobalOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ // Asserts that the operand is an expression, then defers to printOperand.
+ void printJumpTable(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+ // Prints an expression operand: as hex when it evaluates to a constant,
+ // otherwise as the expression, "##"-prefixed when it is the extended operand.
+ void printBrtarget(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+
+ // Asserts that the operand is an expression, then defers to printOperand.
+ void printConstantPool(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+
+ // Prints the operand wrapped as "#HI(#...)".
+ void printSymbolHi(MCInst const *MI, unsigned OpNo, raw_ostream &O) const {
+ printSymbol(MI, OpNo, O, true);
+ }
+ // Prints the operand wrapped as "#LO(#...)".
+ void printSymbolLo(MCInst const *MI, unsigned OpNo, raw_ostream &O) const {
+ printSymbol(MI, OpNo, O, false);
+ }
+
+ MCAsmInfo const &getMAI() const { return MAI; }
+ MCInstrInfo const &getMII() const { return MII; }
+
+protected:
+ // Shared implementation of printSymbolHi/printSymbolLo; `hi` selects the
+ // "HI" or "LO" wrapper. The operand must be an immediate.
+ void printSymbol(MCInst const *MI, unsigned OpNo, raw_ostream &O,
+ bool hi) const;
+
+private:
+ MCInstrInfo const &MII;
+
+ // Consulted by printBrtarget when deciding whether to emit the "##" prefix.
+ // Presumably maintained by setExtender() -- its definition is not shown
+ // here; confirm.
+ bool HasExtender;
+ void setExtender(MCInst const &MCI);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
new file mode 100644
index 000000000000..c619c36164cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -0,0 +1,37 @@
+//===-- HexagonMCAsmInfo.cpp - Hexagon asm properties ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the HexagonMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCAsmInfo.h"
+
+using namespace llvm;
+
+// Pin the vtable to this file: anchor() is the class's out-of-line virtual
+// method and is deliberately empty.
+void HexagonMCAsmInfo::anchor() {}
+
+// Sets the assembler-syntax properties used for Hexagon. The triple argument
+// is accepted but not consulted.
+HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
+ Data16bitsDirective = "\t.half\t";
+ Data32bitsDirective = "\t.word\t";
+ Data64bitsDirective = nullptr; // No directive is provided for 64-bit data.
+ CommentString = "//";
+
+ LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
+ // NOTE(review): the inline-asm markers start with '#' although CommentString
+ // is "//" -- confirm the assembler accepts both comment forms.
+ InlineAsmStart = "# InlineAsm Start";
+ InlineAsmEnd = "# InlineAsm End";
+ ZeroDirective = "\t.space\t";
+ AscizDirective = "\t.string\t";
+
+ SupportsDebugInformation = true;
+ MinInstAlignment = 4;
+ UsesELFSectionDirectiveForBSS = true;
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
new file mode 100644
index 000000000000..efeff2436234
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- HexagonMCAsmInfo.h - Hexagon asm properties ------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the HexagonMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCASMINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+/// Assembler-syntax description for the Hexagon target, layered on the
+/// generic ELF description.
+class HexagonMCAsmInfo : public MCAsmInfoELF {
+ // Out-of-line virtual method used to pin the vtable to one file.
+ void anchor() override;
+
+public:
+ /// Fills in the Hexagon directive strings and feature flags.
+ explicit HexagonMCAsmInfo(const Triple &TT);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
new file mode 100644
index 000000000000..07c9ad96a0d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -0,0 +1,585 @@
+//===----- HexagonMCChecker.cpp - Instruction bundle checking -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCChecker.h"
+
+#include "HexagonBaseInfo.h"
+
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Hidden command-line switch (off by default). When set, hasValidNewValueDef
+// uses its relaxed rules for matching predicated new-value producers and
+// consumers.
+static cl::opt<bool> RelaxNVChecks("relax-nv-checks", cl::init(false),
+ cl::ZeroOrMore, cl::Hidden, cl::desc("Relax checks of new-value validity"));
+
+// The PredSense recorded for a definition that is not predicated.
+const HexagonMCChecker::PredSense
+ HexagonMCChecker::Unconditional(Hexagon::NoRegister, false);
+
+void HexagonMCChecker::init() {
+  // The PC is the one register placed in the read-only set.
+  ReadOnly.insert(Hexagon::PC);
+
+  // A packet flagged as the end of a loop is recorded as unconditionally
+  // defining that loop's SA and LC registers.
+  // FIXME: define or change the SA register?
+  auto DefineLoopRegs = [this](unsigned SA, unsigned LC) {
+    Defs[SA].insert(Unconditional);
+    Defs[LC].insert(Unconditional);
+  };
+  if (HexagonMCInstrInfo::isInnerLoop(MCB))
+    DefineLoopRegs(Hexagon::SA0, Hexagon::LC0);
+  if (HexagonMCInstrInfo::isOuterLoop(MCB))
+    DefineLoopRegs(Hexagon::SA1, Hexagon::LC1);
+
+  // A lone instruction is scanned directly; a bundle is unfurled and every
+  // member is scanned in order.
+  if (!HexagonMCInstrInfo::isBundle(MCB)) {
+    init(MCB);
+    return;
+  }
+  for (auto const &Member : HexagonMCInstrInfo::bundleInstructions(MCB))
+    init(*Member.getInst());
+}
+
+// Scans one instruction and records its register uses and definitions in the
+// checker's tables (Uses, Defs, SoftDefs, LatePreds, CurDefs, TmpDefs,
+// NewPreds, NewDefs and NewUses). The check*() methods later inspect these
+// tables; nothing is diagnosed here.
+void HexagonMCChecker::init(MCInst const& MCI) {
+ const MCInstrDesc& MCID = HexagonMCInstrInfo::getDesc(MCII, MCI);
+ // Predicate guarding this instruction, if any, and its sense. These are
+ // attached to every definition recorded below.
+ unsigned PredReg = Hexagon::NoRegister;
+ bool isTrue = false;
+
+ // Get used registers.
+ for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+ if (MCI.getOperand(i).isReg()) {
+ unsigned R = MCI.getOperand(i).getReg();
+
+ if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
+ // Note a used predicate register.
+ PredReg = R;
+ isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
+
+ // Note use of new predicate register.
+ if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+ NewPreds.insert(PredReg);
+ }
+ else
+ // Note register use. Super-registers are not tracked directly,
+ // but their components.
+ for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid();
+ ++SRI)
+ if (!MCSubRegIterator(*SRI, &RI).isValid())
+ // Skip super-registers used indirectly.
+ Uses.insert(*SRI);
+ }
+
+ // Get implicit register definitions.
+ if (const MCPhysReg *ImpDef = MCID.getImplicitDefs())
+ for (; *ImpDef; ++ImpDef) {
+ unsigned R = *ImpDef;
+
+ if (Hexagon::R31 != R && MCID.isCall())
+ // Any register other than the LR and the PC is actually a volatile one
+ // as defined by the ABI, not modified implicitly by the call insn.
+ continue;
+ if (Hexagon::PC == R)
+ // Branches are the only insns that can change the PC,
+ // otherwise a read-only register.
+ continue;
+
+ if (Hexagon::USR_OVF == R)
+ // Many insns change the USR implicitly, but only one or another flag.
+ // The instruction table models the USR.OVF flag, which can be implicitly
+ // modified more than once, but cannot be modified in the same packet
+ // with an instruction that modifies it explicitly. Deal with such
+ // situations individually.
+ SoftDefs.insert(R);
+ else if (isPredicateRegister(R) &&
+ HexagonMCInstrInfo::isPredicateLate(MCII, MCI))
+ // Include implicit late predicates.
+ LatePreds.insert(R);
+ else
+ Defs[R].insert(PredSense(PredReg, isTrue));
+ }
+
+ // Figure out explicit register definitions.
+ for (unsigned i = 0; i < MCID.getNumDefs(); ++i) {
+ // S remembers whether R itself has already been scored for this operand.
+ unsigned R = MCI.getOperand(i).getReg(),
+ S = Hexagon::NoRegister;
+ // USR has subregisters (while C8 does not for technical reasons), so
+ // reset R to USR, since we know how to handle multiple defs of USR,
+ // taking into account its subregisters.
+ if (R == Hexagon::C8)
+ R = Hexagon::USR;
+
+ // Note register definitions, direct ones as well as indirect side-effects.
+ // Super-registers are not tracked directly, but their components.
+ for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid();
+ ++SRI) {
+ if (MCSubRegIterator(*SRI, &RI).isValid())
+ // Skip super-registers defined indirectly.
+ continue;
+
+ if (R == *SRI) {
+ if (S == R)
+ // Avoid scoring the defined register multiple times.
+ continue;
+ else
+ // Note that the defined register has already been scored.
+ S = R;
+ }
+
+ if (Hexagon::P3_0 != R && Hexagon::P3_0 == *SRI)
+ // P3:0 is a special case, since multiple predicate register definitions
+ // in a packet are allowed as the equivalent of their logical "and".
+ // Only an explicit definition of P3:0 is noted as such; if a
+ // side-effect, then note as a soft definition.
+ SoftDefs.insert(*SRI);
+ else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) && isPredicateRegister(*SRI))
+ // Some insns produce predicates too late to be used in the same packet.
+ LatePreds.insert(*SRI);
+ else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_CUR_LD)
+ // Current loads should be used in the same packet.
+ // TODO: relies on the impossibility of a current and a temporary load
+ // in the same packet.
+ CurDefs.insert(*SRI), Defs[*SRI].insert(PredSense(PredReg, isTrue));
+ else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_TMP_LD)
+ // Temporary loads should be used in the same packet, but don't commit
+ // results, so it should be disregarded if another insn changes the same
+ // register.
+ // TODO: relies on the impossibility of a current and a temporary load
+ // in the same packet.
+ TmpDefs.insert(*SRI);
+ else if (i <= 1 && llvm::HexagonMCInstrInfo::hasNewValue2(MCII, MCI) )
+ // vshuff(Vx, Vy, Rx) <- Vx(0) and Vy(1) are both source and
+ // destination registers with this instruction; same for vdeal(Vx,Vy,Rx).
+ // They are recorded as uses rather than as definitions.
+ Uses.insert(*SRI);
+ else
+ Defs[*SRI].insert(PredSense(PredReg, isTrue));
+ }
+ }
+
+ // Figure out register definitions that produce new values.
+ if (HexagonMCInstrInfo::hasNewValue(MCII, MCI)) {
+ unsigned R = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+ if (HexagonMCInstrInfo::isCompound(MCII, MCI))
+ compoundRegisterMap(R); // Compound insns have a limited register range.
+
+ for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid();
+ ++SRI)
+ if (!MCSubRegIterator(*SRI, &RI).isValid())
+ // No super-registers defined indirectly.
+ NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+ HexagonMCInstrInfo::isFloat(MCII, MCI)));
+
+ // For fairly unique 2-dot-new producers, example:
+ // vdeal(V1, V9, R0) V1.new and V9.new can be used by consumers.
+ if (HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) {
+ unsigned R2 = HexagonMCInstrInfo::getNewValueOperand2(MCII, MCI).getReg();
+
+ for(MCRegAliasIterator SRI(R2, &RI, !MCSubRegIterator(R2, &RI).isValid());
+ SRI.isValid();
+ ++SRI)
+ if (!MCSubRegIterator(*SRI, &RI).isValid())
+ NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+ HexagonMCInstrInfo::isFloat(MCII, MCI)));
+ }
+ }
+
+ // Figure out uses of new predicate registers: every predicate register
+ // among the source operands of a .new-predicated insn is recorded.
+ if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+ for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+ if (MCI.getOperand(i).isReg()) {
+ unsigned P = MCI.getOperand(i).getReg();
+
+ if (isPredicateRegister(P))
+ NewPreds.insert(P);
+ }
+
+ // Figure out uses of new values.
+ if (HexagonMCInstrInfo::isNewValue(MCII, MCI)) {
+ unsigned N = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+ if (!MCSubRegIterator(N, &RI).isValid()) {
+ // Super-registers cannot use new values.
+ if (MCID.isBranch())
+ NewUses[N] = NewSense::Jmp(llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV);
+ else
+ NewUses[N] = NewSense::Use(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
+ }
+ }
+}
+
+// Binds the checker to the two bundles (mcb and mcbdx) and the target
+// descriptions, then immediately collects the def/use tables via init().
+// All arguments are held by reference.
+HexagonMCChecker::HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &mcb, MCInst &mcbdx,
+ MCRegisterInfo const &ri)
+ : MCB(mcb), MCBDX(mcbdx), RI(ri), MCII(MCII), STI(STI),
+ bLoadErrInfo(false) {
+ init();
+}
+
+bool HexagonMCChecker::check() {
+  // Every check is run, in this order, even after an earlier one has failed,
+  // so that each gets the chance to queue its own diagnostic.
+  bool AllPassed = checkBranches();
+  AllPassed = checkPredicates() && AllPassed;
+  AllPassed = checkNewValues() && AllPassed;
+  AllPassed = checkRegisters() && AllPassed;
+  AllPassed = checkSolo() && AllPassed;
+  AllPassed = checkShuffle() && AllPassed;
+  AllPassed = checkSlots() && AllPassed;
+  return AllPassed;
+}
+
+// Check that the instructions of MCBDX fit in one packet.
+bool HexagonMCChecker::checkSlots() {
+  // Immediate extenders take no slot, a duplex takes two, and every other
+  // instruction takes one.
+  unsigned SlotCount = 0;
+  for (auto Member : HexagonMCInstrInfo::bundleInstructions(MCBDX)) {
+    MCInst const &Inst = *Member.getInst();
+    if (!HexagonMCInstrInfo::isImmext(Inst))
+      SlotCount += HexagonMCInstrInfo::isDuplex(MCII, Inst) ? 2 : 1;
+  }
+
+  if (SlotCount <= HEXAGON_PACKET_SIZE)
+    return true;
+
+  HexagonMCErrInfo Overflow;
+  Overflow.setError(HexagonMCErrInfo::CHECK_ERROR_NOSLOTS);
+  addErrInfo(Overflow);
+  return false;
+}
+
+// Check legal use of branches.
+//
+// Only bundles are inspected; anything else passes. Returns false, after
+// queueing an error, when
+//  - a packet marked as the end of a loop contains any branch or call, or
+//  - a packet contains more than one branch/call and either none of them is
+//    predicated or the last predicated one follows the last unpredicated one.
+bool HexagonMCChecker::checkBranches() {
+  if (!HexagonMCInstrInfo::isBundle(MCB))
+    return true;
+
+  bool HasConditional = false;
+  unsigned Branches = 0;
+  // Bundle positions of the last conditional and unconditional branch seen.
+  // HEXAGON_PRESHUFFLE_PACKET_SIZE stands for "none seen".
+  unsigned ConditionalPos = HEXAGON_PRESHUFFLE_PACKET_SIZE;
+  unsigned UnconditionalPos = HEXAGON_PRESHUFFLE_PACKET_SIZE;
+
+  for (unsigned i = HexagonMCInstrInfo::bundleInstructionsOffset;
+       i < MCB.size(); ++i) {
+    MCInst const &MCI = *MCB.begin()[i].getInst();
+
+    if (HexagonMCInstrInfo::isImmext(MCI))
+      continue;
+
+    MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, MCI);
+    if (!Desc.isBranch() && !Desc.isCall())
+      continue;
+
+    ++Branches;
+    if (HexagonMCInstrInfo::isPredicated(MCII, MCI) ||
+        HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) {
+      HasConditional = true;
+      ConditionalPos = i;
+    } else {
+      UnconditionalPos = i;
+    }
+  }
+
+  // FIXME: should "Defs.count(Hexagon::PC)" be here too?
+  if (Branches != 0 && (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+                        HexagonMCInstrInfo::isOuterLoop(MCB))) {
+    // Error out if there's any branch in a loop-end packet.
+    HexagonMCErrInfo errInfo;
+    errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_ENDLOOP, Hexagon::PC);
+    addErrInfo(errInfo);
+    return false;
+  }
+
+  if (Branches > 1 && (!HasConditional || ConditionalPos > UnconditionalPos)) {
+    // Error out if more than one unconditional branch or
+    // the conditional branch appears after the unconditional one.
+    HexagonMCErrInfo errInfo;
+    errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_BRANCHES);
+    addErrInfo(errInfo);
+    return false;
+  }
+
+  return true;
+}
+
+// Check legal use of predicate registers.
+bool HexagonMCChecker::checkPredicates() {
+  // A predicate consumed as ".new" must be defined in this packet, and not by
+  // an instruction that produces it "late"
+  // (e.g., "{ if (p3.new)... ; p3 = sp1loop0(#r7:2, Rs) }").
+  for (unsigned NewP : NewPreds) {
+    bool const Defined = Defs.count(NewP) != 0;
+    bool const DefinedLate = LatePreds.count(NewP) != 0;
+    if (Defined && !DefinedLate)
+      continue;
+
+    HexagonMCErrInfo Err;
+    Err.setError(HexagonMCErrInfo::CHECK_ERROR_NEWP, NewP);
+    addErrInfo(Err);
+    return false;
+  }
+
+  // A predicate defined "late" may be defined late only once, and must not
+  // also have a regular definition
+  // (e.g., "{ p3 = sp1loop0(...); p3 = cmp.eq(...) }").
+  for (unsigned LateP : LatePreds) {
+    bool const LateTwice = LatePreds.count(LateP) > 1;
+    bool const AlsoRegular = Defs.count(LateP) != 0;
+    if (!LateTwice && !AlsoRegular)
+      continue;
+
+    HexagonMCErrInfo Err;
+    Err.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, LateP);
+    addErrInfo(Err);
+    return false;
+  }
+
+  return true;
+}
+
+// Check legal use of new values.
+//
+// Every register consumed as a new value (an entry of NewUses) must have a
+// compatible producer among the NewDefs recorded for it. On the first
+// register without one, CHECK_ERROR_NEWV is queued for that register and
+// false is returned.
+bool HexagonMCChecker::checkNewValues() {
+  // The HexagonMCErrInfo constructor already resets every field, so the
+  // object needs no further clearing (this used to be a raw memset over it).
+  HexagonMCErrInfo errInfo;
+  for (auto& I : NewUses) {
+    unsigned R = I.first;
+    NewSense &US = I.second;
+
+    if (!hasValidNewValueDef(US, NewDefs[R])) {
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWV, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Check for legal register uses and definitions.
+//
+// Walks the Defs table and returns false, after queueing an error, on the
+// first illegal definition found. Afterwards, queues at most one warning for
+// an unused current or temporary load result; warnings do not fail the check.
+bool HexagonMCChecker::checkRegisters() {
+ HexagonMCErrInfo errInfo;
+ // Check for proper register definitions.
+ for (const auto& I : Defs) {
+ unsigned R = I.first;
+
+ if (ReadOnly.count(R)) {
+ // Error out for definitions of read-only registers.
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_READONLY, R);
+ addErrInfo(errInfo);
+ return false;
+ }
+ // NOTE(review): Defs is a DenseMap, so Defs.count(R) is presumably never
+ // greater than 1 and this branch would be unreachable -- confirm whether
+ // Defs[R].size() > 1 was intended.
+ if (isLoopRegister(R) && Defs.count(R) > 1 &&
+ (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+ HexagonMCInstrInfo::isOuterLoop(MCB))) {
+ // Error out for definitions of loop registers at the end of a loop.
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_LOOP, R);
+ addErrInfo(errInfo);
+ return false;
+ }
+ if (SoftDefs.count(R)) {
+ // Error out for explicit changes to registers also weakly defined
+ // (e.g., "{ usr = r0; r0 = sfadd(...) }").
+ unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:.
+ unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+ addErrInfo(errInfo);
+ return false;
+ }
+ if (!isPredicateRegister(R) && Defs[R].size() > 1) {
+ // Check for multiple register definitions.
+ PredSet &PM = Defs[R];
+
+ // Check for multiple unconditional register definitions.
+ if (PM.count(Unconditional)) {
+ // Error out on an unconditional change when there are any other
+ // changes, conditional or not.
+ unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:.
+ unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+ addErrInfo(errInfo);
+ return false;
+ }
+ // Check for multiple conditional register definitions.
+ for (const auto& J : PM) {
+ PredSense P = J;
+
+ // Check for multiple uses of the same condition.
+ if (PM.count(P) > 1) {
+ // Error out on conditional changes based on the same predicate
+ // (e.g., "{ if (!p0) r0 =...; if (!p0) r0 =... }").
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+ addErrInfo(errInfo);
+ return false;
+ }
+ // Check for the use of the complementary condition.
+ P.second = !P.second;
+ if (PM.count(P) && PM.size() > 2) {
+ // Error out on conditional changes based on the same predicate
+ // multiple times
+ // (e.g., "{ if (p0) r0 =...; if (!p0) r0 =...; if (!p0) r0 =... }").
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+ addErrInfo(errInfo);
+ return false;
+ }
+ }
+ }
+ }
+
+ // Check for use of current definitions.
+ for (const auto& I : CurDefs) {
+ unsigned R = I;
+
+ if (!Uses.count(R)) {
+ // Warn on an unused current definition. Only the first such register is
+ // reported, and the temporary-definition check below is then skipped.
+ errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_CURRENT, R);
+ addErrInfo(errInfo);
+ return true;
+ }
+ }
+
+ // Check for use of temporary definitions.
+ for (const auto& I : TmpDefs) {
+ unsigned R = I;
+
+ if (!Uses.count(R)) {
+ // special case for vhist
+ bool vHistFound = false;
+ for (auto const&HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+ if(llvm::HexagonMCInstrInfo::getType(MCII, *HMI.getInst()) == HexagonII::TypeCVI_HIST) {
+ vHistFound = true; // vhist() implicitly uses ALL REGxx.tmp
+ break;
+ }
+ }
+ // Warn on an unused temporary definition.
+ if (vHistFound == false) {
+ errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_TEMPORARY, R);
+ addErrInfo(errInfo);
+ return true;
+ }
+ }
+ }
+
+ return true;
+}
+
+// Check for legal use of solo insns.
+bool HexagonMCChecker::checkSolo() {
+  // A solo insn only matters when it shares a bundle with something else.
+  if (!HexagonMCInstrInfo::isBundle(MCB) ||
+      HexagonMCInstrInfo::bundleSize(MCB) <= 1)
+    return true;
+
+  for (auto const &Member : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+    if (!llvm::HexagonMCInstrInfo::isSolo(MCII, *Member.getInst()))
+      continue;
+
+    HexagonMCErrInfo Err;
+    Err.setError(HexagonMCErrInfo::CHECK_ERROR_SOLO);
+    addErrInfo(Err);
+    return false;
+  }
+
+  return true;
+}
+
+// Run the shuffler's own checks over both bundles.
+bool HexagonMCChecker::checkShuffle() {
+  HexagonMCErrInfo ShuffleErr;
+
+  // Branch info is lost when duplexing, so MCB is checked first and only a
+  // branch error from it is reported.
+  HexagonMCShuffler MCS(MCII, STI, MCB);
+  if (!MCS.check() &&
+      MCS.getError() == HexagonShuffler::SHUFFLE_ERROR_BRANCHES) {
+    ShuffleErr.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+    ShuffleErr.setShuffleError(MCS.getError());
+    addErrInfo(ShuffleErr);
+    return false;
+  }
+
+  // Any shuffle error on MCBDX is reported.
+  HexagonMCShuffler MCSDX(MCII, STI, MCBDX);
+  if (MCSDX.check())
+    return true;
+
+  ShuffleErr.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+  ShuffleErr.setShuffleError(MCSDX.getError());
+  addErrInfo(ShuffleErr);
+  return false;
+}
+
+// Remap, in place, a register named by a compound insn: R8..R15 become
+// R16..R23 respectively; any other register is left untouched.
+void HexagonMCChecker::compoundRegisterMap(unsigned& Register) {
+  switch (Register) {
+  case Hexagon::R8:  Register = Hexagon::R16; break;
+  case Hexagon::R9:  Register = Hexagon::R17; break;
+  case Hexagon::R10: Register = Hexagon::R18; break;
+  case Hexagon::R11: Register = Hexagon::R19; break;
+  case Hexagon::R12: Register = Hexagon::R20; break;
+  case Hexagon::R13: Register = Hexagon::R21; break;
+  case Hexagon::R14: Register = Hexagon::R22; break;
+  case Hexagon::R15: Register = Hexagon::R23; break;
+  default:
+    break;
+  }
+}
+
+// Returns true if at least one producer in Defs may legally feed the
+// new-value consumer described by Use.
+bool HexagonMCChecker::hasValidNewValueDef(const NewSense &Use,
+                                           const NewSenseList &Defs) const {
+  bool const Relaxed = RelaxNVChecks;
+
+  for (const NewSense &Producer : Defs) {
+    bool const Predicated = Producer.PredReg != 0;
+
+    // NVJ cannot use a new FP value [7.6.1], nor a predicated producer.
+    if (Use.IsNVJ && (Producer.IsFloat || Predicated))
+      continue;
+
+    // An unpredicated producer is fine whether or not the use is predicated.
+    if (!Predicated)
+      return true;
+
+    bool const SameReg = Producer.PredReg == Use.PredReg;
+    bool const SameCond = Producer.Cond == Use.Cond;
+    // Strict: producer and use must share both predicate register and
+    // condition. Relaxed: the only detectable violation is the same register
+    // with the opposing condition.
+    bool const Compatible =
+        Relaxed ? (!SameReg || SameCond) : (SameReg && SameCond);
+    if (Compatible)
+      return true;
+  }
+  return false;
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
new file mode 100644
index 000000000000..33e22798c954
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -0,0 +1,217 @@
+//===----- HexagonMCChecker.h - Instruction bundle checking ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCCHECKER_H
+#define HEXAGONMCCHECKER_H
+
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include <queue>
+#include <set>
+
+using namespace llvm;
+
+namespace llvm {
+class MCOperandInfo;
+
+/// Plain record for one queued diagnostic: the error, warning and shuffle
+/// error codes, plus the register the diagnostic refers to.
+typedef struct {
+ unsigned Error, Warning, ShuffleError;
+ unsigned Register;
+} ErrInfo_T;
+
+/// Holds one diagnostic record together with the codes used to fill it.
+class HexagonMCErrInfo {
+public:
+  enum {
+    CHECK_SUCCESS         = 0,
+    // Errors.
+    CHECK_ERROR_BRANCHES  = 0x00001,
+    CHECK_ERROR_NEWP      = 0x00002,
+    CHECK_ERROR_NEWV      = 0x00004,
+    CHECK_ERROR_REGISTERS = 0x00008,
+    CHECK_ERROR_READONLY  = 0x00010,
+    CHECK_ERROR_LOOP      = 0x00020,
+    CHECK_ERROR_ENDLOOP   = 0x00040,
+    CHECK_ERROR_SOLO      = 0x00080,
+    CHECK_ERROR_SHUFFLE   = 0x00100,
+    CHECK_ERROR_NOSLOTS   = 0x00200,
+    CHECK_ERROR_UNKNOWN   = 0x00400,
+    // Warnings.
+    CHECK_WARN_CURRENT    = 0x10000,
+    CHECK_WARN_TEMPORARY  = 0x20000
+  };
+
+  /// The record itself; this is what gets queued by the checker.
+  ErrInfo_T s;
+
+  /// A new object starts out in the "success" state.
+  HexagonMCErrInfo() { reset(); }
+
+  /// Return every field to its "success" / "no register" value.
+  void reset() {
+    s.Error = CHECK_SUCCESS;
+    s.Warning = CHECK_SUCCESS;
+    s.ShuffleError = HexagonShuffler::SHUFFLE_SUCCESS;
+    s.Register = Hexagon::NoRegister;
+  }
+
+  /// Record error code e, optionally naming the offending register r.
+  void setError(unsigned e, unsigned r = Hexagon::NoRegister) {
+    s.Error = e;
+    s.Register = r;
+  }
+
+  /// Record warning code w, optionally naming the offending register r.
+  void setWarning(unsigned w, unsigned r = Hexagon::NoRegister) {
+    s.Warning = w;
+    s.Register = r;
+  }
+
+  /// Record the shuffler's error code.
+  void setShuffleError(unsigned e) { s.ShuffleError = e; }
+};
+
+/// Check for a valid bundle.
+///
+/// The constructor scans the bundle and fills the def/use tables below;
+/// check() then runs the individual checks. Diagnostics are queued and read
+/// back with getNextErrInfo() followed by the get*() accessors.
+class HexagonMCChecker {
+ /// Insn bundle.
+ MCInst& MCB;
+ /// Second bundle handed to the constructor; used by the slot and shuffle
+ /// checks.
+ MCInst& MCBDX;
+ const MCRegisterInfo& RI;
+ MCInstrInfo const &MCII;
+ MCSubtargetInfo const &STI;
+ /// True when the next accessor call must load a fresh record from the queue.
+ bool bLoadErrInfo;
+
+ /// Set of definitions, keyed by register #. Each entry records the predicate
+ /// register guarding the definition (NoRegister if unpredicated) and whether
+ /// it is predicated true.
+ typedef std::pair<unsigned, bool> PredSense;
+ static const PredSense Unconditional;
+ typedef std::multiset<PredSense> PredSet;
+ typedef std::multiset<PredSense>::iterator PredSetIterator;
+
+ typedef llvm::DenseMap<unsigned, PredSet>::iterator DefsIterator;
+ llvm::DenseMap<unsigned, PredSet> Defs;
+
+ /// Information about how a new-value register is defined or used:
+ /// PredReg = predicate register, 0 if use/def not predicated,
+ /// Cond = true/false for if(PredReg)/if(!PredReg) respectively,
+ /// IsFloat = true if definition produces a floating point value
+ /// (not valid for uses),
+ /// IsNVJ = true if the use is a new-value branch (not valid for
+ /// definitions).
+ struct NewSense {
+ unsigned PredReg;
+ bool IsFloat, IsNVJ, Cond;
+ // The special-case "constructors":
+ static NewSense Jmp(bool isNVJ) {
+ NewSense NS = { /*PredReg=*/ 0, /*IsFloat=*/ false, /*IsNVJ=*/ isNVJ,
+ /*Cond=*/ false };
+ return NS;
+ }
+ static NewSense Use(unsigned PR, bool True) {
+ NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ false, /*IsNVJ=*/ false,
+ /*Cond=*/ True };
+ return NS;
+ }
+ static NewSense Def(unsigned PR, bool True, bool Float) {
+ NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ Float, /*IsNVJ=*/ false,
+ /*Cond=*/ True };
+ return NS;
+ }
+ };
+ /// Set of definitions that produce a new-value register:
+ typedef llvm::SmallVector<NewSense,2> NewSenseList;
+ typedef llvm::DenseMap<unsigned, NewSenseList>::iterator NewDefsIterator;
+ llvm::DenseMap<unsigned, NewSenseList> NewDefs;
+
+ /// Set of weak definitions whose clashes should be enforced selectively.
+ typedef std::set<unsigned>::iterator SoftDefsIterator;
+ std::set<unsigned> SoftDefs;
+
+ /// Set of current definitions committed to the register file.
+ typedef std::set<unsigned>::iterator CurDefsIterator;
+ std::set<unsigned> CurDefs;
+
+ /// Set of temporary definitions not committed to the register file.
+ typedef std::set<unsigned>::iterator TmpDefsIterator;
+ std::set<unsigned> TmpDefs;
+
+ /// Set of new predicates used.
+ typedef std::set<unsigned>::iterator NewPredsIterator;
+ std::set<unsigned> NewPreds;
+
+ /// Set of predicates defined late. A multiset, so that a predicate defined
+ /// late more than once can be detected.
+ typedef std::multiset<unsigned>::iterator LatePredsIterator;
+ std::multiset<unsigned> LatePreds;
+
+ /// Set of uses.
+ typedef std::set<unsigned>::iterator UsesIterator;
+ std::set<unsigned> Uses;
+
+ /// Set of new values used: new register, if new-value jump.
+ typedef llvm::DenseMap<unsigned, NewSense>::iterator NewUsesIterator;
+ llvm::DenseMap<unsigned, NewSense> NewUses;
+
+ /// Pre-defined set of read-only registers.
+ typedef std::set<unsigned>::iterator ReadOnlyIterator;
+ std::set<unsigned> ReadOnly;
+
+ /// Queued diagnostics, oldest first, and the record currently exposed
+ /// through the accessors.
+ std::queue<ErrInfo_T> ErrInfoQ;
+ HexagonMCErrInfo CrntErrInfo;
+
+ /// If a load is pending, pop the oldest queued record into CrntErrInfo, or
+ /// reset CrntErrInfo when the queue is empty. Always clears the pending
+ /// flag.
+ void getErrInfo() {
+ if (bLoadErrInfo == true) {
+ if (ErrInfoQ.empty()) {
+ CrntErrInfo.reset();
+ } else {
+ CrntErrInfo.s = ErrInfoQ.front();
+ ErrInfoQ.pop();
+ }
+ }
+ bLoadErrInfo = false;
+ }
+
+ /// Collect def/use information for the whole bundle, or for one insn.
+ void init();
+ void init(MCInst const&);
+
+ // Checks performed.
+ bool checkBranches();
+ bool checkPredicates();
+ bool checkNewValues();
+ bool checkRegisters();
+ bool checkSolo();
+ bool checkShuffle();
+ bool checkSlots();
+
+ /// Remaps R8..R15 to R16..R23 in place; other registers are unchanged.
+ static void compoundRegisterMap(unsigned&);
+
+ /// True for P0..P3.
+ bool isPredicateRegister(unsigned R) const {
+ return (Hexagon::P0 == R || Hexagon::P1 == R ||
+ Hexagon::P2 == R || Hexagon::P3 == R);
+ };
+ /// True for SA0, LC0, SA1 and LC1.
+ bool isLoopRegister(unsigned R) const {
+ return (Hexagon::SA0 == R || Hexagon::LC0 == R ||
+ Hexagon::SA1 == R || Hexagon::LC1 == R);
+ };
+
+ /// True if some entry of Defs is a legal producer for the new-value Use.
+ bool hasValidNewValueDef(const NewSense &Use,
+ const NewSenseList &Defs) const;
+
+ public:
+ explicit HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst& mcb, MCInst &mcbdx,
+ const MCRegisterInfo& ri);
+
+ /// Run every check; returns true only if all of them pass.
+ bool check();
+
+ /// Add a new error/warning to the queue.
+ void addErrInfo(HexagonMCErrInfo &err) { ErrInfoQ.push(err.s); };
+
+ /// Accessors for the current diagnostic record. Each one first loads a
+ /// record from the queue if a load is pending.
+ unsigned getError() { getErrInfo(); return CrntErrInfo.s.Error; };
+ unsigned getWarning() { getErrInfo(); return CrntErrInfo.s.Warning; };
+ unsigned getShuffleError() { getErrInfo(); return CrntErrInfo.s.ShuffleError; };
+ unsigned getErrRegister() { getErrInfo(); return CrntErrInfo.s.Register; };
+ /// Advance to the next queued diagnostic. Returns false when the queue is
+ /// empty; in that case the load stays pending, so the next accessor call
+ /// resets the current record.
+ bool getNextErrInfo() {
+ bLoadErrInfo = true;
+ return (ErrInfoQ.empty()) ? false : (getErrInfo(), true);
+ }
+};
+
+}
+
+#endif // HEXAGONMCCHECKER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
new file mode 100644
index 000000000000..2645a17b9bd0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -0,0 +1,824 @@
+//===-- HexagonMCCodeEmitter.cpp - Hexagon Target Descriptions ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonFixupKinds.h"
+#include "MCTargetDesc/HexagonMCCodeEmitter.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+using namespace llvm;
+using namespace Hexagon;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+ /// Construct an emitter bound to \p aMII and \p aMCT.
+ ///
+ /// The per-bundle state (Addend, Extended, CurrentBundle) is heap-allocated
+ /// and starts out as 0 / false / nullptr. CurrentBundle is explicitly
+ /// initialised to nullptr so that it never holds an indeterminate pointer
+ /// before the first call to encodeInstruction().
+ HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
+                                            MCContext &aMCT)
+     : MCT(aMCT), MCII(aMII), Addend(new unsigned(0)),
+       Extended(new bool(false)),
+       CurrentBundle(new MCInst const *(nullptr)) {}
+
+ /// Compute the parse bits for instruction \p MCI, which sits at position
+ /// \p Instruction inside bundle \p MCB whose final position is \p Last.
+ ///
+ /// Position 0 of an inner-loop bundle and position 1 of an outer-loop bundle
+ /// get INST_PARSE_LOOP_END; a duplex gets INST_PARSE_DUPLEX; the last
+ /// instruction gets INST_PARSE_PACKET_END; everything else gets
+ /// INST_PARSE_NOT_END.
+ uint32_t HexagonMCCodeEmitter::parseBits(size_t Instruction, size_t Last,
+                                          MCInst const &MCB,
+                                          MCInst const &MCI) const {
+   bool Duplex = HexagonMCInstrInfo::isDuplex(MCII, MCI);
+
+   // The loop queries are only made for the slot they apply to.
+   const bool MarksInnerLoopEnd =
+       Instruction == 0 && HexagonMCInstrInfo::isInnerLoop(MCB);
+   const bool MarksOuterLoopEnd =
+       Instruction == 1 && HexagonMCInstrInfo::isOuterLoop(MCB);
+   if (MarksInnerLoopEnd || MarksOuterLoopEnd) {
+     assert(!Duplex);
+     assert(Instruction != Last);
+     return HexagonII::INST_PARSE_LOOP_END;
+   }
+
+   if (Duplex) {
+     assert(Instruction == Last);
+     return HexagonII::INST_PARSE_DUPLEX;
+   }
+
+   if (Instruction != Last)
+     return HexagonII::INST_PARSE_NOT_END;
+   return HexagonII::INST_PARSE_PACKET_END;
+ }
+
+ /// Encode the bundle \p MI by emitting each contained instruction in order.
+ ///
+ /// Resets the per-bundle state (Addend, Extended, CurrentBundle), then for
+ /// every instruction in the bundle verifies its predicates, encodes it with
+ /// the parse bits for its position, records whether it was an immediate
+ /// extender and advances Addend by HEXAGON_INSTR_SIZE.
+ void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              MCSubtargetInfo const &STI) const {
+   MCInst &HMB = const_cast<MCInst &>(MI);
+   assert(HexagonMCInstrInfo::isBundle(HMB));
+   DEBUG(dbgs() << "Encoding bundle\n";);
+
+   // Fresh per-bundle state.
+   *CurrentBundle = &MI;
+   *Extended = false;
+   *Addend = 0;
+
+   const size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1;
+   size_t Instruction = 0;
+   for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) {
+     MCInst &HMI = const_cast<MCInst &>(*I.getInst());
+     verifyInstructionPredicates(HMI,
+                                 computeAvailableFeatures(STI.getFeatureBits()));
+
+     const uint32_t Parse = parseBits(Instruction, Last, HMB, HMI);
+     EncodeSingleInstruction(HMI, OS, Fixups, STI, Parse, Instruction);
+
+     // State consumed while encoding the following instruction.
+     *Extended = HexagonMCInstrInfo::isImmext(HMI);
+     *Addend += HEXAGON_INSTR_SIZE;
+     ++Instruction;
+   }
+ }
+
+ /// Decide whether \p Consumer reads a value produced into \p Producer or
+ /// \p Producer2.
+ ///
+ /// Besides exact equality with either producer, a consumer in V0..V31 also
+ /// matches a \p Producer in W0..W15 when (Consumer - V0) / 2 equals
+ /// (Producer - W0), i.e. a single vector consumer referencing a double
+ /// producer.
+ static bool RegisterMatches(unsigned Consumer, unsigned Producer,
+                             unsigned Producer2) {
+   if (Consumer == Producer || Consumer == Producer2)
+     return true;
+
+   const bool ProducerIsDouble =
+       Producer >= Hexagon::W0 && Producer <= Hexagon::W15;
+   const bool ConsumerIsSingle =
+       Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31;
+   if (!ProducerIsDouble || !ConsumerIsSingle)
+     return false;
+
+   return ((Consumer - Hexagon::V0) >> 1) == (Producer - Hexagon::W0);
+ }
+
+ /// EncodeSingleInstruction - Emit a single (non-bundle) instruction as one
+ /// 32-bit little-endian word written to \p OS.
+ ///
+ /// \param MI     Instruction to encode. It is copied into a local MCInst so
+ ///               that operands can be rewritten before encoding.
+ /// \param OS     Stream receiving the encoded word.
+ /// \param Fixups Forwarded to getBinaryCodeForInstr().
+ /// \param STI    Forwarded to getBinaryCodeForInstr().
+ /// \param Parse  Parse bits OR'ed into the encoding.
+ /// \param Index  Position of \p MI inside the bundle recorded in
+ ///               CurrentBundle; the new-value producer search starts at the
+ ///               instruction just before it.
+ void HexagonMCCodeEmitter::EncodeSingleInstruction(
+ const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI, uint32_t Parse, size_t Index) const {
+ // Work on a copy: register operands may be rewritten below.
+ MCInst HMB = MI;
+ assert(!HexagonMCInstrInfo::isBundle(HMB));
+ uint64_t Binary;
+
+ // Compound instructions are limited to using registers 0-7 and 16-23
+ // and here we make a map 16-23 to 8-15 so they can be correctly encoded.
+ static unsigned RegMap[8] = {Hexagon::R8, Hexagon::R9, Hexagon::R10,
+ Hexagon::R11, Hexagon::R12, Hexagon::R13,
+ Hexagon::R14, Hexagon::R15};
+
+ // Pseudo instructions don't get encoded and shouldn't be here
+ // in the first place!
+ assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() &&
+ "pseudo-instruction found");
+ DEBUG(dbgs() << "Encoding insn"
+ " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+ "\n");
+
+ // For compound instructions, replace every register operand whose encoding
+ // value lies in 16..23 with the corresponding RegMap entry (R8..R15).
+ if (llvm::HexagonMCInstrInfo::getType(MCII, HMB) == HexagonII::TypeCOMPOUND) {
+ for (unsigned i = 0; i < HMB.getNumOperands(); ++i)
+ if (HMB.getOperand(i).isReg()) {
+ unsigned Reg =
+ MCT.getRegisterInfo()->getEncodingValue(HMB.getOperand(i).getReg());
+ if ((Reg <= 23) && (Reg >= 16))
+ HMB.getOperand(i).setReg(RegMap[Reg - 16]);
+ }
+ }
+
+ if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) {
+ // Calculate the new value distance to the associated producer
+ MCOperand &MCO =
+ HMB.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, HMB));
+ // SOffset counts every non-immext instruction walked over; VOffset counts
+ // only the vector ones among them.
+ unsigned SOffset = 0;
+ unsigned VOffset = 0;
+ unsigned Register = MCO.getReg();
+ unsigned Register1;
+ unsigned Register2;
+ auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+ // Start at the instruction immediately preceding this one and walk
+ // backwards through the bundle.
+ // NOTE(review): the loop has no bound other than the assert below, which
+ // compares against begin() - 1; confirm a producer is always present when
+ // asserts are disabled.
+ auto i = Instructions.begin() + Index - 1;
+ for (;; --i) {
+ assert(i != Instructions.begin() - 1 && "Couldn't find producer");
+ MCInst const &Inst = *i->getInst();
+ // Immediate extenders are skipped and do not contribute to the distance.
+ if (HexagonMCInstrInfo::isImmext(Inst))
+ continue;
+ ++SOffset;
+ if (HexagonMCInstrInfo::isVector(MCII, Inst))
+ // Vector instructions don't count scalars
+ ++VOffset;
+ // Registers this candidate produces as new values (NoRegister if none).
+ Register1 =
+ HexagonMCInstrInfo::hasNewValue(MCII, Inst)
+ ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
+ : static_cast<unsigned>(Hexagon::NoRegister);
+ Register2 =
+ HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
+ ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
+ : static_cast<unsigned>(Hexagon::NoRegister);
+ if (!RegisterMatches(Register, Register1, Register2))
+ // This isn't the register we're looking for
+ continue;
+ if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
+ // Producer is unpredicated
+ break;
+ assert(HexagonMCInstrInfo::isPredicated(MCII, HMB) &&
+ "Unpredicated consumer depending on predicated producer");
+ if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
+ HexagonMCInstrInfo::isPredicatedTrue(MCII, HMB))
+ // Producer predicate sense matched ours
+ break;
+ }
+ // Hexagon PRM 10.11 Construct Nt from distance
+ // The operand becomes R0 + ((distance << 1) | subregister bit), where the
+ // distance is VOffset for a vector consumer and SOffset otherwise.
+ unsigned Offset =
+ HexagonMCInstrInfo::isVector(MCII, HMB) ? VOffset : SOffset;
+ Offset <<= 1;
+ Offset |=
+ HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
+ MCO.setReg(Offset + Hexagon::R0);
+ }
+
+ Binary = getBinaryCodeForInstr(HMB, Fixups, STI);
+ // Check for unimplemented instructions. Immediate extenders
+ // are encoded as zero, so they need to be accounted for.
+ // (DuplexIClass0 is also exempt from the zero-encoding check.)
+ if ((!Binary) &&
+ ((HMB.getOpcode() != DuplexIClass0) && (HMB.getOpcode() != A4_ext) &&
+ (HMB.getOpcode() != A4_ext_b) && (HMB.getOpcode() != A4_ext_c) &&
+ (HMB.getOpcode() != A4_ext_g))) {
+ DEBUG(dbgs() << "Unimplemented inst: "
+ " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+ "\n");
+ llvm_unreachable("Unimplemented Instruction");
+ }
+ Binary |= Parse;
+
+ // if we need to emit a duplexed instruction
+ if (HMB.getOpcode() >= Hexagon::DuplexIClass0 &&
+ HMB.getOpcode() <= Hexagon::DuplexIClassF) {
+ assert(Parse == HexagonII::INST_PARSE_DUPLEX &&
+ "Emitting duplex without duplex parse bits");
+ // Translate the duplex opcode into its 4-bit instruction class (0..15).
+ unsigned dupIClass;
+ switch (HMB.getOpcode()) {
+ case Hexagon::DuplexIClass0:
+ dupIClass = 0;
+ break;
+ case Hexagon::DuplexIClass1:
+ dupIClass = 1;
+ break;
+ case Hexagon::DuplexIClass2:
+ dupIClass = 2;
+ break;
+ case Hexagon::DuplexIClass3:
+ dupIClass = 3;
+ break;
+ case Hexagon::DuplexIClass4:
+ dupIClass = 4;
+ break;
+ case Hexagon::DuplexIClass5:
+ dupIClass = 5;
+ break;
+ case Hexagon::DuplexIClass6:
+ dupIClass = 6;
+ break;
+ case Hexagon::DuplexIClass7:
+ dupIClass = 7;
+ break;
+ case Hexagon::DuplexIClass8:
+ dupIClass = 8;
+ break;
+ case Hexagon::DuplexIClass9:
+ dupIClass = 9;
+ break;
+ case Hexagon::DuplexIClassA:
+ dupIClass = 10;
+ break;
+ case Hexagon::DuplexIClassB:
+ dupIClass = 11;
+ break;
+ case Hexagon::DuplexIClassC:
+ dupIClass = 12;
+ break;
+ case Hexagon::DuplexIClassD:
+ dupIClass = 13;
+ break;
+ case Hexagon::DuplexIClassE:
+ dupIClass = 14;
+ break;
+ case Hexagon::DuplexIClassF:
+ dupIClass = 15;
+ break;
+ default:
+ llvm_unreachable("Unimplemented DuplexIClass");
+ break;
+ }
+ // 29 is the bit position.
+ // 0b1110 =0xE bits are masked off and down shifted by 1 bit.
+ // Last bit is moved to bit position 13
+ // Note: Binary is rebuilt from scratch here, replacing the value computed
+ // above.
+ Binary = ((dupIClass & 0xE) << (29 - 1)) | ((dupIClass & 0x1) << 13);
+
+ // The two sub-instructions are carried as operands 0 and 1 of the duplex.
+ const MCInst *subInst0 = HMB.getOperand(0).getInst();
+ const MCInst *subInst1 = HMB.getOperand(1).getInst();
+
+ // get subinstruction slot 0
+ unsigned subInstSlot0Bits = getBinaryCodeForInstr(*subInst0, Fixups, STI);
+ // get subinstruction slot 1
+ unsigned subInstSlot1Bits = getBinaryCodeForInstr(*subInst1, Fixups, STI);
+
+ // Slot 0 occupies the low bits; slot 1 is placed 16 bits higher.
+ Binary |= subInstSlot0Bits | (subInstSlot1Bits << 16);
+ }
+ // Emit the final word as a little-endian uint32_t.
+ support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
+ ++MCNumEmitted;
+ }
+
+ namespace {
+ /// Report a fatal error for an unsupported (bits, kind) relocation
+ /// combination. Never returns.
+ ///
+ /// Callers place this in `default:` labels that are followed by further case
+ /// labels and depend on control not continuing, so the function is marked
+ /// noreturn to make that contract explicit to the compiler.
+ ///
+ /// \param bits Relocation width reported in the message.
+ /// \param kind Symbol variant kind reported in the message.
+ LLVM_ATTRIBUTE_NORETURN void raise_relocation_error(unsigned bits,
+                                                     unsigned kind) {
+   std::string Text;
+   {
+     // Scoped so the stream is destroyed before Text is read.
+     llvm::raw_string_ostream Stream(Text);
+     Stream << "Unrecognized relocation combination bits: " << bits
+            << " kind: " << kind;
+   }
+   report_fatal_error(Text);
+ }
+ }
+
+ /// getFixupNoBits - Some insns are not extended and thus have no
+ /// bits. These cases require a more brute force method for determining
+ /// the correct relocation.
+ ///
+ /// Selection order:
+ ///  1. TypePREFIX instructions map \p kind to the matching *_32_6_X fixup
+ ///     (B32_PCREL_X for branches with VK_Hexagon_PCREL / VK_None).
+ ///  2. Any other branch gets fixup_Hexagon_B13_PCREL.
+ ///  3. HI / A2_tfrih and LO / A2_tfril map \p kind to the *_HI16 / *_LO16
+ ///     fixups.
+ ///  4. Loads/stores that implicitly use GP get a GPREL16_n fixup chosen by
+ ///     access size.
+ /// Every unsupported combination ends in raise_relocation_error().
+ ///
+ /// \param MCII Instruction info used to look up the descriptor and type.
+ /// \param MI   Instruction being encoded.
+ /// \param MO   Operand being encoded (not inspected by this function).
+ /// \param kind Variant kind of the symbol reference.
+ namespace {
+ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
+                                const MCOperand &MO,
+                                const MCSymbolRefExpr::VariantKind kind) {
+   const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
+   unsigned insnType = llvm::HexagonMCInstrInfo::getType(MCII, MI);
+
+   if (insnType == HexagonII::TypePREFIX) {
+     switch (kind) {
+     case MCSymbolRefExpr::VK_GOTREL:
+       return Hexagon::fixup_Hexagon_GOTREL_32_6_X;
+     case MCSymbolRefExpr::VK_GOT:
+       return Hexagon::fixup_Hexagon_GOT_32_6_X;
+     case MCSymbolRefExpr::VK_TPREL:
+       return Hexagon::fixup_Hexagon_TPREL_32_6_X;
+     case MCSymbolRefExpr::VK_DTPREL:
+       return Hexagon::fixup_Hexagon_DTPREL_32_6_X;
+     case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+       return Hexagon::fixup_Hexagon_GD_GOT_32_6_X;
+     case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+       return Hexagon::fixup_Hexagon_LD_GOT_32_6_X;
+     case MCSymbolRefExpr::VK_Hexagon_IE:
+       return Hexagon::fixup_Hexagon_IE_32_6_X;
+     case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+       return Hexagon::fixup_Hexagon_IE_GOT_32_6_X;
+     case MCSymbolRefExpr::VK_Hexagon_PCREL:
+     case MCSymbolRefExpr::VK_None:
+       if (MCID.isBranch())
+         return Hexagon::fixup_Hexagon_B32_PCREL_X;
+       else
+         return Hexagon::fixup_Hexagon_32_6_X;
+     default:
+       raise_relocation_error(0, kind);
+     }
+   } else if (MCID.isBranch())
+     return Hexagon::fixup_Hexagon_B13_PCREL;
+
+   switch (MCID.getOpcode()) {
+   case Hexagon::HI:
+   case Hexagon::A2_tfrih:
+     switch (kind) {
+     case MCSymbolRefExpr::VK_GOT:
+       return Hexagon::fixup_Hexagon_GOT_HI16;
+     case MCSymbolRefExpr::VK_GOTREL:
+       return Hexagon::fixup_Hexagon_GOTREL_HI16;
+     case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+       return Hexagon::fixup_Hexagon_GD_GOT_HI16;
+     case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+       return Hexagon::fixup_Hexagon_LD_GOT_HI16;
+     case MCSymbolRefExpr::VK_Hexagon_IE:
+       return Hexagon::fixup_Hexagon_IE_HI16;
+     case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+       return Hexagon::fixup_Hexagon_IE_GOT_HI16;
+     case MCSymbolRefExpr::VK_TPREL:
+       return Hexagon::fixup_Hexagon_TPREL_HI16;
+     case MCSymbolRefExpr::VK_DTPREL:
+       return Hexagon::fixup_Hexagon_DTPREL_HI16;
+     case MCSymbolRefExpr::VK_None:
+       return Hexagon::fixup_Hexagon_HI16;
+     default:
+       raise_relocation_error(0, kind);
+     }
+
+   case Hexagon::LO:
+   case Hexagon::A2_tfril:
+     switch (kind) {
+     case MCSymbolRefExpr::VK_GOT:
+       return Hexagon::fixup_Hexagon_GOT_LO16;
+     case MCSymbolRefExpr::VK_GOTREL:
+       return Hexagon::fixup_Hexagon_GOTREL_LO16;
+     case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+       return Hexagon::fixup_Hexagon_GD_GOT_LO16;
+     case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+       return Hexagon::fixup_Hexagon_LD_GOT_LO16;
+     case MCSymbolRefExpr::VK_Hexagon_IE:
+       return Hexagon::fixup_Hexagon_IE_LO16;
+     case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+       return Hexagon::fixup_Hexagon_IE_GOT_LO16;
+     case MCSymbolRefExpr::VK_TPREL:
+       return Hexagon::fixup_Hexagon_TPREL_LO16;
+     case MCSymbolRefExpr::VK_DTPREL:
+       return Hexagon::fixup_Hexagon_DTPREL_LO16;
+     case MCSymbolRefExpr::VK_None:
+       return Hexagon::fixup_Hexagon_LO16;
+     default:
+       raise_relocation_error(0, kind);
+     }
+
+   // The only relocs left should be GP relative:
+   default:
+     if (MCID.mayStore() || MCID.mayLoad()) {
+       // getImplicitUses() yields a zero-terminated list, or null when the
+       // instruction has no implicit uses, so guard the pointer before
+       // dereferencing it.
+       for (const MCPhysReg *ImpUses = MCID.getImplicitUses();
+            ImpUses && *ImpUses; ++ImpUses) {
+         if (*ImpUses != Hexagon::GP)
+           continue;
+         switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) {
+         case HexagonII::MemAccessSize::ByteAccess:
+           return fixup_Hexagon_GPREL16_0;
+         case HexagonII::MemAccessSize::HalfWordAccess:
+           return fixup_Hexagon_GPREL16_1;
+         case HexagonII::MemAccessSize::WordAccess:
+           return fixup_Hexagon_GPREL16_2;
+         case HexagonII::MemAccessSize::DoubleWordAccess:
+           return fixup_Hexagon_GPREL16_3;
+         default:
+           raise_relocation_error(0, kind);
+         }
+       }
+     }
+     raise_relocation_error(0, kind);
+   }
+   llvm_unreachable("Relocation exit not taken");
+ }
+ }
+
+ // Forward declaration of the instruction descriptor table, which is defined
+ // elsewhere. It is not referenced by the hand-written code visible in this
+ // file; presumably it is needed by the generated include at the bottom --
+ // TODO confirm.
+ namespace llvm {
+ extern const MCInstrDesc HexagonInsts[];
+ }
+
+ namespace {
+ /// Return true when fixup kind \p Kind is one of the PC-relative Hexagon
+ /// fixups listed below, false for every other kind.
+ bool isPCRel(unsigned Kind) {
+   bool PCRelative = false;
+   switch (Kind) {
+   case fixup_Hexagon_B22_PCREL:
+   case fixup_Hexagon_B15_PCREL:
+   case fixup_Hexagon_B7_PCREL:
+   case fixup_Hexagon_B13_PCREL:
+   case fixup_Hexagon_B9_PCREL:
+   case fixup_Hexagon_B32_PCREL_X:
+   case fixup_Hexagon_B22_PCREL_X:
+   case fixup_Hexagon_B15_PCREL_X:
+   case fixup_Hexagon_B13_PCREL_X:
+   case fixup_Hexagon_B9_PCREL_X:
+   case fixup_Hexagon_B7_PCREL_X:
+   case fixup_Hexagon_32_PCREL:
+   case fixup_Hexagon_PLT_B22_PCREL:
+   case fixup_Hexagon_GD_PLT_B22_PCREL:
+   case fixup_Hexagon_LD_PLT_B22_PCREL:
+   case fixup_Hexagon_6_PCREL_X:
+     PCRelative = true;
+     break;
+   default:
+     break;
+   }
+   return PCRelative;
+ }
+ }
+
+ /// Encode an expression operand, recording a fixup when the expression
+ /// cannot be evaluated to a constant.
+ ///
+ /// If \p ME evaluates to an absolute value, that value is returned. For a
+ /// binary expression both sides are processed recursively and 0 is returned.
+ /// Otherwise \p ME is treated as a symbol reference: a fixup kind is chosen
+ /// from the relocation width (extent bits minus extent alignment), the
+ /// symbol's variant kind and the *Extended flag, a fixup at offset *Addend
+ /// is appended to \p Fixups, and 0 is returned. Unsupported combinations end
+ /// in raise_relocation_error().
+ ///
+ /// \param MI     Instruction that owns the operand.
+ /// \param MO     Operand being encoded; its expression is what is stored in
+ ///               the fixup.
+ /// \param ME     Expression to evaluate (may be a sub-expression of MO's).
+ /// \param Fixups Receives the generated fixup.
+ /// \param STI    Only forwarded to the recursive calls.
+ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
+ const MCOperand &MO,
+ const MCExpr *ME,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const
+
+ {
+ // Look through the HexagonMCExpr wrapper to the underlying expression.
+ if (isa<HexagonMCExpr>(ME))
+ ME = &HexagonMCInstrInfo::getExpr(*ME);
+ int64_t Value;
+ if (ME->evaluateAsAbsolute(Value))
+ return Value;
+ assert(ME->getKind() == MCExpr::SymbolRef || ME->getKind() == MCExpr::Binary);
+ if (ME->getKind() == MCExpr::Binary) {
+ // Process both operands for their side effect on Fixups; the returned
+ // values are ignored.
+ MCBinaryExpr const *Binary = cast<MCBinaryExpr>(ME);
+ getExprOpValue(MI, MO, Binary->getLHS(), Fixups, STI);
+ getExprOpValue(MI, MO, Binary->getRHS(), Fixups, STI);
+ return 0;
+ }
+ // Initial value only; every path through the switch below either assigns
+ // FixupKind or calls raise_relocation_error().
+ Hexagon::Fixups FixupKind =
+ Hexagon::Fixups(Hexagon::fixup_Hexagon_TPREL_LO16);
+ // Only the assert above guarantees that ME is a symbol reference here.
+ const MCSymbolRefExpr *MCSRE = static_cast<const MCSymbolRefExpr *>(ME);
+ const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
+ // Relocation width: extent bits minus extent alignment.
+ unsigned bits = HexagonMCInstrInfo::getExtentBits(MCII, MI) -
+ HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ const MCSymbolRefExpr::VariantKind kind = MCSRE->getKind();
+
+ DEBUG(dbgs() << "----------------------------------------\n");
+ DEBUG(dbgs() << "Opcode Name: " << HexagonMCInstrInfo::getName(MCII, MI)
+ << "\n");
+ DEBUG(dbgs() << "Opcode: " << MCID.getOpcode() << "\n");
+ DEBUG(dbgs() << "Relocation bits: " << bits << "\n");
+ DEBUG(dbgs() << "Addend: " << *Addend << "\n");
+ DEBUG(dbgs() << "----------------------------------------\n");
+
+ // Select the fixup by relocation width, then by variant kind.
+ // Note: the `default:` labels are followed directly by other cases and
+ // depend on raise_relocation_error() not returning.
+ switch (bits) {
+ default:
+ raise_relocation_error(bits, kind);
+ case 32:
+ switch (kind) {
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_DTPREL_32_6_X
+ : Hexagon::fixup_Hexagon_DTPREL_32;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOT_32_6_X
+ : Hexagon::fixup_Hexagon_GOT_32;
+ break;
+ case MCSymbolRefExpr::VK_GOTREL:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOTREL_32_6_X
+ : Hexagon::fixup_Hexagon_GOTREL_32;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_GD_GOT_32_6_X
+ : Hexagon::fixup_Hexagon_GD_GOT_32;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_IE:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_32_6_X
+ : Hexagon::fixup_Hexagon_IE_32;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_GOT_32_6_X
+ : Hexagon::fixup_Hexagon_IE_GOT_32;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_LD_GOT_32_6_X
+ : Hexagon::fixup_Hexagon_LD_GOT_32;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_PCREL:
+ FixupKind = Hexagon::fixup_Hexagon_32_PCREL;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind =
+ *Extended ? Hexagon::fixup_Hexagon_32_6_X : Hexagon::fixup_Hexagon_32;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_TPREL_32_6_X
+ : Hexagon::fixup_Hexagon_TPREL_32;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ break;
+
+ case 22:
+ switch (kind) {
+ case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
+ FixupKind = Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
+ FixupKind = Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_B22_PCREL_X
+ : Hexagon::fixup_Hexagon_B22_PCREL;
+ break;
+ case MCSymbolRefExpr::VK_PLT:
+ FixupKind = Hexagon::fixup_Hexagon_PLT_B22_PCREL;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ break;
+
+ case 16:
+ if (*Extended) {
+ switch (kind) {
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
+ break;
+ case MCSymbolRefExpr::VK_GOTREL:
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_IE:
+ FixupKind = Hexagon::fixup_Hexagon_IE_16_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16_X;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_16_X;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ } else
+ switch (kind) {
+ case MCSymbolRefExpr::VK_None: {
+ // A plain symbol is only accepted here when the operand's expression is
+ // flagged by s23_2_reloc().
+ if (HexagonMCInstrInfo::s23_2_reloc(*MO.getExpr()))
+ FixupKind = Hexagon::fixup_Hexagon_23_REG;
+ else
+ raise_relocation_error(bits, kind);
+ break;
+ }
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = Hexagon::fixup_Hexagon_DTPREL_16;
+ break;
+ case MCSymbolRefExpr::VK_GOTREL:
+ // The HI opcode selects the HI16 variant; any other opcode gets LO16.
+ if (MCID.getOpcode() == Hexagon::HI)
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_HI16;
+ else
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_LO16;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_GPREL:
+ FixupKind = Hexagon::fixup_Hexagon_GPREL16_0;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_HI16:
+ FixupKind = Hexagon::fixup_Hexagon_HI16;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_LO16:
+ FixupKind = Hexagon::fixup_Hexagon_LO16;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = Hexagon::fixup_Hexagon_TPREL_16;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ break;
+
+ case 15:
+ switch (kind) {
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_B15_PCREL_X
+ : Hexagon::fixup_Hexagon_B15_PCREL;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ break;
+
+ case 13:
+ switch (kind) {
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_B13_PCREL;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ break;
+
+ case 12:
+ if (*Extended)
+ switch (kind) {
+ // There isn't a GOT_12_X, both 11_X and 16_X resolve to 6/26
+ case MCSymbolRefExpr::VK_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
+ break;
+ case MCSymbolRefExpr::VK_GOTREL:
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_12_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ else
+ raise_relocation_error(bits, kind);
+ break;
+
+ case 11:
+ if (*Extended)
+ switch (kind) {
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = Hexagon::fixup_Hexagon_DTPREL_11_X;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
+ break;
+ case MCSymbolRefExpr::VK_GOTREL:
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_GD_GOT_11_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_IE_GOT_11_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_LD_GOT_11_X;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_11_X;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ else {
+ // NOTE(review): the non-extended path also selects the _X (extended)
+ // TPREL fixup -- confirm this is intentional.
+ switch (kind) {
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ }
+ break;
+
+ case 10:
+ if (*Extended) {
+ switch (kind) {
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_10_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ } else
+ raise_relocation_error(bits, kind);
+ break;
+
+ case 9:
+ // Branches and TypeCR instructions use the PC-relative B9 fixups; the
+ // variant kind is not consulted for this width.
+ if (MCID.isBranch() ||
+ (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_B9_PCREL_X
+ : Hexagon::fixup_Hexagon_B9_PCREL;
+ else if (*Extended)
+ FixupKind = Hexagon::fixup_Hexagon_9_X;
+ else
+ raise_relocation_error(bits, kind);
+ break;
+
+ case 8:
+ if (*Extended)
+ FixupKind = Hexagon::fixup_Hexagon_8_X;
+ else
+ raise_relocation_error(bits, kind);
+ break;
+
+ case 7:
+ // Same scheme as the 9-bit case, with the B7 fixups.
+ if (MCID.isBranch() ||
+ (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_B7_PCREL_X
+ : Hexagon::fixup_Hexagon_B7_PCREL;
+ else if (*Extended)
+ FixupKind = Hexagon::fixup_Hexagon_7_X;
+ else
+ raise_relocation_error(bits, kind);
+ break;
+
+ case 6:
+ if (*Extended) {
+ switch (kind) {
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
+ break;
+ // This is part of an extender, GOT_11 is a
+ // Word32_U6 unsigned/truncated reloc.
+ case MCSymbolRefExpr::VK_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
+ break;
+ case MCSymbolRefExpr::VK_GOTREL:
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_PCREL:
+ FixupKind = Hexagon::fixup_Hexagon_6_PCREL_X;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_6_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ } else
+ raise_relocation_error(bits, kind);
+ break;
+
+ case 0:
+ // No relocation width available: fall back to opcode-based selection.
+ FixupKind = getFixupNoBits(MCII, MI, MO, kind);
+ break;
+ }
+
+ // For PC-relative fixups with a non-zero *Addend, the fixup expression is
+ // the operand's expression plus *Addend; otherwise it is the operand's
+ // expression unchanged.
+ MCExpr const *FixupExpression =
+ (*Addend > 0 && isPCRel(FixupKind))
+ ? MCBinaryExpr::createAdd(MO.getExpr(),
+ MCConstantExpr::create(*Addend, MCT), MCT)
+ : MO.getExpr();
+
+ // The fixup is recorded at offset *Addend.
+ MCFixup fixup = MCFixup::create(*Addend, FixupExpression,
+ MCFixupKind(FixupKind), MI.getLoc());
+ Fixups.push_back(fixup);
+ // All of the information is in the fixup.
+ return 0;
+ }
+
+ /// Return the binary encoding of operand \p MO of instruction \p MI.
+ ///
+ /// Expression operands are delegated to getExprOpValue(). Register operands
+ /// of sub-instructions use the duplex register numbering; for A2_tfrrcr and
+ /// A2_tfrcrr, M0 and M1 are first replaced by C6 and C7. All other registers
+ /// are encoded through the register info. Immediate operands are not
+ /// expected here.
+ unsigned
+ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         MCSubtargetInfo const &STI) const {
+   assert(!MO.isImm());
+   if (!MO.isReg())
+     return getExprOpValue(MI, MO, MO.getExpr(), Fixups, STI);
+
+   unsigned Reg = MO.getReg();
+   if (HexagonMCInstrInfo::isSubInstruction(MI))
+     return HexagonMCInstrInfo::getDuplexRegisterNumbering(Reg);
+
+   const unsigned Opcode = MI.getOpcode();
+   if (Opcode == Hexagon::A2_tfrrcr || Opcode == Hexagon::A2_tfrcrr) {
+     if (Reg == Hexagon::M0)
+       Reg = Hexagon::C6;
+     if (Reg == Hexagon::M1)
+       Reg = Hexagon::C7;
+   }
+   return MCT.getRegisterInfo()->getEncodingValue(Reg);
+ }
+
+ /// Factory: allocate a new HexagonMCCodeEmitter for \p MII and \p MCT and
+ /// return it to the caller. \p MRI is accepted but not used.
+ MCCodeEmitter *llvm::createHexagonMCCodeEmitter(MCInstrInfo const &MII,
+                                                 MCRegisterInfo const &MRI,
+                                                 MCContext &MCT) {
+   MCCodeEmitter *Emitter = new HexagonMCCodeEmitter(MII, MCT);
+   return Emitter;
+ }
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "HexagonGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
new file mode 100644
index 000000000000..8e0667d9ac8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -0,0 +1,75 @@
+//===-- HexagonMCCodeEmitter.h - Hexagon Target Descriptions ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Definition for classes that emit Hexagon machine code from MCInsts
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCCODEEMITTER_H
+#define HEXAGONMCCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+ /// Code emitter that turns Hexagon MCInst bundles into machine code.
+ class HexagonMCCodeEmitter : public MCCodeEmitter {
+ MCContext &MCT;
+ MCInstrInfo const &MCII;
+ // Per-bundle encoding state. The values live behind pointers, which lets
+ // the const encoding methods update them.
+ // Addend: running offset within the bundle being encoded.
+ std::unique_ptr<unsigned> Addend;
+ // Extended: whether the previously encoded instruction was an immediate
+ // extender.
+ std::unique_ptr<bool> Extended;
+ // CurrentBundle: the bundle currently being encoded.
+ std::unique_ptr<MCInst const *> CurrentBundle;
+
+ // helper routine for getMachineOpValue(): encodes an expression operand,
+ // recording a fixup when needed.
+ unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
+ const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ public:
+ HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCContext &aMCT);
+
+ // Return parse bits for instruction `MCI' inside bundle `MCB'.
+ // `Instruction' is the position of `MCI' in the bundle and `Last' is the
+ // position of the bundle's final instruction.
+ uint32_t parseBits(size_t Instruction, size_t Last, MCInst const &MCB,
+ MCInst const &MCI) const;
+
+ /// Encode a whole bundle; \p MI must be a bundle instruction.
+ void encodeInstruction(MCInst const &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ MCSubtargetInfo const &STI) const override;
+
+ /// Encode one non-bundle instruction using parse bits \p Parse; \p Index is
+ /// its position within the current bundle.
+ void EncodeSingleInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI,
+ uint32_t Parse, size_t Index) const;
+
+ // \brief TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(MCInst const &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ MCSubtargetInfo const &STI) const;
+
+ /// \brief Return binary encoding of operand.
+ unsigned getMachineOpValue(MCInst const &MI, MCOperand const &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ MCSubtargetInfo const &STI) const;
+
+ private:
+ // Feature-predicate helpers; no definitions are visible in this header --
+ // presumably they come from the generated include, TODO confirm.
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+ }; // class HexagonMCCodeEmitter
+
+} // namespace llvm
+
+#endif /* HEXAGONMCCODEEMITTER_H */
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
new file mode 100644
index 000000000000..5feaffe6efb9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -0,0 +1,425 @@
+
+//=== HexagonMCCompound.cpp - Hexagon Compound checker -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file looks at a packet and tries to form compound insns.
+//
+//===----------------------------------------------------------------------===//
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace Hexagon;
+
+#define DEBUG_TYPE "hexagon-mccompound"
+
+/// Subscript of each predicated-jump flavour in the eight-entry compound
+/// opcode tables. Names read <sense>p<pred>_jump_<suffix>: sense f/t,
+/// predicate register p0/p1, suffix nt/t.
+enum OpcodeIndex {
+  fp0_jump_nt = 0,
+  fp0_jump_t = 1,
+  fp1_jump_nt = 2,
+  fp1_jump_t = 3,
+  tp0_jump_nt = 4,
+  tp0_jump_t = 5,
+  tp1_jump_nt = 6,
+  tp1_jump_t = 7
+};
+
+// Compound opcode tables, one per compare / test-bit kind. Every table has
+// eight entries laid out in OpcodeIndex order (fp0_nt, fp0_t, fp1_nt, fp1_t,
+// tp0_nt, tp0_t, tp1_nt, tp1_t), so the value returned by getCompoundOp() is
+// used directly as the subscript. Do not reorder the entries.
+static const unsigned tstBitOpcode[8] = {
+ J4_tstbit0_fp0_jump_nt, J4_tstbit0_fp0_jump_t, J4_tstbit0_fp1_jump_nt,
+ J4_tstbit0_fp1_jump_t, J4_tstbit0_tp0_jump_nt, J4_tstbit0_tp0_jump_t,
+ J4_tstbit0_tp1_jump_nt, J4_tstbit0_tp1_jump_t};
+static const unsigned cmpeqBitOpcode[8] = {
+ J4_cmpeq_fp0_jump_nt, J4_cmpeq_fp0_jump_t, J4_cmpeq_fp1_jump_nt,
+ J4_cmpeq_fp1_jump_t, J4_cmpeq_tp0_jump_nt, J4_cmpeq_tp0_jump_t,
+ J4_cmpeq_tp1_jump_nt, J4_cmpeq_tp1_jump_t};
+static const unsigned cmpgtBitOpcode[8] = {
+ J4_cmpgt_fp0_jump_nt, J4_cmpgt_fp0_jump_t, J4_cmpgt_fp1_jump_nt,
+ J4_cmpgt_fp1_jump_t, J4_cmpgt_tp0_jump_nt, J4_cmpgt_tp0_jump_t,
+ J4_cmpgt_tp1_jump_nt, J4_cmpgt_tp1_jump_t};
+static const unsigned cmpgtuBitOpcode[8] = {
+ J4_cmpgtu_fp0_jump_nt, J4_cmpgtu_fp0_jump_t, J4_cmpgtu_fp1_jump_nt,
+ J4_cmpgtu_fp1_jump_t, J4_cmpgtu_tp0_jump_nt, J4_cmpgtu_tp0_jump_t,
+ J4_cmpgtu_tp1_jump_nt, J4_cmpgtu_tp1_jump_t};
+static const unsigned cmpeqiBitOpcode[8] = {
+ J4_cmpeqi_fp0_jump_nt, J4_cmpeqi_fp0_jump_t, J4_cmpeqi_fp1_jump_nt,
+ J4_cmpeqi_fp1_jump_t, J4_cmpeqi_tp0_jump_nt, J4_cmpeqi_tp0_jump_t,
+ J4_cmpeqi_tp1_jump_nt, J4_cmpeqi_tp1_jump_t};
+static const unsigned cmpgtiBitOpcode[8] = {
+ J4_cmpgti_fp0_jump_nt, J4_cmpgti_fp0_jump_t, J4_cmpgti_fp1_jump_nt,
+ J4_cmpgti_fp1_jump_t, J4_cmpgti_tp0_jump_nt, J4_cmpgti_tp0_jump_t,
+ J4_cmpgti_tp1_jump_nt, J4_cmpgti_tp1_jump_t};
+static const unsigned cmpgtuiBitOpcode[8] = {
+ J4_cmpgtui_fp0_jump_nt, J4_cmpgtui_fp0_jump_t, J4_cmpgtui_fp1_jump_nt,
+ J4_cmpgtui_fp1_jump_t, J4_cmpgtui_tp0_jump_nt, J4_cmpgtui_tp0_jump_t,
+ J4_cmpgtui_tp1_jump_nt, J4_cmpgtui_tp1_jump_t};
+// The two "n1" tables are selected when the compare immediate is -1.
+static const unsigned cmpeqn1BitOpcode[8] = {
+ J4_cmpeqn1_fp0_jump_nt, J4_cmpeqn1_fp0_jump_t, J4_cmpeqn1_fp1_jump_nt,
+ J4_cmpeqn1_fp1_jump_t, J4_cmpeqn1_tp0_jump_nt, J4_cmpeqn1_tp0_jump_t,
+ J4_cmpeqn1_tp1_jump_nt, J4_cmpeqn1_tp1_jump_t};
+static const unsigned cmpgtn1BitOpcode[8] = {
+ J4_cmpgtn1_fp0_jump_nt, J4_cmpgtn1_fp0_jump_t, J4_cmpgtn1_fp1_jump_nt,
+ J4_cmpgtn1_fp1_jump_t, J4_cmpgtn1_tp0_jump_nt, J4_cmpgtn1_tp0_jump_t,
+ J4_cmpgtn1_tp1_jump_nt, J4_cmpgtn1_tp1_jump_t,
+};
+
+// enum HexagonII::CompoundGroup
+namespace {
+/// Classify \p MI as a candidate half of a compound instruction.
+///
+/// \param MI          instruction to classify.
+/// \param IsExtended  true when \p MI is preceded by an immediate extender;
+///                    extended compares / transfers / test-bits are never
+///                    candidates.
+/// \returns HexagonII::HCG_A for a compare, test-bit or transfer that can be
+///          the first half of a compound, HexagonII::HCG_B for a .new
+///          predicated jump on P0 or P1, HexagonII::HCG_C for an
+///          unconditional jump, and HexagonII::HCG_None otherwise.
+unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
+  unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
+
+  switch (MI.getOpcode()) {
+  default:
+    return HexagonII::HCG_None;
+  //
+  // Compound pairs.
+  // "p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2"
+  // "Rd16=#U6 ; jump #r9:2"
+  // "Rd16=Rs16 ; jump #r9:2"
+  //
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtu:
+    // The function returns a group code, so report "no group" explicitly
+    // rather than relying on `false` converting to the same value.
+    if (IsExtended)
+      return HexagonII::HCG_None;
+    DstReg = MI.getOperand(0).getReg();
+    Src1Reg = MI.getOperand(1).getReg();
+    Src2Reg = MI.getOperand(2).getReg();
+    if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C2_cmpgtui:
+    if (IsExtended)
+      return HexagonII::HCG_None;
+    // P0 = cmp.eq(Rs,#u2)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    // The immediate must either pass inRange<5> or be exactly -1.
+    if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        (HexagonMCInstrInfo::inRange<5>(MI, 2) ||
+         HexagonMCInstrInfo::minConstant(MI, 2) == -1))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::A2_tfr:
+    if (IsExtended)
+      return HexagonII::HCG_None;
+    // Rd = Rs
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::A2_tfrsi:
+    if (IsExtended)
+      return HexagonII::HCG_None;
+    // Rd = #u6
+    DstReg = MI.getOperand(0).getReg();
+    if (HexagonMCInstrInfo::minConstant(MI, 1) <= 63 &&
+        HexagonMCInstrInfo::minConstant(MI, 1) >= 0 &&
+        HexagonMCInstrInfo::isIntRegForSubInst(DstReg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::S2_tstbit_i:
+    if (IsExtended)
+      return HexagonII::HCG_None;
+    // Only a test of bit 0 qualifies.
+    DstReg = MI.getOperand(0).getReg();
+    Src1Reg = MI.getOperand(1).getReg();
+    if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::minConstant(MI, 2) == 0)
+      return HexagonII::HCG_A;
+    break;
+  // The fact that .new form is used pretty much guarantees
+  // that predicate register will match. Nevertheless,
+  // there could be some false positives without additional
+  // checking.
+  case Hexagon::J2_jumptnew:
+  case Hexagon::J2_jumpfnew:
+  case Hexagon::J2_jumptnewpt:
+  case Hexagon::J2_jumpfnewpt:
+    Src1Reg = MI.getOperand(0).getReg();
+    if (Hexagon::P0 == Src1Reg || Hexagon::P1 == Src1Reg)
+      return HexagonII::HCG_B;
+    break;
+  // Transfer and jump:
+  // Rd=#U6 ; jump #r9:2
+  // Rd=Rs ; jump #r9:2
+  // Do not test for jump range here.
+  case Hexagon::J2_jump:
+  case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+    return HexagonII::HCG_C;
+  }
+
+  return HexagonII::HCG_None;
+}
+}
+
+/// getCompoundOp - Translate a .new predicated jump into its OpcodeIndex
+/// (0-7), the subscript used with the compound opcode tables above.
+namespace {
+unsigned getCompoundOp(MCInst const &HMCI) {
+  unsigned const PredReg = HMCI.getOperand(0).getReg();
+
+  assert((PredReg == Hexagon::P0) || (PredReg == Hexagon::P1) ||
+         (PredReg == Hexagon::P2) || (PredReg == Hexagon::P3));
+
+  // Anything other than P0 selects the p1 flavour of the opcode.
+  bool const UsesP0 = (PredReg == Hexagon::P0);
+
+  switch (HMCI.getOpcode()) {
+  case Hexagon::J2_jumpfnew:
+    return UsesP0 ? fp0_jump_nt : fp1_jump_nt;
+  case Hexagon::J2_jumpfnewpt:
+    return UsesP0 ? fp0_jump_t : fp1_jump_t;
+  case Hexagon::J2_jumptnew:
+    return UsesP0 ? tp0_jump_nt : tp1_jump_nt;
+  case Hexagon::J2_jumptnewpt:
+    return UsesP0 ? tp0_jump_t : tp1_jump_t;
+  default:
+    break;
+  }
+  llvm_unreachable("Expected match not found.\n");
+}
+}
+
+namespace {
+/// Allocate an MCInst from \p Context and set its opcode to \p Opcode.
+MCInst *newCompoundInst(MCContext &Context, unsigned Opcode) {
+  MCInst *Insn = new (Context) MCInst;
+  Insn->setOpcode(Opcode);
+  return Insn;
+}
+
+/// Build the compound instruction that replaces the pair (\p L, \p R).
+///
+/// \param Context  context the new MCInst is allocated from.
+/// \param L        first half: a transfer, compare or test-bit instruction.
+/// \param R        the jump; for the compare / test-bit forms its operand 0
+///                 is the predicate and operand 1 is copied to the compound.
+/// \returns the new instruction, or nullptr when the opcode of \p L has no
+///          compound form.
+MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
+  MCInst *CompoundInsn = nullptr;
+  // Initialised so that a build without asserts never reads an
+  // indeterminate value if the immediate fails to evaluate.
+  int64_t Value = 0;
+  bool Success;
+
+  switch (L.getOpcode()) {
+  default:
+    DEBUG(dbgs() << "Possible compound ignored\n");
+    return CompoundInsn;
+
+  case Hexagon::A2_tfrsi:
+    CompoundInsn = newCompoundInst(Context, J4_jumpseti);
+    CompoundInsn->addOperand(L.getOperand(0)); // Destination register
+    CompoundInsn->addOperand(L.getOperand(1)); // Immediate
+    CompoundInsn->addOperand(R.getOperand(0)); // Jump target
+    break;
+
+  case Hexagon::A2_tfr:
+    CompoundInsn = newCompoundInst(Context, J4_jumpsetr);
+    CompoundInsn->addOperand(L.getOperand(0)); // Destination register
+    CompoundInsn->addOperand(L.getOperand(1)); // Source register
+    CompoundInsn->addOperand(R.getOperand(0)); // Jump target.
+    break;
+
+  case Hexagon::C2_cmpeq:
+    DEBUG(dbgs() << "CX: C2_cmpeq\n");
+    CompoundInsn = newCompoundInst(Context, cmpeqBitOpcode[getCompoundOp(R)]);
+    CompoundInsn->addOperand(L.getOperand(1)); // Rs
+    CompoundInsn->addOperand(L.getOperand(2)); // Rt
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpgt:
+    DEBUG(dbgs() << "CX: C2_cmpgt\n");
+    CompoundInsn = newCompoundInst(Context, cmpgtBitOpcode[getCompoundOp(R)]);
+    CompoundInsn->addOperand(L.getOperand(1)); // Rs
+    CompoundInsn->addOperand(L.getOperand(2)); // Rt
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpgtu:
+    DEBUG(dbgs() << "CX: C2_cmpgtu\n");
+    CompoundInsn = newCompoundInst(Context, cmpgtuBitOpcode[getCompoundOp(R)]);
+    CompoundInsn->addOperand(L.getOperand(1)); // Rs
+    CompoundInsn->addOperand(L.getOperand(2)); // Rt
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpeqi: {
+    DEBUG(dbgs() << "CX: C2_cmpeqi\n");
+    Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+    (void)Success;
+    assert(Success);
+    // A compare against -1 uses its own opcode family.
+    unsigned const *Table = (Value == -1) ? cmpeqn1BitOpcode : cmpeqiBitOpcode;
+    CompoundInsn = newCompoundInst(Context, Table[getCompoundOp(R)]);
+    CompoundInsn->addOperand(L.getOperand(1)); // Rs
+    CompoundInsn->addOperand(L.getOperand(2)); // Immediate
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+  }
+
+  case Hexagon::C2_cmpgti: {
+    DEBUG(dbgs() << "CX: C2_cmpgti\n");
+    Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+    (void)Success;
+    assert(Success);
+    // A compare against -1 uses its own opcode family.
+    unsigned const *Table = (Value == -1) ? cmpgtn1BitOpcode : cmpgtiBitOpcode;
+    CompoundInsn = newCompoundInst(Context, Table[getCompoundOp(R)]);
+    CompoundInsn->addOperand(L.getOperand(1)); // Rs
+    CompoundInsn->addOperand(L.getOperand(2)); // Immediate
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+  }
+
+  case Hexagon::C2_cmpgtui:
+    DEBUG(dbgs() << "CX: C2_cmpgtui\n");
+    CompoundInsn =
+        newCompoundInst(Context, cmpgtuiBitOpcode[getCompoundOp(R)]);
+    CompoundInsn->addOperand(L.getOperand(1)); // Rs
+    CompoundInsn->addOperand(L.getOperand(2)); // Immediate
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::S2_tstbit_i:
+    DEBUG(dbgs() << "CX: S2_tstbit_i\n");
+    // The bit number is not copied: only Rs and the jump operand are added.
+    CompoundInsn = newCompoundInst(Context, tstBitOpcode[getCompoundOp(R)]);
+    CompoundInsn->addOperand(L.getOperand(1)); // Rs
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+  }
+
+  return CompoundInsn;
+}
+}
+
+/// Non-Symmetrical. Decide whether MIa (first) and MIb (the jump) may be
+/// merged into a compound instruction.
+namespace {
+bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA,
+                           MCInst const &MIb, bool IsExtendedB) {
+  unsigned const GroupA = getCompoundCandidateGroup(MIa, IsExtendedA);
+  unsigned const GroupB = getCompoundCandidateGroup(MIb, IsExtendedB);
+
+  // Every accepted pairing starts with a group-A instruction.
+  if (GroupA != HexagonII::HCG_A)
+    return false;
+
+  // Unconditional jump: only the two transfer forms can be merged with it.
+  if (GroupB == HexagonII::HCG_C) {
+    unsigned const OpcodeA = MIa.getOpcode();
+    return OpcodeA == Hexagon::A2_tfr || OpcodeA == Hexagon::A2_tfrsi;
+  }
+
+  // Predicated jump: the register MIa writes must be the one MIb tests.
+  if (GroupB == HexagonII::HCG_B)
+    return MIa.getOperand(0).getReg() == MIb.getOperand(0).getReg();
+
+  return false;
+}
+}
+
+namespace {
+/// Search bundle \p MCI for one instruction / jump pair that can be merged.
+/// On success the jump's slot is pointed at the new compound instruction, the
+/// absorbed instruction is erased from the bundle and true is returned;
+/// otherwise the bundle is left untouched and false is returned.
+bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
+  assert(HexagonMCInstrInfo::isBundle(MCI));
+
+  // An immext flags the instruction that follows it as extended; the flag is
+  // cleared again once that instruction has been considered.
+  bool JumpIsExtended = false;
+  for (MCInst::iterator JumpIt =
+           MCI.begin() + HexagonMCInstrInfo::bundleInstructionsOffset;
+       JumpIt != MCI.end(); ++JumpIt) {
+    MCInst const *Jump = JumpIt->getInst();
+    if (HexagonMCInstrInfo::isImmext(*Jump)) {
+      JumpIsExtended = true;
+      continue;
+    }
+
+    bool const IsJump = llvm::HexagonMCInstrInfo::getType(MCII, *Jump) ==
+                        HexagonII::TypeJ;
+    if (!IsJump) {
+      JumpIsExtended = false;
+      continue;
+    }
+
+    // Try every other instruction in the bundle as the partner of this jump.
+    bool PartnerIsExtended = false;
+    for (MCInst::iterator PartnerIt =
+             MCI.begin() + HexagonMCInstrInfo::bundleInstructionsOffset;
+         PartnerIt != MCI.end(); ++PartnerIt) {
+      MCInst const *Partner = PartnerIt->getInst();
+      if (Partner == Jump)
+        continue;
+      if (HexagonMCInstrInfo::isImmext(*Partner)) {
+        PartnerIsExtended = true;
+        continue;
+      }
+      DEBUG(dbgs() << "J,B: " << Jump->getOpcode() << ","
+                   << Partner->getOpcode() << "\n");
+
+      MCInst *Merged = nullptr;
+      if (isOrderedCompoundPair(*Partner, PartnerIsExtended, *Jump,
+                                JumpIsExtended))
+        Merged = getCompoundInsn(Context, *Partner, *Jump);
+
+      if (Merged) {
+        DEBUG(dbgs() << "B: " << Partner->getOpcode() << ","
+                     << Jump->getOpcode() << " Compounds to "
+                     << Merged->getOpcode() << "\n");
+        // Redirect the jump's slot first, then drop the absorbed
+        // instruction; the iterators are not used after the erase.
+        JumpIt->setInst(Merged);
+        MCI.erase(PartnerIt);
+        return true;
+      }
+      PartnerIsExtended = false;
+    }
+    JumpIsExtended = false;
+  }
+  return false;
+}
+}
+
+/// tryCompound - Given a bundle, check for compound insns; each time one is
+/// found, update the contents of the bundle with the compound insn.
+/// Every compound formed replaces two instructions with one, so the bundle
+/// gains one additional free slot.
+void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
+                                     MCContext &Context, MCInst &MCI) {
+  assert(HexagonMCInstrInfo::isBundle(MCI) &&
+         "Non-Bundle where Bundle expected");
+
+  // By definition a compound must have 2 insn.
+  if (MCI.size() >= 2) {
+    // Keep forming compounds until a pass over the bundle finds none; the
+    // bundle is only modified by a successful pass.
+    bool FoundOne;
+    do {
+      FoundOne = lookForCompound(MCII, Context, MCI);
+    } while (FoundOne);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
new file mode 100644
index 000000000000..413f052aa4bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -0,0 +1,1081 @@
+//===----- HexagonMCDuplexInfo.cpp - Instruction bundle checking ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements duplexing of instructions to reduce code size
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <map>
+
+using namespace llvm;
+using namespace Hexagon;
+
+#define DEBUG_TYPE "hexagon-mcduplex-info"
+
+// Table of (sub-instruction opcode, numeric value) pairs.
+// isOrderedDuplexPair() loads this table into a std::map keyed by opcode and
+// compares the mapped values of two sub-instructions from the same group to
+// decide which of them may occupy which slot.
+// NOTE(review): the second member looks like the sub-instruction encoding with
+// its operand bits zeroed (the consumer names it "zeroedSubInst") -- confirm.
+static const std::pair<unsigned, unsigned> opcodeData[] = {
+ std::make_pair((unsigned)SA1_addi, 0),
+ std::make_pair((unsigned)SA1_addrx, 6144),
+ std::make_pair((unsigned)SA1_addsp, 3072),
+ std::make_pair((unsigned)SA1_and1, 4608),
+ std::make_pair((unsigned)SA1_clrf, 6768),
+ std::make_pair((unsigned)SA1_clrfnew, 6736),
+ std::make_pair((unsigned)SA1_clrt, 6752),
+ std::make_pair((unsigned)SA1_clrtnew, 6720),
+ std::make_pair((unsigned)SA1_cmpeqi, 6400),
+ std::make_pair((unsigned)SA1_combine0i, 7168),
+ std::make_pair((unsigned)SA1_combine1i, 7176),
+ std::make_pair((unsigned)SA1_combine2i, 7184),
+ std::make_pair((unsigned)SA1_combine3i, 7192),
+ std::make_pair((unsigned)SA1_combinerz, 7432),
+ std::make_pair((unsigned)SA1_combinezr, 7424),
+ std::make_pair((unsigned)SA1_dec, 4864),
+ std::make_pair((unsigned)SA1_inc, 4352),
+ std::make_pair((unsigned)SA1_seti, 2048),
+ std::make_pair((unsigned)SA1_setin1, 6656),
+ std::make_pair((unsigned)SA1_sxtb, 5376),
+ std::make_pair((unsigned)SA1_sxth, 5120),
+ std::make_pair((unsigned)SA1_tfr, 4096),
+ std::make_pair((unsigned)SA1_zxtb, 5888),
+ std::make_pair((unsigned)SA1_zxth, 5632),
+ std::make_pair((unsigned)SL1_loadri_io, 0),
+ std::make_pair((unsigned)SL1_loadrub_io, 4096),
+ std::make_pair((unsigned)SL2_deallocframe, 7936),
+ std::make_pair((unsigned)SL2_jumpr31, 8128),
+ std::make_pair((unsigned)SL2_jumpr31_f, 8133),
+ std::make_pair((unsigned)SL2_jumpr31_fnew, 8135),
+ std::make_pair((unsigned)SL2_jumpr31_t, 8132),
+ std::make_pair((unsigned)SL2_jumpr31_tnew, 8134),
+ std::make_pair((unsigned)SL2_loadrb_io, 4096),
+ std::make_pair((unsigned)SL2_loadrd_sp, 7680),
+ std::make_pair((unsigned)SL2_loadrh_io, 0),
+ std::make_pair((unsigned)SL2_loadri_sp, 7168),
+ std::make_pair((unsigned)SL2_loadruh_io, 2048),
+ std::make_pair((unsigned)SL2_return, 8000),
+ std::make_pair((unsigned)SL2_return_f, 8005),
+ std::make_pair((unsigned)SL2_return_fnew, 8007),
+ std::make_pair((unsigned)SL2_return_t, 8004),
+ std::make_pair((unsigned)SL2_return_tnew, 8006),
+ std::make_pair((unsigned)SS1_storeb_io, 4096),
+ std::make_pair((unsigned)SS1_storew_io, 0),
+ std::make_pair((unsigned)SS2_allocframe, 7168),
+ std::make_pair((unsigned)SS2_storebi0, 4608),
+ std::make_pair((unsigned)SS2_storebi1, 4864),
+ std::make_pair((unsigned)SS2_stored_sp, 2560),
+ std::make_pair((unsigned)SS2_storeh_io, 0),
+ std::make_pair((unsigned)SS2_storew_sp, 2048),
+ std::make_pair((unsigned)SS2_storewi0, 4096),
+ std::make_pair((unsigned)SS2_storewi1, 4352)};
+
+/// Return true when a sub-instruction of group \p Ga may be paired, in that
+/// order, with a sub-instruction of group \p Gb.
+bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) {
+  bool const GbIsL1 = (Gb == HexagonII::HSIG_L1);
+  bool const GbIsL2 = (Gb == HexagonII::HSIG_L2);
+  bool const GbIsS1 = (Gb == HexagonII::HSIG_S1);
+  bool const GbIsS2 = (Gb == HexagonII::HSIG_S2);
+  bool const GbIsA = (Gb == HexagonII::HSIG_A);
+
+  // Each row widens the set of partners accepted by the row before it.
+  switch (Ga) {
+  case HexagonII::HSIG_L1:
+    return GbIsL1 || GbIsA;
+  case HexagonII::HSIG_L2:
+    return GbIsL1 || GbIsL2 || GbIsA;
+  case HexagonII::HSIG_S1:
+    return GbIsL1 || GbIsL2 || GbIsS1 || GbIsA;
+  case HexagonII::HSIG_S2:
+    return GbIsL1 || GbIsL2 || GbIsS1 || GbIsS2 || GbIsA;
+  case HexagonII::HSIG_A:
+    return GbIsA;
+  case HexagonII::HSIG_Compound:
+    return Gb == HexagonII::HSIG_Compound;
+  default:
+    // HSIG_None and any unknown group never match.
+    break;
+  }
+  return false;
+}
+
+/// Return the iclass value for the ordered sub-instruction group pair
+/// (\p Ga, \p Gb), or 0xFFFFFFFF when the pair has none.
+///
+/// Each inner switch is followed by a break so that an unmatched \p Gb yields
+/// the 0xFFFFFFFF sentinel instead of falling through into the next \p Ga
+/// case and picking up an unrelated class.
+unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
+  switch (Ga) {
+  case HexagonII::HSIG_None:
+  default:
+    break;
+  case HexagonII::HSIG_L1:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_L1:
+      return 0;
+    case HexagonII::HSIG_A:
+      return 0x4;
+    }
+    break;
+  case HexagonII::HSIG_L2:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_L1:
+      return 0x1;
+    case HexagonII::HSIG_L2:
+      return 0x2;
+    case HexagonII::HSIG_A:
+      return 0x5;
+    }
+    break;
+  case HexagonII::HSIG_S1:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_L1:
+      return 0x8;
+    case HexagonII::HSIG_L2:
+      return 0x9;
+    case HexagonII::HSIG_S1:
+      return 0xA;
+    case HexagonII::HSIG_A:
+      return 0x6;
+    }
+    break;
+  case HexagonII::HSIG_S2:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_L1:
+      return 0xC;
+    case HexagonII::HSIG_L2:
+      return 0xD;
+    case HexagonII::HSIG_S1:
+      return 0xB;
+    case HexagonII::HSIG_S2:
+      return 0xE;
+    case HexagonII::HSIG_A:
+      return 0x7;
+    }
+    break;
+  case HexagonII::HSIG_A:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_A:
+      return 0x3;
+    }
+    break;
+  case HexagonII::HSIG_Compound:
+    // Compound pairs have no class of their own; use the sentinel below.
+    break;
+  }
+  return 0xFFFFFFFF;
+}
+
+// Classify MCI into a duplex sub-instruction group.
+//
+// Returns HSIG_L1, HSIG_L2, HSIG_S1, HSIG_S2 or HSIG_A when the opcode,
+// registers and immediates of MCI satisfy one of the forms listed in the
+// per-group comments below, and HSIG_None otherwise.
+unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
+ unsigned DstReg, PredReg, SrcReg, Src1Reg, Src2Reg;
+
+ switch (MCI.getOpcode()) {
+ default:
+ return HexagonII::HSIG_None;
+ //
+ // Group L1:
+ //
+ // Rd = memw(Rs+#u4:2)
+ // Rd = memub(Rs+#u4:0)
+ case Hexagon::L2_loadri_io:
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ // Special case this one from Group L2.
+ // Rd = memw(r29+#u5:2)
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+ if (HexagonMCInstrInfo::isIntReg(SrcReg) &&
+ Hexagon::R29 == SrcReg && inRange<5, 2>(MCI, 2)) {
+ return HexagonII::HSIG_L2;
+ }
+ // Rd = memw(Rs+#u4:2)
+ if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+ inRange<4, 2>(MCI, 2)) {
+ return HexagonII::HSIG_L1;
+ }
+ }
+ break;
+ case Hexagon::L2_loadrub_io:
+ // Rd = memub(Rs+#u4:0)
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+ inRange<4>(MCI, 2)) {
+ return HexagonII::HSIG_L1;
+ }
+ break;
+ //
+ // Group L2:
+ //
+ // Rd = memh/memuh(Rs+#u3:1)
+ // Rd = memb(Rs+#u3:0)
+ // Rd = memw(r29+#u5:2) - Handled above.
+ // Rdd = memd(r29+#u5:3)
+ // deallocframe
+ // [if ([!]p0[.new])] dealloc_return
+ // [if ([!]p0[.new])] jumpr r31
+ case Hexagon::L2_loadrh_io:
+ case Hexagon::L2_loadruh_io:
+ // Rd = memh/memuh(Rs+#u3:1)
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+ inRange<3, 1>(MCI, 2)) {
+ return HexagonII::HSIG_L2;
+ }
+ break;
+ case Hexagon::L2_loadrb_io:
+ // Rd = memb(Rs+#u3:0)
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+ inRange<3>(MCI, 2)) {
+ return HexagonII::HSIG_L2;
+ }
+ break;
+ case Hexagon::L2_loadrd_io:
+ // Rdd = memd(r29+#u5:3)
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+ HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
+ inRange<5, 3>(MCI, 2)) {
+ return HexagonII::HSIG_L2;
+ }
+ break;
+
+ // These two are always L2; no operand is inspected.
+ case Hexagon::L4_return:
+
+ case Hexagon::L2_deallocframe:
+
+ return HexagonII::HSIG_L2;
+ case Hexagon::EH_RETURN_JMPR:
+
+ case Hexagon::J2_jumpr:
+ // jumpr r31
+ // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>.
+ DstReg = MCI.getOperand(0).getReg();
+ if (Hexagon::R31 == DstReg)
+ return HexagonII::HSIG_L2;
+ break;
+
+ case Hexagon::J2_jumprt:
+ case Hexagon::J2_jumprf:
+ case Hexagon::J2_jumprtnew:
+ case Hexagon::J2_jumprfnew:
+ case Hexagon::J2_jumprtnewpt:
+ case Hexagon::J2_jumprfnewpt:
+ // Operand 0 is checked as the predicate, operand 1 as the jump register.
+ DstReg = MCI.getOperand(1).getReg();
+ SrcReg = MCI.getOperand(0).getReg();
+ // [if ([!]p0[.new])] jumpr r31
+ if ((HexagonMCInstrInfo::isPredReg(SrcReg) && (Hexagon::P0 == SrcReg)) &&
+ (Hexagon::R31 == DstReg)) {
+ return HexagonII::HSIG_L2;
+ }
+ break;
+ case Hexagon::L4_return_t:
+
+ case Hexagon::L4_return_f:
+
+ case Hexagon::L4_return_tnew_pnt:
+
+ case Hexagon::L4_return_fnew_pnt:
+
+ case Hexagon::L4_return_tnew_pt:
+
+ case Hexagon::L4_return_fnew_pt:
+ // [if ([!]p0[.new])] dealloc_return
+ SrcReg = MCI.getOperand(0).getReg();
+ if (Hexagon::P0 == SrcReg) {
+ return HexagonII::HSIG_L2;
+ }
+ break;
+ //
+ // Group S1:
+ //
+ // memw(Rs+#u4:2) = Rt
+ // memb(Rs+#u4:0) = Rt
+ case Hexagon::S2_storeri_io:
+ // Special case this one from Group S2.
+ // memw(r29+#u5:2) = Rt
+ Src1Reg = MCI.getOperand(0).getReg();
+ Src2Reg = MCI.getOperand(2).getReg();
+ if (HexagonMCInstrInfo::isIntReg(Src1Reg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+ Hexagon::R29 == Src1Reg && inRange<5, 2>(MCI, 1)) {
+ return HexagonII::HSIG_S2;
+ }
+ // memw(Rs+#u4:2) = Rt
+ if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+ inRange<4, 2>(MCI, 1)) {
+ return HexagonII::HSIG_S1;
+ }
+ break;
+ case Hexagon::S2_storerb_io:
+ // memb(Rs+#u4:0) = Rt
+ Src1Reg = MCI.getOperand(0).getReg();
+ Src2Reg = MCI.getOperand(2).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+ inRange<4>(MCI, 1)) {
+ return HexagonII::HSIG_S1;
+ }
+ break;
+ //
+ // Group S2:
+ //
+ // memh(Rs+#u3:1) = Rt
+ // memw(r29+#u5:2) = Rt
+ // memd(r29+#s6:3) = Rtt
+ // memw(Rs+#u4:2) = #U1
+ // memb(Rs+#u4) = #U1
+ // allocframe(#u5:3)
+ case Hexagon::S2_storerh_io:
+ // memh(Rs+#u3:1) = Rt
+ Src1Reg = MCI.getOperand(0).getReg();
+ Src2Reg = MCI.getOperand(2).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+ inRange<3, 1>(MCI, 1)) {
+ return HexagonII::HSIG_S2;
+ }
+ break;
+ case Hexagon::S2_storerd_io:
+ // memd(r29+#s6:3) = Rtt
+ Src1Reg = MCI.getOperand(0).getReg();
+ Src2Reg = MCI.getOperand(2).getReg();
+ if (HexagonMCInstrInfo::isDblRegForSubInst(Src2Reg) &&
+ HexagonMCInstrInfo::isIntReg(Src1Reg) && Hexagon::R29 == Src1Reg &&
+ inSRange<6, 3>(MCI, 1)) {
+ return HexagonII::HSIG_S2;
+ }
+ break;
+ case Hexagon::S4_storeiri_io:
+ // memw(Rs+#u4:2) = #U1
+ Src1Reg = MCI.getOperand(0).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+ inRange<4, 2>(MCI, 1) && inRange<1>(MCI, 2)) {
+ return HexagonII::HSIG_S2;
+ }
+ break;
+ case Hexagon::S4_storeirb_io:
+ // memb(Rs+#u4) = #U1
+ Src1Reg = MCI.getOperand(0).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+ inRange<4>(MCI, 1) && inRange<1>(MCI, 2)) {
+ return HexagonII::HSIG_S2;
+ }
+ break;
+ case Hexagon::S2_allocframe:
+ // allocframe(#u5:3)
+ if (inRange<5, 3>(MCI, 0))
+ return HexagonII::HSIG_S2;
+ break;
+ //
+ // Group A:
+ //
+ // Rx = add(Rx,#s7)
+ // Rd = Rs
+ // Rd = #u6
+ // Rd = #-1
+ // if ([!]P0[.new]) Rd = #0
+ // Rd = add(r29,#u6:2)
+ // Rx = add(Rx,Rs)
+ // P0 = cmp.eq(Rs,#u2)
+ // Rdd = combine(#0,Rs)
+ // Rdd = combine(Rs,#0)
+ // Rdd = combine(#u2,#U2)
+ // Rd = add(Rs,#1)
+ // Rd = add(Rs,#-1)
+ // Rd = sxth/sxtb/zxtb/zxth(Rs)
+ // Rd = and(Rs,#1)
+ case Hexagon::A2_addi:
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+ // Rd = add(r29,#u6:2)
+ if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
+ inRange<6, 2>(MCI, 2)) {
+ return HexagonII::HSIG_A;
+ }
+ // Rx = add(Rx,#s7)
+ // The immediate is not range-checked here; subInstWouldBeExtended()
+ // examines it separately.
+ if (DstReg == SrcReg) {
+ return HexagonII::HSIG_A;
+ }
+ // Rd = add(Rs,#1)
+ // Rd = add(Rs,#-1)
+ if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+ (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == -1)) {
+ return HexagonII::HSIG_A;
+ }
+ }
+ break;
+ case Hexagon::A2_add:
+ // Rx = add(Rx,Rs)
+ DstReg = MCI.getOperand(0).getReg();
+ Src1Reg = MCI.getOperand(1).getReg();
+ Src2Reg = MCI.getOperand(2).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && (DstReg == Src1Reg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg)) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::A2_andir:
+ // Rd = and(Rs,#1) or Rd = and(Rs,#255)
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+ (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == 255)) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::A2_tfr:
+ // Rd = Rs
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(SrcReg)) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::A2_tfrsi:
+ // Rd = #u6 / Rd = #-1. Only the destination register is checked here;
+ // subInstWouldBeExtended() examines the immediate separately.
+ DstReg = MCI.getOperand(0).getReg();
+
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::C2_cmoveit:
+ case Hexagon::C2_cmovenewit:
+ case Hexagon::C2_cmoveif:
+ case Hexagon::C2_cmovenewif:
+ // if ([!]P0[.new]) Rd = #0
+ // Actual form:
+ // %R16<def> = C2_cmovenewit %P0<internal>, 0, %R16<imp-use,undef>;
+ DstReg = MCI.getOperand(0).getReg(); // Rd
+ PredReg = MCI.getOperand(1).getReg(); // P0
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+ Hexagon::P0 == PredReg && minConstant(MCI, 2) == 0) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::C2_cmpeqi:
+ // P0 = cmp.eq(Rs,#u2)
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (Hexagon::P0 == DstReg &&
+ HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+ inRange<2>(MCI, 2)) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineii:
+ // Rdd = combine(#u2,#U2)
+ DstReg = MCI.getOperand(0).getReg();
+ if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+ inRange<2>(MCI, 1) && inRange<2>(MCI, 2)) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::A4_combineri:
+ // Rdd = combine(Rs,#0)
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+ minConstant(MCI, 2) == 0) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::A4_combineir:
+ // Rdd = combine(#0,Rs)
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(2).getReg();
+ if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+ minConstant(MCI, 1) == 0) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::A2_sxtb:
+ case Hexagon::A2_sxth:
+ case Hexagon::A2_zxtb:
+ case Hexagon::A2_zxth:
+ // Rd = sxth/sxtb/zxtb/zxth(Rs)
+ DstReg = MCI.getOperand(0).getReg();
+ SrcReg = MCI.getOperand(1).getReg();
+ if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+ HexagonMCInstrInfo::isIntRegForSubInst(SrcReg)) {
+ return HexagonII::HSIG_A;
+ }
+ break;
+ }
+
+ // A recognised opcode whose operands did not fit any sub-instruction form.
+ return HexagonII::HSIG_None;
+}
+
+/// Return true when the immediate of \p potentialDuplex either cannot be
+/// evaluated to a constant or does not fit the sub-instruction field, for the
+/// two opcodes examined here (A2_addi in its Rx = add(Rx,#s7) form and
+/// A2_tfrsi). Every other instruction yields false.
+bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) {
+  unsigned const Opcode = potentialDuplex.getOpcode();
+
+  if (Opcode == Hexagon::A2_addi) {
+    // testing for case of: Rx = add(Rx,#s7)
+    unsigned const Rd = potentialDuplex.getOperand(0).getReg();
+    unsigned const Rs = potentialDuplex.getOperand(1).getReg();
+    if (Rd != Rs || !HexagonMCInstrInfo::isIntRegForSubInst(Rd))
+      return false;
+
+    int64_t Imm;
+    if (!potentialDuplex.getOperand(2).getExpr()->evaluateAsAbsolute(Imm))
+      return true;
+    return !isShiftedInt<7, 0>(Imm);
+  }
+
+  if (Opcode == Hexagon::A2_tfrsi) {
+    unsigned const Rd = potentialDuplex.getOperand(0).getReg();
+    if (!HexagonMCInstrInfo::isIntRegForSubInst(Rd))
+      return false;
+
+    int64_t Imm;
+    if (!potentialDuplex.getOperand(1).getExpr()->evaluateAsAbsolute(Imm))
+      return true;
+    // Check for case of Rd = #-1.
+    if (Imm == -1)
+      return false;
+    // Check for case of Rd = #u6.
+    return !isShiftedUInt<6, 0>(Imm);
+  }
+
+  return false;
+}
+
+/// non-Symmetrical. Decide whether MIa and MIb, taken in this order, may be
+/// combined into a duplex. ExtendedA / ExtendedB say whether the respective
+/// instruction carries a constant extender; bisReversable enables the
+/// same-group ordering check.
+bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
+                                             MCInst const &MIa, bool ExtendedA,
+                                             MCInst const &MIb, bool ExtendedB,
+                                             bool bisReversable) {
+  // MIa may never carry an extender in a duplex (PRM 10.5).
+  if (ExtendedA)
+    return false;
+
+  // Only A2_addi and A2_tfrsi can be extended in duplex form PRM 10.5
+  unsigned const OpcodeB = MIb.getOpcode();
+  if (ExtendedB && OpcodeB != Hexagon::A2_addi && OpcodeB != Hexagon::A2_tfrsi)
+    return false;
+
+  unsigned const GroupA = HexagonMCInstrInfo::getDuplexCandidateGroup(MIa);
+  unsigned const GroupB = HexagonMCInstrInfo::getDuplexCandidateGroup(MIb);
+
+  static std::map<unsigned, unsigned> subinstOpcodeMap(std::begin(opcodeData),
+                                                       std::end(opcodeData));
+
+  // If a duplex contains 2 insns in the same group, the insns must be
+  // ordered by the numeric value the table assigns to their sub-instruction
+  // opcodes: MIa's value must not be smaller than MIb's.
+  bool const SameGroup = (GroupA != HexagonII::HSIG_None) && (GroupA == GroupB);
+  if (SameGroup && bisReversable) {
+    unsigned const SubOpcodeA =
+        HexagonMCInstrInfo::deriveSubInst(MIa).getOpcode();
+    unsigned const SubOpcodeB =
+        HexagonMCInstrInfo::deriveSubInst(MIb).getOpcode();
+
+    unsigned const OrderA = subinstOpcodeMap.find(SubOpcodeA)->second;
+    unsigned const OrderB = subinstOpcodeMap.find(SubOpcodeB)->second;
+    if (OrderA < OrderB)
+      return false;
+  }
+
+  // allocframe is never accepted as MIb.
+  if (OpcodeB == Hexagon::S2_allocframe)
+    return false;
+
+  if (GroupA != HexagonII::HSIG_None && GroupB != HexagonII::HSIG_None) {
+    // Prevent 2 instructions with extenders from duplexing: MIa must not
+    // need one at all.
+    if (subInstWouldBeExtended(MIa))
+      return false;
+
+    // If duplexing produces an extender, but the original did not
+    // have an extender, do not duplex.
+    if (!ExtendedB && subInstWouldBeExtended(MIb))
+      return false;
+  }
+
+  // An L2 instruction that names r31 in operand 1 or operand 0 (the jumpr r31
+  // forms) is never accepted as MIb.
+  if (GroupB == HexagonII::HSIG_L2) {
+    auto NamesR31 = [&MIb](unsigned Idx) {
+      return (MIb.getNumOperands() > Idx) && MIb.getOperand(Idx).isReg() &&
+             (MIb.getOperand(Idx).getReg() == Hexagon::R31);
+    };
+    if (NamesR31(1) || NamesR31(0))
+      return false;
+  }
+
+  // A store as MIb is only acceptable when MIa is a store as well.
+  bool const StoreA =
+      (GroupA == HexagonII::HSIG_S1) || (GroupA == HexagonII::HSIG_S2);
+  bool const StoreB =
+      (GroupB == HexagonII::HSIG_S1) || (GroupB == HexagonII::HSIG_S2);
+  if (StoreB && !StoreA)
+    return false;
+
+  return isDuplexPairMatch(GroupA, GroupB);
+}
+
+/// Order-insensitive duplex test: true when the duplex candidate groups of
+/// \p MIa and \p MIb match as a pair in either order.
+bool HexagonMCInstrInfo::isDuplexPair(MCInst const &MIa, MCInst const &MIb) {
+  const unsigned GroupA = getDuplexCandidateGroup(MIa);
+  const unsigned GroupB = getDuplexCandidateGroup(MIb);
+  if (isDuplexPairMatch(GroupA, GroupB))
+    return true;
+  return isDuplexPairMatch(GroupB, GroupA);
+}
+
+/// Appends operand \p opNum of \p Inst to \p subInstPtr.
+///
+/// A register operand must be one of the registers enumerated in the switch
+/// below; any other register reaches llvm_unreachable. Operands that are not
+/// registers are copied without any check.
+inline static void addOps(MCInst &subInstPtr, MCInst const &Inst,
+                          unsigned opNum) {
+  const auto &Operand = Inst.getOperand(opNum);
+
+  // Immediates, expressions, etc. are forwarded as they are.
+  if (!Operand.isReg()) {
+    subInstPtr.addOperand(Operand);
+    return;
+  }
+
+  switch (Operand.getReg()) {
+  case Hexagon::R0:
+  case Hexagon::R1:
+  case Hexagon::R2:
+  case Hexagon::R3:
+  case Hexagon::R4:
+  case Hexagon::R5:
+  case Hexagon::R6:
+  case Hexagon::R7:
+  case Hexagon::D0:
+  case Hexagon::D1:
+  case Hexagon::D2:
+  case Hexagon::D3:
+  case Hexagon::R16:
+  case Hexagon::R17:
+  case Hexagon::R18:
+  case Hexagon::R19:
+  case Hexagon::R20:
+  case Hexagon::R21:
+  case Hexagon::R22:
+  case Hexagon::R23:
+  case Hexagon::D8:
+  case Hexagon::D9:
+  case Hexagon::D10:
+  case Hexagon::D11:
+  case Hexagon::P0:
+    subInstPtr.addOperand(Operand);
+    return;
+  default:
+    llvm_unreachable("Not Duplexable Register");
+  }
+}
+
+MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { // Builds the duplex sub-instruction for Inst: selects the sub-instruction opcode and copies the operands it keeps.
+ MCInst Result; // Sub-instruction under construction; returned by value.
+ bool Absolute; // True when the immediate expression folded to a constant.
+ int64_t Value; // The folded constant, written by evaluateAsAbsolute.
+ switch (Inst.getOpcode()) {
+ default:
+ // dbgs() << "opcode: "<< Inst->getOpcode() << "\n";
+ llvm_unreachable("Unimplemented subinstruction \n");
+ break;
+ case Hexagon::A2_addi:
+ Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);(void)Absolute; // (void) keeps Absolute "used" when assert compiles away.
+ if (Value == 1) {
+ Result.setOpcode(Hexagon::SA1_inc);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break;
+ } // 1,2 SUBInst $Rd = add($Rs, #1)
+ else if (Value == -1) {
+ Result.setOpcode(Hexagon::SA1_dec);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break;
+ } // 1,2 SUBInst $Rd = add($Rs,#-1)
+ else if (Inst.getOperand(1).getReg() == Hexagon::R29) {
+ Result.setOpcode(Hexagon::SA1_addsp);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break;
+ } // 1,3 SUBInst $Rd = add(r29, #$u6_2)
+ else {
+ Result.setOpcode(Hexagon::SA1_addi);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break;
+ } // 1,2,3 SUBInst $Rx = add($Rx, #$s7)
+ case Hexagon::A2_add:
+ Result.setOpcode(Hexagon::SA1_addrx);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst $Rx = add($_src_, $Rs)
+ case Hexagon::S2_allocframe:
+ Result.setOpcode(Hexagon::SS2_allocframe);
+ addOps(Result, Inst, 0);
+ break; // 1 SUBInst allocframe(#$u5_3)
+ case Hexagon::A2_andir:
+ if (minConstant(Inst, 2) == 255) {
+ Result.setOpcode(Hexagon::SA1_zxtb);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 $Rd = and($Rs, #255)
+ } else { // NOTE(review): every mask other than 255 maps to SA1_and1; presumably the caller only passes 1 or 255 - confirm.
+ Result.setOpcode(Hexagon::SA1_and1);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 SUBInst $Rd = and($Rs, #1)
+ }
+ case Hexagon::C2_cmpeqi:
+ Result.setOpcode(Hexagon::SA1_cmpeqi);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 2,3 SUBInst p0 = cmp.eq($Rs, #$u2)
+ case Hexagon::A4_combineii:
+ case Hexagon::A2_combineii:
+ Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);(void)Absolute;
+ if (Value == 1) {
+ Result.setOpcode(Hexagon::SA1_combine1i);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break; // 1,3 SUBInst $Rdd = combine(#1, #$u2)
+ }
+ if (Value == 3) {
+ Result.setOpcode(Hexagon::SA1_combine3i);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break; // 1,3 SUBInst $Rdd = combine(#3, #$u2)
+ }
+ if (Value == 0) {
+ Result.setOpcode(Hexagon::SA1_combine0i);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break; // 1,3 SUBInst $Rdd = combine(#0, #$u2)
+ }
+ if (Value == 2) {
+ Result.setOpcode(Hexagon::SA1_combine2i);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break; // 1,3 SUBInst $Rdd = combine(#2, #$u2)
+ } // NOTE(review): any Value outside 0..3 falls through into the A4_combineir case below - confirm this is intended.
+ case Hexagon::A4_combineir:
+ Result.setOpcode(Hexagon::SA1_combinezr);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break; // 1,3 SUBInst $Rdd = combine(#0, $Rs)
+
+ case Hexagon::A4_combineri:
+ Result.setOpcode(Hexagon::SA1_combinerz);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 SUBInst $Rdd = combine($Rs, #0)
+ case Hexagon::L4_return_tnew_pnt:
+ case Hexagon::L4_return_tnew_pt:
+ Result.setOpcode(Hexagon::SL2_return_tnew);
+ break; // none SUBInst if (p0.new) dealloc_return:nt
+ case Hexagon::L4_return_fnew_pnt:
+ case Hexagon::L4_return_fnew_pt:
+ Result.setOpcode(Hexagon::SL2_return_fnew);
+ break; // none SUBInst if (!p0.new) dealloc_return:nt
+ case Hexagon::L4_return_f:
+ Result.setOpcode(Hexagon::SL2_return_f);
+ break; // none SUBInst if (!p0) dealloc_return
+ case Hexagon::L4_return_t:
+ Result.setOpcode(Hexagon::SL2_return_t);
+ break; // none SUBInst if (p0) dealloc_return
+ case Hexagon::L4_return:
+ Result.setOpcode(Hexagon::SL2_return);
+ break; // none SUBInst dealloc_return
+ case Hexagon::L2_deallocframe:
+ Result.setOpcode(Hexagon::SL2_deallocframe);
+ break; // none SUBInst deallocframe
+ case Hexagon::EH_RETURN_JMPR:
+ case Hexagon::J2_jumpr:
+ Result.setOpcode(Hexagon::SL2_jumpr31);
+ break; // none SUBInst jumpr r31
+ case Hexagon::J2_jumprf:
+ Result.setOpcode(Hexagon::SL2_jumpr31_f);
+ break; // none SUBInst if (!p0) jumpr r31
+ case Hexagon::J2_jumprfnew:
+ case Hexagon::J2_jumprfnewpt:
+ Result.setOpcode(Hexagon::SL2_jumpr31_fnew);
+ break; // none SUBInst if (!p0.new) jumpr:nt r31
+ case Hexagon::J2_jumprt:
+ Result.setOpcode(Hexagon::SL2_jumpr31_t);
+ break; // none SUBInst if (p0) jumpr r31
+ case Hexagon::J2_jumprtnew:
+ case Hexagon::J2_jumprtnewpt:
+ Result.setOpcode(Hexagon::SL2_jumpr31_tnew);
+ break; // none SUBInst if (p0.new) jumpr:nt r31
+ case Hexagon::L2_loadrb_io:
+ Result.setOpcode(Hexagon::SL2_loadrb_io);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst $Rd = memb($Rs + #$u3_0)
+ case Hexagon::L2_loadrd_io:
+ Result.setOpcode(Hexagon::SL2_loadrd_sp); // NOTE(review): operand 1 (the base register) is dropped without checking it is r29 - presumably guaranteed by the caller; confirm.
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break; // 1,3 SUBInst $Rdd = memd(r29 + #$u5_3)
+ case Hexagon::L2_loadrh_io:
+ Result.setOpcode(Hexagon::SL2_loadrh_io);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst $Rd = memh($Rs + #$u3_1)
+ case Hexagon::L2_loadrub_io:
+ Result.setOpcode(Hexagon::SL1_loadrub_io);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst $Rd = memub($Rs + #$u4_0)
+ case Hexagon::L2_loadruh_io:
+ Result.setOpcode(Hexagon::SL2_loadruh_io);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst $Rd = memuh($Rs + #$u3_1)
+ case Hexagon::L2_loadri_io:
+ if (Inst.getOperand(1).getReg() == Hexagon::R29) {
+ Result.setOpcode(Hexagon::SL2_loadri_sp);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break; // 2 1,3 SUBInst $Rd = memw(r29 + #$u5_2)
+ } else {
+ Result.setOpcode(Hexagon::SL1_loadri_io);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst $Rd = memw($Rs + #$u4_2)
+ }
+ case Hexagon::S4_storeirb_io:
+ Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);(void)Absolute;
+ if (Value == 0) {
+ Result.setOpcode(Hexagon::SS2_storebi0);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 SUBInst memb($Rs + #$u4_0)=#0
+ } else if (Value == 1) {
+ Result.setOpcode(Hexagon::SS2_storebi1);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 2 1,2 SUBInst memb($Rs + #$u4_0)=#1
+ } // NOTE(review): any other Value falls through into the S2_storerb_io case below - confirm this is intended.
+ case Hexagon::S2_storerb_io:
+ Result.setOpcode(Hexagon::SS1_storeb_io);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst memb($Rs + #$u4_0) = $Rt
+ case Hexagon::S2_storerd_io:
+ Result.setOpcode(Hexagon::SS2_stored_sp); // NOTE(review): operand 0 (the base register) is dropped without checking it is r29 - presumably guaranteed by the caller; confirm.
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 2,3 SUBInst memd(r29 + #$s6_3) = $Rtt
+ case Hexagon::S2_storerh_io:
+ Result.setOpcode(Hexagon::SS2_storeh_io);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst memh($Rs + #$u3_1) = $Rt
+ case Hexagon::S4_storeiri_io:
+ Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);(void)Absolute;
+ if (Value == 0) {
+ Result.setOpcode(Hexagon::SS2_storewi0);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 3 1,2 SUBInst memw($Rs + #$u4_2)=#0
+ } else if (Value == 1) {
+ Result.setOpcode(Hexagon::SS2_storewi1);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 3 1,2 SUBInst memw($Rs + #$u4_2)=#1
+ } else if (Inst.getOperand(0).getReg() == Hexagon::R29) {
+ Result.setOpcode(Hexagon::SS2_storew_sp);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1 2,3 SUBInst memw(r29 + #$u5_2) = $Rt
+ } // NOTE(review): any other Value with a non-r29 base falls through into the S2_storeri_io case below - confirm this is intended.
+ case Hexagon::S2_storeri_io:
+ if (Inst.getOperand(0).getReg() == Hexagon::R29) {
+ Result.setOpcode(Hexagon::SS2_storew_sp);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2); // 1,2,3 SUBInst memw(sp + #$u5_2) = $Rt
+ } else {
+ Result.setOpcode(Hexagon::SS1_storew_io);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2); // 1,2,3 SUBInst memw($Rs + #$u4_2) = $Rt
+ }
+ break;
+ case Hexagon::A2_sxtb:
+ Result.setOpcode(Hexagon::SA1_sxtb);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 SUBInst $Rd = sxtb($Rs)
+ case Hexagon::A2_sxth:
+ Result.setOpcode(Hexagon::SA1_sxth);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 SUBInst $Rd = sxth($Rs)
+ case Hexagon::A2_tfr:
+ Result.setOpcode(Hexagon::SA1_tfr);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 SUBInst $Rd = $Rs
+ case Hexagon::C2_cmovenewif:
+ Result.setOpcode(Hexagon::SA1_clrfnew);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 2 SUBInst if (!p0.new) $Rd = #0
+ case Hexagon::C2_cmovenewit:
+ Result.setOpcode(Hexagon::SA1_clrtnew);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 2 SUBInst if (p0.new) $Rd = #0
+ case Hexagon::C2_cmoveif:
+ Result.setOpcode(Hexagon::SA1_clrf);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 2 SUBInst if (!p0) $Rd = #0
+ case Hexagon::C2_cmoveit:
+ Result.setOpcode(Hexagon::SA1_clrt);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 2 SUBInst if (p0) $Rd = #0
+ case Hexagon::A2_tfrsi:
+ Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value);
+ if (Absolute && Value == -1) { // Unlike the cases above there is no assert: a non-constant immediate takes the SA1_seti path.
+ Result.setOpcode(Hexagon::SA1_setin1);
+ addOps(Result, Inst, 0);
+ break; // 2 1 SUBInst $Rd = #-1
+ } else {
+ Result.setOpcode(Hexagon::SA1_seti);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 SUBInst $Rd = #$u6
+ }
+ case Hexagon::A2_zxtb:
+ Result.setOpcode(Hexagon::SA1_zxtb);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 $Rd = and($Rs, #255)
+
+ case Hexagon::A2_zxth:
+ Result.setOpcode(Hexagon::SA1_zxth);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break; // 1,2 SUBInst $Rd = zxth($Rs)
+ }
+ return Result;
+}
+
+/// True when \p opCode is one of the store-class opcodes this file refuses to
+/// reorder against each other: the register and immediate *_io stores listed
+/// below, plus allocframe.
+static bool isStoreInst(unsigned opCode) {
+  return opCode == Hexagon::S2_storeri_io ||
+         opCode == Hexagon::S2_storerb_io ||
+         opCode == Hexagon::S2_storerh_io ||
+         opCode == Hexagon::S2_storerd_io ||
+         opCode == Hexagon::S4_storeiri_io ||
+         opCode == Hexagon::S4_storeirb_io ||
+         opCode == Hexagon::S2_allocframe;
+}
+
+/// Enumerates the instruction pairs of bundle \p MCB that could be duplexed.
+///
+/// Pairs are visited by increasing distance between their operand indices.
+/// Each pair is first tried with the higher-index instruction as the first
+/// argument of isOrderedDuplexPair; if that fails and the pair is reversible
+/// (not two stores, and memory reordering not disabled on the bundle) the
+/// swapped order is tried as well.
+///
+/// \returns the accepted candidates, each with its operand indices and iClass.
+SmallVector<DuplexCandidate, 8>
+HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
+                                          MCInst const &MCB) {
+  assert(isBundle(MCB));
+  SmallVector<DuplexCandidate, 8> duplexToTry;
+  // Use an "order matters" version of isDuplexPair.
+  const unsigned OperandCount = MCB.getNumOperands();
+
+  for (unsigned Gap = 1; Gap < OperandCount; ++Gap) {
+    for (unsigned Lo = HexagonMCInstrInfo::bundleInstructionsOffset,
+                  Hi = Lo + Gap;
+         (Lo < OperandCount) && (Hi < OperandCount); ++Lo, ++Hi) {
+      MCInst const &InstLo = *MCB.getOperand(Lo).getInst();
+      MCInst const &InstHi = *MCB.getOperand(Hi).getInst();
+
+      // Two stores may not be swapped, and neither may anything in a
+      // }:mem_noshuf packet.
+      bool bisReversable = true;
+      if (isStoreInst(InstLo.getOpcode()) && isStoreInst(InstHi.getOpcode())) {
+        DEBUG(dbgs() << "skip out of order write pair: " << Hi << "," << Lo
+                     << "\n");
+        bisReversable = false;
+      }
+      if (HexagonMCInstrInfo::isMemReorderDisabled(MCB))
+        bisReversable = false;
+
+      const bool ExtLo = HexagonMCInstrInfo::hasExtenderForIndex(MCB, Lo - 1);
+      const bool ExtHi = HexagonMCInstrInfo::hasExtenderForIndex(MCB, Hi - 1);
+
+      // Try in order.
+      if (isOrderedDuplexPair(MCII, InstHi, ExtHi, InstLo, ExtLo,
+                              bisReversable)) {
+        unsigned iClass = iClassOfDuplexPair(getDuplexCandidateGroup(InstHi),
+                                             getDuplexCandidateGroup(InstLo));
+
+        // Save off pairs for duplex checking.
+        duplexToTry.push_back(DuplexCandidate(Lo, Hi, iClass));
+        DEBUG(dbgs() << "adding pair: " << Lo << "," << Hi << ":"
+                     << InstLo.getOpcode() << "," << InstHi.getOpcode()
+                     << "\n");
+        continue;
+      }
+      DEBUG(dbgs() << "skipping pair: " << Lo << "," << Hi << ":"
+                   << InstLo.getOpcode() << "," << InstHi.getOpcode()
+                   << "\n");
+
+      // Try reverse.
+      if (!bisReversable)
+        continue;
+
+      if (isOrderedDuplexPair(MCII, InstLo, ExtLo, InstHi, ExtHi,
+                              bisReversable)) {
+        unsigned iClass = iClassOfDuplexPair(getDuplexCandidateGroup(InstLo),
+                                             getDuplexCandidateGroup(InstHi));
+
+        // Save off pairs for duplex checking.
+        duplexToTry.push_back(DuplexCandidate(Hi, Lo, iClass));
+        DEBUG(dbgs() << "adding pair:" << Hi << "," << Lo << ":"
+                     << InstLo.getOpcode() << "," << InstHi.getOpcode()
+                     << "\n");
+      } else {
+        DEBUG(dbgs() << "skipping pair: " << Hi << "," << Lo << ":"
+                     << InstLo.getOpcode() << "," << InstHi.getOpcode()
+                     << "\n");
+      }
+    }
+  }
+  return duplexToTry;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
new file mode 100644
index 000000000000..226470cfbced
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -0,0 +1,164 @@
+//=== HexagonMCELFStreamer.cpp - Hexagon subclass of MCELFStreamer -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a stub that parses an MCInst bundle and passes the
+// instructions on to the real streamer.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagonmcelfstreamer"
+
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+ GPSize("gpsize", cl::NotHidden,
+ cl::desc("Global Pointer Addressing Size. The default size is 8."),
+ cl::Prefix, cl::init(8)); // Threshold compared against a symbol's Size and AccessSize when HexagonMCEmitCommonSymbol chooses between small-data and regular placement.
+
+/// Emits \p MCK as a Hexagon bundle.
+///
+/// An instruction that is not already a bundle is wrapped in a fresh one.
+/// The bundle is then padded for end-loops, shuffled, has the instruction
+/// following each immediate extender clamped, has its expression operands
+/// visited, and is finally handed to MCObjectStreamer::EmitInstruction.
+void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
+                                           const MCSubtargetInfo &STI) {
+  MCInst HMI = HexagonMCInstrInfo::createBundle();
+
+  // Work on the incoming bundle directly, or on the local wrapper when the
+  // incoming instruction is not a bundle.
+  MCInst *MCB = const_cast<MCInst *>(&MCK);
+  if (MCK.getOpcode() != Hexagon::BUNDLE) {
+    HMI.addOperand(MCOperand::createInst(&MCK));
+    MCB = &HMI;
+  }
+
+  // Examine the packet and pad it, if needed, when an end-loop is in the
+  // bundle.
+  HexagonMCInstrInfo::padEndloop(getContext(), *MCB);
+  HexagonMCShuffle(*MCII, STI, *MCB);
+
+  assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE);
+
+  // Clamp the instruction that immediately follows each extender. For a
+  // duplex, the clamped instruction is the one held in operand 1.
+  bool PreviousWasExtender = false;
+  for (auto &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) {
+    MCInst *MCI = const_cast<MCInst *>(I.getInst());
+    if (!PreviousWasExtender) {
+      PreviousWasExtender = HexagonMCInstrInfo::isImmext(*MCI);
+      continue;
+    }
+
+    MCInst *ToClamp = MCI;
+    if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI))
+      ToClamp = const_cast<MCInst *>(MCI->getOperand(1).getInst());
+    HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *ToClamp);
+    PreviousWasExtender = false;
+  }
+
+  // At this point, MCB is a bundle
+  // Iterate through the bundle and assign addends for the instructions
+  for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MCB))
+    EmitSymbol(*I.getInst());
+
+  MCObjectStreamer::EmitInstruction(*MCB, STI);
+}
+
+/// Visits every expression operand of \p Inst, walking the operands from the
+/// last one to the first.
+void HexagonMCELFStreamer::EmitSymbol(const MCInst &Inst) {
+  unsigned Remaining = Inst.getNumOperands();
+  while (Remaining != 0) {
+    --Remaining;
+    const MCOperand &Op = Inst.getOperand(Remaining);
+    if (Op.isExpr())
+      visitUsedExpr(*Op.getExpr());
+  }
+}
+
+// HexagonMCEmitCommonSymbol and HexagonMCEmitLocalCommonSymbol are extended
+// versions of the EmitCommonSymbol / EmitLocalCommonSymbol functions found in
+// MCELFStreamer.cpp; they take AccessSize as an additional parameter.
+//
+// Local symbols are emitted as zero-filled data in .bss or, when small enough,
+// in one of the .sbss.N sections. All other symbols are declared common and,
+// when small enough, given a Hexagon SCOMMON section index.
+void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
+                                                     uint64_t Size,
+                                                     unsigned ByteAlignment,
+                                                     unsigned AccessSize) {
+  getAssembler().registerSymbol(*Symbol);
+  StringRef sbss[4] = {".sbss.1", ".sbss.2", ".sbss.4", ".sbss.8"};
+
+  auto ELFSymbol = cast<MCSymbolELF>(Symbol);
+  // A symbol with no binding yet becomes a global, external one.
+  if (!ELFSymbol->isBindingSet()) {
+    ELFSymbol->setBinding(ELF::STB_GLOBAL);
+    ELFSymbol->setExternal(true);
+  }
+
+  ELFSymbol->setType(ELF::STT_OBJECT);
+
+  if (ELFSymbol->getBinding() != ELF::STB_LOCAL) {
+    if (ELFSymbol->declareCommon(Size, ByteAlignment))
+      report_fatal_error("Symbol: " + Symbol->getName() +
+                         " redeclared as different type");
+    if ((AccessSize) && (Size <= GPSize)) {
+      uint64_t SectionIndex;
+      if (AccessSize <= GPSize)
+        SectionIndex = ELF::SHN_HEXAGON_SCOMMON + (Log2_64(AccessSize) + 1);
+      else
+        SectionIndex = (unsigned)ELF::SHN_HEXAGON_SCOMMON;
+      ELFSymbol->setIndex(SectionIndex);
+    }
+  } else {
+    // NOTE(review): sbss has four entries, so the index below is only in
+    // range while Log2_64(AccessSize) <= 3; nothing visible here bounds
+    // AccessSize - confirm that callers do.
+    const bool UseBss = (AccessSize == 0) || (Size == 0) || (Size > GPSize);
+    StringRef SectionName = UseBss ? ".bss" : sbss[(Log2_64(AccessSize))];
+    MCSection &Section = *getAssembler().getContext().getELFSection(
+        SectionName, ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+
+    // Emit into the chosen section, then return to the one we came from.
+    MCSectionSubPair Saved = getCurrentSection();
+    SwitchSection(&Section);
+
+    EmitValueToAlignment(ByteAlignment, 0, 1, 0);
+    EmitLabel(Symbol);
+    EmitZeros(Size);
+
+    // Update the maximum alignment of the section if necessary.
+    if (ByteAlignment > Section.getAlignment())
+      Section.setAlignment(ByteAlignment);
+
+    SwitchSection(Saved.first, Saved.second);
+  }
+
+  ELFSymbol->setSize(MCConstantExpr::create(Size, getContext()));
+}
+
+/// Forces \p Symbol to local, non-external binding and then emits it through
+/// HexagonMCEmitCommonSymbol with the same size, alignment and access size.
+void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(
+    MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment,
+    unsigned AccessSize) {
+  getAssembler().registerSymbol(*Symbol);
+
+  MCSymbolELF *LocalSym = cast<MCSymbolELF>(Symbol);
+  LocalSym->setBinding(ELF::STB_LOCAL);
+  LocalSym->setExternal(false);
+
+  HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize);
+}
+
+/// Factory for the Hexagon ELF streamer: returns a newly allocated
+/// HexagonMCELFStreamer built from the given context, backend, output stream
+/// and code emitter.
+MCStreamer *llvm::createHexagonELFStreamer(MCContext &Context,
+                                           MCAsmBackend &MAB,
+                                           raw_pwrite_stream &OS,
+                                           MCCodeEmitter *CE) {
+  return new HexagonMCELFStreamer(Context, MAB, OS, CE);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
new file mode 100644
index 000000000000..0ac1a68d4ef9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
@@ -0,0 +1,44 @@
+//===- HexagonMCELFStreamer.h - Hexagon subclass of MCElfStreamer ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCELFSTREAMER_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCELFSTREAMER_H
+
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include <cstdint>
+#include <memory>
+
+namespace llvm {
+
+class HexagonMCELFStreamer : public MCELFStreamer { // MCELFStreamer subclass adding Hexagon-specific instruction emission and common-symbol helpers.
+ std::unique_ptr<MCInstrInfo> MCII; // Owned instruction info, obtained from createHexagonMCInstrInfo() in the constructor.
+
+public:
+ HexagonMCELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *Emitter)
+ : MCELFStreamer(Context, TAB, OS, Emitter), // All four arguments are forwarded to the base streamer.
+ MCII(createHexagonMCInstrInfo()) {}
+
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; // Overrides the base class instruction-emission hook.
+ void EmitSymbol(const MCInst &Inst); // Takes a single instruction; not an override.
+ void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment,
+ unsigned AccessSize); // Same parameters as HexagonMCEmitCommonSymbol below.
+ void HexagonMCEmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment, unsigned AccessSize); // AccessSize is the extra parameter compared with a plain (Symbol, Size, ByteAlignment) common-symbol call.
+};
+
+MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *CE);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCELFSTREAMER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
new file mode 100644
index 000000000000..e93906a0a396
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -0,0 +1,78 @@
+//===-- HexagonMCExpr.cpp - Hexagon specific MC expression classes --------===//
+//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-mcexpr"
+
+HexagonMCExpr *HexagonMCExpr::create(MCExpr const *Expr, MCContext &Ctx) { // Wraps Expr in a new HexagonMCExpr.
+ return new (Ctx) HexagonMCExpr(Expr); // Placement new with the MCContext as the allocation argument.
+}
+
+bool HexagonMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ MCAsmLayout const *Layout,
+ MCFixup const *Fixup) const {
+ return Expr->evaluateAsRelocatable(Res, Layout, Fixup); // Pure forwarding: the wrapped expression performs the evaluation.
+}
+
+void HexagonMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*Expr); // Hands the wrapped expression, not this wrapper, to the streamer.
+}
+
+MCFragment *llvm::HexagonMCExpr::findAssociatedFragment() const {
+ return Expr->findAssociatedFragment(); // Delegates to the wrapped expression.
+}
+
+void HexagonMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {} // Empty body: no action is taken.
+
+MCExpr const *HexagonMCExpr::getExpr() const { return Expr; } // Returns the wrapped expression passed at construction.
+
+void HexagonMCExpr::setMustExtend(bool Val) {
+  // Requesting extension while extension is forbidden is a contradiction.
+  assert(!(Val && MustNotExtend) && "Extension contradiction");
+  MustExtend = Val;
+}
+
+bool HexagonMCExpr::mustExtend() const { return MustExtend; } // Reads the flag written by setMustExtend.
+void HexagonMCExpr::setMustNotExtend(bool Val) {
+  // Forbidding extension while extension is required is a contradiction.
+  assert(!(Val && MustExtend) && "Extension contradiction");
+  MustNotExtend = Val;
+}
+bool HexagonMCExpr::mustNotExtend() const { return MustNotExtend; } // Reads the flag written by setMustNotExtend.
+
+bool HexagonMCExpr::s23_2_reloc() const { return S23_2_reloc; } // Reads the flag written by setS23_2_reloc.
+void HexagonMCExpr::setS23_2_reloc(bool Val) {
+ S23_2_reloc = Val; // Plain store; unlike the extend setters there is no consistency assert.
+}
+
+bool HexagonMCExpr::classof(MCExpr const *E) {
+ return E->getKind() == MCExpr::Target; // Accepts every target-kind expression; nothing Hexagon-specific is checked here.
+}
+
+HexagonMCExpr::HexagonMCExpr(MCExpr const *Expr)
+ : Expr(Expr), MustNotExtend(false), MustExtend(false), S23_2_reloc(false),
+ SignMismatch(false) {} // Stores the wrapped expression; all four flags start out false.
+
+void HexagonMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+ Expr->print(OS, MAI); // Prints the wrapped expression only; the flags are not printed.
+}
+
+void HexagonMCExpr::setSignMismatch(bool Val) {
+ SignMismatch = Val; // Plain store with no validation.
+}
+
+bool HexagonMCExpr::signMismatch() const { // Reads the flag written by setSignMismatch.
+ return SignMismatch;
+} \ No newline at end of file
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
new file mode 100644
index 000000000000..bca40cfaf6f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
@@ -0,0 +1,47 @@
+//==- HexagonMCExpr.h - Hexagon specific MC expression classes --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+class MCInst;
+class HexagonMCExpr : public MCTargetExpr { // Target expression that wraps another MCExpr and carries four boolean flags alongside it.
+public:
+ static HexagonMCExpr *create(MCExpr const *Expr, MCContext &Ctx); // Only public way to obtain an instance; the constructor is private.
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+ MCFragment *findAssociatedFragment() const override;
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+ static bool classof(MCExpr const *E);
+ MCExpr const *getExpr() const; // Accessor for the wrapped expression.
+ void setMustExtend(bool Val = true); // Every setter below defaults its argument to true.
+ bool mustExtend() const;
+ void setMustNotExtend(bool Val = true);
+ bool mustNotExtend() const;
+ void setS23_2_reloc(bool Val = true);
+ bool s23_2_reloc() const;
+ void setSignMismatch(bool Val = true);
+ bool signMismatch() const;
+
+private:
+ HexagonMCExpr(MCExpr const *Expr); // Private: instances are made through create().
+ MCExpr const *Expr; // The wrapped expression.
+ bool MustNotExtend; // Accessed through setMustNotExtend / mustNotExtend.
+ bool MustExtend; // Accessed through setMustExtend / mustExtend.
+ bool S23_2_reloc; // Accessed through setS23_2_reloc / s23_2_reloc.
+ bool SignMismatch; // Accessed through setSignMismatch / signMismatch.
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
new file mode 100644
index 000000000000..e627f026c8ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -0,0 +1,809 @@
+//===- HexagonMCInstrInfo.cpp - Hexagon sub-class of MCInst ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class extends MCInstrInfo to allow Hexagon specific MCInst queries
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCInstrInfo.h"
+
+#include "Hexagon.h"
+#include "HexagonBaseInfo.h"
+#include "HexagonMCChecker.h"
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+/// Appends \p Value to \p MI as an expression operand that wraps an
+/// MCConstantExpr created through \p Context.
+void HexagonMCInstrInfo::addConstant(MCInst &MI, uint64_t Value,
+                                     MCContext &Context) {
+  MCExpr const *Constant = MCConstantExpr::create(Value, Context);
+  MI.addOperand(MCOperand::createExpr(Constant));
+}
+
+/// Derives the constant extender for the extendable operand of \p MCI and
+/// appends it, as an instruction operand, to the bundle \p MCB.
+void HexagonMCInstrInfo::addConstExtender(MCContext &Context,
+                                          MCInstrInfo const &MCII, MCInst &MCB,
+                                          MCInst const &MCI) {
+  assert(HexagonMCInstrInfo::isBundle(MCB));
+  unsigned const ExtIndex = HexagonMCInstrInfo::getExtendableOp(MCII, MCI);
+  MCOperand const &Extendable = MCI.getOperand(ExtIndex);
+
+  // The extender instruction is placement-allocated from Context.
+  MCInst *Extender = new (Context)
+      MCInst(HexagonMCInstrInfo::deriveExtender(MCII, MCI, Extendable));
+
+  MCB.addOperand(MCOperand::createInst(Extender));
+}
+
+/// Returns the operand range of the bundle \p MCI that holds its
+/// instructions, i.e. everything after the first bundleInstructionsOffset
+/// operands.
+iterator_range<MCInst::const_iterator>
+HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  MCInst::const_iterator const First = MCI.begin() + bundleInstructionsOffset;
+  return make_range(First, MCI.end());
+}
+
+/// Number of instructions represented by \p MCI: a non-bundle counts as one,
+/// a bundle as its operand count minus bundleInstructionsOffset.
+size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) {
+  if (!HexagonMCInstrInfo::isBundle(MCI))
+    return 1;
+  return MCI.size() - bundleInstructionsOffset;
+}
+
+/// Canonicalizes the bundle \p MCB in place: optionally forms compound
+/// instructions, runs \p Check (when non-null), shuffles, optionally forms
+/// duplexes, pads end-loop packets and finally verifies the packet size.
+///
+/// \returns false when the checker rejects the bundle or when the bundle
+/// still holds more than HEXAGON_PACKET_SIZE instructions; true otherwise.
+bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
+                                            MCSubtargetInfo const &STI,
+                                            MCContext &Context, MCInst &MCB,
+                                            HexagonMCChecker *Check) {
+  // Examine the packet and convert pairs of instructions to compound
+  // instructions when possible.
+  if (!HexagonDisableCompound)
+    HexagonMCInstrInfo::tryCompound(MCII, Context, MCB);
+  // Check the bundle for errors; a null checker means there is nothing to
+  // check.
+  if (Check && !Check->check())
+    return false;
+  HexagonMCShuffle(MCII, STI, MCB);
+  // Examine the packet and convert pairs of instructions to duplex
+  // instructions when possible. (The previous revision also copied the whole
+  // bundle into an unused local here; that dead copy has been dropped.)
+  if (!HexagonDisableDuplex) {
+    SmallVector<DuplexCandidate, 8> possibleDuplexes;
+    possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB);
+    HexagonMCShuffle(MCII, STI, Context, MCB, possibleDuplexes);
+  }
+  // Pad the packet, if needed, when an end-loop is in the bundle.
+  HexagonMCInstrInfo::padEndloop(Context, MCB);
+  // If compounding and duplexing did not bring the packet down to
+  // HEXAGON_PACKET_SIZE instructions or fewer, the packet is too big.
+  if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE)
+    return false;
+  HexagonMCShuffle(MCII, STI, MCB);
+  return true;
+}
+
+/// If the extendable operand of \p MCI evaluates to an absolute constant,
+/// replaces it with a HexagonMCExpr holding only the low six bits of that
+/// constant (Value & 0x3f), shifted left by the instruction's extent
+/// alignment. Operands that do not evaluate to a constant are left untouched.
+void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII,
+                                       MCContext &Context, MCInst &MCI) {
+  assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+         HexagonMCInstrInfo::isExtended(MCII, MCI));
+  MCOperand &Extendable =
+      MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI));
+
+  int64_t Value;
+  if (!Extendable.getExpr()->evaluateAsAbsolute(Value))
+    return;
+
+  unsigned const Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI);
+  MCExpr const *LowBits =
+      MCConstantExpr::create((Value & 0x3f) << Shift, Context);
+  Extendable.setExpr(HexagonMCExpr::create(LowBits, Context));
+}
+
+/// Creates an empty bundle: a Hexagon::BUNDLE instruction whose only operand
+/// is the immediate 0 (the operand the loop/reorder flag helpers update).
+MCInst HexagonMCInstrInfo::createBundle() {
+  MCInst Bundle;
+  Bundle.setOpcode(Hexagon::BUNDLE);
+  Bundle.addOperand(MCOperand::createImm(0));
+  return Bundle;
+}
+
+/// Builds a duplex instruction with opcode Hexagon::DuplexIClass0 + iClass
+/// whose two operands are the sub-instructions derived from \p inst0 and
+/// \p inst1 (in that order). All three instructions are allocated from
+/// \p Context.
+MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
+                                         MCInst const &inst0,
+                                         MCInst const &inst1) {
+  assert((iClass <= 0xf) && "iClass must have range of 0 to 0xf");
+  MCInst *Duplex = new (Context) MCInst;
+  Duplex->setOpcode(Hexagon::DuplexIClass0 + iClass);
+
+  MCInst *First = new (Context) MCInst(deriveSubInst(inst0));
+  MCInst *Second = new (Context) MCInst(deriveSubInst(inst1));
+  Duplex->addOperand(MCOperand::createInst(First));
+  Duplex->addOperand(MCOperand::createInst(Second));
+  return Duplex;
+}
+
+/// Creates the extender instruction for operand \p MO of \p Inst.
+///
+/// The opcode is A4_ext_b for branches, calls and TypeCR instructions and
+/// A4_ext otherwise. An immediate operand is copied with its low six bits
+/// cleared; an expression operand is passed through unchanged.
+MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
+                                          MCInst const &Inst,
+                                          MCOperand const &MO) {
+  assert(HexagonMCInstrInfo::isExtendable(MCII, Inst) ||
+         HexagonMCInstrInfo::isExtended(MCII, Inst));
+
+  MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst);
+  bool const UseBranchExtender =
+      Desc.isBranch() || Desc.isCall() ||
+      HexagonMCInstrInfo::getType(MCII, Inst) == HexagonII::TypeCR;
+
+  MCInst Extender;
+  if (UseBranchExtender)
+    Extender.setOpcode(Hexagon::A4_ext_b);
+  else
+    Extender.setOpcode(Hexagon::A4_ext);
+
+  if (MO.isImm()) {
+    Extender.addOperand(MCOperand::createImm(MO.getImm() & (~0x3f)));
+    return Extender;
+  }
+  if (MO.isExpr()) {
+    Extender.addOperand(MCOperand::createExpr(MO.getExpr()));
+    return Extender;
+  }
+  llvm_unreachable("invalid extendable operand");
+}
+
+/// Returns the bundle instruction immediately preceding position \p Index in
+/// \p MCB when that instruction is an immediate extender, nullptr otherwise.
+/// Position 0 has no predecessor and always yields nullptr.
+MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB,
+                                                   size_t Index) {
+  assert(Index <= bundleSize(MCB));
+  if (Index == 0)
+    return nullptr;
+  MCOperand const &Previous =
+      MCB.getOperand(Index + bundleInstructionsOffset - 1);
+  MCInst const *Candidate = Previous.getInst();
+  return isImmext(*Candidate) ? Candidate : nullptr;
+}
+
+/// Adds a constant extender for \p MCI to the bundle \p MCB, but only when
+/// isConstExtended() reports that \p MCI needs one.
+void HexagonMCInstrInfo::extendIfNeeded(MCContext &Context,
+                                        MCInstrInfo const &MCII, MCInst &MCB,
+                                        MCInst const &MCI) {
+  if (!isConstExtended(MCII, MCI))
+    return;
+  addConstExtender(Context, MCII, MCB, MCI);
+}
+
+/// Decodes the memory access size from the TSFlags of \p MCI's descriptor
+/// (field at HexagonII::MemAccessSizePos, masked with MemAccesSizeMask).
+HexagonII::MemAccessSize
+HexagonMCInstrInfo::getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return HexagonII::MemAccessSize((Flags >> HexagonII::MemAccessSizePos) &
+                                  HexagonII::MemAccesSizeMask);
+}
+
+/// Returns the extent-bits field of \p MCI's TSFlags; this is the same value
+/// getExtentBits() reports, so the work is delegated to it.
+unsigned HexagonMCInstrInfo::getBitCount(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  return HexagonMCInstrInfo::getExtentBits(MCII, MCI);
+}
+
+// Return the constant-extended operand number. This is the same TSFlags field
+// that getExtendableOp() decodes, so delegate to it.
+unsigned short HexagonMCInstrInfo::getCExtOpNum(MCInstrInfo const &MCII,
+                                                MCInst const &MCI) {
+  return HexagonMCInstrInfo::getExtendableOp(MCII, MCI);
+}
+
+/// Looks up the instruction descriptor for \p MCI's opcode in \p MCII.
+MCInstrDesc const &HexagonMCInstrInfo::getDesc(MCInstrInfo const &MCII,
+                                               MCInst const &MCI) {
+  unsigned const Opcode = MCI.getOpcode();
+  return MCII.get(Opcode);
+}
+
+/// Maps a register usable in a duplex sub-instruction to its 4-bit duplex
+/// number: R0-R7 -> 0-7 and R16-R23 -> 8-15 for single registers, and
+/// D0-D3 -> 0-3, D8-D11 -> 4-7 for register pairs. Any other register is
+/// unreachable.
+unsigned HexagonMCInstrInfo::getDuplexRegisterNumbering(unsigned Reg) {
+  switch (Reg) {
+  // Rs / Rss
+  case Hexagon::R0:
+  case Hexagon::D0:
+    return 0;
+  case Hexagon::R1:
+  case Hexagon::D1:
+    return 1;
+  case Hexagon::R2:
+  case Hexagon::D2:
+    return 2;
+  case Hexagon::R3:
+  case Hexagon::D3:
+    return 3;
+  case Hexagon::R4:
+  case Hexagon::D8:
+    return 4;
+  case Hexagon::R5:
+  case Hexagon::D9:
+    return 5;
+  case Hexagon::R6:
+  case Hexagon::D10:
+    return 6;
+  case Hexagon::R7:
+  case Hexagon::D11:
+    return 7;
+  // Rs only
+  case Hexagon::R16:
+    return 8;
+  case Hexagon::R17:
+    return 9;
+  case Hexagon::R18:
+    return 10;
+  case Hexagon::R19:
+    return 11;
+  case Hexagon::R20:
+    return 12;
+  case Hexagon::R21:
+    return 13;
+  case Hexagon::R22:
+    return 14;
+  case Hexagon::R23:
+    return 15;
+  default:
+    llvm_unreachable("unknown duplex register");
+  }
+}
+
+/// Unwraps \p Expr, which must be a HexagonMCExpr (enforced by cast<>), and
+/// returns the expression it wraps. The wrapped pointer is asserted non-null.
+MCExpr const &HexagonMCInstrInfo::getExpr(MCExpr const &Expr) {
+  MCExpr const *Wrapped = cast<HexagonMCExpr>(Expr).getExpr();
+  assert(Wrapped);
+  return *Wrapped;
+}
+
+/// Returns the index of the extendable operand, decoded from the TSFlags of
+/// \p MCI's descriptor (ExtendableOpPos / ExtendableOpMask).
+unsigned short HexagonMCInstrInfo::getExtendableOp(MCInstrInfo const &MCII,
+                                                   MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (Flags >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask;
+}
+
+/// Returns a reference to the extendable operand of \p MCI. Asserts that the
+/// instruction is extendable or extended and that the operand is an immediate
+/// or an expression.
+MCOperand const &
+HexagonMCInstrInfo::getExtendableOperand(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  unsigned const Index = HexagonMCInstrInfo::getExtendableOp(MCII, MCI);
+  MCOperand const &Operand = MCI.getOperand(Index);
+
+  assert((HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+          HexagonMCInstrInfo::isExtended(MCII, MCI)) &&
+         (Operand.isImm() || Operand.isExpr()));
+  return Operand;
+}
+
+/// Returns the extent-alignment field of \p MCI's TSFlags
+/// (ExtentAlignPos / ExtentAlignMask).
+unsigned HexagonMCInstrInfo::getExtentAlignment(MCInstrInfo const &MCII,
+                                                MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (Flags >> HexagonII::ExtentAlignPos) & HexagonII::ExtentAlignMask;
+}
+
+/// Returns the extent-bits field of \p MCI's TSFlags
+/// (ExtentBitsPos / ExtentBitsMask).
+unsigned HexagonMCInstrInfo::getExtentBits(MCInstrInfo const &MCII,
+                                           MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (Flags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+}
+
+// Return the max value that a constant extendable operand can have
+// without being extended: all ones in the low (bits - 1) positions for a
+// signed extent, all ones in the low `bits` positions for an unsigned one.
+int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  unsigned const Signed =
+      (Flags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  unsigned const Width =
+      (Flags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  // A signed extent gives up one bit to the sign.
+  unsigned const ValueBits = Signed ? Width - 1 : Width;
+  return ~(-1U << ValueBits);
+}
+
+// Return the min value that a constant extendable operand can have
+// without being extended: 0 for an unsigned extent, otherwise the value with
+// every bit from position (bits - 1) upwards set.
+int HexagonMCInstrInfo::getMinValue(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  unsigned const Signed =
+      (Flags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  unsigned const Width =
+      (Flags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  if (!Signed)
+    return 0;
+  return -1U << (Width - 1);
+}
+
+/// Returns the name \p MCII records for \p MCI's opcode.
+StringRef HexagonMCInstrInfo::getName(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  unsigned const Opcode = MCI.getOpcode();
+  return MCII.getName(Opcode);
+}
+
+/// Returns the new-value operand index decoded from the TSFlags of \p MCI's
+/// descriptor (NewValueOpPos / NewValueOpMask).
+unsigned short HexagonMCInstrInfo::getNewValueOp(MCInstrInfo const &MCII,
+                                                 MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (Flags >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask;
+}
+
+/// Returns the operand selected by the NewValueOp TSFlags field of \p MCI.
+/// Asserts that the instruction consumes or produces a new value and that the
+/// operand is a register.
+MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII,
+                                                        MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  unsigned const Index =
+      (Flags >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask;
+  MCOperand const &Operand = MCI.getOperand(Index);
+
+  assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) ||
+          HexagonMCInstrInfo::hasNewValue(MCII, MCI)) &&
+         Operand.isReg());
+  return Operand;
+}
+
+/// Return the index of the second new-value operand, decoded from the TSFlags
+/// of \p MCI's descriptor (NewValueOpPos2 / NewValueOpMask2).
+unsigned short HexagonMCInstrInfo::getNewValueOp2(MCInstrInfo const &MCII,
+                                                  MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (Flags >> HexagonII::NewValueOpPos2) & HexagonII::NewValueOpMask2;
+}
+
+/// Returns the operand selected by getNewValueOp2(). Asserts that the
+/// instruction consumes a new value or produces a second one and that the
+/// operand is a register.
+MCOperand const &
+HexagonMCInstrInfo::getNewValueOperand2(MCInstrInfo const &MCII,
+                                        MCInst const &MCI) {
+  unsigned const Index = HexagonMCInstrInfo::getNewValueOp2(MCII, MCI);
+  MCOperand const &Operand = MCI.getOperand(Index);
+
+  assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) ||
+          HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) &&
+         Operand.isReg());
+  return Operand;
+}
+
+/// Maps the valid-subtarget field of \p MCI's TSFlags to an architecture:
+/// Hexagon::ArchV5 when the field equals HexagonII::HasV5SubT, and
+/// Hexagon::ArchV4 for every other value.
+int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+
+  HexagonII::SubTarget const Target = static_cast<HexagonII::SubTarget>(
+      (Flags >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask);
+
+  if (Target == HexagonII::HasV5SubT)
+    return Hexagon::ArchV5;
+  return Hexagon::ArchV4;
+}
+
+// Return the Hexagon ISA class for the insn, i.e. the type field of its
+// descriptor's TSFlags (TypePos / TypeMask).
+unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (Flags >> HexagonII::TypePos) & HexagonII::TypeMask;
+}
+
+/// Returns the units of the first itinerary stage of \p MCI's scheduling
+/// class: the itinerary's FirstStage is used as an offset into HexagonStages.
+unsigned HexagonMCInstrInfo::getUnits(MCInstrInfo const &MCII,
+                                      MCSubtargetInfo const &STI,
+                                      MCInst const &MCI) {
+  InstrItinerary const *Itineraries = STI.getSchedModel().InstrItineraries;
+  int const SchedClass =
+      HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
+  auto const *Stage = Itineraries[SchedClass].FirstStage + HexagonStages;
+  return Stage->getUnits();
+}
+
+/// Returns true when \p MCI is a bundle that contains at least one immediate
+/// extender; a non-bundle always yields false.
+bool HexagonMCInstrInfo::hasImmExt(MCInst const &MCI) {
+  if (!HexagonMCInstrInfo::isBundle(MCI))
+    return false;
+
+  bool Found = false;
+  for (const auto &Operand : HexagonMCInstrInfo::bundleInstructions(MCI)) {
+    if (isImmext(*Operand.getInst())) {
+      Found = true;
+      break;
+    }
+  }
+  return Found;
+}
+
+/// True when extenderForIndex() finds an extender for \p Index in \p MCB.
+bool HexagonMCInstrInfo::hasExtenderForIndex(MCInst const &MCB, size_t Index) {
+  MCInst const *Extender = extenderForIndex(MCB, Index);
+  return Extender != nullptr;
+}
+
+// Return whether the instruction is a legal new-value producer, i.e. whether
+// the hasNewValue field of its TSFlags is non-zero.
+bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask) !=
+         0;
+}
+
+/// Return whether the insn produces a second value, i.e. whether the
+/// hasNewValue2 field of its TSFlags is non-zero.
+bool HexagonMCInstrInfo::hasNewValue2(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::hasNewValuePos2) &
+          HexagonII::hasNewValueMask2) != 0;
+}
+
+/// Returns the instruction at position \p Index of the bundle \p MCB.
+/// \p Index is asserted to be below HEXAGON_PACKET_SIZE.
+MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) {
+  assert(isBundle(MCB));
+  assert(Index < HEXAGON_PACKET_SIZE);
+  MCOperand const &Slot = MCB.getOperand(bundleInstructionsOffset + Index);
+  return *Slot.getInst();
+}
+
+/// True when \p MCI has the Hexagon::BUNDLE opcode. A bundle is additionally
+/// asserted to have an immediate as its first operand.
+bool HexagonMCInstrInfo::isBundle(MCInst const &MCI) {
+  bool const HasBundleOpcode = MCI.getOpcode() == Hexagon::BUNDLE;
+  assert(!HasBundleOpcode ||
+         (MCI.size() > 0 && MCI.getOperand(0).isImm()));
+  return HasBundleOpcode;
+}
+
+// Return whether the insn is an actual insn: not a pseudo, not a prefix and
+// not of type ENDLOOP.
+bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
+  if (HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo())
+    return false;
+  if (HexagonMCInstrInfo::isPrefix(MCII, MCI))
+    return false;
+  return HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP;
+}
+
+/// True when the CofMax1 field of \p MCI's TSFlags is non-zero.
+bool HexagonMCInstrInfo::isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::CofMax1Pos) & HexagonII::CofMax1Mask) != 0;
+}
+
+/// True when \p MCI's instruction type is HexagonII::TypeCOMPOUND.
+bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  unsigned const Type = getType(MCII, MCI);
+  return Type == HexagonII::TypeCOMPOUND;
+}
+
+/// True when \p Reg lies in D0..D3 or D8..D11, the register pairs accepted
+/// here for sub-instructions.
+bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
+  if (Reg >= Hexagon::D0 && Reg <= Hexagon::D3)
+    return true;
+  return Reg >= Hexagon::D8 && Reg <= Hexagon::D11;
+}
+
+/// True when \p MCI's instruction type is HexagonII::TypeDUPLEX.
+bool HexagonMCInstrInfo::isDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
+  unsigned const Type = HexagonMCInstrInfo::getType(MCII, MCI);
+  return Type == HexagonII::TypeDUPLEX;
+}
+
+// Return whether the instruction needs to be constant extended.
+// 1) Always return true if the instruction has 'isExtended' flag set.
+//
+// isExtendable:
+// 2) An operand marked must-extend always needs an extender.
+// 3) Branches (TypeJ, and branching TypeCOMPOUND / TypeNV) and TypeCR insts
+//    other than C4_addipc return false here; relaxation deals with them.
+// 4) An operand marked must-not-extend never gets an extender.
+// 5) An operand that does not evaluate to an absolute value (e.g. a global
+//    address) always needs an extender.
+// 6) Otherwise return true only if the value is out-of-range.
+
+bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  if (HexagonMCInstrInfo::isExtended(MCII, MCI))
+    return true;
+  if (!HexagonMCInstrInfo::isExtendable(MCII, MCI))
+    return false;
+  MCOperand const &MO = HexagonMCInstrInfo::getExtendableOperand(MCII, MCI);
+  // Everything below goes through MO.getExpr(), so the operand must not be a
+  // plain immediate. This check used to sit after the first getExpr() calls,
+  // where it could never be the one to fire.
+  assert(!MO.isImm());
+  MCExpr const *Expr = MO.getExpr();
+  bool const IsHexagonExpr = isa<HexagonMCExpr>(Expr);
+  if (IsHexagonExpr && HexagonMCInstrInfo::mustExtend(*Expr))
+    return true;
+
+  unsigned const Type = HexagonMCInstrInfo::getType(MCII, MCI);
+  // Branch insns are handled as necessary by relaxation.
+  if (Type == HexagonII::TypeJ)
+    return false;
+  if ((Type == HexagonII::TypeCOMPOUND || Type == HexagonII::TypeNV) &&
+      HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch())
+    return false;
+  // Otherwise loop instructions and other CR insts are handled by relaxation
+  if (Type == HexagonII::TypeCR && MCI.getOpcode() != Hexagon::C4_addipc)
+    return false;
+
+  if (IsHexagonExpr && HexagonMCInstrInfo::mustNotExtend(*Expr))
+    return false;
+  int64_t Value;
+  if (!Expr->evaluateAsAbsolute(Value))
+    return true;
+  int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI);
+  int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI);
+  return (MinValue > Value || Value > MaxValue);
+}
+
+/// True when the Extendable field of \p MCI's TSFlags is non-zero.
+bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask) !=
+         0;
+}
+
+/// True when the Extended field of \p MCI's TSFlags is non-zero.
+bool HexagonMCInstrInfo::isExtended(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask) != 0;
+}
+
+/// True when the FP field of \p MCI's TSFlags is non-zero.
+bool HexagonMCInstrInfo::isFloat(MCInstrInfo const &MCII, MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::FPPos) & HexagonII::FPMask) != 0;
+}
+
+/// True when \p MCI is one of the immediate-extender opcodes
+/// (A4_ext, A4_ext_b, A4_ext_c or A4_ext_g).
+bool HexagonMCInstrInfo::isImmext(MCInst const &MCI) {
+  switch (MCI.getOpcode()) {
+  case Hexagon::A4_ext_b:
+  case Hexagon::A4_ext_c:
+  case Hexagon::A4_ext_g:
+  case Hexagon::A4_ext:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// True when the inner-loop bit is set in the flag operand (operand 0) of the
+/// bundle \p MCI.
+bool HexagonMCInstrInfo::isInnerLoop(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  int64_t const BundleFlags = MCI.getOperand(0).getImm();
+  return (BundleFlags & innerLoopMask) != 0;
+}
+
+/// True when \p Reg lies in the range Hexagon::R0 .. Hexagon::R31.
+bool HexagonMCInstrInfo::isIntReg(unsigned Reg) {
+  if (Reg < Hexagon::R0)
+    return false;
+  return Reg <= Hexagon::R31;
+}
+
+/// True when \p Reg lies in R0..R7 or R16..R23, the single registers accepted
+/// here for sub-instructions.
+bool HexagonMCInstrInfo::isIntRegForSubInst(unsigned Reg) {
+  if (Reg >= Hexagon::R0 && Reg <= Hexagon::R7)
+    return true;
+  return Reg >= Hexagon::R16 && Reg <= Hexagon::R23;
+}
+
+// Return whether the insn is a new-value consumer, i.e. whether the NewValue
+// field of its TSFlags is non-zero.
+bool HexagonMCInstrInfo::isNewValue(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::NewValuePos) & HexagonII::NewValueMask) != 0;
+}
+
+// Return whether the operand can be constant extended, i.e. whether
+// OperandNum equals the extendable-operand field of the insn's TSFlags.
+bool HexagonMCInstrInfo::isOperandExtended(MCInstrInfo const &MCII,
+                                           MCInst const &MCI,
+                                           unsigned short OperandNum) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  auto const ExtendableIndex =
+      (Flags >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask;
+  return ExtendableIndex == OperandNum;
+}
+
+/// True when the outer-loop bit is set in the flag operand (operand 0) of the
+/// bundle \p MCI.
+bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  int64_t const BundleFlags = MCI.getOperand(0).getImm();
+  return (BundleFlags & outerLoopMask) != 0;
+}
+
+/// True when the Predicated field of \p MCI's TSFlags is non-zero.
+bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask) !=
+         0;
+}
+
+/// True when the PredicateLate field of \p MCI's TSFlags is non-zero.
+bool HexagonMCInstrInfo::isPredicateLate(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::PredicateLatePos) &
+          HexagonII::PredicateLateMask) != 0;
+}
+
+/// Return whether the insn is newly predicated, i.e. whether the
+/// PredicatedNew field of its TSFlags is non-zero.
+bool HexagonMCInstrInfo::isPredicatedNew(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::PredicatedNewPos) &
+          HexagonII::PredicatedNewMask) != 0;
+}
+
+/// True when the PredicatedFalse field of \p MCI's TSFlags is zero.
+bool HexagonMCInstrInfo::isPredicatedTrue(MCInstrInfo const &MCII,
+                                          MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::PredicatedFalsePos) &
+          HexagonII::PredicatedFalseMask) == 0;
+}
+
+/// True when \p Reg lies in the range Hexagon::P0 .. Hexagon::P3_0.
+bool HexagonMCInstrInfo::isPredReg(unsigned Reg) {
+  if (Reg < Hexagon::P0)
+    return false;
+  return Reg <= Hexagon::P3_0;
+}
+
+/// True when \p MCI's instruction type is HexagonII::TypePREFIX.
+bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
+  unsigned const Type = HexagonMCInstrInfo::getType(MCII, MCI);
+  return Type == HexagonII::TypePREFIX;
+}
+
+/// True when the Solo field of \p MCI's TSFlags is non-zero.
+bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::SoloPos) & HexagonII::SoloMask) != 0;
+}
+
+/// True when the mem-reorder-disabled bit is set in the flag operand
+/// (operand 0) of the bundle \p MCI.
+bool HexagonMCInstrInfo::isMemReorderDisabled(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  auto const BundleFlags = MCI.getOperand(0).getImm();
+  return (BundleFlags & memReorderDisabledMask) != 0;
+}
+
+/// True when the mem-store-reorder-enabled bit is set in the flag operand
+/// (operand 0) of the bundle \p MCI.
+bool HexagonMCInstrInfo::isMemStoreReorderEnabled(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  auto const BundleFlags = MCI.getOperand(0).getImm();
+  return (BundleFlags & memStoreReorderEnabledMask) != 0;
+}
+
+/// True when \p MCI's opcode is one of the sub-instruction opcodes listed
+/// below (the SA1_*, SL1_*, SL2_*, SS1_* and SS2_* families).
+bool HexagonMCInstrInfo::isSubInstruction(MCInst const &MCI) {
+  switch (MCI.getOpcode()) {
+  // SA1
+  case Hexagon::SA1_addi:
+  case Hexagon::SA1_addrx:
+  case Hexagon::SA1_addsp:
+  case Hexagon::SA1_and1:
+  case Hexagon::SA1_clrf:
+  case Hexagon::SA1_clrfnew:
+  case Hexagon::SA1_clrt:
+  case Hexagon::SA1_clrtnew:
+  case Hexagon::SA1_cmpeqi:
+  case Hexagon::SA1_combine0i:
+  case Hexagon::SA1_combine1i:
+  case Hexagon::SA1_combine2i:
+  case Hexagon::SA1_combine3i:
+  case Hexagon::SA1_combinerz:
+  case Hexagon::SA1_combinezr:
+  case Hexagon::SA1_dec:
+  case Hexagon::SA1_inc:
+  case Hexagon::SA1_seti:
+  case Hexagon::SA1_setin1:
+  case Hexagon::SA1_sxtb:
+  case Hexagon::SA1_sxth:
+  case Hexagon::SA1_tfr:
+  case Hexagon::SA1_zxtb:
+  case Hexagon::SA1_zxth:
+  // SL1
+  case Hexagon::SL1_loadri_io:
+  case Hexagon::SL1_loadrub_io:
+  // SL2
+  case Hexagon::SL2_deallocframe:
+  case Hexagon::SL2_jumpr31:
+  case Hexagon::SL2_jumpr31_f:
+  case Hexagon::SL2_jumpr31_fnew:
+  case Hexagon::SL2_jumpr31_t:
+  case Hexagon::SL2_jumpr31_tnew:
+  case Hexagon::SL2_loadrb_io:
+  case Hexagon::SL2_loadrd_sp:
+  case Hexagon::SL2_loadrh_io:
+  case Hexagon::SL2_loadri_sp:
+  case Hexagon::SL2_loadruh_io:
+  case Hexagon::SL2_return:
+  case Hexagon::SL2_return_f:
+  case Hexagon::SL2_return_fnew:
+  case Hexagon::SL2_return_t:
+  case Hexagon::SL2_return_tnew:
+  // SS1
+  case Hexagon::SS1_storeb_io:
+  case Hexagon::SS1_storew_io:
+  // SS2
+  case Hexagon::SS2_allocframe:
+  case Hexagon::SS2_storebi0:
+  case Hexagon::SS2_storebi1:
+  case Hexagon::SS2_stored_sp:
+  case Hexagon::SS2_storeh_io:
+  case Hexagon::SS2_storew_sp:
+  case Hexagon::SS2_storewi0:
+  case Hexagon::SS2_storewi1:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+
+/// True when the SoloAX field of \p MCI's TSFlags is non-zero.
+bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask) != 0;
+}
+
+/// True when the SoloAin1 field of \p MCI's TSFlags is non-zero.
+bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const Flags = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((Flags >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask) != 0;
+}
+
+/// True when \p MCI's instruction type lies in the inclusive range
+/// HexagonII::TypeCVI_FIRST .. HexagonII::TypeCVI_LAST.
+bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) {
+  unsigned const Type = getType(MCII, MCI);
+  return Type <= HexagonII::TypeCVI_LAST && Type >= HexagonII::TypeCVI_FIRST;
+}
+
+/// Returns the absolute value of the expression operand at \p Index of
+/// \p MCI. When there is no such operand, it is not an expression, or it does
+/// not evaluate to an absolute value, a sentinel of UINT32_MAX << 8 is
+/// returned instead.
+int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) {
+  int64_t const Sentinel =
+      static_cast<int64_t>(std::numeric_limits<uint32_t>::max()) << 8;
+  if (Index >= MCI.size())
+    return Sentinel;
+
+  MCOperand const &Operand = MCI.getOperand(Index);
+  if (!Operand.isExpr())
+    return Sentinel;
+
+  int64_t Value;
+  return Operand.getExpr()->evaluateAsAbsolute(Value) ? Value : Sentinel;
+}
+
+/// Sets the must-extend flag of \p Expr, which must be a HexagonMCExpr, to
+/// \p Val. Constness is cast away to reach the setter.
+void HexagonMCInstrInfo::setMustExtend(MCExpr const &Expr, bool Val) {
+  const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr)).setMustExtend(Val);
+}
+
+/// Reads the must-extend flag of \p Expr, which must be a HexagonMCExpr.
+bool HexagonMCInstrInfo::mustExtend(MCExpr const &Expr) {
+  return cast<HexagonMCExpr>(Expr).mustExtend();
+}
+/// Sets the must-not-extend flag of \p Expr, which must be a HexagonMCExpr,
+/// to \p Val. Constness is cast away to reach the setter.
+void HexagonMCInstrInfo::setMustNotExtend(MCExpr const &Expr, bool Val) {
+  const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr)).setMustNotExtend(Val);
+}
+/// Reads the must-not-extend flag of \p Expr, which must be a HexagonMCExpr.
+bool HexagonMCInstrInfo::mustNotExtend(MCExpr const &Expr) {
+  return cast<HexagonMCExpr>(Expr).mustNotExtend();
+}
+
+/// Appends A2_nop instructions (allocated from \p Context) to the bundle
+/// \p MCB until an inner-loop bundle holds at least
+/// HEXAGON_PACKET_INNER_SIZE instructions and an outer-loop bundle at least
+/// HEXAGON_PACKET_OUTER_SIZE. Bundles with neither loop flag are untouched.
+void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
+  MCInst Nop;
+  Nop.setOpcode(Hexagon::A2_nop);
+  assert(isBundle(MCB));
+
+  auto const NeedsPadding = [&MCB]() -> bool {
+    if (HexagonMCInstrInfo::isInnerLoop(MCB) &&
+        HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)
+      return true;
+    return HexagonMCInstrInfo::isOuterLoop(MCB) &&
+           HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE;
+  };
+
+  while (NeedsPadding())
+    MCB.addOperand(MCOperand::createInst(new (Context) MCInst(Nop)));
+}
+
+/// True when \p MCI is not a TypeCR instruction and its scheduling class is
+/// one of the classes listed below.
+bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  if (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR)
+    return false;
+
+  switch (HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass()) {
+  case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_2_SLOT23:
+  case Hexagon::Sched::ALU64_tc_3x_SLOT23:
+  case Hexagon::Sched::M_tc_2_SLOT23:
+  case Hexagon::Sched::M_tc_3x_SLOT23:
+  case Hexagon::Sched::S_2op_tc_2_SLOT23:
+  case Hexagon::Sched::S_3op_tc_2_SLOT23:
+  case Hexagon::Sched::S_3op_tc_3x_SLOT23:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Replaces the two bundle operands named by \p Candidate with one duplex:
+/// the duplex is derived from the instruction at packetIndexJ (first) and the
+/// one at packetIndexI (second), stored at packetIndexI, and the operand at
+/// packetIndexJ is then erased from \p MCB.
+void HexagonMCInstrInfo::replaceDuplex(MCContext &Context, MCInst &MCB,
+                                       DuplexCandidate Candidate) {
+  assert(Candidate.packetIndexI < MCB.size());
+  assert(Candidate.packetIndexJ < MCB.size());
+  assert(isBundle(MCB));
+  MCInst const &InstJ = *MCB.getOperand(Candidate.packetIndexJ).getInst();
+  MCInst const &InstI = *MCB.getOperand(Candidate.packetIndexI).getInst();
+  MCInst *Duplex = deriveDuplex(Context, Candidate.iClass, InstJ, InstI);
+  assert(Duplex != nullptr);
+  MCB.getOperand(Candidate.packetIndexI).setInst(Duplex);
+  MCB.erase(MCB.begin() + Candidate.packetIndexJ);
+}
+
+/// Sets the inner-loop bit in the flag operand (operand 0) of the bundle
+/// \p MCI.
+void HexagonMCInstrInfo::setInnerLoop(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &BundleFlags = MCI.getOperand(0);
+  BundleFlags.setImm(BundleFlags.getImm() | innerLoopMask);
+}
+
+/// Sets the mem-reorder-disabled bit in the flag operand (operand 0) of the
+/// bundle \p MCI and asserts that it reads back as set.
+void HexagonMCInstrInfo::setMemReorderDisabled(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &BundleFlags = MCI.getOperand(0);
+  BundleFlags.setImm(BundleFlags.getImm() | memReorderDisabledMask);
+  assert(isMemReorderDisabled(MCI));
+}
+
+/// Sets the mem-store-reorder-enabled bit in the flag operand (operand 0) of
+/// the bundle \p MCI and asserts that it reads back as set.
+void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &BundleFlags = MCI.getOperand(0);
+  BundleFlags.setImm(BundleFlags.getImm() | memStoreReorderEnabledMask);
+  assert(isMemStoreReorderEnabled(MCI));
+}
+/// Sets the S23_2 relocation flag of \p Expr, which must be a HexagonMCExpr,
+/// to \p Val. Constness is cast away to reach the setter.
+void HexagonMCInstrInfo::setS23_2_reloc(MCExpr const &Expr, bool Val) {
+  HexagonMCExpr const *Wrapper = llvm::cast<HexagonMCExpr>(&Expr);
+  const_cast<HexagonMCExpr *>(Wrapper)->setS23_2_reloc(Val);
+}
+/// Reads the S23_2 relocation flag of \p Expr, which must be a HexagonMCExpr.
+bool HexagonMCInstrInfo::s23_2_reloc(MCExpr const &Expr) {
+  HexagonMCExpr const *Wrapper = llvm::cast<HexagonMCExpr>(&Expr);
+  return Wrapper->s23_2_reloc();
+}
+
+/// Sets the outer-loop bit in the flag operand (operand 0) of the bundle
+/// \p MCI.
+void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &BundleFlags = MCI.getOperand(0);
+  BundleFlags.setImm(BundleFlags.getImm() | outerLoopMask);
+}
+
+/// Computes the sub-register bit for a consumer/producer pair.
+///
+/// When \p Producer is a double vector register (W0..W15) and \p Consumer a
+/// single vector register (V0..V31), the bit is the parity of the consumer's
+/// offset from V0, selecting the lower or upper half. Otherwise the bit is 1
+/// exactly when \p Consumer equals \p Producer2, and 0 in every other case.
+unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer,
+                                            unsigned Producer,
+                                            unsigned Producer2) {
+  bool const DoubleProducer =
+      Producer >= Hexagon::W0 && Producer <= Hexagon::W15;
+  bool const SingleConsumer =
+      Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31;
+  if (DoubleProducer && SingleConsumer)
+    return (Consumer - Hexagon::V0) & 0x1;
+  return Consumer == Producer2 ? 0x1 : 0;
+}
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
new file mode 100644
index 000000000000..d701c3ade69e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -0,0 +1,302 @@
+//===- HexagonMCInstrInfo.h - Utility functions on Hexagon MCInsts --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility functions for Hexagon specific MCInst queries
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
+
+#include "HexagonMCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+namespace llvm {
+class HexagonMCChecker;
+class MCContext;
+class MCInstrDesc;
+class MCInstrInfo;
+class MCInst;
+class MCOperand;
+class MCSubtargetInfo;
+namespace HexagonII {
+enum class MemAccessSize;
+}
+// Records one possible duplex pairing inside a packet: the positions of the
+// two instructions in the bundle and the iClass value for the pair.
+class DuplexCandidate {
+public:
+  unsigned packetIndexI;
+  unsigned packetIndexJ;
+  unsigned iClass;
+
+  DuplexCandidate(unsigned i, unsigned j, unsigned iClass)
+      : packetIndexI(i), packetIndexJ(j), iClass(iClass) {}
+};
+namespace HexagonMCInstrInfo {
+size_t const innerLoopOffset = 0;
+int64_t const innerLoopMask = 1 << innerLoopOffset;
+
+size_t const outerLoopOffset = 1;
+int64_t const outerLoopMask = 1 << outerLoopOffset;
+
+// Do not reorder memory loads/stores; by default loads/stores are re-ordered
+// and by default loads can be re-ordered.
+size_t const memReorderDisabledOffset = 2;
+int64_t const memReorderDisabledMask = 1 << memReorderDisabledOffset;
+
+// Allow re-ordering of memory stores; by default stores cannot be re-ordered.
+size_t const memStoreReorderEnabledOffset = 3;
+int64_t const memStoreReorderEnabledMask = 1 << memStoreReorderEnabledOffset;
+
+size_t const bundleInstructionsOffset = 1;
+
+void addConstant(MCInst &MI, uint64_t Value, MCContext &Context);
+void addConstExtender(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
+ MCInst const &MCI);
+
+// Returns an iterator range of instructions in this bundle
+iterator_range<MCInst::const_iterator> bundleInstructions(MCInst const &MCI);
+
+// Returns the number of instructions in the bundle
+size_t bundleSize(MCInst const &MCI);
+
+// Put the packet into canonical form: compound, duplex, pad, and shuffle
+bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCContext &Context, MCInst &MCB,
+ HexagonMCChecker *Checker);
+
+// Clamp off upper 26 bits of extendable operand for emission
+void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+
+MCInst createBundle();
+
+// Return the extender for instruction at Index or nullptr if none
+MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
+void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
+ MCInst const &MCI);
+
+// Create a duplex instruction given the two subinsts
+MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
+ MCInst const &inst1);
+MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst,
+ MCOperand const &MO);
+
+// Convert this instruction into a duplex subinst
+MCInst deriveSubInst(MCInst const &Inst);
+
+// Return the extender for instruction at Index or nullptr if none
+MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
+
+// Return memory access size
+HexagonII::MemAccessSize getAccessSize(MCInstrInfo const &MCII,
+ MCInst const &MCI);
+
+// Return number of bits in the constant extended operand.
+unsigned getBitCount(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return constant extended operand number.
+unsigned short getCExtOpNum(MCInstrInfo const &MCII, MCInst const &MCI);
+
+MCInstrDesc const &getDesc(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return which duplex group this instruction belongs to
+unsigned getDuplexCandidateGroup(MCInst const &MI);
+
+// Return a list of all possible instruction duplex combinations
+SmallVector<DuplexCandidate, 8> getDuplexPossibilties(MCInstrInfo const &MCII,
+ MCInst const &MCB);
+unsigned getDuplexRegisterNumbering(unsigned Reg);
+
+MCExpr const &getExpr(MCExpr const &Expr);
+
+// Return the index of the extendable operand
+unsigned short getExtendableOp(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return a reference to the extendable operand
+MCOperand const &getExtendableOperand(MCInstrInfo const &MCII,
+ MCInst const &MCI);
+
+// Return the implicit alignment of the extendable operand
+unsigned getExtentAlignment(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the number of logical bits of the extendable operand
+unsigned getExtentBits(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the max value that a constant extendable operand can have
+// without being extended.
+int getMaxValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the min value that a constant extendable operand can have
+// without being extended.
+int getMinValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return instruction name
+StringRef getName(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the operand index for the new value.
+unsigned short getNewValueOp(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the operand that consumes or produces a new value.
+MCOperand const &getNewValueOperand(MCInstrInfo const &MCII, MCInst const &MCI);
+unsigned short getNewValueOp2(MCInstrInfo const &MCII, MCInst const &MCI);
+MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII,
+ MCInst const &MCI);
+
+int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the Hexagon ISA class for the insn.
+unsigned getType(MCInstrInfo const &MCII, MCInst const &MCI);
+
+/// Return the slots used by the insn.
+unsigned getUnits(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst const &MCI);
+
+// Does the packet have an extender for the instruction at Index
+bool hasExtenderForIndex(MCInst const &MCB, size_t Index);
+
+bool hasImmExt(MCInst const &MCI);
+
+// Return whether the instruction is a legal new-value producer.
+bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the instruction at Index
+MCInst const &instruction(MCInst const &MCB, size_t Index);
+
+// Returns whether this MCInst is a wellformed bundle
+bool isBundle(MCInst const &MCI);
+
+// Return whether the insn is an actual insn.
+bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the duplex iclass given the two duplex classes
+unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb);
+
+int64_t minConstant(MCInst const &MCI, size_t Index);
+// True if the value minConstant() reports for operand Index of MCI satisfies
+// isShiftedUInt<N, S>.
+template <unsigned N, unsigned S>
+bool inRange(MCInst const &MCI, size_t Index) {
+  int64_t const Value = minConstant(MCI, Index);
+  return isShiftedUInt<N, S>(Value);
+}
+// True if the value minConstant() reports for operand Index of MCI satisfies
+// isShiftedInt<N, S>.
+template <unsigned N, unsigned S>
+bool inSRange(MCInst const &MCI, size_t Index) {
+  int64_t const Value = minConstant(MCI, Index);
+  return isShiftedInt<N, S>(Value);
+}
+// True if the value minConstant() reports for operand Index of MCI satisfies
+// isUInt<N>.
+template <unsigned N> bool inRange(MCInst const &MCI, size_t Index) {
+  int64_t const Value = minConstant(MCI, Index);
+  return isUInt<N>(Value);
+}
+
+// Return whether the instruction needs to be constant extended.
+bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Is this double register suitable for use in a duplex subinst
+bool isDblRegForSubInst(unsigned Reg);
+
+// Is this a duplex instruction
+bool isDuplex(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Can these instructions be duplexed
+bool isDuplexPair(MCInst const &MIa, MCInst const &MIb);
+
+// Can these duplex classes be combined into a duplex instruction
+bool isDuplexPairMatch(unsigned Ga, unsigned Gb);
+
+// Return true if the insn may be extended based on the operand value.
+bool isExtendable(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the instruction must be always extended.
+bool isExtended(MCInstrInfo const &MCII, MCInst const &MCI);
+
+/// Return whether it is a floating-point insn.
+bool isFloat(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Returns whether this instruction is an immediate extender
+bool isImmext(MCInst const &MCI);
+
+// Returns whether this bundle is an endloop0
+bool isInnerLoop(MCInst const &MCI);
+
+// Is this an integer register
+bool isIntReg(unsigned Reg);
+
+// Is this register suitable for use in a duplex subinst
+bool isIntRegForSubInst(unsigned Reg);
+bool isMemReorderDisabled(MCInst const &MCI);
+bool isMemStoreReorderEnabled(MCInst const &MCI);
+
+// Return whether the insn is a new-value consumer.
+bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return true if the operand can be constant extended.
+bool isOperandExtended(MCInstrInfo const &MCII, MCInst const &MCI,
+ unsigned short OperandNum);
+
+// Can these two instructions be duplexed
+bool isOrderedDuplexPair(MCInstrInfo const &MCII, MCInst const &MIa,
+ bool ExtendedA, MCInst const &MIb, bool ExtendedB,
+ bool bisReversable);
+
+// Returns whether this bundle is an endloop1
+bool isOuterLoop(MCInst const &MCI);
+
+// Return whether this instruction is predicated
+bool isPredicated(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isPredicateLate(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isPredicatedNew(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the predicate sense is true
+bool isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Is this a predicate register
+bool isPredReg(unsigned Reg);
+
+// Return whether the insn is a prefix.
+bool isPrefix(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the insn is solo, i.e., cannot be in a packet.
+bool isSolo(MCInstrInfo const &MCII, MCInst const &MCI);
+
+/// Return whether the insn can be packaged only with A and X-type insns.
+bool isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI);
+
+/// Return whether the insn can be packaged only with an A-type insn in slot #1.
+bool isSoloAin1(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isSubInstruction(MCInst const &MCI);
+bool isVector(MCInstrInfo const &MCII, MCInst const &MCI);
+bool mustExtend(MCExpr const &Expr);
+bool mustNotExtend(MCExpr const &Expr);
+
+// Pad the bundle with nops to satisfy endloop requirements
+void padEndloop(MCContext &Context, MCInst &MCI);
+
+bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Replace the instructions inside MCB, represented by Candidate
+void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate);
+
+bool s23_2_reloc(MCExpr const &Expr);
+// Marks a bundle as endloop0
+void setInnerLoop(MCInst &MCI);
+void setMemReorderDisabled(MCInst &MCI);
+void setMemStoreReorderEnabled(MCInst &MCI);
+void setMustExtend(MCExpr const &Expr, bool Val = true);
+void setMustNotExtend(MCExpr const &Expr, bool Val = true);
+void setS23_2_reloc(MCExpr const &Expr, bool Val = true);
+
+// Marks a bundle as endloop1
+void setOuterLoop(MCInst &MCI);
+
+// Would duplexing this instruction create a requirement to extend
+bool subInstWouldBeExtended(MCInst const &potentialDuplex);
+unsigned SubregisterBit(unsigned Consumer, unsigned Producer,
+ unsigned Producer2);
+
+// Attempt to find and replace compound pairs
+void tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+}
+}
+
+#endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
new file mode 100644
index 000000000000..7f8e7a4edb0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -0,0 +1,236 @@
+//===----- HexagonMCShuffler.cpp - MC bundle shuffling --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-shuffle"
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Hidden command-line switch, off by default. When set, every HexagonMCShuffle
+// entry point in this file returns early without reordering the bundle.
+static cl::opt<bool>
+ DisableShuffle("disable-hexagon-shuffle", cl::Hidden, cl::init(false),
+ cl::desc("Disable Hexagon instruction shuffling"));
+
+// Load the insns of bundle MCB into the shuffler and remember the bundle's
+// flags (operand 0). An immediate extender is not appended on its own; it is
+// handed to append() together with the insn that follows it.
+void HexagonMCShuffler::init(MCInst &MCB) {
+  if (HexagonMCInstrInfo::isBundle(MCB)) {
+    // Extender waiting for the insn it belongs to.
+    MCInst const *PendingExt = nullptr;
+    for (auto const &Op : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      MCInst *Insn = const_cast<MCInst *>(Op.getInst());
+      assert(!HexagonMCInstrInfo::getDesc(MCII, *Insn).isPseudo());
+
+      if (HexagonMCInstrInfo::isImmext(*Insn)) {
+        PendingExt = Insn;
+        continue;
+      }
+      unsigned const Units = HexagonMCInstrInfo::getUnits(MCII, STI, *Insn);
+      append(Insn, PendingExt, Units, false);
+      PendingExt = nullptr;
+    }
+  }
+
+  BundleFlags = MCB.getOperand(0).getImm();
+}
+
+// Load the insns of bundle MCB into the shuffler together with the extra insn
+// AddMI (ignored when null). AddMI is appended before the bundle's insns when
+// bInsertAtFront is set and after them otherwise. Immediate extenders are
+// handed to append() with the insn that follows them. Finally the bundle's
+// flags (operand 0) are recorded.
+void HexagonMCShuffler::init(MCInst &MCB, MCInst const *AddMI,
+                             bool bInsertAtFront) {
+  if (HexagonMCInstrInfo::isBundle(MCB)) {
+    bool const AddFirst = bInsertAtFront && AddMI;
+    bool const AddLast = !bInsertAtFront && AddMI;
+
+    if (AddFirst)
+      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
+             false);
+
+    // Extender waiting for the insn it belongs to.
+    MCInst const *PendingExt = nullptr;
+    for (auto const &Op : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      MCInst *Insn = const_cast<MCInst *>(Op.getInst());
+      assert(!HexagonMCInstrInfo::getDesc(MCII, *Insn).isPseudo());
+      if (HexagonMCInstrInfo::isImmext(*Insn)) {
+        PendingExt = Insn;
+        continue;
+      }
+      unsigned const Units = HexagonMCInstrInfo::getUnits(MCII, STI, *Insn);
+      append(Insn, PendingExt, Units, false);
+      PendingExt = nullptr;
+    }
+
+    if (AddLast)
+      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
+             false);
+  }
+
+  BundleFlags = MCB.getOperand(0).getImm();
+}
+
+// Rebuild MCB from the shuffler's packet: operand 0 receives the saved bundle
+// flags, then each insn is added in packet order, preceded by its extender
+// when it has one.
+void HexagonMCShuffler::copyTo(MCInst &MCB) {
+  MCB.clear();
+  MCB.addOperand(MCOperand::createImm(BundleFlags));
+
+  for (HexagonShuffler::iterator Slot = begin(); Slot != end(); ++Slot) {
+    if (MCInst const *Ext = Slot->getExtender())
+      MCB.addOperand(MCOperand::createInst(Ext));
+    MCB.addOperand(MCOperand::createInst(Slot->getDesc()));
+  }
+}
+
+// Run the shuffler and, on success, write the reordered packet into MCB. On
+// failure MCB is left untouched and dumped in debug output. Returns true when
+// no shuffle error is recorded.
+bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) {
+  bool const Shuffled = shuffle();
+  if (!Shuffled) {
+    DEBUG(MCB.dump());
+  } else {
+    // Copy the results into the bundle.
+    copyTo(MCB);
+  }
+
+  return !getError();
+}
+
+// Reorder the insns of bundle MCB in place.
+//
+// Returns false when shuffling is disabled on the command line, when the
+// bundle is empty, or when MCB is not a bundle at all. Returns true once the
+// shuffler has processed the bundle; a shuffle error at this point is
+// treated as unreachable.
+bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                            MCInst &MCB) {
+  if (DisableShuffle)
+    // Ignore if user chose so.
+    return false;
+
+  if (!HexagonMCInstrInfo::bundleSize(MCB)) {
+    // There once was a bundle:
+    //    BUNDLE %D2<imp-def>, %R4<imp-def>, %R5<imp-def>, %D7<imp-def>, ...
+    //      * %D2<def> = IMPLICIT_DEF; flags:
+    //      * %D7<def> = IMPLICIT_DEF; flags:
+    // After the IMPLICIT_DEFs were removed by the asm printer, the bundle
+    // became empty.
+    DEBUG(dbgs() << "Skipping empty bundle");
+    return false;
+  } else if (!HexagonMCInstrInfo::isBundle(MCB)) {
+    DEBUG(dbgs() << "Skipping stand-alone insn");
+    return false;
+  }
+
+  // Build the shuffler only after the guards above. Its initialization reads
+  // operand 0 of MCB as the bundle-flags immediate, which is only meaningful
+  // for a real bundle, so constructing it earlier would touch a stand-alone
+  // insn before the guard could reject it.
+  HexagonMCShuffler MCS(MCII, STI, MCB);
+
+  // Reorder the bundle and copy the result.
+  if (!MCS.reshuffleTo(MCB)) {
+    // Unless there is any error, which should not happen at this point.
+    unsigned shuffleError = MCS.getError();
+    switch (shuffleError) {
+    default:
+      llvm_unreachable("unknown error");
+    case HexagonShuffler::SHUFFLE_ERROR_INVALID:
+      llvm_unreachable("invalid packet");
+    case HexagonShuffler::SHUFFLE_ERROR_STORES:
+      llvm_unreachable("too many stores");
+    case HexagonShuffler::SHUFFLE_ERROR_LOADS:
+      llvm_unreachable("too many loads");
+    case HexagonShuffler::SHUFFLE_ERROR_BRANCHES:
+      llvm_unreachable("too many branches");
+    case HexagonShuffler::SHUFFLE_ERROR_NOSLOTS:
+      llvm_unreachable("no suitable slot");
+    case HexagonShuffler::SHUFFLE_ERROR_SLOTS:
+      llvm_unreachable("over-subscribed slots");
+    case HexagonShuffler::SHUFFLE_SUCCESS: // Single instruction case.
+      return true;
+    }
+  }
+
+  return true;
+}
+
+// Shuffle bundle MCB, trying the given duplex candidates first.
+//
+// Candidates are taken from the back of possibleDuplexes. Each is applied to
+// a copy of MCB; if that copy shrinks to a single insn or shuffles
+// successfully, the result is written to MCB and SHUFFLE_SUCCESS is
+// returned. When no candidate works, MCB is shuffled as is and the
+// shuffler's error code is returned on failure. Disabled shuffling, an empty
+// bundle and a stand-alone insn all report SHUFFLE_SUCCESS without changes.
+unsigned
+llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                       MCContext &Context, MCInst &MCB,
+                       SmallVector<DuplexCandidate, 8> possibleDuplexes) {
+
+  if (DisableShuffle)
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+
+  if (!HexagonMCInstrInfo::bundleSize(MCB)) {
+    // There once was a bundle:
+    //    BUNDLE %D2<imp-def>, %R4<imp-def>, %R5<imp-def>, %D7<imp-def>, ...
+    //      * %D2<def> = IMPLICIT_DEF; flags:
+    //      * %D7<def> = IMPLICIT_DEF; flags:
+    // After the IMPLICIT_DEFs were removed by the asm printer, the bundle
+    // became empty.
+    DEBUG(dbgs() << "Skipping empty bundle");
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+  }
+  if (!HexagonMCInstrInfo::isBundle(MCB)) {
+    DEBUG(dbgs() << "Skipping stand-alone insn");
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+  }
+
+  while (!possibleDuplexes.empty()) {
+    // case of Duplex Found
+    DuplexCandidate Candidate = possibleDuplexes.pop_back_val();
+    MCInst Attempt(MCB);
+    HexagonMCInstrInfo::replaceDuplex(Context, Attempt, Candidate);
+    HexagonMCShuffler Trial(MCII, STI, Attempt); // copy packet to the shuffler
+    if (Trial.size() == 1) { // case of one duplex
+      // copy the created duplex in the shuffler to the bundle
+      Trial.copyTo(MCB);
+      return HexagonShuffler::SHUFFLE_SUCCESS;
+    }
+    // try shuffle with this duplex
+    if (Trial.reshuffleTo(MCB))
+      return HexagonShuffler::SHUFFLE_SUCCESS;
+  }
+
+  // No duplex candidate worked: shuffle the packet unchanged.
+  HexagonMCShuffler Plain(MCII, STI, MCB);
+  if (Plain.reshuffleTo(MCB))
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+
+  return Plain.getError();
+}
+
+// Try to add the insn AddMI to bundle MCB and reshuffle the result into MCB.
+//
+// Returns false without touching MCB when MCB is not a bundle, AddMI is null,
+// the bundle is already full, the fixup count leaves no room, or shuffling
+// is disabled. Otherwise returns whether the shuffle succeeded.
+bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                            MCInst &MCB, MCInst const *AddMI, int fixupCount) {
+  if (!HexagonMCInstrInfo::isBundle(MCB) || !AddMI)
+    return false;
+
+  // if fixups present, make sure we don't insert too many nops that would
+  // later prevent an extender from being inserted.
+  unsigned int NumInsns = HexagonMCInstrInfo::bundleSize(MCB);
+  if (NumInsns >= HEXAGON_PACKET_SIZE)
+    return false;
+  if (fixupCount >= 2)
+    return false;
+  if (NumInsns == HEXAGON_PACKET_SIZE - 1 && fixupCount)
+    return false;
+
+  if (DisableShuffle)
+    return false;
+
+  HexagonMCShuffler MCS(MCII, STI, MCB, AddMI);
+  if (MCS.reshuffleTo(MCB))
+    return true;
+
+  // single instruction case reports SHUFFLE_SUCCESS; anything else failed.
+  return MCS.getError() == HexagonShuffler::SHUFFLE_SUCCESS;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
new file mode 100644
index 000000000000..a21cce1fc240
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
@@ -0,0 +1,65 @@
+//=-- HexagonMCShuffler.h ---------------------------------------------------=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This declares the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCSHUFFLER_H
+#define HEXAGONMCSHUFFLER_H
+
+#include "MCTargetDesc/HexagonShuffler.h"
+
+namespace llvm {
+
+class MCInst;
+
+// Insn bundle shuffler.
+// Insn bundle shuffler.
+//
+// Adapts HexagonShuffler to MC-layer bundles: the constructors load the insns
+// of a bundle MCInst into the shuffler, and copyTo()/reshuffleTo() write the
+// shuffler's packet back into a bundle.
+class HexagonMCShuffler : public HexagonShuffler {
+  // Neither flag is assigned by the constructors or by init(), so both get a
+  // defined default here; without it the accessors below would return
+  // indeterminate values.
+  bool immext_present = false;
+  bool duplex_present = false;
+
+public:
+  // Load the insns of bundle MCB into the shuffler.
+  HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                    MCInst &MCB)
+      : HexagonShuffler(MCII, STI) {
+    init(MCB);
+  }
+  // Load the insns of bundle MCB plus the extra insn AddMI, which is placed
+  // before the bundle's insns when bInsertAtFront is set and after them
+  // otherwise.
+  HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                    MCInst &MCB, const MCInst *AddMI,
+                    bool bInsertAtFront = false)
+      : HexagonShuffler(MCII, STI) {
+    init(MCB, AddMI, bInsertAtFront);
+  }
+
+  // Copy reordered bundle to another.
+  void copyTo(MCInst &MCB);
+  // Reorder and copy result to another.
+  bool reshuffleTo(MCInst &MCB);
+
+  bool immextPresent() const { return immext_present; }
+  bool duplexPresent() const { return duplex_present; }
+
+private:
+  void init(MCInst &MCB);
+  void init(MCInst &MCB, const MCInst *AddMI, bool bInsertAtFront = false);
+};
+
+// Invocation of the shuffler.
+bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst &);
+bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst &, const MCInst *, int);
+unsigned HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCContext &Context, MCInst &,
+ SmallVector<DuplexCandidate, 8>);
+}
+
+#endif // HEXAGONMCSHUFFLER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
new file mode 100644
index 000000000000..694cf582f8d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -0,0 +1,278 @@
+//===-- HexagonMCTargetDesc.cpp - Hexagon Target Descriptions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Hexagon specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonTargetStreamer.h"
+#include "MCTargetDesc/HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCAsmInfo.h"
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <cstdint>
+#include <new>
+#include <string>
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "HexagonGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "HexagonGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "HexagonGenRegisterInfo.inc"
+
+// Command-line switch "mno-compound": disables the search for compound
+// instructions.
+cl::opt<bool> llvm::HexagonDisableCompound
+ ("mno-compound",
+ cl::desc("Disable looking for compound instructions for Hexagon"));
+
+// Command-line switch "mno-pairing": disables the search for duplex
+// instructions.
+cl::opt<bool> llvm::HexagonDisableDuplex
+ ("mno-pairing",
+ cl::desc("Disable looking for duplex instructions for Hexagon"));
+
+// Hidden per-architecture switches, all off by default. HexagonGetArchVariant()
+// maps the first one that is set to its CPU name.
+static cl::opt<bool> HexagonV4ArchVariant("mv4", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V4"));
+
+static cl::opt<bool> HexagonV5ArchVariant("mv5", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V5"));
+
+static cl::opt<bool> HexagonV55ArchVariant("mv55", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V55"));
+
+static cl::opt<bool> HexagonV60ArchVariant("mv60", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V60"));
+
+// CPU name used by selectHexagonCPU() when neither a CPU nor an architecture
+// switch was given.
+static StringRef DefaultArch = "hexagonv60";
+
+// Returns the CPU name selected by the -mv4/-mv5/-mv55/-mv60 switches, checked
+// in that order, or an empty string when none of them is set.
+static StringRef HexagonGetArchVariant() {
+  StringRef Selected = "";
+  if (HexagonV4ArchVariant)
+    Selected = "hexagonv4";
+  else if (HexagonV5ArchVariant)
+    Selected = "hexagonv5";
+  else if (HexagonV55ArchVariant)
+    Selected = "hexagonv55";
+  else if (HexagonV60ArchVariant)
+    Selected = "hexagonv60";
+  return Selected;
+}
+
+// Chooses the CPU name to build for.
+//
+// With no architecture switch, the requested CPU is used, falling back to
+// DefaultArch when it is empty. With an architecture switch and no CPU, the
+// switch decides. When both are given they must agree, otherwise a fatal
+// error is reported.
+StringRef Hexagon_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) {
+  StringRef const Variant = HexagonGetArchVariant();
+
+  if (Variant.empty())
+    return CPU.empty() ? DefaultArch : CPU;
+
+  if (CPU.empty())
+    return Variant;
+
+  if (Variant != CPU)
+    report_fatal_error("conflicting architectures specified.");
+  return CPU;
+}
+
+// Allocates an MCInstrInfo and fills it in via InitHexagonMCInstrInfo. The
+// caller receives the raw pointer.
+MCInstrInfo *llvm::createHexagonMCInstrInfo() {
+  MCInstrInfo *InstrInfo = new MCInstrInfo();
+  InitHexagonMCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+// Allocates an MCRegisterInfo and fills it in via InitHexagonMCRegisterInfo.
+// NOTE(review): R31 is presumably passed as the return-address register --
+// confirm against the generated initializer. TT is unused.
+static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  InitHexagonMCRegisterInfo(RegInfo, Hexagon::R31);
+  return RegInfo;
+}
+
+// Creates the subtarget info for the CPU that selectHexagonCPU() picks from
+// the requested CPU and the architecture switches.
+static MCSubtargetInfo *
+createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  StringRef const Selected = Hexagon_MC::selectHexagonCPU(TT, CPU);
+  return createHexagonMCSubtargetInfoImpl(TT, Selected, FS);
+}
+
+namespace {
+
+// Target streamer for textual assembly output. It prints a bundle as a packet
+// wrapped in "{" ... "}" with one instruction per line.
+class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
+public:
+ // The output stream, the bool flag and the instruction printer are accepted
+ // but not used; only the streamer is passed on to the base class.
+ HexagonTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &, bool,
+ MCInstPrinter &)
+ : HexagonTargetStreamer(S) {}
+
+ // Print the bundle Inst to OS as a brace-delimited packet. Inst must be a
+ // bundle of at most HEXAGON_PACKET_SIZE instructions.
+ void prettyPrintAsm(MCInstPrinter &InstPrinter, raw_ostream &OS,
+ const MCInst &Inst, const MCSubtargetInfo &STI) override {
+ assert(HexagonMCInstrInfo::isBundle(Inst));
+ assert(HexagonMCInstrInfo::bundleSize(Inst) <= HEXAGON_PACKET_SIZE);
+ std::string Buffer;
+ {
+ // Scoped so the stream is destroyed before Buffer is read (presumably
+ // flushing any pending text into Buffer -- confirm).
+ raw_string_ostream TempStream(Buffer);
+ InstPrinter.printInst(&Inst, TempStream, "", STI);
+ }
+ StringRef Contents(Buffer);
+ // Split off the text after the last newline; it is emitted after the
+ // closing brace.
+ auto PacketBundle = Contents.rsplit('\n');
+ // HeadTail.first is the current line, HeadTail.second the remaining text.
+ auto HeadTail = PacketBundle.first.split('\n');
+ StringRef Separator = "\n";
+ StringRef Indent = "\t\t";
+ OS << "\t{\n";
+ // Walk the printed text line by line; the first empty line ends the walk.
+ while (!HeadTail.first.empty()) {
+ StringRef InstTxt;
+ // A vertical tab splits one printed line into two output lines.
+ auto Duplex = HeadTail.first.split('\v');
+ if (!Duplex.second.empty()) {
+ OS << Indent << Duplex.first << Separator;
+ InstTxt = Duplex.second;
+ } else if (!HeadTail.first.trim().startswith("immext")) {
+ // Lines that start with "immext" are left out of the output.
+ InstTxt = Duplex.first;
+ }
+ if (!InstTxt.empty())
+ OS << Indent << InstTxt << Separator;
+ HeadTail = HeadTail.second.split('\n');
+ }
+ OS << "\t}" << PacketBundle.second;
+ }
+};
+
+// Target streamer for ELF object output. On construction it records the
+// Hexagon architecture in the ELF header e_flags; the two sorted
+// common-symbol hooks are forwarded to the HexagonMCELFStreamer.
+class HexagonTargetELFStreamer : public HexagonTargetStreamer {
+public:
+  HexagonTargetELFStreamer(MCStreamer &S, MCSubtargetInfo const &STI)
+      : HexagonTargetStreamer(S) {
+    auto const Features = STI.getFeatureBits();
+    // The newest architecture present in the feature set wins; the flags stay
+    // 0 when none of the architecture bits is set.
+    unsigned const MachFlags =
+        Features[Hexagon::ArchV60]   ? ELF::EF_HEXAGON_MACH_V60
+        : Features[Hexagon::ArchV55] ? ELF::EF_HEXAGON_MACH_V55
+        : Features[Hexagon::ArchV5]  ? ELF::EF_HEXAGON_MACH_V5
+        : Features[Hexagon::ArchV4]  ? ELF::EF_HEXAGON_MACH_V4
+                                     : 0;
+    getStreamer().getAssembler().setELFHeaderEFlags(MachFlags);
+  }
+
+  // The underlying streamer, viewed as the ELF streamer.
+  MCELFStreamer &getStreamer() {
+    return static_cast<MCELFStreamer &>(Streamer);
+  }
+
+  void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+                              unsigned ByteAlignment,
+                              unsigned AccessSize) override {
+    auto &ELFStreamer = static_cast<HexagonMCELFStreamer &>(getStreamer());
+    ELFStreamer.HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment,
+                                          AccessSize);
+  }
+
+  void EmitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+                                   unsigned ByteAlignment,
+                                   unsigned AccessSize) override {
+    auto &ELFStreamer = static_cast<HexagonMCELFStreamer &>(getStreamer());
+    ELFStreamer.HexagonMCEmitLocalCommonSymbol(Symbol, Size, ByteAlignment,
+                                               AccessSize);
+  }
+};
+
+} // end anonymous namespace
+
+// Creates the Hexagon MCAsmInfo for triple TT and registers its initial frame
+// state: a def-CFA instruction on R30 with offset 0. MRI is unused.
+static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
+                                         const Triple &TT) {
+  MCAsmInfo *AsmInfo = new HexagonMCAsmInfo(TT);
+
+  // VirtualFP = (R30 + #0).
+  AsmInfo->addInitialFrameState(
+      MCCFIInstruction::createDefCfa(nullptr, Hexagon::R30, 0));
+
+  return AsmInfo;
+}
+
+// Creates the Hexagon instruction printer. Only syntax variant 0 exists; any
+// other variant yields nullptr. T is unused.
+static MCInstPrinter *createHexagonMCInstPrinter(const Triple &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI) {
+  if (SyntaxVariant != 0)
+    return nullptr;
+  return new HexagonInstPrinter(MAI, MII, MRI);
+}
+
+// Creates the target streamer used for textual assembly output. InstPrint is
+// dereferenced, so it must not be null.
+static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
+                                                   formatted_raw_ostream &OS,
+                                                   MCInstPrinter *InstPrint,
+                                                   bool IsVerboseAsm) {
+  MCInstPrinter &Printer = *InstPrint;
+  return new HexagonTargetAsmStreamer(S, OS, IsVerboseAsm, Printer);
+}
+
+// Creates the Hexagon ELF object streamer. T and RelaxAll are not used.
+static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context,
+                                    MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter, bool RelaxAll) {
+  MCStreamer *Streamer = createHexagonELFStreamer(Context, MAB, OS, Emitter);
+  return Streamer;
+}
+
+// Creates the target streamer used for ELF object output.
+static MCTargetStreamer *
+createHexagonObjectTargetStreamer(MCStreamer &S, MCSubtargetInfo const &STI) {
+  MCTargetStreamer *TargetStreamer = new HexagonTargetELFStreamer(S, STI);
+  return TargetStreamer;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeHexagonTargetMC() {
+  // Every factory below is registered against the one Hexagon target.
+  Target &HexagonTarget = getTheHexagonTarget();
+
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(HexagonTarget, createHexagonMCAsmInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(HexagonTarget, createHexagonMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(HexagonTarget,
+                                    createHexagonMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(HexagonTarget,
+                                          createHexagonMCSubtargetInfo);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(HexagonTarget,
+                                        createHexagonMCCodeEmitter);
+
+  // Register the asm backend
+  TargetRegistry::RegisterMCAsmBackend(HexagonTarget, createHexagonAsmBackend);
+
+  // Register the obj streamer
+  TargetRegistry::RegisterELFStreamer(HexagonTarget, createMCStreamer);
+
+  // Register the asm streamer
+  TargetRegistry::RegisterAsmTargetStreamer(HexagonTarget,
+                                            createMCAsmTargetStreamer);
+
+  // Register the MC Inst Printer
+  TargetRegistry::RegisterMCInstPrinter(HexagonTarget,
+                                        createHexagonMCInstPrinter);
+
+  // Register the object target streamer
+  TargetRegistry::RegisterObjectTargetStreamer(
+      HexagonTarget, createHexagonObjectTargetStreamer);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
new file mode 100644
index 000000000000..6e677e9d9f86
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -0,0 +1,79 @@
+//===-- HexagonMCTargetDesc.h - Hexagon Target Descriptions -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Hexagon specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
+
+#include "llvm/Support/CommandLine.h"
+#include <cstdint>
+
+namespace llvm {
+
+struct InstrItinerary;
+struct InstrStage;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class Target;
+class Triple;
+class StringRef;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheHexagonTarget();
+extern cl::opt<bool> HexagonDisableCompound;
+extern cl::opt<bool> HexagonDisableDuplex;
+extern const InstrStage HexagonStages[];
+
+MCInstrInfo *createHexagonMCInstrInfo();
+
+MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &MCT);
+
+MCAsmBackend *createHexagonAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS,
+ uint8_t OSABI, StringRef CPU);
+
+namespace Hexagon_MC {
+
+ StringRef selectHexagonCPU(const Triple &TT, StringRef CPU);
+
+} // end namespace Hexagon_MC
+
+} // end namespace llvm
+
+// Define symbolic names for Hexagon registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "HexagonGenRegisterInfo.inc"
+
+// Defines symbolic names for the Hexagon instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "HexagonGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "HexagonGenSubtargetInfo.inc"
+
+#endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
new file mode 100644
index 000000000000..88f37d620dcf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -0,0 +1,463 @@
+//===----- HexagonShuffler.cpp - Instruction bundle shuffling -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-shuffle"
+
+#include <algorithm>
+#include <utility>
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "HexagonShuffler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+// Insn shuffling priority.
+class HexagonBid {
+ // The priority is directly proportional to how restricted the insn is based
+ // on its flexibility to run on the available slots. So, the fewer slots it
+ // may run on, the higher its priority.
+ enum { MAX = 360360 }; // LCD of 1/2, 1/3, 1/4,... 1/15, so MAX / N is exact for N = 1..15.
+ unsigned Bid; // Sum of the bids placed so far; see isSold().
+
+public:
+ HexagonBid() : Bid(0){}; // Starts with nothing bid.
+ HexagonBid(unsigned B) { Bid = B ? MAX / countPopulation(B) : 0; }; // B is a slot mask: bid MAX / (number of set bits), or 0 for an empty mask.
+
+ // Check whether the accumulated bids have reached MAX.
+ bool isSold() const { return (Bid >= MAX); };
+
+ HexagonBid &operator+=(const HexagonBid &B) { // Add B's bid to this one.
+ Bid += B.Bid;
+ return *this;
+ };
+};
+
+// Slot shuffling allocation.
+class HexagonUnitAuction {
+ HexagonBid Scores[HEXAGON_PACKET_SIZE]; // Accumulated bids, one entry per slot.
+ // Mask indicating which slot is unavailable.
+ unsigned isSold : HEXAGON_PACKET_SIZE;
+
+public:
+ HexagonUnitAuction() : isSold(0){}; // No slot is sold initially.
+
+ // Bid slot mask B on each of its still-unsold slots; false if none remain.
+ bool bid(unsigned B) {
+ // Exclude already auctioned slots from the bid.
+ unsigned b = B & ~isSold;
+ if (b) {
+ for (unsigned i = 0; i < HEXAGON_PACKET_SIZE; ++i)
+ if (b & (1 << i)) {
+ // Request candidate slots.
+ Scores[i] += HexagonBid(b); // The bid size is derived from the remaining mask b, not from B.
+ isSold |= Scores[i].isSold() << i; // Mark slot i as sold once its score reports isSold().
+ }
+ return true;
+ ;
+ } else
+ // Error if the desired slots are already full.
+ return false;
+ };
+};
+} // end anonymous namespace
+
+unsigned HexagonResource::setWeight(unsigned s) { // s is a slot number, not a mask; returns the stored weight.
+ const unsigned SlotWeight = 8; // Each slot number shifts the weight by this many bits.
+ const unsigned MaskWeight = SlotWeight - 1;
+ const unsigned Units = getUnits();
+
+ // TODO: Improve this API so that we can prevent misuse statically.
+ assert(SlotWeight * s < 32 && "Argument to setWeight too large.");
+
+ // Calculate relative weight of the insn for the given slot, weighing it the
+ // heavier the more restrictive the insn is and the lower the slots that the
+ // insn may be executed in. An insn that cannot run in slot s weighs 0; this
+ // also keeps an empty mask away from the countTrailingZeros shift below.
+ Weight = !(Units & (1u << s)) ? 0 : (1u << (SlotWeight * s)) *
+ ((MaskWeight - countPopulation(Units)) << countTrailingZeros(Units));
+ return (Weight);
+}
+
+void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) { // Fill *TUL with a (unit mask, lane count) pair per HVX insn type; CPU is not consulted here.
+ (*TUL)[HexagonII::TypeCVI_VA] =
+ UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2);
+ (*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2);
+ (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1);
+ (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2);
+ (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1);
+ (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_LD] =
+ UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0); // Empty unit mask and zero lanes.
+ (*TUL)[HexagonII::TypeCVI_VM_CUR_LD] =
+ UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_ST] =
+ UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0); // Empty unit mask and zero lanes.
+ (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1);
+ (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4);
+}
+
+HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL,
+ MCInstrInfo const &MCII, unsigned s,
+ MCInst const *id)
+ : HexagonResource(s), TUL(TUL) { // s seeds the base unit mask; both branches below overwrite it via setUnits().
+ unsigned T = HexagonMCInstrInfo::getType(MCII, *id); // Insn type, used as the key into *TUL.
+
+ if (TUL->count(T)) {
+ // For an HVX insn: units and lanes come from the table, load/store from the insn description.
+ Valid = true;
+ setUnits((*TUL)[T].first);
+ setLanes((*TUL)[T].second);
+ setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad());
+ setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore());
+ } else {
+ // For core insns: no HVX units, no lanes, and the resource is marked invalid.
+ Valid = false;
+ setUnits(0);
+ setLanes(0);
+ setLoad(false);
+ setStore(false);
+ }
+}
+
+HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI)
+ : MCII(MCII), STI(STI) { // Both are kept as references, not copied.
+ reset(); // Start with an empty packet and no error.
+ HexagonCVIResource::SetupTUL(&TUL, STI.getCPU()); // Populate the HVX type -> (units, lanes) table once per shuffler.
+}
+
+void HexagonShuffler::reset() { // Drop all insn handles and clear the flags and the error code; TUL is left untouched.
+ Packet.clear();
+ BundleFlags = 0;
+ Error = SHUFFLE_SUCCESS;
+}
+
+void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender,
+ unsigned S, bool X) { // S: core slot mask for the insn; X: solo-exception flag.
+ HexagonInstr PI(&TUL, MCII, ID, Extender, S, X); // Build the handle; only the pointers are stored, the MCInsts are not copied.
+
+ Packet.push_back(PI);
+}
+
+/// Check that the packet is legal and enforce relative insn order. Sets Error and returns false on the first violation; narrows insn slot masks and re-sorts the packet.
+bool HexagonShuffler::check() {
+ // Descriptive slot masks.
+ const unsigned slotSingleLoad = 0x1, slotSingleStore = 0x1, slotOne = 0x2,
+ slotThree = 0x8, slotFirstJump = 0x8, slotLastJump = 0x4,
+ slotFirstLoadStore = 0x2, slotLastLoadStore = 0x1;
+ // Highest slots for branches and stores used to keep their original order.
+ unsigned slotJump = slotFirstJump;
+ unsigned slotLoadStore = slotFirstLoadStore;
+ // Number of branches, and of branches that may not share the packet with another branch.
+ unsigned jumps = 0, jump1 = 0;
+ // Number of memory operations, loads, solo loads, stores, solo stores, single
+ // stores.
+ unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0;
+ // Number of HVX loads, HVX stores.
+ unsigned CVIloads = 0, CVIstores = 0;
+ // Number of duplex insns, solo insns.
+ unsigned duplex = 0, solo = 0; // NOTE(review): duplex is never incremented in this function.
+ // Number of insns restricting other insns in the packet to A and X types,
+ // and number of insns that are neither A nor X types.
+ unsigned onlyAX = 0, neitherAnorX = 0;
+ // Number of insns restricting other insns in slot #1 to A type.
+ unsigned onlyAin1 = 0;
+ // Number of insns restricting any insn in slot #1, except A2_nop.
+ unsigned onlyNo1 = 0;
+ unsigned xtypeFloat = 0; // Number of XTYPE insns for which isFloat() holds.
+ unsigned pSlot3Cnt = 0; // Number of insns for which prefersSlot3() holds.
+ iterator slot3ISJ = end(); // Last insn seen that prefers slot #3, if any.
+
+ // Collect information from the insns in the packet.
+ for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+ MCInst const *ID = ISJ->getDesc();
+
+ if (HexagonMCInstrInfo::isSolo(MCII, *ID))
+ solo += !ISJ->isSoloException();
+ else if (HexagonMCInstrInfo::isSoloAX(MCII, *ID))
+ onlyAX += !ISJ->isSoloException();
+ else if (HexagonMCInstrInfo::isSoloAin1(MCII, *ID))
+ onlyAin1 += !ISJ->isSoloException();
+ if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32 &&
+ HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeXTYPE)
+ ++neitherAnorX;
+ if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID)) {
+ ++pSlot3Cnt;
+ slot3ISJ = ISJ;
+ }
+ if (HexagonMCInstrInfo::isCofMax1(MCII, *ID))
+ ++jump1;
+
+ switch (HexagonMCInstrInfo::getType(MCII, *ID)) {
+ case HexagonII::TypeXTYPE:
+ if (HexagonMCInstrInfo::isFloat(MCII, *ID))
+ ++xtypeFloat;
+ break;
+ case HexagonII::TypeJR:
+ case HexagonII::TypeJ:
+ ++jumps;
+ break;
+ case HexagonII::TypeCVI_VM_VP_LDU:
+ ++onlyNo1; // No break: continues into the HVX load cases below.
+ case HexagonII::TypeCVI_VM_LD:
+ case HexagonII::TypeCVI_VM_TMP_LD:
+ case HexagonII::TypeCVI_VM_CUR_LD:
+ ++CVIloads; // No break: HVX loads are also counted as loads below.
+ case HexagonII::TypeLD:
+ ++loads;
+ ++memory;
+ if (ISJ->Core.getUnits() == slotSingleLoad)
+ ++load0;
+ if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn())
+ ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+ break;
+ case HexagonII::TypeCVI_VM_STU:
+ ++onlyNo1; // No break: continues into the HVX store cases below.
+ case HexagonII::TypeCVI_VM_ST:
+ case HexagonII::TypeCVI_VM_NEW_ST:
+ ++CVIstores; // No break: HVX stores are also counted as stores below.
+ case HexagonII::TypeST:
+ ++stores;
+ ++memory;
+ if (ISJ->Core.getUnits() == slotSingleStore)
+ ++store0;
+ break;
+ case HexagonII::TypeV4LDST:
+ ++loads;
+ ++stores;
+ ++store1;
+ ++memory;
+ break;
+ case HexagonII::TypeNV:
+ ++memory; // NV insns are memory-like.
+ if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch())
+ ++jumps, ++jump1;
+ break;
+ case HexagonII::TypeCR:
+ // Legacy conditional branch predicated on a register.
+ case HexagonII::TypeSYSTEM:
+ if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad())
+ ++loads; // Only loads is bumped here, not memory.
+ break;
+ }
+ }
+
+ // Check if the packet is legal.
+ if ((load0 > 1 || store0 > 1 || CVIloads > 1 || CVIstores > 1) ||
+ (duplex > 1 || (duplex && memory)) || (solo && size() > 1) ||
+ (onlyAX && neitherAnorX > 1) || (onlyAX && xtypeFloat)) {
+ Error = SHUFFLE_ERROR_INVALID;
+ return false;
+ }
+
+ if (jump1 && jumps > 1) {
+ // Error if single branch with another branch.
+ Error = SHUFFLE_ERROR_BRANCHES;
+ return false;
+ }
+
+ // Modify packet accordingly.
+ // TODO: need to reserve slots #0 and #1 for duplex insns.
+ bool bOnlySlot3 = false;
+ for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+ MCInst const *ID = ISJ->getDesc();
+
+ if (!ISJ->Core.getUnits()) {
+ // Error if insn may not be executed in any slot (mask already empty on entry).
+ Error = SHUFFLE_ERROR_UNKNOWN;
+ return false;
+ }
+
+ // Exclude from slot #1 any insn but A2_nop.
+ if (HexagonMCInstrInfo::getDesc(MCII, *ID).getOpcode() != Hexagon::A2_nop)
+ if (onlyNo1)
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
+
+ // Exclude from slot #1 any insn but A-type.
+ if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32)
+ if (onlyAin1)
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
+
+ // Branches must keep the original order.
+ if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch() ||
+ HexagonMCInstrInfo::getDesc(MCII, *ID).isCall())
+ if (jumps > 1) {
+ if (slotJump < slotLastJump) {
+ // Error if indirect branch with another branch or
+ // no more slots available for branches.
+ Error = SHUFFLE_ERROR_BRANCHES;
+ return false;
+ }
+ // Pin the branch to the highest slot available to it.
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & slotJump);
+ // Update next highest slot available to branches.
+ slotJump >>= 1;
+ }
+
+ // A single load must use slot #0.
+ if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad()) {
+ if (loads == 1 && loads == memory)
+ // Pin the load to slot #0.
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleLoad);
+ }
+
+ // A single store must use slot #0.
+ if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayStore()) {
+ if (!store0) {
+ if (stores == 1)
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleStore);
+ else if (stores > 1) {
+ if (slotLoadStore < slotLastLoadStore) {
+ // Error if no more slots available for stores.
+ Error = SHUFFLE_ERROR_STORES;
+ return false;
+ }
+ // Pin the store to the highest slot available to it.
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & slotLoadStore);
+ // Update the next highest slot available to stores.
+ slotLoadStore >>= 1;
+ }
+ }
+ if (store1 && stores > 1) {
+ // Error if a single store with another store.
+ Error = SHUFFLE_ERROR_STORES;
+ return false;
+ }
+ }
+
+ // Flag if an instruction can only be executed in slot 3.
+ if (ISJ->Core.getUnits() == slotThree)
+ bOnlySlot3 = true;
+
+ if (!ISJ->Core.getUnits()) {
+ // Error if insn may not be executed in any slot (mask emptied by the restrictions above).
+ Error = SHUFFLE_ERROR_NOSLOTS;
+ return false;
+ }
+ }
+
+ bool validateSlots = true;
+ if (bOnlySlot3 == false && pSlot3Cnt == 1 && slot3ISJ != end()) {
+ // Save off slot mask of instruction marked with A_PREFER_SLOT3
+ // and then pin it to slot #3.
+ unsigned saveUnits = slot3ISJ->Core.getUnits();
+ slot3ISJ->Core.setUnits(saveUnits & slotThree);
+
+ HexagonUnitAuction AuctionCore;
+ std::sort(begin(), end(), HexagonInstr::lessCore); // Reorders the packet, so slot3ISJ may no longer refer to the same insn.
+
+ // See if things are ok with that instruction being pinned to slot #3.
+ bool bFail = false;
+ for (iterator I = begin(); I != end() && bFail != true; ++I)
+ if (!AuctionCore.bid(I->Core.getUnits()))
+ bFail = true;
+
+ // If yes, great; if not, then restore the original slot mask.
+ if (!bFail)
+ validateSlots = false; // all good, no need to re-do auction
+ else
+ for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+ MCInst const *ID = ISJ->getDesc();
+ if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID)) // Found by predicate rather than through slot3ISJ.
+ ISJ->Core.setUnits(saveUnits);
+ }
+ }
+
+ // Check if any slot, core, is over-subscribed.
+ // Verify the core slot subscriptions.
+ if (validateSlots) {
+ HexagonUnitAuction AuctionCore;
+
+ std::sort(begin(), end(), HexagonInstr::lessCore);
+
+ for (iterator I = begin(); I != end(); ++I)
+ if (!AuctionCore.bid(I->Core.getUnits())) {
+ Error = SHUFFLE_ERROR_SLOTS;
+ return false;
+ }
+ }
+ // Verify the CVI slot subscriptions.
+ {
+ HexagonUnitAuction AuctionCVI;
+
+ std::sort(begin(), end(), HexagonInstr::lessCVI);
+
+ for (iterator I = begin(); I != end(); ++I)
+ for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid?
+ if (!AuctionCVI.bid(I->CVI.getUnits() << i)) { // One bid per lane, with the unit mask shifted by the lane index.
+ Error = SHUFFLE_ERROR_SLOTS;
+ return false;
+ }
+ }
+
+ Error = SHUFFLE_SUCCESS;
+ return true;
+}
+
+bool HexagonShuffler::shuffle() { // Reorder the packet's insn handles; returns true iff no error is recorded.
+ if (size() > HEXAGON_PACKET_SIZE) {
+ // Ignore a packet with more insns than what a packet can hold
+ // or with compound or duplex insns for now (only the size is tested here).
+ Error = SHUFFLE_ERROR_INVALID;
+ return false;
+ }
+
+ // Check and prepare packet.
+ if (size() > 1 && check()) // Packets of at most one insn skip check(); Error then keeps its previous value.
+ // Reorder the handles for each slot.
+ for (unsigned nSlot = 0, emptySlots = 0; nSlot < HEXAGON_PACKET_SIZE;
+ ++nSlot) {
+ iterator ISJ, ISK;
+ unsigned slotSkip, slotWeight;
+
+ // Prioritize the handles considering their restrictions.
+ for (ISJ = ISK = Packet.begin(), slotSkip = slotWeight = 0;
+ ISK != Packet.end(); ++ISK, ++slotSkip)
+ if (slotSkip < nSlot - emptySlots)
+ // Note which handle to begin at.
+ ++ISJ;
+ else
+ // Calculate the weight of the slot.
+ slotWeight += ISK->Core.setWeight(HEXAGON_PACKET_SIZE - nSlot - 1); // Slot numbers are visited from highest to lowest.
+
+ if (slotWeight)
+ // Sort the packet, favoring source order,
+ // beginning after the previous slot.
+ std::sort(ISJ, Packet.end()); // Uses HexagonInstr::operator<, i.e. the weights just computed.
+ else
+ // Skip unused slot.
+ ++emptySlots;
+ }
+
+ for (iterator ISJ = begin(); ISJ != end(); ++ISJ)
+ DEBUG(dbgs().write_hex(ISJ->Core.getUnits());
+ dbgs() << ':'
+ << HexagonMCInstrInfo::getDesc(MCII, *ISJ->getDesc())
+ .getOpcode();
+ dbgs() << '\n');
+ DEBUG(dbgs() << '\n');
+
+ return (!getError());
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
new file mode 100644
index 000000000000..a093f8545132
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -0,0 +1,190 @@
+//===----- HexagonShuffler.h - Instruction bundle shuffling ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONSHUFFLER_H
+#define HEXAGONSHUFFLER_H
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+// Insn resources.
+class HexagonResource {
+ // Mask of the slots or units that may execute the insn and
+ // the weight or priority that the insn requires to be assigned a slot.
+ unsigned Slots, Weight;
+
+public:
+ HexagonResource(unsigned s) : Weight(0) { setUnits(s); }; // Weight starts at 0 so getWeight() and copies never read an indeterminate value.
+
+ void setUnits(unsigned s) { // Only the low HEXAGON_PACKET_SIZE bits of s are kept.
+ Slots = s & ~(~0U << HEXAGON_PACKET_SIZE);
+ };
+ unsigned setWeight(unsigned s); // Defined out of line; computes, stores and returns the weight for slot number s.
+
+ unsigned getUnits() const { return (Slots); };
+ unsigned getWeight() const { return (Weight); };
+
+ // Check if A may use fewer slots or units than B (compares set-bit counts).
+ static bool lessUnits(const HexagonResource &A, const HexagonResource &B) {
+ return (countPopulation(A.getUnits()) < countPopulation(B.getUnits()));
+ };
+ // Check if the resources are in ascending weight order.
+ static bool lessWeight(const HexagonResource &A, const HexagonResource &B) {
+ return (A.getWeight() < B.getWeight());
+ };
+};
+
+// HVX insn resources.
+class HexagonCVIResource : public HexagonResource {
+public:
+ typedef std::pair<unsigned, unsigned> UnitsAndLanes; // (HVX unit mask, lane count).
+ typedef llvm::DenseMap<unsigned, UnitsAndLanes> TypeUnitsAndLanes; // Keyed by insn type.
+
+private:
+ // Available HVX slots.
+ enum {
+ CVI_NONE = 0,
+ CVI_XLANE = 1 << 0,
+ CVI_SHIFT = 1 << 1,
+ CVI_MPY0 = 1 << 2,
+ CVI_MPY1 = 1 << 3
+ };
+
+ TypeUnitsAndLanes *TUL; // Not owned; points at the table passed to the constructor.
+
+ // Count of adjacent slots that the insn requires to be executed.
+ unsigned Lanes;
+ // Flag whether the insn is a load or a store.
+ bool Load, Store;
+ // Flag whether the HVX resources are valid.
+ bool Valid;
+
+ void setLanes(unsigned l) { Lanes = l; };
+ void setLoad(bool f = true) { Load = f; };
+ void setStore(bool f = true) { Store = f; };
+
+public:
+ HexagonCVIResource(TypeUnitsAndLanes *TUL, MCInstrInfo const &MCII,
+ unsigned s, MCInst const *id); // Looks up id's type in *TUL to fill in units, lanes and flags.
+ static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU); // Populates *TUL with the per-type entries.
+
+ bool isValid() const { return (Valid); };
+ unsigned getLanes() const { return (Lanes); };
+ bool mayLoad() const { return (Load); };
+ bool mayStore() const { return (Store); };
+};
+
+// Handle to an insn used by the shuffling algorithm.
+class HexagonInstr {
+ friend class HexagonShuffler;
+
+ MCInst const *ID; // The insn this handle refers to (not owned).
+ MCInst const *Extender; // Stored as given and returned by getExtender(); not interpreted here.
+ HexagonResource Core; // Core slot mask and weight.
+ HexagonCVIResource CVI; // HVX units and lanes.
+ bool SoloException;
+
+public:
+ HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T,
+ MCInstrInfo const &MCII, MCInst const *id,
+ MCInst const *Extender, unsigned s, bool x = false)
+ : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id),
+ SoloException(x) {}; // s is the core slot mask; x is the solo-exception flag.
+
+ MCInst const *getDesc() const { return (ID); }; // Returns the MCInst itself, despite the name.
+
+ MCInst const *getExtender() const { return Extender; }
+
+ unsigned isSoloException() const { return (SoloException); };
+
+ // Check if the handles are in ascending order for shuffling purposes.
+ bool operator<(const HexagonInstr &B) const {
+ return (HexagonResource::lessWeight(B.Core, Core)); // Arguments are swapped: the heavier handle orders first.
+ };
+ // Check if the handles are in ascending order by core slots.
+ static bool lessCore(const HexagonInstr &A, const HexagonInstr &B) {
+ return (HexagonResource::lessUnits(A.Core, B.Core));
+ };
+ // Check if the handles are in ascending order by HVX slots.
+ static bool lessCVI(const HexagonInstr &A, const HexagonInstr &B) {
+ return (HexagonResource::lessUnits(A.CVI, B.CVI));
+ };
+};
+
+// Bundle shuffler.
+class HexagonShuffler {
+ typedef SmallVector<HexagonInstr, HEXAGON_PRESHUFFLE_PACKET_SIZE>
+ HexagonPacket;
+
+ // Insn handles in a bundle.
+ HexagonPacket Packet;
+
+ // Shuffling error code.
+ unsigned Error;
+
+ HexagonCVIResource::TypeUnitsAndLanes TUL; // HVX type -> (units, lanes) table shared by all handles in Packet.
+
+protected:
+ int64_t BundleFlags;
+ MCInstrInfo const &MCII;
+ MCSubtargetInfo const &STI;
+
+public:
+ typedef HexagonPacket::iterator iterator;
+
+ enum {
+ SHUFFLE_SUCCESS = 0, ///< Successful operation.
+ SHUFFLE_ERROR_INVALID, ///< Invalid bundle.
+ SHUFFLE_ERROR_STORES, ///< No free slots for store insns.
+ SHUFFLE_ERROR_LOADS, ///< No free slots for load insns.
+ SHUFFLE_ERROR_BRANCHES, ///< No free slots for branch insns.
+ SHUFFLE_ERROR_NOSLOTS, ///< No free slots for other insns.
+ SHUFFLE_ERROR_SLOTS, ///< Over-subscribed slots.
+ SHUFFLE_ERROR_ERRATA2, ///< Errata violation (v60).
+ SHUFFLE_ERROR_STORE_LOAD_CONFLICT, ///< store/load conflict
+ SHUFFLE_ERROR_UNKNOWN ///< Unknown error.
+ };
+
+ explicit HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI);
+
+ // Reset to initial state.
+ void reset();
+ // Check if the bundle may be validly shuffled.
+ bool check();
+ // Reorder the insn handles in the bundle.
+ bool shuffle();
+
+ unsigned size() const { return (Packet.size()); };
+
+ iterator begin() { return (Packet.begin()); };
+ iterator end() { return (Packet.end()); };
+
+ // Add insn handle to the bundle.
+ void append(MCInst const *ID, MCInst const *Extender, unsigned S,
+ bool X = false);
+
+ // Set or return the error code for the last check or shuffling of the bundle.
+ void setError(unsigned Err) { Error = Err; };
+ unsigned getError() const { return (Error); };
+};
+}
+
+#endif // HEXAGONSHUFFLER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
new file mode 100644
index 000000000000..392871628d98
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
@@ -0,0 +1,240 @@
+//===--- RDFCopy.cpp ------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RDF-based copy propagation.
+
+#include "RDFCopy.h"
+#include "RDFGraph.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+using namespace rdf;
+
+#ifndef NDEBUG
+static cl::opt<unsigned> CpLimit("rdf-cp-limit", cl::init(0), cl::Hidden);
+static unsigned CpCount = 0;
+#endif
+
+bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) { // Returns true and adds dst -> src to EM if MI is treated as a copy.
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case TargetOpcode::COPY: {
+ const MachineOperand &Dst = MI->getOperand(0);
+ const MachineOperand &Src = MI->getOperand(1);
+ RegisterRef DstR = DFG.makeRegRef(Dst.getReg(), Dst.getSubReg());
+ RegisterRef SrcR = DFG.makeRegRef(Src.getReg(), Src.getSubReg());
+ assert(TargetRegisterInfo::isPhysicalRegister(DstR.Reg));
+ assert(TargetRegisterInfo::isPhysicalRegister(SrcR.Reg));
+ const TargetRegisterInfo &TRI = DFG.getTRI();
+ if (TRI.getMinimalPhysRegClass(DstR.Reg) !=
+ TRI.getMinimalPhysRegClass(SrcR.Reg))
+ return false; // Copies between different minimal register classes are not recorded.
+ EM.insert(std::make_pair(DstR, SrcR));
+ return true;
+ }
+ case TargetOpcode::REG_SEQUENCE:
+ llvm_unreachable("Unexpected REG_SEQUENCE");
+ }
+ return false; // Any other opcode is not a copy.
+}
+
+
+void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM) { // Register statement SA as a copy with the equalities in EM.
+ CopyMap.insert(std::make_pair(SA.Id, EM));
+ Copies.push_back(SA.Id);
+
+ for (auto I : EM) { // I.first is the destination register, I.second the source.
+ auto FS = DefM.find(I.second.Reg);
+ if (FS == DefM.end() || FS->second.empty())
+ continue; // Undefined source
+ RDefMap[I.second][SA.Id] = FS->second.top()->Id; // Def on top of the source's stack in DefM at SA.
+ // Insert DstR into the map.
+ RDefMap[I.first];
+ }
+}
+
+
+void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) { // For each register tracked in RDefMap that IA references, record its current top-of-stack def at IA.
+ RegisterSet RRs; // Registers referenced by the members of IA.
+ for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
+ RRs.insert(RA.Addr->getRegRef(DFG));
+ bool Common = false; // Whether IA references any register tracked in RDefMap.
+ for (auto &R : RDefMap) {
+ if (!RRs.count(R.first))
+ continue;
+ Common = true;
+ break;
+ }
+ if (!Common)
+ return;
+
+ for (auto &R : RDefMap) {
+ if (!RRs.count(R.first))
+ continue;
+ auto F = DefM.find(R.first.Reg);
+ if (F == DefM.end() || F->second.empty())
+ continue; // No def on the stack for this register: leave the entry unset.
+ R.second[IA.Id] = F->second.top()->Id;
+ }
+}
+
+
+bool CopyPropagation::scanBlock(MachineBasicBlock *B) { // Collect copies and reaching defs in B, then recurse into B's dominator-tree children.
+ bool Changed = false; // NOTE(review): only ever updated from the recursive calls below, so it never becomes true here.
+ auto BA = DFG.getFunc().Addr->findBlock(B, DFG);
+ DFG.markBlock(BA.Id, DefM);
+
+ for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
+ if (DFG.IsCode<NodeAttrs::Stmt>(IA)) {
+ NodeAddr<StmtNode*> SA = IA;
+ EqualityMap EM;
+ if (interpretAsCopy(SA.Addr->getCode(), EM))
+ recordCopy(SA, EM);
+ }
+
+ updateMap(IA); // Runs before IA's own defs are pushed onto DefM.
+ DFG.pushDefs(IA, DefM);
+ }
+
+ MachineDomTreeNode *N = MDT.getNode(B);
+ for (auto I : *N)
+ Changed |= scanBlock(I->getBlock());
+
+ DFG.releaseBlock(BA.Id, DefM); // Paired with the markBlock() call above.
+ return Changed;
+}
+
+
+bool CopyPropagation::run() { // Collect copies over the whole function, then rewrite uses of copy destinations to the copy sources; returns true if any operand changed.
+ scanBlock(&DFG.getMF().front()); // The return value is ignored.
+
+ if (trace()) {
+ dbgs() << "Copies:\n";
+ for (auto I : Copies) {
+ dbgs() << "Instr: " << *DFG.addr<StmtNode*>(I).Addr->getCode();
+ dbgs() << " eq: {";
+ for (auto J : CopyMap[I])
+ dbgs() << ' ' << Print<RegisterRef>(J.first, DFG) << '='
+ << Print<RegisterRef>(J.second, DFG);
+ dbgs() << " }\n";
+ }
+ dbgs() << "\nRDef map:\n";
+ for (auto R : RDefMap) {
+ dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {";
+ for (auto &M : R.second)
+ dbgs() << ' ' << Print<NodeId>(M.first, DFG) << ':'
+ << Print<NodeId>(M.second, DFG);
+ dbgs() << " }\n";
+ }
+ }
+
+ bool Changed = false;
+#ifndef NDEBUG
+ bool HasLimit = CpLimit.getNumOccurrences() > 0; // The limit applies only when -rdf-cp-limit was given explicitly.
+#endif
+
+ auto MinPhysReg = [this] (RegisterRef RR) -> unsigned { // Map RR to a physical register: RR.Reg itself, or the sub-register whose lane mask equals RR.Mask.
+ const TargetRegisterInfo &TRI = DFG.getTRI();
+ const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
+ if ((RC.LaneMask & RR.Mask) == RC.LaneMask)
+ return RR.Reg;
+ for (MCSubRegIndexIterator S(RR.Reg, &TRI); S.isValid(); ++S)
+ if (RR.Mask == TRI.getSubRegIndexLaneMask(S.getSubRegIndex()))
+ return S.getSubReg();
+ llvm_unreachable("Should have found a register");
+ return 0;
+ };
+
+ for (auto C : Copies) {
+#ifndef NDEBUG
+ if (HasLimit && CpCount >= CpLimit)
+ break;
+#endif
+ auto SA = DFG.addr<InstrNode*>(C);
+ auto FS = CopyMap.find(SA.Id);
+ if (FS == CopyMap.end())
+ continue;
+
+ EqualityMap &EM = FS->second;
+ for (NodeAddr<DefNode*> DA : SA.Addr->members_if(DFG.IsDef, DFG)) {
+ RegisterRef DR = DA.Addr->getRegRef(DFG);
+ auto FR = EM.find(DR);
+ if (FR == EM.end())
+ continue;
+ RegisterRef SR = FR->second;
+ if (DR == SR)
+ continue;
+
+ auto &RDefSR = RDefMap[SR];
+ NodeId RDefSR_SA = RDefSR[SA.Id]; // Reaching def of SR at the copy; operator[] yields 0 when none was recorded.
+
+ for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) {
+ auto UA = DFG.addr<UseNode*>(N);
+ NextN = UA.Addr->getSibling(); // Fetched first, because the use may be unlinked below.
+ uint16_t F = UA.Addr->getFlags();
+ if ((F & NodeAttrs::PhiRef) || (F & NodeAttrs::Fixed))
+ continue;
+ if (UA.Addr->getRegRef(DFG) != DR)
+ continue;
+
+ NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG);
+ assert(DFG.IsCode<NodeAttrs::Stmt>(IA));
+ if (RDefSR[IA.Id] != RDefSR_SA)
+ continue; // SR has a different reaching def at the use than at the copy.
+
+ MachineOperand &Op = UA.Addr->getOp();
+ if (Op.isTied())
+ continue;
+ if (trace()) {
+ dbgs() << "Can replace " << Print<RegisterRef>(DR, DFG)
+ << " with " << Print<RegisterRef>(SR, DFG) << " in "
+ << *NodeAddr<StmtNode*>(IA).Addr->getCode();
+ }
+
+ unsigned NewReg = MinPhysReg(SR);
+ Op.setReg(NewReg);
+ Op.setSubReg(0);
+ DFG.unlinkUse(UA, false);
+ if (RDefSR_SA != 0) {
+ UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(RDefSR_SA));
+ } else {
+ UA.Addr->setReachingDef(0);
+ UA.Addr->setSibling(0);
+ }
+
+ Changed = true;
+ #ifndef NDEBUG
+ if (HasLimit && CpCount >= CpLimit)
+ break; // Leaves only the reached-use loop.
+ CpCount++;
+ #endif
+
+ auto FC = CopyMap.find(IA.Id);
+ if (FC != CopyMap.end()) {
+ // Update the EM map in the copy's entry.
+ auto &M = FC->second;
+ for (auto &J : M) {
+ if (J.second != DR)
+ continue;
+ J.second = SR;
+ break;
+ }
+ }
+ } // for (N in reached-uses)
+ } // for (DA in defs)
+ } // for (C in Copies)
+
+ return Changed;
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.h b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h
new file mode 100644
index 000000000000..517f17cc9c64
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h
@@ -0,0 +1,55 @@
+//===--- RDFCopy.h --------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef RDF_COPY_H
+#define RDF_COPY_H
+
+#include "RDFGraph.h"
+#include <map>
+#include <vector>
+
+namespace llvm {
+ class MachineBasicBlock;
+ class MachineDominatorTree;
+ class MachineInstr;
+
+namespace rdf {
+ struct CopyPropagation {
+ CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
+ Trace(false) {} // Keeps references to the graph and its dominator tree; tracing starts off.
+ virtual ~CopyPropagation() {}
+
+ bool run(); // Defined out of line; the result reports whether anything changed.
+ void trace(bool On) { Trace = On; }
+ bool trace() const { return Trace; }
+ DataFlowGraph &getDFG() { return DFG; }
+
+ typedef std::map<RegisterRef, RegisterRef> EqualityMap; // dst reg -> src reg.
+ virtual bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM); // Virtual, so a subclass can override which instructions count as copies.
+
+ private:
+ const MachineDominatorTree &MDT;
+ DataFlowGraph &DFG;
+ DataFlowGraph::DefStackMap DefM;
+ bool Trace;
+
+ // map: register -> (map: stmt -> reaching def)
+ std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap;
+ // map: statement -> (map: dst reg -> src reg)
+ std::map<NodeId, EqualityMap> CopyMap;
+ std::vector<NodeId> Copies; // Node ids of recorded copies, in the order they were recorded.
+
+ void recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM);
+ void updateMap(NodeAddr<InstrNode*> IA);
+ bool scanBlock(MachineBasicBlock *B);
+ };
+} // namespace rdf
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
new file mode 100644
index 000000000000..63177d51cada
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -0,0 +1,232 @@
+//===--- RDFDeadCode.cpp --------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RDF-based generic dead code elimination.
+
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "RDFDeadCode.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#include <queue>
+
+using namespace llvm;
+using namespace rdf;
+
+// A FIFO work queue that never holds the same value twice at the same time.
+// A DenseSet mirrors the contents of the std::queue so that duplicate pushes
+// can be detected. This drastically improves execution time in "collect"
+// over using SetVector as a work queue, and popping the first element from it.
+template<typename T> struct DeadCodeElimination::SetQueue {
+  SetQueue() : Members(), Pending() {}
+
+  // True when no values are waiting in the queue.
+  bool empty() const {
+    return Pending.empty();
+  }
+  // Remove and return the oldest value. Once popped, the same value can be
+  // pushed again.
+  T pop_front() {
+    T Front = Pending.front();
+    Pending.pop();
+    Members.erase(Front);
+    return Front;
+  }
+  // Enqueue V unless it is already waiting in the queue.
+  void push_back(T V) {
+    if (!Members.count(V)) {
+      Pending.push(V);
+      Members.insert(V);
+    }
+  }
+
+private:
+  DenseSet<T> Members;    // Values currently present in Pending.
+  std::queue<T> Pending;  // Values in the order they were pushed.
+};
+
+
+// Decide whether MI has observable side-effects, i.e. whether it must be
+// considered "live" no matter if its results are used. Answering "true"
+// too often (even for every instruction) is safe; answering "false" for an
+// instruction that must not be removed is not.
+bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const {
+  // Stores and control-flow instructions are always live.
+  const bool StoreOrControl = MI->mayStore() || MI->isBranch() ||
+                              MI->isCall() || MI->isReturn();
+  if (StoreOrControl)
+    return true;
+  // So is anything with ordered memory references or unmodeled effects.
+  const bool HasSideEffects = MI->hasOrderedMemoryRef() ||
+                              MI->hasUnmodeledSideEffects();
+  if (HasSideEffects)
+    return true;
+  // PHIs are not inspected any further.
+  if (MI->isPHI())
+    return false;
+  // Any register operand naming a reserved register keeps MI live.
+  for (auto &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MRI.isReserved(MO.getReg()))
+      return true;
+  }
+  return false;
+}
+
+// Seed the work queue from instruction node IA: when IA is a statement whose
+// machine instruction is live according to isLiveInstr, queue every ref node
+// of IA that is not in LiveNodes yet. Other nodes are ignored.
+void DeadCodeElimination::scanInstr(NodeAddr<InstrNode*> IA,
+      SetQueue<NodeId> &WorkQ) {
+  const bool IsLiveStmt = DFG.IsCode<NodeAttrs::Stmt>(IA) &&
+      isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode());
+  if (!IsLiveStmt)
+    return;
+  for (NodeAddr<RefNode*> Ref : IA.Addr->members(DFG)) {
+    if (LiveNodes.count(Ref.Id))
+      continue;
+    WorkQ.push_back(Ref.Id);
+  }
+}
+
+// Handle a def DA taken from the work queue: queue all uses of the owning
+// instruction that are not in LiveNodes yet, and add every ref returned by
+// getRelatedRefs for DA to LiveNodes.
+void DeadCodeElimination::processDef(NodeAddr<DefNode*> DA,
+      SetQueue<NodeId> &WorkQ) {
+  NodeAddr<InstrNode*> Owner = DA.Addr->getOwner(DFG);
+  for (NodeAddr<UseNode*> Use : Owner.Addr->members_if(DFG.IsUse, DFG)) {
+    if (LiveNodes.count(Use.Id))
+      continue;
+    WorkQ.push_back(Use.Id);
+  }
+  for (NodeAddr<DefNode*> Related : DFG.getRelatedRefs(Owner, DA))
+    LiveNodes.insert(Related.Id);
+}
+
+// Handle a use UA taken from the work queue: queue each of its reaching
+// defs (as reported by LV.getAllReachingDefs) that is not in LiveNodes yet.
+void DeadCodeElimination::processUse(NodeAddr<UseNode*> UA,
+      SetQueue<NodeId> &WorkQ) {
+  for (NodeAddr<DefNode*> Def : LV.getAllReachingDefs(UA)) {
+    if (LiveNodes.count(Def.Id))
+      continue;
+    WorkQ.push_back(Def.Id);
+  }
+}
+
+// Traverse the DFG and collect the set of dead RefNodes and the set of
+// dead instructions. Results of any earlier call are discarded first, so
+// the sets always describe the DFG as it is at the time of this call.
+// Return "true" if at least one dead RefNode was found, "false" otherwise.
+bool DeadCodeElimination::collect() {
+  // This function works by first finding all live nodes. The dead nodes
+  // are then the complement of the set of live nodes.
+  //
+  // Assume that all nodes are dead. Identify instructions which must be
+  // considered live, i.e. instructions with observable side-effects, such
+  // as calls and stores. All arguments of such instructions are considered
+  // live. For each live def, all operands used in the corresponding
+  // instruction are considered live. For each live use, all its reaching
+  // defs are considered live.
+  LiveNodes.clear();
+  // Reset the outputs as well: without this a repeated call would keep
+  // reporting node ids left over from a previous run.
+  DeadNodes.clear();
+  DeadInstrs.clear();
+
+  // Seed the work queue with the refs of all live statements.
+  SetQueue<NodeId> WorkQ;
+  for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG))
+    for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG))
+      scanInstr(IA, WorkQ);
+
+  // Propagate liveness until no new nodes are discovered.
+  while (!WorkQ.empty()) {
+    NodeId N = WorkQ.pop_front();
+    LiveNodes.insert(N);
+    auto RA = DFG.addr<RefNode*>(N);
+    if (DFG.IsDef(RA))
+      processDef(RA, WorkQ);
+    else
+      processUse(RA, WorkQ);
+  }
+
+  if (trace()) {
+    dbgs() << "Live nodes:\n";
+    for (NodeId N : LiveNodes) {
+      auto RA = DFG.addr<RefNode*>(N);
+      dbgs() << PrintNode<RefNode*>(RA, DFG) << "\n";
+    }
+  }
+
+  // An instruction is dead when none of its defs is live.
+  auto IsDead = [this] (NodeAddr<InstrNode*> IA) -> bool {
+    for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG))
+      if (LiveNodes.count(DA.Id))
+        return false;
+    return true;
+  };
+
+  for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
+    for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
+      // Every ref that was not reached above is dead.
+      for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
+        if (!LiveNodes.count(RA.Id))
+          DeadNodes.insert(RA.Id);
+      // Statements that are live on their own are never dead instructions,
+      // even if they contain dead refs.
+      if (DFG.IsCode<NodeAttrs::Stmt>(IA))
+        if (isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode()))
+          continue;
+      if (IsDead(IA)) {
+        DeadInstrs.insert(IA.Id);
+        if (trace())
+          dbgs() << "Dead instr: " << PrintNode<InstrNode*>(IA, DFG) << "\n";
+      }
+    }
+  }
+
+  return !DeadNodes.empty();
+}
+
+// Erase the nodes given in the Nodes set from DFG. In addition to removing
+// them from the DFG, if a node corresponds to a statement, the corresponding
+// machine instruction is erased from the function.
+// Nodes may hold ids of ref nodes and of instruction (stmt or phi) nodes.
+// Returns "false" if Nodes is empty, "true" otherwise.
+bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) {
+ if (Nodes.empty())
+ return false;
+
+ // Prepare the actual set of ref nodes to remove: ref nodes from Nodes
+ // are included directly, for each InstrNode in Nodes, include the set
+ // of all RefNodes from it.
+ // DRNs collects the ref nodes to unlink, DINs the instruction nodes to
+ // remove afterwards.
+ NodeList DRNs, DINs;
+ for (auto I : Nodes) {
+ auto BA = DFG.addr<NodeBase*>(I);
+ uint16_t Type = BA.Addr->getType();
+ if (Type == NodeAttrs::Ref) {
+ DRNs.push_back(DFG.addr<RefNode*>(I));
+ continue;
+ }
+
+ // If it's a code node, add all ref nodes from it.
+ uint16_t Kind = BA.Addr->getKind();
+ if (Kind == NodeAttrs::Stmt || Kind == NodeAttrs::Phi) {
+ for (auto N : NodeAddr<CodeNode*>(BA).Addr->members(DFG))
+ DRNs.push_back(N);
+ DINs.push_back(DFG.addr<InstrNode*>(I));
+ } else {
+ // Only statement and phi nodes are expected here.
+ llvm_unreachable("Unexpected code node");
+ return false;
+ }
+ }
+
+ // Sort the list so that use nodes are removed first. This makes the
+ // "unlink" functions a bit faster.
+ // Nodes of the same kind are ordered by ascending node id.
+ auto UsesFirst = [] (NodeAddr<RefNode*> A, NodeAddr<RefNode*> B) -> bool {
+ uint16_t KindA = A.Addr->getKind(), KindB = B.Addr->getKind();
+ if (KindA == NodeAttrs::Use && KindB == NodeAttrs::Def)
+ return true;
+ if (KindA == NodeAttrs::Def && KindB == NodeAttrs::Use)
+ return false;
+ return A.Id < B.Id;
+ };
+ std::sort(DRNs.begin(), DRNs.end(), UsesFirst);
+
+ if (trace())
+ dbgs() << "Removing dead ref nodes:\n";
+ // Detach every collected ref from the data-flow links.
+ for (NodeAddr<RefNode*> RA : DRNs) {
+ if (trace())
+ dbgs() << " " << PrintNode<RefNode*>(RA, DFG) << '\n';
+ if (DFG.IsUse(RA))
+ DFG.unlinkUse(RA, true);
+ else if (DFG.IsDef(RA))
+ DFG.unlinkDef(RA, true);
+ }
+
+ // Now, remove all dead instruction nodes.
+ for (NodeAddr<InstrNode*> IA : DINs) {
+ NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
+ BA.Addr->removeMember(IA, DFG);
+ // Phi nodes have no machine instruction to erase.
+ if (!DFG.IsCode<NodeAttrs::Stmt>(IA))
+ continue;
+
+ MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
+ if (trace())
+ dbgs() << "erasing: " << *MI;
+ MI->eraseFromParent();
+ }
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h
new file mode 100644
index 000000000000..8977e730b855
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h
@@ -0,0 +1,67 @@
+//===--- RDFDeadCode.h ----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RDF-based generic dead code elimination.
+//
+// The main entry points of this class are the functions "collect" and "erase".
+// This allows custom processing of the function being optimized by a
+// particular consumer. The simplest way to use this class would be to
+// instantiate an object, and then simply call "collect" and "erase",
+// passing the result of "getDeadInstrs()" to it.
+// A more complex scenario would be to call "collect" first, then visit
+// all post-increment instructions to see if the address update is dead
+// or not, and if it is, convert the instruction to a non-updating form.
+// After that "erase" can be called with the set of nodes including both,
+// dead defs from the updating instructions and the nodes corresponding
+// to the dead instructions.
+
+#ifndef RDF_DEADCODE_H
+#define RDF_DEADCODE_H
+
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/ADT/SetVector.h"
+
+namespace llvm {
+ class MachineRegisterInfo;
+
+namespace rdf {
+ // Dead code elimination over a DataFlowGraph. Holds references to the
+ // graph and to the MachineRegisterInfo, and owns a Liveness object built
+ // from both.
+ struct DeadCodeElimination {
+ DeadCodeElimination(DataFlowGraph &dfg, MachineRegisterInfo &mri)
+ : Trace(false), DFG(dfg), MRI(mri), LV(mri, dfg) {}
+
+ // See the file header for how "collect" and "erase" are meant to be used.
+ bool collect();
+ bool erase(const SetVector<NodeId> &Nodes);
+ // Setter and getter for the Trace flag (off by default).
+ void trace(bool On) { Trace = On; }
+ bool trace() const { return Trace; }
+
+ // Both getters return a copy of the corresponding set.
+ SetVector<NodeId> getDeadNodes() { return DeadNodes; }
+ SetVector<NodeId> getDeadInstrs() { return DeadInstrs; }
+ DataFlowGraph &getDFG() { return DFG; }
+
+ private:
+ bool Trace;
+ SetVector<NodeId> LiveNodes;
+ SetVector<NodeId> DeadNodes;
+ SetVector<NodeId> DeadInstrs;
+ DataFlowGraph &DFG;
+ MachineRegisterInfo &MRI;
+ Liveness LV;
+
+ // Work queue type; only declared here, defined in the implementation.
+ template<typename T> struct SetQueue;
+
+ bool isLiveInstr(const MachineInstr *MI) const;
+ void scanInstr(NodeAddr<InstrNode*> IA, SetQueue<NodeId> &WorkQ);
+ void processDef(NodeAddr<DefNode*> DA, SetQueue<NodeId> &WorkQ);
+ void processUse(NodeAddr<UseNode*> UA, SetQueue<NodeId> &WorkQ);
+ };
+} // namespace rdf
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
new file mode 100644
index 000000000000..33c3f03790f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
@@ -0,0 +1,1950 @@
+//===--- RDFGraph.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Target-independent, SSA-based data flow graph for register data flow (RDF).
+//
+#include "RDFGraph.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+using namespace rdf;
+
+// Printing functions. Have them here first, so that the rest of the code
+// can use them.
+namespace llvm {
+namespace rdf {
+
+// Print ':' followed by the lane mask, but only when the mask is not the
+// full mask; a full mask prints nothing.
+raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P) {
+  if (P.Mask.all())
+    return OS;
+  OS << ':' << PrintLaneMask(P.Mask);
+  return OS;
+}
+
+// Print a register reference: the register's name when its number lies in
+// (0, getNumRegs()), otherwise '#' and the raw number, followed by the
+// optional lane mask.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) {
+  auto &TRI = P.G.getTRI();
+  const bool HasName = P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs();
+  if (!HasName)
+    OS << '#' << P.Obj.Reg;
+  else
+    OS << TRI.getName(P.Obj.Reg);
+  OS << PrintLaneMaskOpt(P.Obj.Mask);
+  return OS;
+}
+
+// Print a node id as a short tag describing the node's type and kind,
+// followed by the numeric id. Ref nodes get one marker character per flag
+// in front of the tag, and nodes with the Shadow flag get a trailing '"'.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) {
+  auto Node = P.G.addr<NodeBase*>(P.Obj);
+  const uint16_t Attrs = Node.Addr->getAttrs();
+  const uint16_t Type = NodeAttrs::type(Attrs);
+  const uint16_t Kind = NodeAttrs::kind(Attrs);
+  const uint16_t Flags = NodeAttrs::flags(Attrs);
+
+  if (Type == NodeAttrs::Code) {
+    switch (Kind) {
+      case NodeAttrs::Func:   OS << 'f'; break;
+      case NodeAttrs::Block:  OS << 'b'; break;
+      case NodeAttrs::Stmt:   OS << 's'; break;
+      case NodeAttrs::Phi:    OS << 'p'; break;
+      default:                OS << "c?"; break;
+    }
+  } else if (Type == NodeAttrs::Ref) {
+    // Flag markers come first, in this fixed order.
+    if (Flags & NodeAttrs::Undef)
+      OS << '/';
+    if (Flags & NodeAttrs::Dead)
+      OS << '\\';
+    if (Flags & NodeAttrs::Preserving)
+      OS << '+';
+    if (Flags & NodeAttrs::Clobbering)
+      OS << '~';
+    switch (Kind) {
+      case NodeAttrs::Use:    OS << 'u'; break;
+      case NodeAttrs::Def:    OS << 'd'; break;
+      case NodeAttrs::Block:  OS << 'b'; break;
+      default:                OS << "r?"; break;
+    }
+  } else {
+    OS << '?';
+  }
+
+  OS << P.Obj;
+  if (Flags & NodeAttrs::Shadow)
+    OS << '"';
+  return OS;
+}
+
+namespace {
+  // Print the common prefix of a ref node: its id tag, the register
+  // reference in angle brackets, and '!' when the Fixed flag is set.
+  void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA,
+        const DataFlowGraph &G) {
+    const RegisterRef RR = RA.Addr->getRegRef(G);
+    OS << Print<NodeId>(RA.Id, G) << '<' << Print<RegisterRef>(RR, G) << '>';
+    const bool IsFixed = RA.Addr->getFlags() & NodeAttrs::Fixed;
+    if (IsFixed)
+      OS << '!';
+  }
+}
+
+// Print a def node as "header(reaching-def,reached-def,reached-use):sibling".
+// A link whose id is zero is printed as an empty field.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) {
+  // Emit the tag for id N, or nothing when N is zero.
+  auto PrintLink = [&OS, &P] (NodeId N) -> void {
+    if (N != 0)
+      OS << Print<NodeId>(N, P.G);
+  };
+  auto *Def = P.Obj.Addr;
+
+  printRefHeader(OS, P.Obj, P.G);
+  OS << '(';
+  PrintLink(Def->getReachingDef());
+  OS << ',';
+  PrintLink(Def->getReachedDef());
+  OS << ',';
+  PrintLink(Def->getReachedUse());
+  OS << "):";
+  PrintLink(Def->getSibling());
+  return OS;
+}
+
+// Print a use node as "header(reaching-def):sibling". A link whose id is
+// zero is printed as an empty field.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) {
+  // Emit the tag for id N, or nothing when N is zero.
+  auto PrintLink = [&OS, &P] (NodeId N) -> void {
+    if (N != 0)
+      OS << Print<NodeId>(N, P.G);
+  };
+  auto *Use = P.Obj.Addr;
+
+  printRefHeader(OS, P.Obj, P.G);
+  OS << '(';
+  PrintLink(Use->getReachingDef());
+  OS << "):";
+  PrintLink(Use->getSibling());
+  return OS;
+}
+
+// Print a phi use node as "header(reaching-def,predecessor):sibling". A
+// link whose id is zero is printed as an empty field.
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<PhiUseNode*>> &P) {
+  // Emit the tag for id N, or nothing when N is zero.
+  auto PrintLink = [&OS, &P] (NodeId N) -> void {
+    if (N != 0)
+      OS << Print<NodeId>(N, P.G);
+  };
+  auto *PhiUse = P.Obj.Addr;
+
+  printRefHeader(OS, P.Obj, P.G);
+  OS << '(';
+  PrintLink(PhiUse->getReachingDef());
+  OS << ',';
+  PrintLink(PhiUse->getPredecessor());
+  OS << "):";
+  PrintLink(PhiUse->getSibling());
+  return OS;
+}
+
+// Print a ref node by dispatching on its kind: defs use the def printer,
+// uses use either the phi-use or the plain use printer depending on the
+// PhiRef flag. Any other kind prints nothing.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) {
+  const uint16_t Kind = P.Obj.Addr->getKind();
+  if (Kind == NodeAttrs::Def) {
+    OS << PrintNode<DefNode*>(P.Obj, P.G);
+  } else if (Kind == NodeAttrs::Use) {
+    const bool IsPhiUse = P.Obj.Addr->getFlags() & NodeAttrs::PhiRef;
+    if (IsPhiUse)
+      OS << PrintNode<PhiUseNode*>(P.Obj, P.G);
+    else
+      OS << PrintNode<UseNode*>(P.Obj, P.G);
+  }
+  return OS;
+}
+
+// Print the id tags of all nodes in the list, separated by single spaces.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) {
+  bool NeedSep = false;
+  for (auto NA : P.Obj) {
+    if (NeedSep)
+      OS << ' ';
+    OS << Print<NodeId>(NA.Id, P.G);
+    NeedSep = true;
+  }
+  return OS;
+}
+
+// Print the id tags of all node ids in the set, separated by single spaces.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeSet> &P) {
+  bool NeedSep = false;
+  for (auto Id : P.Obj) {
+    if (NeedSep)
+      OS << ' ';
+    OS << Print<NodeId>(Id, P.G);
+    NeedSep = true;
+  }
+  return OS;
+}
+
+namespace {
+  // Wrapper that requests verbose printing of a node list: each element is
+  // printed through PrintNode<T> rather than as a bare id tag.
+  template <typename T>
+  struct PrintListV {
+    PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {}
+    typedef T Type;
+    const NodeList &List;
+    const DataFlowGraph &G;
+  };
+
+  // Print all elements of the wrapped list, separated by ", ".
+  template <typename T>
+  raw_ostream &operator<< (raw_ostream &OS, const PrintListV<T> &P) {
+    bool NeedSep = false;
+    for (NodeAddr<T> Elem : P.List) {
+      if (NeedSep)
+        OS << ", ";
+      OS << PrintNode<T>(Elem, P.G);
+      NeedSep = true;
+    }
+    return OS;
+  }
+}
+
+// Print a phi node as "<id>: phi [<refs>]", with all member refs printed
+// verbosely.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) {
+  OS << Print<NodeId>(P.Obj.Id, P.G) << ": phi [";
+  OS << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G);
+  OS << ']';
+  return OS;
+}
+
+// Print a statement node as "<id>: <opcode name> [<target>] [<refs>]". The
+// target is only printed for calls and branches.
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<StmtNode*>> &P) {
+  const MachineInstr &MI = *P.Obj.Addr->getCode();
+  OS << Print<NodeId>(P.Obj.Id, P.G) << ": "
+     << P.G.getTII().getName(MI.getOpcode());
+
+  // Print the target for calls and branches (for readability): the first
+  // operand that is a basic block, a global, or an external symbol.
+  if (MI.isCall() || MI.isBranch()) {
+    auto IsTarget = [] (const MachineOperand &Op) -> bool {
+      return Op.isMBB() || Op.isGlobal() || Op.isSymbol();
+    };
+    MachineInstr::const_mop_iterator Tgt = find_if(MI.operands(), IsTarget);
+    if (Tgt != MI.operands_end()) {
+      OS << ' ';
+      if (Tgt->isMBB())
+        OS << "BB#" << Tgt->getMBB()->getNumber();
+      else if (Tgt->isGlobal())
+        OS << Tgt->getGlobal()->getName();
+      else if (Tgt->isSymbol())
+        OS << Tgt->getSymbolName();
+    }
+  }
+
+  OS << " [" << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']';
+  return OS;
+}
+
+// Print an instruction node by dispatching on its kind to the phi or the
+// statement printer. Any other kind is printed as "instr? <id>".
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<InstrNode*>> &P) {
+  const uint16_t Kind = P.Obj.Addr->getKind();
+  if (Kind == NodeAttrs::Phi)
+    OS << PrintNode<PhiNode*>(P.Obj, P.G);
+  else if (Kind == NodeAttrs::Stmt)
+    OS << PrintNode<StmtNode*>(P.Obj, P.G);
+  else
+    OS << "instr? " << Print<NodeId>(P.Obj.Id, P.G);
+  return OS;
+}
+
+// Print a block node: a header line with the block number and the numbers
+// of its predecessors and successors, followed by one line per member
+// instruction node.
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<BlockNode*>> &P) {
+  MachineBasicBlock *BB = P.Obj.Addr->getCode();
+
+  // Print the given block numbers as "BB#n", separated by ", ".
+  auto PrintNumbers = [&OS] (const std::vector<int> &Numbers) -> void {
+    bool NeedSep = false;
+    for (int Num : Numbers) {
+      if (NeedSep)
+        OS << ", ";
+      OS << "BB#" << Num;
+      NeedSep = true;
+    }
+  };
+
+  unsigned NumPreds = BB->pred_size();
+  std::vector<int> PredNums;
+  for (MachineBasicBlock *Pred : BB->predecessors())
+    PredNums.push_back(Pred->getNumber());
+  OS << Print<NodeId>(P.Obj.Id, P.G) << ": --- BB#" << BB->getNumber()
+     << " --- preds(" << NumPreds << "): ";
+  PrintNumbers(PredNums);
+
+  unsigned NumSuccs = BB->succ_size();
+  std::vector<int> SuccNums;
+  for (MachineBasicBlock *Succ : BB->successors())
+    SuccNums.push_back(Succ->getNumber());
+  OS << " succs(" << NumSuccs << "): ";
+  PrintNumbers(SuccNums);
+  OS << '\n';
+
+  for (auto Member : P.Obj.Addr->members(P.G))
+    OS << PrintNode<InstrNode*>(Member, P.G) << '\n';
+  return OS;
+}
+
+// Print the whole graph: a "DFG dump:[" header with the function's id and
+// name, every member block followed by a newline, and a closing "]".
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<FuncNode*>> &P) {
+  OS << "DFG dump:[\n";
+  OS << Print<NodeId>(P.Obj.Id, P.G) << ": Function: "
+     << P.Obj.Addr->getCode()->getName() << '\n';
+  for (auto Block : P.Obj.Addr->members(P.G))
+    OS << PrintNode<BlockNode*>(Block, P.G) << '\n';
+  OS << "]\n";
+  return OS;
+}
+
+// Print a register set as "{ r1 r2 ... }"; each element is preceded by a
+// space, and the closing brace is preceded by a space as well.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) {
+  OS << '{';
+  for (auto RR : P.Obj) {
+    OS << ' ';
+    OS << Print<RegisterRef>(RR, P.G);
+  }
+  OS << " }";
+  return OS;
+}
+
+// Print a register aggregate by delegating to RegisterAggr::print.
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterAggr> &P) {
+  const RegisterAggr &Aggr = P.Obj;
+  Aggr.print(OS);
+  return OS;
+}
+
+// Print a definition stack from top() down to bottom(): each visited entry
+// as "<id><<register ref>>", with single spaces between entries.
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<DataFlowGraph::DefStack> &P) {
+  auto Cur = P.Obj.top();
+  auto End = P.Obj.bottom();
+  while (Cur != End) {
+    OS << Print<NodeId>(Cur->Id, P.G) << '<'
+       << Print<RegisterRef>(Cur->Addr->getRegRef(P.G), P.G) << '>';
+    Cur.down();
+    // Separator only between entries, not after the last one.
+    if (Cur != End)
+      OS << ' ';
+  }
+  return OS;
+}
+
+} // namespace rdf
+} // namespace llvm
+
+// Node allocation functions.
+//
+// Node allocator is like a slab memory allocator: it allocates blocks of
+// memory in sizes that are multiples of the size of a node. Each block has
+// the same size. Nodes are allocated from the currently active block, and
+// when it becomes full, a new one is created.
+// The mapping scheme between a node id and the node's location (its block,
+// and its position within that block) is described in the header file.
+//
+// Allocate a fresh block of NodesPerBlock nodes from the memory pool, record
+// it in Blocks, and make it the active block (ActiveEnd points at its start).
+void NodeAllocator::startNewBlock() {
+  char *Begin = static_cast<char*>(
+      MemPool.Allocate(NodesPerBlock*NodeMemSize, NodeMemSize));
+  Blocks.push_back(Begin);
+  // Check if the block index is still within the allowed range, i.e. less
+  // than 2^N, where N is the number of bits in NodeId for the block index.
+  // BitsPerIndex is the number of bits per node index.
+  assert((Blocks.size() < ((size_t)1 << (8*sizeof(NodeId)-BitsPerIndex))) &&
+         "Out of bits for block index");
+  ActiveEnd = Begin;
+}
+
+// Return true when no node can be allocated from the current block: either
+// there is no block yet, or the active block already holds NodesPerBlock
+// nodes.
+bool NodeAllocator::needNewBlock() {
+  if (!Blocks.empty()) {
+    char *Begin = Blocks.back();
+    uint32_t Used = (ActiveEnd-Begin)/NodeMemSize;
+    return Used >= NodesPerBlock;
+  }
+  return true;
+}
+
+// Hand out the next free node slot, starting a new block first if needed.
+// The returned NodeAddr pairs the slot's address with the id built from the
+// block number and the slot's index within that block.
+NodeAddr<NodeBase*> NodeAllocator::New() {
+  if (needNewBlock())
+    startNewBlock();
+
+  uint32_t BlockNum = Blocks.size()-1;
+  uint32_t SlotNum = (ActiveEnd - Blocks[BlockNum])/NodeMemSize;
+  char *Slot = ActiveEnd;
+  // Advance past the slot that is being handed out.
+  ActiveEnd += NodeMemSize;
+  NodeAddr<NodeBase*> Result = { reinterpret_cast<NodeBase*>(Slot),
+                                 makeId(BlockNum, SlotNum) };
+  return Result;
+}
+
+// Compute the id of the node at address P by searching all blocks for the
+// one whose address range contains P. P must point into one of the blocks.
+NodeId NodeAllocator::id(const NodeBase *P) const {
+  uintptr_t Addr = reinterpret_cast<uintptr_t>(P);
+  for (unsigned BlockNum = 0, NumBlocks = Blocks.size();
+       BlockNum != NumBlocks; ++BlockNum) {
+    uintptr_t Begin = reinterpret_cast<uintptr_t>(Blocks[BlockNum]);
+    bool Inside = Addr >= Begin && Addr < Begin + NodesPerBlock*NodeMemSize;
+    if (Inside) {
+      uint32_t SlotNum = (Addr-Begin)/NodeMemSize;
+      return makeId(BlockNum, SlotNum);
+    }
+  }
+  llvm_unreachable("Invalid node address");
+}
+
+// Drop all nodes: forget every block, reset the memory pool, and leave the
+// allocator without an active block.
+void NodeAllocator::clear() {
+  ActiveEnd = nullptr;
+  Blocks.clear();
+  MemPool.Reset();
+}
+
+
+// Insert node NA after "this" in the circular chain: NA takes over the
+// current successor of "this", and "this" then points to NA.
+void NodeBase::append(NodeAddr<NodeBase*> NA) {
+  // If NA is already "next", do nothing.
+  if (Next == NA.Id)
+    return;
+  NodeId OldNext = Next;
+  Next = NA.Id;
+  NA.Addr->Next = OldNext;
+}
+
+
+// Fundamental node manipulator functions.
+
+// Obtain the register reference from a reference node. Refs with the PhiRef
+// flag hold a packed reference that is expanded with G.unpack; all other
+// refs derive it from the register and sub-register of their machine
+// operand, which must be set.
+RegisterRef RefNode::getRegRef(const DataFlowGraph &G) const {
+  assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+  const bool IsPhiRef = NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef;
+  if (!IsPhiRef) {
+    assert(Ref.Op != nullptr);
+    return G.makeRegRef(Ref.Op->getReg(), Ref.Op->getSubReg());
+  }
+  return G.unpack(Ref.PR);
+}
+
+// Store the register reference RR directly in the node, in the packed form
+// produced by G.pack. Only valid for refs with the PhiRef flag (references
+// in phi nodes).
+void RefNode::setRegRef(RegisterRef RR, DataFlowGraph &G) {
+  assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+  assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef);
+  auto Packed = G.pack(RR);
+  Ref.PR = Packed;
+}
+
+// Make the node refer to machine operand Op. Only valid for refs without
+// the PhiRef flag (references in statement nodes). The graph argument is
+// not used.
+void RefNode::setRegRef(MachineOperand *Op, DataFlowGraph &G) {
+  (void)G;
+  assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+  assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef));
+  Ref.Op = Op;
+}
+
+// Get the owner of a given reference node: follow the "next" links of the
+// circular chain, starting after this node, and return the first node of
+// type Code. Reaching this node again without finding one is an error.
+NodeAddr<NodeBase*> RefNode::getOwner(const DataFlowGraph &G) {
+  for (NodeAddr<NodeBase*> Cur = G.addr<NodeBase*>(getNext());
+       Cur.Addr != this;
+       Cur = G.addr<NodeBase*>(Cur.Addr->getNext())) {
+    if (Cur.Addr->getType() == NodeAttrs::Code)
+      return Cur;
+  }
+  llvm_unreachable("No owner in circular list");
+}
+
+// Connect the def node (whose id is Self) to the reaching def node DA: this
+// node records DA as its reaching def and becomes the new head of DA's
+// reached-def list, with the previous head as its sibling.
+void DefNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) {
+  NodeId OldHead = DA.Addr->getReachedDef();
+  Ref.RD = DA.Id;
+  Ref.Sib = OldHead;
+  DA.Addr->setReachedDef(Self);
+}
+
+// Connect the use node (whose id is Self) to the reaching def node DA: this
+// node records DA as its reaching def and becomes the new head of DA's
+// reached-use list, with the previous head as its sibling.
+void UseNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) {
+  NodeId OldHead = DA.Addr->getReachedUse();
+  Ref.RD = DA.Id;
+  Ref.Sib = OldHead;
+  DA.Addr->setReachedUse(Self);
+}
+
+// Get the first member of the code node, or a default-constructed NodeAddr
+// when the node has no members (FirstM is 0).
+NodeAddr<NodeBase*> CodeNode::getFirstMember(const DataFlowGraph &G) const {
+  const bool HasMembers = Code.FirstM != 0;
+  if (HasMembers)
+    return G.addr<NodeBase*>(Code.FirstM);
+  return NodeAddr<NodeBase*>();
+}
+
+// Get the last member of the code node, or a default-constructed NodeAddr
+// when the node has no members (LastM is 0).
+NodeAddr<NodeBase*> CodeNode::getLastMember(const DataFlowGraph &G) const {
+  const bool HasMembers = Code.LastM != 0;
+  if (HasMembers)
+    return G.addr<NodeBase*>(Code.LastM);
+  return NodeAddr<NodeBase*>();
+}
+
+// Add node NA at the end of the member list of the given code node. The
+// first member added has its "next" link set to the code node itself;
+// later members are appended after the current last member.
+void CodeNode::addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) {
+  NodeAddr<NodeBase*> Last = getLastMember(G);
+  if (Last.Id == 0) {
+    // Empty list: NA becomes the first member as well.
+    Code.FirstM = NA.Id;
+    NodeId Self = G.id(this);
+    NA.Addr->setNext(Self);
+  } else {
+    Last.Addr->append(NA);
+  }
+  Code.LastM = NA.Id;
+}
+
+// Add node NA after member node MA in the given code node. If MA was the
+// last member, NA becomes the last member. The graph argument is not used.
+void CodeNode::addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA,
+      const DataFlowGraph &G) {
+  const bool WasLast = Code.LastM == MA.Id;
+  MA.Addr->append(NA);
+  if (WasLast)
+    Code.LastM = NA.Id;
+}
+
+// Remove member node NA from the given code node.
+// The code node must have at least one member, and NA must be one of its
+// members. FirstM/LastM are updated as needed; NA's own "next" link is
+// left untouched.
+void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) {
+ NodeAddr<NodeBase*> MA = getFirstMember(G);
+ assert(MA.Id != 0);
+
+ // Special handling if the member to remove is the first member.
+ if (MA.Id == NA.Id) {
+ if (Code.LastM == MA.Id) {
+ // If it is the only member, set both first and last to 0.
+ Code.FirstM = Code.LastM = 0;
+ } else {
+ // Otherwise, advance the first member.
+ Code.FirstM = MA.Addr->getNext();
+ }
+ return;
+ }
+
+ // Walk the chain looking for the member whose successor is NA. The walk
+ // ends when it gets back to this code node.
+ while (MA.Addr != this) {
+ NodeId MX = MA.Addr->getNext();
+ if (MX == NA.Id) {
+ // Bypass NA: MA now points to whatever followed NA.
+ MA.Addr->setNext(NA.Addr->getNext());
+ // If the member to remove happens to be the last one, update the
+ // LastM indicator.
+ if (Code.LastM == NA.Id)
+ Code.LastM = MA.Id;
+ return;
+ }
+ MA = G.addr<NodeBase*>(MX);
+ }
+ llvm_unreachable("No such member");
+}
+
+// Return the list of all members of the code node, i.e. the result of
+// members_if with a predicate that accepts every node.
+NodeList CodeNode::members(const DataFlowGraph &G) const {
+  auto AcceptAll = [] (NodeAddr<NodeBase*>) -> bool { return true; };
+  return members_if(AcceptAll, G);
+}
+
+// Return the owner of the given instr node: follow the "next" links of the
+// circular chain, starting after this node, and return the first node of
+// kind Block. All nodes visited on the way must be code nodes.
+NodeAddr<NodeBase*> InstrNode::getOwner(const DataFlowGraph &G) {
+  for (NodeAddr<NodeBase*> Cur = G.addr<NodeBase*>(getNext());
+       Cur.Addr != this;
+       Cur = G.addr<NodeBase*>(Cur.Addr->getNext())) {
+    assert(Cur.Addr->getType() == NodeAttrs::Code);
+    if (Cur.Addr->getKind() == NodeAttrs::Block)
+      return Cur;
+  }
+  llvm_unreachable("No owner in circular list");
+}
+
+// Add the phi node PA to the given block node, keeping all phis ahead of
+// the statements: PA goes after the last existing phi, or to the very front
+// when the block starts with a statement.
+void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) {
+  NodeAddr<NodeBase*> First = getFirstMember(G);
+  // Empty block: PA simply becomes the only member.
+  if (First.Id == 0) {
+    addMember(PA, G);
+    return;
+  }
+
+  assert(First.Addr->getType() == NodeAttrs::Code);
+  if (First.Addr->getKind() == NodeAttrs::Stmt) {
+    // If the first member of the block is a statement, insert the phi as
+    // the first member.
+    Code.FirstM = PA.Id;
+    PA.Addr->setNext(First.Id);
+    return;
+  }
+
+  // If the first member is a phi, find the last phi, and append PA to it.
+  assert(First.Addr->getKind() == NodeAttrs::Phi);
+  NodeAddr<NodeBase*> LastPhi = First;
+  for (;;) {
+    NodeAddr<NodeBase*> Succ = G.addr<NodeBase*>(LastPhi.Addr->getNext());
+    assert(Succ.Addr->getType() == NodeAttrs::Code);
+    if (Succ.Addr->getKind() != NodeAttrs::Phi)
+      break;
+    LastPhi = Succ;
+  }
+  addMemberAfter(LastPhi, PA, G);
+}
+
+// Find the block node corresponding to the machine basic block BB in the
+// given func node. Returns a default-constructed NodeAddr when no member
+// refers to BB.
+NodeAddr<BlockNode*> FuncNode::findBlock(const MachineBasicBlock *BB,
+      const DataFlowGraph &G) const {
+  auto RefersToBB = [BB] (NodeAddr<NodeBase*> NA) -> bool {
+    return NodeAddr<BlockNode*>(NA).Addr->getCode() == BB;
+  };
+  NodeList Matches = members_if(RefersToBB, G);
+  if (Matches.empty())
+    return NodeAddr<BlockNode*>();
+  return Matches[0];
+}
+
+// Get the block node for the entry block in the given function, i.e. the
+// node found by findBlock for the first basic block of the function.
+NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) {
+  MachineBasicBlock &Front = getCode()->front();
+  return findBlock(&Front, G);
+}
+
+
+// Target operand information.
+//
+
+// For a given instruction, check if there are any bits of the defined
+// register that can remain unchanged across this def. This default answers
+// "true" exactly when TII reports the instruction as predicated; OpNum is
+// not consulted.
+bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum)
+      const {
+  const bool Predicated = TII.isPredicated(In);
+  return Predicated;
+}
+
+// Check if the definition in operand OpNum produces an unspecified value.
+// This default answers "true" only for implicit operands of calls.
+bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum)
+      const {
+  if (!In.isCall())
+    return false;
+  return In.getOperand(OpNum).isImplicit();
+}
+
+// Check if the given instruction specifically requires the register in
+// operand OpNum (a "fixed" register). All operands of calls, returns,
+// inline asm and tail calls are treated as fixed. Otherwise the operand is
+// fixed when it has no sub-register and its register appears in the
+// descriptor's implicit-def list (for defs) or implicit-use list (for uses).
+bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
+      const {
+  if (In.isCall() || In.isReturn() || In.isInlineAsm())
+    return true;
+  // Check for a tail call: a branch with a global or symbol operand.
+  if (In.isBranch()) {
+    for (const MachineOperand &MO : In.operands()) {
+      if (MO.isGlobal() || MO.isSymbol())
+        return true;
+    }
+  }
+
+  const MCInstrDesc &Desc = In.getDesc();
+  const bool HasImplicitLists = Desc.getImplicitDefs() ||
+                                Desc.getImplicitUses();
+  if (!HasImplicitLists)
+    return false;
+
+  const MachineOperand &Op = In.getOperand(OpNum);
+  // If there is a sub-register, treat the operand as non-fixed. Currently,
+  // fixed registers are those that are listed in the descriptor as implicit
+  // uses or defs, and those lists do not allow sub-registers.
+  if (Op.getSubReg() != 0)
+    return false;
+
+  RegisterId Reg = Op.getReg();
+  const MCPhysReg *ImpList = Op.isDef() ? Desc.getImplicitDefs()
+                                        : Desc.getImplicitUses();
+  if (!ImpList)
+    return false;
+  // The list is terminated by a zero entry.
+  for (; *ImpList; ++ImpList) {
+    if (*ImpList == Reg)
+      return true;
+  }
+  return false;
+}
+
+
+// Express RR in terms of a register without super-registers: starting from
+// RR.Reg, keep moving to the first super-register reported by
+// MCSuperRegIterator until there is none. The lane mask is limited to the
+// lanes of RR.Reg's minimal register class and translated through the
+// sub-register index of RR.Reg within the final register.
+RegisterRef RegisterAggr::normalize(RegisterRef RR) const {
+  RegisterId Top = RR.Reg;
+  for (;;) {
+    MCSuperRegIterator SR(Top, &TRI, false);
+    if (!SR.isValid())
+      break;
+    Top = *SR;
+  }
+
+  const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
+  LaneBitmask InClass = RR.Mask & RC.LaneMask;
+  uint32_t SubIdx = TRI.getSubRegIndex(Top, RR.Reg);
+  LaneBitmask TopMask = TRI.composeSubRegIndexLaneMask(SubIdx, InClass);
+  return RegisterRef(Top, TopMask);
+}
+
+// Check if the aggregate contains anything aliased with RR: either the
+// normalized RR shares lanes with a stored mask, or (when explicit-alias
+// units have been recorded) one of RR.Reg's register units is among them.
+bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
+  RegisterRef NR = normalize(RR);
+  auto Found = Masks.find(NR.Reg);
+  if (Found != Masks.end() && (Found->second & NR.Mask).any())
+    return true;
+
+  if (!CheckUnits)
+    return false;
+  for (MCRegUnitIterator U(RR.Reg, &TRI); U.isValid(); ++U) {
+    if (ExpAliasUnits.test(*U))
+      return true;
+  }
+  return false;
+}
+
+// Check if the aggregate covers all lanes of the normalized RR.
+bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
+  RegisterRef NR = normalize(RR);
+  // Always have a cover for empty lane mask.
+  if (NR.Mask.none())
+    return true;
+  auto Found = Masks.find(NR.Reg);
+  return Found != Masks.end() && (NR.Mask & Found->second) == NR.Mask;
+}
+
+// Add RR to the aggregate: merge the lanes of the normalized RR into the
+// stored mask of its register, and record which of RR.Reg's register units
+// have more than one root. Returns *this to allow chaining.
+RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
+  RegisterRef NR = normalize(RR);
+  auto Found = Masks.find(NR.Reg);
+  if (Found != Masks.end())
+    Found->second |= NR.Mask;
+  else
+    Masks.insert({NR.Reg, NR.Mask});
+
+  // Visit all register units to see if there are any that were created
+  // by explicit aliases. Add those that were to the bit vector.
+  for (MCRegUnitIterator U(RR.Reg, &TRI); U.isValid(); ++U) {
+    MCRegUnitRootIterator Root(*U, &TRI);
+    // Step past the first root; a unit with a second root is recorded.
+    ++Root;
+    if (Root.isValid()) {
+      ExpAliasUnits.set(*U);
+      CheckUnits = true;
+    }
+  }
+  return *this;
+}
+
+// Add every (register, lane mask) entry of RG to this aggregate. Returns
+// *this to allow chaining.
+RegisterAggr &RegisterAggr::insert(const RegisterAggr &RG) {
+  for (std::pair<RegisterId,LaneBitmask> Entry : RG.Masks) {
+    RegisterRef RR(Entry.first, Entry.second);
+    insert(RR);
+  }
+  return *this;
+}
+
+// Remove the lanes of the normalized RR from the aggregate. An entry whose
+// mask becomes empty is erased. Returns *this to allow chaining.
+RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
+  RegisterRef NR = normalize(RR);
+  auto Found = Masks.find(NR.Reg);
+  if (Found != Masks.end()) {
+    LaneBitmask Remaining = Found->second & ~NR.Mask;
+    if (Remaining.none())
+      Masks.erase(Found);
+    else
+      Found->second = Remaining;
+  }
+  return *this;
+}
+
+// Remove every (register, lane mask) entry of RG from this aggregate.
+// Returns *this to allow chaining.
+RegisterAggr &RegisterAggr::clear(const RegisterAggr &RG) {
+  for (std::pair<RegisterId,LaneBitmask> Entry : RG.Masks) {
+    RegisterRef RR(Entry.first, Entry.second);
+    clear(RR);
+  }
+  return *this;
+}
+
+// Return what is left of RR after removing everything contained in this
+// aggregate: the first entry of a temporary aggregate holding RR minus
+// *this, or a default-constructed RegisterRef when nothing is left.
+RegisterRef RegisterAggr::clearIn(RegisterRef RR) const {
+  RegisterAggr Rest(TRI);
+  Rest.insert(RR);
+  Rest.clear(*this);
+  if (Rest.empty())
+    return RegisterRef();
+  auto First = Rest.begin();
+  return RegisterRef(First->first, First->second);
+}
+
+// Print the aggregate as "{ r1 r2 ... }": each register is preceded by a
+// space and followed by its optional lane mask.
+void RegisterAggr::print(raw_ostream &OS) const {
+  OS << '{';
+  for (auto Entry : Masks) {
+    OS << ' ';
+    OS << PrintReg(Entry.first, &TRI) << PrintLaneMaskOpt(Entry.second);
+  }
+  OS << " }";
+}
+
+
+//
+// The data flow graph construction.
+//
+
+// Construct a data flow graph object for machine function mf. The
+// constructor only stores references to the function and to the supplied
+// target/analysis objects and default-constructs LMI; nothing else is done
+// here.
+DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
+ const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
+ const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi)
+ : LMI(), MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), TOI(toi) {
+}
+
+
+// The implementation of the definition stack.
+// Each register reference has its own definition stack. In particular,
+// the register references "Reg" and "Reg:subreg" will each have their
+// own definition stacks.
+
+// Construct a stack iterator over S, positioned either at the top of the
+// stack (Top is true) or at the bottom (Top is false).
+DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S,
+      bool Top) : DS(S) {
+  if (Top) {
+    // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty).
+    Pos = DS.Stack.size();
+    while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos-1]))
+      Pos--;
+  } else {
+    // Initialize to bottom.
+    Pos = 0;
+  }
+}
+
+// Return the number of entries visited when walking the stack from top()
+// down to bottom().
+// NOTE(review): this was described as "including block delimiters", but
+// nextDown skips delimiters; if Iterator::down() is built on nextDown, then
+// delimiters are not counted here -- confirm against the header.
+unsigned DataFlowGraph::DefStack::size() const {
+  unsigned Count = 0;
+  auto Cur = top();
+  auto End = bottom();
+  while (Cur != End) {
+    Count++;
+    Cur.down();
+  }
+  return Count;
+}
+
+// Remove the top entry from the stack, together with any delimiters that
+// lie directly below it, so that afterwards the stack is either empty, or
+// the top of the stack is a non-delimiter. The stack must not be empty.
+void DataFlowGraph::DefStack::pop() {
+  assert(!empty());
+  Stack.resize(nextDown(Stack.size()));
+}
+
+// Mark the beginning of block node N on this stack by pushing a delimiter
+// entry: a node address with a null pointer and the id N. N must be a
+// valid (non-zero) node id.
+void DataFlowGraph::DefStack::start_block(NodeId N) {
+  assert(N != 0);
+  NodeAddr<DefNode*> Delimiter(nullptr, N);
+  Stack.push_back(Delimiter);
+}
+
+// Remove entries from the top of the stack until the delimiter for block
+// node N has been removed as well. In effect this drops every definition
+// pushed since start_block(N). If no delimiter for N is present, the whole
+// stack is emptied.
+void DataFlowGraph::DefStack::clear_block(NodeId N) {
+  assert(N != 0);
+  // Keep is the number of entries that survive. It is lowered one entry at
+  // a time, stopping right after the delimiter for N has been excluded.
+  unsigned Keep = Stack.size();
+  while (Keep > 0) {
+    --Keep;
+    if (isDelimiter(Stack[Keep], N))
+      break;
+  }
+  Stack.resize(Keep);
+}
+
+// Return the position of the next non-delimiter entry above position P. Position P designates the element Stack[P-1].
+unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const {
+ // Get the next valid position after P (skipping all delimiters).
+ // The input position P does not have to point to a non-delimiter.
+ unsigned SS = Stack.size();
+ bool IsDelim;
+ assert(P < SS); // There must be at least one entry above P.
+ do {
+ P++;
+ IsDelim = isDelimiter(Stack[P-1]); // Examine the entry at the new position.
+ } while (P < SS && IsDelim);
+ assert(!IsDelim); // The walk is expected to end on a non-delimiter.
+ return P;
+}
+
+// Return the position of the closest non-delimiter entry below position P, or 0 when the bottom is reached. Position P designates Stack[P-1].
+unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
+ // Get the preceding valid position before P (skipping all delimiters).
+ // The input position P does not have to point to a non-delimiter.
+ assert(P > 0 && P <= Stack.size());
+ bool IsDelim = isDelimiter(Stack[P-1]);
+ do {
+ if (--P == 0) // Bottom reached: 0 is the result.
+ break;
+ IsDelim = isDelimiter(Stack[P-1]);
+ } while (P > 0 && IsDelim);
+ assert(!IsDelim); // IsDelim describes the last entry examined; it is not expected to be a delimiter.
+ return P;
+}
+
+
+// Register information.
+
+// Return the set of registers aliased to the physical register Reg, each
+// wrapped in a RegisterRef constructed from the register alone (lane masks
+// are not taken into account). Reg itself is not part of the result: the
+// alias iterator is created with 'false' as its last argument so that the
+// register is not reported as its own alias.
+RegisterSet DataFlowGraph::getAliasSet(RegisterId Reg) const {
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+
+  RegisterSet Aliases;
+  MCRegAliasIterator AI(Reg, &TRI, false);
+  while (AI.isValid()) {
+    Aliases.insert(RegisterRef(*AI));
+    ++AI;
+  }
+  return Aliases;
+}
+
+// Return the set of registers reported by the target lowering as the
+// exception pointer register and the exception selector register for this
+// function's personality function (nullptr is passed if the function has
+// none). A register id of 0 is treated as "no register" and is skipped.
+RegisterSet DataFlowGraph::getLandingPadLiveIns() const {
+  const Function &Fn = *MF.getFunction();
+  const Constant *Personality = nullptr;
+  if (Fn.hasPersonalityFn())
+    Personality = Fn.getPersonalityFn();
+
+  const TargetLowering &Lowering = *MF.getSubtarget().getTargetLowering();
+  RegisterSet LiveIns;
+
+  RegisterId PtrReg = Lowering.getExceptionPointerRegister(Personality);
+  if (PtrReg)
+    LiveIns.insert(RegisterRef(PtrReg));
+
+  RegisterId SelReg = Lowering.getExceptionSelectorRegister(Personality);
+  if (SelReg)
+    LiveIns.insert(RegisterRef(SelReg));
+
+  return LiveIns;
+}
+
+// Node management functions.
+
+// Translate the node id N into a pointer to the node. The id 0 stands for
+// "no node" and maps to nullptr; every other id is resolved by the node
+// allocator.
+NodeBase *DataFlowGraph::ptr(NodeId N) const {
+  return N != 0 ? Memory.ptr(N) : nullptr;
+}
+
+// Translate the node pointer P into its node id. A null pointer maps to
+// the id 0; every other pointer is resolved by the node allocator.
+NodeId DataFlowGraph::id(const NodeBase *P) const {
+  if (P != nullptr)
+    return Memory.id(P);
+  return 0;
+}
+
+// Obtain a fresh node from the node allocator, initialize it, and set its
+// attributes to Attrs. Returns the address (pointer and id) of the node.
+NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) {
+  NodeAddr<NodeBase*> Fresh = Memory.New();
+  NodeBase *Node = Fresh.Addr;
+  Node->init();
+  Node->setAttrs(Attrs);
+  return Fresh;
+}
+
+// Create a new node whose contents are a byte-wise copy of node B, and
+// then reset the data-flow links of the copy to 0:
+// - for any reference node: the reaching def and the sibling,
+// - for a def node, additionally: the reached def and the reached use.
+// Nodes that are not reference nodes are copied unchanged.
+NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) {
+  NodeAddr<NodeBase*> Copy = newNode(0);
+  memcpy(Copy.Addr, B.Addr, sizeof(NodeBase));
+
+  // Only ref nodes carry data-flow links.
+  if (Copy.Addr->getType() != NodeAttrs::Ref)
+    return Copy;
+
+  NodeAddr<RefNode*> Ref = Copy;
+  Ref.Addr->setReachingDef(0);
+  Ref.Addr->setSibling(0);
+
+  // Defs have two more links, to the nodes that they reach.
+  if (Copy.Addr->getKind() != NodeAttrs::Def)
+    return Copy;
+
+  NodeAddr<DefNode*> Def = Copy;
+  Def.Addr->setReachedDef(0);
+  Def.Addr->setReachedUse(0);
+  return Copy;
+}
+
+
+// Allocation routines for specific node types/kinds.
+
+// Create a use node for the machine operand Op with the additional
+// attribute bits given in Flags. Owner is not used by this function; the
+// new node is not added to any instruction node here.
+NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner,
+      MachineOperand &Op, uint16_t Flags) {
+  const uint16_t Attrs = NodeAttrs::Ref | NodeAttrs::Use | Flags;
+  NodeAddr<UseNode*> Use = newNode(Attrs);
+  Use.Addr->setRegRef(&Op, *this);
+  return Use;
+}
+
+// Create a phi use node for the register reference RR that corresponds to
+// the predecessor block PredB. Flags must contain NodeAttrs::PhiRef.
+// Owner is not used by this function; the new node is not added to any
+// phi node here.
+NodeAddr<PhiUseNode*> DataFlowGraph::newPhiUse(NodeAddr<PhiNode*> Owner,
+      RegisterRef RR, NodeAddr<BlockNode*> PredB, uint16_t Flags) {
+  const uint16_t Attrs = NodeAttrs::Ref | NodeAttrs::Use | Flags;
+  NodeAddr<PhiUseNode*> PhiUse = newNode(Attrs);
+  assert(Flags & NodeAttrs::PhiRef);
+  PhiUseNode *Node = PhiUse.Addr;
+  Node->setRegRef(RR, *this);
+  Node->setPredecessor(PredB.Id);
+  return PhiUse;
+}
+
+// Create a def node for the machine operand Op with the additional
+// attribute bits given in Flags. Owner is not used by this function; the
+// new node is not added to any instruction node here.
+NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner,
+      MachineOperand &Op, uint16_t Flags) {
+  const uint16_t Attrs = NodeAttrs::Ref | NodeAttrs::Def | Flags;
+  NodeAddr<DefNode*> Def = newNode(Attrs);
+  Def.Addr->setRegRef(&Op, *this);
+  return Def;
+}
+
+// Create a def node for the register reference RR (i.e. a def that does
+// not come from a machine operand). Flags must contain NodeAttrs::PhiRef.
+// Owner is not used by this function; the new node is not added to any
+// instruction node here.
+NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner,
+      RegisterRef RR, uint16_t Flags) {
+  const uint16_t Attrs = NodeAttrs::Ref | NodeAttrs::Def | Flags;
+  NodeAddr<DefNode*> Def = newNode(Attrs);
+  assert(Flags & NodeAttrs::PhiRef);
+  Def.Addr->setRegRef(RR, *this);
+  return Def;
+}
+
+// Create a new phi node and register it with the block node Owner via
+// addPhi. Returns the address of the new phi node.
+NodeAddr<PhiNode*> DataFlowGraph::newPhi(NodeAddr<BlockNode*> Owner) {
+  const uint16_t Attrs = NodeAttrs::Code | NodeAttrs::Phi;
+  NodeAddr<PhiNode*> Phi = newNode(Attrs);
+  Owner.Addr->addPhi(Phi, *this);
+  return Phi;
+}
+
+// Create a new statement node for the machine instruction MI and add it
+// as a member of the block node Owner. Returns the address of the new
+// statement node.
+NodeAddr<StmtNode*> DataFlowGraph::newStmt(NodeAddr<BlockNode*> Owner,
+      MachineInstr *MI) {
+  const uint16_t Attrs = NodeAttrs::Code | NodeAttrs::Stmt;
+  NodeAddr<StmtNode*> Stmt = newNode(Attrs);
+  Stmt.Addr->setCode(MI);
+  Owner.Addr->addMember(Stmt, *this);
+  return Stmt;
+}
+
+// Create a new block node for the machine basic block BB and add it as a
+// member of the function node Owner. Returns the address of the new block
+// node.
+NodeAddr<BlockNode*> DataFlowGraph::newBlock(NodeAddr<FuncNode*> Owner,
+      MachineBasicBlock *BB) {
+  const uint16_t Attrs = NodeAttrs::Code | NodeAttrs::Block;
+  NodeAddr<BlockNode*> Block = newNode(Attrs);
+  Block.Addr->setCode(BB);
+  Owner.Addr->addMember(Block, *this);
+  return Block;
+}
+
+// Create a new function node that refers to the machine function MF.
+// Returns the address of the new function node.
+NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) {
+  const uint16_t Attrs = NodeAttrs::Code | NodeAttrs::Func;
+  NodeAddr<FuncNode*> Fn = newNode(Attrs);
+  Fn.Addr->setCode(MF);
+  return Fn;
+}
+
+// Build the data flow graph for MF from scratch. Options is a mask of BuildOptions flags; KeepDeadPhis suppresses the final removal of unused phis.
+void DataFlowGraph::build(unsigned Options) {
+ reset(); // Discard any previously built graph.
+ Func = newFunc(&MF);
+
+ if (MF.empty()) // No blocks: the graph consists of the function node only.
+ return;
+
+ for (MachineBasicBlock &B : MF) {
+ NodeAddr<BlockNode*> BA = newBlock(Func, &B);
+ BlockNodes.insert(std::make_pair(&B, BA)); // Remember the block node of each basic block.
+ for (MachineInstr &I : B) {
+ if (I.isDebugValue()) // Debug values do not get statement nodes.
+ continue;
+ buildStmt(BA, I);
+ }
+ }
+
+ NodeAddr<BlockNode*> EA = Func.Addr->getEntryBlock(*this);
+ NodeList Blocks = Func.Addr->members(*this);
+
+ // Collect information about block references.
+ BlockRefsMap RefM;
+ buildBlockRefs(EA, RefM);
+
+ // Add function-entry phi nodes: one phi with a single def in the entry block for each live-in register.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) {
+ NodeAddr<PhiNode*> PA = newPhi(EA);
+ RegisterRef RR = RegisterRef(I->first);
+ uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+ NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+ PA.Addr->addMember(DA, *this);
+ }
+
+ // Add phis for landing pads.
+ // Landing pads, unlike usual basic blocks, are not entered through
+ // branches in the program, or fall-throughs from other blocks. They
+ // are entered from the exception handling runtime and target's ABI
+ // may define certain registers as defined on entry to such a block.
+ RegisterSet EHRegs = getLandingPadLiveIns();
+ if (!EHRegs.empty()) {
+ for (NodeAddr<BlockNode*> BA : Blocks) {
+ const MachineBasicBlock &B = *BA.Addr->getCode();
+ if (!B.isEHPad()) // Only landing pads are of interest here.
+ continue;
+
+ // Prepare a list of NodeIds of the block's predecessors.
+ NodeList Preds;
+ for (MachineBasicBlock *PB : B.predecessors())
+ Preds.push_back(findBlock(PB));
+
+ // Build phi nodes for each live-in.
+ for (RegisterRef RR : EHRegs) {
+ NodeAddr<PhiNode*> PA = newPhi(BA);
+ uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+ // Add def:
+ NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+ PA.Addr->addMember(DA, *this);
+ // Add uses (no reaching defs for phi uses):
+ for (NodeAddr<BlockNode*> PBA : Preds) {
+ NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
+ PA.Addr->addMember(PUA, *this);
+ }
+ }
+ }
+ }
+
+ // Build a map "PhiM" which will contain, for each block, the set
+ // of references that will require phi definitions in that block.
+ BlockRefsMap PhiM;
+ for (NodeAddr<BlockNode*> BA : Blocks)
+ recordDefsForDF(PhiM, RefM, BA);
+ for (NodeAddr<BlockNode*> BA : Blocks) // PhiM is complete only after the loop above, hence the separate pass.
+ buildPhis(PhiM, RefM, BA);
+
+ // Link all the refs. This will recursively traverse the dominator tree.
+ DefStackMap DM;
+ linkBlockRefs(DM, EA);
+
+ // Finally, remove all unused phi nodes.
+ if (!(Options & BuildOptions::KeepDeadPhis))
+ removeUnusedPhis();
+}
+
+// Create a register reference for the physical register Reg. If Sub is a
+// non-zero subregister index, the reference is made to the corresponding
+// subregister of Reg instead. The reference is constructed from the
+// register alone (no explicit lane mask).
+RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const {
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+  const unsigned Phys = (Sub == 0) ? Reg : TRI.getSubReg(Reg, Sub);
+  return RegisterRef(Phys);
+}
+
+// Express the register reference RR in terms of a top-level super-register:
+// repeatedly step to the first super-register reported by
+// MCSuperRegIterator until there is none, and translate the lane mask of RR
+// into the lanes of that super-register.
+// FIXME copied from RegisterAggr
+RegisterRef DataFlowGraph::normalizeRef(RegisterRef RR) const {
+  RegisterId Top = RR.Reg;
+  for (;;) {
+    MCSuperRegIterator Up(Top, &TRI, false);
+    if (Up.isValid()) {
+      Top = *Up;
+      continue;
+    }
+    break;
+  }
+
+  // Subregister index of RR.Reg within Top, used to map the lanes of
+  // RR.Reg's register class into the lanes of Top.
+  const uint32_t SubIdx = TRI.getSubRegIndex(Top, RR.Reg);
+  const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
+  LaneBitmask ClassLanes = TRI.composeSubRegIndexLaneMask(SubIdx, RC.LaneMask);
+  return RegisterRef(Top, RR.Mask & ClassLanes);
+}
+
+// Return the part of AR that is shared with BR.
+// - If both refer to the same register, the result is that register with
+//   the intersection of the two lane masks, or an empty reference if the
+//   intersection is empty.
+// - Otherwise, the result is AR if the two registers overlap, and an empty
+//   reference if they do not.
+RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const {
+  if (AR.Reg != BR.Reg) {
+#ifndef NDEBUG
+    // Different registers are expected to normalize to different
+    // super-registers.
+    RegisterRef NAR = normalizeRef(AR);
+    RegisterRef NBR = normalizeRef(BR);
+    assert(NAR.Reg != NBR.Reg);
+#endif
+    // This isn't strictly correct, because the overlap may happen in the
+    // part masked out.
+    return TRI.regsOverlap(AR.Reg, BR.Reg) ? AR : RegisterRef();
+  }
+
+  LaneBitmask Common = AR.Mask & BR.Mask;
+  if (Common.any())
+    return RegisterRef(AR.Reg, Common);
+  return RegisterRef();
+}
+
+// Push the delimiter for block B on every definition stack that currently
+// exists in DefM.
+void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) {
+  for (auto &Entry : DefM)
+    Entry.second.start_block(B);
+}
+
+// Remove from every stack in DefM the definitions that came from block B,
+// and then delete the stacks that have become empty.
+void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) {
+  // Stacks that were created while B was being processed do not have a
+  // delimiter for B. For those, clear_block removes all entries.
+  for (auto &Entry : DefM)
+    Entry.second.clear_block(B);
+
+  // Erase the empty stacks. The iterator is advanced before the erase so
+  // that it never refers to the element being removed; erasing is relied
+  // upon to keep the remaining iterators valid.
+  auto Cur = DefM.begin();
+  auto End = DefM.end();
+  while (Cur != End) {
+    auto Candidate = Cur++;
+    if (Candidate->second.empty())
+      DefM.erase(Candidate);
+  }
+}
+
+// Push the definitions from the instruction node IA on the stacks in DefM:
+// one def per group of related defs, on the stack of its register and on the stacks of all registers aliased to it.
+void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+ NodeList Defs = IA.Addr->members_if(IsDef, *this);
+ NodeSet Visited; // Ids of the defs that have already been handled as part of a related group.
+#ifndef NDEBUG
+ RegisterSet Defined; // Debug-only: references defined so far by this instruction.
+#endif
+
+ // The important objectives of this function are:
+ // - to be able to handle instructions both while the graph is being
+ // constructed, and after the graph has been constructed, and
+ // - maintain proper ordering of definitions on the stack for each
+ // register reference:
+ // - if there are two or more related defs in IA (i.e. coming from
+ // the same machine operand), then only push one def on the stack,
+ // - if there are multiple unrelated defs of non-overlapping
+ // subregisters of S, then the stack for S will have both (in an
+ // unspecified order), but the order does not matter from the
+ // data-flow perspective.
+
+ for (NodeAddr<DefNode*> DA : Defs) {
+ if (Visited.count(DA.Id))
+ continue;
+
+ NodeList Rel = getRelatedRefs(IA, DA); // The list starts with DA itself.
+ NodeAddr<DefNode*> PDA = Rel.front();
+ RegisterRef RR = PDA.Addr->getRegRef(*this);
+#ifndef NDEBUG
+ // Assert if the register is defined in two or more unrelated defs.
+ // This could happen if there are two or more def operands defining it.
+ if (!Defined.insert(RR).second) {
+ MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
+ dbgs() << "Multiple definitions of register: "
+ << Print<RegisterRef>(RR, *this) << " in\n " << *MI
+ << "in BB#" << MI->getParent()->getNumber() << '\n';
+ llvm_unreachable(nullptr);
+ }
+#endif
+ // Push the definition on the stack for the register and all aliases.
+ // The def stack traversal in linkNodeUp will check the exact aliasing.
+ DefM[RR.Reg].push(DA); // operator[] creates the stack if it does not exist yet.
+ for (RegisterRef A : getAliasSet(RR.Reg /*FIXME? use RegisterRef*/)) {
+ // Check that we don't push the same def twice.
+ assert(A != RR);
+ DefM[A.Reg].push(DA);
+ }
+ // Mark all the related defs as visited.
+ for (NodeAddr<NodeBase*> T : Rel)
+ Visited.insert(T.Id);
+ }
+}
+
+// Collect RA and all reference nodes in IA that are related to it (see
+// "getNextRelated" for what "related" means). The list starts with RA and
+// continues in the order produced by getNextRelated, until that function
+// returns a null address or comes back to RA.
+NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  NodeList Related;
+  const NodeId First = RA.Id;
+  for (;;) {
+    Related.push_back(RA);
+    RA = getNextRelated(IA, RA);
+    if (RA.Id == 0 || RA.Id == First)
+      break;
+  }
+  return Related;
+}
+
+// Return true if RA and RB overlap, false otherwise. Both must be physical registers; the check walks their register units in parallel and compares lane masks on shared units.
+bool DataFlowGraph::alias(RegisterRef RA, RegisterRef RB) const {
+ assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg));
+ assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg));
+
+ MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
+ MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
+ // Reg units are returned in the numerical order.
+ while (UMA.isValid() && UMB.isValid()) {
+ std::pair<uint32_t,LaneBitmask> PA = *UMA; // (register unit, lane mask of that unit in RA.Reg)
+ std::pair<uint32_t,LaneBitmask> PB = *UMB; // (register unit, lane mask of that unit in RB.Reg)
+ if (PA.first == PB.first) {
+ // Lane mask of 0 (given by the iterator) should be treated as "full".
+ // This can happen when the register has only one unit, or when the
+ // unit corresponds to explicit aliasing. In such cases, the lane mask
+ // from RegisterRef should be ignored.
+ if (PA.second.none() || PB.second.none())
+ return true;
+
+ // At this point the common unit corresponds to a subregister. The lane
+ // masks correspond to the lane mask of that unit within the original
+ // register, for example assuming register quadruple q0 = r3:0, and
+ // a register pair d1 = r3:2, the lane mask of r2 in q0 may be 0b0100,
+ // while the lane mask of r2 in d1 may be 0b0001.
+ LaneBitmask LA = PA.second & RA.Mask; // Lanes of the unit that RA actually references.
+ LaneBitmask LB = PB.second & RB.Mask; // Lanes of the unit that RB actually references.
+ if (LA.any() && LB.any()) {
+ unsigned Root = *MCRegUnitRootIterator(PA.first, &TRI);
+ // If register units were guaranteed to only have 1 bit in any lane
+ // mask, the code below would not be necessary. This is because LA
+ // and LB would have at most 1 bit set each, and that bit would be
+ // guaranteed to correspond to the given register unit.
+ uint32_t SubA = TRI.getSubRegIndex(RA.Reg, Root);
+ uint32_t SubB = TRI.getSubRegIndex(RB.Reg, Root);
+ const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(Root);
+ LaneBitmask MaskA = TRI.reverseComposeSubRegIndexLaneMask(SubA, LA); // LA translated into the lanes of Root.
+ LaneBitmask MaskB = TRI.reverseComposeSubRegIndexLaneMask(SubB, LB); // LB translated into the lanes of Root.
+ if ((MaskA & MaskB & RC.LaneMask).any())
+ return true;
+ }
+
+ ++UMA;
+ ++UMB;
+ continue;
+ }
+ if (PA.first < PB.first) // Advance whichever iterator is behind.
+ ++UMA;
+ else if (PB.first < PA.first)
+ ++UMB;
+ }
+ return false;
+}
+
+
+// Return the graph to the empty state: release the node storage, forget
+// the mapping from basic blocks to block nodes, and set the function node
+// to a default-constructed (null) address.
+void DataFlowGraph::reset() {
+  Memory.clear();
+  BlockNodes.clear();
+  NodeAddr<FuncNode*> NoFunc;
+  Func = NoFunc;
+}
+
+
+// Return the next reference node in the instruction node IA that is related
+// to RA. Conceptually, two reference nodes are related if they refer to the
+// same instance of a register access, but differ in flags or other minor
+// characteristics. Shadow reference nodes are a specific example.
+//
+// In terms of the checks made here, a node TA is related to RA if it has
+// the same kind (def/use) and the same register reference, and
+// - in a statement node: it refers to the same machine operand,
+// - in a phi node: it is a def, or it is a use with the same predecessor
+//   block as RA.
+// Return the equivalent of nullptr if there are no more related references.
+NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  // Common part: same kind and same register reference.
+  auto SameRef = [this,RA](NodeAddr<RefNode*> TA) -> bool {
+    if (TA.Addr->getKind() == RA.Addr->getKind())
+      return !(TA.Addr->getRegRef(*this) != RA.Addr->getRegRef(*this));
+    return false;
+  };
+  // Statements: additionally require the same machine operand.
+  auto InStmt = [&SameRef,RA](NodeAddr<RefNode*> TA) -> bool {
+    if (!SameRef(TA))
+      return false;
+    return &RA.Addr->getOp() == &TA.Addr->getOp();
+  };
+  // Phis: for uses, additionally require the same predecessor block.
+  auto InPhi = [&SameRef,RA](NodeAddr<RefNode*> TA) -> bool {
+    if (!SameRef(TA))
+      return false;
+    if (TA.Addr->getKind() == NodeAttrs::Use) {
+      const NodeAddr<const PhiUseNode*> TUse = TA;
+      const NodeAddr<const PhiUseNode*> RUse = RA;
+      return TUse.Addr->getPredecessor() == RUse.Addr->getPredecessor();
+    }
+    return true;
+  };
+
+  RegisterRef RR = RA.Addr->getRegRef(*this);
+  const bool IsStmt = IA.Addr->getKind() == NodeAttrs::Stmt;
+  if (!IsStmt)
+    return RA.Addr->getNextRef(RR, InPhi, true, *this);
+  return RA.Addr->getNextRef(RR, InStmt, true, *this);
+}
+
+// Walk the nodes related to RA in IA, looking for the first one that
+// satisfies the predicate P.
+// - If such a node is found, return a pair whose second element is that
+//   node and whose first element is the related node visited just before
+//   it.
+// - If the walk ends (null address, or back at the starting node) without
+//   a match, return a pair whose first element is the last node visited,
+//   i.e. the node after which a new one should be inserted, and whose
+//   second element is a null address.
+template <typename Predicate>
+std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
+DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+      Predicate P) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  const NodeId First = RA.Id;
+  for (;;) {
+    NodeAddr<RefNode*> Next = getNextRelated(IA, RA);
+    // End of the chain of related nodes: nothing was found.
+    if (Next.Id == 0 || Next.Id == First)
+      return std::make_pair(RA, NodeAddr<RefNode*>());
+    if (P(Next))
+      return std::make_pair(RA, Next);
+    RA = Next;
+  }
+}
+
+// Return the next node related to RA in IA whose flags are equal to the
+// flags of RA with the Shadow bit set. If there is no such node, then
+// - if Create is false, return a null address,
+// - if Create is true, create it: clone RA, give the clone those flags,
+//   insert it into IA after the last related node visited, and return it.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA, bool Create) {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  const uint16_t ShadowFlags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+  auto HasShadowFlags = [ShadowFlags] (NodeAddr<RefNode*> TA) -> bool {
+    return TA.Addr->getFlags() == ShadowFlags;
+  };
+
+  auto Where = locateNextRef(IA, RA, HasShadowFlags);
+  NodeAddr<RefNode*> Found = Where.second;
+  if (Found.Id != 0 || !Create)
+    return Found;
+
+  // Create a copy of RA and mark it as shadow. ShadowFlags already has
+  // the Shadow bit set.
+  NodeAddr<RefNode*> Shadow = cloneNode(RA);
+  Shadow.Addr->setFlags(ShadowFlags);
+  IA.Addr->addMemberAfter(Where.first, Shadow, *this);
+  return Shadow;
+}
+
+// Return the next node related to RA in IA whose flags are equal to the
+// flags of RA with the Shadow bit set. Return a null address if there is
+// no such node. This overload never creates a node.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  const uint16_t ShadowFlags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+  auto HasShadowFlags = [ShadowFlags] (NodeAddr<RefNode*> TA) -> bool {
+    return TA.Addr->getFlags() == ShadowFlags;
+  };
+  auto Where = locateNextRef(IA, RA, HasShadowFlags);
+  return Where.second;
+}
+
+// Create a new statement node in the block node BA that corresponds to
+// the machine instruction In, and add def and use nodes for its register operands (explicit defs, then implicit defs, then uses).
+void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
+ NodeAddr<StmtNode*> SA = newStmt(BA, &In);
+
+ auto isCall = [] (const MachineInstr &In) -> bool {
+ if (In.isCall())
+ return true;
+ // Is tail call? A branch with a global or a symbol operand is treated as one.
+ if (In.isBranch())
+ for (const MachineOperand &Op : In.operands())
+ if (Op.isGlobal() || Op.isSymbol())
+ return true;
+ return false;
+ };
+
+ auto isDefUndef = [this] (const MachineInstr &In, RegisterRef DR) -> bool {
+ // This instruction defines DR. Check if there is a use operand that
+ // would make DR live on entry to the instruction.
+ for (const MachineOperand &UseOp : In.operands()) {
+ if (!UseOp.isReg() || !UseOp.isUse() || UseOp.isUndef())
+ continue;
+ RegisterRef UR = makeRegRef(UseOp.getReg(), UseOp.getSubReg());
+ if (alias(DR, UR))
+ return false;
+ }
+ return true;
+ };
+
+ // Collect a set of registers that this instruction implicitly uses
+ // or defines. Implicit operands from an instruction will be ignored
+ // unless they are listed here.
+ RegisterSet ImpUses, ImpDefs;
+ if (const uint16_t *ImpD = In.getDesc().getImplicitDefs())
+ while (uint16_t R = *ImpD++) // The list is terminated by 0.
+ ImpDefs.insert(RegisterRef(R));
+ if (const uint16_t *ImpU = In.getDesc().getImplicitUses())
+ while (uint16_t R = *ImpU++) // The list is terminated by 0.
+ ImpUses.insert(RegisterRef(R));
+
+ bool IsCall = isCall(In);
+ bool NeedsImplicit = IsCall || In.isInlineAsm() || In.isReturn(); // For these, all implicit operands are taken.
+ bool IsPredicated = TII.isPredicated(In);
+ unsigned NumOps = In.getNumOperands();
+
+ // Avoid duplicate implicit defs. This will not detect cases of implicit
+ // defs that define registers that overlap, but it is not clear how to
+ // interpret that in the absence of explicit defs. Overlapping explicit
+ // defs are likely illegal already.
+ RegisterSet DoneDefs;
+ // Process explicit defs first.
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+ continue;
+ RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
+ uint16_t Flags = NodeAttrs::None;
+ if (TOI.isPreserving(In, OpN)) {
+ Flags |= NodeAttrs::Preserving;
+ // If the def is preserving, check if it is also undefined.
+ if (isDefUndef(In, RR))
+ Flags |= NodeAttrs::Undef;
+ }
+ if (TOI.isClobbering(In, OpN))
+ Flags |= NodeAttrs::Clobbering;
+ if (TOI.isFixedReg(In, OpN))
+ Flags |= NodeAttrs::Fixed;
+ if (IsCall && Op.isDead())
+ Flags |= NodeAttrs::Dead;
+ NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+ DoneDefs.insert(RR);
+ }
+
+ // Process implicit defs, skipping those that have already been added
+ // as explicit.
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
+ continue;
+ RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
+ if (!NeedsImplicit && !ImpDefs.count(RR)) // Not listed in the instruction descriptor: ignore.
+ continue;
+ if (DoneDefs.count(RR)) // A def of this reference has already been added.
+ continue;
+ uint16_t Flags = NodeAttrs::None;
+ if (TOI.isPreserving(In, OpN)) {
+ Flags |= NodeAttrs::Preserving;
+ // If the def is preserving, check if it is also undefined.
+ if (isDefUndef(In, RR))
+ Flags |= NodeAttrs::Undef;
+ }
+ if (TOI.isClobbering(In, OpN))
+ Flags |= NodeAttrs::Clobbering;
+ if (TOI.isFixedReg(In, OpN))
+ Flags |= NodeAttrs::Fixed;
+ if (IsCall && Op.isDead())
+ Flags |= NodeAttrs::Dead;
+ NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+ DoneDefs.insert(RR);
+ }
+
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) { // Finally, process the uses (explicit and implicit).
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
+ // Add implicit uses on return and call instructions, and on predicated
+ // instructions regardless of whether or not they appear in the instruction
+ // descriptor's list.
+ bool Implicit = Op.isImplicit();
+ bool TakeImplicit = NeedsImplicit || IsPredicated;
+ if (Implicit && !TakeImplicit && !ImpUses.count(RR))
+ continue;
+ uint16_t Flags = NodeAttrs::None;
+ if (Op.isUndef())
+ Flags |= NodeAttrs::Undef;
+ if (TOI.isFixedReg(In, OpN))
+ Flags |= NodeAttrs::Fixed;
+ NodeAddr<UseNode*> UA = newUse(SA, Op, Flags);
+ SA.Addr->addMember(UA, *this);
+ }
+}
+
+// Fill RefM so that, for the block node BA and for every block dominated
+// by it, the map holds the set of all register references that appear in
+// that block or in any block it dominates. The function recurses over the
+// children of BA's block in the dominator tree.
+void DataFlowGraph::buildBlockRefs(NodeAddr<BlockNode*> BA,
+      BlockRefsMap &RefM) {
+  RegisterSet &Collected = RefM[BA.Id];
+  MachineDomTreeNode *DomNode = MDT.getNode(BA.Addr->getCode());
+  assert(DomNode);
+
+  // First, gather the references from the dominated blocks.
+  for (auto *Child : *DomNode) {
+    NodeAddr<BlockNode*> ChildBA = findBlock(Child->getBlock());
+    buildBlockRefs(ChildBA, RefM);
+    const RegisterSet &ChildRefs = RefM[ChildBA.Id];
+    Collected.insert(ChildRefs.begin(), ChildRefs.end());
+  }
+
+  // Then add the references from the block itself.
+  for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) {
+    for (NodeAddr<RefNode*> RA : IA.Addr->members(*this)) {
+      RegisterRef RR = RA.Addr->getRegRef(*this);
+      Collected.insert(RR);
+    }
+  }
+}
+
+// Collect the register references defined in the block node BA and record
+// them in PhiM for every block in the iterated dominance frontier of BA's
+// block. This information is later used to create phi nodes. In addition,
+// the entry for BA in RefM is extended with the references recorded in
+// RefM for the blocks in that iterated dominance frontier.
+// Nothing is done if the block has no (or an empty) dominance frontier.
+void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+      NodeAddr<BlockNode*> BA) {
+  MachineBasicBlock *Block = BA.Addr->getCode();
+  assert(Block);
+  auto Frontier = MDF.find(Block);
+  if (Frontier == MDF.end())
+    return;
+  if (Frontier->second.empty())
+    return;
+
+  // Gather all references defined in the block into a set, so that each
+  // defined reference is recorded once, no matter how many times it is
+  // defined.
+  RegisterSet Defined;
+  for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) {
+    for (NodeAddr<RefNode*> RA : IA.Addr->members_if(IsDef, *this)) {
+      RegisterRef RR = RA.Addr->getRegRef(*this);
+      Defined.insert(RR);
+    }
+  }
+
+  // Compute the iterated dominance frontier: start with the dominance
+  // frontier of the block, and keep adding the frontiers of the members.
+  // IDF grows while it is being traversed, hence the index-based loop.
+  const MachineDominanceFrontier::DomSetType &DF = Frontier->second;
+  SetVector<MachineBasicBlock*> IDF(DF.begin(), DF.end());
+  unsigned Next = 0;
+  while (Next < IDF.size()) {
+    auto F = MDF.find(IDF[Next]);
+    if (F != MDF.end())
+      IDF.insert(F->second.begin(), F->second.end());
+    ++Next;
+  }
+
+  // Add the references of the blocks in the IDF to those of this block.
+  RegisterSet &Reachable = RefM[BA.Id];
+  for (MachineBasicBlock *DB : IDF) {
+    NodeAddr<BlockNode*> DBA = findBlock(DB);
+    const RegisterSet &FrontierRefs = RefM[DBA.Id];
+    Reachable.insert(FrontierRefs.begin(), FrontierRefs.end());
+  }
+
+  // Record the defs for each block in the IDF.
+  for (MachineBasicBlock *DB : IDF) {
+    NodeAddr<BlockNode*> DBA = findBlock(DB);
+    PhiM[DBA.Id].insert(Defined.begin(), Defined.end());
+  }
+}
+
+// Given the locations of phi nodes in the map PhiM, create the phi nodes
+// that are located in the block node BA. RefM supplies the references of BA that are used to widen the phi references.
+void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+ NodeAddr<BlockNode*> BA) {
+ // Check if this block has any DF defs, i.e. if there are any defs
+ // that this block is in the iterated dominance frontier of.
+ auto HasDF = PhiM.find(BA.Id);
+ if (HasDF == PhiM.end() || HasDF->second.empty())
+ return;
+
+ // First, remove all R in Refs such that there exists T in Refs
+ // such that T covers R. In other words, only leave those refs that
+ // are not covered by another ref (i.e. maximal with respect to covering).
+
+ auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef {
+ for (RegisterRef I : RRs)
+ if (I != RR && RegisterAggr::isCoverOf(I, RR, TRI))
+ RR = I; // Keep widening: later elements are compared against the new RR.
+ return RR;
+ };
+
+ RegisterSet MaxDF;
+ for (RegisterRef I : HasDF->second)
+ MaxDF.insert(MaxCoverIn(I, HasDF->second));
+
+ std::vector<RegisterRef> MaxRefs;
+ RegisterSet &RefB = RefM[BA.Id];
+ for (RegisterRef I : MaxDF)
+ MaxRefs.push_back(MaxCoverIn(I, RefB));
+
+ // Now, for each R in MaxRefs, get the alias closure of R. If the closure
+ // only has R in it, create a phi with a def for R. Otherwise, create a phi,
+ // and add a def for each S in the closure.
+
+ // Sort the refs so that the phis will be created in a deterministic order.
+ std::sort(MaxRefs.begin(), MaxRefs.end());
+ // Remove duplicates.
+ auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end());
+ MaxRefs.erase(NewEnd, MaxRefs.end());
+
+ auto Aliased = [this,&MaxRefs](RegisterRef RR,
+ std::vector<unsigned> &Closure) -> bool {
+ for (unsigned I : Closure)
+ if (alias(RR, MaxRefs[I]))
+ return true;
+ return false;
+ };
+
+ // Prepare a list of NodeIds of the block's predecessors.
+ NodeList Preds;
+ const MachineBasicBlock *MBB = BA.Addr->getCode();
+ for (MachineBasicBlock *PB : MBB->predecessors())
+ Preds.push_back(findBlock(PB));
+
+ while (!MaxRefs.empty()) { // Each iteration removes at least MaxRefs[0].
+ // Put the first element in the closure, and then add all subsequent
+ // elements from MaxRefs to it, if they alias at least one element
+ // already in the closure.
+ // ClosureIdx: vector of indices in MaxRefs of members of the closure.
+ std::vector<unsigned> ClosureIdx = { 0 };
+ for (unsigned i = 1; i != MaxRefs.size(); ++i)
+ if (Aliased(MaxRefs[i], ClosureIdx))
+ ClosureIdx.push_back(i);
+
+ // Build a phi for the closure.
+ unsigned CS = ClosureIdx.size();
+ NodeAddr<PhiNode*> PA = newPhi(BA);
+
+ // Add defs.
+ for (unsigned X = 0; X != CS; ++X) {
+ RegisterRef RR = MaxRefs[ClosureIdx[X]];
+ uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+ NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+ PA.Addr->addMember(DA, *this);
+ }
+ // Add phi uses: one use per predecessor for each reference in the closure.
+ for (NodeAddr<BlockNode*> PBA : Preds) {
+ for (unsigned X = 0; X != CS; ++X) {
+ RegisterRef RR = MaxRefs[ClosureIdx[X]];
+ NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
+ PA.Addr->addMember(PUA, *this);
+ }
+ }
+
+ // Erase from MaxRefs all elements in the closure. ClosureIdx is in increasing order, so erasing back to front keeps the remaining indices valid.
+ auto Begin = MaxRefs.begin();
+ for (unsigned i = ClosureIdx.size(); i != 0; --i)
+ MaxRefs.erase(Begin + ClosureIdx[i-1]);
+ }
+}
+
+// Remove any unneeded phi nodes that were created during the build process. A phi is removed when none of its defs reaches another def or a use.
+void DataFlowGraph::removeUnusedPhis() {
+ // This will remove unused phis, i.e. phis where each def does not reach
+ // any uses or other defs. This will not detect or remove circular phi
+ // chains that are otherwise dead. Unused/dead phis are created during
+ // the build process and this function is intended to remove these cases
+ // that are easily determinable to be unnecessary.
+
+ SetVector<NodeId> PhiQ; // Work queue with the ids of the phi nodes to examine.
+ for (NodeAddr<BlockNode*> BA : Func.Addr->members(*this)) {
+ for (auto P : BA.Addr->members_if(IsPhi, *this))
+ PhiQ.insert(P.Id);
+ }
+
+ static auto HasUsedDef = [](NodeList &Ms) -> bool { // True if any def in Ms reaches a def or a use.
+ for (NodeAddr<NodeBase*> M : Ms) {
+ if (M.Addr->getKind() != NodeAttrs::Def)
+ continue;
+ NodeAddr<DefNode*> DA = M;
+ if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0)
+ return true;
+ }
+ return false;
+ };
+
+ // Any phi, if it is removed, may affect other phis (make them dead).
+ // For each removed phi, collect the potentially affected phis and add
+ // them back to the queue.
+ while (!PhiQ.empty()) {
+ auto PA = addr<PhiNode*>(PhiQ[0]);
+ PhiQ.remove(PA.Id);
+ NodeList Refs = PA.Addr->members(*this);
+ if (HasUsedDef(Refs)) // The phi is in use: keep it.
+ continue;
+ for (NodeAddr<RefNode*> RA : Refs) {
+ if (NodeId RD = RA.Addr->getReachingDef()) {
+ auto RDA = addr<DefNode*>(RD);
+ NodeAddr<InstrNode*> OA = RDA.Addr->getOwner(*this);
+ if (IsPhi(OA)) // The phi owning the reaching def may become unused: re-examine it.
+ PhiQ.insert(OA.Id);
+ }
+ if (RA.Addr->isDef())
+ unlinkDef(RA, true);
+ else
+ unlinkUse(RA, true);
+ }
+ NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this);
+ BA.Addr->removeMember(PA, *this);
+ }
+}
+
+// For a given reference node TA in an instruction node IA, connect the
+// reaching def of TA to the appropriate def node found on the stack DS. Create any shadow nodes
+// as appropriate: one extra shadow node for each additional reaching def that is linked.
+template <typename T>
+void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA,
+ DefStack &DS) {
+ if (DS.empty())
+ return;
+ RegisterRef RR = TA.Addr->getRegRef(*this);
+ NodeAddr<T> TAP; // The node that receives the next link; null until the first link is made.
+
+ // References from the def stack that have been examined so far.
+ RegisterAggr Defs(TRI);
+
+ for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) {
+ RegisterRef QR = I->Addr->getRegRef(*this);
+
+ // Skip all defs that are aliased to any of the defs that we have already
+ // seen. If this completes a cover of RR, stop the stack traversal.
+ bool Alias = Defs.hasAliasOf(QR); // Must be computed before QR is inserted into Defs.
+ bool Cover = Defs.insert(QR).hasCoverOf(RR);
+ if (Alias) {
+ if (Cover)
+ break;
+ continue;
+ }
+
+ // The reaching def.
+ NodeAddr<DefNode*> RDA = *I;
+
+ // Pick the reached node.
+ if (TAP.Id == 0) {
+ TAP = TA; // The first reaching def is linked to TA itself.
+ } else {
+ // Mark the existing ref as "shadow" and create a new shadow.
+ TAP.Addr->setFlags(TAP.Addr->getFlags() | NodeAttrs::Shadow);
+ TAP = getNextShadow(IA, TAP, true);
+ }
+
+ // Create the link.
+ TAP.Addr->linkToDef(TAP.Id, RDA);
+
+ if (Cover)
+ break;
+ }
+}
+
+// Connect every def and use in the statement node SA with its reaching
+// def(s), using the def stacks currently recorded in DefM.
+void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA) {
+#ifndef NDEBUG
+  RegisterSet SeenRefs;
+#endif
+
+  // Walk the members of the statement and link each one upwards in the
+  // data-flow.
+  for (NodeAddr<RefNode*> Member : SA.Addr->members(*this)) {
+    const uint16_t RefKind = Member.Addr->getKind();
+    assert(RefKind == NodeAttrs::Def || RefKind == NodeAttrs::Use);
+    const RegisterRef MemberRR = Member.Addr->getRegRef(*this);
+#ifndef NDEBUG
+    // A statement is not expected to define the same reference twice.
+    assert(RefKind != NodeAttrs::Def || !SeenRefs.count(MemberRR));
+    SeenRefs.insert(MemberRR);
+#endif
+
+    // A register without a def stack has nothing to be linked to.
+    auto StackIt = DefM.find(MemberRR.Reg);
+    if (StackIt == DefM.end())
+      continue;
+
+    switch (RefKind) {
+    case NodeAttrs::Use:
+      linkRefUp<UseNode*>(SA, Member, StackIt->second);
+      break;
+    case NodeAttrs::Def:
+      linkRefUp<DefNode*>(SA, Member, StackIt->second);
+      break;
+    default:
+      llvm_unreachable("Unexpected node in instruction");
+    }
+  }
+}
+
+// Create data-flow links for all instructions in the block node BA. This
+// will include updating any phi nodes in BA.
+//
+// The work is done in this order:
+//  1. a block delimiter for BA is pushed on the def stacks (markBlock),
+//  2. statements of BA are linked and the defs of all members are pushed,
+//  3. the function recurses into the children of BA in the dominator tree,
+//  4. phi uses in the CFG successors of BA that name BA as their
+//     predecessor are linked,
+//  5. the defs pushed for BA are released again (releaseBlock).
+void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
+ // Push block delimiters.
+ markBlock(BA.Id, DefM);
+
+ assert(BA.Addr && "block node address is needed to create a data-flow link");
+ // For each non-phi instruction in the block, link all the defs and uses
+ // to their reaching defs. For any member of the block (including phis),
+ // push the defs on the corresponding stacks.
+ for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) {
+ // Ignore phi nodes here. They will be linked part by part from the
+ // predecessors.
+ if (IA.Addr->getKind() == NodeAttrs::Stmt)
+ linkStmtRefs(DefM, IA);
+
+ // Push the definitions on the stack.
+ pushDefs(IA, DefM);
+ }
+
+ // Recursively process all children in the dominator tree.
+ MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode());
+ for (auto I : *N) {
+ MachineBasicBlock *SB = I->getBlock();
+ NodeAddr<BlockNode*> SBA = findBlock(SB);
+ linkBlockRefs(DefM, SBA);
+ }
+
+ // Link the phi uses from the successor blocks.
+ // The predicate selects the phi uses whose predecessor block is BA.
+ auto IsUseForBA = [BA](NodeAddr<NodeBase*> NA) -> bool {
+ if (NA.Addr->getKind() != NodeAttrs::Use)
+ return false;
+ assert(NA.Addr->getFlags() & NodeAttrs::PhiRef);
+ NodeAddr<PhiUseNode*> PUA = NA;
+ return PUA.Addr->getPredecessor() == BA.Id;
+ };
+
+ RegisterSet EHLiveIns = getLandingPadLiveIns();
+ MachineBasicBlock *MBB = BA.Addr->getCode();
+
+ for (MachineBasicBlock *SB : MBB->successors()) {
+ bool IsEHPad = SB->isEHPad();
+ NodeAddr<BlockNode*> SBA = findBlock(SB);
+ for (NodeAddr<InstrNode*> IA : SBA.Addr->members_if(IsPhi, *this)) {
+ // Do not link phi uses for landing pad live-ins.
+ if (IsEHPad) {
+ // Find what register this phi is for.
+ NodeAddr<RefNode*> RA = IA.Addr->getFirstMember(*this);
+ assert(RA.Id != 0);
+ if (EHLiveIns.count(RA.Addr->getRegRef(*this)))
+ continue;
+ }
+ // Go over each phi use associated with MBB, and link it.
+ for (auto U : IA.Addr->members_if(IsUseForBA, *this)) {
+ NodeAddr<PhiUseNode*> PUA = U;
+ RegisterRef RR = PUA.Addr->getRegRef(*this);
+ // Note: DefM[...] creates an empty def stack when RR.Reg has none yet;
+ // linkRefUp returns without linking anything for an empty stack.
+ linkRefUp<UseNode*>(IA, PUA, DefM[RR.Reg]);
+ }
+ }
+ }
+
+ // Pop all defs from this block from the definition stacks.
+ releaseBlock(BA.Id, DefM);
+}
+
+// Detach the use node UA from the chain of uses reached by its reaching
+// def. A use without a reaching def is not on any chain and is left as is.
+void DataFlowGraph::unlinkUseDF(NodeAddr<UseNode*> UA) {
+  const NodeId ReachingId = UA.Addr->getReachingDef();
+  const NodeId NextSib = UA.Addr->getSibling();
+
+  // No reaching def: UA must not have a sibling either.
+  if (ReachingId == 0) {
+    assert(NextSib == 0);
+    return;
+  }
+
+  NodeAddr<DefNode*> Reaching = addr<DefNode*>(ReachingId);
+  NodeAddr<UseNode*> Cur = addr<UseNode*>(Reaching.Addr->getReachedUse());
+
+  // UA heads the chain: make the def point at UA's successor.
+  if (Cur.Id == UA.Id) {
+    Reaching.Addr->setReachedUse(NextSib);
+    return;
+  }
+
+  // Otherwise find the node directly preceding UA and bypass UA.
+  for (; Cur.Id != 0; Cur = addr<UseNode*>(Cur.Addr->getSibling())) {
+    if (Cur.Addr->getSibling() != UA.Id)
+      continue;
+    Cur.Addr->setSibling(NextSib);
+    return;
+  }
+}
+
+// Remove the def node DA from any data-flow and structural links.
+//
+// All defs and uses reached by DA get DA's own reaching def (RD) as their
+// new reaching def. If RD is 0, their sibling links are cleared and nothing
+// else is done. Otherwise DA is removed from the chain of defs reached by
+// RD, and the nodes formerly reached by DA are put at the front of the
+// corresponding chains of RD.
+void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) {
+ //
+ // RD
+ // | reached
+ // | def
+ // :
+ // .
+ // +----+
+ // ... -- | DA | -- ... -- 0 : sibling chain of DA
+ // +----+
+ // | | reached
+ // | : def
+ // | .
+ // | ... : Siblings (defs)
+ // |
+ // : reached
+ // . use
+ // ... : sibling chain of reached uses
+
+ NodeId RD = DA.Addr->getReachingDef();
+
+ // Visit all siblings of the reached def and reset their reaching defs.
+ // Also, defs reached by DA are now "promoted" to being reached by RD,
+ // so all of them will need to be spliced into the sibling chain where
+ // DA belongs.
+ // Collect the sibling chain starting at node N into a list.
+ auto getAllNodes = [this] (NodeId N) -> NodeList {
+ NodeList Res;
+ while (N) {
+ auto RA = addr<RefNode*>(N);
+ // Keep the nodes in the exact sibling order.
+ Res.push_back(RA);
+ N = RA.Addr->getSibling();
+ }
+ return Res;
+ };
+ NodeList ReachedDefs = getAllNodes(DA.Addr->getReachedDef());
+ NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse());
+
+ // Without a reaching def there is no chain for the reached nodes to be
+ // on, so their sibling links are cleared.
+ if (RD == 0) {
+ for (NodeAddr<RefNode*> I : ReachedDefs)
+ I.Addr->setSibling(0);
+ for (NodeAddr<RefNode*> I : ReachedUses)
+ I.Addr->setSibling(0);
+ }
+ for (NodeAddr<DefNode*> I : ReachedDefs)
+ I.Addr->setReachingDef(RD);
+ for (NodeAddr<UseNode*> I : ReachedUses)
+ I.Addr->setReachingDef(RD);
+
+ NodeId Sib = DA.Addr->getSibling();
+ if (RD == 0) {
+ assert(Sib == 0);
+ return;
+ }
+
+ // Update the reaching def node and remove DA from the sibling list.
+ auto RDA = addr<DefNode*>(RD);
+ auto TA = addr<DefNode*>(RDA.Addr->getReachedDef());
+ if (TA.Id == DA.Id) {
+ // If DA is the first reached def, just update the RD's reached def
+ // to the DA's sibling.
+ RDA.Addr->setReachedDef(Sib);
+ } else {
+ // Otherwise, traverse the sibling list of the reached defs and remove
+ // DA from it.
+ while (TA.Id != 0) {
+ NodeId S = TA.Addr->getSibling();
+ if (S == DA.Id) {
+ TA.Addr->setSibling(Sib);
+ break;
+ }
+ TA = addr<DefNode*>(S);
+ }
+ }
+
+ // Splice the DA's reached defs into the RDA's reached def chain.
+ // The spliced nodes go in front of the nodes already on the chain.
+ if (!ReachedDefs.empty()) {
+ auto Last = NodeAddr<DefNode*>(ReachedDefs.back());
+ Last.Addr->setSibling(RDA.Addr->getReachedDef());
+ RDA.Addr->setReachedDef(ReachedDefs.front().Id);
+ }
+ // Splice the DA's reached uses into the RDA's reached use chain.
+ if (!ReachedUses.empty()) {
+ auto Last = NodeAddr<UseNode*>(ReachedUses.back());
+ Last.Addr->setSibling(RDA.Addr->getReachedUse());
+ RDA.Addr->setReachedUse(ReachedUses.front().Id);
+ }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.h b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h
new file mode 100644
index 000000000000..871062ff2b05
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h
@@ -0,0 +1,997 @@
+//===--- RDFGraph.h -------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Target-independent, SSA-based data flow graph for register data flow (RDF)
+// for a non-SSA program representation (e.g. post-RA machine code).
+//
+//
+// *** Introduction
+//
+// The RDF graph is a collection of nodes, each of which denotes some element
+// of the program. There are two main types of such elements: code and refe-
+// rences. Conceptually, "code" is something that represents the structure
+// of the program, e.g. basic block or a statement, while "reference" is an
+// instance of accessing a register, e.g. a definition or a use. Nodes are
+// connected with each other based on the structure of the program (such as
+// blocks, instructions, etc.), and based on the data flow (e.g. reaching
+// definitions, reached uses, etc.). The single-reaching-definition principle
+// of SSA is generally observed, although, due to the non-SSA representation
+// of the program, there are some differences between the graph and a "pure"
+// SSA representation.
+//
+//
+// *** Implementation remarks
+//
+// Since the graph can contain a large number of nodes, memory consumption
+// was one of the major design considerations. As a result, there is a single
+// base class NodeBase which defines all members used by all possible derived
+// classes. The members are arranged in a union, and a derived class cannot
+// add any data members of its own. Each derived class only defines the
+// functional interface, i.e. member functions. NodeBase must be a POD,
+// which implies that all of its members must also be PODs.
+// Since nodes need to be connected with other nodes, pointers have been
+// replaced with 32-bit identifiers: each node has an id of type NodeId.
+// There are mapping functions in the graph that translate between actual
+// memory addresses and the corresponding identifiers.
+// A node id of 0 is equivalent to nullptr.
+//
+//
+// *** Structure of the graph
+//
+// A code node is always a collection of other nodes. For example, a code
+// node corresponding to a basic block will contain code nodes corresponding
+// to instructions. In turn, a code node corresponding to an instruction will
+// contain a list of reference nodes that correspond to the definitions and
+// uses of registers in that instruction. The members are arranged into a
+// circular list, which is yet another consequence of the effort to save
+// memory: for each member node it should be possible to obtain its owner,
+// and it should be possible to access all other members. There are other
+// ways to accomplish that, but the circular list seemed the most natural.
+//
+// +- CodeNode -+
+// | | <---------------------------------------------------+
+// +-+--------+-+ |
+// |FirstM |LastM |
+// | +-------------------------------------+ |
+// | | |
+// V V |
+// +----------+ Next +----------+ Next Next +----------+ Next |
+// | |----->| |-----> ... ----->| |----->-+
+// +- Member -+ +- Member -+ +- Member -+
+//
+// The order of members is such that related reference nodes (see below)
+// should be contiguous on the member list.
+//
+// A reference node is a node that encapsulates an access to a register,
+// in other words, data flowing into or out of a register. There are two
+// major kinds of reference nodes: defs and uses. A def node will contain
+// the id of the first reached use, and the id of the first reached def.
+// Each def and use will contain the id of the reaching def, and also the
+// id of the next reached def (for def nodes) or use (for use nodes).
+// The "next node sharing the same reaching def" is denoted as "sibling".
+// In summary:
+// - Def node contains: reaching def, sibling, first reached def, and first
+// reached use.
+// - Use node contains: reaching def and sibling.
+//
+// +-- DefNode --+
+// | R2 = ... | <---+--------------------+
+// ++---------+--+ | |
+// |Reached |Reached | |
+// |Def |Use | |
+// | | |Reaching |Reaching
+// | V |Def |Def
+// | +-- UseNode --+ Sib +-- UseNode --+ Sib Sib
+// | | ... = R2 |----->| ... = R2 |----> ... ----> 0
+// | +-------------+ +-------------+
+// V
+// +-- DefNode --+ Sib
+// | R2 = ... |----> ...
+// ++---------+--+
+// | |
+// | |
+// ... ...
+//
+// To get a full picture, the circular lists connecting blocks within a
+// function, instructions within a block, etc. should be superimposed with
+// the def-def, def-use links shown above.
+// To illustrate this, consider a small example in a pseudo-assembly:
+// foo:
+// add r2, r0, r1 ; r2 = r0+r1
+// addi r0, r2, 1 ; r0 = r2+1
+// ret r0 ; return value in r0
+//
+// The graph (in a format used by the debugging functions) would look like:
+//
+// DFG dump:[
+// f1: Function foo
+// b2: === BB#0 === preds(0), succs(0):
+// p3: phi [d4<r0>(,d12,u9):]
+// p5: phi [d6<r1>(,,u10):]
+// s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):]
+// s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):]
+// s14: ret [u15<r0>(d12):]
+// ]
+//
+// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the
+// kind of the node (i.e. f - function, b - basic block, p - phi, s - state-
+// ment, d - def, u - use).
+// The format of a def node is:
+// dN<R>(rd,d,u):sib,
+// where
+// N - numeric node id,
+// R - register being defined
+// rd - reaching def,
+// d - reached def,
+// u - reached use,
+// sib - sibling.
+// The format of a use node is:
+// uN<R>[!](rd):sib,
+// where
+// N - numeric node id,
+// R - register being used,
+// rd - reaching def,
+// sib - sibling.
+// Possible annotations (usually preceding the node id):
+// + - preserving def,
+// ~ - clobbering def,
+// " - shadow ref (follows the node id),
+// ! - fixed register (appears after register name).
+//
+// The circular lists are not explicit in the dump.
+//
+//
+// *** Node attributes
+//
+// NodeBase has a member "Attrs", which is the primary way of determining
+// the node's characteristics. The fields in this member decide whether
+// the node is a code node or a reference node (i.e. node's "type"), then
+// within each type, the "kind" determines what specifically this node
+// represents. The remaining bits, "flags", contain additional information
+// that is even more detailed than the "kind".
+// CodeNode's kinds are:
+// - Phi: Phi node, members are reference nodes.
+// - Stmt: Statement, members are reference nodes.
+// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt).
+// - Func: The whole function. The members are basic block nodes.
+// RefNode's kinds are:
+// - Use.
+// - Def.
+//
+// Meaning of flags:
+// - Preserving: applies only to defs. A preserving def is one that can
+// preserve some of the original bits among those that are included in
+// the register associated with that def. For example, if R0 is a 32-bit
+// register, but a def can only change the lower 16 bits, then it will
+// be marked as preserving.
+// - Shadow: a reference that has duplicates holding additional reaching
+// defs (see more below).
+// - Clobbering: applied only to defs, indicates that the value generated
+// by this def is unspecified. A typical example would be volatile registers
+// after function calls.
+// - Fixed: the register in this def/use cannot be replaced with any other
+// register. A typical case would be a parameter register to a call, or
+// the register with the return value from a function.
+// - Undef: the register in this reference is assumed to have
+// no pre-existing value, even if it appears to be reached by some def.
+// This is typically used to prevent keeping registers artificially live
+// in cases when they are defined via predicated instructions. For example:
+// r0 = add-if-true cond, r10, r11 (1)
+// r0 = add-if-false cond, r12, r13, r0<imp-use> (2)
+// ... = r0 (3)
+// Before (1), r0 is not intended to be live, and the use of r0 in (3) is
+// not meant to be reached by any def preceding (1). However, since the
+// defs in (1) and (2) are both preserving, these properties alone would
+// imply that the use in (3) may indeed be reached by some prior def.
+// Adding Undef flag to the def in (1) prevents that. The Undef flag
+// may be applied to both defs and uses.
+// - Dead: applies only to defs. The value coming out of a "dead" def is
+// assumed to be unused, even if the def appears to be reaching other defs
+// or uses. The motivation for this flag comes from dead defs on function
+// calls: there is no way to determine if such a def is dead without
+// analyzing the target's ABI. Hence the graph should contain this info,
+// as it is unavailable otherwise. On the other hand, a def without any
+// uses on a typical instruction is not the intended target for this flag.
+//
+// *** Shadow references
+//
+// It may happen that a super-register can have two (or more) non-overlapping
+// sub-registers. When both of these sub-registers are defined and followed
+// by a use of the super-register, the use of the super-register will not
+// have a unique reaching def: both defs of the sub-registers need to be
+// accounted for. In such cases, a duplicate use of the super-register is
+// added and it points to the extra reaching def. Both uses are marked with
+// a flag "shadow". Example:
+// Assume t0 is a super-register of r0 and r1, r0 and r1 do not overlap:
+// set r0, 1 ; r0 = 1
+// set r1, 1 ; r1 = 1
+// addi t1, t0, 1 ; t1 = t0+1
+//
+// The DFG:
+// s1: set [d2<r0>(,,u7"):]
+// s3: set [d4<r1>(,,u8"):]
+// s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):]
+//
+// The statement s5 has two use nodes for t0: u7" and u8". The quotation
+// mark " indicates that the node is a shadow.
+//
+#ifndef RDF_GRAPH_H
+#define RDF_GRAPH_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <functional>
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+// RDF uses uint32_t to refer to registers. This is to ensure that the type
+// size remains specific. In other places, registers are often stored using
+// unsigned.
+static_assert(sizeof(uint32_t) == sizeof(unsigned), "Those should be equal");
+
+namespace llvm {
+ class MachineBasicBlock;
+ class MachineFunction;
+ class MachineInstr;
+ class MachineOperand;
+ class MachineDominanceFrontier;
+ class MachineDominatorTree;
+ class TargetInstrInfo;
+
+namespace rdf {
+ typedef uint32_t NodeId;
+ typedef uint32_t RegisterId;
+
+ struct DataFlowGraph;
+
+ // Layout of the 16-bit attribute word of a node, and helpers to read and
+ // update its fields. The word holds the type in bits 0-1, the kind in
+ // bits 2-4, and the flags in bits 5-11.
+ struct NodeAttrs {
+ enum : uint16_t {
+ None = 0x0000, // Nothing
+
+ // Types: 2 bits
+ TypeMask = 0x0003,
+ Code = 0x0001, // 01, Container
+ Ref = 0x0002, // 10, Reference
+
+ // Kind: 3 bits
+ KindMask = 0x0007 << 2,
+ Def = 0x0001 << 2, // 001
+ Use = 0x0002 << 2, // 010
+ Phi = 0x0003 << 2, // 011
+ Stmt = 0x0004 << 2, // 100
+ Block = 0x0005 << 2, // 101
+ Func = 0x0006 << 2, // 110
+
+ // Flags: 7 bits for now
+ FlagMask = 0x007F << 5,
+ Shadow = 0x0001 << 5, // 0000001, Has extra reaching defs.
+ Clobbering = 0x0002 << 5, // 0000010, Produces unspecified values.
+ PhiRef = 0x0004 << 5, // 0000100, Member of PhiNode.
+ Preserving = 0x0008 << 5, // 0001000, Def can keep original bits.
+ Fixed = 0x0010 << 5, // 0010000, Fixed register.
+ Undef = 0x0020 << 5, // 0100000, Has no pre-existing value.
+ Dead = 0x0040 << 5, // 1000000, Does not define a value.
+ };
+
+ // Extract a single field from the attribute word T. The result is still
+ // in its shifted position, i.e. directly comparable with the enumerators.
+ static uint16_t type(uint16_t T) { return T & TypeMask; }
+ static uint16_t kind(uint16_t T) { return T & KindMask; }
+ static uint16_t flags(uint16_t T) { return T & FlagMask; }
+
+ // Return A with one field replaced. The new value is OR-ed in as given,
+ // without being masked, so it must not have bits outside of its field.
+ static uint16_t set_type(uint16_t A, uint16_t T) {
+ return (A & ~TypeMask) | T;
+ }
+ static uint16_t set_kind(uint16_t A, uint16_t K) {
+ return (A & ~KindMask) | K;
+ }
+ static uint16_t set_flags(uint16_t A, uint16_t F) {
+ return (A & ~FlagMask) | F;
+ }
+
+ // Test if A contains B.
+ // A and B are attribute words. Only code nodes can contain anything:
+ // a function contains blocks, a block contains phis and statements,
+ // and a phi or a statement contains reference nodes.
+ static bool contains(uint16_t A, uint16_t B) {
+ if (type(A) != Code)
+ return false;
+ uint16_t KB = kind(B);
+ switch (kind(A)) {
+ case Func:
+ return KB == Block;
+ case Block:
+ return KB == Phi || KB == Stmt;
+ case Phi:
+ case Stmt:
+ return type(B) == Ref;
+ }
+ return false;
+ }
+ };
+
+ // Bit flags that can be combined into an option mask for building the
+ // graph.
+ struct BuildOptions {
+ enum : unsigned {
+ None = 0x00,
+ KeepDeadPhis = 0x01, // Do not remove dead phis during build.
+ };
+ };
+
+ // Handle to a node: the address of the node together with its id.
+ template <typename T> struct NodeAddr {
+   NodeAddr() : Addr(nullptr), Id(0) {}
+   NodeAddr(T A, NodeId I) : Addr(A), Id(I) {}
+
+   // Converting constructor: casts the address from S to T and keeps the
+   // id. This is the reason for having this class instead of std::pair.
+   template <typename S> NodeAddr(const NodeAddr<S> &NA)
+     : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {}
+
+   // Two handles compare equal when their addresses are equal. The ids are
+   // expected to agree with the addresses.
+   bool operator== (const NodeAddr<T> &NA) const {
+     const bool SameAddr = Addr == NA.Addr;
+     assert(SameAddr == (Id == NA.Id));
+     return SameAddr;
+   }
+   bool operator!= (const NodeAddr<T> &NA) const {
+     return !(*this == NA);
+   }
+
+   T Addr;
+   NodeId Id;
+ };
+
+ struct NodeBase;
+
+ // Fast memory allocation and translation between node id and node address.
+ // This is really the same idea as the one underlying the "bump pointer
+ // allocator", the difference being in the translation. A node id is
+ // composed of two components: the index of the block in which it was
+ // allocated, and the index within the block. With the default settings,
+ // where the number of nodes per block is 4096, the node id (minus 1) is:
+ //
+ // bit position: 11 0
+ // +----------------------------+--------------+
+ // | Index of the block |Index in block|
+ // +----------------------------+--------------+
+ //
+ // The actual node id is the above plus 1, to avoid creating a node id of 0.
+ //
+ // This method significantly improved the build time, compared to using maps
+ // (std::unordered_map or DenseMap) to translate between pointers and ids.
+ // Allocates nodes in blocks of NodesPerBlock nodes and translates between
+ // node ids and node addresses.
+ struct NodeAllocator {
+ // Amount of storage for a single node.
+ enum { NodeMemSize = 32 };
+ // NPB is the number of nodes per block. It must be a power of 2, since
+ // the index of a node within its block occupies the low Log2(NPB) bits
+ // of the id (minus 1).
+ NodeAllocator(uint32_t NPB = 4096)
+ : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)),
+ IndexMask((1 << BitsPerIndex)-1), ActiveEnd(nullptr) {
+ assert(isPowerOf2_32(NPB));
+ }
+ // Translate the node id N into the address of the node.
+ // N is not checked here: 0 (the "null" id) wraps around in the
+ // subtraction below and must not be passed in.
+ NodeBase *ptr(NodeId N) const {
+ uint32_t N1 = N-1;
+ uint32_t BlockN = N1 >> BitsPerIndex;
+ uint32_t Offset = (N1 & IndexMask) * NodeMemSize;
+ return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset);
+ }
+ NodeId id(const NodeBase *P) const;
+ NodeAddr<NodeBase*> New();
+ void clear();
+
+ private:
+ void startNewBlock();
+ bool needNewBlock();
+ // Compose a node id from the index of a block and the index of the node
+ // within that block.
+ uint32_t makeId(uint32_t Block, uint32_t Index) const {
+ // Add 1 to the id, to avoid the id of 0, which is treated as "null".
+ return ((Block << BitsPerIndex) | Index) + 1;
+ }
+
+ const uint32_t NodesPerBlock;
+ const uint32_t BitsPerIndex;
+ const uint32_t IndexMask;
+ char *ActiveEnd;
+ // Start addresses of the blocks; the position in the vector is the
+ // block index used in node ids.
+ std::vector<char*> Blocks;
+ typedef BumpPtrAllocatorImpl<MallocAllocator, 65536> AllocatorTy;
+ AllocatorTy MemPool;
+ };
+
+ // A register together with a mask selecting some of its lanes.
+ struct RegisterRef {
+   RegisterId Reg;
+   LaneBitmask Mask;
+
+   // The default reference names register 0 and selects no lanes.
+   RegisterRef() : RegisterRef(0) {}
+   // Register 0 always gets an empty lane mask, whatever M is.
+   explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
+     : Reg(R), Mask(R == 0 ? LaneBitmask::getNone() : M) {}
+
+   // A reference converts to true if it names a register and selects at
+   // least one lane.
+   operator bool() const {
+     if (Reg == 0)
+       return false;
+     return Mask.any();
+   }
+   bool operator== (const RegisterRef &RR) const {
+     if (Reg != RR.Reg)
+       return false;
+     return Mask == RR.Mask;
+   }
+   bool operator!= (const RegisterRef &RR) const {
+     return !(*this == RR);
+   }
+   // Order by register first, then by lane mask.
+   bool operator< (const RegisterRef &RR) const {
+     if (Reg != RR.Reg)
+       return Reg < RR.Reg;
+     return Mask < RR.Mask;
+   }
+ };
+ typedef std::set<RegisterRef> RegisterSet;
+
+ // Interface for querying properties of individual instruction operands.
+ // All queries are virtual, so a derived class can override them.
+ struct TargetOperandInfo {
+ TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {}
+ virtual ~TargetOperandInfo() {}
+ // Each query takes an instruction In and the number OpNum of the operand
+ // of In that is being asked about.
+ virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const;
+ virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const;
+ virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const;
+
+ const TargetInstrInfo &TII;
+ };
+
+
+ // Packed register reference. Only used for storage.
+ // Instead of a lane mask it holds the index of the mask (MaskId), which
+ // LaneMaskIndex translates to and from the actual mask.
+ struct PackedRegisterRef {
+ RegisterId Reg;
+ uint32_t MaskId;
+ };
+
+ // Template class for a map translating uint32_t into arbitrary types.
+ // The map will act like an indexed set: upon insertion of a new object,
+ // it will automatically assign a new index to it. Index of 0 is treated
+ // as invalid and is never allocated. All indices returned or accepted by
+ // this class are therefore 1-based: index K refers to Map[K-1].
+ // N is the number of elements for which storage is reserved up front.
+ template <typename T, unsigned N = 32>
+ struct IndexedSet {
+   IndexedSet() : Map() { Map.reserve(N); }
+
+   // Return the object stored under the (1-based) index Idx.
+   T get(uint32_t Idx) const {
+     // Index Idx corresponds to Map[Idx-1].
+     assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
+     return Map[Idx-1];
+   }
+   // Return the index of Val, adding Val to the set if it is not there yet.
+   uint32_t insert(T Val) {
+     // Linear search.
+     auto F = llvm::find(Map, Val);
+     if (F != Map.end())
+       return F - Map.begin() + 1;
+     Map.push_back(Val);
+     return Map.size();  // Return actual_index + 1.
+   }
+   // Return the index of Val, which must already be in the set.
+   uint32_t find(T Val) const {
+     auto F = llvm::find(Map, Val);
+     assert(F != Map.end());
+     // Translate the position into the same 1-based index that insert()
+     // returns and that get() expects. Returning the bare position would
+     // be off by one, and would yield the invalid index 0 for the first
+     // element.
+     return F - Map.begin() + 1;
+   }
+ private:
+   std::vector<T> Map;
+ };
+
+ // Translation between lane masks and the mask indices stored in a
+ // PackedRegisterRef. Index 0 stands for the mask with all lanes set; any
+ // other mask is kept in the underlying IndexedSet.
+ struct LaneMaskIndex : private IndexedSet<LaneBitmask> {
+   LaneMaskIndex() = default;
+
+   LaneBitmask getLaneMaskForIndex(uint32_t K) const {
+     if (K == 0)
+       return LaneBitmask::getAll();
+     return get(K);
+   }
+   // Non-const version: a mask that is not in the set yet is added to it.
+   uint32_t getIndexForLaneMask(LaneBitmask LM) {
+     assert(LM.any());
+     if (LM.all())
+       return 0;
+     return insert(LM);
+   }
+   // Const version: the mask is looked up only, nothing is added.
+   uint32_t getIndexForLaneMask(LaneBitmask LM) const {
+     assert(LM.any());
+     if (LM.all())
+       return 0;
+     return find(LM);
+   }
+   PackedRegisterRef pack(RegisterRef RR) {
+     const uint32_t MaskIdx = getIndexForLaneMask(RR.Mask);
+     return { RR.Reg, MaskIdx };
+   }
+   PackedRegisterRef pack(RegisterRef RR) const {
+     const uint32_t MaskIdx = getIndexForLaneMask(RR.Mask);
+     return { RR.Reg, MaskIdx };
+   }
+   RegisterRef unpack(PackedRegisterRef PR) const {
+     const LaneBitmask LM = getLaneMaskForIndex(PR.MaskId);
+     return RegisterRef(PR.Reg, LM);
+   }
+ };
+
+ // A collection of register references, stored as a map from a register
+ // id to a lane mask. References can be added and removed, and the
+ // collection can be queried for aliasing with, or covering of, a given
+ // register reference.
+ struct RegisterAggr {
+ // Create an empty aggregate. ExpAliasUnits gets one bit for each register
+ // unit of the target.
+ RegisterAggr(const TargetRegisterInfo &tri)
+ : Masks(), ExpAliasUnits(tri.getNumRegUnits()), CheckUnits(false),
+ TRI(tri) {}
+ RegisterAggr(const RegisterAggr &RG)
+ : Masks(RG.Masks), ExpAliasUnits(RG.ExpAliasUnits),
+ CheckUnits(RG.CheckUnits), TRI(RG.TRI) {}
+
+ bool empty() const { return Masks.empty(); }
+ bool hasAliasOf(RegisterRef RR) const;
+ bool hasCoverOf(RegisterRef RR) const;
+ // Check if RA alone covers RB, i.e. if an aggregate containing only RA
+ // has a cover of RB.
+ static bool isCoverOf(RegisterRef RA, RegisterRef RB,
+ const TargetRegisterInfo &TRI) {
+ return RegisterAggr(TRI).insert(RA).hasCoverOf(RB);
+ }
+
+ // The modifying functions return a reference to this aggregate, so that
+ // calls can be chained.
+ RegisterAggr &insert(RegisterRef RR);
+ RegisterAggr &insert(const RegisterAggr &RG);
+ RegisterAggr &clear(RegisterRef RR);
+ RegisterAggr &clear(const RegisterAggr &RG);
+
+ RegisterRef clearIn(RegisterRef RR) const;
+
+ void print(raw_ostream &OS) const;
+
+ private:
+ typedef std::unordered_map<RegisterId, LaneBitmask> MapType;
+
+ public:
+ // Read-only iteration over the (register id, lane mask) pairs.
+ typedef MapType::const_iterator iterator;
+ iterator begin() const { return Masks.begin(); }
+ iterator end() const { return Masks.end(); }
+ RegisterRef normalize(RegisterRef RR) const;
+
+ private:
+ MapType Masks;
+ BitVector ExpAliasUnits; // Register units for explicit aliases.
+ // NOTE(review): presumably tells whether ExpAliasUnits needs to be
+ // consulted in the queries - confirm against the implementation.
+ bool CheckUnits;
+ const TargetRegisterInfo &TRI;
+ };
+
+
+ // The common base of all nodes. It declares all the data that any kind of
+ // node can hold (arranged in unions); the derived classes only add member
+ // functions.
+ struct NodeBase {
+ public:
+ // Make sure this is a POD.
+ NodeBase() = default;
+ uint16_t getType() const { return NodeAttrs::type(Attrs); }
+ uint16_t getKind() const { return NodeAttrs::kind(Attrs); }
+ uint16_t getFlags() const { return NodeAttrs::flags(Attrs); }
+ NodeId getNext() const { return Next; }
+
+ uint16_t getAttrs() const { return Attrs; }
+ void setAttrs(uint16_t A) { Attrs = A; }
+ // Replace the whole flag field with F. To add a flag to the existing
+ // ones, pass in getFlags() | flag.
+ void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); }
+
+ // Insert node NA after "this" in the circular chain.
+ void append(NodeAddr<NodeBase*> NA);
+ // Initialize all members to 0.
+ void init() { memset(this, 0, sizeof *this); }
+ void setNext(NodeId N) { Next = N; }
+
+ protected:
+ uint16_t Attrs;
+ uint16_t Reserved;
+ NodeId Next; // Id of the next node in the circular chain.
+ // Definitions of nested types. Using anonymous nested structs would make
+ // this class definition clearer, but unnamed structs are not a part of
+ // the standard.
+ struct Def_struct {
+ NodeId DD, DU; // Ids of the first reached def and use.
+ };
+ struct PhiU_struct {
+ NodeId PredB; // Id of the predecessor block for a phi use.
+ };
+ struct Code_struct {
+ void *CP; // Pointer to the actual code.
+ NodeId FirstM, LastM; // Id of the first member and last.
+ };
+ struct Ref_struct {
+ NodeId RD, Sib; // Ids of the reaching def and the sibling.
+ // Def-specific and phi-use-specific data share the same storage.
+ union {
+ Def_struct Def;
+ PhiU_struct PhiU;
+ };
+ union {
+ MachineOperand *Op; // Non-phi refs point to a machine operand.
+ PackedRegisterRef PR; // Phi refs store register info directly.
+ };
+ };
+
+ // The actual payload.
+ // Which of the two members is in use depends on the type of the node
+ // (reference or code).
+ union {
+ Ref_struct Ref;
+ Code_struct Code;
+ };
+ };
+ // The allocator allocates chunks of 32 bytes for each node. The fact that
+ // each node takes 32 bytes in memory is used for fast translation between
+ // the node id and the node address.
+ static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize,
+ "NodeBase must be at most NodeAllocator::NodeMemSize bytes");
+
+ typedef std::vector<NodeAddr<NodeBase*>> NodeList;
+ typedef std::set<NodeId> NodeSet;
+
+ // Common interface of the reference nodes (defs and uses): access to the
+ // register being referenced, to the reaching def and to the sibling.
+ struct RefNode : public NodeBase {
+   RefNode() = default;
+
+   // The register reference.
+   RegisterRef getRegRef(const DataFlowGraph &G) const;
+   void setRegRef(RegisterRef RR, DataFlowGraph &G);
+   void setRegRef(MachineOperand *Op, DataFlowGraph &G);
+   // The machine operand; only available for refs that are not phi refs.
+   MachineOperand &getOp() {
+     assert(!(getFlags() & NodeAttrs::PhiRef));
+     return *Ref.Op;
+   }
+
+   // Data-flow links.
+   NodeId getReachingDef() const { return Ref.RD; }
+   void setReachingDef(NodeId RD) { Ref.RD = RD; }
+   NodeId getSibling() const { return Ref.Sib; }
+   void setSibling(NodeId Sib) { Ref.Sib = Sib; }
+
+   // Kind tests; these must only be used on reference nodes.
+   bool isUse() const {
+     assert(getType() == NodeAttrs::Ref);
+     return getKind() == NodeAttrs::Use;
+   }
+   bool isDef() const {
+     assert(getType() == NodeAttrs::Ref);
+     return getKind() == NodeAttrs::Def;
+   }
+
+   template <typename Predicate>
+   NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly,
+       const DataFlowGraph &G);
+   NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
+ };
+
+ // A def node. In addition to the data common to all references, it
+ // holds the ids of the first def and of the first use that it reaches.
+ struct DefNode : public RefNode {
+   NodeId getReachedDef() const { return Ref.Def.DD; }
+   void setReachedDef(NodeId D) { Ref.Def.DD = D; }
+
+   NodeId getReachedUse() const { return Ref.Def.DU; }
+   void setReachedUse(NodeId U) { Ref.Def.DU = U; }
+
+   void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
+ };
+
+ // A use node. It adds no data of its own to RefNode.
+ struct UseNode : public RefNode {
+ // Self is the id of this node, DA is the def to link this use to.
+ void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
+ };
+
+ // A use in a phi node. It additionally records the id of the predecessor
+ // block that the use is associated with. Both accessors require the
+ // PhiRef flag to be set on the node.
+ struct PhiUseNode : public UseNode {
+   NodeId getPredecessor() const {
+     const bool IsPhiRef = getFlags() & NodeAttrs::PhiRef;
+     assert(IsPhiRef);
+     (void)IsPhiRef;
+     return Ref.PhiU.PredB;
+   }
+   void setPredecessor(NodeId B) {
+     const bool IsPhiRef = getFlags() & NodeAttrs::PhiRef;
+     assert(IsPhiRef);
+     (void)IsPhiRef;
+     Ref.PhiU.PredB = B;
+   }
+ };
+
+ // Common interface of the code nodes: access to the code pointer and to
+ // the list of member nodes.
+ struct CodeNode : public NodeBase {
+ // Return the stored code pointer cast to T. The pointer is stored as
+ // void*, so the caller is responsible for asking for the right type.
+ template <typename T> T getCode() const {
+ return static_cast<T>(Code.CP);
+ }
+ void setCode(void *C) {
+ Code.CP = C;
+ }
+
+ NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const;
+ NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const;
+ void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
+ void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA,
+ const DataFlowGraph &G);
+ void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
+
+ // List of all members, and of the members satisfying the predicate P.
+ NodeList members(const DataFlowGraph &G) const;
+ template <typename Predicate>
+ NodeList members_if(Predicate P, const DataFlowGraph &G) const;
+ };
+
+ // Common base of the instruction nodes (phi and statement nodes).
+ struct InstrNode : public CodeNode {
+ NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
+ };
+
+ // A phi node.
+ struct PhiNode : public InstrNode {
+ // Always returns nullptr, regardless of the stored code pointer.
+ MachineInstr *getCode() const {
+ return nullptr;
+ }
+ };
+
+ // A statement node; its code pointer is a MachineInstr.
+ struct StmtNode : public InstrNode {
+ MachineInstr *getCode() const {
+ return CodeNode::getCode<MachineInstr*>();
+ }
+ };
+
+ // A basic block node; its code pointer is a MachineBasicBlock.
+ struct BlockNode : public CodeNode {
+ MachineBasicBlock *getCode() const {
+ return CodeNode::getCode<MachineBasicBlock*>();
+ }
+ // Add the phi node PA to this block.
+ void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G);
+ };
+
+ // A function node; its code pointer is a MachineFunction.
+ struct FuncNode : public CodeNode {
+ MachineFunction *getCode() const {
+ return CodeNode::getCode<MachineFunction*>();
+ }
+ // Look up the block node for the basic block BB.
+ NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB,
+ const DataFlowGraph &G) const;
+ NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G);
+ };
+
+ // The data-flow graph of a single machine function. It translates between
+ // node ids and node pointers, exposes the target and analysis objects it was
+ // constructed with, and declares the interface for building the graph and for
+ // linking/unlinking reference nodes.
+ struct DataFlowGraph {
+ // All arguments are stored as references (see the data members at the end
+ // of the struct), so they must outlive the graph.
+ DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
+ const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
+ const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi);
+
+ // Node id -> node pointer. The templated overload only adds a static_cast
+ // to the requested pointer type; no type check is performed.
+ NodeBase *ptr(NodeId N) const;
+ template <typename T> T ptr(NodeId N) const {
+ return static_cast<T>(ptr(N));
+ }
+ // Node pointer -> node id.
+ NodeId id(const NodeBase *P) const;
+
+ // Bundle the pointer and the id of node N into a NodeAddr.
+ template <typename T> NodeAddr<T> addr(NodeId N) const {
+ return { ptr<T>(N), N };
+ }
+
+ // Accessors for the function node and for the objects passed to the
+ // constructor.
+ NodeAddr<FuncNode*> getFunc() const { return Func; }
+ MachineFunction &getMF() const { return MF; }
+ const TargetInstrInfo &getTII() const { return TII; }
+ const TargetRegisterInfo &getTRI() const { return TRI; }
+ const MachineDominatorTree &getDT() const { return MDT; }
+ const MachineDominanceFrontier &getDF() const { return MDF; }
+
+ // Stack of definitions, stored in a vector. Entries whose Addr is null are
+ // delimiters (see isDelimiter); a delimiter's Id field identifies the node
+ // it was pushed for.
+ struct DefStack {
+ DefStack() = default;
+ // Empty when nothing is stored, or when the top iterator already
+ // coincides with the bottom iterator.
+ bool empty() const { return Stack.empty() || top() == bottom(); }
+ private:
+ typedef NodeAddr<DefNode*> value_type;
+ // Position within a DefStack. Pos is one past the index of the element
+ // the iterator designates, hence the Pos >= 1 assertions on dereference.
+ struct Iterator {
+ typedef DefStack::value_type value_type;
+ Iterator &up() { Pos = DS.nextUp(Pos); return *this; }
+ Iterator &down() { Pos = DS.nextDown(Pos); return *this; }
+ value_type operator*() const {
+ assert(Pos >= 1);
+ return DS.Stack[Pos-1];
+ }
+ const value_type *operator->() const {
+ assert(Pos >= 1);
+ return &DS.Stack[Pos-1];
+ }
+ // Only positions are compared; the iterators are assumed to refer to
+ // the same DefStack.
+ bool operator==(const Iterator &It) const { return Pos == It.Pos; }
+ bool operator!=(const Iterator &It) const { return Pos != It.Pos; }
+ private:
+ Iterator(const DefStack &S, bool Top);
+ // Pos-1 is the index in the StorageType object that corresponds to
+ // the top of the DefStack.
+ const DefStack &DS;
+ unsigned Pos;
+ friend struct DefStack;
+ };
+ public:
+ typedef Iterator iterator;
+ iterator top() const { return Iterator(*this, true); }
+ iterator bottom() const { return Iterator(*this, false); }
+ unsigned size() const;
+
+ void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); }
+ void pop();
+ // NOTE(review): presumably start_block pushes a delimiter for block N and
+ // clear_block removes everything pushed since; defined out of line.
+ void start_block(NodeId N);
+ void clear_block(NodeId N);
+ private:
+ friend struct Iterator;
+ typedef std::vector<value_type> StorageType;
+ // A delimiter has a null address. With N == 0 any delimiter matches,
+ // otherwise only the delimiter whose Id equals N.
+ bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const {
+ return (P.Addr == nullptr) && (N == 0 || P.Id == N);
+ }
+ unsigned nextUp(unsigned P) const;
+ unsigned nextDown(unsigned P) const;
+ StorageType Stack;
+ };
+
+ // Make this std::unordered_map for speed of accessing elements.
+ // Map: Register (physical or virtual) -> DefStack
+ typedef std::unordered_map<RegisterId,DefStack> DefStackMap;
+
+ void build(unsigned Options = BuildOptions::None);
+ void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+ void markBlock(NodeId B, DefStackMap &DefM);
+ void releaseBlock(NodeId B, DefStackMap &DefM);
+
+ // Conversions between RegisterRef and its packed form, delegated to the
+ // lane mask index LMI.
+ PackedRegisterRef pack(RegisterRef RR) { return LMI.pack(RR); }
+ PackedRegisterRef pack(RegisterRef RR) const { return LMI.pack(RR); }
+ RegisterRef unpack(PackedRegisterRef PR) const { return LMI.unpack(PR); }
+ RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const;
+ RegisterRef normalizeRef(RegisterRef RR) const;
+ RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const;
+
+ // Navigation among the reference nodes of instruction IA. The overloads
+ // taking "Create" are non-const; the others are const lookups.
+ NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const;
+ NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA, bool Create);
+ NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const;
+ NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA, bool Create);
+ NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const;
+
+ NodeList getRelatedRefs(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const;
+
+ // Detach a use/def from the data-flow chains and, if RemoveFromOwner is
+ // set, also remove it from the member list of its owning instruction.
+ void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) {
+ unlinkUseDF(UA);
+ if (RemoveFromOwner)
+ removeFromOwner(UA);
+ }
+ void unlinkDef(NodeAddr<DefNode*> DA, bool RemoveFromOwner) {
+ unlinkDefDF(DA);
+ if (RemoveFromOwner)
+ removeFromOwner(DA);
+ }
+
+ // Some useful filters.
+ // Each filter dereferences BA.Addr, so it must not be called on a null
+ // node address.
+ template <uint16_t Kind>
+ static bool IsRef(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Ref &&
+ BA.Addr->getKind() == Kind;
+ }
+ template <uint16_t Kind>
+ static bool IsCode(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Code &&
+ BA.Addr->getKind() == Kind;
+ }
+ static bool IsDef(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Ref &&
+ BA.Addr->getKind() == NodeAttrs::Def;
+ }
+ static bool IsUse(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Ref &&
+ BA.Addr->getKind() == NodeAttrs::Use;
+ }
+ static bool IsPhi(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Code &&
+ BA.Addr->getKind() == NodeAttrs::Phi;
+ }
+ // A def counts as preserving only if it has the Preserving flag and does
+ // not have the Undef flag.
+ static bool IsPreservingDef(const NodeAddr<DefNode*> DA) {
+ uint16_t Flags = DA.Addr->getFlags();
+ return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef);
+ }
+
+ // Register aliasing.
+ bool alias(RegisterRef RA, RegisterRef RB) const;
+
+ private:
+ void reset();
+
+ RegisterSet getAliasSet(RegisterId Reg) const;
+ RegisterSet getLandingPadLiveIns() const;
+
+ // Node factories.
+ NodeAddr<NodeBase*> newNode(uint16_t Attrs);
+ NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B);
+ NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner,
+ MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
+ NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner,
+ RegisterRef RR, NodeAddr<BlockNode*> PredB,
+ uint16_t Flags = NodeAttrs::PhiRef);
+ NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
+ MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
+ NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
+ RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef);
+ NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner);
+ NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner,
+ MachineInstr *MI);
+ NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner,
+ MachineBasicBlock *BB);
+ NodeAddr<FuncNode*> newFunc(MachineFunction *MF);
+
+ template <typename Predicate>
+ std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
+ locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+ Predicate P) const;
+
+ typedef std::map<NodeId,RegisterSet> BlockRefsMap;
+
+ // Graph construction steps.
+ void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In);
+ void buildBlockRefs(NodeAddr<BlockNode*> BA, BlockRefsMap &RefM);
+ void recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+ NodeAddr<BlockNode*> BA);
+ void buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+ NodeAddr<BlockNode*> BA);
+ void removeUnusedPhis();
+
+ template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA,
+ NodeAddr<T> TA, DefStack &DS);
+ void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA);
+ void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA);
+
+ void unlinkUseDF(NodeAddr<UseNode*> UA);
+ void unlinkDefDF(NodeAddr<DefNode*> DA);
+ // Remove the reference RA from the member list of its owning instruction.
+ void removeFromOwner(NodeAddr<RefNode*> RA) {
+ NodeAddr<InstrNode*> IA = RA.Addr->getOwner(*this);
+ IA.Addr->removeMember(RA, *this);
+ }
+
+ // Look up the block node registered for BB. find() is used instead of
+ // operator[] so that querying an unregistered block does not add a default
+ // entry to BlockNodes; such a query yields a default-constructed NodeAddr,
+ // which is the value operator[] would have returned.
+ NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) {
+ auto F = BlockNodes.find(BB);
+ return F != BlockNodes.end() ? F->second : NodeAddr<BlockNode*>();
+ }
+
+ NodeAddr<FuncNode*> Func;
+ NodeAllocator Memory;
+ // Local map: MachineBasicBlock -> NodeAddr<BlockNode*>
+ std::map<MachineBasicBlock*,NodeAddr<BlockNode*>> BlockNodes;
+ // Lane mask map.
+ LaneMaskIndex LMI;
+
+ // Non-owning references to the objects supplied to the constructor.
+ MachineFunction &MF;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const MachineDominatorTree &MDT;
+ const MachineDominanceFrontier &MDF;
+ const TargetOperandInfo &TOI;
+ }; // struct DataFlowGraph
+
+ // Starting with the node that follows this one in the circular member list,
+ // look for a reference to RR that is accepted by the predicate P and return
+ // it. When the walk runs into the owning code node it resumes from that
+ // node's first member. If NextOnly is set, the search gives up after the
+ // first reference node that does not match. A default-constructed
+ // NodeAddr<RefNode*> (the equivalent of "nullptr") is returned when nothing
+ // is found.
+ template <typename Predicate>
+ NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P,
+       bool NextOnly, const DataFlowGraph &G) {
+   auto Cur = G.addr<NodeBase*>(getNext());
+
+   while (Cur.Addr != this) {
+     if (Cur.Addr->getType() != NodeAttrs::Ref) {
+       // Reached the code node that heads the chain: wrap around to its
+       // first member.
+       assert(Cur.Addr->getType() == NodeAttrs::Code);
+       NodeAddr<CodeNode*> Head = Cur;
+       Cur = Head.Addr->getFirstMember(G);
+       continue;
+     }
+
+     NodeAddr<RefNode*> Ref = Cur;
+     if (Ref.Addr->getRegRef(G) == RR && P(Cur))
+       return Cur;
+     if (NextOnly)
+       break;
+     Cur = G.addr<NodeBase*>(Cur.Addr->getNext());
+   }
+
+   return NodeAddr<RefNode*>();
+ }
+
+ // Return, in list order, the members of this code node for which the
+ // predicate P holds. The member list is circular and ends when the walk
+ // arrives back at this node; a first member with id 0 means there are no
+ // members at all.
+ template <typename Predicate>
+ NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const {
+   NodeList Selected;
+   auto Cur = getFirstMember(G);
+   if (Cur.Id != 0) {
+     for (; Cur.Addr != this; Cur = G.addr<NodeBase*>(Cur.Addr->getNext()))
+       if (P(Cur))
+         Selected.push_back(Cur);
+   }
+   return Selected;
+ }
+
+
+ // Optionally print the lane mask, if it is not ~0.
+ // This wrapper only carries the mask; the formatting is done by the stream
+ // operator declared right after the struct (defined elsewhere).
+ struct PrintLaneMaskOpt {
+ // The constructor is not explicit, so a LaneBitmask converts implicitly.
+ PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
+ LaneBitmask Mask;
+ };
+ raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P);
+
+ // Forward declaration of the Print wrapper and declaration of the
+ // stream-insertion operator template for it. Specializations of the operator
+ // are provided elsewhere (e.g. for Liveness::RefMap in RDFLiveness.cpp).
+ template <typename T> struct Print;
+ template <typename T>
+ raw_ostream &operator<< (raw_ostream &OS, const Print<T> &P);
+
+ // Pairs an object with the data-flow graph needed to print it.
+ // Both members are references: a Print object must not outlive either the
+ // object or the graph it was constructed from.
+ template <typename T>
+ struct Print {
+ Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {}
+ const T &Obj;
+ const DataFlowGraph &G;
+ };
+
+ // Convenience wrapper: PrintNode<T> is a Print<NodeAddr<T>>, so a node
+ // address can be printed without spelling out the NodeAddr type.
+ template <typename T>
+ struct PrintNode : Print<NodeAddr<T>> {
+ PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g)
+ : Print<NodeAddr<T>>(x, g) {}
+ };
+} // namespace rdf
+} // namespace llvm
+
+#endif // RDF_GRAPH_H
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
new file mode 100644
index 000000000000..e74c4bfc1645
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
@@ -0,0 +1,1030 @@
+//===--- RDFLiveness.cpp --------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Computation of the liveness information from the data-flow graph.
+//
+// The main functionality of this code is to compute block live-in
+// information. With the live-in information in place, the placement
+// of kill flags can also be recalculated.
+//
+// The block live-in calculation is based on the ideas from the following
+// publication:
+//
+// Dibyendu Das, Ramakrishna Upadrasta, Benoit Dupont de Dinechin.
+// "Efficient Liveness Computation Using Merge Sets and DJ-Graphs."
+// ACM Transactions on Architecture and Code Optimization, Association for
+// Computing Machinery, 2012, ACM TACO Special Issue on "High-Performance
+// and Embedded Architectures and Compilers", 8 (4),
+// <10.1145/2086696.2086706>. <hal-00647369>
+//
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+using namespace rdf;
+
+namespace llvm {
+namespace rdf {
+ // Print a RefMap as
+ //   { <reg>{<node><mask>,<node><mask>,...} <reg>{...} }
+ // where each register is followed by the comma-separated list of the
+ // (node id, lane mask) pairs recorded for it.
+ template<>
+ raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) {
+   OS << '{';
+   for (auto &Entry : P.Obj) {
+     OS << ' ' << PrintReg(Entry.first, &P.G.getTRI()) << '{';
+     bool NeedComma = false;
+     for (const auto &Ref : Entry.second) {
+       if (NeedComma)
+         OS << ',';
+       NeedComma = true;
+       OS << Print<NodeId>(Ref.first, P.G) << PrintLaneMaskOpt(Ref.second);
+     }
+     OS << '}';
+   }
+   OS << " }";
+   return OS;
+ }
+} // namespace rdf
+} // namespace llvm
+
+// The order in the returned sequence is the order of reaching defs in the
+// upward traversal: the first def is the closest to the given reference RefA,
+// the next one is further up, and so on.
+// The list ends at a reaching phi def, or when the reference from RefA is
+// covered by the defs in the list (see FullChain).
+// This function provides two modes of operation:
+// (1) Returning the sequence of reaching defs for a particular reference
+// node. This sequence will terminate at the first phi node [1].
+// (2) Returning a partial sequence of reaching defs, where the final goal
+// is to traverse past phi nodes to the actual defs arising from the code
+// itself.
+// In mode (2), the register reference for which the search was started
+// may be different from the reference node RefA, for which this call was
+// made, hence the argument RefRR, which holds the original register.
+// Also, some definitions may have already been encountered in a previous
+// call that will influence register covering. The register references
+// already defined are passed in through DefRRs.
+// In mode (1), the "continuation" considerations do not apply, and the
+// RefRR is the same as the register in RefA, and the set DefRRs is empty.
+//
+// [1] It is possible for multiple phi nodes to be included in the returned
+// sequence:
+// SubA = phi ...
+// SubB = phi ...
+// ... = SuperAB(rdef:SubA), SuperAB"(rdef:SubB)
+// However, these phi nodes are independent from one another in terms of
+// the data-flow.
+
+// Parameters:
+//   RefRR     - the register reference the search is conducted for.
+//   RefA      - the reference node whose reaching defs are wanted.
+//   FullChain - if false, stop adding defs once the accumulated defs cover
+//               RefRR; if true, visit every candidate instruction and do not
+//               let phi defs count towards the covering.
+//   DefRRs    - registers already defined by previously found defs; used as
+//               the initial covering set.
+// Returns the selected defs, closest first. An empty list is returned when
+// RefA is flagged as undefined. Defs flagged as dead take part in the
+// traversal but are removed from the returned list.
+NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
+ NodeAddr<RefNode*> RefA, bool FullChain, const RegisterAggr &DefRRs) {
+ NodeList RDefs; // Return value.
+ SetVector<NodeId> DefQ;
+ SetVector<NodeId> Owners;
+
+ // Dead defs will be treated as if they were live, since they are actually
+ // on the data-flow path. They cannot be ignored because even though they
+ // do not generate meaningful values, they still modify registers.
+
+ // If the reference is undefined, there is nothing to do.
+ if (RefA.Addr->getFlags() & NodeAttrs::Undef)
+ return RDefs;
+
+ // The initial queue should not have reaching defs for shadows. The
+ // whole point of a shadow is that it will have a reaching def that
+ // is not aliased to the reaching defs of the related shadows.
+ NodeId Start = RefA.Id;
+ auto SNA = DFG.addr<RefNode*>(Start);
+ if (NodeId RD = SNA.Addr->getReachingDef())
+ DefQ.insert(RD);
+
+ // Collect all the reaching defs, going up until a phi node is encountered,
+ // or there are no more reaching defs. From this set, the actual set of
+ // reaching defs will be selected.
+ // The traversal upwards must go on until a covering def is encountered.
+ // It is possible that a collection of non-covering (individually) defs
+ // will be sufficient, but keep going until a covering one is found.
+ // DefQ grows while it is being iterated, hence the index-based loop that
+ // re-reads size() on every iteration.
+ for (unsigned i = 0; i < DefQ.size(); ++i) {
+ auto TA = DFG.addr<DefNode*>(DefQ[i]);
+ if (TA.Addr->getFlags() & NodeAttrs::PhiRef)
+ continue;
+ // Stop at the covering/overwriting def of the initial register reference.
+ RegisterRef RR = TA.Addr->getRegRef(DFG);
+ if (!DFG.IsPreservingDef(TA))
+ if (RegisterAggr::isCoverOf(RR, RefRR, TRI))
+ continue;
+ // Get the next level of reaching defs. This will include multiple
+ // reaching defs for shadows.
+ for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA))
+ if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef())
+ DefQ.insert(RD);
+ }
+
+ // Remove all non-phi defs that are not aliased to RefRR, and collect
+ // the owners of the remaining defs.
+ SetVector<NodeId> Defs;
+ for (NodeId N : DefQ) {
+ auto TA = DFG.addr<DefNode*>(N);
+ bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef;
+ if (!IsPhi && !DFG.alias(RefRR, TA.Addr->getRegRef(DFG)))
+ continue;
+ Defs.insert(TA.Id);
+ Owners.insert(TA.Addr->getOwner(DFG).Id);
+ }
+
+ // Return the MachineBasicBlock containing a given instruction.
+ // For a statement this is the parent of its MachineInstr; for a phi it is
+ // the code of the block node that owns the phi.
+ auto Block = [this] (NodeAddr<InstrNode*> IA) -> MachineBasicBlock* {
+ if (IA.Addr->getKind() == NodeAttrs::Stmt)
+ return NodeAddr<StmtNode*>(IA).Addr->getCode()->getParent();
+ assert(IA.Addr->getKind() == NodeAttrs::Phi);
+ NodeAddr<PhiNode*> PA = IA;
+ NodeAddr<BlockNode*> BA = PA.Addr->getOwner(DFG);
+ return BA.Addr->getCode();
+ };
+ // Less(A,B) iff instruction A is further down in the dominator tree than B.
+ // NOTE(review): for two blocks where neither dominates the other, this
+ // presumably returns false in both directions; confirm that the resulting
+ // relation is still a valid strict weak ordering for std::sort.
+ auto Less = [&Block,this] (NodeId A, NodeId B) -> bool {
+ if (A == B)
+ return false;
+ auto OA = DFG.addr<InstrNode*>(A), OB = DFG.addr<InstrNode*>(B);
+ MachineBasicBlock *BA = Block(OA), *BB = Block(OB);
+ if (BA != BB)
+ return MDT.dominates(BB, BA);
+ // They are in the same block.
+ bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt;
+ bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt;
+ if (StmtA) {
+ if (!StmtB) // OB is a phi and phis dominate statements.
+ return true;
+ MachineInstr *CA = NodeAddr<StmtNode*>(OA).Addr->getCode();
+ MachineInstr *CB = NodeAddr<StmtNode*>(OB).Addr->getCode();
+ // The order must be linear, so tie-break such equalities.
+ if (CA == CB)
+ return A < B;
+ return MDT.dominates(CB, CA);
+ } else {
+ // OA is a phi.
+ if (StmtB)
+ return false;
+ // Both are phis. There is no ordering between phis (in terms of
+ // the data-flow), so tie-break this via node id comparison.
+ return A < B;
+ }
+ };
+
+ // Sort the owning instructions so that the ones closest to the reference
+ // (furthest down in the dominator tree) come first.
+ std::vector<NodeId> Tmp(Owners.begin(), Owners.end());
+ std::sort(Tmp.begin(), Tmp.end(), Less);
+
+ // The vector is a list of instructions, so that defs coming from
+ // the same instruction don't need to be artificially ordered.
+ // Then, when computing the initial segment, and iterating over an
+ // instruction, pick the defs that contribute to the covering (i.e. are
+ // not covered by previously added defs). Check the defs individually,
+ // i.e. first check for each def whether it is covered or not (without
+ // adding them to the tracking set), and then add all the selected ones.
+
+ // The reason for this is this example:
+ // *d1<A>, *d2<B>, ... Assume A and B are aliased (can happen in phi nodes).
+ // *d3<C> If A \incl BuC, and B \incl AuC, then *d2 would be
+ // covered if we added A first, and A would be covered
+ // if we added B first.
+
+ // RRs accumulates the registers defined so far, seeded with DefRRs.
+ RegisterAggr RRs(DefRRs);
+
+ // Selects the def members of an instruction that survived the alias
+ // filtering above.
+ auto DefInSet = [&Defs] (NodeAddr<RefNode*> TA) -> bool {
+ return TA.Addr->getKind() == NodeAttrs::Def &&
+ Defs.count(TA.Id);
+ };
+ for (NodeId T : Tmp) {
+ if (!FullChain && RRs.hasCoverOf(RefRR))
+ break;
+ auto TA = DFG.addr<InstrNode*>(T);
+ bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA);
+ NodeList Ds;
+ for (NodeAddr<DefNode*> DA : TA.Addr->members_if(DefInSet, DFG)) {
+ RegisterRef QR = DA.Addr->getRegRef(DFG);
+ // Add phi defs even if they are covered by subsequent defs. This is
+ // for cases where the reached use is not covered by any of the defs
+ // encountered so far: the phi def is needed to expose the liveness
+ // of that use to the entry of the block.
+ // Example:
+ // phi d1<R3>(,d2,), ... Phi def d1 is covered by d2.
+ // d2<R3>(d1,,u3), ...
+ // ..., u3<D1>(d2) This use needs to be live on entry.
+ if (FullChain || IsPhi || !RRs.hasCoverOf(QR))
+ Ds.push_back(DA);
+ }
+ RDefs.insert(RDefs.end(), Ds.begin(), Ds.end());
+ for (NodeAddr<DefNode*> DA : Ds) {
+ // When collecting a full chain of definitions, do not consider phi
+ // defs to actually define a register.
+ uint16_t Flags = DA.Addr->getFlags();
+ if (!FullChain || !(Flags & NodeAttrs::PhiRef))
+ if (!(Flags & NodeAttrs::Preserving)) // Don't care about Undef here.
+ RRs.insert(DA.Addr->getRegRef(DFG));
+ }
+ }
+
+ // Dead defs were needed for the covering computation above, but they are
+ // not reported to the caller: drop them from the result.
+ auto DeadP = [](const NodeAddr<DefNode*> DA) -> bool {
+ return DA.Addr->getFlags() & NodeAttrs::Dead;
+ };
+ RDefs.resize(std::distance(RDefs.begin(), remove_if(RDefs, DeadP)));
+
+ return RDefs;
+}
+
+
+// Recursively collect the ids of all defs reaching RefA for register RefRR,
+// looking through phi nodes: whenever a reaching def is a phi def, the search
+// continues from every use of the owning phi. Visited records the phis that
+// have already been expanded, and Defs holds the defs found so far on the
+// way here. The result always contains Defs.
+NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
+      NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs) {
+  // Registers defined by the defs seen so far. Phi defs are not treated as
+  // defining anything; only "real" definitions contribute.
+  RegisterAggr DefRRs(TRI);
+  for (NodeId Id : Defs) {
+    const auto DA = DFG.addr<const DefNode*>(Id);
+    const bool IsPhiDef = DA.Addr->getFlags() & NodeAttrs::PhiRef;
+    if (IsPhiDef)
+      continue;
+    DefRRs.insert(DA.Addr->getRegRef(DFG));
+  }
+
+  NodeList Reaching = getAllReachingDefs(RefRR, RefA, true, DefRRs);
+  if (Reaching.empty())
+    return Defs;
+
+  // Known = the preexisting defs plus the ones just found; it is what the
+  // recursive calls get to see as "already defined".
+  NodeSet Known = Defs;
+  for (NodeAddr<NodeBase*> R : Reaching)
+    Known.insert(R.Id);
+
+  NodeSet Result = Defs;
+
+  for (NodeAddr<DefNode*> DA : Reaching) {
+    Result.insert(DA.Id);
+
+    const bool IsPhiDef = DA.Addr->getFlags() & NodeAttrs::PhiRef;
+    if (!IsPhiDef)
+      continue;
+
+    // Expand each phi only once.
+    NodeAddr<PhiNode*> PA = DA.Addr->getOwner(DFG);
+    if (Visited.count(PA.Id) != 0)
+      continue;
+    Visited.insert(PA.Id);
+
+    // Continue the search from every use of the phi.
+    for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+      const NodeSet Sub = getAllReachingDefsRec(RefRR, U, Visited, Known);
+      Result.insert(Sub.begin(), Sub.end());
+    }
+  }
+
+  return Result;
+}
+
+
+// Collect the ids of all uses aliased to RefRR that are reached from the def
+// DefA, either directly or through the defs reached by DefA. DefRRs holds
+// the registers defined by the intervening defs; a use or def that DefRRs
+// already covers is not followed.
+NodeSet Liveness::getAllReachedUses(RegisterRef RefRR,
+      NodeAddr<DefNode*> DefA, const RegisterAggr &DefRRs) {
+  NodeSet Reached;
+
+  // Once the intervening defs cover the original register, no more uses can
+  // be reached.
+  if (DefRRs.hasCoverOf(RefRR))
+    return Reached;
+
+  // Directly reached uses. A dead def provides no value to any use, so its
+  // use chain is not walked at all.
+  const bool DefIsDead = DefA.Addr->getFlags() & NodeAttrs::Dead;
+  if (!DefIsDead) {
+    for (NodeId U = DefA.Addr->getReachedUse(); U != 0; ) {
+      auto UA = DFG.addr<UseNode*>(U);
+      const bool IsUndef = UA.Addr->getFlags() & NodeAttrs::Undef;
+      if (!IsUndef) {
+        RegisterRef UR = UA.Addr->getRegRef(DFG);
+        if (DFG.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR))
+          Reached.insert(U);
+      }
+      U = UA.Addr->getSibling();
+    }
+  }
+
+  // Reached defs. Unlike above, dead defs cannot be ignored here.
+  NodeId D = DefA.Addr->getReachedDef();
+  while (D != 0) {
+    auto DA = DFG.addr<DefNode*>(D);
+    // Advance to the sibling right away, before any "continue".
+    D = DA.Addr->getSibling();
+
+    // Skip a def that is already covered (it cannot reach anything new) or
+    // that is not aliased to the register of interest.
+    RegisterRef DR = DA.Addr->getRegRef(DFG);
+    if (DefRRs.hasCoverOf(DR) || !DFG.alias(RefRR, DR))
+      continue;
+
+    NodeSet Sub;
+    if (!DFG.IsPreservingDef(DA)) {
+      // A non-preserving def joins the set of intervening defs.
+      RegisterAggr Extended = DefRRs;
+      Extended.insert(DR);
+      Sub = getAllReachedUses(RefRR, DA, Extended);
+    } else {
+      // A preserving def leaves the set of intervening defs unchanged.
+      Sub = getAllReachedUses(RefRR, DA, DefRRs);
+    }
+    Reached.insert(Sub.begin(), Sub.end());
+  }
+  return Reached;
+}
+
+
+// Recompute RealUseMap: for every phi node, the set of non-phi ("real") uses
+// reached by that phi, keyed by register. The computation has two stages:
+//   1. For each phi, collect the real uses reached from its defs (directly
+//      or through non-phi defs), then discard those that are not actually
+//      reached. At the same time record, for each phi use, which phis are
+//      among its reaching defs and which registers are defined in between.
+//   2. Propagate the real uses upwards from each phi to the phis reaching its
+//      uses, minus whatever the intervening defs cover, until nothing changes.
+void Liveness::computePhiInfo() {
+ RealUseMap.clear();
+
+ NodeList Phis;
+ NodeAddr<FuncNode*> FA = DFG.getFunc();
+ NodeList Blocks = FA.Addr->members(DFG);
+ for (NodeAddr<BlockNode*> BA : Blocks) {
+ auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
+ Phis.insert(Phis.end(), Ps.begin(), Ps.end());
+ }
+
+ // phi use -> (map: reaching phi -> set of registers defined in between)
+ std::map<NodeId,std::map<NodeId,RegisterAggr>> PhiUp;
+ std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation.
+
+ // Go over all phis.
+ for (NodeAddr<PhiNode*> PhiA : Phis) {
+ // Go over all defs and collect the reached uses that are non-phi uses
+ // (i.e. the "real uses").
+ RefMap &RealUses = RealUseMap[PhiA.Id];
+ NodeList PhiRefs = PhiA.Addr->members(DFG);
+
+ // Have a work queue of defs whose reached uses need to be found.
+ // For each def, add to the queue all reached (non-phi) defs.
+ SetVector<NodeId> DefQ;
+ NodeSet PhiDefs;
+ for (NodeAddr<RefNode*> R : PhiRefs) {
+ if (!DFG.IsRef<NodeAttrs::Def>(R))
+ continue;
+ DefQ.insert(R.Id);
+ PhiDefs.insert(R.Id);
+ }
+
+ // Collect the super-set of all possible reached uses. This set will
+ // contain all uses reached from this phi, either directly from the
+ // phi defs, or (recursively) via non-phi defs reached by the phi defs.
+ // This set of uses will later be trimmed to only contain these uses that
+ // are actually reached by the phi defs.
+ // DefQ grows during the iteration, hence the index-based loop.
+ for (unsigned i = 0; i < DefQ.size(); ++i) {
+ NodeAddr<DefNode*> DA = DFG.addr<DefNode*>(DefQ[i]);
+ // Visit all reached uses. Phi defs should not really have the "dead"
+ // flag set, but check it anyway for consistency.
+ bool IsDead = DA.Addr->getFlags() & NodeAttrs::Dead;
+ NodeId UN = !IsDead ? DA.Addr->getReachedUse() : 0;
+ while (UN != 0) {
+ NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN);
+ uint16_t F = A.Addr->getFlags();
+ if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) {
+ RegisterRef R = DFG.normalizeRef(getRestrictedRegRef(A));
+ RealUses[R.Reg].insert({A.Id,R.Mask});
+ }
+ UN = A.Addr->getSibling();
+ }
+ // Visit all reached defs, and add them to the queue. These defs may
+ // override some of the uses collected here, but that will be handled
+ // later.
+ NodeId DN = DA.Addr->getReachedDef();
+ while (DN != 0) {
+ NodeAddr<DefNode*> A = DFG.addr<DefNode*>(DN);
+ for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) {
+ uint16_t Flags = NodeAddr<DefNode*>(T).Addr->getFlags();
+ // Must traverse the reached-def chain. Consider:
+ // def(D0) -> def(R0) -> def(R0) -> use(D0)
+ // The reachable use of D0 passes through a def of R0.
+ if (!(Flags & NodeAttrs::PhiRef))
+ DefQ.insert(T.Id);
+ }
+ DN = A.Addr->getSibling();
+ }
+ }
+ // Filter out these uses that appear to be reachable, but really
+ // are not. For example:
+ //
+ // R1:0 = d1
+ // = R1:0 u2 Reached by d1.
+ // R0 = d3
+ // = R1:0 u4 Still reached by d1: indirectly through
+ // the def d3.
+ // R1 = d5
+ // = R1:0 u6 Not reached by d1 (covered collectively
+ // by d3 and d5), but following reached
+ // defs and uses from d1 will lead here.
+ auto InPhiDefs = [&PhiDefs] (NodeAddr<DefNode*> DA) -> bool {
+ return PhiDefs.count(DA.Id);
+ };
+ for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) {
+ // For each reached register UI->first, there is a set UI->second, of
+ // uses of it. For each such use, check if it is reached by this phi,
+ // i.e. check if the set of its reaching defs intersects the set of
+ // this phi's defs.
+ NodeRefSet &Uses = UI->second;
+ for (auto I = Uses.begin(), E = Uses.end(); I != E; ) {
+ auto UA = DFG.addr<UseNode*>(I->first);
+ // Undef flag is checked above.
+ assert((UA.Addr->getFlags() & NodeAttrs::Undef) == 0);
+ RegisterRef R(UI->first, I->second);
+ NodeList RDs = getAllReachingDefs(R, UA);
+ if (any_of(RDs, InPhiDefs))
+ ++I;
+ else
+ I = Uses.erase(I);
+ }
+ if (Uses.empty())
+ UI = RealUses.erase(UI);
+ else
+ ++UI;
+ }
+
+ // If this phi reaches some "real" uses, add it to the queue for upward
+ // propagation.
+ if (!RealUses.empty())
+ PhiUQ.push_back(PhiA.Id);
+
+ // Go over all phi uses and check if the reaching def is another phi.
+ // Collect the phis that are among the reaching defs of these uses.
+ // While traversing the list of reaching defs for each phi use, accumulate
+ // the set of registers defined between this phi (PhiA) and the owner phi
+ // of the reaching def.
+ NodeSet SeenUses;
+
+ for (auto I : PhiRefs) {
+ if (!DFG.IsRef<NodeAttrs::Use>(I) || SeenUses.count(I.Id))
+ continue;
+ NodeAddr<UseNode*> UA = I;
+
+ // Given a phi use UA, traverse all related phi uses (including UA).
+ // The related phi uses may reach different phi nodes or may reach the
+ // same phi node. If multiple uses reach the same phi P, the intervening
+ // defs must be accumulated for all such uses. To group all such uses
+ // into one set, map their node ids to the first use id that reaches P.
+ std::map<NodeId,NodeId> FirstUse; // Phi reached up -> first phi use.
+
+ for (NodeAddr<UseNode*> VA : DFG.getRelatedRefs(PhiA, UA)) {
+ SeenUses.insert(VA.Id);
+ RegisterAggr DefRRs(TRI);
+ for (NodeAddr<DefNode*> DA : getAllReachingDefs(VA)) {
+ if (DA.Addr->getFlags() & NodeAttrs::PhiRef) {
+ NodeId RP = DA.Addr->getOwner(DFG).Id;
+ // insert() keeps an existing entry, so FU is the first use that
+ // was found to reach the phi RP.
+ NodeId FU = FirstUse.insert({RP,VA.Id}).first->second;
+ std::map<NodeId,RegisterAggr> &M = PhiUp[FU];
+ auto F = M.find(RP);
+ if (F == M.end())
+ M.insert(std::make_pair(RP, DefRRs));
+ else
+ F->second.insert(DefRRs);
+ }
+ DefRRs.insert(DA.Addr->getRegRef(DFG));
+ }
+ }
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "Phi-up-to-phi map with intervening defs:\n";
+ // Iterate by reference: the entries hold maps of RegisterAggr, which
+ // would otherwise be copied just to be printed.
+ for (const auto &I : PhiUp) {
+ dbgs() << "phi " << Print<NodeId>(I.first, DFG) << " -> {";
+ for (const auto &R : I.second)
+ dbgs() << ' ' << Print<NodeId>(R.first, DFG)
+ << Print<RegisterAggr>(R.second, DFG);
+ dbgs() << " }\n";
+ }
+ }
+
+ // Propagate the reached registers up in the phi chain.
+ //
+ // The following type of situation needs careful handling:
+ //
+ // phi d1<R1:0> (1)
+ // |
+ // ... d2<R1>
+ // |
+ // phi u3<R1:0> (2)
+ // |
+ // ... u4<R1>
+ //
+ // The phi node (2) defines a register pair R1:0, and reaches a "real"
+ // use u4 of just R1. The same phi node is also known to reach (upwards)
+ // the phi node (1). However, the use u4 is not reached by phi (1),
+ // because of the intervening definition d2 of R1. The data flow between
+ // phis (1) and (2) is restricted to R1:0 minus R1, i.e. R0.
+ //
+ // When propagating uses up the phi chains, get the all reaching defs
+ // for a given phi use, and traverse the list until the propagated ref
+ // is covered, or until reaching the final phi. Only assume that the
+ // reference reaches the phi in the latter case.
+
+ // PhiUQ grows during the iteration (phis whose real-use set changed are
+ // re-queued), hence the index-based loop.
+ for (unsigned i = 0; i < PhiUQ.size(); ++i) {
+ auto PA = DFG.addr<PhiNode*>(PhiUQ[i]);
+ NodeList PUs = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG);
+ RefMap &RUM = RealUseMap[PA.Id];
+
+ for (NodeAddr<UseNode*> UA : PUs) {
+ std::map<NodeId,RegisterAggr> &PUM = PhiUp[UA.Id];
+ RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(UA));
+ // Bind directly to the map element. Naming the element type as
+ // std::pair<NodeId,RegisterAggr> (non-const key) would force a
+ // converted temporary, i.e. a copy of the RegisterAggr, per iteration.
+ // PhiUp is not modified inside this loop, so the reference stays valid.
+ for (const auto &P : PUM) {
+ bool Changed = false;
+ const RegisterAggr &MidDefs = P.second;
+
+ // Collect the set PropUp of uses that are reached by the current
+ // phi PA, and are not covered by any intervening def between the
+ // currently visited use UA and the upward phi P.
+
+ if (MidDefs.hasCoverOf(UR))
+ continue;
+
+ // General algorithm:
+ // for each (R,U) : U is use node of R, U is reached by PA
+ // if MidDefs does not cover (R,U)
+ // then add (R-MidDefs,U) to RealUseMap[P]
+ //
+ // NOTE(review): T is deliberately left as declared. If the element
+ // type of RefMap has a const key, T is a per-iteration copy, which
+ // keeps the inner loop on a snapshot even when P.first is PA itself
+ // and the insertion below lands in the set being read. Confirm
+ // before converting this loop to iterate by reference.
+ for (const std::pair<RegisterId,NodeRefSet> &T : RUM) {
+ RegisterRef R = DFG.restrictRef(RegisterRef(T.first), UR);
+ if (!R)
+ continue;
+ for (std::pair<NodeId,LaneBitmask> V : T.second) {
+ RegisterRef S = DFG.restrictRef(RegisterRef(R.Reg, V.second), R);
+ if (!S)
+ continue;
+ if (RegisterRef SS = MidDefs.clearIn(S)) {
+ NodeRefSet &RS = RealUseMap[P.first][SS.Reg];
+ Changed |= RS.insert({V.first,SS.Mask}).second;
+ }
+ }
+ }
+
+ if (Changed)
+ PhiUQ.push_back(P.first);
+ }
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "Real use map:\n";
+ // Iterate by reference to avoid copying each RefMap for printing.
+ for (const auto &I : RealUseMap) {
+ dbgs() << "phi " << Print<NodeId>(I.first, DFG);
+ NodeAddr<PhiNode*> PA = DFG.addr<PhiNode*>(I.first);
+ NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG);
+ if (!Ds.empty()) {
+ RegisterRef RR = NodeAddr<DefNode*>(Ds[0]).Addr->getRegRef(DFG);
+ dbgs() << '<' << Print<RegisterRef>(RR, DFG) << '>';
+ } else {
+ dbgs() << "<noreg>";
+ }
+ dbgs() << " -> " << Print<RefMap>(I.second, DFG) << '\n';
+ }
+ }
+}
+
+
+void Liveness::computeLiveIns() {
+ // Populate the node-to-block map. This speeds up the calculations
+ // significantly.
+ NBMap.clear();
+ for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
+ MachineBasicBlock *BB = BA.Addr->getCode();
+ for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
+ for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
+ NBMap.insert(std::make_pair(RA.Id, BB));
+ NBMap.insert(std::make_pair(IA.Id, BB));
+ }
+ }
+
+ MachineFunction &MF = DFG.getMF();
+
+ // Compute IDF first, then the inverse.
+ decltype(IIDF) IDF;
+ for (MachineBasicBlock &B : MF) {
+ auto F1 = MDF.find(&B);
+ if (F1 == MDF.end())
+ continue;
+ SetVector<MachineBasicBlock*> IDFB(F1->second.begin(), F1->second.end());
+ for (unsigned i = 0; i < IDFB.size(); ++i) {
+ auto F2 = MDF.find(IDFB[i]);
+ if (F2 != MDF.end())
+ IDFB.insert(F2->second.begin(), F2->second.end());
+ }
+ // Add B to the IDF(B). This will put B in the IIDF(B).
+ IDFB.insert(&B);
+ IDF[&B].insert(IDFB.begin(), IDFB.end());
+ }
+
+ for (auto I : IDF)
+ for (auto S : I.second)
+ IIDF[S].insert(I.first);
+
+ computePhiInfo();
+
+ NodeAddr<FuncNode*> FA = DFG.getFunc();
+ NodeList Blocks = FA.Addr->members(DFG);
+
+ // Build the phi live-on-entry map.
+ for (NodeAddr<BlockNode*> BA : Blocks) {
+ MachineBasicBlock *MB = BA.Addr->getCode();
+ RefMap &LON = PhiLON[MB];
+ for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG))
+ for (const RefMap::value_type &S : RealUseMap[P.Id])
+ LON[S.first].insert(S.second.begin(), S.second.end());
+ }
+
+ if (Trace) {
+ dbgs() << "Phi live-on-entry map:\n";
+ for (auto &I : PhiLON)
+ dbgs() << "block #" << I.first->getNumber() << " -> "
+ << Print<RefMap>(I.second, DFG) << '\n';
+ }
+
+ // Build the phi live-on-exit map. Each phi node has some set of reached
+ // "real" uses. Propagate this set backwards into the block predecessors
+ // through the reaching defs of the corresponding phi uses.
+ for (NodeAddr<BlockNode*> BA : Blocks) {
+ NodeList Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
+ for (NodeAddr<PhiNode*> PA : Phis) {
+ RefMap &RUs = RealUseMap[PA.Id];
+ if (RUs.empty())
+ continue;
+
+ for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+ NodeAddr<PhiUseNode*> PUA = U;
+ if (PUA.Addr->getReachingDef() == 0)
+ continue;
+
+ // Mark all reached "real" uses of P as live on exit in the
+ // predecessor.
+ // Remap all the RUs so that they have a correct reaching def.
+ auto PrA = DFG.addr<BlockNode*>(PUA.Addr->getPredecessor());
+ RefMap &LOX = PhiLOX[PrA.Addr->getCode()];
+
+ RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(PUA));
+ for (const std::pair<RegisterId,NodeRefSet> &T : RUs) {
+ // Check if T.first aliases UR?
+ LaneBitmask M;
+ for (std::pair<NodeId,LaneBitmask> P : T.second)
+ M |= P.second;
+
+ RegisterRef S = DFG.restrictRef(RegisterRef(T.first, M), UR);
+ if (!S)
+ continue;
+ for (NodeAddr<DefNode*> D : getAllReachingDefs(S, PUA))
+ LOX[S.Reg].insert({D.Id, S.Mask});
+ }
+ } // for U : phi uses
+ } // for P : Phis
+ } // for B : Blocks
+
+ if (Trace) {
+ dbgs() << "Phi live-on-exit map:\n";
+ for (auto &I : PhiLOX)
+ dbgs() << "block #" << I.first->getNumber() << " -> "
+ << Print<RefMap>(I.second, DFG) << '\n';
+ }
+
+ RefMap LiveIn;
+ traverse(&MF.front(), LiveIn);
+
+ // Add function live-ins to the live-in set of the function entry block.
+ auto &EntryIn = LiveMap[&MF.front()];
+ for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I)
+ EntryIn.insert(RegisterRef(I->first));
+
+ if (Trace) {
+ // Dump the liveness map
+ for (MachineBasicBlock &B : MF) {
+ std::vector<RegisterRef> LV;
+ for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I)
+ LV.push_back(RegisterRef(I->PhysReg, I->LaneMask));
+ std::sort(LV.begin(), LV.end());
+ dbgs() << "BB#" << B.getNumber() << "\t rec = {";
+ for (auto I : LV)
+ dbgs() << ' ' << Print<RegisterRef>(I, DFG);
+ dbgs() << " }\n";
+ //dbgs() << "\tcomp = " << Print<RegisterAggr>(LiveMap[&B], DFG) << '\n';
+
+ LV.clear();
+ for (std::pair<RegisterId,LaneBitmask> P : LiveMap[&B]) {
+ MCSubRegIndexIterator S(P.first, &TRI);
+ if (!S.isValid()) {
+ LV.push_back(RegisterRef(P.first));
+ continue;
+ }
+ do {
+ LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
+ if ((M & P.second).any())
+ LV.push_back(RegisterRef(S.getSubReg()));
+ ++S;
+ } while (S.isValid());
+ }
+ std::sort(LV.begin(), LV.end());
+ dbgs() << "\tcomp = {";
+ for (auto I : LV)
+ dbgs() << ' ' << Print<RegisterRef>(I, DFG);
+ dbgs() << " }\n";
+
+ }
+ }
+}
+
+
+// Replace the live-in list of every block in the function with the contents
+// of the live map held by this object.
+void Liveness::resetLiveIns() {
+  for (auto &MBB : DFG.getMF()) {
+    // Collect the currently recorded live-in registers first and remove them
+    // in a second pass (presumably because removeLiveIn modifies the list
+    // that is being walked).
+    std::vector<unsigned> Stale;
+    auto LI = MBB.livein_begin(), LE = MBB.livein_end();
+    while (LI != LE) {
+      Stale.push_back(LI->PhysReg);
+      ++LI;
+    }
+    for (unsigned Reg : Stale)
+      MBB.removeLiveIn(Reg);
+
+    // Install the (register, lane mask) pairs stored in the live map.
+    for (auto Entry : LiveMap[&MBB])
+      MBB.addLiveIn({MCPhysReg(Entry.first), Entry.second});
+  }
+}
+
+
+// Recompute the kill flags in every basic block of the function.
+void Liveness::resetKills() {
+  MachineFunction &Func = DFG.getMF();
+  for (MachineBasicBlock &MBB : Func)
+    resetKills(&MBB);
+}
+
+
+// Recompute the kill flags on the physical-register operands of block B.
+//
+// The block is scanned bottom-up while a bit vector of live registers is
+// maintained.  The vector is seeded from the live-in lists of B's successors,
+// so those lists must already be up to date.  For every non-debug
+// instruction all kill flags are cleared, explicit defs end the liveness of
+// the defined register and its sub-registers, and each (non-undef) use whose
+// register has no live alias is marked as a kill.
+void Liveness::resetKills(MachineBasicBlock *B) {
+  // Set the bit in LV for every register in the live-in list of MBB.  When
+  // the register has sub-registers, only the sub-registers whose lane mask
+  // overlaps the live-in lane mask are set.
+  auto CopyLiveIns = [this] (MachineBasicBlock *MBB, BitVector &LV) -> void {
+    for (auto I : MBB->liveins()) {
+      MCSubRegIndexIterator S(I.PhysReg, &TRI);
+      if (!S.isValid()) {
+        LV.set(I.PhysReg);
+        continue;
+      }
+      do {
+        LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
+        if ((M & I.LaneMask).any())
+          LV.set(S.getSubReg());
+        ++S;
+      } while (S.isValid());
+    }
+  };
+
+  // Registers live on exit from B: the union of the successors' live-ins.
+  BitVector Live(TRI.getNumRegs());
+  for (auto SI : B->successors())
+    CopyLiveIns(SI, Live);
+
+  for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) {
+    MachineInstr *MI = &*I;
+    if (MI->isDebugValue())
+      continue;
+
+    MI->clearKillInfo();
+    for (auto &Op : MI->operands()) {
+      // An implicit def of a super-register may not necessarily start a
+      // live range of it, since an implicit use could be used to keep parts
+      // of it live. Instead of analyzing the implicit operands, ignore
+      // implicit defs.
+      if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+        continue;
+      unsigned R = Op.getReg();
+      if (!TargetRegisterInfo::isPhysicalRegister(R))
+        continue;
+      for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
+        Live.reset(*SR);
+    }
+    for (auto &Op : MI->operands()) {
+      // An undef use does not read the register: it can neither kill it
+      // nor make it live above this instruction.
+      if (!Op.isReg() || !Op.isUse() || Op.isUndef())
+        continue;
+      unsigned R = Op.getReg();
+      if (!TargetRegisterInfo::isPhysicalRegister(R))
+        continue;
+      bool IsLive = false;
+      for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) {
+        if (Live[*AR]) {
+          IsLive = true;
+          break;
+        }
+      }
+      if (!IsLive)
+        Op.setIsKill(true);
+      // The use reads R and all of its sub-registers, so they are live above
+      // this instruction whether or not this operand is a kill.  Skipping
+      // this when some alias was already live would let an earlier use of a
+      // sub-register be flagged as a kill even though R is read here.
+      for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
+        Live.set(*SR);
+    }
+  }
+}
+
+
+// Return the register reference to use for the use node RA.  For an ordinary
+// use this is the node's own register reference; for a use carrying the
+// Shadow flag it is the register reference of the use's reaching def.
+RegisterRef Liveness::getRestrictedRegRef(NodeAddr<RefNode*> RA) const {
+  assert(DFG.IsRef<NodeAttrs::Use>(RA));
+  const bool IsShadow = (RA.Addr->getFlags() & NodeAttrs::Shadow) != 0;
+  if (!IsShadow)
+    return RA.Addr->getRegRef(DFG);
+
+  // A shadow use is expected to always have a reaching def.
+  NodeId ReachingId = RA.Addr->getReachingDef();
+  assert(ReachingId);
+  NodeAddr<RefNode*> DefA = DFG.addr<DefNode*>(ReachingId);
+  return DefA.Addr->getRegRef(DFG);
+}
+
+
+// Look up the basic block recorded in NBMap for the node with id RN.
+// Every id passed in must already be present in the map; a missing id is
+// treated as unreachable.
+MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const {
+  auto Entry = NBMap.find(RN);
+  if (Entry == NBMap.end())
+    llvm_unreachable("Node id not in map");
+  return Entry->second;
+}
+
+
+void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
+ // Process B and, recursively, its children in the dominator tree. On
+ // return, the LiveIn map contains, for each (physical) register, the set
+ // of reaching defs of that register that are live on entry to B.
+
+ // The summary of the traversal algorithm:
+ //
+ // R is live-in in B, if there exists a U(R), such that rdef(R) dom B
+ // and (U \in IDF(B) or B dom U).
+ //
+ // for (C : children) {
+ // LU = {}
+ // traverse(C, LU)
+ // LiveUses += LU
+ // }
+ //
+ // LiveUses -= Defs(B);
+ // LiveUses += UpwardExposedUses(B);
+ // for (C : IIDF[B])
+ // for (U : LiveUses)
+ // if (Rdef(U) dom C)
+ // C.addLiveIn(U)
+ //
+
+ // Recurse into B's dominator-tree children first and merge their live-ins.
+ MachineDomTreeNode *N = MDT.getNode(B);
+ for (auto I : *N) {
+ RefMap L;
+ MachineBasicBlock *SB = I->getBlock();
+ traverse(SB, L);
+
+ for (auto S : L)
+ LiveIn[S.first].insert(S.second.begin(), S.second.end());
+ }
+
+ if (Trace) {
+ dbgs() << "\n-- BB#" << B->getNumber() << ": " << __func__
+ << " after recursion into: {";
+ for (auto I : *N)
+ dbgs() << ' ' << I->getBlock()->getNumber();
+ dbgs() << " }\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+ }
+
+ // Add reaching defs of phi uses that are live on exit from this block.
+ RefMap &PUs = PhiLOX[B];
+ for (auto &S : PUs)
+ LiveIn[S.first].insert(S.second.begin(), S.second.end());
+
+ if (Trace) {
+ dbgs() << "after LOX\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+ }
+
+ // The LiveIn map at this point has all defs that are live-on-exit from B,
+ // as if they were live-on-entry to B. First, we need to filter out all
+ // defs that are present in this block. Then we will add reaching defs of
+ // all upward-exposed uses.
+
+ // To filter out the defs, first make a copy of LiveIn, and then re-populate
+ // LiveIn with the defs that should remain.
+ RefMap LiveInCopy = LiveIn;
+ LiveIn.clear();
+
+ for (const std::pair<RegisterId,NodeRefSet> &LE : LiveInCopy) {
+ RegisterRef LRef(LE.first);
+ NodeRefSet &NewDefs = LiveIn[LRef.Reg]; // To be filled.
+ const NodeRefSet &OldDefs = LE.second;
+ for (NodeRef OR : OldDefs) {
+ // OR.first is the id of a def node that was live-on-exit.
+ auto DA = DFG.addr<DefNode*>(OR.first);
+ NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG);
+ NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
+ if (B != BA.Addr->getCode()) {
+ // Defs from a different block need to be preserved. Defs from this
+ // block will need to be processed further, except for phi defs, the
+ // liveness of which is handled through the PhiLON/PhiLOX maps.
+ NewDefs.insert(OR);
+ continue;
+ }
+
+ // Defs from this block need to stop the liveness from being
+ // propagated upwards. This only applies to non-preserving defs,
+ // and to the parts of the register actually covered by those defs.
+ // (Note that phi defs should always be preserving.)
+ RegisterAggr RRs(TRI);
+ LRef.Mask = OR.second;
+
+ if (!DFG.IsPreservingDef(DA)) {
+ assert(!(IA.Addr->getFlags() & NodeAttrs::Phi));
+ // DA is a non-phi def that is live-on-exit from this block, and
+ // that is also located in this block. LRef is a register ref
+ // whose use this def reaches. If DA covers LRef, then no part
+ // of LRef is exposed upwards.
+ if (RRs.insert(DA.Addr->getRegRef(DFG)).hasCoverOf(LRef))
+ continue;
+ }
+
+ // DA itself was not sufficient to cover LRef. In general, it is
+ // the last in a chain of aliased defs before the exit from this block.
+ // There could be other defs in this block that are a part of that
+ // chain. Check that now: accumulate the registers from these defs,
+ // and if they all together cover LRef, it is not live-on-entry.
+ for (NodeAddr<DefNode*> TA : getAllReachingDefs(DA)) {
+ // DefNode -> InstrNode -> BlockNode.
+ NodeAddr<InstrNode*> ITA = TA.Addr->getOwner(DFG);
+ NodeAddr<BlockNode*> BTA = ITA.Addr->getOwner(DFG);
+ // Reaching defs are ordered in the upward direction.
+ if (BTA.Addr->getCode() != B) {
+ // We have reached past the beginning of B, and the accumulated
+ // registers are not covering LRef. The first def from the
+ // upward chain will be live.
+ // Subtract all accumulated defs (RRs) from LRef.
+ RegisterAggr L(TRI);
+ L.insert(LRef).clear(RRs);
+ assert(!L.empty());
+ NewDefs.insert({TA.Id,L.begin()->second});
+ break;
+ }
+
+ // TA is in B. Only add this def to the accumulated cover if it is
+ // not preserving.
+ if (!(TA.Addr->getFlags() & NodeAttrs::Preserving))
+ RRs.insert(TA.Addr->getRegRef(DFG));
+ // If this is enough to cover LRef, then stop.
+ if (RRs.hasCoverOf(LRef))
+ break;
+ }
+ }
+ }
+
+ emptify(LiveIn); // Drop registers whose def set ended up empty.
+
+ if (Trace) {
+ dbgs() << "after defs in block\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+ }
+
+ // Scan the block for upward-exposed uses and add them to the tracking set.
+ for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) {
+ NodeAddr<InstrNode*> IA = I;
+ if (IA.Addr->getKind() != NodeAttrs::Stmt) // Only statement nodes.
+ continue;
+ for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
+ if (UA.Addr->getFlags() & NodeAttrs::Undef) // Skip undef uses.
+ continue;
+ RegisterRef RR = DFG.normalizeRef(UA.Addr->getRegRef(DFG));
+ for (NodeAddr<DefNode*> D : getAllReachingDefs(UA))
+ if (getBlockWithRef(D.Id) != B) // Only defs located outside of B.
+ LiveIn[RR.Reg].insert({D.Id,RR.Mask});
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "after uses in block\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+ }
+
+ // Phi uses should not be propagated up the dominator tree, since they
+ // are not dominated by their corresponding reaching defs.
+ RegisterAggr &Local = LiveMap[B];
+ RefMap &LON = PhiLON[B];
+ for (auto &R : LON) {
+ LaneBitmask M;
+ for (auto P : R.second)
+ M |= P.second;
+ Local.insert(RegisterRef(R.first,M)); // M: union of all lane masks for R.
+ }
+
+ if (Trace) {
+ dbgs() << "after phi uses in block\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(Local, DFG) << '\n';
+ }
+
+ for (auto C : IIDF[B]) { // Propagate the tracked defs to blocks in IIDF[B].
+ RegisterAggr &LiveC = LiveMap[C];
+ for (const std::pair<RegisterId,NodeRefSet> &S : LiveIn)
+ for (auto R : S.second)
+ if (MDT.properlyDominates(getBlockWithRef(R.first), C)) // Def's block.
+ LiveC.insert(RegisterRef(S.first, R.second));
+ }
+}
+
+
+// Erase from M every entry whose node set is empty.
+void Liveness::emptify(RefMap &M) {
+  auto It = M.begin();
+  while (It != M.end()) {
+    if (It->second.empty())
+      It = M.erase(It);   // erase returns the iterator past the removed entry
+    else
+      ++It;
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
new file mode 100644
index 000000000000..c88396f36bbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
@@ -0,0 +1,134 @@
+//===--- RDFLiveness.h ----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Recalculate the liveness information given a data flow graph.
+// This includes block live-ins and kill flags.
+
+#ifndef RDF_LIVENESS_H
+#define RDF_LIVENESS_H
+
+#include "RDFGraph.h"
+#include "llvm/ADT/DenseMap.h"
+#include <map>
+
+using namespace llvm;
+
+namespace llvm {
+ class MachineBasicBlock;
+ class MachineFunction;
+ class MachineRegisterInfo;
+ class TargetRegisterInfo;
+ class MachineDominatorTree;
+ class MachineDominanceFrontier;
+
+namespace rdf {
+ struct Liveness {
+ public:
+   // A std::map from blocks to register aggregates.  RegisterAggr is built
+   // from a TargetRegisterInfo, so operator[] seeds a missing entry with a
+   // copy of a pre-built empty aggregate instead of default-constructing it.
+   struct LiveMapType {
+     LiveMapType(const TargetRegisterInfo &tri) : Empty(tri) {}
+
+     RegisterAggr &operator[] (MachineBasicBlock *B) {
+       auto Inserted = Map.emplace(B, Empty);
+       return Inserted.first->second;
+     }
+   private:
+     RegisterAggr Empty;
+     std::map<MachineBasicBlock*,RegisterAggr> Map;
+   };
+
+   // A node id paired with a lane mask.
+   using NodeRef = std::pair<NodeId,LaneBitmask>;
+   using NodeRefSet = std::set<NodeRef>;
+   // Keys (RegisterId) of a RefMap must be normalized.
+   using RefMap = std::map<RegisterId,NodeRefSet>;
+
+   // Bind the analysis to the data flow graph G; the register info, the
+   // dominator tree and the dominance frontier are all taken from G.
+   // Tracing is off by default.
+   Liveness(MachineRegisterInfo &RegInfo, const DataFlowGraph &G)
+     : DFG(G), TRI(G.getTRI()), MDT(G.getDT()), MDF(G.getDF()),
+       MRI(RegInfo), LiveMap(G.getTRI()), Empty(), NoRegs(G.getTRI()),
+       Trace(false) {}
+
+   // Reaching-def queries.  The shorter overloads forward to the first one
+   // with FullChain = false and an empty register aggregate; the
+   // single-argument form uses the register reference of RefA itself.
+   NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+       bool FullChain, const RegisterAggr &DefRRs);
+   NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA) {
+     RegisterRef OwnRR = RefA.Addr->getRegRef(DFG);
+     return getAllReachingDefs(OwnRR, RefA, false, NoRegs);
+   }
+   NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA) {
+     return getAllReachingDefs(RefRR, RefA, false, NoRegs);
+   }
+   NodeSet getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+       NodeSet &Visited, const NodeSet &Defs);
+
+   // Reached-use queries.  The two-argument form forwards with an empty
+   // register aggregate.
+   NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA,
+       const RegisterAggr &DefRRs);
+   NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA) {
+     return getAllReachedUses(RefRR, DefA, NoRegs);
+   }
+
+   LiveMapType &getLiveMap() { return LiveMap; }
+   const LiveMapType &getLiveMap() const { return LiveMap; }
+
+   // Return the RealUseMap entry for phi node P, or a shared empty map when
+   // no entry exists.
+   const RefMap &getRealUses(NodeId P) const {
+     auto Found = RealUseMap.find(P);
+     if (Found == RealUseMap.end())
+       return Empty;
+     return Found->second;
+   }
+
+   void computePhiInfo();
+   void computeLiveIns();
+   void resetLiveIns();
+   void resetKills();
+   void resetKills(MachineBasicBlock *B);
+
+   // Enable or disable debug tracing.
+   void trace(bool T) { Trace = T; }
+
+ private:
+   const DataFlowGraph &DFG;
+   const TargetRegisterInfo &TRI;
+   const MachineDominatorTree &MDT;
+   const MachineDominanceFrontier &MDF;
+   MachineRegisterInfo &MRI;
+   LiveMapType LiveMap;
+   const RefMap Empty;          // Returned by getRealUses for unknown phis.
+   const RegisterAggr NoRegs;   // Default argument for the query overloads.
+   bool Trace;
+
+   // Cache mapping node ids (for RefNodes) to their containing basic
+   // blocks.  Looking the block up here instead of recomputing it for each
+   // node reduces the liveness calculation time by a large fraction.
+   using NodeBlockMap = DenseMap<NodeId,MachineBasicBlock*>;
+   NodeBlockMap NBMap;
+
+   // Phi information:
+   //
+   // RealUseMap
+   // map: NodeId -> (map: RegisterId -> NodeRefSet)
+   // phi id -> (map: register -> set of reached non-phi uses)
+   std::map<NodeId, RefMap> RealUseMap;
+
+   // Inverse iterated dominance frontier.
+   std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF;
+
+   // Live on entry.
+   std::map<MachineBasicBlock*,RefMap> PhiLON;
+
+   // Phi uses are treated as if they were located at the end of the block
+   // they are associated with.  The reaching def of a phi use dominates the
+   // block that the use corresponds to, but not the block that contains
+   // the phi itself.  To include these uses in the liveness propagation (up
+   // the dominator tree), keep a map: block -> set of uses live on exit.
+   std::map<MachineBasicBlock*,RefMap> PhiLOX;
+
+   bool isRestrictedToRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+       RegisterRef RR) const;
+   RegisterRef getRestrictedRegRef(NodeAddr<RefNode*> RA) const;
+   MachineBasicBlock *getBlockWithRef(NodeId RN) const;
+   void traverse(MachineBasicBlock *B, RefMap &LiveIn);
+   void emptify(RefMap &M);
+ };
+} // namespace rdf
+} // namespace llvm
+
+#endif // RDF_LIVENESS_H
diff --git a/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
new file mode 100644
index 000000000000..0554646bb6be
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- HexagonTargetInfo.cpp - Hexagon Target Implementation ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+// Return the Target object for Hexagon.  The object is a function-local
+// static, so every call yields a reference to the same instance.
+Target &llvm::getTheHexagonTarget() {
+  static Target HexagonTargetInstance;
+  return HexagonTargetInstance;
+}
+
+// C-linkage entry point: registers the Hexagon Target object for
+// Triple::hexagon (without JIT support), passing the strings "hexagon" and
+// "Hexagon" to the registration helper.
+extern "C" void LLVMInitializeHexagonTargetInfo() {
+  Target &HexagonTarget = getTheHexagonTarget();
+  RegisterTarget<Triple::hexagon, /*HasJIT=*/false> Registration(
+      HexagonTarget, "hexagon", "Hexagon");
+}
diff --git a/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
new file mode 100644
index 000000000000..903f92a04431
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -0,0 +1,1213 @@
+//===-- LanaiAsmParser.cpp - Parse Lanai assembly to MCInst instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+namespace llvm {
+namespace {
+struct LanaiOperand;
+
+class LanaiAsmParser : public MCTargetAsmParser {
+ // Operand parsers; each returns a newly allocated LanaiOperand.
+ std::unique_ptr<LanaiOperand> parseRegister();
+
+ std::unique_ptr<LanaiOperand> parseImmediate();
+
+ std::unique_ptr<LanaiOperand> parseIdentifier();
+
+ unsigned parseAluOperator(bool PreOp, bool PostOp);
+
+ // Split the mnemonic, stripping the condition code and qualifiers.
+ StringRef splitMnemonic(StringRef Name, SMLoc NameLoc,
+ OperandVector *Operands);
+
+ bool parsePrePost(StringRef Type, int *OffsetValue);
+
+ // Overrides of the MCTargetAsmParser interface.
+ bool ParseDirective(AsmToken DirectiveID) override;
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+
+ bool ParseRegister(unsigned &RegNum, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+ bool MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+
+// Auto-generated instruction matching functions (declarations are pulled in
+// from the generated matcher include below).
+#define GET_ASSEMBLER_HEADER
+#include "LanaiGenAsmMatcher.inc"
+
+ OperandMatchResultTy parseOperand(OperandVector *Operands,
+ StringRef Mnemonic);
+
+ OperandMatchResultTy parseMemoryOperand(OperandVector &Operands);
+
+public:
+ LanaiAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, STI), Parser(Parser),
+ Lexer(Parser.getLexer()), SubtargetInfo(STI) {
+ setAvailableFeatures( // Derived from the subtarget's feature bits.
+ ComputeAvailableFeatures(SubtargetInfo.getFeatureBits()));
+ }
+
+private:
+ MCAsmParser &Parser; // The generic parser passed to the constructor.
+ MCAsmLexer &Lexer; // Parser.getLexer(), cached at construction.
+ // The subtarget info passed to the constructor.
+ const MCSubtargetInfo &SubtargetInfo;
+};
+
+// Auto-generated by TableGen
+static unsigned MatchRegisterName(llvm::StringRef Name);
+
+// LanaiOperand - Instances of this class represented a parsed machine
+// instruction
+struct LanaiOperand : public MCParsedAsmOperand {
+ enum KindTy {
+ TOKEN,
+ REGISTER,
+ IMMEDIATE,
+ MEMORY_IMM,
+ MEMORY_REG_IMM,
+ MEMORY_REG_REG,
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+
+ struct Token {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegOp {
+ unsigned RegNum;
+ };
+
+ struct ImmOp {
+ const MCExpr *Value;
+ };
+
+ struct MemOp {
+ unsigned BaseReg;
+ unsigned OffsetReg;
+ unsigned AluOp;
+ const MCExpr *Offset;
+ };
+
+ union {
+ struct Token Tok;
+ struct RegOp Reg;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ };
+
+ explicit LanaiOperand(KindTy Kind) : MCParsedAsmOperand(), Kind(Kind) {}
+
+public:
+ // The functions below are used by the autogenerated ASM matcher and hence to
+ // be of the form expected.
+
+ // getStartLoc - Gets location of the first token of this operand
+ SMLoc getStartLoc() const override { return StartLoc; }
+
+ // getEndLoc - Gets location of the last token of this operand
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ unsigned getReg() const override {
+ assert(isReg() && "Invalid type access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert(isImm() && "Invalid type access!");
+ return Imm.Value;
+ }
+
+ StringRef getToken() const {
+ assert(isToken() && "Invalid type access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getMemBaseReg() const {
+ assert(isMem() && "Invalid type access!");
+ return Mem.BaseReg;
+ }
+
+ unsigned getMemOffsetReg() const {
+ assert(isMem() && "Invalid type access!");
+ return Mem.OffsetReg;
+ }
+
+ const MCExpr *getMemOffset() const {
+ assert(isMem() && "Invalid type access!");
+ return Mem.Offset;
+ }
+
+ unsigned getMemOp() const {
+ assert(isMem() && "Invalid type access!");
+ return Mem.AluOp;
+ }
+
+ // Functions for testing operand type
+ bool isReg() const override { return Kind == REGISTER; }
+
+ bool isImm() const override { return Kind == IMMEDIATE; }
+
+ bool isMem() const override {
+ return isMemImm() || isMemRegImm() || isMemRegReg();
+ }
+
+ bool isMemImm() const { return Kind == MEMORY_IMM; }
+
+ bool isMemRegImm() const { return Kind == MEMORY_REG_IMM; }
+
+ bool isMemRegReg() const { return Kind == MEMORY_REG_REG; }
+
+ bool isMemSpls() const { return isMemRegImm() || isMemRegReg(); }
+
+ bool isToken() const override { return Kind == TOKEN; }
+
+ bool isBrImm() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (!MCE)
+ return true;
+ int64_t Value = MCE->getValue();
+ // Check if value fits in 25 bits with 2 least significant bits 0.
+ return isShiftedUInt<23, 2>(static_cast<int32_t>(Value));
+ }
+
+ bool isBrTarget() { return isBrImm() || isToken(); }
+
+ bool isCallTarget() { return isImm() || isToken(); }
+
+ bool isHiImm16() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+ int64_t Value = ConstExpr->getValue();
+ return Value != 0 && isShiftedUInt<16, 16>(Value);
+ }
+
+ // Symbolic reference expression
+ if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI;
+
+ // Binary expression
+ if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
+ if (const LanaiMCExpr *SymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI;
+
+ return false;
+ }
+
+ bool isHiImm16And() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (ConstExpr) {
+ int64_t Value = ConstExpr->getValue();
+ // Check if in the form 0xXYZWffff
+ return (Value != 0) && ((Value & ~0xffff0000) == 0xffff);
+ }
+ return false;
+ }
+
+ bool isLoImm16() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+ int64_t Value = ConstExpr->getValue();
+ // Check if value fits in 16 bits
+ return isUInt<16>(static_cast<int32_t>(Value));
+ }
+
+ // Symbolic reference expression
+ if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+ // Binary expression
+ if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
+ if (const LanaiMCExpr *SymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+ return false;
+ }
+
+ bool isLoImm16Signed() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+ int64_t Value = ConstExpr->getValue();
+ // Check if value fits in 16 bits or value of the form 0xffffxyzw
+ return isInt<16>(static_cast<int32_t>(Value));
+ }
+
+ // Symbolic reference expression
+ if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+ // Binary expression
+ if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
+ if (const LanaiMCExpr *SymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+ return false;
+ }
+
+ bool isLoImm16And() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (ConstExpr) {
+ int64_t Value = ConstExpr->getValue();
+ // Check if in the form 0xffffXYZW
+ return ((Value & ~0xffff) == 0xffff0000);
+ }
+ return false;
+ }
+
+ bool isImmShift() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (!ConstExpr)
+ return false;
+ int64_t Value = ConstExpr->getValue();
+ return (Value >= -31) && (Value <= 31);
+ }
+
+ bool isLoImm21() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+ int64_t Value = ConstExpr->getValue();
+ return isUInt<21>(Value);
+ }
+
+ // Symbolic reference expression
+ if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None;
+ if (const MCSymbolRefExpr *SymbolRefExpr =
+ dyn_cast<MCSymbolRefExpr>(Imm.Value)) {
+ return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
+ }
+
+ // Binary expression
+ if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value)) {
+ if (const LanaiMCExpr *SymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None;
+ if (const MCSymbolRefExpr *SymbolRefExpr =
+ dyn_cast<MCSymbolRefExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
+ }
+
+ return false;
+ }
+
+ bool isImm10() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (!ConstExpr)
+ return false;
+ int64_t Value = ConstExpr->getValue();
+ return isInt<10>(Value);
+ }
+
+ bool isCondCode() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (!ConstExpr)
+ return false;
+ uint64_t Value = ConstExpr->getValue();
+ // The condition codes are between 0 (ICC_T) and 15 (ICC_LE). If the
+ // unsigned value of the immediate is less than LPCC::UNKNOWN (16) then
+ // value corresponds to a valid condition code.
+ return Value < LPCC::UNKNOWN;
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates where possible. Null MCExpr = 0
+ if (Expr == nullptr)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(
+ MCOperand::createImm(static_cast<int32_t>(ConstExpr->getValue())));
+ else
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addBrTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addCallTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addMemImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCExpr *Expr = getMemOffset();
+ addExpr(Inst, Expr);
+ }
+
+ void addMemRegImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ const MCExpr *Expr = getMemOffset();
+ addExpr(Inst, Expr);
+ Inst.addOperand(MCOperand::createImm(getMemOp()));
+ }
+
+ void addMemRegRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ assert(getMemOffsetReg() != 0 && "Invalid offset");
+ Inst.addOperand(MCOperand::createReg(getMemOffsetReg()));
+ Inst.addOperand(MCOperand::createImm(getMemOp()));
+ }
+
+ void addMemSplsOperands(MCInst &Inst, unsigned N) const {
+ if (isMemRegImm())
+ addMemRegImmOperands(Inst, N);
+ if (isMemRegReg())
+ addMemRegRegOperands(Inst, N);
+ }
+
+ void addImmShiftOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addImm10Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addLoImm16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+ Inst.addOperand(
+ MCOperand::createImm(static_cast<int32_t>(ConstExpr->getValue())));
+ else if (isa<LanaiMCExpr>(getImm())) {
+#ifndef NDEBUG
+ const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
+ assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO);
+#endif
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ } else if (isa<MCBinaryExpr>(getImm())) {
+#ifndef NDEBUG
+ const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
+ assert(dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+ LanaiMCExpr::VK_Lanai_ABS_LO);
+#endif
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ } else
+ assert(false && "Operand type not supported.");
+ }
+
+ void addLoImm16AndOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+ Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() & 0xffff));
+ else
+ assert(false && "Operand type not supported.");
+ }
+
+ // Appends the "high 16" immediate of this operand to Inst.
+ //  - A constant is emitted as an immediate shifted right by 16 bits.
+ //  - A LanaiMCExpr, or a binary expression, is emitted unchanged as an
+ //    expression operand; debug builds assert that it (or the left-hand side
+ //    of the binary expression) carries the VK_Lanai_ABS_HI variant.
+ // Any other expression kind trips an assertion and emits nothing.
+ void addHiImm16Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCExpr *Expr = getImm();
+
+   if (const auto *Const = dyn_cast<MCConstantExpr>(Expr)) {
+     Inst.addOperand(MCOperand::createImm(Const->getValue() >> 16));
+     return;
+   }
+
+   if (isa<LanaiMCExpr>(Expr)) {
+#ifndef NDEBUG
+     const auto *Sym = dyn_cast<LanaiMCExpr>(Expr);
+     assert(Sym->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI);
+#endif
+     Inst.addOperand(MCOperand::createExpr(Expr));
+     return;
+   }
+
+   if (isa<MCBinaryExpr>(Expr)) {
+#ifndef NDEBUG
+     const auto *Bin = dyn_cast<MCBinaryExpr>(Expr);
+     const auto *LHS = dyn_cast<LanaiMCExpr>(Bin->getLHS());
+     assert(LHS && LHS->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI);
+#endif
+     Inst.addOperand(MCOperand::createExpr(Expr));
+     return;
+   }
+
+   assert(false && "Operand type not supported.");
+ }
+
+ // Appends a constant immediate shifted right by 16 bits to Inst. Only
+ // constant expressions are handled; anything else trips an assertion and
+ // emits nothing.
+ void addHiImm16AndOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const auto *Const = dyn_cast<MCConstantExpr>(getImm());
+   if (!Const) {
+     assert(false && "Operand type not supported.");
+     return;
+   }
+   Inst.addOperand(MCOperand::createImm(Const->getValue() >> 16));
+ }
+
+ // Appends the 21-bit immediate of this operand to Inst.
+ //  - A constant is emitted as an immediate masked with 0x1fffff.
+ //  - A LanaiMCExpr, an MCSymbolRefExpr or a binary expression is emitted
+ //    unchanged as an expression operand; debug builds assert that the
+ //    expression (or the left-hand side of the binary expression) has no
+ //    variant kind.
+ // Any other expression kind trips an assertion and emits nothing.
+ void addLoImm21Operands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const MCExpr *Expr = getImm();
+
+   if (const auto *Const = dyn_cast<MCConstantExpr>(Expr)) {
+     Inst.addOperand(MCOperand::createImm(Const->getValue() & 0x1fffff));
+     return;
+   }
+
+   if (isa<LanaiMCExpr>(Expr)) {
+#ifndef NDEBUG
+     const auto *Sym = dyn_cast<LanaiMCExpr>(Expr);
+     assert(Sym && Sym->getKind() == LanaiMCExpr::VK_Lanai_None);
+#endif
+     Inst.addOperand(MCOperand::createExpr(Expr));
+     return;
+   }
+
+   if (isa<MCSymbolRefExpr>(Expr)) {
+#ifndef NDEBUG
+     const auto *Sym = dyn_cast<MCSymbolRefExpr>(Expr);
+     assert(Sym && Sym->getKind() == MCSymbolRefExpr::VK_None);
+#endif
+     Inst.addOperand(MCOperand::createExpr(Expr));
+     return;
+   }
+
+   if (isa<MCBinaryExpr>(Expr)) {
+#ifndef NDEBUG
+     const auto *Bin = dyn_cast<MCBinaryExpr>(Expr);
+     const auto *LHS = dyn_cast<LanaiMCExpr>(Bin->getLHS());
+     assert(LHS && LHS->getKind() == LanaiMCExpr::VK_Lanai_None);
+#endif
+     Inst.addOperand(MCOperand::createExpr(Expr));
+     return;
+   }
+
+   assert(false && "Operand type not supported.");
+ }
+
+ // Writes a one-line, human-readable description of this operand to OS.
+ // The text depends on the operand kind; every line ends with '\n'.
+ void print(raw_ostream &OS) const override {
+   switch (Kind) {
+   case IMMEDIATE:
+     // Dereference the expression so that its text is printed rather than
+     // the pointer value, matching how the memory offsets are printed below.
+     OS << "Imm: " << *getImm() << "\n";
+     break;
+   case TOKEN:
+     OS << "Token: " << getToken() << "\n";
+     break;
+   case REGISTER:
+     OS << "Reg: %r" << getReg() << "\n";
+     break;
+   case MEMORY_IMM:
+     OS << "MemImm: " << *getMemOffset() << "\n";
+     break;
+   case MEMORY_REG_IMM:
+     OS << "MemRegImm: " << getMemBaseReg() << "+" << *getMemOffset() << "\n";
+     break;
+   case MEMORY_REG_REG:
+     // A register+register reference carries no immediate offset.
+     assert(getMemOffset() == nullptr);
+     OS << "MemRegReg: " << getMemBaseReg() << "+"
+        << "%r" << getMemOffsetReg() << "\n";
+     break;
+   }
+ }
+
+ // Builds a TOKEN operand that refers to Str. Only the pointer and length
+ // are stored, not a copy of the characters. Both source locations are set
+ // to Start.
+ static std::unique_ptr<LanaiOperand> CreateToken(StringRef Str, SMLoc Start) {
+   auto Token = make_unique<LanaiOperand>(TOKEN);
+   Token->Tok.Data = Str.data();
+   Token->Tok.Length = Str.size();
+   Token->StartLoc = Start;
+   Token->EndLoc = Start;
+   return Token;
+ }
+
+ // Builds a REGISTER operand for register number RegNum covering the source
+ // range [Start, End].
+ static std::unique_ptr<LanaiOperand> createReg(unsigned RegNum, SMLoc Start,
+                                                SMLoc End) {
+   auto RegOp = make_unique<LanaiOperand>(REGISTER);
+   RegOp->Reg.RegNum = RegNum;
+   RegOp->StartLoc = Start;
+   RegOp->EndLoc = End;
+   return RegOp;
+ }
+
+ // Builds an IMMEDIATE operand holding the expression Value and covering the
+ // source range [Start, End].
+ static std::unique_ptr<LanaiOperand> createImm(const MCExpr *Value,
+                                                SMLoc Start, SMLoc End) {
+   auto ImmOp = make_unique<LanaiOperand>(IMMEDIATE);
+   ImmOp->Imm.Value = Value;
+   ImmOp->StartLoc = Start;
+   ImmOp->EndLoc = End;
+   return ImmOp;
+ }
+
+ // Converts an immediate operand in place into a MEMORY_IMM operand whose
+ // offset is the former immediate. Base and offset registers are cleared and
+ // the ALU operator is set to ADD. The immediate is read before any memory
+ // field is written.
+ static std::unique_ptr<LanaiOperand>
+ MorphToMemImm(std::unique_ptr<LanaiOperand> Op) {
+   const MCExpr *Address = Op->getImm();
+   Op->Kind = MEMORY_IMM;
+   Op->Mem.BaseReg = 0;
+   Op->Mem.AluOp = LPAC::ADD;
+   Op->Mem.OffsetReg = 0;
+   Op->Mem.Offset = Address;
+   return Op;
+ }
+
+ // Converts a register operand in place into a MEMORY_REG_REG operand: the
+ // former register becomes the offset register, BaseReg the base, AluOp the
+ // combining operator, and the immediate offset is cleared. The register is
+ // read before any memory field is written.
+ static std::unique_ptr<LanaiOperand>
+ MorphToMemRegReg(unsigned BaseReg, std::unique_ptr<LanaiOperand> Op,
+                  unsigned AluOp) {
+   const unsigned IndexReg = Op->getReg();
+   Op->Kind = MEMORY_REG_REG;
+   Op->Mem.BaseReg = BaseReg;
+   Op->Mem.AluOp = AluOp;
+   Op->Mem.OffsetReg = IndexReg;
+   Op->Mem.Offset = nullptr;
+   return Op;
+ }
+
+ // Converts an immediate operand in place into a MEMORY_REG_IMM operand: the
+ // former immediate becomes the offset, BaseReg the base, AluOp the
+ // combining operator, and the offset register is cleared. The immediate is
+ // read before any memory field is written.
+ static std::unique_ptr<LanaiOperand>
+ MorphToMemRegImm(unsigned BaseReg, std::unique_ptr<LanaiOperand> Op,
+                  unsigned AluOp) {
+   const MCExpr *Displacement = Op->getImm();
+   Op->Kind = MEMORY_REG_IMM;
+   Op->Mem.BaseReg = BaseReg;
+   Op->Mem.AluOp = AluOp;
+   Op->Mem.OffsetReg = 0;
+   Op->Mem.Offset = Displacement;
+   return Op;
+ }
+};
+
+// No target-specific directives are handled here: always returns true.
+bool LanaiAsmParser::ParseDirective(AsmToken /*DirectiveId*/) {
+  return true;
+}
+
+// Matches the parsed Operands against the generated instruction matcher and,
+// on success, emits the resulting instruction to Out and stores its opcode in
+// Opcode.
+//
+// Returns false on success. On failure a diagnostic is reported and the
+// result of Error() is returned. For an invalid operand the diagnostic is
+// placed at the offending operand when ErrorInfo identifies one, and at IdLoc
+// otherwise.
+bool LanaiAsmParser::MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode,
+                                             OperandVector &Operands,
+                                             MCStreamer &Out,
+                                             uint64_t &ErrorInfo,
+                                             bool MatchingInlineAsm) {
+  MCInst Inst;
+  SMLoc ErrorLoc;
+
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+  case Match_Success:
+    Out.EmitInstruction(Inst, SubtargetInfo);
+    Opcode = Inst.getOpcode();
+    return false;
+  case Match_MissingFeature:
+    return Error(IdLoc, "Instruction use requires option to be enabled");
+  case Match_MnemonicFail:
+    return Error(IdLoc, "Unrecognized instruction mnemonic");
+  case Match_InvalidOperand: {
+    ErrorLoc = IdLoc;
+    // ErrorInfo is 64 bits wide, so the "no operand index" sentinel must be
+    // compared as a 64-bit all-ones value. ~0U would be zero-extended to
+    // 0x00000000ffffffff and could never equal it, which made the sentinel
+    // fall into the "too few operands" path below.
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IdLoc, "Too few operands for instruction");
+
+      ErrorLoc =
+          static_cast<LanaiOperand &>(*Operands[ErrorInfo]).getStartLoc();
+      // Fall back to the mnemonic location if the operand has none.
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IdLoc;
+    }
+    return Error(ErrorLoc, "Invalid operand for instruction");
+  }
+  default:
+    break;
+  }
+
+  llvm_unreachable("Unknown match type detected!");
+}
+
+// Parses a register operand at the current token.
+//
+// Both '%rN' and 'rN' are parsed as valid registers. This was done to remain
+// backwards compatible with GCC and the different ways inline assembly is
+// handled.
+// TODO: see if there isn't a better way to do this.
+//
+// Returns the register operand, or a null pointer if the current token(s) do
+// not name a register. Note that a leading '%' is consumed even when no
+// register follows it.
+std::unique_ptr<LanaiOperand> LanaiAsmParser::parseRegister() {
+  SMLoc Start = Parser.getTok().getLoc();
+  SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  // Eat the '%'.
+  if (Lexer.getKind() == AsmToken::Percent)
+    Parser.Lex();
+
+  if (Lexer.getKind() != AsmToken::Identifier)
+    return nullptr;
+
+  // MatchRegisterName yields 0 for an identifier that is not a register.
+  const unsigned RegNum = MatchRegisterName(Lexer.getTok().getIdentifier());
+  if (RegNum == 0)
+    return nullptr;
+
+  Parser.Lex(); // Eat identifier token
+  return LanaiOperand::createReg(RegNum, Start, End);
+}
+
+// Attempts to parse a register at the current token. StartLoc and EndLoc are
+// set from the current token before parsing. On success RegNum receives the
+// register and false is returned; on failure RegNum is left untouched and
+// true is returned.
+bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc,
+                                   SMLoc &EndLoc) {
+  const AsmToken &Current = getParser().getTok();
+  StartLoc = Current.getLoc();
+  EndLoc = Current.getEndLoc();
+
+  std::unique_ptr<LanaiOperand> Parsed = parseRegister();
+  if (Parsed == nullptr)
+    return true;
+
+  RegNum = Parsed->getReg();
+  return false;
+}
+
+// Parses a symbolic immediate of one of the forms
+//   sym            sym + expr
+//   hi(sym)        hi(sym + expr)
+//   lo(sym)        lo(sym + expr)
+// where 'hi' / 'lo' (case-insensitive) select the VK_Lanai_ABS_HI /
+// VK_Lanai_ABS_LO variant.
+//
+// Returns an immediate operand wrapping a LanaiMCExpr for the symbol (nested
+// in an addition if a right-hand side was parsed), or a null pointer if the
+// current token is not an identifier or parsing fails.
+std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
+  SMLoc Start = Parser.getTok().getLoc();
+  SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  const MCExpr *RHS = nullptr;
+  LanaiMCExpr::VariantKind Kind = LanaiMCExpr::VK_Lanai_None;
+
+  if (Lexer.getKind() != AsmToken::Identifier)
+    return nullptr;
+
+  StringRef Identifier;
+  if (Parser.parseIdentifier(Identifier))
+    return nullptr;
+
+  // Check if identifier has a modifier
+  if (Identifier.equals_lower("hi"))
+    Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+  else if (Identifier.equals_lower("lo"))
+    Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+
+  // If the identifier corresponds to a variant then extract the real
+  // identifier.
+  if (Kind != LanaiMCExpr::VK_Lanai_None) {
+    if (Lexer.getKind() != AsmToken::LParen) {
+      Error(Lexer.getLoc(), "Expected '('");
+      return nullptr;
+    }
+    Lexer.Lex(); // lex '('
+
+    // Parse identifier
+    if (Parser.parseIdentifier(Identifier))
+      return nullptr;
+  }
+
+  // If addition parse the RHS.
+  if (Lexer.getKind() == AsmToken::Plus && Parser.parseExpression(RHS))
+    return nullptr;
+
+  // For variants parse the final ')'
+  if (Kind != LanaiMCExpr::VK_Lanai_None) {
+    if (Lexer.getKind() != AsmToken::RParen) {
+      Error(Lexer.getLoc(), "Expected ')'");
+      return nullptr;
+    }
+    Lexer.Lex(); // lex ')'
+  }
+
+  End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
+  const MCExpr *Res = LanaiMCExpr::create(Kind, Expr, getContext());
+
+  // Nest if this was an addition
+  if (RHS)
+    Res = MCBinaryExpr::createAdd(Res, RHS, getContext());
+
+  return LanaiOperand::createImm(Res, Start, End);
+}
+
+// Parses an immediate operand at the current token. Identifiers are handled
+// by parseIdentifier(); tokens that can start a numeric expression ('+',
+// '-', an integer or '.') are handled by the generic expression parser.
+//
+// Returns the immediate operand, or a null pointer if the token cannot start
+// an immediate or the expression fails to parse.
+std::unique_ptr<LanaiOperand> LanaiAsmParser::parseImmediate() {
+  SMLoc Start = Parser.getTok().getLoc();
+  SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  switch (Lexer.getKind()) {
+  case AsmToken::Identifier:
+    return parseIdentifier();
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::Dot: {
+    const MCExpr *ExprVal = nullptr;
+    // parseExpression returns true on failure. Return explicitly rather
+    // than relying on an implicit fallthrough into the default case.
+    if (Parser.parseExpression(ExprVal))
+      return nullptr;
+    return LanaiOperand::createImm(ExprVal, Start, End);
+  }
+  default:
+    return nullptr;
+  }
+}
+
+// Tags AluCode as a pre-operation or post-operation ALU code. PreOp takes
+// precedence over PostOp; with neither set the code is returned unchanged.
+static unsigned AluWithPrePost(unsigned AluCode, bool PreOp, bool PostOp) {
+  if (!PreOp && !PostOp)
+    return AluCode;
+  return PreOp ? LPAC::makePreOp(AluCode) : LPAC::makePostOp(AluCode);
+}
+
+// Parses an identifier and converts it to a Lanai ALU code. If the identifier
+// is not a known ALU operator an error is reported and 0 is returned. The
+// PreOp and PostOp parameters are not used by this function, and the result
+// of parseIdentifier() is not checked.
+unsigned LanaiAsmParser::parseAluOperator(bool PreOp, bool PostOp) {
+  StringRef OperatorName;
+  Parser.parseIdentifier(OperatorName);
+
+  const unsigned Code = LPAC::stringToLanaiAluCode(OperatorName);
+  if (Code != LPAC::UNKNOWN)
+    return Code;
+
+  Error(Parser.getTok().getLoc(), "Can't parse ALU operator");
+  return 0;
+}
+
+// Maps a mnemonic to an access size: 2 if it ends in ".h", 1 if it ends in
+// ".b", and 4 otherwise.
+static int SizeForSuffix(StringRef T) {
+  if (T.endswith(".h"))
+    return 2;
+  if (T.endswith(".b"))
+    return 1;
+  return 4;
+}
+
+// Recognises a pre/post modification marker at the current position.
+//  - '--' or '++' (the current and the next token are of the same kind):
+//    both tokens are consumed and *OffsetValue is set to minus/plus the
+//    access size implied by Type.
+//  - '*': the token is consumed and *OffsetValue is left unchanged.
+// Returns true if a marker was consumed. A doubled token that is neither
+// '-' nor '+' consumes nothing and returns false.
+bool LanaiAsmParser::parsePrePost(StringRef Type, int *OffsetValue) {
+  const bool IsDoubledToken =
+      Lexer.getKind() == Lexer.peekTok(true).getKind();
+
+  if (IsDoubledToken) {
+    if (Lexer.is(AsmToken::Minus))
+      *OffsetValue = -SizeForSuffix(Type);
+    else if (Lexer.is(AsmToken::Plus))
+      *OffsetValue = SizeForSuffix(Type);
+    else
+      return false;
+
+    // Eat the '-' '-' or '+' '+'
+    Parser.Lex();
+    Parser.Lex();
+    return true;
+  }
+
+  if (Lexer.is(AsmToken::Star)) {
+    Parser.Lex(); // Eat the '*'
+    return true;
+  }
+
+  return false;
+}
+
+// Decides whether a memory immediate should use the SLS encoding. This is the
+// case when the immediate is
+//  - a constant that is a multiple of 4 and lies in [0, 0x1fffff], or
+//  - a LanaiMCExpr with no variant kind, or
+//  - a binary expression whose left-hand side is such a LanaiMCExpr.
+bool shouldBeSls(const LanaiOperand &Op) {
+  const MCExpr *Expr = Op.getImm();
+
+  if (const auto *Const = dyn_cast<MCConstantExpr>(Expr)) {
+    const int64_t Address = Const->getValue();
+    return (Address % 4 == 0) && (Address >= 0) && (Address <= 0x1fffff);
+  }
+
+  if (const auto *Sym = dyn_cast<LanaiMCExpr>(Expr))
+    return Sym->getKind() == LanaiMCExpr::VK_Lanai_None;
+
+  if (const auto *Bin = dyn_cast<MCBinaryExpr>(Expr)) {
+    const auto *LHS = dyn_cast<LanaiMCExpr>(Bin->getLHS());
+    return LHS && LHS->getKind() == LanaiMCExpr::VK_Lanai_None;
+  }
+
+  return false;
+}
+
+// Custom parser for memory operands.
+//
+// Returns MatchOperand_Success when an operand was pushed onto Operands,
+// MatchOperand_NoMatch when nothing at the current position could be parsed,
+// and MatchOperand_ParseFail after reporting an error.
+OperandMatchResultTy
+LanaiAsmParser::parseMemoryOperand(OperandVector &Operands) {
+  // Try to match a memory operand.
+  // The memory operands are of the form:
+  //  (1)  Register|Immediate|'' '[' '*'? Register '*'? ']' or
+  //                            ^
+  //  (2)  '[' '*'? Register '*'? AluOperator Register ']'
+  //      ^
+  //  (3)  '[' '--'|'++' Register '--'|'++' ']'
+  //
+  //  (4) '[' Immediate ']' (for SLS)
+
+  // Store the type for use in parsing pre/post increment/decrement operators
+  StringRef Type;
+  if (Operands[0]->isToken())
+    Type = static_cast<LanaiOperand *>(Operands[0].get())->getToken();
+
+  // Use 0 if no offset given
+  int OffsetValue = 0;
+  unsigned BaseReg = 0;
+  unsigned AluOp = LPAC::ADD;
+  bool PostOp = false, PreOp = false;
+
+  // Try to parse the offset
+  std::unique_ptr<LanaiOperand> Op = parseRegister();
+  if (!Op)
+    Op = parseImmediate();
+
+  // Only continue if next token is '['
+  if (Lexer.isNot(AsmToken::LBrac)) {
+    if (!Op)
+      return MatchOperand_NoMatch;
+
+    // The start of this custom parsing overlaps with register/immediate so
+    // consider this as a successful match of an operand of that type as the
+    // token stream can't be rewound to allow them to match separately.
+    Operands.push_back(std::move(Op));
+    return MatchOperand_Success;
+  }
+
+  Parser.Lex(); // Eat the '['.
+  std::unique_ptr<LanaiOperand> Offset = nullptr;
+  if (Op)
+    Offset.swap(Op);
+
+  // Determine if a pre operation
+  PreOp = parsePrePost(Type, &OffsetValue);
+
+  Op = parseRegister();
+  if (!Op) {
+    if (!Offset) {
+      if ((Op = parseImmediate()) && Lexer.is(AsmToken::RBrac)) {
+        Parser.Lex(); // Eat the ']'
+
+        // Memory address operations aligned to word boundary are encoded as
+        // SLS, the rest as RM.
+        if (shouldBeSls(*Op)) {
+          Operands.push_back(LanaiOperand::MorphToMemImm(std::move(Op)));
+        } else {
+          if (!Op->isLoImm16Signed()) {
+            Error(Parser.getTok().getLoc(),
+                  "Memory address is not word "
+                  "aligned and larger than class RM can handle");
+            return MatchOperand_ParseFail;
+          }
+          Operands.push_back(LanaiOperand::MorphToMemRegImm(
+              Lanai::R0, std::move(Op), LPAC::ADD));
+        }
+        return MatchOperand_Success;
+      }
+    }
+
+    Error(Parser.getTok().getLoc(),
+          "Unknown operand, expected register or immediate");
+    return MatchOperand_ParseFail;
+  }
+  BaseReg = Op->getReg();
+
+  // Determine if a post operation
+  if (!PreOp)
+    PostOp = parsePrePost(Type, &OffsetValue);
+
+  // If ] match form (1) else match form (2)
+  if (Lexer.is(AsmToken::RBrac)) {
+    Parser.Lex(); // Eat the ']'.
+    if (!Offset) {
+      SMLoc Start = Parser.getTok().getLoc();
+      SMLoc End =
+          SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      const MCConstantExpr *OffsetConstExpr =
+          MCConstantExpr::create(OffsetValue, getContext());
+      Offset = LanaiOperand::createImm(OffsetConstExpr, Start, End);
+    }
+  } else {
+    if (Offset || OffsetValue != 0) {
+      Error(Parser.getTok().getLoc(), "Expected ']'");
+      return MatchOperand_ParseFail;
+    }
+
+    // Parse operator
+    AluOp = parseAluOperator(PreOp, PostOp);
+
+    // Second form requires offset register
+    Offset = parseRegister();
+    if (!BaseReg || Lexer.isNot(AsmToken::RBrac)) {
+      Error(Parser.getTok().getLoc(), "Expected ']'");
+      return MatchOperand_ParseFail;
+    }
+    // parseRegister() yields null when no register follows the operator
+    // (e.g. "[r1 add ]"). Reject that here; otherwise the null Offset would
+    // be dereferenced below.
+    if (!Offset) {
+      Error(Parser.getTok().getLoc(), "Expected offset register");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex(); // Eat the ']'.
+  }
+
+  // First form has addition as operator. Add pre- or post-op indicator as
+  // needed.
+  AluOp = AluWithPrePost(AluOp, PreOp, PostOp);
+
+  // Ensure immediate offset is not too large
+  if (Offset->isImm() && !Offset->isLoImm16Signed()) {
+    Error(Parser.getTok().getLoc(),
+          "Memory address is not word "
+          "aligned and larger than class RM can handle");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(
+      Offset->isImm()
+          ? LanaiOperand::MorphToMemRegImm(BaseReg, std::move(Offset), AluOp)
+          : LanaiOperand::MorphToMemRegReg(BaseReg, std::move(Offset), AluOp));
+
+  return MatchOperand_Success;
+}
+
+// Parses one operand and appends it to Operands.
+//
+// The generated custom operand parsers are tried first. If they report no
+// match, the operand is parsed as a register and then as an immediate.
+// Returns MatchOperand_Success when an operand was appended. On any failure
+// the rest of the statement is skipped and MatchOperand_ParseFail is
+// returned.
+OperandMatchResultTy
+LanaiAsmParser::parseOperand(OperandVector *Operands, StringRef Mnemonic) {
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  const OperandMatchResultTy Custom =
+      MatchOperandParserImpl(*Operands, Mnemonic);
+  switch (Custom) {
+  case MatchOperand_Success:
+    return Custom;
+  case MatchOperand_ParseFail:
+    Parser.eatToEndOfStatement();
+    return Custom;
+  default:
+    break;
+  }
+
+  // Generic path: a register first, then an immediate.
+  std::unique_ptr<LanaiOperand> Parsed = parseRegister();
+  if (!Parsed)
+    Parsed = parseImmediate();
+
+  // If the token could not be parsed then fail
+  if (!Parsed) {
+    Error(Parser.getTok().getLoc(), "Unknown operand");
+    Parser.eatToEndOfStatement();
+    return MatchOperand_ParseFail;
+  }
+
+  Operands->push_back(std::move(Parsed));
+  return MatchOperand_Success;
+}
+
+// Split the mnemonic into ASM operand, conditional code and instruction
+// qualifier (half-word, byte).
+//
+// The pieces are appended to Operands as a token for the base mnemonic,
+// optionally followed by an immediate holding the condition code and/or a
+// ".r" token. The returned StringRef is the base mnemonic that was pushed as
+// the first token.
+StringRef LanaiAsmParser::splitMnemonic(StringRef Name, SMLoc NameLoc,
+ OperandVector *Operands) {
+ // Position of the first '.' in the full name (npos if there is none).
+ size_t Next = Name.find('.');
+
+ StringRef Mnemonic = Name;
+
+ // A trailing ".r" is stripped here and re-added below as a separate token.
+ bool IsBRR = false;
+ if (Name.endswith(".r")) {
+ Mnemonic = Name.substr(0, Name.size() - 2);
+ IsBRR = true;
+ }
+
+ // Match b?? and s?? (BR, BRR, and SCC instruction classes).
+ // NOTE(review): Mnemonic[0] assumes a non-empty name - confirm callers
+ // never pass an empty string.
+ if (Mnemonic[0] == 'b' ||
+ (Mnemonic[0] == 's' && !Mnemonic.startswith("sel") &&
+ !Mnemonic.startswith("st"))) {
+ // Parse instructions with a conditional code. For example, 'bne' is
+ // converted into two operands 'b' and 'ne'.
+ // NOTE(review): Next is an index into Name but is passed as the second
+ // argument of substr, which is presumably a length - confirm this is
+ // intended.
+ LPCC::CondCode CondCode =
+ LPCC::suffixToLanaiCondCode(Mnemonic.substr(1, Next));
+ if (CondCode != LPCC::UNKNOWN) {
+ Mnemonic = Mnemonic.slice(0, 1);
+ Operands->push_back(LanaiOperand::CreateToken(Mnemonic, NameLoc));
+ Operands->push_back(LanaiOperand::createImm(
+ MCConstantExpr::create(CondCode, getContext()), NameLoc, NameLoc));
+ if (IsBRR) {
+ Operands->push_back(LanaiOperand::CreateToken(".r", NameLoc));
+ }
+ return Mnemonic;
+ }
+ }
+
+ // Parse other instructions with condition codes (RR instructions).
+ // We ignore .f here and assume they are flag-setting operations, not
+ // conditional codes (except for select instructions where flag-setting
+ // variants are not yet implemented).
+ if (Mnemonic.startswith("sel") ||
+ (!Mnemonic.endswith(".f") && !Mnemonic.startswith("st"))) {
+ LPCC::CondCode CondCode = LPCC::suffixToLanaiCondCode(Mnemonic);
+ if (CondCode != LPCC::UNKNOWN) {
+ // This local Next shadows the outer one: it is the position of the last
+ // '.' in Mnemonic.
+ size_t Next = Mnemonic.rfind('.', Name.size());
+ // 'sel' doesn't use a predicate operand whose printer adds the period,
+ // but instead has the period as part of the identifier (i.e., 'sel.' is
+ // expected by the generated matcher). If the mnemonic starts with 'sel'
+ // then include the period as part of the mnemonic, else don't include it
+ // as part of the mnemonic.
+ if (Mnemonic.startswith("sel")) {
+ Mnemonic = Mnemonic.substr(0, Next + 1);
+ } else {
+ Mnemonic = Mnemonic.substr(0, Next);
+ }
+ Operands->push_back(LanaiOperand::CreateToken(Mnemonic, NameLoc));
+ Operands->push_back(LanaiOperand::createImm(
+ MCConstantExpr::create(CondCode, getContext()), NameLoc, NameLoc));
+ return Mnemonic;
+ }
+ }
+
+ // No condition code recognised: push the mnemonic as a single token (minus
+ // any ".r", which becomes its own token).
+ Operands->push_back(LanaiOperand::CreateToken(Mnemonic, NameLoc));
+ if (IsBRR) {
+ Operands->push_back(LanaiOperand::CreateToken(".r", NameLoc));
+ }
+
+ return Mnemonic;
+}
+
+// Detects a memory operation that modifies its base register while also
+// using that same register as the destination.
+//
+// Two operand layouts are recognised, the second being the first shifted by
+// one leading token:
+//   Token, Reg, Imm, Imm, Reg
+//   Token, Token, Reg, Imm, Imm, Reg
+// Returns true only when the operands match one of these layouts, the ALU
+// operand is a constant for which LPAC::modifiesOp() is true, and the base
+// and destination registers are equal.
+bool IsMemoryAssignmentError(const OperandVector &Operands) {
+  // Detects if a memory operation has an erroneous base register modification.
+  // Memory operations are detected by matching the types of operands.
+  //
+  // TODO: This test is focussed on one specific instance (ld/st).
+  // Extend it to handle more cases or be more robust.
+  bool Modifies = false;
+
+  int Offset = 0;
+
+  if (Operands.size() < 5)
+    return false;
+  else if (Operands[0]->isToken() && Operands[1]->isReg() &&
+           Operands[2]->isImm() && Operands[3]->isImm() && Operands[4]->isReg())
+    Offset = 0;
+  // The shifted layout reads Operands[5], so it needs at least six operands;
+  // without this check a five-element vector would be indexed out of bounds.
+  else if (Operands.size() >= 6 && Operands[0]->isToken() &&
+           Operands[1]->isToken() && Operands[2]->isReg() &&
+           Operands[3]->isImm() && Operands[4]->isImm() &&
+           Operands[5]->isReg())
+    Offset = 1;
+  else
+    return false;
+
+  int PossibleAluOpIdx = Offset + 3;
+  int PossibleBaseIdx = Offset + 1;
+  int PossibleDestIdx = Offset + 4;
+  if (LanaiOperand *PossibleAluOp =
+          static_cast<LanaiOperand *>(Operands[PossibleAluOpIdx].get()))
+    if (PossibleAluOp->isImm())
+      if (const MCConstantExpr *ConstExpr =
+              dyn_cast<MCConstantExpr>(PossibleAluOp->getImm()))
+        Modifies = LPAC::modifiesOp(ConstExpr->getValue());
+  return Modifies && Operands[PossibleBaseIdx]->isReg() &&
+         Operands[PossibleDestIdx]->isReg() &&
+         Operands[PossibleBaseIdx]->getReg() ==
+             Operands[PossibleDestIdx]->getReg();
+}
+
+// Returns true if the parsed operand is a Lanai register operand.
+static bool IsRegister(const MCParsedAsmOperand &op) {
+  const auto &LanaiOp = static_cast<const LanaiOperand &>(op);
+  return LanaiOp.isReg();
+}
+
+// Returns true if the operands look like an instruction that may carry a
+// predicate: at least four operands, registers in positions 1 and 2, and a
+// mnemonic token starting with one of the listed arithmetic/logic prefixes.
+static bool MaybePredicatedInst(const OperandVector &Operands) {
+  const bool HasRegisterPair = Operands.size() >= 4 &&
+                               IsRegister(*Operands[1]) &&
+                               IsRegister(*Operands[2]);
+  if (!HasRegisterPair)
+    return false;
+
+  const StringRef Mnemonic =
+      static_cast<const LanaiOperand &>(*Operands[0]).getToken();
+  return StringSwitch<bool>(Mnemonic)
+      .StartsWith("addc", true)
+      .StartsWith("add", true)
+      .StartsWith("and", true)
+      .StartsWith("sh", true)
+      .StartsWith("subb", true)
+      .StartsWith("sub", true)
+      .StartsWith("or", true)
+      .StartsWith("xor", true)
+      .Default(false);
+}
+
+// Parses one instruction statement into Operands: the mnemonic is split into
+// its tokens by splitMnemonic(), then the comma-separated operands are parsed.
+// Afterwards a few operand-list rewrites are applied (see the comments below)
+// so that the list has the shape the generated matcher expects.
+// Returns false on success and true if parsing failed.
+bool LanaiAsmParser::ParseInstruction(ParseInstructionInfo & /*Info*/,
+ StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
+ // First operand is token for instruction
+ StringRef Mnemonic = splitMnemonic(Name, NameLoc, &Operands);
+
+ // If there are no more operands, then finish
+ if (Lexer.is(AsmToken::EndOfStatement))
+ return false;
+
+ // Parse first operand
+ if (parseOperand(&Operands, Mnemonic) != MatchOperand_Success)
+ return true;
+
+ // If it is a st instruction with exactly one operand then it is a
+ // "store true".
+ // Transform <"st"> to <"s">, <LPCC:ICC_T>
+ if (Lexer.is(AsmToken::EndOfStatement) && Name == "st" &&
+ Operands.size() == 2) {
+ Operands.erase(Operands.begin(), Operands.begin() + 1);
+ Operands.insert(Operands.begin(), LanaiOperand::CreateToken("s", NameLoc));
+ Operands.insert(Operands.begin() + 1,
+ LanaiOperand::createImm(
+ MCConstantExpr::create(LPCC::ICC_T, getContext()),
+ NameLoc, NameLoc));
+ }
+
+ // If the instruction is a bt instruction with 1 operand (in assembly) then it
+ // is an unconditional branch instruction and the first two elements of
+ // operands need to be merged.
+ if (Lexer.is(AsmToken::EndOfStatement) && Name.startswith("bt") &&
+ Operands.size() == 3) {
+ Operands.erase(Operands.begin(), Operands.begin() + 2);
+ Operands.insert(Operands.begin(), LanaiOperand::CreateToken("bt", NameLoc));
+ }
+
+ // Parse until end of statement, consuming commas between operands
+ while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.is(AsmToken::Comma)) {
+ // Consume comma token
+ Lex();
+
+ // Parse next operand
+ if (parseOperand(&Operands, Mnemonic) != MatchOperand_Success)
+ return true;
+ }
+
+ // Reject loads/stores whose destination register is also a base register
+ // that the instruction modifies.
+ if (IsMemoryAssignmentError(Operands)) {
+ Error(Parser.getTok().getLoc(),
+ "the destination register can't equal the base register in an "
+ "instruction that modifies the base register.");
+ return true;
+ }
+
+ // Insert always true operand for instruction that may be predicated but
+ // are not. Currently the autogenerated parser always expects a predicate.
+ if (MaybePredicatedInst(Operands)) {
+ Operands.insert(Operands.begin() + 1,
+ LanaiOperand::createImm(
+ MCConstantExpr::create(LPCC::ICC_T, getContext()),
+ NameLoc, NameLoc));
+ }
+
+ return false;
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "LanaiGenAsmMatcher.inc"
+} // namespace
+
+// C entry point that registers LanaiAsmParser as the assembly parser for the
+// Lanai target.
+extern "C" void LLVMInitializeLanaiAsmParser() {
+  RegisterMCAsmParser<LanaiAsmParser> Registration(getTheLanaiTarget());
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
new file mode 100644
index 000000000000..609b650e5d32
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
@@ -0,0 +1,240 @@
+//===- LanaiDisassembler.cpp - Disassembler for Lanai -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Lanai Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiDisassembler.h"
+
+#include "Lanai.h"
+#include "LanaiSubtarget.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace llvm {
+Target &getTheLanaiTarget();
+}
+
+// Factory passed to the target registry: allocates a new LanaiDisassembler
+// for the given subtarget and context and returns it as a raw pointer. The
+// target argument is unused.
+static MCDisassembler *createLanaiDisassembler(const Target & /*T*/,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  MCDisassembler *Disassembler = new LanaiDisassembler(STI, Ctx);
+  return Disassembler;
+}
+
+// C entry point that registers the Lanai disassembler factory with the
+// target registry.
+extern "C" void LLVMInitializeLanaiDisassembler() {
+  Target &LanaiTarget = getTheLanaiTarget();
+  TargetRegistry::RegisterMCDisassembler(LanaiTarget, createLanaiDisassembler);
+}
+
+// Forwards the subtarget info and context to the MCDisassembler base class.
+LanaiDisassembler::LanaiDisassembler(const MCSubtargetInfo &STI,
+                                     MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {
+}
+
+// Forward declare because the autogenerated code will reference this.
+// Definition is further down.
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeBranch(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+#include "LanaiGenDisassemblerTables.inc"
+
+// Reads one 32-bit big-endian instruction word from the start of Bytes into
+// Insn.
+//
+// Returns MCDisassembler::Fail and sets Size to 0 if fewer than four bytes
+// are available; otherwise returns MCDisassembler::Success. Size is not
+// modified on success.
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t &Size,
+                                      uint32_t &Insn) {
+  // We want to read exactly 4 bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as big-endian 32-bit word in the stream.
+  // Widen each byte to uint32_t before shifting: a uint8_t operand would be
+  // promoted to (signed) int, and shifting a byte >= 0x80 left by 24 would
+  // overflow it.
+  Insn = (static_cast<uint32_t>(Bytes[0]) << 24) |
+         (static_cast<uint32_t>(Bytes[1]) << 16) |
+         (static_cast<uint32_t>(Bytes[2]) << 8) |
+         (static_cast<uint32_t>(Bytes[3]) << 0);
+
+  return MCDisassembler::Success;
+}
+
+// Post-processes a decoded instruction of the RM, SPLS or RRM classes: derives
+// the ALU operator (including its pre/post-op flag) from the raw encoding and
+// appends it to Instr as an extra immediate operand. Instructions of other
+// classes are left untouched.
+static void PostOperandDecodeAdjust(MCInst &Instr, uint32_t Insn) {
+ unsigned AluOp = LPAC::ADD;
+ // Fix up for pre and post operations.
+ // PqShift is the bit position of the two-bit P/Q field, or -1 if the
+ // instruction class has none.
+ int PqShift = -1;
+ if (isRMOpcode(Instr.getOpcode()))
+ PqShift = 16;
+ else if (isSPLSOpcode(Instr.getOpcode()))
+ PqShift = 10;
+ else if (isRRMOpcode(Instr.getOpcode())) {
+ PqShift = 16;
+ // Determine RRM ALU op.
+ AluOp = (Insn >> 8) & 0x7;
+ if (AluOp == 7)
+ // Handle JJJJJ
+ // 0b10000 or 0b11000
+ // NOTE(review): this folds bits [6:3] of the word into the ALU code;
+ // confirm the bit layout against the Lanai encoding reference.
+ AluOp |= 0x20 | (((Insn >> 3) & 0xf) << 1);
+ }
+
+ if (PqShift != -1) {
+ unsigned PQ = (Insn >> PqShift) & 0x3;
+ switch (PQ) {
+ case 0x0:
+ // PQ == 0: the third operand (index 2) is reset - to R0 if it is a
+ // register, to 0 if it is an immediate.
+ if (Instr.getOperand(2).isReg()) {
+ Instr.getOperand(2).setReg(Lanai::R0);
+ }
+ if (Instr.getOperand(2).isImm())
+ Instr.getOperand(2).setImm(0);
+ break;
+ case 0x1:
+ AluOp = LPAC::makePostOp(AluOp);
+ break;
+ case 0x2:
+ // PQ == 2: the ALU operator is used as-is.
+ break;
+ case 0x3:
+ AluOp = LPAC::makePreOp(AluOp);
+ break;
+ }
+ Instr.addOperand(MCOperand::createImm(AluOp));
+ }
+}
+
+// Decodes one instruction from Bytes into Instr.
+//
+// Returns MCDisassembler::Fail if fewer than four bytes are available or the
+// generated decoder rejects the word. Otherwise the post-decode operand
+// fix-up is applied, Size is set to 4 and the decoder's status is returned.
+DecodeStatus LanaiDisassembler::getInstruction(
+    MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream & /*VStream*/, raw_ostream & /*CStream*/) const {
+  uint32_t Insn;
+  if (readInstruction32(Bytes, Size, Insn) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Call auto-generated decoder function
+  const DecodeStatus Decoded =
+      decodeInstruction(DecoderTableLanai32, Instr, Insn, Address, this, STI);
+  if (Decoded == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  PostOperandDecodeAdjust(Instr, Insn);
+  Size = 4;
+  return Decoded;
+}
+
+// Maps a 5-bit register field (0-31) to the corresponding Lanai register.
+// Several slots hold named registers (PC, SP, FP, RV, RR1, RR2, RCA) instead
+// of a plain Rn name.
+static const unsigned GPRDecoderTable[] = {
+ Lanai::R0, Lanai::R1, Lanai::PC, Lanai::R3, Lanai::SP, Lanai::FP,
+ Lanai::R6, Lanai::R7, Lanai::RV, Lanai::R9, Lanai::RR1, Lanai::RR2,
+ Lanai::R12, Lanai::R13, Lanai::R14, Lanai::RCA, Lanai::R16, Lanai::R17,
+ Lanai::R18, Lanai::R19, Lanai::R20, Lanai::R21, Lanai::R22, Lanai::R23,
+ Lanai::R24, Lanai::R25, Lanai::R26, Lanai::R27, Lanai::R28, Lanai::R29,
+ Lanai::R30, Lanai::R31};
+
+// Appends the general purpose register selected by RegNo to Inst. Register
+// numbers above 31 are rejected with MCDisassembler::Fail.
+DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                    uint64_t /*Address*/,
+                                    const void * /*Decoder*/) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+// Decodes a register+immediate memory operand: a register taken from bits
+// [22:18] of Insn followed by the low 16 bits of Insn sign-extended. Both are
+// appended to Inst. Always succeeds.
+static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  const unsigned BaseIndex = (Insn >> 18) & 0x1f;
+  const unsigned RawOffset = Insn & 0xffff;
+
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[BaseIndex]));
+  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(RawOffset)));
+  return MCDisassembler::Success;
+}
+
+// Decodes a register+register memory operand: two registers taken from bits
+// [19:15] and [14:10] of Insn, appended to Inst in that order. Always
+// succeeds.
+static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  const unsigned FirstIndex = (Insn >> 15) & 0x1f;
+  const unsigned SecondIndex = (Insn >> 10) & 0x1f;
+
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[FirstIndex]));
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[SecondIndex]));
+  return MCDisassembler::Success;
+}
+
+// Decodes an SPLS memory operand: a register taken from bits [16:12] of Insn
+// followed by the low 10 bits of Insn sign-extended. Both are appended to
+// Inst. Always succeeds.
+static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const unsigned BaseIndex = (Insn >> 12) & 0x1f;
+  const unsigned RawOffset = Insn & 0x3ff;
+
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[BaseIndex]));
+  Inst.addOperand(MCOperand::createImm(SignExtend32<10>(RawOffset)));
+  return MCDisassembler::Success;
+}
+
+// Thin adapter: casts the opaque Decoder pointer back to an MCDisassembler
+// and forwards to its tryAddingSymbolicOperand(), reordering the arguments
+// to that method's parameter order. Returns whatever that call returns.
+static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const void *Decoder) {
+  const auto *Disassembler = static_cast<const MCDisassembler *>(Decoder);
+  const bool Added = Disassembler->tryAddingSymbolicOperand(
+      MI, Value, Address, IsBranch, Offset, Width);
+  return Added;
+}
+
+// Decodes a branch target. A symbolic operand for Insn + Address is tried
+// first; if none is added, the raw Insn value is appended as an immediate.
+// Always succeeds.
+static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t Address,
+                                 const void *Decoder) {
+  const bool AddedSymbol = tryAddingSymbolicOperand(
+      Insn + Address, false, Address, 2, 23, MI, Decoder);
+  if (!AddedSymbol)
+    MI.addOperand(MCOperand::createImm(Insn));
+  return MCDisassembler::Success;
+}
+
+// Decodes a shift amount: the low 16 bits of Insn, sign-extended, appended to
+// Inst as an immediate. Always succeeds.
+static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  const unsigned RawAmount = Insn & 0xffff;
+  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(RawAmount)));
+  return MCDisassembler::Success;
+}
+
+// Decodes a predicate (condition code) operand. Values at or above
+// LPCC::UNKNOWN are rejected; any other value is appended to Inst as an
+// immediate.
+static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  const bool IsKnownCondCode = Val < LPCC::UNKNOWN;
+  if (!IsKnownCondCode)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
new file mode 100644
index 000000000000..a317cd88ad63
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
@@ -0,0 +1,41 @@
+//===- LanaiDisassembler.h - Disassembler for Lanai -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Lanai Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
+#define LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
+
+#define DEBUG_TYPE "lanai-disassembler"
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class raw_ostream;
+
+// MCDisassembler implementation for the Lanai target.
+class LanaiDisassembler : public MCDisassembler {
+public:
+  LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx);
+
+  // Nothing to release here; let the compiler generate the destructor.
+  ~LanaiDisassembler() override = default;
+
+  // getInstruction - See MCDisassembler.
+  MCDisassembler::DecodeStatus
+  getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+                 uint64_t Address, raw_ostream &VStream,
+                 raw_ostream &CStream) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
diff --git a/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp b/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp
new file mode 100644
index 000000000000..2fa411fcfd87
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp
@@ -0,0 +1,305 @@
+//===-- LanaiInstPrinter.cpp - Convert Lanai MCInst to asm syntax ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Lanai MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiInstPrinter.h"
+#include "Lanai.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "LanaiGenAsmWriter.inc"
+
+// Writes the register name for RegNo to OS, converted to lower case.
+void LanaiInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  const StringRef Name(getRegisterName(RegNo));
+  OS << Name.lower();
+}
+
+// Prints "<tab>Alias <op0>, <op1>" for the two operands selected by OpNo0 and
+// OpNo1. Always returns true so callers can use it as the tail of an alias
+// match.
+bool LanaiInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                 StringRef Alias, unsigned OpNo0,
+                                 unsigned OpNo1) {
+  OS << "\t";
+  OS << Alias;
+  OS << " ";
+  printOperand(MI, OpNo0, OS);
+
+  OS << ", ";
+  printOperand(MI, OpNo1, OS);
+
+  return true;
+}
+
+// Returns true when operand 3 encodes an ADD and the immediate in operand 2
+// is +AddOffset or -AddOffset.
+static bool usesGivenOffset(const MCInst *MI, int AddOffset) {
+  const unsigned AluCode = MI->getOperand(3).getImm();
+  if (LPAC::encodeLanaiAluCode(AluCode) != LPAC::ADD)
+    return false;
+
+  // Only read the offset operand once the ALU code is known to be ADD.
+  const int64_t Offset = MI->getOperand(2).getImm();
+  return Offset == AddOffset || Offset == -AddOffset;
+}
+
+// True when the ALU code in operand 3 has the pre-op bit set and the
+// instruction adds +/-AddOffset.
+static bool isPreIncrementForm(const MCInst *MI, int AddOffset) {
+  const unsigned AluCode = MI->getOperand(3).getImm();
+  if (!LPAC::isPreOp(AluCode))
+    return false;
+  return usesGivenOffset(MI, AddOffset);
+}
+
+// True when the ALU code in operand 3 has the post-op bit set and the
+// instruction adds +/-AddOffset.
+static bool isPostIncrementForm(const MCInst *MI, int AddOffset) {
+  const unsigned AluCode = MI->getOperand(3).getImm();
+  if (!LPAC::isPostOp(AluCode))
+    return false;
+  return usesGivenOffset(MI, AddOffset);
+}
+
+// Picks the shorthand operator from the sign of the immediate in operand 2:
+// "--" for a negative offset, "++" otherwise.
+static StringRef decIncOperator(const MCInst *MI) {
+  const bool IsDecrement = MI->getOperand(2).getImm() < 0;
+  return IsDecrement ? "--" : "++";
+}
+
+// Prints a load in pre-/post-increment shorthand ("ld [++%rN], %rX" or
+// "ld [%rN++], %rX"). Operand 0 is the destination register and operand 1 the
+// base register. Returns false, printing nothing, when MI is in neither form.
+bool LanaiInstPrinter::printMemoryLoadIncrement(const MCInst *MI,
+                                                raw_ostream &OS,
+                                                StringRef Opcode,
+                                                int AddOffset) {
+  // The pre-increment form takes precedence over the post-increment form.
+  if (isPreIncrementForm(MI, AddOffset)) {
+    const char *Base = getRegisterName(MI->getOperand(1).getReg());
+    const char *Dest = getRegisterName(MI->getOperand(0).getReg());
+    OS << "\t" << Opcode << "\t[" << decIncOperator(MI) << "%" << Base
+       << "], %" << Dest;
+    return true;
+  }
+
+  if (!isPostIncrementForm(MI, AddOffset))
+    return false;
+
+  const char *Base = getRegisterName(MI->getOperand(1).getReg());
+  const char *Dest = getRegisterName(MI->getOperand(0).getReg());
+  OS << "\t" << Opcode << "\t[%" << Base << decIncOperator(MI) << "], %"
+     << Dest;
+  return true;
+}
+
+// Prints a store in pre-/post-increment shorthand ("st %rX, [++%rN]" or
+// "st %rX, [%rN++]"). Operand 0 is the stored register and operand 1 the base
+// register. Returns false, printing nothing, when MI is in neither form.
+bool LanaiInstPrinter::printMemoryStoreIncrement(const MCInst *MI,
+                                                 raw_ostream &OS,
+                                                 StringRef Opcode,
+                                                 int AddOffset) {
+  // The pre-increment form takes precedence over the post-increment form.
+  if (isPreIncrementForm(MI, AddOffset)) {
+    const char *Value = getRegisterName(MI->getOperand(0).getReg());
+    OS << "\t" << Opcode << "\t%" << Value << ", [" << decIncOperator(MI)
+       << "%" << getRegisterName(MI->getOperand(1).getReg()) << "]";
+    return true;
+  }
+
+  if (!isPostIncrementForm(MI, AddOffset))
+    return false;
+
+  const char *Value = getRegisterName(MI->getOperand(0).getReg());
+  OS << "\t" << Opcode << "\t%" << Value << ", [%"
+     << getRegisterName(MI->getOperand(1).getReg()) << decIncOperator(MI)
+     << "]";
+  return true;
+}
+
+// Tries to print MI using the increment/decrement shorthand for
+// register+immediate loads and stores. Returns false when the opcode is not
+// one of the handled memory instructions or MI is not in an increment form.
+bool LanaiInstPrinter::printAlias(const MCInst *MI, raw_ostream &OS) {
+  StringRef Mnemonic;
+  int AccessSize = 0;
+  bool IsLoad = true;
+
+  switch (MI->getOpcode()) {
+  // Loads, e.g. for the word-sized form:
+  //   ld 4[*%rN], %rX  => ld [++imm], %rX
+  //   ld -4[*%rN], %rX => ld [--imm], %rX
+  //   ld 4[%rN*], %rX  => ld [imm++], %rX
+  //   ld -4[%rN*], %rX => ld [imm--], %rX
+  case Lanai::LDW_RI:
+    Mnemonic = "ld";
+    AccessSize = 4;
+    break;
+  case Lanai::LDHs_RI:
+    Mnemonic = "ld.h";
+    AccessSize = 2;
+    break;
+  case Lanai::LDHz_RI:
+    Mnemonic = "uld.h";
+    AccessSize = 2;
+    break;
+  case Lanai::LDBs_RI:
+    Mnemonic = "ld.b";
+    AccessSize = 1;
+    break;
+  case Lanai::LDBz_RI:
+    Mnemonic = "uld.b";
+    AccessSize = 1;
+    break;
+
+  // Stores, e.g. for the word-sized form:
+  //   st %rX, 4[*%rN]  => st %rX, [++imm]
+  //   st %rX, -4[*%rN] => st %rX, [--imm]
+  //   st %rX, 4[%rN*]  => st %rX, [imm++]
+  //   st %rX, -4[%rN*] => st %rX, [imm--]
+  case Lanai::SW_RI:
+    Mnemonic = "st";
+    AccessSize = 4;
+    IsLoad = false;
+    break;
+  case Lanai::STH_RI:
+    Mnemonic = "st.h";
+    AccessSize = 2;
+    IsLoad = false;
+    break;
+  case Lanai::STB_RI:
+    Mnemonic = "st.b";
+    AccessSize = 1;
+    IsLoad = false;
+    break;
+
+  default:
+    return false;
+  }
+
+  if (IsLoad)
+    return printMemoryLoadIncrement(MI, OS, Mnemonic, AccessSize);
+  return printMemoryStoreIncrement(MI, OS, Mnemonic, AccessSize);
+}
+
+// Prints MI, preferring the hand-written increment aliases, then the
+// TableGen-generated aliases, and finally the plain instruction form. The
+// annotation is appended in every case.
+void LanaiInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                 StringRef Annotation,
+                                 const MCSubtargetInfo & /*STI*/) {
+  const bool PrintedAsAlias = printAlias(MI, OS) || printAliasInstr(MI, OS);
+  if (!PrintedAsAlias)
+    printInstruction(MI, OS);
+
+  printAnnotation(OS, Annotation);
+}
+
+// Prints a generic operand: registers as "%name", immediates in hex, and
+// anything else as an MC expression. Modifier must be null or empty.
+void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &OS, const char *Modifier) {
+  assert((!Modifier || Modifier[0] == 0) && "No modifiers supported");
+
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    OS << "%" << getRegisterName(Op.getReg());
+    return;
+  }
+  if (Op.isImm()) {
+    OS << formatHex(Op.getImm());
+    return;
+  }
+
+  assert(Op.isExpr() && "Expected an expression");
+  Op.getExpr()->print(OS, &MAI);
+}
+
+// Prints an absolute memory operand inside square brackets: a hex immediate,
+// or a symbolic expression that is resolved to an immediate later.
+void LanaiInstPrinter::printMemImmOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &OS) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const bool IsImm = Op.isImm();
+  // Validate before emitting anything so nothing is printed for a bad operand.
+  assert((IsImm || Op.isExpr()) && "Expected an expression");
+
+  OS << '[';
+  if (IsImm)
+    OS << formatHex(Op.getImm());
+  else
+    Op.getExpr()->print(OS, &MAI);
+  OS << ']';
+}
+
+// Prints a high-half immediate: the operand value shifted left by 16, in hex.
+// Symbolic operands are printed as expressions.
+void LanaiInstPrinter::printHi16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &OS) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (!Op.isImm()) {
+    // Symbolic operand will be lowered to immediate value by linker
+    assert(Op.isExpr() && "Expected an expression");
+    Op.getExpr()->print(OS, &MAI);
+    return;
+  }
+
+  const int64_t Shifted = Op.getImm() << 16;
+  OS << formatHex(Shifted);
+}
+
+// Prints the mask for an AND with a high-half immediate: the operand shifted
+// left by 16 with the low 16 bits all set, in hex. Symbolic operands are
+// printed as expressions.
+void LanaiInstPrinter::printHi16AndImmOperand(const MCInst *MI, unsigned OpNo,
+                                              raw_ostream &OS) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (!Op.isImm()) {
+    // Symbolic operand will be lowered to immediate value by linker
+    assert(Op.isExpr() && "Expected an expression");
+    Op.getExpr()->print(OS, &MAI);
+    return;
+  }
+
+  const int64_t Mask = (Op.getImm() << 16) | 0xffff;
+  OS << formatHex(Mask);
+}
+
+// Prints the mask for an AND with a low-half immediate: the operand with bits
+// 16-31 all set, in hex. Symbolic operands are printed as expressions.
+void LanaiInstPrinter::printLo16AndImmOperand(const MCInst *MI, unsigned OpNo,
+                                              raw_ostream &OS) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (!Op.isImm()) {
+    // Symbolic operand will be lowered to immediate value by linker
+    assert(Op.isExpr() && "Expected an expression");
+    Op.getExpr()->print(OS, &MAI);
+    return;
+  }
+
+  const int64_t Mask = 0xffff0000 | Op.getImm();
+  OS << formatHex(Mask);
+}
+
+// Prints the bracketed base register of a memory operand. A '*' before the
+// register marks a pre-op ALU code and a '*' after it marks a post-op one.
+static void printMemoryBaseRegister(raw_ostream &OS, const unsigned AluCode,
+                                    const MCOperand &RegOp) {
+  assert(RegOp.isReg() && "Register operand expected");
+
+  const char *PreMark = LPAC::isPreOp(AluCode) ? "*" : "";
+  const char *PostMark = LPAC::isPostOp(AluCode) ? "*" : "";
+
+  OS << "[" << PreMark;
+  OS << "%" << LanaiInstPrinter::getRegisterName(RegOp.getReg());
+  OS << PostMark << "]";
+}
+
+// Prints the offset part of a memory operand: a signed decimal immediate that
+// must fit in SizeInBits bits, or a symbolic expression.
+template <unsigned SizeInBits>
+static void printMemoryImmediateOffset(const MCAsmInfo &MAI,
+                                       const MCOperand &OffsetOp,
+                                       raw_ostream &OS) {
+  assert((OffsetOp.isImm() || OffsetOp.isExpr()) && "Immediate expected");
+
+  if (!OffsetOp.isImm()) {
+    OffsetOp.getExpr()->print(OS, &MAI);
+    return;
+  }
+
+  assert(isInt<SizeInBits>(OffsetOp.getImm()) && "Constant value truncated");
+  OS << OffsetOp.getImm();
+}
+
+// Prints a register+immediate memory operand as "offset[base]". The operand
+// triple starting at OpNo is (base register, 16-bit offset, ALU code).
+void LanaiInstPrinter::printMemRiOperand(const MCInst *MI, int OpNo,
+                                         raw_ostream &OS,
+                                         const char * /*Modifier*/) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  const MCOperand &Offset = MI->getOperand(OpNo + 1);
+  // Read the ALU code up front, before anything is written to OS.
+  const unsigned AluCode = MI->getOperand(OpNo + 2).getImm();
+
+  printMemoryImmediateOffset<16>(MAI, Offset, OS);
+  printMemoryBaseRegister(OS, AluCode, Base);
+}
+
+// Prints a register+register memory operand as "[base OP offset]". The
+// operand triple starting at OpNo is (base register, offset register, ALU
+// code). A '*' before or after the base marks a pre- or post-op ALU code.
+void LanaiInstPrinter::printMemRrOperand(const MCInst *MI, int OpNo,
+                                         raw_ostream &OS,
+                                         const char * /*Modifier*/) {
+  const MCOperand &RegOp = MI->getOperand(OpNo);
+  const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+  const unsigned AluCode = MI->getOperand(OpNo + 2).getImm();
+  assert(OffsetOp.isReg() && RegOp.isReg() && "Registers expected.");
+
+  const char *PreMark = LPAC::isPreOp(AluCode) ? "*" : "";
+  const char *PostMark = LPAC::isPostOp(AluCode) ? "*" : "";
+
+  // [ Base OP Offset ]
+  OS << "[" << PreMark << "%" << getRegisterName(RegOp.getReg()) << PostMark;
+  OS << " " << LPAC::lanaiAluCodeToString(AluCode) << " ";
+  OS << "%" << getRegisterName(OffsetOp.getReg()) << "]";
+}
+
+// Prints an SPLS-style memory operand as "offset[base]". The operand triple
+// starting at OpNo is (base register, 10-bit offset, ALU code).
+void LanaiInstPrinter::printMemSplsOperand(const MCInst *MI, int OpNo,
+                                           raw_ostream &OS,
+                                           const char * /*Modifier*/) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  const MCOperand &Offset = MI->getOperand(OpNo + 1);
+  // Read the ALU code up front, before anything is written to OS.
+  const unsigned AluCode = MI->getOperand(OpNo + 2).getImm();
+
+  printMemoryImmediateOffset<10>(MAI, Offset, OS);
+  printMemoryBaseRegister(OS, AluCode, Base);
+}
+
+// Prints the condition-code mnemonic for the immediate at OpNo, or "<und>"
+// for values at or beyond LPCC::UNKNOWN so printing never aborts.
+void LanaiInstPrinter::printCCOperand(const MCInst *MI, int OpNo,
+                                      raw_ostream &OS) {
+  const auto CC = static_cast<LPCC::CondCode>(MI->getOperand(OpNo).getImm());
+  if (CC < LPCC::UNKNOWN) {
+    OS << lanaiCondCodeToString(CC);
+    return;
+  }
+  OS << "<und>";
+}
+
+// Prints a predicate suffix (".cc") for the condition code at OpNo. Nothing
+// is printed for ICC_T, and "<und>" is printed for values at or beyond
+// LPCC::UNKNOWN so printing never aborts.
+void LanaiInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &OS) {
+  const auto CC = static_cast<LPCC::CondCode>(MI->getOperand(OpNo).getImm());
+  if (CC >= LPCC::UNKNOWN) {
+    OS << "<und>";
+    return;
+  }
+  if (CC == LPCC::ICC_T)
+    return;
+  OS << "." << lanaiCondCodeToString(CC);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h b/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h
new file mode 100644
index 000000000000..1c9d186ad819
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h
@@ -0,0 +1,65 @@
+//= LanaiInstPrinter.h - Convert Lanai MCInst to asm syntax -------*- C++ -*--//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Lanai MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
+#define LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+class MCOperand;
+
+// Prints Lanai MCInsts in assembly syntax. The operand printers below are
+// referenced by name from the TableGen instruction descriptions.
+class LanaiInstPrinter : public MCInstPrinter {
+public:
+  LanaiInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                   const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Entry point: prints MI followed by the annotation Annot.
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Generic operand printer; Modifier must be null or empty.
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  // Condition code as an instruction suffix (".cc").
+  void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  // Memory operands; each consumes the operand triple starting at OpNo.
+  void printMemRiOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                         const char *Modifier = nullptr);
+  void printMemRrOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                         const char *Modifier = nullptr);
+  void printMemSplsOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                           const char *Modifier = nullptr);
+  // Bare condition-code mnemonic.
+  void printCCOperand(const MCInst *MI, int OpNo, raw_ostream &O);
+  void printAluOperand(const MCInst *MI, int OpNo, raw_ostream &O);
+  // Immediates that are shifted or masked before being printed in hex.
+  void printHi16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printHi16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printLo16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  // Absolute address in square brackets.
+  void printMemImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+private:
+  // Hand-written aliases for pre-/post-increment loads and stores; each
+  // returns true when it printed the instruction.
+  bool printAlias(const MCInst *MI, raw_ostream &Ostream);
+  bool printInst(const MCInst *MI, raw_ostream &Ostream, StringRef Alias,
+                 unsigned OpNo0, unsigned OpNo1);
+  bool printMemoryLoadIncrement(const MCInst *MI, raw_ostream &Ostream,
+                                StringRef Opcode, int AddOffset);
+  bool printMemoryStoreIncrement(const MCInst *MI, raw_ostream &Ostream,
+                                 StringRef Opcode, int AddOffset);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/Lanai/Lanai.h b/contrib/llvm/lib/Target/Lanai/Lanai.h
new file mode 100644
index 000000000000..c1fdf793305b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/Lanai.h
@@ -0,0 +1,51 @@
+//===-- Lanai.h - Top-level interface for Lanai representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Lanai back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAI_H
+#define LLVM_LIB_TARGET_LANAI_LANAI_H
+
+#include "LanaiAluCode.h"
+#include "LanaiCondCode.h"
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+// Forward declarations so this header does not need the full definitions.
+class FunctionPass;
+class LanaiTargetMachine;
+class MachineFunctionPass;
+class TargetMachine;
+class formatted_raw_ostream;
+
+// createLanaiISelDag - Creates the pass that converts a legalized DAG into a
+// Lanai-specific DAG, ready for instruction scheduling.
+FunctionPass *createLanaiISelDag(LanaiTargetMachine &TM);
+
+// createLanaiDelaySlotFillerPass - Creates the pass that fills delay slots
+// with useful instructions or NOPs.
+FunctionPass *createLanaiDelaySlotFillerPass(const LanaiTargetMachine &TM);
+
+// createLanaiMemAluCombinerPass - Creates the pass that combines loads/stores
+// and arithmetic operations.
+FunctionPass *createLanaiMemAluCombinerPass();
+
+// createLanaiSetflagAluCombinerPass - Creates the pass that combines SET_FLAG
+// and ALU operations.
+FunctionPass *createLanaiSetflagAluCombinerPass();
+
+// Accessor for the Lanai Target object; defined outside this header.
+Target &getTheLanaiTarget();
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAI_H
diff --git a/contrib/llvm/lib/Target/Lanai/Lanai.td b/contrib/llvm/lib/Target/Lanai/Lanai.td
new file mode 100644
index 000000000000..73d080457034
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/Lanai.td
@@ -0,0 +1,47 @@
+//===- Lanai.td - Describe the Lanai Target Machine --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "LanaiSchedule.td"
+include "LanaiRegisterInfo.td"
+include "LanaiCallingConv.td"
+include "LanaiInstrInfo.td"
+
+// Instruction-set record; referenced by the Lanai Target def below.
+def LanaiInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Lanai processors supported.
+//===----------------------------------------------------------------------===//
+
+// Both processor names use the same scheduling model and an empty feature
+// list.
+def : ProcessorModel<"generic", LanaiSchedModel, []>;
+def : ProcessorModel<"v11", LanaiSchedModel, []>;
+
+// Assembly writer description.
+// NOTE(review): AsmWriterClassName is presumably combined with the target
+// name to form the generated class name "LanaiInstPrinter" - confirm against
+// Target.td.
+def LanaiInstPrinter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+// Top-level target record tying together the instruction set and the
+// assembly writer defined above.
+def Lanai : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = LanaiInstrInfo;
+ // Use the MC instruction printer for assembly output.
+ let AssemblyWriters = [LanaiInstPrinter];
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiAluCode.h b/contrib/llvm/lib/Target/Lanai/LanaiAluCode.h
new file mode 100644
index 000000000000..d5145694fe46
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiAluCode.h
@@ -0,0 +1,148 @@
+//===-- LanaiAluCode.h - ALU operator encoding ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The encoding for ALU operators used in RM and RRM operands
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIALUCODE_H
+#define LLVM_LIB_TARGET_LANAI_LANAIALUCODE_H
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+namespace LPAC {
+// ALU operator codes. Values 0x00-0x07 are the 3-bit machine encodings.
+enum AluCode {
+ ADD = 0x00,
+ ADDC = 0x01,
+ SUB = 0x02,
+ SUBB = 0x03,
+ AND = 0x04,
+ OR = 0x05,
+ XOR = 0x06,
+ SPECIAL = 0x07,
+
+ // Shift instructions are treated as SPECIAL when encoding the machine
+ // instruction, but kept distinct until lowering. The constant values are
+ // chosen to ease lowering: the low three bits of each shift code equal
+ // SPECIAL (0x07), so masking with 0x07 yields the machine encoding.
+ SHL = 0x17,
+ SRL = 0x27,
+ SRA = 0x37,
+
+ // Indicates an unknown/unsupported operator
+ UNKNOWN = 0xFF,
+};
+
+// Bits indicating post- and pre-operators. They should be tested with
+// isPreOp/isPostOp and set with makePreOp/makePostOp. Both bits lie outside
+// the 0x3F mask that getAluOp applies.
+const int Lanai_PRE_OP = 0x40;
+const int Lanai_POST_OP = 0x80;
+
+// Reduces an ALU code to its 3-bit machine encoding (bits 0-2), dropping the
+// shift-kind and pre/post-op bits.
+inline static unsigned encodeLanaiAluCode(unsigned AluOp) {
+  const unsigned EncodingBits = 0x07;
+  const unsigned Encoded = AluOp & EncodingBits;
+  return Encoded;
+}
+
+// Returns the operator part of an ALU code (bits 0-5), i.e. the code with the
+// pre/post-op bits cleared.
+inline static unsigned getAluOp(unsigned AluOp) {
+  const unsigned OperatorBits = 0x3F;
+  const unsigned Operator = AluOp & OperatorBits;
+  return Operator;
+}
+
+// True when the pre-operator bit is set in AluOp.
+inline static bool isPreOp(unsigned AluOp) {
+  return (AluOp & Lanai_PRE_OP) != 0;
+}
+
+// True when the post-operator bit is set in AluOp.
+inline static bool isPostOp(unsigned AluOp) {
+  return (AluOp & Lanai_POST_OP) != 0;
+}
+
+// Returns AluOp with the pre-operator bit set. The input must not already be
+// a post-op.
+inline static unsigned makePreOp(unsigned AluOp) {
+  assert(!isPostOp(AluOp) && "Operator can't be a post- and pre-op");
+  const unsigned WithPreBit = Lanai_PRE_OP | AluOp;
+  return WithPreBit;
+}
+
+// Returns AluOp with the post-operator bit set. The input must not already be
+// a pre-op.
+inline static unsigned makePostOp(unsigned AluOp) {
+  assert(!isPreOp(AluOp) && "Operator can't be a post- and pre-op");
+  const unsigned WithPostBit = Lanai_POST_OP | AluOp;
+  return WithPostBit;
+}
+
+// True when AluOp carries either the pre- or the post-operator bit.
+inline static bool modifiesOp(unsigned AluOp) {
+  if (isPreOp(AluOp))
+    return true;
+  return isPostOp(AluOp);
+}
+
+// Maps an ALU code (pre/post-op bits ignored) to its assembly mnemonic. SHL
+// and SRL both print as "sh". Any other code is a programming error.
+inline static const char *lanaiAluCodeToString(unsigned AluOp) {
+  const char *Mnemonic = nullptr;
+  switch (getAluOp(AluOp)) {
+  case ADD:
+    Mnemonic = "add";
+    break;
+  case ADDC:
+    Mnemonic = "addc";
+    break;
+  case SUB:
+    Mnemonic = "sub";
+    break;
+  case SUBB:
+    Mnemonic = "subb";
+    break;
+  case AND:
+    Mnemonic = "and";
+    break;
+  case OR:
+    Mnemonic = "or";
+    break;
+  case XOR:
+    Mnemonic = "xor";
+    break;
+  case SHL:
+  case SRL:
+    Mnemonic = "sh";
+    break;
+  case SRA:
+    Mnemonic = "sha";
+    break;
+  default:
+    llvm_unreachable("Invalid ALU code.");
+  }
+  return Mnemonic;
+}
+
+// Parses an ALU mnemonic by exact match. Returns UNKNOWN for anything that is
+// not one of the listed spellings.
+inline static AluCode stringToLanaiAluCode(StringRef S) {
+  if (S == "add")
+    return ADD;
+  if (S == "addc")
+    return ADDC;
+  if (S == "sub")
+    return SUB;
+  if (S == "subb")
+    return SUBB;
+  if (S == "and")
+    return AND;
+  if (S == "or")
+    return OR;
+  if (S == "xor")
+    return XOR;
+  if (S == "sh")
+    return SHL;
+  if (S == "srl")
+    return SRL;
+  if (S == "sha")
+    return SRA;
+  return UNKNOWN;
+}
+
+// Translates a SelectionDAG node type to the matching Lanai ALU code, or
+// UNKNOWN when the node type has no ALU equivalent here.
+inline static AluCode isdToLanaiAluCode(ISD::NodeType Node_type) {
+  AluCode Code = AluCode::UNKNOWN;
+  switch (Node_type) {
+  case ISD::ADD:
+    Code = AluCode::ADD;
+    break;
+  case ISD::ADDE:
+    Code = AluCode::ADDC;
+    break;
+  case ISD::SUB:
+    Code = AluCode::SUB;
+    break;
+  case ISD::SUBE:
+    Code = AluCode::SUBB;
+    break;
+  case ISD::AND:
+    Code = AluCode::AND;
+    break;
+  case ISD::OR:
+    Code = AluCode::OR;
+    break;
+  case ISD::XOR:
+    Code = AluCode::XOR;
+    break;
+  case ISD::SHL:
+    Code = AluCode::SHL;
+    break;
+  case ISD::SRL:
+    Code = AluCode::SRL;
+    break;
+  case ISD::SRA:
+    Code = AluCode::SRA;
+    break;
+  default:
+    break;
+  }
+  return Code;
+}
+} // namespace LPAC
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIALUCODE_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/contrib/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
new file mode 100644
index 000000000000..607b2a97b29f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -0,0 +1,243 @@
+//===-- LanaiAsmPrinter.cpp - Lanai LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the Lanai assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/LanaiInstPrinter.h"
+#include "Lanai.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiMCInstLower.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "asm-printer"
+
+using namespace llvm;
+
+namespace {
+// AsmPrinter for Lanai: lowers MachineInstrs to MCInsts and streams them.
+class LanaiAsmPrinter : public AsmPrinter {
+public:
+ explicit LanaiAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+ StringRef getPassName() const override { return "Lanai Assembly Printer"; }
+
+ // Prints operand OpNum of MI in textual form (used for inline asm).
+ void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
+ // Inline-asm operand hook; returns true on an unsupported modifier.
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ // Emits MI and any instructions bundled with it.
+ void EmitInstruction(const MachineInstr *MI) override;
+ bool isBlockOnlyReachableByFallthrough(
+ const MachineBasicBlock *MBB) const override;
+
+private:
+ // Lowers a non-call instruction to an MCInst and emits it.
+ void customEmitInstruction(const MachineInstr *MI);
+ // Expands CALL/CALLR into the return-address setup plus the branch.
+ void emitCallInstruction(const MachineInstr *MI);
+};
+} // end of anonymous namespace
+
+// Prints operand OpNum of MI according to its operand kind. Any kind not
+// listed below is a programming error.
+void LanaiAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                   raw_ostream &O) {
+  const MachineOperand &Operand = MI->getOperand(OpNum);
+
+  switch (Operand.getType()) {
+  case MachineOperand::MO_Register:
+    O << LanaiInstPrinter::getRegisterName(Operand.getReg());
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << Operand.getImm();
+    break;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *Operand.getMBB()->getSymbol();
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *getSymbol(Operand.getGlobal());
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    // Block addresses are printed by raw symbol name.
+    O << GetBlockAddressSymbol(Operand.getBlockAddress())->getName();
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    O << *GetExternalSymbolSymbol(Operand.getSymbolName());
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    // <private prefix>JTI<function>_<index>
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_'
+      << Operand.getIndex();
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    // <private prefix>CPI<function>_<index>
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+      << Operand.getIndex();
+    break;
+
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+}
+
+// PrintAsmOperand - Print out an operand for an inline asm expression.
+// Returns false on success and true when the modifier is unknown or cannot be
+// applied. The only supported modifier is 'H', which prints the register at
+// OpNo + 1 of a two-register operand.
+bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned /*AsmVariant*/,
+                                      const char *ExtraCode, raw_ostream &O) {
+  // No modifier: print the operand as-is.
+  if (!ExtraCode || !ExtraCode[0]) {
+    printOperand(MI, OpNo, O);
+    return false;
+  }
+
+  // Only the single-letter modifier 'H' is known.
+  if (ExtraCode[1] || ExtraCode[0] != 'H')
+    return true;
+
+  // The highest-numbered register of a pair. The flags word that precedes
+  // the operand must say it occupies exactly two registers.
+  if (OpNo == 0)
+    return true;
+  const MachineOperand &FlagsOperand = MI->getOperand(OpNo - 1);
+  if (!FlagsOperand.isImm())
+    return true;
+  const unsigned Flags = FlagsOperand.getImm();
+  if (InlineAsm::getNumOperandRegisters(Flags) != 2)
+    return true;
+
+  const unsigned HighIdx = OpNo + 1;
+  if (HighIdx >= MI->getNumOperands())
+    return true;
+  const MachineOperand &High = MI->getOperand(HighIdx);
+  if (!High.isReg())
+    return true;
+
+  O << LanaiInstPrinter::getRegisterName(High.getReg());
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Expands a CALL/CALLR pseudo into three instructions: compute the return
+// address into %rca, push %rca on the stack, then transfer control (BT for a
+// direct call, a write to %pc for an indirect one).
+void LanaiAsmPrinter::emitCallInstruction(const MachineInstr *MI) {
+  assert((MI->getOpcode() == Lanai::CALL || MI->getOpcode() == Lanai::CALLR) &&
+         "Unsupported call function");
+
+  LanaiMCInstLower MCInstLowering(OutContext, *this);
+  // Bind by reference: the subtarget info is only read, so copying the whole
+  // object on every call is unnecessary.
+  const MCSubtargetInfo &STI = getSubtargetInfo();
+  // Insert save rca instruction immediately before the call.
+  // TODO: We should generate a pc-relative mov instruction here instead
+  // of pc + 16 (should be mov .+16 %rca).
+  OutStreamer->EmitInstruction(MCInstBuilder(Lanai::ADD_I_LO)
+                                   .addReg(Lanai::RCA)
+                                   .addReg(Lanai::PC)
+                                   .addImm(16),
+                               STI);
+
+  // Push rca onto the stack.
+  // st %rca, [--%sp]
+  OutStreamer->EmitInstruction(MCInstBuilder(Lanai::SW_RI)
+                                   .addReg(Lanai::RCA)
+                                   .addReg(Lanai::SP)
+                                   .addImm(-4)
+                                   .addImm(LPAC::makePreOp(LPAC::ADD)),
+                               STI);
+
+  // Lower the call instruction.
+  if (MI->getOpcode() == Lanai::CALL) {
+    // Direct call: reuse the lowered operands but emit it as a BT branch.
+    MCInst TmpInst;
+    MCInstLowering.Lower(MI, TmpInst);
+    TmpInst.setOpcode(Lanai::BT);
+    OutStreamer->EmitInstruction(TmpInst, STI);
+  } else {
+    // Indirect call: add the target register and %r0 into %pc, predicated on
+    // the always-true condition.
+    OutStreamer->EmitInstruction(MCInstBuilder(Lanai::ADD_R)
+                                     .addReg(Lanai::PC)
+                                     .addReg(MI->getOperand(0).getReg())
+                                     .addReg(Lanai::R0)
+                                     .addImm(LPCC::ICC_T),
+                                 STI);
+  }
+}
+
+// Lowers MI to an MCInst and hands it to the output streamer.
+void LanaiAsmPrinter::customEmitInstruction(const MachineInstr *MI) {
+  LanaiMCInstLower MCInstLowering(OutContext, *this);
+  // Bind by reference: this runs for every emitted instruction, and the
+  // subtarget info is only read, so there is no reason to copy it.
+  const MCSubtargetInfo &STI = getSubtargetInfo();
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  OutStreamer->EmitInstruction(TmpInst, STI);
+}
+
+// Emits MI and then every following instruction that is inside the same
+// bundle. Calls take the call-expansion path; everything else is lowered
+// directly.
+void LanaiAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MachineBasicBlock::const_instr_iterator Cur = MI->getIterator();
+  const MachineBasicBlock::const_instr_iterator End =
+      MI->getParent()->instr_end();
+
+  do {
+    if (Cur->isCall())
+      emitCallInstruction(&*Cur);
+    else
+      customEmitInstruction(&*Cur);
+  } while ((++Cur != End) && Cur->isInsideBundle());
+}
+
+// isBlockOnlyReachableByFallthrough - Return true if the basic block has
+// exactly one predecessor and the control transfer mechanism between
+// the predecessor and this block is a fall-through.
+// FIXME: could the overridden cases be handled in AnalyzeBranch?
+bool LanaiAsmPrinter::isBlockOnlyReachableByFallthrough(
+    const MachineBasicBlock *MBB) const {
+  // A block without predecessors cannot be fallen into. Check this before
+  // dereferencing pred_begin(), which equals pred_end() for such a block.
+  if (MBB->pred_empty())
+    return false;
+
+  // The predecessor has to be immediately before this block.
+  const MachineBasicBlock *Pred = *MBB->pred_begin();
+
+  // If the predecessor is a switch statement, assume a jump table
+  // implementation, so it is not a fall through.
+  if (const BasicBlock *B = Pred->getBasicBlock())
+    if (isa<SwitchInst>(B->getTerminator()))
+      return false;
+
+  // Check default implementation
+  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
+    return false;
+
+  // An empty predecessor has no terminator to inspect, so the generic answer
+  // stands; this also avoids dereferencing Pred->end() below.
+  if (Pred->empty())
+    return true;
+
+  // Otherwise, check the last instruction.
+  // Check if the last terminator is an unconditional branch.
+  MachineBasicBlock::const_iterator I = Pred->end();
+  while (I != Pred->begin() && !(--I)->isTerminator()) {
+  }
+
+  return !I->isBarrier();
+}
+
+// Force static initialization: constructing the RegisterAsmPrinter object
+// registers LanaiAsmPrinter for the Lanai target.
+extern "C" void LLVMInitializeLanaiAsmPrinter() {
+  RegisterAsmPrinter<LanaiAsmPrinter> Registration(getTheLanaiTarget());
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiCallingConv.td b/contrib/llvm/lib/Target/Lanai/LanaiCallingConv.td
new file mode 100644
index 000000000000..056b329c33c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiCallingConv.td
@@ -0,0 +1,50 @@
+//===- LanaiCallingConv.td - Calling Conventions Lanai -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Lanai architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Lanai 32-bit C Calling convention. Rules are tried in order; only i32
+// arguments marked 'inreg' in non-vararg calls get registers.
+def CC_Lanai32 : CallingConv<[
+ // Promote i8/i16 args to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Put argument in registers if marked 'inreg' and not a vararg call.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32],
+ CCAssignToReg<[R6, R7, R18, R19]>>>>,
+
+ // Otherwise they are assigned to the stack in 4-byte aligned units.
+ CCAssignToStack<4, 4>
+]>;
+
+// Lanai 32-bit Fast Calling convention. Same as CC_Lanai32 except that i32
+// arguments of non-vararg calls get registers without needing 'inreg'.
+def CC_Lanai32_Fast : CallingConv<[
+ // Promote i8/i16 args to i32
+ CCIfType<[ i8, i16 ], CCPromoteToType<i32>>,
+
+ // Put arguments in registers.
+ CCIfNotVarArg<CCIfType<[i32], CCAssignToReg<[ R6, R7, R18, R19 ]>>>,
+
+ // Otherwise they are assigned to the stack in 4-byte aligned units.
+ CCAssignToStack<4, 4>
+]>;
+
+// Lanai 32-bit C return-value convention: i32 results go in RV, then R9.
+def RetCC_Lanai32 : CallingConv<[
+ // Specify two registers to allow returning 64-bit results that have already
+ // been lowered to 2 32-bit values.
+ CCIfType<[i32], CCAssignToReg<[RV, R9]>>
+]>;
+
+// The register list is empty, so no registers are declared callee-saved.
+def CSR: CalleeSavedRegs<(add)>;
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiCondCode.h b/contrib/llvm/lib/Target/Lanai/LanaiCondCode.h
new file mode 100644
index 000000000000..6c5bdefc83dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiCondCode.h
@@ -0,0 +1,100 @@
+// The encoding used for condition codes in BR instructions.
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAICONDCODE_H
+#define LLVM_LIB_TARGET_LANAI_LANAICONDCODE_H
+
+#include "llvm/ADT/StringSwitch.h"
+
+namespace llvm {
+namespace LPCC {
+// Condition codes, numbered 0-15. Several values have two names (for example
+// ICC_HI and ICC_UGT are both 2); UNKNOWN takes the next value after ICC_LE
+// and marks an invalid code.
+enum CondCode {
+ ICC_T = 0, // true
+ ICC_F = 1, // false
+ ICC_HI = 2, // high
+ ICC_UGT = 2, // unsigned greater than
+ ICC_LS = 3, // low or same
+ ICC_ULE = 3, // unsigned less than or equal
+ ICC_CC = 4, // carry cleared
+ ICC_ULT = 4, // unsigned less than
+ ICC_CS = 5, // carry set
+ ICC_UGE = 5, // unsigned greater than or equal
+ ICC_NE = 6, // not equal
+ ICC_EQ = 7, // equal
+ ICC_VC = 8, // oVerflow cleared
+ ICC_VS = 9, // oVerflow set
+ ICC_PL = 10, // plus
+ ICC_MI = 11, // minus
+ ICC_GE = 12, // greater than or equal
+ ICC_LT = 13, // less than
+ ICC_GT = 14, // greater than
+ ICC_LE = 15, // less than or equal
+ UNKNOWN
+};
+
+// Returns the assembly mnemonic for a condition code. Where a code has two
+// names, the unsigned-comparison spelling is used. Passing UNKNOWN or any
+// other value outside 0-15 is a programming error.
+inline static StringRef lanaiCondCodeToString(LPCC::CondCode CC) {
+  // Indexed by the numeric value of the condition code.
+  static const char *const Mnemonics[] = {
+      "t",   // true
+      "f",   // false
+      "ugt", // high | unsigned greater than
+      "ule", // low or same | unsigned less than or equal
+      "ult", // carry cleared | unsigned less than
+      "uge", // carry set | unsigned greater than or equal
+      "ne",  // not equal
+      "eq",  // equal
+      "vc",  // oVerflow cleared
+      "vs",  // oVerflow set
+      "pl",  // plus
+      "mi",  // minus
+      "ge",  // greater than or equal
+      "lt",  // less than
+      "gt",  // greater than
+      "le",  // less than or equal
+  };
+
+  const unsigned Index = static_cast<unsigned>(CC);
+  if (Index >= sizeof(Mnemonics) / sizeof(Mnemonics[0]))
+    llvm_unreachable("Invalid cond code");
+  return Mnemonics[Index];
+}
+
+// Maps the trailing characters of S to a condition code, or UNKNOWN when no
+// suffix matches. The order of the EndsWith entries matters: "t" is listed
+// last because several other suffixes also end in 't'.
+// NOTE(review): this relies on StringSwitch keeping the first matching entry
+// - confirm against the StringSwitch documentation.
+inline static CondCode suffixToLanaiCondCode(StringRef S) {
+ return StringSwitch<CondCode>(S)
+ .EndsWith("f", LPCC::ICC_F)
+ .EndsWith("hi", LPCC::ICC_HI)
+ .EndsWith("ugt", LPCC::ICC_UGT)
+ .EndsWith("ls", LPCC::ICC_LS)
+ .EndsWith("ule", LPCC::ICC_ULE)
+ .EndsWith("cc", LPCC::ICC_CC)
+ .EndsWith("ult", LPCC::ICC_ULT)
+ .EndsWith("cs", LPCC::ICC_CS)
+ .EndsWith("uge", LPCC::ICC_UGE)
+ .EndsWith("ne", LPCC::ICC_NE)
+ .EndsWith("eq", LPCC::ICC_EQ)
+ .EndsWith("vc", LPCC::ICC_VC)
+ .EndsWith("vs", LPCC::ICC_VS)
+ .EndsWith("pl", LPCC::ICC_PL)
+ .EndsWith("mi", LPCC::ICC_MI)
+ .EndsWith("ge", LPCC::ICC_GE)
+ .EndsWith("lt", LPCC::ICC_LT)
+ .EndsWith("gt", LPCC::ICC_GT)
+ .EndsWith("le", LPCC::ICC_LE)
+ .EndsWith("t", LPCC::ICC_T) // Has to be after others with suffix t
+ .Default(LPCC::UNKNOWN);
+}
+} // namespace LPCC
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAICONDCODE_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
new file mode 100644
index 000000000000..802232b05828
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
@@ -0,0 +1,262 @@
+//===-- LanaiDelaySlotFiller.cpp - Lanai delay slot filler ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple pass to fill delay slots with useful instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+// Hidden command-line flag, off by default. When it is set,
+// runOnMachineBasicBlock does not search for a filler instruction and places
+// a NOP in each single-instruction delay slot instead.
+static cl::opt<bool>
+ NopDelaySlotFiller("lanai-nop-delay-filler", cl::init(false),
+ cl::desc("Fill Lanai delay slots with NOPs."),
+ cl::Hidden);
+
+namespace {
+// Machine function pass that fills the delay slots of Lanai instructions.
+struct Filler : public MachineFunctionPass {
+  // Instruction and register descriptions of the current subtarget. Both are
+  // refreshed at the start of every runOnMachineFunction call.
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  // Most recent instruction placed in a delay slot of the block being
+  // processed; findDelayInstr stops its backward search when it reaches it.
+  MachineBasicBlock::instr_iterator LastFiller;
+
+  static char ID;
+  explicit Filler() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Lanai Delay Slot Filler"; }
+
+  // Fills the delay slots of one basic block; returns true if it changed.
+  bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+  // Caches the subtarget hooks and then visits every basic block of MF.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    const LanaiSubtarget &ST = MF.getSubtarget<LanaiSubtarget>();
+    TII = ST.getInstrInfo();
+    TRI = ST.getRegisterInfo();
+
+    bool Modified = false;
+    for (MachineBasicBlock &Block : MF)
+      Modified |= runOnMachineBasicBlock(Block);
+    return Modified;
+  }
+
+  // The pass requires that no virtual registers remain.
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  // Adds the register definitions and uses of MI to RegDefs and RegUses.
+  void insertDefsUses(MachineBasicBlock::instr_iterator MI,
+                      SmallSet<unsigned, 32> &RegDefs,
+                      SmallSet<unsigned, 32> &RegUses);
+
+  // True if Reg or one of its aliases is contained in RegSet.
+  bool isRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg);
+
+  // True if MI must not be moved into the delay slot under consideration.
+  bool delayHasHazard(MachineBasicBlock::instr_iterator MI, bool &SawLoad,
+                      bool &SawStore, SmallSet<unsigned, 32> &RegDefs,
+                      SmallSet<unsigned, 32> &RegUses);
+
+  // Looks for an instruction that can fill the delay slot of Slot; on success
+  // stores it in Filler and returns true.
+  bool findDelayInstr(MachineBasicBlock &MBB,
+                      MachineBasicBlock::instr_iterator Slot,
+                      MachineBasicBlock::instr_iterator &Filler);
+};
+char Filler::ID = 0;
+} // end of anonymous namespace
+
+// createLanaiDelaySlotFillerPass - Returns a newly allocated pass that fills
+// in delay slots in Lanai MachineFunctions. The target machine argument is
+// not used.
+FunctionPass *
+llvm::createLanaiDelaySlotFillerPass(const LanaiTargetMachine & /*tm*/) {
+ return new Filler();
+}
+
+// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+// There are one or two delay slots per delayed instruction. Returns true if
+// the block was modified.
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ LastFiller = MBB.instr_end();
+
+ for (MachineBasicBlock::instr_iterator I = MBB.instr_begin();
+ I != MBB.instr_end(); ++I) {
+ if (I->getDesc().hasDelaySlot()) {
+ MachineBasicBlock::instr_iterator InstrWithSlot = I;
+ MachineBasicBlock::instr_iterator J = I;
+
+ // Treat RET specially as it is the only instruction with 2 delay slots
+ // generated while all others generated have 1 delay slot.
+ if (I->getOpcode() == Lanai::RET) {
+ // RET is generated as part of epilogue generation and hence we know
+ // what the two instructions preceding it are and that it is safe to
+ // insert RET above them.
+ MachineBasicBlock::reverse_instr_iterator RI = ++I.getReverse();
+ // The instruction directly before RET must restore FP from -8[FP].
+ assert(RI->getOpcode() == Lanai::LDW_RI && RI->getOperand(0).isReg() &&
+ RI->getOperand(0).getReg() == Lanai::FP &&
+ RI->getOperand(1).isReg() &&
+ RI->getOperand(1).getReg() == Lanai::FP &&
+ RI->getOperand(2).isImm() && RI->getOperand(2).getImm() == -8);
+ ++RI;
+ // The instruction before that must set SP from FP.
+ assert(RI->getOpcode() == Lanai::ADD_I_LO &&
+ RI->getOperand(0).isReg() &&
+ RI->getOperand(0).getReg() == Lanai::SP &&
+ RI->getOperand(1).isReg() &&
+ RI->getOperand(1).getReg() == Lanai::FP);
+ MachineBasicBlock::instr_iterator FI = RI.getReverse();
+ // Move the instructions in [FI, I) to directly after RET.
+ MBB.splice(std::next(I), &MBB, FI, I);
+ FilledSlots += 2;
+ } else {
+ // Use a suitable earlier instruction when one is found; otherwise, or
+ // when NOP filling was requested, insert a NOP after I.
+ if (!NopDelaySlotFiller && findDelayInstr(MBB, I, J)) {
+ MBB.splice(std::next(I), &MBB, J);
+ } else {
+ BuildMI(MBB, std::next(I), DebugLoc(), TII->get(Lanai::NOP));
+ }
+ ++FilledSlots;
+ }
+
+ Changed = true;
+ // Record the filler instruction that filled the delay slot.
+ // The instruction after it will be visited in the next iteration.
+ LastFiller = ++I;
+
+ // Bundle the delay slot filler to InstrWithSlot so that the machine
+ // verifier doesn't expect this instruction to be a terminator.
+ MIBundleBuilder(MBB, InstrWithSlot, std::next(LastFiller));
+ }
+ }
+ return Changed;
+}
+
+// Searches backwards from Slot for an instruction that can be moved into the
+// delay slot of Slot. On success the candidate is stored in Filler and true is
+// returned; otherwise Filler is left untouched and false is returned.
+bool Filler::findDelayInstr(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator Slot,
+ MachineBasicBlock::instr_iterator &Filler) {
+ SmallSet<unsigned, 32> RegDefs;
+ SmallSet<unsigned, 32> RegUses;
+
+ // Seed the sets with the registers of the delayed instruction itself.
+ insertDefsUses(Slot, RegDefs, RegUses);
+
+ bool SawLoad = false;
+ bool SawStore = false;
+
+ for (MachineBasicBlock::reverse_instr_iterator I = ++Slot.getReverse();
+ I != MBB.instr_rend(); ++I) {
+ // skip debug value
+ if (I->isDebugValue())
+ continue;
+
+ // Convert to forward iterator.
+ MachineBasicBlock::instr_iterator FI = I.getReverse();
+
+ // Give up at instructions the search must not cross: unmodeled side
+ // effects, inline asm, labels, pseudos and the previously placed filler.
+ if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isLabel() ||
+ FI == LastFiller || I->isPseudo())
+ break;
+
+ // A rejected candidate stays where it is, so its registers are recorded
+ // before the instructions preceding it are examined.
+ if (delayHasHazard(FI, SawLoad, SawStore, RegDefs, RegUses)) {
+ insertDefsUses(FI, RegDefs, RegUses);
+ continue;
+ }
+ Filler = FI;
+ return true;
+ }
+ return false;
+}
+
+// Decides whether moving MI into the delay slot would be unsafe. SawLoad and
+// SawStore carry the memory accesses seen so far in the backward scan and are
+// updated here; RegDefs and RegUses hold the registers defined and used by the
+// instructions MI would have to be moved across.
+bool Filler::delayHasHazard(MachineBasicBlock::instr_iterator MI, bool &SawLoad,
+                            bool &SawStore, SmallSet<unsigned, 32> &RegDefs,
+                            SmallSet<unsigned, 32> &RegUses) {
+  // Implicit-def and kill instructions are never used as fillers.
+  if (MI->isImplicitDef() || MI->isKill())
+    return true;
+
+  // A load may not be moved past a store.
+  if (MI->mayLoad()) {
+    if (SawStore)
+      return true;
+    SawLoad = true;
+  }
+
+  // A store may be moved past neither a store nor a load. SawStore is set
+  // before the load check so the store is remembered even when rejected.
+  if (MI->mayStore()) {
+    if (SawStore)
+      return true;
+    SawStore = true;
+    if (SawLoad)
+      return true;
+  }
+
+  assert((!MI->isCall() && !MI->isReturn()) &&
+         "Cannot put calls or returns in delay slot.");
+
+  // Check every register operand against the registers already recorded.
+  for (unsigned OpIdx = 0, NumOps = MI->getNumOperands(); OpIdx != NumOps;
+       ++OpIdx) {
+    const MachineOperand &Operand = MI->getOperand(OpIdx);
+    if (!Operand.isReg())
+      continue;
+
+    unsigned Reg = Operand.getReg();
+    if (!Reg)
+      continue;
+
+    // A definition conflicts with any recorded definition or use of Reg.
+    if (Operand.isDef() &&
+        (isRegInSet(RegDefs, Reg) || isRegInSet(RegUses, Reg)))
+      return true;
+
+    // A use conflicts with any recorded definition of Reg.
+    if (Operand.isUse() && isRegInSet(RegDefs, Reg))
+      return true;
+  }
+  return false;
+}
+
+// Insert the register defs and uses of MI into the sets RegDefs and RegUses.
+// Operands that are not registers, or that name register 0, are ignored.
+void Filler::insertDefsUses(MachineBasicBlock::instr_iterator MI,
+                            SmallSet<unsigned, 32> &RegDefs,
+                            SmallSet<unsigned, 32> &RegUses) {
+  // If MI is a call or return, just examine the explicit non-variadic operands.
+  // The descriptor is only queried, so bind it by reference instead of
+  // copying the whole MCInstrDesc.
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned E = MI->isCall() || MI->isReturn() ? MCID.getNumOperands()
+                                              : MI->getNumOperands();
+  for (unsigned I = 0; I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    unsigned Reg;
+
+    if (!MO.isReg() || !(Reg = MO.getReg()))
+      continue;
+
+    if (MO.isDef())
+      RegDefs.insert(Reg);
+    else if (MO.isUse())
+      RegUses.insert(Reg);
+  }
+
+  // Call & return instructions define SP implicitly. Implicit defines are not
+  // included in the RegDefs set of calls but instructions modifying SP cannot
+  // be inserted in the delay slot of a call/return as these instructions are
+  // expanded to multiple instructions with SP modified before the branch that
+  // has the delay slot.
+  if (MI->isCall() || MI->isReturn())
+    RegDefs.insert(Lanai::SP);
+}
+
+// Reports whether Reg, or any register aliasing it, is a member of RegSet.
+bool Filler::isRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg) {
+  // Walk Reg itself together with all of its aliases.
+  for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
+    if (RegSet.count(*Alias) != 0)
+      return true;
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp b/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
new file mode 100644
index 000000000000..0723668c743e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
@@ -0,0 +1,220 @@
+//===-- LanaiFrameLowering.cpp - Lanai Frame Information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiFrameLowering.h"
+
+#include "LanaiInstrInfo.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+// Computes the final stack size and the maximum call frame size of MF and
+// writes both back into its MachineFrameInfo.
+void LanaiFrameLowering::determineFrameLayout(MachineFunction &MF) const {
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const LanaiRegisterInfo *RegInfo = STI.getRegisterInfo();
+
+  // Bytes requested so far according to the frame info.
+  unsigned TotalSize = FrameInfo.getStackSize();
+
+  // Alignment to honour: the largest object alignment when the stack has to
+  // be realigned, the default stack alignment otherwise.
+  unsigned Alignment;
+  if (RegInfo->needsStackRealignment(MF))
+    Alignment = FrameInfo.getMaxAlignment();
+  else
+    Alignment = getStackAlignment();
+
+  // Largest call frame of any call. With dynamic allocas it is rounded up so
+  // that the allocations stay aligned.
+  unsigned CallFrameSize = FrameInfo.getMaxCallFrameSize();
+  if (FrameInfo.hasVarSizedObjects())
+    CallFrameSize = alignTo(CallFrameSize, Alignment);
+  FrameInfo.setMaxCallFrameSize(CallFrameSize);
+
+  // Add the call frame to the total unless the call frame is reserved and the
+  // function adjusts the stack.
+  if (!hasReservedCallFrame(MF) || !FrameInfo.adjustsStack())
+    TotalSize += CallFrameSize;
+
+  // Round the frame up to the alignment and publish it.
+  TotalSize = alignTo(TotalSize, Alignment);
+  FrameInfo.setStackSize(TotalSize);
+}
+
+// Walks every basic block of MF and rewrites each ADJDYNALLOC pseudo
+// instruction into a Lanai::ADD_I_LO that adds the maximum call frame size,
+// as an immediate, to the pseudo's source register.
+void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const {
+  const LanaiInstrInfo &LII =
+      *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+  unsigned MaxCallFrameSize = MF.getFrameInfo().getMaxCallFrameSize();
+
+  for (MachineBasicBlock &Block : MF) {
+    for (MachineBasicBlock::iterator It = Block.begin(); It != Block.end();) {
+      // Step past the instruction first so that erasing it below does not
+      // disturb the iteration.
+      MachineInstr &Instr = *It++;
+      if (Instr.getOpcode() != Lanai::ADJDYNALLOC)
+        continue;
+
+      DebugLoc DL = Instr.getDebugLoc();
+      unsigned Dst = Instr.getOperand(0).getReg();
+      unsigned Src = Instr.getOperand(1).getReg();
+
+      BuildMI(Block, Instr, DL, LII.get(Lanai::ADD_I_LO), Dst)
+          .addReg(Src)
+          .addImm(MaxCallFrameSize);
+      Instr.eraseFromParent();
+    }
+  }
+}
+
+// Generates the following sequence for function entry:
+// st %fp,-4[*%sp] !push old FP
+// add %sp,8,%fp !generate new FP
+// sub %sp,0x4,%sp !allocate stack space (as needed)
+// The frame layout is determined first, and ADJDYNALLOC pseudos are replaced
+// afterwards when the function has variable sized objects.
+void LanaiFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const LanaiInstrInfo &LII =
+ *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // Determine the correct frame layout
+ determineFrameLayout(MF);
+
+ // FIXME: This appears to be overallocating. Needs investigation.
+ // Get the number of bytes to allocate from the FrameInfo.
+ unsigned StackSize = MFI.getStackSize();
+
+ // Push old FP
+ // st %fp,-4[*%sp]
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::SW_RI))
+ .addReg(Lanai::FP)
+ .addReg(Lanai::SP)
+ .addImm(-4)
+ .addImm(LPAC::makePreOp(LPAC::ADD))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Generate new FP
+ // add %sp,8,%fp
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::ADD_I_LO), Lanai::FP)
+ .addReg(Lanai::SP)
+ .addImm(8)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Allocate space on the stack if needed
+ // sub %sp,StackSize,%sp
+ if (StackSize != 0) {
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::SUB_I_LO), Lanai::SP)
+ .addReg(Lanai::SP)
+ .addImm(StackSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Replace ADJDYNALLOC
+ if (MFI.hasVarSizedObjects())
+ replaceAdjDynAllocPseudo(MF);
+}
+
+// Removes the call frame pseudo instruction at I from MBB without emitting a
+// replacement and returns the iterator produced by MBB.erase(I).
+MachineBasicBlock::iterator LanaiFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction & /*MF*/, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ // Discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+ return MBB.erase(I);
+}
+
+// The function epilogue should not depend on the current stack pointer!
+// It should use the frame pointer only. This is mandatory because
+// of alloca; we also take advantage of it to omit stack adjustments
+// before returning.
+//
+// Note that when we go to restore the preserved register values we must
+// not try to address their slots by using offsets from the stack pointer.
+// That's because the stack pointer may have been moved during the function
+// execution due to a call to alloca(). Rather, we must restore all
+// preserved registers via offsets from the frame pointer value.
+//
+// Note also that when the current frame is being "popped" (by adjusting
+// the value of the stack pointer) on function exit, we must (for the
+// sake of alloca) set the new value of the stack pointer based upon
+// the current value of the frame pointer. We can't just add what we
+// believe to be the (static) frame size to the stack pointer because
+// if we did that, and alloca() had been called during this function,
+// we would end up returning *without* having fully deallocated all of
+// the space grabbed by alloca. If that happened, and a function
+// containing one or more alloca() calls was called over and over again,
+// then the stack would grow without limit!
+//
+// RET is lowered to
+// ld -4[%fp],%pc # modify %pc (two delay slots)
+// as the return address is in the stack frame and mov to pc is allowed.
+// emitEpilogue emits
+// mov %fp,%sp # restore the stack pointer
+// ld -8[%fp],%fp # restore the caller's frame pointer
+// before RET and the delay slot filler will move RET such that these
+// instructions execute in the delay slots of the load to PC.
+void LanaiFrameLowering::emitEpilogue(MachineFunction & /*MF*/,
+ MachineBasicBlock &MBB) const {
+ // Both instructions are inserted before the last non-debug instruction of
+ // MBB and reuse its debug location.
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ const LanaiInstrInfo &LII =
+ *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ // Restore the stack pointer using the callee's frame pointer value.
+ // The "mov" is emitted as an ADD_I_LO of FP and the immediate 0.
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::ADD_I_LO), Lanai::SP)
+ .addReg(Lanai::FP)
+ .addImm(0);
+
+ // Restore the frame pointer from the stack.
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::LDW_RI), Lanai::FP)
+ .addReg(Lanai::FP)
+ .addImm(-8)
+ .addImm(LPAC::ADD);
+}
+
+// Runs the generic callee-save analysis and then creates the fixed stack
+// objects reserved here: 4 bytes at offset -4 for the saved RCA, 4 bytes at
+// offset -8 for the saved FP and, when a base pointer is in use, 4 bytes at
+// offset -12. In the base pointer case the base register is also cleared from
+// SavedRegs.
+void LanaiFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const LanaiRegisterInfo *LRI =
+ static_cast<const LanaiRegisterInfo *>(STI.getRegisterInfo());
+ int Offset = -4;
+
+ // Reserve 4 bytes for the saved RCA
+ MFI.CreateFixedObject(4, Offset, true);
+ Offset -= 4;
+
+ // Reserve 4 bytes for the saved FP
+ MFI.CreateFixedObject(4, Offset, true);
+ Offset -= 4;
+
+ // Reserve 4 bytes for the base pointer, if there is one.
+ if (LRI->hasBasePointer(MF)) {
+ MFI.CreateFixedObject(4, Offset, true);
+ SavedRegs.reset(LRI->getBaseRegister());
+ }
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.h b/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.h
new file mode 100644
index 000000000000..2f9b6c3c158f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.h
@@ -0,0 +1,57 @@
+//===-- LanaiFrameLowering.h - Define frame lowering for Lanai --*- C++-*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements Lanai-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
+#define LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
+
+#include "Lanai.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class BitVector;
+class LanaiSubtarget;
+
+// Lanai implementation of TargetFrameLowering. The stack grows down, with a
+// stack alignment of 8 and a local area offset of 0.
+class LanaiFrameLowering : public TargetFrameLowering {
+private:
+ // Computes the stack size and maximum call frame size of MF.
+ void determineFrameLayout(MachineFunction &MF) const;
+ // Replaces the ADJDYNALLOC pseudo instructions in MF.
+ void replaceAdjDynAllocPseudo(MachineFunction &MF) const;
+
+protected:
+ // Subtarget this frame lowering was created for.
+ const LanaiSubtarget &STI;
+
+public:
+ explicit LanaiFrameLowering(const LanaiSubtarget &Subtarget)
+ : TargetFrameLowering(StackGrowsDown,
+ /*StackAlignment=*/8,
+ /*LocalAreaOffset=*/0),
+ STI(Subtarget) {}
+
+ // emitPrologue/emitEpilogue - These methods insert prolog and epilog code
+ // into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ // Handles the call frame setup/destroy pseudo instruction at I.
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+ // Unconditionally reports that a frame pointer is used.
+ bool hasFP(const MachineFunction & /*MF*/) const override { return true; }
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
new file mode 100644
index 000000000000..ed0c99a76ce4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
@@ -0,0 +1,337 @@
+//===-- LanaiISelDAGToDAG.cpp - A dag to dag inst selector for Lanai ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Lanai target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiRegisterInfo.h"
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lanai-isel"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LanaiDAGToDAGISel - Lanai specific code to select Lanai machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace {
+
+// SelectionDAG instruction selector for the Lanai target. It combines the
+// matcher generated from the target description with hand-written selection
+// of a few opcodes and the complex patterns used for addressing modes.
+class LanaiDAGToDAGISel : public SelectionDAGISel {
+public:
+ explicit LanaiDAGToDAGISel(LanaiTargetMachine &TargetMachine)
+ : SelectionDAGISel(TargetMachine) {}
+
+ // Simply forwards to the base class implementation.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
+ // Pass Name
+ StringRef getPassName() const override {
+ return "Lanai DAG->DAG Pattern Instruction Selection";
+ }
+
+ // Selects the operands of an inline-asm memory operand into OutOps.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
+ std::vector<SDValue> &OutOps) override;
+
+private:
+// Include the pieces autogenerated from the target description.
+#include "LanaiGenDAGISel.inc"
+
+ // Instruction Selection not handled by the auto-generated tablegen
+ void Select(SDNode *N) override;
+
+ // Support functions for the opcodes of Instruction Selection
+ // not handled by the auto-generated tablegen
+ void selectFrameIndex(SDNode *N);
+
+ // Complex Pattern for address selection.
+ bool selectAddrRi(SDValue Addr, SDValue &Base, SDValue &Offset,
+ SDValue &AluOp);
+ bool selectAddrRr(SDValue Addr, SDValue &R1, SDValue &R2, SDValue &AluOp);
+ bool selectAddrSls(SDValue Addr, SDValue &Offset);
+ bool selectAddrSpls(SDValue Addr, SDValue &Base, SDValue &Offset,
+ SDValue &AluOp);
+
+ // getI32Imm - Return a target constant with the specified value, of type i32.
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+ }
+
+private:
+ // Common implementation behind selectAddrRi (RiMode = true) and
+ // selectAddrSpls (RiMode = false).
+ bool selectAddrRiSpls(SDValue Addr, SDValue &Base, SDValue &Offset,
+ SDValue &AluOp, bool RiMode);
+};
+
+// Returns true when the constant fits in a 21-bit signed immediate and its two
+// low-order bits are zero.
+bool canBeRepresentedAsSls(const ConstantSDNode &CN) {
+  const auto Value = CN.getSExtValue();
+  const bool FitsIn21Bits = isInt<21>(Value);
+  const bool LowBitsClear = (Value & 0x3) == 0;
+  return FitsIn21Bits && LowBitsClear;
+}
+
+} // namespace
+
+// ComplexPattern helper for Lanai load/store instructions: matches an address
+// usable by the SLS form. It succeeds for a constant address accepted by
+// canBeRepresentedAsSls, where Offset receives the constant as a target
+// constant, and for an OR whose second operand is a LanaiISD::SMALL node,
+// where Offset receives the operand wrapped by SMALL.
+bool LanaiDAGToDAGISel::selectAddrSls(SDValue Addr, SDValue &Offset) {
+  // Loading from a constant address.
+  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Addr);
+  if (Const && canBeRepresentedAsSls(*Const)) {
+    SDLoc DL(Addr);
+    int32_t Imm = Const->getSExtValue();
+    Offset = CurDAG->getTargetConstant(Imm, DL, Const->getValueType(0));
+    return true;
+  }
+
+  // Otherwise only "x | SMALL(y)" is accepted.
+  const bool IsOrWithSmall =
+      Addr.getOpcode() == ISD::OR &&
+      Addr.getOperand(1).getOpcode() == LanaiISD::SMALL;
+  if (!IsOrWithSmall)
+    return false;
+
+  Offset = Addr.getOperand(1).getOperand(0);
+  return true;
+}
+
+// Shared implementation of the RI and SPLS address selectors. Splits Addr into
+// Base, Offset and an ALU operator (always LPAC::ADD when matched). RiMode
+// selects the accepted immediate range: 16-bit signed when true, 10-bit signed
+// when false. Returns false when the address is left to another selector.
+bool LanaiDAGToDAGISel::selectAddrRiSpls(SDValue Addr, SDValue &Base,
+ SDValue &Offset, SDValue &AluOp,
+ bool RiMode) {
+ SDLoc DL(Addr);
+
+ // Constant address: use R0 as the base and the constant as the offset when
+ // it fits the immediate field.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr)) {
+ if (RiMode) {
+ // Fits in 16-bit signed immediate.
+ if (isInt<16>(CN->getSExtValue())) {
+ int16_t Imm = CN->getSExtValue();
+ Offset = CurDAG->getTargetConstant(Imm, DL, CN->getValueType(0));
+ Base = CurDAG->getRegister(Lanai::R0, CN->getValueType(0));
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ return true;
+ }
+ // Allow SLS to match if the constant doesn't fit in 16 bits but can be
+ // represented as an SLS.
+ if (canBeRepresentedAsSls(*CN))
+ return false;
+ } else {
+ // Fits in 10-bit signed immediate.
+ if (isInt<10>(CN->getSExtValue())) {
+ int16_t Imm = CN->getSExtValue();
+ Offset = CurDAG->getTargetConstant(Imm, DL, CN->getValueType(0));
+ Base = CurDAG->getRegister(Lanai::R0, CN->getValueType(0));
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ return true;
+ }
+ }
+ }
+
+ // if Address is FI, get the TargetFrameIndex.
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(
+ FIN->getIndex(),
+ getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ return true;
+ }
+
+ // Skip direct calls
+ if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress))
+ return false;
+
+ // Address of the form imm + reg
+ ISD::NodeType AluOperator = static_cast<ISD::NodeType>(Addr.getOpcode());
+ if (AluOperator == ISD::ADD) {
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ // Addresses of the form FI+const
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+ if ((RiMode && isInt<16>(CN->getSExtValue())) ||
+ (!RiMode && isInt<10>(CN->getSExtValue()))) {
+ // If the first operand is a FI, get the TargetFI Node
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+ Base = CurDAG->getTargetFrameIndex(
+ FIN->getIndex(),
+ getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
+ } else {
+ Base = Addr.getOperand(0);
+ }
+
+ Offset = CurDAG->getTargetConstant(CN->getSExtValue(), DL, MVT::i32);
+ return true;
+ }
+ }
+
+ // Let SLS match SMALL instead of RI.
+ if (AluOperator == ISD::OR && RiMode &&
+ Addr.getOperand(1).getOpcode() == LanaiISD::SMALL)
+ return false;
+
+ // Fallback: use the whole address as the base with a zero offset.
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ return true;
+}
+
+// Register-immediate address selection: selectAddrRiSpls in RI mode, which
+// accepts 16-bit signed immediates.
+bool LanaiDAGToDAGISel::selectAddrRi(SDValue Addr, SDValue &Base,
+ SDValue &Offset, SDValue &AluOp) {
+ return selectAddrRiSpls(Addr, Base, Offset, AluOp, /*RiMode=*/true);
+}
+
+// SPLS address selection: selectAddrRiSpls with RI mode off, which accepts
+// 10-bit signed immediates.
+bool LanaiDAGToDAGISel::selectAddrSpls(SDValue Addr, SDValue &Base,
+ SDValue &Offset, SDValue &AluOp) {
+ return selectAddrRiSpls(Addr, Base, Offset, AluOp, /*RiMode=*/false);
+}
+
+// Matches an address of the form "register OP register", where OP is an ISD
+// opcode that LPAC::isdToLanaiAluCode maps to a known ALU code. On success R1
+// and R2 receive the two operands and AluOp the ALU code as an i32 target
+// constant.
+bool LanaiDAGToDAGISel::selectAddrRr(SDValue Addr, SDValue &R1, SDValue &R2,
+ SDValue &AluOp) {
+ // A bare frame index is not matched as a register-register address.
+ if (Addr.getOpcode() == ISD::FrameIndex)
+ return false;
+
+ // Skip direct calls
+ if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress))
+ return false;
+
+ // Address of the form OP + OP
+ ISD::NodeType AluOperator = static_cast<ISD::NodeType>(Addr.getOpcode());
+ LPAC::AluCode AluCode = LPAC::isdToLanaiAluCode(AluOperator);
+ if (AluCode != LPAC::UNKNOWN) {
+ // Skip addresses whose second operand is a constant that fits in a 16-bit
+ // signed immediate (e.g. FI OP const).
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+ if (isInt<16>(CN->getSExtValue()))
+ return false;
+
+ // Skip addresses with hi/lo/small operands
+ if (Addr.getOperand(0).getOpcode() == LanaiISD::HI ||
+ Addr.getOperand(0).getOpcode() == LanaiISD::LO ||
+ Addr.getOperand(0).getOpcode() == LanaiISD::SMALL ||
+ Addr.getOperand(1).getOpcode() == LanaiISD::HI ||
+ Addr.getOperand(1).getOpcode() == LanaiISD::LO ||
+ Addr.getOperand(1).getOpcode() == LanaiISD::SMALL)
+ return false;
+
+ // Addresses of the form register OP register
+ R1 = Addr.getOperand(0);
+ R2 = Addr.getOperand(1);
+ AluOp = CurDAG->getTargetConstant(AluCode, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+
+ // Anything that is not a recognised ALU operation (e.g. an address with
+ // zero offset) is not matched here.
+ return false;
+}
+
+// Selects the operands for an inline-asm memory operand. Only the 'm'
+// constraint is handled: the register-register form is tried first, then the
+// register-immediate form. Returns false after appending the three selected
+// operands to OutOps, and true when the operand could not be selected.
+bool LanaiDAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, unsigned ConstraintCode, std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1, AluOp;
+ switch (ConstraintCode) {
+ default:
+ return true;
+ case InlineAsm::Constraint_m: // memory
+ if (!selectAddrRr(Op, Op0, Op1, AluOp) &&
+ !selectAddrRi(Op, Op0, Op1, AluOp))
+ return true;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ OutOps.push_back(AluOp);
+ return false;
+}
+
+// Entry point for selecting a single node. Nodes that already carry a machine
+// opcode are left alone; i32 constants 0 and -1 and frame indices are handled
+// by hand; everything else (expanded, promoted and normal instructions) is
+// passed to the generated SelectCode.
+void LanaiDAGToDAGISel::Select(SDNode *Node) {
+ unsigned Opcode = Node->getOpcode();
+
+ // Dump information about the Node being selected
+ DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ return;
+ }
+
+ // Instruction Selection not handled by the auto-generated tablegen selection
+ // should be handled here.
+ EVT VT = Node->getValueType(0);
+ switch (Opcode) {
+ case ISD::Constant:
+ if (VT == MVT::i32) {
+ ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+ // Materialize zero constants as copies from R0. This allows the coalescer
+ // to propagate these into other instructions.
+ if (ConstNode->isNullValue()) {
+ SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+ SDLoc(Node), Lanai::R0, MVT::i32);
+ return ReplaceNode(Node, New.getNode());
+ }
+ // Materialize all ones constants as copies from R1. This allows the
+ // coalescer to propagate these into other instructions.
+ if (ConstNode->isAllOnesValue()) {
+ SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+ SDLoc(Node), Lanai::R1, MVT::i32);
+ return ReplaceNode(Node, New.getNode());
+ }
+ }
+ // Any other constant falls through to the default selection below.
+ break;
+ case ISD::FrameIndex:
+ selectFrameIndex(Node);
+ return;
+ default:
+ break;
+ }
+
+ // Select the default instruction
+ SelectCode(Node);
+}
+
+// Selects an ISD::FrameIndex node as an ADD_I_LO of the corresponding target
+// frame index and the immediate 0. A node with exactly one use is passed to
+// SelectNodeTo; otherwise a new machine node is built and substituted with
+// ReplaceNode.
+void LanaiDAGToDAGISel::selectFrameIndex(SDNode *Node) {
+  SDLoc DL(Node);
+  SDValue Imm = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  // Select() only dispatches ISD::FrameIndex nodes here, so use the asserting
+  // cast<> instead of dereferencing an unchecked dyn_cast<> result.
+  int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+  EVT VT = Node->getValueType(0);
+  SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
+  unsigned Opc = Lanai::ADD_I_LO;
+  if (Node->hasOneUse()) {
+    CurDAG->SelectNodeTo(Node, Opc, VT, TFI, Imm);
+    return;
+  }
+  ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VT, TFI, Imm));
+}
+
+// createLanaiISelDag - Returns a newly allocated instruction selection pass
+// for TM. This pass converts a legalized DAG into a Lanai-specific DAG, ready
+// for instruction scheduling.
+FunctionPass *llvm::createLanaiISelDag(LanaiTargetMachine &TM) {
+ return new LanaiDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
new file mode 100644
index 000000000000..ae7870e07d42
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -0,0 +1,1488 @@
+//===-- LanaiISelLowering.cpp - Lanai DAG Lowering Implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LanaiTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiISelLowering.h"
+
+#include "Lanai.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "LanaiTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "lanai-lower"
+
+using namespace llvm;
+
+// Limit on number of instructions the lowered multiplication may have before a
+// call to the library function should be generated instead. The threshold is
+// currently set to 14 as this was the smallest threshold that resulted in all
+// constant multiplications being lowered. A threshold of 5 covered all cases
+// except for one multiplication which required 14. mulsi3 requires 16
+// instructions (including the prologue and epilogue but excluding instructions
+// at call site). Until we can inline mulsi3, generating at most 14 instructions
+// will be faster than invoking mulsi3.
+//
+// Exposed as the hidden command-line option "lanai-constant-mul-threshold";
+// LowerMUL compares its instruction estimate against this value.
+static cl::opt<int> LanaiLowerConstantMulThreshold(
+ "lanai-constant-mul-threshold", cl::Hidden,
+ cl::desc("Maximum number of instruction to generate when lowering constant "
+ "multiplication instead of calling library function [default=14]"),
+ cl::init(14));
+
+// Configure the target lowering for Lanai: registers the single i32 register
+// class, declares how each generic ISD operation is legalized (Legal, Custom,
+// Expand or Promote), requests target DAG combines, and sets alignment, jump
+// table, libcall calling-convention and memory-intrinsic expansion limits.
+LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM,
+ const LanaiSubtarget &STI)
+ : TargetLowering(TM) {
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &Lanai::GPRRegClass);
+
+ // Compute derived properties from the register classes
+ TRI = STI.getRegisterInfo();
+ computeRegisterProperties(TRI);
+
+ setStackPointerRegisterToSaveRestore(Lanai::SP);
+
+ // Branches, comparisons and selects. Operations marked Custom are routed
+ // through LowerOperation().
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::SETCC, MVT::i32, Custom);
+ setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+
+ // Address-producing nodes.
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+
+ // Dynamic stack manipulation.
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+ // Variable-argument support: only VASTART is custom lowered.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+ // Division and remainder are all expanded.
+ setOperationAction(ISD::SDIV, MVT::i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+
+ // MUL is custom lowered (see LowerMUL); the high-part and combined lo/hi
+ // multiplies are expanded.
+ setOperationAction(ISD::MUL, MVT::i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+ // Rotates and multi-part shifts.
+ setOperationAction(ISD::ROTR, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+
+ // Bit manipulation: byte swap is expanded, the bit counts are Legal.
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+ setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+ setOperationAction(ISD::CTTZ, MVT::i32, Legal);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+
+ // Extended load operations for i1 types must be promoted
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ }
+
+ // Node kinds for which the target-specific DAG combine hook is requested.
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
+
+ // Function alignments (log2)
+ setMinFunctionAlignment(2);
+ setPrefFunctionAlignment(2);
+
+ setJumpIsExpensive(true);
+
+ // TODO: Setting the minimum jump table entries needed before a
+ // switch is transformed to a jump table to 100 to avoid creating jump tables
+ // as this was causing bad performance compared to a large group of if
+ // statements. Re-evaluate this on new benchmarks.
+ setMinimumJumpTableEntries(100);
+
+ // Use fast calling convention for library functions.
+ for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
+ setLibcallCallingConv(static_cast<RTLIB::Libcall>(I), CallingConv::Fast);
+ }
+
+ MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = 8;
+ MaxStoresPerMemmove = 16; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = 8;
+
+ // Booleans always contain 0 or 1.
+ setBooleanContents(ZeroOrOneBooleanContent);
+}
+
+// Dispatch a node that needs custom lowering to the matching Lower* helper,
+// keyed on the node's opcode. Any opcode without a case here hits
+// llvm_unreachable.
+SDValue LanaiTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ case ISD::MUL:
+ return LowerMUL(Op, DAG);
+ case ISD::BR_CC:
+ return LowerBR_CC(Op, DAG);
+ case ISD::ConstantPool:
+ return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::SELECT_CC:
+ return LowerSELECT_CC(Op, DAG);
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG);
+ case ISD::SETCCE:
+ return LowerSETCCE(Op, DAG);
+ case ISD::SHL_PARTS:
+ return LowerSHL_PARTS(Op, DAG);
+ case ISD::SRL_PARTS:
+ return LowerSRL_PARTS(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::RETURNADDR:
+ return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ default:
+ llvm_unreachable("unimplemented operand");
+ }
+}
+//===----------------------------------------------------------------------===//
+// Lanai Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Map a register name string to the corresponding Lanai register. Names that
+// match none of the listed registers trigger a fatal error.
+unsigned LanaiTargetLowering::getRegisterByName(const char *RegName, EVT /*VT*/,
+ SelectionDAG & /*DAG*/) const {
+ // Only unallocatable registers should be matched here.
+ const unsigned MatchedReg = StringSwitch<unsigned>(RegName)
+ .Case("pc", Lanai::PC)
+ .Case("sp", Lanai::SP)
+ .Case("fp", Lanai::FP)
+ .Case("rr1", Lanai::RR1)
+ .Case("r10", Lanai::R10)
+ .Case("rr2", Lanai::RR2)
+ .Case("r11", Lanai::R11)
+ .Case("rca", Lanai::RCA)
+ .Default(0);
+
+ // Zero is the StringSwitch default, i.e. no case matched.
+ if (MatchedReg == 0)
+ report_fatal_error("Invalid register name global variable");
+ return MatchedReg;
+}
+
+// Resolve an inline-asm register constraint. The single-letter GCC constraint
+// 'r' (GENERAL_REGS) selects the GPR register class; every other constraint is
+// delegated to the generic TargetLowering implementation.
+std::pair<unsigned, const TargetRegisterClass *>
+LanaiTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ const bool IsGeneralRegLetter =
+ Constraint.size() == 1 && Constraint[0] == 'r';
+ if (IsGeneralRegLetter)
+ return std::make_pair(0U, &Lanai::GPRRegClass);
+
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+// Examine constraint type and operand type and determine a weight value.
+// This object must already have been set up with the operand type
+// and the current alternative constraint selected.
+//
+// Returns CW_Default when there is no call operand value, CW_Constant when one
+// of the immediate constraint letters ('I', 'J', 'K', 'L', 'M', 'N', 'O') is
+// paired with a ConstantInt operand, CW_Invalid when such a letter is paired
+// with any other operand, and the generic TargetLowering weight for every
+// remaining constraint.
+TargetLowering::ConstraintWeight
+LanaiTargetLowering::getSingleConstraintMatchWeight(
+ AsmOperandInfo &Info, const char *Constraint) const {
+ ConstraintWeight Weight = CW_Invalid;
+ Value *CallOperandVal = Info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (CallOperandVal == nullptr)
+ return CW_Default;
+ // Look at the constraint type.
+ switch (*Constraint) {
+ case 'I': // signed 16 bit immediate
+ case 'J': // integer zero
+ case 'K': // unsigned 16 bit immediate
+ case 'L': // immediate in the range 0 to 31
+ case 'M': // signed 32 bit immediate where lower 16 bits are 0
+ case 'N': // signed 26 bit immediate
+ case 'O': // integer zero
+ // Only the operand kind is checked here; the value range is validated in
+ // LowerAsmOperandForConstraint.
+ if (isa<ConstantInt>(CallOperandVal))
+ Weight = CW_Constant;
+ break;
+ default:
+ Weight = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
+ break;
+ }
+ return Weight;
+}
+
+// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+// vector. If it is invalid, don't add anything to Ops.
+//
+// For the Lanai immediate constraint letters the operand must be a
+// ConstantSDNode whose value lies in the letter's range; it is then pushed as
+// a target constant. A constant outside the range, or a non-constant operand,
+// leaves Ops untouched. Unknown letters are forwarded to the generic
+// TargetLowering implementation.
+void LanaiTargetLowering::LowerAsmOperandForConstraint(
+ SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ // Default-constructed SDValue has a null node, meaning "nothing matched".
+ SDValue Result;
+
+ // Only support length 1 constraints for now.
+ if (Constraint.length() > 1)
+ return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ case 'I': // Signed 16 bit constant
+ // If this fails, the parent routine will give an error
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (isInt<16>(C->getSExtValue())) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(C),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'J': // integer zero
+ case 'O':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() == 0) {
+ Result = DAG.getTargetConstant(0, SDLoc(C), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'K': // unsigned 16 bit immediate
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (isUInt<16>(C->getZExtValue())) {
+ // Materialize the same zero-extended value that was range-checked.
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(C),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'L': // immediate in the range 0 to 31
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 31) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(C),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'M': // signed 32 bit immediate where lower 16 bits are 0
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ int64_t Val = C->getSExtValue();
+ if ((isInt<32>(Val)) && ((Val & 0xffff) == 0)) {
+ Result = DAG.getTargetConstant(Val, SDLoc(C), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'N': // signed 26 bit immediate
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ int64_t Val = C->getSExtValue();
+ // [-2^25, 2^25 - 1]
+ if ((Val >= -33554432) && (Val <= 33554431)) {
+ Result = DAG.getTargetConstant(Val, SDLoc(C), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ default:
+ break; // This will fall through to the generic implementation
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "LanaiGenCallingConv.inc"
+
+// Number of fixed (named) parameters of the callee currently being lowered.
+// LowerCCCCallTo writes this immediately before running AnalyzeCallOperands
+// with CC_Lanai32_VarArg, which reads it.
+// NOTE(review): this is mutable file-scope state shared by every call being
+// lowered; presumably lowering is assumed to be single-threaded - confirm.
+static unsigned NumFixedArgs;
+// Calling-convention assignment function for calls to variadic functions.
+// Arguments with ValNo below NumFixedArgs are delegated to CC_Lanai32; all
+// remaining (variadic) arguments are promoted to i32 if they are i8/i16 and
+// given a 4-byte, 4-byte-aligned stack slot. Returns false for the variadic
+// arguments it assigns itself.
+static bool CC_Lanai32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ // Handle fixed arguments with default CC.
+ // Note: Both the default and fast CC handle VarArg the same and hence the
+ // calling convention of the function is not considered here.
+ if (ValNo < NumFixedArgs) {
+ return CC_Lanai32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
+ }
+
+ // Promote i8/i16 args to i32
+ if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+ LocVT = MVT::i32;
+ if (ArgFlags.isSExt())
+ LocInfo = CCValAssign::SExt;
+ else if (ArgFlags.isZExt())
+ LocInfo = CCValAssign::ZExt;
+ else
+ LocInfo = CCValAssign::AExt;
+ }
+
+ // VarArgs get passed on stack
+ unsigned Offset = State.AllocateStack(4, 4);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+}
+
+// Lower the incoming (formal) arguments of a function. Only the C and Fast
+// calling conventions are supported; both are handled by LowerCCCArguments.
+SDValue LanaiTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ const bool IsSupportedCC =
+ CallConv == CallingConv::C || CallConv == CallingConv::Fast;
+ if (!IsSupportedCC)
+ llvm_unreachable("Unsupported calling convention");
+
+ return LowerCCCArguments(Chain, CallConv, IsVarArg, Ins, DL, DAG, InVals);
+}
+
+// Lower an outgoing call described by CLI. Tail calls are always disabled and
+// only the C and Fast calling conventions are supported; both are handled by
+// LowerCCCCallTo.
+SDValue LanaiTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ // Lanai target does not yet support tail call optimization.
+ CLI.IsTailCall = false;
+
+ const CallingConv::ID CC = CLI.CallConv;
+ if (CC != CallingConv::Fast && CC != CallingConv::C)
+ llvm_unreachable("Unsupported calling convention");
+
+ return LowerCCCCallTo(CLI.Chain, CLI.Callee, CC, CLI.IsVarArg,
+ CLI.IsTailCall, CLI.Outs, CLI.OutVals, CLI.Ins, CLI.DL,
+ CLI.DAG, InVals);
+}
+
+// LowerCCCArguments - transform physical registers into virtual registers and
+// generate load operations for arguments placed on the stack.
+//
+// One SDValue per incoming argument is appended to InVals, in ArgLocs order.
+// For functions with the sret attribute, InVals[0] is additionally copied into
+// a virtual register recorded in LanaiMachineFunctionInfo. For variadic
+// functions, a fixed frame object at the next stack offset is recorded as the
+// varargs frame index. Returns the (possibly updated) chain.
+SDValue LanaiTargetLowering::LowerCCCArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ LanaiMachineFunctionInfo *LanaiMFI = MF.getInfo<LanaiMachineFunctionInfo>();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ if (CallConv == CallingConv::Fast) {
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Lanai32_Fast);
+ } else {
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Lanai32);
+ }
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc()) {
+ // Arguments passed in registers
+ EVT RegVT = VA.getLocVT();
+ switch (RegVT.getSimpleVT().SimpleTy) {
+ case MVT::i32: {
+ unsigned VReg = RegInfo.createVirtualRegister(&Lanai::GPRRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
+
+ // If this is an 8/16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+
+ InVals.push_back(ArgValue);
+ break;
+ }
+ default:
+ DEBUG(dbgs() << "LowerFormalArguments Unhandled argument type: "
+ << RegVT.getEVTString() << "\n");
+ llvm_unreachable("unhandled argument type");
+ }
+ } else {
+ // Sanity check
+ assert(VA.isMemLoc());
+ // Load the argument to a virtual register
+ unsigned ObjSize = VA.getLocVT().getSizeInBits() / 8;
+ // Check that the argument fits in stack slot
+ // NOTE(review): an oversized argument is only reported on errs();
+ // lowering then continues with that size.
+ if (ObjSize > 4) {
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << EVT(VA.getLocVT()).getEVTString() << "\n";
+ }
+ // Create the frame index object for this incoming parameter...
+ int FI = MFI.CreateFixedObject(ObjSize, VA.getLocMemOffset(), true);
+
+ // Create the SelectionDAG nodes corresponding to a load
+ // from this parameter
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ InVals.push_back(DAG.getLoad(
+ VA.getLocVT(), DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+ }
+ }
+
+ // The Lanai ABI for returning structs by value requires that we copy
+ // the sret argument into rv for the return. Save the argument into
+ // a virtual register so that we can access it from the return points.
+ if (MF.getFunction()->hasStructRetAttr()) {
+ unsigned Reg = LanaiMFI->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32));
+ LanaiMFI->setSRetReturnReg(Reg);
+ }
+ // InVals[0] is used as the sret pointer here.
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[0]);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+ }
+
+ if (IsVarArg) {
+ // Record the frame index of the first variable argument
+ // which is a value necessary to VASTART.
+ int FI = MFI.CreateFixedObject(4, CCInfo.getNextStackOffset(), true);
+ LanaiMFI->setVarArgsFrameIndex(FI);
+ }
+
+ return Chain;
+}
+
+// Lower a function return: copy every returned value into the register chosen
+// by RetCC_Lanai32, glue the copies together, and emit a LanaiISD::RET_FLAG
+// node whose operands are the chain, the registers written, and the glue.
+SDValue
+LanaiTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Locations assigned to the return values, filled in by the analysis below.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_Lanai32);
+
+ SDValue Glue;
+ // Operand 0 is the chain; it is overwritten once the final chain is known.
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+
+ // Copy the result values into the output registers.
+ for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
+ CCValAssign &Loc = RVLocs[Idx];
+ assert(Loc.isRegLoc() && "Can only return in registers!");
+
+ Chain = DAG.getCopyToReg(Chain, DL, Loc.getLocReg(), OutVals[Idx], Glue);
+ // Guarantee that all emitted copies are stuck together with flags.
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
+ }
+
+ // The Lanai ABI for returning structs by value requires that the sret
+ // argument be copied into rv for the return. The argument was saved into a
+ // virtual register in the entry block, so copy it out of there into rv now.
+ if (MF.getFunction()->hasStructRetAttr()) {
+ LanaiMachineFunctionInfo *LanaiMFI = MF.getInfo<LanaiMachineFunctionInfo>();
+ unsigned SRetReg = LanaiMFI->getSRetReturnReg();
+ assert(SRetReg &&
+ "SRetReturnReg should have been set in LowerFormalArguments().");
+ const auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue SRetVal = DAG.getCopyFromReg(Chain, DL, SRetReg, PtrVT);
+
+ Chain = DAG.getCopyToReg(Chain, DL, Lanai::RV, SRetVal, Glue);
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(Lanai::RV, PtrVT));
+ }
+
+ // Install the final chain and append the glue, if any copy produced one.
+ RetOps[0] = Chain;
+ if (Glue.getNode())
+ RetOps.push_back(Glue);
+
+ return DAG.getNode(LanaiISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+// LowerCCCCallTo - functions arguments are copied from virtual regs to
+// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+//
+// Steps, in order: assign a location to every outgoing argument, make local
+// copies of byval arguments, open the call sequence, emit register copies and
+// stack stores, build the LanaiISD::CALL node, close the call sequence, and
+// finally copy the results out via LowerCallResult (which fills InVals).
+SDValue LanaiTargetLowering::LowerCCCCallTo(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool IsVarArg,
+ bool /*IsTailCall*/, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+ // NumFixedArgs is the file-static read by CC_Lanai32_VarArg. It stays 0
+ // unless this is a variadic call whose callee is a known Function, in which
+ // case it is the callee's declared parameter count.
+ NumFixedArgs = 0;
+ if (IsVarArg && G) {
+ const Function *CalleeFn = dyn_cast<Function>(G->getGlobal());
+ if (CalleeFn)
+ NumFixedArgs = CalleeFn->getFunctionType()->getNumParams();
+ }
+ if (NumFixedArgs)
+ CCInfo.AnalyzeCallOperands(Outs, CC_Lanai32_VarArg);
+ else {
+ if (CallConv == CallingConv::Fast)
+ CCInfo.AnalyzeCallOperands(Outs, CC_Lanai32_Fast);
+ else
+ CCInfo.AnalyzeCallOperands(Outs, CC_Lanai32);
+ }
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // Create local copies for byval args.
+ SmallVector<SDValue, 8> ByValArgs;
+ for (unsigned I = 0, E = Outs.size(); I != E; ++I) {
+ ISD::ArgFlagsTy Flags = Outs[I].Flags;
+ if (!Flags.isByVal())
+ continue;
+
+ SDValue Arg = OutVals[I];
+ unsigned Size = Flags.getByValSize();
+ unsigned Align = Flags.getByValAlign();
+
+ int FI = MFI.CreateStackObject(Size, Align, false);
+ SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue SizeNode = DAG.getConstant(Size, DL, MVT::i32);
+
+ Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
+ /*IsVolatile=*/false,
+ /*AlwaysInline=*/false,
+ /*isTailCall=*/false, MachinePointerInfo(),
+ MachinePointerInfo());
+ ByValArgs.push_back(FIPtr);
+ }
+
+ Chain = DAG.getCALLSEQ_START(
+ Chain,
+ DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
+ DL);
+
+ SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+ SmallVector<SDValue, 12> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ // J indexes ByValArgs, which holds one entry per byval argument in order.
+ for (unsigned I = 0, J = 0, E = ArgLocs.size(); I != E; ++I) {
+ CCValAssign &VA = ArgLocs[I];
+ SDValue Arg = OutVals[I];
+ ISD::ArgFlagsTy Flags = Outs[I].Flags;
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ // Use local copy if it is a byval arg.
+ if (Flags.isByVal())
+ Arg = ByValArgs[J++];
+
+ // Arguments that can be passed on register must be kept at RegsToPass
+ // vector
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ // Read SP once, the first time a stack argument is encountered.
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, DL, Lanai::SP,
+ getPointerTy(DAG.getDataLayout()));
+
+ SDValue PtrOff =
+ DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()), StackPtr,
+ DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
+ }
+ }
+
+ // Transform all store nodes into one single node because all store nodes are
+ // independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ ArrayRef<SDValue>(&MemOpChains[0], MemOpChains.size()));
+
+ SDValue InFlag;
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain and
+ // flag operands which copy the outgoing args into registers. The InFlag is
+ // necessary since all emitted instructions must be stuck together.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+ RegsToPass[I].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ uint8_t OpFlag = LanaiII::MO_NO_FLAG;
+ if (G) {
+ Callee = DAG.getTargetGlobalAddress(
+ G->getGlobal(), DL, getPointerTy(DAG.getDataLayout()), 0, OpFlag);
+ } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetExternalSymbol(
+ E->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlag);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add a register mask operand representing the call-preserved registers.
+ // TODO: Should return-twice functions be handled?
+ const uint32_t *Mask =
+ TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
+ Ops.push_back(DAG.getRegister(RegsToPass[I].first,
+ RegsToPass[I].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(LanaiISD::CALL, DL, NodeTys,
+ ArrayRef<SDValue>(&Ops[0], Ops.size()));
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(
+ Chain,
+ DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
+ DAG.getConstant(0, DL, getPointerTy(DAG.getDataLayout()), true), InFlag,
+ DL);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals);
+}
+
+// LowerCallResult - Copy each value returned by a call out of the physical
+// register assigned to it by RetCC_Lanai32. The copied values are appended to
+// InVals in location order and the updated chain is returned.
+SDValue LanaiTargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ // Work out which register holds each returned value.
+ SmallVector<CCValAssign, 16> ResultLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ResultLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Lanai32);
+
+ for (const CCValAssign &Loc : ResultLocs) {
+ // Results of the glued copy-from-reg node: 0 = value, 1 = chain, 2 = glue.
+ // Chain is set to result 1, so the other results are reached through it.
+ Chain = DAG.getCopyFromReg(Chain, DL, Loc.getLocReg(), Loc.getValVT(),
+ InFlag)
+ .getValue(1);
+ InFlag = Chain.getValue(2);
+ InVals.push_back(Chain.getValue(0));
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom Lowerings
+//===----------------------------------------------------------------------===//
+
+// Translate the ISD integer condition code held by CC into the Lanai
+// condition code (LPCC::CondCode).
+//
+// RHS is taken by reference because two cases rewrite it: when RHS is the
+// constant 0xFFFFFFFF, SETGT becomes ICC_PL and SETLE becomes ICC_MI, and RHS
+// is replaced by the constant 0. SETLT / SETGE against the constant 0 map to
+// ICC_MI / ICC_PL without touching RHS. The floating-point-only condition
+// codes, and any unknown code, are unreachable.
+static LPCC::CondCode IntCondCCodeToICC(SDValue CC, const SDLoc &DL,
+ SDValue &RHS, SelectionDAG &DAG) {
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+
+ // For integer, only the SETEQ, SETNE, SETLT, SETLE, SETGT, SETGE, SETULT,
+ // SETULE, SETUGT, and SETUGE opcodes are used (see CodeGen/ISDOpcodes.h)
+ // and Lanai only supports integer comparisons, so only provide definitions
+ // for them.
+ switch (SetCCOpcode) {
+ case ISD::SETEQ:
+ return LPCC::ICC_EQ;
+ case ISD::SETGT:
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+ if (RHSC->getZExtValue() == 0xFFFFFFFF) {
+ // X > -1 -> X >= 0 -> is_plus(X)
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return LPCC::ICC_PL;
+ }
+ return LPCC::ICC_GT;
+ case ISD::SETUGT:
+ return LPCC::ICC_UGT;
+ case ISD::SETLT:
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+ if (RHSC->getZExtValue() == 0)
+ // X < 0 -> is_minus(X)
+ return LPCC::ICC_MI;
+ return LPCC::ICC_LT;
+ case ISD::SETULT:
+ return LPCC::ICC_ULT;
+ case ISD::SETLE:
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+ if (RHSC->getZExtValue() == 0xFFFFFFFF) {
+ // X <= -1 -> X < 0 -> is_minus(X)
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return LPCC::ICC_MI;
+ }
+ return LPCC::ICC_LE;
+ case ISD::SETULE:
+ return LPCC::ICC_ULE;
+ case ISD::SETGE:
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+ if (RHSC->getZExtValue() == 0)
+ // X >= 0 -> is_plus(X)
+ return LPCC::ICC_PL;
+ return LPCC::ICC_GE;
+ case ISD::SETUGE:
+ return LPCC::ICC_UGE;
+ case ISD::SETNE:
+ return LPCC::ICC_NE;
+ // Ordered/unordered floating-point comparisons are not supported.
+ case ISD::SETONE:
+ case ISD::SETUNE:
+ case ISD::SETOGE:
+ case ISD::SETOLE:
+ case ISD::SETOLT:
+ case ISD::SETOGT:
+ case ISD::SETOEQ:
+ case ISD::SETUEQ:
+ case ISD::SETO:
+ case ISD::SETUO:
+ llvm_unreachable("Unsupported comparison.");
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ }
+}
+
+// Lower BR_CC into a LanaiISD::SET_FLAG comparison whose glue result feeds a
+// LanaiISD::BR_CC node.
+SDValue LanaiTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ // BR_CC operands: 0 = chain, 1 = condition code, 2 = LHS, 3 = RHS, 4 = dest.
+ SDValue CmpLHS = Op.getOperand(2);
+ // Held in a local because IntCondCCodeToICC may rewrite it.
+ SDValue CmpRHS = Op.getOperand(3);
+
+ const LPCC::CondCode LanaiCC =
+ IntCondCCodeToICC(Op.getOperand(1), DL, CmpRHS, DAG);
+ SDValue CCOperand = DAG.getConstant(LanaiCC, DL, MVT::i32);
+ SDValue CmpGlue = DAG.getNode(LanaiISD::SET_FLAG, DL, MVT::Glue, CmpLHS,
+ CmpRHS, CCOperand);
+
+ return DAG.getNode(LanaiISD::BR_CC, DL, Op.getValueType(), Op.getOperand(0),
+ Op.getOperand(4), CCOperand, CmpGlue);
+}
+
+// Custom lowering of an i32 multiplication by a constant into a sequence of
+// shifts, adds and subtracts derived from the non-adjacent form (NAF) of the
+// constant. Returns an empty SDValue when the node is not an i32 multiply by a
+// constant, or when the estimated instruction count exceeds
+// LanaiLowerConstantMulThreshold.
+SDValue LanaiTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op->getValueType(0);
+ if (VT != MVT::i32)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!C)
+ return SDValue();
+
+ int64_t MulAmt = C->getSExtValue();
+ int32_t HighestOne = -1;
+ // Signed so that "2 * NonzeroEntries - 1" below is plainly -1 for a zero
+ // multiplier instead of relying on unsigned wraparound.
+ int32_t NonzeroEntries = 0;
+ int SignedDigit[32] = {0};
+ const unsigned NumDigits = sizeof(SignedDigit) / sizeof(SignedDigit[0]);
+
+ // Convert to non-adjacent form (NAF) signed-digit representation.
+ // NAF is a signed-digit form where no adjacent digits are non-zero. It is the
+ // minimal Hamming weight representation of a number (on average 1/3 of the
+ // digits will be non-zero vs 1/2 for regular binary representation). And as
+ // the non-zero digits will be the only digits contributing to the instruction
+ // count, this is desirable. The next loop converts it to NAF (following the
+ // approach in 'Guide to Elliptic Curve Cryptography' [ISBN: 038795273X]) by
+ // choosing the non-zero coefficients such that the resulting quotient is
+ // divisible by 2 which will cause the next coefficient to be zero.
+ int64_t E = std::abs(MulAmt);
+ int S = (MulAmt < 0 ? -1 : 1);
+ int DigitIdx = 0;
+ while (E > 0) {
+ int ZI = 0;
+ if (E % 2 == 1) {
+ // E % 4 is 1 or 3 here, giving a digit of +1 or -1 respectively.
+ ZI = 2 - (E % 4);
+ if (ZI != 0)
+ ++NonzeroEntries;
+ }
+ SignedDigit[DigitIdx] = S * ZI;
+ if (SignedDigit[DigitIdx] == 1)
+ HighestOne = DigitIdx;
+ E = (E - ZI) / 2;
+ ++DigitIdx;
+ }
+
+ // Compute number of instructions required. Due to differences in lowering
+ // between the different processors this count is not exact.
+ // Start by assuming a shift and a add/sub for every non-zero entry (hence
+ // every non-zero entry requires 1 shift and 1 add/sub except for the first
+ // entry).
+ int32_t InstrRequired = 2 * NonzeroEntries - 1;
+ // Correct possible over-adding due to shift by 0 (which is not emitted).
+ if (std::abs(MulAmt) % 2 == 1)
+ --InstrRequired;
+ // Return if the form generated would exceed the instruction threshold.
+ if (InstrRequired > LanaiLowerConstantMulThreshold)
+ return SDValue();
+
+ SDValue Res;
+ SDLoc DL(Op);
+ SDValue V = Op->getOperand(0);
+
+ // Initialize the running sum. Set the running sum to the maximal shifted
+ // positive value (i.e., largest i such that zi == 1 and MulAmt has V<<i as a
+ // term NAF).
+ if (HighestOne == -1)
+ Res = DAG.getConstant(0, DL, MVT::i32);
+ else {
+ Res = DAG.getNode(ISD::SHL, DL, VT, V,
+ DAG.getConstant(HighestOne, DL, MVT::i32));
+ // Already accounted for in Res; clear it so the loop below skips it.
+ SignedDigit[HighestOne] = 0;
+ }
+
+ // Assemble multiplication from shift, add, sub using NAF form and running
+ // sum.
+ for (unsigned Idx = 0; Idx < NumDigits; ++Idx) {
+ if (SignedDigit[Idx] == 0)
+ continue;
+
+ // Shifted multiplicand (v<<i).
+ SDValue Shifted =
+ DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Idx, DL, MVT::i32));
+ if (SignedDigit[Idx] == 1)
+ Res = DAG.getNode(ISD::ADD, DL, VT, Res, Shifted);
+ else if (SignedDigit[Idx] == -1)
+ Res = DAG.getNode(ISD::SUB, DL, VT, Res, Shifted);
+ }
+ return Res;
+}
+
+SDValue LanaiTargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+  // Lower a set-on-condition with carry-in: a SUBBF of the two operands and
+  // the carry produces the glue, which a SETCC on the translated condition
+  // code then consumes.
+  SDLoc DL(Op);
+  SDValue Lhs = Op.getOperand(0);
+  SDValue Rhs = Op.getOperand(1);
+  SDValue CarryIn = Op.getOperand(2);
+  SDValue CondCode = Op.getOperand(3);
+
+  // Rhs is handed over as a named value and read again afterwards.
+  LPCC::CondCode LanaiCC = IntCondCCodeToICC(CondCode, DL, Rhs, DAG);
+  SDValue TargetCC = DAG.getConstant(LanaiCC, DL, MVT::i32);
+  SDValue Glue =
+      DAG.getNode(LanaiISD::SUBBF, DL, MVT::Glue, Lhs, Rhs, CarryIn);
+  return DAG.getNode(LanaiISD::SETCC, DL, Op.getValueType(), TargetCC, Glue);
+}
+
+SDValue LanaiTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  // Lower a set-on-condition: a SET_FLAG compare of the two operands
+  // produces the glue, which a SETCC on the translated condition code then
+  // consumes.
+  SDLoc DL(Op);
+  SDValue Lhs = Op.getOperand(0);
+  SDValue Rhs = Op.getOperand(1);
+  SDValue CondCode = Op.getOperand(2);
+
+  // Rhs is handed over as a named value and read again afterwards.
+  LPCC::CondCode LanaiCC = IntCondCCodeToICC(CondCode, DL, Rhs, DAG);
+  SDValue TargetCC = DAG.getConstant(LanaiCC, DL, MVT::i32);
+  SDValue Glue =
+      DAG.getNode(LanaiISD::SET_FLAG, DL, MVT::Glue, Lhs, Rhs, TargetCC);
+
+  return DAG.getNode(LanaiISD::SETCC, DL, Op.getValueType(), TargetCC, Glue);
+}
+
+SDValue LanaiTargetLowering::LowerSELECT_CC(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // Lower a compare-and-select: a SET_FLAG compare of operands 0 and 1
+  // produces the glue, and a LanaiISD::SELECT_CC picks between operands 2
+  // and 3 according to the translated condition code.
+  SDLoc DL(Op);
+  SDValue Lhs = Op.getOperand(0);
+  SDValue Rhs = Op.getOperand(1);
+  SDValue IfTrue = Op.getOperand(2);
+  SDValue IfFalse = Op.getOperand(3);
+  SDValue CondCode = Op.getOperand(4);
+
+  // Rhs is handed over as a named value and read again afterwards.
+  LPCC::CondCode LanaiCC = IntCondCCodeToICC(CondCode, DL, Rhs, DAG);
+  SDValue TargetCC = DAG.getConstant(LanaiCC, DL, MVT::i32);
+  SDValue Glue =
+      DAG.getNode(LanaiISD::SET_FLAG, DL, MVT::Glue, Lhs, Rhs, TargetCC);
+
+  // The select yields the chosen value plus a glue result.
+  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  return DAG.getNode(LanaiISD::SELECT_CC, DL, ResultTys, IfTrue, IfFalse,
+                     TargetCC, Glue);
+}
+
+SDValue LanaiTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  // va_start only has to store the address of the VarArgsFrameIndex slot
+  // into the memory location passed as operand 1.
+  SDLoc DL(Op);
+  LanaiMachineFunctionInfo *FuncInfo =
+      DAG.getMachineFunction().getInfo<LanaiMachineFunctionInfo>();
+
+  SDValue VarArgsSlot = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                          getPointerTy(DAG.getDataLayout()));
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue Destination = Op.getOperand(1);
+  // Operand 2 carries the IR value used to describe the stored-to memory.
+  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Chain, DL, VarArgsSlot, Destination,
+                      MachinePointerInfo(SrcVal));
+}
+
+SDValue LanaiTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+
+  // Read the current stack pointer.
+  unsigned SPReg = getStackPointerRegisterToSaveRestore();
+  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i32);
+
+  // The new stack pointer is the old one minus the requested size.
+  SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i32, OldSP, Size);
+
+  // On Lanai the outgoing memory argument area sits on top of the alloca
+  // area, i.e. at a lower address than it. The alloca area is therefore
+  // moved down the stack by adding back the space reserved for outgoing
+  // arguments.
+  //
+  // The size of the outgoing argument area is not known yet, so a pseudo
+  // instruction ADJDYNALLOC stands in for the adjustment. It is replaced
+  // with one that has the correct, known offset in emitPrologue().
+  SDValue AdjustedAddr =
+      DAG.getNode(LanaiISD::ADJDYNALLOC, DL, MVT::i32, NewSP);
+
+  // NewSP is the new start of the stack, so write it back to the stack
+  // pointer register.
+  SDValue OutChain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+
+  // Results: the adjusted allocation address and the updated chain.
+  SDValue Results[2] = {AdjustedAddr, OutChain};
+  return DAG.getMergeValues(Results, DL);
+}
+
+SDValue LanaiTargetLowering::LowerRETURNADDR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getFrameInfo().setReturnAddressIsTaken(true);
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  const unsigned Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  if (Depth == 0) {
+    // The current function's return address is in the link register; mark
+    // that register as an implicit live-in and copy it out.
+    unsigned Reg =
+        MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32));
+    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+  }
+
+  // For an outer frame, load the word stored at (frame address - 4), where
+  // the frame address is the one LowerFRAMEADDR computes for this depth.
+  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+  const unsigned Offset = -4;
+  SDValue Slot = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
+                             DAG.getIntPtrConstant(Offset, DL));
+  return DAG.getLoad(VT, DL, DAG.getEntryNode(), Slot, MachinePointerInfo());
+}
+
+SDValue LanaiTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  // Depth 0 is simply the current frame pointer register.
+  SDValue Current = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Lanai::FP, VT);
+
+  // For each further level, follow the link stored at (frame address - 8).
+  unsigned Remaining = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  for (; Remaining != 0; --Remaining) {
+    const unsigned Offset = -8;
+    SDValue Link = DAG.getNode(ISD::ADD, DL, VT, Current,
+                               DAG.getIntPtrConstant(Offset, DL));
+    Current =
+        DAG.getLoad(VT, DL, DAG.getEntryNode(), Link, MachinePointerInfo());
+  }
+  return Current;
+}
+
+// Return the printable name of a LanaiISD opcode, or a null pointer when the
+// opcode is not one of the Lanai-specific nodes listed here.
+const char *LanaiTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  case LanaiISD::ADJDYNALLOC:
+    return "LanaiISD::ADJDYNALLOC";
+  case LanaiISD::RET_FLAG:
+    return "LanaiISD::RET_FLAG";
+  case LanaiISD::CALL:
+    return "LanaiISD::CALL";
+  case LanaiISD::SELECT_CC:
+    return "LanaiISD::SELECT_CC";
+  case LanaiISD::SETCC:
+    return "LanaiISD::SETCC";
+  case LanaiISD::SUBBF:
+    return "LanaiISD::SUBBF";
+  case LanaiISD::SET_FLAG:
+    return "LanaiISD::SET_FLAG";
+  case LanaiISD::BR_CC:
+    return "LanaiISD::BR_CC";
+  case LanaiISD::Wrapper:
+    return "LanaiISD::Wrapper";
+  case LanaiISD::HI:
+    return "LanaiISD::HI";
+  case LanaiISD::LO:
+    return "LanaiISD::LO";
+  case LanaiISD::SMALL:
+    return "LanaiISD::SMALL";
+  default:
+    // Not a Lanai target node (this also covers FIRST_NUMBER).
+    return nullptr;
+  }
+}
+
+SDValue LanaiTargetLowering::LowerConstantPool(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  const Constant *PoolVal = CP->getConstVal();
+  const LanaiTargetObjectFile *ObjFile =
+      static_cast<const LanaiTargetObjectFile *>(
+          getTargetMachine().getObjFileLowering());
+
+  // With the small code model, or when the constant lands in the small
+  // section, the address is assumed to fit in 21 bits and is formed as
+  // (or R0, SMALL(addr)).
+  if (getTargetMachine().getCodeModel() == CodeModel::Small ||
+      ObjFile->isConstantInSmallSection(DAG.getDataLayout(), PoolVal)) {
+    SDValue Small =
+        DAG.getTargetConstantPool(PoolVal, MVT::i32, CP->getAlignment(),
+                                  CP->getOffset(), LanaiII::MO_NO_FLAG);
+    return DAG.getNode(ISD::OR, DL, MVT::i32,
+                       DAG.getRegister(Lanai::R0, MVT::i32),
+                       DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
+  }
+
+  // Otherwise build the address from its HI and LO parts and OR them.
+  SDValue HiSym =
+      DAG.getTargetConstantPool(PoolVal, MVT::i32, CP->getAlignment(),
+                                CP->getOffset(), LanaiII::MO_ABS_HI);
+  SDValue LoSym =
+      DAG.getTargetConstantPool(PoolVal, MVT::i32, CP->getAlignment(),
+                                CP->getOffset(), LanaiII::MO_ABS_LO);
+  SDValue Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, HiSym);
+  SDValue Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, LoSym);
+  return DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+}
+
+SDValue LanaiTargetLowering::LowerGlobalAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GA->getGlobal();
+  int64_t Offset = GA->getOffset();
+
+  const LanaiTargetObjectFile *ObjFile =
+      static_cast<const LanaiTargetObjectFile *>(
+          getTargetMachine().getObjFileLowering());
+
+  // When the object file lowering reports that the global lives in the small
+  // section, the address is assumed to fit in 21 bits and is formed as
+  // (or R0, SMALL(addr)).
+  const GlobalObject *GO = GV->getBaseObject();
+  if (ObjFile->isGlobalInSmallSection(GO, getTargetMachine())) {
+    SDValue Small = DAG.getTargetGlobalAddress(
+        GV, DL, getPointerTy(DAG.getDataLayout()), Offset, LanaiII::MO_NO_FLAG);
+    return DAG.getNode(ISD::OR, DL, MVT::i32,
+                       DAG.getRegister(Lanai::R0, MVT::i32),
+                       DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
+  }
+
+  // Otherwise create the HI and LO TargetGlobalAddress nodes, folding the
+  // constant offset into each, and OR the two halves together.
+  SDValue HiSym = DAG.getTargetGlobalAddress(
+      GV, DL, getPointerTy(DAG.getDataLayout()), Offset, LanaiII::MO_ABS_HI);
+  SDValue LoSym = DAG.getTargetGlobalAddress(
+      GV, DL, getPointerTy(DAG.getDataLayout()), Offset, LanaiII::MO_ABS_LO);
+  SDValue Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, HiSym);
+  SDValue Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, LoSym);
+  return DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+}
+
+// Lower a block address into (or HI(addr), LO(addr)), where the two halves
+// are target block-address nodes tagged MO_ABS_HI and MO_ABS_LO.
+SDValue LanaiTargetLowering::LowerBlockAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
+  const BlockAddress *BA = BASDN->getBlockAddress();
+  int64_t Offset = BASDN->getOffset();
+
+  uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
+  uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
+
+  // SelectionDAG::getBlockAddress takes (BA, VT, Offset, isTarget,
+  // TargetFlags). Passing (BA, VT, true, Flag) therefore set Offset to 1 and
+  // dropped the HI/LO flag. getTargetBlockAddress takes the offset and the
+  // flag explicitly and always builds a target node.
+  SDValue Hi = DAG.getTargetBlockAddress(BA, MVT::i32, Offset, OpFlagHi);
+  SDValue Lo = DAG.getTargetBlockAddress(BA, MVT::i32, Offset, OpFlagLo);
+  Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
+  Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
+  return DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+}
+
+SDValue LanaiTargetLowering::LowerJumpTable(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  JumpTableSDNode *Table = cast<JumpTableSDNode>(Op);
+
+  // With the small code model the address is assumed to fit in 21 bits and
+  // is formed as (or R0, SMALL(addr)).
+  if (getTargetMachine().getCodeModel() == CodeModel::Small) {
+    SDValue Small =
+        DAG.getTargetJumpTable(Table->getIndex(),
+                               getPointerTy(DAG.getDataLayout()),
+                               LanaiII::MO_NO_FLAG);
+    return DAG.getNode(ISD::OR, DL, MVT::i32,
+                       DAG.getRegister(Lanai::R0, MVT::i32),
+                       DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
+  }
+
+  // Otherwise build the address from its HI and LO parts and OR them.
+  SDValue HiSym = DAG.getTargetJumpTable(
+      Table->getIndex(), getPointerTy(DAG.getDataLayout()), LanaiII::MO_ABS_HI);
+  SDValue LoSym = DAG.getTargetJumpTable(
+      Table->getIndex(), getPointerTy(DAG.getDataLayout()), LanaiII::MO_ABS_LO);
+  SDValue Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, HiSym);
+  SDValue Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, LoSym);
+  return DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+}
+
+SDValue LanaiTargetLowering::LowerSHL_PARTS(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  const unsigned BitWidth = VT.getSizeInBits();
+  SDLoc DL(Op);
+  assert(Op.getNumOperands() == 3 && "Unexpected SHL!");
+  SDValue LoIn = Op.getOperand(0);
+  SDValue HiIn = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+
+  // Computes (LoIn + (HiIn << 32)) << Amt as two halves:
+  //   Spill    = (Amt == 0) ? 0 : (LoIn >> (32 - Amt))
+  //   HiShift  = HiIn << Amt
+  //   Hi       = (Amt >= 32) ? (LoIn << (Amt - 32)) : (Spill | HiShift)
+  //   Lo       = (Amt >= 32) ? 0 : (LoIn << Amt)
+  //   result   = (Hi << 32) | Lo
+
+  // Bits of the low word that spill over into the high word.
+  SDValue WidthMinusAmt =
+      DAG.getNode(ISD::SUB, DL, MVT::i32,
+                  DAG.getConstant(BitWidth, DL, MVT::i32), Amt);
+  SDValue Spill = DAG.getNode(ISD::SRL, DL, VT, LoIn, WidthMinusAmt);
+
+  // For Amt == 0 the line above computed "(SRL LoIn, 32)", which is "undef".
+  // Zero is what is wanted there, so select it explicitly.
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+  SDValue AmtIsZero = DAG.getSetCC(DL, MVT::i32, Amt, Zero, ISD::SETEQ);
+  Spill = DAG.getSelect(DL, MVT::i32, AmtIsZero, Zero, Spill);
+
+  SDValue AmtMinusWidth =
+      DAG.getNode(ISD::SUB, DL, MVT::i32, Amt,
+                  DAG.getConstant(BitWidth, DL, MVT::i32));
+  SDValue HiShift = DAG.getNode(ISD::SHL, DL, VT, HiIn, Amt);
+  SDValue HiIfSmall = DAG.getNode(ISD::OR, DL, VT, Spill, HiShift);
+
+  SDValue HiIfBig = DAG.getNode(ISD::SHL, DL, VT, LoIn, AmtMinusWidth);
+
+  // A "big" shift is one by at least the width of one part.
+  SDValue IsBigShift =
+      DAG.getSetCC(DL, MVT::i32, AmtMinusWidth, Zero, ISD::SETGE);
+  SDValue Hi = DAG.getSelect(DL, MVT::i32, IsBigShift, HiIfBig, HiIfSmall);
+
+  // Lanai wraps shift amounts larger than the register size instead of
+  // clamping them, so "lo << b" alone is wrong when b is too big.
+  SDValue LoIfSmall = DAG.getNode(ISD::SHL, DL, VT, LoIn, Amt);
+  SDValue Lo = DAG.getSelect(DL, MVT::i32, IsBigShift,
+                             DAG.getConstant(0, DL, MVT::i32), LoIfSmall);
+
+  SDValue Parts[2] = {Lo, Hi};
+  return DAG.getMergeValues(Parts, DL);
+}
+
+SDValue LanaiTargetLowering::LowerSRL_PARTS(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  const unsigned BitWidth = VT.getSizeInBits();
+  SDLoc DL(Op);
+  SDValue LoIn = Op.getOperand(0);
+  SDValue HiIn = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+
+  // Computes a >> b on a value split into two 32-bit halves:
+  //   unsigned r_high = a_high >> b;
+  //   r_high = (32 - b <= 0) ? 0 : r_high;
+  //
+  //   unsigned r_low = a_low >> b;
+  //   r_low = (32 - b <= 0) ? r_high : r_low;
+  //   r_low = (b == 0) ? r_low : r_low | (a_high << (32 - b));
+  //   return (unsigned long long)r_high << 32 | r_low;
+  // Note: Lanai's shift behavior is relied upon so that the shift amount
+  // does not need to be masked.
+
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+  SDValue WidthMinusAmt =
+      DAG.getNode(ISD::SUB, DL, MVT::i32,
+                  DAG.getConstant(BitWidth, DL, MVT::i32), Amt);
+  SDValue IsBigShift =
+      DAG.getSetCC(DL, MVT::i32, WidthMinusAmt, Zero, ISD::SETLE);
+
+  // High half: plain shift, forced to zero for a big shift.
+  SDValue Hi = DAG.getNode(ISD::SRL, DL, MVT::i32, HiIn, Amt);
+  Hi = DAG.getSelect(DL, MVT::i32, IsBigShift, Zero, Hi);
+
+  // Low half: plain shift, replaced by the (already selected) high result
+  // for a big shift.
+  SDValue Lo = DAG.getNode(ISD::SRL, DL, MVT::i32, LoIn, Amt);
+  Lo = DAG.getSelect(DL, MVT::i32, IsBigShift, Hi, Lo);
+
+  // Unless the shift amount is zero, OR in the bits carried down from the
+  // high input word.
+  SDValue Carried = DAG.getNode(ISD::SHL, DL, MVT::i32, HiIn, WidthMinusAmt);
+  SDValue AmtIsZero = DAG.getSetCC(DL, MVT::i32, Amt, Zero, ISD::SETEQ);
+  Lo = DAG.getSelect(DL, MVT::i32, AmtIsZero, Lo,
+                     DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Carried));
+
+  SDValue Parts[2] = {Lo, Hi};
+  return DAG.getMergeValues(Parts, DL);
+}
+
+// Returns true when N is the all-ones constant (AllOnes set) or the null
+// constant (AllOnes clear).
+static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
+  if (AllOnes)
+    return isAllOnesConstant(N);
+  return isNullConstant(N);
+}
+
+// Return true if N is conditionally 0 or all ones.
+// The recognized shapes, with cc an i1 value, are:
+//
+//   (select cc 0, y)   [AllOnes=0]
+//   (select cc y, 0)   [AllOnes=0]
+//   (zext cc)          [AllOnes=0]
+//   (sext cc)          [AllOnes=0/1]
+//   (select cc -1, y)  [AllOnes=1]
+//   (select cc y, -1)  [AllOnes=1]
+//
+// * AllOnes chooses the constant looked for: all zero when false, all ones
+//   when true.
+// * CC receives the condition operand.
+// * Invert is set when N is the all zero/ones constant when CC is false.
+// * OtherOp receives the alternative value of N.
+//
+// For example, for (select cc X, Y) and AllOnes = 0 if:
+// * X = 0, Invert = False and OtherOp = Y
+// * Y = 0, Invert = True and OtherOp = X
+static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC,
+                                       bool &Invert, SDValue &OtherOp,
+                                       SelectionDAG &DAG) {
+  const unsigned Opcode = N->getOpcode();
+
+  if (Opcode == ISD::SELECT) {
+    CC = N->getOperand(0);
+    SDValue TrueArm = N->getOperand(1);
+    SDValue FalseArm = N->getOperand(2);
+    // The true arm is checked first, so it wins if both arms match.
+    if (isZeroOrAllOnes(TrueArm, AllOnes)) {
+      Invert = false;
+      OtherOp = FalseArm;
+      return true;
+    }
+    if (isZeroOrAllOnes(FalseArm, AllOnes)) {
+      Invert = true;
+      OtherOp = TrueArm;
+      return true;
+    }
+    return false;
+  }
+
+  if (Opcode == ISD::ZERO_EXTEND) {
+    // (zext cc) can never be the all ones value.
+    if (AllOnes)
+      return false;
+    CC = N->getOperand(0);
+    if (CC.getValueType() != MVT::i1)
+      return false;
+    SDLoc Loc(N);
+    EVT VT = N->getValueType(0);
+    // The value is 0 when cc is false and 1 otherwise.
+    OtherOp = DAG.getConstant(1, Loc, VT);
+    Invert = true;
+    return true;
+  }
+
+  if (Opcode == ISD::SIGN_EXTEND) {
+    CC = N->getOperand(0);
+    if (CC.getValueType() != MVT::i1)
+      return false;
+    SDLoc Loc(N);
+    EVT VT = N->getValueType(0);
+    Invert = !AllOnes;
+    // An sext of i1 is either 0 or all ones, so the 'other' value is
+    // whichever of the two is not being looked for.
+    if (AllOnes)
+      OtherOp = DAG.getConstant(0, Loc, VT);
+    else
+      OtherOp =
+          DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), Loc, VT);
+    return true;
+  }
+
+  return false;
+}
+
+// Fold a select with a constant arm into the operation that uses it:
+//
+//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
+//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
+//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
+//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
+//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
+//
+// Nothing is done (an empty SDValue is returned) unless the select has a
+// constant operand that is null, or all ones when AllOnes is set.
+//
+// sext/zext from i1 are recognized as well:
+//
+//   (add (zext cc), x) -> (select cc (add x, 1), x)
+//   (add (sext cc), x) -> (select cc (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
+static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   bool AllOnes) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Condition;
+  SDValue VariableArm;
+  bool ConstantOnFalse;
+  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, Condition,
+                                  ConstantOnFalse, VariableArm, DAG))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // When the condition holds, Slct is the identity constant, so the result
+  // is just OtherOp; otherwise the operation is applied to the other arm.
+  SDValue WhenTrue = OtherOp;
+  SDValue WhenFalse =
+      DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, VariableArm);
+  // If the constant sits on the false side instead, the roles are swapped.
+  if (ConstantOnFalse)
+    std::swap(WhenTrue, WhenFalse);
+
+  return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Condition, WhenTrue,
+                     WhenFalse);
+}
+
+// Try combineSelectAndUse with each operand of the commutative operator N
+// playing the role of the select, operand 0 first. Only single-use operands
+// are considered.
+static SDValue
+combineSelectAndUseCommutative(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                               bool AllOnes) {
+  SDValue Operands[2] = {N->getOperand(0), N->getOperand(1)};
+  for (unsigned Idx = 0; Idx != 2; ++Idx) {
+    SDValue Candidate = Operands[Idx];
+    SDValue Other = Operands[1 - Idx];
+    if (!Candidate.getNode()->hasOneUse())
+      continue;
+    if (SDValue Result =
+            combineSelectAndUse(N, Candidate, Other, DCI, AllOnes))
+      return Result;
+  }
+  return SDValue();
+}
+
+// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
+// Only the subtrahend is tried as the select, as SUB is not commutative.
+static SDValue PerformSUBCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Minuend = N->getOperand(0);
+  SDValue Subtrahend = N->getOperand(1);
+
+  // The fold is only attempted when the subtrahend has a single use.
+  if (!Subtrahend.getNode()->hasOneUse())
+    return SDValue();
+
+  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+  if (SDValue Folded = combineSelectAndUse(N, Subtrahend, Minuend, DCI,
+                                           /*AllOnes=*/false))
+    return Folded;
+
+  return SDValue();
+}
+
+SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  // Dispatch on the opcode; an empty SDValue means "no combine performed".
+  switch (N->getOpcode()) {
+  case ISD::AND:
+    // The identity for AND is all ones.
+    return combineSelectAndUseCommutative(N, DCI, /*AllOnes=*/true);
+  case ISD::ADD:
+  case ISD::OR:
+  case ISD::XOR:
+    // The identity for these is zero.
+    return combineSelectAndUseCommutative(N, DCI, /*AllOnes=*/false);
+  case ISD::SUB:
+    return PerformSUBCombine(N, DCI);
+  default:
+    return SDValue();
+  }
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
new file mode 100644
index 000000000000..c2fba4f9d167
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
@@ -0,0 +1,149 @@
+//===-- LanaiISelLowering.h - Lanai DAG Lowering Interface -....-*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Lanai uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIISELLOWERING_H
+#define LLVM_LIB_TARGET_LANAI_LANAIISELLOWERING_H
+
+#include "Lanai.h"
+#include "LanaiRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace LanaiISD {
+enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ ADJDYNALLOC,
+
+ // Return with a flag operand. Operand 0 is the chain operand.
+ RET_FLAG,
+
+ // CALL - These operations represent an abstract call instruction, which
+ // includes a bunch of information.
+ CALL,
+
+ // SELECT_CC - Operand 0 and operand 1 are the values selected between,
+ // operand 2 is the condition code and operand 3 is the flag operand.
+ SELECT_CC,
+
+ // SETCC - Store the conditional code to a register.
+ SETCC,
+
+ // SET_FLAG - Set flag compare.
+ SET_FLAG,
+
+ // SUBBF - Subtract with borrow that sets flags.
+ SUBBF,
+
+ // BR_CC - Used to glue together a conditional branch and comparison
+ BR_CC,
+
+ // Wrapper - A wrapper node for TargetConstantPool, TargetExternalSymbol,
+ // and TargetGlobalAddress.
+ Wrapper,
+
+ // Get the Higher/Lower 16 bits from a 32-bit immediate.
+ HI,
+ LO,
+
+ // Small 21-bit immediate in global memory.
+ SMALL
+};
+} // namespace LanaiISD
+
+class LanaiSubtarget;
+
+class LanaiTargetLowering : public TargetLowering {
+public:
+ LanaiTargetLowering(const TargetMachine &TM, const LanaiSubtarget &STI);
+
+ // LowerOperation - Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ // getTargetNodeName - This method returns the name of a target specific
+ // DAG node, or a null pointer for an opcode it does not recognize.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+
+ unsigned getRegisterByName(const char *RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &Info,
+ const char *Constraint) const override;
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+private:
+ SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool IsVarArg,
+ bool IsTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ const LanaiRegisterInfo *TRI; // Queried for the return-address register.
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td b/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td
new file mode 100644
index 000000000000..30289ea4ac0b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td
@@ -0,0 +1,561 @@
+//===- LanaiInstrFormats.td - Lanai Instruction Formats ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class InstLanai<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+ let Size = 4;
+
+ let Namespace = "Lanai";
+ let DecoderNamespace = "Lanai";
+
+ bits<4> Opcode;
+ let Inst{31 - 28} = Opcode;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+}
+
+//------------------------------------------------------------------------------
+// Register Immediate (RI)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |0.A.A.A| . . . . | . . . . |F.H| . . . . . . . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd Rs1 constant (16)
+//
+// Action:
+// Rd <- Rs1 op constant
+//
+// Except for shift instructions, `H' determines whether the constant
+// is in the high (1) or low (0) word. The other halfword is 0x0000,
+// except for the `AND' instruction (`AAA' = 100), for which the other
+// halfword is 0xFFFF, and shifts (`AAA' = 111), for which the constant is
+// sign extended.
+//
+// `F' determines whether the instruction modifies (1) or does not
+// modify (0) the program flags.
+//
+// `AAA' specifies the operation: `add' (000), `addc' (001), `sub'
+// (010), `subb' (011), `and' (100), `or' (101), `xor' (110), or `shift'
+// (111). For the shift, `H' specifies a logical (0) or arithmetic (1)
+// shift. The amount and direction of the shift are determined by the
+// sign extended constant interpreted as a two's complement number. The
+// shift operation is defined only for the range of:
+// 31 ... 0 -1 ... -31
+// \ / \ /
+// left right
+// shift shift
+//
+// If and only if the `F' bit is 1, RI instructions modify the
+// condition bits, `Z' (Zero), `N' (Negative), `V' (oVerflow), and `C'
+// (Carry), according to the result. If the flags are updated, they are
+// updated as follows:
+// `Z'
+// is set if the result is zero and cleared otherwise.
+//
+// `N'
+// is set to the most significant bit of the result.
+//
+// `V'
+// For arithmetic instructions (`add', `addc', `sub', `subb') `V' is
+// set if the sign (most significant) bits of the input operands are
+// the same but different from the sign bit of the result and cleared
+// otherwise. For other RI instructions, `V' is cleared.
+//
+// `C'
+// For arithmetic instructions, `C' is set/cleared if there is/is_not
+// a carry generated out of the most significant bit when performing the
+// twos-complement addition (`sub(a,b) == a + ~b + 1', `subb(a,b) ==
+// a + ~b + `C''). For left shifts, `C' is set to the least
+// significant bit discarded by the shift operation. For all other
+// operations, `C' is cleared.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has one shadow.
+//
+// The all-0s word is the instruction `R0 <- R0 + 0', which is a no-op.
+class InstRI<bits<3> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern>, Sched<[WriteALU]> {
+ let Itinerary = IIC_ALU;
+ bits<5> Rd;
+ bits<5> Rs1;
+ bit F;
+ bit H;
+ bits<16> imm16;
+
+ let Opcode{3} = 0;
+ let Opcode{2 - 0} = op;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = F;
+ let Inst{16} = H;
+ let Inst{15 - 0} = imm16;
+}
+
+//------------------------------------------------------------------------------
+// Register Register (RR)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.0.0| . . . . | . . . . |F.I| . . . . |B.B.B|J.J.J.J.J|D.D.D|
+// -----------------------------------------------------------------
+// opcode Rd Rs1 Rs2 \ operation /
+//
+// Action:
+// `Rd <- Rs1 op Rs2' iff condition DDDI is true.
+//
+// `DDDI' is as described for the BR instruction.
+//
+// `F' determines whether the instruction modifies (1) or does not
+// modify (0) the program flags.
+//
+// `BBB' determines the operation: `add' (000), `addc' (001), `sub'
+// (010), `subb' (011), `and' (100), `or' (101), `xor' (110), or "special"
+// (111). The `JJJJJ' field is irrelevant except for special.
+//
+// `JJJJJ' determines which special operation is performed. `10---'
+// is a logical shift, and `11---' is an arithmetic shift, and `00000' is
+// the SELECT operation. The amount and direction of the shift are
+// determined by the contents of `Rs2' interpreted as a two's complement
+// number (in the same way as shifts in the Register-Immediate
+// instructions in *Note RI::). For the SELECT operation, Rd gets Rs1 if
+// condition DDDI is true, Rs2 otherwise. All other `JJJJJ' combinations
+// are reserved for instructions that may be defined in the future.
+//
+// If the `F' bit is 1, RR instructions modify the condition bits, `Z'
+// (Zero), `N' (Negative), `V' (oVerflow), and `C' (Carry), according to
+// the result. All RR instructions modify the `Z', `N', and `V' flags.
+// Except for arithmetic instructions (`add', `addc', `sub', `subb'), `V'
+// is cleared. Only arithmetic instructions and shifts modify `C'. Right
+// shifts clear C.
+//
+// DDDI is as described in the table for the BR instruction and only used for
+// the select instruction.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has one shadow.
+class InstRR<bits<3> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern>, Sched<[WriteALU]> {
+ let Itinerary = IIC_ALU;
+ bits<5> Rd;
+ bits<5> Rs1;
+ bits<5> Rs2;
+ bit F;
+ bits<4> DDDI;
+ bits<5> JJJJJ;
+
+ let Opcode = 0b1100;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = F;
+ let Inst{16} = DDDI{0};
+ let Inst{15 - 11} = Rs2;
+ let Inst{10 - 8} = op;
+ let Inst{7 - 3} = JJJJJ;
+ let Inst{2 - 0} = DDDI{3 - 1};
+}
+
+//------------------------------------------------------------------------------
+// Register Memory (RM)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.0.0.S| . . . . | . . . . |P.Q| . . . . . . . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd Rs1 constant (16)
+//
+// Action:
+// Rd <- Memory(ea) (Load) see below for the
+// Memory(ea) <- Rd (Store) definition of ea.
+//
+// `S' determines whether the instruction is a Load (0) or a Store (1).
+// Loads appear in Rd one cycle after this instruction executes. If the
+// following instruction reads Rd, that instruction will be delayed by 1
+// clock cycle.
+//
+// PQ operation
+// -- ------------------------------------------
+// 00 ea = Rs1
+// 01 ea = Rs1, Rs1 <- Rs1 + constant
+// 10 ea = Rs1 + constant
+// 11 ea = Rs1 + constant, Rs1 <- Rs1 + constant
+//
+// The constant is sign-extended for this instruction.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has *two* delay slots.
+class InstRM<bit S, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> Rs1;
+ bit P;
+ bit Q;
+ bits<16> imm16;
+ // Dummy variables to allow multiclass definition of RM and RRM
+ bits<2> YL;
+ bit E;
+
+ let Opcode{3 - 1} = 0b100;
+ let Opcode{0} = S;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = P;
+ let Inst{16} = Q;
+ let Inst{15 - 0} = imm16;
+
+ let PostEncoderMethod = "adjustPqBitsRmAndRrm";
+}
+
+//------------------------------------------------------------------------------
+// Register Register Memory (RRM)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.0.1.S| . . . . | . . . . |P.Q| . . . . |B.B.B|J.J.J.J.J|Y.L.E|
+// -----------------------------------------------------------------
+// opcode Rd Rs1 Rs2 \ operation /
+//
+// Action:
+// Rd <- Memory(ea) (Load) see below for the
+// Memory(ea) <- Rd (Store) definition of ea.
+//
+// The RRM instruction is identical to the RM (*note RM::.) instruction
+// except that:
+//
+// 1. `Rs1 + constant' is replaced with `Rs1 op Rs2', where `op' is
+// determined in the same way as in the RR instruction (*note RR::.)
+// and
+//
+// 2. part-word memory accesses are allowed as specified below.
+//
+// If `BBB' != 111 (i.e.: For all but shift operations):
+// If `YLE' = 01- => fuLl-word memory access
+// If `YLE' = 00- => half-word memory access
+// If `YLE' = 10- => bYte memory access
+// If `YLE' = --1 => loads are zEro extended
+// If `YLE' = --0 => loads are sign extended
+//
+// If `BBB' = 111 (For shift operations):
+// fullword memory access are performed.
+//
+// All part-word loads write the least significant part of the
+// destination register with the higher-order bits zero- or sign-extended.
+// All part-word stores store the least significant part-word of the
+// source register in the destination memory location.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has *two* delay slots.
+class InstRRM<bit S, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> Rs1;
+ bits<5> Rs2;
+ bit P;
+ bit Q;
+ bits<3> BBB;
+ bits<5> JJJJJ;
+ bits<2> YL;
+ bit E;
+
+ let Opcode{3 - 1} = 0b101;
+ let Opcode{0} = S;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = P;
+ let Inst{16} = Q;
+ let Inst{15 - 11} = Rs2;
+ let Inst{10 - 8} = BBB;
+ let Inst{7 - 3} = JJJJJ;
+ let Inst{2 - 1} = YL;
+ let Inst{0} = E;
+
+ let PostEncoderMethod = "adjustPqBitsRmAndRrm";
+}
+
+//------------------------------------------------------------------------------
+// Conditional Branch (BR)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.0|D.D.D| . . . . . . . . . . . . . . . . . . . . . . |0.I|
+// -----------------------------------------------------------------
+// opcode condition constant (23)
+//
+// Action:
+// if (condition) { `pc' <- 4*(zero-extended constant) }
+//
+// The BR instruction is an absolute branch.
+// The constant is scaled as shown by its position in the instruction word such
+// that it specifies word-aligned addresses in the range [0,2^25-4]
+//
+// The `DDDI' field selects the condition that causes the branch to be taken.
+// (the `I' (Invert sense) bit inverts the sense of the condition):
+//
+// DDDI logical function [code, used for...]
+// ---- -------------------------------------- ------------------------
+// 0000 1 [T, true]
+// 0001 0 [F, false]
+// 0010 C AND Z' [HI, high]
+// 0011 C' OR Z [LS, low or same]
+// 0100 C' [CC, carry cleared]
+// 0101 C [CS, carry set]
+// 0110 Z' [NE, not equal]
+// 0111 Z [EQ, equal]
+// 1000 V' [VC, oVerflow cleared]
+// 1001 V [VS, oVerflow set]
+// 1010 N' [PL, plus]
+// 1011 N [MI, minus]
+// 1100 (N AND V) OR (N' AND V') [GE, greater than or equal]
+// 1101 (N AND V') OR (N' AND V) [LT, less than]
+// 1110 (N AND V AND Z') OR (N' AND V' AND Z') [GT, greater than]
+// 1111 (Z) OR (N AND V') OR (N' AND V) [LE, less than or equal]
+//
+// If the branch is not taken, the BR instruction is a no-op. If the branch is
+// taken, the processor starts executing instructions at the branch target
+// address *after* the processor has executed one more instruction. That is,
+// the branch has one “branch delay slot”. Be very careful if you find yourself
+// wanting to put a branch in a branch delay slot!
+class InstBR<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ let Itinerary = IIC_ALU;
+ bits<25> addr;
+ bits<4> DDDI;
+
+ let Opcode = 0b1110;
+ let Inst{27 - 25} = DDDI{3 - 1};
+ let Inst{24 - 0} = addr;
+ // These instructions overwrite the last two address bits (which are assumed
+ // and ensured to be 0).
+ let Inst{1} = 0;
+ let Inst{0} = DDDI{0};
+}
+
+//------------------------------------------------------------------------------
+// Conditional Branch Relative (BRR)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.0|D.D.D|1|-| . . . . |-.-| . . . . . . . . . . . . . |1.I|
+// -----------------------------------------------------------------
+// opcode condition Rs1 constant (14)
+// Action:
+// if (condition) { `pc' <- Rs1 + 4*(sign-extended constant) }
+//
+// BRR behaves like BR, except the branch target address is a 16-bit PC relative
+// offset.
+class InstBRR<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<4> DDDI;
+ bits<5> Rs1;
+ bits<16> imm16;
+
+ let Opcode = 0b1110;
+ let Inst{27 - 25} = DDDI{3 - 1};
+ let Inst{24} = 1;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17 - 16} = 0;
+ let Inst{15 - 0} = imm16;
+ // Overwrite last two bits which have to be zero
+ let Inst{1} = 1;
+ let Inst{0} = DDDI{0};
+
+ // Set don't cares to zero
+ let Inst{23} = 0;
+}
+
+//------------------------------------------------------------------------------
+// Conditional Set (SCC)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.0|D.D.D|0.-| . . . . |-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-|1.I|
+// -----------------------------------------------------------------
+// opcode condition Rs1
+//
+// Action:
+// Rs1 <- logical function result
+//
+// SCC sets dst_reg to the boolean result of computing the logical function
+// specified by DDDI, as described in the table for the BR instruction.
+class InstSCC<dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ let Itinerary = IIC_ALU;
+ bits<5> Rs1; // dst_reg in documentation
+ bits<4> DDDI;
+
+ let Opcode = 0b1110;
+ let Inst{27 - 25} = DDDI{3 - 1};
+ let Inst{24} = 0;
+ let Inst{22 - 18} = Rs1;
+ let Inst{1} = 1;
+ let Inst{0} = DDDI{0};
+
+ // Set don't cares to zero
+ let Inst{23} = 0;
+ let Inst{17 - 2} = 0;
+}
+
+//------------------------------------------------------------------------------
+// Special Load/Store (SLS)
+//------------------------------------------------------------------------------
+//
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.1| . . . . | . . . . |0.S| . . . . . . . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd addr 5msb's address 16 lsb's
+//
+// Action:
+// If S = 0 (LOAD): Rd <- Memory(address);
+// If S = 1 (STORE): Memory(address) <- Rd
+//
+// The timing is the same as for RM (*note RM::.) and RRM (*note
+// RRM::.) instructions. The two low-order bits of the 21-bit address are
+// ignored. The address is zero extended. Fullword memory accesses are
+// performed.
+class InstSLS<bit S, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> msb;
+ bits<16> lsb;
+
+ let Opcode = 0b1111;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = msb;
+ let Inst{17} = 0;
+ let Inst{16} = S;
+ let Inst{15 - 0} = lsb;
+}
+
+//------------------------------------------------------------------------------
+// Special Load Immediate (SLI)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.1| . . . . | . . . . |1.0| . . . . . . . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd const 5msb's constant 16 lsb's
+//
+// Action:
+// Rd <- constant
+//
+// The 21-bit constant is zero-extended. The timing is the same as the
+// RM instruction (*note RM::.).
+class InstSLI<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> msb;
+ bits<16> lsb;
+
+ let Opcode = 0b1111;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = msb;
+ let Inst{17} = 1;
+ let Inst{16} = 0;
+ let Inst{15 - 0} = lsb;
+}
+
+//------------------------------------------------------------------------------
+// Special Part-Word Load/Store (SPLS)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.1| . . . . | . . . . |1.1.0.Y.S.E.P.Q| . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd Rs1 constant (10)
+//
+// Action:
+// If `YS' = 11 (bYte Store):
+// Memory(ea) <- (least significant byte of Rr)
+// If `YS' = 01 (halfword Store):
+// Memory(ea) <- (least significant half-word of Rr)
+// If `YS' = 10 (bYte load): Rr <- Memory(ea)
+// If `YS' = 00 (halfword load): Rr <- Memory(ea)
+// [Note: here ea is determined as in the RM instruction. ]
+// If `SE' = 01 then the value is zEro extended
+// before being loaded into Rd.
+// If `SE' = 00 then the value is sign extended
+// before being loaded into Rd.
+//
+// `P' and `Q' are used to determine `ea' as in the RM instruction. The
+// constant is sign extended. The timing is the same as the RM and RRM
+// instructions. *Note RM:: and *Note RRM::.
+//
+// All part-word loads write the part-word into the least significant
+// part of the destination register, with the higher-order bits zero- or
+// sign-extended. All part-word stores store the least significant
+// part-word of the source register into the destination memory location.
+class InstSPLS<dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> Rs1;
+ bits<5> msb;
+ bit Y;
+ bit S;
+ bit E;
+ bit P;
+ bit Q;
+ bits<10> imm10;
+
+ let Opcode = 0b1111;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17 - 15} = 0b110;
+ let Inst{14} = Y;
+ let Inst{13} = S;
+ let Inst{12} = E;
+ let Inst{11} = P;
+ let Inst{10} = Q;
+ let Inst{9 - 0} = imm10;
+
+ let PostEncoderMethod = "adjustPqBitsSpls";
+}
+
+//------------------------------------------------------------------------------
+// Special instructions (popc, leadz, trailz)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.0.1| Rd | Rs1 |F.-| . . . . | . . | . . . . | OP |
+// -----------------------------------------------------------------
+// opcode Rd Rs1
+// Action:
+// Rd <- Perform action encoded in OP on Rs1
+// OP is one of:
+// 0b001 POPC Population count;
+// 0b010 LEADZ Count number of leading zeros;
+// 0b011 TRAILZ Count number of trailing zeros;
+class InstSpecial<bits<3> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : InstLanai<outs, ins, asmstr,
+ pattern>, Sched<[WriteALU]> {
+ let Itinerary = IIC_ALU;
+ bit F;
+ bits<5> Rd;
+ bits<5> Rs1;
+
+ let Opcode = 0b1101;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = F;
+ let Inst{16 - 3} = 0;
+ let Inst{2 - 0} = op;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ let Inst{15 - 0} = 0;
+ let isPseudo = 1;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
new file mode 100644
index 000000000000..fcd5da876b15
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -0,0 +1,808 @@
+//===-- LanaiInstrInfo.cpp - Lanai Instruction Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "LanaiGenInstrInfo.inc"
+
+LanaiInstrInfo::LanaiInstrInfo()
+ : LanaiGenInstrInfo(Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP),
+ RegisterInfo() {}
+
+// Emits a register-to-register copy before Position. Both registers must be
+// in the GPR register class; any other combination is treated as unreachable.
+void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator Position,
+                                 const DebugLoc &DL,
+                                 unsigned DestinationRegister,
+                                 unsigned SourceRegister,
+                                 bool KillSource) const {
+  const bool BothInGPR =
+      Lanai::GPRRegClass.contains(DestinationRegister, SourceRegister);
+  if (!BothInGPR)
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  // The copy is built as OR_I_LO of the source register with immediate 0.
+  MachineInstrBuilder Copy =
+      BuildMI(MBB, Position, DL, get(Lanai::OR_I_LO), DestinationRegister);
+  Copy.addReg(SourceRegister, getKillRegState(KillSource));
+  Copy.addImm(0);
+}
+
+// Emits an SW_RI before Position that stores SourceRegister to the stack slot
+// FrameIndex. Only register classes covered by GPR are supported.
+void LanaiInstrInfo::storeRegToStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+    unsigned SourceRegister, bool IsKill, int FrameIndex,
+    const TargetRegisterClass *RegisterClass,
+    const TargetRegisterInfo * /*RegisterInfo*/) const {
+  // Reuse the debug location of the insertion point when there is one.
+  const DebugLoc DL =
+      (Position == MBB.end()) ? DebugLoc() : Position->getDebugLoc();
+
+  const bool IsGPRClass = Lanai::GPRRegClass.hasSubClassEq(RegisterClass);
+  if (!IsGPRClass)
+    llvm_unreachable("Can't store this register to stack slot");
+
+  // Operands: value to store, frame index, immediate 0, LPAC::ADD.
+  MachineInstrBuilder Store = BuildMI(MBB, Position, DL, get(Lanai::SW_RI));
+  Store.addReg(SourceRegister, getKillRegState(IsKill));
+  Store.addFrameIndex(FrameIndex);
+  Store.addImm(0);
+  Store.addImm(LPAC::ADD);
+}
+
+// Emits an LDW_RI before Position that loads DestinationRegister from the
+// stack slot FrameIndex. Only register classes covered by GPR are supported.
+void LanaiInstrInfo::loadRegFromStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+    unsigned DestinationRegister, int FrameIndex,
+    const TargetRegisterClass *RegisterClass,
+    const TargetRegisterInfo * /*RegisterInfo*/) const {
+  // Reuse the debug location of the insertion point when there is one.
+  const DebugLoc DL =
+      (Position == MBB.end()) ? DebugLoc() : Position->getDebugLoc();
+
+  const bool IsGPRClass = Lanai::GPRRegClass.hasSubClassEq(RegisterClass);
+  if (!IsGPRClass)
+    llvm_unreachable("Can't load this register from stack slot");
+
+  // Operands after the destination: frame index, immediate 0, LPAC::ADD.
+  MachineInstrBuilder Load =
+      BuildMI(MBB, Position, DL, get(Lanai::LDW_RI), DestinationRegister);
+  Load.addFrameIndex(FrameIndex);
+  Load.addImm(0);
+  Load.addImm(LPAC::ADD);
+}
+
+// Returns true only when both memory accesses use the same base register and
+// the byte range of the lower access ends at or before the offset of the
+// higher access. Returns false in every other case, including when the
+// base/offset/width of either instruction cannot be determined.
+bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint(
+    MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis * /*AA*/) const {
+  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
+  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
+
+  // Answer conservatively if either instruction reports unmodeled side
+  // effects or an ordered memory reference.
+  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+    return false;
+
+  // Retrieve the base register, offset from the base register and width. Width
+  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4). If
+  // base registers are identical, and the offset of a lower memory access +
+  // the width doesn't overlap the offset of a higher memory access,
+  // then the memory accesses are different.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned BaseRegA = 0, BaseRegB = 0;
+  int64_t OffsetA = 0, OffsetB = 0;
+  unsigned int WidthA = 0, WidthB = 0;
+  if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
+      getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
+    if (BaseRegA == BaseRegB) {
+      // Keep the arithmetic in 64 bits: the offsets are int64_t, and narrowing
+      // them to int could truncate a value and then pick the wrong width or
+      // mis-evaluate the overlap check below.
+      int64_t LowOffset = std::min(OffsetA, OffsetB);
+      int64_t HighOffset = std::max(OffsetA, OffsetB);
+      int64_t LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+      if (LowOffset + LowWidth <= HighOffset)
+        return true;
+    }
+  }
+  return false;
+}
+
+bool LanaiInstrInfo::expandPostRAPseudo(MachineInstr & /*MI*/) const {
+ return false;
+}
+
+static LPCC::CondCode getOppositeCondition(LPCC::CondCode CC) {
+ switch (CC) {
+ case LPCC::ICC_T: // true
+ return LPCC::ICC_F;
+ case LPCC::ICC_F: // false
+ return LPCC::ICC_T;
+ case LPCC::ICC_HI: // high
+ return LPCC::ICC_LS;
+ case LPCC::ICC_LS: // low or same
+ return LPCC::ICC_HI;
+ case LPCC::ICC_CC: // carry cleared
+ return LPCC::ICC_CS;
+ case LPCC::ICC_CS: // carry set
+ return LPCC::ICC_CC;
+ case LPCC::ICC_NE: // not equal
+ return LPCC::ICC_EQ;
+ case LPCC::ICC_EQ: // equal
+ return LPCC::ICC_NE;
+ case LPCC::ICC_VC: // oVerflow cleared
+ return LPCC::ICC_VS;
+ case LPCC::ICC_VS: // oVerflow set
+ return LPCC::ICC_VC;
+ case LPCC::ICC_PL: // plus (note: 0 is "minus" too here)
+ return LPCC::ICC_MI;
+ case LPCC::ICC_MI: // minus
+ return LPCC::ICC_PL;
+ case LPCC::ICC_GE: // greater than or equal
+ return LPCC::ICC_LT;
+ case LPCC::ICC_LT: // less than
+ return LPCC::ICC_GE;
+ case LPCC::ICC_GT: // greater than
+ return LPCC::ICC_LE;
+ case LPCC::ICC_LE: // less than or equal
+ return LPCC::ICC_GT;
+ default:
+ llvm_unreachable("Invalid condtional code");
+ }
+}
+
+std::pair<unsigned, unsigned>
+LanaiInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+LanaiInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace LanaiII;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_ABS_HI, "lanai-hi"},
+ {MO_ABS_LO, "lanai-lo"},
+ {MO_NO_FLAG, "lanai-nf"}};
+ return makeArrayRef(TargetFlags);
+}
+
+bool LanaiInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const {
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case Lanai::SFSUB_F_RI_LO:
+ case Lanai::SFSUB_F_RI_HI:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI.getOperand(1).getImm();
+ return true;
+ case Lanai::SFSUB_F_RR:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = MI.getOperand(1).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ }
+
+ return false;
+}
+
+// isRedundantFlagInstr - check whether the first instruction, whose only
+// purpose is to update flags, can be made redundant.
+// * SFSUB_F_RR can be made redundant by SUB_R if the operands are the same.
+// * SFSUB_F_RI can be made redundant by SUB_I if the operands are the same.
+inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
+ unsigned SrcReg2, int ImmValue,
+ MachineInstr *OI) {
+ if (CmpI->getOpcode() == Lanai::SFSUB_F_RR &&
+ OI->getOpcode() == Lanai::SUB_R &&
+ ((OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getReg() == SrcReg2) ||
+ (OI->getOperand(1).getReg() == SrcReg2 &&
+ OI->getOperand(2).getReg() == SrcReg)))
+ return true;
+
+ if (((CmpI->getOpcode() == Lanai::SFSUB_F_RI_LO &&
+ OI->getOpcode() == Lanai::SUB_I_LO) ||
+ (CmpI->getOpcode() == Lanai::SFSUB_F_RI_HI &&
+ OI->getOpcode() == Lanai::SUB_I_HI)) &&
+ OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getImm() == ImmValue)
+ return true;
+ return false;
+}
+
+inline static unsigned flagSettingOpcodeVariant(unsigned OldOpcode) {
+ switch (OldOpcode) {
+ case Lanai::ADD_I_HI:
+ return Lanai::ADD_F_I_HI;
+ case Lanai::ADD_I_LO:
+ return Lanai::ADD_F_I_LO;
+ case Lanai::ADD_R:
+ return Lanai::ADD_F_R;
+ case Lanai::ADDC_I_HI:
+ return Lanai::ADDC_F_I_HI;
+ case Lanai::ADDC_I_LO:
+ return Lanai::ADDC_F_I_LO;
+ case Lanai::ADDC_R:
+ return Lanai::ADDC_F_R;
+ case Lanai::AND_I_HI:
+ return Lanai::AND_F_I_HI;
+ case Lanai::AND_I_LO:
+ return Lanai::AND_F_I_LO;
+ case Lanai::AND_R:
+ return Lanai::AND_F_R;
+ case Lanai::OR_I_HI:
+ return Lanai::OR_F_I_HI;
+ case Lanai::OR_I_LO:
+ return Lanai::OR_F_I_LO;
+ case Lanai::OR_R:
+ return Lanai::OR_F_R;
+ case Lanai::SL_I:
+ return Lanai::SL_F_I;
+ case Lanai::SRL_R:
+ return Lanai::SRL_F_R;
+ case Lanai::SA_I:
+ return Lanai::SA_F_I;
+ case Lanai::SRA_R:
+ return Lanai::SRA_F_R;
+ case Lanai::SUB_I_HI:
+ return Lanai::SUB_F_I_HI;
+ case Lanai::SUB_I_LO:
+ return Lanai::SUB_F_I_LO;
+ case Lanai::SUB_R:
+ return Lanai::SUB_F_R;
+ case Lanai::SUBB_I_HI:
+ return Lanai::SUBB_F_I_HI;
+ case Lanai::SUBB_I_LO:
+ return Lanai::SUBB_F_I_LO;
+ case Lanai::SUBB_R:
+ return Lanai::SUBB_F_R;
+ case Lanai::XOR_I_HI:
+ return Lanai::XOR_F_I_HI;
+ case Lanai::XOR_I_LO:
+ return Lanai::XOR_F_I_LO;
+ case Lanai::XOR_R:
+ return Lanai::XOR_F_R;
+ default:
+ return Lanai::NOP;
+ }
+}
+
+bool LanaiInstrInfo::optimizeCompareInstr(
+ MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int /*CmpMask*/,
+ int CmpValue, const MachineRegisterInfo *MRI) const {
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI)
+ return false;
+
+ // Get ready to iterate backward from CmpInstr.
+ MachineBasicBlock::iterator I = CmpInstr, E = MI,
+ B = CmpInstr.getParent()->begin();
+
+ // Early exit if CmpInstr is at the beginning of the BB.
+ if (I == B)
+ return false;
+
+ // There are two possible candidates which can be changed to set SR:
+ // One is MI, the other is a SUB instruction.
+ // * For SFSUB_F_RR(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+ // * For SFSUB_F_RI(r1, CmpValue), we are looking for SUB(r1, CmpValue).
+ MachineInstr *Sub = nullptr;
+ if (SrcReg2 != 0)
+ // MI is not a candidate to transform into a flag setting instruction.
+ MI = nullptr;
+ else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
+ // Conservatively refuse to convert an instruction which isn't in the same
+ // BB as the comparison. Don't return if SFSUB_F_RI and CmpValue != 0 as Sub
+ // may still be a candidate.
+ if (CmpInstr.getOpcode() == Lanai::SFSUB_F_RI_LO)
+ MI = nullptr;
+ else
+ return false;
+ }
+
+ // Check that SR isn't set between the comparison instruction and the
+ // instruction we want to change while searching for Sub.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ for (--I; I != E; --I) {
+ const MachineInstr &Instr = *I;
+
+ if (Instr.modifiesRegister(Lanai::SR, TRI) ||
+ Instr.readsRegister(Lanai::SR, TRI))
+ // This instruction modifies or uses SR after the one we want to change.
+ // We can't do this transformation.
+ return false;
+
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+ Sub = &*I;
+ break;
+ }
+
+ // Don't search outside the containing basic block.
+ if (I == B)
+ return false;
+ }
+
+ // Return false if no candidates exist.
+ if (!MI && !Sub)
+ return false;
+
+ // The single candidate is called MI.
+ if (!MI)
+ MI = Sub;
+
+ if (flagSettingOpcodeVariant(MI->getOpcode()) != Lanai::NOP) {
+ bool isSafe = false;
+
+ SmallVector<std::pair<MachineOperand *, LPCC::CondCode>, 4>
+ OperandsToUpdate;
+ I = CmpInstr;
+ E = CmpInstr.getParent()->end();
+ while (!isSafe && ++I != E) {
+ const MachineInstr &Instr = *I;
+ for (unsigned IO = 0, EO = Instr.getNumOperands(); !isSafe && IO != EO;
+ ++IO) {
+ const MachineOperand &MO = Instr.getOperand(IO);
+ if (MO.isRegMask() && MO.clobbersPhysReg(Lanai::SR)) {
+ isSafe = true;
+ break;
+ }
+ if (!MO.isReg() || MO.getReg() != Lanai::SR)
+ continue;
+ if (MO.isDef()) {
+ isSafe = true;
+ break;
+ }
+ // Condition code is after the operand before SR.
+ LPCC::CondCode CC;
+ CC = (LPCC::CondCode)Instr.getOperand(IO - 1).getImm();
+
+ if (Sub) {
+ LPCC::CondCode NewCC = getOppositeCondition(CC);
+ if (NewCC == LPCC::ICC_T)
+ return false;
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on
+ // CMP needs to be updated to be based on SUB. Push the condition
+ // code operands to OperandsToUpdate.
+ // FIXME: OperandsToUpdate is never read back in this function, so the
+ // recorded condition codes are not actually rewritten when CmpInstr is
+ // removed.
+ if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg) {
+ OperandsToUpdate.push_back(
+ std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
+ }
+ } else {
+ // No Sub, so this is x = <op> y, z; cmp x, 0.
+ switch (CC) {
+ case LPCC::ICC_EQ: // Z
+ case LPCC::ICC_NE: // Z
+ case LPCC::ICC_MI: // N
+ case LPCC::ICC_PL: // N
+ case LPCC::ICC_F: // none
+ case LPCC::ICC_T: // none
+ // SR can be used multiple times, we should continue.
+ break;
+ case LPCC::ICC_CS: // C
+ case LPCC::ICC_CC: // C
+ case LPCC::ICC_VS: // V
+ case LPCC::ICC_VC: // V
+ case LPCC::ICC_HI: // C Z
+ case LPCC::ICC_LS: // C Z
+ case LPCC::ICC_GE: // N V
+ case LPCC::ICC_LT: // N V
+ case LPCC::ICC_GT: // Z N V
+ case LPCC::ICC_LE: // Z N V
+ // The instruction uses the V bit or C bit which is not safe.
+ return false;
+ case LPCC::UNKNOWN:
+ return false;
+ }
+ }
+ }
+ }
+
+ // If SR is not killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if (!isSafe) {
+ MachineBasicBlock *MBB = CmpInstr.getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end();
+ SI != SE; ++SI)
+ if ((*SI)->isLiveIn(Lanai::SR))
+ return false;
+ }
+
+ // Toggle the optional operand to SR.
+ MI->setDesc(get(flagSettingOpcodeVariant(MI->getOpcode())));
+ MI->addRegisterDefined(Lanai::SR);
+ CmpInstr.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+// Describe a SELECT instruction to the generic select optimizer: report which
+// operands hold the true/false values, append the condition code operand to
+// Cond, and flag the instruction as optimizable.
+bool LanaiInstrInfo::analyzeSelect(const MachineInstr &MI,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   unsigned &TrueOp, unsigned &FalseOp,
+                                   bool &Optimizable) const {
+  assert(MI.getOpcode() == Lanai::SELECT && "unknown select instruction");
+  // Operand layout of SELECT:
+  //   [0] def, [1] value used when true, [2] value used when false,
+  //   [3] condition code.
+  enum : unsigned { TrueIdx = 1, FalseIdx = 2, CondCodeIdx = 3 };
+
+  Cond.push_back(MI.getOperand(CondCodeIdx));
+  TrueOp = TrueIdx;
+  FalseOp = FalseIdx;
+  Optimizable = true;
+  return false;
+}
+
+// Return the instruction defining Reg if it may be folded into a SELECT by
+// predicating it, or nullptr if it may not. Reg must be a virtual register
+// with exactly one non-debug use, and its definition must be predicable, safe
+// to move, and free of operands that would conflict with predication.
+static MachineInstr *canFoldIntoSelect(unsigned Reg,
+                                       const MachineRegisterInfo &MRI) {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg) || !MRI.hasOneNonDBGUse(Reg))
+    return nullptr;
+
+  MachineInstr *DefMI = MRI.getVRegDef(Reg);
+  // The fold works by predicating the defining instruction.
+  if (!DefMI || !DefMI->isPredicable())
+    return nullptr;
+
+  // Scan every operand after the first for non-dead defs or physreg uses.
+  // This also catches already-predicated instructions, which read SR.
+  for (unsigned Idx = 1, End = DefMI->getNumOperands(); Idx != End; ++Idx) {
+    const MachineOperand &Op = DefMI->getOperand(Idx);
+    // Frame index, constant pool and jump table operands are rejected.
+    if (Op.isFI() || Op.isCPI() || Op.isJTI())
+      return nullptr;
+    if (!Op.isReg())
+      continue;
+    // Tied operands would conflict with the tie added for predication;
+    // physical registers and live extra defs are not supported either.
+    const bool BlocksFolding =
+        Op.isTied() || TargetRegisterInfo::isPhysicalRegister(Op.getReg()) ||
+        (Op.isDef() && !Op.isDead());
+    if (BlocksFolding)
+      return nullptr;
+  }
+
+  bool DontMoveAcrossStores = true;
+  return DefMI->isSafeToMove(/*AliasAnalysis=*/nullptr, DontMoveAcrossStores)
+             ? DefMI
+             : nullptr;
+}
+
+// Try to fold the instruction that defines one of the SELECT's value operands
+// into the SELECT by emitting a predicated copy of that instruction in front
+// of MI. The true operand (operand 1) is tried first; when only the false
+// operand (operand 2) is foldable, the condition code is inverted. Returns the
+// new instruction, or nullptr when neither operand is foldable or the
+// destination register class cannot be constrained. On success the folded
+// defining instruction is erased here; MI itself is left for the caller.
+MachineInstr *
+LanaiInstrInfo::optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool /*PreferFalse*/) const {
+ assert(MI.getOpcode() == Lanai::SELECT && "unknown select instruction");
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineInstr *DefMI = canFoldIntoSelect(MI.getOperand(1).getReg(), MRI);
+ // Invert is set when the true operand could not be folded, i.e. when the
+ // false operand (if any) is the one being predicated.
+ bool Invert = !DefMI;
+ if (!DefMI)
+ DefMI = canFoldIntoSelect(MI.getOperand(2).getReg(), MRI);
+ if (!DefMI)
+ return nullptr;
+
+ // Find new register class to use.
+ // FalseReg is a copy of the operand that is NOT folded: it supplies the
+ // result when the predicate does not hold.
+ MachineOperand FalseReg = MI.getOperand(Invert ? 1 : 2);
+ unsigned DestReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+ if (!MRI.constrainRegClass(DestReg, PreviousClass))
+ return nullptr;
+
+ // Create a new predicated version of DefMI.
+ MachineInstrBuilder NewMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), DefMI->getDesc(), DestReg);
+
+ // Copy all the DefMI operands, excluding its (null) predicate.
+ const MCInstrDesc &DefDesc = DefMI->getDesc();
+ for (unsigned i = 1, e = DefDesc.getNumOperands();
+ i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
+ NewMI.addOperand(DefMI->getOperand(i));
+
+ // Append the predicate: the SELECT's condition code, inverted when the
+ // folded instruction came from the false operand.
+ unsigned CondCode = MI.getOperand(3).getImm();
+ if (Invert)
+ NewMI.addImm(getOppositeCondition(LPCC::CondCode(CondCode)));
+ else
+ NewMI.addImm(CondCode);
+ NewMI.copyImplicitOps(MI);
+
+ // The output register value when the predicate is false is an implicit
+ // register operand tied to the first def. The tie makes the register
+ // allocator ensure the FalseReg is allocated the same register as operand 0.
+ FalseReg.setImplicit();
+ NewMI.addOperand(FalseReg);
+ NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
+
+ // Update SeenMIs set: register newly created MI and erase removed DefMI.
+ SeenMIs.insert(NewMI);
+ SeenMIs.erase(DefMI);
+
+ // If MI is inside a loop, and DefMI is outside the loop, then kill flags on
+ // DefMI would be invalid when transferred inside the loop. Checking for a
+ // loop is expensive, but at least remove kill flags if they are in different
+ // BBs.
+ if (DefMI->getParent() != MI.getParent())
+ NewMI->clearKillInfo();
+
+ // The caller will erase MI, but not DefMI.
+ DefMI->eraseFromParent();
+ return NewMI;
+}
+
+// The analyzeBranch function is used to examine conditional instructions and
+// remove unnecessary instructions. This method is used by BranchFolder and
+// IfConverter machine function passes to improve the CFG.
+// - TrueBlock is set to the destination if condition evaluates true (it is the
+// nullptr if the destination is the fall-through branch);
+// - FalseBlock is set to the destination if condition evaluates to false (it
+// is the nullptr if the branch is unconditional);
+// - condition is populated with machine operands needed to generate the branch
+// to insert in insertBranch;
+// - AllowModify permits this function to delete instructions that follow an
+// unconditional branch, and unconditional branches to the layout successor;
+// Returns: false if branch could successfully be analyzed, true otherwise.
+bool LanaiInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TrueBlock,
+ MachineBasicBlock *&FalseBlock,
+ SmallVectorImpl<MachineOperand> &Condition,
+ bool AllowModify) const {
+ // Iterator to current instruction being considered.
+ MachineBasicBlock::iterator Instruction = MBB.end();
+
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ while (Instruction != MBB.begin()) {
+ --Instruction;
+
+ // Skip over debug values.
+ if (Instruction->isDebugValue())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator
+ // instruction, we're done.
+ if (!isUnpredicatedTerminator(*Instruction))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled
+ // by this analysis.
+ if (!Instruction->isBranch())
+ return true;
+
+ // Handle unconditional branches.
+ if (Instruction->getOpcode() == Lanai::BT) {
+ // Without permission to modify the block, just record the target and
+ // keep scanning upwards.
+ if (!AllowModify) {
+ TrueBlock = Instruction->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a branch, delete them.
+ while (std::next(Instruction) != MBB.end()) {
+ std::next(Instruction)->eraseFromParent();
+ }
+
+ // Anything recorded from the (now deleted) later terminators is stale.
+ Condition.clear();
+ FalseBlock = nullptr;
+
+ // Delete the jump if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(Instruction->getOperand(0).getMBB())) {
+ TrueBlock = nullptr;
+ Instruction->eraseFromParent();
+ // The erased iterator is invalid; restart the scan from the block end.
+ Instruction = MBB.end();
+ continue;
+ }
+
+ // TrueBlock is used to indicate the unconditional destination.
+ TrueBlock = Instruction->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches
+ unsigned Opcode = Instruction->getOpcode();
+ if (Opcode != Lanai::BRCC)
+ return true; // Unknown opcode.
+
+ // Multiple conditional branches are not handled here so only proceed if
+ // there are no conditions enqueued.
+ if (Condition.empty()) {
+ // BRCC operands: [0] target block, [1] condition code immediate.
+ LPCC::CondCode BranchCond =
+ static_cast<LPCC::CondCode>(Instruction->getOperand(1).getImm());
+
+ // TrueBlock is the target of the previously seen unconditional branch.
+ FalseBlock = TrueBlock;
+ TrueBlock = Instruction->getOperand(0).getMBB();
+ Condition.push_back(MachineOperand::CreateImm(BranchCond));
+ continue;
+ }
+
+ // Multiple conditional branches are not handled.
+ return true;
+ }
+
+ // Return false indicating branch successfully analyzed.
+ return false;
+}
+
+// reverseBranchCondition - Replace the single condition code stored in
+// Condition with its opposite. Always returns false, the value that signals
+// a successful reversal.
+bool LanaiInstrInfo::reverseBranchCondition(
+    SmallVectorImpl<llvm::MachineOperand> &Condition) const {
+  assert((Condition.size() == 1) &&
+         "Lanai branch conditions should have one component.");
+
+  llvm::MachineOperand &CondOperand = Condition[0];
+  const LPCC::CondCode Current =
+      static_cast<LPCC::CondCode>(CondOperand.getImm());
+  CondOperand.setImm(getOppositeCondition(Current));
+  return false;
+}
+
+// Append branch instructions to the end of MBB. An empty Condition yields a
+// single unconditional BT to TrueBlock; otherwise a BRCC to TrueBlock is
+// emitted, followed by a BT to FalseBlock when one is given. Returns the
+// number of machine instructions inserted (1 or 2).
+unsigned LanaiInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                      MachineBasicBlock *TrueBlock,
+                                      MachineBasicBlock *FalseBlock,
+                                      ArrayRef<MachineOperand> Condition,
+                                      const DebugLoc &DL,
+                                      int *BytesAdded) const {
+  // A fall-through is never expressed as an inserted branch.
+  assert(TrueBlock && "insertBranch must not be told to insert a fallthrough");
+  assert(!BytesAdded && "code size not handled");
+
+  // Unconditional case: no condition operands were supplied.
+  if (Condition.empty()) {
+    assert(!FalseBlock && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(Lanai::BT)).addMBB(TrueBlock);
+    return 1;
+  }
+
+  // Conditional case: exactly one condition-code operand is expected.
+  assert((Condition.size() == 1) &&
+         "Lanai branch conditions should have one component.");
+  const unsigned CC = Condition[0].getImm();
+  BuildMI(&MBB, DL, get(Lanai::BRCC)).addMBB(TrueBlock).addImm(CC);
+  unsigned NumInserted = 1;
+
+  // With no false block the false path simply falls through; otherwise an
+  // explicit unconditional branch is needed for it.
+  if (FalseBlock) {
+    BuildMI(&MBB, DL, get(Lanai::BT)).addMBB(FalseBlock);
+    ++NumInserted;
+  }
+  return NumInserted;
+}
+
+// Erase the trailing BT/BRCC instructions of MBB, looking past debug values,
+// and return how many branches were removed. The scan stops at the first
+// instruction that is neither a debug value nor one of those two opcodes.
+unsigned LanaiInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                      int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  unsigned NumRemoved = 0;
+  MachineBasicBlock::iterator Cursor = MBB.end();
+  while (Cursor != MBB.begin()) {
+    --Cursor;
+    if (Cursor->isDebugValue())
+      continue;
+
+    const unsigned Opc = Cursor->getOpcode();
+    const bool IsLanaiBranch = Opc == Lanai::BT || Opc == Lanai::BRCC;
+    if (!IsLanaiBranch)
+      break;
+
+    // Erasing invalidates the iterator, so restart from the block end.
+    Cursor->eraseFromParent();
+    Cursor = MBB.end();
+    ++NumRemoved;
+  }
+
+  return NumRemoved;
+}
+
+// If MI is an LDW_RI whose address is a frame index with a zero immediate
+// offset, store that frame index in FrameIndex and return the destination
+// register (operand 0). Otherwise return 0.
+unsigned LanaiInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                             int &FrameIndex) const {
+  if (MI.getOpcode() != Lanai::LDW_RI)
+    return 0;
+
+  const MachineOperand &BaseOp = MI.getOperand(1);
+  const MachineOperand &OffsetOp = MI.getOperand(2);
+  const bool IsDirectSlotAccess =
+      BaseOp.isFI() && OffsetOp.isImm() && OffsetOp.getImm() == 0;
+  if (!IsDirectSlotAccess)
+    return 0;
+
+  FrameIndex = BaseOp.getIndex();
+  return MI.getOperand(0).getReg();
+}
+
+// Variant of isLoadFromStackSlot that also consults hasLoadFromStackSlot, for
+// LDW_RI instructions that no longer match the direct frame-index form.
+// Returns 0 for every other opcode.
+unsigned LanaiInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+                                                   int &FrameIndex) const {
+  if (MI.getOpcode() != Lanai::LDW_RI)
+    return 0;
+
+  // Prefer the direct frame-index form when it matches.
+  if (unsigned Reg = isLoadFromStackSlot(MI, FrameIndex))
+    return Reg;
+
+  // Check for post-frame index elimination operations. The memory operand
+  // found by the query is not needed here.
+  const MachineMemOperand *Unused;
+  return hasLoadFromStackSlot(MI, Unused, FrameIndex);
+}
+
+// If MI is an SW_RI that stores a register directly to a stack slot (frame
+// index base with a zero immediate offset), store the slot in FrameIndex and
+// return the register being stored. Otherwise return 0.
+//
+// SW_RI carries the stored register as operand 0, followed by the memory
+// operand (base, offset, ALU op) in operands 1-3. This is the layout
+// getMemOpBaseRegImmOfsWidth relies on for SW_RI, and it mirrors
+// isLoadFromStackSlot. The previous implementation tested operand 0 for a
+// frame index and read the register from operand 2, which does not match that
+// layout and therefore could not recognise a spill store.
+unsigned LanaiInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                            int &FrameIndex) const {
+  if (MI.getOpcode() != Lanai::SW_RI)
+    return 0;
+
+  const MachineOperand &ValueOp = MI.getOperand(0);
+  const MachineOperand &BaseOp = MI.getOperand(1);
+  const MachineOperand &OffsetOp = MI.getOperand(2);
+  if (ValueOp.isReg() && BaseOp.isFI() && OffsetOp.isImm() &&
+      OffsetOp.getImm() == 0) {
+    FrameIndex = BaseOp.getIndex();
+    return ValueOp.getReg();
+  }
+  return 0;
+}
+
+// Decompose a load/store of the form "base register + immediate offset" whose
+// ALU op is ADD. On success BaseReg, Offset and Width (4, 2 or 1 depending on
+// the opcode) are filled in and true is returned. For any other operand shape
+// or opcode the outputs are left untouched and false is returned.
+bool LanaiInstrInfo::getMemOpBaseRegImmOfsWidth(
+    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width,
+    const TargetRegisterInfo * /*TRI*/) const {
+  // Expected operands: [1] base register, [2] immediate offset, [3] ALU op.
+  if (LdSt.getNumOperands() != 4)
+    return false;
+
+  const MachineOperand &BaseOp = LdSt.getOperand(1);
+  const MachineOperand &OffsetOp = LdSt.getOperand(2);
+  const MachineOperand &AluOperand = LdSt.getOperand(3);
+  const bool UsesAdd = AluOperand.isImm() && AluOperand.getImm() == LPAC::ADD;
+  if (!(BaseOp.isReg() && OffsetOp.isImm() && UsesAdd))
+    return false;
+
+  // Map the opcode to its access width; unknown opcodes are rejected.
+  unsigned AccessWidth;
+  switch (LdSt.getOpcode()) {
+  case Lanai::LDW_RI:
+  case Lanai::LDW_RR:
+  case Lanai::SW_RR:
+  case Lanai::SW_RI:
+    AccessWidth = 4;
+    break;
+  case Lanai::LDHs_RI:
+  case Lanai::LDHz_RI:
+  case Lanai::STH_RI:
+    AccessWidth = 2;
+    break;
+  case Lanai::LDBs_RI:
+  case Lanai::LDBz_RI:
+  case Lanai::STB_RI:
+    AccessWidth = 1;
+    break;
+  default:
+    return false;
+  }
+
+  Width = AccessWidth;
+  BaseReg = BaseOp.getReg();
+  Offset = OffsetOp.getImm();
+  return true;
+}
+
+// Report the base register and immediate offset of a supported load/store.
+// This is a thin wrapper over getMemOpBaseRegImmOfsWidth that discards the
+// access width; it returns false when the instruction cannot be decomposed.
+//
+// The opcode list mirrors the one in getMemOpBaseRegImmOfsWidth. STB_RI was
+// previously missing here even though the width query handles it, so byte
+// stores were rejected while byte loads and half-word stores were accepted.
+bool LanaiInstrInfo::getMemOpBaseRegImmOfs(
+    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset,
+    const TargetRegisterInfo *TRI) const {
+  switch (LdSt.getOpcode()) {
+  default:
+    return false;
+  case Lanai::LDW_RI:
+  case Lanai::LDW_RR:
+  case Lanai::SW_RR:
+  case Lanai::SW_RI:
+  case Lanai::LDHs_RI:
+  case Lanai::LDHz_RI:
+  case Lanai::STH_RI:
+  case Lanai::LDBs_RI:
+  case Lanai::LDBz_RI:
+  case Lanai::STB_RI: {
+    // The width is computed by the helper but not needed by this interface.
+    unsigned Width;
+    return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
+  }
+  }
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h
new file mode 100644
index 000000000000..4387fe1af3c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -0,0 +1,186 @@
+//===- LanaiInstrInfo.h - Lanai Instruction Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
+
+#include "LanaiRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "LanaiGenInstrInfo.inc"
+
+namespace llvm {
+
+// Lanai implementation of the target instruction information hooks. It owns
+// the LanaiRegisterInfo instance handed out by getRegisterInfo().
+class LanaiInstrInfo : public LanaiGenInstrInfo {
+ const LanaiRegisterInfo RegisterInfo;
+
+public:
+ LanaiInstrInfo();
+
+ // getRegisterInfo - TargetInstrInfo is a superset of the register info. As
+ // such, whenever a client has an instance of instruction info, it should
+ // always be able to get register info as well (through this method).
+ virtual const LanaiRegisterInfo &getRegisterInfo() const {
+ return RegisterInfo;
+ }
+
+ bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ AliasAnalysis *AA) const override;
+
+ // If MI is a direct load from a stack slot, set FrameIndex to that slot and
+ // return the destination register; otherwise return 0.
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ // Like isLoadFromStackSlot, but also checks for loads whose frame index has
+ // already been eliminated.
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ // If MI is a direct store to a stack slot, set FrameIndex to that slot and
+ // return the stored register; otherwise return 0.
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+ const DebugLoc &DL, unsigned DestinationRegister,
+ unsigned SourceRegister, bool KillSource) const override;
+
+ void
+ storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Position,
+ unsigned SourceRegister, bool IsKill, int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo *RegisterInfo) const override;
+
+ void
+ loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Position,
+ unsigned DestinationRegister, int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo *RegisterInfo) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ // Return true and fill in BaseReg/Offset if LdSt is a supported load/store
+ // addressed as base register plus immediate offset.
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
+ const TargetRegisterInfo *TRI) const override;
+
+ // Same as getMemOpBaseRegImmOfs, but additionally reports the access width
+ // in Width (4, 2 or 1 depending on the opcode). Not an override.
+ bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset, unsigned &Width,
+ const TargetRegisterInfo *TRI) const;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+ // Analyze the terminators of MBB. Returns false when the branch structure
+ // was understood and true otherwise; see the definition for the meaning of
+ // TrueBlock, FalseBlock and Condition.
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TrueBlock,
+ MachineBasicBlock *&FalseBlock,
+ SmallVectorImpl<MachineOperand> &Condition,
+ bool AllowModify) const override;
+
+ // Remove the branch instructions at the end of MBB and return how many were
+ // removed. BytesRemoved must be null (code size is not tracked).
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ // For a comparison instruction, return the source registers in SrcReg and
+ // SrcReg2 if having two register operands, and the value it compares against
+ // in CmpValue. Return true if the comparison instruction can be analyzed.
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
+
+ // See if the comparison instruction can be converted into something more
+ // efficient. E.g., on Lanai register-register instructions can set the flag
+ // register, obviating the need for a separate compare.
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+
+ // Analyze the given select instruction, returning true if it cannot be
+ // understood. It is assumed that MI->isSelect() is true.
+ //
+ // When successful, return the controlling condition and the operands that
+ // determine the true and false result values.
+ //
+ // Result = SELECT Cond, TrueOp, FalseOp
+ //
+ // Lanai can optimize certain select instructions, for example by predicating
+ // the instruction defining one of the operands and sets Optimizable to true.
+ bool analyzeSelect(const MachineInstr &MI,
+ SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp,
+ unsigned &FalseOp, bool &Optimizable) const override;
+
+ // Given a select instruction that was understood by analyzeSelect and
+ // returned Optimizable = true, attempt to optimize MI by merging it with one
+ // of its operands. Returns NULL on failure.
+ //
+ // When successful, returns the new select instruction. The client is
+ // responsible for deleting MI.
+ //
+ // If both sides of the select can be optimized, the TrueOp is modified.
+ // PreferFalse is not used.
+ MachineInstr *optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool PreferFalse) const override;
+
+ // Replace the condition in Condition with its opposite; returns false on
+ // success.
+ bool reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Condition) const override;
+
+ // Insert a conditional or unconditional branch at the end of MBB and return
+ // the number of instructions inserted. BytesAdded must be null (code size
+ // is not tracked).
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TrueBlock,
+ MachineBasicBlock *FalseBlock,
+ ArrayRef<MachineOperand> Condition,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+};
+
+// True for the register-immediate byte and half-word load/store opcodes
+// (LDBs/LDBz/LDHs/LDHz/STB/STH _RI); false for everything else.
+static inline bool isSPLSOpcode(unsigned Opcode) {
+  return Opcode == Lanai::LDBs_RI || Opcode == Lanai::LDBz_RI ||
+         Opcode == Lanai::LDHs_RI || Opcode == Lanai::LDHz_RI ||
+         Opcode == Lanai::STB_RI || Opcode == Lanai::STH_RI;
+}
+
+// True for the register-immediate word load and store opcodes (LDW_RI and
+// SW_RI); false for everything else.
+static inline bool isRMOpcode(unsigned Opcode) {
+  return Opcode == Lanai::LDW_RI || Opcode == Lanai::SW_RI;
+}
+
+// True for the register-register (_RR) load and store opcodes; false for
+// everything else.
+static inline bool isRRMOpcode(unsigned Opcode) {
+  bool IsRegRegMemOp = false;
+  switch (Opcode) {
+  // Loads.
+  case Lanai::LDBs_RR:
+  case Lanai::LDBz_RR:
+  case Lanai::LDHs_RR:
+  case Lanai::LDHz_RR:
+  case Lanai::LDWz_RR:
+  case Lanai::LDW_RR:
+  // Stores.
+  case Lanai::STB_RR:
+  case Lanai::STH_RR:
+  case Lanai::SW_RR:
+    IsRegRegMemOp = true;
+    break;
+  default:
+    break;
+  }
+  return IsRegRegMemOp;
+}
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
new file mode 100644
index 000000000000..285fca11737d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
@@ -0,0 +1,884 @@
+//===-- LanaiInstrInfo.td - Target Description for Lanai Target -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Lanai instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "LanaiInstrFormats.td"
+
+// -------------------------------------------------- //
+// Instruction Operands and Patterns
+// -------------------------------------------------- //
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_LanaiCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+// Type profiles for the Lanai-specific SelectionDAG nodes. The arguments of
+// SDTypeProfile are: number of results, number of operands (-1 for variadic),
+// and the list of type constraints.
+def SDT_LanaiCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+def SDT_LanaiSetFlag : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_LanaiSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>]>;
+def SDT_LanaiSetCC : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+def SDT_LanaiBrCC : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i32>]>;
+def SDT_LanaiAdjDynAlloc : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+// SelectionDAG node definitions. Each SDNode names the ISD/LanaiISD opcode it
+// matches, its type profile, and its node properties (chain/glue/variadic).
+def Call : SDNode<"LanaiISD::CALL", SDT_LanaiCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def RetFlag : SDNode<"LanaiISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def CallSeqStart : SDNode<"ISD::CALLSEQ_START", SDT_LanaiCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def CallSeqEnd : SDNode<"ISD::CALLSEQ_END", SDT_LanaiCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+// Flag producers emit glue (SDNPOutGlue); flag consumers take it
+// (SDNPInGlue). SUBBF does both.
+def LanaiSetFlag : SDNode<"LanaiISD::SET_FLAG", SDT_LanaiSetFlag,
+ [SDNPOutGlue]>;
+def LanaiSubbF : SDNode<"LanaiISD::SUBBF", SDT_LanaiSetFlag,
+ [SDNPOutGlue, SDNPInGlue]>;
+def LanaiBrCC : SDNode<"LanaiISD::BR_CC", SDT_LanaiBrCC,
+ [SDNPHasChain, SDNPInGlue]>;
+def LanaiSelectCC : SDNode<"LanaiISD::SELECT_CC", SDT_LanaiSelectCC,
+ [SDNPInGlue]>;
+def LanaiSetCC : SDNode<"LanaiISD::SETCC", SDT_LanaiSetCC,
+ [SDNPInGlue]>;
+def LanaiHi : SDNode<"LanaiISD::HI", SDTIntUnaryOp>;
+def LanaiLo : SDNode<"LanaiISD::LO", SDTIntUnaryOp>;
+def LanaiSmall : SDNode<"LanaiISD::SMALL", SDTIntUnaryOp>;
+def LanaiAdjDynAlloc : SDNode<"LanaiISD::ADJDYNALLOC", SDT_LanaiAdjDynAlloc>;
+
+// Extract bits 0-15 (low-end) of an immediate value.
+def LO16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint64_t)N->getZExtValue() & 0xffff,
+ SDLoc(N), MVT::i32);
+}]>;
+
+// Extract bits 16-31 (high-end) of an immediate value.
+// Transformation function: shift the immediate value down into the low bits.
+def HI16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint64_t)N->getZExtValue() >> 16, SDLoc(N),
+ MVT::i32);
+}]>;
+
+// Negate an immediate value (computed on its sign-extended form).
+def NEG : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+// Extract bits 0-20 (the low 21 bits) of an immediate value.
+def LO21 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint64_t)N->getZExtValue() & 0x1fffff,
+ SDLoc(N), MVT::i32);
+}]>;
+
+// Branch targets
+def BrTargetAsmOperand : AsmOperandClass {
+ let Name = "BrTarget";
+}
+def BrTarget : Operand<OtherVT> {
+ let ParserMatchClass = BrTargetAsmOperand;
+ let EncoderMethod = "getBranchTargetOpValue";
+ let DecoderMethod = "decodeBranch";
+}
+
+// Call targets. These share the encoder and decoder methods of branch
+// targets but are typed i32 rather than OtherVT.
+def CallTargetAsmOperand : AsmOperandClass {
+ let Name = "CallTarget";
+}
+def CallTarget : Operand<i32> {
+ let ParserMatchClass = CallTargetAsmOperand;
+ let EncoderMethod = "getBranchTargetOpValue";
+ let DecoderMethod = "decodeBranch";
+}
+
+// Shift amount immediate: matches values in the range [-31, 31].
+def ImmShiftAsmOperand : AsmOperandClass { let Name = "ImmShift"; }
+def immShift : Operand<i32>, PatLeaf<(imm), [{
+ int Imm = N->getSExtValue();
+ return Imm >= -31 && Imm <= 31;}]> {
+ let ParserMatchClass = ImmShiftAsmOperand;
+ let DecoderMethod = "decodeShiftImm";
+}
+
+// Immediate that is representable as a signed 10-bit integer.
+def Imm10AsmOperand : AsmOperandClass { let Name = "Imm10"; }
+def imm10 : Operand<i32>, PatLeaf<(imm), [{
+ return isInt<10>(N->getSExtValue()); }]> {
+ let ParserMatchClass = Imm10AsmOperand;
+}
+
+// 16-bit low immediates. All three leaves share the LoImm16 asm operand class
+// and the LO16 transform; they differ only in the predicate.
+def LoImm16AsmOperand : AsmOperandClass { let Name = "LoImm16"; }
+def i32lo16z : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32lo16z predicate - true if the zero-extended 32-bit immediate has only
+ // the rightmost 16 bits set.
+ return ((N->getZExtValue() & 0xFFFFUL) == N->getZExtValue());}], LO16> {
+ let ParserMatchClass = LoImm16AsmOperand;
+}
+def i32neg16 : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32neg16 predicate - true if the 32-bit immediate is negative and can
+ // be represented by a 16 bit integer.
+ int Imm = N->getSExtValue();
+ return (Imm < 0) && (isInt<16>(Imm));}], LO16> {
+ let ParserMatchClass = LoImm16AsmOperand;
+}
+def i32lo16s : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32lo16s predicate - true if the sign-extended 32-bit immediate has only
+ // the rightmost 16 bits set (so negative values never match).
+ return ((int64_t)(N->getSExtValue() & 0xFFFFUL) == N->getSExtValue());}], LO16> {
+ let ParserMatchClass = LoImm16AsmOperand;
+}
+
+def LoImm16AndAsmOperand : AsmOperandClass { let Name = "LoImm16And"; }
+def i32lo16and : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32lo16and predicate - true if the leftmost 16 bits of the 32-bit
+ // immediate are all 1's; the rightmost 16 bits may hold any value.
+ return (N->getZExtValue() >= 0xFFFF0000UL);}], LO16> {
+ let ParserMatchClass = LoImm16AndAsmOperand;
+ let PrintMethod = "printLo16AndImmOperand";
+}
+
+def HiImm16AsmOperand : AsmOperandClass { let Name = "HiImm16"; }
+def i32hi16 : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32hi16 predicate - true if the 32-bit immediate has only leftmost 16
+ // bits set.
+ return ((N->getZExtValue() & 0xFFFF0000UL) == N->getZExtValue());}], HI16> {
+ let ParserMatchClass = HiImm16AsmOperand;
+ let PrintMethod = "printHi16ImmOperand";
+}
+
+def HiImm16AndAsmOperand : AsmOperandClass { let Name = "HiImm16And"; }
+def i32hi16and : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32hi16and predicate - true if the rightmost 16 bits of the 32-bit
+ // immediate are all 1's; the leftmost 16 bits may hold any value.
+ return ((N->getZExtValue() & 0xFFFFUL) == 0xFFFFUL);}], HI16> {
+ let ParserMatchClass = HiImm16AndAsmOperand;
+ let PrintMethod = "printHi16AndImmOperand";
+}
+
+def LoImm21AsmOperand : AsmOperandClass { let Name = "LoImm21"; }
+def i32lo21 : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32lo21 predicate - true if the 32-bit immediate has only rightmost 21
+ // bits set.
+ return ((N->getZExtValue() & 0x1FFFFFUL) == N->getZExtValue());}], LO21> {
+ let ParserMatchClass = LoImm21AsmOperand;
+}
+
+// ALU operation selector. It appears as the $Opcode sub-operand of the
+// register-based memory operands (MEMri, MEMrr, MEMspls).
+def AluOp : Operand<i32> {
+ let PrintMethod = "printAluOperand";
+}
+
+// Addressing modes.
+// ComplexPattern arguments: value type, number of operands produced, name of
+// the C++ selection function, root nodes, and properties.
+def ADDRrr : ComplexPattern<i32, 3, "selectAddrRr", [], []>;
+def ADDRri : ComplexPattern<i32, 3, "selectAddrRi", [frameindex], []>;
+def ADDRsls : ComplexPattern<i32, 1, "selectAddrSls", [frameindex], []>;
+def ADDRspls : ComplexPattern<i32, 3, "selectAddrSpls", [frameindex], []>;
+
+// Address operands
+// All four memory operand classes are parsed by parseMemoryOperand.
+
+// Base register + 16-bit immediate offset + ALU op (three MI operands).
+def MemRegImmAsmOperand : AsmOperandClass {
+ let Name = "MemRegImm";
+ let ParserMethod = "parseMemoryOperand";
+}
+def MEMri : Operand<i32> {
+ let DecoderMethod = "decodeRiMemoryValue";
+ let EncoderMethod = "getRiMemoryOpValue";
+ let MIOperandInfo = (ops GPR:$base, i32lo16s:$offset, AluOp:$Opcode);
+ let ParserMatchClass = MemRegImmAsmOperand;
+ let PrintMethod = "printMemRiOperand";
+}
+
+// Register + register + ALU op (three MI operands).
+def MemRegRegAsmOperand : AsmOperandClass {
+ let Name = "MemRegReg";
+ let ParserMethod = "parseMemoryOperand";
+}
+def MEMrr : Operand<i32> {
+ let DecoderMethod = "decodeRrMemoryValue";
+ let EncoderMethod = "getRrMemoryOpValue";
+ let MIOperandInfo = (ops GPR:$Op1, GPR:$Op2, AluOp:$Opcode);
+ let ParserMatchClass = MemRegRegAsmOperand;
+ let PrintMethod = "printMemRrOperand";
+}
+
+// 21-bit immediate address only (a single MI operand, no base register).
+def MemImmAsmOperand : AsmOperandClass {
+ let Name = "MemImm";
+ let ParserMethod = "parseMemoryOperand";
+}
+def MEMi : Operand<i32> {
+ let MIOperandInfo = (ops i32lo21:$offset);
+ let ParserMatchClass = MemImmAsmOperand;
+ let PrintMethod = "printMemImmOperand";
+}
+
+// Base register + signed 10-bit immediate offset + ALU op (three MI operands).
+def MemSplsAsmOperand : AsmOperandClass {
+ let Name = "MemSpls";
+ let ParserMethod = "parseMemoryOperand";
+}
+def MEMspls : Operand<i32> {
+ let DecoderMethod = "decodeSplsValue";
+ let EncoderMethod = "getSplsOpValue";
+ let MIOperandInfo = (ops GPR:$base, imm10:$offset, AluOp:$Opcode);
+ let ParserMatchClass = MemSplsAsmOperand;
+ let PrintMethod = "printMemSplsOperand";
+}
+
+// Condition code operand, printed via printCCOperand.
+def CCOp : Operand<i32> {
+ let PrintMethod = "printCCOperand";
+}
+
+// Predicate operand. Default to 0 = true.
+def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+
+// PredicateOperand arguments: value type, the operand list (one i32
+// immediate), and the default value used when no predicate is given (0).
+def pred : PredicateOperand<i32, (ops i32imm), (ops (i32 0))> {
+ let PrintMethod = "printPredicateOperand";
+ let ParserMatchClass = CondCodeOperand;
+ let DecoderMethod = "decodePredicateOperand";
+}
+
+// NOP and the LOG* instructions below take no operands, have no selection
+// pattern, and use fixed encodings 0x00000001 through 0x00000006.
+let hasSideEffects = 0, Inst = 0x00000001 in
+ def NOP : InstLanai<(outs), (ins), "nop", []>;
+
+// Special NOPs to change logging level in vlanai.
+let hasSideEffects = 0, Inst = 0x00000002 in
+ def LOG0 : InstLanai<(outs), (ins), "log_0", []>;
+let hasSideEffects = 0, Inst = 0x00000003 in
+ def LOG1 : InstLanai<(outs), (ins), "log_1", []>;
+let hasSideEffects = 0, Inst = 0x00000004 in
+ def LOG2 : InstLanai<(outs), (ins), "log_2", []>;
+let hasSideEffects = 0, Inst = 0x00000005 in
+ def LOG3 : InstLanai<(outs), (ins), "log_3", []>;
+let hasSideEffects = 0, Inst = 0x00000006 in
+ def LOG4 : InstLanai<(outs), (ins), "log_4", []>;
+
+// Map an SPLS instruction onto itself. All other instructions will be mapped
+// onto -1. Used to identify SPLS instructions.
+// The mapping covers instructions deriving from InstSPLS, keyed on AsmString,
+// and uses the "adjustPqBitsSpls" PostEncoderMethod as both key column and
+// value column, which is what makes each instruction map to itself.
+def splsIdempotent : InstrMapping {
+ let FilterClass = "InstSPLS";
+ let RowFields = ["AsmString"];
+ let ColFields = ["PostEncoderMethod"];
+ let KeyCol = ["adjustPqBitsSpls"];
+ let ValueCols = [["adjustPqBitsSpls"]];
+}
+
+// -------------------------------------------------- //
+// ALU instructions
+// -------------------------------------------------- //
+
+// ALUbase defines the two register-immediate forms of an ALU operation:
+// <name>LO (H = 0) taking a LoExt immediate and <name>HI (H = 1) taking a
+// HiExt immediate. The selection patterns are supplied by the caller through
+// loPattern/hiPattern; OpNode is not referenced inside this multiclass.
+multiclass ALUbase<bits<3> subOp, string AsmStr, SDNode OpNode,
+ PatLeaf LoExt, PatLeaf HiExt,
+ list<dag> loPattern, list<dag> hiPattern> {
+ // Register Immediate
+ let H = 0 in
+ def LO : InstRI<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, LoExt:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, $Rd"),
+ loPattern>;
+ let H = 1 in
+ def HI : InstRI<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, HiExt:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, $Rd"),
+ hiPattern>;
+
+}
+
+// ALUarith: the immediate forms (I_LO / I_HI) are defined WITHOUT selection
+// patterns (empty lists), plus a register-register form R that carries a
+// predicate operand and a pattern on OpNode.
+multiclass ALUarith<bits<3> subOp, string AsmStr, SDNode OpNode,
+ PatLeaf LoExt, PatLeaf HiExt> {
+ defm I_ : ALUbase<subOp, AsmStr, OpNode, LoExt, HiExt, [], []>;
+
+ // Register Register
+ let JJJJJ = 0 in
+ def R : InstRR<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, GPR:$Rs2, pred:$DDDI),
+ !strconcat(AsmStr, "$DDDI\t$Rs1, $Rs2, $Rd"),
+ [(set GPR:$Rd, (OpNode GPR:$Rs1, GPR:$Rs2))]>;
+}
+
+// ALUlogic: same shape as ALUarith, except that the immediate forms carry
+// selection patterns built from OpNode and the LoExt/HiExt immediate leaves.
+multiclass ALUlogic<bits<3> subOp, string AsmStr, SDNode OpNode,
+ PatLeaf LoExt, PatLeaf HiExt> {
+ defm I_ : ALUbase<subOp, AsmStr, OpNode, LoExt, HiExt,
+ [(set GPR:$Rd, (OpNode GPR:$Rs1, LoExt:$imm16))],
+ [(set GPR:$Rd, (OpNode GPR:$Rs1, HiExt:$imm16))]>;
+
+ // Register Register
+ let JJJJJ = 0 in
+ def R : InstRR<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, GPR:$Rs2, pred:$DDDI),
+ !strconcat(AsmStr, "$DDDI\t$Rs1, $Rs2, $Rd"),
+ [(set GPR:$Rd, (OpNode GPR:$Rs1, GPR:$Rs2))]>;
+}
+
+// Non flag setting ALU operations
+let isAsCheapAsAMove = 1, F = 0 in {
+ let isCommutable = 1 in {
+ defm ADD_ : ALUarith<0b000, "add", add, i32lo16z, i32hi16>;
+ }
+ defm SUB_ : ALUarith<0b010, "sub", sub, i32lo16z, i32hi16>;
+ let isCommutable = 1 in {
+ defm AND_ : ALUlogic<0b100, "and", and, i32lo16and, i32hi16and>;
+ defm OR_ : ALUlogic<0b101, "or", or, i32lo16z, i32hi16>;
+ defm XOR_ : ALUlogic<0b110, "xor", xor, i32lo16z, i32hi16>;
+ }
+}
+
+def : Pat<(add GPR:$Rs1, i32lo16z:$imm),
+ (ADD_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(sub GPR:$Rs1, i32lo16z:$imm),
+ (SUB_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(add GPR:$Rs1, i32hi16:$imm),
+ (ADD_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(sub GPR:$Rs1, i32hi16:$imm),
+ (SUB_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(i32 i32lo16and:$imm), (AND_I_LO (i32 R1), i32lo16and:$imm)>;
+def : Pat<(i32 i32hi16and:$imm), (AND_I_HI (i32 R1), i32hi16and:$imm)>;
+
+// Turn add/sub of a negative immediate into sub/add of the negated immediate
+def : Pat<(add GPR:$Rs1, i32neg16:$imm),
+ (SUB_I_LO GPR:$Rs1, (NEG $imm))>;
+def : Pat<(sub GPR:$Rs1, i32neg16:$imm),
+ (ADD_I_LO GPR:$Rs1, (NEG $imm))>;
+
+// Flag (incl. carry) setting addition and subtraction
+let F = 1, Defs = [SR] in {
+ defm ADD_F_ : ALUarith<0b000, "add.f", addc, i32lo16z, i32hi16>;
+ defm SUB_F_ : ALUarith<0b010, "sub.f", subc, i32lo16z, i32hi16>;
+}
+
+def : Pat<(addc GPR:$Rs1, i32lo16z:$imm),
+ (ADD_F_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(subc GPR:$Rs1, i32lo16z:$imm),
+ (SUB_F_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(addc GPR:$Rs1, i32hi16:$imm),
+ (ADD_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(subc GPR:$Rs1, i32hi16:$imm),
+ (SUB_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+// Carry-using addition and subtraction
+let F = 0, Uses = [SR] in {
+ defm ADDC_ : ALUarith<0b001, "addc", adde, i32lo16z, i32hi16>;
+ defm SUBB_ : ALUarith<0b011, "subb", sube, i32lo16z, i32hi16>;
+}
+
+def : Pat<(adde GPR:$Rs1, i32lo16z:$imm),
+ (ADDC_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(sube GPR:$Rs1, i32lo16z:$imm),
+ (SUBB_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(adde GPR:$Rs1, i32hi16:$imm),
+ (ADDC_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(sube GPR:$Rs1, i32hi16:$imm),
+ (SUBB_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+// Flag setting ALU operations
+let isAsCheapAsAMove = 1, F = 1, Defs = [SR] in {
+ let isCommutable = 1 in {
+ defm AND_F_ : ALUlogic<0b100, "and.f", and, i32lo16and, i32hi16and>;
+ defm OR_F_ : ALUlogic<0b101, "or.f", or, i32lo16z, i32hi16>;
+ defm XOR_F_ : ALUlogic<0b110, "xor.f", xor, i32lo16z, i32hi16>;
+ }
+}
+
+let isAsCheapAsAMove = 1, F = 1, Defs = [SR], Uses = [SR] in {
+ defm ADDC_F_ : ALUarith<0b001, "addc.f", adde, i32lo16z, i32hi16>;
+ defm SUBB_F_ : ALUarith<0b011, "subb.f", sube, i32lo16z, i32hi16>;
+}
+
+def : Pat<(LanaiSubbF GPR:$Rs1, GPR:$Rs2),
+ (SUBB_F_R GPR:$Rs1, GPR:$Rs2)>;
+
+def : Pat<(LanaiSubbF GPR:$Rs1, i32lo16z:$imm),
+ (SUBB_F_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(LanaiSubbF GPR:$Rs1, i32hi16:$imm),
+ (SUBB_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : InstAlias<"mov $src, $dst", (ADD_R GPR:$dst, GPR:$src, R0, 0)>;
+
+let isAsCheapAsAMove = 1, Rs1 = R0.Num, isCodeGenOnly = 1, H = 1, F = 0,
+ isReMaterializable = 1 in
+ def MOVHI : InstRI<0b000, (outs GPR:$Rd), (ins i32hi16:$imm16),
+ "mov\t$imm16, $Rd",
+ [(set GPR:$Rd, i32hi16:$imm16)]>;
+
+def : InstAlias<"mov $imm16, $dst", (ADD_I_LO GPR:$dst, R0, i32lo16z:$imm16)>;
+def : InstAlias<"mov $imm16, $dst", (ADD_I_HI GPR:$dst, R0, i32hi16:$imm16)>;
+def : InstAlias<"mov $imm16, $dst",
+ (AND_I_LO GPR:$dst, R1, i32lo16and:$imm16)>;
+def : InstAlias<"mov $imm16, $dst",
+ (AND_I_HI GPR:$dst, R1, i32hi16and:$imm16)>;
+
+// Shift instructions
+class ShiftRI<string AsmStr, list<dag> Pattern> // shift by an immShift operand
+ : InstRI<0b111, (outs GPR:$Rd), (ins GPR:$Rs1, immShift:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, $Rd"), Pattern> {
+ let isReMaterializable = 1;
+}
+
+let F = 0 in {
+ let H = 0 in
+ def SL_I : ShiftRI<"sh", [(set GPR:$Rd, (shl GPR:$Rs1, immShift:$imm16))]>;
+ let H = 1 in
+ def SA_I : ShiftRI<"sha", []>;
+}
+def : Pat<(srl GPR:$Rs1, immShift:$imm), (SL_I GPR:$Rs1, (NEG $imm))>;
+def : Pat<(sra GPR:$Rs1, immShift:$imm), (SA_I GPR:$Rs1, (NEG $imm))>;
+
+let F = 1, Defs = [SR] in {
+ let H = 0 in
+ def SL_F_I : ShiftRI<"sh.f", []>;
+ let H = 1 in
+ def SA_F_I : ShiftRI<"sha.f", []>;
+}
+
+class ShiftRR<string AsmStr, list<dag> Pattern>
+ : InstRR<0b111, (outs GPR:$Rd), (ins GPR:$Rs1, GPR:$Rs2, pred:$DDDI), AsmStr,
+ Pattern>;
+
+let F = 0 in {
+ let JJJJJ = 0b10000 in
+ def SHL_R : ShiftRR<"sh$DDDI\t$Rs1, $Rs2, $Rd",
+ [(set GPR:$Rd, (shl GPR:$Rs1, GPR:$Rs2))]>;
+ let isCodeGenOnly = 1 in { // SRL_R: same JJJJJ and asm string as SHL_R
+ let JJJJJ = 0b10000 in
+ def SRL_R : ShiftRR<"sh$DDDI\t$Rs1, $Rs2, $Rd", []>;
+ }
+ let JJJJJ = 0b11000 in
+ def SRA_R : ShiftRR<"sha$DDDI\t$Rs1, $Rs2, $Rd", []>;
+}
+
+let F = 1, Defs = [SR] in {
+ let JJJJJ = 0b10000 in
+ def SHL_F_R : ShiftRR<"sh.f$DDDI\t$Rs1, $Rs2, $Rd", []>;
+ let isCodeGenOnly = 1 in {
+ let JJJJJ = 0b10000 in
+ def SRL_F_R : ShiftRR<"sh.f$DDDI\t$Rs1, $Rs2, $Rd", []>;
+ }
+ let JJJJJ = 0b11000 in
+ def SRA_F_R : ShiftRR<"sha.f$DDDI\t$Rs1, $Rs2, $Rd", []>;
+}
+
+// Expand shift-right operations
+def : Pat<(srl GPR:$Rs1, GPR:$Rs2),
+ (SRL_R GPR:$Rs1, (SUB_R R0, GPR:$Rs2))>;
+def : Pat<(sra GPR:$Rs1, GPR:$Rs2),
+ (SRA_R GPR:$Rs1, (SUB_R R0, GPR:$Rs2))>;
+
+// -------------------------------------------------- //
+// LOAD instructions
+// -------------------------------------------------- //
+
+class LoadRR<string OpcString, PatFrag OpNode, ValueType Ty>
+ : InstRRM<0b0, (outs GPR:$Rd), (ins MEMrr:$src),
+ !strconcat(OpcString, "\t$src, $Rd"),
+ [(set (Ty GPR:$Rd), (OpNode ADDRrr:$src))]>,
+ Sched<[WriteLD]> { // NOTE(review): sets no Itinerary, unlike LoadRI - confirm
+ bits<20> src; // packed MEMrr operand; split into fields below
+
+ let Rs1 = src{19-15};
+ let Rs2 = src{14-10};
+ let P = src{9};
+ let Q = src{8};
+ let BBB = src{7-5};
+ let JJJJJ = src{4-0};
+ let mayLoad = 1;
+}
+
+class LoadRI<string OpcString, PatFrag OpNode, ValueType Ty>
+ : InstRM<0b0, (outs GPR:$Rd), (ins MEMri:$src),
+ !strconcat(OpcString, "\t$src, $Rd"),
+ [(set (Ty GPR:$Rd), (OpNode ADDRri:$src))]>,
+ Sched<[WriteLD]> {
+ bits<23> src; // packed MEMri operand; split into fields below
+
+ let Itinerary = IIC_LD;
+ let Rs1 = src{22-18};
+ let P = src{17};
+ let Q = src{16};
+ let imm16 = src{15-0};
+ let isReMaterializable = 1;
+ let mayLoad = 1;
+}
+
+let E = 0 in {
+ let YL = 0b01 in {
+ // uld is used here and ld in the alias, as the alias is printed out first if
+ // an alias exists.
+ def LDW_RI : LoadRI<"uld", load, i32>;
+ def LDW_RR : LoadRR<"ld", load, i32>;
+ }
+}
+
+def : InstAlias<"ld $src, $dst", (LDW_RI GPR:$dst, MEMri:$src)>;
+
+let E = 1 in {
+ let YL = 0b01 in {
+ def LDWz_RR : LoadRR<"uld", zextloadi32, i32>;
+ }
+}
+
+let E = 1 in {
+ let YL = 0b00 in
+ def LDHz_RR : LoadRR<"uld.h", zextloadi16, i32>;
+ let YL = 0b10 in
+ def LDBz_RR : LoadRR<"uld.b", zextloadi8, i32>;
+}
+
+let E = 0 in {
+ let YL = 0b00 in
+ def LDHs_RR : LoadRR<"ld.h", sextloadi16, i32>;
+ let YL = 0b10 in
+ def LDBs_RR : LoadRR<"ld.b", sextloadi8, i32>;
+}
+
+def LDADDR : InstSLS<0x0, (outs GPR:$Rd), (ins MEMi:$src),
+ "ld\t$src, $Rd",
+ [(set (i32 GPR:$Rd), (load ADDRsls:$src))]>,
+ Sched<[WriteLD]> {
+ bits<21> src; // split into msb (5 bits) and lsb (16 bits) below
+
+ let Itinerary = IIC_LD;
+ let msb = src{20-16};
+ let lsb = src{15-0};
+ let isReMaterializable = 1;
+ let mayLoad = 1;
+}
+
+class LoadSPLS<string asmstring, PatFrag opNode>
+ : InstSPLS<(outs GPR:$Rd), (ins MEMspls:$src),
+ !strconcat(asmstring, "\t$src, $Rd"),
+ [(set (i32 GPR:$Rd), (opNode ADDRspls:$src))]>,
+ Sched<[WriteLDSW]> {
+ bits<17> src; // packed MEMspls operand; split into fields below
+ let Itinerary = IIC_LDSW;
+ let Rs1 = src{16-12};
+ let P = src{11};
+ let Q = src{10};
+ let imm10 = src{9-0};
+ let mayLoad = 1;
+ let isReMaterializable = 1;
+}
+
+let Y = 0, S = 0, E = 1 in
+ def LDHz_RI : LoadSPLS<"uld.h", zextloadi16>;
+
+let Y = 0, S = 0, E = 0 in
+ def LDHs_RI : LoadSPLS<"ld.h", sextloadi16>;
+
+let Y = 1, S = 0, E = 1 in
+ def LDBz_RI : LoadSPLS<"uld.b", zextloadi8>;
+
+let Y = 1, S = 0, E = 0 in
+ def LDBs_RI : LoadSPLS<"ld.b", sextloadi8>;
+
+def SLI : InstSLI<(outs GPR:$Rd), (ins i32lo21:$imm),
+ "mov\t$imm, $Rd",
+ [(set GPR:$Rd, i32lo21:$imm)]> {
+ bits<21> imm; // split into msb (5 bits) and lsb (16 bits) below
+
+ let msb = imm{20-16};
+ let lsb = imm{15-0};
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+}
+
+// -------------------------------------------------- //
+// STORE instructions
+// -------------------------------------------------- //
+
+class StoreRR<string OpcString, PatFrag OpNode, ValueType Ty>
+ : InstRRM<0b1, (outs), (ins GPR:$Rd, MEMrr:$dst),
+ !strconcat(OpcString, "\t$Rd, $dst"),
+ [(OpNode (Ty GPR:$Rd), ADDRrr:$dst)]>,
+ Sched<[WriteST]> {
+ bits<20> dst; // packed MEMrr operand; split into fields below
+
+ let Itinerary = IIC_ST;
+ let Rs1 = dst{19-15};
+ let Rs2 = dst{14-10};
+ let P = dst{9};
+ let Q = dst{8};
+ let BBB = dst{7-5};
+ let JJJJJ = dst{4-0};
+ let mayStore = 1;
+}
+
+class StoreRI<string OpcString, PatFrag OpNode, ValueType Ty>
+ : InstRM<0b1, (outs), (ins GPR:$Rd, MEMri:$dst),
+ !strconcat(OpcString, "\t$Rd, $dst"),
+ [(OpNode (Ty GPR:$Rd), ADDRri:$dst)]>,
+ Sched<[WriteST]> {
+ bits<23> dst; // packed MEMri operand; split into fields below
+
+ let Itinerary = IIC_ST;
+ let Rs1 = dst{22-18};
+ let P = dst{17};
+ let Q = dst{16};
+ let imm16 = dst{15-0};
+ let mayStore = 1;
+}
+
+let YL = 0b01, E = 0 in {
+ def SW_RR : StoreRR<"st", store, i32>;
+ def SW_RI : StoreRI<"st", store, i32>;
+}
+
+let E = 0 in {
+ let YL = 0b00 in
+ def STH_RR : StoreRR<"st.h", truncstorei16, i32>;
+ let YL = 0b10 in
+ def STB_RR : StoreRR<"st.b", truncstorei8, i32>;
+}
+
+def STADDR : InstSLS<0x1, (outs), (ins GPR:$Rd, MEMi:$dst),
+ "st\t$Rd, $dst",
+ [(store (i32 GPR:$Rd), ADDRsls:$dst)]>,
+ Sched<[WriteST]> {
+ bits<21> dst; // split into msb (5 bits) and lsb (16 bits) below
+
+ let Itinerary = IIC_ST;
+ let msb = dst{20-16};
+ let lsb = dst{15-0};
+ let mayStore = 1;
+}
+
+class StoreSPLS<string asmstring, PatFrag opNode>
+ : InstSPLS<(outs), (ins GPR:$Rd, MEMspls:$dst),
+ !strconcat(asmstring, "\t$Rd, $dst"),
+ [(opNode (i32 GPR:$Rd), ADDRspls:$dst)]>,
+ Sched<[WriteSTSW]> {
+ bits<17> dst; // packed MEMspls operand; split into fields below
+
+ let Itinerary = IIC_STSW;
+ let Rs1 = dst{16-12};
+ let P = dst{11};
+ let Q = dst{10};
+ let imm10 = dst{9-0};
+ let mayStore = 1;
+}
+
+let Y = 0, S = 1, E = 0 in
+ def STH_RI : StoreSPLS<"st.h", truncstorei16>;
+
+let Y = 1, S = 1, E = 0 in
+ def STB_RI : StoreSPLS<"st.b", truncstorei8>;
+
+// -------------------------------------------------- //
+// BRANCH instructions
+// -------------------------------------------------- //
+
+let isBranch = 1, isBarrier = 1, isTerminator = 1, hasDelaySlot = 1 in {
+ def BT : InstBR<(outs), (ins BrTarget:$addr),
+ "bt\t$addr",
+ [(br bb:$addr)]> {
+ let DDDI = 0b0000;
+ }
+ let Uses = [SR] in // BRCC takes its condition code as the $DDDI operand
+ def BRCC : InstBR<(outs), (ins BrTarget:$addr, CCOp:$DDDI),
+ "b$DDDI\t$addr",
+ [(LanaiBrCC bb:$addr, imm:$DDDI)]>;
+
+ let isIndirectBranch = 1 in { // JR: selected for brind through $Rs2
+ def JR : InstRR<0b101, (outs), (ins GPR:$Rs2), "bt\t$Rs2",
+ [(brind GPR:$Rs2)]> {
+ let Rs1 = R0.Num;
+ let Rd = R2.Num;
+ let F = 0;
+ let JJJJJ = 0;
+ let DDDI = 0;
+ }
+ }
+}
+
+// -------------------------------------------------- //
+// Condition/SF instructions
+// -------------------------------------------------- //
+
+// Flag-setting instructions for lowering comparisons; Rd is fixed to R0.
+multiclass SF<bits<3> op2Val, string AsmStr> {
+ let F = 1, Rd = R0.Num, JJJJJ = 0, Defs = [SR], DDDI = 0 in
+ def _RR : InstRR<op2Val, (outs), (ins GPR:$Rs1, GPR:$Rs2),
+ !strconcat(AsmStr, "\t$Rs1, $Rs2, %r0"),
+ [(LanaiSetFlag (i32 GPR:$Rs1), (i32 GPR:$Rs2))]>;
+ let F = 1, Rd = R0.Num, H = 0, Defs = [SR] in
+ def _RI_LO : InstRI<op2Val, (outs), (ins GPR:$Rs1, i32lo16z:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, %r0"),
+ [(LanaiSetFlag (i32 GPR:$Rs1), i32lo16z:$imm16)]>;
+ let F = 1, Rd = R0.Num, H = 1, Defs = [SR] in
+ def _RI_HI : InstRI<op2Val, (outs), (ins GPR:$Rs1, i32hi16:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, %r0"),
+ [(LanaiSetFlag (i32 GPR:$Rs1), i32hi16:$imm16)]>;
+}
+let isCodeGenOnly = 1, isCompare = 1 in { // the only instantiation: sub.f
+ defm SFSUB_F : SF<0b010, "sub.f">;
+}
+
+// Jump and link
+let isCall = 1, hasDelaySlot = 1, isCodeGenOnly = 1, Uses = [SP],
+ Defs = [RCA] in {
+ def CALL : Pseudo<(outs), (ins CallTarget:$addr), "", []>;
+ def CALLR : Pseudo<(outs), (ins GPR:$Rs1), "", [(Call GPR:$Rs1)]>;
+}
+
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+ Uses = [RCA] in {
+ def RET : InstRM<0b0, (outs), (ins),
+ "ld\t-4[%fp], %pc ! return",
+ [(RetFlag)]> {
+ let Rd = PC.Num; // the %pc destination in the asm string
+ let Rs1 = FP.Num; // the %fp base in the asm string
+ let P = 1;
+ let Q = 0;
+ let imm16 = -4; // the -4 displacement in the asm string
+
+ // Post encoding is not needed for RET.
+ let PostEncoderMethod = "";
+ }
+}
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SP.
+let Defs = [SP], Uses = [SP] in {
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN $amt",
+ [(CallSeqStart timm:$amt)]>;
+ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP $amt1 $amt2",
+ [(CallSeqEnd timm:$amt1, timm:$amt2)]>;
+}
+
+let Defs = [SP], Uses = [SP] in {
+ def ADJDYNALLOC : Pseudo<(outs GPR:$dst), (ins GPR:$src),
+ "#ADJDYNALLOC $dst $src",
+ [(set GPR:$dst, (LanaiAdjDynAlloc GPR:$src))]>;
+}
+
+let Uses = [SR] in {
+ def SCC : InstSCC<(outs GPR:$Rs1), (ins CCOp:$DDDI),
+ "s$DDDI\t$Rs1",
+ [(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>;
+}
+
+// SCC's output is already 1-bit so and'ing with 1 is redundant.
+def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>;
+
+// Select with hardware support
+let Uses = [SR], isSelect = 1 in {
+ def SELECT : InstRR<0b111, (outs GPR:$Rd),
+ (ins GPR:$Rs1, GPR:$Rs2, CCOp:$DDDI),
+ "sel.$DDDI $Rs1, $Rs2, $Rd",
+ [(set (i32 GPR:$Rd),
+ (LanaiSelectCC (i32 GPR:$Rs1), (i32 GPR:$Rs2),
+ (imm:$DDDI)))]> {
+ let JJJJJ = 0;
+ let F = 0;
+ }
+}
+
+let isBranch = 1, isBarrier = 1, isTerminator = 1, hasDelaySlot = 1,
+ isIndirectBranch = 1, Uses = [SR] in {
+ def BRIND_CC : InstRR<0b101, (outs), (ins GPR:$Rs1, CCOp:$DDDI),
+ "b$DDDI\t$Rs1", []> {
+ let F = 0;
+ let JJJJJ = 0;
+ let Rd = PC.Num;
+ let Rs2 = R0.Num;
+ }
+
+ def BRIND_CCA : InstRR<0b101, (outs), (ins GPR:$Rs1, GPR:$Rs2, CCOp:$DDDI),
+ "b${DDDI}\t$Rs1 add $Rs2", []> {
+ let F = 0;
+ let Rd = PC.Num;
+ let JJJJJ = 0;
+ }
+}
+
+// TODO: This only considers the case where BROFF is an immediate and not where
+// it is a register. Add support for register relative branching.
+let isBranch = 1, isBarrier = 1, isTerminator = 1, hasDelaySlot = 1, Rs1 = 0,
+ Uses = [SR] in
+ def BRR : InstBRR<(outs), (ins i16imm:$imm16, CCOp:$DDDI),
+ "b${DDDI}.r\t$imm16", []>;
+
+let F = 0 in {
+// Population Count (POPC)
+def POPC: InstSpecial<0b001, (outs GPR:$Rd), (ins GPR:$Rs1),
+ "popc\t$Rs1, $Rd",
+ [(set GPR:$Rd, (ctpop GPR:$Rs1))]>;
+
+// Count Leading Zeros (LEADZ)
+def LEADZ: InstSpecial<0b010, (outs GPR:$Rd), (ins GPR:$Rs1),
+ "leadz\t$Rs1, $Rd", [(set GPR:$Rd, (ctlz GPR:$Rs1))]>;
+
+// Count Trailing Zeros (TRAILZ)
+def TRAILZ : InstSpecial<0b011, (outs GPR:$Rd), (ins GPR:$Rs1),
+ "trailz\t$Rs1, $Rd",
+ [(set GPR:$Rd, (cttz GPR:$Rs1))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// unsigned 16-bit immediate
+def : Pat<(i32 i32lo16z:$imm), (OR_I_LO (i32 R0), imm:$imm)>;
+
+// arbitrary immediate
+def : Pat<(i32 imm:$imm), (OR_I_LO (MOVHI (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Calls
+def : Pat<(Call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
+def : Pat<(Call texternalsym:$dst), (CALL texternalsym:$dst)>;
+
+// Loads
+def : Pat<(extloadi8 ADDRspls:$src), (i32 (LDBz_RI ADDRspls:$src))>;
+def : Pat<(extloadi16 ADDRspls:$src), (i32 (LDHz_RI ADDRspls:$src))>;
+
+// GlobalAddress, ExternalSymbol, Jumptable, ConstantPool
+def : Pat<(LanaiHi tglobaladdr:$dst), (MOVHI tglobaladdr:$dst)>;
+def : Pat<(LanaiLo tglobaladdr:$dst), (OR_I_LO (i32 R0), tglobaladdr:$dst)>;
+def : Pat<(LanaiSmall tglobaladdr:$dst), (SLI tglobaladdr:$dst)>;
+def : Pat<(LanaiHi texternalsym:$dst), (MOVHI texternalsym:$dst)>;
+def : Pat<(LanaiLo texternalsym:$dst), (OR_I_LO (i32 R0), texternalsym:$dst)>;
+def : Pat<(LanaiSmall texternalsym:$dst), (SLI texternalsym:$dst)>;
+def : Pat<(LanaiHi tblockaddress:$dst), (MOVHI tblockaddress:$dst)>;
+def : Pat<(LanaiLo tblockaddress:$dst), (OR_I_LO (i32 R0), tblockaddress:$dst)>;
+def : Pat<(LanaiSmall tblockaddress:$dst), (SLI tblockaddress:$dst)>;
+def : Pat<(LanaiHi tjumptable:$dst), (MOVHI tjumptable:$dst)>;
+def : Pat<(LanaiLo tjumptable:$dst), (OR_I_LO (i32 R0), tjumptable:$dst)>;
+def : Pat<(LanaiSmall tjumptable:$dst), (SLI tjumptable:$dst)>;
+def : Pat<(LanaiHi tconstpool:$dst), (MOVHI tconstpool:$dst)>;
+def : Pat<(LanaiLo tconstpool:$dst), (OR_I_LO (i32 R0), tconstpool:$dst)>;
+def : Pat<(LanaiSmall tconstpool:$dst), (SLI tconstpool:$dst)>;
+
+def : Pat<(or GPR:$hi, (LanaiLo tglobaladdr:$lo)),
+ (OR_I_LO GPR:$hi, tglobaladdr:$lo)>;
+def : Pat<(or R0, (LanaiSmall tglobaladdr:$small)),
+ (SLI tglobaladdr:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo texternalsym:$lo)),
+ (OR_I_LO GPR:$hi, texternalsym:$lo)>;
+def : Pat<(or R0, (LanaiSmall texternalsym:$small)),
+ (SLI texternalsym:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo tblockaddress:$lo)),
+ (OR_I_LO GPR:$hi, tblockaddress:$lo)>;
+def : Pat<(or R0, (LanaiSmall tblockaddress:$small)),
+ (SLI tblockaddress:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo tjumptable:$lo)),
+ (OR_I_LO GPR:$hi, tjumptable:$lo)>;
+def : Pat<(or R0, (LanaiSmall tjumptable:$small)),
+ (SLI tjumptable:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo tconstpool:$lo)),
+ (OR_I_LO GPR:$hi, tconstpool:$lo)>;
+def : Pat<(or R0, (LanaiSmall tconstpool:$small)),
+ (SLI tconstpool:$small)>;
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
new file mode 100644
index 000000000000..39c633578d43
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
@@ -0,0 +1,139 @@
+//=-- LanaiMCInstLower.cpp - Convert Lanai MachineInstr to an MCInst --------=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Lanai MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCInstLower.h"
+
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCSymbol *
+LanaiMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+ return Printer.getSymbol(MO.getGlobal()); // symbol of the referenced global
+}
+
+MCSymbol *
+LanaiMCInstLower::GetBlockAddressSymbol(const MachineOperand &MO) const {
+ return Printer.GetBlockAddressSymbol(MO.getBlockAddress()); // via AsmPrinter
+}
+
+MCSymbol *
+LanaiMCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+ return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); // looked up by name
+}
+
+MCSymbol *LanaiMCInstLower::GetJumpTableSymbol(const MachineOperand &MO) const {
+  // The symbol is named "<private-global-prefix>JTI<function-number>_<index>".
+  SmallString<256> SymbolName;
+  raw_svector_ostream OS(SymbolName);
+  OS << Printer.MAI->getPrivateGlobalPrefix() << "JTI";
+  OS << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  return Ctx.getOrCreateSymbol(OS.str());
+}
+
+MCSymbol *
+LanaiMCInstLower::GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  // The symbol is named "<private-global-prefix>CPI<function-number>_<index>".
+  SmallString<256> SymbolName;
+  raw_svector_ostream OS(SymbolName);
+  OS << Printer.MAI->getPrivateGlobalPrefix() << "CPI";
+  OS << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  return Ctx.getOrCreateSymbol(OS.str());
+}
+
+MCOperand LanaiMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                               MCSymbol *Sym) const {
+  // Translate the operand's target flag into the matching expression kind.
+  LanaiMCExpr::VariantKind Kind;
+  switch (MO.getTargetFlags()) {
+  case LanaiII::MO_NO_FLAG:
+    Kind = LanaiMCExpr::VK_Lanai_None;
+    break;
+  case LanaiII::MO_ABS_HI:
+    Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+    break;
+  case LanaiII::MO_ABS_LO:
+    Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+    break;
+  default:
+    llvm_unreachable("Unknown target flag on GV operand");
+  }
+
+  // Symbol reference plus any non-zero offset (not read for jump tables).
+  const MCExpr *SymRef =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+  if (!MO.isJTI() && MO.getOffset())
+    SymRef = MCBinaryExpr::createAdd(
+        SymRef, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+  return MCOperand::createExpr(LanaiMCExpr::create(Kind, SymRef, Ctx));
+}
+
+void LanaiMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  // Convert the operands one by one, in order; operands that have no MCInst
+  // form are skipped with 'continue' and never reach addOperand().
+  for (const MachineOperand &MO : MI->operands()) {
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    case MachineOperand::MO_Register:
+      // Implicit register operands are not emitted.
+      if (MO.isImplicit())
+        continue;
+      MCOp = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::createExpr(
+          MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO, GetBlockAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.h b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.h
new file mode 100644
index 000000000000..6d7818d63d87
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.h
@@ -0,0 +1,47 @@
+//===-- LanaiMCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIMCINSTLOWER_H
+#define LLVM_LIB_TARGET_LANAI_LANAIMCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+// LanaiMCInstLower - This class is used to lower a MachineInstr
+// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY LanaiMCInstLower {
+ MCContext &Ctx; // used to create symbols and expressions
+
+ AsmPrinter &Printer; // supplies symbols, the private prefix, function number
+
+public:
+ LanaiMCInstLower(MCContext &CTX, AsmPrinter &AP) : Ctx(CTX), Printer(AP) {}
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+ MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIMCINSTLOWER_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..c72271b67790
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
@@ -0,0 +1,23 @@
+//===-- LanaiMachineFunctionInfo.cpp - Lanai machine function info --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void LanaiMachineFunctionInfo::anchor() {} // out-of-line, intentionally empty
+
+unsigned LanaiMachineFunctionInfo::getGlobalBaseReg() {
+  // Create the virtual register on first use; 0 means none exists yet.
+  if (!GlobalBaseReg) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    GlobalBaseReg = MRI.createVirtualRegister(&Lanai::GPRRegClass);
+  }
+  return GlobalBaseReg;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h
new file mode 100644
index 000000000000..3bd9112a9e13
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h
@@ -0,0 +1,58 @@
+//===- LanaiMachineFunctionInfo.h - Lanai machine func info ------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares Lanai-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAIMACHINEFUNCTIONINFO_H
+
+#include "LanaiRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+// LanaiMachineFunctionInfo - This class is derived from MachineFunctionInfo and
+// contains private Lanai target-specific information for each MachineFunction.
+class LanaiMachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+ MachineFunction &MF;
+
+ // SRetReturnReg - The Lanai ABI requires that sret lowering includes
+ // returning the value of the returned struct in a register. This field
+ // holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
+ // GlobalBaseReg - keeps track of the virtual register initialized for
+ // use as the global base register. This is used in some PIC relocation
+ // models. Zero until getGlobalBaseReg() creates the register.
+ unsigned GlobalBaseReg;
+
+ // VarArgsFrameIndex - FrameIndex for start of varargs area.
+ int VarArgsFrameIndex;
+
+public:
+ explicit LanaiMachineFunctionInfo(MachineFunction &MF)
+ : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0) {}
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ unsigned getGlobalBaseReg();
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIMACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
new file mode 100644
index 000000000000..7259c02194ca
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -0,0 +1,425 @@
+//===-- LanaiMemAluCombiner.cpp - Pass to combine memory & ALU operations -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Simple pass to combine memory and ALU operations
+//
+// The Lanai ISA supports instructions where a load/store modifies the base
+// register used in the load/store operation. This pass finds suitable
+// load/store and ALU instructions and combines them into one instruction.
+//
+// For example,
+// ld [ %r6 -- ], %r12
+// is a supported instruction that is not currently generated by the instruction
+// selection pass of this backend. This pass generates these instructions by
+// merging
+// add %r6, -4, %r6
+// followed by
+// ld [ %r6 ], %r12
+// in the same machine basic block into one machine instruction.
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define GET_INSTRMAP_INFO
+#include "LanaiGenInstrInfo.inc"
+
+#define DEBUG_TYPE "lanai-mem-alu-combiner"
+
+STATISTIC(NumLdStAluCombined, "Number of memory and ALU instructions combined");
+
+static llvm::cl::opt<bool> DisableMemAluCombiner(
+ "disable-lanai-mem-alu-combiner", llvm::cl::init(false),
+ llvm::cl::desc("Do not combine ALU and memory operators"),
+ llvm::cl::Hidden); // Off by default; when set, runOnMachineFunction bails out.
+
+namespace llvm {
+void initializeLanaiMemAluCombinerPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+typedef MachineBasicBlock::iterator MbbIterator;
+typedef MachineFunction::iterator MfIterator;
+
+class LanaiMemAluCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+ explicit LanaiMemAluCombiner() : MachineFunctionPass(ID) {
+ initializeLanaiMemAluCombinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Lanai load / store optimization pass";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ MbbIterator findClosestSuitableAluInstr(MachineBasicBlock *BB,
+ const MbbIterator &MemInstr,
+ bool Decrement); // Decrement: scan backwards from MemInstr.
+ void insertMergedInstruction(MachineBasicBlock *BB,
+ const MbbIterator &MemInstr,
+ const MbbIterator &AluInstr, bool Before);
+ bool combineMemAluInBasicBlock(MachineBasicBlock *BB); // True if BB changed.
+
+ // Target instruction info, used to look up the descriptor of the merged
+ // opcode. Assigned at the start of runOnMachineFunction.
+ const TargetInstrInfo *TII;
+};
+} // namespace
+
+char LanaiMemAluCombiner::ID = 0;
+
+INITIALIZE_PASS(LanaiMemAluCombiner, DEBUG_TYPE,
+ "Lanai memory ALU combiner pass", false, false)
+
+namespace {
+bool isSpls(uint16_t Opcode) { return Lanai::splsIdempotent(Opcode) == Opcode; } // True when splsIdempotent maps Opcode to itself.
+
+// Select the opcode of the merged instruction from the old memory operation's
+// opcode: the _RI form when the merged offset is an immediate, otherwise the
+// _RR form. Returns 0 for opcodes that are not supported loads/stores.
+unsigned mergedOpcode(unsigned OldOpcode, bool ImmediateOffset) {
+ switch (OldOpcode) {
+ case Lanai::LDW_RI:
+ case Lanai::LDW_RR:
+ if (ImmediateOffset)
+ return Lanai::LDW_RI;
+ return Lanai::LDW_RR;
+ case Lanai::LDHs_RI:
+ case Lanai::LDHs_RR:
+ if (ImmediateOffset)
+ return Lanai::LDHs_RI;
+ return Lanai::LDHs_RR;
+ case Lanai::LDHz_RI:
+ case Lanai::LDHz_RR:
+ if (ImmediateOffset)
+ return Lanai::LDHz_RI;
+ return Lanai::LDHz_RR;
+ case Lanai::LDBs_RI:
+ case Lanai::LDBs_RR:
+ if (ImmediateOffset)
+ return Lanai::LDBs_RI;
+ return Lanai::LDBs_RR;
+ case Lanai::LDBz_RI:
+ case Lanai::LDBz_RR:
+ if (ImmediateOffset)
+ return Lanai::LDBz_RI;
+ return Lanai::LDBz_RR;
+ case Lanai::SW_RI:
+ case Lanai::SW_RR:
+ if (ImmediateOffset)
+ return Lanai::SW_RI;
+ return Lanai::SW_RR;
+ case Lanai::STB_RI:
+ case Lanai::STB_RR:
+ if (ImmediateOffset)
+ return Lanai::STB_RI;
+ return Lanai::STB_RR;
+ case Lanai::STH_RI:
+ case Lanai::STH_RR:
+ if (ImmediateOffset)
+ return Lanai::STH_RI;
+ return Lanai::STH_RR;
+ default:
+ return 0; // Sentinel: callers treat 0 as "not a mergeable memory opcode".
+ }
+}
+
+// Check if the machine instruction is a supported load/store with exactly one
+// memory operand that is not volatile, i.e. a candidate for ALU combining.
+bool isNonVolatileMemoryOp(const MachineInstr &MI) {
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ // Determine if the machine instruction is a supported memory operation by
+ // testing if the computed merge opcode is a valid memory operation opcode.
+ if (mergedOpcode(MI.getOpcode(), false) == 0)
+ return false;
+
+ const MachineMemOperand *MemOperand = *MI.memoperands_begin();
+
+ // Don't move volatile memory accesses
+ if (MemOperand->isVolatile())
+ return false;
+
+ return true;
+}
+
+// Test whether two operands are the same register or the same immediate value.
+// This test is less strict than the MachineOperand::isIdenticalTo function.
+bool isSameOperand(const MachineOperand &Op1, const MachineOperand &Op2) {
+ if (Op1.getType() != Op2.getType())
+ return false;
+
+ switch (Op1.getType()) {
+ case MachineOperand::MO_Register:
+ return Op1.getReg() == Op2.getReg();
+ case MachineOperand::MO_Immediate:
+ return Op1.getImm() == Op2.getImm();
+ default:
+ return false; // Any other operand kind never compares as the same.
+ }
+}
+
+bool isZeroOperand(const MachineOperand &Op) { // Zero = register R0 or imm 0.
+ return ((Op.isReg() && Op.getReg() == Lanai::R0) ||
+ (Op.isImm() && Op.getImm() == 0));
+}
+
+// Determines whether any operand of Instr matches Reg according to isSameOperand.
+bool InstrUsesReg(const MbbIterator &Instr, const MachineOperand *Reg) {
+ for (MachineInstr::const_mop_iterator Mop = Instr->operands_begin();
+ Mop != Instr->operands_end(); ++Mop) {
+ if (isSameOperand(*Mop, *Reg)) // Matches defs as well as uses.
+ return true;
+ }
+ return false;
+}
+
+// Converts a machine ALU opcode to the corresponding LPAC::AluCode.
+// Flag using/modifying ALU operations should not be considered for merging and
+// are omitted from this list; anything not listed yields LPAC::UNKNOWN.
+LPAC::AluCode mergedAluCode(unsigned AluOpcode) {
+ switch (AluOpcode) {
+ case Lanai::ADD_I_LO:
+ case Lanai::ADD_R:
+ return LPAC::ADD;
+ case Lanai::SUB_I_LO:
+ case Lanai::SUB_R:
+ return LPAC::SUB;
+ case Lanai::AND_I_LO:
+ case Lanai::AND_R:
+ return LPAC::AND;
+ case Lanai::OR_I_LO:
+ case Lanai::OR_R:
+ return LPAC::OR;
+ case Lanai::XOR_I_LO:
+ case Lanai::XOR_R:
+ return LPAC::XOR;
+ case Lanai::SHL_R:
+ return LPAC::SHL;
+ case Lanai::SRL_R:
+ return LPAC::SRL;
+ case Lanai::SRA_R:
+ return LPAC::SRA;
+ case Lanai::SA_I: // Listed explicitly, but handled exactly like default.
+ case Lanai::SL_I:
+ default:
+ return LPAC::UNKNOWN;
+ }
+}
+
+// Insert a new combined memory and ALU operation instruction.
+// The new instruction is built with MachineInstrBuilder and inserted before
+// the memory instruction; MemInstr and AluInstr themselves are left in place
+// (combineMemAluInBasicBlock erases them afterwards).
+void LanaiMemAluCombiner::insertMergedInstruction(MachineBasicBlock *BB,
+ const MbbIterator &MemInstr,
+ const MbbIterator &AluInstr,
+ bool Before) {
+ // Insert new combined load/store + alu operation
+ MachineOperand Dest = MemInstr->getOperand(0);
+ MachineOperand Base = MemInstr->getOperand(1);
+ MachineOperand MemOffset = MemInstr->getOperand(2);
+ MachineOperand AluOffset = AluInstr->getOperand(2);
+
+ // Abort if ALU offset is not a register or immediate
+ assert((AluOffset.isReg() || AluOffset.isImm()) &&
+ "Unsupported operand type in merge");
+
+ // Determined merged instructions opcode and ALU code
+ LPAC::AluCode AluOpcode = mergedAluCode(AluInstr->getOpcode());
+ unsigned NewOpc = mergedOpcode(MemInstr->getOpcode(), AluOffset.isImm());
+
+ assert(AluOpcode != LPAC::UNKNOWN && "Unknown ALU code in merging");
+ assert(NewOpc != 0 && "Unknown merged node opcode");
+
+ // Build and insert new machine instruction
+ MachineInstrBuilder InstrBuilder =
+ BuildMI(*BB, MemInstr, MemInstr->getDebugLoc(), TII->get(NewOpc));
+ InstrBuilder.addReg(Dest.getReg(), getDefRegState(true)); // NOTE(review): flagged as a def for store opcodes too - confirm.
+ InstrBuilder.addReg(Base.getReg(), getKillRegState(true));
+
+ // Add offset to machine instruction
+ if (AluOffset.isReg())
+ InstrBuilder.addReg(AluOffset.getReg());
+ else if (AluOffset.isImm())
+ InstrBuilder.addImm(AluOffset.getImm());
+ else
+ llvm_unreachable("Unsupported ld/st ALU merge.");
+
+ // Create a pre-op if the ALU operation preceded the memory operation or the
+ // MemOffset is non-zero (i.e. the memory value should be adjusted before
+ // accessing it), else create a post-op.
+ if (Before || !isZeroOperand(MemOffset))
+ InstrBuilder.addImm(LPAC::makePreOp(AluOpcode));
+ else
+ InstrBuilder.addImm(LPAC::makePostOp(AluOpcode));
+
+ // Transfer memory operands.
+ InstrBuilder->setMemRefs(MemInstr->memoperands_begin(),
+ MemInstr->memoperands_end());
+}
+
+// Determines if the ALU operation at AluIter can be combined with a load/store
+// that uses the given Base and Offset operands (IsSpls picks the imm width).
+bool isSuitableAluInstr(bool IsSpls, const MbbIterator &AluIter,
+ const MachineOperand &Base,
+ const MachineOperand &Offset) {
+ // ALU operations have 3 operands
+ if (AluIter->getNumOperands() != 3)
+ return false;
+
+ MachineOperand &Dest = AluIter->getOperand(0);
+ MachineOperand &Op1 = AluIter->getOperand(1);
+ MachineOperand &Op2 = AluIter->getOperand(2);
+
+ // Only match instructions using the base register as destination and with the
+ // base and first operand equal
+ if (!isSameOperand(Dest, Base) || !isSameOperand(Dest, Op1))
+ return false;
+
+ if (Op2.isImm()) {
+ // It is not a match if the 2nd operand in the ALU operation is an
+ // immediate but the ALU operation is not an addition.
+ if (AluIter->getOpcode() != Lanai::ADD_I_LO)
+ return false;
+
+ if (Offset.isReg() && Offset.getReg() == Lanai::R0) // No range check here.
+ return true;
+
+ if (Offset.isImm() &&
+ ((Offset.getImm() == 0 &&
+ // Check that the Op2 would fit in the immediate field of the
+ // memory operation (10 bits signed for SPLS, 16 bits otherwise).
+ ((IsSpls && isInt<10>(Op2.getImm())) ||
+ (!IsSpls && isInt<16>(Op2.getImm())))) ||
+ Offset.getImm() == Op2.getImm()))
+ return true;
+ } else if (Op2.isReg()) {
+ // The Offset and 2nd operand are both registers and equal
+ if (Offset.isReg() && Op2.getReg() == Offset.getReg())
+ return true;
+ } else
+ // Only consider operations with register or immediate values
+ return false;
+
+ return false; // Immediate/register forms that matched none of the cases.
+}
+
+MbbIterator LanaiMemAluCombiner::findClosestSuitableAluInstr( // Returns MemInstr itself when nothing suitable is found.
+ MachineBasicBlock *BB, const MbbIterator &MemInstr, const bool Decrement) {
+ MachineOperand *Base = &MemInstr->getOperand(1);
+ MachineOperand *Offset = &MemInstr->getOperand(2);
+ bool IsSpls = isSpls(MemInstr->getOpcode());
+
+ MbbIterator First = MemInstr;
+ MbbIterator Last = Decrement ? BB->begin() : BB->end(); // Scan limit.
+
+ while (First != Last) {
+ Decrement ? --First : ++First;
+
+ if (First == Last) // NOTE(review): backwards, BB->begin() itself is never examined.
+ break;
+
+ // Skip over debug instructions
+ if (First->isDebugValue())
+ continue;
+
+ if (isSuitableAluInstr(IsSpls, First, *Base, *Offset)) {
+ return First;
+ }
+
+ // Usage of the base or offset register is not a form suitable for merging.
+ if (First != Last) { // Always true here: the loop already broke on equality.
+ if (InstrUsesReg(First, Base))
+ break;
+ if (Offset->isReg() && InstrUsesReg(First, Offset))
+ break;
+ }
+ }
+
+ return MemInstr;
+}
+
+bool LanaiMemAluCombiner::combineMemAluInBasicBlock(MachineBasicBlock *BB) { // Returns true if BB was changed.
+ bool Modified = false;
+
+ MbbIterator MBBIter = BB->begin(), End = BB->end();
+ while (MBBIter != End) {
+ bool IsMemOp = isNonVolatileMemoryOp(*MBBIter);
+
+ if (IsMemOp) {
+ MachineOperand AluOperand = MBBIter->getOperand(3); // ALU code of the memory op.
+ unsigned int DestReg = MBBIter->getOperand(0).getReg(),
+ BaseReg = MBBIter->getOperand(1).getReg();
+ assert(AluOperand.isImm() && "Unexpected memory operator type");
+ LPAC::AluCode AluOpcode = static_cast<LPAC::AluCode>(AluOperand.getImm());
+
+ // Skip memory operations that already modify the base register or if
+ // the destination and base register are the same
+ if (!LPAC::modifiesOp(AluOpcode) && DestReg != BaseReg) {
+ for (int Inc = 0; Inc <= 1; ++Inc) { // Inc == 0: look backwards first, then forwards.
+ MbbIterator AluIter =
+ findClosestSuitableAluInstr(BB, MBBIter, Inc == 0);
+ if (AluIter != MBBIter) {
+ insertMergedInstruction(BB, MBBIter, AluIter, Inc == 0);
+
+ ++NumLdStAluCombined;
+ Modified = true;
+
+ // Erase the matching ALU instruction
+ BB->erase(AluIter);
+ // Erase old load/store instruction
+ BB->erase(MBBIter++);
+ break;
+ }
+ }
+ }
+ }
+ if (MBBIter == End)
+ break;
+ ++MBBIter; // NOTE(review): after a merge this skips the instruction that followed the erased memory op.
+ }
+
+ return Modified;
+}
+
+// Driver function that runs the combiner over every machine basic block of a
+// machine function; returns true if any block was modified.
+bool LanaiMemAluCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (DisableMemAluCombiner)
+ return false;
+
+ TII = MF.getSubtarget<LanaiSubtarget>().getInstrInfo();
+ bool Modified = false;
+ for (MfIterator MFI = MF.begin(); MFI != MF.end(); ++MFI) {
+ Modified |= combineMemAluInBasicBlock(&*MFI);
+ }
+ return Modified;
+}
+} // namespace
+
+FunctionPass *llvm::createLanaiMemAluCombinerPass() {
+ return new LanaiMemAluCombiner(); // Returns a freshly allocated pass instance.
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
new file mode 100644
index 000000000000..12a2571c28d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -0,0 +1,287 @@
+//===-- LanaiRegisterInfo.cpp - Lanai Register Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiRegisterInfo.h"
+#include "Lanai.h"
+#include "LanaiSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "LanaiGenRegisterInfo.inc"
+
+using namespace llvm;
+
+LanaiRegisterInfo::LanaiRegisterInfo() : LanaiGenRegisterInfo(Lanai::RCA) {} // RCA is also what getRARegister() returns.
+
+const uint16_t *
+LanaiRegisterInfo::getCalleeSavedRegs(const MachineFunction * /*MF*/) const {
+ return CSR_SaveList; // Same list for every function; MF is ignored.
+}
+
+BitVector LanaiRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Aliases are reserved under both names.
+ BitVector Reserved(getNumRegs());
+
+ Reserved.set(Lanai::R0);
+ Reserved.set(Lanai::R1);
+ Reserved.set(Lanai::PC);
+ Reserved.set(Lanai::R2);
+ Reserved.set(Lanai::SP);
+ Reserved.set(Lanai::R4);
+ Reserved.set(Lanai::FP);
+ Reserved.set(Lanai::R5);
+ Reserved.set(Lanai::RR1);
+ Reserved.set(Lanai::R10);
+ Reserved.set(Lanai::RR2);
+ Reserved.set(Lanai::R11);
+ Reserved.set(Lanai::RCA);
+ Reserved.set(Lanai::R15);
+ if (hasBasePointer(MF)) // The base register is reserved only when needed.
+ Reserved.set(getBaseRegister());
+ return Reserved;
+}
+
+bool LanaiRegisterInfo::requiresRegisterScavenging(
+ const MachineFunction & /*MF*/) const {
+ return true; // Unconditional; eliminateFrameIndex asserts a scavenger exists.
+}
+
+bool LanaiRegisterInfo::trackLivenessAfterRegAlloc(
+ const MachineFunction & /*MF*/) const {
+ return true; // Unconditional for every function.
+}
+
+static bool isALUArithLoOpcode(unsigned Opcode) { // True for the add/sub *_I_LO opcodes listed below.
+ switch (Opcode) {
+ case Lanai::ADD_I_LO:
+ case Lanai::SUB_I_LO:
+ case Lanai::ADD_F_I_LO:
+ case Lanai::SUB_F_I_LO:
+ case Lanai::ADDC_I_LO:
+ case Lanai::SUBB_I_LO:
+ case Lanai::ADDC_F_I_LO:
+ case Lanai::SUBB_F_I_LO:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static unsigned getOppositeALULoOpcode(unsigned Opcode) { // Swaps each add form with its subtract counterpart.
+ switch (Opcode) {
+ case Lanai::ADD_I_LO:
+ return Lanai::SUB_I_LO;
+ case Lanai::SUB_I_LO:
+ return Lanai::ADD_I_LO;
+ case Lanai::ADD_F_I_LO:
+ return Lanai::SUB_F_I_LO;
+ case Lanai::SUB_F_I_LO:
+ return Lanai::ADD_F_I_LO;
+ case Lanai::ADDC_I_LO:
+ return Lanai::SUBB_I_LO;
+ case Lanai::SUBB_I_LO:
+ return Lanai::ADDC_I_LO;
+ case Lanai::ADDC_F_I_LO:
+ return Lanai::SUBB_F_I_LO;
+ case Lanai::SUBB_F_I_LO:
+ return Lanai::ADDC_F_I_LO;
+ default:
+ llvm_unreachable("Invalid ALU lo opcode"); // Callers must pre-check with isALUArithLoOpcode.
+ }
+}
+
+static unsigned getRRMOpcodeVariant(unsigned Opcode) { // Maps an _RI load/store opcode to its _RR form.
+ switch (Opcode) {
+ case Lanai::LDBs_RI:
+ return Lanai::LDBs_RR;
+ case Lanai::LDBz_RI:
+ return Lanai::LDBz_RR;
+ case Lanai::LDHs_RI:
+ return Lanai::LDHs_RR;
+ case Lanai::LDHz_RI:
+ return Lanai::LDHz_RR;
+ case Lanai::LDW_RI:
+ return Lanai::LDW_RR;
+ case Lanai::STB_RI:
+ return Lanai::STB_RR;
+ case Lanai::STH_RI:
+ return Lanai::STH_RR;
+ case Lanai::SW_RI:
+ return Lanai::SW_RR;
+ default:
+ llvm_unreachable("Opcode has no RRM variant");
+ }
+}
+
+void LanaiRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const { // RS must be non-null when the offset needs a scratch register.
+ assert(SPAdj == 0 && "Unexpected");
+
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ bool HasFP = TFI->hasFP(MF);
+ DebugLoc DL = MI.getDebugLoc();
+
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+ int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+ MI.getOperand(FIOperandNum + 1).getImm(); // Object offset plus the instruction's own immediate.
+
+ // Addressable stack objects are addressed using neg. offsets from fp
+ // or pos. offsets from sp/basepointer
+ if (!HasFP || (needsStackRealignment(MF) && FrameIndex >= 0))
+ Offset += MF.getFrameInfo().getStackSize();
+
+ unsigned FrameReg = getFrameRegister(MF);
+ if (FrameIndex >= 0) { // Negative frame indices always stay relative to FP.
+ if (hasBasePointer(MF))
+ FrameReg = getBaseRegister();
+ else if (needsStackRealignment(MF))
+ FrameReg = Lanai::SP;
+ }
+
+ // Replace frame index with a frame pointer reference.
+ // If the offset is small enough to fit in the immediate field, directly
+ // encode it.
+ // Otherwise scavenge a register and encode it into a MOVHI, OR_I_LO sequence.
+ if ((isSPLSOpcode(MI.getOpcode()) && !isInt<10>(Offset)) ||
+ !isInt<16>(Offset)) {
+ assert(RS && "Register scavenging must be on");
+ unsigned Reg = RS->FindUnusedReg(&Lanai::GPRRegClass);
+ if (!Reg) // No free register: fall back to scavenging one.
+ Reg = RS->scavengeRegister(&Lanai::GPRRegClass, II, SPAdj);
+ assert(Reg && "Register scavenger failed");
+
+ bool HasNegOffset = false;
+ // ALU ops have unsigned immediate values. If the Offset is negative, we
+ // negate it here and reverse the opcode later.
+ if (Offset < 0) {
+ HasNegOffset = true;
+ Offset = -Offset;
+ }
+
+ if (!isInt<16>(Offset)) {
+ // Reg = hi(offset) | lo(offset)
+ BuildMI(*MI.getParent(), II, DL, TII->get(Lanai::MOVHI), Reg)
+ .addImm(static_cast<uint32_t>(Offset) >> 16);
+ BuildMI(*MI.getParent(), II, DL, TII->get(Lanai::OR_I_LO), Reg)
+ .addReg(Reg)
+ .addImm(Offset & 0xffffU);
+ } else {
+ // Reg = mov(offset)
+ BuildMI(*MI.getParent(), II, DL, TII->get(Lanai::ADD_I_LO), Reg)
+ .addImm(0)
+ .addImm(Offset);
+ }
+ // Reg = FrameReg OP Reg
+ if (MI.getOpcode() == Lanai::ADD_I_LO) {
+ BuildMI(*MI.getParent(), II, DL,
+ HasNegOffset ? TII->get(Lanai::SUB_R) : TII->get(Lanai::ADD_R),
+ MI.getOperand(0).getReg())
+ .addReg(FrameReg)
+ .addReg(Reg)
+ .addImm(LPCC::ICC_T);
+ MI.eraseFromParent(); // The original ADD_I_LO is fully replaced.
+ return;
+ }
+ if (isSPLSOpcode(MI.getOpcode()) || isRMOpcode(MI.getOpcode())) {
+ MI.setDesc(TII->get(getRRMOpcodeVariant(MI.getOpcode())));
+ if (HasNegOffset) {
+ // Change the ALU op (operand 3) from LPAC::ADD (the default) to
+ // LPAC::SUB with the already negated offset.
+ assert((MI.getOperand(3).getImm() == LPAC::ADD) &&
+ "Unexpected ALU op in RRM instruction");
+ MI.getOperand(3).setImm(LPAC::SUB);
+ }
+ } else
+ llvm_unreachable("Unexpected opcode in frame index operation");
+
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false);
+ MI.getOperand(FIOperandNum + 1)
+ .ChangeToRegister(Reg, /*isDef=*/false, /*isImp=*/false,
+ /*isKill=*/true);
+ return;
+ }
+
+ // ALU arithmetic ops take unsigned immediates. If the offset is negative,
+ // we replace the instruction with one that inverts the opcode and negates
+ // the immediate.
+ if ((Offset < 0) && isALUArithLoOpcode(MI.getOpcode())) {
+ unsigned NewOpcode = getOppositeALULoOpcode(MI.getOpcode());
+ // We know this is an ALU op, so we know the operands are as follows:
+ // 0: destination register
+ // 1: source register (frame register)
+ // 2: immediate
+ BuildMI(*MI.getParent(), II, DL, TII->get(NewOpcode),
+ MI.getOperand(0).getReg())
+ .addReg(FrameReg)
+ .addImm(-Offset);
+ MI.eraseFromParent();
+ } else { // Offset fits: rewrite the frame-index operand pair in place.
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ }
+}
+
+bool LanaiRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ // When we need stack realignment and there are dynamic allocas, we can't
+ // reference off of the stack pointer, so we reserve a base pointer.
+ if (needsStackRealignment(MF) && MFI.hasVarSizedObjects())
+ return true;
+
+ return false; // Either condition alone does not require a base pointer.
+}
+
+unsigned LanaiRegisterInfo::getRARegister() const { return Lanai::RCA; } // Return-address register.
+
+unsigned
+LanaiRegisterInfo::getFrameRegister(const MachineFunction & /*MF*/) const {
+ return Lanai::FP; // Always FP, regardless of the function.
+}
+
+unsigned LanaiRegisterInfo::getBaseRegister() const { return Lanai::R14; } // Reserved only when hasBasePointer(MF) holds.
+
+bool LanaiRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ if (!TargetRegisterInfo::canRealignStack(MF)) // Defer entirely to the generic check.
+ return false;
+ return true;
+}
+
+unsigned LanaiRegisterInfo::getEHExceptionRegister() const {
+ llvm_unreachable("no exception support");
+ return 0; // Never reached; follows the llvm_unreachable above.
+}
+
+unsigned LanaiRegisterInfo::getEHHandlerRegister() const {
+ llvm_unreachable("no exception support");
+ return 0; // Never reached; follows the llvm_unreachable above.
+}
+
+const uint32_t *
+LanaiRegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
+ CallingConv::ID /*CC*/) const {
+ return CSR_RegMask; // Same mask for every function and calling convention.
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
new file mode 100644
index 000000000000..8b84bbc460e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
@@ -0,0 +1,63 @@
+//===- LanaiRegisterInfo.h - Lanai Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "LanaiGenRegisterInfo.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class Type;
+
+struct LanaiRegisterInfo : public LanaiGenRegisterInfo { // Lanai implementation of the generated register info.
+ LanaiRegisterInfo();
+
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+
+ // Code Generation virtual methods.
+ const uint16_t *
+ getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ bool canRealignStack(const MachineFunction &MF) const override;
+
+ // Debug information queries.
+ unsigned getRARegister() const; // Return-address register (RCA).
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+ unsigned getBaseRegister() const; // Register used as base pointer (R14).
+ bool hasBasePointer(const MachineFunction &MF) const;
+
+ // Exception handling queries (unsupported: the definitions are unreachable).
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+ int getDwarfRegNum(unsigned RegNum, bool IsEH) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.td b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.td
new file mode 100644
index 000000000000..cf8cfe30cce9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.td
@@ -0,0 +1,64 @@
+//===- LanaiRegisterInfo.td - Lanai Register defs ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Declarations that describe the Lanai register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 5-bit ID numbers.
+class LanaiReg<bits<5> num, string n, list<Register> subregs = [],
+ list<string> altNames = []> : Register<n, altNames> {
+ field bits<5> Num; // Register ID, copied from the num argument below.
+ let Num = num;
+ let Namespace = "Lanai";
+ let SubRegs = subregs; // Empty unless the def is an alias of another register.
+}
+
+let Namespace = "Lanai" in {
+ def sub_32 : SubRegIndex<32>;
+}
+
+// Integer registers
+foreach i = 0-31 in {
+ def R#i : LanaiReg<i, "r"#i>, DwarfRegNum<[i]>;
+}
+
+// Register aliases
+let SubRegIndices = [sub_32] in {
+ def PC : LanaiReg< 2, "pc", [R2]>, DwarfRegAlias<R2>;
+ def SP : LanaiReg< 4, "sp", [R4]>, DwarfRegAlias<R4>;
+ def FP : LanaiReg< 5, "fp", [R5]>, DwarfRegAlias<R5>;
+ def RV : LanaiReg< 8, "rv", [R8]>, DwarfRegAlias<R8>;
+ def RR1 : LanaiReg<10, "rr1", [R10]>, DwarfRegAlias<R10>;
+ def RR2 : LanaiReg<11, "rr2", [R11]>, DwarfRegAlias<R11>;
+ def RCA : LanaiReg<15, "rca", [R15]>, DwarfRegAlias<R15>;
+}
+
+// Define a status register to capture the dependencies between the set flag
+// and setcc instructions
+def SR : LanaiReg< 0, "sw">;
+
+// Register classes. GPR lists each aliased register under both of its names.
+def GPR : RegisterClass<"Lanai", [i32], 32,
+ (add R3, R9, R12, R13, R14, R16, R17,
+ (sequence "R%i", 20, 31),
+ R6, R7, R18, R19, // registers for passing arguments
+ R15, RCA, // register for constant addresses
+ R10, RR1, R11, RR2, // programmer controlled registers
+ R8, RV, // return value
+ R5, FP, // frame pointer
+ R4, SP, // stack pointer
+ R2, PC, // program counter
+ R1, // all 1s (0xffffffff)
+ R0 // constant 0
+ )>;
+
+// Condition code register class
+def CCR : RegisterClass<"Lanai", [i32], 32, (add SR)> {
+ let CopyCost = -1; // Don't allow copying of status registers
+ let isAllocatable = 0;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSchedule.td b/contrib/llvm/lib/Target/Lanai/LanaiSchedule.td
new file mode 100644
index 000000000000..7f931c4be8bb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSchedule.td
@@ -0,0 +1,70 @@
+//=-LanaiSchedule.td - Lanai Scheduling Definitions --*- tablegen -*-=========//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def ALU_FU : FuncUnit;
+def LDST_FU : FuncUnit;
+
+def IIC_ALU : InstrItinClass;
+def IIC_LD : InstrItinClass;
+def IIC_ST : InstrItinClass;
+def IIC_LDSW : InstrItinClass;
+def IIC_STSW : InstrItinClass;
+
+def LanaiItinerary : ProcessorItineraries<[ALU_FU, LDST_FU],[],[
+ InstrItinData<IIC_LD, [InstrStage<1, [LDST_FU]>]>,
+ InstrItinData<IIC_ST, [InstrStage<1, [LDST_FU]>]>,
+ InstrItinData<IIC_LDSW, [InstrStage<2, [LDST_FU]>]>,
+ InstrItinData<IIC_STSW, [InstrStage<2, [LDST_FU]>]>,
+ InstrItinData<IIC_ALU, [InstrStage<1, [ALU_FU]>]>
+]>;
+
+def LanaiSchedModel : SchedMachineModel {
+ // Cycles for loads to access the cache [default = -1]
+ let LoadLatency = 2; // Same value as the WriteLD latency defined in this file.
+
+ // Max micro-ops that can be buffered for optimized loop dispatch/execution.
+ // [default = -1]
+ let LoopMicroOpBufferSize = 0;
+
+ // Allow scheduler to assign default model to any unrecognized opcodes.
+ // [default = 1]
+ let CompleteModel = 0; // Overrides the default of 1 noted above.
+
+ // Max micro-ops that may be scheduled per cycle. [default = 1]
+ let IssueWidth = 1;
+
+ // Extra cycles for a mispredicted branch. [default = -1]
+ let MispredictPenalty = 10;
+
+ // Enable Post RegAlloc Scheduler pass. [default = 0]
+ let PostRAScheduler = 0;
+
+ // Max micro-ops that can be buffered. [default = -1]
+ let MicroOpBufferSize = 0;
+
+ // Per-cycle resources tables. [default = NoItineraries]
+ let Itineraries = LanaiItinerary;
+}
+
+def ALU : ProcResource<1> { let BufferSize = 0; }
+def LdSt : ProcResource<1> { let BufferSize = 0; }
+
+def WriteLD : SchedWrite;
+def WriteST : SchedWrite;
+def WriteLDSW : SchedWrite;
+def WriteSTSW : SchedWrite;
+def WriteALU : SchedWrite;
+
+let SchedModel = LanaiSchedModel in {
+ def : WriteRes<WriteLD, [LdSt]> { let Latency = 2; }
+ def : WriteRes<WriteST, [LdSt]> { let Latency = 2; }
+ def : WriteRes<WriteLDSW, [LdSt]> { let Latency = 2; }
+ def : WriteRes<WriteSTSW, [LdSt]> { let Latency = 4; }
+ def : WriteRes<WriteALU, [ALU]> { let Latency = 1; }
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..b71c30fe3e05
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
@@ -0,0 +1,35 @@
+//===-- LanaiSelectionDAGInfo.cpp - Lanai SelectionDAG Info -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LanaiSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiSelectionDAGInfo.h"
+
+#include "LanaiTargetMachine.h"
+
+#define DEBUG_TYPE "lanai-selectiondag-info"
+
+namespace llvm {
+
+SDValue LanaiSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG & /*DAG*/, const SDLoc & /*dl*/, SDValue /*Chain*/,
+ SDValue /*Dst*/, SDValue /*Src*/, SDValue Size, unsigned /*Align*/,
+ bool /*isVolatile*/, bool /*AlwaysInline*/,
+ MachinePointerInfo /*DstPtrInfo*/,
+ MachinePointerInfo /*SrcPtrInfo*/) const { // Only Size is inspected.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue(); // Non-constant size: empty result.
+
+ return SDValue(); // Constant size: also an empty result, so both paths agree.
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h
new file mode 100644
index 000000000000..bfd2be2ede09
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h
@@ -0,0 +1,36 @@
+//===-- LanaiSelectionDAGInfo.h - Lanai SelectionDAG Info -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Lanai subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAISELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAISELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class LanaiSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+ LanaiSelectionDAGInfo() = default;
+
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, // The only hook overridden here.
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAISELECTIONDAGINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.cpp b/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
new file mode 100644
index 000000000000..0fa5e82a7a66
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -0,0 +1,47 @@
+//===- LanaiSubtarget.cpp - Lanai Subtarget Information -----------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Lanai specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiSubtarget.h"
+
+#include "Lanai.h"
+
+#define DEBUG_TYPE "lanai-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "LanaiGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+// Parses the subtarget feature string; an empty CPU name selects "generic".
+void LanaiSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+  const std::string SelectedCPU =
+      CPU.empty() ? std::string("generic") : CPU.str();
+
+  ParseSubtargetFeatures(SelectedCPU, FS);
+}
+
+LanaiSubtarget &LanaiSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ initSubtargetFeatures(CPU, FS);
+ return *this;
+}
+
+LanaiSubtarget::LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
+ StringRef FeatureString, const TargetMachine &TM,
+ const TargetOptions & /*Options*/,
+ CodeModel::Model /*CodeModel*/,
+ CodeGenOpt::Level /*OptLevel*/)
+ : LanaiGenSubtargetInfo(TargetTriple, Cpu, FeatureString),
+ FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)),
+ InstrInfo(), TLInfo(TM, *this), TSInfo() {}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.h b/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.h
new file mode 100644
index 000000000000..2732ef3097ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.h
@@ -0,0 +1,76 @@
+//=====-- LanaiSubtarget.h - Define Subtarget for the Lanai -----*- C++ -*--==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Lanai specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAISUBTARGET_H
+#define LLVM_LIB_TARGET_LANAI_LANAISUBTARGET_H
+
+#include "LanaiFrameLowering.h"
+#include "LanaiISelLowering.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "LanaiGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class LanaiSubtarget : public LanaiGenSubtargetInfo {
+public:
+ // This constructor initializes the data members to match that
+ // of the specified triple.
+ LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
+ StringRef FeatureString, const TargetMachine &TM,
+ const TargetOptions &Options, CodeModel::Model CodeModel,
+ CodeGenOpt::Level OptLevel);
+
+ // ParseSubtargetFeatures - Parses features string setting specified
+ // subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ LanaiSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+ void initSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ bool enableMachineScheduler() const override { return true; }
+
+ const LanaiInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+ const TargetFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const LanaiRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const LanaiTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const LanaiSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+private:
+ LanaiFrameLowering FrameLowering;
+ LanaiInstrInfo InstrInfo;
+ LanaiTargetLowering TLInfo;
+ LanaiSelectionDAGInfo TSInfo;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAISUBTARGET_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
new file mode 100644
index 000000000000..2a9bc25d7fad
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -0,0 +1,113 @@
+//===-- LanaiTargetMachine.cpp - Define TargetMachine for Lanai ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Lanai target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiTargetMachine.h"
+
+#include "Lanai.h"
+#include "LanaiTargetObjectFile.h"
+#include "LanaiTargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeLanaiMemAluCombinerPass(PassRegistry &);
+} // namespace llvm
+
+extern "C" void LLVMInitializeLanaiTarget() {
+ // Register the target.
+ RegisterTargetMachine<LanaiTargetMachine> registered_target(
+ getTheLanaiTarget());
+}
+
+static std::string computeDataLayout() {
+  // Data layout; must stay in sync with clang/lib/Basic/Targets.cpp.
+  std::string Layout = "E"; // Big endian
+  Layout += "-m:e";         // ELF name mangling
+  Layout += "-p:32:32";     // 32-bit pointers, 32 bit aligned
+  Layout += "-i64:64";      // 64 bit integers, 64 bit aligned
+  Layout += "-a:0:32";      // 32 bit alignment of objects of aggregate type
+  Layout += "-n32";         // 32 bit native integer width
+  return Layout + "-S64";   // 64 bit natural stack alignment
+}
+
+// Resolves the relocation model, falling back to PIC when none is requested.
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  // An explicitly requested model always wins over the default.
+  return RM.hasValue() ? *RM : Reloc::PIC_;
+}
+
+LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT,
+ StringRef Cpu, StringRef FeatureString,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CodeModel,
+ CodeGenOpt::Level OptLevel)
+ : LLVMTargetMachine(T, computeDataLayout(), TT, Cpu, FeatureString, Options,
+ getEffectiveRelocModel(RM), CodeModel, OptLevel),
+ Subtarget(TT, Cpu, FeatureString, *this, Options, CodeModel, OptLevel),
+ TLOF(new LanaiTargetObjectFile()) {
+ initAsmInfo();
+}
+
+TargetIRAnalysis LanaiTargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(LanaiTTIImpl(this, F));
+ });
+}
+
+namespace {
+// Lanai Code Generator Pass Configuration Options.
+class LanaiPassConfig : public TargetPassConfig {
+public:
+ LanaiPassConfig(LanaiTargetMachine *TM, PassManagerBase *PassManager)
+ : TargetPassConfig(TM, *PassManager) {}
+
+ LanaiTargetMachine &getLanaiTargetMachine() const {
+ return getTM<LanaiTargetMachine>();
+ }
+
+ bool addInstSelector() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *
+LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) {
+ return new LanaiPassConfig(this, &PassManager);
+}
+
+// Install an instruction selector pass.
+bool LanaiPassConfig::addInstSelector() {
+ addPass(createLanaiISelDag(getLanaiTargetMachine()));
+ return false;
+}
+
+// Implemented by targets that want to run passes immediately before
+// machine code is emitted.
+void LanaiPassConfig::addPreEmitPass() {
+ addPass(createLanaiDelaySlotFillerPass(getLanaiTargetMachine()));
+}
+
+// Run passes after prolog-epilog insertion and before the second instruction
+// scheduling pass.
+void LanaiPassConfig::addPreSched2() {
+ addPass(createLanaiMemAluCombinerPass());
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h
new file mode 100644
index 000000000000..5278c70d909d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h
@@ -0,0 +1,55 @@
+//===-- LanaiTargetMachine.h - Define TargetMachine for Lanai --- C++ ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Lanai specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
+#define LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
+
+#include "LanaiFrameLowering.h"
+#include "LanaiISelLowering.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiSelectionDAGInfo.h"
+#include "LanaiSubtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+
+class LanaiTargetMachine : public LLVMTargetMachine {
+ LanaiSubtarget Subtarget;
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+
+public:
+ LanaiTargetMachine(const Target &TheTarget, const Triple &TargetTriple,
+ StringRef Cpu, StringRef FeatureString,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RelocationModel,
+ CodeModel::Model CodeModel, CodeGenOpt::Level OptLevel);
+
+ const LanaiSubtarget *
+ getSubtargetImpl(const llvm::Function & /*Fn*/) const override {
+ return &Subtarget;
+ }
+
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &pass_manager) override;
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
new file mode 100644
index 000000000000..7475dbd68ae4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
@@ -0,0 +1,132 @@
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiTargetObjectFile.h"
+
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> SSThreshold(
+ "lanai-ssection-threshold", cl::Hidden,
+ cl::desc("Small data and bss section threshold size (default=0)"),
+ cl::init(0));
+
+void LanaiTargetObjectFile::Initialize(MCContext &Ctx,
+                                       const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+  // Both small sections are created writable and allocated.
+  const unsigned Flags = ELF::SHF_WRITE | ELF::SHF_ALLOC;
+  MCContext &Context = getContext();
+  SmallDataSection = Context.getELFSection(".sdata", ELF::SHT_PROGBITS, Flags);
+  SmallBSSSection = Context.getELFSection(".sbss", ELF::SHT_NOBITS, Flags);
+}
+
+// Objects of non-zero size no larger than the small section threshold live in
+// a small section and must be addressed using the gp_rel operator.
+static bool isInSmallSection(uint64_t Size) {
+  // gcc never treats zero-sized objects as small data; this is de facto ABI.
+  if (Size == 0)
+    return false;
+  return Size <= SSThreshold;
+}
+
+// Return true if this global should be placed into the small data/bss section.
+bool LanaiTargetObjectFile::isGlobalInSmallSection(
+    const GlobalObject *GO, const TargetMachine &TM) const {
+  if (!GO)
+    return false;
+
+  // getKindForGlobal() may only be queried for global definitions, so
+  // declarations (and available_externally globals) are classified without
+  // consulting the section kind.
+  const bool HasNoLocalDefinition =
+      GO->isDeclaration() || GO->hasAvailableExternallyLinkage();
+  return HasNoLocalDefinition
+             ? isGlobalInSmallSectionImpl(GO, TM)
+             : isGlobalInSmallSection(GO, TM, getKindForGlobal(GO, TM));
+}
+
+// True if GO qualifies for a small section and Kind is data, BSS or common.
+bool LanaiTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
+                                                   const TargetMachine &TM,
+                                                   SectionKind Kind) const {
+  if (!isGlobalInSmallSectionImpl(GO, TM))
+    return false;
+  return Kind.isData() || Kind.isBSS() || Kind.isCommon();
+}
+
+// Decide whether a global belongs in the small data/bss section. Everything
+// except the section kind is checked here; callers handle the kind
+// themselves.
+bool LanaiTargetObjectFile::isGlobalInSmallSectionImpl(
+    const GlobalObject *GO, const TargetMachine &TM) const {
+  // Functions never qualify; only global variables do.
+  const GlobalVariable *Var = dyn_cast<GlobalVariable>(GO);
+  if (Var == nullptr)
+    return false;
+
+  // Global values placed in sections starting with .ldata do not fit in
+  // 21-bits, so always use large memory access for them. FIXME: This is a
+  // workaround for a tool limitation.
+  if (Var->getSection().startswith(".ldata"))
+    return false;
+
+  if (TM.getCodeModel() == CodeModel::Small)
+    return true;
+
+  if (Var->hasLocalLinkage())
+    return false;
+
+  const bool IsExternalDecl =
+      Var->hasExternalLinkage() && Var->isDeclaration();
+  if (IsExternalDecl || Var->hasCommonLinkage())
+    return false;
+
+  const DataLayout &DL = Var->getParent()->getDataLayout();
+  return isInSmallSection(DL.getTypeAllocSize(Var->getValueType()));
+}
+
+MCSection *LanaiTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  // Small-section classification: only BSS and data kinds have a dedicated
+  // small section (.sbss and .sdata respectively).
+  const bool HasSmallVariant = Kind.isBSS() || Kind.isData();
+  if (HasSmallVariant && isGlobalInSmallSection(GO, TM, Kind))
+    return Kind.isBSS() ? SmallBSSSection : SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+/// Return true if this constant should be placed into small data section.
+bool LanaiTargetObjectFile::isConstantInSmallSection(const DataLayout &DL,
+ const Constant *CN) const {
+ return isInSmallSection(DL.getTypeAllocSize(CN->getType()));
+}
+
+MCSection *LanaiTargetObjectFile::getSectionForConstant(const DataLayout &DL,
+ SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const {
+ if (isConstantInSmallSection(DL, C))
+ return SmallDataSection;
+
+ // Otherwise, we work the same as ELF.
+ return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h
new file mode 100644
index 000000000000..99ec1956da4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h
@@ -0,0 +1,46 @@
+//===-- LanaiTargetObjectFile.h - Lanai Object Info -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_LANAI_LANAITARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+class LanaiTargetMachine;
+class LanaiTargetObjectFile : public TargetLoweringObjectFileELF {
+ MCSection *SmallDataSection;
+ MCSection *SmallBSSSection;
+
+ bool isGlobalInSmallSection(const GlobalObject *GO, const TargetMachine &TM,
+ SectionKind Kind) const;
+ bool isGlobalInSmallSectionImpl(const GlobalObject *GO,
+ const TargetMachine &TM) const;
+
+public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+ /// Return true if this global address should be placed into small data/bss
+ /// section.
+ bool isGlobalInSmallSection(const GlobalObject *GO,
+ const TargetMachine &TM) const;
+
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
+
+ /// Return true if this constant should be placed into small data section.
+ bool isConstantInSmallSection(const DataLayout &DL, const Constant *CN) const;
+
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const override;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAITARGETOBJECTFILE_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
new file mode 100644
index 000000000000..7fcb3ce45bbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -0,0 +1,81 @@
+//===-- LanaiTargetTransformInfo.h - Lanai specific TTI ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a TargetTransformInfo::Concept conforming object specific to the
+// Lanai target machine. It uses the target's detailed information to
+// provide more precise answers to certain TTI queries, while letting the
+// target independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAITARGETTRANSFORMINFO_H
+
+#include "Lanai.h"
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+class LanaiTTIImpl : public BasicTTIImplBase<LanaiTTIImpl> {
+ typedef BasicTTIImplBase<LanaiTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ const LanaiSubtarget *ST;
+ const LanaiTargetLowering *TLI;
+
+ const LanaiSubtarget *getST() const { return ST; }
+ const LanaiTargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit LanaiTTIImpl(const LanaiTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ bool shouldBuildLookupTables() const { return false; }
+
+ TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
+ if (TyWidth == 32)
+ return TTI::PSK_FastHardware;
+ return TTI::PSK_Software;
+ }
+
+ unsigned getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ switch (ISD) {
+ default:
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ case ISD::MUL:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::UREM:
+ // This increases the cost associated with multiplication and division
+ // to 64 times what the baseline arithmetic cost is. The arithmetic
+ // instruction cost was arbitrarily chosen to reduce the desirability
+ // of emitting arithmetic instructions that are emulated in software.
+ // TODO: Investigate the performance impact given specialized lowerings.
+ return 64 * BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ }
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAITARGETTRANSFORMINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
new file mode 100644
index 000000000000..a04fe8112fb9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -0,0 +1,172 @@
+//===-- LanaiAsmBackend.cpp - Lanai Assembler Backend ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiFixupKinds.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Prepare value for the target space. All supported kinds pass the value
+// through as-is; note the unsigned return type narrows the 64-bit input.
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+  case Lanai::FIXUP_LANAI_21:
+  case Lanai::FIXUP_LANAI_21_F:
+  case Lanai::FIXUP_LANAI_25:
+  case Lanai::FIXUP_LANAI_32:
+  case Lanai::FIXUP_LANAI_HI16:
+  case Lanai::FIXUP_LANAI_LO16:
+    return Value;
+  }
+}
+
+namespace {
+class LanaiAsmBackend : public MCAsmBackend {
+ Triple::OSType OSType;
+
+public:
+ LanaiAsmBackend(const Target &T, Triple::OSType OST)
+ : MCAsmBackend(), OSType(OST) {}
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ // No instruction requires relaxation
+ bool fixupNeedsRelaxation(const MCFixup & /*Fixup*/, uint64_t /*Value*/,
+ const MCRelaxableFragment * /*DF*/,
+ const MCAsmLayout & /*Layout*/) const override {
+ return false;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+ unsigned getNumFixupKinds() const override {
+ return Lanai::NumTargetFixupKinds;
+ }
+
+ bool mayNeedRelaxation(const MCInst & /*Inst*/) const override {
+ return false;
+ }
+
+ void relaxInstruction(const MCInst & /*Inst*/,
+ const MCSubtargetInfo & /*STI*/,
+ MCInst & /*Res*/) const override {}
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+// Emits Count bytes of padding as 32-bit 0x15000000 words; fails when Count
+// is not a multiple of 4.
+bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  if (Count % 4 != 0)
+    return false;
+  for (uint64_t Words = Count / 4; Words != 0; --Words)
+    OW->write32(0x15000000);
+  return true;
+}
+
+void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned /*DataSize*/, uint64_t Value,
+ bool /*IsPCRel*/) const {
+ MCFixupKind Kind = Fixup.getKind();
+ Value = adjustFixupValue(static_cast<unsigned>(Kind), Value);
+
+ if (!Value)
+ return; // A zero value leaves the encoding unchanged
+
+ // Where in the object the fixup applies, and the number of bytes that need
+ // fixing up
+ unsigned Offset = Fixup.getOffset();
+ unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
+ unsigned FullSize = 4;
+
+ // Accumulates the bits currently stored at the fixup location.
+ uint64_t CurVal = 0;
+
+ // Big-endian read of the word's low bytes. NOTE(review): NumBytes > 4 wraps Idx.
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = (FullSize - 1 - i);
+ CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Offset + Idx]))
+ << (i * 8);
+ }
+
+ uint64_t Mask =
+ (static_cast<uint64_t>(-1) >> (64 - getFixupKindInfo(Kind).TargetSize));
+ CurVal |= Value & Mask;
+
+ // Write the patched bytes back to the code/data bits, in the same byte order.
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = (FullSize - 1 - i);
+ Data[Offset + Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff);
+ }
+}
+
+MCObjectWriter *
+LanaiAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+ return createLanaiELFObjectWriter(OS,
+ MCELFObjectTargetWriter::getOSABI(OSType));
+}
+
+const MCFixupKindInfo &
+LanaiAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ static const MCFixupKindInfo Infos[Lanai::NumTargetFixupKinds] = {
+ // This table *must* be in the same order as the fixup_* kinds in
+ // LanaiFixupKinds.h.
+ // Note: The number of bits indicated here are assumed to be contiguous.
+ // This does not hold true for LANAI_21 and LANAI_21_F which are applied
+ // to bits 0x7cffff and 0x7cfffc, respectively. Since the 'bits' counts
+ // here are used only for cosmetic purposes, we set the size to 16 bits
+ // for these 21-bit relocation as llvm/lib/MC/MCAsmStreamer.cpp checks
+ // no bits are set in the fixup range.
+ //
+ // name offset bits flags
+ {"FIXUP_LANAI_NONE", 0, 32, 0},
+ {"FIXUP_LANAI_21", 16, 16 /*21*/, 0},
+ {"FIXUP_LANAI_21_F", 16, 16 /*21*/, 0},
+ {"FIXUP_LANAI_25", 7, 25, 0},
+ {"FIXUP_LANAI_32", 0, 32, 0},
+ {"FIXUP_LANAI_HI16", 16, 16, 0},
+ {"FIXUP_LANAI_LO16", 16, 16, 0}};
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
+} // namespace
+
+MCAsmBackend *llvm::createLanaiAsmBackend(const Target &T,
+ const MCRegisterInfo & /*MRI*/,
+ const Triple &TT, StringRef /*CPU*/,
+ const MCTargetOptions & /*Options*/) {
+ if (!TT.isOSBinFormatELF())
+ llvm_unreachable("OS not supported");
+
+ return new LanaiAsmBackend(T, TT.getOS());
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
new file mode 100644
index 000000000000..ce7f83509c9b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
@@ -0,0 +1,119 @@
+//===-- LanaiBaseInfo.h - Top level definitions for Lanai MC ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Lanai target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIBASEINFO_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIBASEINFO_H
+
+#include "LanaiMCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+// LanaiII - This namespace holds all of the target specific flags that
+// instruction info tracks.
+namespace LanaiII {
+// Target Operand Flag enum.
+enum TOF {
+ //===------------------------------------------------------------------===//
+ // Lanai Specific MachineOperand flags.
+ MO_NO_FLAG,
+
+ // MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol
+ // address.
+ MO_ABS_HI,
+ MO_ABS_LO,
+};
+} // namespace LanaiII
+
+static inline unsigned getLanaiRegisterNumbering(unsigned Reg) {
+ switch (Reg) {
+ case Lanai::R0:
+ return 0;
+ case Lanai::R1:
+ return 1;
+ case Lanai::R2:
+ case Lanai::PC:
+ return 2;
+ case Lanai::R3:
+ return 3;
+ case Lanai::R4:
+ case Lanai::SP:
+ return 4;
+ case Lanai::R5:
+ case Lanai::FP:
+ return 5;
+ case Lanai::R6:
+ return 6;
+ case Lanai::R7:
+ return 7;
+ case Lanai::R8:
+ case Lanai::RV:
+ return 8;
+ case Lanai::R9:
+ return 9;
+ case Lanai::R10:
+ case Lanai::RR1:
+ return 10;
+ case Lanai::R11:
+ case Lanai::RR2:
+ return 11;
+ case Lanai::R12:
+ return 12;
+ case Lanai::R13:
+ return 13;
+ case Lanai::R14:
+ return 14;
+ case Lanai::R15:
+ case Lanai::RCA:
+ return 15;
+ case Lanai::R16:
+ return 16;
+ case Lanai::R17:
+ return 17;
+ case Lanai::R18:
+ return 18;
+ case Lanai::R19:
+ return 19;
+ case Lanai::R20:
+ return 20;
+ case Lanai::R21:
+ return 21;
+ case Lanai::R22:
+ return 22;
+ case Lanai::R23:
+ return 23;
+ case Lanai::R24:
+ return 24;
+ case Lanai::R25:
+ return 25;
+ case Lanai::R26:
+ return 26;
+ case Lanai::R27:
+ return 27;
+ case Lanai::R28:
+ return 28;
+ case Lanai::R29:
+ return 29;
+ case Lanai::R30:
+ return 30;
+ case Lanai::R31:
+ return 31;
+ default:
+ llvm_unreachable("Unknown register number!");
+ }
+}
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIBASEINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
new file mode 100644
index 000000000000..e30d5e9a18eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
@@ -0,0 +1,95 @@
+//===-- LanaiELFObjectWriter.cpp - Lanai ELF Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiFixupKinds.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class LanaiELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ explicit LanaiELFObjectWriter(uint8_t OSABI);
+
+ ~LanaiELFObjectWriter() override;
+
+protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+ bool needsRelocateWithSymbol(const MCSymbol &SD,
+ unsigned Type) const override;
+};
+} // namespace
+
+LanaiELFObjectWriter::LanaiELFObjectWriter(uint8_t OSABI)
+ : MCELFObjectTargetWriter(/*Is64Bit_=*/false, OSABI, ELF::EM_LANAI,
+ /*HasRelocationAddend=*/true) {}
+
+LanaiELFObjectWriter::~LanaiELFObjectWriter() {}
+
+// Translate a fixup kind into the ELF relocation type that represents it.
+// FK_Data_4 is emitted as R_LANAI_32, exactly like FIXUP_LANAI_32. Any kind
+// without an entry below is considered unreachable. The context, target
+// value and PC-relative flag do not influence the result.
+unsigned LanaiELFObjectWriter::getRelocType(MCContext & /*Ctx*/,
+                                            const MCValue & /*Target*/,
+                                            const MCFixup &Fixup,
+                                            bool /*IsPCRel*/) const {
+  switch (static_cast<unsigned>(Fixup.getKind())) {
+  case Lanai::FIXUP_LANAI_21:
+    return ELF::R_LANAI_21;
+
+  case Lanai::FIXUP_LANAI_21_F:
+    return ELF::R_LANAI_21_F;
+
+  case Lanai::FIXUP_LANAI_25:
+    return ELF::R_LANAI_25;
+
+  case Lanai::FIXUP_LANAI_32:
+  case FK_Data_4:
+    return ELF::R_LANAI_32;
+
+  case Lanai::FIXUP_LANAI_HI16:
+    return ELF::R_LANAI_HI16;
+
+  case Lanai::FIXUP_LANAI_LO16:
+    return ELF::R_LANAI_LO16;
+
+  case Lanai::FIXUP_LANAI_NONE:
+    return ELF::R_LANAI_NONE;
+
+  default:
+    llvm_unreachable("Invalid fixup kind!");
+  }
+}
+
+bool LanaiELFObjectWriter::needsRelocateWithSymbol(const MCSymbol & /*SD*/,
+ unsigned Type) const {
+ switch (Type) {
+ case ELF::R_LANAI_21:
+ case ELF::R_LANAI_21_F:
+ case ELF::R_LANAI_25:
+ case ELF::R_LANAI_32:
+ case ELF::R_LANAI_HI16:
+ return true;
+ default:
+ return false;
+ }
+}
+
+MCObjectWriter *llvm::createLanaiELFObjectWriter(raw_pwrite_stream &OS,
+ uint8_t OSABI) {
+ MCELFObjectTargetWriter *MOTW = new LanaiELFObjectWriter(OSABI);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
new file mode 100644
index 000000000000..9ff8340d2922
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
@@ -0,0 +1,43 @@
+//===-- LanaiFixupKinds.h - Lanai Specific Fixup Entries --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIFIXUPKINDS_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Lanai {
+// Although most of the current fixup types correspond to a unique relocation,
+// one relocation can have several fixup types, so every fixup needs its own
+// unique name.
+//
+// This table *must* be in the same order as
+// MCFixupKindInfo Infos[Lanai::NumTargetFixupKinds]
+// in LanaiAsmBackend.cpp.
+//
+enum Fixups {
+  // Results in R_LANAI_NONE; first target-specific fixup kind.
+  FIXUP_LANAI_NONE = FirstTargetFixupKind,
+
+  FIXUP_LANAI_21,   // 21-bit symbol relocation
+  FIXUP_LANAI_21_F, // 21-bit symbol relocation, last two bits masked to 0
+  FIXUP_LANAI_25,   // 25-bit branch targets
+  FIXUP_LANAI_32,   // general 32-bit relocation
+  FIXUP_LANAI_HI16, // upper 16-bits of a symbolic relocation
+  FIXUP_LANAI_LO16, // lower 16-bits of a symbolic relocation
+
+  // Marker: one past the last Lanai fixup, used to compute the count below.
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // namespace Lanai
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIFIXUPKINDS_H
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
new file mode 100644
index 000000000000..7e2705e67b6d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
@@ -0,0 +1,43 @@
+//===-- LanaiMCAsmInfo.cpp - Lanai asm properties -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the LanaiMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCAsmInfo.h"
+
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void LanaiMCAsmInfo::anchor() {} // intentionally empty out-of-line definition
+
+LanaiMCAsmInfo::LanaiMCAsmInfo(const Triple & /*TheTriple*/) {
+  // Big-endian target. The minimum instruction alignment is currently used
+  // only for address adjustment in dwarf generation.
+  IsLittleEndian = false;
+  MinInstAlignment = 4;
+
+  // Assembly syntax: ".L" private labels, '!' comments (to correspond with the
+  // old toolchain) and ".weak" as the weak-reference directive.
+  PrivateGlobalPrefix = ".L";
+  CommentString = "!";
+  WeakRefDirective = "\t.weak\t";
+
+  // Lanai assembly requires ".section" before ".bss".
+  UsesELFSectionDirectiveForBSS = true;
+
+  // Use the integrated assembler instead of the system one.
+  UseIntegratedAssembler = true;
+
+  // Debug information is supported; exception handling uses DWARF CFI.
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
new file mode 100644
index 000000000000..3eef0592d2fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- LanaiMCAsmInfo.h - Lanai asm properties -----------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the LanaiMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class LanaiMCAsmInfo : public MCAsmInfoELF { // assembler properties used for Lanai
+  void anchor() override; // defined (empty) in LanaiMCAsmInfo.cpp
+
+public:
+  explicit LanaiMCAsmInfo(const Triple &TheTriple); // the definition does not read the triple
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
new file mode 100644
index 000000000000..ce68b7e24dba
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -0,0 +1,309 @@
+//===-- LanaiMCCodeEmitter.cpp - Convert Lanai code to machine code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LanaiMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiFixupKinds.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace llvm {
+namespace {
+class LanaiMCCodeEmitter : public MCCodeEmitter {
+  LanaiMCCodeEmitter(const LanaiMCCodeEmitter &) = delete; // not copyable
+  void operator=(const LanaiMCCodeEmitter &) = delete;     // not assignable
+  const MCInstrInfo &InstrInfo;
+  MCContext &Context;
+
+public:
+  LanaiMCCodeEmitter(const MCInstrInfo &MCII, MCContext &C)
+      : InstrInfo(MCII), Context(C) {}
+
+  ~LanaiMCCodeEmitter() override {}
+
+  // The functions below are called by TableGen generated functions for getting
+  // the binary encoding of instructions/operands.
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &Inst,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &SubtargetInfo) const;
+
+  // getMachineOpValue - Return binary encoding of operand. If the machine
+  // operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &Inst, const MCOperand &MCOp,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &SubtargetInfo) const;
+  // Encodes the register + 16-bit immediate memory operand starting at OpNo.
+  unsigned getRiMemoryOpValue(const MCInst &Inst, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &SubtargetInfo) const;
+  // Encodes the register + register memory operand starting at OpNo.
+  unsigned getRrMemoryOpValue(const MCInst &Inst, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &SubtargetInfo) const;
+  // Encodes the register + 10-bit immediate memory operand starting at OpNo.
+  unsigned getSplsOpValue(const MCInst &Inst, unsigned OpNo,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &SubtargetInfo) const;
+  // Encodes a branch target; expression targets record a FIXUP_LANAI_25.
+  unsigned getBranchTargetOpValue(const MCInst &Inst, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &SubtargetInfo) const;
+  // Writes the encoding of Inst to Ostream as four bytes, big-endian.
+  void encodeInstruction(const MCInst &Inst, raw_ostream &Ostream,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &SubtargetInfo) const override;
+  // Recomputes the P and Q bits of Value, located at bits 17 and 16.
+  unsigned adjustPqBitsRmAndRrm(const MCInst &Inst, unsigned Value,
+                                const MCSubtargetInfo &STI) const;
+  // Recomputes the P and Q bits of Value, located at bits 11 and 10.
+  unsigned adjustPqBitsSpls(const MCInst &Inst, unsigned Value,
+                            const MCSubtargetInfo &STI) const;
+};
+
+Lanai::Fixups FixupKind(const MCExpr *Expr) {
+  if (isa<MCSymbolRefExpr>(Expr))
+    return Lanai::FIXUP_LANAI_21; // plain symbol reference
+  const LanaiMCExpr *TargetExpr = dyn_cast<LanaiMCExpr>(Expr);
+  if (!TargetExpr)
+    return Lanai::Fixups(0); // neither a symbol reference nor a Lanai expr
+  switch (TargetExpr->getKind()) {
+  case LanaiMCExpr::VK_Lanai_ABS_HI:
+    return Lanai::FIXUP_LANAI_HI16;
+  case LanaiMCExpr::VK_Lanai_ABS_LO:
+    return Lanai::FIXUP_LANAI_LO16;
+  case LanaiMCExpr::VK_Lanai_None:
+    return Lanai::FIXUP_LANAI_21;
+  }
+  return Lanai::Fixups(0);
+}
+
+// getMachineOpValue - Encode a single operand. Registers and immediates are
+// returned directly; an expression operand is recorded as a fixup at offset 0
+// and encoded as zero.
+unsigned LanaiMCCodeEmitter::getMachineOpValue(
+    const MCInst &Inst, const MCOperand &MCOp, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  if (MCOp.isReg())
+    return getLanaiRegisterNumbering(MCOp.getReg());
+  if (MCOp.isImm())
+    return static_cast<unsigned>(MCOp.getImm());
+
+  // Anything that is neither a register nor an immediate must be an expression.
+  assert(MCOp.isExpr());
+
+  // The fixup kind is chosen from the left-hand side when the operand is a
+  // binary expression, and from the expression itself otherwise.
+  const MCExpr *Sym = MCOp.getExpr();
+  if (Sym->getKind() == MCExpr::Binary)
+    Sym = static_cast<const MCBinaryExpr *>(Sym)->getLHS();
+  assert(isa<LanaiMCExpr>(Sym) || Sym->getKind() == MCExpr::SymbolRef);
+
+  // The fixup itself carries the complete, original operand expression.
+  const MCFixupKind Kind = MCFixupKind(FixupKind(Sym));
+  Fixups.push_back(MCFixup::create(0, MCOp.getExpr(), Kind));
+  return 0;
+}
+
+// Helper: recompute the P and Q bits of Value at the given bit positions.
+unsigned adjustPqBits(const MCInst &Inst, unsigned Value, unsigned PBitShift,
+                      unsigned QBitShift) {
+  const MCOperand AluOp = Inst.getOperand(3); // operand 3 holds the ALU code
+  unsigned AluCode = AluOp.getImm();
+
+  // Set the P bit iff this is not a post-op and operand 2 is a nonzero
+  // immediate, a register other than R0, or an expression.
+  const MCOperand Op2 = Inst.getOperand(2);
+  Value &= ~(1 << PBitShift); // clear first so a stale P bit cannot survive
+  if (!LPAC::isPostOp(AluCode) &&
+      ((Op2.isImm() && Op2.getImm() != 0) ||
+       (Op2.isReg() && Op2.getReg() != Lanai::R0) || (Op2.isExpr())))
+    Value |= (1 << PBitShift);
+
+  // Set Q iff pre-/post-op and operand 2 is a nonzero imm or a non-R0 register.
+  assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() &&
+         "Expected register operand.");
+  Value &= ~(1 << QBitShift); // clear first, as for the P bit
+  if (LPAC::modifiesOp(AluCode) && ((Op2.isImm() && Op2.getImm() != 0) ||
+                                    (Op2.isReg() && Op2.getReg() != Lanai::R0)))
+    Value |= (1 << QBitShift);
+
+  return Value;
+}
+
+unsigned
+LanaiMCCodeEmitter::adjustPqBitsRmAndRrm(const MCInst &Inst, unsigned Value,
+                                         const MCSubtargetInfo &STI) const {
+  return adjustPqBits(Inst, Value, 17, 16); // P at bit 17, Q at bit 16; STI is unused
+}
+
+unsigned
+LanaiMCCodeEmitter::adjustPqBitsSpls(const MCInst &Inst, unsigned Value,
+                                     const MCSubtargetInfo &STI) const {
+  return adjustPqBits(Inst, Value, 11, 10); // P at bit 11, Q at bit 10; STI is unused
+}
+
+void LanaiMCCodeEmitter::encodeInstruction(
+    const MCInst &Inst, raw_ostream &Ostream, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  // Obtain the encoding; only its low bits (an unsigned) are kept.
+  const unsigned Word = getBinaryCodeForInstr(Inst, Fixups, SubtargetInfo);
+  ++MCNumEmitted; // statistic: number of emitted instructions
+
+  // Write four bytes, most significant first (big-endian).
+  for (int Shift = 24; Shift >= 0; Shift -= 8)
+    Ostream << static_cast<char>((Word >> Shift) & 0xff);
+}
+
+// Encode a Lanai register+immediate memory operand (operands OpNo..OpNo+2).
+unsigned LanaiMCCodeEmitter::getRiMemoryOpValue(
+    const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  unsigned Encoding;
+  const MCOperand Op1 = Inst.getOperand(OpNo + 0);   // register
+  const MCOperand Op2 = Inst.getOperand(OpNo + 1);   // immediate or expression
+  const MCOperand AluOp = Inst.getOperand(OpNo + 2); // ALU code immediate
+
+  assert(Op1.isReg() && "First operand is not register.");
+  assert((Op2.isImm() || Op2.isExpr()) &&
+         "Second operand is neither an immediate nor an expression.");
+  assert((LPAC::getAluOp(AluOp.getImm()) == LPAC::ADD) &&
+         "Register immediate only supports addition operator");
+
+  Encoding = (getLanaiRegisterNumbering(Op1.getReg()) << 18); // register number starts at bit 18
+  if (Op2.isImm()) {
+    assert(isInt<16>(Op2.getImm()) &&
+           "Constant value truncated (limited to 16-bit)");
+
+    Encoding |= (Op2.getImm() & 0xffff); // immediate occupies the low 16 bits
+    if (Op2.getImm() != 0) { // flag bits are only set for a nonzero immediate
+      if (LPAC::isPreOp(AluOp.getImm()))
+        Encoding |= (0x3 << 16); // pre-op: bits 17 and 16
+      if (LPAC::isPostOp(AluOp.getImm()))
+        Encoding |= (0x1 << 16); // post-op: bit 16 only
+    }
+  } else
+    getMachineOpValue(Inst, Op2, Fixups, SubtargetInfo); // records the fixup; the returned zero is not needed
+
+  return Encoding;
+}
+
+unsigned LanaiMCCodeEmitter::getRrMemoryOpValue( // encodes operands OpNo..OpNo+2
+    const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  unsigned Encoding;
+  const MCOperand Op1 = Inst.getOperand(OpNo + 0);     // first register
+  const MCOperand Op2 = Inst.getOperand(OpNo + 1);     // second register
+  const MCOperand AluMCOp = Inst.getOperand(OpNo + 2); // ALU code immediate
+
+  assert(Op1.isReg() && "First operand is not register.");
+  Encoding = (getLanaiRegisterNumbering(Op1.getReg()) << 15); // first register starts at bit 15
+  assert(Op2.isReg() && "Second operand is not register.");
+  Encoding |= (getLanaiRegisterNumbering(Op2.getReg()) << 10); // second register starts at bit 10
+
+  assert(AluMCOp.isImm() && "Third operator is not immediate.");
+  // Set BBB (encoded ALU code, shifted to start at bit 5)
+  unsigned AluOp = AluMCOp.getImm();
+  Encoding |= LPAC::encodeLanaiAluCode(AluOp) << 5;
+  // Set P and Q (pre-op sets bits 9 and 8, post-op sets bit 8 only)
+  if (LPAC::isPreOp(AluOp))
+    Encoding |= (0x3 << 8);
+  if (LPAC::isPostOp(AluOp))
+    Encoding |= (0x1 << 8);
+  // Set JJJJ (only the shift operations contribute extra low-order bits)
+  switch (LPAC::getAluOp(AluOp)) {
+  case LPAC::SHL:
+  case LPAC::SRL:
+    Encoding |= 0x10;
+    break;
+  case LPAC::SRA:
+    Encoding |= 0x18;
+    break;
+  default:
+    break;
+  }
+
+  return Encoding;
+}
+
+unsigned
+LanaiMCCodeEmitter::getSplsOpValue(const MCInst &Inst, unsigned OpNo,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &SubtargetInfo) const {
+  unsigned Encoding;
+  const MCOperand Op1 = Inst.getOperand(OpNo + 0);   // register
+  const MCOperand Op2 = Inst.getOperand(OpNo + 1);   // immediate or expression
+  const MCOperand AluOp = Inst.getOperand(OpNo + 2); // ALU code immediate
+
+  assert(Op1.isReg() && "First operand is not register.");
+  assert((Op2.isImm() || Op2.isExpr()) &&
+         "Second operand is neither an immediate nor an expression.");
+  assert((LPAC::getAluOp(AluOp.getImm()) == LPAC::ADD) &&
+         "Register immediate only supports addition operator");
+
+  Encoding = (getLanaiRegisterNumbering(Op1.getReg()) << 12); // register number starts at bit 12
+  if (Op2.isImm()) {
+    assert(isInt<10>(Op2.getImm()) &&
+           "Constant value truncated (limited to 10-bit)");
+
+    Encoding |= (Op2.getImm() & 0x3ff); // immediate occupies the low 10 bits
+    if (Op2.getImm() != 0) { // flag bits are only set for a nonzero immediate
+      if (LPAC::isPreOp(AluOp.getImm()))
+        Encoding |= (0x3 << 10); // pre-op: bits 11 and 10
+      if (LPAC::isPostOp(AluOp.getImm()))
+        Encoding |= (0x1 << 10); // post-op: bit 10 only
+    }
+  } else
+    getMachineOpValue(Inst, Op2, Fixups, SubtargetInfo); // records the fixup; the returned zero is not needed
+
+  return Encoding;
+}
+
+unsigned LanaiMCCodeEmitter::getBranchTargetOpValue(
+    const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  const MCOperand &Target = Inst.getOperand(OpNo);
+  // Expression targets are encoded as zero and recorded as a 25-bit fixup.
+  if (!Target.isReg() && !Target.isImm()) {
+    const MCFixupKind Kind = static_cast<MCFixupKind>(Lanai::FIXUP_LANAI_25);
+    Fixups.push_back(MCFixup::create(0, Target.getExpr(), Kind));
+    return 0;
+  }
+  return getMachineOpValue(Inst, Target, Fixups, SubtargetInfo);
+}
+
+#include "LanaiGenMCCodeEmitter.inc"
+} // namespace
+} // namespace llvm
+
+llvm::MCCodeEmitter *
+llvm::createLanaiMCCodeEmitter(const MCInstrInfo &InstrInfo,
+                               const MCRegisterInfo & /*MRI*/,
+                               MCContext &context) {
+  return new LanaiMCCodeEmitter(InstrInfo, context); // heap-allocated; returned as a raw pointer
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
new file mode 100644
index 000000000000..201c95de07f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
@@ -0,0 +1,60 @@
+//===-- LanaiMCExpr.cpp - Lanai specific MC expression classes ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lanaimcexpr"
+
+const LanaiMCExpr *LanaiMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+                                       MCContext &Ctx) {
+  return new (Ctx) LanaiMCExpr(Kind, Expr); // allocated via the operator new that takes the MCContext
+}
+
+// VK_Lanai_None prints as the bare sub-expression, others as hi(..)/lo(..).
+void LanaiMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  const MCExpr *Sub = getSubExpr();
+  if (getKind() == VK_Lanai_None) {
+    Sub->print(OS, MAI);
+    return;
+  }
+  const char *Modifier = nullptr;
+  switch (getKind()) {
+  default:
+    llvm_unreachable("Invalid kind!");
+  case VK_Lanai_ABS_HI:
+    Modifier = "hi";
+    break;
+  case VK_Lanai_ABS_LO:
+    Modifier = "lo";
+    break;
+  }
+  OS << Modifier << '(';
+  Sub->print(OS, MAI);
+  OS << ')';
+}
+
+void LanaiMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr()); // forward the visit to the wrapped sub-expression
+}
+
+bool LanaiMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                            const MCAsmLayout *Layout,
+                                            const MCFixup *Fixup) const {
+  // Evaluate the wrapped expression first; on success rebuild the result with
+  // the same symbols and constant plus this expression's variant kind.
+  const bool Ok = getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
+  if (Ok)
+    Res = MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(),
+                       getKind());
+  return Ok;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
new file mode 100644
index 000000000000..5004d541ff70
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
@@ -0,0 +1,56 @@
+//===-- LanaiMCExpr.h - Lanai specific MC expression classes ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class LanaiMCExpr : public MCTargetExpr { // a sub-expression tagged with a Lanai variant kind
+public:
+  enum VariantKind { VK_Lanai_None, VK_Lanai_ABS_HI, VK_Lanai_ABS_LO }; // printed as plain, hi(..), lo(..)
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr; // the wrapped sub-expression
+
+  explicit LanaiMCExpr(VariantKind Kind, const MCExpr *Expr) // private: instances come from create()
+      : Kind(Kind), Expr(Expr) {}
+
+public:
+  static const LanaiMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+                                   MCContext &Ctx);
+
+  // Returns the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  // Returns the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  MCFragment *findAssociatedFragment() const override { // delegates to the sub-expression
+    return getSubExpr()->findAssociatedFragment();
+  }
+
+  // There are no TLS LanaiMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler & /*Asm*/) const override {}
+
+  static bool classof(const MCExpr *E) { // true for any expression whose kind is MCExpr::Target
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
new file mode 100644
index 000000000000..c2f8c0f7ad50
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -0,0 +1,154 @@
+//===-- LanaiMCTargetDesc.cpp - Lanai Target Descriptions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Lanai specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCTargetDesc.h"
+
+#include "InstPrinter/LanaiInstPrinter.h"
+#include "LanaiMCAsmInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "LanaiGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "LanaiGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "LanaiGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createLanaiMCInstrInfo() {
+  MCInstrInfo *InstrInfo = new MCInstrInfo();
+  InitLanaiMCInstrInfo(InstrInfo); // fill in the Lanai instruction data
+  return InstrInfo;
+}
+
+static MCRegisterInfo *createLanaiMCRegisterInfo(const Triple & /*TT*/) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  InitLanaiMCRegisterInfo(RegInfo, Lanai::RCA, 0, 0, Lanai::PC); // the triple plays no part
+  return RegInfo;
+}
+
+static MCSubtargetInfo *
+createLanaiMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  // An empty CPU name falls back to "generic".
+  std::string SelectedCPU = CPU;
+  if (SelectedCPU.empty())
+    SelectedCPU = "generic";
+  return createLanaiMCSubtargetInfoImpl(TT, SelectedCPU, FS);
+}
+
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+                                    MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter, bool RelaxAll) {
+  // Only ELF output is handled; any other object format is a hard error.
+  if (T.isOSBinFormatELF())
+    return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+  llvm_unreachable("OS not supported");
+}
+
+// Returns a new LanaiInstPrinter for syntax variant 0, the only variant
+// handled here; any other variant yields a null pointer.
+static MCInstPrinter *createLanaiMCInstPrinter(const Triple & /*T*/,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI) {
+  return SyntaxVariant == 0 ? new LanaiInstPrinter(MAI, MII, MRI) : nullptr;
+}
+
+static MCRelocationInfo *createLanaiElfRelocation(const Triple &TheTriple,
+                                                  MCContext &Ctx) {
+  return createMCRelocationInfo(TheTriple, Ctx); // no Lanai-specific subclass; arguments are passed through
+}
+
+namespace {
+class LanaiMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  explicit LanaiMCInstrAnalysis(const MCInstrInfo *Info)
+      : MCInstrAnalysis(Info) {}
+
+  // Computes a branch destination from operand 0 and stores it in Target.
+  // Returns false when none can be determined: there are no operands,
+  // operand 0 is not an immediate, or a non-PC-relative immediate is 0.
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+                      uint64_t &Target) const override {
+    // getImm() is only valid on an immediate operand, so reject everything
+    // else (e.g. a register or an unresolved expression) up front.
+    if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm())
+      return false;
+    const int64_t Imm = Inst.getOperand(0).getImm();
+    if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType ==
+        MCOI::OPERAND_PCREL) {
+      Target = Addr + Size + Imm; // relative to the end of this instruction
+      return true;
+    }
+    // Skip case where immediate is 0 as that occurs in file that isn't linked
+    // and the branch target inferred would be wrong.
+    if (Imm == 0)
+      return false;
+    Target = Imm;
+    return true;
+  }
+};
+} // end anonymous namespace
+
+static MCInstrAnalysis *createLanaiInstrAnalysis(const MCInstrInfo *Info) {
+  return new LanaiMCInstrAnalysis(Info); // heap-allocated; returned as a raw pointer
+}
+
+extern "C" void LLVMInitializeLanaiTargetMC() {
+  // Every registration below is made against the single Lanai target object.
+  Target &LanaiTarget = getTheLanaiTarget();
+
+  // Register the MC asm info.
+  RegisterMCAsmInfo<LanaiMCAsmInfo> X(LanaiTarget);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(LanaiTarget, createLanaiMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(LanaiTarget, createLanaiMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(LanaiTarget,
+                                          createLanaiMCSubtargetInfo);
+
+  // Register the MC code emitter
+  TargetRegistry::RegisterMCCodeEmitter(LanaiTarget,
+                                        llvm::createLanaiMCCodeEmitter);
+
+  // Register the ASM Backend
+  TargetRegistry::RegisterMCAsmBackend(LanaiTarget, createLanaiAsmBackend);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(LanaiTarget, createLanaiMCInstPrinter);
+
+  // Register the ELF streamer.
+  TargetRegistry::RegisterELFStreamer(LanaiTarget, createMCStreamer);
+
+  // Register the MC relocation info.
+  TargetRegistry::RegisterMCRelocationInfo(LanaiTarget,
+                                           createLanaiElfRelocation);
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(LanaiTarget,
+                                          createLanaiInstrAnalysis);
+  // Nothing else is registered by this function.
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
new file mode 100644
index 000000000000..8adaf4cea420
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
@@ -0,0 +1,61 @@
+//===-- LanaiMCTargetDesc.h - Lanai Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Lanai specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCTARGETDESC_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCTARGETDESC_H
+
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCInstrAnalysis;
+class MCObjectWriter;
+class MCRelocationInfo;
+class MCSubtargetInfo;
+class Target;
+class Triple;
+class StringRef;
+class raw_pwrite_stream;
+
+Target &getTheLanaiTarget(); // defined in TargetInfo/LanaiTargetInfo.cpp
+
+MCCodeEmitter *createLanaiMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx); // defined in LanaiMCCodeEmitter.cpp; MRI is unused there
+
+MCAsmBackend *createLanaiAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    const Triple &TheTriple, StringRef CPU,
+                                    const MCTargetOptions &Options); // NOTE(review): presumably defined in LanaiAsmBackend.cpp - confirm
+
+MCObjectWriter *createLanaiELFObjectWriter(raw_pwrite_stream &OS,
+                                           uint8_t OSABI); // creates the big-endian Lanai ELF object writer
+} // namespace llvm
+
+// Defines symbolic names for Lanai registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "LanaiGenRegisterInfo.inc"
+
+// Defines symbolic names for the Lanai instructions.
+#define GET_INSTRINFO_ENUM
+#include "LanaiGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "LanaiGenSubtargetInfo.inc"
+
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCTARGETDESC_H
diff --git a/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
new file mode 100644
index 000000000000..e377db1d49da
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
@@ -0,0 +1,25 @@
+//===-- LanaiTargetInfo.cpp - Lanai Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace llvm {
+Target &getTheLanaiTarget() {
+  static Target TheLanaiTarget; // function-local static: every call returns the same object
+  return TheLanaiTarget;
+}
+} // namespace llvm
+
+extern "C" void LLVMInitializeLanaiTargetInfo() {
+  RegisterTarget<Triple::lanai> X(getTheLanaiTarget(), "lanai", "Lanai"); // registers the target for Triple::lanai as "lanai" / "Lanai"
+}
diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
new file mode 100644
index 000000000000..be6d1a84a377
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -0,0 +1,116 @@
+//===-- MSP430InstPrinter.cpp - Convert MSP430 MCInst to assembly syntax --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430InstPrinter.h"
+#include "MSP430.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+
+// Include the auto-generated portion of the assembly writer.
+#include "MSP430GenAsmWriter.inc"
+
+void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                  StringRef Annot, const MCSubtargetInfo &STI) {
+  printInstruction(MI, O); // the instruction text itself; STI is not used
+  printAnnotation(O, Annot); // then the annotation, if any
+}
+
+void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  const MCOperand &Operand = MI->getOperand(OpNo);
+  if (Operand.isImm()) { // immediates print as a bare number
+    O << Operand.getImm();
+    return;
+  }
+  assert(Operand.isExpr() && "unknown pcrel immediate operand");
+  Operand.getExpr()->print(O, &MAI);
+}
+
+void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O, const char *Modifier) {
+  assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Operand = MI->getOperand(OpNo);
+  if (Operand.isImm()) { // immediates and expressions carry a '#' prefix
+    O << '#' << Operand.getImm();
+  } else if (Operand.isReg()) { // registers print by name, without prefix
+    O << getRegisterName(Operand.getReg());
+  } else {
+    assert(Operand.isExpr() && "unknown operand kind in printOperand");
+    O << '#';
+    Operand.getExpr()->print(O, &MAI);
+  }
+}
+
+void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O,
+                                           const char *Modifier) {
+  // A memory operand is the (base register, displacement) operand pair.
+  const MCOperand &Base = MI->getOperand(OpNo);
+  const MCOperand &Disp = MI->getOperand(OpNo+1);
+  const bool HasBase = Base.getReg() != 0;
+
+  // The displacement is printed first. Without a base register it takes the
+  // '&' prefix; with a register base no prefix symbol may be emitted, e.g.
+  //   mov.w &foo, r1
+  // vs
+  //   mov.w glb(r1), r2
+  // Otherwise (!) msp430-as will silently miscompile the output :(
+  if (!HasBase)
+    O << '&';
+
+  if (Disp.isExpr()) {
+    Disp.getExpr()->print(O, &MAI);
+  } else {
+    assert(Disp.isImm() && "Expected immediate in displacement field");
+    O << Disp.getImm();
+  }
+
+  // The base register, when present, follows in parentheses.
+  if (HasBase)
+    O << '(' << getRegisterName(Base.getReg()) << ')';
+}
+
+void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const char *Suffix = nullptr; // text for the condition code, chosen below
+  switch (static_cast<unsigned>(MI->getOperand(OpNo).getImm())) {
+  case MSP430CC::COND_E:
+    Suffix = "eq";
+    break;
+  case MSP430CC::COND_NE:
+    Suffix = "ne";
+    break;
+  case MSP430CC::COND_HS:
+    Suffix = "hs";
+    break;
+  case MSP430CC::COND_LO:
+    Suffix = "lo";
+    break;
+  case MSP430CC::COND_GE:
+    Suffix = "ge";
+    break;
+  case MSP430CC::COND_L:
+    Suffix = "l";
+    break;
+  default:
+    llvm_unreachable("Unsupported CC code");
+  }
+  O << Suffix;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
new file mode 100644
index 000000000000..72afec18becb
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -0,0 +1,43 @@
+//= MSP430InstPrinter.h - Convert MSP430 MCInst to assembly syntax -*- C++ -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H
+#define LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+ class MSP430InstPrinter : public MCInstPrinter { // prints MSP430 MCInsts as assembly text
+ public:
+   MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI)
+     : MCInstPrinter(MAI, MII, MRI) {}
+
+   void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                  const MCSubtargetInfo &STI) override; // instruction followed by its annotation
+
+   // Autogenerated by tblgen (both declarations below).
+   void printInstruction(const MCInst *MI, raw_ostream &O);
+   static const char *getRegisterName(unsigned RegNo);
+
+   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                     const char *Modifier = nullptr); // register name, or '#' + immediate/expression
+   void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); // immediate or expression, no prefix
+   void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                           const char *Modifier = nullptr); // displacement, then "(base)" when a base register is set
+   void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); // condition code as text
+
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
new file mode 100644
index 000000000000..c26b3081dbc3
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -0,0 +1,26 @@
+//===-- MSP430MCAsmInfo.cpp - MSP430 asm properties -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MSP430MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCAsmInfo.h"
+using namespace llvm;
+
+// Out-of-line definition of anchor(); the body is intentionally empty.
+void MSP430MCAsmInfo::anchor() {}
+
+/// Fills in the MSP430 assembly properties. The triple \p TT is accepted but
+/// not consulted.
+MSP430MCAsmInfo::MSP430MCAsmInfo(const Triple &TT) {
+  // Pointers and callee-save stack slots are both two bytes wide.
+  CalleeSaveStackSlotSize = 2;
+  PointerSize = 2;
+
+  // Comments in the emitted assembly start with a semicolon.
+  CommentString = ";";
+
+  UsesELFSectionDirectiveForBSS = true;
+  AlignmentIsInBytes = false;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
new file mode 100644
index 000000000000..183dee36a047
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- MSP430MCAsmInfo.h - MSP430 asm properties --------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MSP430MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCASMINFO_H
+#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+// ELF-flavoured assembly properties for the MSP430 target.
+class MSP430MCAsmInfo : public MCAsmInfoELF {
+ // Overrides the base-class anchor(); defined out of line in the .cpp file.
+ void anchor() override;
+
+public:
+ // Explicit so that a Triple is never converted implicitly.
+ explicit MSP430MCAsmInfo(const Triple &TT);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
new file mode 100644
index 000000000000..8c715500f38b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -0,0 +1,79 @@
+//===-- MSP430MCTargetDesc.cpp - MSP430 Target Descriptions ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCTargetDesc.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "MSP430GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "MSP430GenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "MSP430GenRegisterInfo.inc"
+
+/// Heap-allocates an MCInstrInfo, initialises it with InitMSP430MCInstrInfo()
+/// and returns the raw pointer to the caller.
+static MCInstrInfo *createMSP430MCInstrInfo() {
+  auto *InstrInfo = new MCInstrInfo();
+  InitMSP430MCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+/// Heap-allocates an MCRegisterInfo and initialises it with
+/// InitMSP430MCRegisterInfo(), passing MSP430::PC as the second argument.
+/// The triple \p TT is not used.
+static MCRegisterInfo *createMSP430MCRegisterInfo(const Triple &TT) {
+  auto *RegInfo = new MCRegisterInfo();
+  InitMSP430MCRegisterInfo(RegInfo, MSP430::PC);
+  return RegInfo;
+}
+
+/// Thin forwarding wrapper: builds the subtarget info by passing \p TT,
+/// \p CPU and \p FS unchanged to createMSP430MCSubtargetInfoImpl().
+static MCSubtargetInfo *createMSP430MCSubtargetInfo(const Triple &TT,
+                                                    StringRef CPU,
+                                                    StringRef FS) {
+  MCSubtargetInfo *SubtargetInfo = createMSP430MCSubtargetInfoImpl(TT, CPU, FS);
+  return SubtargetInfo;
+}
+
+/// Creates the MSP430 instruction printer. Only syntax variant 0 is
+/// supported; any other \p SyntaxVariant yields nullptr. The triple \p T is
+/// not used.
+static MCInstPrinter *createMSP430MCInstPrinter(const Triple &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCInstrInfo &MII,
+                                                const MCRegisterInfo &MRI) {
+  if (SyntaxVariant != 0)
+    return nullptr;
+
+  return new MSP430InstPrinter(MAI, MII, MRI);
+}
+
+/// Registers the MSP430 MC-layer factories (asm info, instruction info,
+/// register info, subtarget info and instruction printer) with the target
+/// registry.
+extern "C" void LLVMInitializeMSP430TargetMC() {
+  Target &TheTarget = getTheMSP430Target();
+
+  // MC asm info.
+  RegisterMCAsmInfo<MSP430MCAsmInfo> X(TheTarget);
+
+  // MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheTarget, createMSP430MCInstrInfo);
+
+  // MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheTarget, createMSP430MCRegisterInfo);
+
+  // MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheTarget,
+                                          createMSP430MCSubtargetInfo);
+
+  // MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheTarget, createMSP430MCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
new file mode 100644
index 000000000000..b901c5f09794
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -0,0 +1,38 @@
+//===-- MSP430MCTargetDesc.h - MSP430 Target Descriptions -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H
+#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+// Forward declaration; this header only refers to Target by reference.
+class Target;
+
+// Returns a reference to the MSP430 Target object. Declared here and defined
+// elsewhere.
+Target &getTheMSP430Target();
+
+} // end namespace llvm
+
+// Defines symbolic names for MSP430 registers.
+// This defines a mapping from register name to register number.
+#define GET_REGINFO_ENUM
+#include "MSP430GenRegisterInfo.inc"
+
+// Defines symbolic names for the MSP430 instructions.
+#define GET_INSTRINFO_ENUM
+#include "MSP430GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "MSP430GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.h b/contrib/llvm/lib/Target/MSP430/MSP430.h
new file mode 100644
index 000000000000..796f25233123
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430.h
@@ -0,0 +1,47 @@
+//==-- MSP430.h - Top-level interface for MSP430 representation --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM MSP430 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430_H
+#define LLVM_LIB_TARGET_MSP430_MSP430_H
+
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace MSP430CC {
+ // MSP430 specific condition code.
+ // Every enumerator has an explicit value: the six real conditions are
+ // numbered 0-5 and COND_INVALID is the only negative value.
+ enum CondCodes {
+ COND_E = 0, // aka COND_Z
+ COND_NE = 1, // aka COND_NZ
+ COND_HS = 2, // aka COND_C
+ COND_LO = 3, // aka COND_NC
+ COND_GE = 4,
+ COND_L = 5,
+
+ COND_INVALID = -1
+ };
+}
+
+namespace llvm {
+ // Forward declarations used by the factory functions below.
+ class MSP430TargetMachine;
+ class FunctionPass;
+ class formatted_raw_ostream;
+
+ // Factory for the MSP430 instruction-selection pass, parameterised by the
+ // target machine and the optimisation level.
+ FunctionPass *createMSP430ISelDag(MSP430TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+ // Factory for the MSP430 branch selection pass.
+ FunctionPass *createMSP430BranchSelectionPass();
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.td b/contrib/llvm/lib/Target/MSP430/MSP430.td
new file mode 100644
index 000000000000..dfea669f3ba1
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430.td
@@ -0,0 +1,60 @@
+//===-- MSP430.td - Describe the MSP430 Target Machine -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the MSP430 target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+//===----------------------------------------------------------------------===//
+// Subtarget feature named "ext": sets the ExtendedInsts attribute to "true".
+def FeatureX
+ : SubtargetFeature<"ext", "ExtendedInsts", "true",
+ "Enable MSP430-X extensions">;
+
+//===----------------------------------------------------------------------===//
+// MSP430 supported processors.
+//===----------------------------------------------------------------------===//
+// Shorthand for a Processor that always uses NoItineraries.
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+// The only processor defined in this file: "generic", with no features.
+def : Proc<"generic", []>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430CallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrInfo.td"
+
+// InstrInfo record with default settings; referenced by the Target below.
+def MSP430InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+// Top-level target record; it only overrides the instruction set.
+def MSP430 : Target {
+ let InstructionSet = MSP430InstrInfo;
+}
+
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
new file mode 100644
index 000000000000..abf062fe86ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -0,0 +1,159 @@
+//===-- MSP430AsmPrinter.cpp - MSP430 LLVM assembly writer ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the MSP430 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MCInstLower.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+ // AsmPrinter subclass for MSP430; file-local (anonymous namespace).
+ class MSP430AsmPrinter : public AsmPrinter {
+ public:
+ // Takes ownership of Streamer and forwards it to the AsmPrinter base.
+ MSP430AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+ StringRef getPassName() const override { return "MSP430 Assembly Printer"; }
+
+ // Prints operand OpNum of MI to O; Modifier is optional.
+ void printOperand(const MachineInstr *MI, int OpNum,
+ raw_ostream &O, const char* Modifier = nullptr);
+ // Prints a memory operand made of the operands at OpNum (base) and
+ // OpNum+1 (displacement).
+ void printSrcMemOperand(const MachineInstr *MI, int OpNum,
+ raw_ostream &O);
+ // Inline-asm operand hooks; both return true when ExtraCode is non-empty
+ // (no modifiers are supported) and false after printing.
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) override;
+ // Lowers MI to an MCInst and emits it to the output streamer.
+ void EmitInstruction(const MachineInstr *MI) override;
+ };
+} // end of anonymous namespace
+
+
+/// Prints operand \p OpNum of \p MI to \p O.
+///
+/// \p Modifier is optional and recognises two strings:
+///   - "nohash": suppress the leading '#' on immediates and the leading
+///     '#'/'&' on global addresses;
+///   - "mem": print a global address with '&' instead of '#'.
+/// Only register, immediate, basic-block and global-address operands are
+/// handled; any other operand kind reaches llvm_unreachable.
+void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                    raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  default: llvm_unreachable("Not implemented yet!");
+  case MachineOperand::MO_Register:
+    O << MSP430InstPrinter::getRegisterName(MO.getReg());
+    return;
+  case MachineOperand::MO_Immediate:
+    if (!Modifier || strcmp(Modifier, "nohash"))
+      O << '#';
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    MO.getMBB()->getSymbol()->print(O, MAI);
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+    // Operand offsets are signed. Keep the value signed so that a negative
+    // displacement is printed as e.g. "(-2+sym)" instead of being
+    // reinterpreted as a huge unsigned 64-bit number.
+    int64_t Offset = MO.getOffset();
+
+    // If the global address expression is a part of displacement field with a
+    // register base, we should not emit any prefix symbol here, e.g.
+    //   mov.w &foo, r1
+    // vs
+    //   mov.w glb(r1), r2
+    // Otherwise (!) msp430-as will silently miscompile the output :(
+    if (!Modifier || strcmp(Modifier, "nohash"))
+      O << (isMemOp ? '&' : '#');
+    if (Offset)
+      O << '(' << Offset << '+';
+
+    getSymbol(MO.getGlobal())->print(O, MAI);
+
+    if (Offset)
+      O << ')';
+
+    return;
+  }
+  }
+}
+
+/// Prints the memory operand formed by operand \p OpNum (base register) and
+/// operand \p OpNum + 1 (displacement) as "disp(base)", or as "&disp" when
+/// the displacement is an immediate and there is no base register.
+void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum,
+                                          raw_ostream &O) {
+  const int DispIdx = OpNum + 1;
+  const MachineOperand &BaseOp = MI->getOperand(OpNum);
+  const MachineOperand &DispOp = MI->getOperand(DispIdx);
+
+  // The displacement goes first. An immediate without a base register is in
+  // fact a global address, so it gets the extra '&' modifier.
+  const bool IsAbsolute = DispOp.isImm() && !BaseOp.getReg();
+  if (IsAbsolute)
+    O << '&';
+  printOperand(MI, DispIdx, O, "nohash");
+
+  // No base register: nothing more to print.
+  if (!BaseOp.getReg())
+    return;
+
+  // Base register in parentheses.
+  O << '(';
+  printOperand(MI, OpNum, O);
+  O << ')';
+}
+
+/// PrintAsmOperand - Print operand \p OpNo of an inline asm expression.
+///
+/// No operand modifiers are supported: a non-empty \p ExtraCode makes the
+/// function return true without printing. Otherwise the operand is printed
+/// and false is returned.
+bool MSP430AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode, raw_ostream &O) {
+  const bool HasModifier = ExtraCode && ExtraCode[0];
+  if (!HasModifier) {
+    printOperand(MI, OpNo, O);
+    return false;
+  }
+
+  return true; // Unknown modifier.
+}
+
+/// Prints the memory operand starting at \p OpNo of an inline asm expression.
+/// A non-empty \p ExtraCode is rejected by returning true; otherwise the
+/// operand is printed through printSrcMemOperand() and false is returned.
+bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                             unsigned OpNo,
+                                             unsigned AsmVariant,
+                                             const char *ExtraCode,
+                                             raw_ostream &O) {
+  const bool HasModifier = ExtraCode && ExtraCode[0];
+  if (!HasModifier) {
+    printSrcMemOperand(MI, OpNo, O);
+    return false;
+  }
+
+  return true; // Unknown modifier.
+}
+
+//===----------------------------------------------------------------------===//
+/// Lowers \p MI to an MCInst with MSP430MCInstLower and hands the result to
+/// the output streamer.
+void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MSP430MCInstLower Lowering(OutContext, *this);
+
+  MCInst Lowered;
+  Lowering.Lower(MI, Lowered);
+
+  EmitToStreamer(*OutStreamer, Lowered);
+}
+
+// Force static initialization: constructing the RegisterAsmPrinter object
+// registers MSP430AsmPrinter for the MSP430 target.
+extern "C" void LLVMInitializeMSP430AsmPrinter() {
+  RegisterAsmPrinter<MSP430AsmPrinter> Registration(getTheMSP430Target());
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
new file mode 100644
index 000000000000..5fd6b6305f68
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -0,0 +1,257 @@
+//===-- MSP430BranchSelector.cpp - Emit long conditional branches ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 10 bits of displacement to reach their
+// target basic block. It does this in two passes; a calculation of basic block
+// positions pass, and a branch pseudo op to machine branch opcode pass. This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-branch-select"
+
+// Hidden command-line switch "msp430-branch-select" (default: true). When it
+// is false, the pass returns without changing the function.
+static cl::opt<bool>
+ BranchSelectEnabled("msp430-branch-select", cl::Hidden, cl::init(true),
+ cl::desc("Expand out of range branches"));
+
+// Pass statistics: incremented once per split block / expanded branch.
+STATISTIC(NumSplit, "Number of machine basic blocks split");
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace {
+/// Machine function pass that replaces out-of-range short branches with a
+/// long-branch sequence, splitting basic blocks where necessary.
+class MSP430BSel : public MachineFunctionPass {
+
+  /// Byte offset of every basic block, indexed by block number.
+  using OffsetVector = SmallVector<int, 16>;
+
+  // Both pointers are assigned at the start of runOnMachineFunction(). They
+  // start out null so that they never hold an indeterminate value.
+  MachineFunction *MF = nullptr;
+  const MSP430InstrInfo *TII = nullptr;
+
+  /// Measures the blocks from \p FromBB (or from the first block when null)
+  /// onwards, records their offsets and returns the resulting total size.
+  unsigned measureFunction(OffsetVector &BlockOffsets,
+                           MachineBasicBlock *FromBB = nullptr);
+
+  /// Performs one round of branch expansion; returns true on any change.
+  bool expandBranches(OffsetVector &BlockOffsets);
+
+public:
+  static char ID;
+  MSP430BSel() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  /// The pass requires that no virtual registers remain (NoVRegs).
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override { return "MSP430 Branch Selector"; }
+};
+char MSP430BSel::ID = 0;
+}
+
+/// Returns true when a branch displacement of \p DistanceInBytes fits the
+/// offset field of a branch instruction.
+static bool isInRage(int DistanceInBytes) {
+  // According to CC430 Family User's Guide, Section 4.5.1.3, branch
+  // instructions have the signed 10-bit word offset field. The byte distance
+  // is therefore converted to words before the 10-bit signed range check.
+  const int BytesPerWord = 2;
+
+  assert((DistanceInBytes % BytesPerWord == 0) &&
+         "Branch offset should be word aligned!");
+
+  return isInt<10>(DistanceInBytes / BytesPerWord);
+}
+
+/// Records in \p BlockOffsets (indexed by block number) the byte offset of
+/// every basic block from \p FromBB to the end of the function, or of all
+/// blocks when \p FromBB is null. Returns the offset just past the last
+/// measured instruction.
+unsigned MSP430BSel::measureFunction(OffsetVector &BlockOffsets,
+                                     MachineBasicBlock *FromBB) {
+  // Renumber first so that the block numbers are dense and in order.
+  MF->RenumberBlocks(FromBB);
+
+  MachineFunction::iterator First = MF->begin();
+  if (FromBB != nullptr)
+    First = FromBB->getIterator();
+
+  BlockOffsets.resize(MF->getNumBlockIDs());
+
+  // Measuring continues from the offset already stored for the first block.
+  unsigned Offset = BlockOffsets[First->getNumber()];
+  for (MachineFunction::iterator BB = First, End = MF->end(); BB != End;
+       ++BB) {
+    BlockOffsets[BB->getNumber()] = Offset;
+    for (const MachineInstr &Instr : *BB)
+      Offset += TII->getInstSizeInBytes(Instr);
+  }
+
+  return Offset;
+}
+
+/// Do expand branches and split the basic blocks if necessary.
+/// Returns true if made any change.
+/// \p BlockOffsets holds the byte offset of each block, indexed by block
+/// number; it is updated in place when a branch grows or a block is split.
+/// A block split ends the current round immediately (returning true).
+bool MSP430BSel::expandBranches(OffsetVector &BlockOffsets) {
+ // For each conditional branch, if the offset to its destination is larger
+ // than the offset field allows, transform it into a long branch sequence
+ // like this:
+ // short branch:
+ // bCC MBB
+ // long branch:
+ // b!CC $PC+6
+ // b MBB
+ //
+ bool MadeChange = false;
+ for (auto MBB = MF->begin(), E = MF->end(); MBB != E; ++MBB) {
+ // Despite its name, this is the running size in bytes of the instructions
+ // of MBB visited so far, i.e. the offset of the end of MI within MBB.
+ unsigned MBBStartOffset = 0;
+ for (auto MI = MBB->begin(), EE = MBB->end(); MI != EE; ++MI) {
+ MBBStartOffset += TII->getInstSizeInBytes(*MI);
+
+ // If this instruction is not a short branch then skip it.
+ if (MI->getOpcode() != MSP430::JCC && MI->getOpcode() != MSP430::JMP) {
+ continue;
+ }
+
+ MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+ // Determine the distance from the current branch to the destination
+ // block. MBBStartOffset already includes the size of the current branch
+ // instruction.
+ int BlockDistance =
+ BlockOffsets[DestBB->getNumber()] - BlockOffsets[MBB->getNumber()];
+ int BranchDistance = BlockDistance - MBBStartOffset;
+
+ // If this branch is in range, ignore it.
+ if (isInRage(BranchDistance)) {
+ continue;
+ }
+
+ DEBUG(dbgs() << " Found a branch that needs expanding, BB#"
+ << DestBB->getNumber() << ", Distance " << BranchDistance
+ << "\n");
+
+ // If JCC is not the last instruction we need to split the MBB.
+ if (MI->getOpcode() == MSP430::JCC && std::next(MI) != EE) {
+
+ DEBUG(dbgs() << " Found a basic block that needs to be split, BB#"
+ << MBB->getNumber() << "\n");
+
+ // Create a new basic block.
+ MachineBasicBlock *NewBB =
+ MF->CreateMachineBasicBlock(MBB->getBasicBlock());
+ MF->insert(std::next(MBB), NewBB);
+
+ // Splice the instructions following MI over to the NewBB.
+ NewBB->splice(NewBB->end(), &*MBB, std::next(MI), MBB->end());
+
+ // Update the successor lists.
+ // NOTE(review): MBB's successor list is modified by replaceSuccessor()
+ // while it is being iterated here - confirm this stays valid when MBB
+ // has more than one successor other than DestBB.
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ == DestBB) {
+ continue;
+ }
+ MBB->replaceSuccessor(Succ, NewBB);
+ NewBB->addSuccessor(Succ);
+ }
+
+ // We introduced a new MBB so all following blocks should be numbered
+ // and measured again.
+ measureFunction(BlockOffsets, &*MBB);
+
+ ++NumSplit;
+
+ // It may be not necessary to start all over at this point, but it's
+ // safer do this anyway.
+ return true;
+ }
+
+ MachineInstr &OldBranch = *MI;
+ DebugLoc dl = OldBranch.getDebugLoc();
+ // Net size change of this rewrite: starts at minus the old branch size;
+ // the size of each replacement instruction is added below.
+ int InstrSizeDiff = -TII->getInstSizeInBytes(OldBranch);
+
+ if (MI->getOpcode() == MSP430::JCC) {
+ MachineBasicBlock *NextMBB = &*std::next(MBB);
+ assert(MBB->isSuccessor(NextMBB) &&
+ "This block must have a layout successor!");
+
+ // The BCC operands are:
+ // 0. Target MBB
+ // 1. MSP430 branch predicate
+ SmallVector<MachineOperand, 1> Cond;
+ Cond.push_back(MI->getOperand(1));
+
+ // Jump over the long branch on the opposite condition
+ TII->reverseBranchCondition(Cond);
+ MI = BuildMI(*MBB, MI, dl, TII->get(MSP430::JCC))
+ .addMBB(NextMBB)
+ .addOperand(Cond[0]);
+ InstrSizeDiff += TII->getInstSizeInBytes(*MI);
+ // Advance past the new JCC so the long branch is inserted after it.
+ ++MI;
+ }
+
+ // Unconditional branch to the real destination.
+ MI = BuildMI(*MBB, MI, dl, TII->get(MSP430::Bi)).addMBB(DestBB);
+ InstrSizeDiff += TII->getInstSizeInBytes(*MI);
+
+ // Remove the old branch from the function.
+ OldBranch.eraseFromParent();
+
+ // The size of a new instruction is different from the old one, so we need
+ // to correct all block offsets.
+ for (int i = MBB->getNumber() + 1, e = BlockOffsets.size(); i < e; ++i) {
+ BlockOffsets[i] += InstrSizeDiff;
+ }
+ MBBStartOffset += InstrSizeDiff;
+
+ ++NumExpanded;
+ MadeChange = true;
+ }
+ }
+ return MadeChange;
+}
+
+/// Pass entry point. Caches the function and its instruction info, then
+/// expands out-of-range branches until a round makes no further change.
+/// Returns true when the function was modified.
+bool MSP430BSel::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  TII = static_cast<const MSP430InstrInfo *>(MF->getSubtarget().getInstrInfo());
+
+  // Nothing to do when the pass is switched off.
+  if (!BranchSelectEnabled)
+    return false;
+
+  DEBUG(dbgs() << "\n********** " << getPassName() << " **********\n");
+
+  // Distance from the beginning of the function to the beginning of each
+  // basic block.
+  OffsetVector BlockOffsets;
+  const unsigned FunctionSize = measureFunction(BlockOffsets);
+
+  // Common case: the whole function is smaller than the reach of a branch,
+  // so no branch in it can need expanding.
+  if (isInRage(FunctionSize))
+    return false;
+
+  // Keep expanding until a fixed point is reached.
+  bool Changed = false;
+  for (;;) {
+    if (!expandBranches(BlockOffsets))
+      break;
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+/// Factory for the Branch Selection Pass: returns a newly allocated
+/// MSP430BSel.
+FunctionPass *llvm::createMSP430BranchSelectionPass() {
+  FunctionPass *Pass = new MSP430BSel();
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
new file mode 100644
index 000000000000..b38f5781c84a
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
@@ -0,0 +1,37 @@
+//==- MSP430CallingConv.td - Calling Conventions for MSP430 -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for MSP430 architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MSP430 Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+// Return values are assigned to registers only; the list is tried in the
+// order given, per value type.
+def RetCC_MSP430 : CallingConv<[
+ // i8 are returned in registers R15B, R14B, R13B, R12B
+ CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
+
+ // i16 are returned in registers R15, R14, R13, R12
+ CCIfType<[i16], CCAssignToReg<[R15, R14, R13, R12]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+// Stack-only argument convention: no rule here assigns an argument to a
+// register.
+def CC_MSP430_AssignStack : CallingConv<[
+ // Pass by value if the byval attribute is given
+ CCIfByVal<CCPassByVal<2, 2>>,
+
+ // Promote i8 arguments to i16.
+ CCIfType<[i8], CCPromoteToType<i16>>,
+
+ // Integer values get stored in stack slots that are 2 bytes in
+ // size and 2-byte aligned.
+ CCIfType<[i16], CCAssignToStack<2, 2>>
+]>;
+
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
new file mode 100644
index 000000000000..f1cb0b6c031b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -0,0 +1,301 @@
+//===-- MSP430FrameLowering.cpp - MSP430 Frame Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430FrameLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430Subtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+/// Returns true when the function needs a frame pointer: frame pointer
+/// elimination is disabled for it, it has variable-sized stack objects, or
+/// its frame address is taken.
+bool MSP430FrameLowering::hasFP(const MachineFunction &MF) const {
+  if (MF.getTarget().Options.DisableFramePointerElim(MF))
+    return true;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken();
+}
+
+/// The call frame is reserved exactly when the function has no
+/// variable-sized stack objects.
+bool MSP430FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const bool HasVarSized = MF.getFrameInfo().hasVarSizedObjects();
+  return !HasVarSized;
+}
+
+/// Emits the function prologue into \p MBB, which must be the entry block.
+///
+/// With a frame pointer, FP is pushed and then set to SP, and FP is marked
+/// live-in for every block except the entry. After the callee-saved pushes,
+/// SP is decremented by the remaining frame size, if any.
+void MSP430FrameLowering::emitPrologue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+  const MSP430InstrInfo &TII =
+      *static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  MachineBasicBlock::iterator InsertPt = MBB.begin();
+  DebugLoc DL;
+  if (InsertPt != MBB.end())
+    DL = InsertPt->getDebugLoc();
+
+  // Number of bytes to allocate, taken from the frame info.
+  const uint64_t StackSize = MFI.getStackSize();
+
+  uint64_t NumBytes = 0;
+  if (!hasFP(MF)) {
+    NumBytes = StackSize - FuncInfo->getCalleeSavedFrameSize();
+  } else {
+    // Required stack adjustment. NOTE(review): the 2 bytes subtracted here
+    // are presumably the slot of the pushed FP - confirm.
+    const uint64_t FrameSize = StackSize - 2;
+    NumBytes = FrameSize - FuncInfo->getCalleeSavedFrameSize();
+
+    // Update the frame offset adjustment to account for the FP slot.
+    MFI.setOffsetAdjustment(-NumBytes);
+
+    // Save the old FP on the stack...
+    BuildMI(MBB, InsertPt, DL, TII.get(MSP430::PUSH16r))
+        .addReg(MSP430::FP, RegState::Kill);
+
+    // ...and make FP point at the new frame base.
+    BuildMI(MBB, InsertPt, DL, TII.get(MSP430::MOV16rr), MSP430::FP)
+        .addReg(MSP430::SP);
+
+    // FP is live-in in every block except the entry.
+    MachineFunction::iterator Block = std::next(MF.begin());
+    for (MachineFunction::iterator BlockEnd = MF.end(); Block != BlockEnd;
+         ++Block)
+      Block->addLiveIn(MSP430::FP);
+  }
+
+  // Step over the callee-saved register pushes.
+  for (; InsertPt != MBB.end(); ++InsertPt) {
+    if (InsertPt->getOpcode() != MSP430::PUSH16r)
+      break;
+  }
+
+  if (InsertPt != MBB.end())
+    DL = InsertPt->getDebugLoc();
+
+  // Nothing left to allocate. (Merging with adjacent SP updates is not
+  // implemented.)
+  if (!NumBytes)
+    return;
+
+  // Adjust the stack pointer: SP -= NumBytes.
+  MachineInstr *SubSP =
+      BuildMI(MBB, InsertPt, DL, TII.get(MSP430::SUB16ri), MSP430::SP)
+          .addReg(MSP430::SP)
+          .addImm(NumBytes);
+  // The SRW implicit def is dead.
+  SubSP->getOperand(3).setIsDead();
+}
+
+/// Emits the function epilogue into \p MBB, whose last non-debug instruction
+/// must be RET or RETI.
+///
+/// With a frame pointer, a pop of FP is inserted before the return. The stack
+/// pointer is then restored ahead of the callee-saved pops: from FP when the
+/// function has variable-sized objects, otherwise by adding the frame size.
+void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+  const MSP430InstrInfo &TII =
+      *static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  MachineBasicBlock::iterator InsertPt = MBB.getLastNonDebugInstr();
+  const unsigned RetOpcode = InsertPt->getOpcode();
+  DebugLoc DL = InsertPt->getDebugLoc();
+
+  // Only returning blocks may receive an epilogue.
+  if (RetOpcode != MSP430::RET && RetOpcode != MSP430::RETI)
+    llvm_unreachable("Can only insert epilog into returning blocks");
+
+  // Frame size and callee-saved area size.
+  const uint64_t StackSize = MFI.getStackSize();
+  const unsigned CSSize = FuncInfo->getCalleeSavedFrameSize();
+  uint64_t NumBytes = 0;
+
+  if (!hasFP(MF)) {
+    NumBytes = StackSize - CSSize;
+  } else {
+    // Required stack adjustment, excluding the 2 bytes subtracted for FP.
+    const uint64_t FrameSize = StackSize - 2;
+    NumBytes = FrameSize - CSSize;
+
+    // pop FP.
+    BuildMI(MBB, InsertPt, DL, TII.get(MSP430::POP16r), MSP430::FP);
+  }
+
+  // Walk backwards over the callee-saved pops (and any terminators).
+  for (; InsertPt != MBB.begin(); --InsertPt) {
+    MachineBasicBlock::iterator Prev = std::prev(InsertPt);
+    const bool Skippable =
+        Prev->getOpcode() == MSP430::POP16r || Prev->isTerminator();
+    if (!Skippable)
+      break;
+  }
+
+  DL = InsertPt->getDebugLoc();
+
+  // (Merging with an adjacent SP update is not implemented.)
+
+  if (MFI.hasVarSizedObjects()) {
+    // SP cannot be recovered by a constant adjustment: reload it from FP.
+    BuildMI(MBB, InsertPt, DL, TII.get(MSP430::MOV16rr), MSP430::SP)
+        .addReg(MSP430::FP);
+    if (CSSize) {
+      MachineInstr *SubSP =
+          BuildMI(MBB, InsertPt, DL, TII.get(MSP430::SUB16ri), MSP430::SP)
+              .addReg(MSP430::SP)
+              .addImm(CSSize);
+      // The SRW implicit def is dead.
+      SubSP->getOperand(3).setIsDead();
+    }
+    return;
+  }
+
+  // Adjust the stack pointer back: SP += NumBytes.
+  if (NumBytes) {
+    MachineInstr *AddSP =
+        BuildMI(MBB, InsertPt, DL, TII.get(MSP430::ADD16ri), MSP430::SP)
+            .addReg(MSP430::SP)
+            .addImm(NumBytes);
+    // The SRW implicit def is dead.
+    AddSP->getOperand(3).setIsDead();
+  }
+}
+
+// FIXME: Can we eliminate these in favour of generic code?
+/// Pushes every register in \p CSI, last entry first, before \p MI and
+/// records the size of the callee-saved area (two bytes per register).
+/// Returns false without emitting anything when \p CSI is empty.
+bool
+MSP430FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+  FuncInfo->setCalleeSavedFrameSize(CSI.size() * 2);
+
+  for (auto It = CSI.rbegin(), End = CSI.rend(); It != End; ++It) {
+    const unsigned Reg = It->getReg();
+    // The callee-saved register is live-in and killed by its spill.
+    MBB.addLiveIn(Reg);
+    BuildMI(MBB, MI, DL, TII.get(MSP430::PUSH16r))
+        .addReg(Reg, RegState::Kill);
+  }
+
+  return true;
+}
+
+/// Emit one POP16r per entry of \p CSI, in list order, before \p MI.
+///
+/// \returns false without emitting anything when \p CSI is empty, true
+/// otherwise.
+bool
+MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                 MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &Fn = *MBB.getParent();
+  const TargetInstrInfo &InstrInfo = *Fn.getSubtarget().getInstrInfo();
+
+  // Borrow the debug location of the insertion point when there is one.
+  DebugLoc Loc;
+  if (MI != MBB.end())
+    Loc = MI->getDebugLoc();
+
+  for (const CalleeSavedInfo &Saved : CSI)
+    BuildMI(MBB, MI, Loc, InstrInfo.get(MSP430::POP16r), Saved.getReg());
+
+  return true;
+}
+
+/// Replace the call-frame setup/destroy pseudo instruction at \p I with an
+/// explicit SUB16ri/ADD16ri on SP where an adjustment is required, then erase
+/// the pseudo.
+///
+/// \returns the iterator produced by erasing \p I from \p MBB.
+MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ const MSP430InstrInfo &TII =
+ *static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ unsigned StackAlign = getStackAlignment();
+
+ if (!hasReservedCallFrame(MF)) {
+ // If the stack pointer can be changed after prologue, turn the
+ // adjcallstackup instruction into a 'sub SP, <amt>' and the
+ // adjcallstackdown instruction into 'add SP, <amt>'
+ // TODO: consider using push / pop instead of sub + store / add
+ MachineInstr &Old = *I;
+ uint64_t Amount = Old.getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
+
+ MachineInstr *New = nullptr;
+ if (Old.getOpcode() == TII.getCallFrameSetupOpcode()) {
+ // Frame setup: SP -= Amount.
+ New =
+ BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
+ .addReg(MSP430::SP)
+ .addImm(Amount);
+ } else {
+ assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode());
+ // factor out the amount the callee already popped.
+ uint64_t CalleeAmt = Old.getOperand(1).getImm();
+ Amount -= CalleeAmt;
+ // Frame destroy: SP += Amount, unless nothing is left to undo.
+ if (Amount)
+ New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri),
+ MSP430::SP)
+ .addReg(MSP430::SP)
+ .addImm(Amount);
+ }
+
+ if (New) {
+ // The SRW implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // Replace the pseudo instruction with a new instruction...
+ MBB.insert(I, New);
+ }
+ }
+ } else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back.
+ if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ MachineInstr &Old = *I;
+ MachineInstr *New =
+ BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
+ .addReg(MSP430::SP)
+ .addImm(CalleeAmt);
+ // The SRW implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ MBB.insert(I, New);
+ }
+ }
+
+ // The pseudo itself is always removed, whether or not a replacement was
+ // inserted in front of it.
+ return MBB.erase(I);
+}
+
+/// When the function uses a frame pointer, create the 2-byte fixed stack
+/// object (offset -4, immutable) in which the FP register is saved.
+void
+MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                         RegScavenger *) const {
+  // Nothing to reserve when no frame pointer is used.
+  if (!hasFP(MF))
+    return;
+
+  auto &FrameInfo = MF.getFrameInfo();
+  int FPSlot = FrameInfo.CreateFixedObject(2, -4, true);
+  (void)FPSlot;
+  assert(FPSlot == FrameInfo.getObjectIndexBegin() &&
+         "Slot for FP register must be last in order to be found!");
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h
new file mode 100644
index 000000000000..f77de18b4d16
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h
@@ -0,0 +1,54 @@
+//==- MSP430FrameLowering.h - Define frame lowering for MSP430 --*- C++ -*--==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 implementation of TargetFrameLowering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H
+#define LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H
+
+#include "MSP430.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+/// MSP430-specific frame lowering: prologue/epilogue emission, callee-saved
+/// register spilling and restoring, and call-frame pseudo elimination.
+class MSP430FrameLowering : public TargetFrameLowering {
+protected:
+
+public:
+ // The stack grows down. NOTE(review): the numeric arguments are presumably
+ // stack alignment 2, local-area offset -2 and transient alignment 2 --
+ // confirm against the TargetFrameLowering constructor.
+ explicit MSP430FrameLowering()
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2) {}
+
+ /// emitPrologue/emitEpilogue - These methods insert prolog and epilog code
+ /// into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ /// Replace the call-frame pseudo instruction at \p I with real code and
+ /// return an iterator into \p MBB.
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+ /// Save / reload the callee-saved registers described by \p CSI at \p MI.
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ /// Hook run before the frame layout is finalized; \p RS is optional.
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS = nullptr) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..6e481b68e038
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -0,0 +1,469 @@
+//===-- MSP430ISelDAGToDAG.cpp - A dag to dag inst selector for MSP430 ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-isel"
+
+namespace {
+  /// Scratch description of an MSP430 memory operand while it is being
+  /// matched: a base (register or frame index), a 16-bit constant
+  /// displacement, and at most one symbolic displacement (global, constant
+  /// pool entry, external symbol, jump table or block address).
+  struct MSP430ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    struct {            // This is really a union, discriminated by BaseType!
+      SDValue Reg;
+      int FrameIndex;
+    } Base;
+
+    int16_t Disp;
+    const GlobalValue *GV;
+    const Constant *CP;
+    const BlockAddress *BlockAddr;
+    const char *ES;
+    int JT;
+    unsigned Align;    // CP alignment.
+
+    MSP430ISelAddressMode()
+      : BaseType(RegBase), Disp(0), GV(nullptr), CP(nullptr),
+        BlockAddr(nullptr), ES(nullptr), JT(-1), Align(0) {
+    }
+
+    /// True when any symbolic displacement has already been recorded.
+    /// A block address counts as well: without that check a second symbol
+    /// could be matched on top of it and the block address would be silently
+    /// dropped when the operands are built.
+    bool hasSymbolicDisplacement() const {
+      return GV != nullptr || CP != nullptr || ES != nullptr || JT != -1 ||
+             BlockAddr != nullptr;
+    }
+
+    /// Print the current state to errs() for debugging.
+    void dump() {
+      errs() << "MSP430ISelAddressMode " << this << '\n';
+      if (BaseType == RegBase && Base.Reg.getNode() != nullptr) {
+        errs() << "Base.Reg ";
+        Base.Reg.getNode()->dump();
+      } else if (BaseType == FrameIndexBase) {
+        errs() << " Base.FrameIndex " << Base.FrameIndex << '\n';
+      }
+      errs() << " Disp " << Disp << '\n';
+      if (GV) {
+        errs() << "GV ";
+        GV->dump();
+      } else if (CP) {
+        errs() << " CP ";
+        CP->dump();
+        errs() << " Align" << Align << '\n';
+      } else if (ES) {
+        errs() << "ES ";
+        errs() << ES << '\n';
+      } else if (JT != -1)
+        errs() << " JT" << JT << " Align" << Align << '\n';
+    }
+  };
+}
+
+/// MSP430DAGToDAGISel - MSP430 specific code to select MSP430 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+ class MSP430DAGToDAGISel : public SelectionDAGISel {
+ public:
+ MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(TM, OptLevel) {}
+
+ StringRef getPassName() const override {
+ return "MSP430 DAG->DAG Pattern Instruction Selection";
+ }
+
+ // Address matching helpers. Each returns false when N was folded into AM
+ // and true when it could not be matched.
+ bool MatchAddress(SDValue N, MSP430ISelAddressMode &AM);
+ bool MatchWrapper(SDValue N, MSP430ISelAddressMode &AM);
+ bool MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM);
+
+ // Returns true on failure; on success appends base and displacement to
+ // OutOps and returns false.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+ // Include the pieces autogenerated from the target description.
+ #include "MSP430GenDAGISel.inc"
+
+ private:
+ void Select(SDNode *N) override;
+ // Custom selection of post-increment loads, standalone or folded into a
+ // binary operation. Both return true when the node was selected.
+ bool tryIndexedLoad(SDNode *Op);
+ bool tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2, unsigned Opc8,
+ unsigned Opc16);
+
+ // Returns true and fills Base/Disp when Addr matches an addressing mode.
+ bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Disp);
+ };
+} // end anonymous namespace
+
+/// createMSP430ISelDag - This pass converts a legalized DAG into a
+/// MSP430-specific DAG, ready for instruction scheduling.
+///
+/// Returns a newly allocated MSP430DAGToDAGISel built from \p TM and
+/// \p OptLevel.
+FunctionPass *llvm::createMSP430ISelDag(MSP430TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new MSP430DAGToDAGISel(TM, OptLevel);
+}
+
+
+/// MatchWrapper - Fold the symbol carried by an MSP430ISD::Wrapper node into
+/// the addressing mode \p AM. Returns true when the wrapper cannot be folded
+/// (a symbolic displacement is already present) and false once the symbol has
+/// been recorded.
+bool MSP430DAGToDAGISel::MatchWrapper(SDValue N, MSP430ISelAddressMode &AM) {
+  // Only one symbol fits into the displacement; refuse a second one.
+  if (AM.hasSymbolicDisplacement())
+    return true;
+
+  SDValue Wrapped = N.getOperand(0);
+
+  if (auto *Global = dyn_cast<GlobalAddressSDNode>(Wrapped)) {
+    AM.GV = Global->getGlobal();
+    AM.Disp += Global->getOffset();
+    //AM.SymbolFlags = G->getTargetFlags();
+    return false;
+  }
+
+  if (auto *PoolEntry = dyn_cast<ConstantPoolSDNode>(Wrapped)) {
+    AM.CP = PoolEntry->getConstVal();
+    AM.Align = PoolEntry->getAlignment();
+    AM.Disp += PoolEntry->getOffset();
+    //AM.SymbolFlags = CP->getTargetFlags();
+    return false;
+  }
+
+  if (auto *External = dyn_cast<ExternalSymbolSDNode>(Wrapped)) {
+    AM.ES = External->getSymbol();
+    //AM.SymbolFlags = S->getTargetFlags();
+    return false;
+  }
+
+  if (auto *Table = dyn_cast<JumpTableSDNode>(Wrapped)) {
+    AM.JT = Table->getIndex();
+    //AM.SymbolFlags = J->getTargetFlags();
+    return false;
+  }
+
+  // Anything else wrapped here must be a block address.
+  AM.BlockAddr = cast<BlockAddressSDNode>(Wrapped)->getBlockAddress();
+  //AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
+  return false;
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Install \p N as the base
+/// register of \p AM without recursing into it. Returns true when the base
+/// slot is already taken, false once \p N has been stored.
+bool MSP430DAGToDAGISel::MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM) {
+  const bool BaseIsFree = AM.BaseType == MSP430ISelAddressMode::RegBase &&
+                          !AM.Base.Reg.getNode();
+  // A frame index or an earlier register already occupies the base.
+  if (!BaseIsFree)
+    return true;
+
+  // Fall back to treating the whole node as a plain register base.
+  AM.BaseType = MSP430ISelAddressMode::RegBase;
+  AM.Base.Reg = N;
+  return false;
+}
+
+/// MatchAddress - Recursively fold \p N into the addressing mode \p AM.
+/// Returns false when \p N was absorbed (as displacement, symbol, frame index
+/// or base register) and true when it could not be matched. Cases that fail
+/// to fold fall through to MatchAddressBase, which uses \p N as the base.
+bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) {
+ DEBUG(errs() << "MatchAddress: "; AM.dump());
+
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ // Constants are accumulated into the 16-bit displacement.
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ AM.Disp += Val;
+ return false;
+ }
+
+ case MSP430ISD::Wrapper:
+ if (!MatchWrapper(N, AM))
+ return false;
+ break;
+
+ case ISD::FrameIndex:
+ // A frame index can only become the base while the base is still unused.
+ if (AM.BaseType == MSP430ISelAddressMode::RegBase
+ && AM.Base.Reg.getNode() == nullptr) {
+ AM.BaseType = MSP430ISelAddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::ADD: {
+ // Try both operand orders, restoring AM after each failed attempt because
+ // a partial match may already have modified it.
+ MSP430ISelAddressMode Backup = AM;
+ if (!MatchAddress(N.getNode()->getOperand(0), AM) &&
+ !MatchAddress(N.getNode()->getOperand(1), AM))
+ return false;
+ AM = Backup;
+ if (!MatchAddress(N.getNode()->getOperand(1), AM) &&
+ !MatchAddress(N.getNode()->getOperand(0), AM))
+ return false;
+ AM = Backup;
+
+ break;
+ }
+
+ case ISD::OR:
+ // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ MSP430ISelAddressMode Backup = AM;
+ uint64_t Offset = CN->getSExtValue();
+ // Start with the LHS as an addr mode.
+ if (!MatchAddress(N.getOperand(0), AM) &&
+ // Address could not have picked a GV address for the displacement.
+ AM.GV == nullptr &&
+ // Check to see if the LHS & C is zero.
+ CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+ AM.Disp += Offset;
+ return false;
+ }
+ AM = Backup;
+ }
+ break;
+ }
+
+ // Nothing could be folded: use the node itself as the base register.
+ return MatchAddressBase(N, AM);
+}
+
+/// SelectAddr - returns true if it is able to pattern match an addressing
+/// mode. It returns the operands which make up the maximal addressing mode it
+/// can match by reference.
+///
+/// \param N     the address expression to match.
+/// \param Base  receives the base: a target frame index, the matched register,
+///              or register 0 when no base was matched.
+/// \param Disp  receives the displacement: the matched symbol (with the
+///              accumulated constant offset where the node kind carries one)
+///              or a plain target constant.
+bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
+                                    SDValue &Base, SDValue &Disp) {
+  MSP430ISelAddressMode AM;
+
+  if (MatchAddress(N, AM))
+    return false;
+
+  EVT VT = N.getValueType();
+  if (AM.BaseType == MSP430ISelAddressMode::RegBase) {
+    // No base register matched: use register 0 as the "no base" placeholder.
+    if (!AM.Base.Reg.getNode())
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase)
+             ? CurDAG->getTargetFrameIndex(
+                   AM.Base.FrameIndex,
+                   getTargetLowering()->getPointerTy(CurDAG->getDataLayout()))
+             : AM.Base.Reg;
+
+  if (AM.GV)
+    Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(N),
+                                          MVT::i16, AM.Disp,
+                                          0/*AM.SymbolFlags*/);
+  else if (AM.CP)
+    Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i16,
+                                         AM.Align, AM.Disp, 0/*AM.SymbolFlags*/);
+  else if (AM.ES)
+    Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.JT != -1)
+    Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.BlockAddr)
+    // Carry the accumulated constant offset along with the block address
+    // instead of discarding it, and use the same i16 type as every other
+    // displacement kind.
+    Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i16, AM.Disp,
+                                         0/*AM.SymbolFlags*/);
+  else
+    Disp = CurDAG->getTargetConstant(AM.Disp, SDLoc(N), MVT::i16);
+
+  return true;
+}
+
+/// Select the operands for an inline-asm memory constraint. Only the generic
+/// 'm' constraint is handled. Returns true on failure; on success the base
+/// and displacement are appended to \p OutOps and false is returned.
+bool MSP430DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  // Any constraint other than plain "memory" is unsupported.
+  if (ConstraintID != InlineAsm::Constraint_m)
+    return true;
+
+  SDValue BaseOp, DispOp;
+  if (!SelectAddr(Op, BaseOp, DispOp))
+    return true;
+
+  OutOps.push_back(BaseOp);
+  OutOps.push_back(DispOp);
+  return false;
+}
+
+/// Return true when \p LD is a non-extending post-increment load of i8 or i16
+/// whose increment equals the size of the loaded value (1 or 2 bytes).
+static bool isValidIndexedLoad(const LoadSDNode *LD) {
+  if (LD->getAddressingMode() != ISD::POST_INC)
+    return false;
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  // The step the post-increment must use for the given memory type.
+  uint64_t ExpectedStep;
+  switch (LD->getMemoryVT().getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    ExpectedStep = 1;
+    break;
+  case MVT::i16:
+    ExpectedStep = 2;
+    break;
+  default:
+    return false;
+  }
+
+  // Sanity check
+  return cast<ConstantSDNode>(LD->getOffset())->getZExtValue() == ExpectedStep;
+}
+
+/// Select a valid post-increment load as MOV8rm_POST / MOV16rm_POST.
+/// Returns true when \p N was replaced, false when it must be selected by
+/// other means.
+bool MSP430DAGToDAGISel::tryIndexedLoad(SDNode *N) {
+  LoadSDNode *Load = cast<LoadSDNode>(N);
+  if (!isValidIndexedLoad(Load))
+    return false;
+
+  MVT MemVT = Load->getMemoryVT().getSimpleVT();
+
+  unsigned PostIncOpc = 0;
+  if (MemVT == MVT::i8)
+    PostIncOpc = MSP430::MOV8rm_POST;
+  else if (MemVT == MVT::i16)
+    PostIncOpc = MSP430::MOV16rm_POST;
+  else
+    return false;
+
+  // Results: loaded value, updated i16 pointer, chain.
+  SDNode *Selected =
+      CurDAG->getMachineNode(PostIncOpc, SDLoc(N), MemVT, MVT::i16, MVT::Other,
+                             Load->getBasePtr(), Load->getChain());
+  ReplaceNode(N, Selected);
+  return true;
+}
+
+/// Try to fold a post-increment load \p N1 into the binary operation \p Op,
+/// selecting \p Opc8 or \p Opc16 according to the load's memory type; \p N2
+/// is the other operand. Returns true when \p Op was selected.
+bool MSP430DAGToDAGISel::tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
+ unsigned Opc8, unsigned Opc16) {
+ // Only a load with a single use that is legal to fold into Op qualifies.
+ if (N1.getOpcode() == ISD::LOAD &&
+ N1.hasOneUse() &&
+ IsLegalToFold(N1, Op, Op, OptLevel)) {
+ LoadSDNode *LD = cast<LoadSDNode>(N1);
+ if (!isValidIndexedLoad(LD))
+ return false;
+
+ MVT VT = LD->getMemoryVT().getSimpleVT();
+ unsigned Opc = (VT == MVT::i16 ? Opc16 : Opc8);
+ // Carry the load's memory operand over to the folded instruction.
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N1)->getMemOperand();
+ SDValue Ops0[] = { N2, LD->getBasePtr(), LD->getChain() };
+ SDNode *ResNode =
+ CurDAG->SelectNodeTo(Op, Opc, VT, MVT::i16, MVT::Other, Ops0);
+ cast<MachineSDNode>(ResNode)->setMemRefs(MemRefs0, MemRefs0 + 1);
+ // Transfer chain.
+ ReplaceUses(SDValue(N1.getNode(), 2), SDValue(ResNode, 2));
+ // Transfer writeback.
+ ReplaceUses(SDValue(N1.getNode(), 1), SDValue(ResNode, 1));
+ return true;
+ }
+
+ return false;
+}
+
+
+/// Select - Main selection hook. Handles frame indices and post-increment
+/// load folding by hand and defers everything else to the autogenerated
+/// SelectCode.
+void MSP430DAGToDAGISel::Select(SDNode *Node) {
+ SDLoc dl(Node);
+
+ // Dump information about the Node being selected
+ DEBUG(errs() << "Selecting: ");
+ DEBUG(Node->dump(CurDAG));
+ DEBUG(errs() << "\n");
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== ";
+ Node->dump(CurDAG);
+ errs() << "\n");
+ Node->setNodeId(-1);
+ return;
+ }
+
+ // Few custom selection stuff.
+ switch (Node->getOpcode()) {
+ default: break;
+ case ISD::FrameIndex: {
+ // A frame index is materialized as "ADD16ri <target frame index>, 0".
+ assert(Node->getValueType(0) == MVT::i16);
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
+ // With a single use the node is morphed in place; otherwise a new machine
+ // node is created and replaces it.
+ if (Node->hasOneUse()) {
+ CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16, TFI,
+ CurDAG->getTargetConstant(0, dl, MVT::i16));
+ return;
+ }
+ ReplaceNode(Node, CurDAG->getMachineNode(
+ MSP430::ADD16ri, dl, MVT::i16, TFI,
+ CurDAG->getTargetConstant(0, dl, MVT::i16)));
+ return;
+ }
+ case ISD::LOAD:
+ if (tryIndexedLoad(Node))
+ return;
+ // Other cases are autogenerated.
+ break;
+ case ISD::ADD:
+ // The load may be either operand, so both orders are tried.
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+ return;
+ else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+ MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+ return;
+
+ // Other cases are autogenerated.
+ break;
+ case ISD::SUB:
+ // Only one operand order is tried for SUB.
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::SUB8rm_POST, MSP430::SUB16rm_POST))
+ return;
+
+ // Other cases are autogenerated.
+ break;
+ case ISD::AND:
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+ return;
+ else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+ MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+ return;
+
+ // Other cases are autogenerated.
+ break;
+ case ISD::OR:
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+ return;
+ else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+ MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+ return;
+
+ // Other cases are autogenerated.
+ break;
+ case ISD::XOR:
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+ return;
+ else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+ MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+ return;
+
+ // Other cases are autogenerated.
+ break;
+ }
+
+ // Select the default instruction
+ SelectCode(Node);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
new file mode 100644
index 000000000000..73346b9ce41d
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -0,0 +1,1342 @@
+//===-- MSP430ISelLowering.cpp - MSP430 DAG Lowering Implementation ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430ISelLowering.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430Subtarget.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-lower"
+
+/// How the hardware multiplier may be used; selected on the command line with
+/// -msp430-hwmult-mode.
+typedef enum {
+ NoHWMult, // "no": do not use the hardware multiplier
+ HWMultIntr, // "interrupts": multiplier may be used inside interrupts
+ HWMultNoIntr // "use": multiplier cannot be used inside interrupts
+} HWMultUseMode;
+
+// Hidden option; defaults to HWMultNoIntr.
+static cl::opt<HWMultUseMode>
+HWMultMode("msp430-hwmult-mode", cl::Hidden,
+ cl::desc("Hardware multiplier use mode"),
+ cl::init(HWMultNoIntr),
+ cl::values(
+ clEnumValN(NoHWMult, "no",
+ "Do not use hardware multiplier"),
+ clEnumValN(HWMultIntr, "interrupts",
+ "Assume hardware multiplier can be used inside interrupts"),
+ clEnumValN(HWMultNoIntr, "use",
+ "Assume hardware multiplier cannot be used inside interrupts")));
+
+/// Configure the lowering tables for MSP430: register classes for i8/i16,
+/// legalization actions for every operation that is not natively supported,
+/// multiplication libcall names and function alignment.
+MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
+                                           const MSP430Subtarget &STI)
+    : TargetLowering(TM) {
+
+  // Register classes backing the two legal integer types, then the
+  // properties derived from them.
+  addRegisterClass(MVT::i8,  &MSP430::GR8RegClass);
+  addRegisterClass(MVT::i16, &MSP430::GR16RegClass);
+  computeRegisterProperties(STI.getRegisterInfo());
+
+  setStackPointerRegisterToSaveRestore(MSP430::SP);
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // Extending loads: i1 sources are promoted, sign-extending loads from
+  // i8/i16 are expanded.
+  for (MVT ResVT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD,  ResVT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::SEXTLOAD, ResVT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, ResVT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::SEXTLOAD, ResVT, MVT::i8,  Expand);
+    setLoadExtAction(ISD::SEXTLOAD, ResVT, MVT::i16, Expand);
+  }
+
+  // We don't have any truncstores
+  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+  // Actions that are identical for both legal integer types.
+  for (MVT VT : {MVT::i8, MVT::i16}) {
+    // We have post-incremented loads / stores.
+    setIndexedLoadAction(ISD::POST_INC, VT, Legal);
+
+    // Shifts, compares and conditional operations get custom lowering.
+    for (unsigned Opc : {ISD::SRA, ISD::SHL, ISD::SRL, ISD::BR_CC, ISD::SETCC,
+                         ISD::SELECT_CC})
+      setOperationAction(Opc, VT, Custom);
+
+    // Rotates, plain select, dynamic allocas, bit counting and multi-part
+    // shifts are expanded.
+    for (unsigned Opc : {ISD::ROTL, ISD::ROTR, ISD::SELECT,
+                         ISD::DYNAMIC_STACKALLOC, ISD::CTTZ, ISD::CTLZ,
+                         ISD::CTPOP, ISD::SHL_PARTS, ISD::SRL_PARTS,
+                         ISD::SRA_PARTS})
+      setOperationAction(Opc, VT, Expand);
+
+    // FIXME: Implement efficiently multiplication by a constant
+    for (unsigned Opc : {ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::SMUL_LOHI,
+                         ISD::UMUL_LOHI})
+      setOperationAction(Opc, VT, Expand);
+
+    // Division and remainder are expanded as well.
+    for (unsigned Opc : {ISD::UDIV, ISD::UDIVREM, ISD::UREM, ISD::SDIV,
+                         ISD::SDIVREM, ISD::SREM})
+      setOperationAction(Opc, VT, Expand);
+  }
+
+  // Symbolic addresses and sign extension are custom-lowered for i16 only.
+  for (unsigned Opc : {ISD::GlobalAddress, ISD::ExternalSymbol,
+                       ISD::BlockAddress, ISD::JumpTable, ISD::SIGN_EXTEND})
+    setOperationAction(Opc, MVT::i16, Custom);
+
+  // Branch forms without a value type of their own.
+  setOperationAction(ISD::BR_JT,  MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // varargs support
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  for (unsigned Opc : {ISD::VAARG, ISD::VAEND, ISD::VACOPY})
+    setOperationAction(Opc, MVT::Other, Expand);
+
+  // Libcalls names.
+  switch (HWMultMode) {
+  case HWMultIntr:
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
+    break;
+  case HWMultNoIntr:
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw_noint");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+    break;
+  case NoHWMult:
+    // Keep the default libcall names.
+    break;
+  }
+
+  setMinFunctionAlignment(1);
+  setPrefFunctionAlignment(2);
+}
+
+/// Dispatch \p Op to the matching Lower* helper based on its opcode. Reaching
+/// the default case is treated as unreachable.
+SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ // All three shift kinds share one lowering routine.
+ case ISD::SHL: // FALLTHROUGH
+ case ISD::SRL:
+ case ISD::SRA: return LowerShifts(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+ case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ default:
+ llvm_unreachable("unimplemented operand");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MSP430 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Classify an inline-asm constraint string. The single
+/// letter 'r' is a register class; everything else is left to the generic
+/// implementation.
+TargetLowering::ConstraintType
+MSP430TargetLowering::getConstraintType(StringRef Constraint) const {
+  const bool IsGeneralReg = Constraint.size() == 1 && Constraint[0] == 'r';
+  if (IsGeneralReg)
+    return C_RegisterClass;
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Map an inline-asm register constraint to a register class. 'r' yields
+/// GR8 for i8 values and GR16 for everything else; other constraints are
+/// forwarded to the generic implementation.
+std::pair<unsigned, const TargetRegisterClass *>
+MSP430TargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  // GCC Constraint Letters
+  if (Constraint.size() == 1 && Constraint[0] == 'r') { // GENERAL_REGS
+    if (VT == MVT::i8)
+      return std::make_pair(0U, &MSP430::GR8RegClass);
+
+    return std::make_pair(0U, &MSP430::GR16RegClass);
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "MSP430GenCallingConv.inc"
+
+/// For each argument in a function store the number of pieces it is composed
+/// of.
+///
+/// Consecutive entries of \p Args that share the same OrigArgIndex are pieces
+/// of one original argument. One count per original argument is appended to
+/// \p Out, in order of appearance.
+template<typename ArgT>
+static void ParseFunctionArgs(const SmallVectorImpl<ArgT> &Args,
+                              SmallVectorImpl<unsigned> &Out) {
+  // ~0U is a sentinel meaning "no argument seen yet".
+  unsigned CurrentArgIndex = ~0U;
+  for (unsigned i = 0, e = Args.size(); i != e; i++) {
+    if (CurrentArgIndex == Args[i].OrigArgIndex) {
+      Out.back()++;
+    } else {
+      Out.push_back(1);
+      // Track the real original index rather than counting up by one, so
+      // that gaps in the OrigArgIndex sequence cannot split or merge the
+      // pieces of later arguments.
+      CurrentArgIndex = Args[i].OrigArgIndex;
+    }
+  }
+}
+
+/// Outgoing-call flavour: assign every operand with CC_MSP430_AssignStack.
+static void AnalyzeVarArgs(CCState &State,
+ const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ State.AnalyzeCallOperands(Outs, CC_MSP430_AssignStack);
+}
+
+/// Incoming-argument flavour: assign every formal with CC_MSP430_AssignStack.
+static void AnalyzeVarArgs(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) {
+ State.AnalyzeFormalArguments(Ins, CC_MSP430_AssignStack);
+}
+
+/// Analyze incoming and outgoing function arguments. We need custom C++ code
+/// to handle special constraints in the ABI like reversing the order of the
+/// pieces of split arguments. In addition, all pieces of a certain argument
+/// have to be passed either using registers or the stack but never mixing both.
+template<typename ArgT>
+static void AnalyzeArguments(CCState &State,
+ SmallVectorImpl<CCValAssign> &ArgLocs,
+ const SmallVectorImpl<ArgT> &Args) {
+ // Argument registers, in allocation order.
+ static const MCPhysReg RegList[] = {
+ MSP430::R15, MSP430::R14, MSP430::R13, MSP430::R12
+ };
+ static const unsigned NbRegs = array_lengthof(RegList);
+
+ // Variadic functions take the stack-only path.
+ if (State.isVarArg()) {
+ AnalyzeVarArgs(State, Args);
+ return;
+ }
+
+ // ArgsParts[i] is the number of pieces the i-th original argument has.
+ SmallVector<unsigned, 4> ArgsParts;
+ ParseFunctionArgs(Args, ArgsParts);
+
+ unsigned RegsLeft = NbRegs;
+ bool UseStack = false;
+ // Index of the next piece in Args; advances as pieces are assigned.
+ unsigned ValNo = 0;
+
+ for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
+ MVT ArgVT = Args[ValNo].VT;
+ ISD::ArgFlagsTy ArgFlags = Args[ValNo].Flags;
+ MVT LocVT = ArgVT;
+ CCValAssign::LocInfo LocInfo = CCValAssign::Full;
+
+ // Promote i8 to i16
+ if (LocVT == MVT::i8) {
+ LocVT = MVT::i16;
+ if (ArgFlags.isSExt())
+ LocInfo = CCValAssign::SExt;
+ else if (ArgFlags.isZExt())
+ LocInfo = CCValAssign::ZExt;
+ else
+ LocInfo = CCValAssign::AExt;
+ }
+
+ // Handle byval arguments
+ if (ArgFlags.isByVal()) {
+ State.HandleByVal(ValNo++, ArgVT, LocVT, LocInfo, 2, 2, ArgFlags);
+ continue;
+ }
+
+ unsigned Parts = ArgsParts[i];
+
+ // Use registers only when every piece fits and no earlier argument has
+ // already been sent to the stack.
+ if (!UseStack && Parts <= RegsLeft) {
+ unsigned FirstVal = ValNo;
+ for (unsigned j = 0; j < Parts; j++) {
+ unsigned Reg = State.AllocateReg(RegList);
+ State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
+ RegsLeft--;
+ }
+
+ // Reverse the order of the pieces to agree with the "big endian" format
+ // required in the calling convention ABI.
+ SmallVectorImpl<CCValAssign>::iterator B = ArgLocs.begin() + FirstVal;
+ std::reverse(B, B + Parts);
+ } else {
+ // Once one argument goes to the stack, all following ones do too.
+ UseStack = true;
+ for (unsigned j = 0; j < Parts; j++)
+ CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+ }
+ }
+}
+
+/// Call-result flavour: assign locations to the values in \p Ins using the
+/// RetCC_MSP430 assignment function.
+static void
+AnalyzeRetResult(CCState &State,
+                 const SmallVectorImpl<ISD::InputArg> &Ins) {
+  State.AnalyzeCallResult(Ins, RetCC_MSP430);
+}
+
+/// Function-return flavour: assign locations to the values in \p Outs using
+/// the RetCC_MSP430 assignment function.
+static void
+AnalyzeRetResult(CCState &State,
+                 const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  State.AnalyzeReturn(Outs, RetCC_MSP430);
+}
+
+/// Assign locations to return values (either direction, selected by \p ArgT)
+/// and then flip the resulting order of \p RVLocs.
+template<typename ArgT>
+static void
+AnalyzeReturnValues(CCState &State,
+                    SmallVectorImpl<CCValAssign> &RVLocs,
+                    const SmallVectorImpl<ArgT> &Args) {
+  AnalyzeRetResult(State, Args);
+
+  // Split return values must appear in the "big endian" order that the
+  // calling convention ABI requires, so reverse what was just assigned.
+  std::reverse(RVLocs.begin(), RVLocs.end());
+}
+
+/// Lower incoming formal arguments by dispatching on the calling convention:
+/// C and Fast go through LowerCCCArguments, MSP430_INTR functions must not
+/// take arguments, and any other convention is unreachable.
+SDValue MSP430TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  if (CallConv == CallingConv::C || CallConv == CallingConv::Fast)
+    return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+
+  if (CallConv != CallingConv::MSP430_INTR)
+    llvm_unreachable("Unsupported calling convention");
+
+  // Interrupt service routine: only an empty argument list is accepted.
+  if (!Ins.empty())
+    report_fatal_error("ISRs cannot have arguments");
+  return Chain;
+}
+
+/// Lower the outgoing call described by \p CLI. Tail calls are always turned
+/// off; C and Fast conventions are handled by LowerCCCCallTo, and calling an
+/// MSP430_INTR function directly is a fatal error.
+SDValue
+MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                SmallVectorImpl<SDValue> &InVals) const {
+  // MSP430 target does not yet support tail call optimization.
+  CLI.IsTailCall = false;
+
+  const CallingConv::ID CC = CLI.CallConv;
+  if (CC == CallingConv::MSP430_INTR)
+    report_fatal_error("ISRs cannot be called directly");
+  if (CC != CallingConv::C && CC != CallingConv::Fast)
+    llvm_unreachable("Unsupported calling convention");
+
+  return LowerCCCCallTo(CLI.Chain, CLI.Callee, CC, CLI.IsVarArg,
+                        /*isTailCall=*/false, CLI.Outs, CLI.OutVals, CLI.Ins,
+                        CLI.DL, CLI.DAG, InVals);
+}
+
+/// LowerCCCArguments - transform physical registers into virtual registers and
+/// generate load operations for arguments placed on the stack. Every lowered value is appended to InVals; the incoming Chain is returned unchanged.
+// FIXME: struct return stuff
+SDValue MSP430TargetLowering::LowerCCCArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ AnalyzeArguments(CCInfo, ArgLocs, Ins);
+
+ // Create frame index for the start of the first vararg value: a 1-byte fixed object at the next free stack offset.
+ if (isVarArg) {
+ unsigned Offset = CCInfo.getNextStackOffset();
+ FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, Offset, true));
+ }
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc()) {
+ // Arguments passed in registers. Only i16 locations are handled here.
+ EVT RegVT = VA.getLocVT();
+ switch (RegVT.getSimpleVT().SimpleTy) {
+ default:
+ {
+#ifndef NDEBUG
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << RegVT.getEVTString() << "\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+ case MVT::i16:
+ unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+
+ // If this is an 8-bit value, it is really passed promoted to 16
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+ InVals.push_back(ArgValue);
+ }
+ } else {
+ // Sanity check: anything that is not a register location must be a memory location.
+ assert(VA.isMemLoc());
+
+ SDValue InVal;
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+ if (Flags.isByVal()) { // byval: the lowered value is the frame index itself, no load is emitted
+ int FI = MFI.CreateFixedObject(Flags.getByValSize(),
+ VA.getLocMemOffset(), true);
+ InVal = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ } else {
+ // Load the argument to a virtual register
+ unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+ if (ObjSize > 2) { // only diagnosed on errs(); lowering continues afterwards
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << EVT(VA.getLocVT()).getEVTString()
+ << "\n";
+ }
+ // Create the frame index object for this incoming parameter...
+ int FI = MFI.CreateFixedObject(ObjSize, VA.getLocMemOffset(), true);
+
+ // Create the SelectionDAG nodes corresponding to a load
+ // from this parameter
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
+ InVal = DAG.getLoad(
+ VA.getLocVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ }
+
+ InVals.push_back(InVal);
+ }
+ }
+
+ return Chain;
+}
+
+/// Lower a function return: assign the values in \p Outs to return registers,
+/// copy them in with glued CopyToReg nodes, and emit RET_FLAG (RETI_FLAG for
+/// the MSP430_INTR calling convention, which may not return a value).
+SDValue
+MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                  bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  const SmallVectorImpl<SDValue> &OutVals,
+                                  const SDLoc &dl, SelectionDAG &DAG) const {
+  const bool IsISR = CallConv == CallingConv::MSP430_INTR;
+
+  // ISRs cannot return any value.
+  if (IsISR && !Outs.empty())
+    report_fatal_error("ISRs cannot return any value");
+
+  // Work out which location each return value is assigned to.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  AnalyzeReturnValues(CCInfo, RVLocs, Outs);
+
+  // Operand 0 is reserved for the chain and patched once all copies exist.
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  SDValue Glue;
+
+  // Copy the result values into the output registers.
+  for (unsigned Idx = 0, End = RVLocs.size(); Idx != End; ++Idx) {
+    CCValAssign &VA = RVLocs[Idx];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    // Gluing each copy to the previous one keeps the copies stuck together.
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[Idx], Glue);
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain; // Update chain.
+
+  // Add the glue if at least one copy was emitted.
+  if (Glue.getNode())
+    RetOps.push_back(Glue);
+
+  const unsigned Opc = IsISR ? MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG;
+  return DAG.getNode(Opc, dl, MVT::Other, RetOps);
+}
+
+/// LowerCCCCallTo - function arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+// TODO: sret.
+SDValue MSP430TargetLowering::LowerCCCCallTo(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ AnalyzeArguments(CCInfo, ArgLocs, Outs);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+ SmallVector<SDValue, 12> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, collecting register copies and emitting stores.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ SDValue Arg = OutVals[i];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ // Arguments that are passed in a register are collected in the RegsToPass
+ // vector; the actual copies are emitted after all stores.
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ if (!StackPtr.getNode()) // read SP once, when the first stack argument is seen
+ StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SP, PtrVT);
+
+ SDValue PtrOff =
+ DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+ DAG.getIntPtrConstant(VA.getLocMemOffset(), dl));
+
+ SDValue MemOp;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+ if (Flags.isByVal()) { // byval: copy the whole object into the argument area
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i16);
+ MemOp = DAG.getMemcpy(Chain, dl, PtrOff, Arg, SizeNode,
+ Flags.getByValAlign(),
+ /*isVolatile*/false,
+ /*AlwaysInline=*/true,
+ /*isTailCall=*/false,
+ MachinePointerInfo(),
+ MachinePointerInfo());
+ } else {
+ MemOp = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+ }
+
+ MemOpChains.push_back(MemOp);
+ }
+ }
+
+ // Transform all store nodes into one single node because all store nodes are
+ // independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain and
+ // flag operands which copy the outgoing args into registers. The InFlag is
+ // necessary since all emitted instructions must be stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i16);
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i16);
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true),
+ DAG.getConstant(0, dl, PtrVT, true), InFlag, dl);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl,
+ DAG, InVals);
+}
+
+/// LowerCallResult - Copy the values returned by a call out of the physical
+/// registers assigned to them, appending each one to \p InVals, and return
+/// the updated chain.
+///
+SDValue MSP430TargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  // Determine where each value returned by this call lives.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  AnalyzeReturnValues(CCInfo, RVLocs, Ins);
+
+  for (const CCValAssign &VA : RVLocs) {
+    // Of the copy's results, 0 is the value, 1 the chain and 2 the glue that
+    // ties the next copy to this one.
+    SDValue Copy =
+        DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getValVT(), InFlag);
+    Chain = Copy.getValue(1);
+    InFlag = Copy.getValue(2);
+    InVals.push_back(Copy.getValue(0));
+  }
+
+  return Chain;
+}
+
+/// Lower ISD::SHL/SRA/SRL. A non-constant shift amount becomes the matching
+/// MSP430ISD::SHL/SRA/SRL node; a constant amount is unrolled into a chain of
+/// single-step RLA/RRA nodes, with RRC used for the first step of an SRL.
+SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  const unsigned Opc = Op.getOpcode();
+  SDNode *N = Op.getNode();
+  EVT VT = Op.getValueType();
+  SDLoc dl(N);
+
+  ConstantSDNode *AmountC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!AmountC) {
+    // Expand non-constant shifts to loops:
+    unsigned TargetOpc;
+    switch (Opc) {
+    default: llvm_unreachable("Invalid shift opcode!");
+    case ISD::SHL: TargetOpc = MSP430ISD::SHL; break;
+    case ISD::SRA: TargetOpc = MSP430ISD::SRA; break;
+    case ISD::SRL: TargetOpc = MSP430ISD::SRL; break;
+    }
+    return DAG.getNode(TargetOpc, dl, VT, N->getOperand(0), N->getOperand(1));
+  }
+
+  uint64_t Remaining = AmountC->getZExtValue();
+
+  // Expand the stuff into sequence of shifts.
+  // FIXME: for some shift amounts this might be done better!
+  // E.g.: foo >> (8 + N) => sxt(swpb(foo)) >> N
+  SDValue Victim = N->getOperand(0);
+
+  if (Opc == ISD::SRL && Remaining != 0) {
+    // Emit a special goodness here:
+    // srl A, 1 => clrc; rrc A
+    Victim = DAG.getNode(MSP430ISD::RRC, dl, VT, Victim);
+    --Remaining;
+  }
+
+  // Every remaining step is one RLA (for SHL) or one RRA (for SRA and SRL).
+  const unsigned StepOpc = Opc == ISD::SHL ? MSP430ISD::RLA : MSP430ISD::RRA;
+  for (; Remaining != 0; --Remaining)
+    Victim = DAG.getNode(StepOpc, dl, VT, Victim);
+
+  return Victim;
+}
+
+/// Lower a GlobalAddress node to a TargetGlobalAddress, with the node's
+/// constant offset folded in, wrapped in MSP430ISD::Wrapper.
+SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue Target = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
+                                              PtrVT, GA->getOffset());
+  return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op), PtrVT, Target);
+}
+
+/// Lower an ExternalSymbol node to a TargetExternalSymbol wrapped in
+/// MSP430ISD::Wrapper.
+SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  const ExternalSymbolSDNode *ES = cast<ExternalSymbolSDNode>(Op);
+
+  SDValue Target = DAG.getTargetExternalSymbol(ES->getSymbol(), PtrVT);
+  return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op), PtrVT, Target);
+}
+
+/// Lower a BlockAddress node to a TargetBlockAddress wrapped in
+/// MSP430ISD::Wrapper.
+SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  const BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
+
+  SDValue Target = DAG.getTargetBlockAddress(BAN->getBlockAddress(), PtrVT);
+  return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op), PtrVT, Target);
+}
+
+/// Try to rewrite a comparison whose left operand is the constant C so that
+/// the constant moves to the right as C+1 (letting it fold into the compare
+/// instruction). On success \p LHS becomes the old \p RHS, \p RHS becomes
+/// C+1, and true is returned; the caller must then use the adjusted
+/// condition. Returns false, leaving both operands untouched, when \p LHS is
+/// not a constant or when C+1 would wrap around (C is the largest value of
+/// its type in the signedness selected by \p IsSigned), because the rewritten
+/// comparison would then give the wrong answer.
+static bool FoldConstantLHS(SDValue &LHS, SDValue &RHS, bool IsSigned,
+                            const SDLoc &dl, SelectionDAG &DAG) {
+  const ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS);
+  if (!C)
+    return false;
+
+  const bool Wraps = IsSigned ? C->getAPIntValue().isMaxSignedValue()
+                              : C->getAPIntValue().isMaxValue();
+  if (Wraps)
+    return false;
+
+  LHS = RHS;
+  RHS = DAG.getConstant(C->getSExtValue() + 1, dl, C->getValueType(0));
+  return true;
+}
+
+/// Emit an MSP430ISD::CMP for the integer comparison "LHS CC RHS".
+///
+/// \p LHS and \p RHS are taken by reference because they may be swapped or
+/// rewritten so that a constant ends up on the right-hand side. \p TargetCC
+/// receives the MSP430 condition code, as an i8 constant, that the consumer
+/// of the returned glue value has to test. Floating-point operands are not
+/// supported.
+static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
+                       ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) {
+  // FIXME: Handle bittests someday
+  assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
+
+  // FIXME: Handle jump negative someday
+  MSP430CC::CondCodes TCC = MSP430CC::COND_INVALID;
+  switch (CC) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case ISD::SETEQ:
+    TCC = MSP430CC::COND_E;     // aka COND_Z
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETNE:
+    TCC = MSP430CC::COND_NE;    // aka COND_NZ
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETULE:
+    std::swap(LHS, RHS);
+    LLVM_FALLTHROUGH;
+  case ISD::SETUGE:
+    // Turn lhs u>= rhs with lhs constant into rhs u< lhs+1, this allows us to
+    // fold constant into instruction. Skipped when lhs+1 would wrap to 0.
+    TCC = FoldConstantLHS(LHS, RHS, /*IsSigned=*/false, dl, DAG)
+              ? MSP430CC::COND_LO
+              : MSP430CC::COND_HS; // aka COND_C
+    break;
+  case ISD::SETUGT:
+    std::swap(LHS, RHS);
+    LLVM_FALLTHROUGH;
+  case ISD::SETULT:
+    // Turn lhs u< rhs with lhs constant into rhs u>= lhs+1, this allows us to
+    // fold constant into instruction. Skipped when lhs+1 would wrap to 0.
+    TCC = FoldConstantLHS(LHS, RHS, /*IsSigned=*/false, dl, DAG)
+              ? MSP430CC::COND_HS
+              : MSP430CC::COND_LO; // aka COND_NC
+    break;
+  case ISD::SETLE:
+    std::swap(LHS, RHS);
+    LLVM_FALLTHROUGH;
+  case ISD::SETGE:
+    // Turn lhs >= rhs with lhs constant into rhs < lhs+1, this allows us to
+    // fold constant into instruction. Skipped when lhs+1 would overflow.
+    TCC = FoldConstantLHS(LHS, RHS, /*IsSigned=*/true, dl, DAG)
+              ? MSP430CC::COND_L
+              : MSP430CC::COND_GE;
+    break;
+  case ISD::SETGT:
+    std::swap(LHS, RHS);
+    LLVM_FALLTHROUGH;
+  case ISD::SETLT:
+    // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
+    // fold constant into instruction. Skipped when lhs+1 would overflow.
+    TCC = FoldConstantLHS(LHS, RHS, /*IsSigned=*/true, dl, DAG)
+              ? MSP430CC::COND_GE
+              : MSP430CC::COND_L;
+    break;
+  }
+
+  TargetCC = DAG.getConstant(TCC, dl, MVT::i8);
+  return DAG.getNode(MSP430ISD::CMP, dl, MVT::Glue, LHS, RHS);
+}
+
+
+/// Lower BR_CC: emit the compare through EmitCMP and branch on the resulting
+/// target condition code with an MSP430ISD::BR_CC node.
+SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+
+  // Operands used: 0 = chain, 1 = condition code, 2 = lhs, 3 = rhs, 4 = dest.
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+
+  // EmitCMP may rewrite LHS/RHS and fills in TargetCC.
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  return DAG.getNode(MSP430ISD::BR_CC, dl, Op.getValueType(),
+                     /*Chain=*/Op.getOperand(0), /*Dest=*/Op.getOperand(4),
+                     TargetCC, Flag);
+}
+
+SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower SETCC either by reading bits of SR directly or via SELECT_CC(1, 0)
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDLoc dl (Op);
+
+ // If we are doing an AND and testing against zero, then the CMP
+ // will not be generated. The AND (or BIT) will generate the condition codes,
+ // but they are different from CMP.
+ // FIXME: since we're doing a post-processing, use a pseudoinstr here, so
+ // lowering & isel wouldn't diverge.
+ bool andCC = false;
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (RHSC->isNullValue() && LHS.hasOneUse() &&
+ (LHS.getOpcode() == ISD::AND ||
+ (LHS.getOpcode() == ISD::TRUNCATE &&
+ LHS.getOperand(0).getOpcode() == ISD::AND))) {
+ andCC = true;
+ }
+ }
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDValue TargetCC;
+ SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+ // Get the condition codes directly from the status register, if it's easy.
+ // Otherwise a branch will be generated. Note that the AND and BIT
+ // instructions generate different flags than CMP, the carry bit can be used
+ // for NE/EQ.
+ bool Invert = false;
+ bool Shift = false;
+ bool Convert = true;
+ switch (cast<ConstantSDNode>(TargetCC)->getZExtValue()) {
+ default:
+ Convert = false; // every other condition takes the SELECT_CC path below
+ break;
+ case MSP430CC::COND_HS:
+ // Res = SR & 1, no processing is required
+ break;
+ case MSP430CC::COND_LO:
+ // Res = ~(SR & 1)
+ Invert = true;
+ break;
+ case MSP430CC::COND_NE:
+ if (andCC) {
+ // C = ~Z, thus Res = SR & 1, no processing is required
+ } else {
+ // Res = ~((SR >> 1) & 1)
+ Shift = true;
+ Invert = true;
+ }
+ break;
+ case MSP430CC::COND_E:
+ Shift = true;
+ // C = ~Z for AND instruction, thus we can put Res = ~(SR & 1), however,
+ // Res = (SR >> 1) & 1 is 1 word shorter.
+ break;
+ }
+ EVT VT = Op.getValueType();
+ SDValue One = DAG.getConstant(1, dl, VT);
+ if (Convert) {
+ SDValue SR = DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::SR,
+ MVT::i16, Flag);
+ if (Shift)
+ // FIXME: somewhere this is turned into a SRL, lower it MSP specific?
+ SR = DAG.getNode(ISD::SRA, dl, MVT::i16, SR, One);
+ SR = DAG.getNode(ISD::AND, dl, MVT::i16, SR, One);
+ if (Invert)
+ SR = DAG.getNode(ISD::XOR, dl, MVT::i16, SR, One);
+ return SR;
+ } else {
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ SDValue Ops[] = {One, Zero, TargetCC, Flag};
+ return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
+ }
+}
+
+/// Lower SELECT_CC: emit the compare through EmitCMP and build an
+/// MSP430ISD::SELECT_CC node from {true value, false value, target condition
+/// code, compare glue}.
+SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+
+  // Operands used: 0 = lhs, 1 = rhs, 2 = true value, 3 = false value,
+  // 4 = condition code.
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  // EmitCMP may rewrite LHS/RHS and fills in TargetCC.
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  SDValue Ops[] = {Op.getOperand(2), Op.getOperand(3), TargetCC, Flag};
+  return DAG.getNode(MSP430ISD::SELECT_CC, dl,
+                     DAG.getVTList(Op.getValueType(), MVT::Glue), Ops);
+}
+
+/// Lower SIGN_EXTEND (to i16 only) as an ANY_EXTEND followed by a
+/// SIGN_EXTEND_INREG from the source value type.
+SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::i16 && "Only support i16 for now!");
+
+  SDValue Src = Op.getOperand(0);
+  SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Src);
+  SDValue SrcTy = DAG.getValueType(Src.getValueType());
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Widened, SrcTy);
+}
+
+/// Return a frame-index node for the return-address slot. The fixed object
+/// (one pointer wide, at offset minus one pointer size) is created on first
+/// use and its index cached in the MSP430MachineFunctionInfo; a cached index
+/// of 0 is treated as "not created yet".
+SDValue
+MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+
+  int RAIndex = FuncInfo->getRAIndex();
+  if (RAIndex == 0) {
+    // Set up a frame object for the return address.
+    uint64_t SlotSize = MF.getDataLayout().getPointerSize();
+    RAIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, -SlotSize, true);
+    FuncInfo->setRAIndex(RAIndex);
+  }
+
+  return DAG.getFrameIndex(RAIndex, getPointerTy(MF.getDataLayout()));
+}
+
+/// Lower RETURNADDR. Depth 0 loads from the return-address frame slot; a
+/// greater depth loads from one pointer size past the frame address that
+/// LowerFRAMEADDR produces for the same operand. Returns an empty SDValue
+/// when verifyReturnAddressArgumentIsConstant reports a problem.
+SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  const unsigned Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  if (Depth == 0) {
+    // Just load the return address.
+    SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
+                       MachinePointerInfo());
+  }
+
+  // Outer frame: the slot sits one pointer size above that frame's address.
+  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+  SDValue Offset =
+      DAG.getConstant(DAG.getDataLayout().getPointerSize(), dl, MVT::i16);
+  SDValue Slot = DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset);
+  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Slot,
+                     MachinePointerInfo());
+}
+
+/// Lower FRAMEADDR: start from the value of the FP register and, once per
+/// level of the constant depth operand, load through the current address.
+SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  SDValue FrameAddr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::FP, VT);
+  for (; Depth != 0; --Depth)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo());
+  return FrameAddr;
+}
+
+/// Lower VASTART: store the frame index recorded for the first vararg into
+/// the location given by operand 1.
+SDValue MSP430TargetLowering::LowerVASTART(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MSP430MachineFunctionInfo *FuncInfo =
+      DAG.getMachineFunction().getInfo<MSP430MachineFunctionInfo>();
+
+  // Frame index of first vararg argument
+  SDValue FirstVarArg = DAG.getFrameIndex(
+      FuncInfo->getVarArgsFrameIndex(), getPointerTy(DAG.getDataLayout()));
+
+  // Operands used: 0 = chain, 1 = destination pointer, 2 = source value.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Dest = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  // Create a store of the frame index to the location operand
+  return DAG.getStore(Chain, SDLoc(Op), FirstVarArg, Dest,
+                      MachinePointerInfo(SV));
+}
+
+/// Lower a JumpTable node to a TargetJumpTable wrapped in MSP430ISD::Wrapper.
+SDValue MSP430TargetLowering::LowerJumpTable(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+  SDValue Target = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+  return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT), PtrVT, Target);
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+///
+/// Accepted here: a non-extending i8 or i16 load whose address update \p Op
+/// is an ADD of a constant equal to the access size (1 or 2). \p N is cast to
+/// LoadSDNode unconditionally.
+bool MSP430TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                      SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  EVT VT = LD->getMemoryVT();
+  const bool IsByte = VT == MVT::i8;
+  const bool IsWord = VT == MVT::i16;
+  if (!IsByte && !IsWord)
+    return false;
+
+  if (Op->getOpcode() != ISD::ADD)
+    return false;
+
+  ConstantSDNode *Step = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+  if (!Step)
+    return false;
+
+  // The increment has to match the number of bytes accessed.
+  const uint64_t StepVal = Step->getZExtValue();
+  const uint64_t AccessBytes = IsWord ? 2 : 1;
+  if (StepVal != AccessBytes)
+    return false;
+
+  Base = Op->getOperand(0);
+  Offset = DAG.getConstant(StepVal, SDLoc(N), VT);
+  AM = ISD::POST_INC;
+  return true;
+}
+
+
+const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const { // Map an MSP430ISD opcode to its printable name
+ switch ((MSP430ISD::NodeType)Opcode) {
+ case MSP430ISD::FIRST_NUMBER: break; // no name: falls through to the nullptr return
+ case MSP430ISD::RET_FLAG: return "MSP430ISD::RET_FLAG";
+ case MSP430ISD::RETI_FLAG: return "MSP430ISD::RETI_FLAG";
+ case MSP430ISD::RRA: return "MSP430ISD::RRA";
+ case MSP430ISD::RLA: return "MSP430ISD::RLA";
+ case MSP430ISD::RRC: return "MSP430ISD::RRC";
+ case MSP430ISD::CALL: return "MSP430ISD::CALL";
+ case MSP430ISD::Wrapper: return "MSP430ISD::Wrapper";
+ case MSP430ISD::BR_CC: return "MSP430ISD::BR_CC";
+ case MSP430ISD::CMP: return "MSP430ISD::CMP";
+ case MSP430ISD::SETCC: return "MSP430ISD::SETCC";
+ case MSP430ISD::SELECT_CC: return "MSP430ISD::SELECT_CC";
+ case MSP430ISD::SHL: return "MSP430ISD::SHL";
+ case MSP430ISD::SRA: return "MSP430ISD::SRA";
+ case MSP430ISD::SRL: return "MSP430ISD::SRL";
+ }
+ return nullptr; // reached for FIRST_NUMBER and for any value without a case above
+}
+
+/// Report truncation as free exactly when both IR types are integers and
+/// \p Ty1 is strictly wider than \p Ty2.
+bool MSP430TargetLowering::isTruncateFree(Type *Ty1,
+                                          Type *Ty2) const {
+  const bool BothInts = Ty1->isIntegerTy() && Ty2->isIntegerTy();
+  return BothInts &&
+         Ty1->getPrimitiveSizeInBits() > Ty2->getPrimitiveSizeInBits();
+}
+
+/// Report truncation as free exactly when both value types are integers and
+/// \p VT1 is strictly wider than \p VT2.
+bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  const bool BothInts = VT1.isInteger() && VT2.isInteger();
+  return BothInts && VT1.getSizeInBits() > VT2.getSizeInBits();
+}
+
+/// IR-type overload. The i8 -> i16 test below is switched off by a constant
+/// false, so this currently always returns false.
+bool MSP430TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  const bool Enabled = false;
+  return Enabled && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16);
+}
+
+/// Value-type overload. The i8 -> i16 test below is switched off by a
+/// constant false, so this currently always returns false.
+bool MSP430TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  const bool Enabled = false;
+  return Enabled && VT1 == MVT::i8 && VT2 == MVT::i16;
+}
+
+/// SDValue overload: answer for the value's type via the (EVT, EVT) overload.
+bool MSP430TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  EVT SrcVT = Val.getValueType();
+  return isZExtFree(SrcVT, VT2);
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
+ MachineBasicBlock *BB) const { // Expands a Shl/Sra/Srl pseudo into a one-step-per-iteration loop; returns RemBB
+ MachineFunction *F = BB->getParent();
+ MachineRegisterInfo &RI = F->getRegInfo();
+ DebugLoc dl = MI.getDebugLoc();
+ const TargetInstrInfo &TII = *F->getSubtarget().getInstrInfo();
+
+ unsigned Opc;
+ const TargetRegisterClass * RC;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Invalid shift opcode!");
+ case MSP430::Shl8:
+ Opc = MSP430::SHL8r1;
+ RC = &MSP430::GR8RegClass;
+ break;
+ case MSP430::Shl16:
+ Opc = MSP430::SHL16r1;
+ RC = &MSP430::GR16RegClass;
+ break;
+ case MSP430::Sra8:
+ Opc = MSP430::SAR8r1;
+ RC = &MSP430::GR8RegClass;
+ break;
+ case MSP430::Sra16:
+ Opc = MSP430::SAR16r1;
+ RC = &MSP430::GR16RegClass;
+ break;
+ case MSP430::Srl8:
+ Opc = MSP430::SAR8r1c;
+ RC = &MSP430::GR8RegClass;
+ break;
+ case MSP430::Srl16:
+ Opc = MSP430::SAR16r1c;
+ RC = &MSP430::GR16RegClass;
+ break;
+ }
+
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator I = ++BB->getIterator();
+
+ // Create the loop block and the remainder block, inserted right after BB.
+ MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *RemBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+ F->insert(I, LoopBB);
+ F->insert(I, RemBB);
+
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the block containing instructions after shift.
+ RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ RemBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add edges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
+ BB->addSuccessor(LoopBB);
+ BB->addSuccessor(RemBB);
+ LoopBB->addSuccessor(RemBB);
+ LoopBB->addSuccessor(LoopBB);
+
+ unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass);
+ unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass);
+ unsigned ShiftReg = RI.createVirtualRegister(RC);
+ unsigned ShiftReg2 = RI.createVirtualRegister(RC);
+ unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
+
+ // BB:
+ // cmp 0, N
+ // je RemBB
+ BuildMI(BB, dl, TII.get(MSP430::CMP8ri))
+ .addReg(ShiftAmtSrcReg).addImm(0);
+ BuildMI(BB, dl, TII.get(MSP430::JCC))
+ .addMBB(RemBB)
+ .addImm(MSP430CC::COND_E);
+
+ // LoopBB:
+ // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+ // ShiftAmt = phi [%N, BB], [%ShiftAmt2, LoopBB]
+ // ShiftReg2 = shift ShiftReg
+ // ShiftAmt2 = ShiftAmt - 1; then jne LoopBB
+ BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftReg)
+ .addReg(SrcReg).addMBB(BB)
+ .addReg(ShiftReg2).addMBB(LoopBB);
+ BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftAmtReg)
+ .addReg(ShiftAmtSrcReg).addMBB(BB)
+ .addReg(ShiftAmtReg2).addMBB(LoopBB);
+ BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+ .addReg(ShiftReg);
+ BuildMI(LoopBB, dl, TII.get(MSP430::SUB8ri), ShiftAmtReg2)
+ .addReg(ShiftAmtReg).addImm(1);
+ BuildMI(LoopBB, dl, TII.get(MSP430::JCC))
+ .addMBB(LoopBB)
+ .addImm(MSP430CC::COND_NE);
+
+ // RemBB:
+ // DestReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+ BuildMI(*RemBB, RemBB->begin(), dl, TII.get(MSP430::PHI), DstReg)
+ .addReg(SrcReg).addMBB(BB)
+ .addReg(ShiftReg2).addMBB(LoopBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return RemBB;
+}
+
+MachineBasicBlock *
+MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const { // Expands shift pseudos via EmitShiftInstr and Select8/Select16 into a branch diamond
+ unsigned Opc = MI.getOpcode();
+
+ if (Opc == MSP430::Shl8 || Opc == MSP430::Shl16 ||
+ Opc == MSP430::Sra8 || Opc == MSP430::Sra16 ||
+ Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
+ return EmitShiftInstr(MI, BB);
+
+ const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ assert((Opc == MSP430::Select16 || Opc == MSP430::Select8) &&
+ "Unexpected instr type to insert");
+
+ // To "insert" a SELECT instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator I = ++BB->getIterator();
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // jCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(I, copy0MBB);
+ F->insert(I, copy1MBB);
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ copy1MBB->splice(copy1MBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ copy1MBB->transferSuccessorsAndUpdatePHIs(BB);
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(copy1MBB);
+
+ BuildMI(BB, dl, TII.get(MSP430::JCC))
+ .addMBB(copy1MBB)
+ .addImm(MI.getOperand(3).getImm()); // operand 3 of the pseudo supplies the branch condition
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to copy1MBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(copy1MBB);
+
+ // copy1MBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = copy1MBB;
+ BuildMI(*BB, BB->begin(), dl, TII.get(MSP430::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
new file mode 100644
index 000000000000..8864807e999e
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
@@ -0,0 +1,174 @@
+//===-- MSP430ISelLowering.h - MSP430 DAG Lowering Interface ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that MSP430 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430ISELLOWERING_H
+#define LLVM_LIB_TARGET_MSP430_MSP430ISELLOWERING_H
+
+#include "MSP430.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+ namespace MSP430ISD {
+ enum NodeType : unsigned {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END, // target opcodes are numbered from here on
+
+ /// Return with a flag operand. Operand 0 is the chain operand.
+ RET_FLAG,
+
+ /// Same as RET_FLAG, but used for returning from ISRs.
+ RETI_FLAG,
+
+ /// Y = R{R,L}A X, rotate right (left) arithmetically
+ RRA, RLA,
+
+ /// Y = RRC X, rotate right via carry
+ RRC,
+
+ /// CALL - These operations represent an abstract call
+ /// instruction, which includes a bunch of information.
+ CALL,
+
+ /// Wrapper - A wrapper node for TargetConstantPool, TargetExternalSymbol,
+ /// and TargetGlobalAddress.
+ Wrapper,
+
+ /// CMP - Compare instruction.
+ CMP,
+
+ /// SETCC - Operand 0 is the condition code, and operand 1 is the flag
+ /// operand produced by a CMP instruction.
+ SETCC,
+
+ /// MSP430 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch to if the condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// instruction.
+ BR_CC,
+
+ /// SELECT_CC - Operands 0 and 1 are the values to select between,
+ /// operand 2 is the condition code and operand 3 is the flag operand.
+ SELECT_CC,
+
+ /// SHL, SRA, SRL - Non-constant shifts.
+ SHL, SRA, SRL
+ };
+ } // namespace MSP430ISD
+
+ class MSP430Subtarget;
+ class MSP430TargetLowering : public TargetLowering {
+ public:
+ explicit MSP430TargetLowering(const TargetMachine &TM,
+ const MSP430Subtarget &STI);
+
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i8; // always i8; both arguments are ignored
+ }
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// getTargetNodeName - This method returns the name of a target-specific
+ /// DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+ TargetLowering::ConstraintType
+ getConstraintType(StringRef Constraint) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ /// isTruncateFree - Return true if it's free to truncate a value of type
+ /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate an i16 value in
+ /// register R15W to i8 by referencing its sub-register R15B.
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ /// isZExtFree - Return true if any actual instruction that defines a value
+ /// of type Ty1 implicitly zero-extends the value to Ty2 in the result
+ /// register. This does not necessarily include registers defined in unknown
+ /// ways, such as incoming arguments, or copies from unknown virtual
+ /// registers. Also, if isTruncateFree(Ty2, Ty1) is true, this does not
+ /// necessarily apply to truncate instructions. e.g. on msp430, all
+ /// instructions that define 8-bit values implicitly zero-extend the result
+ /// out to 16 bits.
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
+ MachineBasicBlock *EmitShiftInstr(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ private:
+ SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+ SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+ };
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td b/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td
new file mode 100644
index 000000000000..a9e87dad0cd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td
@@ -0,0 +1,211 @@
+//===-- MSP430InstrFormats.td - MSP430 Instruction Formats -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe MSP430 instructions format here
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+ bits<2> Value = val; // copied into TSFlags{1-0} by MSP430Inst
+}
+
+def PseudoFrm : Format<0>; // used by Pseudo
+def SingleOpFrm : Format<1>; // used by IIForm
+def DoubleOpFrm : Format<2>; // used by IForm
+def CondJumpFrm : Format<3>; // used by CJForm
+
+class SourceMode<bits<2> val> {
+ bits<2> Value = val; // placed in Inst{4-5} by IForm and IIForm
+}
+
+def SrcReg : SourceMode<0>;
+def SrcMem : SourceMode<1>;
+def SrcIndReg : SourceMode<2>;
+def SrcPostInc : SourceMode<3>;
+def SrcImm : SourceMode<3>; // same 2-bit value as SrcPostInc
+
+class DestMode<bit val> {
+ bit Value = val; // placed in Inst{7} by IForm
+}
+
+def DstReg : DestMode<0>;
+def DstMem : DestMode<1>;
+
+class SizeVal<bits<3> val> {
+ bits<3> Value = val; // copied into TSFlags{4-2} by MSP430Inst
+}
+
+def SizeUnknown : SizeVal<0>; // Unknown / unset size
+def SizeSpecial : SizeVal<1>; // Special instruction, e.g. pseudo
+def Size2Bytes : SizeVal<2>; // used by the reg/reg forms and conditional jumps
+def Size4Bytes : SizeVal<3>; // used by forms with one memory or immediate operand
+def Size6Bytes : SizeVal<4>; // used by the mem/imm and mem/mem forms
+
+// Generic MSP430 instruction: common base of every format class in this file.
+class MSP430Inst<dag outs, dag ins, SizeVal sz, Format f,
+ string asmstr> : Instruction {
+ field bits<16> Inst; // 16-bit encoding; individual bit ranges are set by the format subclasses
+
+ let Namespace = "MSP430";
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+
+ Format Form = f;
+ SizeVal Sz = sz;
+
+ // Define how we want to lay out our target-specific information field. This
+ // should be kept up-to-date with the fields in the MSP430InstrInfo.h file.
+ let TSFlags{1-0} = Form.Value; // instruction format
+ let TSFlags{4-2} = Sz.Value; // size class, matches MSP430II::SizeShift/SizeMask
+
+ let AsmString = asmstr;
+}
+
+// FIXME: Create different classes for different addressing modes.
+
+// MSP430 Double Operand (Format I) Instructions
+class IForm<bits<4> opcode, DestMode dest, bit bw, SourceMode src, SizeVal sz,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : MSP430Inst<outs, ins, sz, DoubleOpFrm, asmstr> {
+ let Pattern = pattern;
+
+ DestMode ad = dest;
+ SourceMode as = src;
+
+ let Inst{12-15} = opcode; // 4-bit opcode
+ let Inst{7} = ad.Value; // destination addressing mode
+ let Inst{6} = bw; // 1 in the 8-bit forms (IForm8), 0 in the 16-bit forms (IForm16)
+ let Inst{4-5} = as.Value; // source addressing mode
+}
+
+// 8-bit IForm instructions (bw = 1). Suffix letters: destination then source; r = register, m = memory, i = immediate.
+class IForm8<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm<opcode, dest, 1, src, sz, outs, ins, asmstr, pattern>;
+
+class I8rr<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm8<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I8ri<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm8<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8rm<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm8<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mr<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm8<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mi<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm8<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I8mm<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm8<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// 16-bit IForm instructions (bw = 0). Suffix letters: destination then source; r = register, m = memory, i = immediate.
+class IForm16<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm<opcode, dest, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class I16rr<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm16<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I16ri<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm16<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16rm<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm16<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mr<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm16<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mi<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm16<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I16mm<bits<4> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IForm16<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Single Operand (Format II) Instructions
+class IIForm<bits<9> opcode, bit bw, SourceMode src, SizeVal sz,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : MSP430Inst<outs, ins, sz, SingleOpFrm, asmstr> {
+ let Pattern = pattern;
+
+ SourceMode as = src;
+
+ let Inst{7-15} = opcode; // 9-bit opcode
+ let Inst{6} = bw; // 1 in the 8-bit forms (IIForm8), 0 in the 16-bit forms (IIForm16)
+ let Inst{4-5} = as.Value; // addressing mode of the single operand
+}
+
+// 8-bit IIForm instructions (bw = 1). Suffix letter: r = register, m = memory, i = immediate operand.
+class IIForm8<bits<9> opcode, SourceMode src, SizeVal sz,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IIForm<opcode, 1, src, sz, outs, ins, asmstr, pattern>;
+
+class II8r<bits<9> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IIForm8<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II8m<bits<9> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IIForm8<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II8i<bits<9> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IIForm8<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// 16-bit IIForm instructions (bw = 0). Suffix letter: r = register, m = memory, i = immediate operand.
+class IIForm16<bits<9> opcode, SourceMode src, SizeVal sz,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IIForm<opcode, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class II16r<bits<9> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IIForm16<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II16m<bits<9> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IIForm16<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II16i<bits<9> opcode,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : IIForm16<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Conditional Jump Instructions (always Size2Bytes)
+class CJForm<bits<3> opcode, bits<3> cond,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : MSP430Inst<outs, ins, Size2Bytes, CondJumpFrm, asmstr> {
+ let Pattern = pattern;
+
+ let Inst{13-15} = opcode; // 3-bit opcode
+ let Inst{10-12} = cond; // 3-bit condition code
+}
+
+// Pseudo instructions: SizeSpecial / PseudoFrm, with an all-zero encoding.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : MSP430Inst<outs, ins, SizeSpecial, PseudoFrm, asmstr> {
+ let Pattern = pattern;
+ let Inst{15-0} = 0;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
new file mode 100644
index 000000000000..6135ce080920
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -0,0 +1,335 @@
+//===-- MSP430InstrInfo.cpp - MSP430 Instruction Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430InstrInfo.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "MSP430GenInstrInfo.inc"
+
+// Pin the vtable to this file: out-of-line definition of the virtual anchor().
+void MSP430InstrInfo::anchor() {}
+
+MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI)
+ : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
+ RI() {} // STI is not referenced here; RI is default-constructed
+
+void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const { // emits a MOV of SrcReg into stack slot FrameIdx, inserted before MI
+ DebugLoc DL; // stays empty when inserting at the end of the block
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand( // store-typed memory operand built from the slot's size and alignment
+ MachinePointerInfo::getFixedStack(MF, FrameIdx),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
+
+ if (RC == &MSP430::GR16RegClass) // 16-bit register class
+ BuildMI(MBB, MI, DL, get(MSP430::MOV16mr))
+ .addFrameIndex(FrameIdx).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+ else if (RC == &MSP430::GR8RegClass) // 8-bit register class
+ BuildMI(MBB, MI, DL, get(MSP430::MOV8mr))
+ .addFrameIndex(FrameIdx).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+ else // only GR16 and GR8 are supported
+ llvm_unreachable("Cannot store this register to stack slot!");
+}
+
+void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const { // emits a MOV from stack slot FrameIdx into DestReg, inserted before MI
+ DebugLoc DL; // stays empty when inserting at the end of the block
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand( // load-typed memory operand built from the slot's size and alignment
+ MachinePointerInfo::getFixedStack(MF, FrameIdx),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
+
+ if (RC == &MSP430::GR16RegClass) // 16-bit register class
+ BuildMI(MBB, MI, DL, get(MSP430::MOV16rm))
+ .addReg(DestReg, getDefRegState(true)).addFrameIndex(FrameIdx)
+ .addImm(0).addMemOperand(MMO);
+ else if (RC == &MSP430::GR8RegClass) // 8-bit register class
+ BuildMI(MBB, MI, DL, get(MSP430::MOV8rm))
+ .addReg(DestReg, getDefRegState(true)).addFrameIndex(FrameIdx)
+ .addImm(0).addMemOperand(MMO);
+ else // only GR16 and GR8 are supported; the message names the load path
+ llvm_unreachable("Cannot load this register from stack slot!");
+}
+
+void MSP430InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const { // emits a register-to-register MOV before I
+ unsigned Opc;
+ if (MSP430::GR16RegClass.contains(DestReg, SrcReg)) // both registers are 16-bit
+ Opc = MSP430::MOV16rr;
+ else if (MSP430::GR8RegClass.contains(DestReg, SrcReg)) // both registers are 8-bit
+ Opc = MSP430::MOV8rr;
+ else // copies between the two classes are not supported
+ llvm_unreachable("Impossible reg-to-reg copy");
+
+ BuildMI(MBB, I, DL, get(Opc), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+unsigned MSP430InstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const { // erases trailing JMP/JCC/Br/Bm from MBB and returns how many were erased
+ assert(!BytesRemoved && "code size not handled");
+
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) { // scan backwards from the end of the block
+ --I;
+ if (I->isDebugValue()) // debug values are skipped, not removed
+ continue;
+ if (I->getOpcode() != MSP430::JMP &&
+ I->getOpcode() != MSP430::JCC &&
+ I->getOpcode() != MSP430::Br &&
+ I->getOpcode() != MSP430::Bm)
+ break; // first non-branch instruction ends the scan
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end(); // I was just erased; restart the scan from the end
+ ++Count;
+ }
+
+ return Count;
+}
+
+bool MSP430InstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { // replaces Cond[0] in place with the opposite condition code
+ assert(Cond.size() == 1 && "Invalid Xbranch condition!");
+
+ MSP430CC::CondCodes CC = static_cast<MSP430CC::CondCodes>(Cond[0].getImm());
+
+ switch (CC) { // pairs: E <-> NE, L <-> GE, HS <-> LO
+ default: llvm_unreachable("Invalid branch condition!");
+ case MSP430CC::COND_E:
+ CC = MSP430CC::COND_NE;
+ break;
+ case MSP430CC::COND_NE:
+ CC = MSP430CC::COND_E;
+ break;
+ case MSP430CC::COND_L:
+ CC = MSP430CC::COND_GE;
+ break;
+ case MSP430CC::COND_GE:
+ CC = MSP430CC::COND_L;
+ break;
+ case MSP430CC::COND_HS:
+ CC = MSP430CC::COND_LO;
+ break;
+ case MSP430CC::COND_LO:
+ CC = MSP430CC::COND_HS;
+ break;
+ }
+
+ Cond[0].setImm(CC);
+ return false; // reached only after CC was swapped for its opposite
+}
+
+bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+ if (!MI.isTerminator()) // non-terminators never qualify
+ return false;
+
+ // Conditional branch is a special case: a branch that is not a barrier.
+ if (MI.isBranch() && !MI.isBarrier())
+ return true;
+ if (!MI.isPredicable()) // cannot carry a predicate at all
+ return true;
+ return !isPredicated(MI);
+}
+
+bool MSP430InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const { // returns true when the terminators cannot be analyzed, false on success
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator
+ // instruction, we're done.
+ if (!isUnpredicatedTerminator(*I))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled
+ // by this analysis.
+ if (!I->isBranch())
+ return true;
+
+ // Cannot handle indirect branches.
+ if (I->getOpcode() == MSP430::Br ||
+ I->getOpcode() == MSP430::Bm)
+ return true;
+
+ // Handle unconditional branches.
+ if (I->getOpcode() == MSP430::JMP) {
+ if (!AllowModify) { // read-only mode: just record the target
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (std::next(I) != MBB.end())
+ std::next(I)->eraseFromParent();
+ Cond.clear();
+ FBB = nullptr;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ TBB = nullptr;
+ I->eraseFromParent();
+ I = MBB.end(); // I was just erased; restart the scan from the end
+ continue;
+ }
+
+ // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches.
+ assert(I->getOpcode() == MSP430::JCC && "Invalid conditional branch");
+ MSP430CC::CondCodes BranchCode =
+ static_cast<MSP430CC::CondCodes>(I->getOperand(1).getImm());
+ if (BranchCode == MSP430CC::COND_INVALID)
+ return true; // Can't handle weird stuff.
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ FBB = TBB; // a JMP target recorded earlier (if any) becomes the false destination
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ continue;
+ }
+
+ // Handle subsequent conditional branches. A condition and a true
+ // destination have already been recorded at this point.
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ // Only handle the case where all conditional branches branch to
+ // the same destination.
+ if (TBB != I->getOperand(0).getMBB())
+ return true;
+
+ MSP430CC::CondCodes OldBranchCode = (MSP430CC::CondCodes)Cond[0].getImm();
+ // If the conditions are the same, we can leave them alone.
+ if (OldBranchCode == BranchCode)
+ continue;
+
+ return true; // same destination but a different condition: give up
+ }
+
+ return false;
+}
+
+unsigned MSP430InstrInfo::insertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded) const { // appends JMP/JCC to MBB and returns the number of branches added
+ // Shouldn't be a fall through.
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "MSP430 branch conditions have one component!");
+ assert(!BytesAdded && "code size not handled");
+
+ if (Cond.empty()) {
+ // Unconditional branch: a single JMP to TBB.
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, DL, get(MSP430::JMP)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch: JCC to TBB using the condition code in Cond[0].
+ unsigned Count = 0;
+ BuildMI(&MBB, DL, get(MSP430::JCC)).addMBB(TBB).addImm(Cond[0].getImm());
+ ++Count;
+
+ if (FBB) {
+ // Two-way conditional branch. Insert the second branch, a JMP to FBB.
+ BuildMI(&MBB, DL, get(MSP430::JMP)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
+/// getInstSizeInBytes - Return the number of bytes of code the specified
+/// instruction may be. This returns the maximum number of bytes.
+///
+unsigned MSP430InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ const MCInstrDesc &Desc = MI.getDesc();
+
+ switch (Desc.TSFlags & MSP430II::SizeMask) { // size class stored in TSFlags
+ default: // no size class matched below; only the opcodes listed here are accepted
+ switch (Desc.getOpcode()) {
+ default: llvm_unreachable("Unknown instruction size!");
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::DBG_VALUE:
+ return 0; // counted as zero bytes
+ case TargetOpcode::INLINEASM: { // size is derived from the asm string
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+ return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
+ *MF->getTarget().getMCAsmInfo());
+ }
+ }
+ case MSP430II::SizeSpecial: // size depends on the individual opcode
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unknown instruction size!");
+ case MSP430::SAR8r1c:
+ case MSP430::SAR16r1c:
+ return 4;
+ }
+ case MSP430II::Size2Bytes:
+ return 2;
+ case MSP430II::Size4Bytes:
+ return 4;
+ case MSP430II::Size6Bytes:
+ return 6;
+ }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
new file mode 100644
index 000000000000..e3259bd6a7bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
@@ -0,0 +1,92 @@
+//===-- MSP430InstrInfo.h - MSP430 Instruction Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430INSTRINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430INSTRINFO_H
+
+#include "MSP430RegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "MSP430GenInstrInfo.inc"
+
+namespace llvm {
+
+class MSP430Subtarget;
+
+/// MSP430II - This namespace holds all of the target-specific flags that
+/// instruction info tracks. The values mirror SizeVal in MSP430InstrFormats.td.
+///
+namespace MSP430II {
+ enum {
+ SizeShift = 2, // size class is stored at TSFlags{4-2}
+ SizeMask = 7 << SizeShift, // three bits wide
+
+ SizeUnknown = 0 << SizeShift,
+ SizeSpecial = 1 << SizeShift,
+ Size2Bytes = 2 << SizeShift,
+ Size4Bytes = 3 << SizeShift,
+ Size6Bytes = 4 << SizeShift
+ };
+} // namespace MSP430II
+
+class MSP430InstrInfo : public MSP430GenInstrInfo {
+ const MSP430RegisterInfo RI; // returned by getRegisterInfo()
+ virtual void anchor(); // defined out of line in MSP430InstrInfo.cpp
+public:
+ explicit MSP430InstrInfo(MSP430Subtarget &STI);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of the register info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ // Branch analysis and manipulation hooks
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
new file mode 100644
index 000000000000..c0c29b992238
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
@@ -0,0 +1,1211 @@
+//===-- MSP430InstrInfo.td - MSP430 Instruction defs -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the MSP430 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Type Constraints.
+//===----------------------------------------------------------------------===//
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>; // shorthand: operand OpNum has type i8
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>; // shorthand: operand OpNum has type i16
+
+//===----------------------------------------------------------------------===//
+// Type Profiles.
+//===----------------------------------------------------------------------===//
+def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; // 0 results, variadic operands (-1); operand 0 is iPTR
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>; // operand 0 is i16
+def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; // operands 0 and 1 are i16
+def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<0>]>; // 1 result, 1 operand; same type, pointer-typed
+def SDT_MSP430Cmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; // 0 results; both operands share a type
+def SDT_MSP430BrCC : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i8>]>; // 0 results; operand 0 is OtherVT, operand 1 is i8
+def SDT_MSP430SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>; // result and first two operands share a type; last operand is i8
+def SDT_MSP430Shift : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+ SDTCisI8<2>]>; // result has the type of the shifted value; the count operand is i8
+
+//===----------------------------------------------------------------------===//
+// MSP430 Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+def MSP430retflag : SDNode<"MSP430ISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def MSP430retiflag : SDNode<"MSP430ISD::RETI_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+// Unary integer nodes (SDTIntUnaryOp) declared with no extra node properties.
+def MSP430rra : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>;
+def MSP430rla : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
+def MSP430rrc : SDNode<"MSP430ISD::RRC", SDTIntUnaryOp, []>;
+// Target call node plus the generic ISD call-sequence start/end nodes; all three are chained.
+def MSP430call : SDNode<"MSP430ISD::CALL", SDT_MSP430Call,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+def MSP430callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_MSP430CallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def MSP430callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_MSP430CallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def MSP430Wrapper : SDNode<"MSP430ISD::Wrapper", SDT_MSP430Wrapper>;
+def MSP430cmp : SDNode<"MSP430ISD::CMP", SDT_MSP430Cmp, [SDNPOutGlue]>; // glue producer
+def MSP430brcc : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC,
+ [SDNPHasChain, SDNPInGlue]>; // chained glue consumer
+def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC,
+ [SDNPInGlue]>; // glue consumer
+def MSP430shl : SDNode<"MSP430ISD::SHL", SDT_MSP430Shift, []>; // the three shift nodes share SDT_MSP430Shift
+def MSP430sra : SDNode<"MSP430ISD::SRA", SDT_MSP430Shift, []>;
+def MSP430srl : SDNode<"MSP430ISD::SRL", SDT_MSP430Shift, []>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+// Address operands: each expands to two machine operands, a GR16 register and an i16 immediate.
+def memsrc : Operand<i16> {
+ let PrintMethod = "printSrcMemOperand";
+ let MIOperandInfo = (ops GR16, i16imm);
+}
+
+def memdst : Operand<i16> {
+ let PrintMethod = "printSrcMemOperand"; // NOTE(review): same print method as memsrc; confirm this is intentional
+ let MIOperandInfo = (ops GR16, i16imm);
+}
+
+// Short jump targets have OtherVT type and are printed through printPCRelImmOperand.
+def jmptarget : Operand<OtherVT> {
+ let PrintMethod = "printPCRelImmOperand";
+}
+
+// Operand for printing out a condition code: an i8 value rendered by printCCOperand.
+def cc : Operand<i8> {
+ let PrintMethod = "printCCOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// MSP430 Complex Pattern Definitions.
+//===----------------------------------------------------------------------===//
+
+def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], []>; // iPTR address matched by the C++ routine named SelectAddr; 2 result operands
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>; // zextloadi8 producing an i16 result
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 ( extloadi8 node:$ptr))>; // extloadi8 producing an i16 result
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>; // an 'and' that matches only when the node has exactly one use
+//===----------------------------------------------------------------------===//
+// Instruction list..
+
+// ADJCALLSTACKDOWN/UP implicitly use and define SP because they may be expanded
+// into a stack adjustment, and the codegen must know that they may modify the
+// stack pointer before prolog-epilog rewriting occurs.
+// SR is listed as a def as well: pessimistically assume ADJCALLSTACKDOWN /
+// ADJCALLSTACKUP will become sub / add, which can clobber SR.
+let Defs = [SP, SR], Uses = [SP] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(MSP430callseq_start timm:$amt)]>; // selected from the call-sequence start node
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(MSP430callseq_end timm:$amt1, timm:$amt2)]>; // selected from the call-sequence end node
+}
+
+let usesCustomInserter = 1 in { // every pseudo in this group is flagged usesCustomInserter
+ def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc),
+ "# Select8 PSEUDO",
+ [(set GR8:$dst,
+ (MSP430selectcc GR8:$src, GR8:$src2, imm:$cc))]>;
+ def Select16 : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR16:$src2, i8imm:$cc),
+ "# Select16 PSEUDO",
+ [(set GR16:$dst,
+ (MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>;
+ let Defs = [SR] in { // the variable-count shift pseudos additionally define SR
+ def Shl8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+ "# Shl8 PSEUDO",
+ [(set GR8:$dst, (MSP430shl GR8:$src, GR8:$cnt))]>;
+ def Shl16 : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+ "# Shl16 PSEUDO",
+ [(set GR16:$dst, (MSP430shl GR16:$src, GR8:$cnt))]>;
+ def Sra8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+ "# Sra8 PSEUDO",
+ [(set GR8:$dst, (MSP430sra GR8:$src, GR8:$cnt))]>;
+ def Sra16 : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+ "# Sra16 PSEUDO",
+ [(set GR16:$dst, (MSP430sra GR16:$src, GR8:$cnt))]>;
+ def Srl8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+ "# Srl8 PSEUDO",
+ [(set GR8:$dst, (MSP430srl GR8:$src, GR8:$cnt))]>;
+ def Srl16 : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+ "# Srl16 PSEUDO",
+ [(set GR16:$dst, (MSP430srl GR16:$src, GR8:$cnt))]>;
+// In all six shift pseudos the count operand $cnt is a GR8 register, for both the 8- and 16-bit forms.
+ }
+}
+
+let hasSideEffects = 0 in
+def NOP : Pseudo<(outs), (ins), "nop", []>; // no operands, no selection pattern; prints as "nop"
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions...
+//
+
+// Return instructions: both are terminators and barriers. FIXME: Provide proper encoding!
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+ def RET : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs), (ins), "ret", [(MSP430retflag)]>; // selected from MSP430retflag
+ def RETI : II16r<0x0, (outs), (ins), "reti", [(MSP430retiflag)]>; // selected from MSP430retiflag
+}
+
+let isBranch = 1, isTerminator = 1 in {
+
+// FIXME: expand opcode & cond field for branches!
+
+// Unconditional branches: all are marked isBarrier = 1.
+let isBarrier = 1 in {
+ // Short branch: takes a jmptarget operand and matches (br bb)
+ def JMP : CJForm<0, 0, (outs), (ins jmptarget:$dst),
+ "jmp\t$dst",
+ [(br bb:$dst)]>;
+ let isIndirectBranch = 1 in {
+ // Long branches: match brind on an immediate block address, a register, or a loaded value
+ def Bi : I16ri<0, (outs), (ins i16imm:$brdst),
+ "br\t$brdst",
+ [(brind tblockaddress:$brdst)]>;
+ def Br : I16rr<0, (outs), (ins GR16:$brdst),
+ "br\t$brdst",
+ [(brind GR16:$brdst)]>;
+ def Bm : I16rm<0, (outs), (ins memsrc:$brdst),
+ "br\t$brdst",
+ [(brind (load addr:$brdst))]>;
+ }
+}
+
+// Conditional branch: reads SR; the cc operand is printed as part of the mnemonic ("j$cc")
+let Uses = [SR] in
+ def JCC : CJForm<0, 0,
+ (outs), (ins jmptarget:$dst, cc:$cc),
+ "j$cc\t$dst",
+ [(MSP430brcc bb:$dst, imm:$cc)]>;
+} // isBranch, isTerminator
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers (modeled here as defs of
+ // R12-R15 and SR). SP is marked as a use to prevent stack-pointer assignments
+ // that appear immediately before calls from potentially appearing dead.
+ // Uses for argument registers are added manually.
+ let Defs = [R12, R13, R14, R15, SR],
+ Uses = [SP] in {
+ def CALLi : II16i<0x0,
+ (outs), (ins i16imm:$dst),
+ "call\t$dst", [(MSP430call imm:$dst)]>; // immediate target
+ def CALLr : II16r<0x0,
+ (outs), (ins GR16:$dst),
+ "call\t$dst", [(MSP430call GR16:$dst)]>; // target held in a register
+ def CALLm : II16m<0x0,
+ (outs), (ins memsrc:$dst),
+ "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>; // target loaded from memory
+ }
+
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+let Defs = [SP], Uses = [SP], hasSideEffects=0 in { // both instructions read and write SP
+let mayLoad = 1 in
+def POP16r : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR16:$reg), (ins), "pop.w\t$reg", []>; // no selection pattern
+// Push is the store-side counterpart: $reg is an input and mayStore is set.
+let mayStore = 1 in
+def PUSH16r : II16r<0x0,
+ (outs), (ins GR16:$reg), "push.w\t$reg",[]>; // no selection pattern
+}
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+// Register-to-register moves; they carry no selection pattern. FIXME: Provide proper encoding!
+let hasSideEffects = 0 in {
+def MOV8rr : I8rr<0x0,
+ (outs GR8:$dst), (ins GR8:$src),
+ "mov.b\t{$src, $dst}",
+ []>;
+def MOV16rr : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src),
+ "mov.w\t{$src, $dst}",
+ []>;
+}
+
+// Immediate-to-register moves: rematerializable and as cheap as a move. FIXME: Provide proper encoding!
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri : I8ri<0x0,
+ (outs GR8:$dst), (ins i8imm:$src),
+ "mov.b\t{$src, $dst}",
+ [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : I16ri<0x0,
+ (outs GR16:$dst), (ins i16imm:$src),
+ "mov.w\t{$src, $dst}",
+ [(set GR16:$dst, imm:$src)]>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in { // plain loads from a memsrc operand into a register
+def MOV8rm : I8rm<0x0,
+ (outs GR8:$dst), (ins memsrc:$src),
+ "mov.b\t{$src, $dst}",
+ [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : I16rm<0x0,
+ (outs GR16:$dst), (ins memsrc:$src),
+ "mov.w\t{$src, $dst}",
+ [(set GR16:$dst, (load addr:$src))]>;
+}
+
+def MOVZX16rr8 : I8rr<0x0, // zext of a GR8 register into GR16, printed as mov.b
+ (outs GR16:$dst), (ins GR8:$src),
+ "mov.b\t{$src, $dst}",
+ [(set GR16:$dst, (zext GR8:$src))]>;
+def MOVZX16rm8 : I8rm<0x0, // zero-extending i8 load into GR16, printed as mov.b
+ (outs GR16:$dst), (ins memsrc:$src),
+ "mov.b\t{$src, $dst}",
+ [(set GR16:$dst, (zextloadi16i8 addr:$src))]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, Constraints = "$base = $base_wb" in { // post-increment loads: $base is tied to the extra output $base_wb
+def MOV8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR8:$dst, GR16:$base_wb), (ins GR16:$base),
+ "mov.b\t{@$base+, $dst}", []>; // no selection pattern
+def MOV16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR16:$dst, GR16:$base_wb), (ins GR16:$base),
+ "mov.w\t{@$base+, $dst}", []>; // no selection pattern
+}
+
+// def8 matches an 8-bit value whose defining node is assumed to have zeroed
+// the high half of the 16-bit register. Three node kinds are excluded below:
+// TRUNCATE and EXTRACT_SUBREG (truncate can be lowered to EXTRACT_SUBREG) and
+// CopyFromReg (which may be copying from a truncate).
+def def8 : PatLeaf<(i8 GR8:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 8-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG (on subreg_8bit) instead of an explicit extend.
+def : Pat<(i16 (zext def8:$src)),
+ (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+def MOV8mi : I8mi<0x0, // stores: immediate -> memory
+ (outs), (ins memdst:$dst, i8imm:$src),
+ "mov.b\t{$src, $dst}",
+ [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : I16mi<0x0,
+ (outs), (ins memdst:$dst, i16imm:$src),
+ "mov.w\t{$src, $dst}",
+ [(store (i16 imm:$src), addr:$dst)]>;
+// Stores: register -> memory.
+def MOV8mr : I8mr<0x0,
+ (outs), (ins memdst:$dst, GR8:$src),
+ "mov.b\t{$src, $dst}",
+ [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I16mr<0x0,
+ (outs), (ins memdst:$dst, GR16:$src),
+ "mov.w\t{$src, $dst}",
+ [(store GR16:$src, addr:$dst)]>;
+// Memory-to-memory moves: the pattern is a store of a loaded value.
+def MOV8mm : I8mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "mov.b\t{$src, $dst}",
+ [(store (i8 (load addr:$src)), addr:$dst)]>;
+def MOV16mm : I16mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "mov.w\t{$src, $dst}",
+ [(store (i16 (load addr:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+let Constraints = "$src = $dst" in {
+
+let Defs = [SR] in {
+
+let isCommutable = 1 in { // X = ADD Y, Z == X = ADD Z, Y
+// Register-register add; the pattern also lists SR as an implicit result.
+def ADD8rr : I8rr<0x0,
+ (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+ "add.b\t{$src2, $dst}",
+ [(set GR8:$dst, (add GR8:$src, GR8:$src2)),
+ (implicit SR)]>;
+def ADD16rr : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+ "add.w\t{$src2, $dst}",
+ [(set GR16:$dst, (add GR16:$src, GR16:$src2)),
+ (implicit SR)]>;
+}
+// Register += value loaded from a memsrc operand.
+def ADD8rm : I8rm<0x0,
+ (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+ "add.b\t{$src2, $dst}",
+ [(set GR8:$dst, (add GR8:$src, (load addr:$src2))),
+ (implicit SR)]>;
+def ADD16rm : I16rm<0x0,
+ (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+ "add.w\t{$src2, $dst}",
+ [(set GR16:$dst, (add GR16:$src, (load addr:$src2))),
+ (implicit SR)]>;
+// Post-increment source forms: $base is tied to $base_wb and $src to $dst; no selection pattern.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1,
+Constraints = "$base = $base_wb, $src = $dst" in {
+def ADD8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR8:$dst, GR16:$base_wb),
+ (ins GR8:$src, GR16:$base),
+ "add.b\t{@$base+, $dst}", []>;
+def ADD16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR16:$dst, GR16:$base_wb),
+ (ins GR16:$src, GR16:$base),
+ "add.w\t{@$base+, $dst}", []>;
+}
+
+// Register += immediate.
+def ADD8ri : I8ri<0x0,
+ (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+ "add.b\t{$src2, $dst}",
+ [(set GR8:$dst, (add GR8:$src, imm:$src2)),
+ (implicit SR)]>;
+def ADD16ri : I16ri<0x0,
+ (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+ "add.w\t{$src2, $dst}",
+ [(set GR16:$dst, (add GR16:$src, imm:$src2)),
+ (implicit SR)]>;
+
+let Constraints = "" in { // memory-destination forms: clears the enclosing "$src = $dst" tie
+def ADD8mr : I8mr<0x0,
+ (outs), (ins memdst:$dst, GR8:$src),
+ "add.b\t{$src, $dst}",
+ [(store (add (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SR)]>;
+def ADD16mr : I16mr<0x0,
+ (outs), (ins memdst:$dst, GR16:$src),
+ "add.w\t{$src, $dst}",
+ [(store (add (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SR)]>;
+// Memory += immediate (load, add, store back to the same address).
+def ADD8mi : I8mi<0x0,
+ (outs), (ins memdst:$dst, i8imm:$src),
+ "add.b\t{$src, $dst}",
+ [(store (add (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+def ADD16mi : I16mi<0x0,
+ (outs), (ins memdst:$dst, i16imm:$src),
+ "add.w\t{$src, $dst}",
+ [(store (add (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+// Memory += memory.
+def ADD8mm : I8mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "add.b\t{$src, $dst}",
+ [(store (add (load addr:$dst),
+ (i8 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+def ADD16mm : I16mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "add.w\t{$src, $dst}",
+ [(store (add (load addr:$dst),
+ (i16 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+}
+
+let Uses = [SR] in {
+
+let isCommutable = 1 in { // X = ADDC Y, Z == X = ADDC Z, Y; selected from adde
+def ADC8rr : I8rr<0x0,
+ (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+ "addc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (adde GR8:$src, GR8:$src2)),
+ (implicit SR)]>;
+def ADC16rr : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+ "addc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (adde GR16:$src, GR16:$src2)),
+ (implicit SR)]>;
+} // isCommutable
+// adde of a register and an immediate.
+def ADC8ri : I8ri<0x0,
+ (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+ "addc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (adde GR8:$src, imm:$src2)),
+ (implicit SR)]>;
+def ADC16ri : I16ri<0x0,
+ (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+ "addc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (adde GR16:$src, imm:$src2)),
+ (implicit SR)]>;
+// adde of a register and a value loaded from memory.
+def ADC8rm : I8rm<0x0,
+ (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+ "addc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (adde GR8:$src, (load addr:$src2))),
+ (implicit SR)]>;
+def ADC16rm : I16rm<0x0,
+ (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+ "addc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (adde GR16:$src, (load addr:$src2))),
+ (implicit SR)]>;
+
+let Constraints = "" in { // memory-destination addc forms: clears the enclosing "$src = $dst" tie
+def ADC8mr : I8mr<0x0,
+ (outs), (ins memdst:$dst, GR8:$src),
+ "addc.b\t{$src, $dst}",
+ [(store (adde (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SR)]>;
+def ADC16mr : I16mr<0x0,
+ (outs), (ins memdst:$dst, GR16:$src),
+ "addc.w\t{$src, $dst}",
+ [(store (adde (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SR)]>;
+// Memory = adde(memory, immediate).
+def ADC8mi : I8mi<0x0,
+ (outs), (ins memdst:$dst, i8imm:$src),
+ "addc.b\t{$src, $dst}",
+ [(store (adde (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+def ADC16mi : I16mi<0x0,
+ (outs), (ins memdst:$dst, i16imm:$src),
+ "addc.w\t{$src, $dst}",
+ [(store (adde (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+// Memory = adde(memory, memory). The word form uses the 16-bit format class like every other *16mm record.
+def ADC8mm : I8mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "addc.b\t{$src, $dst}",
+ [(store (adde (load addr:$dst),
+ (i8 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+def ADC16mm : I16mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "addc.w\t{$src, $dst}",
+ [(store (adde (load addr:$dst),
+ (i16 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+}
+
+} // Uses = [SR]
+
+let isCommutable = 1 in { // X = AND Y, Z == X = AND Z, Y
+def AND8rr : I8rr<0x0,
+ (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+ "and.b\t{$src2, $dst}",
+ [(set GR8:$dst, (and GR8:$src, GR8:$src2)),
+ (implicit SR)]>;
+def AND16rr : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+ "and.w\t{$src2, $dst}",
+ [(set GR16:$dst, (and GR16:$src, GR16:$src2)),
+ (implicit SR)]>;
+}
+// Register &= immediate.
+def AND8ri : I8ri<0x0,
+ (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+ "and.b\t{$src2, $dst}",
+ [(set GR8:$dst, (and GR8:$src, imm:$src2)),
+ (implicit SR)]>;
+def AND16ri : I16ri<0x0,
+ (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+ "and.w\t{$src2, $dst}",
+ [(set GR16:$dst, (and GR16:$src, imm:$src2)),
+ (implicit SR)]>;
+// Register &= value loaded from memory.
+def AND8rm : I8rm<0x0,
+ (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+ "and.b\t{$src2, $dst}",
+ [(set GR8:$dst, (and GR8:$src, (load addr:$src2))),
+ (implicit SR)]>;
+def AND16rm : I16rm<0x0,
+ (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+ "and.w\t{$src2, $dst}",
+ [(set GR16:$dst, (and GR16:$src, (load addr:$src2))),
+ (implicit SR)]>;
+// Post-increment source forms: $base is tied to $base_wb and $src to $dst; no selection pattern.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1,
+Constraints = "$base = $base_wb, $src = $dst" in {
+def AND8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR8:$dst, GR16:$base_wb),
+ (ins GR8:$src, GR16:$base),
+ "and.b\t{@$base+, $dst}", []>;
+def AND16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR16:$dst, GR16:$base_wb),
+ (ins GR16:$src, GR16:$base),
+ "and.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in { // memory-destination forms: clears the enclosing "$src = $dst" tie
+def AND8mr : I8mr<0x0,
+ (outs), (ins memdst:$dst, GR8:$src),
+ "and.b\t{$src, $dst}",
+ [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SR)]>;
+def AND16mr : I16mr<0x0,
+ (outs), (ins memdst:$dst, GR16:$src),
+ "and.w\t{$src, $dst}",
+ [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SR)]>;
+// Memory &= immediate (load, and, store back to the same address).
+def AND8mi : I8mi<0x0,
+ (outs), (ins memdst:$dst, i8imm:$src),
+ "and.b\t{$src, $dst}",
+ [(store (and (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+def AND16mi : I16mi<0x0,
+ (outs), (ins memdst:$dst, i16imm:$src),
+ "and.w\t{$src, $dst}",
+ [(store (and (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+// Memory &= memory.
+def AND8mm : I8mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "and.b\t{$src, $dst}",
+ [(store (and (load addr:$dst),
+ (i8 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+def AND16mm : I16mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "and.w\t{$src, $dst}",
+ [(store (and (load addr:$dst),
+ (i16 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+}
+
+let isCommutable = 1 in { // X = OR Y, Z == X = OR Z, Y; printed as bis, pattern has no (implicit SR)
+def OR8rr : I8rr<0x0,
+ (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+ "bis.b\t{$src2, $dst}",
+ [(set GR8:$dst, (or GR8:$src, GR8:$src2))]>;
+def OR16rr : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+ "bis.w\t{$src2, $dst}",
+ [(set GR16:$dst, (or GR16:$src, GR16:$src2))]>;
+}
+// Register |= immediate.
+def OR8ri : I8ri<0x0,
+ (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+ "bis.b\t{$src2, $dst}",
+ [(set GR8:$dst, (or GR8:$src, imm:$src2))]>;
+def OR16ri : I16ri<0x0,
+ (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+ "bis.w\t{$src2, $dst}",
+ [(set GR16:$dst, (or GR16:$src, imm:$src2))]>;
+// Register |= value loaded from memory.
+def OR8rm : I8rm<0x0,
+ (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+ "bis.b\t{$src2, $dst}",
+ [(set GR8:$dst, (or GR8:$src, (load addr:$src2)))]>;
+def OR16rm : I16rm<0x0,
+ (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+ "bis.w\t{$src2, $dst}",
+ [(set GR16:$dst, (or GR16:$src, (load addr:$src2)))]>;
+// Post-increment source forms: $base is tied to $base_wb and $src to $dst; no selection pattern.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1,
+Constraints = "$base = $base_wb, $src = $dst" in {
+def OR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR8:$dst, GR16:$base_wb),
+ (ins GR8:$src, GR16:$base),
+ "bis.b\t{@$base+, $dst}", []>;
+def OR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR16:$dst, GR16:$base_wb),
+ (ins GR16:$src, GR16:$base),
+ "bis.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in { // memory-destination forms: clears the enclosing "$src = $dst" tie
+def OR8mr : I8mr<0x0,
+ (outs), (ins memdst:$dst, GR8:$src),
+ "bis.b\t{$src, $dst}",
+ [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
+def OR16mr : I16mr<0x0,
+ (outs), (ins memdst:$dst, GR16:$src),
+ "bis.w\t{$src, $dst}",
+ [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>;
+// Memory |= immediate (load, or, store back to the same address).
+def OR8mi : I8mi<0x0,
+ (outs), (ins memdst:$dst, i8imm:$src),
+ "bis.b\t{$src, $dst}",
+ [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def OR16mi : I16mi<0x0,
+ (outs), (ins memdst:$dst, i16imm:$src),
+ "bis.w\t{$src, $dst}",
+ [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst)]>;
+// Memory |= memory; both loads are given an explicit type in the pattern.
+def OR8mm : I8mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "bis.b\t{$src, $dst}",
+ [(store (or (i8 (load addr:$dst)),
+ (i8 (load addr:$src))), addr:$dst)]>;
+def OR16mm : I16mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "bis.w\t{$src, $dst}",
+ [(store (or (i16 (load addr:$dst)),
+ (i16 (load addr:$src))), addr:$dst)]>;
+}
+
+// bic does not modify condition codes, so the patterns carry no (implicit SR). NOTE(review): these defs still sit inside the enclosing `let Defs = [SR]`.
+def BIC8rr : I8rr<0x0,
+ (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+ "bic.b\t{$src2, $dst}",
+ [(set GR8:$dst, (and GR8:$src, (not GR8:$src2)))]>;
+def BIC16rr : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+ "bic.w\t{$src2, $dst}",
+ [(set GR16:$dst, (and GR16:$src, (not GR16:$src2)))]>;
+// Register &= ~(value loaded from memory).
+def BIC8rm : I8rm<0x0,
+ (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+ "bic.b\t{$src2, $dst}",
+ [(set GR8:$dst, (and GR8:$src, (not (i8 (load addr:$src2)))))]>;
+def BIC16rm : I16rm<0x0,
+ (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+ "bic.w\t{$src2, $dst}",
+ [(set GR16:$dst, (and GR16:$src, (not (i16 (load addr:$src2)))))]>;
+
+let Constraints = "" in { // memory-destination bic forms: clears the enclosing "$src = $dst" tie
+def BIC8mr : I8mr<0x0,
+ (outs), (ins memdst:$dst, GR8:$src),
+ "bic.b\t{$src, $dst}",
+ [(store (and (load addr:$dst), (not GR8:$src)), addr:$dst)]>;
+def BIC16mr : I16mr<0x0,
+ (outs), (ins memdst:$dst, GR16:$src),
+ "bic.w\t{$src, $dst}",
+ [(store (and (load addr:$dst), (not GR16:$src)), addr:$dst)]>;
+// Memory &= ~memory.
+def BIC8mm : I8mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "bic.b\t{$src, $dst}",
+ [(store (and (load addr:$dst),
+ (not (i8 (load addr:$src)))), addr:$dst)]>;
+def BIC16mm : I16mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "bic.w\t{$src, $dst}",
+ [(store (and (load addr:$dst),
+ (not (i16 (load addr:$src)))), addr:$dst)]>;
+}
+
+let isCommutable = 1 in { // X = XOR Y, Z == X = XOR Z, Y
+def XOR8rr : I8rr<0x0,
+ (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+ "xor.b\t{$src2, $dst}",
+ [(set GR8:$dst, (xor GR8:$src, GR8:$src2)),
+ (implicit SR)]>;
+def XOR16rr : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+ "xor.w\t{$src2, $dst}",
+ [(set GR16:$dst, (xor GR16:$src, GR16:$src2)),
+ (implicit SR)]>;
+}
+// Register ^= immediate.
+def XOR8ri : I8ri<0x0,
+ (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+ "xor.b\t{$src2, $dst}",
+ [(set GR8:$dst, (xor GR8:$src, imm:$src2)),
+ (implicit SR)]>;
+def XOR16ri : I16ri<0x0,
+ (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+ "xor.w\t{$src2, $dst}",
+ [(set GR16:$dst, (xor GR16:$src, imm:$src2)),
+ (implicit SR)]>;
+// Register ^= value loaded from memory.
+def XOR8rm : I8rm<0x0,
+ (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+ "xor.b\t{$src2, $dst}",
+ [(set GR8:$dst, (xor GR8:$src, (load addr:$src2))),
+ (implicit SR)]>;
+def XOR16rm : I16rm<0x0,
+ (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+ "xor.w\t{$src2, $dst}",
+ [(set GR16:$dst, (xor GR16:$src, (load addr:$src2))),
+ (implicit SR)]>;
+// Post-increment source forms: $base is tied to $base_wb and $src to $dst; no selection pattern.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1,
+Constraints = "$base = $base_wb, $src = $dst" in {
+def XOR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR8:$dst, GR16:$base_wb),
+ (ins GR8:$src, GR16:$base),
+ "xor.b\t{@$base+, $dst}", []>;
+def XOR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR16:$dst, GR16:$base_wb),
+ (ins GR16:$src, GR16:$base),
+ "xor.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in { // memory-destination forms: clears the enclosing "$src = $dst" tie
+def XOR8mr : I8mr<0x0,
+ (outs), (ins memdst:$dst, GR8:$src),
+ "xor.b\t{$src, $dst}",
+ [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SR)]>;
+def XOR16mr : I16mr<0x0,
+ (outs), (ins memdst:$dst, GR16:$src),
+ "xor.w\t{$src, $dst}",
+ [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SR)]>;
+// Memory ^= immediate (load, xor, store back to the same address).
+def XOR8mi : I8mi<0x0,
+ (outs), (ins memdst:$dst, i8imm:$src),
+ "xor.b\t{$src, $dst}",
+ [(store (xor (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+def XOR16mi : I16mi<0x0,
+ (outs), (ins memdst:$dst, i16imm:$src),
+ "xor.w\t{$src, $dst}",
+ [(store (xor (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+// Memory ^= memory.
+def XOR8mm : I8mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "xor.b\t{$src, $dst}",
+ [(store (xor (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+def XOR16mm : I16mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "xor.w\t{$src, $dst}",
+ [(store (xor (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+}
+
+
+def SUB8rr : I8rr<0x0, // register-register subtract; unlike ADD/AND/XOR, not wrapped in isCommutable
+ (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+ "sub.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sub GR8:$src, GR8:$src2)),
+ (implicit SR)]>;
+def SUB16rr : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+ "sub.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sub GR16:$src, GR16:$src2)),
+ (implicit SR)]>;
+// Register -= immediate.
+def SUB8ri : I8ri<0x0,
+ (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+ "sub.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sub GR8:$src, imm:$src2)),
+ (implicit SR)]>;
+def SUB16ri : I16ri<0x0,
+ (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+ "sub.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sub GR16:$src, imm:$src2)),
+ (implicit SR)]>;
+// Register -= value loaded from memory.
+def SUB8rm : I8rm<0x0,
+ (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+ "sub.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sub GR8:$src, (load addr:$src2))),
+ (implicit SR)]>;
+def SUB16rm : I16rm<0x0,
+ (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+ "sub.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sub GR16:$src, (load addr:$src2))),
+ (implicit SR)]>;
+// Post-increment source forms: $base is tied to $base_wb and $src to $dst; no selection pattern.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1,
+Constraints = "$base = $base_wb, $src = $dst" in {
+def SUB8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR8:$dst, GR16:$base_wb),
+ (ins GR8:$src, GR16:$base),
+ "sub.b\t{@$base+, $dst}", []>;
+def SUB16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+ (outs GR16:$dst, GR16:$base_wb),
+ (ins GR16:$src, GR16:$base),
+ "sub.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in { // memory-destination forms: clears the enclosing "$src = $dst" tie
+def SUB8mr : I8mr<0x0,
+ (outs), (ins memdst:$dst, GR8:$src),
+ "sub.b\t{$src, $dst}",
+ [(store (sub (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SR)]>;
+def SUB16mr : I16mr<0x0,
+ (outs), (ins memdst:$dst, GR16:$src),
+ "sub.w\t{$src, $dst}",
+ [(store (sub (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SR)]>;
+// Memory -= immediate (load, sub, store back to the same address).
+def SUB8mi : I8mi<0x0,
+ (outs), (ins memdst:$dst, i8imm:$src),
+ "sub.b\t{$src, $dst}",
+ [(store (sub (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+def SUB16mi : I16mi<0x0,
+ (outs), (ins memdst:$dst, i16imm:$src),
+ "sub.w\t{$src, $dst}",
+ [(store (sub (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+// Memory -= memory.
+def SUB8mm : I8mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "sub.b\t{$src, $dst}",
+ [(store (sub (load addr:$dst),
+ (i8 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+def SUB16mm : I16mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "sub.w\t{$src, $dst}",
+ [(store (sub (load addr:$dst),
+ (i16 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+}
+
+let Uses = [SR] in {
+def SBC8rr : I8rr<0x0, // subc forms are selected from sube; the enclosing let marks SR as a use
+ (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+ "subc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sube GR8:$src, GR8:$src2)),
+ (implicit SR)]>;
+def SBC16rr : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+ "subc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sube GR16:$src, GR16:$src2)),
+ (implicit SR)]>;
+// sube of a register and an immediate.
+def SBC8ri : I8ri<0x0,
+ (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+ "subc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sube GR8:$src, imm:$src2)),
+ (implicit SR)]>;
+def SBC16ri : I16ri<0x0,
+ (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+ "subc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sube GR16:$src, imm:$src2)),
+ (implicit SR)]>;
+// sube of a register and a value loaded from memory.
+def SBC8rm : I8rm<0x0,
+ (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+ "subc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sube GR8:$src, (load addr:$src2))),
+ (implicit SR)]>;
+def SBC16rm : I16rm<0x0,
+ (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+ "subc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sube GR16:$src, (load addr:$src2))),
+ (implicit SR)]>;
+
+let Constraints = "" in { // memory-destination subc forms: clears the enclosing "$src = $dst" tie
+def SBC8mr : I8mr<0x0,
+ (outs), (ins memdst:$dst, GR8:$src),
+ "subc.b\t{$src, $dst}",
+ [(store (sube (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SR)]>;
+def SBC16mr : I16mr<0x0,
+ (outs), (ins memdst:$dst, GR16:$src),
+ "subc.w\t{$src, $dst}",
+ [(store (sube (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SR)]>;
+// Memory = sube(memory, immediate).
+def SBC8mi : I8mi<0x0,
+ (outs), (ins memdst:$dst, i8imm:$src),
+ "subc.b\t{$src, $dst}",
+ [(store (sube (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+def SBC16mi : I16mi<0x0,
+ (outs), (ins memdst:$dst, i16imm:$src),
+ "subc.w\t{$src, $dst}",
+ [(store (sube (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SR)]>;
+// Memory = sube(memory, memory).
+def SBC8mm : I8mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "subc.b\t{$src, $dst}",
+ [(store (sube (load addr:$dst),
+ (i8 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+def SBC16mm : I16mm<0x0,
+ (outs), (ins memdst:$dst, memsrc:$src),
+ "subc.w\t{$src, $dst}",
+ [(store (sube (load addr:$dst),
+ (i16 (load addr:$src))), addr:$dst),
+ (implicit SR)]>;
+}
+
+} // Uses = [SR]
+
+// Single-operand rra forms, selected from MSP430rra. FIXME: memory variant!
+def SAR8r1 : II8r<0x0,
+ (outs GR8:$dst), (ins GR8:$src),
+ "rra.b\t$dst",
+ [(set GR8:$dst, (MSP430rra GR8:$src)),
+ (implicit SR)]>;
+def SAR16r1 : II16r<0x0,
+ (outs GR16:$dst), (ins GR16:$src),
+ "rra.w\t$dst",
+ [(set GR16:$dst, (MSP430rra GR16:$src)),
+ (implicit SR)]>;
+// rla forms, selected from MSP430rla; declared with the two-operand rr format classes.
+def SHL8r1 : I8rr<0x0,
+ (outs GR8:$dst), (ins GR8:$src),
+ "rla.b\t$dst",
+ [(set GR8:$dst, (MSP430rla GR8:$src)),
+ (implicit SR)]>;
+def SHL16r1 : I16rr<0x0,
+ (outs GR16:$dst), (ins GR16:$src),
+ "rla.w\t$dst",
+ [(set GR16:$dst, (MSP430rla GR16:$src)),
+ (implicit SR)]>;
+// Pseudos selected from MSP430rrc; the asm string prints two lines: "clrc" followed by "rrc".
+def SAR8r1c : Pseudo<(outs GR8:$dst), (ins GR8:$src),
+ "clrc\n\t"
+ "rrc.b\t$dst",
+ [(set GR8:$dst, (MSP430rrc GR8:$src)),
+ (implicit SR)]>;
+def SAR16r1c : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+ "clrc\n\t"
+ "rrc.w\t$dst",
+ [(set GR16:$dst, (MSP430rrc GR16:$src)),
+ (implicit SR)]>;
+
+// In-register sign extension from i8 within a GR16 register, printed as sxt. FIXME: Memory sext's ?
+def SEXT16r : II16r<0x0,
+ (outs GR16:$dst), (ins GR16:$src),
+ "sxt\t$dst",
+ [(set GR16:$dst, (sext_inreg GR16:$src, i8)),
+ (implicit SR)]>;
+
+} // Defs = [SR]
+
+def ZEXT16r : I8rr<0x0, // matches zext(trunc x) on a GR16 register; printed as mov.b; defined outside the Defs = [SR] scope
+ (outs GR16:$dst), (ins GR16:$src),
+ "mov.b\t{$src, $dst}",
+ [(set GR16:$dst, (zext (trunc GR16:$src)))]>;
+
+// Selected from bswap on a GR16 register, printed as swpb. FIXME: Memory bitswaps?
+def SWPB16r : II16r<0x0,
+ (outs GR16:$dst), (ins GR16:$src),
+ "swpb\t$dst",
+ [(set GR16:$dst, (bswap GR16:$src))]>;
+
+} // Constraints = "$src = $dst"
+
+// Integer comparisons
+let Defs = [SR] in {
+def CMP8rr : I8rr<0x0, // compares have no outputs; the only modeled result is SR
+ (outs), (ins GR8:$src, GR8:$src2),
+ "cmp.b\t{$src2, $src}",
+ [(MSP430cmp GR8:$src, GR8:$src2), (implicit SR)]>;
+def CMP16rr : I16rr<0x0,
+ (outs), (ins GR16:$src, GR16:$src2),
+ "cmp.w\t{$src2, $src}",
+ [(MSP430cmp GR16:$src, GR16:$src2), (implicit SR)]>;
+// Register compared against an immediate.
+def CMP8ri : I8ri<0x0,
+ (outs), (ins GR8:$src, i8imm:$src2),
+ "cmp.b\t{$src2, $src}",
+ [(MSP430cmp GR8:$src, imm:$src2), (implicit SR)]>;
+def CMP16ri : I16ri<0x0,
+ (outs), (ins GR16:$src, i16imm:$src2),
+ "cmp.w\t{$src2, $src}",
+ [(MSP430cmp GR16:$src, imm:$src2), (implicit SR)]>;
+
+def CMP8mi : I8mi<0x0,
+ (outs), (ins memsrc:$src, i8imm:$src2),
+ "cmp.b\t{$src2, $src}",
+ [(MSP430cmp (load addr:$src),
+ (i8 imm:$src2)), (implicit SR)]>;
+def CMP16mi : I16mi<0x0,
+ (outs), (ins memsrc:$src, i16imm:$src2),
+ "cmp.w\t{$src2, $src}",
+ [(MSP430cmp (load addr:$src),
+ (i16 imm:$src2)), (implicit SR)]>;
+
+def CMP8rm : I8rm<0x0,
+ (outs), (ins GR8:$src, memsrc:$src2),
+ "cmp.b\t{$src2, $src}",
+ [(MSP430cmp GR8:$src, (load addr:$src2)),
+ (implicit SR)]>;
+def CMP16rm : I16rm<0x0,
+ (outs), (ins GR16:$src, memsrc:$src2),
+ "cmp.w\t{$src2, $src}",
+ [(MSP430cmp GR16:$src, (load addr:$src2)),
+ (implicit SR)]>;
+
+def CMP8mr : I8mr<0x0,
+ (outs), (ins memsrc:$src, GR8:$src2),
+ "cmp.b\t{$src2, $src}",
+ [(MSP430cmp (load addr:$src), GR8:$src2),
+ (implicit SR)]>;
+def CMP16mr : I16mr<0x0,
+ (outs), (ins memsrc:$src, GR16:$src2),
+ "cmp.w\t{$src2, $src}",
+ [(MSP430cmp (load addr:$src), GR16:$src2),
+ (implicit SR)]>;
+
+
+// BIT TESTS, just sets condition codes
+// Note that the C condition is set differently than when using CMP.
+// Each BIT form is selected for "MSP430cmp (and_su a, b), 0": an AND whose
+// value is only compared against zero, so no register result is produced.
+// The register-register forms are the only ones marked commutable.
+let isCommutable = 1 in {
+def BIT8rr : I8rr<0x0,
+ (outs), (ins GR8:$src, GR8:$src2),
+ "bit.b\t{$src2, $src}",
+ [(MSP430cmp (and_su GR8:$src, GR8:$src2), 0),
+ (implicit SR)]>;
+def BIT16rr : I16rr<0x0,
+ (outs), (ins GR16:$src, GR16:$src2),
+ "bit.w\t{$src2, $src}",
+ [(MSP430cmp (and_su GR16:$src, GR16:$src2), 0),
+ (implicit SR)]>;
+}
+// Bit test of a register against an immediate mask; only SR is updated.
+def BIT8ri : I8ri<0x0,
+ (outs), (ins GR8:$src, i8imm:$src2),
+ "bit.b\t{$src2, $src}",
+ [(MSP430cmp (and_su GR8:$src, imm:$src2), 0),
+ (implicit SR)]>;
+def BIT16ri : I16ri<0x0,
+ (outs), (ins GR16:$src, i16imm:$src2),
+ "bit.w\t{$src2, $src}",
+ [(MSP430cmp (and_su GR16:$src, imm:$src2), 0),
+ (implicit SR)]>;
+
+// Bit test of a register against a value loaded from memory; only SR is
+// updated. $src2 is only ever loaded (never stored to), so it is declared as a
+// memsrc operand, consistent with CMP8rm / CMP16rm and with the other BIT
+// forms that read memory.
+def BIT8rm : I8rm<0x0,
+                  (outs), (ins GR8:$src, memsrc:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su GR8:$src, (load addr:$src2)), 0),
+                   (implicit SR)]>;
+def BIT16rm : I16rm<0x0,
+                  (outs), (ins GR16:$src, memsrc:$src2),
+                  "bit.w\t{$src2, $src}",
+                  [(MSP430cmp (and_su GR16:$src, (load addr:$src2)), 0),
+                   (implicit SR)]>;
+
+// Bit test of a value loaded from memory against a register.
+def BIT8mr : I8mr<0x0,
+ (outs), (ins memsrc:$src, GR8:$src2),
+ "bit.b\t{$src2, $src}",
+ [(MSP430cmp (and_su (load addr:$src), GR8:$src2), 0),
+ (implicit SR)]>;
+def BIT16mr : I16mr<0x0,
+ (outs), (ins memsrc:$src, GR16:$src2),
+ "bit.w\t{$src2, $src}",
+ [(MSP430cmp (and_su (load addr:$src), GR16:$src2), 0),
+ (implicit SR)]>;
+
+// Bit test of a value loaded from memory against an immediate mask.
+def BIT8mi : I8mi<0x0,
+ (outs), (ins memsrc:$src, i8imm:$src2),
+ "bit.b\t{$src2, $src}",
+ [(MSP430cmp (and_su (load addr:$src), (i8 imm:$src2)), 0),
+ (implicit SR)]>;
+def BIT16mi : I16mi<0x0,
+ (outs), (ins memsrc:$src, i16imm:$src2),
+ "bit.w\t{$src2, $src}",
+ [(MSP430cmp (and_su (load addr:$src), (i16 imm:$src2)), 0),
+ (implicit SR)]>;
+
+// Bit test of two values that are both loaded from memory. The explicit
+// i8 / i16 on the first load fixes the width of the comparison.
+def BIT8mm : I8mm<0x0,
+ (outs), (ins memsrc:$src, memsrc:$src2),
+ "bit.b\t{$src2, $src}",
+ [(MSP430cmp (and_su (i8 (load addr:$src)),
+ (load addr:$src2)),
+ 0),
+ (implicit SR)]>;
+def BIT16mm : I16mm<0x0,
+ (outs), (ins memsrc:$src, memsrc:$src2),
+ "bit.w\t{$src2, $src}",
+ [(MSP430cmp (and_su (i16 (load addr:$src)),
+ (load addr:$src2)),
+ 0),
+ (implicit SR)]>;
+} // Defs = [SR]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+
+// extload: an any-extending i8 -> i16 load is selected as MOVZX16rm8.
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+
+// anyext: an i8 -> i16 any-extend needs no instruction; the GR8 value is
+// wrapped with SUBREG_TO_REG as the subreg_8bit part of a 16-bit register.
+def : Pat<(i16 (anyext GR8:$src)),
+ (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+// truncs: an i16 -> i8 truncate reads the subreg_8bit sub-register.
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG GR16:$src, subreg_8bit)>;
+
+// GlobalAddress, ExternalSymbol
+// (and BlockAddress): a wrapped symbol used as a plain i16 value is
+// materialized with MOV16ri.
+def : Pat<(i16 (MSP430Wrapper tglobaladdr:$dst)), (MOV16ri tglobaladdr:$dst)>;
+def : Pat<(i16 (MSP430Wrapper texternalsym:$dst)), (MOV16ri texternalsym:$dst)>;
+def : Pat<(i16 (MSP430Wrapper tblockaddress:$dst)), (MOV16ri tblockaddress:$dst)>;
+
+// Adding a wrapped symbol to a register uses the symbol directly as the
+// immediate of ADD16ri.
+def : Pat<(add GR16:$src, (MSP430Wrapper tglobaladdr :$src2)),
+ (ADD16ri GR16:$src, tglobaladdr:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper texternalsym:$src2)),
+ (ADD16ri GR16:$src, texternalsym:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper tblockaddress:$src2)),
+ (ADD16ri GR16:$src, tblockaddress:$src2)>;
+
+// Storing a wrapped symbol to memory uses the symbol directly as the
+// immediate of MOV16mi.
+def : Pat<(store (i16 (MSP430Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV16mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper texternalsym:$src)), addr:$dst),
+ (MOV16mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper tblockaddress:$src)), addr:$dst),
+ (MOV16mi addr:$dst, tblockaddress:$src)>;
+
+// calls: a call whose target is a global address or an external symbol is
+// selected as CALLi with that symbol as the operand.
+def : Pat<(MSP430call (i16 tglobaladdr:$dst)),
+ (CALLi tglobaladdr:$dst)>;
+def : Pat<(MSP430call (i16 texternalsym:$dst)),
+ (CALLi texternalsym:$dst)>;
+
+// add and sub always produce carry
+// so the carry-producing addc / subc nodes are mapped onto the ordinary
+// ADD* / SUB* instructions, for every operand form (rr, rm, ri, mr, mm).
+
+// 16-bit addc.
+def : Pat<(addc GR16:$src, GR16:$src2),
+ (ADD16rr GR16:$src, GR16:$src2)>;
+def : Pat<(addc GR16:$src, (load addr:$src2)),
+ (ADD16rm GR16:$src, addr:$src2)>;
+def : Pat<(addc GR16:$src, imm:$src2),
+ (ADD16ri GR16:$src, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR16:$src), addr:$dst),
+ (ADD16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (ADD16mm addr:$dst, addr:$src)>;
+
+// 8-bit addc.
+def : Pat<(addc GR8:$src, GR8:$src2),
+ (ADD8rr GR8:$src, GR8:$src2)>;
+def : Pat<(addc GR8:$src, (load addr:$src2)),
+ (ADD8rm GR8:$src, addr:$src2)>;
+def : Pat<(addc GR8:$src, imm:$src2),
+ (ADD8ri GR8:$src, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR8:$src), addr:$dst),
+ (ADD8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (ADD8mm addr:$dst, addr:$src)>;
+
+// 16-bit subc.
+def : Pat<(subc GR16:$src, GR16:$src2),
+ (SUB16rr GR16:$src, GR16:$src2)>;
+def : Pat<(subc GR16:$src, (load addr:$src2)),
+ (SUB16rm GR16:$src, addr:$src2)>;
+def : Pat<(subc GR16:$src, imm:$src2),
+ (SUB16ri GR16:$src, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR16:$src), addr:$dst),
+ (SUB16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (SUB16mm addr:$dst, addr:$src)>;
+
+// 8-bit subc.
+def : Pat<(subc GR8:$src, GR8:$src2),
+ (SUB8rr GR8:$src, GR8:$src2)>;
+def : Pat<(subc GR8:$src, (load addr:$src2)),
+ (SUB8rm GR8:$src, addr:$src2)>;
+def : Pat<(subc GR8:$src, imm:$src2),
+ (SUB8ri GR8:$src, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR8:$src), addr:$dst),
+ (SUB8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (SUB8mm addr:$dst, addr:$src)>;
+
+// peephole patterns
+// Masking a 16-bit register with 255 is the same as zero-extending its low
+// byte, so it is selected as ZEXT16r.
+def : Pat<(and GR16:$src, 255), (ZEXT16r GR16:$src)>;
+// A zero-compare of the truncated AND of two 16-bit registers is selected as
+// the 8-bit BIT8rr applied to the subreg_8bit halves of both registers.
+def : Pat<(MSP430cmp (trunc (and_su GR16:$src, GR16:$src2)), 0),
+ (BIT8rr (EXTRACT_SUBREG GR16:$src, subreg_8bit),
+ (EXTRACT_SUBREG GR16:$src2, subreg_8bit))>;
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
new file mode 100644
index 000000000000..47b0e270c5b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -0,0 +1,157 @@
+//===-- MSP430MCInstLower.cpp - Convert MSP430 MachineInstr to an MCInst --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower MSP430 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCInstLower.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+/// Return the symbol the AsmPrinter associates with the global referenced by
+/// \p MO. Only a target-flag value of 0 is accepted; anything else is treated
+/// as unreachable.
+MCSymbol *MSP430MCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  if (MO.getTargetFlags() != 0)
+    llvm_unreachable("Unknown target flag on GV operand");
+
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+/// Return the symbol the AsmPrinter creates for the external symbol name held
+/// by \p MO. Only a target-flag value of 0 is accepted; anything else is
+/// treated as unreachable.
+MCSymbol *MSP430MCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  if (MO.getTargetFlags() != 0)
+    llvm_unreachable("Unknown target flag on GV operand");
+
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+/// Build the symbol for the jump table referenced by \p MO. The name is
+/// "<private-global-prefix>JTI<function-number>_<index>". Only a target-flag
+/// value of 0 is accepted; anything else is treated as unreachable.
+MCSymbol *MSP430MCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  SmallString<256> SymName;
+  {
+    // Scoped so the stream is destroyed before SymName is read.
+    raw_svector_ostream OS(SymName);
+    OS << Printer.getDataLayout().getPrivateGlobalPrefix() << "JTI"
+       << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  }
+
+  if (MO.getTargetFlags() != 0)
+    llvm_unreachable("Unknown target flag on GV operand");
+
+  // Look the name up in (or add it to) the MC context.
+  return Ctx.getOrCreateSymbol(SymName);
+}
+
+/// Build the symbol for the constant-pool entry referenced by \p MO. The name
+/// is "<private-global-prefix>CPI<function-number>_<index>". Only a
+/// target-flag value of 0 is accepted; anything else is treated as
+/// unreachable.
+MCSymbol *MSP430MCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  SmallString<256> SymName;
+  {
+    // Scoped so the stream is destroyed before SymName is read.
+    raw_svector_ostream OS(SymName);
+    OS << Printer.getDataLayout().getPrivateGlobalPrefix() << "CPI"
+       << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  }
+
+  if (MO.getTargetFlags() != 0)
+    llvm_unreachable("Unknown target flag on GV operand");
+
+  // Look the name up in (or add it to) the MC context.
+  return Ctx.getOrCreateSymbol(SymName);
+}
+
+/// Return the symbol the AsmPrinter associates with the block address held by
+/// \p MO. Only a target-flag value of 0 is accepted; anything else is treated
+/// as unreachable.
+MCSymbol *MSP430MCInstLower::
+GetBlockAddressSymbol(const MachineOperand &MO) const {
+  if (MO.getTargetFlags() != 0)
+    llvm_unreachable("Unknown target flag on GV operand");
+
+  return Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+}
+
+/// Wrap \p Sym in an MCOperand expression for operand \p MO.
+///
+/// The result is a plain symbol reference, plus the operand's offset when the
+/// operand is not a jump-table index and its offset is non-zero. Only a
+/// target-flag value of 0 is accepted.
+MCOperand MSP430MCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *SymExpr = MCSymbolRefExpr::create(Sym, Ctx);
+
+  if (MO.getTargetFlags() != 0)
+    llvm_unreachable("Unknown target flag on GV operand");
+
+  // The offset is not queried at all for jump-table operands.
+  if (MO.isJTI())
+    return MCOperand::createExpr(SymExpr);
+
+  if (MO.getOffset()) {
+    const MCExpr *OffsetExpr = MCConstantExpr::create(MO.getOffset(), Ctx);
+    SymExpr = MCBinaryExpr::createAdd(SymExpr, OffsetExpr, Ctx);
+  }
+  return MCOperand::createExpr(SymExpr);
+}
+
+/// Translate \p MI into \p OutMI: copy the opcode, then convert the operands
+/// one by one in order. Implicit register operands and register masks are
+/// dropped; symbolic operands are converted through LowerSymbolOperand().
+/// An operand kind not listed below dumps the instruction and is treated as
+/// unreachable.
+void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (const MachineOperand &MO : MI->operands()) {
+    MCOperand MCOp;
+
+    switch (MO.getType()) {
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit())
+        continue;
+      MCOp = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock: {
+      const MCExpr *BlockRef =
+          MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
+      MCOp = MCOperand::createExpr(BlockRef);
+      break;
+    }
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO, GetBlockAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      // Register masks have no MCInst counterpart.
+      continue;
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h
new file mode 100644
index 000000000000..ebd639744bcc
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h
@@ -0,0 +1,47 @@
+//===-- MSP430MCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430MCINSTLOWER_H
+#define LLVM_LIB_TARGET_MSP430_MSP430MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+ class AsmPrinter;
+ class MCContext;
+ class MCInst;
+ class MCOperand;
+ class MCSymbol;
+ class MachineInstr;
+ class MachineModuleInfoMachO;
+ class MachineOperand;
+
+ /// MSP430MCInstLower - This class is used to lower a MachineInstr
+ /// into an MCInst.
+ ///
+ /// It keeps references to the MCContext (used to create symbols and
+ /// expressions) and to the AsmPrinter (used to look up symbols); both are
+ /// supplied by the caller and are not owned by this object.
+class LLVM_LIBRARY_VISIBILITY MSP430MCInstLower {
+ MCContext &Ctx;
+
+ AsmPrinter &Printer;
+public:
+ MSP430MCInstLower(MCContext &ctx, AsmPrinter &printer)
+ : Ctx(ctx), Printer(printer) {}
+ /// Fill \p OutMI with the opcode and lowered operands of \p MI.
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ /// Build the MCOperand expression that refers to \p Sym for operand \p MO.
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+ // One symbol lookup helper per symbolic operand kind handled by Lower().
+ MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..b442fc03b257
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- MSP430MachineFunctionInfo.cpp - MSP430 machine function info ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MachineFunctionInfo.h"
+
+using namespace llvm;
+
+// Empty out-of-line definition of the class's private virtual anchor()
+// (presumably the usual LLVM idiom for pinning the vtable to this file).
+void MSP430MachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
new file mode 100644
index 000000000000..2d937318c7e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -0,0 +1,54 @@
+//=== MSP430MachineFunctionInfo.h - MSP430 machine function info -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares MSP430-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// MSP430MachineFunctionInfo - This class is derived from MachineFunctionInfo
+/// and contains private MSP430 target-specific information for each
+/// MachineFunction.
+///
+/// Every field starts at zero in both constructors, so each getter returns a
+/// defined value even before the matching setter has been called.
+class MSP430MachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+public:
+  MSP430MachineFunctionInfo()
+      : CalleeSavedFrameSize(0), ReturnAddrIndex(0), VarArgsFrameIndex(0) {}
+
+  /// \p MF is accepted for interface compatibility; it is not inspected.
+  explicit MSP430MachineFunctionInfo(MachineFunction &MF)
+      : CalleeSavedFrameSize(0), ReturnAddrIndex(0), VarArgsFrameIndex(0) {}
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex;}
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
new file mode 100644
index 000000000000..81cd9d1ad3f8
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -0,0 +1,161 @@
+//===-- MSP430RegisterInfo.cpp - MSP430 Register Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430RegisterInfo.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "MSP430GenRegisterInfo.inc"
+
+// Constructs the register info, passing MSP430::PC to the generated base
+// class constructor.
+// FIXME: Provide proper call frame setup / destroy opcodes.
+MSP430RegisterInfo::MSP430RegisterInfo()
+ : MSP430GenRegisterInfo(MSP430::PC) {}
+
+/// Return the zero-terminated list of callee-saved registers for \p MF.
+///
+/// Four static lists are available, chosen by two properties of the
+/// function: whether it has a frame pointer, and whether it uses the
+/// MSP430_INTR calling convention. The interrupt lists additionally contain
+/// R12-R15; the frame-pointer lists omit MSP430::FP.
+const MCPhysReg*
+MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const MCPhysReg NormalNoFP[] = {
+    MSP430::FP,  MSP430::R5,  MSP430::R6,  MSP430::R7,
+    MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+    0
+  };
+  static const MCPhysReg NormalWithFP[] = {
+    MSP430::R5,  MSP430::R6,  MSP430::R7,
+    MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+    0
+  };
+  static const MCPhysReg InterruptNoFP[] = {
+    MSP430::FP,  MSP430::R5,  MSP430::R6,  MSP430::R7,
+    MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+    MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15,
+    0
+  };
+  static const MCPhysReg InterruptWithFP[] = {
+    MSP430::R5,  MSP430::R6,  MSP430::R7,
+    MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+    MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15,
+    0
+  };
+
+  const bool HasFP = getFrameLowering(*MF)->hasFP(*MF);
+  const bool IsInterrupt =
+      MF->getFunction()->getCallingConv() == CallingConv::MSP430_INTR;
+
+  if (HasFP)
+    return IsInterrupt ? InterruptWithFP : NormalWithFP;
+  return IsInterrupt ? InterruptNoFP : NormalNoFP;
+}
+
+/// Return the set of registers that must not be allocated in \p MF.
+///
+/// PC, SP, SR and CG are always reserved, together with their 8-bit
+/// sub-registers. FP and FPB are reserved only when the function has a frame
+/// pointer.
+BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  // The four special registers, 8-bit sub-registers first.
+  static const unsigned AlwaysReserved[] = {
+    MSP430::PCB, MSP430::SPB, MSP430::SRB, MSP430::CGB,
+    MSP430::PC,  MSP430::SP,  MSP430::SR,  MSP430::CG
+  };
+
+  BitVector Reserved(getNumRegs());
+  for (unsigned Reg : AlwaysReserved)
+    Reserved.set(Reg);
+
+  // Mark frame pointer as reserved if needed.
+  if (getFrameLowering(MF)->hasFP(MF)) {
+    Reserved.set(MSP430::FPB);
+    Reserved.set(MSP430::FP);
+  }
+
+  return Reserved;
+}
+
+/// Return the register class used for pointer values. Both \p MF and \p Kind
+/// are ignored: the answer is always the 16-bit GR16 class.
+const TargetRegisterClass *
+MSP430RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
+ return &MSP430::GR16RegClass;
+}
+
+/// Replace the frame-index operand at \p FIOperandNum of the instruction at
+/// \p II with a concrete base register and byte offset.
+///
+/// The base register is FP when the function has a frame pointer and SP
+/// otherwise. The offset is the frame object's offset, plus 2 for the saved
+/// PC, plus either the stack size (no frame pointer) or another 2 for the
+/// saved FP, plus the immediate that follows the frame-index operand.
+///
+/// An ADD16ri that uses a frame index is rewritten as a MOV16rr from the base
+/// register, followed (for a non-zero offset) by an ADD16ri / SUB16ri on the
+/// destination register. Every other instruction simply has its two operands
+/// replaced in place.
+///
+/// \p SPAdj must be 0. \p RS is unused.
+void
+MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                        int SPAdj, unsigned FIOperandNum,
+                                        RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const MSP430FrameLowering *TFI = getFrameLowering(MF);
+  DebugLoc dl = MI.getDebugLoc();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  unsigned BasePtr = (TFI->hasFP(MF) ? MSP430::FP : MSP430::SP);
+  int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+
+  // Skip the saved PC
+  Offset += 2;
+
+  if (!TFI->hasFP(MF))
+    Offset += MF.getFrameInfo().getStackSize();
+  else
+    Offset += 2; // Skip the saved FP
+
+  // Fold imm into offset
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  if (MI.getOpcode() == MSP430::ADD16ri) {
+    // This is actually "load effective address" of the stack slot
+    // instruction. We have only two-address instructions, thus we need to
+    // expand it into mov + add
+    const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+    MI.setDesc(TII.get(MSP430::MOV16rr));
+    MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+
+    // The immediate that followed the frame index belonged to ADD16ri and has
+    // already been folded into Offset; drop it so the register-to-register
+    // move is not left carrying a stale extra operand.
+    MI.RemoveOperand(FIOperandNum + 1);
+
+    if (Offset == 0)
+      return;
+
+    // We need to materialize the offset via add instruction.
+    unsigned DstReg = MI.getOperand(0).getReg();
+    if (Offset < 0)
+      BuildMI(MBB, std::next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
+        .addReg(DstReg).addImm(-Offset);
+    else
+      BuildMI(MBB, std::next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
+        .addReg(DstReg).addImm(Offset);
+
+    return;
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+/// Return the register that addresses the stack frame of \p MF: FP when the
+/// function has a frame pointer, SP otherwise.
+unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  if (getFrameLowering(MF)->hasFP(MF))
+    return MSP430::FP;
+  return MSP430::SP;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h
new file mode 100644
index 000000000000..0cfa4a42bfe4
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -0,0 +1,46 @@
+//===-- MSP430RegisterInfo.h - MSP430 Register Information Impl -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430REGISTERINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "MSP430GenRegisterInfo.inc"
+
+namespace llvm {
+
+/// MSP430 register information: extends the TableGen-generated
+/// MSP430GenRegisterInfo with the hand-written overrides declared below.
+struct MSP430RegisterInfo : public MSP430GenRegisterInfo {
+public:
+ MSP430RegisterInfo();
+
+ /// Code Generation virtual methods...
+ /// Zero-terminated list of callee-saved registers for \p MF.
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+
+ /// Registers that must not be allocated in \p MF.
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+ /// Register class used for pointer values.
+ const TargetRegisterClass*
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+
+ /// Rewrite the frame-index operand \p FIOperandNum of the instruction at
+ /// \p II into a base register plus offset.
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ // Debug information queries.
+ /// Register that addresses the stack frame of \p MF.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
new file mode 100644
index 000000000000..b5a6ed0f0a56
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -0,0 +1,81 @@
+//===-- MSP430RegisterInfo.td - MSP430 Register defs -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the MSP430 register file
+//===----------------------------------------------------------------------===//
+
+// Base class for a register without sub-registers: records a 4-bit register
+// number in Num and places the register in the "MSP430" namespace.
+class MSP430Reg<bits<4> num, string n> : Register<n> {
+ field bits<4> Num = num;
+ let Namespace = "MSP430";
+}
+
+// Base class for a register that has sub-registers: same Num / Namespace
+// fields as MSP430Reg, plus the list of sub-registers it is composed of.
+class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs>
+ : RegisterWithSubRegs<n, subregs> {
+ field bits<4> Num = num;
+ let Namespace = "MSP430";
+}
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+// 8-bit registers, numbered 0-15. Each one is the subreg_8bit sub-register of
+// the 16-bit register defined further down with the same number and the same
+// assembly name ("r0".."r15").
+def PCB : MSP430Reg<0, "r0">;
+def SPB : MSP430Reg<1, "r1">;
+def SRB : MSP430Reg<2, "r2">;
+def CGB : MSP430Reg<3, "r3">;
+def FPB : MSP430Reg<4, "r4">;
+def R5B : MSP430Reg<5, "r5">;
+def R6B : MSP430Reg<6, "r6">;
+def R7B : MSP430Reg<7, "r7">;
+def R8B : MSP430Reg<8, "r8">;
+def R9B : MSP430Reg<9, "r9">;
+def R10B : MSP430Reg<10, "r10">;
+def R11B : MSP430Reg<11, "r11">;
+def R12B : MSP430Reg<12, "r12">;
+def R13B : MSP430Reg<13, "r13">;
+def R14B : MSP430Reg<14, "r14">;
+def R15B : MSP430Reg<15, "r15">;
+
+// Sub-register index (size 8) linking each 16-bit register to its 8-bit
+// counterpart.
+def subreg_8bit : SubRegIndex<8> { let Namespace = "MSP430"; }
+
+// 16-bit registers. Each one has a single sub-register, reached through
+// subreg_8bit: the 8-bit register with the same number.
+let SubRegIndices = [subreg_8bit] in {
+def PC : MSP430RegWithSubregs<0, "r0", [PCB]>;
+def SP : MSP430RegWithSubregs<1, "r1", [SPB]>;
+def SR : MSP430RegWithSubregs<2, "r2", [SRB]>;
+def CG : MSP430RegWithSubregs<3, "r3", [CGB]>;
+def FP : MSP430RegWithSubregs<4, "r4", [FPB]>;
+def R5 : MSP430RegWithSubregs<5, "r5", [R5B]>;
+def R6 : MSP430RegWithSubregs<6, "r6", [R6B]>;
+def R7 : MSP430RegWithSubregs<7, "r7", [R7B]>;
+def R8 : MSP430RegWithSubregs<8, "r8", [R8B]>;
+def R9 : MSP430RegWithSubregs<9, "r9", [R9B]>;
+def R10 : MSP430RegWithSubregs<10, "r10", [R10B]>;
+def R11 : MSP430RegWithSubregs<11, "r11", [R11B]>;
+def R12 : MSP430RegWithSubregs<12, "r12", [R12B]>;
+def R13 : MSP430RegWithSubregs<13, "r13", [R13B]>;
+def R14 : MSP430RegWithSubregs<14, "r14", [R14B]>;
+def R15 : MSP430RegWithSubregs<15, "r15", [R15B]>;
+}
+
+// Register class of all sixteen 8-bit registers (value type i8, alignment 8).
+// NOTE(review): the first row is labelled "Volatile", yet
+// MSP430RegisterInfo::getCalleeSavedRegs lists R5-R11 as callee-saved; the
+// label may only be meant for R12-R15 - confirm.
+def GR8 : RegisterClass<"MSP430", [i8], 8,
+ // Volatile registers
+ (add R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B,
+ // Frame pointer, sometimes allocable
+ FPB,
+ // Volatile, but not allocable
+ PCB, SPB, SRB, CGB)>;
+
+// Register class of all sixteen 16-bit registers (value type i16, alignment
+// 16), listed in the same order as GR8.
+// NOTE(review): the first row is labelled "Volatile", yet
+// MSP430RegisterInfo::getCalleeSavedRegs lists R5-R11 as callee-saved; the
+// label may only be meant for R12-R15 - confirm.
+def GR16 : RegisterClass<"MSP430", [i16], 16,
+ // Volatile registers
+ (add R12, R13, R14, R15, R11, R10, R9, R8, R7, R6, R5,
+ // Frame pointer, sometimes allocable
+ FP,
+ // Volatile, but not allocable
+ PC, SP, SR, CG)>;
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
new file mode 100644
index 000000000000..6216348e4d71
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -0,0 +1,37 @@
+//===-- MSP430Subtarget.cpp - MSP430 Subtarget Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430Subtarget.h"
+#include "MSP430.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "MSP430GenSubtargetInfo.inc"
+
+// Empty out-of-line definition of the class's private virtual anchor()
+// (presumably the usual LLVM idiom for pinning the vtable to this file).
+void MSP430Subtarget::anchor() { }
+
+/// Parse the feature string \p FS and return *this, so the call can be used
+/// inside the constructor's member-initializer list.
+///
+/// NOTE(review): \p CPU is not used; the CPU name passed to
+/// ParseSubtargetFeatures is always the literal "generic" - confirm this is
+/// intentional.
+MSP430Subtarget &
+MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+ ParseSubtargetFeatures("generic", FS);
+ return *this;
+}
+
+/// Construct the subtarget for triple \p TT, CPU \p CPU and feature string
+/// \p FS. The feature string is parsed as part of initializing InstrInfo
+/// (through initializeSubtargetDependencies), i.e. before TLInfo is built from
+/// \p TM and this subtarget.
+MSP430Subtarget::MSP430Subtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM)
+ : MSP430GenSubtargetInfo(TT, CPU, FS), FrameLowering(),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
new file mode 100644
index 000000000000..1a00d85e01cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
@@ -0,0 +1,69 @@
+//===-- MSP430Subtarget.h - Define Subtarget for the MSP430 ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430SUBTARGET_H
+#define LLVM_LIB_TARGET_MSP430_MSP430SUBTARGET_H
+
+#include "MSP430FrameLowering.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430RegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "MSP430GenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+/// MSP430-specific subtarget. It owns the frame lowering, instruction info,
+/// target lowering and SelectionDAG info objects and hands out pointers to
+/// them through the accessors below.
+class MSP430Subtarget : public MSP430GenSubtargetInfo {
+  virtual void anchor();
+  // Given a definite starting value here because the constructor's
+  // initializer list never mentions it. It is declared before InstrInfo, so
+  // this default is in place before the feature string is parsed.
+  bool ExtendedInsts = false;
+  MSP430FrameLowering FrameLowering;
+  MSP430InstrInfo InstrInfo;
+  MSP430TargetLowering TLInfo;
+  SelectionDAGTargetInfo TSInfo;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  MSP430Subtarget(const Triple &TT, const std::string &CPU,
+                  const std::string &FS, const TargetMachine &TM);
+
+  /// Parse the feature string and return *this, so that the call can be used
+  /// from the constructor's member-initializer list.
+  MSP430Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options. Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const MSP430InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  /// The register info object is owned by InstrInfo.
+  const TargetRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const MSP430TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+};
+} // End llvm namespace
+
+#endif // LLVM_LIB_TARGET_MSP430_MSP430SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
new file mode 100644
index 000000000000..bebe5fa35ad4
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -0,0 +1,80 @@
+//===-- MSP430TargetMachine.cpp - Define TargetMachine for MSP430 ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430TargetMachine.h"
+#include "MSP430.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeMSP430Target() {
+ // Register the target.
+ RegisterTargetMachine<MSP430TargetMachine> X(getTheMSP430Target());
+}
+
+// Picks the relocation model to build with: an explicit request in RM is
+// honoured, and Reloc::Static is used when no model was supplied.
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ return RM.hasValue() ? *RM : Reloc::Static;
+}
+
+MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, "e-m:e-p:16:16-i32:16:32-a:16-n8:16", TT, CPU, FS,
+ Options, getEffectiveRelocModel(RM), CM, OL),
+ TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ // FIXME: Check DataLayout string.
+ Subtarget(TT, CPU, FS, *this) {
+ initAsmInfo();
+}
+
+MSP430TargetMachine::~MSP430TargetMachine() {}
+
+namespace {
+/// MSP430 Code Generator Pass Configuration Options.
+class MSP430PassConfig : public TargetPassConfig {
+public:
+ MSP430PassConfig(MSP430TargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ MSP430TargetMachine &getMSP430TargetMachine() const {
+ return getTM<MSP430TargetMachine>();
+ }
+
+ bool addInstSelector() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new MSP430PassConfig(this, PM);
+}
+
+bool MSP430PassConfig::addInstSelector() {
+ // Install an instruction selector.
+ addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel()));
+ return false;
+}
+
+void MSP430PassConfig::addPreEmitPass() {
+ // Must run branch selection immediately preceding the asm printer.
+ addPass(createMSP430BranchSelectionPass(), false);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h
new file mode 100644
index 000000000000..de8f06e71dee
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h
@@ -0,0 +1,49 @@
+//===-- MSP430TargetMachine.h - Define TargetMachine for MSP430 -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H
+#define LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H
+
+#include "MSP430Subtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// MSP430TargetMachine
+///
+class MSP430TargetMachine : public LLVMTargetMachine {
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ MSP430Subtarget Subtarget;
+
+public:
+ MSP430TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+ ~MSP430TargetMachine() override;
+
+ const MSP430Subtarget *getSubtargetImpl(const Function &F) const override {
+ return &Subtarget;
+ }
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+}; // MSP430TargetMachine.
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
new file mode 100644
index 000000000000..62f52a193674
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- MSP430TargetInfo.cpp - MSP430 Target Implementation ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheMSP430Target() {
+ static Target TheMSP430Target;
+ return TheMSP430Target;
+}
+
+extern "C" void LLVMInitializeMSP430TargetInfo() {
+ RegisterTarget<Triple::msp430> X(getTheMSP430Target(), "msp430",
+ "MSP430 [experimental]");
+}
diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
new file mode 100644
index 000000000000..97ca11ca501e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -0,0 +1,6766 @@
+//===-- MipsAsmParser.cpp - Parse Mips assembly to MCInst instructions ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetObjectFile.h"
+#include "MipsTargetStreamer.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-asm-parser"
+
+namespace llvm {
+class MCInstrInfo;
+}
+
+namespace {
+class MipsAssemblerOptions {
+public:
+ MipsAssemblerOptions(const FeatureBitset &Features_) :
+ ATReg(1), Reorder(true), Macro(true), Features(Features_) {}
+
+ /// Builds a copy of the option set pointed to by \p Opts. The pointer is
+ /// dereferenced unconditionally, so it must not be null. Every field is
+ /// read through the public accessors of the source object.
+ MipsAssemblerOptions(const MipsAssemblerOptions *Opts)
+ : ATReg(Opts->getATRegIndex()), Reorder(Opts->isReorder()),
+ Macro(Opts->isMacro()), Features(Opts->getFeatures()) {}
+
+ unsigned getATRegIndex() const { return ATReg; }
+ /// Stores \p Reg as the AT register index; indices above 31 are refused.
+ bool setATRegIndex(unsigned Reg) {
+ const bool InRange = Reg <= 31;
+ if (InRange)
+ ATReg = Reg;
+ return InRange;
+ }
+
+ bool isReorder() const { return Reorder; }
+ void setReorder() { Reorder = true; }
+ void setNoReorder() { Reorder = false; }
+
+ bool isMacro() const { return Macro; }
+ void setMacro() { Macro = true; }
+ void setNoMacro() { Macro = false; }
+
+ const FeatureBitset &getFeatures() const { return Features; }
+ void setFeatures(const FeatureBitset &Features_) { Features = Features_; }
+
+ // Set of features that are either architecture features or referenced
+ // by them (e.g.: FeatureNaN2008 implied by FeatureMips32r6).
+ // The full table can be found in MipsGenSubtargetInfo.inc (MipsFeatureKV[]).
+ // The reason we need this mask is explained in the selectArch function.
+ // FIXME: Ideally we would like TableGen to generate this information.
+ static const FeatureBitset AllArchRelatedMask;
+
+private:
+ unsigned ATReg;
+ bool Reorder;
+ bool Macro;
+ FeatureBitset Features;
+};
+}
+
+const FeatureBitset MipsAssemblerOptions::AllArchRelatedMask = {
+ Mips::FeatureMips1, Mips::FeatureMips2, Mips::FeatureMips3,
+ Mips::FeatureMips3_32, Mips::FeatureMips3_32r2, Mips::FeatureMips4,
+ Mips::FeatureMips4_32, Mips::FeatureMips4_32r2, Mips::FeatureMips5,
+ Mips::FeatureMips5_32r2, Mips::FeatureMips32, Mips::FeatureMips32r2,
+ Mips::FeatureMips32r3, Mips::FeatureMips32r5, Mips::FeatureMips32r6,
+ Mips::FeatureMips64, Mips::FeatureMips64r2, Mips::FeatureMips64r3,
+ Mips::FeatureMips64r5, Mips::FeatureMips64r6, Mips::FeatureCnMips,
+ Mips::FeatureFP64Bit, Mips::FeatureGP64Bit, Mips::FeatureNaN2008
+};
+
+namespace {
+class MipsAsmParser : public MCTargetAsmParser {
+ MipsTargetStreamer &getTargetStreamer() {
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+ return static_cast<MipsTargetStreamer &>(TS);
+ }
+
+ MipsABIInfo ABI;
+ SmallVector<std::unique_ptr<MipsAssemblerOptions>, 2> AssemblerOptions;
+ MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a
+ // nullptr, which indicates that no function is currently
+ // selected. This usually happens after an '.end func'
+ // directive.
+ bool IsLittleEndian;
+ bool IsPicEnabled;
+ bool IsCpRestoreSet;
+ int CpRestoreOffset;
+ unsigned CpSaveLocation;
+ /// If true, then CpSaveLocation is a register, otherwise it's an offset.
+ bool CpSaveLocationIsRegister;
+
+ // Print a warning along with its fix-it message at the given range.
+ void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
+ SMRange Range, bool ShowColors = true);
+
+#define GET_ASSEMBLER_HEADER
+#include "MipsGenAsmMatcher.inc"
+
+ unsigned
+ checkEarlyTargetMatchPredicate(MCInst &Inst,
+ const OperandVector &Operands) override;
+ unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+
+ /// Parse a register as used in CFI directives
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+ bool parseParenSuffix(StringRef Name, OperandVector &Operands);
+
+ bool parseBracketSuffix(StringRef Name, OperandVector &Operands);
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+
+ bool ParseDirective(AsmToken DirectiveID) override;
+
+ OperandMatchResultTy parseMemOperand(OperandVector &Operands);
+ OperandMatchResultTy
+ matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+ StringRef Identifier, SMLoc S);
+ OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands,
+ SMLoc S);
+ OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
+ OperandMatchResultTy parseImm(OperandVector &Operands);
+ OperandMatchResultTy parseJumpTarget(OperandVector &Operands);
+ OperandMatchResultTy parseInvNum(OperandVector &Operands);
+ OperandMatchResultTy parseRegisterPair(OperandVector &Operands);
+ OperandMatchResultTy parseMovePRegPair(OperandVector &Operands);
+ OperandMatchResultTy parseRegisterList(OperandVector &Operands);
+
+ bool searchSymbolAlias(OperandVector &Operands);
+
+ bool parseOperand(OperandVector &, StringRef Mnemonic);
+
+ enum MacroExpanderResultTy {
+ MER_NotAMacro,
+ MER_Success,
+ MER_Fail,
+ };
+
+ // Expands assembly pseudo instructions.
+ MacroExpanderResultTy tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg,
+ bool Is32BitImm, bool IsAddress, SMLoc IDLoc,
+ MCStreamer &Out, const MCSubtargetInfo *STI);
+
+ bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg,
+ unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc,
+ MCStreamer &Out, const MCSubtargetInfo *STI);
+
+ bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
+ MCStreamer &Out, const MCSubtargetInfo *STI);
+
+ bool expandLoadAddress(unsigned DstReg, unsigned BaseReg,
+ const MCOperand &Offset, bool Is32BitAddress,
+ SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ void expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, bool IsLoad, bool IsImmOpnd);
+
+ void expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, bool IsImmOpnd);
+
+ void expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, bool IsImmOpnd);
+
+ bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, const bool IsMips64,
+ const bool Signed);
+
+ bool expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU, SMLoc IDLoc,
+ MCStreamer &Out, const MCSubtargetInfo *STI);
+
+ bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandUsh(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandRotation(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out, const MCSubtargetInfo *STI);
+ bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+ bool expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+ bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, bool IsLoad);
+
+ bool expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool reportParseError(Twine ErrorMsg);
+ bool reportParseError(SMLoc Loc, Twine ErrorMsg);
+
+ bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
+
+ bool isEvaluated(const MCExpr *Expr);
+ bool parseSetMips0Directive();
+ bool parseSetArchDirective();
+ bool parseSetFeature(uint64_t Feature);
+ bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup.
+ bool parseDirectiveCpLoad(SMLoc Loc);
+ bool parseDirectiveCpRestore(SMLoc Loc);
+ bool parseDirectiveCPSetup();
+ bool parseDirectiveCPReturn();
+ bool parseDirectiveNaN();
+ bool parseDirectiveSet();
+ bool parseDirectiveOption();
+ bool parseInsnDirective();
+ bool parseSSectionDirective(StringRef Section, unsigned Type);
+
+ bool parseSetAtDirective();
+ bool parseSetNoAtDirective();
+ bool parseSetMacroDirective();
+ bool parseSetNoMacroDirective();
+ bool parseSetMsaDirective();
+ bool parseSetNoMsaDirective();
+ bool parseSetNoDspDirective();
+ bool parseSetReorderDirective();
+ bool parseSetNoReorderDirective();
+ bool parseSetMips16Directive();
+ bool parseSetNoMips16Directive();
+ bool parseSetFpDirective();
+ bool parseSetOddSPRegDirective();
+ bool parseSetNoOddSPRegDirective();
+ bool parseSetPopDirective();
+ bool parseSetPushDirective();
+ bool parseSetSoftFloatDirective();
+ bool parseSetHardFloatDirective();
+
+ bool parseSetAssignment();
+
+ bool parseDataDirective(unsigned Size, SMLoc L);
+ bool parseDirectiveGpWord();
+ bool parseDirectiveGpDWord();
+ bool parseDirectiveDtpRelWord();
+ bool parseDirectiveDtpRelDWord();
+ bool parseDirectiveTpRelWord();
+ bool parseDirectiveTpRelDWord();
+ bool parseDirectiveModule();
+ bool parseDirectiveModuleFP();
+ bool parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
+ StringRef Directive);
+
+ bool parseInternalDirectiveReallowModule();
+
+ bool eatComma(StringRef ErrorStr);
+
+ int matchCPURegisterName(StringRef Symbol);
+
+ int matchHWRegsRegisterName(StringRef Symbol);
+
+ int matchFPURegisterName(StringRef Name);
+
+ int matchFCCRegisterName(StringRef Name);
+
+ int matchACRegisterName(StringRef Name);
+
+ int matchMSA128RegisterName(StringRef Name);
+
+ int matchMSA128CtrlRegisterName(StringRef Name);
+
+ unsigned getReg(int RC, int RegNo);
+
+ /// Returns the internal register number for the current AT. Also checks if
+ /// the current AT is unavailable (set to $0) and gives an error if it is.
+ /// This should be used in pseudo-instruction expansions which need AT.
+ unsigned getATReg(SMLoc Loc);
+
+ bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ // Helper function that checks if the value of a vector index is within the
+ // boundaries of accepted values for each RegisterKind
+ // Example: INSERT.B $w0[n], $1 => 16 > n >= 0
+ bool validateMSAIndex(int Val, int RegKind);
+
+ // Selects a new architecture by updating the FeatureBits with the necessary
+ // info including implied dependencies.
+ // Internally, it clears all the feature bits related to *any* architecture
+ // and selects the new one using the ToggleFeature functionality of the
+ // MCSubtargetInfo object that handles implied dependencies. The reason we
+ // clear all the arch related bits manually is because ToggleFeature only
+ // clears the features that imply the feature being cleared and not the
+ // features implied by the feature being cleared. This is easier to see
+ // with an example:
+ // --------------------------------------------------
+ // | Feature | Implies |
+ // | -------------------------------------------------|
+ // | FeatureMips1 | None |
+ // | FeatureMips2 | FeatureMips1 |
+ // | FeatureMips3 | FeatureMips2 | FeatureGP64Bit |
+ // | FeatureMips4 | FeatureMips3 |
+ // | ... | |
+ // --------------------------------------------------
+ //
+ // Setting Mips3 is equivalent to setting: (FeatureMips3 | FeatureMips2 |
+ // FeatureGP64Bit | FeatureMips1)
+ // Clearing Mips3 is equivalent to clear (FeatureMips3 | FeatureMips4).
+ void selectArch(StringRef ArchFeature) {
+ MCSubtargetInfo &STI = copySTI();
+ FeatureBitset FeatureBits = STI.getFeatureBits();
+ FeatureBits &= ~MipsAssemblerOptions::AllArchRelatedMask;
+ STI.setFeatureBits(FeatureBits);
+ setAvailableFeatures(
+ ComputeAvailableFeatures(STI.ToggleFeature(ArchFeature)));
+ AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
+ }
+
+ void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
+ if (!(getSTI().getFeatureBits()[Feature])) {
+ MCSubtargetInfo &STI = copySTI();
+ setAvailableFeatures(
+ ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+ AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
+ }
+ }
+
+ void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
+ if (getSTI().getFeatureBits()[Feature]) {
+ MCSubtargetInfo &STI = copySTI();
+ setAvailableFeatures(
+ ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+ AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
+ }
+ }
+
+ void setModuleFeatureBits(uint64_t Feature, StringRef FeatureString) {
+ setFeatureBits(Feature, FeatureString);
+ AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits());
+ }
+
+ void clearModuleFeatureBits(uint64_t Feature, StringRef FeatureString) {
+ clearFeatureBits(Feature, FeatureString);
+ AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits());
+ }
+
+public:
+ enum MipsMatchResultTy {
+ Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY,
+ Match_RequiresDifferentOperands,
+ Match_RequiresNoZeroRegister,
+ Match_RequiresSameSrcAndDst,
+ Match_NonZeroOperandForSync,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "MipsGenAsmMatcher.inc"
+#undef GET_OPERAND_DIAGNOSTIC_TYPES
+ };
+
+ /// Sets up the parser for subtarget \p sti: registers the ".asciiz" alias,
+ /// seeds the assembler-option stack and initialises the parser state.
+ MipsAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, sti),
+ ABI(MipsABIInfo::computeTargetABI(Triple(sti.getTargetTriple()),
+ sti.getCPU(), Options)) {
+ MCAsmParserExtension::Initialize(parser);
+
+ parser.addAliasForDirective(".asciiz", ".asciz");
+
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
+ // Remember the initial assembler options. The user can not modify these.
+ AssemblerOptions.push_back(
+ llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
+
+ // Create an assembler options environment for the user to modify.
+ AssemblerOptions.push_back(
+ llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
+
+ getTargetStreamer().updateABIInfo(*this);
+
+ if (!isABI_O32() && !useOddSPReg())
+ report_fatal_error("-mno-odd-spreg requires the O32 ABI");
+
+ CurrentFn = nullptr;
+ IsPicEnabled = getContext().getObjectFileInfo()->isPositionIndependent();
+ IsCpRestoreSet = false;
+ CpRestoreOffset = -1;
+ // These two have no in-class initializer; start them from defined values.
+ CpSaveLocation = 0;
+ CpSaveLocationIsRegister = false;
+
+ // Only the mips and mips64 triple architectures are treated as big-endian.
+ const Triple::ArchType Arch = sti.getTargetTriple().getArch();
+ IsLittleEndian = Arch != Triple::mips && Arch != Triple::mips64;
+ }
+
+ /// True if all of $fcc0 - $fcc7 exist for the current ISA.
+ bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); }
+
+ bool isGP64bit() const {
+ return getSTI().getFeatureBits()[Mips::FeatureGP64Bit];
+ }
+ bool isFP64bit() const {
+ return getSTI().getFeatureBits()[Mips::FeatureFP64Bit];
+ }
+ const MipsABIInfo &getABI() const { return ABI; }
+ bool isABI_N32() const { return ABI.IsN32(); }
+ bool isABI_N64() const { return ABI.IsN64(); }
+ bool isABI_O32() const { return ABI.IsO32(); }
+ bool isABI_FPXX() const {
+ return getSTI().getFeatureBits()[Mips::FeatureFPXX];
+ }
+
+ bool useOddSPReg() const {
+ return !(getSTI().getFeatureBits()[Mips::FeatureNoOddSPReg]);
+ }
+
+ bool inMicroMipsMode() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMicroMips];
+ }
+ bool hasMips1() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips1];
+ }
+ bool hasMips2() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips2];
+ }
+ bool hasMips3() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips3];
+ }
+ bool hasMips4() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips4];
+ }
+ bool hasMips5() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips5];
+ }
+ bool hasMips32() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips32];
+ }
+ bool hasMips64() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips64];
+ }
+ bool hasMips32r2() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips32r2];
+ }
+ bool hasMips64r2() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips64r2];
+ }
+ bool hasMips32r3() const {
+ return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]);
+ }
+ bool hasMips64r3() const {
+ return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]);
+ }
+ bool hasMips32r5() const {
+ return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]);
+ }
+ bool hasMips64r5() const {
+ return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]);
+ }
+ bool hasMips32r6() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips32r6];
+ }
+ bool hasMips64r6() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips64r6];
+ }
+
+ bool hasDSP() const {
+ return getSTI().getFeatureBits()[Mips::FeatureDSP];
+ }
+ bool hasDSPR2() const {
+ return getSTI().getFeatureBits()[Mips::FeatureDSPR2];
+ }
+ bool hasDSPR3() const {
+ return getSTI().getFeatureBits()[Mips::FeatureDSPR3];
+ }
+ bool hasMSA() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMSA];
+ }
+ bool hasCnMips() const {
+ return (getSTI().getFeatureBits()[Mips::FeatureCnMips]);
+ }
+
+ /// Reports the IsPicEnabled flag. Const-qualified so it can be queried
+ /// through a const parser, like the neighbouring feature predicates.
+ bool inPicMode() const { return IsPicEnabled; }
+
+ bool inMips16Mode() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips16];
+ }
+
+ bool useTraps() const {
+ return getSTI().getFeatureBits()[Mips::FeatureUseTCCInDIV];
+ }
+
+ bool useSoftFloat() const {
+ return getSTI().getFeatureBits()[Mips::FeatureSoftFloat];
+ }
+
+ /// Warn if RegIndex is the same as the current AT.
+ void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc);
+
+ void warnIfNoMacro(SMLoc Loc);
+
+ bool isLittle() const { return IsLittleEndian; }
+
+ /// Wraps \p E in the MipsMCExpr kind that corresponds to \p OperatorToken.
+ const MCExpr *createTargetUnaryExpr(const MCExpr *E,
+ AsmToken::TokenKind OperatorToken,
+ MCContext &Ctx) override {
+ switch(OperatorToken) {
+ default:
+ llvm_unreachable("Unknown token");
+ case AsmToken::PercentCall16:
+ return MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, E, Ctx);
+ case AsmToken::PercentCall_Hi:
+ return MipsMCExpr::create(MipsMCExpr::MEK_CALL_HI16, E, Ctx);
+ case AsmToken::PercentCall_Lo:
+ return MipsMCExpr::create(MipsMCExpr::MEK_CALL_LO16, E, Ctx);
+ case AsmToken::PercentDtprel_Hi:
+ return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL_HI, E, Ctx);
+ case AsmToken::PercentDtprel_Lo:
+ return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL_LO, E, Ctx);
+ case AsmToken::PercentGot:
+ return MipsMCExpr::create(MipsMCExpr::MEK_GOT, E, Ctx);
+ case AsmToken::PercentGot_Disp:
+ return MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, E, Ctx);
+ case AsmToken::PercentGot_Hi:
+ return MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, E, Ctx);
+ case AsmToken::PercentGot_Lo:
+ return MipsMCExpr::create(MipsMCExpr::MEK_GOT_LO16, E, Ctx);
+ case AsmToken::PercentGot_Ofst:
+ return MipsMCExpr::create(MipsMCExpr::MEK_GOT_OFST, E, Ctx);
+ case AsmToken::PercentGot_Page:
+ return MipsMCExpr::create(MipsMCExpr::MEK_GOT_PAGE, E, Ctx);
+ case AsmToken::PercentGottprel:
+ return MipsMCExpr::create(MipsMCExpr::MEK_GOTTPREL, E, Ctx);
+ case AsmToken::PercentGp_Rel:
+ return MipsMCExpr::create(MipsMCExpr::MEK_GPREL, E, Ctx);
+ case AsmToken::PercentHi:
+ return MipsMCExpr::create(MipsMCExpr::MEK_HI, E, Ctx);
+ case AsmToken::PercentHigher:
+ return MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, E, Ctx);
+ case AsmToken::PercentHighest:
+ return MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, E, Ctx);
+ case AsmToken::PercentLo:
+ return MipsMCExpr::create(MipsMCExpr::MEK_LO, E, Ctx);
+ case AsmToken::PercentNeg:
+ return MipsMCExpr::create(MipsMCExpr::MEK_NEG, E, Ctx);
+ case AsmToken::PercentPcrel_Hi:
+ return MipsMCExpr::create(MipsMCExpr::MEK_PCREL_HI16, E, Ctx);
+ case AsmToken::PercentPcrel_Lo:
+ return MipsMCExpr::create(MipsMCExpr::MEK_PCREL_LO16, E, Ctx);
+ case AsmToken::PercentTlsgd:
+ return MipsMCExpr::create(MipsMCExpr::MEK_TLSGD, E, Ctx);
+ case AsmToken::PercentTlsldm:
+ return MipsMCExpr::create(MipsMCExpr::MEK_TLSLDM, E, Ctx);
+ case AsmToken::PercentTprel_Hi:
+ return MipsMCExpr::create(MipsMCExpr::MEK_TPREL_HI, E, Ctx);
+ case AsmToken::PercentTprel_Lo:
+ return MipsMCExpr::create(MipsMCExpr::MEK_TPREL_LO, E, Ctx);
+ }
+ }
+};
+}
+
+namespace {
+
+/// MipsOperand - Instances of this class represent a parsed Mips machine
+/// operand.
+class MipsOperand : public MCParsedAsmOperand {
+public:
+ /// Broad categories of register classes
+ /// The exact class is finalized by the render method.
+ enum RegKind {
+ RegKind_GPR = 1, ///< GPR32 and GPR64 (depending on isGP64bit())
+ RegKind_FGR = 2, ///< FGR32, FGR64, AFGR64 (depending on context and
+ ///< isFP64bit())
+ RegKind_FCC = 4, ///< FCC
+ RegKind_MSA128 = 8, ///< MSA128[BHWD] (makes no difference which)
+ RegKind_MSACtrl = 16, ///< MSA control registers
+ RegKind_COP2 = 32, ///< COP2
+ RegKind_ACC = 64, ///< HI32DSP, LO32DSP, and ACC64DSP (depending on
+ ///< context).
+ RegKind_CCR = 128, ///< CCR
+ RegKind_HWRegs = 256, ///< HWRegs
+ RegKind_COP3 = 512, ///< COP3
+ RegKind_COP0 = 1024, ///< COP0
+ /// Potentially any (e.g. $1): the union of every kind bit listed above.
+ RegKind_Numeric = RegKind_GPR | RegKind_FGR | RegKind_FCC | RegKind_MSA128 |
+ RegKind_MSACtrl | RegKind_COP2 | RegKind_ACC |
+ RegKind_CCR | RegKind_HWRegs | RegKind_COP3 | RegKind_COP0
+ };
+
+private:
+ enum KindTy {
+ k_Immediate, ///< An immediate (possibly involving symbol references)
+ k_Memory, ///< Base + Offset Memory Address
+ k_RegisterIndex, ///< A register index in one or more RegKind.
+ k_Token, ///< A simple token
+ k_RegList, ///< A physical register list
+ k_RegPair ///< A pair of physical registers
+ } Kind;
+
+public:
+ MipsOperand(KindTy K, MipsAsmParser &Parser)
+ : MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {}
+
+private:
+ /// For diagnostics, and checking the assembler temporary
+ MipsAsmParser &AsmParser;
+
+ struct Token {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegIdxOp {
+ unsigned Index; ///< Index into the register class
+ RegKind Kind; ///< Bitfield of the kinds it could possibly be
+ struct Token Tok; ///< The input token this operand originated from.
+ const MCRegisterInfo *RegInfo; ///< Used to map Index to a real register.
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ struct MemOp {
+ MipsOperand *Base;
+ const MCExpr *Off;
+ };
+
+ struct RegListOp {
+ SmallVector<unsigned, 10> *List;
+ };
+
+ union {
+ struct Token Tok;
+ struct RegIdxOp RegIdx;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ struct RegListOp RegList;
+ };
+
+ SMLoc StartLoc, EndLoc;
+
+ /// Internal constructor for register kinds
+ static std::unique_ptr<MipsOperand> CreateReg(unsigned Index, StringRef Str,
+ RegKind RegKind,
+ const MCRegisterInfo *RegInfo,
+ SMLoc S, SMLoc E,
+ MipsAsmParser &Parser) {
+ auto Op = make_unique<MipsOperand>(k_RegisterIndex, Parser);
+ Op->RegIdx.Index = Index;
+ Op->RegIdx.RegInfo = RegInfo;
+ Op->RegIdx.Kind = RegKind;
+ Op->RegIdx.Tok.Data = Str.data();
+ Op->RegIdx.Tok.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+public:
+ /// Coerce the register to GPR32 and return the real register for the current
+ /// target.
+ unsigned getGPR32Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+ AsmParser.warnIfRegIndexIsAT(RegIdx.Index, StartLoc);
+ unsigned ClassID = Mips::GPR32RegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to GPR32 and return the real register for the current
+ /// target.
+ unsigned getGPRMM16Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+ unsigned ClassID = Mips::GPR32RegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to GPR64 and return the real register for the current
+ /// target.
+ unsigned getGPR64Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+ unsigned ClassID = Mips::GPR64RegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+private:
+ /// Coerce the register to AFGR64 and return the real register for the current
+ /// target.
+ unsigned getAFGR64Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+ if (RegIdx.Index % 2 != 0)
+ AsmParser.Warning(StartLoc, "Float register should be even.");
+ return RegIdx.RegInfo->getRegClass(Mips::AFGR64RegClassID)
+ .getRegister(RegIdx.Index / 2);
+ }
+
+ /// Coerce the register to FGR64 and return the real register for the current
+ /// target.
+ unsigned getFGR64Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+ return RegIdx.RegInfo->getRegClass(Mips::FGR64RegClassID)
+ .getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the FGR32 register class.
+ unsigned getFGR32Reg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+   const auto &FGR32 = RegIdx.RegInfo->getRegClass(Mips::FGR32RegClassID);
+   return FGR32.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the FGRH32 register class.
+ unsigned getFGRH32Reg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+   const auto &FGRH32 = RegIdx.RegInfo->getRegClass(Mips::FGRH32RegClassID);
+   return FGRH32.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the FCC register class.
+ unsigned getFCCReg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_FCC) && "Invalid access!");
+   const auto &FCC = RegIdx.RegInfo->getRegClass(Mips::FCCRegClassID);
+   return FCC.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index as an MSA128 register.
+ unsigned getMSA128Reg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_MSA128) && "Invalid access!");
+   // The MSA128[BHWD] classes are all identical, so the B variant is used for
+   // the lookup; any of them would do.
+   const auto &MSA128 = RegIdx.RegInfo->getRegClass(Mips::MSA128BRegClassID);
+   return MSA128.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the MSACtrl register class.
+ unsigned getMSACtrlReg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_MSACtrl) && "Invalid access!");
+   const auto &MSACtrl = RegIdx.RegInfo->getRegClass(Mips::MSACtrlRegClassID);
+   return MSACtrl.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the COP0 register class.
+ unsigned getCOP0Reg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_COP0) && "Invalid access!");
+   const auto &COP0 = RegIdx.RegInfo->getRegClass(Mips::COP0RegClassID);
+   return COP0.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the COP2 register class.
+ unsigned getCOP2Reg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_COP2) && "Invalid access!");
+   const auto &COP2 = RegIdx.RegInfo->getRegClass(Mips::COP2RegClassID);
+   return COP2.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the COP3 register class.
+ unsigned getCOP3Reg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_COP3) && "Invalid access!");
+   const auto &COP3 = RegIdx.RegInfo->getRegClass(Mips::COP3RegClassID);
+   return COP3.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the ACC64DSP register class.
+ unsigned getACC64DSPReg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+   const auto &ACC64DSP = RegIdx.RegInfo->getRegClass(Mips::ACC64DSPRegClassID);
+   return ACC64DSP.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the HI32DSP register class.
+ unsigned getHI32DSPReg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+   const auto &HI32DSP = RegIdx.RegInfo->getRegClass(Mips::HI32DSPRegClassID);
+   return HI32DSP.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the LO32DSP register class.
+ unsigned getLO32DSPReg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+   const auto &LO32DSP = RegIdx.RegInfo->getRegClass(Mips::LO32DSPRegClassID);
+   return LO32DSP.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the CCR register class.
+ unsigned getCCRReg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_CCR) && "Invalid access!");
+   const auto &CCR = RegIdx.RegInfo->getRegClass(Mips::CCRRegClassID);
+   return CCR.getRegister(RegIdx.Index);
+ }
+
+ /// Resolve this operand's register index within the HWRegs register class.
+ unsigned getHWRegsReg() const {
+   assert(isRegIdx() && (RegIdx.Kind & RegKind_HWRegs) && "Invalid access!");
+   const auto &HWRegs = RegIdx.RegInfo->getRegClass(Mips::HWRegsRegClassID);
+   return HWRegs.getRegister(RegIdx.Index);
+ }
+
+public:
+ /// Append Expr to Inst, folding to an immediate where possible: a null
+ /// expression becomes immediate 0, an MCConstantExpr becomes its value, and
+ /// anything else is added as an expression operand.
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+   if (!Expr) {
+     Inst.addOperand(MCOperand::createImm(0));
+     return;
+   }
+   if (const auto *Constant = dyn_cast<MCConstantExpr>(Expr)) {
+     Inst.addOperand(MCOperand::createImm(Constant->getValue()));
+     return;
+   }
+   Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ llvm_unreachable("Use a custom parser instead");
+ }
+
+ /// Append this operand to Inst as a single GPR32 register operand.
+ /// Asserts that N is 1; getGPR32Reg() asserts that the operand is a
+ /// k_RegisterIndex compatible with RegKind_GPR.
+ void addGPR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getGPR32Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single register operand obtained from
+ /// getGPRMM16Reg(). N must be 1.
+ void addGPRMM16AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getGPRMM16Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single register operand obtained from
+ /// getGPRMM16Reg(). N must be 1.
+ void addGPRMM16AsmRegZeroOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getGPRMM16Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single register operand obtained from
+ /// getGPRMM16Reg(). N must be 1.
+ void addGPRMM16AsmRegMovePOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getGPRMM16Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single GPR64 register operand.
+ /// Asserts that N is 1; getGPR64Reg() asserts that the operand is a
+ /// k_RegisterIndex compatible with RegKind_GPR.
+ void addGPR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getGPR64Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single AFGR64 register operand.
+ /// N must be 1.
+ void addAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getAFGR64Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single FGR64 register operand.
+ /// N must be 1.
+ void addFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getFGR64Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single FGR32 register operand. N must
+ /// be 1. After adding the operand, an error is printed if the register index
+ /// is odd while AsmParser.useOddSPReg() is false.
+ void addFGR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getFGR32Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+   // FIXME: We ought to do this for -integrated-as without -via-file-asm too.
+   // FIXME: This should propagate failure up to parseStatement.
+   const bool IsOddIndex = RegIdx.Index & 1;
+   if (!AsmParser.useOddSPReg() && IsOddIndex)
+     AsmParser.getParser().printError(
+         StartLoc, "-mno-odd-spreg prohibits the use of odd FPU "
+                   "registers");
+ }
+
+ /// Append this operand to Inst as a single FGRH32 register operand.
+ /// N must be 1.
+ void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getFGRH32Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single FCC register operand.
+ /// N must be 1.
+ void addFCCAsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getFCCReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single MSA128 register operand.
+ /// N must be 1.
+ void addMSA128AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getMSA128Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single MSACtrl register operand.
+ /// N must be 1.
+ void addMSACtrlAsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getMSACtrlReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single COP0 register operand.
+ /// N must be 1.
+ void addCOP0AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getCOP0Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single COP2 register operand.
+ /// N must be 1.
+ void addCOP2AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getCOP2Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single COP3 register operand.
+ /// N must be 1.
+ void addCOP3AsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getCOP3Reg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single ACC64DSP register operand.
+ /// N must be 1.
+ void addACC64DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getACC64DSPReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single HI32DSP register operand.
+ /// N must be 1.
+ void addHI32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getHI32DSPReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single LO32DSP register operand.
+ /// N must be 1.
+ void addLO32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getLO32DSPReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single CCR register operand.
+ /// N must be 1.
+ void addCCRAsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getCCRReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append this operand to Inst as a single HWRegs register operand.
+ /// N must be 1.
+ void addHWRegsAsmRegOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const unsigned Reg = getHWRegsReg();
+   Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append the constant immediate to Inst as an unsigned value: subtract
+ /// Offset, keep the low Bits bits, then add Offset and AdjustOffset back.
+ /// N must be 1.
+ template <unsigned Bits, int Offset = 0, int AdjustOffset = 0>
+ void addConstantUImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   const uint64_t LowBitsMask = (1ULL << Bits) - 1;
+   uint64_t Value = getConstantImm() - Offset;
+   Value = (Value & LowBitsMask) + Offset;
+   Value += AdjustOffset;
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Append a signed immediate to Inst. A non-constant immediate is added via
+ /// addExpr(); otherwise the work is delegated to
+ /// addConstantSImmOperands<Bits, 0, 0>.
+ template <unsigned Bits>
+ void addSImmOperands(MCInst &Inst, unsigned N) const {
+   const bool IsSymbolic = isImm() && !isConstantImm();
+   if (IsSymbolic)
+     addExpr(Inst, getImm());
+   else
+     addConstantSImmOperands<Bits, 0, 0>(Inst, N);
+ }
+
+ /// Append an unsigned immediate to Inst. A non-constant immediate is added
+ /// via addExpr(); otherwise the work is delegated to
+ /// addConstantUImmOperands<Bits, 0, 0>.
+ template <unsigned Bits>
+ void addUImmOperands(MCInst &Inst, unsigned N) const {
+   const bool IsSymbolic = isImm() && !isConstantImm();
+   if (IsSymbolic)
+     addExpr(Inst, getImm());
+   else
+     addConstantUImmOperands<Bits, 0, 0>(Inst, N);
+ }
+
+ /// Append the constant immediate to Inst as a signed value: subtract Offset,
+ /// sign-extend from Bits bits, then add Offset and AdjustOffset back.
+ /// N must be 1.
+ template <unsigned Bits, int Offset = 0, int AdjustOffset = 0>
+ void addConstantSImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   int64_t Value = getConstantImm() - Offset;
+   Value = SignExtend64<Bits>(Value) + Offset;
+   Value += AdjustOffset;
+   Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ /// Append this immediate operand's expression to Inst via addExpr().
+ /// N must be 1.
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+   addExpr(Inst, getImm());
+ }
+
+ /// Append a memory operand to Inst as two operands: the base register
+ /// (GPR64 when the ABI reports 64-bit pointers, GPR32 otherwise) followed by
+ /// the offset expression. N must be 2.
+ void addMemOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+
+   unsigned BaseReg;
+   if (AsmParser.getABI().ArePtrs64bit())
+     BaseReg = getMemBase()->getGPR64Reg();
+   else
+     BaseReg = getMemBase()->getGPR32Reg();
+   Inst.addOperand(MCOperand::createReg(BaseReg));
+
+   addExpr(Inst, getMemOff());
+ }
+
+ /// Append a memory operand to Inst as two operands: the base register from
+ /// getGPRMM16Reg() followed by the offset expression. N must be 2.
+ void addMicroMipsMemOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+
+   const unsigned BaseReg = getMemBase()->getGPRMM16Reg();
+   Inst.addOperand(MCOperand::createReg(BaseReg));
+
+   addExpr(Inst, getMemOff());
+ }
+
+ /// Append every register in this operand's register list to Inst, one
+ /// register operand per entry. N must be 1.
+ void addRegListOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 1 && "Invalid number of operands!");
+
+   const SmallVectorImpl<unsigned> &Regs = getRegList();
+   for (unsigned Reg : Regs)
+     Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// Append a register pair to Inst as two register operands: the register at
+ /// the pair's index and the one at the following index. The GPR64 class is
+ /// used when the ABI reports 64-bit GPRs, GPR32 otherwise. N must be 2.
+ void addRegPairOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   assert((RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+   const unsigned FirstIdx = getRegPair();
+   AsmParser.warnIfRegIndexIsAT(FirstIdx, StartLoc);
+
+   const unsigned FirstClassID = AsmParser.getABI().AreGprs64bit()
+                                     ? Mips::GPR64RegClassID
+                                     : Mips::GPR32RegClassID;
+   const unsigned FirstReg =
+       RegIdx.RegInfo->getRegClass(FirstClassID).getRegister(FirstIdx);
+   Inst.addOperand(MCOperand::createReg(FirstReg));
+
+   const unsigned SecondClassID = AsmParser.getABI().AreGprs64bit()
+                                      ? Mips::GPR64RegClassID
+                                      : Mips::GPR32RegClassID;
+   const unsigned SecondReg =
+       RegIdx.RegInfo->getRegClass(SecondClassID).getRegister(FirstIdx + 1);
+   Inst.addOperand(MCOperand::createReg(SecondReg));
+ }
+
+ /// Append every register in this operand's register list to Inst.
+ /// N must be 2.
+ void addMovePRegPairOperands(MCInst &Inst, unsigned N) const {
+   assert(N == 2 && "Invalid number of operands!");
+   const SmallVectorImpl<unsigned> &Regs = getRegList();
+   for (unsigned Reg : Regs)
+     Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ /// True only for a GPR operand with index 0.
+ bool isReg() const override {
+   // Special case kept until the definition of div/divu is sorted out:
+   // $0/$zero is accepted here so that MCK_ZERO works correctly.
+   if (!isGPRAsmReg())
+     return false;
+   return RegIdx.Index == 0;
+ }
+ /// True when this operand is a k_RegisterIndex.
+ bool isRegIdx() const {
+   return k_RegisterIndex == Kind;
+ }
+ /// True when this operand is a k_Immediate.
+ bool isImm() const override {
+   return k_Immediate == Kind;
+ }
+ /// True when this operand is an immediate whose expression evaluates to an
+ /// absolute value.
+ bool isConstantImm() const {
+   if (!isImm())
+     return false;
+   int64_t Ignored;
+   return getImm()->evaluateAsAbsolute(Ignored);
+ }
+ /// True when this operand is a constant immediate equal to zero.
+ bool isConstantImmz() const {
+   if (!isConstantImm())
+     return false;
+   return getConstantImm() == 0;
+ }
+ /// True when this operand is a constant immediate and (value - Offset) fits
+ /// in an unsigned Bits-bit integer.
+ template <unsigned Bits, int Offset = 0> bool isConstantUImm() const {
+   if (!isConstantImm())
+     return false;
+   return isUInt<Bits>(getConstantImm() - Offset);
+ }
+ /// For a constant immediate, true when the value fits in a signed Bits-bit
+ /// integer. Otherwise true for any immediate operand.
+ template <unsigned Bits> bool isSImm() const {
+   if (isConstantImm())
+     return isInt<Bits>(getConstantImm());
+   return isImm();
+ }
+ /// For a constant immediate, true when the value fits in an unsigned
+ /// Bits-bit integer. Otherwise true for any immediate operand.
+ template <unsigned Bits> bool isUImm() const {
+   if (isConstantImm())
+     return isUInt<Bits>(getConstantImm());
+   return isImm();
+ }
+ /// For a constant immediate, true when the value fits in either a signed or
+ /// an unsigned Bits-bit integer. Otherwise true for any immediate operand.
+ template <unsigned Bits> bool isAnyImm() const {
+   if (!isConstantImm())
+     return isImm();
+   const int64_t Value = getConstantImm();
+   return isInt<Bits>(Value) || isUInt<Bits>(Value);
+ }
+ /// True when this operand is a constant immediate and (value - Offset) fits
+ /// in a signed Bits-bit integer.
+ template <unsigned Bits, int Offset = 0> bool isConstantSImm() const {
+   if (!isConstantImm())
+     return false;
+   return isInt<Bits>(getConstantImm() - Offset);
+ }
+ /// True when this operand is a constant immediate whose value lies in the
+ /// inclusive range [Bottom, Top].
+ template <unsigned Bottom, unsigned Top> bool isConstantUImmRange() const {
+   if (!isConstantImm())
+     return false;
+   const int64_t Value = getConstantImm();
+   return Value >= Bottom && Value <= Top;
+ }
+ /// True when this operand is a k_Token.
+ bool isToken() const override {
+   // Other operand kinds cannot pretend to be tokens: the matcher emitter
+   // checks tokens first.
+   return k_Token == Kind;
+ }
+ /// True when this operand is a k_Memory.
+ bool isMem() const override {
+   return k_Memory == Kind;
+ }
+ /// True when this is a memory operand whose offset is an MCConstantExpr.
+ bool isConstantMemOff() const {
+   if (!isMem())
+     return false;
+   return isa<MCConstantExpr>(getMemOff());
+ }
+ /// True for a memory operand with a GPR base whose offset is an
+ /// MCTargetExpr, a constant that satisfies isShiftedInt<Bits, ShiftAmount>,
+ /// or a relocatable expression whose constant part satisfies it.
+ /// Relocation operators are allowed.
+ // FIXME: This predicate and others need to look through binary expressions
+ // and determine whether a Value is a constant or not.
+ template <unsigned Bits, unsigned ShiftAmount = 0>
+ bool isMemWithSimmOffset() const {
+   if (!isMem() || !getMemBase()->isGPRAsmReg())
+     return false;
+
+   if (isa<MCTargetExpr>(getMemOff()))
+     return true;
+   if (isConstantMemOff() &&
+       isShiftedInt<Bits, ShiftAmount>(getConstantMemOff()))
+     return true;
+
+   MCValue Reloc;
+   if (!getMemOff()->evaluateAsRelocatable(Reloc, nullptr, nullptr))
+     return false;
+   return isShiftedInt<Bits, ShiftAmount>(Reloc.getConstant());
+ }
+ /// True for a memory operand whose base satisfies isMM16AsmReg().
+ bool isMemWithGRPMM16Base() const {
+   if (!isMem())
+     return false;
+   return getMemBase()->isMM16AsmReg();
+ }
+ /// True for a memory operand with a constant offset that fits in an
+ /// unsigned Bits-bit integer and whose base register resolves to Mips::SP.
+ template <unsigned Bits> bool isMemWithUimmOffsetSP() const {
+   if (!isMem() || !isConstantMemOff())
+     return false;
+   if (!isUInt<Bits>(getConstantMemOff()))
+     return false;
+   if (!getMemBase()->isRegIdx())
+     return false;
+   return getMemBase()->getGPR32Reg() == Mips::SP;
+ }
+ /// True for a memory operand with a constant offset that fits in an
+ /// unsigned Bits-bit integer, is a multiple of 4, and whose base register
+ /// resolves to Mips::SP.
+ template <unsigned Bits> bool isMemWithUimmWordAlignedOffsetSP() const {
+   if (!isMem() || !isConstantMemOff())
+     return false;
+   if (!isUInt<Bits>(getConstantMemOff()))
+     return false;
+   if (getConstantMemOff() % 4 != 0)
+     return false;
+   if (!getMemBase()->isRegIdx())
+     return false;
+   return getMemBase()->getGPR32Reg() == Mips::SP;
+ }
+ /// True for a memory operand with a constant offset that fits in a signed
+ /// Bits-bit integer, is a multiple of 4, and whose base register resolves to
+ /// Mips::GP.
+ template <unsigned Bits> bool isMemWithSimmWordAlignedOffsetGP() const {
+   if (!isMem() || !isConstantMemOff())
+     return false;
+   if (!isInt<Bits>(getConstantMemOff()))
+     return false;
+   if (getConstantMemOff() % 4 != 0)
+     return false;
+   if (!getMemBase()->isRegIdx())
+     return false;
+   return getMemBase()->getGPR32Reg() == Mips::GP;
+ }
+ /// True when this operand is a constant immediate that satisfies
+ /// isShiftedUInt<Bits, ShiftLeftAmount>.
+ template <unsigned Bits, unsigned ShiftLeftAmount>
+ bool isScaledUImm() const {
+   if (!isConstantImm())
+     return false;
+   return isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm());
+ }
+ /// True when this operand is a constant immediate that satisfies
+ /// isShiftedInt<Bits, ShiftLeftAmount>, or an immediate that evaluates as
+ /// relocatable with a constant part that satisfies it.
+ template <unsigned Bits, unsigned ShiftLeftAmount>
+ bool isScaledSImm() const {
+   const bool IsFittingConstant =
+       isConstantImm() && isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm());
+   if (IsFittingConstant)
+     return true;
+   // For relocations the operand can also be a symbol or symbol plus offset.
+   if (Kind != k_Immediate)
+     return false;
+   MCValue Reloc;
+   if (!getImm()->evaluateAsRelocatable(Reloc, nullptr, nullptr))
+     return false;
+   return isShiftedInt<Bits, ShiftLeftAmount>(Reloc.getConstant());
+ }
+ /// True for a register list of 2 to 5 entries that starts with S0 and ends
+ /// with RA (or S0_64 and RA_64), where every entry between the first and the
+ /// last is exactly one greater than its predecessor.
+ bool isRegList16() const {
+   if (!isRegList())
+     return false;
+
+   const auto &Regs = *RegList.List;
+   const int Count = Regs.size();
+   if (Count < 2 || Count > 5)
+     return false;
+
+   const unsigned First = Regs.front();
+   const unsigned Last = Regs.back();
+   const bool Is32BitShape = First == Mips::S0 && Last == Mips::RA;
+   const bool Is64BitShape = First == Mips::S0_64 && Last == Mips::RA_64;
+   if (!Is32BitShape && !Is64BitShape)
+     return false;
+
+   // The final entry is excluded from the consecutiveness check.
+   int Expected = static_cast<int>(First) + 1;
+   for (int Idx = 1; Idx < Count - 1; ++Idx) {
+     const int Current = Regs[Idx];
+     if (Current != Expected)
+       return false;
+     Expected = Current + 1;
+   }
+
+   return true;
+ }
+ /// True when this operand is a k_Immediate.
+ bool isInvNum() const {
+   return k_Immediate == Kind;
+ }
+ /// True when this operand is a constant immediate in the range [1, 4].
+ bool isLSAImm() const {
+   if (!isConstantImm())
+     return false;
+   const int64_t Amount = getConstantImm();
+   return Amount >= 1 && Amount <= 4;
+ }
+ /// True when this operand is a k_RegList.
+ bool isRegList() const {
+   return k_RegList == Kind;
+ }
+ /// True for a two-entry register list whose (first, second) pair is one of:
+ /// (A1,A2), (A1,A3), (A2,A3), (A0,S5), (A0,S6), (A0,A1), (A0,A2), (A0,A3),
+ /// or the same pairs using the _64 registers.
+ bool isMovePRegPair() const {
+   if (Kind != k_RegList || RegList.List->size() != 2)
+     return false;
+
+   const unsigned First = RegList.List->front();
+   const unsigned Second = RegList.List->back();
+
+   // 32-bit register pairs, grouped by the first register.
+   if (First == Mips::A0)
+     return Second == Mips::S5 || Second == Mips::S6 || Second == Mips::A1 ||
+            Second == Mips::A2 || Second == Mips::A3;
+   if (First == Mips::A1)
+     return Second == Mips::A2 || Second == Mips::A3;
+   if (First == Mips::A2)
+     return Second == Mips::A3;
+
+   // 64-bit register pairs, grouped by the first register.
+   if (First == Mips::A0_64)
+     return Second == Mips::S5_64 || Second == Mips::S6_64 ||
+            Second == Mips::A1_64 || Second == Mips::A2_64 ||
+            Second == Mips::A3_64;
+   if (First == Mips::A1_64)
+     return Second == Mips::A2_64 || Second == Mips::A3_64;
+   if (First == Mips::A2_64)
+     return Second == Mips::A3_64;
+
+   return false;
+ }
+
+ /// Return the text of this token operand. Asserts that the operand is a
+ /// k_Token.
+ StringRef getToken() const {
+   assert(Kind == k_Token && "Invalid access!");
+   StringRef Text(Tok.Data, Tok.Length);
+   return Text;
+ }
+ /// True when this operand is a k_RegPair whose index is at most 30.
+ bool isRegPair() const {
+   if (Kind != k_RegPair)
+     return false;
+   return RegIdx.Index <= 30;
+ }
+
+ /// Return the GPR32 register for a GPR register-index operand with index 0.
+ /// Any other operand is treated as unreachable.
+ unsigned getReg() const override {
+   // Special case kept until the definition of div/divu is sorted out:
+   // $0/$zero is accepted here so that MCK_ZERO works correctly.
+   const bool IsZeroGPR = Kind == k_RegisterIndex && RegIdx.Index == 0 &&
+                          (RegIdx.Kind & RegKind_GPR);
+   if (IsZeroGPR)
+     return getGPR32Reg(); // FIXME: GPR64 too
+
+   llvm_unreachable("Invalid access!");
+   return 0;
+ }
+
+ /// Return the expression held by this immediate operand. Asserts that the
+ /// operand is a k_Immediate.
+ const MCExpr *getImm() const {
+   assert((Kind == k_Immediate) && "Invalid access!");
+   const MCExpr *Expr = Imm.Val;
+   return Expr;
+ }
+
+ /// Evaluate this immediate as an absolute value and return it. The result
+ /// of the evaluation call is ignored, so 0 is returned if the call leaves
+ /// the output untouched.
+ int64_t getConstantImm() const {
+   int64_t Result = 0;
+   (void)getImm()->evaluateAsAbsolute(Result);
+   return Result;
+ }
+
+ /// Return the base operand of this memory operand. Asserts that the operand
+ /// is a k_Memory.
+ MipsOperand *getMemBase() const {
+   assert((Kind == k_Memory) && "Invalid access!");
+   MipsOperand *Base = Mem.Base;
+   return Base;
+ }
+
+ /// Return the offset expression of this memory operand. Asserts that the
+ /// operand is a k_Memory.
+ const MCExpr *getMemOff() const {
+   assert((Kind == k_Memory) && "Invalid access!");
+   const MCExpr *Offset = Mem.Off;
+   return Offset;
+ }
+
+ /// Return the value of the memory offset, treating it as an MCConstantExpr.
+ /// The cast is unchecked; callers in this class test isConstantMemOff()
+ /// first.
+ int64_t getConstantMemOff() const {
+   const auto *Constant = static_cast<const MCConstantExpr *>(getMemOff());
+   return Constant->getValue();
+ }
+
+ /// Return the register list held by this operand. Asserts that the operand
+ /// is a k_RegList.
+ const SmallVectorImpl<unsigned> &getRegList() const {
+   assert((Kind == k_RegList) && "Invalid access!");
+   const SmallVectorImpl<unsigned> &Regs = *RegList.List;
+   return Regs;
+ }
+
+ /// Return the register index of this register-pair operand. Asserts that
+ /// the operand is a k_RegPair.
+ unsigned getRegPair() const {
+   assert((Kind == k_RegPair) && "Invalid access!");
+   const unsigned FirstIdx = RegIdx.Index;
+   return FirstIdx;
+ }
+
+ /// Build a k_Token operand that refers to Str's characters (not copied).
+ /// Both the start and the end location are set to S.
+ static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
+                                                 MipsAsmParser &Parser) {
+   auto TokenOp = make_unique<MipsOperand>(k_Token, Parser);
+   TokenOp->StartLoc = S;
+   TokenOp->EndLoc = S;
+   TokenOp->Tok.Length = Str.size();
+   TokenOp->Tok.Data = Str.data();
+   return TokenOp;
+ }
+
+ /// Build a register operand of kind RegKind_Numeric (e.g. $1). Which actual
+ /// register it denotes stays unresolved until an instruction matches.
+ static std::unique_ptr<MipsOperand>
+ createNumericReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+                  SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+   DEBUG(dbgs() << "createNumericReg(" << Index << ", ...)\n");
+   auto RegOp = CreateReg(Index, Str, RegKind_Numeric, RegInfo, S, E, Parser);
+   return RegOp;
+ }
+
+ /// Build a register operand of kind RegKind_GPR.
+ /// Typically used only for named registers such as $gp.
+ static std::unique_ptr<MipsOperand>
+ createGPRReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+              SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+   auto RegOp = CreateReg(Index, Str, RegKind_GPR, RegInfo, S, E, Parser);
+   return RegOp;
+ }
+
+ /// Build a register operand of kind RegKind_FGR.
+ /// Typically used only for named registers such as $f0.
+ static std::unique_ptr<MipsOperand>
+ createFGRReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+              SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+   auto RegOp = CreateReg(Index, Str, RegKind_FGR, RegInfo, S, E, Parser);
+   return RegOp;
+ }
+
+ /// Build a register operand of kind RegKind_HWRegs.
+ /// Typically used only for named registers such as $hwr_cpunum.
+ static std::unique_ptr<MipsOperand>
+ createHWRegsReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+                 SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+   auto RegOp = CreateReg(Index, Str, RegKind_HWRegs, RegInfo, S, E, Parser);
+   return RegOp;
+ }
+
+ /// Build a register operand of kind RegKind_FCC.
+ /// Typically used only for named registers such as $fcc0.
+ static std::unique_ptr<MipsOperand>
+ createFCCReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+              SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+   auto RegOp = CreateReg(Index, Str, RegKind_FCC, RegInfo, S, E, Parser);
+   return RegOp;
+ }
+
+ /// Build a register operand of kind RegKind_ACC.
+ /// Typically used only for named registers such as $ac0.
+ static std::unique_ptr<MipsOperand>
+ createACCReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+              SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+   auto RegOp = CreateReg(Index, Str, RegKind_ACC, RegInfo, S, E, Parser);
+   return RegOp;
+ }
+
+ /// Build a register operand of kind RegKind_MSA128.
+ /// Typically used only for named registers such as $w0.
+ static std::unique_ptr<MipsOperand>
+ createMSA128Reg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+                 SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+   auto RegOp = CreateReg(Index, Str, RegKind_MSA128, RegInfo, S, E, Parser);
+   return RegOp;
+ }
+
+ /// Build a register operand of kind RegKind_MSACtrl.
+ /// Typically used only for named registers such as $msaaccess.
+ static std::unique_ptr<MipsOperand>
+ createMSACtrlReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+                  SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+   auto RegOp = CreateReg(Index, Str, RegKind_MSACtrl, RegInfo, S, E, Parser);
+   return RegOp;
+ }
+
+ /// Build a k_Immediate operand holding Val, spanning locations S to E.
+ static std::unique_ptr<MipsOperand>
+ CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+   auto ImmOp = make_unique<MipsOperand>(k_Immediate, Parser);
+   ImmOp->StartLoc = S;
+   ImmOp->EndLoc = E;
+   ImmOp->Imm.Val = Val;
+   return ImmOp;
+ }
+
+ /// Build a k_Memory operand from a base operand and an offset expression,
+ /// spanning locations S to E. Ownership of Base is released into the new
+ /// operand's Mem.Base pointer.
+ static std::unique_ptr<MipsOperand>
+ CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S,
+           SMLoc E, MipsAsmParser &Parser) {
+   auto MemOp = make_unique<MipsOperand>(k_Memory, Parser);
+   MemOp->StartLoc = S;
+   MemOp->EndLoc = E;
+   MemOp->Mem.Off = Off;
+   MemOp->Mem.Base = Base.release();
+   return MemOp;
+ }
+
+ /// Build a k_RegList operand holding a heap-allocated copy of Regs, spanning
+ /// StartLoc to EndLoc. Regs must not be empty.
+ static std::unique_ptr<MipsOperand>
+ CreateRegList(SmallVectorImpl<unsigned> &Regs, SMLoc StartLoc, SMLoc EndLoc,
+               MipsAsmParser &Parser) {
+   assert (Regs.size() > 0 && "Empty list not allowed");
+
+   auto ListOp = make_unique<MipsOperand>(k_RegList, Parser);
+   ListOp->RegList.List =
+       new SmallVector<unsigned, 10>(Regs.begin(), Regs.end());
+   ListOp->EndLoc = EndLoc;
+   ListOp->StartLoc = StartLoc;
+   return ListOp;
+ }
+
+ /// Build a k_RegPair operand that copies the register index, register info
+ /// and register kind from MOP, spanning locations S to E.
+ static std::unique_ptr<MipsOperand> CreateRegPair(const MipsOperand &MOP,
+                                                   SMLoc S, SMLoc E,
+                                                   MipsAsmParser &Parser) {
+   auto PairOp = make_unique<MipsOperand>(k_RegPair, Parser);
+   PairOp->StartLoc = S;
+   PairOp->EndLoc = E;
+   PairOp->RegIdx.Kind = MOP.RegIdx.Kind;
+   PairOp->RegIdx.RegInfo = MOP.RegIdx.RegInfo;
+   PairOp->RegIdx.Index = MOP.RegIdx.Index;
+   return PairOp;
+ }
+
+ /// True for a register-index operand that may be a GPR and whose index is
+ /// at most 31.
+ bool isGPRAsmReg() const {
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_GPR) && RegIdx.Index <= 31;
+ }
+ /// True for a register-index operand with a non-zero kind mask whose index
+ /// is 2-7, 16 or 17.
+ bool isMM16AsmReg() const {
+   if (!isRegIdx() || !RegIdx.Kind)
+     return false;
+   const bool InLowRange = RegIdx.Index >= 2 && RegIdx.Index <= 7;
+   return InLowRange || RegIdx.Index == 16 || RegIdx.Index == 17;
+ }
+ /// True for a register-index operand with a non-zero kind mask whose index
+ /// is 0, 2-7 or 17.
+ bool isMM16AsmRegZero() const {
+   if (!isRegIdx() || !RegIdx.Kind)
+     return false;
+   const bool InLowRange = RegIdx.Index >= 2 && RegIdx.Index <= 7;
+   return RegIdx.Index == 0 || InLowRange || RegIdx.Index == 17;
+ }
+ /// True for a register-index operand with a non-zero kind mask whose index
+ /// is 0, 2-3 or 16-20.
+ bool isMM16AsmRegMoveP() const {
+   if (!isRegIdx() || !RegIdx.Kind)
+     return false;
+   const bool InLowRange = RegIdx.Index >= 2 && RegIdx.Index <= 3;
+   const bool InHighRange = RegIdx.Index >= 16 && RegIdx.Index <= 20;
+   return RegIdx.Index == 0 || InLowRange || InHighRange;
+ }
+ /// True for a register-index operand that may be an FGR and whose index is
+ /// at most 31.
+ bool isFGRAsmReg() const {
+   // AFGR64 is $0-$15 but that is handled in getAFGR64Reg().
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_FGR) && RegIdx.Index <= 31;
+ }
+ /// True for a register-index operand that may be a HWRegs register and
+ /// whose index is at most 31.
+ bool isHWRegsAsmReg() const {
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_HWRegs) && RegIdx.Index <= 31;
+ }
+ /// True for a register-index operand that may be a CCR register and whose
+ /// index is at most 31.
+ bool isCCRAsmReg() const {
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_CCR) && RegIdx.Index <= 31;
+ }
+ /// True for a register-index operand that may be an FCC register and whose
+ /// index is valid: 0-7 when the parser reports eight FCC registers,
+ /// otherwise only 0.
+ bool isFCCAsmReg() const {
+   if (!isRegIdx() || !(RegIdx.Kind & RegKind_FCC))
+     return false;
+   if (AsmParser.hasEightFccRegisters())
+     return RegIdx.Index <= 7;
+   return RegIdx.Index == 0;
+ }
+ /// True for a register-index operand that may be an ACC register and whose
+ /// index is at most 3.
+ bool isACCAsmReg() const {
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_ACC) && RegIdx.Index <= 3;
+ }
+ /// True for a register-index operand that may be a COP0 register and whose
+ /// index is at most 31.
+ bool isCOP0AsmReg() const {
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_COP0) && RegIdx.Index <= 31;
+ }
+ /// True for a register-index operand that may be a COP2 register and whose
+ /// index is at most 31.
+ bool isCOP2AsmReg() const {
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_COP2) && RegIdx.Index <= 31;
+ }
+ /// True for a register-index operand that may be a COP3 register and whose
+ /// index is at most 31.
+ bool isCOP3AsmReg() const {
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_COP3) && RegIdx.Index <= 31;
+ }
+ /// True for a register-index operand that may be an MSA128 register and
+ /// whose index is at most 31.
+ bool isMSA128AsmReg() const {
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_MSA128) && RegIdx.Index <= 31;
+ }
+ /// True for a register-index operand that may be an MSACtrl register and
+ /// whose index is at most 7.
+ bool isMSACtrlAsmReg() const {
+   if (!isRegIdx())
+     return false;
+   return (RegIdx.Kind & RegKind_MSACtrl) && RegIdx.Index <= 7;
+ }
+
+ /// Return the source location of the first token of this operand.
+ SMLoc getStartLoc() const override {
+   return StartLoc;
+ }
+ /// Return the source location of the last token of this operand.
+ SMLoc getEndLoc() const override {
+   return EndLoc;
+ }
+
+ /// Release the heap objects this operand holds through raw pointers: the
+ /// base operand of a memory operand (stored by CreateMem) and the register
+ /// vector of a register-list operand (allocated by CreateRegList). The
+ /// other kinds have nothing to free here.
+ virtual ~MipsOperand() {
+   switch (Kind) {
+   case k_Memory:
+     delete Mem.Base;
+     break;
+   case k_RegList:
+     delete RegList.List;
+     // Explicit break: the original fell through into the no-op cases below,
+     // which was harmless but fragile if a case is ever added in between.
+     break;
+   case k_Immediate:
+   case k_RegisterIndex:
+   case k_Token:
+   case k_RegPair:
+     break;
+   }
+ }
+
+ /// Write a debug representation of this operand to OS. The format depends
+ /// on the operand kind: Imm<...>, Mem<base, off>, RegIdx<index:kind, text>,
+ /// the raw token text, RegList< r0 r1 ... >, or RegPair<n,n+1>.
+ void print(raw_ostream &OS) const override {
+   switch (Kind) {
+   case k_Immediate:
+     OS << "Imm<" << *Imm.Val << ">";
+     break;
+   case k_Memory:
+     OS << "Mem<";
+     Mem.Base->print(OS);
+     OS << ", " << *Mem.Off << ">";
+     break;
+   case k_RegisterIndex: {
+     const StringRef Text(RegIdx.Tok.Data, RegIdx.Tok.Length);
+     OS << "RegIdx<" << RegIdx.Index << ":" << RegIdx.Kind << ", " << Text
+        << ">";
+     break;
+   }
+   case k_Token:
+     OS << getToken();
+     break;
+   case k_RegList:
+     OS << "RegList< ";
+     for (auto Entry : (*RegList.List))
+       OS << Entry << " ";
+     OS << ">";
+     break;
+   case k_RegPair:
+     OS << "RegPair<" << RegIdx.Index << "," << RegIdx.Index + 1 << ">";
+     break;
+   }
+ }
+
+ /// Decide whether this operand may be tied to Other. Operands of different
+ /// kinds never tie; two register-index operands tie when their source
+ /// tokens are textually equal. Any other matching kind is treated as
+ /// unreachable.
+ bool isValidForTie(const MipsOperand &Other) const {
+   if (Kind != Other.Kind)
+     return false;
+
+   if (Kind != k_RegisterIndex) {
+     llvm_unreachable("Unexpected kind");
+     return false;
+   }
+
+   const StringRef Mine(RegIdx.Tok.Data, RegIdx.Tok.Length);
+   const StringRef Theirs(Other.RegIdx.Tok.Data, Other.RegIdx.Tok.Length);
+   return Mine == Theirs;
+ }
+}; // class MipsOperand
+} // namespace
+
+namespace llvm {
+extern const MCInstrDesc MipsInsts[];
+}
+/// Return the MipsInsts table entry for Opcode. The index is not
+/// bounds-checked.
+static const MCInstrDesc &getInstDesc(unsigned Opcode) {
+  const MCInstrDesc &Desc = MipsInsts[Opcode];
+  return Desc;
+}
+
+/// True for the opcodes JALS_MM, JALRS_MM, JALRS16_MM, BGEZALS_MM and
+/// BLTZALS_MM; false for every other opcode.
+static bool hasShortDelaySlot(unsigned Opcode) {
+  const bool IsShortJump = Opcode == Mips::JALS_MM ||
+                           Opcode == Mips::JALRS_MM ||
+                           Opcode == Mips::JALRS16_MM;
+  const bool IsShortBranch = Opcode == Mips::BGEZALS_MM ||
+                             Opcode == Mips::BLTZALS_MM;
+  return IsShortJump || IsShortBranch;
+}
+
+/// Find a symbol referenced by Expr. A symbol reference yields its symbol; a
+/// binary expression yields the symbol found in its LHS, or else the one
+/// found in its RHS; a unary expression is searched through its
+/// sub-expression. Returns nullptr when no symbol is found.
+static const MCSymbol *getSingleMCSymbol(const MCExpr *Expr) {
+  if (const auto *SymRef = dyn_cast<MCSymbolRefExpr>(Expr))
+    return &SymRef->getSymbol();
+
+  if (const auto *Binary = dyn_cast<MCBinaryExpr>(Expr)) {
+    // The LHS result always wins, so the RHS is only searched when the LHS
+    // has no symbol. This avoids a needless traversal of the RHS subtree.
+    if (const MCSymbol *LHSSym = getSingleMCSymbol(Binary->getLHS()))
+      return LHSSym;
+    return getSingleMCSymbol(Binary->getRHS());
+  }
+
+  if (const auto *Unary = dyn_cast<MCUnaryExpr>(Expr))
+    return getSingleMCSymbol(Unary->getSubExpr());
+
+  return nullptr;
+}
+
+/// Count the MCSymbolRefExpr nodes in Expr, looking through binary and unary
+/// expressions. Any other expression kind counts as zero.
+static unsigned countMCSymbolRefExpr(const MCExpr *Expr) {
+  if (isa<MCSymbolRefExpr>(Expr))
+    return 1;
+
+  if (const auto *Binary = dyn_cast<MCBinaryExpr>(Expr)) {
+    const unsigned InLHS = countMCSymbolRefExpr(Binary->getLHS());
+    const unsigned InRHS = countMCSymbolRefExpr(Binary->getRHS());
+    return InLHS + InRHS;
+  }
+
+  if (const auto *Unary = dyn_cast<MCUnaryExpr>(Expr))
+    return countMCSymbolRefExpr(Unary->getSubExpr());
+
+  return 0;
+}
+
+bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+ bool ExpandedJalSym = false;
+
+ Inst.setLoc(IDLoc);
+
+ if (MCID.isBranch() || MCID.isCall()) {
+ const unsigned Opcode = Inst.getOpcode();
+ MCOperand Offset;
+
+ switch (Opcode) {
+ default:
+ break;
+ case Mips::BBIT0:
+ case Mips::BBIT032:
+ case Mips::BBIT1:
+ case Mips::BBIT132:
+ assert(hasCnMips() && "instruction only valid for octeon cpus");
+ LLVM_FALLTHROUGH;
+
+ case Mips::BEQ:
+ case Mips::BNE:
+ case Mips::BEQ_MM:
+ case Mips::BNE_MM:
+ assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+ Offset = Inst.getOperand(2);
+ if (!Offset.isImm())
+ break; // We'll deal with this situation later on when applying fixups.
+ if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
+ return Error(IDLoc, "branch target out of range");
+ if (OffsetToAlignment(Offset.getImm(),
+ 1LL << (inMicroMipsMode() ? 1 : 2)))
+ return Error(IDLoc, "branch to misaligned address");
+ break;
+ case Mips::BGEZ:
+ case Mips::BGTZ:
+ case Mips::BLEZ:
+ case Mips::BLTZ:
+ case Mips::BGEZAL:
+ case Mips::BLTZAL:
+ case Mips::BC1F:
+ case Mips::BC1T:
+ case Mips::BGEZ_MM:
+ case Mips::BGTZ_MM:
+ case Mips::BLEZ_MM:
+ case Mips::BLTZ_MM:
+ case Mips::BGEZAL_MM:
+ case Mips::BLTZAL_MM:
+ case Mips::BC1F_MM:
+ case Mips::BC1T_MM:
+ case Mips::BC1EQZC_MMR6:
+ case Mips::BC1NEZC_MMR6:
+ case Mips::BC2EQZC_MMR6:
+ case Mips::BC2NEZC_MMR6:
+ assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+ Offset = Inst.getOperand(1);
+ if (!Offset.isImm())
+ break; // We'll deal with this situation later on when applying fixups.
+ if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
+ return Error(IDLoc, "branch target out of range");
+ if (OffsetToAlignment(Offset.getImm(),
+ 1LL << (inMicroMipsMode() ? 1 : 2)))
+ return Error(IDLoc, "branch to misaligned address");
+ break;
+ case Mips::BGEC: case Mips::BGEC_MMR6:
+ case Mips::BLTC: case Mips::BLTC_MMR6:
+ case Mips::BGEUC: case Mips::BGEUC_MMR6:
+ case Mips::BLTUC: case Mips::BLTUC_MMR6:
+ case Mips::BEQC: case Mips::BEQC_MMR6:
+ case Mips::BNEC: case Mips::BNEC_MMR6:
+ assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+ Offset = Inst.getOperand(2);
+ if (!Offset.isImm())
+ break; // We'll deal with this situation later on when applying fixups.
+ if (!isIntN(18, Offset.getImm()))
+ return Error(IDLoc, "branch target out of range");
+ if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
+ return Error(IDLoc, "branch to misaligned address");
+ break;
+ case Mips::BLEZC: case Mips::BLEZC_MMR6:
+ case Mips::BGEZC: case Mips::BGEZC_MMR6:
+ case Mips::BGTZC: case Mips::BGTZC_MMR6:
+ case Mips::BLTZC: case Mips::BLTZC_MMR6:
+ assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+ Offset = Inst.getOperand(1);
+ if (!Offset.isImm())
+ break; // We'll deal with this situation later on when applying fixups.
+ if (!isIntN(18, Offset.getImm()))
+ return Error(IDLoc, "branch target out of range");
+ if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
+ return Error(IDLoc, "branch to misaligned address");
+ break;
+ case Mips::BEQZC: case Mips::BEQZC_MMR6:
+ case Mips::BNEZC: case Mips::BNEZC_MMR6:
+ assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+ Offset = Inst.getOperand(1);
+ if (!Offset.isImm())
+ break; // We'll deal with this situation later on when applying fixups.
+ if (!isIntN(23, Offset.getImm()))
+ return Error(IDLoc, "branch target out of range");
+ if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
+ return Error(IDLoc, "branch to misaligned address");
+ break;
+ case Mips::BEQZ16_MM:
+ case Mips::BEQZC16_MMR6:
+ case Mips::BNEZ16_MM:
+ case Mips::BNEZC16_MMR6:
+ assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+ Offset = Inst.getOperand(1);
+ if (!Offset.isImm())
+ break; // We'll deal with this situation later on when applying fixups.
+ if (!isInt<8>(Offset.getImm()))
+ return Error(IDLoc, "branch target out of range");
+ if (OffsetToAlignment(Offset.getImm(), 2LL))
+ return Error(IDLoc, "branch to misaligned address");
+ break;
+ }
+ }
+
+ // SSNOP is deprecated on MIPS32r6/MIPS64r6
+ // We still accept it but it is a normal nop.
+ if (hasMips32r6() && Inst.getOpcode() == Mips::SSNOP) {
+ std::string ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
+ Warning(IDLoc, "ssnop is deprecated for " + ISA + " and is equivalent to a "
+ "nop instruction");
+ }
+
+ if (hasCnMips()) {
+ const unsigned Opcode = Inst.getOpcode();
+ MCOperand Opnd;
+ int Imm;
+
+ switch (Opcode) {
+ default:
+ break;
+
+ case Mips::BBIT0:
+ case Mips::BBIT032:
+ case Mips::BBIT1:
+ case Mips::BBIT132:
+ assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+ // The offset is handled above
+ Opnd = Inst.getOperand(1);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (Imm < 0 || Imm > (Opcode == Mips::BBIT0 ||
+ Opcode == Mips::BBIT1 ? 63 : 31))
+ return Error(IDLoc, "immediate operand value out of range");
+ if (Imm > 31) {
+ Inst.setOpcode(Opcode == Mips::BBIT0 ? Mips::BBIT032
+ : Mips::BBIT132);
+ Inst.getOperand(1).setImm(Imm - 32);
+ }
+ break;
+
+ case Mips::SEQi:
+ case Mips::SNEi:
+ assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+ Opnd = Inst.getOperand(2);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (!isInt<10>(Imm))
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ }
+ }
+
+ // For PIC code convert unconditional jump to unconditional branch.
+ if ((Inst.getOpcode() == Mips::J || Inst.getOpcode() == Mips::J_MM) &&
+ inPicMode()) {
+ MCInst BInst;
+ BInst.setOpcode(inMicroMipsMode() ? Mips::BEQ_MM : Mips::BEQ);
+ BInst.addOperand(MCOperand::createReg(Mips::ZERO));
+ BInst.addOperand(MCOperand::createReg(Mips::ZERO));
+ BInst.addOperand(Inst.getOperand(0));
+ Inst = BInst;
+ }
+
+ // This expansion is not in a function called by tryExpandInstruction()
+ // because the pseudo-instruction doesn't have a distinct opcode.
+ if ((Inst.getOpcode() == Mips::JAL || Inst.getOpcode() == Mips::JAL_MM) &&
+ inPicMode()) {
+ warnIfNoMacro(IDLoc);
+
+ const MCExpr *JalExpr = Inst.getOperand(0).getExpr();
+
+ // We can do this expansion if there's only 1 symbol in the argument
+ // expression.
+ if (countMCSymbolRefExpr(JalExpr) > 1)
+ return Error(IDLoc, "jal doesn't support multiple symbols in PIC mode");
+
+ // FIXME: This is checking the expression can be handled by the later stages
+ // of the assembler. We ought to leave it to those later stages.
+ const MCSymbol *JalSym = getSingleMCSymbol(JalExpr);
+
+ // FIXME: Add support for label+offset operands (currently causes an error).
+ // FIXME: Add support for forward-declared local symbols.
+ // FIXME: Add expansion for when the LargeGOT option is enabled.
+ if (JalSym->isInSection() || JalSym->isTemporary() ||
+ (JalSym->isELF() && cast<MCSymbolELF>(JalSym)->getBinding() == ELF::STB_LOCAL)) {
+ if (isABI_O32()) {
+ // If it's a local symbol and the O32 ABI is being used, we expand to:
+ // lw $25, 0($gp)
+ // R_(MICRO)MIPS_GOT16 label
+ // addiu $25, $25, 0
+ // R_(MICRO)MIPS_LO16 label
+ // jalr $25
+ const MCExpr *Got16RelocExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT, JalExpr, getContext());
+ const MCExpr *Lo16RelocExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, JalExpr, getContext());
+
+ TOut.emitRRX(Mips::LW, Mips::T9, Mips::GP,
+ MCOperand::createExpr(Got16RelocExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::ADDiu, Mips::T9, Mips::T9,
+ MCOperand::createExpr(Lo16RelocExpr), IDLoc, STI);
+ } else if (isABI_N32() || isABI_N64()) {
+ // If it's a local symbol and the N32/N64 ABIs are being used,
+ // we expand to:
+ // lw/ld $25, 0($gp)
+ // R_(MICRO)MIPS_GOT_DISP label
+ // jalr $25
+ const MCExpr *GotDispRelocExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, JalExpr, getContext());
+
+ TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9,
+ Mips::GP, MCOperand::createExpr(GotDispRelocExpr), IDLoc,
+ STI);
+ }
+ } else {
+ // If it's an external/weak symbol, we expand to:
+ // lw/ld $25, 0($gp)
+ // R_(MICRO)MIPS_CALL16 label
+ // jalr $25
+ const MCExpr *Call16RelocExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, JalExpr, getContext());
+
+ TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP,
+ MCOperand::createExpr(Call16RelocExpr), IDLoc, STI);
+ }
+
+ MCInst JalrInst;
+ if (IsCpRestoreSet && inMicroMipsMode())
+ JalrInst.setOpcode(Mips::JALRS_MM);
+ else
+ JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR);
+ JalrInst.addOperand(MCOperand::createReg(Mips::RA));
+ JalrInst.addOperand(MCOperand::createReg(Mips::T9));
+
+ // FIXME: Add an R_(MICRO)MIPS_JALR relocation after the JALR.
+ // This relocation is supposed to be an optimization hint for the linker
+ // and is not necessary for correctness.
+
+ Inst = JalrInst;
+ ExpandedJalSym = true;
+ }
+
+ bool IsPCRelativeLoad = (MCID.TSFlags & MipsII::IsPCRelativeLoad) != 0;
+ if ((MCID.mayLoad() || MCID.mayStore()) && !IsPCRelativeLoad) {
+ // Check the offset of memory operand, if it is a symbol
+ // reference or immediate we may have to expand instructions.
+ for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
+ const MCOperandInfo &OpInfo = MCID.OpInfo[i];
+ if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+ (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+ MCOperand &Op = Inst.getOperand(i);
+ if (Op.isImm()) {
+ int MemOffset = Op.getImm();
+ if (MemOffset < -32768 || MemOffset > 32767) {
+ // Offset can't exceed 16bit value.
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), true);
+ return getParser().hasPendingError();
+ }
+ } else if (Op.isExpr()) {
+ const MCExpr *Expr = Op.getExpr();
+ if (Expr->getKind() == MCExpr::SymbolRef) {
+ const MCSymbolRefExpr *SR =
+ static_cast<const MCSymbolRefExpr *>(Expr);
+ if (SR->getKind() == MCSymbolRefExpr::VK_None) {
+ // Expand symbol.
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
+ return getParser().hasPendingError();
+ }
+ } else if (!isEvaluated(Expr)) {
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
+ return getParser().hasPendingError();
+ }
+ }
+ }
+ } // for
+ } // if load/store
+
+ if (inMicroMipsMode()) {
+ if (MCID.mayLoad()) {
+ // Try to create 16-bit GP relative load instruction.
+ for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
+ const MCOperandInfo &OpInfo = MCID.OpInfo[i];
+ if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+ (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+ MCOperand &Op = Inst.getOperand(i);
+ if (Op.isImm()) {
+ int MemOffset = Op.getImm();
+ MCOperand &DstReg = Inst.getOperand(0);
+ MCOperand &BaseReg = Inst.getOperand(1);
+ if (isInt<9>(MemOffset) && (MemOffset % 4 == 0) &&
+ getContext().getRegisterInfo()->getRegClass(
+ Mips::GPRMM16RegClassID).contains(DstReg.getReg()) &&
+ (BaseReg.getReg() == Mips::GP ||
+ BaseReg.getReg() == Mips::GP_64)) {
+
+ TOut.emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset,
+ IDLoc, STI);
+ return false;
+ }
+ }
+ }
+ } // for
+ } // if load
+
+ // TODO: Handle this with the AsmOperandClass.PredicateMethod.
+
+ MCOperand Opnd;
+ int Imm;
+
+ switch (Inst.getOpcode()) {
+ default:
+ break;
+ case Mips::ADDIUSP_MM:
+ Opnd = Inst.getOperand(0);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (Imm < -1032 || Imm > 1028 || (Imm < 8 && Imm > -12) ||
+ Imm % 4 != 0)
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ case Mips::SLL16_MM:
+ case Mips::SRL16_MM:
+ Opnd = Inst.getOperand(2);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (Imm < 1 || Imm > 8)
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ case Mips::LI16_MM:
+ Opnd = Inst.getOperand(1);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (Imm < -1 || Imm > 126)
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ case Mips::ADDIUR2_MM:
+ Opnd = Inst.getOperand(2);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (!(Imm == 1 || Imm == -1 ||
+ ((Imm % 4 == 0) && Imm < 28 && Imm > 0)))
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ case Mips::ANDI16_MM:
+ Opnd = Inst.getOperand(2);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (!(Imm == 128 || (Imm >= 1 && Imm <= 4) || Imm == 7 || Imm == 8 ||
+ Imm == 15 || Imm == 16 || Imm == 31 || Imm == 32 || Imm == 63 ||
+ Imm == 64 || Imm == 255 || Imm == 32768 || Imm == 65535))
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ case Mips::LBU16_MM:
+ Opnd = Inst.getOperand(2);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (Imm < -1 || Imm > 14)
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ case Mips::SB16_MM:
+ case Mips::SB16_MMR6:
+ Opnd = Inst.getOperand(2);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (Imm < 0 || Imm > 15)
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ case Mips::LHU16_MM:
+ case Mips::SH16_MM:
+ case Mips::SH16_MMR6:
+ Opnd = Inst.getOperand(2);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (Imm < 0 || Imm > 30 || (Imm % 2 != 0))
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ case Mips::LW16_MM:
+ case Mips::SW16_MM:
+ case Mips::SW16_MMR6:
+ Opnd = Inst.getOperand(2);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ Imm = Opnd.getImm();
+ if (Imm < 0 || Imm > 60 || (Imm % 4 != 0))
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ case Mips::ADDIUPC_MM:
+ MCOperand Opnd = Inst.getOperand(1);
+ if (!Opnd.isImm())
+ return Error(IDLoc, "expected immediate operand kind");
+ int Imm = Opnd.getImm();
+ if ((Imm % 4 != 0) || !isInt<25>(Imm))
+ return Error(IDLoc, "immediate operand value out of range");
+ break;
+ }
+ }
+
+ bool FillDelaySlot =
+ MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder();
+ if (FillDelaySlot)
+ TOut.emitDirectiveSetNoReorder();
+
+ MacroExpanderResultTy ExpandResult =
+ tryExpandInstruction(Inst, IDLoc, Out, STI);
+ switch (ExpandResult) {
+ case MER_NotAMacro:
+ Out.EmitInstruction(Inst, *STI);
+ break;
+ case MER_Success:
+ break;
+ case MER_Fail:
+ return true;
+ }
+
+ // We know we emitted an instruction on the MER_NotAMacro or MER_Success path.
+ // If we're in microMIPS mode then we must also set EF_MIPS_MICROMIPS.
+ if (inMicroMipsMode())
+ TOut.setUsesMicroMips();
+
+ // If this instruction has a delay slot and .set reorder is active,
+ // emit a NOP after it.
+ if (FillDelaySlot) {
+ TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst.getOpcode()), IDLoc, STI);
+ TOut.emitDirectiveSetReorder();
+ }
+
+ if ((Inst.getOpcode() == Mips::JalOneReg ||
+ Inst.getOpcode() == Mips::JalTwoReg || ExpandedJalSym) &&
+ isPicAndNotNxxAbi()) {
+ if (IsCpRestoreSet) {
+ // We need a NOP between the JALR and the LW:
+ // If .set reorder has been used, we've already emitted a NOP.
+ // If .set noreorder has been used, we need to emit a NOP at this point.
+ if (!AssemblerOptions.back()->isReorder())
+ TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst.getOpcode()), IDLoc,
+ STI);
+
+ // Load the $gp from the stack.
+ TOut.emitGPRestore(CpRestoreOffset, IDLoc, STI);
+ } else
+ Warning(IDLoc, "no .cprestore used in PIC mode");
+ }
+
+ return false;
+}
+
+/// Dispatch a (possible) pseudo-instruction to the routine that expands it.
+///
+/// Returns MER_NotAMacro when Inst needs no expansion and should be emitted
+/// as-is by the caller, MER_Success when an expansion routine emitted the
+/// replacement sequence, and MER_Fail when the expansion routine reported an
+/// error.
+MipsAsmParser::MacroExpanderResultTy
+MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                    const MCSubtargetInfo *STI) {
+  // The expand*() routines return true on failure; convert that convention
+  // into the macro-expander result type in a single place.
+  auto AsResult = [](bool Failed) { return Failed ? MER_Fail : MER_Success; };
+
+  // Shape check shared by the immediate-alias cases: "op $rd, $rs, imm".
+  auto IsRegRegImm = [&Inst]() {
+    return Inst.getNumOperands() == 3 && Inst.getOperand(0).isReg() &&
+           Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm();
+  };
+
+  const unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  default:
+    return MER_NotAMacro;
+
+  // Immediate and address loads.
+  case Mips::LoadImm32:
+    return AsResult(expandLoadImm(Inst, true, IDLoc, Out, STI));
+  case Mips::LoadImm64:
+    return AsResult(expandLoadImm(Inst, false, IDLoc, Out, STI));
+  case Mips::LoadAddrImm32:
+  case Mips::LoadAddrImm64: {
+    const MCOperand &DstOp = Inst.getOperand(0);
+    const MCOperand &AddrOp = Inst.getOperand(1);
+    assert(DstOp.isReg() && "expected register operand kind");
+    assert((AddrOp.isImm() || AddrOp.isExpr()) &&
+           "expected immediate operand kind");
+
+    return AsResult(expandLoadAddress(DstOp.getReg(), Mips::NoRegister, AddrOp,
+                                      Opcode == Mips::LoadAddrImm32, IDLoc, Out,
+                                      STI));
+  }
+  case Mips::LoadAddrReg32:
+  case Mips::LoadAddrReg64: {
+    const MCOperand &DstOp = Inst.getOperand(0);
+    const MCOperand &BaseOp = Inst.getOperand(1);
+    const MCOperand &AddrOp = Inst.getOperand(2);
+    assert(DstOp.isReg() && "expected register operand kind");
+    assert(BaseOp.isReg() && "expected register operand kind");
+    assert((AddrOp.isImm() || AddrOp.isExpr()) &&
+           "expected immediate operand kind");
+
+    return AsResult(expandLoadAddress(DstOp.getReg(), BaseOp.getReg(), AddrOp,
+                                      Opcode == Mips::LoadAddrReg32, IDLoc, Out,
+                                      STI));
+  }
+
+  // Branches, jumps and load/store-multiple.
+  case Mips::B_MM_Pseudo:
+  case Mips::B_MMR6_Pseudo:
+    return AsResult(expandUncondBranchMMPseudo(Inst, IDLoc, Out, STI));
+  case Mips::SWM_MM:
+  case Mips::LWM_MM:
+    return AsResult(expandLoadStoreMultiple(Inst, IDLoc, Out, STI));
+  case Mips::JalOneReg:
+  case Mips::JalTwoReg:
+    return AsResult(expandJalWithRegs(Inst, IDLoc, Out, STI));
+  case Mips::BneImm:
+  case Mips::BeqImm:
+    return AsResult(expandBranchImm(Inst, IDLoc, Out, STI));
+  case Mips::BLT: case Mips::BLE: case Mips::BGE: case Mips::BGT:
+  case Mips::BLTU: case Mips::BLEU: case Mips::BGEU: case Mips::BGTU:
+  case Mips::BLTL: case Mips::BLEL: case Mips::BGEL: case Mips::BGTL:
+  case Mips::BLTUL: case Mips::BLEUL: case Mips::BGEUL: case Mips::BGTUL:
+  case Mips::BLTImmMacro: case Mips::BLEImmMacro:
+  case Mips::BGEImmMacro: case Mips::BGTImmMacro:
+  case Mips::BLTUImmMacro: case Mips::BLEUImmMacro:
+  case Mips::BGEUImmMacro: case Mips::BGTUImmMacro:
+  case Mips::BLTLImmMacro: case Mips::BLELImmMacro:
+  case Mips::BGELImmMacro: case Mips::BGTLImmMacro:
+  case Mips::BLTULImmMacro: case Mips::BLEULImmMacro:
+  case Mips::BGEULImmMacro: case Mips::BGTULImmMacro:
+    return AsResult(expandCondBranches(Inst, IDLoc, Out, STI));
+
+  // Division and truncation macros.
+  case Mips::SDivMacro:
+    return AsResult(expandDiv(Inst, IDLoc, Out, STI, false, true));
+  case Mips::DSDivMacro:
+    return AsResult(expandDiv(Inst, IDLoc, Out, STI, true, true));
+  case Mips::UDivMacro:
+    return AsResult(expandDiv(Inst, IDLoc, Out, STI, false, false));
+  case Mips::DUDivMacro:
+    return AsResult(expandDiv(Inst, IDLoc, Out, STI, true, false));
+  case Mips::PseudoTRUNC_W_S:
+    return AsResult(expandTrunc(Inst, false, false, IDLoc, Out, STI));
+  case Mips::PseudoTRUNC_W_D32:
+    return AsResult(expandTrunc(Inst, true, false, IDLoc, Out, STI));
+  case Mips::PseudoTRUNC_W_D:
+    return AsResult(expandTrunc(Inst, true, true, IDLoc, Out, STI));
+
+  // Unaligned loads and stores.
+  case Mips::Ulh:
+    return AsResult(expandUlh(Inst, true, IDLoc, Out, STI));
+  case Mips::Ulhu:
+    return AsResult(expandUlh(Inst, false, IDLoc, Out, STI));
+  case Mips::Ush:
+    return AsResult(expandUsh(Inst, IDLoc, Out, STI));
+  case Mips::Ulw:
+  case Mips::Usw:
+    return AsResult(expandUxw(Inst, IDLoc, Out, STI));
+
+  // Arithmetic/logical instructions whose immediate may be too wide.
+  case Mips::NORImm:
+    return AsResult(expandAliasImmediate(Inst, IDLoc, Out, STI));
+  case Mips::ADDi:
+  case Mips::ADDiu:
+  case Mips::SLTi:
+  case Mips::SLTiu:
+    // Only the reg, reg, imm form with an immediate outside the signed
+    // 16-bit range is expanded.
+    if (!IsRegRegImm() || isInt<16>(Inst.getOperand(2).getImm()))
+      return MER_NotAMacro;
+    return AsResult(expandAliasImmediate(Inst, IDLoc, Out, STI));
+  case Mips::ANDi:
+  case Mips::ORi:
+  case Mips::XORi:
+    // Only the reg, reg, imm form with an immediate outside the unsigned
+    // 16-bit range is expanded.
+    if (!IsRegRegImm() || isUInt<16>(Inst.getOperand(2).getImm()))
+      return MER_NotAMacro;
+    return AsResult(expandAliasImmediate(Inst, IDLoc, Out, STI));
+
+  // Rotations.
+  case Mips::ROL:
+  case Mips::ROR:
+    return AsResult(expandRotation(Inst, IDLoc, Out, STI));
+  case Mips::ROLImm:
+  case Mips::RORImm:
+    return AsResult(expandRotationImm(Inst, IDLoc, Out, STI));
+  case Mips::DROL:
+  case Mips::DROR:
+    return AsResult(expandDRotation(Inst, IDLoc, Out, STI));
+  case Mips::DROLImm:
+  case Mips::DRORImm:
+    return AsResult(expandDRotationImm(Inst, IDLoc, Out, STI));
+
+  // Miscellaneous macros.
+  case Mips::ABSMacro:
+    return AsResult(expandAbs(Inst, IDLoc, Out, STI));
+  case Mips::LDMacro:
+  case Mips::SDMacro:
+    return AsResult(
+        expandLoadStoreDMacro(Inst, IDLoc, Out, STI, Opcode == Mips::LDMacro));
+  case Mips::SEQMacro:
+    return AsResult(expandSeq(Inst, IDLoc, Out, STI));
+  case Mips::SEQIMacro:
+    return AsResult(expandSeqI(Inst, IDLoc, Out, STI));
+  }
+}
+
+/// Expand the register forms of the JAL pseudo-instruction into the matching
+/// JALR variant and emit it, followed by an empty delay slot when the chosen
+/// instruction has one and .set reorder is active.
+///
+/// Always returns false.
+bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
+                                      MCStreamer &Out,
+                                      const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  const bool IsMicroMips = inMicroMipsMode();
+
+  // The JALR that replaces the pseudo-JAL.
+  MCInst JalrInst;
+  JalrInst.setLoc(IDLoc);
+  const MCOperand RegOp0 = Inst.getOperand(0);
+
+  switch (Inst.getOpcode()) {
+  case Mips::JalOneReg:
+    // jal $rs => jalr $rs
+    if (!IsMicroMips) {
+      // The standard encoding names the link register explicitly.
+      JalrInst.setOpcode(Mips::JALR);
+      JalrInst.addOperand(MCOperand::createReg(Mips::RA));
+    } else if (IsCpRestoreSet) {
+      JalrInst.setOpcode(Mips::JALRS16_MM);
+    } else {
+      JalrInst.setOpcode(hasMips32r6() ? Mips::JALRC16_MMR6 : Mips::JALR16_MM);
+    }
+    JalrInst.addOperand(RegOp0);
+    break;
+
+  case Mips::JalTwoReg:
+    // jal $rd, $rs => jalr $rd, $rs
+    if (IsMicroMips)
+      JalrInst.setOpcode(IsCpRestoreSet ? Mips::JALRS_MM : Mips::JALR_MM);
+    else
+      JalrInst.setOpcode(Mips::JALR);
+    JalrInst.addOperand(RegOp0);
+    JalrInst.addOperand(Inst.getOperand(1));
+    break;
+
+  default:
+    break;
+  }
+
+  Out.EmitInstruction(JalrInst, *STI);
+
+  // With .set reorder active, a branch that has a delay slot gets a NOP
+  // emitted after it.
+  const unsigned JalrOpcode = JalrInst.getOpcode();
+  const bool NeedsDelaySlotNop = getInstDesc(JalrOpcode).hasDelaySlot() &&
+                                 AssemblerOptions.back()->isReorder();
+  if (NeedsDelaySlotNop)
+    TOut.emitEmptyDelaySlot(hasShortDelaySlot(JalrOpcode), IDLoc, STI);
+
+  return false;
+}
+
+/// Can the value be represented by an unsigned N-bit value and a shift left?
+///
+/// Zero is trivially representable (0 << k == 0) and is handled up front:
+/// findFirstSet() has no set bit to report for it, and using its sentinel
+/// result as a shift count would be undefined behaviour.
+template <unsigned N> static bool isShiftedUIntAtAnyPosition(uint64_t x) {
+  if (x == 0)
+    return true;
+
+  // Index of the lowest set bit; everything below it is zero.
+  unsigned BitNum = findFirstSet(x);
+
+  return (x == x >> BitNum << BitNum) && isUInt<N>(x >> BitNum);
+}
+
+/// Load (or add) an immediate into a register.
+///
+/// The shortest applicable sequence is chosen, tried in this order: a single
+/// addiu/daddiu (signed 16-bit), ori (unsigned 16-bit), lui/ori style
+/// sequences (32-bit), ori + dsll (a 16-bit value shifted left), and finally
+/// the general 64-bit sequence of dsll/ori pairs.
+///
+/// @param ImmValue     The immediate to load.
+/// @param DstReg       The register that will hold the immediate.
+/// @param SrcReg       A register to add to the immediate or Mips::NoRegister
+///                     for a simple initialization.
+/// @param Is32BitImm   Is ImmValue 32-bit or 64-bit?
+/// @param IsAddress    True if the immediate represents an address. False if it
+///                     is an integer.
+/// @param IDLoc        Location of the immediate in the source file.
+/// @param Out          Streamer; forwarded to the recursive call that loads the
+///                     upper 32 bits.
+/// @param STI          Subtarget info passed on to the emit helpers.
+///
+/// @returns true if an error was reported or a needed $at was unavailable,
+///          false if the sequence was emitted.
+bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
+                                  unsigned SrcReg, bool Is32BitImm,
+                                  bool IsAddress, SMLoc IDLoc, MCStreamer &Out,
+                                  const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  if (!Is32BitImm && !isGP64bit()) {
+    Error(IDLoc, "instruction requires a 64-bit architecture");
+    return true;
+  }
+
+  if (Is32BitImm) {
+    if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
+      // Sign extend up to 64-bit so that the predicates match the hardware
+      // behaviour. In particular, isInt<16>(0xffff8000) and similar should be
+      // true.
+      ImmValue = SignExtend64<32>(ImmValue);
+    } else {
+      Error(IDLoc, "instruction requires a 32-bit immediate");
+      return true;
+    }
+  }
+
+  unsigned ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg();
+  unsigned AdduOp = !Is32BitImm ? Mips::DADDu : Mips::ADDu;
+
+  bool UseSrcReg = false;
+  if (SrcReg != Mips::NoRegister)
+    UseSrcReg = true;
+
+  // The multi-instruction sequences build the value in TmpReg. That is DstReg
+  // unless DstReg overlaps SrcReg, in which case building in place would
+  // clobber the addend before it is added.
+  unsigned TmpReg = DstReg;
+  if (UseSrcReg &&
+      getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) {
+    // At this point we need AT to perform the expansions and we exit if it is
+    // not available.
+    unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return true;
+    TmpReg = ATReg;
+  }
+
+  if (isInt<16>(ImmValue)) {
+    if (!UseSrcReg)
+      SrcReg = ZeroReg;
+
+    // This doesn't quite follow the usual ABI expectations for N32 but matches
+    // traditional assembler behaviour. N32 would normally use addiu for both
+    // integers and addresses.
+    if (IsAddress && !Is32BitImm) {
+      TOut.emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI);
+      return false;
+    }
+
+    TOut.emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI);
+    return false;
+  }
+
+  if (isUInt<16>(ImmValue)) {
+    // This case picks its own scratch register: AT is used only when SrcReg
+    // and DstReg are exactly the same register.
+    unsigned OriDstReg = DstReg;
+    if (SrcReg == DstReg) {
+      OriDstReg = getATReg(IDLoc);
+      if (!OriDstReg)
+        return true;
+    }
+
+    TOut.emitRRI(Mips::ORi, OriDstReg, ZeroReg, ImmValue, IDLoc, STI);
+    if (UseSrcReg)
+      TOut.emitRRR(ABI.GetPtrAdduOp(), DstReg, OriDstReg, SrcReg, IDLoc, STI);
+    return false;
+  }
+
+  if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
+    warnIfNoMacro(IDLoc);
+
+    uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
+    uint16_t Bits15To0 = ImmValue & 0xffff;
+
+    if (!Is32BitImm && !isInt<32>(ImmValue)) {
+      // Traditional behaviour seems to special case this particular value. It's
+      // not clear why other masks are handled differently.
+      if (ImmValue == 0xffffffff) {
+        TOut.emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, STI);
+        TOut.emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, STI);
+        if (UseSrcReg)
+          TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+        return false;
+      }
+
+      // Expand to an ORi instead of a LUi to avoid sign-extending into the
+      // upper 32 bits.
+      TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, STI);
+      if (Bits15To0)
+        TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI);
+      if (UseSrcReg)
+        TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+      return false;
+    }
+
+    TOut.emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, STI);
+    if (Bits15To0)
+      TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI);
+    if (UseSrcReg)
+      TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+    return false;
+  }
+
+  if (isShiftedUIntAtAnyPosition<16>(ImmValue)) {
+    if (Is32BitImm) {
+      Error(IDLoc, "instruction requires a 32-bit immediate");
+      return true;
+    }
+
+    // Traditionally, these immediates are shifted as little as possible and as
+    // such we align the most significant bit to bit 15 of our temporary.
+    unsigned FirstSet = findFirstSet((uint64_t)ImmValue);
+    unsigned LastSet = findLastSet((uint64_t)ImmValue);
+    unsigned ShiftAmount = FirstSet - (15 - (LastSet - FirstSet));
+    uint16_t Bits = (ImmValue >> ShiftAmount) & 0xffff;
+    TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, STI);
+    TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, STI);
+
+    if (UseSrcReg)
+      TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+
+    return false;
+  }
+
+  warnIfNoMacro(IDLoc);
+
+  // The remaining case is packed with a sequence of dsll and ori with zeros
+  // being omitted and any neighbouring dsll's being coalesced.
+  // The highest 32-bit's are equivalent to a 32-bit immediate load.
+
+  // Load bits 32-63 of ImmValue into bits 0-31 of the temporary register.
+  // A failure of the nested load is a failure of this load, so propagate it
+  // rather than reporting success.
+  if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false,
+                    IDLoc, Out, STI))
+    return true;
+
+  // Shift and accumulate into the register. If a 16-bit chunk is zero, then
+  // skip it and defer the shift to the next chunk.
+  unsigned ShiftCarriedForwards = 16;
+  for (int BitNum = 16; BitNum >= 0; BitNum -= 16) {
+    uint16_t ImmChunk = (ImmValue >> BitNum) & 0xffff;
+
+    if (ImmChunk != 0) {
+      TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI);
+      TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, STI);
+      ShiftCarriedForwards = 0;
+    }
+
+    ShiftCarriedForwards += 16;
+  }
+  // Undo the increment made after the final chunk; what remains is the shift
+  // still owed because of trailing zero chunks.
+  ShiftCarriedForwards -= 16;
+
+  // Finish any remaining shifts left by trailing zeros.
+  if (ShiftCarriedForwards)
+    TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI);
+
+  if (UseSrcReg)
+    TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+
+  return false;
+}
+
+/// Expand a load-immediate pseudo: operand 0 is the destination register and
+/// operand 1 the immediate. Returns whatever loadImmediate() returns, i.e.
+/// true on failure.
+bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
+                                  MCStreamer &Out, const MCSubtargetInfo *STI) {
+  const MCOperand &Imm = Inst.getOperand(1);
+  assert(Imm.isImm() && "expected immediate operand kind");
+  const MCOperand &Dst = Inst.getOperand(0);
+  assert(Dst.isReg() && "expected register operand kind");
+
+  // No source register: this is a plain initialisation of an integer value.
+  return loadImmediate(Imm.getImm(), Dst.getReg(), Mips::NoRegister,
+                       Is32BitImm, /*IsAddress=*/false, IDLoc, Out, STI);
+}
+
+/// Expand an la/dla pseudo-instruction.
+///
+/// @param DstReg         The register that receives the address.
+/// @param BaseReg        A register to add to the address, or Mips::NoRegister.
+/// @param Offset         The address operand; either an immediate or an
+///                       expression.
+/// @param Is32BitAddress True for 'la', false for 'dla'.
+/// @param IDLoc          Location used for diagnostics.
+///
+/// @returns true if an error was reported, otherwise the result of the
+///          routine the expansion was delegated to.
+bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg,
+                                      const MCOperand &Offset,
+                                      bool Is32BitAddress, SMLoc IDLoc,
+                                      MCStreamer &Out,
+                                      const MCSubtargetInfo *STI) {
+  // la can't produce a usable address when addresses are 64-bit.
+  if (Is32BitAddress && ABI.ArePtrs64bit()) {
+    // FIXME: Demote this to a warning and continue as if we had 'dla' instead.
+    //        We currently can't do this because we depend on the equality
+    //        operator and N64 can end up with a GPR32/GPR64 mismatch.
+    Error(IDLoc, "la used to load 64-bit address");
+    return true;
+  }
+
+  // dla requires 64-bit addresses.
+  if (!Is32BitAddress && !hasMips3()) {
+    Error(IDLoc, "instruction requires a 64-bit architecture");
+    return true;
+  }
+
+  // Symbolic addresses are handled by the relocation-aware expansion.
+  if (!Offset.isImm())
+    return loadAndAddSymbolAddress(Offset.getExpr(), DstReg, BaseReg,
+                                   Is32BitAddress, IDLoc, Out, STI);
+
+  if (!ABI.ArePtrs64bit()) {
+    // Continue as if we had 'la' whether we had 'la' or 'dla'.
+    Is32BitAddress = true;
+  }
+
+  return loadImmediate(Offset.getImm(), DstReg, BaseReg, Is32BitAddress, true,
+                       IDLoc, Out, STI);
+}
+
+/// Emit the sequence that computes the address of a symbolic expression into
+/// a register, optionally adding a source register to it.
+///
+/// @param SymExpr    The expression whose address is computed.
+/// @param DstReg     The register that receives the result.
+/// @param SrcReg     A register to add to the address, or Mips::NoRegister.
+/// @param Is32BitSym Not consulted by this function's body.
+/// @param IDLoc      Location used for diagnostics and emitted instructions.
+///
+/// @returns true if an error was reported or a needed $at was unavailable,
+///          false once the sequence has been emitted.
+bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
+                                            unsigned DstReg, unsigned SrcReg,
+                                            bool Is32BitSym, SMLoc IDLoc,
+                                            MCStreamer &Out,
+                                            const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  const bool UseSrcReg = SrcReg != Mips::NoRegister;
+  warnIfNoMacro(IDLoc);
+
+  // True when $rs was given and overlaps $rd, so the address cannot be built
+  // in $rd without destroying the addend.
+  auto DstOverlapsSrc = [&]() {
+    return UseSrcReg &&
+           getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
+                                                                  SrcReg);
+  };
+
+  if (inPicMode() && ABI.IsO32()) {
+    MCValue Reloc;
+    if (!SymExpr->evaluateAsRelocatable(Reloc, nullptr, nullptr)) {
+      Error(IDLoc, "expected relocatable expression");
+      return true;
+    }
+    if (Reloc.getSymB() != nullptr) {
+      Error(IDLoc, "expected relocatable expression with only one symbol");
+      return true;
+    }
+
+    const MCSymbol &Sym = Reloc.getSymA()->getSymbol();
+    const bool IsLocalSym = Sym.isInSection() || Sym.isTemporary();
+    const bool DstIsT9 = DstReg == Mips::T9 || DstReg == Mips::T9_64;
+
+    // Loading into $25 is somewhat special: when the symbol in the final
+    // relocation is external and carries no constant offset, R_MIPS_CALL16
+    // must be used instead of R_MIPS_GOT16.
+    if (DstIsT9 && !UseSrcReg && Reloc.getConstant() == 0 && !IsLocalSym) {
+      const MCExpr *Call16Expr =
+          MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
+      TOut.emitRRX(Mips::LW, DstReg, ABI.GetGlobalPtr(),
+                   MCOperand::createExpr(Call16Expr), IDLoc, STI);
+      return false;
+    }
+
+    // Everything else goes through the GOT:
+    //   External symbol:  lw     $tmp, %got(symbol+offset)($gp)
+    //                    >addiu  $tmp, $tmp, offset
+    //                    >addu   $rd, $tmp, $rs
+    //   Local symbol:     lw     $tmp, %got(symbol+offset)($gp)
+    //                     addiu  $tmp, $tmp, %lo(symbol+offset)
+    //                    >addu   $rd, $tmp, $rs
+    // Instructions marked '>' are emitted only when needed. $tmp is $rd
+    // unless $rs overlaps $rd, in which case it is $at.
+    const MipsMCExpr *Got16Expr =
+        MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext());
+
+    const MCExpr *LowPartExpr = nullptr;
+    if (IsLocalSym) {
+      LowPartExpr =
+          MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+    } else if (Reloc.getConstant() != 0) {
+      // %got(symbol) fully resolves an external symbol, but any offset in
+      // expressions such as symbol+8 still has to be added.
+      LowPartExpr = MCConstantExpr::create(Reloc.getConstant(), getContext());
+    }
+
+    unsigned ScratchReg = DstReg;
+    if (DstOverlapsSrc()) {
+      // $rs overlaps $rd, so $at is required; give up if it is unavailable.
+      ScratchReg = getATReg(IDLoc);
+      if (!ScratchReg)
+        return true;
+    }
+
+    TOut.emitRRX(Mips::LW, ScratchReg, ABI.GetGlobalPtr(),
+                 MCOperand::createExpr(Got16Expr), IDLoc, STI);
+    if (LowPartExpr)
+      TOut.emitRRX(Mips::ADDiu, ScratchReg, ScratchReg,
+                   MCOperand::createExpr(LowPartExpr), IDLoc, STI);
+    if (UseSrcReg)
+      TOut.emitRRR(Mips::ADDu, DstReg, ScratchReg, SrcReg, IDLoc, STI);
+
+    return false;
+  }
+
+  const MipsMCExpr *HiExpr =
+      MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext());
+  const MipsMCExpr *LoExpr =
+      MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+
+  // 64-bit symbol address expansion.
+  if (ABI.ArePtrs64bit() && isGP64bit()) {
+    // This expansion always needs $at; give up if it is unavailable.
+    const unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return true;
+
+    const MipsMCExpr *HighestExpr =
+        MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext());
+    const MipsMCExpr *HigherExpr =
+        MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext());
+
+    if (DstOverlapsSrc()) {
+      // $rs overlaps $rd, so the whole address is built in $at:
+      // (d)la $rd, sym($rd) => lui    $at, %highest(sym)
+      //                        daddiu $at, $at, %higher(sym)
+      //                        dsll   $at, $at, 16
+      //                        daddiu $at, $at, %hi(sym)
+      //                        dsll   $at, $at, 16
+      //                        daddiu $at, $at, %lo(sym)
+      //                        daddu  $rd, $at, $rd
+      TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
+                  STI);
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
+                   MCOperand::createExpr(HigherExpr), IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr),
+                   IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
+                   IDLoc, STI);
+      TOut.emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, STI);
+
+      return false;
+    }
+
+    // $rs is absent or distinct from $rd; the upper half is built in $rd and
+    // the lower half in $at:
+    // (d)la $rd, sym/sym($rs) => lui    $rd, %highest(sym)
+    //                            lui    $at, %hi(sym)
+    //                            daddiu $rd, $rd, %higher(sym)
+    //                            daddiu $at, $at, %lo(sym)
+    //                            dsll32 $rd, $rd, 0
+    //                            daddu  $rd, $rd, $at
+    //                            (daddu $rd, $rd, $rs)
+    TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+                STI);
+    TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+    TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+                 MCOperand::createExpr(HigherExpr), IDLoc, STI);
+    TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
+                 IDLoc, STI);
+    TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
+    TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
+    if (UseSrcReg)
+      TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
+
+    return false;
+  }
+
+  // 32-bit symbol address expansion.
+  // When $rs overlaps $rd:
+  // (d)la $rd, sym($rd)     => lui   $at, %hi(sym)
+  //                            addiu $at, $at, %lo(sym)
+  //                            addu  $rd, $at, $rd
+  // When $rs is absent or distinct from $rd:
+  // (d)la $rd, sym/sym($rs) => lui   $rd, %hi(sym)
+  //                            addiu $rd, $rd, %lo(sym)
+  //                            (addu $rd, $rd, $rs)
+  unsigned ScratchReg = DstReg;
+  if (DstOverlapsSrc()) {
+    // $rs overlaps $rd, so $at is required; give up if it is unavailable.
+    ScratchReg = getATReg(IDLoc);
+    if (!ScratchReg)
+      return true;
+  }
+
+  TOut.emitRX(Mips::LUi, ScratchReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+  TOut.emitRRX(Mips::ADDiu, ScratchReg, ScratchReg,
+               MCOperand::createExpr(LoExpr), IDLoc, STI);
+
+  if (!UseSrcReg) {
+    // Without an addend the result must already be sitting in $rd.
+    assert(getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
+                                                                  ScratchReg));
+    return false;
+  }
+
+  TOut.emitRRR(Mips::ADDu, DstReg, ScratchReg, SrcReg, IDLoc, STI);
+  return false;
+}
+
+/// Expands the microMIPS unconditional-branch pseudo-instruction.
+///
+/// A symbolic target, or an immediate that does not fit in 11 bits, is
+/// rewritten as "beq $zero, $zero, target" (BEQ_MM). An immediate that fits in
+/// 11 bits selects the 16-bit branch opcode when assembling in microMIPS mode.
+/// A NOP is appended when the emitted instruction has a delay slot and
+/// ".set reorder" is active.
+///
+/// \returns true if an error was reported (target out of range or misaligned),
+/// false otherwise.
+bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
+                                               MCStreamer &Out,
+                                               const MCSubtargetInfo *STI) {
+  assert(getInstDesc(Inst.getOpcode()).getNumOperands() == 1 &&
+         "unexpected number of operands");
+
+  // Rebuilds Inst in place as "beq $zero, $zero, <Target>".
+  auto RewriteAsBeqMM = [&Inst](const MCOperand &Target) {
+    Inst.clear();
+    Inst.setOpcode(Mips::BEQ_MM);
+    Inst.addOperand(MCOperand::createReg(Mips::ZERO));
+    Inst.addOperand(MCOperand::createReg(Mips::ZERO));
+    Inst.addOperand(Target);
+  };
+
+  // Take a copy of the operand: Inst may be cleared before it is re-added.
+  const MCOperand BranchTarget = Inst.getOperand(0);
+  if (BranchTarget.isExpr()) {
+    RewriteAsBeqMM(MCOperand::createExpr(BranchTarget.getExpr()));
+  } else {
+    assert(BranchTarget.isImm() && "expected immediate operand kind");
+    const int64_t Disp = BranchTarget.getImm();
+    if (!isInt<11>(Disp)) {
+      // Too far for the 16-bit form; validate and fall back to BEQ_MM.
+      if (!isInt<17>(Disp))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Disp, 1LL << 1))
+        return Error(IDLoc, "branch to misaligned address");
+      RewriteAsBeqMM(MCOperand::createImm(Disp));
+    } else if (inMicroMipsMode()) {
+      // The offset fits into 11 bits, so this becomes the microMIPS 16-bit
+      // unconditional branch instruction.
+      Inst.setOpcode(hasMips32r6() ? Mips::BC16_MMR6 : Mips::B16_MM);
+    }
+  }
+  Out.EmitInstruction(Inst, *STI);
+
+  // Fill the delay slot with a NOP when .set reorder is active and the
+  // instruction that was actually emitted has one.
+  const bool HasDelaySlot = getInstDesc(Inst.getOpcode()).hasDelaySlot();
+  if (HasDelaySlot && AssemblerOptions.back()->isReorder())
+    getTargetStreamer().emitEmptyDelaySlot(true, IDLoc, STI);
+
+  return false;
+}
+
+/// Expands the BneImm/BeqImm pseudo-instructions (branch comparing a register
+/// against an immediate).
+///
+/// A zero immediate is compared directly against $zero. Any other immediate
+/// is first materialized in $at via loadImmediate() and the branch compares
+/// against $at.
+///
+/// \returns true if $at is unavailable or loading the immediate failed,
+/// false otherwise.
+bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                    const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  const MCOperand &LhsRegOp = Inst.getOperand(0);
+  assert(LhsRegOp.isReg() && "expected register operand kind");
+  const MCOperand &RhsImmOp = Inst.getOperand(1);
+  assert(RhsImmOp.isImm() && "expected immediate operand kind");
+  const MCOperand &TargetOp = Inst.getOperand(2);
+  assert((TargetOp.isImm() || TargetOp.isExpr()) &&
+         "expected immediate or expression operand");
+
+  // Map the pseudo onto the real two-register branch.
+  unsigned RealOpcode = 0;
+  switch (Inst.getOpcode()) {
+  case Mips::BeqImm:
+    RealOpcode = Mips::BEQ;
+    break;
+  case Mips::BneImm:
+    RealOpcode = Mips::BNE;
+    break;
+  default:
+    llvm_unreachable("Unknown immediate branch pseudo-instruction.");
+    break;
+  }
+
+  const int64_t Imm = RhsImmOp.getImm();
+  if (Imm == 0) {
+    // No scratch register needed: compare against $zero.
+    TOut.emitRRX(RealOpcode, LhsRegOp.getReg(), Mips::ZERO, TargetOp, IDLoc,
+                 STI);
+    return false;
+  }
+
+  warnIfNoMacro(IDLoc);
+
+  const unsigned ScratchReg = getATReg(IDLoc);
+  if (!ScratchReg)
+    return true;
+
+  if (loadImmediate(Imm, ScratchReg, Mips::NoRegister, !isGP64bit(), true,
+                    IDLoc, Out, STI))
+    return true;
+
+  TOut.emitRRX(RealOpcode, LhsRegOp.getReg(), ScratchReg, TargetOp, IDLoc,
+               STI);
+  return false;
+}
+
+/// Dispatches a memory pseudo-instruction to the load or the store expander,
+/// depending on \p IsLoad. \p IsImmOpnd is forwarded unchanged.
+void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                  const MCSubtargetInfo *STI, bool IsLoad,
+                                  bool IsImmOpnd) {
+  if (!IsLoad)
+    expandStoreInst(Inst, IDLoc, Out, STI, IsImmOpnd);
+  else
+    expandLoadInst(Inst, IDLoc, Out, STI, IsImmOpnd);
+}
+
+/// Expands a load whose offset operand is either an immediate
+/// (\p IsImmOpnd == true) or a symbolic expression, delegating the actual
+/// emission to the target streamer.
+///
+/// The destination register doubles as the temporary when operand 0 is in a
+/// GPR32/GPR64 register class and differs from the base register; otherwise
+/// $at is used. If $at is needed but getATReg() returns 0, nothing is emitted.
+void MipsAsmParser::expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                   const MCSubtargetInfo *STI, bool IsImmOpnd) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  const unsigned DstReg = Inst.getOperand(0).getReg();
+  const unsigned BaseReg = Inst.getOperand(1).getReg();
+
+  // Classify the destination using the register class recorded for operand 0
+  // in the instruction description.
+  const int16_t DstClass = getInstDesc(Inst.getOpcode()).OpInfo[0].RegClass;
+  const unsigned DstClassID =
+      getContext().getRegisterInfo()->getRegClass(DstClass).getID();
+  const bool DstIsGPR = DstClassID == Mips::GPR32RegClassID ||
+                        DstClassID == Mips::GPR64RegClassID;
+
+  // Chooses the temporary register. Yields false when $at is required but
+  // getATReg() returned 0.
+  auto SelectScratch = [&](unsigned &Scratch) {
+    if (DstIsGPR && BaseReg != DstReg) {
+      // Try to use DstReg as the temporary.
+      Scratch = DstReg;
+      return true;
+    }
+    Scratch = getATReg(IDLoc);
+    return Scratch != 0;
+  };
+
+  unsigned ScratchReg = 0;
+  if (IsImmOpnd) {
+    if (SelectScratch(ScratchReg))
+      TOut.emitLoadWithImmOffset(Inst.getOpcode(), DstReg, BaseReg,
+                                 Inst.getOperand(2).getImm(), ScratchReg,
+                                 IDLoc, STI);
+    return;
+  }
+
+  // Symbolic offset: split it into %lo and %hi relocation operands.
+  const MCExpr *SymOffset = Inst.getOperand(2).getExpr();
+  MCOperand LoPart = MCOperand::createExpr(
+      MipsMCExpr::create(MipsMCExpr::MEK_LO, SymOffset, getContext()));
+  MCOperand HiPart = MCOperand::createExpr(
+      MipsMCExpr::create(MipsMCExpr::MEK_HI, SymOffset, getContext()));
+
+  if (SelectScratch(ScratchReg))
+    TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiPart,
+                               LoPart, ScratchReg, IDLoc, STI);
+}
+
+/// Expands a store whose offset operand is either an immediate
+/// (\p IsImmOpnd == true) or a symbolic expression, delegating the actual
+/// emission to the target streamer.
+///
+/// For the immediate form the streamer is handed a callback that fetches $at.
+/// The symbolic form always needs $at; if getATReg() returns 0 nothing is
+/// emitted.
+void MipsAsmParser::expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                    const MCSubtargetInfo *STI,
+                                    bool IsImmOpnd) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  const unsigned ValueReg = Inst.getOperand(0).getReg();
+  const unsigned AddrBaseReg = Inst.getOperand(1).getReg();
+
+  if (!IsImmOpnd) {
+    const unsigned ScratchReg = getATReg(IDLoc);
+    if (!ScratchReg)
+      return;
+
+    // Split the symbolic offset into %lo and %hi relocation operands.
+    const MCExpr *SymOffset = Inst.getOperand(2).getExpr();
+    MCOperand LoPart = MCOperand::createExpr(
+        MipsMCExpr::create(MipsMCExpr::MEK_LO, SymOffset, getContext()));
+    MCOperand HiPart = MCOperand::createExpr(
+        MipsMCExpr::create(MipsMCExpr::MEK_HI, SymOffset, getContext()));
+    TOut.emitStoreWithSymOffset(Inst.getOpcode(), ValueReg, AddrBaseReg,
+                                HiPart, LoPart, ScratchReg, IDLoc, STI);
+    return;
+  }
+
+  TOut.emitStoreWithImmOffset(Inst.getOpcode(), ValueReg, AddrBaseReg,
+                              Inst.getOperand(2).getImm(),
+                              [&]() { return getATReg(IDLoc); }, IDLoc, STI);
+}
+
+/// Selects the concrete opcode for the microMIPS load/store-multiple pseudo
+/// and emits the instruction.
+///
+/// The 32-bit form (SWM32_MM / LWM32_MM) is the default. The 16-bit form is
+/// chosen when there are fewer than 8 operands, the offset lies in [0, 60],
+/// the base register is SP/SP_64 and the register operand before it is
+/// RA/RA_64.
+///
+/// \returns false always.
+bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
+                                            MCStreamer &Out,
+                                            const MCSubtargetInfo *STI) {
+  const unsigned NumOps = Inst.getNumOperands();
+  const bool IsStore = Inst.getOpcode() == Mips::SWM_MM;
+
+  assert(Inst.getOperand(NumOps - 1).isImm() &&
+         Inst.getOperand(NumOps - 2).isReg() &&
+         Inst.getOperand(NumOps - 3).isReg() && "Invalid instruction operand.");
+
+  const int64_t Offset = Inst.getOperand(NumOps - 1).getImm();
+  const unsigned BaseReg = Inst.getOperand(NumOps - 2).getReg();
+  const unsigned LastListReg = Inst.getOperand(NumOps - 3).getReg();
+
+  const bool BaseIsSP = BaseReg == Mips::SP || BaseReg == Mips::SP_64;
+  const bool LastIsRA = LastListReg == Mips::RA || LastListReg == Mips::RA_64;
+  const bool FitsShortForm =
+      NumOps < 8 && Offset <= 60 && Offset >= 0 && BaseIsSP && LastIsRA;
+
+  unsigned SelectedOpcode;
+  if (!FitsShortForm)
+    SelectedOpcode = IsStore ? Mips::SWM32_MM : Mips::LWM32_MM;
+  else if (inMicroMipsMode() && hasMips32r6())
+    // It can be implemented as SWM16 or LWM16 instruction.
+    SelectedOpcode = IsStore ? Mips::SWM16_MMR6 : Mips::LWM16_MMR6;
+  else
+    SelectedOpcode = IsStore ? Mips::SWM16_MM : Mips::LWM16_MM;
+
+  Inst.setOpcode(SelectedOpcode);
+  Out.EmitInstruction(Inst, *STI);
+  return false;
+}
+
+/// Expands the conditional-branch pseudo-instructions BLT/BLE/BGE/BGT, their
+/// unsigned (U) and likely (L) variants, and the corresponding *ImmMacro forms
+/// into real instructions.
+///
+/// Operand 0 is the source register, operand 1 is either a register or an
+/// immediate, and operand 2 is the branch-target expression. An immediate
+/// operand 1 is loaded into $at first and the pseudo is then handled as its
+/// register form. Comparisons in which one or both registers are Mips::ZERO
+/// are lowered to compare-with-zero branches, to BEQ/BNE, or to nothing when
+/// the branch can never be taken. Everything else becomes an SLT/SLTu into $at
+/// followed by BEQ/BNE (BEQL/BNEL for the likely variants).
+///
+/// \returns true if $at is unavailable or loading the immediate failed,
+/// false otherwise.
+bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ bool EmittedNoMacroWarning = false;
+ unsigned PseudoOpcode = Inst.getOpcode();
+ unsigned SrcReg = Inst.getOperand(0).getReg();
+ const MCOperand &TrgOp = Inst.getOperand(1);
+ const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr();
+
+ // These are all assigned by the switch on PseudoOpcode further below.
+ unsigned ZeroSrcOpcode, ZeroTrgOpcode;
+ bool ReverseOrderSLT, IsUnsigned, IsLikely, AcceptsEquality;
+
+ // NOTE(review): TrgReg is only assigned when operand 1 is a register or an
+ // immediate; presumably the operand matcher guarantees one of the two.
+ unsigned TrgReg;
+ if (TrgOp.isReg())
+ TrgReg = TrgOp.getReg();
+ else if (TrgOp.isImm()) {
+ warnIfNoMacro(IDLoc);
+ EmittedNoMacroWarning = true;
+
+ // The immediate is materialized in $at, which then acts as the target
+ // register.
+ TrgReg = getATReg(IDLoc);
+ if (!TrgReg)
+ return true;
+
+ // Map each immediate pseudo onto its register-register counterpart.
+ switch(PseudoOpcode) {
+ default:
+ llvm_unreachable("unknown opcode for branch pseudo-instruction");
+ case Mips::BLTImmMacro:
+ PseudoOpcode = Mips::BLT;
+ break;
+ case Mips::BLEImmMacro:
+ PseudoOpcode = Mips::BLE;
+ break;
+ case Mips::BGEImmMacro:
+ PseudoOpcode = Mips::BGE;
+ break;
+ case Mips::BGTImmMacro:
+ PseudoOpcode = Mips::BGT;
+ break;
+ case Mips::BLTUImmMacro:
+ PseudoOpcode = Mips::BLTU;
+ break;
+ case Mips::BLEUImmMacro:
+ PseudoOpcode = Mips::BLEU;
+ break;
+ case Mips::BGEUImmMacro:
+ PseudoOpcode = Mips::BGEU;
+ break;
+ case Mips::BGTUImmMacro:
+ PseudoOpcode = Mips::BGTU;
+ break;
+ case Mips::BLTLImmMacro:
+ PseudoOpcode = Mips::BLTL;
+ break;
+ case Mips::BLELImmMacro:
+ PseudoOpcode = Mips::BLEL;
+ break;
+ case Mips::BGELImmMacro:
+ PseudoOpcode = Mips::BGEL;
+ break;
+ case Mips::BGTLImmMacro:
+ PseudoOpcode = Mips::BGTL;
+ break;
+ case Mips::BLTULImmMacro:
+ PseudoOpcode = Mips::BLTUL;
+ break;
+ case Mips::BLEULImmMacro:
+ PseudoOpcode = Mips::BLEUL;
+ break;
+ case Mips::BGEULImmMacro:
+ PseudoOpcode = Mips::BGEUL;
+ break;
+ case Mips::BGTULImmMacro:
+ PseudoOpcode = Mips::BGTUL;
+ break;
+ }
+
+ if (loadImmediate(TrgOp.getImm(), TrgReg, Mips::NoRegister, !isGP64bit(),
+ false, IDLoc, Out, STI))
+ return true;
+ }
+
+ // Derive the properties of the comparison from the (register-form) opcode:
+ // - AcceptsEquality: the branch is taken when both operands are equal.
+ // - ReverseOrderSLT: the SLT operands are swapped ($rt, $rs).
+ // - ZeroSrcOpcode/ZeroTrgOpcode: compare-with-zero branch to use when the
+ // source/target register is $zero (signed variants only).
+ switch (PseudoOpcode) {
+ case Mips::BLT:
+ case Mips::BLTU:
+ case Mips::BLTL:
+ case Mips::BLTUL:
+ AcceptsEquality = false;
+ ReverseOrderSLT = false;
+ IsUnsigned = ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL));
+ IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL));
+ ZeroSrcOpcode = Mips::BGTZ;
+ ZeroTrgOpcode = Mips::BLTZ;
+ break;
+ case Mips::BLE:
+ case Mips::BLEU:
+ case Mips::BLEL:
+ case Mips::BLEUL:
+ AcceptsEquality = true;
+ ReverseOrderSLT = true;
+ IsUnsigned = ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL));
+ IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL));
+ ZeroSrcOpcode = Mips::BGEZ;
+ ZeroTrgOpcode = Mips::BLEZ;
+ break;
+ case Mips::BGE:
+ case Mips::BGEU:
+ case Mips::BGEL:
+ case Mips::BGEUL:
+ AcceptsEquality = true;
+ ReverseOrderSLT = false;
+ IsUnsigned = ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL));
+ IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL));
+ ZeroSrcOpcode = Mips::BLEZ;
+ ZeroTrgOpcode = Mips::BGEZ;
+ break;
+ case Mips::BGT:
+ case Mips::BGTU:
+ case Mips::BGTL:
+ case Mips::BGTUL:
+ AcceptsEquality = false;
+ ReverseOrderSLT = true;
+ IsUnsigned = ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL));
+ IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL));
+ ZeroSrcOpcode = Mips::BLTZ;
+ ZeroTrgOpcode = Mips::BGTZ;
+ break;
+ default:
+ llvm_unreachable("unknown opcode for branch pseudo-instruction");
+ }
+
+ // Only Mips::ZERO is tested here (not Mips::ZERO_64).
+ bool IsTrgRegZero = (TrgReg == Mips::ZERO);
+ bool IsSrcRegZero = (SrcReg == Mips::ZERO);
+ if (IsSrcRegZero && IsTrgRegZero) {
+ // FIXME: All of these Opcode-specific if's are needed for compatibility
+ // with GAS' behaviour. However, they may not generate the most efficient
+ // code in some circumstances.
+ if (PseudoOpcode == Mips::BLT) {
+ TOut.emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+ IDLoc, STI);
+ return false;
+ }
+ if (PseudoOpcode == Mips::BLE) {
+ TOut.emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+ IDLoc, STI);
+ Warning(IDLoc, "branch is always taken");
+ return false;
+ }
+ if (PseudoOpcode == Mips::BGE) {
+ TOut.emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+ IDLoc, STI);
+ Warning(IDLoc, "branch is always taken");
+ return false;
+ }
+ if (PseudoOpcode == Mips::BGT) {
+ TOut.emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+ IDLoc, STI);
+ return false;
+ }
+ if (PseudoOpcode == Mips::BGTU) {
+ TOut.emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+ return false;
+ }
+ if (AcceptsEquality) {
+ // If both registers are $0 and the pseudo-branch accepts equality, it
+ // will always be taken, so we emit an unconditional branch.
+ TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+ Warning(IDLoc, "branch is always taken");
+ return false;
+ }
+ // If both registers are $0 and the pseudo-branch does not accept
+ // equality, it will never be taken, so we don't have to emit anything.
+ return false;
+ }
+ if (IsSrcRegZero || IsTrgRegZero) {
+ // Exactly one of the two registers is $0 at this point. Note that the
+ // first two checks below match only the non-likely unsigned opcodes.
+ if ((IsSrcRegZero && PseudoOpcode == Mips::BGTU) ||
+ (IsTrgRegZero && PseudoOpcode == Mips::BLTU)) {
+ // If the $rs is $0 and the pseudo-branch is BGTU (0 > x) or
+ // if the $rt is $0 and the pseudo-branch is BLTU (x < 0),
+ // the pseudo-branch will never be taken, so we don't emit anything.
+ // This only applies to unsigned pseudo-branches.
+ return false;
+ }
+ if ((IsSrcRegZero && PseudoOpcode == Mips::BLEU) ||
+ (IsTrgRegZero && PseudoOpcode == Mips::BGEU)) {
+ // If the $rs is $0 and the pseudo-branch is BLEU (0 <= x) or
+ // if the $rt is $0 and the pseudo-branch is BGEU (x >= 0),
+ // the pseudo-branch will always be taken, so we emit an unconditional
+ // branch.
+ // This only applies to unsigned pseudo-branches.
+ TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+ Warning(IDLoc, "branch is always taken");
+ return false;
+ }
+ if (IsUnsigned) {
+ // If the $rs is $0 and the pseudo-branch is BLTU (0 < x) or
+ // if the $rt is $0 and the pseudo-branch is BGTU (x > 0),
+ // the pseudo-branch will be taken only when the non-zero register is
+ // different from 0, so we emit a BNEZ.
+ //
+ // If the $rs is $0 and the pseudo-branch is BGEU (0 >= x) or
+ // if the $rt is $0 and the pseudo-branch is BLEU (x <= 0),
+ // the pseudo-branch will be taken only when the non-zero register is
+ // equal to 0, so we emit a BEQZ.
+ //
+ // Because only BLEU and BGEU branch on equality, we can use the
+ // AcceptsEquality variable to decide when to emit the BEQZ.
+ TOut.emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE,
+ IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+ return false;
+ }
+ // If we have a signed pseudo-branch and one of the registers is $0,
+ // we can use an appropriate compare-to-zero branch. We select which one
+ // to use in the switch statement above.
+ TOut.emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode,
+ IsSrcRegZero ? TrgReg : SrcReg,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+ return false;
+ }
+
+ // If neither the SrcReg nor the TrgReg are $0, we need AT to perform the
+ // expansions. If it is not available, we return.
+ unsigned ATRegNum = getATReg(IDLoc);
+ if (!ATRegNum)
+ return true;
+
+ // Avoid warning twice when the immediate path above already warned.
+ if (!EmittedNoMacroWarning)
+ warnIfNoMacro(IDLoc);
+
+ // SLT fits well with 2 of our 4 pseudo-branches:
+ // BLT, where $rs < $rt, translates into "slt $at, $rs, $rt" and
+ // BGT, where $rs > $rt, translates into "slt $at, $rt, $rs".
+ // If the result of the SLT is 1, we branch, and if it's 0, we don't.
+ // This is accomplished by using a BNEZ with the result of the SLT.
+ //
+ // The other 2 pseudo-branches are opposites of the above 2 (BGE with BLT
+ // and BLE with BGT), so we change the BNEZ into a BEQZ.
+ // Because only BGE and BLE branch on equality, we can use the
+ // AcceptsEquality variable to decide when to emit the BEQZ.
+ // Note that the order of the SLT arguments doesn't change between
+ // opposites.
+ //
+ // The same applies to the unsigned variants, except that SLTu is used
+ // instead of SLT.
+ TOut.emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum,
+ ReverseOrderSLT ? TrgReg : SrcReg,
+ ReverseOrderSLT ? SrcReg : TrgReg, IDLoc, STI);
+
+ // Branch on the SLT result; the likely variants use BEQL/BNEL.
+ TOut.emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL)
+ : (AcceptsEquality ? Mips::BEQ : Mips::BNE),
+ ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
+ STI);
+ return false;
+}
+
+/// Expands the three-operand division macro (operands: $rd, $rs, $rt) into a
+/// hardware divide surrounded by run-time checks, followed by MFLO into $rd.
+///
+/// \p IsMips64 selects the 64-bit divide opcodes (DSDIV/DUDIV) and ZERO_64;
+/// \p Signed selects the signed divide and enables the extra check emitted
+/// after the divide. The divide-by-zero check is a TEQ with code 7 when
+/// useTraps() is true, and a BNE over a BREAK 7 otherwise.
+///
+/// \returns true if $at is needed but unavailable, false otherwise.
+bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, const bool IsMips64,
+ const bool Signed) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ warnIfNoMacro(IDLoc);
+
+ const MCOperand &RdRegOp = Inst.getOperand(0);
+ assert(RdRegOp.isReg() && "expected register operand kind");
+ unsigned RdReg = RdRegOp.getReg();
+
+ const MCOperand &RsRegOp = Inst.getOperand(1);
+ assert(RsRegOp.isReg() && "expected register operand kind");
+ unsigned RsReg = RsRegOp.getReg();
+
+ const MCOperand &RtRegOp = Inst.getOperand(2);
+ assert(RtRegOp.isReg() && "expected register operand kind");
+ unsigned RtReg = RtRegOp.getReg();
+ unsigned DivOp;
+ unsigned ZeroReg;
+
+ if (IsMips64) {
+ DivOp = Signed ? Mips::DSDIV : Mips::DUDIV;
+ ZeroReg = Mips::ZERO_64;
+ } else {
+ DivOp = Signed ? Mips::SDIV : Mips::UDIV;
+ ZeroReg = Mips::ZERO;
+ }
+
+ bool UseTraps = useTraps();
+
+ // Dividend is $zero.
+ if (RsReg == Mips::ZERO || RsReg == Mips::ZERO_64) {
+ if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)
+ Warning(IDLoc, "dividing zero by zero");
+ if (IsMips64) {
+ // 64-bit signed 0/0: emit only the trap/break. Every other 64-bit case
+ // falls through to the generic handling below.
+ if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) {
+ if (UseTraps) {
+ TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
+ return false;
+ }
+
+ TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+ return false;
+ }
+ } else {
+ // 32-bit: emit just the bare divide, with no checks and no MFLO.
+ TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
+ return false;
+ }
+ }
+
+ // Divisor is $zero.
+ if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) {
+ Warning(IDLoc, "division by zero");
+ if (Signed) {
+ // Signed division by $zero: emit only the trap/break.
+ if (UseTraps) {
+ TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
+ return false;
+ }
+
+ TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+ return false;
+ }
+ }
+
+ // FIXME: The values for these two BranchTarget variables may be different in
+ // micromips. These magic numbers need to be removed.
+ // BranchTargetNoTraps is only assigned, and only read, when !UseTraps.
+ unsigned BranchTargetNoTraps;
+ unsigned BranchTarget;
+
+ if (UseTraps) {
+ BranchTarget = IsMips64 ? 12 : 8;
+ TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
+ } else {
+ BranchTarget = IsMips64 ? 20 : 16;
+ BranchTargetNoTraps = 8;
+ // Branch to the li instruction.
+ TOut.emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc, STI);
+ }
+
+ TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
+
+ if (!UseTraps)
+ TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+
+ // Unsigned division needs no further check: just fetch the quotient.
+ if (!Signed) {
+ TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+ return false;
+ }
+
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+ // Signed only: compare $rt against -1 and, if equal, compare $rs against
+ // the constant built in $at below.
+ // NOTE(review): presumably this is the signed-overflow case (most negative
+ // value divided by -1) -- confirm against the ISA manual.
+ TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, STI);
+ if (IsMips64) {
+ // Branch to the mflo instruction.
+ TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
+ TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, STI);
+ } else {
+ // Branch to the mflo instruction.
+ TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
+ TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI);
+ }
+
+ if (UseTraps)
+ TOut.emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, STI);
+ else {
+ // Branch to the mflo instruction.
+ TOut.emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, STI);
+ // "sll $zero, $zero, 0" is emitted between the branch and the break.
+ TOut.emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, STI);
+ TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
+ }
+ TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+ return false;
+}
+
+/// Expands the three-operand truncate-to-word pseudo.
+///
+/// On anything newer than MIPS I this is a single TRUNC.W.{S,D} from the
+/// second operand into the first. On MIPS I the third operand is used to save
+/// the control register read with CFC1, a modified copy is written back with
+/// CTC1 via $at, the conversion is done with CVT.W.{S,D}, and the saved value
+/// is restored afterwards.
+///
+/// \p IsDouble and \p Is64FPU select between the _S, _D32 and _D64 opcodes.
+///
+/// \returns true if $at is needed but unavailable, false otherwise.
+bool MipsAsmParser::expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU,
+                                SMLoc IDLoc, MCStreamer &Out,
+                                const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+  assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() &&
+         Inst.getOperand(2).isReg() && "Invalid instruction operand.");
+
+  const unsigned ResultReg = Inst.getOperand(0).getReg();
+  const unsigned InputReg = Inst.getOperand(1).getReg();
+  const unsigned SavedCtrlReg = Inst.getOperand(2).getReg();
+
+  // Everything except plain MIPS I has a native truncate instruction.
+  if (!hasMips1() || hasMips2()) {
+    unsigned TruncOpcode = Mips::TRUNC_W_S;
+    if (IsDouble)
+      TruncOpcode = Is64FPU ? Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32;
+    TOut.emitRR(TruncOpcode, ResultReg, InputReg, IDLoc, STI);
+    return false;
+  }
+
+  const unsigned ScratchReg = getATReg(IDLoc);
+  if (!ScratchReg)
+    return true;
+
+  unsigned ConvertOpcode = Mips::CVT_W_S;
+  if (IsDouble)
+    ConvertOpcode = Is64FPU ? Mips::CVT_W_D64 : Mips::CVT_W_D32;
+
+  // Save the control register (the read is emitted twice).
+  TOut.emitRR(Mips::CFC1, SavedCtrlReg, Mips::RA, IDLoc, STI);
+  TOut.emitRR(Mips::CFC1, SavedCtrlReg, Mips::RA, IDLoc, STI);
+  TOut.emitNop(IDLoc, STI);
+  // Build the modified control value in $at and install it.
+  TOut.emitRRI(Mips::ORi, ScratchReg, SavedCtrlReg, 0x3, IDLoc, STI);
+  TOut.emitRRI(Mips::XORi, ScratchReg, ScratchReg, 0x2, IDLoc, STI);
+  TOut.emitRR(Mips::CTC1, Mips::RA, ScratchReg, IDLoc, STI);
+  TOut.emitNop(IDLoc, STI);
+  TOut.emitRR(ConvertOpcode, ResultReg, InputReg, IDLoc, STI);
+  // Restore the saved control register.
+  TOut.emitRR(Mips::CTC1, Mips::RA, SavedCtrlReg, IDLoc, STI);
+  TOut.emitNop(IDLoc, STI);
+  return false;
+}
+
+/// Expands the unaligned load-halfword pseudo (operands: destination
+/// register, base register, immediate offset) into two byte loads, a shift by
+/// 8 and an OR. \p Signed selects LB instead of LBu for the first byte load.
+///
+/// $at is always required. When the offset (or offset + 1) does not fit in a
+/// signed 16-bit immediate, loadImmediate() is called with $at as destination
+/// and the base register as source, and the byte loads then use $at as base
+/// with offsets 0 and 1.
+///
+/// \returns true on error (R6 target, $at unavailable, or loadImmediate
+/// failure), false otherwise.
+bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
+ MCStreamer &Out, const MCSubtargetInfo *STI) {
+ if (hasMips32r6() || hasMips64r6()) {
+ return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
+ }
+
+ const MCOperand &DstRegOp = Inst.getOperand(0);
+ assert(DstRegOp.isReg() && "expected register operand kind");
+ const MCOperand &SrcRegOp = Inst.getOperand(1);
+ assert(SrcRegOp.isReg() && "expected register operand kind");
+ const MCOperand &OffsetImmOp = Inst.getOperand(2);
+ assert(OffsetImmOp.isImm() && "expected immediate operand kind");
+
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned DstReg = DstRegOp.getReg();
+ unsigned SrcReg = SrcRegOp.getReg();
+ int64_t OffsetValue = OffsetImmOp.getImm();
+
+ // NOTE: We always need AT for ULHU, as it is always used as the source
+ // register for one of the LBu's.
+ warnIfNoMacro(IDLoc);
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+ // Both byte offsets must be encodable as signed 16-bit immediates.
+ bool IsLargeOffset = !(isInt<16>(OffsetValue + 1) && isInt<16>(OffsetValue));
+ if (IsLargeOffset) {
+ // NOTE(review): presumably this leaves SrcReg + OffsetValue in $at --
+ // confirm against loadImmediate().
+ if (loadImmediate(OffsetValue, ATReg, SrcReg, !ABI.ArePtrs64bit(), true,
+ IDLoc, Out, STI))
+ return true;
+ }
+
+ // The first load fetches the byte that ends up shifted left by 8; which
+ // address that is depends on the endianness.
+ int64_t FirstOffset = IsLargeOffset ? 0 : OffsetValue;
+ int64_t SecondOffset = IsLargeOffset ? 1 : (OffsetValue + 1);
+ if (isLittle())
+ std::swap(FirstOffset, SecondOffset);
+
+ // In the large-offset case $at holds the base address, so it must be the
+ // destination of the last load only; the register roles are swapped.
+ unsigned FirstLbuDstReg = IsLargeOffset ? DstReg : ATReg;
+ unsigned SecondLbuDstReg = IsLargeOffset ? ATReg : DstReg;
+
+ unsigned LbuSrcReg = IsLargeOffset ? ATReg : SrcReg;
+ unsigned SllReg = IsLargeOffset ? DstReg : ATReg;
+
+ TOut.emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg,
+ FirstOffset, IDLoc, STI);
+ TOut.emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondOffset, IDLoc, STI);
+ TOut.emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, STI);
+ TOut.emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, STI);
+
+ return false;
+}
+
+/// Expands the unaligned store-halfword pseudo (operands: value register,
+/// base register, immediate offset) into two byte stores separated by a right
+/// shift by 8. Note that the local named DstReg holds operand 0, i.e. the
+/// register whose value is stored.
+///
+/// $at is always required. When the offset (or offset + 1) does not fit in a
+/// signed 16-bit immediate, loadImmediate() is called with $at as destination
+/// and the base register as source, and the stores use $at as base.
+///
+/// \returns true on error (R6 target, $at unavailable, or loadImmediate
+/// failure), false otherwise.
+bool MipsAsmParser::expandUsh(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ if (hasMips32r6() || hasMips64r6()) {
+ return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
+ }
+
+ const MCOperand &DstRegOp = Inst.getOperand(0);
+ assert(DstRegOp.isReg() && "expected register operand kind");
+ const MCOperand &SrcRegOp = Inst.getOperand(1);
+ assert(SrcRegOp.isReg() && "expected register operand kind");
+ const MCOperand &OffsetImmOp = Inst.getOperand(2);
+ assert(OffsetImmOp.isImm() && "expected immediate operand kind");
+
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned DstReg = DstRegOp.getReg();
+ unsigned SrcReg = SrcRegOp.getReg();
+ int64_t OffsetValue = OffsetImmOp.getImm();
+
+ warnIfNoMacro(IDLoc);
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+ // Both byte offsets must be encodable as signed 16-bit immediates.
+ bool IsLargeOffset = !(isInt<16>(OffsetValue + 1) && isInt<16>(OffsetValue));
+ if (IsLargeOffset) {
+ // NOTE(review): presumably this leaves SrcReg + OffsetValue in $at --
+ // confirm against loadImmediate().
+ if (loadImmediate(OffsetValue, ATReg, SrcReg, !ABI.ArePtrs64bit(), true,
+ IDLoc, Out, STI))
+ return true;
+ }
+
+ // The first store writes the unshifted (low) byte; which address that is
+ // depends on the endianness.
+ int64_t FirstOffset = IsLargeOffset ? 1 : (OffsetValue + 1);
+ int64_t SecondOffset = IsLargeOffset ? 0 : OffsetValue;
+ if (isLittle())
+ std::swap(FirstOffset, SecondOffset);
+
+ if (IsLargeOffset) {
+ // $at holds the address, so the value register itself is shifted.
+ TOut.emitRRI(Mips::SB, DstReg, ATReg, FirstOffset, IDLoc, STI);
+ TOut.emitRRI(Mips::SRL, DstReg, DstReg, 8, IDLoc, STI);
+ TOut.emitRRI(Mips::SB, DstReg, ATReg, SecondOffset, IDLoc, STI);
+ // NOTE(review): the three instructions below presumably rebuild the
+ // value register that the SRL above modified -- confirm.
+ TOut.emitRRI(Mips::LBu, ATReg, ATReg, 0, IDLoc, STI);
+ TOut.emitRRI(Mips::SLL, DstReg, DstReg, 8, IDLoc, STI);
+ TOut.emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, STI);
+ } else {
+ // $at receives the shifted copy, so the value register is left intact.
+ TOut.emitRRI(Mips::SB, DstReg, SrcReg, FirstOffset, IDLoc, STI);
+ TOut.emitRRI(Mips::SRL, ATReg, DstReg, 8, IDLoc, STI);
+ TOut.emitRRI(Mips::SB, ATReg, SrcReg, SecondOffset, IDLoc, STI);
+ }
+
+ return false;
+}
+
+/// Expands the unaligned word load/store pseudos into an LWL/LWR pair (when
+/// the opcode is Mips::Ulw) or an SWL/SWR pair (otherwise). Operands are the
+/// data register, the base register and an immediate offset.
+///
+/// $at is needed in two situations: when the offset (or offset + 3) does not
+/// fit in a signed 16-bit immediate, and when a load uses the same register
+/// as base and destination.
+///
+/// \returns true on error (R6 target, $at unavailable, or loadImmediate
+/// failure), false otherwise.
+bool MipsAsmParser::expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ if (hasMips32r6() || hasMips64r6()) {
+ return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
+ }
+
+ const MCOperand &DstRegOp = Inst.getOperand(0);
+ assert(DstRegOp.isReg() && "expected register operand kind");
+ const MCOperand &SrcRegOp = Inst.getOperand(1);
+ assert(SrcRegOp.isReg() && "expected register operand kind");
+ const MCOperand &OffsetImmOp = Inst.getOperand(2);
+ assert(OffsetImmOp.isImm() && "expected immediate operand kind");
+
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned DstReg = DstRegOp.getReg();
+ unsigned SrcReg = SrcRegOp.getReg();
+ int64_t OffsetValue = OffsetImmOp.getImm();
+
+ // Compute left/right load/store offsets.
+ bool IsLargeOffset = !(isInt<16>(OffsetValue + 3) && isInt<16>(OffsetValue));
+ int64_t LxlOffset = IsLargeOffset ? 0 : OffsetValue;
+ int64_t LxrOffset = IsLargeOffset ? 3 : (OffsetValue + 3);
+ if (isLittle())
+ std::swap(LxlOffset, LxrOffset);
+
+ bool IsLoadInst = (Inst.getOpcode() == Mips::Ulw);
+ // A load whose destination is also its base would overwrite the base after
+ // the first partial load, so the result is built in $at and moved at the
+ // end.
+ bool DoMove = IsLoadInst && (SrcReg == DstReg) && !IsLargeOffset;
+ unsigned TmpReg = SrcReg;
+ if (IsLargeOffset || DoMove) {
+ warnIfNoMacro(IDLoc);
+ TmpReg = getATReg(IDLoc);
+ if (!TmpReg)
+ return true;
+ }
+
+ if (IsLargeOffset) {
+ // NOTE(review): presumably this leaves SrcReg + OffsetValue in $at --
+ // confirm against loadImmediate().
+ if (loadImmediate(OffsetValue, TmpReg, SrcReg, !ABI.ArePtrs64bit(), true,
+ IDLoc, Out, STI))
+ return true;
+ }
+
+ // After the swap DstReg is $at (load target) and TmpReg is the original
+ // register, which serves as base below and as final destination of the move.
+ if (DoMove)
+ std::swap(DstReg, TmpReg);
+
+ unsigned XWL = IsLoadInst ? Mips::LWL : Mips::SWL;
+ unsigned XWR = IsLoadInst ? Mips::LWR : Mips::SWR;
+ TOut.emitRRI(XWL, DstReg, TmpReg, LxlOffset, IDLoc, STI);
+ TOut.emitRRI(XWR, DstReg, TmpReg, LxrOffset, IDLoc, STI);
+
+ // Move the loaded word from $at into the real destination.
+ if (DoMove)
+ TOut.emitRRR(Mips::OR, TmpReg, DstReg, Mips::ZERO, IDLoc, STI);
+
+ return false;
+}
+
+/// Expands an immediate-form ALU instruction (ADDi, ADDiu, ANDi, NORImm, ORi,
+/// SLTi, SLTiu, XORi) into a load of the immediate followed by the
+/// corresponding register-register instruction.
+///
+/// Operands are the destination register, the source register and the
+/// immediate. The immediate is loaded into the destination register, or into
+/// $at when destination and source are the same register (the source would
+/// otherwise be overwritten before it is read).
+///
+/// The final instruction is always emitted as "op $rd, $rs, <imm register>".
+/// Keeping the source as the first input matters for the non-commutative
+/// SLT/SLTu: "slti $rd, $rs, imm" must test $rs < imm, not imm < $rs.
+///
+/// \returns true if $at is needed but unavailable or if loading the immediate
+/// failed, false otherwise.
+bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
+                                         MCStreamer &Out,
+                                         const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+  assert(Inst.getOperand(0).isReg() &&
+         Inst.getOperand(1).isReg() &&
+         Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
+  unsigned DstReg = Inst.getOperand(0).getReg();
+  unsigned SrcReg = Inst.getOperand(1).getReg();
+  int64_t ImmValue = Inst.getOperand(2).getImm();
+
+  bool Is32Bit = isInt<32>(ImmValue) || isUInt<32>(ImmValue);
+
+  unsigned FinalOpcode = Inst.getOpcode();
+
+  // Register that receives the materialized immediate.
+  unsigned ImmReg = DstReg;
+  if (DstReg == SrcReg) {
+    ImmReg = getATReg(Inst.getLoc());
+    if (!ImmReg)
+      return true;
+  }
+
+  if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, Is32Bit, false,
+                    Inst.getLoc(), Out, STI))
+    return true;
+
+  switch (FinalOpcode) {
+  default:
+    llvm_unreachable("unimplemented expansion");
+  case (Mips::ADDi):
+    FinalOpcode = Mips::ADD;
+    break;
+  case (Mips::ADDiu):
+    FinalOpcode = Mips::ADDu;
+    break;
+  case (Mips::ANDi):
+    FinalOpcode = Mips::AND;
+    break;
+  case (Mips::NORImm):
+    FinalOpcode = Mips::NOR;
+    break;
+  case (Mips::ORi):
+    FinalOpcode = Mips::OR;
+    break;
+  case (Mips::SLTi):
+    FinalOpcode = Mips::SLT;
+    break;
+  case (Mips::SLTiu):
+    FinalOpcode = Mips::SLTu;
+    break;
+  case (Mips::XORi):
+    FinalOpcode = Mips::XOR;
+    break;
+  }
+
+  // Source first, immediate second, in both the aliased and the non-aliased
+  // case.
+  TOut.emitRRR(FinalOpcode, DstReg, SrcReg, ImmReg, IDLoc, STI);
+  return false;
+}
+
+/// Expands the 32-bit register-amount rotate pseudos ROL and ROR
+/// (operands: $rd, $rs, $rt).
+///
+/// On MIPS32r2 and later ROR maps directly to ROTRV, and ROL is a ROTRV by
+/// the negated amount. The negated amount is computed into $rd, or into $at
+/// when $rd and $rs are the same register. $at is requested only on that ROL
+/// path, so ROR never fails for lack of $at.
+///
+/// On plain MIPS32 the rotate is built from two variable shifts and an OR,
+/// which always needs $at.
+///
+/// \returns true if $at is needed but unavailable, or if neither ISA level is
+/// available; false otherwise.
+bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                   const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned DReg = Inst.getOperand(0).getReg();
+  unsigned SReg = Inst.getOperand(1).getReg();
+  unsigned TReg = Inst.getOperand(2).getReg();
+
+  if (hasMips32r2()) {
+
+    if (Inst.getOpcode() == Mips::ROL) {
+      // The negated amount must not overwrite $rs before ROTRV reads it.
+      unsigned TmpReg = DReg;
+      if (DReg == SReg) {
+        TmpReg = getATReg(Inst.getLoc());
+        if (!TmpReg)
+          return true;
+      }
+
+      TOut.emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+      TOut.emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI);
+      return false;
+    }
+
+    if (Inst.getOpcode() == Mips::ROR) {
+      TOut.emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), STI);
+      return false;
+    }
+
+    return true;
+  }
+
+  if (hasMips32()) {
+    unsigned FirstShift = Mips::NOP;
+    unsigned SecondShift = Mips::NOP;
+
+    switch (Inst.getOpcode()) {
+    default:
+      llvm_unreachable("unexpected instruction opcode");
+    case Mips::ROL:
+      FirstShift = Mips::SRLV;
+      SecondShift = Mips::SLLV;
+      break;
+    case Mips::ROR:
+      FirstShift = Mips::SLLV;
+      SecondShift = Mips::SRLV;
+      break;
+    }
+
+    unsigned ATReg = getATReg(Inst.getLoc());
+    if (!ATReg)
+      return true;
+
+    // $at = $rs shifted the opposite way by the negated amount; $rd = $rs
+    // shifted by the amount; the OR merges both halves.
+    TOut.emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+    TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI);
+    TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI);
+    TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
+
+    return false;
+  }
+
+  return true;
+}
+
+/// Expands the 32-bit immediate-amount rotate pseudos ROLImm and RORImm
+/// (operands: $rd, $rs, immediate).
+///
+/// On MIPS32r2 and later a single ROTR is emitted; for ROLImm the amount is
+/// 32 - imm, or 0 when imm is 0. On plain MIPS32 a zero amount becomes
+/// "srl $rd, $rs, 0"; any other amount is built from two immediate shifts
+/// (the first into $at) and an OR.
+///
+/// \returns true if $at is needed but unavailable, or if neither ISA level is
+/// available; false otherwise.
+bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
+                                      MCStreamer &Out,
+                                      const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  const unsigned DestReg = Inst.getOperand(0).getReg();
+  const unsigned ValueReg = Inst.getOperand(1).getReg();
+  const int64_t Amount = Inst.getOperand(2).getImm();
+  const SMLoc Loc = Inst.getLoc();
+
+  if (hasMips32r2()) {
+    switch (Inst.getOpcode()) {
+    case Mips::ROLImm: {
+      // Rotating left by N is rotating right by 32 - N.
+      const uint64_t WordBits = 32;
+      uint64_t RightAmount = Amount;
+      if (Amount != 0)
+        RightAmount = WordBits - Amount;
+      TOut.emitRRI(Mips::ROTR, DestReg, ValueReg, RightAmount, Loc, STI);
+      return false;
+    }
+    case Mips::RORImm:
+      TOut.emitRRI(Mips::ROTR, DestReg, ValueReg, Amount, Loc, STI);
+      return false;
+    default:
+      return true;
+    }
+  }
+
+  if (!hasMips32())
+    return true;
+
+  if (Amount == 0) {
+    TOut.emitRRI(Mips::SRL, DestReg, ValueReg, 0, Loc, STI);
+    return false;
+  }
+
+  unsigned ShiftIntoAT = Mips::NOP;
+  unsigned ShiftIntoDest = Mips::NOP;
+  switch (Inst.getOpcode()) {
+  case Mips::RORImm:
+    ShiftIntoAT = Mips::SRL;
+    ShiftIntoDest = Mips::SLL;
+    break;
+  case Mips::ROLImm:
+    ShiftIntoAT = Mips::SLL;
+    ShiftIntoDest = Mips::SRL;
+    break;
+  default:
+    llvm_unreachable("unexpected instruction opcode");
+  }
+
+  const unsigned ScratchReg = getATReg(Loc);
+  if (!ScratchReg)
+    return true;
+
+  // $at gets one half, $rd the complementary half; the OR merges them.
+  TOut.emitRRI(ShiftIntoAT, ScratchReg, ValueReg, Amount, Loc, STI);
+  TOut.emitRRI(ShiftIntoDest, DestReg, ValueReg, 32 - Amount, Loc, STI);
+  TOut.emitRRR(Mips::OR, DestReg, DestReg, ScratchReg, Loc, STI);
+
+  return false;
+}
+
+/// Expand the 64-bit rotate-by-register pseudo-instructions (DROL/DROR).
+///
+/// With MIPS64r2, DROR is emitted as a single DROTRV and DROL as a negation
+/// of the rotate amount followed by DROTRV. With plain MIPS64 the rotation is
+/// emitted as a negation, two variable shifts and an OR, using the assembler
+/// temporary register.
+///
+/// \returns false on success, true if nothing could be emitted (unsupported
+/// ISA, unexpected opcode in the r2 path, or $at required but unavailable).
+bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                    const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned ATReg = Mips::NoRegister;
+  unsigned DReg = Inst.getOperand(0).getReg();
+  unsigned SReg = Inst.getOperand(1).getReg();
+  unsigned TReg = Inst.getOperand(2).getReg();
+
+  unsigned FirstShift = Mips::NOP;
+  unsigned SecondShift = Mips::NOP;
+
+  if (hasMips64r2()) {
+
+    if (Inst.getOpcode() == Mips::DROL) {
+      // The negated rotate amount needs a scratch register. DReg can serve
+      // unless it is also the source, in which case $at is used. Only DROL
+      // needs this, so DROR never demands $at.
+      unsigned TmpReg = DReg;
+      if (TmpReg == SReg) {
+        TmpReg = getATReg(Inst.getLoc());
+        if (!TmpReg)
+          return true;
+      }
+
+      TOut.emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+      TOut.emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI);
+      return false;
+    }
+
+    if (Inst.getOpcode() == Mips::DROR) {
+      TOut.emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), STI);
+      return false;
+    }
+
+    return true;
+  }
+
+  if (hasMips64()) {
+
+    switch (Inst.getOpcode()) {
+    default:
+      llvm_unreachable("unexpected instruction opcode");
+    case Mips::DROL:
+      FirstShift = Mips::DSRLV;
+      SecondShift = Mips::DSLLV;
+      break;
+    case Mips::DROR:
+      FirstShift = Mips::DSLLV;
+      SecondShift = Mips::DSRLV;
+      break;
+    }
+
+    ATReg = getATReg(Inst.getLoc());
+    if (!ATReg)
+      return true;
+
+    // $at = -TReg; $at = SReg shifted by $at in the opposite direction;
+    // DReg = SReg shifted by TReg; DReg |= $at.
+    TOut.emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+    TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI);
+    TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI);
+    TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
+
+    return false;
+  }
+
+  return true;
+}
+
+/// Expand the 64-bit rotate-by-immediate pseudo-instructions
+/// (DROLImm/DRORImm).
+///
+/// The rotate amount is first reduced modulo 64. With MIPS64r2 a single
+/// DROTR or DROTR32 is emitted; with plain MIPS64 the rotation is emitted as
+/// two shifts (each either the plain or the "32" variant) OR-ed together via
+/// the assembler temporary register.
+///
+/// \returns false on success, true if nothing could be emitted (unsupported
+/// ISA or $at unavailable).
+bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
+                                       MCStreamer &Out,
+                                       const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned ATReg = Mips::NoRegister;
+  unsigned DReg = Inst.getOperand(0).getReg();
+  unsigned SReg = Inst.getOperand(1).getReg();
+  int64_t ImmValue = Inst.getOperand(2).getImm() % 64;
+
+  unsigned FirstShift = Mips::NOP;
+  unsigned SecondShift = Mips::NOP;
+
+  if (hasMips64r2()) {
+
+    // Choose between DROTR and DROTR32 depending on the direction of the
+    // rotation and on which half of the 0..63 range the amount falls in.
+    unsigned FinalOpcode = Mips::NOP;
+    if (ImmValue == 0)
+      FinalOpcode = Mips::DROTR;
+    else if (ImmValue % 32 == 0)
+      FinalOpcode = Mips::DROTR32;
+    else if ((ImmValue >= 1) && (ImmValue <= 32)) {
+      if (Inst.getOpcode() == Mips::DROLImm)
+        FinalOpcode = Mips::DROTR32;
+      else
+        FinalOpcode = Mips::DROTR;
+    } else if (ImmValue >= 33) {
+      if (Inst.getOpcode() == Mips::DROLImm)
+        FinalOpcode = Mips::DROTR;
+      else
+        FinalOpcode = Mips::DROTR32;
+    }
+
+    // The encoded amount is always in 0..31; a left rotation mirrors it.
+    uint64_t ShiftValue = ImmValue % 32;
+    if (Inst.getOpcode() == Mips::DROLImm)
+      ShiftValue = (32 - ImmValue % 32) % 32;
+
+    TOut.emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), STI);
+
+    return false;
+  }
+
+  if (hasMips64()) {
+
+    // Rotating by zero is emitted as a shift by zero from SReg into DReg.
+    if (ImmValue == 0) {
+      TOut.emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), STI);
+      return false;
+    }
+
+    switch (Inst.getOpcode()) {
+    default:
+      llvm_unreachable("unexpected instruction opcode");
+    case Mips::DROLImm:
+      if ((ImmValue >= 1) && (ImmValue <= 31)) {
+        FirstShift = Mips::DSLL;
+        SecondShift = Mips::DSRL32;
+      }
+      if (ImmValue == 32) {
+        FirstShift = Mips::DSLL32;
+        SecondShift = Mips::DSRL32;
+      }
+      if ((ImmValue >= 33) && (ImmValue <= 63)) {
+        FirstShift = Mips::DSLL32;
+        SecondShift = Mips::DSRL;
+      }
+      break;
+    case Mips::DRORImm:
+      if ((ImmValue >= 1) && (ImmValue <= 31)) {
+        FirstShift = Mips::DSRL;
+        SecondShift = Mips::DSLL32;
+      }
+      if (ImmValue == 32) {
+        FirstShift = Mips::DSRL32;
+        SecondShift = Mips::DSLL32;
+      }
+      if ((ImmValue >= 33) && (ImmValue <= 63)) {
+        FirstShift = Mips::DSRL32;
+        SecondShift = Mips::DSLL;
+      }
+      break;
+    }
+
+    ATReg = getATReg(Inst.getLoc());
+    if (!ATReg)
+      return true;
+
+    TOut.emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), STI);
+    TOut.emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32,
+                 Inst.getLoc(), STI);
+    TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
+
+    return false;
+  }
+
+  return true;
+}
+
+/// Expand the absolute-value pseudo-instruction into a BGEZ on the source
+/// register (branch immediate 8), followed by either a move of the source into
+/// the destination (ADDu with $zero) or an empty delay slot when both
+/// registers are the same, and finally a SUB that negates the source.
+///
+/// \returns false (the expansion always succeeds).
+bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                              const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  const unsigned Dst = Inst.getOperand(0).getReg();
+  const unsigned Src = Inst.getOperand(1).getReg();
+
+  TOut.emitRI(Mips::BGEZ, Src, 8, IDLoc, STI);
+
+  if (Dst == Src) {
+    // Nothing to copy: fill the slot after the branch with a no-op.
+    TOut.emitEmptyDelaySlot(false, IDLoc, STI);
+  } else {
+    TOut.emitRRR(Mips::ADDu, Dst, Src, Mips::ZERO, IDLoc, STI);
+  }
+
+  TOut.emitRRR(Mips::SUB, Dst, Mips::ZERO, Src, IDLoc, STI);
+  return false;
+}
+
+/// Return the register that follows \p Reg in the fixed sequence
+/// ZERO, AT, V0, V1, A0-A3, T0-T7, S0-S7, T8, T9, K0, K1, GP, SP, FP, RA.
+/// The sequence wraps around (RA is followed by ZERO). Returns 0 when \p Reg
+/// is not one of the listed registers.
+static unsigned nextReg(unsigned Reg) {
+  static const unsigned Sequence[] = {
+      Mips::ZERO, Mips::AT, Mips::V0, Mips::V1, Mips::A0, Mips::A1, Mips::A2,
+      Mips::A3,   Mips::T0, Mips::T1, Mips::T2, Mips::T3, Mips::T4, Mips::T5,
+      Mips::T6,   Mips::T7, Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4,
+      Mips::S5,   Mips::S6, Mips::S7, Mips::T8, Mips::T9, Mips::K0, Mips::K1,
+      Mips::GP,   Mips::SP, Mips::FP, Mips::RA};
+  const unsigned Count = sizeof(Sequence) / sizeof(Sequence[0]);
+
+  for (unsigned Idx = 0; Idx != Count; ++Idx)
+    if (Sequence[Idx] == Reg)
+      return Sequence[(Idx + 1) % Count];
+
+  return 0;
+}
+
+// For O32 only, expand
+//   'ld $<reg>, offset($reg2)' to 'lw $<reg>, offset($reg2);
+//                                  lw $<reg+1>, offset+4($reg2)'
+// and
+//   'sd $<reg>, offset($reg2)' to 'sw $<reg>, offset($reg2);
+//                                  sw $<reg+1>, offset+4($reg2)'.
+// Returns true (failure) for other ABIs, when there is no successor register,
+// or when either offset does not fit in a signed 16-bit immediate.
+bool MipsAsmParser::expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc,
+                                          MCStreamer &Out,
+                                          const MCSubtargetInfo *STI,
+                                          bool IsLoad) {
+  if (!isABI_O32())
+    return true;
+
+  warnIfNoMacro(IDLoc);
+
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  const unsigned WordOpc = IsLoad ? Mips::LW : Mips::SW;
+  const unsigned LoReg = Inst.getOperand(0).getReg();
+  const unsigned HiReg = nextReg(LoReg);
+  const unsigned BaseReg = Inst.getOperand(1).getReg();
+  if (!HiReg)
+    return true;
+
+  warnIfRegIndexIsAT(LoReg, IDLoc);
+
+  assert(Inst.getOperand(2).isImm() &&
+         "Offset for load macro is not immediate!");
+
+  MCOperand &LoOffset = Inst.getOperand(2);
+  signed HiOffsetValue = LoOffset.getImm() + 4;
+  MCOperand HiOffset = MCOperand::createImm(HiOffsetValue);
+
+  if (!isInt<16>(LoOffset.getImm()) || !isInt<16>(HiOffsetValue))
+    return true;
+
+  // A load whose first destination is also the base register must write that
+  // register last, otherwise the second access would use a clobbered base.
+  const bool LoadOverwritesBase = IsLoad && LoReg == BaseReg;
+  if (LoadOverwritesBase) {
+    TOut.emitRRX(WordOpc, HiReg, BaseReg, HiOffset, IDLoc, STI);
+    TOut.emitRRX(WordOpc, LoReg, BaseReg, LoOffset, IDLoc, STI);
+  } else {
+    TOut.emitRRX(WordOpc, LoReg, BaseReg, LoOffset, IDLoc, STI);
+    TOut.emitRRX(WordOpc, HiReg, BaseReg, HiOffset, IDLoc, STI);
+  }
+
+  return false;
+}
+
+/// Expand the register-register "set on equal" macro.
+///
+/// When neither source is $zero the result is XOR of the two sources followed
+/// by SLTiu against 1. When one source is $zero only the SLTiu on the other
+/// source is emitted.
+///
+/// \returns false (the expansion always succeeds).
+bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                              const MCSubtargetInfo *STI) {
+
+  warnIfNoMacro(IDLoc);
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  const unsigned Dst = Inst.getOperand(0).getReg();
+  const unsigned Lhs = Inst.getOperand(1).getReg();
+  const unsigned Rhs = Inst.getOperand(2).getReg();
+
+  const bool NeitherIsZero = Lhs != Mips::ZERO && Rhs != Mips::ZERO;
+  if (NeitherIsZero) {
+    TOut.emitRRR(Mips::XOR, Dst, Lhs, Rhs, IDLoc, STI);
+    TOut.emitRRI(Mips::SLTiu, Dst, Dst, 1, IDLoc, STI);
+    return false;
+  }
+
+  // One side is $zero: compare the other side against zero directly.
+  const unsigned Other = (Lhs == Mips::ZERO) ? Rhs : Lhs;
+  TOut.emitRRI(Mips::SLTiu, Dst, Other, 1, IDLoc, STI);
+  return false;
+}
+
+/// Expand the register-immediate "set on equal" macro.
+///
+/// - Immediate 0: a single SLTiu against 1.
+/// - Source is $zero (and immediate non-zero): warns that the comparison is
+///   always false and emits an add of $zero to $zero into the destination.
+/// - Immediate in (-0x8000, 0): add the negated immediate, then SLTiu.
+/// - Other immediates that fit in 16 unsigned bits: XORi, then SLTiu.
+/// - Anything else: materialise the immediate in $at, XOR, then SLTiu.
+///
+/// \returns false on success, true if $at is unavailable or the immediate
+/// could not be loaded.
+bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                               const MCSubtargetInfo *STI) {
+
+  warnIfNoMacro(IDLoc);
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  int64_t Imm = Inst.getOperand(2).getImm();
+  const unsigned Src = Inst.getOperand(1).getReg();
+  const unsigned Dst = Inst.getOperand(0).getReg();
+
+  if (Imm == 0) {
+    TOut.emitRRI(Mips::SLTiu, Dst, Src, 1, IDLoc, STI);
+    return false;
+  }
+
+  if (Src == Mips::ZERO) {
+    Warning(IDLoc, "comparison is always false");
+    TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, Dst, Src, Src, IDLoc,
+                 STI);
+    return false;
+  }
+
+  unsigned Opc = Mips::XORi;
+  if (Imm > -0x8000 && Imm < 0) {
+    // Src == Imm  <=>  Src + (-Imm) == 0
+    Imm = -Imm;
+    Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu;
+  }
+
+  if (isUInt<16>(Imm)) {
+    TOut.emitRRI(Opc, Dst, Src, Imm, IDLoc, STI);
+    TOut.emitRRI(Mips::SLTiu, Dst, Dst, 1, IDLoc, STI);
+    return false;
+  }
+
+  // The immediate does not fit: go through the assembler temporary.
+  const unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return true;
+
+  if (loadImmediate(Imm, ATReg, Mips::NoRegister, true, isGP64bit(), IDLoc,
+                    Out, STI))
+    return true;
+
+  TOut.emitRRR(Mips::XOR, Dst, Src, ATReg, IDLoc, STI);
+  TOut.emitRRI(Mips::SLTiu, Dst, Dst, 1, IDLoc, STI);
+  return false;
+}
+
+/// Early match predicate: for DATI/DAHI (and their MM64R6 forms) require that
+/// parsed operand 1 is valid for tying with parsed operand 2; every other
+/// opcode is accepted.
+unsigned
+MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst,
+                                              const OperandVector &Operands) {
+  const unsigned Opc = Inst.getOpcode();
+  const bool HasTiedOperands = Opc == Mips::DATI || Opc == Mips::DAHI ||
+                               Opc == Mips::DATI_MM64R6 ||
+                               Opc == Mips::DAHI_MM64R6;
+  if (!HasTiedOperands)
+    return Match_Success;
+
+  MipsOperand &First = static_cast<MipsOperand &>(*Operands[1]);
+  MipsOperand &Second = static_cast<MipsOperand &>(*Operands[2]);
+  if (First.isValidForTie(Second))
+    return Match_Success;
+  return Match_RequiresSameSrcAndDst;
+}
+/// Post-match predicate enforcing per-instruction operand constraints that
+/// the generated matcher does not check. Returns Match_Success or one of the
+/// Mips-specific Match_Requires* / Match_NonZeroOperandForSync codes.
+unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+  auto IsZeroReg = [](unsigned Reg) {
+    return Reg == Mips::ZERO || Reg == Mips::ZERO_64;
+  };
+
+  switch (Inst.getOpcode()) {
+  default:
+    return Match_Success;
+
+  // The MIPSR6 spec forbids the zero register as daui's source operand.
+  case Mips::DAUI:
+  case Mips::DAUI_MM64R6:
+    if (IsZeroReg(Inst.getOperand(1).getReg()))
+      return Match_RequiresNoZeroRegister;
+    return Match_Success;
+
+  // The Mips32r2 spec requires Rd and Rs of jalr.hb to differ. The same
+  // holds for Rt and Rs of the microMIPSr6 jalrc.hb instruction.
+  case Mips::JALR_HB:
+  case Mips::JALRC_HB_MMR6:
+  case Mips::JALRC_MMR6:
+    if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
+      return Match_RequiresDifferentSrcAndDst;
+    return Match_Success;
+
+  // For the microMIPS lwp instruction Rd and Base must differ.
+  case Mips::LWP_MM:
+  case Mips::LWP_MMR6:
+    if (Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg())
+      return Match_RequiresDifferentSrcAndDst;
+    return Match_Success;
+
+  // A non-zero sync type is only accepted from MIPS32 onwards.
+  case Mips::SYNC:
+    if (Inst.getOperand(0).getImm() != 0 && !hasMips32())
+      return Match_NonZeroOperandForSync;
+    return Match_Success;
+
+  // According to the MIPSR6 spec, compact branches that compare registers
+  // must:
+  //   a) Not use the zero register.
+  //   b) Not use the same register twice.
+  //   c) Have rs < rt for bnec, beqc.
+  //      NB: the encoding swaps the operands for this case because their
+  //      order does not matter (GAS does the same), so this constraint does
+  //      not have to be enforced here.
+  //
+  // Compact branches that branch iff the signed addition of two registers
+  // would overflow must have rs >= rt. That can be handled like beqc/bnec
+  // with operand swapping. They carry no zero-register restriction.
+  case Mips::BLEZC: case Mips::BLEZC_MMR6:
+  case Mips::BGEZC: case Mips::BGEZC_MMR6:
+  case Mips::BGTZC: case Mips::BGTZC_MMR6:
+  case Mips::BLTZC: case Mips::BLTZC_MMR6:
+  case Mips::BEQZC: case Mips::BEQZC_MMR6:
+  case Mips::BNEZC: case Mips::BNEZC_MMR6:
+  case Mips::BLEZC64:
+  case Mips::BGEZC64:
+  case Mips::BGTZC64:
+  case Mips::BLTZC64:
+  case Mips::BEQZC64:
+  case Mips::BNEZC64:
+    if (IsZeroReg(Inst.getOperand(0).getReg()))
+      return Match_RequiresNoZeroRegister;
+    return Match_Success;
+
+  case Mips::BGEC: case Mips::BGEC_MMR6:
+  case Mips::BLTC: case Mips::BLTC_MMR6:
+  case Mips::BGEUC: case Mips::BGEUC_MMR6:
+  case Mips::BLTUC: case Mips::BLTUC_MMR6:
+  case Mips::BEQC: case Mips::BEQC_MMR6:
+  case Mips::BNEC: case Mips::BNEC_MMR6:
+  case Mips::BGEC64:
+  case Mips::BLTC64:
+  case Mips::BGEUC64:
+  case Mips::BLTUC64:
+  case Mips::BEQC64:
+  case Mips::BNEC64:
+    if (IsZeroReg(Inst.getOperand(0).getReg()) ||
+        IsZeroReg(Inst.getOperand(1).getReg()))
+      return Match_RequiresNoZeroRegister;
+    if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
+      return Match_RequiresDifferentOperands;
+    return Match_Success;
+  }
+}
+
+/// Pick the most precise location for a diagnostic: the start of operand
+/// number \p ErrorInfo when that index is valid and the operand has a
+/// location, otherwise the fallback \p Loc.
+static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands,
+                            uint64_t ErrorInfo) {
+  const bool HasOperandIndex =
+      ErrorInfo != ~0ULL && ErrorInfo < Operands.size();
+  if (!HasOperandIndex)
+    return Loc;
+
+  SMLoc OperandLoc = Operands[ErrorInfo]->getStartLoc();
+  if (OperandLoc == SMLoc())
+    return Loc;
+  return OperandLoc;
+}
+
+// Match the parsed operands against the instruction table and, on success,
+// hand the instruction to processInstruction(). Every other match result is
+// turned into a diagnostic. Returns true on error, false on success.
+bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+
+ MCInst Inst;
+ unsigned MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+ switch (MatchResult) {
+ case Match_Success: {
+ if (processInstruction(Inst, IDLoc, Out, STI))
+ return true;
+ return false;
+ }
+ case Match_MissingFeature:
+ Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+ return true;
+ case Match_InvalidOperand: {
+ // Point at the offending operand when ErrorInfo identifies one; an index
+ // past the end of the operand list means an operand is missing.
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = Operands[ErrorInfo]->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ // Results produced by the target match predicates; these are reported at
+ // the instruction location.
+ case Match_NonZeroOperandForSync:
+ return Error(IDLoc, "s-type must be zero or unspecified for pre-MIPS32 ISAs");
+ case Match_MnemonicFail:
+ return Error(IDLoc, "invalid instruction");
+ case Match_RequiresDifferentSrcAndDst:
+ return Error(IDLoc, "source and destination must be different");
+ case Match_RequiresDifferentOperands:
+ return Error(IDLoc, "registers must be different");
+ case Match_RequiresNoZeroRegister:
+ return Error(IDLoc, "invalid operand ($zero) for instruction");
+ case Match_RequiresSameSrcAndDst:
+ return Error(IDLoc, "source and destination must match");
+ // Operand-class diagnostics; these are reported at the operand location
+ // when RefineErrorLoc() can determine one.
+ case Match_Immz:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected '0'");
+ case Match_UImm1_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 1-bit unsigned immediate");
+ case Match_UImm2_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 2-bit unsigned immediate");
+ case Match_UImm2_1:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 1 .. 4");
+ case Match_UImm3_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 3-bit unsigned immediate");
+ case Match_UImm4_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 4-bit unsigned immediate");
+ case Match_SImm4_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 4-bit signed immediate");
+ case Match_UImm5_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 5-bit unsigned immediate");
+ case Match_SImm5_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 5-bit signed immediate");
+ case Match_UImm5_1:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 1 .. 32");
+ case Match_UImm5_32:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 32 .. 63");
+ case Match_UImm5_33:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 33 .. 64");
+ case Match_UImm5_0_Report_UImm6:
+ // This is used on UImm5 operands that have a corresponding UImm5_32
+ // operand to avoid confusing the user.
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 6-bit unsigned immediate");
+ case Match_UImm5_Lsl2:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected both 7-bit unsigned immediate and multiple of 4");
+ case Match_UImmRange2_64:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 2 .. 64");
+ case Match_UImm6_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 6-bit unsigned immediate");
+ case Match_UImm6_Lsl2:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected both 8-bit unsigned immediate and multiple of 4");
+ case Match_SImm6_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 6-bit signed immediate");
+ case Match_UImm7_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 7-bit unsigned immediate");
+ case Match_UImm7_N1:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range -1 .. 126");
+ case Match_SImm7_Lsl2:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected both 9-bit signed immediate and multiple of 4");
+ case Match_UImm8_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 8-bit unsigned immediate");
+ case Match_UImm10_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 10-bit unsigned immediate");
+ case Match_SImm10_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 10-bit signed immediate");
+ case Match_SImm11_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 11-bit signed immediate");
+ case Match_UImm16:
+ case Match_UImm16_Relaxed:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 16-bit unsigned immediate");
+ case Match_SImm16:
+ case Match_SImm16_Relaxed:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 16-bit signed immediate");
+ case Match_SImm19_Lsl2:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected both 19-bit signed immediate and multiple of 4");
+ case Match_UImm20_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 20-bit unsigned immediate");
+ case Match_UImm26_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 26-bit unsigned immediate");
+ case Match_SImm32:
+ case Match_SImm32_Relaxed:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 32-bit signed immediate");
+ case Match_UImm32_Coerced:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 32-bit immediate");
+ case Match_MemSImm9:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 9-bit signed offset");
+ case Match_MemSImm10:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 10-bit signed offset");
+ case Match_MemSImm10Lsl1:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 11-bit signed offset and multiple of 2");
+ case Match_MemSImm10Lsl2:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 12-bit signed offset and multiple of 4");
+ case Match_MemSImm10Lsl3:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 13-bit signed offset and multiple of 8");
+ case Match_MemSImm11:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 11-bit signed offset");
+ case Match_MemSImm12:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 12-bit signed offset");
+ case Match_MemSImm16:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 16-bit signed offset");
+ }
+
+ // Reached only for a match result that has no case above.
+ llvm_unreachable("Implement any new match types added!");
+}
+
+/// Warn at \p Loc when \p RegIndex is non-zero and equals the register index
+/// currently configured as the assembler temporary.
+void MipsAsmParser::warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc) {
+  if (RegIndex == 0)
+    return;
+  if (AssemblerOptions.back()->getATRegIndex() != RegIndex)
+    return;
+
+  Warning(Loc, "used $at (currently $" + Twine(RegIndex) +
+                   ") without \".set noat\"");
+}
+
+/// Warn at \p Loc when the current assembler options report that macros are
+/// disabled (isMacro() is false).
+void MipsAsmParser::warnIfNoMacro(SMLoc Loc) {
+  if (AssemblerOptions.back()->isMacro())
+    return;
+
+  Warning(Loc, "macro instruction expanded into multiple instructions");
+}
+
+/// Print warning \p Msg for \p Range through the source manager, attaching a
+/// fix-it hint with text \p FixMsg that covers the same range.
+void
+MipsAsmParser::printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
+                                     SMRange Range, bool ShowColors) {
+  SMFixIt Suggestion(Range, FixMsg);
+  getSourceManager().PrintMessage(Range.Start, SourceMgr::DK_Warning, Msg,
+                                  Range, Suggestion, ShowColors);
+}
+
+/// Map a symbolic general-purpose register name (without the '$') to its
+/// register number, or return -1 if the name is not recognised.
+///
+/// For the N32/N64 ABIs the names t4-t7 trigger a warning with a fix-it,
+/// t0-t3 are mapped onto registers 12-15, and the additional names a4-a7 and
+/// kt0/kt1 are accepted.
+int MipsAsmParser::matchCPURegisterName(StringRef Name) {
+  // Use a signed switch so that the -1 "no match" sentinel is represented
+  // exactly instead of going through an unsigned-to-int narrowing.
+  int CC = StringSwitch<int>(Name)
+               .Case("zero", 0)
+               .Case("at", 1)
+               .Case("a0", 4)
+               .Case("a1", 5)
+               .Case("a2", 6)
+               .Case("a3", 7)
+               .Case("v0", 2)
+               .Case("v1", 3)
+               .Case("s0", 16)
+               .Case("s1", 17)
+               .Case("s2", 18)
+               .Case("s3", 19)
+               .Case("s4", 20)
+               .Case("s5", 21)
+               .Case("s6", 22)
+               .Case("s7", 23)
+               .Case("k0", 26)
+               .Case("k1", 27)
+               .Case("gp", 28)
+               .Case("sp", 29)
+               .Case("fp", 30)
+               .Case("s8", 30)
+               .Case("ra", 31)
+               .Case("t0", 8)
+               .Case("t1", 9)
+               .Case("t2", 10)
+               .Case("t3", 11)
+               .Case("t4", 12)
+               .Case("t5", 13)
+               .Case("t6", 14)
+               .Case("t7", 15)
+               .Case("t8", 24)
+               .Case("t9", 25)
+               .Default(-1);
+
+  if (!(isABI_N32() || isABI_N64()))
+    return CC;
+
+  if (12 <= CC && CC <= 15) {
+    // Name is one of t4-t7
+    AsmToken RegTok = getLexer().peekTok();
+    SMRange RegRange = RegTok.getLocRange();
+
+    StringRef FixedName = StringSwitch<StringRef>(Name)
+                              .Case("t4", "t0")
+                              .Case("t5", "t1")
+                              .Case("t6", "t2")
+                              .Case("t7", "t3")
+                              .Default("");
+    assert(FixedName != "" && "Register name is not one of t4-t7.");
+
+    printWarningWithFixIt("register names $t4-$t7 are only available in O32.",
+                          "Did you mean $" + FixedName + "?", RegRange);
+  }
+
+  // Although SGI documentation just cuts out t0-t3 for n32/n64,
+  // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
+  // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7.
+  if (8 <= CC && CC <= 11)
+    CC += 4;
+
+  // Names that only exist for N32/N64.
+  if (CC == -1)
+    CC = StringSwitch<int>(Name)
+             .Case("a4", 8)
+             .Case("a5", 9)
+             .Case("a6", 10)
+             .Case("a7", 11)
+             .Case("kt0", 26)
+             .Case("kt1", 27)
+             .Default(-1);
+
+  return CC;
+}
+
+/// Map a symbolic hardware register name to its number, or return -1 if the
+/// name is not recognised.
+int MipsAsmParser::matchHWRegsRegisterName(StringRef Name) {
+  // A signed switch keeps the -1 sentinel exact (no unsigned-to-int
+  // narrowing on the way out).
+  return StringSwitch<int>(Name)
+      .Case("hwr_cpunum", 0)
+      .Case("hwr_synci_step", 1)
+      .Case("hwr_cc", 2)
+      .Case("hwr_ccres", 3)
+      .Case("hwr_ulr", 29)
+      .Default(-1);
+}
+
+/// Parse a name of the form "f<N>" with N in 0..31 and return N, or return -1
+/// if the name does not have that form.
+int MipsAsmParser::matchFPURegisterName(StringRef Name) {
+  // startswith() is safe on an empty name, unlike indexing Name[0].
+  if (!Name.startswith("f"))
+    return -1;
+
+  StringRef NumString = Name.substr(1);
+  unsigned IntVal;
+  if (NumString.getAsInteger(10, IntVal))
+    return -1; // This is not an integer.
+  if (IntVal > 31) // Maximum index for fpu register.
+    return -1;
+  return IntVal;
+}
+
+/// Parse a name of the form "fcc<N>" with N in 0..7 and return N, or return
+/// -1 if the name does not have that form.
+int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
+  if (!Name.startswith("fcc"))
+    return -1;
+
+  unsigned Index;
+  const bool NotANumber = Name.substr(3).getAsInteger(10, Index);
+  if (NotANumber)
+    return -1;
+
+  // fcc0 through fcc7 are the only valid condition-code registers.
+  return Index > 7 ? -1 : static_cast<int>(Index);
+}
+
+/// Parse a name of the form "ac<N>" with N in 0..3 and return N, or return -1
+/// if the name does not have that form.
+int MipsAsmParser::matchACRegisterName(StringRef Name) {
+  if (!Name.startswith("ac"))
+    return -1;
+
+  unsigned Index;
+  const bool NotANumber = Name.substr(2).getAsInteger(10, Index);
+  if (NotANumber)
+    return -1;
+
+  // Indices 0 through 3 are accepted (ac0..ac3).
+  return Index > 3 ? -1 : static_cast<int>(Index);
+}
+
+/// Parse a name of the form "w<N>" with N in 0..31 and return N, or return -1
+/// if the name does not have that form.
+int MipsAsmParser::matchMSA128RegisterName(StringRef Name) {
+  unsigned IntVal;
+
+  // startswith() is safe on an empty name, unlike Name.front().
+  if (!Name.startswith("w") || Name.drop_front(1).getAsInteger(10, IntVal))
+    return -1;
+
+  if (IntVal > 31)
+    return -1;
+
+  return IntVal;
+}
+
+/// Map a symbolic MSA control register name to its number, or return -1 if
+/// the name is not recognised.
+int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
+  // A signed switch keeps the -1 sentinel exact (no unsigned-to-int
+  // narrowing on the way out).
+  return StringSwitch<int>(Name)
+      .Case("msair", 0)
+      .Case("msacsr", 1)
+      .Case("msaaccess", 2)
+      .Case("msasave", 3)
+      .Case("msamodify", 4)
+      .Case("msarequest", 5)
+      .Case("msamap", 6)
+      .Case("msaunmap", 7)
+      .Default(-1);
+}
+
+/// Return the register currently usable as the assembler temporary, taken
+/// from the 64-bit or 32-bit GPR class depending on isGP64bit(). When the
+/// configured AT index is 0, a parse error is reported at \p Loc and 0 is
+/// returned.
+unsigned MipsAsmParser::getATReg(SMLoc Loc) {
+  const unsigned ATIndex = AssemblerOptions.back()->getATRegIndex();
+  if (ATIndex != 0) {
+    const int RegClass =
+        isGP64bit() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID;
+    return getReg(RegClass, ATIndex);
+  }
+
+  reportParseError(Loc,
+                   "pseudo-instruction requires $at, which is not available");
+  return 0;
+}
+
+/// Return the register at position \p RegNo within register class \p RC.
+unsigned MipsAsmParser::getReg(int RC, int RegNo) {
+  const MCRegisterClass &Class = getContext().getRegisterInfo()->getRegClass(RC);
+  return *(Class.begin() + RegNo);
+}
+
+// Parse one operand of the instruction named by Mnemonic and append it to
+// Operands. A custom operand parser is tried first; if none matches, the
+// operand is parsed generically as a register, a '$'-prefixed symbol
+// reference, or an expression. Returns true on error, false on success.
+bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+ MCAsmParser &Parser = getParser();
+ DEBUG(dbgs() << "parseOperand\n");
+
+ // Check if the current operand has a custom associated parser, if so, try to
+ // custom parse the operand, or fallback to the general approach.
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+ if (ResTy == MatchOperand_Success)
+ return false;
+ // If there wasn't a custom match, try the generic matcher below. Otherwise,
+ // there was a match, but an error occurred, in which case, just return that
+ // the operand parsing failed.
+ if (ResTy == MatchOperand_ParseFail)
+ return true;
+
+ DEBUG(dbgs() << ".. Generic Parser\n");
+
+ switch (getLexer().getKind()) {
+ case AsmToken::Dollar: {
+ // Parse the register.
+ SMLoc S = Parser.getTok().getLoc();
+
+ // Almost all registers have been parsed by custom parsers. There is only
+ // one exception to this. $zero (and its alias $0) will reach this point
+ // for div, divu, and similar instructions because it is not an operand
+ // to the instruction definition but an explicit register. Special case
+ // this situation for now.
+ if (parseAnyRegister(Operands) != MatchOperand_NoMatch)
+ return false;
+
+ // Maybe it is a symbol reference.
+ StringRef Identifier;
+ if (Parser.parseIdentifier(Identifier))
+ return true;
+
+ // The operand ends one character before the current token.
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ MCSymbol *Sym = getContext().getOrCreateSymbol("$" + Identifier);
+ // Otherwise create a symbol reference.
+ const MCExpr *Res =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+ Operands.push_back(MipsOperand::CreateImm(Res, S, E, *this));
+ return false;
+ }
+ default: {
+ DEBUG(dbgs() << ".. generic integer expression\n");
+
+ const MCExpr *Expr;
+ SMLoc S = Parser.getTok().getLoc(); // Start location of the operand.
+ if (getParser().parseExpression(Expr))
+ return true;
+
+ // The operand ends one character before the current token.
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ Operands.push_back(MipsOperand::CreateImm(Expr, S, E, *this));
+ return false;
+ }
+ } // switch(getLexer().getKind())
+ // Not reached: every case above returns.
+ return true;
+}
+
+/// Recursively classify \p Expr: constants and target expressions count as
+/// evaluated, a symbol reference counts only when its kind is not VK_None,
+/// and unary/binary expressions count when all of their operands do.
+bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
+
+  switch (Expr->getKind()) {
+  case MCExpr::Constant:
+    return true;
+  case MCExpr::SymbolRef:
+    return (cast<MCSymbolRefExpr>(Expr)->getKind() != MCSymbolRefExpr::VK_None);
+  case MCExpr::Binary: {
+    // The kind is already known, so use cast<> and always return from this
+    // case rather than falling through into the unary handling.
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    if (!isEvaluated(BE->getLHS()))
+      return false;
+    return isEvaluated(BE->getRHS());
+  }
+  case MCExpr::Unary:
+    return isEvaluated(cast<MCUnaryExpr>(Expr)->getSubExpr());
+  case MCExpr::Target:
+    return true;
+  }
+  return false;
+}
+
+/// Parse a register and store it in \p RegNo, resolved to the GPR64 or GPR32
+/// register depending on isGP64bit(). On a successful parse \p StartLoc and
+/// \p EndLoc receive the operand's location.
+///
+/// \returns false when a GPR was parsed; true otherwise, in which case
+/// \p RegNo is set to (unsigned)-1.
+bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                  SMLoc &EndLoc) {
+  // Start from the "no register" sentinel so the result never depends on
+  // whatever value (possibly uninitialized) the caller left in RegNo.
+  RegNo = static_cast<unsigned>(-1);
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
+  OperandMatchResultTy ResTy = parseAnyRegister(Operands);
+  if (ResTy == MatchOperand_Success) {
+    assert(Operands.size() == 1);
+    MipsOperand &Operand = static_cast<MipsOperand &>(*Operands.front());
+    StartLoc = Operand.getStartLoc();
+    EndLoc = Operand.getEndLoc();
+
+    // AFAIK, we only support numeric registers and named GPR's in CFI
+    // directives.
+    // Don't worry about eating tokens before failing. Using an unrecognised
+    // register is a parse error.
+    if (Operand.isGPRAsmReg()) {
+      // Resolve to GPR32 or GPR64 appropriately.
+      RegNo = isGP64bit() ? Operand.getGPR64Reg() : Operand.getGPR32Reg();
+    }
+
+    return (RegNo == (unsigned)-1);
+  }
+
+  assert(Operands.size() == 0);
+  return (RegNo == (unsigned)-1);
+}
+
+/// Parse the offset expression of a memory operand into \p Res. When
+/// \p isParenExpr is set the expression is parsed as a parenthesised
+/// expression of depth 0. Returns the underlying parser's result.
+bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
+  if (!isParenExpr)
+    return getParser().parseExpression(Res);
+
+  SMLoc EndLoc;
+  return getParser().parseParenExprOfDepth(0, Res, EndLoc);
+}
+
+// Parse a memory operand of the form "offset($reg)", "($reg)" or a bare
+// expression, and append the result to Operands. For "la"/"dla" a bare
+// expression becomes an immediate operand; at end of statement a bare
+// expression becomes a memory operand with register 0 as its base.
+// Returns MatchOperand_Success, MatchOperand_ParseFail after an error, or
+// whatever parseAnyRegister() returns when the base register fails to parse.
+OperandMatchResultTy
+MipsAsmParser::parseMemOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ DEBUG(dbgs() << "parseMemOperand\n");
+ const MCExpr *IdVal = nullptr;
+ SMLoc S;
+ bool isParenExpr = false;
+ OperandMatchResultTy Res = MatchOperand_NoMatch;
+ // First operand is the offset.
+ S = Parser.getTok().getLoc();
+
+ // A leading '(' either opens a parenthesised offset expression or directly
+ // introduces the base register.
+ if (getLexer().getKind() == AsmToken::LParen) {
+ Parser.Lex();
+ isParenExpr = true;
+ }
+
+ if (getLexer().getKind() != AsmToken::Dollar) {
+ if (parseMemOffset(IdVal, isParenExpr))
+ return MatchOperand_ParseFail;
+
+ const AsmToken &Tok = Parser.getTok(); // Get the next token.
+ if (Tok.isNot(AsmToken::LParen)) {
+ // Operands[0] holds the mnemonic token.
+ MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]);
+ if (Mnemonic.getToken() == "la" || Mnemonic.getToken() == "dla") {
+ SMLoc E =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
+ return MatchOperand_Success;
+ }
+ if (Tok.is(AsmToken::EndOfStatement)) {
+ SMLoc E =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ // Zero register assumed, add a memory operand with ZERO as its base.
+ // "Base" will be managed by k_Memory.
+ auto Base = MipsOperand::createGPRReg(
+ 0, "0", getContext().getRegisterInfo(), S, E, *this);
+ Operands.push_back(
+ MipsOperand::CreateMem(std::move(Base), IdVal, S, E, *this));
+ return MatchOperand_Success;
+ }
+ MCBinaryExpr::Opcode Opcode;
+ // GAS and LLVM treat comparison operators different. GAS will generate -1
+ // or 0, while LLVM will generate 0 or 1. Since a comparison operator is
+ // highly unlikely to be found in a memory offset expression, we don't
+ // handle them.
+ switch (Tok.getKind()) {
+ case AsmToken::Plus:
+ Opcode = MCBinaryExpr::Add;
+ Parser.Lex();
+ break;
+ case AsmToken::Minus:
+ Opcode = MCBinaryExpr::Sub;
+ Parser.Lex();
+ break;
+ case AsmToken::Star:
+ Opcode = MCBinaryExpr::Mul;
+ Parser.Lex();
+ break;
+ case AsmToken::Pipe:
+ Opcode = MCBinaryExpr::Or;
+ Parser.Lex();
+ break;
+ case AsmToken::Amp:
+ Opcode = MCBinaryExpr::And;
+ Parser.Lex();
+ break;
+ case AsmToken::LessLess:
+ Opcode = MCBinaryExpr::Shl;
+ Parser.Lex();
+ break;
+ case AsmToken::GreaterGreater:
+ Opcode = MCBinaryExpr::LShr;
+ Parser.Lex();
+ break;
+ case AsmToken::Caret:
+ Opcode = MCBinaryExpr::Xor;
+ Parser.Lex();
+ break;
+ case AsmToken::Slash:
+ Opcode = MCBinaryExpr::Div;
+ Parser.Lex();
+ break;
+ case AsmToken::Percent:
+ Opcode = MCBinaryExpr::Mod;
+ Parser.Lex();
+ break;
+ default:
+ Error(Parser.getTok().getLoc(), "'(' or expression expected");
+ return MatchOperand_ParseFail;
+ }
+ // Combine the already-parsed offset with the expression that follows the
+ // operator.
+ const MCExpr * NextExpr;
+ if (getParser().parseExpression(NextExpr))
+ return MatchOperand_ParseFail;
+ IdVal = MCBinaryExpr::create(Opcode, IdVal, NextExpr, getContext());
+ }
+
+ Parser.Lex(); // Eat the '(' token.
+ }
+
+ // Parse the base register; it is pushed onto Operands and replaced by the
+ // memory operand below.
+ Res = parseAnyRegister(Operands);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ if (Parser.getTok().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "')' expected");
+ return MatchOperand_ParseFail;
+ }
+
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ Parser.Lex(); // Eat the ')' token.
+
+ // No explicit offset means an offset of zero.
+ if (!IdVal)
+ IdVal = MCConstantExpr::create(0, getContext());
+
+ // Replace the register operand with the memory operand.
+ std::unique_ptr<MipsOperand> op(
+ static_cast<MipsOperand *>(Operands.back().release()));
+ // Remove the register from the operands.
+ // "op" will be managed by k_Memory.
+ Operands.pop_back();
+ // Add the memory operand.
+ if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(IdVal)) {
+ int64_t Imm;
+ // Fold the offset to a constant when it can be evaluated now; otherwise,
+ // if the left-hand side is not a symbol reference, swap the two sides.
+ // NOTE(review): the swap is applied regardless of opcode, including
+ // non-commutative ones such as Sub - confirm this is intended.
+ if (IdVal->evaluateAsAbsolute(Imm))
+ IdVal = MCConstantExpr::create(Imm, getContext());
+ else if (BE->getLHS()->getKind() != MCExpr::SymbolRef)
+ IdVal = MCBinaryExpr::create(BE->getOpcode(), BE->getRHS(), BE->getLHS(),
+ getContext());
+ }
+
+ Operands.push_back(MipsOperand::CreateMem(std::move(op), IdVal, S, E, *this));
+ return MatchOperand_Success;
+}
+
+/// Tries to resolve the current identifier token as an alias for a register.
+///
+/// The identifier is looked up in the MC context. If it names a variable
+/// symbol whose value is a plain symbol reference starting with '$', the rest
+/// of that name is matched against the known register names. On a match the
+/// register operand is appended to \p Operands, the identifier token is
+/// consumed, and true is returned. In every other case nothing is consumed
+/// and false is returned.
+bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  MCSymbol *Alias = getContext().lookupSymbol(Parser.getTok().getIdentifier());
+  if (!Alias)
+    return false;
+
+  SMLoc AliasLoc = Parser.getTok().getLoc();
+
+  // Only variable symbols carry a value that could name a register.
+  if (!Alias->isVariable())
+    return false;
+
+  const MCExpr *AliasValue = Alias->getVariableValue();
+  if (AliasValue->getKind() != MCExpr::SymbolRef)
+    return false;
+
+  const MCSymbolRefExpr *Target =
+      static_cast<const MCSymbolRefExpr *>(AliasValue);
+  StringRef TargetName = Target->getSymbol().getName();
+  if (!TargetName.startswith("$"))
+    return false;
+
+  // Match the name with the leading '$' stripped.
+  OperandMatchResultTy Matched = matchAnyRegisterNameWithoutDollar(
+      Operands, TargetName.substr(1), AliasLoc);
+  if (Matched == MatchOperand_Success) {
+    Parser.Lex();
+    return true;
+  }
+  if (Matched == MatchOperand_ParseFail)
+    llvm_unreachable("Should never ParseFail");
+  return false;
+}
+
+/// Matches \p Identifier (a register name without its leading '$') against
+/// each register-name table in turn and appends the matching register operand
+/// to \p Operands.
+///
+/// The tables are tried in a fixed order: CPU, HWRegs, FPU, FCC, AC, MSA128
+/// and MSA128 control. The first table that recognises the name wins.
+///
+/// \param S start location recorded on the created operand; the end location
+///          is the lexer's current location.
+/// \returns MatchOperand_Success if an operand was pushed, otherwise
+///          MatchOperand_NoMatch.
+OperandMatchResultTy
+MipsAsmParser::matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+                                                 StringRef Identifier,
+                                                 SMLoc S) {
+  // Every operand factory below takes the same register info and end location.
+  auto RegInfo = getContext().getRegisterInfo();
+  SMLoc EndLoc = getLexer().getLoc();
+
+  int GPRIdx = matchCPURegisterName(Identifier);
+  if (GPRIdx != -1) {
+    Operands.push_back(MipsOperand::createGPRReg(GPRIdx, Identifier, RegInfo,
+                                                 S, EndLoc, *this));
+    return MatchOperand_Success;
+  }
+
+  int HWRegIdx = matchHWRegsRegisterName(Identifier);
+  if (HWRegIdx != -1) {
+    Operands.push_back(MipsOperand::createHWRegsReg(
+        HWRegIdx, Identifier, RegInfo, S, EndLoc, *this));
+    return MatchOperand_Success;
+  }
+
+  int FPRIdx = matchFPURegisterName(Identifier);
+  if (FPRIdx != -1) {
+    Operands.push_back(MipsOperand::createFGRReg(FPRIdx, Identifier, RegInfo,
+                                                 S, EndLoc, *this));
+    return MatchOperand_Success;
+  }
+
+  int FCCIdx = matchFCCRegisterName(Identifier);
+  if (FCCIdx != -1) {
+    Operands.push_back(MipsOperand::createFCCReg(FCCIdx, Identifier, RegInfo,
+                                                 S, EndLoc, *this));
+    return MatchOperand_Success;
+  }
+
+  int ACCIdx = matchACRegisterName(Identifier);
+  if (ACCIdx != -1) {
+    Operands.push_back(MipsOperand::createACCReg(ACCIdx, Identifier, RegInfo,
+                                                 S, EndLoc, *this));
+    return MatchOperand_Success;
+  }
+
+  int MSAIdx = matchMSA128RegisterName(Identifier);
+  if (MSAIdx != -1) {
+    Operands.push_back(MipsOperand::createMSA128Reg(
+        MSAIdx, Identifier, RegInfo, S, EndLoc, *this));
+    return MatchOperand_Success;
+  }
+
+  int MSACtrlIdx = matchMSA128CtrlRegisterName(Identifier);
+  if (MSACtrlIdx != -1) {
+    Operands.push_back(MipsOperand::createMSACtrlReg(
+        MSACtrlIdx, Identifier, RegInfo, S, EndLoc, *this));
+    return MatchOperand_Success;
+  }
+
+  return MatchOperand_NoMatch;
+}
+
+/// Peeks at the token after the current one and tries to turn it into a
+/// register operand, without consuming anything.
+///
+/// An identifier is matched by name; an integer becomes a numeric register
+/// operand. Any other token kind yields MatchOperand_NoMatch.
+///
+/// \param S start location recorded on the created operand.
+OperandMatchResultTy
+MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
+  MCAsmParser &Parser = getParser();
+  auto Peeked = Parser.getLexer().peekTok(false);
+
+  switch (Peeked.getKind()) {
+  case AsmToken::Identifier:
+    DEBUG(dbgs() << ".. identifier\n");
+    return matchAnyRegisterNameWithoutDollar(Operands, Peeked.getIdentifier(),
+                                             S);
+  case AsmToken::Integer:
+    DEBUG(dbgs() << ".. integer\n");
+    Operands.push_back(MipsOperand::createNumericReg(
+        Peeked.getIntVal(), Peeked.getString(), getContext().getRegisterInfo(),
+        S, Peeked.getLoc(), *this));
+    return MatchOperand_Success;
+  default:
+    break;
+  }
+
+  DEBUG(dbgs() << Parser.getTok().getKind() << "\n");
+
+  return MatchOperand_NoMatch;
+}
+
+/// Parses a register operand of any class at the current position.
+///
+/// If the current token is '$', the following token is matched as a register
+/// name or number, and on success both tokens are consumed. Otherwise, an
+/// identifier is tried as a symbol alias for a register. Anything else is
+/// MatchOperand_NoMatch.
+OperandMatchResultTy
+MipsAsmParser::parseAnyRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "parseAnyRegister\n");
+
+  auto Current = Parser.getTok();
+  SMLoc StartLoc = Current.getLoc();
+
+  if (Current.is(AsmToken::Dollar)) {
+    DEBUG(dbgs() << ".. $\n");
+
+    OperandMatchResultTy Matched =
+        matchAnyRegisterWithoutDollar(Operands, StartLoc);
+    if (Matched == MatchOperand_Success) {
+      Parser.Lex(); // $
+      Parser.Lex(); // identifier
+    }
+    return Matched;
+  }
+
+  DEBUG(dbgs() << ".. !$ -> try sym aliasing\n");
+  if (Current.is(AsmToken::Identifier) && searchSymbolAlias(Operands))
+    return MatchOperand_Success;
+
+  DEBUG(dbgs() << ".. !symalias -> NoMatch\n");
+  return MatchOperand_NoMatch;
+}
+
+/// Parses a jump target: either a register or an arbitrary expression.
+///
+/// A register is tried first. If register parsing reports anything other than
+/// NoMatch, that result is returned as is. Otherwise the operand is parsed as
+/// an expression and pushed as an immediate.
+OperandMatchResultTy
+MipsAsmParser::parseJumpTarget(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "parseJumpTarget\n");
+
+  SMLoc StartLoc = getLexer().getLoc();
+
+  // Registers are a valid target and have priority over symbols.
+  OperandMatchResultTy RegResult = parseAnyRegister(Operands);
+  if (RegResult != MatchOperand_NoMatch)
+    return RegResult;
+
+  // Integers and expressions are acceptable
+  const MCExpr *Target = nullptr;
+  // We have no way of knowing if a symbol was consumed so we must ParseFail
+  if (Parser.parseExpression(Target))
+    return MatchOperand_ParseFail;
+
+  SMLoc EndLoc = getLexer().getLoc();
+  Operands.push_back(MipsOperand::CreateImm(Target, StartLoc, EndLoc, *this));
+  return MatchOperand_Success;
+}
+
+/// Parses a constant expression and pushes its negation as an immediate
+/// operand.
+///
+/// \returns MatchOperand_NoMatch (nothing consumed) if the operand starts with
+///          '$', since it may be a register. Returns MatchOperand_ParseFail if
+///          the expression cannot be parsed or is not a constant, and
+///          MatchOperand_Success otherwise.
+OperandMatchResultTy
+MipsAsmParser::parseInvNum(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  // If the first token is '$' we may have register operand.
+  if (Parser.getTok().is(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+
+  SMLoc S = Parser.getTok().getLoc();
+  const MCExpr *IdVal = nullptr;
+  if (getParser().parseExpression(IdVal))
+    return MatchOperand_ParseFail;
+
+  // A non-constant expression (e.g. a symbol) used to be guarded only by an
+  // assert, which is a null dereference in release builds. Tokens have already
+  // been consumed, so this must be a hard failure rather than NoMatch.
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal);
+  if (!MCE) {
+    Error(S, "expected constant expression");
+    return MatchOperand_ParseFail;
+  }
+
+  // Negate in unsigned arithmetic so that INT64_MIN does not trigger signed
+  // overflow; every other value gives the same result as "0 - Val".
+  int64_t Negated =
+      static_cast<int64_t>(0 - static_cast<uint64_t>(MCE->getValue()));
+
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(MipsOperand::CreateImm(
+      MCConstantExpr::create(Negated, getContext()), S, E, *this));
+  return MatchOperand_Success;
+}
+
+/// Parses a register list such as "$16, $17-$19, $31" and then the memory
+/// operand that follows it.
+///
+/// Registers are read one at a time. A register followed by '-' opens a
+/// range, and the next register closes it. Each register must be followed by
+/// ',' or '-'. The first register must be $16 or $31, and only s0-s7, fp and
+/// ra (or their 64-bit counterparts when isGP64bit()) are accepted. The
+/// collected list is pushed as a single register-list operand, after which
+/// parseMemOperand() is invoked on the remaining tokens.
+///
+/// \returns MatchOperand_ParseFail if the operand does not start with '$' or
+///          the list is malformed, otherwise MatchOperand_Success.
+OperandMatchResultTy
+MipsAsmParser::parseRegisterList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SmallVector<unsigned, 10> Regs;
+  unsigned RegNo;
+  unsigned PrevReg = Mips::NoRegister;
+  bool RegRange = false;
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands;
+
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_ParseFail;
+
+  SMLoc S = Parser.getTok().getLoc();
+  while (parseAnyRegister(TmpOperands) == MatchOperand_Success) {
+    SMLoc E = getLexer().getLoc();
+    MipsOperand &Reg = static_cast<MipsOperand &>(*TmpOperands.back());
+    RegNo = isGP64bit() ? Reg.getGPR64Reg() : Reg.getGPR32Reg();
+    if (RegRange) {
+      // This register closes a "$a-$b" range. $ra is appended on its own;
+      // otherwise every register after PrevReg up to and including RegNo is
+      // appended, and each of them must be one of s0-s7.
+      if ((isGP64bit() && RegNo == Mips::RA_64) ||
+          (!isGP64bit() && RegNo == Mips::RA)) {
+        Regs.push_back(RegNo);
+      } else {
+        unsigned TmpReg = PrevReg + 1;
+        while (TmpReg <= RegNo) {
+          if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) ||
+              (((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) &&
+               isGP64bit())) {
+            Error(E, "invalid register operand");
+            return MatchOperand_ParseFail;
+          }
+
+          PrevReg = TmpReg;
+          Regs.push_back(TmpReg++);
+        }
+      }
+
+      RegRange = false;
+    } else {
+      if ((PrevReg == Mips::NoRegister) &&
+          ((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) ||
+           (!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) {
+        // The list has to start with $16 (s0) or $31 (ra).
+        Error(E, "$16 or $31 expected");
+        return MatchOperand_ParseFail;
+      } else if (!(((RegNo == Mips::FP || RegNo == Mips::RA ||
+                     (RegNo >= Mips::S0 && RegNo <= Mips::S7)) &&
+                    !isGP64bit()) ||
+                   ((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 ||
+                     (RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) &&
+                    isGP64bit()))) {
+        Error(E, "invalid register operand");
+        return MatchOperand_ParseFail;
+      } else if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) &&
+                 ((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) ||
+                  (RegNo != Mips::FP_64 && RegNo != Mips::RA_64 &&
+                   isGP64bit()))) {
+        // Apart from fp and ra, registers must follow each other directly.
+        Error(E, "consecutive register numbers expected");
+        return MatchOperand_ParseFail;
+      }
+
+      Regs.push_back(RegNo);
+    }
+
+    if (Parser.getTok().is(AsmToken::Minus))
+      RegRange = true;
+
+    // Every register must be followed by ',' or '-'. The previous condition
+    // required the token to be both at once, so it never fired and an
+    // arbitrary token was silently consumed below.
+    if (Parser.getTok().isNot(AsmToken::Minus) &&
+        Parser.getTok().isNot(AsmToken::Comma)) {
+      Error(E, "',' or '-' expected");
+      return MatchOperand_ParseFail;
+    }
+
+    Lex(); // Consume comma or minus
+    if (Parser.getTok().isNot(AsmToken::Dollar))
+      break;
+
+    PrevReg = RegNo;
+  }
+
+  SMLoc E = Parser.getTok().getLoc();
+  Operands.push_back(MipsOperand::CreateRegList(Regs, S, E, *this));
+  // NOTE(review): the result of parseMemOperand() is not inspected here, as in
+  // the original code.
+  parseMemOperand(Operands);
+  return MatchOperand_Success;
+}
+
+/// Parses a single register and replaces it on the operand list with a
+/// register-pair operand built from it.
+///
+/// \returns MatchOperand_ParseFail if no register could be parsed, otherwise
+///          MatchOperand_Success.
+OperandMatchResultTy
+MipsAsmParser::parseRegisterPair(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+
+  SMLoc StartLoc = Parser.getTok().getLoc();
+  OperandMatchResultTy RegResult = parseAnyRegister(Operands);
+  if (RegResult != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  SMLoc EndLoc = Parser.getTok().getLoc();
+
+  // Copy the parsed register before dropping it from the operand list.
+  MipsOperand FirstReg = static_cast<MipsOperand &>(*Operands.back());
+  Operands.pop_back();
+
+  Operands.push_back(
+      MipsOperand::CreateRegPair(FirstReg, StartLoc, EndLoc, *this));
+  return MatchOperand_Success;
+}
+
+/// Parses two comma-separated registers ("$a, $b") and pushes them as one
+/// register-list operand.
+///
+/// The registers are read as 64-bit GPRs when isGP64bit() and as 32-bit GPRs
+/// otherwise. The operand's end location is the location of the comma.
+///
+/// \returns MatchOperand_ParseFail if the operand does not start with '$',
+///          either register fails to parse, or the comma is missing;
+///          otherwise MatchOperand_Success.
+OperandMatchResultTy
+MipsAsmParser::parseMovePRegPair(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> Scratch;
+  SmallVector<unsigned, 10> PairRegs;
+
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_ParseFail;
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  // First register of the pair.
+  if (parseAnyRegister(Scratch) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  MipsOperand &First = static_cast<MipsOperand &>(*Scratch.back());
+  unsigned FirstNo = isGP64bit() ? First.getGPR64Reg() : First.getGPR32Reg();
+  PairRegs.push_back(FirstNo);
+
+  SMLoc E = Parser.getTok().getLoc();
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(E, "',' expected");
+    return MatchOperand_ParseFail;
+  }
+
+  // Remove comma.
+  Parser.Lex();
+
+  // Second register of the pair.
+  if (parseAnyRegister(Scratch) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  MipsOperand &Second = static_cast<MipsOperand &>(*Scratch.back());
+  unsigned SecondNo =
+      isGP64bit() ? Second.getGPR64Reg() : Second.getGPR32Reg();
+  PairRegs.push_back(SecondNo);
+
+  Operands.push_back(MipsOperand::CreateRegList(PairRegs, S, E, *this));
+
+  return MatchOperand_Success;
+}
+
+/// Sometimes (e.g. in loads/stores) an operand is followed immediately by a
+/// parenthesised suffix:
+///   ::= '(', operand, ')'
+/// Handle it before we iterate so we don't get tripped up by the lack of a
+/// comma.
+///
+/// If the current token is '(', this pushes a "(" token operand, parses one
+/// operand, requires ')', and pushes a ")" token operand. If the current token
+/// is not '(', nothing happens.
+///
+/// \returns true if an error was reported, false otherwise.
+bool MipsAsmParser::parseParenSuffix(StringRef Name, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+
+  // No suffix present: nothing to do.
+  if (getLexer().isNot(AsmToken::LParen))
+    return false;
+
+  Operands.push_back(
+      MipsOperand::CreateToken("(", getLexer().getLoc(), *this));
+  Parser.Lex();
+
+  if (parseOperand(Operands, Name))
+    return Error(getLexer().getLoc(), "unexpected token in argument list");
+
+  if (Parser.getTok().isNot(AsmToken::RParen))
+    return Error(getLexer().getLoc(), "unexpected token, expected ')'");
+
+  Operands.push_back(
+      MipsOperand::CreateToken(")", getLexer().getLoc(), *this));
+  Parser.Lex();
+  return false;
+}
+
+/// Sometimes (e.g. in MSA) an operand is followed immediately by one of these
+/// bracketed suffixes:
+///   ::= '[', register, ']'
+///   ::= '[', integer, ']'
+/// Handle it before we iterate so we don't get tripped up by the lack of a
+/// comma.
+///
+/// If the current token is '[', this pushes a "[" token operand, parses one
+/// operand, requires ']', and pushes a "]" token operand. If the current token
+/// is not '[', nothing happens.
+///
+/// \returns true if an error was reported, false otherwise.
+bool MipsAsmParser::parseBracketSuffix(StringRef Name,
+                                       OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+
+  // No suffix present: nothing to do.
+  if (getLexer().isNot(AsmToken::LBrac))
+    return false;
+
+  Operands.push_back(
+      MipsOperand::CreateToken("[", getLexer().getLoc(), *this));
+  Parser.Lex();
+
+  if (parseOperand(Operands, Name))
+    return Error(getLexer().getLoc(), "unexpected token in argument list");
+
+  if (Parser.getTok().isNot(AsmToken::RBrac))
+    return Error(getLexer().getLoc(), "unexpected token, expected ']'");
+
+  Operands.push_back(
+      MipsOperand::CreateToken("]", getLexer().getLoc(), *this));
+  Parser.Lex();
+  return false;
+}
+
+/// Parses one instruction statement into \p Operands.
+///
+/// The mnemonic is validated and pushed as the first (token) operand. The
+/// comma-separated operands are then parsed one by one, each optionally
+/// followed by a '[...]' suffix, or by a '(...)' suffix on every operand after
+/// the first. The statement must end with EndOfStatement, which is consumed.
+///
+/// \returns true if an error was reported, false on success.
+bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                     SMLoc NameLoc, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "ParseInstruction\n");
+
+  // We have reached first instruction, module directive are now forbidden.
+  getTargetStreamer().forbidModuleDirective();
+
+  // Check if we have valid mnemonic
+  if (!mnemonicIsValid(Name, 0))
+    return Error(NameLoc, "unknown instruction");
+
+  // First operand in MCInst is instruction mnemonic.
+  Operands.push_back(MipsOperand::CreateToken(Name, NameLoc, *this));
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Name))
+      return Error(getLexer().getLoc(), "unexpected token in argument list");
+
+    // AFAIK, parenthesis suffixes are never on the first operand
+    if (getLexer().is(AsmToken::LBrac) && parseBracketSuffix(Name, Operands))
+      return true;
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma.
+
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Name))
+        return Error(getLexer().getLoc(), "unexpected token in argument list");
+
+      // Parse bracket and parenthesis suffixes before we iterate
+      const bool SuffixFailed =
+          getLexer().is(AsmToken::LBrac)
+              ? parseBracketSuffix(Name, Operands)
+              : (getLexer().is(AsmToken::LParen) &&
+                 parseParenSuffix(Name, Operands));
+      if (SuffixFailed)
+        return true;
+    }
+  }
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  return Error(getLexer().getLoc(), "unexpected token in argument list");
+}
+
+// FIXME: Given that these have the same name, these should both be
+// consistent on affecting the Parser.
+
+/// Reports \p ErrorMsg at the lexer's current location and returns the result
+/// of Error().
+bool MipsAsmParser::reportParseError(Twine ErrorMsg) {
+  return Error(getLexer().getLoc(), ErrorMsg);
+}
+
+/// Reports \p ErrorMsg at the explicit location \p Loc and returns the result
+/// of Error().
+bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) {
+  const bool Reported = Error(Loc, ErrorMsg);
+  return Reported;
+}
+
+/// Handles ".set noat".
+///
+/// The $at register index is set to 0 before the rest of the line is checked.
+/// If anything other than end of statement follows "noat", an error is
+/// reported and the directive is not emitted.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetNoAtDirective() {
+  // Line should look like: ".set noat".
+  MCAsmParser &Parser = getParser();
+
+  // Set the $at register to $0.
+  AssemblerOptions.back()->setATRegIndex(0);
+
+  Parser.Lex(); // Eat "noat".
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    getTargetStreamer().emitDirectiveSetNoAt();
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set at" and ".set at=$reg".
+///
+/// The plain form selects register 1 as $at. The "=$reg" form accepts a
+/// register name or number and selects it if setATRegIndex() accepts it.
+/// Errors are reported through reportParseError().
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetAtDirective() {
+  // Line can be: ".set at", which sets $at to $1
+  // or ".set at=$reg", which sets $at to $reg.
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "at".
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    // No register was specified, so we set $at to $1.
+    AssemblerOptions.back()->setATRegIndex(1);
+
+    getTargetStreamer().emitDirectiveSetAt();
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::Equal)) {
+    reportParseError("unexpected token, expected equals sign");
+    return false;
+  }
+  Parser.Lex(); // Eat "=".
+
+  if (getLexer().isNot(AsmToken::Dollar)) {
+    // Distinguish "nothing after '='" from "something that is not a '$'".
+    if (getLexer().is(AsmToken::EndOfStatement))
+      reportParseError("no register specified");
+    else
+      reportParseError("unexpected token, expected dollar sign '$'");
+    return false;
+  }
+  Parser.Lex(); // Eat "$".
+
+  // Find out what "reg" is.
+  unsigned AtRegNo;
+  const AsmToken &RegTok = Parser.getTok();
+  if (RegTok.is(AsmToken::Integer)) {
+    AtRegNo = RegTok.getIntVal();
+  } else if (RegTok.is(AsmToken::Identifier)) {
+    AtRegNo = matchCPURegisterName(RegTok.getIdentifier());
+  } else {
+    reportParseError("unexpected token, expected identifier or integer");
+    return false;
+  }
+
+  // Check if $reg is a valid register. If it is, set $at to $reg.
+  const bool Accepted = AssemblerOptions.back()->setATRegIndex(AtRegNo);
+  if (!Accepted) {
+    reportParseError("invalid register");
+    return false;
+  }
+  Parser.Lex(); // Eat "reg".
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    getTargetStreamer().emitDirectiveSetAtWithArg(AtRegNo);
+
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set reorder": enables the reorder option on the current
+/// assembler options and emits the directive. Reports an error if extra
+/// tokens follow the keyword.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetReorderDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the directive keyword.
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    AssemblerOptions.back()->setReorder();
+    getTargetStreamer().emitDirectiveSetReorder();
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set noreorder": disables the reorder option on the current
+/// assembler options and emits the directive. Reports an error if extra
+/// tokens follow the keyword.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetNoReorderDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the directive keyword.
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    AssemblerOptions.back()->setNoReorder();
+    getTargetStreamer().emitDirectiveSetNoReorder();
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set macro": enables the macro option on the current assembler
+/// options and emits the directive. Reports an error if extra tokens follow
+/// the keyword.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetMacroDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the directive keyword.
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    AssemblerOptions.back()->setMacro();
+    getTargetStreamer().emitDirectiveSetMacro();
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set nomacro": disables the macro option on the current assembler
+/// options and emits the directive.
+///
+/// An error is reported if extra tokens follow the keyword, or if the reorder
+/// option is still enabled.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetNoMacroDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the directive keyword.
+
+  const char *Problem = nullptr;
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    // If this is not the end of the statement, report an error.
+    Problem = "unexpected token, expected end of statement";
+  else if (AssemblerOptions.back()->isReorder())
+    Problem = "`noreorder' must be set before `nomacro'";
+
+  if (Problem) {
+    reportParseError(Problem);
+    return false;
+  }
+
+  AssemblerOptions.back()->setNoMacro();
+  getTargetStreamer().emitDirectiveSetNoMacro();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+/// Handles ".set msa": turns the MSA feature on and emits the directive.
+///
+/// \returns the result of reportParseError() if extra tokens follow the
+///          keyword, false otherwise. The end-of-statement token is not
+///          consumed here.
+bool MipsAsmParser::parseSetMsaDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the directive keyword.
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    setFeatureBits(Mips::FeatureMSA, "msa");
+    getTargetStreamer().emitDirectiveSetMsa();
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  return reportParseError("unexpected token, expected end of statement");
+}
+
+/// Handles ".set nomsa": turns the MSA feature off and emits the directive.
+///
+/// \returns the result of reportParseError() if extra tokens follow the
+///          keyword, false otherwise. The end-of-statement token is not
+///          consumed here.
+bool MipsAsmParser::parseSetNoMsaDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the directive keyword.
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    clearFeatureBits(Mips::FeatureMSA, "msa");
+    getTargetStreamer().emitDirectiveSetNoMsa();
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  return reportParseError("unexpected token, expected end of statement");
+}
+
+/// Handles ".set nodsp": turns the DSP feature off and emits the directive.
+/// Reports an error if extra tokens follow the keyword. The end-of-statement
+/// token is not consumed here.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetNoDspDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "nodsp".
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    clearFeatureBits(Mips::FeatureDSP, "dsp");
+    getTargetStreamer().emitDirectiveSetNoDsp();
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set mips16": turns the Mips16 feature on, emits the directive
+/// and consumes the end of statement. Reports an error if extra tokens follow
+/// the keyword.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetMips16Directive() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "mips16".
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    setFeatureBits(Mips::FeatureMips16, "mips16");
+    getTargetStreamer().emitDirectiveSetMips16();
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set nomips16": turns the Mips16 feature off, emits the directive
+/// and consumes the end of statement. Reports an error if extra tokens follow
+/// the keyword.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetNoMips16Directive() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "nomips16".
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    clearFeatureBits(Mips::FeatureMips16, "mips16");
+    getTargetStreamer().emitDirectiveSetNoMips16();
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set fp=<value>".
+///
+/// Requires '=' after "fp", delegates parsing of the value to
+/// parseFpABIValue(), requires end of statement, then emits the directive.
+/// Errors are reported through reportParseError() or by parseFpABIValue().
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetFpDirective() {
+  MCAsmParser &Parser = getParser();
+  MipsABIFlagsSection::FpABIKind FpAbiVal;
+  // Line can be: .set fp=32
+  //              .set fp=xx
+  //              .set fp=64
+  Parser.Lex(); // Eat fp token
+
+  // Inspect the current token in place; the original copied it into a local
+  // that was reassigned after the '=' and never read again.
+  if (Parser.getTok().isNot(AsmToken::Equal)) {
+    reportParseError("unexpected token, expected equals sign '='");
+    return false;
+  }
+  Parser.Lex(); // Eat '=' token.
+
+  if (!parseFpABIValue(FpAbiVal, ".set"))
+    return false;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+  getTargetStreamer().emitDirectiveSetFp(FpAbiVal);
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+/// Handles ".set oddspreg": clears the NoOddSPReg feature and emits the
+/// directive. Reports an error if extra tokens follow the keyword. The
+/// end-of-statement token is not consumed here.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetOddSPRegDirective() {
+  MCAsmParser &Parser = getParser();
+
+  Parser.Lex(); // Eat "oddspreg".
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+    getTargetStreamer().emitDirectiveSetOddSPReg();
+    return false;
+  }
+
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set nooddspreg": sets the NoOddSPReg feature and emits the
+/// directive. Reports an error if extra tokens follow the keyword. The
+/// end-of-statement token is not consumed here.
+///
+/// \returns false in all cases.
+bool MipsAsmParser::parseSetNoOddSPRegDirective() {
+  MCAsmParser &Parser = getParser();
+
+  Parser.Lex(); // Eat "nooddspreg".
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+    getTargetStreamer().emitDirectiveSetNoOddSPReg();
+    return false;
+  }
+
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".set pop": discards the top entry of the assembler options stack
+/// and restores the feature set of the entry beneath it.
+///
+/// \returns the result of reportParseError() if extra tokens follow the
+///          keyword or the stack holds exactly two entries; false after a
+///          successful pop.
+bool MipsAsmParser::parseSetPopDirective() {
+  MCAsmParser &Parser = getParser();
+  // Remember where the directive keyword was for the "no push" diagnostic.
+  SMLoc DirectiveLoc = getLexer().getLoc();
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  // Always keep an element on the options "stack" to prevent the user
+  // from changing the initial options. This is how we remember them.
+  const bool NothingPushed = AssemblerOptions.size() == 2;
+  if (NothingPushed)
+    return reportParseError(DirectiveLoc, ".set pop with no .set push");
+
+  MCSubtargetInfo &STI = copySTI();
+  AssemblerOptions.pop_back();
+
+  // Re-apply the features of the options that are now on top.
+  setAvailableFeatures(
+      ComputeAvailableFeatures(AssemblerOptions.back()->getFeatures()));
+  STI.setFeatureBits(AssemblerOptions.back()->getFeatures());
+
+  getTargetStreamer().emitDirectiveSetPop();
+  return false;
+}
+
+/// Handles ".set push": pushes a copy of the current assembler options onto
+/// the options stack and emits the directive.
+///
+/// \returns the result of reportParseError() if extra tokens follow the
+///          keyword, false otherwise.
+bool MipsAsmParser::parseSetPushDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    // Create a copy of the current assembler options environment and push it.
+    MipsAssemblerOptions *Current = AssemblerOptions.back().get();
+    AssemblerOptions.push_back(make_unique<MipsAssemblerOptions>(Current));
+
+    getTargetStreamer().emitDirectiveSetPush();
+    return false;
+  }
+
+  return reportParseError("unexpected token, expected end of statement");
+}
+
+/// Handles ".set softfloat": sets the SoftFloat feature and emits the
+/// directive.
+///
+/// \returns the result of reportParseError() if extra tokens follow the
+///          keyword, false otherwise.
+bool MipsAsmParser::parseSetSoftFloatDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    setFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+    getTargetStreamer().emitDirectiveSetSoftFloat();
+    return false;
+  }
+
+  return reportParseError("unexpected token, expected end of statement");
+}
+
+/// Handles ".set hardfloat": clears the SoftFloat feature and emits the
+/// directive.
+///
+/// \returns the result of reportParseError() if extra tokens follow the
+///          keyword, false otherwise.
+bool MipsAsmParser::parseSetHardFloatDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    clearFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+    getTargetStreamer().emitDirectiveSetHardFloat();
+    return false;
+  }
+
+  return reportParseError("unexpected token, expected end of statement");
+}
+
+/// Handles the assignment form ".set name, expression".
+///
+/// Parses an identifier, a comma and an expression, then binds the expression
+/// as the variable value of the symbol called \c name, creating the symbol if
+/// necessary.
+///
+/// \returns the result of reportParseError() on any syntax error, false on
+///          success.
+bool MipsAsmParser::parseSetAssignment() {
+  StringRef Name;
+  const MCExpr *Value = nullptr;
+  MCAsmParser &Parser = getParser();
+
+  // Bail out on a missing identifier. The original reported the error but
+  // fell through and carried on parsing with an empty name.
+  if (Parser.parseIdentifier(Name))
+    return reportParseError("expected identifier after .set");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return reportParseError("unexpected token, expected comma");
+  Lex(); // Eat comma
+
+  if (Parser.parseExpression(Value))
+    return reportParseError("expected valid expression after comma");
+
+  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+  Sym->setVariableValue(Value);
+
+  return false;
+}
+
+/// Handles ".set mips0": restores the feature set stored in the first entry
+/// of the assembler options stack and emits the directive.
+///
+/// \returns the result of reportParseError() if extra tokens follow the
+///          keyword, false otherwise.
+bool MipsAsmParser::parseSetMips0Directive() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    // Reset assembler options to their initial values.
+    MCSubtargetInfo &STI = copySTI();
+    setAvailableFeatures(
+        ComputeAvailableFeatures(AssemblerOptions.front()->getFeatures()));
+    STI.setFeatureBits(AssemblerOptions.front()->getFeatures());
+    AssemblerOptions.back()->setFeatures(
+        AssemblerOptions.front()->getFeatures());
+
+    getTargetStreamer().emitDirectiveSetMips0();
+    return false;
+  }
+
+  return reportParseError("unexpected token, expected end of statement");
+}
+
+/// Handles ".set arch=<name>".
+///
+/// The architecture name is mapped to a feature name ("octeon" maps to
+/// "cnmips", "r4000" to "mips3", and the mipsN names map to themselves). The
+/// feature name is passed to selectArch(), and the directive is emitted with
+/// the architecture name as written.
+///
+/// \returns the result of reportParseError() on a syntax error or an
+///          unsupported architecture, false on success.
+bool MipsAsmParser::parseSetArchDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the directive keyword.
+  if (getLexer().isNot(AsmToken::Equal))
+    return reportParseError("unexpected token, expected equals sign");
+
+  Parser.Lex(); // Eat the '='.
+  StringRef RequestedArch;
+  if (Parser.parseIdentifier(RequestedArch))
+    return reportParseError("expected arch identifier");
+
+  // An empty result marks an architecture that is not supported.
+  StringRef FeatureName =
+      StringSwitch<StringRef>(RequestedArch)
+          .Case("mips1", "mips1")
+          .Case("mips2", "mips2")
+          .Case("mips3", "mips3")
+          .Case("mips4", "mips4")
+          .Case("mips5", "mips5")
+          .Case("mips32", "mips32")
+          .Case("mips32r2", "mips32r2")
+          .Case("mips32r3", "mips32r3")
+          .Case("mips32r5", "mips32r5")
+          .Case("mips32r6", "mips32r6")
+          .Case("mips64", "mips64")
+          .Case("mips64r2", "mips64r2")
+          .Case("mips64r3", "mips64r3")
+          .Case("mips64r5", "mips64r5")
+          .Case("mips64r6", "mips64r6")
+          .Case("octeon", "cnmips")
+          .Case("r4000", "mips3") // This is an implementation of Mips3.
+          .Default("");
+
+  if (!FeatureName.empty()) {
+    selectArch(FeatureName);
+    getTargetStreamer().emitDirectiveSetArch(RequestedArch);
+    return false;
+  }
+
+  return reportParseError("unsupported architecture");
+}
+
+/// Handles the ".set <feature>" directives that either switch on a single
+/// feature bit (DSP, microMIPS) or select an ISA revision.
+///
+/// \param Feature one of the Mips::Feature* values listed in the switch below;
+///                any other value hits llvm_unreachable.
+/// \returns the result of reportParseError() if extra tokens follow the
+///          keyword, false otherwise.
+bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the directive keyword.
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  switch (Feature) {
+  // Plain feature toggles.
+  case Mips::FeatureDSP:
+    setFeatureBits(Mips::FeatureDSP, "dsp");
+    getTargetStreamer().emitDirectiveSetDsp();
+    break;
+  case Mips::FeatureMicroMips:
+    setFeatureBits(Mips::FeatureMicroMips, "micromips");
+    getTargetStreamer().emitDirectiveSetMicroMips();
+    break;
+
+  // ISA selection: switch architecture, then emit the matching directive.
+  case Mips::FeatureMips1:
+    selectArch("mips1");
+    getTargetStreamer().emitDirectiveSetMips1();
+    break;
+  case Mips::FeatureMips2:
+    selectArch("mips2");
+    getTargetStreamer().emitDirectiveSetMips2();
+    break;
+  case Mips::FeatureMips3:
+    selectArch("mips3");
+    getTargetStreamer().emitDirectiveSetMips3();
+    break;
+  case Mips::FeatureMips4:
+    selectArch("mips4");
+    getTargetStreamer().emitDirectiveSetMips4();
+    break;
+  case Mips::FeatureMips5:
+    selectArch("mips5");
+    getTargetStreamer().emitDirectiveSetMips5();
+    break;
+  case Mips::FeatureMips32:
+    selectArch("mips32");
+    getTargetStreamer().emitDirectiveSetMips32();
+    break;
+  case Mips::FeatureMips32r2:
+    selectArch("mips32r2");
+    getTargetStreamer().emitDirectiveSetMips32R2();
+    break;
+  case Mips::FeatureMips32r3:
+    selectArch("mips32r3");
+    getTargetStreamer().emitDirectiveSetMips32R3();
+    break;
+  case Mips::FeatureMips32r5:
+    selectArch("mips32r5");
+    getTargetStreamer().emitDirectiveSetMips32R5();
+    break;
+  case Mips::FeatureMips32r6:
+    selectArch("mips32r6");
+    getTargetStreamer().emitDirectiveSetMips32R6();
+    break;
+  case Mips::FeatureMips64:
+    selectArch("mips64");
+    getTargetStreamer().emitDirectiveSetMips64();
+    break;
+  case Mips::FeatureMips64r2:
+    selectArch("mips64r2");
+    getTargetStreamer().emitDirectiveSetMips64R2();
+    break;
+  case Mips::FeatureMips64r3:
+    selectArch("mips64r3");
+    getTargetStreamer().emitDirectiveSetMips64R3();
+    break;
+  case Mips::FeatureMips64r5:
+    selectArch("mips64r5");
+    getTargetStreamer().emitDirectiveSetMips64R5();
+    break;
+  case Mips::FeatureMips64r6:
+    selectArch("mips64r6");
+    getTargetStreamer().emitDirectiveSetMips64R6();
+    break;
+
+  default:
+    llvm_unreachable("Unimplemented feature");
+  }
+  return false;
+}
+
+/// Consumes a comma if it is the current token.
+///
+/// \param ErrorStr diagnostic to report when the current token is not a comma.
+/// \returns true if a comma was consumed; false if it was missing, in which
+///          case \p ErrorStr has been reported and nothing was consumed.
+bool MipsAsmParser::eatComma(StringRef ErrorStr) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::Comma)) {
+    // Callers test "!eatComma(...)" to detect failure. Returning Error()'s
+    // result here, as the original did, made both paths return true.
+    Error(getLexer().getLoc(), ErrorStr);
+    return false;
+  }
+
+  Parser.Lex(); // Eat the comma.
+  return true;
+}
+
+// Used to determine if .cpload, .cprestore, and .cpsetup have any effect.
+// In this class, it is only used for .cprestore.
+// FIXME: Only keep track of IsPicEnabled in one place, instead of in both
+// MipsTargetELFStreamer and MipsAsmParser.
+//
+// Returns true when PIC mode is on and the ABI is neither N32 nor N64.
+bool MipsAsmParser::isPicAndNotNxxAbi() {
+  if (!inPicMode())
+    return false;
+  return !isABI_N32() && !isABI_N64();
+}
+
+/// Handles ".cpload $reg".
+///
+/// Warns if the reorder option is enabled, rejects the directive in Mips16
+/// mode, parses a GPR operand, requires end of statement, and emits the
+/// directive with that register. Errors go through reportParseError().
+///
+/// \param Loc location used for the reorder warning.
+/// \returns false in all cases.
+bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) {
+  if (AssemblerOptions.back()->isReorder())
+    Warning(Loc, ".cpload should be inside a noreorder section");
+
+  if (inMips16Mode()) {
+    reportParseError(".cpload is not supported in Mips16 mode");
+    return false;
+  }
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Parsed;
+  OperandMatchResultTy RegResult = parseAnyRegister(Parsed);
+  const bool NoRegister = RegResult == MatchOperand_ParseFail ||
+                          RegResult == MatchOperand_NoMatch;
+  if (NoRegister) {
+    reportParseError("expected register containing function address");
+    return false;
+  }
+
+  MipsOperand &FuncAddrReg = static_cast<MipsOperand &>(*Parsed[0]);
+  if (!FuncAddrReg.isGPRAsmReg()) {
+    reportParseError(FuncAddrReg.getStartLoc(), "invalid register");
+    return false;
+  }
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    getTargetStreamer().emitDirectiveCpLoad(FuncAddrReg.getGPR32Reg());
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  reportParseError("unexpected token, expected end of statement");
+  return false;
+}
+
+/// Handles ".cprestore <offset>".
+///
+/// The offset must be an absolute expression. A negative offset produces a
+/// warning and clears IsCpRestoreSet; a non-negative one sets IsCpRestoreSet
+/// and is stored in CpRestoreOffset. The directive is then emitted through
+/// the target streamer.
+///
+/// \param Loc location used for the warning and passed on to the streamer.
+/// \returns true if the target streamer fails to emit the directive, false
+///          otherwise (including after a reported parse error).
+bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) {
+  MCAsmParser &Parser = getParser();
+
+  // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it
+  // is used in non-PIC mode.
+
+  if (inMips16Mode()) {
+    reportParseError(".cprestore is not supported in Mips16 mode");
+    return false;
+  }
+
+  // Get the stack offset value.
+  const MCExpr *OffsetExpr;
+  if (Parser.parseExpression(OffsetExpr)) {
+    reportParseError("expected stack offset value");
+    return false;
+  }
+
+  int64_t OffsetVal;
+  if (!OffsetExpr->evaluateAsAbsolute(OffsetVal)) {
+    reportParseError("stack offset is not an absolute expression");
+    return false;
+  }
+
+  if (OffsetVal >= 0) {
+    IsCpRestoreSet = true;
+    CpRestoreOffset = OffsetVal;
+  } else {
+    Warning(Loc, ".cprestore with negative stack offset has no effect");
+    IsCpRestoreSet = false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  const bool Emitted = getTargetStreamer().emitDirectiveCpRestore(
+      CpRestoreOffset, [&]() { return getATReg(Loc); }, Loc, STI);
+  if (!Emitted)
+    return true;
+
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+/// Parse the '.cpsetup $func_reg, $save_reg|offset, symbol' directive.
+///
+/// The second operand is either a general purpose register or an absolute
+/// stack offset. The parsed save location is stored in CpSaveLocation and
+/// CpSaveLocationIsRegister, and the directive is forwarded to the target
+/// streamer.
+///
+/// \returns true when eatComma fails; every other error is reported through
+/// reportParseError and false is returned.
+bool MipsAsmParser::parseDirectiveCPSetup() {
+  MCAsmParser &Parser = getParser();
+  unsigned FuncReg;
+  unsigned Save;
+  bool SaveIsReg = true;
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
+  OperandMatchResultTy ResTy = parseAnyRegister(TmpReg);
+  // Treat a hard parse failure like a missing register (as .cpload and .frame
+  // do) so that TmpReg[0] is never read when no operand was produced.
+  if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+    reportParseError("expected register containing function address");
+    return false;
+  }
+
+  MipsOperand &FuncRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+  if (!FuncRegOpnd.isGPRAsmReg()) {
+    reportParseError(FuncRegOpnd.getStartLoc(), "invalid register");
+    return false;
+  }
+
+  FuncReg = FuncRegOpnd.getGPR32Reg();
+  TmpReg.clear();
+
+  if (!eatComma("unexpected token, expected comma"))
+    return true;
+
+  SMLoc SaveLoc = getLexer().getLoc();
+  ResTy = parseAnyRegister(TmpReg);
+  if (ResTy == MatchOperand_ParseFail) {
+    // Neither a usable register nor a state from which an offset can be
+    // parsed; do not fall through to TmpReg[0].
+    reportParseError(SaveLoc, "expected save register or stack offset");
+    return false;
+  }
+
+  if (ResTy == MatchOperand_NoMatch) {
+    // Not a register: the save location is an absolute stack offset.
+    const MCExpr *OffsetExpr;
+    int64_t OffsetVal;
+    SMLoc ExprLoc = getLexer().getLoc();
+
+    if (Parser.parseExpression(OffsetExpr) ||
+        !OffsetExpr->evaluateAsAbsolute(OffsetVal)) {
+      reportParseError(ExprLoc, "expected save register or stack offset");
+      return false;
+    }
+
+    Save = OffsetVal;
+    SaveIsReg = false;
+  } else {
+    MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+    if (!SaveOpnd.isGPRAsmReg()) {
+      reportParseError(SaveOpnd.getStartLoc(), "invalid register");
+      return false;
+    }
+    Save = SaveOpnd.getGPR32Reg();
+  }
+
+  if (!eatComma("unexpected token, expected comma"))
+    return true;
+
+  // The final operand must be a plain symbol reference.
+  const MCExpr *Expr;
+  if (Parser.parseExpression(Expr)) {
+    reportParseError("expected expression");
+    return false;
+  }
+
+  if (Expr->getKind() != MCExpr::SymbolRef) {
+    reportParseError("expected symbol");
+    return false;
+  }
+  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
+
+  // Remember where $gp was saved so that .cpreturn can refer to it.
+  CpSaveLocation = Save;
+  CpSaveLocationIsRegister = SaveIsReg;
+
+  getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, Ref->getSymbol(),
+                                           SaveIsReg);
+  return false;
+}
+
+/// Handle '.cpreturn' by passing the save location recorded in CpSaveLocation
+/// and CpSaveLocationIsRegister to the target streamer. Always returns false.
+bool MipsAsmParser::parseDirectiveCPReturn() {
+  auto &Streamer = getTargetStreamer();
+  Streamer.emitDirectiveCpreturn(CpSaveLocation, CpSaveLocationIsRegister);
+  return false;
+}
+
+/// Parse the '.nan' directive, which accepts exactly the options '2008' and
+/// 'legacy'. Anything else (including no option) is reported as an error.
+/// Always returns false.
+bool MipsAsmParser::parseDirectiveNaN() {
+  MCAsmParser &Parser = getParser();
+
+  if (!getLexer().is(AsmToken::EndOfStatement)) {
+    const StringRef Option = Parser.getTok().getString();
+    const bool Is2008 = Option == "2008";
+    const bool IsLegacy = !Is2008 && Option == "legacy";
+
+    if (Is2008 || IsLegacy) {
+      Parser.Lex();
+      if (Is2008)
+        getTargetStreamer().emitDirectiveNaN2008();
+      else
+        getTargetStreamer().emitDirectiveNaNLegacy();
+      return false;
+    }
+  }
+
+  // If we don't recognize the option passed to the .nan
+  // directive (e.g. no option or unknown option), emit an error.
+  reportParseError("invalid option in .nan directive");
+  return false;
+}
+
+/// Parse the operand of a '.set' directive and dispatch on the option name.
+/// A name that matches none of the known options is handed to
+/// parseSetAssignment, after which false is returned.
+bool MipsAsmParser::parseDirectiveSet() {
+  MCAsmParser &Parser = getParser();
+  // The option name is the current (not yet consumed) token.
+  const StringRef Option = Parser.getTok().getString();
+
+  if (Option == "noat")
+    return parseSetNoAtDirective();
+  if (Option == "at")
+    return parseSetAtDirective();
+  if (Option == "arch")
+    return parseSetArchDirective();
+  if (Option == "fp")
+    return parseSetFpDirective();
+  if (Option == "oddspreg")
+    return parseSetOddSPRegDirective();
+  if (Option == "nooddspreg")
+    return parseSetNoOddSPRegDirective();
+  if (Option == "pop")
+    return parseSetPopDirective();
+  if (Option == "push")
+    return parseSetPushDirective();
+  if (Option == "reorder")
+    return parseSetReorderDirective();
+  if (Option == "noreorder")
+    return parseSetNoReorderDirective();
+  if (Option == "macro")
+    return parseSetMacroDirective();
+  if (Option == "nomacro")
+    return parseSetNoMacroDirective();
+  if (Option == "mips16")
+    return parseSetMips16Directive();
+  if (Option == "nomips16")
+    return parseSetNoMips16Directive();
+  if (Option == "nomicromips") {
+    clearFeatureBits(Mips::FeatureMicroMips, "micromips");
+    getTargetStreamer().emitDirectiveSetNoMicroMips();
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+  if (Option == "micromips")
+    return parseSetFeature(Mips::FeatureMicroMips);
+  if (Option == "mips0")
+    return parseSetMips0Directive();
+
+  // ISA revision selectors.
+  if (Option == "mips1")
+    return parseSetFeature(Mips::FeatureMips1);
+  if (Option == "mips2")
+    return parseSetFeature(Mips::FeatureMips2);
+  if (Option == "mips3")
+    return parseSetFeature(Mips::FeatureMips3);
+  if (Option == "mips4")
+    return parseSetFeature(Mips::FeatureMips4);
+  if (Option == "mips5")
+    return parseSetFeature(Mips::FeatureMips5);
+  if (Option == "mips32")
+    return parseSetFeature(Mips::FeatureMips32);
+  if (Option == "mips32r2")
+    return parseSetFeature(Mips::FeatureMips32r2);
+  if (Option == "mips32r3")
+    return parseSetFeature(Mips::FeatureMips32r3);
+  if (Option == "mips32r5")
+    return parseSetFeature(Mips::FeatureMips32r5);
+  if (Option == "mips32r6")
+    return parseSetFeature(Mips::FeatureMips32r6);
+  if (Option == "mips64")
+    return parseSetFeature(Mips::FeatureMips64);
+  if (Option == "mips64r2")
+    return parseSetFeature(Mips::FeatureMips64r2);
+  if (Option == "mips64r3")
+    return parseSetFeature(Mips::FeatureMips64r3);
+  if (Option == "mips64r5")
+    return parseSetFeature(Mips::FeatureMips64r5);
+  if (Option == "mips64r6")
+    return parseSetFeature(Mips::FeatureMips64r6);
+
+  // ASE and floating point selectors.
+  if (Option == "dsp")
+    return parseSetFeature(Mips::FeatureDSP);
+  if (Option == "nodsp")
+    return parseSetNoDspDirective();
+  if (Option == "msa")
+    return parseSetMsaDirective();
+  if (Option == "nomsa")
+    return parseSetNoMsaDirective();
+  if (Option == "softfloat")
+    return parseSetSoftFloatDirective();
+  if (Option == "hardfloat")
+    return parseSetHardFloatDirective();
+
+  // It is just an identifier, look for an assignment.
+  parseSetAssignment();
+  return false;
+}
+
+/// parseDataDirective
+///  ::= .word [ expression (, expression)* ]
+///
+/// Emits each parsed expression as a value of \p Size bytes. \p L is the
+/// location used for the missing-comma diagnostic. Returns true on error.
+bool MipsAsmParser::parseDataDirective(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
+  // An empty operand list is allowed; otherwise at least one value is
+  // required, and another one after every comma.
+  bool ExpectValue = getLexer().isNot(AsmToken::EndOfStatement);
+  while (ExpectValue) {
+    const MCExpr *Value;
+    if (Parser.parseExpression(Value))
+      return true;
+
+    Parser.getStreamer().EmitValue(Value, Size);
+
+    if (getLexer().is(AsmToken::EndOfStatement)) {
+      ExpectValue = false;
+    } else if (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();
+    } else {
+      return Error(L, "unexpected token, expected comma");
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// parseDirectiveGpWord
+///  ::= .gpword local_sym
+///
+/// Returns true on error (unparsable expression or trailing tokens).
+bool MipsAsmParser::parseDirectiveGpWord() {
+  MCAsmParser &Parser = getParser();
+
+  // EmitGPRel32Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  const MCExpr *SymExpr;
+  if (Parser.parseExpression(SymExpr))
+    return true;
+  Parser.getStreamer().EmitGPRel32Value(SymExpr);
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.Lex(); // Eat EndOfStatement token.
+    return false;
+  }
+  return Error(getLexer().getLoc(),
+               "unexpected token, expected end of statement");
+}
+
+/// parseDirectiveGpDWord
+///  ::= .gpdword local_sym
+///
+/// Returns true on error (unparsable expression or trailing tokens).
+bool MipsAsmParser::parseDirectiveGpDWord() {
+  MCAsmParser &Parser = getParser();
+
+  // EmitGPRel64Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  const MCExpr *SymExpr;
+  if (Parser.parseExpression(SymExpr))
+    return true;
+  Parser.getStreamer().EmitGPRel64Value(SymExpr);
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.Lex(); // Eat EndOfStatement token.
+    return false;
+  }
+  return Error(getLexer().getLoc(),
+               "unexpected token, expected end of statement");
+}
+
+/// parseDirectiveDtpRelWord
+///  ::= .dtprelword tls_sym
+///
+/// Returns true on error (unparsable expression or trailing tokens).
+bool MipsAsmParser::parseDirectiveDtpRelWord() {
+  MCAsmParser &Parser = getParser();
+
+  // EmitDTPRel32Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  const MCExpr *SymExpr;
+  if (Parser.parseExpression(SymExpr))
+    return true;
+  Parser.getStreamer().EmitDTPRel32Value(SymExpr);
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.Lex(); // Eat EndOfStatement token.
+    return false;
+  }
+  return Error(getLexer().getLoc(),
+               "unexpected token, expected end of statement");
+}
+
+/// parseDirectiveDtpRelDWord
+///  ::= .dtpreldword tls_sym
+///
+/// Returns true on error (unparsable expression or trailing tokens).
+bool MipsAsmParser::parseDirectiveDtpRelDWord() {
+  MCAsmParser &Parser = getParser();
+
+  // EmitDTPRel64Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  const MCExpr *SymExpr;
+  if (Parser.parseExpression(SymExpr))
+    return true;
+  Parser.getStreamer().EmitDTPRel64Value(SymExpr);
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.Lex(); // Eat EndOfStatement token.
+    return false;
+  }
+  return Error(getLexer().getLoc(),
+               "unexpected token, expected end of statement");
+}
+
+/// parseDirectiveTpRelWord
+///  ::= .tprelword tls_sym
+///
+/// Returns true on error (unparsable expression or trailing tokens).
+bool MipsAsmParser::parseDirectiveTpRelWord() {
+  MCAsmParser &Parser = getParser();
+
+  // EmitTPRel32Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  const MCExpr *SymExpr;
+  if (Parser.parseExpression(SymExpr))
+    return true;
+  Parser.getStreamer().EmitTPRel32Value(SymExpr);
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.Lex(); // Eat EndOfStatement token.
+    return false;
+  }
+  return Error(getLexer().getLoc(),
+               "unexpected token, expected end of statement");
+}
+
+/// parseDirectiveTpRelDWord
+///  ::= .tpreldword tls_sym
+///
+/// Returns true on error (unparsable expression or trailing tokens).
+bool MipsAsmParser::parseDirectiveTpRelDWord() {
+  MCAsmParser &Parser = getParser();
+
+  // EmitTPRel64Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  const MCExpr *SymExpr;
+  if (Parser.parseExpression(SymExpr))
+    return true;
+  Parser.getStreamer().EmitTPRel64Value(SymExpr);
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.Lex(); // Eat EndOfStatement token.
+    return false;
+  }
+  return Error(getLexer().getLoc(),
+               "unexpected token, expected end of statement");
+}
+
+/// Parse the '.option' directive. Only the identifiers 'pic0' and 'pic2' are
+/// acted upon: they update IsPicEnabled and are forwarded to the target
+/// streamer. Any other identifier produces a warning and the rest of the
+/// statement is skipped.
+bool MipsAsmParser::parseDirectiveOption() {
+  MCAsmParser &Parser = getParser();
+
+  // Get the option token; at the moment only identifiers are supported.
+  AsmToken OptionTok = Parser.getTok();
+  if (!OptionTok.is(AsmToken::Identifier))
+    return Error(Parser.getTok().getLoc(),
+                 "unexpected token, expected identifier");
+
+  const StringRef Option = OptionTok.getIdentifier();
+  const bool IsPic0 = Option == "pic0";
+  const bool IsPic2 = Option == "pic2";
+
+  if (!IsPic0 && !IsPic2) {
+    // Unknown option.
+    Warning(Parser.getTok().getLoc(),
+            "unknown option, expected 'pic0' or 'pic2'");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  // MipsAsmParser needs to know if the current PIC mode changes.
+  IsPicEnabled = IsPic2;
+
+  if (IsPic0)
+    getTargetStreamer().emitDirectiveOptionPic0();
+  else
+    getTargetStreamer().emitDirectiveOptionPic2();
+
+  Parser.Lex();
+  if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+    return Error(Parser.getTok().getLoc(),
+                 "unexpected token, expected end of statement");
+  return false;
+}
+
+/// parseInsnDirective
+///  ::= .insn
+///
+/// Takes no operands. Always returns false; trailing tokens are reported
+/// through reportParseError.
+bool MipsAsmParser::parseInsnDirective() {
+  const bool AtEndOfStatement = getLexer().is(AsmToken::EndOfStatement);
+  if (!AtEndOfStatement) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  // The actual label marking happens in
+  // MipsELFStreamer::createPendingLabelRelocs().
+  getTargetStreamer().emitDirectiveInsn();
+
+  getParser().Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseSSectionDirective
+///  ::= .sbss
+///  ::= .sdata
+///
+/// Switches the streamer to the ELF section named \p Section of type \p Type,
+/// flagged writable, allocatable and GP-relative. Always returns false.
+bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) {
+  // The directive takes no operands.
+  if (!getLexer().is(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  const unsigned Flags =
+      ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL;
+  MCSection *SmallDataSection =
+      getContext().getELFSection(Section, Type, Flags);
+
+  MCAsmParser &Parser = getParser();
+  Parser.getStreamer().SwitchSection(SmallDataSection);
+
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseDirectiveModule
+///  ::= .module oddspreg
+///  ::= .module nooddspreg
+///  ::= .module fp=value
+///  ::= .module softfloat
+///  ::= .module hardfloat
+///
+/// Returns true (via Error) for an unknown option or for 'nooddspreg' outside
+/// the O32 ABI; other problems are reported through reportParseError and
+/// false is returned.
+bool MipsAsmParser::parseDirectiveModule() {
+  MCAsmParser &Parser = getParser();
+  SMLoc L = getLexer().getLoc();
+
+  if (!getTargetStreamer().isModuleDirectiveAllowed()) {
+    // TODO : get a better message.
+    reportParseError(".module directive must appear before any code");
+    return false;
+  }
+
+  StringRef Option;
+  if (Parser.parseIdentifier(Option)) {
+    reportParseError("expected .module option identifier");
+    return false;
+  }
+
+  // 'fp' has its own '=value' syntax and is handled separately.
+  if (Option == "fp")
+    return parseDirectiveModuleFP();
+
+  const bool IsOddSPReg = Option == "oddspreg";
+  const bool IsNoOddSPReg = Option == "nooddspreg";
+  const bool IsSoftFloat = Option == "softfloat";
+  const bool IsHardFloat = Option == "hardfloat";
+
+  if (!IsOddSPReg && !IsNoOddSPReg && !IsSoftFloat && !IsHardFloat)
+    return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
+
+  if (IsNoOddSPReg && !isABI_O32())
+    return Error(L, "'.module nooddspreg' requires the O32 ABI");
+
+  // Update the module-level feature bit selected by the option.
+  if (IsOddSPReg)
+    clearModuleFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+  else if (IsNoOddSPReg)
+    setModuleFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+  else if (IsSoftFloat)
+    setModuleFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+  else
+    clearModuleFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+
+  // Synchronize the abiflags information with the FeatureBits information we
+  // changed above.
+  getTargetStreamer().updateABIInfo(*this);
+
+  // If printing assembly, use the recently updated abiflags information.
+  // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+  // emitted at the end).
+  if (IsOddSPReg || IsNoOddSPReg)
+    getTargetStreamer().emitDirectiveModuleOddSPReg();
+  else if (IsSoftFloat)
+    getTargetStreamer().emitDirectiveModuleSoftFloat();
+  else
+    getTargetStreamer().emitDirectiveModuleHardFloat();
+
+  // If this is not the end of the statement, report an error.
+  if (!getLexer().is(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  return false; // parseDirectiveModule has finished successfully.
+}
+
+/// parseDirectiveModuleFP
+///  ::= =32
+///  ::= =xx
+///  ::= =64
+///
+/// Parses the '=value' tail of '.module fp'. Always returns false; errors are
+/// reported through reportParseError or by parseFpABIValue.
+bool MipsAsmParser::parseDirectiveModuleFP() {
+  MCAsmParser &Parser = getParser();
+
+  if (!getLexer().is(AsmToken::Equal)) {
+    reportParseError("unexpected token, expected equals sign '='");
+    return false;
+  }
+  Parser.Lex(); // Eat '=' token.
+
+  // parseFpABIValue updates the module feature bits on success.
+  MipsABIFlagsSection::FpABIKind FpABI;
+  const bool ValueParsed = parseFpABIValue(FpABI, ".module");
+  if (!ValueParsed)
+    return false;
+
+  if (!getLexer().is(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  // Synchronize the abiflags information with the FeatureBits information we
+  // changed above.
+  getTargetStreamer().updateABIInfo(*this);
+
+  // If printing assembly, use the recently updated abiflags information.
+  // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+  // emitted at the end).
+  getTargetStreamer().emitDirectiveModuleFP();
+
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+/// Parse the value of an 'fp=' option ('xx', '32' or '64').
+///
+/// \param FpABI     Set to the selected FP ABI kind on success.
+/// \param Directive Name of the directive being parsed. When it is ".module"
+///                  the module-level feature bits are updated, otherwise the
+///                  current feature bits are. It is also used in diagnostics.
+///
+/// \returns true on success and false on failure. Note that this is the
+/// opposite of the convention used by the parseDirective* functions.
+bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
+                                    StringRef Directive) {
+  MCAsmParser &Parser = getParser();
+  MCAsmLexer &Lexer = getLexer();
+  bool ModuleLevelOptions = Directive == ".module";
+
+  if (Lexer.is(AsmToken::Identifier)) {
+    StringRef Value = Parser.getTok().getString();
+    Parser.Lex();
+
+    if (Value != "xx") {
+      reportParseError("unsupported value, expected 'xx', '32' or '64'");
+      return false;
+    }
+
+    if (!isABI_O32()) {
+      reportParseError("'" + Directive + " fp=xx' requires the O32 ABI");
+      return false;
+    }
+
+    FpABI = MipsABIFlagsSection::FpABIKind::XX;
+    if (ModuleLevelOptions) {
+      setModuleFeatureBits(Mips::FeatureFPXX, "fpxx");
+      clearModuleFeatureBits(Mips::FeatureFP64Bit, "fp64");
+    } else {
+      setFeatureBits(Mips::FeatureFPXX, "fpxx");
+      clearFeatureBits(Mips::FeatureFP64Bit, "fp64");
+    }
+    return true;
+  }
+
+  if (Lexer.is(AsmToken::Integer)) {
+    // Keep the full 64-bit token value: narrowing it to 'unsigned' would let
+    // out-of-range literals such as 4294967328 (2^32 + 32) pass as '32'.
+    int64_t Value = Parser.getTok().getIntVal();
+    Parser.Lex();
+
+    if (Value != 32 && Value != 64) {
+      reportParseError("unsupported value, expected 'xx', '32' or '64'");
+      return false;
+    }
+
+    if (Value == 32) {
+      if (!isABI_O32()) {
+        reportParseError("'" + Directive + " fp=32' requires the O32 ABI");
+        return false;
+      }
+
+      FpABI = MipsABIFlagsSection::FpABIKind::S32;
+      if (ModuleLevelOptions) {
+        clearModuleFeatureBits(Mips::FeatureFPXX, "fpxx");
+        clearModuleFeatureBits(Mips::FeatureFP64Bit, "fp64");
+      } else {
+        clearFeatureBits(Mips::FeatureFPXX, "fpxx");
+        clearFeatureBits(Mips::FeatureFP64Bit, "fp64");
+      }
+    } else {
+      FpABI = MipsABIFlagsSection::FpABIKind::S64;
+      if (ModuleLevelOptions) {
+        clearModuleFeatureBits(Mips::FeatureFPXX, "fpxx");
+        setModuleFeatureBits(Mips::FeatureFP64Bit, "fp64");
+      } else {
+        clearFeatureBits(Mips::FeatureFPXX, "fpxx");
+        setFeatureBits(Mips::FeatureFP64Bit, "fp64");
+      }
+    }
+
+    return true;
+  }
+
+  // Neither an identifier nor an integer: fail without consuming the token.
+  return false;
+}
+
+bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
+ // Dispatch a Mips-specific directive to its parser. This returns false if
+ // this function recognizes the directive, regardless of whether it
+ // successfully handles it or reports an error. Otherwise it returns true to
+ // give the generic parser a chance at recognizing it.
+
+ MCAsmParser &Parser = getParser();
+ StringRef IDVal = DirectiveID.getString();
+
+ if (IDVal == ".cpload") {
+ parseDirectiveCpLoad(DirectiveID.getLoc());
+ return false;
+ }
+ if (IDVal == ".cprestore") {
+ parseDirectiveCpRestore(DirectiveID.getLoc());
+ return false;
+ }
+ if (IDVal == ".dword") {
+ parseDataDirective(8, DirectiveID.getLoc());
+ return false;
+ }
+ if (IDVal == ".ent") {
+ StringRef SymbolName;
+
+ if (Parser.parseIdentifier(SymbolName)) {
+ reportParseError("expected identifier after .ent");
+ return false;
+ }
+
+ // There's an undocumented extension that allows an integer to
+ // follow the name of the procedure which AFAICS is ignored by GAS.
+ // Example: .ent foo,2
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if (getLexer().isNot(AsmToken::Comma)) {
+ // Even though we accept this undocumented extension for compatibility
+ // reasons, the additional integer argument does not actually change
+ // the behaviour of the '.ent' directive, so we would like to discourage
+ // its use. We do this by not referring to the extended version in
+ // error messages which are not directly related to its use.
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+ Parser.Lex(); // Eat the comma.
+ const MCExpr *DummyNumber;
+ int64_t DummyNumberVal;
+ // If the user was explicitly trying to use the extended version,
+ // we still give helpful extension-related error messages.
+ if (Parser.parseExpression(DummyNumber)) {
+ reportParseError("expected number after comma");
+ return false;
+ }
+ if (!DummyNumber->evaluateAsAbsolute(DummyNumberVal)) {
+ reportParseError("expected an absolute expression after comma");
+ return false;
+ }
+ }
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName);
+
+ getTargetStreamer().emitDirectiveEnt(*Sym);
+ CurrentFn = Sym;
+ IsCpRestoreSet = false;
+ return false;
+ }
+
+ if (IDVal == ".end") {
+ StringRef SymbolName;
+
+ if (Parser.parseIdentifier(SymbolName)) {
+ reportParseError("expected identifier after .end");
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ if (CurrentFn == nullptr) {
+ reportParseError(".end used without .ent");
+ return false;
+ }
+
+ if ((SymbolName != CurrentFn->getName())) {
+ reportParseError(".end symbol does not match .ent symbol");
+ return false;
+ }
+
+ getTargetStreamer().emitDirectiveEnd(SymbolName);
+ CurrentFn = nullptr;
+ IsCpRestoreSet = false;
+ return false;
+ }
+
+ if (IDVal == ".frame") {
+ // .frame $stack_reg, frame_size_in_bytes, $return_reg
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
+ OperandMatchResultTy ResTy = parseAnyRegister(TmpReg);
+ if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+ reportParseError("expected stack register");
+ return false;
+ }
+
+ MipsOperand &StackRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+ if (!StackRegOpnd.isGPRAsmReg()) {
+ reportParseError(StackRegOpnd.getStartLoc(),
+ "expected general purpose register");
+ return false;
+ }
+ unsigned StackReg = StackRegOpnd.getGPR32Reg();
+
+ if (Parser.getTok().is(AsmToken::Comma))
+ Parser.Lex();
+ else {
+ reportParseError("unexpected token, expected comma");
+ return false;
+ }
+
+ // Parse the frame size.
+ const MCExpr *FrameSize;
+ int64_t FrameSizeVal;
+
+ if (Parser.parseExpression(FrameSize)) {
+ reportParseError("expected frame size value");
+ return false;
+ }
+
+ if (!FrameSize->evaluateAsAbsolute(FrameSizeVal)) {
+ reportParseError("frame size not an absolute expression");
+ return false;
+ }
+
+ if (Parser.getTok().is(AsmToken::Comma))
+ Parser.Lex();
+ else {
+ reportParseError("unexpected token, expected comma");
+ return false;
+ }
+
+ // Parse the return register.
+ TmpReg.clear();
+ ResTy = parseAnyRegister(TmpReg);
+ if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+ reportParseError("expected return register");
+ return false;
+ }
+
+ MipsOperand &ReturnRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+ if (!ReturnRegOpnd.isGPRAsmReg()) {
+ reportParseError(ReturnRegOpnd.getStartLoc(),
+ "expected general purpose register");
+ return false;
+ }
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ getTargetStreamer().emitFrame(StackReg, FrameSizeVal,
+ ReturnRegOpnd.getGPR32Reg());
+ IsCpRestoreSet = false;
+ return false;
+ }
+
+ if (IDVal == ".set") {
+ parseDirectiveSet();
+ return false;
+ }
+
+ if (IDVal == ".mask" || IDVal == ".fmask") {
+ // .mask bitmask, frame_offset
+ // bitmask: One bit for each register used.
+ // frame_offset: Offset from Canonical Frame Address ($sp on entry) where
+ // first register is expected to be saved.
+ // Examples:
+ // .mask 0x80000000, -4
+ // .fmask 0x80000000, -4
+ //
+
+ // Parse the bitmask
+ const MCExpr *BitMask;
+ int64_t BitMaskVal;
+
+ if (Parser.parseExpression(BitMask)) {
+ reportParseError("expected bitmask value");
+ return false;
+ }
+
+ if (!BitMask->evaluateAsAbsolute(BitMaskVal)) {
+ reportParseError("bitmask not an absolute expression");
+ return false;
+ }
+
+ if (Parser.getTok().is(AsmToken::Comma))
+ Parser.Lex();
+ else {
+ reportParseError("unexpected token, expected comma");
+ return false;
+ }
+
+ // Parse the frame_offset
+ const MCExpr *FrameOffset;
+ int64_t FrameOffsetVal;
+
+ if (Parser.parseExpression(FrameOffset)) {
+ reportParseError("expected frame offset value");
+ return false;
+ }
+
+ if (!FrameOffset->evaluateAsAbsolute(FrameOffsetVal)) {
+ reportParseError("frame offset not an absolute expression");
+ return false;
+ }
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ if (IDVal == ".mask")
+ getTargetStreamer().emitMask(BitMaskVal, FrameOffsetVal);
+ else
+ getTargetStreamer().emitFMask(BitMaskVal, FrameOffsetVal);
+ return false;
+ }
+
+ if (IDVal == ".nan")
+ return parseDirectiveNaN();
+
+ if (IDVal == ".gpword") {
+ parseDirectiveGpWord();
+ return false;
+ }
+
+ if (IDVal == ".gpdword") {
+ parseDirectiveGpDWord();
+ return false;
+ }
+
+ if (IDVal == ".dtprelword") {
+ parseDirectiveDtpRelWord();
+ return false;
+ }
+
+ if (IDVal == ".dtpreldword") {
+ parseDirectiveDtpRelDWord();
+ return false;
+ }
+
+ if (IDVal == ".tprelword") {
+ parseDirectiveTpRelWord();
+ return false;
+ }
+
+ if (IDVal == ".tpreldword") {
+ parseDirectiveTpRelDWord();
+ return false;
+ }
+
+ if (IDVal == ".word") {
+ parseDataDirective(4, DirectiveID.getLoc());
+ return false;
+ }
+
+ if (IDVal == ".hword") {
+ parseDataDirective(2, DirectiveID.getLoc());
+ return false;
+ }
+
+ if (IDVal == ".option") {
+ parseDirectiveOption();
+ return false;
+ }
+
+ if (IDVal == ".abicalls") {
+ getTargetStreamer().emitDirectiveAbiCalls();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+ Error(Parser.getTok().getLoc(),
+ "unexpected token, expected end of statement");
+ }
+ return false;
+ }
+
+ if (IDVal == ".cpsetup") {
+ parseDirectiveCPSetup();
+ return false;
+ }
+ if (IDVal == ".cpreturn") {
+ parseDirectiveCPReturn();
+ return false;
+ }
+ if (IDVal == ".module") {
+ parseDirectiveModule();
+ return false;
+ }
+ if (IDVal == ".llvm_internal_mips_reallow_module_directive") {
+ parseInternalDirectiveReallowModule();
+ return false;
+ }
+ if (IDVal == ".insn") {
+ parseInsnDirective();
+ return false;
+ }
+ if (IDVal == ".sbss") {
+ parseSSectionDirective(IDVal, ELF::SHT_NOBITS);
+ return false;
+ }
+ if (IDVal == ".sdata") {
+ parseSSectionDirective(IDVal, ELF::SHT_PROGBITS);
+ return false;
+ }
+
+ return true;
+}
+
+/// Handle '.llvm_internal_mips_reallow_module_directive', which takes no
+/// operands and asks the target streamer to accept .module directives again.
+/// Always returns false.
+bool MipsAsmParser::parseInternalDirectiveReallowModule() {
+  const bool AtEndOfStatement = getLexer().is(AsmToken::EndOfStatement);
+  if (!AtEndOfStatement) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  getTargetStreamer().reallowModuleDirective();
+
+  getParser().Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// Register MipsAsmParser as the assembly parser for each of the four Mips
+/// targets returned by the getThe*Target() accessors.
+extern "C" void LLVMInitializeMipsAsmParser() {
+  RegisterMCAsmParser<MipsAsmParser> MipsParser(getTheMipsTarget());
+  RegisterMCAsmParser<MipsAsmParser> MipselParser(getTheMipselTarget());
+  RegisterMCAsmParser<MipsAsmParser> Mips64Parser(getTheMips64Target());
+  RegisterMCAsmParser<MipsAsmParser> Mips64elParser(getTheMips64elTarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MipsGenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
new file mode 100644
index 000000000000..f80efb18507b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -0,0 +1,2505 @@
+//===- MipsDisassembler.cpp - Disassembler for Mips -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Mips Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// MCDisassembler implementation for the Mips targets. The constructor
+/// records whether the subtarget has the microMIPS feature and which byte
+/// order the instruction stream uses; the remaining queries read individual
+/// subtarget feature bits.
+class MipsDisassembler : public MCDisassembler {
+  bool IsMicroMips;
+  bool IsBigEndian;
+
+  /// Returns true if the given Mips::Feature* bit is set on the subtarget.
+  bool hasFeatureBit(unsigned Feature) const {
+    return STI.getFeatureBits()[Feature];
+  }
+
+public:
+  MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool IsBigEndian)
+      : MCDisassembler(STI, Ctx),
+        IsMicroMips(STI.getFeatureBits()[Mips::FeatureMicroMips]),
+        IsBigEndian(IsBigEndian) {}
+
+  // ISA revision queries.
+  bool hasMips2() const { return hasFeatureBit(Mips::FeatureMips2); }
+  bool hasMips3() const { return hasFeatureBit(Mips::FeatureMips3); }
+  bool hasMips32() const { return hasFeatureBit(Mips::FeatureMips32); }
+  bool hasMips32r6() const { return hasFeatureBit(Mips::FeatureMips32r6); }
+
+  // Register and pointer width queries.
+  bool isFP64() const { return hasFeatureBit(Mips::FeatureFP64Bit); }
+  bool isGP64() const { return hasFeatureBit(Mips::FeatureGP64Bit); }
+  bool isPTR64() const { return hasFeatureBit(Mips::FeaturePTR64Bit); }
+
+  bool hasCnMips() const { return hasFeatureBit(Mips::FeatureCnMips); }
+
+  /// COP3 is only present in MIPS-I and MIPS-II, i.e. when neither the
+  /// MIPS32 nor the MIPS-III feature is set.
+  bool hasCOP3() const { return !(hasMips32() || hasMips3()); }
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+
+} // end anonymous namespace
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeGPRMM16ZeroRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeGPRMM16MovePRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeJumpTarget(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget21(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+// DecodeBranchTarget7MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+// DecodeBranchTarget10MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+// DecodeBranchTargetMM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+// DecodeJumpTargetMM - Decode microMIPS jump target, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMem(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMemEVA(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeLoadByte9(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeLoadByte15(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeCacheOp(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodePrefeOpMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeSyncI(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeSynciR6(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm4(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm9(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst,
+ unsigned Value,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeLi16Imm(MCInst &Inst,
+ unsigned Value,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst,
+ unsigned Value,
+ uint64_t Address,
+ const void *Decoder);
+
+template <unsigned Bits, int Offset, int Scale>
+static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+ uint64_t Address,
+ const void *Decoder);
+
+/// Convenience form of DecodeUImmWithOffsetAndScale with the scale fixed
+/// to 1; all arguments are forwarded unchanged.
+template <unsigned Bits, int Offset>
+static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  const int UnitScale = 1;
+  return DecodeUImmWithOffsetAndScale<Bits, Offset, UnitScale>(
+      Inst, Value, Address, Decoder);
+}
+
+template <unsigned Bits, int Offset = 0, int ScaleBy = 1>
+static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeInsSize(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+/// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't
+/// handle.
+template <typename InsnType>
+static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+// Forward declarations of the DAHI/DATI custom decoders. Each template is
+// declared exactly once here; the definitions appear further down.
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn,
+                                       uint64_t Address, const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+namespace llvm {
+Target &getTheMipselTarget();
+Target &getTheMipsTarget();
+Target &getTheMips64Target();
+Target &getTheMips64elTarget();
+}
+
+/// Factory for a big-endian MipsDisassembler. The target argument is not
+/// used; the caller receives a newly allocated object.
+static MCDisassembler *createMipsDisassembler(const Target &T,
+                                              const MCSubtargetInfo &STI,
+                                              MCContext &Ctx) {
+  const bool IsBigEndian = true;
+  return new MipsDisassembler(STI, Ctx, IsBigEndian);
+}
+
+/// Factory for a little-endian MipsDisassembler. The target argument is not
+/// used; the caller receives a newly allocated object.
+static MCDisassembler *createMipselDisassembler(const Target &T,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  const bool IsBigEndian = false;
+  return new MipsDisassembler(STI, Ctx, IsBigEndian);
+}
+
+/// Registers the disassembler factories with the target registry: the
+/// big-endian factory for mips and mips64, the little-endian factory for
+/// mipsel and mips64el.
+extern "C" void LLVMInitializeMipsDisassembler() {
+  // Big-endian targets.
+  TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(),
+                                         createMipsDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheMips64Target(),
+                                         createMipsDisassembler);
+
+  // Little-endian targets.
+  TargetRegistry::RegisterMCDisassembler(getTheMipselTarget(),
+                                         createMipselDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheMips64elTarget(),
+                                         createMipselDisassembler);
+}
+
+#include "MipsGenDisassemblerTables.inc"
+
+/// Returns the register found at index RegNo of register class RC, using the
+/// register info of the disassembler's context. D must point to the
+/// MipsDisassembler performing the decode. RegNo is not range-checked here.
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const MCRegisterInfo *RegInfo =
+      static_cast<const MipsDisassembler *>(D)->getContext().getRegisterInfo();
+  const MCRegisterClass &RegClass = RegInfo->getRegClass(RC);
+  return RegClass.begin()[RegNo];
+}
+
+/// Decodes INSVE_[BHWD]. The element size is identified from bits 21..17 of
+/// the instruction; it determines both the width of the $n field and the MSA
+/// register class used for the vector operands. Operands are appended in the
+/// order $wd, $wd_in, $n, $ws, $n2, with $n2 always the immediate 0.
+template <typename InsnType>
+static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder) {
+  typedef DecodeStatus (*DecodeFN)(MCInst &, unsigned, uint64_t, const void *);
+
+  // One row per element size: the row applies when (field & Mask) == Match.
+  struct ElementFormat {
+    unsigned Mask;
+    unsigned Match;
+    unsigned NSize;
+    DecodeFN RegDecoder;
+  };
+  static const ElementFormat Formats[] = {
+      {0x18, 0x00, 4, DecodeMSA128BRegisterClass}, // INSVE_B
+      {0x1c, 0x10, 3, DecodeMSA128HRegisterClass}, // INSVE_H
+      {0x1e, 0x18, 2, DecodeMSA128WRegisterClass}, // INSVE_W
+      {0x1f, 0x1c, 1, DecodeMSA128DRegisterClass}, // INSVE_D
+  };
+
+  const InsnType FormatField = fieldFromInstruction(insn, 17, 5);
+  const ElementFormat *Format = nullptr;
+  for (const ElementFormat &Candidate : Formats) {
+    if ((FormatField & Candidate.Mask) == Candidate.Match) {
+      Format = &Candidate;
+      break;
+    }
+  }
+  if (!Format)
+    llvm_unreachable("Invalid encoding");
+
+  const unsigned NSize = Format->NSize;
+  const DecodeFN RegDecoder = Format->RegDecoder;
+  assert(NSize != 0 && RegDecoder != nullptr);
+
+  // $wd, followed by $wd_in which reuses the same register field.
+  const InsnType WdField = fieldFromInstruction(insn, 6, 5);
+  if (RegDecoder(MI, WdField, Address, Decoder) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+  if (RegDecoder(MI, WdField, Address, Decoder) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // $n: NSize bits starting at bit 16.
+  const InsnType NField = fieldFromInstruction(insn, 16, NSize);
+  MI.addOperand(MCOperand::createImm(NField));
+
+  // $ws
+  const InsnType WsField = fieldFromInstruction(insn, 11, 5);
+  if (RegDecoder(MI, WsField, Address, Decoder) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // $n2 is not encoded; it is always emitted as 0.
+  MI.addOperand(MCOperand::createImm(0));
+
+  return MCDisassembler::Success;
+}
+
+/// Decodes the operands of the microMIPS R6 DAHI/DATI encoding: a GPR64
+/// register taken from bits 20..16, added twice, followed by the 16-bit
+/// immediate from bits 15..0. Always succeeds.
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn,
+                                       uint64_t Address, const void *Decoder) {
+  const InsnType RsField = fieldFromInstruction(insn, 16, 5);
+  const InsnType Imm16 = fieldFromInstruction(insn, 0, 16);
+  const unsigned Reg = getReg(Decoder, Mips::GPR64RegClassID, RsField);
+
+  // The same register fills both register operands (presumably the result
+  // and the source it is tied to -- TODO confirm against the .td definition).
+  MI.addOperand(MCOperand::createReg(Reg));
+  MI.addOperand(MCOperand::createReg(Reg));
+  MI.addOperand(MCOperand::createImm(Imm16));
+
+  return MCDisassembler::Success;
+}
+
+/// Decodes the operands of the DAHI/DATI encoding: a GPR64 register taken
+/// from bits 25..21, added twice, followed by the 16-bit immediate from bits
+/// 15..0. Always succeeds.
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder) {
+  const InsnType RsField = fieldFromInstruction(insn, 21, 5);
+  const InsnType Imm16 = fieldFromInstruction(insn, 0, 16);
+  const unsigned Reg = getReg(Decoder, Mips::GPR64RegClassID, RsField);
+
+  // The same register fills both register operands (presumably the result
+  // and the source it is tied to -- TODO confirm against the .td definition).
+  MI.addOperand(MCOperand::createReg(Reg));
+  MI.addOperand(MCOperand::createReg(Reg));
+  MI.addOperand(MCOperand::createImm(Imm16));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the R6 compact branches that share the ADDI major opcode.
+///
+/// Being called implies MIPS32r6/MIPS64r6 is enabled; on earlier ISAs the
+/// ADDI instruction would have matched instead.
+///
+/// Encoding: 0b001000 sssss ttttt iiiiiiiiiiiiiiii
+///   rs >= rt             -> BOVC    rs, rt, offset
+///   rs <  rt && rs != 0  -> BEQC    rs, rt, offset
+///   rs == 0  && rt != 0  -> BEQZALC rt, offset
+template <typename InsnType>
+static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  const InsnType RsField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RtField = fieldFromInstruction(insn, 16, 5);
+  // Sign-extended 16-bit field, scaled by 4, plus 4.
+  const int64_t BranchOffset =
+      SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+
+  unsigned Opcode = Mips::BEQZALC;
+  if (RsField >= RtField)
+    Opcode = Mips::BOVC;
+  else if (RsField != 0 && RsField < RtField)
+    Opcode = Mips::BEQC;
+  MI.setOpcode(Opcode);
+
+  // BEQZALC is the only form without an explicit rs operand.
+  if (Opcode != Mips::BEQZALC)
+    MI.addOperand(MCOperand::createReg(
+        getReg(Decoder, Mips::GPR32RegClassID, RsField)));
+
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RtField)));
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the microMIPS R6 POP35 compact branches. rt is read from
+/// bits 25..21 and rs from bits 20..16:
+///   rs >= rt             -> BOVC_MMR6    rt, rs, offset
+///   rs <  rt && rs != 0  -> BEQC_MMR6    rs, rt, offset
+///   otherwise            -> BEQZALC_MMR6 rt, offset
+template <typename InsnType>
+static DecodeStatus DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  const InsnType RtField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RsField = fieldFromInstruction(insn, 16, 5);
+  const int64_t SImm16 = SignExtend64(fieldFromInstruction(insn, 0, 16), 16);
+
+  // Builds a GPR32 register operand from a 5-bit register field.
+  auto GPR32Operand = [&](InsnType Field) {
+    return MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Field));
+  };
+
+  int64_t BranchOffset = 0;
+  if (RsField >= RtField) {
+    MI.setOpcode(Mips::BOVC_MMR6);
+    MI.addOperand(GPR32Operand(RtField));
+    MI.addOperand(GPR32Operand(RsField));
+    BranchOffset = SImm16 * 2 + 4;
+  } else if (RsField != 0 && RsField < RtField) {
+    MI.setOpcode(Mips::BEQC_MMR6);
+    MI.addOperand(GPR32Operand(RsField));
+    MI.addOperand(GPR32Operand(RtField));
+    // NOTE(review): this form scales the offset by 4 while the other two
+    // forms scale by 2 -- confirm this difference is intended.
+    BranchOffset = SImm16 * 4 + 4;
+  } else {
+    MI.setOpcode(Mips::BEQZALC_MMR6);
+    MI.addOperand(GPR32Operand(RtField));
+    BranchOffset = SImm16 * 2 + 4;
+  }
+
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the R6 compact branches that share the 0b011000 major
+/// opcode.
+///
+/// Being called implies MIPS32r6/MIPS64r6 is enabled; on earlier ISAs the
+/// pre-R6 instruction with this opcode (presumably DADDI, going by this
+/// function's name) would have matched instead.
+///
+/// Encoding: 0b011000 sssss ttttt iiiiiiiiiiiiiiii
+///   rs >= rt             -> BNVC    rs, rt, offset
+///   rs <  rt && rs != 0  -> BNEC    rs, rt, offset
+///   rs == 0  && rt != 0  -> BNEZALC rt, offset
+template <typename InsnType>
+static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  const InsnType RsField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RtField = fieldFromInstruction(insn, 16, 5);
+  // Sign-extended 16-bit field, scaled by 4, plus 4.
+  const int64_t BranchOffset =
+      SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+
+  unsigned Opcode = Mips::BNEZALC;
+  if (RsField >= RtField)
+    Opcode = Mips::BNVC;
+  else if (RsField != 0 && RsField < RtField)
+    Opcode = Mips::BNEC;
+  MI.setOpcode(Opcode);
+
+  // BNEZALC is the only form without an explicit rs operand.
+  if (Opcode != Mips::BNEZALC)
+    MI.addOperand(MCOperand::createReg(
+        getReg(Decoder, Mips::GPR32RegClassID, RsField)));
+
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RtField)));
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the microMIPS R6 POP37 compact branches. rt is read from
+/// bits 25..21 and rs from bits 20..16:
+///   rs >= rt             -> BNVC_MMR6    rt, rs, offset
+///   rs <  rt && rs != 0  -> BNEC_MMR6    rs, rt, offset
+///   otherwise            -> BNEZALC_MMR6 rt, offset
+template <typename InsnType>
+static DecodeStatus DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  const InsnType RtField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RsField = fieldFromInstruction(insn, 16, 5);
+  const int64_t SImm16 = SignExtend64(fieldFromInstruction(insn, 0, 16), 16);
+
+  // Builds a GPR32 register operand from a 5-bit register field.
+  auto GPR32Operand = [&](InsnType Field) {
+    return MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Field));
+  };
+
+  int64_t BranchOffset = 0;
+  if (RsField >= RtField) {
+    MI.setOpcode(Mips::BNVC_MMR6);
+    MI.addOperand(GPR32Operand(RtField));
+    MI.addOperand(GPR32Operand(RsField));
+    BranchOffset = SImm16 * 2 + 4;
+  } else if (RsField != 0 && RsField < RtField) {
+    MI.setOpcode(Mips::BNEC_MMR6);
+    MI.addOperand(GPR32Operand(RsField));
+    MI.addOperand(GPR32Operand(RtField));
+    // NOTE(review): this form scales the offset by 4 while the other two
+    // forms scale by 2 -- confirm this difference is intended.
+    BranchOffset = SImm16 * 4 + 4;
+  } else {
+    MI.setOpcode(Mips::BNEZALC_MMR6);
+    MI.addOperand(GPR32Operand(RtField));
+    BranchOffset = SImm16 * 2 + 4;
+  }
+
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the microMIPS R6 POP65 compact branches.
+///
+/// Encoding: 0b110101 ttttt sssss iiiiiiiiiiiiiiii
+///   rt == 0                         -> invalid (Fail, MI left untouched)
+///   rs == 0  && rt != 0             -> BGTZC_MMR6 rt, offset
+///   rs == rt && rt != 0             -> BLTZC_MMR6 rt, offset
+///   rs != rt && rs != 0 && rt != 0  -> BLTC_MMR6  rs, rt, offset
+template <typename InsnType>
+static DecodeStatus DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  const InsnType RtField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RsField = fieldFromInstruction(insn, 16, 5);
+  const int64_t BranchOffset =
+      SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+
+  if (RtField == 0)
+    return MCDisassembler::Fail;
+
+  // rs is an explicit operand only in the two-register form.
+  const bool IsTwoRegisterForm = RsField != 0 && RsField != RtField;
+  if (IsTwoRegisterForm) {
+    MI.setOpcode(Mips::BLTC_MMR6);
+    MI.addOperand(MCOperand::createReg(
+        getReg(Decoder, Mips::GPR32RegClassID, RsField)));
+  } else {
+    MI.setOpcode(RsField == 0 ? Mips::BGTZC_MMR6 : Mips::BLTZC_MMR6);
+  }
+
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RtField)));
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the microMIPS R6 POP75 compact branches.
+///
+/// Encoding: 0b111101 ttttt sssss iiiiiiiiiiiiiiii
+///   rt == 0                         -> invalid (Fail, MI left untouched)
+///   rs == 0  && rt != 0             -> BLEZC_MMR6 rt, offset
+///   rs == rt && rt != 0             -> BGEZC_MMR6 rt, offset
+///   rs != rt && rs != 0 && rt != 0  -> BGEC_MMR6  rs, rt, offset
+template <typename InsnType>
+static DecodeStatus DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  const InsnType RtField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RsField = fieldFromInstruction(insn, 16, 5);
+  const int64_t BranchOffset =
+      SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+
+  if (RtField == 0)
+    return MCDisassembler::Fail;
+
+  // rs is an explicit operand only in the two-register form.
+  const bool IsTwoRegisterForm = RsField != 0 && RsField != RtField;
+  if (IsTwoRegisterForm) {
+    MI.setOpcode(Mips::BGEC_MMR6);
+    MI.addOperand(MCOperand::createReg(
+        getReg(Decoder, Mips::GPR32RegClassID, RsField)));
+  } else {
+    MI.setOpcode(RsField == 0 ? Mips::BLEZC_MMR6 : Mips::BGEZC_MMR6);
+  }
+
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RtField)));
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the R6 compact branches that share the BLEZL major opcode.
+///
+/// Being called implies MIPS32r6/MIPS64r6 is enabled; on earlier ISAs the
+/// BLEZL instruction would have matched instead.
+///
+/// Encoding: 0b010110 sssss ttttt iiiiiiiiiiiiiiii
+///   rt == 0                         -> invalid (Fail, MI left untouched)
+///   rs == 0  && rt != 0             -> BLEZC rt, offset
+///   rs == rt && rt != 0             -> BGEZC rt, offset
+///   rs != rt && rs != 0 && rt != 0  -> BGEC  rs, rt, offset
+template <typename InsnType>
+static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  const InsnType RsField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RtField = fieldFromInstruction(insn, 16, 5);
+  const int64_t BranchOffset =
+      SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+
+  if (RtField == 0)
+    return MCDisassembler::Fail;
+
+  // rs is an explicit operand only in the two-register form.
+  const bool IsTwoRegisterForm = RsField != 0 && RsField != RtField;
+  if (IsTwoRegisterForm) {
+    MI.setOpcode(Mips::BGEC);
+    MI.addOperand(MCOperand::createReg(
+        getReg(Decoder, Mips::GPR32RegClassID, RsField)));
+  } else {
+    MI.setOpcode(RsField == 0 ? Mips::BLEZC : Mips::BGEZC);
+  }
+
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RtField)));
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the R6 compact branches that share the BGTZL major opcode.
+///
+/// Being called implies MIPS32r6/MIPS64r6 is enabled; on earlier ISAs the
+/// BGTZL instruction would have matched instead.
+///
+/// Encoding: 0b010111 sssss ttttt iiiiiiiiiiiiiiii
+///   rt == 0                         -> invalid (Fail, MI left untouched)
+///   rs == 0  && rt != 0             -> BGTZC rt, offset
+///   rs == rt && rt != 0             -> BLTZC rt, offset
+///   rs != rt && rs != 0 && rt != 0  -> BLTC  rs, rt, offset
+template <typename InsnType>
+static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  const InsnType RsField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RtField = fieldFromInstruction(insn, 16, 5);
+  const int64_t BranchOffset =
+      SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+
+  if (RtField == 0)
+    return MCDisassembler::Fail;
+
+  // rs is an explicit operand only in the two-register form.
+  const bool IsTwoRegisterForm = RsField != 0 && RsField != RtField;
+  if (IsTwoRegisterForm) {
+    MI.setOpcode(Mips::BLTC);
+    MI.addOperand(MCOperand::createReg(
+        getReg(Decoder, Mips::GPR32RegClassID, RsField)));
+  } else {
+    MI.setOpcode(RsField == 0 ? Mips::BGTZC : Mips::BLTZC);
+  }
+
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RtField)));
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates BGTZ from the R6 compact branches sharing its major opcode.
+///
+/// Being called implies MIPS32r6/MIPS64r6 is enabled; on earlier ISAs the
+/// BGTZ instruction would have matched instead.
+///
+/// Encoding: 0b000111 sssss ttttt iiiiiiiiiiiiiiii
+///   rt == 0              -> BGTZ    rs, offset
+///   rs == 0  && rt != 0  -> BGTZALC rt, offset
+///   rs != 0  && rs == rt -> BLTZALC rs, offset
+///   rs != 0  && rs != rt -> BLTUC   rs, rt, offset
+template <typename InsnType>
+static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  const InsnType RsField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RtField = fieldFromInstruction(insn, 16, 5);
+  const int64_t BranchOffset =
+      SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+
+  // Builds a GPR32 register operand from a 5-bit register field.
+  auto GPR32Operand = [&](InsnType Field) {
+    return MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Field));
+  };
+
+  if (RtField == 0) {
+    MI.setOpcode(Mips::BGTZ);
+    MI.addOperand(GPR32Operand(RsField));
+  } else if (RsField == 0) {
+    MI.setOpcode(Mips::BGTZALC);
+    MI.addOperand(GPR32Operand(RtField));
+  } else if (RsField == RtField) {
+    MI.setOpcode(Mips::BLTZALC);
+    MI.addOperand(GPR32Operand(RsField));
+  } else {
+    MI.setOpcode(Mips::BLTUC);
+    MI.addOperand(GPR32Operand(RsField));
+    MI.addOperand(GPR32Operand(RtField));
+  }
+
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the R6 compact branches that share the 0b000110 major
+/// opcode.
+///
+/// Being called implies MIPS32r6/MIPS64r6 is enabled; on earlier ISAs the
+/// pre-R6 instruction with this opcode (presumably BLEZ, going by this
+/// function's name) would have matched instead.
+///
+/// Encoding: 0b000110 sssss ttttt iiiiiiiiiiiiiiii
+///   rt == 0                         -> invalid (Fail, MI left untouched)
+///   rs == 0  && rt != 0             -> BLEZALC rt, offset
+///   rs == rt && rt != 0             -> BGEZALC rt, offset
+///   rs != rt && rs != 0 && rt != 0  -> BGEUC   rs, rt, offset
+template <typename InsnType>
+static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  const InsnType RsField = fieldFromInstruction(insn, 21, 5);
+  const InsnType RtField = fieldFromInstruction(insn, 16, 5);
+  const int64_t BranchOffset =
+      SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+
+  if (RtField == 0)
+    return MCDisassembler::Fail;
+
+  // rs is an explicit operand only in the two-register form.
+  const bool IsTwoRegisterForm = RsField != 0 && RsField != RtField;
+  if (IsTwoRegisterForm) {
+    MI.setOpcode(Mips::BGEUC);
+    MI.addOperand(MCOperand::createReg(
+        getReg(Decoder, Mips::GPR32RegClassID, RsField)));
+  } else {
+    MI.setOpcode(RsField == 0 ? Mips::BLEZALC : Mips::BGEZALC);
+  }
+
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RtField)));
+  MI.addOperand(MCOperand::createImm(BranchOffset));
+
+  return MCDisassembler::Success;
+}
+
+/// Assembles the first two bytes of \p Bytes into a 16-bit halfword stored in
+/// \p Insn, honouring the requested endianness. When fewer than two bytes are
+/// available, \p Size is set to 0 and Fail is returned; on success \p Size is
+/// left unmodified. \p Address is not used.
+static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian) {
+  // Exactly two bytes are consumed.
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // The most significant byte comes first in a big-endian stream and second
+  // in a little-endian one.
+  const unsigned HighIdx = IsBigEndian ? 0 : 1;
+  const unsigned LowIdx = IsBigEndian ? 1 : 0;
+  Insn = (Bytes[HighIdx] << 8) | Bytes[LowIdx];
+
+  return MCDisassembler::Success;
+}
+
+/// Assembles the first four bytes of \p Bytes into a 32-bit word stored in
+/// \p Insn, honouring the requested endianness. When fewer than four bytes
+/// are available, \p Size is set to 0 and Fail is returned; on success
+/// \p Size is left unmodified. \p Address is not used.
+///
+/// The high 16 bits of a 32-bit microMIPS instruction (where the opcode is)
+/// always precede the low 16 bits in the instruction stream, so a
+/// little-endian microMIPS stream is read as two little-endian halfwords with
+/// the high halfword first. \p IsMicroMips has no effect on big-endian reads.
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian, bool IsMicroMips) {
+  // Exactly four bytes are consumed.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Stream index supplying each byte of the result, listed from the most
+  // significant byte down to the least significant one.
+  static const unsigned BigEndianOrder[4] = {0, 1, 2, 3};
+  static const unsigned LittleEndianMicroMipsOrder[4] = {1, 0, 3, 2};
+  static const unsigned LittleEndianOrder[4] = {3, 2, 1, 0};
+
+  const unsigned *Order = BigEndianOrder;
+  if (!IsBigEndian)
+    Order = IsMicroMips ? LittleEndianMicroMipsOrder : LittleEndianOrder;
+
+  Insn = (Bytes[Order[0]] << 24) | (Bytes[Order[1]] << 16) |
+         (Bytes[Order[2]] << 8) | (Bytes[Order[3]] << 0);
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a single instruction from \p Bytes into \p Instr.
+///
+/// In microMIPS mode the 16-bit tables are tried first, then the 32-bit
+/// microMIPS tables. Otherwise the standard 32-bit tables are tried in a fixed
+/// order gated by subtarget features. The first table that does not return
+/// Fail wins; its status is returned and \p Size is set to the width of the
+/// decoded instruction (2 or 4). When nothing matches, Fail is returned with
+/// \p Size set to the minimum instruction size of the active mode (2 for
+/// microMIPS, 4 otherwise). \p VStream and \p CStream are not used here.
+DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                              ArrayRef<uint8_t> Bytes,
+                                              uint64_t Address,
+                                              raw_ostream &VStream,
+                                              raw_ostream &CStream) const {
+  uint32_t Insn;
+  DecodeStatus Result;
+
+  // Print the debug banner, run one auto-generated decoder table, and on a
+  // match record the instruction width. Returns true when the table matched;
+  // the decode status is left in Result either way.
+  auto TryTable = [&](const uint8_t *Table, const char *Banner,
+                      uint64_t InsnSize) -> bool {
+    DEBUG(dbgs() << Banner);
+    Result = decodeInstruction(Table, Instr, Insn, Address, this, STI);
+    if (Result == MCDisassembler::Fail)
+      return false;
+    Size = InsnSize;
+    return true;
+  };
+
+  if (IsMicroMips) {
+    Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
+    if (Result == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+
+    // microMIPS32R6 (and microMIPS64R6) 16-bit instructions.
+    if (hasMips32r6() &&
+        TryTable(DecoderTableMicroMipsR616,
+                 "Trying MicroMipsR616 table (16-bit instructions):\n", 2))
+      return Result;
+
+    // microMIPS 16-bit instructions.
+    if (TryTable(DecoderTableMicroMips16,
+                 "Trying MicroMips16 table (16-bit instructions):\n", 2))
+      return Result;
+
+    // No 16-bit match: re-read the stream as a 32-bit microMIPS instruction.
+    Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, true);
+    if (Result == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+
+    if (hasMips32r6() &&
+        TryTable(DecoderTableMicroMipsR632,
+                 "Trying MicroMips32r632 table (32-bit instructions):\n", 4))
+      return Result;
+
+    if (TryTable(DecoderTableMicroMips32,
+                 "Trying MicroMips32 table (32-bit instructions):\n", 4))
+      return Result;
+
+    if (hasMips32r6() && isFP64() &&
+        TryTable(DecoderTableMicroMips32r6FP6432,
+                 "Trying MicroMips32r6FP64 table (32-bit opcodes):\n", 4))
+      return Result;
+
+    // This is an invalid instruction. Let the disassembler move forward by the
+    // minimum instruction size.
+    Size = 2;
+    return MCDisassembler::Fail;
+  }
+
+  Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
+  if (Result == MCDisassembler::Fail) {
+    // Report a full instruction width even though the read failed.
+    Size = 4;
+    return MCDisassembler::Fail;
+  }
+
+  if (hasCOP3() &&
+      TryTable(DecoderTableCOP3_32, "Trying COP3_ table (32-bit opcodes):\n",
+               4))
+    return Result;
+
+  if (hasMips32r6() && isGP64() &&
+      TryTable(DecoderTableMips32r6_64r6_GP6432,
+               "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n", 4))
+    return Result;
+
+  if (hasMips32r6() && isPTR64() &&
+      TryTable(DecoderTableMips32r6_64r6_PTR6432,
+               "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n", 4))
+    return Result;
+
+  if (hasMips32r6() &&
+      TryTable(DecoderTableMips32r6_64r632,
+               "Trying Mips32r6_64r6 table (32-bit opcodes):\n", 4))
+    return Result;
+
+  // The banner names the Mips32_64 table actually consulted here (it used to
+  // repeat the Mips32r6_64r6 banner from above).
+  if (hasMips2() && isPTR64() &&
+      TryTable(DecoderTableMips32_64_PTR6432,
+               "Trying Mips32_64 (PTR64) table (32-bit opcodes):\n", 4))
+    return Result;
+
+  if (hasCnMips() &&
+      TryTable(DecoderTableCnMips32, "Trying CnMips table (32-bit opcodes):\n",
+               4))
+    return Result;
+
+  if (isGP64() &&
+      TryTable(DecoderTableMips6432,
+               "Trying Mips64 (GPR64) table (32-bit opcodes):\n", 4))
+    return Result;
+
+  // Fallback: the base Mips32 table.
+  if (TryTable(DecoderTableMips32, "Trying Mips table (32-bit opcodes):\n", 4))
+    return Result;
+
+  Size = 4;
+  return MCDisassembler::Fail;
+}
+
+/// Decoder hook for the CPU16Regs register class. It unconditionally reports
+/// failure and never appends an operand to \p Inst.
+static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return MCDisassembler::Fail;
+}
+
+/// Append the GPR64 register selected by \p RegNo to \p Inst. Encodings above
+/// 31 are rejected.
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR64RegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the GPRMM16 register selected by \p RegNo to \p Inst. Encodings
+/// above 7 are rejected.
+static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 8)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPRMM16RegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the GPRMM16Zero register selected by \p RegNo to \p Inst. Encodings
+/// above 7 are rejected.
+static DecodeStatus DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  if (RegNo >= 8)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(
+      getReg(Decoder, Mips::GPRMM16ZeroRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the GPRMM16MoveP register selected by \p RegNo to \p Inst. Encodings
+/// above 7 are rejected.
+static DecodeStatus DecodeGPRMM16MovePRegisterClass(MCInst &Inst,
+                                                    unsigned RegNo,
+                                                    uint64_t Address,
+                                                    const void *Decoder) {
+  if (RegNo >= 8)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(
+      getReg(Decoder, Mips::GPRMM16MovePRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the GPR32 register selected by \p RegNo to \p Inst. Encodings above
+/// 31 are rejected.
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Decode a pointer register operand: dispatch to the GPR64 decoder when the
+/// disassembler reports isGP64(), and to the GPR32 decoder otherwise.
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  const MipsDisassembler *Dis = static_cast<const MipsDisassembler *>(Decoder);
+  return Dis->isGP64()
+             ? DecodeGPR64RegisterClass(Inst, RegNo, Address, Decoder)
+             : DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+/// Decode a DSPR register operand by forwarding to the GPR32 decoder.
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  const DecodeStatus Status =
+      DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
+  return Status;
+}
+
+/// Append the FGR64 register selected by \p RegNo to \p Inst. Encodings above
+/// 31 are rejected.
+static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::FGR64RegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the FGR32 register selected by \p RegNo to \p Inst. Encodings above
+/// 31 are rejected.
+static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::FGR32RegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the CCR register selected by \p RegNo to \p Inst. Encodings above 31
+/// are rejected.
+static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::CCRRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the FCC register selected by \p RegNo to \p Inst. Encodings above 7
+/// are rejected.
+static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo >= 8)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::FCCRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the FGRCC register selected by \p RegNo to \p Inst. Encodings above
+/// 31 are rejected.
+static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::FGRCCRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Decode a register + base + signed 16-bit offset memory operand. The offset
+/// is the low 16 bits, the data register is the 5-bit field at bit 16 and the
+/// base register the 5-bit field at bit 21. For SC and SCD the data register
+/// is appended twice.
+static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address,
+                              const void *Decoder) {
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+  const unsigned RtField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  // Store-conditional opcodes carry one extra copy of the data register.
+  const unsigned Opc = Inst.getOpcode();
+  if (Opc == Mips::SC || Opc == Mips::SCD)
+    Inst.addOperand(MCOperand::createReg(RtReg));
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode an EVA memory operand: a signed 9-bit offset taken from bit 7
+/// upwards, the data register from the 5-bit field at bit 16 and the base
+/// register from the 5-bit field at bit 21. For SCE the data register is
+/// appended twice.
+static DecodeStatus DecodeMemEVA(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                 const void *Decoder) {
+  const int Disp = SignExtend32<9>(Insn >> 7);
+  const unsigned RtField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  const bool IsStoreConditional = Inst.getOpcode() == Mips::SCE;
+  if (IsStoreConditional)
+    Inst.addOperand(MCOperand::createReg(RtReg));
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a load with a signed 9-bit offset (low 9 bits), base register in the
+/// 5-bit field at bit 16 and destination register in the 5-bit field at
+/// bit 21. Operands are appended as register, base, offset.
+static DecodeStatus DecodeLoadByte9(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<9>(Insn & 0x1ff);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned RtField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a load whose offset is the sign-extended low 16 bits of the
+/// encoding, with the base register in the 5-bit field at bit 16 and the
+/// destination register in the 5-bit field at bit 21.
+// NOTE(review): the name says 15 but the offset is decoded as 16 bits --
+// confirm against the instruction definition.
+static DecodeStatus DecodeLoadByte15(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned RtField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a cache-style operand triple: base register (5-bit field at bit 21),
+/// signed 16-bit offset (low 16 bits) and hint (5-bit field at bit 16),
+/// appended in that order.
+static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                  const void *Decoder) {
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+  const unsigned HintField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+  Inst.addOperand(MCOperand::createImm(HintField));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a cache-style operand triple with a signed 12-bit offset (low 12
+/// bits), base register in the 5-bit field at bit 16 and hint in the 5-bit
+/// field at bit 21. Operands are appended as base, offset, hint.
+static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<12>(Insn & 0xfff);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned HintField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+  Inst.addOperand(MCOperand::createImm(HintField));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode an operand triple with a signed 9-bit offset (low 9 bits), base
+/// register in the 5-bit field at bit 16 and hint in the 5-bit field at
+/// bit 21. Operands are appended as base, offset, hint.
+static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<9>(Insn & 0x1ff);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned HintField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+  Inst.addOperand(MCOperand::createImm(HintField));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode an operand triple with a signed 9-bit offset taken from bit 7
+/// upwards, hint in the 5-bit field at bit 16 and base register in the 5-bit
+/// field at bit 21. Operands are appended as base, offset, hint.
+static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  const int Disp = SignExtend32<9>(Insn >> 7);
+  const unsigned HintField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+  Inst.addOperand(MCOperand::createImm(HintField));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a store operand triple with a signed 9-bit offset (low 9 bits), data
+/// register in the 5-bit field at bit 21 and base register in the 5-bit field
+/// at bit 16. Operands are appended as register, base, offset.
+static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<9>(Insn & 0x1ff);
+  const unsigned RtField = fieldFromInstruction(Insn, 21, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a base + signed 16-bit offset operand pair: base register from the
+/// 5-bit field at bit 21, offset from the low 16 bits.
+static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                const void *Decoder) {
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+  const unsigned BaseField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a base + signed 16-bit immediate operand pair: base register from
+/// the 5-bit field at bit 16, immediate from the low 16 bits.
+static DecodeStatus DecodeSynciR6(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                  const void *Decoder) {
+  const int Imm = SignExtend32<16>(Insn & 0xffff);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode an MSA LD/ST memory operand: vector register (5-bit field at bit 6),
+/// base register (5-bit field at bit 11) and a signed 10-bit offset (field at
+/// bit 16) that is scaled by the element size implied by the opcode.
+///
+/// The register and base operands are appended before the opcode is checked,
+/// so an unexpected opcode fails with those two operands already added.
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const int RawOffset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10));
+  const unsigned WdField = fieldFromInstruction(Insn, 6, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 11, 5);
+
+  const unsigned WdReg = getReg(Decoder, Mips::MSA128BRegClassID, WdField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(WdReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+
+  // The immediate field of an LD/ST instruction is scaled, so when decoding it
+  // must be multiplied by the size in bytes of the instruction's data format:
+  //   .b - 1 byte, .h - 2 bytes, .w - 4 bytes, .d - 8 bytes
+  int BytesPerElement;
+  switch (Inst.getOpcode()) {
+  case Mips::LD_B:
+  case Mips::ST_B:
+    BytesPerElement = 1;
+    break;
+  case Mips::LD_H:
+  case Mips::ST_H:
+    BytesPerElement = 2;
+    break;
+  case Mips::LD_W:
+  case Mips::ST_W:
+    BytesPerElement = 4;
+    break;
+  case Mips::LD_D:
+  case Mips::ST_D:
+    BytesPerElement = 8;
+    break;
+  default:
+    assert (0 && "Unexpected instruction");
+    return MCDisassembler::Fail;
+  }
+
+  Inst.addOperand(MCOperand::createImm(RawOffset * BytesPerElement));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a 16-bit microMIPS memory operand with a 4-bit offset (low 4 bits),
+/// data register in the 3-bit field at bit 7 and base register in the 3-bit
+/// field at bit 4.
+///
+/// Loads decode the data register through the GPRMM16 class, stores through
+/// GPRMM16Zero. The offset is scaled by the access size (x1 byte, x2 halfword,
+/// x4 word); for LBU16_MM the encoding 0xf decodes to -1. Opcodes not listed
+/// below receive only the base register operand.
+static DecodeStatus DecodeMemMMImm4(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const unsigned Imm4 = Insn & 0xf;
+  const unsigned RtField = fieldFromInstruction(Insn, 7, 3);
+  const unsigned BaseField = fieldFromInstruction(Insn, 4, 3);
+  const unsigned Opc = Inst.getOpcode();
+
+  // Data register: the register class depends on load vs. store.
+  switch (Opc) {
+  case Mips::LBU16_MM:
+  case Mips::LHU16_MM:
+  case Mips::LW16_MM:
+    if (DecodeGPRMM16RegisterClass(Inst, RtField, Address, Decoder) ==
+        MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+    break;
+  case Mips::SB16_MM:
+  case Mips::SB16_MMR6:
+  case Mips::SH16_MM:
+  case Mips::SH16_MMR6:
+  case Mips::SW16_MM:
+  case Mips::SW16_MMR6:
+    if (DecodeGPRMM16ZeroRegisterClass(Inst, RtField, Address, Decoder) ==
+        MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+    break;
+  }
+
+  // Base register.
+  if (DecodeGPRMM16RegisterClass(Inst, BaseField, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Offset, scaled by the access size.
+  switch (Opc) {
+  case Mips::LBU16_MM:
+    if (Imm4 != 0xf)
+      Inst.addOperand(MCOperand::createImm(Imm4));
+    else
+      Inst.addOperand(MCOperand::createImm(-1));
+    break;
+  case Mips::SB16_MM:
+  case Mips::SB16_MMR6:
+    Inst.addOperand(MCOperand::createImm(Imm4));
+    break;
+  case Mips::LHU16_MM:
+  case Mips::SH16_MM:
+  case Mips::SH16_MMR6:
+    Inst.addOperand(MCOperand::createImm(Imm4 << 1));
+    break;
+  case Mips::LW16_MM:
+  case Mips::SW16_MM:
+  case Mips::SW16_MMR6:
+    Inst.addOperand(MCOperand::createImm(Imm4 << 2));
+    break;
+  }
+
+  return MCDisassembler::Success;
+}
+
+/// Decode an SP-relative memory operand: data register from the 5-bit field at
+/// bit 5, fixed base register SP, and the low 5 bits shifted left by 2 as the
+/// offset.
+static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  const unsigned Imm5 = Insn & 0x1F;
+  const unsigned RtField = fieldFromInstruction(Insn, 5, 5);
+
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(Mips::SP));
+  Inst.addOperand(MCOperand::createImm(Imm5 << 2));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a GP-relative memory operand: data register from the 3-bit field at
+/// bit 7, fixed base register GP, and the low 7 bits shifted left by 2 as the
+/// offset.
+static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  const unsigned Imm7 = Insn & 0x7F;
+  const unsigned RtField = fieldFromInstruction(Insn, 7, 3);
+
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(Mips::GP));
+  Inst.addOperand(MCOperand::createImm(Imm7 << 2));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a 16-bit register-list memory operand: the register list (via
+/// DecodeRegListOperand16), the fixed base register SP, and a 4-bit offset
+/// scaled by 4.
+///
+/// For LWM16_MMR6 / SWM16_MMR6 the offset is the unsigned 4-bit field at
+/// bit 4; for every other opcode it is the sign-extended low 4 bits.
+static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  int Offset;
+  switch (Inst.getOpcode()) {
+  case Mips::LWM16_MMR6:
+  case Mips::SWM16_MMR6:
+    Offset = fieldFromInstruction(Insn, 4, 4);
+    break;
+  default:
+    Offset = SignExtend32<4>(Insn & 0xf);
+    break;
+  }
+
+  if (DecodeRegListOperand16(Inst, Insn, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(Mips::SP));
+  // Scale with a multiply: Offset may be negative here, and left-shifting a
+  // negative signed value is undefined behaviour before C++20.
+  Inst.addOperand(MCOperand::createImm(Offset * 4));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a microMIPS memory operand with a signed 9-bit offset (low 9 bits),
+/// data register in the 5-bit field at bit 21 and base register in the 5-bit
+/// field at bit 16. For SCE_MM the data register is appended twice.
+static DecodeStatus DecodeMemMMImm9(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<9>(Insn & 0x1ff);
+  const unsigned RtField = fieldFromInstruction(Insn, 21, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  const bool IsStoreConditional = Inst.getOpcode() == Mips::SCE_MM;
+  if (IsStoreConditional)
+    Inst.addOperand(MCOperand::createReg(RtReg));
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a microMIPS memory operand with a signed 12-bit offset (low 12
+/// bits), data register in the 5-bit field at bit 21 and base register in the
+/// 5-bit field at bit 16.
+///
+/// - SWM32_MM / LWM32_MM: register list (via DecodeRegListOperand), base,
+///   offset.
+/// - SC_MM: the data register is appended twice, then base and offset.
+/// - LWP_MM / SWP_MM / LWP_MMR6 / SWP_MMR6: the data register is followed by
+///   the register whose id is one greater, then base and offset.
+/// - Anything else: register, base, offset.
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<12>(Insn & 0x0fff);
+  const unsigned RtField = fieldFromInstruction(Insn, 21, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  const unsigned Opc = Inst.getOpcode();
+
+  // Load/store multiple: a register list replaces the single data register.
+  if (Opc == Mips::SWM32_MM || Opc == Mips::LWM32_MM) {
+    if (DecodeRegListOperand(Inst, Insn, Address, Decoder) ==
+        MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::createReg(BaseReg));
+    Inst.addOperand(MCOperand::createImm(Disp));
+    return MCDisassembler::Success;
+  }
+
+  // Store-conditional carries one extra copy of the data register.
+  if (Opc == Mips::SC_MM)
+    Inst.addOperand(MCOperand::createReg(RtReg));
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+
+  // Paired load/store: the second register is the next register id.
+  const bool IsPair = Opc == Mips::LWP_MM || Opc == Mips::SWP_MM ||
+                      Opc == Mips::LWP_MMR6 || Opc == Mips::SWP_MMR6;
+  if (IsPair)
+    Inst.addOperand(MCOperand::createReg(RtReg + 1));
+
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a microMIPS memory operand with a signed 16-bit offset (low 16
+/// bits), data register in the 5-bit field at bit 21 and base register in the
+/// 5-bit field at bit 16. Operands are appended as register, base, offset.
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+  const unsigned RtField = fieldFromInstruction(Insn, 21, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a floating-point memory operand: FGR64 register from the 5-bit field
+/// at bit 16, GPR32 base register from the 5-bit field at bit 21 and a signed
+/// 16-bit offset from the low 16 bits.
+static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+  const unsigned FtField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned FtReg = getReg(Decoder, Mips::FGR64RegClassID, FtField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(FtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Same operand layout as DecodeFMem, but with the register and base fields
+/// swapped as the microMIPS encoding requires: FGR64 register from the 5-bit
+/// field at bit 21, GPR32 base from the 5-bit field at bit 16.
+static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned FtField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned FtReg = getReg(Decoder, Mips::FGR64RegClassID, FtField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(FtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a COP2 memory operand: COP2 register from the 5-bit field at bit 16,
+/// GPR32 base register from the 5-bit field at bit 21 and a signed 16-bit
+/// offset from the low 16 bits.
+static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                const void *Decoder) {
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+  const unsigned CopField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned CopReg = getReg(Decoder, Mips::COP2RegClassID, CopField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(CopReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a COP3 memory operand: COP3 register from the 5-bit field at bit 16,
+/// GPR32 base register from the 5-bit field at bit 21 and a signed 16-bit
+/// offset from the low 16 bits.
+static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                const void *Decoder) {
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+  const unsigned CopField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned CopReg = getReg(Decoder, Mips::COP3RegClassID, CopField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(CopReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a COP2 memory operand with a signed 11-bit offset (low 11 bits),
+/// COP2 register from the 5-bit field at bit 16 and GPR32 base register from
+/// the 5-bit field at bit 11.
+static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<11>(Insn & 0x07ff);
+  const unsigned CopField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 11, 5);
+
+  const unsigned CopReg = getReg(Decoder, Mips::COP2RegClassID, CopField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(CopReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a COP2 memory operand with a signed 11-bit offset (low 11 bits),
+/// COP2 register from the 5-bit field at bit 21 and GPR32 base register from
+/// the 5-bit field at bit 16.
+static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address, const void *Decoder) {
+  const int Disp = SignExtend32<11>(Insn & 0x07ff);
+  const unsigned CopField = fieldFromInstruction(Insn, 21, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 16, 5);
+
+  const unsigned CopReg = getReg(Decoder, Mips::COP2RegClassID, CopField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  Inst.addOperand(MCOperand::createReg(CopReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a memory operand with a signed 9-bit offset held in bits 7..15, the
+/// data register in the 5-bit field at bit 16 and the base register in the
+/// 5-bit field at bit 21. For SC_R6 and SCD_R6 the data register is appended
+/// twice.
+static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address, const void *Decoder) {
+  const int64_t Disp = SignExtend64<9>((Insn >> 7) & 0x1ff);
+  const unsigned RtField = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseField = fieldFromInstruction(Insn, 21, 5);
+
+  const unsigned RtReg = getReg(Decoder, Mips::GPR32RegClassID, RtField);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseField);
+
+  // Store-conditional opcodes carry one extra copy of the data register.
+  const unsigned Opc = Inst.getOpcode();
+  if (Opc == Mips::SC_R6 || Opc == Mips::SCD_R6)
+    Inst.addOperand(MCOperand::createReg(RtReg));
+
+  Inst.addOperand(MCOperand::createReg(RtReg));
+  Inst.addOperand(MCOperand::createReg(BaseReg));
+  Inst.addOperand(MCOperand::createImm(Disp));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a hardware register operand. Only hardware register 29 is currently
+/// supported; every other encoding fails.
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo == 29) {
+    Inst.addOperand(MCOperand::createReg(Mips::HWR29));
+    return MCDisassembler::Success;
+  }
+  return MCDisassembler::Fail;
+}
+
+/// Append the AFGR64 register for \p RegNo to \p Inst. Only even encodings up
+/// to 30 are accepted; the register class is indexed by half the encoding.
+static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  const bool IsOdd = (RegNo & 1u) != 0;
+  if (RegNo > 30 || IsOdd)
+    return MCDisassembler::Fail;
+
+  const unsigned PairIdx = RegNo >> 1;
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::AFGR64RegClassID, PairIdx)));
+  return MCDisassembler::Success;
+}
+
+/// Append the ACC64DSP register selected by \p RegNo to \p Inst. Only
+/// encodings 0..3 are accepted.
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  if (RegNo > 3)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the HI32DSP register selected by \p RegNo to \p Inst. Only encodings
+/// 0..3 are accepted.
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 3)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::HI32DSPRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the LO32DSP register selected by \p RegNo to \p Inst. Only encodings
+/// 0..3 are accepted.
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 3)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::LO32DSPRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the MSA128B register selected by \p RegNo to \p Inst. Encodings
+/// above 31 are rejected.
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::MSA128BRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the MSA128H register selected by \p RegNo to \p Inst. Encodings
+/// above 31 are rejected.
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::MSA128HRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the MSA128W register selected by \p RegNo to \p Inst. Encodings
+/// above 31 are rejected.
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::MSA128WRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the MSA128D register selected by \p RegNo to \p Inst. Encodings
+/// above 31 are rejected.
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::MSA128DRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the MSACtrl register selected by \p RegNo to \p Inst. Encodings
+/// above 7 are rejected.
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 8)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::MSACtrlRegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the COP0 register selected by \p RegNo to \p Inst. Encodings above
+/// 31 are rejected.
+static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::COP0RegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Append the COP2 register selected by \p RegNo to \p Inst. Encodings above
+/// 31 are rejected.
+static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  if (RegNo >= 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::COP2RegClassID, RegNo)));
+  return MCDisassembler::Success;
+}
+
+/// Decode a branch target: the 16-bit field is sign-extended, multiplied by 4
+/// and biased by 4 before being appended as an immediate.
+static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset,
+                                       uint64_t Address, const void *Decoder) {
+  const int32_t Words = SignExtend32<16>(Offset);
+  const int32_t Target = Words * 4 + 4;
+  Inst.addOperand(MCOperand::createImm(Target));
+  return MCDisassembler::Success;
+}
+
+/// Decode a branch target: the 16-bit field is sign-extended and multiplied by
+/// 2 before being appended as an immediate.
+static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst, unsigned Offset,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  const int32_t Halfwords = SignExtend32<16>(Offset);
+  const int32_t Target = Halfwords * 2;
+  Inst.addOperand(MCOperand::createImm(Target));
+  return MCDisassembler::Success;
+}
+
+/// Decode a jump target: the low 26 bits of the encoding, shifted left by 2,
+/// are appended as an unsigned immediate.
+static DecodeStatus DecodeJumpTarget(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  const unsigned Index = fieldFromInstruction(Insn, 0, 26);
+  const unsigned Target = Index << 2;
+  Inst.addOperand(MCOperand::createImm(Target));
+  return MCDisassembler::Success;
+}
+
+/// Decode a branch target: the 21-bit field is sign-extended, multiplied by 4
+/// and biased by 4 before being appended as an immediate.
+static DecodeStatus DecodeBranchTarget21(MCInst &Inst, unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  const int32_t Words = SignExtend32<21>(Offset);
+  const int32_t Target = Words * 4 + 4;
+  Inst.addOperand(MCOperand::createImm(Target));
+  return MCDisassembler::Success;
+}
+
+/// Decode a microMIPS branch target: the 21-bit field is sign-extended,
+/// multiplied by 4 and biased by 4 before being appended as an immediate.
+static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst, unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  const int32_t Words = SignExtend32<21>(Offset);
+  const int32_t Target = Words * 4 + 4;
+  Inst.addOperand(MCOperand::createImm(Target));
+  return MCDisassembler::Success;
+}
+
+/// Decode a branch target: the 26-bit field is sign-extended, multiplied by 4
+/// and biased by 4 before being appended as an immediate.
+static DecodeStatus DecodeBranchTarget26(MCInst &Inst, unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  const int32_t Words = SignExtend32<26>(Offset);
+  const int32_t Target = Words * 4 + 4;
+  Inst.addOperand(MCOperand::createImm(Target));
+  return MCDisassembler::Success;
+}
+
+/// Decode a microMIPS branch target: the 7-bit field is sign-extended and
+/// doubled before being appended as an immediate.
+static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst, unsigned Offset,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  // Multiply rather than shift: the sign-extended value may be negative, and
+  // left-shifting a negative signed value is undefined behaviour before C++20.
+  int32_t BranchOffset = SignExtend32<7>(Offset) * 2;
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+/// Decode a microMIPS branch target: the 10-bit field is sign-extended and
+/// doubled before being appended as an immediate.
+static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst, unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // Multiply rather than shift: the sign-extended value may be negative, and
+  // left-shifting a negative signed value is undefined behaviour before C++20.
+  int32_t BranchOffset = SignExtend32<10>(Offset) * 2;
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+/// Decode a microMIPS branch target: the 16-bit field is sign-extended,
+/// multiplied by 2 and biased by 4 before being appended as an immediate.
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  const int32_t Halfwords = SignExtend32<16>(Offset);
+  const int32_t Target = Halfwords * 2 + 4;
+  Inst.addOperand(MCOperand::createImm(Target));
+  return MCDisassembler::Success;
+}
+
+/// Decode a microMIPS branch target: the 26-bit field is sign-extended and
+/// doubled before being appended as an immediate.
+static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // Multiply rather than shift: the sign-extended value may be negative, and
+  // left-shifting a negative signed value is undefined behaviour before C++20.
+  int32_t BranchOffset = SignExtend32<26>(Offset) * 2;
+
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+/// Decode a microMIPS jump target: the low 26 bits of the encoding, shifted
+/// left by 1, are appended as an unsigned immediate.
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address, const void *Decoder) {
+  const unsigned Index = fieldFromInstruction(Insn, 0, 26);
+  const unsigned Target = Index << 1;
+  Inst.addOperand(MCOperand::createImm(Target));
+  return MCDisassembler::Success;
+}
+
+/// Decode the encoded immediate: 0 maps to 1, 0x7 maps to -1, and every other
+/// value is shifted left by 2.
+static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, unsigned Value,
+                                       uint64_t Address, const void *Decoder) {
+  switch (Value) {
+  case 0:
+    Inst.addOperand(MCOperand::createImm(1));
+    break;
+  case 0x7:
+    Inst.addOperand(MCOperand::createImm(-1));
+    break;
+  default:
+    Inst.addOperand(MCOperand::createImm(Value << 2));
+    break;
+  }
+  return MCDisassembler::Success;
+}
+
+/// Appends the decoded immediate for \p Value: 0x7F decodes to -1, every
+/// other value is passed through unchanged.
+/// \p Address and \p Decoder are not used.
+static DecodeStatus DecodeLi16Imm(MCInst &Inst,
+                                  unsigned Value,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  // Both arms are widened explicitly so -1 is not converted to unsigned.
+  const int64_t Imm = (Value == 0x7F) ? int64_t(-1) : int64_t(Value);
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
+/// Appends \p Value as an immediate, except that an encoded 0 decodes to 8.
+/// \p Address and \p Decoder are not used.
+static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst,
+                                              unsigned Value,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  unsigned Decoded = Value;
+  if (Decoded == 0x0)
+    Decoded = 8;
+  Inst.addOperand(MCOperand::createImm(Decoded));
+  return MCDisassembler::Success;
+}
+
+/// Appends an immediate computed as (low \p Bits bits of \p Value) * Scale +
+/// Offset. All arithmetic is carried out in unsigned.
+/// \p Address and \p Decoder are not used.
+template <unsigned Bits, int Offset, int Scale>
+static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  const unsigned Field = Value & ((1 << Bits) - 1);
+  const unsigned Scaled = Field * Scale;
+  Inst.addOperand(MCOperand::createImm(Scaled + Offset));
+  return MCDisassembler::Success;
+}
+
+/// Appends an immediate computed as SignExtend32<Bits>(Value) * ScaleBy +
+/// Offset. \p Address and \p Decoder are not used.
+template <unsigned Bits, int Offset, int ScaleBy>
+static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  const int32_t Extended = SignExtend32<Bits>(Value);
+  const int32_t Scaled = Extended * ScaleBy;
+  Inst.addOperand(MCOperand::createImm(Scaled + Offset));
+  return MCDisassembler::Success;
+}
+
+/// Appends the size operand computed from \p Insn and the immediate already
+/// stored in operand 2 of \p Inst (the pos/lsb): size = Insn - pos + 1,
+/// passed through a 16-bit sign extension.
+/// \p Address and \p Decoder are not used.
+static DecodeStatus DecodeInsSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  // The pos(lsb) operand has to be read back from the MCInst first.
+  const int Lsb = Inst.getOperand(2).getImm();
+  const int Width = static_cast<int>(Insn) - Lsb + 1;
+  const int32_t SizeImm = SignExtend32<16>(Width);
+  Inst.addOperand(MCOperand::createImm(SizeImm));
+  return MCDisassembler::Success;
+}
+
+/// Appends the 19-bit sign-extended value of \p Insn multiplied by 4.
+/// \p Address and \p Decoder are not used.
+static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  const int32_t Extended = SignExtend32<19>(Insn);
+  Inst.addOperand(MCOperand::createImm(Extended * 4));
+  return MCDisassembler::Success;
+}
+
+/// Appends the 18-bit sign-extended value of \p Insn multiplied by 8.
+/// \p Address and \p Decoder are not used.
+static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  const int32_t Extended = SignExtend32<18>(Insn);
+  Inst.addOperand(MCOperand::createImm(Extended * 8));
+  return MCDisassembler::Success;
+}
+
+/// Appends the decoded immediate for \p Insn, multiplied by 4. Four encodings
+/// are remapped (0 -> 256, 1 -> 257, 510 -> -258, 511 -> -257); every other
+/// value is sign-extended from 9 bits. \p Address and \p Decoder are not used.
+static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  int32_t Decoded;
+  if (Insn == 0)
+    Decoded = 256;
+  else if (Insn == 1)
+    Decoded = 257;
+  else if (Insn == 510)
+    Decoded = -258;
+  else if (Insn == 511)
+    Decoded = -257;
+  else
+    Decoded = SignExtend32<9>(Insn);
+
+  Inst.addOperand(MCOperand::createImm(Decoded * 4));
+  return MCDisassembler::Success;
+}
+
+/// Appends the immediate selected by the encoded index \p Insn from a fixed
+/// 16-entry table. The index is only checked by an assert.
+/// \p Address and \p Decoder are not used.
+static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  // Encoded index -> immediate value.
+  static const int32_t ImmTable[] = {128, 1,  2,  3,  4,   7,     8,    15,
+                                     16,  31, 32, 63, 64,  255,   32768, 65535};
+  // Insn is unsigned, so only the upper bound needs checking.
+  assert(Insn < 16);
+  const int32_t Imm = ImmTable[Insn];
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
+/// Expands the 5-bit register-list field (bits [25:21] of \p Insn) into
+/// register operands. The low four bits give how many registers to take from
+/// {S0..S7, FP}; bit 4 additionally appends RA. Fails on an empty list and on
+/// counts above 9. \p Address and \p Decoder are not used.
+static DecodeStatus DecodeRegListOperand(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  const unsigned ListRegs[] = {Mips::S0, Mips::S1, Mips::S2,
+                               Mips::S3, Mips::S4, Mips::S5,
+                               Mips::S6, Mips::S7, Mips::FP};
+
+  const unsigned RegLst = fieldFromInstruction(Insn, 21, 5);
+
+  // Empty register lists are not allowed.
+  if (RegLst == 0)
+    return MCDisassembler::Fail;
+
+  // RegLst values 10-15, and 26-31 are reserved.
+  const unsigned Count = RegLst & 0xf;
+  if (Count > 9)
+    return MCDisassembler::Fail;
+
+  for (unsigned Idx = 0; Idx != Count; ++Idx)
+    Inst.addOperand(MCOperand::createReg(ListRegs[Idx]));
+
+  const bool WithRA = (RegLst & 0x10) != 0;
+  if (WithRA)
+    Inst.addOperand(MCOperand::createReg(Mips::RA));
+
+  return MCDisassembler::Success;
+}
+
+/// Expands a 2-bit register-list field into register operands: registers
+/// S0..S<n> (n = field value) followed by RA. The field sits at bits [9:8]
+/// for LWM16_MMR6/SWM16_MMR6 and at bits [5:4] for every other opcode.
+/// \p Address and \p Decoder are not used.
+static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  const unsigned ListRegs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3};
+
+  const unsigned Opcode = Inst.getOpcode();
+  const bool IsMMR6 =
+      Opcode == Mips::LWM16_MMR6 || Opcode == Mips::SWM16_MMR6;
+
+  unsigned RegLst;
+  if (IsMMR6)
+    RegLst = fieldFromInstruction(Insn, 8, 2);
+  else
+    RegLst = fieldFromInstruction(Insn, 4, 2);
+
+  // The field value is the index of the last register, so one more register
+  // than the field value is emitted.
+  const unsigned LastIdx = RegLst & 0x3;
+  for (unsigned Idx = 0; Idx != LastIdx + 1; ++Idx)
+    Inst.addOperand(MCOperand::createReg(ListRegs[Idx]));
+
+  Inst.addOperand(MCOperand::createReg(Mips::RA));
+
+  return MCDisassembler::Success;
+}
+
+/// Decodes the 3-bit register-pair field (bits [9:7] of \p Insn) into two
+/// register operands using a fixed table. \p Address and \p Decoder are not
+/// used.
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address, const void *Decoder) {
+  // Encoded value -> {first register, second register}.
+  static const unsigned RegPairs[8][2] = {
+      {Mips::A1, Mips::A2}, // 0
+      {Mips::A1, Mips::A3}, // 1
+      {Mips::A2, Mips::A3}, // 2
+      {Mips::A0, Mips::S5}, // 3
+      {Mips::A0, Mips::S6}, // 4
+      {Mips::A0, Mips::A1}, // 5
+      {Mips::A0, Mips::A2}, // 6
+      {Mips::A0, Mips::A3}, // 7
+  };
+
+  const unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
+
+  // Anything outside the table is rejected.
+  if (RegPair >= 8)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(RegPairs[RegPair][0]));
+  Inst.addOperand(MCOperand::createReg(RegPairs[RegPair][1]));
+
+  return MCDisassembler::Success;
+}
+
+/// Appends \p Insn shifted left by two and then sign-extended from 25 bits.
+/// \p Address and \p Decoder are not used.
+static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  const unsigned Shifted = Insn << 2;
+  const int32_t Imm = SignExtend32<25>(Shifted);
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the three instructions sharing this encoding, sets the
+/// opcode on \p MI and appends the register operand(s) followed by the
+/// branch immediate. Fails when rt is zero. \p Address is not used.
+template <typename InsnType>
+static DecodeStatus DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  // Encoding:
+  //    0b000111 ttttt sssss iiiiiiiiiiiiiiii
+  //      rt == 0              -> invalid
+  //      rs == 0 && rt != 0   -> BGTZALC_MMR6  (operands: rt, imm)
+  //      rs != 0 && rs == rt  -> BLTZALC_MMR6  (operands: rs, imm)
+  //      rs != 0 && rs != rt  -> BLTUC_MMR6    (operands: rs, rt, imm)
+
+  const InsnType Rt = fieldFromInstruction(insn, 21, 5);
+  const InsnType Rs = fieldFromInstruction(insn, 16, 5);
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+
+  // All three forms use the same sign-extended 16-bit field; only the scale
+  // factor differs.
+  const int64_t Disp = SignExtend64(fieldFromInstruction(insn, 0, 16), 16);
+
+  auto AddGPR32 = [&](InsnType RegNo) {
+    MI.addOperand(
+        MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RegNo)));
+  };
+
+  InsnType Imm = 0;
+  if (Rs == 0) {
+    MI.setOpcode(Mips::BGTZALC_MMR6);
+    AddGPR32(Rt);
+    Imm = Disp * 2 + 4;
+  } else if (Rs == Rt) {
+    MI.setOpcode(Mips::BLTZALC_MMR6);
+    AddGPR32(Rs);
+    Imm = Disp * 2 + 4;
+  } else {
+    MI.setOpcode(Mips::BLTUC_MMR6);
+    AddGPR32(Rs);
+    AddGPR32(Rt);
+    Imm = Disp * 4 + 4;
+  }
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+/// Disambiguates the three instructions sharing this encoding, sets the
+/// opcode on \p MI and appends the register operand(s) followed by the
+/// branch immediate. Fails when rt is zero. \p Address is not used.
+template <typename InsnType>
+static DecodeStatus DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  // Encoding:
+  //    0b000110 ttttt sssss iiiiiiiiiiiiiiii
+  //      rt == 0                          -> invalid
+  //      rs == 0 && rt != 0               -> BLEZALC_MMR6 (operands: rt, imm)
+  //      rs == rt && rt != 0              -> BGEZALC_MMR6 (operands: rt, imm)
+  //      rs != rt && rs != 0 && rt != 0   -> BGEUC_MMR6   (operands: rs, rt, imm)
+
+  const InsnType Rt = fieldFromInstruction(insn, 21, 5);
+  const InsnType Rs = fieldFromInstruction(insn, 16, 5);
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+
+  // All three forms use the same sign-extended 16-bit field; only the scale
+  // factor differs.
+  const int64_t Disp = SignExtend64(fieldFromInstruction(insn, 0, 16), 16);
+
+  auto AddGPR32 = [&](InsnType RegNo) {
+    MI.addOperand(
+        MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, RegNo)));
+  };
+
+  InsnType Imm = 0;
+  if (Rs == 0) {
+    MI.setOpcode(Mips::BLEZALC_MMR6);
+    Imm = Disp * 2 + 4;
+  } else if (Rs == Rt) {
+    MI.setOpcode(Mips::BGEZALC_MMR6);
+    Imm = Disp * 2 + 4;
+  } else {
+    MI.setOpcode(Mips::BGEUC_MMR6);
+    // Only the three-operand form carries rs.
+    AddGPR32(Rs);
+    Imm = Disp * 4 + 4;
+  }
+
+  // rt is an operand of every form.
+  AddGPR32(Rt);
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
new file mode 100644
index 000000000000..49c42fd1880c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -0,0 +1,294 @@
+//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Mips MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstPrinter.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MipsInstrInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define PRINT_ALIAS_INSTR
+#include "MipsGenAsmWriter.inc"
+
+/// Returns true when operand \p OpNo of \p MI is the register \p R.
+/// The operand must be a register operand (checked by an assert).
+template<unsigned R>
+static bool isReg(const MCInst &MI, unsigned OpNo) {
+  const MCOperand &Operand = MI.getOperand(OpNo);
+  assert(Operand.isReg() && "Register operand expected.");
+  return Operand.getReg() == R;
+}
+
+/// Returns the mnemonic suffix for the condition code \p CC. Each mnemonic is
+/// shared by two condition codes. Reaching the end without a match is treated
+/// as unreachable.
+const char* Mips::MipsFCCToString(Mips::CondCode CC) {
+  const char *Mnemonic = nullptr;
+  switch (CC) {
+  case FCOND_F:    case FCOND_T:   Mnemonic = "f";    break;
+  case FCOND_UN:   case FCOND_OR:  Mnemonic = "un";   break;
+  case FCOND_OEQ:  case FCOND_UNE: Mnemonic = "eq";   break;
+  case FCOND_UEQ:  case FCOND_ONE: Mnemonic = "ueq";  break;
+  case FCOND_OLT:  case FCOND_UGE: Mnemonic = "olt";  break;
+  case FCOND_ULT:  case FCOND_OGE: Mnemonic = "ult";  break;
+  case FCOND_OLE:  case FCOND_UGT: Mnemonic = "ole";  break;
+  case FCOND_ULE:  case FCOND_OGT: Mnemonic = "ule";  break;
+  case FCOND_SF:   case FCOND_ST:  Mnemonic = "sf";   break;
+  case FCOND_NGLE: case FCOND_GLE: Mnemonic = "ngle"; break;
+  case FCOND_SEQ:  case FCOND_SNE: Mnemonic = "seq";  break;
+  case FCOND_NGL:  case FCOND_GL:  Mnemonic = "ngl";  break;
+  case FCOND_LT:   case FCOND_NLT: Mnemonic = "lt";   break;
+  case FCOND_NGE:  case FCOND_GE:  Mnemonic = "nge";  break;
+  case FCOND_LE:   case FCOND_NLE: Mnemonic = "le";   break;
+  case FCOND_NGT:  case FCOND_GT:  Mnemonic = "ngt";  break;
+  }
+  if (Mnemonic)
+    return Mnemonic;
+  llvm_unreachable("Impossible condition code!");
+}
+
+/// Writes '$' followed by the lower-cased name of register \p RegNo to \p OS.
+void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  const StringRef Name(getRegisterName(RegNo));
+  OS << '$';
+  OS << Name.lower();
+}
+
+/// Prints \p MI to \p O.
+///
+/// The four save/restore opcodes are printed directly via printSaveRestore()
+/// and return early (no alias lookup, no annotation). RDHWR/RDHWR64 are
+/// wrapped in ".set push / .set mips32r2 ... .set pop". Everything else goes
+/// through the alias printers first and falls back to printInstruction(),
+/// followed by the annotation \p Annot. \p STI is not used.
+void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                StringRef Annot, const MCSubtargetInfo &STI) {
+  const unsigned Opcode = MI->getOpcode();
+
+  switch (Opcode) {
+  case Mips::Save16:
+  case Mips::SaveX16:
+  case Mips::Restore16:
+  case Mips::RestoreX16: {
+    const bool IsSave = Opcode == Mips::Save16 || Opcode == Mips::SaveX16;
+    const bool Is16Bit = Opcode == Mips::Save16 || Opcode == Mips::Restore16;
+    O << (IsSave ? "\tsave\t" : "\trestore\t");
+    printSaveRestore(MI, O);
+    O << (Is16Bit ? " # 16 bit inst\n" : "\n");
+    return;
+  }
+  default:
+    break;
+  }
+
+  const bool IsRDHWR = Opcode == Mips::RDHWR || Opcode == Mips::RDHWR64;
+  if (IsRDHWR) {
+    O << "\t.set\tpush\n";
+    O << "\t.set\tmips32r2\n";
+  }
+
+  // Try to print any aliases first.
+  const bool PrintedAlias = printAliasInstr(MI, O) || printAlias(*MI, O);
+  if (!PrintedAlias)
+    printInstruction(MI, O);
+  printAnnotation(O, Annot);
+
+  if (IsRDHWR)
+    O << "\n\t.set\tpop";
+}
+
+/// Prints operand \p OpNo of \p MI: a register via printRegName(), an
+/// immediate via formatImm(), and anything else as an expression (asserted).
+void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  const MCOperand &Operand = MI->getOperand(OpNo);
+
+  if (Operand.isReg())
+    return printRegName(O, Operand.getReg());
+
+  if (Operand.isImm()) {
+    O << formatImm(Operand.getImm());
+    return;
+  }
+
+  assert(Operand.isExpr() && "unknown operand kind in printOperand");
+  Operand.getExpr()->print(O, &MAI, true);
+}
+
+/// Prints operand \p opNum of \p MI as an unsigned immediate.
+///
+/// For an immediate operand the value is rebased by \p Offset, truncated to
+/// the low \p Bits bits, and the offset is added back before formatting.
+/// Non-immediate operands are forwarded to printOperand().
+template <unsigned Bits, unsigned Offset>
+void MipsInstPrinter::printUImm(const MCInst *MI, int opNum, raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm()) {
+    // Build the mask in 64-bit arithmetic: "1 << Bits" in plain int overflows
+    // (undefined behaviour) once Bits reaches 31. Bits must stay below 64.
+    const uint64_t Mask = (uint64_t(1) << Bits) - 1;
+    uint64_t Imm = MO.getImm();
+    Imm -= Offset;
+    Imm &= Mask;
+    Imm += Offset;
+    O << formatImm(Imm);
+    return;
+  }
+
+  printOperand(MI, opNum, O);
+}
+
+/// Prints a load/store memory operand as "offset(base)", where the base is
+/// operand \p opNum and the offset is operand opNum + 1.
+void MipsInstPrinter::
+printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  // Load/Store memory operands -- imm($reg)
+  // If PIC target the target is loaded as the
+  // pattern lw $25,%call16($28)
+
+  // opNum can be invalid if the instruction had a reglist as operand.
+  // The memory operand is always the last operand of the instruction
+  // (base + offset), so recompute its position for those opcodes.
+  const unsigned Opcode = MI->getOpcode();
+  const bool HasRegList =
+      Opcode == Mips::SWM32_MM || Opcode == Mips::LWM32_MM ||
+      Opcode == Mips::SWM16_MM || Opcode == Mips::SWM16_MMR6 ||
+      Opcode == Mips::LWM16_MM || Opcode == Mips::LWM16_MMR6;
+  if (HasRegList)
+    opNum = MI->getNumOperands() - 2;
+
+  const int BaseIdx = opNum;
+  const int OffsetIdx = opNum + 1;
+
+  printOperand(MI, OffsetIdx, O);
+  O << "(";
+  printOperand(MI, BaseIdx, O);
+  O << ")";
+}
+
+/// Prints operand \p opNum and operand opNum + 1 separated by ", ".
+void MipsInstPrinter::
+printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O) {
+  // Stack locations used by non load/store instructions are printed the same
+  // way as the operands of a normal 3 operand instruction.
+  const int FirstIdx = opNum;
+  const int SecondIdx = opNum + 1;
+  printOperand(MI, FirstIdx, O);
+  O << ", ";
+  printOperand(MI, SecondIdx, O);
+}
+
+/// Prints the mnemonic of the condition code stored as the immediate of
+/// operand \p opNum.
+void MipsInstPrinter::
+printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  const MCOperand &CondOp = MI->getOperand(opNum);
+  const Mips::CondCode CC = static_cast<Mips::CondCode>(CondOp.getImm());
+  O << MipsFCCToString(CC);
+}
+
+/// Prints the name of the register held in operand \p opNum.
+void MipsInstPrinter::
+printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O) {
+  const unsigned Reg = MI->getOperand(opNum).getReg();
+  printRegName(O, Reg);
+}
+
+void MipsInstPrinter::
+printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
+ llvm_unreachable("TODO"); // Not implemented: calling this printer is treated as unreachable; no argument is used.
+}
+
+/// Prints "\t<Str>\t" followed by operand \p OpNo of \p MI.
+/// Always returns true so it can terminate a short-circuit chain.
+bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
+                                 unsigned OpNo, raw_ostream &OS) {
+  OS << "\t";
+  OS << Str;
+  OS << "\t";
+  printOperand(&MI, OpNo, OS);
+  return true;
+}
+
+/// Prints "\t<Str>\t" followed by operands \p OpNo0 and \p OpNo1 of \p MI,
+/// separated by ", ". Always returns true.
+bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
+                                 unsigned OpNo0, unsigned OpNo1,
+                                 raw_ostream &OS) {
+  // Mnemonic and first operand.
+  OS << "\t" << Str << "\t";
+  printOperand(&MI, OpNo0, OS);
+  // Second operand.
+  OS << ", ";
+  printOperand(&MI, OpNo1, OS);
+  return true;
+}
+
+/// Tries to print \p MI using a shorter alias mnemonic. Returns true when an
+/// alias was printed and false when the instruction must be printed normally.
+bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
+  switch (MI.getOpcode()) {
+  case Mips::BEQ:
+  case Mips::BEQ_MM:
+    // beq $zero, $zero, $L2 => b $L2
+    if (isReg<Mips::ZERO>(MI, 0) && isReg<Mips::ZERO>(MI, 1))
+      return printAlias("b", MI, 2, OS);
+    // beq $r0, $zero, $L2 => beqz $r0, $L2
+    if (isReg<Mips::ZERO>(MI, 1))
+      return printAlias("beqz", MI, 0, 2, OS);
+    return false;
+  case Mips::BEQ64:
+    // beq $r0, $zero, $L2 => beqz $r0, $L2
+    if (!isReg<Mips::ZERO_64>(MI, 1))
+      return false;
+    return printAlias("beqz", MI, 0, 2, OS);
+  case Mips::BNE:
+  case Mips::BNE_MM:
+    // bne $r0, $zero, $L2 => bnez $r0, $L2
+    if (!isReg<Mips::ZERO>(MI, 1))
+      return false;
+    return printAlias("bnez", MI, 0, 2, OS);
+  case Mips::BNE64:
+    // bne $r0, $zero, $L2 => bnez $r0, $L2
+    if (!isReg<Mips::ZERO_64>(MI, 1))
+      return false;
+    return printAlias("bnez", MI, 0, 2, OS);
+  case Mips::BGEZAL:
+    // bgezal $zero, $L1 => bal $L1
+    if (!isReg<Mips::ZERO>(MI, 0))
+      return false;
+    return printAlias("bal", MI, 1, OS);
+  case Mips::BC1T:
+    // bc1t $fcc0, $L1 => bc1t $L1
+    if (!isReg<Mips::FCC0>(MI, 0))
+      return false;
+    return printAlias("bc1t", MI, 1, OS);
+  case Mips::BC1F:
+    // bc1f $fcc0, $L1 => bc1f $L1
+    if (!isReg<Mips::FCC0>(MI, 0))
+      return false;
+    return printAlias("bc1f", MI, 1, OS);
+  case Mips::JALR:
+    // jalr $ra, $r1 => jalr $r1
+    if (!isReg<Mips::RA>(MI, 0))
+      return false;
+    return printAlias("jalr", MI, 1, OS);
+  case Mips::JALR64:
+    // jalr $ra, $r1 => jalr $r1
+    if (!isReg<Mips::RA_64>(MI, 0))
+      return false;
+    return printAlias("jalr", MI, 1, OS);
+  case Mips::NOR:
+  case Mips::NOR_MM:
+  case Mips::NOR_MMR6:
+    // nor $r0, $r1, $zero => not $r0, $r1
+    if (!isReg<Mips::ZERO>(MI, 2))
+      return false;
+    return printAlias("not", MI, 0, 1, OS);
+  case Mips::NOR64:
+    // nor $r0, $r1, $zero => not $r0, $r1
+    if (!isReg<Mips::ZERO_64>(MI, 2))
+      return false;
+    return printAlias("not", MI, 0, 1, OS);
+  case Mips::OR:
+    // or $r0, $r1, $zero => move $r0, $r1
+    if (!isReg<Mips::ZERO>(MI, 2))
+      return false;
+    return printAlias("move", MI, 0, 1, OS);
+  default:
+    return false;
+  }
+}
+
+/// Prints every operand of \p MI separated by ", ": registers by name, all
+/// other operands through printUImm<16>().
+void MipsInstPrinter::printSaveRestore(const MCInst *MI, raw_ostream &O) {
+  const unsigned NumOps = MI->getNumOperands();
+  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
+    if (Idx != 0)
+      O << ", ";
+
+    const MCOperand &Op = MI->getOperand(Idx);
+    if (Op.isReg()) {
+      printRegName(O, Op.getReg());
+      continue;
+    }
+    printUImm<16>(MI, Idx, O);
+  }
+}
+
+/// Prints the registers in operands [opNum, NumOperands - 2) of \p MI,
+/// separated by ", ".
+void MipsInstPrinter::
+printRegisterList(const MCInst *MI, int opNum, raw_ostream &O) {
+  // The last two operands are excluded: the register list is always the
+  // first operand of the instruction and is always followed by the memory
+  // operand (base + offset).
+  const int End = MI->getNumOperands() - 2;
+  for (int Idx = opNum; Idx != End; ++Idx) {
+    const bool IsFirst = Idx == opNum;
+    if (!IsFirst)
+      O << ", ";
+    printRegName(O, MI->getOperand(Idx).getReg());
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
new file mode 100644
index 000000000000..4a76b5acac79
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -0,0 +1,114 @@
+//=== MipsInstPrinter.h - Convert Mips MCInst to assembly syntax -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Mips MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H
+#define LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+// These enumeration declarations were originally in MipsInstrInfo.h but
+// had to be moved here to avoid circular dependencies between
+// LLVMMipsCodeGen and LLVMMipsAsmPrinter.
+namespace Mips {
+// Mips Branch Codes. Enumerators take consecutive values starting at 0.
+enum FPBranchCode {
+ BRANCH_F, // = 0
+ BRANCH_T, // = 1
+ BRANCH_FL, // = 2
+ BRANCH_TL, // = 3
+ BRANCH_INVALID // = 4; presumably a sentinel for "no valid branch code" -- confirm at use sites
+};
+
+// Mips Condition Codes. Two groups of 16 consecutive values (0-15 and 16-31).
+enum CondCode {
+ // To be used with float branch True
+ FCOND_F,
+ FCOND_UN,
+ FCOND_OEQ,
+ FCOND_UEQ,
+ FCOND_OLT,
+ FCOND_ULT,
+ FCOND_OLE,
+ FCOND_ULE,
+ FCOND_SF,
+ FCOND_NGLE,
+ FCOND_SEQ,
+ FCOND_NGL,
+ FCOND_LT,
+ FCOND_NGE,
+ FCOND_LE,
+ FCOND_NGT,
+
+ // To be used with float branch False
+ // These conditions have the same mnemonics as the
+ // ones above, but are used with a branch False.
+ FCOND_T,
+ FCOND_OR,
+ FCOND_UNE,
+ FCOND_ONE,
+ FCOND_UGE,
+ FCOND_OGE,
+ FCOND_UGT,
+ FCOND_OGT,
+ FCOND_ST,
+ FCOND_GLE,
+ FCOND_SNE,
+ FCOND_GL,
+ FCOND_NLT,
+ FCOND_GE,
+ FCOND_NLE,
+ FCOND_GT
+};
+
+const char *MipsFCCToString(Mips::CondCode CC);
+} // end namespace Mips
+
+class MipsInstPrinter : public MCInstPrinter { // Prints Mips MCInsts as assembly text.
+public:
+ MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {} // Only forwards its arguments to the base class.
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override; // Writes '$' plus the lower-cased register name.
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override; // Entry point: tries aliases first, then printInstruction().
+
+ bool printAliasInstr(const MCInst *MI, raw_ostream &OS); // Not defined in MipsInstPrinter.cpp; presumably tblgen-generated (PRINT_ALIAS_INSTR).
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
+
+private:
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); // Register, immediate or expression operand.
+ template <unsigned Bits, unsigned Offset = 0>
+ void printUImm(const MCInst *MI, int opNum, raw_ostream &O); // Immediate truncated to Bits bits relative to Offset.
+ void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O); // Prints "offset(base)".
+ void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O); // Prints operand opNum, ", ", operand opNum + 1.
+ void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O); // Prints the MipsFCCToString() mnemonic of the operand.
+ void printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O); // Prints the register name of operand opNum.
+ void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O); // Unimplemented (llvm_unreachable).
+
+ bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
+ raw_ostream &OS); // Mnemonic Str plus one operand; returns true.
+ bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo0,
+ unsigned OpNo1, raw_ostream &OS); // Mnemonic Str plus two operands; returns true.
+ bool printAlias(const MCInst &MI, raw_ostream &OS); // Hand-written alias table; false when no alias applies.
+ void printSaveRestore(const MCInst *MI, raw_ostream &O); // Comma-separated operand list for save/restore.
+ void printRegisterList(const MCInst *MI, int opNum, raw_ostream &O); // Registers from opNum up to the last two operands.
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
new file mode 100644
index 000000000000..932d38a0b9fe
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -0,0 +1,69 @@
+//===-- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsABIFlagsSection.h"
+
+using namespace llvm;
+
+/// Maps the stored FpABI kind to the Val_GNU_MIPS_ABI_FP_* value written to
+/// the fp_abi field. For S64 the result depends on Is32BitABI and OddSPReg.
+uint8_t MipsABIFlagsSection::getFpABIValue() {
+  if (FpABI == FpABIKind::ANY)
+    return Mips::Val_GNU_MIPS_ABI_FP_ANY;
+  if (FpABI == FpABIKind::SOFT)
+    return Mips::Val_GNU_MIPS_ABI_FP_SOFT;
+  if (FpABI == FpABIKind::XX)
+    return Mips::Val_GNU_MIPS_ABI_FP_XX;
+  if (FpABI == FpABIKind::S32)
+    return Mips::Val_GNU_MIPS_ABI_FP_DOUBLE;
+  if (FpABI == FpABIKind::S64) {
+    // Outside a 32-bit ABI, S64 is reported as plain "double".
+    if (!Is32BitABI)
+      return Mips::Val_GNU_MIPS_ABI_FP_DOUBLE;
+    if (OddSPReg)
+      return Mips::Val_GNU_MIPS_ABI_FP_64;
+    return Mips::Val_GNU_MIPS_ABI_FP_64A;
+  }
+
+  llvm_unreachable("unexpected fp abi value");
+}
+
+/// Returns the textual name of \p Value ("xx", "32" or "64"). Any other kind
+/// is treated as unreachable.
+StringRef MipsABIFlagsSection::getFpABIString(FpABIKind Value) {
+  if (Value == FpABIKind::XX)
+    return "xx";
+  if (Value == FpABIKind::S32)
+    return "32";
+  if (Value == FpABIKind::S64)
+    return "64";
+  llvm_unreachable("unsupported fp abi value");
+}
+
+/// Returns the cpr1_size value: AFL_REG_32 when the FP ABI is XX, otherwise
+/// the stored CPR1Size.
+uint8_t MipsABIFlagsSection::getCPR1SizeValue() {
+  const Mips::AFL_REG Size =
+      (FpABI == FpABIKind::XX) ? Mips::AFL_REG_32 : CPR1Size;
+  return static_cast<uint8_t>(Size);
+}
+
+namespace llvm {
+/// Streams the fields of \p ABIFlagsSection to \p OS in struct order, each
+/// with its fixed byte width, and returns \p OS.
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
+  MipsABIFlagsSection &Flags = ABIFlagsSection;
+  auto Emit = [&OS](uint64_t FieldValue, unsigned NumBytes) {
+    OS.EmitIntValue(FieldValue, NumBytes);
+  };
+
+  // Write out an Elf_Internal_ABIFlags_v0 struct
+  Emit(Flags.getVersionValue(), 2);      // version
+  Emit(Flags.getISALevelValue(), 1);     // isa_level
+  Emit(Flags.getISARevisionValue(), 1);  // isa_rev
+  Emit(Flags.getGPRSizeValue(), 1);      // gpr_size
+  Emit(Flags.getCPR1SizeValue(), 1);     // cpr1_size
+  Emit(Flags.getCPR2SizeValue(), 1);     // cpr2_size
+  Emit(Flags.getFpABIValue(), 1);        // fp_abi
+  Emit(Flags.getISAExtensionValue(), 4); // isa_ext
+  Emit(Flags.getASESetValue(), 4);       // ases
+  Emit(Flags.getFlags1Value(), 4);       // flags1
+  Emit(Flags.getFlags2Value(), 4);       // flags2
+  return OS;
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
new file mode 100644
index 000000000000..3966cae9fe33
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -0,0 +1,200 @@
+//===-- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MipsABIFlags.h"
+
+namespace llvm {
+
+class MCStreamer;
+
+struct MipsABIFlagsSection {
+ // Internal representation of the fp_abi related values used in .module.
+ enum class FpABIKind { ANY, XX, S32, S64, SOFT };
+
+ // Version of flags structure.
+ uint16_t Version;
+ // The level of the ISA: 1-5, 32, 64.
+ uint8_t ISALevel;
+ // The revision of ISA: 0 for MIPS V and below, 1-n otherwise.
+ uint8_t ISARevision;
+ // The size of general purpose registers.
+ Mips::AFL_REG GPRSize;
+ // The size of co-processor 1 registers.
+ Mips::AFL_REG CPR1Size;
+ // The size of co-processor 2 registers.
+ Mips::AFL_REG CPR2Size;
+ // Processor-specific extension.
+ Mips::AFL_EXT ISAExtension;
+ // Mask of ASEs used.
+ uint32_t ASESet;
+
+ bool OddSPReg; // Set from P.useOddSPReg(); reported as AFL_FLAGS1_ODDSPREG by getFlags1Value().
+
+ bool Is32BitABI; // Set by setFpABI() or from P.isABI_O32() in setFpAbiFromPredicates().
+
+protected:
+ // The floating-point ABI.
+ FpABIKind FpABI;
+
+public:
+ MipsABIFlagsSection()
+ : Version(0), ISALevel(0), ISARevision(0), GPRSize(Mips::AFL_REG_NONE),
+ CPR1Size(Mips::AFL_REG_NONE), CPR2Size(Mips::AFL_REG_NONE),
+ ISAExtension(Mips::AFL_EXT_NONE), ASESet(0), OddSPReg(false),
+ Is32BitABI(false), FpABI(FpABIKind::ANY) {} // Everything zero / NONE / false; FP ABI starts as ANY.
+
+ uint16_t getVersionValue() { return (uint16_t)Version; }
+ uint8_t getISALevelValue() { return (uint8_t)ISALevel; }
+ uint8_t getISARevisionValue() { return (uint8_t)ISARevision; }
+ uint8_t getGPRSizeValue() { return (uint8_t)GPRSize; }
+ uint8_t getCPR1SizeValue(); // Declared here only; not a plain cast of CPR1Size like its neighbours.
+ uint8_t getCPR2SizeValue() { return (uint8_t)CPR2Size; }
+ uint8_t getFpABIValue(); // Declared here only.
+ uint32_t getISAExtensionValue() { return (uint32_t)ISAExtension; }
+ uint32_t getASESetValue() { return (uint32_t)ASESet; }
+
+ uint32_t getFlags1Value() { // The only flag set here is AFL_FLAGS1_ODDSPREG.
+ uint32_t Value = 0;
+
+ if (OddSPReg)
+ Value |= (uint32_t)Mips::AFL_FLAGS1_ODDSPREG;
+
+ return Value;
+ }
+
+ uint32_t getFlags2Value() { return 0; } // Always zero.
+
+ FpABIKind getFpABI() { return FpABI; }
+ void setFpABI(FpABIKind Value, bool IsABI32Bit) { // Updates FpABI and Is32BitABI together.
+ FpABI = Value;
+ Is32BitABI = IsABI32Bit;
+ }
+ StringRef getFpABIString(FpABIKind Value);
+
+ template <class PredicateLibrary>
+ void setISALevelAndRevisionFromPredicates(const PredicateLibrary &P) { // Checks 64-bit ISAs first, then 32-bit, then MIPS I-V; the newest revision wins.
+ if (P.hasMips64()) {
+ ISALevel = 64;
+ if (P.hasMips64r6())
+ ISARevision = 6;
+ else if (P.hasMips64r5())
+ ISARevision = 5;
+ else if (P.hasMips64r3())
+ ISARevision = 3;
+ else if (P.hasMips64r2())
+ ISARevision = 2;
+ else
+ ISARevision = 1;
+ } else if (P.hasMips32()) {
+ ISALevel = 32;
+ if (P.hasMips32r6())
+ ISARevision = 6;
+ else if (P.hasMips32r5())
+ ISARevision = 5;
+ else if (P.hasMips32r3())
+ ISARevision = 3;
+ else if (P.hasMips32r2())
+ ISARevision = 2;
+ else
+ ISARevision = 1;
+ } else {
+ ISARevision = 0;
+ if (P.hasMips5())
+ ISALevel = 5;
+ else if (P.hasMips4())
+ ISALevel = 4;
+ else if (P.hasMips3())
+ ISALevel = 3;
+ else if (P.hasMips2())
+ ISALevel = 2;
+ else if (P.hasMips1())
+ ISALevel = 1;
+ else
+ llvm_unreachable("Unknown ISA level!");
+ }
+ }
+
+ template <class PredicateLibrary>
+ void setGPRSizeFromPredicates(const PredicateLibrary &P) { // AFL_REG_64 when P.isGP64bit(), else AFL_REG_32.
+ GPRSize = P.isGP64bit() ? Mips::AFL_REG_64 : Mips::AFL_REG_32;
+ }
+
+ template <class PredicateLibrary>
+ void setCPR1SizeFromPredicates(const PredicateLibrary &P) { // Soft-float wins over MSA, which wins over the FP register width.
+ if (P.useSoftFloat())
+ CPR1Size = Mips::AFL_REG_NONE;
+ else if (P.hasMSA())
+ CPR1Size = Mips::AFL_REG_128;
+ else
+ CPR1Size = P.isFP64bit() ? Mips::AFL_REG_64 : Mips::AFL_REG_32;
+ }
+
+ template <class PredicateLibrary>
+ void setISAExtensionFromPredicates(const PredicateLibrary &P) { // Only the cnMIPS (Octeon) extension is recognised here.
+ if (P.hasCnMips())
+ ISAExtension = Mips::AFL_EXT_OCTEON;
+ else
+ ISAExtension = Mips::AFL_EXT_NONE;
+ }
+
+ template <class PredicateLibrary>
+ void setASESetFromPredicates(const PredicateLibrary &P) { // Rebuilds the mask from scratch; the bits are independent.
+ ASESet = 0;
+ if (P.hasDSP())
+ ASESet |= Mips::AFL_ASE_DSP;
+ if (P.hasDSPR2())
+ ASESet |= Mips::AFL_ASE_DSPR2;
+ if (P.hasMSA())
+ ASESet |= Mips::AFL_ASE_MSA;
+ if (P.inMicroMipsMode())
+ ASESet |= Mips::AFL_ASE_MICROMIPS;
+ if (P.inMips16Mode())
+ ASESet |= Mips::AFL_ASE_MIPS16;
+ }
+
+ template <class PredicateLibrary>
+ void setFpAbiFromPredicates(const PredicateLibrary &P) { // Also refreshes Is32BitABI; FpABI stays ANY if no branch below matches.
+ Is32BitABI = P.isABI_O32();
+
+ FpABI = FpABIKind::ANY;
+ if (P.useSoftFloat())
+ FpABI = FpABIKind::SOFT;
+ else if (P.isABI_N32() || P.isABI_N64())
+ FpABI = FpABIKind::S64;
+ else if (P.isABI_O32()) {
+ if (P.isABI_FPXX())
+ FpABI = FpABIKind::XX;
+ else if (P.isFP64bit())
+ FpABI = FpABIKind::S64;
+ else
+ FpABI = FpABIKind::S32;
+ }
+ }
+
+ template <class PredicateLibrary>
+ void setAllFromPredicates(const PredicateLibrary &P) { // Runs every setter above and copies P.useOddSPReg(); Version and CPR2Size are left untouched.
+ setISALevelAndRevisionFromPredicates(P);
+ setGPRSizeFromPredicates(P);
+ setCPR1SizeFromPredicates(P);
+ setISAExtensionFromPredicates(P);
+ setASESetFromPredicates(P);
+ setFpAbiFromPredicates(P);
+ OddSPReg = P.useOddSPReg();
+ }
+};
+
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
new file mode 100644
index 000000000000..498ea6fda4b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -0,0 +1,119 @@
+//===---- MipsABIInfo.cpp - Information about MIPS ABI's ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsABIInfo.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCTargetOptions.h"
+
+using namespace llvm;
+
+namespace {
+// Integer registers returned by GetByValArgRegs()/GetVarArgRegs() for O32.
+const MCPhysReg O32IntRegs[4] = {
+    Mips::A0, Mips::A1, Mips::A2, Mips::A3
+};
+
+// Integer registers returned by GetByValArgRegs()/GetVarArgRegs() for N32/N64.
+const MCPhysReg Mips64IntRegs[8] = {
+    Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
+    Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64
+};
+} // end anonymous namespace
+
+/// Returns the registers used for byval arguments: the O32 table for O32,
+/// the 64-bit table for N32/N64. Any other ABI is treated as unreachable.
+ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const {
+  if (!IsO32()) {
+    if (!(IsN32() || IsN64()))
+      llvm_unreachable("Unhandled ABI");
+    return makeArrayRef(Mips64IntRegs);
+  }
+  return makeArrayRef(O32IntRegs);
+}
+
+/// Returns the registers used for the variable argument list: the O32 table
+/// for O32, the 64-bit table for N32/N64. Any other ABI is treated as
+/// unreachable.
+ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const {
+  if (!IsO32()) {
+    if (!(IsN32() || IsN64()))
+      llvm_unreachable("Unhandled ABI");
+    return makeArrayRef(Mips64IntRegs);
+  }
+  return makeArrayRef(O32IntRegs);
+}
+
+/// Returns the size of the callee-allocated argument area: 16 bytes for O32
+/// (0 with CallingConv::Fast) and 0 for N32/N64. Any other ABI is treated as
+/// unreachable.
+unsigned MipsABIInfo::GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const {
+  if (IsO32()) {
+    if (CC == CallingConv::Fast)
+      return 0;
+    return 16;
+  }
+  if (!IsN32() && !IsN64())
+    llvm_unreachable("Unhandled ABI");
+  return 0;
+}
+
+/// Picks the ABI from the ABI name in \p Options (prefix match on "o32",
+/// "n32", "n64"). With an empty name the choice falls back to the triple:
+/// N64 for mips64/mips64el, O32 otherwise. \p CPU is not used.
+MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU,
+                                          const MCTargetOptions &Options) {
+  const StringRef ABIName = Options.getABIName();
+
+  if (ABIName.startswith("o32"))
+    return MipsABIInfo::O32();
+  if (ABIName.startswith("n32"))
+    return MipsABIInfo::N32();
+  if (ABIName.startswith("n64"))
+    return MipsABIInfo::N64();
+  assert(Options.getABIName().empty() && "Unknown ABI option for MIPS");
+
+  const Triple::ArchType Arch = TT.getArch();
+  const bool Is64BitArch = Arch == Triple::mips64 || Arch == Triple::mips64el;
+  if (Is64BitArch)
+    return MipsABIInfo::N64();
+  return MipsABIInfo::O32();
+}
+
+/// Returns Mips::SP_64 when ArePtrs64bit() holds, Mips::SP otherwise.
+unsigned MipsABIInfo::GetStackPtr() const {
+  if (ArePtrs64bit())
+    return Mips::SP_64;
+  return Mips::SP;
+}
+
+/// Returns Mips::FP_64 when ArePtrs64bit() holds, Mips::FP otherwise.
+unsigned MipsABIInfo::GetFramePtr() const {
+  if (ArePtrs64bit())
+    return Mips::FP_64;
+  return Mips::FP;
+}
+
+/// Returns Mips::S7_64 when ArePtrs64bit() holds, Mips::S7 otherwise.
+unsigned MipsABIInfo::GetBasePtr() const {
+  if (ArePtrs64bit())
+    return Mips::S7_64;
+  return Mips::S7;
+}
+
+/// Returns Mips::GP_64 when ArePtrs64bit() holds, Mips::GP otherwise.
+unsigned MipsABIInfo::GetGlobalPtr() const {
+  if (ArePtrs64bit())
+    return Mips::GP_64;
+  return Mips::GP;
+}
+
+/// Returns Mips::ZERO_64 when ArePtrs64bit() holds, Mips::ZERO otherwise.
+unsigned MipsABIInfo::GetNullPtr() const {
+  if (ArePtrs64bit())
+    return Mips::ZERO_64;
+  return Mips::ZERO;
+}
+
+/// Returns Mips::ZERO_64 when AreGprs64bit() holds, Mips::ZERO otherwise.
+/// Unlike GetNullPtr(), this keys on the GPR width rather than pointer width.
+unsigned MipsABIInfo::GetZeroReg() const {
+  if (AreGprs64bit())
+    return Mips::ZERO_64;
+  return Mips::ZERO;
+}
+
+/// Returns Mips::DADDu when ArePtrs64bit() holds, Mips::ADDu otherwise.
+unsigned MipsABIInfo::GetPtrAdduOp() const {
+  if (ArePtrs64bit())
+    return Mips::DADDu;
+  return Mips::ADDu;
+}
+
+/// Returns Mips::DADDiu when ArePtrs64bit() holds, Mips::ADDiu otherwise.
+unsigned MipsABIInfo::GetPtrAddiuOp() const {
+  if (ArePtrs64bit())
+    return Mips::DADDiu;
+  return Mips::ADDiu;
+}
+
+/// Returns Mips::DSUBu when ArePtrs64bit() holds, Mips::SUBu otherwise.
+unsigned MipsABIInfo::GetPtrSubuOp() const {
+  if (ArePtrs64bit())
+    return Mips::DSUBu;
+  return Mips::SUBu;
+}
+
+/// Returns Mips::AND64 when ArePtrs64bit() holds, Mips::AND otherwise.
+unsigned MipsABIInfo::GetPtrAndOp() const {
+  if (ArePtrs64bit())
+    return Mips::AND64;
+  return Mips::AND;
+}
+
+/// Returns Mips::OR64 when ArePtrs64bit() holds, Mips::OR otherwise.
+/// NOTE(review): the selection keys on pointer width, not GPR width.
+unsigned MipsABIInfo::GetGPRMoveOp() const {
+  if (ArePtrs64bit())
+    return Mips::OR64;
+  return Mips::OR;
+}
+
+/// Returns the \p I-th exception-handling data register: the 64-bit register
+/// names for N64, the 32-bit names for every other ABI.
+///
+/// \param I Index into the four-entry register tables; must be below 4.
+unsigned MipsABIInfo::GetEhDataReg(unsigned I) const {
+  static const unsigned EhDataReg[] = {
+    Mips::A0, Mips::A1, Mips::A2, Mips::A3
+  };
+  static const unsigned EhDataReg64[] = {
+    Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64
+  };
+
+  // Both tables have the same length; reject indices that would read past
+  // their end instead of silently returning garbage.
+  assert(I < sizeof(EhDataReg) / sizeof(EhDataReg[0]) &&
+         "EH data register index out of range");
+
+  return IsN64() ? EhDataReg64[I] : EhDataReg[I];
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
new file mode 100644
index 000000000000..9372a3c2bb1f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
@@ -0,0 +1,82 @@
+//===---- MipsABIInfo.h - Information about MIPS ABI's --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIINFO_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIINFO_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+
+template <typename T> class ArrayRef;
+class MCTargetOptions;
+class StringRef;
+class TargetRegisterClass;
+
+class MipsABIInfo { // Small value type naming one MIPS ABI (O32, N32, N64) or Unknown.
+public:
+ enum class ABI { Unknown, O32, N32, N64 }; // Enumerator order is what operator< compares.
+
+protected:
+ ABI ThisABI; // The ABI described by this object.
+
+public:
+ MipsABIInfo(ABI ThisABI) : ThisABI(ThisABI) {} // Not explicit: an ABI value converts implicitly.
+
+ static MipsABIInfo Unknown() { return MipsABIInfo(ABI::Unknown); } // Named constructors, one per enumerator.
+ static MipsABIInfo O32() { return MipsABIInfo(ABI::O32); }
+ static MipsABIInfo N32() { return MipsABIInfo(ABI::N32); }
+ static MipsABIInfo N64() { return MipsABIInfo(ABI::N64); }
+ static MipsABIInfo computeTargetABI(const Triple &TT, StringRef CPU, // Defined out of line.
+ const MCTargetOptions &Options);
+
+ bool IsKnown() const { return ThisABI != ABI::Unknown; } // True for O32, N32 and N64.
+ bool IsO32() const { return ThisABI == ABI::O32; }
+ bool IsN32() const { return ThisABI == ABI::N32; }
+ bool IsN64() const { return ThisABI == ABI::N64; }
+ ABI GetEnumValue() const { return ThisABI; } // Raw enumerator, e.g. for switch statements.
+
+ /// The registers to use for byval arguments.
+ ArrayRef<MCPhysReg> GetByValArgRegs() const;
+
+ /// The registers to use for the variable argument list.
+ ArrayRef<MCPhysReg> GetVarArgRegs() const;
+
+ /// Obtain the size of the area allocated by the callee for arguments.
+ /// CallingConv::FastCall affects the value for O32.
+ unsigned GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const;
+
+ /// Ordering of ABIs, following the declaration order of the ABI enum.
+ /// MipsGenSubtargetInfo.inc will use this to resolve conflicts when given
+ /// multiple ABI options.
+ bool operator<(const MipsABIInfo Other) const { // Other is taken by value.
+ return ThisABI < Other.GetEnumValue();
+ }
+
+ unsigned GetStackPtr() const; // The getters below are defined out of line.
+ unsigned GetFramePtr() const;
+ unsigned GetBasePtr() const;
+ unsigned GetGlobalPtr() const;
+ unsigned GetNullPtr() const;
+ unsigned GetZeroReg() const;
+ unsigned GetPtrAdduOp() const;
+ unsigned GetPtrAddiuOp() const;
+ unsigned GetPtrSubuOp() const;
+ unsigned GetPtrAndOp() const;
+ unsigned GetGPRMoveOp() const;
+ inline bool ArePtrs64bit() const { return IsN64(); } // Only N64 has 64-bit pointers.
+ inline bool AreGprs64bit() const { return IsN32() || IsN64(); } // N32 and N64 both have 64-bit GPRs.
+
+ unsigned GetEhDataReg(unsigned I) const; // I indexes a four-entry register table.
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
new file mode 100644
index 000000000000..38b11f78e36d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -0,0 +1,522 @@
+//===-- MipsAsmBackend.cpp - Mips Asm Backend ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsAsmBackend.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+/// Converts the raw \p Value of \p Fixup into the bit pattern that is stored
+/// in the instruction or data field: depending on the fixup kind the value is
+/// masked, shifted, or scaled by the instruction granularity.
+///
+/// Range checks are only performed when \p Ctx is non-null. A failed check is
+/// reported through \p Ctx with reportError() and 0 is returned. Fixup kinds
+/// that are not listed in the switch also yield 0.
+///
+/// The result is returned as a full 64-bit quantity. The kinds that pass
+/// \p Value through unchanged include 64-bit wide ones (FK_Data_8, FK_DTPRel_8,
+/// FK_TPRel_8 and the SUB fixups), so a 32-bit return type would silently drop
+/// their upper half before the caller applies its own field mask.
+///
+/// \param Fixup The fixup whose kind selects the adjustment.
+/// \param Value The resolved, unadjusted fixup value.
+/// \param Ctx   Optional context used for diagnostics; may be null.
+/// \return The adjusted value, or 0 for unhandled kinds and reported errors.
+static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                                 MCContext *Ctx = nullptr) {
+
+  unsigned Kind = Fixup.getKind();
+
+  // Add/subtract and shift
+  switch (Kind) {
+  default:
+    return 0;
+  case FK_Data_2:
+  case Mips::fixup_Mips_LO16:
+  case Mips::fixup_Mips_GPREL16:
+  case Mips::fixup_Mips_GPOFF_HI:
+  case Mips::fixup_Mips_GPOFF_LO:
+  case Mips::fixup_Mips_GOT_PAGE:
+  case Mips::fixup_Mips_GOT_OFST:
+  case Mips::fixup_Mips_GOT_DISP:
+  case Mips::fixup_Mips_GOT_LO16:
+  case Mips::fixup_Mips_CALL_LO16:
+  case Mips::fixup_MICROMIPS_LO16:
+  case Mips::fixup_MICROMIPS_GOT_PAGE:
+  case Mips::fixup_MICROMIPS_GOT_OFST:
+  case Mips::fixup_MICROMIPS_GOT_DISP:
+  case Mips::fixup_MIPS_PCLO16:
+    // Keep the low 16 bits only.
+    Value &= 0xffff;
+    break;
+  case FK_DTPRel_4:
+  case FK_DTPRel_8:
+  case FK_TPRel_4:
+  case FK_TPRel_8:
+  case FK_GPRel_4:
+  case FK_Data_4:
+  case FK_Data_8:
+  case Mips::fixup_Mips_SUB:
+  case Mips::fixup_MICROMIPS_SUB:
+    // Stored verbatim.
+    break;
+  case Mips::fixup_Mips_PC16:
+    // The displacement is then divided by 4 to give us an 18 bit
+    // address range. Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 4;
+    // We now check if Value can be encoded as a 16-bit signed immediate.
+    if (!isInt<16>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MIPS_PC19_S2:
+  case Mips::fixup_MICROMIPS_PC19_S2:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 4;
+    // We now check if Value can be encoded as a 19-bit signed immediate.
+    if (!isInt<19>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC19 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_Mips_26:
+    // So far we are only using this type for jumps.
+    // The displacement is then divided by 4 to give us an 28 bit
+    // address range.
+    Value >>= 2;
+    break;
+  case Mips::fixup_Mips_HI16:
+  case Mips::fixup_Mips_GOT:
+  case Mips::fixup_MICROMIPS_GOT16:
+  case Mips::fixup_Mips_GOT_HI16:
+  case Mips::fixup_Mips_CALL_HI16:
+  case Mips::fixup_MICROMIPS_HI16:
+  case Mips::fixup_MIPS_PCHI16:
+    // Get the 2nd 16-bits. Also add 1 if bit 15 is 1.
+    Value = ((Value + 0x8000) >> 16) & 0xffff;
+    break;
+  case Mips::fixup_Mips_HIGHER:
+    // Get the 3rd 16-bits.
+    Value = ((Value + 0x80008000LL) >> 32) & 0xffff;
+    break;
+  case Mips::fixup_Mips_HIGHEST:
+    // Get the 4th 16-bits.
+    Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
+    break;
+  case Mips::fixup_MICROMIPS_26_S1:
+    Value >>= 1;
+    break;
+  case Mips::fixup_MICROMIPS_PC7_S1:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 2;
+    // We now check if Value can be encoded as a 7-bit signed immediate.
+    if (!isInt<7>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC7 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC10_S1:
+    Value -= 2;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 2;
+    // We now check if Value can be encoded as a 10-bit signed immediate.
+    if (!isInt<10>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC10 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC16_S1:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 2;
+    // We now check if Value can be encoded as a 16-bit signed immediate.
+    if (!isInt<16>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MIPS_PC18_S3:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 8;
+    // We now check if Value can be encoded as a 18-bit signed immediate.
+    if (!isInt<18>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC18_S3:
+    // Check alignment.
+    // NOTE(review): a misaligned value is reported with the "out of range"
+    // text and processing continues; confirm whether a dedicated message and
+    // an early return are wanted before changing the diagnostic.
+    if ((Value & 7) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+    }
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 8;
+    // We now check if Value can be encoded as a 18-bit signed immediate.
+    if (!isInt<18>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MIPS_PC21_S2:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 4;
+    // We now check if Value can be encoded as a 21-bit signed immediate.
+    if (!isInt<21>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MIPS_PC26_S2:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 4;
+    // We now check if Value can be encoded as a 26-bit signed immediate.
+    if (!isInt<26>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC26_S1:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 2;
+    // We now check if Value can be encoded as a 26-bit signed immediate.
+    if (!isInt<26>(Value) && Ctx) {
+      // Reported like every other out-of-range fixup, so the return below is
+      // reachable and the error handling matches the sibling cases.
+      Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC21_S1:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 2;
+    // We now check if Value can be encoded as a 21-bit signed immediate.
+    if (!isInt<21>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+      return 0;
+    }
+    break;
+  }
+
+  return Value;
+}
+
+/// Creates the MIPS ELF object writer for \p OS, configured with the OS ABI
+/// derived from this backend's OS type plus its endianness and word size.
+MCObjectWriter *
+MipsAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+  const auto OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
+  return createMipsELFObjectWriter(OS, OSABI, IsLittle, Is64Bit);
+}
+
+// Byte order of little-endian fixup data:
+//   mips32r2:  a | b | x | x
+//   microMIPS: x | x | a | b
+//
+// Returns true when Kind lies in [fixup_MICROMIPS_26_S1, LastTargetFixupKind)
+// and is not fixup_MICROMIPS_PC10_S1.
+static bool needsMMLEByteOrder(unsigned Kind) {
+  if (Kind == Mips::fixup_MICROMIPS_PC10_S1)
+    return false;
+
+  const bool AtOrAfterFirstMM = Kind >= Mips::fixup_MICROMIPS_26_S1;
+  const bool BeforeLast = Kind < Mips::LastTargetFixupKind;
+  return AtOrAfterFirstMM && BeforeLast;
+}
+
+// Maps a byte index within a 4-byte word to its position in the microMIPS
+// little-endian layout, where the two 16-bit halves are swapped:
+// 0 -> 2, 1 -> 3, 2 -> 0, 3 -> 1.
+static unsigned calculateMMLEIndex(unsigned i) {
+  assert(i <= 3 && "Index out of range!");
+
+  const unsigned Halfword = i / 2;   // Which 16-bit half the byte is in.
+  const unsigned ByteInHalf = i % 2; // Position inside that half.
+  return (1 - Halfword) * 2 + ByteInHalf;
+}
+
+/// applyFixup - Patch the adjusted \p Value of \p Fixup into \p Data.
+///
+/// The value is first run through adjustFixupValue(); a result of zero leaves
+/// \p Data untouched. Otherwise the bytes covered by the fixup are read into
+/// an integer, the low TargetSize bits of the value are OR-ed in, and the
+/// bytes are written back. Byte positions depend on the backend endianness
+/// and, for little-endian microMIPS kinds, on the swapped halfword layout.
+///
+/// \p DataSize and \p IsPCRel are not used by this implementation.
+void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                unsigned DataSize, uint64_t Value,
+                                bool IsPCRel) const {
+  const MCFixupKind Kind = Fixup.getKind();
+  Value = adjustFixupValue(Fixup, Value);
+
+  // Nothing to OR in, so the encoding does not change.
+  if (Value == 0)
+    return;
+
+  // Start of the patched field within Data.
+  const unsigned Offset = Fixup.getOffset();
+  // Width of the fixup in bits and in (rounded-up) bytes.
+  const unsigned TargetBits = getFixupKindInfo(Kind).TargetSize;
+  const unsigned NumBytes = (TargetBits + 7) / 8;
+
+  // Size of the containing unit; only needed to mirror indices on big endian.
+  unsigned FullSize = 4;
+  switch ((unsigned)Kind) {
+  case FK_Data_2:
+  case Mips::fixup_Mips_16:
+  case Mips::fixup_MICROMIPS_PC10_S1:
+    FullSize = 2;
+    break;
+  case FK_Data_8:
+  case Mips::fixup_Mips_64:
+    FullSize = 8;
+    break;
+  default:
+    break;
+  }
+
+  const bool UseMMLEOrder = needsMMLEByteOrder((unsigned)Kind);
+
+  // Position in Data (relative to Offset) of the i-th least significant byte.
+  auto ByteIndex = [&](unsigned i) -> unsigned {
+    if (!IsLittle)
+      return FullSize - 1 - i;
+    return UseMMLEOrder ? calculateMMLEIndex(i) : i;
+  };
+
+  // Assemble the bits currently stored in the field.
+  uint64_t CurVal = 0;
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    const uint8_t Byte = (uint8_t)Data[Offset + ByteIndex(i)];
+    CurVal |= (uint64_t)Byte << (i * 8);
+  }
+
+  // Merge in the fixup value, limited to the width of the field.
+  const uint64_t Mask = (uint64_t)(-1) >> (64 - TargetBits);
+  CurVal |= Value & Mask;
+
+  // Store the patched bytes back in the same order they were read.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + ByteIndex(i)] = (uint8_t)((CurVal >> (i * 8)) & 0xff);
+}
+
+/// Maps a relocation name to a fixup kind. "R_MIPS_NONE" and "R_MIPS_32" are
+/// handled here; any other name yields whatever the base class returns.
+Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const {
+  const Optional<MCFixupKind> Fallback = MCAsmBackend::getFixupKind(Name);
+  return StringSwitch<Optional<MCFixupKind>>(Name)
+      .Case("R_MIPS_NONE", (MCFixupKind)Mips::fixup_Mips_NONE)
+      .Case("R_MIPS_32", FK_Data_4)
+      .Default(Fallback);
+}
+
+const MCFixupKindInfo &MipsAsmBackend::
+getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = {
+ // This table *must* be in the same order as the fixup_* kinds in
+ // MipsFixupKinds.h; it is indexed by (Kind - FirstTargetFixupKind) below.
+ //
+ // name offset bits flags
+ { "fixup_Mips_NONE", 0, 0, 0 },
+ { "fixup_Mips_16", 0, 16, 0 },
+ { "fixup_Mips_32", 0, 32, 0 },
+ { "fixup_Mips_REL32", 0, 32, 0 },
+ { "fixup_Mips_26", 0, 26, 0 },
+ { "fixup_Mips_HI16", 0, 16, 0 },
+ { "fixup_Mips_LO16", 0, 16, 0 },
+ { "fixup_Mips_GPREL16", 0, 16, 0 },
+ { "fixup_Mips_LITERAL", 0, 16, 0 },
+ { "fixup_Mips_GOT", 0, 16, 0 },
+ { "fixup_Mips_PC16", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Mips_CALL16", 0, 16, 0 },
+ { "fixup_Mips_GPREL32", 0, 32, 0 },
+ { "fixup_Mips_SHIFT5", 6, 5, 0 },
+ { "fixup_Mips_SHIFT6", 6, 5, 0 }, // NOTE(review): same 5-bit width as SHIFT5 - confirm.
+ { "fixup_Mips_64", 0, 64, 0 },
+ { "fixup_Mips_TLSGD", 0, 16, 0 },
+ { "fixup_Mips_GOTTPREL", 0, 16, 0 },
+ { "fixup_Mips_TPREL_HI", 0, 16, 0 },
+ { "fixup_Mips_TPREL_LO", 0, 16, 0 },
+ { "fixup_Mips_TLSLDM", 0, 16, 0 },
+ { "fixup_Mips_DTPREL_HI", 0, 16, 0 },
+ { "fixup_Mips_DTPREL_LO", 0, 16, 0 },
+ { "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Mips_GPOFF_HI", 0, 16, 0 },
+ { "fixup_Mips_GPOFF_LO", 0, 16, 0 },
+ { "fixup_Mips_GOT_PAGE", 0, 16, 0 },
+ { "fixup_Mips_GOT_OFST", 0, 16, 0 },
+ { "fixup_Mips_GOT_DISP", 0, 16, 0 },
+ { "fixup_Mips_HIGHER", 0, 16, 0 },
+ { "fixup_Mips_HIGHEST", 0, 16, 0 },
+ { "fixup_Mips_GOT_HI16", 0, 16, 0 },
+ { "fixup_Mips_GOT_LO16", 0, 16, 0 },
+ { "fixup_Mips_CALL_HI16", 0, 16, 0 },
+ { "fixup_Mips_CALL_LO16", 0, 16, 0 },
+ { "fixup_Mips_PC18_S3", 0, 18, MCFixupKindInfo::FKF_IsPCRel }, // NOTE(review): this file uses Mips::fixup_MIPS_PC18_S3 for the enum - confirm the name spelling.
+ { "fixup_MIPS_PC19_S2", 0, 19, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PC21_S2", 0, 21, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PC26_S2", 0, 26, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PCHI16", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PCLO16", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_26_S1", 0, 26, 0 },
+ { "fixup_MICROMIPS_HI16", 0, 16, 0 },
+ { "fixup_MICROMIPS_LO16", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT16", 0, 16, 0 },
+ { "fixup_MICROMIPS_PC7_S1", 0, 7, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC10_S1", 0, 10, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC16_S1", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC26_S1", 0, 26, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC19_S2", 0, 19, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC18_S3", 0, 18, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC21_S1", 0, 21, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_CALL16", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT_DISP", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT_PAGE", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT_OFST", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_GD", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_LDM", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_DTPREL_HI16", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_DTPREL_LO16", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_TPREL_HI16", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_TPREL_LO16", 0, 16, 0 },
+ { "fixup_Mips_SUB", 0, 64, 0 },
+ { "fixup_MICROMIPS_SUB", 0, 64, 0 }
+ };
+
+ const static MCFixupKindInfo BigEndianInfos[Mips::NumTargetFixupKinds] = {
+ // This table *must* be in the same order as the fixup_* kinds in
+ // MipsFixupKinds.h; the rows differ from the table above only in the offset column.
+ //
+ // name offset bits flags
+ { "fixup_Mips_NONE", 0, 0, 0 },
+ { "fixup_Mips_16", 16, 16, 0 },
+ { "fixup_Mips_32", 0, 32, 0 },
+ { "fixup_Mips_REL32", 0, 32, 0 },
+ { "fixup_Mips_26", 6, 26, 0 },
+ { "fixup_Mips_HI16", 16, 16, 0 },
+ { "fixup_Mips_LO16", 16, 16, 0 },
+ { "fixup_Mips_GPREL16", 16, 16, 0 },
+ { "fixup_Mips_LITERAL", 16, 16, 0 },
+ { "fixup_Mips_GOT", 16, 16, 0 },
+ { "fixup_Mips_PC16", 16, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Mips_CALL16", 16, 16, 0 },
+ { "fixup_Mips_GPREL32", 0, 32, 0 },
+ { "fixup_Mips_SHIFT5", 21, 5, 0 },
+ { "fixup_Mips_SHIFT6", 21, 5, 0 }, // NOTE(review): same 5-bit width as SHIFT5 - confirm.
+ { "fixup_Mips_64", 0, 64, 0 },
+ { "fixup_Mips_TLSGD", 16, 16, 0 },
+ { "fixup_Mips_GOTTPREL", 16, 16, 0 },
+ { "fixup_Mips_TPREL_HI", 16, 16, 0 },
+ { "fixup_Mips_TPREL_LO", 16, 16, 0 },
+ { "fixup_Mips_TLSLDM", 16, 16, 0 },
+ { "fixup_Mips_DTPREL_HI", 16, 16, 0 },
+ { "fixup_Mips_DTPREL_LO", 16, 16, 0 },
+ { "fixup_Mips_Branch_PCRel",16, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Mips_GPOFF_HI", 16, 16, 0 },
+ { "fixup_Mips_GPOFF_LO", 16, 16, 0 },
+ { "fixup_Mips_GOT_PAGE", 16, 16, 0 },
+ { "fixup_Mips_GOT_OFST", 16, 16, 0 },
+ { "fixup_Mips_GOT_DISP", 16, 16, 0 },
+ { "fixup_Mips_HIGHER", 16, 16, 0 },
+ { "fixup_Mips_HIGHEST", 16, 16, 0 },
+ { "fixup_Mips_GOT_HI16", 16, 16, 0 },
+ { "fixup_Mips_GOT_LO16", 16, 16, 0 },
+ { "fixup_Mips_CALL_HI16", 16, 16, 0 },
+ { "fixup_Mips_CALL_LO16", 16, 16, 0 },
+ { "fixup_Mips_PC18_S3", 14, 18, MCFixupKindInfo::FKF_IsPCRel }, // NOTE(review): this file uses Mips::fixup_MIPS_PC18_S3 for the enum - confirm the name spelling.
+ { "fixup_MIPS_PC19_S2", 13, 19, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PC21_S2", 11, 21, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PC26_S2", 6, 26, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PCHI16", 16, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PCLO16", 16, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_26_S1", 6, 26, 0 },
+ { "fixup_MICROMIPS_HI16", 16, 16, 0 },
+ { "fixup_MICROMIPS_LO16", 16, 16, 0 },
+ { "fixup_MICROMIPS_GOT16", 16, 16, 0 },
+ { "fixup_MICROMIPS_PC7_S1", 9, 7, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC10_S1", 6, 10, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC16_S1",16, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC26_S1", 6, 26, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC19_S2",13, 19, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC18_S3",14, 18, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC21_S1",11, 21, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_CALL16", 16, 16, 0 },
+ { "fixup_MICROMIPS_GOT_DISP", 16, 16, 0 },
+ { "fixup_MICROMIPS_GOT_PAGE", 16, 16, 0 },
+ { "fixup_MICROMIPS_GOT_OFST", 16, 16, 0 },
+ { "fixup_MICROMIPS_TLS_GD", 16, 16, 0 },
+ { "fixup_MICROMIPS_TLS_LDM", 16, 16, 0 },
+ { "fixup_MICROMIPS_TLS_DTPREL_HI16", 16, 16, 0 },
+ { "fixup_MICROMIPS_TLS_DTPREL_LO16", 16, 16, 0 },
+ { "fixup_MICROMIPS_TLS_TPREL_HI16", 16, 16, 0 },
+ { "fixup_MICROMIPS_TLS_TPREL_LO16", 16, 16, 0 },
+ { "fixup_Mips_SUB", 0, 64, 0 },
+ { "fixup_MICROMIPS_SUB", 0, 64, 0 }
+ };
+
+ if (Kind < FirstTargetFixupKind) // Generic kinds are described by the base class.
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+
+ if (IsLittle) // Pick the table matching this backend's endianness.
+ return LittleEndianInfos[Kind - FirstTargetFixupKind];
+ return BigEndianInfos[Kind - FirstTargetFixupKind];
+}
+
+/// WriteNopData - Write a nop sequence of \p Count bytes to \p OW.
+/// This implementation emits \p Count zero bytes for every value of
+/// \p Count; it performs no size or alignment check and never fails.
+///
+/// \return - Always true.
+bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ // No check for a count smaller than an instruction is performed here.
+ // FIXME: 16 bit instructions are not handled yet here.
+ // We shouldn't be using a hard coded number for instruction size.
+
+ // Zeros are written unconditionally. Stated rationale: if the count is not
+ // 4-byte aligned we must be writing data into the text section (otherwise
+ // we have unaligned instructions, a far bigger problem), so zeros will do.
+ OW->WriteZeros(Count);
+ return true;
+}
+
+/// processFixupValue - Target hook used here purely to validate a fixup.
+///
+/// The fixup is run through adjustFixupValue() together with the assembler's
+/// MCContext, which lets range problems be reported with *possibly* a source
+/// code location. The adjusted value is deliberately thrown away, and neither
+/// \p Value nor \p IsResolved is written. Per the original note, the caller
+/// would ignore a change to \p Value anyway because recordRelocation()
+/// overwrites it with its own calculation.
+void MipsAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                       const MCAsmLayout &Layout,
+                                       const MCFixup &Fixup,
+                                       const MCFragment *DF,
+                                       const MCValue &Target,
+                                       uint64_t &Value,
+                                       bool &IsResolved) {
+  MCContext &Ctx = Asm.getContext();
+  (void)adjustFixupValue(Fixup, Value, &Ctx);
+}
+
+// MCAsmBackend
+/// Creates a little-endian, 32-bit MIPS assembler backend for the OS of
+/// \p TT. The caller receives ownership of the returned object.
+MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             const Triple &TT, StringRef CPU,
+                                             const MCTargetOptions &Options) {
+  const bool IsLittle = true;
+  const bool Is64Bit = false;
+  return new MipsAsmBackend(T, TT.getOS(), IsLittle, Is64Bit);
+}
+
+/// Creates a big-endian, 32-bit MIPS assembler backend for the OS of \p TT.
+/// The caller receives ownership of the returned object.
+MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             const Triple &TT, StringRef CPU,
+                                             const MCTargetOptions &Options) {
+  const bool IsLittle = false;
+  const bool Is64Bit = false;
+  return new MipsAsmBackend(T, TT.getOS(), IsLittle, Is64Bit);
+}
+
+/// Creates a little-endian, 64-bit MIPS assembler backend for the OS of
+/// \p TT. The caller receives ownership of the returned object.
+MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             const Triple &TT, StringRef CPU,
+                                             const MCTargetOptions &Options) {
+  const bool IsLittle = true;
+  const bool Is64Bit = true;
+  return new MipsAsmBackend(T, TT.getOS(), IsLittle, Is64Bit);
+}
+
+/// Creates a big-endian, 64-bit MIPS assembler backend for the OS of \p TT.
+/// The caller receives ownership of the returned object.
+MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             const Triple &TT, StringRef CPU,
+                                             const MCTargetOptions &Options) {
+  const bool IsLittle = false;
+  const bool Is64Bit = true;
+  return new MipsAsmBackend(T, TT.getOS(), IsLittle, Is64Bit);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
new file mode 100644
index 000000000000..f260cfa566c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -0,0 +1,94 @@
+//===-- MipsAsmBackend.h - Mips Asm Backend ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MipsAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSASMBACKEND_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSASMBACKEND_H
+
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmBackend.h"
+
+namespace llvm {
+
+class MCAssembler;
+struct MCFixupKindInfo;
+class Target;
+class MCObjectWriter;
+
+class MipsAsmBackend : public MCAsmBackend { // MCAsmBackend implementation for MIPS targets.
+ Triple::OSType OSType; // OS of the target triple, as passed to the constructor.
+ bool IsLittle; // True for little endian, false for big endian
+ bool Is64Bit; // True for 64 bit words, false for 32 bit words
+
+public:
+ MipsAsmBackend(const Target &T, Triple::OSType OSType, bool IsLittle, // T is not stored.
+ bool Is64Bit)
+ : MCAsmBackend(), OSType(OSType), IsLittle(IsLittle), Is64Bit(Is64Bit) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+ unsigned getNumFixupKinds() const override { // Number of MIPS-specific fixup kinds.
+ return Mips::NumTargetFixupKinds;
+ }
+
+ /// @name Target Relaxation Interfaces
+ /// @{
+
+ /// mayNeedRelaxation - Check whether the given instruction may need
+ /// relaxation. This backend always answers false.
+ ///
+ /// \param Inst - The instruction to test.
+ bool mayNeedRelaxation(const MCInst &Inst) const override {
+ return false;
+ }
+
+ /// fixupNeedsRelaxation - Target specific predicate for whether a given
+ /// fixup requires the associated instruction to be relaxed.
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ // FIXME: relaxation is not implemented; reaching this is a program error.
+ llvm_unreachable("RelaxInstruction() unimplemented");
+ return false;
+ }
+
+ /// relaxInstruction - Relax the instruction in the given fragment
+ /// to the next wider instruction. This implementation is an empty stub.
+ ///
+ /// \param Inst - The instruction to relax, which may be the same
+ /// as the output.
+ /// \param [out] Res On return, the relaxed instruction (left untouched here).
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
+
+ /// @}
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override;
+
+}; // class MipsAsmBackend
+
+} // namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
new file mode 100644
index 000000000000..35de7b27bf10
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -0,0 +1,132 @@
+//===-- MipsBaseInfo.h - Top level definitions for MIPS MC ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Mips target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSBASEINFO_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSBASEINFO_H
+
+#include "MipsFixupKinds.h"
+#include "MipsMCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// MipsII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MipsII {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // Mips Specific MachineOperand flags.
+
+ MO_NO_FLAG, // First enumerator, so its value is 0.
+
+ /// MO_GOT - Represents the offset into the global offset table at which
+ /// the address the relocation entry symbol resides during execution.
+ MO_GOT,
+
+ /// MO_GOT_CALL - Represents the offset into the global offset table at
+ /// which the address of a call site relocation entry symbol resides
+ /// during execution. This is different from the above since this flag
+ /// can only be present in call instructions.
+ MO_GOT_CALL,
+
+ /// MO_GPREL - Represents the offset from the current gp value to be used
+ /// for the relocatable object file being produced.
+ MO_GPREL,
+
+ /// MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol
+ /// address.
+ MO_ABS_HI,
+ MO_ABS_LO,
+
+ /// MO_TLSGD - Represents the offset into the global offset table at which
+ /// the module ID and TLS block offset reside during execution (General
+ /// Dynamic TLS).
+ MO_TLSGD,
+
+ /// MO_TLSLDM - Represents the offset into the global offset table at which
+ /// the module ID and TLS block offset reside during execution (Local
+ /// Dynamic TLS).
+ MO_TLSLDM,
+ MO_DTPREL_HI,
+ MO_DTPREL_LO,
+
+ /// MO_GOTTPREL - Represents the offset from the thread pointer (Initial
+ /// Exec TLS).
+ MO_GOTTPREL,
+
+ /// MO_TPREL_HI/LO - Represents the hi and low part of the offset from
+ /// the thread pointer (Local Exec TLS).
+ MO_TPREL_HI,
+ MO_TPREL_LO,
+
+ // N32/64 Flags.
+ MO_GPOFF_HI,
+ MO_GPOFF_LO,
+ MO_GOT_DISP,
+ MO_GOT_PAGE,
+ MO_GOT_OFST,
+
+ /// MO_HIGHER/HIGHEST - Represents the highest or higher half word of a
+ /// 64-bit symbol address.
+ MO_HIGHER,
+ MO_HIGHEST,
+
+ /// MO_GOT_HI16/LO16, MO_CALL_HI16/LO16 - Relocations used for large GOTs.
+ MO_GOT_HI16,
+ MO_GOT_LO16,
+ MO_CALL_HI16,
+ MO_CALL_LO16
+ };
+
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction encodings. These are the standard/most common forms for
+ // Mips instructions.
+ //
+
+ // Pseudo - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// FrmR - This form is for instructions of the format R.
+ FrmR = 1,
+ /// FrmI - This form is for instructions of the format I.
+ FrmI = 2,
+ /// FrmJ - This form is for instructions of the format J.
+ FrmJ = 3,
+ /// FrmFR - This form is for instructions of the format FR.
+ FrmFR = 4,
+ /// FrmFI - This form is for instructions of the format FI.
+ FrmFI = 5,
+ /// FrmOther - This form is for instructions that have no specific format.
+ FrmOther = 6,
+
+ FormMask = 15, // Low four bits; the form values above all fit within it.
+ /// IsCTI - Instruction is a Control Transfer Instruction.
+ IsCTI = 1 << 4,
+ /// HasForbiddenSlot - Instruction has a forbidden slot.
+ HasForbiddenSlot = 1 << 5,
+ /// IsPCRelativeLoad - A Load instruction with implicit source register
+ /// ($pc) with explicit offset and destination register
+ IsPCRelativeLoad = 1 << 6
+
+ };
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
new file mode 100644
index 000000000000..b2efd726da53
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -0,0 +1,662 @@
+//===-- MipsELFObjectWriter.cpp - Mips ELF Writer -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <list>
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define DEBUG_TYPE "mips-elf-object-writer"
+
+using namespace llvm;
+
+namespace {
+ /// Pairs a relocation with the bookkeeping needed by the relocation ordering
+ /// algorithm in sortRelocs().
+ struct MipsRelocationEntry {
+   const ELFRelocationEntry R; ///< The wrapped relocation.
+   bool Matched = false;       ///< True once this relocation is part of a match.
+
+   /// Wraps \p R as a not-yet-matched entry. The conversion is implicit because
+   /// sortRelocs() inserts ELFRelocationEntry values directly into a list of
+   /// MipsRelocationEntry.
+   MipsRelocationEntry(const ELFRelocationEntry &R) : R(R) {}
+
+   /// Writes the relocation followed by the match flag to \p Out.
+   void print(raw_ostream &Out) const {
+     R.print(Out);
+     Out << ", Matched=" << Matched;
+   }
+ };
+
+#ifndef NDEBUG
+ /// Streams \p Entry to \p Out via MipsRelocationEntry::print() (debug builds
+ /// only) and returns the stream so that insertions can be chained.
+ raw_ostream &operator<<(raw_ostream &Out, const MipsRelocationEntry &Entry) {
+   Entry.print(Out);
+   return Out;
+ }
+#endif
+
+ /// ELF target writer for MIPS. It maps fixups to ELF relocation types, decides
+ /// which relocations must keep referring to their symbol, and orders the
+ /// relocation table.
+ class MipsELFObjectWriter : public MCELFObjectTargetWriter {
+ public:
+   /// \p _isN64 is also used as the "has relocation addend" setting of the
+   /// base class (see the constructor definition).
+   MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64,
+                       bool IsLittleEndian);
+
+   ~MipsELFObjectWriter() override;
+
+   /// Returns the ELF relocation type (possibly a packed N64 compound type)
+   /// that corresponds to the kind of \p Fixup.
+   unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                         const MCFixup &Fixup, bool IsPCRel) const override;
+
+   /// Returns true when a relocation of type \p Type against \p Sym must be
+   /// emitted against the symbol itself.
+   bool needsRelocateWithSymbol(const MCSymbol &Sym,
+                                unsigned Type) const override;
+
+   /// Reorders \p Relocs in place so that high-part relocations precede a
+   /// matching low part. Does nothing when relocations carry addends.
+   void sortRelocs(const MCAssembler &Asm,
+                   std::vector<ELFRelocationEntry> &Relocs) override;
+ };
+
+ /// Distribute the elements of [First, Last) over two outputs in one pass:
+ /// elements for which \p Predicate returns true are written through \p d1, all
+ /// others through \p d2. This behaves like std::copy_if and
+ /// std::remove_copy_if run together.
+ ///
+ /// \returns the two output iterators, each advanced past the last element
+ /// written through it.
+ template <class InputIt, class OutputIt1, class OutputIt2, class UnaryPredicate>
+ std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
+                                              OutputIt1 d1, OutputIt2 d2,
+                                              UnaryPredicate Predicate) {
+   while (First != Last) {
+     if (Predicate(*First))
+       *d1++ = *First;
+     else
+       *d2++ = *First;
+     ++First;
+   }
+
+   return std::pair<OutputIt1, OutputIt2>(d1, d2);
+ }
+
+ /// The possible results of the Predicate function passed to find_best.
+ enum FindBestPredicateResult {
+ FindBest_NoMatch = 0, ///< The current element is not a match.
+ FindBest_Match, ///< The current element is a match but better ones are
+ /// possible, so find_best keeps searching.
+ FindBest_PerfectMatch, ///< The current element is an unbeatable match; find_best stops here.
+ };
+
+ /// Search [First, Last) for the best matching element.
+ ///
+ /// \p Predicate classifies each element as FindBest_NoMatch, FindBest_Match or
+ /// FindBest_PerfectMatch. Every match is compared against the best one seen so
+ /// far using \p BetterThan(A, B), which returns true when A is a better match
+ /// than B. A FindBest_PerfectMatch ends the search after that comparison.
+ ///
+ /// This resembles std::find_if, except that it picks the best of several
+ /// possible matches.
+ ///
+ /// \returns the position of the best match, or \p Last if nothing matched.
+ template <class InputIt, class UnaryPredicate, class Comparator>
+ InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
+                   Comparator BetterThan) {
+   InputIt Winner = Last;
+
+   for (InputIt Cur = First; Cur != Last; ++Cur) {
+     const unsigned Verdict = Predicate(*Cur);
+     if (Verdict == FindBest_NoMatch)
+       continue;
+
+     DEBUG(dbgs() << std::distance(First, Cur) << " is a match (";
+           Cur->print(dbgs()); dbgs() << ")\n");
+     if (Winner == Last || BetterThan(*Cur, *Winner)) {
+       DEBUG(dbgs() << ".. and it beats the last one\n");
+       Winner = Cur;
+     }
+
+     // A perfect match cannot be beaten by any later element.
+     if (Verdict == FindBest_PerfectMatch) {
+       DEBUG(dbgs() << ".. and it is unbeatable\n");
+       break;
+     }
+   }
+
+   return Winner;
+ }
+
+ /// Determine the low relocation that matches the given relocation.
+ /// If the relocation does not need a low relocation then the return value
+ /// is ELF::R_MIPS_NONE.
+ ///
+ /// The relocations that need a matching low part are
+ /// R_(MIPS|MICROMIPS|MIPS16)_HI16 for all symbols and
+ /// R_(MIPS|MICROMIPS|MIPS16)_GOT16 for local symbols only.
+ ///
+ /// \param Reloc the relocation to classify; its OriginalSymbol is consulted
+ ///        only for the GOT16 cases and is allowed to be null.
+ /// \returns the type of the required low-part relocation, or
+ ///          ELF::R_MIPS_NONE when no pairing is needed.
+ static unsigned getMatchingLoType(const ELFRelocationEntry &Reloc) {
+   unsigned Type = Reloc.Type;
+   if (Type == ELF::R_MIPS_HI16)
+     return ELF::R_MIPS_LO16;
+   if (Type == ELF::R_MICROMIPS_HI16)
+     return ELF::R_MICROMIPS_LO16;
+   if (Type == ELF::R_MIPS16_HI16)
+     return ELF::R_MIPS16_LO16;
+
+   // Only GOT16 relocations against local symbols need a low part. Guard the
+   // dereference: a relocation recorded without a symbol has a null
+   // OriginalSymbol and must not be treated as referring to a non-local one.
+   if (Reloc.OriginalSymbol &&
+       Reloc.OriginalSymbol->getBinding() != ELF::STB_LOCAL)
+     return ELF::R_MIPS_NONE;
+
+   if (Type == ELF::R_MIPS_GOT16)
+     return ELF::R_MIPS_LO16;
+   if (Type == ELF::R_MICROMIPS_GOT16)
+     return ELF::R_MICROMIPS_LO16;
+   if (Type == ELF::R_MIPS16_GOT16)
+     return ELF::R_MIPS16_LO16;
+
+   return ELF::R_MIPS_NONE;
+ }
+
+ /// Classify how well the relocation \p X serves as the low part for \p R.
+ ///
+ /// X matches if:
+ /// - Its type is the low-part type that corresponds to R. This is passed in
+ ///   as \p MatchingType for efficiency.
+ /// - It is based on the same symbol as R.
+ /// - Its addend is greater than or equal to that of R.
+ ///   Note that this rule assumes the programmer does not use offsets that
+ ///   exceed the alignment of the symbol. The carry bit will be incorrect if
+ ///   this is not true.
+ ///
+ /// A match is unbeatable if, in addition:
+ /// - X is not already involved in a match.
+ /// - Its addend is exactly that of R.
+ static FindBestPredicateResult isMatchingReloc(const MipsRelocationEntry &X,
+                                                const ELFRelocationEntry &R,
+                                                unsigned MatchingType) {
+   // Wrong low-part type or a different symbol can never match.
+   if (X.R.Type != MatchingType || X.R.OriginalSymbol != R.OriginalSymbol)
+     return FindBest_NoMatch;
+
+   if (!X.Matched && X.R.OriginalAddend == R.OriginalAddend)
+     return FindBest_PerfectMatch;
+
+   return X.R.OriginalAddend >= R.OriginalAddend ? FindBest_Match
+                                                  : FindBest_NoMatch;
+ }
+
+ /// Decide whether \p Candidate is a better match than \p PreviousBest.
+ ///
+ /// The smaller addend wins. When both addends are equal, the candidate wins
+ /// only if it is not yet involved in a match while the previous best is.
+ ///
+ /// \returns true if \p Candidate is the better match.
+ static bool compareMatchingRelocs(const MipsRelocationEntry &Candidate,
+                                   const MipsRelocationEntry &PreviousBest) {
+   const auto CandidateAddend = Candidate.R.OriginalAddend;
+   const auto BestAddend = PreviousBest.R.OriginalAddend;
+
+   if (CandidateAddend == BestAddend)
+     return !Candidate.Matched && PreviousBest.Matched;
+   return CandidateAddend < BestAddend;
+ }
+
+#ifndef NDEBUG
+ /// Debug helper: write every element of \p Relocs to dbgs(), one per line,
+ /// each preceded by \p Prefix.
+ template <class Container>
+ static void dumpRelocs(const char *Prefix, const Container &Relocs) {
+   for (const auto &Entry : Relocs) {
+     dbgs() << Prefix << Entry << "\n";
+   }
+ }
+#endif
+
+} // end anonymous namespace
+
+ // Relocation addends are enabled exactly when _isN64 is set: the same flag is
+ // passed for both the HasRelocationAddend and the IsN64 base-class arguments.
+ // IsLittleEndian is accepted but not used here.
+ MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
+                                          bool _isN64, bool IsLittleEndian)
+     : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
+                               /*HasRelocationAddend=*/_isN64,
+                               /*IsN64=*/_isN64) {}
+
+ MipsELFObjectWriter::~MipsELFObjectWriter() = default; // Nothing to release.
+
+ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ // Map the fixup kind to an ELF relocation type. Ctx and Target are not read.
+ unsigned Kind = (unsigned)Fixup.getKind();
+
+ switch (Kind) { // Kinds that are valid both PC-relative and absolute.
+ case Mips::fixup_Mips_NONE:
+ return ELF::R_MIPS_NONE;
+ case Mips::fixup_Mips_16:
+ case FK_Data_2:
+ return IsPCRel ? ELF::R_MIPS_PC16 : ELF::R_MIPS_16;
+ case Mips::fixup_Mips_32:
+ case FK_Data_4:
+ return IsPCRel ? ELF::R_MIPS_PC32 : ELF::R_MIPS_32;
+ }
+
+ if (IsPCRel) { // Remaining PC-relative kinds; anything else is a bug.
+ switch (Kind) {
+ case Mips::fixup_Mips_Branch_PCRel:
+ case Mips::fixup_Mips_PC16:
+ return ELF::R_MIPS_PC16;
+ case Mips::fixup_MICROMIPS_PC7_S1:
+ return ELF::R_MICROMIPS_PC7_S1;
+ case Mips::fixup_MICROMIPS_PC10_S1:
+ return ELF::R_MICROMIPS_PC10_S1;
+ case Mips::fixup_MICROMIPS_PC16_S1:
+ return ELF::R_MICROMIPS_PC16_S1;
+ case Mips::fixup_MICROMIPS_PC26_S1:
+ return ELF::R_MICROMIPS_PC26_S1;
+ case Mips::fixup_MICROMIPS_PC19_S2:
+ return ELF::R_MICROMIPS_PC19_S2;
+ case Mips::fixup_MICROMIPS_PC18_S3:
+ return ELF::R_MICROMIPS_PC18_S3;
+ case Mips::fixup_MICROMIPS_PC21_S1:
+ return ELF::R_MICROMIPS_PC21_S1;
+ case Mips::fixup_MIPS_PC19_S2:
+ return ELF::R_MIPS_PC19_S2;
+ case Mips::fixup_MIPS_PC18_S3:
+ return ELF::R_MIPS_PC18_S3;
+ case Mips::fixup_MIPS_PC21_S2:
+ return ELF::R_MIPS_PC21_S2;
+ case Mips::fixup_MIPS_PC26_S2:
+ return ELF::R_MIPS_PC26_S2;
+ case Mips::fixup_MIPS_PCHI16:
+ return ELF::R_MIPS_PCHI16;
+ case Mips::fixup_MIPS_PCLO16:
+ return ELF::R_MIPS_PCLO16;
+ }
+
+ llvm_unreachable("invalid PC-relative fixup kind!");
+ }
+
+ switch (Kind) { // Remaining kinds, reached only when IsPCRel is false.
+ case Mips::fixup_Mips_64:
+ case FK_Data_8:
+ return ELF::R_MIPS_64;
+ case FK_DTPRel_4:
+ return ELF::R_MIPS_TLS_DTPREL32;
+ case FK_DTPRel_8:
+ return ELF::R_MIPS_TLS_DTPREL64;
+ case FK_TPRel_4:
+ return ELF::R_MIPS_TLS_TPREL32;
+ case FK_TPRel_8:
+ return ELF::R_MIPS_TLS_TPREL64;
+ case FK_GPRel_4:
+ if (isN64()) { // Compound type built from three sub-types via setRType/2/3.
+ unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+ Type = setRType((unsigned)ELF::R_MIPS_GPREL32, Type);
+ Type = setRType2((unsigned)ELF::R_MIPS_64, Type);
+ Type = setRType3((unsigned)ELF::R_MIPS_NONE, Type);
+ return Type;
+ }
+ return ELF::R_MIPS_GPREL32;
+ case Mips::fixup_Mips_GPREL16:
+ return ELF::R_MIPS_GPREL16;
+ case Mips::fixup_Mips_26:
+ return ELF::R_MIPS_26;
+ case Mips::fixup_Mips_CALL16:
+ return ELF::R_MIPS_CALL16;
+ case Mips::fixup_Mips_GOT:
+ return ELF::R_MIPS_GOT16;
+ case Mips::fixup_Mips_HI16:
+ return ELF::R_MIPS_HI16;
+ case Mips::fixup_Mips_LO16:
+ return ELF::R_MIPS_LO16;
+ case Mips::fixup_Mips_TLSGD:
+ return ELF::R_MIPS_TLS_GD;
+ case Mips::fixup_Mips_GOTTPREL:
+ return ELF::R_MIPS_TLS_GOTTPREL;
+ case Mips::fixup_Mips_TPREL_HI:
+ return ELF::R_MIPS_TLS_TPREL_HI16;
+ case Mips::fixup_Mips_TPREL_LO:
+ return ELF::R_MIPS_TLS_TPREL_LO16;
+ case Mips::fixup_Mips_TLSLDM:
+ return ELF::R_MIPS_TLS_LDM;
+ case Mips::fixup_Mips_DTPREL_HI:
+ return ELF::R_MIPS_TLS_DTPREL_HI16;
+ case Mips::fixup_Mips_DTPREL_LO:
+ return ELF::R_MIPS_TLS_DTPREL_LO16;
+ case Mips::fixup_Mips_GOT_PAGE:
+ return ELF::R_MIPS_GOT_PAGE;
+ case Mips::fixup_Mips_GOT_OFST:
+ return ELF::R_MIPS_GOT_OFST;
+ case Mips::fixup_Mips_GOT_DISP:
+ return ELF::R_MIPS_GOT_DISP;
+ case Mips::fixup_Mips_GPOFF_HI: { // Compound: GPREL16, then SUB, then HI16.
+ unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+ Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+ Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+ Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type);
+ return Type;
+ }
+ case Mips::fixup_Mips_GPOFF_LO: { // Compound: GPREL16, then SUB, then LO16.
+ unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+ Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+ Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+ Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type);
+ return Type;
+ }
+ case Mips::fixup_Mips_HIGHER:
+ return ELF::R_MIPS_HIGHER;
+ case Mips::fixup_Mips_HIGHEST:
+ return ELF::R_MIPS_HIGHEST;
+ case Mips::fixup_Mips_SUB:
+ return ELF::R_MIPS_SUB;
+ case Mips::fixup_Mips_GOT_HI16:
+ return ELF::R_MIPS_GOT_HI16;
+ case Mips::fixup_Mips_GOT_LO16:
+ return ELF::R_MIPS_GOT_LO16;
+ case Mips::fixup_Mips_CALL_HI16:
+ return ELF::R_MIPS_CALL_HI16;
+ case Mips::fixup_Mips_CALL_LO16:
+ return ELF::R_MIPS_CALL_LO16;
+ case Mips::fixup_MICROMIPS_26_S1:
+ return ELF::R_MICROMIPS_26_S1;
+ case Mips::fixup_MICROMIPS_HI16:
+ return ELF::R_MICROMIPS_HI16;
+ case Mips::fixup_MICROMIPS_LO16:
+ return ELF::R_MICROMIPS_LO16;
+ case Mips::fixup_MICROMIPS_GOT16:
+ return ELF::R_MICROMIPS_GOT16;
+ case Mips::fixup_MICROMIPS_CALL16:
+ return ELF::R_MICROMIPS_CALL16;
+ case Mips::fixup_MICROMIPS_GOT_DISP:
+ return ELF::R_MICROMIPS_GOT_DISP;
+ case Mips::fixup_MICROMIPS_GOT_PAGE:
+ return ELF::R_MICROMIPS_GOT_PAGE;
+ case Mips::fixup_MICROMIPS_GOT_OFST:
+ return ELF::R_MICROMIPS_GOT_OFST;
+ case Mips::fixup_MICROMIPS_TLS_GD:
+ return ELF::R_MICROMIPS_TLS_GD;
+ case Mips::fixup_MICROMIPS_TLS_LDM:
+ return ELF::R_MICROMIPS_TLS_LDM;
+ case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
+ return ELF::R_MICROMIPS_TLS_DTPREL_HI16;
+ case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
+ return ELF::R_MICROMIPS_TLS_DTPREL_LO16;
+ case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
+ return ELF::R_MICROMIPS_TLS_TPREL_HI16;
+ case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
+ return ELF::R_MICROMIPS_TLS_TPREL_LO16;
+ case Mips::fixup_MICROMIPS_SUB:
+ return ELF::R_MICROMIPS_SUB;
+ }
+
+ llvm_unreachable("invalid fixup kind!");
+ }
+
+ /// Sort relocation table entries by offset except where another order is
+ /// required by the MIPS ABI.
+ ///
+ /// MIPS has a few relocations that have an AHL component in the expression used
+ /// to evaluate them. This AHL component is an addend with the same number of
+ /// bits as a symbol value but not all of our ABIs are able to supply a
+ /// sufficiently sized addend in a single relocation.
+ ///
+ /// The O32 ABI for example, uses REL relocations which store the addend in the
+ /// section data. All the relocations with AHL components affect 16-bit fields
+ /// so the addend for a single relocation is limited to 16 bits. This ABI
+ /// resolves the limitation by linking relocations (e.g. R_MIPS_HI16 and
+ /// R_MIPS_LO16) and distributing the addend between the linked relocations. The
+ /// ABI mandates that such relocations must be next to each other in a
+ /// particular order (e.g. R_MIPS_HI16 must be immediately followed by a
+ /// matching R_MIPS_LO16) but the rule is less strict in practice.
+ ///
+ /// The de facto standard is lenient in the following ways:
+ /// - 'Immediately following' does not refer to the next relocation entry but
+ /// the next matching relocation.
+ /// - There may be multiple high part relocations for one low part relocation.
+ /// - There may be multiple low part relocations for one high part relocation.
+ /// - The AHL addend in each part does not have to be exactly equal as long as
+ /// the difference does not affect the carry bit from bit 15 into 16. This is
+ /// to allow, for example, the use of %lo(foo) and %lo(foo+4) when loading
+ /// both halves of a long long.
+ ///
+ /// See getMatchingLoType() for a description of which high part relocations
+ /// match which low part relocations. One particular thing to note is that
+ /// R_MIPS_GOT16 and similar only have AHL addends if they refer to local
+ /// symbols.
+ ///
+ /// It should also be noted that this function is not affected by whether
+ /// the symbol was kept or rewritten into a section-relative equivalent. We
+ /// always match using the expressions from the source.
+ void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
+ std::vector<ELFRelocationEntry> &Relocs) {
+
+ // We do not need to sort the relocation table for RELA relocations which
+ // N32/N64 uses as the relocation addend contains the value we require,
+ // rather than it being split across a pair of relocations.
+ if (hasRelocationAddend())
+ return;
+
+ if (Relocs.size() < 2)
+ return;
+
+ // Sort relocations by the address they are applied to.
+ std::sort(Relocs.begin(), Relocs.end(),
+ [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
+ return A.Offset < B.Offset;
+ });
+
+ std::list<MipsRelocationEntry> Sorted; // Starts with the immobile relocs; high parts are inserted below.
+ std::list<ELFRelocationEntry> Remainder; // High parts still waiting to be placed.
+
+ DEBUG(dumpRelocs("R: ", Relocs));
+
+ // Separate the movable relocations (AHL relocations using the high bits) from
+ // the immobile relocations (everything else). This does not preserve high/low
+ // matches that already existed in the input.
+ copy_if_else(Relocs.begin(), Relocs.end(), std::back_inserter(Remainder),
+ std::back_inserter(Sorted), [](const ELFRelocationEntry &Reloc) {
+ return getMatchingLoType(Reloc) != ELF::R_MIPS_NONE;
+ });
+
+ for (auto &R : Remainder) {
+ DEBUG(dbgs() << "Matching: " << R << "\n");
+
+ unsigned MatchingType = getMatchingLoType(R);
+ assert(MatchingType != ELF::R_MIPS_NONE &&
+ "Wrong list for reloc that doesn't need a match");
+
+ // Find the best matching relocation for the current high part.
+ // See isMatchingReloc for a description of a matching relocation and
+ // compareMatchingRelocs for a description of what 'best' means.
+ auto InsertionPoint =
+ find_best(Sorted.begin(), Sorted.end(),
+ [&R, &MatchingType](const MipsRelocationEntry &X) {
+ return isMatchingReloc(X, R, MatchingType);
+ },
+ compareMatchingRelocs);
+
+ // If we matched then insert the high part in front of the match and mark
+ // both relocations as being involved in a match. We only mark the high
+ // part for cosmetic reasons in the debug output.
+ //
+ // If we failed to find a match then the high part is orphaned. This is not
+ // permitted since the relocation cannot be evaluated without knowing the
+ // carry-in. We can sometimes handle this using a matching low part that is
+ // already used in a match but we already cover that case in
+ // isMatchingReloc and compareMatchingRelocs. For the remaining cases we
+ // should insert the high part at the end of the list. This will cause the
+ // linker to fail but the alternative is to cause the linker to bind the
+ // high part to a semi-matching low part and silently calculate the wrong
+ // value. Unfortunately we have no means to warn the user that we did this
+ // so leave it up to the linker to complain about it.
+ if (InsertionPoint != Sorted.end())
+ InsertionPoint->Matched = true;
+ Sorted.insert(InsertionPoint, R)->Matched = true; // Inserting at end() appends the orphan.
+ }
+
+ DEBUG(dumpRelocs("S: ", Sorted));
+
+ assert(Relocs.size() == Sorted.size() && "Some relocs were not consumed");
+
+ // Overwrite the original vector with the sorted elements. The caller expects
+ // them in reverse order.
+ unsigned CopyTo = 0;
+ for (const auto &R : reverse(Sorted))
+ Relocs[CopyTo++] = R.R;
+ }
+
+ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+ unsigned Type) const {
+ // If it's a compound relocation for N64 then we need the relocation if any
+ // sub-relocation needs it. Each of the three low bytes is checked in turn.
+ if (!isUInt<8>(Type))
+ return needsRelocateWithSymbol(Sym, Type & 0xff) ||
+ needsRelocateWithSymbol(Sym, (Type >> 8) & 0xff) ||
+ needsRelocateWithSymbol(Sym, (Type >> 16) & 0xff);
+
+ switch (Type) {
+ default:
+ errs() << Type << "\n"; // Print the unknown type before the unreachable below.
+ llvm_unreachable("Unexpected relocation");
+ return true;
+
+ // This relocation doesn't affect the section data.
+ case ELF::R_MIPS_NONE:
+ return false;
+
+ // On REL ABIs (e.g. O32), these relocations form pairs. The pairing is done
+ // by the static linker by matching the symbol and offset.
+ // We only see one relocation at a time but it's still safe to relocate with
+ // the section so long as both relocations make the same decision.
+ //
+ // Some older linkers may require the symbol for particular cases. Such cases
+ // are not supported yet but can be added as required.
+ case ELF::R_MIPS_GOT16:
+ case ELF::R_MIPS16_GOT16:
+ case ELF::R_MICROMIPS_GOT16:
+ case ELF::R_MIPS_HI16:
+ case ELF::R_MIPS16_HI16:
+ case ELF::R_MICROMIPS_HI16:
+ case ELF::R_MIPS_LO16:
+ case ELF::R_MIPS16_LO16:
+ case ELF::R_MICROMIPS_LO16:
+ // FIXME: It should be safe to return false for the STO_MIPS_MICROMIPS but
+ // we neglect to handle the adjustment to the LSB of the addend that
+ // it causes in applyFixup() and similar.
+ if (cast<MCSymbolELF>(Sym).getOther() & ELF::STO_MIPS_MICROMIPS)
+ return true;
+ return false;
+
+ case ELF::R_MIPS_GOT_PAGE:
+ case ELF::R_MICROMIPS_GOT_PAGE:
+ case ELF::R_MIPS_GOT_OFST:
+ case ELF::R_MICROMIPS_GOT_OFST:
+ case ELF::R_MIPS_16:
+ case ELF::R_MIPS_32:
+ case ELF::R_MIPS_GPREL32:
+ if (cast<MCSymbolELF>(Sym).getOther() & ELF::STO_MIPS_MICROMIPS)
+ return true;
+ LLVM_FALLTHROUGH;
+ case ELF::R_MIPS_26:
+ case ELF::R_MIPS_64:
+ case ELF::R_MIPS_GPREL16:
+ case ELF::R_MIPS_PC16:
+ case ELF::R_MIPS_SUB:
+ return false;
+
+ // FIXME: Many of these relocations should probably return false but this
+ // hasn't been confirmed to be safe yet.
+ case ELF::R_MIPS_REL32:
+ case ELF::R_MIPS_LITERAL:
+ case ELF::R_MIPS_CALL16:
+ case ELF::R_MIPS_SHIFT5:
+ case ELF::R_MIPS_SHIFT6:
+ case ELF::R_MIPS_GOT_DISP:
+ case ELF::R_MIPS_GOT_HI16:
+ case ELF::R_MIPS_GOT_LO16:
+ case ELF::R_MIPS_INSERT_A:
+ case ELF::R_MIPS_INSERT_B:
+ case ELF::R_MIPS_DELETE:
+ case ELF::R_MIPS_HIGHER:
+ case ELF::R_MIPS_HIGHEST:
+ case ELF::R_MIPS_CALL_HI16:
+ case ELF::R_MIPS_CALL_LO16:
+ case ELF::R_MIPS_SCN_DISP:
+ case ELF::R_MIPS_REL16:
+ case ELF::R_MIPS_ADD_IMMEDIATE:
+ case ELF::R_MIPS_PJUMP:
+ case ELF::R_MIPS_RELGOT:
+ case ELF::R_MIPS_JALR:
+ case ELF::R_MIPS_TLS_DTPMOD32:
+ case ELF::R_MIPS_TLS_DTPREL32:
+ case ELF::R_MIPS_TLS_DTPMOD64:
+ case ELF::R_MIPS_TLS_DTPREL64:
+ case ELF::R_MIPS_TLS_GD:
+ case ELF::R_MIPS_TLS_LDM:
+ case ELF::R_MIPS_TLS_DTPREL_HI16:
+ case ELF::R_MIPS_TLS_DTPREL_LO16:
+ case ELF::R_MIPS_TLS_GOTTPREL:
+ case ELF::R_MIPS_TLS_TPREL32:
+ case ELF::R_MIPS_TLS_TPREL64:
+ case ELF::R_MIPS_TLS_TPREL_HI16:
+ case ELF::R_MIPS_TLS_TPREL_LO16:
+ case ELF::R_MIPS_GLOB_DAT:
+ case ELF::R_MIPS_PC21_S2:
+ case ELF::R_MIPS_PC26_S2:
+ case ELF::R_MIPS_PC18_S3:
+ case ELF::R_MIPS_PC19_S2:
+ case ELF::R_MIPS_PCHI16:
+ case ELF::R_MIPS_PCLO16:
+ case ELF::R_MIPS_COPY:
+ case ELF::R_MIPS_JUMP_SLOT:
+ case ELF::R_MIPS_NUM:
+ case ELF::R_MIPS_PC32:
+ case ELF::R_MIPS_EH:
+ case ELF::R_MICROMIPS_26_S1:
+ case ELF::R_MICROMIPS_GPREL16:
+ case ELF::R_MICROMIPS_LITERAL:
+ case ELF::R_MICROMIPS_PC7_S1:
+ case ELF::R_MICROMIPS_PC10_S1:
+ case ELF::R_MICROMIPS_PC16_S1:
+ case ELF::R_MICROMIPS_CALL16:
+ case ELF::R_MICROMIPS_GOT_DISP:
+ case ELF::R_MICROMIPS_GOT_HI16:
+ case ELF::R_MICROMIPS_GOT_LO16:
+ case ELF::R_MICROMIPS_SUB:
+ case ELF::R_MICROMIPS_HIGHER:
+ case ELF::R_MICROMIPS_HIGHEST:
+ case ELF::R_MICROMIPS_CALL_HI16:
+ case ELF::R_MICROMIPS_CALL_LO16:
+ case ELF::R_MICROMIPS_SCN_DISP:
+ case ELF::R_MICROMIPS_JALR:
+ case ELF::R_MICROMIPS_HI0_LO16:
+ case ELF::R_MICROMIPS_TLS_GD:
+ case ELF::R_MICROMIPS_TLS_LDM:
+ case ELF::R_MICROMIPS_TLS_DTPREL_HI16:
+ case ELF::R_MICROMIPS_TLS_DTPREL_LO16:
+ case ELF::R_MICROMIPS_TLS_GOTTPREL:
+ case ELF::R_MICROMIPS_TLS_TPREL_HI16:
+ case ELF::R_MICROMIPS_TLS_TPREL_LO16:
+ case ELF::R_MICROMIPS_GPREL7_S2:
+ case ELF::R_MICROMIPS_PC23_S2:
+ case ELF::R_MICROMIPS_PC21_S1:
+ case ELF::R_MICROMIPS_PC26_S1:
+ case ELF::R_MICROMIPS_PC18_S3:
+ case ELF::R_MICROMIPS_PC19_S2:
+ return true;
+
+ // FIXME: Many of these should probably return false but MIPS16 isn't
+ // supported by the integrated assembler.
+ case ELF::R_MIPS16_26:
+ case ELF::R_MIPS16_GPREL:
+ case ELF::R_MIPS16_CALL16:
+ case ELF::R_MIPS16_TLS_GD:
+ case ELF::R_MIPS16_TLS_LDM:
+ case ELF::R_MIPS16_TLS_DTPREL_HI16:
+ case ELF::R_MIPS16_TLS_DTPREL_LO16:
+ case ELF::R_MIPS16_TLS_GOTTPREL:
+ case ELF::R_MIPS16_TLS_TPREL_HI16:
+ case ELF::R_MIPS16_TLS_TPREL_LO16:
+ llvm_unreachable("Unsupported MIPS16 relocation");
+ return true;
+ }
+ }
+
+ /// Builds an ELF object writer for MIPS that writes to \p OS. \p Is64Bit is
+ /// forwarded as both the 64-bit flag and the N64 flag of the target writer.
+ MCObjectWriter *llvm::createMipsELFObjectWriter(raw_pwrite_stream &OS,
+                                                 uint8_t OSABI,
+                                                 bool IsLittleEndian,
+                                                 bool Is64Bit) {
+   MCELFObjectTargetWriter *TargetWriter = new MipsELFObjectWriter(
+       Is64Bit, OSABI, /*_isN64=*/Is64Bit, IsLittleEndian);
+   return createELFObjectWriter(TargetWriter, OS, IsLittleEndian);
+ }
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
new file mode 100644
index 000000000000..e7d687e89a8a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -0,0 +1,82 @@
+//===-------- MipsELFStreamer.cpp - ELF Object Output ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsELFStreamer.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+ // Emits the instruction through the base streamer, records every register
+ // operand in the register-info record, then handles the pending labels.
+ void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
+                                       const MCSubtargetInfo &STI) {
+   MCELFStreamer::EmitInstruction(Inst, STI);
+
+   const MCRegisterInfo *MCRegInfo = getContext().getRegisterInfo();
+
+   for (unsigned Idx = 0; Idx < Inst.getNumOperands(); ++Idx) {
+     const MCOperand &Operand = Inst.getOperand(Idx);
+     if (Operand.isReg())
+       RegInfoRecord->SetPhysRegUsed(Operand.getReg(), MCRegInfo);
+   }
+
+   createPendingLabelRelocs();
+ }
+
+ // Flags every pending label as microMIPS when microMIPS is enabled on the
+ // target streamer. The pending list is emptied in either case.
+ void MipsELFStreamer::createPendingLabelRelocs() {
+   auto *TargetStreamer =
+       static_cast<MipsTargetELFStreamer *>(getTargetStreamer());
+
+   // FIXME: Also mark labels when in MIPS16 mode.
+   if (TargetStreamer->isMicroMipsEnabled()) {
+     for (auto *Pending : Labels) {
+       auto *ELFLabel = cast<MCSymbolELF>(Pending);
+       getAssembler().registerSymbol(*ELFLabel);
+       ELFLabel->setOther(ELF::STO_MIPS_MICROMIPS);
+     }
+   }
+
+   Labels.clear();
+ }
+
+ void MipsELFStreamer::EmitLabel(MCSymbol *Symbol) {
+ MCELFStreamer::EmitLabel(Symbol);
+ Labels.push_back(Symbol); // Kept until createPendingLabelRelocs() or a clear().
+ }
+
+ void MipsELFStreamer::SwitchSection(MCSection *Section,
+ const MCExpr *Subsection) {
+ MCELFStreamer::SwitchSection(Section, Subsection);
+ Labels.clear(); // Pending labels are dropped without being marked.
+ }
+
+ void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+ SMLoc Loc) {
+ MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+ Labels.clear(); // Pending labels are dropped without being marked.
+ }
+
+ // Asks each stored option record, in insertion order, to emit itself.
+ void MipsELFStreamer::EmitMipsOptionRecords() {
+   for (const auto &Record : MipsOptionRecords) {
+     Record->EmitMipsOptionRecord();
+   }
+ }
+
+ MCELFStreamer *llvm::createMipsELFStreamer(MCContext &Context,
+ MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll) {
+ return new MipsELFStreamer(Context, MAB, OS, Emitter); // NOTE(review): RelaxAll is not used here.
+ }
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
new file mode 100644
index 000000000000..a241cdebdcc8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -0,0 +1,76 @@
+//===-------- MipsELFStreamer.h - ELF Object Output -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
+
+#include "MipsOptionRecord.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include <memory>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+ class MipsELFStreamer : public MCELFStreamer {
+ SmallVector<std::unique_ptr<MipsOptionRecord>, 8> MipsOptionRecords; // Owns the option records.
+ MipsRegInfoRecord *RegInfoRecord; // Non-owning; the object is owned by MipsOptionRecords.
+ SmallVector<MCSymbol*, 4> Labels; // Labels emitted since the list was last cleared.
+
+
+ public:
+ MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter)
+ : MCELFStreamer(Context, MAB, OS, Emitter) {
+
+ RegInfoRecord = new MipsRegInfoRecord(this, Context);
+ MipsOptionRecords.push_back(
+ std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord)); // Ownership moves into the vector.
+ }
+
+ /// Overriding this function allows us to add arbitrary behaviour when the
+ /// \p Inst is emitted. For example, we can inspect the operands and
+ /// gather sufficient information that allows us to reason about the register
+ /// usage for the translation unit.
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+
+ /// Overriding this function allows us to record all labels that should be
+ /// marked as microMIPS. Based on this data, marking is done in
+ /// EmitInstruction.
+ void EmitLabel(MCSymbol *Symbol) override;
+
+ /// Overriding this function allows us to dismiss all labels that are
+ /// candidates for marking as microMIPS when .section directive is processed.
+ void SwitchSection(MCSection *Section,
+ const MCExpr *Subsection = nullptr) override;
+
+ /// Overriding this function allows us to dismiss all labels that are
+ /// candidates for marking as microMIPS when .word directive is emitted.
+ void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
+
+ /// Emits all the option records stored up until the point it's called.
+ void EmitMipsOptionRecords();
+
+ /// Mark pending labels as microMIPS, if necessary for the subtarget, then forget them.
+ void createPendingLabelRelocs();
+ };
+
+MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll);
+} // namespace llvm.
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
new file mode 100644
index 000000000000..149296212eca
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -0,0 +1,224 @@
+//===-- MipsFixupKinds.h - Mips Specific Fixup Entries ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSFIXUPKINDS_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Mips {
+ // Although most of the current fixup types reflect a unique relocation
+ // one can have multiple fixup types for a given relocation and thus need
+ // to be uniquely named.
+ //
+ // This table *must* be in the same order as
+ // MCFixupKindInfo Infos[Mips::NumTargetFixupKinds]
+ // in MipsAsmBackend.cpp.
+ //
+ enum Fixups {
+ // Branch fixups resulting in R_MIPS_NONE.
+ fixup_Mips_NONE = FirstTargetFixupKind,
+
+ // Branch fixups resulting in R_MIPS_16.
+ fixup_Mips_16,
+
+ // Pure 32 bit data fixup resulting in - R_MIPS_32.
+ fixup_Mips_32,
+
+ // Full 32 bit data relative data fixup resulting in - R_MIPS_REL32.
+ fixup_Mips_REL32,
+
+ // Jump 26 bit fixup resulting in - R_MIPS_26.
+ fixup_Mips_26,
+
+ // Pure upper 16 bit fixup resulting in - R_MIPS_HI16.
+ fixup_Mips_HI16,
+
+ // Pure lower 16 bit fixup resulting in - R_MIPS_LO16.
+ fixup_Mips_LO16,
+
+ // 16 bit fixup for GP offset resulting in - R_MIPS_GPREL16.
+ fixup_Mips_GPREL16,
+
+ // 16 bit literal fixup resulting in - R_MIPS_LITERAL.
+ fixup_Mips_LITERAL,
+
+ // Symbol fixup resulting in - R_MIPS_GOT16.
+ fixup_Mips_GOT,
+
+ // PC relative branch fixup resulting in - R_MIPS_PC16.
+ fixup_Mips_PC16,
+
+ // resulting in - R_MIPS_CALL16.
+ fixup_Mips_CALL16,
+
+ // resulting in - R_MIPS_GPREL32.
+ fixup_Mips_GPREL32,
+
+ // resulting in - R_MIPS_SHIFT5.
+ fixup_Mips_SHIFT5,
+
+ // resulting in - R_MIPS_SHIFT6.
+ fixup_Mips_SHIFT6,
+
+ // Pure 64 bit data fixup resulting in - R_MIPS_64.
+ fixup_Mips_64,
+
+ // resulting in - R_MIPS_TLS_GD.
+ fixup_Mips_TLSGD,
+
+ // resulting in - R_MIPS_TLS_GOTTPREL.
+ fixup_Mips_GOTTPREL,
+
+ // resulting in - R_MIPS_TLS_TPREL_HI16.
+ fixup_Mips_TPREL_HI,
+
+ // resulting in - R_MIPS_TLS_TPREL_LO16.
+ fixup_Mips_TPREL_LO,
+
+ // resulting in - R_MIPS_TLS_LDM.
+ fixup_Mips_TLSLDM,
+
+ // resulting in - R_MIPS_TLS_DTPREL_HI16.
+ fixup_Mips_DTPREL_HI,
+
+ // resulting in - R_MIPS_TLS_DTPREL_LO16.
+ fixup_Mips_DTPREL_LO,
+
+ // PC relative branch fixup resulting in - R_MIPS_PC16
+ fixup_Mips_Branch_PCRel,
+
+ // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16
+ fixup_Mips_GPOFF_HI,
+
+ // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16
+ fixup_Mips_GPOFF_LO,
+
+ // resulting in - R_MIPS_GOT_PAGE
+ fixup_Mips_GOT_PAGE,
+
+ // resulting in - R_MIPS_GOT_OFST
+ fixup_Mips_GOT_OFST,
+
+ // resulting in - R_MIPS_GOT_DISP
+ fixup_Mips_GOT_DISP,
+
+ // resulting in - R_MIPS_HIGHER
+ fixup_Mips_HIGHER,
+
+ // resulting in - R_MIPS_HIGHEST
+ fixup_Mips_HIGHEST,
+
+ // resulting in - R_MIPS_GOT_HI16
+ fixup_Mips_GOT_HI16,
+
+ // resulting in - R_MIPS_GOT_LO16
+ fixup_Mips_GOT_LO16,
+
+ // resulting in - R_MIPS_CALL_HI16
+ fixup_Mips_CALL_HI16,
+
+ // resulting in - R_MIPS_CALL_LO16
+ fixup_Mips_CALL_LO16,
+
+ // resulting in - R_MIPS_PC18_S3
+ fixup_MIPS_PC18_S3,
+
+ // resulting in - R_MIPS_PC19_S2
+ fixup_MIPS_PC19_S2,
+
+ // resulting in - R_MIPS_PC21_S2
+ fixup_MIPS_PC21_S2,
+
+ // resulting in - R_MIPS_PC26_S2
+ fixup_MIPS_PC26_S2,
+
+ // resulting in - R_MIPS_PCHI16
+ fixup_MIPS_PCHI16,
+
+ // resulting in - R_MIPS_PCLO16
+ fixup_MIPS_PCLO16,
+
+ // resulting in - R_MICROMIPS_26_S1
+ fixup_MICROMIPS_26_S1,
+
+ // resulting in - R_MICROMIPS_HI16
+ fixup_MICROMIPS_HI16,
+
+ // resulting in - R_MICROMIPS_LO16
+ fixup_MICROMIPS_LO16,
+
+ // resulting in - R_MICROMIPS_GOT16
+ fixup_MICROMIPS_GOT16,
+
+ // resulting in - R_MICROMIPS_PC7_S1
+ fixup_MICROMIPS_PC7_S1,
+
+ // resulting in - R_MICROMIPS_PC10_S1
+ fixup_MICROMIPS_PC10_S1,
+
+ // resulting in - R_MICROMIPS_PC16_S1
+ fixup_MICROMIPS_PC16_S1,
+
+ // resulting in - R_MICROMIPS_PC26_S1
+ fixup_MICROMIPS_PC26_S1,
+
+ // resulting in - R_MICROMIPS_PC19_S2
+ fixup_MICROMIPS_PC19_S2,
+
+ // resulting in - R_MICROMIPS_PC18_S3
+ fixup_MICROMIPS_PC18_S3,
+
+ // resulting in - R_MICROMIPS_PC21_S1
+ fixup_MICROMIPS_PC21_S1,
+
+ // resulting in - R_MICROMIPS_CALL16
+ fixup_MICROMIPS_CALL16,
+
+ // resulting in - R_MICROMIPS_GOT_DISP
+ fixup_MICROMIPS_GOT_DISP,
+
+ // resulting in - R_MICROMIPS_GOT_PAGE
+ fixup_MICROMIPS_GOT_PAGE,
+
+ // resulting in - R_MICROMIPS_GOT_OFST
+ fixup_MICROMIPS_GOT_OFST,
+
+ // resulting in - R_MICROMIPS_TLS_GD
+ fixup_MICROMIPS_TLS_GD,
+
+ // resulting in - R_MICROMIPS_TLS_LDM
+ fixup_MICROMIPS_TLS_LDM,
+
+ // resulting in - R_MICROMIPS_TLS_DTPREL_HI16
+ fixup_MICROMIPS_TLS_DTPREL_HI16,
+
+ // resulting in - R_MICROMIPS_TLS_DTPREL_LO16
+ fixup_MICROMIPS_TLS_DTPREL_LO16,
+
+ // resulting in - R_MICROMIPS_TLS_TPREL_HI16
+ fixup_MICROMIPS_TLS_TPREL_HI16,
+
+ // resulting in - R_MICROMIPS_TLS_TPREL_LO16
+ fixup_MICROMIPS_TLS_TPREL_LO16,
+
+ // resulting in - R_MIPS_SUB/R_MICROMIPS_SUB
+ fixup_Mips_SUB,
+ fixup_MICROMIPS_SUB,
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+ };
+} // namespace Mips
+} // namespace llvm
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
new file mode 100644
index 000000000000..a44a35f49e5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -0,0 +1,66 @@
+//===-- MipsMCAsmInfo.cpp - Mips Asm Properties ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MipsMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void MipsMCAsmInfo::anchor() { }
+
+MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
+ if ((TheTriple.getArch() == Triple::mips) ||
+ (TheTriple.getArch() == Triple::mips64))
+ IsLittleEndian = false;
+
+ if ((TheTriple.getArch() == Triple::mips64el) ||
+ (TheTriple.getArch() == Triple::mips64)) {
+ PointerSize = CalleeSaveStackSlotSize = 8;
+ }
+
+ // FIXME: This condition isn't quite right but it's the best we can do until
+ // this object can identify the ABI. It will misbehave when using O32
+ // on a mips64*-* triple.
+ if ((TheTriple.getArch() == Triple::mipsel) ||
+ (TheTriple.getArch() == Triple::mips)) {
+ PrivateGlobalPrefix = "$";
+ PrivateLabelPrefix = "$";
+ }
+
+ AlignmentIsInBytes = false;
+ Data16bitsDirective = "\t.2byte\t";
+ Data32bitsDirective = "\t.4byte\t";
+ Data64bitsDirective = "\t.8byte\t";
+ CommentString = "#";
+ ZeroDirective = "\t.space\t";
+ GPRel32Directive = "\t.gpword\t";
+ GPRel64Directive = "\t.gpdword\t";
+ DTPRel32Directive = "\t.dtprelword\t";
+ DTPRel64Directive = "\t.dtpreldword\t";
+ TPRel32Directive = "\t.tprelword\t";
+ TPRel64Directive = "\t.tpreldword\t";
+ UseAssignmentForEHBegin = true;
+ SupportsDebugInformation = true;
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+ DwarfRegNumForCFI = true;
+ HasMipsExpressions = true;
+
+ // Enable IAS by default for O32.
+ if (TheTriple.getArch() == Triple::mips ||
+ TheTriple.getArch() == Triple::mipsel)
+ UseIntegratedAssembler = true;
+
+ // Enable IAS by default for Debian mips64/mips64el.
+ if (TheTriple.getEnvironment() == Triple::GNUABI64)
+ UseIntegratedAssembler = true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
new file mode 100644
index 000000000000..d4ccf0349c16
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- MipsMCAsmInfo.h - Mips Asm Info ------------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class MipsMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit MipsMCAsmInfo(const Triple &TheTriple);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
new file mode 100644
index 000000000000..0614316d5ac7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -0,0 +1,1171 @@
+//===-- MipsMCCodeEmitter.cpp - Convert Mips Code to Machine Code ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "MipsMCCodeEmitter.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+#define GET_INSTRMAP_INFO
+#include "MipsGenInstrInfo.inc"
+#undef GET_INSTRMAP_INFO
+
+namespace llvm {
+MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new MipsMCCodeEmitter(MCII, Ctx, false);
+}
+
+MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new MipsMCCodeEmitter(MCII, Ctx, true);
+}
+} // End of namespace llvm.
+
+// If the D<shift> instruction has a shift amount that is greater than 31
+// (checked below), lower it to the matching D<shift>32 instruction.
+static void LowerLargeShift(MCInst& Inst) {
+
+ assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
+ assert(Inst.getOperand(2).isImm());
+
+ int64_t Shift = Inst.getOperand(2).getImm();
+ if (Shift <= 31)
+ return; // Shift amount is at most 31: leave Inst untouched.
+ Shift -= 32;
+
+ // Store the shift amount minus 32 (saminus32) back into operand 2.
+ Inst.getOperand(2).setImm(Shift);
+
+ switch (Inst.getOpcode()) {
+ default:
+ // The caller passed an opcode that has no D<shift>32 case below.
+ llvm_unreachable("Unexpected shift instruction");
+ case Mips::DSLL:
+ Inst.setOpcode(Mips::DSLL32);
+ return;
+ case Mips::DSRL:
+ Inst.setOpcode(Mips::DSRL32);
+ return;
+ case Mips::DSRA:
+ Inst.setOpcode(Mips::DSRA32);
+ return;
+ case Mips::DROTR:
+ Inst.setOpcode(Mips::DROTR32);
+ return;
+ case Mips::DSLL_MM64R6:
+ Inst.setOpcode(Mips::DSLL32_MM64R6);
+ return;
+ case Mips::DSRL_MM64R6:
+ Inst.setOpcode(Mips::DSRL32_MM64R6);
+ return;
+ case Mips::DSRA_MM64R6:
+ Inst.setOpcode(Mips::DSRA32_MM64R6);
+ return;
+ case Mips::DROTR_MM64R6:
+ Inst.setOpcode(Mips::DROTR32_MM64R6);
+ return;
+ }
+}
+
+// Pick a DINS instruction variant (DINS/DINSU/DINSM) from the pos and size operands
+static void LowerDins(MCInst& InstIn) {
+ assert(InstIn.getNumOperands() == 5 &&
+ "Invalid no. of machine operands for DINS!");
+
+ assert(InstIn.getOperand(2).isImm());
+ int64_t pos = InstIn.getOperand(2).getImm();
+ assert(InstIn.getOperand(3).isImm());
+ int64_t size = InstIn.getOperand(3).getImm();
+
+ if (size <= 32) {
+ if (pos < 32) // Plain DINS: leave the instruction untouched.
+ return;
+ // DINSU: pos >= 32, so operand 2 is rewritten to pos - 32.
+ InstIn.getOperand(2).setImm(pos - 32);
+ InstIn.setOpcode(Mips::DINSU);
+ return;
+ }
+ // DINSM: size > 32, so operand 3 is rewritten to size - 32.
+ assert(pos < 32 && "DINS cannot have both size and pos > 32");
+ InstIn.getOperand(3).setImm(size - 32);
+ InstIn.setOpcode(Mips::DINSM);
+ return;
+}
+
+// Fix a bad compact branch encoding for beqc/bnec/bovc/bnvc.
+void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
+
+ // The operand order may be illegal (e.g. !(rs < rt) for beqc/bnec); this
+ // is fixed below by swapping the two register operands.
+ unsigned RegOp0 = Inst.getOperand(0).getReg();
+ unsigned RegOp1 = Inst.getOperand(1).getReg();
+
+ unsigned Reg0 = Ctx.getRegisterInfo()->getEncodingValue(RegOp0);
+ unsigned Reg1 = Ctx.getRegisterInfo()->getEncodingValue(RegOp1);
+
+ if (Inst.getOpcode() == Mips::BNEC || Inst.getOpcode() == Mips::BEQC ||
+ Inst.getOpcode() == Mips::BNEC64 || Inst.getOpcode() == Mips::BEQC64) {
+ assert(Reg0 != Reg1 && "Instruction has bad operands ($rs == $rt)!");
+ if (Reg0 < Reg1)
+ return;
+ } else if (Inst.getOpcode() == Mips::BNVC || Inst.getOpcode() == Mips::BOVC) {
+ if (Reg0 >= Reg1)
+ return;
+ } else if (Inst.getOpcode() == Mips::BNVC_MMR6 ||
+ Inst.getOpcode() == Mips::BOVC_MMR6) {
+ if (Reg1 >= Reg0)
+ return;
+ } else
+ llvm_unreachable("Cannot rewrite unknown branch!");
+
+ Inst.getOperand(0).setReg(RegOp1);
+ Inst.getOperand(1).setReg(RegOp0);
+
+}
+
+bool MipsMCCodeEmitter::isMicroMips(const MCSubtargetInfo &STI) const {
+ return STI.getFeatureBits()[Mips::FeatureMicroMips];
+}
+
+bool MipsMCCodeEmitter::isMips32r6(const MCSubtargetInfo &STI) const {
+ return STI.getFeatureBits()[Mips::FeatureMips32r6];
+}
+
+void MipsMCCodeEmitter::EmitByte(unsigned char C, raw_ostream &OS) const {
+ OS << (char)C;
+}
+
+void MipsMCCodeEmitter::EmitInstruction(uint64_t Val, unsigned Size,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const {
+ // Output the instruction encoding in the byte order given by IsLittleEndian.
+ // Little-endian byte ordering:
+ // mips32r2: 4 | 3 | 2 | 1
+ // microMIPS: 2 | 1 | 4 | 3
+ if (IsLittleEndian && Size == 4 && isMicroMips(STI)) {
+ EmitInstruction(Val >> 16, 2, STI, OS);
+ EmitInstruction(Val, 2, STI, OS);
+ } else {
+ for (unsigned i = 0; i < Size; ++i) {
+ unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+ EmitByte((Val >> Shift) & 0xff, OS);
+ }
+ }
+}
+
+/// encodeInstruction - Emit the instruction.
+/// Size the instruction with Desc.getSize().
+void MipsMCCodeEmitter::
+encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const
+{
+
+ // Non-pseudo instructions that get changed for direct object
+ // only based on operand values.
+ // If this list of instructions gets much longer we will move
+ // the check to a function call. Until then, this is more efficient.
+ MCInst TmpInst = MI;
+ switch (MI.getOpcode()) {
+ // If shift amount is >= 32 the inst needs to be lowered further
+ case Mips::DSLL:
+ case Mips::DSRL:
+ case Mips::DSRA:
+ case Mips::DROTR:
+ case Mips::DSLL_MM64R6:
+ case Mips::DSRL_MM64R6:
+ case Mips::DSRA_MM64R6:
+ case Mips::DROTR_MM64R6:
+ LowerLargeShift(TmpInst);
+ break;
+ // Doubleword insert instruction is chosen by pos and size operands
+ case Mips::DINS:
+ LowerDins(TmpInst);
+ break;
+ // Compact branches, enforce encoding restrictions.
+ case Mips::BEQC:
+ case Mips::BNEC:
+ case Mips::BEQC64:
+ case Mips::BNEC64:
+ case Mips::BOVC:
+ case Mips::BOVC_MMR6:
+ case Mips::BNVC:
+ case Mips::BNVC_MMR6:
+ LowerCompactBranch(TmpInst);
+ }
+
+ unsigned long N = Fixups.size();
+ uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+
+ // Check for unimplemented opcodes.
+ // Unfortunately in MIPS both NOP and SLL will come in with Binary == 0
+ // so we have to special check for them.
+ unsigned Opcode = TmpInst.getOpcode();
+ if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) &&
+ (Opcode != Mips::SLL_MM) && !Binary)
+ llvm_unreachable("unimplemented opcode in encodeInstruction()");
+
+ int NewOpcode = -1;
+ if (isMicroMips(STI)) {
+ if (isMips32r6(STI)) {
+ NewOpcode = Mips::MipsR62MicroMipsR6(Opcode, Mips::Arch_micromipsr6);
+ if (NewOpcode == -1)
+ NewOpcode = Mips::Std2MicroMipsR6(Opcode, Mips::Arch_micromipsr6);
+ }
+ else
+ NewOpcode = Mips::Std2MicroMips(Opcode, Mips::Arch_micromips);
+
+ // Check whether it is a DSP instruction.
+ if (NewOpcode == -1)
+ NewOpcode = Mips::Dsp2MicroMips(Opcode, Mips::Arch_mmdsp);
+
+ if (NewOpcode != -1) {
+ if (Fixups.size() > N)
+ Fixups.pop_back();
+
+ Opcode = NewOpcode;
+ TmpInst.setOpcode (NewOpcode);
+ Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+ }
+ }
+
+ const MCInstrDesc &Desc = MCII.get(TmpInst.getOpcode());
+
+ // Get byte count of instruction
+ unsigned Size = Desc.getSize();
+ if (!Size)
+ llvm_unreachable("Desc.getSize() returns 0");
+
+ EmitInstruction(Binary, Size, STI, OS);
+}
+
+/// getBranchTargetOpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 4.
+ if (MO.isImm()) return MO.getImm() >> 2;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValue expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_Mips_PC16)));
+ return 0;
+}
+
+/// getBranchTargetOpValue1SImm16 - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValue1SImm16(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm()) return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValue expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_Mips_PC16)));
+ return 0;
+}
+
+/// getBranchTargetOpValueMMR6 - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMMR6(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm())
+ return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValueMMR6 expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-2, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_Mips_PC16)));
+ return 0;
+}
+
+/// getBranchTargetOpValueLsl2MMR6 - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueLsl2MMR6(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 4.
+ if (MO.isImm())
+ return MO.getImm() >> 2;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValueLsl2MMR6 expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_Mips_PC16)));
+ return 0;
+}
+
+/// getBranchTarget7OpValueMM - Return binary encoding of the microMIPS branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget7OpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm()) return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValueMM expects only expressions or immediates");
+
+ const MCExpr *Expr = MO.getExpr();
+ Fixups.push_back(MCFixup::create(0, Expr,
+ MCFixupKind(Mips::fixup_MICROMIPS_PC7_S1)));
+ return 0;
+}
+
+/// getBranchTargetOpValueMMPC10 - Return binary encoding of the microMIPS
+/// 10-bit branch target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMMPC10(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm()) return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValuePC10 expects only expressions or immediates");
+
+ const MCExpr *Expr = MO.getExpr();
+ Fixups.push_back(MCFixup::create(0, Expr,
+ MCFixupKind(Mips::fixup_MICROMIPS_PC10_S1)));
+ return 0;
+}
+
+/// getBranchTargetOpValueMM - Return binary encoding of the microMIPS branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm()) return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValueMM expects only expressions or immediates");
+
+ const MCExpr *Expr = MO.getExpr();
+ Fixups.push_back(MCFixup::create(0, Expr,
+ MCFixupKind(Mips::
+ fixup_MICROMIPS_PC16_S1)));
+ return 0;
+}
+
+/// getBranchTarget21OpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 4.
+ if (MO.isImm()) return MO.getImm() >> 2;
+
+ assert(MO.isExpr() &&
+ "getBranchTarget21OpValue expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_MIPS_PC21_S2)));
+ return 0;
+}
+
+/// getBranchTarget21OpValueMM - Return binary encoding of the branch
+/// target operand for microMIPS. If the machine operand requires
+/// relocation, record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget21OpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 4.
+ if (MO.isImm()) return MO.getImm() >> 2;
+
+ assert(MO.isExpr() &&
+ "getBranchTarget21OpValueMM expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_MICROMIPS_PC21_S1)));
+ return 0;
+}
+
+/// getBranchTarget26OpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 4.
+ if (MO.isImm()) return MO.getImm() >> 2;
+
+ assert(MO.isExpr() &&
+ "getBranchTarget26OpValue expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_MIPS_PC26_S2)));
+ return 0;
+}
+
+/// getBranchTarget26OpValueMM - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM(
+ const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm())
+ return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTarget26OpValueMM expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_MICROMIPS_PC26_S1)));
+ return 0;
+}
+
+/// getJumpOffset16OpValue - Return binary encoding of the jump
+/// target operand. An immediate is returned unchanged. For an expression
+/// operand no fixup is recorded yet (see TODO) and zero is returned.
+unsigned MipsMCCodeEmitter::
+getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ if (MO.isImm()) return MO.getImm();
+
+ assert(MO.isExpr() &&
+ "getJumpOffset16OpValue expects only expressions or an immediate");
+
+ // TODO: Push fixup.
+ return 0;
+}
+
+/// getJumpTargetOpValue - Return binary encoding of the jump
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ // If the destination is an immediate, divide by 4.
+ if (MO.isImm()) return MO.getImm()>>2;
+
+ assert(MO.isExpr() &&
+ "getJumpTargetOpValue expects only expressions or an immediate");
+
+ const MCExpr *Expr = MO.getExpr();
+ Fixups.push_back(MCFixup::create(0, Expr,
+ MCFixupKind(Mips::fixup_Mips_26)));
+ return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm()) return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getJumpTargetOpValueMM expects only expressions or an immediate");
+
+ const MCExpr *Expr = MO.getExpr();
+ Fixups.push_back(MCFixup::create(0, Expr,
+ MCFixupKind(Mips::fixup_MICROMIPS_26_S1)));
+ return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getUImm5Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm()) {
+ // The operand must be a multiple of 4; it is encoded as 'immediate >> 2'.
+ unsigned Res = getMachineOpValue(MI, MO, Fixups, STI);
+ assert((Res & 3) == 0);
+ return Res >> 2;
+ }
+
+ assert(MO.isExpr() &&
+ "getUImm5Lsl2Encoding expects only expressions or an immediate");
+
+ return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getSImm3Lsa2Value(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm()) {
+ int Value = MO.getImm();
+ return Value >> 2;
+ }
+
+ return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getUImm6Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm()) {
+ unsigned Value = MO.getImm();
+ return Value >> 2;
+ }
+
+ return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm()) {
+ unsigned Binary = (MO.getImm() >> 2) & 0x0000ffff;
+ return (((Binary & 0x8000) >> 7) | (Binary & 0x00ff));
+ }
+
+ return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ int64_t Res;
+
+ if (Expr->evaluateAsAbsolute(Res))
+ return Res;
+
+ MCExpr::ExprKind Kind = Expr->getKind();
+ if (Kind == MCExpr::Constant) {
+ return cast<MCConstantExpr>(Expr)->getValue();
+ }
+
+ if (Kind == MCExpr::Binary) {
+ unsigned Res = getExprOpValue(cast<MCBinaryExpr>(Expr)->getLHS(), Fixups, STI);
+ Res += getExprOpValue(cast<MCBinaryExpr>(Expr)->getRHS(), Fixups, STI);
+ return Res;
+ }
+
+ if (Kind == MCExpr::Target) {
+ const MipsMCExpr *MipsExpr = cast<MipsMCExpr>(Expr);
+
+ Mips::Fixups FixupKind = Mips::Fixups(0);
+ switch (MipsExpr->getKind()) {
+ case MipsMCExpr::MEK_None:
+ case MipsMCExpr::MEK_Special:
+ llvm_unreachable("Unhandled fixup kind!");
+ break;
+ case MipsMCExpr::MEK_CALL_HI16:
+ FixupKind = Mips::fixup_Mips_CALL_HI16;
+ break;
+ case MipsMCExpr::MEK_CALL_LO16:
+ FixupKind = Mips::fixup_Mips_CALL_LO16;
+ break;
+ case MipsMCExpr::MEK_DTPREL_HI:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
+ : Mips::fixup_Mips_DTPREL_HI;
+ break;
+ case MipsMCExpr::MEK_DTPREL_LO:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
+ : Mips::fixup_Mips_DTPREL_LO;
+ break;
+ case MipsMCExpr::MEK_GOTTPREL:
+ FixupKind = Mips::fixup_Mips_GOTTPREL;
+ break;
+ case MipsMCExpr::MEK_GOT:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
+ : Mips::fixup_Mips_GOT;
+ break;
+ case MipsMCExpr::MEK_GOT_CALL:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_CALL16
+ : Mips::fixup_Mips_CALL16;
+ break;
+ case MipsMCExpr::MEK_GOT_DISP:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_DISP
+ : Mips::fixup_Mips_GOT_DISP;
+ break;
+ case MipsMCExpr::MEK_GOT_HI16:
+ FixupKind = Mips::fixup_Mips_GOT_HI16;
+ break;
+ case MipsMCExpr::MEK_GOT_LO16:
+ FixupKind = Mips::fixup_Mips_GOT_LO16;
+ break;
+ case MipsMCExpr::MEK_GOT_PAGE:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_PAGE
+ : Mips::fixup_Mips_GOT_PAGE;
+ break;
+ case MipsMCExpr::MEK_GOT_OFST:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_OFST
+ : Mips::fixup_Mips_GOT_OFST;
+ break;
+ case MipsMCExpr::MEK_GPREL:
+ FixupKind = Mips::fixup_Mips_GPREL16;
+ break;
+ case MipsMCExpr::MEK_LO: {
+ // Check for %lo(%neg(%gp_rel(X)))
+ if (MipsExpr->isGpOff()) {
+ FixupKind = Mips::fixup_Mips_GPOFF_LO;
+ break;
+ }
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
+ : Mips::fixup_Mips_LO16;
+ break;
+ }
+ case MipsMCExpr::MEK_HIGHEST:
+ FixupKind = Mips::fixup_Mips_HIGHEST;
+ break;
+ case MipsMCExpr::MEK_HIGHER:
+ FixupKind = Mips::fixup_Mips_HIGHER;
+ break;
+ case MipsMCExpr::MEK_HI:
+ // Check for %hi(%neg(%gp_rel(X)))
+ if (MipsExpr->isGpOff()) {
+ FixupKind = Mips::fixup_Mips_GPOFF_HI;
+ break;
+ }
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
+ : Mips::fixup_Mips_HI16;
+ break;
+ case MipsMCExpr::MEK_PCREL_HI16:
+ FixupKind = Mips::fixup_MIPS_PCHI16;
+ break;
+ case MipsMCExpr::MEK_PCREL_LO16:
+ FixupKind = Mips::fixup_MIPS_PCLO16;
+ break;
+ case MipsMCExpr::MEK_TLSGD:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_GD
+ : Mips::fixup_Mips_TLSGD;
+ break;
+ case MipsMCExpr::MEK_TLSLDM:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_LDM
+ : Mips::fixup_Mips_TLSLDM;
+ break;
+ case MipsMCExpr::MEK_TPREL_HI:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
+ : Mips::fixup_Mips_TPREL_HI;
+ break;
+ case MipsMCExpr::MEK_TPREL_LO:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
+ : Mips::fixup_Mips_TPREL_LO;
+ break;
+ case MipsMCExpr::MEK_NEG:
+ FixupKind =
+ isMicroMips(STI) ? Mips::fixup_MICROMIPS_SUB : Mips::fixup_Mips_SUB;
+ break;
+ }
+ Fixups.push_back(MCFixup::create(0, MipsExpr, MCFixupKind(FixupKind)));
+ return 0;
+ }
+
+ if (Kind == MCExpr::SymbolRef) {
+ Mips::Fixups FixupKind = Mips::Fixups(0);
+
+ switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
+ default: llvm_unreachable("Unknown fixup kind!");
+ break;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Mips::fixup_Mips_32; // FIXME: This is ok for O32/N32 but not N64.
+ break;
+ } // switch
+
+ Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind)));
+ return 0;
+ }
+ return 0;
+}
+
+/// getMachineOpValue - Produce the binary encoding of a single operand.
+/// Registers yield their hardware encoding, integer immediates are converted
+/// to 'unsigned', and floating-point immediates contribute the top 32 bits of
+/// their bit pattern. Any other operand must be an expression; it is handed to
+/// getExprOpValue() and that function's result is returned.
+unsigned MipsMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+
+  if (MO.isFPImm()) {
+    const APInt FPBits = APFloat(MO.getFPImm()).bitcastToAPInt();
+    return static_cast<unsigned>(FPBits.getHiBits(32).getLimitedValue());
+  }
+
+  // Nothing but an expression operand is acceptable at this point.
+  assert(MO.isExpr());
+  return getExprOpValue(MO.getExpr(), Fixups, STI);
+}
+
+/// Encode a (base register, offset) memory operand pair starting at OpNo.
+/// The base register lands in bits 20-16 and the offset, shifted right by
+/// ShiftAmount, in bits 15-0. Both operands go through getMachineOpValue().
+template <unsigned ShiftAmount>
+unsigned MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpNo);
+  assert(BaseOp.isReg());
+
+  // Encode the base first, then the offset, as the operands appear.
+  const unsigned Base = getMachineOpValue(MI, BaseOp, Fixups, STI);
+  const unsigned Offset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+  // Scale the offset down (a no-op when ShiftAmount is zero) and merge.
+  return (Base << 16) | ((Offset >> ShiftAmount) & 0xFFFF);
+}
+
+/// Encode a memory operand pair with the base register in bits 6-4 and an
+/// unscaled offset in bits 3-0.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpNo);
+  assert(BaseOp.isReg());
+
+  const unsigned Base = getMachineOpValue(MI, BaseOp, Fixups, STI);
+  const unsigned Offset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+  return (Base << 4) | (Offset & 0xF);
+}
+
+/// Encode a memory operand pair with the base register in bits 6-4 and the
+/// offset, divided by 2, in bits 3-0.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4Lsl1(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpNo);
+  assert(BaseOp.isReg());
+
+  const unsigned Base = getMachineOpValue(MI, BaseOp, Fixups, STI);
+  const unsigned Offset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+  return (Base << 4) | ((Offset >> 1) & 0xF);
+}
+
+/// Encode a memory operand pair with the base register in bits 6-4 and the
+/// offset, divided by 4, in bits 3-0.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4Lsl2(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpNo);
+  assert(BaseOp.isReg());
+
+  const unsigned Base = getMachineOpValue(MI, BaseOp, Fixups, STI);
+  const unsigned Offset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+  return (Base << 4) | ((Offset >> 2) & 0xF);
+}
+
+/// Encode the offset of an SP-relative memory operand. The base register is
+/// only checked (it must be SP or SP_64); the returned value is the offset
+/// divided by 4 and limited to five bits.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isReg() &&
+         (MI.getOperand(OpNo).getReg() == Mips::SP ||
+          MI.getOperand(OpNo).getReg() == Mips::SP_64) &&
+         "Unexpected base register!");
+
+  const unsigned RawOffset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+  return (RawOffset >> 2) & 0x1F;
+}
+
+/// Encode the offset of a GP-relative memory operand. The base register is
+/// only checked (it must be GP); the returned value is the offset divided by
+/// 4 and limited to seven bits.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isReg() &&
+         MI.getOperand(OpNo).getReg() == Mips::GP &&
+         "Unexpected base register!");
+
+  const unsigned RawOffset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+  return (RawOffset >> 2) & 0x7F;
+}
+
+/// Encode a memory operand pair with the base register in bits 20-16 and a
+/// 9-bit offset in bits 8-0.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpNo);
+  assert(BaseOp.isReg());
+
+  const unsigned Base = getMachineOpValue(MI, BaseOp, Fixups, STI);
+  const unsigned Offset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+  return (Base << 16) | (Offset & 0x1FF);
+}
+
+/// Encode a memory operand pair with the base register in bits 20-16 and an
+/// 11-bit offset in bits 10-0.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm11(const MCInst &MI, unsigned OpNo,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpNo);
+  assert(BaseOp.isReg());
+
+  const unsigned Base = getMachineOpValue(MI, BaseOp, Fixups, STI);
+  const unsigned Offset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+  return (Base << 16) | (Offset & 0x07FF);
+}
+
+/// Encode a memory operand pair with the base register in bits 20-16 and a
+/// 12-bit offset in bits 11-0.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const {
+  // For SWM32_MM/LWM32_MM the incoming OpNo may not point at the memory
+  // operand because a register list precedes it. The (base, offset) pair is
+  // always the final two operands, so locate it from the end instead.
+  const unsigned Opcode = MI.getOpcode();
+  if (Opcode == Mips::SWM32_MM || Opcode == Mips::LWM32_MM)
+    OpNo = MI.getNumOperands() - 2;
+
+  const MCOperand &BaseOp = MI.getOperand(OpNo);
+  assert(BaseOp.isReg());
+
+  const unsigned Base = getMachineOpValue(MI, BaseOp, Fixups, STI);
+  const unsigned Offset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+  return (Base << 16) | (Offset & 0x0FFF);
+}
+
+/// Encode a memory operand pair with the base register in bits 20-16 and a
+/// 16-bit offset in bits 15-0.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const {
+  const MCOperand &BaseOp = MI.getOperand(OpNo);
+  assert(BaseOp.isReg());
+
+  const unsigned Base = getMachineOpValue(MI, BaseOp, Fixups, STI);
+  const unsigned Offset =
+      getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+  return (Base << 16) | (Offset & 0xFFFF);
+}
+
+/// Encode the immediate offset of a memory operand whose base register is not
+/// part of the result. The offset is divided by 4 and limited to the low four
+/// bits of the returned value.
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  // For the 16-bit load/store-multiple opcodes the incoming OpNo may not
+  // point at the memory operand because a register list precedes it. The
+  // (base, offset) pair is always the final two operands.
+  const unsigned Opcode = MI.getOpcode();
+  if (Opcode == Mips::SWM16_MM || Opcode == Mips::SWM16_MMR6 ||
+      Opcode == Mips::LWM16_MM || Opcode == Mips::LWM16_MMR6)
+    OpNo = MI.getNumOperands() - 2;
+
+  // The base register is always SP, so it is validated but never encoded.
+  assert(MI.getOperand(OpNo).isReg());
+  const MCOperand &OffsetOp = MI.getOperand(OpNo + 1);
+  assert(OffsetOp.isImm());
+
+  const unsigned RawOffset = getMachineOpValue(MI, OffsetOp, Fixups, STI);
+  return (RawOffset >> 2) & 0x0F;
+}
+
+// FIXME: should be called getMSBEncoding
+//
+/// Combine the immediate at OpNo-1 (position) with the immediate at OpNo
+/// (size) and return position + size - 1.
+unsigned
+MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const {
+  const MCOperand &PosOp = MI.getOperand(OpNo - 1);
+  const MCOperand &SizeOp = MI.getOperand(OpNo);
+  assert(PosOp.isImm());
+  assert(SizeOp.isImm());
+
+  const unsigned Pos = getMachineOpValue(MI, PosOp, Fixups, STI);
+  const unsigned Width = getMachineOpValue(MI, SizeOp, Fixups, STI);
+  return Pos + Width - 1;
+}
+
+/// Encode an immediate operand as (value - Offset). The Bits template
+/// parameter is not consulted here; no masking is applied to the result.
+template <unsigned Bits, int Offset>
+unsigned
+MipsMCCodeEmitter::getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  const MCOperand &ImmOp = MI.getOperand(OpNo);
+  assert(ImmOp.isImm());
+
+  const unsigned Raw = getMachineOpValue(MI, ImmOp, Fixups, STI);
+  return Raw - Offset;
+}
+
+/// Encode an operand that is stored divided by 4. An expression operand
+/// records a PC19_S2 fixup (microMIPS or standard flavour, chosen from STI)
+/// and encodes as zero; an immediate must be a multiple of 4 and is returned
+/// shifted right by two.
+unsigned
+MipsMCCodeEmitter::getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (!MO.isImm()) {
+    assert(MO.isExpr() &&
+           "getSimm19Lsl2Encoding expects only expressions or an immediate");
+
+    // The final value is unknown here; leave it to the fixup.
+    Mips::Fixups RelocKind = Mips::fixup_MIPS_PC19_S2;
+    if (isMicroMips(STI))
+      RelocKind = Mips::fixup_MICROMIPS_PC19_S2;
+    Fixups.push_back(MCFixup::create(0, MO.getExpr(), MCFixupKind(RelocKind)));
+    return 0;
+  }
+
+  // The operand holds 'encoding << 2', so the low two bits must be clear.
+  const unsigned Scaled = getMachineOpValue(MI, MO, Fixups, STI);
+  assert((Scaled & 3) == 0);
+  return Scaled >> 2;
+}
+
+/// Encode an operand that is stored divided by 8.
+///
+/// \param MI      Instruction being encoded.
+/// \param OpNo    Index of the operand to encode.
+/// \param Fixups  Receives a PC18_S3 fixup (microMIPS or standard flavour,
+///                chosen from STI) when the operand is an expression.
+/// \param STI     Subtarget used to pick the fixup flavour.
+/// \returns The immediate shifted right by three, or zero when a fixup was
+///          recorded for an expression operand.
+unsigned
+MipsMCCodeEmitter::getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    // The immediate is encoded as 'immediate << 3', so the low three bits
+    // of the operand must be clear.
+    unsigned Res = getMachineOpValue(MI, MO, Fixups, STI);
+    assert((Res & 7) == 0);
+    return Res >> 3;
+  }
+
+  // The diagnostic used to name getSimm18Lsl2Encoding, a function that does
+  // not exist; it now names this function.
+  assert(MO.isExpr() &&
+         "getSimm18Lsl3Encoding expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Mips::Fixups FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_PC18_S3
+                                            : Mips::fixup_MIPS_PC18_S3;
+  Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind)));
+  return 0;
+}
+
+/// Encode an immediate operand as its remainder modulo 8. Fixups and STI are
+/// not used.
+unsigned
+MipsMCCodeEmitter::getUImm3Mod8Encoding(const MCInst &MI, unsigned OpNo,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  const MCOperand &ImmOp = MI.getOperand(OpNo);
+  assert(ImmOp.isImm());
+
+  const int64_t Imm = ImmOp.getImm();
+  return Imm % 8;
+}
+
+/// Map an immediate onto its 4-bit code. Only the sixteen values listed in
+/// the table below are accepted; the code of a value is its position in the
+/// table. Any other value is a programming error.
+unsigned
+MipsMCCodeEmitter::getUImm4AndValue(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const {
+  // Entry I is the immediate whose encoding is I.
+  static const unsigned EncodableImms[] = {128, 1,  2,  3,  4,   7,     8,    15,
+                                           16,  31, 32, 63, 64, 255, 32768, 65535};
+
+  const MCOperand &ImmOp = MI.getOperand(OpNo);
+  assert(ImmOp.isImm());
+  const unsigned Imm = ImmOp.getImm();
+
+  unsigned Code = 0;
+  for (unsigned Candidate : EncodableImms) {
+    if (Candidate == Imm)
+      return Code;
+    ++Code;
+  }
+  llvm_unreachable("Unexpected value");
+}
+
+/// Encode a register-list operand. Every operand from OpNo up to, but not
+/// including, the final two operands (the memory base and offset) is
+/// examined: a register whose encoding is 31 sets bit 4 of the result, any
+/// other register increments the result by one.
+unsigned
+MipsMCCodeEmitter::getRegisterListOpValue(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  unsigned Encoded = 0;
+  const unsigned ListEnd = MI.getNumOperands() - 2;
+
+  for (unsigned Idx = OpNo; Idx < ListEnd; ++Idx) {
+    const unsigned HwReg =
+        Ctx.getRegisterInfo()->getEncodingValue(MI.getOperand(Idx).getReg());
+    if (HwReg == 31)
+      Encoded |= 0x10;
+    else
+      ++Encoded;
+  }
+
+  return Encoded;
+}
+
+/// Encode a 16-bit register-list operand as the instruction's operand count
+/// minus four. OpNo, Fixups and STI are not used.
+unsigned
+MipsMCCodeEmitter::getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const unsigned OperandCount = MI.getNumOperands();
+  return OperandCount - 4;
+}
+
+/// Encode a register-pair operand by encoding the operand at OpNo with
+/// getMachineOpValue().
+unsigned
+MipsMCCodeEmitter::getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &PairOp = MI.getOperand(OpNo);
+  return getMachineOpValue(MI, PairOp, Fixups, STI);
+}
+
+/// Encode the register pair held in operands 0 and 1 as a 3-bit code:
+///   (A1,A2)=0 (A1,A3)=1 (A2,A3)=2 (A0,S5)=3
+///   (A0,S6)=4 (A0,A1)=5 (A0,A2)=6 (A0,A3)=7
+/// Any other combination yields 0. OpNo, Fixups and STI are not used.
+unsigned
+MipsMCCodeEmitter::getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  // Dispatch on the first register, then on the second; the second operand
+  // is only inspected when the first one can start a known pair.
+  switch (MI.getOperand(0).getReg()) {
+  case Mips::A1:
+    switch (MI.getOperand(1).getReg()) {
+    case Mips::A2:
+      return 0;
+    case Mips::A3:
+      return 1;
+    default:
+      break;
+    }
+    break;
+  case Mips::A2:
+    if (MI.getOperand(1).getReg() == Mips::A3)
+      return 2;
+    break;
+  case Mips::A0:
+    switch (MI.getOperand(1).getReg()) {
+    case Mips::S5:
+      return 3;
+    case Mips::S6:
+      return 4;
+    case Mips::A1:
+      return 5;
+    case Mips::A2:
+      return 6;
+    case Mips::A3:
+      return 7;
+    default:
+      break;
+    }
+    break;
+  default:
+    break;
+  }
+
+  // Unrecognised pairs fall back to code 0.
+  return 0;
+}
+
+/// Encode an immediate operand that is stored divided by 4. Only immediates
+/// are accepted and the value must be a multiple of 4. Fixups and STI are not
+/// used.
+unsigned
+MipsMCCodeEmitter::getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert(MO.isImm() && "getSimm23Lsl2Encoding expects only an immediate");
+
+  // The field holds 'immediate >> 2', so the low two bits must be clear.
+  const unsigned Scaled = static_cast<unsigned>(MO.getImm());
+  assert((Scaled & 3) == 0);
+  return Scaled >> 2;
+}
+
+#include "MipsGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
new file mode 100644
index 000000000000..2d041dcbf040
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -0,0 +1,278 @@
+//===-- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/DataTypes.h"
+
+using namespace llvm;
+
+namespace llvm {
+class MCContext;
+class MCExpr;
+class MCInst;
+class MCInstrInfo;
+class MCFixup;
+class MCOperand;
+class MCSubtargetInfo;
+class raw_ostream;
+
+// MipsMCCodeEmitter - MCCodeEmitter implementation for Mips. Besides the
+// encodeInstruction() override it declares one encoder method per custom
+// operand kind; each takes the instruction, an operand index, the fixup list
+// and the subtarget, and returns the operand's binary encoding.
+class MipsMCCodeEmitter : public MCCodeEmitter {
+ // Not copyable.
+ MipsMCCodeEmitter(const MipsMCCodeEmitter &) = delete;
+ void operator=(const MipsMCCodeEmitter &) = delete;
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+ bool IsLittleEndian;
+
+ bool isMicroMips(const MCSubtargetInfo &STI) const;
+ bool isMips32r6(const MCSubtargetInfo &STI) const;
+
+public:
+ MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_, bool IsLittle)
+ : MCII(mcii), Ctx(Ctx_), IsLittleEndian(IsLittle) {}
+
+ ~MipsMCCodeEmitter() override {}
+
+ void EmitByte(unsigned char C, raw_ostream &OS) const;
+
+ void EmitInstruction(uint64_t Val, unsigned Size, const MCSubtargetInfo &STI,
+ raw_ostream &OS) const;
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getJumpTargetOpValue - Return binary encoding of the jump
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getJumpTargetOpValueMM - Return binary encoding of the microMIPS jump
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getUImm5Lsl2Encoding - Return binary encoding of the microMIPS jump
+ // target operand.
+ // NOTE(review): this description looks copy-pasted from the jump-target
+ // encoders; going by the name this presumably encodes a 5-bit unsigned
+ // immediate scaled by 4 - confirm against the definition.
+ unsigned getUImm5Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getSImm3Lsa2Value(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getUImm6Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getSImm9AddiuspValue - Return binary encoding of the microMIPS addiusp
+ // instruction immediate operand.
+ unsigned getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTargetOpValue - Return binary encoding of the branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTargetOpValue1SImm16 - Return binary encoding of the branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValue1SImm16(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTargetOpValueMMR6 - Return binary encoding of the branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValueMMR6(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTargetOpValueLsl2MMR6 - Return binary encoding of the branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValueLsl2MMR6(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTarget7OpValueMM - Return binary encoding of the microMIPS branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTarget7OpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTargetOpValueMMPC10 - Return binary encoding of the microMIPS
+ // 10-bit branch target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValueMMPC10(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTargetOpValueMM - Return binary encoding of the microMIPS branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTarget21OpValue - Return binary encoding of the branch
+ // offset operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTarget21OpValueMM - Return binary encoding of the branch
+ // offset operand for microMIPS. If the machine operand requires
+ // relocation, record the relocation and return zero.
+ unsigned getBranchTarget21OpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTarget26OpValue - Return binary encoding of the branch
+ // offset operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTarget26OpValueMM - Return binary encoding of the branch
+ // offset operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getJumpOffset16OpValue - Return binary encoding of the jump
+ // offset operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getMachineOpValue - Return binary encoding of operand. If the machine
+ // operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getMSAMemEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // Memory operand encoders. Each reads the base register at OpNo and the
+ // offset at OpNo + 1. getMemEncoding shifts the offset right by
+ // ShiftAmount before packing it into the low 16 bits.
+ template <unsigned ShiftAmount = 0>
+ unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm4(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm4Lsl1(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm4Lsl2(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm11(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ // Returns (operand OpNo-1) + (operand OpNo) - 1, i.e. position + size - 1.
+ unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Subtract Offset then encode as a N-bit unsigned integer.
+ template <unsigned Bits, int Offset>
+ unsigned getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // Immediates are returned shifted right by 2 (Simm19Lsl2) or 3
+ // (Simm18Lsl3); expression operands record a fixup and encode as zero.
+ unsigned getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getUImm3Mod8Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getUImm4AndValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getRegisterListOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ private:
+ void LowerCompactBranch(MCInst& Inst) const;
+}; // class MipsMCCodeEmitter
+} // namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
new file mode 100644
index 000000000000..082bb87fcb8a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -0,0 +1,287 @@
+//===-- MipsMCExpr.cpp - Mips specific MC expression classes --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mipsmcexpr"
+
+/// Build a MipsMCExpr of the given kind around Expr, using Ctx's placement
+/// operator new.
+const MipsMCExpr *MipsMCExpr::create(MipsMCExpr::MipsExprKind Kind,
+                                     const MCExpr *Expr, MCContext &Ctx) {
+  const MipsMCExpr *Node = new (Ctx) MipsMCExpr(Kind, Expr);
+  return Node;
+}
+
+/// Build the three-level expression Kind(%neg(%gp_rel(Expr))).
+const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::MipsExprKind Kind,
+                                          const MCExpr *Expr, MCContext &Ctx) {
+  // Construct from the inside out: %gp_rel first, then %neg, then Kind.
+  const MipsMCExpr *GpRel = create(MEK_GPREL, Expr, Ctx);
+  const MipsMCExpr *Negated = create(MEK_NEG, GpRel, Ctx);
+  return create(Kind, Negated, Ctx);
+}
+
+/// Print this expression as "%op(sub-expression)". The sub-expression is
+/// printed as a plain number when it evaluates to an absolute value and is
+/// otherwise printed through MCExpr::print().
+void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  // Select the operator spelling for this kind.
+  const char *Operator = "";
+  switch (Kind) {
+  case MEK_None:
+  case MEK_Special:
+    llvm_unreachable("MEK_None and MEK_Special are invalid");
+    break;
+  case MEK_CALL_HI16:  Operator = "%call_hi";   break;
+  case MEK_CALL_LO16:  Operator = "%call_lo";   break;
+  case MEK_DTPREL_HI:  Operator = "%dtprel_hi"; break;
+  case MEK_DTPREL_LO:  Operator = "%dtprel_lo"; break;
+  case MEK_GOT:        Operator = "%got";       break;
+  case MEK_GOTTPREL:   Operator = "%gottprel";  break;
+  case MEK_GOT_CALL:   Operator = "%call16";    break;
+  case MEK_GOT_DISP:   Operator = "%got_disp";  break;
+  case MEK_GOT_HI16:   Operator = "%got_hi";    break;
+  case MEK_GOT_LO16:   Operator = "%got_lo";    break;
+  case MEK_GOT_PAGE:   Operator = "%got_page";  break;
+  case MEK_GOT_OFST:   Operator = "%got_ofst";  break;
+  case MEK_GPREL:      Operator = "%gp_rel";    break;
+  case MEK_HI:         Operator = "%hi";        break;
+  case MEK_HIGHER:     Operator = "%higher";    break;
+  case MEK_HIGHEST:    Operator = "%highest";   break;
+  case MEK_LO:         Operator = "%lo";        break;
+  case MEK_NEG:        Operator = "%neg";       break;
+  case MEK_PCREL_HI16: Operator = "%pcrel_hi";  break;
+  case MEK_PCREL_LO16: Operator = "%pcrel_lo";  break;
+  case MEK_TLSGD:      Operator = "%tlsgd";     break;
+  case MEK_TLSLDM:     Operator = "%tlsldm";    break;
+  case MEK_TPREL_HI:   Operator = "%tprel_hi";  break;
+  case MEK_TPREL_LO:   Operator = "%tprel_lo";  break;
+  }
+
+  OS << Operator << '(';
+
+  // Prefer the folded constant when the operand is absolute.
+  int64_t Constant;
+  if (Expr->evaluateAsAbsolute(Constant))
+    OS << Constant;
+  else
+    Expr->print(OS, MAI, true);
+
+  OS << ')';
+}
+
+// Evaluate this expression into Res.
+//
+// Three outcomes are possible:
+//  * %hi/%lo(%neg(%gp_rel(X))): X itself is evaluated and the result is
+//    tagged with MEK_Special.
+//  * The sub-expression is absolute and Fixup is null: the operator is
+//    folded into a constant for the kinds that allow it (see the switch).
+//  * Otherwise: the sub-expression's symbols and constant are passed through,
+//    tagged with this expression's kind.
+// Returns false when the sub-expression cannot be evaluated, when it already
+// carries a reference kind, or when an absolute value is requested for a
+// kind that cannot be folded.
+bool
+MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ // Look for the %hi(%neg(%gp_rel(X))) and %lo(%neg(%gp_rel(X))) special cases.
+ if (isGpOff()) {
+ // Skip the two wrapping layers (%neg and %gp_rel) to reach X.
+ const MCExpr *SubExpr =
+ cast<MipsMCExpr>(cast<MipsMCExpr>(getSubExpr())->getSubExpr())
+ ->getSubExpr();
+ if (!SubExpr->evaluateAsRelocatable(Res, Layout, Fixup))
+ return false;
+
+ Res = MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(),
+ MEK_Special);
+ return true;
+ }
+
+ if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+ return false;
+
+ // A sub-expression that already has a reference kind is rejected.
+ if (Res.getRefKind() != MCSymbolRefExpr::VK_None)
+ return false;
+
+ // evaluateAsAbsolute() and evaluateAsValue() require that we evaluate the
+ // %hi/%lo/etc. here. Fixup is a null pointer when either of these is the
+ // caller.
+ if (Res.isAbsolute() && Fixup == nullptr) {
+ int64_t AbsVal = Res.getConstant();
+ switch (Kind) {
+ case MEK_None:
+ case MEK_Special:
+ llvm_unreachable("MEK_None and MEK_Special are invalid");
+ // These kinds are never folded to a constant.
+ case MEK_DTPREL_HI:
+ case MEK_DTPREL_LO:
+ case MEK_GOT:
+ case MEK_GOTTPREL:
+ case MEK_GOT_CALL:
+ case MEK_GOT_DISP:
+ case MEK_GOT_HI16:
+ case MEK_GOT_LO16:
+ case MEK_GOT_OFST:
+ case MEK_GOT_PAGE:
+ case MEK_GPREL:
+ case MEK_PCREL_HI16:
+ case MEK_PCREL_LO16:
+ case MEK_TLSGD:
+ case MEK_TLSLDM:
+ case MEK_TPREL_HI:
+ case MEK_TPREL_LO:
+ return false;
+ // Low half: the low 16 bits, sign-extended.
+ case MEK_LO:
+ case MEK_CALL_LO16:
+ AbsVal = SignExtend64<16>(AbsVal);
+ break;
+ // Upper parts: each adds a rounding bias of 0x8000 per lower 16-bit part
+ // before shifting, then sign-extends the resulting 16 bits.
+ case MEK_CALL_HI16:
+ case MEK_HI:
+ AbsVal = SignExtend64<16>((AbsVal + 0x8000) >> 16);
+ break;
+ case MEK_HIGHER:
+ AbsVal = SignExtend64<16>((AbsVal + 0x80008000LL) >> 32);
+ break;
+ case MEK_HIGHEST:
+ AbsVal = SignExtend64<16>((AbsVal + 0x800080008000LL) >> 48);
+ break;
+ case MEK_NEG:
+ AbsVal = -AbsVal;
+ break;
+ }
+ Res = MCValue::get(AbsVal);
+ return true;
+ }
+
+ // We want to defer it for relocatable expressions since the constant is
+ // applied to the whole symbol value.
+ //
+ // The value of getKind() that is given to MCValue is only intended to aid
+ // debugging when inspecting MCValue objects. It shouldn't be relied upon
+ // for decision making.
+ Res = MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+ return true;
+}
+
+/// Forward the streamer's used-expression visit to the wrapped
+/// sub-expression.
+void MipsMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  const MCExpr *Child = getSubExpr();
+  Streamer.visitUsedExpr(*Child);
+}
+
+/// Walk Expr and set the ELF type of every symbol it references to STT_TLS.
+/// Target and unary wrappers are peeled off iteratively; binary expressions
+/// recurse into the left operand and continue with the right one.
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  for (const MCExpr *Cur = Expr;;) {
+    switch (Cur->getKind()) {
+    case MCExpr::Constant:
+      // Constants reference no symbols.
+      return;
+    case MCExpr::SymbolRef: {
+      // We're known to be under a TLS fixup, so any symbol should be
+      // modified. There should be only one.
+      const MCSymbolRefExpr *Ref = cast<MCSymbolRefExpr>(Cur);
+      cast<MCSymbolELF>(Ref->getSymbol()).setType(ELF::STT_TLS);
+      return;
+    }
+    case MCExpr::Binary: {
+      const MCBinaryExpr *Bin = cast<MCBinaryExpr>(Cur);
+      fixELFSymbolsInTLSFixupsImpl(Bin->getLHS(), Asm);
+      Cur = Bin->getRHS();
+      continue;
+    }
+    case MCExpr::Target:
+      Cur = cast<MipsMCExpr>(Cur)->getSubExpr();
+      continue;
+    case MCExpr::Unary:
+      Cur = cast<MCUnaryExpr>(Cur)->getSubExpr();
+      continue;
+    }
+    return;
+  }
+}
+
+/// Mark the symbols referenced by TLS operators as STT_TLS.
+///
+/// For a TLS operator (%dtprel_hi, %dtprel_lo, %gottprel, %tlsgd, %tlsldm,
+/// %tprel_hi, %tprel_lo) every symbol in the sub-expression has its ELF type
+/// set to STT_TLS. For every other operator only a directly nested MipsMCExpr
+/// is visited, so a TLS operator wrapped in e.g. %hi is still found.
+///
+/// %dtprel_hi, %dtprel_lo and %tlsldm used to be grouped with the non-TLS
+/// operators, which left their symbols without the STT_TLS type.
+///
+/// \param Asm  Assembler passed through to the symbol-fixing helper.
+void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch (getKind()) {
+  case MEK_None:
+  case MEK_Special:
+    llvm_unreachable("MEK_None and MEK_Special are invalid");
+    break;
+  case MEK_CALL_HI16:
+  case MEK_CALL_LO16:
+  case MEK_GOT:
+  case MEK_GOT_CALL:
+  case MEK_GOT_DISP:
+  case MEK_GOT_HI16:
+  case MEK_GOT_LO16:
+  case MEK_GOT_OFST:
+  case MEK_GOT_PAGE:
+  case MEK_GPREL:
+  case MEK_HI:
+  case MEK_HIGHER:
+  case MEK_HIGHEST:
+  case MEK_LO:
+  case MEK_NEG:
+  case MEK_PCREL_HI16:
+  case MEK_PCREL_LO16:
+    // If we do have nested target-specific expressions, they will be in
+    // a consecutive chain.
+    if (const MipsMCExpr *E = dyn_cast<const MipsMCExpr>(getSubExpr()))
+      E->fixELFSymbolsInTLSFixups(Asm);
+    break;
+  case MEK_DTPREL_HI:
+  case MEK_DTPREL_LO:
+  case MEK_GOTTPREL:
+  case MEK_TLSGD:
+  case MEK_TLSLDM:
+  case MEK_TPREL_HI:
+  case MEK_TPREL_LO:
+    // Every TLS access model ends up here.
+    fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+    break;
+  }
+}
+
+/// Test whether this expression has the shape %hi(%neg(%gp_rel(X))) or
+/// %lo(%neg(%gp_rel(X))). On a match Kind receives this expression's own
+/// kind (MEK_HI or MEK_LO); otherwise Kind is left untouched.
+bool MipsMCExpr::isGpOff(MipsExprKind &Kind) const {
+  // The outermost operator has to be %hi or %lo.
+  if (getKind() != MEK_HI && getKind() != MEK_LO)
+    return false;
+
+  // The next two levels down must both be Mips target expressions.
+  const MipsMCExpr *Middle = dyn_cast<const MipsMCExpr>(getSubExpr());
+  if (!Middle)
+    return false;
+  const MipsMCExpr *Innermost = dyn_cast<const MipsMCExpr>(Middle->getSubExpr());
+  if (!Innermost)
+    return false;
+
+  // They must be %neg wrapping %gp_rel.
+  if (Middle->getKind() != MEK_NEG || Innermost->getKind() != MEK_GPREL)
+    return false;
+
+  Kind = getKind();
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
new file mode 100644
index 000000000000..d1a4334ec640
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -0,0 +1,91 @@
+//===-- MipsMCExpr.h - Mips specific MC expression classes ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
+
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class MipsMCExpr : public MCTargetExpr { // Target expression: one child MCExpr tagged with a MipsExprKind.
+public:
+ enum MipsExprKind {
+ MEK_None, // Rejected as invalid by fixELFSymbolsInTLSFixups.
+ MEK_CALL_HI16,
+ MEK_CALL_LO16,
+ MEK_DTPREL_HI,
+ MEK_DTPREL_LO,
+ MEK_GOT,
+ MEK_GOTTPREL,
+ MEK_GOT_CALL,
+ MEK_GOT_DISP,
+ MEK_GOT_HI16,
+ MEK_GOT_LO16,
+ MEK_GOT_OFST,
+ MEK_GOT_PAGE,
+ MEK_GPREL,
+ MEK_HI,
+ MEK_HIGHER,
+ MEK_HIGHEST,
+ MEK_LO,
+ MEK_NEG,
+ MEK_PCREL_HI16,
+ MEK_PCREL_LO16,
+ MEK_TLSGD,
+ MEK_TLSLDM,
+ MEK_TPREL_HI,
+ MEK_TPREL_LO,
+ MEK_Special, // Rejected as invalid by fixELFSymbolsInTLSFixups.
+ };
+
+private:
+ const MipsExprKind Kind; // Operator applied to Expr; fixed at construction.
+ const MCExpr *Expr; // Child expression; not deleted by this class.
+
+ explicit MipsMCExpr(MipsExprKind Kind, const MCExpr *Expr) // Private: presumably callers go through create()/createGpOff().
+ : Kind(Kind), Expr(Expr) {}
+
+public:
+ static const MipsMCExpr *create(MipsExprKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
+ static const MipsMCExpr *createGpOff(MipsExprKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
+
+ /// Get the kind (MipsExprKind operator tag) of this expression.
+ MipsExprKind getKind() const { return Kind; }
+
+ /// Get the child (operand) of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+ MCFragment *findAssociatedFragment() const override { // Delegates to the child expression.
+ return getSubExpr()->findAssociatedFragment();
+ }
+
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target; // Only checks for a target expression, not for Mips specifically.
+ }
+
+ bool isGpOff(MipsExprKind &Kind) const; // True for HI/LO(NEG(GPREL(...))); Kind is written only on success.
+ bool isGpOff() const { // Convenience overload that discards the reported kind.
+ MipsExprKind Kind;
+ return isGpOff(Kind);
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
new file mode 100644
index 000000000000..687b800c2409
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -0,0 +1,30 @@
+//===-- MipsMCNaCl.h - NaCl-related declarations --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+// Log2 of the NaCl MIPS sandbox's instruction bundle size (2^4 = 16).
+static const unsigned MIPS_NACL_BUNDLE_ALIGN = 4u; // Handed to EmitBundleAlignMode() by createMipsNaClELFStreamer().
+
+bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
+ bool *IsStore = nullptr);
+bool baseRegNeedsLoadStoreMask(unsigned Reg);
+
+// This function creates an MCELFStreamer for Mips NaCl.
+MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
new file mode 100644
index 000000000000..56fe18572118
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -0,0 +1,201 @@
+//===-- MipsMCTargetDesc.cpp - Mips Target Descriptions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCTargetDesc.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCAsmInfo.h"
+#include "MipsMCNaCl.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "MipsGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "MipsGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "MipsGenRegisterInfo.inc"
+
+/// Choose the CPU name to use for triple \p TT.
+/// An explicit \p CPU is returned unchanged. An empty or "generic" name is
+/// replaced by "mips32" for the mips/mipsel architectures and by "mips64"
+/// for everything else.
+/// FIXME: Merge with the copy in MipsSubtarget.cpp
+StringRef MIPS_MC::selectMipsCPU(const Triple &TT, StringRef CPU) {
+  const bool WantsDefault = CPU.empty() || CPU == "generic";
+  if (!WantsDefault)
+    return CPU;
+
+  const bool IsMipsOrMipsel =
+      TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel;
+  if (IsMipsOrMipsel)
+    return "mips32";
+  return "mips64";
+}
+
+/// Allocate an MCInstrInfo and fill it in via InitMipsMCInstrInfo.
+/// The returned object is heap-allocated and not freed here.
+static MCInstrInfo *createMipsMCInstrInfo() {
+  MCInstrInfo *InstrInfo = new MCInstrInfo();
+  InitMipsMCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+/// Allocate an MCRegisterInfo and initialise it via InitMipsMCRegisterInfo,
+/// passing Mips::RA as the second argument. \p TT is not consulted.
+static MCRegisterInfo *createMipsMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  InitMipsMCRegisterInfo(RegInfo, Mips::RA);
+  return RegInfo;
+}
+
+/// Build the subtarget info for \p TT after normalising \p CPU with
+/// MIPS_MC::selectMipsCPU; \p FS is forwarded unchanged.
+static MCSubtargetInfo *createMipsMCSubtargetInfo(const Triple &TT,
+                                                  StringRef CPU, StringRef FS) {
+  const StringRef SelectedCPU = MIPS_MC::selectMipsCPU(TT, CPU);
+  return createMipsMCSubtargetInfoImpl(TT, SelectedCPU, FS);
+}
+
+/// Create the MipsMCAsmInfo for \p TT and seed its initial frame state with
+/// a def_cfa rule on the DWARF number of Mips::SP with offset 0.
+static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI,
+                                      const Triple &TT) {
+  MCAsmInfo *AsmInfo = new MipsMCAsmInfo(TT);
+
+  // Look the stack pointer up in the EH flavour of the DWARF register map.
+  const unsigned DwarfSP = MRI.getDwarfRegNum(Mips::SP, true);
+  MCCFIInstruction DefCfa = MCCFIInstruction::createDefCfa(nullptr, DwarfSP, 0);
+  AsmInfo->addInitialFrameState(DefCfa);
+
+  return AsmInfo;
+}
+
+static MCInstPrinter *createMipsMCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ return new MipsInstPrinter(MAI, MII, MRI); // T and SyntaxVariant are not used; the result is heap-allocated.
+}
+
+/// Create the ELF object streamer: the NaCl sandboxing streamer when the
+/// triple's OS is NaCl, the plain Mips ELF streamer otherwise.
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+                                    MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter, bool RelaxAll) {
+  if (T.isOSNaCl())
+    return createMipsNaClELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+  return createMipsELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+}
+
+static MCTargetStreamer *createMipsAsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new MipsTargetAsmStreamer(S, OS); // InstPrint and isVerboseAsm are not used.
+}
+
+static MCTargetStreamer *createMipsNullTargetStreamer(MCStreamer &S) {
+ return new MipsTargetStreamer(S); // Plain MipsTargetStreamer, heap-allocated.
+}
+
+static MCTargetStreamer *
+createMipsObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ return new MipsTargetELFStreamer(S, STI); // ELF flavour of the target streamer, heap-allocated.
+}
+
+namespace {
+
+/// Instruction analysis for Mips: computes branch targets from the last
+/// operand of an instruction.
+class MipsMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  MipsMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+  /// Store the branch target of \p Inst in \p Target and return true, or
+  /// return false when the last operand's declared type is neither unknown,
+  /// immediate nor pc-relative (or when there are no operands). \p Size is
+  /// not used.
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+                      uint64_t &Target) const override {
+    const unsigned OperandCount = Inst.getNumOperands();
+    if (OperandCount == 0)
+      return false;
+
+    const unsigned LastIdx = OperandCount - 1;
+    switch (Info->get(Inst.getOpcode()).OpInfo[LastIdx].OperandType) {
+    case MCOI::OPERAND_PCREL:
+      // b, j, beq ...
+      Target = Addr + Inst.getOperand(LastIdx).getImm();
+      return true;
+    case MCOI::OPERAND_UNKNOWN:
+    case MCOI::OPERAND_IMMEDIATE:
+      // jal, bal ...
+      Target = Inst.getOperand(LastIdx).getImm();
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+}
+
+static MCInstrAnalysis *createMipsMCInstrAnalysis(const MCInstrInfo *Info) {
+ return new MipsMCInstrAnalysis(Info); // Heap-allocated; not freed here.
+}
+
+extern "C" void LLVMInitializeMipsTargetMC() { // Registers every Mips MC-layer factory with the TargetRegistry.
+ for (Target *T : {&getTheMipsTarget(), &getTheMipselTarget(),
+ &getTheMips64Target(), &getTheMips64elTarget()}) { // Registrations shared by all four targets.
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(*T, createMipsMCAsmInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(*T, createMipsMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(*T, createMipsMCRegisterInfo);
+
+ // Register the elf streamer.
+ TargetRegistry::RegisterELFStreamer(*T, createMCStreamer);
+
+ // Register the asm target streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createMipsAsmTargetStreamer);
+
+ TargetRegistry::RegisterNullTargetStreamer(*T, // Register the null target streamer.
+ createMipsNullTargetStreamer);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(*T, createMipsMCSubtargetInfo);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createMipsMCInstrAnalysis);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(*T, createMipsMCInstPrinter);
+
+ TargetRegistry::RegisterObjectTargetStreamer( // Register the object (ELF) target streamer.
+ *T, createMipsObjectTargetStreamer);
+ }
+
+ // Register the MC code emitter: the EB factory for mips/mips64, the EL one below for mipsel/mips64el.
+ for (Target *T : {&getTheMipsTarget(), &getTheMips64Target()})
+ TargetRegistry::RegisterMCCodeEmitter(*T, createMipsMCCodeEmitterEB);
+
+ for (Target *T : {&getTheMipselTarget(), &getTheMips64elTarget()})
+ TargetRegistry::RegisterMCCodeEmitter(*T, createMipsMCCodeEmitterEL);
+
+ // Register the asm backend; each target gets its own factory (EB32, EL32, EB64, EL64).
+ TargetRegistry::RegisterMCAsmBackend(getTheMipsTarget(),
+ createMipsAsmBackendEB32);
+ TargetRegistry::RegisterMCAsmBackend(getTheMipselTarget(),
+ createMipsAsmBackendEL32);
+ TargetRegistry::RegisterMCAsmBackend(getTheMips64Target(),
+ createMipsAsmBackendEB64);
+ TargetRegistry::RegisterMCAsmBackend(getTheMips64elTarget(),
+ createMipsAsmBackendEL64);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
new file mode 100644
index 000000000000..b28681f42ebe
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -0,0 +1,84 @@
+//===-- MipsMCTargetDesc.h - Mips Target Descriptions -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCTARGETDESC_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheMipsTarget();
+Target &getTheMipselTarget();
+Target &getTheMips64Target();
+Target &getTheMips64elTarget();
+
+MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+MCObjectWriter *createMipsELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
+ bool IsLittleEndian, bool Is64Bit);
+
+namespace MIPS_MC {
+StringRef selectMipsCPU(const Triple &TT, StringRef CPU);
+}
+
+} // End llvm namespace
+
+// Defines symbolic names for Mips registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "MipsGenRegisterInfo.inc"
+
+// Defines symbolic names for the Mips instructions.
+#define GET_INSTRINFO_ENUM
+#include "MipsGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "MipsGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
new file mode 100644
index 000000000000..aef9bd3a8e2a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -0,0 +1,268 @@
+//===-- MipsNaClELFStreamer.cpp - ELF Object Output for Mips NaCl ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements MCELFStreamer for Mips NaCl. It emits .o object files
+// as required by NaCl's SFI sandbox. It inserts address-masking instructions
+// before dangerous control-flow and memory access instructions. It inserts
+// address-masking instructions after instructions that change the stack
+// pointer. It ensures that the mask and the dangerous instruction are always
+// emitted in the same bundle. It aligns call + branch delay to the bundle end,
+// so that return address is always aligned to the start of next bundle.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCNaCl.h"
+#include "llvm/MC/MCELFStreamer.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-mc-nacl"
+
+namespace {
+
+const unsigned IndirectBranchMaskReg = Mips::T6; // Mask operand used for indirect jump and indirect call targets.
+const unsigned LoadStoreStackMaskReg = Mips::T7; // Mask operand used for load/store base registers and for SP updates.
+
+/// Extend the generic MCELFStreamer class so that it can mask dangerous
+/// instructions.
+
+class MipsNaClELFStreamer : public MipsELFStreamer {
+public:
+ MipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *Emitter)
+ : MipsELFStreamer(Context, TAB, OS, Emitter), PendingCall(false) {} // Forwards to the base streamer; no call sequence is pending initially.
+
+ ~MipsNaClELFStreamer() override {} // Nothing extra to release here.
+
+private:
+ // Whether we started the sandboxing sequence for calls. Calls are bundled
+ // with branch delays and aligned to the bundle end.
+ bool PendingCall;
+
+ // True for JR, and for JALR whose operand 0 (the link register) is $0.
+ bool isIndirectJump(const MCInst &MI) {
+   const unsigned Opcode = MI.getOpcode();
+   if (Opcode != Mips::JALR)
+     return Opcode == Mips::JR;
+
+   // MIPS32r6/MIPS64r6 doesn't have a JR instruction and uses JALR instead.
+   // JALR is an indirect branch if the link register is $0.
+   assert(MI.getOperand(0).isReg());
+   return MI.getOperand(0).getReg() == Mips::ZERO;
+ }
+
+ // True when the instruction has an operand 0 that is the register Mips::SP.
+ bool isStackPointerFirstOperand(const MCInst &MI) {
+   if (MI.getNumOperands() == 0)
+     return false;
+
+   const MCOperand &First = MI.getOperand(0);
+   return First.isReg() && First.getReg() == Mips::SP;
+ }
+
+ // Classify MI as a call. Returns true for the direct call opcodes and for
+ // JALR with a link register other than $0; *IsIndirectCall is set to true
+ // only in the JALR case and to false in every other case.
+ bool isCall(const MCInst &MI, bool *IsIndirectCall) {
+   *IsIndirectCall = false;
+
+   switch (MI.getOpcode()) {
+   case Mips::JAL:
+   case Mips::BAL:
+   case Mips::BAL_BR:
+   case Mips::BLTZAL:
+   case Mips::BGEZAL:
+     return true;
+
+   case Mips::JALR: {
+     // JALR is only a call if the link register is not $0. Otherwise it's an
+     // indirect branch.
+     assert(MI.getOperand(0).isReg());
+     const bool LinksToZero = MI.getOperand(0).getReg() == Mips::ZERO;
+     if (!LinksToZero)
+       *IsIndirectCall = true;
+     return !LinksToZero;
+   }
+
+   default:
+     return false;
+   }
+ }
+
+ // Emit "AND AddrReg, AddrReg, MaskReg" through the base Mips ELF streamer,
+ // so the mask instruction itself is not sandboxed again.
+ void emitMask(unsigned AddrReg, unsigned MaskReg,
+               const MCSubtargetInfo &STI) {
+   const MCOperand Addr = MCOperand::createReg(AddrReg);
+   const MCOperand Mask = MCOperand::createReg(MaskReg);
+
+   MCInst AndInst;
+   AndInst.setOpcode(Mips::AND);
+   AndInst.addOperand(Addr); // destination
+   AndInst.addOperand(Addr); // first source
+   AndInst.addOperand(Mask); // second source
+   MipsELFStreamer::EmitInstruction(AndInst, STI);
+ }
+
+ // Sandbox indirect branch or return instruction by inserting mask operation
+ // before it.
+ //
+ // JR carries its target register in operand 0. JALR used as a jump (link
+ // register $0, see isIndirectJump) has the link register in operand 0 and
+ // the target in operand 1, exactly as isCall's indirect-call path reads it.
+ // Masking operand 0 of such a JALR would only mask $0 and leave the real
+ // target register unsandboxed, so pick the operand by opcode.
+ void sandboxIndirectJump(const MCInst &MI, const MCSubtargetInfo &STI) {
+   const unsigned TargetIdx = MI.getOpcode() == Mips::JALR ? 1 : 0;
+   assert(MI.getOperand(TargetIdx).isReg() && "Expected a register target.");
+   unsigned AddrReg = MI.getOperand(TargetIdx).getReg();
+
+   // Keep the mask and the jump inside one bundle.
+   EmitBundleLock(false);
+   emitMask(AddrReg, IndirectBranchMaskReg, STI);
+   MipsELFStreamer::EmitInstruction(MI, STI);
+   EmitBundleUnlock();
+ }
+
+ // Emit MI inside a bundle lock, optionally preceded by a mask of its base
+ // register (operand AddrIdx, memory-access sandboxing) and optionally
+ // followed by a mask of operand 0, which must be SP (stack-pointer
+ // sandboxing). AddrIdx is read only when MaskBefore is set.
+ void sandboxLoadStoreStackChange(const MCInst &MI, unsigned AddrIdx,
+                                  const MCSubtargetInfo &STI, bool MaskBefore,
+                                  bool MaskAfter) {
+   EmitBundleLock(false);
+
+   // Sandbox memory access.
+   if (MaskBefore)
+     emitMask(MI.getOperand(AddrIdx).getReg(), LoadStoreStackMaskReg, STI);
+
+   MipsELFStreamer::EmitInstruction(MI, STI);
+
+   // Sandbox SP change.
+   if (MaskAfter) {
+     const unsigned StackReg = MI.getOperand(0).getReg();
+     assert((Mips::SP == StackReg) && "Unexpected stack-pointer register.");
+     emitMask(StackReg, LoadStoreStackMaskReg, STI);
+   }
+
+   EmitBundleUnlock();
+ }
+
+public:
+ /// This function is the one used to emit instruction data into the ELF
+ /// streamer. We override it to mask dangerous instructions.
+ void EmitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) override {
+ // Sandbox indirect jumps.
+ if (isIndirectJump(Inst)) {
+ if (PendingCall) // A call was just emitted, so this instruction sits in its delay slot.
+ report_fatal_error("Dangerous instruction in branch delay slot!");
+ sandboxIndirectJump(Inst, STI);
+ return;
+ }
+
+ // Sandbox loads, stores and SP changes.
+ unsigned AddrIdx; // Written by isBasePlusOffsetMemoryAccess only when it returns true.
+ bool IsStore; // Always written by isBasePlusOffsetMemoryAccess (reset to false first).
+ bool IsMemAccess = isBasePlusOffsetMemoryAccess(Inst.getOpcode(), &AddrIdx,
+ &IsStore);
+ bool IsSPFirstOperand = isStackPointerFirstOperand(Inst);
+ if (IsMemAccess || IsSPFirstOperand) {
+ bool MaskBefore = (IsMemAccess
+ && baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx)
+ .getReg()));
+ bool MaskAfter = IsSPFirstOperand && !IsStore; // A store with SP as operand 0 is not followed by a mask.
+ if (MaskBefore || MaskAfter) {
+ if (PendingCall)
+ report_fatal_error("Dangerous instruction in branch delay slot!");
+ sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter);
+ return;
+ }
+ // No mask needed: fall through to the call / delay-slot handling below.
+ }
+
+ // Sandbox calls by aligning call and branch delay to the bundle end.
+ // For indirect calls, emit the mask before the call.
+ bool IsIndirectCall;
+ if (isCall(Inst, &IsIndirectCall)) {
+ if (PendingCall)
+ report_fatal_error("Dangerous instruction in branch delay slot!");
+
+ // Start the sandboxing sequence by emitting call.
+ EmitBundleLock(true); // This lock is released only after the next instruction is emitted.
+ if (IsIndirectCall) {
+ unsigned TargetReg = Inst.getOperand(1).getReg(); // For JALR the call target is operand 1.
+ emitMask(TargetReg, IndirectBranchMaskReg, STI);
+ }
+ MipsELFStreamer::EmitInstruction(Inst, STI);
+ PendingCall = true;
+ return;
+ }
+ if (PendingCall) {
+ // Finish the sandboxing sequence by emitting branch delay.
+ MipsELFStreamer::EmitInstruction(Inst, STI);
+ EmitBundleUnlock();
+ PendingCall = false;
+ return;
+ }
+
+ // None of the sandboxing applies, just emit the instruction.
+ MipsELFStreamer::EmitInstruction(Inst, STI);
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, // Returns true for the listed load/store opcodes only.
+ bool *IsStore) {
+ if (IsStore) // IsStore is optional (may be null); AddrIdx is written without a null check.
+ *IsStore = false;
+
+ switch (Opcode) {
+ default:
+ return false; // Not a listed access; *AddrIdx is left unwritten.
+
+ // Load instructions with base address register in position 1.
+ case Mips::LB:
+ case Mips::LBu:
+ case Mips::LH:
+ case Mips::LHu:
+ case Mips::LW:
+ case Mips::LWC1:
+ case Mips::LDC1:
+ case Mips::LL:
+ case Mips::LL_R6:
+ case Mips::LWL:
+ case Mips::LWR:
+ *AddrIdx = 1;
+ return true;
+
+ // Store instructions with base address register in position 1.
+ case Mips::SB:
+ case Mips::SH:
+ case Mips::SW:
+ case Mips::SWC1:
+ case Mips::SDC1:
+ case Mips::SWL:
+ case Mips::SWR:
+ *AddrIdx = 1;
+ if (IsStore)
+ *IsStore = true;
+ return true;
+
+ // Store instructions with base address register in position 2.
+ case Mips::SC:
+ case Mips::SC_R6:
+ *AddrIdx = 2;
+ if (IsStore)
+ *IsStore = true;
+ return true;
+ }
+}
+
+/// Returns false for Mips::SP and Mips::T8, true for every other register.
+bool baseRegNeedsLoadStoreMask(unsigned Reg) {
+  // The contents of SP and thread pointer register do not require masking.
+  const bool IsExempt = (Reg == Mips::SP) || (Reg == Mips::T8);
+  return !IsExempt;
+}
+
+/// Create the NaCl sandboxing ELF streamer. Relax-all is switched on in the
+/// assembler when \p RelaxAll is set, and bundle alignment is enabled with
+/// MIPS_NACL_BUNDLE_ALIGN before the streamer is returned.
+MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                         raw_pwrite_stream &OS,
+                                         MCCodeEmitter *Emitter,
+                                         bool RelaxAll) {
+  MipsNaClELFStreamer *Streamer =
+      new MipsNaClELFStreamer(Context, TAB, OS, Emitter);
+
+  if (RelaxAll)
+    Streamer->getAssembler().setRelaxAll(true);
+
+  // Set bundle-alignment as required by the NaCl ABI for the target.
+  Streamer->EmitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN);
+
+  return Streamer;
+}
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
new file mode 100644
index 000000000000..24b602810d6e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -0,0 +1,95 @@
+//===-- MipsOptionRecord.cpp - Abstraction for storing information --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsOptionRecord.h"
+#include "MipsELFStreamer.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+void MipsRegInfoRecord::EmitMipsOptionRecord() { // Writes the register-usage record, then restores the previous section.
+ MCAssembler &MCA = Streamer->getAssembler();
+ MipsTargetStreamer *MTS =
+ static_cast<MipsTargetStreamer *>(Streamer->getTargetStreamer());
+
+ Streamer->PushSection(); // Paired with PopSection() at the end.
+
+ // We need to distinguish between N64 and the rest because at the moment
+ // we only emit .MIPS.options for N64; every other ABI gets .reginfo.
+ // Since .reginfo has the same information as .MIPS.options (ODK_REGINFO),
+ // we can use the same abstraction (MipsRegInfoRecord class) to handle both.
+ if (MTS->getABI().IsN64()) {
+ // The EntrySize value of 1 seems strange since the records are neither
+ // 1-byte long nor fixed length but it matches the value GAS emits.
+ MCSectionELF *Sec =
+ Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS,
+ ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1, "");
+ MCA.registerSection(*Sec);
+ Sec->setAlignment(8);
+ Streamer->SwitchSection(Sec);
+
+ Streamer->EmitIntValue(ELF::ODK_REGINFO, 1); // kind
+ Streamer->EmitIntValue(40, 1); // size: the fields emitted here add up to 40 bytes
+ Streamer->EmitIntValue(0, 2); // section
+ Streamer->EmitIntValue(0, 4); // info
+ Streamer->EmitIntValue(ri_gprmask, 4);
+ Streamer->EmitIntValue(0, 4); // pad
+ Streamer->EmitIntValue(ri_cprmask[0], 4);
+ Streamer->EmitIntValue(ri_cprmask[1], 4);
+ Streamer->EmitIntValue(ri_cprmask[2], 4);
+ Streamer->EmitIntValue(ri_cprmask[3], 4);
+ Streamer->EmitIntValue(ri_gp_value, 8); // gp value is 8 bytes wide in this layout
+ } else {
+ MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO,
+ ELF::SHF_ALLOC, 24, "");
+ MCA.registerSection(*Sec);
+ Sec->setAlignment(MTS->getABI().IsN32() ? 8 : 4);
+ Streamer->SwitchSection(Sec);
+
+ Streamer->EmitIntValue(ri_gprmask, 4);
+ Streamer->EmitIntValue(ri_cprmask[0], 4);
+ Streamer->EmitIntValue(ri_cprmask[1], 4);
+ Streamer->EmitIntValue(ri_cprmask[2], 4);
+ Streamer->EmitIntValue(ri_cprmask[3], 4);
+ assert((ri_gp_value & 0xffffffff) == ri_gp_value); // gp value must fit the 4-byte field below
+ Streamer->EmitIntValue(ri_gp_value, 4);
+ }
+
+ Streamer->PopSection();
+}
+
+/// Record that physical register \p Reg is used.
+///
+/// Walks \p Reg with MCSubRegIterator and, for each register visited, ORs the
+/// bit for its encoding value into the mask of the class it belongs to:
+/// ri_gprmask for GPR32/GPR64, ri_cprmask[0..3] for the coprocessor classes
+/// (index 1 also covers the FGR32/FGR64/AFGR64/MSA128B classes). Value is
+/// not reset between iterations, so bits accumulate across the walk.
+void MipsRegInfoRecord::SetPhysRegUsed(unsigned Reg,
+                                       const MCRegisterInfo *MCRegInfo) {
+  unsigned Value = 0;
+
+  for (MCSubRegIterator SubRegIt(Reg, MCRegInfo, true); SubRegIt.isValid();
+       ++SubRegIt) {
+    unsigned CurrentSubReg = *SubRegIt;
+
+    unsigned EncVal = MCRegInfo->getEncodingValue(CurrentSubReg);
+    // Shift an unsigned one: with a signed 1, encoding value 31 would shift
+    // into the sign bit of int.
+    Value |= 1u << EncVal;
+
+    if (GPR32RegClass->contains(CurrentSubReg) ||
+        GPR64RegClass->contains(CurrentSubReg))
+      ri_gprmask |= Value;
+    else if (COP0RegClass->contains(CurrentSubReg))
+      ri_cprmask[0] |= Value;
+    // MIPS COP1 is the FPU.
+    else if (FGR32RegClass->contains(CurrentSubReg) ||
+             FGR64RegClass->contains(CurrentSubReg) ||
+             AFGR64RegClass->contains(CurrentSubReg) ||
+             MSA128BRegClass->contains(CurrentSubReg))
+      ri_cprmask[1] |= Value;
+    else if (COP2RegClass->contains(CurrentSubReg))
+      ri_cprmask[2] |= Value;
+    else if (COP3RegClass->contains(CurrentSubReg))
+      ri_cprmask[3] |= Value;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
new file mode 100644
index 000000000000..7f79eb400f59
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -0,0 +1,1166 @@
+//===-- MipsTargetStreamer.cpp - Mips Target Streamer Methods -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetStreamer.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCExpr.h"
+#include "MipsMCTargetDesc.h"
+#include "MipsTargetObjectFile.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+ namespace {
+ static cl::opt<bool> RoundSectionSizes( // hidden option, off by default; read in MipsTargetELFStreamer::finish()
+ "mips-round-section-sizes", cl::init(false),
+ cl::desc("Round section sizes up to the section alignment"), cl::Hidden);
+ } // end anonymous namespace
+
+ /// Create the base Mips target streamer for \p S. ModuleDirectiveAllowed
+ /// starts out true and the GPR, FPR and frame "info set" flags start out
+ /// false.
+ MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S)
+     : MCTargetStreamer(S), ModuleDirectiveAllowed(true) {
+   FrameInfoSet = false;
+   FPRInfoSet = false;
+   GPRInfoSet = false;
+ }
+ void MipsTargetStreamer::emitDirectiveSetMicroMips() {} // base-class default: empty body, does nothing
+ void MipsTargetStreamer::emitDirectiveSetNoMicroMips() {}
+ void MipsTargetStreamer::setUsesMicroMips() {}
+ void MipsTargetStreamer::emitDirectiveSetMips16() {}
+ void MipsTargetStreamer::emitDirectiveSetNoMips16() { forbidModuleDirective(); } // many defaults below do nothing except this call
+ void MipsTargetStreamer::emitDirectiveSetReorder() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetNoReorder() {}
+ void MipsTargetStreamer::emitDirectiveSetMacro() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetNoMacro() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMsa() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetNoMsa() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetAt() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) { // RegNo is not used by this default
+ forbidModuleDirective();
+ }
+ void MipsTargetStreamer::emitDirectiveSetNoAt() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveEnd(StringRef Name) {}
+ void MipsTargetStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {}
+ void MipsTargetStreamer::emitDirectiveAbiCalls() {}
+ void MipsTargetStreamer::emitDirectiveNaN2008() {}
+ void MipsTargetStreamer::emitDirectiveNaNLegacy() {}
+ void MipsTargetStreamer::emitDirectiveOptionPic0() {}
+ void MipsTargetStreamer::emitDirectiveOptionPic2() {}
+ void MipsTargetStreamer::emitDirectiveInsn() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg) {} // arguments are ignored by this default
+ void MipsTargetStreamer::emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) {}
+ void MipsTargetStreamer::emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) {
+ }
+ void MipsTargetStreamer::emitDirectiveSetArch(StringRef Arch) {
+ forbidModuleDirective();
+ }
+ void MipsTargetStreamer::emitDirectiveSetMips0() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips1() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips2() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips3() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips4() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips5() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips32() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips32R2() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips32R3() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips32R5() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips32R6() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips64() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips64R2() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips64R3() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips64R5() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetMips64R6() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetPop() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetPush() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetSoftFloat() {
+ forbidModuleDirective();
+ }
+ void MipsTargetStreamer::emitDirectiveSetHardFloat() {
+ forbidModuleDirective();
+ }
+ void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {}
+ bool MipsTargetStreamer::emitDirectiveCpRestore(
+ int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ forbidModuleDirective();
+ return true; // this default always returns true; none of the arguments are used
+ }
+ void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+ const MCSymbol &Sym, bool IsReg) {
+ }
+ void MipsTargetStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister) {}
+
+ void MipsTargetStreamer::emitDirectiveModuleFP() {}
+
+void MipsTargetStreamer::emitDirectiveModuleOddSPReg() {
+ if (!ABIFlagsSection.OddSPReg && !ABIFlagsSection.Is32BitABI)
+ report_fatal_error("+nooddspreg is only valid for O32");
+}
+ void MipsTargetStreamer::emitDirectiveModuleSoftFloat() {} // base-class default: does nothing
+ void MipsTargetStreamer::emitDirectiveModuleHardFloat() {}
+ void MipsTargetStreamer::emitDirectiveSetFp(
+ MipsABIFlagsSection::FpABIKind Value) { // Value is not used by this default
+ forbidModuleDirective();
+ }
+ void MipsTargetStreamer::emitDirectiveSetOddSPReg() { forbidModuleDirective(); }
+ void MipsTargetStreamer::emitDirectiveSetNoOddSPReg() {
+ forbidModuleDirective();
+ }
+
+ /// Emit an instruction with opcode \p Opcode whose single operand is the
+ /// register \p Reg0. \p IDLoc is attached as the instruction's location and
+ /// \p STI is passed on to the streamer.
+ void MipsTargetStreamer::emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
+                                const MCSubtargetInfo *STI) {
+   MCInst Inst;
+   Inst.setLoc(IDLoc);
+   Inst.setOpcode(Opcode);
+   Inst.addOperand(MCOperand::createReg(Reg0));
+   getStreamer().EmitInstruction(Inst, *STI);
+ }
+
+ /// Emit an instruction with opcode \p Opcode and two operands: register
+ /// \p Reg0 followed by the arbitrary operand \p Op1.
+ void MipsTargetStreamer::emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1,
+                                 SMLoc IDLoc, const MCSubtargetInfo *STI) {
+   MCInst Inst;
+   Inst.setLoc(IDLoc);
+   Inst.setOpcode(Opcode);
+   // Operand order matters: the register comes first, then Op1.
+   Inst.addOperand(MCOperand::createReg(Reg0));
+   Inst.addOperand(Op1);
+   getStreamer().EmitInstruction(Inst, *STI);
+ }
+
+ /// Register + immediate form: wraps \p Imm in an immediate operand and
+ /// forwards to emitRX.
+ void MipsTargetStreamer::emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm,
+                                 SMLoc IDLoc, const MCSubtargetInfo *STI) {
+   MCOperand ImmOp = MCOperand::createImm(Imm);
+   emitRX(Opcode, Reg0, ImmOp, IDLoc, STI);
+ }
+
+ /// Register + register form: wraps \p Reg1 in a register operand and
+ /// forwards to emitRX.
+ void MipsTargetStreamer::emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+                                 SMLoc IDLoc, const MCSubtargetInfo *STI) {
+   MCOperand SecondReg = MCOperand::createReg(Reg1);
+   emitRX(Opcode, Reg0, SecondReg, IDLoc, STI);
+ }
+
+ /// Emit an instruction with opcode \p Opcode and two immediate operands,
+ /// \p Imm1 then \p Imm2.
+ void MipsTargetStreamer::emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2,
+                                 SMLoc IDLoc, const MCSubtargetInfo *STI) {
+   MCOperand First = MCOperand::createImm(Imm1);
+   MCOperand Second = MCOperand::createImm(Imm2);
+
+   MCInst Inst;
+   Inst.setLoc(IDLoc);
+   Inst.setOpcode(Opcode);
+   Inst.addOperand(First);
+   Inst.addOperand(Second);
+   getStreamer().EmitInstruction(Inst, *STI);
+ }
+
+ /// Emit an instruction with opcode \p Opcode and three operands: registers
+ /// \p Reg0 and \p Reg1 followed by the arbitrary operand \p Op2.
+ void MipsTargetStreamer::emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+                                  MCOperand Op2, SMLoc IDLoc,
+                                  const MCSubtargetInfo *STI) {
+   MCInst Inst;
+   Inst.setLoc(IDLoc);
+   Inst.setOpcode(Opcode);
+   // Operands are appended in positional order.
+   Inst.addOperand(MCOperand::createReg(Reg0));
+   Inst.addOperand(MCOperand::createReg(Reg1));
+   Inst.addOperand(Op2);
+   getStreamer().EmitInstruction(Inst, *STI);
+ }
+
+ /// Three-register form: wraps \p Reg2 in a register operand and forwards to
+ /// emitRRX.
+ void MipsTargetStreamer::emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+                                  unsigned Reg2, SMLoc IDLoc,
+                                  const MCSubtargetInfo *STI) {
+   MCOperand ThirdReg = MCOperand::createReg(Reg2);
+   emitRRX(Opcode, Reg0, Reg1, ThirdReg, IDLoc, STI);
+ }
+
+ /// Two registers + 16-bit immediate form: wraps \p Imm in an immediate
+ /// operand and forwards to emitRRX.
+ void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+                                  int16_t Imm, SMLoc IDLoc,
+                                  const MCSubtargetInfo *STI) {
+   MCOperand ImmOp = MCOperand::createImm(Imm);
+   emitRRX(Opcode, Reg0, Reg1, ImmOp, IDLoc, STI);
+ }
+
+ /// Emit DstReg = SrcReg + TrgReg using DADDu when \p Is64Bit is set and ADDu
+ /// otherwise. The instruction gets a default-constructed source location.
+ void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg,
+                                   unsigned TrgReg, bool Is64Bit,
+                                   const MCSubtargetInfo *STI) {
+   unsigned Opcode = Mips::ADDu;
+   if (Is64Bit)
+     Opcode = Mips::DADDu;
+   emitRRR(Opcode, DstReg, SrcReg, TrgReg, SMLoc(), STI);
+ }
+
+ /// Emit a 64-bit logical left shift of \p SrcReg into \p DstReg. Amounts of
+ /// 32 or more are emitted as DSLL32 with the amount reduced by 32; smaller
+ /// amounts are emitted as DSLL unchanged.
+ void MipsTargetStreamer::emitDSLL(unsigned DstReg, unsigned SrcReg,
+                                   int16_t ShiftAmount, SMLoc IDLoc,
+                                   const MCSubtargetInfo *STI) {
+   unsigned Opcode = Mips::DSLL;
+   int16_t Amount = ShiftAmount;
+   if (ShiftAmount >= 32) {
+     Opcode = Mips::DSLL32;
+     Amount = static_cast<int16_t>(ShiftAmount - 32);
+   }
+   emitRRI(Opcode, DstReg, SrcReg, Amount, IDLoc, STI);
+ }
+
+ /// Emit a do-nothing instruction for a delay slot: MOVE16_MM $zero, $zero
+ /// when \p hasShortDelaySlot is set, otherwise SLL $zero, $zero, 0.
+ void MipsTargetStreamer::emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc,
+                                             const MCSubtargetInfo *STI) {
+   if (!hasShortDelaySlot) {
+     emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+     return;
+   }
+   emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, STI);
+ }
+
+ /// Emit a nop, encoded as SLL $zero, $zero, 0.
+ void MipsTargetStreamer::emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI) {
+   const int16_t NoShift = 0;
+   emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, NoShift, IDLoc, STI);
+ }
+
+ /// Emit the $gp restore operation for .cprestore: an LW of $gp from
+ /// \p Offset($sp), with $gp also serving as the temporary register should
+ /// the offset need expanding.
+ void MipsTargetStreamer::emitGPRestore(int Offset, SMLoc IDLoc,
+                                        const MCSubtargetInfo *STI) {
+   const unsigned GPReg = Mips::GP;
+   emitLoadWithImmOffset(Mips::LW, GPReg, Mips::SP, Offset, GPReg, IDLoc, STI);
+ }
+
+ /// Emit a store instruction with an immediate offset.
+ ///
+ /// When \p Offset fits a signed 16-bit immediate the store is emitted as is.
+ /// Otherwise the address is built in the register returned by \p GetATReg;
+ /// if that callback returns 0, nothing is emitted.
+ void MipsTargetStreamer::emitStoreWithImmOffset(
+     unsigned Opcode, unsigned SrcReg, unsigned BaseReg, int64_t Offset,
+     function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+     const MCSubtargetInfo *STI) {
+   // Fast path: no expansion needed.
+   if (isInt<16>(Offset)) {
+     emitRRI(Opcode, SrcReg, BaseReg, Offset, IDLoc, STI);
+     return;
+   }
+
+   // Expansion, e.g.
+   //   sw $8, offset($8) => lui  $at, %hi(offset)
+   //                        addu $at, $at, $8
+   //                        sw   $8, %lo(offset)($at)
+   const unsigned Scratch = GetATReg();
+   if (Scratch == 0)
+     return;
+
+   const unsigned Low16 = Offset & 0x0000ffff;
+   unsigned High16 = (Offset & 0xffff0000) >> 16;
+
+   // The low part is sign-extended when used as an offset, so a set top bit
+   // must be compensated by incrementing the high part.
+   if ((Low16 & 0x8000) != 0)
+     ++High16;
+
+   // Build the base address in the scratch register ...
+   emitRI(Mips::LUi, Scratch, High16, IDLoc, STI);
+   if (BaseReg != Mips::ZERO)
+     emitRRR(Mips::ADDu, Scratch, Scratch, BaseReg, IDLoc, STI);
+   // ... then store relative to it.
+   emitRRI(Opcode, SrcReg, Scratch, Low16, IDLoc, STI);
+ }
+
+ /// Emit a store instruction with a symbol offset. Symbols are assumed to be
+ /// out of range for a simm16, so the store is always expanded:
+ ///
+ ///   sw $8, sym => lui $at, %hi(sym)
+ ///                 sw  $8, %lo(sym)($at)
+ ///
+ /// An ADDu of \p BaseReg is inserted between the two when \p BaseReg is not
+ /// $zero.
+ void MipsTargetStreamer::emitStoreWithSymOffset(
+     unsigned Opcode, unsigned SrcReg, unsigned BaseReg, MCOperand &HiOperand,
+     MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc,
+     const MCSubtargetInfo *STI) {
+   const bool HasRealBase = BaseReg != Mips::ZERO;
+
+   // High part of the address goes into ATReg.
+   emitRX(Mips::LUi, ATReg, HiOperand, IDLoc, STI);
+   if (HasRealBase)
+     emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI);
+   // Store through ATReg using the low part as the offset.
+   emitRRX(Opcode, SrcReg, ATReg, LoOperand, IDLoc, STI);
+ }
+
+ /// Emit a load instruction with an immediate offset. DstReg and TmpReg are
+ /// permitted to be the same register iff DstReg is distinct from BaseReg and
+ /// DstReg is a GPR. It is the caller's responsibility to identify such cases
+ /// and pass the appropriate register in TmpReg.
+ void MipsTargetStreamer::emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg,
+                                                unsigned BaseReg, int64_t Offset,
+                                                unsigned TmpReg, SMLoc IDLoc,
+                                                const MCSubtargetInfo *STI) {
+   // Fast path: the offset fits a signed 16-bit immediate.
+   if (isInt<16>(Offset)) {
+     emitRRI(Opcode, DstReg, BaseReg, Offset, IDLoc, STI);
+     return;
+   }
+
+   // Expansion; the final load always goes through TmpReg:
+   // 1) lw $8, offset($9) => lui  $8, %hi(offset)      (TmpReg == DstReg)
+   //                         addu $8, $8, $9
+   //                         lw   $8, %lo(offset)($8)
+   // 2) lw $8, offset($8) => lui  $at, %hi(offset)     (TmpReg == $at)
+   //                         addu $at, $at, $8
+   //                         lw   $8, %lo(offset)($at)
+   const unsigned Low16 = Offset & 0x0000ffff;
+   unsigned High16 = (Offset & 0xffff0000) >> 16;
+
+   // The low part is sign-extended when used as an offset, so a set top bit
+   // must be compensated by incrementing the high part.
+   if ((Low16 & 0x8000) != 0)
+     ++High16;
+
+   // Build the base address in TmpReg ...
+   emitRI(Mips::LUi, TmpReg, High16, IDLoc, STI);
+   if (BaseReg != Mips::ZERO)
+     emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
+   // ... then load relative to it.
+   emitRRI(Opcode, DstReg, TmpReg, Low16, IDLoc, STI);
+ }
+
+ /// Emit a load instruction with a symbol offset. Symbols are assumed to be
+ /// out of range for a simm16, so the load is always expanded.
+ /// DstReg and TmpReg are permitted to be the same register iff DstReg is a
+ /// GPR. It is the caller's responsibility to identify such cases and pass the
+ /// appropriate register in TmpReg.
+ ///
+ /// 1) lw $8, sym     => lui  $8, %hi(sym)
+ ///                      lw   $8, %lo(sym)($8)
+ /// 2) ldc1 $f0, sym  => lui  $at, %hi(sym)
+ ///                      ldc1 $f0, %lo(sym)($at)
+ void MipsTargetStreamer::emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg,
+                                                unsigned BaseReg,
+                                                MCOperand &HiOperand,
+                                                MCOperand &LoOperand,
+                                                unsigned TmpReg, SMLoc IDLoc,
+                                                const MCSubtargetInfo *STI) {
+   const bool HasRealBase = BaseReg != Mips::ZERO;
+
+   // High part of the address goes into TmpReg.
+   emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI);
+   if (HasRealBase)
+     emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
+   // Load through TmpReg using the low part as the offset.
+   emitRRX(Opcode, DstReg, TmpReg, LoOperand, IDLoc, STI);
+ }
+
+ MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : MipsTargetStreamer(S), OS(OS) {} // OS is the stream the directive text below is written to
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMicroMips() {
+ OS << "\t.set\tmicromips\n";
+ forbidModuleDirective(); // called directly: the base version of this directive is empty
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetNoMicroMips() {
+ OS << "\t.set\tnomicromips\n";
+ forbidModuleDirective();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips16() {
+ OS << "\t.set\tmips16\n";
+ forbidModuleDirective();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetNoMips16() {
+ OS << "\t.set\tnomips16\n";
+ MipsTargetStreamer::emitDirectiveSetNoMips16(); // base version calls forbidModuleDirective()
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetReorder() {
+ OS << "\t.set\treorder\n";
+ MipsTargetStreamer::emitDirectiveSetReorder();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetNoReorder() {
+ OS << "\t.set\tnoreorder\n";
+ forbidModuleDirective();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMacro() {
+ OS << "\t.set\tmacro\n";
+ MipsTargetStreamer::emitDirectiveSetMacro();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetNoMacro() {
+ OS << "\t.set\tnomacro\n";
+ MipsTargetStreamer::emitDirectiveSetNoMacro();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMsa() {
+ OS << "\t.set\tmsa\n";
+ MipsTargetStreamer::emitDirectiveSetMsa();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetNoMsa() {
+ OS << "\t.set\tnomsa\n";
+ MipsTargetStreamer::emitDirectiveSetNoMsa();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetAt() {
+ OS << "\t.set\tat\n";
+ MipsTargetStreamer::emitDirectiveSetAt();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) {
+ OS << "\t.set\tat=$" << Twine(RegNo) << "\n"; // RegNo is printed as a plain number after '$'
+ MipsTargetStreamer::emitDirectiveSetAtWithArg(RegNo);
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetNoAt() {
+ OS << "\t.set\tnoat\n";
+ MipsTargetStreamer::emitDirectiveSetNoAt();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveEnd(StringRef Name) {
+ OS << "\t.end\t" << Name << '\n';
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
+ OS << "\t.ent\t" << Symbol.getName() << '\n';
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveAbiCalls() { OS << "\t.abicalls\n"; }
+
+ void MipsTargetAsmStreamer::emitDirectiveNaN2008() { OS << "\t.nan\t2008\n"; }
+
+ void MipsTargetAsmStreamer::emitDirectiveNaNLegacy() {
+ OS << "\t.nan\tlegacy\n";
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveOptionPic0() {
+ OS << "\t.option\tpic0\n";
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveOptionPic2() {
+ OS << "\t.option\tpic2\n";
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveInsn() {
+ MipsTargetStreamer::emitDirectiveInsn(); // unlike most printers here, the base call comes before the text
+ OS << "\t.insn\n";
+ }
+
+ /// Print the ".frame" directive: the stack register name, the stack size and
+ /// the return register name. Both names are lower-cased and prefixed by '$'.
+ void MipsTargetAsmStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+                                       unsigned ReturnReg) {
+   const std::string StackName =
+       StringRef(MipsInstPrinter::getRegisterName(StackReg)).lower();
+   const std::string ReturnName =
+       StringRef(MipsInstPrinter::getRegisterName(ReturnReg)).lower();
+
+   OS << "\t.frame\t$" << StackName << "," << StackSize << ",$" << ReturnName
+      << '\n';
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetArch(StringRef Arch) {
+ OS << "\t.set arch=" << Arch << "\n"; // note: a space, not a tab, follows ".set" here
+ MipsTargetStreamer::emitDirectiveSetArch(Arch);
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips0() {
+ OS << "\t.set\tmips0\n";
+ MipsTargetStreamer::emitDirectiveSetMips0(); // base version calls forbidModuleDirective(); same pattern below
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips1() {
+ OS << "\t.set\tmips1\n";
+ MipsTargetStreamer::emitDirectiveSetMips1();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips2() {
+ OS << "\t.set\tmips2\n";
+ MipsTargetStreamer::emitDirectiveSetMips2();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips3() {
+ OS << "\t.set\tmips3\n";
+ MipsTargetStreamer::emitDirectiveSetMips3();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips4() {
+ OS << "\t.set\tmips4\n";
+ MipsTargetStreamer::emitDirectiveSetMips4();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips5() {
+ OS << "\t.set\tmips5\n";
+ MipsTargetStreamer::emitDirectiveSetMips5();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips32() {
+ OS << "\t.set\tmips32\n";
+ MipsTargetStreamer::emitDirectiveSetMips32();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips32R2() {
+ OS << "\t.set\tmips32r2\n";
+ MipsTargetStreamer::emitDirectiveSetMips32R2();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips32R3() {
+ OS << "\t.set\tmips32r3\n";
+ MipsTargetStreamer::emitDirectiveSetMips32R3();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips32R5() {
+ OS << "\t.set\tmips32r5\n";
+ MipsTargetStreamer::emitDirectiveSetMips32R5();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips32R6() {
+ OS << "\t.set\tmips32r6\n";
+ MipsTargetStreamer::emitDirectiveSetMips32R6();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips64() {
+ OS << "\t.set\tmips64\n";
+ MipsTargetStreamer::emitDirectiveSetMips64();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips64R2() {
+ OS << "\t.set\tmips64r2\n";
+ MipsTargetStreamer::emitDirectiveSetMips64R2();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips64R3() {
+ OS << "\t.set\tmips64r3\n";
+ MipsTargetStreamer::emitDirectiveSetMips64R3();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips64R5() {
+ OS << "\t.set\tmips64r5\n";
+ MipsTargetStreamer::emitDirectiveSetMips64R5();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetMips64R6() {
+ OS << "\t.set\tmips64r6\n";
+ MipsTargetStreamer::emitDirectiveSetMips64R6();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetDsp() {
+ OS << "\t.set\tdsp\n";
+ MipsTargetStreamer::emitDirectiveSetDsp();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetNoDsp() {
+ OS << "\t.set\tnodsp\n";
+ MipsTargetStreamer::emitDirectiveSetNoDsp();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetPop() {
+ OS << "\t.set\tpop\n";
+ MipsTargetStreamer::emitDirectiveSetPop();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetPush() {
+ OS << "\t.set\tpush\n";
+ MipsTargetStreamer::emitDirectiveSetPush();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetSoftFloat() {
+ OS << "\t.set\tsoftfloat\n";
+ MipsTargetStreamer::emitDirectiveSetSoftFloat();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetHardFloat() {
+ OS << "\t.set\thardfloat\n";
+ MipsTargetStreamer::emitDirectiveSetHardFloat();
+ }
+
+ // Print \p Value to \p OS as "0x" followed by eight hexadecimal digits, most
+ // significant nibble first. Each nibble is written separately, so leading
+ // zero digits are kept.
+ static void printHex32(unsigned Value, raw_ostream &OS) {
+   OS << "0x";
+   // Shift the value down and then mask. Building the mask as 0xF << 28 would
+   // overflow a signed int for the top nibble.
+   for (int i = 7; i >= 0; i--)
+     OS.write_hex((Value >> (i * 4)) & 0xF);
+ }
+
+ /// Print the ".mask" directive: the bitmask as 32-bit hex, a comma, then
+ /// the offset as a decimal integer.
+ void MipsTargetAsmStreamer::emitMask(unsigned CPUBitmask,
+                                      int CPUTopSavedRegOff) {
+   OS << "\t.mask \t";
+   printHex32(CPUBitmask, OS);
+   OS << ',';
+   OS << CPUTopSavedRegOff;
+   OS << '\n';
+ }
+
+ /// Print the ".fmask" directive: the bitmask as 32-bit hex, a comma, then
+ /// the offset as a decimal integer.
+ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask,
+                                       int FPUTopSavedRegOff) {
+   OS << "\t.fmask\t";
+   printHex32(FPUBitmask, OS);
+   OS << ",";
+   OS << FPUTopSavedRegOff;
+   OS << '\n';
+ }
+
+ /// Print ".cpload $<reg>" with the lower-cased name of \p RegNo, then call
+ /// forbidModuleDirective().
+ void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) {
+   const std::string RegName =
+       StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower();
+   OS << "\t.cpload\t$" << RegName << "\n";
+   forbidModuleDirective();
+ }
+
+ /// Run the base-class handling and then print ".cprestore <Offset>".
+ ///
+ /// \returns true unconditionally.
+ bool MipsTargetAsmStreamer::emitDirectiveCpRestore(
+     int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+     const MCSubtargetInfo *STI) {
+   // The base implementation's return value is not inspected.
+   MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI);
+
+   OS << "\t.cprestore\t";
+   OS << Offset;
+   OS << "\n";
+   return true;
+ }
+
+ /// Print ".cpsetup $<reg>, <save>, <symbol>" and call
+ /// forbidModuleDirective(). <save> is "$<register name>" when \p IsReg is
+ /// set and the integer \p RegOrOffset otherwise.
+ ///
+ /// NOTE(review): no trailing newline is written after the symbol name —
+ /// confirm whether the caller is expected to end the line.
+ void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
+                                                  int RegOrOffset,
+                                                  const MCSymbol &Sym,
+                                                  bool IsReg) {
+   const std::string RegName =
+       StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower();
+   OS << "\t.cpsetup\t$" << RegName << ", ";
+
+   if (!IsReg) {
+     OS << RegOrOffset;
+   } else {
+     OS << "$"
+        << StringRef(MipsInstPrinter::getRegisterName(RegOrOffset)).lower();
+   }
+
+   OS << ", " << Sym.getName();
+   forbidModuleDirective();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister) {
+ OS << "\t.cpreturn"; // neither argument is printed, and no newline is written here
+ forbidModuleDirective();
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveModuleFP() {
+ OS << "\t.module\tfp=";
+ OS << ABIFlagsSection.getFpABIString(ABIFlagsSection.getFpABI()) << "\n"; // value comes from the current ABIFlagsSection state
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetFp(
+ MipsABIFlagsSection::FpABIKind Value) {
+ MipsTargetStreamer::emitDirectiveSetFp(Value); // base version calls forbidModuleDirective()
+
+ OS << "\t.set\tfp=";
+ OS << ABIFlagsSection.getFpABIString(Value) << "\n";
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveModuleOddSPReg() {
+ MipsTargetStreamer::emitDirectiveModuleOddSPReg(); // base version may report a fatal error before anything is printed
+
+ OS << "\t.module\t" << (ABIFlagsSection.OddSPReg ? "" : "no") << "oddspreg\n";
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetOddSPReg() {
+ MipsTargetStreamer::emitDirectiveSetOddSPReg();
+ OS << "\t.set\toddspreg\n";
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveSetNoOddSPReg() {
+ MipsTargetStreamer::emitDirectiveSetNoOddSPReg();
+ OS << "\t.set\tnooddspreg\n";
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveModuleSoftFloat() {
+ OS << "\t.module\tsoftfloat\n";
+ }
+
+ void MipsTargetAsmStreamer::emitDirectiveModuleHardFloat() {
+ OS << "\t.module\thardfloat\n";
+ }
+
+ // This part is for ELF object output.
+
+ /// Construct the ELF target streamer: record whether the output is
+ /// position independent and set the ELF header e_flags that can be derived
+ /// from the subtarget features alone (architecture level, Octeon machine,
+ /// NaN2008, CPIC).
+ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
+                                              const MCSubtargetInfo &STI)
+     : MipsTargetStreamer(S), MicroMipsEnabled(false), STI(STI) {
+   MCAssembler &MCA = getStreamer().getAssembler();
+
+   // It's possible that MCObjectFileInfo isn't fully initialized at this point
+   // due to an initialization order problem where LLVMTargetMachine creates the
+   // target streamer before TargetLoweringObjectFile calls
+   // InitializeMCObjectFileInfo. There doesn't seem to be a single place that
+   // covers all cases so this statement covers most cases and direct object
+   // emission must call setPic() once MCObjectFileInfo has been initialized. The
+   // cases we don't handle here are covered by MipsAsmPrinter.
+   Pic = MCA.getContext().getObjectFileInfo()->isPositionIndependent();
+
+   const FeatureBitset &Features = STI.getFeatureBits();
+
+   // Set the header flags that we can in the constructor.
+   // FIXME: This is a fairly terrible hack. We set the rest
+   // of these in the destructor. The problem here is two-fold:
+   //
+   // a: Some of the eflags can be set/reset by directives.
+   // b: There aren't any usage paths that initialize the ABI
+   //    pointer until after we initialize either an assembler
+   //    or the target machine.
+   // We can fix this by making the target streamer construct
+   // the ABI, but this is fraught with wide ranging dependency
+   // issues as well.
+
+   // Architecture: the first matching feature in this chain wins, and
+   // EF_MIPS_ARCH_1 is used when none matches.
+   unsigned ArchFlag = ELF::EF_MIPS_ARCH_1;
+   if (Features[Mips::FeatureMips64r6])
+     ArchFlag = ELF::EF_MIPS_ARCH_64R6;
+   else if (Features[Mips::FeatureMips64r2] ||
+            Features[Mips::FeatureMips64r3] ||
+            Features[Mips::FeatureMips64r5])
+     ArchFlag = ELF::EF_MIPS_ARCH_64R2;
+   else if (Features[Mips::FeatureMips64])
+     ArchFlag = ELF::EF_MIPS_ARCH_64;
+   else if (Features[Mips::FeatureMips5])
+     ArchFlag = ELF::EF_MIPS_ARCH_5;
+   else if (Features[Mips::FeatureMips4])
+     ArchFlag = ELF::EF_MIPS_ARCH_4;
+   else if (Features[Mips::FeatureMips3])
+     ArchFlag = ELF::EF_MIPS_ARCH_3;
+   else if (Features[Mips::FeatureMips32r6])
+     ArchFlag = ELF::EF_MIPS_ARCH_32R6;
+   else if (Features[Mips::FeatureMips32r2] ||
+            Features[Mips::FeatureMips32r3] ||
+            Features[Mips::FeatureMips32r5])
+     ArchFlag = ELF::EF_MIPS_ARCH_32R2;
+   else if (Features[Mips::FeatureMips32])
+     ArchFlag = ELF::EF_MIPS_ARCH_32;
+   else if (Features[Mips::FeatureMips2])
+     ArchFlag = ELF::EF_MIPS_ARCH_2;
+
+   unsigned EFlags = MCA.getELFHeaderEFlags();
+   EFlags |= ArchFlag;
+
+   // Machine
+   if (Features[Mips::FeatureCnMips])
+     EFlags |= ELF::EF_MIPS_MACH_OCTEON;
+
+   // Other options.
+   if (Features[Mips::FeatureNaN2008])
+     EFlags |= ELF::EF_MIPS_NAN2008;
+
+   // -mabicalls and -mplt are not implemented but we should act as if they were
+   // given.
+   EFlags |= ELF::EF_MIPS_CPIC;
+
+   MCA.setELFHeaderEFlags(EFlags);
+ }
+
+ /// While microMIPS mode is enabled, register the label's symbol with the
+ /// assembler and, if its type is STT_FUNC, mark it STO_MIPS_MICROMIPS.
+ /// Outside microMIPS mode the label is left untouched.
+ void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
+   auto *ELFSym = cast<MCSymbolELF>(S);
+   if (isMicroMipsEnabled()) {
+     getStreamer().getAssembler().registerSymbol(*ELFSym);
+     const uint8_t SymType = ELFSym->getType();
+     if (SymType == ELF::STT_FUNC)
+       ELFSym->setOther(ELF::STO_MIPS_MICROMIPS);
+   }
+ }
+
+ void MipsTargetELFStreamer::finish() { // end-of-stream work: section alignment, remaining e_flags, option records, ABI flags
+ MCAssembler &MCA = getStreamer().getAssembler();
+ const MCObjectFileInfo &OFI = *MCA.getContext().getObjectFileInfo();
+
+ // .bss, .text and .data are always at least 16-byte aligned.
+ MCSection &TextSection = *OFI.getTextSection();
+ MCA.registerSection(TextSection);
+ MCSection &DataSection = *OFI.getDataSection();
+ MCA.registerSection(DataSection);
+ MCSection &BSSSection = *OFI.getBSSSection();
+ MCA.registerSection(BSSSection);
+
+ TextSection.setAlignment(std::max(16u, TextSection.getAlignment())); // never lowers an alignment already above 16
+ DataSection.setAlignment(std::max(16u, DataSection.getAlignment()));
+ BSSSection.setAlignment(std::max(16u, BSSSection.getAlignment()));
+
+ if (RoundSectionSizes) { // -mips-round-section-sizes, false by default
+ // Make sections sizes a multiple of the alignment. This is useful for
+ // verifying the output of IAS against the output of other assemblers but
+ // it's not necessary to produce a correct object and increases section
+ // size.
+ MCStreamer &OS = getStreamer();
+ for (MCSection &S : MCA) {
+ MCSectionELF &Section = static_cast<MCSectionELF &>(S);
+
+ unsigned Alignment = Section.getAlignment();
+ if (Alignment) { // sections reporting an alignment of 0 are skipped
+ OS.SwitchSection(&Section);
+ if (Section.UseCodeAlign())
+ OS.EmitCodeAlignment(Alignment, Alignment);
+ else
+ OS.EmitValueToAlignment(Alignment, 0, 1, Alignment);
+ }
+ }
+ }
+
+ const FeatureBitset &Features = STI.getFeatureBits();
+
+ // Update e_header flags. See the FIXME and comment above in
+ // the constructor for a full rundown on this.
+ unsigned EFlags = MCA.getELFHeaderEFlags();
+
+ // ABI
+ // N64 does not require any ABI bits.
+ if (getABI().IsO32())
+ EFlags |= ELF::EF_MIPS_ABI_O32;
+ else if (getABI().IsN32())
+ EFlags |= ELF::EF_MIPS_ABI2;
+
+ if (Features[Mips::FeatureGP64Bit]) {
+ if (getABI().IsO32())
+ EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */
+ } else if (Features[Mips::FeatureMips64r2] || Features[Mips::FeatureMips64]) // GP64Bit is off but a Mips64/Mips64r2 feature is on
+ EFlags |= ELF::EF_MIPS_32BITMODE;
+
+ // If we've set the cpic eflag and we're n64, go ahead and set the pic
+ // one as well.
+ if (EFlags & ELF::EF_MIPS_CPIC && getABI().IsN64())
+ EFlags |= ELF::EF_MIPS_PIC;
+
+ MCA.setELFHeaderEFlags(EFlags);
+
+ // Emit all the option records.
+ // At the moment we are only emitting .Mips.options (ODK_REGINFO) and
+ // .reginfo.
+ MipsELFStreamer &MEF = static_cast<MipsELFStreamer &>(Streamer); // unchecked downcast: assumes the underlying streamer is a MipsELFStreamer
+ MEF.EmitMipsOptionRecords();
+
+ emitMipsAbiFlags();
+ }
+
+ /// Propagate the microMIPS marking through a symbol assignment: when
+ /// \p Value is a plain symbol reference and the referenced symbol carries
+ /// STO_MIPS_MICROMIPS, mark \p S with STO_MIPS_MICROMIPS as well.
+ void MipsTargetELFStreamer::emitAssignment(MCSymbol *S, const MCExpr *Value) {
+   auto *Target = cast<MCSymbolELF>(S);
+   // Only a right-hand side that is a bare symbol reference is considered.
+   if (Value->getKind() == MCExpr::SymbolRef) {
+     const auto *RefExpr = static_cast<const MCSymbolRefExpr *>(Value);
+     const auto &Source = cast<MCSymbolELF>(RefExpr->getSymbol());
+
+     if (Source.getOther() & ELF::STO_MIPS_MICROMIPS)
+       Target->setOther(ELF::STO_MIPS_MICROMIPS);
+   }
+ }
+
+ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer); // unchecked downcast: assumes the underlying streamer is an MCELFStreamer
+ }
+
+ void MipsTargetELFStreamer::emitDirectiveSetMicroMips() {
+ MicroMipsEnabled = true; // only toggles the member here; the e_flags bit is set separately in setUsesMicroMips()
+ forbidModuleDirective();
+ }
+
+ void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() {
+ MicroMipsEnabled = false;
+ forbidModuleDirective();
+ }
+
+ /// Set EF_MIPS_MICROMIPS in the ELF header e_flags, keeping all other bits.
+ void MipsTargetELFStreamer::setUsesMicroMips() {
+   MCAssembler &Asm = getStreamer().getAssembler();
+   const unsigned Updated = Asm.getELFHeaderEFlags() | ELF::EF_MIPS_MICROMIPS;
+   Asm.setELFHeaderEFlags(Updated);
+ }
+
+ /// Set EF_MIPS_ARCH_ASE_M16 in the ELF header e_flags, then call
+ /// forbidModuleDirective().
+ void MipsTargetELFStreamer::emitDirectiveSetMips16() {
+   MCAssembler &Asm = getStreamer().getAssembler();
+   const unsigned Updated = Asm.getELFHeaderEFlags() | ELF::EF_MIPS_ARCH_ASE_M16;
+   Asm.setELFHeaderEFlags(Updated);
+   forbidModuleDirective();
+ }
+
+ /// Set EF_MIPS_NOREORDER in the ELF header e_flags, then call
+ /// forbidModuleDirective().
+ void MipsTargetELFStreamer::emitDirectiveSetNoReorder() {
+   MCAssembler &Asm = getStreamer().getAssembler();
+   const unsigned Updated = Asm.getELFHeaderEFlags() | ELF::EF_MIPS_NOREORDER;
+   Asm.setELFHeaderEFlags(Updated);
+   forbidModuleDirective();
+ }
+
+ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { // writes one .pdr record for symbol Name and sets that symbol's size
+ MCAssembler &MCA = getStreamer().getAssembler();
+ MCContext &Context = MCA.getContext();
+ MCStreamer &OS = getStreamer();
+
+ MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS, 0);
+
+ MCSymbol *Sym = Context.getOrCreateSymbol(Name);
+ const MCSymbolRefExpr *ExprRef =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context);
+
+ MCA.registerSection(*Sec);
+ Sec->setAlignment(4);
+
+ OS.PushSection(); // remember the current section; restored by PopSection() below
+
+ OS.SwitchSection(Sec);
+
+ OS.EmitValueImpl(ExprRef, 4); // first field: 4-byte reference to the symbol itself
+
+ OS.EmitIntValue(GPRInfoSet ? GPRBitMask : 0, 4); // reg_mask
+ OS.EmitIntValue(GPRInfoSet ? GPROffset : 0, 4); // reg_offset
+
+ OS.EmitIntValue(FPRInfoSet ? FPRBitMask : 0, 4); // fpreg_mask
+ OS.EmitIntValue(FPRInfoSet ? FPROffset : 0, 4); // fpreg_offset
+
+ OS.EmitIntValue(FrameInfoSet ? FrameOffset : 0, 4); // frame_offset
+ OS.EmitIntValue(FrameInfoSet ? FrameReg : 0, 4); // frame_reg
+ OS.EmitIntValue(FrameInfoSet ? ReturnReg : 0, 4); // return_reg
+
+ // The .end directive marks the end of a procedure. Invalidate
+ // the information gathered up until this point.
+ GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
+
+ OS.PopSection();
+
+ // .end also implicitly sets the size.
+ MCSymbol *CurPCSym = Context.createTempSymbol();
+ OS.EmitLabel(CurPCSym); // emitted after PopSection(), i.e. in the section that was current on entry
+ const MCExpr *Size = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context),
+ ExprRef, Context); // size = (current position label) - (symbol Name)
+ int64_t AbsSize;
+ if (!Size->evaluateAsAbsolute(AbsSize, MCA))
+ llvm_unreachable("Function size must be evaluatable as absolute");
+ Size = MCConstantExpr::create(AbsSize, Context); // replace the symbolic difference with its constant value
+ static_cast<MCSymbolELF *>(Sym)->setSize(Size);
+ }
+
+ /// Start a new procedure: clear the GPR/FPR/frame "info set" flags and give
+ /// \p Symbol the ELF type STT_FUNC.
+ void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
+   FrameInfoSet = false;
+   FPRInfoSet = false;
+   GPRInfoSet = false;
+
+   // .ent also acts like an implicit '.type symbol, STT_FUNC'
+   const auto &ELFSym = static_cast<const MCSymbolELF &>(Symbol);
+   ELFSym.setType(ELF::STT_FUNC);
+ }
+
+ /// Set both EF_MIPS_CPIC and EF_MIPS_PIC in the ELF header e_flags.
+ void MipsTargetELFStreamer::emitDirectiveAbiCalls() {
+   MCAssembler &Asm = getStreamer().getAssembler();
+   const unsigned PicBits = ELF::EF_MIPS_CPIC | ELF::EF_MIPS_PIC;
+   Asm.setELFHeaderEFlags(Asm.getELFHeaderEFlags() | PicBits);
+ }
+
+ /// Set EF_MIPS_NAN2008 in the ELF header e_flags.
+ void MipsTargetELFStreamer::emitDirectiveNaN2008() {
+   MCAssembler &Asm = getStreamer().getAssembler();
+   const unsigned Updated = Asm.getELFHeaderEFlags() | ELF::EF_MIPS_NAN2008;
+   Asm.setELFHeaderEFlags(Updated);
+ }
+
+/// Clear EF_MIPS_NAN2008 in the assembler's ELF header e_flags.
+void MipsTargetELFStreamer::emitDirectiveNaNLegacy() {
+  MCAssembler &Asm = getStreamer().getAssembler();
+  const unsigned WithoutNaN2008 =
+      Asm.getELFHeaderEFlags() & ~ELF::EF_MIPS_NAN2008;
+  Asm.setELFHeaderEFlags(WithoutNaN2008);
+}
+
+/// Switch this streamer to non-PIC mode and clear EF_MIPS_PIC in the ELF
+/// header e_flags.
+void MipsTargetELFStreamer::emitDirectiveOptionPic0() {
+  // This option overrides other PIC options like -KPIC.
+  Pic = false;
+
+  MCAssembler &Asm = getStreamer().getAssembler();
+  const unsigned NonPicFlags = Asm.getELFHeaderEFlags() & ~ELF::EF_MIPS_PIC;
+  Asm.setELFHeaderEFlags(NonPicFlags);
+}
+
+/// Switch this streamer to PIC mode and set both EF_MIPS_PIC and EF_MIPS_CPIC
+/// in the ELF header e_flags.
+void MipsTargetELFStreamer::emitDirectiveOptionPic2() {
+  Pic = true;
+
+  // NOTE: This mirrors GAS, where 'pic2' sets the CPIC bit as well as the PIC
+  // bit. The SYSV ABI instead treats EF_MIPS_PIC and EF_MIPS_CPIC as mutually
+  // exclusive.
+  MCAssembler &Asm = getStreamer().getAssembler();
+  const unsigned PicBits = ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
+  Asm.setELFHeaderEFlags(Asm.getELFHeaderEFlags() | PicBits);
+}
+
+/// Run the base-class '.insn' handling, then ask the underlying
+/// MipsELFStreamer to create its pending label relocations.
+void MipsTargetELFStreamer::emitDirectiveInsn() {
+  MipsTargetStreamer::emitDirectiveInsn();
+  static_cast<MipsELFStreamer &>(Streamer).createPendingLabelRelocs();
+}
+
+/// Record frame information for the current procedure.
+///
+/// \param StackReg   frame register; stored as its hardware encoding value.
+/// \param StackSize  stored as the frame offset.
+/// \param ReturnReg_ return register; stored as its hardware encoding value.
+void MipsTargetELFStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+                                      unsigned ReturnReg_) {
+  const MCRegisterInfo *MRI =
+      getStreamer().getAssembler().getContext().getRegisterInfo();
+
+  FrameReg = MRI->getEncodingValue(StackReg);
+  ReturnReg = MRI->getEncodingValue(ReturnReg_);
+  FrameOffset = StackSize;
+  FrameInfoSet = true;
+}
+
+/// Record the general-purpose register save mask and the offset that goes
+/// with it, and flag the GPR information as set.
+void MipsTargetELFStreamer::emitMask(unsigned CPUBitmask,
+                                     int CPUTopSavedRegOff) {
+  GPRBitMask = CPUBitmask;
+  GPROffset = CPUTopSavedRegOff;
+  GPRInfoSet = true;
+}
+
+/// Record the floating-point register save mask and the offset that goes
+/// with it, and flag the FPR information as set.
+void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask,
+                                      int FPUTopSavedRegOff) {
+  FPRBitMask = FPUBitmask;
+  FPROffset = FPUTopSavedRegOff;
+  FPRInfoSet = true;
+}
+
+/// Expand the '.cpload $reg' directive.
+///
+/// When PIC is enabled and the ABI is neither N32 nor N64, this emits
+///   lui   $gp, %hi(_gp_disp)
+///   addiu $gp, $gp, %lo(_gp_disp)
+///   addu  $gp, $gp, $reg
+/// and then calls forbidModuleDirective(). In every other configuration the
+/// directive emits nothing.
+///
+/// \param RegNo register added to $gp by the final ADDu.
+void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
+  if (!Pic || (getABI().IsN32() || getABI().IsN64()))
+    return;
+
+  // There's a GNU extension controlled by -mno-shared that allows
+  // locally-binding symbols to be accessed using absolute addresses.
+  // This is currently not supported. When supported -mno-shared makes
+  // .cpload expand to:
+  //   lui $gp, %hi(__gnu_local_gp)
+  //   addiu $gp, $gp, %lo(__gnu_local_gp)
+
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCContext &Ctx = MCA.getContext();
+
+  // Look the symbol up once and reuse it for both the %hi and %lo halves.
+  MCSymbol *GP_Disp = Ctx.getOrCreateSymbol("_gp_disp");
+  MCA.registerSymbol(*GP_Disp);
+
+  // lui $gp, %hi(_gp_disp)
+  MCInst TmpInst;
+  TmpInst.setOpcode(Mips::LUi);
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  const MCExpr *HiSym = MipsMCExpr::create(
+      MipsMCExpr::MEK_HI,
+      MCSymbolRefExpr::create(GP_Disp, MCSymbolRefExpr::VK_None, Ctx), Ctx);
+  TmpInst.addOperand(MCOperand::createExpr(HiSym));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  TmpInst.clear();
+
+  // addiu $gp, $gp, %lo(_gp_disp)
+  TmpInst.setOpcode(Mips::ADDiu);
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  const MCExpr *LoSym = MipsMCExpr::create(
+      MipsMCExpr::MEK_LO,
+      MCSymbolRefExpr::create(GP_Disp, MCSymbolRefExpr::VK_None, Ctx), Ctx);
+  TmpInst.addOperand(MCOperand::createExpr(LoSym));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  TmpInst.clear();
+
+  // addu $gp, $gp, $reg
+  TmpInst.setOpcode(Mips::ADDu);
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::createReg(RegNo));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  forbidModuleDirective();
+}
+
+/// Expand the '.cprestore offset' directive.
+///
+/// The base-class handler always runs first. Then, only when PIC is enabled
+/// and the ABI is neither N32 nor N64, $gp is stored at offset($sp):
+///   sw $gp, offset($sp)
+/// (a corresponding LW is added after every JAL). The directive is otherwise
+/// ignored.
+///
+/// \returns true on every path.
+bool MipsTargetELFStreamer::emitDirectiveCpRestore(
+    int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+    const MCSubtargetInfo *STI) {
+  MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI);
+
+  // .cprestore is ignored with the N32 and N64 ABIs and in non-PIC mode.
+  const bool StoreGP = Pic && !getABI().IsN32() && !getABI().IsN64();
+  if (StoreGP) {
+    // Store the $gp on the stack.
+    emitStoreWithImmOffset(Mips::SW, Mips::GP, Mips::SP, Offset, GetATReg,
+                           IDLoc, STI);
+  }
+  return true;
+}
+
+/// Expand the '.cpsetup' directive.
+///
+/// Nothing is emitted unless PIC is enabled and the ABI is N32 or N64.
+/// Otherwise the old $gp is saved (to a register or to the stack) and $gp is
+/// reloaded:
+///  - N32: lui/addiu with %hi/%lo of __gnu_local_gp.
+///  - N64: lui/addiu with %hi/%lo of %neg(%gp_rel(Sym)), then daddu with
+///    \p RegNo.
+///
+/// \param RegNo       register added to $gp by the final daddu (N64 only).
+/// \param RegOrOffset save register when \p IsReg is true, otherwise the
+///                    $sp-relative offset used for the save.
+/// \param Sym         symbol used in the %gp_rel expressions (N64 only).
+/// \param IsReg       selects between the register and the stack save.
+void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
+                                                 int RegOrOffset,
+                                                 const MCSymbol &Sym,
+                                                 bool IsReg) {
+  // Only N32 and N64 emit anything for .cpsetup iff PIC is set.
+  if (!Pic || !(getABI().IsN32() || getABI().IsN64()))
+    return;
+
+  forbidModuleDirective();
+
+  MCContext &Ctx = getStreamer().getAssembler().getContext();
+
+  // Either store the old $gp in a register or on the stack
+  if (IsReg) {
+    // move $save, $gpreg
+    emitRRR(Mips::OR64, RegOrOffset, Mips::GP, Mips::ZERO, SMLoc(), &STI);
+  } else {
+    // sd $gpreg, offset($sp)
+    emitRRI(Mips::SD, Mips::GP, Mips::SP, RegOrOffset, SMLoc(), &STI);
+  }
+
+  if (getABI().IsN32()) {
+    MCSymbol *GPSym = Ctx.getOrCreateSymbol("__gnu_local_gp");
+    const MipsMCExpr *HiExpr = MipsMCExpr::create(
+        MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, Ctx), Ctx);
+    const MipsMCExpr *LoExpr = MipsMCExpr::create(
+        MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(GPSym, Ctx), Ctx);
+
+    // lui $gp, %hi(__gnu_local_gp)
+    emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
+
+    // addiu $gp, $gp, %lo(__gnu_local_gp)
+    emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr),
+            SMLoc(), &STI);
+
+    return;
+  }
+
+  const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
+      MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, Ctx), Ctx);
+  const MipsMCExpr *LoExpr = MipsMCExpr::createGpOff(
+      MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(&Sym, Ctx), Ctx);
+
+  // lui $gp, %hi(%neg(%gp_rel(funcSym)))
+  emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
+
+  // addiu $gp, $gp, %lo(%neg(%gp_rel(funcSym)))
+  emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr),
+          SMLoc(), &STI);
+
+  // daddu $gp, $gp, $funcreg
+  emitRRR(Mips::DADDu, Mips::GP, Mips::GP, RegNo, SMLoc(), &STI);
+}
+
+/// Expand the '.cpreturn' directive: restore $gp from where it was saved.
+///
+/// Nothing is emitted unless PIC is enabled and the ABI is N32 or N64.
+///
+/// \param SaveLocation           register number, or the immediate offset
+///                               from $sp, holding the saved $gp.
+/// \param SaveLocationIsRegister selects how \p SaveLocation is interpreted.
+void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
+                                                  bool SaveLocationIsRegister) {
+  // Only N32 and N64 emit anything for .cpreturn iff PIC is set.
+  if (!(Pic && (getABI().IsN32() || getABI().IsN64())))
+    return;
+
+  // Either restore the old $gp from a register (OR with $zero) or from the
+  // stack (LD relative to $sp). $gp is the destination in both forms.
+  MCInst RestoreGP;
+  RestoreGP.setOpcode(SaveLocationIsRegister ? Mips::OR : Mips::LD);
+  RestoreGP.addOperand(MCOperand::createReg(Mips::GP));
+  if (SaveLocationIsRegister) {
+    RestoreGP.addOperand(MCOperand::createReg(SaveLocation));
+    RestoreGP.addOperand(MCOperand::createReg(Mips::ZERO));
+  } else {
+    RestoreGP.addOperand(MCOperand::createReg(Mips::SP));
+    RestoreGP.addOperand(MCOperand::createImm(SaveLocation));
+  }
+  getStreamer().EmitInstruction(RestoreGP, STI);
+
+  forbidModuleDirective();
+}
+
+/// Create and register the allocatable '.MIPS.abiflags' section (type
+/// SHT_MIPS_ABIFLAGS, alignment 8), switch to it, and stream ABIFlagsSection
+/// into it.
+void MipsTargetELFStreamer::emitMipsAbiFlags() {
+  MCAssembler &Asm = getStreamer().getAssembler();
+  // NOTE(review): the 24 is presumably the section entry size -- confirm
+  // against MCContext::getELFSection.
+  MCSectionELF *AbiFlagsSec = Asm.getContext().getELFSection(
+      ".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24, "");
+  Asm.registerSection(*AbiFlagsSec);
+  AbiFlagsSec->setAlignment(8);
+
+  MCStreamer &OS = getStreamer();
+  OS.SwitchSection(AbiFlagsSec);
+  OS << ABIFlagsSection;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MSA.txt b/contrib/llvm/lib/Target/Mips/MSA.txt
new file mode 100644
index 000000000000..113375fa7f2f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MSA.txt
@@ -0,0 +1,83 @@
+Code Generation Notes for MSA
+=============================
+
+Intrinsics are lowered to SelectionDAG nodes where possible in order to enable
+optimisation, reduce the size of the ISel matcher, and reduce repetition in
+the implementation. In a small number of cases, this can cause different
+(semantically equivalent) instructions to be used in place of the requested
+instruction, even when no optimisation has taken place.
+
+Instructions
+============
+
+This section describes any quirks of instruction selection for MSA. For
+example, two instructions might be equally valid for some given IR and one is
+chosen in preference to the other.
+
+bclri.b:
+ It is not possible to emit bclri.b since andi.b covers exactly the
+ same cases. andi.b should use fractionally less power than bclri.b in
+ most hardware implementations so it is used in preference to bclri.b.
+
+vshf.w:
+ It is not possible to emit vshf.w when the shuffle description is
+ constant since shf.w covers exactly the same cases. shf.w is used
+ instead. It is also impossible for the shuffle description to be
+ unknown at compile-time due to the definition of shufflevector in
+ LLVM IR.
+
+vshf.[bhwd]:
+ When the shuffle description describes a splat operation, splat.[bhwd]
+ instructions will be selected instead of vshf.[bhwd]. Unlike the ilv*
+ and pck* instructions, this is matched from MipsISD::VSHF instead of
+ a special-case MipsISD node.
+
+ilvl.d, pckev.d:
+ It is not possible to emit ilvl.d, or pckev.d since ilvev.d covers the
+ same shuffle. ilvev.d will be emitted instead.
+
+ilvr.d, ilvod.d, pckod.d:
+ It is not possible to emit ilvr.d, or pckod.d since ilvod.d covers the
+ same shuffle. ilvod.d will be emitted instead.
+
+splat.[bhwd]:
+ The intrinsic will work as expected. However, unlike other intrinsics
+ it lowers directly to MipsISD::VSHF instead of using common IR.
+
+splati.w:
+ It is not possible to emit splati.w since shf.w covers the same cases.
+ shf.w will be emitted instead.
+
+copy_s.w:
+ On MIPS32, the copy_u.w intrinsic will emit this instruction instead of
+ copy_u.w. This is semantically equivalent since the general-purpose
+ register file is 32 bits wide.
+
+binsri.[bhwd], binsli.[bhwd]:
+ These two operations are equivalent to each other with the operands
+ swapped and condition inverted. The compiler may use either one as
+ appropriate.
+ Furthermore, the compiler may use bsel.[bhwd] for some masks that do
+ not survive the legalization process (this is a bug and will be fixed).
+
+bmnz.v, bmz.v, bsel.v:
+ These three operations differ only in the operand that is tied to the
+ result and the order of the operands.
+ It is (currently) not possible to emit bmz.v, or bsel.v since bmnz.v is
+ the same operation and will be emitted instead.
+ In future, the compiler may choose between these three instructions
+ according to register allocation.
+ These three operations can be very confusing so here is a mapping
+ between the instructions and the vselect node in one place:
+ bmz.v wd, ws, wt/i8 -> (vselect wt/i8, wd, ws)
+ bmnz.v wd, ws, wt/i8 -> (vselect wt/i8, ws, wd)
+ bsel.v wd, ws, wt/i8 -> (vselect wd, wt/i8, ws)
+
+bmnzi.b, bmzi.b:
+ Like their non-immediate counterparts, bmnzi.b and bmzi.b are the same
+ operation with the operands swapped. bmnzi.b will (currently) be emitted
+ for both cases.
+
+bseli.b:
+ Unlike the non-immediate versions, bseli.b is distinguishable from
+ bmnzi.b and bmzi.b and can be emitted.
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
new file mode 100644
index 000000000000..2f0933277e81
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -0,0 +1,1094 @@
+//=- MicroMips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes microMIPS32r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+// Mixin that sets Arch to "micromipsr6", stores the given mnemonic string in
+// BaseOpcode, and selects the "MicroMipsR6" decoder namespace.
+class MMR6Arch<string opstr> {
+ string Arch = "micromipsr6";
+ string BaseOpcode = opstr;
+ string DecoderNamespace = "MicroMipsR6";
+}
+
+// Class used for microMIPS32r6 and microMIPS64r6 instructions.
+// Selects the "MicroMipsR6" decoder namespace and requires the
+// HasMicroMips32r6 predicate.
+class MicroMipsR6Inst16 : PredicateControl {
+ string DecoderNamespace = "MicroMipsR6";
+ let InsnPredicates = [HasMicroMips32r6];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Disambiguators
+//
+//===----------------------------------------------------------------------===//
+//
+// Some encodings are ambiguous except by comparing field values.
+
+// DecodeDisambiguates<Name> variant that places the record in the
+// "MicroMipsR6_Ambiguous" decoder namespace.
+class MMDecodeDisambiguatedBy<string Name> : DecodeDisambiguates<Name> {
+ string DecoderNamespace = "MicroMipsR6_Ambiguous";
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
+// 16-bit format: 0x33 (15-10) | offset (9-0).
+class BC16_FM_MM16R6 {
+ bits<10> offset;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x33;
+ let Inst{9-0} = offset;
+}
+
+// 16-bit format: op (15-10) | rs (9-7) | offset (6-0).
+class BEQZC_BNEZC_FM_MM16R6<bits<6> op> : MicroMipsR6Inst16 {
+ bits<3> rs;
+ bits<7> offset;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = op;
+ let Inst{9-7} = rs;
+ let Inst{6-0} = offset;
+}
+
+// 16-bit format: 0x11 (15-10) | rs (9-5) | op (4-0).
+class POOL16C_JALRC_FM_MM16R6<bits<5> op> {
+ bits<5> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-5} = rs;
+ let Inst{4-0} = op;
+}
+
+// 32-bit format: 0b011101 (31-26) | rt (25-21) | rs (20-16) | offset (15-0).
+class POP35_BOVC_FM_MMR6<string instr_asm> : MipsR6Inst, MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011101;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: 0b011111 (31-26) | rt (25-21) | rs (20-16) | offset (15-0).
+class POP37_BNVC_FM_MMR6<string instr_asm> : MipsR6Inst, MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011111;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+// 16-bit format: 0x11 (15-10) | imm (9-5) | op (4-0).
+class POOL16C_JRCADDIUSP_FM_MM16R6<bits<5> op> {
+ bits<5> imm;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-5} = imm;
+ let Inst{4-0} = op;
+}
+
+// 16-bit format: 0x11 (15-10) | rt (9-8) | addr (7-4) | funct (3-0).
+class POOL16C_LWM_SWM_FM_MM16R6<bits<4> funct> {
+ bits<2> rt;
+ bits<4> addr;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-8} = rt;
+ let Inst{7-4} = addr;
+ let Inst{3-0} = funct;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rd (20-16) | 0b0000 (15-12) |
+// funct (11-6) | 0b111100 (5-0).
+class POOL32A_BITSWAP_FM_MMR6<bits<6> funct> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rd;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+// 32-bit format: opgroup (31-26) | hint (25-21) | addr{20-16} (20-16) |
+// funct (15-12) | addr{11-0} (11-0). Bits 15-12 of addr are not encoded.
+class CACHE_PREF_FM_MMR6<bits<6> opgroup, bits<4> funct> : MipsR6Inst {
+ bits<21> addr;
+ bits<5> hint;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = opgroup;
+ let Inst{25-21} = hint;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-12} = funct;
+ let Inst{11-0} = addr{11-0};
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rs (20-16) | rd (15-11) | 0 (10) |
+// funct (9-0).
+class ARITH_FM_MMR6<string instr_asm, bits<10> funct> : MMR6Arch<instr_asm> {
+ bits<5> rd;
+ bits<5> rt;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0;
+ let Inst{9-0} = funct;
+}
+
+// 32-bit format: op (31-26) | rt (25-21) | rs (20-16) | imm16 (15-0).
+class ADDI_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm16;
+}
+
+// 32-bit format: op (31-26) | hint (25-21) | base (20-16) | 0b1010 (15-12) |
+// funct (11-9) | offset (8-0). base and offset are slices of the 21-bit addr
+// operand (addr{20-16} and addr{8-0}).
+class POOL32C_ST_EVA_FM_MMR6<bits<6> op, bits<3> funct> : MipsR6Inst {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = hint;
+ let Inst{20-16} = base;
+ let Inst{15-12} = 0b1010;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+// 32-bit format: 0b000111 (31-26) | rt (25-21) | base (20-16) |
+// offset (15-0). base/offset are addr{20-16} and addr{15-0}.
+class LB32_FM_MMR6 : MipsR6Inst {
+ bits<21> addr;
+ bits<5> rt;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000111;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: 0b000101 (31-26) | rt (25-21) | base (20-16) |
+// offset (15-0). base/offset are addr{20-16} and addr{15-0}.
+class LBU32_FM_MMR6 : MipsR6Inst {
+ bits<21> addr;
+ bits<5> rt;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000101;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: 0b011000 (31-26) | rt (25-21) | addr{20-16} (20-16) |
+// 0b0110 (15-12) | funct (11-9) | addr{8-0} (8-0).
+class POOL32C_LB_LBU_FM_MMR6<bits<3> funct> : MipsR6Inst {
+ bits<21> addr;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-12} = 0b0110;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = addr{8-0};
+}
+
+// 32-bit format: 0 (31-26) | rd (25-21) | rt (20-16) | funct (15-6) |
+// 0b111100 (5-0).
+class SIGN_EXTEND_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm> {
+ bits<5> rd;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+// 32-bit format: 0b011110 (31-26) | rt (25-21) | funct (20-19) | imm (18-0).
+class PCREL19_FM_MMR6<bits<2> funct> : MipsR6Inst {
+ bits<5> rt;
+ bits<19> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011110;
+ let Inst{25-21} = rt;
+ let Inst{20-19} = funct;
+ let Inst{18-0} = imm;
+}
+
+// 32-bit format: 0b011110 (31-26) | rt (25-21) | funct (20-16) | imm (15-0).
+class PCREL16_FM_MMR6<bits<5> funct> : MipsR6Inst {
+ bits<5> rt;
+ bits<16> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011110;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = funct;
+ let Inst{15-0} = imm;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rs (20-16) | rd (15-11) | 0 (10) |
+// funct (9-0).
+class POOL32A_FM_MMR6<bits<10> funct> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0;
+ let Inst{9-0} = funct;
+}
+
+// 32-bit format with no operands: every field is zero except op in bits 15-11.
+class POOL32A_PAUSE_FM_MMR6<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = 0;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = op;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = 0;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rd (20-16) | funct (15-6) |
+// 0b111100 (5-0).
+class POOL32A_RDPGPR_FM_MMR6<bits<10> funct> {
+ bits<5> rt;
+ bits<5> rd;
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rd;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rs (20-16) | 0 (15-14) |
+// sel (13-11) | 0 (10) | 0b0111000000 (9-0).
+class POOL32A_RDHWR_FM_MMR6 {
+ bits<5> rt;
+ bits<5> rs;
+ bits<3> sel;
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0;
+ let Inst{13-11} = sel;
+ let Inst{10} = 0;
+ let Inst{9-0} = 0b0111000000;
+}
+
+// 32-bit format: 0 (31-26) | 0 (25-21) | stype (20-16) |
+// 0b0110101101 (15-6) | 0b111100 (5-0).
+class POOL32A_SYNC_FM_MMR6 {
+ bits<5> stype;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = 0;
+ let Inst{20-16} = stype;
+ let Inst{15-6} = 0b0110101101;
+ let Inst{5-0} = 0b111100;
+}
+
+// 32-bit format: 0b010000 (31-26) | 0b01100 (25-21) | base (20-16) |
+// immediate (15-0). base/immediate are addr{20-16} and addr{15-0}.
+class POOL32I_SYNCI_FM_MMR6 {
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> immediate = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010000;
+ let Inst{25-21} = 0b01100;
+ let Inst{20-16} = base;
+ let Inst{15-0} = immediate;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rs (20-16) | funct (15-6) |
+// 0b111100 (5-0).
+class POOL32A_2R_FM_MMR6<bits<10> funct> : MipsR6Inst {
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+// 32-bit format: 0 (31-26) | rs (25-21) | 0 (20-16) | rt (15-11) |
+// 0b00001 (10-6) | funct (5-0).
+class SPECIAL_2R_FM_MMR6<bits<6> funct> : MipsR6Inst {
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-11} = rt;
+ let Inst{10-6} = 0b00001;
+ let Inst{5-0} = funct;
+}
+
+// 32-bit format: 0 (31-26) | rs (25-21) | rt (20-16) | rd (15-11) |
+// bp (10-9) | 0b000 (8-6) | funct (5-0).
+class POOL32A_ALIGN_FM_MMR6<bits<6> funct> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+ bits<2> bp;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-9} = bp;
+ let Inst{8-6} = 0b000;
+ let Inst{5-0} = funct;
+}
+
+// 32-bit format: 0b000100 (31-26) | rt (25-21) | rs (20-16) | imm (15-0).
+class AUI_FM_MMR6 : MipsR6Inst {
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000100;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rs (20-16) | rd (15-11) |
+// imm2 (10-9) | 0b000 (8-6) | funct (5-0).
+class POOL32A_LSA_FM<bits<6> funct> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+ bits<2> imm2;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10-9} = imm2;
+ let Inst{8-6} = 0b000;
+ let Inst{5-0} = funct;
+}
+
+// 32-bit format: op (31-26) | rt (25-21) | base (20-16) | offset (15-0).
+// base/offset are addr{20-16} and addr{15-0}.
+class SB32_SH32_STORE_FM_MMR6<bits<6> op> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: 0b011000 (31-26) | rt (25-21) | base (20-16) |
+// 0b1010 (15-12) | funct (11-9) | offset (8-0). base/offset are addr{20-16}
+// and addr{8-0}.
+class POOL32C_STORE_EVA_FM_MMR6<bits<3> funct> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = 0b1010;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+// 32-bit format: 0b011000 (31-26) | rt (25-21) | base (20-16) |
+// 0b0110 (15-12) | funct (11-9) | offset (8-0). base/offset are addr{20-16}
+// and addr{8-0}.
+class LOAD_WORD_EVA_FM_MMR6<bits<3> funct> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = 0b0110;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+// 32-bit format: 0b111111 (31-26) | rt (25-21) | base (20-16) |
+// offset (15-0). base/offset are addr{20-16} and addr{15-0}.
+class LOAD_WORD_FM_MMR6 {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b111111;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: 0b000100 (31-26) | rt (25-21) | 0 (20-16) | imm16 (15-0).
+class LOAD_UPPER_IMM_FM_MMR6 {
+ bits<5> rt;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000100;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = 0;
+ let Inst{15-0} = imm16;
+}
+
+// 32-bit format: funct (31-26) | rt (25-21) | 0 (20-16) | offset (15-0).
+class CMP_BRANCH_1R_RT_OFF16_FM_MMR6<string instr_asm, bits<6> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: funct (31-26) | rt (25-21) | rt (20-16) | offset (15-0).
+// The single rt operand is encoded in both register fields.
+class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<string instr_asm, bits<6> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rs (20-16) | funct (15-6) |
+// 0b111100 (5-0).
+class POOL32A_JALRC_FM_MMR6<string instr_asm, bits<10> funct>
+ : MipsR6Inst, MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rs (20-16) | size (15-11) |
+// pos (10-6) | funct (5-0).
+class POOL32A_EXT_INS_FM_MMR6<string instr_asm, bits<6> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> size;
+ bits<5> pos;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = size;
+ let Inst{10-6} = pos;
+ let Inst{5-0} = funct;
+}
+
+// 32-bit format with no operands: 0 (31-26) | 0 (25-16) | funct (15-6) |
+// 0x3c (5-0).
+class POOL32A_ERET_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm> {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-16} = 0x00;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// Fully fixed 32-bit encoding: 0 (31-26) | 0 (25-17) | 1 (16) |
+// 0x3cd (15-6) | 0x3c (5-0).
+class ERETNC_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm> {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-17} = 0x00;
+ let Inst{16-16} = 0x01;
+ let Inst{15-6} = 0x3cd;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit format: 0 (31-26) | code_1 (25-16) | code_2 (15-6) | 0x07 (5-0).
+class BREAK_MMR6_ENC<string instr_asm> : MMR6Arch<instr_asm> {
+ bits<10> code_1;
+ bits<10> code_2;
+ bits<32> Inst;
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = code_1;
+ let Inst{15-6} = code_2;
+ let Inst{5-0} = 0x07;
+}
+
+// 32-bit format with no operands: every field is zero except op in bits 15-11.
+class BARRIER_MMR6_ENC<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-21} = 0x0;
+ let Inst{20-16} = 0x0;
+ let Inst{15-11} = op;
+ let Inst{10-6} = 0x0;
+ let Inst{5-0} = 0x0;
+}
+
+// 32-bit format: 0 (31-26) | 0 (25-21) | rt (20-16) | funct (15-6) |
+// 0x3c (5-0).
+class POOL32A_EIDI_MMR6_ENC<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm> {
+ bits<32> Inst;
+ bits<5> rt; // Actually rs but we're sharing code with the standard encodings which call it rt
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = 0x00;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit format: 0 (31-26) | rd (25-21) | rt (20-16) | shamt (15-11) |
+// rotate (10) | funct (9-0).
+class SHIFT_MMR6_ENC<string instr_asm, bits<10> funct, bit rotate> : MMR6Arch<instr_asm> {
+ bits<5> rd;
+ bits<5> rt;
+ bits<5> shamt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = shamt;
+ let Inst{10} = rotate;
+ let Inst{9-0} = funct;
+}
+
+// 32-bit format: op (31-26) | rt (25-21) | addr{20-16} (20-16) |
+// addr{15-0} (15-0).
+class SW32_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<21> addr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-0} = addr{15-0};
+}
+
+// 32-bit format: op (31-26) | rt (25-21) | base (20-16) | fmt (15-12) |
+// funct (11-9) | offset (8-0). base/offset are addr{20-16} and addr{8-0}.
+class POOL32C_SWE_FM_MMR6<string instr_asm, bits<6> op, bits<4> fmt,
+ bits<3> funct> : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = fmt;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | fd (15-11) |
+// 0 (10) | fmt (9-8) | funct (7-0).
+class POOL32F_ARITH_FM_MMR6<string instr_asm, bits<2> fmt, bits<8> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10} = 0;
+ let Inst{9-8} = fmt;
+ let Inst{7-0} = funct;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | fd (15-11) |
+// fmt (10-9) | funct (8-0).
+class POOL32F_ARITHF_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = funct;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | 0 (15) |
+// fmt (14-13) | funct (12-6) | 0b111011 (5-0).
+class POOL32F_MOV_NEG_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14-13} = fmt;
+ let Inst{12-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | fd (15-11) |
+// fmt (10-9) | funct (8-0).
+class POOL32F_MINMAX_FM<string instr_asm, bits<2> fmt, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = funct;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | fd (15-11) |
+// Cond.Value (10-6) | format (5-0).
+class POOL32F_CMP_FM<string instr_asm, bits<6> format, FIELD_CMP_COND Cond>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-6} = Cond.Value;
+ let Inst{5-0} = format;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | 0 (15) |
+// fmt (14) | funct (13-6) | 0b111011 (5-0).
+class POOL32F_CVT_LW_FM<string instr_asm, bit fmt, bits<8> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14} = fmt;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | 0 (15) |
+// fmt (14-13) | funct (12-6) | 0b111011 (5-0).
+class POOL32F_CVT_DS_FM<string instr_asm, bits<2> fmt, bits<7> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14-13} = fmt;
+ let Inst{12-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | 0 (15) |
+// fmt (14-13) | funct (12-6) | 0b111011 (5-0).
+class POOL32F_ABS_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14-13} = fmt;
+ let Inst{12-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | 0 (15) |
+// fmt (14) | funct (13-6) | 0b111011 (5-0).
+class POOL32F_MATH_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14} = fmt;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+// 16-bit format: 0b000001 (15-10) | rs (9-7) | rt (6-4) | rd (3-1) | 0 (0).
+class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 {
+ bits<3> rs;
+ bits<3> rt;
+ bits<3> rd;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b000001;
+ let Inst{9-7} = rs;
+ let Inst{6-4} = rt;
+ let Inst{3-1} = rd;
+ let Inst{0} = 0;
+}
+
+// 16-bit format: 0b010001 (15-10) | rt (9-7) | rs (6-4) | 0b0001 (3-0).
+class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 {
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b010001;
+ let Inst{9-7} = rt;
+ let Inst{6-4} = rs;
+ let Inst{3-0} = 0b0001;
+}
+
+// 16-bit format: 0x11 (15-10) | rt (9-7) | rs (6-4) | 0b0000 (3-0).
+class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-7} = rt;
+ let Inst{6-4} = rs;
+ let Inst{3-0} = 0b0000;
+}
+
+// 16-bit format: 0b010001 (15-10) | rt (9-7) | rs (6-4) | op (3-0).
+class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> : MicroMipsR6Inst16 {
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b010001;
+ let Inst{9-7} = rt;
+ let Inst{6-4} = rs;
+ let Inst{3-0} = op;
+}
+
+// 16-bit format: 0b010001 (15-10) | code_ (9-6) | op (5-0).
+class POOL16C_BREAKPOINT_FM_MMR6<bits<6> op> {
+ bits<4> code_;
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b010001;
+ let Inst{9-6} = code_;
+ let Inst{5-0} = op;
+}
+
+// 16-bit format: 0b000001 (15-10) | rs (9-7) | rt (6-4) | rd (3-1) | 1 (0).
+class POOL16A_SUBU16_FM_MMR6 {
+ bits<3> rs;
+ bits<3> rt;
+ bits<3> rd;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b000001;
+ let Inst{9-7} = rs;
+ let Inst{6-4} = rt;
+ let Inst{3-1} = rd;
+ let Inst{0} = 0b1;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rs (20-16) | funct (15-6) |
+// 0x3c (5-0).
+class POOL32A_WRPGPR_WSBH_FM_MMR6<bits<10> funct> : MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | 0 (15) |
+// fmt (14) | funct (13-6) | 0b111011 (5-0).
+class POOL32F_RECIP_ROUND_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14} = fmt;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+// 32-bit format: 0b010101 (31-26) | fs (25-21) | fd (20-16) | 0 (15-11) |
+// fmt (10-9) | 0b000100000 (8-0).
+class POOL32F_RINT_FM_MMR6<string instr_asm, bits<2> fmt>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = fs;
+ let Inst{20-16} = fd;
+ let Inst{15-11} = 0;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = 0b000100000;
+}
+
+// 32-bit format: 0b010101 (31-26) | ft (25-21) | fs (20-16) | fd (15-11) |
+// fmt (10-9) | funct (8-0).
+class POOL32F_SEL_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = funct;
+}
+
+// 32-bit format: 0b010101 (31-26) | fs (25-21) | fd (20-16) | 0 (15-11) |
+// fmt (10-9) | funct (8-0).
+class POOL32F_CLASS_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = fs;
+ let Inst{20-16} = fd;
+ let Inst{15-11} = 0b00000;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = funct;
+}
+
+// 32-bit format with no operands: 0 (31-26) | 0 (25-16) | funct (15-6) |
+// 0b111100 (5-0).
+class POOL32A_TLBINV_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = 0x0;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | rs (20-16) | 0 (15-14) |
+// sel (13-11) | funct (10-6) | opcode (5-0).
+class POOL32A_MFTC0_FM_MMR6<string instr_asm, bits<5> funct, bits<6> opcode>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<3> sel;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0;
+ let Inst{13-11} = sel;
+ let Inst{10-6} = funct;
+ let Inst{5-0} = opcode;
+}
+
+// 32-bit format: 0b010101 (31-26) | rt (25-21) | fs (20-16) | 0 (15-14) |
+// funct (13-6) | 0b111011 (5-0).
+class POOL32F_MFTC1_FM_MMR6<string instr_asm, bits<8> funct>
+ : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = fs;
+ let Inst{15-14} = 0;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+// 32-bit format: 0 (31-26) | rt (25-21) | impl (20-16) | funct (15-6) |
+// 0b111100 (5-0).
+class POOL32A_MFTC2_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> impl;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = impl;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+// 32-bit format: funct (31-26) | rt (25-21) | rs (20-16) | offset (15-0).
+class CMP_BRANCH_2R_OFF16_FM_MMR6<string opstr, bits<6> funct>
+ : MipsR6Inst, MMR6Arch<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: 0 (31-26) | 0 (25-21) | rs (20-16) | funct (15-6) |
+// 0b111100 (5-0).
+class POOL32A_DVPEVP_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = 0b00000;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+// 32-bit format: 0x8 (31-26) | rd (25-21) | base (20-16) | funct (15-12) |
+// offset (11-0). base/offset are addr{20-16} and addr{11-0}.
+class POOL32B_LWP_SWP_FM_MMR6<bits<4> funct> : MipsR6Inst {
+ bits<5> rd;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<12> offset = addr{11-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x8;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = base;
+ let Inst{15-12} = funct;
+ let Inst{11-0} = offset;
+}
+
+// 32-bit format: funct (31-26) | rs (25-21) | offset (20-0).
+// The opstr parameter is accepted but not referenced in this class.
+class CMP_BRANCH_OFF21_FM_MMR6<string opstr, bits<6> funct> : MipsR6Inst {
+ bits<5> rs;
+ bits<21> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = rs;
+ let Inst{20-0} = offset;
+}
+
+// 32-bit format: 0b010000 (31-26) | funct (25-21) | rt (20-16) |
+// offset (15-0).
+class POOL32I_BRANCH_COP_1_2_FM_MMR6<string instr_asm, bits<5> funct>
+ : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010000;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: funct (31-26) | ft (25-21) | base (20-16) | offset (15-0).
+// base/offset are addr{20-16} and addr{15-0}.
+class LDWC1_SDWC1_FM_MMR6<string instr_asm, bits<6> funct>
+ : MMR6Arch<instr_asm> {
+ bits<5> ft;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit format: 0b001000 (31-26) | rt (25-21) | base (20-16) |
+// funct (15-12) | 0 (11) | offset (10-0). base/offset are addr{20-16} and
+// addr{10-0}.
+class POOL32B_LDWC2_SDWC2_FM_MMR6<string instr_asm, bits<4> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<11> offset = addr{10-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b001000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = funct;
+ let Inst{11} = 0;
+ let Inst{10-0} = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
new file mode 100644
index 000000000000..fd04f80dd566
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -0,0 +1,1881 @@
+//=- MicroMips32r6InstrInfo.td - MicroMips r6 Instruction Information -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes microMIPSr6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+def brtarget21_mm : Operand<OtherVT> { // Branch-target operand marked PC-relative; encoding and decoding go through the hooks named below.
+ let EncoderMethod = "getBranchTarget21OpValueMM";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget21MM";
+ let ParserMatchClass = MipsJumpTargetAsmOperand; // Same asm-parser match class as the other branch-target operands in this file.
+}
+
+def brtarget26_mm : Operand<OtherVT> { // Branch-target operand marked PC-relative; used by BALC_MMR6_DESC and BC_MMR6_DESC.
+ let EncoderMethod = "getBranchTarget26OpValueMM";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget26MM";
+ let ParserMatchClass = MipsJumpTargetAsmOperand; // Same asm-parser match class as the other branch-target operands in this file.
+}
+
+def brtargetr6 : Operand<OtherVT> { // Branch-target operand marked PC-relative, with an MMR6-specific encoder hook.
+ let EncoderMethod = "getBranchTargetOpValueMMR6";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTargetMM"; // Note: the decoder hook name carries no R6 suffix, unlike the encoder.
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget_lsl2_mm : Operand<OtherVT> { // Branch-target operand marked PC-relative; used by the compact compare-and-branch descriptions in this file.
+ let EncoderMethod = "getBranchTargetOpValueLsl2MMR6";
+ let OperandType = "OPERAND_PCREL";
+ // Instructions that use this operand have their decoder method
+ // set with DecodeDisambiguates
+ let DecoderMethod = ""; // Deliberately empty: no per-operand decoder is named here.
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+class ADD_MMR6_ENC : ARITH_FM_MMR6<"add", 0x110>;
+class ADDIU_MMR6_ENC : ADDI_FM_MMR6<"addiu", 0xc>;
+class ADDU_MMR6_ENC : ARITH_FM_MMR6<"addu", 0x150>;
+class ADDIUPC_MMR6_ENC : PCREL19_FM_MMR6<0b00>;
+class ALUIPC_MMR6_ENC : PCREL16_FM_MMR6<0b11111>;
+class AND_MMR6_ENC : ARITH_FM_MMR6<"and", 0x250>;
+class ANDI_MMR6_ENC : ADDI_FM_MMR6<"andi", 0x34>;
+class AUIPC_MMR6_ENC : PCREL16_FM_MMR6<0b11110>;
+class ALIGN_MMR6_ENC : POOL32A_ALIGN_FM_MMR6<0b011111>;
+class AUI_MMR6_ENC : AUI_FM_MMR6;
+class BALC_MMR6_ENC : BRANCH_OFF26_FM<0b101101>;
+class BC_MMR6_ENC : BRANCH_OFF26_FM<0b100101>;
+class BC16_MMR6_ENC : BC16_FM_MM16R6;
+class BEQZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x23>;
+class BNEZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x2b>;
+class BITSWAP_MMR6_ENC : POOL32A_BITSWAP_FM_MMR6<0b101100>;
+class BRK_MMR6_ENC : BREAK_MMR6_ENC<"break">;
+class BEQZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<"beqzc", 0b100000>;
+class BNEZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<"bnezc", 0b101000>;
+class BGEC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bgec", 0b111101>,
+ DecodeDisambiguates<"POP75GroupBranchMMR6">;
+class BGEUC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bgeuc", 0b110000>,
+ DecodeDisambiguates<"BlezGroupBranchMMR6">;
+class BLTC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bltc", 0b110101>,
+ DecodeDisambiguates<"POP65GroupBranchMMR6">;
+class BLTUC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bltuc", 0b111000>,
+ DecodeDisambiguates<"BgtzGroupBranchMMR6">;
+class BEQC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"beqc", 0b011101>;
+class BNEC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bnec", 0b011111>;
+class BLTZC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bltzc", 0b110101>,
+ DecodeDisambiguates<"POP65GroupBranchMMR6">;
+class BLEZC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"blezc", 0b111101>,
+ DecodeDisambiguates<"POP75GroupBranchMMR6">;
+class BGEZC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bgezc", 0b111101>,
+ DecodeDisambiguates<"POP75GroupBranchMMR6">;
+class BGTZC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"bgtzc", 0b110101>,
+ DecodeDisambiguates<"POP65GroupBranchMMR6">;
+class BEQZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"beqzalc", 0b011101>,
+ DecodeDisambiguates<"POP35GroupBranchMMR6">;
+class BNEZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"bnezalc", 0b011111>,
+ DecodeDisambiguates<"POP37GroupBranchMMR6">;
+class BGTZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"bgtzalc", 0b111000>,
+ MMDecodeDisambiguatedBy<"BgtzGroupBranchMMR6">;
+class BLTZALC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bltzalc", 0b111000>,
+ MMDecodeDisambiguatedBy<"BgtzGroupBranchMMR6">;
+class BGEZALC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bgezalc", 0b110000>,
+ MMDecodeDisambiguatedBy<"BlezGroupBranchMMR6">;
+class BLEZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"blezalc", 0b110000>,
+ MMDecodeDisambiguatedBy<"BlezGroupBranchMMR6">;
+class CACHE_MMR6_ENC : CACHE_PREF_FM_MMR6<0b001000, 0b0110>;
+class CLO_MMR6_ENC : POOL32A_2R_FM_MMR6<0b0100101100>;
+class CLZ_MMR6_ENC : SPECIAL_2R_FM_MMR6<0b010000>;
+class DIV_MMR6_ENC : ARITH_FM_MMR6<"div", 0x118>;
+class DIVU_MMR6_ENC : ARITH_FM_MMR6<"divu", 0x198>;
+class EHB_MMR6_ENC : BARRIER_MMR6_ENC<"ehb", 0x3>;
+class EI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"ei", 0x15d>;
+class DI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"di", 0b0100011101>;
+class ERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0x3cd>;
+class DERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"deret", 0b1110001101>; // Name string matches DERET_MMR6_DESC ("deret"), not ERET's.
+class ERETNC_MMR6_ENC : ERETNC_FM_MMR6<"eretnc">;
+class JALRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0xb>;
+class JIALC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b100000>;
+class JIC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b101000>;
+class JRC16_MMR6_ENC: POOL16C_JALRC_FM_MM16R6<0x3>;
+class JRCADDIUSP_MMR6_ENC : POOL16C_JRCADDIUSP_FM_MM16R6<0x13>;
+class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>;
+class LWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x1>;
+class LWPC_MMR6_ENC : PCREL19_FM_MMR6<0b01>;
+class LWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0x2>;
+class MFC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfc0", 0b00011, 0b111100>;
+class MFC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfc1", 0b10000000>;
+class MFC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfc2", 0b0100110100>;
+class MFHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfhc0", 0b00011, 0b110100>;
+class MFHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfhc1", 0b11000000>;
+class MFHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfhc2", 0b1000110100>;
+class MOD_MMR6_ENC : ARITH_FM_MMR6<"mod", 0x158>;
+class MODU_MMR6_ENC : ARITH_FM_MMR6<"modu", 0x1d8>;
+class MUL_MMR6_ENC : ARITH_FM_MMR6<"mul", 0x18>;
+class MUH_MMR6_ENC : ARITH_FM_MMR6<"muh", 0x58>;
+class MULU_MMR6_ENC : ARITH_FM_MMR6<"mulu", 0x98>;
+class MUHU_MMR6_ENC : ARITH_FM_MMR6<"muhu", 0xd8>;
+class MTC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mtc0", 0b01011, 0b111100>;
+class MTC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mtc1", 0b10100000>;
+class MTC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mtc2", 0b0101110100>;
+class MTHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mthc0", 0b01011, 0b110100>;
+class MTHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mthc1", 0b11100000>;
+class MTHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mthc2", 0b1001110100>;
+class NOR_MMR6_ENC : ARITH_FM_MMR6<"nor", 0x2d0>;
+class OR_MMR6_ENC : ARITH_FM_MMR6<"or", 0x290>;
+class ORI_MMR6_ENC : ADDI_FM_MMR6<"ori", 0x14>;
+class PREF_MMR6_ENC : CACHE_PREF_FM_MMR6<0b011000, 0b0010>;
+class SB16_MMR6_ENC : LOAD_STORE_FM_MM16<0x22>;
+class SEB_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seb", 0b0010101100>;
+class SEH_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seh", 0b0011101100>;
+class SELEQZ_MMR6_ENC : POOL32A_FM_MMR6<0b0101000000>;
+class SELNEZ_MMR6_ENC : POOL32A_FM_MMR6<0b0110000000>;
+class SH16_MMR6_ENC : LOAD_STORE_FM_MM16<0x2a>;
+class SLL_MMR6_ENC : SHIFT_MMR6_ENC<"sll", 0x00, 0b0>;
+class SUB_MMR6_ENC : ARITH_FM_MMR6<"sub", 0x190>;
+class SUBU_MMR6_ENC : ARITH_FM_MMR6<"subu", 0x1d0>;
+class SW_MMR6_ENC : SW32_FM_MMR6<"sw", 0x3e>;
+class SWE_MMR6_ENC : POOL32C_SWE_FM_MMR6<"swe", 0x18, 0xa, 0x7>;
+class SW16_MMR6_ENC : LOAD_STORE_FM_MM16<0x3a>;
+class SWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0xa>;
+class SWSP_MMR6_ENC : LOAD_STORE_SP_FM_MM16<0x32>;
+class SWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x9>;
+class PREFE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b010>;
+class CACHEE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b011>;
+class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x3c5>;
+class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x1ec>;
+class LB_MMR6_ENC : LB32_FM_MMR6;
+class LBU_MMR6_ENC : LBU32_FM_MMR6;
+class LBE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b100>;
+class LBUE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b000>;
+class PAUSE_MMR6_ENC : POOL32A_PAUSE_FM_MMR6<"pause", 0b00101>;
+class RDHWR_MMR6_ENC : POOL32A_RDHWR_FM_MMR6;
+class WAIT_MMR6_ENC : WAIT_FM_MM, MMR6Arch<"wait">;
+class SSNOP_MMR6_ENC : BARRIER_FM_MM<0x1>, MMR6Arch<"ssnop">;
+class SYNC_MMR6_ENC : POOL32A_SYNC_FM_MMR6;
+class SYNCI_MMR6_ENC : POOL32I_SYNCI_FM_MMR6, MMR6Arch<"synci">;
+class RDPGPR_MMR6_ENC : POOL32A_RDPGPR_FM_MMR6<0b1110000101>;
+class SDBBP_MMR6_ENC : SDBBP_FM_MM, MMR6Arch<"sdbbp">;
+class XOR_MMR6_ENC : ARITH_FM_MMR6<"xor", 0x310>;
+class XORI_MMR6_ENC : ADDI_FM_MMR6<"xori", 0x1c>;
+class ABS_S_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.s", 0, 0b0001101>;
+class ABS_D_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.d", 1, 0b0001101>;
+class FLOOR_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.s", 0, 0b00001100>;
+class FLOOR_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.d", 1, 0b00001100>;
+class FLOOR_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.s", 0, 0b00101100>;
+class FLOOR_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.d", 1, 0b00101100>;
+class CEIL_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.l.s", 0, 0b01001100>;
+class CEIL_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.l.d", 1, 0b01001100>;
+class CEIL_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.s", 0, 0b01101100>;
+class CEIL_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.d", 1, 0b01101100>;
+class TRUNC_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.s", 0, 0b10001100>;
+class TRUNC_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.d", 1, 0b10001100>;
+class TRUNC_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.s", 0, 0b10101100>;
+class TRUNC_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.d", 1, 0b10101100>;
+class SQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.s", 0, 0b00101000>;
+class SQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.d", 1, 0b00101000>;
+class SB_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b000110>;
+class SBE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b100>;
+class SCE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b110>;
+class SH_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b001110>;
+class SHE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b101>;
+class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>;
+class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>;
+class LW_MMR6_ENC : LOAD_WORD_FM_MMR6;
+class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6;
+class JALRC_HB_MMR6_ENC : POOL32A_JALRC_FM_MMR6<"jalrc.hb", 0b0001111100>;
+class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>;
+class RINT_D_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.d", 1>;
+class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0,
+ 0b11001100>;
+class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1,
+ 0b11001100>;
+class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0,
+ 0b11101100>;
+class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1,
+ 0b11101100>;
+class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>;
+class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>;
+class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>;
+class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>;
+class SELNEZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selnez.s", 0, 0b001111000>;
+class SELNEZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selnez.d", 1, 0b001111000>;
+class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>;
+class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>;
+class EXT_MMR6_ENC : POOL32A_EXT_INS_FM_MMR6<"ext", 0b101100>;
+class INS_MMR6_ENC : POOL32A_EXT_INS_FM_MMR6<"ins", 0b001100>;
+class JALRC_MMR6_ENC : POOL32A_JALRC_FM_MMR6<"jalrc", 0b0000111100>;
+class BOVC_MMR6_ENC : POP35_BOVC_FM_MMR6<"bovc">;
+class BNVC_MMR6_ENC : POP37_BNVC_FM_MMR6<"bnvc">;
+class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6;
+class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6;
+class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>, MicroMipsR6Inst16;
+class NOT16_MMR6_ENC : POOL16C_NOT16_FM_MMR6;
+class OR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1001>;
+class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>, MicroMipsR6Inst16;
+class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16;
+class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>;
+class LI16_MMR6_ENC : LI_FM_MM16;
+class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>;
+class SDBBP16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b111011>;
+class SUBU16_MMR6_ENC : POOL16A_SUBU16_FM_MMR6;
+class XOR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1000>;
+class TLBINV_MMR6_ENC : POOL32A_TLBINV_FM_MMR6<"tlbinv", 0x10d>;
+class TLBINVF_MMR6_ENC : POOL32A_TLBINV_FM_MMR6<"tlbinvf", 0x14d>;
+class DVP_MMR6_ENC : POOL32A_DVPEVP_FM_MMR6<"dvp", 0b0001100101>;
+class EVP_MMR6_ENC : POOL32A_DVPEVP_FM_MMR6<"evp", 0b0011100101>;
+class BC1EQZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc1eqzc", 0b01000>;
+class BC1NEZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc1nezc", 0b01001>;
+class BC2EQZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc2eqzc", 0b01010>;
+class BC2NEZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc2nezc", 0b01011>;
+class LDC1_MMR6_ENC : LDWC1_SDWC1_FM_MMR6<"ldc1", 0b101111>;
+class SDC1_MMR6_ENC : LDWC1_SDWC1_FM_MMR6<"sdc1", 0b101110>;
+class LDC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"ldc2", 0b0010>;
+class SDC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"sdc2", 0b1010>;
+class LWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"lwc2", 0b0000>;
+class SWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"swc2", 0b1000>;
+
+/// Floating Point Instructions
+class FADD_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.s", 0, 0b00110000>;
+class FADD_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.d", 1, 0b00110000>;
+class FSUB_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.s", 0, 0b01110000>;
+class FSUB_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.d", 1, 0b01110000>;
+class FMUL_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.s", 0, 0b10110000>;
+class FMUL_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.d", 1, 0b10110000>;
+class FDIV_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.s", 0, 0b11110000>;
+class FDIV_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.d", 1, 0b11110000>;
+class MADDF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.s", 0, 0b110111000>;
+class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>;
+class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>;
+class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>;
+class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>;
+class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>;
+class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>;
+class FNEG_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.d", 1, 0b0101101>;
+class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>;
+class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>;
+class MAXA_S_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.s", 0, 0b000101011>;
+class MAXA_D_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.d", 1, 0b000101011>;
+class MIN_S_MMR6_ENC : POOL32F_MINMAX_FM<"min.s", 0, 0b000000011>;
+class MIN_D_MMR6_ENC : POOL32F_MINMAX_FM<"min.d", 1, 0b000000011>;
+class MINA_S_MMR6_ENC : POOL32F_MINMAX_FM<"mina.s", 0, 0b000100011>;
+class MINA_D_MMR6_ENC : POOL32F_MINMAX_FM<"mina.d", 1, 0b000100011>;
+
+class CVT_L_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.s", 0, 0b00000100>;
+class CVT_L_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.d", 1, 0b00000100>;
+class CVT_W_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.s", 0, 0b00100100>;
+class CVT_W_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.d", 1, 0b00100100>;
+class CVT_D_S_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.s", 0, 0b1001101>;
+class CVT_D_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.w", 1, 0b1001101>;
+class CVT_D_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.l", 2, 0b1001101>;
+class CVT_S_D_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.d", 0, 0b1101101>;
+class CVT_S_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.w", 1, 0b1101101>;
+class CVT_S_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.l", 2, 0b1101101>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class CMP_CBR_RT_Z_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd, // Description base for branches taking one register ($rt) and a target ($offset).
+ RegisterOperand GPROpnd>
+ : BRANCH_DESC_BASE {
+ dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+ dag OutOperandList = (outs); // No explicit register results.
+ string AsmString = !strconcat(instr_asm, "\t$rt, $offset");
+ list<Register> Defs = [AT]; // Implicit def of AT; the *ALC subclasses that follow declare Defs = [RA] instead.
+ InstrItinClass Itinerary = II_BCCZC;
+}
+
+class BEQZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"beqzalc", brtarget_mm,
+ GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BGEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bgezalc", brtarget_mm,
+ GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BGTZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bgtzalc", brtarget_mm,
+ GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BLEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"blezalc", brtarget_mm,
+ GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BLTZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bltzalc", brtarget_mm,
+ GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BNEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bnezalc", brtarget_mm,
+ GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BLTZC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bltzc", brtarget_lsl2_mm,
+ GPR32Opnd>;
+class BLEZC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"blezc", brtarget_lsl2_mm,
+ GPR32Opnd>;
+class BGEZC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bgezc", brtarget_lsl2_mm,
+ GPR32Opnd>;
+class BGTZC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bgtzc", brtarget_lsl2_mm,
+ GPR32Opnd>;
+
+class CMP_CBR_2R_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd, // Description base for branches taking two registers ($rs, $rt) and a target ($offset).
+ RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset);
+ dag OutOperandList = (outs); // No explicit register results.
+ string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $offset");
+ list<Register> Defs = [AT]; // Implicit def of AT.
+ InstrItinClass Itinerary = II_BCCC;
+}
+
+class BGEC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bgec", brtarget_lsl2_mm,
+ GPR32Opnd>;
+class BGEUC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bgeuc", brtarget_lsl2_mm,
+ GPR32Opnd>;
+class BLTC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bltc", brtarget_lsl2_mm,
+ GPR32Opnd>;
+class BLTUC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bltuc", brtarget_lsl2_mm,
+ GPR32Opnd>;
+class BEQC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"beqc", brtarget_lsl2_mm,
+ GPR32Opnd>;
+class BNEC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bnec", brtarget_lsl2_mm,
+ GPR32Opnd>;
+
+class ADD_MMR6_DESC : ArithLogicR<"add", GPR32Opnd, 1, II_ADD>;
+class ADDIU_MMR6_DESC : ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU, immSExt16, add>;
+class ADDU_MMR6_DESC : ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU>;
+class MUL_MMR6_DESC : ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>;
+class MUH_MMR6_DESC : ArithLogicR<"muh", GPR32Opnd, 1, II_MUH, mulhs>;
+class MULU_MMR6_DESC : ArithLogicR<"mulu", GPR32Opnd, 1, II_MULU>;
+class MUHU_MMR6_DESC : ArithLogicR<"muhu", GPR32Opnd, 1, II_MUHU, mulhu>;
+
+class BC_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd, InstrItinClass Itin> // Description base for branches whose only operand is the target; used by BALC_MMR6_DESC and BC_MMR6_DESC.
+ : BRANCH_DESC_BASE, MMR6Arch<instr_asm> {
+ dag InOperandList = (ins opnd:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(instr_asm, "\t$offset");
+ bit isBarrier = 1; // Marked as a barrier: control does not fall through.
+ InstrItinClass Itinerary = Itin;
+}
+
+class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm, II_BALC> { // "balc": the target-only branch base, additionally flagged as a call.
+ bit isCall = 1;
+ list<Register> Defs = [RA]; // Implicit def of RA.
+}
+class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm, II_BC>;
+
+class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), // 16-bit "bc16" description: a single branch-target operand and no results.
+ !strconcat("bc16", "\t$offset"), [],
+ II_BC, FrmI>,
+ MMR6Arch<"bc16">, MicroMipsR6Inst16 {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1; // Control does not fall through.
+ let hasDelaySlot = 0;
+ let AdditionalPredicates = [RelocPIC]; // NOTE(review): restricts this to the RelocPIC predicate - confirm that is intended for bc16.
+ let Defs = [AT]; // Implicit def of AT.
+}
+
+class BEQZC_BNEZC_MM16R6_DESC_BASE<string instr_asm> // Shared description for BEQZC16_MMR6_DESC and BNEZC16_MMR6_DESC, layered on CBranchZeroMM.
+ : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>, MMR6Arch<instr_asm> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 0; // Explicitly cleared here.
+ let Defs = [AT]; // Implicit def of AT.
+}
+class BEQZC16_MMR6_DESC : BEQZC_BNEZC_MM16R6_DESC_BASE<"beqzc16">;
+class BNEZC16_MMR6_DESC : BEQZC_BNEZC_MM16R6_DESC_BASE<"bnezc16">;
+
+class SUB_MMR6_DESC : ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>;
+class SUBU_MMR6_DESC : ArithLogicR<"subu", GPR32Opnd, 0,II_SUBU>;
+
+class BITSWAP_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> // Description base: one register result ($rd) computed from one register input ($rt).
+ : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = II_BITSWAP;
+}
+
+class BITSWAP_MMR6_DESC : BITSWAP_MMR6_DESC_BASE<"bitswap", GPR32Opnd>;
+
+class BRK_MMR6_DESC : BRK_FT<"break">;
+
+class CACHE_HINT_MMR6_DESC<string instr_asm, Operand MemOpnd, // Description base for CACHE_MMR6_DESC and PREF_MMR6_DESC: a memory operand plus a 5-bit hint.
+ RegisterOperand GPROpnd, InstrItinClass Itin> // GPROpnd is accepted but not referenced in this body.
+ : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
+ string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); // Printed hint-first, although $addr is the first input operand.
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ string DecoderMethod = "DecodeCacheOpMM";
+ InstrItinClass Itinerary = Itin;
+}
+
+class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, GPR32Opnd,
+ II_CACHE>;
+class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, GPR32Opnd,
+ II_PREF>;
+
+class PREFE_CACHEE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, // Same shape as CACHE_HINT_MMR6_DESC; used by PREFE_MMR6_DESC and CACHEE_MMR6_DESC.
+ RegisterOperand GPROpnd, InstrItinClass Itin>
+ : CACHE_HINT_MMR6_DESC<instr_asm, MemOpnd, GPROpnd, Itin> {
+ string DecoderMethod = "DecodePrefeOpMM"; // Names a different decoder hook than the parent's "DecodeCacheOpMM".
+}
+
+class PREFE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"prefe", mem_mm_9,
+ GPR32Opnd, II_PREFE>;
+class CACHEE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"cachee", mem_mm_9,
+ GPR32Opnd, II_CACHEE>;
+
+class LB_LBU_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, // Description base for the byte loads LB_MMR6_DESC and LBU_MMR6_DESC: $rt loaded from $addr.
+ RegisterOperand GPROpnd, InstrItinClass Itin>
+ : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins MemOpnd:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ string DecoderMethod = "DecodeLoadByte15"; // LBE_LBUE_MMR6_DESC_BASE replaces this with "DecodeLoadByte9".
+ bit mayLoad = 1;
+ InstrItinClass Itinerary = Itin;
+}
+class LB_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lb", mem_mm_16, GPR32Opnd, II_LB>;
+class LBU_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lbu", mem_mm_16, GPR32Opnd,
+ II_LBU>;
+
+class LBE_LBUE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, // Variant of LB_LBU_MMR6_DESC_BASE used by LBE_MMR6_DESC and LBUE_MMR6_DESC.
+ RegisterOperand GPROpnd, InstrItinClass Itin>
+ : LB_LBU_MMR6_DESC_BASE<instr_asm, MemOpnd, GPROpnd, Itin> {
+ let DecoderMethod = "DecodeLoadByte9"; // Overrides the parent's "DecodeLoadByte15".
+}
+class LBE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbe", mem_mm_9, GPR32Opnd,
+ II_LBE>;
+class LBUE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbue", mem_mm_9, GPR32Opnd,
+ II_LBUE>;
+
+class CLO_CLZ_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, // Description base for CLO_MMR6_DESC and CLZ_MMR6_DESC: result $rt from input $rs.
+ InstrItinClass Itin> : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins GPROpnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+ InstrItinClass Itinerary = Itin; // No Pattern field is declared in this class.
+}
+
+class CLO_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clo", GPR32Opnd, II_CLO>;
+class CLZ_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clz", GPR32Opnd, II_CLZ>;
+
+class EHB_MMR6_DESC : Barrier<"ehb", II_EHB>;
+class EI_MMR6_DESC : DEI_FT<"ei", GPR32Opnd, II_EI>;
+class DI_MMR6_DESC : DEI_FT<"di", GPR32Opnd, II_DI>;
+
+class ERET_MMR6_DESC : ER_FT<"eret", II_ERET>;
+class DERET_MMR6_DESC : ER_FT<"deret", II_DERET>;
+class ERETNC_MMR6_DESC : ER_FT<"eretnc", II_ERETNC>;
+
+class JALRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO> // 16-bit register-indirect call description; matches MipsJmpLink on $rs.
+ : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+ [(MipsJmpLink RO:$rs)], II_JALR, FrmR>,
+ MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ let isCall = 1;
+ let hasDelaySlot = 0; // Explicitly cleared here.
+ let Defs = [RA]; // Implicit def of RA.
+}
+class JALRC16_MMR6_DESC : JALRC16_MMR6_DESC_BASE<"jalr", GPR32Opnd>;
+
+class JMP_MMR6_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd, // Description base for JIALC_MMR6_DESC and JIC_MMR6_DESC: register $rt plus $offset.
+ RegisterOperand GPROpnd,
+ InstrItinClass Itin>
+ : MMR6Arch<opstr> {
+ dag InOperandList = (ins GPROpnd:$rt, opnd:$offset); // No OutOperandList is declared in this class.
+ string AsmString = !strconcat(opstr, "\t$rt, $offset");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ bit isTerminator = 1;
+ bit hasDelaySlot = 0;
+ InstrItinClass Itinerary = Itin;
+}
+
+class JIALC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jialc", calloffset16, // "jialc": the register+offset jump base, flagged as a call.
+ GPR32Opnd, II_JIALC> {
+ bit isCall = 1;
+ list<Register> Defs = [RA]; // Implicit def of RA.
+}
+
+class JIC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, // "jic": the register+offset jump base, flagged as a barrier rather than a call.
+ GPR32Opnd, II_JIC> {
+ bit isBarrier = 1; // Control does not fall through.
+ list<Register> Defs = [AT]; // Implicit def of AT.
+}
+
+class JRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO> // 16-bit register-indirect branch description; no selection pattern.
+ : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+ [], II_JR, FrmR>,
+ MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ let hasDelaySlot = 0;
+ let isBranch = 1;
+ let isIndirectBranch = 1; // Note: isTerminator/isBarrier are not set in this body, unlike JRCADDIUSP_MMR6_DESC.
+}
+class JRC16_MMR6_DESC : JRC16_MMR6_DESC_BASE<"jrc16", GPR32Opnd>;
+
+class JRCADDIUSP_MMR6_DESC // 16-bit "jrcaddiusp" description: a single uimm5_lsl2 immediate, no register operands, no pattern.
+ : MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jrcaddiusp\t$imm",
+ [], II_JRADDIUSP, FrmR>,
+ MMR6Arch<"jrcaddiusp">, MicroMipsR6Inst16 {
+ let hasDelaySlot = 0;
+ let isTerminator = 1;
+ let isBarrier = 1; // Control does not fall through.
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+}
+
+class ALIGN_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, // Description base for ALIGN_MMR6_DESC: $rd from $rs, $rt and an immediate $bp.
+ Operand ImmOpnd, InstrItinClass Itin>
+ : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = Itin;
+}
+
+class ALIGN_MMR6_DESC : ALIGN_MMR6_DESC_BASE<"align", GPR32Opnd, uimm2,
+ II_ALIGN>;
+
+class AUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, // Description base for AUI_MMR6_DESC: $rt from $rs and a uimm16 immediate.
+ InstrItinClass Itin> : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins GPROpnd:$rs, uimm16:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = Itin;
+}
+
+class AUI_MMR6_DESC : AUI_MMR6_DESC_BASE<"aui", GPR32Opnd, II_AUI>;
+
+class SEB_MMR6_DESC : SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>;
+class SEH_MMR6_DESC : SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>;
+class ALUIPC_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, // Description base for ALUIPC_MMR6_DESC and AUIPC_MMR6_DESC: $rt from a simm16 immediate.
+ InstrItinClass Itin> : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins simm16:$imm); // The immediate is the only explicit input.
+ string AsmString = !strconcat(instr_asm, "\t$rt, $imm");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = Itin;
+}
+
+class ALUIPC_MMR6_DESC : ALUIPC_MMR6_DESC_BASE<"aluipc", GPR32Opnd, II_ALUIPC>;
+class AUIPC_MMR6_DESC : ALUIPC_MMR6_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>;
+
+class LSA_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, // Description base for LSA_MMR6_DESC: $rd from $rs, $rt and an immediate $imm2.
+ Operand ImmOpnd, InstrItinClass Itin>
+ : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$imm2);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $rd, $imm2"); // NOTE(review): prints $rt first and the result $rd third; sibling classes print the result first - confirm this order is intended.
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = Itin;
+}
+
+class LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1, II_LSA>;
+
+class PCREL_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, // Description base for ADDIUPC_MMR6_DESC and LWPC_MMR6_DESC: $rt from a single immediate.
+ Operand ImmOpnd, InstrItinClass Itin>
+ : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins ImmOpnd:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $imm");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = Itin; // Note: mayLoad is not set here, even for the LWPC instantiation.
+}
+
+class ADDIUPC_MMR6_DESC : PCREL_MMR6_DESC_BASE<"addiupc", GPR32Opnd,
+ simm19_lsl2, II_ADDIUPC>;
+class LWPC_MMR6_DESC: PCREL_MMR6_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2,
+ II_LWPC>;
+
+class LWP_MMR6_DESC : MMR6Arch<"lwp"> { // "lwp" description: loads a register pair ($rd) from a mem_simm12 address.
+ dag OutOperandList = (outs regpair:$rd);
+ dag InOperandList = (ins mem_simm12:$addr);
+ string AsmString = !strconcat("lwp", "\t$rd, $addr");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = II_LWP;
+ ComplexPattern Addr = addr;
+ Format f = FrmI;
+ string BaseOpcode = "lwp";
+ string DecoderMethod = "DecodeMemMMImm12"; // Same decoder hook as SWP_MMR6_DESC.
+ bit mayLoad = 1;
+}
+
+class SWP_MMR6_DESC : MMR6Arch<"swp"> { // "swp" description: stores a register pair ($rd) to a mem_simm12 address.
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins regpair:$rd, mem_simm12:$addr); // The pair is an input here; it is the output in LWP_MMR6_DESC.
+ string AsmString = !strconcat("swp", "\t$rd, $addr");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = II_SWP;
+ ComplexPattern Addr = addr;
+ Format f = FrmI;
+ string BaseOpcode = "swp";
+ string DecoderMethod = "DecodeMemMMImm12"; // Same decoder hook as LWP_MMR6_DESC.
+ bit mayStore = 1;
+}
+
+class SELEQNE_Z_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, // Description base for SELEQZ_MMR6_DESC and SELNEZ_MMR6_DESC: $rd from $rs and $rt.
+ InstrItinClass Itin> : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = Itin;
+}
+
+class SELEQZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"seleqz", GPR32Opnd,
+ II_SELCCZ>;
+class SELNEZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"selnez", GPR32Opnd,
+ II_SELCCZ>;
+class PAUSE_MMR6_DESC : Barrier<"pause", II_PAUSE>;
+class RDHWR_MMR6_DESC : MMR6Arch<"rdhwr">, MipsR6Inst { // "rdhwr" description: GPR result $rt from a hardware-register operand $rs and a 3-bit $sel.
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins HWRegsOpnd:$rs, uimm3:$sel);
+ string AsmString = !strconcat("rdhwr", "\t$rt, $rs, $sel");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ InstrItinClass Itinerary = II_RDHWR;
+ Format Form = FrmR; // Note: this field is named Form; other descriptions in this file name it f.
+}
+
+class WAIT_MMR6_DESC : WaitMM<"wait">;
+// FIXME: ssnop should not be defined for R6. Per MD000582 microMIPS32 6.03:
+// Assemblers targeting specifically Release 6 should reject the SSNOP
+// instruction with an error.
+class SSNOP_MMR6_DESC : Barrier<"ssnop", II_SSNOP>;
+class SLL_MMR6_DESC : shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>;
+
+class DIVMOD_MMR6_DESC_BASE<string opstr, RegisterOperand GPROpnd, // Description base for DIV/DIVU/MOD/MODU: $rd = OpNode($rs, $rt).
+ InstrItinClass Itin,
+ SDPatternOperator OpNode=null_frag> // Defaults to null_frag; the four users pass sdiv, udiv, srem and urem.
+ : MipsR6Inst {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ string AsmString = !strconcat(opstr, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set GPROpnd:$rd, (OpNode GPROpnd:$rs, GPROpnd:$rt))];
+ string BaseOpcode = opstr;
+ Format f = FrmR;
+ let isCommutable = 0; // Operand order matters for division and remainder.
+ let isReMaterializable = 1;
+ InstrItinClass Itinerary = Itin;
+
+ // This instruction doesn't trap division by zero itself. We must insert
+ // teq instructions as well.
+ bit usesCustomInserter = 1;
+}
+class DIV_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"div", GPR32Opnd, II_DIV, sdiv>;
+class DIVU_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"divu", GPR32Opnd, II_DIVU, udiv>;
+class MOD_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"mod", GPR32Opnd, II_MOD, srem>;
+class MODU_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"modu", GPR32Opnd, II_MODU, urem>;
+class AND_MMR6_DESC : ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>;
+class ANDI_MMR6_DESC : ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI>;
+class NOR_MMR6_DESC : LogicNOR<"nor", GPR32Opnd>;
+class OR_MMR6_DESC : ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>;
+class ORI_MMR6_DESC : ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
+ or> {
+ int AddedComplexity = 1;
+}
+class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>;
+class XORI_MMR6_DESC : ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
+ immZExt16, xor>;
+
+class SWE_MMR6_DESC_BASE<string opstr, DAGOperand RO, DAGOperand MO, // Store description base used by SWE_MMR6_DESC: stores $rt to $addr.
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator OpNode = null_frag, // Defaults to null_frag; SWE_MMR6_DESC leaves it at the default.
+ ComplexPattern Addr = addr> :
+ InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
+ let DecoderMethod = "DecodeMem";
+ let mayStore = 1;
+}
+class SW_MMR6_DESC : Store<"sw", GPR32Opnd> { // "sw" description built on the generic Store class, with the itinerary set here.
+ InstrItinClass Itinerary = II_SW;
+}
+class SWE_MMR6_DESC : SWE_MMR6_DESC_BASE<"swe", GPR32Opnd, mem_simm9, II_SWE>;
+
+class WRPGPR_WSBH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO, // Description base for WRPGPR_MMR6_DESC and WSBH_MMR6_DESC: $rt from $rs, same register class.
+ InstrItinClass Itin> : MMR6Arch<instr_asm> {
+ dag InOperandList = (ins RO:$rs);
+ dag OutOperandList = (outs RO:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+ list<dag> Pattern = []; // No selection pattern is attached here.
+ Format f = FrmR;
+ string BaseOpcode = instr_asm;
+ bit hasSideEffects = 0; // Explicitly declared side-effect free for both instantiations.
+ InstrItinClass Itinerary = Itin;
+}
+// `wrpgpr` and `wsbh` both operate on GPR32Opnd and differ only in mnemonic
+// and itinerary.
+class WRPGPR_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wrpgpr", GPR32Opnd,
+ II_WRPGPR>;
+class WSBH_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wsbh", GPR32Opnd, II_WSBH>;
+
+// Descriptor base for moves into a coprocessor register selected by a 3-bit
+// $sel field. Inputs: SrcRC:$rt and uimm3:$sel; output: DstRC:$rs. Printed as
+// "<mnemonic>\t$rt, $rs, $sel". No selection pattern is attached.
+class MTC0_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin> {
+ dag OutOperandList = (outs DstRC:$rs);
+ dag InOperandList = (ins SrcRC:$rt, uimm3:$sel);
+ string AsmString = !strconcat(opstr, "\t$rt, $rs, $sel");
+ string BaseOpcode = opstr;
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = Itin;
+ Format f = FrmFR;
+}
+// Descriptor base for a single-source move: SrcRC:$rt in, DstRC:$fs out,
+// printed as "<mnemonic>\t$rt, $fs". The pattern sets $fs to OpNode applied
+// to $rt; OpNode defaults to null_frag and Itin to NoItinerary.
+class MTC1_MMR6_DESC_BASE<
+ string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin = NoItinerary, SDPatternOperator OpNode = null_frag>
+ : MipsR6Inst {
+ dag OutOperandList = (outs DstRC:$fs);
+ dag InOperandList = (ins SrcRC:$rt);
+ string AsmString = !strconcat(opstr, "\t$rt, $fs");
+ string BaseOpcode = opstr;
+ list<dag> Pattern = [(set DstRC:$fs, (OpNode SrcRC:$rt))];
+ InstrItinClass Itinerary = Itin;
+ Format f = FrmFR;
+}
+// Descriptor base for moves that update part of a wider destination: the
+// destination register appears both as the output ($fs) and as the extra
+// input ($fs_in), and the two are tied together by a constraint. Printed as
+// "<mnemonic>\t$rt, $fs". No selection pattern is attached.
+class MTC1_64_MMR6_DESC_BASE<
+ string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin = NoItinerary> : MipsR6Inst {
+ dag OutOperandList = (outs DstRC:$fs);
+ dag InOperandList = (ins DstRC:$fs_in, SrcRC:$rt);
+ string AsmString = !strconcat(opstr, "\t$rt, $fs");
+ string BaseOpcode = opstr;
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = Itin;
+ Format f = FrmFR;
+ // $fs_in is part of a white lie to work around a widespread bug in the FPU
+ // implementation. See expandBuildPairF64 for details.
+ let Constraints = "$fs = $fs_in";
+}
+// Descriptor base for moves into a register named $impl: SrcRC:$rt in,
+// DstRC:$impl out, printed as "<mnemonic>\t$rt, $impl". No selection pattern.
+class MTC2_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin> {
+ dag OutOperandList = (outs DstRC:$impl);
+ dag InOperandList = (ins SrcRC:$rt);
+ string AsmString = !strconcat(opstr, "\t$rt, $impl");
+ string BaseOpcode = opstr;
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = Itin;
+ Format f = FrmFR;
+}
+
+// Concrete move-to-coprocessor descriptors sourced from GPR32Opnd.
+// MTC1 additionally matches `bitconvert` and requires HARDFLOAT; the
+// others carry no selection pattern.
+class MTC0_MMR6_DESC : MTC0_MMR6_DESC_BASE<"mtc0", COP0Opnd, GPR32Opnd,
+ II_MTC0>;
+class MTC1_MMR6_DESC : MTC1_MMR6_DESC_BASE<"mtc1", FGR32Opnd, GPR32Opnd,
+ II_MTC1, bitconvert>, HARDFLOAT;
+class MTC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mtc2", COP2Opnd, GPR32Opnd,
+ II_MTC2>;
+class MTHC0_MMR6_DESC : MTC0_MMR6_DESC_BASE<"mthc0", COP0Opnd, GPR32Opnd,
+ II_MTHC0>;
+// `mthc1` descriptors for the two FPU register models: AFGR64Opnd under
+// FGR_32 and FGR64Opnd under FGR_64. Both require HARDFLOAT.
+// They use the dedicated II_MTHC1 itinerary rather than II_MTC1, in line with
+// mfhc1 (II_MFHC1) and mthc0 (II_MTHC0), which each have their own itinerary.
+class MTHC1_D32_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", AFGR64Opnd,
+ GPR32Opnd, II_MTHC1>,
+ HARDFLOAT, FGR_32;
+class MTHC1_D64_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", FGR64Opnd,
+ GPR32Opnd, II_MTHC1>,
+ HARDFLOAT, FGR_64;
+// `mthc2` shares the MTC2 descriptor base and the II_MTC2 itinerary.
+class MTHC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mthc2", COP2Opnd, GPR32Opnd,
+ II_MTC2>;
+
+// Descriptor base for moves out of a coprocessor register selected by a 3-bit
+// $sel field. Inputs: SrcRC:$rs and uimm3:$sel; output: DstRC:$rt. Printed as
+// "<mnemonic>\t$rt, $rs, $sel". No selection pattern is attached.
+class MFC0_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin> {
+ dag OutOperandList = (outs DstRC:$rt);
+ dag InOperandList = (ins SrcRC:$rs, uimm3:$sel);
+ string AsmString = !strconcat(opstr, "\t$rt, $rs, $sel");
+ string BaseOpcode = opstr;
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = Itin;
+ Format f = FrmFR;
+}
+// Descriptor base for a single-source move: SrcRC:$fs in, DstRC:$rt out,
+// printed as "<mnemonic>\t$rt, $fs". The pattern sets $rt to OpNode applied
+// to $fs; OpNode defaults to null_frag and Itin to NoItinerary.
+class MFC1_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC,
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator OpNode = null_frag> : MipsR6Inst {
+ dag OutOperandList = (outs DstRC:$rt);
+ dag InOperandList = (ins SrcRC:$fs);
+ string AsmString = !strconcat(opstr, "\t$rt, $fs");
+ string BaseOpcode = opstr;
+ list<dag> Pattern = [(set DstRC:$rt, (OpNode SrcRC:$fs))];
+ InstrItinClass Itinerary = Itin;
+ Format f = FrmFR;
+}
+// Descriptor base for moves out of a register named $impl: SrcRC:$impl in,
+// DstRC:$rt out, printed as "<mnemonic>\t$rt, $impl". No selection pattern.
+class MFC2_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin> {
+ dag OutOperandList = (outs DstRC:$rt);
+ dag InOperandList = (ins SrcRC:$impl);
+ string AsmString = !strconcat(opstr, "\t$rt, $impl");
+ string BaseOpcode = opstr;
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = Itin;
+ Format f = FrmFR;
+}
+// Concrete move-from-coprocessor descriptors, all producing a GPR32Opnd.
+// MFC1 matches `bitconvert`; the mfhc1 variants exist once per FPU register
+// model (AFGR64Opnd under FGR_32, FGR64Opnd under FGR_64). The FPU-related
+// descriptors require HARDFLOAT.
+class MFC0_MMR6_DESC : MFC0_MMR6_DESC_BASE<"mfc0", GPR32Opnd, COP0Opnd,
+ II_MFC0>;
+class MFC1_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfc1", GPR32Opnd, FGR32Opnd,
+ II_MFC1, bitconvert>, HARDFLOAT;
+class MFC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfc2", GPR32Opnd, COP2Opnd,
+ II_MFC2>;
+class MFHC0_MMR6_DESC : MFC0_MMR6_DESC_BASE<"mfhc0", GPR32Opnd, COP0Opnd,
+ II_MFHC0>;
+class MFHC1_D32_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, AFGR64Opnd,
+ II_MFHC1>, HARDFLOAT, FGR_32;
+class MFHC1_D64_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, FGR64Opnd,
+ II_MFHC1>, HARDFLOAT, FGR_64;
+// `mfhc2` shares the II_MFC2 itinerary with `mfc2`.
+class MFHC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfhc2", GPR32Opnd, COP2Opnd,
+ II_MFC2>;
+
+// `ldc1` for the FGR_64 register model (HARDFLOAT only): loads an FGR64Opnd
+// ($ft) from a mem_mm_16 operand. The selection pattern matches a plain
+// `load` through addrimm16.
+class LDC1_D64_MMR6_DESC : MipsR6Inst, HARDFLOAT, FGR_64 {
+ dag OutOperandList = (outs FGR64Opnd:$ft);
+ dag InOperandList = (ins mem_mm_16:$addr);
+ string AsmString = !strconcat("ldc1", "\t$ft, $addr");
+ string BaseOpcode = "ldc1";
+ list<dag> Pattern = [(set FGR64Opnd:$ft, (load addrimm16:$addr))];
+ InstrItinClass Itinerary = II_LDC1;
+ Format f = FrmFI;
+ bit mayLoad = 1;
+ let DecoderMethod = "DecodeFMemMMR2";
+}
+
+// `sdc1` for the FGR_64 register model (HARDFLOAT only): stores an FGR64Opnd
+// ($ft) to a mem_mm_16 operand. The selection pattern matches a plain
+// `store` through addrimm16.
+class SDC1_D64_MMR6_DESC : MipsR6Inst, HARDFLOAT, FGR_64 {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins FGR64Opnd:$ft, mem_mm_16:$addr);
+ string AsmString = !strconcat("sdc1", "\t$ft, $addr");
+ string BaseOpcode = "sdc1";
+ list<dag> Pattern = [(store FGR64Opnd:$ft, addrimm16:$addr)];
+ InstrItinClass Itinerary = II_SDC1;
+ Format f = FrmFI;
+ bit mayStore = 1;
+ let DecoderMethod = "DecodeFMemMMR2";
+}
+
+// Shared descriptor for loads into a COP2Opnd register ($rt) from a
+// mem_mm_11 operand. The pattern matches `load` through addrimm11 and the
+// instruction is decoded by DecodeFMemCop2MMR6.
+class LDC2_LWC2_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
+ dag InOperandList = (ins mem_mm_11:$addr);
+ dag OutOperandList = (outs COP2Opnd:$rt);
+ string BaseOpcode = opstr;
+ string AsmString = !strconcat(opstr, "\t$rt, $addr");
+ list<dag> Pattern = [(set COP2Opnd:$rt, (load addrimm11:$addr))];
+ InstrItinClass Itinerary = itin;
+ Format f = FrmFI;
+ string DecoderMethod = "DecodeFMemCop2MMR6";
+ bit mayLoad = 1;
+}
+// The two users differ only in mnemonic and itinerary.
+class LDC2_MMR6_DESC : LDC2_LWC2_MMR6_DESC_BASE<"ldc2", II_LDC2>;
+class LWC2_MMR6_DESC : LDC2_LWC2_MMR6_DESC_BASE<"lwc2", II_LWC2>;
+
+// Shared descriptor for stores of a COP2Opnd register ($rt) to a mem_mm_11
+// operand. The pattern matches `store` through addrimm11 and the instruction
+// is decoded by DecodeFMemCop2MMR6.
+class SDC2_SWC2_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
+ dag InOperandList = (ins COP2Opnd:$rt, mem_mm_11:$addr);
+ dag OutOperandList = (outs);
+ string BaseOpcode = opstr;
+ string AsmString = !strconcat(opstr, "\t$rt, $addr");
+ list<dag> Pattern = [(store COP2Opnd:$rt, addrimm11:$addr)];
+ InstrItinClass Itinerary = itin;
+ Format f = FrmFI;
+ string DecoderMethod = "DecodeFMemCop2MMR6";
+ bit mayStore = 1;
+}
+// The two users differ only in mnemonic and itinerary.
+class SDC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"sdc2", II_SDC2>;
+class SWC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"swc2", II_SWC2>;
+
+/// Floating Point Instructions
+// Three-register floating-point arithmetic descriptor (HARDFLOAT only).
+//   instr_asm - mnemonic.
+//   RC        - register operand class used for $fd, $fs and $ft.
+//   Itin      - scheduling itinerary.
+//   isComm    - copied into isCommutable.
+//   OpNode    - pattern operator applied as (OpNode $fs, $ft).
+// Note the input list declares $ft before $fs, while both the assembly string
+// and the pattern use the order $fs, $ft.
+class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC,
+ InstrItinClass Itin, bit isComm,
+ SDPatternOperator OpNode = null_frag> : HARDFLOAT {
+ dag InOperandList = (ins RC:$ft, RC:$fs);
+ dag OutOperandList = (outs RC:$fd);
+ string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+ list<dag> Pattern = [(set RC:$fd, (OpNode RC:$fs, RC:$ft))];
+ bit isCommutable = isComm;
+ InstrItinClass Itinerary = Itin;
+}
+// add/sub/mul/div descriptors in single (.s, FGR32Opnd) and double
+// (.d, AFGR64Opnd) form. The fourth argument is the commutativity bit:
+// 1 for add and mul, 0 for sub and div.
+class FADD_S_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>;
+class FADD_D_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>;
+class FSUB_S_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>;
+class FSUB_D_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>;
+class FMUL_S_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>;
+class FMUL_D_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>;
+class FDIV_S_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>;
+class FDIV_D_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>;
+// maddf/msubf reuse COP1_4R_DESC_BASE; the .d forms use FGR64Opnd here
+// (not AFGR64Opnd as the arithmetic descriptors above do).
+class MADDF_S_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd,
+ II_MADDF_S>, HARDFLOAT;
+class MADDF_D_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd,
+ II_MADDF_D>, HARDFLOAT;
+class MSUBF_S_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd,
+ II_MSUBF_S>, HARDFLOAT;
+class MSUBF_D_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd,
+ II_MSUBF_D>, HARDFLOAT;
+
+// Unary floating-point descriptor (HARDFLOAT, NeverHasSideEffects):
+// SrcRC:$fs in, DstRC:$ft out, printed as "<mnemonic>\t$ft, $fs". The pattern
+// sets $ft to OpNode applied to $fs; OpNode defaults to null_frag.
+class FMOV_FNEG_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag>
+ : HARDFLOAT, NeverHasSideEffects {
+ dag InOperandList = (ins SrcRC:$fs);
+ dag OutOperandList = (outs DstRC:$ft);
+ list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))];
+ string AsmString = !strconcat(instr_asm, "\t$ft, $fs");
+ Format Form = FrmFR;
+ InstrItinClass Itinerary = Itin;
+}
+// mov.{s,d} carry no selection pattern (OpNode left at its default);
+// neg.{s,d} match `fneg`. Source and destination use the same operand class.
+class FMOV_S_MMR6_DESC
+ : FMOV_FNEG_MMR6_DESC_BASE<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>;
+class FMOV_D_MMR6_DESC
+ : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>;
+class FNEG_S_MMR6_DESC
+ : FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>;
+class FNEG_D_MMR6_DESC
+ : FMOV_FNEG_MMR6_DESC_BASE<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>;
+
+// max/min descriptors, all HARDFLOAT and all built on MAX_MIN_DESC_BASE.
+// .s forms use FGR32Opnd, .d forms use FGR64Opnd.
+class MAX_S_MMR6_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd, II_MAX_S>,
+ HARDFLOAT;
+class MAX_D_MMR6_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd, II_MAX_D>,
+ HARDFLOAT;
+class MIN_S_MMR6_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd, II_MIN_S>,
+ HARDFLOAT;
+class MIN_D_MMR6_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd, II_MIN_D>,
+ HARDFLOAT;
+
+// maxa/mina variants of the same family.
+class MAXA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd, II_MAXA_S>,
+ HARDFLOAT;
+class MAXA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd, II_MAXA_D>,
+ HARDFLOAT;
+class MINA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd, II_MINA_S>,
+ HARDFLOAT;
+class MINA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd, II_MINA_D>,
+ HARDFLOAT;
+
+// Conversion descriptor base (HARDFLOAT, NeverHasSideEffects).
+// Template argument order is DstRC first, then SrcRC: the output is
+// DstRC:$ft and the input is SrcRC:$fs, printed as "<mnemonic>\t$ft, $fs".
+// OpNode defaults to null_frag.
+class CVT_MMR6_DESC_BASE<
+ string instr_asm, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin, SDPatternOperator OpNode = null_frag>
+ : HARDFLOAT, NeverHasSideEffects {
+ dag InOperandList = (ins SrcRC:$fs);
+ dag OutOperandList = (outs DstRC:$ft);
+ list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))];
+ string AsmString = !strconcat(instr_asm, "\t$ft, $fs");
+ Format Form = FrmFR;
+ InstrItinClass Itinerary = Itin;
+}
+
+// cvt.<dst>.<src> descriptors. CVT_MMR6_DESC_BASE takes the DESTINATION
+// operand class first and the SOURCE class second, so for cvt.X.Y the first
+// class must fit format X and the second format Y:
+//   s, w -> FGR32Opnd    d -> AFGR64Opnd    l -> FGR64Opnd
+// cvt.d.s, cvt.d.w, cvt.s.d and cvt.s.l previously had the two classes
+// swapped (e.g. cvt.d.s declared a 32-bit destination and a 64-bit source).
+class CVT_L_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.s", FGR64Opnd, FGR32Opnd,
+ II_CVT>;
+class CVT_L_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.d", FGR64Opnd, FGR64Opnd,
+ II_CVT>;
+class CVT_W_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.s", FGR32Opnd, FGR32Opnd,
+ II_CVT>;
+class CVT_W_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.d", FGR32Opnd, AFGR64Opnd,
+ II_CVT>;
+class CVT_D_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.s", AFGR64Opnd, FGR32Opnd,
+ II_CVT>;
+class CVT_D_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.w", AFGR64Opnd, FGR32Opnd,
+ II_CVT>;
+class CVT_D_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.l", FGR64Opnd, FGR64Opnd,
+ II_CVT>, FGR_64;
+class CVT_S_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.d", FGR32Opnd, AFGR64Opnd,
+ II_CVT>;
+class CVT_S_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.w", FGR32Opnd, FGR32Opnd,
+ II_CVT>;
+class CVT_S_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.l", FGR32Opnd, FGR64Opnd,
+ II_CVT>, FGR_64;
+
+// Defines the sixteen cmp.<cond>.<Typestr> instructions for one
+// floating-point format.
+//   format  - 6-bit value forwarded to POOL32F_CMP_FM.
+//   Typestr - format suffix appended to the mnemonic.
+//   FGROpnd - register operand class forwarded to CMP_CONDN_DESC_BASE.
+//   Itin    - scheduling itinerary shared by all sixteen definitions.
+// Every definition is HARDFLOAT and ISA_MICROMIPS32R6. Only un, eq, ueq, lt,
+// ult, le and ule pass a condition code (setuo, setoeq, setueq, setolt,
+// setult, setole, setule) to CMP_CONDN_DESC_BASE; af and all of the
+// s-prefixed conditions pass none.
+multiclass CMP_CC_MMR6<bits<6> format, string Typestr,
+ RegisterOperand FGROpnd, InstrItinClass Itin> {
+ def CMP_AF_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.af.", Typestr), format, FIELD_CMP_COND_AF>,
+ CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd, Itin>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_UN_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.un.", Typestr), format, FIELD_CMP_COND_UN>,
+ CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, Itin, setuo>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_EQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.eq.", Typestr), format, FIELD_CMP_COND_EQ>,
+ CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, Itin, setoeq>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_UEQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.ueq.", Typestr), format, FIELD_CMP_COND_UEQ>,
+ CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, Itin, setueq>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_LT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.lt.", Typestr), format, FIELD_CMP_COND_LT>,
+ CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, Itin, setolt>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_ULT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.ult.", Typestr), format, FIELD_CMP_COND_ULT>,
+ CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, Itin, setult>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_LE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.le.", Typestr), format, FIELD_CMP_COND_LE>,
+ CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, Itin, setole>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_ULE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.ule.", Typestr), format, FIELD_CMP_COND_ULE>,
+ CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, Itin, setule>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_SAF_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.saf.", Typestr), format, FIELD_CMP_COND_SAF>,
+ CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd, Itin>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_SUN_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.sun.", Typestr), format, FIELD_CMP_COND_SUN>,
+ CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd, Itin>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_SEQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.seq.", Typestr), format, FIELD_CMP_COND_SEQ>,
+ CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd, Itin>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_SUEQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.sueq.", Typestr), format, FIELD_CMP_COND_SUEQ>,
+ CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd, Itin>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_SLT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.slt.", Typestr), format, FIELD_CMP_COND_SLT>,
+ CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd, Itin>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_SULT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.sult.", Typestr), format, FIELD_CMP_COND_SULT>,
+ CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd, Itin>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_SLE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.sle.", Typestr), format, FIELD_CMP_COND_SLE>,
+ CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd, Itin>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+ def CMP_SULE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+ !strconcat("cmp.sule.", Typestr), format, FIELD_CMP_COND_SULE>,
+ CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd, Itin>, HARDFLOAT,
+ ISA_MICROMIPS32R6;
+}
+
+// Unary floating-point descriptor (HARDFLOAT, NeverHasSideEffects) used by
+// abs, sqrt and the floor/ceil/trunc/round families. Template argument order
+// is DstRC first, then SrcRC: output DstRC:$ft, input SrcRC:$fs, printed as
+// "<mnemonic>\t$ft, $fs". OpNode defaults to null_frag. The encoding is
+// additionally guarded by the HasStdEnc predicate.
+class ABSS_FT_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag>
+ : HARDFLOAT, NeverHasSideEffects {
+ dag InOperandList = (ins SrcRC:$fs);
+ dag OutOperandList = (outs DstRC:$ft);
+ list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))];
+ string AsmString = !strconcat(instr_asm, "\t$ft, $fs");
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+ Format Form = FrmFR;
+ InstrItinClass Itinerary = Itin;
+}
+
+// abs / floor / ceil / trunc / sqrt descriptors. Arguments to
+// ABSS_FT_MMR6_DESC_BASE are (mnemonic, destination class, source class,
+// itinerary[, pattern operator]). In this group the .l results use FGR64Opnd,
+// the .w results use FGR32Opnd, .s sources use FGR32Opnd, and .d sources use
+// AFGR64Opnd for the .w.d forms but FGR64Opnd for the .l.d forms.
+// Only abs (fabs) and sqrt (fsqrt) attach a selection pattern.
+class ABS_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.s", FGR32Opnd, FGR32Opnd,
+ II_ABS, fabs>;
+class ABS_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.d", AFGR64Opnd, AFGR64Opnd,
+ II_ABS, fabs>;
+class FLOOR_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.s", FGR64Opnd,
+ FGR32Opnd, II_FLOOR>;
+class FLOOR_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.d", FGR64Opnd,
+ FGR64Opnd, II_FLOOR>;
+class FLOOR_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.s", FGR32Opnd,
+ FGR32Opnd, II_FLOOR>;
+class FLOOR_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.d", FGR32Opnd,
+ AFGR64Opnd, II_FLOOR>;
+class CEIL_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.s", FGR64Opnd,
+ FGR32Opnd, II_CEIL>;
+class CEIL_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.d", FGR64Opnd,
+ FGR64Opnd, II_CEIL>;
+class CEIL_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.s", FGR32Opnd,
+ FGR32Opnd, II_CEIL>;
+class CEIL_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.d", FGR32Opnd,
+ AFGR64Opnd, II_CEIL>;
+class TRUNC_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.s", FGR64Opnd,
+ FGR32Opnd, II_TRUNC>;
+class TRUNC_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.d", FGR64Opnd,
+ FGR64Opnd, II_TRUNC>;
+class TRUNC_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.s", FGR32Opnd,
+ FGR32Opnd, II_TRUNC>;
+class TRUNC_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.d", FGR32Opnd,
+ AFGR64Opnd, II_TRUNC>;
+class SQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.s", FGR32Opnd, FGR32Opnd,
+ II_SQRT_S, fsqrt>;
+class SQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.d", AFGR64Opnd, AFGR64Opnd,
+ II_SQRT_D, fsqrt>;
+// round.<dst>.<src> descriptors; arguments are (mnemonic, destination class,
+// source class, itinerary). round.w.d produces a 32-bit word from a double,
+// so it uses FGR32Opnd / AFGR64Opnd exactly like floor.w.d, ceil.w.d and
+// trunc.w.d (it previously declared FGR64Opnd for both operands).
+class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd,
+ FGR32Opnd, II_ROUND>;
+class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd,
+ FGR64Opnd, II_ROUND>;
+class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd,
+ FGR32Opnd, II_ROUND>;
+class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR32Opnd,
+ AFGR64Opnd, II_ROUND>;
+
+// sel.{s,d} descriptors built on COP1_SEL_DESC_BASE. Only the .d form
+// requests a custom inserter.
+class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd, II_SEL_S>;
+class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd, II_SEL_D> {
+ // We must insert a SUBREG_TO_REG around $fd_in
+ bit usesCustomInserter = 1;
+}
+
+// seleqz/selnez descriptors built on SELEQNEZ_DESC_BASE. Both conditions
+// share one itinerary per format (II_SELCCZ_S for .s, II_SELCCZ_D for .d).
+class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd,
+ II_SELCCZ_S>;
+class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd,
+ II_SELCCZ_D>;
+class SELNEZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd,
+ II_SELCCZ_S>;
+class SELNEZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd,
+ II_SELCCZ_D>;
+// rint/class descriptors built on CLASS_RINT_DESC_BASE. Each format gets its
+// own itinerary: the .d forms use the _D itineraries (they previously reused
+// II_RINT_S / II_CLASS_S, which gave double-precision operations the
+// single-precision scheduling class).
+class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd,
+ II_RINT_S>;
+class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd,
+ II_RINT_D>;
+class CLASS_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd,
+ II_CLASS_S>;
+class CLASS_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd,
+ II_CLASS_D>;
+
+// Store descriptor layered on the generic Store class: it adds the MMR6Arch
+// tag, the scheduling itinerary, and the DecodeMemMMImm16 decoder.
+class STORE_MMR6_DESC_BASE<string opstr, DAGOperand RO,
+ InstrItinClass Itin>
+ : Store<opstr, RO>, MMR6Arch<opstr> {
+ InstrItinClass Itinerary = Itin;
+ let DecoderMethod = "DecodeMemMMImm16";
+}
+// `sb` on GPR32Opnd.
+class SB_MMR6_DESC : STORE_MMR6_DESC_BASE<"sb", GPR32Opnd, II_SB>;
+
+// Store descriptor taking a mem_simm9 memory operand: inputs are RO:$rt and
+// $addr, there are no outputs, and decoding goes through DecodeStoreEvaOpMM.
+// No Pattern field is set in this class.
+class STORE_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
+ InstrItinClass Itin>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ dag InOperandList = (ins RO:$rt, mem_simm9:$addr);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ InstrItinClass Itinerary = Itin;
+ bit mayStore = 1;
+ string DecoderMethod = "DecodeStoreEvaOpMM";
+}
+// sbe / sce / she use the mem_simm9 store base; sh uses the 16-bit-immediate
+// store base. All operate on GPR32Opnd.
+class SBE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sbe", GPR32Opnd, II_SBE>;
+class SCE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sce", GPR32Opnd, II_SCE>;
+class SH_MMR6_DESC : STORE_MMR6_DESC_BASE<"sh", GPR32Opnd, II_SH>;
+class SHE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"she", GPR32Opnd, II_SHE>;
+// Load descriptor taking a mem_simm9 memory operand: output is RO:$rt, the
+// only input is $addr, and decoding goes through DecodeMemMMImm9.
+// No Pattern field is set in this class.
+class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
+ InstrItinClass Itin>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ dag InOperandList = (ins mem_simm9:$addr);
+ dag OutOperandList = (outs RO:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ InstrItinClass Itinerary = Itin;
+ bit mayLoad = 1;
+ string DecoderMethod = "DecodeMemMMImm9";
+}
+// The two users differ only in mnemonic and itinerary.
+class LLE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lle", GPR32Opnd, II_LLE>;
+class LWE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lwe", GPR32Opnd, II_LWE>;
+// Descriptors for the *16 mnemonics, built on the shared MM16 classes and
+// tagged with MMR6Arch. The register-register ALU forms (addu16, and16,
+// not16, or16, subu16, xor16) set AddedComplexity to 1.
+// NOTE(review): only break16, li16, move16, sdbbp16 and subu16 list
+// MicroMipsR6Inst16 explicitly; presumably the others obtain what they need
+// from their base classes - verify against those definitions.
+class ADDU16_MMR6_DESC : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
+ MMR6Arch<"addu16"> {
+ int AddedComplexity = 1;
+}
+class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
+ MMR6Arch<"and16"> {
+ int AddedComplexity = 1;
+}
+class ANDI16_MMR6_DESC : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>,
+ MMR6Arch<"andi16">;
+class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16"> {
+ int AddedComplexity = 1;
+}
+class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>,
+ MMR6Arch<"or16"> {
+ int AddedComplexity = 1;
+}
+// Shifts take a uimm3_shift amount.
+class SLL16_MMR6_DESC : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
+ MMR6Arch<"sll16">;
+class SRL16_MMR6_DESC : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
+ MMR6Arch<"srl16">;
+class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16", II_BREAK>, MMR6Arch<"break16">,
+ MicroMipsR6Inst16;
+// li16 is additionally marked IsAsCheapAsAMove.
+class LI16_MMR6_DESC : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>,
+ MMR6Arch<"li16">, MicroMipsR6Inst16, IsAsCheapAsAMove;
+// move16 operates on the full GPR32Opnd class rather than GPRMM16Opnd.
+class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">,
+ MicroMipsR6Inst16;
+class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, MMR6Arch<"sdbbp16">,
+ MicroMipsR6Inst16;
+class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
+ MMR6Arch<"subu16">, MicroMipsR6Inst16 {
+ int AddedComplexity = 1;
+}
+class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
+ MMR6Arch<"xor16"> {
+ int AddedComplexity = 1;
+}
+
+// `lw`: loads a GPR32Opnd ($rt) from a `mem` operand. The selection pattern
+// matches `load` through addrDefault; the instruction is marked as a load
+// that can be folded, and is decoded by DecodeMemMMImm16.
+class LW_MMR6_DESC : MMR6Arch<"lw">, MipsR6Inst {
+ dag InOperandList = (ins mem:$addr);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ string AsmString = "lw\t$rt, $addr";
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (load addrDefault:$addr))];
+ InstrItinClass Itinerary = II_LW;
+ let mayLoad = 1;
+ let canFoldAsLoad = 1;
+ let DecoderMethod = "DecodeMemMMImm16";
+}
+
+// `lui`: produces a GPR32Opnd ($rt) from a uimm16 immediate. It carries no
+// selection pattern, has no side effects, is rematerializable, and is marked
+// IsAsCheapAsAMove.
+class LUI_MMR6_DESC : IsAsCheapAsAMove, MMR6Arch<"lui">, MipsR6Inst{
+ dag InOperandList = (ins uimm16:$imm16);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ string AsmString = "lui\t$rt, $imm16";
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = II_LUI;
+ Format Form = FrmI;
+ bit isReMaterializable = 1;
+ bit hasSideEffects = 0;
+}
+
+// `sync`: takes a uimm5 $stype, produces nothing, and matches the MipsSync
+// node with an immZExt5 operand.
+class SYNC_MMR6_DESC : MMR6Arch<"sync">, MipsR6Inst {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins uimm5:$stype);
+ string AsmString = !strconcat("sync", "\t$stype");
+ list<dag> Pattern = [(MipsSync immZExt5:$stype)];
+ InstrItinClass Itinerary = II_SYNC;
+ // The flag is spelled `hasSideEffects`, matching the other descriptors in
+ // this file. The previous capitalised `HasSideEffects` only introduced an
+ // unrelated field, leaving the real flag unset.
+ bit hasSideEffects = 1;
+}
+
+// `synci` reuses SYNCI_FT and only overrides the decoder with DecodeSynciR6.
+class SYNCI_MMR6_DESC : SYNCI_FT<"synci"> {
+ let DecoderMethod = "DecodeSynciR6";
+}
+
+// `rdpgpr`: one GPR32Opnd input ($rd) and one GPR32Opnd output ($rt),
+// printed as "rdpgpr\t$rt, $rd". No Pattern field is set in this class.
+class RDPGPR_MMR6_DESC : MMR6Arch<"rdpgpr">, MipsR6Inst {
+ dag InOperandList = (ins GPR32Opnd:$rd);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ InstrItinClass Itinerary = II_RDPGPR;
+ string AsmString = !strconcat("rdpgpr", "\t$rt, $rd");
+}
+
+// `sdbbp`: takes a single uimm20 operand ($code_), produces nothing, and has
+// no selection pattern.
+class SDBBP_MMR6_DESC : MipsR6Inst {
+ dag InOperandList = (ins uimm20:$code_);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat("sdbbp", "\t$code_");
+ InstrItinClass Itinerary = II_SDBBP;
+ list<dag> Pattern = [];
+}
+
+// `lwm16`: loads a reglist16 register list ($rt) from a mem_mm_4sp operand.
+// No selection pattern; decoded by DecodeMemMMReglistImm4Lsl2.
+class LWM16_MMR6_DESC
+ : MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
+ !strconcat("lwm16", "\t$rt, $addr"), [],
+ II_LWM, FrmI>,
+ MMR6Arch<"lwm16">, MicroMipsR6Inst16 {
+ ComplexPattern Addr = addr;
+ let mayLoad = 1;
+ let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+}
+
+// `swm16`: stores a reglist16 register list ($rt) to a mem_mm_4sp operand.
+// No selection pattern; decoded by DecodeMemMMReglistImm4Lsl2.
+class SWM16_MMR6_DESC
+ : MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr),
+ !strconcat("swm16", "\t$rt, $addr"), [],
+ II_SWM, FrmI>,
+ MMR6Arch<"swm16">, MicroMipsR6Inst16 {
+ ComplexPattern Addr = addr;
+ let mayStore = 1;
+ let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+}
+
+// Shared descriptor for the 16-bit stores sb16 / sh16 / sw16: stores
+// RTOpnd:$rt to MemOpnd:$addr with an empty pattern list, decoded by
+// DecodeMemMMImm4.
+// NOTE(review): the RO and OpNode template arguments are accepted but not
+// referenced anywhere in this class - confirm whether a selection pattern was
+// meant to be built from them.
+class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd, DAGOperand RO,
+ SDPatternOperator OpNode, InstrItinClass Itin,
+ Operand MemOpnd>
+ : MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI>,
+ MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ let mayStore = 1;
+ let DecoderMethod = "DecodeMemMMImm4";
+}
+// The three users differ in mnemonic, store operator, itinerary and memory
+// operand class (mem_mm_4, mem_mm_4_lsl1, mem_mm_4_lsl2).
+class SB16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sb16", GPRMM16OpndZero, GPRMM16Opnd,
+ truncstorei8, II_SB, mem_mm_4>;
+class SH16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sh16", GPRMM16OpndZero, GPRMM16Opnd,
+ truncstorei16, II_SH, mem_mm_4_lsl1>;
+class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd,
+ store, II_SW, mem_mm_4_lsl2>;
+
+// 16-bit `sw` form whose memory operand is mem_mm_sp_imm5_lsl2 ($offset).
+// Stores GPR32Opnd:$rt; no selection pattern; decoded by
+// DecodeMemMMSPImm5Lsl2.
+class SWSP_MMR6_DESC
+ : MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset),
+ !strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>,
+ MMR6Arch<"sw">, MicroMipsR6Inst16 {
+ let mayStore = 1;
+ let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
+}
+
+// `jalrc.hb`: GPR32Opnd:$rs in, GPR32Opnd:$rt out, printed as
+// "jalrc.hb\t$rt, $rs". Marked as an indirect branch without a delay slot;
+// no selection pattern.
+class JALRC_HB_MMR6_DESC {
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ string AsmString = !strconcat("jalrc.hb", "\t$rt, $rs");
+ list<dag> Pattern = [];
+ Format Form = FrmJ;
+ InstrItinClass Itinerary = II_JALR_HB;
+ bit hasDelaySlot = 0;
+ bit isIndirectBranch = 1;
+}
+
+// Operand-less descriptor: no inputs, no outputs, no pattern; the assembly
+// string is just the mnemonic.
+class TLBINV_MMR6_DESC_BASE<string opstr, InstrItinClass Itin> {
+ dag InOperandList = (ins);
+ dag OutOperandList = (outs);
+ string AsmString = opstr;
+ InstrItinClass Itinerary = Itin;
+ list<dag> Pattern = [];
+}
+
+class TLBINV_MMR6_DESC : TLBINV_MMR6_DESC_BASE<"tlbinv", II_TLBINV>;
+class TLBINVF_MMR6_DESC : TLBINV_MMR6_DESC_BASE<"tlbinvf", II_TLBINVF>;
+
+// Shared descriptor for `dvp` / `evp`: no inputs, a single GPR32Opnd output
+// ($rs), printed as "<mnemonic>\t$rs", no selection pattern.
+class DVPEVP_MMR6_DESC_BASE<string opstr, InstrItinClass Itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rs);
+ dag InOperandList = (ins);
+ string AsmString = !strconcat(opstr, "\t$rs");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = Itin;
+ // The flag is spelled `hasSideEffects`, matching the other descriptors in
+ // this file. The previous `hasUnModeledSideEffects` only introduced an
+ // unrelated field, so these pattern-less instructions were not actually
+ // marked as having side effects.
+ bit hasSideEffects = 1;
+}
+
+class DVP_MMR6_DESC : DVPEVP_MMR6_DESC_BASE<"dvp", II_DVP>;
+class EVP_MMR6_DESC : DVPEVP_MMR6_DESC_BASE<"evp", II_EVP>;
+
+// beqzc / bnezc: compare-with-zero branches on GPR32Opnd using the
+// brtarget21_mm branch-target operand, tagged with MMR6Arch.
+class BEQZC_MMR6_DESC
+ : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21_mm, GPR32Opnd>,
+ MMR6Arch<"beqzc">;
+class BNEZC_MMR6_DESC
+ : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21_mm, GPR32Opnd>,
+ MMR6Arch<"bnezc">;
+
+// Branch descriptor testing an FGR64Opnd register ($rt) against a
+// brtarget_mm target ($offset). HARDFLOAT only; uses the II_BC1CCZ itinerary,
+// carries no selection pattern, and lists AT in Defs.
+class BRANCH_COP1_MMR6_DESC_BASE<string opstr> :
+ InstSE<(outs), (ins FGR64Opnd:$rt, brtarget_mm:$offset),
+ !strconcat(opstr, "\t$rt, $offset"), [], II_BC1CCZ, FrmI>,
+ HARDFLOAT, BRANCH_DESC_BASE {
+ list<Register> Defs = [AT];
+}
+
+// Concrete users: one per mnemonic.
+class BC1EQZC_MMR6_DESC : BRANCH_COP1_MMR6_DESC_BASE<"bc1eqzc">;
+class BC1NEZC_MMR6_DESC : BRANCH_COP1_MMR6_DESC_BASE<"bc1nezc">;
+
+// Branch descriptor testing a COP2Opnd register ($rt) against a brtarget_mm
+// target ($offset). No outputs; lists AT in Defs.
+class BRANCH_COP2_MMR6_DESC_BASE<string opstr, InstrItinClass Itin>
+ : BRANCH_DESC_BASE {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins COP2Opnd:$rt, brtarget_mm:$offset);
+ string AsmString = !strconcat(opstr, "\t$rt, $offset");
+ InstrItinClass Itinerary = Itin;
+ list<Register> Defs = [AT];
+}
+
+// Both mnemonics share the II_BC2CCZ itinerary.
+class BC2EQZC_MMR6_DESC : BRANCH_COP2_MMR6_DESC_BASE<"bc2eqzc", II_BC2CCZ>;
+class BC2NEZC_MMR6_DESC : BRANCH_COP2_MMR6_DESC_BASE<"bc2nezc", II_BC2CCZ>;
+
+// `ext`: inputs are GPR32Opnd:$rs, uimm5:$pos and uimm5_plus1:$size; the
+// output is GPR32Opnd:$rt. The selection pattern matches the MipsExt node
+// with immediate position and size.
+class EXT_MMR6_DESC {
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$pos, uimm5_plus1:$size);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ string BaseOpcode = "ext";
+ string AsmString = !strconcat("ext", "\t$rt, $rs, $pos, $size");
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (MipsExt GPR32Opnd:$rs, imm:$pos,
+ imm:$size))];
+ Format Form = FrmR;
+ InstrItinClass Itinerary = II_EXT;
+}
+
+// `ins`: inputs are GPR32Opnd:$rs, uimm5:$pos, uimm5_inssize_plus1:$size and
+// the previous destination value GPR32Opnd:$src, which is tied to the output
+// $rt by a constraint. $src does not appear in the assembly string. The
+// selection pattern matches the MipsIns node.
+class INS_MMR6_DESC {
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$pos, uimm5_inssize_plus1:$size,
+ GPR32Opnd:$src);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ string BaseOpcode = "ins";
+ string AsmString = !strconcat("ins", "\t$rt, $rs, $pos, $size");
+ string Constraints = "$src = $rt";
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (MipsIns GPR32Opnd:$rs, imm:$pos,
+ imm:$size, GPR32Opnd:$src))];
+ Format Form = FrmR;
+ InstrItinClass Itinerary = II_INS;
+}
+
+// `jalrc`: GPR32Opnd:$rs in, GPR32Opnd:$rt out, printed as
+// "jalrc\t$rt, $rs". Marked as a call without a delay slot that defines RA;
+// no selection pattern.
+class JALRC_MMR6_DESC {
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ string AsmString = !strconcat("jalrc", "\t$rt, $rs");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = II_JALRC;
+ list<Register> Defs = [RA];
+ bit hasDelaySlot = 0;
+ bit isCall = 1;
+}
+
+// Two-register branch descriptor: inputs are GPROpnd:$rt, GPROpnd:$rs and the
+// branch target opnd:$offset; no outputs. Uses the II_BCCC itinerary and
+// lists AT in Defs.
+class BOVC_BNVC_MMR6_DESC_BASE<string instr_asm, Operand opnd,
+ RegisterOperand GPROpnd>
+ : BRANCH_DESC_BASE {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPROpnd:$rt, GPROpnd:$rs, opnd:$offset);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $offset");
+ InstrItinClass Itinerary = II_BCCC;
+ list<Register> Defs = [AT];
+}
+
+// Both users take a brtargetr6 target and GPR32Opnd registers.
+class BOVC_MMR6_DESC : BOVC_BNVC_MMR6_DESC_BASE<"bovc", brtargetr6, GPR32Opnd>;
+class BNVC_MMR6_DESC : BOVC_BNVC_MMR6_DESC_BASE<"bnvc", brtargetr6, GPR32Opnd>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let DecoderNamespace = "MicroMipsR6" in {
+def ADD_MMR6 : StdMMR6Rel, ADD_MMR6_DESC, ADD_MMR6_ENC, ISA_MICROMIPS32R6;
+def ADDIU_MMR6 : StdMMR6Rel, ADDIU_MMR6_DESC, ADDIU_MMR6_ENC, ISA_MICROMIPS32R6;
+def ADDU_MMR6 : StdMMR6Rel, ADDU_MMR6_DESC, ADDU_MMR6_ENC, ISA_MICROMIPS32R6;
+def ADDIUPC_MMR6 : R6MMR6Rel, ADDIUPC_MMR6_ENC, ADDIUPC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def ALUIPC_MMR6 : R6MMR6Rel, ALUIPC_MMR6_ENC, ALUIPC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def AND_MMR6 : StdMMR6Rel, AND_MMR6_DESC, AND_MMR6_ENC, ISA_MICROMIPS32R6;
+def ANDI_MMR6 : StdMMR6Rel, ANDI_MMR6_DESC, ANDI_MMR6_ENC, ISA_MICROMIPS32R6;
+def AUIPC_MMR6 : R6MMR6Rel, AUIPC_MMR6_ENC, AUIPC_MMR6_DESC, ISA_MICROMIPS32R6;
+def ALIGN_MMR6 : R6MMR6Rel, ALIGN_MMR6_ENC, ALIGN_MMR6_DESC, ISA_MICROMIPS32R6;
+def AUI_MMR6 : R6MMR6Rel, AUI_MMR6_ENC, AUI_MMR6_DESC, ISA_MICROMIPS32R6;
+def BALC_MMR6 : R6MMR6Rel, BALC_MMR6_ENC, BALC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BC_MMR6 : R6MMR6Rel, BC_MMR6_ENC, BC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BC16_MMR6 : StdMMR6Rel, BC16_MMR6_DESC, BC16_MMR6_ENC, ISA_MICROMIPS32R6;
+def BEQZC_MMR6 : R6MMR6Rel, BEQZC_MMR6_ENC, BEQZC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BEQZC16_MMR6 : StdMMR6Rel, BEQZC16_MMR6_DESC, BEQZC16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def BNEZC_MMR6 : R6MMR6Rel, BNEZC_MMR6_ENC, BNEZC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BNEZC16_MMR6 : StdMMR6Rel, BNEZC16_MMR6_DESC, BNEZC16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def BITSWAP_MMR6 : R6MMR6Rel, BITSWAP_MMR6_ENC, BITSWAP_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BEQZALC_MMR6 : R6MMR6Rel, BEQZALC_MMR6_ENC, BEQZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BNEZALC_MMR6 : R6MMR6Rel, BNEZALC_MMR6_ENC, BNEZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BREAK_MMR6 : StdMMR6Rel, BRK_MMR6_DESC, BRK_MMR6_ENC, ISA_MICROMIPS32R6;
+def CACHE_MMR6 : R6MMR6Rel, CACHE_MMR6_ENC, CACHE_MMR6_DESC, ISA_MICROMIPS32R6;
+def CLO_MMR6 : R6MMR6Rel, CLO_MMR6_ENC, CLO_MMR6_DESC, ISA_MICROMIPS32R6;
+def CLZ_MMR6 : R6MMR6Rel, CLZ_MMR6_ENC, CLZ_MMR6_DESC, ISA_MICROMIPS32R6;
+def DIV_MMR6 : R6MMR6Rel, DIV_MMR6_DESC, DIV_MMR6_ENC, ISA_MICROMIPS32R6;
+def DIVU_MMR6 : R6MMR6Rel, DIVU_MMR6_DESC, DIVU_MMR6_ENC, ISA_MICROMIPS32R6;
+def EHB_MMR6 : StdMMR6Rel, EHB_MMR6_DESC, EHB_MMR6_ENC, ISA_MICROMIPS32R6;
+def EI_MMR6 : StdMMR6Rel, EI_MMR6_DESC, EI_MMR6_ENC, ISA_MICROMIPS32R6;
+def DI_MMR6 : StdMMR6Rel, DI_MMR6_DESC, DI_MMR6_ENC, ISA_MICROMIPS32R6;
+def ERET_MMR6 : StdMMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6;
+def DERET_MMR6 : StdMMR6Rel, DERET_MMR6_DESC, DERET_MMR6_ENC, ISA_MICROMIPS32R6;
+def ERETNC_MMR6 : R6MMR6Rel, ERETNC_MMR6_DESC, ERETNC_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def JALRC16_MMR6 : R6MMR6Rel, JALRC16_MMR6_DESC, JALRC16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def JIALC_MMR6 : R6MMR6Rel, JIALC_MMR6_ENC, JIALC_MMR6_DESC, ISA_MICROMIPS32R6;
+def JIC_MMR6 : R6MMR6Rel, JIC_MMR6_ENC, JIC_MMR6_DESC, ISA_MICROMIPS32R6;
+def JRC16_MMR6 : R6MMR6Rel, JRC16_MMR6_DESC, JRC16_MMR6_ENC, ISA_MICROMIPS32R6;
+def JRCADDIUSP_MMR6 : R6MMR6Rel, JRCADDIUSP_MMR6_DESC, JRCADDIUSP_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWP_MMR6 : StdMMR6Rel, LWP_MMR6_ENC, LWP_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWPC_MMR6 : R6MMR6Rel, LWPC_MMR6_ENC, LWPC_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWM16_MMR6 : StdMMR6Rel, LWM16_MMR6_DESC, LWM16_MMR6_ENC, ISA_MICROMIPS32R6;
+def MTC0_MMR6 : StdMMR6Rel, MTC0_MMR6_ENC, MTC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MTC1_MMR6 : StdMMR6Rel, MTC1_MMR6_DESC, MTC1_MMR6_ENC, ISA_MICROMIPS32R6;
+def MTC2_MMR6 : StdMMR6Rel, MTC2_MMR6_ENC, MTC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def MTHC0_MMR6 : R6MMR6Rel, MTHC0_MMR6_ENC, MTHC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MTHC1_D32_MMR6 : StdMMR6Rel, MTHC1_D32_MMR6_DESC, MTHC1_MMR6_ENC, ISA_MICROMIPS32R6;
+let DecoderNamespace = "MicroMips32r6FP64" in { // scoped override: applies only to the def inside the braces
+ def MTHC1_D64_MMR6 : R6MMR6Rel, MTHC1_D64_MMR6_DESC, MTHC1_MMR6_ENC, // reuses MTHC1_MMR6_ENC from MTHC1_D32_MMR6; presumably the namespace keeps the two apart when decoding - confirm
+ ISA_MICROMIPS32R6;
+}
+def MTHC2_MMR6 : StdMMR6Rel, MTHC2_MMR6_ENC, MTHC2_MMR6_DESC, ISA_MICROMIPS32R6; // MTHC2/MFC*/MFHC* records: *Rel + *_ENC + *_DESC + ISA_MICROMIPS32R6
+def MFC0_MMR6 : StdMMR6Rel, MFC0_MMR6_ENC, MFC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFC1_MMR6 : StdMMR6Rel, MFC1_MMR6_DESC, MFC1_MMR6_ENC, ISA_MICROMIPS32R6;
+def MFC2_MMR6 : StdMMR6Rel, MFC2_MMR6_ENC, MFC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFHC0_MMR6 : R6MMR6Rel, MFHC0_MMR6_ENC, MFHC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFHC1_D32_MMR6 : StdMMR6Rel, MFHC1_D32_MMR6_DESC, MFHC1_MMR6_ENC, // same MFHC1_MMR6_ENC as MFHC1_D64_MMR6
+ ISA_MICROMIPS32R6;
+let DecoderNamespace = "MicroMips32r6FP64" in { // scoped override: applies only to the def inside the braces
+ def MFHC1_D64_MMR6 : StdMMR6Rel, MFHC1_D64_MMR6_DESC, MFHC1_MMR6_ENC, // reuses MFHC1_MMR6_ENC from MFHC1_D32_MMR6
+ ISA_MICROMIPS32R6;
+}
+def MFHC2_MMR6 : StdMMR6Rel, MFHC2_MMR6_ENC, MFHC2_MMR6_DESC, ISA_MICROMIPS32R6; // run of plain records: *Rel + *_ENC/*_DESC (either order) + ISA_MICROMIPS32R6
+def MOD_MMR6 : R6MMR6Rel, MOD_MMR6_DESC, MOD_MMR6_ENC, ISA_MICROMIPS32R6;
+def MODU_MMR6 : R6MMR6Rel, MODU_MMR6_DESC, MODU_MMR6_ENC, ISA_MICROMIPS32R6;
+def MUL_MMR6 : R6MMR6Rel, MUL_MMR6_DESC, MUL_MMR6_ENC, ISA_MICROMIPS32R6;
+def MUH_MMR6 : R6MMR6Rel, MUH_MMR6_DESC, MUH_MMR6_ENC, ISA_MICROMIPS32R6;
+def MULU_MMR6 : R6MMR6Rel, MULU_MMR6_DESC, MULU_MMR6_ENC, ISA_MICROMIPS32R6;
+def MUHU_MMR6 : R6MMR6Rel, MUHU_MMR6_DESC, MUHU_MMR6_ENC, ISA_MICROMIPS32R6;
+def NOR_MMR6 : StdMMR6Rel, NOR_MMR6_DESC, NOR_MMR6_ENC, ISA_MICROMIPS32R6;
+def OR_MMR6 : StdMMR6Rel, OR_MMR6_DESC, OR_MMR6_ENC, ISA_MICROMIPS32R6;
+def ORI_MMR6 : StdMMR6Rel, ORI_MMR6_DESC, ORI_MMR6_ENC, ISA_MICROMIPS32R6;
+def PREF_MMR6 : R6MMR6Rel, PREF_MMR6_ENC, PREF_MMR6_DESC, ISA_MICROMIPS32R6;
+def SB16_MMR6 : StdMMR6Rel, SB16_MMR6_DESC, SB16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SEB_MMR6 : StdMMR6Rel, SEB_MMR6_DESC, SEB_MMR6_ENC, ISA_MICROMIPS32R6;
+def SEH_MMR6 : StdMMR6Rel, SEH_MMR6_DESC, SEH_MMR6_ENC, ISA_MICROMIPS32R6;
+def SELEQZ_MMR6 : R6MMR6Rel, SELEQZ_MMR6_ENC, SELEQZ_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SELNEZ_MMR6 : R6MMR6Rel, SELNEZ_MMR6_ENC, SELNEZ_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SH16_MMR6 : StdMMR6Rel, SH16_MMR6_DESC, SH16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SLL_MMR6 : StdMMR6Rel, SLL_MMR6_DESC, SLL_MMR6_ENC, ISA_MICROMIPS32R6;
+def SUB_MMR6 : StdMMR6Rel, SUB_MMR6_DESC, SUB_MMR6_ENC, ISA_MICROMIPS32R6;
+def SUBU_MMR6 : StdMMR6Rel, SUBU_MMR6_DESC, SUBU_MMR6_ENC, ISA_MICROMIPS32R6;
+def SW16_MMR6 : StdMMR6Rel, SW16_MMR6_DESC, SW16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SWM16_MMR6 : StdMMR6Rel, SWM16_MMR6_DESC, SWM16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SWSP_MMR6 : StdMMR6Rel, SWSP_MMR6_DESC, SWSP_MMR6_ENC, ISA_MICROMIPS32R6;
+def SWP_MMR6 : StdMMR6Rel, SWP_MMR6_ENC, SWP_MMR6_DESC, ISA_MICROMIPS32R6;
+def PREFE_MMR6 : StdMMR6Rel, PREFE_MMR6_ENC, PREFE_MMR6_DESC, ISA_MICROMIPS32R6;
+def CACHEE_MMR6 : StdMMR6Rel, CACHEE_MMR6_ENC, CACHEE_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def WRPGPR_MMR6 : StdMMR6Rel, WRPGPR_MMR6_ENC, WRPGPR_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def WSBH_MMR6 : StdMMR6Rel, WSBH_MMR6_ENC, WSBH_MMR6_DESC, ISA_MICROMIPS32R6;
+def LB_MMR6 : R6MMR6Rel, LB_MMR6_ENC, LB_MMR6_DESC, ISA_MICROMIPS32R6;
+def LBU_MMR6 : R6MMR6Rel, LBU_MMR6_ENC, LBU_MMR6_DESC, ISA_MICROMIPS32R6;
+def LBE_MMR6 : R6MMR6Rel, LBE_MMR6_ENC, LBE_MMR6_DESC, ISA_MICROMIPS32R6;
+def LBUE_MMR6 : R6MMR6Rel, LBUE_MMR6_ENC, LBUE_MMR6_DESC, ISA_MICROMIPS32R6;
+def PAUSE_MMR6 : StdMMR6Rel, PAUSE_MMR6_DESC, PAUSE_MMR6_ENC, ISA_MICROMIPS32R6;
+def RDHWR_MMR6 : R6MMR6Rel, RDHWR_MMR6_DESC, RDHWR_MMR6_ENC, ISA_MICROMIPS32R6;
+def WAIT_MMR6 : StdMMR6Rel, WAIT_MMR6_DESC, WAIT_MMR6_ENC, ISA_MICROMIPS32R6;
+def SSNOP_MMR6 : StdMMR6Rel, SSNOP_MMR6_DESC, SSNOP_MMR6_ENC, ISA_MICROMIPS32R6;
+def SYNC_MMR6 : StdMMR6Rel, SYNC_MMR6_DESC, SYNC_MMR6_ENC, ISA_MICROMIPS32R6;
+def SYNCI_MMR6 : StdMMR6Rel, SYNCI_MMR6_DESC, SYNCI_MMR6_ENC, ISA_MICROMIPS32R6;
+def RDPGPR_MMR6 : R6MMR6Rel, RDPGPR_MMR6_DESC, RDPGPR_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SDBBP_MMR6 : R6MMR6Rel, SDBBP_MMR6_DESC, SDBBP_MMR6_ENC, ISA_MICROMIPS32R6;
+def XOR_MMR6 : StdMMR6Rel, XOR_MMR6_DESC, XOR_MMR6_ENC, ISA_MICROMIPS32R6;
+def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6;
+let DecoderMethod = "DecodeMemMMImm16" in { // SW_MMR6 is decoded through the DecodeMemMMImm16 method
+ def SW_MMR6 : StdMMR6Rel, SW_MMR6_DESC, SW_MMR6_ENC, ISA_MICROMIPS32R6;
+}
+let DecoderMethod = "DecodeMemMMImm9" in { // SWE_MMR6 is decoded through the DecodeMemMMImm9 method
+ def SWE_MMR6 : StdMMR6Rel, SWE_MMR6_DESC, SWE_MMR6_ENC, ISA_MICROMIPS32R6;
+}
+/// Floating Point Instructions
+def FADD_S_MMR6 : StdMMR6Rel, FADD_S_MMR6_ENC, FADD_S_MMR6_DESC, // _S / _D records come in pairs throughout this run
+ ISA_MICROMIPS32R6;
+def FADD_D_MMR6 : StdMMR6Rel, FADD_D_MMR6_ENC, FADD_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FSUB_S_MMR6 : StdMMR6Rel, FSUB_S_MMR6_ENC, FSUB_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FSUB_D_MMR6 : StdMMR6Rel, FSUB_D_MMR6_ENC, FSUB_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FMUL_S_MMR6 : StdMMR6Rel, FMUL_S_MMR6_ENC, FMUL_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FMUL_D_MMR6 : StdMMR6Rel, FMUL_D_MMR6_ENC, FMUL_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FDIV_S_MMR6 : StdMMR6Rel, FDIV_S_MMR6_ENC, FDIV_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FDIV_D_MMR6 : StdMMR6Rel, FDIV_D_MMR6_ENC, FDIV_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MADDF_S_MMR6 : R6MMR6Rel, MADDF_S_MMR6_ENC, MADDF_S_MMR6_DESC, // MADDF/MSUBF/MAX/MIN/MAXA/MINA use R6MMR6Rel; the rest of this run uses StdMMR6Rel
+ ISA_MICROMIPS32R6;
+def MADDF_D_MMR6 : R6MMR6Rel, MADDF_D_MMR6_ENC, MADDF_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MSUBF_S_MMR6 : R6MMR6Rel, MSUBF_S_MMR6_ENC, MSUBF_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FNEG_D_MMR6 : StdMMR6Rel, FNEG_D_MMR6_ENC, FNEG_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def MAX_D_MMR6 : R6MMR6Rel, MAX_D_MMR6_ENC, MAX_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def MIN_S_MMR6 : R6MMR6Rel, MIN_S_MMR6_ENC, MIN_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def MIN_D_MMR6 : R6MMR6Rel, MIN_D_MMR6_ENC, MIN_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def MAXA_S_MMR6 : R6MMR6Rel, MAXA_S_MMR6_ENC, MAXA_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MAXA_D_MMR6 : R6MMR6Rel, MAXA_D_MMR6_ENC, MAXA_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MINA_S_MMR6 : R6MMR6Rel, MINA_S_MMR6_ENC, MINA_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MINA_D_MMR6 : R6MMR6Rel, MINA_D_MMR6_ENC, MINA_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_L_S_MMR6 : StdMMR6Rel, CVT_L_S_MMR6_ENC, CVT_L_S_MMR6_DESC, // CVT_<to>_<from> naming presumably follows the cvt.<to>.<from> mnemonic - confirm against the *_DESC classes
+ ISA_MICROMIPS32R6;
+def CVT_L_D_MMR6 : StdMMR6Rel, CVT_L_D_MMR6_ENC, CVT_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_W_S_MMR6 : StdMMR6Rel, CVT_W_S_MMR6_ENC, CVT_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_W_D_MMR6 : StdMMR6Rel, CVT_W_D_MMR6_ENC, CVT_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_D_S_MMR6 : StdMMR6Rel, CVT_D_S_MMR6_ENC, CVT_D_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_D_W_MMR6 : StdMMR6Rel, CVT_D_W_MMR6_ENC, CVT_D_W_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_D_L_MMR6 : StdMMR6Rel, CVT_D_L_MMR6_ENC, CVT_D_L_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_S_D_MMR6 : StdMMR6Rel, CVT_S_D_MMR6_ENC, CVT_S_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_S_W_MMR6 : StdMMR6Rel, CVT_S_W_MMR6_ENC, CVT_S_W_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_S_L_MMR6 : StdMMR6Rel, CVT_S_L_MMR6_ENC, CVT_S_L_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+defm S_MMR6 : CMP_CC_MMR6<0b000101, "s", FGR32Opnd, II_CMP_CC_S>; // CMP_CC_MMR6 multiclass instantiated for the "s" suffix on FGR32Opnd
+defm D_MMR6 : CMP_CC_MMR6<0b010101, "d", FGR64Opnd, II_CMP_CC_D>; // same multiclass for the "d" suffix on FGR64Opnd; only the first (6-bit) argument and itinerary differ otherwise
+def ABS_S_MMR6 : StdMMR6Rel, ABS_S_MMR6_ENC, ABS_S_MMR6_DESC, ISA_MICROMIPS32R6; // all records in this run use StdMMR6Rel with *_ENC listed before *_DESC
+def ABS_D_MMR6 : StdMMR6Rel, ABS_D_MMR6_ENC, ABS_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def FLOOR_L_S_MMR6 : StdMMR6Rel, FLOOR_L_S_MMR6_ENC, FLOOR_L_S_MMR6_DESC, // FLOOR/CEIL/TRUNC each appear in four variants: _L_S, _L_D, _W_S, _W_D
+ ISA_MICROMIPS32R6;
+def FLOOR_L_D_MMR6 : StdMMR6Rel, FLOOR_L_D_MMR6_ENC, FLOOR_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FLOOR_W_S_MMR6 : StdMMR6Rel, FLOOR_W_S_MMR6_ENC, FLOOR_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FLOOR_W_D_MMR6 : StdMMR6Rel, FLOOR_W_D_MMR6_ENC, FLOOR_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CEIL_L_S_MMR6 : StdMMR6Rel, CEIL_L_S_MMR6_ENC, CEIL_L_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CEIL_L_D_MMR6 : StdMMR6Rel, CEIL_L_D_MMR6_ENC, CEIL_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CEIL_W_S_MMR6 : StdMMR6Rel, CEIL_W_S_MMR6_ENC, CEIL_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CEIL_W_D_MMR6 : StdMMR6Rel, CEIL_W_D_MMR6_ENC, CEIL_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TRUNC_L_S_MMR6 : StdMMR6Rel, TRUNC_L_S_MMR6_ENC, TRUNC_L_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TRUNC_L_D_MMR6 : StdMMR6Rel, TRUNC_L_D_MMR6_ENC, TRUNC_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TRUNC_W_S_MMR6 : StdMMR6Rel, TRUNC_W_S_MMR6_ENC, TRUNC_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TRUNC_W_D_MMR6 : StdMMR6Rel, TRUNC_W_D_MMR6_ENC, TRUNC_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SQRT_S_MMR6 : StdMMR6Rel, SQRT_S_MMR6_ENC, SQRT_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SQRT_D_MMR6 : StdMMR6Rel, SQRT_D_MMR6_ENC, SQRT_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SB_MMR6 : StdMMR6Rel, SB_MMR6_DESC, SB_MMR6_ENC, ISA_MICROMIPS32R6; // this run lists *_DESC before *_ENC
+def SBE_MMR6 : StdMMR6Rel, SBE_MMR6_DESC, SBE_MMR6_ENC, ISA_MICROMIPS32R6;
+def SCE_MMR6 : StdMMR6Rel, SCE_MMR6_DESC, SCE_MMR6_ENC, ISA_MICROMIPS32R6;
+def SH_MMR6 : StdMMR6Rel, SH_MMR6_DESC, SH_MMR6_ENC, ISA_MICROMIPS32R6;
+def SHE_MMR6 : StdMMR6Rel, SHE_MMR6_DESC, SHE_MMR6_ENC, ISA_MICROMIPS32R6;
+def LLE_MMR6 : StdMMR6Rel, LLE_MMR6_DESC, LLE_MMR6_ENC, ISA_MICROMIPS32R6;
+def LWE_MMR6 : StdMMR6Rel, LWE_MMR6_DESC, LWE_MMR6_ENC, ISA_MICROMIPS32R6;
+def LW_MMR6 : StdMMR6Rel, LW_MMR6_DESC, LW_MMR6_ENC, ISA_MICROMIPS32R6;
+def LUI_MMR6 : R6MMR6Rel, LUI_MMR6_DESC, LUI_MMR6_ENC, ISA_MICROMIPS32R6; // the only R6MMR6Rel record in this run
+def ADDU16_MMR6 : StdMMR6Rel, ADDU16_MMR6_DESC, ADDU16_MMR6_ENC, // "16"-suffixed records; presumably the 16-bit encodings - confirm against the *_ENC classes
+ ISA_MICROMIPS32R6;
+def AND16_MMR6 : StdMMR6Rel, AND16_MMR6_DESC, AND16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def ANDI16_MMR6 : StdMMR6Rel, ANDI16_MMR6_DESC, ANDI16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def NOT16_MMR6 : StdMMR6Rel, NOT16_MMR6_DESC, NOT16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def OR16_MMR6 : StdMMR6Rel, OR16_MMR6_DESC, OR16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SLL16_MMR6 : StdMMR6Rel, SLL16_MMR6_DESC, SLL16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SRL16_MMR6 : StdMMR6Rel, SRL16_MMR6_DESC, SRL16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def BREAK16_MMR6 : StdMMR6Rel, BREAK16_MMR6_DESC, BREAK16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def LI16_MMR6 : StdMMR6Rel, LI16_MMR6_DESC, LI16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def MOVE16_MMR6 : StdMMR6Rel, MOVE16_MMR6_DESC, MOVE16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SDBBP16_MMR6 : StdMMR6Rel, SDBBP16_MMR6_DESC, SDBBP16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def JALRC_HB_MMR6 : R6MMR6Rel, JALRC_HB_MMR6_ENC, JALRC_HB_MMR6_DESC, // mixed run: jumps, bit-field ops, FP rounding/select/class, TLB, DVP/EVP, coprocessor branches
+ ISA_MICROMIPS32R6;
+def EXT_MMR6 : StdMMR6Rel, EXT_MMR6_ENC, EXT_MMR6_DESC, ISA_MICROMIPS32R6;
+def INS_MMR6 : StdMMR6Rel, INS_MMR6_ENC, INS_MMR6_DESC, ISA_MICROMIPS32R6;
+def JALRC_MMR6 : R6MMR6Rel, JALRC_MMR6_ENC, JALRC_MMR6_DESC, ISA_MICROMIPS32R6;
+def RINT_S_MMR6 : StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, ROUND_L_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SEL_S_MMR6 : R6MMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def SEL_D_MMR6 : R6MMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def SELEQZ_S_MMR6 : R6MMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SELEQZ_D_MMR6 : R6MMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SELNEZ_S_MMR6 : R6MMR6Rel, SELNEZ_S_MMR6_ENC, SELNEZ_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SELNEZ_D_MMR6 : R6MMR6Rel, SELNEZ_D_MMR6_ENC, SELNEZ_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TLBINV_MMR6 : StdMMR6Rel, TLBINV_MMR6_ENC, TLBINV_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TLBINVF_MMR6 : StdMMR6Rel, TLBINVF_MMR6_ENC, TLBINVF_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def DVP_MMR6 : R6MMR6Rel, DVP_MMR6_ENC, DVP_MMR6_DESC, ISA_MICROMIPS32R6;
+def EVP_MMR6 : R6MMR6Rel, EVP_MMR6_ENC, EVP_MMR6_DESC, ISA_MICROMIPS32R6;
+def BC1EQZC_MMR6 : R6MMR6Rel, BC1EQZC_MMR6_DESC, BC1EQZC_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def BC1NEZC_MMR6 : R6MMR6Rel, BC1NEZC_MMR6_DESC, BC1NEZC_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def BC2EQZC_MMR6 : R6MMR6Rel, MipsR6Inst, BC2EQZC_MMR6_ENC, BC2EQZC_MMR6_DESC, // MipsR6Inst is listed explicitly here, unlike the neighbouring defs
+ ISA_MICROMIPS32R6;
+def BC2NEZC_MMR6 : R6MMR6Rel, MipsR6Inst, BC2NEZC_MMR6_ENC, BC2NEZC_MMR6_DESC, // likewise adds MipsR6Inst explicitly
+ ISA_MICROMIPS32R6;
+let DecoderNamespace = "MicroMips32r6FP64" in { // both D64 records below are placed in the "MicroMips32r6FP64" decoder namespace
+ def LDC1_D64_MMR6 : StdMMR6Rel, LDC1_D64_MMR6_DESC, LDC1_MMR6_ENC,
+ ISA_MICROMIPS32R6 {
+ let BaseOpcode = "LDC164"; // per-record override of BaseOpcode
+ }
+ def SDC1_D64_MMR6 : StdMMR6Rel, SDC1_D64_MMR6_DESC, SDC1_MMR6_ENC, // NOTE(review): no BaseOpcode override here, unlike LDC1_D64_MMR6 - confirm intended
+ ISA_MICROMIPS32R6;
+}
+def LDC2_MMR6 : StdMMR6Rel, LDC2_MMR6_ENC, LDC2_MMR6_DESC, ISA_MICROMIPS32R6; // LDC2/SDC2/LWC2/SWC2 records, all StdMMR6Rel with *_ENC before *_DESC
+def SDC2_MMR6 : StdMMR6Rel, SDC2_MMR6_ENC, SDC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWC2_MMR6 : StdMMR6Rel, LWC2_MMR6_ENC, LWC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def SWC2_MMR6 : StdMMR6Rel, SWC2_MMR6_ENC, SWC2_MMR6_DESC, ISA_MICROMIPS32R6;
+}
+
+def BOVC_MMR6 : R6MMR6Rel, BOVC_MMR6_ENC, BOVC_MMR6_DESC, ISA_MICROMIPS32R6,
+ MMDecodeDisambiguatedBy<"POP35GroupBranchMMR6">; // same group name that BEQC_MMR6 passes to DecodeDisambiguates
+def BNVC_MMR6 : R6MMR6Rel, BNVC_MMR6_ENC, BNVC_MMR6_DESC, ISA_MICROMIPS32R6,
+ MMDecodeDisambiguatedBy<"POP37GroupBranchMMR6">; // same group name that BNEC_MMR6 passes to DecodeDisambiguates
+def BGEC_MMR6 : R6MMR6Rel, BGEC_MMR6_ENC, BGEC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BGEUC_MMR6 : R6MMR6Rel, BGEUC_MMR6_ENC, BGEUC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BLTC_MMR6 : R6MMR6Rel, BLTC_MMR6_ENC, BLTC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BLTUC_MMR6 : R6MMR6Rel, BLTUC_MMR6_ENC, BLTUC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BEQC_MMR6 : R6MMR6Rel, BEQC_MMR6_ENC, BEQC_MMR6_DESC, ISA_MICROMIPS32R6,
+ DecodeDisambiguates<"POP35GroupBranchMMR6">; // counterpart of BOVC_MMR6 in the POP35 group
+def BNEC_MMR6 : R6MMR6Rel, BNEC_MMR6_ENC, BNEC_MMR6_DESC, ISA_MICROMIPS32R6,
+ DecodeDisambiguates<"POP37GroupBranchMMR6">; // counterpart of BNVC_MMR6 in the POP37 group
+def BLTZC_MMR6 : R6MMR6Rel, BLTZC_MMR6_ENC, BLTZC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BLEZC_MMR6 : R6MMR6Rel, BLEZC_MMR6_ENC, BLEZC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BGEZC_MMR6 : R6MMR6Rel, BGEZC_MMR6_ENC, BGEZC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BGTZC_MMR6 : R6MMR6Rel, BGTZC_MMR6_ENC, BGTZC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BGEZALC_MMR6 : R6MMR6Rel, BGEZALC_MMR6_ENC, BGEZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BGTZALC_MMR6 : R6MMR6Rel, BGTZALC_MMR6_ENC, BGTZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BLEZALC_MMR6 : R6MMR6Rel, BLEZALC_MMR6_ENC, BLEZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BLTZALC_MMR6 : R6MMR6Rel, BLTZALC_MMR6_ENC, BLTZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// MicroMips instruction aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"ei", (EI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6; // operand-less "ei" maps to EI_MMR6 with ZERO; last argument is presumably the alias emit flag - confirm against MipsInstAlias
+def : MipsInstAlias<"di", (DI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6; // operand-less "di" maps to DI_MMR6 with ZERO
+def : MipsInstAlias<"nop", (SLL_MMR6 ZERO, ZERO, 0), 1>, ISA_MICROMIPS32R6; // "nop" maps to SLL_MMR6 ZERO, ZERO, 0
+def B_MMR6_Pseudo : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset),
+ "b\t$offset"> { // asm string: "b", a tab, then the single brtarget_mm operand; no outputs
+ string DecoderNamespace = "MicroMipsR6";
+}
+def : MipsInstAlias<"sync", (SYNC_MMR6 0), 1>, ISA_MICROMIPS32R6; // operand-less "sync" supplies 0 as the SYNC_MMR6 operand
+def : MipsInstAlias<"sdbbp", (SDBBP_MMR6 0), 1>, ISA_MICROMIPS32R6; // operand-less "sdbbp" supplies 0 as the SDBBP_MMR6 operand
+def : MipsInstAlias<"rdhwr $rt, $rs",
+ (RDHWR_MMR6 GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, // two-operand form; third RDHWR_MMR6 operand fixed to 0
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mtc0 $rt, $rs",
+ (MTC0_MMR6 COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>, // two-operand mtc0/mthc0/mfc0/mfhc0 forms fix the last operand to 0
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mthc0 $rt, $rs",
+ (MTHC0_MMR6 COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mfc0 $rt, $rs",
+ (MFC0_MMR6 GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mfhc0 $rt, $rs",
+ (MFHC0_MMR6 GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"jalrc.hb $rs", (JALRC_HB_MMR6 RA, GPR32Opnd:$rs), 1>, // one-operand form fills the first operand with RA
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"dvp", (DVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6; // operand-less dvp/evp use ZERO
+def : MipsInstAlias<"evp", (EVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"jalrc $rs", (JALRC_MMR6 RA, GPR32Opnd:$rs), 1>, // one-operand form fills the first operand with RA
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"and $rs, $rt, $imm",
+ (ANDI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>, // and/or/xor written with a uimm16 operand map to ANDI/ORI/XORI
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"and $rs, $imm",
+ (ANDI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>, // two-operand form reuses $rs as both destination and source
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"or $rs, $rt, $imm",
+ (ORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"or $rs, $imm",
+ (ORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"xor $rs, $rt, $imm",
+ (XORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"xor $rs, $imm",
+ (XORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"not $rt, $rs",
+ (NOR_MMR6 GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>, // "not" maps to NOR_MMR6 with ZERO as the last operand
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"seh $rd", (SEH_MMR6 GPR32Opnd:$rd, GPR32Opnd:$rd), 0>, // one-operand seh/seb use $rd for both operands
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"seb $rd", (SEB_MMR6 GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+ ISA_MICROMIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// MicroMips arbitrary patterns that map to one or more instructions
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr), // store of a GPRMM16 value to an addrimm4lsl2 address selects SW16_MMR6
+ (SW16_MMR6 GPRMM16:$src, addrimm4lsl2:$addr)>, ISA_MICROMIPS32R6;
+def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs), // subc on GPR32 operands selects SUBU_MMR6
+ (SUBU_MMR6 GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(select i32:$cond, i32:$t, i32:$f), // general i32 select: OR of SELNEZ(t, cond) and SELEQZ(f, cond)
+ (OR_MM (SELNEZ_MMR6 i32:$t, i32:$cond), // NOTE(review): combines with OR_MM although OR_MMR6 is defined in this file - confirm intended
+ (SELEQZ_MMR6 i32:$f, i32:$cond))>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(select i32:$cond, i32:$t, immz), // false arm is immz: a single SELNEZ_MMR6 suffices
+ (SELNEZ_MMR6 i32:$t, i32:$cond)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(select i32:$cond, immz, i32:$f), // true arm is immz: a single SELEQZ_MMR6 suffices
+ (SELEQZ_MMR6 i32:$f, i32:$cond)>,
+ ISA_MICROMIPS32R6;
+
+defm : SelectInt_Pats<i32, OR_MM, XORI_MMR6, SLTi_MM, SLTiu_MM, SELEQZ_MMR6, // anonymous SelectInt_Pats instantiation; mixes *_MM (OR, SLTi, SLTiu) and *_MMR6 records
+ SELNEZ_MMR6, immZExt16, i32>, ISA_MICROMIPS32R6;
+
+defm S_MMR6 : Cmp_Pats<f32, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6; // Cmp_Pats for f32, passing NOR_MMR6 and ZERO
+defm D_MMR6 : Cmp_Pats<f64, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6; // Cmp_Pats for f64, passing NOR_MMR6 and ZERO
+
+def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm), // and of a GPRMM16 value with an immZExtAndi16 immediate selects ANDI16_MMR6
+ (ANDI16_MMR6 GPRMM16:$src, immZExtAndi16:$imm)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(and GPR32:$src, immZExt16:$imm), // and of a GPR32 value with an immZExt16 immediate selects ANDI_MMR6
+ (ANDI_MMR6 GPR32:$src, immZExt16:$imm)>, ISA_MICROMIPS32R6;
+def : MipsPat<(i32 immZExt16:$imm), // a bare immZExt16 constant is produced as XORI_MMR6 of ZERO with the immediate
+ (XORI_MMR6 ZERO, immZExt16:$imm)>, ISA_MICROMIPS32R6;
+def : MipsPat<(not GPRMM16:$in), // not on GPRMM16 selects NOT16_MMR6
+ (NOT16_MMR6 GPRMM16:$in)>, ISA_MICROMIPS32R6;
+def : MipsPat<(not GPR32:$in), // not on GPR32 selects NOR_MMR6 with ZERO as the second operand
+ (NOR_MMR6 GPR32Opnd:$in, ZERO)>, ISA_MICROMIPS32R6;
+// Patterns for f64 load and store with a reg+imm operand (FGR_64 only).
+let AddedComplexity = 41 in { // AddedComplexity of 41 applies to both patterns in this scope
+ def : LoadRegImmPat<LDC1_D64_MMR6, f64, load>, FGR_64, ISA_MICROMIPS32R6;
+ def : StoreRegImmPat<SDC1_D64_MMR6, f64>, FGR_64, ISA_MICROMIPS32R6;
+}
+
+def TAILCALL_MMR6 : TailCall<BC_MMR6, brtarget26_mm>, ISA_MICROMIPS32R6; // TailCall instantiated with BC_MMR6 and a brtarget26_mm operand
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td
new file mode 100644
index 000000000000..26062bfb2b8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td
@@ -0,0 +1,267 @@
+//=- MicroMips64r6InstrFormats.td - Instruction Formats -*- tablegen -*- -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes microMIPS64r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class DAUI_FM_MMR6 { // 32-bit layout: 0b111100 | rt | rs | imm(16)
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b111100; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm;
+}
+
+class POOL32I_ADD_IMM_FM_MMR6<bits<5> funct> { // 32-bit layout: 0b010000 | funct(5) | rs | imm(16)
+ bits<5> rs;
+ bits<16> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010000; // fixed top six bits
+ let Inst{25-21} = funct; // selector supplied by the instantiating *_ENC class
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm;
+}
+
+class POOL32S_EXTBITS_FM_MMR6<bits<6> funct> { // 32-bit layout: 0b010110 | rt | rs | size(5) | pos(5) | funct(6)
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> size;
+ bits<5> pos;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = size;
+ let Inst{10-6} = pos;
+ let Inst{5-0} = funct; // selector in the low six bits
+}
+
+class POOL32S_DALIGN_FM_MMR6 { // 32-bit layout: 0b010110 | rs | rt | rd | bp(3) | 00 | 0b011100
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> rd;
+ bits<3> bp;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // fixed top six bits
+ let Inst{25-21} = rs; // note: rs precedes rt here, the reverse of most classes in this file
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-8} = bp;
+ let Inst{7-6} = 0b00; // fixed zero bits
+ let Inst{5-0} = 0b011100; // fixed low six bits
+}
+
+class POOL32A_DIVMOD_FM_MMR6<string instr_asm, bits<9> funct> // 32-bit layout: 0b010110 | rt | rs | rd | 00 | funct(9)
+ : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // NOTE(review): named POOL32A but uses the same top-six-bit value as the POOL32S_* classes in this file - confirm naming
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10-9} = 0b00; // fixed zero bits
+ let Inst{8-0} = funct; // selector in the low nine bits
+}
+
+class POOL32S_DMFTC0_FM_MMR6<string instr_asm, bits<5> funct> // 32-bit layout: 0b010110 | rt | rs | 00 | sel(3) | funct(5) | 0b111100
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<3> sel;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0b00; // fixed zero bits, written width-explicit like the sibling classes
+ let Inst{13-11} = sel;
+ let Inst{10-6} = funct; // selector supplied by the instantiating *_ENC class
+ let Inst{5-0} = 0b111100; // fixed low six bits
+}
+
+class POOL32S_ARITH_FM_MMR6<string opstr, bits<9> funct> // 32-bit layout: 0b010110 | rt | rs | rd | 00 | funct(9)
+ : MMR6Arch<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10-9} = 0b00; // fixed zero bits
+ let Inst{8-0} = funct; // selector in the low nine bits
+}
+
+class DADDIU_FM_MMR6<string opstr> : MMR6Arch<opstr> { // 32-bit layout: 0b010111 | rt | rs | imm16
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010111; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm16;
+}
+
+class PCREL18_FM_MMR6<bits<3> funct> : MipsR6Inst { // 32-bit layout: 0b011110 | rt | funct(3) | imm(18)
+ bits<5> rt;
+ bits<18> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011110; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-18} = funct; // three-bit selector
+ let Inst{17-0} = imm;
+}
+
+class POOL32S_2R_FM_MMR6<string instr_asm, bits<10> funct> // 32-bit layout: 0b010110 | rt | rs | funct(10) | 0b111100
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct; // ten-bit selector
+ let Inst{5-0} = 0b111100; // fixed low six bits
+}
+
+class POOL32S_2RSA5B0_FM_MMR6<string instr_asm, bits<9> funct> // 32-bit layout: 0b010110 | rt | rs | sa(5) | 00 | funct(9)
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> sa;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = sa;
+ let Inst{10-9} = 0b00; // fixed zero bits
+ let Inst{8-0} = funct; // selector in the low nine bits
+}
+
+class LD_SD_32_2R_OFFSET16_FM_MMR6<string instr_asm, bits<6> op> // 32-bit layout: op(6) | rt | base | offset(16)
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr; // combined address operand, split into base and offset below
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op; // top six bits supplied by the instantiating class
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+class POOL32C_2R_OFFSET12_FM_MMR6<string instr_asm, bits<4> funct> // 32-bit layout: 0b011000 | rt | base | funct(4) | offset(12)
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr; // combined address operand; addr{15-12} is not encoded
+ bits<5> base = addr{20-16};
+ bits<12> offset = addr{11-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = funct; // four-bit selector
+ let Inst{11-0} = offset;
+}
+
+class POOL32S_3R_FM_MMR6<string instr_asm, bits<9> funct> // 32-bit layout: 0b010110 | rt | rs | rd | 00 | funct(9)
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10-9} = 0b00; // fixed zero bits
+ let Inst{8-0} = funct; // selector in the low nine bits
+}
+
+class POOL32S_DBITSWAP_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm>, // 32-bit layout: 0b010110 | rt | rd | 0000 | 0b101100 | 0b111100
+ MipsR6Inst {
+ bits<5> rt;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rd;
+ let Inst{15-12} = 0b0000; // fixed zero bits
+ let Inst{11-6} = 0b101100; // fixed selector; this class takes no funct parameter
+ let Inst{5-0} = 0b111100; // fixed low six bits
+}
+
+class POOL32S_3RSA_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm>, // 32-bit layout: 0b010110 | rt | rs | rd | sa(2) | 100 | 0b000100
+ MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> rd;
+ bits<2> sa;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110; // fixed top six bits
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10-9} = sa; // two-bit sa operand
+ let Inst{8-6} = 0b100; // fixed bits
+ let Inst{5-0} = 0b000100; // fixed low six bits
+}
+
+class PCREL_1ROFFSET19_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm>, // 32-bit layout: 0b011110 | rt | 10 | offset(19)
+ MipsR6Inst {
+ bits<5> rt;
+ bits<19> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011110; // same top-six-bit value as PCREL18_FM_MMR6
+ let Inst{25-21} = rt;
+ let Inst{20-19} = 0b10; // fixed two-bit selector
+ let Inst{18-0} = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td
new file mode 100644
index 000000000000..05aad515da46
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -0,0 +1,568 @@
+//=- MicroMips64r6InstrInfo.td - Instruction Information -*- tablegen -*- -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes MicroMips64r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class DAUI_MMR6_ENC : DAUI_FM_MMR6; // one *_ENC class per instruction: a format class bound to its fixed opcode/funct bits
+class DAHI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10001>;
+class DATI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10000>;
+class DEXT_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b101100>;
+class DEXTM_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b100100>;
+class DEXTU_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b010100>;
+class DALIGN_MMR6_ENC : POOL32S_DALIGN_FM_MMR6; // the classes above are named *_MMR6_ENC; those below use *_MM64R6_ENC
+class DDIV_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddiv", 0b100011000>;
+class DMOD_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmod", 0b101011000>;
+class DDIVU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddivu", 0b110011000>;
+class DMODU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmodu", 0b111011000>;
+class DINSU_MM64R6_ENC : POOL32S_EXTBITS_FM_MMR6<0b110100>;
+class DINSM_MM64R6_ENC : POOL32S_EXTBITS_FM_MMR6<0b000100>;
+class DINS_MM64R6_ENC : POOL32S_EXTBITS_FM_MMR6<0b001100>;
+class DMTC0_MM64R6_ENC : POOL32S_DMFTC0_FM_MMR6<"dmtc0", 0b01011>;
+class DMTC1_MM64R6_ENC : POOL32F_MFTC1_FM_MMR6<"dmtc1", 0b10110000>;
+class DMTC2_MM64R6_ENC : POOL32A_MFTC2_FM_MMR6<"dmtc2", 0b0111110100>;
+class DMFC0_MM64R6_ENC : POOL32S_DMFTC0_FM_MMR6<"dmfc0", 0b00011>;
+class DMFC1_MM64R6_ENC : POOL32F_MFTC1_FM_MMR6<"dmfc1", 0b10010000>;
+class DMFC2_MM64R6_ENC : POOL32A_MFTC2_FM_MMR6<"dmfc2", 0b0110110100>;
+class DADD_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dadd", 0b100010000>;
+class DADDIU_MM64R6_ENC : DADDIU_FM_MMR6<"daddiu">;
+class DADDU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"daddu", 0b101010000>;
+class LDPC_MMR646_ENC : PCREL18_FM_MMR6<0b110>; // NOTE(review): name breaks the *_MM64R6_ENC pattern (likely a typo); the LDPC_MM64R6 def uses this exact spelling, so rename both together
+class DSUB_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dsub", 0b110010000>;
+class DSUBU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dsubu", 0b111010000>;
+class DMUL_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmul", 0b000011000>;
+class DMUH_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmuh", 0b001011000>;
+class DMULU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmulu", 0b010011000>;
+class DMUHU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmuhu", 0b011011000>;
+class DSBH_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dsbh", 0b0111101100>;
+class DSHD_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dshd", 0b1111101100>;
+class DSLL_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsll", 0b000000000>;
+class DSLL32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsll32", 0b000001000>;
+class DSLLV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"dsllv", 0b000010000>;
+class DSRAV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"dsrav", 0b010010000>;
+class DSRA_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsra", 0b010000000>;
+class DSRA32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsra32", 0b010000100>;
+class DCLO_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dclo", 0b0100101100>;
+class DCLZ_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dclz", 0b0101101100>;
+class DROTR_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"drotr", 0b011000000>;
+class DROTR32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"drotr32", 0b011001000>;
+class DROTRV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"drotrv", 0b011010000>;
+class LD_MM64R6_ENC : LD_SD_32_2R_OFFSET16_FM_MMR6<"ld", 0b110111>;
+class LLD_MM64R6_ENC : POOL32C_2R_OFFSET12_FM_MMR6<"lld", 0b0111>; // 4-bit funct goes to Inst{15-12}
+class LWU_MM64R6_ENC : POOL32C_2R_OFFSET12_FM_MMR6<"lwu", 0b1110>;
+class SD_MM64R6_ENC : LD_SD_32_2R_OFFSET16_FM_MMR6<"sd", 0b110110>;
+class DSRL_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsrl", 0b001000000>;
+class DSRL32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsrl32", 0b001001000>;
+class DSRLV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"dsrlv", 0b001010000>;
+class DBITSWAP_MM64R6_ENC : POOL32S_DBITSWAP_FM_MMR6<"dbitswap">; // the last three formats take no funct: their non-operand bits are fixed
+class DLSA_MM64R6_ENC : POOL32S_3RSA_FM_MMR6<"dlsa">;
+class LWUPC_MM64R6_ENC : PCREL_1ROFFSET19_FM_MMR6<"lwupc">;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class DAUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass Itin>
+ : MMR6Arch<instr_asm>, MipsR6Inst { // writes $rt; reads $rs and a uimm16 immediate
+ InstrItinClass Itinerary = Itin;
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm");
+ dag InOperandList = (ins GPROpnd:$rs, uimm16:$imm);
+ dag OutOperandList = (outs GPROpnd:$rt);
+ list<dag> Pattern = []; // no selection pattern attached here
+}
+class DAUI_MMR6_DESC : DAUI_MMR6_DESC_BASE<"daui", GPR64Opnd, II_DAUI>; // 64-bit GPR instantiation
+
+class DAHI_DATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass Itin>
+ : MMR6Arch<instr_asm>, MipsR6Inst { // register + uimm16 form whose result register is tied to its source register
+ InstrItinClass Itinerary = Itin;
+ string Constraints = "$rs = $rt"; // output $rs must be the same register as input $rt
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm");
+ dag InOperandList = (ins GPROpnd:$rt, uimm16:$imm);
+ dag OutOperandList = (outs GPROpnd:$rs);
+}
+class DAHI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dahi", GPR64Opnd, II_DAHI>; // both users operate on 64-bit GPRs
+class DATI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dati", GPR64Opnd, II_DATI>;
+
+class EXTBITS_DESC_BASE<string instr_asm, RegisterOperand RO, Operand PosOpnd,
+ Operand SizeOpnd, SDPatternOperator Op = null_frag>
+ : MMR6Arch<instr_asm>, MipsR6Inst { // shared description for the dext/dextm/dextu classes below
+ dag OutOperandList = (outs RO:$rt);
+ dag InOperandList = (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $pos, $size");
+ list<dag> Pattern = [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))]; // Op defaults to null_frag
+ InstrItinClass Itinerary = II_EXT; // hard-coded: every instantiation shares this itinerary
+ Format Form = FrmR;
+ string BaseOpcode = instr_asm;
+}
+// TODO: Add 'pos + size' constraint check to dext* instructions
+// DEXT: 0 < pos + size <= 63
+// DEXTM, DEXTU: 32 < pos + size <= 64
+class DEXT_MMR6_DESC : EXTBITS_DESC_BASE<"dext", GPR64Opnd, uimm5_report_uimm6,
+ uimm5_plus1, MipsExt>;
+class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm5,
+ uimm5_plus33, MipsExt>;
+class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm5_plus32,
+ uimm5_plus1, MipsExt>; // the three variants differ only in mnemonic and pos/size operand classes
+
+class DALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ Operand ImmOpnd, InstrItinClass itin>
+ : MMR6Arch<instr_asm>, MipsR6Inst { // writes $rd from two registers and an immediate $bp
+ InstrItinClass Itinerary = itin;
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
+ dag OutOperandList = (outs GPROpnd:$rd);
+ list<dag> Pattern = []; // no selection pattern attached here
+}
+
+class DALIGN_MMR6_DESC : DALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3,
+ II_DALIGN>; // $bp uses the uimm3 operand class
+
+class DDIV_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"ddiv", GPR64Opnd, II_DDIV,
+ sdiv>; // selected for signed division
+class DMOD_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"dmod", GPR64Opnd, II_DMOD,
+ srem>; // selected for signed remainder
+class DDIVU_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"ddivu", GPR64Opnd, II_DDIVU,
+ udiv>; // selected for unsigned division
+class DMODU_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"dmodu", GPR64Opnd, II_DMODU,
+ urem>; // selected for unsigned remainder
+
+class DCLO_MM64R6_DESC { // dclo: matched as ctlz applied to the bitwise-not of the source
+ string BaseOpcode = "dclo";
+ string AsmString = "dclo\t$rt, $rs";
+ dag InOperandList = (ins GPR64Opnd:$rs);
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ list<dag> Pattern = [(set GPR64Opnd:$rt, (ctlz (not GPR64Opnd:$rs)))];
+ Format Form = FrmR;
+ InstrItinClass Itinerary = II_DCLO;
+}
+
+class DCLZ_MM64R6_DESC { // dclz: matched directly as ctlz of the 64-bit source
+ string BaseOpcode = "dclz";
+ string AsmString = "dclz\t$rt, $rs";
+ dag InOperandList = (ins GPR64Opnd:$rs);
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ list<dag> Pattern = [(set GPR64Opnd:$rt, (ctlz GPR64Opnd:$rs))];
+ Format Form = FrmR;
+ InstrItinClass Itinerary = II_DCLZ;
+}
+
+class DINSU_MM64R6_DESC : InsBase<"dinsu", GPR64Opnd, uimm5_plus32,
+ uimm5_inssize_plus1, MipsIns>;
+class DINSM_MM64R6_DESC : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64>; // unlike dinsu/dins, no MipsIns operator is passed
+class DINS_MM64R6_DESC : InsBase<"dins", GPR64Opnd, uimm5, uimm5_inssize_plus1,
+ MipsIns>;
+class DMTC0_MM64R6_DESC : MTC0_MMR6_DESC_BASE<"dmtc0", COP0Opnd, GPR64Opnd,
+ II_DMTC0>; // coprocessor-move descriptions: mt* list the coprocessor operand class first, mf* the GPR first
+class DMTC1_MM64R6_DESC : MTC1_MMR6_DESC_BASE<"dmtc1", FGR64Opnd, GPR64Opnd,
+ II_DMTC1, bitconvert>; // only the *c1 (FGR64) moves pass a pattern operator
+class DMTC2_MM64R6_DESC : MTC2_MMR6_DESC_BASE<"dmtc2", COP2Opnd, GPR64Opnd,
+ II_DMTC2>;
+class DMFC0_MM64R6_DESC : MFC0_MMR6_DESC_BASE<"dmfc0", GPR64Opnd, COP0Opnd,
+ II_DMFC0>;
+class DMFC1_MM64R6_DESC : MFC1_MMR6_DESC_BASE<"dmfc1", GPR64Opnd, FGR64Opnd,
+ II_DMFC1, bitconvert>;
+class DMFC2_MM64R6_DESC : MFC2_MMR6_DESC_BASE<"dmfc2", GPR64Opnd, COP2Opnd,
+ II_DMFC2>;
+class DADD_MM64R6_DESC : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>; // no operator passed here; daddu below is the one given `add`
+class DADDIU_MM64R6_DESC : ArithLogicI<"daddiu", simm16_64, GPR64Opnd,
+ II_DADDIU, immSExt16, add>,
+ IsAsCheapAsAMove;
+class DADDU_MM64R6_DESC : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>;
+
+class DSUB_DESC_BASE<string instr_asm, RegisterOperand RO,
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator OpNode = null_frag>
+ : MipsR6Inst { // three-register description: $rd = OpNode($rs, $rt)
+ string BaseOpcode = instr_asm;
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ dag InOperandList = (ins RO:$rs, RO:$rt);
+ dag OutOperandList = (outs RO:$rd);
+ list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rs, RO:$rt))];
+ Format Form = FrmR;
+ InstrItinClass Itinerary = Itin;
+ let TwoOperandAliasConstraint = "$rd = $rs";
+ let isReMaterializable = 1;
+ let isCommutable = 0; // operands are explicitly marked as not interchangeable
+}
+class DSUB_MM64R6_DESC : DSUB_DESC_BASE<"dsub", GPR64Opnd, II_DSUB>; // OpNode left at its null_frag default
+class DSUBU_MM64R6_DESC : DSUB_DESC_BASE<"dsubu", GPR64Opnd, II_DSUBU, sub>; // the variant selected for `sub`
+
+class LDPC_MM64R6_DESC : PCREL_MMR6_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3,
+ II_LDPC>; // 64-bit GPR result; offset operand class is simm18_lsl3
+
+class MUL_MM64R6_DESC_BASE<string opstr, RegisterOperand GPROpnd,
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator Op = null_frag> : MipsR6Inst { // three-register description: $rd = Op($rs, $rt)
+ string AsmString = !strconcat(opstr, "\t$rd, $rs, $rt");
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ dag OutOperandList = (outs GPROpnd:$rd);
+ list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+ InstrItinClass Itinerary = Itin;
+}
+
+class DMUL_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>; // selected for mul
+class DMUH_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmuh", GPR64Opnd, II_DMUH,
+ mulhs>;
+class DMULU_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMULU>; // no operator passed: Op stays null_frag
+class DMUHU_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU,
+ mulhu>;
+
+class DSBH_DSHD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass Itin> { // two-register description shared by dsbh and dshd
+ string BaseOpcode = instr_asm;
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+ dag InOperandList = (ins GPROpnd:$rs);
+ dag OutOperandList = (outs GPROpnd:$rt);
+ list<dag> Pattern = []; // no selection pattern attached here
+ bit hasSideEffects = 0;
+ Format Form = FrmR;
+ InstrItinClass Itinerary = Itin;
+}
+
+class DSBH_MM64R6_DESC : DSBH_DSHD_DESC_BASE<"dsbh", GPR64Opnd, II_DSBH>; // both users operate on 64-bit GPRs
+class DSHD_MM64R6_DESC : DSBH_DSHD_DESC_BASE<"dshd", GPR64Opnd, II_DSHD>;
+
+class SHIFT_ROTATE_IMM_MM64R6<string instr_asm, Operand ImmOpnd,
+ InstrItinClass itin,
+ SDPatternOperator OpNode = null_frag,
+ SDPatternOperator PO = null_frag> { // 64-bit GPR operation with an immediate amount $sa
+ string BaseOpcode = instr_asm;
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+ string TwoOperandAliasConstraint = "$rs = $rt";
+ dag InOperandList = (ins GPR64Opnd:$rs, ImmOpnd:$sa);
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ list<dag> Pattern = [(set GPR64Opnd:$rt, (OpNode GPR64Opnd:$rs, PO:$sa))]; // PO is applied to the matched $sa
+ Format Form = FrmR;
+ InstrItinClass Itinerary = itin;
+}
+
+class SHIFT_ROTATE_REG_MM64R6<string instr_asm, InstrItinClass itin,
+ SDPatternOperator OpNode = null_frag> { // 64-bit value in $rt, amount in the 32-bit register $rs
+ string BaseOpcode = instr_asm;
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs");
+ dag InOperandList = (ins GPR64Opnd:$rt, GPR32Opnd:$rs);
+ dag OutOperandList = (outs GPR64Opnd:$rd);
+ list<dag> Pattern = [(set GPR64Opnd:$rd,
+ (OpNode GPR64Opnd:$rt, GPR32Opnd:$rs))];
+ Format Form = FrmR;
+ InstrItinClass Itinerary = itin;
+}
+
+class DSLL_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsll", uimm6, II_DSLL, shl,
+ immZExt6>; // plain immediate forms: uimm6 operand, matched with immZExt6
+class DSLL32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsll32", uimm5, II_DSLL32>; // *32 forms: uimm5 operand, no pattern operator
+class DSLLV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"dsllv", II_DSLLV, shl>; // *v forms take the amount from a register
+class DSRAV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"dsrav", II_DSRAV, sra>;
+class DSRA_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsra", uimm6, II_DSRA, sra,
+ immZExt6>;
+class DSRA32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsra32", uimm5, II_DSRA32>;
+class DROTR_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"drotr", uimm6, II_DROTR,
+ rotr, immZExt6>;
+class DROTR32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"drotr32", uimm5,
+ II_DROTR32>;
+class DROTRV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"drotrv", II_DROTRV, rotr>;
+class DSRL_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsrl", uimm6, II_DSRL, srl,
+ immZExt6>;
+class DSRL32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsrl32", uimm5, II_DSRL32>;
+class DSRLV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"dsrlv", II_DSRLV, srl>;
+
+class Load_MM64R6<string instr_asm, Operand MemOpnd, InstrItinClass itin,
+ SDPatternOperator OpNode = null_frag> { // load into a 64-bit GPR from the memory operand $addr
+ string BaseOpcode = instr_asm;
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ dag InOperandList = (ins MemOpnd:$addr);
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ list<dag> Pattern = [(set GPR64Opnd:$rt, (OpNode addr:$addr))];
+ bit canFoldAsLoad = 1;
+ bit mayLoad = 1;
+ Format Form = FrmI;
+ InstrItinClass Itinerary = itin;
+}
+
+class LD_MM64R6_DESC : Load_MM64R6<"ld", mem_simm16, II_LD, load> {
+ string DecoderMethod = "DecodeMemMMImm16"; // decoder name matches the mem_simm16 operand
+}
+class LWU_MM64R6_DESC : Load_MM64R6<"lwu", mem_simm12, II_LWU, zextloadi32>{
+ string DecoderMethod = "DecodeMemMMImm12"; // decoder name matches the mem_simm12 operand
+}
+
+class LLD_MM64R6_DESC { // lld: load into a 64-bit GPR from a mem_simm12 address
+ string BaseOpcode = "lld";
+ string AsmString = "lld\t$rt, $addr";
+ string DecoderMethod = "DecodeMemMMImm12";
+ dag InOperandList = (ins mem_simm12:$addr);
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ list<dag> Pattern = []; // no selection pattern attached here
+ bit mayLoad = 1;
+ InstrItinClass Itinerary = II_LLD;
+}
+
+class SD_MM64R6_DESC { // sd: store a 64-bit GPR to a mem_simm16 address; selected for plain `store`
+ string BaseOpcode = "sd";
+ string AsmString = "sd\t$rt, $addr";
+ string DecoderMethod = "DecodeMemMMImm16";
+ dag InOperandList = (ins GPR64Opnd:$rt, mem_simm16:$addr);
+ dag OutOperandList = (outs); // a store defines no register
+ list<dag> Pattern = [(store GPR64Opnd:$rt, addr:$addr)];
+ bit mayStore = 1;
+ Format Form = FrmI;
+ InstrItinClass Itinerary = II_SD;
+}
+
+class DBITSWAP_MM64R6_DESC { // dbitswap: one 64-bit source register, one 64-bit result
+ string AsmString = "dbitswap\t$rd, $rt";
+ dag InOperandList = (ins GPR64Opnd:$rt);
+ dag OutOperandList = (outs GPR64Opnd:$rd);
+ list<dag> Pattern = []; // no selection pattern attached here
+ InstrItinClass Itinerary = II_DBITSWAP;
+}
+
+class DLSA_MM64R6_DESC { // dlsa: two 64-bit sources plus a uimm2_plus1 amount $sa
+ string AsmString = "dlsa\t$rt, $rs, $rd, $sa"; // note: the destination $rd is printed third
+ dag InOperandList = (ins GPR64Opnd:$rt, GPR64Opnd:$rs, uimm2_plus1:$sa);
+ dag OutOperandList = (outs GPR64Opnd:$rd);
+ list<dag> Pattern = []; // no selection pattern attached here
+ InstrItinClass Itinerary = II_DLSA;
+}
+
+class LWUPC_MM64R6_DESC { // lwupc: load into a 64-bit GPR, addressed by a simm19_lsl2 offset
+ string AsmString = "lwupc\t$rt, $offset";
+ dag InOperandList = (ins simm19_lsl2:$offset);
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ list<dag> Pattern = []; // no selection pattern attached here
+ bit IsPCRelativeLoad = 1;
+ bit mayLoad = 1;
+ InstrItinClass Itinerary = II_LWUPC;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let DecoderNamespace = "MicroMipsR6" in { // each def mixes a *_DESC (operands/asm), a *_ENC (bit layout) and ISA_MICROMIPS64R6
+ def DAUI_MM64R6 : StdMMR6Rel, DAUI_MMR6_DESC, DAUI_MMR6_ENC, ISA_MICROMIPS64R6;
+ let DecoderMethod = "DecodeDAHIDATIMMR6" in { // dahi and dati share one custom decoder method
+ def DAHI_MM64R6 : StdMMR6Rel, DAHI_MMR6_DESC, DAHI_MMR6_ENC, ISA_MICROMIPS64R6;
+ def DATI_MM64R6 : StdMMR6Rel, DATI_MMR6_DESC, DATI_MMR6_ENC, ISA_MICROMIPS64R6;
+ }
+ def DEXT_MM64R6 : StdMMR6Rel, DEXT_MMR6_DESC, DEXT_MMR6_ENC,
+ ISA_MICROMIPS64R6;
+ def DEXTM_MM64R6 : StdMMR6Rel, DEXTM_MMR6_DESC, DEXTM_MMR6_ENC,
+ ISA_MICROMIPS64R6;
+ def DEXTU_MM64R6 : StdMMR6Rel, DEXTU_MMR6_DESC, DEXTU_MMR6_ENC,
+ ISA_MICROMIPS64R6;
+ def DALIGN_MM64R6 : StdMMR6Rel, DALIGN_MMR6_DESC, DALIGN_MMR6_ENC,
+ ISA_MICROMIPS64R6;
+ def DDIV_MM64R6 : R6MMR6Rel, DDIV_MM64R6_DESC, DDIV_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMOD_MM64R6 : R6MMR6Rel, DMOD_MM64R6_DESC, DMOD_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DDIVU_MM64R6 : R6MMR6Rel, DDIVU_MM64R6_DESC, DDIVU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMODU_MM64R6 : R6MMR6Rel, DMODU_MM64R6_DESC, DMODU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DINSU_MM64R6: R6MMR6Rel, DINSU_MM64R6_DESC, DINSU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DINSM_MM64R6: R6MMR6Rel, DINSM_MM64R6_DESC, DINSM_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DINS_MM64R6: R6MMR6Rel, DINS_MM64R6_DESC, DINS_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMTC0_MM64R6 : StdMMR6Rel, DMTC0_MM64R6_ENC, DMTC0_MM64R6_DESC,
+ ISA_MICROMIPS64R6; // from here on some defs list *_ENC before *_DESC; both orders occur in this block
+ def DMTC1_MM64R6 : StdMMR6Rel, DMTC1_MM64R6_DESC, DMTC1_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMTC2_MM64R6 : StdMMR6Rel, DMTC2_MM64R6_ENC, DMTC2_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DMFC0_MM64R6 : StdMMR6Rel, DMFC0_MM64R6_ENC, DMFC0_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DMFC1_MM64R6 : StdMMR6Rel, DMFC1_MM64R6_DESC, DMFC1_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMFC2_MM64R6 : StdMMR6Rel, DMFC2_MM64R6_ENC, DMFC2_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DADD_MM64R6: StdMMR6Rel, DADD_MM64R6_DESC, DADD_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DADDIU_MM64R6: StdMMR6Rel, DADDIU_MM64R6_DESC, DADDIU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DADDU_MM64R6: StdMMR6Rel, DADDU_MM64R6_DESC, DADDU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def LDPC_MM64R6 : R6MMR6Rel, LDPC_MMR646_ENC, LDPC_MM64R6_DESC,
+ ISA_MICROMIPS64R6; // NOTE(review): LDPC_MMR646_ENC is the spelling used by the class declaration; rename both together if fixed
+ def DSUB_MM64R6 : StdMMR6Rel, DSUB_MM64R6_DESC, DSUB_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DSUBU_MM64R6 : StdMMR6Rel, DSUBU_MM64R6_DESC, DSUBU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMUL_MM64R6 : R6MMR6Rel, DMUL_MM64R6_DESC, DMUL_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMUH_MM64R6 : R6MMR6Rel, DMUH_MM64R6_DESC, DMUH_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMULU_MM64R6 : R6MMR6Rel, DMULU_MM64R6_DESC, DMULU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMUHU_MM64R6 : R6MMR6Rel, DMUHU_MM64R6_DESC, DMUHU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DSBH_MM64R6 : R6MMR6Rel, DSBH_MM64R6_ENC, DSBH_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSHD_MM64R6 : R6MMR6Rel, DSHD_MM64R6_ENC, DSHD_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSLL_MM64R6 : StdMMR6Rel, DSLL_MM64R6_ENC, DSLL_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSLL32_MM64R6 : StdMMR6Rel, DSLL32_MM64R6_ENC, DSLL32_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSLLV_MM64R6 : StdMMR6Rel, DSLLV_MM64R6_ENC, DSLLV_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRAV_MM64R6 : StdMMR6Rel, DSRAV_MM64R6_ENC, DSRAV_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRA_MM64R6 : StdMMR6Rel, DSRA_MM64R6_ENC, DSRA_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRA32_MM64R6 : StdMMR6Rel, DSRA32_MM64R6_ENC, DSRA32_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DCLO_MM64R6 : StdMMR6Rel, R6MMR6Rel, DCLO_MM64R6_ENC, DCLO_MM64R6_DESC,
+ ISA_MICROMIPS64R6; // dclo, dclz and lld carry both StdMMR6Rel and R6MMR6Rel
+ def DCLZ_MM64R6 : StdMMR6Rel, R6MMR6Rel, DCLZ_MM64R6_ENC, DCLZ_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DROTR_MM64R6 : StdMMR6Rel, DROTR_MM64R6_ENC, DROTR_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DROTR32_MM64R6 : StdMMR6Rel, DROTR32_MM64R6_ENC, DROTR32_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DROTRV_MM64R6 : StdMMR6Rel, DROTRV_MM64R6_ENC, DROTRV_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def LD_MM64R6 : StdMMR6Rel, LD_MM64R6_ENC, LD_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def LLD_MM64R6 : StdMMR6Rel, R6MMR6Rel, LLD_MM64R6_ENC, LLD_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def LWU_MM64R6 : StdMMR6Rel, LWU_MM64R6_ENC, LWU_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def SD_MM64R6 : StdMMR6Rel, SD_MM64R6_ENC, SD_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRL_MM64R6 : StdMMR6Rel, DSRL_MM64R6_ENC, DSRL_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRL32_MM64R6 : StdMMR6Rel, DSRL32_MM64R6_ENC, DSRL32_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRLV_MM64R6 : StdMMR6Rel, DSRLV_MM64R6_ENC, DSRLV_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DBITSWAP_MM64R6 : R6MMR6Rel, DBITSWAP_MM64R6_ENC, DBITSWAP_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DLSA_MM64R6 : R6MMR6Rel, DLSA_MM64R6_ENC, DLSA_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def LWUPC_MM64R6 : R6MMR6Rel, LWUPC_MM64R6_ENC, LWUPC_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+}
+
+let AdditionalPredicates = [InMicroMips] in // the immediate-materialisation patterns below additionally require InMicroMips
+defm : MaterializeImms<i64, ZERO_64, DADDIU_MM64R6, LUi64, ORi64>; // instantiated for i64 with ZERO_64, DADDIU_MM64R6, LUi64 and ORi64
+
+//===----------------------------------------------------------------------===//
+//
+// Arbitrary patterns that map to one or more instructions
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsPat<(MipsLo tglobaladdr:$in), // MipsLo of a symbol on its own: daddiu from ZERO_64, one pattern per symbol kind
+ (DADDIU_MM64R6 ZERO_64, tglobaladdr:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tblockaddress:$in),
+ (DADDIU_MM64R6 ZERO_64, tblockaddress:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tjumptable:$in),
+ (DADDIU_MM64R6 ZERO_64, tjumptable:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tconstpool:$in),
+ (DADDIU_MM64R6 ZERO_64, tconstpool:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+ (DADDIU_MM64R6 ZERO_64, tglobaltlsaddr:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo texternalsym:$in),
+ (DADDIU_MM64R6 ZERO_64, texternalsym:$in)>, ISA_MICROMIPS64R6;
+
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)), // register + MipsLo(symbol) folds into a single daddiu
+ (DADDIU_MM64R6 GPR64:$hi, tglobaladdr:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
+ (DADDIU_MM64R6 GPR64:$hi, tblockaddress:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
+ (DADDIU_MM64R6 GPR64:$hi, tjumptable:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
+ (DADDIU_MM64R6 GPR64:$hi, tconstpool:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
+ (DADDIU_MM64R6 GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MICROMIPS64R6;
+
+def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs), // addc is selected to daddu (register) or daddiu (immSExt16)
+ (DADDU_MM64R6 GPR64:$lhs, GPR64:$rhs)>, ISA_MICROMIPS64R6;
+def : MipsPat<(addc GPR64:$lhs, immSExt16:$imm),
+ (DADDIU_MM64R6 GPR64:$lhs, imm:$imm)>, ISA_MICROMIPS64R6;
+
+
+def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))), // amount is a truncated 64-bit value: pass its sub_32 subregister to drotrv
+ (DROTRV_MM64R6 GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MICROMIPS64R6;
+
+
+def : WrapperPat<tglobaladdr, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6; // WrapperPat instantiations, one per symbol kind
+def : WrapperPat<tconstpool, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<texternalsym, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tblockaddress, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tjumptable, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tglobaltlsaddr, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+
+// Carry pattern
+def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs), // subc is selected to dsubu
+ (DSUBU_MM64R6 GPR64:$lhs, GPR64:$rhs)>, ISA_MICROMIPS64R6;
+
+def : MipsPat<(atomic_load_64 addr:$a), (LD_MM64R6 addr:$a)>, ISA_MICROMIPS64R6; // 64-bit atomic load is selected to a plain ld
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"dmtc0 $rt, $rd",
+ (DMTC0_MM64R6 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>, ISA_MICROMIPS64R6; // two-operand dmtc0: third operand fixed to 0; gated on microMIPS64R6 like the dmfc0 alias
+def : MipsInstAlias<"dmfc0 $rt, $rd", // two-operand dmfc0: third operand of DMFC0_MM64R6 fixed to 0
+ (DMFC0_MM64R6 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MICROMIPS64R6;
+def : MipsInstAlias<"daddu $rs, $rt, $imm", // daddu written with an immediate is accepted as daddiu
+ (DADDIU_MM64R6 GPR64Opnd:$rs,
+ GPR64Opnd:$rt,
+ simm16_64:$imm),
+ 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"daddu $rs, $imm", // two-operand form: $rs is both destination and source
+ (DADDIU_MM64R6 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ simm16_64:$imm),
+ 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsubu $rt, $rs, $imm", // dsubu with an immediate also maps to daddiu, via the InvertedImOperand64 operand class
+ (DADDIU_MM64R6 GPR64Opnd:$rt,
+ GPR64Opnd:$rs,
+ InvertedImOperand64:$imm), // presumably supplies the negated immediate; confirm against the operand definition
+ 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsubu $rs, $imm",
+ (DADDIU_MM64R6 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dneg $rt, $rs", // dneg: dsub with ZERO_64 as the first source; last argument is 1 here, 0 for the aliases above
+ (DSUB_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+ ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dneg $rt", // one-operand form negates $rt in place
+ (DSUB_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
+ ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dnegu $rt, $rs", // dnegu: same shape as dneg but built on dsubu
+ (DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+ ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dnegu $rt",
+ (DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
+ ISA_MICROMIPS64R6;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td
new file mode 100644
index 000000000000..af6473c468d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -0,0 +1,302 @@
+//===-- MicroMipsDSPInstrFormats.td - Instruction Formats --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class MMDSPInst<string opstr = ""> // common base of the *_FMT encoding classes in this file
+ : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
+ let InsnPredicates = [HasDSP];
+ let AdditionalPredicates = [InMicroMips]; // predicate lists name HasDSP and InMicroMips
+ string BaseOpcode = opstr;
+ string Arch = "mmdsp";
+ let DecoderNamespace = "MicroMips";
+}
+
+class MMDSPInstAlias<string Asm, dag Result, bit Emit = 0b1> // InstAlias carrying the same two predicate lists as MMDSPInst
+ : InstAlias<Asm, Result, Emit>, PredicateControl {
+ let InsnPredicates = [HasDSP];
+ let AdditionalPredicates = [InMicroMips];
+}
+
+class POOL32A_3R_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { // three registers plus an 11-bit op
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Inst{31-26} = 0b000000; // top six bits are zero
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10-0} = op;
+}
+
+class POOL32A_2R_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { // two registers plus a 10-bit op above fixed low bits
+ bits<5> rt;
+ bits<5> rs;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = op;
+ let Inst{5-0} = 0b111100; // fixed low six bits
+}
+
+class POOL32A_2RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { // two registers, a 2-bit ac field and an 8-bit op
+ bits<5> rt;
+ bits<5> rs;
+ bits<2> ac;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = ac;
+ let Inst{13-6} = op;
+ let Inst{5-0} = 0b111100; // fixed low six bits
+}
+
+class POOL32A_3RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { // three registers, bit 10 cleared, 10-bit op
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0b0; // the zero bit the "B0" in the class name appears to refer to
+ let Inst{9-0} = op;
+}
+
+class POOL32A_2RSA4_FMT<string opstr, bits<12> op> : MMDSPInst<opstr> { // two registers, 4-bit sa, 12-bit op
+ bits<5> rt;
+ bits<5> rs;
+ bits<4> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-12} = sa;
+ let Inst{11-0} = op; // op fills all twelve low bits
+}
+
+class POOL32A_2RSA3_FMT<string opstr, bits<7> op> : MMDSPInst<opstr> { // two registers, 3-bit sa, 7-bit op
+ bits<5> rt;
+ bits<5> rs;
+ bits<3> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-13} = sa;
+ let Inst{12-6} = op;
+ let Inst{5-0} = 0b111100; // fixed low six bits
+}
+
+class POOL32A_2RSA5B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { // two registers, 5-bit sa, bit 10 cleared, 10-bit op
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = sa;
+ let Inst{10} = 0b0; // always zero
+ let Inst{9-0} = op;
+}
+
+class POOL32A_2RSA4B0_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { // two registers, 4-bit sa, bit 11 cleared, 11-bit op
+ bits<5> rt;
+ bits<5> rs;
+ bits<4> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-12} = sa;
+ let Inst{11} = 0b0; // always zero
+ let Inst{10-0} = op;
+}
+
+class POOL32A_2RSA4OP6_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> { // two registers, 4-bit sa, 6-bit op
+ bits<5> rt;
+ bits<5> rs;
+ bits<4> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-12} = sa;
+ let Inst{11-6} = op;
+ let Inst{5-0} = 0b111100; // fixed low six bits
+}
+
+class POOL32A_1RIMM5AC_FMT<string opstr, bits<8> funct> : MMDSPInst<opstr> { // one register, 5-bit immediate, 2-bit ac, 8-bit funct
+ bits<5> rt;
+ bits<5> imm;
+ bits<2> ac;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = imm; // the immediate occupies the slot other formats use for rs
+ let Inst{15-14} = ac;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_2RSA5_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { // two registers, 5-bit sa, 11-bit op
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = sa;
+ let Inst{10-0} = op; // unlike POOL32A_2RSA5B0_FMT, bit 10 belongs to op
+}
+
+class POOL32A_1RMEMB0_FMT<string opstr, bits<10> funct> : MMDSPInst<opstr> { // index/base register pair plus rd; bit 10 cleared
+ bits<5> index;
+ bits<5> base;
+ bits<5> rd;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = index;
+ let Inst{20-16} = base;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0b0; // always zero
+ let Inst{9-0} = funct;
+}
+
+class POOL32A_1RAC_FMT<string instr_asm, bits<8> funct> : MMDSPInst<instr_asm> { // one register plus ac; same bit layout as POOL32A_5B01RAC_FMT
+ bits<5> rs;
+ bits<2> ac;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = 0; // the slot other formats use for rt is zeroed
+ let Inst{20-16} = rs;
+ let Inst{15-14} = ac;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_1RMASK7_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { // one register, 7-bit mask, 8-bit op
+ bits<5> rt;
+ bits<7> mask;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-14} = mask; // spans the slots other formats use for rs and ac
+ let Inst{13-6} = op;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_1RIMM10_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { // one register, 10-bit immediate, 10-bit op
+ bits<5> rd;
+ bits<10> imm;
+
+ let Inst{31-26} = 0;
+ let Inst{25-16} = imm; // the immediate fills both upper register slots
+ let Inst{15-11} = rd;
+ let Inst{10} = 0;
+ let Inst{9-0} = op;
+}
+
+class POOL32A_1RIMM8_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> { // one register, 8-bit immediate, 6-bit op
+ bits<5> rt;
+ bits<8> imm;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-13} = imm;
+ let Inst{12} = 0; // single zero bit between imm and op
+ let Inst{11-6} = op;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_4B0SHIFT6AC4B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { // no registers: 6-bit shift and 2-bit ac between zero runs
+ bits<6> shift;
+ bits<2> ac;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-22} = 0b0000; // first four-bit zero run
+ let Inst{21-16} = shift;
+ let Inst{15-14} = ac;
+ let Inst{13-10} = 0b0000; // second four-bit zero run
+ let Inst{9-0} = op;
+}
+
+class POOL32A_5B01RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { // one register plus ac; same bit layout as POOL32A_1RAC_FMT
+ bits<5> rs;
+ bits<2> ac;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = 0b00000; // five zero bits in the slot other formats use for rt
+ let Inst{20-16} = rs;
+ let Inst{15-14} = ac;
+ let Inst{13-6} = op;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32I_IMMB0_FMT<string opstr, bits<5> op> : MMDSPInst<opstr> { // 16-bit offset only; the 5-bit op sits in Inst{25-21}
+ bits<16> offset;
+
+ let Inst{31-26} = 0b010000; // the only format in this file whose top six bits are 0b010000
+ let Inst{25-21} = op;
+ let Inst{20-16} = 0;
+ let Inst{15-0} = offset;
+}
+
+class POOL32A_2RBP_FMT<string opstr> : MMDSPInst<opstr> { // two registers plus a 2-bit bp; takes no op parameter
+ bits<5> rt;
+ bits<5> rs;
+ bits<2> bp;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = bp;
+ let Inst{13-6} = 0b00100010; // hard-coded where sibling formats place their op/funct parameter
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_2RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { // two registers, 10-bit op; same bit layout as POOL32A_2R2B0_FMT
+ bits<5> rt;
+ bits<5> rs;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-10} = 0; // six zero bits above the op
+ let Inst{9-0} = op;
+}
+
+class POOL32S_3RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { // as POOL32A_3RB0_FMT but with different top six bits
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> rd;
+
+ let Inst{31-26} = 0b010110; // differs from the all-zero top bits of the POOL32A_* formats
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0b0;
+ let Inst{9-0} = op;
+}
+
+class POOL32A_2R2B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { // two registers, 10-bit op; same bit layout as POOL32A_2RB0_FMT
+ bits<5> rt;
+ bits<5> rs;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = 0; // zeroed in two steps: Inst{15-11}, then Inst{10}
+ let Inst{10} = 0;
+ let Inst{9-0} = op;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
new file mode 100644
index 000000000000..f82f82fc7e45
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -0,0 +1,601 @@
+//===- MicroMipsDSPInstrInfo.td - Micromips DSP instructions -*- tablegen *-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes MicroMips DSP instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// Instruction encoding.
+// Each *_ENC class pairs an assembly mnemonic with the opcode bits handed to
+// its encoding format class. Classes named *_MMR2_ENC are the ones whose defs
+// below carry ISA_DSPR2.
+// Add and dot-product-accumulate (add*, dpa*) encodings.
+class ADDQ_PH_MM_ENC : POOL32A_3R_FMT<"addq.ph", 0b00000001101>;
+class ADDQ_S_PH_MM_ENC : POOL32A_3R_FMT<"addq_s.ph", 0b10000001101>;
+class ADDQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"addq_s.w", 0b1100000101>;
+class ADDQH_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh.ph", 0b00001001101>;
+class ADDQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.ph", 0b10001001101>;
+class ADDQH_W_MMR2_ENC: POOL32A_3R_FMT<"addqh.w", 0b00010001101>;
+class ADDQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.w", 0b10010001101>;
+class ADDU_PH_MMR2_ENC : POOL32A_3R_FMT<"addu.ph", 0b00100001101>;
+class ADDU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"addu_s.ph", 0b10100001101>;
+class ADDU_QB_MM_ENC : POOL32A_3R_FMT<"addu.qb", 0b00011001101>;
+class ADDU_S_QB_MM_ENC : POOL32A_3R_FMT<"addu_s.qb", 0b10011001101>;
+class ADDUH_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh.qb", 0b00101001101>;
+class ADDUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh_r.qb", 0b10101001101>;
+class ADDSC_MM_ENC : POOL32A_3RB0_FMT<"addsc", 0b1110000101>;
+class ADDWC_MM_ENC : POOL32A_3RB0_FMT<"addwc", 0b1111000101>;
+class DPA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpa.w.ph", 0b00000010>;
+class DPAQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpaq_s.w.ph", 0b00001010>;
+class DPAQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpaq_sa.l.w", 0b01001010>;
+class DPAQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_s.w.ph", 0b10001010>;
+class DPAQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_sa.w.ph", 0b11001010>;
+class DPAU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbl", 0b10000010>;
+class DPAU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbr", 0b11000010>;
+class DPAX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpax.w.ph", 0b01000010>;
+// absq_s.*, insv and the accumulator multiply/multiply-add/multiply-subtract
+// encodings (madd, maddu, msub, msubu, mult, multu).
+class ABSQ_S_PH_MM_ENC : POOL32A_2R_FMT<"absq_s.ph", 0b0001000100>;
+class ABSQ_S_W_MM_ENC : POOL32A_2R_FMT<"absq_s.w", 0b0010000100>;
+class ABSQ_S_QB_MMR2_ENC : POOL32A_2R_FMT<"absq_s.qb", 0b0000000100>;
+class INSV_MM_ENC : POOL32A_2R_FMT<"insv", 0b0100000100>;
+class MADD_DSP_MM_ENC : POOL32A_2RAC_FMT<"madd", 0b00101010>;
+class MADDU_DSP_MM_ENC : POOL32A_2RAC_FMT<"maddu", 0b01101010>;
+class MSUB_DSP_MM_ENC : POOL32A_2RAC_FMT<"msub", 0b10101010>;
+class MSUBU_DSP_MM_ENC : POOL32A_2RAC_FMT<"msubu", 0b11101010>;
+class MULT_DSP_MM_ENC : POOL32A_2RAC_FMT<"mult", 0b00110010>;
+class MULTU_DSP_MM_ENC : POOL32A_2RAC_FMT<"multu", 0b01110010>;
+// Shift encodings (shll*, shra*, shrl*). The format class chosen per
+// instruction differs (RSA3/RSA4/RSA4B0/RSA4OP6/RSA5B0 for the immediate
+// forms, 3R/3RB0 for the *v register forms), and with it the width of the
+// opcode literal.
+class SHLL_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll.ph", 0b001110110101>;
+class SHLL_S_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll_s.ph", 0b101110110101>;
+class SHLL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shll.qb", 0b0100001>;
+class SHLLV_PH_MM_ENC : POOL32A_3R_FMT<"shllv.ph", 0b00000001110>;
+class SHLLV_S_PH_MM_ENC : POOL32A_3R_FMT<"shllv_s.ph", 0b10000001110>;
+class SHLLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shllv.qb", 0b1110010101>;
+class SHLLV_S_W_MM_ENC : POOL32A_3RB0_FMT<"shllv_s.w", 0b1111010101>;
+class SHLL_S_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shll_s.w", 0b1111110101>;
+class SHRA_QB_MMR2_ENC : POOL32A_2RSA3_FMT<"shra.qb", 0b0000111>;
+class SHRA_R_QB_MMR2_ENC : POOL32A_2RSA3_FMT<"shra_r.qb", 0b1000111>;
+class SHRA_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra.ph", 0b01100110101>;
+class SHRA_R_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra_r.ph", 0b11100110101>;
+class SHRAV_PH_MM_ENC : POOL32A_3R_FMT<"shrav.ph", 0b00110001101>;
+class SHRAV_R_PH_MM_ENC : POOL32A_3R_FMT<"shrav_r.ph", 0b10110001101>;
+class SHRAV_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav.qb", 0b00111001101>;
+class SHRAV_R_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav_r.qb", 0b10111001101>;
+class SHRAV_R_W_MM_ENC : POOL32A_3RB0_FMT<"shrav_r.w", 0b1011010101>;
+class SHRA_R_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shra_r.w", 0b1011110101>;
+class SHRL_PH_MMR2_ENC : POOL32A_2RSA4OP6_FMT<"shrl.ph", 0b001111>;
+class SHRL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shrl.qb", 0b1100001>;
+class SHRLV_PH_MMR2_ENC : POOL32A_3RB0_FMT<"shrlv.ph", 0b1100010101>;
+class SHRLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shrlv.qb", 0b1101010101>;
+// preceq*/precequ*/preceu* encodings (two-register POOL32A_2R_FMT) followed
+// by the sub* family encodings.
+class PRECEQ_W_PHL_MM_ENC : POOL32A_2R_FMT<"preceq.w.phl", 0b0101000100>;
+class PRECEQ_W_PHR_MM_ENC : POOL32A_2R_FMT<"preceq.w.phr", 0b0110000100>;
+class PRECEQU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbl", 0b0111000100>;
+class PRECEQU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbla", 0b0111001100>;
+class PRECEQU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbr", 0b1001000100>;
+class PRECEQU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbra", 0b1001001100>;
+class PRECEU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbl", 0b1011000100>;
+class PRECEU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbla", 0b1011001100>;
+class PRECEU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbr", 0b1101000100>;
+class PRECEU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbra", 0b1101001100>;
+class SUBQ_PH_MM_ENC : POOL32A_3R_FMT<"subq.ph", 0b01000001101>;
+class SUBQ_S_PH_MM_ENC : POOL32A_3R_FMT<"subq_s.ph", 0b11000001101>;
+class SUBQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"subq_s.w", 0b1101000101>;
+class SUBQH_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh.ph", 0b01001001101>;
+class SUBQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.ph", 0b11001001101>;
+class SUBQH_W_MMR2_ENC : POOL32A_3R_FMT<"subqh.w", 0b01010001101>;
+class SUBQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.w", 0b11010001101>;
+class SUBU_PH_MMR2_ENC : POOL32A_3R_FMT<"subu.ph", 0b01100001101>;
+class SUBU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"subu_s.ph", 0b11100001101>;
+class SUBU_QB_MM_ENC : POOL32A_3R_FMT<"subu.qb", 0b01011001101>;
+class SUBU_S_QB_MM_ENC : POOL32A_3R_FMT<"subu_s.qb", 0b11011001101>;
+class SUBUH_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh.qb", 0b01101001101>;
+class SUBUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh_r.qb", 0b11101001101>;
+// Accumulator extract encodings (ext*): the immediate forms use
+// POOL32A_1RIMM5AC_FMT and the *v register forms use POOL32A_2RAC_FMT.
+// They are followed by the dot-product-subtract (dps*) encodings.
+class EXTP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extp", 0b10011001>;
+class EXTPDP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extpdp", 0b11011001>;
+class EXTPDPV_MM_ENC : POOL32A_2RAC_FMT<"extpdpv", 0b11100010>;
+class EXTPV_MM_ENC : POOL32A_2RAC_FMT<"extpv", 0b10100010>;
+class EXTR_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr.w", 0b00111001>;
+class EXTR_R_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_r.w", 0b01111001>;
+class EXTR_RS_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_rs.w", 0b10111001>;
+class EXTR_S_H_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_s.h", 0b11111001>;
+class EXTRV_W_MM_ENC : POOL32A_2RAC_FMT<"extrv.w", 0b00111010>;
+class EXTRV_R_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_r.w", 0b01111010>;
+class EXTRV_RS_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_rs.w", 0b10111010>;
+class EXTRV_S_H_MM_ENC : POOL32A_2RAC_FMT<"extrv_s.h", 0b11111010>;
+class DPS_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dps.w.ph", 0b00010010>;
+class DPSQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpsq_s.w.ph", 0b00011010>;
+class DPSQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpsq_sa.l.w", 0b01011010>;
+class DPSQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_s.w.ph", 0b10011010>;
+class DPSQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_sa.w.ph", 0b11011010>;
+class DPSU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbl", 0b10010010>;
+class DPSU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbr", 0b11010010>;
+class DPSX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsx.w.ph", 0b01010010>;
+// Multiply (mul*, muleq*, muleu*, mulq*) and precision-reduce (precr*,
+// precrq*) encodings. The two precr_sra* forms use POOL32A_2RSA5_FMT; the
+// rest use the three-register formats.
+class MUL_PH_MMR2_ENC : POOL32A_3R_FMT<"mul.ph", 0b00000101101>;
+class MUL_S_PH_MMR2_ENC : POOL32A_3R_FMT<"mul_s.ph", 0b10000101101>;
+class MULEQ_S_W_PHL_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phl", 0b0000100101>;
+class MULEQ_S_W_PHR_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phr", 0b0001100101>;
+class MULEU_S_PH_QBL_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbl", 0b0010010101>;
+class MULEU_S_PH_QBR_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbr", 0b0011010101>;
+class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>;
+class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>;
+class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>;
+class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 0b0111010101>;
+class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>;
+class PRECR_SRA_PH_W_MMR2_ENC
+ : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>;
+class PRECR_SRA_R_PH_W_MMR2_ENC
+ : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>;
+class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>;
+class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>;
+class PRECRQU_S_QB_PH_MM_ENC
+ : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>;
+class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>;
+// Indexed load (lbux, lhx, lwx), maq* and accumulator move (mfhi, mflo, mthi)
+// encodings.
+class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>;
+class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>;
+class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>;
+class MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>;
+class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>;
+class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>;
+class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>;
+class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>;
+class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>;
+class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>;
+// Encoding for mtlo. The mnemonic string passed to the format class is this
+// instruction's own ("mtlo"), as for every other *_ENC class, so it does not
+// share an opstr with MTHI_MM_ENC.
+class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mtlo", 0b11000001>;
+// Miscellaneous encodings: prepend/append, raddu, DSP control read/write
+// (rddsp, wrdsp), replicate (repl*, replv*), mthlip, pack/pick, accumulator
+// shift (shilo, shilov), modsub and mulsa*.
+class PREPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"prepend", 0b1001010101>;
+class RADDU_W_QB_MM_ENC : POOL32A_2R_FMT<"raddu.w.qb", 0b1111000100>;
+class RDDSP_MM_ENC : POOL32A_1RMASK7_FMT<"rddsp", 0b00011001>;
+class REPL_PH_MM_ENC : POOL32A_1RIMM10_FMT<"repl.ph", 0b0000111101>;
+class REPL_QB_MM_ENC : POOL32A_1RIMM8_FMT<"repl.qb", 0b010111>;
+class REPLV_PH_MM_ENC : POOL32A_2R_FMT<"replv.ph", 0b0000001100>;
+class REPLV_QB_MM_ENC : POOL32A_2R_FMT<"replv.qb", 0b0001001100>;
+class MTHLIP_MM_ENC : POOL32A_1RAC_FMT<"mthlip", 0b00001001>;
+class PACKRL_PH_MM_ENC : POOL32A_3RB0_FMT<"packrl.ph", 0b0110101101>;
+class PICK_PH_MM_ENC : POOL32A_3RB0_FMT<"pick.ph", 0b1000101101>;
+class PICK_QB_MM_ENC : POOL32A_3RB0_FMT<"pick.qb", 0b0111101101>;
+class SHILO_MM_ENC : POOL32A_4B0SHIFT6AC4B0_FMT<"shilo", 0b0000011101>;
+class SHILOV_MM_ENC : POOL32A_5B01RAC_FMT<"shilov", 0b01001001>;
+class WRDSP_MM_ENC : POOL32A_1RMASK7_FMT<"wrdsp", 0b01011001>;
+class APPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"append", 0b1000010101>;
+class MODSUB_MM_ENC : POOL32A_3RB0_FMT<"modsub", 0b1010010101>;
+class MULSA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"mulsa.w.ph", 0b10110010>;
+class MULSAQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"mulsaq_s.w.ph", 0b11110010>;
+// Branch (bposge32, bposge32c), bitrev, balign and compare (cmp*, cmpgdu*,
+// cmpgu*, cmpu*) encodings. balign takes no opcode argument because
+// POOL32A_2RBP_FMT hard-codes its opcode bits. The cmpgu.* forms use the
+// POOL32S_3RB0_FMT format while cmpgdu.* use POOL32A_3RB0_FMT.
+class BPOSGE32C_MMR3_ENC : POOL32I_IMMB0_FMT<"bposge32c", 0b11001>;
+class BITREV_MM_ENC : POOL32A_2R_FMT<"bitrev", 0b0011000100>;
+class BALIGN_MMR2_ENC : POOL32A_2RBP_FMT<"balign">;
+class BPOSGE32_MM_ENC : POOL32I_IMMB0_FMT<"bposge32", 0b11011>;
+class CMP_EQ_PH_MM_ENC : POOL32A_2RB0_FMT<"cmp.eq.ph", 0b0000000101>;
+class CMP_LE_PH_MM_ENC : POOL32A_2RB0_FMT<"cmp.le.ph", 0b0010000101>;
+class CMP_LT_PH_MM_ENC : POOL32A_2RB0_FMT<"cmp.lt.ph", 0b0001000101>;
+class CMPGDU_EQ_QB_MMR2_ENC : POOL32A_3RB0_FMT<"cmpgdu.eq.qb", 0b0110000101>;
+class CMPGDU_LT_QB_MMR2_ENC : POOL32A_3RB0_FMT<"cmpgdu.lt.qb", 0b0111000101>;
+class CMPGDU_LE_QB_MMR2_ENC : POOL32A_3RB0_FMT<"cmpgdu.le.qb", 0b1000000101>;
+class CMPGU_EQ_QB_MM_ENC : POOL32S_3RB0_FMT<"cmpgu.eq.qb", 0b0011000101>;
+class CMPGU_LT_QB_MM_ENC : POOL32S_3RB0_FMT<"cmpgu.lt.qb", 0b0100000101>;
+class CMPGU_LE_QB_MM_ENC : POOL32S_3RB0_FMT<"cmpgu.le.qb", 0b0101000101>;
+class CMPU_EQ_QB_MM_ENC : POOL32A_2R2B0_FMT<"cmpu.eq.qb", 0b1001000101>;
+class CMPU_LT_QB_MM_ENC : POOL32A_2R2B0_FMT<"cmpu.lt.qb", 0b1010000101>;
+class CMPU_LE_QB_MM_ENC : POOL32A_2R2B0_FMT<"cmpu.le.qb", 0b1011000101>;
+
+// Instruction desc.
+// Description base for one-source, one-result instructions.
+//   opstr  - mnemonic; the asm string is "<opstr>\t$rt, $rs".
+//   OpNode - operator used in the selection pattern (set $rt, (OpNode $rs)).
+//   itin   - itinerary class stored in Itinerary.
+//   ROD    - register operand class of the result $rt.
+//   ROS    - register operand class of the source $rs; defaults to ROD.
+class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS = ROD> {
+ dag OutOperandList = (outs ROD:$rt);
+ dag InOperandList = (ins ROS:$rs);
+ string AsmString = !strconcat(opstr, "\t$rt, $rs");
+ list<dag> Pattern = [(set ROD:$rt, (OpNode ROS:$rs))];
+ InstrItinClass Itinerary = itin;
+}
+// Descriptions built on ABSQ_S_PH_MM_R2_DESC_BASE. The three absq_s.* forms
+// additionally list DSPOutFlag20 in Defs. The preceq.w.* forms produce a
+// GPR32Opnd result from a DSPROpnd source; the remaining forms use DSPROpnd
+// for both operands.
+class ABSQ_S_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "absq_s.ph", int_mips_absq_s_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>;
+class ABSQ_S_W_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "absq_s.w", int_mips_absq_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag20]>;
+class ABSQ_S_QB_MMR2_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "absq_s.qb", int_mips_absq_s_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>;
+class PRECEQ_W_PHL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceq.w.phl", int_mips_preceq_w_phl, NoItinerary, GPR32Opnd, DSPROpnd>;
+class PRECEQ_W_PHR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceq.w.phr", int_mips_preceq_w_phr, NoItinerary, GPR32Opnd, DSPROpnd>;
+class PRECEQU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "precequ.ph.qbl", int_mips_precequ_ph_qbl, NoItinerary, DSPROpnd>;
+class PRECEQU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "precequ.ph.qbla", int_mips_precequ_ph_qbla, NoItinerary, DSPROpnd>;
+class PRECEQU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "precequ.ph.qbr", int_mips_precequ_ph_qbr, NoItinerary, DSPROpnd>;
+class PRECEQU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "precequ.ph.qbra", int_mips_precequ_ph_qbra, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceu.ph.qbl", int_mips_preceu_ph_qbl, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceu.ph.qbla", int_mips_preceu_ph_qbla, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceu.ph.qbr", int_mips_preceu_ph_qbr, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceu.ph.qbra", int_mips_preceu_ph_qbra, NoItinerary, DSPROpnd>;
+
+// Description base for register/immediate instructions of the form
+// "<instr_asm>\t$rt, $rs, $sa".
+//   OpNode  - operator used in the selection pattern.
+//   ImmPat  - pattern fragment matched for the $sa immediate.
+//   itin    - itinerary class stored in Itinerary.
+//   RO      - register operand class of both $rt (result) and $rs (source).
+//   ImmOpnd - operand class of the $sa immediate.
+// Every instruction derived from this base is marked hasSideEffects = 1.
+class SHLL_R2_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SDPatternOperator ImmPat, InstrItinClass itin,
+ RegisterOperand RO, Operand ImmOpnd> {
+ dag OutOperandList = (outs RO:$rt);
+ dag InOperandList = (ins RO:$rs, ImmOpnd:$sa);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+ list<dag> Pattern = [(set RO:$rt, (OpNode RO:$rs, ImmPat:$sa))];
+ InstrItinClass Itinerary = itin;
+ bit hasSideEffects = 1;
+}
+// Immediate-shift descriptions built on SHLL_R2_MM_DESC_BASE. The immediate
+// width follows the mnemonic suffix here: 3 bits for .qb, 4 bits for .ph and
+// 5 bits for .w. The shll* forms list DSPOutFlag22 in Defs. Forms passing
+// null_frag as OpNode supply no intrinsic for the pattern in this class.
+class SHLL_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shll.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>,
+ Defs<[DSPOutFlag22]>;
+class SHLL_S_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shll_s.ph", int_mips_shll_s_ph, immZExt4, NoItinerary, DSPROpnd, uimm4>,
+ Defs<[DSPOutFlag22]>;
+class SHLL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shll.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>,
+ Defs<[DSPOutFlag22]>;
+class SHLL_S_W_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shll_s.w", int_mips_shll_s_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>,
+ Defs<[DSPOutFlag22]>;
+class SHRA_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>;
+class SHRA_R_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra_r.qb", int_mips_shra_r_qb, immZExt3, NoItinerary, DSPROpnd, uimm3>;
+class SHRA_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>;
+class SHRA_R_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra_r.ph", int_mips_shra_r_ph, immZExt4, NoItinerary, DSPROpnd, uimm4>;
+class SHRA_R_W_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra_r.w", int_mips_shra_r_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>;
+class SHRL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shrl.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>;
+class SHRL_PH_MMR2_DESC : SHLL_R2_MM_DESC_BASE<
+ "shrl.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>;
+
+// Description base for three-register instructions of the form
+// "<instr_asm>\t$rd, $rt, $rs". $rd (result) and $rt (first source) use the
+// RO operand class; $rs is always a GPR32Opnd. The selection pattern is
+// (set $rd, (OpNode $rt, $rs)).
+class SHLLV_R3_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterOperand RO> {
+ dag OutOperandList = (outs RO:$rd);
+ dag InOperandList = (ins RO:$rt, GPR32Opnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs");
+ list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))];
+ InstrItinClass Itinerary = itin;
+}
+// Variable-shift descriptions built on SHLLV_R3_MM_DESC_BASE. Each *v form
+// is given the intrinsic named after the non-v mnemonic (e.g. shllv.ph uses
+// int_mips_shll_ph). The shllv* forms list DSPOutFlag22 in Defs.
+class SHLLV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shllv.ph", int_mips_shll_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>;
+class SHLLV_S_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shllv_s.ph", int_mips_shll_s_ph, NoItinerary, DSPROpnd>,
+ Defs<[DSPOutFlag22]>;
+class SHLLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shllv.qb", int_mips_shll_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>;
+class SHLLV_S_W_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shllv_s.w", int_mips_shll_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag22]>;
+class SHRAV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav.ph", int_mips_shra_ph, NoItinerary, DSPROpnd>;
+class SHRAV_R_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav_r.ph", int_mips_shra_r_ph, NoItinerary, DSPROpnd>;
+class SHRAV_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav.qb", int_mips_shra_qb, NoItinerary, DSPROpnd>;
+class SHRAV_R_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav_r.qb", int_mips_shra_r_qb, NoItinerary, DSPROpnd>;
+class SHRAV_R_W_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav_r.w", int_mips_shra_r_w, NoItinerary, GPR32Opnd>;
+class SHRLV_PH_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>;
+class SHRLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>;
+
+// Description base for instructions of the form "<instr_asm>\t$rt, $ac, $rs":
+// a GPR32Opnd result $rt, an ACC64DSPOpnd accumulator $ac and a GPR32Opnd
+// source $rs. OpNode is accepted but not referenced in the body; this class
+// defines no Pattern.
+class EXT_MM_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $rs");
+ InstrItinClass Itinerary = itin;
+}
+// Description base for instructions of the form "<instr_asm>\t$rt, $ac, $imm":
+// a GPR32Opnd result $rt, an ACC64DSPOpnd accumulator $ac and a uimm5
+// immediate $imm. OpNode is accepted but not referenced in the body; this
+// class defines no Pattern.
+class EXT_MM_1R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $imm");
+ InstrItinClass Itinerary = itin;
+}
+
+// Accumulator-extract descriptions. Immediate forms derive from
+// EXT_MM_1R_DESC_BASE and the *v register forms from EXT_MM_2R_DESC_BASE.
+// Implicit register lists as declared here:
+//   extp / extpv     - Uses DSPPos, Defs DSPEFI.
+//   extpdp / extpdpv - Uses DSPPos, Defs DSPPos and DSPEFI.
+//   extr* / extrv*   - Defs DSPOutFlag23.
+class EXTP_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPEFI]>;
+class EXTPDP_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+class EXTPDPV_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extpdpv", MipsEXTPDP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+class EXTPV_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extpv", MipsEXTP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPEFI]>;
+class EXTR_W_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTR_R_W_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extr_r.w", MipsEXTR_R_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTR_RS_W_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTR_S_H_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extr_s.h", MipsEXTR_S_H, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_W_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extrv.w", MipsEXTR_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_R_W_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extrv_r.w", MipsEXTR_R_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_RS_W_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_S_H_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+
+// Description base for moves out of an accumulator: "<instr_asm>\t$rs, $ac".
+// The result $rs is a GPR32Opnd, the source $ac has operand class RO, and
+// the selection pattern is (set $rs, (OpNode $ac)).
+class MFHI_MM_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rs);
+ dag InOperandList = (ins RO:$ac);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+ list<dag> Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))];
+ InstrItinClass Itinerary = itin;
+}
+
+// mfhi / mflo descriptions: both read an ACC64DSPOpnd accumulator and select
+// on MipsMFHI and MipsMFLO respectively.
+class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI,
+ NoItinerary>;
+class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO,
+ NoItinerary>;
+
+// Description for raddu.w.qb: GPR32Opnd result $rt from a DSPROpnd source
+// $rs, selected from int_mips_raddu_w_qb. Unlike the neighbouring DESC
+// classes it also sets BaseOpcode explicitly.
+class RADDU_W_QB_MM_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins DSPROpnd:$rs);
+ string AsmString = !strconcat("raddu.w.qb", "\t$rt, $rs");
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_raddu_w_qb DSPROpnd:$rs))];
+ InstrItinClass Itinerary = NoItinerary;
+ string BaseOpcode = "raddu.w.qb";
+}
+
+// Description for rddsp: GPR32Opnd result $rt and a 7-bit unsigned immediate
+// $mask, selected from int_mips_rddsp.
+class RDDSP_MM_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins uimm7:$mask);
+ string AsmString = !strconcat("rddsp", "\t$rt, $mask");
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt7:$mask))];
+ InstrItinClass Itinerary = NoItinerary;
+}
+
+// Description for repl.qb: DSPROpnd result $rt and an 8-bit unsigned
+// immediate $imm, selected from int_mips_repl_qb.
+class REPL_QB_MM_DESC {
+ dag OutOperandList = (outs DSPROpnd:$rt);
+ dag InOperandList = (ins uimm8:$imm);
+ string AsmString = !strconcat("repl.qb", "\t$rt, $imm");
+ list<dag> Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))];
+ InstrItinClass Itinerary = NoItinerary;
+}
+
+// replv.ph / replv.qb descriptions: DSPROpnd result from a GPR32Opnd source,
+// reusing the int_mips_repl_ph / int_mips_repl_qb intrinsics with a register
+// operand.
+class REPLV_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>;
+class REPLV_QB_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>;
+
+// Description for wrdsp: no register result; takes a GPR32Opnd $rt and a
+// 7-bit unsigned immediate $mask, selected from int_mips_wrdsp.
+class WRDSP_MM_DESC {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPR32Opnd:$rt, uimm7:$mask);
+ string AsmString = !strconcat("wrdsp", "\t$rt, $mask");
+ list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)];
+ InstrItinClass Itinerary = NoItinerary;
+}
+
+// Description for bposge32c: a terminator branch with a single
+// brtarget1SImm16 $offset operand and no delay slot (hasDelaySlot = 0).
+// No selection Pattern is defined here.
+class BPOSGE32C_MMR3_DESC {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins brtarget1SImm16:$offset);
+ string AsmString = !strconcat("bposge32c", "\t$offset");
+ InstrItinClass Itinerary = NoItinerary;
+ bit isBranch = 1;
+ bit isTerminator = 1;
+ bit hasDelaySlot = 0;
+}
+
+// Description for balign. The result $rt is tied to the extra input $src
+// through the "$src = $rt" constraint, so $src does not appear in the asm
+// string ("balign\t$rt, $rs, $bp"). The pattern passes the operands to
+// int_mips_balign in the order ($src, $rs, $bp); $bp is a 2-bit immediate.
+class BALIGN_MMR2_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm2:$bp, GPR32Opnd:$src);
+ string AsmString = !strconcat("balign", "\t$rt, $rs, $bp");
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_balign GPR32Opnd:$src,
+ GPR32Opnd:$rs,
+ immZExt2:$bp))];
+ InstrItinClass Itinerary = NoItinerary;
+ string Constraints = "$src = $rt";
+}
+
+// bitrev: GPR32Opnd result and source, selected from int_mips_bitrev.
+class BITREV_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"bitrev", int_mips_bitrev,
+ NoItinerary, GPR32Opnd>;
+
+// bposge32: built on BPOSGE32_DESC_BASE (not defined in this file section)
+// with the brtarget_mm branch-target operand.
+class BPOSGE32_MM_DESC : BPOSGE32_DESC_BASE<"bposge32", brtarget_mm,
+ NoItinerary>;
+
+// Instruction defs.
+// microMIPS DSP Rev 1
+// Each def combines DspMMRel, an *_ENC class (bit encoding) and a *_DESC
+// class (operands, asm string, pattern). DESC classes with an _MM suffix are
+// the ones declared above; those without it (e.g. ADDQ_PH_DESC) are not
+// defined in this file section - presumably shared with the standard-encoding
+// DSP instruction definitions; confirm against MipsDSPInstrInfo.td.
+def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC;
+def ADDQ_S_PH_MM : DspMMRel, ADDQ_S_PH_MM_ENC, ADDQ_S_PH_DESC;
+def ADDQ_S_W_MM : DspMMRel, ADDQ_S_W_MM_ENC, ADDQ_S_W_DESC;
+def ADDU_QB_MM : DspMMRel, ADDU_QB_MM_ENC, ADDU_QB_DESC;
+def ADDU_S_QB_MM : DspMMRel, ADDU_S_QB_MM_ENC, ADDU_S_QB_DESC;
+def ADDSC_MM : DspMMRel, ADDSC_MM_ENC, ADDSC_DESC;
+def ADDWC_MM : DspMMRel, ADDWC_MM_ENC, ADDWC_DESC;
+def DPAQ_S_W_PH_MM : DspMMRel, DPAQ_S_W_PH_MM_ENC, DPAQ_S_W_PH_DESC;
+def DPAQ_SA_L_W_MM : DspMMRel, DPAQ_SA_L_W_MM_ENC, DPAQ_SA_L_W_DESC;
+def DPAU_H_QBL_MM : DspMMRel, DPAU_H_QBL_MM_ENC, DPAU_H_QBL_DESC;
+def DPAU_H_QBR_MM : DspMMRel, DPAU_H_QBR_MM_ENC, DPAU_H_QBR_DESC;
+def ABSQ_S_PH_MM : DspMMRel, ABSQ_S_PH_MM_ENC, ABSQ_S_PH_MM_DESC;
+def ABSQ_S_W_MM : DspMMRel, ABSQ_S_W_MM_ENC, ABSQ_S_W_MM_DESC;
+def INSV_MM : DspMMRel, INSV_MM_ENC, INSV_DESC;
+def MADD_DSP_MM : DspMMRel, MADD_DSP_MM_ENC, MADD_DSP_DESC;
+def MADDU_DSP_MM : DspMMRel, MADDU_DSP_MM_ENC, MADDU_DSP_DESC;
+def MSUB_DSP_MM : DspMMRel, MSUB_DSP_MM_ENC, MSUB_DSP_DESC;
+def MSUBU_DSP_MM : DspMMRel, MSUBU_DSP_MM_ENC, MSUBU_DSP_DESC;
+def MULT_DSP_MM : DspMMRel, MULT_DSP_MM_ENC, MULT_DSP_DESC;
+def MULTU_DSP_MM : DspMMRel, MULTU_DSP_MM_ENC, MULTU_DSP_DESC;
+def SHLL_PH_MM : DspMMRel, SHLL_PH_MM_ENC, SHLL_PH_MM_DESC;
+def SHLL_S_PH_MM : DspMMRel, SHLL_S_PH_MM_ENC, SHLL_S_PH_MM_DESC;
+def SHLL_QB_MM : DspMMRel, SHLL_QB_MM_ENC, SHLL_QB_MM_DESC;
+def SHLLV_PH_MM : DspMMRel, SHLLV_PH_MM_ENC, SHLLV_PH_MM_DESC;
+def SHLLV_S_PH_MM : DspMMRel, SHLLV_S_PH_MM_ENC, SHLLV_S_PH_MM_DESC;
+def SHLLV_QB_MM : DspMMRel, SHLLV_QB_MM_ENC, SHLLV_QB_MM_DESC;
+def SHLLV_S_W_MM : DspMMRel, SHLLV_S_W_MM_ENC, SHLLV_S_W_MM_DESC;
+def SHLL_S_W_MM : DspMMRel, SHLL_S_W_MM_ENC, SHLL_S_W_MM_DESC;
+def SHRA_PH_MM : DspMMRel, SHRA_PH_MM_ENC, SHRA_PH_MM_DESC;
+def SHRA_R_PH_MM : DspMMRel, SHRA_R_PH_MM_ENC, SHRA_R_PH_MM_DESC;
+def SHRAV_PH_MM : DspMMRel, SHRAV_PH_MM_ENC, SHRAV_PH_MM_DESC;
+def SHRAV_R_PH_MM : DspMMRel, SHRAV_R_PH_MM_ENC, SHRAV_R_PH_MM_DESC;
+def SHRAV_R_W_MM : DspMMRel, SHRAV_R_W_MM_ENC, SHRAV_R_W_MM_DESC;
+def SHRA_R_W_MM : DspMMRel, SHRA_R_W_MM_ENC, SHRA_R_W_MM_DESC;
+def SHRL_QB_MM : DspMMRel, SHRL_QB_MM_ENC, SHRL_QB_MM_DESC;
+def SHRLV_QB_MM : DspMMRel, SHRLV_QB_MM_ENC, SHRLV_QB_MM_DESC;
+def PRECEQ_W_PHL_MM : DspMMRel, PRECEQ_W_PHL_MM_ENC, PRECEQ_W_PHL_MM_DESC;
+def PRECEQ_W_PHR_MM : DspMMRel, PRECEQ_W_PHR_MM_ENC, PRECEQ_W_PHR_MM_DESC;
+def PRECEQU_PH_QBL_MM : DspMMRel, PRECEQU_PH_QBL_MM_ENC, PRECEQU_PH_QBL_MM_DESC;
+def PRECEQU_PH_QBLA_MM : DspMMRel, PRECEQU_PH_QBLA_MM_ENC,
+ PRECEQU_PH_QBLA_MM_DESC;
+def PRECEQU_PH_QBR_MM : DspMMRel, PRECEQU_PH_QBR_MM_ENC, PRECEQU_PH_QBR_MM_DESC;
+def PRECEQU_PH_QBRA_MM : DspMMRel, PRECEQU_PH_QBRA_MM_ENC,
+ PRECEQU_PH_QBRA_MM_DESC;
+def PRECEU_PH_QBL_MM : DspMMRel, PRECEU_PH_QBL_MM_ENC, PRECEU_PH_QBL_MM_DESC;
+def PRECEU_PH_QBLA_MM : DspMMRel, PRECEU_PH_QBLA_MM_ENC, PRECEU_PH_QBLA_MM_DESC;
+def PRECEU_PH_QBR_MM : DspMMRel, PRECEU_PH_QBR_MM_ENC, PRECEU_PH_QBR_MM_DESC;
+def PRECEU_PH_QBRA_MM : DspMMRel, PRECEU_PH_QBRA_MM_ENC, PRECEU_PH_QBRA_MM_DESC;
+def SUBQ_PH_MM : DspMMRel, SUBQ_PH_MM_ENC, SUBQ_PH_DESC;
+def SUBQ_S_PH_MM : DspMMRel, SUBQ_S_PH_MM_ENC, SUBQ_S_PH_DESC;
+def SUBQ_S_W_MM : DspMMRel, SUBQ_S_W_MM_ENC, SUBQ_S_W_DESC;
+def SUBU_QB_MM : DspMMRel, SUBU_QB_MM_ENC, SUBU_QB_DESC;
+def SUBU_S_QB_MM : DspMMRel, SUBU_S_QB_MM_ENC, SUBU_S_QB_DESC;
+def EXTP_MM : DspMMRel, EXTP_MM_ENC, EXTP_MM_DESC;
+def EXTPDP_MM : DspMMRel, EXTPDP_MM_ENC, EXTPDP_MM_DESC;
+def EXTPDPV_MM : DspMMRel, EXTPDPV_MM_ENC, EXTPDPV_MM_DESC;
+def EXTPV_MM : DspMMRel, EXTPV_MM_ENC, EXTPV_MM_DESC;
+def EXTR_W_MM : DspMMRel, EXTR_W_MM_ENC, EXTR_W_MM_DESC;
+def EXTR_R_W_MM : DspMMRel, EXTR_R_W_MM_ENC, EXTR_R_W_MM_DESC;
+def EXTR_RS_W_MM : DspMMRel, EXTR_RS_W_MM_ENC, EXTR_RS_W_MM_DESC;
+def EXTR_S_H_MM : DspMMRel, EXTR_S_H_MM_ENC, EXTR_S_H_MM_DESC;
+def EXTRV_W_MM : DspMMRel, EXTRV_W_MM_ENC, EXTRV_W_MM_DESC;
+def EXTRV_R_W_MM : DspMMRel, EXTRV_R_W_MM_ENC, EXTRV_R_W_MM_DESC;
+def EXTRV_RS_W_MM : DspMMRel, EXTRV_RS_W_MM_ENC, EXTRV_RS_W_MM_DESC;
+def EXTRV_S_H_MM : DspMMRel, EXTRV_S_H_MM_ENC, EXTRV_S_H_MM_DESC;
+def DPSQ_S_W_PH_MM : DspMMRel, DPSQ_S_W_PH_MM_ENC, DPSQ_S_W_PH_DESC;
+def DPSQ_SA_L_W_MM : DspMMRel, DPSQ_SA_L_W_MM_ENC, DPSQ_SA_L_W_DESC;
+def DPSU_H_QBL_MM : DspMMRel, DPSU_H_QBL_MM_ENC, DPSU_H_QBL_DESC;
+def DPSU_H_QBR_MM : DspMMRel, DPSU_H_QBR_MM_ENC, DPSU_H_QBR_DESC;
+def MULEQ_S_W_PHL_MM : DspMMRel, MULEQ_S_W_PHL_MM_ENC, MULEQ_S_W_PHL_DESC;
+def MULEQ_S_W_PHR_MM : DspMMRel, MULEQ_S_W_PHR_MM_ENC, MULEQ_S_W_PHR_DESC;
+def MULEU_S_PH_QBL_MM : DspMMRel, MULEU_S_PH_QBL_MM_ENC, MULEU_S_PH_QBL_DESC;
+def MULEU_S_PH_QBR_MM : DspMMRel, MULEU_S_PH_QBR_MM_ENC, MULEU_S_PH_QBR_DESC;
+def MULQ_RS_PH_MM : DspMMRel, MULQ_RS_PH_MM_ENC, MULQ_RS_PH_DESC;
+def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC;
+def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC;
+def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC;
+def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC;
+def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC;
+def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC;
+def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC;
+def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC;
+def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC;
+def RADDU_W_QB_MM : DspMMRel, RADDU_W_QB_MM_ENC, RADDU_W_QB_MM_DESC;
+def RDDSP_MM : DspMMRel, RDDSP_MM_ENC, RDDSP_MM_DESC;
+def REPL_PH_MM : DspMMRel, REPL_PH_MM_ENC, REPL_PH_DESC;
+def REPL_QB_MM : DspMMRel, REPL_QB_MM_ENC, REPL_QB_MM_DESC;
+def REPLV_PH_MM : DspMMRel, REPLV_PH_MM_ENC, REPLV_PH_MM_DESC;
+def REPLV_QB_MM : DspMMRel, REPLV_QB_MM_ENC, REPLV_QB_MM_DESC;
+def MTHLIP_MM : DspMMRel, MTHLIP_MM_ENC, MTHLIP_DESC;
+def PACKRL_PH_MM : DspMMRel, PACKRL_PH_MM_ENC, PACKRL_PH_DESC;
+def PICK_PH_MM : DspMMRel, PICK_PH_MM_ENC, PICK_PH_DESC;
+def PICK_QB_MM : DspMMRel, PICK_QB_MM_ENC, PICK_QB_DESC;
+def SHILO_MM : DspMMRel, SHILO_MM_ENC, SHILO_DESC;
+def SHILOV_MM : DspMMRel, SHILOV_MM_ENC, SHILOV_DESC;
+def WRDSP_MM : DspMMRel, WRDSP_MM_ENC, WRDSP_MM_DESC;
+def MODSUB_MM : DspMMRel, MODSUB_MM_ENC, MODSUB_DESC;
+def MULSAQ_S_W_PH_MM : DspMMRel, MULSAQ_S_W_PH_MM_ENC, MULSAQ_S_W_PH_DESC;
+def BITREV_MM : DspMMRel, BITREV_MM_ENC, BITREV_MM_DESC;
+// bposge32 is the only Rev 1 def here with an explicit ISA restriction.
+def BPOSGE32_MM : DspMMRel, BPOSGE32_MM_ENC, BPOSGE32_MM_DESC,
+ ISA_MIPS1_NOT_32R6_64R6;
+def CMP_EQ_PH_MM : DspMMRel, CMP_EQ_PH_MM_ENC, CMP_EQ_PH_DESC;
+def CMP_LT_PH_MM : DspMMRel, CMP_LT_PH_MM_ENC, CMP_LT_PH_DESC;
+def CMP_LE_PH_MM : DspMMRel, CMP_LE_PH_MM_ENC, CMP_LE_PH_DESC;
+def CMPGU_EQ_QB_MM : DspMMRel, CMPGU_EQ_QB_MM_ENC, CMPGU_EQ_QB_DESC;
+def CMPGU_LT_QB_MM : DspMMRel, CMPGU_LT_QB_MM_ENC, CMPGU_LT_QB_DESC;
+def CMPGU_LE_QB_MM : DspMMRel, CMPGU_LE_QB_MM_ENC, CMPGU_LE_QB_DESC;
+def CMPU_EQ_QB_MM : DspMMRel, CMPU_EQ_QB_MM_ENC, CMPU_EQ_QB_DESC;
+def CMPU_LT_QB_MM : DspMMRel, CMPU_LT_QB_MM_ENC, CMPU_LT_QB_DESC;
+def CMPU_LE_QB_MM : DspMMRel, CMPU_LE_QB_MM_ENC, CMPU_LE_QB_DESC;
+// microMIPS DSP Rev 2
+// Every def in this group carries ISA_DSPR2 in addition to DspMMRel, its
+// *_MMR2_ENC encoding and its description class.
+def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC,
+ ISA_DSPR2;
+def ADDQH_PH_MMR2 : DspMMRel, ADDQH_PH_MMR2_ENC, ADDQH_PH_DESC, ISA_DSPR2;
+def ADDQH_R_PH_MMR2 : DspMMRel, ADDQH_R_PH_MMR2_ENC, ADDQH_R_PH_DESC, ISA_DSPR2;
+def ADDQH_W_MMR2 : DspMMRel, ADDQH_W_MMR2_ENC, ADDQH_W_DESC, ISA_DSPR2;
+def ADDQH_R_W_MMR2 : DspMMRel, ADDQH_R_W_MMR2_ENC, ADDQH_R_W_DESC, ISA_DSPR2;
+def ADDU_PH_MMR2 : DspMMRel, ADDU_PH_MMR2_ENC, ADDU_PH_DESC, ISA_DSPR2;
+def ADDU_S_PH_MMR2 : DspMMRel, ADDU_S_PH_MMR2_ENC, ADDU_S_PH_DESC, ISA_DSPR2;
+def ADDUH_QB_MMR2 : DspMMRel, ADDUH_QB_MMR2_ENC, ADDUH_QB_DESC, ISA_DSPR2;
+def ADDUH_R_QB_MMR2 : DspMMRel, ADDUH_R_QB_MMR2_ENC, ADDUH_R_QB_DESC, ISA_DSPR2;
+def DPA_W_PH_MMR2 : DspMMRel, DPA_W_PH_MMR2_ENC, DPA_W_PH_DESC, ISA_DSPR2;
+def DPAQX_S_W_PH_MMR2 : DspMMRel, DPAQX_S_W_PH_MMR2_ENC, DPAQX_S_W_PH_DESC,
+ ISA_DSPR2;
+def DPAQX_SA_W_PH_MMR2 : DspMMRel, DPAQX_SA_W_PH_MMR2_ENC, DPAQX_SA_W_PH_DESC,
+ ISA_DSPR2;
+def DPAX_W_PH_MMR2 : DspMMRel, DPAX_W_PH_MMR2_ENC, DPAX_W_PH_DESC, ISA_DSPR2;
+def SHRA_QB_MMR2 : DspMMRel, SHRA_QB_MMR2_ENC, SHRA_QB_MMR2_DESC, ISA_DSPR2;
+def SHRA_R_QB_MMR2 : DspMMRel, SHRA_R_QB_MMR2_ENC, SHRA_R_QB_MMR2_DESC,
+ ISA_DSPR2;
+def SHRAV_QB_MMR2 : DspMMRel, SHRAV_QB_MMR2_ENC, SHRAV_QB_MMR2_DESC, ISA_DSPR2;
+def SHRAV_R_QB_MMR2 : DspMMRel, SHRAV_R_QB_MMR2_ENC, SHRAV_R_QB_MMR2_DESC,
+ ISA_DSPR2;
+def BALIGN_MMR2 : DspMMRel, BALIGN_MMR2_ENC, BALIGN_MMR2_DESC, ISA_DSPR2;
+def CMPGDU_EQ_QB_MMR2 : DspMMRel, CMPGDU_EQ_QB_MMR2_ENC, CMPGDU_EQ_QB_DESC,
+ ISA_DSPR2;
+def CMPGDU_LT_QB_MMR2 : DspMMRel, CMPGDU_LT_QB_MMR2_ENC, CMPGDU_LT_QB_DESC,
+ ISA_DSPR2;
+def CMPGDU_LE_QB_MMR2 : DspMMRel, CMPGDU_LE_QB_MMR2_ENC, CMPGDU_LE_QB_DESC,
+ ISA_DSPR2;
+def SHRL_PH_MMR2 : DspMMRel, SHRL_PH_MMR2_ENC, SHRL_PH_MMR2_DESC, ISA_DSPR2;
+def SHRLV_PH_MMR2 : DspMMRel, SHRLV_PH_MMR2_ENC, SHRLV_PH_MMR2_DESC, ISA_DSPR2;
+def SUBQH_PH_MMR2 : DspMMRel, SUBQH_PH_MMR2_ENC, SUBQH_PH_DESC, ISA_DSPR2;
+def SUBQH_R_PH_MMR2 : DspMMRel, SUBQH_R_PH_MMR2_ENC, SUBQH_R_PH_DESC, ISA_DSPR2;
+def SUBQH_W_MMR2 : DspMMRel, SUBQH_W_MMR2_ENC, SUBQH_W_DESC, ISA_DSPR2;
+def SUBQH_R_W_MMR2 : DspMMRel, SUBQH_R_W_MMR2_ENC, SUBQH_R_W_DESC, ISA_DSPR2;
+def SUBU_PH_MMR2 : DspMMRel, SUBU_PH_MMR2_ENC, SUBU_PH_DESC, ISA_DSPR2;
+def SUBU_S_PH_MMR2 : DspMMRel, SUBU_S_PH_MMR2_ENC, SUBU_S_PH_DESC, ISA_DSPR2;
+def SUBUH_QB_MMR2 : DspMMRel, SUBUH_QB_MMR2_ENC, SUBUH_QB_DESC, ISA_DSPR2;
+def SUBUH_R_QB_MMR2 : DspMMRel, SUBUH_R_QB_MMR2_ENC, SUBUH_R_QB_DESC, ISA_DSPR2;
+def DPS_W_PH_MMR2 : DspMMRel, DPS_W_PH_MMR2_ENC, DPS_W_PH_DESC, ISA_DSPR2;
+def DPSQX_S_W_PH_MMR2 : DspMMRel, DPSQX_S_W_PH_MMR2_ENC, DPSQX_S_W_PH_DESC,
+ ISA_DSPR2;
+def DPSQX_SA_W_PH_MMR2 : DspMMRel, DPSQX_SA_W_PH_MMR2_ENC, DPSQX_SA_W_PH_DESC,
+ ISA_DSPR2;
+def DPSX_W_PH_MMR2 : DspMMRel, DPSX_W_PH_MMR2_ENC, DPSX_W_PH_DESC, ISA_DSPR2;
+def MUL_PH_MMR2 : DspMMRel, MUL_PH_MMR2_ENC, MUL_PH_DESC, ISA_DSPR2;
+def MUL_S_PH_MMR2 : DspMMRel, MUL_S_PH_MMR2_ENC, MUL_S_PH_DESC, ISA_DSPR2;
+def MULQ_RS_W_MMR2 : DspMMRel, MULQ_RS_W_MMR2_ENC, MULQ_RS_W_DESC, ISA_DSPR2;
+def MULQ_S_PH_MMR2 : DspMMRel, MULQ_S_PH_MMR2_ENC, MULQ_S_PH_DESC, ISA_DSPR2;
+def MULQ_S_W_MMR2 : DspMMRel, MULQ_S_W_MMR2_ENC, MULQ_S_W_DESC, ISA_DSPR2;
+def PRECR_QB_PH_MMR2 : DspMMRel, PRECR_QB_PH_MMR2_ENC, PRECR_QB_PH_DESC,
+ ISA_DSPR2;
+def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC,
+ PRECR_SRA_PH_W_DESC, ISA_DSPR2;
+def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC,
+ PRECR_SRA_R_PH_W_DESC, ISA_DSPR2;
+def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2;
+
+// Instruction alias.
+// One-operand "wrdsp $rt" is accepted as WRDSP_MM with the mask fixed to 0x1F.
+def : MMDSPInstAlias<"wrdsp $rt", (WRDSP_MM GPR32Opnd:$rt, 0x1F), 1>;
+// NOTE(review): the next two defs are not aliases; they are further DSP
+// Rev 2 instructions (both carry ISA_DSPR2) that sit under the alias heading.
+def APPEND_MMR2 : DspMMRel, APPEND_MMR2_ENC, APPEND_DESC, ISA_DSPR2;
+def MULSA_W_PH_MMR2 : DspMMRel, MULSA_W_PH_MMR2_ENC, MULSA_W_PH_DESC, ISA_DSPR2;
+// microMIPS DSP Rev 3
+def BPOSGE32C_MMR3 : DspMMRel, BPOSGE32C_MMR3_ENC, BPOSGE32C_MMR3_DESC,
+ ISA_DSPR3;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td
new file mode 100644
index 000000000000..fc83761e409b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -0,0 +1,180 @@
+// microMIPS floating-point instruction records. Everything inside this block
+// is marked isCodeGenOnly and is predicated on InMicroMips. Each record pairs
+// a *_FT description class (mnemonic, operand classes, itinerary, optional
+// SelectionDAG node) with a *_FM_MM microMIPS encoding class.
+let isCodeGenOnly = 1, Predicates = [InMicroMips] in {
+// add/div/mul/sub on FGR32Opnd operands (".s" mnemonics).
+def FADD_S_MM : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
+ ADDS_FM_MM<0, 0x30>;
+def FDIV_S_MM : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
+ ADDS_FM_MM<0, 0xf0>;
+def FMUL_S_MM : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
+ ADDS_FM_MM<0, 0xb0>;
+def FSUB_S_MM : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
+ ADDS_FM_MM<0, 0x70>;
+
+// The same four operations on AFGR64Opnd operands (".d" mnemonics); only the
+// fmt argument of ADDS_FM_MM differs (1 instead of 0).
+def FADD_MM : MMRel, ADDS_FT<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>,
+ ADDS_FM_MM<1, 0x30>;
+def FDIV_MM : MMRel, ADDS_FT<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>,
+ ADDS_FM_MM<1, 0xf0>;
+def FMUL_MM : MMRel, ADDS_FT<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>,
+ ADDS_FM_MM<1, 0xb0>;
+def FSUB_MM : MMRel, ADDS_FT<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>,
+ ADDS_FM_MM<1, 0x70>;
+
+// Register-indexed FP loads/stores. The lwxc1/swxc1 records pass a load/store
+// node to the description class; the luxc1/suxc1 records pass none.
+def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
+ LWXC1_FM_MM<0x48>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
+ SWXC1_FM_MM<0x88>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>,
+ LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2_NOT_32R6_64R6;
+def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>,
+ SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2_NOT_32R6_64R6;
+
+// FP compares; these take register classes (FGR32/AFGR64) rather than the
+// *Opnd operand classes used by the other records in this block.
+def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>,
+ CEQS_FM_MM<0>;
+def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>,
+ CEQS_FM_MM<1>;
+
+// bc1f/bc1t branches, restricted by ISA_MIPS1_NOT_32R6_64R6.
+def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, II_BC1F, MIPS_BRANCH_F>,
+ BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>,
+ BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6;
+// cvt.w.s / round.w.s: FGR32Opnd source and destination.
+def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ROUND_W_FM_MM<0, 0x24>;
+def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+ ROUND_W_FM_MM<0, 0xec>;
+
+// *.w.d conversions: AFGR64Opnd source, FGR32Opnd destination.
+def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
+ ROUND_W_FM_MM<1, 0x6c>;
+def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x24>;
+def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
+ ROUND_W_FM_MM<1, 0x2c>;
+def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>,
+ ROUND_W_FM_MM<1, 0xec>;
+def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
+ ROUND_W_FM_MM<1, 0xac>;
+
+def FSQRT_MM : MMRel, ABSS_FT<"sqrt.d", AFGR64Opnd, AFGR64Opnd, II_SQRT_D,
+ fsqrt>, ROUND_W_FM_MM<1, 0x28>;
+
+// cvt.l.* records write an FGR64Opnd destination and carry INSN_MIPS3_32R2.
+def CVT_L_S_MM : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ROUND_W_FM_MM<0, 0x4>, INSN_MIPS3_32R2;
+def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x4>, INSN_MIPS3_32R2;
+
+// abs/mov/neg and cvt.{d,s}.* records sharing the ABS_FM_MM encoding.
+def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
+ ABS_FM_MM<0, 0xd>;
+def FMOV_S_MM : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
+ ABS_FM_MM<0, 0x1>;
+def FNEG_S_MM : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
+ ABS_FM_MM<0, 0x2d>;
+def CVT_D_S_MM : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x4d>;
+def CVT_D32_W_MM : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x4d>;
+def CVT_S_D32_MM : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x6d>;
+def CVT_S_W_MM : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x6d>;
+
+def FABS_MM : MMRel, ABSS_FT<"abs.d", AFGR64Opnd, AFGR64Opnd, II_ABS, fabs>,
+ ABS_FM_MM<1, 0xd>;
+def FNEG_MM : MMRel, ABSS_FT<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>,
+ ABS_FM_MM<1, 0x2d>;
+
+def FMOV_D32_MM : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
+ ABS_FM_MM<1, 0x1>, FGR_32;
+
+// movz/movn FP moves with a GPR32Opnd operand (CMov_I_F_FM_MM encoding).
+def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd,
+ II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>;
+def MOVN_I_S_MM : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd,
+ II_MOVN_S>, CMov_I_F_FM_MM<0x38, 0>;
+def MOVZ_I_D32_MM : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVZ_D>, CMov_I_F_FM_MM<0x78, 1>;
+def MOVN_I_D32_MM : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVN_D>, CMov_I_F_FM_MM<0x38, 1>;
+
+// movt/movf FP moves (CMov_F_F_FM_MM encoding), followed by mfc1/mtc1.
+def MOVT_S_MM : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S,
+ MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 0>;
+def MOVF_S_MM : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S,
+ MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 0>;
+def MOVT_D32_MM : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+ MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 1>;
+def MOVF_D32_MM : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+ MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 1>;
+def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
+ II_MFC1, bitconvert>, MFC1_FM_MM<0x80>;
+def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
+ II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>;
+
+// madd/msub/nmadd/nmsub on FGR32Opnd; the last template argument selects the
+// fadd or fsub node used by the description class.
+def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+ MADDS_FM_MM<0x1>;
+def MSUB_S_MM : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+ MADDS_FM_MM<0x21>;
+def NMADD_S_MM : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
+ MADDS_FM_MM<0x2>;
+def NMSUB_S_MM : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
+ MADDS_FM_MM<0x22>;
+
+// The same four operations on AFGR64Opnd.
+def MADD_D32_MM : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+ MADDS_FM_MM<0x9>;
+def MSUB_D32_MM : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+ MADDS_FM_MM<0x29>;
+def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
+ MADDS_FM_MM<0xa>;
+def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
+ MADDS_FM_MM<0x2a>;
+}
+
+// Further microMIPS FPU records. Unlike the preceding block these are not
+// marked isCodeGenOnly, and InMicroMips is supplied through
+// AdditionalPredicates rather than Predicates.
+let AdditionalPredicates = [InMicroMips] in {
+ def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd,
+ II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>;
+ def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd,
+ FGR32Opnd, II_TRUNC>, ROUND_W_FM_MM<0, 0xac>;
+ def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+ ROUND_W_FM_MM<0, 0x6c>;
+ def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
+ fsqrt>, ROUND_W_FM_MM<0, 0x28>;
+ // mthc1/mfhc1 additionally carry ISA_MIPS32R2 and FGR_32.
+ def MTHC1_MM : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM_MM<0xe0>, ISA_MIPS32R2, FGR_32;
+ def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+ MFC1_FM_MM<0xc0>, ISA_MIPS32R2, FGR_32;
+ // Records placed in the "MicroMips" decoder namespace.
+ let DecoderNamespace = "MicroMips" in {
+ def CFC1_MM : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>,
+ MFC1_FM_MM<0x40>;
+ def CTC1_MM : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>,
+ MFC1_FM_MM<0x60>;
+ def RECIP_S_MM : MMRel, ABSS_FT<"recip.s", FGR32Opnd, FGR32Opnd,
+ II_RECIP_S>,
+ ROUND_W_FM_MM<0b0, 0b01001000>;
+ def RECIP_D_MM : MMRel, ABSS_FT<"recip.d", AFGR64Opnd, AFGR64Opnd,
+ II_RECIP_D>, ROUND_W_FM_MM<0b1, 0b01001000>;
+ // NOTE(review): the two rsqrt records below use the II_RECIP_* itineraries.
+ // If dedicated rsqrt itineraries exist in the scheduling model, these
+ // probably should use them -- confirm against MipsSchedule.td.
+ def RSQRT_S_MM : MMRel, ABSS_FT<"rsqrt.s", FGR32Opnd, FGR32Opnd,
+ II_RECIP_S>,
+ ROUND_W_FM_MM<0b0, 0b00001000>;
+ def RSQRT_D_MM : MMRel, ABSS_FT<"rsqrt.d", AFGR64Opnd, AFGR64Opnd,
+ II_RECIP_D>, ROUND_W_FM_MM<0b1, 0b00001000>;
+ }
+ // FP loads/stores with a mem_mm_16 memory operand; decoded by the custom
+ // DecodeFMemMMR2 method.
+ let DecoderNamespace = "MicroMips", DecoderMethod = "DecodeFMemMMR2" in {
+ def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, mem_mm_16, II_LDC1, load>,
+ LW_FM_MM<0x2f>, FGR_32 {
+ // Overrides the BaseOpcode field for this record only.
+ let BaseOpcode = "LDC132";
+ }
+ def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, mem_mm_16, II_SDC1, store>,
+ LW_FM_MM<0x2e>, FGR_32;
+ def LWC1_MM : MMRel, LW_FT<"lwc1", FGR32Opnd, mem_mm_16, II_LWC1, load>,
+ LW_FM_MM<0x27>;
+ def SWC1_MM : MMRel, SW_FT<"swc1", FGR32Opnd, mem_mm_16, II_SWC1, store>,
+ LW_FM_MM<0x26>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Patterns
+//===----------------------------------------------------------------------===//
+// Selection patterns that map f64/f32 loads and stores onto the microMIPS
+// ldc1/sdc1/lwc1/swc1 records. Active only under InMicroMips; the f64
+// patterns additionally carry FGR_32.
+let AdditionalPredicates = [InMicroMips] in {
+ // Patterns for loads/stores with a reg+imm operand.
+ // AddedComplexity = 40 raises the priority of these patterns during
+ // pattern matching.
+ let AddedComplexity = 40 in {
+ def : LoadRegImmPat<LDC1_MM, f64, load>, FGR_32;
+ def : StoreRegImmPat<SDC1_MM, f64>, FGR_32;
+ def : LoadRegImmPat<LWC1_MM, f32, load>;
+ def : StoreRegImmPat<SWC1_MM, f32>;
+ }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
new file mode 100644
index 000000000000..8b595f9e6c4c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -0,0 +1,1049 @@
+//===----------------------------------------------------------------------===//
+// MicroMIPS Base Classes
+//===----------------------------------------------------------------------===//
+
+//
+// Base class for MicroMips instructions.
+// This class does not depend on the instruction size.
+//
+// It forwards its template arguments into the corresponding Instruction
+// fields (operand lists, assembly string, selection pattern, itinerary),
+// places the record in the "Mips" namespace and the "MicroMips" decoder
+// namespace, predicates it on InMicroMips, and records the format in Form.
+//
+class MicroMipsInstBase<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format f> : Instruction
+{
+ let Namespace = "Mips";
+ let DecoderNamespace = "MicroMips";
+
+ let OutOperandList = outs;
+ let InOperandList = ins;
+
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Itinerary = itin;
+
+ let Predicates = [InMicroMips];
+
+ Format Form = f;
+}
+
+//
+// Base class for MicroMIPS 16-bit instructions.
+//
+// Sets Size to 2 (bytes) and declares 16-bit Inst and SoftFail fields plus a
+// 6-bit Opcode field that defaults to 0.
+//
+class MicroMipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format f> :
+ MicroMipsInstBase<outs, ins, asmstr, pattern, itin, f>
+{
+ let Size = 2;
+ field bits<16> Inst;
+ field bits<16> SoftFail = 0;
+ bits<6> Opcode = 0x0;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMIPS 16-bit Instruction Formats
+//===----------------------------------------------------------------------===//
+
+// 16-bit encoding: fixed 0x01 in bits 15-10, three 3-bit register fields
+// rd (9-7), rt (6-4), rs (3-1), and the `funct` bit in bit 0.
+class ARITH_FM_MM16<bit funct> {
+ bits<3> rd;
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x01;
+ let Inst{9-7} = rd;
+ let Inst{6-4} = rt;
+ let Inst{3-1} = rs;
+ let Inst{0} = funct;
+}
+
+// 16-bit encoding: `funct` in bits 15-10, 3-bit rd (9-7) and rs (6-4), and a
+// 4-bit immediate field in bits 3-0.
+class ANDI_FM_MM16<bits<6> funct> {
+ bits<3> rd;
+ bits<3> rs;
+ bits<4> imm;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = funct;
+ let Inst{9-7} = rd;
+ let Inst{6-4} = rs;
+ let Inst{3-0} = imm;
+}
+
+// 16-bit encoding: fixed 0x11 in bits 15-10, 4-bit `funct` in bits 9-6, and
+// 3-bit rt (5-3) and rs (2-0).
+class LOGIC_FM_MM16<bits<4> funct> {
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-6} = funct;
+ let Inst{5-3} = rt;
+ let Inst{2-0} = rs;
+}
+
+// 16-bit encoding: fixed 0x09 in bits 15-10, 3-bit rd (9-7) and rt (6-4), a
+// 3-bit shamt field (3-1), and the 1-bit `funct` in bit 0.
+class SHIFT_FM_MM16<bits<1> funct> {
+ bits<3> rd;
+ bits<3> rt;
+ bits<3> shamt;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x09;
+ let Inst{9-7} = rd;
+ let Inst{6-4} = rt;
+ let Inst{3-1} = shamt;
+ let Inst{0} = funct;
+}
+
+// 16-bit encoding: fixed 0x1b in bits 15-10, 3-bit rd (9-7) and rs (6-4), a
+// 3-bit imm field (3-1), and bit 0 fixed to 0. ADDIUR1SP_FM_MM16 uses the
+// same 0x1b value with bit 0 set to 1.
+class ADDIUR2_FM_MM16 {
+ bits<3> rd;
+ bits<3> rs;
+ bits<3> imm;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x1b;
+ let Inst{9-7} = rd;
+ let Inst{6-4} = rs;
+ let Inst{3-1} = imm;
+ let Inst{0} = 0;
+}
+
+// 16-bit encoding: `op` in bits 15-10, 3-bit rt (9-7), and the 7-bit addr
+// operand copied into bits 6-0 (written as two slices, 6-4 and 3-0).
+class LOAD_STORE_FM_MM16<bits<6> op> {
+ bits<3> rt;
+ bits<7> addr;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = op;
+ let Inst{9-7} = rt;
+ let Inst{6-4} = addr{6-4};
+ let Inst{3-0} = addr{3-0};
+}
+
+// 16-bit encoding: `op` in bits 15-10, a full 5-bit rt (9-5), and a 5-bit
+// offset field (4-0). No base register field is encoded.
+class LOAD_STORE_SP_FM_MM16<bits<6> op> {
+ bits<5> rt;
+ bits<5> offset;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = op;
+ let Inst{9-5} = rt;
+ let Inst{4-0} = offset;
+}
+
+// 16-bit encoding: `op` in bits 15-10, 3-bit rt (9-7), and a 7-bit offset
+// field (6-0). No base register field is encoded.
+class LOAD_GP_FM_MM16<bits<6> op> {
+ bits<3> rt;
+ bits<7> offset;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = op;
+ let Inst{9-7} = rt;
+ let Inst{6-0} = offset;
+}
+
+// 16-bit encoding: fixed 0x13 in bits 15-10, 5-bit rd (9-5), 4-bit imm (4-1),
+// and bit 0 fixed to 0. ADDIUSP_FM_MM16 uses the same 0x13 value with bit 0
+// set to 1.
+class ADDIUS5_FM_MM16 {
+ bits<5> rd;
+ bits<4> imm;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x13;
+ let Inst{9-5} = rd;
+ let Inst{4-1} = imm;
+ let Inst{0} = 0;
+}
+
+// 16-bit encoding: fixed 0x13 in bits 15-10, a 9-bit imm field (9-1), and
+// bit 0 fixed to 1. No register field is encoded.
+class ADDIUSP_FM_MM16 {
+ bits<9> imm;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x13;
+ let Inst{9-1} = imm;
+ let Inst{0} = 1;
+}
+
+// 16-bit encoding: `funct` in bits 15-10 and two full 5-bit register fields,
+// rd (9-5) and rs (4-0).
+class MOVE_FM_MM16<bits<6> funct> {
+ bits<5> rs;
+ bits<5> rd;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = funct;
+ let Inst{9-5} = rd;
+ let Inst{4-0} = rs;
+}
+
+// 16-bit encoding: fixed 0x3b in bits 15-10, 3-bit rd (9-7), and a 7-bit imm
+// field (6-0).
+class LI_FM_MM16 {
+ bits<3> rd;
+ bits<7> imm;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x3b;
+ let Inst{9-7} = rd;
+ let Inst{6-0} = imm;
+}
+
+// 16-bit encoding: fixed 0x11 in bits 15-10, 5-bit `op` selector (9-5), and
+// a 5-bit rs field (4-0).
+class JALR_FM_MM16<bits<5> op> {
+ bits<5> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-5} = op;
+ let Inst{4-0} = rs;
+}
+
+// 16-bit encoding: fixed 0x11 in bits 15-10, 5-bit `funct` selector (9-5),
+// and a 5-bit rd field (4-0).
+class MFHILO_FM_MM16<bits<5> funct> {
+ bits<5> rd;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-5} = funct;
+ let Inst{4-0} = rd;
+}
+
+// 16-bit encoding: fixed 0x11 in bits 15-10, 5-bit `op` selector (9-5), and
+// a 5-bit imm field (4-0).
+// NOTE(review): `rs` is declared but never placed into Inst; it looks unused
+// by this format -- confirm against the records that inherit from it before
+// removing.
+class JRADDIUSP_FM_MM16<bits<5> op> {
+ bits<5> rs;
+ bits<5> imm;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-5} = op;
+ let Inst{4-0} = imm;
+}
+
+// 16-bit encoding: fixed 0x1b in bits 15-10, 3-bit rd (9-7), 6-bit imm (6-1),
+// and bit 0 fixed to 1. ADDIUR2_FM_MM16 uses the same 0x1b value with bit 0
+// set to 0.
+class ADDIUR1SP_FM_MM16 {
+ bits<3> rd;
+ bits<6> imm;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x1b;
+ let Inst{9-7} = rd;
+ let Inst{6-1} = imm;
+ let Inst{0} = 1;
+}
+
+// 16-bit encoding: fixed 0x11 in bits 15-10, 6-bit `op` selector (9-4), and
+// a 4-bit code_ field (3-0).
+class BRKSDBBP16_FM_MM<bits<6> op> {
+ bits<4> code_;
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-4} = op;
+ let Inst{3-0} = code_;
+}
+
+// 16-bit encoding: `op` in bits 15-10, 3-bit rs (9-7), and a 7-bit offset
+// field (6-0).
+class BEQNEZ_FM_MM16<bits<6> op> {
+ bits<3> rs;
+ bits<7> offset;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = op;
+ let Inst{9-7} = rs;
+ let Inst{6-0} = offset;
+}
+
+// 16-bit encoding: fixed 0x33 in bits 15-10 and a 10-bit offset field (9-0).
+class B16_FM {
+ bits<10> offset;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x33;
+ let Inst{9-0} = offset;
+}
+
+// 16-bit encoding: fixed 0x21 in bits 15-10, a 3-bit dst_regs field (9-7),
+// 3-bit rt (6-4) and rs (3-1), and bit 0 fixed to 0.
+class MOVEP_FM_MM16 {
+ bits<3> dst_regs;
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x21;
+ let Inst{9-7} = dst_regs;
+ let Inst{6-4} = rt;
+ let Inst{3-1} = rs;
+ let Inst{0} = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMIPS 32-bit Instruction Formats
+//===----------------------------------------------------------------------===//
+
+// Mixin that sets the string field Arch to "micromips" on every record that
+// inherits from it. Presumably consumed as a column value by the
+// instruction-mapping tables -- confirm against the InstrMapping definitions.
+class MMArch {
+ string Arch = "micromips";
+}
+
+// 32-bit encoding: `op` in bits 31-26, rt (25-21), rs (20-16), rd (15-11),
+// bit 10 fixed to 0, and a 10-bit `funct` in bits 9-0.
+class ADD_FM_MM<bits<6> op, bits<10> funct> : MMArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0;
+ let Inst{9-0} = funct;
+}
+
+// 32-bit encoding: `op` in bits 31-26, rt (25-21), rs (20-16), and a 16-bit
+// immediate in bits 15-0.
+class ADDI_FM_MM<bits<6> op> : MMArch {
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm16;
+}
+
+// 32-bit encoding: `op` in bits 31-26, rt (25-21), rs (20-16), and a 16-bit
+// immediate in bits 15-0 (the same bit layout as ADDI_FM_MM).
+class SLTI_FM_MM<bits<6> op> : MMArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm16;
+}
+
+// 32-bit encoding: fixed 0x10 in bits 31-26 and fixed 0xd in bits 25-21,
+// rt in bits 20-16, and a 16-bit immediate in bits 15-0.
+class LUI_FM_MM : MMArch {
+ bits<5> rt;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = 0xd;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm16;
+}
+
+// 32-bit encoding: zero in bits 31-26, rt (25-21), rs (20-16), a 10-bit
+// `funct` in bits 15-6, and fixed 0x3c in bits 5-0.
+class MULT_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, rd (25-21), rt (20-16), a 5-bit shamt
+// in bits 15-11, the `rotate` bit in bit 10, and a 10-bit `funct` in 9-0.
+class SRA_FM_MM<bits<10> funct, bit rotate> : MMArch {
+ bits<5> rd;
+ bits<5> rt;
+ bits<5> shamt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = shamt;
+ let Inst{10} = rotate;
+ let Inst{9-0} = funct;
+}
+
+// 32-bit encoding: zero in bits 31-26, rt (25-21), rs (20-16), rd (15-11),
+// the `rotate` bit in bit 10, and a 10-bit `funct` in bits 9-0.
+class SRLV_FM_MM<bits<10> funct, bit rotate> : MMArch {
+ bits<5> rd;
+ bits<5> rt;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10} = rotate;
+ let Inst{9-0} = funct;
+}
+
+// 32-bit encoding: `op` in bits 31-26, rt (25-21), then the 21-bit addr
+// operand split into a 5-bit base (addr{20-16}, bits 20-16) and a 16-bit
+// offset (addr{15-0}, bits 15-0).
+class LW_FM_MM<bits<6> op> : MMArch {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit encoding: `op` in bits 31-26, rt (25-21), base (addr{20-16}) in
+// bits 20-16, `fmt` in 15-12, `funct` in 11-9, and a 9-bit offset (addr{8-0})
+// in bits 8-0. Only the low 9 bits of addr are used for the offset.
+class POOL32C_LHUE_FM_MM<bits<6> op, bits<4> fmt, bits<3> funct> : MMArch {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = fmt;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+// 32-bit encoding: fixed 0x18 in bits 31-26, rt (25-21), addr{20-16} in bits
+// 20-16, 4-bit `funct` in 15-12, and addr{11-0} in bits 11-0. This class does
+// not inherit from MMArch.
+class LWL_FM_MM<bits<4> funct> {
+ bits<5> rt;
+ bits<21> addr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x18;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-12} = funct;
+ let Inst{11-0} = addr{11-0};
+}
+
+// 32-bit encoding: fixed 0x18 in bits 31-26, rt (25-21), base (addr{20-16})
+// in bits 20-16, `type` in 15-12, `funct` in 11-9, and a 9-bit offset
+// (addr{8-0}) in bits 8-0. This class does not inherit from MMArch.
+class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x18;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = type;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, rd (25-21), rs (20-16), a 3-bit
+// fcc field in bits 15-13, 7-bit `func` in 12-6, and fixed 0x3b in bits 5-0.
+class CMov_F_I_FM_MM<bits<7> func> : MMArch {
+ bits<5> rd;
+ bits<5> rs;
+ bits<3> fcc;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rs;
+ let Inst{15-13} = fcc;
+ let Inst{12-6} = func;
+ let Inst{5-0} = 0x3b;
+}
+
+// 32-bit encoding: zero in bits 31-21, rs in bits 20-16, a 10-bit `funct` in
+// bits 15-6, and fixed 0x3c in bits 5-0.
+class MTLO_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = 0x00;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-21, rd in bits 20-16, a 10-bit `funct` in
+// bits 15-6, and fixed 0x3c in bits 5-0.
+class MFLO_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = 0x00;
+ let Inst{20-16} = rd;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, rd (25-21), rs (20-16), a 10-bit
+// `funct` in bits 15-6, and fixed 0x3c in bits 5-0.
+class CLO_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rd;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, rd (25-21), rt (20-16), a 10-bit
+// `funct` in bits 15-6, and fixed 0x3c in bits 5-0.
+class SEB_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rd;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, rt (25-21), rs (20-16), the 5-bit
+// size operand in bits 15-11, the 5-bit pos operand in bits 10-6, and a 6-bit
+// `funct` in bits 5-0. Note that size is encoded above pos.
+class EXT_FM_MM<bits<6> funct> : MMArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> pos;
+ bits<5> size;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = size;
+ let Inst{10-6} = pos;
+ let Inst{5-0} = funct;
+}
+
+// 32-bit encoding: `op` in bits 31-26 and a 26-bit target field in 25-0.
+class J_FM_MM<bits<6> op> : MMArch {
+ bits<26> target;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-0} = target;
+}
+
+// 32-bit encoding: zero in bits 31-21, rs in bits 20-16, zero in bits 15-14,
+// an 8-bit `funct` in bits 13-6, and fixed 0x3c in bits 5-0.
+class JR_FM_MM<bits<8> funct> : MMArch {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-21} = 0x00;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0x0;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, rd (25-21), rs (20-16), a 10-bit
+// `funct` in bits 15-6, and fixed 0x3c in bits 5-0. This class does not
+// inherit from MMArch.
+class JALR_FM_MM<bits<10> funct> {
+ bits<5> rs;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: `op` in bits 31-26, rt (25-21), rs (20-16), and a 16-bit
+// offset in bits 15-0.
+class BEQ_FM_MM<bits<6> op> : MMArch {
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit encoding: fixed 0x10 in bits 31-26, 5-bit `funct` selector in bits
+// 25-21, rs (20-16), and a 16-bit offset in bits 15-0.
+class BGEZ_FM_MM<bits<5> funct> : MMArch {
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit encoding: fixed 0x10 in bits 31-26, 5-bit `funct` selector in bits
+// 25-21, rs (20-16), and a 16-bit offset in bits 15-0 (the same bit layout as
+// BGEZ_FM_MM).
+class BGEZAL_FM_MM<bits<5> funct> : MMArch {
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit encoding: zero in bits 31-21, a 5-bit stype field in bits 20-16,
+// fixed 0x1ad in bits 15-6, and fixed 0x3c in bits 5-0.
+class SYNC_FM_MM : MMArch {
+ bits<5> stype;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = 0x0;
+ let Inst{20-16} = stype;
+ let Inst{15-6} = 0x1ad;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: fixed 0b010000 in bits 31-26 and fixed 0b10000 in bits
+// 25-21, rs in bits 20-16, and a 16-bit offset in bits 15-0.
+class SYNCI_FM_MM : MMArch {
+ bits<5> rs;
+ bits<16> offset;
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010000;
+ let Inst{25-21} = 0b10000;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit encoding: zero in bits 31-26, two 10-bit code fields (code_1 in
+// bits 25-16, code_2 in bits 15-6), and fixed 0x07 in bits 5-0.
+class BRK_FM_MM : MMArch {
+ bits<10> code_1;
+ bits<10> code_2;
+ bits<32> Inst;
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = code_1;
+ let Inst{15-6} = code_2;
+ let Inst{5-0} = 0x07;
+}
+
+// 32-bit encoding: zero in bits 31-26, a 10-bit code_ field in bits 25-16,
+// fixed 0x22d in bits 15-6, and fixed 0x3c in bits 5-0.
+class SYS_FM_MM : MMArch {
+ bits<10> code_;
+ bits<32> Inst;
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = code_;
+ let Inst{15-6} = 0x22d;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, a 10-bit code_ field in bits 25-16,
+// fixed 0x24d in bits 15-6, and fixed 0x3c in bits 5-0. This class does not
+// inherit from MMArch.
+class WAIT_FM_MM {
+ bits<10> code_;
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-16} = code_;
+ let Inst{15-6} = 0x24d;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding with no operand fields: zero in bits 31-16, a 10-bit
+// `funct` in bits 15-6, and fixed 0x3c in bits 5-0.
+class ER_FM_MM<bits<10> funct> : MMArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-16} = 0x00;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-21, rt in bits 20-16, a 10-bit `funct` in
+// bits 15-6, and fixed 0x3c in bits 5-0.
+class EI_FM_MM<bits<10> funct> : MMArch {
+ bits<32> Inst;
+ bits<5> rt;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = 0x00;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, rt (25-21), rs (20-16), a 4-bit code_
+// field in bits 15-12, 6-bit `funct` in 11-6, and fixed 0x3c in bits 5-0.
+class TEQ_FM_MM<bits<6> funct> : MMArch {
+ bits<5> rs;
+ bits<5> rt;
+ bits<4> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-12} = code_;
+ let Inst{11-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: fixed 0x10 in bits 31-26, 5-bit `funct` selector in bits
+// 25-21, rs (20-16), and a 16-bit immediate in bits 15-0.
+class TEQI_FM_MM<bits<5> funct> : MMArch {
+ bits<5> rs;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm16;
+}
+
+// 32-bit encoding: fixed 0x18 in bits 31-26, rt (25-21), addr{20-16} in bits
+// 20-16, 4-bit `funct` in 15-12, and addr{11-0} in bits 11-0.
+class LL_FM_MM<bits<4> funct> : MMArch {
+ bits<5> rt;
+ bits<21> addr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x18;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-12} = funct;
+ let Inst{11-0} = addr{11-0};
+}
+
+// 32-bit encoding: fixed 0x18 in bits 31-26, rt (25-21), base (addr{20-16})
+// in bits 20-16, 4-bit `funct` in 15-12, fixed 0x6 in bits 11-9, and a 9-bit
+// offset (addr{8-0}) in bits 8-0. This class does not inherit from MMArch.
+class LLE_FM_MM<bits<4> funct> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x18;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = funct;
+ let Inst{11-9} = 0x6;
+ let Inst{8-0} = offset;
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, ft (25-21), fs (20-16),
+// fd (15-11), bit 10 fixed to 0, 2-bit `fmt` in bits 9-8, and an 8-bit
+// `funct` in bits 7-0. Also declares an empty Pattern list.
+class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10} = 0;
+ let Inst{9-8} = fmt;
+ let Inst{7-0} = funct;
+
+ list<dag> Pattern = [];
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, index (25-21), base (20-16),
+// fd (15-11), zero in bits 10-9, and a 9-bit `funct` in bits 8-0.
+class LWXC1_FM_MM<bits<9> funct> : MMArch {
+ bits<5> fd;
+ bits<5> base;
+ bits<5> index;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = index;
+ let Inst{20-16} = base;
+ let Inst{15-11} = fd;
+ let Inst{10-9} = 0x0;
+ let Inst{8-0} = funct;
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, index (25-21), base (20-16),
+// fs (15-11), zero in bits 10-9, and a 9-bit `funct` in bits 8-0. Same layout
+// as LWXC1_FM_MM with fs in place of fd.
+class SWXC1_FM_MM<bits<9> funct> : MMArch {
+ bits<5> fs;
+ bits<5> base;
+ bits<5> index;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = index;
+ let Inst{20-16} = base;
+ let Inst{15-11} = fs;
+ let Inst{10-9} = 0x0;
+ let Inst{8-0} = funct;
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, ft (25-21), fs (20-16), the cc
+// field (bits 15-13) hard-wired to 0, bit 12 fixed to 0, 2-bit `fmt` in bits
+// 11-10, a 4-bit cond operand in bits 9-6, and fixed 0x3c in bits 5-0.
+class CEQS_FM_MM<bits<2> fmt> : MMArch {
+ bits<5> fs;
+ bits<5> ft;
+ bits<4> cond;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-13} = 0x0; // cc
+ let Inst{12} = 0;
+ let Inst{11-10} = fmt;
+ let Inst{9-6} = cond;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: fixed 0x10 in bits 31-26, 5-bit `tf` selector in bits
+// 25-21, the cc field (bits 20-18) hard-wired to 0, zero in bits 17-16, and a
+// 16-bit offset in bits 15-0.
+class BC1F_FM_MM<bits<5> tf> : MMArch {
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = tf;
+ let Inst{20-18} = 0x0; // cc
+ let Inst{17-16} = 0x0;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, fd (25-21), fs (20-16), bit 15
+// fixed to 0, 1-bit `fmt` in bit 14, an 8-bit `funct` in bits 13-6, and fixed
+// 0x3b in bits 5-0.
+class ROUND_W_FM_MM<bits<1> fmt, bits<8> funct> : MMArch {
+ bits<5> fd;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = fd;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14} = fmt;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0x3b;
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, fd (25-21), fs (20-16), bit 15
+// fixed to 0, 2-bit `fmt` in bits 14-13, a 7-bit `funct` in bits 12-6, and
+// fixed 0x3b in bits 5-0.
+class ABS_FM_MM<bits<2> fmt, bits<7> funct> : MMArch {
+ bits<5> fd;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = fd;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14-13} = fmt;
+ let Inst{12-6} = funct;
+ let Inst{5-0} = 0x3b;
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, fd (25-21), fs (20-16), the cc
+// field (bits 15-13) hard-wired to 0, zero in bits 12-11, 2-bit `fmt` in bits
+// 10-9, and a 9-bit `func` in bits 8-0.
+class CMov_F_F_FM_MM<bits<9> func, bits<2> fmt> : MMArch {
+ bits<5> fd;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = fd;
+ let Inst{20-16} = fs;
+ let Inst{15-13} = 0x0; //cc
+ let Inst{12-11} = 0x0;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = func;
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, rt (25-21), fs (20-16),
+// fd (15-11), bit 10 fixed to 0, 2-bit `fmt` in bits 9-8, and an 8-bit
+// `funct` in bits 7-0.
+class CMov_I_F_FM_MM<bits<8> funct, bits<2> fmt> : MMArch {
+ bits<5> fd;
+ bits<5> fs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ // Bit 10 is pinned to 0, matching ADDS_FM_MM, which shares this
+ // register/fmt/funct layout. Leaving it unassigned would make it a
+ // don't-care bit for the generated decoder.
+ let Inst{10} = 0;
+ let Inst{9-8} = fmt;
+ let Inst{7-0} = funct;
+}
+
+// 32-bit encoding: fixed 0x15 in bits 31-26, rt (25-21), fs (20-16), zero in
+// bits 15-14, an 8-bit `funct` in bits 13-6, and fixed 0x3b in bits 5-0.
+class MFC1_FM_MM<bits<8> funct> : MMArch {
+ bits<5> rt;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = fs;
+ let Inst{15-14} = 0x0;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0x3b;
+}
+
+// 32-bit encoding with four 5-bit FP register fields: fixed 0x15 in bits
+// 31-26, ft (25-21), fs (20-16), fd (15-11), fr (10-6), and a 6-bit `funct`
+// in bits 5-0.
+class MADDS_FM_MM<bits<6> funct>: MMArch {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+ bits<5> fr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-6} = fr;
+ let Inst{5-0} = funct;
+}
+
+// 32-bit encoding: fixed 0x10 in bits 31-26, 5-bit `funct` selector in bits
+// 25-21, rs (20-16), and a 16-bit offset in bits 15-0. This class does not
+// inherit from MMArch.
+class COMPACT_BRANCH_FM_MM<bits<5> funct> {
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+// 32-bit encoding with no operand fields: zero in bits 31-16, a 10-bit `op`
+// in bits 15-6, and fixed 0x3c in bits 5-0.
+class COP0_TLB_FM_MM<bits<10> op> : MMArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = 0x0;
+ let Inst{15-6} = op;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, a 10-bit code_ field in bits 25-16,
+// fixed 0x36d in bits 15-6, and fixed 0x3c in bits 5-0.
+class SDBBP_FM_MM : MMArch {
+ bits<10> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = code_;
+ let Inst{15-6} = 0x36d;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, rt (25-21), rd (20-16), fixed 0x1ac
+// in bits 15-6, and fixed 0x3c in bits 5-0.
+class RDHWR_FM_MM : MMArch {
+ bits<5> rt;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rd;
+ let Inst{15-6} = 0x1ac;
+ let Inst{5-0} = 0x3c;
+}
+
+// 32-bit encoding: zero in bits 31-26, index (25-21), base (20-16),
+// rd (15-11), bit 10 fixed to 0, and a 10-bit `funct` in bits 9-0. This class
+// does not inherit from MMArch.
+class LWXS_FM_MM<bits<10> funct> {
+ bits<5> rd;
+ bits<5> base;
+ bits<5> index;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-21} = index;
+ let Inst{20-16} = base;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0;
+ let Inst{9-0} = funct;
+}
+
+// 32-bit encoding: fixed 0x8 in bits 31-26, rt (25-21), addr{20-16} in bits
+// 20-16, 4-bit `funct` in 15-12, and addr{11-0} in bits 11-0.
+class LWM_FM_MM<bits<4> funct> : MMArch {
+ bits<5> rt;
+ bits<21> addr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x8;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-12} = funct;
+ let Inst{11-0} = addr{11-0};
+}
+
+// 16-bit encoding: fixed 0x11 in bits 15-10, 4-bit `funct` in bits 9-6, a
+// 2-bit rt field in bits 5-4, and a 4-bit addr field in bits 3-0. Inherits
+// PredicateControl in addition to MMArch.
+class LWM_FM_MM16<bits<4> funct> : MMArch, PredicateControl {
+ bits<2> rt;
+ bits<4> addr;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-6} = funct;
+ let Inst{5-4} = rt;
+ let Inst{3-0} = addr;
+}
+
+// 32-bit encoding: `op` in bits 31-26, a 5-bit hint (25-21), base
+// (addr{20-16}) in bits 20-16, 4-bit `funct` in 15-12, and a 12-bit offset
+// (addr{11-0}) in bits 11-0.
+class CACHE_PREF_FM_MM<bits<6> op, bits<4> funct> : MMArch {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<12> offset = addr{11-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = hint;
+ let Inst{20-16} = base;
+ let Inst{15-12} = funct;
+ let Inst{11-0} = offset;
+}
+
+// 32-bit encoding: `op` in bits 31-26, a 5-bit hint (25-21), base
+// (addr{20-16}) in bits 20-16, fixed 0xA in bits 15-12, 3-bit `funct` in
+// 11-9, and a 9-bit offset (addr{8-0}) in bits 8-0.
+class CACHE_PREFE_FM_MM<bits<6> op, bits<3> funct> : MMArch {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = hint;
+ let Inst{20-16} = base;
+ let Inst{15-12} = 0xA;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+// 32-bit encoding: `op` in bits 31-26, index (25-21), base (20-16), a 5-bit
+// hint in bits 15-11, zero in bits 10-9, and a 9-bit `funct` in bits 8-0.
+class POOL32F_PREFX_FM_MM<bits<6> op, bits<9> funct> : MMArch {
+ bits<5> index;
+ bits<5> base;
+ bits<5> hint;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = index;
+ let Inst{20-16} = base;
+ let Inst{15-11} = hint;
+ let Inst{10-9} = 0x0;
+ let Inst{8-0} = funct;
+}
+
+// 32-bit encoding with no operand fields: every bit is zero except the 5-bit
+// `op` placed in bits 15-11.
+class BARRIER_FM_MM<bits<5> op> : MMArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-21} = 0x0;
+ let Inst{20-16} = 0x0;
+ let Inst{15-11} = op;
+ let Inst{10-6} = 0x0;
+ let Inst{5-0} = 0x0;
+}
+
+// 32-bit encoding: fixed 0x1e in bits 31-26, a 3-bit rs field in bits 25-23,
+// and a 23-bit imm field in bits 22-0. This class does not inherit from
+// MMArch.
+class ADDIUPC_FM_MM {
+ bits<3> rs;
+ bits<23> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1e;
+ let Inst{25-23} = rs;
+ let Inst{22-0} = imm;
+}
+
+// 32-bit encoding: zero in bits 31-26, rt (25-21), a 5-bit impl field in bits
+// 20-16, a 10-bit `funct` in bits 15-6, and fixed 0b111100 in bits 5-0.
+class POOL32A_CFTC2_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rt;
+ bits<5> impl;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = impl;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
new file mode 100644
index 000000000000..c0de9e7390a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -0,0 +1,1190 @@
+def addrimm11 : ComplexPattern<iPTR, 2, "selectIntAddr11MM", [frameindex]>; // address ComplexPatterns; each string names the selector function it relies on
+def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddr12MM", [frameindex]>;
+def addrimm16 : ComplexPattern<iPTR, 2, "selectIntAddr16MM", [frameindex]>;
+def addrimm4lsl2 : ComplexPattern<iPTR, 2, "selectIntAddrLSL2MM", [frameindex]>;
+
+def simm9_addiusp : Operand<i32> { // i32 immediate operand with custom encoder/decoder hooks (named by the strings below)
+ let EncoderMethod = "getSImm9AddiuspValue";
+ let DecoderMethod = "DecodeSimm9SP";
+}
+
+def uimm3_shift : Operand<i32> { // i32 immediate operand used as the shift amount of the 16-bit shifts (see ShiftIMM16 users)
+ let EncoderMethod = "getUImm3Mod8Encoding";
+ let DecoderMethod = "DecodePOOL16BEncodedField";
+}
+
+def simm3_lsa2 : Operand<i32> { // i32 immediate operand used by AddImmUR2
+ let EncoderMethod = "getSImm3Lsa2Value";
+ let DecoderMethod = "DecodeAddiur2Simm7";
+}
+
+def uimm4_andi : Operand<i32> { // i32 immediate operand used by AndImmMM16
+ let EncoderMethod = "getUImm4AndValue";
+ let DecoderMethod = "DecodeANDI16Imm";
+}
+
+def immSExtAddiur2 : ImmLeaf<i32, [{return Imm == 1 || Imm == -1 ||
+ ((Imm % 4 == 0) &&
+ Imm < 28 && Imm > 0);}]>; // accepts 1, -1, or a multiple of 4 strictly between 0 and 28
+
+def immSExtAddius5 : ImmLeaf<i32, [{return Imm >= -8 && Imm <= 7;}]>; // accepts -8..7 inclusive
+
+def immZExtAndi16 : ImmLeaf<i32,
+ [{return (Imm == 128 || (Imm >= 1 && Imm <= 4) || Imm == 7 || Imm == 8 ||
+ Imm == 15 || Imm == 16 || Imm == 31 || Imm == 32 || Imm == 63 ||
+ Imm == 64 || Imm == 255 || Imm == 32768 || Imm == 65535 );}]>; // accepts only this fixed list of mask values
+
+def immZExt2Shift : ImmLeaf<i32, [{return Imm >= 1 && Imm <= 8;}]>; // accepts 1..8 inclusive
+
+def immLi16 : ImmLeaf<i32, [{return Imm >= -1 && Imm <= 126;}]>; // accepts -1..126 inclusive
+
+def MicroMipsMemGPRMM16AsmOperand : AsmOperandClass { // asm-parser operand class; the strings name the render/parse/predicate hooks
+ let Name = "MicroMipsMem";
+ let RenderMethod = "addMicroMipsMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithGRPMM16Base"; // NOTE(review): spelled "GRP", not "GPR"; must match the parser's method name, so do not "fix" it here alone
+}
+
+// Define the classes of pointers used by microMIPS.
+// The numbers must match those in MipsRegisterInfo::MipsPtrClass.
+def ptr_gpr16mm_rc : PointerLikeRegClass<1>; // kind 1; used as the base of mem_mm_4_generic
+def ptr_sp_rc : PointerLikeRegClass<2>; // kind 2; used as the base of the SP-relative memory operands
+def ptr_gp_rc : PointerLikeRegClass<3>; // kind 3; used as the base of mem_mm_gp_simm7_lsl2
+
+class mem_mm_4_generic : Operand<i32> { // shared base for the 4-bit-offset memory operands; subclasses differ only in EncoderMethod
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_gpr16mm_rc, simm4); // base register class plus simm4 offset
+ let OperandType = "OPERAND_MEMORY";
+ let ParserMatchClass = MicroMipsMemGPRMM16AsmOperand;
+}
+
+def mem_mm_4 : mem_mm_4_generic { // used by LBU16_MM / SB16_MM
+ let EncoderMethod = "getMemEncodingMMImm4";
+}
+
+def mem_mm_4_lsl1 : mem_mm_4_generic { // used by LHU16_MM / SH16_MM
+ let EncoderMethod = "getMemEncodingMMImm4Lsl1";
+}
+
+def mem_mm_4_lsl2 : mem_mm_4_generic { // used by LW16_MM / SW16_MM
+ let EncoderMethod = "getMemEncodingMMImm4Lsl2";
+}
+
+def MicroMipsMemSPAsmOperand : AsmOperandClass { // asm-parser class matched by mem_mm_sp_imm5_lsl2
+ let Name = "MicroMipsMemSP";
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithUimmWordAlignedOffsetSP<7>"; // templated predicate instantiated with 7
+}
+
+def MicroMipsMemGPAsmOperand : AsmOperandClass { // asm-parser class matched by mem_mm_gp_simm7_lsl2
+ let Name = "MicroMipsMemGP";
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmWordAlignedOffsetGP<9>"; // templated predicate instantiated with 9
+}
+
+def mem_mm_sp_imm5_lsl2 : Operand<i32> { // memory operand with a ptr_sp_rc base; used by LWSP_MM / SWSP_MM
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_sp_rc:$base, simm5:$offset);
+ let OperandType = "OPERAND_MEMORY";
+ let ParserMatchClass = MicroMipsMemSPAsmOperand;
+ let EncoderMethod = "getMemEncodingMMSPImm5Lsl2";
+}
+
+def mem_mm_gp_simm7_lsl2 : Operand<i32> { // memory operand with a ptr_gp_rc base; used by LWGP_MM
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_gp_rc:$base, simm7_lsl2:$offset);
+ let OperandType = "OPERAND_MEMORY";
+ let ParserMatchClass = MicroMipsMemGPAsmOperand;
+ let EncoderMethod = "getMemEncodingMMGPImm7Lsl2";
+}
+
+def mem_mm_9 : Operand<i32> { // memory operand: ptr_rc base + simm9 offset
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_rc, simm9);
+ let EncoderMethod = "getMemEncodingMMImm9";
+ let ParserMatchClass = MipsMemSimm9AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+def mem_mm_11 : Operand<i32> { // memory operand: GPR32 base + simm11 offset (the only one here using GPR32 rather than ptr_rc)
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops GPR32, simm11);
+ let EncoderMethod = "getMemEncodingMMImm11";
+ let ParserMatchClass = MipsMemSimm11AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+def mem_mm_12 : Operand<i32> { // memory operand: ptr_rc base + simm12 offset
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_rc, simm12);
+ let EncoderMethod = "getMemEncodingMMImm12";
+ let ParserMatchClass = MipsMemAsmOperand; // generic memory parser class, not a simm12-specific one
+ let OperandType = "OPERAND_MEMORY";
+}
+
+def mem_mm_16 : Operand<i32> { // memory operand: ptr_rc base + simm16 offset
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_rc, simm16);
+ let EncoderMethod = "getMemEncodingMMImm16";
+ let ParserMatchClass = MipsMemSimm16AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+def MipsMemUimm4AsmOperand : AsmOperandClass { // asm-parser class matched by mem_mm_4sp; specializes MipsMemAsmOperand
+ let Name = "MemOffsetUimm4";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithUimmOffsetSP<6>"; // NOTE(review): instantiated with 6 although the class is named Uimm4 - presumably 4 bits scaled by 4; confirm
+}
+
+def mem_mm_4sp : Operand<i32> { // memory operand with a ptr_sp_rc base; used by the 16-bit load/store-multiple classes
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_sp_rc, uimm8);
+ let EncoderMethod = "getMemEncodingMMImm4sp";
+ let ParserMatchClass = MipsMemUimm4AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+def jmptarget_mm : Operand<OtherVT> { // jump target operand (OtherVT); shares its encoder with calltarget_mm
+ let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def calltarget_mm : Operand<iPTR> { // call target operand (iPTR); same encoder as jmptarget_mm
+ let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def brtarget7_mm : Operand<OtherVT> { // PC-relative branch target; used by BEQZ16_MM / BNEZ16_MM
+ let EncoderMethod = "getBranchTarget7OpValueMM";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget7MM";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget10_mm : Operand<OtherVT> { // PC-relative branch target; used by UncondBranchMM16
+ let EncoderMethod = "getBranchTargetOpValueMMPC10";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget10MM";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget_mm : Operand<OtherVT> { // PC-relative branch target; used by the 32-bit microMIPS branches
+ let EncoderMethod = "getBranchTargetOpValueMM";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTargetMM";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def simm23_lsl2 : Operand<i32> { // i32 immediate operand with custom encoder/decoder; used by AddImmUPC
+ let EncoderMethod = "getSimm23Lsl2Encoding";
+ let DecoderMethod = "DecodeSimm23Lsl2";
+}
+
+class CompactBranchMM<string opstr, DAGOperand opnd, PatFrag cond_op, // branch on one register with no delay slot; cond_op is accepted but not used in the body
+ RegisterOperand RO> :
+ InstSE<(outs), (ins RO:$rs, opnd:$offset),
+ !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZC, FrmI> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 0;
+ let Defs = [AT]; // AT is declared as clobbered
+}
+
+let canFoldAsLoad = 1 in // sets canFoldAsLoad on the class that follows
+class LoadLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO, // load that also takes the previous register value ($src) as an input
+ Operand MemOpnd, InstrItinClass Itin> :
+ InstSE<(outs RO:$rt), (ins MemOpnd:$addr, RO:$src),
+ !strconcat(opstr, "\t$rt, $addr"),
+ [(set RO:$rt, (OpNode addrimm12:$addr, RO:$src))],
+ Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let Constraints = "$src = $rt"; // ties input $src to result $rt; uses the same `let` override form as the other classes in this file
+}
+
+class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO, // store counterpart of LoadLeftRightMM: no outputs, pattern is (OpNode $rt, addr)
+ Operand MemOpnd, InstrItinClass Itin>:
+ InstSE<(outs), (ins RO:$rt, MemOpnd:$addr),
+ !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RO:$rt, addrimm12:$addr)], Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm12";
+}
+
+/// A register pair used by movep instruction.
+def MovePRegPairAsmOperand : AsmOperandClass { // asm-parser class matched by movep_regpair
+ let Name = "MovePRegPair";
+ let ParserMethod = "parseMovePRegPair";
+ let PredicateMethod = "isMovePRegPair";
+}
+
+def movep_regpair : Operand<i32> { // two-register operand (two ptr_rc sub-operands); used as the output of MovePMM16
+ let EncoderMethod = "getMovePRegPairOpValue";
+ let ParserMatchClass = MovePRegPairAsmOperand;
+ let PrintMethod = "printRegisterList"; // printed with the register-list printer, unlike regpair which uses printRegisterPair
+ let DecoderMethod = "DecodeMovePRegPair";
+ let MIOperandInfo = (ops ptr_rc, ptr_rc);
+}
+
+class MovePMM16<string opstr, RegisterOperand RO> : // 16-bit instruction writing a movep_regpair from two RO inputs; no selection pattern
+MicroMipsInst16<(outs movep_regpair:$dst_regs), (ins RO:$rs, RO:$rt),
+ !strconcat(opstr, "\t$dst_regs, $rs, $rt"), [],
+ NoItinerary, FrmR> {
+ let isReMaterializable = 1;
+}
+
+/// A register pair used by load/store pair instructions.
+def RegPairAsmOperand : AsmOperandClass { // asm-parser class matched by regpair
+ let Name = "RegPair";
+ let ParserMethod = "parseRegisterPair";
+ let PredicateMethod = "isRegPair";
+}
+
+def regpair : Operand<i32> { // two-register operand (two ptr_rc sub-operands); used by StorePairMM / LoadPairMM
+ let EncoderMethod = "getRegisterPairOpValue";
+ let ParserMatchClass = RegPairAsmOperand;
+ let PrintMethod = "printRegisterPair";
+ let DecoderMethod = "DecodeRegPairOperand";
+ let MIOperandInfo = (ops ptr_rc, ptr_rc);
+}
+
+class StorePairMM<string opstr, ComplexPattern Addr = addr> // stores a regpair to a mem_simm12 address; Addr is accepted but not used in the body
+ : InstSE<(outs), (ins regpair:$rt, mem_simm12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], II_SWP, FrmI, opstr> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let mayStore = 1; // set explicitly because the pattern list is empty
+}
+
+class LoadPairMM<string opstr, ComplexPattern Addr = addr> // loads a regpair from a mem_simm12 address; Addr is accepted but not used in the body
+ : InstSE<(outs regpair:$rt), (ins mem_simm12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], II_LWP, FrmI, opstr> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let mayLoad = 1; // set explicitly because the pattern list is empty
+}
+
+class LLBaseMM<string opstr, RegisterOperand RO> : // load into $rt from a mem_mm_12 address (II_LL itinerary); no selection pattern
+ InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], II_LL, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let mayLoad = 1;
+}
+
+class LLEBaseMM<string opstr, RegisterOperand RO> : // same shape as LLBaseMM but with a mem_simm9 address and the Imm9 decoder
+ InstSE<(outs RO:$rt), (ins mem_simm9:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], II_LLE, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm9";
+ let mayLoad = 1;
+}
+
+class SCBaseMM<string opstr, RegisterOperand RO> : // store of $rt to a mem_mm_12 address that also produces a result in the same register
+ InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], II_SC, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let mayStore = 1;
+ let Constraints = "$rt = $dst"; // result is tied to the stored register
+}
+
+class SCEBaseMM<string opstr, RegisterOperand RO> : // same shape as SCBaseMM but with a mem_simm9 address and the Imm9 decoder
+ InstSE<(outs RO:$dst), (ins RO:$rt, mem_simm9:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], II_SCE, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm9";
+ let mayStore = 1;
+ let Constraints = "$rt = $dst"; // result is tied to the stored register
+}
+
+class LoadMM<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, // generic load with an addrimm12 pattern; OpNode defaults to null_frag
+ InstrItinClass Itin = NoItinerary, DAGOperand MO = mem_mm_12> :
+ InstSE<(outs RO:$rt), (ins MO:$addr),
+ !strconcat(opstr, "\t$rt, $addr"),
+ [(set RO:$rt, (OpNode addrimm12:$addr))], Itin, FrmI, opstr> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let canFoldAsLoad = 1;
+ let mayLoad = 1;
+}
+
+class ArithRMM16<string opstr, RegisterOperand RO, bit isComm = 0, // 16-bit three-register arithmetic: $rd = OpNode($rs, $rt)
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator OpNode = null_frag> :
+ MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+ !strconcat(opstr, "\t$rd, $rs, $rt"),
+ [(set RO:$rd, (OpNode RO:$rs, RO:$rt))], Itin, FrmR> {
+ let isCommutable = isComm; // commutativity is supplied per instruction by the caller
+}
+
+class AndImmMM16<string opstr, RegisterOperand RO, // 16-bit register/immediate form using the uimm4_andi operand; no selection pattern
+ InstrItinClass Itin = NoItinerary> :
+ MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, uimm4_andi:$imm),
+ !strconcat(opstr, "\t$rd, $rs, $imm"), [], Itin, FrmI>;
+
+class LogicRMM16<string opstr, RegisterOperand RO, // 16-bit two-address logic op: result $dst is tied to input $rt
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator OpNode = null_frag> :
+ MicroMipsInst16<(outs RO:$dst), (ins RO:$rs, RO:$rt),
+ !strconcat(opstr, "\t$rt, $rs"),
+ [(set RO:$dst, (OpNode RO:$rs, RO:$rt))], Itin, FrmR> {
+ let isCommutable = 1;
+ let Constraints = "$rt = $dst"; // hence only $rt and $rs appear in the asm string
+}
+
+class NotMM16<string opstr, RegisterOperand RO> : // 16-bit bitwise not: $rt = not $rs
+ MicroMipsInst16<(outs RO:$rt), (ins RO:$rs),
+ !strconcat(opstr, "\t$rt, $rs"),
+ [(set RO:$rt, (not RO:$rs))], II_NOT, FrmR>;
+
+class ShiftIMM16<string opstr, Operand ImmOpnd, RegisterOperand RO, // 16-bit shift by immediate; the immediate operand type is a parameter; no selection pattern
+ InstrItinClass Itin = NoItinerary> :
+ MicroMipsInst16<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
+ !strconcat(opstr, "\t$rd, $rt, $shamt"), [], Itin, FrmR>;
+
+class LoadMM16<string opstr, DAGOperand RO, SDPatternOperator OpNode, // 16-bit load; OpNode is accepted but the pattern list is empty
+ InstrItinClass Itin, Operand MemOpnd> :
+ MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm4";
+ let canFoldAsLoad = 1;
+ let mayLoad = 1;
+}
+
+class StoreMM16<string opstr, DAGOperand RTOpnd, DAGOperand RO, // 16-bit store; RO and OpNode are accepted but unused - only RTOpnd appears in the operand list
+ SDPatternOperator OpNode, InstrItinClass Itin,
+ Operand MemOpnd> :
+ MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm4";
+ let mayStore = 1;
+}
+
+class LoadSPMM16<string opstr, DAGOperand RO, InstrItinClass Itin, // 16-bit load whose memory operand is named $offset; uses the SP Imm5Lsl2 decoder
+ Operand MemOpnd> :
+ MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$offset),
+ !strconcat(opstr, "\t$rt, $offset"), [], Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
+ let canFoldAsLoad = 1;
+ let mayLoad = 1;
+}
+
+class StoreSPMM16<string opstr, DAGOperand RO, InstrItinClass Itin, // store counterpart of LoadSPMM16; same decoder
+ Operand MemOpnd> :
+ MicroMipsInst16<(outs), (ins RO:$rt, MemOpnd:$offset),
+ !strconcat(opstr, "\t$rt, $offset"), [], Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
+ let mayStore = 1;
+}
+
+class LoadGPMM16<string opstr, DAGOperand RO, InstrItinClass Itin, // 16-bit load with the GP Imm7Lsl2 decoder; otherwise same shape as LoadSPMM16
+ Operand MemOpnd> :
+ MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$offset),
+ !strconcat(opstr, "\t$rt, $offset"), [], Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMGPImm7Lsl2";
+ let canFoldAsLoad = 1;
+ let mayLoad = 1;
+}
+
+class AddImmUR2<string opstr, RegisterOperand RO> : // 16-bit add-immediate: $rd from $rs and a simm3_lsa2 immediate; no selection pattern
+ MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, simm3_lsa2:$imm),
+ !strconcat(opstr, "\t$rd, $rs, $imm"),
+ [], II_ADDIU, FrmR> {
+ let isCommutable = 1; // NOTE(review): marked commutable although the inputs are a register and an immediate - confirm intended
+}
+
+class AddImmUS5<string opstr, RegisterOperand RO> : // 16-bit two-address add-immediate: $dst is tied to $rd, immediate is simm4
+ MicroMipsInst16<(outs RO:$dst), (ins RO:$rd, simm4:$imm),
+ !strconcat(opstr, "\t$rd, $imm"), [], II_ADDIU, FrmR> {
+ let Constraints = "$rd = $dst";
+}
+
+class AddImmUR1SP<string opstr, RegisterOperand RO> : // 16-bit form with one register output and a uimm6_lsl2 immediate; no register input is listed
+ MicroMipsInst16<(outs RO:$rd), (ins uimm6_lsl2:$imm),
+ !strconcat(opstr, "\t$rd, $imm"), [], II_ADDIU, FrmR>;
+
+class AddImmUSP<string opstr> : // 16-bit form with only a simm9_addiusp immediate; no register operands are listed
+ MicroMipsInst16<(outs), (ins simm9_addiusp:$imm),
+ !strconcat(opstr, "\t$imm"), [], II_ADDIU, FrmI>;
+
+class MoveFromHILOMM<string opstr, RegisterOperand RO, Register UseReg> : // 16-bit move into $rd with no explicit inputs; the source is the implicit use UseReg
+ MicroMipsInst16<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"),
+ [], II_MFHI_MFLO, FrmR> {
+ let Uses = [UseReg];
+ let hasSideEffects = 0;
+}
+
+class MoveMM16<string opstr, RegisterOperand RO> // 16-bit register-to-register move; no selection pattern
+ : MicroMipsInst16<(outs RO:$rd), (ins RO:$rs),
+ !strconcat(opstr, "\t$rd, $rs"), [], II_MOVE, FrmR> {
+ let isReMaterializable = 1;
+}
+
+class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> : // 16-bit load-immediate; the immediate operand type Od is a parameter
+ MicroMipsInst16<(outs RO:$rd), (ins Od:$imm),
+ !strconcat(opstr, "\t$rd, $imm"), [], II_LI, FrmI> {
+ let isReMaterializable = 1;
+}
+
+// 16-bit Jump and Link (Call)
+class JumpLinkRegMM16<string opstr, RegisterOperand RO> : // the only 16-bit call class here with a selection pattern (MipsJmpLink)
+ MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+ [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, PredicateControl {
+ let isCall = 1;
+ let hasDelaySlot = 1;
+ let Defs = [RA]; // RA is declared as written by the call
+}
+
+// 16-bit Jump Reg
+class JumpRegMM16<string opstr, RegisterOperand RO> : // indirect branch through $rs with a delay slot; no selection pattern
+ MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+ [], II_JR, FrmR> {
+ let hasDelaySlot = 1;
+ let isBranch = 1;
+ let isIndirectBranch = 1; // NOTE(review): isTerminator/isBarrier are not set here, unlike JumpRegCMM16 and JumpRAddiuStackMM16 - confirm intended
+}
+
+// Base class for JRADDIUSP instruction.
+class JumpRAddiuStackMM16 : // fixed mnemonic "jraddiusp" taking a single uimm5_lsl2 immediate
+ MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jraddiusp\t$imm",
+ [], II_JRADDIUSP, FrmR> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isBranch = 1;
+ let isIndirectBranch = 1; // no hasDelaySlot is set in this class
+}
+
+// 16-bit Jump and Link (Call) - Short Delay Slot
+class JumpLinkRegSMM16<string opstr, RegisterOperand RO> : // same flags as JumpLinkRegMM16 but with an empty pattern and the II_JALRS itinerary
+ MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+ [], II_JALRS, FrmR> {
+ let isCall = 1;
+ let hasDelaySlot = 1;
+ let Defs = [RA];
+}
+
+// 16-bit Jump Register Compact - No delay slot
+class JumpRegCMM16<string opstr, RegisterOperand RO> : // indirect branch through $rs; hasDelaySlot is not set in this class
+ MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+ [], II_JRC, FrmR> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+}
+
+// Break16 and Sdbbp16
+class BrkSdbbp16MM<string opstr, InstrItinClass Itin> : // 16-bit instruction taking only a uimm4 code operand; shared by BREAK16_MM and SDBBP16_MM
+ MicroMipsInst16<(outs), (ins uimm4:$code_),
+ !strconcat(opstr, "\t$code_"),
+ [], Itin, FrmOther>;
+
+class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> : // 16-bit branch on one register, with a delay slot; no selection pattern
+ MicroMipsInst16<(outs), (ins RO:$rs, opnd:$offset),
+ !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZ, FrmI> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 1;
+ let Defs = [AT]; // AT is declared as clobbered
+}
+
+// MicroMIPS Jump and Link (Call) - Short Delay Slot
+let isCall = 1, hasDelaySlot = 1, Defs = [RA] in { // all three classes below are calls with a delay slot that write RA
+ class JumpLinkMM<string opstr, DAGOperand opnd> : // direct call to $target
+ InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+ [], II_JALS, FrmJ, opstr> {
+ let DecoderMethod = "DecodeJumpTargetMM";
+ }
+
+ class JumpLinkRegMM<string opstr, RegisterOperand RO>: // register call with an explicit $rd output in addition to $rs
+ InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+ [], II_JALRS, FrmR>;
+
+ class BranchCompareToZeroLinkMM<string opstr, DAGOperand opnd, // register + offset form; no branch flags are set in this class itself
+ RegisterOperand RO> :
+ InstSE<(outs), (ins RO:$rs, opnd:$offset),
+ !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZALS, FrmI, opstr>;
+}
+
+class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO, // base+index load printed as "index(base)"; OpNode is accepted but the pattern list is empty
+ SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs RO:$rd), (ins PtrRC:$base, PtrRC:$index),
+ !strconcat(opstr, "\t$rd, ${index}(${base})"), [], II_LWXS, FrmFI>; // NOTE(review): mayLoad is not set and there is no pattern to infer it from - confirm
+
+class PrefetchIndexed<string opstr> : // base+index form with a uimm5 hint and no outputs
+ InstSE<(outs), (ins PtrRC:$base, PtrRC:$index, uimm5:$hint),
+ !strconcat(opstr, "\t$hint, ${index}(${base})"), [], II_PREF, FrmOther>;
+
+class AddImmUPC<string opstr, RegisterOperand RO> : // writes $rs from a simm23_lsl2 immediate; no register input is listed
+ InstSE<(outs RO:$rs), (ins simm23_lsl2:$imm),
+ !strconcat(opstr, "\t$rs, $imm"), [], II_ADDIU, FrmR>;
+
+/// A list of registers used by load/store multiple instructions.
+def RegListAsmOperand : AsmOperandClass { // asm-parser class matched by reglist; only Name and ParserMethod are set
+ let Name = "RegList";
+ let ParserMethod = "parseRegisterList";
+}
+
+def reglist : Operand<i32> { // register-list operand used by the 32-bit load/store-multiple classes
+ let EncoderMethod = "getRegisterListOpValue";
+ let ParserMatchClass = RegListAsmOperand;
+ let PrintMethod = "printRegisterList";
+ let DecoderMethod = "DecodeRegListOperand";
+}
+
+def RegList16AsmOperand : AsmOperandClass { // asm-parser class matched by reglist16; same parser as RegList plus its own predicate and renderer
+ let Name = "RegList16";
+ let ParserMethod = "parseRegisterList";
+ let PredicateMethod = "isRegList16";
+ let RenderMethod = "addRegListOperands";
+}
+
+def reglist16 : Operand<i32> { // register-list operand used by the 16-bit load/store-multiple classes
+ let EncoderMethod = "getRegisterListOpValue16";
+ let DecoderMethod = "DecodeRegListOperand16";
+ let PrintMethod = "printRegisterList";
+ let ParserMatchClass = RegList16AsmOperand;
+}
+
+class StoreMultMM<string opstr, // stores a reglist to a mem_mm_12 address; Addr is accepted but not used in the body
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+ InstSE<(outs), (ins reglist:$rt, mem_mm_12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let mayStore = 1;
+}
+
+class LoadMultMM<string opstr, // loads a reglist from a mem_mm_12 address; Addr is accepted but not used in the body
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+ InstSE<(outs reglist:$rt), (ins mem_mm_12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let mayLoad = 1;
+}
+
+class StoreMultMM16<string opstr, // 16-bit variant: reglist16 stored to a mem_mm_4sp address; Addr is accepted but unused
+ InstrItinClass Itin = NoItinerary,
+ ComplexPattern Addr = addr> :
+ MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+ let mayStore = 1;
+}
+
+class LoadMultMM16<string opstr, // 16-bit variant: reglist16 loaded from a mem_mm_4sp address; Addr is accepted but unused
+ InstrItinClass Itin = NoItinerary,
+ ComplexPattern Addr = addr> :
+ MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+ let mayLoad = 1;
+}
+
+class UncondBranchMM16<string opstr> : // 16-bit unconditional branch to a brtarget10_mm offset, with a delay slot
+ MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
+ !strconcat(opstr, "\t$offset"),
+ [], II_B, FrmI> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let hasDelaySlot = 1;
+ let Predicates = [RelocPIC, InMicroMips]; // restricted to these two predicates
+ let Defs = [AT];
+}
+
+def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>, // 16-bit arithmetic/logic/shift instructions; all in this run carry ISA_MICROMIPS_NOT_32R6_64R6
+ ARITH_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6;
+def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
+ LOGIC_FM_MM16<0x2>, ISA_MICROMIPS_NOT_32R6_64R6;
+def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
+def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
+def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
+def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
+ SHIFT_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6;
+def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
+ SHIFT_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6;
+
+def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, // isComm = 0, unlike ADDU16_MM
+ ARITH_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6;
+def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
+ LOGIC_FM_MM16<0x1>, ISA_MICROMIPS_NOT_32R6_64R6;
+def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU, // 16-bit loads/stores; byte, halfword and word forms use mem_mm_4, _lsl1 and _lsl2 respectively
+ mem_mm_4>, LOAD_STORE_FM_MM16<0x02>;
+def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU,
+ mem_mm_4_lsl1>, LOAD_STORE_FM_MM16<0x0a>;
+def LW16_MM : LoadMM16<"lw16", GPRMM16Opnd, load, II_LW, mem_mm_4_lsl2>,
+ LOAD_STORE_FM_MM16<0x1a>;
+def SB16_MM : StoreMM16<"sb16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei8, // stores take GPRMM16OpndZero as the stored-register operand class
+ II_SB, mem_mm_4>, LOAD_STORE_FM_MM16<0x22>;
+def SH16_MM : StoreMM16<"sh16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei16,
+ II_SH, mem_mm_4_lsl1>,
+ LOAD_STORE_FM_MM16<0x2a>;
+def SW16_MM : StoreMM16<"sw16", GPRMM16OpndZero, GPRMM16Opnd, store, II_SW,
+ mem_mm_4_lsl2>, LOAD_STORE_FM_MM16<0x3a>;
+def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_simm7_lsl2>, // plain "lw"/"sw" mnemonics below; the memory operand class selects the form
+ LOAD_GP_FM_MM16<0x19>;
+def LWSP_MM : LoadSPMM16<"lw", GPR32Opnd, II_LW, mem_mm_sp_imm5_lsl2>,
+ LOAD_STORE_SP_FM_MM16<0x12>;
+def SWSP_MM : StoreSPMM16<"sw", GPR32Opnd, II_SW, mem_mm_sp_imm5_lsl2>,
+ LOAD_STORE_SP_FM_MM16<0x32>;
+def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16; // 16-bit add-immediate, move and load-immediate instructions
+def ADDIUR2_MM : AddImmUR2<"addiur2", GPRMM16Opnd>, ADDIUR2_FM_MM16;
+def ADDIUS5_MM : AddImmUS5<"addius5", GPR32Opnd>, ADDIUS5_FM_MM16;
+def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16;
+def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>; // both mfhi and mflo pass AC0 as the implicit use
+def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
+def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
+def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16;
+def LI16_MM : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>, LI_FM_MM16,
+ IsAsCheapAsAMove;
+def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>, // 16-bit jumps, branches and traps; the JALR_FM_MM16 users differ only in the funct argument
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>;
+def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>;
+def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>; // the only def in this run without the _MM suffix
+def JR16_MM : JumpRegMM16<"jr16", GPR32Opnd>, JALR_FM_MM16<0x0c>;
+def BEQZ16_MM : CBranchZeroMM<"beqz16", brtarget7_mm, GPRMM16Opnd>,
+ BEQNEZ_FM_MM16<0x23>;
+def BNEZ16_MM : CBranchZeroMM<"bnez16", brtarget7_mm, GPRMM16Opnd>,
+ BEQNEZ_FM_MM16<0x2b>;
+def B16_MM : UncondBranchMM16<"b16">, B16_FM;
+def BREAK16_MM : BrkSdbbp16MM<"break16", II_BREAK>, BRKSDBBP16_FM_MM<0x28>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
+def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, BRKSDBBP16_FM_MM<0x2C>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
+
+let DecoderNamespace = "MicroMips" in { // the four defs below are placed in the "MicroMips" decoder namespace
+ /// Load and Store Instructions - multiple
+ def SWM16_MM : StoreMultMM16<"swm16", II_SWM>, LWM_FM_MM16<0x5>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def LWM16_MM : LoadMultMM16<"lwm16", II_LWM>, LWM_FM_MM16<0x4>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ let AdditionalPredicates = [InMicroMips] in {
+ def CFC2_MM : InstSE<(outs GPR32Opnd:$rt), (ins COP2Opnd:$impl), // GPR result read from a COP2 operand
+ "cfc2\t$rt, $impl", [], II_CFC2, FrmFR, "cfc2">,
+ POOL32A_CFTC2_FM_MM<0b1100110100>;
+ def CTC2_MM : InstSE<(outs COP2Opnd:$impl), (ins GPR32Opnd:$rt), // operand roles swapped relative to CFC2_MM; asm operand order is the same
+ "ctc2\t$rt, $impl", [], II_CTC2, FrmFR, "ctc2">,
+ POOL32A_CFTC2_FM_MM<0b1101110100>;
+ }
+}
+
+class WaitMM<string opstr> : // instruction taking only a uimm10 code operand, with the II_WAIT itinerary
+ InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
+ II_WAIT, FrmOther, opstr>;
+
+let DecoderNamespace = "MicroMips", Predicates = [InMicroMips, NotMips32r6, // these two defs are additionally excluded for Mips32r6/Mips64r6
+ NotMips64r6] in {
+ /// Compact Branch Instructions
+ def BEQZC_MM : CompactBranchMM<"beqzc", brtarget_mm, seteq, GPR32Opnd>,
+ COMPACT_BRANCH_FM_MM<0x7>;
+ def BNEZC_MM : CompactBranchMM<"bnezc", brtarget_mm, setne, GPR32Opnd>,
+ COMPACT_BRANCH_FM_MM<0x5>;
+}
+let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
+ /// Arithmetic Instructions (ALU Immediate)
+ def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU>, // ADDiu/ADDi/ANDi pass no immediate predicate or SDNode to ArithLogicI
+ ADDI_FM_MM<0xc>;
+ def ADDi_MM : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd, II_ADDI>,
+ ADDI_FM_MM<0x4>;
+ def SLTi_MM : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
+ SLTI_FM_MM<0x24>;
+ def SLTiu_MM : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
+ SLTI_FM_MM<0x2c>;
+ def ANDi_MM : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI>,
+ ADDI_FM_MM<0x34>;
+ def ORi_MM : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16, // ORi/XORi additionally pass immZExt16 and the SDNode
+ or>, ADDI_FM_MM<0x14>;
+ def XORi_MM : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
+ immZExt16, xor>, ADDI_FM_MM<0x1c>;
+ def LUi_MM : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM_MM;
+
+ def LEA_ADDiu_MM : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, // same "addiu" mnemonic and 0xc value as ADDiu_MM, but through LW_FM_MM
+ LW_FM_MM<0xc>;
+
+ /// Arithmetic Instructions (3-Operand, R-Type)
+ def ADDu_MM : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>, // three-register ops share ADD_FM_MM<0, funct>; only the second argument varies
+ ADD_FM_MM<0, 0x150>;
+ def SUBu_MM : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
+ ADD_FM_MM<0, 0x1d0>;
+ def MUL_MM : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL>, // MUL/ADD/SUB pass no SDNode, unlike ADDu/SUBu/AND/OR/XOR
+ ADD_FM_MM<0, 0x210>;
+ def ADD_MM : MMRel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>,
+ ADD_FM_MM<0, 0x110>;
+ def SUB_MM : MMRel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>,
+ ADD_FM_MM<0, 0x190>;
+ def SLT_MM : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM_MM<0, 0x350>;
+ def SLTu_MM : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>,
+ ADD_FM_MM<0, 0x390>;
+ def AND_MM : MMRel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
+ ADD_FM_MM<0, 0x250>;
+ def OR_MM : MMRel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
+ ADD_FM_MM<0, 0x290>;
+ def XOR_MM : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
+ ADD_FM_MM<0, 0x310>;
+ def NOR_MM : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>;
+ def MULT_MM : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>, // multiply/divide list HI0 and LO0 as their register list argument
+ MULT_FM_MM<0x22c>;
+ def MULTu_MM : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM_MM<0x26c>;
+ def SDIV_MM : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM_MM<0x2ac>, ISA_MIPS1_NOT_32R6_64R6;
+ def UDIV_MM : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM_MM<0x2ec>, ISA_MIPS1_NOT_32R6_64R6;
+
+ /// Arithmetic Instructions with PC and Immediate
+ def ADDIUPC_MM : AddImmUPC<"addiupc", GPRMM16Opnd>, ADDIUPC_FM_MM; // no MMRel here, unlike most defs in this group
+
+ /// Shift Instructions
+ def SLL_MM : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>, // immediate shifts use SRA_FM_MM; register shifts use SRLV_FM_MM
+ SRA_FM_MM<0, 0>;
+ def SRL_MM : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL>,
+ SRA_FM_MM<0x40, 0>;
+ def SRA_MM : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA>,
+ SRA_FM_MM<0x80, 0>;
+ def SLLV_MM : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV>,
+ SRLV_FM_MM<0x10, 0>;
+ def SRLV_MM : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV>,
+ SRLV_FM_MM<0x50, 0>;
+ def SRAV_MM : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV>,
+ SRLV_FM_MM<0x90, 0>;
+ def ROTR_MM : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR>,
+ SRA_FM_MM<0xc0, 0> {
+ list<dag> Pattern = [(set GPR32Opnd:$rd, // the rotates are the only shifts here that supply a selection pattern
+ (rotr GPR32Opnd:$rt, immZExt5:$shamt))];
+ }
+ def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV>,
+ SRLV_FM_MM<0xd0, 0> {
+ list<dag> Pattern = [(set GPR32Opnd:$rd,
+ (rotr GPR32Opnd:$rt, GPR32Opnd:$rs))];
+ }
+
+ /// Load and Store Instructions - aligned
+ let DecoderMethod = "DecodeMemMMImm16" in { // every def in this group decodes through DecodeMemMMImm16
+ def LB_MM : LoadMemory<"lb", GPR32Opnd, mem_mm_16, null_frag, II_LB>, // LB/LBu use mem_mm_16 with null_frag
+ MMRel, LW_FM_MM<0x7>;
+ def LBu_MM : LoadMemory<"lbu", GPR32Opnd, mem_mm_16, null_frag, II_LBU>,
+ MMRel, LW_FM_MM<0x5>;
+ def LH_MM : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH, // LH/LHu use mem_simm16 and pass a load fragment instead of null_frag
+ addrDefault>, MMRel, LW_FM_MM<0xf>;
+ def LHu_MM : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
+ MMRel, LW_FM_MM<0xd>;
+ def LW_MM : Load<"lw", GPR32Opnd, null_frag, II_LW>, MMRel, LW_FM_MM<0x3f>;
+ def SB_MM : Store<"sb", GPR32Opnd, null_frag, II_SB>, MMRel,
+ LW_FM_MM<0x6>;
+ def SH_MM : Store<"sh", GPR32Opnd, null_frag, II_SH>, MMRel,
+ LW_FM_MM<0xe>;
+ def SW_MM : Store<"sw", GPR32Opnd, null_frag, II_SW>, MMRel,
+ LW_FM_MM<0x3e>;
+ }
+
+ let DecoderMethod = "DecodeMemMMImm9" in { // every def in this group decodes through DecodeMemMMImm9; none pass a selection fragment
+ def LBE_MM : Load<"lbe", GPR32Opnd, null_frag, II_LBE>, // LBE/LBuE use Load (no explicit memory operand); the rest use LoadMemory/StoreMemory with mem_simm9
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>;
+ def LBuE_MM : Load<"lbue", GPR32Opnd, null_frag, II_LBUE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>;
+ def LHE_MM : LoadMemory<"lhe", GPR32Opnd, mem_simm9, null_frag, II_LHE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>;
+ def LHuE_MM : LoadMemory<"lhue", GPR32Opnd, mem_simm9, null_frag, II_LHUE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>;
+ def LWE_MM : LoadMemory<"lwe", GPR32Opnd, mem_simm9, null_frag, II_LWE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>;
+ def SBE_MM : StoreMemory<"sbe", GPR32Opnd, mem_simm9, null_frag, II_SBE>, // stores use 0xa as the second format argument where loads use 0x6
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>;
+ def SHE_MM : StoreMemory<"she", GPR32Opnd, mem_simm9, null_frag, II_SHE>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>;
+ def SWE_MM : StoreMemory<"swe", GPR32Opnd, mem_simm9, null_frag, II_SWE>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>;
+ }
+
+ def LWXS_MM : LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>; // OpNode left at its null_frag default
+
+ /// Load and Store Instructions - unaligned
+ def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12, II_LWL>, // 12-bit-offset forms; decoder comes from the class (DecodeMemMMImm12)
+ LWL_FM_MM<0x0>;
+ def LWR_MM : LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12, II_LWR>,
+ LWL_FM_MM<0x1>;
+ def SWL_MM : StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12, II_SWL>,
+ LWL_FM_MM<0x8>;
+ def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12, II_SWR>,
+ LWL_FM_MM<0x9>;
+ let DecoderMethod = "DecodeMemMMImm9" in { // overrides the class-level decoder for the 9-bit-offset forms below
+ def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_9, // NOTE(review): the classes' patterns still use addrimm12 while the operand is mem_mm_9 - confirm intended
+ II_LWLE>, POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>;
+ def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_9,
+ II_LWRE>, POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>;
+ def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_9,
+ II_SWLE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>;
+ def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_9,
+ II_SWRE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>, ISA_MIPS1_NOT_32R6_64R6; // NOTE(review): only SWRE_MM carries this ISA class among the four - confirm intended
+ }
+
+ /// Load and Store Instructions - multiple
+ def SWM32_MM : StoreMultMM<"swm32", II_SWM>, LWM_FM_MM<0xd>; // multiple and pair forms all share LWM_FM_MM with different funct values
+ def LWM32_MM : LoadMultMM<"lwm32", II_LWM>, LWM_FM_MM<0x5>;
+
+ /// Load and Store Pair Instructions
+ def SWP_MM : StorePairMM<"swp">, LWM_FM_MM<0x9>;
+ def LWP_MM : LoadPairMM<"lwp">, LWM_FM_MM<0x1>;
+
+ /// Load and Store multiple pseudo Instructions
+ class LoadWordMultMM<string instr_asm > : // assembler pseudo with the same operands as LoadMultMM; carries no encoding format class
+ MipsAsmPseudoInst<(outs reglist:$rt), (ins mem_mm_12:$addr),
+ !strconcat(instr_asm, "\t$rt, $addr")> ;
+
+ class StoreWordMultMM<string instr_asm > : // assembler pseudo with the same operands as StoreMultMM; carries no encoding format class
+ MipsAsmPseudoInst<(outs), (ins reglist:$rt, mem_mm_12:$addr),
+ !strconcat(instr_asm, "\t$rt, $addr")> ;
+
+
+ def SWM_MM : StoreWordMultMM<"swm">; // plain "swm"/"lwm" mnemonics, as opposed to "swm32"/"lwm32"/"swm16"/"lwm16"
+ def LWM_MM : LoadWordMultMM<"lwm">;
+
+ /// Move Conditional
+ def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, // movz/movn use NoItinerary and the ADD_FM_MM format
+ NoItinerary>, ADD_FM_MM<0, 0x58>;
+ def MOVN_I_MM : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
+ NoItinerary>, ADD_FM_MM<0, 0x18>;
+ def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT>, // movt/movf use their own itineraries and CMov_F_I_FM_MM
+ CMov_F_I_FM_MM<0x25>;
+ def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF>,
+ CMov_F_I_FM_MM<0x5>;
+
+ /// Move to/from HI/LO
+ def MTHI_MM : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, // move-to forms list the written register ([HI0] / [LO0])
+ MTLO_FM_MM<0x0b5>;
+ def MTLO_MM : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>,
+ MTLO_FM_MM<0x0f5>;
+ def MFHI_MM : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, // move-from forms both pass AC0
+ MFLO_FM_MM<0x035>;
+ def MFLO_MM : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>,
+ MFLO_FM_MM<0x075>;
+
+ /// Multiply Add/Sub Instructions
+ def MADD_MM : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM_MM<0x32c>; // madd/maddu pass a third argument of 1; msub/msubu omit it
+ def MADDU_MM : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM_MM<0x36c>;
+ def MSUB_MM : MMRel, MArithR<"msub", II_MSUB>, MULT_FM_MM<0x3ac>;
+ def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>;
+
+ /// Count Leading
+ def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM_MM<0x16c>,
+ ISA_MIPS32;
+ def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM_MM<0x12c>,
+ ISA_MIPS32;
+
+ /// Sign Ext In Register Instructions.
+ def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+ SEB_FM_MM<0x0ac>, ISA_MIPS32R2;
+ def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+ SEB_FM_MM<0x0ec>, ISA_MIPS32R2;
+
+ /// Word Swap Bytes Within Halfwords
+ def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>,
+ SEB_FM_MM<0x1ec>, ISA_MIPS32R2;
+ // TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
+ def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, immZExt5,
+ immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>;
+ def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1,
+ MipsIns>, EXT_FM_MM<0x0c>;
+
+ /// Jump Instructions
+ let DecoderMethod = "DecodeJumpTargetMM" in {
+ def J_MM : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
+ J_FM_MM<0x35>;
+ def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
+ def JALX_MM : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>;
+ }
+ def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
+
+ /// Jump Instructions - Short Delay Slot
+ def JALS_MM : JumpLinkMM<"jals", calltarget_mm>, J_FM_MM<0x1d>;
+ def JALRS_MM : JumpLinkRegMM<"jalrs", GPR32Opnd>, JALR_FM_MM<0x13c>;
+
+ /// Branch Instructions
+ def BEQ_MM : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
+ BEQ_FM_MM<0x25>;
+ def BNE_MM : MMRel, CBranch<"bne", brtarget_mm, setne, GPR32Opnd>,
+ BEQ_FM_MM<0x2d>;
+ def BGEZ_MM : MMRel, CBranchZero<"bgez", brtarget_mm, setge, GPR32Opnd>,
+ BGEZ_FM_MM<0x2>;
+ def BGTZ_MM : MMRel, CBranchZero<"bgtz", brtarget_mm, setgt, GPR32Opnd>,
+ BGEZ_FM_MM<0x6>;
+ def BLEZ_MM : MMRel, CBranchZero<"blez", brtarget_mm, setle, GPR32Opnd>,
+ BGEZ_FM_MM<0x4>;
+ def BLTZ_MM : MMRel, CBranchZero<"bltz", brtarget_mm, setlt, GPR32Opnd>,
+ BGEZ_FM_MM<0x0>;
+ def BGEZAL_MM : MMRel, BGEZAL_FT<"bgezal", brtarget_mm, GPR32Opnd>,
+ BGEZAL_FM_MM<0x03>;
+ def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
+ BGEZAL_FM_MM<0x01>;
+
+ /// Branch Instructions - Short Delay Slot
+ def BGEZALS_MM : BranchCompareToZeroLinkMM<"bgezals", brtarget_mm,
+ GPR32Opnd>, BGEZAL_FM_MM<0x13>;
+ def BLTZALS_MM : BranchCompareToZeroLinkMM<"bltzals", brtarget_mm,
+ GPR32Opnd>, BGEZAL_FM_MM<0x11>;
+
+ /// Control Instructions
+ def SYNC_MM : MMRel, SYNC_FT<"sync">, SYNC_FM_MM;
+ def SYNCI_MM : MMRel, SYNCI_FT<"synci">, SYNCI_FM_MM;
+ def BREAK_MM : MMRel, BRK_FT<"break">, BRK_FM_MM;
+ def SYSCALL_MM : MMRel, SYS_FT<"syscall", uimm10, II_SYSCALL>, SYS_FM_MM;
+ def WAIT_MM : WaitMM<"wait">, WAIT_FM_MM;
+ def ERET_MM : MMRel, ER_FT<"eret", II_ERET>, ER_FM_MM<0x3cd>;
+ def DERET_MM : MMRel, ER_FT<"deret", II_DERET>, ER_FM_MM<0x38d>;
+ def EI_MM : MMRel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM_MM<0x15d>,
+ ISA_MIPS32R2;
+ def DI_MM : MMRel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM_MM<0x11d>,
+ ISA_MIPS32R2;
+
+ /// Trap Instructions
+ def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm4, II_TEQ>, TEQ_FM_MM<0x0>;
+ def TGE_MM : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm4, II_TGE>, TEQ_FM_MM<0x08>;
+ def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm4, II_TGEU>,
+ TEQ_FM_MM<0x10>;
+ def TLT_MM : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm4, II_TLT>, TEQ_FM_MM<0x20>;
+ def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm4, II_TLTU>,
+ TEQ_FM_MM<0x28>;
+ def TNE_MM : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm4, II_TNE>, TEQ_FM_MM<0x30>;
+
+ def TEQI_MM : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM_MM<0x0e>;
+ def TGEI_MM : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM_MM<0x09>;
+ def TGEIU_MM : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>,
+ TEQI_FM_MM<0x0b>;
+ def TLTI_MM : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM_MM<0x08>;
+ def TLTIU_MM : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>,
+ TEQI_FM_MM<0x0a>;
+ def TNEI_MM : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM_MM<0x0c>;
+
+ /// Load-linked, Store-conditional
+ def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>;
+ def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>;
+
+ def LLE_MM : LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>;
+ def SCE_MM : SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>;
+
+ let DecoderMethod = "DecodeCacheOpMM" in {
+ def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12, II_CACHE>,
+ CACHE_PREF_FM_MM<0x08, 0x6>;
+ def PREF_MM : MMRel, CacheOp<"pref", mem_mm_12, II_PREF>,
+ CACHE_PREF_FM_MM<0x18, 0x2>;
+ }
+
+ let DecoderMethod = "DecodePrefeOpMM" in {
+ def PREFE_MM : MMRel, CacheOp<"prefe", mem_mm_9, II_PREFE>,
+ CACHE_PREFE_FM_MM<0x18, 0x2>;
+ def CACHEE_MM : MMRel, CacheOp<"cachee", mem_mm_9, II_CACHEE>,
+ CACHE_PREFE_FM_MM<0x18, 0x3>;
+ }
+ def SSNOP_MM : MMRel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM_MM<0x1>;
+ def EHB_MM : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM_MM<0x3>;
+ def PAUSE_MM : MMRel, Barrier<"pause", II_PAUSE>, BARRIER_FM_MM<0x5>;
+
+ def TLBP_MM : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM_MM<0x0d>;
+ def TLBR_MM : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM_MM<0x4d>;
+ def TLBWI_MM : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM_MM<0x8d>;
+ def TLBWR_MM : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM_MM<0xcd>;
+
+ def SDBBP_MM : MMRel, SYS_FT<"sdbbp", uimm10, II_SDBBP>, SDBBP_FM_MM;
+
+ def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>;
+}
+
+def TAILCALL_MM : TailCall<J_MM, jmptarget_mm>, ISA_MIPS1_NOT_32R6_64R6;
+
+let DecoderNamespace = "MicroMips" in {
+ def RDHWR_MM : MMRel, R6MMR6Rel, ReadHardware<GPR32Opnd, HWRegsOpnd>,
+ RDHWR_FM_MM, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def LWU_MM : MMRel, LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU,
+ mem_simm12>, LL_FM_MM<0xe>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMips arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+def : MipsPat<(i32 immLi16:$imm), // LI16_MM is a microMIPS instruction, so gate the pattern on microMIPS
+ (LI16_MM immLi16:$imm)>, ISA_MICROMIPS;
+
+let AdditionalPredicates = [InMicroMips] in
+defm : MaterializeImms<i32, ZERO, ADDiu_MM, LUi_MM, ORi_MM>;
+
+let Predicates = [InMicroMips] in { // NOTE(review): this sets Predicates directly rather than one of the PredicateControl sub-lists — confirm the ISA_* mixins on the tail-call patterns below still take effect
+ def : MipsPat<(i32 immLi16:$imm),
+ (LI16_MM immLi16:$imm)>;
+ def : MipsPat<(i32 immSExt16:$imm),
+ (ADDiu_MM ZERO, immSExt16:$imm)>;
+ def : MipsPat<(i32 immZExt16:$imm),
+ (ORi_MM ZERO, immZExt16:$imm)>;
+
+ def : MipsPat<(not GPRMM16:$in),
+ (NOT16_MM GPRMM16:$in)>;
+ def : MipsPat<(not GPR32:$in),
+ (NOR_MM GPR32Opnd:$in, ZERO)>;
+
+ def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm),
+ (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>;
+ def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm),
+ (ADDIUS5_MM GPR32:$src, immSExtAddius5:$imm)>;
+ def : MipsPat<(add GPR32:$src, immSExt16:$imm),
+ (ADDiu_MM GPR32:$src, immSExt16:$imm)>;
+
+ def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
+ (ANDI16_MM GPRMM16:$src, immZExtAndi16:$imm)>;
+ def : MipsPat<(and GPR32:$src, immZExt16:$imm),
+ (ANDi_MM GPR32:$src, immZExt16:$imm)>;
+
+ def : MipsPat<(shl GPRMM16:$src, immZExt2Shift:$imm),
+ (SLL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
+ def : MipsPat<(shl GPR32:$src, immZExt5:$imm),
+ (SLL_MM GPR32:$src, immZExt5:$imm)>;
+ def : MipsPat<(shl GPR32:$lhs, GPR32:$rhs),
+ (SLLV_MM GPR32:$lhs, GPR32:$rhs)>;
+
+ def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm),
+ (SRL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
+ def : MipsPat<(srl GPR32:$src, immZExt5:$imm),
+ (SRL_MM GPR32:$src, immZExt5:$imm)>;
+ def : MipsPat<(srl GPR32:$lhs, GPR32:$rhs),
+ (SRLV_MM GPR32:$lhs, GPR32:$rhs)>;
+
+ def : MipsPat<(sra GPR32:$src, immZExt5:$imm),
+ (SRA_MM GPR32:$src, immZExt5:$imm)>;
+ def : MipsPat<(sra GPR32:$lhs, GPR32:$rhs),
+ (SRAV_MM GPR32:$lhs, GPR32:$rhs)>;
+
+ def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
+ (SW16_MM GPRMM16:$src, addrimm4lsl2:$addr)>;
+ def : MipsPat<(store GPR32:$src, addr:$addr),
+ (SW_MM GPR32:$src, addr:$addr)>;
+
+ def : MipsPat<(load addrimm4lsl2:$addr),
+ (LW16_MM addrimm4lsl2:$addr)>;
+ def : MipsPat<(load addr:$addr),
+ (LW_MM addr:$addr)>;
+ def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+ (SUBu_MM GPR32:$lhs, GPR32:$rhs)>;
+
+ def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)), // tail calls go through the TAILCALL_MM pseudo
+ (TAILCALL_MM tglobaladdr:$dst)>, ISA_MIPS1_NOT_32R6_64R6;
+ def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+ (TAILCALL_MM texternalsym:$dst)>, ISA_MIPS1_NOT_32R6_64R6;
+}
+
+let AddedComplexity = 40 in { // NOTE(review): neither this pattern nor the two after the brace carries a visible InMicroMips predicate — confirm that is intended
+ def : MipsPat<(i32 (sextloadi16 addrRegImm:$a)),
+ (LH_MM addrRegImm:$a)>;
+}
+def : MipsPat<(atomic_load_16 addr:$a),
+ (LH_MM addr:$a)>;
+def : MipsPat<(i32 (extloadi16 addr:$src)),
+ (LHu_MM addr:$src)>;
+
+defm : BrcondPats<GPR32, BEQ_MM, BEQ_MM, BNE_MM, SLT_MM, SLTu_MM, SLTi_MM,
+ SLTiu_MM, ZERO>; // NOTE(review): BEQ_MM is passed for both the 2nd and 3rd parameter — confirm against the BrcondPats multiclass
+
+defm : SeteqPats<GPR32, SLTiu_MM, XOR_MM, SLTu_MM, ZERO>; // comparison pattern sets instantiated with the _MM instructions
+defm : SetlePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>;
+defm : SetgtPats<GPR32, SLT_MM, SLTu_MM>;
+defm : SetgePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>;
+defm : SetgeImmPats<GPR32, XORi_MM, SLTi_MM, SLTiu_MM>;
+
+//===----------------------------------------------------------------------===//
+// MicroMips instruction aliases
+//===----------------------------------------------------------------------===//
+
+class UncondBranchMMPseudo<string opstr> : // no outputs; one brtarget_mm operand, printed as "<opstr>\t$offset"
+ MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset),
+ !strconcat(opstr, "\t$offset")>;
+
+def B_MM_Pseudo : UncondBranchMMPseudo<"b">, ISA_MICROMIPS; // the "b" mnemonic, with the ISA_MICROMIPS mixin applied
+
+let Predicates = [InMicroMips] in { // divide pseudos plus assembler aliases, all under the InMicroMips predicate
+ def SDIV_MM_Pseudo : MultDivPseudo<SDIV_MM, ACC64, GPR32Opnd, MipsDivRem,
+ II_DIV, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+ def UDIV_MM_Pseudo : MultDivPseudo<UDIV_MM, ACC64, GPR32Opnd, MipsDivRemU,
+ II_DIVU, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+
+ def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
+ def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>;
+ def : MipsInstAlias<"nop", (MOVE16_MM ZERO, ZERO), 1>;
+ def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2;
+ def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2;
+ def : MipsInstAlias<"teq $rs, $rt", // two-operand trap forms: the code operand is filled with 0
+ (TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tge $rs, $rt",
+ (TGE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tgeu $rs, $rt",
+ (TGEU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tlt $rs, $rt",
+ (TLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tltu $rs, $rt",
+ (TLTU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tne $rs, $rt",
+ (TNE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias< // sgt/sgtu map onto slt/sltu with the two source operands swapped
+ "sgt $rd, $rs, $rt",
+ (SLT_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<
+ "sgt $rs, $rt",
+ (SLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<
+ "sgtu $rd, $rs, $rt",
+ (SLTu_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<
+ "sgtu $rs, $rt",
+ (SLTu_MM GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"slt $rs, $rt, $imm",
+ (SLTi_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
+ simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<"sltu $rs, $rt, $imm",
+ (SLTiu_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
+ simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<"sll $rd, $rt, $rs", // register-shift spellings map to the *V_MM instructions
+ (SLLV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"sra $rd, $rt, $rs",
+ (SRAV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"srl $rd, $rt, $rs",
+ (SRLV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"sll $rd, $rt", // two-operand forms reuse $rd as the first source
+ (SLLV_MM GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+ def : MipsInstAlias<"sra $rd, $rt",
+ (SRAV_MM GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+ def : MipsInstAlias<"srl $rd, $rt",
+ (SRLV_MM GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+ def : MipsInstAlias<"sll $rd, $shamt",
+ (SLL_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
+ def : MipsInstAlias<"sra $rd, $shamt",
+ (SRA_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
+ def : MipsInstAlias<"srl $rd, $shamt",
+ (SRL_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
+ def : MipsInstAlias<"rotr $rt, $imm",
+ (ROTR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, uimm5:$imm), 0>;
+ def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>;
+ def : MipsInstAlias<"and $rs, $rt, $imm", // NOTE(review): simm16 on the two "and" aliases vs uimm16 on the or/xor aliases below — confirm
+ (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+ def : MipsInstAlias<"and $rs, $imm",
+ (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
+ def : MipsInstAlias<"or $rs, $rt, $imm",
+ (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+ def : MipsInstAlias<"or $rs, $imm",
+ (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+ def : MipsInstAlias<"xor $rs, $rt, $imm",
+ (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+ def : MipsInstAlias<"xor $rs, $imm",
+ (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+ def : MipsInstAlias<"not $rt, $rs",
+ (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+ def : MipsInstAlias<"not $rt",
+ (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
+ def : MipsInstAlias<"bnez $rs,$offset", // NOTE(review): brtarget here, while BNE_MM/BEQ_MM are declared with brtarget_mm — confirm
+ (BNE_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+ def : MipsInstAlias<"beqz $rs,$offset",
+ (BEQ_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+ def : MipsInstAlias<"seh $rd", (SEH_MM GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+ ISA_MIPS32R2_NOT_32R6_64R6;
+ def : MipsInstAlias<"seb $rd", (SEB_MM GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+ ISA_MIPS32R2_NOT_32R6_64R6;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips.h b/contrib/llvm/lib/Target/Mips/Mips.h
new file mode 100644
index 000000000000..d9faf3325cac
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips.h
@@ -0,0 +1,37 @@
+//===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM Mips back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS_H
+#define LLVM_LIB_TARGET_MIPS_MIPS_H
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class MipsTargetMachine; // forward declarations: the factories below only use references and pointers
+ class ModulePass;
+ class FunctionPass;
+
+ ModulePass *createMipsOs16Pass(MipsTargetMachine &TM); // factories returning module passes
+ ModulePass *createMips16HardFloatPass(MipsTargetMachine &TM);
+
+ FunctionPass *createMipsModuleISelDagPass(MipsTargetMachine &TM); // factories returning function passes
+ FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM);
+ FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+ FunctionPass *createMipsHazardSchedule();
+ FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
+ FunctionPass *createMipsConstantIslandPass();
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td
new file mode 100644
index 000000000000..670272d47e95
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips.td
@@ -0,0 +1,231 @@
+//===-- Mips.td - Describe the Mips Target Machine ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the Mips target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+// The overall idea of the PredicateControl class is to chop the Predicates list
+// into subsets that are usually overridden independently. This allows
+// subclasses to partially override the predicates of their superclasses without
+// having to re-add all the existing predicates.
+class PredicateControl {
+ // Predicates for the encoding scheme in use such as HasStdEnc
+ list<Predicate> EncodingPredicates = [];
+ // Predicates for the GPR size such as IsGP64bit
+ list<Predicate> GPRPredicates = [];
+ // Predicates for the PTR size such as IsPTR64bit
+ list<Predicate> PTRPredicates = [];
+ // Predicates for the FGR size and layout such as IsFP64bit
+ list<Predicate> FGRPredicates = [];
+ // Predicates for the instruction group membership such as ISAs and ASEs
+ list<Predicate> InsnPredicates = [];
+ // Predicate for marking the instruction as usable in hard-float mode only.
+ list<Predicate> HardFloatPredicate = [];
+ // Predicates for anything else
+ list<Predicate> AdditionalPredicates = [];
+ list<Predicate> Predicates = !listconcat(EncodingPredicates, // the combined list: every sub-list above, concatenated in declaration order
+ GPRPredicates,
+ PTRPredicates,
+ FGRPredicates,
+ InsnPredicates,
+ HardFloatPredicate,
+ AdditionalPredicates);
+}
+
+// Like Requires<>, but it fills only the AdditionalPredicates sub-list with preds.
+class AdditionalRequires<list<Predicate> preds> {
+ list<Predicate> AdditionalPredicates = preds;
+}
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MipsRegisterInfo.td"
+include "MipsSchedule.td"
+include "MipsInstrInfo.td"
+include "MipsCallingConv.td"
+
+// Avoid forward declaration issues.
+include "MipsScheduleP5600.td"
+include "MipsScheduleGeneric.td"
+
+def MipsInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Mips Subtarget features //
+//===----------------------------------------------------------------------===//
+
+def FeatureNoABICalls : SubtargetFeature<"noabicalls", "NoABICalls", "true",
+ "Disable SVR4-style position-independent code">;
+def FeaturePTR64Bit : SubtargetFeature<"ptr64", "IsPTR64bit", "true",
+ "Pointers are 64-bit wide">;
+def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true",
+ "General Purpose Registers are 64-bit wide">;
+def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true",
+ "Support 64-bit FP registers">;
+def FeatureFPXX : SubtargetFeature<"fpxx", "IsFPXX", "true",
+ "Support for FPXX">;
+def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true",
+ "IEEE 754-2008 NaN encoding">;
+def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
+ "true", "Only supports single precision float">;
+def FeatureSoftFloat : SubtargetFeature<"soft-float", "IsSoftFloat", "true",
+ "Does not support floating point instructions">;
+def FeatureNoOddSPReg : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false",
+ "Disable odd numbered single-precision "
+ "registers">;
+def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU",
+ "true", "Enable vector FPU instructions">;
+def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
+ "Mips I ISA Support [highly experimental]">;
+def FeatureMips2 : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2",
+ "Mips II ISA Support [highly experimental]",
+ [FeatureMips1]>;
+def FeatureMips3_32 : SubtargetFeature<"mips3_32", "HasMips3_32", "true",
+ "Subset of MIPS-III that is also in MIPS32 "
+ "[highly experimental]">;
+def FeatureMips3_32r2 : SubtargetFeature<"mips3_32r2", "HasMips3_32r2", "true",
+ "Subset of MIPS-III that is also in MIPS32r2 "
+ "[highly experimental]">;
+def FeatureMips3 : SubtargetFeature<"mips3", "MipsArchVersion", "Mips3",
+ "MIPS III ISA Support [highly experimental]",
+ [FeatureMips2, FeatureMips3_32,
+ FeatureMips3_32r2, FeatureGP64Bit,
+ FeatureFP64Bit]>;
+def FeatureMips4_32 : SubtargetFeature<"mips4_32", "HasMips4_32", "true",
+ "Subset of MIPS-IV that is also in MIPS32 "
+ "[highly experimental]">;
+def FeatureMips4_32r2 : SubtargetFeature<"mips4_32r2", "HasMips4_32r2", "true",
+ "Subset of MIPS-IV that is also in MIPS32r2 "
+ "[highly experimental]">;
+def FeatureMips4 : SubtargetFeature<"mips4", "MipsArchVersion",
+ "Mips4", "MIPS IV ISA Support",
+ [FeatureMips3, FeatureMips4_32,
+ FeatureMips4_32r2]>;
+def FeatureMips5_32r2 : SubtargetFeature<"mips5_32r2", "HasMips5_32r2", "true",
+ "Subset of MIPS-V that is also in MIPS32r2 "
+ "[highly experimental]">;
+def FeatureMips5 : SubtargetFeature<"mips5", "MipsArchVersion", "Mips5",
+ "MIPS V ISA Support [highly experimental]",
+ [FeatureMips4, FeatureMips5_32r2]>;
+def FeatureMips32 : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32",
+ "Mips32 ISA Support",
+ [FeatureMips2, FeatureMips3_32,
+ FeatureMips4_32]>;
+def FeatureMips32r2 : SubtargetFeature<"mips32r2", "MipsArchVersion",
+ "Mips32r2", "Mips32r2 ISA Support",
+ [FeatureMips3_32r2, FeatureMips4_32r2,
+ FeatureMips5_32r2, FeatureMips32]>;
+def FeatureMips32r3 : SubtargetFeature<"mips32r3", "MipsArchVersion",
+ "Mips32r3", "Mips32r3 ISA Support",
+ [FeatureMips32r2]>;
+def FeatureMips32r5 : SubtargetFeature<"mips32r5", "MipsArchVersion",
+ "Mips32r5", "Mips32r5 ISA Support",
+ [FeatureMips32r3]>;
+def FeatureMips32r6 : SubtargetFeature<"mips32r6", "MipsArchVersion",
+ "Mips32r6",
+ "Mips32r6 ISA Support [experimental]",
+ [FeatureMips32r5, FeatureFP64Bit,
+ FeatureNaN2008]>;
+def FeatureMips64 : SubtargetFeature<"mips64", "MipsArchVersion",
+ "Mips64", "Mips64 ISA Support",
+ [FeatureMips5, FeatureMips32]>;
+def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion",
+ "Mips64r2", "Mips64r2 ISA Support",
+ [FeatureMips64, FeatureMips32r2]>;
+def FeatureMips64r3 : SubtargetFeature<"mips64r3", "MipsArchVersion",
+ "Mips64r3", "Mips64r3 ISA Support",
+ [FeatureMips64r2, FeatureMips32r3]>;
+def FeatureMips64r5 : SubtargetFeature<"mips64r5", "MipsArchVersion",
+ "Mips64r5", "Mips64r5 ISA Support",
+ [FeatureMips64r3, FeatureMips32r5]>;
+def FeatureMips64r6 : SubtargetFeature<"mips64r6", "MipsArchVersion",
+ "Mips64r6",
+ "Mips64r6 ISA Support [experimental]",
+ [FeatureMips32r6, FeatureMips64r5,
+ FeatureNaN2008]>;
+
+def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true",
+ "Mips16 mode">;
+
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
+def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
+ "Mips DSP-R2 ASE", [FeatureDSP]>;
+def FeatureDSPR3
+ : SubtargetFeature<"dspr3", "HasDSPR3", "true", "Mips DSP-R3 ASE",
+ [ FeatureDSP, FeatureDSPR2 ]>;
+
+def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
+
+def FeatureEVA : SubtargetFeature<"eva", "HasEVA", "true", "Mips EVA ASE">;
+
+def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
+ "microMips mode">;
+
+def FeatureCnMips : SubtargetFeature<"cnmips", "HasCnMips",
+ "true", "Octeon cnMIPS Support",
+ [FeatureMips64r2]>;
+
+def FeatureUseTCCInDIV : SubtargetFeature<
+ "use-tcc-in-div",
+ "UseTCCInDIV", "false", // NOTE(review): the feature assigns "false" although the description reads as enabling trapping — confirm the polarity
+ "Force the assembler to use trapping">;
+
+//===----------------------------------------------------------------------===//
+// Mips processors supported.
+//===----------------------------------------------------------------------===//
+
+def ImplP5600 : SubtargetFeature<"p5600", "ProcImpl",
+ "MipsSubtarget::CPU::P5600",
+ "The P5600 Processor", [FeatureMips32r5]>;
+
+class Proc<string Name, list<SubtargetFeature> Features> // shorthand that fixes the model argument to MipsGenericModel
+ : ProcessorModel<Name, MipsGenericModel, Features>;
+
+def : Proc<"mips1", [FeatureMips1]>;
+def : Proc<"mips2", [FeatureMips2]>;
+def : Proc<"mips32", [FeatureMips32]>;
+def : Proc<"mips32r2", [FeatureMips32r2]>;
+def : Proc<"mips32r3", [FeatureMips32r3]>;
+def : Proc<"mips32r5", [FeatureMips32r5]>;
+def : Proc<"mips32r6", [FeatureMips32r6]>;
+
+def : Proc<"mips3", [FeatureMips3]>;
+def : Proc<"mips4", [FeatureMips4]>;
+def : Proc<"mips5", [FeatureMips5]>;
+def : Proc<"mips64", [FeatureMips64]>;
+def : Proc<"mips64r2", [FeatureMips64r2]>;
+def : Proc<"mips64r3", [FeatureMips64r3]>;
+def : Proc<"mips64r5", [FeatureMips64r5]>;
+def : Proc<"mips64r6", [FeatureMips64r6]>;
+def : Proc<"octeon", [FeatureMips64r2, FeatureCnMips]>;
+def : ProcessorModel<"p5600", MipsP5600Model, [ImplP5600]>;
+
+def MipsAsmParser : AsmParser {
+ let ShouldEmitMatchRegisterName = 0;
+}
+
+def MipsAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+
+ // Recognize hard-coded registers: register names carry the "$" prefix.
+ string RegisterPrefix = "$";
+}
+
+def Mips : Target { // ties the target to MipsInstrInfo and to the parser records defined just above
+ let InstructionSet = MipsInstrInfo;
+ let AssemblyParsers = [MipsAsmParser];
+ let AssemblyParserVariants = [MipsAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
new file mode 100644
index 000000000000..e7ceca9612a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -0,0 +1,176 @@
+//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16FrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+Mips16FrameLowering::Mips16FrameLowering(const MipsSubtarget &STI)
+ : MipsFrameLowering(STI, STI.stackAlignment()) {} // hands the subtarget and its stack alignment to the base class
+
+/// Emits the Mips16 prologue at the start of \p MBB.
+///
+/// In order: the stack adjustment (TII.makeFrame), a CFI directive for the new
+/// CFA offset, one CFI offset directive per callee-saved register, and, when
+/// the function has a frame pointer, a copy of SP into S0. Nothing is emitted
+/// when the frame is empty and the function does not adjust the stack.
+void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const Mips16InstrInfo &TII =
+      *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc dl;
+
+  uint64_t StackSize = MFI.getStackSize();
+
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI.adjustsStack()) return;
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+
+  // Adjust stack.
+  TII.makeFrame(Mips::SP, StackSize, MBB, MBBI);
+
+  // emit ".cfi_def_cfa_offset StackSize"
+  unsigned CFIIndex = MF.addFrameInst(
+      MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Emit one ".cfi_offset" per callee-saved register. The loop is a no-op for
+  // an empty list, so no separate emptiness check is needed.
+  for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) {
+    int64_t Offset = MFI.getObjectOffset(Info.getFrameIdx());
+    unsigned DReg = MRI->getDwarfRegNum(Info.getReg(), true);
+    unsigned RegCFIIndex = MF.addFrameInst(
+        MCCFIInstruction::createOffset(nullptr, DReg, Offset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(RegCFIIndex);
+  }
+
+  // With a frame pointer, S0 takes a copy of the adjusted SP.
+  if (hasFP(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
+        .addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup);
+}
+
+/// Emits the Mips16 epilogue before the last non-debug instruction of \p MBB:
+/// SP is reloaded from S0 if there is a frame pointer, then the frame is popped.
+void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator InsertPt = MBB.getLastNonDebugInstr();
+  const auto &Mips16TII =
+      *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
+  DebugLoc Loc = InsertPt->getDebugLoc();
+  const uint64_t FrameBytes = MF.getFrameInfo().getStackSize();
+
+  if (FrameBytes == 0) // an empty frame needs no teardown
+    return;
+
+  if (hasFP(MF))
+    BuildMI(MBB, InsertPt, Loc, Mips16TII.get(Mips::Move32R16), Mips::SP)
+        .addReg(Mips::S0);
+
+  // Pop the frame; the size is assumed to be a multiple of 8.
+  Mips16TII.restoreFrame(Mips::SP, FrameBytes, MBB, InsertPt);
+}
+
+/// Marks every callee-saved register in \p CSI as live-in to the entry block.
+/// No store is emitted here: the "save" instruction produced during
+/// emitPrologue spills RA, S0 and S1. \p MI and \p TRI are unused and the
+/// function always returns true.
+bool Mips16FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  MachineFunction &Fn = *MBB.getParent();
+  MachineBasicBlock &Entry = Fn.front();
+
+  // Visit the callee-saved list once, front to back.
+  for (const CalleeSavedInfo &Info : CSI) {
+    const unsigned Reg = Info.getReg();
+
+    // RA is skipped when the return address is taken: it has already been
+    // added as a live-in by MipsTargetLowering::lowerRETURNADDR.
+    if (Reg == Mips::RA && Fn.getFrameInfo().isReturnAddressTaken())
+      continue;
+
+    // Every other callee-saved register becomes a live-in of the entry
+    // block.
+    Entry.addLiveIn(Reg);
+  }
+
+  return true;
+}
+
+bool Mips16FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ //
+ // Registers RA, S0 and S1 are the callee-saved registers; they are restored
+ // by the restore instruction during emitEpilogue.
+ // We need to override this virtual function, otherwise LLVM will try to
+ // restore the registers on its own from the stack.
+ //
+
+ return true;
+}
+
+bool
+Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  // Reserved iff the max call frame passes isInt<15> and no object is var-sized.
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  if (!isInt<15>(FrameInfo.getMaxCallFrameSize())) return false;
+  return !FrameInfo.hasVarSizedObjects();
+}
+
+void Mips16FrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                               BitVector &SavedRegs,
+                                               RegScavenger *RS) const {
+  // Base set, plus S2 when reserved and S0 when there is a frame pointer.
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  const auto *Mips16TII =
+      static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
+  const BitVector ReservedRegs =
+      Mips16TII->getRegisterInfo().getReservedRegs(MF);
+  if (ReservedRegs[Mips::S2])
+    SavedRegs.set(Mips::S2);
+  if (hasFP(MF))
+    SavedRegs.set(Mips::S0);
+}
+
+const MipsFrameLowering *
+llvm::createMips16FrameLowering(const MipsSubtarget &ST) {
+ return new Mips16FrameLowering(ST); // factory: a newly allocated Mips16FrameLowering, returned as its base-class pointer
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h
new file mode 100644
index 000000000000..b48ed4641ea7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h
@@ -0,0 +1,47 @@
+//===-- Mips16FrameLowering.h - Mips16 frame lowering ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares Mips16FrameLowering, the Mips16 subclass of MipsFrameLowering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16FRAMELOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16FRAMELOWERING_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+class Mips16FrameLowering : public MipsFrameLowering {
+public:
+ explicit Mips16FrameLowering(const MipsSubtarget &STI);
+
+ /// emitPrologue/emitEpilogue - These methods insert prolog and epilog code
+ /// into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override; // .cpp: emits nothing, returns true
+
+ bool hasReservedCallFrame(const MachineFunction &MF) const override; // 15-bit frame, no VLAs
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override; // adds S2 (if reserved) and S0 (if hasFP)
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
new file mode 100644
index 000000000000..191006d6463c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -0,0 +1,547 @@
+//===---- Mips16HardFloat.cpp for Mips16 Hard Float --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass needed for Mips16 Hard Float
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetMachine.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips16-hard-float"
+
+namespace {
+ class Mips16HardFloat : public ModulePass { // module pass; work is in runOnModule
+ public:
+ static char ID;
+
+ Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID), TM(TM_) {}
+
+ StringRef getPassName() const override { return "MIPS16 Hard Float Pass"; }
+
+ bool runOnModule(Module &M) override;
+
+ protected:
+ const MipsTargetMachine &TM; // bound to the constructor argument
+ };
+
+ static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) {
+ std::vector<llvm::Type *> AsmArgTypes; // stays empty: the asm takes no operands
+ std::vector<llvm::Value *> AsmArgs; // stays empty as well
+
+ llvm::FunctionType *AsmFTy = // void() function type, not variadic
+ llvm::FunctionType::get(Type::getVoidTy(C), AsmArgTypes, false);
+ llvm::InlineAsm *IA = // AsmText with an empty constraint string
+ llvm::InlineAsm::get(AsmFTy, AsmText, "", true,
+ /* IsAlignStack */ false, llvm::InlineAsm::AD_ATT);
+ CallInst::Create(IA, AsmArgs, "", BB); // the call to the asm is placed in BB
+ }
+
+ char Mips16HardFloat::ID = 0;
+}
+
+//
+// Return types that matter for hard float are:
+// float, double, complex float, and complex double.
+// Order matters: values index Helper[] in fixupFPReturnAndCall (NoFPRet = size).
+enum FPReturnVariant {
+ FRet, DRet, CFRet, CDRet, NoFPRet
+};
+
+//
+// Classify a return type as float, double, complex float ({float, float}) or
+// complex double ({double, double}); every other type yields NoFPRet.
+static FPReturnVariant whichFPReturnVariant(Type *T) {
+  if (T->isFloatTy())
+    return FRet;
+  if (T->isDoubleTy())
+    return DRet;
+
+  // Only a struct with exactly two members can stand for a complex value.
+  if (!T->isStructTy() || T->getStructNumElements() != 2)
+    return NoFPRet;
+
+  Type *FirstTy = T->getContainedType(0);
+  Type *SecondTy = T->getContainedType(1);
+  // Both members must share the same FP type; mixed pairs do not count.
+  if (FirstTy->isFloatTy() && SecondTy->isFloatTy())
+    return CFRet;
+  if (FirstTy->isDoubleTy() && SecondTy->isDoubleTy())
+    return CDRet;
+
+  // Any other two-member struct needs no FP return handling.
+  return NoFPRet;
+}
+
+//
+// Parameter types that matter are float, (float, float), (float, double),
+// double, (double, double), (double, float); NoSig covers everything else.
+//
+enum FPParamVariant {
+ FSig, FFSig, FDSig,
+ DSig, DDSig, DFSig, NoSig
+};
+
+// Local aliases used below to decide which floating point parameter
+// signature variant a function has.
+typedef Type::TypeID TypeID;
+const Type::TypeID FloatTyID = Type::FloatTyID;
+const Type::TypeID DoubleTyID = Type::DoubleTyID;
+
+static FPParamVariant whichFPParamVariantNeeded(Function &F) { // looks at the first two params only
+ switch (F.arg_size()) {
+ case 0:
+ return NoSig;
+ case 1:{
+ TypeID ArgTypeID = F.getFunctionType()->getParamType(0)->getTypeID();
+ switch (ArgTypeID) {
+ case FloatTyID:
+ return FSig;
+ case DoubleTyID:
+ return DSig;
+ default:
+ return NoSig;
+ }
+ }
+ default: { // two or more parameters; anything past the second is ignored
+ TypeID ArgTypeID0 = F.getFunctionType()->getParamType(0)->getTypeID();
+ TypeID ArgTypeID1 = F.getFunctionType()->getParamType(1)->getTypeID();
+ switch(ArgTypeID0) {
+ case FloatTyID: {
+ switch (ArgTypeID1) {
+ case FloatTyID:
+ return FFSig;
+ case DoubleTyID:
+ return FDSig;
+ default:
+ return FSig; // second parameter is neither float nor double
+ }
+ }
+ case DoubleTyID: {
+ switch (ArgTypeID1) {
+ case FloatTyID:
+ return DFSig;
+ case DoubleTyID:
+ return DDSig;
+ default:
+ return DSig; // second parameter is neither float nor double
+ }
+ }
+ default:
+ return NoSig; // first parameter is not FP, so the second is not considered
+ }
+ }
+ }
+ llvm_unreachable("can't get here"); // every switch path above returns
+}
+
+// Decide from the parameter list whether an FP stub is needed: because of the
+// ABI, values have to be moved in and/or out of floating point registers when
+// the first parameter is a float or a double.
+//
+static bool needsFPStubFromParams(Function &F) {
+  // Without parameters there is nothing to move.
+  if (F.arg_size() == 0)
+    return false;
+
+  // Only the first parameter is inspected.
+  Type *FirstParamTy = F.getFunctionType()->getParamType(0);
+  if (FirstParamTy->isFloatTy())
+    return true;
+  if (FirstParamTy->isDoubleTy())
+    return true;
+  return false;
+}
+
+static bool needsFPReturnHelper(Function &F) {
+  // A helper is needed exactly when the return type is one of the FP variants.
+  return whichFPReturnVariant(F.getReturnType()) != NoFPRet;
+}
+
+static bool needsFPReturnHelper(FunctionType &FT) {
+  // Same test as the Function overload, applied to a bare function type.
+  return whichFPReturnVariant(FT.getReturnType()) != NoFPRet;
+}
+
+static bool needsFPHelperFromSig(Function &F) { // FP first parameter or FP return type
+ return needsFPStubFromParams(F) || needsFPReturnHelper(F);
+}
+
+// Build the asm text that copies the leading FP arguments between integer
+// registers ($4-$7) and FP registers ($f12-$f15) so that Mips16 and Mips32
+// can interoperate. ToFP selects "mtc1" over "mfc1"; LE selects which register
+// of a pair goes with which half of a double. M is not used.
+static std::string swapFPIntParams(FPParamVariant PV, Module *M, bool LE,
+ bool ToFP) {
+ std::string MI = ToFP ? "mtc1 ": "mfc1 ";
+ std::string AsmText;
+
+ switch (PV) {
+ case FSig: // (float)
+ AsmText += MI + "$$4, $$f12\n";
+ break;
+
+ case FFSig: // (float, float)
+ AsmText += MI + "$$4, $$f12\n";
+ AsmText += MI + "$$5, $$f14\n";
+ break;
+
+ case FDSig: // (float, double): the double uses $6/$7 and $f14/$f15
+ AsmText += MI + "$$4, $$f12\n";
+ if (LE) {
+ AsmText += MI + "$$6, $$f14\n";
+ AsmText += MI + "$$7, $$f15\n";
+ } else {
+ AsmText += MI + "$$7, $$f14\n";
+ AsmText += MI + "$$6, $$f15\n";
+ }
+ break;
+
+ case DSig: // (double): $4/$5 and $f12/$f13
+ if (LE) {
+ AsmText += MI + "$$4, $$f12\n";
+ AsmText += MI + "$$5, $$f13\n";
+ } else {
+ AsmText += MI + "$$5, $$f12\n";
+ AsmText += MI + "$$4, $$f13\n";
+ }
+ break;
+
+ case DDSig: // (double, double)
+ if (LE) {
+ AsmText += MI + "$$4, $$f12\n";
+ AsmText += MI + "$$5, $$f13\n";
+ AsmText += MI + "$$6, $$f14\n";
+ AsmText += MI + "$$7, $$f15\n";
+ } else {
+ AsmText += MI + "$$5, $$f12\n";
+ AsmText += MI + "$$4, $$f13\n";
+ AsmText += MI + "$$7, $$f14\n";
+ AsmText += MI + "$$6, $$f15\n";
+ }
+ break;
+
+ case DFSig: // (double, float): the float move is the same for both endiannesses
+ if (LE) {
+ AsmText += MI + "$$4, $$f12\n";
+ AsmText += MI + "$$5, $$f13\n";
+ } else {
+ AsmText += MI + "$$5, $$f12\n";
+ AsmText += MI + "$$4, $$f13\n";
+ }
+ AsmText += MI + "$$6, $$f14\n";
+ break;
+
+ case NoSig: // nothing to move; the returned text is empty
+ break;
+ }
+
+ return AsmText;
+}
+
+// Make sure a "__call_stub_fp_<name>" stub is defined for F; callers are
+// expected to have checked needsFPHelperFromSig(F) first. The stub copies the
+// FP arguments into FP registers, transfers to F and, when F returns an FP
+// value, copies the result from the FP registers back into integer registers.
+static void assureFPCallStub(Function &F, Module *M,
+                             const MipsTargetMachine &TM) {
+  // for now we only need them for static relocation
+  if (TM.isPositionIndependent())
+    return;
+  LLVMContext &Context = M->getContext();
+  bool LE = TM.isLittleEndian();
+  std::string Name = F.getName();
+  std::string SectionName = ".mips16.call.fp." + Name;
+  std::string StubName = "__call_stub_fp_" + Name;
+  //
+  // see if we already have the stub (a mere declaration does not count)
+  //
+  Function *FStub = M->getFunction(StubName);
+  if (FStub && !FStub->isDeclaration()) return;
+  FStub = Function::Create(F.getFunctionType(),
+                           Function::InternalLinkage, StubName, M);
+  FStub->addFnAttr("mips16_fp_stub"); // runOnModule skips functions tagged this way
+  FStub->addFnAttr(llvm::Attribute::Naked);
+  FStub->addFnAttr(llvm::Attribute::NoInline);
+  FStub->addFnAttr(llvm::Attribute::NoUnwind);
+  FStub->addFnAttr("nomips16");
+  FStub->setSection(SectionName);
+  BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+  FPReturnVariant RV = whichFPReturnVariant(FStub->getReturnType());
+  FPParamVariant PV = whichFPParamVariantNeeded(F);
+
+  std::string AsmText;
+  AsmText += ".set reorder\n";
+  AsmText += swapFPIntParams(PV, M, LE, true); // ToFP=true: "mtc1" moves
+  if (RV != NoFPRet) {
+    AsmText += "move $$18, $$31\n"; // $31 is overwritten by jal; keep it in $18
+    AsmText += "jal " + Name + "\n";
+  } else {
+    AsmText += "lui $$25, %hi(" + Name + ")\n"; // no FP result: load F's address
+    AsmText += "addiu $$25, $$25, %lo(" + Name + ")\n";
+  }
+
+  switch (RV) {
+  case FRet:
+    AsmText += "mfc1 $$2, $$f0\n";
+    break;
+
+  case DRet:
+    if (LE) {
+      AsmText += "mfc1 $$2, $$f0\n";
+      AsmText += "mfc1 $$3, $$f1\n";
+    } else {
+      AsmText += "mfc1 $$3, $$f0\n";
+      AsmText += "mfc1 $$2, $$f1\n";
+    }
+    break;
+
+  case CFRet:
+    if (LE) {
+      AsmText += "mfc1 $$2, $$f0\n";
+      AsmText += "mfc1 $$3, $$f2\n";
+    } else {
+      AsmText += "mfc1 $$2, $$f0\n"; // each float has its own register, so the
+      AsmText += "mfc1 $$3, $$f2\n"; // mapping is the same as for little endian
+    }
+    break;
+
+  case CDRet:
+    if (LE) {
+      AsmText += "mfc1 $$4, $$f2\n";
+      AsmText += "mfc1 $$5, $$f3\n";
+      AsmText += "mfc1 $$2, $$f0\n";
+      AsmText += "mfc1 $$3, $$f1\n";
+
+    } else {
+      AsmText += "mfc1 $$5, $$f2\n";
+      AsmText += "mfc1 $$4, $$f3\n";
+      AsmText += "mfc1 $$3, $$f0\n";
+      AsmText += "mfc1 $$2, $$f1\n";
+    }
+    break;
+
+  case NoFPRet:
+    break;
+  }
+
+  if (RV != NoFPRet)
+    AsmText += "jr $$18\n"; // back to the stub's caller via the saved $31
+  else
+    AsmText += "jr $$25\n"; // jump to F through the address loaded above
+  EmitInlineAsm(Context, BB, AsmText);
+
+  new UnreachableInst(Context, BB); // terminator; the asm's final jr leaves the stub
+}
+
+// Functions that are llvm intrinsics and don't need helpers.
+// The entries must stay sorted: isIntrinsicInline() looks names up with
+// std::binary_search.
+static const char *const IntrinsicInline[] = {
+ "fabs", "fabsf",
+ "llvm.ceil.f32", "llvm.ceil.f64",
+ "llvm.copysign.f32", "llvm.copysign.f64",
+ "llvm.cos.f32", "llvm.cos.f64",
+ "llvm.exp.f32", "llvm.exp.f64",
+ "llvm.exp2.f32", "llvm.exp2.f64",
+ "llvm.fabs.f32", "llvm.fabs.f64",
+ "llvm.floor.f32", "llvm.floor.f64",
+ "llvm.fma.f32", "llvm.fma.f64",
+ "llvm.log.f32", "llvm.log.f64",
+ "llvm.log10.f32", "llvm.log10.f64",
+ "llvm.nearbyint.f32", "llvm.nearbyint.f64",
+ "llvm.pow.f32", "llvm.pow.f64",
+ "llvm.powi.f32", "llvm.powi.f64",
+ "llvm.rint.f32", "llvm.rint.f64",
+ "llvm.round.f32", "llvm.round.f64",
+ "llvm.sin.f32", "llvm.sin.f64",
+ "llvm.sqrt.f32", "llvm.sqrt.f64",
+ "llvm.trunc.f32", "llvm.trunc.f64",
+};
+
+static bool isIntrinsicInline(Function *F) { // true if F's name is in IntrinsicInline
+ return std::binary_search(std::begin(IntrinsicInline),
+ std::end(IntrinsicInline), F->getName());
+}
+// Returns of float, double and complex need to be handled with a helper
+// function: a call to it is inserted before each such return. Calls with an
+// FP return tag F with "saveS2", and in non-PIC mode callees with an FP
+// signature get a call stub. Returns true if anything was changed.
+static bool fixupFPReturnAndCall(Function &F, Module *M,
+ const MipsTargetMachine &TM) {
+ bool Modified = false;
+ LLVMContext &C = M->getContext();
+ Type *MyVoid = Type::getVoidTy(C);
+ for (auto &BB: F)
+ for (auto &I: BB) {
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(&I)) {
+ Value *RVal = RI->getReturnValue();
+ if (!RVal) continue;
+ //
+ // If there is a return value and it needs a helper function,
+ // figure out which one and add a call before the actual
+ // return to this helper. The purpose of the helper is to move
+ // floating point values from their soft float return mapping to
+ // where they would have been mapped to in floating point registers.
+ //
+ Type *T = RVal->getType();
+ FPReturnVariant RV = whichFPReturnVariant(T);
+ if (RV == NoFPRet) continue;
+ static const char *const Helper[NoFPRet] = { // indexed by FPReturnVariant
+ "__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc",
+ "__mips16_ret_dc"
+ };
+ const char *Name = Helper[RV];
+ AttributeSet A;
+ Value *Params[] = {RVal};
+ Modified = true;
+ //
+ // These helper functions have a different calling ABI so
+ // this __Mips16RetHelper indicates that so that later
+ // during call setup, the proper call lowering to the helper
+ // functions will take place.
+ //
+ A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ "__Mips16RetHelper");
+ A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ Attribute::ReadNone);
+ A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ Attribute::NoInline);
+ Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, nullptr)); // shadows parameter F
+ CallInst::Create(F, Params, "", &I); // helper call goes right before the return
+ } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+ FunctionType *FT = CI->getFunctionType();
+ Function *F_ = CI->getCalledFunction(); // may be null, hence the F_ checks below
+ if (needsFPReturnHelper(*FT) &&
+ !(F_ && isIntrinsicInline(F_))) {
+ Modified=true;
+ F.addFnAttr("saveS2");
+ }
+ if (F_ && !isIntrinsicInline(F_)) {
+ // pic mode calls are handled by already defined
+ // helper functions
+ if (needsFPReturnHelper(*F_)) {
+ Modified=true;
+ F.addFnAttr("saveS2");
+ }
+ if (!TM.isPositionIndependent()) {
+ if (needsFPHelperFromSig(*F_)) {
+ assureFPCallStub(*F_, M, TM);
+ Modified=true;
+ }
+ }
+ }
+ }
+ }
+ return Modified;
+}
+
+static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
+ const MipsTargetMachine &TM) { // emits the "__fn_stub_<name>" stub for F
+ bool PicMode = TM.isPositionIndependent();
+ bool LE = TM.isLittleEndian();
+ LLVMContext &Context = M->getContext();
+ std::string Name = F->getName();
+ std::string SectionName = ".mips16.fn." + Name;
+ std::string StubName = "__fn_stub_" + Name;
+ std::string LocalName = "$$__fn_local_" + Name;
+ Function *FStub = Function::Create // the stub has the same function type as F
+ (F->getFunctionType(),
+ Function::InternalLinkage, StubName, M);
+ FStub->addFnAttr("mips16_fp_stub"); // runOnModule skips functions tagged this way
+ FStub->addFnAttr(llvm::Attribute::Naked);
+ FStub->addFnAttr(llvm::Attribute::NoUnwind);
+ FStub->addFnAttr(llvm::Attribute::NoInline);
+ FStub->addFnAttr("nomips16");
+ FStub->setSection(SectionName);
+ BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+
+ std::string AsmText;
+ if (PicMode) {
+ AsmText += ".set noreorder\n";
+ AsmText += ".cpload $$25\n";
+ AsmText += ".set reorder\n";
+ AsmText += ".reloc 0, R_MIPS_NONE, " + Name + "\n";
+ AsmText += "la $$25, " + LocalName + "\n"; // PIC: go through the local alias
+ } else
+ AsmText += "la $$25, " + Name + "\n";
+ AsmText += swapFPIntParams(PV, M, LE, false); // ToFP=false: "mfc1" moves
+ AsmText += "jr $$25\n";
+ AsmText += LocalName + " = " + Name + "\n"; // defines the alias used in PIC mode
+ EmitInlineAsm(Context, BB, AsmText);
+
+ new UnreachableInst(FStub->getContext(), BB); // terminator; the asm's jr leaves the stub
+}
+
+// Remove F's "use-soft-float" function attribute, then add it back with the
+// value "false" (the same AttributeSet A is used for both steps).
+//
+static void removeUseSoftFloat(Function &F) {
+ AttributeSet A;
+ DEBUG(errs() << "removing -use-soft-float\n");
+ A = A.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+ "use-soft-float", "false");
+ F.removeAttributes(AttributeSet::FunctionIndex, A);
+ if (F.hasFnAttribute("use-soft-float")) { // diagnostic only; nothing else happens here
+ DEBUG(errs() << "still has -use-soft-float\n");
+ }
+ F.addAttributes(AttributeSet::FunctionIndex, A);
+}
+
+
+//
+// This pass only makes sense when the underlying chip has floating point but
+// we are compiling as mips16.
+// For all mips16 functions (that are not stubs we have already generated), or
+// declared via attributes as nomips16, we must:
+// 1) fixup all returns of float, double, single and double complex
+// by calling a helper function before the actual return.
+// 2) generate helper functions (stubs) that can be called by mips32
+// functions; the stubs move parameters that are normally passed in
+// floating point
+// registers to their soft float equivalents.
+// 3) in the case of static relocation, generate helper functions so that
+// mips16 functions can call extern functions of unknown type (mips16 or
+// mips32).
+// 4) TBD. For pic, calls to extern functions of unknown type are handled by
+// predefined helper functions in libc but this work is currently done
+// during call lowering but it should be moved here in the future.
+//
+bool Mips16HardFloat::runOnModule(Module &M) {
+ DEBUG(errs() << "Run on Module Mips16HardFloat\n");
+ bool Modified = false;
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { // stubs get added to M while iterating
+ if (F->hasFnAttribute("nomips16") &&
+ F->hasFnAttribute("use-soft-float")) {
+ removeUseSoftFloat(*F); // note: Modified is not set on this path
+ continue;
+ }
+ if (F->isDeclaration() || F->hasFnAttribute("mips16_fp_stub") ||
+ F->hasFnAttribute("nomips16")) continue;
+ Modified |= fixupFPReturnAndCall(*F, &M, TM);
+ FPParamVariant V = whichFPParamVariantNeeded(*F);
+ if (V != NoSig) { // F takes FP parameters: give it a "__fn_stub_" stub
+ Modified = true;
+ createFPFnStub(&*F, &M, V, TM);
+ }
+ }
+ return Modified;
+}
+
+
+ModulePass *llvm::createMips16HardFloatPass(MipsTargetMachine &TM) {
+ return new Mips16HardFloat(TM); // NOTE(review): raw new; owner not shown here
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp
new file mode 100644
index 000000000000..2eb6e5ddd2d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp
@@ -0,0 +1,50 @@
+//===---- Mips16HardFloatInfo.cpp for Mips16 Hard Float -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of Mips16HardFloatInfo
+// namespace.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16HardFloatInfo.h"
+#include <string.h>
+
+namespace llvm {
+
+namespace Mips16HardFloatInfo {
+
+const FuncNameSignature PredefinedFuncs[] = { // { name, { parameter variant, return variant } }
+ { "__floatdidf", { NoSig, DRet } },
+ { "__floatdisf", { NoSig, FRet } },
+ { "__floatundidf", { NoSig, DRet } },
+ { "__fixsfdi", { FSig, NoFPRet } },
+ { "__fixunsdfsi", { DSig, NoFPRet } },
+ { "__fixunsdfdi", { DSig, NoFPRet } },
+ { "__fixdfdi", { DSig, NoFPRet } },
+ { "__fixunssfsi", { FSig, NoFPRet } },
+ { "__fixunssfdi", { FSig, NoFPRet } },
+ { "__floatundisf", { NoSig, FRet } },
+ { nullptr, { NoSig, NoFPRet } } // sentinel: findFuncSignature stops at the null Name
+};
+
+// Look name up in PredefinedFuncs with a plain linear scan; there are very few
+// of these special cases. Returns nullptr when name is not in the table.
+extern FuncSignature const *findFuncSignature(const char *name) {
+  // Walk the entries up to the sentinel, whose Name is null.
+  for (const FuncNameSignature *E = PredefinedFuncs; E->Name; ++E) {
+    // Names are compared exactly (strcmp).
+    if (strcmp(name, E->Name) != 0)
+      continue;
+    return &E->Signature;
+  }
+  // Not one of the predefined functions.
+  return nullptr;
+}
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h
new file mode 100644
index 000000000000..7295c287576d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h
@@ -0,0 +1,50 @@
+//===---- Mips16HardFloatInfo.h for Mips16 Hard Float --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some data structures relevant to the implementation of
+// Mips16 hard float.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOATINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOATINFO_H
+
+namespace llvm {
+
+namespace Mips16HardFloatInfo {
+
+// Return types that matter for hard float are:
+// float, double, complex float, and complex double
+// (NoFPRet stands for any other return type).
+enum FPReturnVariant { FRet, DRet, CFRet, CDRet, NoFPRet };
+
+//
+// Parameter types that matter are float, (float, float), (float, double),
+// double, (double, double), (double, float)
+//
+enum FPParamVariant { FSig, FFSig, FDSig, DSig, DDSig, DFSig, NoSig };
+
+struct FuncSignature { // parameter variant and return variant of one function
+ FPParamVariant ParamSig;
+ FPReturnVariant RetSig;
+};
+
+struct FuncNameSignature { // one table entry: a function name and its signature
+ const char *Name;
+ FuncSignature Signature;
+};
+
+extern const FuncNameSignature PredefinedFuncs[]; // defined in Mips16HardFloatInfo.cpp
+
+extern FuncSignature const *findFuncSignature(const char *name); // nullptr if name is not in the table
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..ce193b1734f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -0,0 +1,261 @@
+//===-- Mips16ISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips16 ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16ISelDAGToDAG.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  // Record the subtarget first; functions not in Mips16 mode are left alone.
+  Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+  const bool InMips16 = Subtarget->inMips16Mode();
+  return InMips16 && MipsDAGToDAGISel::runOnMachineFunction(MF);
+}
+/// Select multiply instructions; returns {Lo, Hi}, nullptr where not requested.
+std::pair<SDNode *, SDNode *>
+Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, const SDLoc &DL, EVT Ty,
+ bool HasLo, bool HasHi) {
+ SDNode *Lo = nullptr, *Hi = nullptr;
+ SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
+ N->getOperand(1));
+ SDValue InFlag = SDValue(Mul, 0); // glue result of the multiply
+
+ if (HasLo) {
+ unsigned Opcode = Mips::Mflo16;
+ Lo = CurDAG->getMachineNode(Opcode, DL, Ty, MVT::Glue, InFlag);
+ InFlag = SDValue(Lo, 1); // a following Mfhi16 is glued to Lo instead
+ }
+ if (HasHi) {
+ unsigned Opcode = Mips::Mfhi16;
+ Hi = CurDAG->getMachineNode(Opcode, DL, Ty, InFlag);
+ }
+ return std::make_pair(Lo, Hi);
+}
+
+void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ if (!MipsFI->globalBaseRegSet()) // nothing to do unless the global base register is set
+ return;
+
+ MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock::iterator I = MBB.begin(); // insert at the start of the first block
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL;
+ unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+ const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
+
+ V0 = RegInfo.createVirtualRegister(RC);
+ V1 = RegInfo.createVirtualRegister(RC);
+ V2 = RegInfo.createVirtualRegister(RC);
+
+
+ BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0) // V0: _gp_disp, MO_ABS_HI part
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+ BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), V1) // V1: _gp_disp, MO_ABS_LO part
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+
+ BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16); // V2 from V0 and 16
+ BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg) // result from V1 and V2
+ .addReg(V1)
+ .addReg(V2);
+}
+
+void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+ initGlobalBaseReg(MF); // the only post-ISel step done here
+}
+
+bool Mips16DAGToDAGISel::selectAddr(bool SPAllowed, SDValue Addr, SDValue &Base,
+                                    SDValue &Offset) { // fills Base and Offset
+  SDLoc DL(Addr);
+  EVT ValTy = Addr.getValueType();
+
+  // if Address is FI, get the TargetFrameIndex (only when SPAllowed).
+  if (SPAllowed) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+      Offset = CurDAG->getTargetConstant(0, DL, ValTy);
+      return true;
+    }
+  }
+  // on PIC code Load GA
+  if (Addr.getOpcode() == MipsISD::Wrapper) {
+    Base = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+  if (!TM.isPositionIndependent()) { // the only path that returns false
+    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+         Addr.getOpcode() == ISD::TargetGlobalAddress))
+      return false;
+  }
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = cast<ConstantSDNode>(Addr.getOperand(1)); // constant per the check above
+    if (isInt<16>(CN->getSExtValue())) {
+      // If the first operand is a FI, get the TargetFI Node
+      if (SPAllowed) {
+        if (FrameIndexSDNode *FIN =
+                dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+          Offset = CurDAG->getTargetConstant(CN->getZExtValue(), DL, ValTy);
+          return true;
+        }
+      }
+
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), DL, ValTy);
+      return true;
+    }
+  }
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // When loading from constant pools, load the lower address part in
+    // the instruction itself. Example, instead of:
+    // lui $2, %hi($CPI1_0)
+    // addiu $2, $2, %lo($CPI1_0)
+    // lwc1 $f0, 0($2)
+    // Generate:
+    // lui $2, %hi($CPI1_0)
+    // lwc1 $f0, %lo($CPI1_0)($2)
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+          isa<JumpTableSDNode>(Opnd0)) {
+        Base = Addr.getOperand(0);
+        Offset = Opnd0;
+        return true;
+      }
+    }
+  }
+  Base = Addr; // fallback: the whole address is the base, with offset 0
+  Offset = CurDAG->getTargetConstant(0, DL, ValTy);
+  return true;
+}
+
+bool Mips16DAGToDAGISel::selectAddr16(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ return selectAddr(false, Addr, Base, Offset); // SPAllowed = false
+}
+
+bool Mips16DAGToDAGISel::selectAddr16SP(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ return selectAddr(true, Addr, Base, Offset); // SPAllowed = true
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+bool Mips16DAGToDAGISel::trySelect(SDNode *Node) { // true if Node was selected here
+ unsigned Opcode = Node->getOpcode();
+ SDLoc DL(Node);
+
+ ///
+ // Instruction Selection not handled by the auto-generated
+ // tablegen selection should be handled here.
+ ///
+ EVT NodeTy = Node->getValueType(0);
+ unsigned MultOpc;
+
+ switch (Opcode) {
+ default:
+ break;
+
+ case ISD::SUBE:
+ case ISD::ADDE: {
+ SDValue InFlag = Node->getOperand(2), CmpLHS;
+ unsigned Opc = InFlag.getOpcode();
+ (void)Opc; // only read by the assert below
+ assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+ (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+ "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+ unsigned MOp;
+ if (Opcode == ISD::ADDE) {
+ CmpLHS = InFlag.getValue(0); // result of the flag-producing node
+ MOp = Mips::AdduRxRyRz16;
+ } else {
+ CmpLHS = InFlag.getOperand(0); // first operand of the flag-producing node
+ MOp = Mips::SubuRxRyRz16;
+ }
+
+ SDValue Ops[] = {CmpLHS, InFlag.getOperand(1)};
+
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+
+ EVT VT = LHS.getValueType();
+
+ unsigned Sltu_op = Mips::SltuRxRyRz16;
+ SDNode *Carry = CurDAG->getMachineNode(Sltu_op, DL, VT, Ops); // carry/borrow via sltu
+ unsigned Addu_op = Mips::AdduRxRyRz16;
+ SDNode *AddCarry = // Carry + RHS
+ CurDAG->getMachineNode(Addu_op, DL, VT, SDValue(Carry, 0), RHS);
+
+ CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0)); // LHS op AddCarry
+ return true;
+ }
+
+ /// Mul with two results
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+ std::pair<SDNode *, SDNode *> LoHi =
+ selectMULT(Node, MultOpc, DL, NodeTy, true, true);
+ if (!SDValue(Node, 0).use_empty())
+ ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
+
+ if (!SDValue(Node, 1).use_empty())
+ ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
+
+ CurDAG->RemoveDeadNode(Node);
+ return true;
+ }
+
+ case ISD::MULHS:
+ case ISD::MULHU: {
+ MultOpc = (Opcode == ISD::MULHU ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+ auto LoHi = selectMULT(Node, MultOpc, DL, NodeTy, false, true); // only the Hi half is requested
+ ReplaceNode(Node, LoHi.second);
+ return true;
+ }
+ }
+
+ return false; // not handled here
+}
+
+FunctionPass *llvm::createMips16ISelDag(MipsTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new Mips16DAGToDAGISel(TM, OptLevel); // NOTE(review): raw new; owner not shown here
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h
new file mode 100644
index 000000000000..bbf8cc36f241
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -0,0 +1,55 @@
+//===---- Mips16ISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16ISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16ISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+class Mips16DAGToDAGISel : public MipsDAGToDAGISel {
+public:
+ explicit Mips16DAGToDAGISel(MipsTargetMachine &TM, CodeGenOpt::Level OL)
+ : MipsDAGToDAGISel(TM, OL) {}
+
+private:
+ std::pair<SDNode *, SDNode *> selectMULT(SDNode *N, unsigned Opc,
+ const SDLoc &DL, EVT Ty, bool HasLo,
+ bool HasHi); // returns {Lo, Hi}; nullptr for a half that was not requested
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool selectAddr(bool SPAllowed, SDValue Addr, SDValue &Base,
+ SDValue &Offset); // shared by the two wrappers below
+ bool selectAddr16(SDValue Addr, SDValue &Base,
+ SDValue &Offset) override; // selectAddr with SPAllowed = false
+ bool selectAddr16SP(SDValue Addr, SDValue &Base,
+ SDValue &Offset) override; // selectAddr with SPAllowed = true
+
+ bool trySelect(SDNode *Node) override;
+
+ void processFunctionAfterISel(MachineFunction &MF) override;
+
+ // Insert instructions to initialize the global base register in the
+ // first MBB of the function.
+ void initGlobalBaseReg(MachineFunction &MF);
+
+ void initMips16SPAliasReg(MachineFunction &MF); // NOTE(review): not defined in Mips16ISelDAGToDAG.cpp
+};
+
+FunctionPass *createMips16ISelDag(MipsTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
new file mode 100644
index 000000000000..bdb9eec4cc5a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -0,0 +1,800 @@
+//===-- Mips16ISelLowering.cpp - Mips16 DAG Lowering Implementation -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+#include "Mips16ISelLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16HardFloatInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-lower"
+
+static cl::opt<bool> DontExpandCondPseudos16( // Hidden command-line flag, false by default.
+ "mips16-dont-expand-cond-pseudo",
+ cl::init(false),
+ cl::desc("Don't expand conditional move related "
+ "pseudos for Mips 16"),
+ cl::Hidden); // When true, every emit* expander in this file returns its block unchanged.
+
+namespace {
+struct Mips16Libcall { // One row of HardFloatLibCalls: an RTLIB id paired with its __mips16_* symbol name.
+ RTLIB::Libcall Libcall;
+ const char *Name;
+
+ bool operator<(const Mips16Libcall &RHS) const { // Orders by Name only; Libcall is ignored.
+ return std::strcmp(Name, RHS.Name) < 0;
+ }
+};
+
+struct Mips16IntrinsicHelperType{ // One row of Mips16IntrinsicHelper: callee name -> call-stub name.
+ const char* Name;
+ const char* Helper;
+
+ bool operator<(const Mips16IntrinsicHelperType &RHS) const { // Orders by Name only.
+ return std::strcmp(Name, RHS.Name) < 0;
+ }
+ bool operator==(const Mips16IntrinsicHelperType &RHS) const { // Equality also looks at Name only; Helper is ignored.
+ return std::strcmp(Name, RHS.Name) == 0;
+ }
+};
+}
+
+// Libcalls for which no helper is generated. Must stay sorted by Name: getOpndList searches it with std::binary_search.
+static const Mips16Libcall HardFloatLibCalls[] = {
+ { RTLIB::ADD_F64, "__mips16_adddf3" },
+ { RTLIB::ADD_F32, "__mips16_addsf3" },
+ { RTLIB::DIV_F64, "__mips16_divdf3" },
+ { RTLIB::DIV_F32, "__mips16_divsf3" },
+ { RTLIB::OEQ_F64, "__mips16_eqdf2" },
+ { RTLIB::OEQ_F32, "__mips16_eqsf2" },
+ { RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2" },
+ { RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi" },
+ { RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi" },
+ { RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf" },
+ { RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf" },
+ { RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf" },
+ { RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf" },
+ { RTLIB::OGE_F64, "__mips16_gedf2" },
+ { RTLIB::OGE_F32, "__mips16_gesf2" },
+ { RTLIB::OGT_F64, "__mips16_gtdf2" },
+ { RTLIB::OGT_F32, "__mips16_gtsf2" },
+ { RTLIB::OLE_F64, "__mips16_ledf2" },
+ { RTLIB::OLE_F32, "__mips16_lesf2" },
+ { RTLIB::OLT_F64, "__mips16_ltdf2" },
+ { RTLIB::OLT_F32, "__mips16_ltsf2" },
+ { RTLIB::MUL_F64, "__mips16_muldf3" },
+ { RTLIB::MUL_F32, "__mips16_mulsf3" },
+ { RTLIB::UNE_F64, "__mips16_nedf2" },
+ { RTLIB::UNE_F32, "__mips16_nesf2" },
+ { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_dc" }, // No associated libcall; listed so a call to this symbol gets no helper.
+ { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_df" }, // No associated libcall.
+ { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_sc" }, // No associated libcall.
+ { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_sf" }, // No associated libcall.
+ { RTLIB::SUB_F64, "__mips16_subdf3" },
+ { RTLIB::SUB_F32, "__mips16_subsf3" },
+ { RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2" },
+ { RTLIB::UO_F64, "__mips16_unorddf2" },
+ { RTLIB::UO_F32, "__mips16_unordsf2" } // The same two names are also registered for O_F64/O_F32 in setMips16HardFloatLibCalls.
+};
+
+static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = { // Callee name -> call stub. Keep sorted by name: getOpndList uses std::lower_bound.
+ {"__fixunsdfsi", "__mips16_call_stub_2" },
+ {"ceil", "__mips16_call_stub_df_2"},
+ {"ceilf", "__mips16_call_stub_sf_1"},
+ {"copysign", "__mips16_call_stub_df_10"},
+ {"copysignf", "__mips16_call_stub_sf_5"},
+ {"cos", "__mips16_call_stub_df_2"},
+ {"cosf", "__mips16_call_stub_sf_1"},
+ {"exp2", "__mips16_call_stub_df_2"},
+ {"exp2f", "__mips16_call_stub_sf_1"},
+ {"floor", "__mips16_call_stub_df_2"},
+ {"floorf", "__mips16_call_stub_sf_1"},
+ {"log2", "__mips16_call_stub_df_2"},
+ {"log2f", "__mips16_call_stub_sf_1"},
+ {"nearbyint", "__mips16_call_stub_df_2"},
+ {"nearbyintf", "__mips16_call_stub_sf_1"},
+ {"rint", "__mips16_call_stub_df_2"},
+ {"rintf", "__mips16_call_stub_sf_1"},
+ {"sin", "__mips16_call_stub_df_2"},
+ {"sinf", "__mips16_call_stub_sf_1"},
+ {"sqrt", "__mips16_call_stub_df_2"},
+ {"sqrtf", "__mips16_call_stub_sf_1"},
+ {"trunc", "__mips16_call_stub_df_2"},
+ {"truncf", "__mips16_call_stub_sf_1"},
+};
+
+Mips16TargetLowering::Mips16TargetLowering(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI)
+ : MipsTargetLowering(TM, STI) {
+
+ // Set up the register classes: i32 is the only type given a class here.
+ addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
+
+ if (!Subtarget.useSoftFloat()) // Not soft-float: install the __mips16_* libcall names.
+ setMips16HardFloatLibCalls();
+
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); // The fence and every i32 atomic below are marked Expand.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
+
+ setOperationAction(ISD::ROTR, MVT::i32, Expand); // Rotate-right and byte-swap are marked Expand for both i32 and i64.
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+ computeRegisterProperties(STI.getRegisterInfo()); // Called last, after the register class has been added above.
+}
+
+const MipsTargetLowering * // Heap-allocated with new; nothing here frees it, so the caller takes ownership.
+llvm::createMips16TargetLowering(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI) {
+ return new Mips16TargetLowering(TM, STI);
+}
+
+bool // Always false; every argument is ignored and *Fast is never written.
+Mips16TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned,
+ unsigned,
+ bool *Fast) const {
+ return false;
+}
+
+MachineBasicBlock * // Dispatches each Mips16 pseudo opcode to the emit* helper that expands it.
+Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ default: // Not a pseudo handled here: defer to the base class.
+ return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+ case Mips::SelBeqZ: // Select on a single register: branch opcode only.
+ return emitSel16(Mips::BeqzRxImm16, MI, BB);
+ case Mips::SelBneZ:
+ return emitSel16(Mips::BnezRxImm16, MI, BB);
+ case Mips::SelTBteqZCmpi: // Select forms comparing a register with an immediate: (branch, compare) opcodes.
+ return emitSeliT16(Mips::Bteqz16, Mips::CmpiRxImmX16, MI, BB);
+ case Mips::SelTBteqZSlti:
+ return emitSeliT16(Mips::Bteqz16, Mips::SltiRxImmX16, MI, BB);
+ case Mips::SelTBteqZSltiu:
+ return emitSeliT16(Mips::Bteqz16, Mips::SltiuRxImmX16, MI, BB);
+ case Mips::SelTBtneZCmpi:
+ return emitSeliT16(Mips::Btnez16, Mips::CmpiRxImmX16, MI, BB);
+ case Mips::SelTBtneZSlti:
+ return emitSeliT16(Mips::Btnez16, Mips::SltiRxImmX16, MI, BB);
+ case Mips::SelTBtneZSltiu:
+ return emitSeliT16(Mips::Btnez16, Mips::SltiuRxImmX16, MI, BB);
+ case Mips::SelTBteqZCmp: // Select forms comparing two registers.
+ return emitSelT16(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
+ case Mips::SelTBteqZSlt:
+ return emitSelT16(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
+ case Mips::SelTBteqZSltu:
+ return emitSelT16(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
+ case Mips::SelTBtneZCmp:
+ return emitSelT16(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
+ case Mips::SelTBtneZSlt:
+ return emitSelT16(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
+ case Mips::SelTBtneZSltu:
+ return emitSelT16(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
+ case Mips::BteqzT8CmpX16: // Compare-and-branch pseudos, register/register.
+ return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
+ case Mips::BteqzT8SltX16:
+ return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
+ case Mips::BteqzT8SltuX16:
+ // TBD: figure out a way to get this or remove the instruction
+ // altogether.
+ return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
+ case Mips::BtnezT8CmpX16:
+ return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
+ case Mips::BtnezT8SltX16:
+ return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
+ case Mips::BtnezT8SltuX16:
+ // TBD: figure out a way to get this or remove the instruction
+ // altogether.
+ return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
+ case Mips::BteqzT8CmpiX16: return emitFEXT_T8I8I16_ins( // Compare-and-branch pseudos, register/immediate; only the Slti forms pass ImmSigned = true.
+ Mips::Bteqz16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+ case Mips::BteqzT8SltiX16: return emitFEXT_T8I8I16_ins(
+ Mips::Bteqz16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+ case Mips::BteqzT8SltiuX16: return emitFEXT_T8I8I16_ins(
+ Mips::Bteqz16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+ case Mips::BtnezT8CmpiX16: return emitFEXT_T8I8I16_ins(
+ Mips::Btnez16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+ case Mips::BtnezT8SltiX16: return emitFEXT_T8I8I16_ins(
+ Mips::Btnez16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+ case Mips::BtnezT8SltiuX16: return emitFEXT_T8I8I16_ins(
+ Mips::Btnez16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+ break; // Unreachable: the case above already returned.
+ case Mips::SltCCRxRy16: // Set-on-less-than pseudos that produce a value in a register.
+ return emitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
+ break; // Unreachable: the case above already returned.
+ case Mips::SltiCCRxImmX16:
+ return emitFEXT_CCRXI16_ins
+ (Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+ case Mips::SltiuCCRxImmX16:
+ return emitFEXT_CCRXI16_ins
+ (Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+ case Mips::SltuCCRxRy16:
+ return emitFEXT_CCRX16_ins
+ (Mips::SltuRxRy16, MI, BB);
+ }
+}
+
+bool Mips16TargetLowering::isEligibleForTailCallOptimization(
+ const CCState &CCInfo, unsigned NextStackOffset,
+ const MipsFunctionInfo &FI) const {
+ // No tail call optimization for mips16: all three arguments are ignored.
+ return false;
+}
+
+void Mips16TargetLowering::setMips16HardFloatLibCalls() { // Registers the __mips16_* name for every real libcall in HardFloatLibCalls.
+ for (unsigned I = 0; I != array_lengthof(HardFloatLibCalls); ++I) {
+ assert((I == 0 || HardFloatLibCalls[I - 1] < HardFloatLibCalls[I]) && // Debug-build check that the table is strictly sorted by name.
+ "Array not sorted!");
+ if (HardFloatLibCalls[I].Libcall != RTLIB::UNKNOWN_LIBCALL) // Skip the __mips16_ret_* rows, which have no RTLIB id.
+ setLibcallName(HardFloatLibCalls[I].Libcall, HardFloatLibCalls[I].Name);
+ }
+
+ setLibcallName(RTLIB::O_F64, "__mips16_unorddf2"); // O_F64/O_F32 reuse the names the table gives to UO_F64/UO_F32.
+ setLibcallName(RTLIB::O_F32, "__mips16_unordsf2");
+}
+
+//
+// The Mips16 hard float is a crazy quilt inherited from gcc. I have a much
+// cleaner way to do all of this but it will have to wait until the traditional
+// gcc mechanism is completed.
+//
+// For Pic, in order for Mips16 code to call Mips32 code which, per the ABI,
+// has either arguments or returned values placed in floating point registers,
+// we use a set of helper functions. (This includes functions which return type
+// complex, which on Mips is returned in a pair of floating point registers).
+//
+// This is an encoding that we inherited from gcc.
+// In Mips traditional O32, N32 ABI, floating point numbers are passed in
+// floating point argument registers 1,2 only when the first and optionally
+// the second arguments are float (sf) or double (df).
+// For Mips16 we are only concerned with the situations where floating point
+// arguments are being passed in floating point registers by the ABI, because
+// Mips16 mode code cannot execute floating point instructions to load those
+// values and hence helper functions are needed.
+// The possibilities are (), (sf), (sf, sf), (sf, df), (df), (df, sf), (df, df);
+// the helper function suffixes for these are:
+// 0, 1, 5, 9, 2, 6, 10
+// This suffix can then be calculated as follows:
+// for a given argument Arg:
+// Arg1x, Arg2x = 1 : Arg is sf
+// 2 : Arg is df
+// 0 : Arg is neither sf nor df
+// So the stub name ends in the number Arg1x + Arg2x*4.
+// However, not all numbers between 0 and 10 are possible; we check anyway and
+// assert if an impossible one shows up.
+//
+
+unsigned int Mips16TargetLowering::getMips16HelperFunctionStubNumber // Encodes the float/double-ness of the first two arguments as a stub number.
+ (ArgListTy &Args) const {
+ unsigned int resultNum = 0; // Stays 0 when there are no arguments or the first is neither float nor double.
+ if (Args.size() >= 1) {
+ Type *t = Args[0].Ty;
+ if (t->isFloatTy()) {
+ resultNum = 1;
+ }
+ else if (t->isDoubleTy()) {
+ resultNum = 2;
+ }
+ }
+ if (resultNum) { // The second argument only counts when the first one was float or double.
+ if (Args.size() >=2) {
+ Type *t = Args[1].Ty;
+ if (t->isFloatTy()) {
+ resultNum += 4; // i.e. 1 * 4
+ }
+ else if (t->isDoubleTy()) {
+ resultNum += 8; // i.e. 2 * 4
+ }
+ }
+ }
+ return resultNum; // One of 0, 1, 2, 5, 6, 9, 10.
+}
+
+//
+// Prefixes are attached to stub numbers depending on the return type.
+// return type: float sf_
+// double df_
+// single complex sc_
+// double complex dc_
+// others NO PREFIX
+//
+//
+// The full name of a helper function is __mips16_call_stub_ +
+// return type dependent prefix + stub number
+//
+// FIXME: This is something that probably should be in a different source file
+// and perhaps done differently but my main purpose is to not waste runtime
+// on something that we can enumerate in the source. Another possibility is
+// to have a python script to generate these mapping tables. This will do
+// for now. There is a whole series of helper function mapping arrays, one
+// for each return type class as outlined above. Each has 11 possible
+// entries. Ones with 0 are ones which should never be selected.
+//
+// All the arrays are similar except for the one used when the return type is
+// none of sf, df, sc, dc; there we only care about entries which have sf or df
+// as a first parameter.
+//
+#define P_ "__mips16_call_stub_" // Common prefix of every helper name.
+#define MAX_STUB_NUMBER 10
+#define T1 P "1", P "2", 0, 0, P "5", P "6", 0, 0, P "9", P "10" // Entries for stub numbers 1..10; 0 fills the numbers 3, 4, 7, 8.
+#define T P "0" , T1 // Full 11-entry row, including stub number 0.
+#define P P_ // P is redefined before each table, so T and T1 expand with a different prefix each time.
+static char const * vMips16Helper[MAX_STUB_NUMBER+1] = // Return type is none of sf/df/sc/dc: no prefix, and no entry for stub 0.
+ {nullptr, T1 };
+#undef P
+#define P P_ "sf_"
+static char const * sfMips16Helper[MAX_STUB_NUMBER+1] = // float return
+ { T };
+#undef P
+#define P P_ "df_"
+static char const * dfMips16Helper[MAX_STUB_NUMBER+1] = // double return
+ { T };
+#undef P
+#define P P_ "sc_"
+static char const * scMips16Helper[MAX_STUB_NUMBER+1] = // {float, float} struct return
+ { T };
+#undef P
+#define P P_ "dc_"
+static char const * dcMips16Helper[MAX_STUB_NUMBER+1] = // {double, double} struct return
+ { T };
+#undef P
+#undef P_
+
+
+const char* Mips16TargetLowering:: // Picks the helper name from the return type (table) and the argument types (index); sets needHelper.
+ getMips16HelperFunction
+ (Type* RetTy, ArgListTy &Args, bool &needHelper) const {
+ const unsigned int stubNum = getMips16HelperFunctionStubNumber(Args);
+#ifndef NDEBUG
+ const unsigned int maxStubNum = 10; // Debug builds only: sanity-check the stub number before indexing the tables.
+ assert(stubNum <= maxStubNum);
+ const bool validStubNum[maxStubNum+1] =
+ {true, true, true, false, false, true, true, false, false, true, true}; // 3, 4, 7 and 8 cannot be produced.
+ assert(validStubNum[stubNum]);
+#endif
+ const char *result;
+ if (RetTy->isFloatTy()) {
+ result = sfMips16Helper[stubNum];
+ }
+ else if (RetTy ->isDoubleTy()) {
+ result = dfMips16Helper[stubNum];
+ }
+ else if (RetTy->isStructTy()) {
+ // check if it's complex: a two-element struct of two floats or two doubles
+ if (RetTy->getNumContainedTypes() == 2) {
+ if ((RetTy->getContainedType(0)->isFloatTy()) &&
+ (RetTy->getContainedType(1)->isFloatTy())) {
+ result = scMips16Helper[stubNum];
+ }
+ else if ((RetTy->getContainedType(0)->isDoubleTy()) &&
+ (RetTy->getContainedType(1)->isDoubleTy())) {
+ result = dcMips16Helper[stubNum];
+ }
+ else {
+ llvm_unreachable("Uncovered condition"); // Any other two-element struct is treated as impossible here.
+ }
+ }
+ else {
+ llvm_unreachable("Uncovered condition"); // Likewise any struct that does not have exactly two elements.
+ }
+ }
+ else {
+ if (stubNum == 0) { // No FP return and no leading FP argument: no helper is needed.
+ needHelper = false;
+ return "";
+ }
+ result = vMips16Helper[stubNum];
+ }
+ needHelper = true; // Every path that reaches this point selected a table entry.
+ return result;
+}
+
+void Mips16TargetLowering:: // Chooses the jump target (callee or Mips16 helper stub), pushes it onto Ops, then defers to the base class.
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+ std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+ bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+ bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+ SDValue Chain) const {
+ SelectionDAG &DAG = CLI.DAG;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+ const char* Mips16HelperFunction = nullptr; // Only meaningful when NeedMips16Helper ends up true.
+ bool NeedMips16Helper = false;
+
+ if (Subtarget.inMips16HardFloat()) {
+ //
+ // currently we don't have symbols tagged with the mips16 or mips32
+ // qualifier so we will assume that we don't know what kind it is.
+ // and generate the helper
+ //
+ bool LookupHelper = true; // Cleared when the callee is a known libcall or matches the intrinsic table.
+ if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
+ Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL, S->getSymbol() }; // Search key: only the name is compared.
+
+ if (std::binary_search(std::begin(HardFloatLibCalls),
+ std::end(HardFloatLibCalls), Find))
+ LookupHelper = false;
+ else {
+ const char *Symbol = S->getSymbol();
+ Mips16IntrinsicHelperType IntrinsicFind = { Symbol, "" };
+ const Mips16HardFloatInfo::FuncSignature *Signature =
+ Mips16HardFloatInfo::findFuncSignature(Symbol);
+ if (!IsPICCall && (Signature && (FuncInfo->StubsNeeded.find(Symbol) == // Non-PIC call to a symbol with a known signature, not yet recorded.
+ FuncInfo->StubsNeeded.end()))) {
+ FuncInfo->StubsNeeded[Symbol] = Signature;
+ //
+ // S2 is normally saved if the stub is for a function which
+ // returns a float or double value and is not otherwise. This is
+ // because more work is required after the function the stub
+ // is calling completes, and so the stub cannot directly return
+ // and the stub has no stack space to store the return address so
+ // S2 is used for that purpose.
+ // In order to take advantage of not saving S2, we need to also
+ // optimize the call in the stub and this requires some further
+ // functionality in MipsAsmPrinter which we don't have yet.
+ // So for now we always save S2. The optimization will be done
+ // in a follow-on patch.
+ //
+ if (1 || (Signature->RetSig != Mips16HardFloatInfo::NoFPRet)) // The leading 1 makes this always true, per the comment above.
+ FuncInfo->setSaveS2();
+ }
+ // one more look at list of intrinsics
+ const Mips16IntrinsicHelperType *Helper =
+ std::lower_bound(std::begin(Mips16IntrinsicHelper),
+ std::end(Mips16IntrinsicHelper), IntrinsicFind);
+ if (Helper != std::end(Mips16IntrinsicHelper) && // lower_bound only finds a position; confirm the name really matches.
+ *Helper == IntrinsicFind) {
+ Mips16HelperFunction = Helper->Helper;
+ NeedMips16Helper = true;
+ LookupHelper = false;
+ }
+
+ }
+ } else if (GlobalAddressSDNode *G =
+ dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+ Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL,
+ G->getGlobal()->getName().data() }; // NOTE(review): data() result is compared with strcmp; confirm it is NUL-terminated here.
+
+ if (std::binary_search(std::begin(HardFloatLibCalls),
+ std::end(HardFloatLibCalls), Find))
+ LookupHelper = false;
+ }
+ if (LookupHelper) // Fall back to deriving the helper from the return and argument types.
+ Mips16HelperFunction =
+ getMips16HelperFunction(CLI.RetTy, CLI.getArgs(), NeedMips16Helper);
+ }
+
+ SDValue JumpTarget = Callee;
+
+ // T9 should contain the address of the callee function if
+ // -relocation-model=pic or it is an indirect call.
+ if (IsPICCall || !GlobalOrExternal) {
+ unsigned V0Reg = Mips::V0;
+ if (NeedMips16Helper) { // With a helper, the real callee goes in V0 and the call targets the helper instead.
+ RegsToPass.push_front(std::make_pair(V0Reg, Callee));
+ JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction,
+ getPointerTy(DAG.getDataLayout()));
+ ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
+ JumpTarget = getAddrGlobal(S, CLI.DL, JumpTarget.getValueType(), DAG,
+ MipsII::MO_GOT, Chain,
+ FuncInfo->callPtrInfo(S->getSymbol()));
+ } else
+ RegsToPass.push_front(std::make_pair((unsigned)Mips::T9, Callee));
+ }
+
+ Ops.push_back(JumpTarget); // Pushed before the base class appends its own operands.
+
+ MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
+ InternalLinkage, IsCallReloc, CLI, Callee,
+ Chain);
+}
+
+MachineBasicBlock * // Expands a select pseudo that tests one register: branch Opc on operand 3, then a PHI of operands 1 and 2.
+Mips16TargetLowering::emitSel16(unsigned Opc, MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ if (DontExpandCondPseudos16) // Expansion disabled on the command line: leave MI in place.
+ return BB;
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator(); // Insertion point: immediately after BB.
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // (no separate compare is emitted for this form)
+ // Opc <MI operand 3>, sinkMBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ BuildMI(BB, DL, TII->get(Opc)) // Appended at the end of thisMBB; MI is still in the block until erased below.
+ .addReg(MI.getOperand(3).getReg())
+ .addMBB(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+ // ...
+ BB = sinkMBB;
+
+ BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg()) // Operand 0 is the result, 1 the value from thisMBB, 2 the value from copy0MBB.
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB; // Returns sinkMBB, which now holds the rest of the original block.
+}
+
+MachineBasicBlock * // Expands a select pseudo comparing two registers: Opc2 is the compare, Opc1 the branch.
+Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ if (DontExpandCondPseudos16) // Expansion disabled on the command line: leave MI in place.
+ return BB;
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator(); // Insertion point: immediately after BB.
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // Opc2 <MI operand 3>, <MI operand 4>
+ // Opc1 sinkMBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ BuildMI(BB, DL, TII->get(Opc2)) // Compare of the two register operands.
+ .addReg(MI.getOperand(3).getReg())
+ .addReg(MI.getOperand(4).getReg());
+ BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB); // Branch takes only a target; it has no register operand.
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+ // ...
+ BB = sinkMBB;
+
+ BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg()) // Operand 0 is the result, 1 the value from thisMBB, 2 the value from copy0MBB.
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB; // Returns sinkMBB, which now holds the rest of the original block.
+
+}
+
+MachineBasicBlock * // Expands a select pseudo comparing a register with an immediate: Opc2 is the compare, Opc1 the branch.
+Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ if (DontExpandCondPseudos16) // Expansion disabled on the command line: leave MI in place.
+ return BB;
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator(); // Insertion point: immediately after BB.
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // Opc2 <MI operand 3>, <immediate from MI operand 4>
+ // Opc1 sinkMBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ BuildMI(BB, DL, TII->get(Opc2)) // Compare of a register against an immediate.
+ .addReg(MI.getOperand(3).getReg())
+ .addImm(MI.getOperand(4).getImm());
+ BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB); // Branch takes only a target; it has no register operand.
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+ // ...
+ BB = sinkMBB;
+
+ BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg()) // Operand 0 is the result, 1 the value from thisMBB, 2 the value from copy0MBB.
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB; // Returns sinkMBB, which now holds the rest of the original block.
+
+}
+
+MachineBasicBlock * // Replaces a reg/reg compare-and-branch pseudo with CmpOpc followed by BtOpc, in the same block.
+Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ if (DontExpandCondPseudos16) // Expansion disabled on the command line: leave MI in place.
+ return BB;
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ unsigned regX = MI.getOperand(0).getReg(); // Operands: 0 and 1 are the compared registers, 2 is the branch target.
+ unsigned regY = MI.getOperand(1).getReg();
+ MachineBasicBlock *target = MI.getOperand(2).getMBB();
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(CmpOpc)) // Both new instructions are inserted before MI.
+ .addReg(regX)
+ .addReg(regY);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB; // No new blocks are created.
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins( // Replaces a reg/imm compare-and-branch pseudo with a compare (short or extended form) plus BtOpc.
+ unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+ if (DontExpandCondPseudos16) // Expansion disabled on the command line: leave MI in place.
+ return BB;
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ unsigned regX = MI.getOperand(0).getReg(); // Operands: 0 is the register, 1 the immediate, 2 the branch target.
+ int64_t imm = MI.getOperand(1).getImm();
+ MachineBasicBlock *target = MI.getOperand(2).getMBB();
+ unsigned CmpOpc;
+ if (isUInt<8>(imm)) // Fits in 8 unsigned bits: use the short opcode, whatever ImmSigned says.
+ CmpOpc = CmpiOpc;
+ else if ((!ImmSigned && isUInt<16>(imm)) || // Otherwise the extended opcode, if the value fits in 16 bits of the requested signedness.
+ (ImmSigned && isInt<16>(imm)))
+ CmpOpc = CmpiXOpc;
+ else
+ llvm_unreachable("immediate field not usable");
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addImm(imm); // Both new instructions are inserted before MI.
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB; // No new blocks are created.
+}
+
+static unsigned Mips16WhichOp8uOr16simm // Picks between a short and a long opcode according to the size of Imm.
+ (unsigned shortOp, unsigned longOp, int64_t Imm) {
+ if (isUInt<8>(Imm)) // Fits in 8 unsigned bits.
+ return shortOp;
+ else if (isInt<16>(Imm)) // Otherwise must fit in 16 signed bits.
+ return longOp;
+ else
+ llvm_unreachable("immediate field not usable");
+}
+
+MachineBasicBlock * // Replaces a reg/reg set-on-less-than pseudo with SltOpc followed by a move of T8 into the result register.
+Mips16TargetLowering::emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ if (DontExpandCondPseudos16) // Expansion disabled on the command line: leave MI in place.
+ return BB;
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ unsigned CC = MI.getOperand(0).getReg(); // Operands: 0 is the result register, 1 and 2 are the compared registers.
+ unsigned regX = MI.getOperand(1).getReg();
+ unsigned regY = MI.getOperand(2).getReg();
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc)) // Both new instructions are inserted before MI.
+ .addReg(regX)
+ .addReg(regY);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Mips::MoveR3216), CC) // Copies T8 into CC; the compare itself names no destination.
+ .addReg(Mips::T8);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB; // No new blocks are created.
+}
+
+MachineBasicBlock * // Replaces a reg/imm set-on-less-than pseudo with a short or extended compare plus a move of T8 into the result.
+Mips16TargetLowering::emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ if (DontExpandCondPseudos16) // Expansion disabled on the command line: leave MI in place.
+ return BB;
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ unsigned CC = MI.getOperand(0).getReg(); // Operands: 0 is the result register, 1 the compared register, 2 the immediate.
+ unsigned regX = MI.getOperand(1).getReg();
+ int64_t Imm = MI.getOperand(2).getImm();
+ unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm); // Short form for 8-bit unsigned immediates, extended form otherwise.
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc)).addReg(regX).addImm(Imm); // Both new instructions are inserted before MI.
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Mips::MoveR3216), CC) // Copies T8 into CC; the compare itself names no destination.
+ .addReg(Mips::T8);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB; // No new blocks are created.
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h
new file mode 100644
index 000000000000..0ee0b816ef70
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h
@@ -0,0 +1,82 @@
+//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16ISELLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16ISELLOWERING_H
+
+#include "MipsISelLowering.h"
+
+namespace llvm {
+ class Mips16TargetLowering : public MipsTargetLowering { // mips16 specialization of the Mips lowering class
+ public:
+ explicit Mips16TargetLowering(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI);
+
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+ unsigned Align,
+ bool *Fast) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override; // Returns the block in which insertion should continue.
+
+ private:
+ bool isEligibleForTailCallOptimization(
+ const CCState &CCInfo, unsigned NextStackOffset,
+ const MipsFunctionInfo &FI) const override;
+
+ void setMips16HardFloatLibCalls(); // Non-const: called from the constructor's setup path.
+
+ unsigned int
+ getMips16HelperFunctionStubNumber(ArgListTy &Args) const; // Numeric suffix of the helper stub name.
+
+ const char *getMips16HelperFunction
+ (Type* RetTy, ArgListTy &Args, bool &needHelper) const; // needHelper is an output flag.
+
+ void
+ getOpndList(SmallVectorImpl<SDValue> &Ops,
+ std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+ bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+ bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+ SDValue Chain) const override;
+
+ MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr &MI, // The emit* helpers below expand one pseudo instruction each.
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitSeliT16(unsigned Opc1, unsigned Opc2,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitSelT16(unsigned Opc1, unsigned Opc2,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitFEXT_T8I8I16_ins(unsigned BtOpc, unsigned CmpiOpc,
+ unsigned CmpiXOpc, bool ImmSigned,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td b/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td
new file mode 100644
index 000000000000..4ff68bef957e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td
@@ -0,0 +1,640 @@
+//===- Mips16InstrFormats.td - Mips Instruction Formats ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe MIPS instructions format
+//
+// CPU INSTRUCTION FORMATS
+//
+// funct or f Function field
+//
+// immediate 4-,5-,8- or 11-bit immediate, branch displacement, or
+// or imm address displacement
+//
+// op 5-bit major operation code
+//
+// rx 3-bit source or destination register
+//
+// ry 3-bit source or destination register
+//
+// rz 3-bit source or destination register
+//
+// sa 3- or 5-bit shift amount
+//
+//===----------------------------------------------------------------------===//
+
+
+// Base class for Mips 16 Format
+// This class does not depend on the instruction size
+//
+class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>: Instruction
+{
+
+ let Namespace = "Mips";
+
+ let OutOperandList = outs;
+ let InOperandList = ins;
+
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Itinerary = itin;
+
+ let Predicates = [InMips16Mode];
+}
+
+//
+// Generic Mips 16 Format
+//
+class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>:
+ MipsInst16_Base<outs, ins, asmstr, pattern, itin>
+{
+ field bits<16> Inst;
+ bits<5> Opcode = 0;
+
+ // Top 5 bits are the 'opcode' field
+ let Inst{15-11} = Opcode;
+
+ let Size=2;
+ field bits<16> SoftFail = 0;
+}
+
+//
+// For 32 bit extended instruction forms.
+//
+class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>:
+ MipsInst16_Base<outs, ins, asmstr, pattern, itin>
+{
+ field bits<32> Inst;
+
+ let Size=4;
+ field bits<32> SoftFail = 0;
+}
+
+class MipsInst16_EXTEND<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>:
+ MipsInst16_32<outs, ins, asmstr, pattern, itin>
+{
+ let Inst{31-27} = 0b11110;
+}
+
+
+
+// Mips Pseudo Instructions Format
+class MipsPseudo16<dag outs, dag ins, string asmstr, list<dag> pattern>:
+ MipsInst16<outs, ins, asmstr, pattern, IIPseudo> {
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|imm11|>
+//===----------------------------------------------------------------------===//
+
+class FI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<11> imm11;
+
+ let Opcode = op;
+
+ let Inst{10-0} = imm11;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RI instruction class in Mips : <|opcode|rx|imm8|>
+//===----------------------------------------------------------------------===//
+
+class FRI16<bits<5> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> rx;
+ bits<8> imm8;
+
+ let Opcode = op;
+
+ let Inst{10-8} = rx;
+ let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RR instruction class in Mips : <|opcode|rx|ry|funct|>
+//===----------------------------------------------------------------------===//
+
+class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<5> funct;
+
+ let Opcode = 0b11101;
+ let funct = _funct;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-0} = funct;
+}
+
+class FRRBreak16<dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<6> Code;
+ bits<5> funct;
+
+ let Opcode = 0b11101;
+ let funct = 0b00101;
+
+ let Inst{10-5} = Code;
+ let Inst{4-0} = funct;
+}
+
+//
+// For conversion functions.
+//
+class FRR_SF16<bits<5> _funct, bits<3> _subfunct, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> rx;
+ bits<3> subfunct;
+ bits<5> funct;
+
+ let Opcode = 0b11101; // RR
+ let funct = _funct;
+ let subfunct = _subfunct;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = subfunct;
+ let Inst{4-0} = funct;
+}
+
+//
+// just used for breakpoint (hardware and software) instructions.
+//
+class FC16<bits<5> _funct, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<6> _code; // code is a keyword in tablegen
+ bits<5> funct;
+
+ let Opcode = 0b11101; // RR
+ let funct = _funct;
+
+ let Inst{10-5} = _code;
+ let Inst{4-0} = funct;
+}
+
+//
+// J(AL)R(C) subformat
+//
+class FRR16_JALRC<bits<1> _nd, bits<1> _l, bits<1> r_a,
+ dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> rx;
+ bits<1> nd;
+ bits<1> l;
+ bits<1> ra;
+
+ let nd = _nd;
+ let l = _l;
+ let ra = r_a;
+
+ let Opcode = 0b11101;
+
+ let Inst{10-8} = rx;
+ let Inst{7} = nd;
+ let Inst{6} = l;
+ let Inst{5} = ra;
+ let Inst{4-0} = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI instruction class in Mips : <|opcode|rx|ry|imm5|>
+//===----------------------------------------------------------------------===//
+
+class FRRI16<bits<5> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<5> imm5;
+
+ let Opcode = op;
+
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-0} = imm5;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRR instruction class in Mips : <|opcode|rx|ry|rz|f|>
+//===----------------------------------------------------------------------===//
+
+class FRRR16<bits<2> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<3> rz;
+ bits<2> f;
+
+ let Opcode = 0b11100;
+ let f = _f;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-2} = rz;
+ let Inst{1-0} = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI-A instruction class in Mips : <|opcode|rx|ry|f|imm4|>
+//===----------------------------------------------------------------------===//
+
+class FRRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<1> f;
+ bits<4> imm4;
+
+ let Opcode = 0b01000;
+ let f = _f;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4} = f;
+ let Inst{3-0} = imm4;
+}
+
+//===----------------------------------------------------------------------===//
+// Format Shift instruction class in Mips : <|opcode|rx|ry|sa|f|>
+//===----------------------------------------------------------------------===//
+
+class FSHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<3> sa;
+ bits<2> f;
+
+ let Opcode = 0b00110;
+ let f = _f;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-2} = sa;
+ let Inst{1-0} = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8 instruction class in Mips : <|opcode|funct|imm8>
+//===----------------------------------------------------------------------===//
+
+class FI816<bits<3> _func, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> func;
+ bits<8> imm8;
+
+ let Opcode = 0b01100;
+ let func = _func;
+
+ let Inst{10-8} = func;
+ let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOVR32 instruction class in Mips : <|opcode|func|ry|r32>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOVR3216<dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  // ry is a 3-bit MIPS16 register; r32 can name any of the 32 GPRs (5 bits).
+  bits<3> ry;
+  bits<5> r32;
+
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b111;
+  let Inst{7-5} = ry;
+  let Inst{4-0} = r32;
+
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOV32R instruction class in Mips : <|opcode|func|r32|rz>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOV32R16<dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ // func is declared but not encoded here; Inst{10-8} is a fixed value below.
+ bits<3> func;
+ bits<5> r32;
+ bits<3> rz;
+
+ // r32 is split: its low three bits go in Inst{7-5}, its high two in Inst{4-3}.
+ let Opcode = 0b01100;
+
+ let Inst{10-8} = 0b101;
+ let Inst{7-5} = r32{2-0};
+ let Inst{4-3} = r32{4-3};
+ let Inst{2-0} = rz;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_SVRS instruction class in Mips :
+// <|opcode|svrs|s|ra|s0|s1|framesize>
+//===----------------------------------------------------------------------===//
+
+class FI8_SVRS16<bits<1> _s, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+ bits<1> s;
+ bits<1> ra = 0;
+ bits<1> s0 = 0;
+ bits<1> s1 = 0;
+ bits<4> framesize = 0;
+
+ let s =_s;
+ let Opcode = 0b01100;
+
+ let Inst{10-8} = 0b100;
+ let Inst{7} = s;
+ let Inst{6} = ra;
+ let Inst{5} = s0;
+ let Inst{4} = s1;
+ let Inst{3-0} = framesize;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format JAL instruction class in Mips16 :
+// <|JAL|X|imm20:16|imm25:21|imm15:0>
+//===----------------------------------------------------------------------===//
+
+class FJAL16<bits<1> _X, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_32<outs, ins, asmstr, pattern, itin>
+{
+ bits<1> X;
+ bits<26> imm26;
+
+
+ let X = _X;
+
+ let Inst{31-27} = 0b00011;
+ let Inst{26} = X;
+ let Inst{25-21} = imm26{20-16};
+ let Inst{20-16} = imm26{25-21};
+ let Inst{15-0} = imm26{15-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I instruction class in Mips16 :
+// <|EXTEND|imm10:5|imm15:11|op|0|0|0|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I16<bits<5> _eop, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+ bits<16> imm16;
+ bits<5> eop;
+
+ let eop = _eop;
+
+ let Inst{26-21} = imm16{10-5};
+ let Inst{20-16} = imm16{15-11};
+ let Inst{15-11} = eop;
+ let Inst{10-5} = 0;
+ let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format ASMACRO instruction class in Mips16 :
+// <EXTEND|select|p4|p3|RRR|p2|p1|p0>
+//===----------------------------------------------------------------------===//
+
+class FASMACRO16<dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> select;
+ bits<3> p4;
+ bits<5> p3;
+ bits<5> RRR = 0b11100;
+ bits<3> p2;
+ bits<3> p1;
+ bits<5> p0;
+
+
+ let Inst{26-24} = select;
+ let Inst{23-21} = p4;
+ let Inst{20-16} = p3;
+ let Inst{15-11} = RRR;
+ let Inst{10-8} = p2;
+ let Inst{7-5} = p1;
+ let Inst{4-0} = p0;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RI instruction class in Mips16 :
+// <|EXTEND|imm10:5|imm15:11|op|rx|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RI16<bits<5> _op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+ bits<16> imm16;
+ bits<5> op;
+ bits<3> rx;
+
+ let op = _op;
+
+ let Inst{26-21} = imm16{10-5};
+ let Inst{20-16} = imm16{15-11};
+ let Inst{15-11} = op;
+ let Inst{10-8} = rx;
+ let Inst{7-5} = 0;
+ let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI instruction class in Mips16 :
+// <|EXTEND|imm10:5|imm15:11|op|rx|ry|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI16<bits<5> _op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+ bits<5> op;
+ bits<16> imm16;
+ bits<3> rx;
+ bits<3> ry;
+
+ let op=_op;
+
+ let Inst{26-21} = imm16{10-5};
+ let Inst{20-16} = imm16{15-11};
+ let Inst{15-11} = op;
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI-A instruction class in Mips16 :
+// <|EXTEND|imm10:4|imm14:11|RRI-A|rx|ry|f|imm3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+ bits<15> imm15;
+ bits<3> rx;
+ bits<3> ry;
+ bits<1> f;
+
+ let f = _f;
+
+ let Inst{26-20} = imm15{10-4};
+ let Inst{19-16} = imm15{14-11};
+ let Inst{15-11} = 0b01000;
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4} = f;
+ let Inst{3-0} = imm15{3-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-SHIFT instruction class in Mips16 :
+// <|EXTEND|sa 4:0|s5|0|SHIFT|rx|ry|0|f>
+//===----------------------------------------------------------------------===//
+
+class FEXT_SHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+ bits<6> sa6;
+ bits<3> rx;
+ bits<3> ry;
+ bits<2> f;
+
+ let f = _f;
+
+ let Inst{26-22} = sa6{4-0};
+ let Inst{21} = sa6{5};
+ let Inst{20-16} = 0;
+ let Inst{15-11} = 0b00110;
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-2} = 0;
+ let Inst{1-0} = f;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8 instruction class in Mips16 :
+// <|EXTEND|imm10:5|imm15:11|I8|funct|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I816<bits<3> _funct, dag outs, dag ins, string asmstr,
+                list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> I8;
+  bits<3> funct;
+
+  let funct = _funct;
+  let I8 = 0b01100;   // I8 major opcode, as in FI816 and FEXT_I8_SVRS16
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = I8;
+  let Inst{10-8} = funct;
+  let Inst{7-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8_SVRS instruction class in Mips16 :
+// <|EXTEND|xsregs|framesize7:4|aregs|I8|SVRS|s|ra|s0|s1|framesize3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I8_SVRS16<bits<1> s_, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+ bits<3> xsregs =0;
+ bits<8> framesize =0;
+ bits<3> aregs =0;
+ bits<5> I8 = 0b01100;
+ bits<3> SVRS = 0b100;
+ bits<1> s;
+ bits<1> ra = 0;
+ bits<1> s0 = 0;
+ bits<1> s1 = 0;
+
+ let s= s_;
+
+ let Inst{26-24} = xsregs;
+ let Inst{23-20} = framesize{7-4};
+ let Inst{19} = 0;
+ let Inst{18-16} = aregs;
+ let Inst{15-11} = I8;
+ let Inst{10-8} = SVRS;
+ let Inst{7} = s;
+ let Inst{6} = ra;
+ let Inst{5} = s0;
+ let Inst{4} = s1;
+ let Inst{3-0} = framesize{3-0};
+
+
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
new file mode 100644
index 000000000000..35ef31749f40
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -0,0 +1,519 @@
+//===-- Mips16InstrInfo.cpp - Mips16 Instruction Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+#include "Mips16InstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips16-instrinfo"
+
+Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI)
+ : MipsInstrInfo(STI, Mips::Bimm16), RI() {}
+
+const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
+ return RI;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This Mips16 implementation recognizes no instruction: it
+/// always returns 0 and leaves FrameIndex untouched.
+unsigned Mips16InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the stored stack slot. If
+/// not, return 0. This Mips16 implementation recognizes no instruction: it
+/// always returns 0 and leaves FrameIndex untouched.
+unsigned Mips16InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ return 0;
+}
+
+void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  const DebugLoc &DL, unsigned DestReg,
+                                  unsigned SrcReg, bool KillSrc) const {
+  // Pick the opcode from the registers involved. For copies out of HI0/LO0
+  // SrcReg is cleared so that no source operand is added to the instruction
+  // built below.
+  const bool DestIsCPU16 = Mips::CPU16RegsRegClass.contains(DestReg);
+  unsigned Opc = 0;
+
+  if (DestIsCPU16 && Mips::GPR32RegClass.contains(SrcReg)) {
+    Opc = Mips::MoveR3216;
+  } else if (Mips::GPR32RegClass.contains(DestReg) &&
+             Mips::CPU16RegsRegClass.contains(SrcReg)) {
+    Opc = Mips::Move32R16;
+  } else if (DestIsCPU16 && SrcReg == Mips::HI0) {
+    Opc = Mips::Mfhi16;
+    SrcReg = 0;
+  } else if (DestIsCPU16 && SrcReg == Mips::LO0) {
+    Opc = Mips::Mflo16;
+    SrcReg = 0;
+  }
+
+  assert(Opc && "Cannot copy registers");
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+  if (DestReg)
+    MIB.addReg(DestReg, RegState::Define);
+  if (SrcReg)
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned SrcReg, bool isKill, int FI,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI,
+                                      int64_t Offset) const {
+  // Only CPU16Regs (or a subclass of it) is handled, using SwRxSpImmX16.
+  const DebugLoc DL = (I != MBB.end()) ? I->getDebugLoc() : DebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+  const unsigned Opc =
+      Mips::CPU16RegsRegClass.hasSubClassEq(RC) ? Mips::SwRxSpImmX16 : 0;
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc))
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addFrameIndex(FI).addImm(Offset)
+      .addMemOperand(MMO);
+}
+
+void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned DestReg, int FI,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI,
+                                       int64_t Offset) const {
+  // Only CPU16Regs (or a subclass of it) is handled, using LwRxSpImmX16.
+  const DebugLoc DL = (I != MBB.end()) ? I->getDebugLoc() : DebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+  const unsigned Opc =
+      Mips::CPU16RegsRegClass.hasSubClassEq(RC) ? Mips::LwRxSpImmX16 : 0;
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc), DestReg)
+      .addFrameIndex(FI)
+      .addImm(Offset)
+      .addMemOperand(MMO);
+}
+
+bool Mips16InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  // RetRA16 is the only pseudo expanded here; for any other opcode report
+  // that nothing was done.
+  if (MI.getDesc().getOpcode() != Mips::RetRA16)
+    return false;
+
+  // ExpandRetRA16 inserts the JrcRa16 in front of MI; the pseudo itself is
+  // then removed from its block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  ExpandRetRA16(MBB, MI, Mips::JrcRa16);
+  MBB.erase(MI.getIterator());
+  return true;
+}
+
+/// getOppositeBranchOpc - Return the inverse of the specified Mips16 branch
+/// opcode, e.g. turning BeqzRxImm16 into BnezRxImm16.
+unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
+ switch (Opc) {
+ case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
+ case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16;
+ case Mips::BeqzRxImm16: return Mips::BnezRxImm16;
+ case Mips::BnezRxImm16: return Mips::BeqzRxImm16;
+ case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16;
+ case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16;
+ case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16;
+ case Mips::Btnez16: return Mips::Bteqz16;
+ case Mips::BtnezX16: return Mips::BteqzX16;
+ case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16;
+ case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16;
+ case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16;
+ case Mips::Bteqz16: return Mips::Btnez16;
+ case Mips::BteqzX16: return Mips::BtnezX16;
+ case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16;
+ case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16;
+ case Mips::BteqzT8SltiuX16: return Mips::BtnezT8SltiuX16;
+ case Mips::BtnezT8CmpX16: return Mips::BteqzT8CmpX16;
+ case Mips::BtnezT8SltX16: return Mips::BteqzT8SltX16;
+ case Mips::BtnezT8SltiX16: return Mips::BteqzT8SltiX16;
+ }
+ llvm_unreachable("Illegal opcode!");
+}
+
+// Append the callee-saved registers listed in CSI to MIB as register
+// operands tagged with Flags. CSI is walked from its last entry to its
+// first. Only RA, S0, S1 and S2 are expected in CSI; any other register
+// reaches llvm_unreachable.
+static void addSaveRestoreRegs(MachineInstrBuilder &MIB,
+                               const std::vector<CalleeSavedInfo> &CSI,
+                               unsigned Flags = 0) {
+  for (unsigned Idx = CSI.size(); Idx-- != 0;) {
+    const unsigned Reg = CSI[Idx].getReg();
+
+    // S2 is accepted but never added here; makeFrame and restoreFrame
+    // append it themselves when they need it.
+    if (Reg == Mips::S2)
+      continue;
+
+    // RA, S0 and S1 are the only other registers expected in CSI.
+    const bool Expected =
+        Reg == Mips::RA || Reg == Mips::S0 || Reg == Mips::S1;
+    if (!Expected)
+      llvm_unreachable("unexpected mips16 callee saved register");
+
+    MIB.addReg(Reg, Flags);
+  }
+}
+// Adjust SP by FrameSize bytes. Save RA, S0, S1 (and S2 when it is reserved).
+void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const {
+  DebugLoc DL;
+  MachineFunction &MF = *MBB.getParent();
+  const BitVector Reserved = RI.getReservedRegs(MF);
+  const bool SaveS2 = Reserved[Mips::S2];
+  // Save16 is used only for frames of at most 128 bytes that do not save S2.
+  const bool UseSave16 = FrameSize <= 128 && !SaveS2;
+  MachineInstrBuilder MIB =
+      BuildMI(MBB, I, DL, get(UseSave16 ? Mips::Save16 : Mips::SaveX16));
+  addSaveRestoreRegs(MIB, MF.getFrameInfo().getCalleeSavedInfo());
+  if (SaveS2)
+    MIB.addReg(Mips::S2);
+  if (isUInt<11>(FrameSize)) {
+    MIB.addImm(FrameSize);
+    return;
+  }
+  // FrameSize does not fit in 11 unsigned bits: the save instruction takes
+  // 2040 bytes and -Remainder is applied to SP by separate instructions.
+  const int Base = 2040;
+  const int64_t Remainder = FrameSize - Base;
+  MIB.addImm(Base);
+  if (isInt<16>(-Remainder))
+    BuildAddiuSpImm(MBB, I, -Remainder);
+  else
+    adjustStackPtrBig(SP, -Remainder, MBB, I, Mips::V0, Mips::V1);
+}
+
+// Adjust SP by FrameSize bytes. Restore RA, S0, S1 (and S2 when reserved).
+void Mips16InstrInfo::restoreFrame(unsigned SP, int64_t FrameSize,
+                                   MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I) const {
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+  MachineFunction *MF = MBB.getParent();
+  const BitVector Reserved = RI.getReservedRegs(*MF);
+  const bool SaveS2 = Reserved[Mips::S2];
+  // The opcode is chosen from the full FrameSize, before it is capped below.
+  const unsigned Opc =
+      (FrameSize > 128 || SaveS2) ? Mips::RestoreX16 : Mips::Restore16;
+  if (!isUInt<11>(FrameSize)) {
+    // Too large for the restore immediate: the part above 2040 bytes is
+    // added to SP first, and the restore instruction handles the last 2040.
+    const unsigned Base = 2040;
+    const int64_t Remainder = FrameSize - Base;
+    FrameSize = Base;
+    if (isInt<16>(Remainder))
+      BuildAddiuSpImm(MBB, I, Remainder);
+    else
+      adjustStackPtrBig(SP, Remainder, MBB, I, Mips::A0, Mips::A1);
+  }
+  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+  addSaveRestoreRegs(MIB, MF->getFrameInfo().getCalleeSavedInfo(),
+                     RegState::Define);
+  if (SaveS2)
+    MIB.addReg(Mips::S2, RegState::Define);
+  MIB.addImm(FrameSize);
+}
+
+// Adjust SP by Amount bytes where bytes can be up to 32bit number.
+// Reg1 and Reg2 are used as scratch registers and are overwritten, so this
+// can only be called at times when both are known to be free.
+// The SP parameter is not consulted; Mips::SP is used directly.
+//
+void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned Reg1, unsigned Reg2) const {
+  DebugLoc DL;
+
+  // li reg1, constant
+  BuildMI(MBB, I, DL, get(Mips::LwConstant32), Reg1)
+      .addImm(Amount)
+      .addImm(-1);
+  // move reg2, sp
+  BuildMI(MBB, I, DL, get(Mips::MoveR3216), Reg2)
+      .addReg(Mips::SP, RegState::Kill);
+
+  // add reg1, reg1, reg2
+  BuildMI(MBB, I, DL, get(Mips::AdduRxRyRz16), Reg1)
+      .addReg(Reg1)
+      .addReg(Reg2, RegState::Kill);
+
+  // move sp, reg1
+  BuildMI(MBB, I, DL, get(Mips::Move32R16), Mips::SP)
+      .addReg(Reg1, RegState::Kill);
+}
+
+void Mips16InstrInfo::adjustStackPtrBigUnrestricted(
+ unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ llvm_unreachable("adjust stack pointer amount exceeded");
+}
+
+/// Adjust SP by Amount bytes (nothing is emitted when Amount is 0).
+void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  if (Amount == 0)
+    return;
+  if (!isInt<16>(Amount)) {
+    adjustStackPtrBigUnrestricted(SP, Amount, MBB, I);
+    return;
+  }
+  BuildAddiuSpImm(MBB, I, Amount);
+}
+
+/// Emit, before II, code that computes FrameReg + Imm into a CPU16 register
+/// and return that register. NewImm is set to 0 on return.
+unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II,
+ const DebugLoc &DL,
+ unsigned &NewImm) const {
+ //
+ // given original instruction is:
+ // Instr rx, T[offset] where offset is too big.
+ //
+ // let T = temporary register (the returned Reg)
+ // LwConstant32 T, offset (the full Imm, not just a high part)
+ // AdduRxRyRz16 T, FrameReg, T
+ //
+ // When FrameReg is SP, SP is first copied into a second temporary and
+ // that copy is added instead. If no register is free, a candidate is
+ // parked in T0 (and T1 for the second temporary) and restored after II.
+ //
+ RegScavenger rs;
+ int32_t lo = Imm & 0xFFFF;
+ NewImm = lo; // overwritten with 0 further down
+ int Reg =0;
+ int SpReg = 0;
+
+ rs.enterBasicBlock(MBB);
+ rs.forward(II);
+ //
+ // We need to know which registers can be used, in the case where there
+ // are not enough free registers. We exclude all registers that
+ // are used in the instruction that we are helping.
+ // Consider all allocatable registers in the register class initially
+ BitVector Candidates =
+ RI.getAllocatableSet
+ (*II->getParent()->getParent(), &Mips::CPU16RegsRegClass);
+ // Exclude all the registers being used by the instruction.
+ for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = II->getOperand(i);
+ if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ Candidates.reset(MO.getReg());
+ }
+
+ // If the same register was used and defined in an instruction, then
+ // it will not be in the list of candidates.
+ //
+ // we need to analyze the instruction that we are helping.
+ // we need to know if it defines register x but register x is not
+ // present as an operand of the instruction. this tells
+ // whether the register is live before the instruction. if it's not
+ // then we don't need to save it in case there are no free registers.
+ int DefReg = 0;
+ for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = II->getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ DefReg = MO.getReg();
+ break;
+ }
+ }
+
+ BitVector Available = rs.getRegsAvailable(&Mips::CPU16RegsRegClass);
+ Available &= Candidates;
+ //
+ // we use T0 for the first register, if we need to save something away.
+ // we use T1 for the second register, if we need to save something away.
+ //
+ unsigned FirstRegSaved =0, SecondRegSaved=0;
+ unsigned FirstRegSavedTo = 0, SecondRegSavedTo = 0;
+
+ Reg = Available.find_first();
+
+ if (Reg == -1) {
+ Reg = Candidates.find_first();
+ Candidates.reset(Reg);
+ if (DefReg != Reg) {
+ FirstRegSaved = Reg;
+ FirstRegSavedTo = Mips::T0;
+ copyPhysReg(MBB, II, DL, FirstRegSavedTo, FirstRegSaved, true);
+ }
+ }
+ else
+ Available.reset(Reg);
+ BuildMI(MBB, II, DL, get(Mips::LwConstant32), Reg).addImm(Imm).addImm(-1);
+ NewImm = 0;
+ if (FrameReg == Mips::SP) {
+ SpReg = Available.find_first();
+ if (SpReg == -1) {
+ SpReg = Candidates.find_first();
+ // Candidates.reset(SpReg); // not really needed
+ if (DefReg!= SpReg) {
+ SecondRegSaved = SpReg;
+ SecondRegSavedTo = Mips::T1;
+ }
+ if (SecondRegSaved)
+ copyPhysReg(MBB, II, DL, SecondRegSavedTo, SecondRegSaved, true);
+ }
+ else
+ Available.reset(SpReg);
+ copyPhysReg(MBB, II, DL, SpReg, Mips::SP, false);
+ BuildMI(MBB, II, DL, get(Mips:: AdduRxRyRz16), Reg).addReg(SpReg, RegState::Kill)
+ .addReg(Reg);
+ }
+ else
+ BuildMI(MBB, II, DL, get(Mips:: AdduRxRyRz16), Reg).addReg(FrameReg)
+ .addReg(Reg, RegState::Kill);
+ if (FirstRegSaved || SecondRegSaved) {
+ II = std::next(II);
+ if (FirstRegSaved)
+ copyPhysReg(MBB, II, DL, FirstRegSaved, FirstRegSavedTo, true);
+ if (SecondRegSaved)
+ copyPhysReg(MBB, II, DL, SecondRegSaved, SecondRegSavedTo, true);
+ }
+ return Reg;
+}
+
+unsigned Mips16InstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
+ return (Opc == Mips::BeqzRxImmX16 || Opc == Mips::BimmX16 ||
+ Opc == Mips::Bimm16 ||
+ Opc == Mips::Bteqz16 || Opc == Mips::Btnez16 ||
+ Opc == Mips::BeqzRxImm16 || Opc == Mips::BnezRxImm16 ||
+ Opc == Mips::BnezRxImmX16 || Opc == Mips::BteqzX16 ||
+ Opc == Mips::BteqzT8CmpX16 || Opc == Mips::BteqzT8CmpiX16 ||
+ Opc == Mips::BteqzT8SltX16 || Opc == Mips::BteqzT8SltuX16 ||
+ Opc == Mips::BteqzT8SltiX16 || Opc == Mips::BteqzT8SltiuX16 ||
+ Opc == Mips::BtnezX16 || Opc == Mips::BtnezT8CmpX16 ||
+ Opc == Mips::BtnezT8CmpiX16 || Opc == Mips::BtnezT8SltX16 ||
+ Opc == Mips::BtnezT8SltuX16 || Opc == Mips::BtnezT8SltiX16 ||
+ Opc == Mips::BtnezT8SltiuX16 ) ? Opc : 0;
+}
+
+void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned Opc) const {
+ BuildMI(MBB, I, I->getDebugLoc(), get(Opc));
+}
+
+const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const {
+  // AddiuSpImm16 when validSpImm8 accepts Imm, AddiuSpImmX16 otherwise.
+  const unsigned Opc = validSpImm8(Imm) ? Mips::AddiuSpImm16
+                                        : Mips::AddiuSpImmX16;
+  return get(Opc);
+}
+
+void Mips16InstrInfo::BuildAddiuSpImm
+ (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const {
+ DebugLoc DL;
+ BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm);
+}
+
+const MipsInstrInfo *llvm::createMips16InstrInfo(const MipsSubtarget &STI) {
+ return new Mips16InstrInfo(STI);
+}
+
+bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg,
+ int64_t Amount) {
+ switch (Opcode) {
+ case Mips::LbRxRyOffMemX16:
+ case Mips::LbuRxRyOffMemX16:
+ case Mips::LhRxRyOffMemX16:
+ case Mips::LhuRxRyOffMemX16:
+ case Mips::SbRxRyOffMemX16:
+ case Mips::ShRxRyOffMemX16:
+ case Mips::LwRxRyOffMemX16:
+ case Mips::SwRxRyOffMemX16:
+ case Mips::SwRxSpImmX16:
+ case Mips::LwRxSpImmX16:
+ return isInt<16>(Amount);
+ case Mips::AddiuRxRyOffMemX16:
+ if ((Reg == Mips::PC) || (Reg == Mips::SP))
+ return isInt<16>(Amount);
+ return isInt<15>(Amount);
+ }
+ llvm_unreachable("unexpected Opcode in validImmediate");
+}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// We implement the special case of the .space directive taking only an
+/// integer argument, which is the size in bytes. This is used for creating
+/// inline code spacing for testing purposes using inline assembly.
+///
+unsigned Mips16InstrInfo::getInlineAsmLength(const char *Str,
+                                             const MCAsmInfo &MAI) const {
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0)
+      atInsnStart = true;
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      if (strncmp(Str, ".space", 6)==0) {
+        char *EStr; int Sz;
+        Sz = strtol(Str+6, &EStr, 10);
+        // Cast first: isspace() on a negative char value is undefined.
+        while (std::isspace(static_cast<unsigned char>(*EStr))) ++EStr;
+        if (*EStr=='\0') {
+          DEBUG(dbgs() << "parsed .space " << Sz << '\n');
+          return Sz;
+        }
+      }
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString().data(),
+                               MAI.getCommentString().size()) == 0)
+      atInsnStart = false;
+  }
+
+  return Length;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
new file mode 100644
index 000000000000..ab559799f00b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
@@ -0,0 +1,126 @@
+//===-- Mips16InstrInfo.h - Mips16 Instruction Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16INSTRINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16INSTRINFO_H
+
+#include "Mips16RegisterInfo.h"
+#include "MipsInstrInfo.h"
+
+namespace llvm {
+class MipsSubtarget;
+class Mips16InstrInfo : public MipsInstrInfo {
+ const Mips16RegisterInfo RI;
+
+public:
+ explicit Mips16InstrInfo(const MipsSubtarget &STI);
+
+ const MipsRegisterInfo &getRegisterInfo() const override;
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source reg along with the FrameIndex of the stored stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ int64_t Offset) const override;
+
+ void loadRegFromStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ int64_t Offset) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ unsigned getOppositeBranchOpc(unsigned Opc) const override;
+
+ // Adjust SP by FrameSize bytes. Save RA, S0, S1.
+ void makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ // Adjust SP by FrameSize bytes. Restore RA, S0, S1.
+ void restoreFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+
+ /// Adjust SP by Amount bytes.
+ void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+ /// Emit a series of instructions to load an immediate.
+ // This is used to adjust some FrameReg. The return value is the new
+ // register to be used in place of FrameReg, and the adjusted immediate
+ // field is written to the NewImm out-parameter.
+ unsigned loadImmediate(unsigned FrameReg, int64_t Imm, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II, const DebugLoc &DL,
+ unsigned &NewImm) const;
+
+ static bool validImmediate(unsigned Opcode, unsigned Reg, int64_t Amount);
+
+ static bool validSpImm8(int offset) {
+ return ((offset & 7) == 0) && isInt<11>(offset); // multiple of 8 that fits in a signed 11-bit field
+ }
+
+ //
+ // Build the proper "addiu sp" instruction based on the Imm field.
+ //
+
+ const MCInstrDesc& AddiuSpImm(int64_t Imm) const;
+
+ void BuildAddiuSpImm
+ (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const;
+
+ unsigned getInlineAsmLength(const char *Str,
+ const MCAsmInfo &MAI) const override;
+private:
+ unsigned getAnalyzableBrOpc(unsigned Opc) const override;
+
+ void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned Opc) const;
+
+ // Adjust SP by Amount bytes, where Amount can be up to a 32-bit number.
+ void adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned Reg1, unsigned Reg2) const;
+
+ // Adjust SP by Amount bytes, where Amount can be up to a 32-bit number.
+ void adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
new file mode 100644
index 000000000000..021fb8678686
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -0,0 +1,1910 @@
+//===- Mips16InstrInfo.td - Target Description for Mips16 -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips16 instructions.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Mips Address
+//
+def addr16 : ComplexPattern<iPTR, 2, "selectAddr16", [frameindex]>;
+def addr16sp : ComplexPattern<iPTR, 2, "selectAddr16SP", [frameindex]>;
+
+//
+// Address operand
+def mem16 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops CPU16Regs, simm16);
+ let EncoderMethod = "getMemEncoding";
+}
+
+def mem16sp : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ // This should be CPUSPReg but the MIPS16 subtarget isn't good enough at
+ // keeping the sp-relative load and the other varieties separate at the
+ // moment. This lie fixes the problem sufficiently well to fix the errors
+ // emitted by -verify-machineinstrs and the output ends up correct as long
+ // as we use an external assembler (which is already a requirement for MIPS16
+ // for several other reasons).
+ let MIOperandInfo = (ops CPU16RegsPlusSP, simm16);
+ let EncoderMethod = "getMemEncoding";
+}
+
+def mem16_ea : Operand<i32> {
+ let PrintMethod = "printMemOperandEA";
+ let MIOperandInfo = (ops CPU16RegsPlusSP, simm16);
+ let EncoderMethod = "getMemEncoding";
+}
+
+def pcrel16 : Operand<i32>;
+
+//
+// I-type instruction format
+//
+// this is only used by bimm. the actual assembly value is a 12 bit signed
+// number
+//
+class FI16_ins<bits<5> op, string asmstr, InstrItinClass itin>:
+ FI16<op, (outs), (ins brtarget:$imm16),
+ !strconcat(asmstr, "\t$imm16 # 16 bit inst"), [], itin>;
+
+//
+//
+// I8 instruction format
+//
+
+class FI816_ins_base<bits<3> _func, string asmstr,
+ string asmstr2, InstrItinClass itin>:
+ FI816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
+ [], itin>;
+
+class FI816_ins<bits<3> _func, string asmstr,
+ InstrItinClass itin>:
+ FI816_ins_base<_func, asmstr, "\t$imm # 16 bit inst", itin>;
+
+class FI816_SP_ins<bits<3> _func, string asmstr,
+ InstrItinClass itin>:
+ FI816_ins_base<_func, asmstr, "\t$$sp, $imm # 16 bit inst", itin>;
+
+//
+// RI instruction format
+//
+
+
+class FRI16_ins_base<bits<5> op, string asmstr, string asmstr2,
+ InstrItinClass itin>:
+ FRI16<op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+ !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16_ins<bits<5> op, string asmstr,
+ InstrItinClass itin>:
+ FRI16_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class FRI16_TCP_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FRI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+ !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin>;
+
+class FRI16R_ins_base<bits<5> op, string asmstr, string asmstr2,
+ InstrItinClass itin>:
+ FRI16<op, (outs), (ins CPU16Regs:$rx, simm16:$imm),
+ !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16R_ins<bits<5> op, string asmstr,
+ InstrItinClass itin>:
+ FRI16R_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class F2RI16_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FRI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+ !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin> {
+ let Constraints = "$rx_ = $rx";
+}
+
+class FRI16_B_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FRI16<_op, (outs), (ins CPU16Regs:$rx, brtarget:$imm),
+ !strconcat(asmstr, "\t$rx, $imm # 16 bit inst"), [], itin>;
+//
+// Compare a register and immediate and place result in CC
+// Implicit use of T8
+//
+// EXT-CCRR Instruction format
+//
+class FEXT_CCRXI16_ins<string asmstr>:
+ MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, simm16:$imm),
+ !strconcat(asmstr, "\t$rx, $imm\n\tmove\t$cc, $$t8"), []> {
+ let isCodeGenOnly=1;
+ let usesCustomInserter = 1;
+}
+
+// JAL and JALX instruction format
+//
+class FJAL16_ins<bits<1> _X, string asmstr,
+ InstrItinClass itin>:
+ FJAL16<_X, (outs), (ins uimm26:$imm),
+ !strconcat(asmstr, "\t$imm\n\tnop"),[],
+ itin> {
+ let isCodeGenOnly=1;
+ let Size=6;
+}
+
+class FJALB16_ins<bits<1> _X, string asmstr,
+ InstrItinClass itin>:
+ FJAL16<_X, (outs), (ins uimm26:$imm),
+ !strconcat(asmstr, "\t$imm\t# branch\n\tnop"),[],
+ itin> {
+ let isCodeGenOnly=1;
+ let Size=6;
+}
+
+//
+// EXT-I instruction format
+//
+class FEXT_I16_ins<bits<5> eop, string asmstr, InstrItinClass itin> :
+ FEXT_I16<eop, (outs), (ins brtarget:$imm16),
+ !strconcat(asmstr, "\t$imm16"),[], itin>;
+
+//
+// EXT-I8 instruction format
+//
+
+class FEXT_I816_ins_base<bits<3> _func, string asmstr,
+ string asmstr2, InstrItinClass itin>:
+ FEXT_I816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
+ [], itin>;
+
+class FEXT_I816_ins<bits<3> _func, string asmstr,
+ InstrItinClass itin>:
+ FEXT_I816_ins_base<_func, asmstr, "\t$imm", itin>;
+
+class FEXT_I816_SP_ins<bits<3> _func, string asmstr,
+ InstrItinClass itin>:
+ FEXT_I816_ins_base<_func, asmstr, "\t$$sp, $imm", itin>;
+
+//
+// Assembler formats in alphabetical order.
+// Natural and pseudos are mixed together.
+//
+// Compare two registers and place result in CC
+// Implicit use of T8
+//
+// CC-RR Instruction format
+//
+class FCCRR16_ins<string asmstr> :
+ MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$cc, $$t8"), []> {
+ let isCodeGenOnly=1;
+ let usesCustomInserter = 1;
+}
+
+//
+// EXT-RI instruction format
+//
+
+class FEXT_RI16_ins_base<bits<5> _op, string asmstr, string asmstr2,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+ !strconcat(asmstr, asmstr2), [], itin>;
+
+class FEXT_RI16_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $imm", itin>;
+
+class FEXT_RI16R_ins_base<bits<5> _op, string asmstr, string asmstr2,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs ), (ins CPU16Regs:$rx, simm16:$imm),
+ !strconcat(asmstr, asmstr2), [], itin>;
+
+class FEXT_RI16R_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16R_ins_base<_op, asmstr, "\t$rx, $imm", itin>;
+
+class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>:
+ FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>;
+
+class FEXT_RI16_B_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs), (ins CPU16Regs:$rx, brtarget:$imm),
+ !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
+class FEXT_RI16_TCP_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+ !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
+class FEXT_2RI16_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+ !strconcat(asmstr, "\t$rx, $imm"), [], itin> {
+ let Constraints = "$rx_ = $rx";
+}
+
+//
+// EXT-RRI instruction format
+//
+
+class FEXT_RRI16_mem_ins<bits<5> op, string asmstr, Operand MemOpnd,
+ InstrItinClass itin>:
+ FEXT_RRI16<op, (outs CPU16Regs:$ry), (ins MemOpnd:$addr),
+ !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+class FEXT_RRI16_mem2_ins<bits<5> op, string asmstr, Operand MemOpnd,
+ InstrItinClass itin>:
+ FEXT_RRI16<op, (outs ), (ins CPU16Regs:$ry, MemOpnd:$addr),
+ !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+//
+// EXT-RRI-A instruction format
+//
+
+class FEXT_RRI_A16_mem_ins<bits<1> op, string asmstr, Operand MemOpnd,
+ InstrItinClass itin>:
+ FEXT_RRI_A16<op, (outs CPU16Regs:$ry), (ins MemOpnd:$addr),
+ !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+// EXT-SHIFT instruction format
+//
+class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
+ FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, uimm5:$sa),
+ !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
+
+//
+// EXT-T8I8
+//
+class FEXT_T8I816_ins<string asmstr, string asmstr2>:
+ MipsPseudo16<(outs),
+ (ins CPU16Regs:$rx, CPU16Regs:$ry, brtarget:$imm),
+ !strconcat(asmstr2, !strconcat("\t$rx, $ry\n\t",
+ !strconcat(asmstr, "\t$imm"))),[]> {
+ let isCodeGenOnly=1;
+ let usesCustomInserter = 1;
+}
+
+//
+// EXT-T8I8I
+//
+class FEXT_T8I8I16_ins<string asmstr, string asmstr2>:
+ MipsPseudo16<(outs),
+ (ins CPU16Regs:$rx, simm16:$imm, brtarget:$targ),
+ !strconcat(asmstr2, !strconcat("\t$rx, $imm\n\t",
+ !strconcat(asmstr, "\t$targ"))), []> {
+ let isCodeGenOnly=1;
+ let usesCustomInserter = 1;
+}
+//
+
+
+//
+// I8_MOVR32 instruction format (used only by the MOVR32 instruction)
+//
+class FI8_MOVR3216_ins<string asmstr, InstrItinClass itin>:
+ FI8_MOVR3216<(outs CPU16Regs:$rz), (ins GPR32:$r32),
+ !strconcat(asmstr, "\t$rz, $r32"), [], itin>;
+
+//
+// I8_MOV32R instruction format (used only by MOV32R instruction)
+//
+
+class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
+ FI8_MOV32R16<(outs GPR32:$r32), (ins CPU16Regs:$rz),
+ !strconcat(asmstr, "\t$r32, $rz"), [], itin>;
+
+//
+// These are pseudo formats for multiply
+// This first one can be changed to non-pseudo now.
+//
+// MULT
+//
+class FMULT16_ins<string asmstr, InstrItinClass itin> :
+ MipsPseudo16<(outs), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry"), []>;
+
+//
+// MULT-LO
+//
+class FMULT16_LO_ins<string asmstr, InstrItinClass itin> :
+ MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry\n\tmflo\t$rz"), []> {
+ let isCodeGenOnly=1;
+}
+
+//
+// RR-type instruction format
+//
+
+class FRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
+}
+
+class FRRBreakNull16_ins<string asmstr, InstrItinClass itin> :
+ FRRBreak16<(outs), (ins), asmstr, [], itin> {
+ let Code=0;
+}
+
+class FRR16R_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+ FRR16<f, (outs), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
+}
+
+class FRRTR16_ins<string asmstr> :
+ MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$rz, $$t8"), []> ;
+
+//
+// maybe refactor but need a $zero as a dummy first parameter
+//
+class FRR16_div_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+ FRR16<f, (outs ), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$$zero, $rx, $ry"), [], itin> ;
+
+class FUnaryRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry"), [], itin> ;
+
+
+class FRR16_M_ins<bits<5> f, string asmstr,
+ InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$rx), (ins),
+ !strconcat(asmstr, "\t$rx"), [], itin>;
+
+class FRxRxRy16_ins<bits<5> f, string asmstr,
+ InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rz, $ry"),
+ [], itin> {
+ let Constraints = "$rx = $rz";
+}
+
+let rx=0 in
+class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_,
+ string asmstr, InstrItinClass itin>:
+ FRR16_JALRC<nd_, l_, 1, (outs), (ins), !strconcat(asmstr, "\t $$ra"),
+ [], itin> ;
+
+
+class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
+ string asmstr, InstrItinClass itin>:
+ FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
+ !strconcat(asmstr, "\t $rx"), [], itin> ;
+
+class FRR_SF16_ins
+ <bits<5> _funct, bits<3> _subfunc,
+ string asmstr, InstrItinClass itin>:
+ FRR_SF16<_funct, _subfunc, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_),
+ !strconcat(asmstr, "\t $rx"),
+ [], itin> {
+ let Constraints = "$rx_ = $rx";
+ }
+//
+// RRR-type instruction format
+//
+
+class FRRR16_ins<bits<2> _f, string asmstr, InstrItinClass itin> :
+ FRRR16<_f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>;
+
+//
+// These Sel patterns support the generation of conditional move
+// pseudo instructions.
+//
+// The nomenclature uses the components making up the pseudo and may
+// be a bit counterintuitive when compared with the end result we seek.
+// For example using a beqz in the example directly below results in the
+// conditional move being done if the tested register is not zero.
+// I considered it easier to check by keeping the pseudo consistent with
+// its components but it could have been done differently.
+//
+// The simplest case is when we can test an operand directly and do the
+// conditional move based on a simple mips16 conditional
+// branch instruction.
+// for example:
+// if $op == beqz or bnez:
+//
+// $op $rt, .+4
+// move $rd, $rs
+//
+// if $op == beqz, then if $rt != 0, then the conditional assignment
+// $rd = $rs is done.
+
+// if $op == bnez, then if $rt == 0, then the conditional assignment
+// $rd = $rs is done.
+//
+// So this pseudo class only has one operand, i.e. op
+//
+class Sel<string op>:
+ MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+ CPU16Regs:$rt),
+ !strconcat(op, "\t$rt, .+4\n\t\n\tmove $rd, $rs"), []> {
+ //let isCodeGenOnly=1;
+ let Constraints = "$rd = $rd_";
+ let usesCustomInserter = 1;
+}
+
+//
+// The next two instruction classes allow for an operand which tests
+// two operands and returns a value in register T8 and
+//then does a conditional branch based on the value of T8
+//
+
+// op2 can be cmpi or slti/sltiu
+// op1 can bteqz or btnez
+// the operands for op2 are a register and a signed constant
+//
+// $op2 $t, $imm ;test register t and branch conditionally
+// $op1 .+4 ;op1 is a conditional branch
+// move $rd, $rs
+//
+//
+class SeliT<string op1, string op2>:
+ MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+ CPU16Regs:$rl, simm16:$imm),
+ !strconcat(op2,
+ !strconcat("\t$rl, $imm\n\t",
+ !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> {
+ let isCodeGenOnly=1;
+ let Constraints = "$rd = $rd_";
+ let usesCustomInserter = 1;
+}
+
+//
+// op2 can be cmp or slt/sltu
+// op1 can be bteqz or btnez
+// the operands for op2 are two registers
+// op1 is a conditional branch
+//
+//
+// $op2 $rl, $rr ;test registers rl,rr
+// $op1 .+4 ;op1 is a conditional branch
+// move $rd, $rs
+//
+//
+class SelT<string op1, string op2>:
+ MipsPseudo16<(outs CPU16Regs:$rd_),
+ (ins CPU16Regs:$rd, CPU16Regs:$rs,
+ CPU16Regs:$rl, CPU16Regs:$rr),
+ !strconcat(op2,
+ !strconcat("\t$rl, $rr\n\t",
+ !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> {
+ let isCodeGenOnly=1;
+ let Constraints = "$rd = $rd_";
+ let usesCustomInserter = 1;
+}
+
+//
+// 32 bit constant
+//
+def Constant32:
+ MipsPseudo16<(outs), (ins simm32:$imm), "\t.word $imm", []>;
+
+def LwConstant32:
+ MipsPseudo16<(outs CPU16Regs:$rx), (ins simm32:$imm, simm32:$constid),
+ "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
+
+
+//
+// Some general instruction class info
+//
+//
+
+class ArithLogic16Defs<bit isCom=0> {
+ bits<5> shamt = 0;
+ bit isCommutable = isCom;
+ bit isReMaterializable = 1;
+ bit hasSideEffects = 0;
+}
+
+class branch16 {
+ bit isBranch = 1;
+ bit isTerminator = 1;
+ bit isBarrier = 1;
+}
+
+class cbranch16 {
+ bit isBranch = 1;
+ bit isTerminator = 1;
+}
+
+class MayLoad {
+ bit mayLoad = 1;
+}
+
+class MayStore {
+ bit mayStore = 1;
+}
+//
+
+
+// Format: ADDIU rx, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (2-Operand, Extended)
+// To add a constant to a 32-bit integer.
+//
+def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIM16Alu>;
+
+def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIM16Alu>,
+ ArithLogic16Defs<0> {
+ let AddedComplexity = 5;
+}
+def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIM16Alu>,
+ ArithLogic16Defs<0> {
+ let isCodeGenOnly = 1;
+}
+
+def AddiuRxRyOffMemX16:
+ FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIM16Alu>;
+
+//
+
+// Format: ADDIU rx, pc, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended)
+// To add a constant to the program counter.
+//
+def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIM16Alu>;
+
+//
+// Format: ADDIU sp, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (2-Operand, SP-Relative, Extended)
+// To add a constant to the stack pointer.
+//
+def AddiuSpImm16
+ : FI816_SP_ins<0b011, "addiu", IIM16Alu> {
+ let Defs = [SP];
+ let Uses = [SP];
+ let AddedComplexity = 5;
+}
+
+def AddiuSpImmX16
+ : FEXT_I816_SP_ins<0b011, "addiu", IIM16Alu> {
+ let Defs = [SP];
+ let Uses = [SP];
+}
+
+//
+// Format: ADDU rz, rx, ry MIPS16e
+// Purpose: Add Unsigned Word (3-Operand)
+// To add 32-bit integers.
+//
+
+def AdduRxRyRz16: FRRR16_ins<01, "addu", IIM16Alu>, ArithLogic16Defs<1>;
+
+//
+// Format: AND rx, ry MIPS16e
+// Purpose: AND
+// To do a bitwise logical AND.
+
+def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIM16Alu>, ArithLogic16Defs<1>;
+
+
+//
+// Format: BEQZ rx, offset MIPS16e
+// Purpose: Branch on Equal to Zero
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16;
+
+
+//
+// Format: BEQZ rx, offset MIPS16e
+// Purpose: Branch on Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16;
+
+//
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch
+// To do an unconditional PC-relative branch.
+//
+
+def Bimm16: FI16_ins<0b00010, "b", IIM16Alu>, branch16;
+
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch (Extended)
+// To do an unconditional PC-relative branch.
+//
+def BimmX16: FEXT_I16_ins<0b00010, "b", IIM16Alu>, branch16;
+
+//
+// Format: BNEZ rx, offset MIPS16e
+// Purpose: Branch on Not Equal to Zero
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16;
+
+//
+// Format: BNEZ rx, offset MIPS16e
+// Purpose: Branch on Not Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16;
+
+
+//
+//Format: BREAK immediate
+// Purpose: Breakpoint
+// To cause a Breakpoint exception.
+
+def Break16: FRRBreakNull16_ins<"break 0", IIM16Alu>;
+//
+// Format: BTEQZ offset MIPS16e
+// Purpose: Branch on T Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+def Bteqz16: FI816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 {
+ let Uses = [T8];
+}
+
+def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 {
+ let Uses = [T8];
+}
+
+def BteqzT8CmpX16: FEXT_T8I816_ins<"bteqz", "cmp">, cbranch16;
+
+def BteqzT8CmpiX16: FEXT_T8I8I16_ins<"bteqz", "cmpi">,
+ cbranch16;
+
+def BteqzT8SltX16: FEXT_T8I816_ins<"bteqz", "slt">, cbranch16;
+
+def BteqzT8SltuX16: FEXT_T8I816_ins<"bteqz", "sltu">, cbranch16;
+
+def BteqzT8SltiX16: FEXT_T8I8I16_ins<"bteqz", "slti">, cbranch16;
+
+def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">,
+ cbranch16;
+
+//
+// Format: BTNEZ offset MIPS16e
+// Purpose: Branch on T Not Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+
+def Btnez16: FI816_ins<0b001, "btnez", IIM16Alu>, cbranch16 {
+ let Uses = [T8];
+}
+
+def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIM16Alu> ,cbranch16 {
+ let Uses = [T8];
+}
+
+def BtnezT8CmpX16: FEXT_T8I816_ins<"btnez", "cmp">, cbranch16;
+
+def BtnezT8CmpiX16: FEXT_T8I8I16_ins<"btnez", "cmpi">, cbranch16;
+
+def BtnezT8SltX16: FEXT_T8I816_ins<"btnez", "slt">, cbranch16;
+
+def BtnezT8SltuX16: FEXT_T8I816_ins<"btnez", "sltu">, cbranch16;
+
+def BtnezT8SltiX16: FEXT_T8I8I16_ins<"btnez", "slti">, cbranch16;
+
+def BtnezT8SltiuX16: FEXT_T8I8I16_ins<"btnez", "sltiu">,
+ cbranch16;
+
+//
+// Format: CMP rx, ry MIPS16e
+// Purpose: Compare
+// To compare the contents of two GPRs.
+//
+def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIM16Alu> {
+ let Defs = [T8];
+}
+
+//
+// Format: CMPI rx, immediate MIPS16e
+// Purpose: Compare Immediate
+// To compare a constant with the contents of a GPR.
+//
+def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIM16Alu> {
+ let Defs = [T8];
+}
+
+//
+// Format: CMPI rx, immediate MIPS16e
+// Purpose: Compare Immediate (Extended)
+// To compare a constant with the contents of a GPR.
+//
+def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIM16Alu> {
+ let Defs = [T8];
+}
+
+
+//
+// Format: DIV rx, ry MIPS16e
+// Purpose: Divide Word
+// To divide 32-bit signed integers.
+//
+def DivRxRy16: FRR16_div_ins<0b11010, "div", IIM16Alu> {
+ let Defs = [HI0, LO0];
+}
+
+//
+// Format: DIVU rx, ry MIPS16e
+// Purpose: Divide Unsigned Word
+// To divide 32-bit unsigned integers.
+//
+def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIM16Alu> {
+ let Defs = [HI0, LO0];
+}
+//
+// Format: JAL target MIPS16e
+// Purpose: Jump and Link
+// To execute a procedure call within the current 256 MB-aligned
+// region and preserve the current ISA.
+//
+
+def Jal16 : FJAL16_ins<0b0, "jal", IIM16Alu> {
+ let hasDelaySlot = 0; // not true, but we add the nop for now
+ let isCall=1;
+ let Defs = [RA];
+}
+
+def JalB16 : FJALB16_ins<0b0, "jal", IIM16Alu>, branch16 {
+ let hasDelaySlot = 0; // not true, but we add the nop for now
+ let isBranch=1;
+ let Defs = [RA];
+}
+
+//
+// Format: JR ra MIPS16e
+// Purpose: Jump Register Through Register ra
+// To execute a branch to the instruction address in the return
+// address register.
+//
+
+def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIM16Alu> {
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+ let hasDelaySlot = 1;
+ let isTerminator=1;
+ let isBarrier=1;
+}
+
+def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+ let isTerminator=1;
+ let isBarrier=1;
+}
+
+def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> {
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+ let isTerminator=1;
+ let isBarrier=1;
+}
+//
+// Format: LB ry, offset(rx) MIPS16e
+// Purpose: Load Byte (Extended)
+// To load a byte from memory as a signed value.
+// The MIPS16e major opcode for LB is 0b10000 (0b10011 is LW).
+//
+def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10000, "lb", mem16, II_LB>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LBU ry, offset(rx) MIPS16e
+// Purpose: Load Byte Unsigned (Extended)
+// To load a byte from memory as an unsigned value.
+//
+def LbuRxRyOffMemX16:
+ FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, II_LBU>, MayLoad {
+ let isCodeGenOnly = 1;
+}
+
+//
+// Format: LH ry, offset(rx) MIPS16e
+// Purpose: Load Halfword signed (Extended)
+// To load a halfword from memory as a signed value.
+// The MIPS16e major opcode for LH is 0b10001 (0b10100 is LBU).
+//
+def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10001, "lh", mem16, II_LH>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LHU ry, offset(rx) MIPS16e
+// Purpose: Load Halfword unsigned (Extended)
+// To load a halfword from memory as an unsigned value.
+// The MIPS16e major opcode for LHU is 0b10101 (0b10100 is LBU).
+//
+def LhuRxRyOffMemX16:
+  FEXT_RRI16_mem_ins<0b10101, "lhu", mem16, II_LHU>, MayLoad {
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LI rx, immediate MIPS16e
+// Purpose: Load Immediate
+// To load a constant into a GPR.
+//
+def LiRxImm16: FRI16_ins<0b01101, "li", IIM16Alu>;
+
+//
+// Format: LI rx, immediate MIPS16e
+// Purpose: Load Immediate (Extended)
+// To load a constant into a GPR.
+//
+def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIM16Alu>;
+
+def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIM16Alu> {
+ let isCodeGenOnly = 1;
+}
+
+//
+// Format: LW ry, offset(rx) MIPS16e
+// Purpose: Load Word (Extended)
+// To load a word from memory as a signed value.
+//
+def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, II_LW>, MayLoad{
+ let isCodeGenOnly = 1;
+}
+
+// Format: LW rx, offset(sp) MIPS16e
+// Purpose: Load Word (SP-Relative, Extended)
+// To load an SP-relative word from memory as a signed value.
+//
+def LwRxSpImmX16: FEXT_RRI16_mem_ins<0b10010, "lw", mem16sp, II_LW>, MayLoad;
+
+def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
+
+def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
+//
+// Format: MOVE r32, rz MIPS16e
+// Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def Move32R16: FI8_MOV32R16_ins<"move", IIM16Alu>;
+
+//
+// Format: MOVE ry, r32 MIPS16e
+//Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>;
+
+//
+// Format: MFHI rx MIPS16e
+// Purpose: Move From HI Register
+// To copy the special purpose HI register to a GPR.
+//
+def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> {
+ let Uses = [HI0];
+ let hasSideEffects = 0;
+}
+
+//
+// Format: MFLO rx MIPS16e
+// Purpose: Move From LO Register
+// To copy the special purpose LO register to a GPR.
+//
+def Mflo16: FRR16_M_ins<0b10010, "mflo", IIM16Alu> {
+ let Uses = [LO0];
+ let hasSideEffects = 0;
+}
+
+//
+// Pseudo Instruction for mult
+//
+def MultRxRy16: FMULT16_ins<"mult", IIM16Alu> {
+ let isCommutable = 1;
+ let hasSideEffects = 0;
+ let Defs = [HI0, LO0];
+}
+
+def MultuRxRy16: FMULT16_ins<"multu", IIM16Alu> {
+ let isCommutable = 1;
+ let hasSideEffects = 0;
+ let Defs = [HI0, LO0];
+}
+
+//
+// Format: MULT rx, ry MIPS16e
+// Purpose: Multiply Word
+// To multiply 32-bit signed integers.
+//
+def MultRxRyRz16: FMULT16_LO_ins<"mult", IIM16Alu> {
+ let isCommutable = 1;
+ let hasSideEffects = 0;
+ let Defs = [HI0, LO0];
+}
+
+//
+// Format: MULTU rx, ry MIPS16e
+// Purpose: Multiply Unsigned Word
+// To multiply 32-bit unsigned integers.
+//
+def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIM16Alu> {
+ let isCommutable = 1;
+ let hasSideEffects = 0;
+ let Defs = [HI0, LO0];
+}
+
+//
+// Format: NEG rx, ry MIPS16e
+// Purpose: Negate
+// To negate an integer value.
+// The RR-format function code for NEG is 0b01011; 0b11101 is the RR major
+// opcode, not a function code.
+//
+def NegRxRy16: FUnaryRR16_ins<0b01011, "neg", IIM16Alu>;
+
+//
+// Format: NOT rx, ry MIPS16e
+// Purpose: Not
+// To complement an integer value
+//
+def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIM16Alu>;
+
+//
+// Format: OR rx, ry MIPS16e
+// Purpose: Or
+// To do a bitwise logical OR.
+//
+def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIM16Alu>, ArithLogic16Defs<1>;
+
+//
+// Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize}
+// (All args are optional) MIPS16e
+// Purpose: Restore Registers and Deallocate Stack Frame
+// To deallocate a stack frame before exit from a subroutine,
+// restoring return address and static registers, and adjusting
+// stack
+//
+
+def Restore16:
+ FI8_SVRS16<0b1, (outs), (ins variable_ops),
+ "", [], II_RESTORE >, MayLoad {
+ let isCodeGenOnly = 1;
+ let Defs = [SP];
+ let Uses = [SP];
+}
+
+
+def RestoreX16:
+ FI8_SVRS16<0b1, (outs), (ins variable_ops),
+ "", [], II_RESTORE >, MayLoad {
+ let isCodeGenOnly = 1;
+ let Defs = [SP];
+ let Uses = [SP];
+}
+
+//
+// Format: SAVE {ra,}{s0/s1/s0-1,}{framesize} (All arguments are optional)
+// MIPS16e
+// Purpose: Save Registers and Set Up Stack Frame
+// To set up a stack frame on entry to a subroutine,
+// saving return address and static registers, and adjusting stack
+//
+def Save16:
+ FI8_SVRS16<0b1, (outs), (ins variable_ops),
+ "", [], II_SAVE >, MayStore {
+ let isCodeGenOnly = 1;
+ let Uses = [SP];
+ let Defs = [SP];
+}
+
+def SaveX16:
+ FI8_SVRS16<0b1, (outs), (ins variable_ops),
+ "", [], II_SAVE >, MayStore {
+ let isCodeGenOnly = 1;
+ let Uses = [SP];
+ let Defs = [SP];
+}
+//
+// Format: SB ry, offset(rx) MIPS16e
+// Purpose: Store Byte (Extended)
+// To store a byte to memory.
+//
+def SbRxRyOffMemX16:
+ FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, II_SB>, MayStore;
+
+//
+// Format: SEB rx MIPS16e
+// Purpose: Sign-Extend Byte
+// Sign-extend least significant byte in register rx.
+//
+def SebRx16
+ : FRR_SF16_ins<0b10001, 0b100, "seb", IIM16Alu>;
+
+//
+// Format: SEH rx MIPS16e
+// Purpose: Sign-Extend Halfword
+// Sign-extend least significant halfword in register rx.
+//
+def SehRx16
+ : FRR_SF16_ins<0b10001, 0b101, "seh", IIM16Alu>;
+
+//
+// The Sel(T) instructions are pseudos
+// T means that they use T8 implicitly.
+//
+//
+// Format: SelBeqZ rd, rs, rt
+// Purpose: if rt==0, do nothing
+// else rs = rt
+//
+def SelBeqZ: Sel<"beqz">;
+
+//
+// Format: SelTBteqZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZCmp: SelT<"bteqz", "cmp">;
+
+//
+// Format: SelTBteqZCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZCmpi: SeliT<"bteqz", "cmpi">;
+
+//
+// Format: SelTBteqZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZSlt: SelT<"bteqz", "slt">;
+
+//
+// Format: SelTBteqZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZSlti: SeliT<"bteqz", "slti">;
+
+//
+// Format: SelTBteqZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZSltu: SelT<"bteqz", "sltu">;
+
+//
+// Format: SelTBteqZSltiu rd, rs, rl, rr
+// Purpose: b = Sltiu rl, imm.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZSltiu: SeliT<"bteqz", "sltiu">;
+
+//
+// Format: SelBneZ rd, rs, rt
+// Purpose: if rt!=0, do nothing
+// else rs = rt
+//
+def SelBneZ: Sel<"bnez">;
+
+//
+// Format: SelTBtneZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZCmp: SelT<"btnez", "cmp">;
+
+//
+// Format: SelTBtneZCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZCmpi: SeliT<"btnez", "cmpi">;
+
+//
+// Format: SelTBtneZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZSlt: SelT<"btnez", "slt">;
+
+//
+// Format: SelTBtneZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZSlti: SeliT<"btnez", "slti">;
+
+//
+// Format: SelTBtneZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZSltu: SelT<"btnez", "sltu">;
+
+//
+// Format: SelTBtneZSltiu rd, rs, rl, rr
+// Purpose: b = Sltiu rl, imm.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZSltiu: SeliT<"btnez", "sltiu">;
+//
+//
+// Format: SH ry, offset(rx) MIPS16e
+// Purpose: Store Halfword (Extended)
+// To store a halfword to memory.
+//
+def ShRxRyOffMemX16:
+ FEXT_RRI16_mem2_ins<0b11001, "sh", mem16, II_SH>, MayStore;
+
+//
+// Format: SLL rx, ry, sa MIPS16e
+// Purpose: Shift Word Left Logical (Extended)
+// To execute a left-shift of a word by a fixed number of bits-0 to 31 bits.
+//
+def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIM16Alu>;
+
+//
+// Format: SLLV ry, rx MIPS16e
+// Purpose: Shift Word Left Logical Variable
+// To execute a left-shift of a word by a variable number of bits.
+//
+def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIM16Alu>;
+
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIM16Alu> {
+ let Defs = [T8];
+}
+
+//
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIM16Alu> {
+ let Defs = [T8];
+}
+
+def SltiCCRxImmX16: FEXT_CCRXI16_ins<"slti">;
+
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIM16Alu> {
+ let Defs = [T8];
+}
+
+//
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIM16Alu> {
+ let Defs = [T8];
+}
+//
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+def SltiuCCRxImmX16: FEXT_CCRXI16_ins<"sltiu">;
+
+//
+// Format: SLT rx, ry MIPS16e
+// Purpose: Set on Less Than
+// To record the result of a less-than comparison.
+//
+def SltRxRy16: FRR16R_ins<0b00010, "slt", IIM16Alu>{
+ let Defs = [T8];
+}
+
+def SltCCRxRy16: FCCRR16_ins<"slt">;
+
+// Format: SLTU rx, ry MIPS16e
+// Purpose: Set on Less Than Unsigned
+// To record the result of an unsigned less-than comparison.
+//
+def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIM16Alu>{
+ let Defs = [T8];
+}
+
+def SltuRxRyRz16: FRRTR16_ins<"sltu"> {
+ let isCodeGenOnly=1;
+ let Defs = [T8];
+}
+
+
+def SltuCCRxRy16: FCCRR16_ins<"sltu">;
+//
+// Format: SRAV ry, rx MIPS16e
+// Purpose: Shift Word Right Arithmetic Variable
+// To execute an arithmetic right-shift of a word by a variable
+// number of bits.
+//
+def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIM16Alu>;
+
+
+//
+// Format: SRA rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Arithmetic (Extended)
+// To execute an arithmetic right-shift of a word by a fixed
+// number of bits-0 to 31 bits.
+//
+def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIM16Alu>;
+
+
+//
+// Format: SRLV ry, rx MIPS16e
+// Purpose: Shift Word Right Logical Variable
+// To execute a logical right-shift of a word by a variable
+// number of bits.
+//
+def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIM16Alu>;
+
+
+//
+// Format: SRL rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Logical (Extended)
+// To execute a logical right-shift of a word by a fixed
+// number of bits-1 to 31 bits.
+//
+def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIM16Alu>;
+
+//
+// Format: SUBU rz, rx, ry MIPS16e
+// Purpose: Subtract Unsigned Word
+// To subtract 32-bit integers
+//
+def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIM16Alu>, ArithLogic16Defs<0>;
+
+//
+// Format: SW ry, offset(rx) MIPS16e
+// Purpose: Store Word (Extended)
+// To store a word to memory.
+//
+def SwRxRyOffMemX16: FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, II_SW>, MayStore;
+
+//
+// Format: SW rx, offset(sp) MIPS16e
+// Purpose: Store Word rx (SP-Relative)
+// To store an SP-relative word to memory.
+//
+def SwRxSpImmX16: FEXT_RRI16_mem2_ins<0b11010, "sw", mem16sp, II_SW>, MayStore;
+
+//
+//
+// Format: XOR rx, ry MIPS16e
+// Purpose: Xor
+// To do a bitwise logical XOR.
+//
+def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIM16Alu>, ArithLogic16Defs<1>;
+
+class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> {
+ let Predicates = [InMips16Mode];
+}
+
+// Unary Arith/Logic
+//
+class ArithLogicU_pat<PatFrag OpNode, Instruction I> :
+ Mips16Pat<(OpNode CPU16Regs:$r),
+ (I CPU16Regs:$r)>;
+
+def: ArithLogicU_pat<not, NotRxRy16>;
+def: ArithLogicU_pat<ineg, NegRxRy16>;
+
+class ArithLogic16_pat<SDNode OpNode, Instruction I> :
+ Mips16Pat<(OpNode CPU16Regs:$l, CPU16Regs:$r),
+ (I CPU16Regs:$l, CPU16Regs:$r)>;
+
+def: ArithLogic16_pat<add, AdduRxRyRz16>;
+def: ArithLogic16_pat<and, AndRxRxRy16>;
+def: ArithLogic16_pat<mul, MultRxRyRz16>;
+def: ArithLogic16_pat<or, OrRxRxRy16>;
+def: ArithLogic16_pat<sub, SubuRxRyRz16>;
+def: ArithLogic16_pat<xor, XorRxRxRy16>;
+
+// Arithmetic and shift patterns with one register and one immediate operand.
+
+class ArithLogicI16_pat<SDNode OpNode, PatFrag imm_type, Instruction I> :
+ Mips16Pat<(OpNode CPU16Regs:$in, imm_type:$imm),
+ (I CPU16Regs:$in, imm_type:$imm)>;
+
+def: ArithLogicI16_pat<add, immSExt8, AddiuRxRxImm16>;
+def: ArithLogicI16_pat<add, immSExt16, AddiuRxRxImmX16>;
+def: ArithLogicI16_pat<shl, immZExt5, SllX16>;
+def: ArithLogicI16_pat<srl, immZExt5, SrlX16>;
+def: ArithLogicI16_pat<sra, immZExt5, SraX16>;
+
+class shift_rotate_reg16_pat<SDNode OpNode, Instruction I> :
+ Mips16Pat<(OpNode CPU16Regs:$r, CPU16Regs:$ra),
+ (I CPU16Regs:$r, CPU16Regs:$ra)>;
+
+def: shift_rotate_reg16_pat<shl, SllvRxRy16>;
+def: shift_rotate_reg16_pat<sra, SravRxRy16>;
+def: shift_rotate_reg16_pat<srl, SrlvRxRy16>;
+
+class LoadM16_pat<PatFrag OpNode, Instruction I, ComplexPattern Addr> :
+ Mips16Pat<(OpNode Addr:$addr), (I Addr:$addr)>;
+
+def: LoadM16_pat<sextloadi8, LbRxRyOffMemX16, addr16>;
+def: LoadM16_pat<zextloadi8, LbuRxRyOffMemX16, addr16>;
+def: LoadM16_pat<sextloadi16, LhRxRyOffMemX16, addr16>;
+def: LoadM16_pat<zextloadi16, LhuRxRyOffMemX16, addr16>;
+def: LoadM16_pat<load, LwRxSpImmX16, addr16sp>;
+
+class StoreM16_pat<PatFrag OpNode, Instruction I, ComplexPattern Addr> :
+ Mips16Pat<(OpNode CPU16Regs:$r, Addr:$addr), (I CPU16Regs:$r, Addr:$addr)>;
+
+def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16, addr16>;
+def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16, addr16>;
+def: StoreM16_pat<store, SwRxSpImmX16, addr16sp>;
+
+// Unconditional branch
+class UncondBranch16_pat<SDNode OpNode, Instruction I>:
+ Mips16Pat<(OpNode bb:$imm16), (I bb:$imm16)> {
+ let Predicates = [InMips16Mode];
+ }
+
+def : Mips16Pat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+ (Jal16 tglobaladdr:$dst)>;
+
+def : Mips16Pat<(MipsJmpLink (i32 texternalsym:$dst)),
+ (Jal16 texternalsym:$dst)>;
+
+// Indirect branch
+def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> {
+ // Ensure that the addition of MIPS32r6/MIPS64r6 support does not change
+ // MIPS16's behaviour.
+ let AddedComplexity = 1;
+}
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=0 in
+def JumpLinkReg16:
+ FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs),
+ "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], II_JALRC> {
+ let Defs = [RA];
+}
+
+// Mips16 pseudos
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1,
+ hasExtraSrcRegAllocReq = 1 in
+def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>;
+
+
+// setcc patterns
+
+class SetCC_R16<PatFrag cond_op, Instruction I>:
+ Mips16Pat<(cond_op CPU16Regs:$rx, CPU16Regs:$ry),
+ (I CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+class SetCC_I16<PatFrag cond_op, PatLeaf imm_type, Instruction I>:
+ Mips16Pat<(cond_op CPU16Regs:$rx, imm_type:$imm16),
+ (I CPU16Regs:$rx, imm_type:$imm16)>;
+
+
+def: Mips16Pat<(i32 addr16sp:$addr), (AddiuRxRyOffMemX16 addr16sp:$addr)>;
+
+
+// Large (>16 bit) immediate loads
+def : Mips16Pat<(i32 imm:$imm), (LwConstant32 imm:$imm, -1)>;
+
+// Carry MipsPatterns
+def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (SubuRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (AdduRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc CPU16Regs:$src, immSExt16:$imm),
+ (AddiuRxRxImmX16 CPU16Regs:$src, imm:$imm)>;
+
+//
+// Some branch conditional patterns are not generated by llvm at this time.
+// Some are for seemingly arbitrary reasons not used: i.e. with signed number
+// comparison they are used and for unsigned a different pattern is used.
+// I am pushing upstream from the full mips16 port and it seemed that I needed
+// these earlier and the mips32 port has these but now I cannot create test
+// cases that use these patterns. While I sort this all out I will leave these
+// extra patterns commented out and if I can be sure they are really not used,
+// I will delete the code. I don't want to check the code in uncommented without
+// a valid test case. In some cases, the compiler is generating patterns with
+// setcc instead and earlier I had implemented setcc first so may have masked
+// the problem. The setcc variants are suboptimal for mips16 so I may want to
+// figure out how to enable the brcond patterns or else possibly new
+// combinations of brcond and setcc.
+//
+//
+// bcond-seteq
+//
+def: Mips16Pat
+ <(brcond (i32 (seteq CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BteqzT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+ >;
+
+
+def: Mips16Pat
+ <(brcond (i32 (seteq CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+ (BteqzT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$targ16)
+ >;
+
+def: Mips16Pat
+ <(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16),
+ (BeqzRxImm16 CPU16Regs:$rx, bb:$targ16)
+ >;
+
+//
+// bcond-setgt (do we need to have this pair of setlt, setgt??)
+//
+def: Mips16Pat
+ <(brcond (i32 (setgt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BtnezT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16)
+ >;
+
+//
+// bcond-setge
+//
+def: Mips16Pat
+ <(brcond (i32 (setge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BteqzT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+ >;
+
+//
+// never called because compiler transforms a >= k to a > (k-1)
+def: Mips16Pat
+ <(brcond (i32 (setge CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+ (BteqzT8SltiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$imm16)
+ >;
+
+//
+// bcond-setlt
+//
+def: Mips16Pat
+ <(brcond (i32 (setlt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BtnezT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+ >;
+
+def: Mips16Pat
+ <(brcond (i32 (setlt CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+ (BtnezT8SltiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$imm16)
+ >;
+
+//
+// bcond-setle
+//
+def: Mips16Pat
+ <(brcond (i32 (setle CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BteqzT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16)
+ >;
+
+//
+// bcond-setne
+//
+def: Mips16Pat
+ <(brcond (i32 (setne CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BtnezT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+ >;
+
+def: Mips16Pat
+ <(brcond (i32 (setne CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+ (BtnezT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$targ16)
+ >;
+
+def: Mips16Pat
+ <(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16),
+ (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
+ >;
+
+//
+// This needs to be there but I forget which code will generate it
+//
+def: Mips16Pat
+ <(brcond CPU16Regs:$rx, bb:$targ16),
+ (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
+ >;
+
+//
+
+//
+// bcond-setugt
+//
+//def: Mips16Pat
+// <(brcond (i32 (setugt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+// (BtnezT8SltuX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16)
+// >;
+
+//
+// bcond-setuge
+//
+//def: Mips16Pat
+// <(brcond (i32 (setuge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+// (BteqzT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+// >;
+
+
+//
+// bcond-setult
+//
+//def: Mips16Pat
+// <(brcond (i32 (setult CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+// (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+// >;
+
+def: UncondBranch16_pat<br, Bimm16>;
+
+// Small immediates
+def: Mips16Pat<(i32 immSExt16:$in),
+ (AddiuRxRxImmX16 (MoveR3216 ZERO), immSExt16:$in)>;
+
+def: Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>;
+
+//
+// MipsDivRem
+//
+def: Mips16Pat
+ <(MipsDivRem16 CPU16Regs:$rx, CPU16Regs:$ry),
+ (DivRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+//
+// MipsDivRemU
+//
+def: Mips16Pat
+ <(MipsDivRemU16 CPU16Regs:$rx, CPU16Regs:$ry),
+ (DivuRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+// signed a,b
+// x = (a>=b)?x:y
+//
+// if !(a < b) x = y
+//
+def : Mips16Pat<(select (i32 (setge CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, CPU16Regs:$b)>;
+
+// signed a,b
+// x = (a>b)?x:y
+//
+// if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setgt CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZSlt CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+// unsigned a,b
+// x = (a>=b)?x:y
+//
+// if !(a < b) x = y;
+//
+def : Mips16Pat<
+ (select (i32 (setuge CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, CPU16Regs:$b)>;
+
+// unsigned a,b
+// x = (a>b)?x:y
+//
+// if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setugt CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZSltu CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+// signed
+// x = (a >= k)?x:y
+// due to an llvm optimization, i don't think that this will ever
+// be used. This is transformed into x = (a > k-1)?x:y
+//
+//
+
+//def : Mips16Pat<
+// (select (i32 (setge CPU16Regs:$lhs, immSExt16:$rhs)),
+// CPU16Regs:$T, CPU16Regs:$F),
+// (SelTBteqZSlti CPU16Regs:$T, CPU16Regs:$F,
+// CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+//def : Mips16Pat<
+// (select (i32 (setuge CPU16Regs:$lhs, immSExt16:$rhs)),
+// CPU16Regs:$T, CPU16Regs:$F),
+// (SelTBteqZSltiu CPU16Regs:$T, CPU16Regs:$F,
+// CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+// signed
+// x = (a < k)?x:y
+//
+// if !(a < k) x = y;
+//
+def : Mips16Pat<
+ (select (i32 (setlt CPU16Regs:$a, immSExt16:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZSlti CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, immSExt16:$b)>;
+
+
+//
+//
+// signed
+// x = (a <= b)? x : y
+//
+// if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setle CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// unsigned
+// x = (a <= b)? x : y
+//
+// if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setule CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == b)? x : y
+//
+// if (a != b) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZCmp CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == 0)? x : y
+//
+// if (a != 0) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, 0)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelBeqZ CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a == k)? x : y
+//
+// if (a != k) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, immZExt16:$k)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZCmpi CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, immZExt16:$k)>;
+
+
+//
+// signed/unsigned
+// x = (a != b)? x : y
+//
+// if (a == b) x = y
+//
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZCmp CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a != 0)? x : y
+//
+// if (a == 0) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, 0)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a)>;
+
+// signed/unsigned
+// x = (a)? x : y
+//
+// if (!a) x = y
+//
+def : Mips16Pat<(select CPU16Regs:$a,
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a != k)? x : y
+//
+// if (a == k) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, immZExt16:$k)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZCmpi CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, immZExt16:$k)>;
+
+//
+// When writing C code to test these setxx patterns,
+// some will be transformed into
+// other things. So we test using C code at both -O3 and -O0.
+//
+// seteq
+//
+def : Mips16Pat
+ <(seteq CPU16Regs:$lhs,CPU16Regs:$rhs),
+ (SltiuCCRxImmX16 (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs), 1)>;
+
+def : Mips16Pat
+ <(seteq CPU16Regs:$lhs, 0),
+ (SltiuCCRxImmX16 CPU16Regs:$lhs, 1)>;
+
+
+//
+// setge
+//
+
+def: Mips16Pat
+ <(setge CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (LiRxImmX16 1))>;
+
+//
+// For constants, llvm transforms this to:
+// x > (k - 1) and then reverses the operands to use setlt. So this pattern
+// is not used now by the compiler. (Presumably checking that k-1 does not
+// overflow). The compiler never uses this at the current time, due to
+// other optimizations.
+//
+//def: Mips16Pat
+// <(setge CPU16Regs:$lhs, immSExt16:$rhs),
+// (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, immSExt16:$rhs),
+// (LiRxImmX16 1))>;
+
+// This catches the x >= -32768 case by transforming it to x > -32769
+//
+def: Mips16Pat
+ <(setgt CPU16Regs:$lhs, -32769),
+ (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, -32768),
+ (LiRxImmX16 1))>;
+
+//
+// setgt
+//
+//
+
+def: Mips16Pat
+ <(setgt CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setle
+//
+def: Mips16Pat
+ <(setle CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImm16 1))>;
+
+//
+// setlt
+//
+def: SetCC_R16<setlt, SltCCRxRy16>;
+
+def: SetCC_I16<setlt, immSExt16, SltiCCRxImmX16>;
+
+//
+// setne
+//
+def : Mips16Pat
+ <(setne CPU16Regs:$lhs,CPU16Regs:$rhs),
+ (SltuCCRxRy16 (LiRxImmX16 0),
+ (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs))>;
+
+
+//
+// setuge
+//
+def: Mips16Pat
+ <(setuge CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (LiRxImmX16 1))>;
+
+// this pattern will never be used because the compiler will transform
+// x >= k to x > (k - 1) and then use SLT
+//
+//def: Mips16Pat
+// <(setuge CPU16Regs:$lhs, immZExt16:$rhs),
+// (XorRxRxRy16 (SltiuCCRxImmX16 CPU16Regs:$lhs, immZExt16:$rhs),
+// (LiRxImmX16 1))>;
+
+//
+// setugt
+//
+def: Mips16Pat
+ <(setugt CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setule
+//
+def: Mips16Pat
+ <(setule CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImmX16 1))>;
+
+//
+// setult
+//
+def: SetCC_R16<setult, SltuCCRxRy16>;
+
+def: SetCC_I16<setult, immSExt16, SltiuCCRxImmX16>;
+
+def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)),
+ (AddiuRxRxImmX16 CPU16Regs:$hi, tglobaladdr:$lo)>;
+
+// hi/lo relocs
+def : Mips16Pat<(MipsHi tblockaddress:$in),
+ (SllX16 (LiRxImmX16 tblockaddress:$in), 16)>;
+def : Mips16Pat<(MipsHi tglobaladdr:$in),
+ (SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>;
+def : Mips16Pat<(MipsHi tjumptable:$in),
+ (SllX16 (LiRxImmX16 tjumptable:$in), 16)>;
+def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
+ (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
+
+def : Mips16Pat<(MipsLo tblockaddress:$in), (LiRxImmX16 tblockaddress:$in)>;
+
+// wrapper_pic
+class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+ Mips16Pat<(MipsWrapper RC:$gp, node:$in),
+ (ADDiuOp RC:$gp, node:$in)>;
+
+
+def : Wrapper16Pat<tglobaladdr, AddiuRxRxImmX16, CPU16Regs>;
+def : Wrapper16Pat<tglobaltlsaddr, AddiuRxRxImmX16, CPU16Regs>;
+
+def : Mips16Pat<(i32 (extloadi8 addr16:$src)),
+ (LbuRxRyOffMemX16 addr16:$src)>;
+def : Mips16Pat<(i32 (extloadi16 addr16:$src)),
+ (LhuRxRyOffMemX16 addr16:$src)>;
+
+def: Mips16Pat<(trap), (Break16)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i8),
+ (SebRx16 CPU16Regs:$val)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i16),
+ (SehRx16 CPU16Regs:$val)>;
+
+def GotPrologue16:
+ MipsPseudo16<
+ (outs CPU16Regs:$rh, CPU16Regs:$rl),
+ (ins simm16:$immHi, simm16:$immLo),
+ "li\t$rh, $immHi\n\taddiu\t$rl, $$pc, $immLo\n ",[]> ;
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+ // let PrintMethod = "printCPInstOperand";
+}
+
+// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+// the function. The first operand is the ID# for this instruction, the second
+// is the index into the MachineConstantPool that this is, the third is the
+// size in bytes of this constant pool entry.
+//
+let hasSideEffects = 0, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+MipsPseudo16<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+ i32imm:$size), "foo", []>;
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
new file mode 100644
index 000000000000..44771cbe8be1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -0,0 +1,148 @@
+//===-- Mips16RegisterInfo.cpp - MIPS16 Register Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16RegisterInfo.h"
+#include "Mips.h"
+#include "Mips16InstrInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips16-registerinfo"
+
+Mips16RegisterInfo::Mips16RegisterInfo() : MipsRegisterInfo() {}
+
+bool Mips16RegisterInfo::requiresRegisterScavenging(
+ const MachineFunction &MF) const {
+ return false; // Register scavenging is never requested, whatever MF is.
+}
+bool Mips16RegisterInfo::requiresFrameIndexScavenging(
+ const MachineFunction &MF) const {
+ return false; // Frame-index scavenging is never requested, whatever MF is.
+}
+
+bool Mips16RegisterInfo::useFPForScavengingIndex(
+ const MachineFunction &MF) const {
+ return false; // The answer does not depend on MF.
+}
+
+bool Mips16RegisterInfo::saveScavengerRegister(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC,
+ unsigned Reg) const {
+ // Copy Reg into $t0 at I, then copy it back from $t0 at UseMI.
+ const MachineFunction &Fn = *MBB.getParent();
+ const TargetInstrInfo *InstrInfo = Fn.getSubtarget().getInstrInfo();
+ const DebugLoc NoLoc;
+ InstrInfo->copyPhysReg(MBB, I, NoLoc, Mips::T0, Reg, true);
+ InstrInfo->copyPhysReg(MBB, UseMI, NoLoc, Reg, Mips::T0, true);
+ return true;
+}
+
+const TargetRegisterClass *Mips16RegisterInfo::intRegClass(
+ unsigned Size) const {
+ assert(Size == 4); // Callers must pass Size == 4.
+ return &Mips::CPU16RegsRegClass;
+}
+
+void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+ unsigned OpNo, int FrameIndex,
+ uint64_t StackSize,
+ int64_t SPOffset) const {
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ // Frame-index range of the callee-saved slots; 0/-1 is the empty range.
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ int MinCSFI = 0;
+ int MaxCSFI = -1;
+
+ if (CSI.size()) {
+ MinCSFI = CSI[0].getFrameIdx();
+ MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+ }
+
+ // Select the register this frame object is addressed from:
+ // 1. Callee-saved register slots are always referenced relative to $sp.
+ // 2. Otherwise, if the function has a frame pointer, $s0 is the base.
+ // 3. Otherwise, if operand OpNo+2 exists and is a register, that register
+ // is the base.
+ // 4. Otherwise the base is $sp.
+ // NOTE(review): assumes CSI is ordered by frame index - TODO confirm.
+ unsigned FrameReg;
+
+ if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)
+ FrameReg = Mips::SP;
+ else {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ if (TFI->hasFP(MF)) {
+ FrameReg = Mips::S0;
+ }
+ else {
+ if ((MI.getNumOperands()> OpNo+2) && MI.getOperand(OpNo+2).isReg())
+ FrameReg = MI.getOperand(OpNo+2).getReg();
+ else
+ FrameReg = Mips::SP;
+ }
+ }
+ // Calculate the final offset:
+ // Offset = SPOffset + StackSize + immediate currently in operand OpNo+1.
+ // The stack size is added for every frame object that reaches this point.
+ // NOTE(review): outgoing arguments, pointers to dynamically allocated
+ // stack space and $gp restore locations get no special treatment here;
+ // presumably the caller's SPOffset already accounts for them - TODO
+ // confirm.
+ // IsKill becomes true only when the base register is replaced by the
+ // register that loadImmediate() returns.
+ int64_t Offset;
+ bool IsKill = false;
+ Offset = SPOffset + (int64_t)StackSize;
+ Offset += MI.getOperand(OpNo + 1).getImm();
+
+
+ DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+ // If validImmediate() rejects Offset, get a new base register and immediate.
+ if (!MI.isDebugValue() &&
+ !Mips16InstrInfo::validImmediate(MI.getOpcode(), FrameReg, Offset)) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = II->getDebugLoc();
+ unsigned NewImm;
+ const Mips16InstrInfo &TII =
+ *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm);
+ Offset = SignExtend64<16>(NewImm);
+ IsKill = true;
+ }
+ MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
+ MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h
new file mode 100644
index 000000000000..d67a79b64033
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h
@@ -0,0 +1,48 @@
+//===-- Mips16RegisterInfo.h - Mips16 Register Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16REGISTERINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16REGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+class Mips16InstrInfo;
+
+class Mips16RegisterInfo : public MipsRegisterInfo {
+public:
+ Mips16RegisterInfo();
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+ bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+
+ bool saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const override;
+
+ const TargetRegisterClass *intRegClass(unsigned Size) const override;
+
+private:
+ void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+ int FrameIndex, uint64_t StackSize,
+ int64_t SPOffset) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td
new file mode 100644
index 000000000000..516caa34fbf2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -0,0 +1,578 @@
+//=- Mips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips32r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class R6MMR6Rel;
+
+def MipsR62MicroMipsR6 : InstrMapping {
+ let FilterClass = "R6MMR6Rel";
+ // Instructions with the same BaseOpcode value form a row.
+ let RowFields = ["BaseOpcode"];
+ // Instructions with the same Arch value form a column.
+ let ColFields = ["Arch"];
+ // The key column is the instructions whose Arch is "mipsr6".
+ let KeyCol = ["mipsr6"];
+ // Value columns are Arch="mipsr6" and Arch="micromipsr6".
+ let ValueCols = [["mipsr6"], ["micromipsr6"]];
+}
+
+class MipsR6Arch<string opstr> {
+ string Arch = "mipsr6";
+ string BaseOpcode = opstr;
+}
+
+class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+ PredicateControl {
+ let DecoderNamespace = "Mips32r6_64r6";
+ let EncodingPredicates = [HasStdEnc];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Field Values
+//
+//===----------------------------------------------------------------------===//
+
+class OPGROUP<bits<6> Val> {
+ bits<6> Value = Val;
+}
+def OPGROUP_COP0 : OPGROUP<0b010000>;
+def OPGROUP_COP1 : OPGROUP<0b010001>;
+def OPGROUP_COP2 : OPGROUP<0b010010>;
+def OPGROUP_ADDI : OPGROUP<0b001000>;
+def OPGROUP_AUI : OPGROUP<0b001111>;
+def OPGROUP_BLEZ : OPGROUP<0b000110>;
+def OPGROUP_BGTZ : OPGROUP<0b000111>;
+def OPGROUP_BLEZL : OPGROUP<0b010110>;
+def OPGROUP_BGTZL : OPGROUP<0b010111>;
+def OPGROUP_DADDI : OPGROUP<0b011000>;
+def OPGROUP_DAUI : OPGROUP<0b011101>;
+def OPGROUP_PCREL : OPGROUP<0b111011>;
+def OPGROUP_REGIMM : OPGROUP<0b000001>;
+def OPGROUP_SPECIAL : OPGROUP<0b000000>;
+// The spec occasionally names this value LL, LLD, SC, or SCD.
+def OPGROUP_SPECIAL3 : OPGROUP<0b011111>;
+// The spec names this constant LWC2, LDC2, SWC2, and SDC2 in different places.
+def OPGROUP_COP2LDST : OPGROUP<0b010010>;
+
+class OPCODE2<bits<2> Val> {
+ bits<2> Value = Val;
+}
+def OPCODE2_ADDIUPC : OPCODE2<0b00>;
+def OPCODE2_LWPC : OPCODE2<0b01>;
+def OPCODE2_LWUPC : OPCODE2<0b10>;
+
+class OPCODE3<bits<3> Val> {
+ bits<3> Value = Val;
+}
+def OPCODE3_LDPC : OPCODE3<0b110>;
+
+class OPCODE5<bits<5> Val> {
+ bits<5> Value = Val;
+}
+def OPCODE5_ALUIPC : OPCODE5<0b11111>;
+def OPCODE5_AUIPC : OPCODE5<0b11110>;
+def OPCODE5_DAHI : OPCODE5<0b00110>;
+def OPCODE5_DATI : OPCODE5<0b11110>;
+def OPCODE5_BC1EQZ : OPCODE5<0b01001>;
+def OPCODE5_BC1NEZ : OPCODE5<0b01101>;
+def OPCODE5_BC2EQZ : OPCODE5<0b01001>;
+def OPCODE5_BC2NEZ : OPCODE5<0b01101>;
+def OPCODE5_BGEZAL : OPCODE5<0b10001>;
+// The next four constants are unnamed in the spec. These names are taken from
+// the OPGROUP names they are used with.
+def OPCODE5_LDC2 : OPCODE5<0b01110>;
+def OPCODE5_LWC2 : OPCODE5<0b01010>;
+def OPCODE5_SDC2 : OPCODE5<0b01111>;
+def OPCODE5_SWC2 : OPCODE5<0b01011>;
+
+class OPCODE6<bits<6> Val> {
+ bits<6> Value = Val;
+}
+def OPCODE6_ALIGN : OPCODE6<0b100000>;
+def OPCODE6_DALIGN : OPCODE6<0b100100>;
+def OPCODE6_BITSWAP : OPCODE6<0b100000>;
+def OPCODE6_DBITSWAP : OPCODE6<0b100100>;
+def OPCODE6_JALR : OPCODE6<0b001001>;
+def OPCODE6_CACHE : OPCODE6<0b100101>;
+def OPCODE6_PREF : OPCODE6<0b110101>;
+// The next four constants are unnamed in the spec. These names are taken from
+// the OPGROUP names they are used with.
+def OPCODE6_LL : OPCODE6<0b110110>;
+def OPCODE6_LLD : OPCODE6<0b110111>;
+def OPCODE6_SC : OPCODE6<0b100110>;
+def OPCODE6_SCD : OPCODE6<0b100111>;
+def OPCODE6_CLO : OPCODE6<0b010001>;
+def OPCODE6_CLZ : OPCODE6<0b010000>;
+def OPCODE6_DCLO : OPCODE6<0b010011>;
+def OPCODE6_DCLZ : OPCODE6<0b010010>;
+def OPCODE6_LSA : OPCODE6<0b000101>;
+def OPCODE6_DLSA : OPCODE6<0b010101>;
+def OPCODE6_SDBBP : OPCODE6<0b001110>;
+
+class FIELD_FMT<bits<5> Val> {
+ bits<5> Value = Val;
+}
+def FIELD_FMT_S : FIELD_FMT<0b10000>;
+def FIELD_FMT_D : FIELD_FMT<0b10001>;
+
+class FIELD_CMP_COND<bits<5> Val> {
+ bits<5> Value = Val;
+}
+// Note: The CMP_COND_FMT names differ from the C_COND_FMT names.
+def FIELD_CMP_COND_AF : FIELD_CMP_COND<0b00000>;
+def FIELD_CMP_COND_UN : FIELD_CMP_COND<0b00001>;
+def FIELD_CMP_COND_EQ : FIELD_CMP_COND<0b00010>;
+def FIELD_CMP_COND_UEQ : FIELD_CMP_COND<0b00011>;
+def FIELD_CMP_COND_LT : FIELD_CMP_COND<0b00100>;
+def FIELD_CMP_COND_ULT : FIELD_CMP_COND<0b00101>;
+def FIELD_CMP_COND_LE : FIELD_CMP_COND<0b00110>;
+def FIELD_CMP_COND_ULE : FIELD_CMP_COND<0b00111>;
+def FIELD_CMP_COND_SAF : FIELD_CMP_COND<0b01000>;
+def FIELD_CMP_COND_SUN : FIELD_CMP_COND<0b01001>;
+def FIELD_CMP_COND_SEQ : FIELD_CMP_COND<0b01010>;
+def FIELD_CMP_COND_SUEQ : FIELD_CMP_COND<0b01011>;
+def FIELD_CMP_COND_SLT : FIELD_CMP_COND<0b01100>;
+def FIELD_CMP_COND_SULT : FIELD_CMP_COND<0b01101>;
+def FIELD_CMP_COND_SLE : FIELD_CMP_COND<0b01110>;
+def FIELD_CMP_COND_SULE : FIELD_CMP_COND<0b01111>;
+
+class FIELD_CMP_FORMAT<bits<5> Val> {
+ bits<5> Value = Val;
+}
+def FIELD_CMP_FORMAT_S : FIELD_CMP_FORMAT<0b10100>;
+def FIELD_CMP_FORMAT_D : FIELD_CMP_FORMAT<0b10101>;
+
+//===----------------------------------------------------------------------===//
+//
+// Disambiguators
+//
+//===----------------------------------------------------------------------===//
+//
+// Some encodings are ambiguous except by comparing field values.
+
+class DecodeDisambiguates<string Name> {
+ string DecoderMethod = !strconcat("Decode", Name);
+}
+
+class DecodeDisambiguatedBy<string Name> : DecodeDisambiguates<Name> { // inherits DecoderMethod = "Decode" + Name
+ string DecoderNamespace = "Mips32r6_64r6_Ambiguous"; // differs from the "Mips32r6_64r6" namespace set in MipsR6Inst
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
+class AUI_FM : MipsR6Inst {
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_AUI.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm;
+}
+
+class DAUI_FM : AUI_FM {
+ let Inst{31-26} = OPGROUP_DAUI.Value;
+}
+
+class BAL_FM : MipsR6Inst {
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_REGIMM.Value;
+ let Inst{25-21} = 0b00000;
+ let Inst{20-16} = OPCODE5_BGEZAL.Value;
+ let Inst{15-0} = offset;
+}
+
+class COP0_EVP_DVP_FM<bits<1> sc> : MipsR6Inst { // COP0 encoding shared by DVP_ENC and EVP_ENC
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP0.Value;
+ let Inst{25-21} = 0b01011;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = 0b00000;
+ let Inst{10-6} = 0b00000;
+ let Inst{5} = sc; // the only bit that differs: DVP_ENC passes 1, EVP_ENC passes 0
+ let Inst{4-3} = 0b00;
+ let Inst{2-0} = 0b100;
+}
+
+class COP1_2R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst {
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP1.Value;
+ let Inst{25-21} = Format.Value;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = fd;
+ let Inst{5-0} = funct;
+}
+
+class COP1_3R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP1.Value;
+ let Inst{25-21} = Format.Value;
+ let Inst{20-16} = ft;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = fd;
+ let Inst{5-0} = funct;
+}
+
+class COP1_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst {
+ bits<5> ft;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP1.Value;
+ let Inst{25-21} = Operation.Value;
+ let Inst{20-16} = ft;
+ let Inst{15-0} = offset;
+}
+
+class COP2_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst {
+ bits<5> ct;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP2.Value;
+ let Inst{25-21} = Operation.Value;
+ let Inst{20-16} = ct;
+ let Inst{15-0} = offset;
+}
+
+class PCREL16_FM<OPCODE5 Operation> : MipsR6Inst {
+ bits<5> rs;
+ bits<16> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_PCREL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = Operation.Value;
+ let Inst{15-0} = imm;
+}
+
+class PCREL19_FM<OPCODE2 Operation> : MipsR6Inst {
+ bits<5> rs;
+ bits<19> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_PCREL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-19} = Operation.Value;
+ let Inst{18-0} = imm;
+}
+
+class PCREL18_FM<OPCODE3 Operation> : MipsR6Inst {
+ bits<5> rs;
+ bits<18> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_PCREL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-18} = Operation.Value;
+ let Inst{17-0} = imm;
+}
+
+class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = 0b00000;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0b00000;
+ let Inst{5-0} = Operation.Value;
+}
+
+class SPECIAL3_MEM_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = base;
+ let Inst{20-16} = hint;
+ let Inst{15-7} = offset;
+ let Inst{6} = 0;
+ let Inst{5-0} = Operation.Value;
+}
+
+class SPECIAL_2R_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0b00001;
+ let Inst{5-0} = Operation.Value;
+}
+
+class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = mulop;
+ let Inst{5-0} = funct;
+}
+
+class SPECIAL_SDBBP_FM : MipsR6Inst {
+ bits<20> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL.Value;
+ let Inst{25-6} = code_;
+ let Inst{5-0} = OPCODE6_SDBBP.Value;
+}
+
+// This class is ambiguous with other branches:
+// BEQC/BNEC require that rs < rt && rs != 0
+class CMP_BRANCH_2R_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = offset;
+}
+
+// This class is ambiguous with other branches:
+// BLEZC/BGTZC/BLEZALC/BGTZALC/BEQZALC/BNEZALC require rs == 0 && rt != 0
+// The '1R_RT' in the name means 1 register in the rt field.
+class CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct.Value;
+ let Inst{25-21} = 0b00000; // rs field is fixed at zero
+ let Inst{20-16} = rt;
+ let Inst{15-0} = offset;
+}
+
+// This class is ambiguous with other branches:
+// BLTZC/BGEZC/BLTZALC/BGEZALC require that rs == rt && rt != 0
+// The '1R_BOTH' in the name means 1 register in both the rs and rt fields.
+class CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct.Value;
+ let Inst{25-21} = rt; // rs field repeats rt
+ let Inst{20-16} = rt;
+ let Inst{15-0} = offset;
+}
+
+class CMP_BRANCH_OFF21_FM<bits<6> funct> : MipsR6Inst {
+ bits<5> rs; // rs != 0
+ bits<21> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = rs;
+ let Inst{20-0} = offset;
+}
+
+class JMP_IDX_COMPACT_FM<bits<6> funct> : MipsR6Inst {
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = 0b00000;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = offset;
+}
+
+class BRANCH_OFF26_FM<bits<6> funct> : MipsR6Inst {
+ bits<32> Inst;
+ bits<26> offset;
+
+ let Inst{31-26} = funct;
+ let Inst{25-0} = offset;
+}
+
+class SPECIAL3_ALIGN_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+ bits<2> bp;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-8} = 0b010;
+ let Inst{7-6} = bp;
+ let Inst{5-0} = Operation.Value;
+}
+
+class SPECIAL3_DALIGN_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+ bits<3> bp;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-9} = 0b01;
+ let Inst{8-6} = bp;
+ let Inst{5-0} = Operation.Value;
+}
+
+class SPECIAL3_LL_SC_FM<OPCODE6 Operation> : MipsR6Inst { // SPECIAL3 encoding used by LL_R6_ENC and SC_R6_ENC
+ bits<5> rt;
+ bits<21> addr; // combined address operand; base and offset are sliced out of it below
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = base;
+ let Inst{20-16} = rt;
+ let Inst{15-7} = offset; // NOTE(review): Inst{6} is never assigned here, unlike SPECIAL3_MEM_FM - confirm intended
+ let Inst{5-0} = Operation.Value;
+
+ string DecoderMethod = "DecodeSpecial3LlSc";
+}
+
+class SPECIAL_LSA_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+ bits<2> imm2;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-8} = 0b000;
+ let Inst{7-6} = imm2;
+ let Inst{5-0} = Operation.Value;
+}
+
+class REGIMM_FM<OPCODE5 Operation> : MipsR6Inst {
+ bits<5> rs;
+ bits<16> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_REGIMM.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = Operation.Value;
+ let Inst{15-0} = imm;
+}
+
+class COP1_CMP_CONDN_FM<FIELD_CMP_FORMAT Format,
+ FIELD_CMP_COND Cond> : MipsR6Inst {
+ bits<5> fd;
+ bits<5> fs;
+ bits<5> ft;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP1.Value;
+ let Inst{25-21} = Format.Value;
+ let Inst{20-16} = ft;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = fd;
+ let Inst{5} = 0;
+ let Inst{4-0} = Cond.Value;
+}
+
+class JR_HB_R6_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = 0;
+ let Inst{10} = 1;
+ let Inst{9-6} = 0;
+ let Inst{5-0} = Operation.Value;
+}
+
+class COP2LDST_FM<OPCODE5 Operation> : MipsR6Inst { // shared by the LDC2/LWC2/SDC2/SWC2 R6 encodings
+ bits<5> rt;
+ bits<21> addr; // combined address operand; base and offset are sliced out of it below
+ bits<5> base = addr{20-16};
+ bits<11> offset = addr{10-0}; // 11-bit offset, wider than the 9-bit one in the SPECIAL3 memory formats
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP2LDST.Value;
+ let Inst{25-21} = Operation.Value; // the operation selector sits where other formats place rs/base
+ let Inst{20-16} = rt;
+ let Inst{15-11} = base;
+ let Inst{10-0} = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
new file mode 100644
index 000000000000..1b4d73b79895
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -0,0 +1,1004 @@
+//=- Mips32r6InstrInfo.td - Mips32r6 Instruction Information -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips32r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+include "Mips32r6InstrFormats.td"
+
+// Notes about removals/changes from MIPS32r6:
+// Reencoded: jr -> jalr
+// Reencoded: jr.hb -> jalr.hb
+
+def brtarget21 : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTarget21OpValue";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget21";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget26 : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTarget26OpValue";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget26";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def jmpoffset16 : Operand<OtherVT> { // offset operand of JIC_DESC
+ let EncoderMethod = "getJumpOffset16OpValue"; // same encoder as calloffset16
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def calloffset16 : Operand<iPTR> { // offset operand of JIALC_DESC; iPTR-typed where jmpoffset16 is OtherVT
+ let EncoderMethod = "getJumpOffset16OpValue"; // same encoder as jmpoffset16
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class ADDIUPC_ENC : PCREL19_FM<OPCODE2_ADDIUPC>;
+class ALIGN_ENC : SPECIAL3_ALIGN_FM<OPCODE6_ALIGN>;
+class ALUIPC_ENC : PCREL16_FM<OPCODE5_ALUIPC>;
+class AUI_ENC : AUI_FM;
+class AUIPC_ENC : PCREL16_FM<OPCODE5_AUIPC>;
+
+class BAL_ENC : BAL_FM;
+class BALC_ENC : BRANCH_OFF26_FM<0b111010>;
+class BC_ENC : BRANCH_OFF26_FM<0b110010>;
+class BEQC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
+ DecodeDisambiguates<"AddiGroupBranch">;
+class BEQZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_ADDI>,
+ DecodeDisambiguatedBy<"AddiGroupBranch">; // ADDI opcode group, like BEQC/BOVC
+class BNEC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>,
+ DecodeDisambiguates<"DaddiGroupBranch">;
+class BNEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_DADDI>,
+ DecodeDisambiguatedBy<"DaddiGroupBranch">;
+
+class BLTZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZL>,
+ DecodeDisambiguates<"BgtzlGroupBranch">;
+class BGEC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZL>,
+ DecodeDisambiguatedBy<"BlezlGroupBranch">;
+class BGEUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZ>,
+ DecodeDisambiguatedBy<"BlezGroupBranch">;
+class BGEZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZL>,
+ DecodeDisambiguates<"BlezlGroupBranch">;
+class BGTZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZ>,
+ DecodeDisambiguatedBy<"BgtzGroupBranch">;
+
+class BLTC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZL>,
+ DecodeDisambiguatedBy<"BgtzlGroupBranch">;
+class BLTUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZ>,
+ DecodeDisambiguatedBy<"BgtzGroupBranch">;
+
+class BLEZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZL>,
+ DecodeDisambiguatedBy<"BlezlGroupBranch">;
+class BLTZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZ>,
+ DecodeDisambiguates<"BgtzGroupBranch">;
+class BGTZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZL>,
+ DecodeDisambiguatedBy<"BgtzlGroupBranch">;
+
+class BEQZC_ENC : CMP_BRANCH_OFF21_FM<0b110110>;
+class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZ>,
+ DecodeDisambiguates<"BlezGroupBranch">;
+class BNEZC_ENC : CMP_BRANCH_OFF21_FM<0b111110>;
+
+class BC1EQZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1EQZ>;
+class BC1NEZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1NEZ>;
+class BC2EQZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2EQZ>;
+class BC2NEZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2NEZ>;
+
+class DVP_ENC : COP0_EVP_DVP_FM<0b1>;
+class EVP_ENC : COP0_EVP_DVP_FM<0b0>;
+
+class JIALC_ENC : JMP_IDX_COMPACT_FM<0b111110>;
+class JIC_ENC : JMP_IDX_COMPACT_FM<0b110110>;
+class JR_HB_R6_ENC : JR_HB_R6_FM<OPCODE6_JALR>;
+class BITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_BITSWAP>;
+class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZ>,
+ DecodeDisambiguatedBy<"BlezGroupBranch">;
+class BNVC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>,
+ DecodeDisambiguatedBy<"DaddiGroupBranch">;
+class BOVC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
+ DecodeDisambiguatedBy<"AddiGroupBranch">;
+class DIV_ENC : SPECIAL_3R_FM<0b00010, 0b011010>;
+class DIVU_ENC : SPECIAL_3R_FM<0b00010, 0b011011>;
+class MOD_ENC : SPECIAL_3R_FM<0b00011, 0b011010>;
+class MODU_ENC : SPECIAL_3R_FM<0b00011, 0b011011>;
+class MUH_ENC : SPECIAL_3R_FM<0b00011, 0b011000>;
+class MUHU_ENC : SPECIAL_3R_FM<0b00011, 0b011001>;
+class MUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011000>;
+class MULU_ENC : SPECIAL_3R_FM<0b00010, 0b011001>;
+
+class MADDF_S_ENC : COP1_3R_FM<0b011000, FIELD_FMT_S>;
+class MADDF_D_ENC : COP1_3R_FM<0b011000, FIELD_FMT_D>;
+class MSUBF_S_ENC : COP1_3R_FM<0b011001, FIELD_FMT_S>;
+class MSUBF_D_ENC : COP1_3R_FM<0b011001, FIELD_FMT_D>;
+
+class SEL_D_ENC : COP1_3R_FM<0b010000, FIELD_FMT_D>;
+class SEL_S_ENC : COP1_3R_FM<0b010000, FIELD_FMT_S>;
+
+class SELEQZ_ENC : SPECIAL_3R_FM<0b00000, 0b110101>;
+class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>;
+
+class LWPC_ENC : PCREL19_FM<OPCODE2_LWPC>;
+class LWUPC_ENC : PCREL19_FM<OPCODE2_LWUPC>;
+
+class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
+class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
+class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>;
+class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>;
+
+class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>;
+class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>;
+class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
+class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
+
+class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>;
+class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>;
+class SELNEZ_S_ENC : COP1_3R_FM<0b010111, FIELD_FMT_S>;
+class SELNEZ_D_ENC : COP1_3R_FM<0b010111, FIELD_FMT_D>;
+
+class RINT_S_ENC : COP1_2R_FM<0b011010, FIELD_FMT_S>;
+class RINT_D_ENC : COP1_2R_FM<0b011010, FIELD_FMT_D>;
+class CLASS_S_ENC : COP1_2R_FM<0b011011, FIELD_FMT_S>;
+class CLASS_D_ENC : COP1_2R_FM<0b011011, FIELD_FMT_D>;
+
+class CACHE_ENC : SPECIAL3_MEM_FM<OPCODE6_CACHE>;
+class PREF_ENC : SPECIAL3_MEM_FM<OPCODE6_PREF>;
+
+class LDC2_R6_ENC : COP2LDST_FM<OPCODE5_LDC2>;
+class LWC2_R6_ENC : COP2LDST_FM<OPCODE5_LWC2>;
+class SDC2_R6_ENC : COP2LDST_FM<OPCODE5_SDC2>;
+class SWC2_R6_ENC : COP2LDST_FM<OPCODE5_SWC2>;
+
+class LSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_LSA>;
+
+class LL_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LL>;
+class SC_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SC>;
+
+class CLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLO>;
+class CLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLZ>;
+
+class SDBBP_R6_ENC : SPECIAL_SDBBP_FM;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Multiclasses
+//
+//===----------------------------------------------------------------------===//
+
+class CMP_CONDN_DESC_BASE<string CondStr, string Typestr,
+ RegisterOperand FGROpnd,
+ InstrItinClass Itin,
+ SDPatternOperator Op = null_frag> { // Op defaults to null_frag when the caller passes no operator
+ dag OutOperandList = (outs FGRCCOpnd:$fd);
+ dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+ string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft");
+ list<dag> Pattern = [(set FGRCCOpnd:$fd, (Op FGROpnd:$fs, FGROpnd:$ft))];
+ bit isCTI = 1; // NOTE(review): set on a compare that carries no branch flags - confirm intended
+ InstrItinClass Itinerary = Itin;
+}
+
+multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr,
+ RegisterOperand FGROpnd, InstrItinClass Itin>{
+ let AdditionalPredicates = [NotInMicroMips] in {
+ def CMP_F_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>,
+ CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd, Itin>,
+ MipsR6Arch<!strconcat("cmp.af.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_UN_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>,
+ CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, Itin, setuo>,
+ MipsR6Arch<!strconcat("cmp.un.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_EQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>,
+ CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, Itin,
+ setoeq>,
+ MipsR6Arch<!strconcat("cmp.eq.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_UEQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_UEQ>,
+ CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, Itin,
+ setueq>,
+ MipsR6Arch<!strconcat("cmp.ueq.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_LT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
+ CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, Itin,
+ setolt>,
+ MipsR6Arch<!strconcat("cmp.lt.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_ULT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_ULT>,
+ CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, Itin,
+ setult>,
+ MipsR6Arch<!strconcat("cmp.ult.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_LE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
+ CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, Itin,
+ setole>,
+ MipsR6Arch<!strconcat("cmp.le.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_ULE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_ULE>,
+ CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, Itin,
+ setule>,
+ MipsR6Arch<!strconcat("cmp.ule.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SAF_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SAF>,
+ CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd, Itin>,
+ MipsR6Arch<!strconcat("cmp.saf.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SUN_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SUN>,
+ CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd, Itin>,
+ MipsR6Arch<!strconcat("cmp.sun.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SEQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SEQ>,
+ CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd, Itin>,
+ MipsR6Arch<!strconcat("cmp.seq.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SUEQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SUEQ>,
+ CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd, Itin>,
+ MipsR6Arch<!strconcat("cmp.sueq.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SLT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SLT>,
+ CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd, Itin>,
+ MipsR6Arch<!strconcat("cmp.slt.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SULT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SULT>,
+ CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd, Itin>,
+ MipsR6Arch<!strconcat("cmp.sult.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SLE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SLE>,
+ CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd, Itin>,
+ MipsR6Arch<!strconcat("cmp.sle.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SULE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SULE>,
+ CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd, Itin>,
+ MipsR6Arch<!strconcat("cmp.sule.", Typestr)>,
+ ISA_MIPS32R6, HARDFLOAT;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class PCREL_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ Operand ImmOpnd, InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rs);
+ dag InOperandList = (ins ImmOpnd:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class ADDIUPC_DESC : PCREL_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2,
+ II_ADDIUPC>;
+class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2, II_LWPC>;
+class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2, II_LWUPC>;
+
+class ALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ Operand ImmOpnd, InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class ALIGN_DESC : ALIGN_DESC_BASE<"align", GPR32Opnd, uimm2, II_ALIGN>;
+
+class ALUIPC_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rs);
+ dag InOperandList = (ins simm16:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class ALUIPC_DESC : ALUIPC_DESC_BASE<"aluipc", GPR32Opnd, II_ALUIPC>;
+class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>;
+
+class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rs);
+ dag InOperandList = (ins GPROpnd:$rt, uimm16:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class AUI_DESC : AUI_DESC_BASE<"aui", GPR32Opnd, II_AUI>;
+
+class BRANCH_DESC_BASE { // common flags mixed into the branch description classes below
+ bit isBranch = 1;
+ bit isTerminator = 1;
+ bit hasDelaySlot = 0; // default; BAL_DESC and the COP1/COP2 BCCZ bases set it back to 1
+ bit isCTI = 1;
+}
+
+class BC_DESC_BASE<string instr_asm, DAGOperand opnd> : BRANCH_DESC_BASE,
+ MipsR6Arch<instr_asm> { // description base for bc, balc and bal: a single $offset operand
+ dag InOperandList = (ins opnd:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(instr_asm, "\t$offset");
+ bit isBarrier = 1;
+ InstrItinClass Itinerary = II_BC; // default; BALC_DESC replaces it with II_BALC
+ bit isCTI = 1; // redundant: BRANCH_DESC_BASE already sets isCTI = 1
+}
+
+class CMP_BC_DESC_BASE<string instr_asm, DAGOperand opnd,
+ RegisterOperand GPROpnd> : BRANCH_DESC_BASE,
+ MipsR6Arch<instr_asm> {
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $offset");
+ list<Register> Defs = [AT];
+ InstrItinClass Itinerary = II_BCCC;
+ bit hasForbiddenSlot = 1;
+ bit isCTI = 1;
+}
+
+class CMP_CBR_EQNE_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
+ RegisterOperand GPROpnd>
+ : BRANCH_DESC_BASE, MipsR6Arch<instr_asm> {
+ dag InOperandList = (ins GPROpnd:$rs, opnd:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $offset");
+ list<Register> Defs = [AT];
+ InstrItinClass Itinerary = II_BCCZC;
+ bit hasForbiddenSlot = 1;
+ bit isCTI = 1;
+}
+
+class CMP_CBR_RT_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
+ RegisterOperand GPROpnd>
+ : BRANCH_DESC_BASE, MipsR6Arch<instr_asm> {
+ dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $offset");
+ list<Register> Defs = [AT];
+ InstrItinClass Itinerary = II_BCCZC;
+ bit hasForbiddenSlot = 1;
+ bit isCTI = 1;
+}
+
+class BAL_DESC : BC_DESC_BASE<"bal", brtarget> {
+ bit isCall = 1;
+ bit hasDelaySlot = 1;
+ list<Register> Defs = [RA];
+ bit isCTI = 1;
+}
+
+class BALC_DESC : BC_DESC_BASE<"balc", brtarget26> {
+ bit isCall = 1;
+ list<Register> Defs = [RA];
+ InstrItinClass Itinerary = II_BALC;
+ bit isCTI = 1;
+}
+
+class BC_DESC : BC_DESC_BASE<"bc", brtarget26>;
+class BGEC_DESC : CMP_BC_DESC_BASE<"bgec", brtarget, GPR32Opnd>;
+class BGEUC_DESC : CMP_BC_DESC_BASE<"bgeuc", brtarget, GPR32Opnd>;
+class BEQC_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR32Opnd>;
+class BNEC_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR32Opnd>;
+
+class BLTC_DESC : CMP_BC_DESC_BASE<"bltc", brtarget, GPR32Opnd>;
+class BLTUC_DESC : CMP_BC_DESC_BASE<"bltuc", brtarget, GPR32Opnd>;
+
+class BLTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR32Opnd>;
+class BGEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR32Opnd>;
+
+class BLEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezc", brtarget, GPR32Opnd>;
+class BGTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzc", brtarget, GPR32Opnd>;
+
+class BEQZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21, GPR32Opnd>;
+class BNEZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21, GPR32Opnd>;
+
+class COP1_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE { // description base for bc1eqz / bc1nez
+ dag InOperandList = (ins FGR64Opnd:$ft, brtarget:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = instr_asm; // subclasses pass the complete asm string, operands included
+ bit hasDelaySlot = 1; // overrides the 0 set in BRANCH_DESC_BASE
+ InstrItinClass Itinerary = II_BC1CCZ;
+}
+
+class BC1EQZ_DESC : COP1_BCCZ_DESC_BASE<"bc1eqz $ft, $offset">;
+class BC1NEZ_DESC : COP1_BCCZ_DESC_BASE<"bc1nez $ft, $offset">;
+
+class COP2_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE {
+ dag InOperandList = (ins COP2Opnd:$ct, brtarget:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = instr_asm;
+ bit hasDelaySlot = 1;
+ bit isCTI = 1;
+ InstrItinClass Itinerary = II_BC2CCZ;
+}
+
+class BC2EQZ_DESC : COP2_BCCZ_DESC_BASE<"bc2eqz $ct, $offset">;
+class BC2NEZ_DESC : COP2_BCCZ_DESC_BASE<"bc2nez $ct, $offset">;
+
+class BOVC_DESC : CMP_BC_DESC_BASE<"bovc", brtarget, GPR32Opnd>;
+class BNVC_DESC : CMP_BC_DESC_BASE<"bnvc", brtarget, GPR32Opnd>;
+
+class JMP_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd,
+ RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary>
+ : MipsR6Arch<opstr> {
+ dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+ string AsmString = !strconcat(opstr, "\t$rt, $offset");
+ list<dag> Pattern = [];
+ bit hasDelaySlot = 0;
+ InstrItinClass Itinerary = itin;
+ bit isCTI = 1;
+ bit isBranch = 1;
+ bit isIndirectBranch = 1;
+}
+
+class JIALC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
+ GPR32Opnd, II_JIALC> {
+ bit isCall = 1;
+ list<Register> Defs = [RA];
+}
+
+class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16,
+ GPR32Opnd, II_JIALC> {
+ bit isBarrier = 1;
+ bit isTerminator = 1;
+ list<Register> Defs = [AT];
+}
+
+class JR_HB_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
+ InstrItinClass Itinerary = II_JR_HB;
+ bit isCTI = 1;
+ bit isBranch = 1;
+ bit isIndirectBranch = 1;
+ bit isTerminator = 1;
+ bit isBarrier = 1;
+ bit hasDelaySlot = 1;
+}
+
+class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd, II_BITSWAP>;
+
+class DIVMOD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin,
+ SDPatternOperator Op=null_frag>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+ InstrItinClass Itinerary = itin;
+ // The instruction itself does not trap on division by zero, so a custom
+ // inserter is requested to add the teq instructions that perform the check.
+ bit usesCustomInserter = 1;
+}
+
+class DVPEVP_DESC_BASE<string instr_asm, InstrItinClass Itin>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins);
+ string AsmString = !strconcat(instr_asm, "\t$rt");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = Itin;
+ bit hasUnModeledSideEffects = 1;
+}
+
+class DVP_DESC : DVPEVP_DESC_BASE<"dvp", II_DVP>;
+class EVP_DESC : DVPEVP_DESC_BASE<"evp", II_EVP>;
+
+class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd, II_DIV, sdiv>;
+class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd, II_DIVU, udiv>;
+class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd, II_MOD, srem>;
+class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd, II_MODU, urem>;
+
+class BEQZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"beqzalc", brtarget, GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BGEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezalc", brtarget, GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BGTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzalc", brtarget, GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BLEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezalc", brtarget, GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BLTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzalc", brtarget, GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class BNEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bnezalc", brtarget, GPR32Opnd> {
+ list<Register> Defs = [RA];
+}
+
+class MUL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin,
+ SDPatternOperator Op=null_frag> : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd, II_MUH, mulhs>;
+class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd, II_MUHU, mulhu>;
+class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd, II_MUL, mul>;
+class MULU_DESC : MUL_R6_DESC_BASE<"mulu", GPR32Opnd, II_MULU>;
+
+class COP1_SEL_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs FGROpnd:$fd);
+ dag InOperandList = (ins FGRCCOpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
+ string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+ list<dag> Pattern = [(set FGROpnd:$fd, (select FGRCCOpnd:$fd_in,
+ FGROpnd:$ft,
+ FGROpnd:$fs))];
+ string Constraints = "$fd_in = $fd";
+ InstrItinClass Itinerary = itin;
+}
+
+class SEL_D_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd, II_SEL_D>,
+ MipsR6Arch<"sel.d"> {
+ // We must insert a SUBREG_TO_REG around $fd_in
+ bit usesCustomInserter = 1;
+}
+class SEL_S_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd, II_SEL_S>,
+ MipsR6Arch<"sel.s">;
+
+class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = II_SELCCZ;
+}
+
+class SELEQZ_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR32Opnd>;
+class SELNEZ_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR32Opnd>;
+
+class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs FGROpnd:$fd);
+ dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
+ string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+ list<dag> Pattern = [];
+ string Constraints = "$fd_in = $fd";
+ InstrItinClass Itinerary = itin;
+}
+
+class MADDF_S_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd, II_MADDF_S>;
+class MADDF_D_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd, II_MADDF_D>;
+class MSUBF_S_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd, II_MSUBF_S>;
+class MSUBF_D_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd, II_MSUBF_D>;
+
+class MAX_MIN_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs FGROpnd:$fd);
+ dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+ string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class MAX_S_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd, II_MAX_S>;
+class MAX_D_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd, II_MAX_D>;
+class MIN_S_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd, II_MIN_S>;
+class MIN_D_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd, II_MIN_D>;
+
+class MAXA_S_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd, II_MAX_S>;
+class MAXA_D_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd, II_MAX_D>;
+class MINA_S_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd, II_MIN_S>;
+class MINA_D_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd, II_MIN_D>;
+
+class SELEQNEZ_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs FGROpnd:$fd);
+ dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+ string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class SELEQZ_S_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd, II_SELCCZ_S>,
+ MipsR6Arch<"seleqz.s">;
+class SELEQZ_D_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd, II_SELCCZ_D>,
+ MipsR6Arch<"seleqz.d">;
+class SELNEZ_S_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd, II_SELCCZ_S>,
+ MipsR6Arch<"selnez.s">;
+class SELNEZ_D_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd, II_SELCCZ_D>,
+ MipsR6Arch<"selnez.d">;
+
+class CLASS_RINT_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs FGROpnd:$fd);
+ dag InOperandList = (ins FGROpnd:$fs);
+ string AsmString = !strconcat(instr_asm, "\t$fd, $fs");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class RINT_S_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd, II_RINT_S>;
+class RINT_D_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd, II_RINT_D>;
+class CLASS_S_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd, II_CLASS_S>;
+class CLASS_D_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd, II_CLASS_D>;
+
+class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd,
+ RegisterOperand GPROpnd, InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
+ string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeCacheeOp_CacheOpR6";
+ InstrItinClass Itinerary = itin;
+}
+
+class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd, II_CACHE>;
+class PREF_DESC : CACHE_HINT_DESC<"pref", mem_simm9, GPR32Opnd, II_PREF>;
+
+class COP2LD_DESC_BASE<string instr_asm, RegisterOperand COPOpnd,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs COPOpnd:$rt);
+ dag InOperandList = (ins mem_simm11:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayLoad = 1;
+ string DecoderMethod = "DecodeFMemCop2R6";
+ InstrItinClass Itinerary = itin;
+}
+
+class LDC2_R6_DESC : COP2LD_DESC_BASE<"ldc2", COP2Opnd, II_LDC2>;
+class LWC2_R6_DESC : COP2LD_DESC_BASE<"lwc2", COP2Opnd, II_LWC2>;
+
+class COP2ST_DESC_BASE<string instr_asm, RegisterOperand COPOpnd,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins COPOpnd:$rt, mem_simm11:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayStore = 1;
+ string DecoderMethod = "DecodeFMemCop2R6";
+ InstrItinClass Itinerary = itin;
+}
+
+class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd, II_SDC2>;
+class SWC2_R6_DESC : COP2ST_DESC_BASE<"swc2", COP2Opnd, II_SWC2>;
+
+class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ Operand ImmOpnd, InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$imm2);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $imm2");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1, II_LSA>;
+
+class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ Operand MemOpnd, InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins MemOpnd:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayLoad = 1;
+ InstrItinClass Itinerary = itin;
+}
+
+class LL_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9, II_LL>;
+
+class SC_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPROpnd:$dst);
+ dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayStore = 1;
+ string Constraints = "$rt = $dst";
+ InstrItinClass Itinerary = itin;
+}
+
+class SC_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd, II_SC>;
+
+class CLO_CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+ InstrItinClass Itinerary = itin;
+}
+
+class CLO_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> :
+ CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd, itin> {
+ list<dag> Pattern = [(set GPROpnd:$rd, (ctlz (not GPROpnd:$rs)))];
+}
+
+class CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> :
+ CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd, itin> {
+ list<dag> Pattern = [(set GPROpnd:$rd, (ctlz GPROpnd:$rs))];
+}
+
+class CLO_R6_DESC : CLO_R6_DESC_BASE<"clo", GPR32Opnd, II_CLO>;
+class CLZ_R6_DESC : CLZ_R6_DESC_BASE<"clz", GPR32Opnd, II_CLZ>;
+
+class SDBBP_R6_DESC {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins uimm20:$code_);
+ string AsmString = "sdbbp\t$code_";
+ list<dag> Pattern = [];
+ bit isCTI = 1;
+ InstrItinClass Itinerary = II_SDBBP;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+def ADDIUPC : R6MMR6Rel, ADDIUPC_ENC, ADDIUPC_DESC, ISA_MIPS32R6;
+def ALIGN : R6MMR6Rel, ALIGN_ENC, ALIGN_DESC, ISA_MIPS32R6;
+def ALUIPC : R6MMR6Rel, ALUIPC_ENC, ALUIPC_DESC, ISA_MIPS32R6;
+def AUI : R6MMR6Rel, AUI_ENC, AUI_DESC, ISA_MIPS32R6;
+def AUIPC : R6MMR6Rel, AUIPC_ENC, AUIPC_DESC, ISA_MIPS32R6;
+def BAL : BAL_ENC, BAL_DESC, ISA_MIPS32R6;
+def BALC : R6MMR6Rel, BALC_ENC, BALC_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def BC1EQZ : BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def BC2EQZ : BC2EQZ_ENC, BC2EQZ_DESC, ISA_MIPS32R6;
+ def BC2NEZ : BC2NEZ_ENC, BC2NEZ_DESC, ISA_MIPS32R6;
+}
+def BC : R6MMR6Rel, BC_ENC, BC_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def BEQC : R6MMR6Rel, BEQC_ENC, BEQC_DESC, ISA_MIPS32R6;
+ def BEQZALC : R6MMR6Rel, BEQZALC_ENC, BEQZALC_DESC, ISA_MIPS32R6;
+ def BEQZC : R6MMR6Rel, BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6;
+ def BGEC : R6MMR6Rel, BGEC_ENC, BGEC_DESC, ISA_MIPS32R6;
+ def BGEUC : R6MMR6Rel, BGEUC_ENC, BGEUC_DESC, ISA_MIPS32R6;
+ def BGEZALC : R6MMR6Rel, BGEZALC_ENC, BGEZALC_DESC, ISA_MIPS32R6;
+ def BGEZC : R6MMR6Rel, BGEZC_ENC, BGEZC_DESC, ISA_MIPS32R6;
+ def BGTZALC : R6MMR6Rel, BGTZALC_ENC, BGTZALC_DESC, ISA_MIPS32R6;
+ def BGTZC : R6MMR6Rel, BGTZC_ENC, BGTZC_DESC, ISA_MIPS32R6;
+}
+def BITSWAP : R6MMR6Rel, BITSWAP_ENC, BITSWAP_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def BLEZALC : R6MMR6Rel, BLEZALC_ENC, BLEZALC_DESC, ISA_MIPS32R6;
+ def BLEZC : R6MMR6Rel, BLEZC_ENC, BLEZC_DESC, ISA_MIPS32R6;
+ def BLTC : R6MMR6Rel, BLTC_ENC, BLTC_DESC, ISA_MIPS32R6;
+ def BLTUC : R6MMR6Rel, BLTUC_ENC, BLTUC_DESC, ISA_MIPS32R6;
+ def BLTZALC : R6MMR6Rel, BLTZALC_ENC, BLTZALC_DESC, ISA_MIPS32R6;
+ def BLTZC : R6MMR6Rel, BLTZC_ENC, BLTZC_DESC, ISA_MIPS32R6;
+ def BNEC : R6MMR6Rel, BNEC_ENC, BNEC_DESC, ISA_MIPS32R6;
+ def BNEZALC : R6MMR6Rel, BNEZALC_ENC, BNEZALC_DESC, ISA_MIPS32R6;
+ def BNEZC : R6MMR6Rel, BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
+ def BNVC : R6MMR6Rel, BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
+ def BOVC : R6MMR6Rel, BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
+}
+def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
+def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6;
+def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6;
+defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd, II_CMP_CC_S>;
+defm D : CMP_CC_M<FIELD_CMP_FORMAT_D, "d", FGR64Opnd, II_CMP_CC_D>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DIV : R6MMR6Rel, DIV_ENC, DIV_DESC, ISA_MIPS32R6;
+ def DIVU : R6MMR6Rel, DIVU_ENC, DIVU_DESC, ISA_MIPS32R6;
+}
+
+def DVP : R6MMR6Rel, DVP_ENC, DVP_DESC, ISA_MIPS32R6;
+def EVP : R6MMR6Rel, EVP_ENC, EVP_DESC, ISA_MIPS32R6;
+
+def JIALC : R6MMR6Rel, JIALC_ENC, JIALC_DESC, ISA_MIPS32R6;
+def JIC : R6MMR6Rel, JIC_ENC, JIC_DESC, ISA_MIPS32R6;
+def JR_HB_R6 : JR_HB_R6_ENC, JR_HB_R6_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LDC2_R6 : LDC2_R6_ENC, LDC2_R6_DESC, ISA_MIPS32R6;
+ def LL_R6 : LL_R6_ENC, LL_R6_DESC, PTR_32, ISA_MIPS32R6;
+}
+def LSA_R6 : R6MMR6Rel, LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6;
+}
+def LWPC : R6MMR6Rel, LWPC_ENC, LWPC_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LWUPC : R6MMR6Rel, LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6;
+ def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+
+ def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6;
+ def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6;
+
+ def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+
+ def MUH : R6MMR6Rel, MUH_ENC, MUH_DESC, ISA_MIPS32R6;
+ def MUHU : R6MMR6Rel, MUHU_ENC, MUHU_DESC, ISA_MIPS32R6;
+ def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6;
+ def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6;
+}
+def NAL; // BAL with rd=0
+def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SC_R6 : SC_R6_ENC, SC_R6_DESC, PTR_32, ISA_MIPS32R6;
+ def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6;
+ def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32;
+ def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32;
+ def SELEQZ_D : R6MMR6Rel, SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6,
+ HARDFLOAT;
+ def SELEQZ_S : R6MMR6Rel, SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6,
+ HARDFLOAT;
+ def SELNEZ_D : R6MMR6Rel, SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6,
+ HARDFLOAT;
+ def SELNEZ_S : R6MMR6Rel, SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6,
+ HARDFLOAT;
+ def SEL_D : R6MMR6Rel, SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SEL_S : R6MMR6Rel, SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
+ def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"dvp", (DVP ZERO), 0>, ISA_MIPS32R6;
+def : MipsInstAlias<"evp", (EVP ZERO), 0>, ISA_MIPS32R6;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6;
+def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6, GPR_32;
+}
+
+def : MipsInstAlias<"jrc $rs", (JIC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+def : MipsInstAlias<"jalrc $rs", (JIALC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
+}
+//===----------------------------------------------------------------------===//
+//
+// Patterns and Pseudo Instructions
+//
+//===----------------------------------------------------------------------===//
+
+// Comparisons built from another CMP_*: NOR-negated or with swapped operands.
+multiclass Cmp_Pats<ValueType VT, Instruction NOROp, Register ZEROReg> {
+def : MipsPat<(setone VT:$lhs, VT:$rhs),
+ (NOROp (!cast<Instruction>("CMP_UEQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+def : MipsPat<(seto VT:$lhs, VT:$rhs),
+ (NOROp (!cast<Instruction>("CMP_UN_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+def : MipsPat<(setune VT:$lhs, VT:$rhs),
+ (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+def : MipsPat<(seteq VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs)>;
+def : MipsPat<(setgt VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_LT_"#NAME) VT:$rhs, VT:$lhs)>;
+def : MipsPat<(setge VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_LE_"#NAME) VT:$rhs, VT:$lhs)>;
+def : MipsPat<(setlt VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_LT_"#NAME) VT:$lhs, VT:$rhs)>;
+def : MipsPat<(setle VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_LE_"#NAME) VT:$lhs, VT:$rhs)>;
+def : MipsPat<(setne VT:$lhs, VT:$rhs),
+ (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ defm S : Cmp_Pats<f32, NOR, ZERO>, ISA_MIPS32R6;
+ defm D : Cmp_Pats<f64, NOR, ZERO>, ISA_MIPS32R6;
+}
+
+// Integer selects expanded to OROp(SELEQZOp/SELNEZOp); RC is the value type.
+multiclass SelectInt_Pats<ValueType RC, Instruction OROp, Instruction XORiOp,
+ Instruction SLTiOp, Instruction SLTiuOp,
+ Instruction SELEQZOp, Instruction SELNEZOp,
+ SDPatternOperator imm_type, ValueType Opg> {
+// reg ==/!= immz: $cond itself is the selector.
+def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, RC:$f),
+ (OROp (SELEQZOp RC:$t, RC:$cond), (SELNEZOp RC:$f, RC:$cond))>;
+def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, RC:$f),
+ (OROp (SELNEZOp RC:$t, RC:$cond), (SELEQZOp RC:$f, RC:$cond))>;
+
+// reg ==/!= imm_type (immZExt16[_64]): XORiOp of $cond and $imm is the selector.
+def : MipsPat<(select (Opg (seteq RC:$cond, imm_type:$imm)), RC:$t, RC:$f),
+ (OROp (SELEQZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)),
+ (SELNEZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>;
+def : MipsPat<(select (Opg (setne RC:$cond, imm_type:$imm)), RC:$t, RC:$f),
+ (OROp (SELNEZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)),
+ (SELEQZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>;
+
+// reg > immSExt16Plus1: SLTi[u]Op of $cond and Plus1($imm) is the selector.
+def : MipsPat<(select (Opg (setgt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f),
+ (OROp (SELEQZOp RC:$t, (SLTiOp RC:$cond, (Plus1 imm:$imm))),
+ (SELNEZOp RC:$f, (SLTiOp RC:$cond, (Plus1 imm:$imm))))>;
+def : MipsPat<(select (Opg (setugt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f),
+ (OROp (SELEQZOp RC:$t, (SLTiuOp RC:$cond, (Plus1 imm:$imm))),
+ (SELNEZOp RC:$f, (SLTiuOp RC:$cond, (Plus1 imm:$imm))))>;
+
+def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, immz),
+ (SELEQZOp RC:$t, RC:$cond)>;
+def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, immz),
+ (SELNEZOp RC:$t, RC:$cond)>;
+def : MipsPat<(select (Opg (seteq RC:$cond, immz)), immz, RC:$f),
+ (SELNEZOp RC:$f, RC:$cond)>;
+def : MipsPat<(select (Opg (setne RC:$cond, immz)), immz, RC:$f),
+ (SELEQZOp RC:$f, RC:$cond)>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+defm : SelectInt_Pats<i32, OR, XORi, SLTi, SLTiu, SELEQZ, SELNEZ,
+ immZExt16, i32>, ISA_MIPS32R6;
+
+def : MipsPat<(select i32:$cond, i32:$t, i32:$f),
+ (OR (SELNEZ i32:$t, i32:$cond),
+ (SELEQZ i32:$f, i32:$cond))>,
+ ISA_MIPS32R6;
+def : MipsPat<(select i32:$cond, i32:$t, immz),
+ (SELNEZ i32:$t, i32:$cond)>,
+ ISA_MIPS32R6;
+def : MipsPat<(select i32:$cond, immz, i32:$f),
+ (SELEQZ i32:$f, i32:$cond)>,
+ ISA_MIPS32R6;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
new file mode 100644
index 000000000000..521e22fb7992
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
@@ -0,0 +1,772 @@
+//===- Mips64InstrInfo.td - Mips64 Instruction Information -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// shamt must fit in 6 bits.
+def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
+
+// True if the node's immediate fits in a 10-bit sign-extended target immediate,
+// as used by e.g. seqi, snei.
+def immSExt10_64 : PatLeaf<(i64 imm),
+ [{ return isInt<10>(N->getSExtValue()); }]>;
+
+def immZExt16_64 : PatLeaf<(i64 imm),
+ [{ return isUInt<16>(N->getZExtValue()); }]>;
+
+def immZExt5_64 : ImmLeaf<i64, [{ return Imm == (Imm & 0x1f); }]>;
+
+// Transformation function: get log2 of low 32 bits of immediate
+def Log2LO : SDNodeXForm<imm, [{
+ return getImm(N, Log2_64((unsigned) N->getZExtValue()));
+}]>;
+
+// Transformation function: get log2 of high 32 bits of immediate
+def Log2HI : SDNodeXForm<imm, [{
+ return getImm(N, Log2_64((unsigned) (N->getZExtValue() >> 32)));
+}]>;
+
+// Predicate: True if immediate is a power of 2 that fits in the low 32 bits
+def PowerOf2LO : PatLeaf<(imm), [{
+ if (N->getValueType(0) == MVT::i64) {
+ uint64_t Imm = N->getZExtValue();
+ return isPowerOf2_64(Imm) && (Imm & 0xffffffff) == Imm;
+ }
+ else
+ return false;
+}]>;
+
+// Predicate: True if immediate is a power of 2 with its set bit in bits 32-63
+def PowerOf2HI : PatLeaf<(imm), [{
+ if (N->getValueType(0) == MVT::i64) {
+ uint64_t Imm = N->getZExtValue();
+ return isPowerOf2_64(Imm) && (Imm & 0xffffffff00000000) == Imm;
+ }
+ else
+ return false;
+}]>;
+
+def assertzext_lt_i32 : PatFrag<(ops node:$src), (assertzext node:$src), [{
+ return cast<VTSDNode>(N->getOperand(1))->getVT().bitsLT(MVT::i32);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+let usesCustomInserter = 1 in {
+ def ATOMIC_LOAD_ADD_I64 : Atomic2Ops<atomic_load_add_64, GPR64>;
+ def ATOMIC_LOAD_SUB_I64 : Atomic2Ops<atomic_load_sub_64, GPR64>;
+ def ATOMIC_LOAD_AND_I64 : Atomic2Ops<atomic_load_and_64, GPR64>;
+ def ATOMIC_LOAD_OR_I64 : Atomic2Ops<atomic_load_or_64, GPR64>;
+ def ATOMIC_LOAD_XOR_I64 : Atomic2Ops<atomic_load_xor_64, GPR64>;
+ def ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_64, GPR64>;
+ def ATOMIC_SWAP_I64 : Atomic2Ops<atomic_swap_64, GPR64>;
+ def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
+}
+
+/// Pseudo instructions for loading and storing accumulator registers.
+let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
+ def LOAD_ACC128 : Load<"", ACC128>;
+ def STORE_ACC128 : Store<"", ACC128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+let DecoderNamespace = "Mips64" in {
+/// Arithmetic Instructions (ALU Immediate)
+def DADDi : ArithLogicI<"daddi", simm16_64, GPR64Opnd, II_DADDI>,
+ ADDI_FM<0x18>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DADDiu : StdMMR6Rel, ArithLogicI<"daddiu", simm16_64, GPR64Opnd,
+ II_DADDIU, immSExt16, add>,
+ ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3;
+}
+
+let isCodeGenOnly = 1 in {
+def SLTi64 : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>,
+ SLTI_FM<0xa>;
+def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, GPR64Opnd>,
+ SLTI_FM<0xb>;
+def ANDi64 : ArithLogicI<"andi", uimm16_64, GPR64Opnd, II_AND, immZExt16, and>,
+ ADDI_FM<0xc>;
+def ORi64 : ArithLogicI<"ori", uimm16_64, GPR64Opnd, II_OR, immZExt16, or>,
+ ADDI_FM<0xd>;
+def XORi64 : ArithLogicI<"xori", uimm16_64, GPR64Opnd, II_XOR, immZExt16, xor>,
+ ADDI_FM<0xe>;
+def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM;
+}
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DADD : StdMMR6Rel, ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>,
+ ADD_FM<0, 0x2c>, ISA_MIPS3;
+ def DADDu : StdMMR6Rel, ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>,
+ ADD_FM<0, 0x2d>, ISA_MIPS3;
+ def DSUBu : StdMMR6Rel, ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>, ADD_FM<0, 0x2f>,
+ ISA_MIPS3;
+ def DSUB : StdMMR6Rel, ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB>, ADD_FM<0, 0x2e>,
+ ISA_MIPS3;
+}
+
+let isCodeGenOnly = 1 in {
+def SLT64 : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>;
+def SLTu64 : SetCC_R<"sltu", setult, GPR64Opnd>, ADD_FM<0, 0x2b>;
+def AND64 : ArithLogicR<"and", GPR64Opnd, 1, II_AND, and>, ADD_FM<0, 0x24>;
+def OR64 : ArithLogicR<"or", GPR64Opnd, 1, II_OR, or>, ADD_FM<0, 0x25>;
+def XOR64 : ArithLogicR<"xor", GPR64Opnd, 1, II_XOR, xor>, ADD_FM<0, 0x26>;
+def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
+}
+
+/// Shift Instructions
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DSLL : StdMMR6Rel, shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL,
+ shl, immZExt6>,
+ SRA_FM<0x38, 0>, ISA_MIPS3;
+ def DSRL : StdMMR6Rel, shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL,
+ srl, immZExt6>,
+ SRA_FM<0x3a, 0>, ISA_MIPS3;
+ def DSRA : StdMMR6Rel, shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA,
+ sra, immZExt6>,
+ SRA_FM<0x3b, 0>, ISA_MIPS3;
+ def DSLLV : StdMMR6Rel, shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, shl>,
+ SRLV_FM<0x14, 0>, ISA_MIPS3;
+ def DSRAV : StdMMR6Rel, shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, sra>,
+ SRLV_FM<0x17, 0>, ISA_MIPS3;
+ def DSRLV : StdMMR6Rel, shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, srl>,
+ SRLV_FM<0x16, 0>, ISA_MIPS3;
+ def DSLL32 : StdMMR6Rel, shift_rotate_imm<"dsll32", uimm5, GPR64Opnd,
+ II_DSLL32>,
+ SRA_FM<0x3c, 0>, ISA_MIPS3;
+ def DSRL32 : StdMMR6Rel, shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd,
+ II_DSRL32>,
+ SRA_FM<0x3e, 0>, ISA_MIPS3;
+ def DSRA32 : StdMMR6Rel, shift_rotate_imm<"dsra32", uimm5, GPR64Opnd,
+ II_DSRA32>,
+ SRA_FM<0x3f, 0>, ISA_MIPS3;
+
+// Rotate Instructions
+ def DROTR : StdMMR6Rel, shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR,
+ rotr, immZExt6>,
+ SRA_FM<0x3a, 1>, ISA_MIPS64R2;
+ def DROTRV : StdMMR6Rel, shift_rotate_reg<"drotrv", GPR64Opnd, II_DROTRV,
+ rotr>,
+ SRLV_FM<0x16, 1>, ISA_MIPS64R2;
+ def DROTR32 : StdMMR6Rel, shift_rotate_imm<"drotr32", uimm5, GPR64Opnd,
+ II_DROTR32>,
+ SRA_FM<0x3e, 1>, ISA_MIPS64R2;
+}
+
+/// Load and Store Instructions
+/// aligned
+let isCodeGenOnly = 1 in {
+def LB64 : Load<"lb", GPR64Opnd, sextloadi8, II_LB>, LW_FM<0x20>;
+def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, II_LBU>, LW_FM<0x24>;
+def LH64 : Load<"lh", GPR64Opnd, sextloadi16, II_LH>, LW_FM<0x21>;
+def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, II_LHU>, LW_FM<0x25>;
+def LW64 : Load<"lw", GPR64Opnd, sextloadi32, II_LW>, LW_FM<0x23>;
+def SB64 : Store<"sb", GPR64Opnd, truncstorei8, II_SB>, LW_FM<0x28>;
+def SH64 : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>;
+def SW64 : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LWu : StdMMR6Rel, MMRel, Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>,
+ LW_FM<0x27>, ISA_MIPS3;
+ def LD : StdMMR6Rel, LoadMemory<"ld", GPR64Opnd, mem_simm16, load, II_LD>,
+ LW_FM<0x37>, ISA_MIPS3;
+ def SD : StdMMR6Rel, StoreMemory<"sd", GPR64Opnd, mem_simm16, store, II_SD>,
+ LW_FM<0x3f>, ISA_MIPS3;
+}
+
+
+
+/// load/store left/right
+let isCodeGenOnly = 1 in {
+def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, II_LWL>, LW_FM<0x22>;
+def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, II_LWR>, LW_FM<0x26>;
+def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>;
+def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>;
+}
+
+def LDL : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def LDR : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, II_LDR>, LW_FM<0x1b>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def SDL : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, II_SDL>, LW_FM<0x2c>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>,
+ ISA_MIPS3_NOT_32R6_64R6;
+
+/// Load-linked, Store-conditional
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LLD : StdMMR6Rel, LLBase<"lld", GPR64Opnd, mem_simm16>, LW_FM<0x34>,
+ ISA_MIPS3_NOT_32R6_64R6;
+}
+def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NotInMicroMips],
+ DecoderNamespace = "Mips32_64_PTR64" in {
+def LL64 : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, PTR_64,
+ ISA_MIPS2_NOT_32R6_64R6;
+def SC64 : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_64,
+ ISA_MIPS2_NOT_32R6_64R6;
+def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>, PTR_64;
+}
+
+def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
+
+/// Jump and Branch Instructions
+let isCodeGenOnly = 1 in {
+ def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
+ def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
+ def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
+ def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
+ def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
+ def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
+ def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
+}
+
+def TAILCALLREG64 : TailCallReg<GPR64Opnd>;
+
+def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>;
+def PseudoIndirectBranch64 : PseudoIndirectBranchBase<GPR64Opnd>;
+
+/// Multiply and Divide Instructions.
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DMULT : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1c>, ISA_MIPS3_NOT_32R6_64R6;
+ def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1d>, ISA_MIPS3_NOT_32R6_64R6;
+}
+def PseudoDMULT : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
+ II_DMULT>, ISA_MIPS3_NOT_32R6_64R6;
+def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
+ II_DMULTU>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1e>, ISA_MIPS3_NOT_32R6_64R6;
+ def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1f>, ISA_MIPS3_NOT_32R6_64R6;
+}
+def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
+ II_DDIV, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6;
+def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
+ II_DDIVU, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in {
+def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>, ISA_MIPS3_NOT_32R6_64R6;
+
+/// Sign Ext In Register Instructions.
+def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>,
+ ISA_MIPS32R2;
+def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>,
+ ISA_MIPS32R2;
+}
+
+/// Count Leading
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DCLZ : StdMMR6Rel, CountLeading0<"dclz", GPR64Opnd, II_DCLZ>,
+ CLO_FM<0x24>, ISA_MIPS64_NOT_64R6;
+ def DCLO : StdMMR6Rel, CountLeading1<"dclo", GPR64Opnd, II_DCLO>,
+ CLO_FM<0x25>, ISA_MIPS64_NOT_64R6;
+
+/// Double Word Swap Bytes/HalfWords
+ def DSBH : SubwordSwap<"dsbh", GPR64Opnd, II_DSBH>, SEB_FM<2, 0x24>,
+ ISA_MIPS64R2;
+ def DSHD : SubwordSwap<"dshd", GPR64Opnd, II_DSHD>, SEB_FM<5, 0x24>,
+ ISA_MIPS64R2;
+}
+
+def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
+
+let isCodeGenOnly = 1 in
+def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ // The 'pos + size' constraints are enforced by the code that lowers into
+ // MipsISD::Ext.
+ def DEXT : ExtBase<"dext", GPR64Opnd, uimm5_report_uimm6, uimm5_plus1,
+ immZExt5, immZExt5Plus1, MipsExt>, EXT_FM<3>,
+ ISA_MIPS64R2;
+ def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5, uimm5_plus33, immZExt5,
+ immZExt5Plus33, MipsExt>, EXT_FM<1>, ISA_MIPS64R2;
+ def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm5_plus32, uimm5_plus1,
+ immZExt5Plus32, immZExt5Plus1, MipsExt>, EXT_FM<2>,
+ ISA_MIPS64R2;
+ def DINS : InsBase<"dins", GPR64Opnd, uimm6, uimm5_inssize_plus1, MipsIns>,
+ EXT_FM<7>, ISA_MIPS64R2;
+ def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32, uimm5_inssize_plus1>,
+ EXT_FM<6>, ISA_MIPS64R2;
+ def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5, uimm5_inssize_plus1>,
+ EXT_FM<5>, ISA_MIPS64R2;
+}
+
+let isCodeGenOnly = 1, rs = 0, shamt = 0 in { // fixed-shift forms, no patterns
+ def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
+ "dsll\t$rd, $rt, 32", [], II_DSLL>; // i32->i64 zext pairs this with DSRL 32
+ def SLL64_32 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR32:$rt),
+ "sll\t$rd, $rt, 0", [], II_SLL>; // selected for (i64 (sext GPR32))
+ def SLL64_64 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR64:$rt),
+ "sll\t$rd, $rt, 0", [], II_SLL>; // selected for (sext_inreg GPR64, i32)
+}
+
+// We need the following pseudo instruction to avoid offset calculation for
+// long branches. See the comment in file MipsLongBranch.cpp for detailed
+// explanation.
+
+// Expands to: daddiu $dst, $src, %PART($tgt - $baltgt)
+// where %PART may be %hi or %lo, depending on the relocation kind
+// that $tgt is annotated with.
+def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst),
+ (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+
+// Cavium Octeon cnMIPS instructions
+let DecoderNamespace = "CnMips",
+ // FIXME: The lack of HasStdEnc is probably a bug
+ EncodingPredicates = []<Predicate> in {
+
+class Count1s<string opstr, RegisterOperand RO>: // $rd = ctpop($rs)
+ InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+ [(set RO:$rd, (ctpop RO:$rs))], II_POP, FrmR, opstr> {
+ let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+class ExtsCins<string opstr, InstrItinClass itin,
+ SDPatternOperator Op = null_frag>: // base for exts/exts32/cins/cins32
+ InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
+ !strconcat(opstr, " $rt, $rs, $pos, $lenm1"), // NOTE(review): siblings use a tab
+ [(set GPR64Opnd:$rt, (Op GPR64Opnd:$rs, imm:$pos, imm:$lenm1))],
+ itin, FrmR, opstr> {
+ let TwoOperandAliasConstraint = "$rt = $rs";
+}
+
+class SetCC64_R<string opstr, PatFrag cond_op> : // $rd = zext(cond_op($rs, $rt))
+ InstSE<(outs GPR64Opnd:$rd), (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ !strconcat(opstr, "\t$rd, $rs, $rt"),
+ [(set GPR64Opnd:$rd, (zext (cond_op GPR64Opnd:$rs,
+ GPR64Opnd:$rt)))],
+ II_SEQ_SNE, FrmR, opstr> {
+ let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+class SetCC64_I<string opstr, PatFrag cond_op>: // $rt = zext(cond_op($rs, $imm10))
+ InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, simm10_64:$imm10),
+ !strconcat(opstr, "\t$rt, $rs, $imm10"),
+ [(set GPR64Opnd:$rt, (zext (cond_op GPR64Opnd:$rs,
+ immSExt10_64:$imm10)))], // immediate must match immSExt10_64
+ II_SEQI_SNEI, FrmI, opstr> {
+ let TwoOperandAliasConstraint = "$rt = $rs";
+}
+
+class CBranchBitNum<string opstr, DAGOperand opnd, PatFrag cond_op,
+ RegisterOperand RO, Operand ImmOp, bits<64> shift = 1> :
+ InstSE<(outs), (ins RO:$rs, ImmOp:$p, opnd:$offset), // tests $rs & (shift << $p)
+ !strconcat(opstr, "\t$rs, $p, $offset"),
+ [(brcond (i32 (cond_op (and RO:$rs, (shl shift, immZExt5_64:$p)), 0)),
+ bb:$offset)], II_BBIT, FrmI, opstr> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 1;
+ let Defs = [AT]; // NOTE(review): reason AT is marked as defined is not visible here
+}
+
+class MFC2OP<string asmstr, RegisterOperand RO, InstrItinClass itin> :
+ InstSE<(outs RO:$rt, uimm16:$imm16), (ins), // NOTE(review): imm16 is listed in outs
+ !strconcat(asmstr, "\t$rt, $imm16"), [], itin, FrmFR>; // also used for dmtc2
+
+// Unsigned Byte Add
+def BADDu : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>,
+ ADD_FM<0x1c, 0x28>, ASE_CNMIPS {
+ let Pattern = [(set GPR64Opnd:$rd,
+ (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))];
+}
+
+// Branch on Bit Clear /+32
+def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd,
+ uimm5_64_report_uimm6>, BBIT_FM<0x32>, ASE_CNMIPS;
+def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, uimm5_64,
+ 0x100000000>, BBIT_FM<0x36>, ASE_CNMIPS;
+
+// Branch on Bit Set /+32
+def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd,
+ uimm5_64_report_uimm6>, BBIT_FM<0x3a>, ASE_CNMIPS;
+def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, uimm5_64,
+ 0x100000000>, BBIT_FM<0x3e>, ASE_CNMIPS;
+
+// Multiply Doubleword to GPR
+def DMUL : ArithLogicR<"dmul", GPR64Opnd, 1, II_DMUL, mul>,
+ ADD_FM<0x1c, 0x03>, ASE_CNMIPS {
+ let Defs = [HI0, LO0, P0, P1, P2];
+}
+
+// Extract a signed bit field /+32
+def EXTS : ExtsCins<"exts", II_EXT>, EXTS_FM<0x3a>, ASE_CNMIPS;
+def EXTS32: ExtsCins<"exts32", II_EXT>, EXTS_FM<0x3b>, ASE_CNMIPS;
+
+// Clear and insert a bit field /+32
+def CINS : ExtsCins<"cins", II_INS>, EXTS_FM<0x32>, ASE_CNMIPS;
+def CINS32: ExtsCins<"cins32", II_INS>, EXTS_FM<0x33>, ASE_CNMIPS;
+
+// Move to multiplier/product register
+def MTM0 : MoveToLOHI<"mtm0", GPR64Opnd, [MPL0, P0, P1, P2]>, MTMR_FM<0x08>,
+ ASE_CNMIPS;
+def MTM1 : MoveToLOHI<"mtm1", GPR64Opnd, [MPL1, P0, P1, P2]>, MTMR_FM<0x0c>,
+ ASE_CNMIPS;
+def MTM2 : MoveToLOHI<"mtm2", GPR64Opnd, [MPL2, P0, P1, P2]>, MTMR_FM<0x0d>,
+ ASE_CNMIPS;
+def MTP0 : MoveToLOHI<"mtp0", GPR64Opnd, [P0]>, MTMR_FM<0x09>, ASE_CNMIPS;
+def MTP1 : MoveToLOHI<"mtp1", GPR64Opnd, [P1]>, MTMR_FM<0x0a>, ASE_CNMIPS;
+def MTP2 : MoveToLOHI<"mtp2", GPR64Opnd, [P2]>, MTMR_FM<0x0b>, ASE_CNMIPS;
+
+// Count Ones in a Word/Doubleword
+def POP : Count1s<"pop", GPR32Opnd>, POP_FM<0x2c>, ASE_CNMIPS;
+def DPOP : Count1s<"dpop", GPR64Opnd>, POP_FM<0x2d>, ASE_CNMIPS;
+
+// Set on equal/not equal
+def SEQ : SetCC64_R<"seq", seteq>, SEQ_FM<0x2a>, ASE_CNMIPS;
+def SEQi : SetCC64_I<"seqi", seteq>, SEQI_FM<0x2e>, ASE_CNMIPS;
+def SNE : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>, ASE_CNMIPS;
+def SNEi : SetCC64_I<"snei", setne>, SEQI_FM<0x2f>, ASE_CNMIPS;
+
+// 192-bit x 64-bit Unsigned Multiply and Add
+def V3MULU: ArithLogicR<"v3mulu", GPR64Opnd, 0, II_DMUL>, ADD_FM<0x1c, 0x11>,
+ ASE_CNMIPS {
+ let Defs = [P0, P1, P2];
+}
+
+// 64-bit Unsigned Multiply and Add Move
+def VMM0 : ArithLogicR<"vmm0", GPR64Opnd, 0, II_DMUL>, ADD_FM<0x1c, 0x10>,
+ ASE_CNMIPS {
+ let Defs = [MPL0, P0, P1, P2];
+}
+
+// 64-bit Unsigned Multiply and Add
+def VMULU : ArithLogicR<"vmulu", GPR64Opnd, 0, II_DMUL>, ADD_FM<0x1c, 0x0f>,
+ ASE_CNMIPS {
+ let Defs = [MPL1, MPL2, P0, P1, P2];
+}
+
+// Move between CPU and coprocessor registers
+def DMFC2_OCTEON : MFC2OP<"dmfc2", GPR64Opnd, II_DMFC2>, MFC2OP_FM<0x12, 1>,
+ ASE_CNMIPS;
+def DMTC2_OCTEON : MFC2OP<"dmtc2", GPR64Opnd, II_DMTC2>, MFC2OP_FM<0x12, 5>,
+ ASE_CNMIPS;
+}
+
+}
+
+/// Move between CPU and coprocessor registers
+let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd, II_DMFC0>, MFC3OP_FM<0x10, 1>,
+ ISA_MIPS3;
+def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd, II_DMTC0>, MFC3OP_FM<0x10, 5>,
+ ISA_MIPS3;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd, II_DMFC2>, MFC3OP_FM<0x12, 1>,
+ ISA_MIPS3;
+def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>, MFC3OP_FM<0x12, 5>,
+ ISA_MIPS3;
+}
+
+//===----------------------------------------------------------------------===//
+// Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Materialize i64 constants.
+defm : MaterializeImms<i64, ZERO_64, DADDiu, LUi64, ORi64>;
+
+def : MipsPat<(i64 immZExt32Low16Zero:$imm),
+ (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16)>;
+
+def : MipsPat<(i64 immZExt32:$imm),
+ (ORi64 (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16),
+ (LO16 imm:$imm))>;
+
+// extended loads
+def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>;
+def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>;
+def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
+def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
+
+// hi/lo relocs
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
+ def : MipsPat<(MipsLo tblockaddress:$in),
+ (DADDiu ZERO_64, tblockaddress:$in)>;
+ def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
+ def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
+ def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+ (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsLo texternalsym:$in), (DADDiu ZERO_64, texternalsym:$in)>;
+
+ def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
+ (DADDiu GPR64:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
+ (DADDiu GPR64:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
+ (DADDiu GPR64:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
+ (DADDiu GPR64:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
+ (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+ def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
+ def : WrapperPat<tconstpool, DADDiu, GPR64>;
+ def : WrapperPat<texternalsym, DADDiu, GPR64>;
+ def : WrapperPat<tblockaddress, DADDiu, GPR64>;
+ def : WrapperPat<tjumptable, DADDiu, GPR64>;
+ def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
+}
+
+defm : BrcondPats<GPR64, BEQ64, BEQ, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
+ ZERO_64>;
+def : MipsPat<(brcond (i32 (setlt i64:$lhs, 1)), bb:$dst),
+ (BLEZ64 i64:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i64:$lhs, -1)), bb:$dst),
+ (BGEZ64 i64:$lhs, bb:$dst)>;
+
+// setcc patterns
+let AdditionalPredicates = [NotInMicroMips] in {
+ defm : SeteqPats<GPR64, SLTiu64, XOR64, SLTu64, ZERO_64>;
+ defm : SetlePats<GPR64, XORi, SLT64, SLTu64>;
+ defm : SetgtPats<GPR64, SLT64, SLTu64>;
+ defm : SetgePats<GPR64, XORi, SLT64, SLTu64>;
+ defm : SetgeImmPats<GPR64, XORi, SLTi64, SLTiu64>;
+}
+// truncate
+def : MipsPat<(trunc (assertsext GPR64:$src)),
+ (EXTRACT_SUBREG GPR64:$src, sub_32)>;
+// The forward compatibility strategy employed by MIPS requires us to treat
+// values as being sign extended to an infinite number of bits. This allows
+// existing software to run without modification on any future MIPS
+// implementation (e.g. 128-bit, or 1024-bit). Being compatible with this
+// strategy requires that truncation acts as a sign-extension for values being
+// fed into instructions operating on 32-bit values. Such instructions have
+// undefined results if this is not true.
+// For our case, this means that we can't issue an extract_subreg for nodes
+// such as (trunc:i32 (assertzext:i64 X, i32)), because the sign-bit of the
+// lower subreg would not be replicated into the upper half.
+def : MipsPat<(trunc (assertzext_lt_i32 GPR64:$src)),
+ (EXTRACT_SUBREG GPR64:$src, sub_32)>;
+def : MipsPat<(i32 (trunc GPR64:$src)),
+ (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
+
+// variable shift instructions patterns
+def : MipsPat<(shl GPR64:$rt, (i32 (trunc GPR64:$rs))),
+ (DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+def : MipsPat<(srl GPR64:$rt, (i32 (trunc GPR64:$rs))),
+ (DSRLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+def : MipsPat<(sra GPR64:$rt, (i32 (trunc GPR64:$rs))),
+ (DSRAV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
+ (DROTRV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+}
+
+// 32-to-64-bit extension
+def : MipsPat<(i64 (anyext GPR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
+def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+
+// Sign extend in register
+def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
+ (SLL64_64 GPR64:$src)>;
+
+// bswap pattern
+def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
+
+// Carry pattern
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs),
+ (DSUBu GPR64:$lhs, GPR64:$rhs)>;
+ def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
+ (DADDu GPR64:$lhs, GPR64:$rhs)>, ASE_NOT_DSP;
+ def : MipsPat<(addc GPR64:$lhs, immSExt16:$imm),
+ (DADDiu GPR64:$lhs, imm:$imm)>, ASE_NOT_DSP;
+}
+
+// Octeon bbit0/bbit1 patterns
+def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
+ (BBIT0 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
+ (BBIT032 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
+ (BBIT1 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
+ (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+
+// Atomic load patterns.
+def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>;
+def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>;
+def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>;
+def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>;
+
+// Atomic store patterns.
+def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>;
+
+//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"move $dst, $src",
+ (OR64 GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>,
+ GPR_64;
+ def : MipsInstAlias<"move $dst, $src",
+ (DADDu GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>,
+ GPR_64;
+ def : MipsInstAlias<"dadd $rs, $rt, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"dadd $rs, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"daddu $rs, $rt, $imm",
+ (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+ 0>, ISA_MIPS3;
+ def : MipsInstAlias<"daddu $rs, $imm",
+ (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+ 0>, ISA_MIPS3;
+}
+def : MipsInstAlias<"dsll $rd, $rt, $rs",
+ (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+ ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"dneg $rt, $rs",
+ (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+ ISA_MIPS3;
+ def : MipsInstAlias<"dneg $rt",
+ (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
+ ISA_MIPS3;
+ def : MipsInstAlias<"dnegu $rt, $rs",
+ (DSUBu GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+ ISA_MIPS3;
+ def : MipsInstAlias<"dnegu $rt",
+ (DSUBu GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
+ ISA_MIPS3;
+}
+def : MipsInstAlias<"dsubi $rs, $rt, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsubi $rs, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsub $rs, $rt, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsub $rs, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"dsubu $rt, $rs, $imm",
+ (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs,
+ InvertedImOperand64:$imm), 0>, ISA_MIPS3;
+ def : MipsInstAlias<"dsubu $rs, $imm",
+ (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs,
+ InvertedImOperand64:$imm), 0>, ISA_MIPS3;
+}
+def : MipsInstAlias<"dsra $rd, $rt, $rs",
+ (DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+ ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"dsrl $rd, $rt, $rs",
+ (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+ ISA_MIPS3;
+
+// Two operand (implicit 0 selector) versions:
+ def : MipsInstAlias<"dmtc0 $rt, $rd",
+ (DMTC0 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
+ def : MipsInstAlias<"dmfc0 $rt, $rd",
+ (DMFC0 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>;
+}
+def : MipsInstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, COP2Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"dmtc2 $rt, $rd", (DMTC2 COP2Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
+
+def : MipsInstAlias<"synciobdma", (SYNC 0x2), 0>, ASE_MIPS64_CNMIPS;
+def : MipsInstAlias<"syncs", (SYNC 0x6), 0>, ASE_MIPS64_CNMIPS;
+def : MipsInstAlias<"syncw", (SYNC 0x4), 0>, ASE_MIPS64_CNMIPS;
+def : MipsInstAlias<"syncws", (SYNC 0x5), 0>, ASE_MIPS64_CNMIPS;
+
+// cnMIPS Aliases.
+
+// bbit* with $p 32-63 is converted to bbit*32 with $p 0-31
+def : MipsInstAlias<"bbit0 $rs, $p, $offset",
+ (BBIT032 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p,
+ brtarget:$offset), 0>,
+ ASE_CNMIPS;
+def : MipsInstAlias<"bbit1 $rs, $p, $offset",
+ (BBIT132 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p,
+ brtarget:$offset), 0>,
+ ASE_CNMIPS;
+
+// exts with $pos 32-63 is converted to exts32 with $pos 0-31
+def : MipsInstAlias<"exts $rt, $rs, $pos, $lenm1",
+ (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
+ uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+ ASE_CNMIPS;
+def : MipsInstAlias<"exts $rt, $pos, $lenm1",
+ (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
+ uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+ ASE_CNMIPS;
+
+// cins with $pos 32-63 is converted to cins32 with $pos 0-31
+def : MipsInstAlias<"cins $rt, $rs, $pos, $lenm1",
+ (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
+ uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+ ASE_CNMIPS;
+def : MipsInstAlias<"cins $rt, $pos, $lenm1",
+ (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
+ uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+ ASE_CNMIPS;
+
+//===----------------------------------------------------------------------===//
+// Assembler Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+class LoadImmediate64<string instr_asm, Operand Od, RegisterOperand RO> :
+ MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64), // asm pseudo: $rt, $imm64
+ !strconcat(instr_asm, "\t$rt, $imm64")> ;
+def LoadImm64 : LoadImmediate64<"dli", imm64, GPR64Opnd>;
+
+def LoadAddrReg64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins mem:$addr),
+ "dla\t$rt, $addr">;
+def LoadAddrImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins imm64:$imm64),
+ "dla\t$rt, $imm64">;
diff --git a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td
new file mode 100644
index 000000000000..dabf4e0a52e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -0,0 +1,279 @@
+//=- Mips64r6InstrInfo.td - Mips64r6 Instruction Information -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// Notes about removals/changes from MIPS32r6:
+// Reencoded: dclo, dclz
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class DALIGN_ENC : SPECIAL3_DALIGN_FM<OPCODE6_DALIGN>;
+class DAUI_ENC : DAUI_FM;
+class DAHI_ENC : REGIMM_FM<OPCODE5_DAHI>;
+class DATI_ENC : REGIMM_FM<OPCODE5_DATI>;
+class DBITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_DBITSWAP>;
+class DCLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLO>;
+class DCLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLZ>;
+class DDIV_ENC : SPECIAL_3R_FM<0b00010, 0b011110>;
+class DDIVU_ENC : SPECIAL_3R_FM<0b00010, 0b011111>;
+class DLSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_DLSA>;
+class DMOD_ENC : SPECIAL_3R_FM<0b00011, 0b011110>;
+class DMODU_ENC : SPECIAL_3R_FM<0b00011, 0b011111>;
+class DMUH_ENC : SPECIAL_3R_FM<0b00011, 0b011100>;
+class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b011101>;
+class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011100>;
+class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b011101>;
+class LDPC_ENC : PCREL18_FM<OPCODE3_LDPC>;
+class LLD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LLD>;
+class SCD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SCD>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class AHI_ATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, InstrItinClass itin> {
+ dag OutOperandList = (outs GPROpnd:$rs); // shared by DAHI_DESC and DATI_DESC
+ dag InOperandList = (ins GPROpnd:$rt, uimm16_altrelaxed:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm");
+ string Constraints = "$rs = $rt"; // output is tied to the register input
+ InstrItinClass Itinerary = itin;
+}
+
+class DALIGN_DESC : ALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3, II_DALIGN>;
+class DAHI_DESC : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd, II_DAHI>;
+class DATI_DESC : AHI_ATI_DESC_BASE<"dati", GPR64Opnd, II_DATI>;
+class DAUI_DESC : AUI_DESC_BASE<"daui", GPR64Opnd, II_DAUI>;
+class DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd, II_DBITSWAP>;
+class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd, II_DCLO>;
+class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd, II_DCLZ>;
+class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, II_DDIV, sdiv>;
+class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, II_DDIVU, udiv>;
+class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2_plus1, II_DLSA>;
+class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, II_DMOD, srem>;
+class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, II_DMODU, urem>;
+class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, II_DMUH, mulhs>;
+class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU, mulhu>;
+class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>;
+class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMUL>;
+class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3, II_LDPC>;
+class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simm16, II_LLD>;
+class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd, II_SCD>;
+class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>;
+class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>;
+
+class BGEC64_DESC : CMP_BC_DESC_BASE<"bgec", brtarget, GPR64Opnd>;
+class BGEUC64_DESC : CMP_BC_DESC_BASE<"bgeuc", brtarget, GPR64Opnd>;
+class BEQC64_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR64Opnd>;
+class BNEC64_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR64Opnd>;
+class BLTC64_DESC : CMP_BC_DESC_BASE<"bltc", brtarget, GPR64Opnd>;
+class BLTUC64_DESC : CMP_BC_DESC_BASE<"bltuc", brtarget, GPR64Opnd>;
+class BLTZC64_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR64Opnd>;
+class BGEZC64_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR64Opnd>;
+class BLEZC64_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezc", brtarget, GPR64Opnd>;
+class BGTZC64_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzc", brtarget, GPR64Opnd>;
+class BEQZC64_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21, GPR64Opnd>;
+class BNEZC64_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21, GPR64Opnd>;
+
+class JIALC64_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
+ GPR64Opnd, II_JIALC> { // jialc description using GPR64Opnd
+ bit isCall = 1;
+ list<Register> Defs = [RA]; // RA is implicitly defined by the call
+}
+
+class JIC64_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR64Opnd,
+ II_JIC> { // jic description using GPR64Opnd
+ bit isBarrier = 1;
+ bit isTerminator = 1;
+ list<Register> Defs = [AT]; // NOTE(review): reason AT is marked as defined is not visible here
+}
+
+class LL64_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9, II_LL>;
+class SC64_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd, II_SC>;
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ let DecoderMethod = "DecodeDAHIDATI" in {
+ def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6;
+ def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6;
+ }
+ def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6;
+ def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6;
+ def DBITSWAP : R6MMR6Rel, DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6;
+ def DCLO_R6 : R6MMR6Rel, DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6;
+ def DCLZ_R6 : R6MMR6Rel, DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6;
+ def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6;
+ def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6;
+ def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6;
+ def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6;
+ def DLSA_R6 : R6MMR6Rel, DLSA_R6_ENC, DLSA_R6_DESC, ISA_MIPS64R6;
+ def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6;
+ def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6;
+ def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6;
+ def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6;
+ def LLD_R6 : R6MMR6Rel, LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS64R6;
+}
+def LDPC: R6MMR6Rel, LDPC_ENC, LDPC_DESC, ISA_MIPS64R6;
+def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6; // NOTE(review): LLD_R6 uses ISA_MIPS64R6 -- confirm
+let DecoderNamespace = "Mips32r6_64r6_GP64" in {
+ def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64;
+ def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64;
+}
+let AdditionalPredicates = [NotInMicroMips],
+ DecoderNamespace = "Mips32r6_64r6_PTR64" in {
+ def LL64_R6 : LL_R6_ENC, LL64_R6_DESC, PTR_64, ISA_MIPS64R6;
+ def SC64_R6 : SC_R6_ENC, SC64_R6_DESC, PTR_64, ISA_MIPS64R6;
+}
+
+let DecoderNamespace = "Mips32r6_64r6_GP64" in {
+// Jump and Branch Instructions
+def JIALC64 : JIALC_ENC, JIALC64_DESC, ISA_MIPS64R6, GPR_64;
+def JIC64 : JIC_ENC, JIC64_DESC, ISA_MIPS64R6, GPR_64;
+
+def BEQC64 : BEQC_ENC, BEQC64_DESC, ISA_MIPS64R6, GPR_64;
+def BEQZC64 : BEQZC_ENC, BEQZC64_DESC, ISA_MIPS64R6, GPR_64;
+def BGEC64 : BGEC_ENC, BGEC64_DESC, ISA_MIPS64R6, GPR_64;
+def BGEUC64 : BGEUC_ENC, BGEUC64_DESC, ISA_MIPS64R6, GPR_64;
+def BGTZC64 : BGTZC_ENC, BGTZC64_DESC, ISA_MIPS64R6, GPR_64;
+def BLEZC64 : BLEZC_ENC, BLEZC64_DESC, ISA_MIPS64R6, GPR_64;
+def BLTC64 : BLTC_ENC, BLTC64_DESC, ISA_MIPS64R6, GPR_64;
+def BLTUC64 : BLTUC_ENC, BLTUC64_DESC, ISA_MIPS64R6, GPR_64;
+def BNEC64 : BNEC_ENC, BNEC64_DESC, ISA_MIPS64R6, GPR_64;
+def BNEZC64 : BNEZC_ENC, BNEZC64_DESC, ISA_MIPS64R6, GPR_64;
+}
+let DecoderNamespace = "Mips32r6_64r6_BranchZero" in {
+def BLTZC64 : BLTZC_ENC, BLTZC64_DESC, ISA_MIPS64R6, GPR_64;
+def BGEZC64 : BGEZC_ENC, BGEZC64_DESC, ISA_MIPS64R6, GPR_64;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"jr $rs", (JALR64 ZERO_64, GPR64Opnd:$rs), 1>, ISA_MIPS64R6;
+
+def : MipsInstAlias<"jrc $rs", (JIC64 GPR64Opnd:$rs, 0), 1>, ISA_MIPS64R6;
+
+def : MipsInstAlias<"jalrc $rs", (JIALC64 GPR64Opnd:$rs, 0), 1>, ISA_MIPS64R6;
+//===----------------------------------------------------------------------===//
+//
+// Patterns and Pseudo Instructions
+//
+//===----------------------------------------------------------------------===//
+
+// i64 selects
+def : MipsPat<(select i64:$cond, i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, i64:$cond),
+ (SELEQZ64 i64:$f, i64:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t, i64:$cond),
+ (SELNEZ64 i64:$f, i64:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, i64:$cond),
+ (SELEQZ64 i64:$f, i64:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)),
+ (SELNEZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)),
+ (SELEQZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>,
+ ISA_MIPS64R6;
+def : MipsPat<
+ (select (i32 (setgt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t,
+ (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)),
+ sub_32)),
+ (SELNEZ64 i64:$f,
+ (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)),
+ sub_32)))>,
+ ISA_MIPS64R6;
+def : MipsPat<
+ (select (i32 (setugt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t,
+ (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)),
+ sub_32)),
+ (SELNEZ64 i64:$f,
+ (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)),
+ sub_32)))>,
+ ISA_MIPS64R6;
+
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, immz),
+ (SELNEZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, immz),
+ (SELEQZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), immz, i64:$f),
+ (SELEQZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), immz, i64:$f),
+ (SELNEZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6;
+
+// i64 selects from an i32 comparison
+// One complicating factor here is that bits 32-63 of an i32 are undefined.
+// FIXME: Ideally, setcc would always produce an i64 on MIPS64 targets.
+// This would allow us to remove the sign-extensions here.
+def : MipsPat<(select i32:$cond, i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+ (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t, (SLL64_32 i32:$cond)),
+ (SELNEZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+ (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+ immZExt16:$imm))),
+ (SELNEZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+ immZExt16:$imm))))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+ immZExt16:$imm))),
+ (SELEQZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+ immZExt16:$imm))))>,
+ ISA_MIPS64R6;
+
+def : MipsPat<(select i32:$cond, i64:$t, immz),
+ (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, immz),
+ (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, immz),
+ (SELEQZ64 i64:$t, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select i32:$cond, immz, i64:$f),
+ (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i64:$f),
+ (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f),
+ (SELNEZ64 i64:$f, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
diff --git a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
new file mode 100644
index 000000000000..161345d2a845
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -0,0 +1,154 @@
+//===-- MipsAnalyzeImmediate.cpp - Analyze Immediates ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsAnalyzeImmediate.h"
+#include "Mips.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+// Build an instruction descriptor from opcode O and immediate operand I.
+MipsAnalyzeImmediate::Inst::Inst(unsigned O, unsigned I)
+    : Opc(O), ImmOpnd(I) {}
+
+// Append I to every instruction sequence in SeqLs. When SeqLs holds no
+// sequence yet, it is seeded with a sequence consisting of just I.
+void MipsAnalyzeImmediate::AddInstr(InstSeqLs &SeqLs, const Inst &I) {
+  if (!SeqLs.empty()) {
+    for (InstSeq &Seq : SeqLs)
+      Seq.push_back(I);
+    return;
+  }
+
+  // No sequence exists yet: start one that contains only I.
+  SeqLs.push_back(InstSeq(1, I));
+}
+
+// Generate the sequences for Imm with its low 16 bits cleared after adding
+// 0x8000, then terminate every sequence with an ADDiu of the low 16 bits.
+// (The +0x8000 presumably compensates for ADDiu sign-extending its 16-bit
+// immediate -- confirm against the ISA.)
+void MipsAnalyzeImmediate::GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize,
+                                             InstSeqLs &SeqLs) {
+  const uint64_t UpperPart = (Imm + 0x8000ULL) & 0xffffffffffff0000ULL;
+  const uint64_t Low16 = Imm & 0xffffULL;
+
+  GetInstSeqLs(UpperPart, RemSize, SeqLs);
+  AddInstr(SeqLs, Inst(ADDiu, Low16));
+}
+
+// Generate the sequences for Imm with its low 16 bits cleared, then terminate
+// every sequence with an ORi of the low 16 bits.
+void MipsAnalyzeImmediate::GetInstSeqLsORi(uint64_t Imm, unsigned RemSize,
+                                           InstSeqLs &SeqLs) {
+  const uint64_t UpperPart = Imm & 0xffffffffffff0000ULL;
+  const uint64_t Low16 = Imm & 0xffffULL;
+
+  GetInstSeqLs(UpperPart, RemSize, SeqLs);
+  AddInstr(SeqLs, Inst(ORi, Low16));
+}
+
+// Strip the trailing zero bits from Imm, generate the sequences for the
+// remaining value, and terminate every sequence with an SLL by the number of
+// stripped bits.
+void MipsAnalyzeImmediate::GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize,
+                                           InstSeqLs &SeqLs) {
+  const unsigned TrailingZeros = countTrailingZeros(Imm);
+  const uint64_t Shifted = Imm >> TrailingZeros;
+
+  GetInstSeqLs(Shifted, RemSize - TrailingZeros, SeqLs);
+  AddInstr(SeqLs, Inst(SLL, TrailingZeros));
+}
+
+// Recursively generate the candidate instruction sequences that load Imm.
+// RemSize is the number of bits that remain to be produced; the generated
+// sequences are accumulated in SeqLs.
+void MipsAnalyzeImmediate::GetInstSeqLs(uint64_t Imm, unsigned RemSize,
+                                        InstSeqLs &SeqLs) {
+  // Only the low Size bits of Imm are considered when testing for zero.
+  const uint64_t SizeMask = 0xffffffffffffffffULL >> (64 - Size);
+  const uint64_t MaskedImm = Imm & SizeMask;
+
+  // Nothing to generate for a zero immediate.
+  if (MaskedImm == 0)
+    return;
+
+  // With at most 16 bits remaining, a single ADDiu is enough.
+  if (RemSize <= 16) {
+    AddInstr(SeqLs, Inst(ADDiu, MaskedImm));
+    return;
+  }
+
+  // The lower 16 bits are clear: finish with a shift.
+  if ((Imm & 0xffff) == 0) {
+    GetInstSeqLsSLL(Imm, RemSize, SeqLs);
+    return;
+  }
+
+  GetInstSeqLsADDiu(Imm, RemSize, SeqLs);
+
+  // When bit 15 is clear, ending with ADDiu or ORi makes no difference, so
+  // the ORi alternatives are not generated.
+  if ((Imm & 0x8000) == 0)
+    return;
+
+  InstSeqLs ORiSeqs;
+  GetInstSeqLsORi(Imm, RemSize, ORiSeqs);
+  SeqLs.append(std::make_move_iterator(ORiSeqs.begin()),
+               std::make_move_iterator(ORiSeqs.end()));
+}
+
+// Fold a leading ADDiu/SLL pair into a single LUi when possible.
+// For example, the two instructions
+//   ADDiu 0x0111
+//   SLL 18
+// become
+//   LUi 0x444
+void MipsAnalyzeImmediate::ReplaceADDiuSLLWithLUi(InstSeq &Seq) {
+  // The rewrite needs at least two instructions to look at.
+  if (Seq.size() < 2)
+    return;
+
+  Inst &First = Seq[0];
+  const Inst &Second = Seq[1];
+
+  // Only an ADDiu followed by an SLL of at least 16 bits qualifies.
+  if (First.Opc != ADDiu || Second.Opc != SLL || Second.ImmOpnd < 16)
+    return;
+
+  // Sign-extend the ADDiu operand, apply the part of the shift beyond 16
+  // bits, and give up unless the result still fits in 16 bits.
+  const int64_t Imm = SignExtend64<16>(First.ImmOpnd);
+  const int64_t ShiftedImm = (uint64_t)Imm << (Second.ImmOpnd - 16);
+  if (!isInt<16>(ShiftedImm))
+    return;
+
+  // Turn the first instruction into the LUi and drop the shift.
+  First.Opc = LUi;
+  First.ImmOpnd = (unsigned)(ShiftedImm & 0xffff);
+  Seq.erase(Seq.begin() + 1);
+}
+
+// Apply the ADDiu+SLL -> LUi rewrite to every sequence in SeqLs, then copy
+// the shortest resulting sequence into Insts. On a tie the earliest sequence
+// in SeqLs wins. Insts is left empty when SeqLs contains no sequence.
+void MipsAnalyzeImmediate::GetShortestSeq(InstSeqLs &SeqLs, InstSeq &Insts) {
+  InstSeqLs::iterator ShortestSeq = SeqLs.end();
+  // The length of an instruction sequence is at most 7.
+  unsigned ShortestLength = 8;
+
+  for (InstSeqLs::iterator S = SeqLs.begin(); S != SeqLs.end(); ++S) {
+    ReplaceADDiuSLLWithLUi(*S);
+    assert(S->size() <= 7);
+
+    if (S->size() < ShortestLength) {
+      ShortestSeq = S;
+      ShortestLength = S->size();
+    }
+  }
+
+  Insts.clear();
+
+  // If no sequence was generated, ShortestSeq is still SeqLs.end() and must
+  // not be dereferenced; report an empty result instead.
+  if (ShortestSeq == SeqLs.end())
+    return;
+
+  Insts.append(ShortestSeq->begin(), ShortestSeq->end());
+}
+
+// Compute an instruction sequence that loads Imm. A Size of 32 selects the
+// 32-bit opcodes; any other Size selects the 64-bit ones. When
+// LastInstrIsADDiu is set (or Imm is zero) the sequences are generated so
+// that they end with an ADDiu. Returns a reference to the member Insts.
+const MipsAnalyzeImmediate::InstSeq &
+MipsAnalyzeImmediate::Analyze(uint64_t Imm, unsigned Size,
+                              bool LastInstrIsADDiu) {
+  this->Size = Size;
+
+  // Pick the opcode set matching the requested width.
+  const bool Is32Bit = (Size == 32);
+  ADDiu = Is32Bit ? Mips::ADDiu : Mips::DADDiu;
+  ORi = Is32Bit ? Mips::ORi : Mips::ORi64;
+  SLL = Is32Bit ? Mips::SLL : Mips::DSLL;
+  LUi = Is32Bit ? Mips::LUi : Mips::LUi64;
+
+  // Collect the candidate instruction sequences.
+  InstSeqLs Candidates;
+  const bool EndWithADDiu = LastInstrIsADDiu || Imm == 0;
+  if (EndWithADDiu)
+    GetInstSeqLsADDiu(Imm, Size, Candidates);
+  else
+    GetInstSeqLs(Imm, Size, Candidates);
+
+  // Keep the shortest candidate in Insts.
+  GetShortestSeq(Candidates, Insts);
+
+  return Insts;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h
new file mode 100644
index 000000000000..ae3c38ced80b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h
@@ -0,0 +1,63 @@
+//===-- MipsAnalyzeImmediate.h - Analyze Immediates ------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSANALYZEIMMEDIATE_H
+#define LLVM_LIB_TARGET_MIPS_MIPSANALYZEIMMEDIATE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+ /// MipsAnalyzeImmediate - Computes a short sequence of instructions
+ /// (opcode + immediate operand pairs) that loads a given immediate value.
+ class MipsAnalyzeImmediate {
+ public:
+ /// Inst - One instruction of a sequence: its opcode and immediate operand.
+ struct Inst {
+ unsigned Opc, ImmOpnd;
+ Inst(unsigned Opc, unsigned ImmOpnd);
+ };
+ /// InstSeq - A single instruction sequence.
+ typedef SmallVector<Inst, 7 > InstSeq;
+
+ /// Analyze - Get an instruction sequence to load immediate Imm. The last
+ /// instruction in the sequence must be an ADDiu if LastInstrIsADDiu is
+ /// true. The returned reference refers to the member Insts, so its
+ /// contents are replaced by the next call to Analyze.
+ const InstSeq &Analyze(uint64_t Imm, unsigned Size, bool LastInstrIsADDiu);
+ private:
+ /// InstSeqLs - A list of candidate instruction sequences.
+ typedef SmallVector<InstSeq, 5> InstSeqLs;
+
+ /// AddInstr - Add I to all instruction sequences in SeqLs.
+ void AddInstr(InstSeqLs &SeqLs, const Inst &I);
+
+ /// GetInstSeqLsADDiu - Get instruction sequences which end with an ADDiu to
+ /// load immediate Imm
+ void GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+ /// GetInstSeqLsORi - Get instruction sequences which end with an ORi to
+ /// load immediate Imm
+ void GetInstSeqLsORi(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+ /// GetInstSeqLsSLL - Get instruction sequences which end with a SLL to
+ /// load immediate Imm
+ void GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+ /// GetInstSeqLs - Get instruction sequences to load immediate Imm.
+ void GetInstSeqLs(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+ /// ReplaceADDiuSLLWithLUi - Replace an ADDiu & SLL pair with a LUi.
+ void ReplaceADDiuSLLWithLUi(InstSeq &Seq);
+
+ /// GetShortestSeq - Find the shortest instruction sequence in SeqLs and
+ /// return it in Insts.
+ void GetShortestSeq(InstSeqLs &SeqLs, InstSeq &Insts);
+
+ /// Size - The Size argument of the most recent Analyze call.
+ unsigned Size;
+ /// Opcodes used when building sequences; set by Analyze according to Size.
+ unsigned ADDiu, ORi, SLL, LUi;
+ /// Insts - The sequence selected by the most recent Analyze call.
+ InstSeq Insts;
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
new file mode 100644
index 000000000000..179695bc6988
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -0,0 +1,1073 @@
+//===-- MipsAsmPrinter.cpp - Mips LLVM Assembly Printer -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format MIPS assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsMCNaCl.h"
+#include "Mips.h"
+#include "MipsAsmPrinter.h"
+#include "MipsInstrInfo.h"
+#include "MipsMCInstLower.h"
+#include "MipsTargetMachine.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-asm-printer"
+
+// Return the output streamer's target streamer, viewed as a
+// MipsTargetStreamer.
+MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() const {
+  auto &Base = *OutStreamer->getTargetStreamer();
+  return static_cast<MipsTargetStreamer &>(Base);
+}
+
+// Cache the per-function state (subtarget, function info, constant pool),
+// record any Mips16 stubs this function needs, and then run the generic
+// AsmPrinter over MF. Always returns true.
+bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<MipsSubtarget>();
+  MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (Subtarget->inMips16Mode()) {
+    // Merge this function's stubs into the printer-wide table without
+    // overwriting entries that are already present.
+    for (const auto &Stub : MipsFI->StubsNeeded) {
+      if (StubsNeeded.find(Stub.first) == StubsNeeded.end())
+        StubsNeeded[Stub.first] = Stub.second;
+    }
+  }
+
+  MCP = MF.getConstantPool();
+
+  // In NaCl, all indirect jump targets must be aligned to bundle size.
+  if (Subtarget->isTargetNaCl())
+    NaClAlignIndirectJumpTargets(MF);
+
+  AsmPrinter::runOnMachineFunction(MF);
+  return true;
+}
+
+// Lower MO into MCOp through MCInstLowering. Returns whether the resulting
+// MCOperand is valid.
+bool MipsAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) {
+  MCOp = MCInstLowering.LowerOperand(MO);
+  const bool IsValid = MCOp.isValid();
+  return IsValid;
+}
+
+#include "MipsGenMCPseudoLowering.inc"
+
+// Lower an indirect-branch style pseudo (the caller passes PseudoReturn,
+// PseudoIndirectBranch, TAILCALLREG and their 64-bit variants) to JR, JR_MM,
+// JRC16_MMR6, JALR, or JALR64 as appropriate for the target, and emit it.
+void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer,
+                                              const MachineInstr *MI) {
+  const bool InMicroMipsMode = Subtarget->inMicroMipsMode();
+
+  // Choose the real opcode; the JALR forms take an extra link register.
+  unsigned Opcode;
+  bool HasLinkReg = false;
+  if (Subtarget->hasMips64r6()) {
+    // MIPS64r6 should use (JALR64 ZERO_64, $rs)
+    Opcode = Mips::JALR64;
+    HasLinkReg = true;
+  } else if (Subtarget->hasMips32r6()) {
+    // MIPS32r6 should use (JALR ZERO, $rs)
+    if (InMicroMipsMode) {
+      Opcode = Mips::JRC16_MMR6;
+    } else {
+      Opcode = Mips::JALR;
+      HasLinkReg = true;
+    }
+  } else if (InMicroMipsMode) {
+    // microMIPS should use (JR_MM $rs)
+    Opcode = Mips::JR_MM;
+  } else {
+    // Everything else should use (JR $rs)
+    Opcode = Mips::JR;
+  }
+
+  MCInst TmpInst0;
+  TmpInst0.setOpcode(Opcode);
+
+  // The link register, when present, is the zero register of the right width.
+  if (HasLinkReg) {
+    const unsigned ZeroReg =
+        Subtarget->isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
+    TmpInst0.addOperand(MCOperand::createReg(ZeroReg));
+  }
+
+  // The branch target is operand 0 of the pseudo.
+  MCOperand MCOp;
+  lowerOperand(MI->getOperand(0), MCOp);
+  TmpInst0.addOperand(MCOp);
+
+  EmitToStreamer(OutStreamer, TmpInst0);
+}
+
+// Emit the machine instruction MI. Debug values and CONSTPOOL_ENTRY pseudos
+// are handled specially; every other instruction is lowered to an MCInst and
+// streamed, together with the instructions that follow it inside the same
+// bundle.
+void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ MipsTargetStreamer &TS = getTargetStreamer();
+ TS.forbidModuleDirective();
+
+ if (MI->isDebugValue()) {
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+
+ // NOTE(review): the comment text is written into Str, but nothing in this
+ // block emits Str afterwards -- confirm that this is intentional.
+ PrintDebugValueComment(MI, OS);
+ return;
+ }
+
+ // If we just ended a constant pool, mark it as such.
+ if (InConstantPool && MI->getOpcode() != Mips::CONSTPOOL_ENTRY) {
+ OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ InConstantPool = false;
+ }
+ if (MI->getOpcode() == Mips::CONSTPOOL_ENTRY) {
+ // CONSTPOOL_ENTRY - This instruction represents a floating
+ // constant pool in the function. The first operand is the ID#
+ // for this instruction, the second is the index into the
+ // MachineConstantPool that this is, the third is the size in
+ // bytes of this constant pool entry.
+ // The required alignment is specified on the basic block holding this MI.
+ //
+ unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+ unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
+
+ // If this is the first entry of the pool, mark it.
+ if (!InConstantPool) {
+ OutStreamer->EmitDataRegion(MCDR_DataRegion);
+ InConstantPool = true;
+ }
+
+ OutStreamer->EmitLabel(GetCPISymbol(LabelId));
+
+ // Emit the entry's value, either as a machine constant pool value or as
+ // a plain global constant.
+ const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+ if (MCPE.isMachineConstantPoolEntry())
+ EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+ else
+ EmitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal);
+ return;
+ }
+
+
+ // Walk from MI over the instructions that follow it inside its bundle.
+ MachineBasicBlock::const_instr_iterator I = MI->getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+
+ // Each `continue` below jumps to the loop condition, so I is still advanced.
+ do {
+ // Do any auto-generated pseudo lowerings.
+ if (emitPseudoExpansionLowering(*OutStreamer, &*I))
+ continue;
+
+ // Returns, indirect branches and register tail calls are lowered by
+ // emitPseudoIndirectBranch.
+ if (I->getOpcode() == Mips::PseudoReturn ||
+ I->getOpcode() == Mips::PseudoReturn64 ||
+ I->getOpcode() == Mips::PseudoIndirectBranch ||
+ I->getOpcode() == Mips::PseudoIndirectBranch64 ||
+ I->getOpcode() == Mips::TAILCALLREG ||
+ I->getOpcode() == Mips::TAILCALLREG64) {
+ emitPseudoIndirectBranch(*OutStreamer, &*I);
+ continue;
+ }
+
+ // The inMips16Mode() test is not permanent.
+ // Some instructions are marked as pseudo right now which
+ // would make the test fail for the wrong reason but
+ // that will be fixed soon. We need this here because we are
+ // removing another test for this situation downstream in the
+ // callchain.
+ //
+ if (I->isPseudo() && !Subtarget->inMips16Mode()
+ && !isLongBranchPseudo(I->getOpcode()))
+ llvm_unreachable("Pseudo opcode found in EmitInstruction()");
+
+ // Ordinary instruction: lower it to an MCInst and stream it.
+ MCInst TmpInst0;
+ MCInstLowering.Lower(&*I, TmpInst0);
+ EmitToStreamer(*OutStreamer, TmpInst0);
+ } while ((++I != E) && I->isInsideBundle()); // Delay slot check
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Mips Asm Directives
+//
+// -- Frame directive "frame Stackpointer, Stacksize, RARegister"
+// Describe the stack frame.
+//
+// -- Mask directives "(f)mask bitmask, offset"
+// Tells the assembler which registers are saved and where.
+// bitmask - contains a little endian bitset indicating which registers are
+// saved on function prologue (e.g. with a 0x80000000 mask, the
+// assembler knows the register 31 (RA) is saved at prologue).
+// offset - the position before stack pointer subtraction indicating where
+// the first saved register on prologue is located (see the example below).
+//
+// Consider the following function prologue:
+//
+// .frame $fp,48,$ra
+// .mask 0xc0000000,-8
+// addiu $sp, $sp, -48
+// sw $ra, 40($sp)
+// sw $fp, 36($sp)
+//
+// With a 0xc0000000 mask, the assembler knows the register 31 (RA) and
+// 30 (FP) are saved at prologue. As the save order on prologue is from
+// left to right, RA is saved first. A -8 offset means that after the
+// stack pointer subtraction, the first register in the mask (RA) will be
+// saved at address 48-8=40.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mask directives
+//===----------------------------------------------------------------------===//
+
+// Create a bitmask with all callee saved registers for CPU or Floating Point
+// registers. For CPU registers consider RA, GP and FP for saving if necessary.
+// The two masks, together with the offset of the topmost saved register of
+// each kind, are passed to the target streamer via emitMask() and emitFMask().
+void MipsAsmPrinter::printSavedRegsBitmask() {
+  // CPU and FPU Saved Registers Bitmasks
+  unsigned CPUBitmask = 0, FPUBitmask = 0;
+
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  // The register sizes are kept as signed ints: they are only used to build
+  // the negative offsets below, and negating an unsigned value would rely on
+  // wrap-around followed by an out-of-range conversion to int.
+  const int CPURegSize = Mips::GPR32RegClass.getSize();
+  const int FGR32RegSize = Mips::FGR32RegClass.getSize();
+  const int AFGR64RegSize = Mips::AFGR64RegClass.getSize();
+  bool HasAFGR64Reg = false;
+  // Size of the stack area to which FP callee-saved regs are saved.
+  int CSFPRegsSize = 0;
+
+  // Set the CPU and FPU Bitmasks
+  for (const auto &I : CSI) {
+    unsigned Reg = I.getReg();
+    unsigned RegNum = TRI->getEncodingValue(Reg);
+
+    // If it's a floating point register, set the FPU Bitmask.
+    // If it's a general purpose register, set the CPU Bitmask.
+    // The shifted literals are unsigned so that setting bit 31 (e.g. for
+    // register 31) does not shift a signed int into its sign bit.
+    if (Mips::FGR32RegClass.contains(Reg)) {
+      FPUBitmask |= (1u << RegNum);
+      CSFPRegsSize += FGR32RegSize;
+    } else if (Mips::AFGR64RegClass.contains(Reg)) {
+      // An AFGR64 register sets two adjacent bits of the mask.
+      FPUBitmask |= (3u << RegNum);
+      CSFPRegsSize += AFGR64RegSize;
+      HasAFGR64Reg = true;
+    } else if (Mips::GPR32RegClass.contains(Reg))
+      CPUBitmask |= (1u << RegNum);
+  }
+
+  // FP Regs are saved right below where the virtual frame pointer points to.
+  int FPUTopSavedRegOff = 0;
+  if (FPUBitmask)
+    FPUTopSavedRegOff = HasAFGR64Reg ? -AFGR64RegSize : -FGR32RegSize;
+
+  // CPU Regs are saved below FP Regs.
+  const int CPUTopSavedRegOff = CPUBitmask ? -CSFPRegsSize - CPURegSize : 0;
+
+  MipsTargetStreamer &TS = getTargetStreamer();
+  // Print CPUBitmask
+  TS.emitMask(CPUBitmask, CPUTopSavedRegOff);
+
+  // Print FPUBitmask
+  TS.emitFMask(FPUBitmask, FPUTopSavedRegOff);
+}
+
+//===----------------------------------------------------------------------===//
+// Frame and Set directives
+//===----------------------------------------------------------------------===//
+
+/// Emit the frame directive for the current function: frame register,
+/// stack size and return-address register, via the target streamer.
+void MipsAsmPrinter::emitFrameDirective() {
+  const TargetRegisterInfo &RegInfo = *MF->getSubtarget().getRegisterInfo();
+
+  const unsigned FrameReg = RegInfo.getFrameRegister(*MF);
+  const unsigned RAReg = RegInfo.getRARegister();
+  const unsigned FrameSize = MF->getFrameInfo().getStackSize();
+
+  MipsTargetStreamer &TS = getTargetStreamer();
+  TS.emitFrame(FrameReg, FrameSize, RAReg);
+}
+
+/// Return the ABI name string for the target machine's ABI: "abi32" for O32,
+/// "abiN32" for N32 and "abi64" for N64.
+const char *MipsAsmPrinter::getCurrentABIString() const {
+  const MipsABIInfo &ABI = static_cast<MipsTargetMachine &>(TM).getABI();
+
+  switch (ABI.GetEnumValue()) {
+  case MipsABIInfo::ABI::O32:
+    return "abi32";
+  case MipsABIInfo::ABI::N32:
+    return "abiN32";
+  case MipsABIInfo::ABI::N64:
+    return "abi64";
+  default:
+    llvm_unreachable("Unknown Mips ABI");
+  }
+}
+
+// Emit the directives that precede a function body (alignment for NaCl,
+// [no]micromips, [no]mips16, .ent) followed by the function's label.
+void MipsAsmPrinter::EmitFunctionEntryLabel() {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // NaCl sandboxing requires that indirect call instructions are masked.
+  // This means that function entry points should be bundle-aligned.
+  if (Subtarget->isTargetNaCl())
+    EmitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN));
+
+  // Select the microMIPS state for this function.
+  if (!Subtarget->inMicroMipsMode()) {
+    TS.emitDirectiveSetNoMicroMips();
+  } else {
+    TS.emitDirectiveSetMicroMips();
+    TS.setUsesMicroMips();
+  }
+
+  // Select the MIPS16 state for this function.
+  if (!Subtarget->inMips16Mode())
+    TS.emitDirectiveSetNoMips16();
+  else
+    TS.emitDirectiveSetMips16();
+
+  TS.emitDirectiveEnt(*CurrentFnSym);
+  OutStreamer->EmitLabel(CurrentFnSym);
+}
+
+/// EmitFunctionBodyStart - Emit what must come before the first basic block
+/// of the function: the frame and mask directives (skipped for naked
+/// functions) and, outside MIPS16 mode, the noreorder/nomacro/noat
+/// directives.
+void MipsAsmPrinter::EmitFunctionBodyStart() {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  MCInstLowering.Initialize(&MF->getContext());
+
+  // Naked functions get neither a frame directive nor register masks.
+  const bool IsNakedFunction =
+      MF->getFunction()->hasFnAttribute(Attribute::Naked);
+  if (!IsNakedFunction) {
+    emitFrameDirective();
+    printSavedRegsBitmask();
+  }
+
+  if (Subtarget->inMips16Mode())
+    return;
+
+  TS.emitDirectiveSetNoReorder();
+  TS.emitDirectiveSetNoMacro();
+  TS.emitDirectiveSetNoAt();
+}
+
+/// EmitFunctionBodyEnd - Emit what must come after the last basic block of
+/// the function: the at/macro/reorder directives (outside MIPS16 mode), the
+/// end directive, and the end of a still-open constant pool data region.
+void MipsAsmPrinter::EmitFunctionBodyEnd() {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // There are instruction for this macros, but they must
+  // always be at the function end, and we can't emit and
+  // break with BB logic.
+  const bool InMips16 = Subtarget->inMips16Mode();
+  if (!InMips16) {
+    TS.emitDirectiveSetAt();
+    TS.emitDirectiveSetMacro();
+    TS.emitDirectiveSetReorder();
+  }
+  TS.emitDirectiveEnd(CurrentFnSym->getName());
+
+  // Make sure to terminate any constant pools that were at the end
+  // of the function.
+  if (InConstantPool) {
+    InConstantPool = false;
+    OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+  }
+}
+
+// At the end of a basic block that contains no instructions, emit the insn
+// directive through the target streamer.
+void MipsAsmPrinter::EmitBasicBlockEnd(const MachineBasicBlock &MBB) {
+  MipsTargetStreamer &TS = getTargetStreamer();
+  if (MBB.empty())
+    TS.emitDirectiveInsn();
+}
+
+/// isBlockOnlyReachableByFallthrough - Return true if the basic block has
+/// exactly one predecessor and the control transfer mechanism between
+/// the predecessor and this block is a fall-through.
+bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
+                                                       MBB) const {
+  // If this is a landing pad, it isn't a fall through.  If it has no preds,
+  // then nothing falls through to it. This has to be checked before
+  // pred_begin() is dereferenced: with no predecessors it equals pred_end().
+  if (MBB->isEHPad() || MBB->pred_empty())
+    return false;
+
+  // If there isn't exactly one predecessor, it can't be a fall through.
+  MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
+  ++PI2;
+
+  if (PI2 != MBB->pred_end())
+    return false;
+
+  const MachineBasicBlock *Pred = *PI;
+
+  // If the predecessor is a switch statement, assume a jump table
+  // implementation, so it is not a fall through.
+  if (const BasicBlock *bb = Pred->getBasicBlock())
+    if (isa<SwitchInst>(bb->getTerminator()))
+      return false;
+
+  // The predecessor has to be immediately before this block.
+  if (!Pred->isLayoutSuccessor(MBB))
+    return false;
+
+  // If the block is completely empty, then it definitely does fall through.
+  if (Pred->empty())
+    return true;
+
+  // Otherwise, check the last instruction.
+  // Check if the last terminator is an unconditional branch.
+  MachineBasicBlock::const_iterator I = Pred->end();
+  while (I != Pred->begin() && !(--I)->isTerminator()) ;
+
+  return !I->isBarrier();
+}
+
+// Print out an operand for an inline asm expression.
+// ExtraCode, when non-empty, is a single-letter operand modifier. Returns
+// true when the operand cannot be printed (unknown modifier or an operand
+// that does not match the modifier) and false once something was printed.
+bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI,OpNum,AsmVariant,ExtraCode,O);
+ case 'X': // hex const int
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ O << "0x" << Twine::utohexstr(MO.getImm());
+ return false;
+ case 'x': // hex const int (low 16 bits)
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ O << "0x" << Twine::utohexstr(MO.getImm() & 0xffff);
+ return false;
+ case 'd': // decimal const int
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ O << MO.getImm();
+ return false;
+ case 'm': // decimal const int minus 1
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ O << MO.getImm() - 1;
+ return false;
+ case 'z': {
+ // $0 if zero, regular printing otherwise
+ if (MO.getType() == MachineOperand::MO_Immediate && MO.getImm() == 0) {
+ O << "$0";
+ return false;
+ }
+ // If not, call printOperand as normal.
+ break;
+ }
+ case 'D': // Second part of a double word register operand
+ case 'L': // Low order register of a double word register operand
+ case 'M': // High order register of a double word register operand
+ {
+ // The operand preceding OpNum is expected to be the immediate flag word
+ // that encodes how many registers this asm operand covers.
+ if (OpNum == 0)
+ return true;
+ const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+ if (!FlagsOP.isImm())
+ return true;
+ unsigned Flags = FlagsOP.getImm();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ // Number of registers represented by this operand. We are looking
+ // for 2 for 32 bit mode and 1 for 64 bit mode.
+ if (NumVals != 2) {
+ if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ O << '$' << MipsInstPrinter::getRegisterName(Reg);
+ return false;
+ }
+ return true;
+ }
+
+ unsigned RegOp = OpNum;
+ if (!Subtarget->isGP64bit()){
+ // Endianness reverses which register holds the high or low value
+ // between M and L.
+ switch(ExtraCode[0]) {
+ case 'M':
+ RegOp = (Subtarget->isLittle()) ? OpNum + 1 : OpNum;
+ break;
+ case 'L':
+ RegOp = (Subtarget->isLittle()) ? OpNum : OpNum + 1;
+ break;
+ case 'D': // Always the second part
+ RegOp = OpNum + 1;
+ }
+ if (RegOp >= MI->getNumOperands())
+ return true;
+ // Note: this MO shadows the outer MO and refers to the selected register.
+ const MachineOperand &MO = MI->getOperand(RegOp);
+ if (!MO.isReg())
+ return true;
+ unsigned Reg = MO.getReg();
+ O << '$' << MipsInstPrinter::getRegisterName(Reg);
+ return false;
+ }
+ // NOTE(review): with a 64-bit GPR subtarget and NumVals == 2, control
+ // falls through into the 'w' case below and the operand is printed by
+ // printOperand -- confirm that this fallthrough is intended.
+ }
+ case 'w':
+ // Print MSA registers for the 'f' constraint
+ // In LLVM, the 'w' modifier doesn't need to do anything.
+ // We can just call printOperand as normal.
+ break;
+ }
+ }
+
+ // No modifier, or a modifier that asks for regular printing.
+ printOperand(MI, OpNum, O);
+ return false;
+}
+
+// Print an inline asm memory operand in the form "offset($base)". The base
+// register is operand OpNum and the immediate offset is operand OpNum + 1.
+// The only modifier accepted is 'D', which adds 4 to the offset. Returns
+// true for any other modifier and false on success.
+bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                           unsigned OpNum, unsigned AsmVariant,
+                                           const char *ExtraCode,
+                                           raw_ostream &O) {
+  assert(OpNum + 1 < MI->getNumOperands() && "Insufficient operands");
+  const MachineOperand &BaseMO = MI->getOperand(OpNum);
+  const MachineOperand &OffsetMO = MI->getOperand(OpNum + 1);
+  assert(BaseMO.isReg() && "Unexpected base pointer for inline asm memory operand.");
+  assert(OffsetMO.isImm() && "Unexpected offset for inline asm memory operand.");
+  int Offset = OffsetMO.getImm();
+
+  // Currently we are expecting either no ExtraCode or 'D'
+  // FIXME: M = high order bits
+  // FIXME: L = low order bits
+  if (ExtraCode) {
+    if (ExtraCode[0] != 'D')
+      return true; // Unknown modifier.
+    Offset += 4;
+  }
+
+  const char *BaseName = MipsInstPrinter::getRegisterName(BaseMO.getReg());
+  O << Offset << "($" << BaseName << ")";
+
+  return false;
+}
+
+// Print operand opNum of MI to O. An operand carrying target flags is wrapped
+// in the matching relocation operator (e.g. %hi(...), %got(...)); the operand
+// itself is printed according to its type.
+void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ bool closeP = false;
+
+ // Any non-zero target flag means a closing parenthesis is printed at the end.
+ if (MO.getTargetFlags())
+ closeP = true;
+
+ // NOTE(review): MO_GPOFF_HI/MO_GPOFF_LO open three parentheses while only
+ // one ")" is printed at the end of this function -- confirm.
+ switch(MO.getTargetFlags()) {
+ case MipsII::MO_GPREL: O << "%gp_rel("; break;
+ case MipsII::MO_GOT_CALL: O << "%call16("; break;
+ case MipsII::MO_GOT: O << "%got("; break;
+ case MipsII::MO_ABS_HI: O << "%hi("; break;
+ case MipsII::MO_ABS_LO: O << "%lo("; break;
+ case MipsII::MO_TLSGD: O << "%tlsgd("; break;
+ case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
+ case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
+ case MipsII::MO_TPREL_LO: O << "%tprel_lo("; break;
+ case MipsII::MO_GPOFF_HI: O << "%hi(%neg(%gp_rel("; break;
+ case MipsII::MO_GPOFF_LO: O << "%lo(%neg(%gp_rel("; break;
+ case MipsII::MO_GOT_DISP: O << "%got_disp("; break;
+ case MipsII::MO_GOT_PAGE: O << "%got_page("; break;
+ case MipsII::MO_GOT_OFST: O << "%got_ofst("; break;
+ }
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ // Registers are printed as '$' followed by the lower-cased register name.
+ O << '$'
+ << StringRef(MipsInstPrinter::getRegisterName(MO.getReg())).lower();
+ break;
+
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ break;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ MO.getMBB()->getSymbol()->print(O, MAI);
+ // NOTE(review): this returns instead of breaking, so the closing
+ // parenthesis is never printed for a basic block operand -- confirm.
+ return;
+
+ case MachineOperand::MO_GlobalAddress:
+ getSymbol(MO.getGlobal())->print(O, MAI);
+ break;
+
+ case MachineOperand::MO_BlockAddress: {
+ MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress());
+ O << BA->getName();
+ break;
+ }
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ // Printed as <private prefix>CPI<function number>_<index>[+<offset>].
+ O << getDataLayout().getPrivateGlobalPrefix() << "CPI"
+ << getFunctionNumber() << "_" << MO.getIndex();
+ if (MO.getOffset())
+ O << "+" << MO.getOffset();
+ break;
+
+ default:
+ llvm_unreachable("<unknown operand type>");
+ }
+
+ if (closeP) O << ")";
+}
+
+// Print a load/store memory operand as "offset(base)", where the base is
+// operand opNum and the offset is operand opNum + 1.
+// If PIC target the target is loaded as the
+// pattern lw $25,%call16($28)
+void MipsAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                     raw_ostream &O) {
+  // opNum can be invalid if instruction has reglist as operand.
+  // MemOperand is always last operand of instruction (base + offset).
+  const unsigned Opc = MI->getOpcode();
+  if (Opc == Mips::SWM32_MM || Opc == Mips::LWM32_MM)
+    opNum = MI->getNumOperands() - 2;
+
+  printOperand(MI, opNum + 1, O);
+  O << "(";
+  printOperand(MI, opNum, O);
+  O << ")";
+}
+
+// Print a memory operand used by a non load/store instruction (e.g. a stack
+// location) as "base, offset", the same way as the operands of a normal
+// 3 operand instruction.
+void MipsAsmPrinter::printMemOperandEA(const MachineInstr *MI, int opNum,
+                                       raw_ostream &O) {
+  const int BaseIdx = opNum;
+  const int OffsetIdx = opNum + 1;
+
+  printOperand(MI, BaseIdx, O);
+  O << ", ";
+  printOperand(MI, OffsetIdx, O);
+}
+
+// Print the immediate operand opNum of MI as a condition code string, as
+// produced by Mips::MipsFCCToString. Modifier is not used.
+void MipsAsmPrinter::printFCCOperand(const MachineInstr *MI, int opNum,
+                                     raw_ostream &O, const char *Modifier) {
+  const MachineOperand &CondOp = MI->getOperand(opNum);
+  const Mips::CondCode CC = (Mips::CondCode)CondOp.getImm();
+  O << Mips::MipsFCCToString(CC);
+}
+
+// Print the operands of MI from opNum to the last one, separated by ", ".
+void MipsAsmPrinter::printRegisterList(const MachineInstr *MI, int opNum,
+                                       raw_ostream &O) {
+  const int NumOps = MI->getNumOperands();
+  for (int Idx = opNum; Idx != NumOps; ++Idx) {
+    // Every operand but the first is preceded by a separator.
+    const bool IsFirst = (Idx == opNum);
+    if (!IsFirst)
+      O << ", ";
+    printOperand(MI, Idx, O);
+  }
+}
+
+// Emit the module-level directives at the start of the output: abicalls and
+// pic0, the .mdebug.<abi> section, the NaN encoding, and (for O32) the
+// '.module fp=' and '.module [no]oddspreg' directives when they are needed.
+void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // MipsTargetStreamer has an initialization order problem when emitting an
+  // object file directly (see MipsTargetELFStreamer for full details). Work
+  // around it by re-initializing the PIC state here.
+  TS.setPic(OutContext.getObjectFileInfo()->isPositionIndependent());
+
+  // Compute MIPS architecture attributes based on the default subtarget
+  // that we'd have constructed. Module level directives aren't LTO
+  // clean anyhow.
+  // FIXME: For ifunc related functions we could iterate over and look
+  // for a feature string that doesn't match the default one.
+  const Triple &TT = TM.getTargetTriple();
+  StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU());
+  StringRef FS = TM.getTargetFeatureString();
+  const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM);
+  const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM);
+
+  const bool IsABICalls = STI.isABICalls();
+  const MipsABIInfo &ABI = MTM.getABI();
+  if (IsABICalls) {
+    TS.emitDirectiveAbiCalls();
+    // FIXME: This condition should be a lot more complicated that it is here.
+    //        Ideally it should test for properties of the ABI and not the ABI
+    //        itself.
+    //        For the moment, I'm only correcting enough to make MIPS-IV work.
+    if (!(isPositionIndependent() || ABI.IsN64()))
+      TS.emitDirectiveOptionPic0();
+  }
+
+  // Tell the assembler which ABI we are using
+  std::string SectionName(".mdebug.");
+  SectionName += getCurrentABIString();
+  OutStreamer->SwitchSection(
+      OutContext.getELFSection(SectionName, ELF::SHT_PROGBITS, 0));
+
+  // NaN: At the moment we only support:
+  // 1. .nan legacy (default)
+  // 2. .nan 2008
+  if (STI.isNaN2008())
+    TS.emitDirectiveNaN2008();
+  else
+    TS.emitDirectiveNaNLegacy();
+
+  // TODO: handle O64 ABI
+
+  TS.updateABIInfo(STI);
+
+  // We should always emit a '.module fp=...' but binutils 2.24 does not accept
+  // it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or
+  // -mfp64) and omit it otherwise.
+  const bool NeedsModuleFP =
+      ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit());
+  if (NeedsModuleFP)
+    TS.emitDirectiveModuleFP();
+
+  // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not
+  // accept it. We therefore emit it when it contradicts the default or an
+  // option has changed the default (i.e. FPXX) and omit it otherwise.
+  const bool NeedsModuleOddSPReg =
+      ABI.IsO32() && (!STI.useOddSPReg() || STI.isABI_FPXX());
+  if (NeedsModuleOddSPReg)
+    TS.emitDirectiveModuleOddSPReg();
+}
+
+// Switch the assembler options to GCC's choices before an inline assembly
+// block: push the current set state, then enable 'at', 'macro' and 'reorder'.
+void MipsAsmPrinter::emitInlineAsmStart() const {
+  // GCC's choice of assembler options for inline assembly code ('at', 'macro'
+  // and 'reorder') is different from LLVM's choice for generated code ('noat',
+  // 'nomacro' and 'noreorder').
+  // In order to maintain compatibility with inline assembly code which depends
+  // on GCC's assembler options being used, we have to switch to those options
+  // for the duration of the inline assembly block and then switch back.
+  MipsTargetStreamer &Streamer = getTargetStreamer();
+
+  Streamer.emitDirectiveSetPush();
+  Streamer.emitDirectiveSetAt();
+  Streamer.emitDirectiveSetMacro();
+  Streamer.emitDirectiveSetReorder();
+
+  OutStreamer->AddBlankLine();
+}
+
+void MipsAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+ const MCSubtargetInfo *EndInfo) const {
+ OutStreamer->AddBlankLine(); // StartInfo and EndInfo are not consulted
+ getTargetStreamer().emitDirectiveSetPop(); // pairs with the push in emitInlineAsmStart()
+}
+
+void MipsAsmPrinter::EmitJal(const MCSubtargetInfo &STI, MCSymbol *Symbol) {
+  const MCExpr *Target = MCSymbolRefExpr::create(Symbol, OutContext);
+  MCInst Jal; // JAL with a reference to Symbol as its only operand
+  Jal.setOpcode(Mips::JAL);
+  Jal.addOperand(MCOperand::createExpr(Target));
+  OutStreamer->EmitInstruction(Jal, STI);
+}
+
+void MipsAsmPrinter::EmitInstrReg(const MCSubtargetInfo &STI, unsigned Opcode,
+                                  unsigned Reg) {
+  MCInst Inst; // Opcode with Reg as its single register operand
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  OutStreamer->EmitInstruction(Inst, STI);
+}
+
+void MipsAsmPrinter::EmitInstrRegReg(const MCSubtargetInfo &STI,
+                                     unsigned Opcode, unsigned Reg1,
+                                     unsigned Reg2) {
+  // Emit a two-register instruction whose operands are given in assembly
+  // order.
+  //
+  // MTC1 is the exception: the Mips32 td files currently declare its operands
+  // in the reverse of their normal assembly order, and fixing the td file is
+  // not a trivial change, so the pair is exchanged here before the MCInst is
+  // built.
+  const bool OperandsReversed = (Opcode == Mips::MTC1);
+  const unsigned First = OperandsReversed ? Reg2 : Reg1;
+  const unsigned Second = OperandsReversed ? Reg1 : Reg2;
+  MCInst Inst;
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(MCOperand::createReg(First));
+  Inst.addOperand(MCOperand::createReg(Second));
+  OutStreamer->EmitInstruction(Inst, STI);
+}
+
+void MipsAsmPrinter::EmitInstrRegRegReg(const MCSubtargetInfo &STI,
+                                        unsigned Opcode, unsigned Reg1,
+                                        unsigned Reg2, unsigned Reg3) {
+  MCInst Inst;
+  Inst.setOpcode(Opcode);
+  const unsigned Regs[] = {Reg1, Reg2, Reg3}; // operands, in the order given
+  for (unsigned Reg : Regs)
+    Inst.addOperand(MCOperand::createReg(Reg));
+  OutStreamer->EmitInstruction(Inst, STI);
+}
+
+void MipsAsmPrinter::EmitMovFPIntPair(const MCSubtargetInfo &STI,
+                                      unsigned MovOpc, unsigned Reg1,
+                                      unsigned Reg2, unsigned FPReg1,
+                                      unsigned FPReg2, bool LE) {
+  // FPReg1 is always handled first and FPReg2 second. Which integer register
+  // goes with which depends on endianness: (Reg1, Reg2) when LE is set,
+  // (Reg2, Reg1) otherwise.
+  const unsigned IntForFP1 = LE ? Reg1 : Reg2;
+  const unsigned IntForFP2 = LE ? Reg2 : Reg1;
+  EmitInstrRegReg(STI, MovOpc, IntForFP1, FPReg1);
+  EmitInstrRegReg(STI, MovOpc, IntForFP2, FPReg2);
+}
+
+void MipsAsmPrinter::EmitSwapFPIntParams(const MCSubtargetInfo &STI,
+                                         Mips16HardFloatInfo::FPParamVariant PV,
+                                         bool LE, bool ToFP) {
+  using namespace Mips16HardFloatInfo;
+  const unsigned Opc = ToFP ? Mips::MTC1 : Mips::MFC1;
+  switch (PV) {
+  case NoSig: // empty parameter signature: nothing to move
+    return;
+  case FSig: // (float)
+    EmitInstrRegReg(STI, Opc, Mips::A0, Mips::F12);
+    break;
+  case DSig: // (double)
+    EmitMovFPIntPair(STI, Opc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    break;
+  case FFSig: // (float, float)
+    EmitMovFPIntPair(STI, Opc, Mips::A0, Mips::A1, Mips::F12, Mips::F14, LE);
+    break;
+  case FDSig: // (float, double)
+    EmitInstrRegReg(STI, Opc, Mips::A0, Mips::F12);
+    EmitMovFPIntPair(STI, Opc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+    break;
+  case DFSig: // (double, float)
+    EmitMovFPIntPair(STI, Opc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    EmitInstrRegReg(STI, Opc, Mips::A2, Mips::F14);
+    break;
+  case DDSig: // (double, double)
+    EmitMovFPIntPair(STI, Opc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    EmitMovFPIntPair(STI, Opc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+    break;
+  }
+}
+
+void MipsAsmPrinter::EmitSwapFPIntRetval(
+    const MCSubtargetInfo &STI, Mips16HardFloatInfo::FPReturnVariant RV,
+    bool LE) {
+  using namespace Mips16HardFloatInfo;
+  // Always MFC1, i.e. the opcode EmitSwapFPIntParams() picks when ToFP is
+  // false.
+  const unsigned Opc = Mips::MFC1;
+  switch (RV) {
+  case NoFPRet: // nothing to fix up
+    break;
+  case FRet: // float
+    EmitInstrRegReg(STI, Opc, Mips::V0, Mips::F0);
+    break;
+  case DRet:  // double
+  case CFRet: // complex: emits exactly the same pair as double
+    EmitMovFPIntPair(STI, Opc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    break;
+  case CDRet: // double complex
+    EmitMovFPIntPair(STI, Opc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    EmitMovFPIntPair(STI, Opc, Mips::A0, Mips::A1, Mips::F2, Mips::F3, LE);
+    break;
+  }
+}
+
+void MipsAsmPrinter::EmitFPCallStub(
+ const char *Symbol, const Mips16HardFloatInfo::FuncSignature *Signature) {
+ MCSymbol *MSymbol = OutContext.getOrCreateSymbol(StringRef(Symbol));
+ using namespace Mips16HardFloatInfo;
+ bool LE = getDataLayout().isLittleEndian();
+ // Construct a local MCSubtargetInfo here.
+ // Stubs are emitted from EmitEndOfAsmFile(), i.e. at the global level rather
+ // than for a particular MachineFunction, so the subtarget is built from the
+ // TargetMachine's triple, CPU and feature string.
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+ TM.getTargetTriple().str(), TM.getTargetCPU(),
+ TM.getTargetFeatureString()));
+
+ //
+ // .global xxxx
+ //
+ OutStreamer->EmitSymbolAttribute(MSymbol, MCSA_Global);
+ const char *RetType;
+ //
+ // Build the comment field that identifies the return and parameter types
+ // of the floating point stub:
+ // # Stub function to call rettype xxxx (params)
+ //
+ switch (Signature->RetSig) {
+ case FRet:
+ RetType = "float";
+ break;
+ case DRet:
+ RetType = "double";
+ break;
+ case CFRet:
+ RetType = "complex";
+ break;
+ case CDRet:
+ RetType = "double complex";
+ break;
+ case NoFPRet:
+ RetType = "";
+ break;
+ }
+ const char *Parms;
+ switch (Signature->ParamSig) {
+ case FSig:
+ Parms = "float";
+ break;
+ case FFSig:
+ Parms = "float, float";
+ break;
+ case FDSig:
+ Parms = "float, double";
+ break;
+ case DSig:
+ Parms = "double";
+ break;
+ case DDSig:
+ Parms = "double, double";
+ break;
+ case DFSig:
+ Parms = "double, float";
+ break;
+ case NoSig:
+ Parms = "";
+ break;
+ }
+ OutStreamer->AddComment("\t# Stub function to call " + Twine(RetType) + " " +
+ Twine(Symbol) + " (" + Twine(Parms) + ")");
+ //
+ // probably not necessary, but save the current section state here and
+ // restore it with PopSection() once the stub has been emitted
+ OutStreamer->PushSection();
+ //
+ // .section .mips16.call.fp.xxxx,"ax",@progbits
+ //
+ MCSectionELF *M = OutContext.getELFSection(
+ ".mips16.call.fp." + std::string(Symbol), ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_EXECINSTR);
+ OutStreamer->SwitchSection(M, nullptr);
+ //
+ // .align 2
+ //
+ OutStreamer->EmitValueToAlignment(4);
+ MipsTargetStreamer &TS = getTargetStreamer();
+ //
+ // .set nomips16
+ // .set nomicromips
+ //
+ TS.emitDirectiveSetNoMips16();
+ TS.emitDirectiveSetNoMicroMips();
+ //
+ // .ent __call_stub_fp_xxxx
+ // .type __call_stub_fp_xxxx,@function
+ // __call_stub_fp_xxxx:
+ //
+ std::string x = "__call_stub_fp_" + std::string(Symbol);
+ MCSymbolELF *Stub =
+ cast<MCSymbolELF>(OutContext.getOrCreateSymbol(StringRef(x)));
+ TS.emitDirectiveEnt(*Stub);
+ MCSymbol *MType = // requested under the same name as Stub above
+ OutContext.getOrCreateSymbol("__call_stub_fp_" + Twine(Symbol));
+ OutStreamer->EmitSymbolAttribute(MType, MCSA_ELF_TypeFunction);
+ OutStreamer->EmitLabel(Stub);
+
+ // Only handle non-pic for now.
+ assert(!isPositionIndependent() &&
+ "should not be here if we are compiling pic");
+ TS.emitDirectiveSetReorder();
+ //
+ // We need to add a MipsMCExpr class to MCTargetDesc to fully implement
+ // stubs without raw text but this current patch is for compiler generated
+ // functions and they all return some value.
+ // The calling sequence for non pic is different in that case and we need
+ // to implement %lo and %hi in order to handle the case of no return value
+ // See the corresponding method in Mips16HardFloat for details.
+ //
+ // mov the return address to S2.
+ // we have no stack space to store it and we are about to make another call.
+ // We need to make sure that the enclosing function knows to save S2
+ // This should have already been handled.
+ //
+ // Mov $18, $31
+
+ EmitInstrRegRegReg(*STI, Mips::OR, Mips::S2, Mips::RA, Mips::ZERO);
+
+ EmitSwapFPIntParams(*STI, Signature->ParamSig, LE, true);
+
+ // Jal xxxx
+ //
+ EmitJal(*STI, MSymbol);
+
+ // fix return values
+ EmitSwapFPIntRetval(*STI, Signature->RetSig, LE);
+ //
+ // do the return
+ // if (Signature->RetSig == NoFPRet)
+ // llvm_unreachable("should not be any stubs here with no return value");
+ // else
+ EmitInstrReg(*STI, Mips::JR, Mips::S2);
+
+ MCSymbol *Tmp = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(Tmp);
+ const MCSymbolRefExpr *E = MCSymbolRefExpr::create(Stub, OutContext);
+ const MCSymbolRefExpr *T = MCSymbolRefExpr::create(Tmp, OutContext);
+ const MCExpr *T_min_E = MCBinaryExpr::createSub(T, E, OutContext);
+ OutStreamer->emitELFSize(Stub, T_min_E); // size expression is Tmp - Stub
+ TS.emitDirectiveEnd(x);
+ OutStreamer->PopSection();
+}
+
+void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  // Emit the needed stubs: one call stub for every entry collected in
+  // StubsNeeded. Each entry pairs the name of the function the stub has to
+  // call with the signature describing that function's floating point
+  // parameters and return value.
+  for (const auto &Needed : StubsNeeded) {
+    const char *Symbol = Needed.first;
+    const Mips16HardFloatInfo::FuncSignature *Signature = Needed.second;
+    EmitFPCallStub(Symbol, Signature);
+  }
+
+  // Finish by returning to the text section, whether or not any stub was
+  // emitted.
+  OutStreamer->SwitchSection(OutContext.getObjectFileInfo()->getTextSection());
+}
+
+void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+ raw_ostream &OS) {
+ // TODO: implement. For now this is a no-op: MI is ignored and OS untouched.
+}
+
+// Align all targets of indirect branches on bundle size. Used only if target
+// is NaCl.
+void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
+  // First group: every block that appears in one of the function's jump
+  // tables. A function without jump table info contributes nothing here.
+  if (MachineJumpTableInfo *JtInfo = MF.getJumpTableInfo()) {
+    for (const MachineJumpTableEntry &Table : JtInfo->getJumpTables()) {
+      for (MachineBasicBlock *Target : Table.MBBs)
+        Target->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+    }
+  }
+
+  // Second group: blocks whose address is taken, since such a block can be
+  // the target of an indirect branch as well.
+  for (MachineBasicBlock &Block : MF) {
+    if (!Block.hasAddressTaken())
+      continue;
+    Block.setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+  }
+}
+
+bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const {
+  const bool IsLUi = Opcode == Mips::LONG_BRANCH_LUi;
+  const bool IsADDiu = Opcode == Mips::LONG_BRANCH_ADDiu;
+  return IsLUi || IsADDiu || Opcode == Mips::LONG_BRANCH_DADDiu;
+}
+
+// Force static initialization: register the printer with all four targets.
+extern "C" void LLVMInitializeMipsAsmPrinter() {
+  RegisterAsmPrinter<MipsAsmPrinter> ForMips(getTheMipsTarget());
+  RegisterAsmPrinter<MipsAsmPrinter> ForMipsel(getTheMipselTarget());
+  RegisterAsmPrinter<MipsAsmPrinter> ForMips64(getTheMips64Target());
+  RegisterAsmPrinter<MipsAsmPrinter> ForMips64el(getTheMips64elTarget());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
new file mode 100644
index 000000000000..259e557e1283
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -0,0 +1,147 @@
+//===-- MipsAsmPrinter.h - Mips LLVM Assembly Printer ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Mips Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSASMPRINTER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSASMPRINTER_H
+
+#include "Mips16HardFloatInfo.h"
+#include "MipsMCInstLower.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class MCStreamer;
+class MachineInstr;
+class MachineBasicBlock;
+class MipsTargetStreamer;
+class Module;
+class raw_ostream;
+
+class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
+ MipsTargetStreamer &getTargetStreamer() const;
+
+ void EmitInstrWithMacroNoAT(const MachineInstr *MI);
+
+private:
+ // tblgen'erated function.
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+ const MachineInstr *MI);
+
+ // Emit PseudoReturn, PseudoReturn64, PseudoIndirectBranch,
+ // and PseudoIndirectBranch64 as a JR, JR_MM, JALR, or JALR64 as appropriate
+ // for the target.
+ void emitPseudoIndirectBranch(MCStreamer &OutStreamer,
+ const MachineInstr *MI);
+
+ // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+
+ /// MCP - Keep a pointer to constantpool entries of the current
+ /// MachineFunction.
+ const MachineConstantPool *MCP;
+
+ /// InConstantPool - Maintain state when emitting a sequence of constant
+ /// pool entries so we can properly mark them as data regions.
+ bool InConstantPool;
+
+ std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+ StubsNeeded; // ordered by the key's pointer value, not by string contents
+
+ void emitInlineAsmStart() const override;
+
+ void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+ const MCSubtargetInfo *EndInfo) const override;
+
+ void EmitJal(const MCSubtargetInfo &STI, MCSymbol *Symbol);
+
+ void EmitInstrReg(const MCSubtargetInfo &STI, unsigned Opcode, unsigned Reg);
+
+ void EmitInstrRegReg(const MCSubtargetInfo &STI, unsigned Opcode,
+ unsigned Reg1, unsigned Reg2);
+
+ void EmitInstrRegRegReg(const MCSubtargetInfo &STI, unsigned Opcode,
+ unsigned Reg1, unsigned Reg2, unsigned Reg3);
+
+ void EmitMovFPIntPair(const MCSubtargetInfo &STI, unsigned MovOpc,
+ unsigned Reg1, unsigned Reg2, unsigned FPReg1,
+ unsigned FPReg2, bool LE);
+
+ void EmitSwapFPIntParams(const MCSubtargetInfo &STI,
+ Mips16HardFloatInfo::FPParamVariant, bool LE,
+ bool ToFP);
+
+ void EmitSwapFPIntRetval(const MCSubtargetInfo &STI,
+ Mips16HardFloatInfo::FPReturnVariant, bool LE);
+
+ void EmitFPCallStub(const char *, const Mips16HardFloatInfo::FuncSignature *);
+
+ void NaClAlignIndirectJumpTargets(MachineFunction &MF);
+
+ bool isLongBranchPseudo(int Opcode) const;
+
+public:
+
+ const MipsSubtarget *Subtarget; // not set by the constructor below
+ const MipsFunctionInfo *MipsFI; // not set by the constructor below
+ MipsMCInstLower MCInstLowering;
+
+ explicit MipsAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), MCP(nullptr),
+ InConstantPool(false), MCInstLowering(*this) {}
+
+ StringRef getPassName() const override { return "Mips Assembly Printer"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void EmitConstantPool() override {
+ bool UsingConstantPools =
+ (Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
+ if (!UsingConstantPools)
+ AsmPrinter::EmitConstantPool();
+ // otherwise (Mips16 with constant islands) we emit constant pools customly!
+ }
+
+ void EmitInstruction(const MachineInstr *MI) override;
+ void printSavedRegsBitmask();
+ void emitFrameDirective();
+ const char *getCurrentABIString() const;
+ void EmitFunctionEntryLabel() override;
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+ void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override;
+ bool isBlockOnlyReachableByFallthrough(
+ const MachineBasicBlock* MBB) const override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+ void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+ void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
+ void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O);
+ void EmitStartOfAsmFile(Module &M) override;
+ void EmitEndOfAsmFile(Module &M) override;
+ void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+};
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp
new file mode 100644
index 000000000000..7af988c1f64d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp
@@ -0,0 +1,136 @@
+//===---- MipsCCState.cpp - CCState with Mips specific extensions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsCCState.h"
+#include "MipsSubtarget.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+/// Returns true if CallSym is one of the long double emulation routines below.
+static bool isF128SoftLibCall(const char *CallSym) {
+  const char *const F128LibCalls[] = {
+      "__addtf3", "__divtf3", "__eqtf2", "__extenddftf2",
+      "__extendsftf2", "__fixtfdi", "__fixtfsi", "__fixtfti",
+      "__fixunstfdi", "__fixunstfsi", "__fixunstfti", "__floatditf",
+      "__floatsitf", "__floattitf", "__floatunditf", "__floatunsitf",
+      "__floatuntitf", "__getf2", "__gttf2", "__letf2",
+      "__lttf2", "__multf3", "__netf2", "__powitf2",
+      "__subtf3", "__trunctfdf2", "__trunctfsf2", "__unordtf2",
+      "ceill", "copysignl", "cosl", "exp2l",
+      "expl", "floorl", "fmal", "fmodl",
+      "log10l", "log2l", "logl", "nearbyintl",
+      "powl", "rintl", "roundl", "sinl",
+      "sqrtl", "truncl"};
+  // The binary search needs the table in strcmp order; the assert checks it.
+  auto Less = [](const char *A, const char *B) { return strcmp(A, B) < 0; };
+  const char *const *First = std::begin(F128LibCalls);
+  const char *const *Last = std::end(F128LibCalls);
+  assert(std::is_sorted(First, Last, Less));
+  return std::binary_search(First, Last, CallSym, Less);
+}
+
+/// Returns true if Ty is fp128, a struct whose only element is fp128, or an
+/// i128 that was originally an fp128.
+static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) {
+  if (Ty->isFP128Ty())
+    return true;
+
+  const bool IsF128Wrapper = Ty->isStructTy() &&
+                             Ty->getStructNumElements() == 1 &&
+                             Ty->getStructElementType(0)->isFP128Ty();
+  if (IsF128Wrapper)
+    return true;
+
+  // An i128 counts only when the callee is an external symbol naming one of
+  // the long double emulation routines.
+  const auto *ES = dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode);
+  return ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol());
+}
+
+MipsCCState::SpecialCallingConvType
+MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee,
+                                            const MipsSubtarget &Subtarget) {
+  // Only MIPS16 hard-float code can need the special convention.
+  if (!Subtarget.inMips16HardFloat())
+    return NoSpecialCallingConv;
+
+  const auto *G = dyn_cast<const GlobalAddressSDNode>(Callee);
+  if (!G)
+    return NoSpecialCallingConv;
+  // Look the callee up by name in its module and check for the helper mark.
+  StringRef Sym = G->getGlobal()->getName();
+  Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+  const bool IsRetHelper = F && F->hasFnAttribute("__Mips16RetHelper");
+  return IsRetHelper ? Mips16RetHelperConv : NoSpecialCallingConv;
+}
+
+void MipsCCState::PreAnalyzeCallResultForF128(
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    const TargetLowering::CallLoweringInfo &CLI) {
+  // Both answers depend only on the call's return type, so compute them once.
+  const bool WasF128 = originalTypeIsF128(CLI.RetTy, CLI.Callee.getNode());
+  const bool WasFloat = CLI.RetTy->isFloatingPointTy();
+  OriginalArgWasF128.append(Ins.size(), WasF128);
+  OriginalArgWasFloat.append(Ins.size(), WasFloat);
+}
+
+/// Identify lowered return values that originated from f128 and record this
+/// for use by RetCC_MipsN.
+void MipsCCState::PreAnalyzeReturnForF128(
+    const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  // Every entry of Outs is classified by the function's return type, so the
+  // type is inspected once and the result recorded Outs.size() times.
+  Type *RetTy = getMachineFunction().getFunction()->getReturnType();
+  const bool WasF128 = originalTypeIsF128(RetTy, nullptr);
+  const bool WasFloat = RetTy->isFloatingPointTy();
+  OriginalArgWasF128.append(Outs.size(), WasF128);
+  OriginalArgWasFloat.append(Outs.size(), WasFloat);
+}
+
+/// Identify lowered values that originated from f128 arguments and record
+/// this.
+void MipsCCState::PreAnalyzeCallOperands(
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    std::vector<TargetLowering::ArgListEntry> &FuncArgs,
+    const SDNode *CallNode) {
+  for (const ISD::OutputArg &Out : Outs) {
+    // Each lowered value is classified by the type of its original argument.
+    Type *OrigTy = FuncArgs[Out.OrigArgIndex].Ty;
+    OriginalArgWasF128.push_back(originalTypeIsF128(OrigTy, CallNode));
+    OriginalArgWasFloat.push_back(OrigTy->isFloatingPointTy());
+    CallOperandIsFixed.push_back(Out.IsFixed);
+  }
+}
+
+/// Identify lowered values that originated from f128 arguments and record
+/// this.
+void MipsCCState::PreAnalyzeFormalArgumentsForF128(
+    const SmallVectorImpl<ISD::InputArg> &Ins) {
+  const auto *F = getMachineFunction().getFunction();
+  for (const ISD::InputArg &In : Ins) {
+    // SRet arguments cannot originate from f128 or {f128} returns so we just
+    // push false. They need special handling because an SRet argument is not
+    // mapped to an original argument.
+    if (In.Flags.isSRet()) {
+      OriginalArgWasF128.push_back(false);
+      OriginalArgWasFloat.push_back(false);
+      continue;
+    }
+
+    // Walk from the first formal argument to the one this value came from.
+    assert(In.getOrigArgIndex() < F->arg_size());
+    Function::const_arg_iterator FuncArg = F->arg_begin();
+    std::advance(FuncArg, In.getOrigArgIndex());
+    Type *ArgTy = FuncArg->getType();
+
+    OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, nullptr));
+    OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy());
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.h b/contrib/llvm/lib/Target/Mips/MipsCCState.h
new file mode 100644
index 000000000000..081c393a09be
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCCState.h
@@ -0,0 +1,136 @@
+//===---- MipsCCState.h - CCState with Mips specific extensions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSCCSTATE_H
+#define MIPSCCSTATE_H
+
+#include "MipsISelLowering.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
+class SDNode;
+class MipsSubtarget;
+
+class MipsCCState : public CCState {
+public:
+ enum SpecialCallingConvType { Mips16RetHelperConv, NoSpecialCallingConv };
+
+ /// Determine the SpecialCallingConvType for the given callee
+ static SpecialCallingConvType
+ getSpecialCallingConvForCallee(const SDNode *Callee,
+ const MipsSubtarget &Subtarget);
+
+private:
+ /// Identify lowered values that originated from f128 arguments and record
+ /// this for use by RetCC_MipsN.
+ void PreAnalyzeCallResultForF128(const SmallVectorImpl<ISD::InputArg> &Ins,
+ const TargetLowering::CallLoweringInfo &CLI);
+
+ /// Identify lowered values that originated from f128 arguments and record
+ /// this for use by RetCC_MipsN.
+ void PreAnalyzeReturnForF128(const SmallVectorImpl<ISD::OutputArg> &Outs);
+
+ /// Identify lowered values that originated from f128 arguments and record
+ /// this.
+ void
+ PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ std::vector<TargetLowering::ArgListEntry> &FuncArgs,
+ const SDNode *CallNode);
+
+ /// Identify lowered values that originated from f128 arguments and record
+ /// this.
+ void
+ PreAnalyzeFormalArgumentsForF128(const SmallVectorImpl<ISD::InputArg> &Ins);
+
+ /// Records whether the value has been lowered from an f128.
+ SmallVector<bool, 4> OriginalArgWasF128;
+
+ /// Records whether the value has been lowered from float.
+ SmallVector<bool, 4> OriginalArgWasFloat;
+
+ /// Records whether the value was a fixed argument.
+ /// See ISD::OutputArg::IsFixed.
+ SmallVector<bool, 4> CallOperandIsFixed;
+
+ // Used to handle MIPS16-specific calling convention tweaks.
+ // FIXME: This should probably be a fully fledged calling convention.
+ SpecialCallingConvType SpecialCallingConv;
+
+public:
+ MipsCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+ SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+ SpecialCallingConvType SpecialCC = NoSpecialCallingConv)
+ : CCState(CC, isVarArg, MF, locs, C), SpecialCallingConv(SpecialCC) {}
+
+ void
+ AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn,
+ std::vector<TargetLowering::ArgListEntry> &FuncArgs,
+ const SDNode *CallNode) {
+ PreAnalyzeCallOperands(Outs, FuncArgs, CallNode);
+ CCState::AnalyzeCallOperands(Outs, Fn);
+ OriginalArgWasF128.clear(); // the records only live for the analysis above
+ OriginalArgWasFloat.clear();
+ CallOperandIsFixed.clear();
+ }
+
+ // The AnalyzeCallOperands in the base class is not usable since we must
+ // provide a means of accessing ArgListEntry::IsFixed. Delete them from this
+ // class. This doesn't stop them being used via the base class though.
+ void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) = delete;
+ void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+ CCAssignFn Fn) = delete;
+
+ void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn) {
+ PreAnalyzeFormalArgumentsForF128(Ins);
+ CCState::AnalyzeFormalArguments(Ins, Fn);
+ OriginalArgWasFloat.clear();
+ OriginalArgWasF128.clear();
+ }
+
+ void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn,
+ const TargetLowering::CallLoweringInfo &CLI) {
+ PreAnalyzeCallResultForF128(Ins, CLI);
+ CCState::AnalyzeCallResult(Ins, Fn);
+ OriginalArgWasFloat.clear();
+ OriginalArgWasF128.clear();
+ }
+
+ void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
+ PreAnalyzeReturnForF128(Outs);
+ CCState::AnalyzeReturn(Outs, Fn);
+ OriginalArgWasFloat.clear();
+ OriginalArgWasF128.clear();
+ }
+
+ bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
+ CCAssignFn Fn) {
+ PreAnalyzeReturnForF128(ArgsFlags);
+ bool Return = CCState::CheckReturn(ArgsFlags, Fn);
+ OriginalArgWasFloat.clear();
+ OriginalArgWasF128.clear();
+ return Return;
+ }
+
+ bool WasOriginalArgF128(unsigned ValNo) { return OriginalArgWasF128[ValNo]; }
+ bool WasOriginalArgFloat(unsigned ValNo) {
+ return OriginalArgWasFloat[ValNo];
+ }
+ bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; }
+ SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
new file mode 100644
index 000000000000..a57cb7badc17
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -0,0 +1,406 @@
+//===-- MipsCallingConv.td - Calling Conventions for Mips --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for Mips architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A, string Invert = "">
+ : CCIf<!strconcat(Invert,
+ "static_cast<const MipsSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).",
+ F),
+ A>;
+
+// The inverse of CCIfSubtarget
+class CCIfSubtargetNot<string F, CCAction A> : CCIfSubtarget<F, A, "!">;
+
+/// Match if the original argument (before lowering) was not a float.
+/// For example, this is false for i32's that were lowered from soft-float.
+class CCIfOrigArgWasNotFloat<CCAction A>
+ : CCIf<"!static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)",
+ A>;
+
+/// Match if the original argument (before lowering) was a 128-bit float (i.e.
+/// long double).
+class CCIfOrigArgWasF128<CCAction A>
+ : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>;
+
+/// Match if this specific argument is a vararg.
+/// This is slightly different from CCIfIsVarArg which matches if any argument is
+/// a vararg.
+class CCIfArgIsVarArg<CCAction A>
+ : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>;
+
+
+/// Match if the special calling conv is the specified value.
+class CCIfSpecialCallingConv<string CC, CCAction A>
+ : CCIf<"static_cast<MipsCCState *>(&State)->getSpecialCallingConv() == "
+ "MipsCCState::" # CC, A>;
+
+// For soft-float, f128 values are returned in A0_64 rather than V1_64.
+def RetCC_F128SoftFloat : CallingConv<[
+ CCAssignToReg<[V0_64, A0_64]>
+]>;
+
+// For hard-float, f128 values are returned as a pair of f64's rather than a
+// pair of i64's.
+def RetCC_F128HardFloat : CallingConv<[
+ CCBitConvertToType<f64>,
+
+ // Contrary to the ABI documentation, a struct containing a long double is
+ // returned in $f0, and $f1 instead of the usual $f0, and $f2. This is to
+ // match the de facto ABI as implemented by GCC.
+ CCIfInReg<CCAssignToReg<[D0_64, D1_64]>>,
+
+ CCAssignToReg<[D0_64, D2_64]>
+]>;
+
+// Handle F128 specially since we can't identify the original type during the
+// tablegen-erated code.
+def RetCC_F128 : CallingConv<[
+ CCIfSubtarget<"useSoftFloat()",
+ CCIfType<[i64], CCDelegateTo<RetCC_F128SoftFloat>>>,
+ CCIfSubtargetNot<"useSoftFloat()",
+ CCIfType<[i64], CCDelegateTo<RetCC_F128HardFloat>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips O32 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsO32 : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // i32 and f32 values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // f64 values get stored in stack slots that are 8 bytes in
+ // size and 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>
+]>;
+
+// Only the return rules are defined here for O32. The rules for argument
+// passing are defined in MipsISelLowering.cpp.
+def RetCC_MipsO32 : CallingConv<[
+ // Promote i1/i8/i16 return values to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // i32 are returned in registers V0, V1, A0, A1
+ CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>,
+
+ // f32 are returned in registers F0, F2
+ CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+ // f64 values are returned in D0_64 and D2_64 in FP64bit mode or
+ // in D0 and D1 in FP32bit mode.
+ CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D2_64]>>>,
+ CCIfType<[f64], CCIfSubtargetNot<"isFP64bit()", CCAssignToReg<[D0, D1]>>>
+]>;
+
+def CC_MipsO32_FP32 : CustomCallingConv;
+def CC_MipsO32_FP64 : CustomCallingConv;
+
+def CC_MipsO32_FP : CallingConv<[
+ CCIfSubtargetNot<"isFP64bit()", CCDelegateTo<CC_MipsO32_FP32>>,
+ CCIfSubtarget<"isFP64bit()", CCDelegateTo<CC_MipsO32_FP64>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips N32/64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsN_SoftFloat : CallingConv<[
+ CCAssignToRegWithShadow<[A0, A1, A2, A3,
+ T0, T1, T2, T3],
+ [D12_64, D13_64, D14_64, D15_64,
+ D16_64, D17_64, D18_64, D19_64]>,
+ CCAssignToStack<4, 8>
+]>;
+
+def CC_MipsN : CallingConv<[
+ CCIfType<[i8, i16, i32, i64],
+ CCIfSubtargetNot<"isLittle()",
+ CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
+
+ // All integers (except soft-float integers) are promoted to 64-bit.
+ CCIfType<[i8, i16, i32], CCIfOrigArgWasNotFloat<CCPromoteToType<i64>>>,
+
+ // The only i32's we have left are soft-float arguments.
+ CCIfSubtarget<"useSoftFloat()", CCIfType<[i32], CCDelegateTo<CC_MipsN_SoftFloat>>>,
+
+ // Integer arguments are passed in integer registers.
+ CCIfType<[i64], CCAssignToRegWithShadow<[A0_64, A1_64, A2_64, A3_64,
+ T0_64, T1_64, T2_64, T3_64],
+ [D12_64, D13_64, D14_64, D15_64,
+ D16_64, D17_64, D18_64, D19_64]>>,
+
+ // f32 arguments are passed in single precision FP registers.
+ CCIfType<[f32], CCAssignToRegWithShadow<[F12, F13, F14, F15,
+ F16, F17, F18, F19],
+ [A0_64, A1_64, A2_64, A3_64,
+ T0_64, T1_64, T2_64, T3_64]>>,
+
+ // f64 arguments are passed in double precision FP registers.
+ CCIfType<[f64], CCAssignToRegWithShadow<[D12_64, D13_64, D14_64, D15_64,
+ D16_64, D17_64, D18_64, D19_64],
+ [A0_64, A1_64, A2_64, A3_64,
+ T0_64, T1_64, T2_64, T3_64]>>,
+
+ // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+ CCIfType<[f32], CCAssignToStack<4, 8>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+// N32/64 variable arguments.
+// All arguments are passed in integer registers.
+def CC_MipsN_VarArg : CallingConv<[
+ CCIfType<[i8, i16, i32, i64],
+ CCIfSubtargetNot<"isLittle()",
+ CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
+
+ // All integers are promoted to 64-bit.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ CCIfType<[f32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
+
+ CCIfType<[i64, f64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64,
+ T0_64, T1_64, T2_64, T3_64]>>,
+
+ // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+ CCIfType<[f32], CCAssignToStack<4, 8>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_MipsN : CallingConv<[
+ // f128 needs to be handled similarly to f32 and f64. However, f128 is not
+ // legal and is lowered to i128 which is further lowered to a pair of i64's.
+ // This presents us with a problem for the calling convention since hard-float
+ // still needs to pass them in FPU registers, and soft-float needs to use $v0,
+ // and $a0 instead of the usual $v0, and $v1. We therefore resort to a
+ // pre-analyze (see PreAnalyzeReturnForF128()) step to pass information on
+ // whether the result was originally an f128 into the tablegen-erated code.
+ //
+ // f128 should only occur for the N64 ABI where long double is 128-bit. On
+ // N32, long double is equivalent to double.
+ CCIfType<[i64], CCIfOrigArgWasF128<CCDelegateTo<RetCC_F128>>>,
+
+ // Aggregate returns are positioned at the lowest address in the slot for
+ // both little and big-endian targets. When passing in registers, this
+ // requires that big-endian targets shift the value into the upper bits.
+ CCIfSubtarget<"isLittle()",
+ CCIfType<[i8, i16, i32, i64], CCIfInReg<CCPromoteToType<i64>>>>,
+ CCIfSubtargetNot<"isLittle()",
+ CCIfType<[i8, i16, i32, i64],
+ CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
+
+ // i64 are returned in registers V0_64, V1_64
+ CCIfType<[i64], CCAssignToReg<[V0_64, V1_64]>>,
+
+ // f32 are returned in registers F0, F2
+ CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+ // f64 are returned in registers D0_64, D2_64
+ CCIfType<[f64], CCAssignToReg<[D0_64, D2_64]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips FastCC Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_MipsO32_FastCC : CallingConv<[
+ // f64 arguments are passed in double-precision floating point registers.
+ CCIfType<[f64], CCIfSubtargetNot<"isFP64bit()",
+ CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6,
+ D7, D8, D9]>>>,
+ CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCIfSubtarget<"useOddSPReg()",
+ CCAssignToReg<[D0_64, D1_64, D2_64, D3_64,
+ D4_64, D5_64, D6_64, D7_64,
+ D8_64, D9_64, D10_64, D11_64,
+ D12_64, D13_64, D14_64, D15_64,
+ D16_64, D17_64, D18_64,
+ D19_64]>>>>,
+ CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCIfSubtarget<"noOddSPReg()",
+ CCAssignToReg<[D0_64, D2_64, D4_64, D6_64,
+ D8_64, D10_64, D12_64, D14_64,
+ D16_64, D18_64]>>>>,
+
+ // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_MipsN_FastCC : CallingConv<[
+ // Integer arguments are passed in integer registers.
+ CCIfType<[i64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64, T0_64, T1_64,
+ T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
+ T8_64, V1_64]>>,
+
+ // f64 arguments are passed in double-precision floating point registers.
+ CCIfType<[f64], CCAssignToReg<[D0_64, D1_64, D2_64, D3_64, D4_64, D5_64,
+ D6_64, D7_64, D8_64, D9_64, D10_64, D11_64,
+ D12_64, D13_64, D14_64, D15_64, D16_64, D17_64,
+ D18_64, D19_64]>>,
+
+ // Stack parameter slots for i64 and f64 are 64-bit doublewords and
+ // 8-byte aligned.
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_Mips_FastCC : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Integer arguments are passed in integer registers. All scratch registers,
+ // except for AT, V0 and T9, are available to be used as argument registers.
+ CCIfType<[i32], CCIfSubtargetNot<"isTargetNaCl()",
+ CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>>,
+
+ // In NaCl, T6, T7 and T8 are reserved and not available as argument
+ // registers for fastcc. T6 contains the mask for sandboxing control flow
+ // (indirect jumps and calls). T7 contains the mask for sandboxing memory
+ // accesses (loads and stores). T8 contains the thread pointer.
+ CCIfType<[i32], CCIfSubtarget<"isTargetNaCl()",
+ CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, V1]>>>,
+
+ // f32 arguments are passed in single-precision floating point registers.
+ CCIfType<[f32], CCIfSubtarget<"useOddSPReg()",
+ CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13,
+ F14, F15, F16, F17, F18, F19]>>>,
+
+ // Don't use odd numbered single-precision registers for -mno-odd-spreg.
+ CCIfType<[f32], CCIfSubtarget<"noOddSPReg()",
+ CCAssignToReg<[F0, F2, F4, F6, F8, F10, F12, F14, F16, F18]>>>,
+
+ // Stack parameter slots for i32 and f32 are 32-bit words and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FastCC>>,
+ CCDelegateTo<CC_MipsN_FastCC>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Calling Convention Dispatch
+//===----------------------------------------------------------------------===//
+
+def RetCC_Mips : CallingConv<[
+ CCIfSubtarget<"isABI_N32()", CCDelegateTo<RetCC_MipsN>>,
+ CCIfSubtarget<"isABI_N64()", CCDelegateTo<RetCC_MipsN>>,
+ CCDelegateTo<RetCC_MipsO32>
+]>;
+
+def CC_Mips_ByVal : CallingConv<[
+ CCIfSubtarget<"isABI_O32()", CCIfByVal<CCPassByVal<4, 4>>>,
+ CCIfByVal<CCPassByVal<8, 8>>
+]>;
+
+def CC_Mips16RetHelper : CallingConv<[
+ CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
+
+ // Integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>
+]>;
+
+def CC_Mips_FixedArg : CallingConv<[
+ // Mips16 needs special handling on some functions.
+ CCIf<"State.getCallingConv() != CallingConv::Fast",
+ CCIfSpecialCallingConv<"Mips16RetHelperConv",
+ CCDelegateTo<CC_Mips16RetHelper>>>,
+
+ CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
+
+ // f128 needs to be handled similarly to f32 and f64 on hard-float. However,
+ // f128 is not legal and is lowered to i128 which is further lowered to a pair
+ // of i64's.
+ // This presents us with a problem for the calling convention since hard-float
+ // still needs to pass them in FPU registers. We therefore resort to a
+ // pre-analyze (see PreAnalyzeFormalArgumentsForF128()) step to pass information on
+ // whether the argument was originally an f128 into the tablegen-erated code.
+ //
+ // f128 should only occur for the N64 ABI where long double is 128-bit. On
+ // N32, long double is equivalent to double.
+ CCIfType<[i64],
+ CCIfSubtargetNot<"useSoftFloat()",
+ CCIfOrigArgWasF128<CCBitConvertToType<f64>>>>,
+
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_Mips_FastCC>>,
+
+ CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FP>>,
+ CCDelegateTo<CC_MipsN>
+]>;
+
+def CC_Mips_VarArg : CallingConv<[
+ CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
+
+ CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FP>>,
+ CCDelegateTo<CC_MipsN_VarArg>
+]>;
+
+def CC_Mips : CallingConv<[
+ CCIfVarArg<CCIfArgIsVarArg<CCDelegateTo<CC_Mips_VarArg>>>,
+ CCDelegateTo<CC_Mips_FixedArg>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP,
+ (sequence "S%u", 7, 0))>;
+
+def CSR_O32_FPXX : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
+ (sequence "S%u", 7, 0))> {
+ let OtherPreserved = (add (decimate (sequence "F%u", 30, 20), 2));
+}
+
+def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
+ (sequence "S%u", 7, 0))>;
+
+def CSR_O32_FP64 :
+ CalleeSavedRegs<(add (decimate (sequence "D%u_64", 30, 20), 2), RA, FP,
+ (sequence "S%u", 7, 0))>;
+
+def CSR_N32 : CalleeSavedRegs<(add D20_64, D22_64, D24_64, D26_64, D28_64,
+ D30_64, RA_64, FP_64, GP_64,
+ (sequence "S%u_64", 7, 0))>;
+
+def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64,
+ GP_64, (sequence "S%u_64", 7, 0))>;
+
+def CSR_Mips16RetHelper :
+ CalleeSavedRegs<(add V0, V1, FP,
+ (sequence "A%u", 3, 0), (sequence "S%u", 7, 0),
+ (sequence "D%u", 15, 10))>;
+
+def CSR_Interrupt_32R6 : CalleeSavedRegs<(add (sequence "A%u", 3, 0),
+ (sequence "S%u", 7, 0),
+ (sequence "V%u", 1, 0),
+ (sequence "T%u", 9, 0),
+ RA, FP, GP, AT)>;
+
+def CSR_Interrupt_32 : CalleeSavedRegs<(add (sequence "A%u", 3, 0),
+ (sequence "S%u", 7, 0),
+ (sequence "V%u", 1, 0),
+ (sequence "T%u", 9, 0),
+ RA, FP, GP, AT, LO0, HI0)>;
+
+def CSR_Interrupt_64R6 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0),
+ (sequence "V%u_64", 1, 0),
+ (sequence "S%u_64", 7, 0),
+ (sequence "T%u_64", 9, 0),
+ RA_64, FP_64, GP_64, AT_64)>;
+
+def CSR_Interrupt_64 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0),
+ (sequence "S%u_64", 7, 0),
+ (sequence "T%u_64", 9, 0),
+ (sequence "V%u_64", 1, 0),
+ RA_64, FP_64, GP_64, AT_64,
+ LO0_64, HI0_64)>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsCondMov.td b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
new file mode 100644
index 000000000000..fd4517f25335
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
@@ -0,0 +1,299 @@
+//===-- MipsCondMov.td - Describe Mips Conditional Moves --*- tablegen -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the Conditional Moves implementation.
+//
+//===----------------------------------------------------------------------===//
+
+// Conditional moves:
+// These instructions are expanded in
+// MipsISelLowering::EmitInstrWithCustomInserter if target does not have
+// conditional move instructions.
+// cond:int, data:int
+class CMov_I_I_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
+ InstrItinClass Itin> :
+ InstSE<(outs DRC:$rd), (ins DRC:$rs, CRC:$rt, DRC:$F),
+ !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR, opstr> {
+ let Constraints = "$F = $rd";
+}
+
+// cond:int, data:float
+class CMov_I_F_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
+ InstrItinClass Itin> :
+ InstSE<(outs DRC:$fd), (ins DRC:$fs, CRC:$rt, DRC:$F),
+ !strconcat(opstr, "\t$fd, $fs, $rt"), [], Itin, FrmFR, opstr>,
+ HARDFLOAT {
+ let Constraints = "$F = $fd";
+}
+
+// cond:float, data:int
+class CMov_F_I_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs RC:$rd), (ins RC:$rs, FCCRegsOpnd:$fcc, RC:$F),
+ !strconcat(opstr, "\t$rd, $rs, $fcc"),
+ [(set RC:$rd, (OpNode RC:$rs, FCCRegsOpnd:$fcc, RC:$F))],
+ Itin, FrmFR, opstr>, HARDFLOAT {
+ let Constraints = "$F = $rd";
+}
+
+// cond:float, data:float
+class CMov_F_F_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs RC:$fd), (ins RC:$fs, FCCRegsOpnd:$fcc, RC:$F),
+ !strconcat(opstr, "\t$fd, $fs, $fcc"),
+ [(set RC:$fd, (OpNode RC:$fs, FCCRegsOpnd:$fcc, RC:$F))],
+ Itin, FrmFR, opstr>, HARDFLOAT {
+ let Constraints = "$F = $fd";
+}
+
+// select patterns
+multiclass MovzPats0<RegisterClass CRC, RegisterClass DRC,
+ Instruction MOVZInst, Instruction SLTOp,
+ Instruction SLTuOp, Instruction SLTiOp,
+ Instruction SLTiuOp> {
+ def : MipsPat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+ def : MipsPat<(select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+ def : MipsPat<(select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>;
+ def : MipsPat<(select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>;
+ def : MipsPat<(select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+ def : MipsPat<(select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+ def : MipsPat<(select (i32 (setgt CRC:$lhs, immSExt16Plus1:$rhs)),
+ DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, (Plus1 imm:$rhs)), DRC:$F)>;
+ def : MipsPat<(select (i32 (setugt CRC:$lhs, immSExt16Plus1:$rhs)),
+ DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTiuOp CRC:$lhs, (Plus1 imm:$rhs)),
+ DRC:$F)>;
+}
+
+multiclass MovzPats1<RegisterClass CRC, RegisterClass DRC,
+ Instruction MOVZInst, Instruction XOROp> {
+ def : MipsPat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+ def : MipsPat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>;
+}
+
+multiclass MovzPats2<RegisterClass CRC, RegisterClass DRC,
+ Instruction MOVZInst, Instruction XORiOp> {
+ def : MipsPat<
+ (select (i32 (seteq CRC:$lhs, immZExt16:$uimm16)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (XORiOp CRC:$lhs, immZExt16:$uimm16), DRC:$F)>;
+}
+
+multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
+ Instruction XOROp> {
+ def : MipsPat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+ def : MipsPat<(select CRC:$cond, DRC:$T, DRC:$F),
+ (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>;
+ def : MipsPat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F),
+ (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>;
+}
+
+// Instantiation of instructions.
+def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in {
+ def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+}
+
+def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in {
+ def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+}
+
+def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
+ CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
+ CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
+ CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
+ CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVZ_D>, CMov_I_F_FM<18, 17>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVN_D>, CMov_I_F_FM<19, 17>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+
+let DecoderNamespace = "Mips64" in {
+ def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
+ CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
+ CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ let isCodeGenOnly = 1 in {
+ def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>,
+ CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>,
+ CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ }
+}
+
+def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
+ CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
+ CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
+ CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
+ CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
+ CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6;
+def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
+ CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+ MipsCMovFP_T>, CMov_F_F_FM<17, 1>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+ MipsCMovFP_F>, CMov_F_F_FM<17, 0>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+
+let DecoderNamespace = "Mips64" in {
+ def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
+ CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
+ CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+}
+
+// Instantiation of conditional move patterns.
+defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+
+defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+
+defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_32;
+defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_32;
+
+defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
+defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
+defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
+
+// For targets that don't have conditional-move instructions
+// we have to match SELECT nodes with pseudo instructions.
+let usesCustomInserter = 1 in {
+ class Select_Pseudo<RegisterOperand RC> :
+ PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
+ [(set RC:$dst, (select GPR32Opnd:$cond, RC:$T, RC:$F))]>,
+ ISA_MIPS1_NOT_4_32;
+
+ class SelectFP_Pseudo_T<RegisterOperand RC> :
+ PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
+ [(set RC:$dst, (MipsCMovFP_T RC:$T, GPR32Opnd:$cond, RC:$F))]>,
+ ISA_MIPS1_NOT_4_32;
+
+ class SelectFP_Pseudo_F<RegisterOperand RC> :
+ PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
+ [(set RC:$dst, (MipsCMovFP_F RC:$T, GPR32Opnd:$cond, RC:$F))]>,
+ ISA_MIPS1_NOT_4_32;
+}
+
+def PseudoSELECT_I : Select_Pseudo<GPR32Opnd>;
+def PseudoSELECT_I64 : Select_Pseudo<GPR64Opnd>;
+def PseudoSELECT_S : Select_Pseudo<FGR32Opnd>;
+def PseudoSELECT_D32 : Select_Pseudo<AFGR64Opnd>, FGR_32;
+def PseudoSELECT_D64 : Select_Pseudo<FGR64Opnd>, FGR_64;
+
+def PseudoSELECTFP_T_I : SelectFP_Pseudo_T<GPR32Opnd>;
+def PseudoSELECTFP_T_I64 : SelectFP_Pseudo_T<GPR64Opnd>;
+def PseudoSELECTFP_T_S : SelectFP_Pseudo_T<FGR32Opnd>;
+def PseudoSELECTFP_T_D32 : SelectFP_Pseudo_T<AFGR64Opnd>, FGR_32;
+def PseudoSELECTFP_T_D64 : SelectFP_Pseudo_T<FGR64Opnd>, FGR_64;
+
+def PseudoSELECTFP_F_I : SelectFP_Pseudo_F<GPR32Opnd>;
+def PseudoSELECTFP_F_I64 : SelectFP_Pseudo_F<GPR64Opnd>;
+def PseudoSELECTFP_F_S : SelectFP_Pseudo_F<FGR32Opnd>;
+def PseudoSELECTFP_F_D32 : SelectFP_Pseudo_F<AFGR64Opnd>, FGR_32;
+def PseudoSELECTFP_F_D64 : SelectFP_Pseudo_F<FGR64Opnd>, FGR_64;
diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
new file mode 100644
index 000000000000..08b8ed31ccbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -0,0 +1,1694 @@
+//===-- MipsConstantIslandPass.cpp - Emit Pc Relative loads----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// This pass is used to make PC-relative loads of constants.
+// For now, only Mips16 will use this.
+//
+// Loading constants inline is expensive on Mips16 and it's in general better
+// to place the constant nearby in code space and then it can be loaded with a
+// simple 16 bit load instruction.
+//
+// The constants can be not just numbers but addresses of functions and labels.
+// This can be particularly helpful in static relocation mode for embedded
+// non-linux targets.
+//
+//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+// DEBUG_TYPE names this pass for the DEBUG() and STATISTIC() macros below.
+#define DEBUG_TYPE "mips-constant-islands"
+// Pass statistics; NumCPEs and NumSplit are incremented further down this file.
+STATISTIC(NumCPEs, "Number of constpool entries");
+STATISTIC(NumSplit, "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed, "Number of cond branches fixed");
+STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+
+// FIXME: This option should be removed once it has received sufficient testing.
+static cl::opt<bool>
+AlignConstantIslands("mips-align-constant-islands", cl::Hidden, cl::init(true),
+ cl::desc("Align constant islands in code"));
+
+
+// Rather than writing "make check" tests with huge amounts of code, a test can
+// force the maximum displacement to this amount; 0 (the default) disables the
+// override (see CPUser::getMaxDisp).
+static cl::opt<int> ConstantIslandsSmallOffset(
+ "mips-constant-islands-small-offset",
+ cl::init(0),
+ cl::desc("Make small offsets be this amount for testing purposes"),
+ cl::Hidden);
+
+//
+// For testing purposes we can tell the pass not to use relaxed (long) load
+// forms, so that it will split blocks instead.
+//
+static cl::opt<bool> NoLoadRelaxation(
+ "mips-constant-islands-no-load-relaxation",
+ cl::init(false),
+ cl::desc("Don't relax loads to long loads - for testing purposes"),
+ cl::Hidden);
+
+/// Return the index of the operand of branch \p MI that holds the branch
+/// target: 0 for the branch forms that take no register operand, 1 for the
+/// BeqzRx/BnezRx forms. Any other opcode is a programming error.
+static unsigned int branchTargetOperand(MachineInstr *MI) {
+  const unsigned Opcode = MI->getOpcode();
+  switch (Opcode) {
+  // Target is the first operand.
+  case Mips::Bimm16:
+  case Mips::BimmX16:
+  case Mips::Bteqz16:
+  case Mips::BteqzX16:
+  case Mips::Btnez16:
+  case Mips::BtnezX16:
+  case Mips::JalB16:
+    return 0;
+  // Target is the second operand.
+  case Mips::BeqzRxImm16:
+  case Mips::BeqzRxImmX16:
+  case Mips::BnezRxImm16:
+  case Mips::BnezRxImmX16:
+    return 1;
+  default:
+    break;
+  }
+  llvm_unreachable("Unknown branch type");
+}
+
+/// Map a Mips16 branch opcode to its long ("X16") form. Opcodes that already
+/// are the long form map to themselves, and JalB16 maps to itself. Any other
+/// opcode is a programming error.
+static unsigned int longformBranchOpcode(unsigned int Opcode) {
+  unsigned int LongForm = 0;
+  switch (Opcode) {
+  case Mips::Bimm16:
+  case Mips::BimmX16:
+    LongForm = Mips::BimmX16;
+    break;
+  case Mips::Bteqz16:
+  case Mips::BteqzX16:
+    LongForm = Mips::BteqzX16;
+    break;
+  case Mips::Btnez16:
+  case Mips::BtnezX16:
+    LongForm = Mips::BtnezX16;
+    break;
+  case Mips::JalB16:
+    LongForm = Mips::JalB16;
+    break;
+  case Mips::BeqzRxImm16:
+  case Mips::BeqzRxImmX16:
+    LongForm = Mips::BeqzRxImmX16;
+    break;
+  case Mips::BnezRxImm16:
+  case Mips::BnezRxImmX16:
+    LongForm = Mips::BnezRxImmX16;
+    break;
+  default:
+    llvm_unreachable("Unknown branch type");
+  }
+  return LongForm;
+}
+
+//
+// FIXME: need to go through this whole constant islands port and check the math
+// for branch ranges and clean this up and make some functions to calculate things
+// that are done many times identically.
+// Need to refactor some of the code to call this routine.
+//
+/// Return the largest forward displacement encodable by branch \p Opcode,
+/// computed as (2^(Bits-1) - 1) * Scale where Bits is the width of the
+/// immediate field. Every form handled here uses a scale of 2.
+static unsigned int branchMaxOffsets(unsigned int Opcode) {
+  const unsigned Scale = 2;
+  unsigned Bits;
+  switch (Opcode) {
+  // Short unconditional branch: 11-bit immediate.
+  case Mips::Bimm16:
+    Bits = 11;
+    break;
+  // Short conditional branches: 8-bit immediate.
+  case Mips::BeqzRxImm16:
+  case Mips::BnezRxImm16:
+  case Mips::Bteqz16:
+  case Mips::Btnez16:
+    Bits = 8;
+    break;
+  // All extended forms: 16-bit immediate.
+  case Mips::BimmX16:
+  case Mips::BeqzRxImmX16:
+  case Mips::BnezRxImmX16:
+  case Mips::BteqzX16:
+  case Mips::BtnezX16:
+    Bits = 16;
+    break;
+  default:
+    llvm_unreachable("Unknown branch type");
+  }
+  return ((1 << (Bits - 1)) - 1) * Scale;
+}
+
+namespace {
+
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+  /// MipsConstantIslands - Due to limited PC-relative displacements, Mips
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+  class MipsConstantIslands : public MachineFunctionPass {
+
+    /// BasicBlockInfo - Information about the offset and size of a single
+    /// basic block.
+    struct BasicBlockInfo {
+      /// Offset - Distance from the beginning of the function to the beginning
+      /// of this basic block.
+      ///
+      /// Offsets are computed assuming worst case padding before an aligned
+      /// block. This means that subtracting basic block offsets always gives a
+      /// conservative estimate of the real distance which may be smaller.
+      ///
+      /// Because worst case padding is used, the computed offset of an aligned
+      /// block may not actually be aligned.
+      unsigned Offset;
+
+      /// Size - Size of the basic block in bytes.  If the block contains
+      /// inline assembly, this is a worst case estimate.
+      ///
+      /// The size does not include any alignment padding whether from the
+      /// beginning of the block, or from an aligned jump table at the end.
+      unsigned Size;
+
+      /// postOffset - Offset of the first byte after this block.
+      ///
+      /// FIXME: \p LogAlign is accepted for interface compatibility but is
+      /// currently ignored; no alignment padding is added.
+      unsigned postOffset(unsigned LogAlign = 0) const {
+        return Offset + Size;
+      }
+
+      BasicBlockInfo() : Offset(0), Size(0) {}
+    };
+
+    /// BBInfo - Per-block offset/size records, indexed by block number.
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+    typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.  The HighWaterMark records the
+    /// highest basic block where a new CPEntry can be placed.  To ensure this
+    /// pass terminates, the CP entries are initially placed at the end of the
+    /// function and then move monotonically to lower addresses.  The
+    /// exception to this rule is when the current CP entry for a particular
+    /// CPUser is out of range, but there is another CP entry for the same
+    /// constant value in range.  We want to use the existing in-range CP
+    /// entry, but if it later moves out of range, the search for new water
+    /// should resume where it left off.  The HighWaterMark is used to record
+    /// that point.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      MachineBasicBlock *HighWaterMark;
+    private:
+      unsigned MaxDisp;
+      unsigned LongFormMaxDisp; // mips16 has 16/32 bit instructions
+                                // with different displacements
+      unsigned LongFormOpcode;
+    public:
+      bool NegOk;
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+             bool neg,
+             unsigned longformmaxdisp, unsigned longformopcode)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp),
+          LongFormMaxDisp(longformmaxdisp), LongFormOpcode(longformopcode),
+          NegOk(neg){
+        // The search for water starts at the block currently holding the CPE.
+        HighWaterMark = CPEMI->getParent();
+      }
+      /// getMaxDisp - Returns the maximum displacement supported by MI, or
+      /// the -mips-constant-islands-small-offset override when it is nonzero.
+      unsigned getMaxDisp() const {
+        unsigned xMaxDisp = ConstantIslandsSmallOffset?
+                            ConstantIslandsSmallOffset: MaxDisp;
+        return xMaxDisp;
+      }
+      void setMaxDisp(unsigned val) {
+        MaxDisp = val;
+      }
+      unsigned getLongFormMaxDisp() const {
+        return LongFormMaxDisp;
+      }
+      unsigned getLongFormOpcode() const {
+        return LongFormOpcode;
+      }
+    };
+
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+
+    /// CPEntry - One per constant pool entry, keeping the machine instruction
+    /// pointer, the constpool index, and the number of CPUser's which
+    /// reference this entry.
+    struct CPEntry {
+      MachineInstr *CPEMI;
+      unsigned CPI;
+      unsigned RefCount;
+      CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+        : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+    };
+
+    /// CPEntries - Keep track of all of the constant pool entry machine
+    /// instructions. For each original constpool index (i.e. those that
+    /// existed upon entry to this pass), it keeps a vector of entries.
+    /// Original elements are cloned as we go along; the clones are
+    /// put in the vector of the original element, but have distinct CPIs.
+    std::vector<std::vector<CPEntry> > CPEntries;
+
+    /// ImmBranch - One per immediate branch, keeping the machine instruction
+    /// pointer, conditional or unconditional, the max displacement,
+    /// and (if isCond is true) the corresponding unconditional branch
+    /// opcode.
+    struct ImmBranch {
+      MachineInstr *MI;
+      unsigned MaxDisp : 31;
+      bool isCond : 1;
+      int UncondBr;
+      ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+        : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+    };
+
+    /// ImmBranches - Keep track of all the immediate branch instructions.
+    ///
+    std::vector<ImmBranch> ImmBranches;
+
+    /// HasFarJump - True if any far jump instruction has been emitted during
+    /// the branch fix up pass.
+    bool HasFarJump;
+
+    // Per-function context, (re)assigned at the top of runOnMachineFunction.
+    const MipsSubtarget *STI;
+    const Mips16InstrInfo *TII;
+    MipsFunctionInfo *MFI;
+    MachineFunction *MF;
+    MachineConstantPool *MCP;
+
+    unsigned PICLabelUId;
+    bool PrescannedForConstants;
+
+    /// Set the next PIC label unique id to hand out.
+    void initPICLabelUId(unsigned UId) {
+      PICLabelUId = UId;
+    }
+
+    /// Hand out the current PIC label unique id and advance the counter.
+    unsigned createPICLabelUId() {
+      return PICLabelUId++;
+    }
+
+  public:
+    static char ID;
+    // Every scalar member is given a defined value here (in declaration
+    // order) so that no member is ever read uninitialized, even though
+    // runOnMachineFunction reassigns most of them before use.
+    MipsConstantIslands()
+        : MachineFunctionPass(ID), HasFarJump(false), STI(nullptr),
+          TII(nullptr), MFI(nullptr), MF(nullptr), MCP(nullptr),
+          PICLabelUId(0), PrescannedForConstants(false) {}
+
+    StringRef getPassName() const override { return "Mips Constant Islands"; }
+
+    bool runOnMachineFunction(MachineFunction &F) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    unsigned getCPELogAlign(const MachineInstr &CPEMI);
+    void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+    unsigned getOffsetOf(MachineInstr *MI) const;
+    unsigned getUserOffset(CPUser&) const;
+    void dumpBBs();
+
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         unsigned Disp, bool NegativeOK);
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         const CPUser &U);
+
+    void computeBlockSize(MachineBasicBlock *MBB);
+    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI);
+    void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+    bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+    int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    int findLongFormInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    bool findAvailableWater(CPUser&U, unsigned UserOffset,
+                            water_iterator &WaterIter);
+    void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                        MachineBasicBlock *&NewMBB);
+    bool handleConstantPoolUser(unsigned CPUserIndex);
+    void removeDeadCPEMI(MachineInstr *CPEMI);
+    bool removeUnusedCPEntries();
+    bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                          MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+                          bool DoDump = false);
+    bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U, unsigned &Growth);
+    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool fixupImmediateBr(ImmBranch &Br);
+    bool fixupConditionalBr(ImmBranch &Br);
+    bool fixupUnconditionalBr(ImmBranch &Br);
+
+    void prescanForConstants();
+  };
+
+  char MipsConstantIslands::ID = 0;
+} // end of anonymous namespace
+
+/// Convenience overload: check the range using the displacement limit and
+/// the negative-offset permission recorded in the constant pool user \p U.
+bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                          unsigned TrialOffset,
+                                          const CPUser &U) {
+  const unsigned Limit = U.getMaxDisp();
+  const bool AllowNegative = U.NegOk;
+  return isOffsetInRange(UserOffset, TrialOffset, Limit, AllowNegative);
+}
+/// Debugging aid: print the offset, block number and size of every entry in
+/// BBInfo to the debug stream. Compiles to nothing outside DEBUG builds.
+void MipsConstantIslands::dumpBBs() {
+  DEBUG({
+    unsigned BlockNum = 0;
+    for (const BasicBlockInfo &Info : BBInfo) {
+      dbgs() << format("%08x BB#%u\t", Info.Offset, BlockNum)
+             << format(" size=%#x\n", Info.Size);
+      ++BlockNum;
+    }
+  });
+}
+/// Returns a newly allocated instance of the MipsConstantIslands pass.
+FunctionPass *llvm::createMipsConstantIslandPass() {
+ return new MipsConstantIslands();
+}
+
+/// Pass entry point. Does nothing (returns false) unless the subtarget is in
+/// Mips16 mode and constant islands are enabled. Otherwise it places all
+/// constant pool entries at the end of the function, then alternates between
+/// repositioning entries for out-of-range users and fixing up out-of-range
+/// branches until neither step changes anything.
+///
+/// \returns true if the function was modified by the fixpoint loop or by
+/// removing unused constant pool entries.
+bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
+  // The intention is for this to be a mips16 only pass for now
+  // FIXME:
+  MF = &mf;
+  MCP = mf.getConstantPool();
+  STI = &static_cast<const MipsSubtarget &>(mf.getSubtarget());
+  DEBUG(dbgs() << "constant island machine function " << "\n");
+  if (!STI->inMips16Mode() || !MipsSubtarget::useConstantIslands()) {
+    return false;
+  }
+  // Use a named cast for the downcast, matching the STI cast above.
+  TII = static_cast<const Mips16InstrInfo *>(STI->getInstrInfo());
+  MFI = MF->getInfo<MipsFunctionInfo>();
+  DEBUG(dbgs() << "constant island processing " << "\n");
+  //
+  // will need to make a predetermination of whether there are any constants
+  // we need to put in constant islands. TBD.
+  //
+  if (!PrescannedForConstants) prescanForConstants();
+
+  HasFarJump = false;
+  // This pass invalidates liveness information when it splits basic blocks.
+  MF->getRegInfo().invalidateLiveness();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  bool MadeChange = false;
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP->isEmpty())
+    doInitialPlacement(CPEMIs);
+
+  /// The next UID to take is the first unused one.
+  initPICLabelUId(CPEMIs.size());
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  initializeFunctionInfo(CPEMIs);
+  CPEMIs.clear();
+  DEBUG(dumpBBs());
+
+  /// Remove dead constant pool entries.
+  MadeChange |= removeUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  unsigned NoCPIters = 0, NoBRIters = 0;
+  (void)NoBRIters;
+  while (true) {
+    DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+    bool CPChange = false;
+    // Index-based on purpose: the element count is sampled once up front.
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      CPChange |= handleConstantPoolUser(i);
+    if (CPChange && ++NoCPIters > 30)
+      report_fatal_error("Constant Island pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
+
+    DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+    bool BRChange = false;
+    // Index-based on purpose: the element count is sampled once up front.
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= fixupImmediateBr(ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      report_fatal_error("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+    if (!CPChange && !BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  // Drop all per-function state so nothing leaks into the next function.
+  BBInfo.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  ImmBranches.clear();
+  return MadeChange;
+}
+
+/// doInitialPlacement - Perform the initial placement of the constant pool
+/// entries.  To start with, we put them all at the end of the function.
+///
+/// A new basic block is appended to the function and one CONSTPOOL_ENTRY
+/// instruction is created in it per constant pool entry, ordered by descending
+/// alignment.
+///
+/// \param CPEMIs [out] receives the created CONSTPOOL_ENTRY instructions,
+///        pushed in constant pool index order.
+void
+MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+  MF->push_back(BB);
+
+  // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+  unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+  // Mark the basic block as required by the const-pool.
+  // If AlignConstantIslands isn't set, use 4-byte alignment for everything.
+  BB->setAlignment(AlignConstantIslands ? MaxAlign : 2);
+
+  // The function needs to be as aligned as the basic blocks. The linker may
+  // move functions around based on their alignment.
+  MF->ensureAlignment(BB->getAlignment());
+
+  // Order the entries in BB by descending alignment.  That ensures correct
+  // alignment of all entries as long as BB is sufficiently aligned.  Keep
+  // track of the insertion point for each alignment.  We are going to bucket
+  // sort the entries as they are created.
+  SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+  const DataLayout &TD = MF->getDataLayout();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+    assert(Size >= 4 && "Too small constant pool entry");
+    unsigned Align = CPs[i].getAlignment();
+    assert(isPowerOf2_32(Align) && "Invalid alignment");
+    // Verify that all constant pool entries are a multiple of their alignment.
+    // If not, we would have to pad them out so that instructions stay aligned.
+    // (The message now states what is actually checked.)
+    assert((Size % Align) == 0 && "CP Entry not multiple of its alignment!");
+
+    // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+    unsigned LogAlign = Log2_32(Align);
+    MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+
+    // Operands: entry id, constant pool index, size in bytes.
+    MachineInstr *CPEMI =
+      BuildMI(*BB, InsAt, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+        .addImm(i).addConstantPoolIndex(i).addImm(Size);
+
+    CPEMIs.push_back(CPEMI);
+
+    // Ensure that future entries with higher alignment get inserted before
+    // CPEMI. This is bucket sort with iterators.
+    for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+      if (InsPoint[a] == InsAt)
+        InsPoint[a] = CPEMI;
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    CPEntries.emplace_back(1, CPEntry(CPEMI, i));
+    ++NumCPEs;
+    DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+                 << Size << ", align = " << Align <<'\n');
+  }
+  DEBUG(BB->dump());
+}
+
+/// BBHasFallthrough - Report whether \p MBB can fall through into the block
+/// laid out immediately after it, i.e. whether that next block is one of
+/// MBB's CFG successors. The last block of a function never falls through.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  MachineFunction::iterator Following = std::next(MBB->getIterator());
+
+  // Can't fall off end of function.
+  if (Following == MBB->getParent()->end())
+    return false;
+
+  // Fallthrough is possible exactly when the layout successor is also a CFG
+  // successor.
+  MachineBasicBlock *LayoutSucc = &*Following;
+  return std::find(MBB->succ_begin(), MBB->succ_end(), LayoutSucc) !=
+         MBB->succ_end();
+}
+
+/// findConstPoolEntry - Look up the CPEntry recorded under constpool index
+/// \p CPI whose CONSTPOOL_ENTRY instruction is \p CPEMI.
+///
+/// \returns a pointer into CPEntries, or nullptr when no entry matches.
+MipsConstantIslands::CPEntry
+*MipsConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  // Number of entries per constpool index should be small, so a linear scan
+  // is good enough.
+  for (CPEntry &Candidate : CPEntries[CPI])
+    if (Candidate.CPEMI == CPEMI)
+      return &Candidate;
+  return nullptr;
+}
+
+/// getCPELogAlign - Required alignment, in log2(bytes), of the constant pool
+/// entry represented by the CONSTPOOL_ENTRY instruction \p CPEMI.
+///
+/// When -mips-align-constant-islands is off, every entry is treated as
+/// 4-byte aligned (log2 == 2).
+unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr &CPEMI) {
+  assert(CPEMI.getOpcode() == Mips::CONSTPOOL_ENTRY);
+
+  if (AlignConstantIslands) {
+    // Operand 1 carries the constant pool index of the entry.
+    const unsigned PoolIdx = CPEMI.getOperand(1).getIndex();
+    assert(PoolIdx < MCP->getConstants().size() && "Invalid constant pool index.");
+    const unsigned ByteAlign = MCP->getConstants()[PoolIdx].getAlignment();
+    assert(isPowerOf2_32(ByteAlign) && "Invalid CPE alignment");
+    return Log2_32(ByteAlign);
+  }
+
+  // Everything is 4-byte aligned unless AlignConstantIslands is set.
+  return 2;
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+///
+/// Fills BBInfo, WaterList, ImmBranches and CPUsers, and bumps the RefCount of
+/// each CPEntry that is referenced.
+///
+/// \param CPEMIs the CONSTPOOL_ENTRY instruction for each constant pool
+///        index, as produced by doInitialPlacement.
+void MipsConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+  BBInfo.clear();
+  BBInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineBasicBlock &MBB : *MF)
+    computeBlockSize(&MBB);
+
+  // Compute block offsets.
+  adjustBBOffsetsAfter(&MF->front());
+
+  // Now go back through the instructions and build up our data structures.
+  for (MachineBasicBlock &MBB : *MF) {
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' that a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+    for (MachineInstr &MI : MBB) {
+      if (MI.isDebugValue())
+        continue;
+
+      int Opc = MI.getOpcode();
+      if (MI.isBranch()) {
+        bool isCond = false;
+        int UOpc = Opc;
+        switch (Opc) {
+        default:
+          continue;  // Ignore other branches for now
+        // Unconditional forms: the "unconditional opcode" is the branch itself.
+        case Mips::Bimm16:
+        case Mips::BimmX16:
+          break;
+        // Conditional forms: paired with the short unconditional branch.
+        case Mips::BeqzRxImm16:
+        case Mips::BeqzRxImmX16:
+        case Mips::BnezRxImm16:
+        case Mips::BnezRxImmX16:
+        case Mips::Bteqz16:
+        case Mips::BteqzX16:
+        case Mips::Btnez16:
+        case Mips::BtnezX16:
+          UOpc = Mips::Bimm16;
+          isCond = true;
+          break;
+        }
+        // Record this immediate branch. The reach comes from the shared
+        // branchMaxOffsets() table instead of a second copy of the same
+        // per-opcode Bits/Scale values.
+        unsigned MaxOffs = branchMaxOffsets(Opc);
+        ImmBranches.push_back(ImmBranch(&MI, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == Mips::CONSTPOOL_ENTRY)
+        continue;
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = MI.getNumOperands(); op != e; ++op)
+        if (MI.getOperand(op).isCPI()) {
+
+          // We found one.  The opcode tells us the max displacement
+          // from the PC that this instruction permits.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          bool NegOk = false;
+          unsigned LongFormBits = 0;
+          unsigned LongFormScale = 0;
+          unsigned LongFormOpcode = 0;
+          switch (Opc) {
+          default:
+            llvm_unreachable("Unknown addressing mode for CP reference!");
+          case Mips::LwRxPcTcp16:
+            Bits = 8;
+            Scale = 4;
+            LongFormOpcode = Mips::LwRxPcTcpX16;
+            LongFormBits = 14;
+            LongFormScale = 1;
+            break;
+          case Mips::LwRxPcTcpX16:
+            Bits = 14;
+            Scale = 1;
+            NegOk = true;
+            break;
+          }
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = MI.getOperand(op).getIndex();
+          assert(CPI < CPEMIs.size() &&
+                 "CP operand refers to a constant pool index with no placed "
+                 "entry");
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          unsigned LongFormMaxOffs = ((1 << LongFormBits)-1) * LongFormScale;
+          CPUsers.push_back(CPUser(&MI, CPEMI, MaxOffs, NegOk, LongFormMaxOffs,
+                                   LongFormOpcode));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+        }
+    }
+  }
+}
+
+/// computeBlockSize - Recompute the byte size of \p MBB as the sum of the
+/// sizes of its instructions and store it in the block's BBInfo record.
+void MipsConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+  unsigned TotalBytes = 0;
+  for (const MachineInstr &Inst : *MBB)
+    TotalBytes += TII->getInstSizeInBytes(Inst);
+
+  BBInfo[MBB->getNumber()].Size = TotalBytes;
+}
+
+/// getOffsetOf - Current offset of \p MI from the start of the function.
+/// The value changes as blocks and instructions are moved around.
+unsigned MipsConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *Parent = MI->getParent();
+
+  // Start from the recorded offset of the containing block, then add the size
+  // of every instruction that precedes MI inside that block.
+  unsigned Result = BBInfo[Parent->getNumber()].Offset;
+
+  MachineBasicBlock::iterator Cursor = Parent->begin();
+  while (&*Cursor != MI) {
+    assert(Cursor != Parent->end() && "Didn't find MI in its own basic block?");
+    Result += TII->getInstSizeInBytes(*Cursor);
+    ++Cursor;
+  }
+  return Result;
+}
+
+/// CompareMBBNumbers - Strict ordering of basic blocks by block number; used
+/// to keep WaterList sorted and to binary-search it.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  const int LeftNum = LHS->getNumber();
+  const int RightNum = RHS->getNumber();
+  return LeftNum < RightNum;
+}
+
+/// updateForInsertedWaterBlock - Bring the block-number-indexed bookkeeping
+/// back in sync after \p NewBB has been inserted into the function: renumber
+/// the blocks, open a BBInfo slot for NewBB, and record NewBB as water.
+void MipsConstantIslands::updateForInsertedWaterBlock
+  (MachineBasicBlock *NewBB) {
+  // Inserting a block upsets the numbering; renumber from NewBB onwards so
+  // the numbers stay consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+
+  // Give NewBB a fresh (zeroed) BBInfo record at its new number, shifting the
+  // records of all later blocks up by one.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // NewBB has available water after it; insert it at its sorted position.
+  water_iterator SortedPos = std::lower_bound(
+      WaterList.begin(), WaterList.end(), NewBB, CompareMBBNumbers);
+  WaterList.insert(SortedPos, NewBB);
+}
+
+/// getUserOffset - Offset, from the start of the function, of the
+/// instruction belonging to constant pool user \p U.
+unsigned MipsConstantIslands::getUserOffset(CPUser &U) const {
+  MachineInstr *UserMI = U.MI;
+  return getOffsetOf(UserMI);
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+///
+/// \param MI first instruction of the new block; it and everything after it
+///        in the original block are moved.
+/// \returns the new block, inserted immediately after the original one.
+MachineBasicBlock *
+MipsConstantIslands::splitBlockBeforeInstr(MachineInstr &MI) {
+  MachineBasicBlock *OrigBB = MI.getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = ++OrigBB->getIterator();
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  BuildMI(OrigBB, DebugLoc(), TII->get(Mips::Bimm16)).addMBB(NewBB);
+  ++NumSplit;
+
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as updateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add OrigMBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  // lower_bound may return end() when no water block is numbered at or after
+  // OrigBB; only look at *IP when it refers to a real element.
+  if (IP != WaterList.end() && *IP == OrigBB)
+    WaterList.insert(std::next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(OrigBB);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+
+
+/// isOffsetInRange - Decide whether UserOffset (the location of a constant
+/// pool reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry). A trial location before the user is only accepted
+/// when NegativeOK is set.
+bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                          unsigned TrialOffset, unsigned MaxDisp,
+                                          bool NegativeOK) {
+  // Trial at or after the user: forward displacement.
+  if (UserOffset <= TrialOffset)
+    return TrialOffset - UserOffset <= MaxDisp;
+
+  // Trial before the user: backward displacement, only if permitted.
+  return NegativeOK && (UserOffset - TrialOffset <= MaxDisp);
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
+ MachineBasicBlock* Water, CPUser &U,
+ unsigned &Growth) {
+ unsigned CPELogAlign = getCPELogAlign(*U.CPEMI);
+ unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+ unsigned NextBlockOffset, NextBlockAlignment;
+ MachineFunction::const_iterator NextBlock = ++Water->getIterator();
+ if (NextBlock == MF->end()) {
+ NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+ NextBlockAlignment = 0;
+ } else {
+ NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+ NextBlockAlignment = NextBlock->getAlignment();
+ }
+ unsigned Size = U.CPEMI->getOperand(2).getImm();
+ unsigned CPEEnd = CPEOffset + Size;
+
+ // The CPE may be able to hide in the alignment padding before the next
+ // block. It may also cause more padding to be required if it is more aligned
+ // than the next block.
+ if (CPEEnd > NextBlockOffset) {
+ Growth = CPEEnd - NextBlockOffset;
+ // Compute the padding that would go at the end of the CPE to align the next
+ // block.
+ Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
+
+ // If the CPE is to be inserted before the instruction, that will raise
+ // the offset of the instruction. Also account for unknown alignment padding
+ // in blocks between CPE and the user.
+ if (CPEOffset < UserOffset)
+ UserOffset += Growth;
+ } else
+ // CPE fits in existing padding.
+ Growth = 0;
+
+ return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool MipsConstantIslands::isCPEntryInRange
+ (MachineInstr *MI, unsigned UserOffset,
+ MachineInstr *CPEMI, unsigned MaxDisp,
+ bool NegOk, bool DoDump) {
+ unsigned CPEOffset = getOffsetOf(CPEMI);
+
+ if (DoDump) {
+ DEBUG({
+ unsigned Block = MI->getParent()->getNumber();
+ const BasicBlockInfo &BBI = BBInfo[Block];
+ dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+ << " max delta=" << MaxDisp
+ << format(" insn address=%#x", UserOffset)
+ << " in BB#" << Block << ": "
+ << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+ << format("CPE address=%#x offset=%+d: ", CPEOffset,
+ int(CPEOffset-UserOffset));
+ });
+ }
+
+ return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true if the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+ if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+ return false;
+ MachineBasicBlock *Succ = *MBB->succ_begin();
+ MachineBasicBlock *Pred = *MBB->pred_begin();
+ MachineInstr *PredMI = &Pred->back();
+ if (PredMI->getOpcode() == Mips::Bimm16)
+ return PredMI->getOperand(0).getMBB() == Succ;
+ return false;
+}
+#endif
+
+void MipsConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+ unsigned BBNum = BB->getNumber();
+ for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+ // Get the offset at the end of the layout predecessor. The alignment of
+ // the current block is not factored in here.
+ unsigned Offset = BBInfo[i - 1].Offset + BBInfo[i - 1].Size;
+ BBInfo[i].Offset = Offset;
+ }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount. If the refcount
+/// becomes 0 remove the entry and instruction. Returns true if we removed
+/// the entry, false if we didn't.
+
+bool MipsConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+ MachineInstr *CPEMI) {
+ // Find the old entry. Eliminate it if it is no longer used.
+ CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+ assert(CPE && "Unexpected!");
+ if (--CPE->RefCount == 0) {
+ removeDeadCPEMI(CPEMI);
+ CPE->CPEMI = nullptr;
+ --NumCPEs;
+ return true;
+ }
+ return false;
+}
+
+/// findInRangeCPEntry - see if the currently referenced CPE is in range;
+/// if not, see if a previously created clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone. Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+
+ // Check to see if the CPE is already in-range.
+ if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+ true)) {
+ DEBUG(dbgs() << "In range\n");
+ return 1;
+ }
+
+ // No. Look for previously created clones of the CPE that are in range.
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ std::vector<CPEntry> &CPEs = CPEntries[CPI];
+ for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+ // We already tried this one
+ if (CPEs[i].CPEMI == CPEMI)
+ continue;
+ // Removing CPEs can leave empty entries, skip
+ if (CPEs[i].CPEMI == nullptr)
+ continue;
+ if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
+ U.NegOk)) {
+ DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
+ // Point the CPUser node to the replacement
+ U.CPEMI = CPEs[i].CPEMI;
+ // Change the CPI in the instruction operand to refer to the clone.
+ for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+ if (UserMI->getOperand(j).isCPI()) {
+ UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+ break;
+ }
+ // Adjust the refcount of the clone...
+ CPEs[i].RefCount++;
+ // ...and the original. If we didn't remove the old entry, none of the
+ // addresses changed, so we don't need another pass.
+ return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+ }
+ }
+ return 0;
+}
+
+/// findLongFormInRangeCPEntry - see if the currently referenced CPE is in
+/// range when the longer form of the instruction is used; if so, switch the
+/// user instruction to its long form.
+/// If not, see if a clone of the CPE is within the long-form range, and if so,
+/// change the data structures so the user references the clone. Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findLongFormInRangeCPEntry
+ (CPUser& U, unsigned UserOffset)
+{
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+
+ // Check to see if the CPE is already in-range.
+ if (isCPEntryInRange(UserMI, UserOffset, CPEMI,
+ U.getLongFormMaxDisp(), U.NegOk,
+ true)) {
+ DEBUG(dbgs() << "In range\n");
+ UserMI->setDesc(TII->get(U.getLongFormOpcode()));
+ U.setMaxDisp(U.getLongFormMaxDisp());
+ return 2; // instruction is longer length now
+ }
+
+ // No. Look for previously created clones of the CPE that are in range.
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ std::vector<CPEntry> &CPEs = CPEntries[CPI];
+ for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+ // We already tried this one
+ if (CPEs[i].CPEMI == CPEMI)
+ continue;
+ // Removing CPEs can leave empty entries, skip
+ if (CPEs[i].CPEMI == nullptr)
+ continue;
+ if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
+ U.getLongFormMaxDisp(), U.NegOk)) {
+ DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
+ // Point the CPUser node to the replacement
+ U.CPEMI = CPEs[i].CPEMI;
+ // Change the CPI in the instruction operand to refer to the clone.
+ for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+ if (UserMI->getOperand(j).isCPI()) {
+ UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+ break;
+ }
+ // Adjust the refcount of the clone...
+ CPEs[i].RefCount++;
+ // ...and the original. If we didn't remove the old entry, none of the
+ // addresses changed, so we don't need another pass.
+ return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+ }
+ }
+ return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+ switch (Opc) {
+ case Mips::Bimm16:
+ return ((1<<10)-1)*2;
+ case Mips::BimmX16:
+ return ((1<<16)-1)*2;
+ default:
+ break;
+ }
+ return ((1<<16)-1)*2;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not. If it returns true, WaterIter
+/// is set to the WaterList entry.
+/// To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
+bool MipsConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+ water_iterator &WaterIter) {
+ if (WaterList.empty())
+ return false;
+
+ unsigned BestGrowth = ~0u;
+ for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
+ --IP) {
+ MachineBasicBlock* WaterBB = *IP;
+ // Check if water is in range and is either at a lower address than the
+ // current "high water mark" or a new water block that was created since
+ // the previous iteration by inserting an unconditional branch. In the
+ // latter case, we want to allow resetting the high water mark back to
+ // this new water since we haven't seen it before. Inserting branches
+ // should be relatively uncommon and when it does happen, we want to be
+ // sure to take advantage of it for all the CPEs near that block, so that
+ // we don't insert more branches than necessary.
+ unsigned Growth;
+ if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+ (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+ NewWaterList.count(WaterBB)) && Growth < BestGrowth) {
+ // This is the least amount of required padding seen so far.
+ BestGrowth = Growth;
+ WaterIter = IP;
+ DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+ << " Growth=" << Growth << '\n');
+
+ // Keep looking unless it is perfect.
+ if (BestGrowth == 0)
+ return true;
+ }
+ if (IP == B)
+ break;
+ }
+ return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct. Otherwise the block is split to create a hole with an
+/// unconditional branch around it. In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
+ unsigned UserOffset,
+ MachineBasicBlock *&NewMBB) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ unsigned CPELogAlign = getCPELogAlign(*CPEMI);
+ MachineBasicBlock *UserMBB = UserMI->getParent();
+ const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+ // If the block does not end in an unconditional branch already, and if the
+ // end of the block is within range, make new water there.
+ if (BBHasFallthrough(UserMBB)) {
+ // Size of branch to insert.
+ unsigned Delta = 2;
+ // Compute the offset where the CPE will begin.
+ unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+ if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+ DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+ << format(", expected CPE offset %#x\n", CPEOffset));
+ NewMBB = &*++UserMBB->getIterator();
+ // Add an unconditional branch from UserMBB to fallthrough block. Record
+ // it for branch lengthening; this new branch will not get out of range,
+ // but if the preceding conditional branch is out of range, the targets
+ // will be exchanged, and the altered branch may be out of range, so the
+ // machinery has to know about it.
+ int UncondBr = Mips::Bimm16;
+ BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+ unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+ ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+ MaxDisp, false, UncondBr));
+ BBInfo[UserMBB->getNumber()].Size += Delta;
+ adjustBBOffsetsAfter(UserMBB);
+ return;
+ }
+ }
+
+ // What a big block. Find a place within the block to split it.
+
+ // Try to split the block so it's fully aligned. Compute the latest split
+ // point where we can add a 4-byte branch instruction, and then align to
+ // LogAlign which is the largest possible alignment in the function.
+ unsigned LogAlign = MF->getAlignment();
+ assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+ unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
+ DEBUG(dbgs() << format("Split in middle of big block before %#x",
+ BaseInsertOffset));
+
+ // The 4 in the following is for the unconditional branch we'll be inserting
+ // Alignment of the island is handled
+ // inside isOffsetInRange.
+ BaseInsertOffset -= 4;
+
+ DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+ << " la=" << LogAlign << '\n');
+
+ // This could point off the end of the block if we've already got constant
+ // pool entries following this block; only the last one is in the water list.
+ // Back past any possible branches (allow for a conditional and a maximally
+ // long unconditional).
+ if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+ BaseInsertOffset = UserBBI.postOffset() - 8;
+ DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+ }
+ unsigned EndInsertOffset = BaseInsertOffset + 4 +
+ CPEMI->getOperand(2).getImm();
+ MachineBasicBlock::iterator MI = UserMI;
+ ++MI;
+ unsigned CPUIndex = CPUserIndex+1;
+ unsigned NumCPUsers = CPUsers.size();
+ //MachineInstr *LastIT = 0;
+ for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI);
+ Offset < BaseInsertOffset;
+ Offset += TII->getInstSizeInBytes(*MI), MI = std::next(MI)) {
+ assert(MI != UserMBB->end() && "Fell off end of block");
+ if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+ CPUser &U = CPUsers[CPUIndex];
+ if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+ // Shift insertion point by one unit of alignment so it is within reach.
+ BaseInsertOffset -= 1u << LogAlign;
+ EndInsertOffset -= 1u << LogAlign;
+ }
+ // This is overly conservative, as we don't account for CPEMIs being
+ // reused within the block, but it doesn't matter much. Also assume CPEs
+ // are added in order with alignment padding. We may eventually be able
+ // to pack the aligned CPEs better.
+ EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+ CPUIndex++;
+ }
+ }
+
+ NewMBB = splitBlockBeforeInstr(*--MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range. If so, pick up the constant pool value and move it some
+/// place in-range. Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ unsigned Size = CPEMI->getOperand(2).getImm();
+ // Compute this only once, it's expensive.
+ unsigned UserOffset = getUserOffset(U);
+
+ // See if the current entry is within range, or there is a clone of it
+ // in range.
+ int result = findInRangeCPEntry(U, UserOffset);
+ if (result==1) return false;
+ else if (result==2) return true;
+
+
+ // Look for water where we can place this CPE.
+ MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *NewMBB;
+ water_iterator IP;
+ if (findAvailableWater(U, UserOffset, IP)) {
+ DEBUG(dbgs() << "Found water in range\n");
+ MachineBasicBlock *WaterBB = *IP;
+
+ // If the original WaterList entry was "new water" on this iteration,
+ // propagate that to the new island. This is just keeping NewWaterList
+ // updated to match the WaterList, which will be updated below.
+ if (NewWaterList.erase(WaterBB))
+ NewWaterList.insert(NewIsland);
+
+ // The new CPE goes before the following block (NewMBB).
+ NewMBB = &*++WaterBB->getIterator();
+ } else {
+ // No water found.
+ // We first see if a longer form of the instruction could have reached
+ // the constant. In that case we won't bother to split.
+ if (!NoLoadRelaxation) {
+ result = findLongFormInRangeCPEntry(U, UserOffset);
+ if (result != 0) return true;
+ }
+ DEBUG(dbgs() << "No water found\n");
+ createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+ // splitBlockBeforeInstr adds to WaterList, which is important when it is
+ // called while handling branches so that the water will be seen on the
+ // next iteration for constant pools, but in this context, we don't want
+ // it. Check for this so it will be removed from the WaterList.
+ // Also remove any entry from NewWaterList.
+ MachineBasicBlock *WaterBB = &*--NewMBB->getIterator();
+ IP = find(WaterList, WaterBB);
+ if (IP != WaterList.end())
+ NewWaterList.erase(WaterBB);
+
+ // We are adding new water. Update NewWaterList.
+ NewWaterList.insert(NewIsland);
+ }
+
+ // Remove the original WaterList entry; we want subsequent insertions in
+ // this vicinity to go after the one we're about to insert. This
+ // considerably reduces the number of times we have to move the same CPE
+ // more than once and is also important to ensure the algorithm terminates.
+ if (IP != WaterList.end())
+ WaterList.erase(IP);
+
+ // Okay, we know we can put an island before NewMBB now, do it!
+ MF->insert(NewMBB->getIterator(), NewIsland);
+
+ // Update internal data structures to account for the newly inserted MBB.
+ updateForInsertedWaterBlock(NewIsland);
+
+ // Decrement the old entry, and remove it if refcount becomes 0.
+ decrementCPEReferenceCount(CPI, CPEMI);
+
+ // No existing clone of this CPE is within range.
+ // We will be generating a new clone. Get a UID for it.
+ unsigned ID = createPICLabelUId();
+
+ // Now that we have an island to add the CPE to, clone the original CPE and
+ // add it to the island.
+ U.HighWaterMark = NewIsland;
+ U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+ .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+ CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+ ++NumCPEs;
+
+ // Mark the basic block as aligned as required by the const-pool entry.
+ NewIsland->setAlignment(getCPELogAlign(*U.CPEMI));
+
+ // Increase the size of the island block to account for the new entry.
+ BBInfo[NewIsland->getNumber()].Size += Size;
+ adjustBBOffsetsAfter(&*--NewIsland->getIterator());
+
+ // Finally, change the CPI in the instruction operand to be ID.
+ for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+ if (UserMI->getOperand(i).isCPI()) {
+ UserMI->getOperand(i).setIndex(ID);
+ break;
+ }
+
+ DEBUG(dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+ return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+ MachineBasicBlock *CPEBB = CPEMI->getParent();
+ unsigned Size = CPEMI->getOperand(2).getImm();
+ CPEMI->eraseFromParent();
+ BBInfo[CPEBB->getNumber()].Size -= Size;
+ // All succeeding offsets have the current size value added in, fix this.
+ if (CPEBB->empty()) {
+ BBInfo[CPEBB->getNumber()].Size = 0;
+
+ // This block no longer needs to be aligned.
+ CPEBB->setAlignment(0);
+ } else
+ // Entries are sorted by descending alignment, so realign from the front.
+ CPEBB->setAlignment(getCPELogAlign(*CPEBB->begin()));
+
+ adjustBBOffsetsAfter(CPEBB);
+ // An island has only one predecessor BB and one successor BB. Check if
+ // this BB's predecessor jumps directly to this BB's successor. This
+ // shouldn't happen currently.
+ assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+ // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero. Returns true if at least one entry was removed.
+bool MipsConstantIslands::removeUnusedCPEntries() {
+ bool MadeChange = false;
+ for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+ std::vector<CPEntry> &CPEs = CPEntries[i];
+ for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+ if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { // unused, not yet removed
+ removeDeadCPEMI(CPEs[j].CPEMI);
+ CPEs[j].CPEMI = nullptr; // mark the slot as emptied
+ MadeChange = true;
+ }
+ }
+ }
+ return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool MipsConstantIslands::isBBInRange
+ (MachineInstr *MI,MachineBasicBlock *DestBB, unsigned MaxDisp) {
+
+unsigned PCAdj = 4;
+
+ unsigned BrOffset = getOffsetOf(MI) + PCAdj;
+ unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+ DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+ << " from BB#" << MI->getParent()->getNumber()
+ << " max delta=" << MaxDisp
+ << " from " << getOffsetOf(MI) << " to " << DestOffset
+ << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+ if (BrOffset <= DestOffset) {
+ // Branch before the Dest.
+ if (DestOffset-BrOffset <= MaxDisp)
+ return true;
+ } else {
+ if (BrOffset-DestOffset <= MaxDisp)
+ return true;
+ }
+ return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool MipsConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ unsigned TargetOperand = branchTargetOperand(MI);
+ MachineBasicBlock *DestBB = MI->getOperand(TargetOperand).getMBB();
+
+ // Check to see if the DestBB is already in-range.
+ if (isBBInRange(MI, DestBB, Br.MaxDisp))
+ return false;
+
+ if (!Br.isCond)
+ return fixupUnconditionalBr(Br);
+ return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. The branch is widened to
+/// BimmX16 if that reaches the destination; otherwise it is turned into a
+/// JalB16, with the destination block's alignment set via setAlignment(2).
+bool
+MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+ // Use BimmX16 if the destination is in its range; otherwise use JalB16.
+ unsigned BimmX16MaxDisp = ((1 << 16)-1) * 2;
+ if (isBBInRange(MI, DestBB, BimmX16MaxDisp)) {
+ Br.MaxDisp = BimmX16MaxDisp;
+ MI->setDesc(TII->get(Mips::BimmX16));
+ }
+ else {
+ // need to give the math a more careful look here
+ // this is really a segment address and not
+ // a PC relative address. FIXME. But I think that
+ // just reducing the bits by 1 as I've done is correct.
+ // The basic block we are branching to must be longword aligned.
+ // we know that RA is saved because we always save it right now.
+ // this requirement will be relaxed later but we also have an alternate
+ // way to implement this that I will implement that does not need jal.
+ // We should have a way to back out this alignment restriction if we "can" later.
+ // but it is not harmful.
+ //
+ DestBB->setAlignment(2);
+ Br.MaxDisp = ((1<<24)-1) * 2;
+ MI->setDesc(TII->get(Mips::JalB16));
+ }
+ BBInfo[MBB->getNumber()].Size += 2;
+ adjustBBOffsetsAfter(MBB);
+ HasFarJump = true;
+ ++NumUBrFixed;
+
+ DEBUG(dbgs() << " Changed B to long jump " << *MI);
+
+ return true;
+}
+
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ unsigned TargetOperand = branchTargetOperand(MI);
+ MachineBasicBlock *DestBB = MI->getOperand(TargetOperand).getMBB();
+ unsigned Opcode = MI->getOpcode();
+ unsigned LongFormOpcode = longformBranchOpcode(Opcode);
+ unsigned LongFormMaxOff = branchMaxOffsets(LongFormOpcode);
+
+ // Check to see if the DestBB is already in-range.
+ if (isBBInRange(MI, DestBB, LongFormMaxOff)) {
+ Br.MaxDisp = LongFormMaxOff;
+ MI->setDesc(TII->get(LongFormOpcode));
+ return true;
+ }
+
+ // Add an unconditional branch to the destination and invert the branch
+ // condition to jump over it:
+ // bteqz L1
+ // =>
+ // bnez L2
+ // b L1
+ // L2:
+
+ // If the branch is at the end of its MBB and that has a fall-through block,
+ // direct the updated conditional branch to the fall-through block. Otherwise,
+ // split the MBB before the next instruction.
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *BMI = &MBB->back();
+ bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+ unsigned OppositeBranchOpcode = TII->getOppositeBranchOpc(Opcode);
+
+ ++NumCBrFixed;
+ if (BMI != MI) {
+ if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
+ BMI->isUnconditionalBranch()) {
+ // Last MI in the BB is an unconditional branch. Can we simply invert the
+ // condition and swap destinations:
+ // beqz L1
+ // b L2
+ // =>
+ // bnez L2
+ // b L1
+ unsigned BMITargetOperand = branchTargetOperand(BMI);
+ MachineBasicBlock *NewDest =
+ BMI->getOperand(BMITargetOperand).getMBB();
+ if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+ DEBUG(dbgs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
+ MI->setDesc(TII->get(OppositeBranchOpcode));
+ BMI->getOperand(BMITargetOperand).setMBB(DestBB);
+ MI->getOperand(TargetOperand).setMBB(NewDest);
+ return true;
+ }
+ }
+ }
+
+
+ if (NeedSplit) {
+ splitBlockBeforeInstr(*MI);
+ // No need for the branch to the next block. We're adding an unconditional
+ // branch to the destination.
+ int delta = TII->getInstSizeInBytes(MBB->back());
+ BBInfo[MBB->getNumber()].Size -= delta;
+ MBB->back().eraseFromParent();
+ // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+ }
+ MachineBasicBlock *NextBB = &*++MBB->getIterator();
+
+ DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
+ << " also invert condition and change dest. to BB#"
+ << NextBB->getNumber() << "\n");
+
+ // Insert a new conditional branch and a new unconditional branch.
+ // Also update the ImmBranch as well as adding a new entry for the new branch.
+ if (MI->getNumExplicitOperands() == 2) {
+ BuildMI(MBB, DebugLoc(), TII->get(OppositeBranchOpcode))
+ .addReg(MI->getOperand(0).getReg())
+ .addMBB(NextBB);
+ } else {
+ BuildMI(MBB, DebugLoc(), TII->get(OppositeBranchOpcode))
+ .addMBB(NextBB);
+ }
+ Br.MI = &MBB->back();
+ BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+ BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+ BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+ unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+ ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+ // Remove the old conditional branch. It may or may not still be in MBB.
+ BBInfo[MI->getParent()->getNumber()].Size -= TII->getInstSizeInBytes(*MI);
+ MI->eraseFromParent();
+ adjustBBOffsetsAfter(MBB);
+ return true;
+}
+
+
+void MipsConstantIslands::prescanForConstants() {
+ unsigned J = 0;
+ (void)J;
+ for (MachineFunction::iterator B =
+ MF->begin(), E = MF->end(); B != E; ++B) {
+ for (MachineBasicBlock::instr_iterator I =
+ B->instr_begin(), EB = B->instr_end(); I != EB; ++I) {
+ switch(I->getDesc().getOpcode()) {
+ case Mips::LwConstant32: {
+ PrescannedForConstants = true;
+ DEBUG(dbgs() << "constant island constant " << *I << "\n");
+ J = I->getNumOperands();
+ DEBUG(dbgs() << "num operands " << J << "\n");
+ MachineOperand& Literal = I->getOperand(1);
+ if (Literal.isImm()) {
+ int64_t V = Literal.getImm();
+ DEBUG(dbgs() << "literal " << V << "\n");
+ Type *Int32Ty =
+ Type::getInt32Ty(MF->getFunction()->getContext());
+ const Constant *C = ConstantInt::get(Int32Ty, V);
+ unsigned index = MCP->getConstantPoolIndex(C, 4);
+ I->getOperand(2).ChangeToImmediate(index);
+ DEBUG(dbgs() << "constant island constant " << *I << "\n");
+ I->setDesc(TII->get(Mips::LwRxPcTcp16));
+ I->RemoveOperand(1);
+ I->RemoveOperand(1);
+ I->addOperand(MachineOperand::CreateCPI(index, 0));
+ I->addOperand(MachineOperand::CreateImm(4));
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
new file mode 100644
index 000000000000..0ceb1858fb09
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -0,0 +1,369 @@
+//===- MipsDSPInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class DspMMRel; // Marker class; Dsp2MicroMips names it as its FilterClass.
+
+def Dsp2MicroMips : InstrMapping {
+ let FilterClass = "DspMMRel";
+ // Instructions with the same BaseOpcode value form a row.
+ let RowFields = ["BaseOpcode"];
+ // Instructions with the same Arch value form a column.
+ let ColFields = ["Arch"];
+ // The key column is the instructions whose Arch is "dsp".
+ let KeyCol = ["dsp"];
+ // Value columns are Arch="dsp" and Arch="mmdsp".
+ let ValueCols = [["dsp"], ["mmdsp"]];
+}
+
+def HasDSP : Predicate<"Subtarget->hasDSP()">, // Subtarget check paired with the assembler feature on the next line.
+ AssemblerPredicate<"FeatureDSP">;
+def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">, // Same pairing for DSPR2.
+ AssemblerPredicate<"FeatureDSPR2">;
+def HasDSPR3 : Predicate<"Subtarget->hasDSPR3()">, // Same pairing for DSPR3.
+ AssemblerPredicate<"FeatureDSPR3">;
+
+class ISA_DSPR2 { // Mix-in whose InsnPredicates list requires HasDSPR2.
+ list<Predicate> InsnPredicates = [HasDSPR2];
+}
+
+class ISA_DSPR3 { // Mix-in whose InsnPredicates list requires HasDSPR3.
+ list<Predicate> InsnPredicates = [HasDSPR3];
+}
+
+// Fields.
+class Field6<bits<6> val> { // Holds a 6-bit constant that is read back through the V field.
+ bits<6> V = val;
+}
+
+def SPECIAL3_OPCODE : Field6<0b011111>; // Assigned to Opcode by most DSP format classes in this file.
+def REGIMM_OPCODE : Field6<0b000001>; // Assigned to Opcode by BPOSGE32_FMT.
+
+class DSPInst<string opstr = ""> // Common base of the DSP encoding-format classes in this file.
+ : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
+ let InsnPredicates = [HasDSP]; // Default predicate for everything derived from DSPInst.
+ string BaseOpcode = opstr; // Field name matches the RowFields entry of Dsp2MicroMips.
+ string Arch = "dsp"; // Field name matches ColFields of Dsp2MicroMips; "dsp" is its KeyCol.
+}
+
+class PseudoDSP<dag outs, dag ins, list<dag> pattern, // MipsPseudo wrapper gated on HasDSP.
+ InstrItinClass itin = IIPseudo> // Itinerary defaults to IIPseudo.
+ : MipsPseudo<outs, ins, pattern, itin>, PredicateControl {
+ let InsnPredicates = [HasDSP];
+}
+
+class DSPInstAlias<string Asm, dag Result, bit Emit = 0b1> // InstAlias wrapper gated on HasDSP; Emit defaults to 1.
+ : InstAlias<Asm, Result, Emit>, PredicateControl {
+ let InsnPredicates = [HasDSP];
+}
+
+// ADDU.QB sub-class format.
+class ADDU_QB_FMT<bits<5> op> : DSPInst { // Operands rs, rt, rd; op is the 5-bit sub-opcode placed in bits 10-6.
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010000; // Fixed low six bits shared by every instruction using this format.
+}
+
+class RADDU_W_QB_FMT<bits<5> op> : DSPInst { // Operands rs and rd only; same fixed low bits as ADDU_QB_FMT.
+ bits<5> rd;
+ bits<5> rs;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0; // No rt operand: the field is zero.
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010000;
+}
+
+// CMPU.EQ.QB sub-class format.
+class CMP_EQ_QB_R2_FMT<bits<5> op> : DSPInst { // Two-register form: operands rs and rt only.
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = 0; // No rd operand: the field is zero.
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010001;
+}
+
+class CMP_EQ_QB_R3_FMT<bits<5> op> : DSPInst { // Three-register form: as CMP_EQ_QB_R2_FMT, but with rd in bits 15-11.
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> rd;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010001;
+}
+
+class PRECR_SRA_PH_W_FMT<bits<5> op> : DSPInst { // Same layout as CMP_EQ_QB_R3_FMT with the 5-bit sa operand in place of rd.
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> sa;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = sa;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010001;
+}
+
+// ABSQ_S.PH sub-class format.
+class ABSQ_S_PH_R2_FMT<bits<5> op> : DSPInst { // Operands rt and rd only.
+ bits<5> rd;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = 0; // No rs operand: the field is zero.
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010010;
+}
+
+
+class REPL_FMT<bits<5> op> : DSPInst { // rd plus a 10-bit immediate; same fixed low bits as ABSQ_S_PH_R2_FMT.
+ bits<5> rd;
+ bits<10> imm;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-16} = imm; // The immediate spans what are the rs and rt fields in the register formats.
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010010;
+}
+
+// SHLL.QB sub-class format.
+class SHLL_QB_FMT<bits<5> op> : DSPInst { // Shift layout: rt is shifted into rd by the amount carried in rs_sa.
+ bits<5> rd;
+ bits<5> rt;
+ bits<5> rs_sa; // A register in SHLL_QB_R3_DESC_BASE and an immediate in SHLL_QB_R2_DESC_BASE (MipsDSPInstrInfo.td).
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs_sa;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010011;
+}
+
+// LX sub-class format.
+class LX_FMT<bits<5> op> : DSPInst { // Operands base, index and rd.
+ bits<5> rd;
+ bits<5> base;
+ bits<5> index;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = base;
+ let Inst{20-16} = index;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b001010;
+}
+
+// ADDUH.QB sub-class format.
+class ADDUH_QB_FMT<bits<5> op> : DSPInst { // Same field layout as ADDU_QB_FMT; only the fixed low six bits differ.
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b011000;
+}
+
+// APPEND sub-class format.
+class APPEND_FMT<bits<5> op> : DSPInst { // Operands rs, rt and a 5-bit sa in bits 15-11.
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> sa;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = sa;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b110001;
+}
+
+// DPA.W.PH sub-class format.
+class DPA_W_PH_FMT<bits<5> op> : DSPInst { // Operands rs, rt and a 2-bit ac field.
+ bits<2> ac;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-13} = 0; // Upper three bits of the 15-11 field are zero; ac uses only the low two.
+ let Inst{12-11} = ac;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b110000;
+}
+
+// MULT sub-class format.
+class MULT_FMT<bits<6> opcode, bits<6> funct> : DSPInst { // Both Opcode and the low six bits are template arguments here.
+ bits<2> ac;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = opcode;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = 0; // No sub-opcode in bits 10-6 for this format.
+ let Inst{5-0} = funct;
+}
+
+// MFHI sub-class format.
+class MFHI_FMT<bits<6> funct> : DSPInst { // Operands rd and a 2-bit ac placed in bits 22-21.
+ bits<5> rd;
+ bits<2> ac;
+
+ let Inst{31-26} = 0; // Written directly; this class does not assign Opcode.
+ let Inst{25-23} = 0;
+ let Inst{22-21} = ac;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+// MTHI sub-class format.
+class MTHI_FMT<bits<6> funct> : DSPInst { // Operands rs and a 2-bit ac placed in bits 12-11 (MFHI_FMT uses bits 22-21).
+ bits<5> rs;
+ bits<2> ac;
+
+ let Inst{31-26} = 0; // Written directly; this class does not assign Opcode.
+ let Inst{25-21} = rs;
+ let Inst{20-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+// EXTR.W sub-class format (type 1).
+class EXTR_W_TY1_FMT<bits<5> op> : DSPInst { // Operands rt, a 2-bit ac and shift_rs.
+ bits<5> rt;
+ bits<2> ac;
+ bits<5> shift_rs; // An immediate in EXTR_W_TY1_R1_DESC_BASE and a register in EXTR_W_TY1_R2_DESC_BASE (MipsDSPInstrInfo.td).
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = shift_rs;
+ let Inst{20-16} = rt;
+ let Inst{15-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+// SHILO sub-class format.
+class SHILO_R1_FMT<bits<5> op> : DSPInst { // Immediate form: a 6-bit shift and a 2-bit ac.
+ bits<2> ac;
+ bits<6> shift;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-20} = shift; // Six bits wide, one more than the 5-bit register fields.
+ let Inst{19-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+class SHILO_R2_FMT<bits<5> op> : DSPInst { // Register form: rs in bits 25-21 and a 2-bit ac.
+ bits<2> ac;
+ bits<5> rs;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+class RDDSP_FMT<bits<5> op> : DSPInst { // A 10-bit mask in bits 25-16 and rd in bits 15-11.
+ bits<5> rd;
+ bits<10> mask;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-16} = mask;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+class WRDSP_FMT<bits<5> op> : DSPInst { // rs in bits 25-21 and a 10-bit mask in bits 20-11 (RDDSP_FMT puts its mask in bits 25-16).
+ bits<5> rs;
+ bits<10> mask;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-11} = mask;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+class BPOSGE32_FMT<bits<5> op> : DSPInst { // The only format here that uses REGIMM_OPCODE; its sole operand is a 16-bit offset.
+ bits<16> offset;
+
+ let Opcode = REGIMM_OPCODE.V;
+
+ let Inst{25-21} = 0;
+ let Inst{20-16} = op; // op sits in bits 20-16 here, not in bits 10-6 as in the SPECIAL3 formats.
+ let Inst{15-0} = offset;
+}
+
+// INSV sub-class format.
+class INSV_FMT<bits<6> op> : DSPInst { // Operands rs and rt; here op is 6 bits wide and fills bits 5-0.
+ bits<5> rt;
+ bits<5> rs;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = 0;
+ let Inst{5-0} = op;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
new file mode 100644
index 000000000000..ac9a81b1bb2f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -0,0 +1,1456 @@
+//===- MipsDSPInstrInfo.td - DSP ASE instructions ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips DSP ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// ImmLeaf predicates: i32 immediates that fit in N unsigned bits (immZExtN) or N signed bits (immSExtN).
+def immZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}]>;
+def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
+def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
+def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
+
+// Mips-specific dsp nodes
+def SDT_MipsExtr : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, // One result, two operands: index 0 is i32, index 1 matches it, index 2 is untyped.
+ SDTCisVT<2, untyped>]>;
+def SDT_MipsShilo : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, // One result, two operands: index 0 is untyped, index 2 matches it, index 1 is i32.
+ SDTCisSameAs<0, 2>, SDTCisVT<1, i32>]>;
+def SDT_MipsDPA : SDTypeProfile<1, 3, [SDTCisVT<0, untyped>, SDTCisSameAs<0, 3>, // One result, three operands: indices 0 and 3 are untyped, indices 1 and 2 are i32.
+ SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsSHIFT_DSP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, // One result, two operands: index 0 is a vector, index 1 matches it, index 2 is i32.
+ SDTCisVT<2, i32>]>;
+
+class MipsDSPBase<string Opc, SDTypeProfile Prof> : // SDNode named "MipsISD::" followed by Opc, with no node-property list.
+ SDNode<!strconcat("MipsISD::", Opc), Prof>;
+
+class MipsDSPSideEffectBase<string Opc, SDTypeProfile Prof> : // Like MipsDSPBase, but the node also carries SDNPHasChain and SDNPSideEffect.
+ SDNode<!strconcat("MipsISD::", Opc), Prof, [SDNPHasChain, SDNPSideEffect]>;
+
+def MipsEXTP : MipsDSPSideEffectBase<"EXTP", SDT_MipsExtr>; // All six extract nodes use SDT_MipsExtr and the side-effect base.
+def MipsEXTPDP : MipsDSPSideEffectBase<"EXTPDP", SDT_MipsExtr>;
+def MipsEXTR_S_H : MipsDSPSideEffectBase<"EXTR_S_H", SDT_MipsExtr>;
+def MipsEXTR_W : MipsDSPSideEffectBase<"EXTR_W", SDT_MipsExtr>;
+def MipsEXTR_R_W : MipsDSPSideEffectBase<"EXTR_R_W", SDT_MipsExtr>;
+def MipsEXTR_RS_W : MipsDSPSideEffectBase<"EXTR_RS_W", SDT_MipsExtr>;
+
+def MipsSHILO : MipsDSPBase<"SHILO", SDT_MipsShilo>; // Plain node; MipsMTHLIP shares the profile but uses the side-effect base.
+def MipsMTHLIP : MipsDSPSideEffectBase<"MTHLIP", SDT_MipsShilo>;
+
+def MipsMULSAQ_S_W_PH : MipsDSPSideEffectBase<"MULSAQ_S_W_PH", SDT_MipsDPA>; // Every node from here through MipsMSUBU_DSP uses SDT_MipsDPA.
+def MipsMAQ_S_W_PHL : MipsDSPSideEffectBase<"MAQ_S_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_S_W_PHR : MipsDSPSideEffectBase<"MAQ_S_W_PHR", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHL : MipsDSPSideEffectBase<"MAQ_SA_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHR : MipsDSPSideEffectBase<"MAQ_SA_W_PHR", SDT_MipsDPA>;
+
+def MipsDPAU_H_QBL : MipsDSPBase<"DPAU_H_QBL", SDT_MipsDPA>;
+def MipsDPAU_H_QBR : MipsDSPBase<"DPAU_H_QBR", SDT_MipsDPA>;
+def MipsDPSU_H_QBL : MipsDSPBase<"DPSU_H_QBL", SDT_MipsDPA>;
+def MipsDPSU_H_QBR : MipsDSPBase<"DPSU_H_QBR", SDT_MipsDPA>;
+def MipsDPAQ_S_W_PH : MipsDSPSideEffectBase<"DPAQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQ_S_W_PH : MipsDSPSideEffectBase<"DPSQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQ_SA_L_W : MipsDSPSideEffectBase<"DPAQ_SA_L_W", SDT_MipsDPA>;
+def MipsDPSQ_SA_L_W : MipsDSPSideEffectBase<"DPSQ_SA_L_W", SDT_MipsDPA>;
+
+def MipsDPA_W_PH : MipsDSPBase<"DPA_W_PH", SDT_MipsDPA>;
+def MipsDPS_W_PH : MipsDSPBase<"DPS_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_S_W_PH : MipsDSPSideEffectBase<"DPAQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_SA_W_PH : MipsDSPSideEffectBase<"DPAQX_SA_W_PH", SDT_MipsDPA>;
+def MipsDPAX_W_PH : MipsDSPBase<"DPAX_W_PH", SDT_MipsDPA>;
+def MipsDPSX_W_PH : MipsDSPBase<"DPSX_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_S_W_PH : MipsDSPSideEffectBase<"DPSQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_SA_W_PH : MipsDSPSideEffectBase<"DPSQX_SA_W_PH", SDT_MipsDPA>;
+def MipsMULSA_W_PH : MipsDSPBase<"MULSA_W_PH", SDT_MipsDPA>;
+
+def MipsMULT : MipsDSPBase<"MULT", SDT_MipsDPA>;
+def MipsMULTU : MipsDSPBase<"MULTU", SDT_MipsDPA>;
+def MipsMADD_DSP : MipsDSPBase<"MADD_DSP", SDT_MipsDPA>;
+def MipsMADDU_DSP : MipsDSPBase<"MADDU_DSP", SDT_MipsDPA>;
+def MipsMSUB_DSP : MipsDSPBase<"MSUB_DSP", SDT_MipsDPA>;
+def MipsMSUBU_DSP : MipsDSPBase<"MSUBU_DSP", SDT_MipsDPA>;
+def MipsSHLL_DSP : MipsDSPBase<"SHLL_DSP", SDT_MipsSHIFT_DSP>; // The three shift nodes use SDT_MipsSHIFT_DSP.
+def MipsSHRA_DSP : MipsDSPBase<"SHRA_DSP", SDT_MipsSHIFT_DSP>;
+def MipsSHRL_DSP : MipsDSPBase<"SHRL_DSP", SDT_MipsSHIFT_DSP>;
+def MipsSETCC_DSP : MipsDSPBase<"SETCC_DSP", SDTSetCC>; // SDTSetCC and SDTSelectCC are not defined in the visible part of this file.
+def MipsSELECT_CC_DSP : MipsDSPBase<"SELECT_CC_DSP", SDTSelectCC>;
+
+// Flags.
+class Uses<list<Register> Regs> { // Mix-in providing a Uses field initialised from Regs.
+ list<Register> Uses = Regs;
+}
+
+class Defs<list<Register> Regs> { // Mix-in providing a Defs field initialised from Regs.
+ list<Register> Defs = Regs;
+}
+
+// Instruction encoding: each *_ENC class binds one *_FMT format class to its fixed opcode bits.
+class ADDU_QB_ENC : ADDU_QB_FMT<0b00000>;
+class ADDU_S_QB_ENC : ADDU_QB_FMT<0b00100>;
+class SUBU_QB_ENC : ADDU_QB_FMT<0b00001>;
+class SUBU_S_QB_ENC : ADDU_QB_FMT<0b00101>;
+class ADDQ_PH_ENC : ADDU_QB_FMT<0b01010>;
+class ADDQ_S_PH_ENC : ADDU_QB_FMT<0b01110>;
+class SUBQ_PH_ENC : ADDU_QB_FMT<0b01011>;
+class SUBQ_S_PH_ENC : ADDU_QB_FMT<0b01111>;
+class ADDQ_S_W_ENC : ADDU_QB_FMT<0b10110>;
+class SUBQ_S_W_ENC : ADDU_QB_FMT<0b10111>;
+class ADDSC_ENC : ADDU_QB_FMT<0b10000>;
+class ADDWC_ENC : ADDU_QB_FMT<0b10001>;
+class MODSUB_ENC : ADDU_QB_FMT<0b10010>;
+class RADDU_W_QB_ENC : RADDU_W_QB_FMT<0b10100>;
+class ABSQ_S_PH_ENC : ABSQ_S_PH_R2_FMT<0b01001>;
+class ABSQ_S_W_ENC : ABSQ_S_PH_R2_FMT<0b10001>;
+class PRECRQ_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01100>;
+class PRECRQ_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10100>;
+class PRECRQ_RS_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10101>;
+class PRECRQU_S_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01111>;
+class PRECEQ_W_PHL_ENC : ABSQ_S_PH_R2_FMT<0b01100>;
+class PRECEQ_W_PHR_ENC : ABSQ_S_PH_R2_FMT<0b01101>;
+class PRECEQU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b00100>;
+class PRECEQU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b00101>;
+class PRECEQU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b00110>;
+class PRECEQU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b00111>;
+class PRECEU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b11100>;
+class PRECEU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b11101>;
+class PRECEU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b11110>;
+class PRECEU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b11111>;
+class SHLL_QB_ENC : SHLL_QB_FMT<0b00000>;
+class SHLLV_QB_ENC : SHLL_QB_FMT<0b00010>;
+class SHRL_QB_ENC : SHLL_QB_FMT<0b00001>;
+class SHRLV_QB_ENC : SHLL_QB_FMT<0b00011>;
+class SHLL_PH_ENC : SHLL_QB_FMT<0b01000>;
+class SHLLV_PH_ENC : SHLL_QB_FMT<0b01010>;
+class SHLL_S_PH_ENC : SHLL_QB_FMT<0b01100>;
+class SHLLV_S_PH_ENC : SHLL_QB_FMT<0b01110>;
+class SHRA_PH_ENC : SHLL_QB_FMT<0b01001>;
+class SHRAV_PH_ENC : SHLL_QB_FMT<0b01011>;
+class SHRA_R_PH_ENC : SHLL_QB_FMT<0b01101>;
+class SHRAV_R_PH_ENC : SHLL_QB_FMT<0b01111>;
+class SHLL_S_W_ENC : SHLL_QB_FMT<0b10100>;
+class SHLLV_S_W_ENC : SHLL_QB_FMT<0b10110>;
+class SHRA_R_W_ENC : SHLL_QB_FMT<0b10101>;
+class SHRAV_R_W_ENC : SHLL_QB_FMT<0b10111>;
+class MULEU_S_PH_QBL_ENC : ADDU_QB_FMT<0b00110>;
+class MULEU_S_PH_QBR_ENC : ADDU_QB_FMT<0b00111>;
+class MULEQ_S_W_PHL_ENC : ADDU_QB_FMT<0b11100>;
+class MULEQ_S_W_PHR_ENC : ADDU_QB_FMT<0b11101>;
+class MULQ_RS_PH_ENC : ADDU_QB_FMT<0b11111>;
+class MULSAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00110>;
+class MAQ_S_W_PHL_ENC : DPA_W_PH_FMT<0b10100>;
+class MAQ_S_W_PHR_ENC : DPA_W_PH_FMT<0b10110>;
+class MAQ_SA_W_PHL_ENC : DPA_W_PH_FMT<0b10000>;
+class MAQ_SA_W_PHR_ENC : DPA_W_PH_FMT<0b10010>;
+class MFHI_ENC : MFHI_FMT<0b010000>;
+class MFLO_ENC : MFHI_FMT<0b010010>;
+class MTHI_ENC : MTHI_FMT<0b010001>;
+class MTLO_ENC : MTHI_FMT<0b010011>;
+class DPAU_H_QBL_ENC : DPA_W_PH_FMT<0b00011>;
+class DPAU_H_QBR_ENC : DPA_W_PH_FMT<0b00111>;
+class DPSU_H_QBL_ENC : DPA_W_PH_FMT<0b01011>;
+class DPSU_H_QBR_ENC : DPA_W_PH_FMT<0b01111>;
+class DPAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00100>;
+class DPSQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00101>;
+class DPAQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01100>;
+class DPSQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01101>;
+class MULT_DSP_ENC : MULT_FMT<0b000000, 0b011000>;
+class MULTU_DSP_ENC : MULT_FMT<0b000000, 0b011001>;
+class MADD_DSP_ENC : MULT_FMT<0b011100, 0b000000>;
+class MADDU_DSP_ENC : MULT_FMT<0b011100, 0b000001>;
+class MSUB_DSP_ENC : MULT_FMT<0b011100, 0b000100>;
+class MSUBU_DSP_ENC : MULT_FMT<0b011100, 0b000101>;
+class CMPU_EQ_QB_ENC : CMP_EQ_QB_R2_FMT<0b00000>;
+class CMPU_LT_QB_ENC : CMP_EQ_QB_R2_FMT<0b00001>;
+class CMPU_LE_QB_ENC : CMP_EQ_QB_R2_FMT<0b00010>;
+class CMPGU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b00100>;
+class CMPGU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b00101>;
+class CMPGU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b00110>;
+class CMP_EQ_PH_ENC : CMP_EQ_QB_R2_FMT<0b01000>;
+class CMP_LT_PH_ENC : CMP_EQ_QB_R2_FMT<0b01001>;
+class CMP_LE_PH_ENC : CMP_EQ_QB_R2_FMT<0b01010>;
+class BITREV_ENC : ABSQ_S_PH_R2_FMT<0b11011>;
+class PACKRL_PH_ENC : CMP_EQ_QB_R3_FMT<0b01110>;
+class REPL_QB_ENC : REPL_FMT<0b00010>;
+class REPL_PH_ENC : REPL_FMT<0b01010>;
+class REPLV_QB_ENC : ABSQ_S_PH_R2_FMT<0b00011>;
+class REPLV_PH_ENC : ABSQ_S_PH_R2_FMT<0b01011>;
+class PICK_QB_ENC : CMP_EQ_QB_R3_FMT<0b00011>;
+class PICK_PH_ENC : CMP_EQ_QB_R3_FMT<0b01011>;
+class LWX_ENC : LX_FMT<0b00000>;
+class LHX_ENC : LX_FMT<0b00100>;
+class LBUX_ENC : LX_FMT<0b00110>;
+class BPOSGE32_ENC : BPOSGE32_FMT<0b11100>;
+class INSV_ENC : INSV_FMT<0b001100>;
+
+class EXTP_ENC : EXTR_W_TY1_FMT<0b00010>;
+class EXTPV_ENC : EXTR_W_TY1_FMT<0b00011>;
+class EXTPDP_ENC : EXTR_W_TY1_FMT<0b01010>;
+class EXTPDPV_ENC : EXTR_W_TY1_FMT<0b01011>;
+class EXTR_W_ENC : EXTR_W_TY1_FMT<0b00000>;
+class EXTRV_W_ENC : EXTR_W_TY1_FMT<0b00001>;
+class EXTR_R_W_ENC : EXTR_W_TY1_FMT<0b00100>;
+class EXTRV_R_W_ENC : EXTR_W_TY1_FMT<0b00101>;
+class EXTR_RS_W_ENC : EXTR_W_TY1_FMT<0b00110>;
+class EXTRV_RS_W_ENC : EXTR_W_TY1_FMT<0b00111>;
+class EXTR_S_H_ENC : EXTR_W_TY1_FMT<0b01110>;
+class EXTRV_S_H_ENC : EXTR_W_TY1_FMT<0b01111>;
+class SHILO_ENC : SHILO_R1_FMT<0b11010>;
+class SHILOV_ENC : SHILO_R2_FMT<0b11011>;
+class MTHLIP_ENC : SHILO_R2_FMT<0b11111>;
+
+class RDDSP_ENC : RDDSP_FMT<0b10010>;
+class WRDSP_ENC : WRDSP_FMT<0b10011>;
+class ADDU_PH_ENC : ADDU_QB_FMT<0b01000>;
+class ADDU_S_PH_ENC : ADDU_QB_FMT<0b01100>;
+class SUBU_PH_ENC : ADDU_QB_FMT<0b01001>;
+class SUBU_S_PH_ENC : ADDU_QB_FMT<0b01101>;
+class CMPGDU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b11000>;
+class CMPGDU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b11001>;
+class CMPGDU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b11010>;
+class ABSQ_S_QB_ENC : ABSQ_S_PH_R2_FMT<0b00001>;
+class ADDUH_QB_ENC : ADDUH_QB_FMT<0b00000>;
+class ADDUH_R_QB_ENC : ADDUH_QB_FMT<0b00010>;
+class SUBUH_QB_ENC : ADDUH_QB_FMT<0b00001>;
+class SUBUH_R_QB_ENC : ADDUH_QB_FMT<0b00011>;
+class ADDQH_PH_ENC : ADDUH_QB_FMT<0b01000>;
+class ADDQH_R_PH_ENC : ADDUH_QB_FMT<0b01010>;
+class SUBQH_PH_ENC : ADDUH_QB_FMT<0b01001>;
+class SUBQH_R_PH_ENC : ADDUH_QB_FMT<0b01011>;
+class ADDQH_W_ENC : ADDUH_QB_FMT<0b10000>;
+class ADDQH_R_W_ENC : ADDUH_QB_FMT<0b10010>;
+class SUBQH_W_ENC : ADDUH_QB_FMT<0b10001>;
+class SUBQH_R_W_ENC : ADDUH_QB_FMT<0b10011>;
+class MUL_PH_ENC : ADDUH_QB_FMT<0b01100>;
+class MUL_S_PH_ENC : ADDUH_QB_FMT<0b01110>;
+class MULQ_S_W_ENC : ADDUH_QB_FMT<0b10110>;
+class MULQ_RS_W_ENC : ADDUH_QB_FMT<0b10111>;
+class MULQ_S_PH_ENC : ADDU_QB_FMT<0b11110>;
+class DPA_W_PH_ENC : DPA_W_PH_FMT<0b00000>;
+class DPS_W_PH_ENC : DPA_W_PH_FMT<0b00001>;
+class DPAQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11000>;
+class DPAQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11010>;
+class DPAX_W_PH_ENC : DPA_W_PH_FMT<0b01000>;
+class DPSX_W_PH_ENC : DPA_W_PH_FMT<0b01001>;
+class DPSQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11001>;
+class DPSQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11011>;
+class MULSA_W_PH_ENC : DPA_W_PH_FMT<0b00010>;
+class PRECR_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01101>;
+class PRECR_SRA_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11110>;
+class PRECR_SRA_R_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11111>;
+class SHRA_QB_ENC : SHLL_QB_FMT<0b00100>;
+class SHRAV_QB_ENC : SHLL_QB_FMT<0b00110>;
+class SHRA_R_QB_ENC : SHLL_QB_FMT<0b00101>;
+class SHRAV_R_QB_ENC : SHLL_QB_FMT<0b00111>;
+class SHRL_PH_ENC : SHLL_QB_FMT<0b11001>;
+class SHRLV_PH_ENC : SHLL_QB_FMT<0b11011>;
+class APPEND_ENC : APPEND_FMT<0b00000>;
+class BALIGN_ENC : APPEND_FMT<0b10000>;
+class PREPEND_ENC : APPEND_FMT<0b00001>;
+
+// Instruction desc.
+class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Description for "rd = OpNode(rs, rt)" instructions.
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS, RegisterOperand ROT = ROS> { // ROT defaults to ROS.
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROS:$rs, ROT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm; // The mnemonic doubles as BaseOpcode.
+}
+
+class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Description for one-input "rd = OpNode(rs)" instructions.
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS = ROD> { // ROS defaults to ROD.
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROS:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))];
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Description for two-input instructions with no register output.
+ InstrItinClass itin, RegisterOperand ROS,
+ RegisterOperand ROT = ROS> { // ROT defaults to ROS.
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins ROS:$rs, ROT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
+ list<dag> Pattern = [(OpNode ROS:$rs, ROT:$rt)]; // No (set ...): the pattern has no result to bind.
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Member-for-member the same as ADDU_QB_DESC_BASE: "rd = OpNode(rs, rt)".
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS, RegisterOperand ROT = ROS> {
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROS:$rs, ROT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Description for "rt = OpNode(src, rs, sa)" with src tied to rt.
+ InstrItinClass itin, RegisterOperand ROT,
+ RegisterOperand ROS = ROT> { // ROS defaults to ROT.
+ dag OutOperandList = (outs ROT:$rt);
+ dag InOperandList = (ins ROS:$rs, uimm5:$sa, ROS:$src);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa"); // $src is not printed.
+ list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))]; // $src comes first in the pattern but last in InOperandList.
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$src = $rt"; // Ties the $src input to the $rt output.
+ string BaseOpcode = instr_asm;
+}
+
+class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Description for one-input "rd = OpNode(rt)" instructions.
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROT = ROD> { // ROT defaults to ROD.
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))];
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Description for "rd = OpNode(imm)" instructions.
+ Operand ImmOp, ImmLeaf immPat, InstrItinClass itin, // ImmOp is the operand type; immPat is the predicate used in the pattern.
+ RegisterOperand RO> {
+ dag OutOperandList = (outs RO:$rd);
+ dag InOperandList = (ins ImmOp:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
+ list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))];
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Register-amount shift: $rs_sa is a GPR32Opnd.
+ InstrItinClass itin, RegisterOperand RO> {
+ dag OutOperandList = (outs RO:$rd);
+ dag InOperandList = (ins RO:$rt, GPR32Opnd:$rs_sa);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+ list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))];
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Immediate-amount shift: $rs_sa is an ImmOpnd matched by ImmPat.
+ SDPatternOperator ImmPat, InstrItinClass itin,
+ RegisterOperand RO, Operand ImmOpnd> {
+ dag OutOperandList = (outs RO:$rd);
+ dag InOperandList = (ins RO:$rt, ImmOpnd:$rs_sa);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+ list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))];
+ InstrItinClass Itinerary = itin;
+ bit hasSideEffects = 1; // Set here but not in the register form SHLL_QB_R3_DESC_BASE.
+ string BaseOpcode = instr_asm;
+}
+
+class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Description for loads addressed by a base and an index register.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rd);
+ dag InOperandList = (ins PtrRC:$base, PtrRC:$index);
+ string AsmString = !strconcat(instr_asm, "\t$rd, ${index}(${base})"); // Printed as index(base).
+ list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))];
+ InstrItinClass Itinerary = itin;
+ bit mayLoad = 1;
+ string BaseOpcode = instr_asm;
+}
+
+class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Same members as ADDU_QB_DESC_BASE; only the parameter defaults differ.
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS = ROD, RegisterOperand ROT = ROD> { // Both input classes default to ROD.
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROS:$rs, ROT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Description for "rt = OpNode(src, rs, sa)" on GPR32 with src tied to rt.
+ Operand ImmOp, SDPatternOperator Imm, InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$rs, ImmOp:$sa, GPR32Opnd:$src);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa"); // $src is not printed.
+ list<dag> Pattern = [(set GPR32Opnd:$rt,
+ (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, Imm:$sa))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$src = $rt"; // Ties the $src input to the $rt output.
+ string BaseOpcode = instr_asm;
+}
+
+class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Register form; no Pattern is defined, so OpNode is unused in this class.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Immediate form (uimm5); no Pattern is defined, so OpNode is unused in this class.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$shift_rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { // Immediate form (simm6 shift); takes no itinerary parameter.
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins simm6:$shift, ACC64DSPOpnd:$acin);
+ string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))];
+ string Constraints = "$acin = $ac"; // The accumulator input is tied to the accumulator output.
+ string BaseOpcode = instr_asm;
+}
+
+class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { // Register form of SHILO_R1_DESC_BASE: the shift comes from $rs.
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
+ string AsmString = !strconcat(instr_asm, "\t$ac, $rs");
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
+ string Constraints = "$acin = $ac"; // The accumulator input is tied to the accumulator output.
+ string BaseOpcode = instr_asm;
+}
+
+class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { // Same members as SHILO_R2_DESC_BASE except for the AsmString operand order.
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); // $rs is printed before $ac here.
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
+ string Constraints = "$acin = $ac";
+ string BaseOpcode = instr_asm;
+}
+
+class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Description for "rd = OpNode(mask)" with a 10-bit unsigned mask.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rd);
+ dag InOperandList = (ins uimm10:$mask);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
+ list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Counterpart of RDDSP_DESC_BASE with no register output.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm10:$mask);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
+ list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)]; // No (set ...): the pattern has no result to bind.
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { // "ac = OpNode(rs, rt, acin)" with acin tied to ac; no itinerary parameter.
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
+ string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt"); // $acin is not printed.
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
+ string Constraints = "$acin = $ac";
+ string BaseOpcode = instr_asm;
+}
+
+class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // "ac = OpNode(rs, rt)": no accumulator input, unlike MADD_DESC_BASE.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))];
+ InstrItinClass Itinerary = itin;
+ bit isCommutable = 1; // Set unconditionally for every user of this base.
+ string BaseOpcode = instr_asm;
+}
+
+class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // Same members as DPA_W_PH_DESC_BASE plus an Itinerary taken from itin.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
+ string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$acin = $ac"; // The accumulator input is tied to the accumulator output.
+ string BaseOpcode = instr_asm;
+}
+
+class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, // "rd = OpNode(ac)"; OpNode is typed SDNode here, not SDPatternOperator.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rd);
+ dag InOperandList = (ins RO:$ac);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $ac");
+ list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> { // Moves $rs into $ac; takes no OpNode and defines no Pattern.
+ dag OutOperandList = (outs RO:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); // The source is printed before the destination.
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
+}
+
+class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> : // itin is accepted but never referenced in this class.
+ MipsPseudo<(outs GPR32Opnd:$dst), (ins), [(set GPR32Opnd:$dst, (OpNode))]> {
+ bit usesCustomInserter = 1;
+}
+
+class BPOSGE32_DESC_BASE<string instr_asm, DAGOperand opnd, // Branch description whose only operand is $offset; no Pattern is defined.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins opnd:$offset);
+ string AsmString = !strconcat(instr_asm, "\t$offset");
+ InstrItinClass Itinerary = itin;
+ bit isBranch = 1;
+ bit isTerminator = 1;
+ bit hasDelaySlot = 1;
+ string BaseOpcode = instr_asm;
+}
+
+class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode, // "rt = OpNode(src, rs)" on GPR32 with src tied to rt.
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$src, GPR32Opnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs"); // $src is not printed.
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$src = $rt"; // Ties the $src input to the $rt output.
+ string BaseOpcode = instr_asm;
+}
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 1
+//===----------------------------------------------------------------------===//
+
+// Addition/subtraction
+class ADDU_QB_DESC : ADDU_QB_DESC_BASE<"addu.qb", null_frag, NoItinerary, // OpNode is null_frag here; the _s variant below names an intrinsic instead.
+ DSPROpnd, DSPROpnd>, IsCommutable,
+ Defs<[DSPOutFlag20]>;
+
+class ADDU_S_QB_DESC : ADDU_QB_DESC_BASE<"addu_s.qb", int_mips_addu_s_qb,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBU_QB_DESC : ADDU_QB_DESC_BASE<"subu.qb", null_frag, NoItinerary, // The subtract descriptions are not marked IsCommutable.
+ DSPROpnd, DSPROpnd>,
+ Defs<[DSPOutFlag20]>;
+
+class SUBU_S_QB_DESC : ADDU_QB_DESC_BASE<"subu_s.qb", int_mips_subu_s_qb,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ Defs<[DSPOutFlag20]>;
+
+class ADDQ_PH_DESC : ADDU_QB_DESC_BASE<"addq.ph", null_frag, NoItinerary,
+ DSPROpnd, DSPROpnd>, IsCommutable,
+ Defs<[DSPOutFlag20]>;
+
+class ADDQ_S_PH_DESC : ADDU_QB_DESC_BASE<"addq_s.ph", int_mips_addq_s_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBQ_PH_DESC : ADDU_QB_DESC_BASE<"subq.ph", null_frag, NoItinerary,
+ DSPROpnd, DSPROpnd>,
+ Defs<[DSPOutFlag20]>;
+
+class SUBQ_S_PH_DESC : ADDU_QB_DESC_BASE<"subq_s.ph", int_mips_subq_s_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ Defs<[DSPOutFlag20]>;
+
+class ADDQ_S_W_DESC : ADDU_QB_DESC_BASE<"addq_s.w", int_mips_addq_s_w, // The .w forms operate on GPR32Opnd rather than DSPROpnd.
+ NoItinerary, GPR32Opnd, GPR32Opnd>,
+ IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBQ_S_W_DESC : ADDU_QB_DESC_BASE<"subq_s.w", int_mips_subq_s_w,
+ NoItinerary, GPR32Opnd, GPR32Opnd>,
+ Defs<[DSPOutFlag20]>;
+
+class ADDSC_DESC : ADDU_QB_DESC_BASE<"addsc", null_frag, NoItinerary,
+ GPR32Opnd, GPR32Opnd>, IsCommutable,
+ Defs<[DSPCarry]>; // Defines DSPCarry, which ADDWC_DESC lists in Uses.
+
+class ADDWC_DESC : ADDU_QB_DESC_BASE<"addwc", null_frag, NoItinerary,
+ GPR32Opnd, GPR32Opnd>,
+ IsCommutable, Uses<[DSPCarry]>, Defs<[DSPOutFlag20]>; // Uses DSPCarry and defines DSPOutFlag20.
+
+class MODSUB_DESC : ADDU_QB_DESC_BASE<"modsub", int_mips_modsub, NoItinerary,
+ GPR32Opnd, GPR32Opnd>; // No Defs or Uses mix-in.
+
+class RADDU_W_QB_DESC : RADDU_W_QB_DESC_BASE<"raddu.w.qb", int_mips_raddu_w_qb,
+ NoItinerary, GPR32Opnd, DSPROpnd>; // GPR32Opnd result from a DSPROpnd source.
+
+// Absolute value
+// Both forms share ABSQ_S_PH_R2_DESC_BASE and differ only in the operand
+// kind (DSPROpnd for the .ph form, GPR32Opnd for the .w form); both are
+// declared to write DSPOutFlag20.
+class ABSQ_S_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.ph", int_mips_absq_s_ph,
+ NoItinerary, DSPROpnd>,
+ Defs<[DSPOutFlag20]>;
+
+class ABSQ_S_W_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.w", int_mips_absq_s_w,
+ NoItinerary, GPR32Opnd>,
+ Defs<[DSPOutFlag20]>;
+
+// Precision reduce/expand
+// The precrq* (reduce) forms reuse CMP_EQ_QB_R3_DESC_BASE; the prece*
+// (expand) forms reuse ABSQ_S_PH_R2_DESC_BASE. Only precrq_rs.ph.w and
+// precrqu_s.qb.ph are declared to write a DSP control field (DSPOutFlag22).
+class PRECRQ_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.qb.ph",
+ int_mips_precrq_qb_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>;
+
+class PRECRQ_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.ph.w",
+ int_mips_precrq_ph_w,
+ NoItinerary, DSPROpnd, GPR32Opnd>;
+
+class PRECRQ_RS_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq_rs.ph.w",
+ int_mips_precrq_rs_ph_w,
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>,
+ Defs<[DSPOutFlag22]>;
+
+class PRECRQU_S_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrqu_s.qb.ph",
+ int_mips_precrqu_s_qb_ph,
+ NoItinerary, DSPROpnd,
+ DSPROpnd>,
+ Defs<[DSPOutFlag22]>;
+
+// The preceq.w.* forms pass two operand kinds (GPR32Opnd, DSPROpnd); the
+// remaining expand forms below pass DSPROpnd only.
+class PRECEQ_W_PHL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phl",
+ int_mips_preceq_w_phl,
+ NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class PRECEQ_W_PHR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phr",
+ int_mips_preceq_w_phr,
+ NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class PRECEQU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbl",
+ int_mips_precequ_ph_qbl,
+ NoItinerary, DSPROpnd>;
+
+class PRECEQU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbr",
+ int_mips_precequ_ph_qbr,
+ NoItinerary, DSPROpnd>;
+
+class PRECEQU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbla",
+ int_mips_precequ_ph_qbla,
+ NoItinerary, DSPROpnd>;
+
+class PRECEQU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbra",
+ int_mips_precequ_ph_qbra,
+ NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbl",
+ int_mips_preceu_ph_qbl,
+ NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbr",
+ int_mips_preceu_ph_qbr,
+ NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbla",
+ int_mips_preceu_ph_qbla,
+ NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
+ int_mips_preceu_ph_qbra,
+ NoItinerary, DSPROpnd>;
+
+// Shift
+// Immediate-shift forms use SHLL_QB_R2_DESC_BASE and pass an immediate
+// predicate plus operand (immZExt3/uimm3, immZExt4/uimm4 or immZExt5/uimm5).
+// The register-shift "v" forms use SHLL_QB_R3_DESC_BASE and reuse the same
+// intrinsic as their immediate counterpart. Immediate forms that pass
+// null_frag are matched by the DSPShiftPat definitions later in this file.
+// The left shifts are declared to write DSPOutFlag22; the right shifts
+// declare no Defs.
+class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3,
+ NoItinerary, DSPROpnd, uimm3>,
+ Defs<[DSPOutFlag22]>;
+
+class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
+ NoItinerary, DSPROpnd>,
+ Defs<[DSPOutFlag22]>;
+
+class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3,
+ NoItinerary, DSPROpnd, uimm3>;
+
+class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
+ NoItinerary, DSPROpnd>;
+
+class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4,
+ NoItinerary, DSPROpnd, uimm4>,
+ Defs<[DSPOutFlag22]>;
+
+class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
+ NoItinerary, DSPROpnd>,
+ Defs<[DSPOutFlag22]>;
+
+class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
+ immZExt4, NoItinerary, DSPROpnd,
+ uimm4>,
+ Defs<[DSPOutFlag22]>;
+
+class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
+ NoItinerary, DSPROpnd>,
+ Defs<[DSPOutFlag22]>;
+
+class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4,
+ NoItinerary, DSPROpnd, uimm4>;
+
+class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
+ NoItinerary, DSPROpnd>;
+
+class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
+ immZExt4, NoItinerary, DSPROpnd,
+ uimm4>;
+
+class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
+ NoItinerary, DSPROpnd>;
+
+class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
+ immZExt5, NoItinerary, GPR32Opnd,
+ uimm5>,
+ Defs<[DSPOutFlag22]>;
+
+class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
+ NoItinerary, GPR32Opnd>,
+ Defs<[DSPOutFlag22]>;
+
+class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
+ immZExt5, NoItinerary, GPR32Opnd,
+ uimm5>;
+
+class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
+ NoItinerary, GPR32Opnd>;
+
+// Multiplication
+// The register-result multiplies reuse ADDU_QB_DESC_BASE and are declared to
+// write DSPOutFlag21. The accumulator forms (mulsaq_s, maq_*) reuse
+// DPA_W_PH_DESC_BASE with a Mips* node and are declared to write
+// DSPOutFlag16_19.
+class MULEU_S_PH_QBL_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbl",
+ int_mips_muleu_s_ph_qbl,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ Defs<[DSPOutFlag21]>;
+
+class MULEU_S_PH_QBR_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbr",
+ int_mips_muleu_s_ph_qbr,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ Defs<[DSPOutFlag21]>;
+
+class MULEQ_S_W_PHL_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phl",
+ int_mips_muleq_s_w_phl,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
+ IsCommutable, Defs<[DSPOutFlag21]>;
+
+class MULEQ_S_W_PHR_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phr",
+ int_mips_muleq_s_w_phr,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
+ IsCommutable, Defs<[DSPOutFlag21]>;
+
+class MULQ_RS_PH_DESC : ADDU_QB_DESC_BASE<"mulq_rs.ph", int_mips_mulq_rs_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ IsCommutable, Defs<[DSPOutFlag21]>;
+
+class MULSAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsaq_s.w.ph",
+ MipsMULSAQ_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
+
+class MAQ_S_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phl", MipsMAQ_S_W_PHL>,
+ Defs<[DSPOutFlag16_19]>;
+
+class MAQ_S_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phr", MipsMAQ_S_W_PHR>,
+ Defs<[DSPOutFlag16_19]>;
+
+class MAQ_SA_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phl", MipsMAQ_SA_W_PHL>,
+ Defs<[DSPOutFlag16_19]>;
+
+class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr", MipsMAQ_SA_W_PHR>,
+ Defs<[DSPOutFlag16_19]>;
+
+// Move from/to hi/lo.
+// The "move from" forms take the ACC64DSPOpnd operand kind and a
+// MipsMFHI/MipsMFLO node; the "move to" forms take only the HI32DSPOpnd or
+// LO32DSPOpnd operand kind and no selection node.
+class MFHI_DESC : MFHI_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, NoItinerary>;
+class MFLO_DESC : MFHI_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, NoItinerary>;
+class MTHI_DESC : MTHI_DESC_BASE<"mthi", HI32DSPOpnd, NoItinerary>;
+class MTLO_DESC : MTHI_DESC_BASE<"mtlo", LO32DSPOpnd, NoItinerary>;
+
+// Dot product with accumulate/subtract
+// All dot-product forms share DPA_W_PH_DESC_BASE and are selected through a
+// Mips* node. The dpau/dpsu forms declare no Defs; the saturating dpaq/dpsq
+// forms are declared to write DSPOutFlag16_19.
+class DPAU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbl", MipsDPAU_H_QBL>;
+
+class DPAU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbr", MipsDPAU_H_QBR>;
+
+class DPSU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbl", MipsDPSU_H_QBL>;
+
+class DPSU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbr", MipsDPSU_H_QBR>;
+
+class DPAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaq_s.w.ph", MipsDPAQ_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
+
+class DPSQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsq_s.w.ph", MipsDPSQ_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
+
+class DPAQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpaq_sa.l.w", MipsDPAQ_SA_L_W>,
+ Defs<[DSPOutFlag16_19]>;
+
+class DPSQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpsq_sa.l.w", MipsDPSQ_SA_L_W>,
+ Defs<[DSPOutFlag16_19]>;
+
+// DSP variants of mult/multu (MULT_DESC_BASE) and madd/maddu/msub/msubu
+// (MADD_DESC_BASE), each bound to the matching Mips* node.
+class MULT_DSP_DESC : MULT_DESC_BASE<"mult", MipsMult, NoItinerary>;
+class MULTU_DSP_DESC : MULT_DESC_BASE<"multu", MipsMultu, NoItinerary>;
+class MADD_DSP_DESC : MADD_DESC_BASE<"madd", MipsMAdd, NoItinerary>;
+class MADDU_DSP_DESC : MADD_DESC_BASE<"maddu", MipsMAddu, NoItinerary>;
+class MSUB_DSP_DESC : MADD_DESC_BASE<"msub", MipsMSub, NoItinerary>;
+class MSUBU_DSP_DESC : MADD_DESC_BASE<"msubu", MipsMSubu, NoItinerary>;
+
+// Comparison
+// cmpu.* and cmp.* use CMP_EQ_QB_R2_DESC_BASE and are declared to write
+// DSPCCond. cmpgu.* use CMP_EQ_QB_R3_DESC_BASE with a GPR32Opnd result kind
+// and declare no DSPCCond def. Only the equality forms are IsCommutable.
+class CMPU_EQ_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.eq.qb",
+ int_mips_cmpu_eq_qb, NoItinerary,
+ DSPROpnd>,
+ IsCommutable, Defs<[DSPCCond]>;
+
+class CMPU_LT_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.lt.qb",
+ int_mips_cmpu_lt_qb, NoItinerary,
+ DSPROpnd>, Defs<[DSPCCond]>;
+
+class CMPU_LE_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.le.qb",
+ int_mips_cmpu_le_qb, NoItinerary,
+ DSPROpnd>, Defs<[DSPCCond]>;
+
+class CMPGU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.eq.qb",
+ int_mips_cmpgu_eq_qb,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
+ IsCommutable;
+
+class CMPGU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.lt.qb",
+ int_mips_cmpgu_lt_qb,
+ NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class CMPGU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.le.qb",
+ int_mips_cmpgu_le_qb,
+ NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class CMP_EQ_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.eq.ph", int_mips_cmp_eq_ph,
+ NoItinerary, DSPROpnd>,
+ IsCommutable, Defs<[DSPCCond]>;
+
+class CMP_LT_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.lt.ph", int_mips_cmp_lt_ph,
+ NoItinerary, DSPROpnd>,
+ Defs<[DSPCCond]>;
+
+class CMP_LE_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.le.ph", int_mips_cmp_le_ph,
+ NoItinerary, DSPROpnd>,
+ Defs<[DSPCCond]>;
+
+// Misc
+// Bit reverse, pack, replicate, pick, indexed loads and the bposge32 branch.
+class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
+ NoItinerary, GPR32Opnd>;
+
+class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>;
+
+// repl.* take an immediate (uimm8/immZExt8 or uimm10/immZExt10); the replv.*
+// forms reuse the same intrinsics with a GPR32Opnd source.
+class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, uimm8,
+ immZExt8, NoItinerary, DSPROpnd>;
+
+class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, uimm10,
+ immZExt10, NoItinerary, DSPROpnd>;
+
+class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
+ NoItinerary, DSPROpnd, GPR32Opnd>;
+
+class REPLV_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
+ NoItinerary, DSPROpnd, GPR32Opnd>;
+
+// pick.* are declared to read DSPCCond, the field the cmp/cmpu descriptions
+// in this file are declared to write.
+class PICK_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.qb", int_mips_pick_qb,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ Uses<[DSPCCond]>;
+
+class PICK_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.ph", int_mips_pick_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ Uses<[DSPCCond]>;
+
+// Indexed loads, all built on LX_DESC_BASE.
+class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>;
+
+class LHX_DESC : LX_DESC_BASE<"lhx", int_mips_lhx, NoItinerary>;
+
+class LBUX_DESC : LX_DESC_BASE<"lbux", int_mips_lbux, NoItinerary>;
+
+// The real bposge32 branch; its operand is a brtarget.
+class BPOSGE32_DESC : BPOSGE32_DESC_BASE<"bposge32", brtarget, NoItinerary>;
+
+// Extract
+// *_R1 bases are used for the non-"v" mnemonics and *_R2 bases for the "v"
+// mnemonics; each pair shares one Mips* node. The extp* forms are declared
+// to read DSPPos and write DSPEFI (extpdp* also write DSPPos); the extr*
+// forms are declared to write DSPOutFlag23.
+class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPEFI]>;
+
+class EXTPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpv", MipsEXTP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPEFI]>;
+
+class EXTPDP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+
+class EXTPDPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpdpv", MipsEXTPDP,
+ NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+
+class EXTR_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+
+class EXTRV_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv.w", MipsEXTR_W,
+ NoItinerary>, Defs<[DSPOutFlag23]>;
+
+class EXTR_R_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_r.w", MipsEXTR_R_W,
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+
+class EXTRV_R_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_r.w", MipsEXTR_R_W,
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+
+class EXTR_RS_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W,
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+
+class EXTRV_RS_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W,
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+
+class EXTR_S_H_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_s.h", MipsEXTR_S_H,
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+
+class EXTRV_S_H_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_s.h", MipsEXTR_S_H,
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+
+// shilo/shilov share the MipsSHILO node (R1 vs. R2 base).
+class SHILO_DESC : SHILO_R1_DESC_BASE<"shilo", MipsSHILO>;
+
+class SHILOV_DESC : SHILO_R2_DESC_BASE<"shilov", MipsSHILO>;
+
+// mthlip is declared to write DSPPos.
+class MTHLIP_DESC : MTHLIP_DESC_BASE<"mthlip", MipsMTHLIP>, Defs<[DSPPos]>;
+
+// Read/write of the DSP control register via the rddsp/wrdsp intrinsics.
+class RDDSP_DESC : RDDSP_DESC_BASE<"rddsp", int_mips_rddsp, NoItinerary>;
+
+class WRDSP_DESC : WRDSP_DESC_BASE<"wrdsp", int_mips_wrdsp, NoItinerary>;
+
+// insv is declared to read the DSPPos and DSPSCount fields.
+class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>,
+ Uses<[DSPPos, DSPSCount]>;
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 2
+//===----------------------------------------------------------------------===//
+
+// Addition/subtraction
+// Rev 2 paired-halfword add/sub reuse ADDU_QB_DESC_BASE and are declared to
+// write DSPOutFlag20. The halving forms (adduh/subuh/addqh/subqh and their
+// _r variants) use ADDUH_QB_DESC_BASE and declare no Defs. Only the
+// additions are marked IsCommutable.
+class ADDU_PH_DESC : ADDU_QB_DESC_BASE<"addu.ph", int_mips_addu_ph, NoItinerary,
+ DSPROpnd, DSPROpnd>, IsCommutable,
+ Defs<[DSPOutFlag20]>;
+
+class ADDU_S_PH_DESC : ADDU_QB_DESC_BASE<"addu_s.ph", int_mips_addu_s_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBU_PH_DESC : ADDU_QB_DESC_BASE<"subu.ph", int_mips_subu_ph, NoItinerary,
+ DSPROpnd, DSPROpnd>,
+ Defs<[DSPOutFlag20]>;
+
+class SUBU_S_PH_DESC : ADDU_QB_DESC_BASE<"subu_s.ph", int_mips_subu_s_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ Defs<[DSPOutFlag20]>;
+
+class ADDUH_QB_DESC : ADDUH_QB_DESC_BASE<"adduh.qb", int_mips_adduh_qb,
+ NoItinerary, DSPROpnd>, IsCommutable;
+
+class ADDUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"adduh_r.qb", int_mips_adduh_r_qb,
+ NoItinerary, DSPROpnd>, IsCommutable;
+
+class SUBUH_QB_DESC : ADDUH_QB_DESC_BASE<"subuh.qb", int_mips_subuh_qb,
+ NoItinerary, DSPROpnd>;
+
+class SUBUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"subuh_r.qb", int_mips_subuh_r_qb,
+ NoItinerary, DSPROpnd>;
+
+class ADDQH_PH_DESC : ADDUH_QB_DESC_BASE<"addqh.ph", int_mips_addqh_ph,
+ NoItinerary, DSPROpnd>, IsCommutable;
+
+class ADDQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"addqh_r.ph", int_mips_addqh_r_ph,
+ NoItinerary, DSPROpnd>, IsCommutable;
+
+class SUBQH_PH_DESC : ADDUH_QB_DESC_BASE<"subqh.ph", int_mips_subqh_ph,
+ NoItinerary, DSPROpnd>;
+
+class SUBQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"subqh_r.ph", int_mips_subqh_r_ph,
+ NoItinerary, DSPROpnd>;
+
+class ADDQH_W_DESC : ADDUH_QB_DESC_BASE<"addqh.w", int_mips_addqh_w,
+ NoItinerary, GPR32Opnd>, IsCommutable;
+
+class ADDQH_R_W_DESC : ADDUH_QB_DESC_BASE<"addqh_r.w", int_mips_addqh_r_w,
+ NoItinerary, GPR32Opnd>, IsCommutable;
+
+class SUBQH_W_DESC : ADDUH_QB_DESC_BASE<"subqh.w", int_mips_subqh_w,
+ NoItinerary, GPR32Opnd>;
+
+class SUBQH_R_W_DESC : ADDUH_QB_DESC_BASE<"subqh_r.w", int_mips_subqh_r_w,
+ NoItinerary, GPR32Opnd>;
+
+// Comparison
+// cmpgdu.* have a GPR32Opnd result kind and are also declared to write
+// DSPCCond. Only the equality form is IsCommutable.
+class CMPGDU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.eq.qb",
+ int_mips_cmpgdu_eq_qb,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
+ IsCommutable, Defs<[DSPCCond]>;
+
+class CMPGDU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.lt.qb",
+ int_mips_cmpgdu_lt_qb,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
+ Defs<[DSPCCond]>;
+
+class CMPGDU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.le.qb",
+ int_mips_cmpgdu_le_qb,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
+ Defs<[DSPCCond]>;
+
+// Absolute value
+// Quad-byte counterpart of absq_s.ph / absq_s.w; same base class and the
+// same DSPOutFlag20 def.
+class ABSQ_S_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.qb", int_mips_absq_s_qb,
+ NoItinerary, DSPROpnd>,
+ Defs<[DSPOutFlag20]>;
+
+// Multiplication
+// All Rev 2 multiplies are IsCommutable and declared to write DSPOutFlag21.
+// mul.ph passes null_frag; its patterns are the DSPBinPat<MUL_PH, ...>
+// definitions later in this file. mulq_s.ph uses ADDU_QB_DESC_BASE while the
+// others use ADDUH_QB_DESC_BASE.
+class MUL_PH_DESC : ADDUH_QB_DESC_BASE<"mul.ph", null_frag, NoItinerary,
+ DSPROpnd>, IsCommutable,
+ Defs<[DSPOutFlag21]>;
+
+class MUL_S_PH_DESC : ADDUH_QB_DESC_BASE<"mul_s.ph", int_mips_mul_s_ph,
+ NoItinerary, DSPROpnd>, IsCommutable,
+ Defs<[DSPOutFlag21]>;
+
+class MULQ_S_W_DESC : ADDUH_QB_DESC_BASE<"mulq_s.w", int_mips_mulq_s_w,
+ NoItinerary, GPR32Opnd>, IsCommutable,
+ Defs<[DSPOutFlag21]>;
+
+class MULQ_RS_W_DESC : ADDUH_QB_DESC_BASE<"mulq_rs.w", int_mips_mulq_rs_w,
+ NoItinerary, GPR32Opnd>, IsCommutable,
+ Defs<[DSPOutFlag21]>;
+
+class MULQ_S_PH_DESC : ADDU_QB_DESC_BASE<"mulq_s.ph", int_mips_mulq_s_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>,
+ IsCommutable, Defs<[DSPOutFlag21]>;
+
+// Dot product with accumulate/subtract
+// Same DPA_W_PH_DESC_BASE as the Rev 1 dot products. Only the saturating
+// dpaqx_*/dpsqx_* forms are declared to write DSPOutFlag16_19.
+class DPA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpa.w.ph", MipsDPA_W_PH>;
+
+class DPS_W_PH_DESC : DPA_W_PH_DESC_BASE<"dps.w.ph", MipsDPS_W_PH>;
+
+class DPAQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_s.w.ph", MipsDPAQX_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
+
+class DPAQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_sa.w.ph",
+ MipsDPAQX_SA_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
+
+class DPAX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpax.w.ph", MipsDPAX_W_PH>;
+
+class DPSX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsx.w.ph", MipsDPSX_W_PH>;
+
+class DPSQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_s.w.ph", MipsDPSQX_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
+
+class DPSQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_sa.w.ph",
+ MipsDPSQX_SA_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
+
+class MULSA_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsa.w.ph", MipsMULSA_W_PH>;
+
+// Precision reduce/expand
+// precr.qb.ph reuses CMP_EQ_QB_R3_DESC_BASE; the precr_sra* forms have their
+// own PRECR_SRA_PH_W_DESC_BASE. None declares DSP control-field Defs.
+class PRECR_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precr.qb.ph",
+ int_mips_precr_qb_ph,
+ NoItinerary, DSPROpnd, DSPROpnd>;
+
+class PRECR_SRA_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra.ph.w",
+ int_mips_precr_sra_ph_w,
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>;
+
+class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
+ int_mips_precr_sra_r_ph_w,
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>;
+
+// Shift
+// Rev 2 right shifts; none declares Defs. shra.qb and shrl.ph pass null_frag
+// and are matched by the HasDSPR2 DSPShiftPat definitions later in this
+// file; the "v" forms reuse the corresponding intrinsic.
+class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3,
+ NoItinerary, DSPROpnd, uimm3>;
+
+class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
+ NoItinerary, DSPROpnd>;
+
+class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
+ immZExt3, NoItinerary, DSPROpnd,
+ uimm3>;
+
+class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
+ NoItinerary, DSPROpnd>;
+
+class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4,
+ NoItinerary, DSPROpnd, uimm4>;
+
+class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
+ NoItinerary, DSPROpnd>;
+
+// Misc
+// append/balign/prepend share APPEND_DESC_BASE and differ in the intrinsic
+// and the immediate kind (uimm5/immZExt5 for append and prepend,
+// uimm2/immZExt2 for balign).
+class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, immZExt5,
+ NoItinerary>;
+
+class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, immZExt2,
+ NoItinerary>;
+
+class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5,
+ immZExt5, NoItinerary>;
+
+// Pseudos.
+// Pseudo selected for the int_mips_bposge32 intrinsic; declared to read
+// DSPPos.
+def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32,
+ NoItinerary>, Uses<[DSPPos]>;
+
+// Instruction defs.
+// Every def combines an encoding class (*_ENC) with its description class
+// (*_DESC); all but WRDSP also inherit DspMMRel.
+// MIPS DSP Rev 1
+def ADDU_QB : DspMMRel, ADDU_QB_ENC, ADDU_QB_DESC;
+def ADDU_S_QB : DspMMRel, ADDU_S_QB_ENC, ADDU_S_QB_DESC;
+def SUBU_QB : DspMMRel, SUBU_QB_ENC, SUBU_QB_DESC;
+def SUBU_S_QB : DspMMRel, SUBU_S_QB_ENC, SUBU_S_QB_DESC;
+def ADDQ_PH : DspMMRel, ADDQ_PH_ENC, ADDQ_PH_DESC;
+def ADDQ_S_PH : DspMMRel, ADDQ_S_PH_ENC, ADDQ_S_PH_DESC;
+def SUBQ_PH : DspMMRel, SUBQ_PH_ENC, SUBQ_PH_DESC;
+def SUBQ_S_PH : DspMMRel, SUBQ_S_PH_ENC, SUBQ_S_PH_DESC;
+def ADDQ_S_W : DspMMRel, ADDQ_S_W_ENC, ADDQ_S_W_DESC;
+def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC;
+def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC;
+def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC;
+def MODSUB : DspMMRel, MODSUB_ENC, MODSUB_DESC;
+def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC;
+def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
+def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC;
+def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQ_PH_W : DspMMRel, PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_RS_PH_W : DspMMRel, PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC;
+def PRECRQU_S_QB_PH : DspMMRel, PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECEQ_W_PHL : DspMMRel, PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC;
+def PRECEQ_W_PHR : DspMMRel, PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC;
+def PRECEQU_PH_QBL : DspMMRel, PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC;
+def PRECEQU_PH_QBR : DspMMRel, PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC;
+def PRECEQU_PH_QBLA : DspMMRel, PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC;
+def PRECEQU_PH_QBRA : DspMMRel, PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC;
+def PRECEU_PH_QBL : DspMMRel, PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC;
+def PRECEU_PH_QBR : DspMMRel, PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC;
+def PRECEU_PH_QBLA : DspMMRel, PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC;
+def PRECEU_PH_QBRA : DspMMRel, PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC;
+def SHLL_QB : DspMMRel, SHLL_QB_ENC, SHLL_QB_DESC;
+def SHLLV_QB : DspMMRel, SHLLV_QB_ENC, SHLLV_QB_DESC;
+def SHRL_QB : DspMMRel, SHRL_QB_ENC, SHRL_QB_DESC;
+def SHRLV_QB : DspMMRel, SHRLV_QB_ENC, SHRLV_QB_DESC;
+def SHLL_PH : DspMMRel, SHLL_PH_ENC, SHLL_PH_DESC;
+def SHLLV_PH : DspMMRel, SHLLV_PH_ENC, SHLLV_PH_DESC;
+def SHLL_S_PH : DspMMRel, SHLL_S_PH_ENC, SHLL_S_PH_DESC;
+def SHLLV_S_PH : DspMMRel, SHLLV_S_PH_ENC, SHLLV_S_PH_DESC;
+def SHRA_PH : DspMMRel, SHRA_PH_ENC, SHRA_PH_DESC;
+def SHRAV_PH : DspMMRel, SHRAV_PH_ENC, SHRAV_PH_DESC;
+def SHRA_R_PH : DspMMRel, SHRA_R_PH_ENC, SHRA_R_PH_DESC;
+def SHRAV_R_PH : DspMMRel, SHRAV_R_PH_ENC, SHRAV_R_PH_DESC;
+def SHLL_S_W : DspMMRel, SHLL_S_W_ENC, SHLL_S_W_DESC;
+def SHLLV_S_W : DspMMRel, SHLLV_S_W_ENC, SHLLV_S_W_DESC;
+def SHRA_R_W : DspMMRel, SHRA_R_W_ENC, SHRA_R_W_DESC;
+def SHRAV_R_W : DspMMRel, SHRAV_R_W_ENC, SHRAV_R_W_DESC;
+def MULEU_S_PH_QBL : DspMMRel, MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC;
+def MULEU_S_PH_QBR : DspMMRel, MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC;
+def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC;
+def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC;
+def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC;
+def MULSAQ_S_W_PH : DspMMRel, MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC;
+def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC;
+def MFHI_DSP : DspMMRel, MFHI_ENC, MFHI_DESC;
+def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC;
+def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC;
+def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC;
+def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC;
+def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC;
+def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC;
+def DPSU_H_QBR : DspMMRel, DPSU_H_QBR_ENC, DPSU_H_QBR_DESC;
+def DPAQ_S_W_PH : DspMMRel, DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC;
+def DPSQ_S_W_PH : DspMMRel, DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC;
+def DPAQ_SA_L_W : DspMMRel, DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC;
+def DPSQ_SA_L_W : DspMMRel, DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC;
+def MULT_DSP : DspMMRel, MULT_DSP_ENC, MULT_DSP_DESC;
+def MULTU_DSP : DspMMRel, MULTU_DSP_ENC, MULTU_DSP_DESC;
+def MADD_DSP : DspMMRel, MADD_DSP_ENC, MADD_DSP_DESC;
+def MADDU_DSP : DspMMRel, MADDU_DSP_ENC, MADDU_DSP_DESC;
+def MSUB_DSP : DspMMRel, MSUB_DSP_ENC, MSUB_DSP_DESC;
+def MSUBU_DSP : DspMMRel, MSUBU_DSP_ENC, MSUBU_DSP_DESC;
+def CMPU_EQ_QB : DspMMRel, CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC;
+def CMPU_LT_QB : DspMMRel, CMPU_LT_QB_ENC, CMPU_LT_QB_DESC;
+def CMPU_LE_QB : DspMMRel, CMPU_LE_QB_ENC, CMPU_LE_QB_DESC;
+def CMPGU_EQ_QB : DspMMRel, CMPGU_EQ_QB_ENC, CMPGU_EQ_QB_DESC;
+def CMPGU_LT_QB : DspMMRel, CMPGU_LT_QB_ENC, CMPGU_LT_QB_DESC;
+def CMPGU_LE_QB : DspMMRel, CMPGU_LE_QB_ENC, CMPGU_LE_QB_DESC;
+def CMP_EQ_PH : DspMMRel, CMP_EQ_PH_ENC, CMP_EQ_PH_DESC;
+def CMP_LT_PH : DspMMRel, CMP_LT_PH_ENC, CMP_LT_PH_DESC;
+def CMP_LE_PH : DspMMRel, CMP_LE_PH_ENC, CMP_LE_PH_DESC;
+def BITREV : DspMMRel, BITREV_ENC, BITREV_DESC;
+def PACKRL_PH : DspMMRel, PACKRL_PH_ENC, PACKRL_PH_DESC;
+def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC;
+def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC;
+def REPLV_QB : DspMMRel, REPLV_QB_ENC, REPLV_QB_DESC;
+def REPLV_PH : DspMMRel, REPLV_PH_ENC, REPLV_PH_DESC;
+def PICK_QB : DspMMRel, PICK_QB_ENC, PICK_QB_DESC;
+def PICK_PH : DspMMRel, PICK_PH_ENC, PICK_PH_DESC;
+def LWX : DspMMRel, LWX_ENC, LWX_DESC;
+def LHX : DspMMRel, LHX_ENC, LHX_DESC;
+def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC;
+// BPOSGE32 is additionally predicated on NotInMicroMips.
+let AdditionalPredicates = [NotInMicroMips] in {
+ def BPOSGE32 : DspMMRel, BPOSGE32_ENC, BPOSGE32_DESC;
+}
+def INSV : DspMMRel, INSV_ENC, INSV_DESC;
+def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC;
+def EXTPV : DspMMRel, EXTPV_ENC, EXTPV_DESC;
+def EXTPDP : DspMMRel, EXTPDP_ENC, EXTPDP_DESC;
+def EXTPDPV : DspMMRel, EXTPDPV_ENC, EXTPDPV_DESC;
+def EXTR_W : DspMMRel, EXTR_W_ENC, EXTR_W_DESC;
+def EXTRV_W : DspMMRel, EXTRV_W_ENC, EXTRV_W_DESC;
+def EXTR_R_W : DspMMRel, EXTR_R_W_ENC, EXTR_R_W_DESC;
+def EXTRV_R_W : DspMMRel, EXTRV_R_W_ENC, EXTRV_R_W_DESC;
+def EXTR_RS_W : DspMMRel, EXTR_RS_W_ENC, EXTR_RS_W_DESC;
+def EXTRV_RS_W : DspMMRel, EXTRV_RS_W_ENC, EXTRV_RS_W_DESC;
+def EXTR_S_H : DspMMRel, EXTR_S_H_ENC, EXTR_S_H_DESC;
+def EXTRV_S_H : DspMMRel, EXTRV_S_H_ENC, EXTRV_S_H_DESC;
+def SHILO : DspMMRel, SHILO_ENC, SHILO_DESC;
+def SHILOV : DspMMRel, SHILOV_ENC, SHILOV_DESC;
+def MTHLIP : DspMMRel, MTHLIP_ENC, MTHLIP_DESC;
+def RDDSP : DspMMRel, RDDSP_ENC, RDDSP_DESC;
+// WRDSP is predicated on NotInMicroMips and, unlike the other defs in this
+// list, does not inherit DspMMRel.
+let AdditionalPredicates = [NotInMicroMips] in {
+ def WRDSP : WRDSP_ENC, WRDSP_DESC;
+}
+
+// MIPS DSP Rev 2
+// Same DspMMRel + *_ENC + *_DESC composition as the Rev 1 defs, with
+// ISA_DSPR2 added to every def.
+def ADDU_PH : DspMMRel, ADDU_PH_ENC, ADDU_PH_DESC, ISA_DSPR2;
+def ADDU_S_PH : DspMMRel, ADDU_S_PH_ENC, ADDU_S_PH_DESC, ISA_DSPR2;
+def SUBU_PH : DspMMRel, SUBU_PH_ENC, SUBU_PH_DESC, ISA_DSPR2;
+def SUBU_S_PH : DspMMRel, SUBU_S_PH_ENC, SUBU_S_PH_DESC, ISA_DSPR2;
+def CMPGDU_EQ_QB : DspMMRel, CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC, ISA_DSPR2;
+def CMPGDU_LT_QB : DspMMRel, CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC, ISA_DSPR2;
+def CMPGDU_LE_QB : DspMMRel, CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC, ISA_DSPR2;
+def ABSQ_S_QB : DspMMRel, ABSQ_S_QB_ENC, ABSQ_S_QB_DESC, ISA_DSPR2;
+def ADDUH_QB : DspMMRel, ADDUH_QB_ENC, ADDUH_QB_DESC, ISA_DSPR2;
+def ADDUH_R_QB : DspMMRel, ADDUH_R_QB_ENC, ADDUH_R_QB_DESC, ISA_DSPR2;
+def SUBUH_QB : DspMMRel, SUBUH_QB_ENC, SUBUH_QB_DESC, ISA_DSPR2;
+def SUBUH_R_QB : DspMMRel, SUBUH_R_QB_ENC, SUBUH_R_QB_DESC, ISA_DSPR2;
+def ADDQH_PH : DspMMRel, ADDQH_PH_ENC, ADDQH_PH_DESC, ISA_DSPR2;
+def ADDQH_R_PH : DspMMRel, ADDQH_R_PH_ENC, ADDQH_R_PH_DESC, ISA_DSPR2;
+def SUBQH_PH : DspMMRel, SUBQH_PH_ENC, SUBQH_PH_DESC, ISA_DSPR2;
+def SUBQH_R_PH : DspMMRel, SUBQH_R_PH_ENC, SUBQH_R_PH_DESC, ISA_DSPR2;
+def ADDQH_W : DspMMRel, ADDQH_W_ENC, ADDQH_W_DESC, ISA_DSPR2;
+def ADDQH_R_W : DspMMRel, ADDQH_R_W_ENC, ADDQH_R_W_DESC, ISA_DSPR2;
+def SUBQH_W : DspMMRel, SUBQH_W_ENC, SUBQH_W_DESC, ISA_DSPR2;
+def SUBQH_R_W : DspMMRel, SUBQH_R_W_ENC, SUBQH_R_W_DESC, ISA_DSPR2;
+def MUL_PH : DspMMRel, MUL_PH_ENC, MUL_PH_DESC, ISA_DSPR2;
+def MUL_S_PH : DspMMRel, MUL_S_PH_ENC, MUL_S_PH_DESC, ISA_DSPR2;
+def MULQ_S_W : DspMMRel, MULQ_S_W_ENC, MULQ_S_W_DESC, ISA_DSPR2;
+def MULQ_RS_W : DspMMRel, MULQ_RS_W_ENC, MULQ_RS_W_DESC, ISA_DSPR2;
+def MULQ_S_PH : DspMMRel, MULQ_S_PH_ENC, MULQ_S_PH_DESC, ISA_DSPR2;
+def DPA_W_PH : DspMMRel, DPA_W_PH_ENC, DPA_W_PH_DESC, ISA_DSPR2;
+def DPS_W_PH : DspMMRel, DPS_W_PH_ENC, DPS_W_PH_DESC, ISA_DSPR2;
+def DPAQX_S_W_PH : DspMMRel, DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC, ISA_DSPR2;
+def DPAQX_SA_W_PH : DspMMRel, DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC, ISA_DSPR2;
+def DPAX_W_PH : DspMMRel, DPAX_W_PH_ENC, DPAX_W_PH_DESC, ISA_DSPR2;
+def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC, ISA_DSPR2;
+def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC, ISA_DSPR2;
+def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC, ISA_DSPR2;
+def MULSA_W_PH : DspMMRel, MULSA_W_PH_ENC, MULSA_W_PH_DESC, ISA_DSPR2;
+def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC, ISA_DSPR2;
+def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC, ISA_DSPR2;
+def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC, ISA_DSPR2;
+def SHRA_QB : DspMMRel, SHRA_QB_ENC, SHRA_QB_DESC, ISA_DSPR2;
+def SHRAV_QB : DspMMRel, SHRAV_QB_ENC, SHRAV_QB_DESC, ISA_DSPR2;
+def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC, ISA_DSPR2;
+def SHRAV_R_QB : DspMMRel, SHRAV_R_QB_ENC, SHRAV_R_QB_DESC, ISA_DSPR2;
+def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC, ISA_DSPR2;
+def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC, ISA_DSPR2;
+def APPEND : DspMMRel, APPEND_ENC, APPEND_DESC, ISA_DSPR2;
+def BALIGN : DspMMRel, BALIGN_ENC, BALIGN_DESC, ISA_DSPR2;
+def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC, ISA_DSPR2;
+
+// Pseudos.
+// All four defs are marked isPseudo and isCodeGenOnly and carry no
+// scheduling info. The accumulator pseudos are given an empty assembly
+// string; the ccond pseudos are given a descriptive one.
+let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
+ // Pseudo instructions for loading and storing accumulator registers.
+ def LOAD_ACC64DSP : Load<"", ACC64DSPOpnd>;
+ def STORE_ACC64DSP : Store<"", ACC64DSPOpnd>;
+
+ // Pseudos for loading and storing ccond field of DSP control register.
+ def LOAD_CCOND_DSP : Load<"load_ccond_dsp", DSPCC>;
+ def STORE_CCOND_DSP : Store<"store_ccond_dsp", DSPCC>;
+}
+
+// Pseudo CMP and PICK instructions.
+// PseudoCMP models the compare result as an explicit DSPCC output ($cmp).
+// It has no selection pattern of its own ([]) and expands to RealInst with
+// the same $rs/$rt operands.
+class PseudoCMP<Instruction RealInst> :
+ PseudoDSP<(outs DSPCC:$cmp), (ins DSPROpnd:$rs, DSPROpnd:$rt), []>,
+ PseudoInstExpansion<(RealInst DSPROpnd:$rs, DSPROpnd:$rt)>, NeverHasSideEffects;
+
+// PseudoPICK takes the DSPCC value ($cmp) as an explicit input next to
+// $rs/$rt. The expansion to RealInst forwards $rd, $rs and $rt only; $cmp is
+// not part of the expanded instruction's operand list.
+class PseudoPICK<Instruction RealInst> :
+ PseudoDSP<(outs DSPROpnd:$rd), (ins DSPCC:$cmp, DSPROpnd:$rs, DSPROpnd:$rt), []>,
+ PseudoInstExpansion<(RealInst DSPROpnd:$rd, DSPROpnd:$rs, DSPROpnd:$rt)>,
+ NeverHasSideEffects;
+
+// One pseudo per real cmp.*.ph / cmpu.*.qb compare and per pick.* form.
+def PseudoCMP_EQ_PH : PseudoCMP<CMP_EQ_PH>;
+def PseudoCMP_LT_PH : PseudoCMP<CMP_LT_PH>;
+def PseudoCMP_LE_PH : PseudoCMP<CMP_LE_PH>;
+def PseudoCMPU_EQ_QB : PseudoCMP<CMPU_EQ_QB>;
+def PseudoCMPU_LT_QB : PseudoCMP<CMPU_LT_QB>;
+def PseudoCMPU_LE_QB : PseudoCMP<CMPU_LE_QB>;
+
+def PseudoPICK_PH : PseudoPICK<PICK_PH>;
+def PseudoPICK_QB : PseudoPICK<PICK_QB>;
+
+// PseudoMTLOHI instantiated with the ACC64DSP and GPR32 register classes.
+def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+
+// Patterns.
+// A Pat guarded by a single predicate, which defaults to HasDSP.
+class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
+ Pat<pattern, result>, Requires<[pred]>;
+
+// Selects a bitconvert from SrcVT (held in SrcRC) to DstVT as a
+// COPY_TO_REGCLASS of the source into DstRC.
+class BitconvertPat<ValueType DstVT, ValueType SrcVT, RegisterClass DstRC,
+ RegisterClass SrcRC> :
+ DSPPat<(DstVT (bitconvert (SrcVT SrcRC:$src))),
+ (COPY_TO_REGCLASS SrcRC:$src, DstRC)>;
+
+// Both directions between i32 (GPR32) and the two DSP vector types (DSPR).
+def : BitconvertPat<i32, v2i16, GPR32, DSPR>;
+def : BitconvertPat<i32, v4i8, GPR32, DSPR>;
+def : BitconvertPat<v2i16, i32, DSPR, GPR32>;
+def : BitconvertPat<v4i8, i32, DSPR, GPR32>;
+
+// v2i16/v4i8 loads are selected as LW with the result copied into DSPR;
+// stores copy the DSPR value into GPR32 and use SW.
+def : DSPPat<(v2i16 (load addr:$a)),
+ (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
+def : DSPPat<(v4i8 (load addr:$a)),
+ (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
+def : DSPPat<(store (v2i16 DSPR:$val), addr:$a),
+ (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
+def : DSPPat<(store (v4i8 DSPR:$val), addr:$a),
+ (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
+
+// Binary operations.
+// Maps (Node a, b) on ValTy operands straight to (Inst a, b), guarded by
+// Pred (HasDSP unless stated otherwise).
+class DSPBinPat<Instruction Inst, ValueType ValTy, SDPatternOperator Node,
+ Predicate Pred = HasDSP> :
+ DSPPat<(Node ValTy:$a, ValTy:$b), (Inst ValTy:$a, ValTy:$b), Pred>;
+
+// Each instruction is matched twice: once for its intrinsic and once for the
+// generic node (add, sub, mul, addc, adde). The MUL_PH patterns require
+// HasDSPR2.
+def : DSPBinPat<ADDQ_PH, v2i16, int_mips_addq_ph>;
+def : DSPBinPat<ADDQ_PH, v2i16, add>;
+def : DSPBinPat<SUBQ_PH, v2i16, int_mips_subq_ph>;
+def : DSPBinPat<SUBQ_PH, v2i16, sub>;
+def : DSPBinPat<MUL_PH, v2i16, int_mips_mul_ph, HasDSPR2>;
+def : DSPBinPat<MUL_PH, v2i16, mul, HasDSPR2>;
+def : DSPBinPat<ADDU_QB, v4i8, int_mips_addu_qb>;
+def : DSPBinPat<ADDU_QB, v4i8, add>;
+def : DSPBinPat<SUBU_QB, v4i8, int_mips_subu_qb>;
+def : DSPBinPat<SUBU_QB, v4i8, sub>;
+def : DSPBinPat<ADDSC, i32, int_mips_addsc>;
+def : DSPBinPat<ADDSC, i32, addc>;
+def : DSPBinPat<ADDWC, i32, int_mips_addwc>;
+def : DSPBinPat<ADDWC, i32, adde>;
+
+// Shift immediate patterns.
+// Maps (Node a, shamt) to (Inst a, shamt), where shamt must satisfy the Imm
+// operator; guarded by Pred (HasDSP unless stated otherwise).
+class DSPShiftPat<Instruction Inst, ValueType ValTy, SDPatternOperator Node,
+ SDPatternOperator Imm, Predicate Pred = HasDSP> :
+ DSPPat<(Node ValTy:$a, Imm:$shamt), (Inst ValTy:$a, Imm:$shamt), Pred>;
+
+// The MipsSH*_DSP node patterns accept any imm; the intrinsic patterns
+// restrict the amount with immZExt4 (.ph) or immZExt3 (.qb). The SHRL_PH and
+// SHRA_QB patterns require HasDSPR2.
+def : DSPShiftPat<SHLL_PH, v2i16, MipsSHLL_DSP, imm>;
+def : DSPShiftPat<SHRA_PH, v2i16, MipsSHRA_DSP, imm>;
+def : DSPShiftPat<SHRL_PH, v2i16, MipsSHRL_DSP, imm, HasDSPR2>;
+def : DSPShiftPat<SHLL_PH, v2i16, int_mips_shll_ph, immZExt4>;
+def : DSPShiftPat<SHRA_PH, v2i16, int_mips_shra_ph, immZExt4>;
+def : DSPShiftPat<SHRL_PH, v2i16, int_mips_shrl_ph, immZExt4, HasDSPR2>;
+def : DSPShiftPat<SHLL_QB, v4i8, MipsSHLL_DSP, imm>;
+def : DSPShiftPat<SHRA_QB, v4i8, MipsSHRA_DSP, imm, HasDSPR2>;
+def : DSPShiftPat<SHRL_QB, v4i8, MipsSHRL_DSP, imm>;
+def : DSPShiftPat<SHLL_QB, v4i8, int_mips_shll_qb, immZExt3>;
+def : DSPShiftPat<SHRA_QB, v4i8, int_mips_shra_qb, immZExt3, HasDSPR2>;
+def : DSPShiftPat<SHRL_QB, v4i8, int_mips_shrl_qb, immZExt3>;
+
+// SETCC/SELECT_CC patterns.
+// All four classes lower to a Cmp of $a and $b whose result feeds Pick; they
+// differ only in which two values Pick chooses between.
+
+// MipsSETCC_DSP with condition CC: Pick receives the result of
+// (ADDiu ZERO, -1) copied into DSPR as its second operand and ZERO as its
+// third.
+class DSPSetCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
+ CondCode CC> :
+ DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
+ (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
+ (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR)),
+ (ValTy ZERO)))>;
+
+// Same as DSPSetCCPat with the two Pick value operands swapped, used for the
+// condition codes that are the negation of what Cmp computes.
+class DSPSetCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
+ CondCode CC> :
+ DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
+ (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
+ (ValTy ZERO),
+ (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR))))>;
+
+// MipsSELECT_CC_DSP with condition CC: Pick chooses between $c and $d.
+class DSPSelectCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
+ CondCode CC> :
+ DSPPat<(ValTy (MipsSELECT_CC_DSP ValTy:$a, ValTy:$b, ValTy:$c, ValTy:$d, CC)),
+ (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)), $c, $d))>;
+
+// Same as DSPSelectCCPat with $c and $d swapped in the Pick operands.
+class DSPSelectCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
+ CondCode CC> :
+ DSPPat<(ValTy (MipsSELECT_CC_DSP ValTy:$a, ValTy:$b, ValTy:$c, ValTy:$d, CC)),
+ (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)), $d, $c))>;
+
+// SETCC instantiations. Only EQ, LT and LE compares are used; NE, GE and GT
+// (and their unsigned counterparts for v4i8) reuse them through the *Inv
+// classes. The v2i16 forms use signed condition codes and the v4i8 forms use
+// unsigned ones.
+def : DSPSetCCPat<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETEQ>;
+def : DSPSetCCPat<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETLT>;
+def : DSPSetCCPat<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETLE>;
+def : DSPSetCCPatInv<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETNE>;
+def : DSPSetCCPatInv<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETGE>;
+def : DSPSetCCPatInv<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETGT>;
+def : DSPSetCCPat<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETEQ>;
+def : DSPSetCCPat<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETULT>;
+def : DSPSetCCPat<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETULE>;
+def : DSPSetCCPatInv<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETNE>;
+def : DSPSetCCPatInv<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETUGE>;
+def : DSPSetCCPatInv<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETUGT>;
+
+// SELECT_CC instantiations, following the same compare/condition-code
+// pairing as the SETCC patterns.
+def : DSPSelectCCPat<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETEQ>;
+def : DSPSelectCCPat<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETLT>;
+def : DSPSelectCCPat<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETLE>;
+def : DSPSelectCCPatInv<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETNE>;
+def : DSPSelectCCPatInv<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETGE>;
+def : DSPSelectCCPatInv<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETGT>;
+def : DSPSelectCCPat<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETEQ>;
+def : DSPSelectCCPat<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETULT>;
+def : DSPSelectCCPat<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETULE>;
+def : DSPSelectCCPatInv<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETNE>;
+def : DSPSelectCCPatInv<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETUGE>;
+def : DSPSelectCCPatInv<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETUGT>;
+
+// Extr patterns.
+// Both classes match (OpNode <amount>, ACC64DSP:$ac) producing an i32 and
+// emit Instr with the accumulator first and the amount second.
+
+// Register form: the first node operand is a GPR32 register ($rs).
+class EXTR_W_TY1_R2_Pat<SDPatternOperator OpNode, Instruction Instr> :
+ DSPPat<(i32 (OpNode GPR32:$rs, ACC64DSP:$ac)),
+ (Instr ACC64DSP:$ac, GPR32:$rs)>;
+
+// Immediate form: the first node operand is matched by immZExt5 ($shift).
+class EXTR_W_TY1_R1_Pat<SDPatternOperator OpNode, Instruction Instr> :
+ DSPPat<(i32 (OpNode immZExt5:$shift, ACC64DSP:$ac)),
+ (Instr ACC64DSP:$ac, immZExt5:$shift)>;
+
+// Each node gets an immediate pattern (R1) and a register pattern (R2); the
+// register variants select the corresponding "V" instruction.
+def : EXTR_W_TY1_R1_Pat<MipsEXTP, EXTP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTP, EXTPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTPDP, EXTPDP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTPDP, EXTPDPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_W, EXTR_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_W, EXTRV_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_R_W, EXTR_R_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_R_W, EXTRV_R_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_RS_W, EXTR_RS_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_RS_W, EXTRV_RS_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_S_H, EXTR_S_H>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_S_H, EXTRV_S_H>;
+
+// Indexed load patterns.
+// Selects an i32 load whose address is (add $base, $index) to the indexed
+// load instruction Instr, passing base and index as separate operands.
+class IndexedLoadPat<SDPatternOperator LoadNode, Instruction Instr> :
+ DSPPat<(i32 (LoadNode (add i32:$base, i32:$index))),
+ (Instr i32:$base, i32:$index)>;
+
+// AddedComplexity is raised to 20 for these patterns.
+// NOTE(review): presumably so they are preferred over the generic load
+// patterns when the address is an add - confirm.
+let AddedComplexity = 20 in {
+ def : IndexedLoadPat<zextloadi8, LBUX>;
+ def : IndexedLoadPat<sextloadi16, LHX>;
+ def : IndexedLoadPat<load, LWX>;
+}
+
+// Instruction alias.
+// "wrdsp $rt" without an explicit mask is an alias for WRDSP with the mask
+// operand set to 0x1F. The alias is only defined when NotInMicroMips holds.
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : DSPInstAlias<"wrdsp $rt", (WRDSP GPR32Opnd:$rt, 0x1F), 1>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
new file mode 100644
index 000000000000..c821084f68cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -0,0 +1,891 @@
+//===-- MipsDelaySlotFiller.cpp - Mips Delay Slot Filler ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple pass to fill delay slots with useful instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsMCNaCl.h"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+// Pass statistics. FilledSlots counts every delay slot the pass handles;
+// UsefulSlots counts only those that received a real instruction.
+STATISTIC(FilledSlots, "Number of delay slots filled");
+STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
+ " are not NOP.");
+
+// Hidden option: when set, no search for a filler instruction is performed.
+// Off by default.
+static cl::opt<bool> DisableDelaySlotFiller(
+ "disable-mips-delay-filler",
+ cl::init(false),
+ cl::desc("Fill all delay slots with NOPs."),
+ cl::Hidden);
+
+// Hidden option controlling the forward search. Note that it is initialised
+// to true, i.e. the forward search is disabled unless explicitly enabled.
+static cl::opt<bool> DisableForwardSearch(
+ "disable-mips-df-forward-search",
+ cl::init(true),
+ cl::desc("Disallow MIPS delay filler to search forward."),
+ cl::Hidden);
+
+// Hidden option controlling the successor-block search. Also initialised to
+// true, so this search is disabled unless explicitly enabled.
+static cl::opt<bool> DisableSuccBBSearch(
+ "disable-mips-df-succbb-search",
+ cl::init(true),
+ cl::desc("Disallow MIPS delay filler to search successor basic blocks."),
+ cl::Hidden);
+
+// Hidden option controlling the backward search, which is enabled by
+// default.
+static cl::opt<bool> DisableBackwardSearch(
+ "disable-mips-df-backward-search",
+ cl::init(false),
+ cl::desc("Disallow MIPS delay filler to search backward."),
+ cl::Hidden);
+
+/// Values accepted by the -mips-compact-branches option.
+enum CompactBranchPolicy {
+ CB_Never, ///< The policy 'never' may in some circumstances or for some
+ ///< ISAs not be absolutely adhered to.
+ CB_Optimal, ///< Optimal is the default and will produce compact branches
+ ///< when delay slots cannot be filled.
+ CB_Always ///< The policy 'always' may in some circumstances not be
+ ///< absolutely adhered to, as there may not be a corresponding
+ ///< compact form of a branch.
+};
+
+// Command-line selection of the compact branch policy. The option is
+// optional and defaults to CB_Optimal; it accepts "never", "optimal" and
+// "always".
+static cl::opt<CompactBranchPolicy> MipsCompactBranchPolicy(
+ "mips-compact-branches",cl::Optional,
+ cl::init(CB_Optimal),
+ cl::desc("MIPS Specific: Compact branch policy."),
+ cl::values(
+ clEnumValN(CB_Never, "never", "Do not use compact branches if possible."),
+ clEnumValN(CB_Optimal, "optimal", "Use compact branches where appropiate (default)."),
+ clEnumValN(CB_Always, "always", "Always use compact branches if possible.")
+ )
+);
+
+namespace {
+  // Shorthand for the iterator types used throughout this pass, and for the
+  // map from a predecessor block to the branch whose delay slot is to be
+  // filled.
+  using Iter = MachineBasicBlock::iterator;
+  using ReverseIter = MachineBasicBlock::reverse_iterator;
+  using BB2BrMap = SmallDenseMap<MachineBasicBlock *, MachineInstr *, 2>;
+
+ /// Tracks, as two bit vectors indexed by register number, the registers
+ /// defined and used by the instructions examined so far, and reports
+ /// whether a new instruction conflicts with them.
+ class RegDefsUses {
+ public:
+ RegDefsUses(const TargetRegisterInfo &TRI);
+
+ /// Seed the sets from the instruction that owns the delay slot.
+ void init(const MachineInstr &MI);
+
+ /// This function sets all caller-saved registers in Defs.
+ void setCallerSaved(const MachineInstr &MI);
+
+ /// This function sets all unallocatable registers in Defs.
+ void setUnallocatableRegs(const MachineFunction &MF);
+
+ /// Set bits in Uses corresponding to MBB's live-out registers except for
+ /// the registers that are live-in to SuccBB.
+ void addLiveOut(const MachineBasicBlock &MBB,
+ const MachineBasicBlock &SuccBB);
+
+ /// Examine operands [Begin, End) of MI, add their registers to Defs/Uses
+ /// and return true if any of them conflicts with the sets as they were
+ /// before the call.
+ bool update(const MachineInstr &MI, unsigned Begin, unsigned End);
+
+ private:
+ /// Record Reg in NewDefs or NewUses (depending on IsDef) and return true
+ /// if it conflicts with the current Defs/Uses.
+ bool checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses, unsigned Reg,
+ bool IsDef) const;
+
+ /// Returns true if Reg or its alias is in RegSet.
+ bool isRegInSet(const BitVector &RegSet, unsigned Reg) const;
+
+ const TargetRegisterInfo &TRI;
+ BitVector Defs, Uses;
+ };
+
+  /// Common interface for the memory-hazard checkers. hasHazard() performs
+  /// the checks shared by all checkers and then defers to the subclass hook
+  /// hasHazard_().
+  class InspectMemInstr {
+  public:
+    InspectMemInstr(bool ForbidMemInstr_)
+        : OrigSeenLoad(false),
+          OrigSeenStore(false),
+          SeenLoad(false),
+          SeenStore(false),
+          ForbidMemInstr(ForbidMemInstr_) {}
+
+    virtual ~InspectMemInstr() = default;
+
+    /// Return true if MI cannot be moved to delay slot.
+    bool hasHazard(const MachineInstr &MI);
+
+  protected:
+    /// Whether a load/store had been seen before the instruction currently
+    /// being inspected (Orig*), and whether one has been seen including it.
+    bool OrigSeenLoad, OrigSeenStore, SeenLoad, SeenStore;
+
+    /// When true, no memory instruction may be moved to a delay slot.
+    bool ForbidMemInstr;
+
+  private:
+    /// Subclass-specific part of the hazard check.
+    virtual bool hasHazard_(const MachineInstr &MI) = 0;
+  };
+
+  /// Checker that treats every memory instruction as a hazard: it constructs
+  /// the base with ForbidMemInstr set, and its hook reports a hazard
+  /// unconditionally.
+  class NoMemInstr : public InspectMemInstr {
+  public:
+    NoMemInstr() : InspectMemInstr(/*ForbidMemInstr_=*/true) {}
+
+  private:
+    bool hasHazard_(const MachineInstr &) override { return true; }
+  };
+
+ /// This subclass accepts loads from stacks and constant loads.
+ /// It constructs the base class with ForbidMemInstr cleared, so the decision
+ /// is made per instruction by hasHazard_().
+ class LoadFromStackOrConst : public InspectMemInstr {
+ public:
+ LoadFromStackOrConst() : InspectMemInstr(false) {}
+ private:
+ bool hasHazard_(const MachineInstr &MI) override;
+ };
+
+ /// This subclass uses memory dependence information to determine whether a
+ /// memory instruction can be moved to a delay slot.
+ class MemDefsUses : public InspectMemInstr {
+ public:
+ MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI);
+
+ private:
+ /// An underlying memory object: either an IR value or a pseudo source
+ /// value.
+ typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+
+ bool hasHazard_(const MachineInstr &MI) override;
+
+ /// Update Defs and Uses. Return true if there exist dependences that
+ /// disqualify the delay slot candidate between V and values in Uses and
+ /// Defs.
+ bool updateDefsUses(ValueType V, bool MayStore);
+
+ /// Get the list of underlying objects of MI's memory operand.
+ /// Returns false when no usable list could be produced.
+ bool getUnderlyingObjects(const MachineInstr &MI,
+ SmallVectorImpl<ValueType> &Objects) const;
+
+ const MachineFrameInfo *MFI;
+ /// Objects read (Uses) and written (Defs) by the instructions seen so far.
+ SmallPtrSet<ValueType, 4> Uses, Defs;
+ const DataLayout &DL;
+
+ /// Flags indicating whether loads or stores with no underlying objects have
+ /// been seen.
+ bool SeenNoObjLoad, SeenNoObjStore;
+ };
+
+  /// Machine function pass that visits every basic block and deals with each
+  /// instruction that still has an unoccupied delay slot.
+  class Filler : public MachineFunctionPass {
+  public:
+    Filler(TargetMachine &tm) : MachineFunctionPass(ID), TM(tm) {}
+
+    StringRef getPassName() const override { return "Mips Delay Slot Filler"; }
+
+    bool runOnMachineFunction(MachineFunction &F) override {
+      bool Changed = false;
+      for (MachineBasicBlock &MBB : F)
+        Changed |= runOnMachineBasicBlock(MBB);
+
+      // Reordering instructions to fill delay slots invalidates liveness
+      // information; without this, -verify-machineinstrs will fail.
+      if (Changed)
+        F.getRegInfo().invalidateLiveness();
+
+      return Changed;
+    }
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    /// Handle every unoccupied delay slot in MBB; returns true if MBB was
+    /// changed.
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+    /// Replace Branch with its compact form and return an iterator to the
+    /// replacement.
+    Iter replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
+                                  const DebugLoc &DL);
+
+    /// Returns true if Candidate must NOT be moved to the delay slot. Memory
+    /// and register dependence information is updated as a side effect.
+    bool delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                        InspectMemInstr &IM) const;
+
+    /// Search [Begin, End) for an instruction that can be moved to the delay
+    /// slot. On success returns true and stores its position in Filler.
+    template <typename IterTy>
+    bool searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+                     RegDefsUses &RegDU, InspectMemInstr &IM, Iter Slot,
+                     IterTy &Filler) const;
+
+    /// Search backward from Slot for an instruction that can be moved to the
+    /// delay slot. Returns true on success.
+    bool searchBackward(MachineBasicBlock &MBB, MachineInstr &Slot) const;
+
+    /// Search MBB forward from Slot for an instruction that can be moved to
+    /// the delay slot. Returns true on success.
+    bool searchForward(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// Search one of MBB's successor blocks for an instruction that can be
+    /// moved to the delay slot, inserting clones of it into the successor's
+    /// predecessor blocks.
+    bool searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// Pick a successor block of MBB. Returns NULL if MBB doesn't have a
+    /// successor block that is not a landing pad.
+    MachineBasicBlock *selectSuccBB(MachineBasicBlock &B) const;
+
+    /// Analyze MBB and return an instruction with an unoccupied slot that
+    /// branches to Dst.
+    std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+    getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const;
+
+    /// Examine Pred and see whether an instruction can be inserted into the
+    /// delay slot of one of its branches or at its end.
+    bool examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                     RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                     BB2BrMap &BrMap) const;
+
+    /// Returns true if the search must stop at Candidate.
+    bool terminateSearch(const MachineInstr &Candidate) const;
+
+    TargetMachine &TM;
+
+    static char ID;
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// Returns true when MI has a delay slot and is not yet bundled with a
+/// succeeding instruction.
+static bool hasUnoccupiedSlot(const MachineInstr *MI) {
+  if (!MI->hasDelaySlot())
+    return false;
+  return !MI->isBundledWithSucc();
+}
+
+/// Clone Filler once for every entry of BrMap. When the entry names a branch,
+/// the clone is appended to that branch's bundle and counted as a useful
+/// slot; otherwise it is inserted at the end of the predecessor block.
+static void insertDelayFiller(Iter Filler, const BB2BrMap &BrMap) {
+  MachineFunction *MF = Filler->getParent()->getParent();
+
+  for (const auto &Entry : BrMap) {
+    MachineInstr *Clone = MF->CloneMachineInstr(&*Filler);
+
+    if (MachineInstr *Branch = Entry.second) {
+      MIBundleBuilder(Branch).append(Clone);
+      ++UsefulSlots;
+      continue;
+    }
+
+    MachineBasicBlock *Pred = Entry.first;
+    Pred->insert(Pred->end(), Clone);
+  }
+}
+
+/// Add every register defined by Filler to MBB's live-in list, skipping
+/// registers that are already live-in.
+static void addLiveInRegs(Iter Filler, MachineBasicBlock &MBB) {
+  for (const MachineOperand &MO : Filler->operands()) {
+    // Only non-null register definitions are of interest.
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+
+    const unsigned R = MO.getReg();
+    if (!R)
+      continue;
+
+#ifndef NDEBUG
+    const MachineFunction &MF = *MBB.getParent();
+    assert(MF.getSubtarget().getRegisterInfo()->getAllocatableSet(MF).test(R) &&
+           "Shouldn't move an instruction with unallocatable registers across "
+           "basic block boundaries.");
+#endif
+
+    if (MBB.isLiveIn(R))
+      continue;
+    MBB.addLiveIn(R);
+  }
+}
+
+/// Both register sets start out empty, with one bit per target register.
+RegDefsUses::RegDefsUses(const TargetRegisterInfo &TRI)
+    : TRI(TRI),
+      Defs(TRI.getNumRegs()),
+      Uses(TRI.getNumRegs()) {}
+
+/// Seed Defs/Uses from MI, the instruction whose delay slot is being filled.
+void RegDefsUses::init(const MachineInstr &MI) {
+  // The operands described by the instruction descriptor are the explicit,
+  // non-variadic ones.
+  const unsigned NumDescOperands = MI.getDesc().getNumOperands();
+  update(MI, 0, NumDescOperands);
+
+  // A call defines RA; marking it keeps users of RA out of the delay slot.
+  if (MI.isCall())
+    Defs.set(Mips::RA);
+
+  if (!MI.isBranch())
+    return;
+
+  // For branches the remaining (implicit) register operands are added as
+  // well, with the exception of register AT.
+  update(MI, NumDescOperands, MI.getNumOperands());
+  Defs.reset(Mips::AT);
+}
+
+/// Mark as defined every register that is not callee-saved (ZERO and ZERO_64
+/// excepted). MI must be a call.
+void RegDefsUses::setCallerSaved(const MachineInstr &MI) {
+  assert(MI.isCall());
+
+  // RA/RA_64 must not be changed in the delay slot, otherwise the callee
+  // could not return to the caller. Treat both as defined when the call
+  // defines either of them.
+  const bool DefinesRA =
+      MI.definesRegister(Mips::RA) || MI.definesRegister(Mips::RA_64);
+  if (DefinesRA) {
+    Defs.set(Mips::RA);
+    Defs.set(Mips::RA_64);
+  }
+
+  // Start with every register set, then clear the zero registers and each
+  // callee-saved register together with its aliases.
+  BitVector Clobbered(TRI.getNumRegs(), true);
+  Clobbered.reset(Mips::ZERO);
+  Clobbered.reset(Mips::ZERO_64);
+
+  const MachineFunction *MF = MI.getParent()->getParent();
+  for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(MF); *CSR; ++CSR)
+    for (MCRegAliasIterator AI(*CSR, &TRI, true); AI.isValid(); ++AI)
+      Clobbered.reset(*AI);
+
+  Defs |= Clobbered;
+}
+
+/// Mark as defined every register that is neither allocatable, nor an alias
+/// of an allocatable register, nor ZERO/ZERO_64.
+void RegDefsUses::setUnallocatableRegs(const MachineFunction &MF) {
+  BitVector Allowed = TRI.getAllocatableSet(MF);
+
+  // Extend the allocatable set with the aliases of each of its members.
+  int R = Allowed.find_first();
+  while (R != -1) {
+    for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI)
+      Allowed.set(*AI);
+    R = Allowed.find_next(R);
+  }
+
+  Allowed.set(Mips::ZERO);
+  Allowed.set(Mips::ZERO_64);
+
+  // Everything outside the allowed set is treated as already defined.
+  Allowed.flip();
+  Defs |= Allowed;
+}
+
+/// Add to Uses the live-in registers of every successor of MBB other than
+/// SuccBB.
+void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB,
+                             const MachineBasicBlock &SuccBB) {
+  for (const MachineBasicBlock *Other : MBB.successors()) {
+    if (Other == &SuccBB)
+      continue;
+
+    for (const auto &LiveIn : Other->liveins())
+      Uses.set(LiveIn.PhysReg);
+  }
+}
+
+/// Check the register operands [Begin, End) of MI against the current
+/// Defs/Uses, then merge them into those sets. Returns true if any operand
+/// conflicted. The sets are only updated after all operands were checked, so
+/// operands of MI never conflict with each other.
+bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) {
+  const unsigned NumRegs = TRI.getNumRegs();
+  BitVector NewDefs(NumRegs), NewUses(NumRegs);
+  bool HasHazard = false;
+
+  for (unsigned Idx = Begin; Idx != End; ++Idx) {
+    const MachineOperand &MO = MI.getOperand(Idx);
+    if (!MO.isReg())
+      continue;
+
+    const unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+
+    if (checkRegDefsUses(NewDefs, NewUses, Reg, MO.isDef()))
+      HasHazard = true;
+  }
+
+  Defs |= NewDefs;
+  Uses |= NewUses;
+
+  return HasHazard;
+}
+
+/// Record Reg in NewUses or NewDefs and report whether it conflicts with the
+/// registers seen so far.
+bool RegDefsUses::checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses,
+                                   unsigned Reg, bool IsDef) const {
+  if (!IsDef) {
+    // A use conflicts only with an earlier definition.
+    NewUses.set(Reg);
+    return isRegInSet(Defs, Reg);
+  }
+
+  // A definition conflicts with an earlier definition or an earlier use.
+  NewDefs.set(Reg);
+  if (isRegInSet(Defs, Reg))
+    return true;
+  return isRegInSet(Uses, Reg);
+}
+
+/// Returns true if Reg itself or any register aliasing it is set in RegSet.
+bool RegDefsUses::isRegInSet(const BitVector &RegSet, unsigned Reg) const {
+  // The iterator is asked to include Reg itself in the walk.
+  MCRegAliasIterator AI(Reg, &TRI, true);
+  while (AI.isValid()) {
+    if (RegSet.test(*AI))
+      return true;
+    ++AI;
+  }
+  return false;
+}
+
+/// Returns true if MI must not be moved to a delay slot because of its
+/// memory behaviour. Instructions that neither load nor store never have a
+/// hazard.
+bool InspectMemInstr::hasHazard(const MachineInstr &MI) {
+  const bool Stores = MI.mayStore();
+  const bool Loads = MI.mayLoad();
+  if (!Stores && !Loads)
+    return false;
+
+  if (ForbidMemInstr)
+    return true;
+
+  // Remember what had been seen before MI, then account for MI itself.
+  OrigSeenLoad = SeenLoad;
+  OrigSeenStore = SeenStore;
+  SeenLoad |= Loads;
+  SeenStore |= Stores;
+
+  // An ordered or volatile memory reference that follows an earlier memory
+  // access is a hazard, and from then on no memory instruction may be moved.
+  const bool SeenMemBefore = OrigSeenLoad || OrigSeenStore;
+  if (MI.hasOrderedMemoryRef() && SeenMemBefore) {
+    ForbidMemInstr = true;
+    return true;
+  }
+
+  return hasHazard_(MI);
+}
+
+/// Only a load with a single memory operand that carries a pseudo source
+/// value can be free of hazards: fixed-stack values always are, other pseudo
+/// values only when they are constant or refer to the stack.
+bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
+  if (MI.mayStore())
+    return true;
+
+  if (!MI.hasOneMemOperand())
+    return true;
+
+  const PseudoSourceValue *PSV = (*MI.memoperands_begin())->getPseudoValue();
+  if (!PSV)
+    return true;
+
+  if (isa<FixedStackPseudoSourceValue>(PSV))
+    return false;
+
+  return !(PSV->isConstant(nullptr) || PSV->isStack());
+}
+
+/// Memory instructions are not forbidden outright (the base class is built
+/// with ForbidMemInstr = false) and no object-less load or store has been
+/// seen yet.
+MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_)
+ : InspectMemInstr(false), MFI(MFI_), DL(DL), SeenNoObjLoad(false),
+ SeenNoObjStore(false) {}
+
+/// Decide whether MI conflicts with the memory accesses seen so far, and
+/// record MI's own accesses.
+bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
+  SmallVector<ValueType, 4> Objs;
+
+  // When the underlying objects are known, check each one against the
+  // recorded uses and definitions. Every object is recorded even after a
+  // hazard has been found.
+  if (getUnderlyingObjects(MI, Objs)) {
+    bool ObjHazard = false;
+    for (ValueType Obj : Objs)
+      ObjHazard |= updateDefsUses(Obj, MI.mayStore());
+    return ObjHazard;
+  }
+
+  // No underlying objects found: a store conflicts with any earlier memory
+  // access, and a load (or any earlier store) is always reported as a hazard.
+  bool HasHazard = MI.mayStore() && (OrigSeenLoad || OrigSeenStore);
+  HasHazard |= MI.mayLoad() || OrigSeenStore;
+
+  SeenNoObjLoad |= MI.mayLoad();
+  SeenNoObjStore |= MI.mayStore();
+
+  return HasHazard;
+}
+
+/// Record an access to V and return true if it conflicts with an earlier
+/// access.
+bool MemDefsUses::updateDefsUses(ValueType V, bool MayStore) {
+  if (!MayStore) {
+    // A load conflicts with an earlier store to V or with any earlier store
+    // whose object is unknown.
+    Uses.insert(V);
+    return Defs.count(V) || SeenNoObjStore;
+  }
+
+  // A store conflicts with an earlier store or load of V and with any
+  // earlier access whose object is unknown. V is recorded unconditionally.
+  const bool AlreadyDefined = !Defs.insert(V).second;
+  return AlreadyDefined || Uses.count(V) || SeenNoObjStore || SeenNoObjLoad;
+}
+
+/// Collect the underlying objects of MI's memory operand into Objects.
+/// Returns true on success. Returns false when MI does not satisfy
+/// hasOneMemOperand(), when the operand has neither an IR value nor a pseudo
+/// source value, when the pseudo value's isAliased(MFI) is false, or when the
+/// identified-object check below fails.
+bool MemDefsUses::
+getUnderlyingObjects(const MachineInstr &MI,
+ SmallVectorImpl<ValueType> &Objects) const {
+ if (!MI.hasOneMemOperand() ||
+ (!(*MI.memoperands_begin())->getValue() &&
+ !(*MI.memoperands_begin())->getPseudoValue()))
+ return false;
+
+ // A pseudo source value is its own (single) underlying object.
+ if (const PseudoSourceValue *PSV =
+ (*MI.memoperands_begin())->getPseudoValue()) {
+ if (!PSV->isAliased(MFI))
+ return false;
+ Objects.push_back(PSV);
+ return true;
+ }
+
+ const Value *V = (*MI.memoperands_begin())->getValue();
+
+ SmallVector<Value *, 4> Objs;
+ GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);
+
+ for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), E = Objs.end();
+ I != E; ++I) {
+ // NOTE(review): this tests V, the memory operand's value, rather than the
+ // underlying object *I being visited, so the outcome is the same on every
+ // iteration. Confirm whether isIdentifiedObject(*I) was intended; the
+ // current form appears to be the more conservative of the two.
+ if (!isIdentifiedObject(V))
+ return false;
+
+ Objects.push_back(*I);
+ }
+
+ return true;
+}
+
+// Replace Branch with its equivalent compact branch instruction. The
+// instruction following the newly generated one is erased, and an iterator
+// to the new instruction is returned. DL is not used.
+Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
+                                      const DebugLoc &DL) {
+  const MipsInstrInfo *TII =
+      MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
+
+  const unsigned CompactOpc = TII->getEquivalentCompactForm(Branch);
+  Branch = TII->genInstrWithNewOpc(CompactOpc, Branch);
+
+  std::next(Branch)->eraseFromParent();
+  return Branch;
+}
+
+// For the given opcode, returns the opcode of the corresponding instruction
+// with a short delay slot.
+// For the pseudo TAILCALL*_MM instructions, return the short delay slot
+// form. Unfortunately, TAILCALL<->b16 is denied as b16 has a limited range
+// that is too short to make use of for tail calls.
+// Any opcode not listed below, as well as TAILCALL_MM, is treated as
+// unreachable.
+static int getEquivalentCallShort(int Opcode) {
+ switch (Opcode) {
+ case Mips::BGEZAL:
+ return Mips::BGEZALS_MM;
+ case Mips::BLTZAL:
+ return Mips::BLTZALS_MM;
+ case Mips::JAL:
+ return Mips::JALS_MM;
+ case Mips::JALR:
+ return Mips::JALRS_MM;
+ case Mips::JALR16_MM:
+ return Mips::JALRS16_MM;
+ case Mips::TAILCALL_MM:
+ llvm_unreachable("Attempting to shorten the TAILCALL_MM pseudo!");
+ case Mips::TAILCALLREG:
+ return Mips::JR16_MM;
+ default:
+ llvm_unreachable("Unexpected call instruction for microMIPS.");
+ }
+}
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// We assume there is only one delay slot per delayed instruction.
+///
+/// For each instruction with an unoccupied delay slot the pass tries, in
+/// order: moving a real instruction into the slot, replacing the instruction
+/// with a compact form that needs no slot, and finally bundling a NOP.
+/// Returns true if the block contained at least one such instruction.
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
+ bool InMicroMipsMode = STI.inMicroMipsMode();
+ const MipsInstrInfo *TII = STI.getInstrInfo();
+
+ if (InMicroMipsMode && STI.hasMips32r6()) {
+ // This is microMIPS32r6 or microMIPS64r6 processor. Delay slot for
+ // branching instructions is not needed.
+ return Changed;
+ }
+
+ for (Iter I = MBB.begin(); I != MBB.end(); ++I) {
+ if (!hasUnoccupiedSlot(&*I))
+ continue;
+
+ // Every path below deals with the slot one way or another, so it is
+ // counted and the block is marked as changed up front.
+ ++FilledSlots;
+ Changed = true;
+
+ // Delay slot filling is disabled at -O0.
+ if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
+ bool Filled = false;
+
+ // Under the 'always' policy the search is skipped when a compact form
+ // exists, so that the compact form is used below instead.
+ if (MipsCompactBranchPolicy.getValue() != CB_Always ||
+ !TII->getEquivalentCompactForm(I)) {
+ // Search order: backward in this block, then successor blocks for
+ // terminators, or forward in this block for non-terminators.
+ if (searchBackward(MBB, *I)) {
+ Filled = true;
+ } else if (I->isTerminator()) {
+ if (searchSuccBBs(MBB, I)) {
+ Filled = true;
+ }
+ } else if (searchForward(MBB, I)) {
+ Filled = true;
+ }
+ }
+
+ if (Filled) {
+ // Get instruction with delay slot.
+ MachineBasicBlock::instr_iterator DSI = I.getInstrIterator();
+
+ if (InMicroMipsMode && TII->getInstSizeInBytes(*std::next(DSI)) == 2 &&
+ DSI->isCall()) {
+ // If instruction in delay slot is 16b change opcode to
+ // corresponding instruction with short delay slot.
+
+ // TODO: Implement an instruction mapping table of 16bit opcodes to
+ // 32bit opcodes so that an instruction can be expanded. This would
+ // save 16 bits as a TAILCALL_MM pseudo requires a fullsized nop.
+ // TODO: Permit b16 when branching backwards to the same function
+ // if it is in range.
+ DSI->setDesc(TII->get(getEquivalentCallShort(DSI->getOpcode())));
+ }
+ continue;
+ }
+ }
+
+ // For microMIPS if instruction is BEQ or BNE with one ZERO register, then
+ // instead of adding NOP replace this instruction with the corresponding
+ // compact branch instruction, i.e. BEQZC or BNEZC. Additionally
+ // PseudoReturn and PseudoIndirectBranch are expanded to JR_MM, so they can
+ // be replaced with JRC16_MM.
+
+ // For MIPSR6 attempt to produce the corresponding compact (no delay slot)
+ // form of the CTI. For indirect jumps this will not require inserting a
+ // NOP and for branches will hopefully avoid requiring a NOP.
+ if ((InMicroMipsMode ||
+ (STI.hasMips32r6() && MipsCompactBranchPolicy != CB_Never)) &&
+ TII->getEquivalentCompactForm(I)) {
+ I = replaceWithCompactBranch(MBB, I, I->getDebugLoc());
+ continue;
+ }
+
+ // Bundle the NOP to the instruction with the delay slot.
+ BuildMI(MBB, std::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
+ MIBundleBuilder(MBB, I, std::next(I, 2));
+ }
+
+ return Changed;
+}
+
+/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Mips MachineFunctions.
+/// The pass object is allocated with new and returned as a raw pointer.
+FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
+ return new Filler(tm);
+}
+
+/// Walk [Begin, End) looking for the first instruction that may be moved
+/// into the delay slot of Slot. On success the instruction's position is
+/// stored in Filler and true is returned. RegDU and IM accumulate the
+/// register and memory effects of every instruction that is skipped over.
+/// KILL instructions encountered during the walk are erased.
+template<typename IterTy>
+bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+ RegDefsUses &RegDU, InspectMemInstr& IM, Iter Slot,
+ IterTy &Filler) const {
+ for (IterTy I = Begin; I != End;) {
+ // Advance before inspecting CurrI, because CurrI may be erased below.
+ IterTy CurrI = I;
+ ++I;
+
+ // skip debug value
+ if (CurrI->isDebugValue())
+ continue;
+
+ if (terminateSearch(*CurrI))
+ break;
+
+ assert((!CurrI->isCall() && !CurrI->isReturn() && !CurrI->isBranch()) &&
+ "Cannot put calls, returns or branches in delay slot.");
+
+ if (CurrI->isKill()) {
+ CurrI->eraseFromParent();
+ continue;
+ }
+
+ // A rejected candidate has still been recorded in RegDU/IM by this call.
+ if (delayHasHazard(*CurrI, RegDU, IM))
+ continue;
+
+ const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
+ if (STI.isTargetNaCl()) {
+ // In NaCl, instructions that must be masked are forbidden in delay slots.
+ // We only check for loads, stores and SP changes. Calls, returns and
+ // branches are not checked because non-NaCl targets never put them in
+ // delay slots.
+ unsigned AddrIdx;
+ if ((isBasePlusOffsetMemoryAccess(CurrI->getOpcode(), &AddrIdx) &&
+ baseRegNeedsLoadStoreMask(CurrI->getOperand(AddrIdx).getReg())) ||
+ CurrI->modifiesRegister(Mips::SP, STI.getRegisterInfo()))
+ continue;
+ }
+
+ bool InMicroMipsMode = STI.inMicroMipsMode();
+ const MipsInstrInfo *TII = STI.getInstrInfo();
+ unsigned Opcode = (*Slot).getOpcode();
+ // This is complicated by the tail call optimization. For non-PIC code
+ // there is only a 32bit sized unconditional branch which can be assumed
+ // to be able to reach the target. b16 only has a range of +/- 1 KB.
+ // It's entirely possible that the target function is reachable with b16
+ // but we don't have enough information to make that decision.
+ if (InMicroMipsMode && TII->getInstSizeInBytes(*CurrI) == 2 &&
+ (Opcode == Mips::JR || Opcode == Mips::PseudoIndirectBranch ||
+ Opcode == Mips::PseudoReturn || Opcode == Mips::TAILCALL))
+ continue;
+
+ Filler = CurrI;
+ return true;
+ }
+
+ return false;
+}
+
+/// Search MBB backward, starting at the instruction before Slot, for an
+/// instruction that can be moved into Slot's delay slot. On success the
+/// instruction is spliced directly after Slot and bundled with it.
+/// Returns false if the backward search is disabled or nothing was found.
+bool Filler::searchBackward(MachineBasicBlock &MBB, MachineInstr &Slot) const {
+ if (DisableBackwardSearch)
+ return false;
+
+ auto *Fn = MBB.getParent();
+ RegDefsUses RegDU(*Fn->getSubtarget().getRegisterInfo());
+ MemDefsUses MemDU(Fn->getDataLayout(), &Fn->getFrameInfo());
+ ReverseIter Filler;
+
+ // Seed the register sets with the operands of the delayed instruction.
+ RegDU.init(Slot);
+
+ MachineBasicBlock::iterator SlotI = Slot;
+ if (!searchRange(MBB, ++SlotI.getReverse(), MBB.rend(), RegDU, MemDU, Slot,
+ Filler))
+ return false;
+
+ // Move the filler right behind the delayed instruction and bundle the two.
+ MBB.splice(std::next(SlotI), &MBB, Filler.getReverse());
+ MIBundleBuilder(MBB, SlotI, std::next(SlotI, 2));
+ ++UsefulSlots;
+ return true;
+}
+
+/// Search MBB forward, starting after Slot, for an instruction that can be
+/// moved into Slot's delay slot. On success the instruction is spliced
+/// directly after Slot and bundled with it.
+bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
+  // The forward search can handle only calls.
+  if (DisableForwardSearch)
+    return false;
+  if (!Slot->isCall())
+    return false;
+
+  RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo());
+  NoMemInstr NoMem;
+  Iter Filler;
+
+  // Registers the call may clobber count as defined for the search.
+  RegDU.setCallerSaved(*Slot);
+
+  const bool Found = searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NoMem,
+                                 Slot, Filler);
+  if (!Found)
+    return false;
+
+  MBB.splice(std::next(Slot), &MBB, Filler);
+  MIBundleBuilder(MBB, Slot, std::next(Slot, 2));
+  ++UsefulSlots;
+  return true;
+}
+
+/// Search the successor block chosen by selectSuccBB for an instruction that
+/// can be moved into a delay slot. On success a clone of the instruction is
+/// inserted into every predecessor of that successor, the registers it
+/// defines are added to the successor's live-ins, and the original is erased.
+/// Returns false if the search is disabled, no suitable successor exists,
+/// any predecessor cannot accept the instruction, or nothing was found.
+bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
+ if (DisableSuccBBSearch)
+ return false;
+
+ MachineBasicBlock *SuccBB = selectSuccBB(MBB);
+
+ if (!SuccBB)
+ return false;
+
+ RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo());
+ bool HasMultipleSuccs = false;
+ BB2BrMap BrMap;
+ std::unique_ptr<InspectMemInstr> IM;
+ Iter Filler;
+ auto *Fn = MBB.getParent();
+
+ // Iterate over SuccBB's predecessor list.
+ // Every predecessor must be able to take the filler; give up otherwise.
+ for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(),
+ PE = SuccBB->pred_end(); PI != PE; ++PI)
+ if (!examinePred(**PI, *SuccBB, RegDU, HasMultipleSuccs, BrMap))
+ return false;
+
+ // Do not allow moving instructions which have unallocatable register operands
+ // across basic block boundaries.
+ RegDU.setUnallocatableRegs(*Fn);
+
+ // Only allow moving loads from stack or constants if any of the SuccBB's
+ // predecessors have multiple successors.
+ if (HasMultipleSuccs) {
+ IM.reset(new LoadFromStackOrConst());
+ } else {
+ const MachineFrameInfo &MFI = Fn->getFrameInfo();
+ IM.reset(new MemDefsUses(Fn->getDataLayout(), &MFI));
+ }
+
+ if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Slot,
+ Filler))
+ return false;
+
+ insertDelayFiller(Filler, BrMap);
+ addLiveInRegs(Filler, *SuccBB);
+ Filler->eraseFromParent();
+
+ return true;
+}
+
+/// Pick the successor of B with the highest edge probability. Returns
+/// nullptr if B has no successors or if the chosen successor is an EH pad.
+MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
+ if (B.succ_empty())
+ return nullptr;
+
+ // Select the successor with the largest edge weight.
+ auto &Prob = getAnalysis<MachineBranchProbabilityInfo>();
+ MachineBasicBlock *S = *std::max_element(
+ B.succ_begin(), B.succ_end(),
+ [&](const MachineBasicBlock *Dst0, const MachineBasicBlock *Dst1) {
+ return Prob.getEdgeProbability(&B, Dst0) <
+ Prob.getEdgeProbability(&B, Dst1);
+ });
+ return S->isEHPad() ? nullptr : S;
+}
+
+/// Analyze the branches at the end of MBB and return the branch type together
+/// with a branch instruction whose delay slot is still unoccupied.
+/// The instruction is nullptr when analyzeBranch reports BT_None or
+/// BT_NoBranch (the type is passed through unchanged), and the pair is
+/// (BT_None, nullptr) when no branch with a free slot is available.
+std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
+ const MipsInstrInfo *TII =
+ MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
+ MachineBasicBlock *TrueBB = nullptr, *FalseBB = nullptr;
+ SmallVector<MachineInstr*, 2> BranchInstrs;
+ SmallVector<MachineOperand, 2> Cond;
+
+ MipsInstrInfo::BranchType R =
+ TII->analyzeBranch(MBB, TrueBB, FalseBB, Cond, false, BranchInstrs);
+
+ if ((R == MipsInstrInfo::BT_None) || (R == MipsInstrInfo::BT_NoBranch))
+ return std::make_pair(R, nullptr);
+
+ // Any type other than BT_CondUncond: only BranchInstrs[0] is consulted.
+ if (R != MipsInstrInfo::BT_CondUncond) {
+ if (!hasUnoccupiedSlot(BranchInstrs[0]))
+ return std::make_pair(MipsInstrInfo::BT_None, nullptr);
+
+ assert(((R != MipsInstrInfo::BT_Uncond) || (TrueBB == &Dst)));
+
+ return std::make_pair(R, BranchInstrs[0]);
+ }
+
+ // Conditional branch followed by an unconditional one.
+ assert((TrueBB == &Dst) || (FalseBB == &Dst));
+
+ // Examine the conditional branch. See if its slot is occupied.
+ if (hasUnoccupiedSlot(BranchInstrs[0]))
+ return std::make_pair(MipsInstrInfo::BT_Cond, BranchInstrs[0]);
+
+ // If that fails, try the unconditional branch.
+ // It is only usable when Dst is the false (fall-back) destination.
+ if (hasUnoccupiedSlot(BranchInstrs[1]) && (FalseBB == &Dst))
+ return std::make_pair(MipsInstrInfo::BT_Uncond, BranchInstrs[1]);
+
+ return std::make_pair(MipsInstrInfo::BT_None, nullptr);
+}
+
+/// Determine where in Pred a filler taken from Succ could be placed and
+/// record it in BrMap (a branch instruction, or nullptr for "end of block").
+/// Returns false if Pred cannot accept a filler.
+bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                         RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                         BB2BrMap &BrMap) const {
+  const std::pair<MipsInstrInfo::BranchType, MachineInstr *> BrInfo =
+      getBranch(Pred, Succ);
+  const MipsInstrInfo::BranchType Kind = BrInfo.first;
+
+  // Either getBranch could not analyze the branches, or none of them has an
+  // unoccupied slot.
+  if (Kind == MipsInstrInfo::BT_None)
+    return false;
+
+  // Anything other than an unconditional branch or a plain fall-through means
+  // Pred has another successor whose live-ins must be respected.
+  const bool SingleSucc = (Kind == MipsInstrInfo::BT_Uncond) ||
+                          (Kind == MipsInstrInfo::BT_NoBranch);
+  if (!SingleSucc) {
+    HasMultipleSuccs = true;
+    RegDU.addLiveOut(Pred, Succ);
+  }
+
+  BrMap[&Pred] = BrInfo.second;
+  return true;
+}
+
+/// Returns true if Candidate must not be moved into the delay slot. Both the
+/// memory checker and the register sets are always updated with Candidate,
+/// regardless of the outcome of the other checks.
+bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                            InspectMemInstr &IM) const {
+  assert(!Candidate.isKill() &&
+         "KILL instructions should have been eliminated at this point.");
+
+  const bool IsImplicitDef = Candidate.isImplicitDef();
+  const bool MemHazard = IM.hasHazard(Candidate);
+  const bool RegHazard =
+      RegDU.update(Candidate, 0, Candidate.getNumOperands());
+
+  return IsImplicitDef || MemHazard || RegHazard;
+}
+
+/// Returns true if the search for a filler must stop at Candidate:
+/// terminators, calls, position markers, inline assembly and instructions
+/// with unmodeled side effects all end the search.
+bool Filler::terminateSearch(const MachineInstr &Candidate) const {
+  if (Candidate.isTerminator())
+    return true;
+  if (Candidate.isCall())
+    return true;
+  if (Candidate.isPosition())
+    return true;
+  if (Candidate.isInlineAsm())
+    return true;
+  return Candidate.hasUnmodeledSideEffects();
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td
new file mode 100644
index 000000000000..8c3024810d27
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td
@@ -0,0 +1,84 @@
+//===- MipsEVAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips EVA ASE instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+// Common base for the EVA encoding classes in this file: a MipsInst with empty
+// operand lists, an empty assembly string, no patterns, no itinerary and the
+// FrmOther format. Instructions are placed in the "Mips" decoder namespace and
+// are predicated on HasStdEnc.
+class MipsEVAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+ PredicateControl, StdArch {
+ let DecoderNamespace = "Mips";
+ let EncodingPredicates = [HasStdEnc];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Field Values
+//
+//===----------------------------------------------------------------------===//
+
+// Memory Load/Store EVA
+// Each OPCODE6 below is the 6-bit value that the format classes in this file
+// place in Inst{5-0}.
+def OPCODE6_LBE : OPCODE6<0b101100>;
+def OPCODE6_LBuE : OPCODE6<0b101000>;
+def OPCODE6_LHE : OPCODE6<0b101101>;
+def OPCODE6_LHuE : OPCODE6<0b101001>;
+def OPCODE6_LWE : OPCODE6<0b101111>;
+
+def OPCODE6_SBE : OPCODE6<0b011100>;
+def OPCODE6_SHE : OPCODE6<0b011101>;
+def OPCODE6_SWE : OPCODE6<0b011111>;
+
+// load/store left/right EVA
+def OPCODE6_LWLE : OPCODE6<0b011001>;
+def OPCODE6_LWRE : OPCODE6<0b011010>;
+def OPCODE6_SWLE : OPCODE6<0b100001>;
+def OPCODE6_SWRE : OPCODE6<0b100010>;
+
+// Load-linked EVA, Store-conditional EVA
+def OPCODE6_LLE : OPCODE6<0b101110>;
+def OPCODE6_SCE : OPCODE6<0b011110>;
+
+// TLB invalidate (tlbinv / tlbinvf)
+def OPCODE6_TLBINV : OPCODE6<0b000011>;
+def OPCODE6_TLBINVF : OPCODE6<0b000100>;
+
+// Cache / prefetch EVA
+def OPCODE6_CACHEE : OPCODE6<0b011011>;
+def OPCODE6_PREFE : OPCODE6<0b100011>;
+
+// Opcode group placed in Inst{31-26} by TLB_FM.
+def OPGROUP_COP0_TLB : OPGROUP<0b010000>;
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
+// 32-bit encoding shared by the EVA load/store, cache and prefetch
+// instructions:
+//   Inst{31-26} = OPGROUP_SPECIAL3
+//   Inst{25-21} = base   (bits 20-16 of the 21-bit addr operand)
+//   Inst{20-16} = hint
+//   Inst{15-7}  = offset (bits 8-0 of addr, i.e. a 9-bit field)
+//   Inst{6}     = 0
+//   Inst{5-0}   = Operation
+// NOTE(review): the load/store descriptions name their register operand $rt,
+// not $hint; presumably TableGen binds it to the `hint` field positionally --
+// confirm.
+class SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6 Operation> : MipsEVAInst {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = base;
+ let Inst{20-16} = hint;
+ let Inst{15-7} = offset;
+ let Inst{6} = 0;
+ let Inst{5-0} = Operation.Value;
+}
+
+// 32-bit encoding for the operand-less TLB instructions:
+//   Inst{31-26} = OPGROUP_COP0_TLB
+//   Inst{25}    = 1 (CO)
+//   Inst{24-6}  = 0
+//   Inst{5-0}   = Operation
+class TLB_FM<OPCODE6 Operation> : MipsEVAInst {
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP0_TLB.Value;
+ let Inst{25} = 1; // CO
+ let Inst{24-6} = 0;
+ let Inst{5-0} = Operation.Value;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
new file mode 100644
index 000000000000..26df263d228b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
@@ -0,0 +1,209 @@
+//===- MipsEVAInstrInfo.td - EVA ASE instructions ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips EVA ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction encodings
+//
+//===----------------------------------------------------------------------===//
+
+// Memory Load/Store EVA encodings
+// Each *_ENC class pairs one instruction with its format class and OPCODE6
+// value.
+class LBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBE>;
+class LBuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBuE>;
+class LHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHE>;
+class LHuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHuE>;
+class LWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWE>;
+
+class SBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SBE>;
+class SHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SHE>;
+class SWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWE>;
+
+// load/store left/right EVA encodings
+class LWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWLE>;
+class LWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWRE>;
+class SWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWLE>;
+class SWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWRE>;
+
+// Load-linked EVA, Store-conditional EVA encodings
+class LLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LLE>;
+class SCE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SCE>;
+
+// TLB invalidate encodings
+class TLBINV_ENC : TLB_FM<OPCODE6_TLBINV>;
+class TLBINVF_ENC : TLB_FM<OPCODE6_TLBINVF>;
+
+// Cache / prefetch EVA encodings
+class CACHEE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_CACHEE>;
+class PREFE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_PREFE>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction descriptions
+//
+//===----------------------------------------------------------------------===//
+
+// Memory Load/Store EVA descriptions
+// Description shared by the plain EVA loads: one GPR result ($rt), one
+// mem_simm9 address operand, printed as "<mnemonic>\t$rt, $addr". No selection
+// pattern is attached; the instruction is marked mayLoad and canFoldAsLoad and
+// is decoded by DecodeMemEVA.
+class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeMemEVA";
+ bit canFoldAsLoad = 1;
+ bit mayLoad = 1;
+ InstrItinClass Itinerary = itin;
+}
+
+class LBE_DESC : LOAD_EVA_DESC_BASE<"lbe", GPR32Opnd, II_LBE>;
+class LBuE_DESC : LOAD_EVA_DESC_BASE<"lbue", GPR32Opnd, II_LBUE>;
+class LHE_DESC : LOAD_EVA_DESC_BASE<"lhe", GPR32Opnd, II_LHE>;
+class LHuE_DESC : LOAD_EVA_DESC_BASE<"lhue", GPR32Opnd, II_LHUE>;
+class LWE_DESC : LOAD_EVA_DESC_BASE<"lwe", GPR32Opnd, II_LWE>;
+
+// Description shared by the plain EVA stores: no results, a GPR source ($rt)
+// and a mem_simm9 address operand, printed as "<mnemonic>\t$rt, $addr". The
+// instruction is marked mayStore and decoded by DecodeMemEVA.
+// The OpNode parameter is accepted but not used: Pattern is always empty.
+class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ SDPatternOperator OpNode = null_frag,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeMemEVA";
+ bit mayStore = 1;
+ InstrItinClass Itinerary = itin;
+}
+
+class SBE_DESC : STORE_EVA_DESC_BASE<"sbe", GPR32Opnd, null_frag, II_SBE>;
+class SHE_DESC : STORE_EVA_DESC_BASE<"she", GPR32Opnd, null_frag, II_SHE>;
+class SWE_DESC : STORE_EVA_DESC_BASE<"swe", GPR32Opnd, null_frag, II_SWE>;
+
+// Load/Store Left/Right EVA descriptions
+// Description for the partial-word EVA loads. Besides the address, the
+// instruction takes the previous register value as $src, which is tied to the
+// result through the "$src = $rt" constraint; only $rt and $addr are printed.
+// NOTE(review): unlike LOAD_EVA_DESC_BASE, mayLoad is not set explicitly here
+// -- confirm whether that is intentional.
+class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins mem_simm9:$addr, GPROpnd:$src);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeMemEVA";
+ string Constraints = "$src = $rt";
+ bit canFoldAsLoad = 1;
+ InstrItinClass Itinerary = itin;
+}
+
+class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd, II_LWLE>;
+class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd, II_LWRE>;
+
+// Description for partial-word EVA stores: no results, a GPR source ($rt) and
+// a mem_simm9 address operand, printed as "<mnemonic>\t$rt, $addr" and decoded
+// by DecodeMemEVA.
+// NOTE(review): unlike STORE_EVA_DESC_BASE, mayStore is not set here --
+// confirm whether that is intentional.
+class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeMemEVA";
+ InstrItinClass Itinerary = itin;
+}
+
+// swle/swre are stores, so they use the store description: no result operand
+// and no "$src = $rt" tie, just the source register and the address.
+class SWLE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd, II_SWLE>;
+class SWRE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd, II_SWRE>;
+
+// Load-linked EVA, Store-conditional EVA descriptions
+// Load-linked: one GPR result ($rt) and a mem_simm9 address operand; marked
+// mayLoad and decoded by DecodeMemEVA. No selection pattern is attached.
+class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayLoad = 1;
+ string DecoderMethod = "DecodeMemEVA";
+ InstrItinClass Itinerary = itin;
+}
+
+class LLE_DESC : LLE_DESC_BASE<"lle", GPR32Opnd, II_LLE>;
+
+// Store-conditional: the source register $rt is also the result ($dst), as
+// expressed by the "$rt = $dst" constraint. Marked mayStore and decoded by
+// DecodeMemEVA. No selection pattern is attached.
+class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs GPROpnd:$dst);
+ dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayStore = 1;
+ string Constraints = "$rt = $dst";
+ string DecoderMethod = "DecodeMemEVA";
+ InstrItinClass Itinerary = itin;
+}
+
+class SCE_DESC : SCE_DESC_BASE<"sce", GPR32Opnd, II_SCE>;
+
+// Description for the TLB instructions: no operands at all, so the assembly
+// string is just the mnemonic. No selection pattern is attached.
+class TLB_DESC_BASE<string instr_asm, InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins);
+ string AsmString = instr_asm;
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class TLBINV_DESC : TLB_DESC_BASE<"tlbinv", II_TLBINV>;
+class TLBINVF_DESC : TLB_DESC_BASE<"tlbinvf", II_TLBINVF>;
+
+// Description for cachee/prefe: no results, an address operand of type MemOpnd
+// and a 5-bit unsigned $hint. The hint is printed first ("$hint, $addr") even
+// though it is the second input operand. Decoded by
+// DecodeCacheeOp_CacheOpR6.
+class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
+ string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeCacheeOp_CacheOpR6";
+ InstrItinClass Itinerary = itin;
+}
+
+class CACHEE_DESC : CACHEE_DESC_BASE<"cachee", mem_simm9, II_CACHEE>;
+class PREFE_DESC : CACHEE_DESC_BASE<"prefe", mem_simm9, II_PREFE>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction definitions
+//
+//===----------------------------------------------------------------------===//
+
+/// Load and Store EVA Instructions
+// Every instruction is assembled from its *_ENC (bit layout) and *_DESC
+// (operands, asm string, flags) classes. Definitions wrapped in
+// AdditionalPredicates = [NotInMicroMips] carry that extra predicate; the
+// byte/halfword loads and stores and cachee/prefe do not.
+def LBE : LBE_ENC, LBE_DESC, INSN_EVA;
+def LBuE : LBuE_ENC, LBuE_DESC, INSN_EVA;
+def LHE : LHE_ENC, LHE_DESC, INSN_EVA;
+def LHuE : LHuE_ENC, LHuE_DESC, INSN_EVA;
+let AdditionalPredicates = [NotInMicroMips] in {
+def LWE : LWE_ENC, LWE_DESC, INSN_EVA;
+}
+def SBE : SBE_ENC, SBE_DESC, INSN_EVA;
+def SHE : SHE_ENC, SHE_DESC, INSN_EVA;
+let AdditionalPredicates = [NotInMicroMips] in {
+def SWE : SWE_ENC, SWE_DESC, INSN_EVA;
+}
+
+/// load/store left/right EVA
+// These use INSN_EVA_NOT_32R6_64R6 rather than INSN_EVA.
+let AdditionalPredicates = [NotInMicroMips] in {
+def LWLE : LWLE_ENC, LWLE_DESC, INSN_EVA_NOT_32R6_64R6;
+def LWRE : LWRE_ENC, LWRE_DESC, INSN_EVA_NOT_32R6_64R6;
+def SWLE : SWLE_ENC, SWLE_DESC, INSN_EVA_NOT_32R6_64R6;
+def SWRE : SWRE_ENC, SWRE_DESC, INSN_EVA_NOT_32R6_64R6;
+}
+
+/// Load-linked EVA, Store-conditional EVA
+let AdditionalPredicates = [NotInMicroMips] in {
+def LLE : LLE_ENC, LLE_DESC, INSN_EVA;
+def SCE : SCE_ENC, SCE_DESC, INSN_EVA;
+}
+
+/// TLB invalidate
+let AdditionalPredicates = [NotInMicroMips] in {
+ def TLBINV : TLBINV_ENC, TLBINV_DESC, INSN_EVA;
+ def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA;
+}
+
+/// Cache / prefetch EVA
+def CACHEE : CACHEE_ENC, CACHEE_DESC, INSN_EVA;
+def PREFE : PREFE_ENC, PREFE_DESC, INSN_EVA;
diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
new file mode 100644
index 000000000000..29f3e2c07e04
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -0,0 +1,2081 @@
+//===-- MipsFastISel.cpp - Mips FastISel implementation --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the MIPS-specific support for the FastISel class.
+/// Some of the target-specific code is generated by tablegen in the file
+/// MipsGenFastISel.inc, which is #included here.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MipsCCState.h"
+#include "MipsInstrInfo.h"
+#include "MipsISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "mips-fastisel"
+
+using namespace llvm;
+
+namespace {
+
+/// MIPS implementation of FastISel. Target hooks are overridden here; the
+/// tablegen-generated selection code is pulled in at the end of the class.
+class MipsFastISel final : public FastISel {
+
+  /// A memory address expressed as a base (virtual register or frame index),
+  /// a constant byte offset and, optionally, a global value.
+  class Address {
+  public:
+    typedef enum { RegBase, FrameIndexBase } BaseKind;
+
+  private:
+    BaseKind Kind;
+    // Which member is valid is determined by Kind; the accessors assert it.
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+
+    int64_t Offset;
+
+    const GlobalValue *GV;
+
+  public:
+    // Innocuous defaults for our address.
+    Address() : Kind(RegBase), Offset(0), GV(nullptr) { Base.Reg = 0; }
+    void setKind(BaseKind K) { Kind = K; }
+    BaseKind getKind() const { return Kind; }
+    bool isRegBase() const { return Kind == RegBase; }
+    bool isFIBase() const { return Kind == FrameIndexBase; }
+    void setReg(unsigned Reg) {
+      assert(isRegBase() && "Invalid base register access!");
+      Base.Reg = Reg;
+    }
+    unsigned getReg() const {
+      assert(isRegBase() && "Invalid base register access!");
+      return Base.Reg;
+    }
+    void setFI(unsigned FI) {
+      assert(isFIBase() && "Invalid base frame index access!");
+      Base.FI = FI;
+    }
+    unsigned getFI() const {
+      assert(isFIBase() && "Invalid base frame index access!");
+      return Base.FI;
+    }
+
+    void setOffset(int64_t Offset_) { Offset = Offset_; }
+    int64_t getOffset() const { return Offset; }
+    void setGlobalValue(const GlobalValue *G) { GV = G; }
+    const GlobalValue *getGlobalValue() const { return GV; }
+  };
+
+  // NOTE: the declaration order of TM, Subtarget, TII and TLI matters; the
+  // constructor initializes TII and TLI from Subtarget.
+  const TargetMachine &TM;
+  /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const MipsSubtarget *Subtarget;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  MipsFunctionInfo *MFI;
+
+  // Convenience variables to avoid some queries.
+  LLVMContext *Context;
+
+  bool fastLowerArguments() override;
+  bool fastLowerCall(CallLoweringInfo &CLI) override;
+  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
+  // Set when the subtarget uses 64-bit FP registers or soft-float. Allows
+  // fast-isel to proceed and simply not handle floating point, rather than
+  // rejecting fast-isel in other situations.
+  bool UnsupportedFPMode;
+
+private:
+  // Selection routines.
+  bool selectLogicalOp(const Instruction *I);
+  bool selectLoad(const Instruction *I);
+  bool selectStore(const Instruction *I);
+  bool selectBranch(const Instruction *I);
+  bool selectSelect(const Instruction *I);
+  bool selectCmp(const Instruction *I);
+  bool selectFPExt(const Instruction *I);
+  bool selectFPTrunc(const Instruction *I);
+  bool selectFPToInt(const Instruction *I, bool IsSigned);
+  bool selectRet(const Instruction *I);
+  bool selectTrunc(const Instruction *I);
+  bool selectIntExt(const Instruction *I);
+  bool selectShift(const Instruction *I);
+  bool selectDivRem(const Instruction *I, unsigned ISDOpcode);
+
+  // Utility helper routines.
+  bool isTypeLegal(Type *Ty, MVT &VT);
+  bool isTypeSupported(Type *Ty, MVT &VT);
+  bool isLoadTypeLegal(Type *Ty, MVT &VT);
+  bool computeAddress(const Value *Obj, Address &Addr);
+  bool computeCallAddress(const Value *V, Address &Addr);
+  void simplifyAddress(Address &Addr);
+
+  // Emit helper routines.
+  bool emitCmp(unsigned DestReg, const CmpInst *CI);
+  bool emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                unsigned Alignment = 0);
+  bool emitStore(MVT VT, unsigned SrcReg, Address Addr,
+                 MachineMemOperand *MMO = nullptr);
+  bool emitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                 unsigned Alignment = 0);
+  unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+  bool emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg,
+                  bool IsZExt);
+  bool emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg);
+
+  bool emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg);
+  bool emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                       unsigned DestReg);
+  bool emitIntSExt32r2(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                       unsigned DestReg);
+
+  unsigned getRegEnsuringSimpleIntegerWidening(const Value *, bool IsUnsigned);
+
+  unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS,
+                         const Value *RHS);
+
+  unsigned materializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned materializeGV(const GlobalValue *GV, MVT VT);
+  unsigned materializeInt(const Constant *C, MVT VT);
+  unsigned materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC);
+  unsigned materializeExternalCallSym(MCSymbol *Sym);
+
+  // Build an instruction with opcode Opc at the current insertion point.
+  MachineInstrBuilder emitInst(unsigned Opc) {
+    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+  }
+  // As above, with DstReg as the destination register.
+  MachineInstrBuilder emitInst(unsigned Opc, unsigned DstReg) {
+    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                   DstReg);
+  }
+  // Store form: operands are (SrcReg, MemReg, MemOffset).
+  MachineInstrBuilder emitInstStore(unsigned Opc, unsigned SrcReg,
+                                    unsigned MemReg, int64_t MemOffset) {
+    return emitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset);
+  }
+  // Load form: DstReg is defined, operands are (MemReg, MemOffset).
+  MachineInstrBuilder emitInstLoad(unsigned Opc, unsigned DstReg,
+                                   unsigned MemReg, int64_t MemOffset) {
+    return emitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset);
+  }
+
+  unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
+                           const TargetRegisterClass *RC,
+                           unsigned Op0, bool Op0IsKill,
+                           unsigned Op1, bool Op1IsKill);
+
+  // for some reason, this default is not generated by tablegen
+  // so we explicitly generate it here.
+  //
+  unsigned fastEmitInst_riir(uint64_t inst, const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill, uint64_t imm1,
+                             uint64_t imm2, unsigned Op3, bool Op3IsKill) {
+    return 0;
+  }
+
+  // Call handling routines.
+private:
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
+  bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
+                       unsigned &NumBytes);
+  bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
+  const MipsABIInfo &getABI() const {
+    return static_cast<const MipsTargetMachine &>(TM).getABI();
+  }
+
+public:
+  // Backend specific FastISel code.
+  explicit MipsFastISel(FunctionLoweringInfo &funcInfo,
+                        const TargetLibraryInfo *libInfo)
+      : FastISel(funcInfo, libInfo), TM(funcInfo.MF->getTarget()),
+        Subtarget(&funcInfo.MF->getSubtarget<MipsSubtarget>()),
+        TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()) {
+    MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
+    Context = &funcInfo.Fn->getContext();
+    UnsupportedFPMode = Subtarget->isFP64bit() || Subtarget->useSoftFloat();
+  }
+
+  unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+  unsigned fastMaterializeConstant(const Constant *C) override;
+  bool fastSelectInstruction(const Instruction *I) override;
+
+#include "MipsGenFastISel.inc"
+};
+} // end anonymous namespace.
+
+// Forward declaration of CC_Mips, marked unused so that not referencing it in
+// this file does not produce a warning.
+// NOTE(review): presumably these three symbols exist so that the generated
+// MipsGenCallingConv.inc included below compiles in this file -- confirm.
+static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State) LLVM_ATTRIBUTE_UNUSED;
+
+// Stub: aborts via llvm_unreachable if it is ever called.
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ llvm_unreachable("should not be called");
+}
+
+// Stub: aborts via llvm_unreachable if it is ever called.
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ llvm_unreachable("should not be called");
+}
+
+#include "MipsGenCallingConv.inc"
+
+// Returns the argument-assignment function used for calls. The calling
+// convention \p CC is currently ignored: CC_MipsO32 is always returned.
+CCAssignFn *MipsFastISel::CCAssignFnForCall(CallingConv::ID CC) const {
+ return CC_MipsO32;
+}
+
+// Emit a register-register AND/OR/XOR of \p LHS and \p RHS into a fresh GPR32
+// and return that register, or 0 if an operand could not be put in a
+// register. A constant operand is materialized as an i32. \p RetVT is not
+// consulted.
+unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
+                                     const Value *LHS, const Value *RHS) {
+  // Keep a lone immediate on the right-hand side.
+  const bool LHSIsImm = isa<ConstantInt>(LHS);
+  const bool RHSIsImm = isa<ConstantInt>(RHS);
+  if (LHSIsImm && !RHSIsImm)
+    std::swap(LHS, RHS);
+
+  unsigned MachineOpc;
+  switch (ISDOpc) {
+  case ISD::AND: MachineOpc = Mips::AND; break;
+  case ISD::OR:  MachineOpc = Mips::OR;  break;
+  case ISD::XOR: MachineOpc = Mips::XOR; break;
+  default:
+    llvm_unreachable("unexpected opcode");
+  }
+
+  const unsigned LeftReg = getRegForValue(LHS);
+  if (LeftReg == 0)
+    return 0;
+
+  const auto *RHSConst = dyn_cast<ConstantInt>(RHS);
+  const unsigned RightReg =
+      RHSConst ? materializeInt(RHSConst, MVT::i32) : getRegForValue(RHS);
+  if (RightReg == 0)
+    return 0;
+
+  const unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+  if (DestReg == 0)
+    return 0;
+
+  emitInst(MachineOpc, DestReg).addReg(LeftReg).addReg(RightReg);
+  return DestReg;
+}
+
+// Produce the address of a static alloca as LEA_ADDiu <frame index>, 0 in a
+// new GPR32. Returns 0 for allocas that are not in the static alloca map.
+unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 &&
+         "Alloca should always return a pointer.");
+
+  const auto Slot = FuncInfo.StaticAllocaMap.find(AI);
+  if (Slot == FuncInfo.StaticAllocaMap.end())
+    return 0;
+
+  const unsigned AddrReg = createResultReg(&Mips::GPR32RegClass);
+  emitInst(Mips::LEA_ADDiu, AddrReg).addFrameIndex(Slot->second).addImm(0);
+  return AddrReg;
+}
+
+// Materialize the integer constant \p C (which must be a ConstantInt) into a
+// GPR32 using its zero-extended value. Only i1/i8/i16/i32 are handled; any
+// other type yields 0.
+unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) {
+  switch (VT.SimpleTy) {
+  case MVT::i32:
+  case MVT::i16:
+  case MVT::i8:
+  case MVT::i1:
+    break;
+  default:
+    return 0;
+  }
+  const uint64_t ZExtValue = cast<ConstantInt>(C)->getZExtValue();
+  return materialize32BitInt(ZExtValue, &Mips::GPR32RegClass);
+}
+
+// Load the low 32 bits of \p Imm into a new register of class \p RC and return
+// it. The shortest sequence is chosen:
+//   - signed 16-bit value:    ADDiu $zero, Imm
+//   - unsigned 16-bit value:  ORi   $zero, Imm
+//   - low half zero:          LUi   Hi
+//   - otherwise:              LUi   Hi  followed by  ORi Lo
+unsigned MipsFastISel::materialize32BitInt(int64_t Imm,
+                                           const TargetRegisterClass *RC) {
+  const unsigned DestReg = createResultReg(RC);
+
+  if (isInt<16>(Imm)) {
+    emitInst(Mips::ADDiu, DestReg).addReg(Mips::ZERO).addImm(Imm);
+    return DestReg;
+  }
+  if (isUInt<16>(Imm)) {
+    emitInst(Mips::ORi, DestReg).addReg(Mips::ZERO).addImm(Imm);
+    return DestReg;
+  }
+
+  const unsigned LoHalf = Imm & 0xFFFF;
+  const unsigned HiHalf = (Imm >> 16) & 0xFFFF;
+  if (LoHalf == 0) {
+    // Only the upper half carries bits; a single LUi is enough.
+    emitInst(Mips::LUi, DestReg).addImm(HiHalf);
+    return DestReg;
+  }
+
+  // Both halves have nonzero bits: build the upper half in a temporary, then
+  // OR in the lower half.
+  const unsigned UpperReg = createResultReg(RC);
+  emitInst(Mips::LUi, UpperReg).addImm(HiHalf);
+  emitInst(Mips::ORi, DestReg).addReg(UpperReg).addImm(LoHalf);
+  return DestReg;
+}
+
+// Materialize an f32 or f64 constant by building its bit pattern in GPRs and
+// moving it into an FP register (MTC1 for f32, BuildPairF64 for f64). Returns
+// 0 in an unsupported FP mode or for any other type.
+unsigned MipsFastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
+  if (UnsupportedFPMode)
+    return 0;
+
+  const int64_t Bits = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+
+  if (VT == MVT::f32) {
+    const unsigned FPReg = createResultReg(&Mips::FGR32RegClass);
+    const unsigned BitsReg = materialize32BitInt(Bits, &Mips::GPR32RegClass);
+    emitInst(Mips::MTC1, FPReg).addReg(BitsReg);
+    return FPReg;
+  }
+
+  if (VT == MVT::f64) {
+    const unsigned FPReg = createResultReg(&Mips::AFGR64RegClass);
+    // The upper word is materialized first, then the lower word; BuildPairF64
+    // takes them as (low, high).
+    const unsigned HighReg =
+        materialize32BitInt(Bits >> 32, &Mips::GPR32RegClass);
+    const unsigned LowReg =
+        materialize32BitInt(Bits & 0xFFFFFFFF, &Mips::GPR32RegClass);
+    emitInst(Mips::BuildPairF64, FPReg).addReg(LowReg).addReg(HighReg);
+    return FPReg;
+  }
+
+  return 0;
+}
+
+// Materialize the address of \p GV: load its GOT entry relative to the global
+// base register and, for internal-linkage globals (or local-linkage
+// non-functions), add the MO_ABS_LO part with an ADDiu. Only i32 is handled
+// and thread-local variables are rejected; both cases return 0.
+unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32)
+    return 0;
+
+  const TargetRegisterClass *GPR32 = &Mips::GPR32RegClass;
+  const unsigned GotReg = createResultReg(GPR32);
+
+  // TLS not supported at this time.
+  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
+    if (Var->isThreadLocal())
+      return 0;
+
+  emitInst(Mips::LW, GotReg)
+      .addReg(MFI->getGlobalBaseReg())
+      .addGlobalAddress(GV, 0, MipsII::MO_GOT);
+
+  const bool NeedsLoPart =
+      GV->hasInternalLinkage() ||
+      (GV->hasLocalLinkage() && !isa<Function>(GV));
+  if (!NeedsLoPart)
+    return GotReg;
+
+  const unsigned FullAddrReg = createResultReg(GPR32);
+  emitInst(Mips::ADDiu, FullAddrReg)
+      .addReg(GotReg)
+      .addGlobalAddress(GV, 0, MipsII::MO_ABS_LO);
+  return FullAddrReg;
+}
+
+// Load the GOT entry of the external symbol \p Sym (relative to the global
+// base register) into a new GPR32 and return that register.
+unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) {
+  const unsigned AddrReg = createResultReg(&Mips::GPR32RegClass);
+  const unsigned GlobalBase = MFI->getGlobalBaseReg();
+  emitInst(Mips::LW, AddrReg).addReg(GlobalBase).addSym(Sym, MipsII::MO_GOT);
+  return AddrReg;
+}
+
+// Put the constant \p C into a register and return the register number, or
+// zero when the constant cannot be handled. Floating-point constants, global
+// values and integer constants are dispatched to their dedicated helpers.
+unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
+  const EVT ConstVT = TLI.getValueType(DL, C->getType(), true);
+
+  // Extended (non-simple) types are not supported.
+  if (!ConstVT.isSimple())
+    return 0;
+  const MVT VT = ConstVT.getSimpleVT();
+
+  if (const auto *FPConst = dyn_cast<ConstantFP>(C)) {
+    if (UnsupportedFPMode)
+      return 0;
+    return materializeFP(FPConst, VT);
+  }
+  if (const auto *Global = dyn_cast<GlobalValue>(C))
+    return materializeGV(Global, VT);
+  if (isa<ConstantInt>(C))
+    return materializeInt(C, VT);
+
+  return 0;
+}
+
+// Try to describe the address of \p Obj in \p Addr as a base plus a constant
+// offset. Bitcasts are looked through, GEPs whose indices are constants (or
+// adds that canFoldAddIntoGEP accepts) are folded into the offset, and static
+// allocas become frame-index bases. Everything else falls back to putting
+// \p Obj itself in a register. Returns true if \p Addr was filled in.
+bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
+
+ const User *U = nullptr;
+ // UserOp1 serves as "no opcode": it matches none of the cases below.
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ // Don't walk into other basic blocks unless the object is an alloca from
+ // another block, otherwise it may not have a virtual register assigned.
+ // The static_cast only forms a key for the map lookup; the pointer is not
+ // dereferenced as an AllocaInst here.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::BitCast: {
+ // Look through bitcasts.
+ return computeAddress(U->getOperand(0), Addr);
+ }
+ case Instruction::GetElementPtr: {
+ // Keep a copy so Addr can be restored if the base cannot be computed.
+ Address SavedAddr = Addr;
+ int64_t TmpOffset = Addr.getOffset();
+ // Iterate through the GEP folding the constants into offsets where
+ // we can.
+ gep_type_iterator GTI = gep_type_begin(U);
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e;
+ ++i, ++GTI) {
+ const Value *Op = *i;
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ // Struct field: add the field's offset from the struct layout.
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ TmpOffset += SL->getElementOffset(Idx);
+ } else {
+ // Sequential index: scale by the allocation size of the indexed type.
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ TmpOffset += CI->getSExtValue() * S;
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ TmpOffset += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ // Unsupported
+ goto unsupported_gep;
+ }
+ }
+ }
+ // Try to grab the base operand now.
+ Addr.setOffset(TmpOffset);
+ if (computeAddress(U->getOperand(0), Addr))
+ return true;
+ // We failed, restore everything and try the other options.
+ Addr = SavedAddr;
+ unsupported_gep:
+ break;
+ }
+ case Instruction::Alloca: {
+ const AllocaInst *AI = cast<AllocaInst>(Obj);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ Addr.setKind(Address::FrameIndexBase);
+ Addr.setFI(SI->second);
+ return true;
+ }
+ break;
+ }
+ }
+ // Fallback: use the value itself, materialized in a register, as the base.
+ Addr.setReg(getRegForValue(Obj));
+ return Addr.getReg() != 0;
+}
+
+// Resolve the callee \p V into \p Addr. Bitcasts and pointer-sized
+// inttoptr/ptrtoint casts defined in the current block (or appearing as
+// constant expressions) are looked through. A GlobalValue is recorded
+// directly; otherwise, unless a global value is already set in \p Addr, the
+// callee is materialized in a register. Returns true on success.
+bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) {
+  const User *Usr = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+
+  if (const auto *Inst = dyn_cast<Instruction>(V)) {
+    // Folding through an instruction is only valid when it is defined in the
+    // basic block currently being selected.
+    if (Inst->getParent() == FuncInfo.MBB->getBasicBlock()) {
+      Opcode = Inst->getOpcode();
+      Usr = Inst;
+    }
+  } else if (const auto *CE = dyn_cast<ConstantExpr>(V)) {
+    Opcode = CE->getOpcode();
+    Usr = CE;
+  }
+
+  // Bitcasts are always transparent; inttoptr/ptrtoint only when the integer
+  // side has the pointer's value type (i.e. the cast is a no-op).
+  const bool IsTransparentCast =
+      Opcode == Instruction::BitCast ||
+      (Opcode == Instruction::IntToPtr &&
+       TLI.getValueType(DL, Usr->getOperand(0)->getType()) ==
+           TLI.getPointerTy(DL)) ||
+      (Opcode == Instruction::PtrToInt &&
+       TLI.getValueType(DL, Usr->getType()) == TLI.getPointerTy(DL));
+  if (IsTransparentCast)
+    return computeCallAddress(Usr->getOperand(0), Addr);
+
+  if (const auto *Global = dyn_cast<GlobalValue>(V)) {
+    Addr.setGlobalValue(Global);
+    return true;
+  }
+
+  // A global value is already recorded; nothing more can be done here.
+  if (Addr.getGlobalValue())
+    return false;
+
+  // If all else fails, try to materialize the value in a register.
+  Addr.setReg(getRegForValue(V));
+  return Addr.getReg() != 0;
+}
+
+// Map \p Ty to a simple value type in \p VT and report whether the target
+// treats it as legal. \p VT is left untouched when the type is MVT::Other or
+// not a simple type.
+bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  const EVT ValueTy = TLI.getValueType(DL, Ty, true);
+
+  // Only simple types other than MVT::Other are handled.
+  const bool IsUsable = ValueTy != MVT::Other && ValueTy.isSimple();
+  if (!IsUsable)
+    return false;
+
+  VT = ValueTy.getSimpleVT();
+
+  // Legal means a register can hold the value directly.
+  return TLI.isTypeLegal(VT);
+}
+
+// Like isTypeLegal, but rejects vectors outright and additionally accepts
+// i1/i8/i16, which can be sign- or zero-extended to a basic operation.
+bool MipsFastISel::isTypeSupported(Type *Ty, MVT &VT) {
+  if (Ty->isVectorTy())
+    return false;
+
+  if (isTypeLegal(Ty, VT))
+    return true;
+
+  // Not legal as is; small integers are still fine because they can be
+  // extended.
+  return VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16;
+}
+
+// Like isTypeLegal, but additionally accepts i8 and i16, which can be sign- or
+// zero-extended to a basic operation. (To be extended in a later patch.)
+bool MipsFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  const bool Legal = isTypeLegal(Ty, VT);
+  return Legal || VT == MVT::i8 || VT == MVT::i16;
+}
+// Emit code that sets \p ResultReg to the outcome of the comparison \p CI.
+// Both operands are first put in registers via
+// getRegEnsuringSimpleIntegerWidening; returns false if that fails, if the
+// predicate is not handled, or (for FP predicates) if FP is unsupported or the
+// operands are neither float nor double.
+//
+// Because of how EmitCmp is called with fast-isel, you can
+// end up with redundant "andi" instructions after the sequences emitted below.
+// We should try and solve this issue in the future.
+//
+bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
+ const Value *Left = CI->getOperand(0), *Right = CI->getOperand(1);
+ bool IsUnsigned = CI->isUnsigned();
+ unsigned LeftReg = getRegEnsuringSimpleIntegerWidening(Left, IsUnsigned);
+ if (LeftReg == 0)
+ return false;
+ unsigned RightReg = getRegEnsuringSimpleIntegerWidening(Right, IsUnsigned);
+ if (RightReg == 0)
+ return false;
+ CmpInst::Predicate P = CI->getPredicate();
+
+ switch (P) {
+ default:
+ return false;
+ case CmpInst::ICMP_EQ: {
+ // Equal: (Left ^ Right) <u 1.
+ unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg);
+ emitInst(Mips::SLTiu, ResultReg).addReg(TempReg).addImm(1);
+ break;
+ }
+ case CmpInst::ICMP_NE: {
+ // Not equal: 0 <u (Left ^ Right).
+ unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg);
+ emitInst(Mips::SLTu, ResultReg).addReg(Mips::ZERO).addReg(TempReg);
+ break;
+ }
+ case CmpInst::ICMP_UGT: {
+ // Left >u Right is Right <u Left.
+ emitInst(Mips::SLTu, ResultReg).addReg(RightReg).addReg(LeftReg);
+ break;
+ }
+ case CmpInst::ICMP_ULT: {
+ emitInst(Mips::SLTu, ResultReg).addReg(LeftReg).addReg(RightReg);
+ break;
+ }
+ case CmpInst::ICMP_UGE: {
+ // Left >=u Right is !(Left <u Right); XORi with 1 flips the 0/1 result.
+ unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ emitInst(Mips::SLTu, TempReg).addReg(LeftReg).addReg(RightReg);
+ emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+ break;
+ }
+ case CmpInst::ICMP_ULE: {
+ // Left <=u Right is !(Right <u Left).
+ unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ emitInst(Mips::SLTu, TempReg).addReg(RightReg).addReg(LeftReg);
+ emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+ break;
+ }
+ case CmpInst::ICMP_SGT: {
+ // Left >s Right is Right <s Left.
+ emitInst(Mips::SLT, ResultReg).addReg(RightReg).addReg(LeftReg);
+ break;
+ }
+ case CmpInst::ICMP_SLT: {
+ emitInst(Mips::SLT, ResultReg).addReg(LeftReg).addReg(RightReg);
+ break;
+ }
+ case CmpInst::ICMP_SGE: {
+ // Left >=s Right is !(Left <s Right).
+ unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ emitInst(Mips::SLT, TempReg).addReg(LeftReg).addReg(RightReg);
+ emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+ break;
+ }
+ case CmpInst::ICMP_SLE: {
+ // Left <=s Right is !(Right <s Left).
+ unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ emitInst(Mips::SLT, TempReg).addReg(RightReg).addReg(LeftReg);
+ emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+ break;
+ }
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UNE:
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE: {
+ if (UnsupportedFPMode)
+ return false;
+ bool IsFloat = Left->getType()->isFloatTy();
+ bool IsDouble = Left->getType()->isDoubleTy();
+ if (!IsFloat && !IsDouble)
+ return false;
+ // Pick an FP compare (single or double flavour) that sets FCC0, plus the
+ // conditional move that consumes it. UNE, OGT and OGE are expressed as the
+ // complementary compare (EQ, ULE, ULT) combined with MOVF_I; the others
+ // use MOVT_I.
+ unsigned Opc, CondMovOpc;
+ switch (P) {
+ case CmpInst::FCMP_OEQ:
+ Opc = IsFloat ? Mips::C_EQ_S : Mips::C_EQ_D32;
+ CondMovOpc = Mips::MOVT_I;
+ break;
+ case CmpInst::FCMP_UNE:
+ Opc = IsFloat ? Mips::C_EQ_S : Mips::C_EQ_D32;
+ CondMovOpc = Mips::MOVF_I;
+ break;
+ case CmpInst::FCMP_OLT:
+ Opc = IsFloat ? Mips::C_OLT_S : Mips::C_OLT_D32;
+ CondMovOpc = Mips::MOVT_I;
+ break;
+ case CmpInst::FCMP_OLE:
+ Opc = IsFloat ? Mips::C_OLE_S : Mips::C_OLE_D32;
+ CondMovOpc = Mips::MOVT_I;
+ break;
+ case CmpInst::FCMP_OGT:
+ Opc = IsFloat ? Mips::C_ULE_S : Mips::C_ULE_D32;
+ CondMovOpc = Mips::MOVF_I;
+ break;
+ case CmpInst::FCMP_OGE:
+ Opc = IsFloat ? Mips::C_ULT_S : Mips::C_ULT_D32;
+ CondMovOpc = Mips::MOVF_I;
+ break;
+ default:
+ llvm_unreachable("Only switching of a subset of CCs.");
+ }
+ // Materialize the constants 0 and 1, run the compare, then conditionally
+ // move based on FCC0.
+ // NOTE(review): presumably ResultReg receives RegWithOne when the
+ // conditional move fires and RegWithZero otherwise -- confirm against the
+ // MOVT_I/MOVF_I operand definitions.
+ unsigned RegWithZero = createResultReg(&Mips::GPR32RegClass);
+ unsigned RegWithOne = createResultReg(&Mips::GPR32RegClass);
+ emitInst(Mips::ADDiu, RegWithZero).addReg(Mips::ZERO).addImm(0);
+ emitInst(Mips::ADDiu, RegWithOne).addReg(Mips::ZERO).addImm(1);
+ emitInst(Opc).addReg(LeftReg).addReg(RightReg).addReg(
+ Mips::FCC0, RegState::ImplicitDefine);
+ emitInst(CondMovOpc, ResultReg)
+ .addReg(RegWithOne)
+ .addReg(Mips::FCC0)
+ .addReg(RegWithZero);
+ break;
+ }
+ }
+ return true;
+}
+ // Emit a load of type VT from Addr. A new virtual register is created for the
+ // loaded value and handed back through ResultReg. Returns false for types
+ // that are not listed below, for floating point loads when UnsupportedFPMode
+ // is set, and for addresses that are neither register- nor frame-index-based.
+ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                             unsigned Alignment) {
+   //
+   // more cases will be handled here in following patches.
+   //
+   unsigned LoadOpc;
+   const TargetRegisterClass *DestRC;
+   switch (VT.SimpleTy) {
+   case MVT::i32:
+     LoadOpc = Mips::LW;
+     DestRC = &Mips::GPR32RegClass;
+     break;
+   case MVT::i16:
+     LoadOpc = Mips::LHu;
+     DestRC = &Mips::GPR32RegClass;
+     break;
+   case MVT::i8:
+     LoadOpc = Mips::LBu;
+     DestRC = &Mips::GPR32RegClass;
+     break;
+   case MVT::f32:
+     if (UnsupportedFPMode)
+       return false;
+     LoadOpc = Mips::LWC1;
+     DestRC = &Mips::FGR32RegClass;
+     break;
+   case MVT::f64:
+     if (UnsupportedFPMode)
+       return false;
+     LoadOpc = Mips::LDC1;
+     DestRC = &Mips::AFGR64RegClass;
+     break;
+   default:
+     return false;
+   }
+
+   // The destination register is created before the address kind is examined.
+   ResultReg = createResultReg(DestRC);
+
+   if (Addr.isRegBase()) {
+     simplifyAddress(Addr);
+     emitInstLoad(LoadOpc, ResultReg, Addr.getReg(), Addr.getOffset());
+     return true;
+   }
+
+   if (!Addr.isFIBase())
+     return false;
+
+   // Frame-index base: attach a memory operand describing the stack slot.
+   unsigned FrameIdx = Addr.getFI();
+   unsigned SlotAlign = 4;
+   int64_t Disp = Addr.getOffset();
+   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+   MachineMemOperand *MemOp = MF->getMachineMemOperand(
+       MachinePointerInfo::getFixedStack(*MF, FrameIdx),
+       MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIdx), SlotAlign);
+   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LoadOpc),
+           ResultReg)
+       .addFrameIndex(FrameIdx)
+       .addImm(Disp)
+       .addMemOperand(MemOp);
+   return true;
+ }
+
+ // Emit a store of SrcReg, interpreted as type VT, to Addr. Returns false for
+ // types that are not listed below, for floating point stores when
+ // UnsupportedFPMode is set, and for addresses that are neither register- nor
+ // frame-index-based.
+ bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                              unsigned Alignment) {
+   //
+   // more cases will be handled here in following patches.
+   //
+   unsigned StoreOpc;
+   switch (VT.SimpleTy) {
+   case MVT::i8:
+     StoreOpc = Mips::SB;
+     break;
+   case MVT::i16:
+     StoreOpc = Mips::SH;
+     break;
+   case MVT::i32:
+     StoreOpc = Mips::SW;
+     break;
+   case MVT::f32:
+   case MVT::f64:
+     if (UnsupportedFPMode)
+       return false;
+     StoreOpc = (VT.SimpleTy == MVT::f32) ? Mips::SWC1 : Mips::SDC1;
+     break;
+   default:
+     return false;
+   }
+
+   if (Addr.isRegBase()) {
+     simplifyAddress(Addr);
+     emitInstStore(StoreOpc, SrcReg, Addr.getReg(), Addr.getOffset());
+     return true;
+   }
+
+   if (!Addr.isFIBase())
+     return false;
+
+   // Frame-index base: attach a memory operand describing the stack slot.
+   unsigned FrameIdx = Addr.getFI();
+   unsigned SlotAlign = 4;
+   int64_t Disp = Addr.getOffset();
+   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+   MachineMemOperand *MemOp = MF->getMachineMemOperand(
+       MachinePointerInfo::getFixedStack(*MF, FrameIdx),
+       MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIdx),
+       SlotAlign);
+   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(StoreOpc))
+       .addReg(SrcReg)
+       .addFrameIndex(FrameIdx)
+       .addImm(Disp)
+       .addMemOperand(MemOp);
+   return true;
+ }
+
+ // Fast-select an and/or/xor instruction by forwarding both operands to
+ // emitLogicalOp. Any other opcode reaching this function is a caller bug.
+ bool MipsFastISel::selectLogicalOp(const Instruction *I) {
+   MVT VT;
+   if (!isTypeSupported(I->getType(), VT))
+     return false;
+
+   // Translate the IR opcode into the matching ISD opcode.
+   unsigned ISDOpc;
+   switch (I->getOpcode()) {
+   case Instruction::And:
+     ISDOpc = ISD::AND;
+     break;
+   case Instruction::Or:
+     ISDOpc = ISD::OR;
+     break;
+   case Instruction::Xor:
+     ISDOpc = ISD::XOR;
+     break;
+   default:
+     llvm_unreachable("Unexpected instruction.");
+   }
+
+   unsigned ResultReg =
+       emitLogicalOp(ISDOpc, VT, I->getOperand(0), I->getOperand(1));
+   if (ResultReg == 0)
+     return false;
+
+   updateValueMap(I, ResultReg);
+   return true;
+ }
+
+ // Fast-select a non-atomic load whose type and address are both handled.
+ bool MipsFastISel::selectLoad(const Instruction *I) {
+   const LoadInst *Load = cast<LoadInst>(I);
+
+   // Atomic loads need special handling.
+   if (Load->isAtomic())
+     return false;
+
+   // The loaded type must be legal before anything is emitted.
+   MVT VT;
+   if (!isLoadTypeLegal(I->getType(), VT))
+     return false;
+
+   // Give up if the pointer operand cannot be turned into an Address.
+   Address Addr;
+   if (!computeAddress(I->getOperand(0), Addr))
+     return false;
+
+   unsigned LoadedReg;
+   const bool Emitted = emitLoad(VT, LoadedReg, Addr, Load->getAlignment());
+   if (Emitted)
+     updateValueMap(I, LoadedReg);
+   return Emitted;
+ }
+
+ // Fast-select a non-atomic store whose value type and address are both
+ // handled.
+ bool MipsFastISel::selectStore(const Instruction *I) {
+   Value *StoredVal = I->getOperand(0);
+   const StoreInst *Store = cast<StoreInst>(I);
+
+   // Atomic stores need special handling.
+   if (Store->isAtomic())
+     return false;
+
+   // The stored type must be legal before anything is emitted.
+   MVT VT;
+   if (!isLoadTypeLegal(StoredVal->getType(), VT))
+     return false;
+
+   // Materialize the value being stored in a register.
+   unsigned ValueReg = getRegForValue(StoredVal);
+   if (!ValueReg)
+     return false;
+
+   // Give up if the pointer operand cannot be turned into an Address.
+   Address Addr;
+   if (!computeAddress(I->getOperand(1), Addr))
+     return false;
+
+   return emitStore(VT, ValueReg, Addr, Store->getAlignment());
+ }
+
+//
+// This can cause a redundant sltiu to be generated.
+// FIXME: try and eliminate this in a future patch.
+//
+bool MipsFastISel::selectBranch(const Instruction *I) {
+ const BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *BrBB = FuncInfo.MBB;
+ //
+ // TBB is the basic block for the case where the comparison is true.
+ // FBB is the basic block for the case where the comparison is false.
+ // if (cond) goto TBB
+ // goto FBB
+ // TBB:
+ //
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+ BI->getCondition();
+ // For now, just try the simplest case where it's fed by a compare.
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ unsigned CondReg = createResultReg(&Mips::GPR32RegClass);
+ if (!emitCmp(CondReg, CI))
+ return false;
+ BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
+ .addReg(CondReg)
+ .addMBB(TBB);
+ finishCondBranch(BI->getParent(), TBB, FBB);
+ return true;
+ }
+ return false;
+}
+
+ // Fast-select a compare instruction into a new GPR32 result register.
+ bool MipsFastISel::selectCmp(const Instruction *I) {
+   const CmpInst *Cmp = cast<CmpInst>(I);
+   unsigned CmpReg = createResultReg(&Mips::GPR32RegClass);
+   const bool Emitted = emitCmp(CmpReg, Cmp);
+   if (Emitted)
+     updateValueMap(I, CmpReg);
+   return Emitted;
+ }
+
+ // Attempt to fast-select a floating-point extend instruction. Only the
+ // f32 -> f64 case is handled, and only when UnsupportedFPMode is not set.
+ bool MipsFastISel::selectFPExt(const Instruction *I) {
+   if (UnsupportedFPMode)
+     return false;
+
+   Value *Operand = I->getOperand(0);
+   const EVT FromVT = TLI.getValueType(DL, Operand->getType(), true);
+   const EVT ToVT = TLI.getValueType(DL, I->getType(), true);
+   const bool IsF32ToF64 = FromVT == MVT::f32 && ToVT == MVT::f64;
+   if (!IsF32ToF64)
+     return false;
+
+   // this must be a 32bit floating point register class
+   // maybe we should handle this differently
+   unsigned OperandReg = getRegForValue(Operand);
+   if (OperandReg == 0)
+     return false;
+
+   unsigned WideReg = createResultReg(&Mips::AFGR64RegClass);
+   emitInst(Mips::CVT_D32_S, WideReg).addReg(OperandReg);
+   updateValueMap(I, WideReg);
+   return true;
+ }
+
+ // Fast-select a select instruction using a MOVN-family conditional move.
+ // The false value is first copied into a temporary, which is passed as the
+ // final operand of the conditional move together with the true value and the
+ // zero-extended i1 condition.
+ bool MipsFastISel::selectSelect(const Instruction *I) {
+   assert(isa<SelectInst>(I) && "Expected a select instruction.");
+
+   DEBUG(dbgs() << "selectSelect\n");
+
+   MVT VT;
+   if (!isTypeSupported(I->getType(), VT) || UnsupportedFPMode) {
+     DEBUG(dbgs() << ".. .. gave up (!isTypeSupported || UnsupportedFPMode)\n");
+     return false;
+   }
+
+   // Choose the conditional move opcode and the register class of the result.
+   unsigned MoveOpc;
+   const TargetRegisterClass *ValueRC;
+   const bool IsSmallScalarInt =
+       VT.isInteger() && !VT.isVector() && VT.getSizeInBits() <= 32;
+   if (IsSmallScalarInt) {
+     MoveOpc = Mips::MOVN_I_I;
+     ValueRC = &Mips::GPR32RegClass;
+   } else if (VT == MVT::f32) {
+     MoveOpc = Mips::MOVN_I_S;
+     ValueRC = &Mips::FGR32RegClass;
+   } else if (VT == MVT::f64) {
+     MoveOpc = Mips::MOVN_I_D32;
+     ValueRC = &Mips::AFGR64RegClass;
+   } else {
+     return false;
+   }
+
+   // Registers are requested in the order: true value, false value, condition.
+   const SelectInst *Sel = cast<SelectInst>(I);
+   unsigned TrueReg = getRegForValue(Sel->getTrueValue());
+   unsigned FalseReg = getRegForValue(Sel->getFalseValue());
+   unsigned CondReg = getRegForValue(Sel->getCondition());
+   if (TrueReg == 0 || FalseReg == 0 || CondReg == 0)
+     return false;
+
+   // Zero-extend the i1 condition to 32 bits.
+   unsigned WideCondReg = createResultReg(&Mips::GPR32RegClass);
+   if (WideCondReg == 0)
+     return false;
+   if (!emitIntExt(MVT::i1, CondReg, MVT::i32, WideCondReg, true))
+     return false;
+
+   unsigned ResultReg = createResultReg(ValueRC);
+   unsigned FalseCopyReg = createResultReg(ValueRC);
+   if (ResultReg == 0 || FalseCopyReg == 0)
+     return false;
+
+   emitInst(TargetOpcode::COPY, FalseCopyReg).addReg(FalseReg);
+   emitInst(MoveOpc, ResultReg)
+       .addReg(TrueReg)
+       .addReg(WideCondReg)
+       .addReg(FalseCopyReg);
+   updateValueMap(I, ResultReg);
+   return true;
+ }
+
+ // Attempt to fast-select a floating-point truncate instruction. Only the
+ // f64 -> f32 case is handled, and only when UnsupportedFPMode is not set.
+ bool MipsFastISel::selectFPTrunc(const Instruction *I) {
+   if (UnsupportedFPMode)
+     return false;
+
+   Value *Operand = I->getOperand(0);
+   const EVT FromVT = TLI.getValueType(DL, Operand->getType(), true);
+   const EVT ToVT = TLI.getValueType(DL, I->getType(), true);
+   const bool IsF64ToF32 = FromVT == MVT::f64 && ToVT == MVT::f32;
+   if (!IsF64ToF32)
+     return false;
+
+   unsigned OperandReg = getRegForValue(Operand);
+   if (OperandReg == 0)
+     return false;
+
+   unsigned NarrowReg = createResultReg(&Mips::FGR32RegClass);
+   if (NarrowReg == 0)
+     return false;
+
+   emitInst(Mips::CVT_S_D32, NarrowReg).addReg(OperandReg);
+   updateValueMap(I, NarrowReg);
+   return true;
+ }
+
+ // Attempt to fast-select a floating-point-to-integer conversion. Only signed
+ // conversions from f32 or f64 to i32 are handled.
+ bool MipsFastISel::selectFPToInt(const Instruction *I, bool IsSigned) {
+   if (UnsupportedFPMode)
+     return false;
+
+   // We don't handle the unsigned case yet. There is no native instruction for
+   // this but it can be synthesized.
+   if (!IsSigned)
+     return false;
+
+   MVT DstVT;
+   if (!isTypeLegal(I->getType(), DstVT))
+     return false;
+   if (DstVT != MVT::i32)
+     return false;
+
+   Value *Operand = I->getOperand(0);
+   MVT SrcVT;
+   if (!isTypeLegal(Operand->getType(), SrcVT))
+     return false;
+
+   const bool SrcIsF32 = SrcVT == MVT::f32;
+   if (!SrcIsF32 && SrcVT != MVT::f64)
+     return false;
+
+   unsigned OperandReg = getRegForValue(Operand);
+   if (!OperandReg)
+     return false;
+
+   // The truncation is written to an FGR32 temporary, which is then moved to
+   // the GPR32 result with MFC1.
+   unsigned IntReg = createResultReg(&Mips::GPR32RegClass);
+   unsigned FPTempReg = createResultReg(&Mips::FGR32RegClass);
+   unsigned TruncOpc = SrcIsF32 ? Mips::TRUNC_W_S : Mips::TRUNC_W_D32;
+
+   emitInst(TruncOpc, FPTempReg).addReg(OperandReg);
+   emitInst(Mips::MFC1, IntReg).addReg(FPTempReg);
+
+   updateValueMap(I, IntReg);
+   return true;
+ }
+
+ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
+ SmallVectorImpl<MVT> &OutVTs,
+ unsigned &NumBytes) { // Assigns a location to every outgoing argument and copies register arguments into place; NumBytes receives the stack size (at least 16). Returns false when an argument cannot be handled here.
+ CallingConv::ID CC = CLI.CallConv;
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
+ CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
+ // Get a count of how many bytes are to be pushed on the stack.
+ NumBytes = CCInfo.getNextStackOffset();
+ // This is the minimum argument area used for A0-A3.
+ if (NumBytes < 16)
+ NumBytes = 16;
+
+ emitInst(Mips::ADJCALLSTACKDOWN).addImm(16); // NOTE(review): hard-codes 16 instead of using NumBytes - confirm this is intended.
+ // Process the args.
+ MVT firstMVT;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ const Value *ArgVal = CLI.OutVals[VA.getValNo()];
+ MVT ArgVT = OutVTs[VA.getValNo()];
+
+ if (i == 0) { // First location: an f32/f64 argument is redirected to F12/D6.
+ firstMVT = ArgVT;
+ if (ArgVT == MVT::f32) {
+ VA.convertToReg(Mips::F12);
+ } else if (ArgVT == MVT::f64) {
+ VA.convertToReg(Mips::D6);
+ }
+ } else if (i == 1) { // Second location: F14/D7, but only when the first argument was f32/f64.
+ if ((firstMVT == MVT::f32) || (firstMVT == MVT::f64)) {
+ if (ArgVT == MVT::f32) {
+ VA.convertToReg(Mips::F14);
+ } else if (ArgVT == MVT::f64) {
+ VA.convertToReg(Mips::D7);
+ }
+ }
+ }
+ if (((ArgVT == MVT::i32) || (ArgVT == MVT::f32) || (ArgVT == MVT::i16) ||
+ (ArgVT == MVT::i8)) &&
+ VA.isMemLoc()) { // i8/i16/i32/f32 values placed in memory at offsets 0/4/8/12 are moved to A0-A3 instead.
+ switch (VA.getLocMemOffset()) {
+ case 0:
+ VA.convertToReg(Mips::A0);
+ break;
+ case 4:
+ VA.convertToReg(Mips::A1);
+ break;
+ case 8:
+ VA.convertToReg(Mips::A2);
+ break;
+ case 12:
+ VA.convertToReg(Mips::A3);
+ break;
+ default:
+ break;
+ }
+ }
+ unsigned ArgReg = getRegForValue(ArgVal);
+ if (!ArgReg)
+ return false;
+
+ // Handle arg promotion: SExt, ZExt, AExt.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::AExt: // AExt shares the sign-extending path below.
+ case CCValAssign::SExt: {
+ MVT DestVT = VA.getLocVT();
+ MVT SrcVT = ArgVT;
+ ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false);
+ if (!ArgReg)
+ return false;
+ break;
+ }
+ case CCValAssign::ZExt: {
+ MVT DestVT = VA.getLocVT();
+ MVT SrcVT = ArgVT;
+ ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true);
+ if (!ArgReg)
+ return false;
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown arg promotion!");
+ }
+
+ // Now copy/store arg to correct locations.
+ if (VA.isRegLoc() && !VA.needsCustom()) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+ CLI.OutRegs.push_back(VA.getLocReg()); // Recorded so the caller can add it as an implicit use of the call.
+ } else if (VA.needsCustom()) {
+ llvm_unreachable("Mips does not use custom args.");
+ return false;
+ } else {
+ //
+ // FIXME: This path will currently return false. It was copied
+ // from the AArch64 port and should be essentially fine for Mips too.
+ // The work to finish up this path will be done in a follow-on patch.
+ //
+ assert(VA.isMemLoc() && "Assuming store on stack.");
+ // Don't emit stores for undef values.
+ if (isa<UndefValue>(ArgVal))
+ continue;
+
+ // Need to store on the stack.
+ // FIXME: This alignment is incorrect but this path is disabled
+ // for now (will return false). We need to determine the right alignment
+ // based on the normal alignment for the underlying machine type.
+ //
+ unsigned ArgSize = alignTo(ArgVT.getSizeInBits(), 4); // NOTE(review): computed from a size in bits but compared against 8 below - confirm units before enabling this path.
+
+ unsigned BEAlign = 0;
+ if (ArgSize < 8 && !Subtarget->isLittle())
+ BEAlign = 8 - ArgSize;
+
+ Address Addr;
+ Addr.setKind(Address::RegBase);
+ Addr.setReg(Mips::SP);
+ Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+ unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
+ MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+ (void)(MMO); // MMO is otherwise unused while the emitStore call below stays commented out.
+ // if (!emitStore(ArgVT, ArgReg, Addr, MMO))
+ return false; // can't store on the stack yet.
+ }
+ }
+
+ return true;
+ }
+
+ // Finish a call sequence: emit ADJCALLSTACKUP and, for non-void calls, copy
+ // the single return value out of its physical register into a new virtual
+ // register recorded in CLI. Returns false when the return value does not map
+ // to exactly one location or no result register can be created.
+ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
+                               unsigned NumBytes) {
+   CallingConv::ID CC = CLI.CallConv;
+   emitInst(Mips::ADJCALLSTACKUP).addImm(16).addImm(0);
+
+   // Nothing more to do for a void call.
+   if (RetVT == MVT::isVoid)
+     return true;
+
+   SmallVector<CCValAssign, 16> RVLocs;
+   CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+   CCInfo.AnalyzeCallResult(RetVT, RetCC_Mips);
+
+   // Only handle a single return value.
+   if (RVLocs.size() != 1)
+     return false;
+
+   // i1/i8/i16 results are copied as i32; everything else uses the value type
+   // of the assigned location.
+   MVT CopyVT = RVLocs[0].getValVT();
+   const bool IsNarrowInt =
+       RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16;
+   if (IsNarrowInt)
+     CopyVT = MVT::i32;
+
+   unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+   if (ResultReg == 0)
+     return false;
+
+   // Copy the result out of its physical register.
+   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+           TII.get(TargetOpcode::COPY), ResultReg)
+       .addReg(RVLocs[0].getLocReg());
+   CLI.InRegs.push_back(RVLocs[0].getLocReg());
+
+   CLI.ResultReg = ResultReg;
+   CLI.NumResultRegs = 1;
+   return true;
+ }
+
+ bool MipsFastISel::fastLowerArguments() { // Maps every formal argument to an argument register and copies it into a virtual register; returns false for anything beyond simple register-passed scalars.
+ DEBUG(dbgs() << "fastLowerArguments\n");
+
+ if (!FuncInfo.CanLowerReturn) {
+ DEBUG(dbgs() << ".. gave up (!CanLowerReturn)\n");
+ return false;
+ }
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg()) {
+ DEBUG(dbgs() << ".. gave up (varargs)\n");
+ return false;
+ }
+
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC != CallingConv::C) {
+ DEBUG(dbgs() << ".. gave up (calling convention is not C)\n");
+ return false;
+ }
+
+ const ArrayRef<MCPhysReg> GPR32ArgRegs = {Mips::A0, Mips::A1, Mips::A2,
+ Mips::A3};
+ const ArrayRef<MCPhysReg> FGR32ArgRegs = {Mips::F12, Mips::F14};
+ const ArrayRef<MCPhysReg> AFGR64ArgRegs = {Mips::D6, Mips::D7};
+ ArrayRef<MCPhysReg>::iterator NextGPR32 = GPR32ArgRegs.begin(); // Each iterator points at the next unused register of its kind.
+ ArrayRef<MCPhysReg>::iterator NextFGR32 = FGR32ArgRegs.begin();
+ ArrayRef<MCPhysReg>::iterator NextAFGR64 = AFGR64ArgRegs.begin();
+
+ struct AllocatedReg { // Pairs the physical register chosen for an argument with its register class.
+ const TargetRegisterClass *RC;
+ unsigned Reg;
+ AllocatedReg(const TargetRegisterClass *RC, unsigned Reg)
+ : RC(RC), Reg(Reg) {}
+ };
+
+ // Only handle simple cases. i.e. All arguments are directly mapped to
+ // registers of the appropriate type.
+ SmallVector<AllocatedReg, 4> Allocation;
+ unsigned Idx = 1; // Attribute queries below use a 1-based argument index.
+ for (const auto &FormalArg : F->args()) {
+ if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::ByVal)) {
+ DEBUG(dbgs() << ".. gave up (inreg, structret, byval)\n");
+ return false;
+ }
+
+ Type *ArgTy = FormalArg.getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) {
+ DEBUG(dbgs() << ".. gave up (struct, array, or vector)\n");
+ return false;
+ }
+
+ EVT ArgVT = TLI.getValueType(DL, ArgTy);
+ DEBUG(dbgs() << ".. " << (Idx - 1) << ": " << ArgVT.getEVTString() << "\n");
+ if (!ArgVT.isSimple()) {
+ DEBUG(dbgs() << ".. .. gave up (not a simple type)\n");
+ return false;
+ }
+
+ switch (ArgVT.getSimpleVT().SimpleTy) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ if (!F->getAttributes().hasAttribute(Idx, Attribute::SExt) &&
+ !F->getAttributes().hasAttribute(Idx, Attribute::ZExt)) {
+ // It must be any extend, this shouldn't happen for clang-generated IR
+ // so just fall back on SelectionDAG.
+ DEBUG(dbgs() << ".. .. gave up (i8/i16 arg is not extended)\n");
+ return false;
+ }
+
+ if (NextGPR32 == GPR32ArgRegs.end()) {
+ DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
+ Allocation.emplace_back(&Mips::GPR32RegClass, *NextGPR32++);
+
+ // Allocating any GPR32 prohibits further use of floating point arguments.
+ NextFGR32 = FGR32ArgRegs.end();
+ NextAFGR64 = AFGR64ArgRegs.end();
+ break;
+
+ case MVT::i32:
+ if (F->getAttributes().hasAttribute(Idx, Attribute::ZExt)) {
+ // The O32 ABI does not permit a zero-extended i32.
+ DEBUG(dbgs() << ".. .. gave up (i32 arg is zero extended)\n");
+ return false;
+ }
+
+ if (NextGPR32 == GPR32ArgRegs.end()) {
+ DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
+ Allocation.emplace_back(&Mips::GPR32RegClass, *NextGPR32++);
+
+ // Allocating any GPR32 prohibits further use of floating point arguments.
+ NextFGR32 = FGR32ArgRegs.end();
+ NextAFGR64 = AFGR64ArgRegs.end();
+ break;
+
+ case MVT::f32:
+ if (UnsupportedFPMode) {
+ DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
+ return false;
+ }
+ if (NextFGR32 == FGR32ArgRegs.end()) {
+ DEBUG(dbgs() << ".. .. gave up (ran out of FGR32 arguments)\n");
+ return false;
+ }
+ DEBUG(dbgs() << ".. .. FGR32(" << *NextFGR32 << ")\n");
+ Allocation.emplace_back(&Mips::FGR32RegClass, *NextFGR32++);
+ // Allocating an FGR32 also allocates the super-register AFGR64, and
+ // ABI rules require us to skip the corresponding GPR32.
+ if (NextGPR32 != GPR32ArgRegs.end())
+ NextGPR32++;
+ if (NextAFGR64 != AFGR64ArgRegs.end())
+ NextAFGR64++;
+ break;
+
+ case MVT::f64:
+ if (UnsupportedFPMode) {
+ DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
+ return false;
+ }
+ if (NextAFGR64 == AFGR64ArgRegs.end()) {
+ DEBUG(dbgs() << ".. .. gave up (ran out of AFGR64 arguments)\n");
+ return false;
+ }
+ DEBUG(dbgs() << ".. .. AFGR64(" << *NextAFGR64 << ")\n");
+ Allocation.emplace_back(&Mips::AFGR64RegClass, *NextAFGR64++);
+ // Allocating an AFGR64 also uses up one FGR32 slot, and
+ // ABI rules require us to skip the corresponding GPR32 pair.
+ if (NextGPR32 != GPR32ArgRegs.end())
+ NextGPR32++;
+ if (NextGPR32 != GPR32ArgRegs.end())
+ NextGPR32++;
+ if (NextFGR32 != FGR32ArgRegs.end())
+ NextFGR32++;
+ break;
+
+ default:
+ DEBUG(dbgs() << ".. .. gave up (unknown type)\n");
+ return false;
+ }
+
+ ++Idx;
+ }
+
+ Idx = 0; // Reused from here on as a 0-based index into Allocation.
+ for (const auto &FormalArg : F->args()) {
+ unsigned SrcReg = Allocation[Idx].Reg;
+ unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, Allocation[Idx].RC);
+ // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+ // Without this, EmitLiveInCopies may eliminate the livein if its only
+ // use is a bitcast (which isn't turned into an instruction).
+ unsigned ResultReg = createResultReg(Allocation[Idx].RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(DstReg, getKillRegState(true));
+ updateValueMap(&FormalArg, ResultReg);
+ ++Idx;
+ }
+
+ // Calculate the size of the incoming arguments area.
+ // We currently reject all the cases where this would be non-zero.
+ unsigned IncomingArgSizeInBytes = 0;
+
+ // Account for the reserved argument area on ABI's that have one (O32).
+ // It seems strange to do this on the caller side but it's necessary in
+ // SelectionDAG's implementation.
+ IncomingArgSizeInBytes = std::min(getABI().GetCalleeAllocdArgSizeInBytes(CC),
+ IncomingArgSizeInBytes); // NOTE(review): std::min against an initial value of 0 always yields 0 - confirm whether std::max was intended.
+
+ MF->getInfo<MipsFunctionInfo>()->setFormalArgInfo(IncomingArgSizeInBytes,
+ false);
+
+ return true;
+ }
+
+ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) { // Lowers a simple call: arguments are placed via processCallArgs, the callee address is copied into T9 and called with JALR, then finishCall collects the result.
+ CallingConv::ID CC = CLI.CallConv;
+ bool IsTailCall = CLI.IsTailCall;
+ bool IsVarArg = CLI.IsVarArg;
+ const Value *Callee = CLI.Callee;
+ MCSymbol *Symbol = CLI.Symbol;
+
+ // Do not handle FastCC.
+ if (CC == CallingConv::Fast)
+ return false;
+
+ // Allow SelectionDAG isel to handle tail calls.
+ if (IsTailCall)
+ return false;
+
+ // Let SDISel handle vararg functions.
+ if (IsVarArg)
+ return false;
+
+ // FIXME: Only handle *simple* calls for now.
+ MVT RetVT;
+ if (CLI.RetTy->isVoidTy())
+ RetVT = MVT::isVoid;
+ else if (!isTypeSupported(CLI.RetTy, RetVT))
+ return false;
+
+ for (auto Flag : CLI.OutFlags)
+ if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal())
+ return false;
+
+ // Set up the argument vectors.
+ SmallVector<MVT, 16> OutVTs;
+ OutVTs.reserve(CLI.OutVals.size());
+
+ for (auto *Val : CLI.OutVals) {
+ MVT VT;
+ if (!isTypeLegal(Val->getType(), VT) &&
+ !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) // i1/i8/i16 are let through even when isTypeLegal rejects them.
+ return false;
+
+ // We don't handle vector parameters yet.
+ if (VT.isVector() || VT.getSizeInBits() > 64)
+ return false;
+
+ OutVTs.push_back(VT);
+ }
+
+ Address Addr;
+ if (!computeCallAddress(Callee, Addr))
+ return false;
+
+ // Handle the arguments now that we've gotten them.
+ unsigned NumBytes;
+ if (!processCallArgs(CLI, OutVTs, NumBytes))
+ return false;
+
+ if (!Addr.getGlobalValue()) // NOTE(review): this check runs only after processCallArgs has already emitted instructions.
+ return false;
+
+ // Issue the call.
+ unsigned DestAddress;
+ if (Symbol)
+ DestAddress = materializeExternalCallSym(Symbol);
+ else
+ DestAddress = materializeGV(Addr.getGlobalValue(), MVT::i32);
+ emitInst(TargetOpcode::COPY, Mips::T9).addReg(DestAddress); // The callee address is passed to JALR through T9.
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::JALR),
+ Mips::RA).addReg(Mips::T9);
+
+ // Add implicit physical register uses to the call.
+ for (auto Reg : CLI.OutRegs)
+ MIB.addReg(Reg, RegState::Implicit);
+
+ // Add a register mask with the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+ CLI.Call = MIB;
+
+ // Finish off the call including any return values.
+ return finishCall(CLI, RetVT, NumBytes);
+ }
+
+ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // Handles bswap (i16/i32) inline and forwards non-volatile memcpy/memmove/memset with a 32-bit length to library calls; returns false for everything else.
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::bswap: {
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeSupported(RetTy, VT))
+ return false;
+
+ unsigned SrcReg = getRegForValue(II->getOperand(0));
+ if (SrcReg == 0)
+ return false;
+ unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+ if (DestReg == 0)
+ return false;
+ if (VT == MVT::i16) {
+ if (Subtarget->hasMips32r2()) { // With MIPS32r2 a single WSBH is emitted.
+ emitInst(Mips::WSBH, DestReg).addReg(SrcReg);
+ updateValueMap(II, DestReg);
+ return true;
+ } else {
+ unsigned TempReg[3];
+ for (int i = 0; i < 3; i++) {
+ TempReg[i] = createResultReg(&Mips::GPR32RegClass);
+ if (TempReg[i] == 0)
+ return false;
+ }
+ emitInst(Mips::SLL, TempReg[0]).addReg(SrcReg).addImm(8); // Computes ((x << 8) | (x >> 8)) & 0xFFFF.
+ emitInst(Mips::SRL, TempReg[1]).addReg(SrcReg).addImm(8);
+ emitInst(Mips::OR, TempReg[2]).addReg(TempReg[0]).addReg(TempReg[1]);
+ emitInst(Mips::ANDi, DestReg).addReg(TempReg[2]).addImm(0xFFFF);
+ updateValueMap(II, DestReg);
+ return true;
+ }
+ } else if (VT == MVT::i32) {
+ if (Subtarget->hasMips32r2()) { // With MIPS32r2: WSBH followed by a rotate right by 16.
+ unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ emitInst(Mips::WSBH, TempReg).addReg(SrcReg);
+ emitInst(Mips::ROTR, DestReg).addReg(TempReg).addImm(16);
+ updateValueMap(II, DestReg);
+ return true;
+ } else {
+ unsigned TempReg[8];
+ for (int i = 0; i < 8; i++) {
+ TempReg[i] = createResultReg(&Mips::GPR32RegClass);
+ if (TempReg[i] == 0)
+ return false;
+ }
+
+ emitInst(Mips::SRL, TempReg[0]).addReg(SrcReg).addImm(8); // TempReg[3] = (x >> 24) | ((x >> 8) & 0xFF00).
+ emitInst(Mips::SRL, TempReg[1]).addReg(SrcReg).addImm(24);
+ emitInst(Mips::ANDi, TempReg[2]).addReg(TempReg[0]).addImm(0xFF00);
+ emitInst(Mips::OR, TempReg[3]).addReg(TempReg[1]).addReg(TempReg[2]);
+
+ emitInst(Mips::ANDi, TempReg[4]).addReg(SrcReg).addImm(0xFF00); // TempReg[5] = (x & 0xFF00) << 8.
+ emitInst(Mips::SLL, TempReg[5]).addReg(TempReg[4]).addImm(8);
+
+ emitInst(Mips::SLL, TempReg[6]).addReg(SrcReg).addImm(24); // Result = (x << 24) | TempReg[3] | TempReg[5].
+ emitInst(Mips::OR, TempReg[7]).addReg(TempReg[3]).addReg(TempReg[5]);
+ emitInst(Mips::OR, DestReg).addReg(TempReg[6]).addReg(TempReg[7]);
+ updateValueMap(II, DestReg);
+ return true;
+ }
+ }
+ return false;
+ }
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove: {
+ const auto *MTI = cast<MemTransferInst>(II);
+ // Don't handle volatile.
+ if (MTI->isVolatile())
+ return false;
+ if (!MTI->getLength()->getType()->isIntegerTy(32))
+ return false;
+ const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
+ return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2); // Presumably drops the intrinsic's last two operands from the library call - TODO confirm against lowerCallTo.
+ }
+ case Intrinsic::memset: {
+ const MemSetInst *MSI = cast<MemSetInst>(II);
+ // Don't handle volatile.
+ if (MSI->isVolatile())
+ return false;
+ if (!MSI->getLength()->getType()->isIntegerTy(32))
+ return false;
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ }
+ }
+ return false;
+ }
+
+ // Fast-select a return instruction. A void return emits RetRA directly. A
+ // value return is handled only when it maps to exactly one register location;
+ // the value is copied into that register, which is then added to RetRA as an
+ // implicit use. Returns false for every case that is not handled here.
+ bool MipsFastISel::selectRet(const Instruction *I) {
+   const Function &F = *I->getParent()->getParent();
+   const ReturnInst *Ret = cast<ReturnInst>(I);
+
+   DEBUG(dbgs() << "selectRet\n");
+
+   if (!FuncInfo.CanLowerReturn)
+     return false;
+
+   // Build a list of return value registers.
+   SmallVector<unsigned, 4> RetRegs;
+
+   if (Ret->getNumOperands() > 0) {
+     CallingConv::ID CC = F.getCallingConv();
+
+     // Do not handle FastCC.
+     if (CC == CallingConv::Fast)
+       return false;
+
+     SmallVector<ISD::OutputArg, 4> Outs;
+     GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+     // Analyze the returned value, assigning a location to each part of it.
+     SmallVector<CCValAssign, 16> ValLocs;
+     MipsCCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs,
+                        I->getContext());
+     CCAssignFn *RetCC = RetCC_Mips;
+     CCInfo.AnalyzeReturn(Outs, RetCC);
+
+     // Only handle a single return value for now.
+     if (ValLocs.size() != 1)
+       return false;
+
+     CCValAssign &VA = ValLocs[0];
+     const Value *RV = Ret->getOperand(0);
+
+     // Don't bother handling odd stuff for now.
+     if ((VA.getLocInfo() != CCValAssign::Full) &&
+         (VA.getLocInfo() != CCValAssign::BCvt))
+       return false;
+
+     // Only handle register returns for now.
+     if (!VA.isRegLoc())
+       return false;
+
+     unsigned Reg = getRegForValue(RV);
+     if (Reg == 0)
+       return false;
+
+     unsigned SrcReg = Reg + VA.getValNo();
+     unsigned DestReg = VA.getLocReg();
+     // Avoid a cross-class copy. This is very unlikely.
+     if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+       return false;
+
+     EVT RVEVT = TLI.getValueType(DL, RV->getType());
+     if (!RVEVT.isSimple())
+       return false;
+
+     if (RVEVT.isVector())
+       return false;
+
+     MVT RVVT = RVEVT.getSimpleVT();
+     if (RVVT == MVT::f128)
+       return false;
+
+     // Do not handle FGR64 returns for now.
+     if (RVVT == MVT::f64 && UnsupportedFPMode) {
+       DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
+       return false;
+     }
+
+     MVT DestVT = VA.getValVT();
+     // Special handling for extended integers: only i1/i8/i16 may differ from
+     // the location's value type, and they are extended only when the return
+     // carries a zeroext/signext flag.
+     if (RVVT != DestVT) {
+       if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+         return false;
+
+       if (Outs[0].Flags.isZExt() || Outs[0].Flags.isSExt()) {
+         bool IsZExt = Outs[0].Flags.isZExt();
+         SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt);
+         if (SrcReg == 0)
+           return false;
+       }
+     }
+
+     // Make the copy.
+     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+             TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg);
+
+     // Add register to return instruction.
+     RetRegs.push_back(VA.getLocReg());
+   }
+   MachineInstrBuilder MIB = emitInst(Mips::RetRA);
+   for (unsigned RetReg : RetRegs)
+     MIB.addReg(RetReg, RegState::Implicit);
+   return true;
+ }
+
+ // Fast-select an integer truncate from i32/i16/i8 to i16/i8/i1. No
+ // instruction is emitted: the result is mapped onto the source register.
+ bool MipsFastISel::selectTrunc(const Instruction *I) {
+   // The high bits for a type smaller than the register size are assumed to be
+   // undefined.
+   Value *Operand = I->getOperand(0);
+
+   const EVT FromVT = TLI.getValueType(DL, Operand->getType(), true);
+   const EVT ToVT = TLI.getValueType(DL, I->getType(), true);
+
+   const bool FromOk =
+       FromVT == MVT::i32 || FromVT == MVT::i16 || FromVT == MVT::i8;
+   if (!FromOk)
+     return false;
+   const bool ToOk = ToVT == MVT::i16 || ToVT == MVT::i8 || ToVT == MVT::i1;
+   if (!ToOk)
+     return false;
+
+   unsigned OperandReg = getRegForValue(Operand);
+   if (OperandReg == 0)
+     return false;
+
+   // Because the high bits are undefined, a truncate doesn't generate
+   // any code.
+   updateValueMap(I, OperandReg);
+   return true;
+ }
+ // Fast-select a zext or sext instruction. The instruction is treated as a
+ // zero extension exactly when it is a ZExtInst.
+ bool MipsFastISel::selectIntExt(const Instruction *I) {
+   Value *Operand = I->getOperand(0);
+   const bool ZeroExtend = isa<ZExtInst>(I);
+
+   // The source register is requested before the types are inspected.
+   unsigned OperandReg = getRegForValue(Operand);
+   if (OperandReg == 0)
+     return false;
+
+   const EVT FromEVT = TLI.getValueType(DL, Operand->getType(), true);
+   const EVT ToEVT = TLI.getValueType(DL, I->getType(), true);
+   if (!FromEVT.isSimple() || !ToEVT.isSimple())
+     return false;
+
+   unsigned ExtReg = createResultReg(&Mips::GPR32RegClass);
+   const bool Emitted = emitIntExt(FromEVT.getSimpleVT(), OperandReg,
+                                   ToEVT.getSimpleVT(), ExtReg, ZeroExtend);
+   if (Emitted)
+     updateValueMap(I, ExtReg);
+   return Emitted;
+ }
+ // Sign-extend an i8 or i16 in SrcReg into DestReg with a shift-left /
+ // arithmetic-shift-right pair. Returns false for any other source type.
+ bool MipsFastISel::emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                    unsigned DestReg) {
+   // Both shifts use the same amount: 24 for i8, 16 for i16.
+   unsigned Shift;
+   if (SrcVT == MVT::i8)
+     Shift = 24;
+   else if (SrcVT == MVT::i16)
+     Shift = 16;
+   else
+     return false;
+
+   unsigned ShiftedReg = createResultReg(&Mips::GPR32RegClass);
+   emitInst(Mips::SLL, ShiftedReg).addReg(SrcReg).addImm(Shift);
+   emitInst(Mips::SRA, DestReg).addReg(ShiftedReg).addImm(Shift);
+   return true;
+ }
+
+bool MipsFastISel::emitIntSExt32r2(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                   unsigned DestReg) {
+  // Pick the single sign-extension instruction for the source width: SEB for
+  // i8, SEH for i16. Any other source type is rejected.
+  unsigned ExtOpc;
+  if (SrcVT == MVT::i8)
+    ExtOpc = Mips::SEB;
+  else if (SrcVT == MVT::i16)
+    ExtOpc = Mips::SEH;
+  else
+    return false;
+
+  emitInst(ExtOpc, DestReg).addReg(SrcReg);
+  return true;
+}
+
+bool MipsFastISel::emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                               unsigned DestReg) {
+  // Sign extension is only provided towards i32 or i16.
+  const bool DestOk = DestVT == MVT::i32 || DestVT == MVT::i16;
+  if (!DestOk)
+    return false;
+
+  // Use the MIPS32r2 form when the subtarget has it, otherwise the
+  // shift-based r1 sequence.
+  return Subtarget->hasMips32r2()
+             ? emitIntSExt32r2(SrcVT, SrcReg, DestVT, DestReg)
+             : emitIntSExt32r1(SrcVT, SrcReg, DestVT, DestReg);
+}
+
+bool MipsFastISel::emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                               unsigned DestReg) {
+  // Zero extension is an AND with a mask covering exactly the source width.
+  // Source types other than i1, i8 and i16 are rejected.
+  int64_t Mask;
+  if (SrcVT == MVT::i1)
+    Mask = 1;
+  else if (SrcVT == MVT::i8)
+    Mask = 0xff;
+  else if (SrcVT == MVT::i16)
+    Mask = 0xffff;
+  else
+    return false;
+
+  emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(Mask);
+  return true;
+}
+
+// Emit a zero- or sign-extension of SrcReg (of type SrcVT) into DestReg.
+// Returns false, emitting nothing here, when the type combination is not
+// handled.
+bool MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ unsigned DestReg, bool IsZExt) {
+ // FastISel does not have plumbing to deal with extensions where the SrcVT or
+ // DestVT are odd things, so test to make sure that they are both types we can
+ // handle (i1/i8/i16 for SrcVT and i8/i16/i32 for DestVT), otherwise return
+ // false. Note that an i32 source and an i64 destination are both rejected by
+ // the check below.
+ if (((DestVT != MVT::i8) && (DestVT != MVT::i16) && (DestVT != MVT::i32)) ||
+ ((SrcVT != MVT::i1) && (SrcVT != MVT::i8) && (SrcVT != MVT::i16)))
+ return false;
+ if (IsZExt)
+ return emitIntZExt(SrcVT, SrcReg, DestVT, DestReg);
+ return emitIntSExt(SrcVT, SrcReg, DestVT, DestReg);
+}
+
+unsigned MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                  bool isZExt) {
+  // Convenience form: allocates the destination register itself and returns
+  // it, or 0 when the extension could not be emitted.
+  unsigned ExtendedReg = createResultReg(&Mips::GPR32RegClass);
+  if (!emitIntExt(SrcVT, SrcReg, DestVT, ExtendedReg, isZExt))
+    return 0;
+  return ExtendedReg;
+}
+
+bool MipsFastISel::selectDivRem(const Instruction *I, unsigned ISDOpcode) {
+  // Only a simple i32 result type is handled.
+  const EVT ResultEVT = TLI.getValueType(DL, I->getType(), true);
+  if (!ResultEVT.isSimple())
+    return false;
+  if (ResultEVT.getSimpleVT() != MVT::i32)
+    return false;
+
+  // Signed and unsigned operations use different divide instructions; the
+  // quotient/remainder distinction is made later when reading the result.
+  const bool IsSigned = ISDOpcode == ISD::SDIV || ISDOpcode == ISD::SREM;
+  const bool IsUnsigned = ISDOpcode == ISD::UDIV || ISDOpcode == ISD::UREM;
+  if (!IsSigned && !IsUnsigned)
+    return false;
+  const unsigned DivOpc = IsSigned ? Mips::SDIV : Mips::UDIV;
+
+  unsigned DividendReg = getRegForValue(I->getOperand(0));
+  unsigned DivisorReg = getRegForValue(I->getOperand(1));
+  if (!DividendReg || !DivisorReg)
+    return false;
+
+  // Divide, then emit a TEQ comparing the divisor against $zero with code 7.
+  emitInst(DivOpc).addReg(DividendReg).addReg(DivisorReg);
+  emitInst(Mips::TEQ).addReg(DivisorReg).addReg(Mips::ZERO).addImm(7);
+
+  unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+  if (!ResultReg)
+    return false;
+
+  // Remainders are read with MFHI, quotients with MFLO.
+  const bool WantRemainder =
+      ISDOpcode == ISD::SREM || ISDOpcode == ISD::UREM;
+  unsigned MoveFromOpc = Mips::MFLO;
+  if (WantRemainder)
+    MoveFromOpc = Mips::MFHI;
+  emitInst(MoveFromOpc, ResultReg);
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Select shl/lshr/ashr. Uses the immediate shift forms (SLL/SRA/SRL) when the
+// shift amount is a ConstantInt, and the variable forms (SLLV/SRAV/SRLV)
+// otherwise. Returns false when the instruction cannot be handled here.
+bool MipsFastISel::selectShift(const Instruction *I) {
+ MVT RetVT;
+
+ if (!isTypeSupported(I->getType(), RetVT))
+ return false;
+
+ unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+ if (!ResultReg)
+ return false;
+
+ unsigned Opcode = I->getOpcode();
+ const Value *Op0 = I->getOperand(0);
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (!Op0Reg)
+ return false;
+
+ // For right shifts, operand0 is first widened to i32: sign-extended for AShr
+ // and zero-extended for LShr.
+ // NOTE(review): emitIntExt only accepts i1/i8/i16 sources, so a right shift
+ // whose operand is already i32 makes this function return false - confirm
+ // that this is intended.
+ if (Opcode == Instruction::AShr || Opcode == Instruction::LShr) {
+ unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ if (!TempReg)
+ return false;
+
+ MVT Op0MVT = TLI.getValueType(DL, Op0->getType(), true).getSimpleVT();
+ bool IsZExt = Opcode == Instruction::LShr;
+ if (!emitIntExt(Op0MVT, Op0Reg, MVT::i32, TempReg, IsZExt))
+ return false;
+
+ Op0Reg = TempReg;
+ }
+
+ // Constant shift amount: use the immediate instruction forms.
+ if (const auto *C = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint64_t ShiftVal = C->getZExtValue();
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ case Instruction::Shl:
+ Opcode = Mips::SLL;
+ break;
+ case Instruction::AShr:
+ Opcode = Mips::SRA;
+ break;
+ case Instruction::LShr:
+ Opcode = Mips::SRL;
+ break;
+ }
+
+ emitInst(Opcode, ResultReg).addReg(Op0Reg).addImm(ShiftVal);
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ // Variable shift amount: it has to live in a register.
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (!Op1Reg)
+ return false;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ case Instruction::Shl:
+ Opcode = Mips::SLLV;
+ break;
+ case Instruction::AShr:
+ Opcode = Mips::SRAV;
+ break;
+ case Instruction::LShr:
+ Opcode = Mips::SRLV;
+ break;
+ }
+
+ emitInst(Opcode, ResultReg).addReg(Op0Reg).addReg(Op1Reg);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool MipsFastISel::fastSelectInstruction(const Instruction *I) {
+  // Dispatch on the IR opcode. Division and remainder first try the generic
+  // binary-op selection and fall back to selectDivRem only if that fails.
+  switch (I->getOpcode()) {
+  case Instruction::Load:
+    return selectLoad(I);
+  case Instruction::Store:
+    return selectStore(I);
+
+  case Instruction::SDiv:
+    return selectBinaryOp(I, ISD::SDIV) || selectDivRem(I, ISD::SDIV);
+  case Instruction::UDiv:
+    return selectBinaryOp(I, ISD::UDIV) || selectDivRem(I, ISD::UDIV);
+  case Instruction::SRem:
+    return selectBinaryOp(I, ISD::SREM) || selectDivRem(I, ISD::SREM);
+  case Instruction::URem:
+    return selectBinaryOp(I, ISD::UREM) || selectDivRem(I, ISD::UREM);
+
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return selectShift(I);
+
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return selectLogicalOp(I);
+
+  case Instruction::Br:
+    return selectBranch(I);
+  case Instruction::Ret:
+    return selectRet(I);
+
+  case Instruction::Trunc:
+    return selectTrunc(I);
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return selectIntExt(I);
+
+  case Instruction::FPTrunc:
+    return selectFPTrunc(I);
+  case Instruction::FPExt:
+    return selectFPExt(I);
+  case Instruction::FPToSI:
+    return selectFPToInt(I, /*isSigned*/ true);
+  case Instruction::FPToUI:
+    return selectFPToInt(I, /*isSigned*/ false);
+
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return selectCmp(I);
+  case Instruction::Select:
+    return selectSelect(I);
+
+  default:
+    // Opcode not handled by this selector.
+    return false;
+  }
+}
+
+unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V,
+                                                           bool IsUnsigned) {
+  // Returns a register holding V. i8 and i16 values are widened to i32 first
+  // (zero-extended when IsUnsigned, sign-extended otherwise); every other type
+  // is returned as-is. 0 signals failure.
+  unsigned ValueReg = getRegForValue(V);
+  if (!ValueReg)
+    return 0;
+
+  MVT ValueVT = TLI.getValueType(DL, V->getType(), true).getSimpleVT();
+  const bool NeedsWidening = ValueVT == MVT::i8 || ValueVT == MVT::i16;
+  if (!NeedsWidening)
+    return ValueReg;
+
+  unsigned WidenedReg = createResultReg(&Mips::GPR32RegClass);
+  if (!emitIntExt(ValueVT, ValueReg, MVT::i32, WidenedReg, IsUnsigned))
+    return 0;
+  return WidenedReg;
+}
+
+void MipsFastISel::simplifyAddress(Address &Addr) {
+  // Offsets that fit in a signed 16-bit immediate need no rewriting.
+  if (isInt<16>(Addr.getOffset()))
+    return;
+
+  // Otherwise materialize the offset, add it to the base register and use the
+  // sum as the new base with a zero offset.
+  unsigned OffsetReg =
+      materialize32BitInt(Addr.getOffset(), &Mips::GPR32RegClass);
+  unsigned SumReg = createResultReg(&Mips::GPR32RegClass);
+  emitInst(Mips::ADDu, SumReg).addReg(OffsetReg).addReg(Addr.getReg());
+  Addr.setReg(SumReg);
+  Addr.setOffset(0);
+}
+
+unsigned MipsFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Op0, bool Op0IsKill,
+                                       unsigned Op1, bool Op1IsKill) {
+  // Everything except MUL is handled by the generic implementation.
+  if (MachineInstOpcode != Mips::MUL)
+    return FastISel::fastEmitInst_rr(MachineInstOpcode, RC, Op0, Op0IsKill, Op1,
+                                     Op1IsKill);
+
+  // MUL clobbers HI0 and LO0, and its TableGen definition can only mark them
+  // as implicitly defined. When another instruction defining the same
+  // registers follows, the register allocator runs out of registers. Adding
+  // the two registers explicitly as dead implicit defs avoids that.
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &Desc = TII.get(MachineInstOpcode);
+  const unsigned FirstUseIdx = Desc.getNumDefs();
+  Op0 = constrainOperandRegClass(Desc, Op0, FirstUseIdx);
+  Op1 = constrainOperandRegClass(Desc, Op1, FirstUseIdx + 1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc, ResultReg)
+      .addReg(Op0, getKillRegState(Op0IsKill))
+      .addReg(Op1, getKillRegState(Op1IsKill))
+      .addReg(Mips::HI0, RegState::ImplicitDefine | RegState::Dead)
+      .addReg(Mips::LO0, RegState::ImplicitDefine | RegState::Dead);
+  return ResultReg;
+}
+
+namespace llvm {
+// Factory for the Mips FastISel implementation: returns a newly allocated
+// MipsFastISel built from the given function lowering info and library info.
+// NOTE(review): the object is created with plain `new`; ownership presumably
+// passes to the caller - confirm against the call site.
+FastISel *Mips::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) {
+ return new MipsFastISel(funcInfo, libInfo);
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
new file mode 100644
index 000000000000..b2cf03976f81
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
@@ -0,0 +1,159 @@
+//===-- MipsFrameLowering.cpp - Mips Frame Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsFrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+//
+// Stack Frame Processing methods
+// +----------------------------+
+//
+// The stack is allocated by decrementing the stack pointer on
+// the first instruction of a function prologue. Once decremented,
+// all stack references are done through a positive offset
+// from the stack/frame pointer, so the stack is considered
+// to grow up! Otherwise terrible hacks would have to be made
+// to get this stack ABI compliant :)
+//
+// The stack frame required by the ABI (after call):
+// Offset
+//
+// 0 ----------
+// 4 Args to pass
+// . saved $GP (used in PIC)
+// . Alloca allocations
+// . Local Area
+// . CPU "Callee Saved" Registers
+// . saved FP
+// . saved RA
+// . FPU "Callee Saved" Registers
+// StackSize -----------
+//
+// Offset - offset from sp after stack allocation on function prologue
+//
+// The sp is the stack pointer subtracted/added from the stack size
+// at the Prologue/Epilogue
+//
+// References to the previous stack (to obtain arguments) are done
+// with offsets that exceed the stack size: (stacksize+(4*(num_arg-1)))
+//
+// Examples:
+// - reference to the actual stack frame
+// for any local area var there is something like: FI >= 0, StackOffset: 4
+// sw REGX, 4(SP)
+//
+// - reference to previous stack frame
+// suppose there's a load of the 5th argument: FI < 0, StackOffset: 16.
+// The emitted instruction will be something like:
+// lw REGX, 16+StackSize(SP)
+//
+// Since the total stack size is unknown on LowerFormalArguments, all
+// stack references (ObjectOffset) created to reference the function
+// arguments, are negative numbers. This way, on eliminateFrameIndex it's
+// possible to detect those references and the offsets are adjusted to
+// their real location.
+//
+//===----------------------------------------------------------------------===//
+
+const MipsFrameLowering *MipsFrameLowering::create(const MipsSubtarget &ST) {
+  // Mips16 mode has its own frame lowering; everything else uses the SE one.
+  return ST.inMips16Mode() ? llvm::createMips16FrameLowering(ST)
+                           : llvm::createMipsSEFrameLowering(ST);
+}
+
+// hasFP - Tell whether the function should get a dedicated frame pointer
+// register. That is the case when frame pointer elimination is disabled, when
+// the function has variable sized allocas, when the frame address is taken,
+// or when the stack needs dynamic realignment.
+bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const TargetRegisterInfo *RegInfo = STI.getRegisterInfo();
+
+  if (MF.getTarget().Options.DisableFramePointerElim(MF))
+    return true;
+  if (FrameInfo.hasVarSizedObjects() || FrameInfo.isFrameAddressTaken())
+    return true;
+  return RegInfo->needsStackRealignment(MF);
+}
+
+bool MipsFrameLowering::hasBP(const MachineFunction &MF) const {
+  // True only when the function both has variable sized objects and needs its
+  // stack realigned.
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const TargetRegisterInfo *RegInfo = STI.getRegisterInfo();
+
+  if (!FrameInfo.hasVarSizedObjects())
+    return false;
+  return RegInfo->needsStackRealignment(MF);
+}
+
+// Estimate the size of the stack frame of MF. The estimate is conservative:
+// it assumes every callee-saved register is spilled and aligns each regular
+// object to the maximum alignment of the frame. The result is rounded up to
+// the stack alignment.
+uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+
+  int64_t Offset = 0;
+
+  // Iterate over fixed sized objects (those with negative indices) and keep
+  // the largest negated offset.
+  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
+    Offset = std::max(Offset, -MFI.getObjectOffset(I));
+
+  // Conservatively assume all callee-saved registers will be saved.
+  for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
+    unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize();
+    Offset = alignTo(Offset + Size, Size);
+  }
+
+  unsigned MaxAlign = MFI.getMaxAlignment();
+
+  // Check that MaxAlign is not zero if there is a stack object that is not a
+  // callee-saved spill.
+  assert(!MFI.getObjectIndexEnd() || MaxAlign);
+
+  // Iterate over other objects. Objects that have been marked dead are
+  // skipped: they are not going to be allocated, and their recorded size is
+  // not a meaningful byte count to add to the estimate.
+  for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
+    if (MFI.isDeadObjectIndex(I))
+      continue;
+    Offset = alignTo(Offset + MFI.getObjectSize(I), MaxAlign);
+  }
+
+  // Call frame.
+  if (MFI.adjustsStack() && hasReservedCallFrame(MF))
+    Offset = alignTo(Offset + MFI.getMaxCallFrameSize(),
+                     std::max(MaxAlign, getStackAlignment()));
+
+  return alignTo(Offset, getStackAlignment());
+}
+
+// Remove an ADJCALLSTACKDOWN / ADJCALLSTACKUP pseudo instruction. When the
+// call frame is not reserved, the pseudo is first replaced by an explicit
+// stack pointer adjustment.
+MachineBasicBlock::iterator MipsFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    // N64 uses the 64-bit stack pointer register.
+    const unsigned StackPtr = STI.getABI().IsN64() ? Mips::SP_64 : Mips::SP;
+
+    // ADJCALLSTACKDOWN moves the stack pointer down, so its amount is negated.
+    const int64_t Size = I->getOperand(0).getImm();
+    const bool IsDown = I->getOpcode() == Mips::ADJCALLSTACKDOWN;
+    const int64_t Delta = IsDown ? -Size : Size;
+
+    STI.getInstrInfo()->adjustStackPtr(StackPtr, Delta, MBB, I);
+  }
+
+  return MBB.erase(I);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
new file mode 100644
index 000000000000..8c4214c4c21d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
@@ -0,0 +1,54 @@
+//===-- MipsFrameLowering.h - Define frame lowering for Mips ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+ class MipsSubtarget;
+
+/// Frame lowering interface for Mips. Concrete objects are obtained through
+/// create(), which hands out a Mips16 or a MipsSE implementation.
+class MipsFrameLowering : public TargetFrameLowering {
+protected:
+ /// Subtarget this frame lowering was created for.
+ const MipsSubtarget &STI;
+
+public:
+ /// Configures the base class with a downward-growing stack; \p Alignment is
+ /// passed both as the second and as the fourth TargetFrameLowering argument,
+ /// with 0 as the third.
+ explicit MipsFrameLowering(const MipsSubtarget &sti, unsigned Alignment)
+ : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {}
+
+ /// Create the frame lowering object matching the mode of \p ST.
+ static const MipsFrameLowering *create(const MipsSubtarget &ST);
+
+ /// Whether \p MF should have a dedicated frame pointer register.
+ bool hasFP(const MachineFunction &MF) const override;
+
+ /// Whether \p MF has variable sized objects and also needs stack
+ /// realignment.
+ bool hasBP(const MachineFunction &MF) const;
+
+ bool isFPCloseToIncomingSP() const override { return false; }
+
+ /// Remove a call frame setup/destroy pseudo instruction, emitting an
+ /// explicit stack pointer adjustment when the call frame is not reserved.
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+protected:
+ /// Conservative estimate of the stack frame size of \p MF.
+ uint64_t estimateStackSize(const MachineFunction &MF) const;
+};
+
+/// Create MipsFrameLowering objects.
+const MipsFrameLowering *createMips16FrameLowering(const MipsSubtarget &ST);
+const MipsFrameLowering *createMipsSEFrameLowering(const MipsSubtarget &ST);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp b/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp
new file mode 100644
index 000000000000..31b86124bc8d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp
@@ -0,0 +1,160 @@
+//===-- MipsHazardSchedule.cpp - Workaround pipeline hazards --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass is used to work around certain pipeline hazards. For now, this
+/// covers compact branch hazards. In future this pass can be extended to other
+/// pipeline hazards, such as various MIPS1 hazards, processor errata that
+/// require instruction reorganization, etc.
+///
+/// This pass has to run after the delay slot filler as that pass can introduce
+/// pipeline hazards, hence the existing hazard recognizer is not suitable.
+///
+/// Hazards handled: forbidden slots for MIPSR6.
+///
+/// A forbidden slot hazard occurs when a compact branch instruction is executed
+/// and the adjacent instruction in memory is a control transfer instruction
+/// such as a branch or jump, ERET, ERETNC, DERET, WAIT and PAUSE.
+///
+/// For example:
+///
+/// 0x8004 bnec a1,v0,<P+0x18>
+/// 0x8008 beqc a1,a2,<P+0x54>
+///
+/// In such cases, the processor is required to signal a Reserved Instruction
+/// exception.
+///
+/// Here, if the instruction at 0x8004 is executed, the processor will raise an
+/// exception as there is a control transfer instruction at 0x8008.
+///
+/// There are two sources of forbidden slot hazards:
+///
+/// A) A previous pass has created a compact branch directly.
+/// B) Transforming a delay slot branch into compact branch. This case can be
+/// difficult to process as lookahead for hazards is insufficient, as
+/// backwards delay slot filling can also produce hazards in previously
+/// processed instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-hazard-schedule"
+
+STATISTIC(NumInsertedNops, "Number of nops inserted");
+
+namespace {
+
+typedef MachineBasicBlock::iterator Iter;
+typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+/// Machine function pass that inserts NOPs after instructions with a
+/// forbidden slot when the following instruction is not safe to place there.
+class MipsHazardSchedule : public MachineFunctionPass {
+
+public:
+ MipsHazardSchedule() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "Mips Hazard Schedule"; }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ // The pass requires the NoVRegs property to be set on the function.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ // Pass identification.
+ static char ID;
+};
+
+char MipsHazardSchedule::ID = 0;
+} // end of anonymous namespace
+
+/// Returns a pass that clears pipeline hazards. The pass object is newly
+/// allocated on each call.
+FunctionPass *llvm::createMipsHazardSchedule() {
+ return new MipsHazardSchedule();
+}
+
+// Starting at Position, skip over transient instructions and return the first
+// one that is not transient, or the end of Position's basic block when there
+// is none. The search never leaves the block.
+static Iter getNextMachineInstrInBB(Iter Position) {
+  const Iter BlockEnd = Position->getParent()->end();
+  return std::find_if_not(
+      Position, BlockEnd,
+      [](const Iter &Candidate) { return Candidate->isTransient(); });
+}
+
+// Find the next real instruction from the current position, looking through
+// basic block boundaries.
+//
+// Returns the iterator that was reached together with a flag. The flag is
+// true when the search ran out of blocks to fall through into (there is no
+// next block, or the next block is not a successor of the current one); in
+// that case the iterator must not be dereferenced.
+static std::pair<Iter, bool> getNextMachineInstr(Iter Position,
+                                                 MachineBasicBlock *Parent) {
+  if (Position == Parent->end()) {
+    // Move to the fall-through block, stepping over empty blocks: for those
+    // begin() equals end() and must not be handed to the in-block search,
+    // which dereferences its argument.
+    do {
+      MachineBasicBlock *Succ = Parent->getNextNode();
+      if (Succ == nullptr || !Parent->isSuccessor(Succ))
+        return std::make_pair(Position, true);
+      Position = Succ->begin();
+      Parent = Succ;
+    } while (Parent->empty());
+  }
+
+  Iter Instr = getNextMachineInstrInBB(Position);
+  if (Instr == Parent->end())
+    return getNextMachineInstr(Instr, Parent);
+  return std::make_pair(Instr, false);
+}
+
+// Insert a NOP after every instruction with a forbidden slot whose following
+// real instruction is not safe in that slot, or for which no following
+// instruction can be found. Returns true when at least one NOP was inserted.
+bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
+
+  const MipsSubtarget *STI =
+      &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+
+  // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6.
+  if (!STI->hasMips32r6() || STI->inMicroMipsMode())
+    return false;
+
+  bool Changed = false;
+  const MipsInstrInfo *TII = STI->getInstrInfo();
+
+  for (MachineFunction::iterator FI = MF.begin(); FI != MF.end(); ++FI) {
+    for (Iter I = FI->begin(); I != FI->end(); ++I) {
+
+      // Forbidden slot hazard handling. Use lookahead over state.
+      if (!TII->HasForbiddenSlot(*I))
+        continue;
+
+      Iter Inst;
+      bool LastInstInFunction =
+          std::next(I) == FI->end() && std::next(FI) == MF.end();
+      if (!LastInstInFunction) {
+        // The lookahead may still run off the end (only transient
+        // instructions remain, or there is no fall-through successor). Treat
+        // that like the last instruction of the function.
+        std::pair<Iter, bool> Next = getNextMachineInstr(std::next(I), &*FI);
+        LastInstInFunction = Next.second;
+        Inst = Next.first;
+      }
+
+      // Inst is only dereferenced when a following instruction was found.
+      if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) {
+        Changed = true;
+        MIBundleBuilder(&*I)
+            .append(BuildMI(MF, I->getDebugLoc(), TII->get(Mips::NOP)));
+        NumInsertedNops++;
+      }
+    }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
new file mode 100644
index 000000000000..0e1173f1c617
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -0,0 +1,270 @@
+//===-- MipsISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsISelDAGToDAG.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "Mips16ISelDAGToDAG.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSEISelDAGToDAG.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+
+bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  // Cache the subtarget of this function before running selection.
+  Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+
+  const bool Changed = SelectionDAGISel::runOnMachineFunction(MF);
+
+  // Post-selection hook; it always runs, whatever the generic selector
+  // reported.
+  processFunctionAfterISel(MF);
+  return Changed;
+}
+
+/// getGlobalBaseReg - Return a register node, of pointer type, for the
+/// function's global base register (the register that holds the GOT address).
+SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
+  const unsigned BaseReg =
+      MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
+  const EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
+  SDValue RegNode = CurDAG->getRegister(BaseReg, PtrVT);
+  return RegNode.getNode();
+}
+
+// The address, integer-address and vector-splat selectors below are all
+// unimplemented placeholders: each one hits llvm_unreachable when called, and
+// the trailing `return false` is never reached.
+// NOTE(review): the real implementations presumably live in the Mips16 / MipsSE
+// selector subclasses - confirm against those files.
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr11MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr12MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr16MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrSImm10(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrSImm10Lsl1(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrSImm10Lsl2(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectAddr16(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectAddr16SP(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm,
+ unsigned MinSizeInBits) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions
+///
+/// Order of attempts: nodes that already carry a machine opcode are left
+/// alone, then trySelect() gets a chance, then the GOT address node is handled
+/// here, and everything else goes to the generated SelectCode().
+void MipsDAGToDAGISel::Select(SDNode *Node) {
+ unsigned Opcode = Node->getOpcode();
+
+ // Dump information about the Node being selected
+ DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ Node->setNodeId(-1);
+ return;
+ }
+
+ // See if subclasses can handle this node.
+ if (trySelect(Node))
+ return;
+
+ switch(Opcode) {
+ default: break;
+
+ // Get target GOT address.
+ case ISD::GLOBAL_OFFSET_TABLE:
+ ReplaceNode(Node, getGlobalBaseReg());
+ return;
+
+#ifndef NDEBUG
+ // Debug-build sanity check only: unless the subtarget supports unaligned
+ // access, the memory type's size in bytes must not exceed the node's
+ // alignment. Selection itself still happens in SelectCode() below.
+ case ISD::LOAD:
+ case ISD::STORE:
+ assert((Subtarget->systemSupportsUnalignedAccess() ||
+ cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <=
+ cast<MemSDNode>(Node)->getAlignment()) &&
+ "Unexpected unaligned loads/stores.");
+ break;
+#endif
+ }
+
+ // Select the default instruction
+ SelectCode(Node);
+}
+
+// Handle a memory operand of an inline asm statement. For the i, m, R and ZC
+// constraints the operand is appended to OutOps unchanged and false is
+// returned; any other constraint is reported via llvm_unreachable.
+// NOTE(review): false presumably means "operand handled" here - confirm
+// against the base class contract.
+bool MipsDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) {
+ // All memory constraints can at least accept raw pointers.
+ switch(ConstraintID) {
+ default:
+ llvm_unreachable("Unexpected asm memory constraint");
+ case InlineAsm::Constraint_i:
+ case InlineAsm::Constraint_m:
+ case InlineAsm::Constraint_R:
+ case InlineAsm::Constraint_ZC:
+ OutOps.push_back(Op);
+ return false;
+ }
+ // Not reached: every path through the switch above returns or aborts.
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
new file mode 100644
index 000000000000..20bdd4aa8f5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -0,0 +1,144 @@
+//===---- MipsISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPSISELDAGTODAG_H
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace llvm {
+
+class MipsDAGToDAGISel : public SelectionDAGISel { // Abstract base: trySelect() and processFunctionAfterISel() are pure virtual.
+public:
+ explicit MipsDAGToDAGISel(MipsTargetMachine &TM, CodeGenOpt::Level OL)
+ : SelectionDAGISel(TM, OL), Subtarget(nullptr) {} // Subtarget starts null; presumably assigned in runOnMachineFunction -- confirm.
+
+ // Human-readable pass name.
+ StringRef getPassName() const override {
+ return "MIPS DAG->DAG Pattern Instruction Selection";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+protected:
+ SDNode *getGlobalBaseReg(); // Used by Select() as the replacement for ISD::GLOBAL_OFFSET_TABLE nodes.
+
+ /// Keep a pointer to the MipsSubtarget around so that we can make the right
+ /// decision when generating code for different targets.
+ const MipsSubtarget *Subtarget;
+
+private:
+ // Include the pieces autogenerated from the target description.
+ #include "MipsGenDAGISel.inc"
+
+ // Complex pattern selectors. All are virtual; the MaskL/MaskR definitions visible in the .cpp are unreachable stubs.
+ /// (reg + imm).
+ virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ /// Fall back on this function if all else fails.
+ virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ /// Match integer address pattern.
+ virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ virtual bool selectIntAddr11MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const; // Undocumented in the original; name suggests an 11-bit microMIPS offset -- confirm.
+
+ virtual bool selectIntAddr12MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const; // Undocumented in the original; name suggests a 12-bit microMIPS offset -- confirm.
+
+ virtual bool selectIntAddr16MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const; // Undocumented in the original; name suggests a 16-bit microMIPS offset -- confirm.
+
+ virtual bool selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const; // Undocumented in the original; name suggests an offset shifted left by 2 -- confirm.
+
+ /// Match addr+simm10 and addr
+ virtual bool selectIntAddrSImm10(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ virtual bool selectIntAddrSImm10Lsl1(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const; // Presumably the simm10 form with the offset scaled by 2 -- confirm.
+
+ virtual bool selectIntAddrSImm10Lsl2(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const; // Presumably the simm10 form with the offset scaled by 4 -- confirm.
+
+ virtual bool selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const; // Presumably the simm10 form with the offset scaled by 8 -- confirm.
+
+ virtual bool selectAddr16(SDValue Addr, SDValue &Base, SDValue &Offset); // NOTE(review): not const-qualified, unlike the selectors above.
+ virtual bool selectAddr16SP(SDValue Addr, SDValue &Base, SDValue &Offset); // NOTE(review): not const-qualified, unlike the selectors above.
+
+ /// \brief Select constant vector splats.
+ virtual bool selectVSplat(SDNode *N, APInt &Imm,
+ unsigned MinSizeInBits) const;
+ /// \brief Select constant vector splats whose value fits in a uimm1.
+ virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm2.
+ virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm3.
+ virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm4.
+ virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm5.
+ virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm6.
+ virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm8.
+ virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a simm5.
+ virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is a power of 2.
+ virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is the inverse of a
+ /// power of 2.
+ virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is a run of set bits
+ /// ending at the most significant bit.
+ virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is a run of set bits
+ /// starting at bit zero.
+ virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+
+ void Select(SDNode *N) override;
+
+ virtual bool trySelect(SDNode *Node) = 0; // Hook called by Select() before its generic switch; returning true means the node was handled.
+
+ // getImm - Return a target constant with value Imm, typed as Node's first result type and located at Node.
+ inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, SDLoc(Node), Node->getValueType(0));
+ }
+
+ virtual void processFunctionAfterISel(MachineFunction &MF) = 0; // Subclass hook; its call site is not visible in this header.
+
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
new file mode 100644
index 000000000000..9c511bd77822
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -0,0 +1,4030 @@
+//===-- MipsISelLowering.cpp - Mips DAG Lowering Implementation -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsISelLowering.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsCCState.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "MipsTargetObjectFile.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<bool>
+LargeGOT("mxgot", cl::Hidden,
+ cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false)); // Hidden option, off by default.
+
+static cl::opt<bool>
+NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
+ cl::desc("MIPS: Don't trap on integer division by zero."),
+ cl::init(false)); // Hidden option, off by default.
+
+static const MCPhysReg Mips64DPRegs[8] = { // Eight 64-bit double-precision registers, D12_64 through D19_64; their use is outside this chunk.
+ Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
+ Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
+};
+
+// If I is a shifted mask (as decided by isShiftedMask_64), store its number of
+// set bits in Size and the index of its lowest set bit in Pos, and return true.
+// Otherwise return false and leave Pos and Size unwritten. E.g. 0x003ff800 gives (Pos, Size) = (11, 11).
+static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
+ if (!isShiftedMask_64(I))
+ return false;
+
+ Size = countPopulation(I);
+ Pos = countTrailingZeros(I);
+ return true;
+}
+
+SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const { // Returns a register node of type Ty for the function's global base register.
+ MipsFunctionInfo *FI = DAG.getMachineFunction().getInfo<MipsFunctionInfo>();
+ return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
+}
+
+SDValue MipsTargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const { // Builds the target global-address node for N's global, tagged with Flag.
+ return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag); // Offset argument is the literal 0; N's own offset is not forwarded here.
+}
+
+SDValue MipsTargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const { // Builds the target external-symbol node for N's symbol, tagged with Flag.
+ return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const { // Builds the target block-address node for N's block address, tagged with Flag.
+ return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); // Offset argument is the literal 0.
+}
+
+SDValue MipsTargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const { // Builds the target jump-table node for N's table index, tagged with Flag.
+ return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const { // Builds the target constant-pool node, tagged with Flag.
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ N->getOffset(), Flag); // Unlike the global/block-address overloads, N's alignment and offset are forwarded.
+}
+
+const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { // Returns the printable "MipsISD::<Name>" string for a MipsISD opcode, or nullptr.
+ switch ((MipsISD::NodeType)Opcode) { // No default label; presumably so the compiler can warn about unhandled enumerators -- confirm.
+ case MipsISD::FIRST_NUMBER: break; // Has no name; falls out of the switch to the nullptr return.
+ case MipsISD::JmpLink: return "MipsISD::JmpLink";
+ case MipsISD::TailCall: return "MipsISD::TailCall";
+ case MipsISD::Hi: return "MipsISD::Hi";
+ case MipsISD::Lo: return "MipsISD::Lo";
+ case MipsISD::GPRel: return "MipsISD::GPRel";
+ case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer";
+ case MipsISD::Ret: return "MipsISD::Ret";
+ case MipsISD::ERet: return "MipsISD::ERet";
+ case MipsISD::EH_RETURN: return "MipsISD::EH_RETURN";
+ case MipsISD::FPBrcond: return "MipsISD::FPBrcond";
+ case MipsISD::FPCmp: return "MipsISD::FPCmp";
+ case MipsISD::CMovFP_T: return "MipsISD::CMovFP_T";
+ case MipsISD::CMovFP_F: return "MipsISD::CMovFP_F";
+ case MipsISD::TruncIntFP: return "MipsISD::TruncIntFP";
+ case MipsISD::MFHI: return "MipsISD::MFHI";
+ case MipsISD::MFLO: return "MipsISD::MFLO";
+ case MipsISD::MTLOHI: return "MipsISD::MTLOHI";
+ case MipsISD::Mult: return "MipsISD::Mult";
+ case MipsISD::Multu: return "MipsISD::Multu";
+ case MipsISD::MAdd: return "MipsISD::MAdd";
+ case MipsISD::MAddu: return "MipsISD::MAddu";
+ case MipsISD::MSub: return "MipsISD::MSub";
+ case MipsISD::MSubu: return "MipsISD::MSubu";
+ case MipsISD::DivRem: return "MipsISD::DivRem";
+ case MipsISD::DivRemU: return "MipsISD::DivRemU";
+ case MipsISD::DivRem16: return "MipsISD::DivRem16";
+ case MipsISD::DivRemU16: return "MipsISD::DivRemU16";
+ case MipsISD::BuildPairF64: return "MipsISD::BuildPairF64";
+ case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64";
+ case MipsISD::Wrapper: return "MipsISD::Wrapper";
+ case MipsISD::DynAlloc: return "MipsISD::DynAlloc";
+ case MipsISD::Sync: return "MipsISD::Sync";
+ case MipsISD::Ext: return "MipsISD::Ext";
+ case MipsISD::Ins: return "MipsISD::Ins";
+ case MipsISD::LWL: return "MipsISD::LWL";
+ case MipsISD::LWR: return "MipsISD::LWR";
+ case MipsISD::SWL: return "MipsISD::SWL";
+ case MipsISD::SWR: return "MipsISD::SWR";
+ case MipsISD::LDL: return "MipsISD::LDL";
+ case MipsISD::LDR: return "MipsISD::LDR";
+ case MipsISD::SDL: return "MipsISD::SDL";
+ case MipsISD::SDR: return "MipsISD::SDR";
+ case MipsISD::EXTP: return "MipsISD::EXTP";
+ case MipsISD::EXTPDP: return "MipsISD::EXTPDP";
+ case MipsISD::EXTR_S_H: return "MipsISD::EXTR_S_H";
+ case MipsISD::EXTR_W: return "MipsISD::EXTR_W";
+ case MipsISD::EXTR_R_W: return "MipsISD::EXTR_R_W";
+ case MipsISD::EXTR_RS_W: return "MipsISD::EXTR_RS_W";
+ case MipsISD::SHILO: return "MipsISD::SHILO";
+ case MipsISD::MTHLIP: return "MipsISD::MTHLIP";
+ case MipsISD::MULSAQ_S_W_PH: return "MipsISD::MULSAQ_S_W_PH";
+ case MipsISD::MAQ_S_W_PHL: return "MipsISD::MAQ_S_W_PHL";
+ case MipsISD::MAQ_S_W_PHR: return "MipsISD::MAQ_S_W_PHR";
+ case MipsISD::MAQ_SA_W_PHL: return "MipsISD::MAQ_SA_W_PHL";
+ case MipsISD::MAQ_SA_W_PHR: return "MipsISD::MAQ_SA_W_PHR";
+ case MipsISD::DPAU_H_QBL: return "MipsISD::DPAU_H_QBL";
+ case MipsISD::DPAU_H_QBR: return "MipsISD::DPAU_H_QBR";
+ case MipsISD::DPSU_H_QBL: return "MipsISD::DPSU_H_QBL";
+ case MipsISD::DPSU_H_QBR: return "MipsISD::DPSU_H_QBR";
+ case MipsISD::DPAQ_S_W_PH: return "MipsISD::DPAQ_S_W_PH";
+ case MipsISD::DPSQ_S_W_PH: return "MipsISD::DPSQ_S_W_PH";
+ case MipsISD::DPAQ_SA_L_W: return "MipsISD::DPAQ_SA_L_W";
+ case MipsISD::DPSQ_SA_L_W: return "MipsISD::DPSQ_SA_L_W";
+ case MipsISD::DPA_W_PH: return "MipsISD::DPA_W_PH";
+ case MipsISD::DPS_W_PH: return "MipsISD::DPS_W_PH";
+ case MipsISD::DPAQX_S_W_PH: return "MipsISD::DPAQX_S_W_PH";
+ case MipsISD::DPAQX_SA_W_PH: return "MipsISD::DPAQX_SA_W_PH";
+ case MipsISD::DPAX_W_PH: return "MipsISD::DPAX_W_PH";
+ case MipsISD::DPSX_W_PH: return "MipsISD::DPSX_W_PH";
+ case MipsISD::DPSQX_S_W_PH: return "MipsISD::DPSQX_S_W_PH";
+ case MipsISD::DPSQX_SA_W_PH: return "MipsISD::DPSQX_SA_W_PH";
+ case MipsISD::MULSA_W_PH: return "MipsISD::MULSA_W_PH";
+ case MipsISD::MULT: return "MipsISD::MULT";
+ case MipsISD::MULTU: return "MipsISD::MULTU";
+ case MipsISD::MADD_DSP: return "MipsISD::MADD_DSP";
+ case MipsISD::MADDU_DSP: return "MipsISD::MADDU_DSP";
+ case MipsISD::MSUB_DSP: return "MipsISD::MSUB_DSP";
+ case MipsISD::MSUBU_DSP: return "MipsISD::MSUBU_DSP";
+ case MipsISD::SHLL_DSP: return "MipsISD::SHLL_DSP";
+ case MipsISD::SHRA_DSP: return "MipsISD::SHRA_DSP";
+ case MipsISD::SHRL_DSP: return "MipsISD::SHRL_DSP";
+ case MipsISD::SETCC_DSP: return "MipsISD::SETCC_DSP";
+ case MipsISD::SELECT_CC_DSP: return "MipsISD::SELECT_CC_DSP";
+ case MipsISD::VALL_ZERO: return "MipsISD::VALL_ZERO";
+ case MipsISD::VANY_ZERO: return "MipsISD::VANY_ZERO";
+ case MipsISD::VALL_NONZERO: return "MipsISD::VALL_NONZERO";
+ case MipsISD::VANY_NONZERO: return "MipsISD::VANY_NONZERO";
+ case MipsISD::VCEQ: return "MipsISD::VCEQ";
+ case MipsISD::VCLE_S: return "MipsISD::VCLE_S";
+ case MipsISD::VCLE_U: return "MipsISD::VCLE_U";
+ case MipsISD::VCLT_S: return "MipsISD::VCLT_S";
+ case MipsISD::VCLT_U: return "MipsISD::VCLT_U";
+ case MipsISD::VSMAX: return "MipsISD::VSMAX";
+ case MipsISD::VSMIN: return "MipsISD::VSMIN";
+ case MipsISD::VUMAX: return "MipsISD::VUMAX";
+ case MipsISD::VUMIN: return "MipsISD::VUMIN";
+ case MipsISD::VEXTRACT_SEXT_ELT: return "MipsISD::VEXTRACT_SEXT_ELT";
+ case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT";
+ case MipsISD::VNOR: return "MipsISD::VNOR";
+ case MipsISD::VSHF: return "MipsISD::VSHF";
+ case MipsISD::SHF: return "MipsISD::SHF";
+ case MipsISD::ILVEV: return "MipsISD::ILVEV";
+ case MipsISD::ILVOD: return "MipsISD::ILVOD";
+ case MipsISD::ILVL: return "MipsISD::ILVL";
+ case MipsISD::ILVR: return "MipsISD::ILVR";
+ case MipsISD::PCKEV: return "MipsISD::PCKEV";
+ case MipsISD::PCKOD: return "MipsISD::PCKOD";
+ case MipsISD::INSVE: return "MipsISD::INSVE";
+ }
+ return nullptr; // Reached for FIRST_NUMBER or any opcode value without a case above.
+}
+
+MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI)
+ : TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) { // Registers the legalization actions, DAG combines and alignment settings common to MIPS subtargets.
+ // Mips does not have i1 type, so use i32 for
+ // setcc operations results (slt, sgt, ...).
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+ // The cmp.cond.fmt instruction in MIPS32r6/MIPS64r6 uses 0 and -1 like MSA
+ // does. Integer booleans still use 0 and 1.
+ if (Subtarget.hasMips32r6())
+ setBooleanContents(ZeroOrOneBooleanContent,
+ ZeroOrNegativeOneBooleanContent);
+
+ // Extending loads from i1 must be promoted, for every integer result type.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ }
+
+ // MIPS doesn't have extending float->double load/store. Set LoadExtAction
+ // for f32 and f16 sources to Expand.
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ }
+
+ // Set LoadExtAction for f16 vectors to Expand
+ for (MVT VT : MVT::fp_vector_valuetypes()) {
+ MVT F16VT = MVT::getVectorVT(MVT::f16, VT.getVectorNumElements());
+ if (F16VT.isValid()) // Skip element counts for which no f16 vector type exists.
+ setLoadExtAction(ISD::EXTLOAD, VT, F16VT, Expand);
+ }
+
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // Used by legalize types to correctly generate the setcc result.
+ // Without this, every float setcc comes with an AND/OR with the result,
+ // we don't want this, since the fpcmp result goes to a flag register,
+ // which is used implicitly by brcond and select operations.
+ AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
+
+ // Mips Custom Operations
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand); // NOTE: Expand, not Custom, despite the heading above.
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+ if (Subtarget.isGP64bit()) { // i64 counterparts of the custom actions above, plus i64 load/store and shift-parts.
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::i64, Custom);
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+ }
+
+ if (!Subtarget.isGP64bit()) { // With 32-bit GPRs the shift-parts nodes are custom-lowered at i32 instead.
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ }
+
+ setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
+ if (Subtarget.isGP64bit())
+ setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
+
+ setOperationAction(ISD::SDIV, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::i64, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIV, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+ // Operations not directly supported by Mips.
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ if (Subtarget.hasCnMips()) { // CTPOP is Legal only when the subtarget reports cnMIPS.
+ setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+ setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+ } else {
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ }
+ setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+ if (!Subtarget.hasMips32r2())
+ setOperationAction(ISD::ROTR, MVT::i32, Expand);
+
+ if (!Subtarget.hasMips64r2())
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+ setOperationAction(ISD::FLOG, MVT::f32, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f32, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f32, Expand);
+ setOperationAction(ISD::FEXP, MVT::f32, Expand);
+ setOperationAction(ISD::FMA, MVT::f32, Expand);
+ setOperationAction(ISD::FMA, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+
+ // Lower f16 conversion operations into library calls
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+
+ setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+ // Use the default for now
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+ if (!Subtarget.isGP64bit()) { // 64-bit atomic load/store are expanded when GPRs are 32-bit.
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+ }
+
+
+ if (!Subtarget.hasMips32r2()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ }
+
+ // MIPS16 lacks MIPS32's clz and clo instructions.
+ if (!Subtarget.hasMips32() || Subtarget.inMips16Mode())
+ setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+ if (!Subtarget.hasMips64())
+ setOperationAction(ISD::CTLZ, MVT::i64, Expand);
+
+ if (!Subtarget.hasMips32r2())
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ if (!Subtarget.hasMips64r2())
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+ if (Subtarget.isGP64bit()) { // i32<->i64 extending loads and truncating stores are custom-lowered with 64-bit GPRs.
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i32, Custom);
+ setTruncStoreAction(MVT::i64, MVT::i32, Custom);
+ }
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ setTargetDAGCombine(ISD::SDIVREM); // Node kinds for which the target requests DAG-combine callbacks.
+ setTargetDAGCombine(ISD::UDIVREM);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::AssertZext);
+
+ if (ABI.IsO32()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ }
+
+ setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2); // Presumably a log2 value (8 vs 4 bytes) -- confirm against TargetLowering.
+
+ // The arguments on the stack are defined in terms of 4-byte slots on O32
+ // and 8-byte slots on N32/N64.
+ setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? 8 : 4);
+
+ setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP);
+
+ MaxStoresPerMemcpy = 16;
+
+ isMicroMips = Subtarget.inMicroMipsMode(); // Cache the subtarget's microMIPS mode in the isMicroMips member.
+}
+
+const MipsTargetLowering *MipsTargetLowering::create(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI) { // Factory: picks the lowering implementation from the subtarget's mode.
+ if (STI.inMips16Mode())
+ return llvm::createMips16TargetLowering(TM, STI); // MIPS16 mode gets the Mips16 lowering.
+
+ return llvm::createMipsSETargetLowering(TM, STI); // Every other mode gets the MipsSE lowering.
+}
+
+// Create a fast isel object, or return nullptr when the fast path is disabled or unsupported for this configuration.
+FastISel *
+MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ const MipsTargetMachine &TM =
+ static_cast<const MipsTargetMachine &>(funcInfo.MF->getTarget());
+
+ // We support only the standard encoding [MIPS32,MIPS32R5] ISAs: MIPS32 or later, but not r6, MIPS16 or microMIPS.
+ bool UseFastISel = TM.Options.EnableFastISel && Subtarget.hasMips32() &&
+ !Subtarget.hasMips32r6() && !Subtarget.inMips16Mode() &&
+ !Subtarget.inMicroMipsMode();
+
+ // Disable if we don't generate PIC or the ABI isn't O32.
+ if (!TM.isPositionIndependent() || !TM.getABI().IsO32())
+ UseFastISel = false;
+
+ return UseFastISel ? Mips::createFastISel(funcInfo, libInfo) : nullptr;
+}
+
+EVT MipsTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+ EVT VT) const { // Result type of a setcc on operands of type VT; the DataLayout and context arguments are unused.
+ if (!VT.isVector())
+ return MVT::i32; // Scalar comparisons always produce i32.
+ return VT.changeVectorElementTypeToInteger(); // Vector comparisons produce the integer-element counterpart of VT.
+}
+
+static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) { // Rewrites an SDIVREM/UDIVREM node into a glued DivRem16/DivRemU16 plus copies from LO/HI; Subtarget is unused here.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ EVT Ty = N->getValueType(0);
+ unsigned LO = (Ty == MVT::i32) ? Mips::LO0 : Mips::LO0_64; // Any type other than i32 selects the 64-bit register.
+ unsigned HI = (Ty == MVT::i32) ? Mips::HI0 : Mips::HI0_64;
+ unsigned Opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem16 :
+ MipsISD::DivRemU16;
+ SDLoc DL(N);
+
+ SDValue DivRem = DAG.getNode(Opc, DL, MVT::Glue,
+ N->getOperand(0), N->getOperand(1));
+ SDValue InChain = DAG.getEntryNode();
+ SDValue InGlue = DivRem;
+
+ // insert MFLO: result value 0 of N, if used, is replaced by a copy from LO.
+ if (N->hasAnyUseOfValue(0)) {
+ SDValue CopyFromLo = DAG.getCopyFromReg(InChain, DL, LO, Ty,
+ InGlue);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyFromLo);
+ InChain = CopyFromLo.getValue(1); // Thread chain and glue so a following HI copy is ordered after this one.
+ InGlue = CopyFromLo.getValue(2);
+ }
+
+ // insert MFHI: result value 1 of N, if used, is replaced by a copy from HI.
+ if (N->hasAnyUseOfValue(1)) {
+ SDValue CopyFromHi = DAG.getCopyFromReg(InChain, DL,
+ HI, Ty, InGlue);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi);
+ }
+
+ return SDValue(); // Always empty: uses were already replaced in place above.
+}
+
+static Mips::CondCode condCodeToFCC(ISD::CondCode CC) { // Maps an ISD condition code to a Mips::FCOND_* code.
+ switch (CC) {
+ default: llvm_unreachable("Unknown fp condition code!");
+ case ISD::SETEQ: // Codes without an O/U prefix share the ordered mapping below.
+ case ISD::SETOEQ: return Mips::FCOND_OEQ;
+ case ISD::SETUNE: return Mips::FCOND_UNE;
+ case ISD::SETLT:
+ case ISD::SETOLT: return Mips::FCOND_OLT;
+ case ISD::SETGT:
+ case ISD::SETOGT: return Mips::FCOND_OGT;
+ case ISD::SETLE:
+ case ISD::SETOLE: return Mips::FCOND_OLE;
+ case ISD::SETGE:
+ case ISD::SETOGE: return Mips::FCOND_OGE;
+ case ISD::SETULT: return Mips::FCOND_ULT;
+ case ISD::SETULE: return Mips::FCOND_ULE;
+ case ISD::SETUGT: return Mips::FCOND_UGT;
+ case ISD::SETUGE: return Mips::FCOND_UGE;
+ case ISD::SETUO: return Mips::FCOND_UN;
+ case ISD::SETO: return Mips::FCOND_OR;
+ case ISD::SETNE:
+ case ISD::SETONE: return Mips::FCOND_ONE;
+ case ISD::SETUEQ: return Mips::FCOND_UEQ;
+ }
+}
+
+
+/// Returns true if the floating point conditional branches and conditional
+/// moves which use condition code CC should be inverted, false otherwise.
+static bool invertFPCondCodeUser(Mips::CondCode CC) {
+ if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT) // Codes in [FCOND_F, FCOND_NGT] are used as-is.
+ return false;
+
+ assert((CC >= Mips::FCOND_T && CC <= Mips::FCOND_GT) &&
+ "Illegal Condition Code");
+
+ return true; // Codes in [FCOND_T, FCOND_GT] require the user to be inverted.
+}
+
+// Creates and returns a glue-typed FPCmp node from a floating point setcc node.
+// Returns Op unchanged if it is not a setcc or its first operand is not floating point.
+static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
+ // must be a SETCC node
+ if (Op.getOpcode() != ISD::SETCC)
+ return Op;
+
+ SDValue LHS = Op.getOperand(0);
+
+ if (!LHS.getValueType().isFloatingPoint())
+ return Op;
+
+ SDValue RHS = Op.getOperand(1);
+ SDLoc DL(Op);
+
+ // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of
+ // node if necessary.
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ return DAG.getNode(MipsISD::FPCmp, DL, MVT::Glue, LHS, RHS,
+ DAG.getConstant(condCodeToFCC(CC), DL, MVT::i32)); // Operand 2 carries the Mips FCOND code as an i32 constant.
+}
+
+// Creates and returns a CMovFP_T or CMovFP_F node, chosen by the condition code stored in Cond's operand 2.
+static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
+ SDValue False, const SDLoc &DL) {
+ ConstantSDNode *CC = cast<ConstantSDNode>(Cond.getOperand(2)); // Assumes Cond is an FPCmp node shaped as createFPCmp builds it -- confirm at call sites.
+ bool invert = invertFPCondCodeUser((Mips::CondCode)CC->getSExtValue());
+ SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
+
+ return DAG.getNode((invert ? MipsISD::CMovFP_F : MipsISD::CMovFP_T), DL,
+ True.getValueType(), True, FCC0, False, Cond); // Operand order: true value, FCC0, false value, compare (glue).
+}
+
+static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) { // Combines an integer SELECT fed by an integer SETCC when the false value is a constant; Subtarget is unused here.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue SetCC = N->getOperand(0);
+
+ if ((SetCC.getOpcode() != ISD::SETCC) ||
+ !SetCC.getOperand(0).getValueType().isInteger())
+ return SDValue();
+
+ SDValue False = N->getOperand(2);
+ EVT FalseTy = False.getValueType();
+
+ if (!FalseTy.isInteger())
+ return SDValue();
+
+ ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(False);
+
+ // If the RHS (False) is 0, we swap the order of the operands
+ // of ISD::SELECT (obviously also inverting the condition) so that we can
+ // take advantage of conditional moves using the $0 register.
+ // Example:
+ // return (a != 0) ? x : 0;
+ // load $reg, x
+ // movz $reg, $0, a
+ if (!FalseC)
+ return SDValue();
+
+ const SDLoc DL(N);
+
+ if (!FalseC->getZExtValue()) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ SDValue True = N->getOperand(1);
+
+ SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
+ SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+
+ return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True); // Same select with condition inverted and arms swapped.
+ }
+
+ // If both operands are integer constants there's a possibility that we
+ // can do some interesting optimizations.
+ SDValue True = N->getOperand(1);
+ ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(True);
+
+ if (!TrueC || !True.getValueType().isInteger())
+ return SDValue();
+
+ // We'll also ignore MVT::i64 operands as this optimization proves
+ // to be ineffective because of the required sign extensions as the result
+ // of a SETCC operator is always MVT::i32 for non-vector types.
+ if (True.getValueType() == MVT::i64)
+ return SDValue();
+
+ int64_t Diff = TrueC->getSExtValue() - FalseC->getSExtValue();
+
+ // 1) (a < x) ? y : y-1
+ // slti $reg1, a, x
+ // addiu $reg2, $reg1, y-1
+ if (Diff == 1)
+ return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, False); // setcc (0 or 1) + false value.
+
+ // 2) (a < x) ? y-1 : y
+ // slti $reg1, a, x
+ // xor $reg1, $reg1, 1
+ // addiu $reg2, $reg1, y-1
+ if (Diff == -1) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
+ SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+ return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, True); // inverted setcc (0 or 1) + true value.
+ }
+
+ // Couldn't optimize.
+ return SDValue();
+}
+
+static SDValue performCMovFPCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) { // Flips CMovFP_T <-> CMovFP_F and swaps the value operands when the false value is constant zero; Subtarget is unused here.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue ValueIfTrue = N->getOperand(0), ValueIfFalse = N->getOperand(2);
+
+ ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(ValueIfFalse);
+ if (!FalseC || FalseC->getZExtValue()) // Only a constant-zero false value qualifies.
+ return SDValue();
+
+ // Since RHS (False) is 0, we swap the order of the True/False operands
+ // (obviously also inverting the condition) so that we can
+ // take advantage of conditional moves using the $0 register.
+ // Example:
+ // return (a != 0) ? x : 0;
+ // load $reg, x
+ // movz $reg, $0, a
+ unsigned Opc = (N->getOpcode() == MipsISD::CMovFP_T) ? MipsISD::CMovFP_F :
+ MipsISD::CMovFP_T;
+
+ SDValue FCC = N->getOperand(1), Glue = N->getOperand(3);
+ return DAG.getNode(Opc, SDLoc(N), ValueIfFalse.getValueType(),
+ ValueIfFalse, FCC, ValueIfTrue, Glue); // FCC and glue operands are reused unchanged.
+}
+
+// performANDCombine - Recognise the bit-field extract idiom and turn it into
+// a MipsISD::Ext node:
+//   $dst = and ((sra or srl) $src, pos), (2**size - 1)
+//   => ext $dst, $src, size, pos
+// Returns a null SDValue when the pattern does not match.
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+  if (!Subtarget.hasExtractInsert())
+    return SDValue();
+
+  SDValue Shift = N->getOperand(0);
+  SDValue MaskOp = N->getOperand(1);
+
+  // The first operand of the AND has to be a right shift of either kind.
+  const unsigned ShiftOpc = Shift.getOpcode();
+  const bool IsRightShift = ShiftOpc == ISD::SRA || ShiftOpc == ISD::SRL;
+  if (!IsRightShift)
+    return SDValue();
+
+  // ... by a constant amount, which becomes the position of the field.
+  ConstantSDNode *ShiftAmtC = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  if (!ShiftAmtC)
+    return SDValue();
+  const uint64_t Pos = ShiftAmtC->getZExtValue();
+
+  // The second operand of the AND has to be a constant shifted mask.
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
+  if (!MaskC)
+    return SDValue();
+  uint64_t MaskPos, MaskSize;
+  if (!isShiftedMask(MaskC->getZExtValue(), MaskPos, MaskSize))
+    return SDValue();
+
+  // The mask must begin at bit 0, and the extracted field must lie entirely
+  // inside the value.
+  EVT ValTy = N->getValueType(0);
+  if (MaskPos != 0)
+    return SDValue();
+  if (Pos + MaskSize > ValTy.getSizeInBits())
+    return SDValue();
+
+  SDLoc DL(N);
+  return DAG.getNode(MipsISD::Ext, DL, ValTy, Shift.getOperand(0),
+                     DAG.getConstant(Pos, DL, MVT::i32),
+                     DAG.getConstant(MaskSize, DL, MVT::i32));
+}
+
+// performORCombine - Recognise the bit-field insert idiom and turn it into a
+// MipsISD::Ins node:
+//   $dst = or (and $src1, mask0), (and (shl $src, pos), mask1),
+//   where mask1 = (2**size - 1) << pos, mask0 = ~mask1
+//   => ins $dst, $src, size, pos, $src1
+// Returns a null SDValue when the pattern does not match.
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+  if (!Subtarget.hasExtractInsert())
+    return SDValue();
+
+  SDValue KeepAnd = N->getOperand(0);
+  SDValue InsertAnd = N->getOperand(1);
+
+  // First operand: (and $src1, mask0), where ~mask0 is a shifted mask.
+  if (KeepAnd.getOpcode() != ISD::AND)
+    return SDValue();
+  ConstantSDNode *KeepMaskC = dyn_cast<ConstantSDNode>(KeepAnd.getOperand(1));
+  if (!KeepMaskC)
+    return SDValue();
+  uint64_t KeepPos, KeepSize;
+  if (!isShiftedMask(~KeepMaskC->getSExtValue(), KeepPos, KeepSize))
+    return SDValue();
+
+  // Second operand: (and (shl $src, pos), mask1), mask1 a shifted mask.
+  if (InsertAnd.getOpcode() != ISD::AND)
+    return SDValue();
+  ConstantSDNode *InsertMaskC =
+      dyn_cast<ConstantSDNode>(InsertAnd.getOperand(1));
+  if (!InsertMaskC)
+    return SDValue();
+  uint64_t InsertPos, InsertSize;
+  if (!isShiftedMask(InsertMaskC->getZExtValue(), InsertPos, InsertSize))
+    return SDValue();
+
+  // Both masks have to describe the very same field.
+  if (KeepPos != InsertPos)
+    return SDValue();
+  if (KeepSize != InsertSize)
+    return SDValue();
+
+  SDValue ShiftLeft = InsertAnd.getOperand(0);
+  if (ShiftLeft.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  ConstantSDNode *ShiftAmtC = dyn_cast<ConstantSDNode>(ShiftLeft.getOperand(1));
+  if (!ShiftAmtC)
+    return SDValue();
+  unsigned Shamt = ShiftAmtC->getZExtValue();
+
+  // The shift amount has to equal the first bit of the field, and the field
+  // has to fit inside the value.
+  EVT ValTy = N->getValueType(0);
+  if (Shamt != KeepPos)
+    return SDValue();
+  if (KeepPos + KeepSize > ValTy.getSizeInBits())
+    return SDValue();
+
+  SDLoc DL(N);
+  return DAG.getNode(MipsISD::Ins, DL, ValTy, ShiftLeft.getOperand(0),
+                     DAG.getConstant(KeepPos, DL, MVT::i32),
+                     DAG.getConstant(KeepSize, DL, MVT::i32),
+                     KeepAnd.getOperand(0));
+}
+
+// performADDCombine - Reassociate an add so that the low part of a jump-table
+// address becomes the outermost addend:
+//   (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
+// Returns a null SDValue when the pattern does not match.
+static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // The right-hand operand must itself be an add ...
+  SDValue InnerAdd = N->getOperand(1);
+  if (InnerAdd.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // ... whose right-hand operand is MipsISD::Lo of a TargetJumpTable.
+  SDValue LoPart = InnerAdd.getOperand(1);
+  if (LoPart.getOpcode() != MipsISD::Lo)
+    return SDValue();
+  if (LoPart.getOperand(0).getOpcode() != ISD::TargetJumpTable)
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT ValTy = N->getValueType(0);
+
+  SDValue Sum = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0),
+                            InnerAdd.getOperand(0));
+  return DAG.getNode(ISD::ADD, DL, ValTy, Sum, LoPart);
+}
+
+// performAssertZextCombine - Fold
+//   (AssertZext (trunc (AssertZext x))) -> (trunc (AssertZext x))
+// when the type asserted by the outer node is narrower than the type asserted
+// by the inner one, eg:
+//   (AssertZext:i32 (trunc:i32 (AssertZext:i64 X, i32)), i8)
+//   -> (trunc:i32 (AssertZext X, i8))
+// Returns a null SDValue when the fold does not apply.
+static SDValue performAssertZextCombine(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const MipsSubtarget &Subtarget) {
+  SDValue Trunc = N->getOperand(0);
+  EVT OuterVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+
+  // Match (trunc (AssertZext ...)) underneath N.
+  if (Trunc.getOpcode() != ISD::TRUNCATE ||
+      Trunc.getOperand(0).getOpcode() != ISD::AssertZext)
+    return SDValue();
+
+  SDValue InnerAssert = Trunc.getOperand(0);
+  EVT InnerVT = cast<VTSDNode>(InnerAssert->getOperand(1))->getVT();
+
+  // Only worthwhile when the outer assertion is the stricter of the two.
+  if (!OuterVT.bitsLT(InnerVT))
+    return SDValue();
+
+  SDValue TightenedAssert =
+      DAG.getNode(ISD::AssertZext, SDLoc(N), InnerAssert.getValueType(),
+                  InnerAssert.getOperand(0), DAG.getValueType(OuterVT));
+  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0),
+                     TightenedAssert);
+}
+
+// PerformDAGCombine - Hands N to the combine routine that matches its opcode
+// and returns whatever that routine returns. Opcodes without a routine yield
+// a null SDValue.
+SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &CurDAG = DCI.DAG;
+
+  switch (N->getOpcode()) {
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+    return performDivRemCombine(N, CurDAG, DCI, Subtarget);
+  case ISD::SELECT:
+    return performSELECTCombine(N, CurDAG, DCI, Subtarget);
+  case MipsISD::CMovFP_F:
+  case MipsISD::CMovFP_T:
+    return performCMovFPCombine(N, CurDAG, DCI, Subtarget);
+  case ISD::AND:
+    return performANDCombine(N, CurDAG, DCI, Subtarget);
+  case ISD::OR:
+    return performORCombine(N, CurDAG, DCI, Subtarget);
+  case ISD::ADD:
+    return performADDCombine(N, CurDAG, DCI, Subtarget);
+  case ISD::AssertZext:
+    return performAssertZextCombine(N, CurDAG, DCI, Subtarget);
+  default:
+    break;
+  }
+
+  return SDValue();
+}
+
+// isCheapToSpeculateCttz - Reports the subtarget's hasMips32() answer.
+bool MipsTargetLowering::isCheapToSpeculateCttz() const {
+  const bool HasMips32 = Subtarget.hasMips32();
+  return HasMips32;
+}
+
+// isCheapToSpeculateCtlz - Reports the subtarget's hasMips32() answer.
+bool MipsTargetLowering::isCheapToSpeculateCtlz() const {
+  const bool HasMips32 = Subtarget.hasMips32();
+  return HasMips32;
+}
+
+// LowerOperationWrapper - Runs LowerOperation on result 0 of N and appends
+// every value of the node it returns to Results.
+//
+// LowerOperation returns a null SDValue for opcodes it has no case for, so
+// the result must be checked before it is dereferenced. In that situation
+// Results is left untouched.
+void
+MipsTargetLowering::LowerOperationWrapper(SDNode *N,
+                                          SmallVectorImpl<SDValue> &Results,
+                                          SelectionDAG &DAG) const {
+  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+  // Nothing was produced: do not call getNumValues() through a null node.
+  if (!Res.getNode())
+    return;
+
+  for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+    Results.push_back(Res.getValue(I));
+}
+
+// ReplaceNodeResults - Forwards to LowerOperationWrapper, which fills Results
+// with the values that replace N.
+void MipsTargetLowering::ReplaceNodeResults(SDNode *N,
+                                            SmallVectorImpl<SDValue> &Results,
+                                            SelectionDAG &DAG) const {
+  LowerOperationWrapper(N, Results, DAG);
+}
+
+// LowerOperation - Routes Op to the lowering routine that handles its opcode.
+// Opcodes without a case below yield a null SDValue.
+SDValue MipsTargetLowering::LowerOperation(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::BRCOND:
+    return lowerBRCOND(Op, DAG);
+  case ISD::ConstantPool:
+    return lowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:
+    return lowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:
+    return lowerBlockAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:
+    return lowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable:
+    return lowerJumpTable(Op, DAG);
+  case ISD::SELECT:
+    return lowerSELECT(Op, DAG);
+  case ISD::SETCC:
+    return lowerSETCC(Op, DAG);
+  case ISD::VASTART:
+    return lowerVASTART(Op, DAG);
+  case ISD::VAARG:
+    return lowerVAARG(Op, DAG);
+  case ISD::FCOPYSIGN:
+    return lowerFCOPYSIGN(Op, DAG);
+  case ISD::FRAMEADDR:
+    return lowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:
+    return lowerRETURNADDR(Op, DAG);
+  case ISD::EH_RETURN:
+    return lowerEH_RETURN(Op, DAG);
+  case ISD::ATOMIC_FENCE:
+    return lowerATOMIC_FENCE(Op, DAG);
+  case ISD::SHL_PARTS:
+    return lowerShiftLeftParts(Op, DAG);
+  case ISD::SRA_PARTS:
+    return lowerShiftRightParts(Op, DAG, true);
+  case ISD::SRL_PARTS:
+    return lowerShiftRightParts(Op, DAG, false);
+  case ISD::LOAD:
+    return lowerLOAD(Op, DAG);
+  case ISD::STORE:
+    return lowerSTORE(Op, DAG);
+  case ISD::EH_DWARF_CFA:
+    return lowerEH_DWARF_CFA(Op, DAG);
+  case ISD::FP_TO_SINT:
+    return lowerFP_TO_SINT(Op, DAG);
+  default:
+    break;
+  }
+
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Lower helper functions
+//===----------------------------------------------------------------------===//
+
+// addLiveIn - Creates a fresh virtual register of class RC, records the pair
+// (PReg, new virtual register) as a live-in of MF, and returns the virtual
+// register.
+static unsigned addLiveIn(MachineFunction &MF, unsigned PReg,
+                          const TargetRegisterClass *RC) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const unsigned NewVReg = MRI.createVirtualRegister(RC);
+  MRI.addLiveIn(PReg, NewVReg);
+  return NewVReg;
+}
+
+// insertDivByZeroTrap - Unless NoZeroDivCheck is set, places the instruction
+// "teq $divisor_reg, $zero, 7" immediately after the division MI. The
+// division itself is left in place; only the extra trap is added. MBB is
+// returned in every case.
+static MachineBasicBlock *insertDivByZeroTrap(MachineInstr &MI,
+                                              MachineBasicBlock &MBB,
+                                              const TargetInstrInfo &TII,
+                                              bool Is64Bit, bool IsMicroMips) {
+  if (NoZeroDivCheck)
+    return &MBB;
+
+  // Operand 2 of the division is the divisor.
+  MachineOperand &DivisorOp = MI.getOperand(2);
+  const unsigned TrapOpc = IsMicroMips ? Mips::TEQ_MM : Mips::TEQ;
+  MachineBasicBlock::iterator AfterDiv =
+      std::next(MachineBasicBlock::iterator(MI));
+
+  // The trap inherits the divisor's kill state from the division.
+  MachineInstrBuilder Trap =
+      BuildMI(MBB, AfterDiv, MI.getDebugLoc(), TII.get(TrapOpc))
+          .addReg(DivisorOp.getReg(), getKillRegState(DivisorOp.isKill()))
+          .addReg(Mips::ZERO)
+          .addImm(7);
+
+  // For a 64-bit division the trap reads the 32-bit sub-register.
+  if (Is64Bit)
+    Trap->getOperand(0).setSubReg(Mips::sub_32);
+
+  // The divisor is now read again by the trap, so the division no longer
+  // kills it.
+  DivisorOp.setIsKill(false);
+
+  return &MBB;
+}
+
+// Expands the instruction MI by dispatching on its opcode to the matching
+// helper, and returns the block that helper returns. An opcode with no case
+// below reaches llvm_unreachable.
+MachineBasicBlock *
+MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ // Atomic read-modify-write pseudos. The third argument is the access size
+ // in bytes: 1 and 2 go to the partword helper, 4 and 8 to emitAtomicBinary.
+ // The fourth argument is the opcode of the binary operation to apply.
+ case Mips::ATOMIC_LOAD_ADD_I8:
+ return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
+ case Mips::ATOMIC_LOAD_ADD_I16:
+ return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
+ case Mips::ATOMIC_LOAD_ADD_I32:
+ return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
+ case Mips::ATOMIC_LOAD_ADD_I64:
+ return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
+
+ case Mips::ATOMIC_LOAD_AND_I8:
+ return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
+ case Mips::ATOMIC_LOAD_AND_I16:
+ return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
+ case Mips::ATOMIC_LOAD_AND_I32:
+ return emitAtomicBinary(MI, BB, 4, Mips::AND);
+ case Mips::ATOMIC_LOAD_AND_I64:
+ return emitAtomicBinary(MI, BB, 8, Mips::AND64);
+
+ case Mips::ATOMIC_LOAD_OR_I8:
+ return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
+ case Mips::ATOMIC_LOAD_OR_I16:
+ return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
+ case Mips::ATOMIC_LOAD_OR_I32:
+ return emitAtomicBinary(MI, BB, 4, Mips::OR);
+ case Mips::ATOMIC_LOAD_OR_I64:
+ return emitAtomicBinary(MI, BB, 8, Mips::OR64);
+
+ case Mips::ATOMIC_LOAD_XOR_I8:
+ return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
+ case Mips::ATOMIC_LOAD_XOR_I16:
+ return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
+ case Mips::ATOMIC_LOAD_XOR_I32:
+ return emitAtomicBinary(MI, BB, 4, Mips::XOR);
+ case Mips::ATOMIC_LOAD_XOR_I64:
+ return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
+
+ // NAND has no single opcode: BinOpcode is 0 and the Nand flag is set.
+ case Mips::ATOMIC_LOAD_NAND_I8:
+ return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
+ case Mips::ATOMIC_LOAD_NAND_I16:
+ return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
+ case Mips::ATOMIC_LOAD_NAND_I32:
+ return emitAtomicBinary(MI, BB, 4, 0, true);
+ case Mips::ATOMIC_LOAD_NAND_I64:
+ return emitAtomicBinary(MI, BB, 8, 0, true);
+
+ case Mips::ATOMIC_LOAD_SUB_I8:
+ return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
+ case Mips::ATOMIC_LOAD_SUB_I16:
+ return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
+ case Mips::ATOMIC_LOAD_SUB_I32:
+ return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
+ case Mips::ATOMIC_LOAD_SUB_I64:
+ return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
+
+ // Swap is requested by passing BinOpcode == 0 without the Nand flag.
+ case Mips::ATOMIC_SWAP_I8:
+ return emitAtomicBinaryPartword(MI, BB, 1, 0);
+ case Mips::ATOMIC_SWAP_I16:
+ return emitAtomicBinaryPartword(MI, BB, 2, 0);
+ case Mips::ATOMIC_SWAP_I32:
+ return emitAtomicBinary(MI, BB, 4, 0);
+ case Mips::ATOMIC_SWAP_I64:
+ return emitAtomicBinary(MI, BB, 8, 0);
+
+ // Compare-and-swap pseudos; the last argument is the access size in bytes.
+ case Mips::ATOMIC_CMP_SWAP_I8:
+ return emitAtomicCmpSwapPartword(MI, BB, 1);
+ case Mips::ATOMIC_CMP_SWAP_I16:
+ return emitAtomicCmpSwapPartword(MI, BB, 2);
+ case Mips::ATOMIC_CMP_SWAP_I32:
+ return emitAtomicCmpSwap(MI, BB, 4);
+ case Mips::ATOMIC_CMP_SWAP_I64:
+ return emitAtomicCmpSwap(MI, BB, 8);
+ // Division and remainder instructions get a divide-by-zero trap. The two
+ // trailing arguments of insertDivByZeroTrap are Is64Bit and IsMicroMips.
+ case Mips::PseudoSDIV:
+ case Mips::PseudoUDIV:
+ case Mips::DIV:
+ case Mips::DIVU:
+ case Mips::MOD:
+ case Mips::MODU:
+ return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false,
+ false);
+ case Mips::SDIV_MM_Pseudo:
+ case Mips::UDIV_MM_Pseudo:
+ case Mips::SDIV_MM:
+ case Mips::UDIV_MM:
+ case Mips::DIV_MMR6:
+ case Mips::DIVU_MMR6:
+ case Mips::MOD_MMR6:
+ case Mips::MODU_MMR6:
+ return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false, true);
+ case Mips::PseudoDSDIV:
+ case Mips::PseudoDUDIV:
+ case Mips::DDIV:
+ case Mips::DDIVU:
+ case Mips::DMOD:
+ case Mips::DMODU:
+ return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), true, false);
+ case Mips::DDIV_MM64R6:
+ case Mips::DDIVU_MM64R6:
+ case Mips::DMOD_MM64R6:
+ case Mips::DMODU_MM64R6:
+ return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), true, true);
+ case Mips::SEL_D:
+ case Mips::SEL_D_MMR6:
+ return emitSEL_D(MI, BB);
+
+ // Select pseudos: the integer-condition forms pass Mips::BNE, the
+ // floating-point-condition forms pass Mips::BC1F or Mips::BC1T.
+ case Mips::PseudoSELECT_I:
+ case Mips::PseudoSELECT_I64:
+ case Mips::PseudoSELECT_S:
+ case Mips::PseudoSELECT_D32:
+ case Mips::PseudoSELECT_D64:
+ return emitPseudoSELECT(MI, BB, false, Mips::BNE);
+ case Mips::PseudoSELECTFP_F_I:
+ case Mips::PseudoSELECTFP_F_I64:
+ case Mips::PseudoSELECTFP_F_S:
+ case Mips::PseudoSELECTFP_F_D32:
+ case Mips::PseudoSELECTFP_F_D64:
+ return emitPseudoSELECT(MI, BB, true, Mips::BC1F);
+ case Mips::PseudoSELECTFP_T_I:
+ case Mips::PseudoSELECTFP_T_I64:
+ case Mips::PseudoSELECTFP_T_S:
+ case Mips::PseudoSELECTFP_T_D32:
+ case Mips::PseudoSELECTFP_T_D64:
+ return emitPseudoSELECT(MI, BB, true, Mips::BC1T);
+ }
+}
+
+// Expands a word (Size == 4) or doubleword (Size == 8) atomic
+// read-modify-write pseudo into an LL/SC retry loop.
+//
+// MI's operands are read as: 0 = register receiving the loaded (old) value,
+// 1 = pointer register, 2 = the value operand of the operation.
+// BinOpcode is the opcode applied to (oldval, incr) to form the stored value.
+// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
+// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
+//
+// MI is erased, and the returned block (exitMBB) holds the instructions that
+// followed MI in the original block.
+MachineBasicBlock *MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size,
+ unsigned BinOpcode,
+ bool Nand) const {
+ assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");
+
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const bool ArePtrs64bit = ABI.ArePtrs64bit();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned LL, SC, AND, NOR, ZERO, BEQ;
+
+ // Pick the opcodes and the zero register that match the access size.
+ if (Size == 4) {
+ if (isMicroMips) {
+ LL = Mips::LL_MM;
+ SC = Mips::SC_MM;
+ } else {
+ // The LL/SC variant depends on both the R6 feature and the pointer width.
+ LL = Subtarget.hasMips32r6()
+ ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = Subtarget.hasMips32r6()
+ ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
+ AND = Mips::AND;
+ NOR = Mips::NOR;
+ ZERO = Mips::ZERO;
+ BEQ = Mips::BEQ;
+ } else {
+ LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+ SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
+ AND = Mips::AND64;
+ NOR = Mips::NOR64;
+ ZERO = Mips::ZERO_64;
+ BEQ = Mips::BEQ64;
+ }
+
+ unsigned OldVal = MI.getOperand(0).getReg();
+ unsigned Ptr = MI.getOperand(1).getReg();
+ unsigned Incr = MI.getOperand(2).getReg();
+
+ unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+ unsigned AndRes = RegInfo.createVirtualRegister(RC);
+ unsigned Success = RegInfo.createVirtualRegister(RC);
+
+ // insert new blocks after the current block
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB->getIterator();
+ MF->insert(It, loopMBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loopMBB);
+ loopMBB->addSuccessor(loopMBB);
+ loopMBB->addSuccessor(exitMBB);
+
+ // loopMBB:
+ // ll oldval, 0(ptr)
+ // <binop> storeval, oldval, incr
+ // sc success, storeval, 0(ptr)
+ // beq success, $0, loopMBB
+ BB = loopMBB;
+ BuildMI(BB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
+ if (Nand) {
+ // and andres, oldval, incr
+ // nor storeval, $0, andres
+ BuildMI(BB, DL, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
+ BuildMI(BB, DL, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
+ } else if (BinOpcode) {
+ // <binop> storeval, oldval, incr
+ BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
+ } else {
+ // Swap: the incoming value is stored as-is, no computation is emitted.
+ StoreVal = Incr;
+ }
+ BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
+ // Branch back to loopMBB while the SC result register compares equal to
+ // the zero register.
+ BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
+
+ MI.eraseFromParent(); // The instruction is gone now.
+
+ return exitMBB;
+}
+
+// Appends to BB the instructions that sign-extend the low Size bytes of
+// SrcReg into DstReg, and returns BB.
+//
+// Size is a byte count. When the subtarget reports hasMips32r2(), a single
+// SEB (Size == 1) or SEH (Size == 2) is emitted. Otherwise the value is
+// shifted left and then arithmetically right by 32 - Size * 8 bits through a
+// scratch register. MI is only used for its debug location.
+MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
+    MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
+    unsigned SrcReg) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  if (Subtarget.hasMips32r2() && Size == 1) {
+    BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg);
+    return BB;
+  }
+
+  if (Subtarget.hasMips32r2() && Size == 2) {
+    BuildMI(BB, DL, TII->get(Mips::SEH), DstReg).addReg(SrcReg);
+    return BB;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  unsigned ScrReg = RegInfo.createVirtualRegister(RC);
+
+  // Size counts bytes, not bits. The shift amount below is 32 - Size * 8, so
+  // only 1..4 bytes give a shift amount in the range 0..24; anything else
+  // would produce an out-of-range or negative immediate.
+  assert(Size >= 1 && Size <= 4 &&
+         "Size is in bytes and must fit in a 32-bit register");
+  int64_t ShiftImm = 32 - (Size * 8);
+
+  BuildMI(BB, DL, TII->get(Mips::SLL), ScrReg).addReg(SrcReg).addImm(ShiftImm);
+  BuildMI(BB, DL, TII->get(Mips::SRA), DstReg).addReg(ScrReg).addImm(ShiftImm);
+
+  return BB;
+}
+
+// Expands a byte (Size == 1) or halfword (Size == 2) atomic read-modify-write
+// pseudo into an LL/SC retry loop on the aligned word that contains the
+// addressed location.
+//
+// MI's operands are read as: 0 = destination register, 1 = pointer register,
+// 2 = the value operand of the operation. BinOpcode is the opcode applied to
+// (oldval, shifted incr); BinOpcode == 0 means swap, and Nand == true selects
+// the and/nor sequence instead.
+//
+// MI is erased, and the returned block (exitMBB) holds the instructions that
+// followed MI in the original block.
+MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+ bool Nand) const {
+ assert((Size == 1 || Size == 2) &&
+ "Unsupported size for EmitAtomicBinaryPartial.");
+
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+ const bool ArePtrs64bit = ABI.ArePtrs64bit();
+ // RCp is the register class used for pointer-sized values.
+ const TargetRegisterClass *RCp =
+ getRegClassFor(ArePtrs64bit ? MVT::i64 : MVT::i32);
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Ptr = MI.getOperand(1).getReg();
+ unsigned Incr = MI.getOperand(2).getReg();
+
+ unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
+ unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
+ unsigned Mask = RegInfo.createVirtualRegister(RC);
+ unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+ unsigned NewVal = RegInfo.createVirtualRegister(RC);
+ unsigned OldVal = RegInfo.createVirtualRegister(RC);
+ unsigned Incr2 = RegInfo.createVirtualRegister(RC);
+ unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
+ unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+ unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+ unsigned AndRes = RegInfo.createVirtualRegister(RC);
+ unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+ unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+ unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+ unsigned Success = RegInfo.createVirtualRegister(RC);
+
+ // The LL/SC variant depends on microMIPS, the R6 feature and pointer width.
+ unsigned LL, SC;
+ if (isMicroMips) {
+ LL = Mips::LL_MM;
+ SC = Mips::SC_MM;
+ } else {
+ LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
+ // insert new blocks after the current block
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB->getIterator();
+ MF->insert(It, loopMBB);
+ MF->insert(It, sinkMBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Control flow: BB -> loopMBB (retries itself) -> sinkMBB -> exitMBB.
+ BB->addSuccessor(loopMBB);
+ loopMBB->addSuccessor(loopMBB);
+ loopMBB->addSuccessor(sinkMBB);
+ sinkMBB->addSuccessor(exitMBB);
+
+ // thisMBB:
+ // addiu masklsb2,$0,-4 # 0xfffffffc
+ // and alignedaddr,ptr,masklsb2
+ // andi ptrlsb2,ptr,3
+ // xori off,ptrlsb2,3 (or 2) # Only for BE; off then replaces ptrlsb2
+ // sll shiftamt,ptrlsb2,3
+ // ori maskupper,$0,255 # 0xff (65535 when Size == 2)
+ // sllv mask,maskupper,shiftamt
+ // nor mask2,$0,mask
+ // sllv incr2,incr,shiftamt
+
+ int64_t MaskImm = (Size == 1) ? 255 : 65535;
+ BuildMI(BB, DL, TII->get(ABI.GetPtrAddiuOp()), MaskLSB2)
+ .addReg(ABI.GetNullPtr()).addImm(-4);
+ BuildMI(BB, DL, TII->get(ABI.GetPtrAndOp()), AlignedAddr)
+ .addReg(Ptr).addReg(MaskLSB2);
+ // With 64-bit pointers only the low 32-bit sub-register of Ptr is read.
+ BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2)
+ .addReg(Ptr, 0, ArePtrs64bit ? Mips::sub_32 : 0).addImm(3);
+ if (Subtarget.isLittle()) {
+ BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+ } else {
+ // Big-endian: flip the byte offset (xor with 3 for bytes, 2 for
+ // halfwords) before turning it into a bit shift amount.
+ unsigned Off = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+ .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+ BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+ }
+ BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
+ .addReg(Mips::ZERO).addImm(MaskImm);
+ BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
+ .addReg(MaskUpper).addReg(ShiftAmt);
+ BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+ BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(Incr).addReg(ShiftAmt);
+
+ // atomic.load.binop
+ // loopMBB:
+ // ll oldval,0(alignedaddr)
+ // binop binopres,oldval,incr2
+ // and newval,binopres,mask
+ // and maskedoldval0,oldval,mask2
+ // or storeval,maskedoldval0,newval
+ // sc success,storeval,0(alignedaddr)
+ // beq success,$0,loopMBB
+
+ // atomic.swap
+ // loopMBB:
+ // ll oldval,0(alignedaddr)
+ // and newval,incr2,mask
+ // and maskedoldval0,oldval,mask2
+ // or storeval,maskedoldval0,newval
+ // sc success,storeval,0(alignedaddr)
+ // beq success,$0,loopMBB
+
+ BB = loopMBB;
+ BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+ if (Nand) {
+ // and andres, oldval, incr2
+ // nor binopres, $0, andres
+ // and newval, binopres, mask
+ BuildMI(BB, DL, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
+ BuildMI(BB, DL, TII->get(Mips::NOR), BinOpRes)
+ .addReg(Mips::ZERO).addReg(AndRes);
+ BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+ } else if (BinOpcode) {
+ // <binop> binopres, oldval, incr2
+ // and newval, binopres, mask
+ BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
+ BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+ } else { // atomic.swap
+ // and newval, incr2, mask
+ BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
+ }
+
+ // Merge the new field into the bits of the loaded word outside the mask,
+ // then attempt the store.
+ BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
+ .addReg(OldVal).addReg(Mask2);
+ BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
+ .addReg(MaskedOldVal0).addReg(NewVal);
+ BuildMI(BB, DL, TII->get(SC), Success)
+ .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
+ BuildMI(BB, DL, TII->get(Mips::BEQ))
+ .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
+
+ // sinkMBB:
+ // and maskedoldval1,oldval,mask
+ // srlv srlres,maskedoldval1,shiftamt
+ // sign_extend dest,srlres
+ BB = sinkMBB;
+
+ BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
+ .addReg(OldVal).addReg(Mask);
+ BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
+ .addReg(MaskedOldVal1).addReg(ShiftAmt);
+ BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+
+ MI.eraseFromParent(); // The instruction is gone now.
+
+ return exitMBB;
+}
+
+// Expands a word (Size == 4) or doubleword (Size == 8) atomic compare-and-swap
+// pseudo into an LL/SC sequence spread over two new blocks.
+//
+// MI's operands are read as: 0 = destination register (receives the loaded
+// value), 1 = pointer register, 2 = value to compare against, 3 = value to
+// store when the comparison succeeds.
+//
+// MI is erased, and the returned block (exitMBB) holds the instructions that
+// followed MI in the original block.
+MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size) const {
+ assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
+
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const bool ArePtrs64bit = ABI.ArePtrs64bit();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned LL, SC, ZERO, BNE, BEQ;
+
+ // Pick the opcodes and the zero register that match the access size.
+ if (Size == 4) {
+ if (isMicroMips) {
+ LL = Mips::LL_MM;
+ SC = Mips::SC_MM;
+ } else {
+ // The LL/SC variant depends on both the R6 feature and the pointer width.
+ LL = Subtarget.hasMips32r6()
+ ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = Subtarget.hasMips32r6()
+ ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
+ ZERO = Mips::ZERO;
+ BNE = Mips::BNE;
+ BEQ = Mips::BEQ;
+ } else {
+ LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+ SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
+ ZERO = Mips::ZERO_64;
+ BNE = Mips::BNE64;
+ BEQ = Mips::BEQ64;
+ }
+
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Ptr = MI.getOperand(1).getReg();
+ unsigned OldVal = MI.getOperand(2).getReg();
+ unsigned NewVal = MI.getOperand(3).getReg();
+
+ unsigned Success = RegInfo.createVirtualRegister(RC);
+
+ // insert new blocks after the current block
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB->getIterator();
+ MF->insert(It, loop1MBB);
+ MF->insert(It, loop2MBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loop1MBB
+ BB->addSuccessor(loop1MBB);
+ loop1MBB->addSuccessor(exitMBB);
+ loop1MBB->addSuccessor(loop2MBB);
+ loop2MBB->addSuccessor(loop1MBB);
+ loop2MBB->addSuccessor(exitMBB);
+
+ // loop1MBB:
+ // ll dest, 0(ptr)
+ // bne dest, oldval, exitMBB
+ BB = loop1MBB;
+ BuildMI(BB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
+ BuildMI(BB, DL, TII->get(BNE))
+ .addReg(Dest).addReg(OldVal).addMBB(exitMBB);
+
+ // loop2MBB:
+ // sc success, newval, 0(ptr)
+ // beq success, $0, loop1MBB
+ BB = loop2MBB;
+ BuildMI(BB, DL, TII->get(SC), Success)
+ .addReg(NewVal).addReg(Ptr).addImm(0);
+ // Branch back to loop1MBB while the SC result register compares equal to
+ // the zero register.
+ BuildMI(BB, DL, TII->get(BEQ))
+ .addReg(Success).addReg(ZERO).addMBB(loop1MBB);
+
+ MI.eraseFromParent(); // The instruction is gone now.
+
+ return exitMBB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned Size) const {
+ assert((Size == 1 || Size == 2) &&
+ "Unsupported size for EmitAtomicCmpSwapPartial.");
+
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+ const bool ArePtrs64bit = ABI.ArePtrs64bit();
+ const TargetRegisterClass *RCp =
+ getRegClassFor(ArePtrs64bit ? MVT::i64 : MVT::i32);
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Ptr = MI.getOperand(1).getReg();
+ unsigned CmpVal = MI.getOperand(2).getReg();
+ unsigned NewVal = MI.getOperand(3).getReg();
+
+ unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
+ unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
+ unsigned Mask = RegInfo.createVirtualRegister(RC);
+ unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+ unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
+ unsigned OldVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+ unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
+ unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+ unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+ unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+ unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+ unsigned Success = RegInfo.createVirtualRegister(RC);
+ unsigned LL, SC;
+
+ if (isMicroMips) {
+ LL = Mips::LL_MM;
+ SC = Mips::SC_MM;
+ } else {
+ LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
+ // insert new blocks after the current block
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB->getIterator();
+ MF->insert(It, loop1MBB);
+ MF->insert(It, loop2MBB);
+ MF->insert(It, sinkMBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ BB->addSuccessor(loop1MBB);
+ loop1MBB->addSuccessor(sinkMBB);
+ loop1MBB->addSuccessor(loop2MBB);
+ loop2MBB->addSuccessor(loop1MBB);
+ loop2MBB->addSuccessor(sinkMBB);
+ sinkMBB->addSuccessor(exitMBB);
+
+ // FIXME: computation of newval2 can be moved to loop2MBB.
+ // thisMBB:
+ // addiu masklsb2,$0,-4 # 0xfffffffc
+ // and alignedaddr,ptr,masklsb2
+ // andi ptrlsb2,ptr,3
+ // xori ptrlsb2,ptrlsb2,3 # Only for BE
+ // sll shiftamt,ptrlsb2,3
+ // ori maskupper,$0,255 # 0xff
+ // sll mask,maskupper,shiftamt
+ // nor mask2,$0,mask
+ // andi maskedcmpval,cmpval,255
+ // sll shiftedcmpval,maskedcmpval,shiftamt
+ // andi maskednewval,newval,255
+ // sll shiftednewval,maskednewval,shiftamt
+ int64_t MaskImm = (Size == 1) ? 255 : 65535;
+ BuildMI(BB, DL, TII->get(ArePtrs64bit ? Mips::DADDiu : Mips::ADDiu), MaskLSB2)
+ .addReg(ABI.GetNullPtr()).addImm(-4);
+ BuildMI(BB, DL, TII->get(ArePtrs64bit ? Mips::AND64 : Mips::AND), AlignedAddr)
+ .addReg(Ptr).addReg(MaskLSB2);
+ BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2)
+ .addReg(Ptr, 0, ArePtrs64bit ? Mips::sub_32 : 0).addImm(3);
+ if (Subtarget.isLittle()) {
+ BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+ } else {
+ unsigned Off = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+ .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+ BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+ }
+ BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
+ .addReg(Mips::ZERO).addImm(MaskImm);
+ BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
+ .addReg(MaskUpper).addReg(ShiftAmt);
+ BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+ BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedCmpVal)
+ .addReg(CmpVal).addImm(MaskImm);
+ BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedCmpVal)
+ .addReg(MaskedCmpVal).addReg(ShiftAmt);
+ BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedNewVal)
+ .addReg(NewVal).addImm(MaskImm);
+ BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
+ .addReg(MaskedNewVal).addReg(ShiftAmt);
+
+ // loop1MBB:
+ // ll oldval,0(alignedaddr)
+ // and maskedoldval0,oldval,mask
+ // bne maskedoldval0,shiftedcmpval,sinkMBB
+ BB = loop1MBB;
+ BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+ BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
+ .addReg(OldVal).addReg(Mask);
+ BuildMI(BB, DL, TII->get(Mips::BNE))
+ .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
+
+ // loop2MBB:
+ // and maskedoldval1,oldval,mask2
+ // or storeval,maskedoldval1,shiftednewval
+ // sc success,storeval,0(alignedaddr)
+ // beq success,$0,loop1MBB
+ BB = loop2MBB;
+ BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
+ .addReg(OldVal).addReg(Mask2);
+ BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
+ .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
+ BuildMI(BB, DL, TII->get(SC), Success)
+ .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
+ BuildMI(BB, DL, TII->get(Mips::BEQ))
+ .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
+
+ // sinkMBB:
+ // srl srlres,maskedoldval0,shiftamt
+ // sign_extend dest,srlres
+ BB = sinkMBB;
+
+ BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
+ .addReg(MaskedOldVal0).addReg(ShiftAmt);
+ BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+
+ MI.eraseFromParent(); // The instruction is gone now.
+
+ return exitMBB;
+}
+// Replaces MI's condition register (operand 1) with a new FGR64 vreg; returns BB.
+MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator II(MI); // new instruction goes right before MI
+
+ unsigned Fc = MI.getOperand(1).getReg(); // current condition register
+ const auto &FGR64RegClass = TRI->getRegClass(Mips::FGR64RegClassID);
+
+ unsigned Fc2 = RegInfo.createVirtualRegister(FGR64RegClass);
+ // Fc2 = SUBREG_TO_REG 0, Fc, sub_lo
+ BuildMI(*BB, II, DL, TII->get(Mips::SUBREG_TO_REG), Fc2)
+ .addImm(0)
+ .addReg(Fc)
+ .addImm(Mips::sub_lo);
+
+ // We don't erase the original instruction, we just replace the condition
+ // register with the 64-bit super-register.
+ MI.getOperand(1).setReg(Fc2);
+
+ return BB;
+}
+// Lowers BRCOND on an FP comparison to MipsISD::FPBrcond; else returns Op as is.
+SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ // The first operand is the chain, the second is the condition, the third is
+ // the block to branch to if the condition is true.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Dest = Op.getOperand(2);
+ SDLoc DL(Op);
+ // This lowering is not expected to run on MIPS32r6/MIPS64r6 subtargets.
+ assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
+ SDValue CondRes = createFPCmp(DAG, Op.getOperand(1));
+
+ // Return if flag is not set by a floating point comparison.
+ if (CondRes.getOpcode() != MipsISD::FPCmp)
+ return Op;
+ // Operand 2 of the FPCmp node holds the condition code as a constant.
+ SDValue CCNode = CondRes.getOperand(2);
+ Mips::CondCode CC =
+ (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue();
+ unsigned Opc = invertFPCondCodeUser(CC) ? Mips::BRANCH_F : Mips::BRANCH_T;
+ SDValue BrCode = DAG.getConstant(Opc, DL, MVT::i32); // BRANCH_F or BRANCH_T
+ SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
+ return DAG.getNode(MipsISD::FPBrcond, DL, Op.getValueType(), Chain, BrCode,
+ FCC0, Dest, CondRes);
+}
+
+// Custom-lower a SELECT node. When the condition (operand 0) is produced by a
+// floating point comparison, the result is built with createCMovFP from the
+// comparison and the two value operands; otherwise Op is handed back untouched.
+SDValue MipsTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
+
+  SDValue FPCond = createFPCmp(DAG, Op.getOperand(0));
+  const bool IsFPCompare = FPCond.getOpcode() == MipsISD::FPCmp;
+  if (!IsFPCompare)
+    return Op; // Flag is not set by a floating point comparison.
+
+  SDValue TrueVal = Op.getOperand(1);
+  SDValue FalseVal = Op.getOperand(2);
+  return createCMovFP(DAG, FPCond, TrueVal, FalseVal, SDLoc(Op));
+}
+
+// Custom-lower a SETCC node whose operands are floating point: the comparison
+// is turned into an FPCmp node and the i32 result (1 or 0) is produced with
+// createCMovFP.
+SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
+
+  SDValue FPCond = createFPCmp(DAG, Op);
+  assert(FPCond.getOpcode() == MipsISD::FPCmp &&
+         "Floating point operand expected.");
+
+  SDLoc Loc(Op);
+  SDValue One = DAG.getConstant(1, Loc, MVT::i32);
+  SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
+  return createCMovFP(DAG, FPCond, One, Zero, Loc);
+}
+// Lowers GlobalAddress: %gp_rel or %hi/%lo if non-PIC and not N64, else via GOT.
+SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT Ty = Op.getValueType();
+ GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = N->getGlobal();
+
+ if (!isPositionIndependent() && !ABI.IsN64()) {
+ const MipsTargetObjectFile *TLOF =
+ static_cast<const MipsTargetObjectFile *>(
+ getTargetMachine().getObjFileLowering());
+ const GlobalObject *GO = GV->getBaseObject(); // may be null, checked below
+ if (GO && TLOF->IsGlobalInSmallSection(GO, getTargetMachine()))
+ // %gp_rel relocation
+ return getAddrGPRel(N, SDLoc(N), Ty, DAG);
+
+ // %hi/%lo relocation
+ return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ }
+
+ // Every other architecture would use shouldAssumeDSOLocal in here, but
+ // mips is special.
+ // * In PIC code mips requires got loads even for local statics!
+ // * To save on got entries, for local statics the got entry contains the
+ // page and an additional add instruction takes care of the low bits.
+ // * It is legal to access a hidden symbol with a non hidden undefined,
+ // so one cannot guarantee that all access to a hidden symbol will know
+ // it is hidden.
+ // * Mips linkers don't support creating a page and a full got entry for
+ // the same symbol.
+ // * Given all that, we have to use a full got entry for hidden symbols :-(
+ if (GV->hasLocalLinkage())
+ return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
+ // Non-local symbol with LargeGOT set: use the MO_GOT_HI16/MO_GOT_LO16 pair.
+ if (LargeGOT)
+ return getAddrGlobalLargeGOT(
+ N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16,
+ DAG.getEntryNode(),
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ // Otherwise one GOT reference: MO_GOT_DISP on N32/N64, MO_GOT on other ABIs.
+ return getAddrGlobal(
+ N, SDLoc(N), Ty, DAG,
+ (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP : MipsII::MO_GOT,
+ DAG.getEntryNode(), MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+}
+
+// Lower a BlockAddress node. Non-PIC code on ABIs other than N64 uses
+// getAddrNonPIC; every other configuration goes through getAddrLocal, which is
+// told whether the ABI is N32 or N64.
+SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  auto *BA = cast<BlockAddressSDNode>(Op);
+  EVT AddrTy = Op.getValueType();
+
+  const bool UseLocalAddr = isPositionIndependent() || ABI.IsN64();
+  if (UseLocalAddr) {
+    const bool IsN32OrN64 = ABI.IsN32() || ABI.IsN64();
+    return getAddrLocal(BA, SDLoc(BA), AddrTy, DAG, IsN32OrN64);
+  }
+
+  return getAddrNonPIC(BA, SDLoc(BA), AddrTy, DAG);
+}
+// Lowers a TLS global address using the TLS model the target machine selects.
+SDValue MipsTargetLowering::
+lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
+{
+ // If the relocation model is PIC, use the General Dynamic TLS Model or
+ // Local Dynamic TLS model, otherwise use the Initial Exec or
+ // Local Exec TLS Model.
+
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+ // Emulated TLS was handled above; the rest depends on the TLS model.
+ SDLoc DL(GA);
+ const GlobalValue *GV = GA->getGlobal();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+ if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+ // General Dynamic and Local Dynamic TLS Model.
+ unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM
+ : MipsII::MO_TLSGD;
+
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, Flag);
+ SDValue Argument = DAG.getNode(MipsISD::Wrapper, DL, PtrVT,
+ getGlobalReg(DAG, PtrVT), TGA);
+ unsigned PtrSize = PtrVT.getSizeInBits();
+ IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize);
+ // Build a C call __tls_get_addr(Argument), chained on the entry node.
+ SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT);
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+ Entry.Node = Argument;
+ Entry.Ty = PtrTy;
+ Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
+ .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+ SDValue Ret = CallResult.first; // value returned by the call
+ // General Dynamic: the call result is returned as the address.
+ if (model != TLSModel::LocalDynamic)
+ return Ret;
+ // Local Dynamic: add the symbol's MO_DTPREL_HI/LO parts to the call result.
+ SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+ MipsII::MO_DTPREL_HI);
+ SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+ SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+ MipsII::MO_DTPREL_LO);
+ SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Ret);
+ return DAG.getNode(ISD::ADD, DL, PtrVT, Add, Lo);
+ }
+
+ SDValue Offset; // offset that is added to the thread pointer below
+ if (model == TLSModel::InitialExec) {
+ // Initial Exec TLS Model
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+ MipsII::MO_GOTTPREL);
+ TGA = DAG.getNode(MipsISD::Wrapper, DL, PtrVT, getGlobalReg(DAG, PtrVT),
+ TGA);
+ Offset =
+ DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), TGA, MachinePointerInfo());
+ } else {
+ // Local Exec TLS Model
+ assert(model == TLSModel::LocalExec);
+ SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+ MipsII::MO_TPREL_HI);
+ SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+ MipsII::MO_TPREL_LO);
+ SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+ SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+ Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+ }
+ // Initial/Local Exec: result = ThreadPointer + Offset.
+ SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, DL, PtrVT);
+ return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadPointer, Offset);
+}
+
+// Lower a JumpTable node. Non-PIC code on ABIs other than N64 uses
+// getAddrNonPIC; every other configuration goes through getAddrLocal, which is
+// told whether the ABI is N32 or N64.
+SDValue MipsTargetLowering::lowerJumpTable(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  auto *JT = cast<JumpTableSDNode>(Op);
+  EVT AddrTy = Op.getValueType();
+
+  const bool UseLocalAddr = isPositionIndependent() || ABI.IsN64();
+  if (UseLocalAddr) {
+    const bool IsN32OrN64 = ABI.IsN32() || ABI.IsN64();
+    return getAddrLocal(JT, SDLoc(JT), AddrTy, DAG, IsN32OrN64);
+  }
+
+  return getAddrNonPIC(JT, SDLoc(JT), AddrTy, DAG);
+}
+// Lowers ConstantPool: %gp_rel or getAddrNonPIC if non-PIC and not N64, else getAddrLocal.
+SDValue MipsTargetLowering::
+lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+ ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+ EVT Ty = Op.getValueType();
+
+ if (!isPositionIndependent() && !ABI.IsN64()) {
+ const MipsTargetObjectFile *TLOF =
+ static_cast<const MipsTargetObjectFile *>(
+ getTargetMachine().getObjFileLowering());
+ // Constants the object-file lowering puts in the small section use %gp_rel.
+ if (TLOF->IsConstantInSmallSection(DAG.getDataLayout(), N->getConstVal(),
+ getTargetMachine()))
+ // %gp_rel relocation
+ return getAddrGPRel(N, SDLoc(N), Ty, DAG);
+
+ return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ }
+
+ return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
+}
+// Lowers VASTART to a store of the VarArgsFrameIndex address into the va_list.
+SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+
+ SDLoc DL(Op);
+ SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ getPointerTy(MF.getDataLayout()));
+
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ // Operand 0 is the chain, operand 1 the destination, operand 2 the SrcValue.
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+// Lowers VAARG: loads the va_list pointer, advances and stores it, loads the arg.
+SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ EVT VT = Node->getValueType(0);
+ SDValue Chain = Node->getOperand(0);
+ SDValue VAListPtr = Node->getOperand(1);
+ unsigned Align = Node->getConstantOperandVal(3);
+ const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+ SDLoc DL(Node);
+ unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4;
+ // Load the current va_list pointer; VAList may be adjusted below.
+ SDValue VAListLoad = DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL, Chain,
+ VAListPtr, MachinePointerInfo(SV));
+ SDValue VAList = VAListLoad;
+
+ // Re-align the pointer if necessary.
+ // It should only ever be necessary for 64-bit types on O32 since the minimum
+ // argument alignment is the same as the maximum type alignment for N32/N64.
+ //
+ // FIXME: We currently align too often. The code generator doesn't notice
+ // when the pointer is still aligned from the last va_arg (or pair of
+ // va_args for the i64 on O32 case).
+ if (Align > getMinStackArgumentAlignment()) {
+ assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
+ // VAList = (VAList + Align - 1) & -Align
+ VAList = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
+ DAG.getConstant(Align - 1, DL, VAList.getValueType()));
+
+ VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
+ DAG.getConstant(-(int64_t)Align, DL,
+ VAList.getValueType()));
+ }
+
+ // Increment the pointer, VAList, to the next vaarg.
+ auto &TD = DAG.getDataLayout();
+ unsigned ArgSizeInBytes =
+ TD.getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext()));
+ SDValue Tmp3 = // VAList + argument size rounded up to a whole slot
+ DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
+ DAG.getConstant(alignTo(ArgSizeInBytes, ArgSlotSizeInBytes),
+ DL, VAList.getValueType()));
+ // Store the incremented VAList to the legalized pointer
+ Chain = DAG.getStore(VAListLoad.getValue(1), DL, Tmp3, VAListPtr,
+ MachinePointerInfo(SV));
+
+ // In big-endian mode we must adjust the pointer when the load size is smaller
+ // than the argument slot size. We must also reduce the known alignment to
+ // match. For example in the N64 ABI, we must add 4 bytes to the offset to get
+ // the correct half of the slot, and reduce the alignment from 8 (slot
+ // alignment) down to 4 (type alignment).
+ if (!Subtarget.isLittle() && ArgSizeInBytes < ArgSlotSizeInBytes) {
+ unsigned Adjustment = ArgSlotSizeInBytes - ArgSizeInBytes;
+ VAList = DAG.getNode(ISD::ADD, DL, VAListPtr.getValueType(), VAList,
+ DAG.getIntPtrConstant(Adjustment, DL));
+ }
+ // Load the actual argument out of the pointer VAList
+ return DAG.getLoad(VT, DL, Chain, VAList, MachinePointerInfo());
+}
+// FCOPYSIGN with 32-bit integer ops: X (operand 0) takes bit 31 of Y (operand 1).
+static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG,
+ bool HasExtractInsert) {
+ EVT TyX = Op.getOperand(0).getValueType();
+ EVT TyY = Op.getOperand(1).getValueType();
+ SDLoc DL(Op);
+ SDValue Const1 = DAG.getConstant(1, DL, MVT::i32);
+ SDValue Const31 = DAG.getConstant(31, DL, MVT::i32);
+ SDValue Res;
+
+ // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
+ // to i32.
+ SDValue X = (TyX == MVT::f32) ?
+ DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) :
+ DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0),
+ Const1);
+ SDValue Y = (TyY == MVT::f32) ?
+ DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(1)) :
+ DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(1),
+ Const1);
+
+ if (HasExtractInsert) {
+ // ext E, Y, 31, 1 ; extract bit31 of Y
+ // ins X, E, 31, 1 ; insert extracted bit at bit31 of X
+ SDValue E = DAG.getNode(MipsISD::Ext, DL, MVT::i32, Y, Const31, Const1);
+ Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32, E, Const31, Const1, X);
+ } else {
+ // sll SllX, X, 1
+ // srl SrlX, SllX, 1
+ // srl SrlY, Y, 31
+ // sll SllY, SrlY, 31
+ // or Or, SrlX, SllY
+ SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1);
+ SDValue SrlX = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1);
+ SDValue SrlY = DAG.getNode(ISD::SRL, DL, MVT::i32, Y, Const31);
+ SDValue SllY = DAG.getNode(ISD::SHL, DL, MVT::i32, SrlY, Const31);
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, SrlX, SllY);
+ }
+ // f32 result: bitcast Res back to the type of operand 0.
+ if (TyX == MVT::f32)
+ return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Res);
+ // Otherwise pair the unchanged low word of X with Res as the high word.
+ SDValue LowX = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+ Op.getOperand(0),
+ DAG.getConstant(0, DL, MVT::i32));
+ return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
+}
+// FCOPYSIGN via integer bitcasts of both operands; operand widths may differ.
+static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG,
+ bool HasExtractInsert) {
+ unsigned WidthX = Op.getOperand(0).getValueSizeInBits();
+ unsigned WidthY = Op.getOperand(1).getValueSizeInBits();
+ EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY);
+ SDLoc DL(Op);
+ SDValue Const1 = DAG.getConstant(1, DL, MVT::i32);
+
+ // Bitcast to integer nodes.
+ SDValue X = DAG.getNode(ISD::BITCAST, DL, TyX, Op.getOperand(0));
+ SDValue Y = DAG.getNode(ISD::BITCAST, DL, TyY, Op.getOperand(1));
+
+ if (HasExtractInsert) {
+ // ext E, Y, width(Y) - 1, 1 ; extract bit width(Y)-1 of Y
+ // ins X, E, width(X) - 1, 1 ; insert extracted bit at bit width(X)-1 of X
+ SDValue E = DAG.getNode(MipsISD::Ext, DL, TyY, Y,
+ DAG.getConstant(WidthY - 1, DL, MVT::i32), Const1);
+ // Bring the extracted sign bit to X's integer type before inserting it.
+ if (WidthX > WidthY)
+ E = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, E);
+ else if (WidthY > WidthX)
+ E = DAG.getNode(ISD::TRUNCATE, DL, TyX, E);
+
+ SDValue I = DAG.getNode(MipsISD::Ins, DL, TyX, E,
+ DAG.getConstant(WidthX - 1, DL, MVT::i32), Const1,
+ X);
+ return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), I);
+ }
+
+ // (d)sll SllX, X, 1
+ // (d)srl SrlX, SllX, 1
+ // (d)srl SrlY, Y, width(Y)-1
+ // (d)sll SllY, SrlY, width(X)-1
+ // or Or, SrlX, SllY
+ SDValue SllX = DAG.getNode(ISD::SHL, DL, TyX, X, Const1);
+ SDValue SrlX = DAG.getNode(ISD::SRL, DL, TyX, SllX, Const1);
+ SDValue SrlY = DAG.getNode(ISD::SRL, DL, TyY, Y,
+ DAG.getConstant(WidthY - 1, DL, MVT::i32));
+ // Bring the isolated sign bit of Y to X's integer type.
+ if (WidthX > WidthY)
+ SrlY = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, SrlY);
+ else if (WidthY > WidthX)
+ SrlY = DAG.getNode(ISD::TRUNCATE, DL, TyX, SrlY);
+
+ SDValue SllY = DAG.getNode(ISD::SHL, DL, TyX, SrlY,
+ DAG.getConstant(WidthX - 1, DL, MVT::i32));
+ SDValue Or = DAG.getNode(ISD::OR, DL, TyX, SrlX, SllY);
+ return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Or);
+}
+
+// Dispatch FCOPYSIGN lowering on the GPR width: subtargets with 64-bit GPRs use
+// lowerFCOPYSIGN64, all others lowerFCOPYSIGN32. Both helpers are told whether
+// the subtarget has the extract/insert instructions.
+SDValue
+MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  const bool HasExtIns = Subtarget.hasExtractInsert();
+  return Subtarget.isGP64bit() ? lowerFCOPYSIGN64(Op, DAG, HasExtIns)
+                               : lowerFCOPYSIGN32(Op, DAG, HasExtIns);
+}
+// Lowers FRAMEADDR (depth 0 only) to a copy from the FP / FP_64 register.
+SDValue MipsTargetLowering::
+lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+ // check the depth
+ assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+ "Frame address can only be determined for current frame.");
+ // Record on the frame info that the frame address is taken.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue FrameAddr = DAG.getCopyFromReg(
+ DAG.getEntryNode(), DL, ABI.IsN64() ? Mips::FP_64 : Mips::FP, VT);
+ return FrameAddr;
+}
+// Lowers RETURNADDR (depth 0 only) to a copy from RA / RA_64, added as a live-in.
+SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue(); // empty result when the helper returns true
+
+ // check the depth
+ assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+ "Return address can be determined only for current frame.");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MVT VT = Op.getSimpleValueType();
+ unsigned RA = ABI.IsN64() ? Mips::RA_64 : Mips::RA;
+ MFI.setReturnAddressIsTaken(true);
+
+ // Return RA, which contains the return address. Mark it an implicit live-in.
+ unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT));
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), Reg, VT);
+}
+
+// An EH_RETURN is the result of lowering llvm.eh.return which in turn is
+// generated from __builtin_eh_return (offset, handler)
+// The effect of this is to adjust the stack pointer by "offset"
+// and then branch to "handler".
+SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
+ const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ // Record on the function info that this function calls eh.return.
+ MipsFI->setCallsEhReturn();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Offset = Op.getOperand(1);
+ SDValue Handler = Op.getOperand(2);
+ SDLoc DL(Op);
+ EVT Ty = ABI.IsN64() ? MVT::i64 : MVT::i32;
+
+ // Store stack offset in V1, store jump target in V0. Glue CopyToReg and
+ // EH_RETURN nodes, so that instructions are emitted back-to-back.
+ unsigned OffsetReg = ABI.IsN64() ? Mips::V1_64 : Mips::V1;
+ unsigned AddrReg = ABI.IsN64() ? Mips::V0_64 : Mips::V0;
+ Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue());
+ Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
+ return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
+ DAG.getRegister(OffsetReg, Ty),
+ DAG.getRegister(AddrReg, getPointerTy(MF.getDataLayout())),
+ Chain.getValue(1)); // glue from the second CopyToReg
+}
+
+// Lower an ATOMIC_FENCE node to a MipsISD::Sync node chained on the fence's
+// first operand. Every fence currently uses sync type 0.
+SDValue MipsTargetLowering::lowerATOMIC_FENCE(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  // FIXME: Need pseudo-fence for 'singlethread' fences
+  // FIXME: Set SType for weaker fences where supported/appropriate.
+  const unsigned SyncType = 0;
+  SDLoc Loc(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue SyncTypeNode = DAG.getConstant(SyncType, Loc, MVT::i32);
+  return DAG.getNode(MipsISD::Sync, Loc, MVT::Other, Chain, SyncTypeNode);
+}
+// Lowers a left shift of the (Lo, Hi) pair by Shamt; returns the merged {Lo, Hi}.
+SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Subtarget.isGP64bit() ? MVT::i64 : MVT::i32;
+
+ SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+ SDValue Shamt = Op.getOperand(2);
+ // if shamt < (VT.bits):
+ // lo = (shl lo, shamt)
+ // hi = (or (shl hi, shamt) (srl (srl lo, 1), ~shamt))
+ // else:
+ // lo = 0
+ // hi = (shl lo, shamt[4:0])
+ SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+ DAG.getConstant(-1, DL, MVT::i32)); // ~shamt
+ SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo,
+ DAG.getConstant(1, DL, VT));
+ SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, Not);
+ SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt);
+ SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
+ SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
+ SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt, // shamt & VT.bits
+ DAG.getConstant(VT.getSizeInBits(), DL, MVT::i32));
+ Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getConstant(0, DL, VT), ShiftLeftLo);
+ Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftLeftLo, Or);
+ // Both halves are returned as one merged value in (Lo, Hi) order.
+ SDValue Ops[2] = {Lo, Hi};
+ return DAG.getMergeValues(Ops, DL);
+}
+// Lowers a right shift (SRA if IsSRA, else SRL) of the (Lo, Hi) pair by Shamt.
+SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
+ bool IsSRA) const {
+ SDLoc DL(Op);
+ SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+ SDValue Shamt = Op.getOperand(2);
+ MVT VT = Subtarget.isGP64bit() ? MVT::i64 : MVT::i32;
+
+ // if shamt < (VT.bits):
+ // lo = (or (shl (shl hi, 1), ~shamt) (srl lo, shamt))
+ // if isSRA:
+ // hi = (sra hi, shamt)
+ // else:
+ // hi = (srl hi, shamt)
+ // else:
+ // if isSRA:
+ // lo = (sra hi, shamt[4:0])
+ // hi = (sra hi, VT.bits - 1)
+ // else:
+ // lo = (srl hi, shamt[4:0])
+ // hi = 0
+ SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+ DAG.getConstant(-1, DL, MVT::i32)); // ~shamt
+ SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, VT, Hi,
+ DAG.getConstant(1, DL, VT));
+ SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, ShiftLeft1Hi, Not);
+ SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt);
+ SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
+ SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL,
+ DL, VT, Hi, Shamt);
+ SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt, // shamt & VT.bits
+ DAG.getConstant(VT.getSizeInBits(), DL, MVT::i32));
+ SDValue Ext = DAG.getNode(ISD::SRA, DL, VT, Hi, // Hi's sign bit replicated
+ DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
+ Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftRightHi, Or);
+ Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ IsSRA ? Ext : DAG.getConstant(0, DL, VT), ShiftRightHi);
+ // Both halves are returned as one merged value in (Lo, Hi) order.
+ SDValue Ops[2] = {Lo, Hi};
+ return DAG.getMergeValues(Ops, DL);
+}
+// Builds mem-intrinsic node Opc with operands {Chain, LD's base + Offset, Src}.
+static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
+ SDValue Chain, SDValue Src, unsigned Offset) {
+ SDValue Ptr = LD->getBasePtr();
+ EVT VT = LD->getValueType(0), MemVT = LD->getMemoryVT();
+ EVT BasePtrVT = Ptr.getValueType();
+ SDLoc DL(LD);
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other); // loaded value + chain
+ // Only emit the address add when the offset is non-zero.
+ if (Offset)
+ Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+ DAG.getConstant(Offset, DL, BasePtrVT));
+
+ SDValue Ops[] = { Chain, Ptr, Src };
+ return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT,
+ LD->getMemOperand()); // reuses LD's memory operand
+}
+
+// Expand an unaligned 32 or 64-bit integer load node.
+SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ EVT MemVT = LD->getMemoryVT();
+
+ if (Subtarget.systemSupportsUnalignedAccess())
+ return Op; // the load is returned unchanged
+
+ // Return if load is aligned or if MemVT is neither i32 nor i64.
+ if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
+ ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
+ return SDValue();
+
+ bool IsLittle = Subtarget.isLittle();
+ EVT VT = Op.getValueType();
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT);
+
+ assert((VT == MVT::i32) || (VT == MVT::i64));
+
+ // Expand (offsets shown are the little-endian ones; big-endian swaps them)
+ // (set dst, (i64 (load baseptr)))
+ // to
+ // (set tmp, (ldl (add baseptr, 7), undef))
+ // (set dst, (ldr baseptr, tmp))
+ if ((VT == MVT::i64) && (ExtType == ISD::NON_EXTLOAD)) {
+ SDValue LDL = createLoadLR(MipsISD::LDL, DAG, LD, Chain, Undef,
+ IsLittle ? 7 : 0);
+ return createLoadLR(MipsISD::LDR, DAG, LD, LDL.getValue(1), LDL,
+ IsLittle ? 0 : 7);
+ }
+ // All remaining cases load 32 bits with an LWL/LWR pair.
+ SDValue LWL = createLoadLR(MipsISD::LWL, DAG, LD, Chain, Undef,
+ IsLittle ? 3 : 0);
+ SDValue LWR = createLoadLR(MipsISD::LWR, DAG, LD, LWL.getValue(1), LWL,
+ IsLittle ? 0 : 3);
+
+ // Expand
+ // (set dst, (i32 (load baseptr))) or
+ // (set dst, (i64 (sextload baseptr))) or
+ // (set dst, (i64 (extload baseptr)))
+ // to
+ // (set tmp, (lwl (add baseptr, 3), undef))
+ // (set dst, (lwr baseptr, tmp))
+ if ((VT == MVT::i32) || (ExtType == ISD::SEXTLOAD) ||
+ (ExtType == ISD::EXTLOAD))
+ return LWR;
+
+ assert((VT == MVT::i64) && (ExtType == ISD::ZEXTLOAD));
+
+ // Expand
+ // (set dst, (i64 (zextload baseptr)))
+ // to
+ // (set tmp0, (lwl (add baseptr, 3), undef))
+ // (set tmp1, (lwr baseptr, tmp0))
+ // (set tmp2, (shl tmp1, 32))
+ // (set dst, (srl tmp2, 32))
+ SDLoc DL(LD);
+ SDValue Const32 = DAG.getConstant(32, DL, MVT::i32);
+ SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32);
+ SDValue Ops[] = { SRL, LWR.getValue(1) }; // zero-extended value, LWR's chain
+ return DAG.getMergeValues(Ops, DL);
+}
+// Builds mem-intrinsic node Opc with operands {Chain, SD's value, base + Offset}.
+static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
+ SDValue Chain, unsigned Offset) {
+ SDValue Ptr = SD->getBasePtr(), Value = SD->getValue();
+ EVT MemVT = SD->getMemoryVT(), BasePtrVT = Ptr.getValueType();
+ SDLoc DL(SD);
+ SDVTList VTList = DAG.getVTList(MVT::Other); // result is a chain only
+ // Only emit the address add when the offset is non-zero.
+ if (Offset)
+ Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+ DAG.getConstant(Offset, DL, BasePtrVT));
+
+ SDValue Ops[] = { Chain, Value, Ptr };
+ return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT,
+ SD->getMemOperand()); // reuses SD's memory operand
+}
+
+// Expand an unaligned 32 or 64-bit integer store node.
+static SDValue lowerUnalignedIntStore(StoreSDNode *SD, SelectionDAG &DAG,
+ bool IsLittle) {
+ SDValue Value = SD->getValue(), Chain = SD->getChain();
+ EVT VT = Value.getValueType();
+
+ // Expand (offsets shown are the little-endian ones; big-endian swaps them)
+ // (store val, baseptr) or
+ // (truncstore val, baseptr)
+ // to
+ // (swl val, (add baseptr, 3))
+ // (swr val, baseptr)
+ if ((VT == MVT::i32) || SD->isTruncatingStore()) {
+ SDValue SWL = createStoreLR(MipsISD::SWL, DAG, SD, Chain,
+ IsLittle ? 3 : 0);
+ return createStoreLR(MipsISD::SWR, DAG, SD, SWL, IsLittle ? 0 : 3);
+ }
+
+ assert(VT == MVT::i64);
+
+ // Expand
+ // (store val, baseptr)
+ // to
+ // (sdl val, (add baseptr, 7))
+ // (sdr val, baseptr)
+ SDValue SDL = createStoreLR(MipsISD::SDL, DAG, SD, Chain, IsLittle ? 7 : 0);
+ return createStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7);
+}
+
+// Lower (store (fp_to_sint $fp) $ptr) to (store (TruncIntFP $fp), $ptr).
+static SDValue lowerFP_TO_SINT_STORE(StoreSDNode *SD, SelectionDAG &DAG) {
+ SDValue Val = SD->getValue();
+ // Stores of anything other than an FP_TO_SINT result yield an empty SDValue.
+ if (Val.getOpcode() != ISD::FP_TO_SINT)
+ return SDValue();
+
+ EVT FPTy = EVT::getFloatingPointVT(Val.getValueSizeInBits()); // same width
+ SDValue Tr = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Val), FPTy,
+ Val.getOperand(0));
+ return DAG.getStore(SD->getChain(), SDLoc(SD), Tr, SD->getBasePtr(),
+ SD->getPointerInfo(), SD->getAlignment(),
+ SD->getMemOperand()->getFlags());
+}
+
+// Custom-lower a store. An under-aligned i32/i64 store on a subtarget without
+// unaligned access support is expanded by lowerUnalignedIntStore; every other
+// store is offered to lowerFP_TO_SINT_STORE.
+SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  auto *Store = cast<StoreSDNode>(Op);
+  EVT StoredVT = Store->getMemoryVT();
+
+  const bool NeedsUnalignedExpansion =
+      !Subtarget.systemSupportsUnalignedAccess() &&
+      (Store->getAlignment() < StoredVT.getSizeInBits() / 8) &&
+      ((StoredVT == MVT::i32) || (StoredVT == MVT::i64));
+
+  if (!NeedsUnalignedExpansion)
+    return lowerFP_TO_SINT_STORE(Store, DAG);
+
+  // Lower unaligned integer stores.
+  return lowerUnalignedIntStore(Store, DAG, Subtarget.isLittle());
+}
+// Lowers EH_DWARF_CFA to the frame index of a new fixed stack object at offset 0.
+SDValue MipsTargetLowering::lowerEH_DWARF_CFA(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ // Return a fixed StackObject with offset 0 which points to the old stack
+ // pointer.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ EVT ValTy = Op->getValueType(0);
+ int FI = MFI.CreateFixedObject(Op.getValueSizeInBits() / 8, 0, false);
+ return DAG.getFrameIndex(FI, ValTy); // object size is Op's size in bytes
+}
+
+// Lower FP_TO_SINT to a MipsISD::TruncIntFP node whose result is a floating
+// point type of the same bit width as Op, then bitcast that result to Op's
+// value type.
+SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  EVT SameWidthFPTy = EVT::getFloatingPointVT(Op.getValueSizeInBits());
+  SDValue FPSrc = Op.getOperand(0);
+  SDValue TruncResult =
+      DAG.getNode(MipsISD::TruncIntFP, SDLoc(Op), SameWidthFPTy, FPSrc);
+  EVT ResultVT = Op.getValueType();
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op), ResultVT, TruncResult);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Implement a generic logic using tblgen that can support this.
+// Mips O32 ABI rules:
+// ---
+// i32 - Passed in A0, A1, A2, A3 and stack
+// f32 - Only passed in f32 registers if no int reg has been used yet to hold
+// an argument. Otherwise, passed in A1, A2, A3 and stack.
+// f64 - Only passed in two aliased f32 registers if no int reg has been used
+// yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
+// not used, it must be shadowed. If only A3 is available, shadow it and
+// go to stack.
+//
+// For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
+//===----------------------------------------------------------------------===//
+// O32 assignment for one value: true for byval (skipped), false once a loc is added.
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State, ArrayRef<MCPhysReg> F64Regs) {
+ const MipsSubtarget &Subtarget = static_cast<const MipsSubtarget &>(
+ State.getMachineFunction().getSubtarget());
+
+ static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
+ static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 };
+
+ // Do not process byval args here.
+ if (ArgFlags.isByVal())
+ return true;
+
+ // Big-endian inreg args: i8/i16/i32 become i32 with an *ExtUpper LocInfo.
+ if (ArgFlags.isInReg() && !Subtarget.isLittle()) {
+ if (LocVT == MVT::i8 || LocVT == MVT::i16 || LocVT == MVT::i32) {
+ LocVT = MVT::i32;
+ if (ArgFlags.isSExt())
+ LocInfo = CCValAssign::SExtUpper;
+ else if (ArgFlags.isZExt())
+ LocInfo = CCValAssign::ZExtUpper;
+ else
+ LocInfo = CCValAssign::AExtUpper;
+ }
+ }
+
+ // Promote i8 and i16
+ if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+ LocVT = MVT::i32;
+ if (ArgFlags.isSExt())
+ LocInfo = CCValAssign::SExt;
+ else if (ArgFlags.isZExt())
+ LocInfo = CCValAssign::ZExt;
+ else
+ LocInfo = CCValAssign::AExt;
+ }
+
+ unsigned Reg; // stays 0 when no register could be allocated
+
+ // f32 and f64 are allocated in A0, A1, A2, A3 when either of the following
+ // is true: function is vararg, argument is 3rd or higher, there is previous
+ // argument which is not f32 or f64.
+ bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1 ||
+ State.getFirstUnallocated(F32Regs) != ValNo;
+ unsigned OrigAlign = ArgFlags.getOrigAlign();
+ bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8); // i32 piece, 8-byte align
+
+ if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
+ Reg = State.AllocateReg(IntRegs);
+ // If this is the first part of an i64 arg,
+ // the allocated register must be either A0 or A2.
+ if (isI64 && (Reg == Mips::A1 || Reg == Mips::A3))
+ Reg = State.AllocateReg(IntRegs);
+ LocVT = MVT::i32;
+ } else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
+ // Allocate int register and shadow next int register. If first
+ // available register is Mips::A1 or Mips::A3, shadow it too.
+ Reg = State.AllocateReg(IntRegs);
+ if (Reg == Mips::A1 || Reg == Mips::A3)
+ Reg = State.AllocateReg(IntRegs);
+ State.AllocateReg(IntRegs);
+ LocVT = MVT::i32;
+ } else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) {
+ // we are guaranteed to find an available float register
+ if (ValVT == MVT::f32) {
+ Reg = State.AllocateReg(F32Regs);
+ // Shadow int register
+ State.AllocateReg(IntRegs);
+ } else {
+ Reg = State.AllocateReg(F64Regs);
+ // Shadow int registers
+ unsigned Reg2 = State.AllocateReg(IntRegs);
+ if (Reg2 == Mips::A1 || Reg2 == Mips::A3)
+ State.AllocateReg(IntRegs);
+ State.AllocateReg(IntRegs);
+ }
+ } else
+ llvm_unreachable("Cannot handle this ValVT.");
+ // No register left: allocate a stack slot of ValVT's size, aligned to OrigAlign.
+ if (!Reg) {
+ unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() >> 3,
+ OrigAlign);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ } else
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+
+ return false;
+}
+
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ static const MCPhysReg F64Regs[] = { Mips::D6, Mips::D7 };
+
+ return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ static const MCPhysReg F64Regs[] = { Mips::D12_64, Mips::D14_64 };
+
+ return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State) LLVM_ATTRIBUTE_UNUSED;
+
+#include "MipsGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+// Return the O32 integer argument register that follows Reg (A1 for A0, A3 for A2).
+static unsigned getNextIntArgReg(unsigned Reg) {
+ assert((Reg == Mips::A0) || (Reg == Mips::A2));
+ return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
+}
+
+SDValue MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
+ SDValue Chain, SDValue Arg,
+ const SDLoc &DL, bool IsTailCall,
+ SelectionDAG &DAG) const {
+ if (!IsTailCall) {
+ SDValue PtrOff =
+ DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()), StackPtr,
+ DAG.getIntPtrConstant(Offset, DL));
+ return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo());
+ }
+
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ int FI = MFI.CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false);
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(),
+ /* Alignment = */ 0, MachineMemOperand::MOVolatile);
+}
+
+void MipsTargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+ std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+ bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+ bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+ SDValue Chain) const {
+ // Insert node "GP copy globalreg" before call to function.
+ //
+ // R_MIPS_CALL* operators (emitted when non-internal functions are called
+ // in PIC mode) allow symbols to be resolved via lazy binding.
+ // The lazy binding stub requires GP to point to the GOT.
+ // Note that we don't need GP to point to the GOT for indirect calls
+ // (when R_MIPS_CALL* is not used for the call) because Mips linker generates
+ // lazy binding stub for a function only when R_MIPS_CALL* are the only relocs
+ // used for the function (that is, Mips linker doesn't generate lazy binding
+ // stub for a function whose address is taken in the program).
+ if (IsPICCall && !InternalLinkage && IsCallReloc) {
+ unsigned GPReg = ABI.IsN64() ? Mips::GP_64 : Mips::GP;
+ EVT Ty = ABI.IsN64() ? MVT::i64 : MVT::i32;
+ RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty)));
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token
+ // chain and flag operands which copy the outgoing args into registers.
+ // The InFlag is necessary since all emitted instructions must be
+ // stuck together.
+ SDValue InFlag;
+
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = CLI.DAG.getCopyToReg(Chain, CLI.DL, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(CLI.DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *Mask =
+ TRI->getCallPreservedMask(CLI.DAG.getMachineFunction(), CLI.CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ if (Subtarget.inMips16HardFloat()) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+ llvm::StringRef Sym = G->getGlobal()->getName();
+ Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+ if (F && F->hasFnAttribute("__Mips16RetHelper")) {
+ Mask = MipsRegisterInfo::getMips16RetHelperMask();
+ }
+ }
+ }
+ Ops.push_back(CLI.DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+}
+
+/// LowerCall - function arguments are copied from virtual regs to
+/// (physical regs)/(stack frame); CALLSEQ_START and CALLSEQ_END are emitted.
+SDValue
+MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc DL = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetFrameLowering *TFL = Subtarget.getFrameLowering();
+ MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+ bool IsPIC = isPositionIndependent();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ MipsCCState CCInfo(
+ CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext(),
+ MipsCCState::getSpecialCallingConvForCallee(Callee.getNode(), Subtarget));
+
+ // Allocate the reserved argument area. It seems strange to do this from the
+ // caller side but removing it breaks the frame size calculation.
+ CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), Callee.getNode());
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NextStackOffset = CCInfo.getNextStackOffset();
+
+ // Check if it's really possible to do a tail call. Restrict it to functions
+ // that are part of this compilation unit.
+ bool InternalLinkage = false;
+ if (IsTailCall) {
+ IsTailCall = isEligibleForTailCallOptimization(
+ CCInfo, NextStackOffset, *MF.getInfo<MipsFunctionInfo>());
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ InternalLinkage = G->getGlobal()->hasInternalLinkage();
+ IsTailCall &= (InternalLinkage || G->getGlobal()->hasLocalLinkage() ||
+ G->getGlobal()->hasPrivateLinkage() ||
+ G->getGlobal()->hasHiddenVisibility() ||
+ G->getGlobal()->hasProtectedVisibility());
+ }
+ }
+ if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+
+ if (IsTailCall)
+ ++NumTailCalls;
+
+ // Chain is the output chain of the last Load/Store or CopyToReg node.
+ // ByValChain is the output chain of the last Memcpy node created for copying
+ // byval arguments to the stack.
+ unsigned StackAlignment = TFL->getStackAlignment();
+ NextStackOffset = alignTo(NextStackOffset, StackAlignment);
+ SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);
+
+ if (!IsTailCall)
+ Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
+
+ SDValue StackPtr =
+ DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP,
+ getPointerTy(DAG.getDataLayout()));
+
+ std::deque< std::pair<unsigned, SDValue> > RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ CCInfo.rewindByValRegsInfo();
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ SDValue Arg = OutVals[i];
+ CCValAssign &VA = ArgLocs[i];
+ MVT ValVT = VA.getValVT(), LocVT = VA.getLocVT();
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ bool UseUpperBits = false;
+
+ // ByVal Arg.
+ if (Flags.isByVal()) {
+ unsigned FirstByValReg, LastByValReg;
+ unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
+ CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
+
+ assert(Flags.getByValSize() &&
+ "ByVal args of size 0 should have been ignored by front-end.");
+ assert(ByValIdx < CCInfo.getInRegsParamsCount());
+ assert(!IsTailCall &&
+ "Do not tail-call optimize if there is a byval argument.");
+ passByValArg(Chain, DL, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
+ FirstByValReg, LastByValReg, Flags, Subtarget.isLittle(),
+ VA);
+ CCInfo.nextInRegsParam();
+ continue;
+ }
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ if (VA.isRegLoc()) {
+ if ((ValVT == MVT::f32 && LocVT == MVT::i32) ||
+ (ValVT == MVT::f64 && LocVT == MVT::i64) ||
+ (ValVT == MVT::i64 && LocVT == MVT::f64))
+ Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
+ else if (ValVT == MVT::f64 && LocVT == MVT::i32) {
+ SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+ Arg, DAG.getConstant(0, DL, MVT::i32));
+ SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+ Arg, DAG.getConstant(1, DL, MVT::i32));
+ if (!Subtarget.isLittle())
+ std::swap(Lo, Hi);
+ unsigned LocRegLo = VA.getLocReg();
+ unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
+ RegsToPass.push_back(std::make_pair(LocRegLo, Lo));
+ RegsToPass.push_back(std::make_pair(LocRegHigh, Hi));
+ continue;
+ }
+ }
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
+ break;
+ case CCValAssign::SExtUpper:
+ UseUpperBits = true;
+ LLVM_FALLTHROUGH;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, LocVT, Arg);
+ break;
+ case CCValAssign::ZExtUpper:
+ UseUpperBits = true;
+ LLVM_FALLTHROUGH;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, LocVT, Arg);
+ break;
+ case CCValAssign::AExtUpper:
+ UseUpperBits = true;
+ LLVM_FALLTHROUGH;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, LocVT, Arg);
+ break;
+ }
+
+ if (UseUpperBits) {
+ unsigned ValSizeInBits = Outs[i].ArgVT.getSizeInBits();
+ unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+ Arg = DAG.getNode(
+ ISD::SHL, DL, VA.getLocVT(), Arg,
+ DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
+ }
+
+ // Arguments that can be passed in a register must be kept in the
+ // RegsToPass vector
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ continue;
+ }
+
+ // Register can't get to this point...
+ assert(VA.isMemLoc());
+
+ // Emit an ISD::STORE which stores the
+ // parameter value to a stack location
+ MemOpChains.push_back(passArgOnStack(StackPtr, VA.getLocMemOffset(),
+ Chain, Arg, DL, IsTailCall, DAG));
+ }
+
+ // Transform all store nodes into one single node because all store
+ // nodes are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+ // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ bool IsPICCall = (ABI.IsN64() || IsPIC); // true if calls are translated to
+ // jalr $25
+ SDValue CalleeLo;
+ EVT Ty = Callee.getValueType();
+ bool GlobalOrExternal = false, IsCallReloc = false;
+
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ if (IsPICCall) {
+ const GlobalValue *Val = G->getGlobal();
+ InternalLinkage = Val->hasInternalLinkage();
+
+ if (InternalLinkage)
+ Callee = getAddrLocal(G, DL, Ty, DAG, ABI.IsN32() || ABI.IsN64());
+ else if (LargeGOT) {
+ Callee = getAddrGlobalLargeGOT(G, DL, Ty, DAG, MipsII::MO_CALL_HI16,
+ MipsII::MO_CALL_LO16, Chain,
+ FuncInfo->callPtrInfo(Val));
+ IsCallReloc = true;
+ } else {
+ Callee = getAddrGlobal(G, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+ FuncInfo->callPtrInfo(Val));
+ IsCallReloc = true;
+ }
+ } else
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL,
+ getPointerTy(DAG.getDataLayout()), 0,
+ MipsII::MO_NO_FLAG);
+ GlobalOrExternal = true;
+ }
+ else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const char *Sym = S->getSymbol();
+
+ if (!ABI.IsN64() && !IsPIC) // !N64 && static
+ Callee = DAG.getTargetExternalSymbol(
+ Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG);
+ else if (LargeGOT) {
+ Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16,
+ MipsII::MO_CALL_LO16, Chain,
+ FuncInfo->callPtrInfo(Sym));
+ IsCallReloc = true;
+ } else { // N64 || PIC
+ Callee = getAddrGlobal(S, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+ FuncInfo->callPtrInfo(Sym));
+ IsCallReloc = true;
+ }
+
+ GlobalOrExternal = true;
+ }
+
+ SmallVector<SDValue, 8> Ops(1, Chain);
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal, InternalLinkage,
+ IsCallReloc, CLI, Callee, Chain);
+
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, Ops);
+ }
+
+ Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, Ops);
+ SDValue InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
+ DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, CLI);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue MipsTargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ TargetLowering::CallLoweringInfo &CLI) const {
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, RVLocs[i].getLocReg(),
+ RVLocs[i].getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+
+ if (VA.isUpperBitsInLoc()) {
+ unsigned ValSizeInBits = Ins[i].ArgVT.getSizeInBits();
+ unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+ unsigned Shift =
+ VA.getLocInfo() == CCValAssign::ZExtUpper ? ISD::SRL : ISD::SRA;
+ Val = DAG.getNode(
+ Shift, DL, VA.getLocVT(), Val,
+ DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
+ }
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::AExt:
+ case CCValAssign::AExtUpper:
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::ZExt:
+ case CCValAssign::ZExtUpper:
+ Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::SExt:
+ case CCValAssign::SExtUpper:
+ Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ }
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+static SDValue UnpackFromArgumentSlot(SDValue Val, const CCValAssign &VA,
+ EVT ArgVT, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ MVT LocVT = VA.getLocVT();
+ EVT ValVT = VA.getValVT();
+
+ // Shift the value down out of the upper bits if necessary.
+ switch (VA.getLocInfo()) {
+ default:
+ break;
+ case CCValAssign::AExtUpper:
+ case CCValAssign::SExtUpper:
+ case CCValAssign::ZExtUpper: {
+ unsigned ValSizeInBits = ArgVT.getSizeInBits();
+ unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+ unsigned Opcode =
+ VA.getLocInfo() == CCValAssign::ZExtUpper ? ISD::SRL : ISD::SRA;
+ Val = DAG.getNode(
+ Opcode, DL, VA.getLocVT(), Val,
+ DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
+ break;
+ }
+ }
+
+ // If this is a value smaller than the argument slot size (32-bit for O32,
+ // 64-bit for N32/N64), it has been promoted in some way to the argument slot
+ // size. Extract the value and insert any appropriate assertions regarding
+ // sign/zero extension.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::AExtUpper:
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+ break;
+ case CCValAssign::SExtUpper:
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, DL, LocVT, Val, DAG.getValueType(ValVT));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+ break;
+ case CCValAssign::ZExtUpper:
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, DL, LocVT, Val, DAG.getValueType(ValVT));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
+ break;
+ }
+
+ return Val;
+}
+
+//===----------------------------------------------------------------------===//
+// Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+/// LowerFormalArguments - transform physical registers into virtual registers
+/// and generate load operations for arguments placed on the stack.
+SDValue MipsTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ MipsFI->setVarArgsFrameIndex(0);
+
+ // Accumulates the chains of stack-argument loads and any chains added by the byval/vararg helpers.
+ std::vector<SDValue> OutChains;
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+ const Function *Func = DAG.getMachineFunction().getFunction();
+ Function::const_arg_iterator FuncArg = Func->arg_begin();
+
+ if (Func->hasFnAttribute("interrupt") && !Func->arg_empty())
+ report_fatal_error(
+ "Functions with the interrupt attribute cannot have arguments!");
+
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FixedArg);
+ MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
+ CCInfo.getInRegsParamsCount() > 0);
+
+ unsigned CurArgIdx = 0;
+ CCInfo.rewindByValRegsInfo();
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (Ins[i].isOrigArg()) {
+ std::advance(FuncArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+ CurArgIdx = Ins[i].getOrigArgIndex();
+ }
+ EVT ValVT = VA.getValVT();
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ bool IsRegLoc = VA.isRegLoc();
+
+ if (Flags.isByVal()) {
+ assert(Ins[i].isOrigArg() && "Byval arguments cannot be implicit");
+ unsigned FirstByValReg, LastByValReg;
+ unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
+ CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
+
+ assert(Flags.getByValSize() &&
+ "ByVal args of size 0 should have been ignored by front-end.");
+ assert(ByValIdx < CCInfo.getInRegsParamsCount());
+ copyByValRegs(Chain, DL, OutChains, DAG, Flags, InVals, &*FuncArg,
+ FirstByValReg, LastByValReg, VA, CCInfo);
+ CCInfo.nextInRegsParam();
+ continue;
+ }
+
+ // Arguments passed in registers
+ if (IsRegLoc) {
+ MVT RegVT = VA.getLocVT();
+ unsigned ArgReg = VA.getLocReg();
+ const TargetRegisterClass *RC = getRegClassFor(RegVT);
+
+ // Transform the arguments stored in
+ // physical registers into virtual ones
+ unsigned Reg = addLiveIn(DAG.getMachineFunction(), ArgReg, RC);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+ ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
+
+ // Handle floating point arguments passed in integer registers and
+ // long double arguments passed in floating point registers.
+ if ((RegVT == MVT::i32 && ValVT == MVT::f32) ||
+ (RegVT == MVT::i64 && ValVT == MVT::f64) ||
+ (RegVT == MVT::f64 && ValVT == MVT::i64))
+ ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
+ else if (ABI.IsO32() && RegVT == MVT::i32 &&
+ ValVT == MVT::f64) {
+ unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
+ getNextIntArgReg(ArgReg), RC);
+ SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
+ if (!Subtarget.isLittle())
+ std::swap(ArgValue, ArgValue2);
+ ArgValue = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64,
+ ArgValue, ArgValue2);
+ }
+
+ InVals.push_back(ArgValue);
+ } else { // !VA.isRegLoc(): the argument is read from the stack
+ MVT LocVT = VA.getLocVT();
+
+ if (ABI.IsO32()) {
+ // We ought to be able to use LocVT directly but O32 sets it to i32
+ // when allocating floating point values to integer registers.
+ // This shouldn't influence how we load the value into registers unless
+ // we are targeting softfloat.
+ if (VA.getValVT().isFloatingPoint() && !Subtarget.useSoftFloat())
+ LocVT = VA.getValVT();
+ }
+
+ // sanity check
+ assert(VA.isMemLoc());
+
+ // The stack pointer offset is relative to the caller stack frame.
+ int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
+ VA.getLocMemOffset(), true);
+
+ // Create load nodes to retrieve arguments from the stack
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue ArgValue = DAG.getLoad(
+ LocVT, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ OutChains.push_back(ArgValue.getValue(1));
+
+ ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
+
+ InVals.push_back(ArgValue);
+ }
+ }
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ // The Mips ABIs for returning structs by value require that we copy
+ // the sret argument into $v0 for the return. Save the argument into
+ // a virtual register so that we can access it from the return points.
+ if (Ins[i].Flags.isSRet()) {
+ unsigned Reg = MipsFI->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(
+ getRegClassFor(ABI.IsN64() ? MVT::i64 : MVT::i32));
+ MipsFI->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+ break;
+ }
+ }
+
+ if (IsVarArg)
+ writeVarArgRegs(OutChains, Chain, DL, DAG, CCInfo);
+
+ // All pending chains are grouped in one node to allow the matching between
+ // the size of Ins and InVals. Stack-argument loads add chains too, not only vararg handling.
+ if (!OutChains.empty()) {
+ OutChains.push_back(Chain);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool
+MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_Mips);
+}
+
+bool
+MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
+ if (Subtarget.hasMips3() && Subtarget.useSoftFloat()) {
+ if (Type == MVT::i32)
+ return true;
+ }
+ return IsSigned;
+}
+
+SDValue
+MipsTargetLowering::LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+ const SDLoc &DL,
+ SelectionDAG &DAG) const {
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ MipsFI->setISR();
+
+ return DAG.getNode(MipsISD::ERet, DL, MVT::Other, RetOps);
+}
+
+SDValue
+MipsTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ // CCValAssign - represent the assignment of
+ // the return value to a location
+ SmallVector<CCValAssign, 16> RVLocs;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // CCState - Info about the registers and stack slot.
+ MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+
+ // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_Mips);
+
+ SDValue Flag;
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ SDValue Val = OutVals[i];
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ bool UseUpperBits = false;
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Val);
+ break;
+ case CCValAssign::AExtUpper:
+ UseUpperBits = true;
+ LLVM_FALLTHROUGH;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Val);
+ break;
+ case CCValAssign::ZExtUpper:
+ UseUpperBits = true;
+ LLVM_FALLTHROUGH;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Val);
+ break;
+ case CCValAssign::SExtUpper:
+ UseUpperBits = true;
+ LLVM_FALLTHROUGH;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Val);
+ break;
+ }
+
+ if (UseUpperBits) {
+ unsigned ValSizeInBits = Outs[i].ArgVT.getSizeInBits();
+ unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+ Val = DAG.getNode(
+ ISD::SHL, DL, VA.getLocVT(), Val,
+ DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
+ }
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
+
+ // Guarantee that all emitted copies are stuck together with flags.
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ // The Mips ABIs for returning structs by value require that we copy
+ // the sret argument into $v0 for the return. We saved the argument into
+ // a virtual register in the entry block, so now we copy the value out
+ // and into $v0.
+ if (MF.getFunction()->hasStructRetAttr()) {
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ unsigned Reg = MipsFI->getSRetReturnReg();
+
+ if (!Reg)
+ llvm_unreachable("sret virtual register not created in the entry block");
+ SDValue Val =
+ DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(DAG.getDataLayout()));
+ unsigned V0 = ABI.IsN64() ? Mips::V0_64 : Mips::V0;
+
+ Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(V0, getPointerTy(DAG.getDataLayout())));
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ // ISRs must use "eret".
+ if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt"))
+ return LowerInterruptReturn(RetOps, DL, DAG);
+
+ // Standard return on Mips is a "jr $ra"
+ return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+// Mips Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+MipsTargetLowering::ConstraintType
+MipsTargetLowering::getConstraintType(StringRef Constraint) const {
+ // Mips specific constraints
+ // GCC config/mips/constraints.md
+ //
+ // 'd' : An address register. Equivalent to r
+ // unless generating MIPS16 code.
+ // 'y' : Equivalent to r; retained for
+ // backwards compatibility.
+ // 'c' : A register suitable for use in an indirect
+ // jump. This will always be $25 for -mabicalls.
+ // 'l' : The lo register. 1 word storage.
+ // 'x' : The hilo register pair. Double word storage.
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default : break;
+ case 'd':
+ case 'y':
+ case 'f':
+ case 'c':
+ case 'l':
+ case 'x':
+ return C_RegisterClass;
+ case 'R':
+ return C_Memory;
+ }
+ }
+
+ if (Constraint == "ZC")
+ return C_Memory;
+
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Determine how well the operand described by \p info matches the single
+/// constraint letter that \p constraint points at. \p info must already have
+/// been set up with the operand type and the current alternative constraint
+/// selected.
+TargetLowering::ConstraintWeight
+MipsTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  Value *Operand = info.CallOperandVal;
+
+  // With no value there is nothing to match against, but the alternative is
+  // still allowed at the lowest weight.
+  if (!Operand)
+    return CW_Default;
+
+  Type *OperandTy = Operand->getType();
+
+  switch (*constraint) {
+  case 'd':
+  case 'y':
+    return OperandTy->isIntegerTy() ? CW_Register : CW_Invalid;
+
+  case 'f': { // FPU or MSA register
+    const bool IsMSAVector =
+        Subtarget.hasMSA() && OperandTy->isVectorTy() &&
+        cast<VectorType>(OperandTy)->getBitWidth() == 128;
+    return (IsMSAVector || OperandTy->isFloatTy()) ? CW_Register : CW_Invalid;
+  }
+
+  case 'c': // $25 for indirect jumps
+  case 'l': // lo register
+  case 'x': // hilo register pair
+    return OperandTy->isIntegerTy() ? CW_SpecificReg : CW_Invalid;
+
+  case 'I': // signed 16 bit immediate
+  case 'J': // integer zero
+  case 'K': // unsigned 16 bit immediate
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+  case 'O': // signed 15 bit immediate (+- 16383)
+  case 'P': // immediate in the range of 1 to 65535 (inclusive)
+    return isa<ConstantInt>(Operand) ? CW_Constant : CW_Invalid;
+
+  case 'R':
+    return CW_Memory;
+
+  default:
+    // Not a Mips-specific letter; defer to the generic weighting.
+    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+  }
+}
+
+/// Split a physical register constraint of the form "{<prefix><number>}" into
+/// its non-numeric part (\p Prefix) and its numeric part (\p Reg).
+///
+/// \param C      The constraint string, including the surrounding braces.
+/// \param Prefix Set to the characters between '{' and the first digit (or
+///               the closing brace when there is no digit).
+/// \param Reg    Receives the parsed decimal number; left untouched when the
+///               constraint has no numeric part.
+/// \returns a pair of flags. The first indicates whether parsing was
+/// successful; the second is true if the numeric part exists.
+static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix,
+                                              unsigned long long &Reg) {
+  // front() and back() require a non-empty string, so reject "" explicitly.
+  if (C.empty() || C.front() != '{' || C.back() != '}')
+    return std::make_pair(false, false);
+
+  // Search for the first numeric character between the braces. A plain range
+  // comparison is used instead of ::isdigit, whose behaviour is undefined for
+  // negative char values.
+  StringRef::const_iterator B = C.begin() + 1, E = C.end() - 1;
+  StringRef::const_iterator I =
+      std::find_if(B, E, [](char Ch) { return Ch >= '0' && Ch <= '9'; });
+
+  Prefix = StringRef(B, I - B);
+
+  // The second flag is set to false if no numeric characters were found.
+  if (I == E)
+    return std::make_pair(true, false);
+
+  // Parse the numeric characters. getAsUnsignedInteger returns true on
+  // failure, hence the negation.
+  return std::make_pair(!getAsUnsignedInteger(StringRef(I, E - I), 10, Reg),
+                        true);
+}
+
+/// Map an explicit physical register constraint ("{hi}", "{lo}",
+/// "{$msa...}", "{$fN}", "{$fccN}", "{$wN}" or "{$N}") to a register and its
+/// register class.
+///
+/// \param C  The constraint string, including the surrounding braces.
+/// \param VT The operand type; MVT::Other selects a default type per prefix.
+/// \returns the (register, register class) pair, or (0, nullptr) when \p C is
+/// not recognised or names a register that does not exist. Malformed names
+/// are reported this way rather than asserted on, because the string comes
+/// straight from the inline assembly being compiled.
+std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
+parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
+  const TargetRegisterInfo *TRI =
+      Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC;
+  StringRef Prefix;
+  unsigned long long Reg;
+
+  std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg);
+
+  if (!R.first)
+    return std::make_pair(0U, nullptr);
+
+  if ((Prefix == "hi" || Prefix == "lo")) { // Parse hi/lo.
+    // No numeric characters follow "hi" or "lo".
+    if (R.second)
+      return std::make_pair(0U, nullptr);
+
+    RC = TRI->getRegClass(Prefix == "hi" ?
+                          Mips::HI32RegClassID : Mips::LO32RegClassID);
+    return std::make_pair(*(RC->begin()), RC);
+  } else if (Prefix.startswith("$msa")) {
+    // Parse $msa(ir|csr|access|save|modify|request|map|unmap)
+
+    // No numeric characters follow the name.
+    if (R.second)
+      return std::make_pair(0U, nullptr);
+
+    Reg = StringSwitch<unsigned long long>(Prefix)
+              .Case("$msair", Mips::MSAIR)
+              .Case("$msacsr", Mips::MSACSR)
+              .Case("$msaaccess", Mips::MSAAccess)
+              .Case("$msasave", Mips::MSASave)
+              .Case("$msamodify", Mips::MSAModify)
+              .Case("$msarequest", Mips::MSARequest)
+              .Case("$msamap", Mips::MSAMap)
+              .Case("$msaunmap", Mips::MSAUnmap)
+              .Default(0);
+
+    if (!Reg)
+      return std::make_pair(0U, nullptr);
+
+    RC = TRI->getRegClass(Mips::MSACtrlRegClassID);
+    return std::make_pair(Reg, RC);
+  }
+
+  // Every remaining form requires a register number.
+  if (!R.second)
+    return std::make_pair(0U, nullptr);
+
+  if (Prefix == "$f") { // Parse $f0-$f31.
+    // If the size of FP registers is 64-bit or Reg is an even number, select
+    // the 64-bit register class. Otherwise, select the 32-bit register class.
+    if (VT == MVT::Other)
+      VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32;
+
+    RC = getRegClassFor(VT);
+
+    if (RC == &Mips::AFGR64RegClass) {
+      // AFGR64 registers are indexed by half the FPU register number, so an
+      // odd number has no counterpart in this class.
+      if (Reg % 2 != 0)
+        return std::make_pair(0U, nullptr);
+      Reg >>= 1;
+    }
+  } else if (Prefix == "$fcc") { // Parse $fcc0-$fcc7.
+    RC = TRI->getRegClass(Mips::FCCRegClassID);
+  } else if (Prefix == "$w") { // Parse $w0-$w31.
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::v16i8 : VT);
+  } else if (Prefix == "$") { // Parse $0-$31.
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::i32 : VT);
+  } else {
+    // Unknown prefix followed by digits (e.g. "$t9"). Treating it as a plain
+    // "$N" register would silently pick the wrong register, so report
+    // failure instead.
+    return std::make_pair(0U, nullptr);
+  }
+
+  // Reject register numbers that lie outside the selected class instead of
+  // reading past the end of its register list.
+  if (Reg >= RC->getNumRegs())
+    return std::make_pair(0U, nullptr);
+
+  return std::make_pair(*(RC->begin() + Reg), RC);
+}
+
+/// Given a register class constraint, like 'r', if this corresponds directly
+/// to an LLVM register class, return a register of 0 and the register class
+/// pointer.
+///
+/// Single-letter Mips constraints are resolved here. Anything else is first
+/// tried as an explicit physical register via parseRegForInlineAsmConstraint
+/// and then handed to the generic TargetLowering implementation. A result of
+/// (0, nullptr) from the single-letter cases makes the caller report an
+/// error.
+std::pair<unsigned, const TargetRegisterClass *>
+MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                 StringRef Constraint,
+                                                 MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
+    case 'y': // Same as 'r'. Exists for compatibility.
+    case 'r':
+      if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+        if (Subtarget.inMips16Mode())
+          return std::make_pair(0U, &Mips::CPU16RegsRegClass);
+        return std::make_pair(0U, &Mips::GPR32RegClass);
+      }
+      if (VT == MVT::i64 && !Subtarget.isGP64bit())
+        return std::make_pair(0U, &Mips::GPR32RegClass);
+      if (VT == MVT::i64 && Subtarget.isGP64bit())
+        return std::make_pair(0U, &Mips::GPR64RegClass);
+      // This will generate an error message
+      return std::make_pair(0U, nullptr);
+    case 'f': // FPU or MSA register
+      if (VT == MVT::v16i8)
+        return std::make_pair(0U, &Mips::MSA128BRegClass);
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
+        return std::make_pair(0U, &Mips::MSA128HRegClass);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return std::make_pair(0U, &Mips::MSA128WRegClass);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return std::make_pair(0U, &Mips::MSA128DRegClass);
+      else if (VT == MVT::f32)
+        return std::make_pair(0U, &Mips::FGR32RegClass);
+      else if ((VT == MVT::f64) && (!Subtarget.isSingleFloat())) {
+        if (Subtarget.isFP64bit())
+          return std::make_pair(0U, &Mips::FGR64RegClass);
+        return std::make_pair(0U, &Mips::AFGR64RegClass);
+      }
+      // Unsupported type: fall out of the switch to the handling below.
+      break;
+    case 'c': // register suitable for indirect jump
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::T9, &Mips::GPR32RegClass);
+      if (VT == MVT::i64)
+        return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass);
+      // Any other type used to trip an assertion (or, without assertions,
+      // silently select the 64-bit register). Report an error instead.
+      return std::make_pair(0U, nullptr);
+    case 'l': // lo register
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass);
+      return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass);
+    case 'x': // hilo register pair
+      // Fixme: Not triggering the use of both hi and low
+      // This will generate an error message
+      return std::make_pair(0U, nullptr);
+    }
+  }
+
+  std::pair<unsigned, const TargetRegisterClass *> R;
+  R = parseRegForInlineAsmConstraint(Constraint, VT);
+
+  if (R.second)
+    return R;
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+///
+/// The Mips immediate constraints ('I', 'J', 'K', 'L', 'N', 'O', 'P') are
+/// handled here: a constant operand inside the permitted range is appended to
+/// \p Ops as a target constant, anything else is silently dropped so that the
+/// parent routine reports the error. All other single-letter constraints are
+/// forwarded to the generic implementation.
+void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                      std::string &Constraint,
+                                                      std::vector<SDValue>&Ops,
+                                                      SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1)
+    return;
+
+  // Appends an accepted immediate to Ops, typed like the original operand.
+  auto EmitImm = [&](uint64_t Imm) {
+    Ops.push_back(DAG.getTargetConstant(Imm, DL, Op.getValueType()));
+  };
+
+  switch (Constraint[0]) {
+  case 'I': // Signed 16 bit constant
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
+      int64_t Imm = CN->getSExtValue();
+      if (isInt<16>(Imm))
+        EmitImm(Imm);
+    }
+    return;
+
+  case 'J': // integer zero
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
+      int64_t Imm = CN->getZExtValue();
+      if (Imm == 0)
+        EmitImm(0);
+    }
+    return;
+
+  case 'K': // unsigned 16 bit immediate
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
+      uint64_t Imm = (uint64_t)CN->getZExtValue();
+      if (isUInt<16>(Imm))
+        EmitImm(Imm);
+    }
+    return;
+
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
+      int64_t Imm = CN->getSExtValue();
+      if (isInt<32>(Imm) && (Imm & 0xffff) == 0)
+        EmitImm(Imm);
+    }
+    return;
+
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
+      int64_t Imm = CN->getSExtValue();
+      if (Imm >= -65535 && Imm <= -1)
+        EmitImm(Imm);
+    }
+    return;
+
+  case 'O': // signed 15 bit immediate
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
+      int64_t Imm = CN->getSExtValue();
+      if (isInt<15>(Imm))
+        EmitImm(Imm);
+    }
+    return;
+
+  case 'P': // immediate in the range of 1 to 65535 (inclusive)
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
+      int64_t Imm = CN->getSExtValue();
+      if (Imm >= 1 && Imm <= 65535)
+        EmitImm(Imm);
+    }
+    return;
+
+  default:
+    // Not a Mips immediate constraint: use the generic implementation.
+    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+    return;
+  }
+}
+
+/// Return true if the addressing mode \p AM is legal: no global base, and at
+/// most one register plus an immediate.
+bool MipsTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                               const AddrMode &AM, Type *Ty,
+                                               unsigned AS) const {
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // Scale 0 is "r+i" or just "i", depending on HasBaseReg.
+  if (AM.Scale == 0)
+    return true;
+
+  // Scale 1 is accepted only without a base register ("r+i"); "r+r" and
+  // "r+r+i" are disallowed, as is every other scale.
+  return AM.Scale == 1 && !AM.HasBaseReg;
+}
+
+/// Offsets are never folded into global addresses: the Mips target isn't yet
+/// aware of offsets.
+bool MipsTargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode *GA) const {
+  return false;
+}
+
+/// Pick the integer type used for memory operations: i64 when the subtarget
+/// reports Mips64, i32 otherwise. None of the parameters are consulted.
+EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                            unsigned SrcAlign,
+                                            bool IsMemset, bool ZeroMemset,
+                                            bool MemcpyStrSrc,
+                                            MachineFunction &MF) const {
+  return Subtarget.hasMips64() ? MVT::i64 : MVT::i32;
+}
+
+/// Only positive zero of type f32 or f64 is a legal floating point immediate.
+bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  const bool IsF32OrF64 = (VT == MVT::f32) || (VT == MVT::f64);
+  return IsF32OrF64 && !Imm.isNegZero() && Imm.isZero();
+}
+
+/// Use GP-relative 64-bit block addresses for jump tables under the N64 ABI;
+/// every other ABI keeps the generic encoding.
+unsigned MipsTargetLowering::getJumpTableEncoding() const {
+  if (!ABI.IsN64())
+    return TargetLowering::getJumpTableEncoding();
+
+  return MachineJumpTableInfo::EK_GPRel64BlockAddress;
+}
+
+/// Forward the soft-float setting of the subtarget.
+bool MipsTargetLowering::useSoftFloat() const {
+  const bool SoftFloat = Subtarget.useSoftFloat();
+  return SoftFloat;
+}
+
+// Set up a byval argument on the receiving side: create a fixed frame object
+// for it, push that object's frame index onto InVals, and store each argument
+// register in [FirstReg, LastReg) into the object. The registers are added as
+// live-ins and the generated stores are appended to OutChains.
+void MipsTargetLowering::copyByValRegs(
+ SDValue Chain, const SDLoc &DL, std::vector<SDValue> &OutChains,
+ SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
+ SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
+ unsigned FirstReg, unsigned LastReg, const CCValAssign &VA,
+ MipsCCState &State) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned GPRSizeInBytes = Subtarget.getGPRSizeInBytes();
+ unsigned NumRegs = LastReg - FirstReg;
+ unsigned RegAreaSize = NumRegs * GPRSizeInBytes;
+ // The object must hold both the byval data and every register stored below.
+ unsigned FrameObjSize = std::max(Flags.getByValSize(), RegAreaSize);
+ int FrameObjOffset;
+ ArrayRef<MCPhysReg> ByValArgRegs = ABI.GetByValArgRegs();
+
+ // With registers involved, the offset is derived from the callee-allocated
+ // argument area size and the position of FirstReg among the byval argument
+ // registers. Otherwise the stack offset assigned in VA is used.
+ if (RegAreaSize)
+ FrameObjOffset =
+ (int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
+ (int)((ByValArgRegs.size() - FirstReg) * GPRSizeInBytes);
+ else
+ FrameObjOffset = VA.getLocMemOffset();
+
+ // Create frame object.
+ EVT PtrTy = getPointerTy(DAG.getDataLayout());
+ int FI = MFI.CreateFixedObject(FrameObjSize, FrameObjOffset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+ InVals.push_back(FIN);
+
+ // Nothing was passed in registers, so there is nothing to store.
+ if (!NumRegs)
+ return;
+
+ // Copy arg registers.
+ MVT RegTy = MVT::getIntegerVT(GPRSizeInBytes * 8);
+ const TargetRegisterClass *RC = getRegClassFor(RegTy);
+
+ // Store register I at byte offset I * GPRSizeInBytes within the object.
+ for (unsigned I = 0; I < NumRegs; ++I) {
+ unsigned ArgReg = ByValArgRegs[FirstReg + I];
+ unsigned VReg = addLiveIn(MF, ArgReg, RC);
+ unsigned Offset = I * GPRSizeInBytes;
+ SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
+ DAG.getConstant(Offset, DL, PtrTy));
+ SDValue Store = DAG.getStore(Chain, DL, DAG.getRegister(VReg, RegTy),
+ StorePtr, MachinePointerInfo(FuncArg, Offset));
+ OutChains.push_back(Store);
+ }
+}
+
+// Copy byVal arg to registers and stack.
+//
+// Whole register-sized words of the argument are loaded into the argument
+// registers in [FirstReg, LastReg) and queued on RegsToPass. If the last
+// register is only partially covered by the argument, it is assembled from
+// zero-extending sub-word loads that are shifted and OR'ed together. Whatever
+// does not fit in registers is copied to the stack with a memcpy. The chains
+// of all generated loads and of the memcpy are appended to MemOpChains.
+void MipsTargetLowering::passByValArg(
+ SDValue Chain, const SDLoc &DL,
+ std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
+ MachineFrameInfo &MFI, SelectionDAG &DAG, SDValue Arg, unsigned FirstReg,
+ unsigned LastReg, const ISD::ArgFlagsTy &Flags, bool isLittle,
+ const CCValAssign &VA) const {
+ unsigned ByValSizeInBytes = Flags.getByValSize();
+ unsigned OffsetInBytes = 0; // From beginning of struct
+ unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
+ // Loads never claim more alignment than the size of a register.
+ unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes);
+ EVT PtrTy = getPointerTy(DAG.getDataLayout()),
+ RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
+ unsigned NumRegs = LastReg - FirstReg;
+
+ if (NumRegs) {
+ ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs();
+ // True when the registers offer more room than the argument needs, i.e.
+ // the last register is only partially filled.
+ bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes);
+ unsigned I = 0;
+
+ // Copy words to registers.
+ // A partially filled last register is excluded here and handled below.
+ for (; I < NumRegs - LeftoverBytes; ++I, OffsetInBytes += RegSizeInBytes) {
+ SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+ DAG.getConstant(OffsetInBytes, DL, PtrTy));
+ SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
+ MachinePointerInfo(), Alignment);
+ MemOpChains.push_back(LoadVal.getValue(1));
+ unsigned ArgReg = ArgRegs[FirstReg + I];
+ RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
+ }
+
+ // Return if the struct has been fully copied.
+ if (ByValSizeInBytes == OffsetInBytes)
+ return;
+
+ // Copy the remainder of the byval argument with sub-word loads and shifts.
+ if (LeftoverBytes) {
+ SDValue Val;
+
+ // Try progressively smaller load sizes (half a register, then a quarter,
+ // and so on), skipping any size larger than what is left to copy.
+ for (unsigned LoadSizeInBytes = RegSizeInBytes / 2, TotalBytesLoaded = 0;
+ OffsetInBytes < ByValSizeInBytes; LoadSizeInBytes /= 2) {
+ unsigned RemainingSizeInBytes = ByValSizeInBytes - OffsetInBytes;
+
+ if (RemainingSizeInBytes < LoadSizeInBytes)
+ continue;
+
+ // Load subword.
+ SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+ DAG.getConstant(OffsetInBytes, DL,
+ PtrTy));
+ SDValue LoadVal = DAG.getExtLoad(
+ ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
+ MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment);
+ MemOpChains.push_back(LoadVal.getValue(1));
+
+ // Shift the loaded value.
+ // Little endian places the piece above the bytes already loaded; big
+ // endian places it below the top of the register instead.
+ unsigned Shamt;
+
+ if (isLittle)
+ Shamt = TotalBytesLoaded * 8;
+ else
+ Shamt = (RegSizeInBytes - (TotalBytesLoaded + LoadSizeInBytes)) * 8;
+
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal,
+ DAG.getConstant(Shamt, DL, MVT::i32));
+
+ // Merge this piece with the pieces gathered so far.
+ if (Val.getNode())
+ Val = DAG.getNode(ISD::OR, DL, RegTy, Val, Shift);
+ else
+ Val = Shift;
+
+ OffsetInBytes += LoadSizeInBytes;
+ TotalBytesLoaded += LoadSizeInBytes;
+ Alignment = std::min(Alignment, LoadSizeInBytes);
+ }
+
+ unsigned ArgReg = ArgRegs[FirstReg + I];
+ RegsToPass.push_back(std::make_pair(ArgReg, Val));
+ return;
+ }
+ }
+
+ // Copy the remainder of the byval arg to the stack with memcpy.
+ // The destination is StackPtr plus the stack offset assigned in VA.
+ unsigned MemCpySize = ByValSizeInBytes - OffsetInBytes;
+ SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+ DAG.getConstant(OffsetInBytes, DL, PtrTy));
+ SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
+ DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+ Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
+ DAG.getConstant(MemCpySize, DL, PtrTy),
+ Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
+ /*isTailCall=*/false,
+ MachinePointerInfo(), MachinePointerInfo());
+ MemOpChains.push_back(Chain);
+}
+
+// Spill the variable-argument registers that were not consumed by named
+// arguments to fixed stack objects, and record the frame index of the first
+// variable argument in MipsFunctionInfo. The generated stores are appended to
+// OutChains.
+void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
+ SDValue Chain, const SDLoc &DL,
+ SelectionDAG &DAG,
+ CCState &State) const {
+ ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs();
+ // Index of the first vararg register not yet allocated to an argument.
+ unsigned Idx = State.getFirstUnallocated(ArgRegs);
+ unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
+ MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
+ const TargetRegisterClass *RC = getRegClassFor(RegTy);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ // Offset of the first variable argument from stack pointer.
+ int VaArgOffset;
+
+ // If every register is already allocated, the first variable argument sits
+ // at the next (register-size aligned) stack offset. Otherwise the offset is
+ // derived from the callee-allocated argument area size and the number of
+ // registers still unallocated.
+ if (ArgRegs.size() == Idx)
+ VaArgOffset = alignTo(State.getNextStackOffset(), RegSizeInBytes);
+ else {
+ VaArgOffset =
+ (int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
+ (int)(RegSizeInBytes * (ArgRegs.size() - Idx));
+ }
+
+ // Record the frame index of the first variable argument
+ // which is a value necessary to VASTART.
+ int FI = MFI.CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
+ MipsFI->setVarArgsFrameIndex(FI);
+
+ // Copy the integer registers that have not been used for argument passing
+ // to the argument register save area. For O32, the save area is allocated
+ // in the caller's stack frame, while for N32/64, it is allocated in the
+ // callee's stack frame.
+ for (unsigned I = Idx; I < ArgRegs.size();
+ ++I, VaArgOffset += RegSizeInBytes) {
+ unsigned Reg = addLiveIn(MF, ArgRegs[I], RC);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
+ FI = MFI.CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
+ SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue Store =
+ DAG.getStore(Chain, DL, ArgValue, PtrOff, MachinePointerInfo());
+ // Clear the IR value attached to the store's memory operand.
+ cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(
+ (Value *)nullptr);
+ OutChains.push_back(Store);
+ }
+}
+
+/// Allocate argument registers for a byval argument and record the allocated
+/// range in \p State.
+///
+/// \param State Calling convention state that receives the allocations.
+/// \param Size  Size of the byval argument in bytes; must be non-zero. On
+///              return it holds the number of bytes that did not fit into
+///              registers (only modified for non-fast calling conventions).
+/// \param Align Requested alignment; capped at the stack alignment.
+///
+/// For the fast calling convention no registers are allocated and an empty
+/// range is recorded.
+void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
+                                     unsigned Align) const {
+  const TargetFrameLowering *TFL = Subtarget.getFrameLowering();
+
+  assert(Size && "Byval argument's size shouldn't be 0.");
+
+  Align = std::min(Align, TFL->getStackAlignment());
+
+  unsigned FirstReg = 0;
+  unsigned NumRegs = 0;
+
+  if (State->getCallingConv() != CallingConv::Fast) {
+    unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
+    ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs();
+    // FIXME: The O32 case actually describes no shadow registers.
+    const MCPhysReg *ShadowRegs =
+        ABI.IsO32() ? IntArgRegs.data() : Mips64DPRegs;
+
+    // We used to check the size as well but we can't do that anymore since
+    // CCState::HandleByVal() rounds up the size after calling this function.
+    assert(!(Align % RegSizeInBytes) &&
+           "Byval argument's alignment should be a multiple of "
+           "RegSizeInBytes.");
+
+    FirstReg = State->getFirstUnallocated(IntArgRegs);
+
+    // If Align > RegSizeInBytes, the first arg register must be even.
+    // FIXME: This condition happens to do the right thing but it's not the
+    //        right way to test it. We want to check that the stack frame offset
+    //        of the register is aligned.
+    if ((Align > RegSizeInBytes) && (FirstReg % 2)) {
+      // Burn the odd register so that the argument starts in an even one.
+      State->AllocateReg(IntArgRegs[FirstReg], ShadowRegs[FirstReg]);
+      ++FirstReg;
+    }
+
+    // Mark the registers allocated.
+    // Size is rounded up to whole registers and counted down as registers
+    // are consumed; what remains afterwards did not fit into registers.
+    Size = alignTo(Size, RegSizeInBytes);
+    for (unsigned I = FirstReg; Size > 0 && (I < IntArgRegs.size());
+         Size -= RegSizeInBytes, ++I, ++NumRegs)
+      State->AllocateReg(IntArgRegs[I], ShadowRegs[I]);
+  }
+
+  State->addInRegsParamInfo(FirstReg, FirstReg + NumRegs);
+}
+
+/// Expand a SELECT pseudo instruction into a diamond control-flow pattern for
+/// subtargets that have no conditional-move instructions.
+///
+/// \param MI      The pseudo instruction. Operand 0 is the result register,
+///                operand 1 the condition register, operands 2 and 3 the
+///                true and false values.
+/// \param BB      The block containing \p MI; it is split at \p MI.
+/// \param isFPCmp True when the branch tests only the condition register;
+///                otherwise the branch compares it against $zero.
+/// \param Opc     The branch opcode to emit.
+/// \returns the block that receives the PHI and the remainder of \p BB.
+MachineBasicBlock *MipsTargetLowering::emitPseudoSELECT(MachineInstr &MI,
+                                                        MachineBasicBlock *BB,
+                                                        bool isFPCmp,
+                                                        unsigned Opc) const {
+  assert(!(Subtarget.hasMips4() || Subtarget.hasMips32()) &&
+         "Subtarget already supports SELECT nodes with the use of "
+         "conditional-move instructions.");
+
+  const TargetInstrInfo *TII =
+      Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // To "insert" a SELECT instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, sinkMBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  if (isFPCmp) {
+    // bc1[tf] cc, sinkMBB
+    BuildMI(BB, DL, TII->get(Opc))
+        .addReg(MI.getOperand(1).getReg())
+        .addMBB(sinkMBB);
+  } else {
+    // bne rs, $0, sinkMBB
+    BuildMI(BB, DL, TII->get(Opc))
+        .addReg(MI.getOperand(1).getReg())
+        .addReg(Mips::ZERO)
+        .addMBB(sinkMBB);
+  }
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(2).getReg())
+      .addMBB(thisMBB)
+      .addReg(MI.getOperand(3).getReg())
+      .addMBB(copy0MBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  return BB;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+//
+// Resolve a named register. Named registers are expected to be fairly rare,
+// so for now only $28 is supported, since the linux kernel uses it. Any other
+// name is a fatal error.
+unsigned MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                               SelectionDAG &DAG) const {
+  // $28 maps to the 64-bit or 32-bit GP register depending on the GPR width.
+  const unsigned GPReg = Subtarget.isGP64bit() ? Mips::GP_64 : Mips::GP;
+
+  const unsigned Match =
+      StringSwitch<unsigned>(RegName).Case("$28", GPReg).Default(0);
+  if (Match)
+    return Match;
+
+  report_fatal_error("Invalid register name global variable");
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
new file mode 100644
index 000000000000..cddf0903ca6a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -0,0 +1,614 @@
+//===-- MipsISelLowering.h - Mips DAG Lowering Interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSISELLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSISELLOWERING_H
+
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetLowering.h"
+#include <deque>
+#include <string>
+
+namespace llvm {
+ namespace MipsISD {
+ enum NodeType : unsigned {
+ // Start the numbering from where ISD NodeType finishes.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ // Jump and link (call)
+ JmpLink,
+
+ // Tail call
+ TailCall,
+
+ // Get the Higher 16 bits from a 32-bit immediate
+ // No relation with Mips Hi register
+ Hi,
+
+ // Get the Lower 16 bits from a 32-bit immediate
+ // No relation with Mips Lo register
+ Lo,
+
+ // Handle gp_rel (small data/bss sections) relocation.
+ GPRel,
+
+ // Thread Pointer
+ ThreadPointer,
+
+ // Floating Point Branch Conditional
+ FPBrcond,
+
+ // Floating Point Compare
+ FPCmp,
+
+ // Floating Point Conditional Moves
+ CMovFP_T,
+ CMovFP_F,
+
+ // FP-to-int truncation node.
+ TruncIntFP,
+
+ // Return
+ Ret,
+
+ // Interrupt, exception, error trap Return
+ ERet,
+
+ // Software Exception Return.
+ EH_RETURN,
+
+ // Node used to extract integer from accumulator.
+ MFHI,
+ MFLO,
+
+ // Node used to insert integers to accumulator.
+ MTLOHI,
+
+ // Mult nodes.
+ Mult,
+ Multu,
+
+ // MAdd/Sub nodes
+ MAdd,
+ MAddu,
+ MSub,
+ MSubu,
+
+ // DivRem(u)
+ DivRem,
+ DivRemU,
+ DivRem16,
+ DivRemU16,
+
+ BuildPairF64,
+ ExtractElementF64,
+
+ Wrapper, // Pairs a base register (e.g. $gp) with a relocated symbol.
+
+ DynAlloc,
+
+ Sync,
+
+ Ext,
+ Ins,
+
+ // EXTR.W intrinsic nodes.
+ EXTP,
+ EXTPDP,
+ EXTR_S_H,
+ EXTR_W,
+ EXTR_R_W,
+ EXTR_RS_W,
+ SHILO,
+ MTHLIP,
+
+ // DPA.W intrinsic nodes.
+ MULSAQ_S_W_PH,
+ MAQ_S_W_PHL,
+ MAQ_S_W_PHR,
+ MAQ_SA_W_PHL,
+ MAQ_SA_W_PHR,
+ DPAU_H_QBL,
+ DPAU_H_QBR,
+ DPSU_H_QBL,
+ DPSU_H_QBR,
+ DPAQ_S_W_PH,
+ DPSQ_S_W_PH,
+ DPAQ_SA_L_W,
+ DPSQ_SA_L_W,
+ DPA_W_PH,
+ DPS_W_PH,
+ DPAQX_S_W_PH,
+ DPAQX_SA_W_PH,
+ DPAX_W_PH,
+ DPSX_W_PH,
+ DPSQX_S_W_PH,
+ DPSQX_SA_W_PH,
+ MULSA_W_PH,
+
+ MULT,
+ MULTU,
+ MADD_DSP,
+ MADDU_DSP,
+ MSUB_DSP,
+ MSUBU_DSP,
+
+ // DSP shift nodes.
+ SHLL_DSP,
+ SHRA_DSP,
+ SHRL_DSP,
+
+ // DSP setcc and select_cc nodes.
+ SETCC_DSP,
+ SELECT_CC_DSP,
+
+ // Vector comparisons.
+ // These take a vector and return a boolean.
+ VALL_ZERO,
+ VANY_ZERO,
+ VALL_NONZERO,
+ VANY_NONZERO,
+
+ // These take a vector and return a vector bitmask.
+ VCEQ,
+ VCLE_S,
+ VCLE_U,
+ VCLT_S,
+ VCLT_U,
+
+ // Element-wise vector max/min.
+ VSMAX,
+ VSMIN,
+ VUMAX,
+ VUMIN,
+
+ // Vector Shuffle with mask as an operand
+ VSHF, // Generic shuffle
+ SHF, // 4-element set shuffle.
+ ILVEV, // Interleave even elements
+ ILVOD, // Interleave odd elements
+ ILVL, // Interleave left elements
+ ILVR, // Interleave right elements
+ PCKEV, // Pack even elements
+ PCKOD, // Pack odd elements
+
+ // Vector Lane Copy
+ INSVE, // Copy element from one vector to another
+
+ // Combined (XOR (OR $a, $b), -1)
+ VNOR,
+
+ // Extended vector element extraction
+ VEXTRACT_SEXT_ELT,
+ VEXTRACT_ZEXT_ELT,
+
+ // Load/Store Left/Right nodes, numbered as target memory opcodes.
+ LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LWR,
+ SWL,
+ SWR,
+ LDL,
+ LDR,
+ SDL,
+ SDR
+ };
+ }
+
+ //===--------------------------------------------------------------------===//
+ // TargetLowering Implementation
+ //===--------------------------------------------------------------------===//
+ class MipsFunctionInfo;
+ class MipsSubtarget;
+ class MipsCCState;
+
+ class MipsTargetLowering : public TargetLowering {
+ bool isMicroMips;
+ public:
+ explicit MipsTargetLowering(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI);
+
+ static const MipsTargetLowering *create(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI);
+
+ /// createFastISel - This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
+
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
+
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+
+ ISD::NodeType getExtendForAtomicOps() const override {
+ return ISD::SIGN_EXTEND;
+ }
+
+ void LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// ReplaceNodeResults - Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ // DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ /// getSetCCResultType - get the ISD::SETCC result ValueType
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ void HandleByVal(CCState *, unsigned &, unsigned) const override;
+
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return ABI.IsN64() ? Mips::A0_64 : Mips::A0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return ABI.IsN64() ? Mips::A1_64 : Mips::A1;
+ }
+
+ /// Returns true if a cast between SrcAS and DestAS is a noop.
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Mips doesn't have any special address spaces so we just reserve
+ // the first 256 for software use (e.g. OpenCL) and treat casts
+ // between them as noops.
+ return SrcAS < 256 && DestAS < 256;
+ }
+
+ bool isJumpTableRelative() const override {
+ return getTargetMachine().isPositionIndependent() || ABI.IsN64();
+ }
+
+ protected:
+ SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
+
+ // This method creates the following nodes, which are necessary for
+ // computing a local symbol's address:
+ //
+ // (add (load (wrapper $gp, %got(sym)), %lo(sym))
+ template <class NodeTy>
+ SDValue getAddrLocal(NodeTy *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG,
+ bool IsN32OrN64) const {
+ unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
+ SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+ getTargetNode(N, Ty, DAG, GOTFlag));
+ SDValue Load =
+ DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ unsigned LoFlag = IsN32OrN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
+ SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty,
+ getTargetNode(N, Ty, DAG, LoFlag));
+ return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
+ }
+
+ // This method creates the following nodes, which are necessary for
+ // computing a global symbol's address:
+ //
+ // (load (wrapper $gp, %got(sym)))
+ template <class NodeTy>
+ SDValue getAddrGlobal(NodeTy *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag, SDValue Chain,
+ const MachinePointerInfo &PtrInfo) const {
+ SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+ getTargetNode(N, Ty, DAG, Flag));
+ return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo);
+ }
+
+ // This method creates the following nodes, which are necessary for
+ // computing a global symbol's address in large-GOT mode:
+ //
+ // (load (wrapper (add %hi(sym), $gp), %lo(sym)))
+ template <class NodeTy>
+ SDValue getAddrGlobalLargeGOT(NodeTy *N, const SDLoc &DL, EVT Ty,
+ SelectionDAG &DAG, unsigned HiFlag,
+ unsigned LoFlag, SDValue Chain,
+ const MachinePointerInfo &PtrInfo) const {
+ SDValue Hi =
+ DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(N, Ty, DAG, HiFlag));
+ Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
+ SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
+ getTargetNode(N, Ty, DAG, LoFlag));
+ return DAG.getLoad(Ty, DL, Chain, Wrapper, PtrInfo);
+ }
+
+ // This method creates the following nodes, which are necessary for
+ // computing a symbol's address in non-PIC mode:
+ //
+ // (add %hi(sym), %lo(sym))
+ template <class NodeTy>
+ SDValue getAddrNonPIC(NodeTy *N, const SDLoc &DL, EVT Ty,
+ SelectionDAG &DAG) const {
+ SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
+ SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
+ return DAG.getNode(ISD::ADD, DL, Ty,
+ DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
+ DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+ }
+
+ // This method creates the following nodes, which are necessary for
+ // computing a symbol's address using gp-relative addressing:
+ //
+ // (add $gp, %gp_rel(sym))
+ template <class NodeTy>
+ SDValue getAddrGPRel(NodeTy *N, const SDLoc &DL, EVT Ty,
+ SelectionDAG &DAG) const {
+ assert(Ty == MVT::i32);
+ SDValue GPRel = getTargetNode(N, Ty, DAG, MipsII::MO_GPREL);
+ return DAG.getNode(ISD::ADD, DL, Ty,
+ DAG.getRegister(Mips::GP, Ty),
+ DAG.getNode(MipsISD::GPRel, DL, DAG.getVTList(Ty),
+ GPRel));
+ }
+
+ /// This function fills Ops, which is the list of operands that will later
+ /// be used when a function call node is created. It also generates
+ /// copyToReg nodes to set up argument registers.
+ virtual void
+ getOpndList(SmallVectorImpl<SDValue> &Ops,
+ std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+ bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+ bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+ SDValue Chain) const;
+
+ protected:
+ SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
+ // Subtarget Info
+ const MipsSubtarget &Subtarget;
+ // Cache the ABI from the TargetMachine, we use it everywhere.
+ const MipsABIInfo &ABI;
+
+ private:
+ // Create a TargetGlobalAddress node.
+ SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+ // Create a TargetExternalSymbol node.
+ SDValue getTargetNode(ExternalSymbolSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+ // Create a TargetBlockAddress node.
+ SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+ // Create a TargetJumpTable node.
+ SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+ // Create a TargetConstantPool node.
+ SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+ // Lower Operand helpers
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ TargetLowering::CallLoweringInfo &CLI) const;
+
+ // Lower Operand specifics
+ SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFABS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+ SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
+ SDValue lowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
+ bool IsSRA) const;
+ SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+
+ /// isEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization.
+ virtual bool
+ isEligibleForTailCallOptimization(const CCState &CCInfo,
+ unsigned NextStackOffset,
+ const MipsFunctionInfo &FI) const = 0;
+
+ /// copyByValRegs - Copy argument registers which were used to pass a byval
+ /// argument to the stack. Create a stack frame object for the byval
+ /// argument.
+ void copyByValRegs(SDValue Chain, const SDLoc &DL,
+ std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+ const ISD::ArgFlagsTy &Flags,
+ SmallVectorImpl<SDValue> &InVals,
+ const Argument *FuncArg, unsigned FirstReg,
+ unsigned LastReg, const CCValAssign &VA,
+ MipsCCState &State) const;
+
+ /// passByValArg - Pass a byval argument in registers or on stack.
+ void passByValArg(SDValue Chain, const SDLoc &DL,
+ std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
+ MachineFrameInfo &MFI, SelectionDAG &DAG, SDValue Arg,
+ unsigned FirstReg, unsigned LastReg,
+ const ISD::ArgFlagsTy &Flags, bool isLittle,
+ const CCValAssign &VA) const;
+
+ /// writeVarArgRegs - Write variable function arguments passed in registers
+ /// to the stack. Also create a stack frame object for the first variable
+ /// argument.
+ void writeVarArgRegs(std::vector<SDValue> &OutChains, SDValue Chain,
+ const SDLoc &DL, SelectionDAG &DAG,
+ CCState &State) const;
+
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain,
+ SDValue Arg, const SDLoc &DL, bool IsTailCall,
+ SelectionDAG &DAG) const;
+
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+ const SDLoc &DL, SelectionDAG &DAG) const;
+
+ bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override;
+
+ // Inline asm support
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const override;
+
+ /// This function parses registers that appear in inline-asm constraints.
+ /// It returns pair (0, 0) on failure.
+ std::pair<unsigned, const TargetRegisterClass *>
+ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops. NOTE(review): the
+ /// former remark about a 'hasMemory' flag ('m' constraint present) is stale;
+ /// this signature takes no such parameter.
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "R")
+ return InlineAsm::Constraint_R;
+ else if (ConstraintCode == "ZC")
+ return InlineAsm::Constraint_ZC;
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ /// isFPImmLegal - Returns true if the target can instruction select the
+ /// specified FP immediate natively. If false, the legalizer will
+ /// materialize the FP immediate as a load from a constant pool.
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ unsigned getJumpTableEncoding() const override;
+ bool useSoftFloat() const override;
+
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return true;
+ }
+
+ /// Emit a sign-extension using sll/sra, seb, or seh appropriately.
+ MachineBasicBlock *emitSignExtendToI32InReg(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size, unsigned DstReg,
+ unsigned SrcRec) const;
+
+ MachineBasicBlock *emitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned Size, unsigned BinOpcode,
+ bool Nand = false) const;
+ MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size,
+ unsigned BinOpcode,
+ bool Nand = false) const;
+ MachineBasicBlock *emitAtomicCmpSwap(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size) const;
+ MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size) const;
+ MachineBasicBlock *emitSEL_D(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitPseudoSELECT(MachineInstr &MI, MachineBasicBlock *BB,
+ bool isFPCmp, unsigned Opc) const;
+ };
+
+ /// Create MipsTargetLowering objects.
+ const MipsTargetLowering *
+ createMips16TargetLowering(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI);
+ const MipsTargetLowering *
+ createMipsSETargetLowering(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI);
+
+ namespace Mips {
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+ }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
new file mode 100644
index 000000000000..ab7aa9dcdcae
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -0,0 +1,687 @@
+//===-- MipsInstrFPU.td - Mips FPU Instruction Information -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Mips FPU instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+// ------------------------
+// * 64bit fp:
+// - 32 64-bit registers (default mode)
+// - 16 even 32-bit registers (32-bit compatible mode) for
+// single and double access.
+// * 32bit fp:
+// - 16 even 32-bit registers - single and double (aliased)
+// - 32 32-bit registers (within single-only mode)
+//===----------------------------------------------------------------------===//
+
+// Floating Point Compare and Branch
+def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<2, OtherVT>]>;
+def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>,
+ SDTCisVT<2, i32>]>;
+def SDT_MipsCMovFP : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisVT<2, i32>,
+ SDTCisSameAs<1, 3>]>;
+def SDT_MipsTruncIntFP : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
+def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>]>;
+def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisVT<1, f64>,
+ SDTCisVT<2, i32>]>;
+
+def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>;
+def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>;
+def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>;
+def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
+ [SDNPHasChain, SDNPOptInGlue]>;
+def MipsTruncIntFP : SDNode<"MipsISD::TruncIntFP", SDT_MipsTruncIntFP>;
+def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
+def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64",
+ SDT_MipsExtractElementF64>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in
+ def condcode : Operand<i32>;
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+def IsFP64bit : Predicate<"Subtarget->isFP64bit()">,
+ AssemblerPredicate<"FeatureFP64Bit">;
+def NotFP64bit : Predicate<"!Subtarget->isFP64bit()">,
+ AssemblerPredicate<"!FeatureFP64Bit">;
+def IsSingleFloat : Predicate<"Subtarget->isSingleFloat()">,
+ AssemblerPredicate<"FeatureSingleFloat">;
+def IsNotSingleFloat : Predicate<"!Subtarget->isSingleFloat()">,
+ AssemblerPredicate<"!FeatureSingleFloat">;
+def IsNotSoftFloat : Predicate<"!Subtarget->useSoftFloat()">,
+ AssemblerPredicate<"!FeatureSoftFloat">;
+
+//===----------------------------------------------------------------------===//
+// Mips FGR size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class FGR_32 { list<Predicate> FGRPredicates = [NotFP64bit]; }
+class FGR_64 { list<Predicate> FGRPredicates = [IsFP64bit]; }
+class HARDFLOAT { list<Predicate> HardFloatPredicate = [IsNotSoftFloat]; }
+
+//===----------------------------------------------------------------------===//
+
+// FP immediate patterns.
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimm0neg : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//
+// A set of multiclasses is used to address the register usage.
+//
+// S32 - single precision in 16 32bit even fp registers
+// single precision in 32 32bit fp registers in SingleOnly mode
+// S64 - single precision in 32 64bit fp registers (In64BitMode)
+// D32 - double precision in 16 32bit even fp registers
+// D64 - double precision in 32 64bit fp registers (In64BitMode)
+//
+// NOTE: stale claim of S32/D32-only support; _D64 variants are defined below.
+//===----------------------------------------------------------------------===//
+class ADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin, bit IsComm,
+ SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs RC:$fd), (ins RC:$fs, RC:$ft),
+ !strconcat(opstr, "\t$fd, $fs, $ft"),
+ [(set RC:$fd, (OpNode RC:$fs, RC:$ft))], Itin, FrmFR, opstr>,
+ HARDFLOAT {
+ let isCommutable = IsComm;
+}
+
+multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
+ SDPatternOperator OpNode = null_frag> {
+ def _D32 : MMRel, ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>, FGR_32;
+ def _D64 : ADDS_FT<opstr, FGR64Opnd, Itin, IsComm, OpNode>, FGR_64 {
+ string DecoderNamespace = "Mips64";
+ }
+}
+
+class ABSS_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs DstRC:$fd), (ins SrcRC:$fs), !strconcat(opstr, "\t$fd, $fs"),
+ [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>,
+ HARDFLOAT,
+ NeverHasSideEffects;
+
+multiclass ABSS_M<string opstr, InstrItinClass Itin,
+ SDPatternOperator OpNode= null_frag> {
+ def _D32 : MMRel, ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
+ FGR_32;
+ def _D64 : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>, FGR_64 {
+ string DecoderNamespace = "Mips64";
+ }
+}
+
+multiclass ROUND_M<string opstr, InstrItinClass Itin> {
+ def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>, FGR_32;
+ def _D64 : StdMMR6Rel, ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 {
+ let DecoderNamespace = "Mips64";
+ }
+}
+
+class MFC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
+ [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>, HARDFLOAT;
+
+class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
+ [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>, HARDFLOAT;
+
+class MTC1_64_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin> :
+ InstSE<(outs DstRC:$fs), (ins DstRC:$fs_in, SrcRC:$rt),
+ !strconcat(opstr, "\t$rt, $fs"), [], Itin, FrmFR, opstr>, HARDFLOAT {
+ // $fs_in is part of a white lie to work around a widespread bug in the FPU
+ // implementation. See expandBuildPairF64 for details.
+ let Constraints = "$fs = $fs_in";
+}
+
+class LW_FT<string opstr, RegisterOperand RC, DAGOperand MO,
+ InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs RC:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr>,
+ HARDFLOAT {
+ let DecoderMethod = "DecodeFMem";
+ let mayLoad = 1;
+}
+
+class SW_FT<string opstr, RegisterOperand RC, DAGOperand MO,
+ InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs), (ins RC:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr>, HARDFLOAT {
+ let DecoderMethod = "DecodeFMem";
+ let mayStore = 1;
+}
+
+class MADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
+ !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
+ [(set RC:$fd, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr))], Itin,
+ FrmFR, opstr>, HARDFLOAT;
+
+class NMADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
+ !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
+ [(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))],
+ Itin, FrmFR, opstr>, HARDFLOAT;
+
+class LWXC1_FT<string opstr, RegisterOperand DRC,
+ InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs DRC:$fd), (ins PtrRC:$base, PtrRC:$index),
+ !strconcat(opstr, "\t$fd, ${index}(${base})"),
+ [(set DRC:$fd, (OpNode (add iPTR:$base, iPTR:$index)))], Itin,
+ FrmFI, opstr>, HARDFLOAT {
+ let AddedComplexity = 20;
+}
+
+class SWXC1_FT<string opstr, RegisterOperand DRC,
+ InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs), (ins DRC:$fs, PtrRC:$base, PtrRC:$index),
+ !strconcat(opstr, "\t$fs, ${index}(${base})"),
+ [(OpNode DRC:$fs, (add iPTR:$base, iPTR:$index))], Itin,
+ FrmFI, opstr>, HARDFLOAT {
+ let AddedComplexity = 20;
+}
+
+class BC1F_FT<string opstr, DAGOperand opnd, InstrItinClass Itin,
+ SDPatternOperator Op = null_frag, bit DelaySlot = 1> :
+ InstSE<(outs), (ins FCCRegsOpnd:$fcc, opnd:$offset),
+ !strconcat(opstr, "\t$fcc, $offset"),
+ [(MipsFPBrcond Op, FCCRegsOpnd:$fcc, bb:$offset)], Itin,
+ FrmFI, opstr>, HARDFLOAT {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = DelaySlot;
+ let Defs = [AT];
+}
+
+class CEQS_FT<string typestr, RegisterClass RC, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs), (ins RC:$fs, RC:$ft, condcode:$cond),
+ !strconcat("c.$cond.", typestr, "\t$fs, $ft"),
+ [(OpNode RC:$fs, RC:$ft, imm:$cond)], Itin, FrmFR,
+ !strconcat("c.$cond.", typestr)>, HARDFLOAT {
+ let Defs = [FCC0];
+ let isCodeGenOnly = 1;
+}
+
+class C_COND_FT<string CondStr, string Typestr, RegisterOperand RC,
+ InstrItinClass itin> :
+ InstSE<(outs), (ins RC:$fs, RC:$ft),
+ !strconcat("c.", CondStr, ".", Typestr, "\t$fs, $ft"), [], itin,
+ FrmFR>, HARDFLOAT;
+
+multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt,
+ InstrItinClass itin> {
+ def C_F_#NAME : C_COND_FT<"f", TypeStr, RC, itin>, C_COND_FM<fmt, 0>;
+ def C_UN_#NAME : C_COND_FT<"un", TypeStr, RC, itin>, C_COND_FM<fmt, 1>;
+ def C_EQ_#NAME : C_COND_FT<"eq", TypeStr, RC, itin>, C_COND_FM<fmt, 2>;
+ def C_UEQ_#NAME : C_COND_FT<"ueq", TypeStr, RC, itin>, C_COND_FM<fmt, 3>;
+ def C_OLT_#NAME : C_COND_FT<"olt", TypeStr, RC, itin>, C_COND_FM<fmt, 4>;
+ def C_ULT_#NAME : C_COND_FT<"ult", TypeStr, RC, itin>, C_COND_FM<fmt, 5>;
+ def C_OLE_#NAME : C_COND_FT<"ole", TypeStr, RC, itin>, C_COND_FM<fmt, 6>;
+ def C_ULE_#NAME : C_COND_FT<"ule", TypeStr, RC, itin>, C_COND_FM<fmt, 7>;
+ def C_SF_#NAME : C_COND_FT<"sf", TypeStr, RC, itin>, C_COND_FM<fmt, 8>;
+ def C_NGLE_#NAME : C_COND_FT<"ngle", TypeStr, RC, itin>, C_COND_FM<fmt, 9>;
+ def C_SEQ_#NAME : C_COND_FT<"seq", TypeStr, RC, itin>, C_COND_FM<fmt, 10>;
+ def C_NGL_#NAME : C_COND_FT<"ngl", TypeStr, RC, itin>, C_COND_FM<fmt, 11>;
+ def C_LT_#NAME : C_COND_FT<"lt", TypeStr, RC, itin>, C_COND_FM<fmt, 12>;
+ def C_NGE_#NAME : C_COND_FT<"nge", TypeStr, RC, itin>, C_COND_FM<fmt, 13>;
+ def C_LE_#NAME : C_COND_FT<"le", TypeStr, RC, itin>, C_COND_FM<fmt, 14>;
+ def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC, itin>, C_COND_FM<fmt, 15>;
+}
+
+defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>, ISA_MIPS1_NOT_32R6_64R6;
+defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
+ FGR_32;
+let DecoderNamespace = "Mips64" in
+defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
+ FGR_64;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+//===----------------------------------------------------------------------===//
+def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+ ABSS_FM<0xc, 16>, ISA_MIPS2;
+defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
+def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
+ ABSS_FM<0xd, 16>, ISA_MIPS2;
+def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+ ABSS_FM<0xe, 16>, ISA_MIPS2;
+def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
+ ABSS_FM<0xf, 16>, ISA_MIPS2;
+def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x24, 16>;
+
+defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
+defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
+defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
+defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def RECIP_S : MMRel, ABSS_FT<"recip.s", FGR32Opnd, FGR32Opnd, II_RECIP_S>,
+ ABSS_FM<0b010101, 0x10>, INSN_MIPS4_32R2;
+ def RECIP_D : MMRel, ABSS_FT<"recip.d", FGR64Opnd, FGR64Opnd, II_RECIP_D>,
+ ABSS_FM<0b010101, 0x11>, INSN_MIPS4_32R2;
+ def RSQRT_S : MMRel, ABSS_FT<"rsqrt.s", FGR32Opnd, FGR32Opnd, II_RSQRT_S>,
+ ABSS_FM<0b010110, 0x10>, INSN_MIPS4_32R2;
+ def RSQRT_D : MMRel, ABSS_FT<"rsqrt.d", FGR64Opnd, FGR64Opnd, II_RSQRT_D>,
+ ABSS_FM<0b010110, 0x11>, INSN_MIPS4_32R2;
+}
+let DecoderNamespace = "Mips64" in {
+ let AdditionalPredicates = [NotInMicroMips] in {
+ def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>,
+ ABSS_FM<0x8, 16>, FGR_64;
+ def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>,
+ ABSS_FM<0x8, 17>, FGR_64;
+ def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>,
+ ABSS_FM<0x9, 16>, FGR_64;
+ def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>,
+ ABSS_FM<0x9, 17>, FGR_64;
+ def CEIL_L_S : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, II_CEIL>,
+ ABSS_FM<0xa, 16>, FGR_64;
+ def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, II_CEIL>,
+ ABSS_FM<0xa, 17>, FGR_64;
+ def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, II_FLOOR>,
+ ABSS_FM<0xb, 16>, FGR_64;
+ def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>,
+ ABSS_FM<0xb, 17>, FGR_64;
+ }
+}
+
+def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x20, 20>;
+let AdditionalPredicates = [NotInMicroMips] in{
+ def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x25, 16>, INSN_MIPS3_32R2;
+ def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x25, 17>, INSN_MIPS3_32R2;
+}
+
+def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ABSS_FM<0x20, 17>, FGR_32;
+def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 20>, FGR_32;
+def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 16>, FGR_32;
+
+let DecoderNamespace = "Mips64" in {
+ def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x20, 17>, FGR_64;
+ let AdditionalPredicates = [NotInMicroMips] in{
+ def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x20, 21>, FGR_64;
+ }
+ def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 20>, FGR_64;
+ def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 16>, FGR_64;
+ def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x21, 21>, FGR_64;
+}
+
+let isPseudo = 1, isCodeGenOnly = 1 in {
+ def PseudoCVT_S_W : ABSS_FT<"", FGR32Opnd, GPR32Opnd, II_CVT>;
+ def PseudoCVT_D32_W : ABSS_FT<"", AFGR64Opnd, GPR32Opnd, II_CVT>;
+ def PseudoCVT_S_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
+ def PseudoCVT_D64_W : ABSS_FT<"", FGR64Opnd, GPR32Opnd, II_CVT>;
+ def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
+}
+
+def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
+ ABSS_FM<0x5, 16>;
+def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
+ ABSS_FM<0x7, 16>;
+defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>;
+defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>;
+
+def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd,
+ II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2;
+defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
+
+// The odd-numbered registers are only referenced when doing loads,
+// stores, and moves between floating-point and integer registers.
+// When defining instructions, we reference all 32-bit registers,
+// regardless of register aliasing.
+
+/// Move Control Registers From/To CPU Registers
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CFC1 : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>, MFC1_FM<2>;
+ def CTC1 : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>, MFC1_FM<6>;
+}
+def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
+ bitconvert>, MFC1_FM<0>;
+def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
+ bitconvert>, MFC1_FM<4>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+ MFC1_FM<3>, ISA_MIPS32R2, FGR_32;
+ def MFHC1_D64 : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
+ MFC1_FM<3>, ISA_MIPS32R2, FGR_64 {
+ let DecoderNamespace = "Mips64";
+ }
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MTHC1_D32 : MMRel, StdMMR6Rel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM<7>, ISA_MIPS32R2, FGR_32;
+ def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM<7>, ISA_MIPS32R2, FGR_64 {
+ let DecoderNamespace = "Mips64";
+ }
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
+ bitconvert>, MFC1_FM<5>, ISA_MIPS3;
+ def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
+ bitconvert>, MFC1_FM<1>, ISA_MIPS3;
+}
+
+def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
+ ABSS_FM<0x6, 16>;
+def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
+ ABSS_FM<0x6, 17>, FGR_32;
+def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
+ ABSS_FM<0x6, 17>, FGR_64 {
+ let DecoderNamespace = "Mips64";
+}
+
+/// Floating Point Memory Instructions
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, mem_simm16, II_LWC1, load>,
+ LW_FM<0x31>;
+ def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, mem_simm16, II_SWC1, store>,
+ LW_FM<0x39>;
+}
+
+let DecoderNamespace = "Mips64", AdditionalPredicates = [NotInMicroMips] in {
+ def LDC164 : StdMMR6Rel, LW_FT<"ldc1", FGR64Opnd, mem_simm16, II_LDC1, load>,
+ LW_FM<0x35>, ISA_MIPS2, FGR_64 {
+ let BaseOpcode = "LDC164";
+ }
+ def SDC164 : StdMMR6Rel, SW_FT<"sdc1", FGR64Opnd, mem_simm16, II_SDC1, store>,
+ LW_FM<0x3d>, ISA_MIPS2, FGR_64;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LDC1 : MMRel, StdMMR6Rel, LW_FT<"ldc1", AFGR64Opnd, mem_simm16, II_LDC1,
+ load>, LW_FM<0x35>, ISA_MIPS2, FGR_32 {
+ let BaseOpcode = "LDC132";
+ }
+ def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, mem_simm16, II_SDC1, store>,
+ LW_FM<0x3d>, ISA_MIPS2, FGR_32;
+}
+
+// Indexed loads and stores.
+// Base register + offset register addressing mode (indicated by "x" in the
+// instruction mnemonic) is disallowed under NaCl.
+let AdditionalPredicates = [IsNotNaCl] in {
+ def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>,
+ INSN_MIPS4_32R2_NOT_32R6_64R6;
+ def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>,
+ INSN_MIPS4_32R2_NOT_32R6_64R6;
+}
+
+let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in {
+ def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
+ INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+ def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
+ INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+}
+
+let DecoderNamespace="Mips64" in {
+ def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
+ INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
+ INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+// Load/store doubleword indexed unaligned.
+let AdditionalPredicates = [IsNotNaCl] in {
+ def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
+ INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
+ def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
+ INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
+}
+
+let DecoderNamespace="Mips64" in {
+ def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
+ INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
+ def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
+ INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+/// Floating-point Arithmetic
+def FADD_S : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
+ ADDS_FM<0x00, 16>;
+defm FADD : ADDS_M<"add.d", II_ADD_D, 1, fadd>, ADDS_FM<0x00, 17>;
+def FDIV_S : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
+ ADDS_FM<0x03, 16>;
+defm FDIV : ADDS_M<"div.d", II_DIV_D, 0, fdiv>, ADDS_FM<0x03, 17>;
+def FMUL_S : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
+ ADDS_FM<0x02, 16>;
+defm FMUL : ADDS_M<"mul.d", II_MUL_D, 1, fmul>, ADDS_FM<0x02, 17>;
+def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
+ ADDS_FM<0x01, 16>;
+defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
+
+def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+ MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+ MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NoNaNsFPMath] in {
+ def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
+ MADDS_FM<6, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+ def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
+ MADDS_FM<7, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+}
+
+def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+ MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+ MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+
+let AdditionalPredicates = [NoNaNsFPMath] in {
+ def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
+ MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+ def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
+ MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+}
+
+let DecoderNamespace = "Mips64" in {
+ def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
+ MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
+ MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+let AdditionalPredicates = [NoNaNsFPMath],
+ DecoderNamespace = "Mips64" in {
+ def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
+ MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
+ MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Branch Codes
+//===----------------------------------------------------------------------===//
+// Mips branch codes. These correspond to condcode in MipsInstrInfo.h.
+// They must be kept in sync.
+def MIPS_BRANCH_F : PatLeaf<(i32 0)>;
+def MIPS_BRANCH_T : PatLeaf<(i32 1)>;
+
+def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, II_BC1F, MIPS_BRANCH_F>,
+ BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, II_BC1FL, MIPS_BRANCH_F, 0>,
+ BC1F_FM<1, 0>, ISA_MIPS2_NOT_32R6_64R6;
+def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, II_BC1T, MIPS_BRANCH_T>,
+ BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, II_BC1TL, MIPS_BRANCH_T, 0>,
+ BC1F_FM<1, 1>, ISA_MIPS2_NOT_32R6_64R6;
+
+/// Floating Point Compare
+let AdditionalPredicates = [NotInMicroMips] in {
+ def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
+ ISA_MIPS1_NOT_32R6_64R6, FGR_32;
+}
+let DecoderNamespace = "Mips64" in
+def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
+ ISA_MIPS1_NOT_32R6_64R6, FGR_64;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Pseudo-Instructions
+//===----------------------------------------------------------------------===//
+
+// This pseudo instr gets expanded into 2 mtc1 instrs after register
+// allocation.
+class BuildPairF64Base<RegisterOperand RO> :
+ PseudoSE<(outs RO:$dst), (ins GPR32Opnd:$lo, GPR32Opnd:$hi),
+ [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))],
+ II_MTC1>;
+
+def BuildPairF64 : BuildPairF64Base<AFGR64Opnd>, FGR_32, HARDFLOAT;
+def BuildPairF64_64 : BuildPairF64Base<FGR64Opnd>, FGR_64, HARDFLOAT;
+
+// This pseudo instr gets expanded into 2 mfc1 instrs after register
+// allocation.
+// if n is 0, lower part of src is extracted.
+// if n is 1, higher part of src is extracted.
+// This node has associated scheduling information as the pre RA scheduler
+// asserts otherwise.
+class ExtractElementF64Base<RegisterOperand RO> :
+ PseudoSE<(outs GPR32Opnd:$dst), (ins RO:$src, i32imm:$n),
+ [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))],
+ II_MFC1>;
+
+def ExtractElementF64 : ExtractElementF64Base<AFGR64Opnd>, FGR_32, HARDFLOAT;
+def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>, FGR_64, HARDFLOAT;
+
+def PseudoTRUNC_W_S : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
+ (ins FGR32Opnd:$fs, GPR32Opnd:$rs),
+ "trunc.w.s\t$fd, $fs, $rs">;
+
+def PseudoTRUNC_W_D32 : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
+ (ins AFGR64Opnd:$fs, GPR32Opnd:$rs),
+ "trunc.w.d\t$fd, $fs, $rs">,
+ FGR_32, HARDFLOAT;
+
+def PseudoTRUNC_W_D : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
+ (ins FGR64Opnd:$fs, GPR32Opnd:$rs),
+ "trunc.w.d\t$fd, $fs, $rs">,
+ FGR_64, HARDFLOAT;
+
+//===----------------------------------------------------------------------===//
+// InstAliases.
+//===----------------------------------------------------------------------===//
+def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>,
+ ISA_MIPS1_NOT_32R6_64R6, HARDFLOAT;
+def : MipsInstAlias<"bc1tl $offset", (BC1TL FCC0, brtarget:$offset)>,
+ ISA_MIPS2_NOT_32R6_64R6, HARDFLOAT;
+def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>,
+ ISA_MIPS1_NOT_32R6_64R6, HARDFLOAT;
+def : MipsInstAlias<"bc1fl $offset", (BC1FL FCC0, brtarget:$offset)>,
+ ISA_MIPS2_NOT_32R6_64R6, HARDFLOAT;
+
+def : MipsInstAlias
+ <"s.s $fd, $addr", (SWC1 FGR32Opnd:$fd, mem_simm16:$addr), 0>,
+ ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+ <"s.d $fd, $addr", (SDC1 AFGR64Opnd:$fd, mem_simm16:$addr), 0>,
+ FGR_32, ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+ <"s.d $fd, $addr", (SDC164 FGR64Opnd:$fd, mem_simm16:$addr), 0>,
+ FGR_64, ISA_MIPS2, HARDFLOAT;
+
+def : MipsInstAlias
+ <"l.s $fd, $addr", (LWC1 FGR32Opnd:$fd, mem_simm16:$addr), 0>,
+ ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+ <"l.d $fd, $addr", (LDC1 AFGR64Opnd:$fd, mem_simm16:$addr), 0>,
+ FGR_32, ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+ <"l.d $fd, $addr", (LDC164 FGR64Opnd:$fd, mem_simm16:$addr), 0>,
+ FGR_64, ISA_MIPS2, HARDFLOAT;
+//===----------------------------------------------------------------------===//
+// Floating Point Patterns
+//===----------------------------------------------------------------------===//
+def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
+
+def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)),
+ (PseudoCVT_S_W GPR32Opnd:$src)>;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+ (TRUNC_W_S FGR32Opnd:$src)>;
+
+def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+ (PseudoCVT_D32_W GPR32Opnd:$src)>, FGR_32;
+def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+ (TRUNC_W_D32 AFGR64Opnd:$src)>, FGR_32;
+def : MipsPat<(f32 (fpround AFGR64Opnd:$src)),
+ (CVT_S_D32 AFGR64Opnd:$src)>, FGR_32;
+def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D32_S FGR32Opnd:$src)>, FGR_32;
+
+def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, FGR_64;
+def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>, FGR_64;
+
+def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+ (PseudoCVT_D64_W GPR32Opnd:$src)>, FGR_64;
+def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)),
+ (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>, FGR_64;
+def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
+ (PseudoCVT_D64_L GPR64Opnd:$src)>, FGR_64;
+
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+ (TRUNC_W_D64 FGR64Opnd:$src)>, FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+ (TRUNC_L_S FGR32Opnd:$src)>, FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+ (TRUNC_L_D64 FGR64Opnd:$src)>, FGR_64;
+
+def : MipsPat<(f32 (fpround FGR64Opnd:$src)),
+ (CVT_S_D64 FGR64Opnd:$src)>, FGR_64;
+def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D64_S FGR32Opnd:$src)>, FGR_64;
+
+// Patterns for loads/stores with a reg+imm operand.
+let AdditionalPredicates = [NotInMicroMips] in {
+ let AddedComplexity = 40 in {
+ def : LoadRegImmPat<LWC1, f32, load>;
+ def : StoreRegImmPat<SWC1, f32>;
+
+ def : LoadRegImmPat<LDC164, f64, load>, FGR_64;
+ def : StoreRegImmPat<SDC164, f64>, FGR_64;
+
+ def : LoadRegImmPat<LDC1, f64, load>, FGR_32;
+ def : StoreRegImmPat<SDC1, f64>, FGR_32;
+ }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
new file mode 100644
index 000000000000..1437fb75434a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
@@ -0,0 +1,968 @@
+//===-- MipsInstrFormats.td - Mips Instruction Formats -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe MIPS instructions format
+//
+// CPU INSTRUCTION FORMATS
+//
+// opcode - operation code.
+// rs - src reg.
+// rt - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+// rd - dst reg, only used on 3 regs instr.
+// shamt - only used on shift instructions, contains the shift amount.
+// funct - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<4> val> {
+ bits<4> Value = val;
+}
+
+def Pseudo : Format<0>;
+def FrmR : Format<1>;
+def FrmI : Format<2>;
+def FrmJ : Format<3>;
+def FrmFR : Format<4>;
+def FrmFI : Format<5>;
+def FrmOther : Format<6>; // Instruction w/ a custom format
+
+class MMRel;
+
+def Std2MicroMips : InstrMapping {
+ let FilterClass = "MMRel";
+ // Instructions with the same BaseOpcode value form a row.
+ let RowFields = ["BaseOpcode"];
+ // Instructions with the same Arch value form a column.
+ let ColFields = ["Arch"];
+ // The key column holds the instructions whose Arch is "se".
+ let KeyCol = ["se"];
+ // Value columns are Arch="se" and Arch="micromips".
+ let ValueCols = [["se"], ["micromips"]];
+}
+
+class StdMMR6Rel;
+
+def Std2MicroMipsR6 : InstrMapping {
+ let FilterClass = "StdMMR6Rel";
+ // Instructions with the same BaseOpcode value form a row.
+ let RowFields = ["BaseOpcode"];
+ // Instructions with the same Arch value form a column.
+ let ColFields = ["Arch"];
+ // The key column holds the instructions whose Arch is "se".
+ let KeyCol = ["se"];
+ // Value columns are Arch="se" and Arch="micromipsr6".
+ let ValueCols = [["se"], ["micromipsr6"]];
+}
+
+class StdArch {
+ string Arch = "se";
+}
+
+// Generic Mips Format
+class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format f>: Instruction
+{
+ field bits<32> Inst;
+ Format Form = f;
+
+ let Namespace = "Mips";
+
+ let Size = 4;
+
+ bits<6> Opcode = 0;
+
+ // Top 6 bits are the 'opcode' field
+ let Inst{31-26} = Opcode;
+
+ let OutOperandList = outs;
+ let InOperandList = ins;
+
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Itinerary = itin;
+
+ //
+ // Attributes specific to Mips instructions...
+ //
+ bits<4> FormBits = Form.Value;
+ bit isCTI = 0; // Any form of Control Transfer Instruction.
+ // Required for MIPSR6
+ bit hasForbiddenSlot = 0; // Instruction has a forbidden slot.
+ bit IsPCRelativeLoad = 0; // Load instruction with implicit source register
+ // ($pc) and with explicit offset and destination
+ // register
+
+ // TSFlags layout should be kept in sync with MipsInstrInfo.h.
+ let TSFlags{3-0} = FormBits;
+ let TSFlags{4} = isCTI;
+ let TSFlags{5} = hasForbiddenSlot;
+ let TSFlags{6} = IsPCRelativeLoad;
+
+ let DecoderNamespace = "Mips";
+
+ field bits<32> SoftFail = 0;
+}
+
+// Mips32/64 Instruction Format
+class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format f, string opstr = ""> :
+ MipsInst<outs, ins, asmstr, pattern, itin, f>, PredicateControl {
+ let EncodingPredicates = [HasStdEnc];
+ string BaseOpcode = opstr;
+ string Arch;
+}
+
+// Mips Pseudo Instructions Format
+class MipsPseudo<dag outs, dag ins, list<dag> pattern,
+ InstrItinClass itin = IIPseudo> :
+ MipsInst<outs, ins, "", pattern, itin, Pseudo> {
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+}
+
+// Mips32/64 Pseudo Instruction Format
+class PseudoSE<dag outs, dag ins, list<dag> pattern,
+ InstrItinClass itin = IIPseudo> :
+ MipsPseudo<outs, ins, pattern, itin>, PredicateControl {
+ let EncodingPredicates = [HasStdEnc];
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
+ MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo>, PredicateControl {
+ let isPseudo = 1;
+ let Pattern = [];
+}
+//===----------------------------------------------------------------------===//
+// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
+//===----------------------------------------------------------------------===//
+
+class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ InstSE<outs, ins, asmstr, pattern, itin, FrmR>
+{
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> shamt;
+ bits<6> funct;
+
+ let Opcode = op;
+ let funct = _funct;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = shamt;
+ let Inst{5-0} = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|rs|rt|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>: InstSE<outs, ins, asmstr, pattern, itin, FrmI>
+{
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> imm16;
+
+ let Opcode = op;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm16;
+}
+
+class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ InstSE<outs, ins, asmstr, pattern, itin, FrmI>
+{
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> imm16;
+
+ let Opcode = op;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Format J instruction class in Mips : <|opcode|address|>
+//===----------------------------------------------------------------------===//
+
+class FJ<bits<6> op> : StdArch
+{
+ bits<26> target;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-0} = target;
+}
+
+//===----------------------------------------------------------------------===//
+// MFC instruction class in Mips : <|op|mf|rt|rd|00000000|sel|>
+//===----------------------------------------------------------------------===//
+class MFC3OP_FM<bits<6> op, bits<5> mfmt>
+{
+ bits<5> rt;
+ bits<5> rd;
+ bits<3> sel;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = mfmt;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-3} = 0;
+ let Inst{2-0} = sel;
+}
+
+class MFC2OP_FM<bits<6> op, bits<5> mfmt> : StdArch {
+ bits<5> rt;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = mfmt;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm16;
+}
+
+class ADD_FM<bits<6> op, bits<6> funct> : StdArch {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class ADDI_FM<bits<6> op> : StdArch {
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm16;
+}
+
+class SRA_FM<bits<6> funct, bit rotate> : StdArch {
+ bits<5> rd;
+ bits<5> rt;
+ bits<5> shamt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-22} = 0;
+ let Inst{21} = rotate;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = shamt;
+ let Inst{5-0} = funct;
+}
+
+class SRLV_FM<bits<6> funct, bit rotate> : StdArch {
+ bits<5> rd;
+ bits<5> rt;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-7} = 0;
+ let Inst{6} = rotate;
+ let Inst{5-0} = funct;
+}
+
+class BEQ_FM<bits<6> op> : StdArch {
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = offset;
+}
+
+class BGEZ_FM<bits<6> op, bits<5> funct> : StdArch {
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = funct;
+ let Inst{15-0} = offset;
+}
+
+class BBIT_FM<bits<6> op> : StdArch {
+ bits<5> rs;
+ bits<5> p;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = p;
+ let Inst{15-0} = offset;
+}
+
+class SLTI_FM<bits<6> op> : StdArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm16;
+}
+
+class MFLO_FM<bits<6> funct> : StdArch {
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-16} = 0;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class MTLO_FM<bits<6> funct> : StdArch {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rs;
+ let Inst{20-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class SEB_FM<bits<5> funct, bits<6> funct2> : StdArch {
+ bits<5> rd;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1f;
+ let Inst{25-21} = 0;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = funct;
+ let Inst{5-0} = funct2;
+}
+
+class CLO_FM<bits<6> funct> : StdArch {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt; // carries the same value as rd (see 'let rt = rd' below)
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+ let rt = rd; // rt is not an independent operand: bits 20-16 repeat rd
+}
+
+class LUI_FM : StdArch {
+ bits<5> rt;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0xf;
+ let Inst{25-21} = 0;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm16;
+}
+
+class JALR_FM {
+ bits<5> rd;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = 9;
+}
+
+class BGEZAL_FM<bits<5> funct> : StdArch {
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 1;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = funct;
+ let Inst{15-0} = offset;
+}
+
+class SYNC_FM : StdArch {
+ bits<5> stype;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{10-6} = stype;
+ let Inst{5-0} = 0xf;
+}
+
+class SYNCI_FM : StdArch {
+ // Produced by the mem_simm16 address as reg << 16 | imm (see getMemEncoding).
+ bits<21> addr;
+ bits<5> rs = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000001;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0b11111;
+ let Inst{15-0} = offset;
+}
+
+class MULT_FM<bits<6> op, bits<6> funct> : StdArch {
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class EXT_FM<bits<6> funct> : StdArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> pos;
+ bits<5> size;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1f;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = size;
+ let Inst{10-6} = pos;
+ let Inst{5-0} = funct;
+}
+
+class RDHWR_FM : StdArch {
+ bits<5> rt;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1f;
+ let Inst{25-21} = 0;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = 0x3b;
+}
+
+class TEQ_FM<bits<6> funct> : StdArch {
+ bits<5> rs;
+ bits<5> rt;
+ bits<10> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = code_;
+ let Inst{5-0} = funct;
+}
+
+class TEQI_FM<bits<5> funct> : StdArch {
+ bits<5> rs;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 1;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = funct;
+ let Inst{15-0} = imm16;
+}
+
+class WAIT_FM : StdArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25} = 1;
+ let Inst{24-6} = 0;
+ let Inst{5-0} = 0x20;
+}
+
+class EXTS_FM<bits<6> funct> : StdArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> pos;
+ bits<5> lenm1;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = lenm1;
+ let Inst{10-6} = pos;
+ let Inst{5-0} = funct;
+}
+
+class MTMR_FM<bits<6> funct> : StdArch {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class POP_FM<bits<6> funct> : StdArch {
+ bits<5> rd;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class SEQ_FM<bits<6> funct> : StdArch {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class SEQI_FM<bits<6> funct> : StdArch {
+ bits<5> rs;
+ bits<5> rt;
+ bits<10> imm10;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = imm10;
+ let Inst{5-0} = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// System calls format <op|code_|funct>
+//===----------------------------------------------------------------------===//
+
+class SYS_FM<bits<6> funct> : StdArch
+{
+ bits<20> code_;
+ bits<32> Inst;
+ let Inst{31-26} = 0x0;
+ let Inst{25-6} = code_;
+ let Inst{5-0} = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Break instruction format <op|code_1|code_2|funct>
+//===----------------------------------------------------------------------===//
+
+class BRK_FM<bits<6> funct> : StdArch
+{
+ bits<10> code_1;
+ bits<10> code_2;
+ bits<32> Inst;
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = code_1;
+ let Inst{15-6} = code_2;
+ let Inst{5-0} = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Exception return format <Cop0|1|0|funct>
+//===----------------------------------------------------------------------===//
+
+class ER_FM<bits<6> funct, bit LLBit> : StdArch
+{
+ bits<32> Inst;
+ let Inst{31-26} = 0x10;
+ let Inst{25} = 1;
+ let Inst{24-7} = 0;
+ let Inst{6} = LLBit;
+ let Inst{5-0} = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0>
+//===----------------------------------------------------------------------===//
+
+class EI_FM<bits<1> sc> : StdArch
+{
+ bits<32> Inst;
+ bits<5> rt;
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = 0xb;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = 0xc;
+ let Inst{10-6} = 0;
+ let Inst{5} = sc;
+ let Inst{4-0} = 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// FLOATING POINT INSTRUCTION FORMATS
+//
+// opcode - operation code.
+// fs - src reg.
+// ft - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+// fd - dst reg, only used on 3 regs instr.
+// fmt - double or single precision.
+// funct - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Format FI instruction class in Mips : <|opcode|base|ft|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>:
+ InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmFI>
+{
+ bits<5> ft;
+ bits<5> base;
+ bits<16> imm16;
+
+ let Opcode = op;
+
+ let Inst{25-21} = base;
+ let Inst{20-16} = ft;
+ let Inst{15-0} = imm16;
+}
+
+class ADDS_FM<bits<6> funct, bits<5> fmt> : StdArch {
+ bits<5> fd;
+ bits<5> fs;
+ bits<5> ft;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x11;
+ let Inst{25-21} = fmt;
+ let Inst{20-16} = ft;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = fd;
+ let Inst{5-0} = funct;
+}
+
+class ABSS_FM<bits<6> funct, bits<5> fmt> : StdArch {
+ bits<5> fd;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x11;
+ let Inst{25-21} = fmt;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = fd;
+ let Inst{5-0} = funct;
+}
+
+class MFC1_FM<bits<5> funct> : StdArch {
+ bits<5> rt;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x11;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = fs;
+ let Inst{10-0} = 0;
+}
+
+class LW_FM<bits<6> op> : StdArch {
+ bits<5> rt;
+ bits<21> addr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = addr{20-16};
+ let Inst{20-16} = rt;
+ let Inst{15-0} = addr{15-0};
+}
+
+class MADDS_FM<bits<3> funct, bits<3> fmt> : StdArch {
+ bits<5> fd;
+ bits<5> fr;
+ bits<5> fs;
+ bits<5> ft;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x13;
+ let Inst{25-21} = fr;
+ let Inst{20-16} = ft;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = fd;
+ let Inst{5-3} = funct;
+ let Inst{2-0} = fmt;
+}
+
+class LWXC1_FM<bits<6> funct> : StdArch {
+ bits<5> fd;
+ bits<5> base;
+ bits<5> index;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x13;
+ let Inst{25-21} = base;
+ let Inst{20-16} = index;
+ let Inst{15-11} = 0;
+ let Inst{10-6} = fd;
+ let Inst{5-0} = funct;
+}
+
+class SWXC1_FM<bits<6> funct> : StdArch {
+ bits<5> fs;
+ bits<5> base;
+ bits<5> index;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x13;
+ let Inst{25-21} = base;
+ let Inst{20-16} = index;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class BC1F_FM<bit nd, bit tf> : StdArch {
+ bits<3> fcc;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x11;
+ let Inst{25-21} = 0x8;
+ let Inst{20-18} = fcc;
+ let Inst{17} = nd;
+ let Inst{16} = tf;
+ let Inst{15-0} = offset;
+}
+
+class CEQS_FM<bits<5> fmt> : StdArch {
+ bits<5> fs;
+ bits<5> ft;
+ bits<4> cond;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x11;
+ let Inst{25-21} = fmt;
+ let Inst{20-16} = ft;
+ let Inst{15-11} = fs;
+ let Inst{10-8} = 0; // cc
+ let Inst{7-4} = 0x3;
+ let Inst{3-0} = cond;
+}
+
+class C_COND_FM<bits<5> fmt, bits<4> c> : CEQS_FM<fmt> {
+ let cond = c;
+}
+
+class CMov_I_F_FM<bits<6> funct, bits<5> fmt> : StdArch {
+ bits<5> fd;
+ bits<5> fs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x11;
+ let Inst{25-21} = fmt;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = fd;
+ let Inst{5-0} = funct;
+}
+
+class CMov_F_I_FM<bit tf> : StdArch {
+ bits<5> rd;
+ bits<5> rs;
+ bits<3> fcc;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rs;
+ let Inst{20-18} = fcc;
+ let Inst{17} = 0;
+ let Inst{16} = tf;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = 1;
+}
+
+class CMov_F_F_FM<bits<5> fmt, bit tf> : StdArch {
+ bits<5> fd;
+ bits<5> fs;
+ bits<3> fcc;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x11;
+ let Inst{25-21} = fmt;
+ let Inst{20-18} = fcc;
+ let Inst{17} = 0;
+ let Inst{16} = tf;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = fd;
+ let Inst{5-0} = 0x11;
+}
+
+class BARRIER_FM<bits<5> op> : StdArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0; // SPECIAL
+ let Inst{25-21} = 0;
+ let Inst{20-16} = 0; // rt = 0
+ let Inst{15-11} = 0; // rd = 0
+ let Inst{10-6} = op; // Operation
+ let Inst{5-0} = 0; // SLL
+}
+
+class SDBBP_FM : StdArch {
+ bits<20> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011100; // SPECIAL2
+ let Inst{25-6} = code_;
+ let Inst{5-0} = 0b111111; // SDBBP
+}
+
+class JR_HB_FM<bits<6> op> : StdArch{
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0; // SPECIAL
+ let Inst{25-21} = rs;
+ let Inst{20-11} = 0;
+ let Inst{10} = 1;
+ let Inst{9-6} = 0;
+ let Inst{5-0} = op;
+}
+
+class JALR_HB_FM<bits<6> op> : StdArch {
+ bits<5> rd;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0; // SPECIAL
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = rd;
+ let Inst{10} = 1;
+ let Inst{9-6} = 0;
+ let Inst{5-0} = op;
+}
+
+class COP0_TLB_FM<bits<6> op> : StdArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10; // COP0
+ let Inst{25} = 1; // CO
+ let Inst{24-6} = 0;
+ let Inst{5-0} = op; // Operation
+}
+
+class CACHEOP_FM<bits<6> op> : StdArch {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = base;
+ let Inst{20-16} = hint;
+ let Inst{15-0} = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
new file mode 100644
index 000000000000..19af1914c819
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -0,0 +1,503 @@
+//===-- MipsInstrInfo.cpp - Mips Instruction Information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "MipsGenInstrInfo.inc"
+
+// Pin the vtable to this file: this is the out-of-line definition of the virtual anchor() declared in the class.
+void MipsInstrInfo::anchor() {}
+
+MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr) // UncondBr: opcode later emitted by insertBranch for unconditional branches
+ : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), // forwards the two call-stack adjustment opcodes to the generated base class
+ Subtarget(STI), UncondBrOpc(UncondBr) {} // keeps a reference to the subtarget and stores the branch opcode
+
+const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) { // Factory: picks the implementation from the subtarget mode
+ if (STI.inMips16Mode())
+ return llvm::createMips16InstrInfo(STI); // MIPS16 mode gets the Mips16 implementation
+
+ return llvm::createMipsSEInstrInfo(STI); // every other mode gets the MipsSE implementation
+}
+
+bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const { // True only when op is an immediate operand with value 0
+ return op.isImm() && op.getImm() == 0; // isImm() is checked first, so getImm() is only evaluated for immediate operands
+}
+
+/// insertNoop - Emit the target nop instruction (Mips::NOP) at insertion point
+/// MI in MBB. The body performs no hazard check itself; it always emits the nop.
+// FIXME: This appears to be dead code.
+void MipsInstrInfo::
+insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const
+{
+ DebugLoc DL; // default-constructed: the nop carries no source location
+ BuildMI(MBB, MI, DL, get(Mips::NOP));
+}
+
+MachineMemOperand * // memory operand describing frame index FI, created through the function owning MBB
+MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
+ MachineMemOperand::Flags Flags) const { // Flags is passed through unchanged
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned Align = MFI.getObjectAlignment(FI); // alignment recorded for the frame object
+
+ return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
+ Flags, MFI.getObjectSize(FI), Align); // size also comes from the frame info
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+void MipsInstrInfo::AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc, // Decompose a conditional branch into its target block and a Cond vector
+ MachineBasicBlock *&BB, // out: the branch target
+ SmallVectorImpl<MachineOperand> &Cond) const { // out: opcode immediate followed by the non-target operands (appended)
+ assert(getAnalyzableBrOpc(Opc) && "Not an analyzable branch");
+ int NumOp = Inst->getNumExplicitOperands();
+
+ // for both int and fp branches, the last explicit operand is the
+ // MBB.
+ BB = Inst->getOperand(NumOp-1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(Opc)); // Cond[0] holds the branch opcode; BuildCondBr reads it back with getImm()
+
+ for (int i=0; i<NumOp-1; i++) // copy every explicit operand except the trailing MBB
+ Cond.push_back(Inst->getOperand(i));
+}
+
+bool MipsInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // Boolean wrapper around the BranchType-returning overload
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ SmallVector<MachineInstr*, 2> BranchInstrs; // filled by the overload but discarded here
+ BranchType BT = analyzeBranch(MBB, TBB, FBB, Cond, AllowModify, BranchInstrs);
+
+ return (BT == BT_None) || (BT == BT_Indirect); // true for the "couldn't analyze" and indirect results, false otherwise
+}
+
+void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, // Emit a conditional branch to TBB from a Cond vector
+ const DebugLoc &DL,
+ ArrayRef<MachineOperand> Cond) const { // Cond[0] is the opcode immediate, Cond[1..] the branch operands
+ unsigned Opc = Cond[0].getImm();
+ const MCInstrDesc &MCID = get(Opc);
+ MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
+
+ for (unsigned i = 1; i < Cond.size(); ++i) { // re-add the saved operands in their original order
+ if (Cond[i].isReg())
+ MIB.addReg(Cond[i].getReg()); // only the register number is carried over
+ else if (Cond[i].isImm())
+ MIB.addImm(Cond[i].getImm());
+ else
+ assert(false && "Cannot copy operand"); // only register and immediate operands are handled
+ }
+ MIB.addMBB(TBB); // the target block is added after the condition operands
+}
+
+unsigned MipsInstrInfo::insertBranch(MachineBasicBlock &MBB, // Emit one or two branches into MBB; returns the number of branch instructions emitted
+ MachineBasicBlock *TBB, // taken target; must be non-null
+ MachineBasicBlock *FBB, // optional second target, reached by an unconditional branch
+ ArrayRef<MachineOperand> Cond, // empty for an unconditional branch
+ const DebugLoc &DL,
+ int *BytesAdded) const { // must be null: byte counts are not reported
+ // Shouldn't be a fall through.
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
+ assert(!BytesAdded && "code size not handled");
+
+ // # of condition operands:
+ // Unconditional branches: 0
+ // Floating point branches: 1 (opc)
+ // Int BranchZero: 2 (opc, reg)
+ // Int Branch: 3 (opc, reg0, reg1)
+ assert((Cond.size() <= 3) &&
+ "# of Mips branch conditions must be <= 3!");
+
+ // Two-way Conditional branch.
+ if (FBB) {
+ BuildCondBr(MBB, TBB, DL, Cond); // conditional branch to TBB first
+ BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(FBB); // then the unconditional branch to FBB
+ return 2;
+ }
+
+ // One way branch.
+ // Unconditional branch.
+ if (Cond.empty())
+ BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(TBB);
+ else // Conditional branch.
+ BuildCondBr(MBB, TBB, DL, Cond);
+ return 1;
+}
+
+unsigned MipsInstrInfo::removeBranch(MachineBasicBlock &MBB, // Erase up to two trailing analyzable branches; returns how many were counted
+ int *BytesRemoved) const { // must be null: byte counts are not reported
+ assert(!BytesRemoved && "code size not handled");
+
+ MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend(); // walk the block from its last instruction backwards
+ unsigned removed;
+
+ // Skip all the debug instructions.
+ while (I != REnd && I->isDebugValue())
+ ++I;
+
+ if (I == REnd) // nothing but debug values (or an empty block)
+ return 0;
+
+ MachineBasicBlock::iterator FirstBr = ++I.getReverse(); // forward iterator used as the end bound of the erase below
+
+ // Up to 2 branches are removed.
+ // Note that indirect branches are not removed.
+ for (removed = 0; I != REnd && removed < 2; ++I, ++removed)
+ if (!getAnalyzableBrOpc(I->getOpcode())) // stop at the first instruction that is not an analyzable branch
+ break;
+
+ MBB.erase((--I).getReverse(), FirstBr); // start bound comes from stepping I back by one position
+
+ return removed;
+}
+
+/// reverseBranchCondition - Replace the opcode stored in Cond[0] with the
+/// opposite branch opcode, in place. Always returns false.
+bool MipsInstrInfo::reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ assert( (Cond.size() && Cond.size() <= 3) &&
+ "Invalid Mips branch condition!");
+ Cond[0].setImm(getOppositeBranchOpc(Cond[0].getImm())); // the remaining Cond entries (the operands) are left untouched
+ return false;
+}
+
+MipsInstrInfo::BranchType MipsInstrInfo::analyzeBranch( // Classify the terminators of MBB and report targets/condition through the out-parameters
+ MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond, bool AllowModify, // AllowModify permits erasing a branch that follows an unconditional branch
+ SmallVectorImpl<MachineInstr *> &BranchInstrs) const { // out: the branch instructions that were examined, in block order
+
+ MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
+
+ // Skip all the debug instructions.
+ while (I != REnd && I->isDebugValue())
+ ++I;
+
+ if (I == REnd || !isUnpredicatedTerminator(*I)) {
+ // This block ends with no branches (it just falls through to its succ).
+ // Leave TBB/FBB null.
+ TBB = FBB = nullptr;
+ return BT_NoBranch;
+ }
+
+ MachineInstr *LastInst = &*I;
+ unsigned LastOpc = LastInst->getOpcode();
+ BranchInstrs.push_back(LastInst); // recorded before the analyzability check below
+
+ // Not an analyzable branch (e.g., indirect jump).
+ if (!getAnalyzableBrOpc(LastOpc))
+ return LastInst->isIndirectBranch() ? BT_Indirect : BT_None;
+
+ // Get the second to last instruction in the block.
+ unsigned SecondLastOpc = 0; // stays 0 when there is no analyzable second-to-last branch
+ MachineInstr *SecondLastInst = nullptr;
+
+ if (++I != REnd) {
+ SecondLastInst = &*I;
+ SecondLastOpc = getAnalyzableBrOpc(SecondLastInst->getOpcode());
+
+ // Not an analyzable branch (must be an indirect jump).
+ if (isUnpredicatedTerminator(*SecondLastInst) && !SecondLastOpc)
+ return BT_None;
+ }
+
+ // If there is only one terminator instruction, process it.
+ if (!SecondLastOpc) {
+ // Unconditional branch.
+ if (LastInst->isUnconditionalBranch()) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return BT_Uncond;
+ }
+
+ // Conditional branch
+ AnalyzeCondBr(LastInst, LastOpc, TBB, Cond);
+ return BT_Cond;
+ }
+
+ // If we reached here, there are two branches.
+ // If there are three terminators, we don't know what sort of block this is.
+ if (++I != REnd && isUnpredicatedTerminator(*I))
+ return BT_None;
+
+ BranchInstrs.insert(BranchInstrs.begin(), SecondLastInst); // inserted at the front so the list follows block order
+
+ // If second to last instruction is an unconditional branch,
+ // analyze it and remove the last instruction.
+ if (SecondLastInst->isUnconditionalBranch()) {
+ // Return if the last instruction cannot be removed.
+ if (!AllowModify)
+ return BT_None;
+
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ LastInst->eraseFromParent(); // the branch after the unconditional branch is dropped
+ BranchInstrs.pop_back(); // and removed from the reported list as well
+ return BT_Uncond;
+ }
+
+ // Conditional branch followed by an unconditional branch.
+ // The last one must be unconditional.
+ if (!LastInst->isUnconditionalBranch())
+ return BT_None;
+
+ AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond); // TBB/Cond come from the conditional branch
+ FBB = LastInst->getOperand(0).getMBB(); // FBB is the target of the trailing unconditional branch
+
+ return BT_CondUncond;
+}
+
+/// Return the corresponding compact (no delay slot) form of a branch, or 0 when none is chosen.
+unsigned MipsInstrInfo::getEquivalentCompactForm(
+ const MachineBasicBlock::iterator I) const {
+ unsigned Opcode = I->getOpcode();
+ bool canUseShortMicroMipsCTI = false; // set only in microMIPS mode, for the opcodes handled in the first switch
+
+ if (Subtarget.inMicroMipsMode()) {
+ switch (Opcode) {
+ case Mips::BNE:
+ case Mips::BNE_MM:
+ case Mips::BEQ:
+ case Mips::BEQ_MM:
+ // microMIPS has NE,EQ branches that do not have delay slots provided one
+ // of the operands is zero.
+ if (I->getOperand(1).getReg() == Subtarget.getABI().GetZeroReg()) // only operand 1 is compared against the zero register
+ canUseShortMicroMipsCTI = true;
+ break;
+ // For microMIPS the PseudoReturn and PseudoIndirectBranch are always
+ // expanded to JR_MM, so they can be replaced with JRC16_MM.
+ case Mips::JR:
+ case Mips::PseudoReturn:
+ case Mips::PseudoIndirectBranch:
+ case Mips::TAILCALLREG:
+ canUseShortMicroMipsCTI = true;
+ break;
+ }
+ }
+
+ // MIPSR6 forbids both operands being the zero register.
+ if (Subtarget.hasMips32r6() && (I->getNumOperands() > 1) &&
+ (I->getOperand(0).isReg() &&
+ (I->getOperand(0).getReg() == Mips::ZERO ||
+ I->getOperand(0).getReg() == Mips::ZERO_64)) &&
+ (I->getOperand(1).isReg() &&
+ (I->getOperand(1).getReg() == Mips::ZERO ||
+ I->getOperand(1).getReg() == Mips::ZERO_64)))
+ return 0; // no compact form in that case
+
+ if (Subtarget.hasMips32r6() || canUseShortMicroMipsCTI) {
+ switch (Opcode) {
+ case Mips::B:
+ return Mips::BC;
+ case Mips::BAL:
+ return Mips::BALC;
+ case Mips::BEQ:
+ case Mips::BEQ_MM:
+ if (canUseShortMicroMipsCTI)
+ return Mips::BEQZC_MM;
+ else if (I->getOperand(0).getReg() == I->getOperand(1).getReg()) // two-register compares on the same register get no compact form (same check repeated in the cases below)
+ return 0;
+ return Mips::BEQC;
+ case Mips::BNE:
+ case Mips::BNE_MM:
+ if (canUseShortMicroMipsCTI)
+ return Mips::BNEZC_MM;
+ else if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BNEC;
+ case Mips::BGE:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BGEC;
+ case Mips::BGEU:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BGEUC;
+ case Mips::BGEZ:
+ return Mips::BGEZC;
+ case Mips::BGTZ:
+ return Mips::BGTZC;
+ case Mips::BLEZ:
+ return Mips::BLEZC;
+ case Mips::BLT:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BLTC;
+ case Mips::BLTU:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BLTUC;
+ case Mips::BLTZ:
+ return Mips::BLTZC;
+ case Mips::BEQ64:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BEQC64;
+ case Mips::BNE64:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BNEC64;
+ case Mips::BGTZ64:
+ return Mips::BGTZC64;
+ case Mips::BGEZ64:
+ return Mips::BGEZC64;
+ case Mips::BLTZ64:
+ return Mips::BLTZC64;
+ case Mips::BLEZ64:
+ return Mips::BLEZC64;
+ // For MIPSR6, the instruction 'jic' can be used for these cases. Some
+ // tools will accept 'jrc reg' as an alias for 'jic 0, $reg'.
+ case Mips::JR:
+ case Mips::PseudoReturn:
+ case Mips::PseudoIndirectBranch:
+ case Mips::TAILCALLREG:
+ if (canUseShortMicroMipsCTI)
+ return Mips::JRC16_MM;
+ return Mips::JIC;
+ case Mips::JALRPseudo:
+ return Mips::JIALC;
+ case Mips::JR64:
+ case Mips::PseudoReturn64:
+ case Mips::PseudoIndirectBranch64:
+ case Mips::TAILCALLREG64:
+ return Mips::JIC64;
+ case Mips::JALR64Pseudo:
+ return Mips::JIALC64;
+ default:
+ return 0; // opcode has no entry in this table
+ }
+ }
+
+ return 0; // neither MIPS32r6 nor a short microMIPS CTI applies
+}
+
+/// Predicate for distinguishing between control transfer instructions and all
+/// other instructions for handling forbidden slots. Consider inline assembly
+/// as unsafe as well.
+bool MipsInstrInfo::SafeInForbiddenSlot(const MachineInstr &MI) const {
+ if (MI.isInlineAsm())
+ return false;
+
+ return (MI.getDesc().TSFlags & MipsII::IsCTI) == 0; // safe exactly when the IsCTI flag is clear
+
+}
+
+/// Predicate for distinguishing instructions that have forbidden slots.
+bool MipsInstrInfo::HasForbiddenSlot(const MachineInstr &MI) const {
+ return (MI.getDesc().TSFlags & MipsII::HasForbiddenSlot) != 0; // tests the HasForbiddenSlot flag in the instruction's TSFlags
+}
+
+/// Return the number of bytes of code the specified instruction may be.
+unsigned MipsInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
+ return MI.getDesc().getSize(); // size recorded in the instruction description
+ case TargetOpcode::INLINEASM: { // Inline Asm: Variable size.
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName(); // operand 0 carries the asm text
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+ }
+ case Mips::CONSTPOOL_ENTRY:
+ // If this machine instr is a constant pool entry, its size is recorded as
+ // operand #2.
+ return MI.getOperand(2).getImm();
+ }
+}
+
+MachineInstrBuilder // builder for the new instruction, which is created at position I in I's parent block
+MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
+ MachineBasicBlock::iterator I) const { // the instruction at I is read but not erased here
+ MachineInstrBuilder MIB;
+
+ // Certain branches have two forms: e.g beq $1, $zero, dest vs beqz $1, dest
+ // Pick the zero form of the branch for readable assembly and for greater
+ // branch distance in non-microMIPS mode.
+ // Additional MIPSR6 does not permit the use of register $zero for compact
+ // branches.
+ // FIXME: Certain atomic sequences on mips64 generate 32bit references to
+ // Mips::ZERO, which is incorrect. This test should be updated to use
+ // Subtarget.getABI().GetZeroReg() when those atomic sequences and others
+ // are fixed.
+ int ZeroOperandPosition = -1; // -1 means no use of Mips::ZERO was found
+ bool BranchWithZeroOperand = false;
+ if (I->isBranch() && !I->isPseudo()) { // pseudo branches are never rewritten to the zero form
+ auto TRI = I->getParent()->getParent()->getSubtarget().getRegisterInfo();
+ ZeroOperandPosition = I->findRegisterUseOperandIdx(Mips::ZERO, false, TRI);
+ BranchWithZeroOperand = ZeroOperandPosition != -1;
+ }
+
+ if (BranchWithZeroOperand) {
+ switch (NewOpc) { // opcodes not listed here keep NewOpc unchanged
+ case Mips::BEQC:
+ NewOpc = Mips::BEQZC;
+ break;
+ case Mips::BNEC:
+ NewOpc = Mips::BNEZC;
+ break;
+ case Mips::BGEC:
+ NewOpc = Mips::BGEZC;
+ break;
+ case Mips::BLTC:
+ NewOpc = Mips::BLTZC;
+ break;
+ case Mips::BEQC64:
+ NewOpc = Mips::BEQZC64;
+ break;
+ case Mips::BNEC64:
+ NewOpc = Mips::BNEZC64;
+ break;
+ }
+ }
+
+ MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc)); // reuses the debug location of the instruction at I
+
+ // For MIPSR6 JI*C requires an immediate 0 as an operand, JIALC(64) an
+ // immediate 0 as an operand and requires the removal of its %RA<imp-def>
+ // implicit operand as copying the implicit operations of the instruction we're
+ // looking at will give us the correct flags.
+ if (NewOpc == Mips::JIC || NewOpc == Mips::JIALC || NewOpc == Mips::JIC64 ||
+ NewOpc == Mips::JIALC64) {
+
+ if (NewOpc == Mips::JIALC || NewOpc == Mips::JIALC64)
+ MIB->RemoveOperand(0); // drops operand 0 of the freshly built instruction (the implicit def described above)
+
+ for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
+ MIB.addOperand(I->getOperand(J));
+ }
+
+ MIB.addImm(0); // the trailing immediate 0 required by the comment above
+
+ } else {
+ for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
+ if (BranchWithZeroOperand && (unsigned)ZeroOperandPosition == J) // the zero-register operand is omitted in the zero form
+ continue;
+
+ MIB.addOperand(I->getOperand(J));
+ }
+ }
+
+ MIB.copyImplicitOps(*I);
+
+ MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end()); // memory operands are carried over from the instruction at I
+ return MIB;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
new file mode 100644
index 000000000000..347b9187d08c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -0,0 +1,161 @@
+//===-- MipsInstrInfo.h - Mips Instruction Information ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+// FIXME: We need to override TargetInstrInfo::getInlineAsmLength method in
+// order for MipsLongBranch pass to work correctly when the code has inline
+// assembly. The returned value doesn't have to be the asm instruction's exact
+// size in bytes; MipsLongBranch only expects it to be the correct upper bound.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H
+
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "MipsGenInstrInfo.inc"
+
+namespace llvm {
+class MipsSubtarget;
+class MipsInstrInfo : public MipsGenInstrInfo { // Abstract base: several hooks below are pure virtual
+ virtual void anchor();
+protected:
+ const MipsSubtarget &Subtarget;
+ unsigned UncondBrOpc; // opcode emitted for unconditional branches
+
+public:
+ enum BranchType { // result of the six-argument analyzeBranch overload
+ BT_None, // Couldn't analyze branch.
+ BT_NoBranch, // No branches found.
+ BT_Uncond, // One unconditional branch.
+ BT_Cond, // One conditional branch.
+ BT_CondUncond, // A conditional branch followed by an unconditional branch.
+ BT_Indirect // One indirect branch.
+ };
+
+ explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc);
+
+ static const MipsInstrInfo *create(MipsSubtarget &STI); // factory choosing between the Mips16 and MipsSE implementations
+
+ /// Branch Analysis
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override; // the definition asserts that BytesRemoved is null
+
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override; // the definition asserts that BytesAdded is null
+
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ BranchType analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, // Mips-specific overload that also reports the examined branch instructions
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify,
+ SmallVectorImpl<MachineInstr *> &BranchInstrs) const;
+
+ /// Determine the opcode of a non-delay slot form for a branch if one exists.
+ unsigned getEquivalentCompactForm(const MachineBasicBlock::iterator I) const; // returns 0 when no compact form is chosen
+
+ /// Predicate to determine if an instruction can go in a forbidden slot.
+ bool SafeInForbiddenSlot(const MachineInstr &MI) const;
+
+ /// Predicate to determine if an instruction has a forbidden slot.
+ bool HasForbiddenSlot(const MachineInstr &MI) const;
+
+ /// Insert nop instruction when hazard condition is found
+ void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const MipsRegisterInfo &getRegisterInfo() const = 0;
+
+ virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0; // used by reverseBranchCondition
+
+ /// Return the number of bytes of code the specified instruction may be.
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override {
+ storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0); // forwards to the offset-taking hook with Offset = 0
+ }
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override {
+ loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0); // forwards to the offset-taking hook with Offset = 0
+ }
+
+ virtual void storeRegToStack(MachineBasicBlock &MBB, // subclass hook: same as storeRegToStackSlot plus an explicit Offset
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ int64_t Offset) const = 0;
+
+ virtual void loadRegFromStack(MachineBasicBlock &MBB, // subclass hook: same as loadRegFromStackSlot plus an explicit Offset
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ int64_t Offset) const = 0;
+
+ virtual void adjustStackPtr(unsigned SP, int64_t Amount,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const = 0;
+
+ /// Create an instruction which has the same operands and memory operands
+ /// as MI but has a new opcode.
+ MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc,
+ MachineBasicBlock::iterator I) const;
+
+protected:
+ bool isZeroImm(const MachineOperand &op) const; // true for an immediate operand equal to 0
+
+ MachineMemOperand *GetMemOperand(MachineBasicBlock &MBB, int FI,
+ MachineMemOperand::Flags Flags) const;
+
+private:
+ virtual unsigned getAnalyzableBrOpc(unsigned Opc) const = 0; // callers treat a zero result as "not an analyzable branch"
+
+ void AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
+ MachineBasicBlock *&BB,
+ SmallVectorImpl<MachineOperand> &Cond) const;
+
+ void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ const DebugLoc &DL, ArrayRef<MachineOperand> Cond) const;
+};
+
+/// Create MipsInstrInfo objects. MipsInstrInfo::create() picks one of these based on the subtarget mode.
+const MipsInstrInfo *createMips16InstrInfo(const MipsSubtarget &STI); // used when the subtarget is in MIPS16 mode
+const MipsInstrInfo *createMipsSEInstrInfo(const MipsSubtarget &STI); // used otherwise
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
new file mode 100644
index 000000000000..5bc48336121a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -0,0 +1,2868 @@
+//===- MipsInstrInfo.td - Target Description for Mips Target -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// Mips profiles and nodes
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; // shared by the MipsJmpLink and MipsTailCall nodes
+def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisSameAs<3, 4>,
+ SDTCisInt<4>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; // profile of the callseq_start node
+def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; // profile of the callseq_end node
+def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>; // shared by MipsMFHI and MipsMFLO
+def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+ SDTCisInt<1>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsMultDiv : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisInt<1>, // shared by the Mult/Multu and DivRem/DivRemU nodes
+ SDTCisSameAs<1, 2>]>;
+def SDT_MipsMAddMSub : SDTypeProfile<1, 3, // shared by the MAdd/MAddu/MSub/MSubu nodes
+ [SDTCisVT<0, untyped>, SDTCisSameAs<0, 3>,
+ SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsDivRem16 : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>; // shared by DivRem16 and DivRemU16
+
+def SDT_MipsThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
+def SDT_Sync : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def SDT_Ext : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>, SDTCisSameAs<2, 3>]>;
+def SDT_Ins : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>, SDTCisSameAs<2, 3>,
+ SDTCisSameAs<0, 4>]>;
+
+def SDTMipsLoadLR : SDTypeProfile<1, 2, // shared by the LWL/LWR/LDL/LDR load nodes
+ [SDTCisInt<0>, SDTCisPtrTy<1>,
+ SDTCisSameAs<0, 2>]>;
+
+// Call
+def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+
+// Tail call
+def MipsTailCall : SDNode<"MipsISD::TailCall", SDT_MipsJmpLink, // same profile as MipsJmpLink, but without SDNPOutGlue
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// Hi and Lo nodes are used to handle global addresses. Used on
+// MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol
+// static model. (nothing to do with Mips Registers Hi and Lo)
+def MipsHi : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
+def MipsLo : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
+def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
+
+// TlsGd node is used to handle General Dynamic TLS
+def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
+
+// TprelHi and TprelLo nodes are used to handle Local Exec TLS
+def MipsTprelHi : SDNode<"MipsISD::TprelHi", SDTIntUnaryOp>;
+def MipsTprelLo : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>;
+
+// Thread pointer
+def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
+
+// Return
+def MipsRet : SDNode<"MipsISD::Ret", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def MipsERet : SDNode<"MipsISD::ERet", SDTNone, // unlike MipsRet: marked SDNPSideEffect and not SDNPVariadic
+ [SDNPHasChain, SDNPOptInGlue, SDNPSideEffect]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPOptInGlue, SDNPOutGlue]>;
+
+// Nodes used to extract LO/HI registers.
+def MipsMFHI : SDNode<"MipsISD::MFHI", SDT_MFLOHI>;
+def MipsMFLO : SDNode<"MipsISD::MFLO", SDT_MFLOHI>;
+
+// Node used to insert 32-bit integers to LOHI register pair.
+def MipsMTLOHI : SDNode<"MipsISD::MTLOHI", SDT_MTLOHI>;
+
+// Mult nodes.
+def MipsMult : SDNode<"MipsISD::Mult", SDT_MipsMultDiv>;
+def MipsMultu : SDNode<"MipsISD::Multu", SDT_MipsMultDiv>;
+
+// MAdd*/MSub* nodes
+def MipsMAdd : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub>;
+def MipsMAddu : SDNode<"MipsISD::MAddu", SDT_MipsMAddMSub>;
+def MipsMSub : SDNode<"MipsISD::MSub", SDT_MipsMAddMSub>;
+def MipsMSubu : SDNode<"MipsISD::MSubu", SDT_MipsMAddMSub>;
+
+// DivRem(u) nodes
+def MipsDivRem : SDNode<"MipsISD::DivRem", SDT_MipsMultDiv>; // shares its type profile with the Mult nodes
+def MipsDivRemU : SDNode<"MipsISD::DivRemU", SDT_MipsMultDiv>;
+def MipsDivRem16 : SDNode<"MipsISD::DivRem16", SDT_MipsDivRem16, // the *16 variants use a result-less profile and produce glue instead
+ [SDNPOutGlue]>;
+def MipsDivRemU16 : SDNode<"MipsISD::DivRemU16", SDT_MipsDivRem16,
+ [SDNPOutGlue]>;
+
+// Target constant nodes that are not part of any isel patterns and remain
+// unchanged can cause instructions with illegal operands to be emitted.
+// Wrapper node patterns give the instruction selector a chance to replace
+// target constant nodes that would otherwise remain unchanged with ADDiu
+// nodes. Without these wrapper node patterns, the following conditional move
+// instruction is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
+// compiled:
+// movn %got(d)($gp), %got(c)($gp), $4
+// This instruction is illegal since movn can take only register operands.
+
+def MipsWrapper : SDNode<"MipsISD::Wrapper", SDTIntBinOp>;
+
+def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
+
+def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>;
+def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>;
+
+def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR, // the four L*L/L*R nodes share SDTMipsLoadLR and the may-load properties
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLWR : SDNode<"MipsISD::LWR", SDTMipsLoadLR,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSWL : SDNode<"MipsISD::SWL", SDTStore, // the four S*L/S*R nodes use the generic SDTStore profile and the may-store properties
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSWR : SDNode<"MipsISD::SWR", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsLDL : SDNode<"MipsISD::LDL", SDTMipsLoadLR,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLDR : SDNode<"MipsISD::LDR", SDTMipsLoadLR,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSDL : SDNode<"MipsISD::SDL", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSDR : SDNode<"MipsISD::SDR", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+def HasMips2 : Predicate<"Subtarget->hasMips2()">, // pattern used by most entries: a Subtarget query string paired with an assembler feature name
+ AssemblerPredicate<"FeatureMips2">;
+def HasMips3_32 : Predicate<"Subtarget->hasMips3_32()">,
+ AssemblerPredicate<"FeatureMips3_32">;
+def HasMips3_32r2 : Predicate<"Subtarget->hasMips3_32r2()">,
+ AssemblerPredicate<"FeatureMips3_32r2">;
+def HasMips3 : Predicate<"Subtarget->hasMips3()">,
+ AssemblerPredicate<"FeatureMips3">;
+def NotMips3 : Predicate<"!Subtarget->hasMips3()">, // negated forms prefix both strings with '!'
+ AssemblerPredicate<"!FeatureMips3">;
+def HasMips4_32 : Predicate<"Subtarget->hasMips4_32()">,
+ AssemblerPredicate<"FeatureMips4_32">;
+def NotMips4_32 : Predicate<"!Subtarget->hasMips4_32()">,
+ AssemblerPredicate<"!FeatureMips4_32">;
+def HasMips4_32r2 : Predicate<"Subtarget->hasMips4_32r2()">,
+ AssemblerPredicate<"FeatureMips4_32r2">;
+def HasMips5_32r2 : Predicate<"Subtarget->hasMips5_32r2()">,
+ AssemblerPredicate<"FeatureMips5_32r2">;
+def HasMips32 : Predicate<"Subtarget->hasMips32()">,
+ AssemblerPredicate<"FeatureMips32">;
+def HasMips32r2 : Predicate<"Subtarget->hasMips32r2()">,
+ AssemblerPredicate<"FeatureMips32r2">;
+def HasMips32r5 : Predicate<"Subtarget->hasMips32r5()">,
+ AssemblerPredicate<"FeatureMips32r5">;
+def HasMips32r6 : Predicate<"Subtarget->hasMips32r6()">,
+ AssemblerPredicate<"FeatureMips32r6">;
+def NotMips32r6 : Predicate<"!Subtarget->hasMips32r6()">,
+ AssemblerPredicate<"!FeatureMips32r6">;
+def IsGP64bit : Predicate<"Subtarget->isGP64bit()">,
+ AssemblerPredicate<"FeatureGP64Bit">;
+def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">, // defined as the negation of IsGP64bit
+ AssemblerPredicate<"!FeatureGP64Bit">;
+def IsPTR64bit : Predicate<"Subtarget->isABI_N64()">, // codegen side is keyed on the N64 ABI query
+ AssemblerPredicate<"FeaturePTR64Bit">;
+def IsPTR32bit : Predicate<"!Subtarget->isABI_N64()">,
+ AssemblerPredicate<"!FeaturePTR64Bit">;
+def HasMips64 : Predicate<"Subtarget->hasMips64()">,
+ AssemblerPredicate<"FeatureMips64">;
+def NotMips64 : Predicate<"!Subtarget->hasMips64()">,
+ AssemblerPredicate<"!FeatureMips64">;
+def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">,
+ AssemblerPredicate<"FeatureMips64r2">;
+def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">,
+ AssemblerPredicate<"FeatureMips64r6">;
+def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">,
+ AssemblerPredicate<"!FeatureMips64r6">;
+def HasMicroMips32r6 : Predicate<"Subtarget->inMicroMips32r6Mode()">,
+ AssemblerPredicate<"FeatureMicroMips,FeatureMips32r6">; // assembler side lists two features
+def HasMicroMips64r6 : Predicate<"Subtarget->inMicroMips64r6Mode()">,
+ AssemblerPredicate<"FeatureMicroMips,FeatureMips64r6">;
+def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">,
+ AssemblerPredicate<"FeatureMips16">;
+def HasCnMips : Predicate<"Subtarget->hasCnMips()">,
+ AssemblerPredicate<"FeatureCnMips">;
+def NotCnMips : Predicate<"!Subtarget->hasCnMips()">,
+ AssemblerPredicate<"!FeatureCnMips">;
+def RelocNotPIC : Predicate<"!TM.isPositionIndependent()">; // no AssemblerPredicate attached; queries TM rather than Subtarget
+def RelocPIC : Predicate<"TM.isPositionIndependent()">;
+def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
+def HasStdEnc : Predicate<"Subtarget->hasStandardEncoding()">,
+ AssemblerPredicate<"!FeatureMips16">; // assembler side expresses this as the absence of FeatureMips16
+def NotDSP : Predicate<"!Subtarget->hasDSP()">; // no AssemblerPredicate attached
+def InMicroMips : Predicate<"Subtarget->inMicroMipsMode()">,
+ AssemblerPredicate<"FeatureMicroMips">;
+def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">,
+ AssemblerPredicate<"!FeatureMicroMips">;
+def IsLE : Predicate<"Subtarget->isLittle()">; // no AssemblerPredicate attached
+def IsBE : Predicate<"!Subtarget->isLittle()">; // defined as the negation of IsLE
+def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
+def UseTCCInDIV : AssemblerPredicate<"FeatureUseTCCInDIV">; // assembler-only: derives from AssemblerPredicate alone
+def HasEVA : Predicate<"Subtarget->hasEVA()">,
+ AssemblerPredicate<"FeatureEVA,FeatureMips32r2">; // assembler side lists FeatureMips32r2 alongside FeatureEVA
+def HasMSA : Predicate<"Subtarget->hasMSA()">,
+ AssemblerPredicate<"FeatureMSA">;
+
+
+//===----------------------------------------------------------------------===//
+// Mips GPR size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class GPR_32 { list<Predicate> GPRPredicates = [IsGP32bit]; } // mix-in that sets GPRPredicates to the 32-bit GPR predicate
+class GPR_64 { list<Predicate> GPRPredicates = [IsGP64bit]; } // mix-in that sets GPRPredicates to the 64-bit GPR predicate
+
+class PTR_32 { list<Predicate> PTRPredicates = [IsPTR32bit]; } // mix-in that sets PTRPredicates to the 32-bit pointer predicate
+class PTR_64 { list<Predicate> PTRPredicates = [IsPTR64bit]; } // mix-in that sets PTRPredicates to the 64-bit pointer predicate
+
+//===----------------------------------------------------------------------===//
+// Mips ISA/ASE membership and instruction group membership adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+// FIXME: I'd prefer to use additive predicates to build the instruction sets
+// but we are short on assembler feature bits at the moment. Using a
+// subtractive predicate will hopefully keep us under the 32 predicate
+// limit long enough to develop an alternative way to handle P1||P2
+// predicates.
+class ISA_MIPS1_NOT_MIPS3 {
+ list<Predicate> InsnPredicates = [NotMips3];
+}
+class ISA_MIPS1_NOT_4_32 {
+ list<Predicate> InsnPredicates = [NotMips4_32];
+}
+class ISA_MIPS1_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS2 { list<Predicate> InsnPredicates = [HasMips2]; }
+class ISA_MIPS2_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips2, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS3 { list<Predicate> InsnPredicates = [HasMips3]; }
+class ISA_MIPS3_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS32 { list<Predicate> InsnPredicates = [HasMips32]; }
+class ISA_MIPS32_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips32, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; }
+class ISA_MIPS32R2_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS32R5 { list<Predicate> InsnPredicates = [HasMips32r5]; }
+class ISA_MIPS64 { list<Predicate> InsnPredicates = [HasMips64]; }
+class ISA_MIPS64_NOT_64R6 {
+ list<Predicate> InsnPredicates = [HasMips64, NotMips64r6];
+}
+class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; }
+class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; }
+class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; }
+class ISA_MICROMIPS { list<Predicate> InsnPredicates = [InMicroMips]; }
+class ISA_MICROMIPS32R6 {
+ list<Predicate> InsnPredicates = [HasMicroMips32r6];
+}
+class ISA_MICROMIPS64R6 {
+ list<Predicate> InsnPredicates = [HasMicroMips64r6];
+}
+class ISA_MICROMIPS32_NOT_MIPS32R6 {
+ list<Predicate> InsnPredicates = [InMicroMips, NotMips32r6];
+}
+
+class INSN_EVA { list<Predicate> InsnPredicates = [HasEVA]; }
+class INSN_EVA_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6, HasEVA];
+}
+
+// The portions of MIPS-III that were also added to MIPS32
+class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; }
+
+// The portions of MIPS-III that were also added to MIPS32 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS3_32_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips3_32, NotMips32r6, NotMips64r6];
+}
+
+// The portions of MIPS-III that were also added to MIPS32r2.
+class INSN_MIPS3_32R2 { list<Predicate> InsnPredicates = [HasMips3_32r2]; }
+
+// The portions of MIPS-IV that were also added to MIPS32.
+class INSN_MIPS4_32 { list <Predicate> InsnPredicates = [HasMips4_32]; }
+
+// The portions of MIPS-IV that were also added to MIPS32 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS4_32_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips4_32, NotMips32r6, NotMips64r6];
+}
+
+// The portions of MIPS-IV that were also added to MIPS32r2 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS4_32R2_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips4_32r2, NotMips32r6, NotMips64r6];
+}
+
+// The portions of MIPS-IV that were also added to MIPS32r2.
+class INSN_MIPS4_32R2 {
+ list<Predicate> InsnPredicates = [HasMips4_32r2];
+}
+
+// The portions of MIPS-V that were also added to MIPS32r2 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS5_32R2_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6];
+}
+
+class ASE_CNMIPS {
+ list<Predicate> InsnPredicates = [HasCnMips];
+}
+
+class NOT_ASE_CNMIPS {
+ list<Predicate> InsnPredicates = [NotCnMips];
+}
+
+class ASE_MIPS64_CNMIPS {
+ list<Predicate> InsnPredicates = [HasMips64, HasCnMips];
+}
+
+class ASE_MSA {
+ list<Predicate> InsnPredicates = [HasMSA];
+}
+
+class ASE_MSA_NOT_MSA64 {
+ list<Predicate> InsnPredicates = [HasMSA, NotMips64];
+}
+
+class ASE_MSA64 {
+ list<Predicate> InsnPredicates = [HasMSA, HasMips64];
+}
+
+// Class used for separating microMIPSr6 and microMIPS (r3) instructions.
+// It can be used only on instructions that don't inherit PredicateControl.
+class ISA_MICROMIPS_NOT_32R6_64R6 : PredicateControl {
+ let InsnPredicates = [InMicroMips, NotMips32r6, NotMips64r6];
+}
+
+class ASE_NOT_DSP {
+ list<Predicate> InsnPredicates = [NotDSP];
+}
+
+//===----------------------------------------------------------------------===//
+
+// Selection pattern wrapper: a Pat that also mixes in PredicateControl and
+// sets its EncodingPredicates to [HasStdEnc].
+class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl {
+ let EncodingPredicates = [HasStdEnc];
+}
+
+class MipsInstAlias<string Asm, dag Result, bit Emit = 0b1> :
+ InstAlias<Asm, Result, Emit>, PredicateControl;
+
+class IsCommutable {
+ bit isCommutable = 1;
+}
+
+class IsBranch {
+ bit isBranch = 1;
+ bit isCTI = 1;
+}
+
+class IsReturn {
+ bit isReturn = 1;
+ bit isCTI = 1;
+}
+
+class IsCall {
+ bit isCall = 1;
+ bit isCTI = 1;
+}
+
+// Adjective class bundling the flags set on tail-call instructions: the
+// instruction is marked as a call, a terminator, a return, a barrier and a
+// CTI, is codegen-only, and has extra source register allocation requirements.
+class IsTailCall {
+ bit isCall = 1;
+ bit isTerminator = 1;
+ bit isReturn = 1;
+ bit isBarrier = 1;
+ bit hasExtraSrcRegAllocReq = 1;
+ bit isCodeGenOnly = 1;
+ bit isCTI = 1;
+}
+
+class IsAsCheapAsAMove {
+ bit isAsCheapAsAMove = 1;
+}
+
+class NeverHasSideEffects {
+ bit hasSideEffects = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// Assembler operand class for a constant signed immediate.
+// Bits - bit width, pasted into the class name, the render/predicate
+// method template arguments and the diagnostic type.
+// Supers - becomes the SuperClasses list.
+// Offset - pasted alongside Bits into the same strings (defaults to 0).
+class ConstantSImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
+ int Offset = 0> : AsmOperandClass {
+ let Name = "ConstantSImm" # Bits # "_" # Offset;
+ let RenderMethod = "addConstantSImmOperands<" # Bits # ", " # Offset # ">";
+ let PredicateMethod = "isConstantSImm<" # Bits # ", " # Offset # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "SImm" # Bits # "_" # Offset;
+}
+
+// Assembler operand class for a signed immediate checked with
+// isScaledSImm<Bits, Shift> and rendered with the plain addImmOperands method.
+// Bits - bit width, pasted into the name, predicate and diagnostic type.
+// Supers - becomes the SuperClasses list.
+// Shift - shift amount pasted next to Bits (defaults to 0).
+class SimmLslAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
+ int Shift = 0> : AsmOperandClass {
+ let Name = "Simm" # Bits # "_Lsl" # Shift;
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<" # Bits # ", " # Shift # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "SImm" # Bits # "_Lsl" # Shift;
+}
+
+// Assembler operand class for a constant unsigned immediate.
+// Bits - bit width, pasted into the class name, the render/predicate
+// method template arguments and the diagnostic type.
+// Supers - becomes the SuperClasses list.
+// Offset - pasted alongside Bits into the same strings (defaults to 0).
+// Because Offset is pasted into Name and DiagnosticType, definitions using a
+// negative offset need to override those two fields.
+class ConstantUImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
+ int Offset = 0> : AsmOperandClass {
+ let Name = "ConstantUImm" # Bits # "_" # Offset;
+ let RenderMethod = "addConstantUImmOperands<" # Bits # ", " # Offset # ">";
+ let PredicateMethod = "isConstantUImm<" # Bits # ", " # Offset # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "UImm" # Bits # "_" # Offset;
+}
+
+// Assembler operand class for a constant unsigned immediate checked with
+// isConstantUImmRange<Bottom, Top> and rendered with addImmOperands.
+// Bottom, Top - range bounds, pasted into the name, predicate and diagnostic
+// type.
+// Supers - becomes the SuperClasses list.
+class ConstantUImmRangeAsmOperandClass<int Bottom, int Top,
+ list<AsmOperandClass> Supers = []>
+ : AsmOperandClass {
+ let Name = "ConstantUImmRange" # Bottom # "_" # Top;
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isConstantUImmRange<" # Bottom # ", " # Top # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "UImmRange" # Bottom # "_" # Top;
+}
+
+// Assembler operand class for a signed immediate of Bits bits, using the
+// isSImm<Bits> predicate and the addSImmOperands<Bits> render method.
+// Supers becomes the SuperClasses list.
+class SImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
+ : AsmOperandClass {
+ let Name = "SImm" # Bits;
+ let RenderMethod = "addSImmOperands<" # Bits # ">";
+ let PredicateMethod = "isSImm<" # Bits # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "SImm" # Bits;
+}
+
+// Assembler operand class for an unsigned immediate of Bits bits, using the
+// isUImm<Bits> predicate and the addUImmOperands<Bits> render method.
+// Supers becomes the SuperClasses list.
+class UImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
+ : AsmOperandClass {
+ let Name = "UImm" # Bits;
+ let RenderMethod = "addUImmOperands<" # Bits # ">";
+ let PredicateMethod = "isUImm<" # Bits # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "UImm" # Bits;
+}
+
+// Generic case - only to support certain assembly pseudo instructions.
+// Name and DiagnosticType are the fixed string "ImmAny" regardless of Bits,
+// the predicate is the signed check isSImm<Bits>, and rendering always uses
+// addConstantUImmOperands<32>. Supers becomes the SuperClasses list.
+class UImmAnyAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
+ : AsmOperandClass {
+ let Name = "ImmAny";
+ let RenderMethod = "addConstantUImmOperands<32>";
+ let PredicateMethod = "isSImm<" # Bits # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "ImmAny";
+}
+
+// AsmOperandClasses require a strict ordering which is difficult to manage
+// as a hierarchy. Instead, we use a linear ordering and impose an order that
+// is in some places arbitrary.
+//
+// Here are the rules that are in use:
+// * Wider immediates are a superset of narrower immediates:
+// uimm4 < uimm5 < uimm6
+// * For the same bit-width, unsigned immediates are a superset of signed
+// immediates:
+// simm4 < uimm4 < simm5 < uimm5
+// * For the same upper-bound, signed immediates are a superset of unsigned
+// immediates:
+// uimm3 < simm4 < uimm4 < simm5
+// * Modified immediates are a superset of ordinary immediates:
+// uimm5 < uimm5_plus1 (1..32) < uimm5_plus32 (32..63) < uimm6
+// The term 'superset' starts to break down here since the uimm5_plus* classes
+// are not true supersets of uimm5 (but they are still subsets of uimm6).
+// * 'Relaxed' immediates are supersets of the corresponding unsigned immediate.
+// uimm16 < uimm16_relaxed
+// * The codeGen pattern type is arbitrarily ordered.
+// uimm5 < uimm5_64, and uimm5 < vsplat_uimm5
+// This is entirely arbitrary. We need an ordering and what we pick is
+// unimportant since only one is possible for a given mnemonic.
+
+def UImm32CoercedAsmOperandClass : UImmAnyAsmOperandClass<33, []> {
+ let Name = "UImm32_Coerced";
+ let DiagnosticType = "UImm32_Coerced";
+}
+def SImm32RelaxedAsmOperandClass
+ : SImmAsmOperandClass<32, [UImm32CoercedAsmOperandClass]> {
+ let Name = "SImm32_Relaxed";
+ let PredicateMethod = "isAnyImm<32>";
+ let DiagnosticType = "SImm32_Relaxed";
+}
+def SImm32AsmOperandClass
+ : SImmAsmOperandClass<32, [SImm32RelaxedAsmOperandClass]>;
+def ConstantUImm26AsmOperandClass
+ : ConstantUImmAsmOperandClass<26, [SImm32AsmOperandClass]>;
+def ConstantUImm20AsmOperandClass
+ : ConstantUImmAsmOperandClass<20, [ConstantUImm26AsmOperandClass]>;
+def ConstantSImm19Lsl2AsmOperandClass : AsmOperandClass {
+ let Name = "SImm19Lsl2";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<19, 2>";
+ let SuperClasses = [ConstantUImm20AsmOperandClass];
+ let DiagnosticType = "SImm19_Lsl2";
+}
+def UImm16RelaxedAsmOperandClass
+ : UImmAsmOperandClass<16, [ConstantUImm20AsmOperandClass]> {
+ let Name = "UImm16_Relaxed";
+ let PredicateMethod = "isAnyImm<16>";
+ let DiagnosticType = "UImm16_Relaxed";
+}
+// Similar to the relaxed classes which take an SImm and render it as
+// an UImm, this takes a UImm and renders it as an SImm.
+def UImm16AltRelaxedAsmOperandClass
+ : SImmAsmOperandClass<16, [UImm16RelaxedAsmOperandClass]> {
+ let Name = "UImm16_AltRelaxed";
+ let PredicateMethod = "isUImm<16>";
+ let DiagnosticType = "UImm16_AltRelaxed";
+}
+// FIXME: One of these should probably have UImm16AsmOperandClass as the
+// superclass instead of UImm16RelaxedAsmOperandClass.
+def UImm16AsmOperandClass
+ : UImmAsmOperandClass<16, [UImm16RelaxedAsmOperandClass]>;
+def SImm16RelaxedAsmOperandClass
+ : SImmAsmOperandClass<16, [UImm16RelaxedAsmOperandClass]> {
+ let Name = "SImm16_Relaxed";
+ let PredicateMethod = "isAnyImm<16>";
+ let DiagnosticType = "SImm16_Relaxed";
+}
+def SImm16AsmOperandClass
+ : SImmAsmOperandClass<16, [SImm16RelaxedAsmOperandClass]>;
+def ConstantSImm10Lsl3AsmOperandClass : AsmOperandClass {
+ let Name = "SImm10Lsl3";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<10, 3>";
+ let SuperClasses = [SImm16AsmOperandClass];
+ let DiagnosticType = "SImm10_Lsl3";
+}
+def ConstantSImm10Lsl2AsmOperandClass : AsmOperandClass {
+ let Name = "SImm10Lsl2";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<10, 2>";
+ let SuperClasses = [ConstantSImm10Lsl3AsmOperandClass];
+ let DiagnosticType = "SImm10_Lsl2";
+}
+def ConstantSImm11AsmOperandClass
+ : ConstantSImmAsmOperandClass<11, [ConstantSImm10Lsl2AsmOperandClass]>;
+def ConstantSImm10Lsl1AsmOperandClass : AsmOperandClass {
+ let Name = "SImm10Lsl1";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<10, 1>";
+ let SuperClasses = [ConstantSImm11AsmOperandClass];
+ let DiagnosticType = "SImm10_Lsl1";
+}
+def ConstantUImm10AsmOperandClass
+ : ConstantUImmAsmOperandClass<10, [ConstantSImm10Lsl1AsmOperandClass]>;
+def ConstantSImm10AsmOperandClass
+ : ConstantSImmAsmOperandClass<10, [ConstantUImm10AsmOperandClass]>;
+def ConstantSImm9AsmOperandClass
+ : ConstantSImmAsmOperandClass<9, [ConstantSImm10AsmOperandClass]>;
+def ConstantSImm7Lsl2AsmOperandClass : AsmOperandClass {
+ let Name = "SImm7Lsl2";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<7, 2>";
+ let SuperClasses = [ConstantSImm9AsmOperandClass];
+ let DiagnosticType = "SImm7_Lsl2";
+}
+def ConstantUImm8AsmOperandClass
+ : ConstantUImmAsmOperandClass<8, [ConstantSImm7Lsl2AsmOperandClass]>;
+def ConstantUImm7Sub1AsmOperandClass
+ : ConstantUImmAsmOperandClass<7, [ConstantUImm8AsmOperandClass], -1> {
+ // Specify the names since the -1 offset causes invalid identifiers otherwise.
+ let Name = "UImm7_N1";
+ let DiagnosticType = "UImm7_N1";
+}
+def ConstantUImm7AsmOperandClass
+ : ConstantUImmAsmOperandClass<7, [ConstantUImm7Sub1AsmOperandClass]>;
+def ConstantUImm6Lsl2AsmOperandClass : AsmOperandClass {
+ let Name = "UImm6Lsl2";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledUImm<6, 2>";
+ let SuperClasses = [ConstantUImm7AsmOperandClass];
+ let DiagnosticType = "UImm6_Lsl2";
+}
+def ConstantUImm6AsmOperandClass
+ : ConstantUImmAsmOperandClass<6, [ConstantUImm6Lsl2AsmOperandClass]>;
+def ConstantSImm6AsmOperandClass
+ : ConstantSImmAsmOperandClass<6, [ConstantUImm6AsmOperandClass]>;
+def ConstantUImm5Lsl2AsmOperandClass : AsmOperandClass {
+ let Name = "UImm5Lsl2";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledUImm<5, 2>";
+ let SuperClasses = [ConstantSImm6AsmOperandClass];
+ let DiagnosticType = "UImm5_Lsl2";
+}
+def ConstantUImm5_Range2_64AsmOperandClass
+ : ConstantUImmRangeAsmOperandClass<2, 64, [ConstantUImm5Lsl2AsmOperandClass]>;
+def ConstantUImm5Plus33AsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm5_Range2_64AsmOperandClass],
+ 33>;
+def ConstantUImm5ReportUImm6AsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm5Plus33AsmOperandClass]> {
+ let Name = "ConstantUImm5_0_Report_UImm6";
+ let DiagnosticType = "UImm5_0_Report_UImm6";
+}
+def ConstantUImm5Plus32AsmOperandClass
+ : ConstantUImmAsmOperandClass<
+ 5, [ConstantUImm5ReportUImm6AsmOperandClass], 32>;
+def ConstantUImm5Plus32NormalizeAsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm5Plus32AsmOperandClass], 32> {
+ let Name = "ConstantUImm5_32_Norm";
+ // We must also subtract 32 when we render the operand.
+ let RenderMethod = "addConstantUImmOperands<5, 32, -32>";
+}
+def ConstantUImm5Plus1AsmOperandClass
+ : ConstantUImmAsmOperandClass<
+ 5, [ConstantUImm5Plus32NormalizeAsmOperandClass], 1>;
+def ConstantUImm5AsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm5Plus1AsmOperandClass]>;
+def ConstantSImm5AsmOperandClass
+ : ConstantSImmAsmOperandClass<5, [ConstantUImm5AsmOperandClass]>;
+def ConstantUImm4AsmOperandClass
+ : ConstantUImmAsmOperandClass<4, [ConstantSImm5AsmOperandClass]>;
+def ConstantSImm4AsmOperandClass
+ : ConstantSImmAsmOperandClass<4, [ConstantUImm4AsmOperandClass]>;
+def ConstantUImm3AsmOperandClass
+ : ConstantUImmAsmOperandClass<3, [ConstantSImm4AsmOperandClass]>;
+// 2-bit unsigned immediate with an offset of 1, followed by the plain 2-bit
+// unsigned immediate.
+// NOTE(review): both classes name ConstantUImm3AsmOperandClass as their
+// superclass, so ConstantUImm2Plus1AsmOperandClass is not part of the single
+// linear chain the other classes form - confirm this is intentional.
+def ConstantUImm2Plus1AsmOperandClass
+ : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass], 1>;
+def ConstantUImm2AsmOperandClass
+ : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass]>;
+def ConstantUImm1AsmOperandClass
+ : ConstantUImmAsmOperandClass<1, [ConstantUImm2AsmOperandClass]>;
+def ConstantImmzAsmOperandClass : AsmOperandClass {
+ let Name = "ConstantImmz";
+ let RenderMethod = "addConstantUImmOperands<1>";
+ let PredicateMethod = "isConstantImmz";
+ let SuperClasses = [ConstantUImm1AsmOperandClass];
+ let DiagnosticType = "Immz";
+}
+
+def Simm19Lsl2AsmOperand
+ : SimmLslAsmOperandClass<19, [], 2>;
+
+def MipsJumpTargetAsmOperand : AsmOperandClass {
+ let Name = "JumpTarget";
+ let ParserMethod = "parseJumpTarget";
+ let PredicateMethod = "isImm";
+ let RenderMethod = "addImmOperands";
+}
+
+// Instruction operand types
+def jmptarget : Operand<OtherVT> {
+ let EncoderMethod = "getJumpTargetOpValue";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+def brtarget : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValue";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+def brtarget1SImm16 : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValue1SImm16";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget1SImm16";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+def calltarget : Operand<iPTR> {
+ let EncoderMethod = "getJumpTargetOpValue";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def imm64: Operand<i64>;
+
+def simm19_lsl2 : Operand<i32> {
+ let EncoderMethod = "getSimm19Lsl2Encoding";
+ let DecoderMethod = "DecodeSimm19Lsl2";
+ let ParserMatchClass = Simm19Lsl2AsmOperand;
+}
+
+def simm18_lsl3 : Operand<i32> {
+ let EncoderMethod = "getSimm18Lsl3Encoding";
+ let DecoderMethod = "DecodeSimm18Lsl3";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+// Zero
+def uimmz : Operand<i32> {
+ let PrintMethod = "printUImm<0>";
+ let ParserMatchClass = ConstantImmzAsmOperandClass;
+}
+
+// size operand of ins instruction
+def uimm_range_2_64 : Operand<i32> {
+ let PrintMethod = "printUImm<6, 2>";
+ let EncoderMethod = "getSizeInsEncoding";
+ let DecoderMethod = "DecodeInsSize";
+ let ParserMatchClass = ConstantUImm5_Range2_64AsmOperandClass;
+}
+
+// Unsigned Operands
+foreach I = {1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 26} in
+ def uimm # I : Operand<i32> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+ }
+
+def uimm2_plus1 : Operand<i32> {
+ let PrintMethod = "printUImm<2, 1>";
+ let EncoderMethod = "getUImmWithOffsetEncoding<2, 1>";
+ let DecoderMethod = "DecodeUImmWithOffset<2, 1>";
+ let ParserMatchClass = ConstantUImm2Plus1AsmOperandClass;
+}
+
+def uimm5_plus1 : Operand<i32> {
+ let PrintMethod = "printUImm<5, 1>";
+ let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>";
+ let DecoderMethod = "DecodeUImmWithOffset<5, 1>";
+ let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass;
+}
+
+def uimm5_plus32 : Operand<i32> {
+ let PrintMethod = "printUImm<5, 32>";
+ let ParserMatchClass = ConstantUImm5Plus32AsmOperandClass;
+}
+
+// 5-bit unsigned immediate printed with an offset of 33 and matched by
+// ConstantUImm5Plus33AsmOperandClass.
+// NOTE(review): the encoder and decoder methods use an offset of 1 while the
+// print method and parser class use 33 - confirm this difference is intended.
+def uimm5_plus33 : Operand<i32> {
+ let PrintMethod = "printUImm<5, 33>";
+ let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>";
+ let DecoderMethod = "DecodeUImmWithOffset<5, 1>";
+ let ParserMatchClass = ConstantUImm5Plus33AsmOperandClass;
+}
+
+def uimm5_inssize_plus1 : Operand<i32> {
+ let PrintMethod = "printUImm<6>";
+ let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass;
+ let EncoderMethod = "getSizeInsEncoding";
+ let DecoderMethod = "DecodeInsSize";
+}
+
+def uimm5_plus32_normalize : Operand<i32> {
+ let PrintMethod = "printUImm<5>";
+ let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass;
+}
+
+def uimm5_lsl2 : Operand<OtherVT> {
+ let EncoderMethod = "getUImm5Lsl2Encoding";
+ let DecoderMethod = "DecodeUImmWithOffsetAndScale<5, 0, 4>";
+ let ParserMatchClass = ConstantUImm5Lsl2AsmOperandClass;
+}
+
+def uimm5_plus32_normalize_64 : Operand<i64> {
+ let PrintMethod = "printUImm<5>";
+ let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass;
+}
+
+def uimm6_lsl2 : Operand<OtherVT> {
+ let EncoderMethod = "getUImm6Lsl2Encoding";
+ let DecoderMethod = "DecodeUImmWithOffsetAndScale<6, 0, 4>";
+ let ParserMatchClass = ConstantUImm6Lsl2AsmOperandClass;
+}
+
+foreach I = {16} in
+ def uimm # I : Operand<i32> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("UImm" # I # "AsmOperandClass");
+ }
+
+// Like uimm16 but coerces simm16 to uimm16.
+def uimm16_relaxed : Operand<i32> {
+ let PrintMethod = "printUImm<16>";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("UImm16RelaxedAsmOperandClass");
+}
+
+foreach I = {5} in
+ def uimm # I # _64 : Operand<i64> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+ }
+
+foreach I = {16} in
+ def uimm # I # _64 : Operand<i64> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("UImm" # I # "AsmOperandClass");
+ }
+
+// Like uimm16_64 but coerces simm16 to uimm16.
+def uimm16_64_relaxed : Operand<i64> {
+ let PrintMethod = "printUImm<16>";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("UImm16RelaxedAsmOperandClass");
+}
+
+def uimm16_altrelaxed : Operand<i32> {
+ let PrintMethod = "printUImm<16>";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("UImm16AltRelaxedAsmOperandClass");
+}
+// Like uimm5 but reports a less confusing error for 32-63 when
+// an instruction alias permits that.
+def uimm5_report_uimm6 : Operand<i32> {
+ let PrintMethod = "printUImm<5>";
+ let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass;
+}
+
+// Like uimm5_64 but reports a less confusing error for 32-63 when
+// an instruction alias permits that.
+def uimm5_64_report_uimm6 : Operand<i64> {
+ let PrintMethod = "printUImm<5>";
+ let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass;
+}
+
+foreach I = {1, 2, 3, 4} in
+ def uimm # I # _ptr : Operand<iPTR> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+ }
+
+foreach I = {1, 2, 3, 4, 5, 6, 8} in
+ def vsplat_uimm # I : Operand<vAny> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+ }
+
+// Signed operands
+foreach I = {4, 5, 6, 9, 10, 11} in
+ def simm # I : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantSImm" # I # "AsmOperandClass");
+ }
+
+foreach I = {1, 2, 3} in
+ def simm10_lsl # I : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<10, " # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantSImm10Lsl" # I # "AsmOperandClass");
+ }
+
+foreach I = {10} in
+ def simm # I # _64 : Operand<i64> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantSImm" # I # "AsmOperandClass");
+ }
+
+foreach I = {5, 10} in
+ def vsplat_simm # I : Operand<vAny> {
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantSImm" # I # "AsmOperandClass");
+ }
+
+// Signed immediate operand matched by ConstantSImm7Lsl2AsmOperandClass
+// (predicate isScaledSImm<7, 2>) and encoded with getSImm7Lsl2Encoding.
+def simm7_lsl2 : Operand<OtherVT> {
+ let EncoderMethod = "getSImm7Lsl2Encoding";
+ // The bit width is written literally: this def is not inside a foreach, so
+ // there is no loop variable I to paste into the decoder method name.
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<7, 0, 4>";
+ let ParserMatchClass = ConstantSImm7Lsl2AsmOperandClass;
+}
+
+foreach I = {16, 32} in
+ def simm # I : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ">";
+ let ParserMatchClass = !cast<AsmOperandClass>("SImm" # I # "AsmOperandClass");
+ }
+
+// Like simm16 but coerces uimm16 to simm16.
+def simm16_relaxed : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<16>";
+ let ParserMatchClass = !cast<AsmOperandClass>("SImm16RelaxedAsmOperandClass");
+}
+
+def simm16_64 : Operand<i64> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<16>";
+ let ParserMatchClass = !cast<AsmOperandClass>("SImm16AsmOperandClass");
+}
+
+// Like simm32 but coerces simm32 to uimm32.
+def uimm32_coerced : Operand<i32> {
+ let ParserMatchClass = !cast<AsmOperandClass>("UImm32CoercedAsmOperandClass");
+}
+// Like simm32 but coerces uimm32 to simm32.
+def simm32_relaxed : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<32>";
+ let ParserMatchClass = !cast<AsmOperandClass>("SImm32RelaxedAsmOperandClass");
+}
+
+// This is almost the same as a uimm7 but 0x7f is interpreted as -1.
+def li16_imm : Operand<i32> {
+ let DecoderMethod = "DecodeLi16Imm";
+ let ParserMatchClass = ConstantUImm7Sub1AsmOperandClass;
+}
+
+def MipsMemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
+ let ParserMethod = "parseMemOperand";
+}
+
+def MipsMemSimm9AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm9";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<9>";
+ let DiagnosticType = "MemSImm9";
+}
+
+def MipsMemSimm10AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm10";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<10>";
+ let DiagnosticType = "MemSImm10";
+}
+
+def MipsMemSimm12AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm12";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<12>";
+ let DiagnosticType = "MemSImm12";
+}
+
+foreach I = {1, 2, 3} in
+ def MipsMemSimm10Lsl # I # AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm10_" # I;
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<10, " # I # ">";
+ let DiagnosticType = "MemSImm10Lsl" # I;
+ }
+
+def MipsMemSimm11AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm11";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<11>";
+ let DiagnosticType = "MemSImm11";
+}
+
+def MipsMemSimm16AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm16";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<16>";
+ let DiagnosticType = "MemSImm16";
+}
+
+def MipsInvertedImmoperand : AsmOperandClass {
+ let Name = "InvNum";
+ let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseInvNum";
+}
+
+def InvertedImOperand : Operand<i32> {
+ let ParserMatchClass = MipsInvertedImmoperand;
+}
+
+def InvertedImOperand64 : Operand<i64> {
+ let ParserMatchClass = MipsInvertedImmoperand;
+}
+
+// Base class for memory operands: a pointer register plus a simm16 offset,
+// printed with printMemOperand, encoded with getMemEncoding and parsed via
+// MipsMemAsmOperand. Operands deriving from it override individual fields.
+class mem_generic : Operand<iPTR> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_rc, simm16);
+ let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemAsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// Address operand
+def mem : mem_generic;
+
+// MSA specific address operand
+def mem_msa : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm10);
+ let EncoderMethod = "getMSAMemEncoding";
+}
+
+def simm12 : Operand<i32> {
+ let DecoderMethod = "DecodeSimm12";
+}
+
+def mem_simm9 : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm9);
+ let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemSimm9AsmOperand;
+}
+
+def mem_simm10 : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm10);
+ let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemSimm10AsmOperand;
+}
+
+foreach I = {1, 2, 3} in
+ def mem_simm10_lsl # I : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, !cast<Operand>("simm10_lsl" # I));
+ let EncoderMethod = "getMemEncoding<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("MipsMemSimm10Lsl" # I # "AsmOperand");
+ }
+
+def mem_simm11 : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm11);
+ let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemSimm11AsmOperand;
+}
+
+def mem_simm12 : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm12);
+ let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemSimm12AsmOperand;
+}
+
+def mem_simm16 : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm16);
+ let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemSimm16AsmOperand;
+}
+
+def mem_ea : Operand<iPTR> {
+ let PrintMethod = "printMemOperandEA";
+ let MIOperandInfo = (ops ptr_rc, simm16);
+ let EncoderMethod = "getMemEncoding";
+ let OperandType = "OPERAND_MEMORY";
+}
+
+def PtrRC : Operand<iPTR> {
+ let MIOperandInfo = (ops ptr_rc);
+ let DecoderMethod = "DecodePtrRegisterClass";
+ let ParserMatchClass = GPR32AsmOperand;
+}
+
+// size operand of ins instruction
+def size_ins : Operand<i32> {
+ let EncoderMethod = "getSizeInsEncoding";
+ let DecoderMethod = "DecodeInsSize";
+}
+
+// Transformation Function - get the lower 16 bits.
+def LO16 : SDNodeXForm<imm, [{
+ return getImm(N, N->getZExtValue() & 0xFFFF);
+}]>;
+
+// Transformation Function - get the higher 16 bits.
+def HI16 : SDNodeXForm<imm, [{
+ return getImm(N, (N->getZExtValue() >> 16) & 0xFFFF);
+}]>;
+
+// Plus 1.
+def Plus1 : SDNodeXForm<imm, [{ return getImm(N, N->getSExtValue() + 1); }]>;
+
+// Node immediate is zero (e.g. insve.d)
+def immz : PatLeaf<(imm), [{ return N->getSExtValue() == 0; }]>;
+
+// Node immediate fits as 8-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt8 : PatLeaf<(imm), [{ return isInt<8>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 7-bit zero extended on target immediate.
+def immZExt7 : PatLeaf<(imm), [{ return isUInt<7>(N->getZExtValue()); }]>;
+
+// Node immediate fits as 16-bit zero extended on target immediate.
+// The LO16 param means that only the lower 16 bits of the node
+// immediate are caught.
+// e.g. addiu, sltiu
+// For i32 nodes the value is truncated to 32 bits before the comparison;
+// otherwise the full 64-bit value is compared. In both cases a match requires
+// every compared bit above the low 16 to be zero.
+def immZExt16 : PatLeaf<(imm), [{
+ if (N->getValueType(0) == MVT::i32)
+ return (uint32_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+ else
+ return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// Immediate can be loaded with LUi (32-bit int with lower 16-bit cleared).
+def immSExt32Low16Zero : PatLeaf<(imm), [{
+ int64_t Val = N->getSExtValue();
+ return isInt<32>(Val) && !(Val & 0xffff);
+}]>;
+
+// Zero-extended 32-bit unsigned int with lower 16-bit cleared.
+def immZExt32Low16Zero : PatLeaf<(imm), [{
+ uint64_t Val = N->getZExtValue();
+ return isUInt<32>(Val) && !(Val & 0xffff);
+}]>;
+
+// Node immediate fits as a 32 bit signed extended on target immediate.
+def immSExt32 : PatLeaf<(imm), [{ return isInt<32>(N->getSExtValue()); }]>;
+
+// Node immediate fits as a 32 bit zero extended on target immediate.
+def immZExt32 : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>;
+
+// shamt field must fit in 5 bits.
+def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>;
+
+// Node immediate, minus the stated offset, fits in an unsigned 5-bit field.
+// The subtraction is performed on the unsigned getZExtValue() result, so a
+// value below the offset wraps around and is rejected by isUInt<5>.
+// immZExt5Plus1 therefore accepts 1..32.
+def immZExt5Plus1 : PatLeaf<(imm), [{
+ return isUInt<5>(N->getZExtValue() - 1);
+}]>;
+// Accepts 32..63.
+def immZExt5Plus32 : PatLeaf<(imm), [{
+ return isUInt<5>(N->getZExtValue() - 32);
+}]>;
+// Accepts 33..64.
+def immZExt5Plus33 : PatLeaf<(imm), [{
+ return isUInt<5>(N->getZExtValue() - 33);
+}]>;
+
+// True if (N + 1) fits in 16-bit field.
+def immSExt16Plus1 : PatLeaf<(imm), [{
+ return isInt<17>(N->getSExtValue()) && isInt<16>(N->getSExtValue() + 1);
+}]>;
+
+// Mips Address Mode! SDNode frameindex could possibly be a match
+// since load and store instructions from stack use it.
+def addr :
+ ComplexPattern<iPTR, 2, "selectIntAddr", [frameindex]>;
+
+def addrRegImm :
+ ComplexPattern<iPTR, 2, "selectAddrRegImm", [frameindex]>;
+
+def addrDefault :
+ ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>;
+
+def addrimm10 : ComplexPattern<iPTR, 2, "selectIntAddrSImm10", [frameindex]>;
+def addrimm10lsl1 : ComplexPattern<iPTR, 2, "selectIntAddrSImm10Lsl1",
+ [frameindex]>;
+def addrimm10lsl2 : ComplexPattern<iPTR, 2, "selectIntAddrSImm10Lsl2",
+ [frameindex]>;
+def addrimm10lsl3 : ComplexPattern<iPTR, 2, "selectIntAddrSImm10Lsl3",
+ [frameindex]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+// Arithmetic and logical instructions with 3 register operands.
+// opstr is the mnemonic, RO the register operand class used for $rd, $rs and
+// $rt, isComm is forwarded to isCommutable, Itin is the itinerary class, and
+// OpNode is the DAG operator used in the selection pattern (null_frag by
+// default).
+class ArithLogicR<string opstr, RegisterOperand RO, bit isComm = 0,
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator OpNode = null_frag>:
+ InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+ !strconcat(opstr, "\t$rd, $rs, $rt"),
+ [(set RO:$rd, (OpNode RO:$rs, RO:$rt))], Itin, FrmR, opstr> {
+ let isCommutable = isComm;
+ let isReMaterializable = 1;
+ // Presumably describes the two-operand assembly alias in which $rd doubles
+ // as $rs -- TODO confirm against the consumer of this field.
+ let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+// Arithmetic and logical instructions with 2 register operands and one
+// immediate operand.
+// Od is the operand class of the $imm16 input, imm_type is the immediate
+// leaf used for $imm16 in the selection pattern, and OpNode is the DAG
+// operator of that pattern (both default to null_frag).
+class ArithLogicI<string opstr, Operand Od, RegisterOperand RO,
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator imm_type = null_frag,
+ SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs RO:$rt), (ins RO:$rs, Od:$imm16),
+ !strconcat(opstr, "\t$rt, $rs, $imm16"),
+ [(set RO:$rt, (OpNode RO:$rs, imm_type:$imm16))],
+ Itin, FrmI, opstr> {
+ let isReMaterializable = 1;
+ // Presumably describes the two-operand assembly alias in which $rt doubles
+ // as $rs -- TODO confirm against the consumer of this field.
+ let TwoOperandAliasConstraint = "$rs = $rt";
+}
+
+// Arithmetic Multiply ADD/SUB
+class MArithR<string opstr, InstrItinClass itin, bit isComm = 0> :
+ InstSE<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ !strconcat(opstr, "\t$rs, $rt"), [], itin, FrmR, opstr> {
+ let Defs = [HI0, LO0];
+ let Uses = [HI0, LO0];
+ let isCommutable = isComm;
+}
+
+// Logical
+class LogicNOR<string opstr, RegisterOperand RO>:
+ InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+ !strconcat(opstr, "\t$rd, $rs, $rt"),
+ [(set RO:$rd, (not (or RO:$rs, RO:$rt)))], II_NOR, FrmR, opstr> {
+ let isCommutable = 1;
+}
+
+// Shifts
+class shift_rotate_imm<string opstr, Operand ImmOpnd,
+ RegisterOperand RO, InstrItinClass itin,
+ SDPatternOperator OpNode = null_frag,
+ SDPatternOperator PF = null_frag> :
+ InstSE<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
+ !strconcat(opstr, "\t$rd, $rt, $shamt"),
+ [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], itin, FrmR, opstr> {
+ let TwoOperandAliasConstraint = "$rt = $rd";
+}
+
+class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin,
+ SDPatternOperator OpNode = null_frag>:
+ InstSE<(outs RO:$rd), (ins RO:$rt, GPR32Opnd:$rs),
+ !strconcat(opstr, "\t$rd, $rt, $rs"),
+ [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], itin, FrmR,
+ opstr>;
+
+// Load Upper Immediate
+class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
+ InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
+ [], II_LUI, FrmI, opstr>, IsAsCheapAsAMove {
+ let hasSideEffects = 0;
+ let isReMaterializable = 1;
+}
+
+// Memory Load/Store
+class LoadMemory<string opstr, DAGOperand RO, DAGOperand MO,
+ SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary,
+ ComplexPattern Addr = addr> :
+ InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> {
+ let DecoderMethod = "DecodeMem";
+ let canFoldAsLoad = 1;
+ let mayLoad = 1;
+}
+
+class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+ LoadMemory<opstr, RO, mem, OpNode, Itin, Addr>;
+
+class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO,
+ SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+ InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
+ let DecoderMethod = "DecodeMem";
+ let mayStore = 1;
+}
+
+class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr,
+ DAGOperand MO = mem> :
+ StoreMemory<opstr, RO, MO, OpNode, Itin, Addr>;
+
+// Load/Store Left/Right
+// Load left/right: besides the memory operand the instruction takes the
+// previous register contents as $src, which is tied to the result $rt.
+class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
+ InstrItinClass Itin>
+ : InstSE<(outs RO:$rt), (ins mem:$addr, RO:$src),
+ !strconcat(opstr, "\t$rt, $addr"),
+ [(set RO:$rt, (OpNode addr:$addr, RO:$src))], Itin, FrmI> {
+ let canFoldAsLoad = 1;
+ let Constraints = "$src = $rt";
+ let DecoderMethod = "DecodeMem";
+}
+
+class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
+ InstrItinClass Itin> :
+ InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RO:$rt, addr:$addr)], Itin, FrmI> {
+ let DecoderMethod = "DecodeMem";
+}
+
+// COP2 Load/Store
+class LW_FT2<string opstr, RegisterOperand RC, InstrItinClass Itin,
+ SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs RC:$rt), (ins mem_simm16:$addr),
+ !strconcat(opstr, "\t$rt, $addr"),
+ [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr> {
+ let DecoderMethod = "DecodeFMem2";
+ let mayLoad = 1;
+}
+
+class SW_FT2<string opstr, RegisterOperand RC, InstrItinClass Itin,
+ SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs), (ins RC:$rt, mem_simm16:$addr),
+ !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr> {
+ let DecoderMethod = "DecodeFMem2";
+ let mayStore = 1;
+}
+
+// COP3 Load/Store
+class LW_FT3<string opstr, RegisterOperand RC, InstrItinClass Itin,
+ SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr> {
+ let DecoderMethod = "DecodeFMem3";
+ let mayLoad = 1;
+}
+
+class SW_FT3<string opstr, RegisterOperand RC, InstrItinClass Itin,
+ SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr> {
+ let DecoderMethod = "DecodeFMem3";
+ let mayStore = 1;
+}
+
+// Conditional branch on two registers: branches to $offset when
+// (cond_op $rs, $rt) holds. DelaySlot selects whether the instruction has a
+// delay slot.
+class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op,
+ RegisterOperand RO, bit DelaySlot = 1>
+ : InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset),
+ !strconcat(opstr, "\t$rs, $rt, $offset"),
+ [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)],
+ II_BCC, FrmI, opstr> {
+ // Control-transfer properties.
+ let isCTI = 1;
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = DelaySlot;
+ // AT is listed as an implicit definition.
+ let Defs = [AT];
+}
+
+// Conditional branch comparing one register against zero: branches to
+// $offset when (cond_op $rs, 0) holds. DelaySlot selects whether the
+// instruction has a delay slot.
+class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
+ RegisterOperand RO, bit DelaySlot = 1>
+ : InstSE<(outs), (ins RO:$rs, opnd:$offset),
+ !strconcat(opstr, "\t$rs, $offset"),
+ [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)],
+ II_BCCZ, FrmI, opstr> {
+ // Control-transfer properties.
+ let isCTI = 1;
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = DelaySlot;
+ // AT is listed as an implicit definition.
+ let Defs = [AT];
+}
+
+// SetCC
+class SetCC_R<string opstr, PatFrag cond_op, RegisterOperand RO> :
+ InstSE<(outs GPR32Opnd:$rd), (ins RO:$rs, RO:$rt),
+ !strconcat(opstr, "\t$rd, $rs, $rt"),
+ [(set GPR32Opnd:$rd, (cond_op RO:$rs, RO:$rt))],
+ II_SLT_SLTU, FrmR, opstr>;
+
+class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type,
+ RegisterOperand RO>:
+ InstSE<(outs GPR32Opnd:$rt), (ins RO:$rs, Od:$imm16),
+ !strconcat(opstr, "\t$rt, $rs, $imm16"),
+ [(set GPR32Opnd:$rt, (cond_op RO:$rs, imm_type:$imm16))],
+ II_SLTI_SLTIU, FrmI, opstr>;
+
+// Direct jump to $target. opstr is used for the assembly string; bopstr is
+// the name passed as the last InstSE argument.
+class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
+ SDPatternOperator targetoperator, string bopstr>
+ : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+ [(operator targetoperator:$target)], II_J, FrmJ, bopstr> {
+ // Control-transfer properties.
+ let isCTI = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let hasDelaySlot = 1;
+ // AT is listed as an implicit definition.
+ let Defs = [AT];
+ let DecoderMethod = "DecodeJumpTarget";
+}
+
+// Unconditional branch pseudo: matches (br bb) and expands to
+// BEQInst ZERO, ZERO, $offset. Only available under the RelocPIC predicate.
+class UncondBranch<Instruction BEQInst>
+ : PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], II_B>,
+ PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> {
+ // Control-transfer properties.
+ let isCTI = 1;
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let hasDelaySlot = 1;
+ // AT is listed as an implicit definition.
+ let Defs = [AT];
+ let AdditionalPredicates = [RelocPIC];
+}
+
+// Base class for indirect branch and return instruction classes.
+// NOTE(review): the assembly string is the literal "jr\t$rs" and is not
+// derived from opstr; opstr is only passed as the last InstSE argument.
+// Confirm this is intended before instantiating with a different mnemonic.
+let isTerminator=1, isBarrier=1, hasDelaySlot = 1, isCTI = 1 in
+class JumpFR<string opstr, RegisterOperand RO,
+ SDPatternOperator operator = null_frag>:
+ InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], II_JR,
+ FrmR, opstr>;
+
+// Indirect branch
+class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> {
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+}
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=1, isCTI=1, Defs = [RA] in {
+ class JumpLink<string opstr, DAGOperand opnd> :
+ InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+ [(MipsJmpLink tglobaladdr:$target)], II_JAL, FrmJ, opstr> {
+ let DecoderMethod = "DecodeJumpTarget";
+ }
+
+ class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst,
+ Register RetReg, RegisterOperand ResRO = RO>:
+ PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], II_JALR>,
+ PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>;
+
+ class JumpLinkReg<string opstr, RegisterOperand RO>:
+ InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+ [], II_JALR, FrmR, opstr>;
+
+ class BGEZAL_FT<string opstr, DAGOperand opnd,
+ RegisterOperand RO, bit DelaySlot = 1> :
+ InstSE<(outs), (ins RO:$rs, opnd:$offset),
+ !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZAL, FrmI, opstr> {
+ let hasDelaySlot = DelaySlot;
+ }
+
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
+ hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT] in {
+ class TailCall<Instruction JumpInst, DAGOperand Opnd> :
+ PseudoSE<(outs), (ins calltarget:$target), [], II_J>,
+ PseudoInstExpansion<(JumpInst Opnd:$target)>;
+
+ class TailCallReg<RegisterOperand RO> :
+ MipsPseudo<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>;
+}
+
+// Branch-and-link pseudo with only a target operand; expands to
+// RealInst ZERO, $offset and lists RA as an implicit definition.
+class BAL_BR_Pseudo<Instruction RealInst>
+ : PseudoSE<(outs), (ins brtarget:$offset), [], II_BCCZAL>,
+ PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> {
+ // Control-transfer properties.
+ let isCTI = 1;
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let hasDelaySlot = 1;
+ let Defs = [RA];
+}
+
+let isCTI = 1 in {
+// Syscall
+class SYS_FT<string opstr, Operand ImmOp, InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins ImmOp:$code_),
+ !strconcat(opstr, "\t$code_"), [], itin, FrmI, opstr>;
+// Break
+class BRK_FT<string opstr> :
+ InstSE<(outs), (ins uimm10:$code_1, uimm10:$code_2),
+ !strconcat(opstr, "\t$code_1, $code_2"), [], II_BREAK,
+ FrmOther, opstr>;
+
+// (D)Eret
+class ER_FT<string opstr, InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins),
+ opstr, [], itin, FrmOther, opstr>;
+
+// Wait
+class WAIT_FT<string opstr> :
+ InstSE<(outs), (ins), opstr, [], II_WAIT, FrmOther, opstr>;
+}
+
+// Interrupts
+class DEI_FT<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary> :
+ InstSE<(outs RO:$rt), (ins),
+ !strconcat(opstr, "\t$rt"), [], itin, FrmOther, opstr>;
+
+// Sync
+// Matches (MipsSync immZExt5:$stype).
+// NOTE(review): the assembly string is the literal "sync $stype" (space
+// separated) and is not built from opstr, unlike most classes in this file;
+// opstr is only passed as the last InstSE argument.
+let hasSideEffects = 1 in
+class SYNC_FT<string opstr> :
+ InstSE<(outs), (ins uimm5:$stype), "sync $stype",
+ [(MipsSync immZExt5:$stype)], II_SYNC, FrmOther, opstr>;
+
+class SYNCI_FT<string opstr> :
+ InstSE<(outs), (ins mem_simm16:$addr), !strconcat(opstr, "\t$addr"), [],
+ II_SYNCI, FrmOther, opstr> {
+ let hasSideEffects = 1;
+ let DecoderMethod = "DecodeSyncI";
+}
+
+let hasSideEffects = 1, isCTI = 1 in {
+class TEQ_FT<string opstr, RegisterOperand RO, Operand ImmOp,
+ InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins RO:$rs, RO:$rt, ImmOp:$code_),
+ !strconcat(opstr, "\t$rs, $rt, $code_"), [], itin, FrmI, opstr>;
+
+class TEQI_FT<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins RO:$rs, simm16:$imm16),
+ !strconcat(opstr, "\t$rs, $imm16"), [], itin, FrmOther, opstr>;
+}
+
+// Mul, Div
+class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
+ list<Register> DefRegs> :
+ InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$rs, $rt"), [],
+ itin, FrmR, opstr> {
+ let isCommutable = 1;
+ let Defs = DefRegs;
+ let hasSideEffects = 0;
+}
+
+// Pseudo multiply/divide instruction with explicit accumulator register
+// operands.
+class MultDivPseudo<Instruction RealInst, RegisterClass R0, RegisterOperand R1,
+ SDPatternOperator OpNode, InstrItinClass Itin,
+ bit IsComm = 1, bit HasSideEffects = 0,
+ bit UsesCustomInserter = 0> :
+ PseudoSE<(outs R0:$ac), (ins R1:$rs, R1:$rt),
+ [(set R0:$ac, (OpNode R1:$rs, R1:$rt))], Itin>,
+ PseudoInstExpansion<(RealInst R1:$rs, R1:$rt)> {
+ let isCommutable = IsComm;
+ let hasSideEffects = HasSideEffects;
+ let usesCustomInserter = UsesCustomInserter;
+}
+
+// Pseudo multiply add/sub instruction with explicit accumulator register
+// operands.
+class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode,
+ InstrItinClass itin>
+ : PseudoSE<(outs ACC64:$ac),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin),
+ [(set ACC64:$ac,
+ (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin))],
+ itin>,
+ PseudoInstExpansion<(RealInst GPR32Opnd:$rs, GPR32Opnd:$rt)> {
+ string Constraints = "$acin = $ac";
+}
+
+// Divide with implicit result registers: there are no explicit outputs and
+// the registers in DefRegs become the instruction's Defs.
+// The "$$zero" in the assembly string is presumably an escaped '$', so the
+// printed form carries a literal "$zero" first operand -- TODO confirm
+// against the asm-string escaping rules.
+class Div<string opstr, InstrItinClass itin, RegisterOperand RO,
+ list<Register> DefRegs> :
+ InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$$zero, $rs, $rt"),
+ [], itin, FrmR, opstr> {
+ let Defs = DefRegs;
+}
+
+// Move from Hi/Lo
+class PseudoMFLOHI<RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode>
+ : PseudoSE<(outs DstRC:$rd), (ins SrcRC:$hilo),
+ [(set DstRC:$rd, (OpNode SrcRC:$hilo))], II_MFHI_MFLO>;
+
+class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
+ InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], II_MFHI_MFLO,
+ FrmR, opstr> {
+ let Uses = [UseReg];
+ let hasSideEffects = 0;
+}
+
+class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
+ : PseudoSE<(outs DstRC:$lohi), (ins SrcRC:$lo, SrcRC:$hi),
+ [(set DstRC:$lohi, (MipsMTLOHI SrcRC:$lo, SrcRC:$hi))],
+ II_MTHI_MTLO>;
+
+class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
+ InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], II_MTHI_MTLO,
+ FrmR, opstr> {
+ let Defs = DefRegs;
+ let hasSideEffects = 0;
+}
+
+class EffectiveAddress<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$rt), (ins mem_ea:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(set RO:$rt, addr:$addr)], II_ADDIU, FrmI,
+ !strconcat(opstr, "_lea")> {
+ let isCodeGenOnly = 1;
+ let hasNoSchedulingInfo = 1;
+ let DecoderMethod = "DecodeMem";
+}
+
+// Count Leading Ones/Zeros in Word
+class CountLeading0<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary>:
+ InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+ [(set RO:$rd, (ctlz RO:$rs))], itin, FrmR, opstr>;
+
+class CountLeading1<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary>:
+ InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+ [(set RO:$rd, (ctlz (not RO:$rs)))], itin, FrmR, opstr>;
+
+// Sign Extend in Register.
+class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO,
+ InstrItinClass itin> :
+ InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"),
+ [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr>;
+
+// Subword Swap
+class SubwordSwap<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary>:
+ InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], itin,
+ FrmR, opstr> {
+ let hasSideEffects = 0;
+}
+
+// Read Hardware
+class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
+ InstSE<(outs CPURegOperand:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
+ II_RDHWR, FrmR, "rdhwr">;
+
+// Ext and Ins
+class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+ Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm,
+ SDPatternOperator Op = null_frag> :
+ InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size),
+ !strconcat(opstr, " $rt, $rs, $pos, $size"),
+ [(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size))], II_EXT,
+ FrmR, opstr>, ISA_MIPS32R2;
+
+// Insert: takes an extra $src input that is tied to the result $rt through
+// the Constraints field; $src does not appear in the assembly string.
+// The selection pattern uses plain imm for $pos and $size. Requires
+// ISA_MIPS32R2.
+class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+ Operand SizeOpnd, SDPatternOperator Op = null_frag>:
+ InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size, RO:$src),
+ !strconcat(opstr, " $rt, $rs, $pos, $size"),
+ [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))],
+ II_INS, FrmR, opstr>, ISA_MIPS32R2 {
+ let Constraints = "$src = $rt";
+}
+
+// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
+class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
+ PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
+ [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
+
+// Atomic Compare & Swap.
+class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
+ PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
+ [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
+
+class LLBase<string opstr, RegisterOperand RO, DAGOperand MO = mem> :
+ InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [], II_LL, FrmI, opstr> {
+ let DecoderMethod = "DecodeMem";
+ let mayLoad = 1;
+}
+
+class SCBase<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$dst), (ins RO:$rt, mem:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], II_SC, FrmI> {
+ let DecoderMethod = "DecodeMem";
+ let mayStore = 1;
+ let Constraints = "$rt = $dst";
+}
+
+class MFC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
+ InstrItinClass itin> :
+ InstSE<(outs RO:$rt), (ins RD:$rd, uimm3:$sel),
+ !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
+
+// Move-to-coprocessor form with a 3-bit select operand.
+// NOTE(review): the output is named $rd and the input $rt, the reverse of
+// MFC3OP, while the assembly string still prints "$rt, $rd, $sel" -- confirm
+// the operand naming matches the encoding class it is combined with.
+class MTC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
+ InstrItinClass itin> :
+ InstSE<(outs RO:$rd), (ins RD:$rt, uimm3:$sel),
+ !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
+
+// Pseudo that matches the generic (trap) node and expands to RealInst with
+// both of its operands set to 0.
+class TrapBase<Instruction RealInst>
+ : PseudoSE<(outs), (ins), [(trap)], II_TRAP>,
+ PseudoInstExpansion<(RealInst 0, 0)> {
+ let isBarrier = 1;
+ let isTerminator = 1;
+ let isCodeGenOnly = 1;
+ let isCTI = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+// Return RA.
+let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in {
+ let hasDelaySlot=1 in
+ def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>;
+
+ let hasSideEffects=1 in
+ def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>;
+}
+
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
+ [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+// Atomic read-modify-write, swap and compare-and-swap pseudos for 8-, 16- and
+// 32-bit accesses. All of them set usesCustomInserter and use GPR32 for their
+// data operands regardless of the access width.
+let usesCustomInserter = 1 in {
+ def ATOMIC_LOAD_ADD_I8 : Atomic2Ops<atomic_load_add_8, GPR32>;
+ def ATOMIC_LOAD_ADD_I16 : Atomic2Ops<atomic_load_add_16, GPR32>;
+ def ATOMIC_LOAD_ADD_I32 : Atomic2Ops<atomic_load_add_32, GPR32>;
+ def ATOMIC_LOAD_SUB_I8 : Atomic2Ops<atomic_load_sub_8, GPR32>;
+ def ATOMIC_LOAD_SUB_I16 : Atomic2Ops<atomic_load_sub_16, GPR32>;
+ def ATOMIC_LOAD_SUB_I32 : Atomic2Ops<atomic_load_sub_32, GPR32>;
+ def ATOMIC_LOAD_AND_I8 : Atomic2Ops<atomic_load_and_8, GPR32>;
+ def ATOMIC_LOAD_AND_I16 : Atomic2Ops<atomic_load_and_16, GPR32>;
+ def ATOMIC_LOAD_AND_I32 : Atomic2Ops<atomic_load_and_32, GPR32>;
+ def ATOMIC_LOAD_OR_I8 : Atomic2Ops<atomic_load_or_8, GPR32>;
+ def ATOMIC_LOAD_OR_I16 : Atomic2Ops<atomic_load_or_16, GPR32>;
+ def ATOMIC_LOAD_OR_I32 : Atomic2Ops<atomic_load_or_32, GPR32>;
+ def ATOMIC_LOAD_XOR_I8 : Atomic2Ops<atomic_load_xor_8, GPR32>;
+ def ATOMIC_LOAD_XOR_I16 : Atomic2Ops<atomic_load_xor_16, GPR32>;
+ def ATOMIC_LOAD_XOR_I32 : Atomic2Ops<atomic_load_xor_32, GPR32>;
+ def ATOMIC_LOAD_NAND_I8 : Atomic2Ops<atomic_load_nand_8, GPR32>;
+ def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, GPR32>;
+ def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, GPR32>;
+
+ // Atomic exchange.
+ def ATOMIC_SWAP_I8 : Atomic2Ops<atomic_swap_8, GPR32>;
+ def ATOMIC_SWAP_I16 : Atomic2Ops<atomic_swap_16, GPR32>;
+ def ATOMIC_SWAP_I32 : Atomic2Ops<atomic_swap_32, GPR32>;
+
+ // Atomic compare-and-swap.
+ def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
+ def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
+ def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
+}
+
+/// Pseudo instructions for loading and storing accumulator registers.
+let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
+ def LOAD_ACC64 : Load<"", ACC64>;
+ def STORE_ACC64 : Store<"", ACC64>;
+}
+
+// We need these two pseudo instructions to avoid offset calculation for long
+// branches. See the comment in file MipsLongBranch.cpp for detailed
+// explanation.
+
+// Expands to: lui $dst, %hi($tgt - $baltgt)
+def LONG_BRANCH_LUi : PseudoSE<(outs GPR32Opnd:$dst),
+ (ins brtarget:$tgt, brtarget:$baltgt), []>;
+
+// Expands to: addiu $dst, $src, %lo($tgt - $baltgt)
+def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
+ (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MipsI Instructions
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+let AdditionalPredicates = [NotInMicroMips] in {
+ def ADDiu : MMRel, StdMMR6Rel, ArithLogicI<"addiu", simm16_relaxed, GPR32Opnd,
+ II_ADDIU, immSExt16, add>,
+ ADDI_FM<0x9>, IsAsCheapAsAMove;
+
+ def ANDi : MMRel, StdMMR6Rel,
+ ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI, immZExt16, and>,
+ ADDI_FM<0xc>;
+ def ORi : MMRel, StdMMR6Rel,
+ ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16, or>,
+ ADDI_FM<0xd>;
+ def XORi : MMRel, StdMMR6Rel,
+ ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI, immZExt16, xor>,
+ ADDI_FM<0xe>;
+}
+def ADDi : MMRel, ArithLogicI<"addi", simm16_relaxed, GPR32Opnd, II_ADDI>, ADDI_FM<0x8>,
+ ISA_MIPS1_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def SLTi : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
+ SLTI_FM<0xa>;
+ def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
+ SLTI_FM<0xb>;
+}
+def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM;
+let AdditionalPredicates = [NotInMicroMips] in {
+ /// Arithmetic Instructions (3-Operand, R-Type)
+ def ADDu : MMRel, StdMMR6Rel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
+ ADD_FM<0, 0x21>;
+ def SUBu : MMRel, StdMMR6Rel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
+ ADD_FM<0, 0x23>;
+}
+let Defs = [HI0, LO0] in
+def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
+ ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6;
+def ADD : MMRel, StdMMR6Rel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>, ADD_FM<0, 0x20>;
+def SUB : MMRel, StdMMR6Rel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>, ADD_FM<0, 0x22>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
+ def SLTu : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>;
+ def AND : MMRel, StdMMR6Rel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
+ ADD_FM<0, 0x24>;
+ def OR : MMRel, StdMMR6Rel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
+ ADD_FM<0, 0x25>;
+ def XOR : MMRel, StdMMR6Rel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
+ ADD_FM<0, 0x26>;
+ def NOR : MMRel, StdMMR6Rel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
+}
+
+/// Shift Instructions
+let AdditionalPredicates = [NotInMicroMips] in {
+def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
+ immZExt5>, SRA_FM<0, 0>;
+def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
+ immZExt5>, SRA_FM<2, 0>;
+def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
+ immZExt5>, SRA_FM<3, 0>;
+def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
+ SRLV_FM<4, 0>;
+def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
+ SRLV_FM<6, 0>;
+def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
+ SRLV_FM<7, 0>;
+}
+
+// Rotate Instructions
+let AdditionalPredicates = [NotInMicroMips] in {
+ def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
+ immZExt5>,
+ SRA_FM<2, 1>, ISA_MIPS32R2;
+ def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>,
+ SRLV_FM<6, 1>, ISA_MIPS32R2;
+}
+
+/// Load and Store Instructions
+/// aligned
+// NOTE(review): LBu, LH and LW pass addrDefault as the address pattern while
+// LB and LHu rely on the class default (addr); confirm whether the difference
+// between the signed/unsigned variants is intentional.
+def LB : LoadMemory<"lb", GPR32Opnd, mem_simm16, sextloadi8, II_LB>, MMRel,
+ LW_FM<0x20>;
+def LBu : LoadMemory<"lbu", GPR32Opnd, mem_simm16, zextloadi8, II_LBU,
+ addrDefault>, MMRel, LW_FM<0x24>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LH : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH,
+ addrDefault>, MMRel, LW_FM<0x21>;
+ def LHu : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
+ MMRel, LW_FM<0x25>;
+ def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
+ LW_FM<0x23>;
+}
+// Stores all use the default address pattern of the Store class.
+def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
+ LW_FM<0x28>;
+def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>;
+let AdditionalPredicates = [NotInMicroMips] in {
+def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>;
+}
+
+/// load/store left/right
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+ AdditionalPredicates = [NotInMicroMips] in {
+def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
+ ISA_MIPS1_NOT_32R6_64R6;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+// COP2 Memory Instructions
+def LWC2 : StdMMR6Rel, LW_FT2<"lwc2", COP2Opnd, II_LWC2, load>, LW_FM<0x32>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def SWC2 : StdMMR6Rel, SW_FT2<"swc2", COP2Opnd, II_SWC2, store>,
+ LW_FM<0x3a>, ISA_MIPS1_NOT_32R6_64R6;
+def LDC2 : StdMMR6Rel, LW_FT2<"ldc2", COP2Opnd, II_LDC2, load>, LW_FM<0x36>,
+ ISA_MIPS2_NOT_32R6_64R6;
+def SDC2 : StdMMR6Rel, SW_FT2<"sdc2", COP2Opnd, II_SDC2, store>,
+ LW_FM<0x3e>, ISA_MIPS2_NOT_32R6_64R6;
+
+// COP3 Memory Instructions
+let DecoderNamespace = "COP3_" in {
+ def LWC3 : LW_FT3<"lwc3", COP3Opnd, II_LWC3, load>, LW_FM<0x33>;
+ def SWC3 : SW_FT3<"swc3", COP3Opnd, II_SWC3, store>, LW_FM<0x3b>;
+ def LDC3 : LW_FT3<"ldc3", COP3Opnd, II_LDC3, load>, LW_FM<0x37>,
+ ISA_MIPS2;
+ def SDC3 : SW_FT3<"sdc3", COP3Opnd, II_SDC3, store>, LW_FM<0x3f>,
+ ISA_MIPS2;
+}
+
+ def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS2;
+ def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm10, II_TEQ>, TEQ_FM<0x34>, ISA_MIPS2;
+ def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm10, II_TGE>, TEQ_FM<0x30>, ISA_MIPS2;
+ def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm10, II_TGEU>, TEQ_FM<0x31>, ISA_MIPS2;
+ def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm10, II_TLT>, TEQ_FM<0x32>, ISA_MIPS2;
+ def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm10, II_TLTU>, TEQ_FM<0x33>, ISA_MIPS2;
+ def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm10, II_TNE>, TEQ_FM<0x36>, ISA_MIPS2;
+}
+
+// Trap-on-condition instructions comparing a register with a 16-bit signed
+// immediate (see TEQI_FT).
+def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM<0xc>,
+ ISA_MIPS2_NOT_32R6_64R6;
+def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM<0x8>,
+ ISA_MIPS2_NOT_32R6_64R6;
+def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>, TEQI_FM<0x9>,
+ ISA_MIPS2_NOT_32R6_64R6;
+def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM<0xa>,
+ ISA_MIPS2_NOT_32R6_64R6;
+// NOTE(review): the record (and itinerary) is spelled TTLTIU although the
+// mnemonic is "tltiu"; this looks like a typo, but renaming it would affect
+// any references outside this chunk, so it is left as is.
+def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>, TEQI_FM<0xb>,
+ ISA_MIPS2_NOT_32R6_64R6;
+def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM<0xe>,
+ ISA_MIPS2_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+def BREAK : MMRel, StdMMR6Rel, BRK_FT<"break">, BRK_FM<0xd>;
+def SYSCALL : MMRel, SYS_FT<"syscall", uimm20, II_SYSCALL>, SYS_FM<0xc>;
+}
+def TRAP : TrapBase<BREAK>;
+let AdditionalPredicates = [NotInMicroMips] in {
+def SDBBP : MMRel, SYS_FT<"sdbbp", uimm20, II_SDBBP>, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def ERET : MMRel, ER_FT<"eret", II_ERET>, ER_FM<0x18, 0x0>, INSN_MIPS3_32;
+ def ERETNC : MMRel, ER_FT<"eretnc", II_ERETNC>, ER_FM<0x18, 0x1>, ISA_MIPS32R5;
+ def DERET : MMRel, ER_FT<"deret", II_DERET>, ER_FM<0x1f, 0x0>, ISA_MIPS32;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM<1>, ISA_MIPS32R2;
+ def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM<0>, ISA_MIPS32R2;
+}
+
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+ AdditionalPredicates = [NotInMicroMips] in {
+def WAIT : WAIT_FT<"wait">, WAIT_FM;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+/// Load-linked, Store-conditional
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
+}
+
+/// Jump and Branch Instructions
+def J : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
+ AdditionalRequires<[RelocNotPIC]>, IsBranch;
+def JR : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>, ISA_MIPS1_NOT_32R6_64R6;
+def BEQ : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
+def BEQL : MMRel, CBranch<"beql", brtarget, seteq, GPR32Opnd, 0>,
+ BEQ_FM<20>, ISA_MIPS2_NOT_32R6_64R6;
+def BNE : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
+def BNEL : MMRel, CBranch<"bnel", brtarget, setne, GPR32Opnd, 0>,
+ BEQ_FM<21>, ISA_MIPS2_NOT_32R6_64R6;
+def BGEZ : MMRel, CBranchZero<"bgez", brtarget, setge, GPR32Opnd>,
+ BGEZ_FM<1, 1>;
+def BGEZL : MMRel, CBranchZero<"bgezl", brtarget, setge, GPR32Opnd, 0>,
+ BGEZ_FM<1, 3>, ISA_MIPS2_NOT_32R6_64R6;
+def BGTZ : MMRel, CBranchZero<"bgtz", brtarget, setgt, GPR32Opnd>,
+ BGEZ_FM<7, 0>;
+def BGTZL : MMRel, CBranchZero<"bgtzl", brtarget, setgt, GPR32Opnd, 0>,
+ BGEZ_FM<23, 0>, ISA_MIPS2_NOT_32R6_64R6;
+def BLEZ : MMRel, CBranchZero<"blez", brtarget, setle, GPR32Opnd>,
+ BGEZ_FM<6, 0>;
+def BLEZL : MMRel, CBranchZero<"blezl", brtarget, setle, GPR32Opnd, 0>,
+ BGEZ_FM<22, 0>, ISA_MIPS2_NOT_32R6_64R6;
+def BLTZ : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
+ BGEZ_FM<1, 0>;
+def BLTZL : MMRel, CBranchZero<"bltzl", brtarget, setlt, GPR32Opnd, 0>,
+ BGEZ_FM<1, 2>, ISA_MIPS2_NOT_32R6_64R6;
+def B : UncondBranch<BEQ>;
+
+def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
+ def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
+}
+
+def JALX : MMRel, JumpLink<"jalx", calltarget>, FJ<0x1D>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def BGEZALL : MMRel, BGEZAL_FT<"bgezall", brtarget, GPR32Opnd, 0>,
+ BGEZAL_FM<0x13>, ISA_MIPS2_NOT_32R6_64R6;
+def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def BLTZALL : MMRel, BGEZAL_FT<"bltzall", brtarget, GPR32Opnd, 0>,
+ BGEZAL_FM<0x12>, ISA_MIPS2_NOT_32R6_64R6;
+def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
+
+let Predicates = [NotInMicroMips] in {
+ def TAILCALL : TailCall<J, jmptarget>;
+}
+
+def TAILCALLREG : TailCallReg<GPR32Opnd>;
+
+// Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64
+// then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA.
+// Pseudo instruction selected for `brind` on a register of operand class RO.
+// It has no outputs and a single input, $rs, which is the branch target.
+class PseudoIndirectBranchBase<RegisterOperand RO> :
+ MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)],
+ II_IndirectBranchPseudo> {
+ // Ends its basic block and never falls through to the next instruction.
+ let isTerminator=1;
+ let isBarrier=1;
+ let hasDelaySlot = 1;
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+ // Marks the record as a control transfer instruction (CTI).
+ bit isCTI = 1;
+}
+
+def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
+
+// Return instructions are matched as a RetRA instruction, then are expanded
+// into PseudoReturn/PseudoReturn64 after register allocation. Finally,
+// MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the
+// ISA.
+// Base class of the return pseudo: no outputs, an empty selection pattern,
+// and a single input $rs of operand class RO.
+class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs),
+ [], II_ReturnPseudo> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let hasDelaySlot = 1;
+ let isReturn = 1;
+ // Used by code generation only, not by the assembly parser/disassembler.
+ let isCodeGenOnly = 1;
+ let hasCtrlDep = 1;
+ let hasExtraSrcRegAllocReq = 1;
+ // Marks the record as a control transfer instruction (CTI).
+ bit isCTI = 1;
+}
+
+def PseudoReturn : PseudoReturnBase<GPR32Opnd>;
+
+// Exception handling related node and instructions.
+// The conversion sequence is:
+// ISD::EH_RETURN -> MipsISD::EH_RETURN ->
+// MIPSeh_return -> (stack change + indirect branch)
+//
+// MIPSeh_return takes the place of regular return instruction
+// but takes two arguments (V1, V0) which are used for storing
+// the offset and return address respectively.
+def SDT_MipsEHRET : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
+
+def MIPSehret : SDNode<"MipsISD::EH_RETURN", SDT_MipsEHRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1, isCTI = 1 in {
+ def MIPSeh_return32 : MipsPseudo<(outs), (ins GPR32:$spoff, GPR32:$dst),
+ [(MIPSehret GPR32:$spoff, GPR32:$dst)]>;
+ def MIPSeh_return64 : MipsPseudo<(outs), (ins GPR64:$spoff,
+ GPR64:$dst),
+ [(MIPSehret GPR64:$spoff, GPR64:$dst)]>;
+}
+
+/// Multiply and Divide Instructions.
+def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6;
+def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def SDIV : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6;
+ def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6;
+}
+def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>,
+ ISA_MIPS1_NOT_32R6_64R6;
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+ AdditionalPredicates = [NotInMicroMips] in {
+def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>,
+ ISA_MIPS1_NOT_32R6_64R6;
+}
+
+/// Sign Ext In Register Instructions.
+def SEB : MMRel, StdMMR6Rel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+ SEB_FM<0x10, 0x20>, ISA_MIPS32R2;
+def SEH : MMRel, StdMMR6Rel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+ SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
+
+/// Count Leading
+def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM<0x20>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def CLO : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM<0x21>,
+ ISA_MIPS32_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ /// Word Swap Bytes Within Halfwords
+ def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, SEB_FM<2, 0x20>,
+ ISA_MIPS32R2;
+}
+
+/// No operation.
+def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
+
+// FrameIndexes are legalized when they are operands of load/store
+// instructions. The same does not happen for stack address copies, so an
+// add op with a mem ComplexPattern is used so that the stack address copy
+// can be matched. It's similar to Sparc LEA_ADDRi
+def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
+
+// MADD*/MSUB*
+def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>,
+ ISA_MIPS32_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NotDSP] in {
+def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, II_MULTU>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMADD : MAddSubPseudo<MADD, MipsMAdd, II_MADD>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu, II_MADDU>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMSUB : MAddSubPseudo<MSUB, MipsMSub, II_MSUB>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu, II_MSUBU>,
+ ISA_MIPS32_NOT_32R6_64R6;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV,
+ 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+ def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
+ 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+ def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
+ // TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
+ def EXT : MMRel, StdMMR6Rel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1,
+ immZExt5, immZExt5Plus1, MipsExt>,
+ EXT_FM<0>;
+ def INS : MMRel, StdMMR6Rel, InsBase<"ins", GPR32Opnd, uimm5,
+ uimm5_inssize_plus1, MipsIns>,
+ EXT_FM<4>;
+}
+/// Move Control Registers From/To CPU Registers
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MTC0 : MTC3OP<"mtc0", COP0Opnd, GPR32Opnd, II_MTC0>, MFC3OP_FM<0x10, 4>,
+ ISA_MIPS32;
+ def MFC0 : MFC3OP<"mfc0", GPR32Opnd, COP0Opnd, II_MFC0>, MFC3OP_FM<0x10, 0>,
+ ISA_MIPS32;
+}
+def MFC2 : MFC3OP<"mfc2", GPR32Opnd, COP2Opnd, II_MFC2>, MFC3OP_FM<0x12, 0>;
+def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd, II_MTC2>, MFC3OP_FM<0x12, 4>;
+
+// Operand-less instruction: no outs, no ins and no selection pattern.
+// `asmstr` is passed both as the assembly string and as the last InstSE
+// argument; `itin` defaults to NoItinerary.
+class Barrier<string asmstr, InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
+
+def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM<1>;
+def EHB : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM<3>;
+
+let isCTI = 1 in
+def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause", II_PAUSE>, BARRIER_FM<5>,
+ ISA_MIPS32R2;
+
+// JR_HB and JALR_HB are defined here using the new style naming
+// scheme because some of this code is shared with Mips32r6InstrInfo.td
+// and because of that it doesn't follow the naming convention of the
+// rest of the file. To avoid a mixture of old vs new style, the new
+// style was chosen.
+// Operand lists and assembly string for a jr.hb-style instruction:
+// no outputs, one input register $rs, written as "<instr_asm>\t$rs".
+// The selection pattern is empty.
+class JR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPROpnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rs");
+ list<dag> Pattern = [];
+}
+
+// Operand lists and assembly string for a jalr.hb-style instruction:
+// one output register $rd, one input register $rs, written as
+// "<instr_asm>\t$rd, $rs". The selection pattern is empty.
+class JALR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+ list<dag> Pattern = [];
+}
+
+// Description of "jr.hb" on GPR32Opnd. InstSE is instantiated with empty
+// operand lists and an empty asm string; presumably the same-named fields set
+// by JR_HB_DESC_BASE provide the effective values - TODO confirm.
+class JR_HB_DESC : InstSE<(outs), (ins), "", [], II_JR_HB, FrmJ>,
+ JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
+ let isBranch=1;
+ let isIndirectBranch=1;
+ let hasDelaySlot=1;
+ // Ends its basic block and never falls through.
+ let isTerminator=1;
+ let isBarrier=1;
+ // Marks the record as a control transfer instruction (CTI).
+ bit isCTI = 1;
+}
+
+// Description of "jalr.hb" on GPR32Opnd. InstSE is instantiated with empty
+// operand lists and an empty asm string; presumably the same-named fields set
+// by JALR_HB_DESC_BASE provide the effective values - TODO confirm.
+// Unlike JR_HB_DESC, only the indirect-branch, delay-slot and CTI flags are
+// set here.
+class JALR_HB_DESC : InstSE<(outs), (ins), "", [], II_JALR_HB, FrmJ>,
+ JALR_HB_DESC_BASE<"jalr.hb", GPR32Opnd> {
+ let isIndirectBranch=1;
+ let hasDelaySlot=1;
+ bit isCTI = 1;
+}
+
+class JR_HB_ENC : JR_HB_FM<8>;
+class JALR_HB_ENC : JALR_HB_FM<9>;
+
+def JR_HB : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6;
+def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32;
+
+// Operand-less TLB instruction: no outs, no ins and no selection pattern.
+// `asmstr` is passed both as the assembly string and as the last InstSE
+// argument; `itin` defaults to NoItinerary.
+class TLB<string asmstr, InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
+let AdditionalPredicates = [NotInMicroMips] in {
+def TLBP : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM<0x08>;
+def TLBR : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM<0x01>;
+def TLBWI : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM<0x02>;
+def TLBWR : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM<0x06>;
+}
+// Instruction taking a memory operand $addr (of class MemOpnd) and a uimm5
+// $hint, written as "<instr_asm>\t$hint, $addr". It has no outputs and no
+// selection pattern; `itin` defaults to NoItinerary.
+class CacheOp<string instr_asm, Operand MemOpnd,
+ InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins MemOpnd:$addr, uimm5:$hint),
+ !strconcat(instr_asm, "\t$hint, $addr"), [], itin, FrmOther,
+ instr_asm> {
+ // Decoding is delegated to a custom decoder method.
+ let DecoderMethod = "DecodeCacheOp";
+}
+
+def CACHE : MMRel, CacheOp<"cache", mem, II_CACHE>, CACHEOP_FM<0b101111>,
+ INSN_MIPS3_32_NOT_32R6_64R6;
+def PREF : MMRel, CacheOp<"pref", mem, II_PREF>, CACHEOP_FM<0b110011>,
+ INSN_MIPS3_32_NOT_32R6_64R6;
+
+def ROL : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+ "rol\t$rs, $rt, $rd">;
+def ROLImm : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+ "rol\t$rs, $rt, $imm">;
+def : MipsInstAlias<"rol $rd, $rs",
+ (ROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"rol $rd, $imm",
+ (ROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>;
+
+def ROR : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+ "ror\t$rs, $rt, $rd">;
+def RORImm : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+ "ror\t$rs, $rt, $imm">;
+def : MipsInstAlias<"ror $rd, $rs",
+ (ROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"ror $rd, $imm",
+ (RORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>;
+
+def DROL : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+ "drol\t$rs, $rt, $rd">, ISA_MIPS64;
+def DROLImm : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+ "drol\t$rs, $rt, $imm">, ISA_MIPS64;
+def : MipsInstAlias<"drol $rd, $rs",
+ (DROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64;
+def : MipsInstAlias<"drol $rd, $imm",
+ (DROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64;
+
+def DROR : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+ "dror\t$rs, $rt, $rd">, ISA_MIPS64;
+def DRORImm : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+ "dror\t$rs, $rt, $imm">, ISA_MIPS64;
+def : MipsInstAlias<"dror $rd, $rs",
+ (DROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64;
+def : MipsInstAlias<"dror $rd, $imm",
+ (DRORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64;
+
+def ABSMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
+ "abs\t$rd, $rs">;
+
+def SEQMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "seq $rd, $rs, $rt">, NOT_ASE_CNMIPS;
+
+def : MipsInstAlias<"seq $rd, $rs",
+ (SEQMacro GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>,
+ NOT_ASE_CNMIPS;
+
+def SEQIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32_relaxed:$imm),
+ "seq $rd, $rs, $imm">, NOT_ASE_CNMIPS;
+
+def : MipsInstAlias<"seq $rd, $imm",
+ (SEQIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>,
+ NOT_ASE_CNMIPS;
+//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : MipsInstAlias<"move $dst, $src",
+ (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
+ GPR_32 {
+ let AdditionalPredicates = [NotInMicroMips];
+}
+def : MipsInstAlias<"move $dst, $src",
+ (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
+ GPR_32 {
+ let AdditionalPredicates = [NotInMicroMips];
+}
+def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<
+ "addu $rs, $rt, $imm",
+ (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+ "addu $rs, $imm",
+ (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+ "add $rs, $rt, $imm",
+ (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<
+ "add $rs, $imm",
+ (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<
+ "and $rs, $rt, $imm",
+ (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+ "and $rs, $imm",
+ (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
+let Predicates = [NotInMicroMips] in {
+def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+}
+def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32;
+def : MipsInstAlias<"neg $rt, $rs",
+ (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
+def : MipsInstAlias<"neg $rt",
+ (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>;
+def : MipsInstAlias<"negu $rt, $rs",
+ (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
+def : MipsInstAlias<"negu $rt",
+ (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<
+ "sgt $rd, $rs, $rt",
+ (SLT GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<
+ "sgt $rs, $rt",
+ (SLT GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<
+ "sgtu $rd, $rs, $rt",
+ (SLTu GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ // Two-operand form of sgtu: "sgtu $rs, $rt" is assembled as SLTu writing
+ // $rs, with $rt and $rs as the (swapped) source operands. The first operand
+ // is spelled "$rs" with a single '$', like the two-operand sgt alias.
+ def : MipsInstAlias<
+ "sgtu $rs, $rt",
+ (SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<
+ "slt $rs, $rt, $imm",
+ (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "sltu $rt, $rs, $imm",
+ (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "and $rs, $rt, $imm",
+ (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "and $rs, $imm",
+ (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "xor $rs, $rt, $imm",
+ (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "xor $rs, $imm",
+ (XORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "or $rs, $rt, $imm",
+ (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "or $rs, $imm",
+ (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "not $rt, $rs",
+ (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+ def : MipsInstAlias<
+ "not $rt",
+ (NOR GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
+ def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
+}
+def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
+def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, COP2Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 COP2Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
+let AdditionalPredicates = [NotInMicroMips] in {
+def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
+}
+def : MipsInstAlias<"bnez $rs,$offset",
+ (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"bnezl $rs,$offset",
+ (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"beqz $rs,$offset",
+ (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"beqzl $rs,$offset",
+ (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"syscall", (SYSCALL 0), 1>;
+}
+
+def : MipsInstAlias<"break", (BREAK 0, 0), 1>;
+def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2;
+ def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2;
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"teq $rs, $rt",
+ (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tge $rs, $rt",
+ (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tgeu $rs, $rt",
+ (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tlt $rs, $rt",
+ (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tltu $rs, $rt",
+ (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tne $rs, $rt",
+ (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+}
+// Three-operand "sub" with an immediate: assembled as ADDi, with the
+// immediate going through InvertedImOperand (presumably negating it - verify
+// against the operand definition). No comma may follow the mnemonic.
+def : MipsInstAlias<"sub $rd, $rs, $imm",
+ (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs,
+ InvertedImOperand:$imm), 0>, ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"sub $rs, $imm",
+ (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, InvertedImOperand:$imm),
+ 0>, ISA_MIPS1_NOT_32R6_64R6;
+// Three-operand "subu" with an immediate: assembled as ADDiu, with the
+// immediate going through InvertedImOperand (presumably negating it - verify
+// against the operand definition). No comma may follow the mnemonic.
+def : MipsInstAlias<"subu $rd, $rs, $imm",
+ (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs,
+ InvertedImOperand:$imm), 0>;
+def : MipsInstAlias<"subu $rs, $imm", (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs,
+ InvertedImOperand:$imm), 0>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"sll $rd, $rt, $rs",
+ (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"sra $rd, $rt, $rs",
+ (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"srl $rd, $rt, $rs",
+ (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"sll $rd, $rt",
+ (SLLV GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+ def : MipsInstAlias<"sra $rd, $rt",
+ (SRAV GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+ def : MipsInstAlias<"srl $rd, $rt",
+ (SRLV GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+ def : MipsInstAlias<"seh $rd", (SEH GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+ ISA_MIPS32R2;
+ def : MipsInstAlias<"seb $rd", (SEB GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+ ISA_MIPS32R2;
+}
+def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
+def : MipsInstAlias<"sync",
+ (SYNC 0), 1>, ISA_MIPS2;
+//===----------------------------------------------------------------------===//
+// Assembler Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// We use uimm32_coerced to accept a 33 bit signed number that is rendered into
+// a 32 bit number.
+// Assembler pseudo with one output register $rt (class RO) and one immediate
+// input $imm32 (operand Od), written as "<instr_asm>\t$rt, $imm32".
+class LoadImmediate32<string instr_asm, Operand Od, RegisterOperand RO> :
+ MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
+ !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadImm32 : LoadImmediate32<"li", uimm32_coerced, GPR32Opnd>;
+
+// Assembler pseudo with one output register $rt (class RO) and one memory
+// operand input $addr (operand MemOpnd), written as "<instr_asm>\t$rt, $addr".
+class LoadAddressFromReg32<string instr_asm, Operand MemOpnd,
+ RegisterOperand RO> :
+ MipsAsmPseudoInst<(outs RO:$rt), (ins MemOpnd:$addr),
+ !strconcat(instr_asm, "\t$rt, $addr")> ;
+def LoadAddrReg32 : LoadAddressFromReg32<"la", mem, GPR32Opnd>;
+
+// Assembler pseudo with one output register $rt (class RO) and one immediate
+// input $imm32 (operand Od), written as "<instr_asm>\t$rt, $imm32".
+class LoadAddressFromImm32<string instr_asm, Operand Od, RegisterOperand RO> :
+ MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
+ !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadAddrImm32 : LoadAddressFromImm32<"la", i32imm, GPR32Opnd>;
+
+def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
+ "jal\t$rd, $rs"> ;
+def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs),
+ "jal\t$rs"> ;
+
+def NORImm : MipsAsmPseudoInst<
+ (outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm32:$imm),
+ "nor\t$rs, $rt, $imm"> ;
+
+// Assembler branch pseudos. Everything defined inside this block is flagged
+// as a CTI with a delay slot.
+let hasDelaySlot = 1, isCTI = 1 in {
+// bne/beq with an immediate second operand: "$rt, $imm64, $offset".
+// NOTE(review): $rt is listed in (outs) here, whereas CondBranchPseudo below
+// lists its registers in (ins) - confirm this is intentional.
+def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
+ (ins imm64:$imm64, brtarget:$offset),
+ "bne\t$rt, $imm64, $offset">;
+def BeqImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
+ (ins imm64:$imm64, brtarget:$offset),
+ "beq\t$rt, $imm64, $offset">;
+
+// Register/register conditional branch pseudo: no outputs; inputs are $rs,
+// $rt and the branch target $offset, written as
+// "<instr_asm>\t$rs, $rt, $offset".
+class CondBranchPseudo<string instr_asm> :
+ MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt,
+ brtarget:$offset),
+ !strconcat(instr_asm, "\t$rs, $rt, $offset")>;
+}
+
+def BLT : CondBranchPseudo<"blt">;
+def BLE : CondBranchPseudo<"ble">;
+def BGE : CondBranchPseudo<"bge">;
+def BGT : CondBranchPseudo<"bgt">;
+def BLTU : CondBranchPseudo<"bltu">;
+def BLEU : CondBranchPseudo<"bleu">;
+def BGEU : CondBranchPseudo<"bgeu">;
+def BGTU : CondBranchPseudo<"bgtu">;
+def BLTL : CondBranchPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLEL : CondBranchPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGEL : CondBranchPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTL : CondBranchPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLTUL: CondBranchPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6;
+def BLEUL: CondBranchPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGEUL: CondBranchPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
+
+// Register/immediate conditional branch pseudo: no outputs; inputs are $rs,
+// a 64-bit immediate operand $imm and the branch target $offset, written as
+// "<instr_asm>\t$rs, $imm, $offset".
+// NOTE(review): only isCTI is set here; hasDelaySlot is not - confirm whether
+// that difference from CondBranchPseudo is intentional.
+let isCTI = 1 in
+class CondBranchImmPseudo<string instr_asm> :
+ MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset),
+ !strconcat(instr_asm, "\t$rs, $imm, $offset")>;
+
+def BLTImmMacro : CondBranchImmPseudo<"blt">;
+def BLEImmMacro : CondBranchImmPseudo<"ble">;
+def BGEImmMacro : CondBranchImmPseudo<"bge">;
+def BGTImmMacro : CondBranchImmPseudo<"bgt">;
+def BLTUImmMacro : CondBranchImmPseudo<"bltu">;
+def BLEUImmMacro : CondBranchImmPseudo<"bleu">;
+def BGEUImmMacro : CondBranchImmPseudo<"bgeu">;
+def BGTUImmMacro : CondBranchImmPseudo<"bgtu">;
+def BLTLImmMacro : CondBranchImmPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLELImmMacro : CondBranchImmPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGELImmMacro : CondBranchImmPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTLImmMacro : CondBranchImmPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLTULImmMacro : CondBranchImmPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6;
+def BLEULImmMacro : CondBranchImmPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGEULImmMacro : CondBranchImmPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
+
+// FIXME: Predicates are removed because instructions are matched regardless of
+// predicates, because PredicateControl was not in the hierarchy. This was
+// done to emit more precise error messages from the expansion function.
+// Once the tablegen-erated errors are made better, this needs to be fixed and
+// the predicates need to be restored.
+
+def SDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "div\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def UDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "divu\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"div $rt, $rs", (SDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"divu $rt, $rs", (UDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def DSDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "ddiv\t$rd, $rs, $rt">,
+ ISA_MIPS64_NOT_64R6;
+def DUDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "ddivu\t$rd, $rs, $rt">,
+ ISA_MIPS64_NOT_64R6;
+def : MipsInstAlias<"ddiv $rt, $rs", (DSDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS64_NOT_64R6;
+def : MipsInstAlias<"ddivu $rt, $rs", (DUDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS64_NOT_64R6;
+
+def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+ "ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def Ulhu : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+ "ulhu\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def Ulw : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+ "ulw\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def Ush : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+ "ush\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def Usw : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+ "usw\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def LDMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
+ (ins mem_simm16:$addr), "ld $rt, $addr">,
+ ISA_MIPS1_NOT_MIPS3;
+def SDMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
+ (ins mem_simm16:$addr), "sd $rt, $addr">,
+ ISA_MIPS1_NOT_MIPS3;
+//===----------------------------------------------------------------------===//
+// Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Load/store pattern templates.
+// Selects LoadInst for the load fragment `Node` producing a ValTy, when the
+// address matches addrRegImm.
+class LoadRegImmPat<Instruction LoadInst, ValueType ValTy, PatFrag Node> :
+ MipsPat<(ValTy (Node addrRegImm:$a)), (LoadInst addrRegImm:$a)>;
+
+// Selects StoreInst for a plain `store` of a ValTy value, when the address
+// matches addrRegImm.
+class StoreRegImmPat<Instruction StoreInst, ValueType ValTy> :
+ MipsPat<(store ValTy:$v, addrRegImm:$a), (StoreInst ValTy:$v, addrRegImm:$a)>;
+
+// Materialize constants.
+// Patterns that build an integer constant of type VT, parameterized over the
+// zero register and the add-immediate / load-upper-immediate / or-immediate
+// instructions to use.
+multiclass MaterializeImms<ValueType VT, Register ZEROReg,
+ Instruction ADDiuOp, Instruction LUiOp,
+ Instruction ORiOp> {
+
+// Small immediates
+// One instruction: ADDiuOp from ZEROReg for immSExt16, ORiOp from ZEROReg for
+// immZExt16.
+def : MipsPat<(VT immSExt16:$imm), (ADDiuOp ZEROReg, imm:$imm)>;
+def : MipsPat<(VT immZExt16:$imm), (ORiOp ZEROReg, imm:$imm)>;
+
+// Bits 32-16 set, sign/zero extended.
+// One instruction: LUiOp of the HI16 part of the immediate.
+def : MipsPat<(VT immSExt32Low16Zero:$imm), (LUiOp (HI16 imm:$imm))>;
+
+// Arbitrary immediates
+// Two instructions: LUiOp of the HI16 part, then ORiOp with the LO16 part.
+def : MipsPat<(VT immSExt32:$imm), (ORiOp (LUiOp (HI16 imm:$imm)), (LO16 imm:$imm))>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in
+ defm : MaterializeImms<i32, ZERO, ADDiu, LUi, ORi>;
+
+// Carry MipsPatterns
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+ (SUBu GPR32:$lhs, GPR32:$rhs)>;
+}
+def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs),
+ (ADDu GPR32:$lhs, GPR32:$rhs)>, ASE_NOT_DSP;
+def : MipsPat<(addc GPR32:$src, immSExt16:$imm),
+ (ADDiu GPR32:$src, imm:$imm)>, ASE_NOT_DSP;
+
+// Support multiplication for pre-Mips32 targets that don't have
+// the MUL instruction.
+def : MipsPat<(mul GPR32:$lhs, GPR32:$rhs),
+ (PseudoMFLO (PseudoMULT GPR32:$lhs, GPR32:$rhs))>,
+ ISA_MIPS1_NOT_32R6_64R6;
+
+// SYNC
+def : MipsPat<(MipsSync (i32 immz)),
+ (SYNC 0)>, ISA_MIPS2;
+
+// Call
+def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
+ (JAL texternalsym:$dst)>;
+//def : MipsPat<(MipsJmpLink GPR32:$dst),
+// (JALR GPR32:$dst)>;
+
+// Tail call
+def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+ (TAILCALL tglobaladdr:$dst)>;
+def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+ (TAILCALL texternalsym:$dst)>;
+// hi/lo relocs
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsHi texternalsym:$in), (LUi texternalsym:$in)>;
+
+def : MipsPat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
+def : MipsPat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
+def : MipsPat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
+def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsLo texternalsym:$in), (ADDiu ZERO, texternalsym:$in)>;
+
+def : MipsPat<(add GPR32:$hi, (MipsLo tglobaladdr:$lo)),
+ (ADDiu GPR32:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tblockaddress:$lo)),
+ (ADDiu GPR32:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tjumptable:$lo)),
+ (ADDiu GPR32:$hi, tjumptable:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tconstpool:$lo)),
+ (ADDiu GPR32:$hi, tconstpool:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tglobaltlsaddr:$lo)),
+ (ADDiu GPR32:$hi, tglobaltlsaddr:$lo)>;
+
+// gp_rel relocs
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
+ (ADDiu GPR32:$gp, tglobaladdr:$in)>;
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
+ (ADDiu GPR32:$gp, tconstpool:$in)>;
+
+// wrapper_pic
+// Selects ADDiuOp for (MipsWrapper $gp, node), adding the symbolic operand
+// `node` to the register $gp of class RC.
+class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+ MipsPat<(MipsWrapper RC:$gp, node:$in),
+ (ADDiuOp RC:$gp, node:$in)>;
+
+def : WrapperPat<tglobaladdr, ADDiu, GPR32>;
+def : WrapperPat<tconstpool, ADDiu, GPR32>;
+def : WrapperPat<texternalsym, ADDiu, GPR32>;
+def : WrapperPat<tblockaddress, ADDiu, GPR32>;
+def : WrapperPat<tjumptable, ADDiu, GPR32>;
+def : WrapperPat<tglobaltlsaddr, ADDiu, GPR32>;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+// Mips does not have "not", so we expand our way
+def : MipsPat<(not GPR32:$in),
+ (NOR GPR32Opnd:$in, ZERO)>;
+}
+
+// extended loads
+def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>;
+def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
+}
+
+// peepholes
+def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
+
+// brcond patterns
+// Selection patterns for `brcond` on register class RC, parameterized over
+// the branch and set-on-less-than instructions to use.
+// BEQOp/BNEOp compare an RC register against ZEROReg; BEQOp1 tests the result
+// of a set-on-less-than instruction against ZERO.
+multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BEQOp1,
+ Instruction BNEOp, Instruction SLTOp, Instruction SLTuOp,
+ Instruction SLTiOp, Instruction SLTiuOp,
+ Register ZEROReg> {
+// Compare against zero: branch directly on the register.
+def : MipsPat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst),
+ (BNEOp RC:$lhs, ZEROReg, bb:$dst)>;
+def : MipsPat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst),
+ (BEQOp RC:$lhs, ZEROReg, bb:$dst)>;
+
+// lhs >= rhs: branch when (lhs < rhs) is zero. The setgt/setugt forms use
+// an immediate one larger (Plus1), since lhs > imm is !(lhs < imm + 1).
+def : MipsPat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst),
+ (BEQOp1 (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst),
+ (BEQOp1 (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+ (BEQOp1 (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+ (BEQOp1 (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+ (BEQOp1 (SLTiOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setugt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+ (BEQOp1 (SLTiuOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
+
+// lhs <= rhs: branch when (rhs < lhs) is zero (operands swapped).
+def : MipsPat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
+ (BEQOp1 (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst),
+ (BEQOp1 (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+
+// Generic condition register: branch when it is not equal to ZEROReg.
+def : MipsPat<(brcond RC:$cond, bb:$dst),
+ (BNEOp RC:$cond, ZEROReg, bb:$dst)>;
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+ defm : BrcondPats<GPR32, BEQ, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
+}
+def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
+ (BLEZ i32:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
+ (BGEZ i32:$lhs, bb:$dst)>;
+
+// setcc patterns
+// seteq/setne patterns for register class RC.
+// Equality with zero is "x <u 1"; inequality with zero is "0 <u x". The
+// two-register forms apply the same tests to (lhs XOR rhs).
+multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
+ Instruction SLTuOp, Register ZEROReg> {
+ def : MipsPat<(seteq RC:$lhs, 0),
+ (SLTiuOp RC:$lhs, 1)>;
+ def : MipsPat<(setne RC:$lhs, 0),
+ (SLTuOp ZEROReg, RC:$lhs)>;
+ def : MipsPat<(seteq RC:$lhs, RC:$rhs),
+ (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
+ def : MipsPat<(setne RC:$lhs, RC:$rhs),
+ (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>;
+}
+
+// setle/setule patterns for register class RC: lhs <= rhs is computed as
+// (rhs < lhs) XOR 1, using the signed or unsigned set-on-less-than.
+multiclass SetlePats<RegisterClass RC, Instruction XORiOp, Instruction SLTOp,
+ Instruction SLTuOp> {
+ def : MipsPat<(setle RC:$lhs, RC:$rhs),
+ (XORiOp (SLTOp RC:$rhs, RC:$lhs), 1)>;
+ def : MipsPat<(setule RC:$lhs, RC:$rhs),
+ (XORiOp (SLTuOp RC:$rhs, RC:$lhs), 1)>;
+}
+
+// setgt/setugt patterns for register class RC: lhs > rhs is computed as
+// rhs < lhs, i.e. set-on-less-than with the operands swapped.
+multiclass SetgtPats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+ def : MipsPat<(setgt RC:$lhs, RC:$rhs),
+ (SLTOp RC:$rhs, RC:$lhs)>;
+ def : MipsPat<(setugt RC:$lhs, RC:$rhs),
+ (SLTuOp RC:$rhs, RC:$lhs)>;
+}
+
+// setge/setuge patterns for register class RC: lhs >= rhs is computed as
+// (lhs < rhs) XOR 1, using the signed or unsigned set-on-less-than.
+multiclass SetgePats<RegisterClass RC, Instruction XORiOp, Instruction SLTOp,
+ Instruction SLTuOp> {
+ def : MipsPat<(setge RC:$lhs, RC:$rhs),
+ (XORiOp (SLTOp RC:$lhs, RC:$rhs), 1)>;
+ def : MipsPat<(setuge RC:$lhs, RC:$rhs),
+ (XORiOp (SLTuOp RC:$lhs, RC:$rhs), 1)>;
+}
+
+// setge/setuge patterns against an immSExt16 immediate: lhs >= imm is
+// computed as (lhs < imm) XOR 1, using the immediate set-on-less-than forms.
+multiclass SetgeImmPats<RegisterClass RC, Instruction XORiOp,
+ Instruction SLTiOp, Instruction SLTiuOp> {
+ def : MipsPat<(setge RC:$lhs, immSExt16:$rhs),
+ (XORiOp (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>;
+ def : MipsPat<(setuge RC:$lhs, immSExt16:$rhs),
+ (XORiOp (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ defm : SeteqPats<GPR32, SLTiu, XOR, SLTu, ZERO>;
+ defm : SetlePats<GPR32, XORi, SLT, SLTu>;
+ defm : SetgtPats<GPR32, SLT, SLTu>;
+ defm : SetgePats<GPR32, XORi, SLT, SLTu>;
+ defm : SetgeImmPats<GPR32, XORi, SLTi, SLTiu>;
+}
+
+// bswap pattern
+def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
+
+// Load halfword/word patterns.
+let AddedComplexity = 40 in {
+ def : LoadRegImmPat<LBu, i32, zextloadi8>;
+ let AdditionalPredicates = [NotInMicroMips] in {
+ def : LoadRegImmPat<LH, i32, sextloadi16>;
+ def : LoadRegImmPat<LW, i32, load>;
+ }
+}
+
+// Atomic load patterns.
+def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>;
+}
+def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>;
+
+// Atomic store patterns.
+def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>;
+def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>;
+def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFPU.td"
+include "Mips64InstrInfo.td"
+include "MipsCondMov.td"
+
+include "Mips32r6InstrInfo.td"
+include "Mips64r6InstrInfo.td"
+
+//
+// Mips16
+
+include "Mips16InstrFormats.td"
+include "Mips16InstrInfo.td"
+
+// DSP
+include "MipsDSPInstrFormats.td"
+include "MipsDSPInstrInfo.td"
+
+// MSA
+include "MipsMSAInstrFormats.td"
+include "MipsMSAInstrInfo.td"
+
+// EVA
+include "MipsEVAInstrFormats.td"
+include "MipsEVAInstrInfo.td"
+
+// Micromips
+include "MicroMipsInstrFormats.td"
+include "MicroMipsInstrInfo.td"
+include "MicroMipsInstrFPU.td"
+
+// Micromips r6
+include "MicroMips32r6InstrFormats.td"
+include "MicroMips32r6InstrInfo.td"
+
+// Micromips64 r6
+include "MicroMips64r6InstrFormats.td"
+include "MicroMips64r6InstrInfo.td"
+
+// Micromips DSP
+include "MicroMipsDSPInstrFormats.td"
+include "MicroMipsDSPInstrInfo.td"
diff --git a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
new file mode 100644
index 000000000000..1087d0e0140e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
@@ -0,0 +1,532 @@
+//===-- MipsLongBranch.cpp - Emit long branches ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands a branch or jump instruction into a long branch if its
+// offset is too large to fit into its immediate field.
+//
+// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsMCNaCl.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-long-branch"
+
+STATISTIC(LongBranches, "Number of long branches."); // Incremented once per branch selected for expansion.
+
+static cl::opt<bool> SkipLongBranch( // When set, runOnMachineFunction returns before analysing any branch.
+ "skip-mips-long-branch",
+ cl::init(false),
+ cl::desc("MIPS: Skip long branch pass."),
+ cl::Hidden);
+
+static cl::opt<bool> ForceLongBranch( // When set, the 16-bit range check is bypassed for every candidate branch.
+ "force-mips-long-branch",
+ cl::init(false),
+ cl::desc("MIPS: Expand all branches to long format."),
+ cl::Hidden);
+
+namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+  /// Per-basic-block bookkeeping; MBBInfos holds one entry per block number.
+  struct MBBInfo {
+    uint64_t Size, Address; // Size in bytes; cumulative byte offset (PIC only).
+    bool HasLongBranch;     // Set once the block's branch is picked for expansion.
+    MachineInstr *Br;       // Candidate direct branch ending the block, or null.
+
+    MBBInfo() : Size(0), Address(0), HasLongBranch(false), Br(nullptr) {}
+  };
+  /// Machine function pass that expands out-of-range branches (see file header).
+  class MipsLongBranch : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    MipsLongBranch(TargetMachine &tm)
+        : MachineFunctionPass(ID), TM(tm), IsPIC(TM.isPositionIndependent()),
+          ABI(static_cast<const MipsTargetMachine &>(TM).getABI()) {}
+
+    StringRef getPassName() const override { return "Mips Long Branch"; }
+
+    bool runOnMachineFunction(MachineFunction &F) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+  private:
+    void splitMBB(MachineBasicBlock *MBB);
+    void initMBBInfo();
+    int64_t computeOffset(const MachineInstr *Br);
+    void replaceBranch(MachineBasicBlock &MBB, Iter Br, const DebugLoc &DL,
+                       MachineBasicBlock *MBBOpnd);
+    void expandToLongBranch(MBBInfo &Info);
+
+    const TargetMachine &TM;
+    MachineFunction *MF;              // Set at the start of each run, before use.
+    SmallVector<MBBInfo, 16> MBBInfos; // Indexed by basic block number.
+    bool IsPIC;
+    MipsABIInfo ABI;
+    unsigned LongBranchSeqSize;       // Instruction count of one long-branch sequence.
+  };
+
+  char MipsLongBranch::ID = 0;
+} // end of anonymous namespace
+
+/// createMipsLongBranchPass - Returns a newly allocated MipsLongBranch pass,
+/// which converts out-of-range branches to long branches.
+FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
+ return new MipsLongBranch(tm);
+}
+
+/// Return the basic block that \p Br targets: the first MachineBasicBlock
+/// operand among the operands declared by Br's instruction descriptor.
+static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
+  const unsigned NumDeclaredOps = Br.getDesc().getNumOperands();
+  for (unsigned Idx = 0; Idx != NumDeclaredOps; ++Idx) {
+    const MachineOperand &Opnd = Br.getOperand(Idx);
+    if (!Opnd.isMBB())
+      continue;
+    return Opnd.getMBB();
+  }
+  llvm_unreachable("This instruction does not have an MBB operand.");
+}
+
+// Walk the reverse range [B, E) and return the first instruction that is not
+// a debug value; return E when every instruction in the range is one.
+static ReverseIter getNonDebugInstr(ReverseIter B, const ReverseIter &E) {
+  while (B != E && B->isDebugValue())
+    ++B;
+
+  // B now points at the first non-debug instruction, or equals E.
+  return B;
+}
+
+// Split MBB if its last two non-debug instructions are both direct jumps/branches.
+void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
+ ReverseIter End = MBB->rend();
+ ReverseIter LastBr = getNonDebugInstr(MBB->rbegin(), End); // Last non-debug instruction.
+
+ // Return if MBB has no branch instructions.
+ if ((LastBr == End) ||
+ (!LastBr->isConditionalBranch() && !LastBr->isUnconditionalBranch()))
+ return;
+
+ ReverseIter FirstBr = getNonDebugInstr(std::next(LastBr), End); // Non-debug instruction before LastBr.
+
+ // MBB has only one branch instruction if FirstBr is not a branch
+ // instruction.
+ if ((FirstBr == End) ||
+ (!FirstBr->isConditionalBranch() && !FirstBr->isUnconditionalBranch()))
+ return;
+
+ assert(!FirstBr->isIndirectBranch() && "Unexpected indirect branch found.");
+
+ // Create a new MBB. LastBr and everything after it is moved there below.
+ MachineBasicBlock *NewMBB =
+ MF->CreateMachineBasicBlock(MBB->getBasicBlock());
+
+ // Insert NewMBB and fix control flow.
+ MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
+ NewMBB->transferSuccessors(MBB); // NewMBB takes over all of MBB's successors...
+ NewMBB->removeSuccessor(Tgt, true); // ...except the target of FirstBr, which stays in MBB.
+ MBB->addSuccessor(NewMBB);
+ MBB->addSuccessor(Tgt);
+ MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB); // NewMBB goes right after MBB.
+
+ NewMBB->splice(NewMBB->end(), MBB, LastBr.getReverse(), MBB->end());
+}
+
+// Fill MBBInfos: record each block's size in bytes and its candidate branch.
+void MipsLongBranch::initMBBInfo() {
+ // Split the MBBs if they have two branches. Each basic block should have at
+ // most one branch after this loop is executed.
+ for (auto &MBB : *MF)
+ splitMBB(&MBB);
+
+ MF->RenumberBlocks(); // Block numbers are used as indices into MBBInfos below.
+ MBBInfos.clear();
+ MBBInfos.resize(MF->size());
+
+ const MipsInstrInfo *TII =
+ static_cast<const MipsInstrInfo *>(MF->getSubtarget().getInstrInfo());
+ for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
+ MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+
+ // Compute size of MBB in bytes, visiting every instruction via instr_iterator.
+ for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin();
+ MI != MBB->instr_end(); ++MI)
+ MBBInfos[I].Size += TII->getInstSizeInBytes(*MI);
+
+ // Search for MBB's branch instruction: its last non-debug instruction.
+ ReverseIter End = MBB->rend();
+ ReverseIter Br = getNonDebugInstr(MBB->rbegin(), End);
+
+ if ((Br != End) && !Br->isIndirectBranch() &&
+ (Br->isConditionalBranch() || (Br->isUnconditionalBranch() && IsPIC)))
+ MBBInfos[I].Br = &*Br; // Direct conditional branch, or direct unconditional branch when PIC.
+ }
+}
+
+// Estimate, in bytes, the offset of direct branch Br from the block sizes
+// recorded in MBBInfos.
+int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
+  const int SrcNum = Br->getParent()->getNumber();
+  const int DstNum = getTargetMBB(*Br)->getNumber();
+  int64_t Dist = 0;
+
+  if (SrcNum >= DstNum) {
+    // Backward branch (or branch to its own block): add up every block from
+    // the one holding the branch down to and including the target.
+    for (int N = SrcNum; N >= DstNum; --N)
+      Dist += MBBInfos[N].Size;
+    return -Dist + 4;
+  }
+
+  // Forward branch: add up the blocks strictly between source and target.
+  for (int N = SrcNum + 1; N < DstNum; ++N)
+    Dist += MBBInfos[N].Size;
+  return Dist + 4;
+}
+
+// Replace Br with a branch which has the opposite condition code and a
+// MachineBasicBlock operand MBBOpnd. Br is erased from MBB.
+void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
+ const DebugLoc &DL,
+ MachineBasicBlock *MBBOpnd) {
+ const MipsInstrInfo *TII = static_cast<const MipsInstrInfo *>(
+ MBB.getParent()->getSubtarget().getInstrInfo());
+ unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
+ const MCInstrDesc &NewDesc = TII->get(NewOpc);
+
+ MachineInstrBuilder MIB = BuildMI(MBB, Br, DL, NewDesc); // New branch is inserted before Br.
+
+ for (unsigned I = 0, E = Br->getDesc().getNumOperands(); I < E; ++I) { // Copy Br's leading register operands.
+ MachineOperand &MO = Br->getOperand(I);
+
+ if (!MO.isReg()) {
+ assert(MO.isMBB() && "MBB operand expected.");
+ break; // Stop at the old target; MBBOpnd is added in its place below.
+ }
+
+ MIB.addReg(MO.getReg());
+ }
+
+ MIB.addMBB(MBBOpnd);
+
+ if (Br->hasDelaySlot()) {
+ // Bundle the instruction in the delay slot to the newly created branch
+ // and erase the original branch.
+ assert(Br->isBundledWithSucc());
+ MachineBasicBlock::instr_iterator II = Br.getInstrIterator();
+ MIBundleBuilder(&*MIB).append((++II)->removeFromBundle()); // ++II steps from Br to its delay-slot instruction.
+ }
+ Br->eraseFromParent();
+}
+
+// Expand the branch recorded in I.Br into a long branch sequence.
+// TODO: This function has to be fixed for beqz16 and bnez16, because it
+// currently assumes that all branches have 16-bit offsets, and will produce
+// wrong code if branches whose allowed offsets are [-128, -126, ..., 126]
+// are present.
+void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
+ MachineBasicBlock::iterator Pos;
+ MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
+ DebugLoc DL = I.Br->getDebugLoc();
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB); // Block laid out right after MBB.
+ MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
+ const MipsSubtarget &Subtarget =
+ static_cast<const MipsSubtarget &>(MF->getSubtarget());
+ const MipsInstrInfo *TII =
+ static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo());
+
+ MF->insert(FallThroughMBB, LongBrMBB); // LongBrMBB is placed between MBB and its fall-through block.
+ MBB->replaceSuccessor(TgtMBB, LongBrMBB);
+
+ if (IsPIC) {
+ MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(FallThroughMBB, BalTgtMBB); // BalTgtMBB follows LongBrMBB.
+ LongBrMBB->addSuccessor(BalTgtMBB);
+ BalTgtMBB->addSuccessor(TgtMBB);
+
+ // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal
+ // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is a
+ // pseudo-instruction wrapping BGEZAL).
+ unsigned BalOp = Subtarget.hasMips32r6() ? Mips::BAL : Mips::BAL_BR;
+
+ if (!ABI.IsN64()) {
+ // $longbr:
+ // addiu $sp, $sp, -8
+ // sw $ra, 0($sp)
+ // lui $at, %hi($tgt - $baltgt)
+ // bal $baltgt
+ // addiu $at, $at, %lo($tgt - $baltgt)
+ // $baltgt:
+ // addu $at, $ra, $at
+ // lw $ra, 0($sp)
+ // jr $at
+ // addiu $sp, $sp, 8
+ // $fallthrough:
+ //
+
+ Pos = LongBrMBB->begin();
+
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(-8);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
+ .addReg(Mips::SP).addImm(0);
+
+ // LUi and ADDiu instructions create 32-bit offset of the target basic
+ // block from the target of BAL instruction. We cannot use immediate
+ // value for this offset because it cannot be determined accurately when
+ // the program has inline assembly statements. We therefore use the
+ // relocation expressions %hi($tgt-$baltgt) and %lo($tgt-$baltgt) which
+ // are resolved during the fixup, so the values will always be correct.
+ //
+ // Since we cannot create %hi($tgt-$baltgt) and %lo($tgt-$baltgt)
+ // expressions at this point (it is possible only at the MC layer),
+ // we replace LUi and ADDiu with pseudo instructions
+ // LONG_BRANCH_LUi and LONG_BRANCH_ADDiu, and add both basic
+ // blocks as operands to these instructions. When lowering these pseudo
+ // instructions to LUi and ADDiu in the MC layer, we will create
+ // %hi($tgt-$baltgt) and %lo($tgt-$baltgt) expressions and add them as
+ // operands to lowered instructions.
+
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT)
+ .addMBB(TgtMBB).addMBB(BalTgtMBB);
+ MIBundleBuilder(*LongBrMBB, Pos) // BAL and the %lo pseudo are emitted as one bundle.
+ .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+ .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
+ .addReg(Mips::AT)
+ .addMBB(TgtMBB)
+ .addMBB(BalTgtMBB));
+
+ Pos = BalTgtMBB->begin();
+
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
+ .addReg(Mips::RA).addReg(Mips::AT);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
+ .addReg(Mips::SP).addImm(0);
+
+ // In NaCl, modifying the sp is not allowed in branch delay slot.
+ if (Subtarget.isTargetNaCl())
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(8);
+
+ if (Subtarget.hasMips32r6())
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR))
+ .addReg(Mips::ZERO).addReg(Mips::AT);
+ else
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT);
+
+ if (Subtarget.isTargetNaCl()) {
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::NOP));
+ // Bundle-align the target of indirect branch JR.
+ TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+ } else
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(8);
+
+ BalTgtMBB->rbegin()->bundleWithPred(); // Bundle the last instruction with the jump before it.
+ } else {
+ // $longbr:
+ // daddiu $sp, $sp, -16
+ // sd $ra, 0($sp)
+ // daddiu $at, $zero, %hi($tgt - $baltgt)
+ // dsll $at, $at, 16
+ // bal $baltgt
+ // daddiu $at, $at, %lo($tgt - $baltgt)
+ // $baltgt:
+ // daddu $at, $ra, $at
+ // ld $ra, 0($sp)
+ // jr64 $at
+ // daddiu $sp, $sp, 16
+ // $fallthrough:
+ //
+
+ // We assume the branch is within-function, and that offset is within
+ // +/- 2GB. High 32 bits will therefore always be zero.
+
+ // Note that this will work even if the offset is negative, because
+ // of the +1 modification that's added in that case. For example, if the
+ // offset is -1MB (0xFFFFFFFFFFF00000), the computation for %higher is
+ //
+ // 0xFFFFFFFFFFF00000 + 0x80008000 = 0x000000007FF08000
+ //
+ // and the bits [47:32] are zero. For %highest
+ //
+ // 0xFFFFFFFFFFF00000 + 0x800080008000 = 0x000080007FF08000
+ //
+ // and the bits [63:48] are zero.
+
+ Pos = LongBrMBB->begin();
+
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+ .addReg(Mips::SP_64).addImm(-16);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
+ .addReg(Mips::SP_64).addImm(0);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+ Mips::AT_64).addReg(Mips::ZERO_64)
+ .addMBB(TgtMBB, MipsII::MO_ABS_HI).addMBB(BalTgtMBB); // MO_ABS_HI is lowered to %hi.
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(16);
+
+ MIBundleBuilder(*LongBrMBB, Pos)
+ .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+ .append(
+ BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64)
+ .addReg(Mips::AT_64)
+ .addMBB(TgtMBB, MipsII::MO_ABS_LO) // MO_ABS_LO is lowered to %lo.
+ .addMBB(BalTgtMBB));
+
+ Pos = BalTgtMBB->begin();
+
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
+ .addReg(Mips::RA_64).addReg(Mips::AT_64);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
+ .addReg(Mips::SP_64).addImm(0);
+
+ if (Subtarget.hasMips64r6())
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64))
+ .addReg(Mips::ZERO_64).addReg(Mips::AT_64);
+ else
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64);
+
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+ .addReg(Mips::SP_64).addImm(16);
+ BalTgtMBB->rbegin()->bundleWithPred(); // Bundle the stack restore with the jump before it.
+ }
+
+ assert(LongBrMBB->size() + BalTgtMBB->size() == LongBranchSeqSize); // Must match the size assumed in runOnMachineFunction.
+ } else {
+ // $longbr:
+ // j $tgt
+ // nop
+ // $fallthrough:
+ //
+ Pos = LongBrMBB->begin();
+ LongBrMBB->addSuccessor(TgtMBB);
+ MIBundleBuilder(*LongBrMBB, Pos)
+ .append(BuildMI(*MF, DL, TII->get(Mips::J)).addMBB(TgtMBB))
+ .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+
+ assert(LongBrMBB->size() == LongBranchSeqSize);
+ }
+
+ if (I.Br->isUnconditionalBranch()) {
+ // Change branch destination.
+ assert(I.Br->getDesc().getNumOperands() == 1);
+ I.Br->RemoveOperand(0);
+ I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB));
+ } else
+ // Change branch destination and reverse condition.
+ replaceBranch(*MBB, I.Br, DL, &*FallThroughMBB);
+}
+
+static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
+  MachineBasicBlock &Entry = F.front(); // Both instructions go at the top of the first block.
+  const MachineBasicBlock::iterator InsertPt = Entry.begin();
+  const DebugLoc Loc = Entry.findDebugLoc(InsertPt);
+  BuildMI(Entry, InsertPt, Loc, TII->get(Mips::LUi), Mips::V0)
+      .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+  BuildMI(Entry, InsertPt, Loc, TII->get(Mips::ADDiu), Mips::V0)
+      .addReg(Mips::V0).addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+  Entry.removeLiveIn(Mips::V0); // V0 is now defined here, so it is no longer a live-in.
+}
+
+bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) { // Pass entry point: pick out-of-range branches, then expand them.
+ const MipsSubtarget &STI =
+ static_cast<const MipsSubtarget &>(F.getSubtarget());
+ const MipsInstrInfo *TII =
+ static_cast<const MipsInstrInfo *>(STI.getInstrInfo());
+ LongBranchSeqSize = // Instruction count of one expansion; asserted in expandToLongBranch.
+ !IsPIC ? 2 : (ABI.IsN64() ? 10 : (!STI.isTargetNaCl() ? 9 : 10));
+
+ if (STI.inMips16Mode() || !STI.enableLongBranchPass())
+ return false;
+ if (IsPIC && static_cast<const MipsTargetMachine &>(TM).getABI().IsO32() &&
+ F.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
+ emitGPDisp(F, TII);
+
+ if (SkipLongBranch)
+ return true;
+
+ MF = &F;
+ initMBBInfo();
+
+ SmallVectorImpl<MBBInfo>::iterator I, E = MBBInfos.end();
+ bool EverMadeChange = false, MadeChange = true;
+
+ while (MadeChange) { // Iterate to a fixed point: each expansion grows a block and may push other branches out of range.
+ MadeChange = false;
+
+ for (I = MBBInfos.begin(); I != E; ++I) {
+ // Skip if this MBB doesn't have a branch or the branch has already been
+ // converted to a long branch.
+ if (!I->Br || I->HasLongBranch)
+ continue;
+
+ int ShVal = STI.inMicroMipsMode() ? 2 : 4; // Byte offset is divided by 2 in microMIPS mode, by 4 otherwise.
+ int64_t Offset = computeOffset(I->Br) / ShVal;
+
+ if (STI.isTargetNaCl()) {
+ // The offset calculation does not include sandboxing instructions
+ // that will be added later in the MC layer. Since at this point we
+ // don't know the exact amount of code that "sandboxing" will add, we
+ // conservatively estimate that code will not grow more than 100%.
+ Offset *= 2;
+ }
+
+ // Check if offset fits into 16-bit immediate field of branches.
+ if (!ForceLongBranch && isInt<16>(Offset))
+ continue;
+
+ I->HasLongBranch = true;
+ I->Size += LongBranchSeqSize * 4; // Account for the expansion, assuming 4 bytes per instruction.
+ ++LongBranches;
+ EverMadeChange = MadeChange = true;
+ }
+ }
+
+ if (!EverMadeChange)
+ return true;
+
+ // Compute basic block addresses.
+ if (IsPIC) { // NOTE(review): Address is written here but never read in this file; confirm it is still needed.
+ uint64_t Address = 0;
+
+ for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
+ I->Address = Address;
+ }
+
+ // Do the expansion.
+ for (I = MBBInfos.begin(); I != E; ++I)
+ if (I->HasLongBranch)
+ expandToLongBranch(*I);
+
+ MF->RenumberBlocks(); // expandToLongBranch inserted new blocks.
+
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
new file mode 100644
index 000000000000..d5bc4e537c37
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -0,0 +1,281 @@
+//===-- MipsMCInstLower.cpp - Convert Mips MachineInstr to MCInst ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Mips MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsMCInstLower.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAsmPrinter.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter)
+  : Ctx(nullptr), AsmPrinter(asmprinter) {} // Ctx stays null until Initialize() supplies a context.
+
+void MipsMCInstLower::Initialize(MCContext *C) {
+ Ctx = C; // Stored context is dereferenced when MCExprs are created (e.g. in createSub).
+}
+
+MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, // Lower a symbolic operand to an MCExpr operand.
+ MachineOperandType MOTy,
+ unsigned Offset) const {
+ MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; // Never changed below.
+ MipsMCExpr::MipsExprKind TargetKind = MipsMCExpr::MEK_None;
+ bool IsGpOff = false;
+ const MCSymbol *Symbol;
+
+ switch(MO.getTargetFlags()) { // Map the operand's target flag to a MipsMCExpr kind.
+ default:
+ llvm_unreachable("Invalid target flag!");
+ case MipsII::MO_NO_FLAG:
+ break;
+ case MipsII::MO_GPREL:
+ TargetKind = MipsMCExpr::MEK_GPREL;
+ break;
+ case MipsII::MO_GOT_CALL:
+ TargetKind = MipsMCExpr::MEK_GOT_CALL;
+ break;
+ case MipsII::MO_GOT:
+ TargetKind = MipsMCExpr::MEK_GOT;
+ break;
+ case MipsII::MO_ABS_HI:
+ TargetKind = MipsMCExpr::MEK_HI;
+ break;
+ case MipsII::MO_ABS_LO:
+ TargetKind = MipsMCExpr::MEK_LO;
+ break;
+ case MipsII::MO_TLSGD:
+ TargetKind = MipsMCExpr::MEK_TLSGD;
+ break;
+ case MipsII::MO_TLSLDM:
+ TargetKind = MipsMCExpr::MEK_TLSLDM;
+ break;
+ case MipsII::MO_DTPREL_HI:
+ TargetKind = MipsMCExpr::MEK_DTPREL_HI;
+ break;
+ case MipsII::MO_DTPREL_LO:
+ TargetKind = MipsMCExpr::MEK_DTPREL_LO;
+ break;
+ case MipsII::MO_GOTTPREL:
+ TargetKind = MipsMCExpr::MEK_GOTTPREL;
+ break;
+ case MipsII::MO_TPREL_HI:
+ TargetKind = MipsMCExpr::MEK_TPREL_HI;
+ break;
+ case MipsII::MO_TPREL_LO:
+ TargetKind = MipsMCExpr::MEK_TPREL_LO;
+ break;
+ case MipsII::MO_GPOFF_HI:
+ TargetKind = MipsMCExpr::MEK_HI;
+ IsGpOff = true; // Same kind as MO_ABS_HI, but wrapped via createGpOff below.
+ break;
+ case MipsII::MO_GPOFF_LO:
+ TargetKind = MipsMCExpr::MEK_LO;
+ IsGpOff = true; // Same kind as MO_ABS_LO, but wrapped via createGpOff below.
+ break;
+ case MipsII::MO_GOT_DISP:
+ TargetKind = MipsMCExpr::MEK_GOT_DISP;
+ break;
+ case MipsII::MO_GOT_HI16:
+ TargetKind = MipsMCExpr::MEK_GOT_HI16;
+ break;
+ case MipsII::MO_GOT_LO16:
+ TargetKind = MipsMCExpr::MEK_GOT_LO16;
+ break;
+ case MipsII::MO_GOT_PAGE:
+ TargetKind = MipsMCExpr::MEK_GOT_PAGE;
+ break;
+ case MipsII::MO_GOT_OFST:
+ TargetKind = MipsMCExpr::MEK_GOT_OFST;
+ break;
+ case MipsII::MO_HIGHER:
+ TargetKind = MipsMCExpr::MEK_HIGHER;
+ break;
+ case MipsII::MO_HIGHEST:
+ TargetKind = MipsMCExpr::MEK_HIGHEST;
+ break;
+ case MipsII::MO_CALL_HI16:
+ TargetKind = MipsMCExpr::MEK_CALL_HI16;
+ break;
+ case MipsII::MO_CALL_LO16:
+ TargetKind = MipsMCExpr::MEK_CALL_LO16;
+ break;
+ }
+
+ switch (MOTy) { // Pick the MCSymbol for the operand and fold in its own offset where one is read.
+ case MachineOperand::MO_MachineBasicBlock:
+ Symbol = MO.getMBB()->getSymbol();
+ break;
+
+ case MachineOperand::MO_GlobalAddress:
+ Symbol = AsmPrinter.getSymbol(MO.getGlobal());
+ Offset += MO.getOffset();
+ break;
+
+ case MachineOperand::MO_BlockAddress:
+ Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+ Offset += MO.getOffset();
+ break;
+
+ case MachineOperand::MO_ExternalSymbol:
+ Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+ Offset += MO.getOffset();
+ break;
+
+ case MachineOperand::MO_MCSymbol:
+ Symbol = MO.getMCSymbol();
+ Offset += MO.getOffset();
+ break;
+
+ case MachineOperand::MO_JumpTableIndex:
+ Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+ break;
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+ Offset += MO.getOffset();
+ break;
+
+ default:
+ llvm_unreachable("<unknown operand type>");
+ }
+
+ const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Kind, *Ctx);
+
+ if (Offset) {
+ // Assume offset is never negative.
+ assert(Offset > 0); // NOTE(review): Offset is unsigned and non-zero here, so this cannot fire.
+
+ Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, *Ctx),
+ *Ctx);
+ }
+
+ if (IsGpOff)
+ Expr = MipsMCExpr::createGpOff(TargetKind, Expr, *Ctx);
+ else if (TargetKind != MipsMCExpr::MEK_None)
+ Expr = MipsMCExpr::create(TargetKind, Expr, *Ctx);
+
+ return MCOperand::createExpr(Expr);
+}
+
+MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO, // Lower one operand; returns a default MCOperand() for operands that are dropped.
+ unsigned offset) const {
+ MachineOperandType MOTy = MO.getType();
+
+ switch (MOTy) {
+ default: llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit()) break;
+ return MCOperand::createReg(MO.getReg());
+ case MachineOperand::MO_Immediate:
+ return MCOperand::createImm(MO.getImm() + offset); // offset is added to the immediate.
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_MCSymbol:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_BlockAddress:
+ return LowerSymbolOperand(MO, MOTy, offset); // All symbolic operand kinds share one lowering.
+ case MachineOperand::MO_RegisterMask:
+ break;
+ }
+
+ return MCOperand(); // Reached for implicit registers and register masks; Lower() skips it via isValid().
+}
+
+MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
+                                     MachineBasicBlock *BB2,
+                                     MipsMCExpr::MipsExprKind Kind) const {
+  const MCExpr *Minuend = MCSymbolRefExpr::create(BB1->getSymbol(), *Ctx);
+  const MCExpr *Subtrahend = MCSymbolRefExpr::create(BB2->getSymbol(), *Ctx);
+  const MCExpr *Diff = MCBinaryExpr::createSub(Minuend, Subtrahend, *Ctx);
+  // Wrap the block-symbol difference BB1 - BB2 in the requested expression kind.
+  return MCOperand::createExpr(MipsMCExpr::create(Kind, Diff, *Ctx));
+}
+
+void MipsMCInstLower::
+lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
+  // Pseudo operand layout: (register, $tgt block, $baltgt block).
+  MachineBasicBlock *Tgt = MI->getOperand(1).getMBB();
+  MachineBasicBlock *BalTgt = MI->getOperand(2).getMBB();
+
+  OutMI.setOpcode(Mips::LUi);
+  OutMI.addOperand(LowerOperand(MI->getOperand(0))); // register operand
+
+  // Second operand of the real LUi is %hi($tgt-$baltgt).
+  OutMI.addOperand(createSub(Tgt, BalTgt, MipsMCExpr::MEK_HI));
+}
+
+void MipsMCInstLower::lowerLongBranchADDiu(
+    const MachineInstr *MI, MCInst &OutMI, int Opcode,
+    MipsMCExpr::MipsExprKind Kind) const {
+  OutMI.setOpcode(Opcode);
+
+  // Operands 0 and 1 of the pseudo are its two register operands.
+  OutMI.addOperand(LowerOperand(MI->getOperand(0)));
+  OutMI.addOperand(LowerOperand(MI->getOperand(1)));
+
+  // Operands 2 and 3 are the $tgt and $baltgt blocks; the immediate becomes
+  // Kind($tgt-$baltgt), i.e. %lo or %hi as chosen by the caller.
+  MachineBasicBlock *Tgt = MI->getOperand(2).getMBB();
+  MachineBasicBlock *BalTgt = MI->getOperand(3).getMBB();
+  OutMI.addOperand(createSub(Tgt, BalTgt, Kind));
+}
+
+bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
+                                      MCInst &OutMI) const {
+  const unsigned Opc = MI->getOpcode();
+  if (Opc == Mips::LONG_BRANCH_LUi) {
+    lowerLongBranchLUi(MI, OutMI);
+    return true;
+  }
+  if (Opc == Mips::LONG_BRANCH_ADDiu) {
+    lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu, MipsMCExpr::MEK_LO);
+    return true;
+  }
+  if (Opc != Mips::LONG_BRANCH_DADDiu)
+    return false; // Not a long-branch pseudo; OutMI is left untouched.
+
+  // The target flag on operand 2 selects the %hi or the %lo expression.
+  const unsigned Flags = MI->getOperand(2).getTargetFlags();
+  if (Flags != MipsII::MO_ABS_HI && Flags != MipsII::MO_ABS_LO)
+    report_fatal_error("Unexpected flags for LONG_BRANCH_DADDiu");
+  lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu,
+      Flags == MipsII::MO_ABS_HI ? MipsMCExpr::MEK_HI : MipsMCExpr::MEK_LO);
+  return true;
+}
+
+void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  // Long-branch pseudos have their own lowering; nothing more to do for them.
+  if (lowerLongBranch(MI, OutMI))
+    return;
+
+  OutMI.setOpcode(MI->getOpcode());
+  const unsigned NumOps = MI->getNumOperands();
+  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
+    // Operands that lower to an invalid MCOperand are left out of OutMI.
+    const MCOperand Lowered = LowerOperand(MI->getOperand(Idx));
+    if (Lowered.isValid())
+      OutMI.addOperand(Lowered);
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
new file mode 100644
index 000000000000..c25f90005480
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
@@ -0,0 +1,48 @@
+//===-- MipsMCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+ class MCContext;
+ class MCInst;
+ class MCOperand;
+ class MachineInstr;
+ class MachineFunction;
+ class MipsAsmPrinter;
+
+/// MipsMCInstLower - This class is used to lower a MachineInstr into an
+/// MCInst.
+class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
+ typedef MachineOperand::MachineOperandType MachineOperandType;
+ MCContext *Ctx; // Set by Initialize(); dereferenced whenever an MCExpr is created.
+ MipsAsmPrinter &AsmPrinter; // Used to look up symbols for symbolic operands.
+public:
+ MipsMCInstLower(MipsAsmPrinter &asmprinter);
+ void Initialize(MCContext *C); // Supplies the MCContext used by the lowering methods.
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const; // Fills OutMI with the lowered form of MI.
+ MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const; // offset is added to immediates and symbol offsets.
+
+private:
+ MCOperand LowerSymbolOperand(const MachineOperand &MO,
+ MachineOperandType MOTy, unsigned Offset) const;
+ MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2, // Builds Kind(BB1 - BB2).
+ MipsMCExpr::MipsExprKind Kind) const;
+ void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const;
+ void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode,
+ MipsMCExpr::MipsExprKind Kind) const;
+ bool lowerLongBranch(const MachineInstr *MI, MCInst &OutMI) const; // Returns false if MI is not a long-branch pseudo.
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td
new file mode 100644
index 000000000000..7d25ea56e3d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -0,0 +1,455 @@
+//===- MipsMSAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, // Base of the MSA format classes in this file.
+ PredicateControl, ASE_MSA {
+ let EncodingPredicates = [HasStdEnc];
+ let Inst{31-26} = 0b011110; // Default for bits 31-26; MSACBranch and MSASpecial override it.
+}
+
+class MSACBranch : MSAInst {
+ let Inst{31-26} = 0b010001; // Replaces the 0b011110 inherited from MSAInst.
+}
+
+class MSASpecial : MSAInst {
+ let Inst{31-26} = 0b000000; // Replaces the 0b011110 inherited from MSAInst.
+}
+
+class MSAPseudo<dag outs, dag ins, list<dag> pattern, // MipsPseudo with the HasMSA predicate attached.
+ InstrItinClass itin = IIPseudo>:
+ MipsPseudo<outs, ins, pattern, itin> {
+ let Predicates = [HasMSA];
+}
+
+class MSA_BIT_B_FMT<bits<3> major, bits<6> minor>: MSAInst { // BIT format with a 3-bit immediate m.
+ bits<5> ws;
+ bits<5> wd;
+ bits<3> m;
+
+ let Inst{25-23} = major;
+ let Inst{22-19} = 0b1110; // Fixed 4-bit prefix; with m it fills bits 22-16.
+ let Inst{18-16} = m;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_BIT_H_FMT<bits<3> major, bits<6> minor>: MSAInst { // BIT format with a 4-bit immediate m.
+ bits<5> ws;
+ bits<5> wd;
+ bits<4> m;
+
+ let Inst{25-23} = major;
+ let Inst{22-20} = 0b110; // Fixed 3-bit prefix; with m it fills bits 22-16.
+ let Inst{19-16} = m;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_BIT_W_FMT<bits<3> major, bits<6> minor>: MSAInst { // BIT format with a 5-bit immediate m.
+ bits<5> ws;
+ bits<5> wd;
+ bits<5> m;
+
+ let Inst{25-23} = major;
+ let Inst{22-21} = 0b10; // Fixed 2-bit prefix; with m it fills bits 22-16.
+ let Inst{20-16} = m;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_BIT_D_FMT<bits<3> major, bits<6> minor>: MSAInst { // BIT format with a 6-bit immediate m.
+ bits<5> ws;
+ bits<5> wd;
+ bits<6> m;
+
+ let Inst{25-23} = major;
+ let Inst{22} = 0b0; // Fixed 1-bit prefix; with m it fills bits 22-16.
+ let Inst{21-16} = m;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst { // Two-operand format: rs in bits 15-11, wd in bits 10-6.
+ bits<5> rs;
+ bits<5> wd;
+
+ let Inst{25-18} = major;
+ let Inst{17-16} = df; // df is a class parameter, not an instruction operand.
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst { // Field layout is identical to MSA_2R_FILL_FMT.
+ bits<5> rs;
+ bits<5> wd;
+
+ let Inst{25-18} = major;
+ let Inst{17-16} = df; // df is a class parameter, not an instruction operand.
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_2R_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst { // Two-operand format: ws in bits 15-11, wd in bits 10-6.
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-18} = major;
+ let Inst{17-16} = df; // 2-bit df class parameter.
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_2RF_FMT<bits<9> major, bits<1> df, bits<6> minor>: MSAInst { // Like MSA_2R_FMT, but with a 9-bit major and a 1-bit df.
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-17} = major;
+ let Inst{16} = df; // 1-bit df class parameter.
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_3R_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst { // Three-operand format: wt, ws and wd.
+ bits<5> wt;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-23} = major;
+ let Inst{22-21} = df; // 2-bit df class parameter.
+ let Inst{20-16} = wt;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_3RF_FMT<bits<4> major, bits<1> df, bits<6> minor>: MSAInst { // Like MSA_3R_FMT, but with a 4-bit major and a 1-bit df.
+ bits<5> wt;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21} = df; // 1-bit df class parameter.
+ let Inst{20-16} = wt;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_3R_INDEX_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst { // Same bit layout as MSA_3R_FMT, with rt in place of wt.
+ bits<5> rt;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-23} = major;
+ let Inst{22-21} = df;
+ let Inst{20-16} = rt; // Occupies the bits that hold wt in MSA_3R_FMT.
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_FMT<bits<10> major, bits<6> minor>: MSAInst { // ELM format without an index: a 10-bit major fills bits 25-16.
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-16} = major;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CFCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst { // Same layout as MSA_ELM_FMT, with cs and rd as the two operands.
+ bits<5> rd;
+ bits<5> cs;
+
+ let Inst{25-16} = major;
+ let Inst{15-11} = cs; // cs takes bits 15-11.
+ let Inst{10-6} = rd; // rd takes bits 10-6.
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CTCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst { // Same layout as MSA_ELM_FMT, with rs and cd as the two operands.
+ bits<5> rs;
+ bits<5> cd;
+
+ let Inst{25-16} = major;
+ let Inst{15-11} = rs; // rs takes bits 15-11.
+ let Inst{10-6} = cd; // cd takes bits 10-6.
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_B_FMT<bits<4> major, bits<6> minor>: MSAInst { // ELM format encoding all 4 bits of index n.
+ bits<4> n;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21-20} = 0b00; // Fixed prefix; with n it fills bits 21-16.
+ let Inst{19-16} = n{3-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_H_FMT<bits<4> major, bits<6> minor>: MSAInst { // ELM layout, _H variant: 3-bit element index
+ bits<4> n; // declared 4 bits wide, but only n{2-0} is encoded below
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-19} = 0b100; // fixed prefix for this variant
+ let Inst{18-16} = n{2-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_W_FMT<bits<4> major, bits<6> minor>: MSAInst { // ELM layout, _W variant: 2-bit element index
+ bits<4> n; // declared 4 bits wide, but only n{1-0} is encoded below
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-18} = 0b1100; // fixed prefix for this variant
+ let Inst{17-16} = n{1-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_D_FMT<bits<4> major, bits<6> minor>: MSAInst { // ELM layout, _D variant: 1-bit element index
+ bits<4> n; // declared 4 bits wide, but only n{0} is encoded below
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-17} = 0b11100; // fixed prefix for this variant
+ let Inst{16} = n{0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_COPY_B_FMT<bits<4> major, bits<6> minor>: MSAInst { // same bit positions as MSA_ELM_B_FMT; Inst{10-6} is named rd
+ bits<4> n; // element index; all four bits are encoded
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> rd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-20} = 0b00; // fixed prefix for the _B variant
+ let Inst{19-16} = n{3-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = rd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_COPY_H_FMT<bits<4> major, bits<6> minor>: MSAInst { // same bit positions as MSA_ELM_H_FMT; Inst{10-6} is named rd
+ bits<4> n; // declared 4 bits wide, but only n{2-0} is encoded below
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> rd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-19} = 0b100; // fixed prefix for the _H variant
+ let Inst{18-16} = n{2-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = rd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst { // same bit positions as MSA_ELM_W_FMT; Inst{10-6} is named rd
+ bits<4> n; // declared 4 bits wide, but only n{1-0} is encoded below
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> rd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-18} = 0b1100; // fixed prefix for the _W variant
+ let Inst{17-16} = n{1-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = rd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSAInst { // same bit positions as MSA_ELM_D_FMT; Inst{10-6} is named rd
+ bits<4> n; // declared 4 bits wide, but only n{0} is encoded below
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> rd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-17} = 0b11100; // fixed prefix for the _D variant
+ let Inst{16} = n{0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = rd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_INSERT_B_FMT<bits<4> major, bits<6> minor>: MSAInst { // same bit positions as MSA_ELM_B_FMT; Inst{15-11} is named rs
+ bits<6> n; // declared 6 bits wide (the non-INSERT ELM layouts use 4); only n{3-0} is encoded below
+ bits<5> rs; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-20} = 0b00; // fixed prefix for the _B variant
+ let Inst{19-16} = n{3-0};
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_INSERT_H_FMT<bits<4> major, bits<6> minor>: MSAInst { // same bit positions as MSA_ELM_H_FMT; Inst{15-11} is named rs
+ bits<6> n; // declared 6 bits wide; only n{2-0} is encoded below
+ bits<5> rs; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-19} = 0b100; // fixed prefix for the _H variant
+ let Inst{18-16} = n{2-0};
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst { // same bit positions as MSA_ELM_W_FMT; Inst{15-11} is named rs
+ bits<6> n; // declared 6 bits wide; only n{1-0} is encoded below
+ bits<5> rs; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-18} = 0b1100; // fixed prefix for the _W variant
+ let Inst{17-16} = n{1-0};
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSAInst { // same bit positions as MSA_ELM_D_FMT; Inst{15-11} is named rs
+ bits<6> n; // declared 6 bits wide; only n{0} is encoded below
+ bits<5> rs; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-22} = major; // 4-bit opcode field
+ let Inst{21-17} = 0b11100; // fixed prefix for the _D variant
+ let Inst{16} = n{0};
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_I5_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst { // same bit positions as MSA_3R_FMT with a 5-bit immediate in Inst{20-16}
+ bits<5> imm; // encoded in Inst{20-16}
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-23} = major; // 3-bit opcode field
+ let Inst{22-21} = df; // 2-bit df field
+ let Inst{20-16} = imm;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_I8_FMT<bits<2> major, bits<6> minor>: MSAInst { // 8-bit immediate layout; takes no df parameter
+ bits<8> u8; // encoded in Inst{23-16}
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-24} = major; // 2-bit opcode field
+ let Inst{23-16} = u8;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_I10_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst { // 10-bit immediate layout; there is no ws field
+ bits<10> s10; // encoded in Inst{20-11}, the bits that hold wt/ws in MSA_3R_FMT
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-23} = major; // 3-bit opcode field
+ let Inst{22-21} = df; // 2-bit df field
+ let Inst{20-11} = s10;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_MI10_FMT<bits<2> df, bits<4> minor>: MSAInst { // memory layout: df sits in the low two bits and minor is only 4 bits
+ bits<21> addr; // combined address operand; only addr{20-16} and addr{9-0} are encoded, addr{15-10} is unused here
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-16} = addr{9-0}; // presumably the 10-bit offset - confirm against the operand encoder
+ let Inst{15-11} = addr{20-16}; // presumably the base register number - confirm against the operand encoder
+ let Inst{10-6} = wd;
+ let Inst{5-2} = minor; // 4-bit minor opcode
+ let Inst{1-0} = df; // 2-bit df field
+}
+
+class MSA_VEC_FMT<bits<5> major, bits<6> minor>: MSAInst { // three-operand layout with no df field; major fills Inst{25-21}
+ bits<5> wt; // encoded in Inst{20-16}
+ bits<5> ws; // encoded in Inst{15-11}
+ bits<5> wd; // encoded in Inst{10-6}
+
+ let Inst{25-21} = major; // 5-bit opcode field
+ let Inst{20-16} = wt;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class MSA_CBRANCH_FMT<bits<3> major, bits<2> df>: MSACBranch { // branch layout; derives from MSACBranch rather than MSAInst and has no minor field
+ bits<16> offset; // encoded in Inst{15-0}
+ bits<5> wt; // encoded in Inst{20-16}
+
+ let Inst{25-23} = major; // 3-bit opcode field
+ let Inst{22-21} = df; // 2-bit df field
+ let Inst{20-16} = wt;
+ let Inst{15-0} = offset;
+}
+
+class MSA_CBRANCH_V_FMT<bits<5> major>: MSACBranch { // branch layout without df: major fills Inst{25-21}
+ bits<16> offset; // encoded in Inst{15-0}
+ bits<5> wt; // encoded in Inst{20-16}
+
+ let Inst{25-21} = major; // 5-bit opcode field
+ let Inst{20-16} = wt;
+ let Inst{15-0} = offset;
+}
+
+class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial { // derives from MSASpecial; three 5-bit fields plus a 2-bit sa
+ bits<5> rs; // encoded in Inst{25-21}
+ bits<5> rt; // encoded in Inst{20-16}
+ bits<5> rd; // encoded in Inst{15-11}
+ bits<2> sa; // encoded in Inst{7-6}
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-8} = 0b000; // fixed zero bits above the 2-bit sa field
+ let Inst{7-6} = sa;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
+
+class SPECIAL_DLSA_FMT<bits<6> minor>: MSASpecial { // field layout is identical to SPECIAL_LSA_FMT
+ bits<5> rs; // encoded in Inst{25-21}
+ bits<5> rt; // encoded in Inst{20-16}
+ bits<5> rd; // encoded in Inst{15-11}
+ bits<2> sa; // encoded in Inst{7-6}
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-8} = 0b000; // fixed zero bits above the 2-bit sa field
+ let Inst{7-6} = sa;
+ let Inst{5-0} = minor; // 6-bit minor opcode
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
new file mode 100644
index 000000000000..8b04fcb76920
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -0,0 +1,3946 @@
+//===- MipsMSAInstrInfo.td - MSA ASE instructions -*- tablegen ------------*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips MSA ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>; // 1 result, 1 operand: index 0 is integer, index 1 is a vector
+def SDT_VSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>, // integer compare: index 0 and 1 are integer, 2 matches 1, 3 is OtherVT
+ SDTCisInt<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, OtherVT>]>;
+def SDT_VFSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>, // FP compare: as SDT_VSetCC except index 1 is floating point
+ SDTCisFP<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, OtherVT>]>;
+def SDT_VSHF : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisVec<0>, // index 0 and index 1 are each constrained to integer vectors
+ SDTCisInt<1>, SDTCisVec<1>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>; // indexes 2 and 3 take the type of index 0
+def SDT_SHF : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>, // integer-vector index 0; index 1 is i32; index 2 matches index 0
+ SDTCisVT<1, i32>, SDTCisSameAs<0, 2>]>;
+def SDT_ILV : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>, // integer-vector index 0; indexes 1 and 2 share its type
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+def SDT_INSVE : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0, 1>, // indexes 1 and 3 share the vector type of index 0
+ SDTCisVT<2, i32>, SDTCisSameAs<0, 3>, // indexes 2 and 4 are i32
+ SDTCisVT<4, i32>]>;
+
+def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>; // each def here binds a MipsISD opcode name to a type profile
+def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
+def MipsVAllZero : SDNode<"MipsISD::VALL_ZERO", SDT_MipsVecCond>;
+def MipsVAnyZero : SDNode<"MipsISD::VANY_ZERO", SDT_MipsVecCond>;
+def MipsVSMax : SDNode<"MipsISD::VSMAX", SDTIntBinOp, // the min/max and NOR nodes are all marked commutative and associative
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVSMin : SDNode<"MipsISD::VSMIN", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMax : SDNode<"MipsISD::VUMAX", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>;
+def MipsSHF : SDNode<"MipsISD::SHF", SDT_SHF>;
+def MipsILVEV : SDNode<"MipsISD::ILVEV", SDT_ILV>; // the ILV* and PCK* nodes all share the SDT_ILV profile
+def MipsILVOD : SDNode<"MipsISD::ILVOD", SDT_ILV>;
+def MipsILVL : SDNode<"MipsISD::ILVL", SDT_ILV>;
+def MipsILVR : SDNode<"MipsISD::ILVR", SDT_ILV>;
+def MipsPCKEV : SDNode<"MipsISD::PCKEV", SDT_ILV>;
+def MipsPCKOD : SDNode<"MipsISD::PCKOD", SDT_ILV>;
+def MipsINSVE : SDNode<"MipsISD::INSVE", SDT_INSVE>;
+
+def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>; // both defs name the same ISD::SETCC opcode; this one uses the integer-operand profile
+def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>; // same opcode with the floating-point-operand profile
+
+def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT", // 1 result, 3 operands, no node properties
+ SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>; // the only constraint: index 2 has pointer type
+def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT", // same profile as MipsVExtractSExt with a different opcode name
+ SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+
+def immZExt1Ptr : ImmLeaf<iPTR, [{return isUInt<1>(Imm);}]>; // iPTR-typed immediates accepted when isUInt<N>(Imm) holds, N = 1..4
+def immZExt2Ptr : ImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>;
+def immZExt3Ptr : ImmLeaf<iPTR, [{return isUInt<3>(Imm);}]>;
+def immZExt4Ptr : ImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>;
+
+// Operands
+
+def immZExt2Lsa : ImmLeaf<i32, [{return isUInt<2>(Imm - 1);}]>; // tests Imm - 1, not Imm: presumably accepts 1..4 given LLVM's isUInt - confirm
+
+// Pattern fragments
+def vextract_sext_i8 : PatFrag<(ops node:$vec, node:$idx), // two-operand wrappers over MipsVExtractSExt with a fixed value-type third operand
+ (MipsVExtractSExt node:$vec, node:$idx, i8)>;
+def vextract_sext_i16 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractSExt node:$vec, node:$idx, i16)>;
+def vextract_sext_i32 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractSExt node:$vec, node:$idx, i32)>;
+def vextract_sext_i64 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractSExt node:$vec, node:$idx, i64)>;
+
+def vextract_zext_i8 : PatFrag<(ops node:$vec, node:$idx), // two-operand wrappers over MipsVExtractZExt with a fixed value-type third operand
+ (MipsVExtractZExt node:$vec, node:$idx, i8)>;
+def vextract_zext_i16 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractZExt node:$vec, node:$idx, i16)>;
+def vextract_zext_i32 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractZExt node:$vec, node:$idx, i32)>;
+def vextract_zext_i64 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractZExt node:$vec, node:$idx, i64)>;
+
+def vinsert_v16i8 : PatFrag<(ops node:$vec, node:$val, node:$idx), // vector_insert with the result type pinned per fragment
+ (v16i8 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v8i16 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+ (v8i16 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v4i32 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+ (v4i32 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v2i64 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+ (v2i64 (vector_insert node:$vec, node:$val, node:$idx))>;
+
+def insve_v16i8 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2), // MipsINSVE with the result type pinned per fragment
+ (v16i8 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v8i16 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+ (v8i16 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v4i32 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+ (v4i32 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v2i64 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+ (v2i64 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+
+class vfsetcc_type<ValueType ResTy, ValueType OpTy, CondCode CC> : // helper: a vfsetcc fragment fixed to one result type, operand type and condition code
+ PatFrag<(ops node:$lhs, node:$rhs),
+ (ResTy (vfsetcc (OpTy node:$lhs), (OpTy node:$rhs), CC))>; // both operands are typed OpTy; CC is baked in as the third operand
+
+// ISD::SETFALSE cannot occur
+def vfsetoeq_v4f32 : vfsetcc_type<v4i32, v4f32, SETOEQ>; // every condition code below comes as a v4f32->v4i32 and a v2f64->v2i64 pair
+def vfsetoeq_v2f64 : vfsetcc_type<v2i64, v2f64, SETOEQ>;
+def vfsetoge_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGE>;
+def vfsetoge_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGE>;
+def vfsetogt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGT>;
+def vfsetogt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGT>;
+def vfsetole_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLE>;
+def vfsetole_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLE>;
+def vfsetolt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLT>;
+def vfsetolt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLT>;
+def vfsetone_v4f32 : vfsetcc_type<v4i32, v4f32, SETONE>;
+def vfsetone_v2f64 : vfsetcc_type<v2i64, v2f64, SETONE>;
+def vfsetord_v4f32 : vfsetcc_type<v4i32, v4f32, SETO>; // note: the "ord" fragments use condition code SETO
+def vfsetord_v2f64 : vfsetcc_type<v2i64, v2f64, SETO>;
+def vfsetun_v4f32 : vfsetcc_type<v4i32, v4f32, SETUO>; // note: the "un" fragments use condition code SETUO
+def vfsetun_v2f64 : vfsetcc_type<v2i64, v2f64, SETUO>;
+def vfsetueq_v4f32 : vfsetcc_type<v4i32, v4f32, SETUEQ>;
+def vfsetueq_v2f64 : vfsetcc_type<v2i64, v2f64, SETUEQ>;
+def vfsetuge_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGE>;
+def vfsetuge_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGE>;
+def vfsetugt_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGT>;
+def vfsetugt_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGT>;
+def vfsetule_v4f32 : vfsetcc_type<v4i32, v4f32, SETULE>;
+def vfsetule_v2f64 : vfsetcc_type<v2i64, v2f64, SETULE>;
+def vfsetult_v4f32 : vfsetcc_type<v4i32, v4f32, SETULT>;
+def vfsetult_v2f64 : vfsetcc_type<v2i64, v2f64, SETULT>;
+def vfsetune_v4f32 : vfsetcc_type<v4i32, v4f32, SETUNE>;
+def vfsetune_v2f64 : vfsetcc_type<v2i64, v2f64, SETUNE>;
+// ISD::SETTRUE cannot occur
+// ISD::SETFALSE2 cannot occur
+// ISD::SETTRUE2 cannot occur
+
+class vsetcc_type<ValueType ResTy, CondCode CC> : // helper: a vsetcc fragment fixed to one result type and condition code
+ PatFrag<(ops node:$lhs, node:$rhs),
+ (ResTy (vsetcc node:$lhs, node:$rhs, CC))>; // unlike vfsetcc_type, the operands carry no explicit type here
+
+def vseteq_v16i8 : vsetcc_type<v16i8, SETEQ>; // five condition codes (EQ, LE, LT, ULE, ULT), each instantiated for four result types
+def vseteq_v8i16 : vsetcc_type<v8i16, SETEQ>;
+def vseteq_v4i32 : vsetcc_type<v4i32, SETEQ>;
+def vseteq_v2i64 : vsetcc_type<v2i64, SETEQ>;
+def vsetle_v16i8 : vsetcc_type<v16i8, SETLE>;
+def vsetle_v8i16 : vsetcc_type<v8i16, SETLE>;
+def vsetle_v4i32 : vsetcc_type<v4i32, SETLE>;
+def vsetle_v2i64 : vsetcc_type<v2i64, SETLE>;
+def vsetlt_v16i8 : vsetcc_type<v16i8, SETLT>;
+def vsetlt_v8i16 : vsetcc_type<v8i16, SETLT>;
+def vsetlt_v4i32 : vsetcc_type<v4i32, SETLT>;
+def vsetlt_v2i64 : vsetcc_type<v2i64, SETLT>;
+def vsetule_v16i8 : vsetcc_type<v16i8, SETULE>;
+def vsetule_v8i16 : vsetcc_type<v8i16, SETULE>;
+def vsetule_v4i32 : vsetcc_type<v4i32, SETULE>;
+def vsetule_v2i64 : vsetcc_type<v2i64, SETULE>;
+def vsetult_v16i8 : vsetcc_type<v16i8, SETULT>;
+def vsetult_v8i16 : vsetcc_type<v8i16, SETULT>;
+def vsetult_v4i32 : vsetcc_type<v4i32, SETULT>;
+def vsetult_v2i64 : vsetcc_type<v2i64, SETULT>;
+
+def vsplati8 : PatFrag<(ops node:$e0), // build_vector whose 16 operands are all the same node $e0
+ (v16i8 (build_vector node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0))>;
+def vsplati16 : PatFrag<(ops node:$e0), // 8 identical operands
+ (v8i16 (build_vector node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0))>;
+def vsplati32 : PatFrag<(ops node:$e0), // 4 identical operands
+ (v4i32 (build_vector node:$e0, node:$e0,
+ node:$e0, node:$e0))>;
+def vsplati64 : PatFrag<(ops node:$e0), // 2 identical operands
+ (v2i64 (build_vector node:$e0, node:$e0))>;
+def vsplatf32 : PatFrag<(ops node:$e0), // floating-point counterpart of vsplati32
+ (v4f32 (build_vector node:$e0, node:$e0,
+ node:$e0, node:$e0))>;
+def vsplatf64 : PatFrag<(ops node:$e0), // floating-point counterpart of vsplati64
+ (v2f64 (build_vector node:$e0, node:$e0))>;
+
+def vsplati8_elt : PatFrag<(ops node:$v, node:$i), // MipsVSHF whose first operand is a splat of $i and whose other two operands are both $v
+ (MipsVSHF (vsplati8 node:$i), node:$v, node:$v)>;
+def vsplati16_elt : PatFrag<(ops node:$v, node:$i),
+ (MipsVSHF (vsplati16 node:$i), node:$v, node:$v)>;
+def vsplati32_elt : PatFrag<(ops node:$v, node:$i),
+ (MipsVSHF (vsplati32 node:$i), node:$v, node:$v)>;
+def vsplati64_elt : PatFrag<(ops node:$v, node:$i),
+ (MipsVSHF (vsplati64 node:$i), node:$v, node:$v)>;
+
+class SplatPatLeaf<Operand opclass, dag frag, code pred = [{}], // PatLeaf extended with an OpClass field; pred defaults to an empty predicate
+ SDNodeXForm xform = NOOP_SDNodeXForm>
+ : PatLeaf<frag, pred, xform> {
+ Operand OpClass = opclass; // records the Operand passed as opclass
+}
+
+class SplatComplexPattern<Operand opclass, ValueType ty, int numops, string fn, // ComplexPattern extended with an OpClass field
+ list<SDNode> roots = [],
+ list<SDNodeProperty> props = []> :
+ ComplexPattern<ty, numops, fn, roots, props> { // the remaining arguments are forwarded unchanged
+ Operand OpClass = opclass; // records the Operand passed as opclass
+}
+
+def vsplati8_uimm3 : SplatComplexPattern<vsplat_uimm3, v16i8, 1, // v16i8 splat patterns; each names a select* function and roots at build_vector/bitconvert
+ "selectVSplatUimm3",
+ [build_vector, bitconvert]>;
+
+def vsplati8_uimm4 : SplatComplexPattern<vsplat_uimm4, v16i8, 1,
+ "selectVSplatUimm4",
+ [build_vector, bitconvert]>;
+
+def vsplati8_uimm5 : SplatComplexPattern<vsplat_uimm5, v16i8, 1,
+ "selectVSplatUimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati8_uimm8 : SplatComplexPattern<vsplat_uimm8, v16i8, 1, // uimm8 appears only for the v16i8 element type in this list
+ "selectVSplatUimm8",
+ [build_vector, bitconvert]>;
+
+def vsplati8_simm5 : SplatComplexPattern<vsplat_simm5, v16i8, 1,
+ "selectVSplatSimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati16_uimm3 : SplatComplexPattern<vsplat_uimm3, v8i16, 1, // v8i16 splat patterns; same selector names as the v16i8 set
+ "selectVSplatUimm3",
+ [build_vector, bitconvert]>;
+
+def vsplati16_uimm4 : SplatComplexPattern<vsplat_uimm4, v8i16, 1,
+ "selectVSplatUimm4",
+ [build_vector, bitconvert]>;
+
+def vsplati16_uimm5 : SplatComplexPattern<vsplat_uimm5, v8i16, 1,
+ "selectVSplatUimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati16_simm5 : SplatComplexPattern<vsplat_simm5, v8i16, 1,
+ "selectVSplatSimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati32_uimm2 : SplatComplexPattern<vsplat_uimm2, v4i32, 1, // v4i32 splat patterns; uimm2 appears only for this element type
+ "selectVSplatUimm2",
+ [build_vector, bitconvert]>;
+
+def vsplati32_uimm5 : SplatComplexPattern<vsplat_uimm5, v4i32, 1,
+ "selectVSplatUimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati32_simm5 : SplatComplexPattern<vsplat_simm5, v4i32, 1,
+ "selectVSplatSimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati64_uimm1 : SplatComplexPattern<vsplat_uimm1, v2i64, 1, // v2i64 splat patterns; uimm1 and uimm6 appear only for this element type
+ "selectVSplatUimm1",
+ [build_vector, bitconvert]>;
+
+def vsplati64_uimm5 : SplatComplexPattern<vsplat_uimm5, v2i64, 1,
+ "selectVSplatUimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati64_uimm6 : SplatComplexPattern<vsplat_uimm6, v2i64, 1,
+ "selectVSplatUimm6",
+ [build_vector, bitconvert]>;
+
+def vsplati64_simm5 : SplatComplexPattern<vsplat_simm5, v2i64, 1,
+ "selectVSplatSimm5",
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is an exact
+// power of 2
+def vsplat_uimm_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmPow2", // plain ComplexPattern (no OpClass) over any vector type
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is the bitwise
+// inverse of an exact power of 2
+def vsplat_uimm_inv_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmInvPow2", // plain ComplexPattern (no OpClass) over any vector type
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of left-most bits set.
+def vsplat_maskl_bits_uimm3 // all four defs call the same "selectVSplatMaskL"; they differ only in the operand class
+ : SplatComplexPattern<vsplat_uimm3, vAny, 1, "selectVSplatMaskL",
+ [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm4
+ : SplatComplexPattern<vsplat_uimm4, vAny, 1, "selectVSplatMaskL",
+ [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm5
+ : SplatComplexPattern<vsplat_uimm5, vAny, 1, "selectVSplatMaskL",
+ [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm6
+ : SplatComplexPattern<vsplat_uimm6, vAny, 1, "selectVSplatMaskL",
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of right-most bits set.
+def vsplat_maskr_bits_uimm3 // all four defs call the same "selectVSplatMaskR"; they differ only in the operand class
+ : SplatComplexPattern<vsplat_uimm3, vAny, 1, "selectVSplatMaskR",
+ [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm4
+ : SplatComplexPattern<vsplat_uimm4, vAny, 1, "selectVSplatMaskR",
+ [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm5
+ : SplatComplexPattern<vsplat_uimm5, vAny, 1, "selectVSplatMaskR",
+ [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm6
+ : SplatComplexPattern<vsplat_uimm6, vAny, 1, "selectVSplatMaskR",
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that equals 1
+// FIXME: These should be a ComplexPattern but we can't use them because the
+// ISel generator requires the uses to have a name, but providing a name
+// causes other errors ("used in pattern but not operand list")
+def vsplat_imm_eq_1 : PatLeaf<(build_vector), [{
+ APInt Imm; // filled in by selectVSplat below
+ EVT EltTy = N->getValueType(0).getVectorElementType(); // element type of the matched build_vector
+
+ return selectVSplat(N, Imm, EltTy.getSizeInBits()) && // N must be accepted by selectVSplat ...
+ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1; // ... with a splat value as wide as one element and equal to 1
+}]>;
+
+def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{ // matches a bitconvert of a v4i32 build_vector
+ APInt Imm; // filled in by selectVSplat below
+ SDNode *BV = N->getOperand(0).getNode(); // the build_vector underneath the bitconvert
+ EVT EltTy = N->getValueType(0).getVectorElementType(); // element type of the bitconvert result, not of the v4i32 source
+
+ return selectVSplat(BV, Imm, EltTy.getSizeInBits()) && // splat check runs on BV using the result's element width
+ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vbclr_b : PatFrag<(ops node:$ws, node:$wt), // ws & ~(1 << wt), written as and/xor-with-all-ones; the _b/_h/_w bodies are identical
+ (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+ immAllOnesV))>;
+def vbclr_h : PatFrag<(ops node:$ws, node:$wt),
+ (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+ immAllOnesV))>;
+def vbclr_w : PatFrag<(ops node:$ws, node:$wt),
+ (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+ immAllOnesV))>;
+def vbclr_d : PatFrag<(ops node:$ws, node:$wt), // _d differs: uses the v2i64 splat leaf and a bitconverted v4i32 all-ones vector
+ (and node:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1),
+ node:$wt),
+ (bitconvert (v4i32 immAllOnesV))))>;
+
+def vbneg_b : PatFrag<(ops node:$ws, node:$wt), // ws ^ (1 << wt); the _b/_h/_w bodies are identical
+ (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_h : PatFrag<(ops node:$ws, node:$wt),
+ (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_w : PatFrag<(ops node:$ws, node:$wt),
+ (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_d : PatFrag<(ops node:$ws, node:$wt), // _d differs: uses the v2i64 splat leaf
+ (xor node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+ node:$wt))>;
+
+def vbset_b : PatFrag<(ops node:$ws, node:$wt), // ws | (1 << wt); the _b/_h/_w bodies are identical
+ (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_h : PatFrag<(ops node:$ws, node:$wt),
+ (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_w : PatFrag<(ops node:$ws, node:$wt),
+ (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_d : PatFrag<(ops node:$ws, node:$wt), // _d differs: uses the v2i64 splat leaf
+ (or node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+ node:$wt))>;
+
+def fms : PatFrag<(ops node:$wd, node:$ws, node:$wt), // wd - (ws * wt) using fsub/fmul
+ (fsub node:$wd, (fmul node:$ws, node:$wt))>;
+
+def muladd : PatFrag<(ops node:$wd, node:$ws, node:$wt), // wd + (ws * wt) using add/mul
+ (add node:$wd, (mul node:$ws, node:$wt))>;
+
+def mulsub : PatFrag<(ops node:$wd, node:$ws, node:$wt), // wd - (ws * wt) using sub/mul
+ (sub node:$wd, (mul node:$ws, node:$wt))>;
+
+def mul_fexp2 : PatFrag<(ops node:$ws, node:$wt), // ws * fexp2(wt)
+ (fmul node:$ws, (fexp2 node:$wt))>;
+
+// Instruction encoding.
+class ADD_A_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010000>; // arguments are <major, df, minor>; df 00/01/10/11 pairs with the _B/_H/_W/_D suffix
+class ADD_A_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010000>;
+class ADD_A_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010000>;
+class ADD_A_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010000>;
+
+class ADDS_A_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010000>; // the ADD_A/ADDS_* families share minor 0b010000 and differ in major
+class ADDS_A_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010000>;
+class ADDS_A_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010000>;
+class ADDS_A_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010000>;
+
+class ADDS_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010000>;
+class ADDS_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010000>;
+class ADDS_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010000>;
+class ADDS_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010000>;
+
+class ADDS_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010000>;
+class ADDS_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010000>;
+class ADDS_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010000>;
+class ADDS_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010000>;
+
+class ADDV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001110>; // register form, minor 0b001110
+class ADDV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001110>;
+class ADDV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001110>;
+class ADDV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001110>;
+
+class ADDVI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000110>; // 5-bit-immediate form, minor 0b000110
+class ADDVI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000110>;
+class ADDVI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000110>;
+class ADDVI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000110>;
+
+class AND_V_ENC : MSA_VEC_FMT<0b00000, 0b011110>; // VEC layout: <major, minor>, no df
+
+class ANDI_B_ENC : MSA_I8_FMT<0b00, 0b000000>; // I8 layout: <major, minor>, no df
+
+class ASUB_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010001>; // _S uses major 0b100, _U uses 0b101; both use minor 0b010001
+class ASUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010001>;
+class ASUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010001>;
+class ASUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010001>;
+
+class ASUB_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010001>;
+class ASUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010001>;
+class ASUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010001>;
+class ASUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010001>;
+
+class AVE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010000>; // AVE_*/AVER_* take majors 0b100..0b111 under minor 0b010000
+class AVE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010000>;
+class AVE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010000>;
+class AVE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010000>;
+
+class AVE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010000>;
+class AVE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010000>;
+class AVE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010000>;
+class AVE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010000>;
+
+class AVER_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010000>;
+class AVER_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010000>;
+class AVER_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010000>;
+class AVER_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010000>;
+
+class AVER_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010000>;
+class AVER_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010000>;
+class AVER_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010000>;
+class AVER_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010000>;
+
+class BCLR_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001101>; // register form, minor 0b001101
+class BCLR_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001101>;
+class BCLR_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001101>;
+class BCLR_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001101>;
+
+class BCLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001001>; // immediate form: same major, minor 0b001001; MSA_BIT_*_FMT are defined outside this excerpt
+class BCLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001001>;
+class BCLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001001>;
+class BCLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001001>;
+
+class BINSL_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001101>; // BINSL uses major 0b110, BINSR 0b111; register forms use minor 0b001101
+class BINSL_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001101>;
+class BINSL_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001101>;
+class BINSL_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001101>;
+
+class BINSLI_B_ENC : MSA_BIT_B_FMT<0b110, 0b001001>; // immediate forms keep the major and use minor 0b001001
+class BINSLI_H_ENC : MSA_BIT_H_FMT<0b110, 0b001001>;
+class BINSLI_W_ENC : MSA_BIT_W_FMT<0b110, 0b001001>;
+class BINSLI_D_ENC : MSA_BIT_D_FMT<0b110, 0b001001>;
+
+class BINSR_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001101>;
+class BINSR_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001101>;
+class BINSR_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001101>;
+class BINSR_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001101>;
+
+class BINSRI_B_ENC : MSA_BIT_B_FMT<0b111, 0b001001>;
+class BINSRI_H_ENC : MSA_BIT_H_FMT<0b111, 0b001001>;
+class BINSRI_W_ENC : MSA_BIT_W_FMT<0b111, 0b001001>;
+class BINSRI_D_ENC : MSA_BIT_D_FMT<0b111, 0b001001>;
+
+class BMNZ_V_ENC : MSA_VEC_FMT<0b00100, 0b011110>; // the _V forms use the VEC layout with minor 0b011110
+
+class BMNZI_B_ENC : MSA_I8_FMT<0b00, 0b000001>; // the immediate forms use the I8 layout with minor 0b000001
+
+class BMZ_V_ENC : MSA_VEC_FMT<0b00101, 0b011110>;
+
+class BMZI_B_ENC : MSA_I8_FMT<0b01, 0b000001>;
+
+class BNEG_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001101>; // register form, minor 0b001101
+class BNEG_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001101>;
+class BNEG_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001101>;
+class BNEG_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001101>;
+
+class BNEGI_B_ENC : MSA_BIT_B_FMT<0b101, 0b001001>; // immediate form: same major, minor 0b001001
+class BNEGI_H_ENC : MSA_BIT_H_FMT<0b101, 0b001001>;
+class BNEGI_W_ENC : MSA_BIT_W_FMT<0b101, 0b001001>;
+class BNEGI_D_ENC : MSA_BIT_D_FMT<0b101, 0b001001>;
+
+class BNZ_B_ENC : MSA_CBRANCH_FMT<0b111, 0b00>; // branch layout: arguments are <major, df>
+class BNZ_H_ENC : MSA_CBRANCH_FMT<0b111, 0b01>;
+class BNZ_W_ENC : MSA_CBRANCH_FMT<0b111, 0b10>;
+class BNZ_D_ENC : MSA_CBRANCH_FMT<0b111, 0b11>;
+
+class BNZ_V_ENC : MSA_CBRANCH_V_FMT<0b01111>; // the _V branch takes a single 5-bit major and no df
+
+class BSEL_V_ENC : MSA_VEC_FMT<0b00110, 0b011110>; // VEC layout, minor 0b011110
+
+class BSELI_B_ENC : MSA_I8_FMT<0b10, 0b000001>; // I8 layout, minor 0b000001
+
+class BSET_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001101>; // register form, minor 0b001101
+class BSET_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001101>;
+class BSET_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001101>;
+class BSET_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001101>;
+
+class BSETI_B_ENC : MSA_BIT_B_FMT<0b100, 0b001001>; // immediate form: same major, minor 0b001001
+class BSETI_H_ENC : MSA_BIT_H_FMT<0b100, 0b001001>;
+class BSETI_W_ENC : MSA_BIT_W_FMT<0b100, 0b001001>;
+class BSETI_D_ENC : MSA_BIT_D_FMT<0b100, 0b001001>;
+
+class BZ_B_ENC : MSA_CBRANCH_FMT<0b110, 0b00>; // BZ uses major 0b110 where BNZ uses 0b111
+class BZ_H_ENC : MSA_CBRANCH_FMT<0b110, 0b01>;
+class BZ_W_ENC : MSA_CBRANCH_FMT<0b110, 0b10>;
+class BZ_D_ENC : MSA_CBRANCH_FMT<0b110, 0b11>;
+
+class BZ_V_ENC : MSA_CBRANCH_V_FMT<0b01011>; // the _V branch takes a single 5-bit major and no df
+
+class CEQ_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001111>; // register compare, minor 0b001111
+class CEQ_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001111>;
+class CEQ_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001111>;
+class CEQ_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001111>;
+
+class CEQI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000111>; // immediate compare: same major, I5 layout, minor 0b000111
+class CEQI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000111>;
+class CEQI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000111>;
+class CEQI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000111>;
+
+class CFCMSA_ENC : MSA_ELM_CFCMSA_FMT<0b0001111110, 0b011001>; // <10-bit major, minor>; major differs from CTCMSA_ENC only in bits 4 and 5 of the literal
+
+class CLE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001111>; // _S uses major 0b100, _U uses 0b101; register forms use minor 0b001111
+class CLE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001111>;
+class CLE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001111>;
+class CLE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001111>;
+
+class CLE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001111>;
+class CLE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001111>;
+class CLE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001111>;
+class CLE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001111>;
+
+class CLEI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000111>; // immediate forms keep the major and use the I5 layout with minor 0b000111
+class CLEI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000111>;
+class CLEI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000111>;
+class CLEI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000111>;
+
+class CLEI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000111>;
+class CLEI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000111>;
+class CLEI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000111>;
+class CLEI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000111>;
+
+class CLT_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001111>; // _S uses major 0b010, _U uses 0b011; register forms use minor 0b001111
+class CLT_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001111>;
+class CLT_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001111>;
+class CLT_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001111>;
+
+class CLT_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001111>;
+class CLT_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001111>;
+class CLT_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001111>;
+class CLT_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001111>;
+
+class CLTI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000111>; // immediate forms keep the major and use the I5 layout with minor 0b000111
+class CLTI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000111>;
+class CLTI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000111>;
+class CLTI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000111>;
+
+class CLTI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000111>;
+class CLTI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000111>;
+class CLTI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000111>;
+class CLTI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000111>;
+
+class COPY_S_B_ENC : MSA_ELM_COPY_B_FMT<0b0010, 0b011001>; // the element width is chosen by the format class, not by a df argument
+class COPY_S_H_ENC : MSA_ELM_COPY_H_FMT<0b0010, 0b011001>;
+class COPY_S_W_ENC : MSA_ELM_COPY_W_FMT<0b0010, 0b011001>;
+class COPY_S_D_ENC : MSA_ELM_COPY_D_FMT<0b0010, 0b011001>;
+
+class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>; // COPY_U uses major 0b0011; no COPY_U_D_ENC is declared in this group
+class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>;
+class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>;
+
+class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>; // <10-bit major, minor>; shares minor 0b011001 with CFCMSA_ENC and the COPY_* encodings
+
+class DIV_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010010>; // _S uses major 0b100, _U uses 0b101; both use minor 0b010010
+class DIV_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010010>;
+class DIV_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010010>;
+class DIV_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010010>;
+
+class DIV_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010010>;
+class DIV_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010010>;
+class DIV_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010010>;
+class DIV_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010010>;
+
+class DOTP_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010011>; // DOTP/DPADD/DPSUB have no _B (df 0b00) encoding; majors run 0b000..0b101 under minor 0b010011
+class DOTP_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010011>;
+class DOTP_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010011>;
+
+class DOTP_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010011>;
+class DOTP_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010011>;
+class DOTP_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010011>;
+
+class DPADD_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010011>;
+class DPADD_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010011>;
+class DPADD_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010011>;
+
+class DPADD_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010011>;
+class DPADD_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010011>;
+class DPADD_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010011>;
+
+class DPSUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010011>;
+class DPSUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010011>;
+class DPSUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010011>;
+
+class DPSUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010011>;
+class DPSUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010011>;
+class DPSUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010011>;
+
+class FADD_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011011>; // 3RF layout: <4-bit major, 1-bit df, minor>; df 0/1 pairs with the _W/_D suffix
+class FADD_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011011>;
+
+class FCAF_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011010>; // same major as FADD but minor 0b011010
+class FCAF_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011010>;
+
+class FCEQ_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011010>;
+class FCEQ_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011010>;
+
+class FCLASS_W_ENC : MSA_2RF_FMT<0b110010000, 0b0, 0b011110>; // 2RF layout: <9-bit major, 1-bit df, minor>
+class FCLASS_D_ENC : MSA_2RF_FMT<0b110010000, 0b1, 0b011110>;
+
+class FCLE_W_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011010>;
+class FCLE_D_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011010>;
+
+class FCLT_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011010>;
+class FCLT_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011010>;
+
+class FCNE_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011100>;
+class FCNE_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011100>;
+
+class FCOR_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011100>;
+class FCOR_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011100>;
+
+class FCUEQ_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011010>;
+class FCUEQ_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011010>;
+
+class FCULE_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011010>;
+class FCULE_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011010>;
+
+class FCULT_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011010>;
+class FCULT_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011010>;
+
+class FCUN_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011010>;
+class FCUN_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011010>;
+
+class FCUNE_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011100>;
+class FCUNE_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011100>;
+
+class FDIV_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011011>;
+class FDIV_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011011>;
+
+class FEXDO_H_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011011>;
+class FEXDO_W_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011011>;
+
+class FEXP2_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011011>;
+class FEXP2_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011011>;
+
+class FEXUPL_W_ENC : MSA_2RF_FMT<0b110011000, 0b0, 0b011110>;
+class FEXUPL_D_ENC : MSA_2RF_FMT<0b110011000, 0b1, 0b011110>;
+
+class FEXUPR_W_ENC : MSA_2RF_FMT<0b110011001, 0b0, 0b011110>;
+class FEXUPR_D_ENC : MSA_2RF_FMT<0b110011001, 0b1, 0b011110>;
+
+class FFINT_S_W_ENC : MSA_2RF_FMT<0b110011110, 0b0, 0b011110>;
+class FFINT_S_D_ENC : MSA_2RF_FMT<0b110011110, 0b1, 0b011110>;
+
+class FFINT_U_W_ENC : MSA_2RF_FMT<0b110011111, 0b0, 0b011110>;
+class FFINT_U_D_ENC : MSA_2RF_FMT<0b110011111, 0b1, 0b011110>;
+
+class FFQL_W_ENC : MSA_2RF_FMT<0b110011010, 0b0, 0b011110>;
+class FFQL_D_ENC : MSA_2RF_FMT<0b110011010, 0b1, 0b011110>;
+
+class FFQR_W_ENC : MSA_2RF_FMT<0b110011011, 0b0, 0b011110>;
+class FFQR_D_ENC : MSA_2RF_FMT<0b110011011, 0b1, 0b011110>;
+
+class FILL_B_ENC : MSA_2R_FILL_FMT<0b11000000, 0b00, 0b011110>;
+class FILL_H_ENC : MSA_2R_FILL_FMT<0b11000000, 0b01, 0b011110>;
+class FILL_W_ENC : MSA_2R_FILL_FMT<0b11000000, 0b10, 0b011110>;
+class FILL_D_ENC : MSA_2R_FILL_D_FMT<0b11000000, 0b11, 0b011110>;
+
+class FLOG2_W_ENC : MSA_2RF_FMT<0b110010111, 0b0, 0b011110>;
+class FLOG2_D_ENC : MSA_2RF_FMT<0b110010111, 0b1, 0b011110>;
+
+class FMADD_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011011>;
+class FMADD_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011011>;
+
+class FMAX_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011011>;
+class FMAX_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011011>;
+
+class FMAX_A_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011011>;
+class FMAX_A_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011011>;
+
+class FMIN_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011011>;
+class FMIN_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011011>;
+
+class FMIN_A_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011011>;
+class FMIN_A_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011011>;
+
+class FMSUB_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011011>;
+class FMSUB_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011011>;
+
+class FMUL_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011011>;
+class FMUL_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011011>;
+
+class FRINT_W_ENC : MSA_2RF_FMT<0b110010110, 0b0, 0b011110>;
+class FRINT_D_ENC : MSA_2RF_FMT<0b110010110, 0b1, 0b011110>;
+
+class FRCP_W_ENC : MSA_2RF_FMT<0b110010101, 0b0, 0b011110>;
+class FRCP_D_ENC : MSA_2RF_FMT<0b110010101, 0b1, 0b011110>;
+
+class FRSQRT_W_ENC : MSA_2RF_FMT<0b110010100, 0b0, 0b011110>;
+class FRSQRT_D_ENC : MSA_2RF_FMT<0b110010100, 0b1, 0b011110>;
+
+class FSAF_W_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011010>;
+class FSAF_D_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011010>;
+
+class FSEQ_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011010>;
+class FSEQ_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011010>;
+
+class FSLE_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011010>;
+class FSLE_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011010>;
+
+class FSLT_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011010>;
+class FSLT_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011010>;
+
+class FSNE_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011100>;
+class FSNE_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011100>;
+
+class FSOR_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011100>;
+class FSOR_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011100>;
+
+class FSQRT_W_ENC : MSA_2RF_FMT<0b110010011, 0b0, 0b011110>;
+class FSQRT_D_ENC : MSA_2RF_FMT<0b110010011, 0b1, 0b011110>;
+
+class FSUB_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011011>;
+class FSUB_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011011>;
+
+class FSUEQ_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011010>;
+class FSUEQ_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011010>;
+
+class FSULE_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011010>;
+class FSULE_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011010>;
+
+class FSULT_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011010>;
+class FSULT_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011010>;
+
+class FSUN_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011010>;
+class FSUN_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011010>;
+
+class FSUNE_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011100>;
+class FSUNE_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011100>;
+
+class FTINT_S_W_ENC : MSA_2RF_FMT<0b110011100, 0b0, 0b011110>;
+class FTINT_S_D_ENC : MSA_2RF_FMT<0b110011100, 0b1, 0b011110>;
+
+class FTINT_U_W_ENC : MSA_2RF_FMT<0b110011101, 0b0, 0b011110>;
+class FTINT_U_D_ENC : MSA_2RF_FMT<0b110011101, 0b1, 0b011110>;
+
+class FTQ_H_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011011>;
+class FTQ_W_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011011>;
+
+class FTRUNC_S_W_ENC : MSA_2RF_FMT<0b110010001, 0b0, 0b011110>;
+class FTRUNC_S_D_ENC : MSA_2RF_FMT<0b110010001, 0b1, 0b011110>;
+
+class FTRUNC_U_W_ENC : MSA_2RF_FMT<0b110010010, 0b0, 0b011110>;
+class FTRUNC_U_D_ENC : MSA_2RF_FMT<0b110010010, 0b1, 0b011110>;
+
+// Horizontal add/subtract and interleave encoding records (MSA_3R_FMT), then
+// the element insert encodings (MSA_ELM_* formats).
+// For MSA_3R_FMT<a, b, c> the second argument follows the element-size suffix
+// (_B=0b00, _H=0b01, _W=0b10, _D=0b11); HADD/HSUB define no _B variant.
+// The last argument is 0b010101 for HADD/HSUB and 0b010100 for ILV*.
+class HADD_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010101>;
+class HADD_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010101>;
+class HADD_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010101>;
+
+class HADD_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010101>;
+class HADD_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010101>;
+class HADD_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010101>;
+
+class HSUB_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010101>;
+class HSUB_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010101>;
+class HSUB_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010101>;
+
+class HSUB_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010101>;
+class HSUB_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010101>;
+class HSUB_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010101>;
+
+class ILVEV_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010100>;
+class ILVEV_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010100>;
+class ILVEV_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010100>;
+class ILVEV_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010100>;
+
+class ILVL_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010100>;
+class ILVL_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010100>;
+class ILVL_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010100>;
+class ILVL_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010100>;
+
+class ILVOD_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010100>;
+class ILVOD_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010100>;
+class ILVOD_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010100>;
+class ILVOD_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010100>;
+
+class ILVR_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010100>;
+class ILVR_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010100>;
+class ILVR_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010100>;
+class ILVR_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010100>;
+
+// INSERT/INSVE pick a per-element-size format class instead of passing a
+// size argument; all variants of one mnemonic share the same two values.
+class INSERT_B_ENC : MSA_ELM_INSERT_B_FMT<0b0100, 0b011001>;
+class INSERT_H_ENC : MSA_ELM_INSERT_H_FMT<0b0100, 0b011001>;
+class INSERT_W_ENC : MSA_ELM_INSERT_W_FMT<0b0100, 0b011001>;
+class INSERT_D_ENC : MSA_ELM_INSERT_D_FMT<0b0100, 0b011001>;
+
+class INSVE_B_ENC : MSA_ELM_B_FMT<0b0101, 0b011001>;
+class INSVE_H_ENC : MSA_ELM_H_FMT<0b0101, 0b011001>;
+class INSVE_W_ENC : MSA_ELM_W_FMT<0b0101, 0b011001>;
+class INSVE_D_ENC : MSA_ELM_D_FMT<0b0101, 0b011001>;
+
+// Vector load encodings: MSA_MI10_FMT<sel2, c4> where sel2 follows the
+// element-size suffix (_B=0b00 .. _D=0b11) and c4 is 0b1000 for every LD_*.
+class LD_B_ENC : MSA_MI10_FMT<0b00, 0b1000>;
+class LD_H_ENC : MSA_MI10_FMT<0b01, 0b1000>;
+class LD_W_ENC : MSA_MI10_FMT<0b10, 0b1000>;
+class LD_D_ENC : MSA_MI10_FMT<0b11, 0b1000>;
+
+// Load-immediate encodings: MSA_I10_FMT with the same suffix-to-selector
+// mapping in the second argument.
+class LDI_B_ENC : MSA_I10_FMT<0b110, 0b00, 0b000111>;
+class LDI_H_ENC : MSA_I10_FMT<0b110, 0b01, 0b000111>;
+class LDI_W_ENC : MSA_I10_FMT<0b110, 0b10, 0b000111>;
+class LDI_D_ENC : MSA_I10_FMT<0b110, 0b11, 0b000111>;
+
+// LSA/DLSA use SPECIAL_*_FMT classes rather than an MSA_* format; each takes
+// a single six-bit argument.
+class LSA_ENC : SPECIAL_LSA_FMT<0b000101>;
+class DLSA_ENC : SPECIAL_DLSA_FMT<0b010101>;
+
+// Encoding records for the M* mnemonics.
+//   *_Q / *R_Q pairs : MSA_3RF_FMT<op4, sel1, 0b011100>, _H=0b0 and _W=0b1;
+//                      the rounding (R) form sets the top bit of op4
+//                      (MADD_Q 0b0101 -> MADDR_Q 0b1101, and so on).
+//   register forms   : MSA_3R_FMT<a, b, c>, b follows _B/_H/_W/_D = 0b00..0b11
+//   immediate forms  : MSA_I5_FMT<a, b, 0b000110>, same mapping for b; each
+//                      MAXI/MINI record uses the same `a` as its MAX/MIN
+//                      register counterpart.
+//   MOVE_V           : MSA_ELM_FMT with one ten-bit argument.
+class MADD_Q_H_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011100>;
+class MADD_Q_W_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011100>;
+
+class MADDR_Q_H_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011100>;
+class MADDR_Q_W_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011100>;
+
+class MADDV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010010>;
+class MADDV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010010>;
+class MADDV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010010>;
+class MADDV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010010>;
+
+class MAX_A_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001110>;
+class MAX_A_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001110>;
+class MAX_A_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001110>;
+class MAX_A_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001110>;
+
+class MAX_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001110>;
+class MAX_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001110>;
+class MAX_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001110>;
+class MAX_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001110>;
+
+class MAX_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001110>;
+class MAX_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001110>;
+class MAX_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001110>;
+class MAX_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001110>;
+
+class MAXI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000110>;
+class MAXI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000110>;
+class MAXI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000110>;
+class MAXI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000110>;
+
+class MAXI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000110>;
+class MAXI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000110>;
+class MAXI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000110>;
+class MAXI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000110>;
+
+class MIN_A_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001110>;
+class MIN_A_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001110>;
+class MIN_A_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001110>;
+class MIN_A_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001110>;
+
+class MIN_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001110>;
+class MIN_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001110>;
+class MIN_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001110>;
+class MIN_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001110>;
+
+class MIN_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001110>;
+class MIN_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001110>;
+class MIN_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001110>;
+class MIN_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001110>;
+
+class MINI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000110>;
+class MINI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000110>;
+class MINI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000110>;
+class MINI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000110>;
+
+class MINI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000110>;
+class MINI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000110>;
+class MINI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000110>;
+class MINI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000110>;
+
+class MOD_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010010>;
+class MOD_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010010>;
+class MOD_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010010>;
+class MOD_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010010>;
+
+class MOD_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010010>;
+class MOD_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010010>;
+class MOD_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010010>;
+class MOD_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010010>;
+
+class MOVE_V_ENC : MSA_ELM_FMT<0b0010111110, 0b011001>;
+
+class MSUB_Q_H_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011100>;
+class MSUB_Q_W_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011100>;
+
+class MSUBR_Q_H_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011100>;
+class MSUBR_Q_W_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011100>;
+
+class MSUBV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010010>;
+class MSUBV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010010>;
+class MSUBV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010010>;
+class MSUBV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010010>;
+
+class MUL_Q_H_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011100>;
+class MUL_Q_W_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011100>;
+
+class MULR_Q_H_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011100>;
+class MULR_Q_W_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011100>;
+
+class MULV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010010>;
+class MULV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010010>;
+class MULV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010010>;
+class MULV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010010>;
+
+// Encoding records for NLOC through PCNT.
+//   MSA_2R_FMT<op8, b, 0b011110>  - NLOC, NLZC, PCNT (op8 differs per mnemonic)
+//   MSA_VEC_FMT<op5, 0b011110>    - whole-vector logic (NOR_V, OR_V)
+//   MSA_I8_FMT<op2, 0b000000>     - byte-immediate logic (NORI_B, ORI_B)
+//   MSA_3R_FMT<a, b, 0b010100>    - PCKEV, PCKOD
+// Where present, `b` follows the element-size suffix (_B=0b00 .. _D=0b11).
+class NLOC_B_ENC : MSA_2R_FMT<0b11000010, 0b00, 0b011110>;
+class NLOC_H_ENC : MSA_2R_FMT<0b11000010, 0b01, 0b011110>;
+class NLOC_W_ENC : MSA_2R_FMT<0b11000010, 0b10, 0b011110>;
+class NLOC_D_ENC : MSA_2R_FMT<0b11000010, 0b11, 0b011110>;
+
+class NLZC_B_ENC : MSA_2R_FMT<0b11000011, 0b00, 0b011110>;
+class NLZC_H_ENC : MSA_2R_FMT<0b11000011, 0b01, 0b011110>;
+class NLZC_W_ENC : MSA_2R_FMT<0b11000011, 0b10, 0b011110>;
+class NLZC_D_ENC : MSA_2R_FMT<0b11000011, 0b11, 0b011110>;
+
+class NOR_V_ENC : MSA_VEC_FMT<0b00010, 0b011110>;
+
+class NORI_B_ENC : MSA_I8_FMT<0b10, 0b000000>;
+
+class OR_V_ENC : MSA_VEC_FMT<0b00001, 0b011110>;
+
+class ORI_B_ENC : MSA_I8_FMT<0b01, 0b000000>;
+
+class PCKEV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010100>;
+class PCKEV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010100>;
+class PCKEV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010100>;
+class PCKEV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010100>;
+
+class PCKOD_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010100>;
+class PCKOD_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010100>;
+class PCKOD_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010100>;
+class PCKOD_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010100>;
+
+class PCNT_B_ENC : MSA_2R_FMT<0b11000001, 0b00, 0b011110>;
+class PCNT_H_ENC : MSA_2R_FMT<0b11000001, 0b01, 0b011110>;
+class PCNT_W_ENC : MSA_2R_FMT<0b11000001, 0b10, 0b011110>;
+class PCNT_D_ENC : MSA_2R_FMT<0b11000001, 0b11, 0b011110>;
+
+// Encoding records for the S* mnemonics.
+//   MSA_BIT_{B,H,W,D}_FMT<a, c> - immediate bit/shift forms; the element size
+//       is chosen by the format class name. c is 0b001010 for SAT/SRARI/SRLRI
+//       and 0b001001 for SLLI/SRAI/SRLI.
+//   MSA_3R_FMT<a, b, c>         - register forms; b follows _B/_H/_W/_D =
+//       0b00..0b11.
+//   MSA_3R_INDEX_FMT<a, b, c>   - SLD and SPLAT (register-indexed forms).
+//   MSA_ELM_{B,H,W,D}_FMT<a, c> - SLDI and SPLATI (immediate-indexed forms).
+//   MSA_I8_FMT / MSA_I5_FMT / MSA_MI10_FMT - SHF, SUBVI and ST respectively.
+class SAT_S_B_ENC : MSA_BIT_B_FMT<0b000, 0b001010>;
+class SAT_S_H_ENC : MSA_BIT_H_FMT<0b000, 0b001010>;
+class SAT_S_W_ENC : MSA_BIT_W_FMT<0b000, 0b001010>;
+class SAT_S_D_ENC : MSA_BIT_D_FMT<0b000, 0b001010>;
+
+class SAT_U_B_ENC : MSA_BIT_B_FMT<0b001, 0b001010>;
+class SAT_U_H_ENC : MSA_BIT_H_FMT<0b001, 0b001010>;
+class SAT_U_W_ENC : MSA_BIT_W_FMT<0b001, 0b001010>;
+class SAT_U_D_ENC : MSA_BIT_D_FMT<0b001, 0b001010>;
+
+// SHF has _B/_H/_W variants only; the first argument follows the suffix.
+class SHF_B_ENC : MSA_I8_FMT<0b00, 0b000010>;
+class SHF_H_ENC : MSA_I8_FMT<0b01, 0b000010>;
+class SHF_W_ENC : MSA_I8_FMT<0b10, 0b000010>;
+
+class SLD_B_ENC : MSA_3R_INDEX_FMT<0b000, 0b00, 0b010100>;
+class SLD_H_ENC : MSA_3R_INDEX_FMT<0b000, 0b01, 0b010100>;
+class SLD_W_ENC : MSA_3R_INDEX_FMT<0b000, 0b10, 0b010100>;
+class SLD_D_ENC : MSA_3R_INDEX_FMT<0b000, 0b11, 0b010100>;
+
+class SLDI_B_ENC : MSA_ELM_B_FMT<0b0000, 0b011001>;
+class SLDI_H_ENC : MSA_ELM_H_FMT<0b0000, 0b011001>;
+class SLDI_W_ENC : MSA_ELM_W_FMT<0b0000, 0b011001>;
+class SLDI_D_ENC : MSA_ELM_D_FMT<0b0000, 0b011001>;
+
+class SLL_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001101>;
+class SLL_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001101>;
+class SLL_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001101>;
+class SLL_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001101>;
+
+class SLLI_B_ENC : MSA_BIT_B_FMT<0b000, 0b001001>;
+class SLLI_H_ENC : MSA_BIT_H_FMT<0b000, 0b001001>;
+class SLLI_W_ENC : MSA_BIT_W_FMT<0b000, 0b001001>;
+class SLLI_D_ENC : MSA_BIT_D_FMT<0b000, 0b001001>;
+
+class SPLAT_B_ENC : MSA_3R_INDEX_FMT<0b001, 0b00, 0b010100>;
+class SPLAT_H_ENC : MSA_3R_INDEX_FMT<0b001, 0b01, 0b010100>;
+class SPLAT_W_ENC : MSA_3R_INDEX_FMT<0b001, 0b10, 0b010100>;
+class SPLAT_D_ENC : MSA_3R_INDEX_FMT<0b001, 0b11, 0b010100>;
+
+class SPLATI_B_ENC : MSA_ELM_B_FMT<0b0001, 0b011001>;
+class SPLATI_H_ENC : MSA_ELM_H_FMT<0b0001, 0b011001>;
+class SPLATI_W_ENC : MSA_ELM_W_FMT<0b0001, 0b011001>;
+class SPLATI_D_ENC : MSA_ELM_D_FMT<0b0001, 0b011001>;
+
+class SRA_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001101>;
+class SRA_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001101>;
+class SRA_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001101>;
+class SRA_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001101>;
+
+class SRAI_B_ENC : MSA_BIT_B_FMT<0b001, 0b001001>;
+class SRAI_H_ENC : MSA_BIT_H_FMT<0b001, 0b001001>;
+class SRAI_W_ENC : MSA_BIT_W_FMT<0b001, 0b001001>;
+class SRAI_D_ENC : MSA_BIT_D_FMT<0b001, 0b001001>;
+
+class SRAR_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010101>;
+class SRAR_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010101>;
+class SRAR_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010101>;
+class SRAR_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010101>;
+
+class SRARI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001010>;
+class SRARI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001010>;
+class SRARI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001010>;
+class SRARI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001010>;
+
+class SRL_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001101>;
+class SRL_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001101>;
+class SRL_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001101>;
+class SRL_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001101>;
+
+class SRLI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001001>;
+class SRLI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001001>;
+class SRLI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001001>;
+class SRLI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001001>;
+
+class SRLR_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010101>;
+class SRLR_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010101>;
+class SRLR_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010101>;
+class SRLR_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010101>;
+
+class SRLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001010>;
+class SRLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001010>;
+class SRLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001010>;
+class SRLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001010>;
+
+// Vector store: same format class as LD_* but with 0b1001 as the last
+// argument.
+class ST_B_ENC : MSA_MI10_FMT<0b00, 0b1001>;
+class ST_H_ENC : MSA_MI10_FMT<0b01, 0b1001>;
+class ST_W_ENC : MSA_MI10_FMT<0b10, 0b1001>;
+class ST_D_ENC : MSA_MI10_FMT<0b11, 0b1001>;
+
+class SUBS_S_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010001>;
+class SUBS_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010001>;
+class SUBS_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010001>;
+class SUBS_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010001>;
+
+class SUBS_U_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010001>;
+class SUBS_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010001>;
+class SUBS_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010001>;
+class SUBS_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010001>;
+
+class SUBSUS_U_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010001>;
+class SUBSUS_U_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010001>;
+class SUBSUS_U_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010001>;
+class SUBSUS_U_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010001>;
+
+class SUBSUU_S_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010001>;
+class SUBSUU_S_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010001>;
+class SUBSUU_S_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010001>;
+class SUBSUU_S_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010001>;
+
+class SUBV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001110>;
+class SUBV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001110>;
+class SUBV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001110>;
+class SUBV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001110>;
+
+class SUBVI_B_ENC : MSA_I5_FMT<0b001, 0b00, 0b000110>;
+class SUBVI_H_ENC : MSA_I5_FMT<0b001, 0b01, 0b000110>;
+class SUBVI_W_ENC : MSA_I5_FMT<0b001, 0b10, 0b000110>;
+class SUBVI_D_ENC : MSA_I5_FMT<0b001, 0b11, 0b000110>;
+
+// VSHF encodings: MSA_3R_FMT<0b000, b, 0b010101> with b following the
+// element-size suffix (_B=0b00 .. _D=0b11).
+class VSHF_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010101>;
+class VSHF_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010101>;
+class VSHF_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010101>;
+class VSHF_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010101>;
+
+// Whole-vector XOR (MSA_VEC_FMT) and byte-immediate XOR (MSA_I8_FMT).
+class XOR_V_ENC : MSA_VEC_FMT<0b00011, 0b011110>;
+
+class XORI_B_ENC : MSA_I8_FMT<0b11, 0b000000>;
+
+// Instruction desc.
+// Description mixin: one vector output $wd, inputs $ws and an immediate $m.
+//   instr_asm - mnemonic placed before "\t$wd, $ws, $m"
+//   OpNode    - DAG operator matched by the pattern
+//   Imm       - ComplexPattern used for $m inside the selection pattern
+//   ROWD/ROWS - register operand types for $wd / $ws (ROWS defaults to ROWD)
+//   itin      - itinerary class, NoItinerary by default
+// The $m operand type is fixed to vsplat_uimm3 here, independently of Imm.
+class MSA_BIT_B_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ComplexPattern Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, vsplat_uimm3:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin: output $wd, inputs $ws and immediate $m, printed as
+// "<instr_asm>\t$wd, $ws, $m" and selected as (OpNode $ws, Imm:$m).
+// Same shape as MSA_BIT_B_DESC_BASE except that $m is declared with the
+// vsplat_uimm4 operand type.
+class MSA_BIT_H_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ComplexPattern Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, vsplat_uimm4:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin: output $wd, inputs $ws and immediate $m, printed as
+// "<instr_asm>\t$wd, $ws, $m" and selected as (OpNode $ws, Imm:$m).
+// Same shape as MSA_BIT_B_DESC_BASE except that $m is declared with the
+// vsplat_uimm5 operand type.
+class MSA_BIT_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ComplexPattern Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, vsplat_uimm5:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin: output $wd, inputs $ws and immediate $m, printed as
+// "<instr_asm>\t$wd, $ws, $m" and selected as (OpNode $ws, Imm:$m).
+// Same shape as MSA_BIT_B_DESC_BASE except that $m is declared with the
+// vsplat_uimm6 operand type.
+class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ComplexPattern Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, vsplat_uimm6:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Generic variant of the MSA_BIT_{B,H,W,D}_DESC_BASE mixins: the operand type
+// of $m is supplied by the caller (ImmOp) and the pattern matches $m with an
+// ImmLeaf (Imm) rather than a ComplexPattern.
+// Assembly form: "<instr_asm>\t$wd, $ws, $m".
+class MSA_BIT_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, ImmOp:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Shared description mixin for the immediate bit-insert forms; the
+// MSA_BIT_BINSLI_DESC_BASE and MSA_BIT_BINSRI_DESC_BASE wrappers forward all
+// of their arguments to it.
+//   instr_asm - mnemonic placed before "\t$wd, $ws, $m"
+//   Ty        - vector value type applied to the mask and to $ws in the pattern
+//   Mask      - SplatComplexPattern matching $m; Mask.OpClass is its operand type
+//   ROWD/ROWS - register operand types for $wd,$wd_in / $ws (ROWS defaults to ROWD)
+//   itin      - itinerary class, NoItinerary by default
+// $wd_in is an extra input constrained to the same register as $wd.
+class MSA_BIT_BINSXI_DESC_BASE<string instr_asm, ValueType Ty,
+ SplatComplexPattern Mask, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, Mask.OpClass:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ // Note that binsxi and vselect treat the condition operand the opposite
+ // way to each other.
+ // (vselect cond, if_set, if_clear)
+ // (BSEL_V cond, if_clear, if_set)
+ // Here the mask $m is the condition, $ws is taken where it is set and
+ // $wd_in where it is clear.
+ // Each operand is named with the same register operand class it has in
+ // InOperandList ($ws : ROWS, $wd_in : ROWD) so that the pattern stays
+ // consistent with the operand list when ROWS differs from ROWD.
+ list<dag> Pattern = [(set ROWD:$wd, (vselect (Ty Mask:$m), (Ty ROWS:$ws),
+ ROWD:$wd_in))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$wd = $wd_in";
+}
+
+// Thin wrapper: forwards every template argument unchanged to
+// MSA_BIT_BINSXI_DESC_BASE (ImmOp is passed in the Mask position) and adds no
+// fields of its own.
+class MSA_BIT_BINSLI_DESC_BASE<string instr_asm, ValueType Ty,
+ SplatComplexPattern ImmOp, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, ImmOp, ROWD, ROWS, itin>;
+
+// Thin wrapper: forwards every template argument unchanged to
+// MSA_BIT_BINSXI_DESC_BASE (ImmOp is passed in the Mask position) and adds no
+// fields of its own. Its body is identical to MSA_BIT_BINSLI_DESC_BASE.
+class MSA_BIT_BINSRI_DESC_BASE<string instr_asm, ValueType Ty,
+ SplatComplexPattern ImmOp, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, ImmOp, ROWD, ROWS, itin>;
+
+// Description mixin for "<instr_asm>\t$wd, $ws, $m" where the immediate is
+// described by one SplatComplexPattern: SplatImm.OpClass supplies the operand
+// type of $m and SplatImm itself matches $m in the selection pattern.
+class MSA_BIT_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SplatComplexPattern SplatImm,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin for element-copy instructions printed as
+// "<instr_asm>\t$rd, $ws[$n]".
+//   VecTy  - value type applied to $ws in the pattern
+//   ImmOp  - operand type of the element index $n
+//   Imm    - ImmLeaf matching $n in the pattern
+//   ROD    - register operand type of the result $rd
+//   ROWS   - register operand type of the source vector $ws
+class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ValueType VecTy, Operand ImmOp, ImmLeaf Imm,
+ RegisterOperand ROD, RegisterOperand ROWS,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROWS:$ws, ImmOp:$n);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $ws[$n]");
+ list<dag> Pattern = [(set ROD:$rd, (OpNode (VecTy ROWS:$ws), Imm:$n))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin printed as "<instr_asm>\t$wd, $ws[$n]". The previous
+// value of the destination is an explicit input ($wd_in) that is constrained
+// to the same register as $wd; the pattern passes it as OpNode's first
+// operand, followed by $ws and the immediate $n.
+// Unlike most mixins in this file, ROWS has no default here.
+class MSA_ELM_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS,
+ Operand ImmOp, ImmLeaf Imm,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ImmOp:$n);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+ Imm:$n))];
+ string Constraints = "$wd = $wd_in";
+ InstrItinClass Itinerary = itin;
+}
+
+// Pseudo-instruction counterpart of the element copy: derives from MSAPseudo
+// with output $wd (RCD), inputs $ws (RCWS) and index $n, and the pattern
+// (OpNode (VecTy $ws), Imm:$n). It takes RegisterClass arguments rather than
+// RegisterOperand ones and sets usesCustomInserter.
+class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
+ Operand ImmOp, ImmLeaf Imm, RegisterClass RCD,
+ RegisterClass RCWS> :
+ MSAPseudo<(outs RCD:$wd), (ins RCWS:$ws, ImmOp:$n),
+ [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), Imm:$n))]> {
+ bit usesCustomInserter = 1;
+}
+
+// Description mixin printed as "<instr_asm>\t$wd, $ws, $imm". The immediate
+// is described by SplatImm: SplatImm.OpClass is the operand type of $imm and
+// SplatImm matches it in the pattern (OpNode $ws, SplatImm:$imm).
+class MSA_I5_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SplatComplexPattern SplatImm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $imm");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$imm))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin printed as "<instr_asm>\t$wd, $ws, $u8". Same shape as
+// MSA_I5_DESC_BASE with the immediate operand named $u8: SplatImm.OpClass is
+// its operand type and SplatImm matches it in the pattern.
+class MSA_I8_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SplatComplexPattern SplatImm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$u8);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$u8))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin with a fixed DAG node: the pattern is always
+// (MipsSHF immZExt8:$u8, $ws), so no OpNode parameter is taken. Note the
+// immediate comes first in the node while the assembly string prints it last
+// ("<instr_asm>\t$wd, $ws, $u8"). The operand type of $u8 is uimm8.
+class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+ list<dag> Pattern = [(set ROWD:$wd, (MipsSHF immZExt8:$u8, ROWS:$ws))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin for load-immediate: output $wd and a single immediate
+// input $s10 of operand type vsplat_simm10, printed as
+// "<instr_asm>\t$wd, $s10". The pattern list is left empty and
+// hasSideEffects is set to 0 explicitly.
+class MSA_I10_LDI_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins vsplat_simm10:$s10);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $s10");
+ // LDI is matched using custom matching code in MipsSEISelDAGToDAG.cpp
+ list<dag> Pattern = [];
+ bit hasSideEffects = 0;
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin for one-input instructions: output $wd, input $ws,
+// printed as "<instr_asm>\t$wd, $ws" and selected as (OpNode $ws).
+// ROWS defaults to ROWD; itin defaults to NoItinerary.
+class MSA_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin for fill: output vector $wd, single input $rs of operand
+// type ROS, printed as "<instr_asm>\t$wd, $rs". The pattern wraps the node in
+// an explicit result type: (VT (OpNode $rs)).
+// ROS defaults to ROWD when the caller does not supply it.
+class MSA_2R_FILL_DESC_BASE<string instr_asm, ValueType VT,
+ SDPatternOperator OpNode, RegisterOperand ROWD,
+ RegisterOperand ROS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROS:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $rs");
+ list<dag> Pattern = [(set ROWD:$wd, (VT (OpNode ROS:$rs)))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Pseudo-instruction form of fill: derives from MSAPseudo with output $wd
+// (RCWD), input $fs (RCWS, defaulting to RCWD) and the pattern (OpNode $fs);
+// usesCustomInserter is set.
+// NOTE(review): the VT template argument is accepted but not referenced
+// anywhere in this class.
+class MSA_2R_FILL_PSEUDO_BASE<ValueType VT, SDPatternOperator OpNode,
+ RegisterClass RCWD, RegisterClass RCWS = RCWD> :
+ MSAPseudo<(outs RCWD:$wd), (ins RCWS:$fs),
+ [(set RCWD:$wd, (OpNode RCWS:$fs))]> {
+ let usesCustomInserter = 1;
+}
+
+// Description mixin for one-input instructions: output $wd, input $ws,
+// printed as "<instr_asm>\t$wd, $ws" and selected as (OpNode $ws).
+// The parameters and fields are the same as those of MSA_2R_DESC_BASE; it is
+// kept as a separately named class.
+class MSA_2RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin for two-input instructions: output $wd, inputs $ws and
+// $wt, printed as "<instr_asm>\t$wd, $ws, $wt" and selected as
+// (OpNode $ws, $wt). ROWS and ROWT both default to ROWD.
+class MSA_3R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Three-register description mixin whose destination is also read: $wd_in is
+// an extra input constrained to the same register as $wd and is passed as the
+// first operand of OpNode, followed by $ws and $wt.
+// Assembly form: "<instr_asm>\t$wd, $ws, $wt" ($wd_in is not printed).
+class MSA_3R_BINSX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+ ROWT:$wt))];
+ string Constraints = "$wd = $wd_in";
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin for register-indexed splat: inputs are the vector $ws and
+// a GPR32Opnd index $rt, printed as "<instr_asm>\t$wd, $ws[$rt]" and selected
+// as (OpNode $ws, $rt). The index operand type is fixed to GPR32Opnd.
+class MSA_3R_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, GPR32Opnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32Opnd:$rt))];
+ InstrItinClass Itinerary = itin;
+}
+
+// Description mixin with a fixed DAG node: the pattern is always
+// (MipsVSHF $wd_in, $ws, $wt), so no OpNode parameter is taken. $wd_in is an
+// extra input constrained to the same register as $wd.
+// Assembly form: "<instr_asm>\t$wd, $ws, $wt".
+class MSA_3R_VSHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+ list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF ROWD:$wd_in, ROWS:$ws,
+ ROWT:$wt))];
+ string Constraints = "$wd = $wd_in";
+ InstrItinClass Itinerary = itin;
+}
+
+// Register-indexed counterpart of MSA_ELM_SLD_DESC_BASE: the index is a
+// GPR32Opnd register $rt instead of an immediate. $wd_in is an extra input
+// constrained to the same register as $wd and is OpNode's first operand.
+// Assembly form: "<instr_asm>\t$wd, $ws[$rt]".
+class MSA_3R_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, GPR32Opnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+ GPR32Opnd:$rt))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$wd = $wd_in";
+}
+
+// Description mixin for instructions that print three registers but take
+// four DAG operands: the destination's previous value ($wd_in, constrained to
+// the same register as $wd) plus $ws and $wt.
+// Assembly form: "<instr_asm>\t$wd, $ws, $wt"; pattern:
+// (OpNode $wd_in, $ws, $wt).
+class MSA_3R_4R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+ ROWT:$wt))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$wd = $wd_in";
+}
+
+// Thin wrapper: forwards every template argument unchanged to
+// MSA_3R_DESC_BASE and adds no fields of its own.
+class MSA_3RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MSA_3R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+// Thin wrapper: forwards every template argument unchanged to
+// MSA_3R_4R_DESC_BASE and adds no fields of its own.
+class MSA_3RF_4RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MSA_3R_4R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+// Description mixin for conditional branches on a vector register: no
+// outputs, inputs $wt (ROWD) and a brtarget $offset, printed as
+// "<instr_asm>\t$wt, $offset". There is no selection pattern and no itinerary
+// parameter (NoItinerary is hard-coded). The record is flagged as a branch
+// and terminator with a delay slot, and lists AT in Defs.
+class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins ROWD:$wt, brtarget:$offset);
+ string AsmString = !strconcat(instr_asm, "\t$wt, $offset");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = NoItinerary;
+ bit isBranch = 1;
+ bit isTerminator = 1;
+ bit hasDelaySlot = 1;
+ list<Register> Defs = [AT];
+}
+
+// Description mixin for element insert, printed as
+// "<instr_asm>\t$wd[$n], $rs".
+//   ImmOp/Imm - operand type of the index $n / ImmLeaf matching it
+//   ROWD      - register operand type of $wd and of the tied input $wd_in
+//   ROS       - register operand type of the inserted value $rs
+// Pattern: (OpNode $wd_in, $rs, Imm:$n); $wd_in is constrained to the same
+// register as $wd.
+class MSA_INSERT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+ RegisterOperand ROS,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROS:$rs, ImmOp:$n);
+ string AsmString = !strconcat(instr_asm, "\t$wd[$n], $rs");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROS:$rs, Imm:$n))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$wd = $wd_in";
+}
+
+// Pseudo-instruction for inserting the register $fs into element $n (an
+// immediate) of a vector of type Ty. It is expanded by a custom inserter and
+// keeps $wd and $wd_in in the same register.
+class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode,
+                             ValueType Ty,
+                             Operand ImmOp,
+                             ImmLeaf Imm,
+                             RegisterOperand ROWD,
+                             RegisterOperand ROFS>
+    : MSAPseudo<(outs ROWD:$wd),
+                (ins ROWD:$wd_in, ImmOp:$n, ROFS:$fs),
+                [(set ROWD:$wd,
+                      (OpNode (Ty ROWD:$wd_in), ROFS:$fs, Imm:$n))]> {
+  string Constraints = "$wd = $wd_in";
+  bit usesCustomInserter = 1;
+}
+
+// Variant of the insert pseudo whose element index $n is a register (ROIdx)
+// rather than an immediate. It is expanded by a custom inserter and keeps $wd
+// and $wd_in in the same register.
+class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode,
+                                  ValueType Ty,
+                                  RegisterOperand ROWD,
+                                  RegisterOperand ROFS,
+                                  RegisterOperand ROIdx>
+    : MSAPseudo<(outs ROWD:$wd),
+                (ins ROWD:$wd_in, ROIdx:$n, ROFS:$fs),
+                [(set ROWD:$wd,
+                      (OpNode (Ty ROWD:$wd_in), ROFS:$fs, ROIdx:$n))]> {
+  string Constraints = "$wd = $wd_in";
+  bit usesCustomInserter = 1;
+}
+
+// Description for copying element $n2 of $ws into element $n of $wd. The
+// source index uses the uimmz operand and is matched with immz; the
+// destination vector is tied to $wd_in.
+class MSA_INSVE_DESC_BASE<string instr_asm,
+                          SDPatternOperator OpNode,
+                          Operand ImmOp,
+                          ImmLeaf Imm,
+                          RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+  string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]");
+  dag InOperandList = (ins ROWD:$wd_in, ImmOp:$n, ROWS:$ws, uimmz:$n2);
+  dag OutOperandList = (outs ROWD:$wd);
+  list<dag> Pattern =
+      [(set ROWD:$wd, (OpNode ROWD:$wd_in, Imm:$n, ROWS:$ws, immz:$n2))];
+}
+
+// Description for the two-source ".v" vector instructions: $wd is the
+// result of OpNode applied to $ws and $wt, with no tied operand.
+class MSA_VEC_DESC_BASE<string instr_asm,
+                        SDPatternOperator OpNode,
+                        RegisterOperand ROWD,
+                        RegisterOperand ROWS = ROWD,
+                        RegisterOperand ROWT = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  InstrItinClass Itinerary = itin;
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+  dag OutOperandList = (outs ROWD:$wd);
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+}
+
+// Description for the element-splat form "$wd, $ws[$n]". It is selected from
+// a MipsVSHF node whose mask is the splat immediate $n and whose two vector
+// inputs are both $ws.
+class MSA_ELM_SPLAT_DESC_BASE<string instr_asm,
+                              SplatComplexPattern SplatImm,
+                              RegisterOperand ROWD,
+                              RegisterOperand ROWS = ROWD,
+                              InstrItinClass itin = NoItinerary> {
+  InstrItinClass Itinerary = itin;
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$n);
+  dag OutOperandList = (outs ROWD:$wd);
+  list<dag> Pattern =
+      [(set ROWD:$wd, (MipsVSHF SplatImm:$n, ROWS:$ws, ROWS:$ws))];
+}
+
+// Pseudo counterpart of MSA_VEC_DESC_BASE: same operands and pattern, but
+// built on MSAPseudo and therefore without an assembly string.
+class MSA_VEC_PSEUDO_BASE<SDPatternOperator OpNode,
+                          RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          RegisterOperand ROWT = ROWD>
+    : MSAPseudo<(outs ROWD:$wd),
+                (ins ROWS:$ws, ROWT:$wt),
+                [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))]>;
+
+// add_a.{b,h,w,d}: three-register descriptions selected from the
+// int_mips_add_a_* intrinsics; marked commutable.
+class ADD_A_B_DESC : MSA_3R_DESC_BASE<"add_a.b", int_mips_add_a_b, MSA128BOpnd>,
+ IsCommutable;
+class ADD_A_H_DESC : MSA_3R_DESC_BASE<"add_a.h", int_mips_add_a_h, MSA128HOpnd>,
+ IsCommutable;
+class ADD_A_W_DESC : MSA_3R_DESC_BASE<"add_a.w", int_mips_add_a_w, MSA128WOpnd>,
+ IsCommutable;
+class ADD_A_D_DESC : MSA_3R_DESC_BASE<"add_a.d", int_mips_add_a_d, MSA128DOpnd>,
+ IsCommutable;
+
+// adds_a.{b,h,w,d}: selected from the int_mips_adds_a_* intrinsics;
+// commutable.
+class ADDS_A_B_DESC : MSA_3R_DESC_BASE<"adds_a.b", int_mips_adds_a_b,
+ MSA128BOpnd>, IsCommutable;
+class ADDS_A_H_DESC : MSA_3R_DESC_BASE<"adds_a.h", int_mips_adds_a_h,
+ MSA128HOpnd>, IsCommutable;
+class ADDS_A_W_DESC : MSA_3R_DESC_BASE<"adds_a.w", int_mips_adds_a_w,
+ MSA128WOpnd>, IsCommutable;
+class ADDS_A_D_DESC : MSA_3R_DESC_BASE<"adds_a.d", int_mips_adds_a_d,
+ MSA128DOpnd>, IsCommutable;
+
+// adds_s.{b,h,w,d}: selected from the int_mips_adds_s_* intrinsics;
+// commutable.
+class ADDS_S_B_DESC : MSA_3R_DESC_BASE<"adds_s.b", int_mips_adds_s_b,
+ MSA128BOpnd>, IsCommutable;
+class ADDS_S_H_DESC : MSA_3R_DESC_BASE<"adds_s.h", int_mips_adds_s_h,
+ MSA128HOpnd>, IsCommutable;
+class ADDS_S_W_DESC : MSA_3R_DESC_BASE<"adds_s.w", int_mips_adds_s_w,
+ MSA128WOpnd>, IsCommutable;
+class ADDS_S_D_DESC : MSA_3R_DESC_BASE<"adds_s.d", int_mips_adds_s_d,
+ MSA128DOpnd>, IsCommutable;
+
+// adds_u.{b,h,w,d}: selected from the int_mips_adds_u_* intrinsics;
+// commutable.
+class ADDS_U_B_DESC : MSA_3R_DESC_BASE<"adds_u.b", int_mips_adds_u_b,
+ MSA128BOpnd>, IsCommutable;
+class ADDS_U_H_DESC : MSA_3R_DESC_BASE<"adds_u.h", int_mips_adds_u_h,
+ MSA128HOpnd>, IsCommutable;
+class ADDS_U_W_DESC : MSA_3R_DESC_BASE<"adds_u.w", int_mips_adds_u_w,
+ MSA128WOpnd>, IsCommutable;
+class ADDS_U_D_DESC : MSA_3R_DESC_BASE<"adds_u.d", int_mips_adds_u_d,
+ MSA128DOpnd>, IsCommutable;
+
+// addv.{b,h,w,d}: selected directly from the generic `add` node; commutable.
+class ADDV_B_DESC : MSA_3R_DESC_BASE<"addv.b", add, MSA128BOpnd>, IsCommutable;
+class ADDV_H_DESC : MSA_3R_DESC_BASE<"addv.h", add, MSA128HOpnd>, IsCommutable;
+class ADDV_W_DESC : MSA_3R_DESC_BASE<"addv.w", add, MSA128WOpnd>, IsCommutable;
+class ADDV_D_DESC : MSA_3R_DESC_BASE<"addv.d", add, MSA128DOpnd>, IsCommutable;
+
+// addvi.{b,h,w,d}: `add` whose second operand matches a vsplati*_uimm5 splat
+// pattern.
+class ADDVI_B_DESC : MSA_I5_DESC_BASE<"addvi.b", add, vsplati8_uimm5,
+ MSA128BOpnd>;
+class ADDVI_H_DESC : MSA_I5_DESC_BASE<"addvi.h", add, vsplati16_uimm5,
+ MSA128HOpnd>;
+class ADDVI_W_DESC : MSA_I5_DESC_BASE<"addvi.w", add, vsplati32_uimm5,
+ MSA128WOpnd>;
+class ADDVI_D_DESC : MSA_I5_DESC_BASE<"addvi.d", add, vsplati64_uimm5,
+ MSA128DOpnd>;
+
+// and.v: the real instruction is described on the byte-vector operand class;
+// the H/W/D variants are pseudos that match `and` on the other operand
+// classes.
+class AND_V_DESC : MSA_VEC_DESC_BASE<"and.v", and, MSA128BOpnd>;
+class AND_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128HOpnd>;
+class AND_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128WOpnd>;
+class AND_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128DOpnd>;
+
+// andi.b: `and` with a vsplati8_uimm8 splat operand.
+class ANDI_B_DESC : MSA_I8_DESC_BASE<"andi.b", and, vsplati8_uimm8,
+ MSA128BOpnd>;
+
+// asub_s.{b,h,w,d}: selected from the int_mips_asub_s_* intrinsics. Not
+// marked commutable here.
+class ASUB_S_B_DESC : MSA_3R_DESC_BASE<"asub_s.b", int_mips_asub_s_b,
+ MSA128BOpnd>;
+class ASUB_S_H_DESC : MSA_3R_DESC_BASE<"asub_s.h", int_mips_asub_s_h,
+ MSA128HOpnd>;
+class ASUB_S_W_DESC : MSA_3R_DESC_BASE<"asub_s.w", int_mips_asub_s_w,
+ MSA128WOpnd>;
+class ASUB_S_D_DESC : MSA_3R_DESC_BASE<"asub_s.d", int_mips_asub_s_d,
+ MSA128DOpnd>;
+
+// asub_u.{b,h,w,d}: selected from the int_mips_asub_u_* intrinsics.
+class ASUB_U_B_DESC : MSA_3R_DESC_BASE<"asub_u.b", int_mips_asub_u_b,
+ MSA128BOpnd>;
+class ASUB_U_H_DESC : MSA_3R_DESC_BASE<"asub_u.h", int_mips_asub_u_h,
+ MSA128HOpnd>;
+class ASUB_U_W_DESC : MSA_3R_DESC_BASE<"asub_u.w", int_mips_asub_u_w,
+ MSA128WOpnd>;
+class ASUB_U_D_DESC : MSA_3R_DESC_BASE<"asub_u.d", int_mips_asub_u_d,
+ MSA128DOpnd>;
+
+// ave_s.{b,h,w,d}: selected from the int_mips_ave_s_* intrinsics; commutable.
+class AVE_S_B_DESC : MSA_3R_DESC_BASE<"ave_s.b", int_mips_ave_s_b, MSA128BOpnd>,
+ IsCommutable;
+class AVE_S_H_DESC : MSA_3R_DESC_BASE<"ave_s.h", int_mips_ave_s_h, MSA128HOpnd>,
+ IsCommutable;
+class AVE_S_W_DESC : MSA_3R_DESC_BASE<"ave_s.w", int_mips_ave_s_w, MSA128WOpnd>,
+ IsCommutable;
+class AVE_S_D_DESC : MSA_3R_DESC_BASE<"ave_s.d", int_mips_ave_s_d, MSA128DOpnd>,
+ IsCommutable;
+
+// ave_u.{b,h,w,d}: selected from the int_mips_ave_u_* intrinsics; commutable.
+class AVE_U_B_DESC : MSA_3R_DESC_BASE<"ave_u.b", int_mips_ave_u_b, MSA128BOpnd>,
+ IsCommutable;
+class AVE_U_H_DESC : MSA_3R_DESC_BASE<"ave_u.h", int_mips_ave_u_h, MSA128HOpnd>,
+ IsCommutable;
+class AVE_U_W_DESC : MSA_3R_DESC_BASE<"ave_u.w", int_mips_ave_u_w, MSA128WOpnd>,
+ IsCommutable;
+class AVE_U_D_DESC : MSA_3R_DESC_BASE<"ave_u.d", int_mips_ave_u_d, MSA128DOpnd>,
+ IsCommutable;
+
+// aver_s.{b,h,w,d}: selected from the int_mips_aver_s_* intrinsics;
+// commutable.
+class AVER_S_B_DESC : MSA_3R_DESC_BASE<"aver_s.b", int_mips_aver_s_b,
+ MSA128BOpnd>, IsCommutable;
+class AVER_S_H_DESC : MSA_3R_DESC_BASE<"aver_s.h", int_mips_aver_s_h,
+ MSA128HOpnd>, IsCommutable;
+class AVER_S_W_DESC : MSA_3R_DESC_BASE<"aver_s.w", int_mips_aver_s_w,
+ MSA128WOpnd>, IsCommutable;
+class AVER_S_D_DESC : MSA_3R_DESC_BASE<"aver_s.d", int_mips_aver_s_d,
+ MSA128DOpnd>, IsCommutable;
+
+// aver_u.{b,h,w,d}: selected from the int_mips_aver_u_* intrinsics;
+// commutable.
+class AVER_U_B_DESC : MSA_3R_DESC_BASE<"aver_u.b", int_mips_aver_u_b,
+ MSA128BOpnd>, IsCommutable;
+class AVER_U_H_DESC : MSA_3R_DESC_BASE<"aver_u.h", int_mips_aver_u_h,
+ MSA128HOpnd>, IsCommutable;
+class AVER_U_W_DESC : MSA_3R_DESC_BASE<"aver_u.w", int_mips_aver_u_w,
+ MSA128WOpnd>, IsCommutable;
+class AVER_U_D_DESC : MSA_3R_DESC_BASE<"aver_u.d", int_mips_aver_u_d,
+ MSA128DOpnd>, IsCommutable;
+
+// bclr.{b,h,w,d}: register form, selected through the vbclr_* pattern
+// operators (defined outside this chunk).
+class BCLR_B_DESC : MSA_3R_DESC_BASE<"bclr.b", vbclr_b, MSA128BOpnd>;
+class BCLR_H_DESC : MSA_3R_DESC_BASE<"bclr.h", vbclr_h, MSA128HOpnd>;
+class BCLR_W_DESC : MSA_3R_DESC_BASE<"bclr.w", vbclr_w, MSA128WOpnd>;
+class BCLR_D_DESC : MSA_3R_DESC_BASE<"bclr.d", vbclr_d, MSA128DOpnd>;
+
+// bclri.{b,h,w,d}: immediate form, matched as `and` with a
+// vsplat_uimm_inv_pow2 splat operand.
+class BCLRI_B_DESC : MSA_BIT_B_DESC_BASE<"bclri.b", and, vsplat_uimm_inv_pow2,
+ MSA128BOpnd>;
+class BCLRI_H_DESC : MSA_BIT_H_DESC_BASE<"bclri.h", and, vsplat_uimm_inv_pow2,
+ MSA128HOpnd>;
+class BCLRI_W_DESC : MSA_BIT_W_DESC_BASE<"bclri.w", and, vsplat_uimm_inv_pow2,
+ MSA128WOpnd>;
+class BCLRI_D_DESC : MSA_BIT_D_DESC_BASE<"bclri.d", and, vsplat_uimm_inv_pow2,
+ MSA128DOpnd>;
+
+// binsl.{b,h,w,d}: register form, selected from the int_mips_binsl_*
+// intrinsics via MSA_3R_BINSX_DESC_BASE.
+class BINSL_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.b", int_mips_binsl_b,
+ MSA128BOpnd>;
+class BINSL_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.h", int_mips_binsl_h,
+ MSA128HOpnd>;
+class BINSL_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.w", int_mips_binsl_w,
+ MSA128WOpnd>;
+class BINSL_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.d", int_mips_binsl_d,
+ MSA128DOpnd>;
+
+// binsli.{b,h,w,d}: immediate form; the mask operand is matched by the
+// vsplat_maskl_bits_uimm{3,4,5,6} pattern for the element width.
+class BINSLI_B_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.b", v16i8, vsplat_maskl_bits_uimm3, MSA128BOpnd>;
+class BINSLI_H_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.h", v8i16, vsplat_maskl_bits_uimm4, MSA128HOpnd>;
+class BINSLI_W_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.w", v4i32, vsplat_maskl_bits_uimm5, MSA128WOpnd>;
+class BINSLI_D_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.d", v2i64, vsplat_maskl_bits_uimm6, MSA128DOpnd>;
+
+// binsr.{b,h,w,d}: register form, selected from the int_mips_binsr_*
+// intrinsics via MSA_3R_BINSX_DESC_BASE.
+class BINSR_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.b", int_mips_binsr_b,
+ MSA128BOpnd>;
+class BINSR_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.h", int_mips_binsr_h,
+ MSA128HOpnd>;
+class BINSR_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.w", int_mips_binsr_w,
+ MSA128WOpnd>;
+class BINSR_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.d", int_mips_binsr_d,
+ MSA128DOpnd>;
+
+// binsri.{b,h,w,d}: immediate form; the mask operand is matched by the
+// vsplat_maskr_bits_uimm{3,4,5,6} pattern for the element width.
+class BINSRI_B_DESC
+ : MSA_BIT_BINSRI_DESC_BASE<"binsri.b", v16i8, vsplat_maskr_bits_uimm3,
+ MSA128BOpnd>;
+class BINSRI_H_DESC
+ : MSA_BIT_BINSRI_DESC_BASE<"binsri.h", v8i16, vsplat_maskr_bits_uimm4,
+ MSA128HOpnd>;
+class BINSRI_W_DESC
+ : MSA_BIT_BINSRI_DESC_BASE<"binsri.w", v4i32, vsplat_maskr_bits_uimm5,
+ MSA128WOpnd>;
+class BINSRI_D_DESC
+ : MSA_BIT_BINSRI_DESC_BASE<"binsri.d", v2i64, vsplat_maskr_bits_uimm6,
+ MSA128DOpnd>;
+
+// bmnz.v: matched as a vselect whose condition is $wt, choosing $ws where the
+// condition is set and the incoming destination ($wd_in) elsewhere.
+class BMNZ_V_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt);
+ string AsmString = "bmnz.v\t$wd, $ws, $wt";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wd_in))];
+ InstrItinClass Itinerary = NoItinerary;
+ // The destination is read as well as written.
+ string Constraints = "$wd = $wd_in";
+}
+
+// bmnzi.b: immediate form of bmnz.v. The vselect condition is the splatted
+// 8-bit immediate $u8; $ws is chosen where it is set, $wd_in elsewhere.
+class BMNZI_B_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ vsplat_uimm8:$u8);
+ string AsmString = "bmnzi.b\t$wd, $ws, $u8";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wd_in))];
+ InstrItinClass Itinerary = NoItinerary;
+ // The destination is read as well as written.
+ string Constraints = "$wd = $wd_in";
+}
+
+// bmz.v: mirror image of bmnz.v. The vselect condition is $wt, but the
+// incoming destination ($wd_in) is chosen where it is set and $ws elsewhere.
+class BMZ_V_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt);
+ string AsmString = "bmz.v\t$wd, $ws, $wt";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+ MSA128BOpnd:$wd_in,
+ MSA128BOpnd:$ws))];
+ InstrItinClass Itinerary = NoItinerary;
+ // The destination is read as well as written.
+ string Constraints = "$wd = $wd_in";
+}
+
+// bmzi.b: immediate form of bmz.v. The vselect condition is the splatted
+// 8-bit immediate $u8; $wd_in is chosen where it is set, $ws elsewhere.
+class BMZI_B_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ vsplat_uimm8:$u8);
+ string AsmString = "bmzi.b\t$wd, $ws, $u8";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+ MSA128BOpnd:$wd_in,
+ MSA128BOpnd:$ws))];
+ InstrItinClass Itinerary = NoItinerary;
+ // The destination is read as well as written.
+ string Constraints = "$wd = $wd_in";
+}
+
+// bneg.{b,h,w,d}: register form, selected through the vbneg_* pattern
+// operators (defined outside this chunk).
+class BNEG_B_DESC : MSA_3R_DESC_BASE<"bneg.b", vbneg_b, MSA128BOpnd>;
+class BNEG_H_DESC : MSA_3R_DESC_BASE<"bneg.h", vbneg_h, MSA128HOpnd>;
+class BNEG_W_DESC : MSA_3R_DESC_BASE<"bneg.w", vbneg_w, MSA128WOpnd>;
+class BNEG_D_DESC : MSA_3R_DESC_BASE<"bneg.d", vbneg_d, MSA128DOpnd>;
+
+// bnegi.{b,h,w,d}: immediate form, matched as `xor` with a vsplat_uimm_pow2
+// splat operand.
+class BNEGI_B_DESC : MSA_BIT_B_DESC_BASE<"bnegi.b", xor, vsplat_uimm_pow2,
+ MSA128BOpnd>;
+class BNEGI_H_DESC : MSA_BIT_H_DESC_BASE<"bnegi.h", xor, vsplat_uimm_pow2,
+ MSA128HOpnd>;
+class BNEGI_W_DESC : MSA_BIT_W_DESC_BASE<"bnegi.w", xor, vsplat_uimm_pow2,
+ MSA128WOpnd>;
+class BNEGI_D_DESC : MSA_BIT_D_DESC_BASE<"bnegi.d", xor, vsplat_uimm_pow2,
+ MSA128DOpnd>;
+
+// bnz.{b,h,w,d} and bnz.v: conditional branches built on
+// MSA_CBRANCH_DESC_BASE, so they carry no selection pattern of their own.
+// bnz.v uses the byte-vector operand class.
+class BNZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bnz.b", MSA128BOpnd>;
+class BNZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bnz.h", MSA128HOpnd>;
+class BNZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bnz.w", MSA128WOpnd>;
+class BNZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bnz.d", MSA128DOpnd>;
+
+class BNZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bnz.v", MSA128BOpnd>;
+
+// bsel.v: here the incoming destination ($wd_in) is the vselect condition
+// itself; $wt is chosen where it is set and $ws where it is clear.
+class BSEL_V_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt);
+ string AsmString = "bsel.v\t$wd, $ws, $wt";
+ // Note that vselect and BSEL_V treat the condition operand the opposite way
+ // from each other.
+ // (vselect cond, if_set, if_clear)
+ // (BSEL_V cond, if_clear, if_set)
+ list<dag> Pattern = [(set MSA128BOpnd:$wd,
+ (vselect MSA128BOpnd:$wd_in, MSA128BOpnd:$wt,
+ MSA128BOpnd:$ws))];
+ InstrItinClass Itinerary = NoItinerary;
+ // The condition register is overwritten with the result.
+ string Constraints = "$wd = $wd_in";
+}
+
+// bseli.b: immediate form of bsel.v. The incoming destination ($wd_in) is
+// the vselect condition; the splatted immediate $u8 is chosen where it is set
+// and $ws where it is clear.
+class BSELI_B_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ vsplat_uimm8:$u8);
+ string AsmString = "bseli.b\t$wd, $ws, $u8";
+ // Note that vselect and BSELI_B treat the condition operand the opposite way
+ // from each other.
+ // (vselect cond, if_set, if_clear)
+ // (BSELI_B cond, if_clear, if_set)
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wd_in,
+ vsplati8_uimm8:$u8,
+ MSA128BOpnd:$ws))];
+ InstrItinClass Itinerary = NoItinerary;
+ // The condition register is overwritten with the result.
+ string Constraints = "$wd = $wd_in";
+}
+
+// bset.{b,h,w,d}: register form, selected through the vbset_* pattern
+// operators (defined outside this chunk).
+class BSET_B_DESC : MSA_3R_DESC_BASE<"bset.b", vbset_b, MSA128BOpnd>;
+class BSET_H_DESC : MSA_3R_DESC_BASE<"bset.h", vbset_h, MSA128HOpnd>;
+class BSET_W_DESC : MSA_3R_DESC_BASE<"bset.w", vbset_w, MSA128WOpnd>;
+class BSET_D_DESC : MSA_3R_DESC_BASE<"bset.d", vbset_d, MSA128DOpnd>;
+
+// bseti.{b,h,w,d}: immediate form, matched as `or` with a vsplat_uimm_pow2
+// splat operand.
+class BSETI_B_DESC : MSA_BIT_B_DESC_BASE<"bseti.b", or, vsplat_uimm_pow2,
+ MSA128BOpnd>;
+class BSETI_H_DESC : MSA_BIT_H_DESC_BASE<"bseti.h", or, vsplat_uimm_pow2,
+ MSA128HOpnd>;
+class BSETI_W_DESC : MSA_BIT_W_DESC_BASE<"bseti.w", or, vsplat_uimm_pow2,
+ MSA128WOpnd>;
+class BSETI_D_DESC : MSA_BIT_D_DESC_BASE<"bseti.d", or, vsplat_uimm_pow2,
+ MSA128DOpnd>;
+
+// bz.{b,h,w,d} and bz.v: conditional branches built on
+// MSA_CBRANCH_DESC_BASE, so they carry no selection pattern of their own.
+// bz.v uses the byte-vector operand class.
+class BZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bz.b", MSA128BOpnd>;
+class BZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bz.h", MSA128HOpnd>;
+class BZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bz.w", MSA128WOpnd>;
+class BZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bz.d", MSA128DOpnd>;
+
+class BZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bz.v", MSA128BOpnd>;
+
+// ceq.{b,h,w,d}: register compare, selected through the vseteq_* pattern
+// operators; commutable.
+class CEQ_B_DESC : MSA_3R_DESC_BASE<"ceq.b", vseteq_v16i8, MSA128BOpnd>,
+ IsCommutable;
+class CEQ_H_DESC : MSA_3R_DESC_BASE<"ceq.h", vseteq_v8i16, MSA128HOpnd>,
+ IsCommutable;
+class CEQ_W_DESC : MSA_3R_DESC_BASE<"ceq.w", vseteq_v4i32, MSA128WOpnd>,
+ IsCommutable;
+class CEQ_D_DESC : MSA_3R_DESC_BASE<"ceq.d", vseteq_v2i64, MSA128DOpnd>,
+ IsCommutable;
+
+// ceqi.{b,h,w,d}: same compare with a vsplati*_simm5 splat as the second
+// operand.
+class CEQI_B_DESC : MSA_I5_DESC_BASE<"ceqi.b", vseteq_v16i8, vsplati8_simm5,
+ MSA128BOpnd>;
+class CEQI_H_DESC : MSA_I5_DESC_BASE<"ceqi.h", vseteq_v8i16, vsplati16_simm5,
+ MSA128HOpnd>;
+class CEQI_W_DESC : MSA_I5_DESC_BASE<"ceqi.w", vseteq_v4i32, vsplati32_simm5,
+ MSA128WOpnd>;
+class CEQI_D_DESC : MSA_I5_DESC_BASE<"ceqi.d", vseteq_v2i64, vsplati64_simm5,
+ MSA128DOpnd>;
+
+// cfcmsa: takes an MSA control register ($cs) and produces a 32-bit GPR
+// ($rd). There is no selection pattern, and it is flagged as having side
+// effects.
+class CFCMSA_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rd);
+ dag InOperandList = (ins MSA128CROpnd:$cs);
+ string AsmString = "cfcmsa\t$rd, $cs";
+ InstrItinClass Itinerary = NoItinerary;
+ bit hasSideEffects = 1;
+}
+
+// cle_s.{b,h,w,d}: register compare, selected through the vsetle_* pattern
+// operators.
+class CLE_S_B_DESC : MSA_3R_DESC_BASE<"cle_s.b", vsetle_v16i8, MSA128BOpnd>;
+class CLE_S_H_DESC : MSA_3R_DESC_BASE<"cle_s.h", vsetle_v8i16, MSA128HOpnd>;
+class CLE_S_W_DESC : MSA_3R_DESC_BASE<"cle_s.w", vsetle_v4i32, MSA128WOpnd>;
+class CLE_S_D_DESC : MSA_3R_DESC_BASE<"cle_s.d", vsetle_v2i64, MSA128DOpnd>;
+
+// cle_u.{b,h,w,d}: register compare, selected through the vsetule_* pattern
+// operators.
+class CLE_U_B_DESC : MSA_3R_DESC_BASE<"cle_u.b", vsetule_v16i8, MSA128BOpnd>;
+class CLE_U_H_DESC : MSA_3R_DESC_BASE<"cle_u.h", vsetule_v8i16, MSA128HOpnd>;
+class CLE_U_W_DESC : MSA_3R_DESC_BASE<"cle_u.w", vsetule_v4i32, MSA128WOpnd>;
+class CLE_U_D_DESC : MSA_3R_DESC_BASE<"cle_u.d", vsetule_v2i64, MSA128DOpnd>;
+
+// clei_s.{b,h,w,d}: vsetle_* with a vsplati*_simm5 splat operand.
+class CLEI_S_B_DESC : MSA_I5_DESC_BASE<"clei_s.b", vsetle_v16i8,
+ vsplati8_simm5, MSA128BOpnd>;
+class CLEI_S_H_DESC : MSA_I5_DESC_BASE<"clei_s.h", vsetle_v8i16,
+ vsplati16_simm5, MSA128HOpnd>;
+class CLEI_S_W_DESC : MSA_I5_DESC_BASE<"clei_s.w", vsetle_v4i32,
+ vsplati32_simm5, MSA128WOpnd>;
+class CLEI_S_D_DESC : MSA_I5_DESC_BASE<"clei_s.d", vsetle_v2i64,
+ vsplati64_simm5, MSA128DOpnd>;
+
+// clei_u.{b,h,w,d}: vsetule_* with a vsplati*_uimm5 splat operand.
+class CLEI_U_B_DESC : MSA_I5_DESC_BASE<"clei_u.b", vsetule_v16i8,
+ vsplati8_uimm5, MSA128BOpnd>;
+class CLEI_U_H_DESC : MSA_I5_DESC_BASE<"clei_u.h", vsetule_v8i16,
+ vsplati16_uimm5, MSA128HOpnd>;
+class CLEI_U_W_DESC : MSA_I5_DESC_BASE<"clei_u.w", vsetule_v4i32,
+ vsplati32_uimm5, MSA128WOpnd>;
+class CLEI_U_D_DESC : MSA_I5_DESC_BASE<"clei_u.d", vsetule_v2i64,
+ vsplati64_uimm5, MSA128DOpnd>;
+
+// clt_s.{b,h,w,d}: register compare, selected through the vsetlt_* pattern
+// operators.
+class CLT_S_B_DESC : MSA_3R_DESC_BASE<"clt_s.b", vsetlt_v16i8, MSA128BOpnd>;
+class CLT_S_H_DESC : MSA_3R_DESC_BASE<"clt_s.h", vsetlt_v8i16, MSA128HOpnd>;
+class CLT_S_W_DESC : MSA_3R_DESC_BASE<"clt_s.w", vsetlt_v4i32, MSA128WOpnd>;
+class CLT_S_D_DESC : MSA_3R_DESC_BASE<"clt_s.d", vsetlt_v2i64, MSA128DOpnd>;
+
+// clt_u.{b,h,w,d}: register compare, selected through the vsetult_* pattern
+// operators.
+class CLT_U_B_DESC : MSA_3R_DESC_BASE<"clt_u.b", vsetult_v16i8, MSA128BOpnd>;
+class CLT_U_H_DESC : MSA_3R_DESC_BASE<"clt_u.h", vsetult_v8i16, MSA128HOpnd>;
+class CLT_U_W_DESC : MSA_3R_DESC_BASE<"clt_u.w", vsetult_v4i32, MSA128WOpnd>;
+class CLT_U_D_DESC : MSA_3R_DESC_BASE<"clt_u.d", vsetult_v2i64, MSA128DOpnd>;
+
+// clti_s.{b,h,w,d}: vsetlt_* with a vsplati*_simm5 splat operand.
+class CLTI_S_B_DESC : MSA_I5_DESC_BASE<"clti_s.b", vsetlt_v16i8,
+ vsplati8_simm5, MSA128BOpnd>;
+class CLTI_S_H_DESC : MSA_I5_DESC_BASE<"clti_s.h", vsetlt_v8i16,
+ vsplati16_simm5, MSA128HOpnd>;
+class CLTI_S_W_DESC : MSA_I5_DESC_BASE<"clti_s.w", vsetlt_v4i32,
+ vsplati32_simm5, MSA128WOpnd>;
+class CLTI_S_D_DESC : MSA_I5_DESC_BASE<"clti_s.d", vsetlt_v2i64,
+ vsplati64_simm5, MSA128DOpnd>;
+
+// clti_u.{b,h,w,d}: vsetult_* with a vsplati*_uimm5 splat operand.
+class CLTI_U_B_DESC : MSA_I5_DESC_BASE<"clti_u.b", vsetult_v16i8,
+ vsplati8_uimm5, MSA128BOpnd>;
+class CLTI_U_H_DESC : MSA_I5_DESC_BASE<"clti_u.h", vsetult_v8i16,
+ vsplati16_uimm5, MSA128HOpnd>;
+class CLTI_U_W_DESC : MSA_I5_DESC_BASE<"clti_u.w", vsetult_v4i32,
+ vsplati32_uimm5, MSA128WOpnd>;
+class CLTI_U_D_DESC : MSA_I5_DESC_BASE<"clti_u.d", vsetult_v2i64,
+ vsplati64_uimm5, MSA128DOpnd>;
+
+// copy_s.{b,h,w,d}: selected through the vextract_sext_* operators. The
+// index operand narrows with the element count (uimm4 for 16 x i8 down to
+// uimm1 for 2 x i64). The .d form produces a 64-bit GPR, the others a 32-bit
+// GPR.
+class COPY_S_B_DESC : MSA_COPY_DESC_BASE<"copy_s.b", vextract_sext_i8, v16i8,
+ uimm4_ptr, immZExt4Ptr, GPR32Opnd,
+ MSA128BOpnd>;
+class COPY_S_H_DESC : MSA_COPY_DESC_BASE<"copy_s.h", vextract_sext_i16, v8i16,
+ uimm3_ptr, immZExt3Ptr, GPR32Opnd,
+ MSA128HOpnd>;
+class COPY_S_W_DESC : MSA_COPY_DESC_BASE<"copy_s.w", vextract_sext_i32, v4i32,
+ uimm2_ptr, immZExt2Ptr, GPR32Opnd,
+ MSA128WOpnd>;
+class COPY_S_D_DESC : MSA_COPY_DESC_BASE<"copy_s.d", vextract_sext_i64, v2i64,
+ uimm1_ptr, immZExt1Ptr, GPR64Opnd,
+ MSA128DOpnd>;
+
+// copy_u.{b,h,w}: selected through the vextract_zext_* operators. No .d
+// description is declared in this group.
+class COPY_U_B_DESC : MSA_COPY_DESC_BASE<"copy_u.b", vextract_zext_i8, v16i8,
+ uimm4_ptr, immZExt4Ptr, GPR32Opnd,
+ MSA128BOpnd>;
+class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
+ uimm3_ptr, immZExt3Ptr, GPR32Opnd,
+ MSA128HOpnd>;
+class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
+ uimm2_ptr, immZExt2Ptr, GPR32Opnd,
+ MSA128WOpnd>;
+
+// Pseudos matching vector_extract on the floating-point vector types: v4f32
+// into FGR32 and v2f64 into FGR64.
+class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32,
+ uimm2_ptr, immZExt2Ptr, FGR32,
+ MSA128W>;
+class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64,
+ uimm1_ptr, immZExt1Ptr, FGR64,
+ MSA128D>;
+
+// ctcmsa: takes an MSA control register ($cd) and a 32-bit GPR ($rs) as
+// inputs and has no outputs. There is no selection pattern, and it is flagged
+// as having side effects.
+class CTCMSA_DESC {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins MSA128CROpnd:$cd, GPR32Opnd:$rs);
+ string AsmString = "ctcmsa\t$cd, $rs";
+ InstrItinClass Itinerary = NoItinerary;
+ bit hasSideEffects = 1;
+}
+
+// div_s.{b,h,w,d}: selected directly from the generic `sdiv` node.
+class DIV_S_B_DESC : MSA_3R_DESC_BASE<"div_s.b", sdiv, MSA128BOpnd>;
+class DIV_S_H_DESC : MSA_3R_DESC_BASE<"div_s.h", sdiv, MSA128HOpnd>;
+class DIV_S_W_DESC : MSA_3R_DESC_BASE<"div_s.w", sdiv, MSA128WOpnd>;
+class DIV_S_D_DESC : MSA_3R_DESC_BASE<"div_s.d", sdiv, MSA128DOpnd>;
+
+// div_u.{b,h,w,d}: selected directly from the generic `udiv` node.
+class DIV_U_B_DESC : MSA_3R_DESC_BASE<"div_u.b", udiv, MSA128BOpnd>;
+class DIV_U_H_DESC : MSA_3R_DESC_BASE<"div_u.h", udiv, MSA128HOpnd>;
+class DIV_U_W_DESC : MSA_3R_DESC_BASE<"div_u.w", udiv, MSA128WOpnd>;
+class DIV_U_D_DESC : MSA_3R_DESC_BASE<"div_u.d", udiv, MSA128DOpnd>;
+
+// dotp_s.{h,w,d}: selected from the int_mips_dotp_s_* intrinsics. Both
+// sources use the next narrower operand class than the result (B -> H,
+// H -> W, W -> D). Commutable.
+class DOTP_S_H_DESC : MSA_3R_DESC_BASE<"dotp_s.h", int_mips_dotp_s_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+ IsCommutable;
+class DOTP_S_W_DESC : MSA_3R_DESC_BASE<"dotp_s.w", int_mips_dotp_s_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+ IsCommutable;
+class DOTP_S_D_DESC : MSA_3R_DESC_BASE<"dotp_s.d", int_mips_dotp_s_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+ IsCommutable;
+
+// dotp_u.{h,w,d}: as above, selected from the int_mips_dotp_u_* intrinsics.
+class DOTP_U_H_DESC : MSA_3R_DESC_BASE<"dotp_u.h", int_mips_dotp_u_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+ IsCommutable;
+class DOTP_U_W_DESC : MSA_3R_DESC_BASE<"dotp_u.w", int_mips_dotp_u_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+ IsCommutable;
+class DOTP_U_D_DESC : MSA_3R_DESC_BASE<"dotp_u.d", int_mips_dotp_u_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+ IsCommutable;
+
+// dpadd_s.{h,w,d}: selected from the int_mips_dpadd_s_* intrinsics. Built on
+// MSA_3R_4R_DESC_BASE, so the destination is also an input ($wd_in tied to
+// $wd) and both sources use the next narrower operand class.
+//
+// Deliberately NOT marked IsCommutable: the inputs are ($wd_in, $ws, $wt), and
+// LLVM's default commutation exchanges the first two inputs. That would swap
+// the tied $wd_in with $ws instead of swapping $ws with $wt.
+// NOTE(review): restore IsCommutable only if the target overrides
+// findCommutedOpIndices to pick ($ws, $wt) for these instructions.
+class DPADD_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.h", int_mips_dpadd_s_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPADD_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.w", int_mips_dpadd_s_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPADD_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.d", int_mips_dpadd_s_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+// dpadd_u.{h,w,d}: as above, selected from the int_mips_dpadd_u_* intrinsics.
+class DPADD_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.h", int_mips_dpadd_u_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPADD_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.w", int_mips_dpadd_u_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPADD_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.d", int_mips_dpadd_u_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+// dpsub_s.{h,w,d}: selected from the int_mips_dpsub_s_* intrinsics. Built on
+// MSA_3R_4R_DESC_BASE (destination tied to $wd_in), with both sources using
+// the next narrower operand class. Not marked commutable.
+class DPSUB_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.h", int_mips_dpsub_s_h,
+ MSA128HOpnd, MSA128BOpnd,
+ MSA128BOpnd>;
+class DPSUB_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.w", int_mips_dpsub_s_w,
+ MSA128WOpnd, MSA128HOpnd,
+ MSA128HOpnd>;
+class DPSUB_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.d", int_mips_dpsub_s_d,
+ MSA128DOpnd, MSA128WOpnd,
+ MSA128WOpnd>;
+
+// dpsub_u.{h,w,d}: as above, selected from the int_mips_dpsub_u_* intrinsics.
+class DPSUB_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.h", int_mips_dpsub_u_h,
+ MSA128HOpnd, MSA128BOpnd,
+ MSA128BOpnd>;
+class DPSUB_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.w", int_mips_dpsub_u_w,
+ MSA128WOpnd, MSA128HOpnd,
+ MSA128HOpnd>;
+class DPSUB_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.d", int_mips_dpsub_u_d,
+ MSA128DOpnd, MSA128WOpnd,
+ MSA128WOpnd>;
+
+// fadd.{w,d}: selected directly from the generic `fadd` node; commutable.
+class FADD_W_DESC : MSA_3RF_DESC_BASE<"fadd.w", fadd, MSA128WOpnd>,
+ IsCommutable;
+class FADD_D_DESC : MSA_3RF_DESC_BASE<"fadd.d", fadd, MSA128DOpnd>,
+ IsCommutable;
+
+// fcaf.{w,d}: selected from the int_mips_fcaf_* intrinsics; commutable.
+class FCAF_W_DESC : MSA_3RF_DESC_BASE<"fcaf.w", int_mips_fcaf_w, MSA128WOpnd>,
+ IsCommutable;
+class FCAF_D_DESC : MSA_3RF_DESC_BASE<"fcaf.d", int_mips_fcaf_d, MSA128DOpnd>,
+ IsCommutable;
+
+// fceq.{w,d}: selected through the vfsetoeq_* pattern operators; commutable.
+class FCEQ_W_DESC : MSA_3RF_DESC_BASE<"fceq.w", vfsetoeq_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCEQ_D_DESC : MSA_3RF_DESC_BASE<"fceq.d", vfsetoeq_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+// fclass.{w,d}: two-register form, selected from the int_mips_fclass_*
+// intrinsics.
+class FCLASS_W_DESC : MSA_2RF_DESC_BASE<"fclass.w", int_mips_fclass_w,
+ MSA128WOpnd>;
+class FCLASS_D_DESC : MSA_2RF_DESC_BASE<"fclass.d", int_mips_fclass_d,
+ MSA128DOpnd>;
+
+// fcle.{w,d}: selected through the vfsetole_* pattern operators. Not
+// commutable.
+class FCLE_W_DESC : MSA_3RF_DESC_BASE<"fcle.w", vfsetole_v4f32, MSA128WOpnd>;
+class FCLE_D_DESC : MSA_3RF_DESC_BASE<"fcle.d", vfsetole_v2f64, MSA128DOpnd>;
+
+// fclt.{w,d}: selected through the vfsetolt_* pattern operators. Not
+// commutable.
+class FCLT_W_DESC : MSA_3RF_DESC_BASE<"fclt.w", vfsetolt_v4f32, MSA128WOpnd>;
+class FCLT_D_DESC : MSA_3RF_DESC_BASE<"fclt.d", vfsetolt_v2f64, MSA128DOpnd>;
+
+// fcne.{w,d}: selected through the vfsetone_* pattern operators; commutable.
+class FCNE_W_DESC : MSA_3RF_DESC_BASE<"fcne.w", vfsetone_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCNE_D_DESC : MSA_3RF_DESC_BASE<"fcne.d", vfsetone_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+// fcor.{w,d}: selected through the vfsetord_* pattern operators; commutable.
+class FCOR_W_DESC : MSA_3RF_DESC_BASE<"fcor.w", vfsetord_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCOR_D_DESC : MSA_3RF_DESC_BASE<"fcor.d", vfsetord_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+// fcueq.{w,d}: selected through the vfsetueq_* pattern operators; commutable.
+class FCUEQ_W_DESC : MSA_3RF_DESC_BASE<"fcueq.w", vfsetueq_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCUEQ_D_DESC : MSA_3RF_DESC_BASE<"fcueq.d", vfsetueq_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+// fcule.{w,d}: selected through the vfsetule_* pattern operators.
+//
+// Deliberately NOT marked IsCommutable: an "unordered or less-than-or-equal"
+// compare depends on operand order, so exchanging $ws and $wt would change the
+// result. This matches the ordered fcle/fclt descriptions, which are not
+// commutable either.
+class FCULE_W_DESC : MSA_3RF_DESC_BASE<"fcule.w", vfsetule_v4f32, MSA128WOpnd>;
+class FCULE_D_DESC : MSA_3RF_DESC_BASE<"fcule.d", vfsetule_v2f64, MSA128DOpnd>;
+
+// fcult.{w,d}: selected through the vfsetult_* pattern operators. Not
+// commutable, for the same reason as fcule.
+class FCULT_W_DESC : MSA_3RF_DESC_BASE<"fcult.w", vfsetult_v4f32, MSA128WOpnd>;
+class FCULT_D_DESC : MSA_3RF_DESC_BASE<"fcult.d", vfsetult_v2f64, MSA128DOpnd>;
+
+// fcun.{w,d}: selected through the vfsetun_* pattern operators; commutable.
+class FCUN_W_DESC : MSA_3RF_DESC_BASE<"fcun.w", vfsetun_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCUN_D_DESC : MSA_3RF_DESC_BASE<"fcun.d", vfsetun_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+// fcune.{w,d}: selected through the vfsetune_* pattern operators; commutable.
+class FCUNE_W_DESC : MSA_3RF_DESC_BASE<"fcune.w", vfsetune_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCUNE_D_DESC : MSA_3RF_DESC_BASE<"fcune.d", vfsetune_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+// fdiv.{w,d}: selected directly from the generic `fdiv` node.
+class FDIV_W_DESC : MSA_3RF_DESC_BASE<"fdiv.w", fdiv, MSA128WOpnd>;
+class FDIV_D_DESC : MSA_3RF_DESC_BASE<"fdiv.d", fdiv, MSA128DOpnd>;
+
+// fexdo.{h,w}: selected from the int_mips_fexdo_* intrinsics. Both sources
+// use the next wider operand class than the result (W -> H, D -> W).
+class FEXDO_H_DESC : MSA_3RF_DESC_BASE<"fexdo.h", int_mips_fexdo_h,
+ MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FEXDO_W_DESC : MSA_3RF_DESC_BASE<"fexdo.w", int_mips_fexdo_w,
+ MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+// The fexp2.df instruction multiplies the first operand by 2 to the power of
+// the second operand. We therefore need a pseudo-insn in order to invent the
+// 1.0 when we only need to match ISD::FEXP2.
+//
+// The real instructions below are selected through the two-operand mul_fexp2
+// operator. The *_1_PSEUDO descriptions match a plain single-operand fexp2
+// and are expanded by a custom inserter.
+class FEXP2_W_DESC : MSA_3RF_DESC_BASE<"fexp2.w", mul_fexp2, MSA128WOpnd>;
+class FEXP2_D_DESC : MSA_3RF_DESC_BASE<"fexp2.d", mul_fexp2, MSA128DOpnd>;
+let usesCustomInserter = 1 in {
+ class FEXP2_W_1_PSEUDO_DESC :
+ MSAPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws),
+ [(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>;
+ class FEXP2_D_1_PSEUDO_DESC :
+ MSAPseudo<(outs MSA128D:$wd), (ins MSA128D:$ws),
+ [(set MSA128D:$wd, (fexp2 MSA128D:$ws))]>;
+}
+
+// fexupl.{w,d}: selected from the int_mips_fexupl_* intrinsics. The source
+// uses the next narrower operand class than the result (H -> W, W -> D).
+class FEXUPL_W_DESC : MSA_2RF_DESC_BASE<"fexupl.w", int_mips_fexupl_w,
+ MSA128WOpnd, MSA128HOpnd>;
+class FEXUPL_D_DESC : MSA_2RF_DESC_BASE<"fexupl.d", int_mips_fexupl_d,
+ MSA128DOpnd, MSA128WOpnd>;
+
+// fexupr.{w,d}: as above, selected from the int_mips_fexupr_* intrinsics.
+class FEXUPR_W_DESC : MSA_2RF_DESC_BASE<"fexupr.w", int_mips_fexupr_w,
+ MSA128WOpnd, MSA128HOpnd>;
+class FEXUPR_D_DESC : MSA_2RF_DESC_BASE<"fexupr.d", int_mips_fexupr_d,
+ MSA128DOpnd, MSA128WOpnd>;
+
+// ffint_s.{w,d}: selected directly from the generic `sint_to_fp` node.
+class FFINT_S_W_DESC : MSA_2RF_DESC_BASE<"ffint_s.w", sint_to_fp, MSA128WOpnd>;
+class FFINT_S_D_DESC : MSA_2RF_DESC_BASE<"ffint_s.d", sint_to_fp, MSA128DOpnd>;
+
+// ffint_u.{w,d}: selected directly from the generic `uint_to_fp` node.
+class FFINT_U_W_DESC : MSA_2RF_DESC_BASE<"ffint_u.w", uint_to_fp, MSA128WOpnd>;
+class FFINT_U_D_DESC : MSA_2RF_DESC_BASE<"ffint_u.d", uint_to_fp, MSA128DOpnd>;
+
+// ffql.{w,d}: selected from the int_mips_ffql_* intrinsics. The source uses
+// the next narrower operand class than the result.
+class FFQL_W_DESC : MSA_2RF_DESC_BASE<"ffql.w", int_mips_ffql_w,
+ MSA128WOpnd, MSA128HOpnd>;
+class FFQL_D_DESC : MSA_2RF_DESC_BASE<"ffql.d", int_mips_ffql_d,
+ MSA128DOpnd, MSA128WOpnd>;
+
+// ffqr.{w,d}: as above, selected from the int_mips_ffqr_* intrinsics.
+class FFQR_W_DESC : MSA_2RF_DESC_BASE<"ffqr.w", int_mips_ffqr_w,
+ MSA128WOpnd, MSA128HOpnd>;
+class FFQR_D_DESC : MSA_2RF_DESC_BASE<"ffqr.d", int_mips_ffqr_d,
+ MSA128DOpnd, MSA128WOpnd>;
+
+// fill.{b,h,w,d}: matched through the vsplati* operators. The source is a
+// 32-bit GPR, except for the .d form, which takes a 64-bit GPR.
+class FILL_B_DESC : MSA_2R_FILL_DESC_BASE<"fill.b", v16i8, vsplati8,
+ MSA128BOpnd, GPR32Opnd>;
+class FILL_H_DESC : MSA_2R_FILL_DESC_BASE<"fill.h", v8i16, vsplati16,
+ MSA128HOpnd, GPR32Opnd>;
+class FILL_W_DESC : MSA_2R_FILL_DESC_BASE<"fill.w", v4i32, vsplati32,
+ MSA128WOpnd, GPR32Opnd>;
+class FILL_D_DESC : MSA_2R_FILL_DESC_BASE<"fill.d", v2i64, vsplati64,
+ MSA128DOpnd, GPR64Opnd>;
+
+// Pseudos for the floating-point splats: vsplatf32 from FGR32 into v4f32 and
+// vsplatf64 from FGR64 into v2f64.
+class FILL_FW_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v4f32, vsplatf32, MSA128W,
+ FGR32>;
+class FILL_FD_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v2f64, vsplatf64, MSA128D,
+ FGR64>;
+
+// flog2.{w,d}: selected directly from the generic `flog2` node.
+class FLOG2_W_DESC : MSA_2RF_DESC_BASE<"flog2.w", flog2, MSA128WOpnd>;
+class FLOG2_D_DESC : MSA_2RF_DESC_BASE<"flog2.d", flog2, MSA128DOpnd>;
+
+// fmadd.{w,d}: selected from the `fma` node, using the tied-destination base
+// (MSA_3RF_4RF_DESC_BASE).
+class FMADD_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.w", fma, MSA128WOpnd>;
+class FMADD_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.d", fma, MSA128DOpnd>;
+
+// fmax.{w,d}: selected from the int_mips_fmax_* intrinsics.
+class FMAX_W_DESC : MSA_3RF_DESC_BASE<"fmax.w", int_mips_fmax_w, MSA128WOpnd>;
+class FMAX_D_DESC : MSA_3RF_DESC_BASE<"fmax.d", int_mips_fmax_d, MSA128DOpnd>;
+
+// fmax_a.{w,d}: selected from the int_mips_fmax_a_* intrinsics.
+class FMAX_A_W_DESC : MSA_3RF_DESC_BASE<"fmax_a.w", int_mips_fmax_a_w,
+ MSA128WOpnd>;
+class FMAX_A_D_DESC : MSA_3RF_DESC_BASE<"fmax_a.d", int_mips_fmax_a_d,
+ MSA128DOpnd>;
+
+// fmin.{w,d}: selected from the int_mips_fmin_* intrinsics.
+class FMIN_W_DESC : MSA_3RF_DESC_BASE<"fmin.w", int_mips_fmin_w, MSA128WOpnd>;
+class FMIN_D_DESC : MSA_3RF_DESC_BASE<"fmin.d", int_mips_fmin_d, MSA128DOpnd>;
+
+// fmin_a.{w,d}: selected from the int_mips_fmin_a_* intrinsics.
+class FMIN_A_W_DESC : MSA_3RF_DESC_BASE<"fmin_a.w", int_mips_fmin_a_w,
+ MSA128WOpnd>;
+class FMIN_A_D_DESC : MSA_3RF_DESC_BASE<"fmin_a.d", int_mips_fmin_a_d,
+ MSA128DOpnd>;
+
+// fmsub.{w,d}: selected through the `fms` operator (defined outside this
+// chunk), using the tied-destination base.
+class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", fms, MSA128WOpnd>;
+class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", fms, MSA128DOpnd>;
+
+// fmul.{w,d}: selected directly from the generic `fmul` node. Not marked
+// commutable here.
+class FMUL_W_DESC : MSA_3RF_DESC_BASE<"fmul.w", fmul, MSA128WOpnd>;
+class FMUL_D_DESC : MSA_3RF_DESC_BASE<"fmul.d", fmul, MSA128DOpnd>;
+
+// frint.{w,d}: selected directly from the generic `frint` node.
+class FRINT_W_DESC : MSA_2RF_DESC_BASE<"frint.w", frint, MSA128WOpnd>;
+class FRINT_D_DESC : MSA_2RF_DESC_BASE<"frint.d", frint, MSA128DOpnd>;
+
+// frcp.{w,d}: selected from the int_mips_frcp_* intrinsics.
+class FRCP_W_DESC : MSA_2RF_DESC_BASE<"frcp.w", int_mips_frcp_w, MSA128WOpnd>;
+class FRCP_D_DESC : MSA_2RF_DESC_BASE<"frcp.d", int_mips_frcp_d, MSA128DOpnd>;
+
+// frsqrt.{w,d}: selected from the int_mips_frsqrt_* intrinsics.
+class FRSQRT_W_DESC : MSA_2RF_DESC_BASE<"frsqrt.w", int_mips_frsqrt_w,
+ MSA128WOpnd>;
+class FRSQRT_D_DESC : MSA_2RF_DESC_BASE<"frsqrt.d", int_mips_frsqrt_d,
+ MSA128DOpnd>;
+
+// The fs* compares below are selected only from their int_mips_fs*
+// intrinsics, and none of them is marked commutable.
+class FSAF_W_DESC : MSA_3RF_DESC_BASE<"fsaf.w", int_mips_fsaf_w, MSA128WOpnd>;
+class FSAF_D_DESC : MSA_3RF_DESC_BASE<"fsaf.d", int_mips_fsaf_d, MSA128DOpnd>;
+
+class FSEQ_W_DESC : MSA_3RF_DESC_BASE<"fseq.w", int_mips_fseq_w, MSA128WOpnd>;
+class FSEQ_D_DESC : MSA_3RF_DESC_BASE<"fseq.d", int_mips_fseq_d, MSA128DOpnd>;
+
+class FSLE_W_DESC : MSA_3RF_DESC_BASE<"fsle.w", int_mips_fsle_w, MSA128WOpnd>;
+class FSLE_D_DESC : MSA_3RF_DESC_BASE<"fsle.d", int_mips_fsle_d, MSA128DOpnd>;
+
+class FSLT_W_DESC : MSA_3RF_DESC_BASE<"fslt.w", int_mips_fslt_w, MSA128WOpnd>;
+class FSLT_D_DESC : MSA_3RF_DESC_BASE<"fslt.d", int_mips_fslt_d, MSA128DOpnd>;
+
+class FSNE_W_DESC : MSA_3RF_DESC_BASE<"fsne.w", int_mips_fsne_w, MSA128WOpnd>;
+class FSNE_D_DESC : MSA_3RF_DESC_BASE<"fsne.d", int_mips_fsne_d, MSA128DOpnd>;
+
+class FSOR_W_DESC : MSA_3RF_DESC_BASE<"fsor.w", int_mips_fsor_w, MSA128WOpnd>;
+class FSOR_D_DESC : MSA_3RF_DESC_BASE<"fsor.d", int_mips_fsor_d, MSA128DOpnd>;
+
+// fsqrt.{w,d}: selected directly from the generic `fsqrt` node.
+class FSQRT_W_DESC : MSA_2RF_DESC_BASE<"fsqrt.w", fsqrt, MSA128WOpnd>;
+class FSQRT_D_DESC : MSA_2RF_DESC_BASE<"fsqrt.d", fsqrt, MSA128DOpnd>;
+
+// fsub.{w,d}: selected directly from the generic `fsub` node.
+class FSUB_W_DESC : MSA_3RF_DESC_BASE<"fsub.w", fsub, MSA128WOpnd>;
+class FSUB_D_DESC : MSA_3RF_DESC_BASE<"fsub.d", fsub, MSA128DOpnd>;
+
+// fsueq / fsule / fsult / fsun / fsune (.w and .d): each is selected only
+// from its int_mips_fs* intrinsic, and none is marked commutable.
+class FSUEQ_W_DESC : MSA_3RF_DESC_BASE<"fsueq.w", int_mips_fsueq_w,
+ MSA128WOpnd>;
+class FSUEQ_D_DESC : MSA_3RF_DESC_BASE<"fsueq.d", int_mips_fsueq_d,
+ MSA128DOpnd>;
+
+class FSULE_W_DESC : MSA_3RF_DESC_BASE<"fsule.w", int_mips_fsule_w,
+ MSA128WOpnd>;
+class FSULE_D_DESC : MSA_3RF_DESC_BASE<"fsule.d", int_mips_fsule_d,
+ MSA128DOpnd>;
+
+class FSULT_W_DESC : MSA_3RF_DESC_BASE<"fsult.w", int_mips_fsult_w,
+ MSA128WOpnd>;
+class FSULT_D_DESC : MSA_3RF_DESC_BASE<"fsult.d", int_mips_fsult_d,
+ MSA128DOpnd>;
+
+class FSUN_W_DESC : MSA_3RF_DESC_BASE<"fsun.w", int_mips_fsun_w,
+ MSA128WOpnd>;
+class FSUN_D_DESC : MSA_3RF_DESC_BASE<"fsun.d", int_mips_fsun_d,
+ MSA128DOpnd>;
+
+class FSUNE_W_DESC : MSA_3RF_DESC_BASE<"fsune.w", int_mips_fsune_w,
+ MSA128WOpnd>;
+class FSUNE_D_DESC : MSA_3RF_DESC_BASE<"fsune.d", int_mips_fsune_d,
+ MSA128DOpnd>;
+
+// ftint_s.{w,d}: selected from the int_mips_ftint_s_* intrinsics.
+class FTINT_S_W_DESC : MSA_2RF_DESC_BASE<"ftint_s.w", int_mips_ftint_s_w,
+ MSA128WOpnd>;
+class FTINT_S_D_DESC : MSA_2RF_DESC_BASE<"ftint_s.d", int_mips_ftint_s_d,
+ MSA128DOpnd>;
+
+// ftint_u.{w,d}: selected from the int_mips_ftint_u_* intrinsics.
+class FTINT_U_W_DESC : MSA_2RF_DESC_BASE<"ftint_u.w", int_mips_ftint_u_w,
+ MSA128WOpnd>;
+class FTINT_U_D_DESC : MSA_2RF_DESC_BASE<"ftint_u.d", int_mips_ftint_u_d,
+ MSA128DOpnd>;
+
+// ftq.{h,w}: selected from the int_mips_ftq_* intrinsics. Both sources use
+// the next wider operand class than the result (W -> H, D -> W).
+class FTQ_H_DESC : MSA_3RF_DESC_BASE<"ftq.h", int_mips_ftq_h,
+ MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FTQ_W_DESC : MSA_3RF_DESC_BASE<"ftq.w", int_mips_ftq_w,
+ MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+// ftrunc_s.{w,d}: selected directly from the generic `fp_to_sint` node.
+class FTRUNC_S_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.w", fp_to_sint,
+ MSA128WOpnd>;
+class FTRUNC_S_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.d", fp_to_sint,
+ MSA128DOpnd>;
+
+// ftrunc_u.{w,d}: selected directly from the generic `fp_to_uint` node.
+class FTRUNC_U_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.w", fp_to_uint,
+ MSA128WOpnd>;
+class FTRUNC_U_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.d", fp_to_uint,
+ MSA128DOpnd>;
+
+// hadd_s / hadd_u / hsub_s / hsub_u descriptions (.h, .w, .d forms), selected
+// via int_mips_* intrinsics. Each passes three register operands where the
+// first is one element size up from the other two (H with B/B, W with H/H,
+// D with W/W).
+// NOTE(review): presumably the first operand is the result -- confirm against
+// MSA_3R_DESC_BASE.
+class HADD_S_H_DESC : MSA_3R_DESC_BASE<"hadd_s.h", int_mips_hadd_s_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_S_W_DESC : MSA_3R_DESC_BASE<"hadd_s.w", int_mips_hadd_s_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_S_D_DESC : MSA_3R_DESC_BASE<"hadd_s.d", int_mips_hadd_s_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HADD_U_H_DESC : MSA_3R_DESC_BASE<"hadd_u.h", int_mips_hadd_u_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_U_W_DESC : MSA_3R_DESC_BASE<"hadd_u.w", int_mips_hadd_u_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_U_D_DESC : MSA_3R_DESC_BASE<"hadd_u.d", int_mips_hadd_u_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_S_H_DESC : MSA_3R_DESC_BASE<"hsub_s.h", int_mips_hsub_s_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_S_W_DESC : MSA_3R_DESC_BASE<"hsub_s.w", int_mips_hsub_s_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_S_D_DESC : MSA_3R_DESC_BASE<"hsub_s.d", int_mips_hsub_s_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_U_H_DESC : MSA_3R_DESC_BASE<"hsub_u.h", int_mips_hsub_u_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_U_W_DESC : MSA_3R_DESC_BASE<"hsub_u.w", int_mips_hsub_u_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_U_D_DESC : MSA_3R_DESC_BASE<"hsub_u.d", int_mips_hsub_u_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+// ilvev / ilvl / ilvod / ilvr descriptions for all four element sizes. Each
+// family shares one operator (MipsILVEV, MipsILVL, MipsILVOD, MipsILVR) and
+// varies only the mnemonic suffix and register operand.
+class ILVEV_B_DESC : MSA_3R_DESC_BASE<"ilvev.b", MipsILVEV, MSA128BOpnd>;
+class ILVEV_H_DESC : MSA_3R_DESC_BASE<"ilvev.h", MipsILVEV, MSA128HOpnd>;
+class ILVEV_W_DESC : MSA_3R_DESC_BASE<"ilvev.w", MipsILVEV, MSA128WOpnd>;
+class ILVEV_D_DESC : MSA_3R_DESC_BASE<"ilvev.d", MipsILVEV, MSA128DOpnd>;
+
+class ILVL_B_DESC : MSA_3R_DESC_BASE<"ilvl.b", MipsILVL, MSA128BOpnd>;
+class ILVL_H_DESC : MSA_3R_DESC_BASE<"ilvl.h", MipsILVL, MSA128HOpnd>;
+class ILVL_W_DESC : MSA_3R_DESC_BASE<"ilvl.w", MipsILVL, MSA128WOpnd>;
+class ILVL_D_DESC : MSA_3R_DESC_BASE<"ilvl.d", MipsILVL, MSA128DOpnd>;
+
+class ILVOD_B_DESC : MSA_3R_DESC_BASE<"ilvod.b", MipsILVOD, MSA128BOpnd>;
+class ILVOD_H_DESC : MSA_3R_DESC_BASE<"ilvod.h", MipsILVOD, MSA128HOpnd>;
+class ILVOD_W_DESC : MSA_3R_DESC_BASE<"ilvod.w", MipsILVOD, MSA128WOpnd>;
+class ILVOD_D_DESC : MSA_3R_DESC_BASE<"ilvod.d", MipsILVOD, MSA128DOpnd>;
+
+class ILVR_B_DESC : MSA_3R_DESC_BASE<"ilvr.b", MipsILVR, MSA128BOpnd>;
+class ILVR_H_DESC : MSA_3R_DESC_BASE<"ilvr.h", MipsILVR, MSA128HOpnd>;
+class ILVR_W_DESC : MSA_3R_DESC_BASE<"ilvr.w", MipsILVR, MSA128WOpnd>;
+class ILVR_D_DESC : MSA_3R_DESC_BASE<"ilvr.d", MipsILVR, MSA128DOpnd>;
+
+// insert.{b,h,w,d} descriptions. The immediate operand and its matcher shrink
+// as the element grows (uimm4/immZExt4Ptr for .b down to uimm1/immZExt1Ptr for
+// .d). Only the .d form takes GPR64Opnd; the others take GPR32Opnd.
+class INSERT_B_DESC : MSA_INSERT_DESC_BASE<"insert.b", vinsert_v16i8, uimm4,
+ immZExt4Ptr, MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_DESC : MSA_INSERT_DESC_BASE<"insert.h", vinsert_v8i16, uimm3,
+ immZExt3Ptr, MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32, uimm2,
+ immZExt2Ptr, MSA128WOpnd, GPR32Opnd>;
+class INSERT_D_DESC : MSA_INSERT_DESC_BASE<"insert.d", vinsert_v2i64, uimm1,
+ immZExt1Ptr, MSA128DOpnd, GPR64Opnd>;
+
+// Integer vector_insert pseudos whose last template argument is GPR32Opnd.
+// NOTE(review): the last argument is presumably the register class of the
+// variable element index -- confirm against MSA_INSERT_VIDX_PSEUDO_BASE.
+class INSERT_B_VIDX_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v16i8, MSA128BOpnd, GPR32Opnd, GPR32Opnd>;
+class INSERT_H_VIDX_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v8i16, MSA128HOpnd, GPR32Opnd, GPR32Opnd>;
+class INSERT_W_VIDX_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4i32, MSA128WOpnd, GPR32Opnd, GPR32Opnd>;
+class INSERT_D_VIDX_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2i64, MSA128DOpnd, GPR64Opnd, GPR32Opnd>;
+
+// Floating-point vector_insert pseudos with an immediate operand: v4f32 takes
+// FGR32Opnd, v2f64 takes FGR64Opnd.
+class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
+ uimm2, immZExt2Ptr,
+ MSA128WOpnd, FGR32Opnd>;
+class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
+ uimm1, immZExt1Ptr,
+ MSA128DOpnd, FGR64Opnd>;
+
+// Floating-point counterparts of the VIDX pseudos above (last argument
+// GPR32Opnd).
+class INSERT_FW_VIDX_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4f32, MSA128WOpnd, FGR32Opnd, GPR32Opnd>;
+class INSERT_FD_VIDX_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR32Opnd>;
+
+// VIDX64 variants: identical to the VIDX pseudos except that the last
+// template argument is GPR64Opnd.
+class INSERT_B_VIDX64_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v16i8, MSA128BOpnd, GPR32Opnd, GPR64Opnd>;
+class INSERT_H_VIDX64_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v8i16, MSA128HOpnd, GPR32Opnd, GPR64Opnd>;
+class INSERT_W_VIDX64_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4i32, MSA128WOpnd, GPR32Opnd, GPR64Opnd>;
+class INSERT_D_VIDX64_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2i64, MSA128DOpnd, GPR64Opnd, GPR64Opnd>;
+
+class INSERT_FW_VIDX64_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4f32, MSA128WOpnd, FGR32Opnd, GPR64Opnd>;
+class INSERT_FD_VIDX64_PSEUDO_DESC :
+ MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR64Opnd>;
+
+// insve.{b,h,w,d} descriptions; the immediate operand narrows from uimm4 (.b)
+// to uimm1 (.d), matched with immZExt4..immZExt1.
+class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4,
+ MSA128BOpnd>;
+class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3,
+ MSA128HOpnd>;
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2,
+ MSA128WOpnd>;
+class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1,
+ MSA128DOpnd>;
+
+// Shared description for the MSA vector load instructions.
+//   instr_asm - mnemonic; the assembly string is "<instr_asm>\t$wd, $addr".
+//   OpNode    - operator applied to the address in the selection pattern.
+//   TyNode    - value type the loaded result is cast to in the pattern.
+//   ROWD      - register operand of the destination $wd.
+//   MemOpnd   - operand type of the $addr input.
+//   Addr      - ComplexPattern matching the address (default addrimm10).
+//   itin      - instruction itinerary (default NoItinerary).
+// Decoding is routed to the "DecodeMSA128Mem" decoder method.
+class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ValueType TyNode, RegisterOperand ROWD,
+ Operand MemOpnd, ComplexPattern Addr = addrimm10,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins MemOpnd:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+ list<dag> Pattern = [(set ROWD:$wd, (TyNode (OpNode Addr:$addr)))];
+ InstrItinClass Itinerary = itin;
+ string DecoderMethod = "DecodeMSA128Mem";
+}
+
+// ld.{b,h,w,d} descriptions. ld.b relies on the default address pattern
+// (addrimm10) with mem_simm10; the wider forms pair mem_simm10_lslN with
+// addrimm10lslN for N = 1, 2, 3.
+class LD_B_DESC : LD_DESC_BASE<"ld.b", load, v16i8, MSA128BOpnd, mem_simm10>;
+class LD_H_DESC : LD_DESC_BASE<"ld.h", load, v8i16, MSA128HOpnd,
+ mem_simm10_lsl1, addrimm10lsl1>;
+class LD_W_DESC : LD_DESC_BASE<"ld.w", load, v4i32, MSA128WOpnd,
+ mem_simm10_lsl2, addrimm10lsl2>;
+class LD_D_DESC : LD_DESC_BASE<"ld.d", load, v2i64, MSA128DOpnd,
+ mem_simm10_lsl3, addrimm10lsl3>;
+
+// ldi.{b,h,w,d} descriptions; only the mnemonic and register operand are
+// supplied here, everything else comes from MSA_I10_LDI_DESC_BASE.
+class LDI_B_DESC : MSA_I10_LDI_DESC_BASE<"ldi.b", MSA128BOpnd>;
+class LDI_H_DESC : MSA_I10_LDI_DESC_BASE<"ldi.h", MSA128HOpnd>;
+class LDI_W_DESC : MSA_I10_LDI_DESC_BASE<"ldi.w", MSA128WOpnd>;
+class LDI_D_DESC : MSA_I10_LDI_DESC_BASE<"ldi.d", MSA128DOpnd>;
+
+// Shared description for lsa/dlsa.
+//   instr_asm - mnemonic; assembly operands are printed as $rd, $rs, $rt, $sa.
+//   RORD      - register operand used for $rd, $rs and $rt alike.
+//   itin      - instruction itinerary (default NoItinerary).
+// The selection pattern is rd = rt + (rs << sa). $sa is declared as a
+// uimm2_plus1 operand and matched in the pattern with immZExt2Lsa.
+class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs RORD:$rd);
+ dag InOperandList = (ins RORD:$rs, RORD:$rt, uimm2_plus1:$sa);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $sa");
+ list<dag> Pattern = [(set RORD:$rd, (add RORD:$rt,
+ (shl RORD:$rs,
+ immZExt2Lsa:$sa)))];
+ InstrItinClass Itinerary = itin;
+}
+
+// lsa operates on GPR32Opnd with itinerary II_LSA; dlsa on GPR64Opnd with
+// II_DLSA.
+class LSA_DESC : LSA_DESC_BASE<"lsa", GPR32Opnd, II_LSA>;
+class DLSA_DESC : LSA_DESC_BASE<"dlsa", GPR64Opnd, II_DLSA>;
+
+// madd_q / maddr_q descriptions (.h and .w only), selected via int_mips_*
+// intrinsics on the 3RF_4RF base.
+class MADD_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.h", int_mips_madd_q_h,
+ MSA128HOpnd>;
+class MADD_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.w", int_mips_madd_q_w,
+ MSA128WOpnd>;
+
+class MADDR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.h", int_mips_maddr_q_h,
+ MSA128HOpnd>;
+class MADDR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.w", int_mips_maddr_q_w,
+ MSA128WOpnd>;
+
+// maddv descriptions for all four element sizes, all using the `muladd`
+// operator on the 3R_4R base.
+class MADDV_B_DESC : MSA_3R_4R_DESC_BASE<"maddv.b", muladd, MSA128BOpnd>;
+class MADDV_H_DESC : MSA_3R_4R_DESC_BASE<"maddv.h", muladd, MSA128HOpnd>;
+class MADDV_W_DESC : MSA_3R_4R_DESC_BASE<"maddv.w", muladd, MSA128WOpnd>;
+class MADDV_D_DESC : MSA_3R_4R_DESC_BASE<"maddv.d", muladd, MSA128DOpnd>;
+
+// max_a descriptions are selected via int_mips_max_a_* intrinsics.
+class MAX_A_B_DESC : MSA_3R_DESC_BASE<"max_a.b", int_mips_max_a_b, MSA128BOpnd>;
+class MAX_A_H_DESC : MSA_3R_DESC_BASE<"max_a.h", int_mips_max_a_h, MSA128HOpnd>;
+class MAX_A_W_DESC : MSA_3R_DESC_BASE<"max_a.w", int_mips_max_a_w, MSA128WOpnd>;
+class MAX_A_D_DESC : MSA_3R_DESC_BASE<"max_a.d", int_mips_max_a_d, MSA128DOpnd>;
+
+// max_s / max_u descriptions use the MipsVSMax / MipsVUMax operators.
+class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", MipsVSMax, MSA128BOpnd>;
+class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", MipsVSMax, MSA128HOpnd>;
+class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", MipsVSMax, MSA128WOpnd>;
+class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", MipsVSMax, MSA128DOpnd>;
+
+class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", MipsVUMax, MSA128BOpnd>;
+class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", MipsVUMax, MSA128HOpnd>;
+class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", MipsVUMax, MSA128WOpnd>;
+class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", MipsVUMax, MSA128DOpnd>;
+
+// maxi_s / maxi_u reuse MipsVSMax / MipsVUMax on the I5 base, with a
+// vsplati<N>_simm5 pattern for the _s forms and vsplati<N>_uimm5 for the _u
+// forms.
+class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", MipsVSMax, vsplati8_simm5,
+ MSA128BOpnd>;
+class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", MipsVSMax, vsplati16_simm5,
+ MSA128HOpnd>;
+class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", MipsVSMax, vsplati32_simm5,
+ MSA128WOpnd>;
+class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", MipsVSMax, vsplati64_simm5,
+ MSA128DOpnd>;
+
+class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", MipsVUMax, vsplati8_uimm5,
+ MSA128BOpnd>;
+class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", MipsVUMax, vsplati16_uimm5,
+ MSA128HOpnd>;
+class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", MipsVUMax, vsplati32_uimm5,
+ MSA128WOpnd>;
+class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", MipsVUMax, vsplati64_uimm5,
+ MSA128DOpnd>;
+
+// min_a descriptions are selected via int_mips_min_a_* intrinsics.
+class MIN_A_B_DESC : MSA_3R_DESC_BASE<"min_a.b", int_mips_min_a_b, MSA128BOpnd>;
+class MIN_A_H_DESC : MSA_3R_DESC_BASE<"min_a.h", int_mips_min_a_h, MSA128HOpnd>;
+class MIN_A_W_DESC : MSA_3R_DESC_BASE<"min_a.w", int_mips_min_a_w, MSA128WOpnd>;
+class MIN_A_D_DESC : MSA_3R_DESC_BASE<"min_a.d", int_mips_min_a_d, MSA128DOpnd>;
+
+// min_s / min_u descriptions use the MipsVSMin / MipsVUMin operators.
+class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", MipsVSMin, MSA128BOpnd>;
+class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", MipsVSMin, MSA128HOpnd>;
+class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", MipsVSMin, MSA128WOpnd>;
+class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", MipsVSMin, MSA128DOpnd>;
+
+class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", MipsVUMin, MSA128BOpnd>;
+class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", MipsVUMin, MSA128HOpnd>;
+class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", MipsVUMin, MSA128WOpnd>;
+class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", MipsVUMin, MSA128DOpnd>;
+
+// mini_s / mini_u reuse MipsVSMin / MipsVUMin on the I5 base, with a
+// vsplati<N>_simm5 pattern for the _s forms and vsplati<N>_uimm5 for the _u
+// forms.
+class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", MipsVSMin, vsplati8_simm5,
+ MSA128BOpnd>;
+class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", MipsVSMin, vsplati16_simm5,
+ MSA128HOpnd>;
+class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", MipsVSMin, vsplati32_simm5,
+ MSA128WOpnd>;
+class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", MipsVSMin, vsplati64_simm5,
+ MSA128DOpnd>;
+
+class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", MipsVUMin, vsplati8_uimm5,
+ MSA128BOpnd>;
+class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", MipsVUMin, vsplati16_uimm5,
+ MSA128HOpnd>;
+class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", MipsVUMin, vsplati32_uimm5,
+ MSA128WOpnd>;
+class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", MipsVUMin, vsplati64_uimm5,
+ MSA128DOpnd>;
+
+// mod_s / mod_u descriptions: the _s forms use the `srem` operator and the _u
+// forms use `urem`.
+class MOD_S_B_DESC : MSA_3R_DESC_BASE<"mod_s.b", srem, MSA128BOpnd>;
+class MOD_S_H_DESC : MSA_3R_DESC_BASE<"mod_s.h", srem, MSA128HOpnd>;
+class MOD_S_W_DESC : MSA_3R_DESC_BASE<"mod_s.w", srem, MSA128WOpnd>;
+class MOD_S_D_DESC : MSA_3R_DESC_BASE<"mod_s.d", srem, MSA128DOpnd>;
+
+class MOD_U_B_DESC : MSA_3R_DESC_BASE<"mod_u.b", urem, MSA128BOpnd>;
+class MOD_U_H_DESC : MSA_3R_DESC_BASE<"mod_u.h", urem, MSA128HOpnd>;
+class MOD_U_W_DESC : MSA_3R_DESC_BASE<"mod_u.w", urem, MSA128WOpnd>;
+class MOD_U_D_DESC : MSA_3R_DESC_BASE<"mod_u.d", urem, MSA128DOpnd>;
+
+// Description of move.v: one MSA128BOpnd output ($wd) and one MSA128BOpnd
+// input ($ws). The Pattern list is empty, so this description supplies no
+// selection pattern of its own.
+class MOVE_V_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$ws);
+ string AsmString = "move.v\t$wd, $ws";
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = NoItinerary;
+}
+
+// msub_q / msubr_q descriptions (.h and .w only), selected via int_mips_*
+// intrinsics on the 3RF_4RF base.
+class MSUB_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.h", int_mips_msub_q_h,
+ MSA128HOpnd>;
+class MSUB_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.w", int_mips_msub_q_w,
+ MSA128WOpnd>;
+
+class MSUBR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.h", int_mips_msubr_q_h,
+ MSA128HOpnd>;
+class MSUBR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.w", int_mips_msubr_q_w,
+ MSA128WOpnd>;
+
+// msubv descriptions for all four element sizes, all using the `mulsub`
+// operator on the 3R_4R base.
+class MSUBV_B_DESC : MSA_3R_4R_DESC_BASE<"msubv.b", mulsub, MSA128BOpnd>;
+class MSUBV_H_DESC : MSA_3R_4R_DESC_BASE<"msubv.h", mulsub, MSA128HOpnd>;
+class MSUBV_W_DESC : MSA_3R_4R_DESC_BASE<"msubv.w", mulsub, MSA128WOpnd>;
+class MSUBV_D_DESC : MSA_3R_4R_DESC_BASE<"msubv.d", mulsub, MSA128DOpnd>;
+
+// mul_q / mulr_q descriptions (.h and .w only), selected via int_mips_*
+// intrinsics on the 3RF base.
+class MUL_Q_H_DESC : MSA_3RF_DESC_BASE<"mul_q.h", int_mips_mul_q_h,
+ MSA128HOpnd>;
+class MUL_Q_W_DESC : MSA_3RF_DESC_BASE<"mul_q.w", int_mips_mul_q_w,
+ MSA128WOpnd>;
+
+class MULR_Q_H_DESC : MSA_3RF_DESC_BASE<"mulr_q.h", int_mips_mulr_q_h,
+ MSA128HOpnd>;
+class MULR_Q_W_DESC : MSA_3RF_DESC_BASE<"mulr_q.w", int_mips_mulr_q_w,
+ MSA128WOpnd>;
+
+// mulv descriptions use the plain `mul` operator.
+class MULV_B_DESC : MSA_3R_DESC_BASE<"mulv.b", mul, MSA128BOpnd>;
+class MULV_H_DESC : MSA_3R_DESC_BASE<"mulv.h", mul, MSA128HOpnd>;
+class MULV_W_DESC : MSA_3R_DESC_BASE<"mulv.w", mul, MSA128WOpnd>;
+class MULV_D_DESC : MSA_3R_DESC_BASE<"mulv.d", mul, MSA128DOpnd>;
+
+// nloc descriptions are selected via int_mips_nloc_* intrinsics, whereas nlzc
+// uses the plain `ctlz` operator; both sit on the 2R base.
+class NLOC_B_DESC : MSA_2R_DESC_BASE<"nloc.b", int_mips_nloc_b, MSA128BOpnd>;
+class NLOC_H_DESC : MSA_2R_DESC_BASE<"nloc.h", int_mips_nloc_h, MSA128HOpnd>;
+class NLOC_W_DESC : MSA_2R_DESC_BASE<"nloc.w", int_mips_nloc_w, MSA128WOpnd>;
+class NLOC_D_DESC : MSA_2R_DESC_BASE<"nloc.d", int_mips_nloc_d, MSA128DOpnd>;
+
+class NLZC_B_DESC : MSA_2R_DESC_BASE<"nlzc.b", ctlz, MSA128BOpnd>;
+class NLZC_H_DESC : MSA_2R_DESC_BASE<"nlzc.h", ctlz, MSA128HOpnd>;
+class NLZC_W_DESC : MSA_2R_DESC_BASE<"nlzc.w", ctlz, MSA128WOpnd>;
+class NLZC_D_DESC : MSA_2R_DESC_BASE<"nlzc.d", ctlz, MSA128DOpnd>;
+
+// nor.v is described on MSA128BOpnd only; the H/W/D pseudo descriptions apply
+// the same MipsVNOR operator to the other register operands and carry no
+// mnemonic of their own.
+class NOR_V_DESC : MSA_VEC_DESC_BASE<"nor.v", MipsVNOR, MSA128BOpnd>;
+class NOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128HOpnd>;
+class NOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128WOpnd>;
+class NOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128DOpnd>;
+
+// nori.b: I8 form of MipsVNOR with a vsplati8_uimm8 pattern.
+class NORI_B_DESC : MSA_I8_DESC_BASE<"nori.b", MipsVNOR, vsplati8_uimm8,
+ MSA128BOpnd>;
+
+// or.v and its H/W/D pseudo descriptions follow the same scheme with the
+// plain `or` operator.
+class OR_V_DESC : MSA_VEC_DESC_BASE<"or.v", or, MSA128BOpnd>;
+class OR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128HOpnd>;
+class OR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128WOpnd>;
+class OR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128DOpnd>;
+
+// ori.b: I8 form of `or` with a vsplati8_uimm8 pattern.
+class ORI_B_DESC : MSA_I8_DESC_BASE<"ori.b", or, vsplati8_uimm8, MSA128BOpnd>;
+
+// pckev / pckod descriptions use the MipsPCKEV / MipsPCKOD operators on the
+// 3R base.
+class PCKEV_B_DESC : MSA_3R_DESC_BASE<"pckev.b", MipsPCKEV, MSA128BOpnd>;
+class PCKEV_H_DESC : MSA_3R_DESC_BASE<"pckev.h", MipsPCKEV, MSA128HOpnd>;
+class PCKEV_W_DESC : MSA_3R_DESC_BASE<"pckev.w", MipsPCKEV, MSA128WOpnd>;
+class PCKEV_D_DESC : MSA_3R_DESC_BASE<"pckev.d", MipsPCKEV, MSA128DOpnd>;
+
+class PCKOD_B_DESC : MSA_3R_DESC_BASE<"pckod.b", MipsPCKOD, MSA128BOpnd>;
+class PCKOD_H_DESC : MSA_3R_DESC_BASE<"pckod.h", MipsPCKOD, MSA128HOpnd>;
+class PCKOD_W_DESC : MSA_3R_DESC_BASE<"pckod.w", MipsPCKOD, MSA128WOpnd>;
+class PCKOD_D_DESC : MSA_3R_DESC_BASE<"pckod.d", MipsPCKOD, MSA128DOpnd>;
+
+// pcnt descriptions use the plain `ctpop` operator on the 2R base.
+class PCNT_B_DESC : MSA_2R_DESC_BASE<"pcnt.b", ctpop, MSA128BOpnd>;
+class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>;
+class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>;
+class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>;
+
+// sat_s / sat_u descriptions, selected via int_mips_sat_* intrinsics. The
+// immediate operand widens with the element size: uimm3/immZExt3 for .b up to
+// uimm6/immZExt6 for .d.
+class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3,
+ immZExt3, MSA128BOpnd>;
+class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4,
+ immZExt4, MSA128HOpnd>;
+class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5,
+ immZExt5, MSA128WOpnd>;
+class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6,
+ immZExt6, MSA128DOpnd>;
+
+class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3,
+ immZExt3, MSA128BOpnd>;
+class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4,
+ immZExt4, MSA128HOpnd>;
+class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5,
+ immZExt5, MSA128WOpnd>;
+class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6,
+ immZExt6, MSA128DOpnd>;
+
+// shf descriptions exist for .b, .h and .w only; no .d form is declared in
+// this group.
+class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>;
+class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>;
+class SHF_W_DESC : MSA_I8_SHF_DESC_BASE<"shf.w", MSA128WOpnd>;
+
+// sld descriptions, selected via int_mips_sld_* intrinsics.
+class SLD_B_DESC : MSA_3R_SLD_DESC_BASE<"sld.b", int_mips_sld_b, MSA128BOpnd>;
+class SLD_H_DESC : MSA_3R_SLD_DESC_BASE<"sld.h", int_mips_sld_h, MSA128HOpnd>;
+class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>;
+class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>;
+
+// sldi descriptions pass the same register operand twice, followed by an
+// immediate that narrows from uimm4 (.b) to uimm1 (.d).
+class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b,
+ MSA128BOpnd, MSA128BOpnd, uimm4,
+ immZExt4>;
+class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h,
+ MSA128HOpnd, MSA128HOpnd, uimm3,
+ immZExt3>;
+class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w,
+ MSA128WOpnd, MSA128WOpnd, uimm2,
+ immZExt2>;
+class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d,
+ MSA128DOpnd, MSA128DOpnd, uimm1,
+ immZExt1>;
+
+// sll descriptions use the plain `shl` operator on the 3R base.
+class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>;
+class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>;
+class SLL_W_DESC : MSA_3R_DESC_BASE<"sll.w", shl, MSA128WOpnd>;
+class SLL_D_DESC : MSA_3R_DESC_BASE<"sll.d", shl, MSA128DOpnd>;
+
+// slli descriptions reuse `shl` with a splat-immediate pattern that widens
+// with the element: vsplati8_uimm3 (.b) up to vsplati64_uimm6 (.d).
+class SLLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.b", shl, vsplati8_uimm3,
+ MSA128BOpnd>;
+class SLLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.h", shl, vsplati16_uimm4,
+ MSA128HOpnd>;
+class SLLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.w", shl, vsplati32_uimm5,
+ MSA128WOpnd>;
+class SLLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.d", shl, vsplati64_uimm6,
+ MSA128DOpnd>;
+
+// splat descriptions take a vsplati<N>_elt pattern per element size.
+class SPLAT_B_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.b", vsplati8_elt,
+ MSA128BOpnd>;
+class SPLAT_H_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.h", vsplati16_elt,
+ MSA128HOpnd>;
+class SPLAT_W_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.w", vsplati32_elt,
+ MSA128WOpnd>;
+class SPLAT_D_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.d", vsplati64_elt,
+ MSA128DOpnd>;
+
+// splati descriptions take a splat-immediate pattern that narrows as the
+// element grows: vsplati8_uimm4 (.b) down to vsplati64_uimm1 (.d).
+class SPLATI_B_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.b", vsplati8_uimm4,
+ MSA128BOpnd>;
+class SPLATI_H_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.h", vsplati16_uimm3,
+ MSA128HOpnd>;
+class SPLATI_W_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.w", vsplati32_uimm2,
+ MSA128WOpnd>;
+class SPLATI_D_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.d", vsplati64_uimm1,
+ MSA128DOpnd>;
+
+// sra descriptions use the plain `sra` operator on the 3R base.
+class SRA_B_DESC : MSA_3R_DESC_BASE<"sra.b", sra, MSA128BOpnd>;
+class SRA_H_DESC : MSA_3R_DESC_BASE<"sra.h", sra, MSA128HOpnd>;
+class SRA_W_DESC : MSA_3R_DESC_BASE<"sra.w", sra, MSA128WOpnd>;
+class SRA_D_DESC : MSA_3R_DESC_BASE<"sra.d", sra, MSA128DOpnd>;
+
+// srai descriptions reuse `sra` with a vsplati<N>_uimm<M> splat-immediate
+// pattern (uimm3 for .b up to uimm6 for .d).
+class SRAI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.b", sra, vsplati8_uimm3,
+ MSA128BOpnd>;
+class SRAI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.h", sra, vsplati16_uimm4,
+ MSA128HOpnd>;
+class SRAI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.w", sra, vsplati32_uimm5,
+ MSA128WOpnd>;
+class SRAI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.d", sra, vsplati64_uimm6,
+ MSA128DOpnd>;
+
+// srar / srari descriptions are selected via int_mips_srar* intrinsics
+// instead of a plain operator.
+class SRAR_B_DESC : MSA_3R_DESC_BASE<"srar.b", int_mips_srar_b, MSA128BOpnd>;
+class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>;
+class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>;
+class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>;
+
+class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3,
+ immZExt3, MSA128BOpnd>;
+class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4,
+ immZExt4, MSA128HOpnd>;
+class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5,
+ immZExt5, MSA128WOpnd>;
+class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6,
+ immZExt6, MSA128DOpnd>;
+
+// srl descriptions use the plain `srl` operator on the 3R base.
+class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>;
+class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>;
+class SRL_W_DESC : MSA_3R_DESC_BASE<"srl.w", srl, MSA128WOpnd>;
+class SRL_D_DESC : MSA_3R_DESC_BASE<"srl.d", srl, MSA128DOpnd>;
+
+// srli descriptions reuse `srl` with a vsplati<N>_uimm<M> splat-immediate
+// pattern (uimm3 for .b up to uimm6 for .d).
+class SRLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.b", srl, vsplati8_uimm3,
+ MSA128BOpnd>;
+class SRLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.h", srl, vsplati16_uimm4,
+ MSA128HOpnd>;
+class SRLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.w", srl, vsplati32_uimm5,
+ MSA128WOpnd>;
+class SRLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.d", srl, vsplati64_uimm6,
+ MSA128DOpnd>;
+
+// srlr / srlri descriptions are selected via int_mips_srlr* intrinsics
+// instead of a plain operator.
+class SRLR_B_DESC : MSA_3R_DESC_BASE<"srlr.b", int_mips_srlr_b, MSA128BOpnd>;
+class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>;
+class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>;
+class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>;
+
+class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3,
+ immZExt3, MSA128BOpnd>;
+class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4,
+ immZExt4, MSA128HOpnd>;
+class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5,
+ immZExt5, MSA128WOpnd>;
+class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6,
+ immZExt6, MSA128DOpnd>;
+
+// Shared description for the MSA vector store instructions.
+//   instr_asm - mnemonic; the assembly string is "<instr_asm>\t$wd, $addr".
+//   OpNode    - operator applied to (value, address) in the selection pattern.
+//   TyNode    - value type the stored register is cast to in the pattern.
+//   ROWD      - register operand of the stored value $wd.
+//   MemOpnd   - operand type of the $addr input.
+//   Addr      - ComplexPattern matching the address (default addrimm10).
+//   itin      - instruction itinerary (default NoItinerary).
+// There are no outputs: $wd appears in the input list. Decoding is routed to
+// the "DecodeMSA128Mem" decoder method.
+class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ValueType TyNode, RegisterOperand ROWD,
+ Operand MemOpnd, ComplexPattern Addr = addrimm10,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins ROWD:$wd, MemOpnd:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+ list<dag> Pattern = [(OpNode (TyNode ROWD:$wd), Addr:$addr)];
+ InstrItinClass Itinerary = itin;
+ string DecoderMethod = "DecodeMSA128Mem";
+}
+
+// st.{b,h,w,d} descriptions. st.b relies on the default address pattern
+// (addrimm10) with mem_simm10; the wider forms pair mem_simm10_lslN with
+// addrimm10lslN for N = 1, 2, 3.
+class ST_B_DESC : ST_DESC_BASE<"st.b", store, v16i8, MSA128BOpnd, mem_simm10>;
+class ST_H_DESC : ST_DESC_BASE<"st.h", store, v8i16, MSA128HOpnd,
+ mem_simm10_lsl1, addrimm10lsl1>;
+class ST_W_DESC : ST_DESC_BASE<"st.w", store, v4i32, MSA128WOpnd,
+ mem_simm10_lsl2, addrimm10lsl2>;
+class ST_D_DESC : ST_DESC_BASE<"st.d", store, v2i64, MSA128DOpnd,
+ mem_simm10_lsl3, addrimm10lsl3>;
+
+// subs_s / subs_u / subsus_u / subsuu_s descriptions, all selected via the
+// matching int_mips_* intrinsic on the 3R base.
+class SUBS_S_B_DESC : MSA_3R_DESC_BASE<"subs_s.b", int_mips_subs_s_b,
+ MSA128BOpnd>;
+class SUBS_S_H_DESC : MSA_3R_DESC_BASE<"subs_s.h", int_mips_subs_s_h,
+ MSA128HOpnd>;
+class SUBS_S_W_DESC : MSA_3R_DESC_BASE<"subs_s.w", int_mips_subs_s_w,
+ MSA128WOpnd>;
+class SUBS_S_D_DESC : MSA_3R_DESC_BASE<"subs_s.d", int_mips_subs_s_d,
+ MSA128DOpnd>;
+
+class SUBS_U_B_DESC : MSA_3R_DESC_BASE<"subs_u.b", int_mips_subs_u_b,
+ MSA128BOpnd>;
+class SUBS_U_H_DESC : MSA_3R_DESC_BASE<"subs_u.h", int_mips_subs_u_h,
+ MSA128HOpnd>;
+class SUBS_U_W_DESC : MSA_3R_DESC_BASE<"subs_u.w", int_mips_subs_u_w,
+ MSA128WOpnd>;
+class SUBS_U_D_DESC : MSA_3R_DESC_BASE<"subs_u.d", int_mips_subs_u_d,
+ MSA128DOpnd>;
+
+class SUBSUS_U_B_DESC : MSA_3R_DESC_BASE<"subsus_u.b", int_mips_subsus_u_b,
+ MSA128BOpnd>;
+class SUBSUS_U_H_DESC : MSA_3R_DESC_BASE<"subsus_u.h", int_mips_subsus_u_h,
+ MSA128HOpnd>;
+class SUBSUS_U_W_DESC : MSA_3R_DESC_BASE<"subsus_u.w", int_mips_subsus_u_w,
+ MSA128WOpnd>;
+class SUBSUS_U_D_DESC : MSA_3R_DESC_BASE<"subsus_u.d", int_mips_subsus_u_d,
+ MSA128DOpnd>;
+
+class SUBSUU_S_B_DESC : MSA_3R_DESC_BASE<"subsuu_s.b", int_mips_subsuu_s_b,
+ MSA128BOpnd>;
+class SUBSUU_S_H_DESC : MSA_3R_DESC_BASE<"subsuu_s.h", int_mips_subsuu_s_h,
+ MSA128HOpnd>;
+class SUBSUU_S_W_DESC : MSA_3R_DESC_BASE<"subsuu_s.w", int_mips_subsuu_s_w,
+ MSA128WOpnd>;
+class SUBSUU_S_D_DESC : MSA_3R_DESC_BASE<"subsuu_s.d", int_mips_subsuu_s_d,
+ MSA128DOpnd>;
+
+// subv descriptions use the plain `sub` operator.
+class SUBV_B_DESC : MSA_3R_DESC_BASE<"subv.b", sub, MSA128BOpnd>;
+class SUBV_H_DESC : MSA_3R_DESC_BASE<"subv.h", sub, MSA128HOpnd>;
+class SUBV_W_DESC : MSA_3R_DESC_BASE<"subv.w", sub, MSA128WOpnd>;
+class SUBV_D_DESC : MSA_3R_DESC_BASE<"subv.d", sub, MSA128DOpnd>;
+
+// subvi descriptions reuse `sub` on the I5 base with a vsplati<N>_uimm5
+// pattern.
+class SUBVI_B_DESC : MSA_I5_DESC_BASE<"subvi.b", sub, vsplati8_uimm5,
+ MSA128BOpnd>;
+class SUBVI_H_DESC : MSA_I5_DESC_BASE<"subvi.h", sub, vsplati16_uimm5,
+ MSA128HOpnd>;
+class SUBVI_W_DESC : MSA_I5_DESC_BASE<"subvi.w", sub, vsplati32_uimm5,
+ MSA128WOpnd>;
+class SUBVI_D_DESC : MSA_I5_DESC_BASE<"subvi.d", sub, vsplati64_uimm5,
+ MSA128DOpnd>;
+
+// vshf descriptions supply only the mnemonic and register operand; the
+// pattern comes from MSA_3R_VSHF_DESC_BASE.
+class VSHF_B_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.b", MSA128BOpnd>;
+class VSHF_H_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.h", MSA128HOpnd>;
+class VSHF_W_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.w", MSA128WOpnd>;
+class VSHF_D_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.d", MSA128DOpnd>;
+
+// xor.v is described on MSA128BOpnd only; the H/W/D pseudo descriptions apply
+// the same `xor` operator to the other register operands.
+class XOR_V_DESC : MSA_VEC_DESC_BASE<"xor.v", xor, MSA128BOpnd>;
+class XOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128HOpnd>;
+class XOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128WOpnd>;
+class XOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128DOpnd>;
+
+// xori.b: I8 form of `xor` with a vsplati8_uimm8 pattern.
+class XORI_B_DESC : MSA_I8_DESC_BASE<"xori.b", xor, vsplati8_uimm8,
+ MSA128BOpnd>;
+
+// Instruction defs.
+// Each def below combines an encoding class (<NAME>_ENC) with the description
+// class of the same name (<NAME>_DESC); neither is defined in this part of the
+// file.
+def ADD_A_B : ADD_A_B_ENC, ADD_A_B_DESC;
+def ADD_A_H : ADD_A_H_ENC, ADD_A_H_DESC;
+def ADD_A_W : ADD_A_W_ENC, ADD_A_W_DESC;
+def ADD_A_D : ADD_A_D_ENC, ADD_A_D_DESC;
+
+def ADDS_A_B : ADDS_A_B_ENC, ADDS_A_B_DESC;
+def ADDS_A_H : ADDS_A_H_ENC, ADDS_A_H_DESC;
+def ADDS_A_W : ADDS_A_W_ENC, ADDS_A_W_DESC;
+def ADDS_A_D : ADDS_A_D_ENC, ADDS_A_D_DESC;
+
+def ADDS_S_B : ADDS_S_B_ENC, ADDS_S_B_DESC;
+def ADDS_S_H : ADDS_S_H_ENC, ADDS_S_H_DESC;
+def ADDS_S_W : ADDS_S_W_ENC, ADDS_S_W_DESC;
+def ADDS_S_D : ADDS_S_D_ENC, ADDS_S_D_DESC;
+
+def ADDS_U_B : ADDS_U_B_ENC, ADDS_U_B_DESC;
+def ADDS_U_H : ADDS_U_H_ENC, ADDS_U_H_DESC;
+def ADDS_U_W : ADDS_U_W_ENC, ADDS_U_W_DESC;
+def ADDS_U_D : ADDS_U_D_ENC, ADDS_U_D_DESC;
+
+def ADDV_B : ADDV_B_ENC, ADDV_B_DESC;
+def ADDV_H : ADDV_H_ENC, ADDV_H_DESC;
+def ADDV_W : ADDV_W_ENC, ADDV_W_DESC;
+def ADDV_D : ADDV_D_ENC, ADDV_D_DESC;
+
+def ADDVI_B : ADDVI_B_ENC, ADDVI_B_DESC;
+def ADDVI_H : ADDVI_H_ENC, ADDVI_H_DESC;
+def ADDVI_W : ADDVI_W_ENC, ADDVI_W_DESC;
+def ADDVI_D : ADDVI_D_ENC, ADDVI_D_DESC;
+
+// and.v and its pseudos. The H/W/D pseudos have no encoding class of their
+// own; each expands (PseudoInstExpansion) to AND_V with all three operands
+// given as MSA128BOpnd.
+def AND_V : AND_V_ENC, AND_V_DESC;
+def AND_V_H_PSEUDO : AND_V_H_PSEUDO_DESC,
+ PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def AND_V_W_PSEUDO : AND_V_W_PSEUDO_DESC,
+ PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def AND_V_D_PSEUDO : AND_V_D_PSEUDO_DESC,
+ PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+
+def ANDI_B : ANDI_B_ENC, ANDI_B_DESC;
+
+// asub_s/asub_u, ave_s/ave_u and aver_s/aver_u instruction records, one per
+// element size, each pairing <NAME>_ENC with <NAME>_DESC.
+def ASUB_S_B : ASUB_S_B_ENC, ASUB_S_B_DESC;
+def ASUB_S_H : ASUB_S_H_ENC, ASUB_S_H_DESC;
+def ASUB_S_W : ASUB_S_W_ENC, ASUB_S_W_DESC;
+def ASUB_S_D : ASUB_S_D_ENC, ASUB_S_D_DESC;
+
+def ASUB_U_B : ASUB_U_B_ENC, ASUB_U_B_DESC;
+def ASUB_U_H : ASUB_U_H_ENC, ASUB_U_H_DESC;
+def ASUB_U_W : ASUB_U_W_ENC, ASUB_U_W_DESC;
+def ASUB_U_D : ASUB_U_D_ENC, ASUB_U_D_DESC;
+
+def AVE_S_B : AVE_S_B_ENC, AVE_S_B_DESC;
+def AVE_S_H : AVE_S_H_ENC, AVE_S_H_DESC;
+def AVE_S_W : AVE_S_W_ENC, AVE_S_W_DESC;
+def AVE_S_D : AVE_S_D_ENC, AVE_S_D_DESC;
+
+def AVE_U_B : AVE_U_B_ENC, AVE_U_B_DESC;
+def AVE_U_H : AVE_U_H_ENC, AVE_U_H_DESC;
+def AVE_U_W : AVE_U_W_ENC, AVE_U_W_DESC;
+def AVE_U_D : AVE_U_D_ENC, AVE_U_D_DESC;
+
+def AVER_S_B : AVER_S_B_ENC, AVER_S_B_DESC;
+def AVER_S_H : AVER_S_H_ENC, AVER_S_H_DESC;
+def AVER_S_W : AVER_S_W_ENC, AVER_S_W_DESC;
+def AVER_S_D : AVER_S_D_ENC, AVER_S_D_DESC;
+
+def AVER_U_B : AVER_U_B_ENC, AVER_U_B_DESC;
+def AVER_U_H : AVER_U_H_ENC, AVER_U_H_DESC;
+def AVER_U_W : AVER_U_W_ENC, AVER_U_W_DESC;
+def AVER_U_D : AVER_U_D_ENC, AVER_U_D_DESC;
+
+// bclr/bclri, binsl/binsli, binsr/binsri, bmnz/bmz (.v and immediate .b) and
+// bneg/bnegi instruction records, each pairing <NAME>_ENC with <NAME>_DESC.
+def BCLR_B : BCLR_B_ENC, BCLR_B_DESC;
+def BCLR_H : BCLR_H_ENC, BCLR_H_DESC;
+def BCLR_W : BCLR_W_ENC, BCLR_W_DESC;
+def BCLR_D : BCLR_D_ENC, BCLR_D_DESC;
+
+def BCLRI_B : BCLRI_B_ENC, BCLRI_B_DESC;
+def BCLRI_H : BCLRI_H_ENC, BCLRI_H_DESC;
+def BCLRI_W : BCLRI_W_ENC, BCLRI_W_DESC;
+def BCLRI_D : BCLRI_D_ENC, BCLRI_D_DESC;
+
+def BINSL_B : BINSL_B_ENC, BINSL_B_DESC;
+def BINSL_H : BINSL_H_ENC, BINSL_H_DESC;
+def BINSL_W : BINSL_W_ENC, BINSL_W_DESC;
+def BINSL_D : BINSL_D_ENC, BINSL_D_DESC;
+
+def BINSLI_B : BINSLI_B_ENC, BINSLI_B_DESC;
+def BINSLI_H : BINSLI_H_ENC, BINSLI_H_DESC;
+def BINSLI_W : BINSLI_W_ENC, BINSLI_W_DESC;
+def BINSLI_D : BINSLI_D_ENC, BINSLI_D_DESC;
+
+def BINSR_B : BINSR_B_ENC, BINSR_B_DESC;
+def BINSR_H : BINSR_H_ENC, BINSR_H_DESC;
+def BINSR_W : BINSR_W_ENC, BINSR_W_DESC;
+def BINSR_D : BINSR_D_ENC, BINSR_D_DESC;
+
+def BINSRI_B : BINSRI_B_ENC, BINSRI_B_DESC;
+def BINSRI_H : BINSRI_H_ENC, BINSRI_H_DESC;
+def BINSRI_W : BINSRI_W_ENC, BINSRI_W_DESC;
+def BINSRI_D : BINSRI_D_ENC, BINSRI_D_DESC;
+
+def BMNZ_V : BMNZ_V_ENC, BMNZ_V_DESC;
+
+def BMNZI_B : BMNZI_B_ENC, BMNZI_B_DESC;
+
+def BMZ_V : BMZ_V_ENC, BMZ_V_DESC;
+
+def BMZI_B : BMZI_B_ENC, BMZI_B_DESC;
+
+def BNEG_B : BNEG_B_ENC, BNEG_B_DESC;
+def BNEG_H : BNEG_H_ENC, BNEG_H_DESC;
+def BNEG_W : BNEG_W_ENC, BNEG_W_DESC;
+def BNEG_D : BNEG_D_ENC, BNEG_D_DESC;
+
+def BNEGI_B : BNEGI_B_ENC, BNEGI_B_DESC;
+def BNEGI_H : BNEGI_H_ENC, BNEGI_H_DESC;
+def BNEGI_W : BNEGI_W_ENC, BNEGI_W_DESC;
+def BNEGI_D : BNEGI_D_ENC, BNEGI_D_DESC;
+
+// bnz.{b,h,w,d}, bnz.v and bsel.v instruction records. BSEL_V is the
+// expansion target of the BSEL_*_PSEUDO records defined after it.
+def BNZ_B : BNZ_B_ENC, BNZ_B_DESC;
+def BNZ_H : BNZ_H_ENC, BNZ_H_DESC;
+def BNZ_W : BNZ_W_ENC, BNZ_W_DESC;
+def BNZ_D : BNZ_D_ENC, BNZ_D_DESC;
+
+def BNZ_V : BNZ_V_ENC, BNZ_V_DESC;
+
+def BSEL_V : BSEL_V_ENC, BSEL_V_DESC;
+
+// Pseudo-instruction base for bsel on a register operand RO and value type Ty.
+// It matches (vselect $wd_in, $wt, $ws) producing $wd and expands to BSEL_V
+// with every operand given as MSA128BOpnd. The condition input $wd_in is tied
+// to the output $wd through the Constraints string.
+class MSA_BSEL_PSEUDO_BASE<RegisterOperand RO, ValueType Ty> :
+ MSAPseudo<(outs RO:$wd), (ins RO:$wd_in, RO:$ws, RO:$wt),
+ [(set RO:$wd, (Ty (vselect RO:$wd_in, RO:$wt, RO:$ws)))]>,
+ // Note that vselect and BSEL_V treat the condition operand the opposite way
+ // from each other.
+ // (vselect cond, if_set, if_clear)
+ // (BSEL_V cond, if_clear, if_set)
+ PseudoInstExpansion<(BSEL_V MSA128BOpnd:$wd, MSA128BOpnd:$wd_in,
+ MSA128BOpnd:$ws, MSA128BOpnd:$wt)> {
+ let Constraints = "$wd_in = $wd";
+}
+
+// bsel pseudos for the H/W/D integer vector types and the v4f32/v2f64
+// floating-point types; the FW/FD variants share the W/D register operands
+// with the integer ones.
+def BSEL_H_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128HOpnd, v8i16>;
+def BSEL_W_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4i32>;
+def BSEL_D_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2i64>;
+def BSEL_FW_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4f32>;
+def BSEL_FD_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2f64>;
+
+def BSELI_B : BSELI_B_ENC, BSELI_B_DESC;
+
+// bset/bseti and bz (.b/.h/.w/.d plus .v) instruction records, each pairing
+// <NAME>_ENC with <NAME>_DESC.
+def BSET_B : BSET_B_ENC, BSET_B_DESC;
+def BSET_H : BSET_H_ENC, BSET_H_DESC;
+def BSET_W : BSET_W_ENC, BSET_W_DESC;
+def BSET_D : BSET_D_ENC, BSET_D_DESC;
+
+def BSETI_B : BSETI_B_ENC, BSETI_B_DESC;
+def BSETI_H : BSETI_H_ENC, BSETI_H_DESC;
+def BSETI_W : BSETI_W_ENC, BSETI_W_DESC;
+def BSETI_D : BSETI_D_ENC, BSETI_D_DESC;
+
+def BZ_B : BZ_B_ENC, BZ_B_DESC;
+def BZ_H : BZ_H_ENC, BZ_H_DESC;
+def BZ_W : BZ_W_ENC, BZ_W_DESC;
+def BZ_D : BZ_D_ENC, BZ_D_DESC;
+
+def BZ_V : BZ_V_ENC, BZ_V_DESC;
+
+// ceq/ceqi, cfcmsa, cle_s/cle_u, clei_s/clei_u, clt_s/clt_u and
+// clti_s/clti_u instruction records, each pairing <NAME>_ENC with
+// <NAME>_DESC.
+def CEQ_B : CEQ_B_ENC, CEQ_B_DESC;
+def CEQ_H : CEQ_H_ENC, CEQ_H_DESC;
+def CEQ_W : CEQ_W_ENC, CEQ_W_DESC;
+def CEQ_D : CEQ_D_ENC, CEQ_D_DESC;
+
+def CEQI_B : CEQI_B_ENC, CEQI_B_DESC;
+def CEQI_H : CEQI_H_ENC, CEQI_H_DESC;
+def CEQI_W : CEQI_W_ENC, CEQI_W_DESC;
+def CEQI_D : CEQI_D_ENC, CEQI_D_DESC;
+
+def CFCMSA : CFCMSA_ENC, CFCMSA_DESC;
+
+def CLE_S_B : CLE_S_B_ENC, CLE_S_B_DESC;
+def CLE_S_H : CLE_S_H_ENC, CLE_S_H_DESC;
+def CLE_S_W : CLE_S_W_ENC, CLE_S_W_DESC;
+def CLE_S_D : CLE_S_D_ENC, CLE_S_D_DESC;
+
+def CLE_U_B : CLE_U_B_ENC, CLE_U_B_DESC;
+def CLE_U_H : CLE_U_H_ENC, CLE_U_H_DESC;
+def CLE_U_W : CLE_U_W_ENC, CLE_U_W_DESC;
+def CLE_U_D : CLE_U_D_ENC, CLE_U_D_DESC;
+
+def CLEI_S_B : CLEI_S_B_ENC, CLEI_S_B_DESC;
+def CLEI_S_H : CLEI_S_H_ENC, CLEI_S_H_DESC;
+def CLEI_S_W : CLEI_S_W_ENC, CLEI_S_W_DESC;
+def CLEI_S_D : CLEI_S_D_ENC, CLEI_S_D_DESC;
+
+def CLEI_U_B : CLEI_U_B_ENC, CLEI_U_B_DESC;
+def CLEI_U_H : CLEI_U_H_ENC, CLEI_U_H_DESC;
+def CLEI_U_W : CLEI_U_W_ENC, CLEI_U_W_DESC;
+def CLEI_U_D : CLEI_U_D_ENC, CLEI_U_D_DESC;
+
+def CLT_S_B : CLT_S_B_ENC, CLT_S_B_DESC;
+def CLT_S_H : CLT_S_H_ENC, CLT_S_H_DESC;
+def CLT_S_W : CLT_S_W_ENC, CLT_S_W_DESC;
+def CLT_S_D : CLT_S_D_ENC, CLT_S_D_DESC;
+
+def CLT_U_B : CLT_U_B_ENC, CLT_U_B_DESC;
+def CLT_U_H : CLT_U_H_ENC, CLT_U_H_DESC;
+def CLT_U_W : CLT_U_W_ENC, CLT_U_W_DESC;
+def CLT_U_D : CLT_U_D_ENC, CLT_U_D_DESC;
+
+def CLTI_S_B : CLTI_S_B_ENC, CLTI_S_B_DESC;
+def CLTI_S_H : CLTI_S_H_ENC, CLTI_S_H_DESC;
+def CLTI_S_W : CLTI_S_W_ENC, CLTI_S_W_DESC;
+def CLTI_S_D : CLTI_S_D_ENC, CLTI_S_D_DESC;
+
+def CLTI_U_B : CLTI_U_B_ENC, CLTI_U_B_DESC;
+def CLTI_U_H : CLTI_U_H_ENC, CLTI_U_H_DESC;
+def CLTI_U_W : CLTI_U_W_ENC, CLTI_U_W_DESC;
+def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC;
+
+def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC;
+def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC;
+def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC;
+def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC, ASE_MSA64;
+
+def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
+def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
+def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC, ASE_MSA64;
+
+def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC;
+def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC;
+
+def CTCMSA : CTCMSA_ENC, CTCMSA_DESC;
+
+def DIV_S_B : DIV_S_B_ENC, DIV_S_B_DESC;
+def DIV_S_H : DIV_S_H_ENC, DIV_S_H_DESC;
+def DIV_S_W : DIV_S_W_ENC, DIV_S_W_DESC;
+def DIV_S_D : DIV_S_D_ENC, DIV_S_D_DESC;
+
+def DIV_U_B : DIV_U_B_ENC, DIV_U_B_DESC;
+def DIV_U_H : DIV_U_H_ENC, DIV_U_H_DESC;
+def DIV_U_W : DIV_U_W_ENC, DIV_U_W_DESC;
+def DIV_U_D : DIV_U_D_ENC, DIV_U_D_DESC;
+
+def DOTP_S_H : DOTP_S_H_ENC, DOTP_S_H_DESC;
+def DOTP_S_W : DOTP_S_W_ENC, DOTP_S_W_DESC;
+def DOTP_S_D : DOTP_S_D_ENC, DOTP_S_D_DESC;
+
+def DOTP_U_H : DOTP_U_H_ENC, DOTP_U_H_DESC;
+def DOTP_U_W : DOTP_U_W_ENC, DOTP_U_W_DESC;
+def DOTP_U_D : DOTP_U_D_ENC, DOTP_U_D_DESC;
+
+def DPADD_S_H : DPADD_S_H_ENC, DPADD_S_H_DESC;
+def DPADD_S_W : DPADD_S_W_ENC, DPADD_S_W_DESC;
+def DPADD_S_D : DPADD_S_D_ENC, DPADD_S_D_DESC;
+
+def DPADD_U_H : DPADD_U_H_ENC, DPADD_U_H_DESC;
+def DPADD_U_W : DPADD_U_W_ENC, DPADD_U_W_DESC;
+def DPADD_U_D : DPADD_U_D_ENC, DPADD_U_D_DESC;
+
+def DPSUB_S_H : DPSUB_S_H_ENC, DPSUB_S_H_DESC;
+def DPSUB_S_W : DPSUB_S_W_ENC, DPSUB_S_W_DESC;
+def DPSUB_S_D : DPSUB_S_D_ENC, DPSUB_S_D_DESC;
+
+def DPSUB_U_H : DPSUB_U_H_ENC, DPSUB_U_H_DESC;
+def DPSUB_U_W : DPSUB_U_W_ENC, DPSUB_U_W_DESC;
+def DPSUB_U_D : DPSUB_U_D_ENC, DPSUB_U_D_DESC;
+
+def FADD_W : FADD_W_ENC, FADD_W_DESC;
+def FADD_D : FADD_D_ENC, FADD_D_DESC;
+
+def FCAF_W : FCAF_W_ENC, FCAF_W_DESC;
+def FCAF_D : FCAF_D_ENC, FCAF_D_DESC;
+
+def FCEQ_W : FCEQ_W_ENC, FCEQ_W_DESC;
+def FCEQ_D : FCEQ_D_ENC, FCEQ_D_DESC;
+
+def FCLE_W : FCLE_W_ENC, FCLE_W_DESC;
+def FCLE_D : FCLE_D_ENC, FCLE_D_DESC;
+
+def FCLT_W : FCLT_W_ENC, FCLT_W_DESC;
+def FCLT_D : FCLT_D_ENC, FCLT_D_DESC;
+
+def FCLASS_W : FCLASS_W_ENC, FCLASS_W_DESC;
+def FCLASS_D : FCLASS_D_ENC, FCLASS_D_DESC;
+
+def FCNE_W : FCNE_W_ENC, FCNE_W_DESC;
+def FCNE_D : FCNE_D_ENC, FCNE_D_DESC;
+
+def FCOR_W : FCOR_W_ENC, FCOR_W_DESC;
+def FCOR_D : FCOR_D_ENC, FCOR_D_DESC;
+
+def FCUEQ_W : FCUEQ_W_ENC, FCUEQ_W_DESC;
+def FCUEQ_D : FCUEQ_D_ENC, FCUEQ_D_DESC;
+
+def FCULE_W : FCULE_W_ENC, FCULE_W_DESC;
+def FCULE_D : FCULE_D_ENC, FCULE_D_DESC;
+
+def FCULT_W : FCULT_W_ENC, FCULT_W_DESC;
+def FCULT_D : FCULT_D_ENC, FCULT_D_DESC;
+
+def FCUN_W : FCUN_W_ENC, FCUN_W_DESC;
+def FCUN_D : FCUN_D_ENC, FCUN_D_DESC;
+
+def FCUNE_W : FCUNE_W_ENC, FCUNE_W_DESC;
+def FCUNE_D : FCUNE_D_ENC, FCUNE_D_DESC;
+
+def FDIV_W : FDIV_W_ENC, FDIV_W_DESC;
+def FDIV_D : FDIV_D_ENC, FDIV_D_DESC;
+
+def FEXDO_H : FEXDO_H_ENC, FEXDO_H_DESC;
+def FEXDO_W : FEXDO_W_ENC, FEXDO_W_DESC;
+
+def FEXP2_W : FEXP2_W_ENC, FEXP2_W_DESC;
+def FEXP2_D : FEXP2_D_ENC, FEXP2_D_DESC;
+def FEXP2_W_1_PSEUDO : FEXP2_W_1_PSEUDO_DESC;
+def FEXP2_D_1_PSEUDO : FEXP2_D_1_PSEUDO_DESC;
+
+def FEXUPL_W : FEXUPL_W_ENC, FEXUPL_W_DESC;
+def FEXUPL_D : FEXUPL_D_ENC, FEXUPL_D_DESC;
+
+def FEXUPR_W : FEXUPR_W_ENC, FEXUPR_W_DESC;
+def FEXUPR_D : FEXUPR_D_ENC, FEXUPR_D_DESC;
+
+def FFINT_S_W : FFINT_S_W_ENC, FFINT_S_W_DESC;
+def FFINT_S_D : FFINT_S_D_ENC, FFINT_S_D_DESC;
+
+def FFINT_U_W : FFINT_U_W_ENC, FFINT_U_W_DESC;
+def FFINT_U_D : FFINT_U_D_ENC, FFINT_U_D_DESC;
+
+def FFQL_W : FFQL_W_ENC, FFQL_W_DESC;
+def FFQL_D : FFQL_D_ENC, FFQL_D_DESC;
+
+def FFQR_W : FFQR_W_ENC, FFQR_W_DESC;
+def FFQR_D : FFQR_D_ENC, FFQR_D_DESC;
+
+def FILL_B : FILL_B_ENC, FILL_B_DESC;
+def FILL_H : FILL_H_ENC, FILL_H_DESC;
+def FILL_W : FILL_W_ENC, FILL_W_DESC;
+def FILL_D : FILL_D_ENC, FILL_D_DESC, ASE_MSA64;
+def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC;
+def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC;
+
+def FLOG2_W : FLOG2_W_ENC, FLOG2_W_DESC;
+def FLOG2_D : FLOG2_D_ENC, FLOG2_D_DESC;
+
+def FMADD_W : FMADD_W_ENC, FMADD_W_DESC;
+def FMADD_D : FMADD_D_ENC, FMADD_D_DESC;
+
+def FMAX_W : FMAX_W_ENC, FMAX_W_DESC;
+def FMAX_D : FMAX_D_ENC, FMAX_D_DESC;
+
+def FMAX_A_W : FMAX_A_W_ENC, FMAX_A_W_DESC;
+def FMAX_A_D : FMAX_A_D_ENC, FMAX_A_D_DESC;
+
+def FMIN_W : FMIN_W_ENC, FMIN_W_DESC;
+def FMIN_D : FMIN_D_ENC, FMIN_D_DESC;
+
+def FMIN_A_W : FMIN_A_W_ENC, FMIN_A_W_DESC;
+def FMIN_A_D : FMIN_A_D_ENC, FMIN_A_D_DESC;
+
+def FMSUB_W : FMSUB_W_ENC, FMSUB_W_DESC;
+def FMSUB_D : FMSUB_D_ENC, FMSUB_D_DESC;
+
+def FMUL_W : FMUL_W_ENC, FMUL_W_DESC;
+def FMUL_D : FMUL_D_ENC, FMUL_D_DESC;
+
+def FRINT_W : FRINT_W_ENC, FRINT_W_DESC;
+def FRINT_D : FRINT_D_ENC, FRINT_D_DESC;
+
+def FRCP_W : FRCP_W_ENC, FRCP_W_DESC;
+def FRCP_D : FRCP_D_ENC, FRCP_D_DESC;
+
+def FRSQRT_W : FRSQRT_W_ENC, FRSQRT_W_DESC;
+def FRSQRT_D : FRSQRT_D_ENC, FRSQRT_D_DESC;
+
+def FSAF_W : FSAF_W_ENC, FSAF_W_DESC;
+def FSAF_D : FSAF_D_ENC, FSAF_D_DESC;
+
+def FSEQ_W : FSEQ_W_ENC, FSEQ_W_DESC;
+def FSEQ_D : FSEQ_D_ENC, FSEQ_D_DESC;
+
+def FSLE_W : FSLE_W_ENC, FSLE_W_DESC;
+def FSLE_D : FSLE_D_ENC, FSLE_D_DESC;
+
+def FSLT_W : FSLT_W_ENC, FSLT_W_DESC;
+def FSLT_D : FSLT_D_ENC, FSLT_D_DESC;
+
+def FSNE_W : FSNE_W_ENC, FSNE_W_DESC;
+def FSNE_D : FSNE_D_ENC, FSNE_D_DESC;
+
+def FSOR_W : FSOR_W_ENC, FSOR_W_DESC;
+def FSOR_D : FSOR_D_ENC, FSOR_D_DESC;
+
+def FSQRT_W : FSQRT_W_ENC, FSQRT_W_DESC;
+def FSQRT_D : FSQRT_D_ENC, FSQRT_D_DESC;
+
+def FSUB_W : FSUB_W_ENC, FSUB_W_DESC;
+def FSUB_D : FSUB_D_ENC, FSUB_D_DESC;
+
+def FSUEQ_W : FSUEQ_W_ENC, FSUEQ_W_DESC;
+def FSUEQ_D : FSUEQ_D_ENC, FSUEQ_D_DESC;
+
+def FSULE_W : FSULE_W_ENC, FSULE_W_DESC;
+def FSULE_D : FSULE_D_ENC, FSULE_D_DESC;
+
+def FSULT_W : FSULT_W_ENC, FSULT_W_DESC;
+def FSULT_D : FSULT_D_ENC, FSULT_D_DESC;
+
+def FSUN_W : FSUN_W_ENC, FSUN_W_DESC;
+def FSUN_D : FSUN_D_ENC, FSUN_D_DESC;
+
+def FSUNE_W : FSUNE_W_ENC, FSUNE_W_DESC;
+def FSUNE_D : FSUNE_D_ENC, FSUNE_D_DESC;
+
+def FTINT_S_W : FTINT_S_W_ENC, FTINT_S_W_DESC;
+def FTINT_S_D : FTINT_S_D_ENC, FTINT_S_D_DESC;
+
+def FTINT_U_W : FTINT_U_W_ENC, FTINT_U_W_DESC;
+def FTINT_U_D : FTINT_U_D_ENC, FTINT_U_D_DESC;
+
+def FTQ_H : FTQ_H_ENC, FTQ_H_DESC;
+def FTQ_W : FTQ_W_ENC, FTQ_W_DESC;
+
+def FTRUNC_S_W : FTRUNC_S_W_ENC, FTRUNC_S_W_DESC;
+def FTRUNC_S_D : FTRUNC_S_D_ENC, FTRUNC_S_D_DESC;
+
+def FTRUNC_U_W : FTRUNC_U_W_ENC, FTRUNC_U_W_DESC;
+def FTRUNC_U_D : FTRUNC_U_D_ENC, FTRUNC_U_D_DESC;
+
+def HADD_S_H : HADD_S_H_ENC, HADD_S_H_DESC;
+def HADD_S_W : HADD_S_W_ENC, HADD_S_W_DESC;
+def HADD_S_D : HADD_S_D_ENC, HADD_S_D_DESC;
+
+def HADD_U_H : HADD_U_H_ENC, HADD_U_H_DESC;
+def HADD_U_W : HADD_U_W_ENC, HADD_U_W_DESC;
+def HADD_U_D : HADD_U_D_ENC, HADD_U_D_DESC;
+
+def HSUB_S_H : HSUB_S_H_ENC, HSUB_S_H_DESC;
+def HSUB_S_W : HSUB_S_W_ENC, HSUB_S_W_DESC;
+def HSUB_S_D : HSUB_S_D_ENC, HSUB_S_D_DESC;
+
+def HSUB_U_H : HSUB_U_H_ENC, HSUB_U_H_DESC;
+def HSUB_U_W : HSUB_U_W_ENC, HSUB_U_W_DESC;
+def HSUB_U_D : HSUB_U_D_ENC, HSUB_U_D_DESC;
+
+def ILVEV_B : ILVEV_B_ENC, ILVEV_B_DESC;
+def ILVEV_H : ILVEV_H_ENC, ILVEV_H_DESC;
+def ILVEV_W : ILVEV_W_ENC, ILVEV_W_DESC;
+def ILVEV_D : ILVEV_D_ENC, ILVEV_D_DESC;
+
+def ILVL_B : ILVL_B_ENC, ILVL_B_DESC;
+def ILVL_H : ILVL_H_ENC, ILVL_H_DESC;
+def ILVL_W : ILVL_W_ENC, ILVL_W_DESC;
+def ILVL_D : ILVL_D_ENC, ILVL_D_DESC;
+
+def ILVOD_B : ILVOD_B_ENC, ILVOD_B_DESC;
+def ILVOD_H : ILVOD_H_ENC, ILVOD_H_DESC;
+def ILVOD_W : ILVOD_W_ENC, ILVOD_W_DESC;
+def ILVOD_D : ILVOD_D_ENC, ILVOD_D_DESC;
+
+def ILVR_B : ILVR_B_ENC, ILVR_B_DESC;
+def ILVR_H : ILVR_H_ENC, ILVR_H_DESC;
+def ILVR_W : ILVR_W_ENC, ILVR_W_DESC;
+def ILVR_D : ILVR_D_ENC, ILVR_D_DESC;
+
+def INSERT_B : INSERT_B_ENC, INSERT_B_DESC;
+def INSERT_H : INSERT_H_ENC, INSERT_H_DESC;
+def INSERT_W : INSERT_W_ENC, INSERT_W_DESC;
+def INSERT_D : INSERT_D_ENC, INSERT_D_DESC, ASE_MSA64;
+
+// INSERT_FW_PSEUDO defined after INSVE_W
+// INSERT_FD_PSEUDO defined after INSVE_D
+
+// There is a fourth operand that is not present in the encoding. Use a
+// custom decoder to get a chance to add it.
+let DecoderMethod = "DecodeINSVE_DF" in {
+ def INSVE_B : INSVE_B_ENC, INSVE_B_DESC;
+ def INSVE_H : INSVE_H_ENC, INSVE_H_DESC;
+ def INSVE_W : INSVE_W_ENC, INSVE_W_DESC;
+ def INSVE_D : INSVE_D_ENC, INSVE_D_DESC;
+}
+
+def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC;
+def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC;
+
+def INSERT_B_VIDX_PSEUDO : INSERT_B_VIDX_PSEUDO_DESC;
+def INSERT_H_VIDX_PSEUDO : INSERT_H_VIDX_PSEUDO_DESC;
+def INSERT_W_VIDX_PSEUDO : INSERT_W_VIDX_PSEUDO_DESC;
+def INSERT_D_VIDX_PSEUDO : INSERT_D_VIDX_PSEUDO_DESC;
+def INSERT_FW_VIDX_PSEUDO : INSERT_FW_VIDX_PSEUDO_DESC;
+def INSERT_FD_VIDX_PSEUDO : INSERT_FD_VIDX_PSEUDO_DESC;
+
+def INSERT_B_VIDX64_PSEUDO : INSERT_B_VIDX64_PSEUDO_DESC;
+def INSERT_H_VIDX64_PSEUDO : INSERT_H_VIDX64_PSEUDO_DESC;
+def INSERT_W_VIDX64_PSEUDO : INSERT_W_VIDX64_PSEUDO_DESC;
+def INSERT_D_VIDX64_PSEUDO : INSERT_D_VIDX64_PSEUDO_DESC;
+def INSERT_FW_VIDX64_PSEUDO : INSERT_FW_VIDX64_PSEUDO_DESC;
+def INSERT_FD_VIDX64_PSEUDO : INSERT_FD_VIDX64_PSEUDO_DESC;
+
+def LD_B: LD_B_ENC, LD_B_DESC;
+def LD_H: LD_H_ENC, LD_H_DESC;
+def LD_W: LD_W_ENC, LD_W_DESC;
+def LD_D: LD_D_ENC, LD_D_DESC;
+
+def LDI_B : LDI_B_ENC, LDI_B_DESC;
+def LDI_H : LDI_H_ENC, LDI_H_DESC;
+def LDI_W : LDI_W_ENC, LDI_W_DESC;
+def LDI_D : LDI_D_ENC, LDI_D_DESC;
+
+def LSA : LSA_ENC, LSA_DESC;
+def DLSA : DLSA_ENC, DLSA_DESC, ASE_MSA64;
+
+def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC;
+def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC;
+
+def MADDR_Q_H : MADDR_Q_H_ENC, MADDR_Q_H_DESC;
+def MADDR_Q_W : MADDR_Q_W_ENC, MADDR_Q_W_DESC;
+
+def MADDV_B : MADDV_B_ENC, MADDV_B_DESC;
+def MADDV_H : MADDV_H_ENC, MADDV_H_DESC;
+def MADDV_W : MADDV_W_ENC, MADDV_W_DESC;
+def MADDV_D : MADDV_D_ENC, MADDV_D_DESC;
+
+def MAX_A_B : MAX_A_B_ENC, MAX_A_B_DESC;
+def MAX_A_H : MAX_A_H_ENC, MAX_A_H_DESC;
+def MAX_A_W : MAX_A_W_ENC, MAX_A_W_DESC;
+def MAX_A_D : MAX_A_D_ENC, MAX_A_D_DESC;
+
+def MAX_S_B : MAX_S_B_ENC, MAX_S_B_DESC;
+def MAX_S_H : MAX_S_H_ENC, MAX_S_H_DESC;
+def MAX_S_W : MAX_S_W_ENC, MAX_S_W_DESC;
+def MAX_S_D : MAX_S_D_ENC, MAX_S_D_DESC;
+
+def MAX_U_B : MAX_U_B_ENC, MAX_U_B_DESC;
+def MAX_U_H : MAX_U_H_ENC, MAX_U_H_DESC;
+def MAX_U_W : MAX_U_W_ENC, MAX_U_W_DESC;
+def MAX_U_D : MAX_U_D_ENC, MAX_U_D_DESC;
+
+def MAXI_S_B : MAXI_S_B_ENC, MAXI_S_B_DESC;
+def MAXI_S_H : MAXI_S_H_ENC, MAXI_S_H_DESC;
+def MAXI_S_W : MAXI_S_W_ENC, MAXI_S_W_DESC;
+def MAXI_S_D : MAXI_S_D_ENC, MAXI_S_D_DESC;
+
+def MAXI_U_B : MAXI_U_B_ENC, MAXI_U_B_DESC;
+def MAXI_U_H : MAXI_U_H_ENC, MAXI_U_H_DESC;
+def MAXI_U_W : MAXI_U_W_ENC, MAXI_U_W_DESC;
+def MAXI_U_D : MAXI_U_D_ENC, MAXI_U_D_DESC;
+
+def MIN_A_B : MIN_A_B_ENC, MIN_A_B_DESC;
+def MIN_A_H : MIN_A_H_ENC, MIN_A_H_DESC;
+def MIN_A_W : MIN_A_W_ENC, MIN_A_W_DESC;
+def MIN_A_D : MIN_A_D_ENC, MIN_A_D_DESC;
+
+def MIN_S_B : MIN_S_B_ENC, MIN_S_B_DESC;
+def MIN_S_H : MIN_S_H_ENC, MIN_S_H_DESC;
+def MIN_S_W : MIN_S_W_ENC, MIN_S_W_DESC;
+def MIN_S_D : MIN_S_D_ENC, MIN_S_D_DESC;
+
+def MIN_U_B : MIN_U_B_ENC, MIN_U_B_DESC;
+def MIN_U_H : MIN_U_H_ENC, MIN_U_H_DESC;
+def MIN_U_W : MIN_U_W_ENC, MIN_U_W_DESC;
+def MIN_U_D : MIN_U_D_ENC, MIN_U_D_DESC;
+
+def MINI_S_B : MINI_S_B_ENC, MINI_S_B_DESC;
+def MINI_S_H : MINI_S_H_ENC, MINI_S_H_DESC;
+def MINI_S_W : MINI_S_W_ENC, MINI_S_W_DESC;
+def MINI_S_D : MINI_S_D_ENC, MINI_S_D_DESC;
+
+def MINI_U_B : MINI_U_B_ENC, MINI_U_B_DESC;
+def MINI_U_H : MINI_U_H_ENC, MINI_U_H_DESC;
+def MINI_U_W : MINI_U_W_ENC, MINI_U_W_DESC;
+def MINI_U_D : MINI_U_D_ENC, MINI_U_D_DESC;
+
+def MOD_S_B : MOD_S_B_ENC, MOD_S_B_DESC;
+def MOD_S_H : MOD_S_H_ENC, MOD_S_H_DESC;
+def MOD_S_W : MOD_S_W_ENC, MOD_S_W_DESC;
+def MOD_S_D : MOD_S_D_ENC, MOD_S_D_DESC;
+
+def MOD_U_B : MOD_U_B_ENC, MOD_U_B_DESC;
+def MOD_U_H : MOD_U_H_ENC, MOD_U_H_DESC;
+def MOD_U_W : MOD_U_W_ENC, MOD_U_W_DESC;
+def MOD_U_D : MOD_U_D_ENC, MOD_U_D_DESC;
+
+def MOVE_V : MOVE_V_ENC, MOVE_V_DESC;
+
+def MSUB_Q_H : MSUB_Q_H_ENC, MSUB_Q_H_DESC;
+def MSUB_Q_W : MSUB_Q_W_ENC, MSUB_Q_W_DESC;
+
+def MSUBR_Q_H : MSUBR_Q_H_ENC, MSUBR_Q_H_DESC;
+def MSUBR_Q_W : MSUBR_Q_W_ENC, MSUBR_Q_W_DESC;
+
+def MSUBV_B : MSUBV_B_ENC, MSUBV_B_DESC;
+def MSUBV_H : MSUBV_H_ENC, MSUBV_H_DESC;
+def MSUBV_W : MSUBV_W_ENC, MSUBV_W_DESC;
+def MSUBV_D : MSUBV_D_ENC, MSUBV_D_DESC;
+
+def MUL_Q_H : MUL_Q_H_ENC, MUL_Q_H_DESC;
+def MUL_Q_W : MUL_Q_W_ENC, MUL_Q_W_DESC;
+
+def MULR_Q_H : MULR_Q_H_ENC, MULR_Q_H_DESC;
+def MULR_Q_W : MULR_Q_W_ENC, MULR_Q_W_DESC;
+
+def MULV_B : MULV_B_ENC, MULV_B_DESC;
+def MULV_H : MULV_H_ENC, MULV_H_DESC;
+def MULV_W : MULV_W_ENC, MULV_W_DESC;
+def MULV_D : MULV_D_ENC, MULV_D_DESC;
+
+def NLOC_B : NLOC_B_ENC, NLOC_B_DESC;
+def NLOC_H : NLOC_H_ENC, NLOC_H_DESC;
+def NLOC_W : NLOC_W_ENC, NLOC_W_DESC;
+def NLOC_D : NLOC_D_ENC, NLOC_D_DESC;
+
+def NLZC_B : NLZC_B_ENC, NLZC_B_DESC;
+def NLZC_H : NLZC_H_ENC, NLZC_H_DESC;
+def NLZC_W : NLZC_W_ENC, NLZC_W_DESC;
+def NLZC_D : NLZC_D_ENC, NLZC_D_DESC;
+
+// NOR_V is the only record in this group with an encoding class. The
+// _H/_W/_D pseudos carry a description only and each expands to that same
+// NOR_V, with all three operands typed as MSA128BOpnd.
+def NOR_V : NOR_V_ENC, NOR_V_DESC;
+def NOR_V_H_PSEUDO : NOR_V_H_PSEUDO_DESC,
+ PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def NOR_V_W_PSEUDO : NOR_V_W_PSEUDO_DESC,
+ PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def NOR_V_D_PSEUDO : NOR_V_D_PSEUDO_DESC,
+ PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+
+def NORI_B : NORI_B_ENC, NORI_B_DESC;
+
+// OR_V is the only record in this group with an encoding class. The
+// _H/_W/_D pseudos carry a description only and each expands to that same
+// OR_V, with all three operands typed as MSA128BOpnd.
+def OR_V : OR_V_ENC, OR_V_DESC;
+def OR_V_H_PSEUDO : OR_V_H_PSEUDO_DESC,
+ PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def OR_V_W_PSEUDO : OR_V_W_PSEUDO_DESC,
+ PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def OR_V_D_PSEUDO : OR_V_D_PSEUDO_DESC,
+ PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+
+def ORI_B : ORI_B_ENC, ORI_B_DESC;
+
+def PCKEV_B : PCKEV_B_ENC, PCKEV_B_DESC;
+def PCKEV_H : PCKEV_H_ENC, PCKEV_H_DESC;
+def PCKEV_W : PCKEV_W_ENC, PCKEV_W_DESC;
+def PCKEV_D : PCKEV_D_ENC, PCKEV_D_DESC;
+
+def PCKOD_B : PCKOD_B_ENC, PCKOD_B_DESC;
+def PCKOD_H : PCKOD_H_ENC, PCKOD_H_DESC;
+def PCKOD_W : PCKOD_W_ENC, PCKOD_W_DESC;
+def PCKOD_D : PCKOD_D_ENC, PCKOD_D_DESC;
+
+def PCNT_B : PCNT_B_ENC, PCNT_B_DESC;
+def PCNT_H : PCNT_H_ENC, PCNT_H_DESC;
+def PCNT_W : PCNT_W_ENC, PCNT_W_DESC;
+def PCNT_D : PCNT_D_ENC, PCNT_D_DESC;
+
+def SAT_S_B : SAT_S_B_ENC, SAT_S_B_DESC;
+def SAT_S_H : SAT_S_H_ENC, SAT_S_H_DESC;
+def SAT_S_W : SAT_S_W_ENC, SAT_S_W_DESC;
+def SAT_S_D : SAT_S_D_ENC, SAT_S_D_DESC;
+
+def SAT_U_B : SAT_U_B_ENC, SAT_U_B_DESC;
+def SAT_U_H : SAT_U_H_ENC, SAT_U_H_DESC;
+def SAT_U_W : SAT_U_W_ENC, SAT_U_W_DESC;
+def SAT_U_D : SAT_U_D_ENC, SAT_U_D_DESC;
+
+def SHF_B : SHF_B_ENC, SHF_B_DESC;
+def SHF_H : SHF_H_ENC, SHF_H_DESC;
+def SHF_W : SHF_W_ENC, SHF_W_DESC;
+
+def SLD_B : SLD_B_ENC, SLD_B_DESC;
+def SLD_H : SLD_H_ENC, SLD_H_DESC;
+def SLD_W : SLD_W_ENC, SLD_W_DESC;
+def SLD_D : SLD_D_ENC, SLD_D_DESC;
+
+def SLDI_B : SLDI_B_ENC, SLDI_B_DESC;
+def SLDI_H : SLDI_H_ENC, SLDI_H_DESC;
+def SLDI_W : SLDI_W_ENC, SLDI_W_DESC;
+def SLDI_D : SLDI_D_ENC, SLDI_D_DESC;
+
+def SLL_B : SLL_B_ENC, SLL_B_DESC;
+def SLL_H : SLL_H_ENC, SLL_H_DESC;
+def SLL_W : SLL_W_ENC, SLL_W_DESC;
+def SLL_D : SLL_D_ENC, SLL_D_DESC;
+
+def SLLI_B : SLLI_B_ENC, SLLI_B_DESC;
+def SLLI_H : SLLI_H_ENC, SLLI_H_DESC;
+def SLLI_W : SLLI_W_ENC, SLLI_W_DESC;
+def SLLI_D : SLLI_D_ENC, SLLI_D_DESC;
+
+def SPLAT_B : SPLAT_B_ENC, SPLAT_B_DESC;
+def SPLAT_H : SPLAT_H_ENC, SPLAT_H_DESC;
+def SPLAT_W : SPLAT_W_ENC, SPLAT_W_DESC;
+def SPLAT_D : SPLAT_D_ENC, SPLAT_D_DESC;
+
+def SPLATI_B : SPLATI_B_ENC, SPLATI_B_DESC;
+def SPLATI_H : SPLATI_H_ENC, SPLATI_H_DESC;
+def SPLATI_W : SPLATI_W_ENC, SPLATI_W_DESC;
+def SPLATI_D : SPLATI_D_ENC, SPLATI_D_DESC;
+
+def SRA_B : SRA_B_ENC, SRA_B_DESC;
+def SRA_H : SRA_H_ENC, SRA_H_DESC;
+def SRA_W : SRA_W_ENC, SRA_W_DESC;
+def SRA_D : SRA_D_ENC, SRA_D_DESC;
+
+def SRAI_B : SRAI_B_ENC, SRAI_B_DESC;
+def SRAI_H : SRAI_H_ENC, SRAI_H_DESC;
+def SRAI_W : SRAI_W_ENC, SRAI_W_DESC;
+def SRAI_D : SRAI_D_ENC, SRAI_D_DESC;
+
+def SRAR_B : SRAR_B_ENC, SRAR_B_DESC;
+def SRAR_H : SRAR_H_ENC, SRAR_H_DESC;
+def SRAR_W : SRAR_W_ENC, SRAR_W_DESC;
+def SRAR_D : SRAR_D_ENC, SRAR_D_DESC;
+
+def SRARI_B : SRARI_B_ENC, SRARI_B_DESC;
+def SRARI_H : SRARI_H_ENC, SRARI_H_DESC;
+def SRARI_W : SRARI_W_ENC, SRARI_W_DESC;
+def SRARI_D : SRARI_D_ENC, SRARI_D_DESC;
+
+def SRL_B : SRL_B_ENC, SRL_B_DESC;
+def SRL_H : SRL_H_ENC, SRL_H_DESC;
+def SRL_W : SRL_W_ENC, SRL_W_DESC;
+def SRL_D : SRL_D_ENC, SRL_D_DESC;
+
+def SRLI_B : SRLI_B_ENC, SRLI_B_DESC;
+def SRLI_H : SRLI_H_ENC, SRLI_H_DESC;
+def SRLI_W : SRLI_W_ENC, SRLI_W_DESC;
+def SRLI_D : SRLI_D_ENC, SRLI_D_DESC;
+
+def SRLR_B : SRLR_B_ENC, SRLR_B_DESC;
+def SRLR_H : SRLR_H_ENC, SRLR_H_DESC;
+def SRLR_W : SRLR_W_ENC, SRLR_W_DESC;
+def SRLR_D : SRLR_D_ENC, SRLR_D_DESC;
+
+def SRLRI_B : SRLRI_B_ENC, SRLRI_B_DESC;
+def SRLRI_H : SRLRI_H_ENC, SRLRI_H_DESC;
+def SRLRI_W : SRLRI_W_ENC, SRLRI_W_DESC;
+def SRLRI_D : SRLRI_D_ENC, SRLRI_D_DESC;
+
+def ST_B: ST_B_ENC, ST_B_DESC;
+def ST_H: ST_H_ENC, ST_H_DESC;
+def ST_W: ST_W_ENC, ST_W_DESC;
+def ST_D: ST_D_ENC, ST_D_DESC;
+
+def SUBS_S_B : SUBS_S_B_ENC, SUBS_S_B_DESC;
+def SUBS_S_H : SUBS_S_H_ENC, SUBS_S_H_DESC;
+def SUBS_S_W : SUBS_S_W_ENC, SUBS_S_W_DESC;
+def SUBS_S_D : SUBS_S_D_ENC, SUBS_S_D_DESC;
+
+def SUBS_U_B : SUBS_U_B_ENC, SUBS_U_B_DESC;
+def SUBS_U_H : SUBS_U_H_ENC, SUBS_U_H_DESC;
+def SUBS_U_W : SUBS_U_W_ENC, SUBS_U_W_DESC;
+def SUBS_U_D : SUBS_U_D_ENC, SUBS_U_D_DESC;
+
+def SUBSUS_U_B : SUBSUS_U_B_ENC, SUBSUS_U_B_DESC;
+def SUBSUS_U_H : SUBSUS_U_H_ENC, SUBSUS_U_H_DESC;
+def SUBSUS_U_W : SUBSUS_U_W_ENC, SUBSUS_U_W_DESC;
+def SUBSUS_U_D : SUBSUS_U_D_ENC, SUBSUS_U_D_DESC;
+
+def SUBSUU_S_B : SUBSUU_S_B_ENC, SUBSUU_S_B_DESC;
+def SUBSUU_S_H : SUBSUU_S_H_ENC, SUBSUU_S_H_DESC;
+def SUBSUU_S_W : SUBSUU_S_W_ENC, SUBSUU_S_W_DESC;
+def SUBSUU_S_D : SUBSUU_S_D_ENC, SUBSUU_S_D_DESC;
+
+def SUBV_B : SUBV_B_ENC, SUBV_B_DESC;
+def SUBV_H : SUBV_H_ENC, SUBV_H_DESC;
+def SUBV_W : SUBV_W_ENC, SUBV_W_DESC;
+def SUBV_D : SUBV_D_ENC, SUBV_D_DESC;
+
+def SUBVI_B : SUBVI_B_ENC, SUBVI_B_DESC;
+def SUBVI_H : SUBVI_H_ENC, SUBVI_H_DESC;
+def SUBVI_W : SUBVI_W_ENC, SUBVI_W_DESC;
+def SUBVI_D : SUBVI_D_ENC, SUBVI_D_DESC;
+
+def VSHF_B : VSHF_B_ENC, VSHF_B_DESC;
+def VSHF_H : VSHF_H_ENC, VSHF_H_DESC;
+def VSHF_W : VSHF_W_ENC, VSHF_W_DESC;
+def VSHF_D : VSHF_D_ENC, VSHF_D_DESC;
+
+// XOR_V is the only record in this group with an encoding class. The
+// _H/_W/_D pseudos carry a description only and each expands to that same
+// XOR_V, with all three operands typed as MSA128BOpnd.
+def XOR_V : XOR_V_ENC, XOR_V_DESC;
+def XOR_V_H_PSEUDO : XOR_V_H_PSEUDO_DESC,
+ PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def XOR_V_W_PSEUDO : XOR_V_W_PSEUDO_DESC,
+ PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def XOR_V_D_PSEUDO : XOR_V_D_PSEUDO_DESC,
+ PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+
+def XORI_B : XORI_B_ENC, XORI_B_DESC;
+
+// Patterns.
+// Selection-pattern wrapper: a Pat<pattern, result> that also carries
+// Requires<pred>. `pred` defaults to [HasMSA], so a use only needs to pass it
+// when it wants a different predicate list.
+class MSAPat<dag pattern, dag result, list<Predicate> pred = [HasMSA]> :
+ Pat<pattern, result>, Requires<pred>;
+
+def : MSAPat<(extractelt (v4i32 MSA128W:$ws), immZExt4:$idx),
+ (COPY_S_W MSA128W:$ws, immZExt4:$idx)>;
+
+// Loads of the floating-point vector types are selected to the LD_H/LD_W/LD_D
+// records; v8f16/v4f32/v2f64 are paired with addrimm10lsl1/2/3 respectively.
+def : MSAPat<(v8f16 (load addrimm10lsl1:$addr)), (LD_H addrimm10lsl1:$addr)>;
+def : MSAPat<(v4f32 (load addrimm10lsl2:$addr)), (LD_W addrimm10lsl2:$addr)>;
+def : MSAPat<(v2f64 (load addrimm10lsl3:$addr)), (LD_D addrimm10lsl3:$addr)>;
+
+// The matching stores select ST_H/ST_W/ST_D with the same type/address
+// pairing. Unlike the load patterns, these are named records.
+def ST_FH : MSAPat<(store (v8f16 MSA128H:$ws), addrimm10lsl1:$addr),
+ (ST_H MSA128H:$ws, addrimm10lsl1:$addr)>;
+def ST_FW : MSAPat<(store (v4f32 MSA128W:$ws), addrimm10lsl2:$addr),
+ (ST_W MSA128W:$ws, addrimm10lsl2:$addr)>;
+def ST_FD : MSAPat<(store (v2f64 MSA128D:$ws), addrimm10lsl3:$addr),
+ (ST_D MSA128D:$ws, addrimm10lsl3:$addr)>;
+
+// Pseudo whose selection pattern is `fabs` on a register operand. It has one
+// output ($wd, class ROWD) and one input ($ws, class ROWS, which defaults to
+// ROWD), and records `itin` in its Itinerary field.
+class MSA_FABS_PSEUDO_DESC_BASE<RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MSAPseudo<(outs ROWD:$wd),
+ (ins ROWS:$ws),
+ [(set ROWD:$wd, (fabs ROWS:$ws))]> {
+ InstrItinClass Itinerary = itin;
+}
+// Both FABS pseudos expand to the FMAX_A instruction of the same width, with
+// $ws supplied as both source operands.
+def FABS_W : MSA_FABS_PSEUDO_DESC_BASE<MSA128WOpnd>,
+ PseudoInstExpansion<(FMAX_A_W MSA128WOpnd:$wd, MSA128WOpnd:$ws,
+ MSA128WOpnd:$ws)>;
+def FABS_D : MSA_FABS_PSEUDO_DESC_BASE<MSA128DOpnd>,
+ PseudoInstExpansion<(FMAX_A_D MSA128DOpnd:$wd, MSA128DOpnd:$ws,
+ MSA128DOpnd:$ws)>;
+
+// Selects a bitconvert from SrcVT to DstVT as a bare COPY_TO_REGCLASS of the
+// source into DstRC; no data-modifying instruction is emitted. `preds`
+// defaults to [HasMSA].
+class MSABitconvertPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC, list<Predicate> preds = [HasMSA]> :
+ MSAPat<(DstVT (bitconvert SrcVT:$src)),
+ (COPY_TO_REGCLASS SrcVT:$src, DstRC), preds>;
+
+// These are endian-independent because the element size doesn't change.
+def : MSABitconvertPat<v8i16, v8f16, MSA128H>;
+def : MSABitconvertPat<v4i32, v4f32, MSA128W>;
+def : MSABitconvertPat<v2i64, v2f64, MSA128D>;
+def : MSABitconvertPat<v8f16, v8i16, MSA128H>;
+def : MSABitconvertPat<v4f32, v4i32, MSA128W>;
+def : MSABitconvertPat<v2f64, v2i64, MSA128D>;
+
+// Little endian bitcasts are always no-ops
+def : MSABitconvertPat<v16i8, v8i16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4i32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2i64, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v8f16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4f32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2f64, MSA128B, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v8i16, v16i8, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4i32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2i64, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4f32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2f64, MSA128H, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4i32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2i64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4f32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2f64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+// Big endian bitcasts expand to shuffle instructions.
+// This is because bitcast is defined to be a store/load sequence and the
+// vector store/load instructions are mixed-endian with respect to the vector
+// as a whole (little endian with respect to element order, but big endian
+// elements).
+
+// Big-endian-only bitconvert (predicates [HasMSA, IsBE]): copy the source into
+// ViaRC, apply Insn with immediate 27, then copy the result into DstRC.
+// 27 is 0b00011011, i.e. the 2-bit fields 0,1,2,3 from the top down.
+// NOTE(review): presumably this is the shuffle control that reverses each group
+// of four elements -- confirm against the SHF.df definition.
+class MSABitconvertReverseQuartersPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC, MSAInst Insn,
+ RegisterClass ViaRC> :
+ MSAPat<(DstVT (bitconvert SrcVT:$src)),
+ (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 27),
+ DstRC),
+ [HasMSA, IsBE]>;
+
+// Same shape as the quarters form but with immediate 177 (0b10110001, i.e. the
+// 2-bit fields 2,3,0,1 from the top down).
+// NOTE(review): presumably swaps the two elements of each adjacent pair --
+// confirm against the SHF.df definition.
+class MSABitconvertReverseHalvesPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC, MSAInst Insn,
+ RegisterClass ViaRC> :
+ MSAPat<(DstVT (bitconvert SrcVT:$src)),
+ (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 177),
+ DstRC),
+ [HasMSA, IsBE]>;
+
+// Bytes within halfwords: the halves form applied with SHF_B via MSA128B.
+class MSABitconvertReverseBInHPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+// Bytes within words: the quarters form applied with SHF_B via MSA128B.
+class MSABitconvertReverseBInWPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+// Bytes within doublewords needs two shuffles, so it is spelled out instead of
+// reusing a helper: SHF_B with 27 on the MSA128B view, then SHF_W with 177 on
+// the MSA128W view of that result, and finally a copy into DstRC.
+class MSABitconvertReverseBInDPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSAPat<(DstVT (bitconvert SrcVT:$src)),
+ (COPY_TO_REGCLASS
+ (SHF_W
+ (COPY_TO_REGCLASS
+ (SHF_B (COPY_TO_REGCLASS SrcVT:$src, MSA128B), 27),
+ MSA128W), 177),
+ DstRC),
+ [HasMSA, IsBE]>;
+
+// Halfwords within words: the halves form applied with SHF_H via MSA128H.
+class MSABitconvertReverseHInWPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+// Halfwords within doublewords: the quarters form applied with SHF_H via
+// MSA128H.
+class MSABitconvertReverseHInDPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+// Words within doublewords: the halves form applied with SHF_W via MSA128W.
+class MSABitconvertReverseWInDPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_W, MSA128W>;
+
+def : MSABitconvertReverseBInHPat<v8i16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInHPat<v8f16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInWPat<v4i32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInWPat<v4f32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInDPat<v2i64, v16i8, MSA128D>;
+def : MSABitconvertReverseBInDPat<v2f64, v16i8, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8i16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8i16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8i16, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8f16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8f16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8f16, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4i32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4i32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4i32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4i32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4i32, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4f32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4f32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4f32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4f32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4f32, MSA128D>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2i64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2i64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2i64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2i64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2i64, MSA128W>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2f64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2f64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2f64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2f64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2f64, MSA128W>;
+
+// Pseudos used to implement BNZ.df, and BZ.df
+
+// Pseudo that produces a GPR32 result ($dst) from applying OpNode to a vector
+// of type TyNode held in register class RCWS ($ws). It sets
+// usesCustomInserter, so it is not expanded by a pattern in this file.
+// NOTE(review): the `itin` parameter is accepted but never referenced in the
+// class body -- confirm whether that is intentional.
+class MSA_CBRANCH_PSEUDO_DESC_BASE<SDPatternOperator OpNode, ValueType TyNode,
+ RegisterClass RCWS,
+ InstrItinClass itin = NoItinerary> :
+ MipsPseudo<(outs GPR32:$dst),
+ (ins RCWS:$ws),
+ [(set GPR32:$dst, (OpNode (TyNode RCWS:$ws)))]> {
+ bit usesCustomInserter = 1;
+}
+
+// The per-element-size SNZ pseudos use MipsVAllNonZero; the whole-vector _V
+// form uses MipsVAnyNonZero on a v16i8 view.
+def SNZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v16i8,
+ MSA128B, NoItinerary>;
+def SNZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v8i16,
+ MSA128H, NoItinerary>;
+def SNZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v4i32,
+ MSA128W, NoItinerary>;
+def SNZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v2i64,
+ MSA128D, NoItinerary>;
+def SNZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyNonZero, v16i8,
+ MSA128B, NoItinerary>;
+
+// The per-element-size SZ pseudos use MipsVAllZero; the whole-vector _V form
+// uses MipsVAnyZero on a v16i8 view.
+def SZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v16i8,
+ MSA128B, NoItinerary>;
+def SZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v8i16,
+ MSA128H, NoItinerary>;
+def SZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v4i32,
+ MSA128W, NoItinerary>;
+def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64,
+ MSA128D, NoItinerary>;
+def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
+ MSA128B, NoItinerary>;
+
+// Pseudos used to implement transparent fp16 support.
+
+// Every record in this block is predicated on HasMSA. All six pseudos set
+// usesCustomInserter, so their expansion is not given by a pattern here.
+let Predicates = [HasMSA] in {
+ // Store of an f16 value held in MSA128F16 to an addrimm10 address.
+ def ST_F16 : MipsPseudo<(outs), (ins MSA128F16:$ws, mem_simm10:$addr),
+ [(store (f16 MSA128F16:$ws), (addrimm10:$addr))]> {
+ let usesCustomInserter = 1;
+ }
+
+ // Load of an f16 value from an addrimm10 address into MSA128F16.
+ def LD_F16 : MipsPseudo<(outs MSA128F16:$ws), (ins mem_simm10:$addr),
+ [(set MSA128F16:$ws, (f16 (load addrimm10:$addr)))]> {
+ let usesCustomInserter = 1;
+ }
+
+ // fpextend f16 -> f32, result in an FGR32 operand.
+ def MSA_FP_EXTEND_W_PSEUDO : MipsPseudo<(outs FGR32Opnd:$fd),
+ (ins MSA128F16:$ws),
+ [(set FGR32Opnd:$fd,
+ (f32 (fpextend MSA128F16:$ws)))]> {
+ let usesCustomInserter = 1;
+ }
+
+ // fpround f32 -> f16, source in an FGR32 operand.
+ def MSA_FP_ROUND_W_PSEUDO : MipsPseudo<(outs MSA128F16:$wd),
+ (ins FGR32Opnd:$fs),
+ [(set MSA128F16:$wd,
+ (f16 (fpround FGR32Opnd:$fs)))]> {
+ let usesCustomInserter = 1;
+ }
+
+ // fpextend f16 -> f64, result in an FGR64 operand.
+ def MSA_FP_EXTEND_D_PSEUDO : MipsPseudo<(outs FGR64Opnd:$fd),
+ (ins MSA128F16:$ws),
+ [(set FGR64Opnd:$fd,
+ (f64 (fpextend MSA128F16:$ws)))]> {
+ let usesCustomInserter = 1;
+ }
+
+ // fpround f64 -> f16, source in an FGR64 operand.
+ def MSA_FP_ROUND_D_PSEUDO : MipsPseudo<(outs MSA128F16:$wd),
+ (ins FGR64Opnd:$fs),
+ [(set MSA128F16:$wd,
+ (f16 (fpround FGR64Opnd:$fs)))]> {
+ let usesCustomInserter = 1;
+ }
+
+ // MipsTruncIntFP on f16: extend to f64 first, then use TRUNC_W_D64.
+ def : MipsPat<(MipsTruncIntFP MSA128F16:$ws),
+ (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>;
+
+ // MipsFPCmp on f16: extend both operands to f32 and compare with FCMP_S32.
+ // This pattern is additionally tagged ISA_MIPS1_NOT_32R6_64R6.
+ def : MipsPat<(MipsFPCmp MSA128F16:$ws, MSA128F16:$wt, imm:$cond),
+ (FCMP_S32 (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$ws),
+ (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$wt), imm:$cond)>,
+ ISA_MIPS1_NOT_32R6_64R6;
+}
+
+// Vector extraction with fixed index.
+//
+// Extracting 32-bit values on MSA32 should always use COPY_S_W rather than
+// COPY_U_W, even for the zero-extended case. This is because our forward
+// compatibility strategy is to consider registers to be infinitely
+// sign-extended so that a MIPS64 can execute MIPS32 code without getting
+// different register values.
+def : MSAPat<(vextract_zext_i32 (v4i32 MSA128W:$ws), immZExt2Ptr:$idx),
+ (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64;
+def : MSAPat<(vextract_zext_i32 (v4f32 MSA128W:$ws), immZExt2Ptr:$idx),
+ (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64;
+
+// Extracting 64-bit values on MSA64 should always use COPY_S_D rather than
+// COPY_U_D, even for the zero-extended case. This is because our forward
+// compatibility strategy is to consider registers to be infinitely
+// sign-extended so that a hypothetical MIPS128 would be able to execute MIPS64
+// code without getting different register values.
+def : MSAPat<(vextract_zext_i64 (v2i64 MSA128D:$ws), immZExt1Ptr:$idx),
+ (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64;
+def : MSAPat<(vextract_zext_i64 (v2f64 MSA128D:$ws), immZExt1Ptr:$idx),
+ (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64;
+
+// Vector extraction with variable index
+// Each pattern splats the selected element with SPLAT_<df> and then reads a
+// subregister of the splatted vector through EXTRACT_SUBREG (sub_lo for the
+// 32-bit results, sub_64 for the 64-bit ones). The integer forms copy that
+// value into a GPR class. For i8 and i16 the 32-bit value is then shifted
+// right by 24 or 16, leaving only its top element: SRA in the sign-extending
+// patterns, SRL in the zero-extending ones. The i64-result integer patterns
+// also require IsGP64bit.
+def : MSAPat<(i32 (vextract_sext_i8 v16i8:$ws, i32:$idx)),
+ (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws,
+ i32:$idx),
+ sub_lo)),
+ GPR32), (i32 24))>;
+def : MSAPat<(i32 (vextract_sext_i16 v8i16:$ws, i32:$idx)),
+ (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws,
+ i32:$idx),
+ sub_lo)),
+ GPR32), (i32 16))>;
+def : MSAPat<(i32 (vextract_sext_i32 v4i32:$ws, i32:$idx)),
+ (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws,
+ i32:$idx),
+ sub_lo)),
+ GPR32)>;
+def : MSAPat<(i64 (vextract_sext_i64 v2i64:$ws, i32:$idx)),
+ (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws,
+ i32:$idx),
+ sub_64)),
+ GPR64), [HasMSA, IsGP64bit]>;
+
+def : MSAPat<(i32 (vextract_zext_i8 v16i8:$ws, i32:$idx)),
+ (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws,
+ i32:$idx),
+ sub_lo)),
+ GPR32), (i32 24))>;
+def : MSAPat<(i32 (vextract_zext_i16 v8i16:$ws, i32:$idx)),
+ (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws,
+ i32:$idx),
+ sub_lo)),
+ GPR32), (i32 16))>;
+def : MSAPat<(i32 (vextract_zext_i32 v4i32:$ws, i32:$idx)),
+ (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws,
+ i32:$idx),
+ sub_lo)),
+ GPR32)>;
+def : MSAPat<(i64 (vextract_zext_i64 v2i64:$ws, i32:$idx)),
+ (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws,
+ i32:$idx),
+ sub_64)),
+ GPR64), [HasMSA, IsGP64bit]>;
+
+// Floating-point extraction needs no GPR copy: the subregister of the
+// splatted vector is used directly as the f32/f64 result.
+def : MSAPat<(f32 (vector_extract v4f32:$ws, i32:$idx)),
+ (f32 (EXTRACT_SUBREG (SPLAT_W v4f32:$ws,
+ i32:$idx),
+ sub_lo))>;
+def : MSAPat<(f64 (vector_extract v2f64:$ws, i32:$idx)),
+ (f64 (EXTRACT_SUBREG (SPLAT_D v2f64:$ws,
+ i32:$idx),
+ sub_64))>;
+
+// Vector extraction with variable index (N64 ABI)
+// In these patterns $idx arrives as an i64. It is narrowed before use: its
+// sub_32 subregister is extracted and copied into GPR32, and that 32-bit value
+// is what SPLAT_<df> receives. The rest of each selection (splat, subregister
+// read, optional SRA/SRL by 24 or 16, copy into a GPR class) follows the same
+// shape for every element size.
+def : MSAPat<
+ (i32 (vextract_sext_i8 v16i8:$ws, i64:$idx)),
+ (SRA (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG
+ (SPLAT_B v16i8:$ws,
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_lo)),
+ GPR32),
+ (i32 24))>;
+def : MSAPat<
+ (i32 (vextract_sext_i16 v8i16:$ws, i64:$idx)),
+ (SRA (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG
+ (SPLAT_H v8i16:$ws,
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_lo)),
+ GPR32),
+ (i32 16))>;
+def : MSAPat<
+ (i32 (vextract_sext_i32 v4i32:$ws, i64:$idx)),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG
+ (SPLAT_W v4i32:$ws,
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_lo)),
+ GPR32)>;
+def : MSAPat<
+ (i64 (vextract_sext_i64 v2i64:$ws, i64:$idx)),
+ (COPY_TO_REGCLASS
+ (i64 (EXTRACT_SUBREG
+ (SPLAT_D v2i64:$ws,
+ (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_64)),
+ GPR64), [HasMSA, IsGP64bit]>;
+
+// Zero-extending forms: identical to the sign-extending ones above except
+// that the i8/i16 shifts use SRL instead of SRA.
+def : MSAPat<
+ (i32 (vextract_zext_i8 v16i8:$ws, i64:$idx)),
+ (SRL (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG
+ (SPLAT_B v16i8:$ws,
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_lo)),
+ GPR32),
+ (i32 24))>;
+def : MSAPat<
+ (i32 (vextract_zext_i16 v8i16:$ws, i64:$idx)),
+ (SRL (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG
+ (SPLAT_H v8i16:$ws,
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_lo)),
+ GPR32),
+ (i32 16))>;
+def : MSAPat<
+ (i32 (vextract_zext_i32 v4i32:$ws, i64:$idx)),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG
+ (SPLAT_W v4i32:$ws,
+ (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_lo)),
+ GPR32)>;
+def : MSAPat<
+ (i64 (vextract_zext_i64 v2i64:$ws, i64:$idx)),
+ (COPY_TO_REGCLASS
+ (i64 (EXTRACT_SUBREG
+ (SPLAT_D v2i64:$ws,
+ (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_64)),
+ GPR64),
+ [HasMSA, IsGP64bit]>;
+
+// Floating-point forms: the subregister of the splatted vector is the result;
+// no GPR copy is involved.
+def : MSAPat<
+ (f32 (vector_extract v4f32:$ws, i64:$idx)),
+ (f32 (EXTRACT_SUBREG
+ (SPLAT_W v4f32:$ws,
+ (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_lo))>;
+def : MSAPat<
+ (f64 (vector_extract v2f64:$ws, i64:$idx)),
+ (f64 (EXTRACT_SUBREG
+ (SPLAT_D v2f64:$ws,
+ (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+ sub_64))>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
new file mode 100644
index 000000000000..d0609b15341d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
@@ -0,0 +1,104 @@
+//===-- MipsMachineFunctionInfo.cpp - Private data used for Mips ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Hidden boolean command-line flag "mips-fix-global-base-reg", default true.
+// NOTE(review): nothing else in this file reads FixGlobalBaseReg - confirm
+// whether the option is still needed.
+static cl::opt<bool>
+FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
+ cl::desc("Always use $gp as the global base register."));
+
+// Out-of-line destructor; nothing beyond member destruction is required.
+MipsFunctionInfo::~MipsFunctionInfo() = default;
+
+/// Return true once a global base register has been created, i.e. when
+/// GlobalBaseReg is non-zero.
+bool MipsFunctionInfo::globalBaseRegSet() const {
+  return GlobalBaseReg != 0;
+}
+
+/// Return the virtual register used as the global base register, creating it
+/// on the first call and caching it in GlobalBaseReg.
+unsigned MipsFunctionInfo::getGlobalBaseReg() {
+  // A previous call already created the register.
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  const MipsSubtarget &Subtarget =
+      static_cast<const MipsSubtarget &>(MF.getSubtarget());
+
+  // Pick the register class from the ISA mode first and the ABI second.
+  const TargetRegisterClass *RegClass;
+  if (Subtarget.inMips16Mode()) {
+    RegClass = &Mips::CPU16RegsRegClass;
+  } else if (Subtarget.inMicroMipsMode()) {
+    if (Subtarget.hasMips64())
+      RegClass = &Mips::GPRMM16_64RegClass;
+    else
+      RegClass = &Mips::GPRMM16RegClass;
+  } else if (static_cast<const MipsTargetMachine &>(MF.getTarget())
+                 .getABI()
+                 .IsN64()) {
+    RegClass = &Mips::GPR64RegClass;
+  } else {
+    RegClass = &Mips::GPR32RegClass;
+  }
+
+  GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RegClass);
+  return GlobalBaseReg;
+}
+
+/// Create one stack object per entry of EhDataRegFI, sized and aligned for a
+/// GPR64 under the N64 ABI and for a GPR32 otherwise.
+void MipsFunctionInfo::createEhDataRegsFI() {
+  const bool IsN64 =
+      static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64();
+  const TargetRegisterClass *SlotRC =
+      IsN64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+  for (int &FrameIdx : EhDataRegFI)
+    FrameIdx = MF.getFrameInfo().CreateStackObject(
+        SlotRC->getSize(), SlotRC->getAlignment(), false);
+}
+
+/// Create the spill slots an interrupt service routine needs for the Status
+/// and ErrorPC Coprocessor 0 registers.
+void MipsFunctionInfo::createISRRegFI() {
+  // Only Mips32r2+ is supported at present (not Mips64rX). Status is always
+  // 32 bits wide; ErrorPC is 32 or 64 bits depending on the architecture,
+  // but with Mips32r2+ as the supported architecture a GPR32-sized slot is
+  // used for both.
+  const TargetRegisterClass &SlotRC = Mips::GPR32RegClass;
+
+  for (int &FrameIdx : ISRDataRegFI)
+    FrameIdx = MF.getFrameInfo().CreateStackObject(
+        SlotRC.getSize(), SlotRC.getAlignment(), false);
+}
+
+/// Return true if the function calls eh.return and FI is one of the frame
+/// indices recorded in EhDataRegFI.
+bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
+  // The slots are only consulted when CallsEhReturn is set.
+  if (!CallsEhReturn)
+    return false;
+
+  for (int Slot : EhDataRegFI)
+    if (Slot == FI)
+      return true;
+  return false;
+}
+
+/// Return true if the function is an ISR and FI is one of the frame indices
+/// recorded in ISRDataRegFI.
+bool MipsFunctionInfo::isISRRegFI(int FI) const {
+  if (!IsISR)
+    return false;
+
+  for (int Slot : ISRDataRegFI)
+    if (Slot == FI)
+      return true;
+  return false;
+}
+/// Build a MachinePointerInfo from the pseudo source value manager's
+/// external-symbol call entry for ES.
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const char *ES) {
+  const PseudoSourceValue *CallEntry =
+      MF.getPSVManager().getExternalSymbolCallEntry(ES);
+  return MachinePointerInfo(CallEntry);
+}
+
+/// Build a MachinePointerInfo from the pseudo source value manager's
+/// global-value call entry for GV.
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) {
+  const PseudoSourceValue *CallEntry =
+      MF.getPSVManager().getGlobalValueCallEntry(GV);
+  return MachinePointerInfo(CallEntry);
+}
+
+/// Return the frame index of the spill slot used for moving f64 values,
+/// creating it (sized and aligned for RC) on first use. -1 marks "not yet
+/// created".
+int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) {
+  if (MoveF64ViaSpillFI != -1)
+    return MoveF64ViaSpillFI;
+
+  MoveF64ViaSpillFI = MF.getFrameInfo().CreateStackObject(
+      RC->getSize(), RC->getAlignment(), false);
+  return MoveF64ViaSpillFI;
+}
+
+// Out-of-line, intentionally empty definition of the virtual anchor() method.
+void MipsFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
new file mode 100644
index 000000000000..c9e5fddc1932
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
@@ -0,0 +1,132 @@
+//===-- MipsMachineFunctionInfo.h - Private data used for Mips ----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
+
+#include "Mips16HardFloatInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include <map>
+
+namespace llvm {
+
+/// MipsFunctionInfo - This class is derived from MachineFunctionInfo and
+/// holds private Mips target-specific information for each MachineFunction.
+class MipsFunctionInfo : public MachineFunctionInfo {
+public:
+  // HasByvalArg and IncomingArgSize are given defined initial values so that
+  // hasByvalArg() / getIncomingArgSize() never read indeterminate memory when
+  // setFormalArgInfo() has not been called yet.
+  MipsFunctionInfo(MachineFunction &MF)
+      : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0),
+        HasByvalArg(false), IncomingArgSize(0), CallsEhReturn(false),
+        IsISR(false), SaveS2(false), MoveF64ViaSpillFI(-1) {}
+
+  ~MipsFunctionInfo();
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  /// True once getGlobalBaseReg() has created the global base register.
+  bool globalBaseRegSet() const;
+  /// Return the global base virtual register, creating it on first use.
+  unsigned getGlobalBaseReg();
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  bool hasByvalArg() const { return HasByvalArg; }
+  /// Record the size of the incoming argument area and whether any formal
+  /// argument is passed byval.
+  void setFormalArgInfo(unsigned Size, bool HasByval) {
+    IncomingArgSize = Size;
+    HasByvalArg = HasByval;
+  }
+
+  unsigned getIncomingArgSize() const { return IncomingArgSize; }
+
+  bool callsEhReturn() const { return CallsEhReturn; }
+  void setCallsEhReturn() { CallsEhReturn = true; }
+
+  /// Create the stack objects recorded in EhDataRegFI.
+  void createEhDataRegsFI();
+  int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; }
+  bool isEhDataRegFI(int FI) const;
+
+  /// Create a MachinePointerInfo that has an ExternalSymbolPseudoSourceValue
+  /// object representing a GOT entry for an external function.
+  MachinePointerInfo callPtrInfo(const char *ES);
+
+  // Functions with the "interrupt" attribute require special prologues,
+  // epilogues and additional spill slots.
+  bool isISR() const { return IsISR; }
+  void setISR() { IsISR = true; }
+  void createISRRegFI();
+  int getISRRegFI(unsigned Reg) const { return ISRDataRegFI[Reg]; }
+  bool isISRRegFI(int FI) const;
+
+  /// Create a MachinePointerInfo that has a GlobalValuePseudoSourceValue object
+  /// representing a GOT entry for a global function.
+  MachinePointerInfo callPtrInfo(const GlobalValue *GV);
+
+  void setSaveS2() { SaveS2 = true; }
+  bool hasSaveS2() const { return SaveS2; }
+
+  /// Return (creating on first use) the frame index of the f64 spill slot.
+  int getMoveF64ViaSpillFI(const TargetRegisterClass *RC);
+
+  std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+  StubsNeeded;
+
+private:
+  virtual void anchor();
+
+  MachineFunction& MF;
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// True if function has a byval argument.
+  bool HasByvalArg;
+
+  /// Size of incoming argument area.
+  unsigned IncomingArgSize;
+
+  /// CallsEhReturn - Whether the function calls llvm.eh.return.
+  bool CallsEhReturn;
+
+  /// Frame objects for spilling eh data registers. Only written by
+  /// createEhDataRegsFI().
+  int EhDataRegFI[4];
+
+  /// ISR - Whether the function is an Interrupt Service Routine.
+  bool IsISR;
+
+  /// Frame objects for spilling C0_STATUS, C0_EPC. Only written by
+  /// createISRRegFI().
+  int ISRDataRegFI[2];
+
+  /// SaveS2 - flag set via setSaveS2() and queried via hasSaveS2().
+  bool SaveS2;
+
+  /// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the
+  /// O32 FPXX ABI is enabled. -1 is used to denote invalid index.
+  int MoveF64ViaSpillFI;
+};
+
+} // end of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
new file mode 100644
index 000000000000..cf85eb3f2416
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+// Instruction Selector Subtarget Control
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// This file defines a pass used to change the subtarget for the
+// Mips Instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+namespace {
+/// Machine-function pass that holds a reference to the Mips target machine;
+/// its runOnMachineFunction is defined out of line.
+class MipsModuleDAGToDAGISel : public MachineFunctionPass {
+public:
+  static char ID;
+
+  explicit MipsModuleDAGToDAGISel(MipsTargetMachine &Machine)
+      : MachineFunctionPass(ID), TM(Machine) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  /// Human-readable pass name.
+  StringRef getPassName() const override {
+    return "MIPS DAG->DAG Pattern Instruction Selection";
+  }
+
+protected:
+  MipsTargetMachine &TM;
+};
+
+char MipsModuleDAGToDAGISel::ID = 0;
+} // end anonymous namespace
+
+// Calls TM.resetSubtarget(&MF) for the given machine function and always
+// returns false.
+bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
+ TM.resetSubtarget(&MF);
+ return false;
+}
+
+// Factory: returns a newly allocated MipsModuleDAGToDAGISel bound to TM.
+llvm::FunctionPass *llvm::createMipsModuleISelDagPass(MipsTargetMachine &TM) {
+ return new MipsModuleDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
new file mode 100644
index 000000000000..f33857fe628f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -0,0 +1,301 @@
+//===--------- MipsOptimizePICCall.cpp - Optimize PIC Calls ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates unnecessary instructions that set up $gp and replace
+// instructions that load target function addresses with copy instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "optimize-mips-pic-call"
+
+// Hidden flag "mips-load-target-from-got" (default true). visitNode only
+// reuses a previously loaded call-target register when this flag is false.
+static cl::opt<bool> LoadTargetFromGOT("mips-load-target-from-got",
+ cl::init(true),
+ cl::desc("Load target address from GOT"),
+ cl::Hidden);
+
+// Hidden flag "mips-erase-gp-opnd" (default true). When false, eraseGPOpnd
+// returns without touching the instruction.
+static cl::opt<bool> EraseGPOpnd("mips-erase-gp-opnd",
+ cl::init(true), cl::desc("Erase GP Operand"),
+ cl::Hidden);
+
+namespace {
+/// Hash-table key: either an IR Value or a PseudoSourceValue.
+using ValueType = PointerUnion<const Value *, const PseudoSourceValue *>;
+
+/// Hash-table value: (count, register) pair kept per key.
+using CntRegP = std::pair<unsigned, unsigned>;
+using AllocatorTy =
+    RecyclingAllocator<BumpPtrAllocator,
+                       ScopedHashTableVal<ValueType, CntRegP>>;
+using ScopedHTType =
+    ScopedHashTable<ValueType, CntRegP, DenseMapInfo<ValueType>, AllocatorTy>;
+
+/// Work-list entry for the dominator-tree walk: a tree node plus the hash
+/// table scope opened while that node is being processed.
+class MBBInfo {
+public:
+  MBBInfo(MachineDomTreeNode *N);
+  const MachineDomTreeNode *getNode() const;
+  bool isVisited() const;
+  void preVisit(ScopedHTType &ScopedHT);
+  void postVisit();
+
+private:
+  MachineDomTreeNode *Node;
+  ScopedHTType::ScopeTy *HTScope;
+};
+
+class OptimizePICCall : public MachineFunctionPass {
+public:
+  OptimizePICCall(TargetMachine &tm) : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Mips OptimizePICCall"; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  /// The pass needs the machine dominator tree.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  /// \brief Process the basic block of one dominator-tree node.
+  bool visitNode(MBBInfo &MBBI);
+
+  /// \brief Test if MI jumps to a function via a register.
+  ///
+  /// On success Reg receives the virtual register holding the target
+  /// function's address, and Val receives the underlying object if the
+  /// function's address can be resolved lazily.
+  bool isCallViaRegister(MachineInstr &MI, unsigned &Reg,
+                         ValueType &Val) const;
+
+  /// \brief Return the number of instructions that dominate the current
+  /// instruction and load the function address from object Entry.
+  unsigned getCount(ValueType Entry);
+
+  /// \brief Return the destination virtual register of the last instruction
+  /// that loads from object Entry.
+  unsigned getReg(ValueType Entry);
+
+  /// \brief Bump the count stored for Entry and remember Reg for it.
+  void incCntAndSetReg(ValueType Entry, unsigned Reg);
+
+  ScopedHTType ScopedHT;
+  static char ID;
+};
+
+char OptimizePICCall::ID = 0;
+} // end of anonymous namespace
+
+/// Return MI's operand 0 when it is a virtual register that is used (not
+/// defined); otherwise return null.
+static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) {
+  if (MI.getNumOperands() == 0)
+    return nullptr;
+
+  MachineOperand &First = MI.getOperand(0);
+  const bool IsUsedVirtReg =
+      First.isReg() && First.isUse() &&
+      TargetRegisterInfo::isVirtualRegister(First.getReg());
+
+  return IsUsedVirtReg ? &First : nullptr;
+}
+
+/// Return the single value type of Reg's register class. Asserts that the
+/// class lists exactly one value type.
+static MVT::SimpleValueType getRegTy(unsigned Reg, MachineFunction &MF) {
+  const TargetRegisterClass *RegClass = MF.getRegInfo().getRegClass(Reg);
+  auto FirstVT = RegClass->vt_begin();
+  assert(RegClass->vt_end() - FirstVT == 1);
+  return *FirstVT;
+}
+
+/// Route the call target through $t9:
+///
+/// jalr $vreg
+/// =>
+/// copy $t9, $vreg
+/// jalr $t9
+///
+/// T9 is used for i32 registers and T9_64 otherwise.
+static void setCallTargetReg(MachineBasicBlock *MBB,
+                             MachineBasicBlock::iterator I) {
+  MachineFunction &MF = *MBB->getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+  const unsigned VReg = I->getOperand(0).getReg();
+  const unsigned T9Reg =
+      (getRegTy(VReg, MF) == MVT::i32) ? Mips::T9 : Mips::T9_64;
+
+  // Insert the copy immediately before the call, then retarget the call.
+  BuildMI(*MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), T9Reg)
+      .addReg(VReg);
+  I->getOperand(0).setReg(T9Reg);
+}
+
+/// Search MI's operands for register GP (GP_64 when operand 0 is not an i32
+/// register) and erase the first match. Does nothing when the
+/// "mips-erase-gp-opnd" option is disabled. The caller must pass an
+/// instruction that carries such an operand; not finding one is treated as
+/// unreachable.
+static void eraseGPOpnd(MachineInstr &MI) {
+  if (!EraseGPOpnd)
+    return;
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MVT::SimpleValueType Ty = getRegTy(MI.getOperand(0).getReg(), MF);
+  unsigned Reg = Ty == MVT::i32 ? Mips::GP : Mips::GP_64;
+
+  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+    MachineOperand &MO = MI.getOperand(I);
+    if (MO.isReg() && MO.getReg() == Reg) {
+      MI.RemoveOperand(I);
+      return;
+    }
+  }
+
+  // Give the failure a message instead of a bare llvm_unreachable(nullptr).
+  llvm_unreachable("call instruction has no $gp operand to erase");
+}
+
+// A fresh entry has no hash-table scope yet, which marks it as unvisited.
+MBBInfo::MBBInfo(MachineDomTreeNode *N) : Node(N), HTScope(nullptr) {}
+
+const MachineDomTreeNode *MBBInfo::getNode() const {
+  return Node;
+}
+
+// An entry counts as visited once preVisit() has opened its scope.
+bool MBBInfo::isVisited() const {
+  return HTScope != nullptr;
+}
+
+// Open a new scope on ScopedHT for the duration of this node's subtree.
+void MBBInfo::preVisit(ScopedHTType &ScopedHT) {
+  HTScope = new ScopedHTType::ScopeTy(ScopedHT);
+}
+
+// Close the scope opened by preVisit().
+void MBBInfo::postVisit() {
+  delete HTScope;
+}
+
+// OptimizePICCall methods.
+/// Walk the machine dominator tree in pre-order with an explicit stack,
+/// visiting every block. Skipped entirely in Mips16 mode. Returns true if any
+/// block was changed.
+bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) {
+  const MipsSubtarget &STI =
+      static_cast<const MipsSubtarget &>(F.getSubtarget());
+  if (STI.inMips16Mode())
+    return false;
+
+  MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
+  bool MadeChange = false;
+
+  SmallVector<MBBInfo, 8> Stack(1, MBBInfo(DomTree.getRootNode()));
+
+  while (!Stack.empty()) {
+    MBBInfo &Top = Stack.back();
+
+    // Second encounter: the subtree is finished, so close the node's scope
+    // and drop it from the stack.
+    if (Top.isVisited()) {
+      Top.postVisit();
+      Stack.pop_back();
+      continue;
+    }
+
+    // First encounter: open a scope, process the block, then queue the
+    // dominator-tree children. Top is not used after the append below, which
+    // may grow the stack's storage.
+    Top.preVisit(ScopedHT);
+    if (visitNode(Top))
+      MadeChange = true;
+    const std::vector<MachineDomTreeNode *> &Kids =
+        Top.getNode()->getChildren();
+    Stack.append(Kids.begin(), Kids.end());
+  }
+
+  return MadeChange;
+}
+
+/// Process every call-via-register instruction in the node's basic block.
+/// Returns true if at least one such call was found (each one is rewritten
+/// to go through $t9).
+bool OptimizePICCall::visitNode(MBBInfo &MBBI) {
+  MachineBasicBlock *MBB = MBBI.getNode()->getBlock();
+  bool MadeChange = false;
+
+  for (MachineBasicBlock::iterator It = MBB->begin(), End = MBB->end();
+       It != End; ++It) {
+    unsigned TargetReg;
+    ValueType GOTEntry;
+
+    // Only calls made through a register are of interest.
+    if (!isCallViaRegister(*It, TargetReg, GOTEntry))
+      continue;
+
+    MadeChange = true;
+    const unsigned PriorLoads = getCount(GOTEntry);
+
+    if (PriorLoads != 0) {
+      // With at least two dominating loads of the same entry (and GOT
+      // reloading disabled), reuse the register loaded earlier rather than
+      // the one produced by a fresh load.
+      if (PriorLoads >= 2 && !LoadTargetFromGOT)
+        getCallTargetRegOpnd(*It)->setReg(getReg(GOTEntry));
+
+      // Not the first call to this function: $gp is only needed when the
+      // call can go through a lazy binding stub, so drop the operand.
+      eraseGPOpnd(*It);
+    }
+
+    if (GOTEntry)
+      incCntAndSetReg(GOTEntry, TargetReg);
+
+    setCallTargetReg(MBB, It);
+  }
+
+  return MadeChange;
+}
+
+/// Return true if MI is a call whose operand 0 is a used virtual register.
+/// Reg receives that register. Val receives the object behind the GOT entry
+/// when the register's defining instruction is a load flagged MO_GOT_CALL or
+/// MO_CALL_LO16, and is left null otherwise.
+bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
+                                        ValueType &Val) const {
+  if (!MI.isCall())
+    return false;
+
+  MachineOperand *TargetOpnd = getCallTargetRegOpnd(MI);
+  if (TargetOpnd == nullptr)
+    return false; // Not a call through a register.
+
+  Reg = TargetOpnd->getReg();
+  Val = static_cast<const Value *>(nullptr);
+
+  // Look at the instruction defining the target register.
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  MachineInstr *Def = MRI.getVRegDef(Reg);
+  assert(Def);
+
+  // Anything other than a load with at least three operands cannot be the
+  // GOT load we are looking for.
+  if (!Def->mayLoad() || Def->getNumOperands() < 3)
+    return true;
+
+  const unsigned Flags = Def->getOperand(2).getTargetFlags();
+  const bool IsGOTCallLoad =
+      Flags == MipsII::MO_GOT_CALL || Flags == MipsII::MO_CALL_LO16;
+  if (!IsGOTCallLoad)
+    return true;
+
+  // Report the object behind the GOT entry: the IR value if present,
+  // otherwise the pseudo source value.
+  assert(Def->hasOneMemOperand());
+  const auto *MemOp = *Def->memoperands_begin();
+  Val = MemOp->getValue();
+  if (!Val)
+    Val = MemOp->getPseudoValue();
+  return true;
+}
+
+/// Return the count component stored in ScopedHT for Entry.
+unsigned OptimizePICCall::getCount(ValueType Entry) {
+  const CntRegP Stored = ScopedHT.lookup(Entry);
+  return Stored.first;
+}
+
+/// Return the register component stored in ScopedHT for Entry; asserts that
+/// it is non-zero.
+unsigned OptimizePICCall::getReg(ValueType Entry) {
+  const unsigned StoredReg = ScopedHT.lookup(Entry).second;
+  assert(StoredReg);
+  return StoredReg;
+}
+
+/// Store (previous count + 1, Reg) for Entry in the current ScopedHT scope.
+void OptimizePICCall::incCntAndSetReg(ValueType Entry, unsigned Reg) {
+  const unsigned NewCount = ScopedHT.lookup(Entry).first + 1;
+  ScopedHT.insert(Entry, std::make_pair(NewCount, Reg));
+}
+
+/// Return a newly allocated OptimizePICCall pass.
+FunctionPass *llvm::createMipsOptimizePICCallPass(MipsTargetMachine &TM) {
+ return new OptimizePICCall(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h b/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h
new file mode 100644
index 000000000000..23f0b7070d62
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h
@@ -0,0 +1,78 @@
+//===-- MipsOptionRecord.h - Abstraction for storing information ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MipsOptionRecord - Abstraction for storing arbitrary information in
+// ELF files. Arbitrary information (e.g. register usage) can be stored in Mips
+// specific ELF sections like .Mips.options. Specific records should subclass
+// MipsOptionRecord and provide an implementation to EmitMipsOptionRecord which
+// basically just dumps the information into an ELF section. More information
+// about .Mips.option can be found in the SysV ABI and the 64-bit ELF Object
+// specification.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSOPTIONRECORD_H
+#define LLVM_LIB_TARGET_MIPS_MIPSOPTIONRECORD_H
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+class MipsELFStreamer;
+class MCSubtargetInfo;
+
+/// Abstract base for records emitted through EmitMipsOptionRecord().
+class MipsOptionRecord {
+public:
+  virtual ~MipsOptionRecord() = default;
+
+  /// Implemented by each concrete record type.
+  virtual void EmitMipsOptionRecord() = 0;
+};
+
+/// Option record holding register masks (ri_gprmask, ri_cprmask) and a gp
+/// value, plus cached pointers to the register classes looked up from the
+/// MCContext's register info.
+class MipsRegInfoRecord : public MipsOptionRecord {
+public:
+  MipsRegInfoRecord(MipsELFStreamer *S, MCContext &Context)
+      : Streamer(S), Context(Context), ri_gprmask(0), ri_gp_value(0) {
+    for (uint32_t &Mask : ri_cprmask)
+      Mask = 0;
+
+    // Cache the register classes once, at construction time.
+    const MCRegisterInfo &RegInfo = *Context.getRegisterInfo();
+    GPR32RegClass = &RegInfo.getRegClass(Mips::GPR32RegClassID);
+    GPR64RegClass = &RegInfo.getRegClass(Mips::GPR64RegClassID);
+    FGR32RegClass = &RegInfo.getRegClass(Mips::FGR32RegClassID);
+    FGR64RegClass = &RegInfo.getRegClass(Mips::FGR64RegClassID);
+    AFGR64RegClass = &RegInfo.getRegClass(Mips::AFGR64RegClassID);
+    MSA128BRegClass = &RegInfo.getRegClass(Mips::MSA128BRegClassID);
+    COP0RegClass = &RegInfo.getRegClass(Mips::COP0RegClassID);
+    COP2RegClass = &RegInfo.getRegClass(Mips::COP2RegClassID);
+    COP3RegClass = &RegInfo.getRegClass(Mips::COP3RegClassID);
+  }
+  ~MipsRegInfoRecord() override {}
+
+  void EmitMipsOptionRecord() override;
+  void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo);
+
+private:
+  MipsELFStreamer *Streamer;
+  MCContext &Context;
+  const MCRegisterClass *GPR32RegClass;
+  const MCRegisterClass *GPR64RegClass;
+  const MCRegisterClass *FGR32RegClass;
+  const MCRegisterClass *FGR64RegClass;
+  const MCRegisterClass *AFGR64RegClass;
+  const MCRegisterClass *MSA128BRegClass;
+  const MCRegisterClass *COP0RegClass;
+  const MCRegisterClass *COP2RegClass;
+  const MCRegisterClass *COP3RegClass;
+  uint32_t ri_gprmask;
+  uint32_t ri_cprmask[4];
+  int64_t ri_gp_value;
+};
+} // namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsOs16.cpp b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
new file mode 100644
index 000000000000..51ac5620f585
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
@@ -0,0 +1,160 @@
+//===---- MipsOs16.cpp for Mips Option -Os16 --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an optimization phase for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Instructions.h"
+#include "Mips.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-os16"
+
+// Hidden string option "mips32-function-mask" (default empty). When
+// non-empty, runOnModule consumes one mask character per defined function,
+// wrapping around at the end: '1' adds "nomips16" to the function, '.' stops
+// all further mask processing, any other character leaves the function
+// untouched.
+static cl::opt<std::string> Mips32FunctionMask(
+ "mips32-function-mask",
+ cl::init(""),
+ cl::desc("Force function to be mips32"),
+ cl::Hidden);
+
+namespace {
+/// Module pass whose runOnModule (defined out of line) adds "mips16" /
+/// "nomips16" attributes to the module's defined functions.
+class MipsOs16 : public ModulePass {
+public:
+  static char ID;
+
+  MipsOs16() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+
+  StringRef getPassName() const override { return "MIPS Os16 Optimization"; }
+};
+
+char MipsOs16::ID = 0;
+} // end anonymous namespace
+
+// Decide from the function signature alone whether floating point is needed:
+// values may have to move in and/or out of floating point registers because
+// of the ABI. Returns true when the return type, or the type of the first
+// formal argument, is float or double. Later arguments are not examined.
+//
+static bool needsFPFromSig(Function &F) {
+  auto IsFloatOrDouble = [](const Type *Ty) {
+    const Type::TypeID Kind = Ty->getTypeID();
+    return Kind == Type::FloatTyID || Kind == Type::DoubleTyID;
+  };
+
+  if (IsFloatOrDouble(F.getReturnType()))
+    return true;
+
+  if (F.arg_size() >= 1 &&
+      IsFloatOrDouble(F.getArgumentList().front().getType()))
+    return true;
+
+  return false;
+}
+
+// Figure out if the function will need floating point operations.
+//
+// Returns true when F's own signature needs FP (see needsFPFromSig), when its
+// body contains a floating-point arithmetic, conversion or compare
+// instruction, or when it directly calls a function whose signature needs FP.
+//
+static bool needsFP(Function &F) {
+  if (needsFPFromSig(F))
+    return true;
+  for (const BasicBlock &BB : F)
+    for (const Instruction &Inst : BB) {
+      switch (Inst.getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FRem:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FCmp:
+        return true;
+      default:
+        break;
+      }
+      if (const CallInst *CI = dyn_cast<CallInst>(&Inst)) {
+        DEBUG(dbgs() << "Working on call" << "\n");
+        // getCalledFunction() returns null for an indirect call; there is no
+        // callee declaration to inspect in that case, so skip it instead of
+        // dereferencing a null pointer.
+        Function *Callee = CI->getCalledFunction();
+        if (Callee && needsFPFromSig(*Callee))
+          return true;
+      }
+    }
+  return false;
+}
+
+
+// Tag every defined function in M as mips16 or nomips16.
+//
+// With a non-empty "mips32-function-mask" the mask drives the decision: one
+// character is consumed per defined function (wrapping around at the end),
+// '1' forces "nomips16", and '.' stops mask processing for the rest of the
+// module. Without a mask, functions that need floating point (needsFP) get
+// "nomips16" and all others get "mips16".
+//
+// Returns true if any function attribute was added.
+bool MipsOs16::runOnModule(Module &M) {
+  bool usingMask = Mips32FunctionMask.length() > 0;
+  bool doneUsingMask = false; // this will make it stop repeating
+
+  DEBUG(dbgs() << "Run on Module MipsOs16 \n" << Mips32FunctionMask << "\n");
+  if (usingMask)
+    DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
+
+  unsigned int functionIndex = 0;
+  bool modified = false;
+
+  for (auto &F : M) {
+    if (F.isDeclaration())
+      continue;
+
+    DEBUG(dbgs() << "Working on " << F.getName() << "\n");
+    if (usingMask) {
+      if (!doneUsingMask) {
+        if (functionIndex == Mips32FunctionMask.length())
+          functionIndex = 0;
+        switch (Mips32FunctionMask[functionIndex]) {
+        case '1':
+          DEBUG(dbgs() << "mask forced mips32: " << F.getName() << "\n");
+          F.addFnAttr("nomips16");
+          modified = true; // Report the IR change to the pass manager.
+          break;
+        case '.':
+          doneUsingMask = true;
+          break;
+        default:
+          break;
+        }
+        functionIndex++;
+      }
+    }
+    else {
+      if (needsFP(F)) {
+        DEBUG(dbgs() << "os16 forced mips32: " << F.getName() << "\n");
+        F.addFnAttr("nomips16");
+      }
+      else {
+        DEBUG(dbgs() << "os16 forced mips16: " << F.getName() << "\n");
+        F.addFnAttr("mips16");
+      }
+      // Either branch above adds an attribute to F.
+      modified = true;
+    }
+  }
+
+  return modified;
+}
+
+// Factory: returns a newly allocated MipsOs16 pass. TM is not used.
+ModulePass *llvm::createMipsOs16Pass(MipsTargetMachine &TM) {
+ return new MipsOs16;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
new file mode 100644
index 000000000000..65be350f259d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -0,0 +1,340 @@
+//===-- MipsRegisterInfo.cpp - MIPS Register Information -== --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsRegisterInfo.h"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "MipsGenRegisterInfo.inc"
+
+// Passes Mips::RA to the generated MipsGenRegisterInfo base-class constructor.
+MipsRegisterInfo::MipsRegisterInfo() : MipsGenRegisterInfo(Mips::RA) {}
+
+// Register reported as the PIC call register: always Mips::T9.
+unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
+
+/// Map a MipsPtrClass kind to a register class, choosing the 64-bit variant
+/// when the ABI uses 64-bit pointers and the 32-bit variant otherwise.
+const TargetRegisterClass *
+MipsRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                     unsigned Kind) const {
+  MipsABIInfo ABI = MF.getSubtarget<MipsSubtarget>().getABI();
+  const bool Ptr64 = ABI.ArePtrs64bit();
+
+  switch (static_cast<MipsPtrClass>(Kind)) {
+  case MipsPtrClass::Default:
+    return Ptr64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  case MipsPtrClass::GPR16MM:
+    return Ptr64 ? &Mips::GPRMM16_64RegClass : &Mips::GPRMM16RegClass;
+  case MipsPtrClass::StackPointer:
+    return Ptr64 ? &Mips::SP64RegClass : &Mips::SP32RegClass;
+  case MipsPtrClass::GlobalPointer:
+    return Ptr64 ? &Mips::GP64RegClass : &Mips::GP32RegClass;
+  }
+
+  llvm_unreachable("Unknown pointer kind");
+}
+
+/// Return the register pressure limit for RC: 28 for the GPR32/GPR64/DSPR
+/// classes (27 when the function has a frame pointer), 32 for FGR32 and
+/// FGR64, 16 for AFGR64, and 0 for every other class.
+unsigned
+MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                      MachineFunction &MF) const {
+  switch (RC->getID()) {
+  case Mips::GPR32RegClassID:
+  case Mips::GPR64RegClassID:
+  case Mips::DSPRRegClassID: {
+    const bool HasFP = MF.getSubtarget().getFrameLowering()->hasFP(MF);
+    return HasFP ? 27 : 28;
+  }
+  case Mips::FGR32RegClassID:
+  case Mips::FGR64RegClassID:
+    return 32;
+  case Mips::AFGR64RegClassID:
+    return 16;
+  default:
+    return 0;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Callee Saved Registers methods
+//===----------------------------------------------------------------------===//
+
+/// Return the callee-saved register list for MF. Functions carrying the
+/// "interrupt" attribute use the interrupt lists; otherwise the list is picked
+/// by checking, in order: single-float, N64, N32, FP64, FPXX, and finally
+/// plain O32.
+const MCPhysReg *
+MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const MipsSubtarget &STI = MF->getSubtarget<MipsSubtarget>();
+
+  if (MF->getFunction()->hasFnAttribute("interrupt")) {
+    if (STI.hasMips64()) {
+      if (STI.hasMips64r6())
+        return CSR_Interrupt_64R6_SaveList;
+      return CSR_Interrupt_64_SaveList;
+    }
+    if (STI.hasMips32r6())
+      return CSR_Interrupt_32R6_SaveList;
+    return CSR_Interrupt_32_SaveList;
+  }
+
+  if (STI.isSingleFloat())
+    return CSR_SingleFloatOnly_SaveList;
+  if (STI.isABI_N64())
+    return CSR_N64_SaveList;
+  if (STI.isABI_N32())
+    return CSR_N32_SaveList;
+  if (STI.isFP64bit())
+    return CSR_O32_FP64_SaveList;
+  if (STI.isFPXX())
+    return CSR_O32_FPXX_SaveList;
+
+  return CSR_O32_SaveList;
+}
+
+/// Return the call-preserved register mask for MF, checking in order:
+/// single-float, N64, N32, FP64, FPXX, and finally plain O32. The calling
+/// convention argument is not consulted.
+const uint32_t *
+MipsRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const {
+  const MipsSubtarget &STI = MF.getSubtarget<MipsSubtarget>();
+
+  if (STI.isSingleFloat())
+    return CSR_SingleFloatOnly_RegMask;
+  if (STI.isABI_N64())
+    return CSR_N64_RegMask;
+  if (STI.isABI_N32())
+    return CSR_N32_RegMask;
+  if (STI.isFP64bit())
+    return CSR_O32_FP64_RegMask;
+  if (STI.isFPXX())
+    return CSR_O32_FPXX_RegMask;
+
+  return CSR_O32_RegMask;
+}
+
+// Returns the CSR_Mips16RetHelper_RegMask register mask.
+const uint32_t *MipsRegisterInfo::getMips16RetHelperMask() {
+ return CSR_Mips16RetHelper_RegMask;
+}
+
+BitVector MipsRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+ static const MCPhysReg ReservedGPR32[] = {
+ Mips::ZERO, Mips::K0, Mips::K1, Mips::SP
+ };
+
+ static const MCPhysReg ReservedGPR64[] = {
+ Mips::ZERO_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
+ };
+
+ BitVector Reserved(getNumRegs());
+ const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
+ typedef TargetRegisterClass::const_iterator RegIter;
+
+ for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I)
+ Reserved.set(ReservedGPR32[I]);
+
+ // Reserve registers for the NaCl sandbox.
+ if (Subtarget.isTargetNaCl()) {
+ Reserved.set(Mips::T6); // Reserved for control flow mask.
+ Reserved.set(Mips::T7); // Reserved for memory access mask.
+ Reserved.set(Mips::T8); // Reserved for thread pointer.
+ }
+
+ for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I)
+ Reserved.set(ReservedGPR64[I]);
+
+ // For mno-abicalls, GP is a program invariant!
+ if (!Subtarget.isABICalls()) {
+ Reserved.set(Mips::GP);
+ Reserved.set(Mips::GP_64);
+ }
+
+ if (Subtarget.isFP64bit()) {
+ // Reserve all registers in AFGR64.
+ for (RegIter Reg = Mips::AFGR64RegClass.begin(),
+ EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
+ Reserved.set(*Reg);
+ } else {
+ // Reserve all registers in FGR64.
+ for (RegIter Reg = Mips::FGR64RegClass.begin(),
+ EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg)
+ Reserved.set(*Reg);
+ }
+ // Reserve FP if this function should have a dedicated frame pointer register.
+ if (Subtarget.getFrameLowering()->hasFP(MF)) {
+ if (Subtarget.inMips16Mode())
+ Reserved.set(Mips::S0);
+ else {
+ Reserved.set(Mips::FP);
+ Reserved.set(Mips::FP_64);
+
+ // Reserve the base register if we need to both realign the stack and
+ // allocate variable-sized objects at runtime. This should test the
+ // same conditions as MipsFrameLowering::hasBP().
+ if (needsStackRealignment(MF) &&
+ MF.getFrameInfo().hasVarSizedObjects()) {
+ Reserved.set(Mips::S7);
+ Reserved.set(Mips::S7_64);
+ }
+ }
+ }
+
+ // Reserve hardware register HWR29.
+ Reserved.set(Mips::HWR29);
+
+ // Reserve DSP control registers.
+ Reserved.set(Mips::DSPPos);
+ Reserved.set(Mips::DSPSCount);
+ Reserved.set(Mips::DSPCarry);
+ Reserved.set(Mips::DSPEFI);
+ Reserved.set(Mips::DSPOutFlag);
+
+ // Reserve MSA control registers.
+ Reserved.set(Mips::MSAIR);
+ Reserved.set(Mips::MSACSR);
+ Reserved.set(Mips::MSAAccess);
+ Reserved.set(Mips::MSASave);
+ Reserved.set(Mips::MSAModify);
+ Reserved.set(Mips::MSARequest);
+ Reserved.set(Mips::MSAMap);
+ Reserved.set(Mips::MSAUnmap);
+
+ // Reserve RA, T0 and T1 (plus S2 when saveS2 is set) if in mips16 mode.
+ if (Subtarget.inMips16Mode()) {
+ const MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ Reserved.set(Mips::RA);
+ Reserved.set(Mips::RA_64);
+ Reserved.set(Mips::T0);
+ Reserved.set(Mips::T1);
+ if (MF.getFunction()->hasFnAttribute("saveS2") || MipsFI->hasSaveS2())
+ Reserved.set(Mips::S2);
+ }
+
+ // Reserve GP if small section is used.
+ if (Subtarget.useSmallSection()) {
+ Reserved.set(Mips::GP);
+ Reserved.set(Mips::GP_64);
+ }
+
+ if (Subtarget.isABI_O32() && !Subtarget.useOddSPReg()) {
+ for (const auto &Reg : Mips::OddSPRegClass)
+ Reserved.set(Reg);
+ }
+
+ return Reserved;
+}
+
+bool
+MipsRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+ return true;
+}
+
+bool
+MipsRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ return true;
+}
+
+// FrameIndex represents an object inside an abstract stack.
+// We must replace each FrameIndex with a direct stack/frame
+// pointer reference.
+void MipsRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum, RegScavenger *RS) const {
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+
+ DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
+ errs() << "<--------->\n" << MI);
+
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ uint64_t stackSize = MF.getFrameInfo().getStackSize();
+ int64_t spOffset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+
+ DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
+ << "spOffset : " << spOffset << "\n"
+ << "stackSize : " << stackSize << "\n");
+
+ eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
+}
+
+unsigned MipsRegisterInfo::
+getFrameRegister(const MachineFunction &MF) const {
+ const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
+ const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+ bool IsN64 =
+ static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64();
+
+ if (Subtarget.inMips16Mode())
+ return TFI->hasFP(MF) ? Mips::S0 : Mips::SP;
+ else
+ return TFI->hasFP(MF) ? (IsN64 ? Mips::FP_64 : Mips::FP) :
+ (IsN64 ? Mips::SP_64 : Mips::SP);
+}
+
+bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ // Avoid realigning functions that explicitly do not want to be realigned.
+ // Normally, we should report an error when a function should be dynamically
+ // realigned but also has the attribute no-realign-stack. Unfortunately,
+ // with this attribute, MachineFrameInfo clamps each new object's alignment
+ // to that of the stack's alignment as specified by the ABI. As a result,
+ // the information of whether we have objects with larger alignment
+ // requirement than the stack's alignment is already lost at this point.
+ if (!TargetRegisterInfo::canRealignStack(MF))
+ return false;
+
+ const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
+ unsigned FP = Subtarget.isGP32bit() ? Mips::FP : Mips::FP_64;
+ unsigned BP = Subtarget.isGP32bit() ? Mips::S7 : Mips::S7_64;
+
+ // Support dynamic stack realignment only for targets with standard encoding.
+ if (!Subtarget.hasStandardEncoding())
+ return false;
+
+ // We can't perform dynamic stack realignment if we can't reserve the
+ // frame pointer register.
+ if (!MF.getRegInfo().canReserveReg(FP))
+ return false;
+
+ // We can realign the stack if we know the maximum call frame size and we
+ // don't have variable sized objects.
+ if (Subtarget.getFrameLowering()->hasReservedCallFrame(MF))
+ return true;
+
+ // We have to reserve the base pointer register in the presence of variable
+ // sized objects.
+ return MF.getRegInfo().canReserveReg(BP);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
new file mode 100644
index 000000000000..32f835e83108
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
@@ -0,0 +1,82 @@
+//===-- MipsRegisterInfo.h - Mips Register Information Impl -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSREGISTERINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSREGISTERINFO_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "MipsGenRegisterInfo.inc"
+
+namespace llvm {
+class MipsRegisterInfo : public MipsGenRegisterInfo {
+public:
+ enum class MipsPtrClass {
+ /// The default register class for integer values.
+ Default = 0,
+ /// The subset of registers permitted in certain microMIPS instructions
+ /// such as lw16.
+ GPR16MM = 1,
+ /// The stack pointer only.
+ StackPointer = 2,
+ /// The global pointer only.
+ GlobalPointer = 3,
+ };
+
+ MipsRegisterInfo();
+
+ /// Get PIC indirect call register
+ static unsigned getPICCallReg();
+
+ /// Code Generation virtual methods...
+ const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const override;
+
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+ static const uint32_t *getMips16RetHelperMask();
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ /// Stack Frame Processing Methods
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ // Stack realignment queries.
+ bool canRealignStack(const MachineFunction &MF) const override;
+
+ /// Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ /// \brief Return GPR register class.
+ virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
+
+private:
+ virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+ int FrameIndex, uint64_t StackSize,
+ int64_t SPOffset) const = 0;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
new file mode 100644
index 000000000000..8c82239ebbd3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
@@ -0,0 +1,673 @@
+//===-- MipsRegisterInfo.td - Mips Register defs -----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the MIPS register file
+//===----------------------------------------------------------------------===//
+let Namespace = "Mips" in {
+def sub_32 : SubRegIndex<32>;
+def sub_64 : SubRegIndex<64>;
+def sub_lo : SubRegIndex<32>;
+def sub_hi : SubRegIndex<32, 32>;
+def sub_dsp16_19 : SubRegIndex<4, 16>;
+def sub_dsp20 : SubRegIndex<1, 20>;
+def sub_dsp21 : SubRegIndex<1, 21>;
+def sub_dsp22 : SubRegIndex<1, 22>;
+def sub_dsp23 : SubRegIndex<1, 23>;
+}
+
+class Unallocatable {
+ bit isAllocatable = 0;
+}
+
+// We have banks of 32 registers each.
+class MipsReg<bits<16> Enc, string n> : Register<n> {
+ let HWEncoding = Enc;
+ let Namespace = "Mips";
+}
+
+class MipsRegWithSubRegs<bits<16> Enc, string n, list<Register> subregs>
+ : RegisterWithSubRegs<n, subregs> {
+ let HWEncoding = Enc;
+ let Namespace = "Mips";
+}
+
+// Mips CPU Registers
+class MipsGPRReg<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+// Mips 64-bit CPU Registers
+class Mips64GPRReg<bits<16> Enc, string n, list<Register> subregs>
+ : MipsRegWithSubRegs<Enc, n, subregs> {
+ let SubRegIndices = [sub_32];
+}
+
+// Mips 32-bit FPU Registers
+class FPR<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+// Mips 64-bit (aliased) FPU Registers
+class AFPR<bits<16> Enc, string n, list<Register> subregs>
+ : MipsRegWithSubRegs<Enc, n, subregs> {
+ let SubRegIndices = [sub_lo, sub_hi];
+ let CoveredBySubRegs = 1;
+}
+
+class AFPR64<bits<16> Enc, string n, list<Register> subregs>
+ : MipsRegWithSubRegs<Enc, n, subregs> {
+ let SubRegIndices = [sub_lo, sub_hi];
+ let CoveredBySubRegs = 1;
+}
+
+// Mips 128-bit (aliased) MSA Registers
+class AFPR128<bits<16> Enc, string n, list<Register> subregs>
+ : MipsRegWithSubRegs<Enc, n, subregs> {
+ let SubRegIndices = [sub_64];
+}
+
+// Accumulator Registers
+class ACCReg<bits<16> Enc, string n, list<Register> subregs>
+ : MipsRegWithSubRegs<Enc, n, subregs> {
+ let SubRegIndices = [sub_lo, sub_hi];
+ let CoveredBySubRegs = 1;
+}
+
+// Mips Hardware Registers
+class HWR<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Mips" in {
+ // General Purpose Registers
+ def ZERO : MipsGPRReg< 0, "zero">, DwarfRegNum<[0]>;
+ def AT : MipsGPRReg< 1, "1">, DwarfRegNum<[1]>;
+ def V0 : MipsGPRReg< 2, "2">, DwarfRegNum<[2]>;
+ def V1 : MipsGPRReg< 3, "3">, DwarfRegNum<[3]>;
+ def A0 : MipsGPRReg< 4, "4">, DwarfRegNum<[4]>;
+ def A1 : MipsGPRReg< 5, "5">, DwarfRegNum<[5]>;
+ def A2 : MipsGPRReg< 6, "6">, DwarfRegNum<[6]>;
+ def A3 : MipsGPRReg< 7, "7">, DwarfRegNum<[7]>;
+ def T0 : MipsGPRReg< 8, "8">, DwarfRegNum<[8]>;
+ def T1 : MipsGPRReg< 9, "9">, DwarfRegNum<[9]>;
+ def T2 : MipsGPRReg< 10, "10">, DwarfRegNum<[10]>;
+ def T3 : MipsGPRReg< 11, "11">, DwarfRegNum<[11]>;
+ def T4 : MipsGPRReg< 12, "12">, DwarfRegNum<[12]>;
+ def T5 : MipsGPRReg< 13, "13">, DwarfRegNum<[13]>;
+ def T6 : MipsGPRReg< 14, "14">, DwarfRegNum<[14]>;
+ def T7 : MipsGPRReg< 15, "15">, DwarfRegNum<[15]>;
+ def S0 : MipsGPRReg< 16, "16">, DwarfRegNum<[16]>;
+ def S1 : MipsGPRReg< 17, "17">, DwarfRegNum<[17]>;
+ def S2 : MipsGPRReg< 18, "18">, DwarfRegNum<[18]>;
+ def S3 : MipsGPRReg< 19, "19">, DwarfRegNum<[19]>;
+ def S4 : MipsGPRReg< 20, "20">, DwarfRegNum<[20]>;
+ def S5 : MipsGPRReg< 21, "21">, DwarfRegNum<[21]>;
+ def S6 : MipsGPRReg< 22, "22">, DwarfRegNum<[22]>;
+ def S7 : MipsGPRReg< 23, "23">, DwarfRegNum<[23]>;
+ def T8 : MipsGPRReg< 24, "24">, DwarfRegNum<[24]>;
+ def T9 : MipsGPRReg< 25, "25">, DwarfRegNum<[25]>;
+ def K0 : MipsGPRReg< 26, "26">, DwarfRegNum<[26]>;
+ def K1 : MipsGPRReg< 27, "27">, DwarfRegNum<[27]>;
+ def GP : MipsGPRReg< 28, "gp">, DwarfRegNum<[28]>;
+ def SP : MipsGPRReg< 29, "sp">, DwarfRegNum<[29]>;
+ def FP : MipsGPRReg< 30, "fp">, DwarfRegNum<[30]>;
+ def RA : MipsGPRReg< 31, "ra">, DwarfRegNum<[31]>;
+
+ // General Purpose 64-bit Registers
+ def ZERO_64 : Mips64GPRReg< 0, "zero", [ZERO]>, DwarfRegNum<[0]>;
+ def AT_64 : Mips64GPRReg< 1, "1", [AT]>, DwarfRegNum<[1]>;
+ def V0_64 : Mips64GPRReg< 2, "2", [V0]>, DwarfRegNum<[2]>;
+ def V1_64 : Mips64GPRReg< 3, "3", [V1]>, DwarfRegNum<[3]>;
+ def A0_64 : Mips64GPRReg< 4, "4", [A0]>, DwarfRegNum<[4]>;
+ def A1_64 : Mips64GPRReg< 5, "5", [A1]>, DwarfRegNum<[5]>;
+ def A2_64 : Mips64GPRReg< 6, "6", [A2]>, DwarfRegNum<[6]>;
+ def A3_64 : Mips64GPRReg< 7, "7", [A3]>, DwarfRegNum<[7]>;
+ def T0_64 : Mips64GPRReg< 8, "8", [T0]>, DwarfRegNum<[8]>;
+ def T1_64 : Mips64GPRReg< 9, "9", [T1]>, DwarfRegNum<[9]>;
+ def T2_64 : Mips64GPRReg< 10, "10", [T2]>, DwarfRegNum<[10]>;
+ def T3_64 : Mips64GPRReg< 11, "11", [T3]>, DwarfRegNum<[11]>;
+ def T4_64 : Mips64GPRReg< 12, "12", [T4]>, DwarfRegNum<[12]>;
+ def T5_64 : Mips64GPRReg< 13, "13", [T5]>, DwarfRegNum<[13]>;
+ def T6_64 : Mips64GPRReg< 14, "14", [T6]>, DwarfRegNum<[14]>;
+ def T7_64 : Mips64GPRReg< 15, "15", [T7]>, DwarfRegNum<[15]>;
+ def S0_64 : Mips64GPRReg< 16, "16", [S0]>, DwarfRegNum<[16]>;
+ def S1_64 : Mips64GPRReg< 17, "17", [S1]>, DwarfRegNum<[17]>;
+ def S2_64 : Mips64GPRReg< 18, "18", [S2]>, DwarfRegNum<[18]>;
+ def S3_64 : Mips64GPRReg< 19, "19", [S3]>, DwarfRegNum<[19]>;
+ def S4_64 : Mips64GPRReg< 20, "20", [S4]>, DwarfRegNum<[20]>;
+ def S5_64 : Mips64GPRReg< 21, "21", [S5]>, DwarfRegNum<[21]>;
+ def S6_64 : Mips64GPRReg< 22, "22", [S6]>, DwarfRegNum<[22]>;
+ def S7_64 : Mips64GPRReg< 23, "23", [S7]>, DwarfRegNum<[23]>;
+ def T8_64 : Mips64GPRReg< 24, "24", [T8]>, DwarfRegNum<[24]>;
+ def T9_64 : Mips64GPRReg< 25, "25", [T9]>, DwarfRegNum<[25]>;
+ def K0_64 : Mips64GPRReg< 26, "26", [K0]>, DwarfRegNum<[26]>;
+ def K1_64 : Mips64GPRReg< 27, "27", [K1]>, DwarfRegNum<[27]>;
+ def GP_64 : Mips64GPRReg< 28, "gp", [GP]>, DwarfRegNum<[28]>;
+ def SP_64 : Mips64GPRReg< 29, "sp", [SP]>, DwarfRegNum<[29]>;
+ def FP_64 : Mips64GPRReg< 30, "fp", [FP]>, DwarfRegNum<[30]>;
+ def RA_64 : Mips64GPRReg< 31, "ra", [RA]>, DwarfRegNum<[31]>;
+
+ /// Mips single-precision FPU registers
+ foreach I = 0-31 in
+ def F#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+
+ // Higher half of 64-bit FP registers.
+ foreach I = 0-31 in
+ def F_HI#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+
+ /// Mips double-precision FPU registers (aliased
+ /// with the single-precision registers to hold 64-bit values)
+ foreach I = 0-15 in
+ def D#I : AFPR<!shl(I, 1), "f"#!shl(I, 1),
+ [!cast<FPR>("F"#!shl(I, 1)),
+ !cast<FPR>("F"#!add(!shl(I, 1), 1))]>;
+
+ /// Mips double-precision FPU registers in MFP64 mode.
+ foreach I = 0-31 in
+ def D#I#_64 : AFPR64<I, "f"#I, [!cast<FPR>("F"#I), !cast<FPR>("F_HI"#I)]>,
+ DwarfRegNum<[!add(I, 32)]>;
+
+ /// Mips MSA registers
+ /// MSA and FPU cannot both be present unless the FPU has 64-bit registers
+ foreach I = 0-31 in
+ def W#I : AFPR128<I, "w"#I, [!cast<AFPR64>("D"#I#"_64")]>,
+ DwarfRegNum<[!add(I, 32)]>;
+
+ // Hi/Lo registers
+ def HI0 : MipsReg<0, "ac0">, DwarfRegNum<[64]>;
+ def HI1 : MipsReg<1, "ac1">, DwarfRegNum<[176]>;
+ def HI2 : MipsReg<2, "ac2">, DwarfRegNum<[178]>;
+ def HI3 : MipsReg<3, "ac3">, DwarfRegNum<[180]>;
+ def LO0 : MipsReg<0, "ac0">, DwarfRegNum<[65]>;
+ def LO1 : MipsReg<1, "ac1">, DwarfRegNum<[177]>;
+ def LO2 : MipsReg<2, "ac2">, DwarfRegNum<[179]>;
+ def LO3 : MipsReg<3, "ac3">, DwarfRegNum<[181]>;
+
+ let SubRegIndices = [sub_32] in {
+ def HI0_64 : RegisterWithSubRegs<"hi", [HI0]>;
+ def LO0_64 : RegisterWithSubRegs<"lo", [LO0]>;
+ }
+
+ // FP control registers.
+ foreach I = 0-31 in
+ def FCR#I : MipsReg<#I, ""#I>;
+
+ // FP condition code registers.
+ foreach I = 0-7 in
+ def FCC#I : MipsReg<#I, "fcc"#I>;
+
+ // COP0 registers.
+ foreach I = 0-31 in
+ def COP0#I : MipsReg<#I, ""#I>;
+
+ // COP2 registers.
+ foreach I = 0-31 in
+ def COP2#I : MipsReg<#I, ""#I>;
+
+ // COP3 registers.
+ foreach I = 0-31 in
+ def COP3#I : MipsReg<#I, ""#I>;
+
+ // PC register
+ def PC : Register<"pc">;
+
+ // Hardware registers
+ def HWR0 : MipsReg<0, "hwr_cpunum">;
+ def HWR1 : MipsReg<1, "hwr_synci_step">;
+ def HWR2 : MipsReg<2, "hwr_cc">;
+ def HWR3 : MipsReg<3, "hwr_ccres">;
+
+ foreach I = 4-31 in
+ def HWR#I : MipsReg<#I, ""#I>;
+
+ // Accum registers
+ foreach I = 0-3 in
+ def AC#I : ACCReg<#I, "ac"#I,
+ [!cast<Register>("LO"#I), !cast<Register>("HI"#I)]>;
+
+ def AC0_64 : ACCReg<0, "ac0", [LO0_64, HI0_64]>;
+
+ // DSP-ASE control register fields.
+ def DSPPos : Register<"">;
+ def DSPSCount : Register<"">;
+ def DSPCarry : Register<"">;
+ def DSPEFI : Register<"">;
+ def DSPOutFlag16_19 : Register<"">;
+ def DSPOutFlag20 : Register<"">;
+ def DSPOutFlag21 : Register<"">;
+ def DSPOutFlag22 : Register<"">;
+ def DSPOutFlag23 : Register<"">;
+ def DSPCCond : Register<"">;
+
+ let SubRegIndices = [sub_dsp16_19, sub_dsp20, sub_dsp21, sub_dsp22,
+ sub_dsp23] in
+ def DSPOutFlag : RegisterWithSubRegs<"", [DSPOutFlag16_19, DSPOutFlag20,
+ DSPOutFlag21, DSPOutFlag22,
+ DSPOutFlag23]>;
+
+ // MSA-ASE control registers.
+ def MSAIR : MipsReg<0, "0">;
+ def MSACSR : MipsReg<1, "1">;
+ def MSAAccess : MipsReg<2, "2">;
+ def MSASave : MipsReg<3, "3">;
+ def MSAModify : MipsReg<4, "4">;
+ def MSARequest : MipsReg<5, "5">;
+ def MSAMap : MipsReg<6, "6">;
+ def MSAUnmap : MipsReg<7, "7">;
+
+ // Octeon multiplier and product registers
+ def MPL0 : MipsReg<0, "mpl0">;
+ def MPL1 : MipsReg<1, "mpl1">;
+ def MPL2 : MipsReg<2, "mpl2">;
+ def P0 : MipsReg<0, "p0">;
+ def P1 : MipsReg<1, "p1">;
+ def P2 : MipsReg<2, "p2">;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+class GPR32Class<list<ValueType> regTypes> :
+ RegisterClass<"Mips", regTypes, 32, (add
+ // Reserved
+ ZERO, AT,
+ // Return Values and Arguments
+ V0, V1, A0, A1, A2, A3,
+ // Not preserved across procedure calls
+ T0, T1, T2, T3, T4, T5, T6, T7,
+ // Callee save
+ S0, S1, S2, S3, S4, S5, S6, S7,
+ // Not preserved across procedure calls
+ T8, T9,
+ // Reserved
+ K0, K1, GP, SP, FP, RA)>;
+
+def GPR32 : GPR32Class<[i32]>;
+def DSPR : GPR32Class<[v4i8, v2i16]>;
+
+def GPRMM16 : RegisterClass<"Mips", [i32], 32, (add
+ // Callee save
+ S0, S1,
+ // Return Values and Arguments
+ V0, V1, A0, A1, A2, A3)>;
+
+def GPRMM16Zero : RegisterClass<"Mips", [i32], 32, (add
+ // Reserved
+ ZERO,
+ // Callee save
+ S1,
+ // Return Values and Arguments
+ V0, V1, A0, A1, A2, A3)>;
+
+def GPRMM16MoveP : RegisterClass<"Mips", [i32], 32, (add
+ // Reserved
+ ZERO,
+ // Callee save
+ S1,
+ // Return Values and Arguments
+ V0, V1,
+ // Callee save
+ S0, S2, S3, S4)>;
+
+def GPR64 : RegisterClass<"Mips", [i64], 64, (add
+// Reserved
+ ZERO_64, AT_64,
+ // Return Values and Arguments
+ V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
+ // Not preserved across procedure calls
+ T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
+ // Callee save
+ S0_64, S1_64, S2_64, S3_64, S4_64, S5_64, S6_64, S7_64,
+ // Not preserved across procedure calls
+ T8_64, T9_64,
+ // Reserved
+ K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)>;
+
+def GPRMM16_64 : RegisterClass<"Mips", [i64], 64, (add
+ // Callee save
+ S0_64, S1_64,
+ // Return Values and Arguments
+ V0_64, V1_64, A0_64, A1_64, A2_64, A3_64)>;
+
+def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add
+ // Return Values and Arguments
+ V0, V1, A0, A1, A2, A3,
+ // Callee save
+ S0, S1)>;
+
+def CPU16RegsPlusSP : RegisterClass<"Mips", [i32], 32, (add
+ // Return Values and Arguments
+ V0, V1, A0, A1, A2, A3,
+ // Callee save
+ S0, S1,
+ SP)>;
+
+def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>, Unallocatable;
+
+def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
+
+// 64bit fp:
+// * FGR64 - 32 64-bit registers
+// * AFGR64 - 16 64-bit registers, each an even/odd 32-bit pair (32-bit FP Mode)
+//
+// 32bit fp:
+// * FGR32 - 16 32-bit even registers
+// * FGR32 - 32 32-bit registers (single float only mode)
+def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
+
+def FGRH32 : RegisterClass<"Mips", [f32], 32, (sequence "F_HI%u", 0, 31)>,
+ Unallocatable;
+
+def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
+ // Return Values and Arguments
+ D0, D1,
+ // Not preserved across procedure calls
+ D2, D3, D4, D5,
+ // Return Values and Arguments
+ D6, D7,
+ // Not preserved across procedure calls
+ D8, D9,
+ // Callee save
+ D10, D11, D12, D13, D14, D15)>;
+
+def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
+
+// Used to reserve odd registers when given -mattr=+nooddspreg
+// FIXME: Remove double precision registers from this set.
+def OddSP : RegisterClass<"Mips", [f32], 32,
+ (add (decimate (sequence "F%u", 1, 31), 2),
+ (decimate (sequence "F_HI%u", 1, 31), 2),
+ (decimate (sequence "D%u", 1, 15), 2),
+ (decimate (sequence "D%u_64", 1, 31), 2))>,
+ Unallocatable;
+
+// FP control registers.
+def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>,
+ Unallocatable;
+
+// FP condition code registers.
+def FCC : RegisterClass<"Mips", [i32], 32, (sequence "FCC%u", 0, 7)>,
+ Unallocatable;
+
+// MIPS32r6/MIPS64r6 store FPU condition codes in normal FGR registers.
+// This class allows us to represent this in codegen patterns.
+def FGRCC : RegisterClass<"Mips", [i32], 32, (sequence "F%u", 0, 31)>;
+
+def MSA128F16 : RegisterClass<"Mips", [f16], 128, (sequence "W%u", 0, 31)>;
+
+def MSA128B: RegisterClass<"Mips", [v16i8], 128,
+ (sequence "W%u", 0, 31)>;
+def MSA128H: RegisterClass<"Mips", [v8i16, v8f16], 128,
+ (sequence "W%u", 0, 31)>;
+def MSA128W: RegisterClass<"Mips", [v4i32, v4f32], 128,
+ (sequence "W%u", 0, 31)>;
+def MSA128D: RegisterClass<"Mips", [v2i64, v2f64], 128,
+ (sequence "W%u", 0, 31)>;
+def MSA128WEvens: RegisterClass<"Mips", [v4i32, v4f32], 128,
+ (decimate (sequence "W%u", 0, 31), 2)>;
+
+def MSACtrl: RegisterClass<"Mips", [i32], 32, (add
+ MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;
+
+// Hi/Lo Registers
+def LO32 : RegisterClass<"Mips", [i32], 32, (add LO0)>;
+def HI32 : RegisterClass<"Mips", [i32], 32, (add HI0)>;
+def LO32DSP : RegisterClass<"Mips", [i32], 32, (sequence "LO%u", 0, 3)>;
+def HI32DSP : RegisterClass<"Mips", [i32], 32, (sequence "HI%u", 0, 3)>;
+def LO64 : RegisterClass<"Mips", [i64], 64, (add LO0_64)>;
+def HI64 : RegisterClass<"Mips", [i64], 64, (add HI0_64)>;
+
+// Hardware registers
+def HWRegs : RegisterClass<"Mips", [i32], 32, (sequence "HWR%u", 0, 31)>,
+ Unallocatable;
+
+// Accumulator Registers
+def ACC64 : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
+ let Size = 64;
+}
+
+def ACC128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
+ let Size = 128;
+}
+
+def ACC64DSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
+ let Size = 64;
+}
+
+def DSPCC : RegisterClass<"Mips", [v4i8, v2i16], 32, (add DSPCCond)>;
+
+// Coprocessor 0 registers.
+def COP0 : RegisterClass<"Mips", [i32], 32, (sequence "COP0%u", 0, 31)>,
+ Unallocatable;
+
+// Coprocessor 2 registers.
+def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>,
+ Unallocatable;
+
+// Coprocessor 3 registers.
+def COP3 : RegisterClass<"Mips", [i32], 32, (sequence "COP3%u", 0, 31)>,
+ Unallocatable;
+
+// Stack pointer and global pointer classes for instructions that are limited
+// to a single register such as lwgp/lwsp in microMIPS.
+def SP32 : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
+def SP64 : RegisterClass<"Mips", [i64], 64, (add SP_64)>, Unallocatable;
+def GP32 : RegisterClass<"Mips", [i32], 32, (add GP)>, Unallocatable;
+def GP64 : RegisterClass<"Mips", [i64], 64, (add GP_64)>, Unallocatable;
+
+// Octeon multiplier and product registers
+def OCTEON_MPL : RegisterClass<"Mips", [i64], 64, (add MPL0, MPL1, MPL2)>,
+ Unallocatable;
+def OCTEON_P : RegisterClass<"Mips", [i64], 64, (add P0, P1, P2)>,
+ Unallocatable;
+
+// Register Operands.
+
+class MipsAsmRegOperand : AsmOperandClass {
+ let ParserMethod = "parseAnyRegister";
+}
+
+def GPR64AsmOperand : MipsAsmRegOperand {
+ let Name = "GPR64AsmReg";
+ let PredicateMethod = "isGPRAsmReg";
+}
+
+def GPR32AsmOperand : MipsAsmRegOperand {
+ let Name = "GPR32AsmReg";
+ let PredicateMethod = "isGPRAsmReg";
+}
+
+def GPRMM16AsmOperand : MipsAsmRegOperand {
+ let Name = "GPRMM16AsmReg";
+ let PredicateMethod = "isMM16AsmReg";
+}
+
+def GPRMM16AsmOperandZero : MipsAsmRegOperand {
+ let Name = "GPRMM16AsmRegZero";
+ let PredicateMethod = "isMM16AsmRegZero";
+}
+
+def GPRMM16AsmOperandMoveP : MipsAsmRegOperand {
+ let Name = "GPRMM16AsmRegMoveP";
+ let PredicateMethod = "isMM16AsmRegMoveP";
+}
+
+def ACC64DSPAsmOperand : MipsAsmRegOperand {
+ let Name = "ACC64DSPAsmReg";
+ let PredicateMethod = "isACCAsmReg";
+}
+
+def HI32DSPAsmOperand : MipsAsmRegOperand {
+ let Name = "HI32DSPAsmReg";
+ let PredicateMethod = "isACCAsmReg";
+}
+
+def LO32DSPAsmOperand : MipsAsmRegOperand {
+ let Name = "LO32DSPAsmReg";
+ let PredicateMethod = "isACCAsmReg";
+}
+
+def CCRAsmOperand : MipsAsmRegOperand {
+ let Name = "CCRAsmReg";
+}
+
+def AFGR64AsmOperand : MipsAsmRegOperand {
+ let Name = "AFGR64AsmReg";
+ let PredicateMethod = "isFGRAsmReg";
+}
+
+def FGR64AsmOperand : MipsAsmRegOperand {
+ let Name = "FGR64AsmReg";
+ let PredicateMethod = "isFGRAsmReg";
+}
+
+def FGR32AsmOperand : MipsAsmRegOperand {
+ let Name = "FGR32AsmReg";
+ let PredicateMethod = "isFGRAsmReg";
+}
+
+def FGRH32AsmOperand : MipsAsmRegOperand {
+ let Name = "FGRH32AsmReg";
+ let PredicateMethod = "isFGRAsmReg";
+}
+
+def FCCRegsAsmOperand : MipsAsmRegOperand {
+ let Name = "FCCAsmReg";
+}
+
+def MSA128AsmOperand : MipsAsmRegOperand {
+ let Name = "MSA128AsmReg";
+}
+
+def MSACtrlAsmOperand : MipsAsmRegOperand {
+ let Name = "MSACtrlAsmReg";
+}
+
+def GPR32Opnd : RegisterOperand<GPR32> {
+ let ParserMatchClass = GPR32AsmOperand;
+}
+
+def GPRMM16Opnd : RegisterOperand<GPRMM16> {
+ let ParserMatchClass = GPRMM16AsmOperand;
+}
+
+def GPRMM16OpndZero : RegisterOperand<GPRMM16Zero> {
+ let ParserMatchClass = GPRMM16AsmOperandZero;
+}
+
+def GPRMM16OpndMoveP : RegisterOperand<GPRMM16MoveP> {
+ let ParserMatchClass = GPRMM16AsmOperandMoveP;
+}
+
+def GPR64Opnd : RegisterOperand<GPR64> {
+ let ParserMatchClass = GPR64AsmOperand;
+}
+
+def DSPROpnd : RegisterOperand<DSPR> {
+ let ParserMatchClass = GPR32AsmOperand;
+}
+
+def CCROpnd : RegisterOperand<CCR> {
+ let ParserMatchClass = CCRAsmOperand;
+}
+
+def HWRegsAsmOperand : MipsAsmRegOperand {
+ let Name = "HWRegsAsmReg";
+}
+
+def COP0AsmOperand : MipsAsmRegOperand {
+ let Name = "COP0AsmReg";
+}
+
+def COP2AsmOperand : MipsAsmRegOperand {
+ let Name = "COP2AsmReg";
+}
+
+def COP3AsmOperand : MipsAsmRegOperand {
+ let Name = "COP3AsmReg";
+}
+
+def HWRegsOpnd : RegisterOperand<HWRegs> {
+ let ParserMatchClass = HWRegsAsmOperand;
+}
+
+def AFGR64Opnd : RegisterOperand<AFGR64> {
+ let ParserMatchClass = AFGR64AsmOperand;
+}
+
+def FGR64Opnd : RegisterOperand<FGR64> {
+ let ParserMatchClass = FGR64AsmOperand;
+}
+
+def FGR32Opnd : RegisterOperand<FGR32> {
+ let ParserMatchClass = FGR32AsmOperand;
+}
+
+def FGRCCOpnd : RegisterOperand<FGRCC> {
+ // The assembler doesn't use register classes so we can re-use
+ // FGR32AsmOperand.
+ let ParserMatchClass = FGR32AsmOperand;
+}
+
+def FGRH32Opnd : RegisterOperand<FGRH32> {
+ let ParserMatchClass = FGRH32AsmOperand;
+}
+
+def FCCRegsOpnd : RegisterOperand<FCC> {
+ let ParserMatchClass = FCCRegsAsmOperand;
+}
+
+def LO32DSPOpnd : RegisterOperand<LO32DSP> {
+ let ParserMatchClass = LO32DSPAsmOperand;
+}
+
+def HI32DSPOpnd : RegisterOperand<HI32DSP> {
+ let ParserMatchClass = HI32DSPAsmOperand;
+}
+
+def ACC64DSPOpnd : RegisterOperand<ACC64DSP> {
+ let ParserMatchClass = ACC64DSPAsmOperand;
+}
+
+def COP0Opnd : RegisterOperand<COP0> {
+ let ParserMatchClass = COP0AsmOperand;
+}
+
+def COP2Opnd : RegisterOperand<COP2> {
+ let ParserMatchClass = COP2AsmOperand;
+}
+
+def COP3Opnd : RegisterOperand<COP3> {
+ let ParserMatchClass = COP3AsmOperand;
+}
+
+def MSA128F16Opnd : RegisterOperand<MSA128F16> {
+ let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128BOpnd : RegisterOperand<MSA128B> {
+ let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128HOpnd : RegisterOperand<MSA128H> {
+ let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128WOpnd : RegisterOperand<MSA128W> {
+ let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128DOpnd : RegisterOperand<MSA128D> {
+ let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128CROpnd : RegisterOperand<MSACtrl> {
+ let ParserMatchClass = MSACtrlAsmOperand;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
new file mode 100644
index 000000000000..4996d070eb29
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -0,0 +1,893 @@
+//===-- MipsSEFrameLowering.cpp - Mips32/64 Frame Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEFrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+namespace {
+typedef MachineBasicBlock::iterator Iter;
+
+/// Pick the move-from-HI / move-from-LO opcodes for the accumulator \p Src.
+///
+/// \returns a (MFHI opcode, MFLO opcode) pair selected by the register class
+/// that contains \p Src (ACC64, ACC64DSP or ACC128, tested in that order), or
+/// (0, 0) when \p Src belongs to none of them.
+static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
+  unsigned HiOpc = 0;
+  unsigned LoOpc = 0;
+
+  if (Mips::ACC64RegClass.contains(Src)) {
+    HiOpc = Mips::PseudoMFHI;
+    LoOpc = Mips::PseudoMFLO;
+  } else if (Mips::ACC64DSPRegClass.contains(Src)) {
+    HiOpc = Mips::MFHI_DSP;
+    LoOpc = Mips::MFLO_DSP;
+  } else if (Mips::ACC128RegClass.contains(Src)) {
+    HiOpc = Mips::PseudoMFHI64;
+    LoOpc = Mips::PseudoMFLO64;
+  }
+
+  return std::pair<unsigned, unsigned>(HiOpc, LoOpc);
+}
+
+/// Helper that walks a MachineFunction and expands the Mips pseudo
+/// instructions (and accumulator COPYs) handled by expandInstr.
+class ExpandPseudo {
+public:
+  ExpandPseudo(MachineFunction &MF);
+
+  /// Run expandInstr over every instruction of the function.
+  /// \returns true if expandInstr returned true for at least one of them.
+  bool expand();
+
+private:
+  /// Dispatch on the opcode of \p I and expand it if it is a handled pseudo.
+  bool expandInstr(MachineBasicBlock &MBB, Iter I);
+
+  // Condition-code register load/store through a GPR temporary.
+  void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
+  void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
+
+  // Accumulator load/store, done as two halves of RegSize bytes each.
+  void expandLoadACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
+  void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                      unsigned MFLoOpc, unsigned RegSize);
+
+  // COPY whose source is an accumulator register.
+  bool expandCopy(MachineBasicBlock &MBB, Iter I);
+  bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                     unsigned MFLoOpc);
+
+  // F64 pair build / element extraction via a stack slot.
+  bool expandBuildPairF64(MachineBasicBlock &MBB, Iter I, bool FP64) const;
+  bool expandExtractElementF64(MachineBasicBlock &MBB, Iter I,
+                               bool FP64) const;
+
+  // Cached context; initialised in this order by the constructor.
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  const MipsSubtarget &Subtarget;
+  const MipsSEInstrInfo &TII;
+  const MipsRegisterInfo &RegInfo;
+};
+}
+
+/// Cache the function, its MachineRegisterInfo, and the Mips subtarget,
+/// instruction info and register info used by the expansion routines.
+ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
+    : MF(MF_),
+      MRI(MF_.getRegInfo()),
+      Subtarget(static_cast<const MipsSubtarget &>(MF_.getSubtarget())),
+      TII(*static_cast<const MipsSEInstrInfo *>(Subtarget.getInstrInfo())),
+      RegInfo(*Subtarget.getRegisterInfo()) {}
+
+/// Visit every instruction of every block and try to expand it.
+/// \returns true if any call to expandInstr returned true.
+bool ExpandPseudo::expand() {
+  bool Changed = false;
+
+  for (MachineBasicBlock &Block : MF) {
+    Iter Next = Block.begin();
+    const Iter BlockEnd = Block.end();
+
+    while (Next != BlockEnd) {
+      // Step past the current instruction first: expandInstr may erase it.
+      Iter Cur = Next;
+      ++Next;
+      if (expandInstr(Block, Cur))
+        Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+/// Expand the instruction at \p I if it is one of the handled pseudos.
+///
+/// \returns true when a CCOND/ACC load or store pseudo, or an accumulator
+/// COPY, was expanded and erased. The F64 pseudos always yield false, even
+/// when they are expanded and erased.
+bool ExpandPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) {
+  const unsigned Opc = I->getOpcode();
+
+  // BuildPairF64 / ExtractElementF64 (and their _64 forms): the pseudo is
+  // erased only if the helper expanded it, and false is returned either way.
+  const bool IsBuildPair =
+      Opc == Mips::BuildPairF64 || Opc == Mips::BuildPairF64_64;
+  const bool IsExtractElt =
+      Opc == Mips::ExtractElementF64 || Opc == Mips::ExtractElementF64_64;
+  if (IsBuildPair || IsExtractElt) {
+    const bool FP64 =
+        Opc == Mips::BuildPairF64_64 || Opc == Mips::ExtractElementF64_64;
+    const bool Replaced = IsBuildPair
+                              ? expandBuildPairF64(MBB, I, FP64)
+                              : expandExtractElementF64(MBB, I, FP64);
+    if (Replaced)
+      MBB.erase(I);
+    return false;
+  }
+
+  switch (Opc) {
+  default:
+    return false;
+  case TargetOpcode::COPY:
+    if (!expandCopy(MBB, I))
+      return false;
+    break;
+  case Mips::LOAD_CCOND_DSP:
+    expandLoadCCond(MBB, I);
+    break;
+  case Mips::STORE_CCOND_DSP:
+    expandStoreCCond(MBB, I);
+    break;
+  case Mips::LOAD_ACC64:
+  case Mips::LOAD_ACC64DSP:
+    expandLoadACC(MBB, I, 4);
+    break;
+  case Mips::LOAD_ACC128:
+    expandLoadACC(MBB, I, 8);
+    break;
+  case Mips::STORE_ACC64:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI, Mips::PseudoMFLO, 4);
+    break;
+  case Mips::STORE_ACC64DSP:
+    expandStoreACC(MBB, I, Mips::MFHI_DSP, Mips::MFLO_DSP, 4);
+    break;
+  case Mips::STORE_ACC128:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI64, Mips::PseudoMFLO64, 8);
+    break;
+  }
+
+  // The replacement sequence has been inserted before I; drop the original.
+  MBB.erase(I);
+  return true;
+}
+
+/// Expand a condition-code load pseudo into
+///   load $vr, FI
+///   copy ccond, $vr
+/// where $vr is a fresh 4-byte integer virtual register.
+void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) {
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *IntRC = RegInfo.intRegClass(4);
+  unsigned Tmp = MRI.createVirtualRegister(IntRC);
+  unsigned CCondReg = I->getOperand(0).getReg();
+  unsigned Slot = I->getOperand(1).getIndex();
+
+  TII.loadRegFromStack(MBB, I, Tmp, Slot, IntRC, &RegInfo, 0);
+
+  // The temporary dies in the copy.
+  MachineInstrBuilder Copy = BuildMI(MBB, I, I->getDebugLoc(),
+                                     TII.get(TargetOpcode::COPY), CCondReg);
+  Copy.addReg(Tmp, RegState::Kill);
+}
+
+/// Expand a condition-code store pseudo into
+///   copy $vr, ccond
+///   store $vr, FI
+/// where $vr is a fresh 4-byte integer virtual register.
+void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) {
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *IntRC = RegInfo.intRegClass(4);
+  unsigned Tmp = MRI.createVirtualRegister(IntRC);
+  unsigned CCondReg = I->getOperand(0).getReg();
+  unsigned Slot = I->getOperand(1).getIndex();
+
+  // Carry the source operand's kill flag over to the copy.
+  unsigned CCondKill = getKillRegState(I->getOperand(0).isKill());
+  MachineInstrBuilder Copy =
+      BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Tmp);
+  Copy.addReg(CCondReg, CCondKill);
+
+  // The temporary is killed by the store.
+  TII.storeRegToStack(MBB, I, Tmp, true, Slot, IntRC, &RegInfo, 0);
+}
+
+/// Expand an accumulator load pseudo into
+///   load $vr0, FI
+///   copy lo, $vr0
+///   load $vr1, FI + RegSize
+///   copy hi, $vr1
+/// \p RegSize is the size in bytes of each half.
+void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
+                                 unsigned RegSize) {
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *HalfRC = RegInfo.intRegClass(RegSize);
+  unsigned LoTmp = MRI.createVirtualRegister(HalfRC);
+  unsigned HiTmp = MRI.createVirtualRegister(HalfRC);
+
+  unsigned Acc = I->getOperand(0).getReg();
+  unsigned Slot = I->getOperand(1).getIndex();
+  unsigned AccLo = RegInfo.getSubReg(Acc, Mips::sub_lo);
+  unsigned AccHi = RegInfo.getSubReg(Acc, Mips::sub_hi);
+
+  DebugLoc Loc = I->getDebugLoc();
+  const MCInstrDesc &CopyDesc = TII.get(TargetOpcode::COPY);
+
+  // Low half lives at offset 0 of the slot.
+  TII.loadRegFromStack(MBB, I, LoTmp, Slot, HalfRC, &RegInfo, 0);
+  BuildMI(MBB, I, Loc, CopyDesc, AccLo).addReg(LoTmp, RegState::Kill);
+
+  // High half follows at offset RegSize.
+  TII.loadRegFromStack(MBB, I, HiTmp, Slot, HalfRC, &RegInfo, RegSize);
+  BuildMI(MBB, I, Loc, CopyDesc, AccHi).addReg(HiTmp, RegState::Kill);
+}
+
+/// Expand an accumulator store pseudo into
+///   mflo $vr0, src
+///   store $vr0, FI
+///   mfhi $vr1, src
+///   store $vr1, FI + RegSize
+/// \p MFHiOpc / \p MFLoOpc are the opcodes used to read the two halves and
+/// \p RegSize is the size in bytes of each half.
+void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
+                                  unsigned MFHiOpc, unsigned MFLoOpc,
+                                  unsigned RegSize) {
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *HalfRC = RegInfo.intRegClass(RegSize);
+  unsigned LoTmp = MRI.createVirtualRegister(HalfRC);
+  unsigned HiTmp = MRI.createVirtualRegister(HalfRC);
+
+  unsigned Acc = I->getOperand(0).getReg();
+  unsigned Slot = I->getOperand(1).getIndex();
+  unsigned AccKill = getKillRegState(I->getOperand(0).isKill());
+  DebugLoc Loc = I->getDebugLoc();
+
+  // Low half: the accumulator is still needed afterwards, so no kill flag.
+  BuildMI(MBB, I, Loc, TII.get(MFLoOpc), LoTmp).addReg(Acc);
+  TII.storeRegToStack(MBB, I, LoTmp, true, Slot, HalfRC, &RegInfo, 0);
+
+  // High half: last read of the accumulator, so it inherits the kill flag.
+  BuildMI(MBB, I, Loc, TII.get(MFHiOpc), HiTmp).addReg(Acc, AccKill);
+  TII.storeRegToStack(MBB, I, HiTmp, true, Slot, HalfRC, &RegInfo, RegSize);
+}
+
+/// Expand a COPY whose source is an accumulator register.
+/// \returns false, leaving the COPY untouched, when getMFHiLoOpc has no
+/// opcodes for the source register; otherwise the result of expandCopyACC.
+bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) {
+  const std::pair<unsigned, unsigned> HiLoOpc =
+      getMFHiLoOpc(I->getOperand(1).getReg());
+  const unsigned MFHiOpc = HiLoOpc.first;
+  const unsigned MFLoOpc = HiLoOpc.second;
+
+  if (MFHiOpc == 0)
+    return false;
+
+  return expandCopyACC(MBB, I, MFHiOpc, MFLoOpc);
+}
+
+/// Expand an accumulator COPY into
+///   mflo $vr0, src
+///   copy dst_lo, $vr0
+///   mfhi $vr1, src
+///   copy dst_hi, $vr1
+/// The temporaries are half the size of the destination's minimal physical
+/// register class. \returns true unconditionally.
+bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
+                                 unsigned MFHiOpc, unsigned MFLoOpc) {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned SrcReg = I->getOperand(1).getReg();
+
+  unsigned HalfSize = RegInfo.getMinimalPhysRegClass(DstReg)->getSize() / 2;
+  const TargetRegisterClass *HalfRC = RegInfo.intRegClass(HalfSize);
+  unsigned LoTmp = MRI.createVirtualRegister(HalfRC);
+  unsigned HiTmp = MRI.createVirtualRegister(HalfRC);
+
+  unsigned SrcKillFlag = getKillRegState(I->getOperand(1).isKill());
+  unsigned LoHalf = RegInfo.getSubReg(DstReg, Mips::sub_lo);
+  unsigned HiHalf = RegInfo.getSubReg(DstReg, Mips::sub_hi);
+  DebugLoc Loc = I->getDebugLoc();
+  const MCInstrDesc &CopyDesc = TII.get(TargetOpcode::COPY);
+
+  // Low half first; the source stays live for the second read.
+  BuildMI(MBB, I, Loc, TII.get(MFLoOpc), LoTmp).addReg(SrcReg);
+  BuildMI(MBB, I, Loc, CopyDesc, LoHalf).addReg(LoTmp, RegState::Kill);
+
+  // High half; this read carries the original source kill flag.
+  BuildMI(MBB, I, Loc, TII.get(MFHiOpc), HiTmp).addReg(SrcReg, SrcKillFlag);
+  BuildMI(MBB, I, Loc, CopyDesc, HiHalf).addReg(HiTmp, RegState::Kill);
+
+  return true;
+}
+
+/// This method expands the same instruction that MipsSEInstrInfo::
+/// expandBuildPairF64 does, for the case when ABI is fpxx and mthc1 is not
+/// available and the case where the ABI is FP64A. It is implemented here
+/// because frame indexes are eliminated before MipsSEInstrInfo::
+/// expandBuildPairF64 is called.
+///
+/// \param FP64 selects FGR64 (true) or AFGR64 (false) as the class of the
+///        destination register.
+/// \returns true if the spill/reload sequence was emitted before \p I (the
+///          caller then erases \p I), false if nothing was done.
+bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      bool FP64) const {
+  // For fpxx and when mthc1 is not available, use:
+  //   spill + reload via ldc1
+  //
+  // The case where dmtc1 is available doesn't need to be handled here
+  // because it never creates a BuildPairF64 node.
+  //
+  // The FP64A ABI (fp64 with nooddspreg) must also use a spill/reload sequence
+  // for odd-numbered double precision values (because the lower 32 bits are
+  // transferred with mtc1 which is redirected to the upper half of the even
+  // register). Unfortunately, we have to make this decision before register
+  // allocation so for now we use a spill/reload sequence for all
+  // double-precision values regardless of being an odd/even register.
+  if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
+      (FP64 && !Subtarget.useOddSPReg())) {
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned LoReg = I->getOperand(1).getReg();
+    unsigned HiReg = I->getOperand(2).getReg();
+    bool LoKill = I->getOperand(1).isKill();
+    bool HiKill = I->getOperand(2).isKill();
+
+    // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+    // the cases where mthc1 is not available). 64-bit architectures and
+    // MIPS32r2 or later can use FGR64 though.
+    assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+           !Subtarget.isFP64bit());
+
+    const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+    const TargetRegisterClass *RC2 =
+        FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+
+    // We re-use the same spill slot each time so that the stack frame doesn't
+    // grow too much in functions with a large number of moves.
+    int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC2);
+
+    // On big-endian targets the two words trade places in the slot. The kill
+    // flags must travel with their registers, otherwise each store would be
+    // tagged with the other operand's liveness.
+    if (!Subtarget.isLittle()) {
+      std::swap(LoReg, HiReg);
+      std::swap(LoKill, HiKill);
+    }
+    TII.storeRegToStack(MBB, I, LoReg, LoKill, FI, RC, &RegInfo, 0);
+    TII.storeRegToStack(MBB, I, HiReg, HiKill, FI, RC, &RegInfo, 4);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, 0);
+    return true;
+  }
+
+  return false;
+}
+
+/// Counterpart of MipsSEInstrInfo::expandExtractElementF64 for the cases
+/// that need a stack slot: the fpxx ABI without mfhc1, and the FP64A ABI. It
+/// is done here because frame indexes are eliminated before
+/// MipsSEInstrInfo::expandExtractElementF64 is called.
+///
+/// \param FP64 selects FGR64 (true) or AFGR64 (false) as the class of the
+///        source register.
+/// \returns true if replacement code was emitted before \p I (the caller
+///          then erases \p I), false if nothing was done.
+bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator I,
+                                           bool FP64) const {
+  const MachineOperand &SrcOp = I->getOperand(1);
+  const MachineOperand &EltOp = I->getOperand(2);
+
+  // An undef register operand makes the result undefined as well.
+  const bool SrcIsUndef = SrcOp.isReg() && SrcOp.isUndef();
+  const bool EltIsUndef = EltOp.isReg() && EltOp.isUndef();
+  if (SrcIsUndef || EltIsUndef) {
+    unsigned UndefDst = I->getOperand(0).getReg();
+    BuildMI(MBB, I, I->getDebugLoc(), TII.get(Mips::IMPLICIT_DEF), UndefDst);
+    return true;
+  }
+
+  // With fpxx and no mfhc1 the element is fetched by spilling the double and
+  // reloading one word of it. When dmfc1 is available no ExtractElementF64
+  // node is created, so that case never reaches this point.
+  //
+  // FP64A (fp64 with nooddspreg) needs the same sequence for odd-numbered
+  // double precision values, because mfc1 on the lower 32 bits is redirected
+  // to the upper half of the even register. The choice has to be made before
+  // register allocation, so every double-precision value takes this path
+  // whether its register ends up odd or even.
+  const bool FPXXWithoutMFHC1 =
+      Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1();
+  const bool IsFP64A = FP64 && !Subtarget.useOddSPReg();
+  if (!FPXXWithoutMFHC1 && !IsFP64A)
+    return false;
+
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned SrcReg = SrcOp.getReg();
+  unsigned N = EltOp.getImm();
+
+  // Word N of the double sits at byte 4*N on little-endian targets and at
+  // 4*(1-N) on big-endian ones.
+  unsigned WordIdx = Subtarget.isLittle() ? N : (1 - N);
+  int64_t Offset = 4 * WordIdx;
+
+  // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+  // the cases where mfhc1 is not available). 64-bit architectures and
+  // MIPS32r2 or later can use FGR64 though.
+  assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+         !Subtarget.isFP64bit());
+
+  const TargetRegisterClass *FPRC =
+      FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+  const TargetRegisterClass *GPRC = &Mips::GPR32RegClass;
+
+  // The same spill slot is shared by all such moves so the frame does not
+  // grow with the number of moves in the function.
+  int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(FPRC);
+  TII.storeRegToStack(MBB, I, SrcReg, SrcOp.isKill(), FI, FPRC, &RegInfo, 0);
+  TII.loadRegFromStack(MBB, I, DstReg, FI, GPRC, &RegInfo, Offset);
+  return true;
+}
+
+/// Construct the frame lowering for \p STI, forwarding the subtarget's stack
+/// alignment to the MipsFrameLowering base.
+MipsSEFrameLowering::MipsSEFrameLowering(const MipsSubtarget &STI)
+    : MipsFrameLowering(STI, STI.stackAlignment()) {
+}
+
+/// Emit the function prologue into \p MBB, which must be the entry block of
+/// \p MF.
+///
+/// In order, this: adjusts the stack pointer by the frame size and records
+/// the CFA offset; emits the interrupt stub for functions carrying the
+/// "interrupt" attribute; emits .cfi_offset directives for the callee-saved
+/// registers; spills the eh data registers if the function calls eh_return;
+/// and, when a frame pointer is used, sets it up, realigning the stack and
+/// setting up the base pointer where required. Nothing is emitted when the
+/// stack size is zero and the function does not adjust the stack.
+void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
+ const MipsRegisterInfo &RegInfo =
+ *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
+
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc dl;
+ // Registers and opcodes are taken from the ABI description.
+ MipsABIInfo ABI = STI.getABI();
+ unsigned SP = ABI.GetStackPtr();
+ unsigned FP = ABI.GetFramePtr();
+ unsigned ZERO = ABI.GetNullPtr();
+ unsigned MOVE = ABI.GetGPRMoveOp();
+ unsigned ADDiu = ABI.GetPtrAddiuOp();
+ unsigned AND = ABI.IsN64() ? Mips::AND64 : Mips::AND;
+
+ const TargetRegisterClass *RC = ABI.ArePtrs64bit() ?
+ &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+ // First, compute final stack size.
+ uint64_t StackSize = MFI.getStackSize();
+
+ // No need to allocate space on the stack.
+ if (StackSize == 0 && !MFI.adjustsStack()) return;
+
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ MachineLocation DstML, SrcML;
+
+ // Adjust stack.
+ TII.adjustStackPtr(SP, -StackSize, MBB, MBBI);
+
+ // emit ".cfi_def_cfa_offset StackSize"
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Interrupt handlers get an extra stub; it is inserted at the start of the
+ // block by emitInterruptPrologueStub.
+ if (MF.getFunction()->hasFnAttribute("interrupt"))
+ emitInterruptPrologueStub(MF, MBB);
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+ if (CSI.size()) {
+ // Find the instruction past the last instruction that saves a callee-saved
+ // register to the stack.
+ // NOTE(review): this steps over exactly one instruction per CSI entry -
+ // confirm that every callee-saved spill is a single instruction.
+ for (unsigned i = 0; i < CSI.size(); ++i)
+ ++MBBI;
+
+ // Iterate over list of callee-saved registers and emit .cfi_offset
+ // directives.
+ for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+ E = CSI.end(); I != E; ++I) {
+ int64_t Offset = MFI.getObjectOffset(I->getFrameIdx());
+ unsigned Reg = I->getReg();
+
+ // If Reg is a double precision register, emit two cfa_offsets,
+ // one for each of the paired single precision registers.
+ if (Mips::AFGR64RegClass.contains(Reg)) {
+ unsigned Reg0 =
+ MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_lo), true);
+ unsigned Reg1 =
+ MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_hi), true);
+
+ // On big-endian targets the two halves are recorded in swapped order.
+ if (!STI.isLittle())
+ std::swap(Reg0, Reg1);
+
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg0, Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ } else if (Mips::FGR64RegClass.contains(Reg)) {
+ // Two entries as well; the second uses the DWARF number of Reg plus one.
+ unsigned Reg0 = MRI->getDwarfRegNum(Reg, true);
+ unsigned Reg1 = MRI->getDwarfRegNum(Reg, true) + 1;
+
+ if (!STI.isLittle())
+ std::swap(Reg0, Reg1);
+
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg0, Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ } else {
+ // Reg is either in GPR32 or FGR32.
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(Reg, 1), Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+ }
+
+ if (MipsFI->callsEhReturn()) {
+ // Insert instructions that spill eh data registers.
+ for (int I = 0; I < 4; ++I) {
+ if (!MBB.isLiveIn(ABI.GetEhDataReg(I)))
+ MBB.addLiveIn(ABI.GetEhDataReg(I));
+ TII.storeRegToStackSlot(MBB, MBBI, ABI.GetEhDataReg(I), false,
+ MipsFI->getEhDataRegFI(I), RC, &RegInfo);
+ }
+
+ // Emit .cfi_offset directives for eh data registers.
+ for (int I = 0; I < 4; ++I) {
+ int64_t Offset = MFI.getObjectOffset(MipsFI->getEhDataRegFI(I));
+ unsigned Reg = MRI->getDwarfRegNum(ABI.GetEhDataReg(I), true);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+
+ // if framepointer enabled, set it to point to the stack pointer.
+ if (hasFP(MF)) {
+ // Insert instruction "move $fp, $sp" at this location.
+ BuildMI(MBB, MBBI, dl, TII.get(MOVE), FP).addReg(SP).addReg(ZERO)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // emit ".cfi_def_cfa_register $fp"
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ nullptr, MRI->getDwarfRegNum(FP, true)));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ if (RegInfo.needsStackRealignment(MF)) {
+ // addiu $Reg, $zero, -MaxAlignment
+ // and $sp, $sp, $Reg
+ unsigned VR = MF.getRegInfo().createVirtualRegister(RC);
+ assert(isInt<16>(MFI.getMaxAlignment()) &&
+ "Function's alignment size requirement is not supported.");
+ int MaxAlign = -(int)MFI.getMaxAlignment();
+
+ BuildMI(MBB, MBBI, dl, TII.get(ADDiu), VR).addReg(ZERO) .addImm(MaxAlign);
+ BuildMI(MBB, MBBI, dl, TII.get(AND), SP).addReg(SP).addReg(VR);
+
+ if (hasBP(MF)) {
+ // move $s7, $sp
+ unsigned BP = STI.isABI_N64() ? Mips::S7_64 : Mips::S7;
+ BuildMI(MBB, MBBI, dl, TII.get(MOVE), BP)
+ .addReg(SP)
+ .addReg(ZERO);
+ }
+ }
+ }
+}
+
+/// Emit the entry stub of an interrupt service routine (a function carrying
+/// the "interrupt" attribute) at the start of \p MBB.
+///
+/// The stub spills EPC and Status to the ISR spill slots via $k1, then builds
+/// a new Status value in $k1 (lower priority interrupts masked; KSU, ERL and
+/// EXL cleared; the FPU disabled unless soft-float is in use) and writes it
+/// back. For the "eic" kind the mask comes from the Cause register, read
+/// through $k0.
+///
+/// Unsupported configurations (pre-MIPS32R2, a non-static relocation model,
+/// a non-O32 ABI or MIPS64) and unknown interrupt kinds are reported with
+/// report_fatal_error.
+void MipsSEFrameLowering::emitInterruptPrologueStub(
+    MachineFunction &MF, MachineBasicBlock &MBB) const {
+
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Report an error if the target doesn't support Mips32r2 or later.
+  // The epilogue relies on the use of the "ehb" to clear execution
+  // hazards. Pre R2 Mips relies on an implementation defined number
+  // of "ssnop"s to clear the execution hazard. Support for ssnop hazard
+  // clearing is not provided so reject that configuration.
+  if (!STI.hasMips32r2())
+    report_fatal_error(
+        "\"interrupt\" attribute is not supported on pre-MIPS32R2 or "
+        "MIPS16 targets.");
+
+  // The GP register contains the "user" value, so we cannot perform
+  // any gp relative loads until we restore the "kernel" or "system" gp
+  // value. Until support is written we shall only accept the static
+  // relocation model.
+  if ((STI.getRelocationModel() != Reloc::Static))
+    report_fatal_error("\"interrupt\" attribute is only supported for the "
+                       "static relocation model on MIPS at the present time.");
+
+  if (!STI.isABI_O32() || STI.hasMips64())
+    report_fatal_error("\"interrupt\" attribute is only supported for the "
+                       "O32 ABI on MIPS32R2+ at the present time.");
+
+  // Perform ISR handling like GCC
+  StringRef IntKind =
+      MF.getFunction()->getFnAttribute("interrupt").getValueAsString();
+  const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass;
+
+  // EIC interrupt handling needs to read the Cause register to disable
+  // interrupts.
+  if (IntKind == "eic") {
+    // Coprocessor registers are always live per se.
+    MBB.addLiveIn(Mips::COP013);
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K0)
+        .addReg(Mips::COP013)
+        .addImm(0)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EXT), Mips::K0)
+        .addReg(Mips::K0)
+        .addImm(10)
+        .addImm(6)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Fetch and spill EPC
+  MBB.addLiveIn(Mips::COP014);
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+      .addReg(Mips::COP014)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+                                      MipsFI->getISRRegFI(0), PtrRC,
+                                      STI.getRegisterInfo(), 0);
+
+  // Fetch and Spill Status
+  MBB.addLiveIn(Mips::COP012);
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+      .addReg(Mips::COP012)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+                                      MipsFI->getISRRegFI(1), PtrRC,
+                                      STI.getRegisterInfo(), 0);
+
+  // Build the configuration for disabling lower priority interrupts. Non EIC
+  // interrupts need to be masked off with zero, EIC from the Cause register.
+  unsigned InsPosition = 8;
+  unsigned InsSize = 0;
+  unsigned SrcReg = Mips::ZERO;
+
+  // If the interrupt we're tied to is the EIC, switch the source for the
+  // masking off interrupts to the cause register.
+  if (IntKind == "eic") {
+    SrcReg = Mips::K0;
+    InsPosition = 10;
+    InsSize = 6;
+  } else
+    InsSize = StringSwitch<unsigned>(IntKind)
+                  .Case("sw0", 1)
+                  .Case("sw1", 2)
+                  .Case("hw0", 3)
+                  .Case("hw1", 4)
+                  .Case("hw2", 5)
+                  .Case("hw3", 6)
+                  .Case("hw4", 7)
+                  .Case("hw5", 8)
+                  .Default(0);
+
+  // An unrecognised kind leaves InsSize at 0, which would give the INS below
+  // an empty bit field. Reject it in every build mode, like the other
+  // configuration checks above, rather than only under assertions.
+  if (InsSize == 0)
+    report_fatal_error("Unknown interrupt type!");
+
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1)
+      .addReg(SrcReg)
+      .addImm(InsPosition)
+      .addImm(InsSize)
+      .addReg(Mips::K1)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // Mask off KSU, ERL, EXL
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1)
+      .addReg(Mips::ZERO)
+      .addImm(1)
+      .addImm(4)
+      .addReg(Mips::K1)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // Disable the FPU as we are not spilling those register sets.
+  if (!STI.useSoftFloat())
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1)
+        .addReg(Mips::ZERO)
+        .addImm(29)
+        .addImm(1)
+        .addReg(Mips::K1)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+  // Set the new status
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012)
+      .addReg(Mips::K1)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+}
+
+/// Emit the function epilogue before the last non-debug instruction of
+/// \p MBB.
+///
+/// In order, this: restores the stack pointer from the frame pointer when one
+/// is in use; reloads the eh data registers if the function calls eh_return;
+/// emits the interrupt stub for functions carrying the "interrupt" attribute;
+/// and finally pops the frame if the stack size is non-zero.
+void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  const MipsSEInstrInfo &TII =
+      *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+      *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
+
+  // Do not dereference the end iterator when the block holds no non-debug
+  // instruction; the interrupt stubs guard their DebugLoc the same way.
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  MipsABIInfo ABI = STI.getABI();
+  unsigned SP = ABI.GetStackPtr();
+  unsigned FP = ABI.GetFramePtr();
+  unsigned ZERO = ABI.GetNullPtr();
+  unsigned MOVE = ABI.GetGPRMoveOp();
+
+  // if framepointer enabled, restore the stack pointer.
+  if (hasFP(MF)) {
+    // Find the first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+
+    for (unsigned i = 0; i < MFI.getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instruction "move $sp, $fp" at this location.
+    BuildMI(MBB, I, DL, TII.get(MOVE), SP).addReg(FP).addReg(ZERO);
+  }
+
+  if (MipsFI->callsEhReturn()) {
+    const TargetRegisterClass *RC =
+        ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+    // Find first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+    for (unsigned i = 0; i < MFI.getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instructions that restore eh data registers.
+    for (int J = 0; J < 4; ++J) {
+      TII.loadRegFromStackSlot(MBB, I, ABI.GetEhDataReg(J),
+                               MipsFI->getEhDataRegFI(J), RC, &RegInfo);
+    }
+  }
+
+  if (MF.getFunction()->hasFnAttribute("interrupt"))
+    emitInterruptEpilogueStub(MF, MBB);
+
+  // Get the number of bytes from FrameInfo
+  uint64_t StackSize = MFI.getStackSize();
+
+  if (!StackSize)
+    return;
+
+  // Adjust stack.
+  TII.adjustStackPtr(SP, StackSize, MBB, MBBI);
+}
+
+/// Emit the exit stub of an interrupt service routine before the last
+/// non-debug instruction of \p MBB: disable interrupts, then reload EPC and
+/// Status from the ISR spill slots through $k1 and write them back.
+void MipsSEFrameLowering::emitInterruptEpilogueStub(
+    MachineFunction &MF, MachineBasicBlock &MBB) const {
+  const auto *InstrInfo = STI.getInstrInfo();
+  const auto *RegisterInfo = STI.getRegisterInfo();
+
+  MachineBasicBlock::iterator InsertPt = MBB.getLastNonDebugInstr();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+
+  DebugLoc Loc;
+  if (InsertPt != MBB.end())
+    Loc = InsertPt->getDebugLoc();
+
+  // Perform ISR handling like GCC
+  const TargetRegisterClass *WordRC = &Mips::GPR32RegClass;
+
+  // Interrupts off, followed by an execution hazard barrier.
+  BuildMI(MBB, InsertPt, Loc, InstrInfo->get(Mips::DI), Mips::ZERO);
+  BuildMI(MBB, InsertPt, Loc, InstrInfo->get(Mips::EHB));
+
+  // EPC: reload from ISR slot 0 and write it back.
+  InstrInfo->loadRegFromStackSlot(MBB, InsertPt, Mips::K1,
+                                  FuncInfo->getISRRegFI(0), WordRC,
+                                  RegisterInfo);
+  BuildMI(MBB, InsertPt, Loc, InstrInfo->get(Mips::MTC0), Mips::COP014)
+      .addReg(Mips::K1)
+      .addImm(0);
+
+  // Status: reload from ISR slot 1 and write it back.
+  InstrInfo->loadRegFromStackSlot(MBB, InsertPt, Mips::K1,
+                                  FuncInfo->getISRRegFI(1), WordRC,
+                                  RegisterInfo);
+  BuildMI(MBB, InsertPt, Loc, InstrInfo->get(Mips::MTC0), Mips::COP012)
+      .addReg(Mips::K1)
+      .addImm(0);
+}
+
+/// Compute the register and offset used to address frame index \p FI.
+///
+/// \p FrameReg is set to the frame pointer for fixed objects when hasFP(MF),
+/// to the base pointer for other objects when hasBP(MF), and to the stack
+/// pointer otherwise.
+/// \returns the object offset plus the stack size, minus the offset of the
+/// local area, plus the frame's offset adjustment.
+int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                int FI,
+                                                unsigned &FrameReg) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  MipsABIInfo ABI = STI.getABI();
+
+  const bool IsFixed = FrameInfo.isFixedObjectIndex(FI);
+  const bool UseDedicatedReg = IsFixed ? hasFP(MF) : hasBP(MF);
+
+  if (!UseDedicatedReg)
+    FrameReg = ABI.GetStackPtr();
+  else if (IsFixed)
+    FrameReg = ABI.GetFramePtr();
+  else
+    FrameReg = ABI.GetBasePtr();
+
+  return FrameInfo.getObjectOffset(FI) + FrameInfo.getStackSize() -
+         getOffsetOfLocalArea() + FrameInfo.getOffsetAdjustment();
+}
+
+/// Spill the callee-saved registers listed in \p CSI to their stack slots,
+/// inserting the stores before \p MI.
+///
+/// Every register is added as a live-in of the entry block, except $ra when
+/// the return address is taken. In functions carrying the "interrupt"
+/// attribute, HI/LO are first moved into $k0 ($k0_64 when pointers are
+/// 64-bit) and that register is spilled in their place.
+///
+/// \returns true unconditionally.
+bool MipsSEFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *EntryBlock = &MF->front();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    // Add the callee-saved register as live-in. Do not add if the register is
+    // RA and return address is taken, because it has already been added in
+    // method MipsTargetLowering::lowerRETURNADDR.
+    // It's killed at the spill, unless the register is RA and return address
+    // is taken.
+    unsigned Reg = CSI[i].getReg();
+    bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64)
+        && MF->getFrameInfo().isReturnAddressTaken();
+    if (!IsRAAndRetAddrIsTaken)
+      EntryBlock->addLiveIn(Reg);
+
+    // ISRs require HI/LO to be spilled into kernel registers to be then
+    // spilled to the stack frame.
+    bool IsHI = (Reg == Mips::HI0 || Reg == Mips::HI0_64);
+    bool IsLOHI = IsHI || Reg == Mips::LO0 || Reg == Mips::LO0_64;
+    const Function *Func = MBB.getParent()->getFunction();
+    if (IsLOHI && Func->hasFnAttribute("interrupt")) {
+      DebugLoc DL = MI->getDebugLoc();
+
+      // Choose the move opcode from the HI/LO kind (either register width)
+      // and the kernel scratch register from the pointer width.
+      unsigned Op = 0;
+      if (!STI.getABI().ArePtrs64bit()) {
+        Op = IsHI ? Mips::MFHI : Mips::MFLO;
+        Reg = Mips::K0;
+      } else {
+        Op = IsHI ? Mips::MFHI64 : Mips::MFLO64;
+        Reg = Mips::K0_64;
+      }
+      // Define the register that is spilled below, so the 64-bit moves
+      // write $k0_64 rather than $k0.
+      BuildMI(MBB, MI, DL, TII.get(Op), Reg)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    // Insert the spill to the stack frame.
+    bool IsKill = !IsRAAndRetAddrIsTaken;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.storeRegToStackSlot(*EntryBlock, MI, Reg, IsKill,
+                            CSI[i].getFrameIdx(), RC, TRI);
+  }
+
+  return true;
+}
+
+/// \returns true if the call frame can be reserved: the function has no
+/// variable sized stack objects and the maximum call frame size plus the
+/// stack alignment fits in a signed 16-bit immediate. The stack alignment is
+/// added so that the second register scavenger spill slot stays reachable
+/// with one instruction.
+bool
+MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+  if (FrameInfo.hasVarSizedObjects())
+    return false;
+
+  return isInt<16>(FrameInfo.getMaxCallFrameSize() + getStackAlignment());
+}
+
+/// Set the bit of \p Reg, and of every register aliasing it, in
+/// \p SavedRegs.
+static void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs,
+                         unsigned Reg) {
+  const TargetRegisterInfo *RegisterInfo =
+      MF.getSubtarget().getRegisterInfo();
+
+  // The third argument makes the iterator visit Reg itself as well.
+  MCRegAliasIterator Alias(Reg, RegisterInfo, true);
+  while (Alias.isValid()) {
+    SavedRegs.set(*Alias);
+    ++Alias;
+  }
+}
+
+/// Extend the generic callee-save computation with the Mips specifics: mark
+/// the frame and base pointers (and their aliases) as saved when in use,
+/// create the eh data and ISR spill slots when needed, expand the pseudos
+/// handled by ExpandPseudo, and register emergency spill slots with \p RS.
+void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                               BitVector &SavedRegs,
+                                               RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+  MipsABIInfo ABI = STI.getABI();
+  const unsigned FramePtr = ABI.GetFramePtr();
+  const unsigned BasePtr = ABI.IsN64() ? Mips::S7_64 : Mips::S7;
+
+  // Creates a stack object sized and aligned for RC and hands it to the
+  // register scavenger as an emergency spill slot.
+  auto AddScavengingSlot = [&](const TargetRegisterClass *RC) {
+    int Slot = MF.getFrameInfo().CreateStackObject(RC->getSize(),
+                                                   RC->getAlignment(), false);
+    RS->addScavengingFrameIndex(Slot);
+  };
+
+  // $fp is used when the function has a dedicated frame pointer.
+  if (hasFP(MF))
+    setAliasRegs(MF, SavedRegs, FramePtr);
+  // $s7 is used when the function has a dedicated base pointer.
+  if (hasBP(MF))
+    setAliasRegs(MF, SavedRegs, BasePtr);
+
+  // Functions calling eh_return need spill slots for the eh data registers.
+  if (FuncInfo->callsEhReturn())
+    FuncInfo->createEhDataRegsFI();
+
+  // ISRs need spill slots for Coprocessor 0 registers.
+  if (FuncInfo->isISR())
+    FuncInfo->createISRRegFI();
+
+  // Expand the pseudos that load, store or copy accumulators. If any was
+  // expanded, add an emergency spill slot half the size of the accumulator:
+  // 64-bit on mips64, 32-bit otherwise.
+  if (ExpandPseudo(MF).expand())
+    AddScavengingSlot(STI.hasMips64() ? &Mips::GPR64RegClass
+                                      : &Mips::GPR32RegClass);
+
+  // A further scavenging slot is needed only when the largest possible SP
+  // offset does not fit in a signed 16-bit immediate.
+  uint64_t MaxSPOffset =
+      FuncInfo->getIncomingArgSize() + estimateStackSize(MF);
+  if (!isInt<16>(MaxSPOffset))
+    AddScavengingSlot(ABI.ArePtrs64bit() ? &Mips::GPR64RegClass
+                                         : &Mips::GPR32RegClass);
+}
+
+/// Factory for the MipsSE frame lowering.
+/// \returns a MipsSEFrameLowering for \p ST allocated with new.
+const MipsFrameLowering *
+llvm::createMipsSEFrameLowering(const MipsSubtarget &ST) {
+  const MipsFrameLowering *Lowering = new MipsSEFrameLowering(ST);
+  return Lowering;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
new file mode 100644
index 000000000000..63cd3cebc56a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
@@ -0,0 +1,52 @@
+//===-- MipsSEFrameLowering.h - Mips32/64 frame lowering --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares MipsSEFrameLowering, the frame lowering class for Mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+/// Frame lowering for Mips32/64; overrides the MipsFrameLowering hooks declared below.
+class MipsSEFrameLowering : public MipsFrameLowering {
+public:
+ explicit MipsSEFrameLowering(const MipsSubtarget &STI);
+
+ /// emitPrologue/emitEpilogue - These methods insert prolog and epilog code
+ /// into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override; // FrameReg is an out-parameter.
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
+ unsigned ehDataReg(unsigned I) const; // Not an override; presumably maps an EH data register index to a register - confirm in the .cpp.
+
+private:
+ void emitInterruptEpilogueStub(MachineFunction &MF,
+ MachineBasicBlock &MBB) const; // Private helper; by its name, epilogue code for interrupt handlers.
+ void emitInterruptPrologueStub(MachineFunction &MF,
+ MachineBasicBlock &MBB) const; // Private helper; by its name, prologue code for interrupt handlers.
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
new file mode 100644
index 000000000000..6f0fdddd7d55
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -0,0 +1,1067 @@
+//===-- MipsSEISelDAGToDAG.cpp - A Dag to Dag Inst Selector for MipsSE ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEISelDAGToDAG.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+/// Caches the subtarget of MF and runs the base selector, unless the subtarget is in MIPS16 mode, in which case nothing is done.
+bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+ if (Subtarget->inMips16Mode())
+ return false; // MIPS16 functions are left untouched by this selector.
+ return MipsDAGToDAGISel::runOnMachineFunction(MF);
+}
+/// Appends one implicit DSP control register operand to MI for each of the low six bits set in the immediate held in operand 1.
+void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
+ MachineFunction &MF) {
+ MachineInstrBuilder MIB(MF, &MI);
+ unsigned Mask = MI.getOperand(1).getImm();
+ unsigned Flag = // Implicit def when IsDef, otherwise an implicit undef use.
+ IsDef ? RegState::ImplicitDefine : RegState::Implicit | RegState::Undef;
+
+ if (Mask & 1) // bit 0
+ MIB.addReg(Mips::DSPPos, Flag);
+
+ if (Mask & 2) // bit 1
+ MIB.addReg(Mips::DSPSCount, Flag);
+
+ if (Mask & 4) // bit 2
+ MIB.addReg(Mips::DSPCarry, Flag);
+
+ if (Mask & 8) // bit 3
+ MIB.addReg(Mips::DSPOutFlag, Flag);
+
+ if (Mask & 16) // bit 4
+ MIB.addReg(Mips::DSPCCond, Flag);
+
+ if (Mask & 32) // bit 5
+ MIB.addReg(Mips::DSPEFI, Flag);
+}
+/// Maps a constant index in [0, 7] to the corresponding Mips::MSA* control register; RegIdx must be a ConstantSDNode.
+unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
+ switch (cast<ConstantSDNode>(RegIdx)->getZExtValue()) {
+ default: // Any index outside 0-7 is treated as a programming error.
+ llvm_unreachable("Could not map int to register");
+ case 0: return Mips::MSAIR;
+ case 1: return Mips::MSACSR;
+ case 2: return Mips::MSAAccess;
+ case 3: return Mips::MSASave;
+ case 4: return Mips::MSAModify;
+ case 5: return Mips::MSARequest;
+ case 6: return Mips::MSAMap;
+ case 7: return Mips::MSAUnmap;
+ }
+}
+/// If MI materializes zero ("addiu/daddiu $dst, $zero, 0"), rewrites eligible uses of $dst to the zero register; returns true iff MI had that form (even when no use was rewritten).
+bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
+ const MachineInstr& MI) {
+ unsigned DstReg = 0, ZeroReg = 0;
+
+ // Check if MI is "addiu $dst, $zero, 0" or "daddiu $dst, $zero, 0".
+ if ((MI.getOpcode() == Mips::ADDiu) &&
+ (MI.getOperand(1).getReg() == Mips::ZERO) &&
+ (MI.getOperand(2).getImm() == 0)) {
+ DstReg = MI.getOperand(0).getReg();
+ ZeroReg = Mips::ZERO;
+ } else if ((MI.getOpcode() == Mips::DADDiu) &&
+ (MI.getOperand(1).getReg() == Mips::ZERO_64) &&
+ (MI.getOperand(2).getImm() == 0)) {
+ DstReg = MI.getOperand(0).getReg();
+ ZeroReg = Mips::ZERO_64;
+ }
+
+ if (!DstReg) // MI is not a zero materialization; nothing to do.
+ return false;
+
+ // Replace uses with ZeroReg.
+ for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
+ E = MRI->use_end(); U != E;) {
+ MachineOperand &MO = *U;
+ unsigned OpNo = U.getOperandNo();
+ MachineInstr *UseMI = MO.getParent(); // Named UseMI so it does not shadow the MI parameter.
+ ++U; // Advance before MO is possibly rewritten below.
+
+ // Do not replace if the user is a phi or a pseudo, or if the operand is tied to a def operand.
+ if (UseMI->isPHI() || UseMI->isRegTiedToDefOperand(OpNo) || UseMI->isPseudo())
+ continue;
+
+ // Also, we have to check that the register class of the operand
+ // contains the zero register.
+ if (!MRI->getRegClass(MO.getReg())->contains(ZeroReg))
+ continue;
+
+ MO.setReg(ZeroReg);
+ }
+
+ return true;
+}
+/// Inserts, at the start of the entry block, the ABI-specific code that initializes the global base register; does nothing when MipsFI reports the register is not set.
+void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ if (!MipsFI->globalBaseRegSet())
+ return;
+
+ MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock::iterator I = MBB.begin(); // All instructions are inserted before the first instruction of the entry block.
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL; // Default-constructed: the emitted instructions carry no debug location.
+ unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+ const TargetRegisterClass *RC;
+ const MipsABIInfo &ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
+ RC = (ABI.IsN64()) ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+ V0 = RegInfo.createVirtualRegister(RC); // Virtual temporaries, despite the names.
+ V1 = RegInfo.createVirtualRegister(RC);
+
+ if (ABI.IsN64()) { // Taken before the PIC check below, i.e. regardless of the PIC setting.
+ MF.getRegInfo().addLiveIn(Mips::T9_64);
+ MBB.addLiveIn(Mips::T9_64);
+
+ // lui $v0, %hi(%neg(%gp_rel(fname)))
+ // daddu $v1, $v0, $t9
+ // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+ const GlobalValue *FName = MF.getFunction();
+ BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+ BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0)
+ .addReg(Mips::T9_64);
+ BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+ return;
+ }
+
+ if (!MF.getTarget().isPositionIndependent()) {
+ // Set global register to __gnu_local_gp.
+ //
+ // lui $v0, %hi(__gnu_local_gp)
+ // addiu $globalbasereg, $v0, %lo(__gnu_local_gp)
+ BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+ .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0)
+ .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO);
+ return;
+ }
+
+ MF.getRegInfo().addLiveIn(Mips::T9); // Both remaining (PIC, 32-bit GPR) sequences read $t9.
+ MBB.addLiveIn(Mips::T9);
+
+ if (ABI.IsN32()) {
+ // lui $v0, %hi(%neg(%gp_rel(fname)))
+ // addu $v1, $v0, $t9
+ // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+ const GlobalValue *FName = MF.getFunction();
+ BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+ return;
+ }
+
+ assert(ABI.IsO32()); // Only O32 is expected to reach this point.
+
+ // For O32 ABI, the following instruction sequence is emitted to initialize
+ // the global base register:
+ //
+ // 0. lui $2, %hi(_gp_disp)
+ // 1. addiu $2, $2, %lo(_gp_disp)
+ // 2. addu $globalbasereg, $2, $t9
+ //
+ // We emit only the last instruction here.
+ //
+ // GNU linker requires that the first two instructions appear at the beginning
+ // of a function and no instructions be inserted before or between them.
+ // The two instructions are emitted during lowering to MC layer in order to
+ // avoid any reordering.
+ //
+ // Register $2 (Mips::V0) is added to the list of live-in registers to ensure
+ // the value instruction 1 (addiu) defines is valid when instruction 2 (addu)
+ // reads it.
+ MF.getRegInfo().addLiveIn(Mips::V0);
+ MBB.addLiveIn(Mips::V0);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg)
+ .addReg(Mips::V0).addReg(Mips::T9);
+}
+/// Post-selection fixups: sets up the global base register, adds implicit DSP control operands to RDDSP/WRDSP, and folds zero materializations into the zero register.
+void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+ initGlobalBaseReg(MF);
+
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+ for (auto &MBB: MF) {
+ for (auto &MI: MBB) {
+ switch (MI.getOpcode()) {
+ case Mips::RDDSP: // Reads the DSP control registers: add implicit uses.
+ addDSPCtrlRegOperands(false, MI, MF);
+ break;
+ case Mips::WRDSP: // Writes the DSP control registers: add implicit defs.
+ addDSPCtrlRegOperands(true, MI, MF);
+ break;
+ default: // Every other instruction is tried as a zero materialization; the result is ignored.
+ replaceUsesWithZeroReg(MRI, MI);
+ }
+ }
+ }
+}
+/// Selects an ADDE/SUBE node as MOp(LHS, carry + RHS), where carry = SLTu(CmpLHS, operand 1 of the node producing InFlag).
+void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
+ SDValue CmpLHS, const SDLoc &DL,
+ SDNode *Node) const {
+ unsigned Opc = InFlag.getOpcode(); (void)Opc; // Only read by the assert below; the cast silences unused warnings in release builds.
+
+ assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+ (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+ "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+ unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu;
+ if (Subtarget->isGP64bit()) { // Use the 64-bit opcodes on 64-bit GPR subtargets.
+ SLTuOp = Mips::SLTu64;
+ ADDuOp = Mips::DADDu;
+ }
+
+ SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+ SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
+ EVT VT = LHS.getValueType();
+
+ SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops); // Carry/borrow bit as an unsigned compare.
+
+ if (Subtarget->isGP64bit()) {
+ // On 64-bit targets, sltu produces an i64 but our backend currently says
+ // that SLTu64 produces an i32. We need to fix this in the long run but for
+ // now, just make the DAG type-correct by asserting the upper bits are zero.
+ Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT,
+ CurDAG->getTargetConstant(0, DL, VT),
+ SDValue(Carry, 0),
+ CurDAG->getTargetConstant(Mips::sub_32, DL,
+ VT));
+ }
+
+ // Generate a second addition only if we know that RHS is not a
+ // constant-zero node.
+ SDNode *AddCarry = Carry;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
+ if (!C || C->getZExtValue()) // RHS is non-constant or a non-zero constant.
+ AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
+
+ CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0)); // Node is morphed in place into MOp.
+}
+
+/// Match a bare frame index: on success Base is the TargetFrameIndex and Offset is a zero target constant; returns false for any other address.
+bool MipsSEDAGToDAGISel::selectAddrFrameIndex(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ EVT ValTy = Addr.getValueType();
+
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), ValTy);
+ return true;
+ }
+ return false; // Base and Offset are left untouched.
+}
+
+/// Match base+offset and base|offset where the constant fits in a signed (OffsetBits + ShiftAmount)-bit field; a non-frame-index base also needs the offset to be a multiple of 1 << ShiftAmount.
+bool MipsSEDAGToDAGISel::selectAddrFrameIndexOffset(
+ SDValue Addr, SDValue &Base, SDValue &Offset, unsigned OffsetBits,
+ unsigned ShiftAmount = 0) const {
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Addr.getOperand(1)); // cast<>, not dyn_cast<>: the check above guarantees a constant and CN is dereferenced unconditionally.
+ if (isIntN(OffsetBits + ShiftAmount, CN->getSExtValue())) {
+ EVT ValTy = Addr.getValueType();
+
+ // If the first operand is a FI, get the TargetFI Node
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ else {
+ Base = Addr.getOperand(0);
+ // If base is a FI, additional offset calculation is done in
+ // eliminateFrameIndex, otherwise we need to check the alignment
+ if (OffsetToAlignment(CN->getZExtValue(), 1ull << ShiftAmount) != 0)
+ return false; // Note: Base has already been overwritten at this point.
+ }
+
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(Addr),
+ ValTy);
+ return true;
+ }
+ }
+ return false;
+}
+
+/// ComplexPattern used in MipsInstrInfo for Mips load/store instructions:
+/// splits Addr into a Base and an Offset operand; returns false if no form below matches.
+bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ // if Address is FI, get the TargetFrameIndex.
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ // On PIC code, load the GA: a MipsISD::Wrapper is split into its two operands.
+ if (Addr.getOpcode() == MipsISD::Wrapper) {
+ Base = Addr.getOperand(0);
+ Offset = Addr.getOperand(1);
+ return true;
+ }
+
+ if (!TM.isPositionIndependent()) { // Non-PIC: bare symbol/global addresses are rejected here.
+ if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress))
+ return false;
+ }
+
+ // Addresses of the form FI+const or FI|const
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 16)) // 16-bit signed offset.
+ return true;
+
+ // Operand is a result from an ADD.
+ if (Addr.getOpcode() == ISD::ADD) {
+ // When loading from constant pools, load the lower address part in
+ // the instruction itself. Example, instead of:
+ // lui $2, %hi($CPI1_0)
+ // addiu $2, $2, %lo($CPI1_0)
+ // lwc1 $f0, 0($2)
+ // Generate:
+ // lui $2, %hi($CPI1_0)
+ // lwc1 $f0, %lo($CPI1_0)($2)
+ if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+ Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+ SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+ if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+ isa<JumpTableSDNode>(Opnd0)) { // Constant pool entries, globals and jump tables are all folded.
+ Base = Addr.getOperand(0);
+ Offset = Opnd0;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/// ComplexPattern fallback used in MipsInstrInfo for Mips load/store
+/// instructions: always succeeds, with Addr itself as Base and a zero Offset.
+bool MipsSEDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), Addr.getValueType());
+ return true;
+}
+/// Tries selectAddrRegImm first and falls back to selectAddrDefault, so it always returns true.
+bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ return selectAddrRegImm(Addr, Base, Offset) ||
+ selectAddrDefault(Addr, Base, Offset);
+}
+/// Matches a bare frame index, or a base plus constant offset that fits in 9 signed bits; returns false otherwise.
+bool MipsSEDAGToDAGISel::selectAddrRegImm9(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 9))
+ return true;
+
+ return false;
+}
+
+/// Used on microMIPS LWC2, LDC2, SWC2 and SDC2 instructions (11-bit offset): matches a bare frame index or a base plus 11-bit signed constant offset.
+bool MipsSEDAGToDAGISel::selectAddrRegImm11(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 11))
+ return true;
+
+ return false;
+}
+
+/// Used on microMIPS Load/Store unaligned instructions (12-bit offset): matches a bare frame index or a base plus 12-bit signed constant offset.
+bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 12))
+ return true;
+
+ return false;
+}
+/// Matches a bare frame index, or a base plus constant offset that fits in 16 signed bits; returns false otherwise.
+bool MipsSEDAGToDAGISel::selectAddrRegImm16(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 16))
+ return true;
+
+ return false;
+}
+/// 11-bit offset form with a fallback to selectAddrDefault, so it always returns true.
+bool MipsSEDAGToDAGISel::selectIntAddr11MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ return selectAddrRegImm11(Addr, Base, Offset) ||
+ selectAddrDefault(Addr, Base, Offset);
+}
+/// 12-bit offset form with a fallback to selectAddrDefault, so it always returns true.
+bool MipsSEDAGToDAGISel::selectIntAddr12MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ return selectAddrRegImm12(Addr, Base, Offset) ||
+ selectAddrDefault(Addr, Base, Offset);
+}
+/// 16-bit offset form with a fallback to selectAddrDefault, so it always returns true.
+bool MipsSEDAGToDAGISel::selectIntAddr16MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ return selectAddrRegImm16(Addr, Base, Offset) ||
+ selectAddrDefault(Addr, Base, Offset);
+}
+/// Address match for the 16-bit "lw16" form (see the comment below): accepts base+offset only when the offset has no bits outside 0x3c, and otherwise only addresses that selectAddrRegImm would not match.
+bool MipsSEDAGToDAGISel::selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 7)) {
+ if (isa<FrameIndexSDNode>(Base)) // Frame-index bases are rejected for this form.
+ return false;
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Offset)) {
+ unsigned CnstOff = CN->getZExtValue();
+ return (CnstOff == (CnstOff & 0x3c)); // Only bits 2-5 may be set: a multiple of 4 in [0, 60].
+ }
+
+ return false;
+ }
+
+ // For all other cases where "lw" would be selected, don't select "lw16"
+ // because it would result in additional instructions to prepare operands.
+ if (selectAddrRegImm(Addr, Base, Offset))
+ return false;
+
+ return selectAddrDefault(Addr, Base, Offset);
+}
+/// Matches a frame index, or a base plus 10-bit signed constant offset; otherwise falls back to selectAddrDefault, so it always returns true.
+bool MipsSEDAGToDAGISel::selectIntAddrSImm10(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10))
+ return true;
+
+ return selectAddrDefault(Addr, Base, Offset);
+}
+/// Like selectIntAddrSImm10, but for a 10-bit offset field scaled by 2 (ShiftAmount = 1).
+bool MipsSEDAGToDAGISel::selectIntAddrSImm10Lsl1(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10, 1))
+ return true;
+
+ return selectAddrDefault(Addr, Base, Offset);
+}
+/// Like selectIntAddrSImm10, but for a 10-bit offset field scaled by 4 (ShiftAmount = 2).
+bool MipsSEDAGToDAGISel::selectIntAddrSImm10Lsl2(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10, 2))
+ return true;
+
+ return selectAddrDefault(Addr, Base, Offset);
+}
+/// Like selectIntAddrSImm10, but for a 10-bit offset field scaled by 8 (ShiftAmount = 3).
+bool MipsSEDAGToDAGISel::selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10, 3))
+ return true;
+
+ return selectAddrDefault(Addr, Base, Offset);
+}
+
+// Select constant vector splats.
+//
+// Returns true and sets Imm to the splat value if:
+// * MSA is enabled
+// * N is an ISD::BUILD_VECTOR representing a constant splat
+bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm,
+ unsigned MinSizeInBits) const {
+ if (!Subtarget->hasMSA())
+ return false;
+
+ BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N);
+
+ if (!Node) // Not a BUILD_VECTOR.
+ return false;
+
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+
+ if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+ MinSizeInBits, !Subtarget->isLittle())) // Last argument: true on big-endian subtargets.
+ return false;
+
+ Imm = SplatValue; // SplatUndef, SplatBitSize and HasAnyUndefs are not used further.
+
+ return true;
+}
+
+// Select constant vector splats.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value fits in an integer with the specified signed-ness and
+// width.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+// sometimes a shuffle in big-endian mode.
+//
+// It's worth noting that this function is not used as part of the selection
+// of ldi.[bhwd] since it does not permit using the wrong-typed ldi.[bhwd]
+// instruction to achieve the desired bit pattern. ldi.[bhwd] is selected in
+// MipsSEDAGToDAGISel::trySelect.
+bool MipsSEDAGToDAGISel::
+selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+ unsigned ImmBitSize) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType(); // Taken from N before any BITCAST is looked through.
+
+ if (N->getOpcode() == ISD::BITCAST) // Only a single BITCAST level is looked through.
+ N = N->getOperand(0);
+
+ if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+
+ if (( Signed && ImmValue.isSignedIntN(ImmBitSize)) ||
+ (!Signed && ImmValue.isIntN(ImmBitSize))) {
+ Imm = CurDAG->getTargetConstant(ImmValue, SDLoc(N), EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Select constant vector splats whose value fits in an unsigned 1-bit immediate.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 1);
+}
+// Select constant vector splats whose value fits in an unsigned 2-bit immediate.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 2);
+}
+// Select constant vector splats whose value fits in an unsigned 3-bit immediate.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 3);
+}
+
+// Select constant vector splats whose value fits in an unsigned 4-bit immediate.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 4);
+}
+
+// Select constant vector splats whose value fits in an unsigned 5-bit immediate.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 5);
+}
+
+// Select constant vector splats whose value fits in an unsigned 6-bit immediate.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 6);
+}
+
+// Select constant vector splats whose value fits in an unsigned 8-bit immediate.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 8);
+}
+
+// Select constant vector splats whose value fits in a signed 5-bit immediate.
+bool MipsSEDAGToDAGISel::
+selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, true, 5);
+}
+
+// Select constant vector splats whose value is a power of 2.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm (to the base-2 logarithm of the splat value) if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a power of two.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+// sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType(); // Taken from N before any BITCAST is looked through.
+
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+ int32_t Log2 = ImmValue.exactLogBase2();
+
+ if (Log2 != -1) { // -1 is treated as "not a power of two".
+ Imm = CurDAG->getTargetConstant(Log2, SDLoc(N), EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of left-most bits set (e.g. 0b11...1100...00).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm (to the number of set bits in the splat value) if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of left-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+// sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType(); // Taken from N before any BITCAST is looked through.
+
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+ // Extract the run of set bits starting with bit zero from the bitwise
+ // inverse of ImmValue, and test that the inverse of this is the same
+ // as the original value.
+ if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
+
+ Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+ EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of right-most bits set (e.g. 0b00...0011...11).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm (to the number of set bits in the splat value) if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of right-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+// sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType(); // Taken from N before any BITCAST is looked through.
+
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+ // Extract the run of set bits starting with bit zero, and test that the
+ // result is the same as the original value
+ if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
+ Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+ EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+// Select constant vector splats whose bitwise inverse is a power of 2; Imm is set to the base-2 logarithm of that inverse. Looks through ISD::BITCAST.
+bool MipsSEDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N,
+ SDValue &Imm) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType(); // Taken from N before any BITCAST is looked through.
+
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+ int32_t Log2 = (~ImmValue).exactLogBase2();
+
+ if (Log2 != -1) { // -1 is treated as "inverse is not a power of two".
+ Imm = CurDAG->getTargetConstant(Log2, SDLoc(N), EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+/// Custom selection for the opcodes handled in the switch below; returns true if Node was selected or replaced here, false to leave it to the caller.
+bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
+ unsigned Opcode = Node->getOpcode();
+ SDLoc DL(Node);
+
+ //
+ // Instruction Selection not handled by the auto-generated
+ // tablegen selection should be handled here.
+ //
+ switch(Opcode) {
+ default: break;
+
+ case ISD::SUBE: {
+ SDValue InFlag = Node->getOperand(2);
+ unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu;
+ selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node); // Compares operand 0 of the flag producer.
+ return true;
+ }
+
+ case ISD::ADDE: {
+ if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
+ break;
+ SDValue InFlag = Node->getOperand(2);
+ unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu;
+ selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node); // Compares result 0 of the flag producer, unlike SUBE above.
+ return true;
+ }
+
+ case ISD::ConstantFP: {
+ ConstantFPSDNode *CN = cast<ConstantFPSDNode>(Node); // cast<>: the opcode fixes the node type and CN is dereferenced unconditionally.
+ if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
+ if (Subtarget->isGP64bit()) { // 64-bit GPRs: one move of $zero_64 into the FPR.
+ SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+ Mips::ZERO_64, MVT::i64);
+ ReplaceNode(Node,
+ CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero));
+ } else if (Subtarget->isFP64bit()) { // 32-bit GPRs with 64-bit FPRs.
+ SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+ Mips::ZERO, MVT::i32);
+ ReplaceNode(Node, CurDAG->getMachineNode(Mips::BuildPairF64_64, DL,
+ MVT::f64, Zero, Zero));
+ } else {
+ SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+ Mips::ZERO, MVT::i32);
+ ReplaceNode(Node, CurDAG->getMachineNode(Mips::BuildPairF64, DL,
+ MVT::f64, Zero, Zero));
+ }
+ return true;
+ }
+ break;
+ }
+
+ case ISD::Constant: {
+ const ConstantSDNode *CN = cast<ConstantSDNode>(Node); // cast<>: the opcode fixes the node type and CN is dereferenced unconditionally.
+ int64_t Imm = CN->getSExtValue();
+ unsigned Size = CN->getValueSizeInBits(0);
+
+ if (isInt<32>(Imm)) // Values that fit in 32 signed bits are not handled here.
+ break;
+
+ MipsAnalyzeImmediate AnalyzeImm;
+
+ const MipsAnalyzeImmediate::InstSeq &Seq =
+ AnalyzeImm.Analyze(Imm, Size, false);
+
+ MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+ SDLoc DL(CN); // Note: shadows the function-level DL within this case.
+ SDNode *RegOpnd;
+ SDValue ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
+ DL, MVT::i64);
+
+ // The first instruction can be a LUi which is different from other
+ // instructions (ADDiu, ORI and SLL) in that it does not have a register
+ // operand.
+ if (Inst->Opc == Mips::LUi64)
+ RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64, ImmOpnd);
+ else
+ RegOpnd =
+ CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+ CurDAG->getRegister(Mips::ZERO_64, MVT::i64),
+ ImmOpnd);
+
+ // The remaining instructions in the sequence are handled here.
+ for (++Inst; Inst != Seq.end(); ++Inst) {
+ ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd), DL,
+ MVT::i64);
+ RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+ SDValue(RegOpnd, 0), ImmOpnd);
+ }
+
+ ReplaceNode(Node, RegOpnd);
+ return true;
+ }
+
+ case ISD::INTRINSIC_W_CHAIN: {
+ switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+ default:
+ break;
+
+ case Intrinsic::mips_cfcmsa: { // Becomes a copy from the selected MSA control register.
+ SDValue ChainIn = Node->getOperand(0);
+ SDValue RegIdx = Node->getOperand(2);
+ SDValue Reg = CurDAG->getCopyFromReg(ChainIn, DL,
+ getMSACtrlReg(RegIdx), MVT::i32);
+ ReplaceNode(Node, Reg.getNode());
+ return true;
+ }
+ }
+ break;
+ }
+
+ case ISD::INTRINSIC_WO_CHAIN: {
+ switch (cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue()) {
+ default:
+ break;
+
+ case Intrinsic::mips_move_v:
+ // Like an assignment but will always produce a move.v even if
+ // unnecessary.
+ ReplaceNode(Node, CurDAG->getMachineNode(Mips::MOVE_V, DL,
+ Node->getValueType(0),
+ Node->getOperand(1)));
+ return true;
+ }
+ break;
+ }
+
+ case ISD::INTRINSIC_VOID: {
+ switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+ default:
+ break;
+
+ case Intrinsic::mips_ctcmsa: { // Becomes a copy to the selected MSA control register.
+ SDValue ChainIn = Node->getOperand(0);
+ SDValue RegIdx = Node->getOperand(2);
+ SDValue Value = Node->getOperand(3);
+ SDValue ChainOut = CurDAG->getCopyToReg(ChainIn, DL,
+ getMSACtrlReg(RegIdx), Value);
+ ReplaceNode(Node, ChainOut.getNode());
+ return true;
+ }
+ }
+ break;
+ }
+
+ case MipsISD::ThreadPointer: { // rdhwr of HWR29, routed through $v1 / $v1_64.
+ EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
+ unsigned RdhwrOpc, DestReg;
+
+ if (PtrVT == MVT::i32) {
+ RdhwrOpc = Mips::RDHWR;
+ DestReg = Mips::V1;
+ } else {
+ RdhwrOpc = Mips::RDHWR64;
+ DestReg = Mips::V1_64;
+ }
+
+ SDNode *Rdhwr =
+ CurDAG->getMachineNode(RdhwrOpc, DL,
+ Node->getValueType(0),
+ CurDAG->getRegister(Mips::HWR29, MVT::i32));
+ SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
+ SDValue(Rdhwr, 0));
+ SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
+ ReplaceNode(Node, ResNode.getNode());
+ return true;
+ }
+
+ case ISD::BUILD_VECTOR: {
+ // Select appropriate ldi.[bhwd] instructions for constant splats of
+ // 128-bit vectors when MSA is enabled. Fixup any register class mismatches
+ // that occur as a result.
+ //
+ // This allows the compiler to use a wider range of immediates than would
+ // otherwise be allowed. If, for example, v4i32 could only use ldi.h then
+ // it would not be possible to load { 0x01010101, 0x01010101, 0x01010101,
+ // 0x01010101 } without using a constant pool. This would be sub-optimal
+ // when 'ldi.b wd, 1' is capable of producing that bit-pattern in the
+ // same set of registers. Similarly, ldi.h isn't capable of producing {
+ // 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.
+
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ unsigned LdiOp;
+ EVT ResVecTy = BVN->getValueType(0);
+ EVT ViaVecTy;
+
+ if (!Subtarget->hasMSA() || !BVN->getValueType(0).is128BitVector())
+ return false;
+
+ if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs, 8,
+ !Subtarget->isLittle()))
+ return false;
+
+ switch (SplatBitSize) { // Pick the ldi variant whose element width equals the splat width.
+ default:
+ return false;
+ case 8:
+ LdiOp = Mips::LDI_B;
+ ViaVecTy = MVT::v16i8;
+ break;
+ case 16:
+ LdiOp = Mips::LDI_H;
+ ViaVecTy = MVT::v8i16;
+ break;
+ case 32:
+ LdiOp = Mips::LDI_W;
+ ViaVecTy = MVT::v4i32;
+ break;
+ case 64:
+ LdiOp = Mips::LDI_D;
+ ViaVecTy = MVT::v2i64;
+ break;
+ }
+
+ if (!SplatValue.isSignedIntN(10)) // Only splat values that fit in 10 signed bits are selected here.
+ return false;
+
+ SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
+ ViaVecTy.getVectorElementType());
+
+ SDNode *Res = CurDAG->getMachineNode(LdiOp, DL, ViaVecTy, Imm);
+
+ if (ResVecTy != ViaVecTy) {
+ // If LdiOp is writing to a different register class to ResVecTy, then
+ // fix it up here. This COPY_TO_REGCLASS should never cause a move.v
+ // since the source and destination register sets contain the same
+ // registers.
+ const TargetLowering *TLI = getTargetLowering();
+ MVT ResVecTySimple = ResVecTy.getSimpleVT();
+ const TargetRegisterClass *RC = TLI->getRegClassFor(ResVecTySimple);
+ Res = CurDAG->getMachineNode(Mips::COPY_TO_REGCLASS, DL,
+ ResVecTy, SDValue(Res, 0),
+ CurDAG->getTargetConstant(RC->getID(), DL,
+ MVT::i32));
+ }
+
+ ReplaceNode(Node, Res);
+ return true;
+ }
+
+ }
+
+ return false;
+}
+/// Pushes a (base, offset) pair for an inline-asm memory operand onto OutOps; every handled constraint returns false, and unknown constraints are unreachable.
+bool MipsSEDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) {
+ SDValue Base, Offset;
+
+ switch(ConstraintID) {
+ default:
+ llvm_unreachable("Unexpected asm memory constraint");
+ // All memory constraints can at least accept raw pointers.
+ case InlineAsm::Constraint_i:
+ OutOps.push_back(Op);
+ OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+ return false;
+ case InlineAsm::Constraint_m:
+ if (selectAddrRegImm16(Op, Base, Offset)) { // Frame index or base + 16-bit signed offset.
+ OutOps.push_back(Base);
+ OutOps.push_back(Offset);
+ return false;
+ }
+ OutOps.push_back(Op); // Fallback: the raw pointer with a zero offset.
+ OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+ return false;
+ case InlineAsm::Constraint_R:
+ // The 'R' constraint is supposed to be much more complicated than this.
+ // However, it's becoming less useful due to architectural changes and
+ // ought to be replaced by other constraints such as 'ZC'.
+ // For now, support 9-bit signed offsets which is supportable by all
+ // subtargets for all instructions.
+ if (selectAddrRegImm9(Op, Base, Offset)) {
+ OutOps.push_back(Base);
+ OutOps.push_back(Offset);
+ return false;
+ }
+ OutOps.push_back(Op); // Fallback: the raw pointer with a zero offset.
+ OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+ return false;
+ case InlineAsm::Constraint_ZC:
+ // ZC matches whatever the pref, ll, and sc instructions can handle for the
+ // given subtarget.
+ if (Subtarget->inMicroMipsMode()) {
+ // On microMIPS, they can handle 12-bit offsets.
+ if (selectAddrRegImm12(Op, Base, Offset)) {
+ OutOps.push_back(Base);
+ OutOps.push_back(Offset);
+ return false;
+ }
+ } else if (Subtarget->hasMips32r6()) {
+ // On MIPS32r6/MIPS64r6, they can only handle 9-bit offsets.
+ if (selectAddrRegImm9(Op, Base, Offset)) {
+ OutOps.push_back(Base);
+ OutOps.push_back(Offset);
+ return false;
+ }
+ } else if (selectAddrRegImm16(Op, Base, Offset)) {
+ // Prior to MIPS32r6/MIPS64r6, they can handle 16-bit offsets.
+ OutOps.push_back(Base);
+ OutOps.push_back(Offset);
+ return false;
+ }
+ // In all cases, 0-bit offsets are acceptable.
+ OutOps.push_back(Op);
+ OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+ return false;
+ }
+ return true; // Not reached: every case above returns or is unreachable.
+}
+/// Factory for the Mips32/64 DAG-to-DAG instruction selector pass; returns a new MipsSEDAGToDAGISel (presumably owned by the caller - confirm at the call site).
+FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new MipsSEDAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
new file mode 100644
index 000000000000..2a8e5877e848
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -0,0 +1,146 @@
+//===-- MipsSEISelDAGToDAG.h - A Dag to Dag Inst Selector for MipsSE -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+/// DAG-to-DAG instruction selector for the MipsSE (mips32/64) flavour of the
+/// backend. It derives from MipsDAGToDAGISel and overrides that class's
+/// address, vector-splat and node selection hooks.
+class MipsSEDAGToDAGISel : public MipsDAGToDAGISel {
+
+public:
+ /// Forwards the target machine and optimization level to the base selector.
+ explicit MipsSEDAGToDAGISel(MipsTargetMachine &TM, CodeGenOpt::Level OL)
+ : MipsDAGToDAGISel(TM, OL) {}
+
+private:
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
+ MachineFunction &MF);
+
+ unsigned getMSACtrlReg(const SDValue RegIdx) const;
+
+ bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
+
+ std::pair<SDNode *, SDNode *> selectMULT(SDNode *N, unsigned Opc,
+ const SDLoc &dl, EVT Ty, bool HasLo,
+ bool HasHi);
+
+ void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
+ const SDLoc &DL, SDNode *Node) const;
+
+ // Address matchers. Each takes the address in Addr and reports its result
+ // through the Base and Offset out-parameters; the bool return tells the
+ // caller whether the match succeeded (see the callers in
+ // SelectInlineAsmMemoryOperand, which push Base/Offset only on true).
+ bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const;
+ bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset,
+ unsigned OffsetBits,
+ unsigned ShiftAmount) const;
+
+ bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectAddrDefault(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddr(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ // Register + immediate matchers. SelectInlineAsmMemoryOperand uses the
+ // 9-, 12- and 16-suffixed variants for 9-, 12- and 16-bit offsets
+ // respectively; the 11 variant presumably follows the same scheme.
+ bool selectAddrRegImm9(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ bool selectAddrRegImm11(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ bool selectAddrRegImm16(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ bool selectIntAddr11MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddr12MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddr16MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddrSImm10(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddrSImm10Lsl1(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddrSImm10Lsl2(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ /// \brief Select constant vector splats.
+ bool selectVSplat(SDNode *N, APInt &Imm,
+ unsigned MinSizeInBits) const override;
+ /// \brief Select constant vector splats whose value fits in a given integer.
+ bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+ unsigned ImmBitSize) const;
+ /// \brief Select constant vector splats whose value fits in a uimm1.
+ bool selectVSplatUimm1(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value fits in a uimm2.
+ bool selectVSplatUimm2(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value fits in a uimm3.
+ bool selectVSplatUimm3(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value fits in a uimm4.
+ bool selectVSplatUimm4(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value fits in a uimm5.
+ bool selectVSplatUimm5(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value fits in a uimm6.
+ bool selectVSplatUimm6(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value fits in a uimm8.
+ bool selectVSplatUimm8(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value fits in a simm5.
+ bool selectVSplatSimm5(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value is a power of 2.
+ bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value is the inverse of a
+ /// power of 2.
+ bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value is a run of set bits
+ /// ending at the most significant bit.
+ bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override;
+ /// \brief Select constant vector splats whose value is a run of set bits
+ /// starting at bit zero.
+ bool selectVSplatMaskR(SDValue N, SDValue &Imm) const override;
+
+ bool trySelect(SDNode *Node) override;
+
+ void processFunctionAfterISel(MachineFunction &MF) override;
+
+ // Insert instructions to initialize the global base register in the
+ // first MBB of the function.
+ void initGlobalBaseReg(MachineFunction &MF);
+
+ // Translates an inline-asm memory operand into selected operands appended
+ // to OutOps. In the implementation, false is returned once operands have
+ // been pushed and true when the constraint is not handled.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+};
+
+/// Creates the MipsSE instruction selection pass: a newly allocated
+/// MipsSEDAGToDAGISel constructed from \p TM and \p OptLevel.
+FunctionPass *createMipsSEISelDag(MipsTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
new file mode 100644
index 000000000000..26e0f9a94368
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -0,0 +1,3779 @@
+//===-- MipsSEISelLowering.cpp - MipsSE DAG Lowering Interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsSEISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+// Hidden command-line switch "mips-tail-calls"; off unless given explicitly.
+static cl::opt<bool>
+UseMipsTailCalls("mips-tail-calls", cl::Hidden,
+ cl::desc("MIPS: permit tail calls."), cl::init(false));
+
+// Command-line switch "mno-ldc1-sdc1"; off unless given explicitly. When set,
+// the MipsSETargetLowering constructor marks f64 LOAD/STORE as Custom.
+static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
+ cl::desc("Expand double precision loads and "
+ "stores to their single precision "
+ "counterparts"));
+
+/// Configures lowering for the MipsSE (mips32/64) targets.
+///
+/// Based on the features reported by \p STI (available here as Subtarget),
+/// the constructor registers register classes for the supported value types,
+/// assigns legalization actions to operations, and requests target DAG
+/// combines. The statements are order-sensitive: several (opcode, type) pairs
+/// are configured more than once, with the r6 blocks near the end coming
+/// last. computeRegisterProperties() is called once everything is set up.
+MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI)
+ : MipsTargetLowering(TM, STI) {
+ // Set up the register classes
+ addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
+
+ if (Subtarget.isGP64bit())
+ addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
+
+ if (Subtarget.hasDSP() || Subtarget.hasMSA()) {
+ // Expand all truncating stores and extending loads.
+ for (MVT VT0 : MVT::vector_valuetypes()) {
+ for (MVT VT1 : MVT::vector_valuetypes()) {
+ setTruncStoreAction(VT0, VT1, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT0, VT1, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT0, VT1, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT0, VT1, Expand);
+ }
+ }
+ }
+
+ if (Subtarget.hasDSP()) {
+ MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
+
+ for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
+ addRegisterClass(VecTys[i], &Mips::DSPRRegClass);
+
+ // Expand all builtin opcodes.
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, VecTys[i], Expand);
+
+ // Then re-enable the handful of operations that are marked Legal.
+ setOperationAction(ISD::ADD, VecTys[i], Legal);
+ setOperationAction(ISD::SUB, VecTys[i], Legal);
+ setOperationAction(ISD::LOAD, VecTys[i], Legal);
+ setOperationAction(ISD::STORE, VecTys[i], Legal);
+ setOperationAction(ISD::BITCAST, VecTys[i], Legal);
+ }
+
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::VSELECT);
+ }
+
+ if (Subtarget.hasDSPR2())
+ setOperationAction(ISD::MUL, MVT::v2i16, Legal);
+
+ if (Subtarget.hasMSA()) {
+ // Integer and floating-point 128-bit MSA vector types; see addMSAIntType
+ // and addMSAFloatType for the per-type operation setup.
+ addMSAIntType(MVT::v16i8, &Mips::MSA128BRegClass);
+ addMSAIntType(MVT::v8i16, &Mips::MSA128HRegClass);
+ addMSAIntType(MVT::v4i32, &Mips::MSA128WRegClass);
+ addMSAIntType(MVT::v2i64, &Mips::MSA128DRegClass);
+ addMSAFloatType(MVT::v8f16, &Mips::MSA128HRegClass);
+ addMSAFloatType(MVT::v4f32, &Mips::MSA128WRegClass);
+ addMSAFloatType(MVT::v2f64, &Mips::MSA128DRegClass);
+
+ // f16 is a storage-only type, always promote it to f32.
+ addRegisterClass(MVT::f16, &Mips::MSA128HRegClass);
+ setOperationAction(ISD::SETCC, MVT::f16, Promote);
+ setOperationAction(ISD::BR_CC, MVT::f16, Promote);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+ setOperationAction(ISD::SELECT, MVT::f16, Promote);
+ setOperationAction(ISD::FADD, MVT::f16, Promote);
+ setOperationAction(ISD::FSUB, MVT::f16, Promote);
+ setOperationAction(ISD::FMUL, MVT::f16, Promote);
+ setOperationAction(ISD::FDIV, MVT::f16, Promote);
+ setOperationAction(ISD::FREM, MVT::f16, Promote);
+ setOperationAction(ISD::FMA, MVT::f16, Promote);
+ setOperationAction(ISD::FNEG, MVT::f16, Promote);
+ setOperationAction(ISD::FABS, MVT::f16, Promote);
+ setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
+ setOperationAction(ISD::FCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FP_EXTEND, MVT::f16, Promote);
+ setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+ setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+ setOperationAction(ISD::FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::FPOWI, MVT::f16, Promote);
+ setOperationAction(ISD::FRINT, MVT::f16, Promote);
+ setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+ setOperationAction(ISD::FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
+
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::XOR);
+ }
+
+ if (!Subtarget.useSoftFloat()) {
+ addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
+
+ // When dealing with single precision only, use libcalls
+ if (!Subtarget.isSingleFloat()) {
+ if (Subtarget.isFP64bit())
+ addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
+ else
+ addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
+ }
+ }
+
+ // 32-bit widening/high multiplies go through LowerOperation (Custom).
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::i32, Custom);
+
+ if (Subtarget.hasCnMips())
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+ else if (Subtarget.isGP64bit())
+ setOperationAction(ISD::MUL, MVT::i64, Custom);
+
+ if (Subtarget.isGP64bit()) {
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
+ setOperationAction(ISD::MULHS, MVT::i64, Custom);
+ setOperationAction(ISD::MULHU, MVT::i64, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
+ }
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+
+ setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+
+ setTargetDAGCombine(ISD::ADDE);
+ setTargetDAGCombine(ISD::SUBE);
+ setTargetDAGCombine(ISD::MUL);
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+ // -mno-ldc1-sdc1: route f64 loads/stores through custom lowering.
+ if (NoDPLoadStore) {
+ setOperationAction(ISD::LOAD, MVT::f64, Custom);
+ setOperationAction(ISD::STORE, MVT::f64, Custom);
+ }
+
+ // NOTE: the r6 blocks below configure (opcode, type) pairs that were
+ // already given Custom actions above; being later calls they presumably
+ // replace those earlier settings, so they must stay after them.
+ if (Subtarget.hasMips32r6()) {
+ // MIPS32r6 replaces the accumulator-based multiplies with a three register
+ // instruction
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::MUL, MVT::i32, Legal);
+ setOperationAction(ISD::MULHS, MVT::i32, Legal);
+ setOperationAction(ISD::MULHU, MVT::i32, Legal);
+
+ // MIPS32r6 replaces the accumulator-based division/remainder with separate
+ // three register division and remainder instructions.
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::i32, Legal);
+ setOperationAction(ISD::UDIV, MVT::i32, Legal);
+ setOperationAction(ISD::SREM, MVT::i32, Legal);
+ setOperationAction(ISD::UREM, MVT::i32, Legal);
+
+ // MIPS32r6 replaces conditional moves with an equivalent that removes the
+ // need for three GPR read ports.
+ setOperationAction(ISD::SETCC, MVT::i32, Legal);
+ setOperationAction(ISD::SELECT, MVT::i32, Legal);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+
+ setOperationAction(ISD::SETCC, MVT::f32, Legal);
+ setOperationAction(ISD::SELECT, MVT::f32, Legal);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+
+ assert(Subtarget.isFP64bit() && "FR=1 is required for MIPS32r6");
+ setOperationAction(ISD::SETCC, MVT::f64, Legal);
+ setOperationAction(ISD::SELECT, MVT::f64, Legal);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+ setOperationAction(ISD::BRCOND, MVT::Other, Legal);
+
+ // Floating point > and >= are supported via < and <=
+ setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+
+ setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+ }
+
+ if (Subtarget.hasMips64r6()) {
+ // MIPS64r6 replaces the accumulator-based multiplies with a three register
+ // instruction
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+ setOperationAction(ISD::MULHS, MVT::i64, Legal);
+ setOperationAction(ISD::MULHU, MVT::i64, Legal);
+
+ // MIPS64r6 replaces the accumulator-based division/remainder with separate
+ // three register division and remainder instructions.
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::SDIV, MVT::i64, Legal);
+ setOperationAction(ISD::UDIV, MVT::i64, Legal);
+ setOperationAction(ISD::SREM, MVT::i64, Legal);
+ setOperationAction(ISD::UREM, MVT::i64, Legal);
+
+ // MIPS64r6 replaces conditional moves with an equivalent that removes the
+ // need for three GPR read ports.
+ setOperationAction(ISD::SETCC, MVT::i64, Legal);
+ setOperationAction(ISD::SELECT, MVT::i64, Legal);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ }
+
+ // Must run last, after all register classes have been added.
+ computeRegisterProperties(Subtarget.getRegisterInfo());
+}
+
+/// Factory for the MipsSE lowering object.
+///
+/// \returns a newly allocated MipsSETargetLowering constructed from \p TM and
+/// \p STI, exposed through its MipsTargetLowering base type.
+const MipsTargetLowering *
+llvm::createMipsSETargetLowering(const MipsTargetMachine &TM,
+                                 const MipsSubtarget &STI) {
+  const MipsTargetLowering *Lowering = new MipsSETargetLowering(TM, STI);
+  return Lowering;
+}
+
+/// Returns the representative register class for \p VT.
+///
+/// Only MVT::Untyped is special-cased: it maps to the ACC64DSP class when the
+/// subtarget has DSP and to ACC64 otherwise. Every other type is delegated to
+/// TargetLowering::getRepRegClassFor.
+const TargetRegisterClass *
+MipsSETargetLowering::getRepRegClassFor(MVT VT) const {
+  if (!(VT == MVT::Untyped))
+    return TargetLowering::getRepRegClassFor(VT);
+
+  if (Subtarget.hasDSP())
+    return &Mips::ACC64DSPRegClass;
+  return &Mips::ACC64RegClass;
+}
+
+// Enable MSA support for the given integer type and Register class.
+//
+// Registers RC for Ty, defaults every builtin opcode to Expand, and then
+// marks the operations listed below as Legal or Custom. The tables only
+// touch distinct opcodes, so their relative order does not matter; they must
+// however run after the Expand-everything loop.
+void MipsSETargetLowering::
+addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Baseline: expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc != ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  // Operations that are Legal for every MSA integer vector type.
+  static const unsigned LegalOps[] = {
+      ISD::BITCAST, ISD::LOAD,  ISD::STORE,   ISD::INSERT_VECTOR_ELT,
+      ISD::ADD,     ISD::AND,   ISD::CTLZ,    ISD::CTPOP,
+      ISD::MUL,     ISD::OR,    ISD::SDIV,    ISD::SREM,
+      ISD::SHL,     ISD::SRA,   ISD::SRL,     ISD::SUB,
+      ISD::UDIV,    ISD::UREM,  ISD::VSELECT, ISD::XOR,
+      ISD::SETCC};
+  for (unsigned Opc : LegalOps)
+    setOperationAction(Opc, Ty, Legal);
+
+  // Operations that are routed through LowerOperation.
+  static const unsigned CustomOps[] = {ISD::EXTRACT_VECTOR_ELT,
+                                       ISD::BUILD_VECTOR,
+                                       ISD::VECTOR_SHUFFLE};
+  for (unsigned Opc : CustomOps)
+    setOperationAction(Opc, Ty, Custom);
+
+  // Int <-> FP conversions are only made Legal for the 32- and 64-bit
+  // element types.
+  if (Ty == MVT::v4i32 || Ty == MVT::v2i64) {
+    static const unsigned ConvOps[] = {ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+                                       ISD::SINT_TO_FP, ISD::UINT_TO_FP};
+    for (unsigned Opc : ConvOps)
+      setOperationAction(Opc, Ty, Legal);
+  }
+
+  // Condition codes that get expanded for SETCC on this type.
+  static const ISD::CondCode ExpandedCCs[] = {
+      ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT};
+  for (ISD::CondCode CC : ExpandedCCs)
+    setCondCodeAction(CC, Ty, Expand);
+}
+
+// Enable MSA support for the given floating-point type and Register class.
+//
+// Registers RC for Ty, defaults every builtin opcode to Expand, and re-enables
+// memory/bitcast/element operations. Arithmetic, VSELECT and SETCC are only
+// enabled when Ty is not v8f16.
+void MipsSETargetLowering::
+addMSAFloatType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Baseline: expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc != ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  // Operations enabled for every MSA floating-point vector type.
+  static const unsigned AlwaysLegalOps[] = {
+      ISD::LOAD, ISD::STORE, ISD::BITCAST, ISD::EXTRACT_VECTOR_ELT,
+      ISD::INSERT_VECTOR_ELT};
+  for (unsigned Opc : AlwaysLegalOps)
+    setOperationAction(Opc, Ty, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+  // v8f16 gets nothing beyond the operations above.
+  if (Ty == MVT::v8f16)
+    return;
+
+  static const unsigned ArithLegalOps[] = {
+      ISD::FABS, ISD::FADD,  ISD::FDIV,  ISD::FEXP2, ISD::FLOG2,   ISD::FMA,
+      ISD::FMUL, ISD::FRINT, ISD::FSQRT, ISD::FSUB,  ISD::VSELECT, ISD::SETCC};
+  for (unsigned Opc : ArithLegalOps)
+    setOperationAction(Opc, Ty, Legal);
+
+  // Condition codes that get expanded for SETCC on this type.
+  static const ISD::CondCode ExpandedCCs[] = {ISD::SETOGE, ISD::SETOGT,
+                                              ISD::SETUGE, ISD::SETUGT,
+                                              ISD::SETGE,  ISD::SETGT};
+  for (ISD::CondCode CC : ExpandedCCs)
+    setCondCodeAction(CC, Ty, Expand);
+}
+
+/// Reports whether a misaligned access of type \p VT is allowed.
+///
+/// The answer is yes when the subtarget reports system support for unaligned
+/// access, or when \p VT is i32 or i64. In those cases \p *Fast is set to
+/// true if \p Fast is non-null; otherwise \p *Fast is left untouched and
+/// false is returned. The two unnamed unsigned parameters are ignored.
+bool
+MipsSETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                     unsigned,
+                                                     unsigned,
+                                                     bool *Fast) const {
+  // Evaluated up front, before the subtarget query.
+  const MVT::SimpleValueType SimpleTy = VT.getSimpleVT().SimpleTy;
+
+  // MIPS32r6/MIPS64r6 is required to support unaligned access. It's
+  // implementation defined whether this is handled by hardware, software, or
+  // a hybrid of the two but it's expected that most implementations will
+  // handle the majority of cases in hardware.
+  const bool Allowed = Subtarget.systemSupportsUnalignedAccess() ||
+                       SimpleTy == MVT::i64 || SimpleTy == MVT::i32;
+  if (!Allowed)
+    return false;
+
+  if (Fast)
+    *Fast = true;
+  return true;
+}
+
+/// Dispatches custom lowering of \p Op by opcode.
+///
+/// Opcodes handled here are forwarded to the matching lower* helper; the
+/// multiply/divide family shares lowerMulDiv, parameterised by the MipsISD
+/// opcode and by which of the lo/hi results are wanted. Anything else is
+/// handed to MipsTargetLowering::LowerOperation.
+SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    return MipsTargetLowering::LowerOperation(Op, DAG);
+
+  case ISD::LOAD:
+    return lowerLOAD(Op, DAG);
+  case ISD::STORE:
+    return lowerSTORE(Op, DAG);
+
+  // lowerMulDiv(Op, NewOpc, HasLo, HasHi, DAG)
+  case ISD::SMUL_LOHI:
+    return lowerMulDiv(Op, MipsISD::Mult, true, true, DAG);
+  case ISD::UMUL_LOHI:
+    return lowerMulDiv(Op, MipsISD::Multu, true, true, DAG);
+  case ISD::MULHS:
+    return lowerMulDiv(Op, MipsISD::Mult, false, true, DAG);
+  case ISD::MULHU:
+    return lowerMulDiv(Op, MipsISD::Multu, false, true, DAG);
+  case ISD::MUL:
+    return lowerMulDiv(Op, MipsISD::Mult, true, false, DAG);
+  case ISD::SDIVREM:
+    return lowerMulDiv(Op, MipsISD::DivRem, true, true, DAG);
+  case ISD::UDIVREM:
+    return lowerMulDiv(Op, MipsISD::DivRemU, true, true, DAG);
+
+  case ISD::INTRINSIC_WO_CHAIN:
+    return lowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:
+    return lowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+    return lowerINTRINSIC_VOID(Op, DAG);
+
+  case ISD::EXTRACT_VECTOR_ELT:
+    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return lowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:
+    return lowerVECTOR_SHUFFLE(Op, DAG);
+  }
+}
+
+// selectMADD -
+// Rewrites a subgraph of CurDAG when it has the shape
+//   (addc multLo, Lo0), (adde multHi, Hi0)
+// where
+//   multLo/multHi: the two results of a single SMUL_LOHI/UMUL_LOHI node
+//   Lo0:           initial value of the Lo register
+//   Hi0:           initial value of the Hi register
+// The pair is replaced by MTLOHI + MAdd(u), with MFLO/MFHI feeding whichever
+// of the addc/adde results still have users.
+// Returns true if the pattern matched and the DAG was rewritten.
+static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
+  // Operand 2 of the ADDE must be the flag output of an ADDC node.
+  SDNode *ADDCNode = ADDENode->getOperand(2).getNode();
+  if (ADDCNode->getOpcode() != ISD::ADDC)
+    return false;
+
+  SDValue HiProduct = ADDENode->getOperand(0);
+  SDValue LoProduct = ADDCNode->getOperand(0);
+  SDNode *MulNode = HiProduct.getNode();
+  const unsigned MulOpc = HiProduct.getOpcode();
+
+  // Both halves have to come out of one and the same node ...
+  if (LoProduct.getNode() != MulNode)
+    return false;
+
+  // ... which has to be a widening multiply ...
+  const bool IsUnsigned = MulOpc == ISD::UMUL_LOHI;
+  if (!IsUnsigned && MulOpc != ISD::SMUL_LOHI)
+    return false;
+
+  // ... with the low half taken from result 0 and the high half from
+  // result 1.
+  if (LoProduct.getResNo() != 0 || HiProduct.getResNo() != 1)
+    return false;
+
+  // Only fold when the addc/adde pair are the sole users of the product, so
+  // that the multiply node goes away in later phases. With other users
+  // around we bail out here, and the multiply ends up as a single MULT
+  // instruction node instead of a MULT plus a MADD.
+  if (!(HiProduct.hasOneUse() && LoProduct.hasOneUse()))
+    return false;
+
+  SDLoc DL(ADDENode);
+
+  // Seed the accumulator with the two addends (Lo0, Hi0).
+  SDValue AccIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+                                  ADDCNode->getOperand(1),
+                                  ADDENode->getOperand(1));
+
+  // Build the MipsMAdd(u) node on top of it.
+  const unsigned MAddOpc = IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd;
+  SDValue MAdd = CurDAG->getNode(MAddOpc, DL, MVT::Untyped,
+                                 MulNode->getOperand(0), // Factor 0
+                                 MulNode->getOperand(1), // Factor 1
+                                 AccIn);
+
+  // Redirect the remaining users of the addc (low) and adde (high) results.
+  SDValue OldLo(ADDCNode, 0);
+  if (!OldLo.use_empty()) {
+    SDValue NewLo = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
+    CurDAG->ReplaceAllUsesOfValueWith(OldLo, NewLo);
+  }
+
+  SDValue OldHi(ADDENode, 0);
+  if (!OldHi.use_empty()) {
+    SDValue NewHi = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
+    CurDAG->ReplaceAllUsesOfValueWith(OldHi, NewHi);
+  }
+
+  return true;
+}
+
+// selectMSUB -
+// Transforms a subgraph in CurDAG if the following pattern is found:
+//  (subc Lo0, multLo), (sube Hi0, multHi),
+// where,
+//  multHi/Lo: product of multiplication
+//  Lo0: initial value of Lo register
+//  Hi0: initial value of Hi register
+// On a match the pair is replaced by MTLOHI + MSub(u), with MFLO/MFHI feeding
+// whichever of the subc/sube results still have users.
+// Return true if pattern matching was successful.
+static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
+  // SUBENode's operand 2 must be a flag output of an SUBC node in order
+  // for the matching to be successful.
+  SDNode *SUBCNode = SUBENode->getOperand(2).getNode();
+
+  if (SUBCNode->getOpcode() != ISD::SUBC)
+    return false;
+
+  // The product is the subtrahend, i.e. operand 1 of both subc and sube.
+  SDValue MultHi = SUBENode->getOperand(1);
+  SDValue MultLo = SUBCNode->getOperand(1);
+  SDNode *MultNode = MultHi.getNode();
+  unsigned MultOpc = MultHi.getOpcode();
+
+  // MultHi and MultLo must be generated by the same node,
+  if (MultLo.getNode() != MultNode)
+    return false;
+
+  // and it must be a multiplication.
+  if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
+    return false;
+
+  // MultLo and MultHi must be the first and second output of MultNode
+  // respectively.
+  if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
+    return false;
+
+  // Transform this to a MSUB only if SUBENode and SUBCNode are the only users
+  // of the values of MultNode, in which case MultNode will be removed in later
+  // phases.
+  // If there exist users other than SUBENode or SUBCNode, this function returns
+  // here, which will result in MultNode being mapped to a single MULT
+  // instruction node rather than a pair of MULT and MSUB instructions being
+  // produced.
+  if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
+    return false;
+
+  SDLoc DL(SUBENode);
+
+  // Initialize accumulator with the minuend (Lo0, Hi0).
+  SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+                                  SUBCNode->getOperand(0),
+                                  SUBENode->getOperand(0));
+
+  // create MipsSub(u) node
+  MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub;
+
+  // The result is an accumulator value consumed by MFLO/MFHI below, so it is
+  // typed MVT::Untyped exactly like the MAdd node built by selectMADD and
+  // like ACCIn above (it is not a glue result).
+  SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Untyped,
+                                 MultNode->getOperand(0),// Factor 0
+                                 MultNode->getOperand(1),// Factor 1
+                                 ACCIn);
+
+  // replace uses of sube and subc here
+  if (!SDValue(SUBCNode, 0).use_empty()) {
+    SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
+  }
+  if (!SDValue(SUBENode, 0).use_empty()) {
+    SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
+  }
+
+  return true;
+}
+
+// Target combine for ISD::ADDE.
+//
+// Once legalization has started, and only for i32 results on subtargets that
+// have MIPS32 but not MIPS32r6, tries to fold the node into a multiply-add
+// via selectMADD. Returns SDValue(N, 0) when selectMADD rewrote the DAG and
+// an empty SDValue otherwise.
+static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const MipsSubtarget &Subtarget) {
+  // All cheap preconditions are checked (short-circuiting, in the original
+  // order) before selectMADD gets a chance to modify the DAG.
+  const bool Eligible = !DCI.isBeforeLegalize() && Subtarget.hasMips32() &&
+                        !Subtarget.hasMips32r6() &&
+                        N->getValueType(0) == MVT::i32;
+
+  if (Eligible && selectMADD(N, &DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+// Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to zero extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::AND.
+// - Removes redundant zero extensions performed by an ISD::AND.
+//
+// Returns the replacement VEXTRACT_ZEXT_ELT node, or an empty SDValue when
+// nothing was folded (including whenever the subtarget lacks MSA).
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  if (!Subtarget.hasMSA())
+    return SDValue();
+
+  SDValue Extract = N->getOperand(0);
+  SDValue MaskOp = N->getOperand(1);
+  const unsigned ExtractOpc = Extract->getOpcode();
+
+  // (and (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d)
+  // where $d + 1 == 2^n and n == 32
+  // or    $d + 1 == 2^n and n <= 32 and ZExt
+  // -> (MipsVExtractZExt $a, $b, $c)
+  const bool IsZExtExtract = ExtractOpc == MipsISD::VEXTRACT_ZEXT_ELT;
+  if (!IsZExtExtract && ExtractOpc != MipsISD::VEXTRACT_SEXT_ELT)
+    return SDValue();
+
+  ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(MaskOp);
+  if (!Mask)
+    return SDValue();
+
+  const int32_t Log2IfPositive =
+      (Mask->getAPIntValue() + 1).exactLogBase2();
+  if (Log2IfPositive <= 0)
+    return SDValue(); // Mask+1 is not a power of 2
+
+  // Operand 2 of the extract carries the type it extends from.
+  SDValue ExtendVTOp = Extract->getOperand(2);
+  const unsigned ExtendBits =
+      cast<VTSDNode>(ExtendVTOp)->getVT().getSizeInBits();
+  const unsigned MaskBits = Log2IfPositive;
+
+  const bool MaskCoversZExt = IsZExtExtract && MaskBits >= ExtendBits;
+  if (!MaskCoversZExt && MaskBits != ExtendBits)
+    return SDValue();
+
+  SDValue Ops[] = { Extract->getOperand(0), Extract->getOperand(1),
+                    ExtendVTOp };
+  return DAG.getNode(MipsISD::VEXTRACT_ZEXT_ELT, SDLoc(Extract),
+                     Extract->getVTList(),
+                     makeArrayRef(Ops, Extract->getNumOperands()));
+}
+
+// Determine if the specified node is a constant vector splat.
+//
+// Returns true and sets Imm if:
+// * N is a ISD::BUILD_VECTOR representing a constant splat
+// Imm is left untouched when false is returned.
+//
+// This function is quite similar to MipsSEDAGToDAGISel::selectVSplat. The
+// differences are that it assumes the MSA has already been checked and the
+// arbitrary requirement for a maximum of 32-bit integers isn't applied (and
+// must not be in order for binsri.d to be selectable).
+static bool isVSplat(SDValue N, APInt &Imm, bool IsLittleEndian) {
+  BuildVectorSDNode *BuildVec = dyn_cast<BuildVectorSDNode>(N.getNode());
+  if (!BuildVec)
+    return false;
+
+  APInt Value, UndefBits;
+  unsigned BitSize;
+  bool AnyUndefs;
+
+  // Minimum splat size of 8 bits; isConstantSplat wants the big-endian flag.
+  const bool IsSplat = BuildVec->isConstantSplat(
+      Value, UndefBits, BitSize, AnyUndefs, 8, !IsLittleEndian);
+
+  if (IsSplat)
+    Imm = Value;
+
+  return IsSplat;
+}
+
+// Test whether the given node is an all-ones build_vector.
+//
+// A single ISD::BITCAST wrapped around the build_vector is looked through.
+// Endianness is irrelevant throughout because the value being looked for is
+// all-ones.
+static bool isVectorAllOnes(SDValue N) {
+  SDValue Src = N;
+  if (Src->getOpcode() == ISD::BITCAST)
+    Src = Src->getOperand(0);
+
+  BuildVectorSDNode *BuildVec = dyn_cast<BuildVectorSDNode>(Src);
+  if (!BuildVec)
+    return false;
+
+  APInt Value, UndefBits;
+  unsigned BitSize;
+  bool AnyUndefs;
+
+  return BuildVec->isConstantSplat(Value, UndefBits, BitSize, AnyUndefs) &&
+         Value.isAllOnesValue();
+}
+
+// Test whether N is the bitwise inverse of OfNode.
+//
+// N qualifies when it is an ISD::XOR of an all-ones vector with OfNode, in
+// either operand order. The first all-ones operand found (operand 0 is
+// checked before operand 1) decides the answer.
+static bool isBitwiseInverse(SDValue N, SDValue OfNode) {
+  if (N->getOpcode() != ISD::XOR)
+    return false;
+
+  for (unsigned Idx = 0; Idx != 2; ++Idx)
+    if (isVectorAllOnes(N->getOperand(Idx)))
+      return N->getOperand(1 - Idx) == OfNode;
+
+  return false;
+}
+
+// Perform combines where ISD::OR is the root node.
+//
+// Performs the following transformation:
+// - (or (and $a, $mask), (and $b, $inv_mask)) => (vselect $mask, $a, $b)
+//   where $inv_mask is the bitwise inverse of $mask and the 'or' has a 128-bit
+//   vector type.
+//
+// Returns an empty SDValue when the subtarget lacks MSA, the type is not a
+// 128-bit vector, or no mask/inverse-mask pair is found.
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget &Subtarget) {
+  if (!Subtarget.hasMSA())
+    return SDValue();
+
+  EVT Ty = N->getValueType(0);
+  if (!Ty.is128BitVector())
+    return SDValue();
+
+  SDValue LhsAnd = N->getOperand(0);
+  SDValue RhsAnd = N->getOperand(1);
+  if (LhsAnd->getOpcode() != ISD::AND || RhsAnd->getOpcode() != ISD::AND)
+    return SDValue();
+
+  SDValue LhsA = LhsAnd->getOperand(0);
+  SDValue LhsB = LhsAnd->getOperand(1);
+  SDValue RhsA = RhsAnd->getOperand(0);
+  SDValue RhsB = RhsAnd->getOperand(1);
+
+  // True on big-endian subtargets; forwarded unchanged to isVSplat.
+  const bool IsBigEndian = !Subtarget.isLittle();
+
+  SDValue IfSet, IfClr, Cond;
+  bool IsConstantMask = false;
+  APInt Mask, InvMask;
+
+  // With Mask already filled in by isVSplat, look for its bitwise inverse
+  // among the operands of the right-hand AND. Returns the operand paired with
+  // the inverse mask, or an empty SDValue if there is none. The bit widths
+  // are compared before the values are.
+  auto FindClrForMask = [&]() -> SDValue {
+    if (isVSplat(RhsA, InvMask, IsBigEndian) &&
+        Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+      return RhsB;
+    if (isVSplat(RhsB, InvMask, IsBigEndian) &&
+        Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+      return RhsA;
+    return SDValue();
+  };
+
+  // If LhsA is an appropriate mask, try to find its inverse in either RhsA
+  // or RhsB. IfClr is set only if a valid match is found.
+  if (isVSplat(LhsA, Mask, IsBigEndian)) {
+    Cond = LhsA;
+    IfSet = LhsB;
+    IfClr = FindClrForMask();
+    IsConstantMask = true;
+  }
+
+  // If that did not produce a match and LhsB is an appropriate mask, repeat
+  // the search using LhsB as the mask.
+  if (!IfClr.getNode() && isVSplat(LhsB, Mask, IsBigEndian)) {
+    Cond = LhsB;
+    IfSet = LhsA;
+    IfClr = FindClrForMask();
+    IsConstantMask = true;
+  }
+
+  // Still nothing: look for a non-constant mask, i.e. an operand of one AND
+  // that is the bitwise inverse of an operand of the other AND. The eight
+  // combinations are tried in a fixed order and the first hit wins.
+  if (!IfClr.getNode()) {
+    struct Candidate {
+      SDValue Inverted;        // operand tested for being an inverse
+      SDValue InvertedSibling; // the other operand of the same AND
+      SDValue Selector;        // operand that Inverted must be the inverse of
+      SDValue SelectorSibling; // the other operand of the selector's AND
+    };
+    const Candidate Candidates[] = {
+        {LhsA, LhsB, RhsA, RhsB}, {LhsB, LhsA, RhsA, RhsB},
+        {LhsA, LhsB, RhsB, RhsA}, {LhsB, LhsA, RhsB, RhsA},
+        {RhsA, RhsB, LhsA, LhsB}, {RhsB, RhsA, LhsA, LhsB},
+        {RhsA, RhsB, LhsB, LhsA}, {RhsB, RhsA, LhsB, LhsA},
+    };
+
+    for (const Candidate &C : Candidates) {
+      if (!isBitwiseInverse(C.Inverted, C.Selector))
+        continue;
+      Cond = C.Selector;
+      IfSet = C.SelectorSibling;
+      IfClr = C.InvertedSibling;
+      break;
+    }
+  }
+
+  // At this point, IfClr is set if and only if we have a valid match.
+  if (!IfClr.getNode())
+    return SDValue();
+
+  assert(Cond.getNode() && IfSet.getNode());
+
+  // Fold degenerate cases: an all-ones mask always picks IfSet and an
+  // all-zero mask always picks IfClr.
+  if (IsConstantMask) {
+    if (Mask.isAllOnesValue())
+      return IfSet;
+    if (Mask == 0)
+      return IfClr;
+  }
+
+  // Transform the DAG into an equivalent VSELECT.
+  return DAG.getNode(ISD::VSELECT, SDLoc(N), Ty, Cond, IfSet, IfClr);
+}
+
+// Combine for ISD::SUBE. Does nothing before legalization. Afterwards, for an
+// i32 result on a subtarget with MIPS32, it hands N to selectMSUB and returns
+// N itself when that call reports success; otherwise returns an empty SDValue.
+static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  const bool IsI32OnMips32 =
+      Subtarget.hasMips32() && N->getValueType(0) == MVT::i32;
+
+  if (IsI32OnMips32 && selectMSUB(N, &DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+// Builds a DAG computing X * C out of shifts, adds and subtracts.
+//
+// X       - the value being multiplied.
+// C       - the constant multiplier; only its low VT.getSizeInBits() bits are
+//           used.
+// DL, VT  - debug location and result type of the generated nodes.
+// ShiftTy - type used for the shift-amount constants.
+//
+// The constant is decomposed recursively around the nearest power of two:
+// either floor_c + (c - floor_c) or ceil_c - (ceil_c - c), whichever
+// remainder is smaller.
+static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT,
+                            EVT ShiftTy, SelectionDAG &DAG) {
+  // Clear the upper (64 - VT.sizeInBits) bits.
+  // NOTE(review): assumes VT is between 1 and 64 bits wide; otherwise the
+  // shift amount is out of range -- confirm against callers.
+  C &= ((uint64_t)-1) >> (64 - VT.getSizeInBits());
+
+  // Return 0.
+  if (C == 0)
+    return DAG.getConstant(0, DL, VT);
+
+  // Return x.
+  if (C == 1)
+    return X;
+
+  // If c is power of 2, return (shl x, log2(c)).
+  if (isPowerOf2_64(C))
+    return DAG.getNode(ISD::SHL, DL, VT, X,
+                       DAG.getConstant(Log2_64(C), DL, ShiftTy));
+
+  // Do the shifts in unsigned 64-bit arithmetic: the exponent can be 63, and
+  // shifting a signed 1 into the sign bit is not portable.
+  const uint64_t One = 1;
+  unsigned Log2Ceil = Log2_64_Ceil(C);
+  uint64_t Floor = One << Log2_64(C);
+  // 2^64 is not representable; it wraps to 0, which keeps the modular
+  // arithmetic below consistent.
+  uint64_t Ceil = (Log2Ceil == 64) ? 0 : (One << Log2Ceil);
+
+  // If |c - floor_c| <= |c - ceil_c|,
+  // where floor_c = pow(2, floor(log2(c))) and ceil_c = pow(2, ceil(log2(c))),
+  // return (add constMult(x, floor_c), constMult(x, c - floor_c)).
+  if (C - Floor <= Ceil - C) {
+    SDValue Op0 = genConstMult(X, Floor, DL, VT, ShiftTy, DAG);
+    SDValue Op1 = genConstMult(X, C - Floor, DL, VT, ShiftTy, DAG);
+    return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
+  }
+
+  // If |c - floor_c| > |c - ceil_c|,
+  // return (sub constMult(x, ceil_c), constMult(x, ceil_c - c)).
+  SDValue Op0 = genConstMult(X, Ceil, DL, VT, ShiftTy, DAG);
+  SDValue Op1 = genConstMult(X, Ceil - C, DL, VT, ShiftTy, DAG);
+  return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
+}
+
+// Combine for ISD::MUL. A scalar multiply whose second operand is a constant
+// is replaced by the sequence built by genConstMult. In every other case the
+// node N itself is returned.
+static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
+                                 const TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSETargetLowering *TL) {
+  EVT VT = N->getValueType(0);
+  ConstantSDNode *Multiplier = dyn_cast<ConstantSDNode>(N->getOperand(1));
+
+  // Only scalar multiplies by a constant are rewritten.
+  if (!Multiplier || VT.isVector())
+    return SDValue(N, 0);
+
+  return genConstMult(N->getOperand(0), Multiplier->getZExtValue(), SDLoc(N),
+                      VT, TL->getScalarShiftAmountTy(DAG.getDataLayout(), VT),
+                      DAG);
+}
+
+// Rewrites a vector shift N whose shift amount (operand 1) is a constant
+// splat into the node Opc, taking the shift amount as an i32 constant.
+//
+// Returns an empty SDValue unless the subtarget has DSP, operand 1 is a
+// BUILD_VECTOR that is a constant splat exactly one element wide, and the
+// splat value is smaller than the element size in bits.
+static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
+                                      SelectionDAG &DAG,
+                                      const MipsSubtarget &Subtarget) {
+  if (!Subtarget.hasDSP())
+    return SDValue();
+
+  // See if this is a vector splat immediate node.
+  BuildVectorSDNode *Splat = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  if (!Splat)
+    return SDValue();
+
+  const unsigned EltBits = Ty.getScalarSizeInBits();
+  APInt SplatVal, UndefBits;
+  unsigned SplatBits;
+  bool AnyUndefs;
+
+  if (!Splat->isConstantSplat(SplatVal, UndefBits, SplatBits, AnyUndefs,
+                              EltBits, !Subtarget.isLittle()))
+    return SDValue();
+
+  // The splat must be exactly one element wide.
+  if (SplatBits != EltBits)
+    return SDValue();
+
+  // The shift amount must be smaller than the element width.
+  if (SplatVal.getZExtValue() >= EltBits)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue Amount = DAG.getConstant(SplatVal.getZExtValue(), DL, MVT::i32);
+  return DAG.getNode(Opc, DL, Ty, N->getOperand(0), Amount);
+}
+
+// Combine for ISD::SHL: v2i16 and v4i8 shifts are offered to
+// performDSPShiftCombine as MipsISD::SHLL_DSP; other types are left alone.
+static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+  const bool IsDSPVector = (Ty == MVT::v2i16) || (Ty == MVT::v4i8);
+
+  if (!IsDSPVector)
+    return SDValue();
+
+  return performDSPShiftCombine(MipsISD::SHLL_DSP, N, Ty, DAG, Subtarget);
+}
+
+// Fold sign-extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT for MSA and fold
+// constant splats into MipsISD::SHRA_DSP for DSPr2.
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to sign extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::SRA and ISD::SHL nodes.
+// - Removes redundant sign extensions performed by an ISD::SRA and ISD::SHL
+//   sequence.
+//
+// See performDSPShiftCombine for more information about the transformation
+// used for DSPr2.
+static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if (Subtarget.hasMSA()) {
+    SDValue Shl = N->getOperand(0);
+    SDValue SraAmount = N->getOperand(1);
+
+    // (sra (shl (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d), imm:$d)
+    // where $d + sizeof($c) == 32
+    // or $d + sizeof($c) <= 32 and SExt
+    // -> (MipsVExtractSExt $a, $b, $c)
+    if (Shl->getOpcode() == ISD::SHL && SraAmount == Shl->getOperand(1)) {
+      ConstantSDNode *ShAmount = dyn_cast<ConstantSDNode>(SraAmount);
+      if (!ShAmount)
+        return SDValue();
+
+      SDValue Extract = Shl->getOperand(0);
+      const bool IsSExt = Extract->getOpcode() == MipsISD::VEXTRACT_SEXT_ELT;
+      const bool IsZExt = Extract->getOpcode() == MipsISD::VEXTRACT_ZEXT_ELT;
+      if (!IsSExt && !IsZExt)
+        return SDValue();
+
+      // Operand 2 of the extract node carries the type being extended from.
+      EVT ExtendTy = cast<VTSDNode>(Extract->getOperand(2))->getVT();
+      unsigned TotalBits = ShAmount->getZExtValue() + ExtendTy.getSizeInBits();
+
+      if (TotalBits == 32 || (IsSExt && TotalBits <= 32)) {
+        SDValue Ops[] = { Extract->getOperand(0), Extract->getOperand(1),
+                          Extract->getOperand(2) };
+        return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, SDLoc(Extract),
+                           Extract->getVTList(),
+                           makeArrayRef(Ops, Extract->getNumOperands()));
+      }
+      // Otherwise fall through to the DSP handling below.
+    }
+  }
+
+  // DSP path: v2i16 is always eligible, v4i8 only with DSPr2.
+  const bool IsDSPShift =
+      (Ty == MVT::v2i16) || ((Ty == MVT::v4i8) && Subtarget.hasDSPR2());
+  if (!IsDSPShift)
+    return SDValue();
+
+  return performDSPShiftCombine(MipsISD::SHRA_DSP, N, Ty, DAG, Subtarget);
+}
+
+
+// Combine for ISD::SRL: v4i8 shifts, and v2i16 shifts when the subtarget has
+// DSPr2, are offered to performDSPShiftCombine as MipsISD::SHRL_DSP.
+static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+  const bool IsDSPShift =
+      ((Ty == MVT::v2i16) && Subtarget.hasDSPR2()) || (Ty == MVT::v4i8);
+
+  if (!IsDSPShift)
+    return SDValue();
+
+  return performDSPShiftCombine(MipsISD::SHRL_DSP, N, Ty, DAG, Subtarget);
+}
+
+// Reports whether condition code CC is accepted for a DSP compare of type Ty:
+// equality codes always, signed ordering codes only for v2i16, unsigned
+// ordering codes only for every other type.
+static bool isLegalDSPCondCode(EVT Ty, ISD::CondCode CC) {
+  const bool IsV2I16 = (Ty == MVT::v2i16);
+
+  switch (CC) {
+  case ISD::SETEQ:
+  case ISD::SETNE:
+    return true;
+
+  // Signed comparisons.
+  case ISD::SETLT:
+  case ISD::SETLE:
+  case ISD::SETGT:
+  case ISD::SETGE:
+    return IsV2I16;
+
+  // Unsigned comparisons.
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    return !IsV2I16;
+
+  default:
+    break;
+  }
+
+  return false;
+}
+
+// Combine for ISD::SETCC: a v2i16 or v4i8 compare whose condition code passes
+// isLegalDSPCondCode becomes MipsISD::SETCC_DSP with the same three operands.
+static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT Ty = N->getValueType(0);
+  const bool IsDSPVector = (Ty == MVT::v2i16) || (Ty == MVT::v4i8);
+  if (!IsDSPVector)
+    return SDValue();
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (!isLegalDSPCondCode(Ty, CC))
+    return SDValue();
+
+  return DAG.getNode(MipsISD::SETCC_DSP, SDLoc(N), Ty, N->getOperand(0),
+                     N->getOperand(1), N->getOperand(2));
+}
+
+// Combine for ISD::VSELECT.
+//
+// For 128-bit integer vectors, a select fed by a less-than style SETCC over
+// the same two values becomes a vector min/max node. For v2i16/v4i8, a select
+// fed by MipsISD::SETCC_DSP becomes MipsISD::SELECT_CC_DSP.
+static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT Ty = N->getValueType(0);
+
+  if (Ty.is128BitVector() && Ty.isInteger()) {
+    // Try the following combines:
+    //   (vselect (setcc $a, $b, SETLT), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLT), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $a, $b)) -> (vumin $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $a, $b)) -> (vumin $a, $b)
+    // SETGT/SETGE/SETUGT/SETUGE variants of these will show up initially but
+    // will be expanded to equivalent SETLT/SETLE/SETULT/SETULE versions by the
+    // legalizer.
+    SDValue Compare = N->getOperand(0);
+    if (Compare->getOpcode() != ISD::SETCC)
+      return SDValue();
+
+    bool Signed;
+    switch (cast<CondCodeSDNode>(Compare->getOperand(2))->get()) {
+    case ISD::SETLT:
+    case ISD::SETLE:
+      Signed = true;
+      break;
+    case ISD::SETULT:
+    case ISD::SETULE:
+      Signed = false;
+      break;
+    default:
+      return SDValue();
+    }
+
+    SDValue IfTrue = N->getOperand(1);
+    SDValue IfFalse = N->getOperand(2);
+    SDValue CmpLhs = Compare->getOperand(0);
+    SDValue CmpRhs = Compare->getOperand(1);
+
+    // Selecting the smaller operand is a min, selecting the larger is a max.
+    if (IfTrue == CmpLhs && IfFalse == CmpRhs)
+      return DAG.getNode(Signed ? MipsISD::VSMIN : MipsISD::VUMIN, SDLoc(N),
+                         Ty, IfTrue, IfFalse);
+    if (IfTrue == CmpRhs && IfFalse == CmpLhs)
+      return DAG.getNode(Signed ? MipsISD::VSMAX : MipsISD::VUMAX, SDLoc(N),
+                         Ty, IfTrue, IfFalse);
+
+    return SDValue();
+  }
+
+  if ((Ty == MVT::v2i16) || (Ty == MVT::v4i8)) {
+    SDValue SetCC = N->getOperand(0);
+    if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
+      return SDValue();
+
+    // Operands: compare lhs, compare rhs, true value, false value, cond code.
+    return DAG.getNode(MipsISD::SELECT_CC_DSP, SDLoc(N), Ty,
+                       SetCC.getOperand(0), SetCC.getOperand(1),
+                       N->getOperand(1), N->getOperand(2), SetCC.getOperand(2));
+  }
+
+  return SDValue();
+}
+
+// Combine for ISD::XOR on 128-bit integer vectors when MSA is available:
+// an XOR of an OR with an all-ones build_vector becomes MipsISD::VNOR of the
+// OR's operands.
+static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if (!Subtarget.hasMSA() || !Ty.is128BitVector() || !Ty.isInteger())
+    return SDValue();
+
+  // Try the following combines:
+  //   (xor (or $a, $b), (build_vector allones))
+  //   (xor (or $a, $b), (bitcast (build_vector allones)))
+  SDValue Lhs = N->getOperand(0);
+  SDValue Rhs = N->getOperand(1);
+  SDValue Inverted;
+
+  // Whichever operand is the all-ones vector, the other is being inverted.
+  if (ISD::isBuildVectorAllOnes(Lhs.getNode()))
+    Inverted = Rhs;
+  else if (ISD::isBuildVectorAllOnes(Rhs.getNode()))
+    Inverted = Lhs;
+  else
+    return SDValue();
+
+  if (Inverted->getOpcode() != ISD::OR)
+    return SDValue();
+
+  return DAG.getNode(MipsISD::VNOR, SDLoc(N), Ty, Inverted->getOperand(0),
+                     Inverted->getOperand(1));
+}
+
+// Dispatches N to the opcode-specific combine above.
+//
+// ADDE, SUBE, MUL, SHL, SRA, SRL and VSELECT return their helper's result
+// directly, even when it is empty. AND, OR, XOR and SETCC fall back to
+// MipsTargetLowering::PerformDAGCombine when their helper produced nothing,
+// as does every opcode not listed here.
+SDValue
+MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Combined;
+
+  switch (N->getOpcode()) {
+  // Helpers whose result is final.
+  case ISD::ADDE:
+    return performADDECombine(N, DAG, DCI, Subtarget);
+  case ISD::SUBE:
+    return performSUBECombine(N, DAG, DCI, Subtarget);
+  case ISD::MUL:
+    return performMULCombine(N, DAG, DCI, this);
+  case ISD::SHL:
+    return performSHLCombine(N, DAG, DCI, Subtarget);
+  case ISD::SRA:
+    return performSRACombine(N, DAG, DCI, Subtarget);
+  case ISD::SRL:
+    return performSRLCombine(N, DAG, DCI, Subtarget);
+  case ISD::VSELECT:
+    return performVSELECTCombine(N, DAG);
+
+  // Helpers that may defer to the base class.
+  case ISD::AND:
+    Combined = performANDCombine(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::OR:
+    Combined = performORCombine(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::XOR:
+    Combined = performXORCombine(N, DAG, Subtarget);
+    break;
+  case ISD::SETCC:
+    Combined = performSETCCCombine(N, DAG);
+    break;
+  }
+
+  if (!Combined.getNode())
+    return MipsTargetLowering::PerformDAGCombine(N, DCI);
+
+  DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
+        N->printrWithDepth(dbgs(), &DAG);
+        dbgs() << "\n=> \n";
+        Combined.getNode()->printrWithDepth(dbgs(), &DAG);
+        dbgs() << "\n");
+  return Combined;
+}
+
+// Expands the pseudo instruction MI by dispatching on its opcode to the
+// matching emit* helper. Opcodes not listed here are forwarded to
+// MipsTargetLowering::EmitInstrWithCustomInserter.
+MachineBasicBlock *
+MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                  MachineBasicBlock *BB) const {
+  switch (MI.getOpcode()) {
+  case Mips::BPOSGE32_PSEUDO:
+    return emitBPOSGE32(MI, BB);
+
+  // SNZ_* pseudos are emitted with the corresponding BNZ_* opcode.
+  case Mips::SNZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_B);
+  case Mips::SNZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_H);
+  case Mips::SNZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_W);
+  case Mips::SNZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_D);
+  case Mips::SNZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_V);
+
+  // SZ_* pseudos are emitted with the corresponding BZ_* opcode.
+  case Mips::SZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_B);
+  case Mips::SZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_H);
+  case Mips::SZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_W);
+  case Mips::SZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_D);
+  case Mips::SZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_V);
+
+  // Floating-point element copy and insert.
+  case Mips::COPY_FW_PSEUDO:
+    return emitCOPY_FW(MI, BB);
+  case Mips::COPY_FD_PSEUDO:
+    return emitCOPY_FD(MI, BB);
+  case Mips::INSERT_FW_PSEUDO:
+    return emitINSERT_FW(MI, BB);
+  case Mips::INSERT_FD_PSEUDO:
+    return emitINSERT_FD(MI, BB);
+
+  // Variable-index inserts. The integer argument is 1, 2, 4 or 8 for the
+  // B, H, W/FW and D/FD forms; the flag is true only for the FW/FD forms.
+  case Mips::INSERT_B_VIDX_PSEUDO:
+  case Mips::INSERT_B_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 1, false);
+  case Mips::INSERT_H_VIDX_PSEUDO:
+  case Mips::INSERT_H_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 2, false);
+  case Mips::INSERT_W_VIDX_PSEUDO:
+  case Mips::INSERT_W_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, false);
+  case Mips::INSERT_D_VIDX_PSEUDO:
+  case Mips::INSERT_D_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, false);
+  case Mips::INSERT_FW_VIDX_PSEUDO:
+  case Mips::INSERT_FW_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, true);
+  case Mips::INSERT_FD_VIDX_PSEUDO:
+  case Mips::INSERT_FD_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, true);
+
+  case Mips::FILL_FW_PSEUDO:
+    return emitFILL_FW(MI, BB);
+  case Mips::FILL_FD_PSEUDO:
+    return emitFILL_FD(MI, BB);
+  case Mips::FEXP2_W_1_PSEUDO:
+    return emitFEXP2_W_1(MI, BB);
+  case Mips::FEXP2_D_1_PSEUDO:
+    return emitFEXP2_D_1(MI, BB);
+
+  // f16 store and load.
+  case Mips::ST_F16:
+    return emitST_F16_PSEUDO(MI, BB);
+  case Mips::LD_F16:
+    return emitLD_F16_PSEUDO(MI, BB);
+
+  // FP extend/round; the flag is false for the _W forms, true for _D.
+  case Mips::MSA_FP_EXTEND_W_PSEUDO:
+    return emitFPEXTEND_PSEUDO(MI, BB, false);
+  case Mips::MSA_FP_ROUND_W_PSEUDO:
+    return emitFPROUND_PSEUDO(MI, BB, false);
+  case Mips::MSA_FP_EXTEND_D_PSEUDO:
+    return emitFPEXTEND_PSEUDO(MI, BB, true);
+  case Mips::MSA_FP_ROUND_D_PSEUDO:
+    return emitFPROUND_PSEUDO(MI, BB, true);
+
+  default:
+    return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  }
+}
+
+// Decides whether a call may be emitted as a tail call.
+//
+// Tail calls are rejected when UseMipsTailCalls is off, when the caller is an
+// interrupt service routine (the exception has to be cleared with eret), or
+// when either the callee or the caller has a byval argument. Otherwise the
+// call qualifies if the callee's argument area is no larger than the
+// caller's.
+bool MipsSETargetLowering::isEligibleForTailCallOptimization(
+    const CCState &CCInfo, unsigned NextStackOffset,
+    const MipsFunctionInfo &FI) const {
+  const bool Disqualified = !UseMipsTailCalls || FI.isISR() ||
+                            CCInfo.getInRegsParamsCount() > 0 ||
+                            FI.hasByvalArg();
+
+  return !Disqualified && NextStackOffset <= FI.getIncomingArgSize();
+}
+
+// Starts the call operand list with the callee, then delegates to
+// MipsTargetLowering::getOpndList with all arguments unchanged.
+void MipsSETargetLowering::getOpndList(
+    SmallVectorImpl<SDValue> &Ops,
+    std::deque<std::pair<unsigned, SDValue>> &RegsToPass, bool IsPICCall,
+    bool GlobalOrExternal, bool InternalLinkage, bool IsCallReloc,
+    CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  // The callee must be in place before the base class is called.
+  Ops.push_back(Callee);
+
+  MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall,
+                                  GlobalOrExternal, InternalLinkage,
+                                  IsCallReloc, CLI, Callee, Chain);
+}
+
+// Lowers a load. An f64 load with NoDPLoadStore set is split into two i32
+// loads combined by MipsISD::BuildPairF64; everything else is forwarded to
+// MipsTargetLowering::lowerLOAD.
+//
+// Returns the merged pair (loaded value, output chain).
+SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode &Nd = *cast<LoadSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerLOAD(Op, DAG);
+
+  // Replace a double precision load with two i32 loads and a buildpair64.
+  SDLoc DL(Op);
+  SDValue Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+  EVT PtrVT = Ptr.getValueType();
+
+  // i32 load from lower address.
+  SDValue Lo = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo(),
+                           Nd.getAlignment(), Nd.getMemOperand()->getFlags());
+
+  // i32 load from higher address, chained after the first load. The second
+  // word is only guaranteed 4-byte alignment.
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, DL, PtrVT));
+  SDValue Hi = DAG.getLoad(
+      MVT::i32, DL, Lo.getValue(1), Ptr, MachinePointerInfo(),
+      std::min(Nd.getAlignment(), 4U), Nd.getMemOperand()->getFlags());
+
+  // The second load's chain result is ordered after both memory accesses.
+  // Capture it before the big-endian swap below; afterwards "Hi" names the
+  // first load, whose chain would not cover the second one.
+  SDValue OutChain = Hi.getValue(1);
+
+  if (!Subtarget.isLittle())
+    std::swap(Lo, Hi);
+
+  SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
+  SDValue Ops[2] = {BP, OutChain};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+// Lowers a store. An f64 store with NoDPLoadStore set is split into two
+// MipsISD::ExtractElementF64 nodes and two chained i32 stores; everything
+// else is forwarded to MipsTargetLowering::lowerSTORE.
+SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode &Nd = *cast<StoreSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerSTORE(Op, DAG);
+
+  // Replace a double precision store with two extractelement64s and i32 stores.
+  SDLoc DL(Op);
+  SDValue Stored = Nd.getValue();
+  SDValue Addr = Nd.getBasePtr();
+  SDValue StoreChain = Nd.getChain();
+  EVT AddrVT = Addr.getValueType();
+
+  SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Stored,
+                           DAG.getConstant(0, DL, MVT::i32));
+  SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Stored,
+                           DAG.getConstant(1, DL, MVT::i32));
+
+  // On big-endian subtargets element 1 goes to the lower address.
+  if (!Subtarget.isLittle())
+    std::swap(Lo, Hi);
+
+  // i32 store to lower address.
+  StoreChain = DAG.getStore(StoreChain, DL, Lo, Addr, MachinePointerInfo(),
+                            Nd.getAlignment(), Nd.getMemOperand()->getFlags(),
+                            Nd.getAAInfo());
+
+  // i32 store to higher address, chained after the first store. The second
+  // word is only guaranteed 4-byte alignment.
+  SDValue HighAddr =
+      DAG.getNode(ISD::ADD, DL, AddrVT, Addr, DAG.getConstant(4, DL, AddrVT));
+  return DAG.getStore(StoreChain, DL, Hi, HighAddr, MachinePointerInfo(),
+                      std::min(Nd.getAlignment(), 4U),
+                      Nd.getMemOperand()->getFlags(), Nd.getAAInfo());
+}
+
+// Lowers a multiply/divide to the accumulator-based node NewOpc and reads the
+// requested halves back with MipsISD::MFLO / MipsISD::MFHI.
+//
+// HasLo / HasHi select which halves are produced. With both set the result is
+// the merged pair (Lo, Hi); with only one set it is that single value.
+SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
+                                          bool HasLo, bool HasHi,
+                                          SelectionDAG &DAG) const {
+  // MIPS32r6/MIPS64r6 removed accumulator based multiplies.
+  assert(!Subtarget.hasMips32r6());
+
+  SDLoc DL(Op);
+  EVT Ty = Op.getOperand(0).getValueType();
+  SDValue Acc = DAG.getNode(NewOpc, DL, MVT::Untyped, Op.getOperand(0),
+                            Op.getOperand(1));
+
+  SDValue Lo = HasLo ? DAG.getNode(MipsISD::MFLO, DL, Ty, Acc) : SDValue();
+  SDValue Hi = HasHi ? DAG.getNode(MipsISD::MFHI, DL, Ty, Acc) : SDValue();
+
+  if (HasLo && HasHi) {
+    SDValue Halves[] = { Lo, Hi };
+    return DAG.getMergeValues(Halves, DL);
+  }
+
+  return HasLo ? Lo : Hi;
+}
+
+// Splits In into elements 0 and 1 as i32 values (ISD::EXTRACT_ELEMENT) and
+// combines them into an untyped MipsISD::MTLOHI node.
+static SDValue initAccumulator(SDValue In, const SDLoc &DL, SelectionDAG &DAG) {
+  SDValue LoIdx = DAG.getConstant(0, DL, MVT::i32);
+  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In, LoIdx);
+
+  SDValue HiIdx = DAG.getConstant(1, DL, MVT::i32);
+  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In, HiIdx);
+
+  return DAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, LoHalf, HiHalf);
+}
+
+// Reads Op back through MipsISD::MFLO and MipsISD::MFHI as two i32 values and
+// pairs them into a single i64 with ISD::BUILD_PAIR (lo first, then hi).
+static SDValue extractLOHI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG) {
+  SDValue LoHalf = DAG.getNode(MipsISD::MFLO, DL, MVT::i32, Op);
+  SDValue HiHalf = DAG.getNode(MipsISD::MFHI, DL, MVT::i32, Op);
+
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoHalf, HiHalf);
+}
+
+// This function expands mips intrinsic nodes which have 64-bit input operands
+// or output values.
+//
+// out64 = intrinsic-node in64
+// =>
+// lo = copy (extract-element (in64, 0))
+// hi = copy (extract-element (in64, 1))
+// mips-specific-node
+// v0 = copy lo
+// v1 = copy hi
+// out64 = merge-values (v0, v1)
+//
+// Opc is the opcode of the mips-specific node to create. When the intrinsic
+// has a chain input, the result is the merged pair (value, chain).
+static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+  SDLoc DL(Op);
+  const bool HasChainIn = Op->getOperand(0).getValueType() == MVT::Other;
+  SmallVector<SDValue, 3> NewOps;
+  unsigned OpNo = 0;
+
+  // A chain input, if present, stays first in the new operand list.
+  if (HasChainIn) {
+    NewOps.push_back(Op->getOperand(OpNo));
+    ++OpNo;
+  }
+
+  // The next operand is the intrinsic opcode; it is not forwarded.
+  assert(Op->getOperand(OpNo).getOpcode() == ISD::TargetConstant);
+  ++OpNo;
+
+  // An i64 first argument is loaded into the accumulator and appended last;
+  // any other first argument is forwarded in place.
+  SDValue FirstArg = Op->getOperand(OpNo);
+  SDValue In64;
+  if (FirstArg.getValueType() == MVT::i64)
+    In64 = initAccumulator(FirstArg, DL, DAG);
+  else
+    NewOps.push_back(FirstArg);
+
+  // Push the remaining operands.
+  for (unsigned I = OpNo + 1; I < Op->getNumOperands(); ++I)
+    NewOps.push_back(Op->getOperand(I));
+
+  // Add In64 to the end of the list.
+  if (In64.getNode())
+    NewOps.push_back(In64);
+
+  // Result types: i64 results become Untyped, everything else is kept.
+  SmallVector<EVT, 2> ResTys;
+  SDNode::value_iterator TyIt = Op->value_begin();
+  const SDNode::value_iterator TyEnd = Op->value_end();
+  while (TyIt != TyEnd) {
+    ResTys.push_back((*TyIt == MVT::i64) ? MVT::Untyped : *TyIt);
+    ++TyIt;
+  }
+
+  // Create node, converting an Untyped first result back into an i64.
+  SDValue Val = DAG.getNode(Opc, DL, ResTys, NewOps);
+  SDValue Out = Val;
+  if (ResTys[0] == MVT::Untyped)
+    Out = extractLOHI(Val, DL, DAG);
+
+  if (!HasChainIn)
+    return Out;
+
+  assert(Val->getValueType(1) == MVT::Other);
+  SDValue Merged[] = { Out, SDValue(Val.getNode(), 1) };
+  return DAG.getMergeValues(Merged, DL);
+}
+
+// Lower an MSA copy intrinsic into the specified SelectionDAG node.
+//
+// The new node takes the vector (operand 1), the index (operand 2) and a
+// value-type operand naming the vector's element type.
+static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+  SDLoc DL(Op);
+  SDValue Vector = Op->getOperand(1);
+  SDValue Index = Op->getOperand(2);
+  EVT ElementTy = Vector->getValueType(0).getVectorElementType();
+
+  return DAG.getNode(Opc, DL, Op->getValueType(0), Vector, Index,
+                     DAG.getValueType(ElementTy));
+}
+
+// Builds a vector of Op's result type in which every lane holds operand OpNr
+// of Op. For v2i64 the vector is built as v4i32, pairing each copy of the
+// operand with a zero i32 constant, and then bitcast to v2i64.
+//
+// OpNr - index of the operand of Op to splat. It used to be ignored in favour
+//        of a hard-coded operand 2.
+//        NOTE(review): callers are not visible here; confirm they all pass 2,
+//        in which case behaviour is unchanged.
+static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
+  EVT ResVecTy = Op->getValueType(0);
+  EVT ViaVecTy = ResVecTy;
+  SDLoc DL(Op);
+
+  // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and
+  // LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating
+  // lanes.
+  SDValue LaneA;
+  SDValue LaneB = Op->getOperand(OpNr);
+
+  if (ResVecTy == MVT::v2i64) {
+    LaneA = DAG.getConstant(0, DL, MVT::i32);
+    ViaVecTy = MVT::v4i32;
+  } else
+    LaneA = LaneB;
+
+  // Enough alternating entries for the widest case; only the first
+  // ViaVecTy.getVectorNumElements() of them are used.
+  SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
+                      LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
+
+  SDValue Result = DAG.getBuildVector(
+      ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
+
+  if (ViaVecTy != ResVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
+
+  return Result;
+}
+
+// Turns the constant operand ImmOp of Op into a constant of Op's result type.
+static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) {
+  EVT ResTy = Op->getValueType(0);
+  return DAG.getConstant(Op->getConstantOperandVal(ImmOp), SDLoc(Op), ResTy);
+}
+
+// Builds a vector of type VecTy in which every element is SplatValue.
+//
+// A v2i64 result is assembled as a v4i32 BUILD_VECTOR from the two 32-bit
+// halves of SplatValue and then bitcast; BigEndian selects the order in which
+// the halves are laid out.
+static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
+                                   bool BigEndian, SelectionDAG &DAG) {
+  SDLoc DL(SplatValue);
+  EVT ViaVecTy = VecTy;
+  SDValue EvenLane = SplatValue;
+  SDValue OddLane = SplatValue;
+
+  if (VecTy == MVT::v2i64) {
+    // v2i64 BUILD_VECTOR must be performed via v4i32 so split into i32's.
+    ViaVecTy = MVT::v4i32;
+
+    EvenLane = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValue);
+    SDValue UpperHalf = DAG.getNode(ISD::SRL, DL, MVT::i64, SplatValue,
+                                    DAG.getConstant(32, DL, MVT::i32));
+    OddLane = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperHalf);
+  }
+
+  // We currently hold the parts in little endian order. Swap them if
+  // necessary.
+  if (BigEndian)
+    std::swap(EvenLane, OddLane);
+
+  // Enough alternating entries for the widest case; only the first
+  // ViaVecTy.getVectorNumElements() of them are used.
+  SDValue Lanes[16] = { EvenLane, OddLane, EvenLane, OddLane,
+                        EvenLane, OddLane, EvenLane, OddLane,
+                        EvenLane, OddLane, EvenLane, OddLane,
+                        EvenLane, OddLane, EvenLane, OddLane };
+
+  SDValue Splat = DAG.getBuildVector(
+      ViaVecTy, DL, makeArrayRef(Lanes, ViaVecTy.getVectorNumElements()));
+
+  if (VecTy == ViaVecTy)
+    return Splat;
+
+  return DAG.getNode(ISD::BITCAST, DL, VecTy, Splat);
+}
+
+// Lowers an MSA "bit immediate" intrinsic to (Opc $operand1, splat(1 << Imm)).
+//
+// Opc       - the binary node to create.
+// Imm       - the bit number.
+// BigEndian - selects the 32-bit lane order when a v2i64 value is assembled
+//             from i32 halves.
+static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
+                                        unsigned Opc, SDValue Imm,
+                                        bool BigEndian) {
+  EVT VecTy = Op->getValueType(0);
+  SDLoc DL(Op);
+  SDValue BitVec;
+
+  // The DAG Combiner can't constant fold bitcasted vectors yet so we must do it
+  // here for now.
+  ConstantSDNode *CImm =
+      (VecTy == MVT::v2i64) ? dyn_cast<ConstantSDNode>(Imm) : nullptr;
+
+  if (CImm) {
+    APInt BitImm = APInt(64, 1) << CImm->getAPIntValue();
+
+    SDValue HiHalf = DAG.getConstant(BitImm.lshr(32).trunc(32), DL, MVT::i32);
+    SDValue LoHalf = DAG.getConstant(BitImm.trunc(32), DL, MVT::i32);
+
+    if (BigEndian)
+      std::swap(LoHalf, HiHalf);
+
+    BitVec = DAG.getNode(
+        ISD::BITCAST, DL, MVT::v2i64,
+        DAG.getBuildVector(MVT::v4i32, DL, {LoHalf, HiHalf, LoHalf, HiHalf}));
+  } else {
+    // We couldn't constant fold, do a vector shift instead.
+
+    // Extend i32 to i64 if necessary. Sign or zero extend doesn't matter since
+    // only values 0-63 are valid.
+    if (VecTy == MVT::v2i64)
+      Imm = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Imm);
+
+    SDValue ShiftAmounts = getBuildVectorSplat(VecTy, Imm, BigEndian, DAG);
+    BitVec = DAG.getNode(ISD::SHL, DL, VecTy, DAG.getConstant(1, DL, VecTy),
+                         ShiftAmounts);
+  }
+
+  return DAG.getNode(Opc, DL, VecTy, Op->getOperand(1), BitVec);
+}
+
+// Lowers an MSA bit-clear intrinsic to
+// (and $operand1, (not (shl 1, $operand2))).
+static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT VecTy = Op->getValueType(0);
+
+  // Mask with only the selected bit set.
+  SDValue Ones = DAG.getConstant(1, DL, VecTy);
+  SDValue SelectedBit =
+      DAG.getNode(ISD::SHL, DL, VecTy, Ones, Op->getOperand(2));
+
+  // Invert it so that every bit except the selected one survives the AND.
+  SDValue KeepMask = DAG.getNOT(DL, SelectedBit, VecTy);
+  return DAG.getNode(ISD::AND, DL, VecTy, Op->getOperand(1), KeepMask);
+}
+
+// Lowers an MSA bit-clear-immediate intrinsic to
+// (and $operand1, splat(~(1 << imm))), with the mask computed at compile
+// time. Operand 2 must be a constant.
+static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT VecTy = Op->getValueType(0);
+
+  // One element wide, with only the selected bit set.
+  APInt SelectedBit =
+      APInt(VecTy.getScalarSizeInBits(), 1)
+      << cast<ConstantSDNode>(Op->getOperand(2))->getAPIntValue();
+
+  SDValue KeepMask = DAG.getConstant(~SelectedBit, DL, VecTy);
+  return DAG.getNode(ISD::AND, DL, VecTy, Op->getOperand(1), KeepMask);
+}
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
+ default:
+ return SDValue();
+ case Intrinsic::mips_shilo:
+ return lowerDSPIntr(Op, DAG, MipsISD::SHILO);
+ case Intrinsic::mips_dpau_h_qbl:
+ return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBL);
+ case Intrinsic::mips_dpau_h_qbr:
+ return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBR);
+ case Intrinsic::mips_dpsu_h_qbl:
+ return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBL);
+ case Intrinsic::mips_dpsu_h_qbr:
+ return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBR);
+ case Intrinsic::mips_dpa_w_ph:
+ return lowerDSPIntr(Op, DAG, MipsISD::DPA_W_PH);
+ case Intrinsic::mips_dps_w_ph:
+ return lowerDSPIntr(Op, DAG, MipsISD::DPS_W_PH);
+ case Intrinsic::mips_dpax_w_ph:
+ return lowerDSPIntr(Op, DAG, MipsISD::DPAX_W_PH);
+ case Intrinsic::mips_dpsx_w_ph:
+ return lowerDSPIntr(Op, DAG, MipsISD::DPSX_W_PH);
+ case Intrinsic::mips_mulsa_w_ph:
+ return lowerDSPIntr(Op, DAG, MipsISD::MULSA_W_PH);
+ case Intrinsic::mips_mult:
+ return lowerDSPIntr(Op, DAG, MipsISD::Mult);
+ case Intrinsic::mips_multu:
+ return lowerDSPIntr(Op, DAG, MipsISD::Multu);
+ case Intrinsic::mips_madd:
+ return lowerDSPIntr(Op, DAG, MipsISD::MAdd);
+ case Intrinsic::mips_maddu:
+ return lowerDSPIntr(Op, DAG, MipsISD::MAddu);
+ case Intrinsic::mips_msub:
+ return lowerDSPIntr(Op, DAG, MipsISD::MSub);
+ case Intrinsic::mips_msubu:
+ return lowerDSPIntr(Op, DAG, MipsISD::MSubu);
+ case Intrinsic::mips_addv_b:
+ case Intrinsic::mips_addv_h:
+ case Intrinsic::mips_addv_w:
+ case Intrinsic::mips_addv_d:
+ return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_addvi_b:
+ case Intrinsic::mips_addvi_h:
+ case Intrinsic::mips_addvi_w:
+ case Intrinsic::mips_addvi_d:
+ return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_and_v:
+ return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_andi_b:
+ return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_bclr_b:
+ case Intrinsic::mips_bclr_h:
+ case Intrinsic::mips_bclr_w:
+ case Intrinsic::mips_bclr_d:
+ return lowerMSABitClear(Op, DAG);
+ case Intrinsic::mips_bclri_b:
+ case Intrinsic::mips_bclri_h:
+ case Intrinsic::mips_bclri_w:
+ case Intrinsic::mips_bclri_d:
+ return lowerMSABitClearImm(Op, DAG);
+ case Intrinsic::mips_binsli_b:
+ case Intrinsic::mips_binsli_h:
+ case Intrinsic::mips_binsli_w:
+ case Intrinsic::mips_binsli_d: {
+ // binsli_x(IfClear, IfSet, nbits) -> (vselect LBitsMask, IfSet, IfClear)
+ EVT VecTy = Op->getValueType(0);
+ EVT EltTy = VecTy.getVectorElementType();
+ APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
+ Op->getConstantOperandVal(3));
+ return DAG.getNode(ISD::VSELECT, DL, VecTy,
+ DAG.getConstant(Mask, DL, VecTy, true),
+ Op->getOperand(2), Op->getOperand(1));
+ }
+ case Intrinsic::mips_binsri_b:
+ case Intrinsic::mips_binsri_h:
+ case Intrinsic::mips_binsri_w:
+ case Intrinsic::mips_binsri_d: {
+ // binsri_x(IfClear, IfSet, nbits) -> (vselect RBitsMask, IfSet, IfClear)
+ EVT VecTy = Op->getValueType(0);
+ EVT EltTy = VecTy.getVectorElementType();
+ APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
+ Op->getConstantOperandVal(3));
+ return DAG.getNode(ISD::VSELECT, DL, VecTy,
+ DAG.getConstant(Mask, DL, VecTy, true),
+ Op->getOperand(2), Op->getOperand(1));
+ }
+ case Intrinsic::mips_bmnz_v:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+ Op->getOperand(2), Op->getOperand(1));
+ case Intrinsic::mips_bmnzi_b:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+ lowerMSASplatImm(Op, 3, DAG), Op->getOperand(2),
+ Op->getOperand(1));
+ case Intrinsic::mips_bmz_v:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_bmzi_b:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+ lowerMSASplatImm(Op, 3, DAG), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_bneg_b:
+ case Intrinsic::mips_bneg_h:
+ case Intrinsic::mips_bneg_w:
+ case Intrinsic::mips_bneg_d: {
+ EVT VecTy = Op->getValueType(0);
+ SDValue One = DAG.getConstant(1, DL, VecTy);
+
+ return DAG.getNode(ISD::XOR, DL, VecTy, Op->getOperand(1),
+ DAG.getNode(ISD::SHL, DL, VecTy, One,
+ Op->getOperand(2)));
+ }
+ case Intrinsic::mips_bnegi_b:
+ case Intrinsic::mips_bnegi_h:
+ case Intrinsic::mips_bnegi_w:
+ case Intrinsic::mips_bnegi_d:
+ return lowerMSABinaryBitImmIntr(Op, DAG, ISD::XOR, Op->getOperand(2),
+ !Subtarget.isLittle());
+ case Intrinsic::mips_bnz_b:
+ case Intrinsic::mips_bnz_h:
+ case Intrinsic::mips_bnz_w:
+ case Intrinsic::mips_bnz_d:
+ return DAG.getNode(MipsISD::VALL_NONZERO, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_bnz_v:
+ return DAG.getNode(MipsISD::VANY_NONZERO, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_bsel_v:
+ // bsel_v(Mask, IfClear, IfSet) -> (vselect Mask, IfSet, IfClear)
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(3),
+ Op->getOperand(2));
+ case Intrinsic::mips_bseli_b:
+ // bseli_v(Mask, IfClear, IfSet) -> (vselect Mask, IfSet, IfClear)
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 3, DAG),
+ Op->getOperand(2));
+ case Intrinsic::mips_bset_b:
+ case Intrinsic::mips_bset_h:
+ case Intrinsic::mips_bset_w:
+ case Intrinsic::mips_bset_d: {
+ EVT VecTy = Op->getValueType(0);
+ SDValue One = DAG.getConstant(1, DL, VecTy);
+
+ return DAG.getNode(ISD::OR, DL, VecTy, Op->getOperand(1),
+ DAG.getNode(ISD::SHL, DL, VecTy, One,
+ Op->getOperand(2)));
+ }
+ case Intrinsic::mips_bseti_b:
+ case Intrinsic::mips_bseti_h:
+ case Intrinsic::mips_bseti_w:
+ case Intrinsic::mips_bseti_d:
+ return lowerMSABinaryBitImmIntr(Op, DAG, ISD::OR, Op->getOperand(2),
+ !Subtarget.isLittle());
+ case Intrinsic::mips_bz_b:
+ case Intrinsic::mips_bz_h:
+ case Intrinsic::mips_bz_w:
+ case Intrinsic::mips_bz_d:
+ return DAG.getNode(MipsISD::VALL_ZERO, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_bz_v:
+ return DAG.getNode(MipsISD::VANY_ZERO, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_ceq_b:
+ case Intrinsic::mips_ceq_h:
+ case Intrinsic::mips_ceq_w:
+ case Intrinsic::mips_ceq_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETEQ);
+ case Intrinsic::mips_ceqi_b:
+ case Intrinsic::mips_ceqi_h:
+ case Intrinsic::mips_ceqi_w:
+ case Intrinsic::mips_ceqi_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ);
+ case Intrinsic::mips_cle_s_b:
+ case Intrinsic::mips_cle_s_h:
+ case Intrinsic::mips_cle_s_w:
+ case Intrinsic::mips_cle_s_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETLE);
+ case Intrinsic::mips_clei_s_b:
+ case Intrinsic::mips_clei_s_h:
+ case Intrinsic::mips_clei_s_w:
+ case Intrinsic::mips_clei_s_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETLE);
+ case Intrinsic::mips_cle_u_b:
+ case Intrinsic::mips_cle_u_h:
+ case Intrinsic::mips_cle_u_w:
+ case Intrinsic::mips_cle_u_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETULE);
+ case Intrinsic::mips_clei_u_b:
+ case Intrinsic::mips_clei_u_h:
+ case Intrinsic::mips_clei_u_w:
+ case Intrinsic::mips_clei_u_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETULE);
+ case Intrinsic::mips_clt_s_b:
+ case Intrinsic::mips_clt_s_h:
+ case Intrinsic::mips_clt_s_w:
+ case Intrinsic::mips_clt_s_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETLT);
+ case Intrinsic::mips_clti_s_b:
+ case Intrinsic::mips_clti_s_h:
+ case Intrinsic::mips_clti_s_w:
+ case Intrinsic::mips_clti_s_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETLT);
+ case Intrinsic::mips_clt_u_b:
+ case Intrinsic::mips_clt_u_h:
+ case Intrinsic::mips_clt_u_w:
+ case Intrinsic::mips_clt_u_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETULT);
+ case Intrinsic::mips_clti_u_b:
+ case Intrinsic::mips_clti_u_h:
+ case Intrinsic::mips_clti_u_w:
+ case Intrinsic::mips_clti_u_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETULT);
+ case Intrinsic::mips_copy_s_b:
+ case Intrinsic::mips_copy_s_h:
+ case Intrinsic::mips_copy_s_w:
+ return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+ case Intrinsic::mips_copy_s_d:
+ if (Subtarget.hasMips64())
+ // Lower directly into VEXTRACT_SEXT_ELT since i64 is legal on Mips64.
+ return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+ else {
+ // Lower into the generic EXTRACT_VECTOR_ELT node and let the type
+ // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+ Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ }
+ case Intrinsic::mips_copy_u_b:
+ case Intrinsic::mips_copy_u_h:
+ case Intrinsic::mips_copy_u_w:
+ return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+ case Intrinsic::mips_copy_u_d:
+ if (Subtarget.hasMips64())
+ // Lower directly into VEXTRACT_ZEXT_ELT since i64 is legal on Mips64.
+ return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+ else {
+ // Lower into the generic EXTRACT_VECTOR_ELT node and let the type
+ // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+ // Note: When i64 is illegal, this results in copy_s.w instructions
+ // instead of copy_u.w instructions. This makes no difference to the
+ // behaviour since i64 is only illegal when the register file is 32-bit.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+ Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ }
+ case Intrinsic::mips_div_s_b:
+ case Intrinsic::mips_div_s_h:
+ case Intrinsic::mips_div_s_w:
+ case Intrinsic::mips_div_s_d:
+ return DAG.getNode(ISD::SDIV, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_div_u_b:
+ case Intrinsic::mips_div_u_h:
+ case Intrinsic::mips_div_u_w:
+ case Intrinsic::mips_div_u_d:
+ return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_fadd_w:
+ case Intrinsic::mips_fadd_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
+ return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ }
+ // Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away
+ case Intrinsic::mips_fceq_w:
+ case Intrinsic::mips_fceq_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETOEQ);
+ case Intrinsic::mips_fcle_w:
+ case Intrinsic::mips_fcle_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETOLE);
+ case Intrinsic::mips_fclt_w:
+ case Intrinsic::mips_fclt_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETOLT);
+ case Intrinsic::mips_fcne_w:
+ case Intrinsic::mips_fcne_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETONE);
+ case Intrinsic::mips_fcor_w:
+ case Intrinsic::mips_fcor_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETO);
+ case Intrinsic::mips_fcueq_w:
+ case Intrinsic::mips_fcueq_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETUEQ);
+ case Intrinsic::mips_fcule_w:
+ case Intrinsic::mips_fcule_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETULE);
+ case Intrinsic::mips_fcult_w:
+ case Intrinsic::mips_fcult_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETULT);
+ case Intrinsic::mips_fcun_w:
+ case Intrinsic::mips_fcun_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETUO);
+ case Intrinsic::mips_fcune_w:
+ case Intrinsic::mips_fcune_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETUNE);
+ case Intrinsic::mips_fdiv_w:
+ case Intrinsic::mips_fdiv_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
+ return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ }
+ case Intrinsic::mips_ffint_u_w:
+ case Intrinsic::mips_ffint_u_d:
+ return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_ffint_s_w:
+ case Intrinsic::mips_ffint_s_d:
+ return DAG.getNode(ISD::SINT_TO_FP, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_fill_b:
+ case Intrinsic::mips_fill_h:
+ case Intrinsic::mips_fill_w:
+ case Intrinsic::mips_fill_d: {
+ EVT ResTy = Op->getValueType(0);
+ SmallVector<SDValue, 16> Ops(ResTy.getVectorNumElements(),
+ Op->getOperand(1));
+
+ // If ResTy is v2i64 then the type legalizer will break this node down into
+ // an equivalent v4i32.
+ return DAG.getBuildVector(ResTy, DL, Ops);
+ }
+ case Intrinsic::mips_fexp2_w:
+ case Intrinsic::mips_fexp2_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(
+ ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::FEXP2, SDLoc(Op), ResTy, Op->getOperand(2)));
+ }
+ case Intrinsic::mips_flog2_w:
+ case Intrinsic::mips_flog2_d:
+ return DAG.getNode(ISD::FLOG2, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_fmadd_w:
+ case Intrinsic::mips_fmadd_d:
+ return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+ case Intrinsic::mips_fmul_w:
+ case Intrinsic::mips_fmul_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
+ return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ }
+ case Intrinsic::mips_fmsub_w:
+ case Intrinsic::mips_fmsub_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy,
+ Op->getOperand(2), Op->getOperand(3)));
+ }
+ case Intrinsic::mips_frint_w:
+ case Intrinsic::mips_frint_d:
+ return DAG.getNode(ISD::FRINT, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_fsqrt_w:
+ case Intrinsic::mips_fsqrt_d:
+ return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_fsub_w:
+ case Intrinsic::mips_fsub_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
+ return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ }
+ case Intrinsic::mips_ftrunc_u_w:
+ case Intrinsic::mips_ftrunc_u_d:
+ return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_ftrunc_s_w:
+ case Intrinsic::mips_ftrunc_s_d:
+ return DAG.getNode(ISD::FP_TO_SINT, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_ilvev_b:
+ case Intrinsic::mips_ilvev_h:
+ case Intrinsic::mips_ilvev_w:
+ case Intrinsic::mips_ilvev_d:
+ return DAG.getNode(MipsISD::ILVEV, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_ilvl_b:
+ case Intrinsic::mips_ilvl_h:
+ case Intrinsic::mips_ilvl_w:
+ case Intrinsic::mips_ilvl_d:
+ return DAG.getNode(MipsISD::ILVL, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_ilvod_b:
+ case Intrinsic::mips_ilvod_h:
+ case Intrinsic::mips_ilvod_w:
+ case Intrinsic::mips_ilvod_d:
+ return DAG.getNode(MipsISD::ILVOD, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_ilvr_b:
+ case Intrinsic::mips_ilvr_h:
+ case Intrinsic::mips_ilvr_w:
+ case Intrinsic::mips_ilvr_d:
+ return DAG.getNode(MipsISD::ILVR, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_insert_b:
+ case Intrinsic::mips_insert_h:
+ case Intrinsic::mips_insert_w:
+ case Intrinsic::mips_insert_d:
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(3), Op->getOperand(2));
+ case Intrinsic::mips_insve_b:
+ case Intrinsic::mips_insve_h:
+ case Intrinsic::mips_insve_w:
+ case Intrinsic::mips_insve_d:
+ return DAG.getNode(MipsISD::INSVE, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2), Op->getOperand(3),
+ DAG.getConstant(0, DL, MVT::i32));
+ case Intrinsic::mips_ldi_b:
+ case Intrinsic::mips_ldi_h:
+ case Intrinsic::mips_ldi_w:
+ case Intrinsic::mips_ldi_d:
+ return lowerMSASplatImm(Op, 1, DAG);
+ case Intrinsic::mips_lsa:
+ case Intrinsic::mips_dlsa: {
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::SHL, SDLoc(Op), ResTy,
+ Op->getOperand(2), Op->getOperand(3)));
+ }
+ case Intrinsic::mips_maddv_b:
+ case Intrinsic::mips_maddv_h:
+ case Intrinsic::mips_maddv_w:
+ case Intrinsic::mips_maddv_d: {
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+ Op->getOperand(2), Op->getOperand(3)));
+ }
+ case Intrinsic::mips_max_s_b:
+ case Intrinsic::mips_max_s_h:
+ case Intrinsic::mips_max_s_w:
+ case Intrinsic::mips_max_s_d:
+ return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_max_u_b:
+ case Intrinsic::mips_max_u_h:
+ case Intrinsic::mips_max_u_w:
+ case Intrinsic::mips_max_u_d:
+ return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_maxi_s_b:
+ case Intrinsic::mips_maxi_s_h:
+ case Intrinsic::mips_maxi_s_w:
+ case Intrinsic::mips_maxi_s_d:
+ return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_maxi_u_b:
+ case Intrinsic::mips_maxi_u_h:
+ case Intrinsic::mips_maxi_u_w:
+ case Intrinsic::mips_maxi_u_d:
+ return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_min_s_b:
+ case Intrinsic::mips_min_s_h:
+ case Intrinsic::mips_min_s_w:
+ case Intrinsic::mips_min_s_d:
+ return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_min_u_b:
+ case Intrinsic::mips_min_u_h:
+ case Intrinsic::mips_min_u_w:
+ case Intrinsic::mips_min_u_d:
+ return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_mini_s_b:
+ case Intrinsic::mips_mini_s_h:
+ case Intrinsic::mips_mini_s_w:
+ case Intrinsic::mips_mini_s_d:
+ return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_mini_u_b:
+ case Intrinsic::mips_mini_u_h:
+ case Intrinsic::mips_mini_u_w:
+ case Intrinsic::mips_mini_u_d:
+ return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_mod_s_b:
+ case Intrinsic::mips_mod_s_h:
+ case Intrinsic::mips_mod_s_w:
+ case Intrinsic::mips_mod_s_d:
+ return DAG.getNode(ISD::SREM, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_mod_u_b:
+ case Intrinsic::mips_mod_u_h:
+ case Intrinsic::mips_mod_u_w:
+ case Intrinsic::mips_mod_u_d:
+ return DAG.getNode(ISD::UREM, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_mulv_b:
+ case Intrinsic::mips_mulv_h:
+ case Intrinsic::mips_mulv_w:
+ case Intrinsic::mips_mulv_d:
+ return DAG.getNode(ISD::MUL, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_msubv_b:
+ case Intrinsic::mips_msubv_h:
+ case Intrinsic::mips_msubv_w:
+ case Intrinsic::mips_msubv_d: {
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(ISD::SUB, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+ Op->getOperand(2), Op->getOperand(3)));
+ }
+ case Intrinsic::mips_nlzc_b:
+ case Intrinsic::mips_nlzc_h:
+ case Intrinsic::mips_nlzc_w:
+ case Intrinsic::mips_nlzc_d:
+ return DAG.getNode(ISD::CTLZ, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_nor_v: {
+ SDValue Res = DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ return DAG.getNOT(DL, Res, Res->getValueType(0));
+ }
+ case Intrinsic::mips_nori_b: {
+ SDValue Res = DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+ Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG));
+ return DAG.getNOT(DL, Res, Res->getValueType(0));
+ }
+ case Intrinsic::mips_or_v:
+ return DAG.getNode(ISD::OR, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_ori_b:
+ return DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_pckev_b:
+ case Intrinsic::mips_pckev_h:
+ case Intrinsic::mips_pckev_w:
+ case Intrinsic::mips_pckev_d:
+ return DAG.getNode(MipsISD::PCKEV, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_pckod_b:
+ case Intrinsic::mips_pckod_h:
+ case Intrinsic::mips_pckod_w:
+ case Intrinsic::mips_pckod_d:
+ return DAG.getNode(MipsISD::PCKOD, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_pcnt_b:
+ case Intrinsic::mips_pcnt_h:
+ case Intrinsic::mips_pcnt_w:
+ case Intrinsic::mips_pcnt_d:
+ return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_shf_b:
+ case Intrinsic::mips_shf_h:
+ case Intrinsic::mips_shf_w:
+ return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0),
+ Op->getOperand(2), Op->getOperand(1));
+ case Intrinsic::mips_sll_b:
+ case Intrinsic::mips_sll_h:
+ case Intrinsic::mips_sll_w:
+ case Intrinsic::mips_sll_d:
+ return DAG.getNode(ISD::SHL, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_slli_b:
+ case Intrinsic::mips_slli_h:
+ case Intrinsic::mips_slli_w:
+ case Intrinsic::mips_slli_d:
+ return DAG.getNode(ISD::SHL, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_splat_b:
+ case Intrinsic::mips_splat_h:
+ case Intrinsic::mips_splat_w:
+ case Intrinsic::mips_splat_d:
+ // We can't lower via VECTOR_SHUFFLE because it requires constant shuffle
+ // masks, nor can we lower via BUILD_VECTOR & EXTRACT_VECTOR_ELT because
+ // EXTRACT_VECTOR_ELT can't extract i64's on MIPS32.
+ // Instead we lower to MipsISD::VSHF and match from there.
+ return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+ lowerMSASplatZExt(Op, 2, DAG), Op->getOperand(1),
+ Op->getOperand(1));
+ case Intrinsic::mips_splati_b:
+ case Intrinsic::mips_splati_h:
+ case Intrinsic::mips_splati_w:
+ case Intrinsic::mips_splati_d:
+ return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+ lowerMSASplatImm(Op, 2, DAG), Op->getOperand(1),
+ Op->getOperand(1));
+ case Intrinsic::mips_sra_b:
+ case Intrinsic::mips_sra_h:
+ case Intrinsic::mips_sra_w:
+ case Intrinsic::mips_sra_d:
+ return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_srai_b:
+ case Intrinsic::mips_srai_h:
+ case Intrinsic::mips_srai_w:
+ case Intrinsic::mips_srai_d:
+ return DAG.getNode(ISD::SRA, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_srl_b:
+ case Intrinsic::mips_srl_h:
+ case Intrinsic::mips_srl_w:
+ case Intrinsic::mips_srl_d:
+ return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_srli_b:
+ case Intrinsic::mips_srli_h:
+ case Intrinsic::mips_srli_w:
+ case Intrinsic::mips_srli_d:
+ return DAG.getNode(ISD::SRL, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_subv_b:
+ case Intrinsic::mips_subv_h:
+ case Intrinsic::mips_subv_w:
+ case Intrinsic::mips_subv_d:
+ return DAG.getNode(ISD::SUB, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_subvi_b:
+ case Intrinsic::mips_subvi_h:
+ case Intrinsic::mips_subvi_w:
+ case Intrinsic::mips_subvi_d:
+ return DAG.getNode(ISD::SUB, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_vshf_b:
+ case Intrinsic::mips_vshf_h:
+ case Intrinsic::mips_vshf_w:
+ case Intrinsic::mips_vshf_d:
+ return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+ case Intrinsic::mips_xor_v:
+ return DAG.getNode(ISD::XOR, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_xori_b:
+ return DAG.getNode(ISD::XOR, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::thread_pointer: {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ return DAG.getNode(MipsISD::ThreadPointer, DL, PtrVT);
+ }
+ }
+}
+
+ // Lower an MSA vector-load intrinsic into a generic load node.
+ //
+ // Operands of Op as read here: 0 = incoming chain, 2 = base address,
+ // 3 = offset added to the base address. The loaded value has the result type
+ // of Op and the load is created with an alignment of 16.
+ //
+ // Intr is accepted for signature compatibility with the caller but is not
+ // needed to build the load.
+ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+   SDLoc DL(Op);
+   SDValue ChainIn = Op->getOperand(0);
+   SDValue Address = Op->getOperand(2);
+   SDValue Offset = Op->getOperand(3);
+   EVT ResTy = Op->getValueType(0);
+   EVT PtrTy = Address->getValueType(0);
+
+   // ISD::ADD requires both operands to have the result type. The offset
+   // operand is not guaranteed to be as wide as the pointer (e.g. an i32
+   // offset combined with a 64-bit address), so convert it to the pointer
+   // type, preserving its sign, before forming the effective address.
+   if (Offset.getValueType() != PtrTy)
+     Offset = DAG.getSExtOrTrunc(Offset, DL, PtrTy);
+
+   Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+   return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(),
+                      /* Alignment = */ 16);
+ }
+
+ // Custom lowering for intrinsic nodes that carry a chain.
+ //
+ // The intrinsic id is read from operand 1. The DSP intrinsics listed below are
+ // forwarded to lowerDSPIntr with their matching MipsISD opcode, the MSA
+ // ld.[bhwd] intrinsics are forwarded to lowerMSALoadIntr, and every other
+ // intrinsic yields an empty SDValue.
+ SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+   const unsigned IntrID =
+       cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+
+   // Opcode handed to lowerDSPIntr; assigned by every case that reaches the
+   // call at the bottom of the function.
+   unsigned DSPOpc;
+
+   switch (IntrID) {
+   case Intrinsic::mips_ld_b:
+   case Intrinsic::mips_ld_h:
+   case Intrinsic::mips_ld_w:
+   case Intrinsic::mips_ld_d:
+     return lowerMSALoadIntr(Op, DAG, IntrID);
+
+   case Intrinsic::mips_extp:
+     DSPOpc = MipsISD::EXTP;
+     break;
+   case Intrinsic::mips_extpdp:
+     DSPOpc = MipsISD::EXTPDP;
+     break;
+   case Intrinsic::mips_extr_w:
+     DSPOpc = MipsISD::EXTR_W;
+     break;
+   case Intrinsic::mips_extr_r_w:
+     DSPOpc = MipsISD::EXTR_R_W;
+     break;
+   case Intrinsic::mips_extr_rs_w:
+     DSPOpc = MipsISD::EXTR_RS_W;
+     break;
+   case Intrinsic::mips_extr_s_h:
+     DSPOpc = MipsISD::EXTR_S_H;
+     break;
+   case Intrinsic::mips_mthlip:
+     DSPOpc = MipsISD::MTHLIP;
+     break;
+   case Intrinsic::mips_mulsaq_s_w_ph:
+     DSPOpc = MipsISD::MULSAQ_S_W_PH;
+     break;
+   case Intrinsic::mips_maq_s_w_phl:
+     DSPOpc = MipsISD::MAQ_S_W_PHL;
+     break;
+   case Intrinsic::mips_maq_s_w_phr:
+     DSPOpc = MipsISD::MAQ_S_W_PHR;
+     break;
+   case Intrinsic::mips_maq_sa_w_phl:
+     DSPOpc = MipsISD::MAQ_SA_W_PHL;
+     break;
+   case Intrinsic::mips_maq_sa_w_phr:
+     DSPOpc = MipsISD::MAQ_SA_W_PHR;
+     break;
+   case Intrinsic::mips_dpaq_s_w_ph:
+     DSPOpc = MipsISD::DPAQ_S_W_PH;
+     break;
+   case Intrinsic::mips_dpsq_s_w_ph:
+     DSPOpc = MipsISD::DPSQ_S_W_PH;
+     break;
+   case Intrinsic::mips_dpaq_sa_l_w:
+     DSPOpc = MipsISD::DPAQ_SA_L_W;
+     break;
+   case Intrinsic::mips_dpsq_sa_l_w:
+     DSPOpc = MipsISD::DPSQ_SA_L_W;
+     break;
+   case Intrinsic::mips_dpaqx_s_w_ph:
+     DSPOpc = MipsISD::DPAQX_S_W_PH;
+     break;
+   case Intrinsic::mips_dpaqx_sa_w_ph:
+     DSPOpc = MipsISD::DPAQX_SA_W_PH;
+     break;
+   case Intrinsic::mips_dpsqx_s_w_ph:
+     DSPOpc = MipsISD::DPSQX_S_W_PH;
+     break;
+   case Intrinsic::mips_dpsqx_sa_w_ph:
+     DSPOpc = MipsISD::DPSQX_SA_W_PH;
+     break;
+
+   default:
+     // Not an intrinsic this hook handles.
+     return SDValue();
+   }
+
+   return lowerDSPIntr(Op, DAG, DSPOpc);
+ }
+
+ // Lower an MSA vector-store intrinsic into a generic store node.
+ //
+ // Operands of Op as read here: 0 = incoming chain, 2 = value to store,
+ // 3 = base address, 4 = offset added to the base address. The store is
+ // created with an alignment of 16.
+ //
+ // Intr is accepted for signature compatibility with the caller but is not
+ // needed to build the store.
+ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+   SDLoc DL(Op);
+   SDValue ChainIn = Op->getOperand(0);
+   SDValue Value = Op->getOperand(2);
+   SDValue Address = Op->getOperand(3);
+   SDValue Offset = Op->getOperand(4);
+   EVT PtrTy = Address->getValueType(0);
+
+   // ISD::ADD requires both operands to have the result type. The offset
+   // operand is not guaranteed to be as wide as the pointer (e.g. an i32
+   // offset combined with a 64-bit address), so convert it to the pointer
+   // type, preserving its sign, before forming the effective address.
+   if (Offset.getValueType() != PtrTy)
+     Offset = DAG.getSExtOrTrunc(Offset, DL, PtrTy);
+
+   Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+   return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(),
+                       /* Alignment = */ 16);
+ }
+
+ // Custom lowering for intrinsic nodes that produce no value.
+ //
+ // The intrinsic id is read from operand 1. Only the MSA st.[bhwd] intrinsics
+ // are handled (via lowerMSAStoreIntr); anything else yields an empty SDValue.
+ SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+   const unsigned IntrID =
+       cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+
+   const bool IsMSAStore =
+       IntrID == Intrinsic::mips_st_b || IntrID == Intrinsic::mips_st_h ||
+       IntrID == Intrinsic::mips_st_w || IntrID == Intrinsic::mips_st_d;
+
+   if (!IsMSAStore)
+     return SDValue();
+
+   return lowerMSAStoreIntr(Op, DAG, IntrID);
+ }
+
+ /// \brief Return true when every operand of the build vector is the same
+ /// SDValue as operand 0.
+ ///
+ /// Operands are compared as SDValues, so this depends on equivalent DAG nodes
+ /// being shared. It can therefore report false for a vector that
+ /// isConstantSplat would accept.
+ static bool isSplatVector(const BuildVectorSDNode *N) {
+   const unsigned NumOps = N->getNumOperands();
+   assert(NumOps > 1 && "isSplatVector has 0 or 1 sized build vector");
+
+   const SDValue First = N->getOperand(0);
+
+   // Walk forward while the operands keep matching the first one.
+   unsigned Idx = 1;
+   while (Idx < NumOps && N->getOperand(Idx) == First)
+     ++Idx;
+
+   // Reaching the end means no mismatch was found.
+   return Idx >= NumOps;
+ }
+
+ // Lower ISD::EXTRACT_VECTOR_ELT into MipsISD::VEXTRACT_SEXT_ELT.
+ //
+ // ISD::EXTRACT_VECTOR_ELT leaves the bits above the element undefined, so
+ // either extension is acceptable; sign-extension is used here. The
+ // DAGCombiner will later fold any explicit sign/zero extension of the result
+ // into this node (possibly turning it into a zero-extend).
+ //
+ // Returns an empty SDValue when the source is not a 128-bit vector, and Op
+ // unchanged when the result type is not an integer.
+ SDValue MipsSETargetLowering::
+ lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+   SDValue Vec = Op->getOperand(0);
+   const EVT VecTy = Vec->getValueType(0);
+
+   if (!VecTy.is128BitVector())
+     return SDValue();
+
+   const EVT ResTy = Op->getValueType(0);
+   if (!ResTy.isInteger())
+     return Op;
+
+   SDLoc DL(Op);
+   return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Vec,
+                      Op->getOperand(1),
+                      DAG.getValueType(VecTy.getVectorElementType()));
+ }
+
+ // Return true if Op is undef, an integer constant node, or a floating-point
+ // constant node.
+ static bool isConstantOrUndef(const SDValue Op) {
+   return Op->isUndef() || isa<ConstantSDNode>(Op) ||
+          isa<ConstantFPSDNode>(Op);
+ }
+
+ // Return true if at least one operand of the build vector is a constant or
+ // undef (see isConstantOrUndef); false if none is.
+ //
+ // NOTE(review): the name reads as "all operands are constant or undef", but
+ // the check succeeds on the first matching operand. Confirm which is intended
+ // before relying on either reading.
+ static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) {
+   bool FoundConstantOrUndef = false;
+   for (unsigned Idx = 0;
+        !FoundConstantOrUndef && Idx < Op->getNumOperands(); ++Idx)
+     FoundConstantOrUndef = isConstantOrUndef(Op->getOperand(Idx));
+   return FoundConstantOrUndef;
+ }
+
+ // Lowers ISD::BUILD_VECTOR into appropriate SelectionDAG nodes for the
+ // backend.
+ //
+ // Nothing is done (an empty SDValue is returned) unless the subtarget has MSA
+ // and the result is a 128-bit vector. Otherwise:
+ // - A constant splat of 8, 16, 32 or 64 bits is returned as-is when the
+ //   result type is an integer vector, the splat has no undefs and the value
+ //   fits into a signed 10-bit immediate.
+ // - Any other 8, 16 or 32-bit constant splat is rebuilt as a constant of the
+ //   matching integer vector type and bitcast to the result type if needed.
+ // - A 64-bit constant splat that is not returned as-is is not handled.
+ // - A vector whose operands are all the same SDValue is returned as-is.
+ // - A vector for which isConstantOrUndefBUILD_VECTOR is false is built up
+ //   with a sequence of INSERT_VECTOR_ELT nodes.
+ // - Everything else yields an empty SDValue.
+ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+   BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
+   EVT ResTy = Op->getValueType(0);
+   SDLoc DL(Op);
+
+   if (!Subtarget.hasMSA() || !ResTy.is128BitVector())
+     return SDValue();
+
+   APInt SplatValue, SplatUndef;
+   unsigned SplatBitSize;
+   bool HasAnyUndefs;
+
+   const bool IsSmallConstantSplat =
+       Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             8, !Subtarget.isLittle()) &&
+       SplatBitSize <= 64;
+
+   if (IsSmallConstantSplat) {
+     // Pick the integer vector type used to materialise the splat. Only 8, 16,
+     // 32 and 64-bit elements can be dealt with at all.
+     EVT ViaVecTy;
+     bool CanRematerialise = true;
+
+     switch (SplatBitSize) {
+     case 8:
+       ViaVecTy = MVT::v16i8;
+       break;
+     case 16:
+       ViaVecTy = MVT::v8i16;
+       break;
+     case 32:
+       ViaVecTy = MVT::v4i32;
+       break;
+     case 64:
+       // There's no fill.d to fall back on for 64-bit values
+       CanRematerialise = false;
+       break;
+     default:
+       return SDValue();
+     }
+
+     // A value that fits into a simm10 can use ldi.[bhwd] directly. That only
+     // works for integer result types (others need a bitcast from an integer
+     // type) and for splats without undefs (those must first be lowered to
+     // defined values).
+     if (ResTy.isInteger() && !HasAnyUndefs && SplatValue.isSignedIntN(10))
+       return Op;
+
+     if (!CanRematerialise)
+       return SDValue();
+
+     // SelectionDAG::getConstant will promote SplatValue appropriately.
+     SDValue Splat = DAG.getConstant(SplatValue, DL, ViaVecTy);
+
+     // Bitcast to the type we originally wanted
+     if (ViaVecTy != ResTy)
+       Splat = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Splat);
+
+     return Splat;
+   }
+
+   if (isSplatVector(Node))
+     return Op;
+
+   if (isConstantOrUndefBUILD_VECTOR(Node))
+     return SDValue();
+
+   // Use INSERT_VECTOR_ELT operations rather than expand to stores.
+   // The resulting code is the same length as the expansion, but it doesn't
+   // use memory operations
+   EVT VecTy = Node->getValueType(0);
+
+   assert(VecTy.isVector());
+
+   SDValue Accum = DAG.getUNDEF(VecTy);
+   for (unsigned Elt = 0, NumElts = VecTy.getVectorNumElements(); Elt < NumElts;
+        ++Elt)
+     Accum = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecTy, Accum,
+                         Node->getOperand(Elt),
+                         DAG.getConstant(Elt, DL, MVT::i32));
+
+   return Accum;
+ }
+
+ // Lower VECTOR_SHUFFLE into SHF (if possible).
+ //
+ // SHF treats the vector as consecutive blocks of four elements and applies the
+ // same 4-way permutation, given as a <4 x i2> constant packed into an integer
+ // immediate, to each block.
+ //
+ // A mask can therefore be lowered into SHF when it has the form:
+ //   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+ // Undef mask entries are treated as whatever value makes the mask fit.
+ //
+ // For example:
+ //   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+ //                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+ //                                 i32 7, i32 6, i32 5, i32 4>
+ // is lowered to:
+ //   (SHF_H $w0, $w1, 27)
+ // where the 27 comes from:
+ //   3 + (2 << 2) + (1 << 4) + (0 << 6)
+ //
+ // Returns an empty SDValue when the mask has fewer than four entries or does
+ // not fit the pattern.
+ static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+   if (Indices.size() < 4)
+     return SDValue();
+
+   // Per-lane selector shared by all blocks; -1 means "not yet constrained".
+   int LaneSel[4] = { -1, -1, -1, -1 };
+
+   for (unsigned Lane = 0; Lane < 4; ++Lane) {
+     for (unsigned Pos = Lane; Pos < Indices.size(); Pos += 4) {
+       int Rel = Indices[Pos];
+
+       // Rebase a defined index onto its own 4-element block. An index that
+       // points outside that block cannot be expressed by SHF.
+       if (Rel != -1) {
+         Rel -= 4 * (Pos / 4);
+         if (Rel < 0 || Rel >= 4)
+           return SDValue();
+       }
+
+       // The first entry seen for a lane fixes its selector. It may still be
+       // undef if that entry is undef too.
+       if (LaneSel[Lane] == -1)
+         LaneSel[Lane] = Rel;
+
+       // Every later defined entry must agree with the selector.
+       if (Rel != -1 && Rel != LaneSel[Lane])
+         return SDValue();
+     }
+   }
+
+   // Pack the selectors into the immediate, lane 0 in the lowest two bits.
+   // Lanes that stayed undef select element zero.
+   APInt Imm(32, 0);
+   for (int Lane = 3; Lane >= 0; --Lane) {
+     const int Sel = (LaneSel[Lane] == -1) ? 0 : LaneSel[Lane];
+
+     Imm <<= 2;
+     Imm |= Sel & 0x3;
+   }
+
+   SDLoc DL(Op);
+   return DAG.getNode(MipsISD::SHF, DL, ResTy,
+                      DAG.getConstant(Imm, DL, MVT::i32), Op->getOperand(0));
+ }
+
+ /// Determine whether a range fits a regular pattern of values.
+ ///
+ /// Starting at Begin and visiting every CheckStride-th element up to End, each
+ /// visited element must be either -1 or the currently expected value. The
+ /// expected value starts at ExpectedIndex and grows by ExpectedIndexStride
+ /// after each visited element.
+ ///
+ /// The iterator is never advanced beyond End, even when the remaining
+ /// distance is shorter than CheckStride.
+ template <typename ValType>
+ static bool
+ fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
+                    unsigned CheckStride,
+                    typename SmallVectorImpl<ValType>::const_iterator End,
+                    ValType ExpectedIndex, unsigned ExpectedIndexStride) {
+   for (auto Cursor = Begin; Cursor != End;) {
+     const ValType Current = *Cursor;
+     if (Current != -1 && Current != ExpectedIndex)
+       return false;
+     ExpectedIndex += ExpectedIndexStride;
+
+     // Incrementing past End is undefined behaviour, so step one element at a
+     // time and stop as soon as End is reached.
+     unsigned StepsLeft = CheckStride;
+     while (StepsLeft != 0 && Cursor != End) {
+       ++Cursor;
+       --StepsLeft;
+     }
+   }
+   return true;
+ }
+
+ // Determine whether VECTOR_SHUFFLE is a SPLATI.
+ //
+ // It is a SPLATI when the mask is:
+ //   <x, x, x, ...>
+ // where x is any valid index.
+ //
+ // Undef mask entries are treated as whatever value makes the mask fit the
+ // form above.
+ static bool isVECTOR_SHUFFLE_SPLATI(SDValue Op, EVT ResTy,
+                                     SmallVector<int, 16> Indices,
+                                     SelectionDAG &DAG) {
+   assert((Indices.size() % 2) == 0);
+
+   // The first defined mask entry is the candidate splat index; it stays -1
+   // when the whole mask is undef.
+   int SplatIndex = -1;
+   for (unsigned Pos = 0, NumIdx = Indices.size();
+        Pos != NumIdx && SplatIndex == -1; ++Pos)
+     SplatIndex = Indices[Pos];
+
+   // Every entry must equal the candidate (stride 1, expected value constant).
+   return fitsRegularPattern<int>(Indices.begin(), 1, Indices.end(), SplatIndex,
+                                  0);
+ }
+
+ // Lower VECTOR_SHUFFLE into ILVEV (if possible).
+ //
+ // ILVEV interleaves the even elements from each vector.
+ //
+ // The mask can be lowered into ILVEV when it consists of two of the following
+ // sequences interleaved:
+ //   <0, 2, 4, ...>
+ //   <n, n+2, n+4, ...>
+ // where n is the number of elements in the vector.
+ // For example:
+ //   <0, 0, 2, 2, 4, 4, ...>
+ //   <0, n, 2, n+2, 4, n+4, ...>
+ //
+ // Undef mask entries are treated as whatever value makes the mask fit the
+ // forms above. Returns an empty SDValue when the mask does not fit.
+ static SDValue lowerVECTOR_SHUFFLE_ILVEV(SDValue Op, EVT ResTy,
+                                          SmallVector<int, 16> Indices,
+                                          SelectionDAG &DAG) {
+   assert((Indices.size() % 2) == 0);
+
+   typedef SmallVectorImpl<int>::const_iterator MaskIter;
+   const MaskIter MaskBegin = Indices.begin();
+   const MaskIter MaskEnd = Indices.end();
+   const int NumElts = Indices.size();
+
+   // Starting at First and looking at every second mask entry, work out which
+   // shuffle operand supplies its even elements: 0 for the first operand, 1
+   // for the second, -1 when neither sequence matches. The first operand is
+   // preferred when both match.
+   auto pickEvenSource = [&](MaskIter First) -> int {
+     if (fitsRegularPattern<int>(First, 2, MaskEnd, 0, 2))
+       return 0;
+     if (fitsRegularPattern<int>(First, 2, MaskEnd, NumElts, 2))
+       return 1;
+     return -1;
+   };
+
+   // Even result positions determine Wt.
+   const int WtSource = pickEvenSource(MaskBegin);
+   if (WtSource < 0)
+     return SDValue();
+
+   // Odd result positions determine Ws.
+   const int WsSource = pickEvenSource(MaskBegin + 1);
+   if (WsSource < 0)
+     return SDValue();
+
+   return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy,
+                      Op->getOperand(WsSource), Op->getOperand(WtSource));
+ }
+
+// Try to lower a VECTOR_SHUFFLE to ILVOD.
+//
+// ILVOD interleaves the odd elements of each vector. A mask qualifies when
+// its even positions and its odd positions each follow one of the sequences
+//   <1, 3, 5, ...>
+//   <n+1, n+3, n+5, ...>
+// where n is the number of elements in the vector, for instance
+//   <1, 1, 3, 3, 5, 5, ...>
+//   <1, n+1, 3, n+3, 5, n+5, ...>
+//
+// Undef mask entries are treated as whatever value is needed to fit. An empty
+// SDValue is returned when the mask does not fit.
+static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  const int NumElts = Indices.size();
+  const int *MaskEnd = Indices.end();
+
+  // Pick the shuffle operand whose odd elements feed every second mask entry
+  // from Start onwards; yields an empty SDValue if neither operand does.
+  auto PickSource = [&](const int *Start) -> SDValue {
+    if (fitsRegularPattern<int>(Start, 2, MaskEnd, 1, 2))
+      return Op->getOperand(0);
+    if (fitsRegularPattern<int>(Start, 2, MaskEnd, NumElts + 1, 2))
+      return Op->getOperand(1);
+    return SDValue();
+  };
+
+  // Even mask positions.
+  SDValue Wt = PickSource(Indices.begin());
+  if (!Wt)
+    return SDValue();
+
+  // Odd mask positions.
+  SDValue Ws = PickSource(Indices.begin() + 1);
+  if (!Ws)
+    return SDValue();
+
+  // Note the operand order here is (Wt, Ws).
+  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Wt, Ws);
+}
+
+// Try to lower a VECTOR_SHUFFLE to ILVR.
+//
+// ILVR interleaves consecutive elements from the right (lowest-indexed) half
+// of each vector. A mask qualifies when its even positions and its odd
+// positions each follow one of the sequences
+//   <0, 1, 2, ...>
+//   <n, n+1, n+2, ...>
+// where n is the number of elements in the vector, for instance
+//   <0, 0, 1, 1, 2, 2, ...>
+//   <0, n, 1, n+1, 2, n+2, ...>
+//
+// Undef mask entries are treated as whatever value is needed to fit. An empty
+// SDValue is returned when the mask does not fit.
+static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  const int NumElts = Indices.size();
+  const int *MaskEnd = Indices.end();
+
+  // Pick the shuffle operand whose lowest-indexed elements feed every second
+  // mask entry from Start onwards; yields an empty SDValue if neither does.
+  auto PickSource = [&](const int *Start) -> SDValue {
+    if (fitsRegularPattern<int>(Start, 2, MaskEnd, 0, 1))
+      return Op->getOperand(0);
+    if (fitsRegularPattern<int>(Start, 2, MaskEnd, NumElts, 1))
+      return Op->getOperand(1);
+    return SDValue();
+  };
+
+  // Even mask positions.
+  SDValue Wt = PickSource(Indices.begin());
+  if (!Wt)
+    return SDValue();
+
+  // Odd mask positions.
+  SDValue Ws = PickSource(Indices.begin() + 1);
+  if (!Ws)
+    return SDValue();
+
+  return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Ws, Wt);
+}
+
+// Try to lower a VECTOR_SHUFFLE to ILVL.
+//
+// ILVL interleaves consecutive elements from the left (highest-indexed) half
+// of each vector. A mask qualifies when its even positions and its odd
+// positions each follow one of the sequences
+//   <x, x+1, x+2, ...>
+//   <n+x, n+x+1, n+x+2, ...>
+// where n is the number of elements in the vector and x is half of n, for
+// instance
+//   <x, x, x+1, x+1, x+2, x+2, ...>
+//   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+//
+// Undef mask entries are treated as whatever value is needed to fit. An empty
+// SDValue is returned when the mask does not fit.
+static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  const int NumElts = Indices.size();
+  const int Half = Indices.size() / 2;
+  const int *MaskEnd = Indices.end();
+
+  // Pick the shuffle operand whose highest-indexed elements feed every second
+  // mask entry from Start onwards; yields an empty SDValue if neither does.
+  auto PickSource = [&](const int *Start) -> SDValue {
+    if (fitsRegularPattern<int>(Start, 2, MaskEnd, Half, 1))
+      return Op->getOperand(0);
+    if (fitsRegularPattern<int>(Start, 2, MaskEnd, NumElts + Half, 1))
+      return Op->getOperand(1);
+    return SDValue();
+  };
+
+  // Even mask positions.
+  SDValue Wt = PickSource(Indices.begin());
+  if (!Wt)
+    return SDValue();
+
+  // Odd mask positions.
+  SDValue Ws = PickSource(Indices.begin() + 1);
+  if (!Ws)
+    return SDValue();
+
+  return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Ws, Wt);
+}
+
+// Try to lower a VECTOR_SHUFFLE to PCKEV.
+//
+// PCKEV copies the even elements of each vector into the result vector. A
+// mask qualifies when its first half and its second half each follow one of
+// the sequences
+//   <0, 2, 4, ...>
+//   <n, n+2, n+4, ...>
+// where n is the number of elements in the vector, for instance
+//   <0, 2, 4, ..., 0, 2, 4, ...>
+//   <0, 2, 4, ..., n, n+2, n+4, ...>
+//
+// Undef mask entries are treated as whatever value is needed to fit. An empty
+// SDValue is returned when the mask does not fit.
+static SDValue lowerVECTOR_SHUFFLE_PCKEV(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  const int NumElts = Indices.size();
+  const int *MaskBegin = Indices.begin();
+  const int *MaskMid = Indices.begin() + Indices.size() / 2;
+  const int *MaskEnd = Indices.end();
+
+  // Pick the shuffle operand whose even elements fill the contiguous mask
+  // range [Start, Stop); yields an empty SDValue if neither operand does.
+  auto PickSource = [&](const int *Start, const int *Stop) -> SDValue {
+    if (fitsRegularPattern<int>(Start, 1, Stop, 0, 2))
+      return Op->getOperand(0);
+    if (fitsRegularPattern<int>(Start, 1, Stop, NumElts, 2))
+      return Op->getOperand(1);
+    return SDValue();
+  };
+
+  // First half of the mask.
+  SDValue Wt = PickSource(MaskBegin, MaskMid);
+  if (!Wt)
+    return SDValue();
+
+  // Second half of the mask.
+  SDValue Ws = PickSource(MaskMid, MaskEnd);
+  if (!Ws)
+    return SDValue();
+
+  return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Ws, Wt);
+}
+
+// Try to lower a VECTOR_SHUFFLE to PCKOD.
+//
+// PCKOD copies the odd elements of each vector into the result vector. A
+// mask qualifies when its first half and its second half each follow one of
+// the sequences
+//   <1, 3, 5, ...>
+//   <n+1, n+3, n+5, ...>
+// where n is the number of elements in the vector, for instance
+//   <1, 3, 5, ..., 1, 3, 5, ...>
+//   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+//
+// Undef mask entries are treated as whatever value is needed to fit. An empty
+// SDValue is returned when the mask does not fit.
+static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  const int NumElts = Indices.size();
+  const int *MaskBegin = Indices.begin();
+  const int *MaskMid = Indices.begin() + Indices.size() / 2;
+  const int *MaskEnd = Indices.end();
+
+  // Pick the shuffle operand whose odd elements fill the contiguous mask
+  // range [Start, Stop); yields an empty SDValue if neither operand does.
+  auto PickSource = [&](const int *Start, const int *Stop) -> SDValue {
+    if (fitsRegularPattern<int>(Start, 1, Stop, 1, 2))
+      return Op->getOperand(0);
+    if (fitsRegularPattern<int>(Start, 1, Stop, NumElts + 1, 2))
+      return Op->getOperand(1);
+    return SDValue();
+  };
+
+  // First half of the mask.
+  SDValue Wt = PickSource(MaskBegin, MaskMid);
+  if (!Wt)
+    return SDValue();
+
+  // Second half of the mask.
+  SDValue Ws = PickSource(MaskMid, MaskEnd);
+  if (!Ws)
+    return SDValue();
+
+  return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Ws, Wt);
+}
+
+// Lower a VECTOR_SHUFFLE to VSHF.
+//
+// The shuffle mask is materialised as a BUILD_VECTOR and passed as the first
+// operand of the VSHF node. Operands of the shuffle that the mask never reads
+// are dropped: for a v8i16 whose indices are all below 8, for example, the
+// second operand is unused and may be anything, so the used operand is
+// supplied twice, which reduces the number of instructions overall.
+static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  EVT MaskVecTy = ResTy.changeVectorElementTypeToInteger();
+  EVT MaskEltTy = MaskVecTy.getVectorElementType();
+  SDLoc DL(Op);
+  const int NumElts = ResTy.getVectorNumElements();
+
+  // Work out which of the two shuffle operands the mask reads from. An index
+  // of -1 (undef) falls in neither range.
+  bool ReadsFirst = false;
+  bool ReadsSecond = false;
+  for (int Pos = 0; Pos < NumElts; ++Pos) {
+    const int Idx = Indices[Pos];
+    if (Idx >= 0 && Idx < NumElts)
+      ReadsFirst = true;
+    if (Idx >= NumElts && Idx < NumElts * 2)
+      ReadsSecond = true;
+  }
+
+  // Materialise the mask as a constant vector.
+  SmallVector<SDValue, 16> MaskElts;
+  for (int Idx : Indices)
+    MaskElts.push_back(DAG.getTargetConstant(Idx, DL, MaskEltTy));
+  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, MaskElts);
+
+  SDValue Op0;
+  SDValue Op1;
+  if (ReadsFirst && ReadsSecond) {
+    Op0 = Op->getOperand(0);
+    Op1 = Op->getOperand(1);
+  } else if (ReadsFirst) {
+    Op0 = Op1 = Op->getOperand(0);
+  } else if (ReadsSecond) {
+    Op0 = Op1 = Op->getOperand(1);
+  } else {
+    llvm_unreachable("shuffle vector mask references neither vector operand?");
+  }
+
+  // VECTOR_SHUFFLE concatenates its operands in a vectorwise fashion:
+  //   <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11>
+  // whereas VSHF concatenates them in a bitwise fashion:
+  //   <0b00, 0b01> + <0b10, 0b11> ->
+  //   0b0100 + 0b1110 -> 0b01001110
+  //   <0b10, 0b11, 0b00, 0b01>
+  // so the operands are swapped to obtain the correct result.
+  return DAG.getNode(MipsISD::VSHF, DL, ResTy, MaskVec, Op1, Op0);
+}
+
+// Lower a VECTOR_SHUFFLE by trying each specialised lowering in turn and
+// falling back to the general VSHF form. Shuffles whose result is not a
+// 128-bit vector are not handled here and yield an empty SDValue.
+SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *Node = cast<ShuffleVectorSDNode>(Op);
+  EVT ResTy = Op->getValueType(0);
+
+  if (!ResTy.is128BitVector())
+    return SDValue();
+
+  // Snapshot the shuffle mask.
+  SmallVector<int, 16> Indices;
+  for (int Elt = 0, NumElts = ResTy.getVectorNumElements(); Elt < NumElts;
+       ++Elt)
+    Indices.push_back(Node->getMaskElt(Elt));
+
+  // splati.[bhwd] is preferable to the others but is matched from
+  // MipsISD::VSHF, so a splat mask goes straight to the VSHF lowering.
+  if (isVECTOR_SHUFFLE_SPLATI(Op, ResTy, Indices, DAG))
+    return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+
+  // Each specialised lowering returns an empty SDValue when the mask does not
+  // fit its pattern; the first one that succeeds wins.
+  if (SDValue Res = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG))
+    return Res;
+  if (SDValue Res = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG))
+    return Res;
+  if (SDValue Res = lowerVECTOR_SHUFFLE_ILVL(Op, ResTy, Indices, DAG))
+    return Res;
+  if (SDValue Res = lowerVECTOR_SHUFFLE_ILVR(Op, ResTy, Indices, DAG))
+    return Res;
+  if (SDValue Res = lowerVECTOR_SHUFFLE_PCKEV(Op, ResTy, Indices, DAG))
+    return Res;
+  if (SDValue Res = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG))
+    return Res;
+  if (SDValue Res = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG))
+    return Res;
+
+  return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+}
+
+MachineBasicBlock *
+MipsSETargetLowering::emitBPOSGE32(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Expands the pseudo MI into a branch plus a small diamond that materialises
+ // 0 or 1 and merges the two values with a PHI. Returns the new $sink block,
+ // which receives everything that followed MI in BB.
+ //
+ // $bb:
+ // bposge32_pseudo $vr0
+ // =>
+ // $bb:
+ // bposge32 $tbb
+ // $fbb:
+ // li $vr2, 0
+ // b $sink
+ // $tbb:
+ // li $vr1, 1
+ // $sink:
+ // $vr0 = phi($vr2, $fbb, $vr1, $tbb)
+
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+ DebugLoc DL = MI.getDebugLoc();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ // The three new blocks are inserted, in order, directly after BB.
+ MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *Sink = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, FBB);
+ F->insert(It, TBB);
+ F->insert(It, Sink);
+
+ // Transfer the remainder of BB and its successor edges to Sink.
+ Sink->splice(Sink->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add successors.
+ BB->addSuccessor(FBB);
+ BB->addSuccessor(TBB);
+ FBB->addSuccessor(Sink);
+ TBB->addSuccessor(Sink);
+
+ // NOTE(review): both BPOSGE32 and BPOSGE32C_MMR3 are appended to $BB
+ // unconditionally, one after the other, each targeting $TBB. Presumably only
+ // one of the two is intended for any given subtarget mode -- confirm.
+ // Insert the real bposge32 instruction to $BB.
+ BuildMI(BB, DL, TII->get(Mips::BPOSGE32)).addMBB(TBB);
+ // Insert the real bposge32c instruction to $BB.
+ BuildMI(BB, DL, TII->get(Mips::BPOSGE32C_MMR3)).addMBB(TBB);
+
+ // Fill $FBB.
+ // VR2 = $zero + 0, followed by an unconditional branch to $Sink.
+ unsigned VR2 = RegInfo.createVirtualRegister(RC);
+ BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2)
+ .addReg(Mips::ZERO).addImm(0);
+ BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+ // Fill $TBB.
+ // VR1 = $zero + 1; no branch is emitted because $Sink is the next block.
+ unsigned VR1 = RegInfo.createVirtualRegister(RC);
+ BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1)
+ .addReg(Mips::ZERO).addImm(1);
+
+ // Insert phi function to $Sink.
+ // The PHI defines the pseudo's original result register (operand 0).
+ BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+ MI.getOperand(0).getReg())
+ .addReg(VR2)
+ .addMBB(FBB)
+ .addReg(VR1)
+ .addMBB(TBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return Sink;
+}
+
+MachineBasicBlock *MipsSETargetLowering::emitMSACBranchPseudo(
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned BranchOp) const {
+ // Expands the pseudo MI into a conditional branch (opcode BranchOp, testing
+ // the register in operand 1) plus a small diamond that materialises 0 or 1
+ // and merges the two values with a PHI into operand 0. Returns the new $sink
+ // block, which receives everything that followed MI in BB.
+ //
+ // Example, with BranchOp being bnz.b:
+ // $bb:
+ // vany_nonzero $rd, $ws
+ // =>
+ // $bb:
+ // bnz.b $ws, $tbb
+ // (no explicit branch to $fbb is emitted; $fbb is placed right after $bb)
+ // $fbb:
+ // li $rd1, 0
+ // b $sink
+ // $tbb:
+ // li $rd2, 1
+ // $sink:
+ // $rd = phi($rd1, $fbb, $rd2, $tbb)
+
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+ DebugLoc DL = MI.getDebugLoc();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ // The three new blocks are inserted, in order, directly after BB.
+ MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *Sink = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, FBB);
+ F->insert(It, TBB);
+ F->insert(It, Sink);
+
+ // Transfer the remainder of BB and its successor edges to Sink.
+ Sink->splice(Sink->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add successors.
+ BB->addSuccessor(FBB);
+ BB->addSuccessor(TBB);
+ FBB->addSuccessor(Sink);
+ TBB->addSuccessor(Sink);
+
+ // Insert the real branch instruction (BranchOp, e.g. bnz.b) to $BB.
+ BuildMI(BB, DL, TII->get(BranchOp))
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(TBB);
+
+ // Fill $FBB.
+ // RD1 = $zero + 0, followed by an unconditional branch to $Sink.
+ unsigned RD1 = RegInfo.createVirtualRegister(RC);
+ BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), RD1)
+ .addReg(Mips::ZERO).addImm(0);
+ BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+ // Fill $TBB.
+ // RD2 = $zero + 1; no branch is emitted because $Sink is the next block.
+ unsigned RD2 = RegInfo.createVirtualRegister(RC);
+ BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), RD2)
+ .addReg(Mips::ZERO).addImm(1);
+
+ // Insert phi function to $Sink.
+ // The PHI defines the pseudo's original result register (operand 0).
+ BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+ MI.getOperand(0).getReg())
+ .addReg(RD1)
+ .addMBB(FBB)
+ .addReg(RD2)
+ .addMBB(TBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return Sink;
+}
+
+// Emit the COPY_FW pseudo instruction.
+//
+// copy_fw_pseudo $fd, $ws, n
+// =>
+// splati.w $wt, $ws, $n
+// copy $fd, $wt:sub_lo
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is never valid
+// for lane 1 because it would require FR=0 mode which isn't supported by MSA.
+MachineBasicBlock *
+MipsSETargetLowering::emitCOPY_FW(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Operands of the pseudo: 0 = destination FP register, 1 = source MSA
+ // register, 2 = lane immediate. The replacement instructions are inserted
+ // before MI, MI is erased, and BB is returned unchanged otherwise.
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Fd = MI.getOperand(0).getReg();
+ unsigned Ws = MI.getOperand(1).getReg();
+ unsigned Lane = MI.getOperand(2).getImm();
+
+ if (Lane == 0) {
+ // Lane 0 is read directly through the sub_lo sub-register of the source.
+ unsigned Wt = Ws;
+ if (!Subtarget.useOddSPReg()) {
+ // We must copy to an even-numbered MSA register so that the
+ // single-precision sub-register is also guaranteed to be even-numbered.
+ Wt = RegInfo.createVirtualRegister(&Mips::MSA128WEvensRegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Wt).addReg(Ws);
+ }
+
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+ } else {
+ // Any other lane is first splatted with splati.w into a temporary (taken
+ // from the evens-only class when odd single-precision registers are not
+ // in use), then read through that temporary's sub_lo sub-register.
+ unsigned Wt = RegInfo.createVirtualRegister(
+ Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
+ &Mips::MSA128WEvensRegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+ }
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the COPY_FD pseudo instruction.
+//
+// copy_fd_pseudo $fd, $ws, n
+// =>
+// splati.d $wt, $ws, $n
+// copy $fd, $wt:sub_64
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is always
+// valid because FR=1 mode is the only mode supported by MSA.
+MachineBasicBlock *
+MipsSETargetLowering::emitCOPY_FD(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Operands of the pseudo: 0 = destination FP register, 1 = source MSA
+ // register, 2 = lane immediate. The replacement instructions are inserted
+ // before MI, MI is erased, and BB is returned.
+ assert(Subtarget.isFP64bit());
+
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ unsigned Fd = MI.getOperand(0).getReg();
+ unsigned Ws = MI.getOperand(1).getReg();
+ // Lane is only compared against zero below, so the scaling by two does not
+ // change which path is taken.
+ unsigned Lane = MI.getOperand(2).getImm() * 2;
+ DebugLoc DL = MI.getDebugLoc();
+
+ if (Lane == 0)
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
+ else {
+ unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+ // The splat index is hard-coded to 1 rather than derived from Lane.
+ // NOTE(review): presumably 1 is the only possible non-zero lane of a
+ // two-element vector -- confirm.
+ BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
+ }
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the INSERT_FW pseudo instruction.
+//
+// insert_fw_pseudo $wd, $wd_in, $n, $fs
+// =>
+// subreg_to_reg $wt:sub_lo, $fs
+// insve_w $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FW(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Operands of the pseudo: 0 = result vector, 1 = input vector, 2 = lane
+ // immediate, 3 = FP register holding the value to insert. The replacement
+ // instructions are inserted before MI, MI is erased, and BB is returned.
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned Wd_in = MI.getOperand(1).getReg();
+ unsigned Lane = MI.getOperand(2).getImm();
+ unsigned Fs = MI.getOperand(3).getReg();
+ // The temporary comes from the evens-only register class when odd
+ // single-precision registers are not in use.
+ unsigned Wt = RegInfo.createVirtualRegister(
+ Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
+ &Mips::MSA128WEvensRegClass);
+
+ // Place $fs in the sub_lo sub-register of the temporary vector register.
+ BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+ .addImm(0)
+ .addReg(Fs)
+ .addImm(Mips::sub_lo);
+ // Insert element 0 of the temporary into element Lane of the input vector.
+ BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_W), Wd)
+ .addReg(Wd_in)
+ .addImm(Lane)
+ .addReg(Wt)
+ .addImm(0);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the INSERT_FD pseudo instruction.
+//
+// insert_fd_pseudo $wd, $wd_in, $n, $fs
+// =>
+// subreg_to_reg $wt:sub_64, $fs
+// insve_d $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FD(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Operands of the pseudo: 0 = result vector, 1 = input vector, 2 = lane
+ // immediate, 3 = FP register holding the value to insert. The replacement
+ // instructions are inserted before MI, MI is erased, and BB is returned.
+ assert(Subtarget.isFP64bit());
+
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned Wd_in = MI.getOperand(1).getReg();
+ unsigned Lane = MI.getOperand(2).getImm();
+ unsigned Fs = MI.getOperand(3).getReg();
+ unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+ // Place $fs in the sub_64 sub-register of the temporary vector register.
+ BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+ .addImm(0)
+ .addReg(Fs)
+ .addImm(Mips::sub_64);
+ // Insert element 0 of the temporary into element Lane of the input vector.
+ BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_D), Wd)
+ .addReg(Wd_in)
+ .addImm(Lane)
+ .addReg(Wt)
+ .addImm(0);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction.
+//
+// For integer:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $rs)
+// =>
+// (SLL $lanetmp1, $lane, <log2size>)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSERT_[BHWD], $wdtmp2, $wdtmp1, 0, $rs)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2, $lanetmp2)
+//
+// For floating point:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $fs)
+// =>
+// (SUBREG_TO_REG $wt, $fs, <subreg>)
+// (SLL $lanetmp1, $lane, <log2size>)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSVE_[WD] $wdtmp2, $wdtmp1, 0, $wt, 0)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2, $lanetmp2)
+MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX(
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned EltSizeInBytes,
+ bool IsFP) const {
+ // Operands of the pseudo: 0 = result vector, 1 = source vector, 2 = register
+ // holding the lane index, 3 = register holding the value to insert.
+ // EltSizeInBytes must be 1, 2, 4 or 8; IsFP selects the insve.df path over
+ // the insert.df path. The replacement instructions are inserted before MI,
+ // MI is erased, and BB is returned.
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned SrcVecReg = MI.getOperand(1).getReg();
+ unsigned LaneReg = MI.getOperand(2).getReg();
+ unsigned SrcValReg = MI.getOperand(3).getReg();
+
+ const TargetRegisterClass *VecRC = nullptr;
+ // On N64 the lane arithmetic uses 64-bit GPRs and 64-bit opcodes, and the
+ // sub_32 sub-register of the lane register is what gets passed to sld.b.
+ // FIXME: This should be true for N32 too.
+ const TargetRegisterClass *GPRRC =
+ Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+ unsigned SubRegIdx = Subtarget.isABI_N64() ? Mips::sub_32 : 0;
+ unsigned ShiftOp = Subtarget.isABI_N64() ? Mips::DSLL : Mips::SLL;
+ unsigned EltLog2Size;
+ unsigned InsertOp = 0;
+ unsigned InsveOp = 0;
+ // Select the shift amount, opcodes and vector register class that match the
+ // element size.
+ switch (EltSizeInBytes) {
+ default:
+ llvm_unreachable("Unexpected size");
+ case 1:
+ EltLog2Size = 0;
+ InsertOp = Mips::INSERT_B;
+ InsveOp = Mips::INSVE_B;
+ VecRC = &Mips::MSA128BRegClass;
+ break;
+ case 2:
+ EltLog2Size = 1;
+ InsertOp = Mips::INSERT_H;
+ InsveOp = Mips::INSVE_H;
+ VecRC = &Mips::MSA128HRegClass;
+ break;
+ case 4:
+ EltLog2Size = 2;
+ InsertOp = Mips::INSERT_W;
+ InsveOp = Mips::INSVE_W;
+ VecRC = &Mips::MSA128WRegClass;
+ break;
+ case 8:
+ EltLog2Size = 3;
+ InsertOp = Mips::INSERT_D;
+ InsveOp = Mips::INSVE_D;
+ VecRC = &Mips::MSA128DRegClass;
+ break;
+ }
+
+ if (IsFP) {
+ // Move the FP value into a vector register (sub_64 for 8-byte elements,
+ // sub_lo otherwise) so that insve.df can read it as element zero.
+ unsigned Wt = RegInfo.createVirtualRegister(VecRC);
+ BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+ .addImm(0)
+ .addReg(SrcValReg)
+ .addImm(EltSizeInBytes == 8 ? Mips::sub_64 : Mips::sub_lo);
+ SrcValReg = Wt;
+ }
+
+ // Convert the lane index into a byte index
+ // (byte-sized elements need no shift, so the lane register is used as-is).
+ if (EltSizeInBytes != 1) {
+ unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC);
+ BuildMI(*BB, MI, DL, TII->get(ShiftOp), LaneTmp1)
+ .addReg(LaneReg)
+ .addImm(EltLog2Size);
+ LaneReg = LaneTmp1;
+ }
+
+ // Rotate bytes around so that the desired lane is element zero
+ unsigned WdTmp1 = RegInfo.createVirtualRegister(VecRC);
+ BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1)
+ .addReg(SrcVecReg)
+ .addReg(SrcVecReg)
+ .addReg(LaneReg, 0, SubRegIdx);
+
+ unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC);
+ if (IsFP) {
+ // Use insve.df to insert to element zero
+ BuildMI(*BB, MI, DL, TII->get(InsveOp), WdTmp2)
+ .addReg(WdTmp1)
+ .addImm(0)
+ .addReg(SrcValReg)
+ .addImm(0);
+ } else {
+ // Use insert.df to insert to element zero
+ BuildMI(*BB, MI, DL, TII->get(InsertOp), WdTmp2)
+ .addReg(WdTmp1)
+ .addReg(SrcValReg)
+ .addImm(0);
+ }
+
+ // Rotate elements the rest of the way for a full rotation.
+ // sld.df interprets $rt modulo the number of columns so we only need to
+ // negate the lane index to do this.
+ // The negation is computed as (zero register - byte index).
+ unsigned LaneTmp2 = RegInfo.createVirtualRegister(GPRRC);
+ BuildMI(*BB, MI, DL, TII->get(Subtarget.isABI_N64() ? Mips::DSUB : Mips::SUB),
+ LaneTmp2)
+ .addReg(Subtarget.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO)
+ .addReg(LaneReg);
+ BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), Wd)
+ .addReg(WdTmp2)
+ .addReg(WdTmp2)
+ .addReg(LaneTmp2, 0, SubRegIdx);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the FILL_FW pseudo instruction.
+//
+// fill_fw_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_lo, $wt1, $fs
+// splati.w $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FW(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Operands of the pseudo: 0 = result vector, 1 = FP register to replicate.
+ // The replacement instructions are inserted before MI, MI is erased, and BB
+ // is returned.
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned Fs = MI.getOperand(1).getReg();
+ unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+ unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+ // Start from an undefined vector, put $fs into its sub_lo sub-register, then
+ // splat element 0 across the result.
+ BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+ .addReg(Wt1)
+ .addReg(Fs)
+ .addImm(Mips::sub_lo);
+ BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wd).addReg(Wt2).addImm(0);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the FILL_FD pseudo instruction.
+//
+// fill_fd_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_64, $wt1, $fs
+// splati.d $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FD(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Operands of the pseudo: 0 = result vector, 1 = FP register to replicate.
+ // The replacement instructions are inserted before MI, MI is erased, and BB
+ // is returned.
+ assert(Subtarget.isFP64bit());
+
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned Fs = MI.getOperand(1).getReg();
+ unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+ unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+ // Start from an undefined vector, put $fs into its sub_64 sub-register, then
+ // splat element 0 across the result.
+ BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+ .addReg(Wt1)
+ .addReg(Fs)
+ .addImm(Mips::sub_64);
+ BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wd).addReg(Wt2).addImm(0);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the ST_F16_PSEUDO instruction to store an f16 value from an MSA
+// register.
+//
+// STF16 MSA128F16:$wd, mem_simm10:$addr
+// =>
+// copy_u.h $rtemp,$wd[0]
+// sh $rtemp, $addr
+//
+// Safety: We can't use st.h & co as they would overwrite the memory after
+// the destination. It would require half floats be allocated 16 bytes(!) of
+// space.
+MachineBasicBlock *
+MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+
+ // Operands of the pseudo: 0 = MSA register holding the value, 1 = address
+ // base register. The replacement instructions are inserted before MI, MI is
+ // erased, and BB is returned.
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Ws = MI.getOperand(0).getReg();
+ // Operand 1 is read as a register here unconditionally, although the
+ // register-class selection below also allows for it not being a register.
+ unsigned Rt = MI.getOperand(1).getReg();
+ // The store offset is taken from the first memory operand of the pseudo.
+ // NOTE(review): presumably this always equals the pseudo's own immediate
+ // offset operand -- confirm.
+ const MachineMemOperand &MMO = **MI.memoperands_begin();
+ unsigned Imm = MMO.getOffset();
+
+ // Caution: A load via the GOT can expand to a GPR32 operand, a load via
+ // spill and reload can expand as a GPR64 operand. Examine the
+ // operand in detail and default to ABI.
+ const TargetRegisterClass *RC =
+ MI.getOperand(1).isReg() ? RegInfo.getRegClass(MI.getOperand(1).getReg())
+ : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass
+ : &Mips::GPR64RegClass);
+ const bool UsingMips32 = RC == &Mips::GPR32RegClass;
+ unsigned Rs = RegInfo.createVirtualRegister(RC);
+
+ // Extract element 0 of the vector into a GPR, then store it with sh (or
+ // sh64 when the base register is not a GPR32).
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0);
+ BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::SH : Mips::SH64))
+ .addReg(Rs)
+ .addReg(Rt)
+ .addImm(Imm)
+ .addMemOperand(BB->getParent()->getMachineMemOperand(
+ &MMO, MMO.getOffset(), MMO.getSize()));
+
+ MI.eraseFromParent();
+ return BB;
+}
+
+// Emit the LD_F16_PSEUDO instruction to load an f16 value into an MSA register.
+//
+// LD_F16 MSA128F16:$wd, mem_simm10:$addr
+// =>
+// lh $rtemp, $addr
+// fill.h $wd, $rtemp
+//
+// Safety: We can't use ld.h & co as they over-read from the source.
+// Additionally, if the address is not a multiple of 16, two cases can occur:
+// a) Segmentation fault as the load instruction reads from a memory page
+// it's not supposed to.
+// b) The load crosses an implementation specific boundary, requiring OS
+// intervention.
+//
+MachineBasicBlock *
+MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+
+ // Operand 0 of the pseudo is the destination MSA register; the remaining
+ // operands are forwarded unchanged to the scalar load. The replacement
+ // instructions are inserted before MI, MI is erased, and BB is returned.
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+
+ // Caution: A load via the GOT can expand to a GPR32 operand, a load via
+ // spill and reload can expand as a GPR64 operand. Examine the
+ // operand in detail and default to ABI.
+ const TargetRegisterClass *RC =
+ MI.getOperand(1).isReg() ? RegInfo.getRegClass(MI.getOperand(1).getReg())
+ : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass
+ : &Mips::GPR64RegClass);
+
+ const bool UsingMips32 = RC == &Mips::GPR32RegClass;
+ unsigned Rt = RegInfo.createVirtualRegister(RC);
+
+ // Load the halfword into a GPR with lh (or lh64 when the base register is
+ // not a GPR32), reusing every operand of the pseudo after the destination.
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::LH : Mips::LH64), Rt);
+ for (unsigned i = 1; i < MI.getNumOperands(); i++)
+ MIB.addOperand(MI.getOperand(i));
+
+ // Replicate the loaded value into the destination vector with fill.h.
+ BuildMI(*BB, MI, DL, TII->get(Mips::FILL_H), Wd).addReg(Rt);
+
+ MI.eraseFromParent();
+ return BB;
+}
+
+// Emit the FPROUND_PSEUDO instruction.
+//
+// Round an FGR64Opnd, FGR32Opnd to an f16.
+//
+// Safety: Cycle the operand through the GPRs so the result always ends up
+// in the correct MSA register.
+//
+// FIXME: This copying is strictly unnecessary. If we could tie FGR32Opnd:$Fs
+// / FGR64Opnd:$Fs and MSA128F16:$Wd to the same physical register
+// (which they can be, as the MSA registers are defined to alias the
+// FPU's 64 bit and 32 bit registers) the result can be accessed using
+// the correct register class. That requires operands be tie-able across
+// register classes which have a sub/super register class relationship.
+//
+// For FGR32Opnd:
+//
+// FPROUND MSA128F16:$wd, FGR32Opnd:$fs
+// =>
+// mfc1 $rtemp, $fs
+// fill.w $rtemp, $wtemp
+// fexdo.w $wd, $wtemp, $wtemp
+//
+// For FGR64Opnd on mips32r2+:
+//
+// FPROUND MSA128F16:$wd, FGR64Opnd:$fs
+// =>
+// mfc1 $rtemp, $fs
+// fill.w $rtemp, $wtemp
+// mfhc1 $rtemp2, $fs
+// insert.w $wtemp[1], $rtemp2
+// insert.w $wtemp[3], $rtemp2
+// fexdo.w $wtemp2, $wtemp, $wtemp
+// fexdo.h $wd, $temp2, $temp2
+//
+// For FGR64Opnd on mips64r2+:
+//
+// FPROUND MSA128F16:$wd, FGR64Opnd:$fs
+// =>
+// dmfc1 $rtemp, $fs
+// fill.d $rtemp, $wtemp
+// fexdo.w $wtemp2, $wtemp, $wtemp
+// fexdo.h $wd, $wtemp2, $wtemp2
+//
+// Safety note: As $wtemp is UNDEF, we may provoke a spurious exception if the
+// undef bits are "just right" and the exception enable bits are
+// set. By using fill.w to replicate $fs into all elements over
+// insert.w for one element, we avoid that potential case. If
+// fexdo.[hw] causes an exception, the exception is valid and it
+// occurs for all elements.
+//
+MachineBasicBlock *
+MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI,
+                                         MachineBasicBlock *BB,
+                                         bool IsFGR64) const {
+  // MSA formally requires MIPS32R5. This lowering is deliberately lenient and
+  // only insists on MSA plus MIPS32R2; plain MIPS32 would be technically
+  // possible but is forbidden by the ISA.
+  assert(Subtarget.hasMSA() && Subtarget.hasMips32r2());
+
+  // A 64-bit FPU source on a 64-bit core can be moved with one 64-bit GPR.
+  const bool Use64BitGPR = Subtarget.hasMips64() && IsFGR64;
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned DstW = MI.getOperand(0).getReg();
+  unsigned SrcF = MI.getOperand(1).getReg();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  unsigned Splat = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+
+  const TargetRegisterClass *GPRClass = &Mips::GPR32RegClass;
+  unsigned MoveFromFPUOpc = Mips::MFC1;
+  unsigned SplatOpc = Mips::FILL_W;
+  if (Use64BitGPR) {
+    GPRClass = &Mips::GPR64RegClass;
+    MoveFromFPUOpc = Mips::DMFC1;
+    SplatOpc = Mips::FILL_D;
+  }
+
+  // Register class copy: FPU register -> GPR -> every element of an MSA
+  // register.
+  unsigned LowGPR = MRI.createVirtualRegister(GPRClass);
+  BuildMI(*BB, MI, DL, TII->get(MoveFromFPUOpc), LowGPR).addReg(SrcF);
+  BuildMI(*BB, MI, DL, TII->get(SplatOpc), Splat).addReg(LowGPR);
+  unsigned Current = Splat;
+
+  // 64-bit source on a 32-bit core: fetch the high word separately and insert
+  // it into elements 1 and 3 of the splatted vector.
+  if (!Subtarget.hasMips64() && IsFGR64) {
+    unsigned HighGPR = MRI.createVirtualRegister(GPRClass);
+    BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), HighGPR).addReg(SrcF);
+    unsigned WithElt1 = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+    unsigned WithElt3 = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+    BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_W), WithElt1)
+        .addReg(Splat)
+        .addReg(HighGPR)
+        .addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_W), WithElt3)
+        .addReg(WithElt1)
+        .addReg(HighGPR)
+        .addImm(3);
+    Current = WithElt3;
+  }
+
+  // A 64-bit source needs an extra fexdo.w step before the final fexdo.h.
+  if (IsFGR64) {
+    unsigned Narrowed = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+    BuildMI(*BB, MI, DL, TII->get(Mips::FEXDO_W), Narrowed)
+        .addReg(Current)
+        .addReg(Current);
+    Current = Narrowed;
+  }
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXDO_H), DstW)
+      .addReg(Current)
+      .addReg(Current);
+
+  // The pseudo has been fully replaced.
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Emit the FPEXTEND_PSEUDO instruction.
+//
+// Expand an f16 to either a FGR32Opnd or FGR64Opnd.
+//
+// Safety: Cycle the result through the GPRs so the result always ends up
+// in the correct floating point register.
+//
+// FIXME: This copying is strictly unnecessary. If we could tie FGR32Opnd:$Fd
+// / FGR64Opnd:$Fd and MSA128F16:$Ws to the same physical register
+// (which they can be, as the MSA registers are defined to alias the
+// FPU's 64 bit and 32 bit registers) the result can be accessed using
+// the correct register class. That requires operands be tie-able across
+// register classes which have a sub/super register class relationship. I
+// haven't checked.
+//
+// For FGR32Opnd:
+//
+// FPEXTEND FGR32Opnd:$fd, MSA128F16:$ws
+// =>
+// fexupr.w $wtemp, $ws
+// copy_s.w $rtemp, $wtemp[0]
+// mtc1 $rtemp, $fd
+//
+// For FGR64Opnd on Mips64:
+//
+// FPEXTEND FGR64Opnd:$fd, MSA128F16:$ws
+// =>
+// fexupr.w $wtemp, $ws
+// fexupr.d $wtemp2, $wtemp
+// copy_s.d $rtemp, $wtemp2[0]
+// dmtc1 $rtemp, $fd
+//
+// For FGR64Opnd on Mips32:
+//
+// FPEXTEND FGR64Opnd:$fd, MSA128F16:$ws
+// =>
+// fexupr.w $wtemp, $ws
+// fexupr.d $wtemp2, $wtemp
+// copy_s.w $rtemp, $wtemp2[0]
+// mtc1 $rtemp, $ftemp
+// copy_s.w $rtemp2, $wtemp2[1]
+// $fd = mthc1 $rtemp2, $ftemp
+//
+MachineBasicBlock *
+MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI,
+                                          MachineBasicBlock *BB,
+                                          bool IsFGR64) const {
+  // MSA formally requires MIPS32R5. This lowering is deliberately lenient and
+  // only insists on MSA plus MIPS32R2; plain MIPS32 would be technically
+  // possible but is forbidden by the ISA.
+  assert(Subtarget.hasMSA() && Subtarget.hasMips32r2());
+
+  const bool Wide64 = Subtarget.hasMips64() && IsFGR64;
+  const bool Split32 = !Subtarget.hasMips64() && IsFGR64;
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned DstF = MI.getOperand(0).getReg();
+  unsigned SrcW = MI.getOperand(1).getReg();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  const TargetRegisterClass *GPRClass = &Mips::GPR32RegClass;
+  unsigned MoveToFPUOpc = Mips::MTC1;
+  unsigned ExtractOpc = Mips::COPY_S_W;
+  if (Wide64) {
+    GPRClass = &Mips::GPR64RegClass;
+    MoveToFPUOpc = Mips::DMTC1;
+    ExtractOpc = Mips::COPY_S_D;
+  }
+
+  // First widening step; a 64-bit destination needs a second one.
+  unsigned Widened = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+  unsigned Current = Widened;
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXUPR_W), Widened).addReg(SrcW);
+
+  if (IsFGR64) {
+    Current = MRI.createVirtualRegister(&Mips::MSA128DRegClass);
+    BuildMI(*BB, MI, DL, TII->get(Mips::FEXUPR_D), Current).addReg(Widened);
+  }
+
+  // Register class copy: element 0 of the MSA register -> GPR -> FPU
+  // register. On the split 32-bit path the low word goes into a temporary
+  // FGR64 register that the high-word move below completes.
+  unsigned LowGPR = MRI.createVirtualRegister(GPRClass);
+  unsigned LowDst = DstF;
+  if (Split32)
+    LowDst = MRI.createVirtualRegister(&Mips::FGR64RegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(ExtractOpc), LowGPR).addReg(Current).addImm(0);
+  BuildMI(*BB, MI, DL, TII->get(MoveToFPUOpc), LowDst).addReg(LowGPR);
+
+  if (Split32) {
+    // Extract element 1 and move it into the high half of the destination.
+    unsigned HighGPR = MRI.createVirtualRegister(GPRClass);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY_S_W), HighGPR)
+        .addReg(Current)
+        .addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::MTHC1_D64), DstF)
+        .addReg(LowDst)
+        .addReg(HighGPR);
+  }
+
+  // The pseudo has been fully replaced.
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Emit the FEXP2_W_1 pseudo instructions.
+//
+// fexp2_w_1_pseudo $wd, $wt
+// =>
+// ldi.w $ws1, 1
+// ffint_u.w $ws2, $ws1
+// fexp2.w $wd, $ws2, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_W_1(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  unsigned IntOnes = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+  unsigned FPOnes = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Build a vector of 1.0: splat the integer 1, then convert it to floating
+  // point.
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_W), IntOnes).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_W), FPOnes).addReg(IntOnes);
+
+  // Result is 1.0 * fexp2(Wt).
+  unsigned DstW = MI.getOperand(0).getReg();
+  unsigned SrcW = MI.getOperand(1).getReg();
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_W), DstW)
+      .addReg(FPOnes)
+      .addReg(SrcW);
+
+  // The pseudo instruction is gone now.
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Emit the FEXP2_D_1 pseudo instructions.
+//
+// fexp2_d_1_pseudo $wd, $wt
+// =>
+// ldi.d $ws1, 1
+// ffint_u.d $ws2, $ws1
+// fexp2.d $wd, $ws2, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_D_1(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  unsigned IntOnes = MRI.createVirtualRegister(&Mips::MSA128DRegClass);
+  unsigned FPOnes = MRI.createVirtualRegister(&Mips::MSA128DRegClass);
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Build a vector of 1.0: splat the integer 1, then convert it to floating
+  // point.
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_D), IntOnes).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_D), FPOnes).addReg(IntOnes);
+
+  // Result is 1.0 * fexp2(Wt).
+  unsigned DstW = MI.getOperand(0).getReg();
+  unsigned SrcW = MI.getOperand(1).getReg();
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_D), DstW)
+      .addReg(FPOnes)
+      .addReg(SrcW);
+
+  // The pseudo instruction is gone now.
+  MI.eraseFromParent();
+  return BB;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
new file mode 100644
index 000000000000..0abb9b318bda
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -0,0 +1,131 @@
+//===-- MipsSEISelLowering.h - MipsSE DAG Lowering Interface ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEISELLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEISELLOWERING_H
+
+#include "MipsISelLowering.h"
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+ /// \brief MipsTargetLowering subclass specialized for mips32/64, including
+ /// the custom-inserter expansions for the MSA pseudo instructions.
+ class MipsSETargetLowering : public MipsTargetLowering {
+ public:
+ explicit MipsSETargetLowering(const MipsTargetMachine &TM,
+ const MipsSubtarget &STI);
+
+ /// \brief Enable MSA support for the given integer type and Register
+ /// class.
+ void addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC);
+ /// \brief Enable MSA support for the given floating-point type and
+ /// Register class.
+ void addMSAFloatType(MVT::SimpleValueType Ty,
+ const TargetRegisterClass *RC);
+
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS = 0,
+ unsigned Align = 1,
+ bool *Fast = nullptr) const override;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ /// \brief Always reports shuffle masks as not legal.
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const override {
+ return false;
+ }
+
+ const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
+
+ private:
+ bool isEligibleForTailCallOptimization(
+ const CCState &CCInfo, unsigned NextStackOffset,
+ const MipsFunctionInfo &FI) const override;
+
+ void
+ getOpndList(SmallVectorImpl<SDValue> &Ops,
+ std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+ bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+ bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+ SDValue Chain) const override;
+
+ SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
+ SelectionDAG &DAG) const;
+
+ SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ /// \brief Lower VECTOR_SHUFFLE into one of a number of instructions
+ /// depending on the indices in the shuffle.
+ SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+
+ MachineBasicBlock *emitBPOSGE32(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitMSACBranchPseudo(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned BranchOp) const;
+ /// \brief Emit the COPY_FW pseudo instruction
+ MachineBasicBlock *emitCOPY_FW(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the COPY_FD pseudo instruction
+ MachineBasicBlock *emitCOPY_FD(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the INSERT_FW pseudo instruction
+ MachineBasicBlock *emitINSERT_FW(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the INSERT_FD pseudo instruction
+ MachineBasicBlock *emitINSERT_FD(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
+ MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned EltSizeInBytes,
+ bool IsFP) const;
+ /// \brief Emit the FILL_FW pseudo instruction
+ MachineBasicBlock *emitFILL_FW(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the FILL_FD pseudo instruction
+ MachineBasicBlock *emitFILL_FD(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the FEXP2_W_1 pseudo instructions.
+ MachineBasicBlock *emitFEXP2_W_1(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the FEXP2_D_1 pseudo instructions.
+ MachineBasicBlock *emitFEXP2_D_1(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the LD_F16_PSEUDO instruction (f16 load into an MSA
+ /// register).
+ MachineBasicBlock *emitLD_F16_PSEUDO(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the ST_F16_PSEUDO instruction.
+ MachineBasicBlock *emitST_F16_PSEUDO(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the FPEXTEND_PSEUDO instruction: expand an f16 to an
+ /// FGR32 (IsFGR64 false) or FGR64 (IsFGR64 true) operand.
+ MachineBasicBlock *emitFPEXTEND_PSEUDO(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ bool IsFGR64) const;
+ /// \brief Emit the FPROUND_PSEUDO instruction: round an FGR32
+ /// (IsFGR64 false) or FGR64 (IsFGR64 true) operand to an f16.
+ MachineBasicBlock *emitFPROUND_PSEUDO(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ bool IsFGR64) const;
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
new file mode 100644
index 000000000000..ea703d0edd96
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -0,0 +1,754 @@
+//===-- MipsSEInstrInfo.cpp - Mips32/64 Instruction Information -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEInstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// Construct the Mips32/64 instruction info for the given subtarget. The second
+// argument handed to the MipsInstrInfo base is Mips::B when the subtarget is
+// position independent and Mips::J otherwise; RI is value-initialized.
+MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI)
+ : MipsInstrInfo(STI, STI.isPositionIndependent() ? Mips::B : Mips::J),
+ RI() {}
+
+// Return the register info object (the RI member) owned by this instruction
+// info.
+const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
+ return RI;
+}
+
+/// isLoadFromStackSlot - Recognize a direct load from a stack slot.
+///
+/// When MI is one of LW, LD, LWC1, LDC1 or LDC164 whose address is a frame
+/// index with a zero immediate offset, store that frame index in FrameIndex
+/// and return the destination register (virtual or physical). Return 0 in
+/// every other case; this predicate must return 0 if the instruction has any
+/// side effects other than loading from the stack slot.
+unsigned MipsSEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                              int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  case Mips::LW:
+  case Mips::LD:
+  case Mips::LWC1:
+  case Mips::LDC1:
+  case Mips::LDC164:
+    break;
+  default:
+    return 0;
+  }
+
+  // The base must be a stack slot ...
+  if (!MI.getOperand(1).isFI())
+    return 0;
+
+  // ... addressed with an immediate offset of exactly zero.
+  const MachineOperand &Offset = MI.getOperand(2);
+  if (!Offset.isImm() || !isZeroImm(Offset))
+    return 0;
+
+  FrameIndex = MI.getOperand(1).getIndex();
+  return MI.getOperand(0).getReg();
+}
+
+/// isStoreToStackSlot - Recognize a direct store to a stack slot.
+///
+/// When MI is one of SW, SD, SWC1, SDC1 or SDC164 whose address is a frame
+/// index with a zero immediate offset, store that frame index in FrameIndex
+/// and return the source register (virtual or physical). Return 0 in every
+/// other case; this predicate must return 0 if the instruction has any side
+/// effects other than storing to the stack slot.
+unsigned MipsSEInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                             int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  case Mips::SW:
+  case Mips::SD:
+  case Mips::SWC1:
+  case Mips::SDC1:
+  case Mips::SDC164:
+    break;
+  default:
+    return 0;
+  }
+
+  // The base must be a stack slot ...
+  if (!MI.getOperand(1).isFI())
+    return 0;
+
+  // ... addressed with an immediate offset of exactly zero.
+  const MachineOperand &Offset = MI.getOperand(2);
+  if (!Offset.isImm() || !isZeroImm(Offset))
+    return 0;
+
+  FrameIndex = MI.getOperand(1).getIndex();
+  return MI.getOperand(0).getReg();
+}
+
+// Emit, before I in MBB, an instruction that copies SrcReg into DestReg.
+//
+// The opcode is picked from the register classes that contain the two
+// registers. Copies involving the DSP condition-code or MSA control registers
+// are built directly and return early. Every other case falls through to the
+// common builder at the bottom, which asserts that an opcode was found.
+//
+// KillSrc is forwarded as the kill state of the source operand.
+void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ // ZeroReg, when set, is appended as an extra source operand (the OR-based
+ // moves below are encoded as "or dest, src, zero").
+ unsigned Opc = 0, ZeroReg = 0;
+ bool isMicroMips = Subtarget.inMicroMipsMode();
+
+ if (Mips::GPR32RegClass.contains(DestReg)) { // Copy to CPU Reg.
+ if (Mips::GPR32RegClass.contains(SrcReg)) {
+ if (isMicroMips)
+ Opc = Mips::MOVE16_MM;
+ else
+ Opc = Mips::OR, ZeroReg = Mips::ZERO;
+ } else if (Mips::CCRRegClass.contains(SrcReg))
+ Opc = Mips::CFC1;
+ else if (Mips::FGR32RegClass.contains(SrcReg))
+ Opc = Mips::MFC1;
+ else if (Mips::HI32RegClass.contains(SrcReg)) {
+ Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
+ // Clearing SrcReg keeps it from being added as an explicit operand below.
+ SrcReg = 0;
+ } else if (Mips::LO32RegClass.contains(SrcReg)) {
+ Opc = isMicroMips ? Mips::MFLO16_MM : Mips::MFLO;
+ SrcReg = 0;
+ } else if (Mips::HI32DSPRegClass.contains(SrcReg))
+ Opc = Mips::MFHI_DSP;
+ else if (Mips::LO32DSPRegClass.contains(SrcReg))
+ Opc = Mips::MFLO_DSP;
+ else if (Mips::DSPCCRegClass.contains(SrcReg)) {
+ // RDDSP takes an immediate mask; the source is attached as an implicit use.
+ BuildMI(MBB, I, DL, get(Mips::RDDSP), DestReg).addImm(1 << 4)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ return;
+ }
+ else if (Mips::MSACtrlRegClass.contains(SrcReg))
+ Opc = Mips::CFCMSA;
+ }
+ else if (Mips::GPR32RegClass.contains(SrcReg)) { // Copy from CPU Reg.
+ if (Mips::CCRRegClass.contains(DestReg))
+ Opc = Mips::CTC1;
+ else if (Mips::FGR32RegClass.contains(DestReg))
+ Opc = Mips::MTC1;
+ else if (Mips::HI32RegClass.contains(DestReg))
+ // Clearing DestReg keeps it from being added as an explicit def below.
+ Opc = Mips::MTHI, DestReg = 0;
+ else if (Mips::LO32RegClass.contains(DestReg))
+ Opc = Mips::MTLO, DestReg = 0;
+ else if (Mips::HI32DSPRegClass.contains(DestReg))
+ Opc = Mips::MTHI_DSP;
+ else if (Mips::LO32DSPRegClass.contains(DestReg))
+ Opc = Mips::MTLO_DSP;
+ else if (Mips::DSPCCRegClass.contains(DestReg)) {
+ // WRDSP takes an immediate mask; the destination is an implicit def.
+ BuildMI(MBB, I, DL, get(Mips::WRDSP))
+ .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1 << 4)
+ .addReg(DestReg, RegState::ImplicitDefine);
+ return;
+ } else if (Mips::MSACtrlRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(Mips::CTCMSA))
+ .addReg(DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ }
+ else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
+ Opc = Mips::FMOV_S;
+ else if (Mips::AFGR64RegClass.contains(DestReg, SrcReg))
+ Opc = Mips::FMOV_D32;
+ else if (Mips::FGR64RegClass.contains(DestReg, SrcReg))
+ Opc = Mips::FMOV_D64;
+ else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg.
+ if (Mips::GPR64RegClass.contains(SrcReg))
+ Opc = Mips::OR64, ZeroReg = Mips::ZERO_64;
+ else if (Mips::HI64RegClass.contains(SrcReg))
+ Opc = Mips::MFHI64, SrcReg = 0;
+ else if (Mips::LO64RegClass.contains(SrcReg))
+ Opc = Mips::MFLO64, SrcReg = 0;
+ else if (Mips::FGR64RegClass.contains(SrcReg))
+ Opc = Mips::DMFC1;
+ }
+ else if (Mips::GPR64RegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
+ if (Mips::HI64RegClass.contains(DestReg))
+ Opc = Mips::MTHI64, DestReg = 0;
+ else if (Mips::LO64RegClass.contains(DestReg))
+ Opc = Mips::MTLO64, DestReg = 0;
+ else if (Mips::FGR64RegClass.contains(DestReg))
+ Opc = Mips::DMTC1;
+ }
+ else if (Mips::MSA128BRegClass.contains(DestReg)) { // Copy to MSA reg
+ if (Mips::MSA128BRegClass.contains(SrcReg))
+ Opc = Mips::MOVE_V;
+ }
+
+ assert(Opc && "Cannot copy registers");
+
+ // Common path: add only the operands that are still non-zero.
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+ if (DestReg)
+ MIB.addReg(DestReg, RegState::Define);
+
+ if (SrcReg)
+ MIB.addReg(SrcReg, getKillRegState(KillSrc));
+
+ if (ZeroReg)
+ MIB.addReg(ZeroReg);
+}
+
+// Emit, before I in MBB, a store of SrcReg to frame index FI at the given
+// Offset.
+//
+// The store opcode is chosen from the register class RC (or, for the MSA
+// vector classes, from the value types RC supports). In functions carrying the
+// "interrupt" attribute, HI/LO values are first moved into K0/K0_64 and stored
+// from there. isKill is forwarded as the kill state of the stored register.
+// TRI is not used in this function.
+void MipsSEInstrInfo::
+storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
+ int64_t Offset) const {
+ // DL is left default-constructed: the emitted instructions carry no debug
+ // location.
+ DebugLoc DL;
+ MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+
+ unsigned Opc = 0;
+
+ if (Mips::GPR32RegClass.hasSubClassEq(RC))
+ Opc = Mips::SW;
+ else if (Mips::GPR64RegClass.hasSubClassEq(RC))
+ Opc = Mips::SD;
+ else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+ Opc = Mips::STORE_ACC64;
+ else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+ Opc = Mips::STORE_ACC64DSP;
+ else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+ Opc = Mips::STORE_ACC128;
+ else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
+ Opc = Mips::STORE_CCOND_DSP;
+ else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+ Opc = Mips::SWC1;
+ else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+ Opc = Mips::SDC1;
+ else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+ Opc = Mips::SDC164;
+ else if (RC->hasType(MVT::v16i8))
+ Opc = Mips::ST_B;
+ else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+ Opc = Mips::ST_H;
+ else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+ Opc = Mips::ST_W;
+ else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+ Opc = Mips::ST_D;
+ else if (Mips::LO32RegClass.hasSubClassEq(RC))
+ Opc = Mips::SW;
+ else if (Mips::LO64RegClass.hasSubClassEq(RC))
+ Opc = Mips::SD;
+ else if (Mips::HI32RegClass.hasSubClassEq(RC))
+ Opc = Mips::SW;
+ else if (Mips::HI64RegClass.hasSubClassEq(RC))
+ Opc = Mips::SD;
+
+ // Hi, Lo are normally caller save but they are callee save
+ // for interrupt handling.
+ const Function *Func = MBB.getParent()->getFunction();
+ if (Func->hasFnAttribute("interrupt")) {
+ // Read HI/LO into K0 (K0_64 for the 64-bit classes) and store that instead.
+ if (Mips::HI32RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Mips::MFHI), Mips::K0);
+ SrcReg = Mips::K0;
+ } else if (Mips::HI64RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Mips::MFHI64), Mips::K0_64);
+ SrcReg = Mips::K0_64;
+ } else if (Mips::LO32RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Mips::MFLO), Mips::K0);
+ SrcReg = Mips::K0;
+ } else if (Mips::LO64RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Mips::MFLO64), Mips::K0_64);
+ SrcReg = Mips::K0_64;
+ }
+ }
+
+ assert(Opc && "Register class not handled!");
+ BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+}
+
+// Emit, before I in MBB, a load of DestReg from frame index FI at the given
+// Offset.
+//
+// The load opcode is chosen from the register class RC (or, for the MSA vector
+// classes, from the value types RC supports). In functions carrying the
+// "interrupt" attribute, a reload of LO0/HI0 (or their _64 variants) is done
+// indirectly: the value is loaded into K0/K0_64 and then moved with MTLO/MTHI.
+// TRI is not used in this function.
+void MipsSEInstrInfo::
+loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI, int64_t Offset) const {
+ // Take the debug location of the insertion point when there is one.
+ DebugLoc DL;
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+ unsigned Opc = 0;
+
+ const Function *Func = MBB.getParent()->getFunction();
+ bool ReqIndirectLoad = Func->hasFnAttribute("interrupt") &&
+ (DestReg == Mips::LO0 || DestReg == Mips::LO0_64 ||
+ DestReg == Mips::HI0 || DestReg == Mips::HI0_64);
+
+ if (Mips::GPR32RegClass.hasSubClassEq(RC))
+ Opc = Mips::LW;
+ else if (Mips::GPR64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LD;
+ else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LOAD_ACC64;
+ else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+ Opc = Mips::LOAD_ACC64DSP;
+ else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+ Opc = Mips::LOAD_ACC128;
+ else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
+ Opc = Mips::LOAD_CCOND_DSP;
+ else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+ Opc = Mips::LWC1;
+ else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LDC1;
+ else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LDC164;
+ else if (RC->hasType(MVT::v16i8))
+ Opc = Mips::LD_B;
+ else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+ Opc = Mips::LD_H;
+ else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+ Opc = Mips::LD_W;
+ else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+ Opc = Mips::LD_D;
+ else if (Mips::HI32RegClass.hasSubClassEq(RC))
+ Opc = Mips::LW;
+ else if (Mips::HI64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LD;
+ else if (Mips::LO32RegClass.hasSubClassEq(RC))
+ Opc = Mips::LW;
+ else if (Mips::LO64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LD;
+
+ assert(Opc && "Register class not handled!");
+
+ if (!ReqIndirectLoad)
+ BuildMI(MBB, I, DL, get(Opc), DestReg)
+ .addFrameIndex(FI)
+ .addImm(Offset)
+ .addMemOperand(MMO);
+ else {
+ // Load HI/LO through K0. Notably the DestReg is encoded into the
+ // instruction itself.
+ unsigned Reg = Mips::K0;
+ unsigned LdOp = Mips::MTLO;
+ if (DestReg == Mips::HI0)
+ LdOp = Mips::MTHI;
+
+ // With 64-bit pointers, use K0_64 and the 64-bit move opcodes instead.
+ if (Subtarget.getABI().ArePtrs64bit()) {
+ Reg = Mips::K0_64;
+ if (DestReg == Mips::HI0_64)
+ LdOp = Mips::MTHI64;
+ else
+ LdOp = Mips::MTLO64;
+ }
+
+ BuildMI(MBB, I, DL, get(Opc), Reg)
+ .addFrameIndex(FI)
+ .addImm(Offset)
+ .addMemOperand(MMO);
+ BuildMI(MBB, I, DL, get(LdOp)).addReg(Reg);
+ }
+}
+
+// Expand the pseudo instruction MI in place.
+//
+// Returns false, leaving MI untouched, when its opcode is not one of the
+// pseudos handled here. Otherwise the matching expand* helper is called, MI is
+// erased from its block and true is returned.
+bool MipsSEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ bool isMicroMips = Subtarget.inMicroMipsMode();
+ // Opc is only assigned in the cases that need a microMIPS/standard choice.
+ unsigned Opc;
+
+ switch (MI.getDesc().getOpcode()) {
+ default:
+ return false;
+ case Mips::RetRA:
+ expandRetRA(MBB, MI);
+ break;
+ case Mips::ERet:
+ expandERet(MBB, MI);
+ break;
+ case Mips::PseudoMFHI:
+ Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
+ expandPseudoMFHiLo(MBB, MI, Opc);
+ break;
+ case Mips::PseudoMFLO:
+ Opc = isMicroMips ? Mips::MFLO16_MM : Mips::MFLO;
+ expandPseudoMFHiLo(MBB, MI, Opc);
+ break;
+ case Mips::PseudoMFHI64:
+ expandPseudoMFHiLo(MBB, MI, Mips::MFHI64);
+ break;
+ case Mips::PseudoMFLO64:
+ expandPseudoMFHiLo(MBB, MI, Mips::MFLO64);
+ break;
+ case Mips::PseudoMTLOHI:
+ expandPseudoMTLoHi(MBB, MI, Mips::MTLO, Mips::MTHI, false);
+ break;
+ case Mips::PseudoMTLOHI64:
+ expandPseudoMTLoHi(MBB, MI, Mips::MTLO64, Mips::MTHI64, false);
+ break;
+ case Mips::PseudoMTLOHI_DSP:
+ expandPseudoMTLoHi(MBB, MI, Mips::MTLO_DSP, Mips::MTHI_DSP, true);
+ break;
+ case Mips::PseudoCVT_S_W:
+ expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
+ break;
+ case Mips::PseudoCVT_D32_W:
+ expandCvtFPInt(MBB, MI, Mips::CVT_D32_W, Mips::MTC1, false);
+ break;
+ case Mips::PseudoCVT_S_L:
+ expandCvtFPInt(MBB, MI, Mips::CVT_S_L, Mips::DMTC1, true);
+ break;
+ case Mips::PseudoCVT_D64_W:
+ expandCvtFPInt(MBB, MI, Mips::CVT_D64_W, Mips::MTC1, true);
+ break;
+ case Mips::PseudoCVT_D64_L:
+ expandCvtFPInt(MBB, MI, Mips::CVT_D64_L, Mips::DMTC1, true);
+ break;
+ case Mips::BuildPairF64:
+ expandBuildPairF64(MBB, MI, false);
+ break;
+ case Mips::BuildPairF64_64:
+ expandBuildPairF64(MBB, MI, true);
+ break;
+ case Mips::ExtractElementF64:
+ expandExtractElementF64(MBB, MI, false);
+ break;
+ case Mips::ExtractElementF64_64:
+ expandExtractElementF64(MBB, MI, true);
+ break;
+ case Mips::MIPSeh_return32:
+ case Mips::MIPSeh_return64:
+ expandEhReturn(MBB, MI);
+ break;
+ }
+
+ // The replacement instructions are in place; drop the pseudo itself.
+ MBB.erase(MI);
+ return true;
+}
+
+/// getOppositeBranchOpc - Return the inverse of the specified
+/// opcode, e.g. turning BEQ to BNE.
+///
+/// Only conditional branches have an inverse; any opcode without an entry
+/// here (including the unconditional B, J and BC) hits llvm_unreachable.
+unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
+  switch (Opc) {
+  default: llvm_unreachable("Illegal opcode!");
+  case Mips::BEQ: return Mips::BNE;
+  case Mips::BEQ_MM: return Mips::BNE_MM;
+  case Mips::BNE: return Mips::BEQ;
+  case Mips::BNE_MM: return Mips::BEQ_MM;
+  case Mips::BGTZ: return Mips::BLEZ;
+  case Mips::BGEZ: return Mips::BLTZ;
+  case Mips::BLTZ: return Mips::BGEZ;
+  case Mips::BLEZ: return Mips::BGTZ;
+  case Mips::BEQ64: return Mips::BNE64;
+  case Mips::BNE64: return Mips::BEQ64;
+  case Mips::BGTZ64: return Mips::BLEZ64;
+  case Mips::BGEZ64: return Mips::BLTZ64;
+  case Mips::BLTZ64: return Mips::BGEZ64;
+  case Mips::BLEZ64: return Mips::BGTZ64;
+  case Mips::BC1T: return Mips::BC1F;
+  case Mips::BC1F: return Mips::BC1T;
+  case Mips::BEQZC_MM: return Mips::BNEZC_MM;
+  case Mips::BNEZC_MM: return Mips::BEQZC_MM;
+  case Mips::BEQZC: return Mips::BNEZC;
+  case Mips::BNEZC: return Mips::BEQZC;
+  case Mips::BEQC: return Mips::BNEC;
+  case Mips::BNEC: return Mips::BEQC;
+  // The 32-bit compact compare-and-branch forms are reported as analyzable
+  // by getAnalyzableBrOpc, so they need an inverse here as well; the pairing
+  // mirrors the *64 entries below.
+  case Mips::BGEC: return Mips::BLTC;
+  case Mips::BGEUC: return Mips::BLTUC;
+  case Mips::BLTC: return Mips::BGEC;
+  case Mips::BLTUC: return Mips::BGEUC;
+  case Mips::BGTZC: return Mips::BLEZC;
+  case Mips::BGEZC: return Mips::BLTZC;
+  case Mips::BLTZC: return Mips::BGEZC;
+  case Mips::BLEZC: return Mips::BGTZC;
+  case Mips::BEQZC64: return Mips::BNEZC64;
+  case Mips::BNEZC64: return Mips::BEQZC64;
+  case Mips::BEQC64: return Mips::BNEC64;
+  case Mips::BNEC64: return Mips::BEQC64;
+  case Mips::BGEC64: return Mips::BLTC64;
+  case Mips::BGEUC64: return Mips::BLTUC64;
+  case Mips::BLTC64: return Mips::BGEC64;
+  case Mips::BLTUC64: return Mips::BGEUC64;
+  case Mips::BGTZC64: return Mips::BLEZC64;
+  case Mips::BGEZC64: return Mips::BLTZC64;
+  case Mips::BLTZC64: return Mips::BGEZC64;
+  case Mips::BLEZC64: return Mips::BGTZC64;
+  }
+}
+
+/// Adjust SP by Amount bytes, inserting the needed instructions before I.
+/// A zero Amount emits nothing.
+void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  MipsABIInfo ABI = Subtarget.getABI();
+  DebugLoc DL;
+  unsigned AddImmOpc = ABI.GetPtrAddiuOp();
+
+  if (Amount == 0)
+    return;
+
+  // An amount that fits a signed 16-bit immediate needs a single
+  // add-immediate: addi sp, sp, amount
+  if (isInt<16>(Amount)) {
+    BuildMI(MBB, I, DL, get(AddImmOpc), SP).addReg(SP).addImm(Amount);
+    return;
+  }
+
+  // Otherwise synthesize the magnitude of Amount inline in a register, then
+  // add it to or subtract it from sp depending on the sign.
+  unsigned ArithOpc = ABI.GetPtrAdduOp();
+  if (Amount < 0) {
+    ArithOpc = ABI.GetPtrSubuOp();
+    Amount = -Amount;
+  }
+
+  unsigned ImmReg = loadImmediate(Amount, MBB, I, DL, nullptr);
+  BuildMI(MBB, I, DL, get(ArithOpc), SP)
+      .addReg(SP)
+      .addReg(ImmReg, RegState::Kill);
+}
+
+/// Emit, before II in MBB, the instruction sequence computed by
+/// MipsAnalyzeImmediate for Imm, and return the virtual register that holds
+/// the result.
+///
+/// If NewImm is non-null, the analysis is asked for a sequence whose last
+/// instruction is an ADDiu; that last instruction is NOT emitted and its
+/// immediate operand is stored in *NewImm instead. With a null NewImm the
+/// whole sequence is emitted.
+///
+/// The register width (32 vs. 64 bit), LUi opcode and zero register are
+/// selected by whether the subtarget uses the N64 ABI.
+unsigned MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II,
+ const DebugLoc &DL,
+ unsigned *NewImm) const {
+ MipsAnalyzeImmediate AnalyzeImm;
+ const MipsSubtarget &STI = Subtarget;
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ unsigned Size = STI.isABI_N64() ? 64 : 32;
+ unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi;
+ unsigned ZEROReg = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+ const TargetRegisterClass *RC = STI.isABI_N64() ?
+ &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+ // True exactly when the caller supplied an out-parameter for the last
+ // immediate.
+ bool LastInstrIsADDiu = NewImm;
+
+ const MipsAnalyzeImmediate::InstSeq &Seq =
+ AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
+ MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+
+ assert(Seq.size() && (!LastInstrIsADDiu || (Seq.size() > 1)));
+
+ // The first instruction can be a LUi, which is different from other
+ // instructions (ADDiu, ORI and SLL) in that it does not have a register
+ // operand.
+ unsigned Reg = RegInfo.createVirtualRegister(RC);
+
+ if (Inst->Opc == LUi)
+ BuildMI(MBB, II, DL, get(LUi), Reg).addImm(SignExtend64<16>(Inst->ImmOpnd));
+ else
+ BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(ZEROReg)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+ // Build the remaining instructions in Seq.
+ // The bool is used as 0/1 here, so the loop stops one instruction early when
+ // the final ADDiu is being handed back through NewImm.
+ for (++Inst; Inst != Seq.end() - LastInstrIsADDiu; ++Inst)
+ BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(Reg, RegState::Kill)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+ // Inst now points at the skipped last instruction.
+ if (LastInstrIsADDiu)
+ *NewImm = Inst->ImmOpnd;
+
+ return Reg;
+}
+
+/// Return Opc unchanged if it is one of the branch opcodes listed below, and
+/// 0 for any other opcode.
+unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
+  switch (Opc) {
+  case Mips::BEQ:
+  case Mips::BEQ_MM:
+  case Mips::BNE:
+  case Mips::BNE_MM:
+  case Mips::BGTZ:
+  case Mips::BGEZ:
+  case Mips::BLTZ:
+  case Mips::BLEZ:
+  case Mips::BEQ64:
+  case Mips::BNE64:
+  case Mips::BGTZ64:
+  case Mips::BGEZ64:
+  case Mips::BLTZ64:
+  case Mips::BLEZ64:
+  case Mips::BC1T:
+  case Mips::BC1F:
+  case Mips::B:
+  case Mips::J:
+  case Mips::BEQZC_MM:
+  case Mips::BNEZC_MM:
+  case Mips::BEQC:
+  case Mips::BNEC:
+  case Mips::BLTC:
+  case Mips::BGEC:
+  case Mips::BLTUC:
+  case Mips::BGEUC:
+  case Mips::BGTZC:
+  case Mips::BLEZC:
+  case Mips::BGEZC:
+  case Mips::BLTZC:
+  case Mips::BEQZC:
+  case Mips::BNEZC:
+  case Mips::BEQZC64:
+  case Mips::BNEZC64:
+  case Mips::BEQC64:
+  case Mips::BNEC64:
+  case Mips::BGEC64:
+  case Mips::BGEUC64:
+  case Mips::BLTC64:
+  case Mips::BLTUC64:
+  case Mips::BGTZC64:
+  case Mips::BGEZC64:
+  case Mips::BLTZC64:
+  case Mips::BLEZC64:
+  case Mips::BC:
+    return Opc;
+  default:
+    return 0;
+  }
+}
+
+/// Insert a return pseudo before \p I that reads the return-address register.
+/// The PseudoReturn64/RA_64 pair is used when Subtarget.isGP64bit() holds,
+/// PseudoReturn/RA otherwise.
+void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const {
+  const bool Is64 = Subtarget.isGP64bit();
+  const unsigned RetOpc = Is64 ? Mips::PseudoReturn64 : Mips::PseudoReturn;
+  const unsigned RAReg = Is64 ? Mips::RA_64 : Mips::RA;
+
+  BuildMI(MBB, I, I->getDebugLoc(), get(RetOpc)).addReg(RAReg);
+}
+
+/// Insert an ERET instruction before \p I, carrying over I's debug location.
+void MipsSEInstrInfo::expandERet(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I) const {
+  const DebugLoc Loc = I->getDebugLoc();
+  const MCInstrDesc &ERetDesc = get(Mips::ERET);
+  BuildMI(MBB, I, Loc, ERetDesc);
+}
+
+/// Compare the register-class sizes of the destination (operand 0) and the
+/// source (operand 1) of the two-operand opcode \p Opc.
+///
+/// \returns a pair {destination is larger, source is larger}; both members
+/// are false when the two sizes are equal.
+std::pair<bool, bool>
+MipsSEInstrInfo::compareOpndSize(unsigned Opc,
+                                 const MachineFunction &MF) const {
+  const MCInstrDesc &Desc = get(Opc);
+  assert(Desc.NumOperands == 2 && "Unary instruction expected.");
+
+  const MipsRegisterInfo &RegInfo = getRegisterInfo();
+  const unsigned SizeOfDst = getRegClass(Desc, 0, &RegInfo, MF)->getSize();
+  const unsigned SizeOfSrc = getRegClass(Desc, 1, &RegInfo, MF)->getSize();
+
+  const bool DstIsLarger = SizeOfDst > SizeOfSrc;
+  const bool SrcIsLarger = SizeOfDst < SizeOfSrc;
+  return std::pair<bool, bool>(DstIsLarger, SrcIsLarger);
+}
+
+/// Insert an instruction with opcode \p NewOpc before \p I whose destination
+/// is the register held in operand 0 of the instruction at \p I.
+void MipsSEInstrInfo::expandPseudoMFHiLo(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned NewOpc) const {
+  const unsigned DstReg = I->getOperand(0).getReg();
+  const DebugLoc Loc = I->getDebugLoc();
+  BuildMI(MBB, I, Loc, get(NewOpc), DstReg);
+}
+
+/// Split a pseudo "move to lo/hi" into one \p LoOpc and one \p HiOpc
+/// instruction, both inserted before \p I (lo first).
+///
+/// Operands 1 and 2 of the pseudo are the lo and hi sources; their kill flags
+/// are carried over. When \p HasExplicitDef is set, the sub_lo/sub_hi
+/// subregisters of operand 0 are added as explicit defs of the new
+/// instructions.
+void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned LoOpc,
+                                         unsigned HiOpc,
+                                         bool HasExplicitDef) const {
+  // The pseudo
+  //    lo_hi pseudomtlohi $gpr0, $gpr1
+  // is replaced by the pair
+  //    mtlo $gpr0
+  //    mthi $gpr1
+
+  const DebugLoc Loc = I->getDebugLoc();
+  const MachineOperand &LoSrc = I->getOperand(1);
+  const MachineOperand &HiSrc = I->getOperand(2);
+
+  // Create both instructions up front so they are ordered lo, hi before I.
+  MachineInstrBuilder MTLo = BuildMI(MBB, I, Loc, get(LoOpc));
+  MachineInstrBuilder MTHi = BuildMI(MBB, I, Loc, get(HiOpc));
+
+  // Defs must precede uses in the operand list, so add them first when the
+  // created mtlo/mthi instructions have explicit def registers.
+  if (HasExplicitDef) {
+    const MipsRegisterInfo &RegInfo = getRegisterInfo();
+    const unsigned AccReg = I->getOperand(0).getReg();
+    const unsigned AccLo = RegInfo.getSubReg(AccReg, Mips::sub_lo);
+    const unsigned AccHi = RegInfo.getSubReg(AccReg, Mips::sub_hi);
+    MTLo.addReg(AccLo, RegState::Define);
+    MTHi.addReg(AccHi, RegState::Define);
+  }
+
+  MTLo.addReg(LoSrc.getReg(), getKillRegState(LoSrc.isKill()));
+  MTHi.addReg(HiSrc.getReg(), getKillRegState(HiSrc.isKill()));
+}
+
+/// Expand a pseudo conversion at \p I into a \p MovOpc instruction followed by
+/// a \p CvtOpc instruction, both inserted before \p I.
+///
+/// The move writes a temporary register that the conversion then reads (and
+/// kills). Which register plays which role depends on how the operand sizes of
+/// \p CvtOpc compare (see compareOpndSize). \p IsI64 is not consulted here.
+void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     unsigned CvtOpc, unsigned MovOpc,
+                                     bool IsI64) const {
+  const MachineOperand &DstOp = I->getOperand(0);
+  const MachineOperand &SrcOp = I->getOperand(1);
+  const DebugLoc Loc = I->getDebugLoc();
+
+  const std::pair<bool, bool> SizeCmp =
+      compareOpndSize(CvtOpc, *MBB.getParent());
+  const bool DstIsLarger = SizeCmp.first;
+  const bool SrcIsLarger = SizeCmp.second;
+
+  // By default the move and the conversion both target the pseudo's
+  // destination register.
+  unsigned CvtDst = DstOp.getReg();
+  unsigned MovDst = CvtDst;
+
+  // Conversion result is wider than its input: stage the moved value in the
+  // low subregister of the destination.
+  if (DstIsLarger)
+    MovDst = getRegisterInfo().getSubReg(CvtDst, Mips::sub_lo);
+
+  // Conversion input is wider than its result: the result lands in the low
+  // subregister of the destination.
+  if (SrcIsLarger)
+    CvtDst = getRegisterInfo().getSubReg(CvtDst, Mips::sub_lo);
+
+  BuildMI(MBB, I, Loc, get(MovOpc), MovDst)
+      .addReg(SrcOp.getReg(), getKillRegState(SrcOp.isKill()));
+  BuildMI(MBB, I, Loc, get(CvtOpc), CvtDst).addReg(MovDst, RegState::Kill);
+}
+
+/// Expand the pseudo at \p I that extracts one 32-bit half of a 64-bit FP
+/// register: operand 0 is the destination, operand 1 the source and operand 2
+/// selects the half (0 = sub_lo, non-zero = sub_hi).
+///
+/// The high half is read with MFHC1 (D64 or D32 form per \p FP64) when the
+/// subtarget has MTHC1; every other case uses MFC1 on the selected
+/// subregister.
+void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator I,
+                                              bool FP64) const {
+  const unsigned DstReg = I->getOperand(0).getReg();
+  const unsigned SrcReg = I->getOperand(1).getReg();
+  const unsigned N = I->getOperand(2).getImm();
+  const DebugLoc Loc = I->getDebugLoc();
+
+  assert(N < 2 && "Invalid immediate");
+  const unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo;
+  const unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
+
+  // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2()));
+
+  // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg()));
+
+  // Plain case: a single MFC1 from the selected 32-bit subregister.
+  if (SubIdx != Mips::sub_hi || !Subtarget.hasMTHC1()) {
+    BuildMI(MBB, I, Loc, get(Mips::MFC1), DstReg).addReg(SubReg);
+    return;
+  }
+
+  // FIXME: Strictly speaking MFHC1 only reads the top 32-bits however, we
+  //        claim to read the whole 64-bits as part of a white lie used to
+  //        temporarily work around a widespread bug in the -mfp64 support.
+  //        The problem is that none of the 32-bit fpu ops mention the fact
+  //        that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
+  //        requires a major overhaul of the FPU implementation which can't
+  //        be done right now due to time constraints.
+  //        MFHC1 is one of two instructions that are affected since they are
+  //        the only instructions that don't read the lower 32-bits.
+  //        We therefore pretend that it reads the bottom 32-bits to
+  //        artificially create a dependency and prevent the scheduler
+  //        changing the behaviour of the code.
+  const unsigned MfhcOpc = FP64 ? Mips::MFHC1_D64 : Mips::MFHC1_D32;
+  BuildMI(MBB, I, Loc, get(MfhcOpc), DstReg).addReg(SrcReg);
+}
+
+/// Expand the pseudo at \p I that assembles a 64-bit FP register (operand 0)
+/// from a low word (operand 1) and a high word (operand 2).
+///
+/// The low word is always written with MTC1 into the sub_lo subregister. The
+/// high word goes through MTHC1 (D64 or D32 form per \p FP64) when available,
+/// and through a second MTC1 into sub_hi otherwise.
+void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         bool FP64) const {
+  const unsigned DstReg = I->getOperand(0).getReg();
+  const unsigned LoReg = I->getOperand(1).getReg();
+  const unsigned HiReg = I->getOperand(2).getReg();
+  const MCInstrDesc &Mtc1Desc = get(Mips::MTC1);
+  const DebugLoc Loc = I->getDebugLoc();
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+
+  // When mthc1 is available, use:
+  //   mtc1 Lo, $fp
+  //   mthc1 Hi, $fp
+  //
+  // Otherwise, for O32 FPXX ABI:
+  //   spill + reload via ldc1
+  // This case is handled by the frame lowering code.
+  //
+  // Otherwise, for FP32:
+  //   mtc1 Lo, $fp
+  //   mtc1 Hi, $fp + 1
+  //
+  // The case where dmtc1 is available doesn't need to be handled here
+  // because it never creates a BuildPairF64 node.
+
+  // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2()));
+
+  // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg()));
+
+  // Low word first, in every configuration.
+  BuildMI(MBB, I, Loc, Mtc1Desc, TRI.getSubReg(DstReg, Mips::sub_lo))
+      .addReg(LoReg);
+
+  if (!Subtarget.hasMTHC1()) {
+    if (Subtarget.isABI_FPXX())
+      llvm_unreachable("BuildPairF64 not expanded in frame lowering code!");
+
+    // FP32: the high word is a second mtc1 into the sub_hi subregister.
+    BuildMI(MBB, I, Loc, Mtc1Desc, TRI.getSubReg(DstReg, Mips::sub_hi))
+        .addReg(HiReg);
+    return;
+  }
+
+  // FIXME: The .addReg(DstReg) is a white lie used to temporarily work
+  //        around a widespread bug in the -mfp64 support.
+  //        The problem is that none of the 32-bit fpu ops mention the fact
+  //        that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
+  //        requires a major overhaul of the FPU implementation which can't
+  //        be done right now due to time constraints.
+  //        MTHC1 is one of two instructions that are affected since they are
+  //        the only instructions that don't read the lower 32-bits.
+  //        We therefore pretend that it reads the bottom 32-bits to
+  //        artificially create a dependency and prevent the scheduler
+  //        changing the behaviour of the code.
+  const unsigned MthcOpc = FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32;
+  BuildMI(MBB, I, Loc, get(MthcOpc), DstReg)
+      .addReg(DstReg)
+      .addReg(HiReg);
+}
+
+/// Expand the pseudo generated when lowering ISD::EH_RETURN. Operand 0 of the
+/// pseudo is the stack offset register and operand 1 the target register.
+///
+/// Emitted before \p I, in order:
+///   addu $t9, TargetReg, $zero   (position-independent code only)
+///   addu $ra, TargetReg, $zero
+///   addu $sp, $sp, OffsetReg
+///   return through $ra (via expandRetRA)
+/// The 64-bit register names are used when Subtarget.isGP64bit() holds, and
+/// the add opcode is whatever the ABI reports from GetPtrAdduOp().
+void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  MipsABIInfo ABIInfo = Subtarget.getABI();
+  const unsigned AdduOpc = ABIInfo.GetPtrAdduOp();
+
+  const bool Is64 = Subtarget.isGP64bit();
+  const unsigned SPReg = Is64 ? Mips::SP_64 : Mips::SP;
+  const unsigned RAReg = Is64 ? Mips::RA_64 : Mips::RA;
+  const unsigned T9Reg = Is64 ? Mips::T9_64 : Mips::T9;
+  const unsigned ZeroReg = Is64 ? Mips::ZERO_64 : Mips::ZERO;
+
+  const unsigned OffsetReg = I->getOperand(0).getReg();
+  const unsigned TargetReg = I->getOperand(1).getReg();
+
+  // Under PIC the target address is additionally copied into $t9.
+  if (MBB.getParent()->getTarget().isPositionIndependent())
+    BuildMI(MBB, I, I->getDebugLoc(), get(AdduOpc), T9Reg)
+        .addReg(TargetReg)
+        .addReg(ZeroReg);
+
+  // $ra <- target, then bump $sp by the requested offset and return.
+  BuildMI(MBB, I, I->getDebugLoc(), get(AdduOpc), RAReg)
+      .addReg(TargetReg)
+      .addReg(ZeroReg);
+  BuildMI(MBB, I, I->getDebugLoc(), get(AdduOpc), SPReg)
+      .addReg(SPReg)
+      .addReg(OffsetReg);
+  expandRetRA(MBB, I);
+}
+
+/// Factory: allocate a MipsSEInstrInfo for \p STI with `new` and return it
+/// through the MipsInstrInfo base pointer.
+const MipsInstrInfo *llvm::createMipsSEInstrInfo(const MipsSubtarget &STI) {
+  const MipsInstrInfo *Info = new MipsSEInstrInfo(STI);
+  return Info;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
new file mode 100644
index 000000000000..b356909bf1cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
@@ -0,0 +1,119 @@
+//===-- MipsSEInstrInfo.h - Mips32/64 Instruction Information ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEINSTRINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEINSTRINFO_H
+
+#include "MipsInstrInfo.h"
+#include "MipsSERegisterInfo.h"
+
+namespace llvm {
+
+class MipsSEInstrInfo : public MipsInstrInfo { // Mips32/64 flavour of MipsInstrInfo (see file header).
+ const MipsSERegisterInfo RI; // NOTE(review): presumably what getRegisterInfo() returns - confirm in the .cpp
+
+public:
+ explicit MipsSEInstrInfo(const MipsSubtarget &STI);
+
+ const MipsRegisterInfo &getRegisterInfo() const override;
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source reg along with the FrameIndex of the stack slot stored to. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ int64_t Offset) const override;
+
+ void loadRegFromStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ int64_t Offset) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ unsigned getOppositeBranchOpc(unsigned Opc) const override;
+
+ /// Adjust SP by Amount bytes.
+ void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+ /// Emit a series of instructions to load an immediate. If NewImm is a
+ /// non-NULL parameter, the last instruction is not emitted, but instead
+ /// its immediate operand is returned in NewImm. The return value is the
+ unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB, // register that receives the emitted sequence's result.
+ MachineBasicBlock::iterator II, const DebugLoc &DL,
+ unsigned *NewImm) const;
+
+private:
+ unsigned getAnalyzableBrOpc(unsigned Opc) const override; // Opc itself for the recognised branch/jump opcodes, 0 otherwise
+
+ void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; // PseudoReturn(64) reading RA(_64)
+
+ void expandERet(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; // inserts a single ERET before I
+
+ std::pair<bool, bool> compareOpndSize(unsigned Opc, // result: {dst reg class larger, src reg class larger}
+ const MachineFunction &MF) const;
+
+ void expandPseudoMFHiLo(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned NewOpc) const; // NewOpc defines the pseudo's operand 0
+
+ void expandPseudoMTLoHi(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned LoOpc, unsigned HiOpc,
+ bool HasExplicitDef) const; // HasExplicitDef: add sub_lo/sub_hi of operand 0 as defs
+
+ /// Expand pseudo Int-to-FP conversion instructions.
+ ///
+ /// For example, the following pseudo instruction
+ /// PseudoCVT_D32_W D2, A5
+ /// gets expanded into these two instructions:
+ /// MTC1 F4, A5
+ /// CVT_D32_W D2, F4
+ ///
+ /// We do this expansion post-RA to avoid inserting a floating point copy
+ /// instruction between MTC1 and CVT_D32_W.
+ void expandCvtFPInt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned CvtOpc, unsigned MovOpc, bool IsI64) const; // NOTE: IsI64 is not read by the definition in MipsSEInstrInfo.cpp
+
+ void expandExtractElementF64(MachineBasicBlock &MBB, // FP64 picks the _D64 over the _D32 MFHC1 form
+ MachineBasicBlock::iterator I, bool FP64) const;
+ void expandBuildPairF64(MachineBasicBlock &MBB, // FP64 picks the _D64 over the _D32 MTHC1 form
+ MachineBasicBlock::iterator I, bool FP64) const;
+ void expandEhReturn(MachineBasicBlock &MBB, // expansion of the pseudo produced for ISD::EH_RETURN
+ MachineBasicBlock::iterator I) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
new file mode 100644
index 000000000000..86bd24166bb6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -0,0 +1,260 @@
+//===-- MipsSERegisterInfo.cpp - MIPS32/64 Register Information -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSERegisterInfo.h"
+#include "Mips.h"
+#include "MipsMachineFunction.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-reg-info"
+
+/// Default constructor: no state of its own, only the MipsRegisterInfo base
+/// is initialised.
+MipsSERegisterInfo::MipsSERegisterInfo()
+    : MipsRegisterInfo() {
+}
+
+/// Register scavenging is requested unconditionally; \p MF is not inspected.
+bool MipsSERegisterInfo::requiresRegisterScavenging(
+    const MachineFunction &MF) const {
+  const bool NeedsScavenging = true;
+  return NeedsScavenging;
+}
+
+/// Frame-index scavenging is requested unconditionally; \p MF is not
+/// inspected.
+bool MipsSERegisterInfo::requiresFrameIndexScavenging(
+    const MachineFunction &MF) const {
+  const bool NeedsScavenging = true;
+  return NeedsScavenging;
+}
+
+/// Map an integer size to a general-purpose register class: 4 selects GPR32,
+/// anything else is asserted to be 8 and selects GPR64.
+const TargetRegisterClass *
+MipsSERegisterInfo::intRegClass(unsigned Size) const {
+  if (Size != 4) {
+    assert(Size == 8);
+    return &Mips::GPR64RegClass;
+  }
+
+  return &Mips::GPR32RegClass;
+}
+
+/// Report how many bits of offset the given load/store/inline asm can take.
+/// Any scale factor the instruction applies to its immediate is already
+/// counted in the returned width.
+///
+/// \p MO is only examined for Mips::INLINEASM, where its immediate carries the
+/// memory constraint ID.
+static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode,
+                                                    MachineOperand MO) {
+  switch (Opcode) {
+  case Mips::LD_B:
+  case Mips::ST_B:
+    return 10;
+
+  case Mips::LD_H:
+  case Mips::ST_H:
+    return 10 + 1 /* scale factor */;
+
+  case Mips::LD_W:
+  case Mips::ST_W:
+    return 10 + 2 /* scale factor */;
+
+  case Mips::LD_D:
+  case Mips::ST_D:
+    return 10 + 3 /* scale factor */;
+
+  case Mips::LL:
+  case Mips::LL64:
+  case Mips::LLD:
+  case Mips::LLE:
+  case Mips::SC:
+  case Mips::SC64:
+  case Mips::SCD:
+  case Mips::SCE:
+    return 16;
+
+  case Mips::LLE_MM:
+  case Mips::LLE_MMR6:
+  case Mips::LL_MM:
+  case Mips::SCE_MM:
+  case Mips::SCE_MMR6:
+  case Mips::SC_MM:
+    return 12;
+
+  case Mips::LL64_R6:
+  case Mips::LL_R6:
+  case Mips::LLD_R6:
+  case Mips::SC64_R6:
+  case Mips::SCD_R6:
+  case Mips::SC_R6:
+    return 9;
+
+  case Mips::INLINEASM:
+    // Resolved below from the memory constraint and the subtarget.
+    break;
+
+  default:
+    return 16;
+  }
+
+  // Inline asm: every constraint other than ZC gets the full 16 bits.
+  if (InlineAsm::getMemoryConstraintID(MO.getImm()) !=
+      InlineAsm::Constraint_ZC)
+    return 16;
+
+  // ZC: the width follows the subtarget of the function containing MO.
+  const MipsSubtarget &STI = MO.getParent()
+                                 ->getParent()
+                                 ->getParent()
+                                 ->getSubtarget<MipsSubtarget>();
+  if (STI.inMicroMipsMode())
+    return 12;
+
+  return STI.hasMips32r6() ? 9 : 16;
+}
+
+/// Return the scale factor the given load/store applies to its immediate:
+/// 2 for LD_H/ST_H, 4 for LD_W/ST_W, 8 for LD_D/ST_D and 1 for every other
+/// opcode.
+static inline unsigned getLoadStoreOffsetAlign(const unsigned Opcode) {
+  if (Opcode == Mips::LD_H || Opcode == Mips::ST_H)
+    return 2;
+
+  if (Opcode == Mips::LD_W || Opcode == Mips::ST_W)
+    return 4;
+
+  if (Opcode == Mips::LD_D || Opcode == Mips::ST_D)
+    return 8;
+
+  return 1;
+}
+
+void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+ unsigned OpNo, int FrameIndex,
+ uint64_t StackSize,
+ int64_t SPOffset) const { // Rewrites operands (OpNo, OpNo + 1) of *II into a base register plus immediate offset.
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ MipsABIInfo ABI =
+ static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI();
+ const MipsRegisterInfo *RegInfo =
+ static_cast<const MipsRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ int MinCSFI = 0; // [0, -1] is an empty range, so nothing matches when CSI is empty
+ int MaxCSFI = -1;
+
+ if (CSI.size()) {
+ MinCSFI = CSI[0].getFrameIdx(); // assumes first/last CSI entries bound the callee-saved frame indices - TODO confirm
+ MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+ }
+
+ bool EhDataRegFI = MipsFI->isEhDataRegFI(FrameIndex);
+ bool IsISRRegFI = MipsFI->isISRRegFI(FrameIndex);
+ // The following stack frame objects are always referenced relative to $sp:
+ // 1. Outgoing arguments.
+ // 2. Pointer to dynamically allocated stack space.
+ // 3. Locations for callee-saved registers.
+ // 4. Locations for eh data registers.
+ // 5. Locations for ISR saved Coprocessor 0 registers 12 & 14.
+ // Everything else is referenced relative to whatever register
+ // getFrameRegister() returns.
+ unsigned FrameReg;
+
+ if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI ||
+ IsISRRegFI)
+ FrameReg = ABI.GetStackPtr();
+ else if (RegInfo->needsStackRealignment(MF)) { // realigned stack: the base register depends on the object kind
+ if (MFI.hasVarSizedObjects() && !MFI.isFixedObjectIndex(FrameIndex))
+ FrameReg = ABI.GetBasePtr(); // non-fixed object in a function with variable-sized objects
+ else if (MFI.isFixedObjectIndex(FrameIndex))
+ FrameReg = getFrameRegister(MF);
+ else
+ FrameReg = ABI.GetStackPtr();
+ } else
+ FrameReg = getFrameRegister(MF);
+
+ // Calculate final offset.
+ // - There is no need to change the offset if the frame object is one of the
+ // following: an outgoing argument, pointer to a dynamically allocated
+ // stack space or a $gp restore location,
+ // - If the frame object is any of the following, its offset must be adjusted
+ // by adding the size of the stack:
+ // incoming argument, callee-saved register location or local variable.
+ bool IsKill = false; // set once FrameReg is replaced by a freshly created temporary register
+ int64_t Offset;
+
+ Offset = SPOffset + (int64_t)StackSize; // NOTE(review): StackSize is added unconditionally here; the case split described above is not made in this function - confirm against the caller
+ Offset += MI.getOperand(OpNo + 1).getImm(); // plus the immediate already present on the instruction
+
+ DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+
+ if (!MI.isDebugValue()) {
+ // Make sure Offset fits within the field available.
+ // For MSA loads/stores this is a 10-bit signed immediate (scaled by the
+ // element size); other opcodes get 9, 12 or 16 bits (see getLoadStoreOffsetSizeInBits).
+ unsigned OffsetBitSize =
+ getLoadStoreOffsetSizeInBits(MI.getOpcode(), MI.getOperand(OpNo - 1)); // operand OpNo - 1 is only read for INLINEASM
+ unsigned OffsetAlign = getLoadStoreOffsetAlign(MI.getOpcode());
+
+ if (OffsetBitSize < 16 && isInt<16>(Offset) &&
+ (!isIntN(OffsetBitSize, Offset) ||
+ OffsetToAlignment(Offset, OffsetAlign) != 0)) { // too wide for the field, or not a multiple of the scale
+ // If we have an offset that needs to fit into a signed n-bit immediate
+ // (where n < 16) and doesn't, but does fit into 16-bits then use an ADDiu
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = II->getDebugLoc();
+ const TargetRegisterClass *PtrRC =
+ ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); // NOTE: shadows the outer RegInfo (a MipsRegisterInfo pointer) inside this branch
+ unsigned Reg = RegInfo.createVirtualRegister(PtrRC);
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo *>(
+ MBB.getParent()->getSubtarget().getInstrInfo());
+ BuildMI(MBB, II, DL, TII.get(ABI.GetPtrAddiuOp()), Reg)
+ .addReg(FrameReg)
+ .addImm(Offset); // the whole offset is folded into this add, leaving 0 for the memory operand
+
+ FrameReg = Reg;
+ Offset = 0;
+ IsKill = true;
+ } else if (!isInt<16>(Offset)) {
+ // Otherwise split the offset into 16-bit pieces and add it in multiple
+ // instructions.
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = II->getDebugLoc();
+ unsigned NewImm = 0; // stays 0 unless loadImmediate is asked to hand back the final immediate
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo *>(
+ MBB.getParent()->getSubtarget().getInstrInfo());
+ unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL,
+ OffsetBitSize == 16 ? &NewImm : nullptr); // the final immediate is only requested when the field is a full 16 bits
+ BuildMI(MBB, II, DL, TII.get(ABI.GetPtrAdduOp()), Reg).addReg(FrameReg)
+ .addReg(Reg, RegState::Kill);
+
+ FrameReg = Reg;
+ Offset = SignExtend64<16>(NewImm); // residual immediate kept on the instruction (0 when NewImm was not requested)
+ IsKill = true;
+ }
+ }
+
+ MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill); // frame-index operand becomes the chosen base register
+ MI.getOperand(OpNo + 1).ChangeToImmediate(Offset); // following operand becomes the final immediate
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h
new file mode 100644
index 000000000000..ebae1909d233
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h
@@ -0,0 +1,41 @@
+//===-- MipsSERegisterInfo.h - Mips32/64 Register Information ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEREGISTERINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEREGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+class MipsSEInstrInfo;
+
+class MipsSERegisterInfo : public MipsRegisterInfo { // Mips32/64 flavour of MipsRegisterInfo (see file header).
+public:
+ MipsSERegisterInfo();
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override; // defined in the .cpp to always return true
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; // defined in the .cpp to always return true
+
+ const TargetRegisterClass *intRegClass(unsigned Size) const override; // Size 4 -> GPR32; otherwise asserted to be 8 -> GPR64
+
+private:
+ void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo, // rewrites operands OpNo / OpNo + 1 of *II to register + immediate
+ int FrameIndex, uint64_t StackSize,
+ int64_t SPOffset) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
new file mode 100644
index 000000000000..c0de59ba15f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
@@ -0,0 +1,674 @@
+//===-- MipsSchedule.td - Mips Scheduling Definitions ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across Mips chip sets. Based on GCC/Mips backend files.
+//===----------------------------------------------------------------------===//
+def ALU : FuncUnit;
+def IMULDIV : FuncUnit;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for Mips
+//===----------------------------------------------------------------------===//
+// IIM16Alu is a placeholder class for most MIPS16 instructions.
+def IIM16Alu : InstrItinClass;
+def IIPseudo : InstrItinClass;
+
+def II_ABS : InstrItinClass;
+def II_ADDI : InstrItinClass;
+def II_ADDIU : InstrItinClass;
+def II_ADDIUPC : InstrItinClass;
+def II_ADD : InstrItinClass;
+def II_ADDU : InstrItinClass;
+def II_ADD_D : InstrItinClass;
+def II_ADD_S : InstrItinClass;
+def II_ALIGN : InstrItinClass;
+def II_AND : InstrItinClass;
+def II_ANDI : InstrItinClass;
+def II_ALUIPC : InstrItinClass;
+def II_AUI : InstrItinClass;
+def II_AUIPC : InstrItinClass;
+def II_B : InstrItinClass;
+def II_BADDU : InstrItinClass;
+def II_BBIT : InstrItinClass; // bbit[01], bbit[01]32
+def II_BALC : InstrItinClass;
+def II_BC : InstrItinClass;
+def II_BC1F : InstrItinClass;
+def II_BC1FL : InstrItinClass;
+def II_BC1T : InstrItinClass;
+def II_BC1TL : InstrItinClass;
+def II_BC1CCZ : InstrItinClass;
+def II_BC2CCZ : InstrItinClass;
+def II_BCC : InstrItinClass; // beq and bne
+def II_BCCZ : InstrItinClass; // b[gl][et]z
+def II_BCCC : InstrItinClass; // b<cc>c
+def II_BCCZAL : InstrItinClass; // bgezal and bltzal
+def II_BCCZALS : InstrItinClass; // bgezals and bltzals
+def II_BCCZC : InstrItinClass; // beqzc, bnezc
+def II_BITSWAP : InstrItinClass;
+def II_CEIL : InstrItinClass;
+def II_CFC1 : InstrItinClass;
+def II_CFC2 : InstrItinClass;
+def II_CLO : InstrItinClass;
+def II_CLZ : InstrItinClass;
+def II_CTC1 : InstrItinClass;
+def II_CTC2 : InstrItinClass;
+def II_CVT : InstrItinClass;
+def II_C_CC_D : InstrItinClass; // Any c.<cc>.d instruction
+def II_C_CC_S : InstrItinClass; // Any c.<cc>.s instruction
+def II_CMP_CC_D : InstrItinClass; // Any cmp.<cc>.d instruction
+def II_CMP_CC_S : InstrItinClass; // Any cmp.<cc>.s instruction
+def II_CLASS_D : InstrItinClass;
+def II_CLASS_S : InstrItinClass;
+def II_DADDIU : InstrItinClass;
+def II_DADDU : InstrItinClass;
+def II_DADDI : InstrItinClass;
+def II_DADD : InstrItinClass;
+def II_DAHI : InstrItinClass;
+def II_DATI : InstrItinClass;
+def II_DAUI : InstrItinClass;
+def II_DALIGN : InstrItinClass;
+def II_DBITSWAP : InstrItinClass;
+def II_DCLO : InstrItinClass;
+def II_DCLZ : InstrItinClass;
+def II_DDIV : InstrItinClass;
+def II_DDIVU : InstrItinClass;
+def II_DIV : InstrItinClass;
+def II_DIVU : InstrItinClass;
+def II_DIV_D : InstrItinClass;
+def II_DIV_S : InstrItinClass;
+def II_DMFC0 : InstrItinClass;
+def II_DMTC0 : InstrItinClass;
+def II_DMFC1 : InstrItinClass;
+def II_DMTC1 : InstrItinClass;
+def II_DMOD : InstrItinClass;
+def II_DMODU : InstrItinClass;
+def II_DMUH : InstrItinClass;
+def II_DMUHU : InstrItinClass;
+def II_DMFC2 : InstrItinClass;
+def II_DMTC2 : InstrItinClass;
+def II_DMUL : InstrItinClass;
+def II_DMULU : InstrItinClass;
+def II_DMULT : InstrItinClass;
+def II_DMULTU : InstrItinClass;
+def II_DROTR : InstrItinClass;
+def II_DROTR32 : InstrItinClass;
+def II_DROTRV : InstrItinClass;
+def II_DSLL : InstrItinClass;
+def II_DSLL32 : InstrItinClass;
+def II_DSLLV : InstrItinClass;
+def II_DSRA : InstrItinClass;
+def II_DSRA32 : InstrItinClass;
+def II_DSRAV : InstrItinClass;
+def II_DSRL : InstrItinClass;
+def II_DSRL32 : InstrItinClass;
+def II_DSRLV : InstrItinClass;
+def II_DSBH : InstrItinClass;
+def II_DSHD : InstrItinClass;
+def II_DSUBU : InstrItinClass;
+def II_DSUB : InstrItinClass;
+def II_EXT : InstrItinClass; // Any EXT instruction
+def II_FLOOR : InstrItinClass;
+def II_INS : InstrItinClass; // Any INS instruction
+def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo.
+def II_J : InstrItinClass;
+def II_JAL : InstrItinClass;
+def II_JALR : InstrItinClass;
+def II_JALR_HB : InstrItinClass;
+def II_JALRC : InstrItinClass;
+def II_JALRS : InstrItinClass;
+def II_JALS : InstrItinClass;
+def II_JIC : InstrItinClass;
+def II_JIALC : InstrItinClass;
+def II_JR : InstrItinClass;
+def II_JR_HB : InstrItinClass;
+def II_JRADDIUSP : InstrItinClass;
+def II_JRC : InstrItinClass;
+def II_ReturnPseudo : InstrItinClass; // Return pseudo.
+def II_ERET : InstrItinClass;
+def II_DERET : InstrItinClass;
+def II_ERETNC : InstrItinClass;
+def II_EHB : InstrItinClass;
+def II_SDBBP : InstrItinClass;
+def II_SSNOP : InstrItinClass;
+def II_SYSCALL : InstrItinClass;
+def II_PAUSE : InstrItinClass;
+def II_WAIT : InstrItinClass;
+def II_EI : InstrItinClass;
+def II_DI : InstrItinClass;
+def II_TEQ : InstrItinClass;
+def II_TEQI : InstrItinClass;
+def II_TGE : InstrItinClass;
+def II_TGEI : InstrItinClass;
+def II_TGEIU : InstrItinClass;
+def II_TGEU : InstrItinClass;
+def II_TNE : InstrItinClass;
+def II_TNEI : InstrItinClass;
+def II_TLT : InstrItinClass;
+def II_TLTI : InstrItinClass;
+def II_TLTU : InstrItinClass;
+def II_TTLTIU : InstrItinClass;
+def II_TLBP : InstrItinClass;
+def II_TLBR : InstrItinClass;
+def II_TLBWI : InstrItinClass;
+def II_TLBWR : InstrItinClass;
+def II_TRAP : InstrItinClass;
+def II_BREAK : InstrItinClass;
+def II_SYNC : InstrItinClass;
+def II_SYNCI : InstrItinClass;
+def II_LB : InstrItinClass;
+def II_LBE : InstrItinClass;
+def II_LBU : InstrItinClass;
+def II_LBUE : InstrItinClass;
+def II_LD : InstrItinClass;
+def II_LDC1 : InstrItinClass;
+def II_LDC2 : InstrItinClass;
+def II_LDC3 : InstrItinClass;
+def II_LDL : InstrItinClass;
+def II_LDR : InstrItinClass;
+def II_LDPC : InstrItinClass;
+def II_LDXC1 : InstrItinClass;
+def II_LH : InstrItinClass;
+def II_LHE : InstrItinClass;
+def II_LHU : InstrItinClass;
+def II_LHUE : InstrItinClass;
+def II_LL : InstrItinClass;
+def II_LI : InstrItinClass;
+def II_LLD : InstrItinClass;
+def II_LUI : InstrItinClass;
+def II_LUXC1 : InstrItinClass;
+def II_LW : InstrItinClass;
+def II_LWE : InstrItinClass;
+def II_LWC1 : InstrItinClass;
+def II_LWC2 : InstrItinClass;
+def II_LWC3 : InstrItinClass;
+def II_LWM : InstrItinClass;
+def II_LWL : InstrItinClass;
+def II_LWLE : InstrItinClass;
+def II_LWPC : InstrItinClass;
+def II_LWP : InstrItinClass;
+def II_LWR : InstrItinClass;
+def II_LWRE : InstrItinClass;
+def II_LWU : InstrItinClass;
+def II_LWUPC : InstrItinClass;
+def II_LWXC1 : InstrItinClass;
+def II_LWXS : InstrItinClass;
+def II_LSA : InstrItinClass;
+def II_DLSA : InstrItinClass;
+def II_MADD : InstrItinClass;
+def II_MADDU : InstrItinClass;
+def II_MADD_D : InstrItinClass;
+def II_MADD_S : InstrItinClass;
+def II_MADDF_D : InstrItinClass;
+def II_MADDF_S : InstrItinClass;
+def II_MAX_D : InstrItinClass;
+def II_MAX_S : InstrItinClass;
+def II_MAXA_D : InstrItinClass;
+def II_MAXA_S : InstrItinClass;
+def II_MIN_D : InstrItinClass;
+def II_MIN_S : InstrItinClass;
+def II_MINA_D : InstrItinClass;
+def II_MINA_S : InstrItinClass;
+def II_MFC0 : InstrItinClass;
+def II_MFHC0 : InstrItinClass;
+def II_MFC1 : InstrItinClass;
+def II_MFHC1 : InstrItinClass;
+def II_MFC2 : InstrItinClass;
+def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo
+def II_MOD : InstrItinClass;
+def II_MODU : InstrItinClass;
+def II_MOVE : InstrItinClass;
+def II_MOVF : InstrItinClass;
+def II_MOVF_D : InstrItinClass;
+def II_MOVF_S : InstrItinClass;
+def II_MOVN : InstrItinClass;
+def II_MOVN_D : InstrItinClass;
+def II_MOVN_S : InstrItinClass;
+def II_MOVT : InstrItinClass;
+def II_MOVT_D : InstrItinClass;
+def II_MOVT_S : InstrItinClass;
+def II_MOVZ : InstrItinClass;
+def II_MOVZ_D : InstrItinClass;
+def II_MOVZ_S : InstrItinClass;
+def II_MOV_D : InstrItinClass;
+def II_MOV_S : InstrItinClass;
+def II_MSUB : InstrItinClass;
+def II_MSUBU : InstrItinClass;
+def II_MSUB_D : InstrItinClass;
+def II_MSUB_S : InstrItinClass;
+def II_MSUBF_D : InstrItinClass;
+def II_MSUBF_S : InstrItinClass;
+def II_MTC0 : InstrItinClass;
+def II_MTHC0 : InstrItinClass;
+def II_MTC1 : InstrItinClass;
+def II_MTHC1 : InstrItinClass;
+def II_MTC2 : InstrItinClass;
+def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo
+def II_MUL : InstrItinClass;
+def II_MUH : InstrItinClass;
+def II_MUHU : InstrItinClass;
+def II_MULU : InstrItinClass;
+def II_MULT : InstrItinClass;
+def II_MULTU : InstrItinClass;
+def II_MUL_D : InstrItinClass;
+def II_MUL_S : InstrItinClass;
+def II_NEG : InstrItinClass;
+def II_NMADD_D : InstrItinClass;
+def II_NMADD_S : InstrItinClass;
+def II_NMSUB_D : InstrItinClass;
+def II_NMSUB_S : InstrItinClass;
+def II_NOR : InstrItinClass;
+def II_NOT : InstrItinClass;
+def II_OR : InstrItinClass;
+def II_ORI : InstrItinClass;
+def II_POP : InstrItinClass;
+def II_RDHWR : InstrItinClass;
+def II_RESTORE : InstrItinClass;
+def II_RECIP_S : InstrItinClass;
+def II_RECIP_D : InstrItinClass;
+def II_RINT_S : InstrItinClass;
+def II_RINT_D : InstrItinClass;
+def II_ROTR : InstrItinClass;
+def II_ROTRV : InstrItinClass;
+def II_ROUND : InstrItinClass;
+def II_RSQRT_S : InstrItinClass;
+def II_RSQRT_D : InstrItinClass;
+def II_SAVE : InstrItinClass;
+def II_SC : InstrItinClass;
+def II_SCD : InstrItinClass;
+def II_SB : InstrItinClass;
+def II_SBE : InstrItinClass;
+def II_SD : InstrItinClass;
+def II_SDC1 : InstrItinClass;
+def II_SDC2 : InstrItinClass;
+def II_SDC3 : InstrItinClass;
+def II_SDL : InstrItinClass;
+def II_SDR : InstrItinClass;
+def II_SDXC1 : InstrItinClass;
+def II_SEB : InstrItinClass;
+def II_SEH : InstrItinClass;
+def II_SELCCZ : InstrItinClass;
+def II_SELCCZ_D : InstrItinClass;
+def II_SELCCZ_S : InstrItinClass;
+def II_SEQ_SNE : InstrItinClass; // seq and sne
+def II_SEQI_SNEI : InstrItinClass; // seqi and snei
+def II_SH : InstrItinClass;
+def II_SHE : InstrItinClass;
+def II_SLL : InstrItinClass;
+def II_SLLV : InstrItinClass;
+def II_SLTI_SLTIU : InstrItinClass; // slti and sltiu
+def II_SLT_SLTU : InstrItinClass; // slt and sltu
+def II_SQRT_D : InstrItinClass;
+def II_SQRT_S : InstrItinClass;
+def II_SEL_D : InstrItinClass;
+def II_SEL_S : InstrItinClass;
+def II_SRA : InstrItinClass;
+def II_SRAV : InstrItinClass;
+def II_SRL : InstrItinClass;
+def II_SRLV : InstrItinClass;
+def II_SUB : InstrItinClass;
+def II_SUBU : InstrItinClass;
+def II_SUB_D : InstrItinClass;
+def II_SUB_S : InstrItinClass;
+def II_SUXC1 : InstrItinClass;
+def II_SW : InstrItinClass;
+def II_SWE : InstrItinClass;
+def II_SWC1 : InstrItinClass;
+def II_SWC2 : InstrItinClass;
+def II_SWC3 : InstrItinClass;
+def II_SWL : InstrItinClass;
+def II_SWLE : InstrItinClass;
+def II_SWM : InstrItinClass;
+def II_SWP : InstrItinClass;
+def II_SWR : InstrItinClass;
+def II_SWRE : InstrItinClass;
+def II_SWXC1 : InstrItinClass;
+def II_TRUNC : InstrItinClass;
+def II_WSBH : InstrItinClass;
+def II_XOR : InstrItinClass;
+def II_XORI : InstrItinClass;
+def II_CACHE : InstrItinClass;
+def II_PREF : InstrItinClass;
+def II_CACHEE : InstrItinClass;
+def II_PREFE : InstrItinClass;
+def II_LLE : InstrItinClass;
+def II_SCE : InstrItinClass;
+def II_TLBINV : InstrItinClass;
+def II_TLBINVF : InstrItinClass;
+def II_WRPGPR : InstrItinClass;
+def II_RDPGPR : InstrItinClass;
+def II_DVP : InstrItinClass;
+def II_EVP : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Mips Generic instruction itineraries.
+//===----------------------------------------------------------------------===//
+def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ // one InstrStage per class
+ InstrItinData<IIM16Alu , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ADDI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ADDIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ADDIUPC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ADD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ADDU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_AUI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_AND , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ALUIPC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_AUIPC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ALIGN , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BADDU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BITSWAP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SLL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SRA , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SRL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ROTR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SLLV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SRAV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SRLV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ROTRV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CLO , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CLZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DADDIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DADDU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DADDI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DADD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DALIGN , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DAHI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DATI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DAUI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DBITSWAP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DCLO , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DCLZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DMOD , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMODU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DSLL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSLL32 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRL32 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRA , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRA32 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSLLV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRLV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRAV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSUBU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSUB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DROTR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DROTR32 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DROTRV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSBH , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSHD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DCLO , [InstrStage<1, [ALU]>]>, // NOTE(review): repeats the II_DCLO entry above
+ InstrItinData<II_DCLZ , [InstrStage<1, [ALU]>]>, // NOTE(review): repeats the II_DCLZ entry above
+ InstrItinData<II_EXT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_INS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LUI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVF , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVN , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVN_S , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVN_D , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_NOR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_NOT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_OR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_POP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_RDHWR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SUB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SUBU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_XOR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ANDI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ORI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_XORI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LB , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LBE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LBU , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LBUE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LH , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LHU , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LHUE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LW , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWM , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWP , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWPC , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWL , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWLE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWR , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWRE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWUPC , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LD , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDL , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDR , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDPC , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LL , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LLD , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_RESTORE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_SB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SH , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SHE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SW , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWM , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SCD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SAVE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SELCCZ_S , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SELCCZ_D , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SEQ_SNE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SEQI_SNEI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SLTI_SLTIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SLT_SLTU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_B , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BALC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BBIT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1F , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1FL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1T , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1TL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1CCZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC2CCZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCZAL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCZALS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCZC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CLASS_D , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CLASS_S , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_IndirectBranchPseudo, [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_J , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JAL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALR_HB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALRC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALRS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JIC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JIALC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JR_HB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JRADDIUSP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JRC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ReturnPseudo , [InstrStage<1, [ALU]>]>,
+ InstrItinData<IIPseudo , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DMUH , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMUHU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_ERET , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DERET , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ERETNC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_EHB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDBBP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SSNOP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SYSCALL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_PAUSE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_WAIT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_EI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TEQ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TEQI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TGE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TGEI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TGEIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TGEU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TNE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TNEI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLTI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLTU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TTLTIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBWI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBWR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TRAP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BREAK , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SYNC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SYNCI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DMUL , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMULT , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMULTU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMULU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MADD , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MADDU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MFHI_MFLO , [InstrStage<1, [IMULDIV]>]>,
+ InstrItinData<II_MAX_D , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MAX_S , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MAXA_D , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MAXA_S , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MIN_S , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MIN_D , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MINA_S , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MINA_D , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MOD , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_MODU , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_MSUB , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MSUBU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MTHI_MTLO , [InstrStage<1, [IMULDIV]>]>,
+ InstrItinData<II_MUH , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MUHU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MUL , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MULT , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MULTU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MULU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MSUB , [InstrStage<17, [IMULDIV]>]>, // NOTE(review): repeats the II_MSUB entry above
+ InstrItinData<II_MSUBU , [InstrStage<17, [IMULDIV]>]>, // NOTE(review): repeats the II_MSUBU entry above
+ InstrItinData<II_DIV , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_DIVU , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_DDIV , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_DDIVU , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_CEIL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CVT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ABS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_FLOOR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_NEG , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ROUND , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TRUNC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOV_D , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOV_S , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_CFC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_CTC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_CFC2 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_CTC2 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVF_D , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVF_S , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVT_D , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVT_S , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVZ_D , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVZ_S , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_C_CC_S , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_C_CC_D , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_CMP_CC_S , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_CMP_CC_D , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_ADD_D , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_ADD_S , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_SUB_D , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_SUB_S , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MUL_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MADD_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MADDF_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MSUB_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MSUBF_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_NMADD_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_NMSUB_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MUL_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_MADD_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_MADDF_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_MSUB_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_MSUBF_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_NMADD_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_NMSUB_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_DIV_S , [InstrStage<23, [ALU]>]>,
+ InstrItinData<II_DIV_D , [InstrStage<36, [ALU]>]>,
+ InstrItinData<II_RECIP_D , [InstrStage<25, [ALU]>]>,
+ InstrItinData<II_RECIP_S , [InstrStage<13, [ALU]>]>,
+ InstrItinData<II_RSQRT_D , [InstrStage<29, [ALU]>]>,
+ InstrItinData<II_RSQRT_S , [InstrStage<14, [ALU]>]>,
+ InstrItinData<II_RINT_D , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_RINT_S , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SQRT_S , [InstrStage<54, [ALU]>]>, // NOTE(review): 54 here vs 12 for II_SQRT_D; possibly swapped, confirm
+ InstrItinData<II_SQRT_D , [InstrStage<12, [ALU]>]>, // NOTE(review): see II_SQRT_S above
+ InstrItinData<II_SEL_D , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SEL_S , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_WSBH , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LSA , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DLSA , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LDC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDC2 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDC3 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWC2 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWC3 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDXC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWXC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LUXC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWXS , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_SDC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDC2 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDC3 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWC2 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWC3 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDXC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWXC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SUXC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DMFC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMFC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMFC2 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMTC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMTC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMTC2 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFHC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFC2 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTHC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTC2 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_CACHE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_PREF , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CACHEE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_PREFE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBINV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBINVF , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LLE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_SCE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_WRPGPR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_RDPGPR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DVP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_EVP , [InstrStage<1, [ALU]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td b/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
new file mode 100644
index 000000000000..15a0401b781e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
@@ -0,0 +1,1048 @@
+//=- MipsScheduleGeneric.td - Generic Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the interAptiv processor in a manner of speaking. It
+// describes a hypothetical version of the in-order MIPS32R2 interAptiv with all
+// branches of the MIPS ISAs, ASEs and ISA variants. The itinerary lists are
+// broken down into per ISA lists, so that this file can be used to rapidly
+// develop new schedule models.
+//
+//===----------------------------------------------------------------------===//
+def MipsGenericModel : SchedMachineModel {
+ let IssueWidth = 1;
+ let MicroOpBufferSize = 0;
+
+ // The load latency and mispredict penalty are quoted for an L1 hit.
+ let LoadLatency = 2;
+ let MispredictPenalty = 4;
+
+ let HighLatency = 37;
+ let UnsupportedFeatures = [];
+
+ let CompleteModel = 1;
+ let PostRAScheduler = 1;
+}
+
+let SchedModel = MipsGenericModel in {
+
+// ALU Pipeline
+// ============
+
+def GenericALU : ProcResource<1> { let BufferSize = 1; }
+def GenericIssueALU : ProcResource<1> { let Super = GenericALU; }
+// Write type for ALU classes; it sets neither Latency nor ResourceCycles here.
+def GenericWriteALU : SchedWriteRes<[GenericIssueALU]>;
+
+// Integer itinerary classes mapped to GenericWriteALU, each listed once:
+// add, addi, addiu, addu, and, andi, clo, clz, ext, ins, lui, mult, multu, nor,
+// or, ori, rotrv?, se[bh], sllv?, sr[al]v?, slti?u?, ssnop, sub, subu, wsbh, xori?
+def : ItinRW<[GenericWriteALU], [II_ADD, II_ADDU, II_ADDI, II_ADDIU, II_ANDI,
+ II_AND, II_CLO, II_CLZ, II_EXT,
+ II_INS, II_LUI, II_MULT, II_MULTU, II_NOR,
+ II_ORI, II_OR, II_ROTR, II_ROTRV, II_SEB,
+ II_SEH, II_SLTI_SLTIU, II_SLT_SLTU, II_SLL,
+ II_SRA, II_SRL, II_SLLV, II_SRAV, II_SRLV,
+ II_SSNOP, II_SUB, II_SUBU, II_WSBH, II_XOR,
+ II_XORI]>;
+
+def : InstRW<[GenericWriteALU], (instrs COPY)>;
+// NOTE(review): GenericIssueMDU's Super is GenericALU, not GenericMDU; confirm.
+def GenericMDU : ProcResource<1> { let BufferSize = 1; }
+def GenericIssueMDU : ProcResource<1> { let Super = GenericALU; }
+def GenericIssueDIV : ProcResource<1> { let Super = GenericMDU; }
+def GenericWriteHILO : SchedWriteRes<[GenericIssueMDU]>;
+def GenericWriteALULong : SchedWriteRes<[GenericIssueALU]> { let Latency = 5; }
+def GenericWriteMove : SchedWriteRes<[GenericIssueALU]> { let Latency = 2; }
+
+def : ItinRW<[GenericWriteHILO], [II_MADD, II_MADDU, II_MSUB, II_MSUBU]>;
+// Write used for II_MUL below: issues on GenericIssueMDU with Latency 5.
+def GenericWriteMDUtoGPR : SchedWriteRes<[GenericIssueMDU]> {
+ let Latency = 5;
+}
+
+def : ItinRW<[GenericWriteMDUtoGPR], [II_MUL]>;
+
+def GenericWriteDIV : SchedWriteRes<[GenericIssueDIV]> {
+ // Estimated worst case. ResourceCycles pairs one entry with each resource.
+ let Latency = 33;
+ let ResourceCycles = [33];
+}
+def GenericWriteDIVU : SchedWriteRes<[GenericIssueDIV]> {
+ // Estimated worst case. ResourceCycles pairs one entry with each resource.
+ let Latency = 31;
+ let ResourceCycles = [31];
+}
+
+def : ItinRW<[GenericWriteDIV], [II_DIV]>;
+
+def : ItinRW<[GenericWriteDIVU], [II_DIVU]>;
+
+// MIPS64
+// ======
+// These itinerary classes reuse the ALU, DIV and MDU write types defined above.
+def : ItinRW<[GenericWriteALU], [II_DADDIU, II_DADDU, II_DADDI, II_DADD,
+ II_DCLO, II_DCLZ, II_DROTR, II_DROTR32,
+ II_DROTRV, II_DSBH, II_DSHD, II_DSLL,
+ II_DSLL32, II_DSLLV, II_DSRA, II_DSRA32,
+ II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV,
+ II_DSUBU, II_DSUB]>;
+
+def : ItinRW<[GenericWriteDIV], [II_DDIV]>;
+
+def : ItinRW<[GenericWriteDIVU], [II_DDIVU]>;
+
+def : ItinRW<[GenericWriteMDUtoGPR], [II_DMUL]>;
+
+def : ItinRW<[GenericWriteHILO], [II_DMULU, II_DMULT, II_DMULTU]>;
+
+// MIPS16e
+// =======
+
+def : ItinRW<[GenericWriteALU], [IIM16Alu, IIPseudo]>;
+
+// microMIPS
+// =========
+
+def : ItinRW<[GenericWriteALU], [II_MOVE, II_LI, II_NOT]>;
+
+// MIPSR6
+// ======
+// GenericWriteMul issues on GenericIssueMDU with Latency 4.
+def GenericWriteMul : SchedWriteRes<[GenericIssueMDU]> { let Latency = 4; }
+def : ItinRW<[GenericWriteMul], [II_MUH, II_MUHU, II_MULU]>;
+
+def : ItinRW<[GenericWriteDIV], [II_MOD, II_MODU]>;
+
+def : ItinRW<[GenericWriteALU], [II_ADDIUPC, II_ALIGN, II_ALUIPC, II_AUI,
+ II_AUIPC, II_BITSWAP, II_LSA, II_SELCCZ]>;
+
+// MIPS64R6
+// ========
+
+def : ItinRW<[GenericWriteALU], [II_DALIGN, II_DAHI, II_DATI, II_DAUI,
+ II_DBITSWAP, II_DLSA]>;
+
+def : ItinRW<[GenericWriteMDUtoGPR], [II_DMUH, II_DMUHU]>;
+def : ItinRW<[GenericWriteDIV], [II_DMOD, II_DMODU]>;
+
+// mfhi, mflo (clo/clz use GenericWriteALU above; di uses GenericWriteCOP0)
+def : ItinRW<[GenericWriteALULong], [II_MFHI_MFLO]>;
+def : ItinRW<[GenericWriteALU], [II_MOVN, II_MOVZ]>;
+def : ItinRW<[GenericWriteMove], [II_MTHI_MTLO, II_RDHWR]>;
+
+
+// CTISTD Pipeline
+// ---------------
+
+def GenericIssueCTISTD : ProcResource<1> { let Super = GenericALU; }
+// The load/store resources are declared here, ahead of their write types.
+def GenericLDST : ProcResource<1> { let BufferSize = 1; }
+def GenericIssueLDST : ProcResource<1> { let Super = GenericLDST; }
+
+def GenericWriteJump : SchedWriteRes<[GenericIssueCTISTD]>;
+def GenericWriteJumpAndLink : SchedWriteRes<[GenericIssueCTISTD]> {
+ let Latency = 2;
+}
+
+// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal, jalx,
+// jalr, jr.hb, jr, jalr.hb, jalrc, jialc
+def : ItinRW<[GenericWriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J,
+ II_JR, II_JR_HB, II_ERET, II_ERETNC,
+ II_DERET]>;
+
+def : ItinRW<[GenericWriteJumpAndLink], [II_JAL, II_JALR, II_JALR_HB,
+ II_BC2CCZ]>;
+
+def : ItinRW<[GenericWriteJump], [II_JRC, II_JRADDIUSP]>;
+
+def : ItinRW<[GenericWriteJumpAndLink], [II_BCCZALS, II_JALS, II_JALRS]>;
+
+// MIPSR6
+// ======
+
+def : ItinRW<[GenericWriteJumpAndLink], [II_BALC, II_JALRC, II_JIALC]>;
+
+def : ItinRW<[GenericWriteJump], [II_JIC, II_BC, II_BCCC, II_BCCZC]>;
+
+// Traps issue on the same GenericIssueCTISTD resource as the jumps above.
+def GenericWriteTrap : SchedWriteRes<[GenericIssueCTISTD]>;
+
+def : ItinRW<[GenericWriteTrap], [II_BREAK, II_SYSCALL, II_TEQ, II_TEQI,
+ II_TGE, II_TGEI, II_TGEIU, II_TGEU, II_TNE,
+ II_TNEI, II_TLT, II_TLTI, II_TLTU, II_TTLTIU,
+ II_TRAP, II_SDBBP]>;
+
+// COP0 Pipeline
+// =============
+
+def GenericCOP0 : ProcResource<1> { let BufferSize = 1; }
+
+def GenericIssueCOP0 : ProcResource<1> { let Super = GenericCOP0; }
+def GenericWriteCOP0TLB : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 4; }
+def GenericWriteCOP0 : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 3; }
+def GenericReadCOP0 : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 2; }
+def GnereicReadWritePGPR : SchedWriteRes<[GenericIssueCOP0]>;
+// NOTE(review): GnereicReadWritePGPR is misspelt and unused in the lines shown.
+def : ItinRW<[GenericWriteCOP0TLB], [II_TLBP, II_TLBR, II_TLBWI, II_TLBWR]>;
+def : ItinRW<[GenericWriteCOP0TLB], [II_TLBINV, II_TLBINVF]>;
+
+def : ItinRW<[GenericReadCOP0], [II_MFC0]>;
+def : ItinRW<[GenericWriteCOP0], [II_MTC0]>;
+
+def : ItinRW<[GenericWriteCOP0], [II_EVP, II_DVP]>;
+
+// MIPSR5
+// ======
+def : ItinRW<[GenericReadCOP0], [II_MFHC0]>;
+def : ItinRW<[GenericWriteCOP0], [II_MTHC0]>;
+
+// MIPS64
+// ======
+
+def : ItinRW<[GenericReadCOP0], [II_DMFC0]>;
+def : ItinRW<[GenericWriteCOP0], [II_DMTC0]>;
+
+def : ItinRW<[GenericWriteCOP0], [II_RDPGPR, II_WRPGPR]>;
+
+def : ItinRW<[GenericWriteCOP0], [II_DI, II_EI]>;
+
+def : ItinRW<[GenericWriteCOP0], [II_EHB, II_PAUSE, II_WAIT]>;
+// COP2 moves use their own resource, which has no Super.
+def GenericCOP2 : ProcResource<1> { let BufferSize = 1; }
+def GenericWriteCOPOther : SchedWriteRes<[GenericCOP2]>;
+
+def : ItinRW<[GenericWriteCOPOther], [II_MFC2, II_MTC2, II_DMFC2, II_DMTC2]>;
+
+// LDST Pipeline
+// -------------
+// Every write type in this group issues on GenericIssueLDST.
+def GenericWriteLoad : SchedWriteRes<[GenericIssueLDST]> {
+ let Latency = 2;
+}
+
+def GenericWritePref : SchedWriteRes<[GenericIssueLDST]>;
+def GenericWriteSync : SchedWriteRes<[GenericIssueLDST]>;
+def GenericWriteCache : SchedWriteRes<[GenericIssueLDST]> { let Latency = 5; }
+
+def GenericWriteStore : SchedWriteRes<[GenericIssueLDST]>;
+def GenericWriteStoreSC : SchedWriteRes<[GenericIssueLDST]> { let Latency = 2; }
+
+def GenericWriteGPRFromBypass : SchedWriteRes<[GenericIssueLDST]> {
+ let Latency = 2;
+}
+
+def GenericWriteStoreFromOtherUnits : SchedWriteRes<[GenericIssueLDST]>;
+def GenericWriteLoadToOtherUnits : SchedWriteRes<[GenericIssueLDST]> {
+ let Latency = 0;
+}
+
+// l[bhw], l[bh]u, ll, l[dw]c[23]
+def : ItinRW<[GenericWriteLoad], [II_LB, II_LBU, II_LH, II_LHU, II_LW, II_LL,
+ II_LWC2, II_LWC3, II_LDC2, II_LDC3]>;
+
+// lw[lr]
+def : ItinRW<[GenericWriteLoad], [II_LWL, II_LWR]>;
+
+// MIPS64 loads
+def : ItinRW<[GenericWriteLoad], [II_LD, II_LLD, II_LWU]>;
+
+// ld[lr]
+def : ItinRW<[GenericWriteLoad], [II_LDL, II_LDR]>;
+
+// MIPS32 EVA
+def : ItinRW<[GenericWriteLoad], [II_LBE, II_LBUE, II_LHE, II_LHUE, II_LWE,
+ II_LLE]>;
+
+def : ItinRW<[GenericWriteLoad], [II_LWLE, II_LWRE]>;
+
+// MIPS32R6 and MIPS16e
+// ====================
+
+def : ItinRW<[GenericWriteLoad], [II_LWPC]>;
+
+// MIPS64R6
+// ====================
+
+def : ItinRW<[GenericWriteLoad], [II_LWUPC, II_LDPC]>;
+
+
+// s[bhw], s[dw]c[23]
+def : ItinRW<[GenericWriteStore], [II_SB, II_SH, II_SW, II_SWC2, II_SWC3,
+ II_SDC2, II_SDC3]>;
+// sc (uses the Latency 2 GenericWriteStoreSC write)
+def : ItinRW<[GenericWriteStoreSC], [II_SC]>;
+
+// PreMIPSR6 sw[lr]
+def : ItinRW<[GenericWriteStore], [II_SWL, II_SWR]>;
+
+// EVA ASE stores
+def : ItinRW<[GenericWriteStore], [II_SBE, II_SHE, II_SWE, II_SCE]>;
+
+def : ItinRW<[GenericWriteStore], [II_SWLE, II_SWRE]>;
+
+// MIPS64
+// ======
+
+def : ItinRW<[GenericWriteStore], [II_SD, II_SCD]>;
+
+// PreMIPSR6 stores
+// ================
+
+def : ItinRW<[GenericWriteStore], [II_SDL, II_SDR]>;
+
+// MIPS16e
+// =======
+
+def : ItinRW<[GenericWriteLoad], [II_RESTORE]>;
+
+def : ItinRW<[GenericWriteStore], [II_SAVE]>;
+
+// microMIPS
+// =========
+
+def : ItinRW<[GenericWriteLoad], [II_LWM, II_LWP, II_LWXS]>;
+
+def : ItinRW<[GenericWriteStore], [II_SWM, II_SWP]>;
+
+// pref
+def : ItinRW<[GenericWritePref], [II_PREF]>;
+
+def : ItinRW<[GenericWritePref], [II_PREFE]>;
+
+// cache
+def : ItinRW<[GenericWriteCache], [II_CACHE]>;
+
+def : ItinRW<[GenericWriteCache], [II_CACHEE]>;
+
+// sync
+def : ItinRW<[GenericWriteSync], [II_SYNC]>;
+
+def : ItinRW<[GenericWriteSync], [II_SYNCI]>;
+
+// FPU Pipelines
+// =============
+// Six FPU resources follow, each naming GenericFPQ as its Super.
+def GenericFPQ : ProcResource<1> { let BufferSize = 1; }
+def GenericIssueFPUS : ProcResource<1> { let Super = GenericFPQ; }
+def GenericIssueFPUL : ProcResource<1> { let Super = GenericFPQ; }
+def GenericIssueFPULoad : ProcResource<1> { let Super = GenericFPQ; }
+def GenericIssueFPUStore : ProcResource<1> { let Super = GenericFPQ; }
+def GenericIssueFPUMove : ProcResource<1> { let Super = GenericFPQ; }
+def GenericFPUDivSqrt : ProcResource<1> { let Super = GenericFPQ; }
+
+// The floating point compare of the 24k series including interAptiv has a
+// listed latency of 1-2. Using the higher latency here.
+
+def GenericWriteFPUCmp : SchedWriteRes<[GenericIssueFPUS]> { let Latency = 2; }
+def GenericWriteFPUS : SchedWriteRes<[GenericIssueFPUS]> { let Latency = 4; }
+def GenericWriteFPUL : SchedWriteRes<[GenericIssueFPUL]> { let Latency = 5; }
+def GenericWriteFPUStore : SchedWriteRes<[GenericIssueFPUStore]> { let
+ Latency = 1;
+}
+def GenericWriteFPULoad : SchedWriteRes<[GenericIssueFPULoad]> {
+ let Latency = 2;
+}
+def GenericWriteFPUMoveFP : SchedWriteRes<[GenericIssueFPUMove]> {
+ let Latency = 4;
+}
+def GenericWriteFPUMoveGPRFPU : SchedWriteRes<[GenericIssueFPUMove]> {
+ let Latency = 2;
+}
+def GenericWriteFPUDivS : SchedWriteRes<[GenericFPUDivSqrt]> {
+ let Latency = 17;
+ let ResourceCycles = [ 14 ];
+}
+def GenericWriteFPUDivD : SchedWriteRes<[GenericFPUDivSqrt]> {
+ let Latency = 32;
+ let ResourceCycles = [ 29 ];
+}
+def GenericWriteFPURcpS : SchedWriteRes<[GenericFPUDivSqrt]> {
+ let Latency = 13;
+ let ResourceCycles = [ 10 ];
+}
+def GenericWriteFPURcpD : SchedWriteRes<[GenericFPUDivSqrt]> {
+ let Latency = 25;
+ let ResourceCycles = [ 21 ];
+}
+def GenericWriteFPURsqrtS : SchedWriteRes<[GenericFPUDivSqrt]> {
+ let Latency = 17;
+ let ResourceCycles = [ 14 ];
+}
+def GenericWriteFPURsqrtD : SchedWriteRes<[GenericFPUDivSqrt]> {
+ let Latency = 32;
+ let ResourceCycles = [ 29 ];
+}
+def GenericWriteFPUSqrtS : SchedWriteRes<[GenericFPUDivSqrt]> {
+ let Latency = 17;
+ let ResourceCycles = [ 14 ];
+}
+def GenericWriteFPUSqrtD : SchedWriteRes<[GenericFPUDivSqrt]> {
+ let Latency = 29; // NOTE(review): equals ResourceCycles, unlike the defs above; confirm
+ let ResourceCycles = [ 29 ];
+}
+
+// Floating point compare and branch
+// ---------------------------------
+//
+// c.<cc>.[ds], bc1[tf], bc1[tf]l
+def : ItinRW<[GenericWriteFPUCmp], [II_C_CC_D, II_C_CC_S, II_BC1F, II_BC1T,
+ II_BC1FL, II_BC1TL]>;
+
+def : ItinRW<[GenericWriteFPUCmp], [II_CMP_CC_D, II_CMP_CC_S]>;
+
+// Short Pipe
+// ----------
+//
+// abs.[ds], abs.ps, add.[ds], neg.[ds], neg.ps, madd.s, msub.s, nmadd.s,
+// nmsub.s, sub.[ds], mul.s
+
+def : ItinRW<[GenericWriteFPUS], [II_ABS, II_ADD_D, II_ADD_S, II_MADD_S,
+ II_MSUB_S, II_MUL_S, II_NEG, II_NMADD_S,
+ II_NMSUB_S, II_SUB_S, II_SUB_D]>;
+// mov[tf].[ds]
+
+def : ItinRW<[GenericWriteFPUS], [II_MOVF_S, II_MOVF_D, II_MOVT_S, II_MOVT_D]>;
+
+// MIPSR6
+// ------
+//
+// sel(eq|ne).[ds], max.[ds], maxa.[ds], min.[ds], mina.[ds], class.[ds]
+def : ItinRW<[GenericWriteFPUS], [II_SELCCZ_S, II_SELCCZ_D, II_MAX_S,
+ II_MAX_D, II_MAXA_S, II_MAXA_D, II_MIN_S,
+ II_MIN_D, II_MINA_S, II_MINA_D, II_CLASS_S,
+ II_CLASS_D]>;
+
+// Long Pipe
+// ----------
+//
+// nmadd.d, nmsub.d, mul.[ds], mul.ps, ceil.[wl].[sd], cvt.d.[sw], cvt.s.[dw],
+// cvt.w.[sd], cvt.[sw].ps, trunc.w.[ds], trunc.w.ps, floor.[ds],
+// round.[lw].[ds], floor.[lw].ds
+
+// madd.d, msub.d, mul.d, mul.ps, nmadd.d, nmsub.d, ceil.[wl].[sd], cvt.d.[sw],
+// cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps, round.[lw].[ds], floor.[lw].ds,
+// trunc.w.[ds], trunc.w.ps,
+def : ItinRW<[GenericWriteFPUL], [II_MADD_D, II_MSUB_D, II_MUL_D, II_NMADD_D,
+ II_NMSUB_D, II_CEIL, II_CVT,
+ II_FLOOR, II_ROUND, II_TRUNC]>;
+
+// div.[ds], div.ps
+def : ItinRW<[GenericWriteFPUDivS], [II_DIV_S]>;
+def : ItinRW<[GenericWriteFPUDivD], [II_DIV_D]>;
+
+// sqrt.[ds], sqrt.ps
+def : ItinRW<[GenericWriteFPUSqrtS], [II_SQRT_S]>;
+def : ItinRW<[GenericWriteFPUSqrtD], [II_SQRT_D]>;
+
+// rsqrt.[ds], recip.[ds]
+// NOTE(review): rsqrt is given the recip writes (13/10 and 25/21 cycles);
+// GenericWriteFPURsqrtS/D (17/14 and 32/29) are defined earlier in this model
+// but are not used for II_RSQRT_* here - confirm which is intended.
+def : ItinRW<[GenericWriteFPURcpS], [II_RECIP_S, II_RSQRT_S]>;
+def : ItinRW<[GenericWriteFPURcpD], [II_RECIP_D, II_RSQRT_D]>;
+
+// MIPSR6
+// ======
+//
+// rint.[ds]
+def : ItinRW<[GenericWriteFPUL], [II_RINT_S, II_RINT_D]>;
+
+// Load Pipe
+// ---------
+
+// ctc1, mtc1, mthc1, cfc1, mfc1, mfhc1
+def : ItinRW<[GenericWriteFPUMoveGPRFPU], [II_CFC1, II_CTC1, II_MFC1, II_MFHC1,
+ II_MTC1, II_MTHC1]>;
+
+// s[dw]c1, s[dw]xc1, suxc1
+def : ItinRW<[GenericWriteFPUStore], [II_SDC1, II_SDXC1, II_SUXC1, II_SWC1,
+ II_SWXC1]>;
+
+// mov.[ds], movf, movt, movn.[ds], movz.[ds]
+def : ItinRW<[GenericWriteFPUMoveFP], [II_MOV_D, II_MOV_S, II_MOVF, II_MOVT,
+ II_MOVN_D, II_MOVN_S, II_MOVZ_D,
+ II_MOVZ_S]>;
+
+// l[dw]x?c1, luxc1
+def : ItinRW<[GenericWriteFPULoad], [II_LDC1, II_LDXC1, II_LUXC1, II_LWC1,
+ II_LWXC1]>;
+
+// MIPS64
+// ======
+
+def : ItinRW<[GenericWriteFPUMoveGPRFPU], [II_DMFC1, II_DMTC1]>;
+
+// MIPSR6
+// ======
+
+def : ItinRW<[GenericWriteFPUS], [II_MADDF_S, II_MSUBF_S]>;
+
+def : ItinRW<[GenericWriteFPUS], [II_MADDF_D, II_MSUBF_D]>;
+
+def : ItinRW<[GenericWriteFPUCmp], [II_BC1CCZ, II_SEL_D, II_SEL_S]>;
+
+// Cavium Networks MIPS (cnMIPS) - Octeon, HasCnMips
+// =================================================
+
+def : ItinRW<[GenericWriteALU], [II_SEQ_SNE, II_SEQI_SNEI, II_POP, II_BADDU,
+ II_BBIT]>;
+
+// MIPS DSP ASE, HasDSP
+// ====================
+
+// All DSP writes below are bound to the single GenericDSP resource
+// (BufferSize = 1) and differ only in the latency they report.
+def GenericDSP : ProcResource<1> { let BufferSize = 1; }
+def GenericDSPShort : SchedWriteRes<[GenericDSP]> { let Latency = 2; }
+def GenericDSPLong : SchedWriteRes<[GenericDSP]> { let Latency = 6; }
+// NOTE(review): GenericDSPBypass and GenericDSPLoad are not referenced by any
+// InstRW in the rest of this model as visible here (lbux/lhx/lwx are mapped
+// to GenericDSPShort) - confirm whether they are intentionally unused.
+def GenericDSPBypass : SchedWriteRes<[GenericDSP]> { let Latency = 1; }
+def GenericDSPMTHILO : SchedWriteRes<[GenericDSP]> { let Latency = 5; }
+def GenericDSPLoad : SchedWriteRes<[GenericDSP]> { let Latency = 4; }
+def GenericDSPMTHLIP : SchedWriteRes<[GenericDSP]> { let Latency = 5; }
+
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_RS_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_R_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_S_H$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_RS_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_R_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_S_H$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^INSV$")>;
+
+def : InstRW<[GenericDSPMTHLIP], (instregex "^MTHLIP$")>;
+def : InstRW<[GenericDSPMTHILO], (instregex "^MTHI_DSP$")>;
+def : InstRW<[GenericDSPMTHILO], (instregex "^MTLO_DSP$")>;
+
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDSC$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_S_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDWC$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BITREV$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BPOSGE32$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_EQ_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_LE_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_LT_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_EQ_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_LE_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_LT_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_EQ_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_LE_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_LT_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQ_SA_L_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQ_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAU_H_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAU_H_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQ_SA_L_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQ_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSU_H_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSU_H_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPDPV$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPDP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPV$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LBUX$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LHX$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LWX$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MADDU_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MADD_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_SA_W_PHL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_SA_W_PHR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MFHI_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MFLO_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MODSUB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MSUBU_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MSUB_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEQ_S_W_PHL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEQ_S_W_PHR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEU_S_PH_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEU_S_PH_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_RS_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULSAQ_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULTU_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULT_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PACKRL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PICK_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PICK_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBLA$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBRA$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQ_W_PHL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQ_W_PHR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBLA$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBRA$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQU_S_QB_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_PH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_QB_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_RS_PH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^RADDU_W_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^RDDSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPLV_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPLV_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPL_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHILOV$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHILO$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRLV_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRL_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^WRDSP$")>;
+
+// MIPS DSP R2 - hasDSP, HasDSPR2, InMicroMips
+// ===========================================
+
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_R_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_R_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDUH_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDUH_R_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^APPEND$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BALIGN$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_EQ_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_LE_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_LT_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPA_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQX_SA_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQX_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAX_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPS_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQX_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQX_SA_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSX_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MUL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MUL_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_RS_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULSA_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_QB_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_SRA_PH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_SRA_R_PH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PREPEND$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRLV_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_R_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_R_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBUH_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBUH_R_QB$")>;
+
+// microMIPS DSP R1 - HasDSP, InMicroMips
+// ======================================
+
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDSC_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_S_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDWC_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BITREV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BPOSGE32_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_EQ_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_LE_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_LT_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_EQ_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_LE_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_LT_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_EQ_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_LE_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_LT_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQ_SA_L_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQ_S_W_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAU_H_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAU_H_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQ_SA_L_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQ_S_W_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSU_H_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSU_H_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPDPV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPDP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTP_MM$")>;
+// Use the same write as the standard-encoding EXTR*/INSV entries of this
+// model (GenericDSPLong); the microMIPS encoding is the same operation.
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_RS_W_MM$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_R_W_MM$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_S_H_MM$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_W_MM$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_RS_W_MM$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_R_W_MM$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_S_H_MM$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_W_MM$")>;
+def : InstRW<[GenericDSPLong], (instregex "^INSV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LBUX_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LHX_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LWX_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MADDU_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MADD_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_SA_W_PHL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_SA_W_PHR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MFHI_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MFLO_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MODSUB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MOVN_I_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MOVZ_I_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MSUBU_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MSUB_DSP_MM$")>;
+// Use the same writes as the standard-encoding MTHI_DSP/MTHLIP/MTLO_DSP
+// entries of this model rather than the generic short DSP write.
+def : InstRW<[GenericDSPMTHILO], (instregex "^MTHI_DSP_MM$")>;
+def : InstRW<[GenericDSPMTHLIP], (instregex "^MTHLIP_MM$")>;
+def : InstRW<[GenericDSPMTHILO], (instregex "^MTLO_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEQ_S_W_PHL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEQ_S_W_PHR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEU_S_PH_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEU_S_PH_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_RS_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULSAQ_S_W_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULTU_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULT_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PACKRL_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PICK_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PICK_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBLA_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBRA_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQ_W_PHL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQ_W_PHR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBLA_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBRA_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQU_S_QB_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_PH_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_QB_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_RS_PH_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^RADDU_W_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^RDDSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPLV_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPLV_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPL_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPL_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHILOV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHILO_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRLV_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRL_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^WRDSP_MM$")>;
+
+
+// microMIPS DSP R2 - hasDSP, HasDSPR2, InMicroMips
+// ================================================
+
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_R_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_R_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDUH_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDUH_R_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_S_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^APPEND_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BALIGN_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_EQ_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_LE_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_LT_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPA_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQX_SA_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQX_S_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAX_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPS_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQX_S_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQX_SA_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSX_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MUL_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MUL_S_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_RS_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_S_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_S_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULSA_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_QB_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_SRA_PH_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_SRA_R_PH_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PREPEND_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRL_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRLV_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_R_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_R_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBUH_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBUH_R_QB_MMR2$")>;
+
+// microMIPS DSP R3 - hasDSP, hasDSPR2, hasDSPR3, InMicroMips
+// ==========================================================
+
+def : InstRW<[GenericDSPShort], (instregex "^BPOSGE32C_MMR3$")>;
+
+// MIPS MSA ASE - hasMSA
+// =====================
+
+def GenericWriteMSAShortLogic : SchedWriteRes<[GenericIssueFPUS]>;
+def GenericWriteMSAShortInt : SchedWriteRes<[GenericIssueFPUS]> {
+let Latency = 2;
+}
+def GenericWriteMoveOtherUnitsToFPU : SchedWriteRes<[GenericIssueFPUS]>;
+def GenericWriteMSAOther3 : SchedWriteRes<[GenericIssueFPUS]> {
+let Latency = 3;
+}
+def GenericWriteMSALongInt : SchedWriteRes<[GenericIssueFPUS]> {
+let Latency = 5;
+}
+def GenericWriteFPUDivI : SchedWriteRes<[GenericFPQ]> {
+ let Latency = 33;
+ let ResourceCycles = [ 33 ];
+}
+
+// FPUS is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def GenericWriteMoveFPUSToOtherUnits : SchedWriteRes<[GenericIssueFPUS]> {
+ let Latency = 0;
+}
+
+// FPUL is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def GenericWriteMoveFPULToOtherUnits : SchedWriteRes<[GenericIssueFPUL]>;
+
+
+// add_a.[bhwd], adds_[asu].[bhwd]
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^ADD_A_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^ADDS_[ASU]_[BHWD]$")>;
+
+// TODO: ADDVI_[BHW] might be 1 cycle latency rather than 2. Need to confirm it.
+// addv.[bhwd], addvi.[bhwd], asub_[us].[bhwd], ave_[us].[bhwd],
+// aver_[us].[bhwd]
+// The separator before the element size is matched as a literal '_' (not '.',
+// which is a regex wildcard) so the patterns only match the intended opcodes.
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^ASUB_[US]_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^AVER?_[US]_[BHWD]$")>;
+
+// and.v, or.v, xor.v, nor.v, andi.b, ori.b, xori.b, nori.b, move.v,
+// ldi.[bhwd]
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^MOVE_V$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+
+// vshf.[bhwd], binsli?.[bhwd], binsri?.[bhwd], insert.[bhwd], sldi?.[bhwd],
+// bseti?.[bhwd], bclri?.[bhwd], bnegi?.[bhwd], bsel.v, bseli.b, plus every
+// opcode whose name starts with BMZ or BMN...Z (presumably bmz.v, bmnz.v,
+// bmzi.b and bmnzi.b).
+// NOTE(review): "BMN*Z" accepts any number of 'N' characters and ".*" any
+// suffix; "BMN?Z" with an explicit suffix was probably intended - confirm.
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^VSHF_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BINSL|BINSLI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BINSR|BINSRI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^INSERT_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(SLD|SLDI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BSET|BSETI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^BMN*Z.*$")>;
+
+// pcnt.[bhwd], sat_s.[bhwd], sat_u.[bhwd]
+def : InstRW<[GenericWriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
+
+// bnz.[bhwdv], bz.[bhwdv], cfcmsa, ctcmsa
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(BNZ|BZ)_[BHWDV]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^C(F|T)CMSA$")>;
+
+// shf.[bhw], fill.[bhwd], splati?.[bhwd]
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SHF_[BHW]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^FILL_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(SPLAT|SPLATI)_[BHWD]$")>;
+
+// fexp2_w, fexp2_d
+def : InstRW<[GenericWriteFPUS], (instregex "^FEXP2_(W|D)$")>;
+
+// compare, converts, round to int, floating point truncate.
+def : InstRW<[GenericWriteFPUS], (instregex "^(CLT|CLTI)_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^(CLE|CLEI)_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^(CEQ|CEQI)_[BHWD]$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_UN_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_UEQ_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_EQ_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_LT_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_ULT_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_LE_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_ULE_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FS(AF|EQ|LT|LE|NE|OR)_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSUEQ_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSULE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSULT_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSUNE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSUN_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCAF_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCEQ_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCLE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCLT_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCNE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCOR_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCUEQ_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCULE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCULT_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCUNE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCUN_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FABS_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FFINT_(U|S)_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FFQL_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FFQR_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FTINT_(U|S)_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FRINT_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FTQ_(H|W)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FTRUNC_(U|S)_(W|D)$")>;
+
+// fexdo.[hw], fexupl.[wd], fexupr.[wd]
+def : InstRW<[GenericWriteFPUS], (instregex "^FEXDO_(H|W)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FEXUPL_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FEXUPR_(W|D)$")>;
+
+// fclass.[wd], fmax.[wd], fmax_a.[wd], fmin.[wd], fmin_a.[wd], flog2.[wd]
+def : InstRW<[GenericWriteFPUS], (instregex "^FCLASS_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FMAX_A_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FMAX_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FMIN_A_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FMIN_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FLOG2_(W|D)$")>;
+
+// interleave right/left, interleave even/odd, insert
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(ILVR|ILVL)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(ILVEV|ILVOD)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
+
+// subs_?.[bhwd], subsus_?.[bhwd], subsuu_?.[bhwd], subvi.[bhwd], subv.[bhwd],
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBS_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBSUS_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBSUU_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBVI_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBV_[BHWD]$")>;
+
+// mod_[su].[bhwd], div_[su].[bhwd]
+def : InstRW<[GenericWriteFPUDivI], (instregex "^MOD_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteFPUDivI], (instregex "^DIV_(S|U)_[BHWD]$")>;
+
+// hadd_[su].[bhwd], hsub_[su].[bhwd], max_[sua].[bhwd], min_[sua].[bhwd],
+// maxi_[su].[bhwd], mini_[su].[bhwd], srai?.[bhwd], srli?.[bhwd],
+// srari?.[bhwd], srlri?.[bhwd], slli?.[bhwd], pckev.[bhwd], pckod.[bhwd],
+// nloc.[bhwd], nlzc.[bhwd]
+// (insve.[bhwd] is already mapped by the interleave/insert group of this
+// model, so it is not repeated here.)
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^HADD_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^HSUB_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(MAX|MIN)_S_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(MAX|MIN)_U_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(MAX|MIN)_A_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic],
+ (instregex "^(MAXI|MINI)_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SRA|SRAI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SRL|SRLI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SRAR|SRARI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SRLR|SRLRI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SLL|SLLI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(PCKEV|PCKOD)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>;
+
+// dpadd_?.[bhwd], dpsub_?.[bhwd], dotp_?.[bhwd], msubv.[bhwd], maddv.[bhwd]
+// mulv.[bhwd].
+def : InstRW<[GenericWriteMSALongInt], (instregex "^DPADD_(S|U)_[HWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^DPSUB_(S|U)_[HWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^DOTP_(S|U)_[HWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MSUBV_[BHWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MADDV_[BHWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MULV_[BHWD]$")>;
+
+// madd?.q.[hw], msub?.q.[hw], mul?.q.[hw]
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MADDR_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MADD_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MSUBR_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MSUB_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MULR_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MUL_Q_[HW]$")>;
+
+// fadd.[dw], fmadd.[dw], fmsub.[dw], fmul.[dw], frcp.[dw], frsqrt.[dw],
+// fsqrt.[dw], fsub.[dw], fdiv.[dw]
+def : InstRW<[GenericWriteFPUL], (instregex "^FADD_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FMADD_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FMSUB_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FMUL_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FRCP_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FRSQRT_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FSQRT_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FSUB_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FDIV_[DW]$")>;
+
+// copy_s.[bhwd], copy_u.[bhw]
+def : InstRW<[GenericWriteFPUMoveGPRFPU], (instregex "^COPY_U_[BHW]$")>;
+def : InstRW<[GenericWriteFPUMoveGPRFPU], (instregex "^COPY_S_[BHWD]$")>;
+
+def : InstRW<[GenericWriteFPUStore], (instregex "^ST_[BHWD]$")>;
+def : InstRW<[GenericWriteFPULoad], (instregex "^LD_[BHWD]$")>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
new file mode 100644
index 000000000000..882a241d1426
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
@@ -0,0 +1,586 @@
+//==- MipsScheduleP5600.td - P5600 Scheduling Definitions --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def MipsP5600Model : SchedMachineModel {
+ int IssueWidth = 2; // 2x dispatched per cycle
+ int MicroOpBufferSize = 48; // min(48, 48, 64)
+ int LoadLatency = 4;
+ int MispredictPenalty = 8; // TODO: Estimated
+
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = [HasMips32r6, HasMips64r6,
+ HasMips64, HasMips64r2, HasCnMips,
+ InMicroMips, InMips16Mode,
+ HasMicroMips32r6, HasMicroMips64r6,
+ HasDSP, HasDSPR2];
+
+}
+
+let SchedModel = MipsP5600Model in {
+
+// ALQ Pipelines
+// =============
+
+def P5600ALQ : ProcResource<1> { let BufferSize = 16; }
+def P5600IssueALU : ProcResource<1> { let Super = P5600ALQ; }
+
+// ALU Pipeline
+// ------------
+
+def P5600WriteALU : SchedWriteRes<[P5600IssueALU]>;
+
+// and, lui, nor, or, slti, sltiu, sub, subu, xor
+def : ItinRW<[P5600WriteALU],
+ [II_AND, II_LUI, II_NOR, II_OR, II_SLTI_SLTIU, II_SUB, II_SUBU,
+ II_XOR]>;
+
+// AGQ Pipelines
+// =============
+
+def P5600AGQ : ProcResource<3> { let BufferSize = 16; }
+def P5600IssueAL2 : ProcResource<1> { let Super = P5600AGQ; }
+def P5600IssueCTISTD : ProcResource<1> { let Super = P5600AGQ; }
+def P5600IssueLDST : ProcResource<1> { let Super = P5600AGQ; }
+
+def P5600AL2Div : ProcResource<1>;
+// Pseudo-resource used to block CTISTD when handling multi-pipeline splits.
+def P5600CTISTD : ProcResource<1>;
+
+// CTISTD Pipeline
+// ---------------
+
+def P5600WriteJump : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>;
+def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> {
+ let Latency = 2;
+}
+
+// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal,
+// jalr, jr.hb, jr
+def : ItinRW<[P5600WriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J, II_JR,
+ II_JR_HB, II_DERET, II_ERET, II_ERETNC,
+ II_SYSCALL, II_BREAK, II_SDBBP, II_SSNOP,
+ II_TEQ, II_TEQI, II_TGE, II_TGEI, II_TGEIU,
+ II_TGEU, II_TLT, II_TLTI, II_TLTU, II_TNE,
+ II_TNEI, II_TRAP, II_TTLTIU, II_WAIT,
+ II_PAUSE]>;
+
+def : ItinRW<[P5600WriteJumpAndLink], [II_JAL, II_JALR, II_JALR_HB]>;
+
+def P5600COP0 : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>;
+
+def : ItinRW<[P5600COP0], [II_TLBINV, II_TLBINVF, II_TLBP, II_TLBR, II_TLBWI,
+ II_TLBWR, II_MFC0, II_MTC0]>;
+// LDST Pipeline
+// -------------
+
+def P5600WriteLoad : SchedWriteRes<[P5600IssueLDST]> {
+ let Latency = 4;
+}
+
+def P5600WriteLoadShifted : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> {
+ let Latency = 4;
+}
+
+def P5600WriteCache : SchedWriteRes<[P5600IssueLDST]>;
+
+def P5600WriteStore : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> {
+ // FIXME: This is a bit pessimistic. P5600CTISTD is only used during cycle 2
+ // not during 0, 1, and 2.
+ let ResourceCycles = [ 1, 3 ];
+}
+
+def P5600WriteGPRFromBypass : SchedWriteRes<[P5600IssueLDST]> {
+ let Latency = 2;
+}
+
+def P5600WriteStoreFromOtherUnits : SchedWriteRes<[P5600IssueLDST]>;
+def P5600WriteLoadToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
+ let Latency = 0;
+}
+
+// l[bhw], l[bh]u, ll
+def : ItinRW<[P5600WriteLoad], [II_LB, II_LBE, II_LBU, II_LBUE, II_LH, II_LHE,
+ II_LHU, II_LHUE, II_LW, II_LWE, II_LL, II_LLE,
+ II_LWPC]>;
+
+// lw[lr]
+def : ItinRW<[P5600WriteLoadShifted], [II_LWL, II_LWLE, II_LWR, II_LWRE]>;
+
+// s[bhw], sw[lr]
+def : ItinRW<[P5600WriteStore], [II_SB, II_SBE, II_SH, II_SHE, II_SW, II_SWE,
+ II_SWL, II_SWLE, II_SWR, II_SWRE, II_SC,
+ II_SCE]>;
+
+// pref, cache, sync, synci
+def : ItinRW<[P5600WriteCache], [II_PREF, II_PREFE, II_CACHE, II_CACHEE,
+ II_SYNC, II_SYNCI]>;
+
+// LDST is also used in moves from general purpose registers to floating point
+// and MSA.
+def P5600WriteMoveGPRToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
+ let Latency = 0;
+}
+
+// AL2 Pipeline
+// ------------
+
+def P5600WriteAL2 : SchedWriteRes<[P5600IssueAL2]>;
+def P5600WriteAL2BitExt : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2ShadowMov : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2CondMov : SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+ let Latency = 2;
+}
+def P5600WriteAL2Div : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+ // Estimated worst case
+ let Latency = 34;
+ let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2DivU : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+ // Estimated worst case
+ let Latency = 34;
+ let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2Mul : SchedWriteRes<[P5600IssueAL2]> { let Latency = 3; }
+def P5600WriteAL2Mult: SchedWriteRes<[P5600IssueAL2]> { let Latency = 5; }
+def P5600WriteAL2MAdd: SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+ let Latency = 5;
+}
+
+// clo, clz, di, ei, mfhi, mflo
+def : ItinRW<[P5600WriteAL2], [II_CLO, II_CLZ, II_DI, II_EI, II_MFHI_MFLO]>;
+
+// ehb, rdhwr, rdpgpr, wrpgpr, wsbh
+def : ItinRW<[P5600WriteAL2ShadowMov], [II_EHB, II_RDHWR, II_WSBH]>;
+
+// mov[nz]
+def : ItinRW<[P5600WriteAL2CondMov], [II_MOVN, II_MOVZ]>;
+
+// divu?
+def : ItinRW<[P5600WriteAL2Div], [II_DIV]>;
+def : ItinRW<[P5600WriteAL2DivU], [II_DIVU]>;
+
+// mul
+def : ItinRW<[P5600WriteAL2Mul], [II_MUL]>;
+// mult, multu
+def : ItinRW<[P5600WriteAL2Mult], [II_MULT, II_MULTU]>;
+// maddu?, msubu?, mthi, mtlo
+def : ItinRW<[P5600WriteAL2MAdd],
+ [II_MADD, II_MADDU, II_MSUB, II_MSUBU, II_MTHI_MTLO]>;
+
+// ext, ins
+def : ItinRW<[P5600WriteAL2BitExt], [II_EXT, II_INS]>;
+
+// Either ALU or AL2 Pipelines
+// ---------------------------
+//
+// Some instructions can choose between ALU and AL2, but once dispatched to
+// ALQ or AGQ respectively they are committed to that path.
+// The decision is based on the outcome of the most recent selection when the
+// choice was last available. For now, we assume ALU is always chosen.
+
+def P5600WriteEitherALU : SchedWriteVariant<
+ // FIXME: Implement selection predicate
+ [SchedVar<SchedPredicate<[{1}]>, [P5600WriteALU]>,
+ SchedVar<SchedPredicate<[{0}]>, [P5600WriteAL2]>
+ ]>;
+
+// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu,
+// xori
+def : ItinRW<[P5600WriteEitherALU],
+ [II_ADD, II_ADDI, II_ADDIU, II_ANDI, II_ORI, II_ROTR, II_SEB, II_SEH,
+ II_SLT_SLTU, II_SLL, II_SRA, II_SRL, II_XORI, II_ADDU, II_SLLV,
+ II_SRAV, II_SRLV, II_LSA]>;
+def : InstRW<[], (instrs COPY)>;
+
+// FPU Pipelines
+// =============
+
+def P5600FPQ : ProcResource<3> { let BufferSize = 16; }
+def P5600IssueFPUS : ProcResource<1> { let Super = P5600FPQ; }
+def P5600IssueFPUL : ProcResource<1> { let Super = P5600FPQ; }
+def P5600IssueFPULoad : ProcResource<1> { let Super = P5600FPQ; }
+
+def P5600FPUDivSqrt : ProcResource<2>;
+
+def P5600WriteFPUS : SchedWriteRes<[P5600IssueFPUS]>;
+def P5600WriteFPUL : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 4; }
+def P5600WriteFPUL_MADDSUB : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 6; }
+def P5600WriteFPUDivI : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 23 / 27
+ let Latency = 23; // Using common case
+ let ResourceCycles = [ 1, 23 ];
+}
+def P5600WriteFPUDivS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 23 / 27
+ let Latency = 23; // Using common case
+ let ResourceCycles = [ 1, 23 ];
+}
+def P5600WriteFPUDivD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 31 / 35
+ let Latency = 31; // Using common case
+ let ResourceCycles = [ 1, 31 ];
+}
+def P5600WriteFPURcpS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 19 / 23
+ let Latency = 19; // Using common case
+ let ResourceCycles = [ 1, 19 ];
+}
+def P5600WriteFPURcpD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 27 / 31
+ let Latency = 27; // Using common case
+ let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPURsqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 27 / 27
+ let Latency = 27; // Using common case
+ let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPURsqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 27 / 31
+ let Latency = 27; // Using common case
+ let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPUSqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 27 / 31
+ let Latency = 27; // Using common case
+ let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPUSqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 35 / 39
+ let Latency = 35; // Using common case
+ let ResourceCycles = [ 1, 35 ];
+}
+def P5600WriteMSAShortLogic : SchedWriteRes<[P5600IssueFPUS]>;
+def P5600WriteMSAShortInt : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 2; }
+def P5600WriteMoveOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPUS]>;
+def P5600WriteMSAOther3 : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 3; }
+def P5600WriteMSALongInt : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 5; }
+
+// vshf.[bhwd], binsl.[bhwd], binsr.[bhwd], insert.[bhwd], sld?.[bhwd],
+// bset.[bhwd], bclr.[bhwd], bneg.[bhwd], bsel_v, bseli_b
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^VSHF_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSL|BINSLI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSR|BINSRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^INSERT_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(SLD|SLDI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSET|BSETI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^BMN*Z.*$")>;
+
+// pcnt.[bhwd], sat_s.[bhwd], sat_u.[bhwd]
+def : InstRW<[P5600WriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
+
+// bnz.[bhwdv], bz.[bhwdv], cfcmsa, ctcmsa
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(BNZ|BZ)_[BHWDV]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^C(F|T)CMSA$")>;
+
+// FPUS is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def P5600WriteMoveFPUSToOtherUnits : SchedWriteRes<[P5600IssueFPUS]> {
+ let Latency = 0;
+}
+
+// FPUL is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def P5600WriteMoveFPULToOtherUnits : SchedWriteRes<[P5600IssueFPUL]>;
+
+// Short Pipe
+// ----------
+//
+// abs.[ds], abs.ps, bc1[tf]l?, mov[tf].[ds], mov[tf], mov.[ds], [cm][ft]c1,
+// m[ft]hc1, neg.[ds], neg.ps, nor.v, nori.b, or.v, ori.b, xor.v, xori.b,
+// sdxc1, sdc1, st.[bhwd], swc1, swxc1
+def : ItinRW<[P5600WriteFPUS], [II_ABS, II_MOVF_D, II_MOVF_S, II_MOVT_D,
+ II_MOVT_S, II_MOV_D, II_MOV_S, II_NEG]>;
+
+// add_a.[bhwd], adds_[asu].[bhwd], addvi?.[bhwd], asub_[us].[bhwd],
+// aver?_[us].[bhwd], shf.[bhw], fill.[bhwd], splati?.[bhwd]
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADD_A_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDS_[ASU]_[BHWD]$")>;
+// TODO: ADDVI_[BHW] might be 1 cycle latency rather than 2. Need to confirm it.
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ASUB_[US]_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^AVER?_[US]_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SHF_[BHW]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^FILL_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(SPLAT|SPLATI)_[BHWD]$")>;
+
+// and.v, andi.b, move.v, ldi.[bhwd]
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^MOVE_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+
+// vshf.[bhwd], binsl.[bhwd], binsr.[bhwd], insert.[bhwd], sld?.[bhwd],
+// bset.[bhwd], bclr.[bhwd], bneg.[bhwd], bsel_v, bseli_b
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^VSHF_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSL|BINSLI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSR|BINSRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^INSERT_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(SLD|SLDI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSET|BSETI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^BMN*Z.*$")>;
+
+// pcnt.[bhwd], sat_s.[bhwd], sat_u.[bhwd]
+def : InstRW<[P5600WriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
+
+// fexp2_w, fexp2_d
+def : InstRW<[P5600WriteFPUS], (instregex "^FEXP2_(W|D)$")>;
+
+// compare, converts, round to int, floating point truncate.
+def : InstRW<[P5600WriteFPUS], (instregex "^(CLT|CLTI)_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^(CLE|CLEI)_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^(CEQ|CEQI)_[BHWD]$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_UN_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_UEQ_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_EQ_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_LT_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_ULT_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_LE_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_ULE_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FS(AF|EQ|LT|LE|NE|OR)_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSUEQ_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSULE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSULT_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSUNE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSUN_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCAF_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCEQ_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCLE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCLT_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCNE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCOR_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCUEQ_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCULE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCULT_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCUNE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCUN_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FABS_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FFINT_(U|S)_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FFQL_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FFQR_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FTINT_(U|S)_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FRINT_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FTQ_(H|W)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FTRUNC_(U|S)_(W|D)$")>;
+
+// fexdo.[hw], fexupl.[wd], fexupr.[wd]
+def : InstRW<[P5600WriteFPUS], (instregex "^FEXDO_(H|W)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FEXUPL_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FEXUPR_(W|D)$")>;
+
+// fclass.[wd], fmax.[wd], fmax_a.[wd], fmin.[wd], fmin_a.[wd], flog2.[wd]
+def : InstRW<[P5600WriteFPUS], (instregex "^FCLASS_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FMAX_A_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FMAX_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FMIN_A_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FMIN_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FLOG2_(W|D)$")>;
+
+// interleave right/left, interleave even/odd, insert
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(ILVR|ILVL)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(ILVEV|ILVOD)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
+
+// subs_?.[bhwd], subsus_?.[bhwd], subsuu_?.[bhwd], subvi.[bhwd], subv.[bhwd],
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBS_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBSUS_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBSUU_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBVI_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBV_[BHWD]$")>;
+
+// mod_[su].[bhwd], div_[su].[bhwd]
+def : InstRW<[P5600WriteFPUDivI], (instregex "^MOD_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteFPUDivI], (instregex "^DIV_(S|U)_[BHWD]$")>;
+
+// hadd_[su].[bhwd], hsub_[su].[bhwd], max_[sua].[bhwd], min_[sua].[bhwd],
+// maxi_[su].[bhwd], mini_[su].[bhwd], sra?.[bhwd], srar?.[bhwd], srlr.[bhwd],
+// sll?.[bhwd], pckev.[bhwd], pckod.[bhwd], nloc.[bhwd], nlzc.[bhwd],
+// insve.[bhwd]
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^HADD_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^HSUB_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(MAX|MIN)_S_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(MAX|MIN)_U_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(MAX|MIN)_A_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(MAXI|MINI)_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRA|SRAI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRL|SRLI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRAR|SRARI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRLR|SRLRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SLL|SLLI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(PCKEV|PCKOD)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
+
+// Long Pipe
+// ----------
+//
+// add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps,
+// cvt.ps.[sw], c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps, sub.[ds], sub.ps,
+// trunc.w.[ds], trunc.w.ps
+def : ItinRW<[P5600WriteFPUL],
+ [II_ADD_D, II_ADD_S, II_CVT, II_C_CC_D, II_C_CC_S, II_MUL_D,
+ II_MUL_S, II_SUB_D, II_SUB_S, II_TRUNC]>;
+
+// div.[ds], div.ps
+def : ItinRW<[P5600WriteFPUDivS], [II_DIV_S]>;
+def : ItinRW<[P5600WriteFPUDivD], [II_DIV_D]>;
+
+// sqrt.[ds], sqrt.ps
+def : ItinRW<[P5600WriteFPUSqrtS], [II_SQRT_S]>;
+def : ItinRW<[P5600WriteFPUSqrtD], [II_SQRT_D]>;
+
+// frcp.[wd], frsqrt.[wd]
+def : InstRW<[P5600WriteFPURsqrtD], (instregex "^FRCP_(W|D)$")>;
+def : InstRW<[P5600WriteFPURsqrtD], (instregex "^FRSQRT_(W|D)$")>;
+
+def : ItinRW<[P5600WriteFPURsqrtD], [II_RECIP_D, II_RSQRT_D]>;
+def : ItinRW<[P5600WriteFPURsqrtS], [II_RECIP_S, II_RSQRT_S]>;
+
+// fmadd.[wd], fmsub.[wd], fdiv.[wd], fsqrt.[wd], fmul.[wd], fadd.[wd],
+// fsub.[wd]
+def : InstRW<[P5600WriteFPUL_MADDSUB], (instregex "^FMADD_(W|D)$")>;
+def : InstRW<[P5600WriteFPUL_MADDSUB], (instregex "^FMSUB_(W|D)$")>;
+def : InstRW<[P5600WriteFPUDivS], (instregex "^FDIV_W$")>;
+def : InstRW<[P5600WriteFPUDivD], (instregex "^FDIV_D$")>;
+def : InstRW<[P5600WriteFPUSqrtS], (instregex "^FSQRT_W$")>;
+def : InstRW<[P5600WriteFPUSqrtD], (instregex "^FSQRT_D$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FMUL_(W|D)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FADD_(W|D)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FSUB_(W|D)$")>;
+
+// dpadd_?.[hwd], dpsub_?.[hwd], dotp_?.[hwd], msubv.[bhwd], maddv.[bhwd],
+// mulv.[bhwd].
+def : InstRW<[P5600WriteMSALongInt], (instregex "^DPADD_(S|U)_[HWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^DPSUB_(S|U)_[HWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^DOTP_(S|U)_[HWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MSUBV_[BHWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MADDV_[BHWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MULV_[BHWD]$")>;
+
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MADDR_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MADD_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MSUBR_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MSUB_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MULR_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MUL_Q_[HW]$")>;
+
+// madd.[ds], msub.[ds], nmadd.[ds], nmsub.[ds]
+// Operand 0 is read on cycle 5. All other operands are read on cycle 0.
+def : ItinRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB],
+ [II_MADD_D, II_MADD_S, II_MSUB_D, II_MSUB_S, II_NMADD_D,
+ II_NMADD_S, II_NMSUB_D, II_NMSUB_S]>;
+
+// madd.ps, msub.ps, nmadd.ps, nmsub.ps
+// Operands 0 and 1 are read on cycle 5. All others are read on cycle 0.
+// (none of these instructions exist in the backend yet)
+
+// Load Pipe
+// ---------
+//
+// This is typically used in conjunction with the load pipeline under the AGQ
+// All the instructions are in the 'Tricky Instructions' section.
+
+def P5600WriteLoadOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPULoad]> {
+ let Latency = 4;
+}
+
+// Tricky Instructions
+// ===================
+//
+// These instructions are split across multiple uops (in different pipelines)
+// that must cooperate to complete the operation
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteMoveGPRToFPU : WriteSequence<[P5600WriteMoveGPRToOtherUnits,
+ P5600WriteMoveOtherUnitsToFPU]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteMoveFPUToGPR : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+ P5600WriteGPRFromBypass]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteStoreFPUS : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+ P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteStoreFPUL : WriteSequence<[P5600WriteMoveFPULToOtherUnits,
+ P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteLoadFPU : WriteSequence<[P5600WriteLoadToOtherUnits,
+ P5600WriteLoadOtherUnitsToFPU]>;
+
+// ctc1, mtc1, mthc1
+def : ItinRW<[P5600WriteMoveGPRToFPU], [II_CTC1, II_MTC1, II_MTHC1]>;
+
+// copy.[su]_[bhwd]
+def : InstRW<[P5600WriteMoveFPUToGPR], (instregex "^COPY_U_[BHW]$")>;
+def : InstRW<[P5600WriteMoveFPUToGPR], (instregex "^COPY_S_[BHWD]$")>;
+
+// bc1[ft], cfc1, mfc1, mfhc1, movf, movt
+def : ItinRW<[P5600WriteMoveFPUToGPR],
+ [II_BC1F, II_BC1FL, II_BC1T, II_BC1TL, II_CFC1, II_MFC1, II_MFHC1, II_MOVF, II_MOVT]>;
+
+// s[dw]c1, s[dw]xc1, suxc1, st.[bhwd]
+def : ItinRW<[P5600WriteStoreFPUS], [II_SDC1, II_SDXC1, II_SUXC1, II_SWC1,
+ II_SWXC1]>;
+def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>;
+
+// movn.[ds], movz.[ds]
+def : ItinRW<[P5600WriteStoreFPUL], [II_MOVN_D, II_MOVN_S, II_MOVZ_D, II_MOVZ_S]>;
+
+// l[dw]x?c1, luxc1, ld.[bhwd] (pattern anchored like the ST_[BHWD] rule above)
+def : ItinRW<[P5600WriteLoadFPU], [II_LDC1, II_LDXC1, II_LWC1, II_LWXC1, II_LUXC1]>;
+def : InstRW<[P5600WriteLoadFPU], (instregex "^LD_[BHWD]$")>;
+
+// Unsupported Instructions
+// ========================
+//
+// The following instruction classes are never valid on P5600.
+// II_DADDIU, II_DADDU, II_DMFC1, II_DMTC1, II_DMULT, II_DMULTU, II_DROTR,
+// II_DROTR32, II_DROTRV, II_DDIV, II_DSLL, II_DSLL32, II_DSLLV, II_DSRA,
+// II_DSRA32, II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV, II_DSUBU, II_DDIVU,
+// II_JALRC, II_LD, II_LD[LR], II_RESTORE, II_SAVE, II_SD, II_SDC1, II_SD[LR]
+//
+// The following instructions are never valid on P5600.
+// addq.ph, repl.ph, repl.qb, subq.ph, subu_s.qb
+//
+// Guesswork
+// =========
+//
+// This section is largely temporary guesswork.
+
+// ceil.[lw].[ds], floor.[lw].[ds], round.[lw].[ds]
+// Reason behind guess: trunc.[lw].ds and the various cvt's are in FPUL
+def : ItinRW<[P5600WriteFPUL], [II_CEIL, II_FLOOR, II_ROUND]>;
+
+// rotrv
+// Reason behind guess: rotr is in the same category and the two register forms
+// generally follow the immediate forms in this category
+def : ItinRW<[P5600WriteEitherALU], [II_ROTRV]>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
new file mode 100644
index 000000000000..3e7570ff46ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -0,0 +1,174 @@
+//===-- MipsSubtarget.cpp - Mips Subtarget Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Mips specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMachineFunction.h"
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "MipsGenSubtargetInfo.inc"
+
+// FIXME: Maybe this should be on by default when Mips16 is specified
+//
+static cl::opt<bool>
+ Mixed16_32("mips-mixed-16-32", cl::init(false),
+ cl::desc("Allow for a mixture of Mips16 "
+ "and Mips32 code in a single output file"),
+ cl::Hidden);
+
+static cl::opt<bool> Mips_Os16("mips-os16", cl::init(false),
+ cl::desc("Compile all functions that don't use "
+ "floating point as Mips 16"),
+ cl::Hidden);
+
+static cl::opt<bool> Mips16HardFloat("mips16-hard-float", cl::NotHidden,
+ cl::desc("Enable mips16 hard float."),
+ cl::init(false));
+
+static cl::opt<bool>
+ Mips16ConstantIslands("mips16-constant-islands", cl::NotHidden,
+ cl::desc("Enable mips16 constant islands."),
+ cl::init(true));
+
+static cl::opt<bool>
+ GPOpt("mgpopt", cl::Hidden,
+ cl::desc("Enable gp-relative addressing of mips small data items"));
+
+void MipsSubtarget::anchor() { }
+
+MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, bool little,
+ const MipsTargetMachine &TM)
+ : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
+ IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false),
+ NoABICalls(false), IsFP64bit(false), UseOddSPReg(true),
+ IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false), HasCnMips(false),
+ HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
+ HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
+ InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
+ HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16),
+ Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasEVA(false), TM(TM),
+ TargetTriple(TT), TSInfo(),
+ InstrInfo(
+ MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
+ FrameLowering(MipsFrameLowering::create(*this)),
+ TLInfo(MipsTargetLowering::create(TM, *this)) {
+
+ PreviousInMips16Mode = InMips16Mode;
+
+ if (MipsArchVersion == MipsDefault)
+ MipsArchVersion = Mips32;
+
+ // Don't even attempt to generate code for MIPS-I and MIPS-V. They have not
+ // been tested and currently exist for the integrated assembler only.
+ if (MipsArchVersion == Mips1)
+ report_fatal_error("Code generation for MIPS-I is not implemented", false);
+ if (MipsArchVersion == Mips5)
+ report_fatal_error("Code generation for MIPS-V is not implemented", false);
+
+ // Check if Architecture and ABI are compatible.
+ assert(((!isGP64bit() && isABI_O32()) ||
+ (isGP64bit() && (isABI_N32() || isABI_N64()))) &&
+ "Invalid Arch & ABI pair.");
+
+ if (hasMSA() && !isFP64bit())
+ report_fatal_error("MSA requires a 64-bit FPU register file (FR=1 mode). "
+ "See -mattr=+fp64.",
+ false);
+
+ if (!isABI_O32() && !useOddSPReg())
+ report_fatal_error("-mattr=+nooddspreg requires the O32 ABI.", false);
+
+ if (IsFPXX && (isABI_N32() || isABI_N64()))
+ report_fatal_error("FPXX is not permitted for the N32/N64 ABI's.", false);
+
+ if (hasMips32r6()) {
+ StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
+
+ assert(isFP64bit());
+ assert(isNaN2008());
+ if (hasDSP())
+ report_fatal_error(ISA + " is not compatible with the DSP ASE", false);
+ }
+
+ if (NoABICalls && TM.isPositionIndependent())
+ report_fatal_error("position-independent code requires '-mabicalls'");
+
+ // Set UseSmallSection.
+ UseSmallSection = GPOpt;
+ if (!NoABICalls && GPOpt) {
+ errs() << "warning: cannot use small-data accesses for '-mabicalls'"
+ << "\n";
+ UseSmallSection = false;
+ }
+}
+
+bool MipsSubtarget::isPositionIndependent() const {
+ return TM.isPositionIndependent();
+}
+
+/// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
+bool MipsSubtarget::enablePostRAScheduler() const { return true; }
+
+void MipsSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
+ CriticalPathRCs.clear();
+ CriticalPathRCs.push_back(isGP64bit() ?
+ &Mips::GPR64RegClass : &Mips::GPR32RegClass);
+}
+
+CodeGenOpt::Level MipsSubtarget::getOptLevelToEnablePostRAScheduler() const {
+ return CodeGenOpt::Aggressive;
+}
+
+MipsSubtarget &
+MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+ const TargetMachine &TM) {
+ std::string CPUName = MIPS_MC::selectMipsCPU(TM.getTargetTriple(), CPU);
+
+ // Parse features string.
+ ParseSubtargetFeatures(CPUName, FS);
+ // Initialize scheduling itinerary for the specified CPU.
+ InstrItins = getInstrItineraryForCPU(CPUName);
+
+ if (InMips16Mode && !IsSoftFloat)
+ InMips16HardFloat = true;
+
+ return *this;
+}
+
+bool MipsSubtarget::useConstantIslands() {
+ DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n");
+ return Mips16ConstantIslands;
+}
+
+Reloc::Model MipsSubtarget::getRelocationModel() const {
+ return TM.getRelocationModel();
+}
+
+bool MipsSubtarget::isABI_N64() const { return getABI().IsN64(); }
+bool MipsSubtarget::isABI_N32() const { return getABI().IsN32(); }
+bool MipsSubtarget::isABI_O32() const { return getABI().IsO32(); }
+const MipsABIInfo &MipsSubtarget::getABI() const { return TM.getABI(); }
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
new file mode 100644
index 000000000000..38d3cee70477
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -0,0 +1,316 @@
+//===-- MipsSubtarget.h - Define Subtarget for the Mips ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSUBTARGET_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSUBTARGET_H
+
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MipsFrameLowering.h"
+#include "MipsISelLowering.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "MipsGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class MipsTargetMachine;
+
+/// Mips-specific subclass of the tablegen-generated subtarget info.
+///
+/// Records the architecture revision, the ABI/float switches and the ASE
+/// availability for one CPU / feature-string combination, and owns the
+/// instruction-info, frame-lowering and target-lowering helper objects that
+/// the accessors at the bottom of the class hand out.
+class MipsSubtarget : public MipsGenSubtargetInfo {
+  virtual void anchor();
+
+  // Architecture revisions. The enumerator order is significant: the
+  // hasMipsN() predicates below are plain comparisons against it, and
+  // Mips32Max is the upper bound used to recognise the 32-bit revisions.
+  enum MipsArchEnum {
+    MipsDefault,
+    Mips1, Mips2, Mips32, Mips32r2, Mips32r3, Mips32r5, Mips32r6, Mips32Max,
+    Mips3, Mips4, Mips5, Mips64, Mips64r2, Mips64r3, Mips64r5, Mips64r6
+  };
+
+  enum class CPU { P5600 };
+
+  // Mips architecture version
+  MipsArchEnum MipsArchVersion;
+
+  // Processor implementation (unused but required to exist by
+  // tablegen-erated code).
+  CPU ProcImpl;
+
+  // IsLittle - The target is Little Endian
+  bool IsLittle;
+
+  // IsSoftFloat - The target does not support any floating point instructions.
+  bool IsSoftFloat;
+
+  // IsSingleFloat - The target only supports single precision float
+  // point operations. This enables the target to use all 32 32-bit
+  // floating point registers instead of only using even ones.
+  bool IsSingleFloat;
+
+  // IsFPXX - MIPS O32 modeless ABI.
+  bool IsFPXX;
+
+  // NoABICalls - Disable SVR4-style position-independent code.
+  bool NoABICalls;
+
+  // IsFP64bit - The target processor has 64-bit floating point registers.
+  bool IsFP64bit;
+
+  /// Are odd single-precision registers permitted?
+  /// This corresponds to -modd-spreg and -mno-odd-spreg
+  bool UseOddSPReg;
+
+  // IsNaN2008bit - IEEE 754-2008 NaN encoding.
+  bool IsNaN2008bit;
+
+  // IsGP64bit - General-purpose registers are 64 bits wide
+  bool IsGP64bit;
+
+  // IsPTR64bit - Pointers are 64 bit wide
+  bool IsPTR64bit;
+
+  // HasVFPU - Processor has a vector floating point unit.
+  bool HasVFPU;
+
+  // CPU supports cnMIPS (Cavium Networks Octeon CPU).
+  bool HasCnMips;
+
+  // IsLinux - Target system is Linux. If false we consider ELFOS for now.
+  bool IsLinux;
+
+  // UseSmallSection - Small section is used.
+  bool UseSmallSection;
+
+  /// Features related to the presence of specific instructions.
+
+  // HasMips3_32 - The subset of MIPS-III instructions added to MIPS32
+  bool HasMips3_32;
+
+  // HasMips3_32r2 - The subset of MIPS-III instructions added to MIPS32r2
+  bool HasMips3_32r2;
+
+  // HasMips4_32 - Has the subset of MIPS-IV present in MIPS32
+  bool HasMips4_32;
+
+  // HasMips4_32r2 - Has the subset of MIPS-IV present in MIPS32r2
+  bool HasMips4_32r2;
+
+  // HasMips5_32r2 - Has the subset of MIPS-V present in MIPS32r2
+  bool HasMips5_32r2;
+
+  // InMips16 -- can process Mips16 instructions
+  bool InMips16Mode;
+
+  // Mips16 hard float
+  bool InMips16HardFloat;
+
+  // PreviousInMips16 -- the function we just processed was in Mips 16 Mode
+  bool PreviousInMips16Mode;
+
+  // InMicroMips -- can process MicroMips instructions
+  bool InMicroMipsMode;
+
+  // HasDSP, HasDSPR2, HasDSPR3 -- supports DSP ASE.
+  bool HasDSP, HasDSPR2, HasDSPR3;
+
+  // Allow mixed Mips16 and Mips32 in one source file
+  bool AllowMixed16_32;
+
+  // Optimize for space by compiling all functions as Mips 16 unless
+  // it needs floating point. Functions needing floating point are
+  // compiled as Mips32
+  bool Os16;
+
+  // HasMSA -- supports MSA ASE.
+  bool HasMSA;
+
+  // UseTCCInDIV -- Enables the use of trapping in the assembler.
+  bool UseTCCInDIV;
+
+  // HasEVA -- supports EVA ASE.
+  bool HasEVA;
+
+  InstrItineraryData InstrItins;
+
+  // We can override the determination of whether we are in mips16 mode
+  // as from the command line
+  enum {NoOverride, Mips16Override, NoMips16Override} OverrideMode;
+
+  const MipsTargetMachine &TM;
+
+  Triple TargetTriple;
+
+  const SelectionDAGTargetInfo TSInfo;
+  std::unique_ptr<const MipsInstrInfo> InstrInfo;
+  std::unique_ptr<const MipsFrameLowering> FrameLowering;
+  std::unique_ptr<const MipsTargetLowering> TLInfo;
+
+public:
+  bool isPositionIndependent() const;
+  /// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+  bool enablePostRAScheduler() const override;
+  void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
+  CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const override;
+
+  bool isABI_N64() const;
+  bool isABI_N32() const;
+  bool isABI_O32() const;
+  const MipsABIInfo &getABI() const;
+  bool isABI_FPXX() const { return isABI_O32() && IsFPXX; }
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  MipsSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+                bool little, const MipsTargetMachine &TM);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options. Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  // Architecture-revision predicates. Each 32-bit predicate holds either for
+  // a 32-bit revision at or above the requested one (and below Mips32Max) or
+  // for the 64-bit revision carrying the same release number.
+  bool hasMips1() const { return MipsArchVersion >= Mips1; }
+  bool hasMips2() const { return MipsArchVersion >= Mips2; }
+  bool hasMips3() const { return MipsArchVersion >= Mips3; }
+  bool hasMips4() const { return MipsArchVersion >= Mips4; }
+  bool hasMips5() const { return MipsArchVersion >= Mips5; }
+  bool hasMips4_32() const { return HasMips4_32; }
+  bool hasMips4_32r2() const { return HasMips4_32r2; }
+  bool hasMips32() const {
+    return (MipsArchVersion >= Mips32 && MipsArchVersion < Mips32Max) ||
+           hasMips64();
+  }
+  bool hasMips32r2() const {
+    return (MipsArchVersion >= Mips32r2 && MipsArchVersion < Mips32Max) ||
+           hasMips64r2();
+  }
+  bool hasMips32r3() const {
+    // Pair with the 64-bit release of the same number, like every other
+    // 32rN predicate here; MIPS64r2 on its own must not report MIPS32r3.
+    return (MipsArchVersion >= Mips32r3 && MipsArchVersion < Mips32Max) ||
+           hasMips64r3();
+  }
+  bool hasMips32r5() const {
+    return (MipsArchVersion >= Mips32r5 && MipsArchVersion < Mips32Max) ||
+           hasMips64r5();
+  }
+  bool hasMips32r6() const {
+    return (MipsArchVersion >= Mips32r6 && MipsArchVersion < Mips32Max) ||
+           hasMips64r6();
+  }
+  bool hasMips64() const { return MipsArchVersion >= Mips64; }
+  bool hasMips64r2() const { return MipsArchVersion >= Mips64r2; }
+  bool hasMips64r3() const { return MipsArchVersion >= Mips64r3; }
+  bool hasMips64r5() const { return MipsArchVersion >= Mips64r5; }
+  bool hasMips64r6() const { return MipsArchVersion >= Mips64r6; }
+
+  bool hasCnMips() const { return HasCnMips; }
+
+  bool isLittle() const { return IsLittle; }
+  bool isABICalls() const { return !NoABICalls; }
+  bool isFPXX() const { return IsFPXX; }
+  bool isFP64bit() const { return IsFP64bit; }
+  bool useOddSPReg() const { return UseOddSPReg; }
+  bool noOddSPReg() const { return !UseOddSPReg; }
+  bool isNaN2008() const { return IsNaN2008bit; }
+  bool isGP64bit() const { return IsGP64bit; }
+  bool isGP32bit() const { return !IsGP64bit; }
+  unsigned getGPRSizeInBytes() const { return isGP64bit() ? 8 : 4; }
+  bool isPTR64bit() const { return IsPTR64bit; }
+  bool isPTR32bit() const { return !IsPTR64bit; }
+  bool isSingleFloat() const { return IsSingleFloat; }
+  bool hasVFPU() const { return HasVFPU; }
+  bool inMips16Mode() const { return InMips16Mode; }
+  bool inMips16ModeDefault() const {
+    return InMips16Mode;
+  }
+  // Hard float for mips16 means essentially to compile as soft float
+  // but to use a runtime library for soft float that is written with
+  // native mips32 floating point instructions (those runtime routines
+  // run in mips32 hard float mode).
+  bool inMips16HardFloat() const {
+    return inMips16Mode() && InMips16HardFloat;
+  }
+  bool inMicroMipsMode() const { return InMicroMipsMode; }
+  bool inMicroMips32r6Mode() const { return InMicroMipsMode && hasMips32r6(); }
+  bool inMicroMips64r6Mode() const { return InMicroMipsMode && hasMips64r6(); }
+  bool hasDSP() const { return HasDSP; }
+  bool hasDSPR2() const { return HasDSPR2; }
+  bool hasDSPR3() const { return HasDSPR3; }
+  bool hasMSA() const { return HasMSA; }
+  bool hasEVA() const { return HasEVA; }
+  bool useSmallSection() const { return UseSmallSection; }
+
+  bool hasStandardEncoding() const { return !inMips16Mode(); }
+
+  bool useSoftFloat() const { return IsSoftFloat; }
+
+  bool enableLongBranchPass() const {
+    return hasStandardEncoding() || allowMixed16_32();
+  }
+
+  /// Features related to the presence of specific instructions.
+  bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); }
+  bool hasMTHC1() const { return hasMips32r2(); }
+
+  // Both operands are plain bools, so the logical operator states the intent.
+  bool allowMixed16_32() const {
+    return inMips16ModeDefault() || AllowMixed16_32;
+  }
+
+  bool os16() const { return Os16; }
+
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+
+  // for now constant islands are on for the whole compilation unit but we only
+  // really use them if in addition we are in mips16 mode
+  static bool useConstantIslands();
+
+  unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
+
+  // Grab relocation model
+  Reloc::Model getRelocationModel() const;
+
+  MipsSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+                                                 const TargetMachine &TM);
+
+  /// Does the system support unaligned memory access.
+  ///
+  /// MIPS32r6/MIPS64r6 require full unaligned access support but does not
+  /// specify which component of the system provides it. Hardware, software, and
+  /// hybrid implementations are all valid.
+  bool systemSupportsUnalignedAccess() const { return hasMips32r6(); }
+
+  // Set helper classes
+  void setHelperClassesMips16();
+  void setHelperClassesMipsSE();
+
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const MipsInstrInfo *getInstrInfo() const override { return InstrInfo.get(); }
+  const TargetFrameLowering *getFrameLowering() const override {
+    return FrameLowering.get();
+  }
+  const MipsRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo->getRegisterInfo();
+  }
+  const MipsTargetLowering *getTargetLowering() const override {
+    return TLInfo.get();
+  }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
new file mode 100644
index 000000000000..bb48188e3b87
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -0,0 +1,270 @@
+//===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Mips target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetMachine.h"
+#include "Mips.h"
+#include "Mips16FrameLowering.h"
+#include "Mips16ISelDAGToDAG.h"
+#include "Mips16ISelLowering.h"
+#include "Mips16InstrInfo.h"
+#include "MipsFrameLowering.h"
+#include "MipsInstrInfo.h"
+#include "MipsSEFrameLowering.h"
+#include "MipsSEISelDAGToDAG.h"
+#include "MipsSEISelLowering.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsTargetObjectFile.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips"
+
+/// Registers the Mips target machines. The big-endian machine class serves
+/// both the Mips and Mips64 targets; the little-endian class serves both the
+/// Mipsel and Mips64el targets.
+extern "C" void LLVMInitializeMipsTarget() {
+  RegisterTargetMachine<MipsebTargetMachine> RegMips(getTheMipsTarget());
+  RegisterTargetMachine<MipselTargetMachine> RegMipsel(getTheMipselTarget());
+  RegisterTargetMachine<MipsebTargetMachine> RegMips64(getTheMips64Target());
+  RegisterTargetMachine<MipselTargetMachine> RegMips64el(
+      getTheMips64elTarget());
+}
+
+/// Assembles the data layout string for the ABI computed from \p TT, \p CPU
+/// and the MC options in \p Options.
+///
+/// The pieces, in order: endianness ("e"/"E"), mangling ("-m:m" for O32,
+/// "-m:e" otherwise), 32-bit pointers for every ABI except N64, the integer
+/// alignments, and finally native widths plus stack alignment.
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+                                     const TargetOptions &Options,
+                                     bool isLittle) {
+  const MipsABIInfo ABI =
+      MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions);
+
+  // There are both little and big endian mips.
+  std::string Layout = isLittle ? "e" : "E";
+
+  Layout += ABI.IsO32() ? "-m:m" : "-m:e";
+
+  // Pointers are 32 bit on some ABIs.
+  if (!ABI.IsN64())
+    Layout += "-p:32:32";
+
+  // 8 and 16 bit integers only need to have natural alignment, but try to
+  // align them to 32 bits. 64 bit integers have natural alignment.
+  Layout += "-i8:8:32-i16:16:32-i64:64";
+
+  // 32 bit registers are always available and the stack is at least 64 bit
+  // aligned. For N64 and N32 the string also lists 64 bit registers and a
+  // 128 bit aligned stack.
+  const bool Is64BitABI = ABI.IsN64() || ABI.IsN32();
+  Layout += Is64BitABI ? "-n32:64-S128" : "-n32-S64";
+
+  return Layout;
+}
+
+/// Picks the relocation model to use: the explicitly requested \p RM when one
+/// was given and the code model is not JITDefault, Reloc::Static otherwise.
+static Reloc::Model getEffectiveRelocModel(CodeModel::Model CM,
+                                           Optional<Reloc::Model> RM) {
+  const bool HonorRequest = RM.hasValue() && CM != CodeModel::JITDefault;
+  if (HonorRequest)
+    return *RM;
+  return Reloc::Static;
+}
+
+// Builds the state shared by the big- and little-endian Mips target machines.
+//
+// The LLVMTargetMachine base receives the data layout string produced by
+// computeDataLayout() and the relocation model chosen by
+// getEffectiveRelocModel(). The ABI is computed from the triple, the CPU and
+// the MC options. Three subtargets are constructed up front: one using the
+// feature string as given, one with "-mips16" added to it and one with
+// "+mips16" added to it. Subtarget is null in the initializer list and is
+// pointed at DefaultSubtarget in the body, before initAsmInfo() is called.
+//
+// NOTE(review): the remarks below about StackGrowsUp and CodeModel::Large do
+// not correspond to anything this constructor does - confirm whether they
+// are still accurate.
+//
+// On function prologue, the stack is created by decrementing
+// its pointer. Once decremented, all references are done with positive
+// offset from the stack/frame pointer, using StackGrowsUp enables
+// an easier handling.
+// Using CodeModel::Large enables different CALL behavior.
+MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL,
+ bool isLittle)
+ : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
+ CPU, FS, Options, getEffectiveRelocModel(CM, RM), CM,
+ OL),
+ isLittle(isLittle), TLOF(make_unique<MipsTargetObjectFile>()),
+ ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)),
+ // The subtargets below are handed *this while it is still being built.
+ Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this),
+ NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
+ isLittle, *this),
+ Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16",
+ isLittle, *this) {
+ // Start out on the default subtarget.
+ Subtarget = &DefaultSubtarget;
+ initAsmInfo();
+}
+
+/// Out-of-line destructor; nothing beyond member destruction is required.
+MipsTargetMachine::~MipsTargetMachine() = default;
+
+/// Empty out-of-line definition of the virtual anchor() method.
+void MipsebTargetMachine::anchor() {}
+
+/// Builds the big-endian machine by forwarding every argument to
+/// MipsTargetMachine with isLittle set to false.
+MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         Optional<Reloc::Model> RM,
+                                         CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+    : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL,
+                        /*isLittle=*/false) {}
+
+/// Empty out-of-line definition of the virtual anchor() method.
+void MipselTargetMachine::anchor() {}
+
+/// Builds the little-endian machine by forwarding every argument to
+/// MipsTargetMachine with isLittle set to true.
+MipselTargetMachine::MipselTargetMachine(const Target &T, const Triple &TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         Optional<Reloc::Model> RM,
+                                         CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+    : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL,
+                        /*isLittle=*/true) {}
+
+/// Returns the subtarget matching the attributes of \p F, creating and caching
+/// it in SubtargetMap on first use.
+///
+/// The CPU and feature string come from the "target-cpu" / "target-features"
+/// function attributes when present, else from the machine-wide TargetCPU /
+/// TargetFS. "mips16" (or, failing that, "nomips16") and a "use-soft-float"
+/// attribute equal to "true" are folded into the feature string. The cache
+/// key is the CPU name followed by the feature string.
+const MipsSubtarget *
+MipsTargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU = TargetCPU;
+  if (!CPUAttr.hasAttribute(Attribute::None))
+    CPU = CPUAttr.getValueAsString().str();
+
+  std::string FS = TargetFS;
+  if (!FSAttr.hasAttribute(Attribute::None))
+    FS = FSAttr.getValueAsString().str();
+
+  // Appends one feature, comma-separated from whatever is already there.
+  auto AppendFeature = [&FS](const char *Feature) {
+    if (!FS.empty())
+      FS += ",";
+    FS += Feature;
+  };
+
+  const bool WantsMips16 =
+      !F.getFnAttribute("mips16").hasAttribute(Attribute::None);
+  const bool WantsNoMips16 =
+      !F.getFnAttribute("nomips16").hasAttribute(Attribute::None);
+
+  // FIXME: This is related to the code below to reset the target options,
+  // we need to know whether or not the soft float flag is set on the
+  // function, so we can enable it as a subtarget feature.
+  const bool WantsSoftFloat =
+      F.hasFnAttribute("use-soft-float") &&
+      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+
+  // "mips16" takes precedence when both mips16 attributes are present.
+  if (WantsMips16)
+    AppendFeature("+mips16");
+  else if (WantsNoMips16)
+    AppendFeature("-mips16");
+  if (WantsSoftFloat)
+    AppendFeature("+soft-float");
+
+  std::unique_ptr<MipsSubtarget> &Slot = SubtargetMap[CPU + FS];
+  if (Slot)
+    return Slot.get();
+
+  // This needs to be done before we create a new subtarget since any
+  // creation will depend on the TM and the code generation flags on the
+  // function that reside in TargetOptions.
+  resetTargetOptions(F);
+  Slot = llvm::make_unique<MipsSubtarget>(TargetTriple, CPU, FS, isLittle,
+                                          *this);
+  return Slot.get();
+}
+
+/// Switches the machine's current subtarget to the one that matches the
+/// function behind \p MF and installs it on \p MF as well.
+void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
+  DEBUG(dbgs() << "resetSubtarget\n");
+
+  const MipsSubtarget *ForFunction = getSubtargetImpl(*MF->getFunction());
+  // The lookup hands back a const pointer; the member stores a mutable one.
+  Subtarget = const_cast<MipsSubtarget *>(ForFunction);
+  MF->setSubtarget(Subtarget);
+}
+
+namespace {
+/// Pass-pipeline configuration for the Mips code generator.
+class MipsPassConfig : public TargetPassConfig {
+public:
+  MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {
+    // The current implementation of long branch pass requires a scratch
+    // register ($at) to be available before branch instructions. Tail merging
+    // can break this requirement, so disable it when long branch pass is
+    // enabled.
+    const bool LongBranchPassEnabled =
+        getMipsSubtarget().enableLongBranchPass();
+    EnableTailMerge = !LongBranchPassEnabled;
+  }
+
+  /// The target machine, viewed as its Mips-specific type.
+  MipsTargetMachine &getMipsTargetMachine() const {
+    return getTM<MipsTargetMachine>();
+  }
+
+  /// The subtarget the target machine currently reports.
+  const MipsSubtarget &getMipsSubtarget() const {
+    const MipsTargetMachine &Machine = getMipsTargetMachine();
+    return *Machine.getSubtargetImpl();
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addPreRegAlloc() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+/// Creates the Mips pass configuration for \p PM; the result is a fresh heap
+/// object returned as a raw pointer.
+TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
+  TargetPassConfig *Config = new MipsPassConfig(this, PM);
+  return Config;
+}
+
+/// Adds the generic IR passes, then atomic expansion, then the Os16 and
+/// Mips16 hard-float passes when the subtarget asks for them.
+void MipsPassConfig::addIRPasses() {
+  TargetPassConfig::addIRPasses();
+
+  MipsTargetMachine &Machine = getMipsTargetMachine();
+  addPass(createAtomicExpandPass(&Machine));
+
+  const MipsSubtarget &ST = getMipsSubtarget();
+  if (ST.os16())
+    addPass(createMipsOs16Pass(Machine));
+  if (ST.inMips16HardFloat())
+    addPass(createMips16HardFloatPass(Machine));
+}
+// Installs the instruction selection passes that use the ISelDag to generate
+// Mips code: the module-level pass first, then the Mips16 and MipsSE
+// selectors. Always returns false.
+bool MipsPassConfig::addInstSelector() {
+  MipsTargetMachine &Machine = getMipsTargetMachine();
+
+  addPass(createMipsModuleISelDagPass(Machine));
+  addPass(createMips16ISelDag(Machine, getOptLevel()));
+  addPass(createMipsSEISelDag(Machine, getOptLevel()));
+  return false;
+}
+
+/// Schedules the PIC call optimization pass ahead of register allocation.
+void MipsPassConfig::addPreRegAlloc() {
+  MipsTargetMachine &Machine = getMipsTargetMachine();
+  addPass(createMipsOptimizePICCallPass(Machine));
+}
+
+/// Returns an analysis whose callback yields a BasicTTIImpl-backed
+/// TargetTransformInfo for each function, unless the current subtarget allows
+/// mixed Mips16/Mips32 code; in that case only a TargetTransformInfo built
+/// from the module's data layout is returned.
+TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    if (!Subtarget->allowMixed16_32()) {
+      DEBUG(errs() << "Target Transform Info Pass Added\n");
+      return TargetTransformInfo(BasicTTIImpl(this, F));
+    }
+
+    DEBUG(errs() << "No Target Transform Info Pass Added\n");
+    // FIXME: This is no longer necessary as the TTI returned is per-function.
+    return TargetTransformInfo(F.getParent()->getDataLayout());
+  });
+}
+
+// Adds the passes that run immediately before machine code is emitted:
+// delay slot filler, hazard schedule, long branch and constant island, in
+// that order.
+void MipsPassConfig::addPreEmitPass() {
+  MipsTargetMachine &Machine = getMipsTargetMachine();
+
+  // The delay slot filler pass can potentially create forbidden slot (FS)
+  // hazards for MIPSR6 which the hazard schedule pass (HSP) will fix. Any
+  // (new) pass that creates compact branches after the HSP must handle FS
+  // hazards itself or be pipelined before the HSP.
+  addPass(createMipsDelaySlotFillerPass(Machine));
+  addPass(createMipsHazardSchedule());
+
+  addPass(createMipsLongBranchPass(Machine));
+  addPass(createMipsConstantIslandPass());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
new file mode 100644
index 000000000000..e4cf17e2abd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
@@ -0,0 +1,96 @@
+//===-- MipsTargetMachine.h - Define TargetMachine for Mips -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
+#define LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
+
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+class MipsRegisterInfo;
+
+/// Common base of the big- and little-endian Mips target machines. Holds the
+/// selected ABI, the object-file lowering, three pre-built subtargets and a
+/// cache of subtargets created on demand per function.
+class MipsTargetMachine : public LLVMTargetMachine {
+  bool isLittle;
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  // Selected ABI
+  MipsABIInfo ABI;
+  // Subtarget currently in effect; updated by resetSubtarget().
+  MipsSubtarget *Subtarget;
+  MipsSubtarget DefaultSubtarget;
+  MipsSubtarget NoMips16Subtarget;
+  MipsSubtarget Mips16Subtarget;
+
+  // Subtargets created by getSubtargetImpl(const Function &); mutable because
+  // that lookup is const.
+  mutable StringMap<std::unique_ptr<MipsSubtarget>> SubtargetMap;
+
+public:
+  MipsTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                    StringRef FS, const TargetOptions &Options,
+                    Optional<Reloc::Model> RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL, bool isLittle);
+  ~MipsTargetMachine() override;
+
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  /// The subtarget currently in effect, or the default one when none is set.
+  const MipsSubtarget *getSubtargetImpl() const {
+    return Subtarget ? Subtarget : &DefaultSubtarget;
+  }
+
+  const MipsSubtarget *getSubtargetImpl(const Function &F) const override;
+
+  /// \brief Reset the subtarget for the Mips target.
+  void resetSubtarget(MachineFunction *MF);
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  bool isLittleEndian() const { return isLittle; }
+  const MipsABIInfo &getABI() const { return ABI; }
+};
+
+/// Mips32/64 big endian target machine.
+///
+/// Adds nothing to MipsTargetMachine beyond a constructor without the
+/// endianness flag and a virtual anchor() method.
+class MipsebTargetMachine : public MipsTargetMachine {
+ virtual void anchor();
+public:
+ /// Takes the same arguments as the MipsTargetMachine constructor, minus
+ /// the endianness flag.
+ MipsebTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+/// Mips32/64 little endian target machine.
+///
+/// Adds nothing to MipsTargetMachine beyond a constructor without the
+/// endianness flag and a virtual anchor() method.
+class MipselTargetMachine : public MipsTargetMachine {
+ virtual void anchor();
+public:
+ /// Takes the same arguments as the MipsTargetMachine constructor, minus
+ /// the endianness flag.
+ MipselTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
new file mode 100644
index 000000000000..fadab7806120
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -0,0 +1,150 @@
+//===-- MipsTargetObjectFile.cpp - Mips Object Files ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetObjectFile.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+// Largest object size, in bytes, that IsInSmallSection() accepts as small
+// data. Hidden option, defaults to 8.
+static cl::opt<unsigned>
+SSThreshold("mips-ssection-threshold", cl::Hidden,
+ cl::desc("Small data and bss section threshold size (default=8)"),
+ cl::init(8));
+
+// When false, globals with local linkage and constants are kept out of the
+// small sections. Hidden option, defaults to true.
+static cl::opt<bool>
+LocalSData("mlocal-sdata", cl::Hidden,
+ cl::desc("MIPS: Use gp_rel for object-local data."),
+ cl::init(true));
+
+// When false, external declarations and common-linkage globals are kept out
+// of the small sections. Hidden option, defaults to true.
+static cl::opt<bool>
+ExternSData("mextern-sdata", cl::Hidden,
+ cl::desc("MIPS: Use gp_rel for data that is not defined by the "
+ "current object."),
+ cl::init(true));
+
+/// Runs the generic ELF initialization, creates the ".sdata" and ".sbss"
+/// sections (both writable, allocated and GP-relative) and remembers the
+/// Mips target machine for later use by getSectionForConstant().
+void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+
+  // Both small sections carry the same set of flags.
+  const unsigned SmallSectionFlags =
+      ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL;
+
+  SmallDataSection = getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
+                                                SmallSectionFlags);
+  SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                                               SmallSectionFlags);
+
+  // The parameter shadows the member, hence the explicit this->.
+  this->TM = &static_cast<const MipsTargetMachine &>(TM);
+}
+
+// An object qualifies for a small section when its size is non-zero and does
+// not exceed the small section size threshold. Data in such a section must be
+// addressed using the gp_rel operator.
+static bool IsInSmallSection(uint64_t Size) {
+  // gcc has traditionally not treated zero-sized objects as small data, so this
+  // is effectively part of the ABI.
+  if (Size == 0)
+    return false;
+  return Size <= SSThreshold;
+}
+
+/// Decides whether the global \p GO belongs in the small data/bss section.
+/// Definitions are additionally filtered by their section kind; declarations
+/// and available_externally globals are not.
+bool MipsTargetObjectFile::IsGlobalInSmallSection(
+    const GlobalObject *GO, const TargetMachine &TM) const {
+  // getKindForGlobal() is only allowed for global definitions, so anything
+  // without a definition in this module takes the kind-agnostic path.
+  const bool HasNoLocalDefinition =
+      GO->isDeclaration() || GO->hasAvailableExternallyLinkage();
+  if (HasNoLocalDefinition)
+    return IsGlobalInSmallSectionImpl(GO, TM);
+
+  const SectionKind Kind = getKindForGlobal(GO, TM);
+  return IsGlobalInSmallSection(GO, TM, Kind);
+}
+
+/// Decides whether the global \p GO, whose section kind is already known to
+/// be \p Kind, belongs in the small data/bss section: it has to pass the
+/// kind-agnostic check and be data, BSS or common.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSection(const GlobalObject *GO, const TargetMachine &TM,
+                       SectionKind Kind) const {
+  if (!IsGlobalInSmallSectionImpl(GO, TM))
+    return false;
+
+  return Kind.isData() || Kind.isBSS() || Kind.isCommon();
+}
+
+/// Kind-agnostic part of the small-section decision for \p GO. Checks, in
+/// order: small sections are enabled on the current subtarget, \p GO is a
+/// global variable, the -mlocal-sdata / -mextern-sdata switches do not rule it
+/// out, and its allocation size is within the threshold.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSectionImpl(const GlobalObject *GO,
+                           const TargetMachine &TM) const {
+  const MipsTargetMachine &MipsTM = static_cast<const MipsTargetMachine &>(TM);
+  const MipsSubtarget &ST = *MipsTM.getSubtargetImpl();
+
+  // Return if small section is not available.
+  if (!ST.useSmallSection())
+    return false;
+
+  // Only global variables, not functions.
+  const GlobalVariable *Var = dyn_cast<GlobalVariable>(GO);
+  if (!Var)
+    return false;
+
+  // Enforce -mlocal-sdata.
+  if (!LocalSData && Var->hasLocalLinkage())
+    return false;
+
+  // Enforce -mextern-sdata.
+  const bool IsExternalDecl =
+      Var->hasExternalLinkage() && Var->isDeclaration();
+  if (!ExternSData && (IsExternalDecl || Var->hasCommonLinkage()))
+    return false;
+
+  const DataLayout &DL = Var->getParent()->getDataLayout();
+  return IsInSmallSection(DL.getTypeAllocSize(Var->getValueType()));
+}
+
+/// Chooses the section for the global \p GO of kind \p Kind: the small BSS or
+/// small data section when it qualifies, otherwise whatever the generic ELF
+/// lowering selects.
+MCSection *MipsTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  // TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*"
+  // sections?
+
+  // Handle Small Section classification here.
+  if (Kind.isBSS()) {
+    if (IsGlobalInSmallSection(GO, TM, Kind))
+      return SmallBSSSection;
+  }
+  if (Kind.isData()) {
+    if (IsGlobalInSmallSection(GO, TM, Kind))
+      return SmallDataSection;
+  }
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+/// Decides whether the constant \p CN belongs in the small data section:
+/// small sections must be enabled on the current subtarget, -mlocal-sdata
+/// must be on, and the allocation size of the constant's type must be within
+/// the threshold.
+bool MipsTargetObjectFile::IsConstantInSmallSection(
+    const DataLayout &DL, const Constant *CN, const TargetMachine &TM) const {
+  const MipsTargetMachine &MipsTM = static_cast<const MipsTargetMachine &>(TM);
+  if (!MipsTM.getSubtargetImpl()->useSmallSection())
+    return false;
+  if (!LocalSData)
+    return false;
+
+  return IsInSmallSection(DL.getTypeAllocSize(CN->getType()));
+}
+
+/// Chooses the section for the constant \p C: the small data section when
+/// IsConstantInSmallSection() accepts it, otherwise whatever the generic ELF
+/// lowering selects. Uses the target machine recorded by Initialize().
+MCSection *MipsTargetObjectFile::getSectionForConstant(const DataLayout &DL,
+                                                       SectionKind Kind,
+                                                       const Constant *C,
+                                                       unsigned &Align) const {
+  if (!IsConstantInSmallSection(DL, C, *TM)) {
+    // Otherwise, we work the same as ELF.
+    return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C,
+                                                              Align);
+  }
+
+  return SmallDataSection;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
new file mode 100644
index 000000000000..e5423f9578a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
@@ -0,0 +1,48 @@
+//===-- llvm/Target/MipsTargetObjectFile.h - Mips Object Info ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_MIPS_MIPSTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+class MipsTargetMachine;
+ /// ELF object-file lowering for Mips. On top of the generic ELF behaviour
+ /// it can place small globals and constants into the ".sdata" / ".sbss"
+ /// sections created by Initialize().
+ class MipsTargetObjectFile : public TargetLoweringObjectFileELF {
+ // The ".sdata" and ".sbss" sections; both are set by Initialize().
+ MCSection *SmallDataSection;
+ MCSection *SmallBSSSection;
+ // Target machine recorded by Initialize().
+ const MipsTargetMachine *TM;
+
+ // Small-section test for a global whose section kind is already known.
+ bool IsGlobalInSmallSection(const GlobalObject *GO, const TargetMachine &TM,
+ SectionKind Kind) const;
+ // Small-section test that does everything except the section-kind check.
+ bool IsGlobalInSmallSectionImpl(const GlobalObject *GO,
+ const TargetMachine &TM) const;
+ public:
+
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+ /// Return true if this global address should be placed into small data/bss
+ /// section.
+ bool IsGlobalInSmallSection(const GlobalObject *GO,
+ const TargetMachine &TM) const;
+
+ /// Return the small data/bss section for a qualifying global, otherwise
+ /// the section the generic ELF lowering selects.
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
+
+ /// Return true if this constant should be placed into small data section.
+ bool IsConstantInSmallSection(const DataLayout &DL, const Constant *CN,
+ const TargetMachine &TM) const;
+
+ /// Return the small data section for a qualifying constant, otherwise the
+ /// section the generic ELF lowering selects.
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const override;
+ };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
new file mode 100644
index 000000000000..41ebe411b98d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -0,0 +1,324 @@
+//===-- MipsTargetStreamer.h - Mips Target Streamer ------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSTARGETSTREAMER_H
+
+#include "MCTargetDesc/MipsABIFlagsSection.h"
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+struct MipsABIFlagsSection;
+
+class MipsTargetStreamer : public MCTargetStreamer { // Base for Asm/ELF.
+public:
+ MipsTargetStreamer(MCStreamer &S);
+ // PIC hook: the base implementation ignores Value.
+ virtual void setPic(bool Value) {}
+ // Virtual directive emitters; overridden by the Asm and ELF streamers.
+ virtual void emitDirectiveSetMicroMips();
+ virtual void emitDirectiveSetNoMicroMips();
+ virtual void setUsesMicroMips();
+ virtual void emitDirectiveSetMips16();
+ virtual void emitDirectiveSetNoMips16();
+
+ virtual void emitDirectiveSetReorder();
+ virtual void emitDirectiveSetNoReorder();
+ virtual void emitDirectiveSetMacro();
+ virtual void emitDirectiveSetNoMacro();
+ virtual void emitDirectiveSetMsa();
+ virtual void emitDirectiveSetNoMsa();
+ virtual void emitDirectiveSetAt();
+ virtual void emitDirectiveSetAtWithArg(unsigned RegNo);
+ virtual void emitDirectiveSetNoAt();
+ virtual void emitDirectiveEnd(StringRef Name);
+
+ virtual void emitDirectiveEnt(const MCSymbol &Symbol);
+ virtual void emitDirectiveAbiCalls();
+ virtual void emitDirectiveNaN2008();
+ virtual void emitDirectiveNaNLegacy();
+ virtual void emitDirectiveOptionPic0();
+ virtual void emitDirectiveOptionPic2();
+ virtual void emitDirectiveInsn();
+ virtual void emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg);
+ virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
+ virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
+
+ virtual void emitDirectiveSetArch(StringRef Arch);
+ virtual void emitDirectiveSetMips0();
+ virtual void emitDirectiveSetMips1();
+ virtual void emitDirectiveSetMips2();
+ virtual void emitDirectiveSetMips3();
+ virtual void emitDirectiveSetMips4();
+ virtual void emitDirectiveSetMips5();
+ virtual void emitDirectiveSetMips32();
+ virtual void emitDirectiveSetMips32R2();
+ virtual void emitDirectiveSetMips32R3();
+ virtual void emitDirectiveSetMips32R5();
+ virtual void emitDirectiveSetMips32R6();
+ virtual void emitDirectiveSetMips64();
+ virtual void emitDirectiveSetMips64R2();
+ virtual void emitDirectiveSetMips64R3();
+ virtual void emitDirectiveSetMips64R5();
+ virtual void emitDirectiveSetMips64R6();
+ virtual void emitDirectiveSetDsp();
+ virtual void emitDirectiveSetNoDsp();
+ virtual void emitDirectiveSetPop();
+ virtual void emitDirectiveSetPush();
+ virtual void emitDirectiveSetSoftFloat();
+ virtual void emitDirectiveSetHardFloat();
+
+ // PIC support
+ virtual void emitDirectiveCpLoad(unsigned RegNo);
+ virtual bool emitDirectiveCpRestore(int Offset,
+ function_ref<unsigned()> GetATReg,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
+ virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+ const MCSymbol &Sym, bool IsReg);
+ virtual void emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister);
+
+ // FP abiflags directives
+ virtual void emitDirectiveModuleFP();
+ virtual void emitDirectiveModuleOddSPReg();
+ virtual void emitDirectiveModuleSoftFloat();
+ virtual void emitDirectiveModuleHardFloat();
+ virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value);
+ virtual void emitDirectiveSetOddSPReg();
+ virtual void emitDirectiveSetNoOddSPReg();
+ // Non-virtual emit helpers; all are defined out of line.
+ void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, MCOperand Op2,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
+ void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
+ void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
+ void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit,
+ const MCSubtargetInfo *STI);
+ void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
+ void emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI);
+
+ /// Emit a store instruction with an offset. If the offset is out of range
+ /// then it will be synthesized using the assembler temporary.
+ ///
+ /// GetATReg() is a callback that can be used to obtain the current assembler
+ /// temporary and is only called when the assembler temporary is required. It
+ /// must handle the case where no assembler temporary is available (typically
+ /// by reporting an error).
+ void emitStoreWithImmOffset(unsigned Opcode, unsigned SrcReg,
+ unsigned BaseReg, int64_t Offset,
+ function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitStoreWithSymOffset(unsigned Opcode, unsigned SrcReg,
+ unsigned BaseReg, MCOperand &HiOperand,
+ MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg,
+ int64_t Offset, unsigned TmpReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg,
+ MCOperand &HiOperand, MCOperand &LoOperand,
+ unsigned ATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitGPRestore(int Offset, SMLoc IDLoc, const MCSubtargetInfo *STI);
+ // Toggle and query the ModuleDirectiveAllowed flag; the query is const.
+ void forbidModuleDirective() { ModuleDirectiveAllowed = false; }
+ void reallowModuleDirective() { ModuleDirectiveAllowed = true; }
+ bool isModuleDirectiveAllowed() const { return ModuleDirectiveAllowed; }
+
+ // This method enables template classes to set internal abi flags
+ // structure values.
+ template <class PredicateLibrary>
+ void updateABIInfo(const PredicateLibrary &P) {
+ ABI = P.getABI();
+ ABIFlagsSection.setAllFromPredicates(P);
+ }
+ // Accessors; getABI() asserts that ABI has been set before use.
+ MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; }
+ const MipsABIInfo &getABI() const {
+ assert(ABI.hasValue() && "ABI hasn't been set!");
+ return *ABI;
+ }
+
+protected:
+ llvm::Optional<MipsABIInfo> ABI; // Assigned by updateABIInfo().
+ MipsABIFlagsSection ABIFlagsSection; // Filled by updateABIInfo().
+
+ bool GPRInfoSet;
+ unsigned GPRBitMask;
+ int GPROffset;
+
+ bool FPRInfoSet;
+ unsigned FPRBitMask;
+ int FPROffset;
+
+ bool FrameInfoSet;
+ int FrameOffset;
+ unsigned FrameReg;
+ unsigned ReturnReg;
+
+private:
+ bool ModuleDirectiveAllowed; // See forbid/reallowModuleDirective().
+};
+
+// Target streamer for ascii assembly output; keeps a reference to OS.
+class MipsTargetAsmStreamer : public MipsTargetStreamer {
+ formatted_raw_ostream &OS; // Presumably the constructor's OS argument.
+
+public:
+ MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+ void emitDirectiveSetMicroMips() override;
+ void emitDirectiveSetNoMicroMips() override;
+ void emitDirectiveSetMips16() override;
+ void emitDirectiveSetNoMips16() override;
+
+ void emitDirectiveSetReorder() override;
+ void emitDirectiveSetNoReorder() override;
+ void emitDirectiveSetMacro() override;
+ void emitDirectiveSetNoMacro() override;
+ void emitDirectiveSetMsa() override;
+ void emitDirectiveSetNoMsa() override;
+ void emitDirectiveSetAt() override;
+ void emitDirectiveSetAtWithArg(unsigned RegNo) override;
+ void emitDirectiveSetNoAt() override;
+ void emitDirectiveEnd(StringRef Name) override;
+
+ void emitDirectiveEnt(const MCSymbol &Symbol) override;
+ void emitDirectiveAbiCalls() override;
+ void emitDirectiveNaN2008() override;
+ void emitDirectiveNaNLegacy() override;
+ void emitDirectiveOptionPic0() override;
+ void emitDirectiveOptionPic2() override;
+ void emitDirectiveInsn() override;
+ void emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg) override;
+ void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
+ void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
+
+ void emitDirectiveSetArch(StringRef Arch) override;
+ void emitDirectiveSetMips0() override;
+ void emitDirectiveSetMips1() override;
+ void emitDirectiveSetMips2() override;
+ void emitDirectiveSetMips3() override;
+ void emitDirectiveSetMips4() override;
+ void emitDirectiveSetMips5() override;
+ void emitDirectiveSetMips32() override;
+ void emitDirectiveSetMips32R2() override;
+ void emitDirectiveSetMips32R3() override;
+ void emitDirectiveSetMips32R5() override;
+ void emitDirectiveSetMips32R6() override;
+ void emitDirectiveSetMips64() override;
+ void emitDirectiveSetMips64R2() override;
+ void emitDirectiveSetMips64R3() override;
+ void emitDirectiveSetMips64R5() override;
+ void emitDirectiveSetMips64R6() override;
+ void emitDirectiveSetDsp() override;
+ void emitDirectiveSetNoDsp() override;
+ void emitDirectiveSetPop() override;
+ void emitDirectiveSetPush() override;
+ void emitDirectiveSetSoftFloat() override;
+ void emitDirectiveSetHardFloat() override;
+
+ // PIC support
+ void emitDirectiveCpLoad(unsigned RegNo) override;
+
+ /// Emit a .cprestore directive. If the offset is out of range then it will
+ /// be synthesized using the assembler temporary.
+ ///
+ /// GetATReg() is a callback that can be used to obtain the current assembler
+ /// temporary and is only called when the assembler temporary is required. It
+ /// must handle the case where no assembler temporary is available (typically
+ /// by reporting an error).
+ bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg,
+ SMLoc IDLoc, const MCSubtargetInfo *STI) override;
+ void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+ const MCSymbol &Sym, bool IsReg) override;
+ void emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister) override;
+
+ // FP abiflags directives
+ void emitDirectiveModuleFP() override;
+ void emitDirectiveModuleOddSPReg() override;
+ void emitDirectiveModuleSoftFloat() override;
+ void emitDirectiveModuleHardFloat() override;
+ void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override;
+ void emitDirectiveSetOddSPReg() override;
+ void emitDirectiveSetNoOddSPReg() override;
+};
+
+// Target streamer for ELF object output; overrides a subset of the emitters.
+class MipsTargetELFStreamer : public MipsTargetStreamer {
+ bool MicroMipsEnabled; // Exposed read-only via isMicroMipsEnabled().
+ const MCSubtargetInfo &STI; // Presumably the constructor's STI argument.
+ bool Pic; // Updated by setPic().
+
+public:
+ bool isMicroMipsEnabled() const { return MicroMipsEnabled; }
+ MCELFStreamer &getStreamer();
+ MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+ void setPic(bool Value) override { Pic = Value; }
+
+ void emitLabel(MCSymbol *Symbol) override;
+ void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
+ void finish() override;
+
+ void emitDirectiveSetMicroMips() override;
+ void emitDirectiveSetNoMicroMips() override;
+ void setUsesMicroMips() override;
+ void emitDirectiveSetMips16() override;
+
+ void emitDirectiveSetNoReorder() override;
+ void emitDirectiveEnd(StringRef Name) override;
+
+ void emitDirectiveEnt(const MCSymbol &Symbol) override;
+ void emitDirectiveAbiCalls() override;
+ void emitDirectiveNaN2008() override;
+ void emitDirectiveNaNLegacy() override;
+ void emitDirectiveOptionPic0() override;
+ void emitDirectiveOptionPic2() override;
+ void emitDirectiveInsn() override;
+ void emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg) override;
+ void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
+ void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
+
+ // PIC support
+ void emitDirectiveCpLoad(unsigned RegNo) override;
+ bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg,
+ SMLoc IDLoc, const MCSubtargetInfo *STI) override;
+ void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+ const MCSymbol &Sym, bool IsReg) override;
+ void emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister) override;
+
+ void emitMipsAbiFlags(); // Not an override; declared only on this class.
+};
+}
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
new file mode 100644
index 000000000000..4c1edfaaaeca
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -0,0 +1,48 @@
+//===-- MipsTargetInfo.cpp - Mips Target Implementation -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheMipsTarget() {
+  static Target MipsInstance; // Function-local static: same object each call.
+  return MipsInstance;
+}
+Target &llvm::getTheMipselTarget() {
+  static Target MipselInstance; // Function-local static: same object each call.
+  return MipselInstance;
+}
+Target &llvm::getTheMips64Target() {
+  static Target Mips64Instance; // Function-local static: same object each call.
+  return Mips64Instance;
+}
+Target &llvm::getTheMips64elTarget() {
+  static Target Mips64elInstance; // Function-local static: same object each call.
+  return Mips64elInstance;
+}
+
+extern "C" void LLVMInitializeMipsTargetInfo() {
+  // Declare one RegisterTarget helper per Mips triple. Every variant is
+  // declared with HasJIT set to true; the 64-bit descriptions carry an
+  // "[experimental]" suffix.
+
+  RegisterTarget<Triple::mips, /*HasJIT=*/true> MipsReg(
+      getTheMipsTarget(), "mips", "Mips");
+
+  RegisterTarget<Triple::mipsel, /*HasJIT=*/true> MipselReg(
+      getTheMipselTarget(), "mipsel", "Mipsel");
+
+  RegisterTarget<Triple::mips64, /*HasJIT=*/true> Mips64Reg(
+      getTheMips64Target(), "mips64", "Mips64 [experimental]");
+
+  RegisterTarget<Triple::mips64el, /*HasJIT=*/true> Mips64elReg(
+      getTheMips64elTarget(), "mips64el", "Mips64el [experimental]");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
new file mode 100644
index 000000000000..4594c22b8701
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -0,0 +1,286 @@
+//===-- NVPTXInstPrinter.cpp - PTX assembly instruction printing ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Print MCInst instructions to .ptx format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTX.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "NVPTXGenAsmWriter.inc"
+
+NVPTXInstPrinter::NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {} // Forwards everything to the base class.
+
+void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  // The bits above bit 27 of RegNo select the register class and the low 28
+  // bits hold the virtual register number. This decoding must be kept in
+  // sync with NVPTXAsmPrinter::encodeVirtualRegister.
+  const char *ClassPrefix = nullptr;
+  switch (RegNo >> 28) {
+  default:
+    report_fatal_error("Bad virtual register encoding");
+  case 0:
+    // Class id 0 is actually a physical register: defer to the autogenerated
+    // register printer and emit no numeric suffix.
+    OS << getRegisterName(RegNo);
+    return;
+  case 1:
+    ClassPrefix = "%p";
+    break;
+  case 2:
+    ClassPrefix = "%rs";
+    break;
+  case 3:
+    ClassPrefix = "%r";
+    break;
+  case 4:
+    ClassPrefix = "%rd";
+    break;
+  case 5:
+    ClassPrefix = "%f";
+    break;
+  case 6:
+    ClassPrefix = "%fd";
+    break;
+  }
+  OS << ClassPrefix << (RegNo & 0x0FFFFFFF);
+}
+
+void NVPTXInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                 StringRef Annot, const MCSubtargetInfo &STI) {
+  // STI is not consulted. The instruction text comes from printInstruction;
+  // the annotation is then always handed to printAnnotation.
+  printInstruction(MI, OS);
+  printAnnotation(OS, Annot);
+}
+
+void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  const MCOperand &Operand = MI->getOperand(OpNo);
+  if (Operand.isReg())
+    return printRegName(O, Operand.getReg());
+  if (Operand.isImm()) {
+    O << markup("<imm:") << formatImm(Operand.getImm()) << markup(">");
+    return;
+  }
+  // Anything that is neither a register nor an immediate must be an expr.
+  assert(Operand.isExpr() && "Unknown operand kind in printOperand");
+  Operand.getExpr()->print(O, &MAI);
+}
+
+void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  // The declaration defaults Modifier to nullptr; reject that explicitly (as
+  // printLdStCode does) instead of passing a null pointer to strcmp.
+  if (!Modifier)
+    llvm_unreachable("Empty Modifier");
+  const int64_t Imm = MI->getOperand(OpNum).getImm();
+  if (strcmp(Modifier, "ftz") == 0) {
+    if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG) // FTZ flag
+      O << ".ftz";
+  } else if (strcmp(Modifier, "sat") == 0) {
+    if (Imm & NVPTX::PTXCvtMode::SAT_FLAG) // SAT flag
+      O << ".sat";
+  } else if (strcmp(Modifier, "base") == 0) {
+    // Base mode: NONE and unrecognised values print nothing.
+    switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCvtMode::NONE:
+      break;
+    case NVPTX::PTXCvtMode::RNI:
+      O << ".rni";
+      break;
+    case NVPTX::PTXCvtMode::RZI:
+      O << ".rzi";
+      break;
+    case NVPTX::PTXCvtMode::RMI:
+      O << ".rmi";
+      break;
+    case NVPTX::PTXCvtMode::RPI:
+      O << ".rpi";
+      break;
+    case NVPTX::PTXCvtMode::RN:
+      O << ".rn";
+      break;
+    case NVPTX::PTXCvtMode::RZ:
+      O << ".rz";
+      break;
+    case NVPTX::PTXCvtMode::RM:
+      O << ".rm";
+      break;
+    case NVPTX::PTXCvtMode::RP:
+      O << ".rp";
+      break;
+    }
+  } else {
+    llvm_unreachable("Invalid conversion modifier");
+  }
+}
+
+void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  // Modifier defaults to nullptr in the header; never hand that to strcmp.
+  if (!Modifier)
+    llvm_unreachable("Empty Modifier");
+  const int64_t Imm = MI->getOperand(OpNum).getImm();
+  if (strcmp(Modifier, "ftz") == 0) {
+    if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG) // FTZ flag
+      O << ".ftz";
+  } else if (strcmp(Modifier, "base") == 0) {
+    switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCmpMode::EQ:
+      O << ".eq";
+      break;
+    case NVPTX::PTXCmpMode::NE:
+      O << ".ne";
+      break;
+    case NVPTX::PTXCmpMode::LT:
+      O << ".lt";
+      break;
+    case NVPTX::PTXCmpMode::LE:
+      O << ".le";
+      break;
+    case NVPTX::PTXCmpMode::GT:
+      O << ".gt";
+      break;
+    case NVPTX::PTXCmpMode::GE:
+      O << ".ge";
+      break;
+    case NVPTX::PTXCmpMode::LO:
+      O << ".lo";
+      break;
+    case NVPTX::PTXCmpMode::LS:
+      O << ".ls";
+      break;
+    case NVPTX::PTXCmpMode::HI:
+      O << ".hi";
+      break;
+    case NVPTX::PTXCmpMode::HS:
+      O << ".hs";
+      break;
+    case NVPTX::PTXCmpMode::EQU:
+      O << ".equ";
+      break;
+    case NVPTX::PTXCmpMode::NEU:
+      O << ".neu";
+      break;
+    case NVPTX::PTXCmpMode::LTU:
+      O << ".ltu";
+      break;
+    case NVPTX::PTXCmpMode::LEU:
+      O << ".leu";
+      break;
+    case NVPTX::PTXCmpMode::GTU:
+      O << ".gtu";
+      break;
+    case NVPTX::PTXCmpMode::GEU:
+      O << ".geu";
+      break;
+    case NVPTX::PTXCmpMode::NUM:
+      O << ".num";
+      break;
+    case NVPTX::PTXCmpMode::NotANumber:
+      O << ".nan";
+      break;
+    }
+  } else {
+    llvm_unreachable("Invalid comparison modifier");
+  }
+}
+
+void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
+                                     raw_ostream &O, const char *Modifier) {
+  // A modifier is mandatory even though the declaration defaults it to null.
+  if (!Modifier)
+    llvm_unreachable("Empty Modifier");
+  const int Code = static_cast<int>(MI->getOperand(OpNum).getImm());
+  if (strcmp(Modifier, "volatile") == 0) {
+    if (Code)
+      O << ".volatile";
+  } else if (strcmp(Modifier, "addsp") == 0) {
+    switch (Code) {
+    case NVPTX::PTXLdStInstCode::GLOBAL:
+      O << ".global";
+      break;
+    case NVPTX::PTXLdStInstCode::SHARED:
+      O << ".shared";
+      break;
+    case NVPTX::PTXLdStInstCode::LOCAL:
+      O << ".local";
+      break;
+    case NVPTX::PTXLdStInstCode::PARAM:
+      O << ".param";
+      break;
+    case NVPTX::PTXLdStInstCode::CONSTANT:
+      O << ".const";
+      break;
+    case NVPTX::PTXLdStInstCode::GENERIC:
+      break;
+    default:
+      llvm_unreachable("Wrong Address Space");
+    }
+  } else if (strcmp(Modifier, "sign") == 0) {
+    if (Code == NVPTX::PTXLdStInstCode::Signed)
+      O << "s";
+    else if (Code == NVPTX::PTXLdStInstCode::Unsigned)
+      O << "u";
+    else
+      O << "f";
+  } else if (strcmp(Modifier, "vec") == 0) {
+    if (Code == NVPTX::PTXLdStInstCode::V2)
+      O << ".v2";
+    else if (Code == NVPTX::PTXLdStInstCode::V4)
+      O << ".v4";
+  } else {
+    llvm_unreachable("Unknown Modifier");
+  }
+}
+
+void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  printOperand(MI, OpNum, O);
+  const int OffsetIdx = OpNum + 1;
+  if (Modifier && strcmp(Modifier, "add") == 0) {
+    O << ", ";
+  } else {
+    // Without the "add" modifier a zero immediate offset is not printed.
+    const MCOperand &Offset = MI->getOperand(OffsetIdx);
+    if (Offset.isImm() && Offset.getImm() == 0)
+      return;
+    O << "+";
+  }
+  printOperand(MI, OffsetIdx, O);
+}
+
+void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  // Modifier is unused; the operand must be a symbol-reference expression.
+  const MCOperand &ProtoOp = MI->getOperand(OpNum);
+  assert(ProtoOp.isExpr() && "Call prototype is not an MCExpr?");
+  const auto *SymRef = cast<MCSymbolRefExpr>(ProtoOp.getExpr());
+  O << SymRef->getSymbol().getName();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
new file mode 100644
index 000000000000..f0f223aa057b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
@@ -0,0 +1,52 @@
+//= NVPTXInstPrinter.h - Convert NVPTX MCInst to assembly syntax --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an NVPTX MCInst to .ptx file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
+#define LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCSubtargetInfo;
+
+class NVPTXInstPrinter : public MCInstPrinter {
+public:
+ NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI);
+ // Overrides of the MCInstPrinter interface.
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+
+ // Autogenerated by tblgen (presumably via NVPTXGenAsmWriter.inc).
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+ // End
+ // Operand printers. Cvt/Cmp/LdStCode need a non-null Modifier.
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printLdStCode(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier = nullptr);
+ void printMemOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier = nullptr);
+ void printProtoIdent(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier = nullptr);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
new file mode 100644
index 000000000000..1cb92005979d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -0,0 +1,46 @@
+//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the NVPTX target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXBASEINFO_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXBASEINFO_H
+
+namespace llvm {
+// Address-space numbers; the values are explicit and not contiguous.
+enum AddressSpace {
+ ADDRESS_SPACE_GENERIC = 0,
+ ADDRESS_SPACE_GLOBAL = 1,
+ ADDRESS_SPACE_SHARED = 3,
+ ADDRESS_SPACE_CONST = 4,
+ ADDRESS_SPACE_LOCAL = 5,
+
+ // NVVM Internal
+ ADDRESS_SPACE_PARAM = 101
+};
+
+namespace NVPTXII {
+enum {
+ // These must be kept in sync with TSFlags in NVPTXInstrFormats.td
+ IsTexFlag = 0x80, // Bit 7.
+ IsSuldMask = 0x300, // Bits 8-9; pairs with IsSuldShift.
+ IsSuldShift = 8, // Position of the lowest IsSuldMask bit.
+ IsSustFlag = 0x400, // Bit 10.
+ IsSurfTexQueryFlag = 0x800, // Bit 11.
+ IsTexModeUnifiedFlag = 0x1000 // Bit 12.
+};
+} // namespace NVPTXII
+
+} // namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
new file mode 100644
index 000000000000..78bdf4e698d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -0,0 +1,59 @@
+//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the NVPTXMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// -debug-compile: hidden boolean option, false by default. Its value is
+// copied into SupportsDebugInformation by the NVPTXMCAsmInfo constructor.
+static cl::opt<bool> CompileForDebugging("debug-compile",
+ cl::desc("Compile for debugging"),
+ cl::Hidden, cl::init(false));
+// Out-of-line, intentionally empty definition of the class's anchor() method.
+void NVPTXMCAsmInfo::anchor() {}
+
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
+  // Only nvptx64 changes the pointer and callee-save slot sizes here.
+  if (TheTriple.getArch() == Triple::nvptx64) {
+    CalleeSaveStackSlotSize = 8;
+    PointerSize = 8;
+  }
+
+  CommentString = "//";
+  HasSingleParameterDotFile = false;
+  InlineAsmStart = " begin inline asm";
+  InlineAsmEnd = " end inline asm";
+  SupportsDebugInformation = CompileForDebugging;
+  // PTX does not allow .align on functions.
+  HasFunctionAlignment = false;
+  HasDotTypeDotSizeDirective = false;
+  // PTX does not allow .hidden or .protected
+  HiddenVisibilityAttr = MCSA_Invalid;
+  HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+  ProtectedVisibilityAttr = MCSA_Invalid;
+
+  Data8bitsDirective = " .b8 ";
+  Data16bitsDirective = " .b16 ";
+  Data32bitsDirective = " .b32 ";
+  Data64bitsDirective = " .b64 ";
+  ZeroDirective = " .b8";
+  AsciiDirective = " .b8";
+  AscizDirective = " .b8";
+
+  // @TODO: Can we just disable this?
+  WeakDirective = "\t// .weak\t";
+  GlobalDirective = "\t// .globl\t";
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
new file mode 100644
index 000000000000..9ac3c8850f75
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVPTXMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCASMINFO_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class Target;
+class Triple;
+
+class NVPTXMCAsmInfo : public MCAsmInfo {
+ virtual void anchor(); // Private; defined out of line in the .cpp file.
+ // The constructor takes the target triple; its body is in the .cpp file.
+public:
+ explicit NVPTXMCAsmInfo(const Triple &TheTriple);
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
new file mode 100644
index 000000000000..12f992749366
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -0,0 +1,79 @@
+//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCTargetDesc.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "NVPTXGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+static MCInstrInfo *createNVPTXMCInstrInfo() {
+  auto *InstrInfo = new MCInstrInfo(); // Heap-allocated; handed to the caller.
+  InitNVPTXMCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+static MCRegisterInfo *createNVPTXMCRegisterInfo(const Triple &TT) {
+  // PTX does not have a return address register, hence the 0 below.
+  auto *RegInfo = new MCRegisterInfo();
+  InitNVPTXMCRegisterInfo(RegInfo, 0);
+  return RegInfo;
+}
+// Thin forwarder to createNVPTXMCSubtargetInfoImpl (not defined in view).
+static MCSubtargetInfo *
+createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+ return createNVPTXMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCInstPrinter *
+createNVPTXMCInstPrinter(const Triple &T, unsigned SyntaxVariant,
+                         const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                         const MCRegisterInfo &MRI) {
+  // Only syntax variant 0 is supported; any other request yields nullptr.
+  if (SyntaxVariant != 0)
+    return nullptr;
+  return new NVPTXInstPrinter(MAI, MII, MRI);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXTargetMC() {
+  // Both NVPTX targets receive the same set of MC registrations.
+  Target *const Targets[] = {&getTheNVPTXTarget32(), &getTheNVPTXTarget64()};
+  for (Target *TheTarget : Targets) {
+    // MC asm info.
+    RegisterMCAsmInfo<NVPTXMCAsmInfo> AsmInfoReg(*TheTarget);
+    // MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*TheTarget, createNVPTXMCInstrInfo);
+    // MC register info.
+    TargetRegistry::RegisterMCRegInfo(*TheTarget, createNVPTXMCRegisterInfo);
+    // MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*TheTarget,
+                                            createNVPTXMCSubtargetInfo);
+    // MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*TheTarget,
+                                          createNVPTXMCInstPrinter);
+  }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
new file mode 100644
index 000000000000..0c9ad977e7ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -0,0 +1,38 @@
+//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCTARGETDESC_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCTARGETDESC_H
+
+#include <stdint.h>
+
+namespace llvm {
+class Target;
+
+Target &getTheNVPTXTarget32();
+Target &getTheNVPTXTarget64();
+
+} // End llvm namespace
+
+// Defines symbolic names for PTX registers.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the PTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "NVPTXGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
new file mode 100644
index 000000000000..a2d670f8d39d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
@@ -0,0 +1,48 @@
+//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The strings allocated from a managed string pool are owned by the string
+// pool and will be deleted together with the managed string pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H
+#define LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H
+
+#include "llvm/ADT/SmallVector.h"
+#include <string>
+
+namespace llvm {
+
+/// ManagedStringPool - The strings allocated from a managed string pool are
+/// owned by the string pool and will be deleted together with the managed
+/// string pool.
+///
+/// The pool stores raw owning pointers, so it is non-copyable: a copy would
+/// hold the same pointers and both destructors would delete them.
+class ManagedStringPool {
+  SmallVector<std::string *, 8> Pool;
+
+public:
+  ManagedStringPool() = default;
+  ManagedStringPool(const ManagedStringPool &) = delete;
+  ManagedStringPool &operator=(const ManagedStringPool &) = delete;
+
+  /// Delete every string handed out by getManagedString().
+  ~ManagedStringPool() {
+    for (std::string *Str : Pool)
+      delete Str;
+  }
+
+  /// Copy the C string \p S into a new std::string owned by this pool and
+  /// return a pointer to it. The pointer is deleted by the pool destructor.
+  std::string *getManagedString(const char *S) {
+    std::string *Str = new std::string(S);
+    Pool.push_back(Str);
+    return Str;
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
new file mode 100644
index 000000000000..c455a437d8d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
@@ -0,0 +1,178 @@
+//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM NVPTX back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTX_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTX_H
+
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <iosfwd>
+
+namespace llvm {
+class NVPTXTargetMachine;
+class FunctionPass;
+class MachineFunctionPass;
+class formatted_raw_ostream;
+
+// Condition codes. The enumerators carry no explicit values, so they are
+// numbered consecutively from EQ = 0 to GE = 5.
+namespace NVPTXCC {
+enum CondCodes {
+ EQ,
+ NE,
+ LT,
+ LE,
+ GT,
+ GE
+};
+}
+
+FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
+ llvm::CodeGenOpt::Level OptLevel);
+ModulePass *createNVPTXAssignValidGlobalNamesPass();
+ModulePass *createGenericToNVVMPass();
+FunctionPass *createNVPTXInferAddressSpacesPass();
+FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
+FunctionPass *createNVVMReflectPass();
+FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping);
+MachineFunctionPass *createNVPTXPrologEpilogPass();
+MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
+FunctionPass *createNVPTXImageOptimizerPass();
+FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
+BasicBlockPass *createNVPTXLowerAllocaPass();
+MachineFunctionPass *createNVPTXPeephole();
+
+Target &getTheNVPTXTarget32();
+Target &getTheNVPTXTarget64();
+
+namespace NVPTX {
+// Driver interface selector.
+// NOTE(review): presumably NVCL is the OpenCL driver and CUDA the CUDA
+// driver -- confirm against the target machine code.
+enum DrvInterface {
+ NVCL,
+ CUDA
+};
+
+// A field inside TSFlags needs a shift and a mask. The usage is
+// always as follows :
+// ((TSFlags & fieldMask) >> fieldShift)
+// The enum keeps the mask, the shift, and all valid values of the
+// field in one place.
+enum VecInstType {
+ VecInstTypeShift = 0,
+ VecInstTypeMask = 0xF,
+
+ VecNOP = 0,
+ VecLoad = 1,
+ VecStore = 2,
+ VecBuild = 3,
+ VecShuffle = 4,
+ VecExtract = 5,
+ VecInsert = 6,
+ VecDest = 7,
+ VecOther = 15
+};
+
+// Mask/shift pair selecting bit 4 (0x10).
+enum SimpleMove {
+ SimpleMoveMask = 0x10,
+ SimpleMoveShift = 4
+};
+// Mask/shift pairs selecting bit 5 (0x20, load) and bit 6 (0x40, store).
+enum LoadStore {
+ isLoadMask = 0x20,
+ isLoadShift = 5,
+ isStoreMask = 0x40,
+ isStoreShift = 6
+};
+
+namespace PTXLdStInstCode {
+// Address-space codes, numbered 0 (GENERIC) through 5 (LOCAL).
+enum AddressSpace {
+ GENERIC = 0,
+ GLOBAL = 1,
+ CONSTANT = 2,
+ SHARED = 3,
+ PARAM = 4,
+ LOCAL = 5
+};
+// Numbering continues from Unsigned = 0, so Signed = 1 and Float = 2.
+enum FromType {
+ Unsigned = 0,
+ Signed,
+ Float
+};
+// Each enumerator's value equals its element count (1, 2 or 4).
+enum VecType {
+ Scalar = 1,
+ V2 = 2,
+ V4 = 4
+};
+}
+
+/// PTXCvtMode - Conversion code enumeration
+namespace PTXCvtMode {
+enum CvtMode {
+ NONE = 0,
+ RNI,
+ RZI,
+ RMI,
+ RPI,
+ RN,
+ RZ,
+ RM,
+ RP,
+
+ // The modes above take values 0-8 and fit under BASE_MASK; FTZ_FLAG and
+ // SAT_FLAG are separate bits above the mask.
+ BASE_MASK = 0x0F,
+ FTZ_FLAG = 0x10,
+ SAT_FLAG = 0x20
+};
+}
+
+/// PTXCmpMode - Comparison mode enumeration
+namespace PTXCmpMode {
+enum CmpMode {
+ EQ = 0,
+ NE,
+ LT,
+ LE,
+ GT,
+ GE,
+ LO,
+ LS,
+ HI,
+ HS,
+ EQU,
+ NEU,
+ LTU,
+ LEU,
+ GTU,
+ GEU,
+ NUM,
+ // NAN is a macro, so this enumerator is spelled NotANumber instead.
+ NotANumber,
+
+ // The modes above take values 0-17 and fit under BASE_MASK; FTZ_FLAG is
+ // a separate bit above the mask.
+ BASE_MASK = 0xFF,
+ FTZ_FLAG = 0x100
+};
+}
+}
+} // end namespace llvm;
+
+// Defines symbolic names for NVPTX registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the NVPTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.td b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
new file mode 100644
index 000000000000..c77ddbc99789
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
@@ -0,0 +1,96 @@
+//===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the NVPTX target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "NVPTXRegisterInfo.td"
+include "NVPTXInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+// - We use the SM version number instead of explicit feature table.
+// - Need at least one feature to avoid generating zero sized array by
+// TableGen in NVPTXGenSubtargetInfo.inc.
+//===----------------------------------------------------------------------===//
+
+// SM Versions
+def SM20 : SubtargetFeature<"sm_20", "SmVersion", "20",
+ "Target SM 2.0">;
+def SM21 : SubtargetFeature<"sm_21", "SmVersion", "21",
+ "Target SM 2.1">;
+def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
+ "Target SM 3.0">;
+def SM32 : SubtargetFeature<"sm_32", "SmVersion", "32",
+ "Target SM 3.2">;
+def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
+ "Target SM 3.5">;
+def SM37 : SubtargetFeature<"sm_37", "SmVersion", "37",
+ "Target SM 3.7">;
+def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50",
+ "Target SM 5.0">;
+def SM52 : SubtargetFeature<"sm_52", "SmVersion", "52",
+ "Target SM 5.2">;
+def SM53 : SubtargetFeature<"sm_53", "SmVersion", "53",
+ "Target SM 5.3">;
+def SM60 : SubtargetFeature<"sm_60", "SmVersion", "60",
+ "Target SM 6.0">;
+def SM61 : SubtargetFeature<"sm_61", "SmVersion", "61",
+ "Target SM 6.1">;
+def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62",
+ "Target SM 6.2">;
+
+def SATOM : SubtargetFeature<"satom", "HasAtomScope", "true",
+ "Atomic operations with scope">;
+
+// PTX Versions
+def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
+ "Use PTX version 3.2">;
+def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40",
+ "Use PTX version 4.0">;
+def PTX41 : SubtargetFeature<"ptx41", "PTXVersion", "41",
+ "Use PTX version 4.1">;
+def PTX42 : SubtargetFeature<"ptx42", "PTXVersion", "42",
+ "Use PTX version 4.2">;
+def PTX43 : SubtargetFeature<"ptx43", "PTXVersion", "43",
+ "Use PTX version 4.3">;
+def PTX50 : SubtargetFeature<"ptx50", "PTXVersion", "50",
+ "Use PTX version 5.0">;
+
+//===----------------------------------------------------------------------===//
+// NVPTX supported processors.
+//===----------------------------------------------------------------------===//
+
+// Helper: an NVPTX processor is a Processor with NoItineraries and the
+// given feature list.
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+// One processor per SM version. Each enables its own SM feature; sm_60 and
+// newer also enable SATOM.
+// NOTE(review): sm_20, sm_21, sm_30 and sm_35 list no PTX feature, and the
+// PTX versions of the others do not increase monotonically with the SM
+// number (sm_37 -> PTX41 but sm_50 -> PTX40). Confirm this is intended and
+// where the default PTX version comes from.
+def : Proc<"sm_20", [SM20]>;
+def : Proc<"sm_21", [SM21]>;
+def : Proc<"sm_30", [SM30]>;
+def : Proc<"sm_32", [SM32, PTX40]>;
+def : Proc<"sm_35", [SM35]>;
+def : Proc<"sm_37", [SM37, PTX41]>;
+def : Proc<"sm_50", [SM50, PTX40]>;
+def : Proc<"sm_52", [SM52, PTX41]>;
+def : Proc<"sm_53", [SM53, PTX42]>;
+def : Proc<"sm_60", [SM60, PTX50, SATOM]>;
+def : Proc<"sm_61", [SM61, PTX50, SATOM]>;
+def : Proc<"sm_62", [SM62, PTX50, SATOM]>;
+
+def NVPTXInstrInfo : InstrInfo {
+}
+
+def NVPTX : Target {
+ let InstructionSet = NVPTXInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
new file mode 100644
index 000000000000..bed52293197d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -0,0 +1,70 @@
+//===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAllocaHoisting.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+using namespace llvm;
+
+namespace {
+// Hoisting the alloca instructions in the non-entry blocks to the entry
+// block.
+class NVPTXAllocaHoisting : public FunctionPass {
+public:
+ static char ID; // Pass ID
+ NVPTXAllocaHoisting() : FunctionPass(ID) {}
+
+ // Declares that this pass preserves the StackProtector analysis.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<StackProtector>();
+ }
+
+ // Human-readable name of the pass.
+ StringRef getPassName() const override {
+ return "NVPTX specific alloca hoisting";
+ }
+
+ // Defined out of line; returns true when at least one alloca was moved.
+ bool runOnFunction(Function &function) override;
+};
+} // namespace
+
+/// Move every alloca with a constant array size out of the non-entry blocks
+/// of \p function, placing it immediately before the terminator of the
+/// first block. Returns true if anything was moved.
+bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
+  Function::iterator BlockIt = function.begin();
+  TerminatorInst *EntryTerminator = BlockIt->getTerminator();
+  ++BlockIt; // The first block itself is never scanned.
+
+  bool Changed = false;
+  for (Function::iterator BlockEnd = function.end(); BlockIt != BlockEnd;
+       ++BlockIt) {
+    BasicBlock::iterator InstIt = BlockIt->begin();
+    BasicBlock::iterator InstEnd = BlockIt->end();
+    while (InstIt != InstEnd) {
+      // Advance before a possible move so iteration continues in this block.
+      AllocaInst *Alloca = dyn_cast<AllocaInst>(InstIt++);
+      if (!Alloca || !isa<ConstantInt>(Alloca->getArraySize()))
+        continue;
+      Alloca->moveBefore(EntryTerminator);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+// Storage for the pass identifier handed to the FunctionPass constructor.
+char NVPTXAllocaHoisting::ID = 0;
+
+// Forward declaration of the pass initializer.
+// NOTE(review): presumably defined by the INITIALIZE_PASS expansion below --
+// confirm.
+namespace llvm {
+void initializeNVPTXAllocaHoistingPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(
+ NVPTXAllocaHoisting, "alloca-hoisting",
+ "Hoisting alloca instructions in non-entry blocks to the entry block",
+ false, false)
+
+// Factory: returns a newly allocated NVPTXAllocaHoisting pass.
+FunctionPass *llvm::createAllocaHoisting() { return new NVPTXAllocaHoisting; }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
new file mode 100644
index 000000000000..7a6fc7d9b14d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -0,0 +1,23 @@
+//===-- AllocaHoisting.h - Hoist allocas to the entry block -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H
+
+namespace llvm {
+class FunctionPass;
+
+extern FunctionPass *createAllocaHoisting();
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
new file mode 100644
index 000000000000..04c8d5c0443e
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -0,0 +1,2387 @@
+//===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAsmPrinter.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXMCExpr.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
+#include "cl_common_defines.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <sstream>
+using namespace llvm;
+
+#define DEPOTNAME "__local_depot"
+
+// Hidden flag, on by default. emitLineNumberAsDotLoc returns immediately
+// when it is off.
+static cl::opt<bool>
+EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
+ cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
+ cl::init(true));
+
+// Hidden flag, off by default. When on, emitLineNumberAsDotLoc also calls
+// emitSrcInText for the current file and line.
+static cl::opt<bool>
+InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: Emit source line in ptx file"),
+ cl::init(false));
+
+namespace {
+/// DiscoverDependentGlobals - Collect into \p Globals every GlobalVariable
+/// reachable from \p V by following operands. The walk stops at each
+/// GlobalVariable it finds and does not descend into that variable's own
+/// operands.
+void DiscoverDependentGlobals(const Value *V,
+                              DenseSet<const GlobalVariable *> &Globals) {
+  if (const auto *GV = dyn_cast<GlobalVariable>(V)) {
+    Globals.insert(GV);
+    return;
+  }
+
+  const auto *U = dyn_cast<User>(V);
+  if (!U)
+    return;
+
+  for (unsigned Idx = 0, NumOps = U->getNumOperands(); Idx != NumOps; ++Idx)
+    DiscoverDependentGlobals(U->getOperand(Idx), Globals);
+}
+
+/// VisitGlobalVariableForEmission - Append \p GV to \p Order, but only after
+/// every GlobalVariable referenced by its operands has been appended.
+/// \p Visited holds the variables already placed in \p Order; \p Visiting
+/// holds the ones currently on the recursion stack. Reaching a variable
+/// that is already in \p Visiting is reported as a fatal error.
+void VisitGlobalVariableForEmission(
+    const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order,
+    DenseSet<const GlobalVariable *> &Visited,
+    DenseSet<const GlobalVariable *> &Visiting) {
+  // Already placed in Order: nothing to do.
+  if (Visited.count(GV) != 0)
+    return;
+
+  // A failed insert means GV is already in progress, i.e. we recursed back.
+  const bool FirstEntry = Visiting.insert(GV).second;
+  if (!FirstEntry)
+    report_fatal_error("Circular dependency found in global variable set");
+
+  // Gather the globals referenced by GV's operands and visit them first.
+  DenseSet<const GlobalVariable *> Dependencies;
+  for (unsigned Idx = 0, NumOps = GV->getNumOperands(); Idx != NumOps; ++Idx)
+    DiscoverDependentGlobals(GV->getOperand(Idx), Dependencies);
+
+  for (const GlobalVariable *Dep : Dependencies)
+    VisitGlobalVariableForEmission(Dep, Order, Visited, Visiting);
+
+  // Everything GV refers to is in Order; append GV and mark it finished.
+  Order.push_back(GV);
+  Visited.insert(GV);
+  Visiting.erase(GV);
+}
+}
+
+/// Emit a "\t.loc <filenameMap entry> <line> <col>" directive for \p MI when
+/// its debug location differs from the one last recorded in prevDebugLoc.
+/// Nothing is emitted when EmitLineNumbers is off, when ignoreLoc() accepts
+/// \p MI, when the location is empty or has no scope, or when the file name
+/// is not a key of filenameMap.
+void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
+  if (!EmitLineNumbers || ignoreLoc(MI))
+    return;
+
+  const DebugLoc &Loc = MI.getDebugLoc();
+
+  // Skip when there is still no location, or the location is unchanged.
+  if ((!prevDebugLoc && !Loc) || prevDebugLoc == Loc)
+    return;
+
+  // Record the new location even if nothing is printed for it below.
+  prevDebugLoc = Loc;
+  if (!Loc)
+    return;
+
+  auto *Scope = cast_or_null<DIScope>(Loc.getScope());
+  if (!Scope)
+    return;
+
+  // Build the lookup key: a non-absolute file name is appended to a
+  // non-empty directory.
+  StringRef FileKey(Scope->getFilename());
+  StringRef Directory(Scope->getDirectory());
+  SmallString<128> JoinedPath = Directory;
+  const bool NeedsJoin =
+      !Directory.empty() && !sys::path::is_absolute(FileKey);
+  if (NeedsJoin) {
+    sys::path::append(JoinedPath, FileKey);
+    FileKey = JoinedPath;
+  }
+
+  if (filenameMap.find(FileKey) == filenameMap.end())
+    return;
+
+  // Emit the line from the source file.
+  if (InterleaveSrc)
+    this->emitSrcInText(FileKey, Loc.getLine());
+
+  std::stringstream Directive;
+  Directive << "\t.loc " << filenameMap[FileKey] << " " << Loc.getLine()
+            << " " << Loc.getCol();
+  OutStreamer->EmitRawText(Directive.str());
+}
+
+/// Lower \p MI to an MCInst and hand it to the output streamer. When the
+/// target machine's driver interface is CUDA, emitLineNumberAsDotLoc is
+/// called for the instruction first.
+void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == NVPTX::CUDA)
+    emitLineNumberAsDotLoc(*MI);
+
+  MCInst Inst;
+  lowerToMCInst(MI, Inst);
+  EmitToStreamer(*OutStreamer, Inst);
+}
+
+// Handle symbol backtracking for targets that do not support image handles.
+// Returns true, after lowering the operand into \p MCOp through
+// lowerImageHandleSymbol, when operand \p OpNo of \p MI is an immediate in
+// the slot that holds a texref, samplerref or surfref for that kind of
+// instruction. Returns false otherwise and leaves \p MCOp untouched.
+bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI,
+                                              unsigned OpNo, MCOperand &MCOp) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  // First decide, from the instruction kind alone, whether OpNo is a slot
+  // that can carry an image handle.
+  bool IsHandleSlot = false;
+  if (MCID.TSFlags & NVPTXII::IsTexFlag) {
+    // This is a texture fetch, so operand 4 is a texref and operand 5 is
+    // a samplerref. Operand 5 is not lowered when the unified flag is set.
+    const bool IsUnified =
+        (MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag) != 0;
+    IsHandleSlot = OpNo == 4 || (OpNo == 5 && !IsUnified);
+  } else if (MCID.TSFlags & NVPTXII::IsSuldMask) {
+    // For a surface load of vector size N, the Nth operand will be the
+    // surfref.
+    unsigned VecSize =
+        1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) -
+              1);
+    IsHandleSlot = OpNo == VecSize;
+  } else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
+    // This is a surface store, so operand 0 is a surfref.
+    IsHandleSlot = OpNo == 0;
+  } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
+    // This is a query, so operand 1 is a surfref/texref.
+    IsHandleSlot = OpNo == 1;
+  }
+
+  if (!IsHandleSlot || !MO.isImm())
+    return false;
+
+  lowerImageHandleSymbol(MO.getImm(), MCOp);
+  return true;
+}
+
+/// Set \p MCOp to a reference to the symbol whose name is the image handle
+/// stored at \p Index in the current function's NVPTXMachineFunctionInfo.
+/// The name is first copied into the target machine's managed string pool.
+void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
+  // MF->getTarget() is cast to a non-const NVPTXTargetMachine.
+  // NOTE(review): presumably because getManagedStrPool() is non-const --
+  // confirm.
+  auto &NVTM = static_cast<NVPTXTargetMachine &>(
+      const_cast<TargetMachine &>(MF->getTarget()));
+
+  const auto *FuncInfo = MF->getInfo<NVPTXMachineFunctionInfo>();
+  const char *HandleName = FuncInfo->getImageHandleSymbol(Index);
+  std::string *PooledName =
+      NVTM.getManagedStrPool()->getManagedString(HandleName);
+
+  MCSymbol *HandleSym = OutContext.getOrCreateSymbol(StringRef(*PooledName));
+  MCOp = GetSymbolRef(HandleSym);
+}
+
+/// Translate \p MI into \p OutMI: copy the opcode, then lower each operand.
+/// CALL_PROTOTYPE is special-cased and contributes only its first operand.
+void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
+  OutMI.setOpcode(MI->getOpcode());
+
+  // Special: Do not mangle symbol operand of CALL_PROTOTYPE
+  if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) {
+    const char *ProtoName = MI->getOperand(0).getSymbolName();
+    MCSymbol *ProtoSym = OutContext.getOrCreateSymbol(Twine(ProtoName));
+    OutMI.addOperand(GetSymbolRef(ProtoSym));
+    return;
+  }
+
+  for (unsigned OpIdx = 0, NumOps = MI->getNumOperands(); OpIdx != NumOps;
+       ++OpIdx) {
+    MCOperand MCOp;
+    // When the subtarget has no image handles, try the image-handle lowering
+    // first; otherwise, or if that declines, use the generic lowering.
+    const bool Lowered = (!nvptxSubtarget->hasImageHandles() &&
+                          lowerImageHandleOperand(MI, OpIdx, MCOp)) ||
+                         lowerOperand(MI->getOperand(OpIdx), MCOp);
+    if (Lowered)
+      OutMI.addOperand(MCOp);
+  }
+}
+
+/// Convert machine operand \p MO into \p MCOp. Always returns true. Operand
+/// kinds without a case below reach llvm_unreachable, and an FP immediate
+/// that is neither float nor double is a fatal error.
+bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                   MCOperand &MCOp) {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    MCOp = MCOperand::createReg(encodeVirtualRegister(MO.getReg()));
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::createImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::createExpr(
+        MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = GetSymbolRef(GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = GetSymbolRef(getSymbol(MO.getGlobal()));
+    break;
+  case MachineOperand::MO_FPImmediate: {
+    const ConstantFP *FPConst = MO.getFPImm();
+    const APFloat &FPVal = FPConst->getValueAPF();
+    const Type::TypeID Kind = FPConst->getType()->getTypeID();
+
+    // FP immediates are wrapped in a target-specific expression node.
+    if (Kind == Type::FloatTyID)
+      MCOp = MCOperand::createExpr(
+          NVPTXFloatMCExpr::createConstantFPSingle(FPVal, OutContext));
+    else if (Kind == Type::DoubleTyID)
+      MCOp = MCOperand::createExpr(
+          NVPTXFloatMCExpr::createConstantFPDouble(FPVal, OutContext));
+    else
+      report_fatal_error("Unsupported FP type");
+    break;
+  }
+  default:
+    llvm_unreachable("unknown operand type");
+  }
+  return true;
+}
+
+/// Pack \p Reg into a 32-bit value. For a virtual register the upper 4 bits
+/// hold a register-class code (1-6) and the low 28 bits hold the number
+/// recorded for it in VRegMapping. Any other register keeps class code 0
+/// and its own number masked to 28 bits.
+unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
+  // Some special-use registers are actually physical registers.
+  // Encode this as the register class ID of 0 and the real register ID.
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return Reg & 0x0FFFFFFF;
+
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+  DenseMap<unsigned, unsigned> &ClassRegs = VRegMapping[RC];
+  unsigned MappedNum = ClassRegs[Reg];
+
+  // Encode the register class in the upper 4 bits
+  // Must be kept in sync with NVPTXInstPrinter::printRegName
+  unsigned ClassID = 0;
+  if (RC == &NVPTX::Int1RegsRegClass)
+    ClassID = 1;
+  else if (RC == &NVPTX::Int16RegsRegClass)
+    ClassID = 2;
+  else if (RC == &NVPTX::Int32RegsRegClass)
+    ClassID = 3;
+  else if (RC == &NVPTX::Int64RegsRegClass)
+    ClassID = 4;
+  else if (RC == &NVPTX::Float32RegsRegClass)
+    ClassID = 5;
+  else if (RC == &NVPTX::Float64RegsRegClass)
+    ClassID = 6;
+  else
+    report_fatal_error("Bad register class");
+
+  // Class code in the top nibble, mapped vreg number in the low 28 bits.
+  return (ClassID << 28) | (MappedNum & 0x0FFFFFFF);
+}
+
+/// Wrap \p Symbol in a VK_None symbol-reference expression and return it as
+/// an expression operand.
+MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
+  return MCOperand::createExpr(
+      MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext));
+}
+
+// Print the parenthesised return-value declaration for F to O. Nothing is
+// printed for a void return type. Otherwise the output is " (" ... ") ",
+// where the middle part depends on isABI (SM version >= 20):
+//  - ABI: a single ".param" named func_retval0, sized from the return type.
+//  - non-ABI: one ".reg" per scalar element, named func_retval<idx>.
+void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
+ const DataLayout &DL = getDataLayout();
+ const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
+
+ Type *Ty = F->getReturnType();
+
+ bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
+
+ if (Ty->getTypeID() == Type::VoidTyID)
+ return;
+
+ O << " (";
+
+ if (isABI) {
+ if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) {
+ unsigned size = 0;
+ if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
+ size = ITy->getBitWidth();
+ // Integers narrower than 32 bits are declared as 32-bit.
+ if (size < 32)
+ size = 32;
+ } else {
+ assert(Ty->isFloatingPointTy() && "Floating point type expected here");
+ size = Ty->getPrimitiveSizeInBits();
+ }
+
+ O << ".param .b" << size << " func_retval0";
+ } else if (isa<PointerType>(Ty)) {
+ // Pointers use the target's pointer width.
+ O << ".param .b" << TLI->getPointerTy(DL).getSizeInBits()
+ << " func_retval0";
+ } else if (Ty->isAggregateType() || Ty->isVectorTy()) {
+ // Aggregates and vectors become an aligned byte array of the type's
+ // alloc size. The alignment comes from getAlign when it succeeds,
+ // otherwise from the ABI alignment of the type.
+ unsigned totalsz = DL.getTypeAllocSize(Ty);
+ unsigned retAlignment = 0;
+ if (!llvm::getAlign(*F, 0, retAlignment))
+ retAlignment = DL.getABITypeAlignment(Ty);
+ O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
+ << "]";
+ } else
+ llvm_unreachable("Unknown return type");
+ } else {
+ // Split the return type into value types, then emit one register per
+ // scalar element; idx numbers the registers across all parts.
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*TLI, DL, Ty, vtparts);
+ unsigned idx = 0;
+ for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+
+ for (unsigned j = 0, je = elems; j != je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ // Integer elements narrower than 32 bits are declared as 32-bit.
+ if (elemtype.isInteger() && (sz < 32))
+ sz = 32;
+ O << ".reg .b" << sz << " func_retval" << idx;
+ // Comma between elements of one part, but not after the last.
+ if (j < je - 1)
+ O << ", ";
+ ++idx;
+ }
+ // Comma between parts, but not after the last.
+ if (i < e - 1)
+ O << ", ";
+ }
+ }
+ O << ") ";
+ return;
+}
+
+/// Convenience overload: print the return-value string for the IR function
+/// behind \p MF.
+void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
+                                        raw_ostream &O) {
+  printReturnValStr(MF.getFunction(), O);
+}
+
+// Return true if MBB is the header of a loop marked with
+// llvm.loop.unroll.disable.
+// TODO: consider "#pragma unroll 1" which is equivalent to "#pragma nounroll".
+bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
+    const MachineBasicBlock &MBB) const {
+  MachineLoopInfo &LoopInfo = getAnalysis<MachineLoopInfo>();
+  // We insert .pragma "nounroll" only to the loop header.
+  if (!LoopInfo.isLoopHeader(&MBB))
+    return false;
+
+  // llvm.loop.unroll.disable is marked on the back edges of a loop, so look
+  // at each predecessor in the same loop as MBB and inspect the loop
+  // metadata on its IR terminator.
+  for (auto PredIt = MBB.pred_begin(), PredEnd = MBB.pred_end();
+       PredIt != PredEnd; ++PredIt) {
+    const MachineBasicBlock *Pred = *PredIt;
+    // Edges from other loops to MBB are not back edges.
+    if (LoopInfo.getLoopFor(Pred) != LoopInfo.getLoopFor(&MBB))
+      continue;
+
+    const BasicBlock *IRBlock = Pred->getBasicBlock();
+    if (!IRBlock)
+      continue;
+
+    MDNode *LoopID =
+        IRBlock->getTerminator()->getMetadata(LLVMContext::MD_loop);
+    if (LoopID && GetUnrollMetadata(LoopID, "llvm.loop.unroll.disable"))
+      return true;
+  }
+  return false;
+}
+
+/// Run the base-class block start, then emit a "nounroll" pragma when
+/// \p MBB heads a loop marked with llvm.loop.unroll.disable.
+void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+  AsmPrinter::EmitBasicBlockStart(MBB);
+  if (!isLoopHeaderOfNoUnroll(MBB))
+    return;
+  OutStreamer->EmitRawText(StringRef("\t.pragma \"nounroll\";\n"));
+}
+
+/// Emit the header of the current function: linkage directive, ".entry" or
+/// ".func" (the latter followed by the return-value string), the symbol
+/// name, the parameter list and, for kernels, the kernel directives. The
+/// module globals are emitted the first time this is called.
+void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
+  // Globals are emitted once, ahead of the first function header.
+  if (!GlobalsEmitted) {
+    emitGlobals(*MF->getFunction()->getParent());
+    GlobalsEmitted = true;
+  }
+
+  // Set up
+  MRI = &MF->getRegInfo();
+  F = MF->getFunction();
+
+  SmallString<128> HeaderText;
+  raw_svector_ostream Header(HeaderText);
+
+  emitLinkageDirective(F, Header);
+  if (!llvm::isKernelFunction(*F)) {
+    Header << ".func ";
+    printReturnValStr(*MF, Header);
+  } else {
+    Header << ".entry ";
+  }
+
+  CurrentFnSym->print(Header, MAI);
+
+  emitFunctionParamList(*MF, Header);
+
+  if (llvm::isKernelFunction(*F))
+    emitKernelFunctionDirectives(*F, Header);
+
+  OutStreamer->EmitRawText(Header.str());
+
+  // Each function starts with no previously recorded debug location.
+  prevDebugLoc = DebugLoc();
+}
+
+/// Open the function body: reset the virtual register mapping, emit the
+/// opening brace, emit the virtual register declarations, then emit the
+/// demoted variables of the function.
+void NVPTXAsmPrinter::EmitFunctionBodyStart() {
+  VRegMapping.clear();
+  OutStreamer->EmitRawText(StringRef("{\n"));
+  setAndEmitFunctionVirtualRegisters(*MF);
+
+  // Demoted variables are collected into a buffer and emitted in one piece.
+  SmallString<128> DemotedText;
+  raw_svector_ostream DemotedOS(DemotedText);
+  emitDemotedVars(MF->getFunction(), DemotedOS);
+  OutStreamer->EmitRawText(DemotedOS.str());
+}
+
+// Emit the closing brace of the function body and drop the per-function
+// virtual register mapping.
+void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
+ OutStreamer->EmitRawText(StringRef("}\n"));
+ VRegMapping.clear();
+}
+
+/// Add an "implicit-def: <register name>" comment for the register in
+/// operand 0 of \p MI, followed by a blank line. Virtual registers are named
+/// through getVirtualRegisterName, all others through the register info.
+void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+  const unsigned DefReg = MI->getOperand(0).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(DefReg)) {
+    OutStreamer->AddComment(
+        Twine("implicit-def: ") +
+        nvptxSubtarget->getRegisterInfo()->getName(DefReg));
+  } else {
+    OutStreamer->AddComment(Twine("implicit-def: ") +
+                            getVirtualRegisterName(DefReg));
+  }
+  OutStreamer->AddBlankLine();
+}
+
+/// Print the optional kernel directives for \p F to \p O: .reqntid,
+/// .maxntid, .minnctapersm and .maxnreg. Each is printed only when the
+/// corresponding getter reports a value for \p F.
+void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
+                                                   raw_ostream &O) const {
+  // If the NVVM IR has some of reqntid* specified, then output
+  // the reqntid directive, and set the unspecified ones to 1.
+  // If none of reqntid* is specified, don't output reqntid directive.
+  unsigned ReqX, ReqY, ReqZ;
+  const bool HasReqX = llvm::getReqNTIDx(F, ReqX);
+  if (!HasReqX)
+    ReqX = 1;
+  const bool HasReqY = llvm::getReqNTIDy(F, ReqY);
+  if (!HasReqY)
+    ReqY = 1;
+  const bool HasReqZ = llvm::getReqNTIDz(F, ReqZ);
+  if (!HasReqZ)
+    ReqZ = 1;
+
+  if (HasReqX || HasReqY || HasReqZ)
+    O << ".reqntid " << ReqX << ", " << ReqY << ", " << ReqZ << "\n";
+
+  // If the NVVM IR has some of maxntid* specified, then output
+  // the maxntid directive, and set the unspecified ones to 1.
+  // If none of maxntid* is specified, don't output maxntid directive.
+  unsigned MaxX, MaxY, MaxZ;
+  const bool HasMaxX = llvm::getMaxNTIDx(F, MaxX);
+  if (!HasMaxX)
+    MaxX = 1;
+  const bool HasMaxY = llvm::getMaxNTIDy(F, MaxY);
+  if (!HasMaxY)
+    MaxY = 1;
+  const bool HasMaxZ = llvm::getMaxNTIDz(F, MaxZ);
+  if (!HasMaxZ)
+    MaxZ = 1;
+
+  if (HasMaxX || HasMaxY || HasMaxZ)
+    O << ".maxntid " << MaxX << ", " << MaxY << ", " << MaxZ << "\n";
+
+  unsigned MinCTA;
+  if (llvm::getMinCTASm(F, MinCTA))
+    O << ".minnctapersm " << MinCTA << "\n";
+
+  unsigned MaxNReg;
+  if (llvm::getMaxNReg(F, MaxNReg))
+    O << ".maxnreg " << MaxNReg << "\n";
+}
+
+/// Return the printed name of virtual register \p Reg: the register-class
+/// string followed by the number recorded for \p Reg in VRegMapping. Both
+/// the class and the register must already be present in the mapping
+/// (asserted).
+std::string
+NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+  VRegRCMap::const_iterator ClassIt = VRegMapping.find(RC);
+  assert(ClassIt != VRegMapping.end() && "Bad register class");
+
+  const DenseMap<unsigned, unsigned> &ClassRegs = ClassIt->second;
+  VRegMap::const_iterator RegIt = ClassRegs.find(Reg);
+  assert(RegIt != ClassRegs.end() && "Bad virtual register");
+  unsigned MappedVR = RegIt->second;
+
+  std::string Name;
+  raw_string_ostream NameStr(Name);
+  NameStr << getNVPTXRegClassStr(RC) << MappedVR;
+  return NameStr.str();
+}
+
+/// Write the printed name of virtual register \p vr to \p O.
+void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr,
+                                          raw_ostream &O) {
+  const std::string Name = getVirtualRegisterName(vr);
+  O << Name;
+}
+
+/// Print the text selected by \p Modifier for the immediate in \p MO:
+///   - "vecelem": "_" followed by the table digit at index Imm.
+///   - "vecv4comm1", "vecv4comm2", "vecv2comm1", "vecv2comm2": "//" when Imm
+///     lies outside the range tested for that modifier, nothing otherwise.
+///   - "vecv4pos", "vecv2pos": "_" followed by the table digit for Imm
+///     modulo 4 or 2, with a negative Imm treated as 0.
+/// Any other modifier is unreachable.
+void NVPTXAsmPrinter::printVecModifiedImmediate(
+    const MachineOperand &MO, const char *Modifier, raw_ostream &O) {
+  static const char vecelem[] = { '0', '1', '2', '3', '0', '1', '2', '3' };
+  int Imm = static_cast<int>(MO.getImm());
+  if (0 == strcmp(Modifier, "vecelem")) {
+    // This is the only path that indexes the table with the raw immediate,
+    // so catch a value that would read outside it.
+    assert(Imm >= 0 && Imm < static_cast<int>(sizeof(vecelem)) &&
+           "vecelem immediate out of range");
+    O << "_" << vecelem[Imm];
+  } else if (0 == strcmp(Modifier, "vecv4comm1")) {
+    if ((Imm < 0) || (Imm > 3))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv4comm2")) {
+    if ((Imm < 4) || (Imm > 7))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv4pos")) {
+    if (Imm < 0)
+      Imm = 0;
+    O << "_" << vecelem[Imm % 4];
+  } else if (0 == strcmp(Modifier, "vecv2comm1")) {
+    if ((Imm < 0) || (Imm > 1))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv2comm2")) {
+    if ((Imm < 2) || (Imm > 3))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv2pos")) {
+    if (Imm < 0)
+      Imm = 0;
+    O << "_" << vecelem[Imm % 2];
+  } else
+    llvm_unreachable("Unknown Modifier on immediate operand");
+}
+
+
+
+// Emit a declaration of F terminated by ";": linkage directive, ".entry" or
+// ".func", the return value string, the symbol name and the parameter list.
+void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
+  emitLinkageDirective(F, O);
+
+  // Kernels are introduced with .entry, every other function with .func.
+  O << (llvm::isKernelFunction(*F) ? ".entry " : ".func ");
+
+  printReturnValStr(F, O);
+  getSymbol(F)->print(O, MAI);
+  O << "\n";
+  emitFunctionParamList(F, O);
+  O << ";\n";
+}
+
+// Return true if C is a global variable other than "llvm.used", or if such a
+// global variable is reachable from C by following users that are constants.
+// A null C yields false.
+static bool usedInGlobalVarDef(const Constant *C) {
+  if (!C)
+    return false;
+
+  if (const auto *Global = dyn_cast<GlobalVariable>(C))
+    return Global->getName() != "llvm.used";
+
+  // Only constant users are followed; other kinds of users are ignored.
+  for (const User *Usr : C->users()) {
+    const auto *UserConst = dyn_cast<Constant>(Usr);
+    if (UserConst && usedInGlobalVarDef(UserConst))
+      return true;
+  }
+
+  return false;
+}
+
+// Walk U and, transitively, its users, checking that every instruction found
+// belongs to one and the same function.
+//
+// oneFunc is in/out: it holds the function seen so far (or null) and is
+// updated when an instruction is reached.  Returns false as soon as an
+// instruction without a parent function, or one in a different function, is
+// found.  The "llvm.used" global variable counts as an acceptable user.
+static bool usedInOneFunc(const User *U, Function const *&oneFunc) {
+  if (const auto *Global = dyn_cast<GlobalVariable>(U))
+    if (Global->getName() == "llvm.used")
+      return true;
+
+  if (const auto *Inst = dyn_cast<Instruction>(U)) {
+    const BasicBlock *Block = Inst->getParent();
+    const Function *Owner = Block ? Block->getParent() : nullptr;
+    if (!Owner)
+      return false;
+    if (oneFunc && (Owner != oneFunc))
+      return false;
+    oneFunc = Owner;
+    return true;
+  }
+
+  // Neither "llvm.used" nor an instruction: every user must pass the check.
+  for (const User *Nested : U->users())
+    if (!usedInOneFunc(Nested, oneFunc))
+      return false;
+
+  return true;
+}
+
+/* Decide whether a global variable can be demoted to local scope.
+ * Currently, this is valid for CUDA shared variables, which have local
+ * scope and global lifetime. The variable qualifies when all of these hold:
+ *   1. It has internal linkage.
+ *   2. It lives in the shared address space.
+ *   3. It is referenced from exactly one function.
+ * On success that function is stored in f and true is returned; otherwise
+ * f is left untouched.
+ */
+static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
+  if (!gv->hasInternalLinkage())
+    return false;
+  if (gv->getType()->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED)
+    return false;
+
+  // usedInOneFunc() may succeed without ever reaching an instruction, in
+  // which case no function was recorded and demotion is not possible.
+  const Function *SoleUser = nullptr;
+  if (!usedInOneFunc(gv, SoleUser) || !SoleUser)
+    return false;
+
+  f = SoleUser;
+  return true;
+}
+
+// Return true if C is used, directly or through other constants, by an
+// instruction whose parent function has an entry in seenMap.
+static bool useFuncSeen(const Constant *C,
+                        llvm::DenseMap<const Function *, bool> &seenMap) {
+  for (const User *Usr : C->users()) {
+    if (const auto *NestedConst = dyn_cast<Constant>(Usr)) {
+      if (useFuncSeen(NestedConst, seenMap))
+        return true;
+      continue;
+    }
+
+    const auto *Inst = dyn_cast<Instruction>(Usr);
+    if (!Inst)
+      continue;
+
+    // Instructions that are not attached to a function are ignored.
+    const BasicBlock *Block = Inst->getParent();
+    const Function *Owner = Block ? Block->getParent() : nullptr;
+    if (Owner && seenMap.count(Owner))
+      return true;
+  }
+  return false;
+}
+
+// Emit the function declarations the module needs, walking the functions in
+// module order:
+//   - a function without a body gets a declaration when it has uses and is
+//     not an intrinsic;
+//   - a function with a body gets a declaration when it is used in a global
+//     variable definition, or when one of its users sits in a function that
+//     was visited earlier in the walk.
+void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
+  llvm::DenseMap<const Function *, bool> seenMap;
+  for (const Function &Fn : M) {
+    const Function *F = &Fn;
+
+    if (F->isDeclaration()) {
+      if (!F->use_empty() && !F->getIntrinsicID())
+        emitDeclaration(F, O);
+      continue;
+    }
+
+    for (const User *U : F->users()) {
+      if (const auto *C = dyn_cast<Constant>(U)) {
+        // Either the use is in the initialization of a global variable
+        // that is a function pointer, or the function that uses this
+        // constant expr has already been seen. In both cases print a
+        // declaration for the original function.
+        if (usedInGlobalVarDef(C) || useFuncSeen(C, seenMap)) {
+          emitDeclaration(F, O);
+          break;
+        }
+      }
+
+      const auto *Inst = dyn_cast<Instruction>(U);
+      if (!Inst)
+        continue;
+      const BasicBlock *Block = Inst->getParent();
+      const Function *Caller = Block ? Block->getParent() : nullptr;
+      if (!Caller)
+        continue;
+
+      // If a caller has already been seen, then the caller is
+      // appearing in the module before the callee. so print out
+      // a declaration for the callee.
+      if (seenMap.count(Caller)) {
+        emitDeclaration(F, O);
+        break;
+      }
+    }
+    seenMap[F] = true;
+  }
+}
+
+// Assign consecutive file numbers, starting at 1, to the source files named
+// by the module's compile units and then by its subprograms, and emit a
+// DWARF file directive for each file the first time it is encountered.
+void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
+  DebugInfoFinder DbgFinder;
+  DbgFinder.processModule(M);
+
+  unsigned NextFileId = 1;
+
+  // Record one (file name, directory) pair. A name that is not absolute is
+  // joined onto a non-empty directory before it is looked up in filenameMap.
+  auto RecordFile = [&](StringRef Filename, StringRef Dirname) {
+    SmallString<128> FullPathName = Dirname;
+    if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+      sys::path::append(FullPathName, Filename);
+      Filename = FullPathName;
+    }
+    if (filenameMap.find(Filename) != filenameMap.end())
+      return;
+    filenameMap[Filename] = NextFileId;
+    OutStreamer->EmitDwarfFileDirective(NextFileId, "", Filename);
+    ++NextFileId;
+  };
+
+  for (const DICompileUnit *DIUnit : DbgFinder.compile_units())
+    RecordFile(DIUnit->getFilename(), DIUnit->getDirectory());
+
+  for (DISubprogram *SP : DbgFinder.subprograms())
+    RecordFile(SP->getFilename(), SP->getDirectory());
+}
+
+// Return true when GV is null, when its initializer is not a ConstantArray
+// (we don't know how to parse anything else), or when that array has no
+// operands.
+static bool isEmptyXXStructor(GlobalVariable *GV) {
+  if (!GV)
+    return true;
+  const auto *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  return !InitList || InitList->getNumOperands() == 0;
+}
+
+// Module-level setup: reject modules that use aliases or non-empty global
+// ctor/dtor lists, emit the PTX header, emit module-level inline asm, and,
+// unless the target OS is NVCL, emit the debug file directives.
+// Returns false (success); the rejected cases end in report_fatal_error().
+bool NVPTXAsmPrinter::doInitialization(Module &M) {
+  // Construct a default subtarget off of the TargetMachine defaults. The
+  // rest of NVPTX isn't friendly to change subtargets per function and
+  // so the default TargetMachine will have all of the options.
+  const Triple &TargetTriple = TM.getTargetTriple();
+  const auto &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+  const NVPTXSubtarget STI(TargetTriple, TM.getTargetCPU(),
+                           TM.getTargetFeatureString(), NTM);
+
+  // report_fatal_error() does not return, so nothing follows these calls.
+  if (M.alias_size())
+    report_fatal_error("Module has aliases, which NVPTX does not support.");
+  if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors")))
+    report_fatal_error(
+        "Module has a nontrivial global ctor, which NVPTX does not support.");
+  if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors")))
+    report_fatal_error(
+        "Module has a nontrivial global dtor, which NVPTX does not support.");
+
+  MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+
+  // AsmPrinter::doInitialization(M) is deliberately not called here, so the
+  // TargetLoweringObjectFile has to be initialized by hand.
+  const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
+      .Initialize(OutContext, TM);
+
+  // Emit header before any dwarf directives are emitted below.
+  SmallString<128> HeaderText;
+  raw_svector_ostream HeaderOS(HeaderText);
+  emitHeader(M, HeaderOS, STI);
+  OutStreamer->EmitRawText(HeaderOS.str());
+
+  // Emit module-level inline asm if it exists.
+  if (!M.getModuleInlineAsm().empty()) {
+    OutStreamer->AddComment("Start of file scope inline assembly");
+    OutStreamer->AddBlankLine();
+    OutStreamer->EmitRawText(StringRef(M.getModuleInlineAsm()));
+    OutStreamer->AddBlankLine();
+    OutStreamer->AddComment("End of file scope inline assembly");
+    OutStreamer->AddBlankLine();
+  }
+
+  // If we're not NVCL we're CUDA, go ahead and emit filenames.
+  if (TargetTriple.getOS() != Triple::NVCL)
+    recordAndEmitFilenames(M);
+
+  GlobalsEmitted = false;
+
+  return false; // success
+}
+
+// Emit the function declarations and all module-level global variables of M
+// as one chunk of raw text.
+void NVPTXAsmPrinter::emitGlobals(const Module &M) {
+  SmallString<128> Buffer;
+  raw_svector_ostream OS(Buffer);
+
+  emitDeclarations(M, OS);
+
+  // As ptxas does not support forward references of globals, we need to first
+  // sort the list of module-level globals in def-use order. We visit each
+  // global variable in order, and ensure that we emit it *after* its dependent
+  // globals. We use a little extra memory maintaining both a set and a list to
+  // have fast searches while maintaining a strict ordering.
+  SmallVector<const GlobalVariable *, 8> Ordered;
+  DenseSet<const GlobalVariable *> Done;
+  DenseSet<const GlobalVariable *> InProgress;
+
+  for (const GlobalVariable &GV : M.globals())
+    VisitGlobalVariableForEmission(&GV, Ordered, Done, InProgress);
+
+  assert(Done.size() == M.getGlobalList().size() &&
+         "Missed a global variable");
+  assert(InProgress.size() == 0 && "Did not fully process a global variable");
+
+  // Print out module-level global variables in the order computed above.
+  for (const GlobalVariable *GV : Ordered)
+    printModuleLevelGV(GV, OS);
+
+  OS << '\n';
+
+  OutStreamer->EmitRawText(OS.str());
+}
+
+// Write the PTX file header to O: a banner comment, the .version line
+// derived from the subtarget's PTX version, the .target line with its
+// options, and the .address_size line.
+void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
+                                 const NVPTXSubtarget &STI) {
+  O << "//\n"
+    << "// Generated by LLVM NVPTX Back-End\n"
+    << "//\n"
+    << "\n";
+
+  // The PTX version is stored as major * 10 + minor.
+  const unsigned PTXVersion = STI.getPTXVersion();
+  O << ".version " << (PTXVersion / 10) << "." << (PTXVersion % 10) << "\n";
+
+  O << ".target " << STI.getTargetName();
+
+  const auto &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+  if (NTM.getDrvInterface() == NVPTX::NVCL)
+    O << ", texmode_independent";
+  else if (!STI.hasDouble())
+    O << ", map_f64_to_f32";
+
+  if (MAI->doesSupportDebugInformation())
+    O << ", debug";
+
+  O << "\n";
+
+  O << ".address_size " << (NTM.is64Bit() ? "64" : "32") << "\n";
+
+  O << "\n";
+}
+
+// Finish emission for module M.
+//
+// Globals are emitted here if no function triggered their emission earlier.
+// The module's global variables are then unlinked from the module for the
+// duration of AsmPrinter::doFinalization() and re-inserted afterwards in
+// their original order. Returns whatever the base class call returned.
+bool NVPTXAsmPrinter::doFinalization(Module &M) {
+  // If we did not emit any functions, then the global declarations have not
+  // yet been emitted.
+  if (!GlobalsEmitted) {
+    emitGlobals(M);
+    GlobalsEmitted = true;
+  }
+
+  // XXX Temporarily remove global variables so that doFinalization() will not
+  // emit them again (global variables are emitted at beginning).
+  //
+  // Instead of calling the parent's doFinalization, we may clone it and
+  // customize it here. Currently, we #if NVISA out the EmitGlobals() in the
+  // parent's doFinalization, which is too intrusive. The same applies to
+  // doInitialization.
+  Module::GlobalListType &GlobalList = M.getGlobalList();
+
+  // First, back up the global variables. The container owns the backup
+  // storage, so there is no manual new[]/delete[] pair to keep in sync.
+  SmallVector<GlobalVariable *, 16> SavedGlobals;
+  SavedGlobals.reserve(GlobalList.size());
+  for (GlobalVariable &GV : GlobalList)
+    SavedGlobals.push_back(&GV);
+
+  // Second, empty the list. remove() unlinks a node without deleting it.
+  while (!GlobalList.empty())
+    GlobalList.remove(GlobalList.begin());
+
+  const bool Result = AsmPrinter::doFinalization(M);
+
+  // Now restore the global variables in their original order.
+  for (GlobalVariable *GV : SavedGlobals)
+    GlobalList.insert(GlobalList.end(), GV);
+
+  clearAnnotationCache(&M);
+
+  return Result;
+}
+
+// This function emits appropriate linkage directives for
+// functions and global variables. Nothing is emitted unless the
+// driver interface is CUDA.
+//
+// extern function declaration            -> .extern
+// extern function definition             -> .visible
+// external global variable with init     -> .visible
+// external without init                  -> .extern
+// appending                              -> not allowed, assert.
+// internal, private                      -> nothing
+// any other linkage                      -> .weak
+
+void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
+                                           raw_ostream &O) {
+  if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() != NVPTX::CUDA)
+    return;
+
+  if (V->hasExternalLinkage()) {
+    // Variables are classified by the presence of an initializer, everything
+    // else by whether it is only a declaration.
+    if (const auto *GVar = dyn_cast<GlobalVariable>(V))
+      O << (GVar->hasInitializer() ? ".visible " : ".extern ");
+    else
+      O << (V->isDeclaration() ? ".extern " : ".visible ");
+  } else if (V->hasAppendingLinkage()) {
+    std::string msg;
+    msg.append("Error: ");
+    msg.append("Symbol ");
+    if (V->hasName())
+      msg.append(V->getName());
+    // The leading space keeps the symbol name from running into the text.
+    msg.append(" has unsupported appending linkage type");
+    llvm_unreachable(msg.c_str());
+  } else if (!V->hasInternalLinkage() &&
+             !V->hasPrivateLinkage()) {
+    O << ".weak ";
+  }
+}
+
+void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
+ raw_ostream &O,
+ bool processDemoted) {
+ /* Emit the PTX text for the module-level global GVar to O.
+  *
+  * Nothing is printed for globals in the "llvm.metadata" section or whose
+  * name starts with "llvm." or "nvvm.". Textures, surfaces, declarations and
+  * samplers each have a dedicated short form and return early. When
+  * processDemoted is false and canDemoteGlobalVar() accepts the variable, it
+  * is recorded in localDecls under the owning function and only a
+  * "has been demoted" comment is printed. Everything else is emitted as
+  * .<address space> [.attribute(.managed)] .align N <type> <name>
+  * [= initializer];
+  */
+ // Skip globals placed in the "llvm.metadata" section.
+ if (GVar->hasSection()) {
+ if (GVar->getSection() == "llvm.metadata")
+ return;
+ }
+
+ // Skip LLVM intrinsic global variables (names starting with "llvm." or "nvvm.")
+ if (GVar->getName().startswith("llvm.") ||
+ GVar->getName().startswith("nvvm."))
+ return;
+
+ const DataLayout &DL = getDataLayout();
+
+ // GlobalVariables are always constant pointers themselves.
+ PointerType *PTy = GVar->getType();
+ Type *ETy = GVar->getValueType();
+ // Linkage directive; linkages not tested below get no directive at all.
+ if (GVar->hasExternalLinkage()) {
+ if (GVar->hasInitializer())
+ O << ".visible ";
+ else
+ O << ".extern ";
+ } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() ||
+ GVar->hasAvailableExternallyLinkage() ||
+ GVar->hasCommonLinkage()) {
+ O << ".weak ";
+ }
+ // Textures and surfaces are printed as .texref / .surfref globals by name only.
+ if (llvm::isTexture(*GVar)) {
+ O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n";
+ return;
+ }
+
+ if (llvm::isSurface(*GVar)) {
+ O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n";
+ return;
+ }
+
+ if (GVar->isDeclaration()) {
+ // (extern) declarations, no definition or initializer
+ // Currently the only known declaration is for an automatic __local
+ // (.shared) promoted to global.
+ emitPTXGlobalVariable(GVar, O);
+ O << ";\n";
+ return;
+ }
+ // Samplers: a .samplerref, plus a "= { ... }" block when the initializer is a ConstantInt.
+ if (llvm::isSampler(*GVar)) {
+ O << ".global .samplerref " << llvm::getSamplerName(*GVar);
+
+ const Constant *Initializer = nullptr;
+ if (GVar->hasInitializer())
+ Initializer = GVar->getInitializer();
+ const ConstantInt *CI = nullptr;
+ if (Initializer)
+ CI = dyn_cast<ConstantInt>(Initializer);
+ if (CI) {
+ unsigned sample = CI->getZExtValue();
+ // The addressing mode decoded from 'sample' is printed for addr_mode_0..2 alike.
+ O << " = { ";
+
+ for (int i = 0,
+ addr = ((sample & __CLK_ADDRESS_MASK) >> __CLK_ADDRESS_BASE);
+ i < 3; i++) {
+ O << "addr_mode_" << i << " = ";
+ switch (addr) {
+ case 0:
+ O << "wrap";
+ break;
+ case 1:
+ O << "clamp_to_border";
+ break;
+ case 2:
+ O << "clamp_to_edge";
+ break;
+ case 3:
+ O << "wrap";
+ break;
+ case 4:
+ O << "mirror";
+ break;
+ } // no default case: any other value prints no mode name
+ O << ", ";
+ }
+ O << "filter_mode = ";
+ switch ((sample & __CLK_FILTER_MASK) >> __CLK_FILTER_BASE) {
+ case 0:
+ O << "nearest";
+ break;
+ case 1:
+ O << "linear";
+ break;
+ case 2:
+ llvm_unreachable("Anisotropic filtering is not supported");
+ default:
+ O << "nearest";
+ break;
+ }
+ if (!((sample & __CLK_NORMALIZED_MASK) >> __CLK_NORMALIZED_BASE)) {
+ O << ", force_unnormalized_coords = 1";
+ }
+ O << " }";
+ }
+
+ O << ";\n";
+ return;
+ }
+
+ if (GVar->hasPrivateLinkage()) {
+ // Private globals are dropped when named "unrollpragma..." / "filename..." or unused.
+ if (!strncmp(GVar->getName().data(), "unrollpragma", 12)) // NOTE(review): strncmp on StringRef::data(); startswith() would not depend on the buffer's termination - confirm
+ return;
+
+ // FIXME - need better way (e.g. Metadata) to avoid generating this global
+ if (!strncmp(GVar->getName().data(), "filename", 8))
+ return;
+ if (GVar->use_empty())
+ return;
+ }
+ // Demotion: remember the variable for its single user function instead of emitting it here.
+ const Function *demotedFunc = nullptr;
+ if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) {
+ O << "// " << GVar->getName() << " has been demoted\n";
+ if (localDecls.find(demotedFunc) != localDecls.end())
+ localDecls[demotedFunc].push_back(GVar);
+ else {
+ std::vector<const GlobalVariable *> temp;
+ temp.push_back(GVar);
+ localDecls[demotedFunc] = temp;
+ }
+ return;
+ }
+ // Regular definition: state space, optional .managed attribute, alignment, type, name.
+ O << ".";
+ emitPTXAddressSpace(PTy->getAddressSpace(), O);
+
+ if (isManaged(*GVar)) {
+ O << " .attribute(.managed)";
+ }
+
+ if (GVar->getAlignment() == 0)
+ O << " .align " << (int)DL.getPrefTypeAlignment(ETy);
+ else
+ O << " .align " << GVar->getAlignment();
+
+ if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
+ O << " .";
+ // Special case: ABI requires that we use .u8 for predicates
+ if (ETy->isIntegerTy(1))
+ O << "u8";
+ else
+ O << getPTXFundamentalTypeStr(ETy, false);
+ O << " ";
+ getSymbol(GVar)->print(O, MAI);
+
+ // PTX allows variable initialization only for constant and global state
+ // spaces.
+ if (GVar->hasInitializer()) {
+ if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) {
+ const Constant *Initializer = GVar->getInitializer();
+ // 'undef' is treated as there is no value specified.
+ if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) {
+ O << " = ";
+ printScalarConstant(Initializer, O);
+ }
+ } else {
+ // The frontend adds zero-initializer to device and constant variables
+ // that don't have an initial value, and UndefValue to shared
+ // variables, so skip warning for this case.
+ if (!GVar->getInitializer()->isNullValue() &&
+ !isa<UndefValue>(GVar->getInitializer())) {
+ report_fatal_error("initial value of '" + GVar->getName() +
+ "' is not allowed in addrspace(" +
+ Twine(PTy->getAddressSpace()) + ")");
+ }
+ }
+ }
+ } else {
+ unsigned int ElementSize = 0;
+
+ // Although PTX has direct support for struct and array types and
+ // LLVM IR is very similar to PTX, the LLVM CodeGen has no support for
+ // targets that offer these high level field accesses. Structs, arrays
+ // and vectors are lowered into arrays of bytes.
+ switch (ETy->getTypeID()) {
+ case Type::StructTyID:
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ ElementSize = DL.getTypeStoreSize(ETy);
+ // PTX allows variable initialization only for constant and
+ // global state spaces.
+ if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
+ GVar->hasInitializer()) {
+ const Constant *Initializer = GVar->getInitializer();
+ if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
+ AggBuffer aggBuffer(ElementSize, O, *this);
+ bufferAggregateConstant(Initializer, &aggBuffer);
+ if (aggBuffer.numSymbols) { // buffer holds symbols: declare an array of .u64/.u32 elements instead of bytes
+ if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit()) {
+ O << " .u64 ";
+ getSymbol(GVar)->print(O, MAI);
+ O << "[";
+ O << ElementSize / 8;
+ } else {
+ O << " .u32 ";
+ getSymbol(GVar)->print(O, MAI);
+ O << "[";
+ O << ElementSize / 4;
+ }
+ O << "]";
+ } else {
+ O << " .b8 ";
+ getSymbol(GVar)->print(O, MAI);
+ O << "[";
+ O << ElementSize;
+ O << "]";
+ }
+ O << " = {";
+ aggBuffer.print();
+ O << "}";
+ } else {
+ O << " .b8 ";
+ getSymbol(GVar)->print(O, MAI);
+ if (ElementSize) {
+ O << "[";
+ O << ElementSize;
+ O << "]";
+ }
+ }
+ } else {
+ O << " .b8 ";
+ getSymbol(GVar)->print(O, MAI);
+ if (ElementSize) {
+ O << "[";
+ O << ElementSize;
+ O << "]";
+ }
+ }
+ break;
+ default:
+ llvm_unreachable("type not supported yet");
+ }
+
+ }
+ O << ";\n";
+}
+
+// Print every global variable recorded in localDecls for function f, each
+// preceded by a "demoted variable" comment. Does nothing when f has no
+// entry in localDecls.
+void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
+  auto Entry = localDecls.find(f);
+  if (Entry == localDecls.end())
+    return;
+
+  for (const GlobalVariable *Demoted : Entry->second) {
+    O << "\t// demoted variable\n\t";
+    // processDemoted = true: emit the variable instead of demoting it again.
+    printModuleLevelGV(Demoted, O, true);
+  }
+}
+
+// Print the PTX state-space keyword (without the leading '.') for
+// AddressSpace. Any address space other than local, global, const or shared
+// is a fatal error.
+void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
+                                          raw_ostream &O) const {
+  const char *SpaceName = nullptr;
+  switch (AddressSpace) {
+  case llvm::ADDRESS_SPACE_LOCAL:
+    SpaceName = "local";
+    break;
+  case llvm::ADDRESS_SPACE_GLOBAL:
+    SpaceName = "global";
+    break;
+  case llvm::ADDRESS_SPACE_CONST:
+    SpaceName = "const";
+    break;
+  case llvm::ADDRESS_SPACE_SHARED:
+    SpaceName = "shared";
+    break;
+  default:
+    report_fatal_error("Bad address space found while emitting PTX");
+    break;
+  }
+  O << SpaceName;
+}
+
+// Return the PTX fundamental type name for Ty:
+//   i1 -> "pred", iN (N <= 64) -> "u<N>", float -> "f32", double -> "f64",
+//   pointer -> "b64"/"b32" when useB4PTR is set, otherwise "u64"/"u32",
+//   chosen by whether the target machine is 64-bit.
+// Any other type is a programming error.
+std::string
+NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    const unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1)
+      return "pred";
+    if (NumBits > 64)
+      llvm_unreachable("Integer too large");
+    return std::string("u") + utostr(NumBits);
+  }
+  case Type::FloatTyID:
+    return "f32";
+  case Type::DoubleTyID:
+    return "f64";
+  case Type::PointerTyID: {
+    const bool Is64Bit =
+        static_cast<const NVPTXTargetMachine &>(TM).is64Bit();
+    if (useB4PTR)
+      return Is64Bit ? "b64" : "b32";
+    return Is64Bit ? "u64" : "u32";
+  }
+  default:
+    break;
+  }
+  llvm_unreachable("unexpected type");
+}
+
+// Print the declaration of GVar without initializer and without the trailing
+// ';': state space, alignment, type and symbol name. Scalars use their
+// fundamental PTX type; structs, arrays and vectors become a .b8 array.
+void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
+                                            raw_ostream &O) {
+  const DataLayout &DL = getDataLayout();
+
+  // GlobalVariables are always constant pointers themselves.
+  Type *ValueTy = GVar->getValueType();
+
+  O << ".";
+  emitPTXAddressSpace(GVar->getType()->getAddressSpace(), O);
+
+  // Without an explicit alignment, use the preferred alignment of the type.
+  O << " .align ";
+  if (GVar->getAlignment() == 0)
+    O << (int)DL.getPrefTypeAlignment(ValueTy);
+  else
+    O << GVar->getAlignment();
+
+  const bool IsScalar = ValueTy->isFloatingPointTy() ||
+                        ValueTy->isIntegerTy() || ValueTy->isPointerTy();
+  if (IsScalar) {
+    O << " ." << getPTXFundamentalTypeStr(ValueTy) << " ";
+    getSymbol(GVar)->print(O, MAI);
+    return;
+  }
+
+  // Although PTX has direct support for struct type and array type and LLVM IR
+  // is very similar to PTX, the LLVM CodeGen does not support for targets that
+  // support these high level field accesses. Structs and arrays are lowered
+  // into arrays of bytes.
+  switch (ValueTy->getTypeID()) {
+  case Type::StructTyID:
+  case Type::ArrayTyID:
+  case Type::VectorTyID: {
+    const int64_t ByteSize = DL.getTypeStoreSize(ValueTy);
+    O << " .b8 ";
+    getSymbol(GVar)->print(O, MAI);
+    // A zero size is printed as an empty pair of brackets.
+    O << "[";
+    if (ByteSize)
+      O << ByteSize;
+    O << "]";
+    return;
+  }
+  default:
+    llvm_unreachable("type not supported yet");
+  }
+}
+
+// Compute the alignment reported for a pointee of type Ty:
+//   - single value types: the preferred alignment of the type;
+//   - arrays: the alignment of the element type;
+//   - structs: the largest alignment among the elements (at least 1);
+//   - function types: the preferred pointer alignment;
+//   - anything else: the preferred alignment of the type.
+static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) {
+  if (Ty->isSingleValueType())
+    return DL.getPrefTypeAlignment(Ty);
+
+  if (auto *ArrTy = dyn_cast<ArrayType>(Ty))
+    return getOpenCLAlignment(DL, ArrTy->getElementType());
+
+  if (auto *StructTy = dyn_cast<StructType>(Ty)) {
+    unsigned int MaxAlign = 1;
+    for (unsigned Idx = 0, NumElts = StructTy->getNumElements();
+         Idx != NumElts; ++Idx) {
+      const unsigned int EltAlign =
+          getOpenCLAlignment(DL, StructTy->getElementType(Idx));
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+    }
+    return MaxAlign;
+  }
+
+  if (isa<FunctionType>(Ty))
+    return DL.getPointerPrefAlignment();
+  return DL.getPrefTypeAlignment(Ty);
+}
+
+// Print the name of a parameter: the symbol of the function that owns the
+// argument I, followed by "_param_" and paramIndex.
+void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
+                                     int paramIndex, raw_ostream &O) {
+  const Function *Owner = I->getParent();
+  getSymbol(Owner)->print(O, MAI);
+  O << "_param_" << paramIndex;
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
+ const DataLayout &DL = getDataLayout();
+ const AttributeSet &PAL = F->getAttributes();
+ const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
+ Function::const_arg_iterator I, E;
+ unsigned paramIndex = 0;
+ bool first = true;
+ bool isKernelFunc = llvm::isKernelFunction(*F);
+ bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
+ MVT thePointerTy = TLI->getPointerTy(DL);
+ /* Overview: prints the parenthesized PTX parameter list of F to O.
+  * A function without arguments yields "()". Otherwise one entry is printed
+  * per argument, separated by ",\n":
+  *  - kernel image/sampler arguments: .surfref / .texref / .samplerref
+  *    (as ".u64 .ptr" forms when the subtarget has image handles);
+  *  - non-byval aggregates and vectors: ".param .align A .b8 name[size]";
+  *  - kernel pointers: ".param .u<pointer bits>", with ".ptr"/".align"
+  *    qualifiers when the driver interface is not CUDA;
+  *  - other kernel scalars: ".param .<type>" (i1 is printed as u8);
+  *  - non-kernel scalars: ".param .b<size>" when isABI, else ".reg .b<size>",
+  *    with integers widened to at least 32 bits;
+  *  - byval pointers: a ".param ... .b8 name[size]" array when isABI or for
+  *    kernels, otherwise one ".reg" entry per scalar part of the pointee.
+  */
+ if (F->arg_empty()) {
+ O << "()\n";
+ return;
+ }
+
+ O << "(\n";
+
+ for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) {
+ Type *Ty = I->getType();
+
+ if (!first)
+ O << ",\n";
+
+ first = false;
+
+ // Handle image/sampler parameters (kernel functions only)
+ if (isKernelFunction(*F)) {
+ if (isSampler(*I) || isImage(*I)) {
+ if (isImage(*I)) {
+ std::string sname = I->getName(); // NOTE(review): sname is never read in this function
+ if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
+ if (nvptxSubtarget->hasImageHandles())
+ O << "\t.param .u64 .ptr .surfref ";
+ else
+ O << "\t.param .surfref ";
+ CurrentFnSym->print(O, MAI); // NOTE(review): uses CurrentFnSym, whereas printParamName() uses the argument's parent function - confirm they agree for every caller
+ O << "_param_" << paramIndex;
+ }
+ else { // Default image is read_only
+ if (nvptxSubtarget->hasImageHandles())
+ O << "\t.param .u64 .ptr .texref ";
+ else
+ O << "\t.param .texref ";
+ CurrentFnSym->print(O, MAI);
+ O << "_param_" << paramIndex;
+ }
+ } else {
+ if (nvptxSubtarget->hasImageHandles())
+ O << "\t.param .u64 .ptr .samplerref ";
+ else
+ O << "\t.param .samplerref ";
+ CurrentFnSym->print(O, MAI);
+ O << "_param_" << paramIndex;
+ }
+ continue;
+ }
+ }
+ // Arguments without the byval attribute.
+ if (!PAL.hasAttribute(paramIndex + 1, Attribute::ByVal)) {
+ if (Ty->isAggregateType() || Ty->isVectorTy()) {
+ // Just print .param .align <a> .b8 .param[size];
+ // <a> = PAL.getparamalignment
+ // size = typeallocsize of element type
+ unsigned align = PAL.getParamAlignment(paramIndex + 1);
+ if (align == 0)
+ align = DL.getABITypeAlignment(Ty);
+
+ unsigned sz = DL.getTypeAllocSize(Ty);
+ O << "\t.param .align " << align << " .b8 ";
+ printParamName(I, paramIndex, O);
+ O << "[" << sz << "]";
+
+ continue;
+ }
+ // Just a scalar
+ auto *PTy = dyn_cast<PointerType>(Ty);
+ if (isKernelFunc) {
+ if (PTy) {
+ // Special handling for pointer arguments to kernel
+ O << "\t.param .u" << thePointerTy.getSizeInBits() << " ";
+
+ if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() !=
+ NVPTX::CUDA) {
+ Type *ETy = PTy->getElementType();
+ int addrSpace = PTy->getAddressSpace();
+ switch (addrSpace) {
+ default:
+ O << ".ptr ";
+ break;
+ case llvm::ADDRESS_SPACE_CONST:
+ O << ".ptr .const ";
+ break;
+ case llvm::ADDRESS_SPACE_SHARED:
+ O << ".ptr .shared ";
+ break;
+ case llvm::ADDRESS_SPACE_GLOBAL:
+ O << ".ptr .global ";
+ break;
+ }
+ O << ".align " << (int)getOpenCLAlignment(DL, ETy) << " ";
+ }
+ printParamName(I, paramIndex, O);
+ continue;
+ }
+
+ // non-pointer scalar to kernel func
+ O << "\t.param .";
+ // Special case: predicate operands become .u8 types
+ if (Ty->isIntegerTy(1))
+ O << "u8";
+ else
+ O << getPTXFundamentalTypeStr(Ty);
+ O << " ";
+ printParamName(I, paramIndex, O);
+ continue;
+ }
+ // Non-kernel function, just print .param .b<size> for ABI
+ // and .reg .b<size> for non-ABI
+ unsigned sz = 0;
+ if (isa<IntegerType>(Ty)) {
+ sz = cast<IntegerType>(Ty)->getBitWidth();
+ if (sz < 32)
+ sz = 32;
+ } else if (isa<PointerType>(Ty))
+ sz = thePointerTy.getSizeInBits();
+ else
+ sz = Ty->getPrimitiveSizeInBits();
+ if (isABI)
+ O << "\t.param .b" << sz << " ";
+ else
+ O << "\t.reg .b" << sz << " ";
+ printParamName(I, paramIndex, O);
+ continue;
+ }
+
+ // param has byVal attribute. So should be a pointer
+ auto *PTy = dyn_cast<PointerType>(Ty);
+ assert(PTy && "Param with byval attribute should be a pointer type");
+ Type *ETy = PTy->getElementType();
+
+ if (isABI || isKernelFunc) {
+ // Just print .param .align <a> .b8 .param[size];
+ // <a> = PAL.getparamalignment
+ // size = typeallocsize of element type
+ unsigned align = PAL.getParamAlignment(paramIndex + 1);
+ if (align == 0)
+ align = DL.getABITypeAlignment(ETy);
+ // Work around a bug in ptxas. When PTX code takes address of
+ // byval parameter with alignment < 4, ptxas generates code to
+ // spill argument into memory. Alas on sm_50+ ptxas generates
+ // SASS code that fails with misaligned access. To work around
+ // the problem, make sure that we align byval parameters by at
+ // least 4. Matching change must be made in LowerCall() where we
+ // prepare parameters for the call.
+ //
+ // TODO: this will need to be undone when we get to support multi-TU
+ // device-side compilation as it breaks ABI compatibility with nvcc.
+ // Hopefully ptxas bug is fixed by then.
+ if (!isKernelFunc && align < 4)
+ align = 4;
+ unsigned sz = DL.getTypeAllocSize(ETy);
+ O << "\t.param .align " << align << " .b8 ";
+ printParamName(I, paramIndex, O);
+ O << "[" << sz << "]";
+ continue;
+ } else {
+ // Split the ETy into constituent parts and
+ // print .param .b<size> <name> for each part.
+ // Further, if a part is vector, print the above for
+ // each vector element.
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*TLI, DL, ETy, vtparts);
+ for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+
+ for (unsigned j = 0, je = elems; j != je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 32))
+ sz = 32;
+ O << "\t.reg .b" << sz << " ";
+ printParamName(I, paramIndex, O);
+ if (j < je - 1)
+ O << ",\n";
+ ++paramIndex; // every printed part takes its own parameter index
+ }
+ if (i < e - 1)
+ O << ",\n";
+ }
+ --paramIndex; // compensate for the paramIndex++ in the for-loop header
+ continue;
+ }
+ }
+
+ O << "\n)\n";
+}
+
+// Convenience overload: print the parameter list of the IR function that
+// MF was created from.
+void NVPTXAsmPrinter::emitFunctionParamList(const MachineFunction &MF,
+                                            raw_ostream &O) {
+  emitFunctionParamList(MF.getFunction(), O);
+}
+
+// Number the virtual registers of MF per register class (filling
+// VRegMapping) and emit, as raw text, the local stack depot with its %SP and
+// %SPL registers (only when the frame is non-empty) followed by one .reg
+// declaration for every register class that has at least one register.
+void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
+    const MachineFunction &MF) {
+  SmallString<128> Buffer;
+  raw_svector_ostream OS(Buffer);
+
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+  // Emit the Fake Stack Object
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const int StackBytes = (int) MFI.getStackSize();
+  if (StackBytes) {
+    OS << "\t.local .align " << MFI.getMaxAlignment() << " .b8 \t" << DEPOTNAME
+       << getFunctionNumber() << "[" << StackBytes << "];\n";
+    const bool Is64Bit =
+        static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit();
+    if (Is64Bit)
+      OS << "\t.reg .b64 \t%SP;\n"
+         << "\t.reg .b64 \t%SPL;\n";
+    else
+      OS << "\t.reg .b32 \t%SP;\n"
+         << "\t.reg .b32 \t%SPL;\n";
+  }
+
+  // Go through all virtual registers to establish the mapping between the
+  // global virtual register number and the per class virtual register
+  // number, which starts from 1 within each class.
+  // We use the per class virtual register number in the ptx output.
+  for (unsigned Idx = 0, NumVRs = MRI->getNumVirtRegs(); Idx != NumVRs;
+       ++Idx) {
+    const unsigned int VReg = TRI->index2VirtReg(Idx);
+    DenseMap<unsigned, unsigned> &PerClass =
+        VRegMapping[MRI->getRegClass(VReg)];
+    const int Count = PerClass.size();
+    PerClass.insert(std::make_pair(VReg, Count + 1));
+  }
+
+  // Emit register declarations
+  // @TODO: Extract out the real register usage
+  // O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s64 %rd<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .f64 %fd<" << NVPTXNumRegisters << ">;\n";
+
+  // Emit declaration of the virtual registers or 'physical' registers for
+  // each register class
+  for (unsigned ClassIdx = 0, NumClasses = TRI->getNumRegClasses();
+       ClassIdx != NumClasses; ++ClassIdx) {
+    const TargetRegisterClass *RC = TRI->getRegClass(ClassIdx);
+    DenseMap<unsigned, unsigned> &PerClass = VRegMapping[RC];
+    const std::string ClassName = getNVPTXRegClassName(RC);
+    const std::string ClassPrefix = getNVPTXRegClassStr(RC);
+    const int Count = PerClass.size();
+
+    // Only declare those registers that may be used.
+    if (Count)
+      OS << "\t.reg " << ClassName << " \t" << ClassPrefix << "<"
+         << (Count + 1) << ">;\n";
+  }
+
+  OutStreamer->EmitRawText(OS.str());
+}
+
+// Print a float or double constant as its bit pattern in hexadecimal:
+// "0f" followed by 8 hex digits for float, "0d" followed by 16 hex digits
+// for double, padded with leading zeros. Other FP types are not supported.
+void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) {
+  APFloat Value = APFloat(Fp->getValueAPF()); // make a copy
+  bool LosesInfo;
+  unsigned int HexDigits;
+  const char *Prefix;
+
+  switch (Fp->getType()->getTypeID()) {
+  case Type::FloatTyID:
+    HexDigits = 8;
+    Prefix = "0f";
+    Value.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+                  &LosesInfo);
+    break;
+  case Type::DoubleTyID:
+    HexDigits = 16;
+    Prefix = "0d";
+    Value.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
+                  &LosesInfo);
+    break;
+  default:
+    llvm_unreachable("unsupported fp type");
+  }
+
+  const std::string Hex(utohexstr(Value.bitcastToAPInt().getZExtValue()));
+  O << Prefix;
+  // Left-pad with zeros up to the full width of the type.
+  if (Hex.length() < HexDigits)
+    O << std::string(HexDigits - Hex.length(), '0');
+  O << Hex;
+}
+
+/// Print the scalar constant \p CPV to \p O.
+///
+/// Integers print as their value, floating-point constants go through
+/// printFPConstant, and a null pointer prints as "0". A global value prints as
+/// its symbol, wrapped in "generic(...)" when EmitGeneric is set, the value is
+/// not a function, and the pointer is in address space 0. A constant
+/// expression that strips down to a global value is printed the same way; any
+/// other constant expression is printed via lowerConstant().
+void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
+  // Emit the symbol of GV, optionally wrapped in generic(...).
+  auto PrintSymbol = [&](const GlobalValue *GV, bool WrapInGeneric) {
+    if (WrapInGeneric)
+      O << "generic(";
+    getSymbol(GV)->print(O, MAI);
+    if (WrapInGeneric)
+      O << ")";
+  };
+
+  if (const auto *IntC = dyn_cast<ConstantInt>(CPV)) {
+    O << IntC->getValue();
+    return;
+  }
+
+  if (const auto *FPC = dyn_cast<ConstantFP>(CPV)) {
+    printFPConstant(FPC, O);
+    return;
+  }
+
+  if (isa<ConstantPointerNull>(CPV)) {
+    O << "0";
+    return;
+  }
+
+  if (const auto *GV = dyn_cast<GlobalValue>(CPV)) {
+    const bool InGenericSpace = GV->getType()->getAddressSpace() == 0;
+    PrintSymbol(GV, EmitGeneric && !isa<Function>(CPV) && InGenericSpace);
+    return;
+  }
+
+  if (const auto *CE = dyn_cast<ConstantExpr>(CPV)) {
+    const Value *Stripped = CE->stripPointerCasts();
+    // The address space is taken from the expression's own type, i.e. before
+    // pointer casts are stripped.
+    const auto *PTy = dyn_cast<PointerType>(CE->getType());
+    const bool InGenericSpace = !(PTy && PTy->getAddressSpace() != 0);
+    if (const auto *GV = dyn_cast<GlobalValue>(Stripped))
+      PrintSymbol(GV,
+                  EmitGeneric && !isa<Function>(Stripped) && InGenericSpace);
+    else
+      lowerConstant(CPV)->print(O, MAI);
+    return;
+  }
+
+  llvm_unreachable("Not scalar type found in printScalarConstant()");
+}
+
+// These utility functions ensure we get the right sequence of bytes for a
+// given type even on big-endian machines.
+// Store the sizeof(T) low-order bytes of val into p, least significant byte
+// first. p must have room for sizeof(T) bytes.
+template <typename T> static void ConvertIntToBytes(unsigned char *p, T val) {
+  const int64_t Wide = (int64_t)val;
+  for (unsigned Idx = 0; Idx != sizeof(T); ++Idx)
+    p[Idx] = (unsigned char)(Wide >> (8 * Idx));
+}
+// Store the four bytes of val's bit pattern into p, least significant byte
+// first.
+// NOTE(review): the bit pattern is obtained by reading the float through an
+// int32_t pointer, which depends on the compiler tolerating that aliasing.
+static void ConvertFloatToBytes(unsigned char *p, float val) {
+  int32_t *Bits = (int32_t *)&val;
+  unsigned Idx = 0;
+  while (Idx < sizeof(int32_t)) {
+    p[Idx++] = (unsigned char)*Bits;
+    *Bits >>= 8;
+  }
+}
+// Store the eight bytes of val's bit pattern into p, least significant byte
+// first.
+// NOTE(review): the bit pattern is obtained by reading the double through an
+// int64_t pointer, which depends on the compiler tolerating that aliasing.
+static void ConvertDoubleToBytes(unsigned char *p, double val) {
+  int64_t *Bits = (int64_t *)&val;
+  unsigned Idx = 0;
+  while (Idx < sizeof(int64_t)) {
+    p[Idx++] = (unsigned char)*Bits;
+    *Bits >>= 8;
+  }
+}
+
+/// Append the byte image of the constant \p CPV to \p aggBuffer, least
+/// significant byte first.
+///
+/// \param CPV       Constant to serialize. Integer (8/16/32/64 bit), float,
+///                  double, pointer, array, vector and struct constants are
+///                  handled; anything else reaches llvm_unreachable.
+/// \param Bytes     Size in bytes of the slot the value occupies. Scalars
+///                  shorter than the slot are padded with zeros, and
+///                  aggregates get trailing zeros up to this size.
+/// \param aggBuffer Destination buffer.
+///
+/// Pointer constants and ptrtoint expressions that do not fold to an integer
+/// are not written as bytes: the symbol is recorded at the current buffer
+/// position and the slot itself is filled with zeros.
+void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
+                                   AggBuffer *aggBuffer) {
+
+  const DataLayout &DL = getDataLayout();
+
+  // Undef and all-zero values need no conversion: emit zeros covering the
+  // larger of the type's alloc size and the requested slot size.
+  if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
+    int s = DL.getTypeAllocSize(CPV->getType());
+    if (s < Bytes)
+      s = Bytes;
+    aggBuffer->addZeros(s);
+    return;
+  }
+
+  unsigned char ptr[8]; // scratch space for one scalar (at most 8 bytes)
+  switch (CPV->getType()->getTypeID()) {
+
+  case Type::IntegerTyID: {
+    Type *ETy = CPV->getType();
+    if (ETy == Type::getInt8Ty(CPV->getContext())) {
+      unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue();
+      ConvertIntToBytes<>(ptr, c);
+      aggBuffer->addBytes(ptr, 1, Bytes);
+    } else if (ETy == Type::getInt16Ty(CPV->getContext())) {
+      short int16 = (short)cast<ConstantInt>(CPV)->getZExtValue();
+      ConvertIntToBytes<>(ptr, int16);
+      aggBuffer->addBytes(ptr, 2, Bytes);
+    } else if (ETy == Type::getInt32Ty(CPV->getContext())) {
+      if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+        int int32 = (int)(constInt->getZExtValue());
+        ConvertIntToBytes<>(ptr, int32);
+        aggBuffer->addBytes(ptr, 4, Bytes);
+        break;
+      } else if (const auto *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+        // Try to fold the expression down to a plain integer first.
+        if (const auto *constInt = dyn_cast_or_null<ConstantInt>(
+                ConstantFoldConstant(Cexpr, DL))) {
+          int int32 = (int)(constInt->getZExtValue());
+          ConvertIntToBytes<>(ptr, int32);
+          aggBuffer->addBytes(ptr, 4, Bytes);
+          break;
+        }
+        // An address stored in an integer: record the symbol and leave a
+        // zero-filled slot for it.
+        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+          aggBuffer->addSymbol(v, Cexpr->getOperand(0));
+          aggBuffer->addZeros(4);
+          break;
+        }
+      }
+      llvm_unreachable("unsupported integer const type");
+    } else if (ETy == Type::getInt64Ty(CPV->getContext())) {
+      if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+        long long int64 = (long long)(constInt->getZExtValue());
+        ConvertIntToBytes<>(ptr, int64);
+        aggBuffer->addBytes(ptr, 8, Bytes);
+        break;
+      } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+        // Same strategy as the 32-bit case: fold if possible, otherwise
+        // accept a ptrtoint and record the symbol.
+        if (const auto *constInt = dyn_cast_or_null<ConstantInt>(
+                ConstantFoldConstant(Cexpr, DL))) {
+          long long int64 = (long long)(constInt->getZExtValue());
+          ConvertIntToBytes<>(ptr, int64);
+          aggBuffer->addBytes(ptr, 8, Bytes);
+          break;
+        }
+        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+          aggBuffer->addSymbol(v, Cexpr->getOperand(0));
+          aggBuffer->addZeros(8);
+          break;
+        }
+      }
+      llvm_unreachable("unsupported integer const type");
+    } else
+      llvm_unreachable("unsupported integer const type");
+    break;
+  }
+  case Type::FloatTyID:
+  case Type::DoubleTyID: {
+    // cast<> rather than dyn_cast<>: the result is dereferenced
+    // unconditionally, so an unexpected constant kind must trip the cast
+    // assertion instead of becoming a null dereference.
+    const ConstantFP *CFP = cast<ConstantFP>(CPV);
+    Type *Ty = CFP->getType();
+    if (Ty == Type::getFloatTy(CPV->getContext())) {
+      float float32 = (float) CFP->getValueAPF().convertToFloat();
+      ConvertFloatToBytes(ptr, float32);
+      aggBuffer->addBytes(ptr, 4, Bytes);
+    } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+      double float64 = CFP->getValueAPF().convertToDouble();
+      ConvertDoubleToBytes(ptr, float64);
+      aggBuffer->addBytes(ptr, 8, Bytes);
+    } else {
+      llvm_unreachable("unsupported fp const type");
+    }
+    break;
+  }
+  case Type::PointerTyID: {
+    // Record which symbol lives at this position; the bytes themselves are
+    // left as zeros.
+    if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+      aggBuffer->addSymbol(GVar, GVar);
+    } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+      const Value *v = Cexpr->stripPointerCasts();
+      aggBuffer->addSymbol(v, Cexpr);
+    }
+    unsigned int s = DL.getTypeAllocSize(CPV->getType());
+    aggBuffer->addZeros(s);
+    break;
+  }
+
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+  case Type::StructTyID: {
+    if (isa<ConstantAggregate>(CPV) || isa<ConstantDataSequential>(CPV)) {
+      int ElementSize = DL.getTypeAllocSize(CPV->getType());
+      bufferAggregateConstant(CPV, aggBuffer);
+      // Pad out to the slot size requested by the caller.
+      if (Bytes > ElementSize)
+        aggBuffer->addZeros(Bytes - ElementSize);
+    } else if (isa<ConstantAggregateZero>(CPV))
+      aggBuffer->addZeros(Bytes);
+    else
+      llvm_unreachable("Unexpected Constant type");
+    break;
+  }
+
+  default:
+    llvm_unreachable("unsupported type");
+  }
+}
+
+/// Serialize the aggregate constant \p CPV into \p aggBuffer by buffering each
+/// of its elements in order with bufferLEByte().
+///
+/// Array, vector and data-sequential elements are buffered with a slot size
+/// of 0. Struct fields are buffered with a slot size equal to the distance to
+/// the next field's offset (or to the end of the struct for the last field).
+/// Any other constant kind reaches llvm_unreachable.
+void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
+                                              AggBuffer *aggBuffer) {
+  const DataLayout &DL = getDataLayout();
+
+  // Old constants
+  if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
+    for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i)
+      bufferLEByte(cast<Constant>(CPV->getOperand(i)), 0, aggBuffer);
+    return;
+  }
+
+  if (const ConstantDataSequential *CDS =
+          dyn_cast<ConstantDataSequential>(CPV)) {
+    for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i)
+      bufferLEByte(cast<Constant>(CDS->getElementAsConstant(i)), 0,
+                   aggBuffer);
+    return;
+  }
+
+  if (isa<ConstantStruct>(CPV)) {
+    const unsigned NumFields = CPV->getNumOperands();
+    if (NumFields == 0)
+      return;
+
+    StructType *ST = cast<StructType>(CPV->getType());
+    // The layout does not change while iterating, so look it up once.
+    const auto *Layout = DL.getStructLayout(ST);
+    const uint64_t StructEnd =
+        Layout->getElementOffset(0) + DL.getTypeAllocSize(ST);
+
+    for (unsigned i = 0; i != NumFields; ++i) {
+      // A field's slot runs up to the next field's offset, or up to the end
+      // of the struct for the last field.
+      const uint64_t SlotEnd =
+          (i == NumFields - 1) ? StructEnd : Layout->getElementOffset(i + 1);
+      const int Bytes = SlotEnd - Layout->getElementOffset(i);
+      bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, aggBuffer);
+    }
+    return;
+  }
+  llvm_unreachable("unsupported constant type in bufferAggregateConstant()");
+}
+
+// buildTypeNameMap - Run through symbol table looking for type names.
+//
+
+
+/// Return true when the opcode of \p MI is one of the call-sequence,
+/// parameter/return-value, prototype or DBG_VALUE opcodes listed below, and
+/// false for every other opcode.
+/// NOTE(review): presumably consulted when emitting location information for
+/// an instruction -- confirm against the callers.
+bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
+  bool Ignore = false;
+  switch (MI.getOpcode()) {
+  // Call argument passing.
+  case NVPTX::CallArgBeginInst:
+  case NVPTX::CallArgEndInst0:
+  case NVPTX::CallArgEndInst1:
+  case NVPTX::CallArgF32:
+  case NVPTX::CallArgF64:
+  case NVPTX::CallArgI16:
+  case NVPTX::CallArgI32:
+  case NVPTX::CallArgI32imm:
+  case NVPTX::CallArgI64:
+  case NVPTX::CallArgParam:
+  case NVPTX::LastCallArgF32:
+  case NVPTX::LastCallArgF64:
+  case NVPTX::LastCallArgI16:
+  case NVPTX::LastCallArgI32:
+  case NVPTX::LastCallArgI32imm:
+  case NVPTX::LastCallArgI64:
+  case NVPTX::LastCallArgParam:
+  // Call instructions, call sequence end and prototypes.
+  case NVPTX::CallVoidInst:
+  case NVPTX::CallVoidInstReg:
+  case NVPTX::CallVoidInstReg64:
+  case NVPTX::Callseq_End:
+  case NVPTX::PrototypeInst:
+  // Parameter and return value declarations.
+  case NVPTX::DeclareParamInst:
+  case NVPTX::DeclareRetMemInst:
+  case NVPTX::DeclareRetRegInst:
+  case NVPTX::DeclareRetScalarInst:
+  case NVPTX::DeclareScalarParamInst:
+  case NVPTX::DeclareScalarRegInst:
+  // Parameter stores.
+  case NVPTX::StoreParamF32:
+  case NVPTX::StoreParamF64:
+  case NVPTX::StoreParamI16:
+  case NVPTX::StoreParamI32:
+  case NVPTX::StoreParamI64:
+  case NVPTX::StoreParamI8:
+  // Return value stores.
+  case NVPTX::StoreRetvalF32:
+  case NVPTX::StoreRetvalF64:
+  case NVPTX::StoreRetvalI16:
+  case NVPTX::StoreRetvalI32:
+  case NVPTX::StoreRetvalI64:
+  case NVPTX::StoreRetvalI8:
+  // Parameter loads.
+  case NVPTX::LoadParamMemF32:
+  case NVPTX::LoadParamMemF64:
+  case NVPTX::LoadParamMemI16:
+  case NVPTX::LoadParamMemI32:
+  case NVPTX::LoadParamMemI64:
+  case NVPTX::LoadParamMemI8:
+  // Debug values.
+  case NVPTX::DBG_VALUE:
+    Ignore = true;
+    break;
+  default:
+    break;
+  }
+  return Ignore;
+}
+
+/// lowerConstantForGV - Return an MCExpr for the given Constant. This is mostly
+/// a copy from AsmPrinter::lowerConstant, except customized to only handle
+/// expressions that are representable in PTX and create
+/// NVPTXGenericMCSymbolRefExpr nodes for addrspacecast instructions.
+///
+/// \param CV                Constant to lower.
+/// \param ProcessingGeneric When true, a reference to a global value is
+///                          wrapped in an NVPTXGenericMCSymbolRefExpr. It is
+///                          set while lowering the operand of an
+///                          addrspacecast to address space 0.
+/// \returns The lowered expression. An expression that cannot be lowered is
+///          reported through report_fatal_error().
+const MCExpr *
+NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) {
+  MCContext &Ctx = OutContext;
+
+  if (CV->isNullValue() || isa<UndefValue>(CV))
+    return MCConstantExpr::create(0, Ctx);
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
+    return MCConstantExpr::create(CI->getZExtValue(), Ctx);
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+    const MCSymbolRefExpr *Expr =
+        MCSymbolRefExpr::create(getSymbol(GV), Ctx);
+    if (ProcessingGeneric)
+      return NVPTXGenericMCSymbolRefExpr::create(Expr, Ctx);
+    return Expr;
+  }
+
+  const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
+  if (!CE) {
+    llvm_unreachable("Unknown constant value to lower!");
+  }
+
+  // Every supported case returns from inside the switch. Leaving the switch
+  // with 'break' means the expression is unsupported and is reported below.
+  switch (CE->getOpcode()) {
+  default:
+    // If the code isn't optimized, there may be outstanding folding
+    // opportunities. Attempt to fold the expression using DataLayout as a
+    // last resort before giving up.
+    if (Constant *C = ConstantFoldConstant(CE, getDataLayout()))
+      if (C != CE)
+        return lowerConstantForGV(C, ProcessingGeneric);
+    break;
+
+  case Instruction::AddrSpaceCast: {
+    // Strip the addrspacecast and pass along the operand. Only casts to the
+    // generic address space (0) are supported.
+    PointerType *DstTy = cast<PointerType>(CE->getType());
+    if (DstTy->getAddressSpace() == 0)
+      return lowerConstantForGV(cast<const Constant>(CE->getOperand(0)), true);
+    break;
+  }
+
+  case Instruction::GetElementPtr: {
+    const DataLayout &DL = getDataLayout();
+
+    // Generate a symbolic expression for the byte address
+    APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
+    cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI);
+
+    const MCExpr *Base = lowerConstantForGV(CE->getOperand(0),
+                                            ProcessingGeneric);
+    if (!OffsetAI)
+      return Base;
+
+    int64_t Offset = OffsetAI.getSExtValue();
+    return MCBinaryExpr::createAdd(Base, MCConstantExpr::create(Offset, Ctx),
+                                   Ctx);
+  }
+
+  case Instruction::Trunc:
+    // We emit the value and depend on the assembler to truncate the generated
+    // expression properly. This is important for differences between
+    // blockaddress labels. Since the two labels are in the same function, it
+    // is reasonable to treat their delta as a 32-bit value.
+    LLVM_FALLTHROUGH;
+  case Instruction::BitCast:
+    return lowerConstantForGV(CE->getOperand(0), ProcessingGeneric);
+
+  case Instruction::IntToPtr: {
+    const DataLayout &DL = getDataLayout();
+
+    // Handle casts to pointers by changing them into casts to the appropriate
+    // integer type. This promotes constant folding and simplifies this code.
+    Constant *Op = CE->getOperand(0);
+    Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
+                                      false/*ZExt*/);
+    return lowerConstantForGV(Op, ProcessingGeneric);
+  }
+
+  case Instruction::PtrToInt: {
+    const DataLayout &DL = getDataLayout();
+
+    // Support only foldable casts to/from pointers that can be eliminated by
+    // changing the pointer to the appropriately sized integer type.
+    Constant *Op = CE->getOperand(0);
+    Type *Ty = CE->getType();
+
+    const MCExpr *OpExpr = lowerConstantForGV(Op, ProcessingGeneric);
+
+    // We can emit the pointer value into this slot if the slot is an
+    // integer slot equal to the size of the pointer.
+    if (DL.getTypeAllocSize(Ty) == DL.getTypeAllocSize(Op->getType()))
+      return OpExpr;
+
+    // Otherwise the pointer is smaller than the resultant integer, mask off
+    // the high bits so we are sure to get a proper truncation if the input is
+    // a constant expr.
+    unsigned InBits = DL.getTypeAllocSizeInBits(Op->getType());
+    const MCExpr *MaskExpr = MCConstantExpr::create(~0ULL >> (64-InBits), Ctx);
+    return MCBinaryExpr::createAnd(OpExpr, MaskExpr, Ctx);
+  }
+
+  // The MC library also has a right-shift operator, but it isn't consistently
+  // signed or unsigned between different targets.
+  case Instruction::Add: {
+    const MCExpr *LHS = lowerConstantForGV(CE->getOperand(0), ProcessingGeneric);
+    const MCExpr *RHS = lowerConstantForGV(CE->getOperand(1), ProcessingGeneric);
+    return MCBinaryExpr::createAdd(LHS, RHS, Ctx);
+  }
+  }
+
+  // Report the unsupported expression to the user. report_fatal_error() does
+  // not return.
+  std::string S;
+  raw_string_ostream OS(S);
+  OS << "Unsupported expression in static initializer: ";
+  CE->printAsOperand(OS, /*PrintType=*/false,
+                     !MF ? nullptr : MF->getFunction()->getParent());
+  report_fatal_error(OS.str());
+}
+
+// Copy of MCExpr::print customized for NVPTX.
+//
+// Prints Expr to OS. Target expressions print themselves via printImpl(),
+// and the only binary operator supported is Add; any other binary opcode
+// reaches llvm_unreachable.
+void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) {
+  // Print a sub-expression, parenthesized unless the caller deems it trivial.
+  auto PrintOperand = [&](const MCExpr *Sub, bool IsTrivial) {
+    if (!IsTrivial)
+      OS << '(';
+    printMCExpr(*Sub, OS);
+    if (!IsTrivial)
+      OS << ')';
+  };
+
+  switch (Expr.getKind()) {
+  case MCExpr::Target:
+    cast<MCTargetExpr>(&Expr)->printImpl(OS, MAI);
+    return;
+
+  case MCExpr::Constant:
+    OS << cast<MCConstantExpr>(Expr).getValue();
+    return;
+
+  case MCExpr::SymbolRef:
+    cast<MCSymbolRefExpr>(Expr).getSymbol().print(OS, MAI);
+    return;
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr &Unary = cast<MCUnaryExpr>(Expr);
+    switch (Unary.getOpcode()) {
+    case MCUnaryExpr::LNot:  OS << '!'; break;
+    case MCUnaryExpr::Minus: OS << '-'; break;
+    case MCUnaryExpr::Not:   OS << '~'; break;
+    case MCUnaryExpr::Plus:  OS << '+'; break;
+    }
+    printMCExpr(*Unary.getSubExpr(), OS);
+    return;
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr &Bin = cast<MCBinaryExpr>(Expr);
+    const MCExpr *Left = Bin.getLHS();
+    const MCExpr *Right = Bin.getRHS();
+
+    // Only print parens around the LHS if it is non-trivial.
+    PrintOperand(Left, isa<MCConstantExpr>(Left) ||
+                           isa<MCSymbolRefExpr>(Left) ||
+                           isa<NVPTXGenericMCSymbolRefExpr>(Left));
+
+    switch (Bin.getOpcode()) {
+    case MCBinaryExpr::Add:
+      // Print "X-42" instead of "X+-42".
+      if (const MCConstantExpr *RightConst = dyn_cast<MCConstantExpr>(Right)) {
+        if (RightConst->getValue() < 0) {
+          OS << RightConst->getValue();
+          return;
+        }
+      }
+      OS << '+';
+      break;
+    default:
+      llvm_unreachable("Unhandled binary operator");
+    }
+
+    // Only print parens around the RHS if it is non-trivial. Unlike the LHS,
+    // an NVPTXGenericMCSymbolRefExpr is parenthesized here.
+    PrintOperand(Right,
+                 isa<MCConstantExpr>(Right) || isa<MCSymbolRefExpr>(Right));
+    return;
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+/// Returns true on an unknown modifier (more than one modifier character) and
+/// false once the operand has been printed. The single-character modifier 'r'
+/// prints the operand like the unmodified form; any other single character is
+/// forwarded to AsmPrinter::PrintAsmOperand and its result is returned.
+bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode, raw_ostream &O) {
+  const bool HasModifier = ExtraCode && ExtraCode[0];
+  if (HasModifier) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    // Anything other than 'r' may still be a generic print operand.
+    if (ExtraCode[0] != 'r')
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+  }
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+/// Print a memory operand of an inline asm expression as "[<operand>]".
+/// No modifiers are supported: a non-empty \p ExtraCode makes the function
+/// return true without printing anything; otherwise it returns false.
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
+    const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
+    const char *ExtraCode, raw_ostream &O) {
+  const bool HasModifier = ExtraCode && ExtraCode[0];
+  if (HasModifier)
+    return true; // Unknown modifier
+
+  O << '[';
+  printMemOperand(MI, OpNo, O);
+  O << ']';
+  return false;
+}
+
+/// Print operand number \p opNum of \p MI to \p O.
+///
+/// Registers, immediates, floating-point immediates, global addresses and
+/// basic blocks are supported; any other operand type reaches
+/// llvm_unreachable. \p Modifier only affects immediates: a modifier starting
+/// with "vec" is forwarded to printVecModifiedImmediate(), and any other
+/// non-null modifier is rejected.
+void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O, const char *Modifier) {
+  const MachineOperand &Op = MI->getOperand(opNum);
+  switch (Op.getType()) {
+  case MachineOperand::MO_Register: {
+    if (!TargetRegisterInfo::isPhysicalRegister(Op.getReg())) {
+      emitVirtualRegister(Op.getReg(), O);
+      return;
+    }
+    // The VRDepot register prints as DEPOTNAME plus the function number.
+    if (Op.getReg() == NVPTX::VRDepot)
+      O << DEPOTNAME << getFunctionNumber();
+    else
+      O << NVPTXInstPrinter::getRegisterName(Op.getReg());
+    return;
+  }
+
+  case MachineOperand::MO_Immediate:
+    if (!Modifier) {
+      O << Op.getImm();
+      return;
+    }
+    // strstr() returning the start of the string means "vec" is a prefix.
+    if (strstr(Modifier, "vec") != Modifier)
+      llvm_unreachable(
+          "Don't know how to handle modifier on immediate operand");
+    printVecModifiedImmediate(Op, Modifier, O);
+    return;
+
+  case MachineOperand::MO_FPImmediate:
+    printFPConstant(Op.getFPImm(), O);
+    return;
+
+  case MachineOperand::MO_GlobalAddress:
+    getSymbol(Op.getGlobal())->print(O, MAI);
+    return;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    Op.getMBB()->getSymbol()->print(O, MAI);
+    return;
+
+  default:
+    llvm_unreachable("Operand type not supported.");
+  }
+}
+
+/// Print the memory operand made of operands \p opNum (base) and
+/// \p opNum + 1 (offset) of \p MI.
+///
+/// With the "add" modifier the output is "<base>, <offset>". Otherwise it is
+/// "<base>+<offset>", with the offset part omitted when the offset is an
+/// immediate zero.
+void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, O);
+
+  const bool IsAddForm = Modifier && strcmp(Modifier, "add") == 0;
+  if (IsAddForm) {
+    O << ", ";
+    printOperand(MI, opNum + 1, O);
+    return;
+  }
+
+  const MachineOperand &Offset = MI->getOperand(opNum + 1);
+  if (Offset.isImm() && Offset.getImm() == 0)
+    return; // don't print ',0' or '+0'
+
+  O << "+";
+  printOperand(MI, opNum + 1, O);
+}
+
+/// Emit a raw-text comment of the form "\n//<filename>:<line> <text>\n",
+/// where <text> is what the LineReader for \p filename returns for line
+/// \p line.
+void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
+  LineReader *Source = this->getReader(filename);
+
+  std::stringstream Comment;
+  Comment << "\n//" << filename.str() << ":" << line << " ";
+  Comment << Source->readLine(line) << "\n";
+
+  this->OutStreamer->EmitRawText(Comment.str());
+}
+
+/// Return the LineReader for \p filename. A single reader is kept at a time:
+/// it is created on first use and replaced whenever a different file name is
+/// requested.
+LineReader *NVPTXAsmPrinter::getReader(const std::string &filename) {
+  const bool NeedsNewReader = !reader || reader->fileName() != filename;
+  if (NeedsNewReader) {
+    delete reader; // no-op when no reader exists yet
+    reader = new LineReader(filename);
+  }
+  return reader;
+}
+
+/// Return the text of line \p lineNum of the file, where 1 is the first line.
+///
+/// Lines are read sequentially. Asking for a line before the current one
+/// rewinds the file, and asking for the current line again returns the
+/// buffered text. A line longer than the internal buffer is truncated to the
+/// buffer size. Line 0, a line past the end of the file, or a file that could
+/// not be opened all yield an empty string.
+std::string LineReader::readLine(unsigned lineNum) {
+  if (lineNum < theCurLine) {
+    theCurLine = 0;
+    // A previous read may have left eofbit/failbit set (end of file or an
+    // over-long line). seekg() fails while failbit is set, so clear first.
+    fstr.clear();
+    fstr.seekg(0, std::ios::beg);
+  }
+
+  // Line 0 does not exist; without this guard the loop below would not run
+  // and whatever happens to be in buff would be returned.
+  if (lineNum == 0)
+    return std::string();
+
+  while (theCurLine < lineNum) {
+    buff[0] = '\0'; // a failed read must not return the previous line
+    fstr.getline(buff, sizeof(buff));
+
+    // getline() sets failbit when the buffer fills up before the newline.
+    // Keep the truncated text and skip the rest of the line so the next read
+    // starts on the next line. The gcount() test tells this case apart from
+    // other failures (end of file, unopened file), which consume nothing.
+    while (fstr.fail() && !fstr.bad() && !fstr.eof() &&
+           fstr.gcount() == static_cast<std::streamsize>(sizeof(buff) - 1)) {
+      char Discard[sizeof(buff)];
+      fstr.clear();
+      fstr.getline(Discard, sizeof(Discard));
+    }
+    theCurLine++;
+  }
+  return buff;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXAsmPrinter() {
+ RegisterAsmPrinter<NVPTXAsmPrinter> X(getTheNVPTXTarget32());
+ RegisterAsmPrinter<NVPTXAsmPrinter> Y(getTheNVPTXTarget64());
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
new file mode 100644
index 000000000000..3dcc0e358a14
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -0,0 +1,343 @@
+//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H
+
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <fstream>
+
+// The ptx syntax and format are very different from what is usually seen in a
+// .s file, therefore we are not able to use the MCAsmStreamer interface here.
+//
+// We are handcrafting the output method here.
+//
+// A better approach is to clone the MCAsmStreamer to a MCPTXAsmStreamer
+// (subclass of MCStreamer).
+
+namespace llvm {
+ class MCOperand;
+
+/// Reads individual lines of a text file by line number (see readLine()).
+/// The file is opened on construction; no error is reported here if the open
+/// fails.
+class LineReader {
+private:
+  unsigned theCurLine;     // number of lines read from fstr so far
+  std::ifstream fstr;      // the file being read
+  char buff[512];          // text of the most recently read line
+  std::string theFileName; // name the reader was constructed with
+  // Not referenced by any member function of this class.
+  SmallVector<unsigned, 32> lineOffset;
+public:
+  /// Open \p filename for reading, positioned before the first line.
+  LineReader(std::string filename)
+      : theCurLine(0), fstr(filename.c_str()),
+        theFileName(std::move(filename)) {
+    // readLine() returns buff, so make sure it is a valid (empty) string even
+    // before the first line has been read.
+    buff[0] = '\0';
+  }
+  /// Name of the file this reader was constructed with.
+  const std::string &fileName() const { return theFileName; }
+  // No user-defined destructor: std::ifstream closes the file itself.
+  std::string readLine(unsigned line);
+};
+
+class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
+
+ class AggBuffer {
+    // Used to buffer the emitted string for initializing global
+    // aggregates.
+    //
+    // Normally an aggregate (array, vector or structure) is emitted
+    // as a u8[]. However, if one element/field of the aggregate
+    // is a non-NULL address, then the aggregate is emitted as u32[]
+    // or u64[].
+    //
+    // We first layout the aggregate in 'buffer' in bytes, except for
+    // those symbol addresses. For the i-th symbol address in the
+    // aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
+    // are filled with 0s. symbolPosInBuffer[i-1] records its position
+    // in 'buffer', and Symbols[i-1] records the Value*.
+    //
+    // Once we have this AggBuffer setup, we can choose how to print
+    // it out.
+  public:
+    unsigned numSymbols; // number of symbol addresses
+
+  private:
+    const unsigned size; // size of the buffer in bytes
+    std::vector<unsigned char> buffer; // the buffer
+    SmallVector<unsigned, 4> symbolPosInBuffer;
+    SmallVector<const Value *, 4> Symbols;
+    // SymbolsBeforeStripping[i] is the original form of Symbols[i] before
+    // stripping pointer casts, i.e.,
+    // Symbols[i] == SymbolsBeforeStripping[i]->stripPointerCasts().
+    //
+    // We need to keep these values because AggBuffer::print decides whether to
+    // emit a "generic()" cast for Symbols[i] depending on the address space of
+    // SymbolsBeforeStripping[i].
+    SmallVector<const Value *, 4> SymbolsBeforeStripping;
+    unsigned curpos; // next write position in 'buffer'
+    raw_ostream &O;
+    NVPTXAsmPrinter &AP;
+    bool EmitGeneric;
+
+  public:
+    /// Create a zero-initialized buffer of \p size bytes that prints to \p O.
+    AggBuffer(unsigned size, raw_ostream &O, NVPTXAsmPrinter &AP)
+        : numSymbols(0), size(size), buffer(size), curpos(0), O(O), AP(AP),
+          EmitGeneric(AP.EmitGeneric) {}
+
+    /// Append the first \p Num bytes of \p Ptr, then zeros until \p Bytes
+    /// bytes have been written in total (nothing extra if Bytes <= Num).
+    /// Returns the new write position.
+    unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
+      assert((curpos + Num) <= size);
+      assert((curpos + Bytes) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = Ptr[i];
+        curpos++;
+      }
+      for (int i = Num; i < Bytes; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+
+    /// Append \p Num zero bytes. Returns the new write position.
+    unsigned addZeros(int Num) {
+      assert((curpos + Num) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+
+    /// Record that a symbol address lives at the current write position.
+    /// \p GVar is the symbol after stripping pointer casts and
+    /// \p GVarBeforeStripping its original form. No bytes are written; the
+    /// caller reserves the slot separately.
+    void addSymbol(const Value *GVar, const Value *GVarBeforeStripping) {
+      symbolPosInBuffer.push_back(curpos);
+      Symbols.push_back(GVar);
+      SymbolsBeforeStripping.push_back(GVarBeforeStripping);
+      numSymbols++;
+    }
+
+    /// Print the buffer as a comma separated list: single bytes when no
+    /// symbol was recorded, otherwise 4-byte or 8-byte words (depending on
+    /// whether the target is 64-bit) with symbol names in place of the words
+    /// at the recorded symbol positions.
+    void print() {
+      if (numSymbols == 0) {
+        // print out in bytes
+        for (unsigned i = 0; i < size; i++) {
+          if (i)
+            O << ", ";
+          O << (unsigned int) buffer[i];
+        }
+      } else {
+        // print out in 4-bytes or 8-bytes
+        unsigned int pos = 0;
+        unsigned int nSym = 0;
+        unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+        unsigned int nBytes = 4;
+        if (static_cast<const NVPTXTargetMachine &>(AP.TM).is64Bit())
+          nBytes = 8;
+        for (pos = 0; pos < size; pos += nBytes) {
+          if (pos)
+            O << ", ";
+          if (pos == nextSymbolPos) {
+            const Value *v = Symbols[nSym];
+            const Value *v0 = SymbolsBeforeStripping[nSym];
+            if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+              MCSymbol *Name = AP.getSymbol(GVar);
+              PointerType *PTy = dyn_cast<PointerType>(v0->getType());
+              bool IsNonGenericPointer = false; // Is v0 a non-generic pointer?
+              if (PTy && PTy->getAddressSpace() != 0) {
+                IsNonGenericPointer = true;
+              }
+              if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
+                O << "generic(";
+                Name->print(O, AP.MAI);
+                O << ")";
+              } else {
+                Name->print(O, AP.MAI);
+              }
+            } else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) {
+              const MCExpr *Expr =
+                  AP.lowerConstantForGV(cast<Constant>(CExpr), false);
+              AP.printMCExpr(*Expr, O);
+            } else
+              llvm_unreachable("symbol type unknown");
+            nSym++;
+            if (nSym >= numSymbols)
+              nextSymbolPos = size + 1; // past the end: no more symbols
+            else
+              nextSymbolPos = symbolPosInBuffer[nSym];
+          } else {
+            // Assemble the word from its little-endian bytes. Unlike reading
+            // through a casted pointer this needs no particular alignment,
+            // does not depend on the host byte order, and never reads past
+            // the end of the buffer when 'size' is not a multiple of nBytes
+            // (missing bytes count as zero).
+            unsigned long long Word = 0;
+            for (unsigned b = 0; b < nBytes && pos + b < size; ++b)
+              Word |= (unsigned long long)buffer[pos + b] << (8 * b);
+            O << Word;
+          }
+        }
+      }
+    }
+  };
+
+ friend class AggBuffer;
+
+ void emitSrcInText(StringRef filename, unsigned line);
+
+private:
+ StringRef getPassName() const override { return "NVPTX Assembly Printer"; }
+
+ const Function *F;
+ std::string CurrentFnName;
+
+ void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
+ void EmitFunctionEntryLabel() override;
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+ void emitImplicitDef(const MachineInstr *MI) const override;
+
+ void EmitInstruction(const MachineInstr *) override;
+ void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+ MCOperand GetSymbolRef(const MCSymbol *Symbol);
+ unsigned encodeVirtualRegister(unsigned Reg);
+
+ void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
+ raw_ostream &O);
+ void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
+ bool = false);
+ void printParamName(Function::const_arg_iterator I, int paramIndex,
+ raw_ostream &O);
+ void emitGlobals(const Module &M);
+ void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI);
+ void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
+ void emitVirtualRegister(unsigned int vr, raw_ostream &);
+ void emitFunctionParamList(const Function *, raw_ostream &O);
+ void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
+ void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF);
+ void printReturnValStr(const Function *, raw_ostream &O);
+ void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &) override;
+ void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const char *Modifier = nullptr);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &) override;
+
+ const MCExpr *lowerConstantForGV(const Constant *CV, bool ProcessingGeneric);
+ void printMCExpr(const MCExpr &Expr, raw_ostream &OS);
+
+protected:
+ bool doInitialization(Module &M) override;
+ bool doFinalization(Module &M) override;
+
+private:
+ std::string CurrentBankselLabelInBasicBlock;
+
+ bool GlobalsEmitted;
+
+ // This is specific per MachineFunction.
+ const MachineRegisterInfo *MRI;
+ // The contents are specific for each
+ // MachineFunction. But the size of the
+ // array is not.
+ typedef DenseMap<unsigned, unsigned> VRegMap;
+ typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
+ VRegRCMap VRegMapping;
+
+ // Cache the subtarget here.
+ const NVPTXSubtarget *nvptxSubtarget;
+
+ // Build the map between type name and ID based on module's type
+ // symbol table.
+ std::map<Type *, std::string> TypeNameMap;
+
+ // List of variables demoted to a function scope.
+ std::map<const Function *, std::vector<const GlobalVariable *> > localDecls;
+
+ // To record filename to ID mapping
+ std::map<std::string, unsigned> filenameMap;
+ void recordAndEmitFilenames(Module &);
+
+ void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
+ void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
+ std::string getPTXFundamentalTypeStr(Type *Ty, bool = true) const;
+ void printScalarConstant(const Constant *CPV, raw_ostream &O);
+ void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
+ void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+ void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer);
+
+ void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
+ void emitDeclarations(const Module &, raw_ostream &O);
+ void emitDeclaration(const Function *, raw_ostream &O);
+ void emitDemotedVars(const Function *, raw_ostream &);
+
+ bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo,
+ MCOperand &MCOp);
+ void lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp);
+
+ bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const;
+
+ LineReader *reader;
+ LineReader *getReader(const std::string &);
+
+ // Used to control the need to emit .generic() in the initializer of
+ // module scope variables.
+ // Although ptx supports the hybrid mode like the following,
+ // .global .u32 a;
+ // .global .u32 b;
+ // .global .u32 addr[] = {a, generic(b)}
+ // we have difficulty representing the difference in the NVVM IR.
+ //
+ // Since the address value should always be generic in CUDA C and always
+ // be specific in OpenCL, we use this simple control here.
+ //
+ bool EmitGeneric;
+
+public:
+ /// Create the printer for \p TM, taking ownership of \p Streamer. The
+  /// source line reader starts out null, and "generic()" emission is enabled
+  /// exactly when the target machine's driver interface is NVPTX::CUDA.
+  NVPTXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), reader(nullptr),
+        EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() ==
+                    NVPTX::CUDA) {
+    // CurrentBankselLabelInBasicBlock is default-constructed, i.e. empty.
+  }
+
+ /// Release the source line reader, if one was created.
+  ~NVPTXAsmPrinter() override {
+    // The previous guard was inverted ("if (!reader) delete reader;"), so an
+    // existing reader was never freed. Deleting a null pointer is a no-op,
+    // hence no guard is needed at all.
+    delete reader;
+  }
+
+ /// Cache the NVPTX subtarget of \p F, then let the generic AsmPrinter
+  /// process the function; its result is returned unchanged.
+  bool runOnMachineFunction(MachineFunction &F) override {
+    const NVPTXSubtarget &Subtarget = F.getSubtarget<NVPTXSubtarget>();
+    nvptxSubtarget = &Subtarget;
+    return AsmPrinter::runOnMachineFunction(F);
+  }
+ void getAnalysisUsage(AnalysisUsage &AU) const override { // Declare the analyses this pass depends on.
+ AU.addRequired<MachineLoopInfo>(); // MachineLoopInfo is requested in addition to the base class requirements
+ AsmPrinter::getAnalysisUsage(AU); // let the generic AsmPrinter add its own requirements
+ }
+
+ bool ignoreLoc(const MachineInstr &);
+
+ std::string getVirtualRegisterName(unsigned) const;
+
+ DebugLoc prevDebugLoc;
+ void emitLineNumberAsDotLoc(const MachineInstr &);
+};
+} // end of namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
new file mode 100644
index 000000000000..7d4be8e809cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -0,0 +1,84 @@
+//===-- NVPTXAssignValidGlobalNames.cpp - Assign valid names to globals ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Clean up the names of global variables in the module to not contain symbols
+// that are invalid in PTX.
+//
+// Currently NVPTX, like other backends, relies on generic symbol name
+// sanitizing done by MC. However, the ptxas assembler is more stringent and
+// disallows some additional characters in symbol names. This pass makes sure
+// such names do not reach MC at all.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+using namespace llvm;
+
+namespace {
+/// \brief Module pass that rewrites local global-variable names into PTX-safe ones.
+class NVPTXAssignValidGlobalNames : public ModulePass {
+public:
+ static char ID;
+ NVPTXAssignValidGlobalNames() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+
+ /// \brief Return a copy of \p Name with every '.' and '@' replaced by "_$_".
+ std::string cleanUpName(StringRef Name);
+};
+}
+
+char NVPTXAssignValidGlobalNames::ID = 0;
+
+namespace llvm {
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(NVPTXAssignValidGlobalNames, "nvptx-assign-valid-global-names",
+ "Assign valid PTX names to globals", false, false)
+
+/// Renames every local-linkage global variable of \p M to its cleaned-up name.
+/// Always reports the module as modified.
+bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) {
+  for (GlobalVariable &Global : M.globals()) {
+    // Only local symbols may be renamed; everything else is left untouched.
+    if (!Global.hasLocalLinkage())
+      continue;
+
+    // setName does no extra work when the name is unchanged. It also cannot
+    // create collisions: if the requested name already exists, it appends a
+    // suitable postfix on its own.
+    const std::string Cleaned = cleanUpName(Global.getName());
+    Global.setName(Cleaned);
+  }
+
+  return true;
+}
+
+/// Returns a copy of \p Name in which each '.' and each '@' is replaced by the
+/// three-character sequence "_$_"; all other characters are copied unchanged.
+std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) {
+  std::string Cleaned;
+  raw_string_ostream OS(Cleaned);
+  for (char Ch : Name) {
+    const bool NeedsEscape = (Ch == '.' || Ch == '@');
+    if (NeedsEscape)
+      OS << "_$_";
+    else
+      OS << Ch;
+  }
+
+  // str() flushes the stream into Cleaned before the result is returned.
+  return OS.str();
+}
+
+ModulePass *llvm::createNVPTXAssignValidGlobalNamesPass() { // Factory: returns a newly allocated pass instance.
+ return new NVPTXAssignValidGlobalNames();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
new file mode 100644
index 000000000000..6ced2f6967cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -0,0 +1,78 @@
+//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXFrameLowering.h"
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+NVPTXFrameLowering::NVPTXFrameLowering()
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0) {} // Stack grows up; 8 and 0 are presumably stack alignment and local-area offset -- confirm against TargetFrameLowering.
+
+bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } // Always true, regardless of MF.
+
+/// Inserts the frame setup at the top of \p MBB when \p MF has stack objects:
+///   mov        %SPL, %depot;
+///   cvta.local %SP, %SPL;     (only when %SP has uses)
+/// Functions without stack objects get no prologue code.
+void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {
+  if (!MF.getFrameInfo().hasStackObjects())
+    return;
+
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+  // Pick the 32- or 64-bit flavour of both instructions.
+  const bool Is64Bit =
+      static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit();
+  const unsigned MovDepotOpcode =
+      Is64Bit ? NVPTX::MOV_DEPOT_ADDR_64 : NVPTX::MOV_DEPOT_ADDR;
+  const unsigned CvtaLocalOpcode =
+      Is64Bit ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes;
+
+  // These instructions logically precede the first instruction of the block,
+  // so they carry no debug location.
+  DebugLoc NoLoc = DebugLoc();
+
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineInstr *InsertPt = &MBB.front();
+
+  // If %SP is not used, do not bother emitting "cvta.local %SP, %SPL". When it
+  // is emitted, it becomes the insertion point so that the mov lands before it.
+  if (!MRI.use_empty(NVPTX::VRFrame)) {
+    InsertPt = BuildMI(MBB, InsertPt, NoLoc, TII->get(CvtaLocalOpcode),
+                       NVPTX::VRFrame)
+                   .addReg(NVPTX::VRFrameLocal);
+  }
+
+  BuildMI(MBB, InsertPt, NoLoc, TII->get(MovDepotOpcode), NVPTX::VRFrameLocal)
+      .addImm(MF.getFunctionNumber());
+}
+
+void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {} // Empty body: no epilogue code is emitted.
+
+// Removes the ADJCALLSTACKDOWN / ADJCALLSTACKUP pseudo instruction at I and
+// returns the iterator produced by MBB.erase(I).
+MachineBasicBlock::iterator NVPTXFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ // The pseudo instruction is simply discarded; nothing is emitted
+ // in its place.
+ return MBB.erase(I);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
new file mode 100644
index 000000000000..320ca9a2f095
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -0,0 +1,36 @@
+//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class NVPTXSubtarget;
+class NVPTXFrameLowering : public TargetFrameLowering { // NVPTX implementation of TargetFrameLowering.
+public:
+ explicit NVPTXFrameLowering();
+
+ bool hasFP(const MachineFunction &MF) const override;
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override; // Handles the call-frame pseudo instruction at I.
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
new file mode 100644
index 000000000000..390776212ce7
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -0,0 +1,354 @@
+//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Convert generic global variables into either .global or .const access based
+// on the variable's "constant" qualifier.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeGenericToNVVMPass(PassRegistry &);
+}
+
+namespace {
+/// Module pass that clones generic-address-space global variables into the
+/// global address space and rewrites their uses to refer to the clones.
+class GenericToNVVM : public ModulePass {
+public:
+  static char ID;
+
+  GenericToNVVM() : ModulePass(ID) {}
+
+  /// Runs the rewrite on \p M; returns false when no global needed cloning.
+  bool runOnModule(Module &M) override;
+
+  /// Declares no analysis requirements.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+private:
+  /// Maps an original global variable to its global-address-space clone.
+  using GVMapTy = ValueMap<GlobalVariable *, GlobalVariable *>;
+  /// Maps a constant to the value it was remapped to in the current function.
+  using ConstantToValueMapTy = ValueMap<Constant *, Value *>;
+
+  /// Emits the conversion of the cloned global \p GV back to a generic pointer.
+  Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
+                         IRBuilder<> &Builder);
+  /// Returns the replacement for \p C, or \p C itself if nothing changes.
+  Value *remapConstant(Module *M, Function *F, Constant *C,
+                       IRBuilder<> &Builder);
+  /// Rebuilds a constant vector/aggregate whose elements were remapped.
+  Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
+                                                Constant *C,
+                                                IRBuilder<> &Builder);
+  /// Rebuilds a constant expression whose operands were remapped.
+  Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                           IRBuilder<> &Builder);
+
+  GVMapTy GVMap;
+  ConstantToValueMapTy ConstantToValueMap;
+};
+} // end namespace
+
+char GenericToNVVM::ID = 0;
+
+ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); } // Factory: returns a newly allocated pass instance.
+
+INITIALIZE_PASS(
+ GenericToNVVM, "generic-to-nvvm",
+ "Ensure that the global variables are in the global address space", false,
+ false)
+
+bool GenericToNVVM::runOnModule(Module &M) { // Returns false if no global variable needed cloning, true otherwise.
+ // Create a clone of each global variable that has the default address space.
+ // The clone is created with the global address space specifier, and the pair
+ // of original global variable and its clone is placed in the GVMap for later
+ // use. Textures, surfaces, samplers and "llvm."-prefixed globals are skipped.
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E;) {
+ GlobalVariable *GV = &*I++;
+ if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
+ !llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
+ !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getValueType(), GV->isConstant(),
+ GV->getLinkage(),
+ GV->hasInitializer() ? GV->getInitializer() : nullptr,
+ "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
+ NewGV->copyAttributesFrom(GV);
+ GVMap[GV] = NewGV;
+ }
+ }
+
+ // Return immediately, if every global variable has a specific address space
+ // specifier.
+ if (GVMap.empty()) {
+ return false;
+ }
+
+ // Walk through the instructions in function definitions, and replace any use
+ // of original global variables in GVMap with a use of the corresponding
+ // copies in GVMap. If necessary, promote constants to instructions.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ if (I->isDeclaration()) {
+ continue;
+ }
+ IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg());
+ for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE;
+ ++BBI) {
+ for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
+ ++II) {
+ for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) {
+ Value *Operand = II->getOperand(i);
+ if (isa<Constant>(Operand)) {
+ II->setOperand(
+ i, remapConstant(&M, &*I, cast<Constant>(Operand), Builder));
+ }
+ }
+ }
+ }
+ ConstantToValueMap.clear();
+ }
+
+ // Copy GVMap over to a standard value map.
+ ValueToValueMapTy VM;
+ for (auto I = GVMap.begin(), E = GVMap.end(); I != E; ++I)
+ VM[I->first] = I->second;
+
+ // Walk through the global variable initializers, and replace any use of
+ // original global variables in GVMap with a use of the corresponding copies
+ // in GVMap. The copies need to be bitcast to the original global variable
+ // types, as we cannot use cvta in global variable initializers.
+ for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) {
+ GlobalVariable *GV = I->first;
+ GlobalVariable *NewGV = I->second;
+
+ // Remove GV from the map so that it can be RAUWed. Note that
+ // DenseMap::erase() won't invalidate any iterators but this one.
+ auto Next = std::next(I);
+ GVMap.erase(I);
+ I = Next;
+
+ Constant *BitCastNewGV = ConstantExpr::getPointerCast(NewGV, GV->getType());
+ // At this point, the remaining uses of GV should be found only in global
+ // variable initializers, as other uses have already been removed
+ // while walking through the instructions in function definitions.
+ GV->replaceAllUsesWith(BitCastNewGV);
+ std::string Name = GV->getName();
+ GV->eraseFromParent();
+ NewGV->setName(Name);
+ }
+ assert(GVMap.empty() && "Expected it to be empty by now");
+
+ return true;
+}
+
+/// Emits, through \p Builder, the conversion of the global-address-space
+/// variable \p GV to a generic-address-space pointer to its value type, using
+/// the nvvm_ptr_global_to_gen intrinsic declared in \p M.
+///
+/// When the value type is integer or floating point the intrinsic is applied
+/// to \p GV directly. Otherwise the pointer is first bitcast to an i8 pointer,
+/// converted, and bitcast back to a pointer to the value type.
+Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
+                                      GlobalVariable *GV,
+                                      IRBuilder<> &Builder) {
+  PointerType *GlobalPtrTy = GV->getType();
+  const EVT ValueVT = EVT::getEVT(GV->getValueType(), true);
+  const bool NeedsI8Bitcast =
+      !ValueVT.isInteger() && !ValueVT.isFloatingPoint();
+
+  if (!NeedsI8Bitcast) {
+    // A simple CVTA is enough.
+    Type *Overloads[] = {
+        PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC),
+        GlobalPtrTy};
+    Function *CvtaFn = Intrinsic::getDeclaration(
+        M, Intrinsic::nvvm_ptr_global_to_gen, Overloads);
+    return Builder.CreateCall(CvtaFn, GV, "cvta");
+  }
+
+  // Step 1: bitcast the operand to i8 addrspace(n)*.
+  LLVMContext &Ctx = M->getContext();
+  Type *I8SrcPtrTy =
+      PointerType::get(Type::getInt8Ty(Ctx), GlobalPtrTy->getAddressSpace());
+  Value *Result = Builder.CreateBitCast(GV, I8SrcPtrTy, "cvta");
+
+  // Step 2: insert the address space conversion.
+  Type *I8GenericPtrTy =
+      PointerType::get(Type::getInt8Ty(Ctx), llvm::ADDRESS_SPACE_GENERIC);
+  Function *CvtaFn = Intrinsic::getDeclaration(
+      M, Intrinsic::nvvm_ptr_global_to_gen, {I8GenericPtrTy, I8SrcPtrTy});
+  Result = Builder.CreateCall(CvtaFn, Result, "cvta");
+
+  // Step 3: bitcast from i8 * back to a generic pointer to the value type.
+  Type *GenericPtrTy =
+      PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC);
+  return Builder.CreateBitCast(Result, GenericPtrTy, "cvta");
+}
+
+/// Returns the value that must replace \p C inside function \p F, or \p C
+/// itself when nothing in it refers to a global variable in GVMap. Results are
+/// memoized in ConstantToValueMap.
+Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
+                                    IRBuilder<> &Builder) {
+  // Reuse the value computed earlier for C in this function, if any.
+  ConstantToValueMapTy::iterator Cached = ConstantToValueMap.find(C);
+  if (Cached != ConstantToValueMap.end())
+    return Cached->second;
+
+  Value *Remapped = C;
+  if (auto *GV = dyn_cast<GlobalVariable>(C)) {
+    // A global variable found in GVMap is replaced by a set of instructions
+    // that convert its global-address-space clone to a generic pointer. C
+    // itself cannot be used, as it will eventually be erased from the module,
+    // and the clone cannot be used directly either, as that would affect the
+    // types of other instructions in the function.
+    GVMapTy::iterator Entry = GVMap.find(GV);
+    if (Entry != GVMap.end())
+      Remapped = getOrInsertCVTA(M, F, Entry->second, Builder);
+  } else if (isa<ConstantAggregate>(C)) {
+    // If any element is or uses a global variable in GVMap, the vector or
+    // aggregate has to be reconstructed with instructions.
+    Remapped = remapConstantVectorOrConstantAggregate(M, F, C, Builder);
+  } else if (auto *Expr = dyn_cast<ConstantExpr>(C)) {
+    // Likewise for a constant expression whose operands are affected.
+    Remapped = remapConstantExpr(M, F, Expr, Builder);
+  }
+
+  ConstantToValueMap[C] = Remapped;
+  return Remapped;
+}
+
+/// Remaps every element of the constant vector or aggregate \p C. Returns
+/// \p C unchanged when no element was altered; otherwise rebuilds an
+/// equivalent value from undef with insertelement (vectors) or insertvalue
+/// (other aggregates) instructions.
+Value *GenericToNVVM::remapConstantVectorOrConstantAggregate(
+    Module *M, Function *F, Constant *C, IRBuilder<> &Builder) {
+  const unsigned NumElts = C->getNumOperands();
+  SmallVector<Value *, 4> Remapped;
+  bool AnyChanged = false;
+
+  // Remap each element and note whether any of them became a different value.
+  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+    Value *Old = C->getOperand(Idx);
+    Value *New = remapConstant(M, F, cast<Constant>(Old), Builder);
+    if (New != Old)
+      AnyChanged = true;
+    Remapped.push_back(New);
+  }
+
+  if (!AnyChanged)
+    return C;
+
+  // Something changed: assemble the value element by element.
+  Value *Result = UndefValue::get(C->getType());
+  const bool IsVector = isa<ConstantVector>(C);
+  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+    if (IsVector) {
+      Value *Lane = ConstantInt::get(Type::getInt32Ty(M->getContext()), Idx);
+      Result = Builder.CreateInsertElement(Result, Remapped[Idx], Lane);
+    } else {
+      Result =
+          Builder.CreateInsertValue(Result, Remapped[Idx], makeArrayRef(Idx));
+    }
+  }
+
+  return Result;
+}
+
+/// Remaps the operands of the constant expression \p C. Returns \p C itself
+/// when no operand changed; otherwise emits, through \p Builder, the
+/// instruction equivalent to \p C applied to the remapped operands and
+/// returns it.
+Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                                        IRBuilder<> &Builder) {
+  bool OperandChanged = false;
+  SmallVector<Value *, 4> NewOperands;
+  unsigned NumOperands = C->getNumOperands();
+
+  // Check if any operand is or uses a global variable in GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = C->getOperand(i);
+    Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the operands has been modified, return C as it is.
+  if (!OperandChanged) {
+    return C;
+  }
+
+  // If any of the operands has been modified, construct the instruction with
+  // the converted operands.
+  unsigned Opcode = C->getOpcode();
+  switch (Opcode) {
+  case Instruction::ICmp:
+    // CompareConstantExpr (icmp)
+    return Builder.CreateICmp(CmpInst::Predicate(C->getPredicate()),
+                              NewOperands[0], NewOperands[1]);
+  case Instruction::FCmp:
+    // CompareConstantExpr (fcmp)
+    llvm_unreachable("Address space conversion should have no effect "
+                     "on float point CompareConstantExpr (fcmp)!");
+  case Instruction::ExtractElement:
+    // ExtractElementConstantExpr
+    return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]);
+  case Instruction::InsertElement:
+    // InsertElementConstantExpr
+    return Builder.CreateInsertElement(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ShuffleVector:
+    // ShuffleVector
+    return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ExtractValue:
+    // ExtractValueConstantExpr
+    return Builder.CreateExtractValue(NewOperands[0], C->getIndices());
+  case Instruction::InsertValue:
+    // InsertValueConstantExpr
+    return Builder.CreateInsertValue(NewOperands[0], NewOperands[1],
+                                     C->getIndices());
+  case Instruction::GetElementPtr:
+    // GetElementPtrConstantExpr
+    // Preserve the inbounds flag of the original expression. The two arms
+    // used to be swapped, which dropped inbounds where it was present and,
+    // worse, added it to expressions that never carried it.
+    return cast<GEPOperator>(C)->isInBounds()
+               ? Builder.CreateInBoundsGEP(
+                     cast<GEPOperator>(C)->getSourceElementType(),
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1))
+               : Builder.CreateGEP(
+                     cast<GEPOperator>(C)->getSourceElementType(),
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1));
+  case Instruction::Select:
+    // SelectConstantExpr
+    return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]);
+  default:
+    // BinaryConstantExpr
+    if (Instruction::isBinaryOp(Opcode)) {
+      return Builder.CreateBinOp(Instruction::BinaryOps(C->getOpcode()),
+                                 NewOperands[0], NewOperands[1]);
+    }
+    // UnaryConstantExpr
+    if (Instruction::isCast(Opcode)) {
+      return Builder.CreateCast(Instruction::CastOps(C->getOpcode()),
+                                NewOperands[0], C->getType());
+    }
+    llvm_unreachable("GenericToNVVM encountered an unsupported ConstantExpr");
+  }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
new file mode 100644
index 000000000000..43c478f4212f
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -0,0 +1,5259 @@
+//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelDAGToDAG.h"
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-isel"
+
+static cl::opt<int> UsePrecDivF32(
+ "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
+ " IEEE Compliant F32 div.rnd if available."),
+ cl::init(2)); // Initial value 2; consulted by getDivF32Level().
+
+static cl::opt<bool>
+UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden,
+ cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+ cl::init(true)); // Initial value true; consulted by usePrecSqrtF32().
+
+static cl::opt<bool>
+FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
+ cl::init(false)); // Initial value false; consulted by useF32FTZ().
+
+
+/// createNVPTXISelDag - Allocate the pass that converts a legalized DAG into
+/// a NVPTX-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
+ llvm::CodeGenOpt::Level OptLevel) {
+ return new NVPTXDAGToDAGISel(TM, OptLevel);
+}
+
+NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+ CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), TM(tm) {
+ doMulWide = (OptLevel > 0); // Set for any optimization level above 0.
+}
+
+bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { // Caches MF's subtarget, then defers to SelectionDAGISel.
+ Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
+ return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+/// Returns the f32 division level to use. Per the option's description text:
+/// 0 = div.approx, 1 = div.full, 2 = IEEE-compliant div.rnd.
+int NVPTXDAGToDAGISel::getDivF32Level() const {
+  // An explicit nvptx-prec-divf32=N on the command line is always honored.
+  if (UsePrecDivF32.getNumOccurrences() > 0)
+    return UsePrecDivF32;
+
+  // Otherwise use div.approx when fast math is enabled, and level 2 if not.
+  return TM.Options.UnsafeFPMath ? 0 : 2;
+}
+
+/// Tells whether the precise f32 square root (sqrt.rn) is used rather than
+/// sqrt.approx.
+bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
+  // An explicit nvptx-prec-sqrtf32 on the command line is always honored.
+  if (UsePrecSqrtF32.getNumOccurrences() > 0)
+    return UsePrecSqrtF32;
+
+  // Otherwise fall back to sqrt.approx only when fast math is enabled.
+  return !TM.Options.UnsafeFPMath;
+}
+
+/// Tells whether f32 subnormals are flushed to zero for the current function.
+bool NVPTXDAGToDAGISel::useF32FTZ() const {
+  // An explicit nvptx-f32ftz on the command line is always honored.
+  if (FtzEnabled.getNumOccurrences() > 0)
+    return FtzEnabled;
+
+  // Otherwise the function's "nvptx-f32ftz" attribute decides; a function
+  // without the attribute does not flush.
+  const Function *Fn = MF->getFunction();
+  if (!Fn->hasFnAttribute("nvptx-f32ftz"))
+    return false;
+
+  return Fn->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
+}
+
+/// Forwards to the subtarget's target lowering, which decides from the
+/// current machine function and optimization level.
+bool NVPTXDAGToDAGISel::allowFMA() const {
+  return Subtarget->getTargetLowering()->allowFMA(*MF, OptLevel);
+}
+
+/// Select - Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+void NVPTXDAGToDAGISel::Select(SDNode *N) {
+
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
+ return; // Already selected.
+ }
+
+ switch (N->getOpcode()) {
+ case ISD::LOAD:
+ if (tryLoad(N))
+ return;
+ break;
+ case ISD::STORE:
+ if (tryStore(N))
+ return;
+ break;
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LoadV4:
+ if (tryLoadVector(N))
+ return;
+ break;
+ case NVPTXISD::LDGV2:
+ case NVPTXISD::LDGV4:
+ case NVPTXISD::LDUV2:
+ case NVPTXISD::LDUV4:
+ if (tryLDGLDU(N))
+ return;
+ break;
+ case NVPTXISD::StoreV2:
+ case NVPTXISD::StoreV4:
+ if (tryStoreVector(N))
+ return;
+ break;
+ case NVPTXISD::LoadParam:
+ case NVPTXISD::LoadParamV2:
+ case NVPTXISD::LoadParamV4:
+ if (tryLoadParam(N))
+ return;
+ break;
+ case NVPTXISD::StoreRetval:
+ case NVPTXISD::StoreRetvalV2:
+ case NVPTXISD::StoreRetvalV4:
+ if (tryStoreRetval(N))
+ return;
+ break;
+ case NVPTXISD::StoreParam:
+ case NVPTXISD::StoreParamV2:
+ case NVPTXISD::StoreParamV4:
+ case NVPTXISD::StoreParamS32:
+ case NVPTXISD::StoreParamU32:
+ if (tryStoreParam(N))
+ return;
+ break;
+ case ISD::INTRINSIC_WO_CHAIN:
+ if (tryIntrinsicNoChain(N))
+ return;
+ break;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (tryIntrinsicChain(N))
+ return;
+ break;
+ case NVPTXISD::Tex1DFloatS32:
+ case NVPTXISD::Tex1DFloatFloat:
+ case NVPTXISD::Tex1DFloatFloatLevel:
+ case NVPTXISD::Tex1DFloatFloatGrad:
+ case NVPTXISD::Tex1DS32S32:
+ case NVPTXISD::Tex1DS32Float:
+ case NVPTXISD::Tex1DS32FloatLevel:
+ case NVPTXISD::Tex1DS32FloatGrad:
+ case NVPTXISD::Tex1DU32S32:
+ case NVPTXISD::Tex1DU32Float:
+ case NVPTXISD::Tex1DU32FloatLevel:
+ case NVPTXISD::Tex1DU32FloatGrad:
+ case NVPTXISD::Tex1DArrayFloatS32:
+ case NVPTXISD::Tex1DArrayFloatFloat:
+ case NVPTXISD::Tex1DArrayFloatFloatLevel:
+ case NVPTXISD::Tex1DArrayFloatFloatGrad:
+ case NVPTXISD::Tex1DArrayS32S32:
+ case NVPTXISD::Tex1DArrayS32Float:
+ case NVPTXISD::Tex1DArrayS32FloatLevel:
+ case NVPTXISD::Tex1DArrayS32FloatGrad:
+ case NVPTXISD::Tex1DArrayU32S32:
+ case NVPTXISD::Tex1DArrayU32Float:
+ case NVPTXISD::Tex1DArrayU32FloatLevel:
+ case NVPTXISD::Tex1DArrayU32FloatGrad:
+ case NVPTXISD::Tex2DFloatS32:
+ case NVPTXISD::Tex2DFloatFloat:
+ case NVPTXISD::Tex2DFloatFloatLevel:
+ case NVPTXISD::Tex2DFloatFloatGrad:
+ case NVPTXISD::Tex2DS32S32:
+ case NVPTXISD::Tex2DS32Float:
+ case NVPTXISD::Tex2DS32FloatLevel:
+ case NVPTXISD::Tex2DS32FloatGrad:
+ case NVPTXISD::Tex2DU32S32:
+ case NVPTXISD::Tex2DU32Float:
+ case NVPTXISD::Tex2DU32FloatLevel:
+ case NVPTXISD::Tex2DU32FloatGrad:
+ case NVPTXISD::Tex2DArrayFloatS32:
+ case NVPTXISD::Tex2DArrayFloatFloat:
+ case NVPTXISD::Tex2DArrayFloatFloatLevel:
+ case NVPTXISD::Tex2DArrayFloatFloatGrad:
+ case NVPTXISD::Tex2DArrayS32S32:
+ case NVPTXISD::Tex2DArrayS32Float:
+ case NVPTXISD::Tex2DArrayS32FloatLevel:
+ case NVPTXISD::Tex2DArrayS32FloatGrad:
+ case NVPTXISD::Tex2DArrayU32S32:
+ case NVPTXISD::Tex2DArrayU32Float:
+ case NVPTXISD::Tex2DArrayU32FloatLevel:
+ case NVPTXISD::Tex2DArrayU32FloatGrad:
+ case NVPTXISD::Tex3DFloatS32:
+ case NVPTXISD::Tex3DFloatFloat:
+ case NVPTXISD::Tex3DFloatFloatLevel:
+ case NVPTXISD::Tex3DFloatFloatGrad:
+ case NVPTXISD::Tex3DS32S32:
+ case NVPTXISD::Tex3DS32Float:
+ case NVPTXISD::Tex3DS32FloatLevel:
+ case NVPTXISD::Tex3DS32FloatGrad:
+ case NVPTXISD::Tex3DU32S32:
+ case NVPTXISD::Tex3DU32Float:
+ case NVPTXISD::Tex3DU32FloatLevel:
+ case NVPTXISD::Tex3DU32FloatGrad:
+ case NVPTXISD::TexCubeFloatFloat:
+ case NVPTXISD::TexCubeFloatFloatLevel:
+ case NVPTXISD::TexCubeS32Float:
+ case NVPTXISD::TexCubeS32FloatLevel:
+ case NVPTXISD::TexCubeU32Float:
+ case NVPTXISD::TexCubeU32FloatLevel:
+ case NVPTXISD::TexCubeArrayFloatFloat:
+ case NVPTXISD::TexCubeArrayFloatFloatLevel:
+ case NVPTXISD::TexCubeArrayS32Float:
+ case NVPTXISD::TexCubeArrayS32FloatLevel:
+ case NVPTXISD::TexCubeArrayU32Float:
+ case NVPTXISD::TexCubeArrayU32FloatLevel:
+ case NVPTXISD::Tld4R2DFloatFloat:
+ case NVPTXISD::Tld4G2DFloatFloat:
+ case NVPTXISD::Tld4B2DFloatFloat:
+ case NVPTXISD::Tld4A2DFloatFloat:
+ case NVPTXISD::Tld4R2DS64Float:
+ case NVPTXISD::Tld4G2DS64Float:
+ case NVPTXISD::Tld4B2DS64Float:
+ case NVPTXISD::Tld4A2DS64Float:
+ case NVPTXISD::Tld4R2DU64Float:
+ case NVPTXISD::Tld4G2DU64Float:
+ case NVPTXISD::Tld4B2DU64Float:
+ case NVPTXISD::Tld4A2DU64Float:
+ case NVPTXISD::TexUnified1DFloatS32:
+ case NVPTXISD::TexUnified1DFloatFloat:
+ case NVPTXISD::TexUnified1DFloatFloatLevel:
+ case NVPTXISD::TexUnified1DFloatFloatGrad:
+ case NVPTXISD::TexUnified1DS32S32:
+ case NVPTXISD::TexUnified1DS32Float:
+ case NVPTXISD::TexUnified1DS32FloatLevel:
+ case NVPTXISD::TexUnified1DS32FloatGrad:
+ case NVPTXISD::TexUnified1DU32S32:
+ case NVPTXISD::TexUnified1DU32Float:
+ case NVPTXISD::TexUnified1DU32FloatLevel:
+ case NVPTXISD::TexUnified1DU32FloatGrad:
+ case NVPTXISD::TexUnified1DArrayFloatS32:
+ case NVPTXISD::TexUnified1DArrayFloatFloat:
+ case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+ case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+ case NVPTXISD::TexUnified1DArrayS32S32:
+ case NVPTXISD::TexUnified1DArrayS32Float:
+ case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+ case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+ case NVPTXISD::TexUnified1DArrayU32S32:
+ case NVPTXISD::TexUnified1DArrayU32Float:
+ case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+ case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+ case NVPTXISD::TexUnified2DFloatS32:
+ case NVPTXISD::TexUnified2DFloatFloat:
+ case NVPTXISD::TexUnified2DFloatFloatLevel:
+ case NVPTXISD::TexUnified2DFloatFloatGrad:
+ case NVPTXISD::TexUnified2DS32S32:
+ case NVPTXISD::TexUnified2DS32Float:
+ case NVPTXISD::TexUnified2DS32FloatLevel:
+ case NVPTXISD::TexUnified2DS32FloatGrad:
+ case NVPTXISD::TexUnified2DU32S32:
+ case NVPTXISD::TexUnified2DU32Float:
+ case NVPTXISD::TexUnified2DU32FloatLevel:
+ case NVPTXISD::TexUnified2DU32FloatGrad:
+ case NVPTXISD::TexUnified2DArrayFloatS32:
+ case NVPTXISD::TexUnified2DArrayFloatFloat:
+ case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+ case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+ case NVPTXISD::TexUnified2DArrayS32S32:
+ case NVPTXISD::TexUnified2DArrayS32Float:
+ case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+ case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+ case NVPTXISD::TexUnified2DArrayU32S32:
+ case NVPTXISD::TexUnified2DArrayU32Float:
+ case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+ case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+ case NVPTXISD::TexUnified3DFloatS32:
+ case NVPTXISD::TexUnified3DFloatFloat:
+ case NVPTXISD::TexUnified3DFloatFloatLevel:
+ case NVPTXISD::TexUnified3DFloatFloatGrad:
+ case NVPTXISD::TexUnified3DS32S32:
+ case NVPTXISD::TexUnified3DS32Float:
+ case NVPTXISD::TexUnified3DS32FloatLevel:
+ case NVPTXISD::TexUnified3DS32FloatGrad:
+ case NVPTXISD::TexUnified3DU32S32:
+ case NVPTXISD::TexUnified3DU32Float:
+ case NVPTXISD::TexUnified3DU32FloatLevel:
+ case NVPTXISD::TexUnified3DU32FloatGrad:
+ case NVPTXISD::TexUnifiedCubeFloatFloat:
+ case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+ case NVPTXISD::TexUnifiedCubeS32Float:
+ case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+ case NVPTXISD::TexUnifiedCubeU32Float:
+ case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+ case NVPTXISD::TexUnifiedCubeArrayS32Float:
+ case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+ case NVPTXISD::TexUnifiedCubeArrayU32Float:
+ case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+ case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+ case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+ case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+ case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+ case NVPTXISD::Tld4UnifiedR2DS64Float:
+ case NVPTXISD::Tld4UnifiedG2DS64Float:
+ case NVPTXISD::Tld4UnifiedB2DS64Float:
+ case NVPTXISD::Tld4UnifiedA2DS64Float:
+ case NVPTXISD::Tld4UnifiedR2DU64Float:
+ case NVPTXISD::Tld4UnifiedG2DU64Float:
+ case NVPTXISD::Tld4UnifiedB2DU64Float:
+ case NVPTXISD::Tld4UnifiedA2DU64Float:
+ if (tryTextureIntrinsic(N))
+ return;
+ break;
+ case NVPTXISD::Suld1DI8Clamp:
+ case NVPTXISD::Suld1DI16Clamp:
+ case NVPTXISD::Suld1DI32Clamp:
+ case NVPTXISD::Suld1DI64Clamp:
+ case NVPTXISD::Suld1DV2I8Clamp:
+ case NVPTXISD::Suld1DV2I16Clamp:
+ case NVPTXISD::Suld1DV2I32Clamp:
+ case NVPTXISD::Suld1DV2I64Clamp:
+ case NVPTXISD::Suld1DV4I8Clamp:
+ case NVPTXISD::Suld1DV4I16Clamp:
+ case NVPTXISD::Suld1DV4I32Clamp:
+ case NVPTXISD::Suld1DArrayI8Clamp:
+ case NVPTXISD::Suld1DArrayI16Clamp:
+ case NVPTXISD::Suld1DArrayI32Clamp:
+ case NVPTXISD::Suld1DArrayI64Clamp:
+ case NVPTXISD::Suld1DArrayV2I8Clamp:
+ case NVPTXISD::Suld1DArrayV2I16Clamp:
+ case NVPTXISD::Suld1DArrayV2I32Clamp:
+ case NVPTXISD::Suld1DArrayV2I64Clamp:
+ case NVPTXISD::Suld1DArrayV4I8Clamp:
+ case NVPTXISD::Suld1DArrayV4I16Clamp:
+ case NVPTXISD::Suld1DArrayV4I32Clamp:
+ case NVPTXISD::Suld2DI8Clamp:
+ case NVPTXISD::Suld2DI16Clamp:
+ case NVPTXISD::Suld2DI32Clamp:
+ case NVPTXISD::Suld2DI64Clamp:
+ case NVPTXISD::Suld2DV2I8Clamp:
+ case NVPTXISD::Suld2DV2I16Clamp:
+ case NVPTXISD::Suld2DV2I32Clamp:
+ case NVPTXISD::Suld2DV2I64Clamp:
+ case NVPTXISD::Suld2DV4I8Clamp:
+ case NVPTXISD::Suld2DV4I16Clamp:
+ case NVPTXISD::Suld2DV4I32Clamp:
+ case NVPTXISD::Suld2DArrayI8Clamp:
+ case NVPTXISD::Suld2DArrayI16Clamp:
+ case NVPTXISD::Suld2DArrayI32Clamp:
+ case NVPTXISD::Suld2DArrayI64Clamp:
+ case NVPTXISD::Suld2DArrayV2I8Clamp:
+ case NVPTXISD::Suld2DArrayV2I16Clamp:
+ case NVPTXISD::Suld2DArrayV2I32Clamp:
+ case NVPTXISD::Suld2DArrayV2I64Clamp:
+ case NVPTXISD::Suld2DArrayV4I8Clamp:
+ case NVPTXISD::Suld2DArrayV4I16Clamp:
+ case NVPTXISD::Suld2DArrayV4I32Clamp:
+ case NVPTXISD::Suld3DI8Clamp:
+ case NVPTXISD::Suld3DI16Clamp:
+ case NVPTXISD::Suld3DI32Clamp:
+ case NVPTXISD::Suld3DI64Clamp:
+ case NVPTXISD::Suld3DV2I8Clamp:
+ case NVPTXISD::Suld3DV2I16Clamp:
+ case NVPTXISD::Suld3DV2I32Clamp:
+ case NVPTXISD::Suld3DV2I64Clamp:
+ case NVPTXISD::Suld3DV4I8Clamp:
+ case NVPTXISD::Suld3DV4I16Clamp:
+ case NVPTXISD::Suld3DV4I32Clamp:
+ case NVPTXISD::Suld1DI8Trap:
+ case NVPTXISD::Suld1DI16Trap:
+ case NVPTXISD::Suld1DI32Trap:
+ case NVPTXISD::Suld1DI64Trap:
+ case NVPTXISD::Suld1DV2I8Trap:
+ case NVPTXISD::Suld1DV2I16Trap:
+ case NVPTXISD::Suld1DV2I32Trap:
+ case NVPTXISD::Suld1DV2I64Trap:
+ case NVPTXISD::Suld1DV4I8Trap:
+ case NVPTXISD::Suld1DV4I16Trap:
+ case NVPTXISD::Suld1DV4I32Trap:
+ case NVPTXISD::Suld1DArrayI8Trap:
+ case NVPTXISD::Suld1DArrayI16Trap:
+ case NVPTXISD::Suld1DArrayI32Trap:
+ case NVPTXISD::Suld1DArrayI64Trap:
+ case NVPTXISD::Suld1DArrayV2I8Trap:
+ case NVPTXISD::Suld1DArrayV2I16Trap:
+ case NVPTXISD::Suld1DArrayV2I32Trap:
+ case NVPTXISD::Suld1DArrayV2I64Trap:
+ case NVPTXISD::Suld1DArrayV4I8Trap:
+ case NVPTXISD::Suld1DArrayV4I16Trap:
+ case NVPTXISD::Suld1DArrayV4I32Trap:
+ case NVPTXISD::Suld2DI8Trap:
+ case NVPTXISD::Suld2DI16Trap:
+ case NVPTXISD::Suld2DI32Trap:
+ case NVPTXISD::Suld2DI64Trap:
+ case NVPTXISD::Suld2DV2I8Trap:
+ case NVPTXISD::Suld2DV2I16Trap:
+ case NVPTXISD::Suld2DV2I32Trap:
+ case NVPTXISD::Suld2DV2I64Trap:
+ case NVPTXISD::Suld2DV4I8Trap:
+ case NVPTXISD::Suld2DV4I16Trap:
+ case NVPTXISD::Suld2DV4I32Trap:
+ case NVPTXISD::Suld2DArrayI8Trap:
+ case NVPTXISD::Suld2DArrayI16Trap:
+ case NVPTXISD::Suld2DArrayI32Trap:
+ case NVPTXISD::Suld2DArrayI64Trap:
+ case NVPTXISD::Suld2DArrayV2I8Trap:
+ case NVPTXISD::Suld2DArrayV2I16Trap:
+ case NVPTXISD::Suld2DArrayV2I32Trap:
+ case NVPTXISD::Suld2DArrayV2I64Trap:
+ case NVPTXISD::Suld2DArrayV4I8Trap:
+ case NVPTXISD::Suld2DArrayV4I16Trap:
+ case NVPTXISD::Suld2DArrayV4I32Trap:
+ case NVPTXISD::Suld3DI8Trap:
+ case NVPTXISD::Suld3DI16Trap:
+ case NVPTXISD::Suld3DI32Trap:
+ case NVPTXISD::Suld3DI64Trap:
+ case NVPTXISD::Suld3DV2I8Trap:
+ case NVPTXISD::Suld3DV2I16Trap:
+ case NVPTXISD::Suld3DV2I32Trap:
+ case NVPTXISD::Suld3DV2I64Trap:
+ case NVPTXISD::Suld3DV4I8Trap:
+ case NVPTXISD::Suld3DV4I16Trap:
+ case NVPTXISD::Suld3DV4I32Trap:
+ case NVPTXISD::Suld1DI8Zero:
+ case NVPTXISD::Suld1DI16Zero:
+ case NVPTXISD::Suld1DI32Zero:
+ case NVPTXISD::Suld1DI64Zero:
+ case NVPTXISD::Suld1DV2I8Zero:
+ case NVPTXISD::Suld1DV2I16Zero:
+ case NVPTXISD::Suld1DV2I32Zero:
+ case NVPTXISD::Suld1DV2I64Zero:
+ case NVPTXISD::Suld1DV4I8Zero:
+ case NVPTXISD::Suld1DV4I16Zero:
+ case NVPTXISD::Suld1DV4I32Zero:
+ case NVPTXISD::Suld1DArrayI8Zero:
+ case NVPTXISD::Suld1DArrayI16Zero:
+ case NVPTXISD::Suld1DArrayI32Zero:
+ case NVPTXISD::Suld1DArrayI64Zero:
+ case NVPTXISD::Suld1DArrayV2I8Zero:
+ case NVPTXISD::Suld1DArrayV2I16Zero:
+ case NVPTXISD::Suld1DArrayV2I32Zero:
+ case NVPTXISD::Suld1DArrayV2I64Zero:
+ case NVPTXISD::Suld1DArrayV4I8Zero:
+ case NVPTXISD::Suld1DArrayV4I16Zero:
+ case NVPTXISD::Suld1DArrayV4I32Zero:
+ case NVPTXISD::Suld2DI8Zero:
+ case NVPTXISD::Suld2DI16Zero:
+ case NVPTXISD::Suld2DI32Zero:
+ case NVPTXISD::Suld2DI64Zero:
+ case NVPTXISD::Suld2DV2I8Zero:
+ case NVPTXISD::Suld2DV2I16Zero:
+ case NVPTXISD::Suld2DV2I32Zero:
+ case NVPTXISD::Suld2DV2I64Zero:
+ case NVPTXISD::Suld2DV4I8Zero:
+ case NVPTXISD::Suld2DV4I16Zero:
+ case NVPTXISD::Suld2DV4I32Zero:
+ case NVPTXISD::Suld2DArrayI8Zero:
+ case NVPTXISD::Suld2DArrayI16Zero:
+ case NVPTXISD::Suld2DArrayI32Zero:
+ case NVPTXISD::Suld2DArrayI64Zero:
+ case NVPTXISD::Suld2DArrayV2I8Zero:
+ case NVPTXISD::Suld2DArrayV2I16Zero:
+ case NVPTXISD::Suld2DArrayV2I32Zero:
+ case NVPTXISD::Suld2DArrayV2I64Zero:
+ case NVPTXISD::Suld2DArrayV4I8Zero:
+ case NVPTXISD::Suld2DArrayV4I16Zero:
+ case NVPTXISD::Suld2DArrayV4I32Zero:
+ case NVPTXISD::Suld3DI8Zero:
+ case NVPTXISD::Suld3DI16Zero:
+ case NVPTXISD::Suld3DI32Zero:
+ case NVPTXISD::Suld3DI64Zero:
+ case NVPTXISD::Suld3DV2I8Zero:
+ case NVPTXISD::Suld3DV2I16Zero:
+ case NVPTXISD::Suld3DV2I32Zero:
+ case NVPTXISD::Suld3DV2I64Zero:
+ case NVPTXISD::Suld3DV4I8Zero:
+ case NVPTXISD::Suld3DV4I16Zero:
+ case NVPTXISD::Suld3DV4I32Zero:
+ if (trySurfaceIntrinsic(N))
+ return;
+ break;
+ case ISD::AND:
+ case ISD::SRA:
+ case ISD::SRL:
+ // Try to select BFE
+ if (tryBFE(N))
+ return;
+ break;
+ case ISD::ADDRSPACECAST:
+ SelectAddrSpaceCast(N);
+ return;
+ default:
+ break;
+ }
+ SelectCode(N);
+}
+
+/// Custom selection hook for intrinsic nodes whose intrinsic ID is stored in
+/// operand 1. Only the nvvm.ldg.global.* and nvvm.ldu.global.* intrinsics are
+/// recognized; they are handed to tryLDGLDU().
+///
+/// \returns the result of tryLDGLDU() for those intrinsics, and false for
+/// every other intrinsic ID.
+bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
+  const unsigned IntrinsicID =
+      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+
+  switch (IntrinsicID) {
+  case Intrinsic::nvvm_ldg_global_f:
+  case Intrinsic::nvvm_ldg_global_i:
+  case Intrinsic::nvvm_ldg_global_p:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_p:
+    return tryLDGLDU(N);
+  default:
+    break;
+  }
+
+  // Not an intrinsic we select by hand.
+  return false;
+}
+
+/// Translates the IR address space of the pointer behind \p N's memory operand
+/// into the matching NVPTX::PTXLdStInstCode address-space code.
+///
+/// Falls back to GENERIC when the memory operand has no IR value, when that
+/// value is not of pointer type, or when the address space is not one of the
+/// cases listed below.
+static unsigned int getCodeAddrSpace(MemSDNode *N) {
+  const Value *MemValue = N->getMemOperand()->getValue();
+  if (MemValue == nullptr)
+    return NVPTX::PTXLdStInstCode::GENERIC;
+
+  auto *PtrTy = dyn_cast<PointerType>(MemValue->getType());
+  if (PtrTy == nullptr)
+    return NVPTX::PTXLdStInstCode::GENERIC;
+
+  switch (PtrTy->getAddressSpace()) {
+  case llvm::ADDRESS_SPACE_LOCAL:
+    return NVPTX::PTXLdStInstCode::LOCAL;
+  case llvm::ADDRESS_SPACE_GLOBAL:
+    return NVPTX::PTXLdStInstCode::GLOBAL;
+  case llvm::ADDRESS_SPACE_SHARED:
+    return NVPTX::PTXLdStInstCode::SHARED;
+  case llvm::ADDRESS_SPACE_GENERIC:
+    return NVPTX::PTXLdStInstCode::GENERIC;
+  case llvm::ADDRESS_SPACE_PARAM:
+    return NVPTX::PTXLdStInstCode::PARAM;
+  case llvm::ADDRESS_SPACE_CONST:
+    return NVPTX::PTXLdStInstCode::CONSTANT;
+  default:
+    // Unknown address space: treat it as generic.
+    return NVPTX::PTXLdStInstCode::GENERIC;
+  }
+}
+
+/// Decides whether the memory access \p N may be emitted as ldg (i.e.
+/// ld.global.nc), which is used for invariant loads from the global address
+/// space.
+///
+/// A load qualifies when the subtarget has ldg, the access is to the global
+/// address space, and the load is invariant. Invariance is either explicit
+/// (the node is marked invariant) or inferred. Inference currently only
+/// covers kernel functions in which every underlying object of the pointer is
+/// a function argument that is noalias (i.e. __restrict) and never written to.
+///
+/// TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
+/// not during the SelectionDAG phase).
+///
+/// TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
+/// explicitly invariant loads because these are how clang tells us to use ldg
+/// when the user uses a builtin.
+static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
+                          unsigned CodeAddrSpace, MachineFunction *F) {
+  if (!Subtarget.hasLDG())
+    return false;
+  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
+    return false;
+
+  // Explicitly invariant loads need no further analysis.
+  if (N->isInvariant())
+    return true;
+
+  // From here on we try to infer invariance, which is only done for kernels.
+  if (!isKernelFunction(*F->getFunction()))
+    return false;
+
+  // GetUnderlyingObjects() is used rather than GetUnderlyingObject() mainly
+  // because it looks through phi nodes, which is required to handle pointer
+  // induction variables.
+  SmallVector<Value *, 8> UnderlyingObjs;
+  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
+                       UnderlyingObjs, F->getDataLayout());
+
+  for (Value *Underlying : UnderlyingObjs) {
+    auto *Arg = dyn_cast<const Argument>(Underlying);
+    const bool IsReadOnlyNoAliasArg =
+        Arg && Arg->onlyReadsMemory() && Arg->hasNoAliasAttr();
+    if (!IsReadOnlyNoAliasArg)
+      return false;
+  }
+
+  return true;
+}
+
+/// Custom selection hook for intrinsic nodes whose intrinsic ID is stored in
+/// operand 0. Only nvvm.texsurf.handle.internal is handled here.
+///
+/// \returns true if the node was selected, false for any other intrinsic ID.
+bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
+  const unsigned IntrinsicID =
+      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+
+  if (IntrinsicID != Intrinsic::nvvm_texsurf_handle_internal)
+    return false;
+
+  SelectTexSurfHandle(N);
+  return true;
+}
+
+/// Replaces a texture/surface handle intrinsic node with a texsurf_handles
+/// machine node producing an i64.
+///
+/// Operand 0 of \p N is the intrinsic ID and operand 1 is a wrapper node; the
+/// wrapper's operand 0 is passed to the machine node as its only operand.
+void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
+  SDValue GlobalVal = N->getOperand(1).getOperand(0);
+  SDNode *HandleNode = CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
+                                              MVT::i64, GlobalVal);
+  ReplaceNode(N, HandleNode);
+}
+
+/// Selects an ISD::ADDRSPACECAST node into the matching cvta / cvta.to
+/// machine instruction (32- or 64-bit variant depending on the target).
+///
+/// Only casts that have the generic address space on one side are supported:
+///  - specific -> generic: global, shared, const and local sources;
+///  - generic -> specific: global, shared, const, local and param
+///    destinations.
+/// Anything else is reported through report_fatal_error().
+void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+  SDValue Src = N->getOperand(0);
+  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
+  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
+  unsigned DstAddrSpace = CastN->getDestAddressSpace();
+
+  assert(SrcAddrSpace != DstAddrSpace &&
+         "addrspacecast must be between different address spaces");
+
+  const bool Is64Bit = TM.is64Bit();
+  unsigned Opc;
+
+  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
+    // Specific to generic
+    switch (SrcAddrSpace) {
+    default: report_fatal_error("Bad address space in addrspacecast");
+    case ADDRESS_SPACE_GLOBAL:
+      Opc = Is64Bit ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
+      break;
+    case ADDRESS_SPACE_SHARED:
+      Opc = Is64Bit ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes;
+      break;
+    case ADDRESS_SPACE_CONST:
+      Opc = Is64Bit ? NVPTX::cvta_const_yes_64 : NVPTX::cvta_const_yes;
+      break;
+    case ADDRESS_SPACE_LOCAL:
+      Opc = Is64Bit ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes;
+      break;
+    }
+  } else {
+    // Generic to specific. The source has to be the generic address space;
+    // a direct cast between two specific address spaces is not supported.
+    if (SrcAddrSpace != ADDRESS_SPACE_GENERIC)
+      report_fatal_error("Cannot cast between two non-generic address spaces");
+    switch (DstAddrSpace) {
+    default: report_fatal_error("Bad address space in addrspacecast");
+    case ADDRESS_SPACE_GLOBAL:
+      Opc = Is64Bit ? NVPTX::cvta_to_global_yes_64 : NVPTX::cvta_to_global_yes;
+      break;
+    case ADDRESS_SPACE_SHARED:
+      Opc = Is64Bit ? NVPTX::cvta_to_shared_yes_64 : NVPTX::cvta_to_shared_yes;
+      break;
+    case ADDRESS_SPACE_CONST:
+      Opc = Is64Bit ? NVPTX::cvta_to_const_yes_64 : NVPTX::cvta_to_const_yes;
+      break;
+    case ADDRESS_SPACE_LOCAL:
+      Opc = Is64Bit ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
+      break;
+    case ADDRESS_SPACE_PARAM:
+      Opc = Is64Bit ? NVPTX::nvvm_ptr_gen_to_param_64
+                    : NVPTX::nvvm_ptr_gen_to_param;
+      break;
+    }
+  }
+
+  // Both directions produce a single-operand conversion of the source pointer.
+  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
+                                        Src));
+}
+
+/// Picks, from one family of scalar load opcodes (same addressing mode, one
+/// opcode per result type), the opcode that matches \p VT.
+///
+/// \returns true and stores the choice in \p Opcode when \p VT is one of
+/// i8/i16/i32/i64/f32/f64; returns false and leaves \p Opcode untouched for
+/// any other type.
+static bool pickScalarLoadOpcode(MVT::SimpleValueType VT, unsigned OpcI8,
+                                 unsigned OpcI16, unsigned OpcI32,
+                                 unsigned OpcI64, unsigned OpcF32,
+                                 unsigned OpcF64, unsigned &Opcode) {
+  switch (VT) {
+  case MVT::i8:
+    Opcode = OpcI8;
+    return true;
+  case MVT::i16:
+    Opcode = OpcI16;
+    return true;
+  case MVT::i32:
+    Opcode = OpcI32;
+    return true;
+  case MVT::i64:
+    Opcode = OpcI64;
+    return true;
+  case MVT::f32:
+    Opcode = OpcF32;
+    return true;
+  case MVT::f64:
+    Opcode = OpcF64;
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Selects an ISD::LOAD node into one of the NVPTX LD_* machine instructions.
+///
+/// Loads that canLowerToLDG() accepts are forwarded to tryLDGLDU(). Otherwise
+/// the address is matched, in order, against the direct-address (avar),
+/// symbol+immediate (asi), register+immediate (ari) and plain register (areg)
+/// forms, and the opcode for the load's result type is chosen from the
+/// matching family.
+///
+/// \returns true if \p N was replaced; false if the load is indexed, has a
+/// non-simple memory type, is a vector of other than 2 or 4 elements, or has
+/// a result type without a matching LD_* opcode.
+bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
+  SDLoc dl(N);
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  EVT LoadedVT = LD->getMemoryVT();
+
+  // do not support pre/post inc/dec
+  if (LD->isIndexed())
+    return false;
+
+  if (!LoadedVT.isSimple())
+    return false;
+
+  // Address Space Setting
+  unsigned int codeAddrSpace = getCodeAddrSpace(LD);
+
+  if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF))
+    return tryLDGLDU(N);
+
+  // Volatile Setting
+  // - the volatile flag is only kept for global, shared and generic accesses;
+  //   it is dropped for every other address space.
+  bool isVolatile = LD->isVolatile();
+  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    isVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = LoadedVT.getSimpleVT();
+  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+  if (SimpleVT.isVector()) {
+    unsigned num = SimpleVT.getVectorNumElements();
+    if (num == 2)
+      vecType = NVPTX::PTXLdStInstCode::V2;
+    else if (num == 4)
+      vecType = NVPTX::PTXLdStInstCode::V4;
+    else
+      return false;
+  }
+
+  // Type Setting: fromType + fromTypeWidth
+  //
+  // Sign   : ISD::SEXTLOAD
+  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //          type is integer
+  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+  MVT ScalarVT = SimpleVT.getScalarType();
+  // Read at least 8 bits (predicates are stored as 8-bit values)
+  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
+  unsigned int fromType;
+  if (LD->getExtensionType() == ISD::SEXTLOAD)
+    fromType = NVPTX::PTXLdStInstCode::Signed;
+  else if (ScalarVT.isFloatingPoint())
+    fromType = NVPTX::PTXLdStInstCode::Float;
+  else
+    fromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  // Create the machine instruction DAG
+  SDValue Chain = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue Addr;
+  SDValue Offset, Base;
+  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
+  const bool Is64Bit = TM.is64Bit();
+
+  // Match the address against each addressing form in turn. Every branch
+  // records the opcode for TargetVT and the address operand(s) of that form.
+  unsigned Opcode = 0;
+  bool HaveOpcode = false;
+  SmallVector<SDValue, 2> AddrOps;
+
+  if (SelectDirectAddr(N1, Addr)) {
+    HaveOpcode = pickScalarLoadOpcode(
+        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
+        NVPTX::LD_i64_avar, NVPTX::LD_f32_avar, NVPTX::LD_f64_avar, Opcode);
+    AddrOps.push_back(Addr);
+  } else if (Is64Bit ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
+                     : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
+    HaveOpcode = pickScalarLoadOpcode(
+        TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, NVPTX::LD_i32_asi,
+        NVPTX::LD_i64_asi, NVPTX::LD_f32_asi, NVPTX::LD_f64_asi, Opcode);
+    AddrOps.push_back(Base);
+    AddrOps.push_back(Offset);
+  } else if (Is64Bit ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
+                     : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
+    if (Is64Bit)
+      HaveOpcode = pickScalarLoadOpcode(
+          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
+          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f32_ari_64,
+          NVPTX::LD_f64_ari_64, Opcode);
+    else
+      HaveOpcode = pickScalarLoadOpcode(
+          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
+          NVPTX::LD_i64_ari, NVPTX::LD_f32_ari, NVPTX::LD_f64_ari, Opcode);
+    AddrOps.push_back(Base);
+    AddrOps.push_back(Offset);
+  } else {
+    // Fallback: use the address value itself as a register operand.
+    if (Is64Bit)
+      HaveOpcode = pickScalarLoadOpcode(
+          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
+          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f32_areg_64,
+          NVPTX::LD_f64_areg_64, Opcode);
+    else
+      HaveOpcode = pickScalarLoadOpcode(
+          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
+          NVPTX::LD_i64_areg, NVPTX::LD_f32_areg, NVPTX::LD_f64_areg, Opcode);
+    AddrOps.push_back(N1);
+  }
+
+  // Unsupported result type for the matched addressing form.
+  if (!HaveOpcode)
+    return false;
+
+  // Operand layout shared by all forms: five immediate modifiers, then the
+  // address operand(s), then the chain.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(getI32Imm(isVolatile, dl));
+  Ops.push_back(getI32Imm(codeAddrSpace, dl));
+  Ops.push_back(getI32Imm(vecType, dl));
+  Ops.push_back(getI32Imm(fromType, dl));
+  Ops.push_back(getI32Imm(fromTypeWidth, dl));
+  Ops.append(AddrOps.begin(), AddrOps.end());
+  Ops.push_back(Chain);
+
+  SDNode *NVPTXLD =
+      CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+
+  // Carry the original memory operand over to the machine node.
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  ReplaceNode(N, NVPTXLD);
+  return true;
+}
+
+/// Marker for "this family has no opcode for that element type" when calling
+/// pickVectorLoadOpcode().
+static const unsigned NoVectorLoadOpcode = ~0U;
+
+/// Picks, from one family of vector load opcodes (same addressing mode and
+/// vector width, one opcode per element type), the opcode matching \p EltVT.
+///
+/// Pass NoVectorLoadOpcode for element types the family does not provide.
+///
+/// \returns true and stores the choice in \p Opcode on success; returns false
+/// and leaves \p Opcode untouched when \p EltVT is not one of
+/// i8/i16/i32/i64/f32/f64 or the family has no opcode for it.
+static bool pickVectorLoadOpcode(EVT EltVT, unsigned OpcI8, unsigned OpcI16,
+                                 unsigned OpcI32, unsigned OpcI64,
+                                 unsigned OpcF32, unsigned OpcF64,
+                                 unsigned &Opcode) {
+  unsigned Chosen;
+  switch (EltVT.getSimpleVT().SimpleTy) {
+  default:
+    return false;
+  case MVT::i8:
+    Chosen = OpcI8;
+    break;
+  case MVT::i16:
+    Chosen = OpcI16;
+    break;
+  case MVT::i32:
+    Chosen = OpcI32;
+    break;
+  case MVT::i64:
+    Chosen = OpcI64;
+    break;
+  case MVT::f32:
+    Chosen = OpcF32;
+    break;
+  case MVT::f64:
+    Chosen = OpcF64;
+    break;
+  }
+  if (Chosen == NoVectorLoadOpcode)
+    return false;
+  Opcode = Chosen;
+  return true;
+}
+
+/// Selects an NVPTXISD::LoadV2 / NVPTXISD::LoadV4 node into one of the NVPTX
+/// LDV_* machine instructions.
+///
+/// Loads that canLowerToLDG() accepts are forwarded to tryLDGLDU(). Otherwise
+/// the address is matched, in order, against the direct-address (avar),
+/// symbol+immediate (asi), register+immediate (ari) and plain register (areg)
+/// forms, and the opcode for the element type is chosen from the matching
+/// family. The V4 families have no i64/f64 opcodes.
+///
+/// \returns true if \p N was replaced; false if the memory type is not
+/// simple, the node is neither LoadV2 nor LoadV4, or no opcode exists for the
+/// element type.
+bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Addr, Offset, Base;
+  SDLoc DL(N);
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  EVT LoadedVT = MemSD->getMemoryVT();
+
+  if (!LoadedVT.isSimple())
+    return false;
+
+  // Address Space Setting
+  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
+
+  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF))
+    return tryLDGLDU(N);
+
+  // Volatile Setting
+  // - the volatile flag is only kept for global, shared and generic accesses;
+  //   it is dropped for every other address space.
+  bool IsVolatile = MemSD->isVolatile();
+  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    IsVolatile = false;
+
+  MVT SimpleVT = LoadedVT.getSimpleVT();
+
+  // Type Setting: fromType + fromTypeWidth
+  //
+  // Sign   : ISD::SEXTLOAD
+  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //          type is integer
+  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+  MVT ScalarVT = SimpleVT.getScalarType();
+  // Read at least 8 bits (predicates are stored as 8-bit values)
+  unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
+  unsigned int FromType;
+  // The last operand holds the original LoadSDNode::getExtensionType() value
+  unsigned ExtensionType = cast<ConstantSDNode>(
+      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
+  if (ExtensionType == ISD::SEXTLOAD)
+    FromType = NVPTX::PTXLdStInstCode::Signed;
+  else if (ScalarVT.isFloatingPoint())
+    FromType = NVPTX::PTXLdStInstCode::Float;
+  else
+    FromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  // Vector Setting: only two- and four-element loads are handled.
+  unsigned VecType;
+  bool IsV2;
+  switch (N->getOpcode()) {
+  case NVPTXISD::LoadV2:
+    VecType = NVPTX::PTXLdStInstCode::V2;
+    IsV2 = true;
+    break;
+  case NVPTXISD::LoadV4:
+    VecType = NVPTX::PTXLdStInstCode::V4;
+    IsV2 = false;
+    break;
+  default:
+    return false;
+  }
+
+  EVT EltVT = N->getValueType(0);
+  const bool Is64Bit = TM.is64Bit();
+
+  // Match the address against each addressing form in turn. Every branch
+  // records the opcode for EltVT and the address operand(s) of that form.
+  unsigned Opcode = 0;
+  bool HaveOpcode = false;
+  SmallVector<SDValue, 2> AddrOps;
+
+  if (SelectDirectAddr(Op1, Addr)) {
+    HaveOpcode =
+        IsV2 ? pickVectorLoadOpcode(
+                   EltVT, NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
+                   NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
+                   NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar, Opcode)
+             : pickVectorLoadOpcode(
+                   EltVT, NVPTX::LDV_i8_v4_avar, NVPTX::LDV_i16_v4_avar,
+                   NVPTX::LDV_i32_v4_avar, NoVectorLoadOpcode,
+                   NVPTX::LDV_f32_v4_avar, NoVectorLoadOpcode, Opcode);
+    AddrOps.push_back(Addr);
+  } else if (Is64Bit ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
+                     : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
+    HaveOpcode =
+        IsV2 ? pickVectorLoadOpcode(
+                   EltVT, NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
+                   NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
+                   NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi, Opcode)
+             : pickVectorLoadOpcode(
+                   EltVT, NVPTX::LDV_i8_v4_asi, NVPTX::LDV_i16_v4_asi,
+                   NVPTX::LDV_i32_v4_asi, NoVectorLoadOpcode,
+                   NVPTX::LDV_f32_v4_asi, NoVectorLoadOpcode, Opcode);
+    AddrOps.push_back(Base);
+    AddrOps.push_back(Offset);
+  } else if (Is64Bit ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+                     : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+    if (Is64Bit)
+      HaveOpcode =
+          IsV2 ? pickVectorLoadOpcode(
+                     EltVT, NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
+                     NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
+                     NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64,
+                     Opcode)
+               : pickVectorLoadOpcode(
+                     EltVT, NVPTX::LDV_i8_v4_ari_64, NVPTX::LDV_i16_v4_ari_64,
+                     NVPTX::LDV_i32_v4_ari_64, NoVectorLoadOpcode,
+                     NVPTX::LDV_f32_v4_ari_64, NoVectorLoadOpcode, Opcode);
+    else
+      HaveOpcode =
+          IsV2 ? pickVectorLoadOpcode(
+                     EltVT, NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
+                     NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
+                     NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari, Opcode)
+               : pickVectorLoadOpcode(
+                     EltVT, NVPTX::LDV_i8_v4_ari, NVPTX::LDV_i16_v4_ari,
+                     NVPTX::LDV_i32_v4_ari, NoVectorLoadOpcode,
+                     NVPTX::LDV_f32_v4_ari, NoVectorLoadOpcode, Opcode);
+    AddrOps.push_back(Base);
+    AddrOps.push_back(Offset);
+  } else {
+    // Fallback: use the address value itself as a register operand.
+    if (Is64Bit)
+      HaveOpcode =
+          IsV2 ? pickVectorLoadOpcode(
+                     EltVT, NVPTX::LDV_i8_v2_areg_64,
+                     NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
+                     NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
+                     NVPTX::LDV_f64_v2_areg_64, Opcode)
+               : pickVectorLoadOpcode(
+                     EltVT, NVPTX::LDV_i8_v4_areg_64,
+                     NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64,
+                     NoVectorLoadOpcode, NVPTX::LDV_f32_v4_areg_64,
+                     NoVectorLoadOpcode, Opcode);
+    else
+      HaveOpcode =
+          IsV2 ? pickVectorLoadOpcode(
+                     EltVT, NVPTX::LDV_i8_v2_areg, NVPTX::LDV_i16_v2_areg,
+                     NVPTX::LDV_i32_v2_areg, NVPTX::LDV_i64_v2_areg,
+                     NVPTX::LDV_f32_v2_areg, NVPTX::LDV_f64_v2_areg, Opcode)
+               : pickVectorLoadOpcode(
+                     EltVT, NVPTX::LDV_i8_v4_areg, NVPTX::LDV_i16_v4_areg,
+                     NVPTX::LDV_i32_v4_areg, NoVectorLoadOpcode,
+                     NVPTX::LDV_f32_v4_areg, NoVectorLoadOpcode, Opcode);
+    AddrOps.push_back(Op1);
+  }
+
+  // Unsupported element type for the matched addressing form.
+  if (!HaveOpcode)
+    return false;
+
+  // Operand layout shared by all forms: five immediate modifiers, then the
+  // address operand(s), then the chain.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(getI32Imm(IsVolatile, DL));
+  Ops.push_back(getI32Imm(CodeAddrSpace, DL));
+  Ops.push_back(getI32Imm(VecType, DL));
+  Ops.push_back(getI32Imm(FromType, DL));
+  Ops.push_back(getI32Imm(FromTypeWidth, DL));
+  Ops.append(AddrOps.begin(), AddrOps.end());
+  Ops.push_back(Chain);
+
+  SDNode *LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+
+  // Carry the original memory operand over to the machine node.
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  ReplaceNode(N, LD);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Op1;
+ MemSDNode *Mem;
+ bool IsLDG = true;
+
+ // If this is an LDG intrinsic, the address is the third operand. If its an
+ // LDG/LDU SD node (from custom vector handling), then its the second operand
+ if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ Op1 = N->getOperand(2);
+ Mem = cast<MemIntrinsicSDNode>(N);
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IID) {
+ default:
+ return false;
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_p:
+ IsLDG = true;
+ break;
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_p:
+ IsLDG = false;
+ break;
+ }
+ } else {
+ Op1 = N->getOperand(1);
+ Mem = cast<MemSDNode>(N);
+ }
+
+ unsigned Opcode;
+ SDLoc DL(N);
+ SDNode *LD;
+ SDValue Base, Offset, Addr;
+
+ EVT EltVT = Mem->getMemoryVT();
+ unsigned NumElts = 1;
+ if (EltVT.isVector()) {
+ NumElts = EltVT.getVectorNumElements();
+ EltVT = EltVT.getVectorElementType();
+ }
+
+ // Build the "promoted" result VTList for the load. If we are really loading
+ // i8s, then the return type will be promoted to i16 since we do not expose
+ // 8-bit registers in NVPTX.
+ EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
+ SmallVector<EVT, 5> InstVTs;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ InstVTs.push_back(NodeVT);
+ }
+ InstVTs.push_back(MVT::Other);
+ SDVTList InstVTList = CurDAG->getVTList(InstVTs);
+
+ if (SelectDirectAddr(Op1, Addr)) {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar;
+ break;
+ }
+ break;
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar;
+ break;
+ }
+ break;
+ }
+
+ SDValue Ops[] = { Addr, Chain };
+ LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+ } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+ : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+ if (TM.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::LOAD:
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::LOAD:
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32;
+ break;
+ }
+ break;
+ }
+ }
+
+ SDValue Ops[] = { Base, Offset, Chain };
+
+ LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+ } else {
+ if (TM.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::LOAD:
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::LOAD:
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32;
+ break;
+ }
+ break;
+ }
+ }
+
+ SDValue Ops[] = { Op1, Chain };
+ LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+ }
+
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = Mem->getMemOperand();
+ cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+ // For automatic generation of LDG (through SelectLoad[Vector], not the
+ // intrinsics), we may have an extending load like:
+ //
+ // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
+ //
+ // In this case, the matching logic above will select a load for the original
+ // memory type (in this case, i8) and our types will not match (the node needs
+ // to return an i32 in this case). Our LDG/LDU nodes do not support the
+ // concept of sign-/zero-extension, so emulate it here by adding an explicit
+ // CVT instruction. Ptxas should clean up any redundancies here.
+
+ EVT OrigType = N->getValueType(0);
+ LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
+
+ if (OrigType != EltVT && LdNode) {
+ // We have an extending-load. The instruction we selected operates on the
+ // smaller type, but the SDNode we are replacing has the larger type. We
+ // need to emit a CVT to make the types match.
+ bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
+ unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
+ EltVT.getSimpleVT(), IsSigned);
+
+ // For each output value, apply the manual sign/zero-extension and make sure
+ // all users of the load go through that CVT.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue Res(LD, i);
+ SDValue OrigVal(N, i);
+
+ SDNode *CvtNode =
+ CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
+ CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+ DL, MVT::i32));
+ ReplaceUses(OrigVal, SDValue(CvtNode, 0));
+ }
+ }
+
+ ReplaceNode(N, LD);
+ return true;
+}
+
+bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { // Selects an NVPTX ST_* machine node for store N and replaces N with it; returns false when nothing was selected.
+ SDLoc dl(N);
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ EVT StoreVT = ST->getMemoryVT();
+ SDNode *NVPTXST = nullptr;
+
+ // Indexed (pre/post increment/decrement) stores are not supported.
+ if (ST->isIndexed())
+ return false;
+
+ if (!StoreVT.isSimple()) // only simple memory value types are handled
+ return false;
+
+ // Address Space Setting
+ unsigned int codeAddrSpace = getCodeAddrSpace(ST);
+
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared; the check below also leaves it set for the generic space
+ bool isVolatile = ST->isVolatile();
+ if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ isVolatile = false;
+
+ // Vector Setting
+ MVT SimpleVT = StoreVT.getSimpleVT();
+ unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+ if (SimpleVT.isVector()) {
+ unsigned num = SimpleVT.getVectorNumElements();
+ if (num == 2)
+ vecType = NVPTX::PTXLdStInstCode::V2;
+ else if (num == 4)
+ vecType = NVPTX::PTXLdStInstCode::V4;
+ else
+ return false; // only 2- and 4-element vector memory types are accepted
+ }
+
+ // Type Setting: toType + toTypeWidth
+ // - for integer type, always use 'u'
+ //
+ MVT ScalarVT = SimpleVT.getScalarType();
+ unsigned toTypeWidth = ScalarVT.getSizeInBits();
+ unsigned int toType;
+ if (ScalarVT.isFloatingPoint())
+ toType = NVPTX::PTXLdStInstCode::Float;
+ else
+ toType = NVPTX::PTXLdStInstCode::Unsigned;
+
+ // Create the machine instruction DAG
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1); // first machine operand below; its value type (SourceVT) picks the opcode
+ SDValue N2 = N->getOperand(2); // address operand, fed to the address matchers below
+ SDValue Addr;
+ SDValue Offset, Base;
+ unsigned Opcode;
+ MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
+
+ if (SelectDirectAddr(N2, Addr)) { // direct address form: ST_*_avar, address operand is Addr
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_avar;
+ break;
+ default:
+ return false;
+ }
+ SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
+ getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
+ Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) // ADDRsi form: ST_*_asi, same opcodes for 32- and 64-bit targets
+ : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_asi;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_asi;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_asi;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_asi;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_asi;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_asi;
+ break;
+ default:
+ return false;
+ }
+ SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
+ getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
+ Offset, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset) // ADDRri form: ST_*_ari_64 on 64-bit targets, ST_*_ari otherwise
+ : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ if (TM.is64Bit()) {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_ari_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_ari_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_ari_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_ari_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_ari_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_ari_64;
+ break;
+ default:
+ return false;
+ }
+ } else {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_ari;
+ break;
+ default:
+ return false;
+ }
+ }
+ SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
+ getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
+ Offset, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ } else { // no matcher succeeded: N2 itself is passed as the address (ST_*_areg_64 / ST_*_areg)
+ if (TM.is64Bit()) {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_areg_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_areg_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_areg_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_areg_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_areg_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_areg_64;
+ break;
+ default:
+ return false;
+ }
+ } else {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_areg;
+ break;
+ default:
+ return false;
+ }
+ }
+ SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
+ getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
+ Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ }
+
+ if (!NVPTXST) // NOTE(review): every branch above either returns or assigns NVPTXST, so this looks purely defensive
+ return false;
+
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); // carry N's single memory operand over to the new node
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+ ReplaceNode(N, NVPTXST);
+ return true;
+}
+
+bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { // Selects an NVPTX STV_* machine node for a StoreV2/StoreV4 node N and replaces N with it; returns false when no opcode matches.
+ SDValue Chain = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue Addr, Offset, Base;
+ unsigned Opcode;
+ SDLoc DL(N);
+ SDNode *ST;
+ EVT EltVT = Op1.getValueType(); // type of operand 1; drives every opcode switch below
+ MemSDNode *MemSD = cast<MemSDNode>(N);
+ EVT StoreVT = MemSD->getMemoryVT();
+
+ // Address Space Setting
+ unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
+
+ if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) { // stores into the constant space are rejected with a fatal error
+ report_fatal_error("Cannot store to pointer that points to constant "
+ "memory space");
+ }
+
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared; the check below also leaves it set for the generic space
+ bool IsVolatile = MemSD->isVolatile();
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ IsVolatile = false;
+
+ // Type Setting: toType + toTypeWidth
+ // - for integer type, always use 'u'
+ assert(StoreVT.isSimple() && "Store value is not simple");
+ MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
+ unsigned ToTypeWidth = ScalarVT.getSizeInBits();
+ unsigned ToType;
+ if (ScalarVT.isFloatingPoint())
+ ToType = NVPTX::PTXLdStInstCode::Float;
+ else
+ ToType = NVPTX::PTXLdStInstCode::Unsigned;
+
+ SmallVector<SDValue, 12> StOps; // machine operands: elements, five i32 immediates, address operand(s), chain
+ SDValue N2; // address operand of N; its index depends on the element count
+ unsigned VecType;
+
+ switch (N->getOpcode()) {
+ case NVPTXISD::StoreV2: // operands 1-2 are the elements, operand 3 the address
+ VecType = NVPTX::PTXLdStInstCode::V2;
+ StOps.push_back(N->getOperand(1));
+ StOps.push_back(N->getOperand(2));
+ N2 = N->getOperand(3);
+ break;
+ case NVPTXISD::StoreV4: // operands 1-4 are the elements, operand 5 the address
+ VecType = NVPTX::PTXLdStInstCode::V4;
+ StOps.push_back(N->getOperand(1));
+ StOps.push_back(N->getOperand(2));
+ StOps.push_back(N->getOperand(3));
+ StOps.push_back(N->getOperand(4));
+ N2 = N->getOperand(5);
+ break;
+ default:
+ return false;
+ }
+
+ StOps.push_back(getI32Imm(IsVolatile, DL));
+ StOps.push_back(getI32Imm(CodeAddrSpace, DL));
+ StOps.push_back(getI32Imm(VecType, DL));
+ StOps.push_back(getI32Imm(ToType, DL));
+ StOps.push_back(getI32Imm(ToTypeWidth, DL));
+
+ if (SelectDirectAddr(N2, Addr)) { // direct address form: STV_*_avar, address operand is Addr
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_avar;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4: // no i64/f64 cases for V4 here: those element types hit default and return false
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_avar;
+ break;
+ }
+ break;
+ }
+ StOps.push_back(Addr);
+ } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) // ADDRsi form: STV_*_asi, same opcodes for 32- and 64-bit targets
+ : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_asi;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_asi;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_asi;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_asi;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_asi;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_asi;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_asi;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_asi;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_asi;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_asi;
+ break;
+ }
+ break;
+ }
+ StOps.push_back(Base);
+ StOps.push_back(Offset);
+ } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset) // ADDRri form: STV_*_ari_64 on 64-bit targets, STV_*_ari otherwise
+ : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ if (TM.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_ari_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_ari_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_ari_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_ari_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_ari_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_ari_64;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_ari_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_ari_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_ari_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_ari_64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_ari;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_ari;
+ break;
+ }
+ break;
+ }
+ }
+ StOps.push_back(Base);
+ StOps.push_back(Offset);
+ } else { // no matcher succeeded: N2 itself is passed as the address (STV_*_areg_64 / STV_*_areg)
+ if (TM.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_areg_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_areg_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_areg_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_areg_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_areg_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_areg_64;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_areg_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_areg_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_areg_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_areg_64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_areg;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_areg;
+ break;
+ }
+ break;
+ }
+ }
+ StOps.push_back(N2);
+ }
+
+ StOps.push_back(Chain); // the chain is always the last machine operand
+
+ ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, StOps);
+
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); // carry N's single memory operand over to the new node
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+ ReplaceNode(N, ST);
+ return true;
+}
+
+bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { // Selects a LoadParamMem* machine node for LoadParam/LoadParamV2/LoadParamV4 and replaces Node with it; returns false for other opcodes or unsupported memory types.
+ SDValue Chain = Node->getOperand(0);
+ SDValue Offset = Node->getOperand(2); // read as a ConstantSDNode further down
+ SDValue Flag = Node->getOperand(3); // passed through as the last machine operand
+ SDLoc DL(Node);
+ MemSDNode *Mem = cast<MemSDNode>(Node);
+
+ unsigned VecSize; // number of loaded elements: 1, 2 or 4 depending on the opcode
+ switch (Node->getOpcode()) {
+ default:
+ return false;
+ case NVPTXISD::LoadParam:
+ VecSize = 1;
+ break;
+ case NVPTXISD::LoadParamV2:
+ VecSize = 2;
+ break;
+ case NVPTXISD::LoadParamV4:
+ VecSize = 4;
+ break;
+ }
+
+ EVT EltVT = Node->getValueType(0); // result type, used for the result list
+ EVT MemVT = Mem->getMemoryVT(); // memory type, used to pick the opcode
+
+ unsigned Opc = 0;
+
+ switch (VecSize) {
+ default:
+ return false;
+ case 1:
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i1: // i1 shares the I8 opcode
+ Opc = NVPTX::LoadParamMemI8;
+ break;
+ case MVT::i8:
+ Opc = NVPTX::LoadParamMemI8;
+ break;
+ case MVT::i16:
+ Opc = NVPTX::LoadParamMemI16;
+ break;
+ case MVT::i32:
+ Opc = NVPTX::LoadParamMemI32;
+ break;
+ case MVT::i64:
+ Opc = NVPTX::LoadParamMemI64;
+ break;
+ case MVT::f32:
+ Opc = NVPTX::LoadParamMemF32;
+ break;
+ case MVT::f64:
+ Opc = NVPTX::LoadParamMemF64;
+ break;
+ }
+ break;
+ case 2:
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i1: // i1 shares the V2I8 opcode
+ Opc = NVPTX::LoadParamMemV2I8;
+ break;
+ case MVT::i8:
+ Opc = NVPTX::LoadParamMemV2I8;
+ break;
+ case MVT::i16:
+ Opc = NVPTX::LoadParamMemV2I16;
+ break;
+ case MVT::i32:
+ Opc = NVPTX::LoadParamMemV2I32;
+ break;
+ case MVT::i64:
+ Opc = NVPTX::LoadParamMemV2I64;
+ break;
+ case MVT::f32:
+ Opc = NVPTX::LoadParamMemV2F32;
+ break;
+ case MVT::f64:
+ Opc = NVPTX::LoadParamMemV2F64;
+ break;
+ }
+ break;
+ case 4: // no i64/f64 cases for four elements: those types hit default and return false
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i1: // i1 shares the V4I8 opcode
+ Opc = NVPTX::LoadParamMemV4I8;
+ break;
+ case MVT::i8:
+ Opc = NVPTX::LoadParamMemV4I8;
+ break;
+ case MVT::i16:
+ Opc = NVPTX::LoadParamMemV4I16;
+ break;
+ case MVT::i32:
+ Opc = NVPTX::LoadParamMemV4I32;
+ break;
+ case MVT::f32:
+ Opc = NVPTX::LoadParamMemV4F32;
+ break;
+ }
+ break;
+ }
+
+ SDVTList VTs; // one EltVT result per element, followed by MVT::Other and MVT::Glue
+ if (VecSize == 1) {
+ VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
+ } else if (VecSize == 2) {
+ VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
+ } else {
+ EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
+ VTs = CurDAG->getVTList(EVTs);
+ }
+
+ unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+
+ SmallVector<SDValue, 2> Ops; // NOTE(review): three values are pushed below, one more than the size argument of 2
+ Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
+ Ops.push_back(Chain);
+ Ops.push_back(Flag);
+
+ ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VTs, Ops));
+ return true;
+}
+
+bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { // Selects a StoreRetval* machine node for StoreRetval/StoreRetvalV2/StoreRetvalV4 and replaces N with it; returns false for other opcodes or unsupported memory types.
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Offset = N->getOperand(1); // must be a ConstantSDNode: it is cast<> on the next line
+ unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+ MemSDNode *Mem = cast<MemSDNode>(N);
+
+ // How many elements do we have?
+ unsigned NumElts = 1;
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case NVPTXISD::StoreRetval:
+ NumElts = 1;
+ break;
+ case NVPTXISD::StoreRetvalV2:
+ NumElts = 2;
+ break;
+ case NVPTXISD::StoreRetvalV4:
+ NumElts = 4;
+ break;
+ }
+
+ // Build vector of operands: the stored values (operands 2 .. NumElts + 1 of N), then the offset constant, then the chain
+ SmallVector<SDValue, 6> Ops;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Ops.push_back(N->getOperand(i + 2));
+ Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
+ Ops.push_back(Chain);
+
+ // Determine target opcode
+ // If we have an i1, use an 8-bit store. The lowering code in
+ // NVPTXISelLowering will have already emitted an upcast.
+ unsigned Opcode = 0;
+ switch (NumElts) {
+ default:
+ return false;
+ case 1:
+ switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ Opcode = NVPTX::StoreRetvalI8;
+ break;
+ case MVT::i8:
+ Opcode = NVPTX::StoreRetvalI8;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::StoreRetvalI16;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreRetvalI32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreRetvalI64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::StoreRetvalF32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::StoreRetvalF64;
+ break;
+ }
+ break;
+ case 2:
+ switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ Opcode = NVPTX::StoreRetvalV2I8;
+ break;
+ case MVT::i8:
+ Opcode = NVPTX::StoreRetvalV2I8;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::StoreRetvalV2I16;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreRetvalV2I32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreRetvalV2I64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::StoreRetvalV2F32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::StoreRetvalV2F64;
+ break;
+ }
+ break;
+ case 4: // no i64/f64 cases for four elements: those types hit default and return false
+ switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ Opcode = NVPTX::StoreRetvalV4I8;
+ break;
+ case MVT::i8:
+ Opcode = NVPTX::StoreRetvalV4I8;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::StoreRetvalV4I16;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreRetvalV4I32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::StoreRetvalV4F32;
+ break;
+ }
+ break;
+ }
+
+ SDNode *Ret =
+ CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); // carry N's single memory operand over to the new node
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+ ReplaceNode(N, Ret);
+ return true;
+}
+
+/// Select the machine StoreParam instruction for a StoreParam, StoreParamV2,
+/// StoreParamV4, StoreParamU32 or StoreParamS32 node.
+///
+/// Operands as read below: 0 is the chain, 1 is a constant parameter value,
+/// 2 is a constant offset, the stored values start at operand 3, and the
+/// last operand is used as the glue input. Returns false without replacing
+/// \p N when the node opcode or the memory type is not handled here; returns
+/// true after \p N has been replaced by the selected machine node.
+bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  unsigned ParamVal =
+      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  unsigned OffsetVal =
+      cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  MemSDNode *Mem = cast<MemSDNode>(N);
+  SDValue Flag = N->getOperand(N->getNumOperands() - 1);
+
+  // The node opcode fixes the number of stored elements.
+  const unsigned NodeOpc = N->getOpcode();
+  unsigned NumElts = 1;
+  switch (NodeOpc) {
+  case NVPTXISD::StoreParamU32:
+  case NVPTXISD::StoreParamS32:
+  case NVPTXISD::StoreParam:
+    NumElts = 1;
+    break;
+  case NVPTXISD::StoreParamV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::StoreParamV4:
+    NumElts = 4;
+    break;
+  default:
+    return false;
+  }
+
+  // Machine operands: the stored values, the parameter and offset
+  // immediates, then chain and glue.
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned Idx = 3, End = NumElts + 3; Idx != End; ++Idx)
+    Ops.push_back(N->getOperand(Idx));
+  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
+  Ops.push_back(Chain);
+  Ops.push_back(Flag);
+
+  unsigned Opcode = 0;
+  if (NodeOpc == NVPTXISD::StoreParamU32 ||
+      NodeOpc == NVPTXISD::StoreParamS32) {
+    // Special case: for a sign-extend/zero-extend node, insert the
+    // conversion instruction first and use its result as the value operand
+    // of the selected StoreParam node.
+    Opcode = NVPTX::StoreParamI32;
+    const unsigned CvtOpc = NodeOpc == NVPTXISD::StoreParamU32
+                                ? NVPTX::CVT_u32_u16
+                                : NVPTX::CVT_s32_s16;
+    SDValue CvtNone =
+        CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
+    SDNode *Cvt =
+        CurDAG->getMachineNode(CvtOpc, DL, MVT::i32, Ops[0], CvtNone);
+    Ops[0] = SDValue(Cvt, 0);
+  } else {
+    // Pick the opcode from (element count, memory type). An i1 uses the
+    // 8-bit store; the lowering code in NVPTXISelLowering will have already
+    // emitted an upcast.
+    const MVT::SimpleValueType MemTy =
+        Mem->getMemoryVT().getSimpleVT().SimpleTy;
+    switch (NumElts) {
+    default:
+      return false;
+    case 1:
+      switch (MemTy) {
+      default:
+        return false;
+      case MVT::i1:
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamI8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamI16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamI32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreParamI64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamF32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::StoreParamF64;
+        break;
+      }
+      break;
+    case 2:
+      switch (MemTy) {
+      default:
+        return false;
+      case MVT::i1:
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamV2I8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamV2I16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamV2I32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreParamV2I64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamV2F32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::StoreParamV2F64;
+        break;
+      }
+      break;
+    case 4:
+      // No i64/f64 entries here: those memory types take the default path
+      // and are rejected for four-element stores.
+      switch (MemTy) {
+      default:
+        return false;
+      case MVT::i1:
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamV4I8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamV4I16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamV4I32;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamV4F32;
+        break;
+      }
+      break;
+    }
+  }
+
+  // The machine node produces a chain and a glue result; carry the original
+  // memory operand over to it.
+  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+  SDNode *Ret = CurDAG->getMachineNode(Opcode, DL, RetVTs, Ops);
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = Mem->getMemOperand();
+  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  ReplaceNode(N, Ret);
+  return true;
+}
+
+/// Map an NVPTXISD texture / tld4 node opcode to its machine opcode.
+///
+/// Returns 0 when \p ISDOpcode is not one of the opcodes listed below; the
+/// caller treats 0 as "not handled".
+static unsigned getTextureMachineOpcode(unsigned ISDOpcode) {
+  switch (ISDOpcode) {
+  default:
+    return 0;
+
+  // Non-unified 1D.
+  case NVPTXISD::Tex1DFloatS32:
+    return NVPTX::TEX_1D_F32_S32;
+  case NVPTXISD::Tex1DFloatFloat:
+    return NVPTX::TEX_1D_F32_F32;
+  case NVPTXISD::Tex1DFloatFloatLevel:
+    return NVPTX::TEX_1D_F32_F32_LEVEL;
+  case NVPTXISD::Tex1DFloatFloatGrad:
+    return NVPTX::TEX_1D_F32_F32_GRAD;
+  case NVPTXISD::Tex1DS32S32:
+    return NVPTX::TEX_1D_S32_S32;
+  case NVPTXISD::Tex1DS32Float:
+    return NVPTX::TEX_1D_S32_F32;
+  case NVPTXISD::Tex1DS32FloatLevel:
+    return NVPTX::TEX_1D_S32_F32_LEVEL;
+  case NVPTXISD::Tex1DS32FloatGrad:
+    return NVPTX::TEX_1D_S32_F32_GRAD;
+  case NVPTXISD::Tex1DU32S32:
+    return NVPTX::TEX_1D_U32_S32;
+  case NVPTXISD::Tex1DU32Float:
+    return NVPTX::TEX_1D_U32_F32;
+  case NVPTXISD::Tex1DU32FloatLevel:
+    return NVPTX::TEX_1D_U32_F32_LEVEL;
+  case NVPTXISD::Tex1DU32FloatGrad:
+    return NVPTX::TEX_1D_U32_F32_GRAD;
+
+  // Non-unified 1D array.
+  case NVPTXISD::Tex1DArrayFloatS32:
+    return NVPTX::TEX_1D_ARRAY_F32_S32;
+  case NVPTXISD::Tex1DArrayFloatFloat:
+    return NVPTX::TEX_1D_ARRAY_F32_F32;
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+    return NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+    return NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
+  case NVPTXISD::Tex1DArrayS32S32:
+    return NVPTX::TEX_1D_ARRAY_S32_S32;
+  case NVPTXISD::Tex1DArrayS32Float:
+    return NVPTX::TEX_1D_ARRAY_S32_F32;
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+    return NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+    return NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
+  case NVPTXISD::Tex1DArrayU32S32:
+    return NVPTX::TEX_1D_ARRAY_U32_S32;
+  case NVPTXISD::Tex1DArrayU32Float:
+    return NVPTX::TEX_1D_ARRAY_U32_F32;
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+    return NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+    return NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
+
+  // Non-unified 2D.
+  case NVPTXISD::Tex2DFloatS32:
+    return NVPTX::TEX_2D_F32_S32;
+  case NVPTXISD::Tex2DFloatFloat:
+    return NVPTX::TEX_2D_F32_F32;
+  case NVPTXISD::Tex2DFloatFloatLevel:
+    return NVPTX::TEX_2D_F32_F32_LEVEL;
+  case NVPTXISD::Tex2DFloatFloatGrad:
+    return NVPTX::TEX_2D_F32_F32_GRAD;
+  case NVPTXISD::Tex2DS32S32:
+    return NVPTX::TEX_2D_S32_S32;
+  case NVPTXISD::Tex2DS32Float:
+    return NVPTX::TEX_2D_S32_F32;
+  case NVPTXISD::Tex2DS32FloatLevel:
+    return NVPTX::TEX_2D_S32_F32_LEVEL;
+  case NVPTXISD::Tex2DS32FloatGrad:
+    return NVPTX::TEX_2D_S32_F32_GRAD;
+  case NVPTXISD::Tex2DU32S32:
+    return NVPTX::TEX_2D_U32_S32;
+  case NVPTXISD::Tex2DU32Float:
+    return NVPTX::TEX_2D_U32_F32;
+  case NVPTXISD::Tex2DU32FloatLevel:
+    return NVPTX::TEX_2D_U32_F32_LEVEL;
+  case NVPTXISD::Tex2DU32FloatGrad:
+    return NVPTX::TEX_2D_U32_F32_GRAD;
+
+  // Non-unified 2D array.
+  case NVPTXISD::Tex2DArrayFloatS32:
+    return NVPTX::TEX_2D_ARRAY_F32_S32;
+  case NVPTXISD::Tex2DArrayFloatFloat:
+    return NVPTX::TEX_2D_ARRAY_F32_F32;
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+    return NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+    return NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
+  case NVPTXISD::Tex2DArrayS32S32:
+    return NVPTX::TEX_2D_ARRAY_S32_S32;
+  case NVPTXISD::Tex2DArrayS32Float:
+    return NVPTX::TEX_2D_ARRAY_S32_F32;
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+    return NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+    return NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
+  case NVPTXISD::Tex2DArrayU32S32:
+    return NVPTX::TEX_2D_ARRAY_U32_S32;
+  case NVPTXISD::Tex2DArrayU32Float:
+    return NVPTX::TEX_2D_ARRAY_U32_F32;
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+    return NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+    return NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
+
+  // Non-unified 3D.
+  case NVPTXISD::Tex3DFloatS32:
+    return NVPTX::TEX_3D_F32_S32;
+  case NVPTXISD::Tex3DFloatFloat:
+    return NVPTX::TEX_3D_F32_F32;
+  case NVPTXISD::Tex3DFloatFloatLevel:
+    return NVPTX::TEX_3D_F32_F32_LEVEL;
+  case NVPTXISD::Tex3DFloatFloatGrad:
+    return NVPTX::TEX_3D_F32_F32_GRAD;
+  case NVPTXISD::Tex3DS32S32:
+    return NVPTX::TEX_3D_S32_S32;
+  case NVPTXISD::Tex3DS32Float:
+    return NVPTX::TEX_3D_S32_F32;
+  case NVPTXISD::Tex3DS32FloatLevel:
+    return NVPTX::TEX_3D_S32_F32_LEVEL;
+  case NVPTXISD::Tex3DS32FloatGrad:
+    return NVPTX::TEX_3D_S32_F32_GRAD;
+  case NVPTXISD::Tex3DU32S32:
+    return NVPTX::TEX_3D_U32_S32;
+  case NVPTXISD::Tex3DU32Float:
+    return NVPTX::TEX_3D_U32_F32;
+  case NVPTXISD::Tex3DU32FloatLevel:
+    return NVPTX::TEX_3D_U32_F32_LEVEL;
+  case NVPTXISD::Tex3DU32FloatGrad:
+    return NVPTX::TEX_3D_U32_F32_GRAD;
+
+  // Non-unified cube and cube array.
+  case NVPTXISD::TexCubeFloatFloat:
+    return NVPTX::TEX_CUBE_F32_F32;
+  case NVPTXISD::TexCubeFloatFloatLevel:
+    return NVPTX::TEX_CUBE_F32_F32_LEVEL;
+  case NVPTXISD::TexCubeS32Float:
+    return NVPTX::TEX_CUBE_S32_F32;
+  case NVPTXISD::TexCubeS32FloatLevel:
+    return NVPTX::TEX_CUBE_S32_F32_LEVEL;
+  case NVPTXISD::TexCubeU32Float:
+    return NVPTX::TEX_CUBE_U32_F32;
+  case NVPTXISD::TexCubeU32FloatLevel:
+    return NVPTX::TEX_CUBE_U32_F32_LEVEL;
+  case NVPTXISD::TexCubeArrayFloatFloat:
+    return NVPTX::TEX_CUBE_ARRAY_F32_F32;
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+    return NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
+  case NVPTXISD::TexCubeArrayS32Float:
+    return NVPTX::TEX_CUBE_ARRAY_S32_F32;
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+    return NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
+  case NVPTXISD::TexCubeArrayU32Float:
+    return NVPTX::TEX_CUBE_ARRAY_U32_F32;
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+    return NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
+
+  // Non-unified tld4. Note the S64/U64 node names select the S32/U32
+  // machine opcodes.
+  case NVPTXISD::Tld4R2DFloatFloat:
+    return NVPTX::TLD4_R_2D_F32_F32;
+  case NVPTXISD::Tld4G2DFloatFloat:
+    return NVPTX::TLD4_G_2D_F32_F32;
+  case NVPTXISD::Tld4B2DFloatFloat:
+    return NVPTX::TLD4_B_2D_F32_F32;
+  case NVPTXISD::Tld4A2DFloatFloat:
+    return NVPTX::TLD4_A_2D_F32_F32;
+  case NVPTXISD::Tld4R2DS64Float:
+    return NVPTX::TLD4_R_2D_S32_F32;
+  case NVPTXISD::Tld4G2DS64Float:
+    return NVPTX::TLD4_G_2D_S32_F32;
+  case NVPTXISD::Tld4B2DS64Float:
+    return NVPTX::TLD4_B_2D_S32_F32;
+  case NVPTXISD::Tld4A2DS64Float:
+    return NVPTX::TLD4_A_2D_S32_F32;
+  case NVPTXISD::Tld4R2DU64Float:
+    return NVPTX::TLD4_R_2D_U32_F32;
+  case NVPTXISD::Tld4G2DU64Float:
+    return NVPTX::TLD4_G_2D_U32_F32;
+  case NVPTXISD::Tld4B2DU64Float:
+    return NVPTX::TLD4_B_2D_U32_F32;
+  case NVPTXISD::Tld4A2DU64Float:
+    return NVPTX::TLD4_A_2D_U32_F32;
+
+  // Unified 1D.
+  case NVPTXISD::TexUnified1DFloatS32:
+    return NVPTX::TEX_UNIFIED_1D_F32_S32;
+  case NVPTXISD::TexUnified1DFloatFloat:
+    return NVPTX::TEX_UNIFIED_1D_F32_F32;
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+    return NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+    return NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
+  case NVPTXISD::TexUnified1DS32S32:
+    return NVPTX::TEX_UNIFIED_1D_S32_S32;
+  case NVPTXISD::TexUnified1DS32Float:
+    return NVPTX::TEX_UNIFIED_1D_S32_F32;
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+    return NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+    return NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
+  case NVPTXISD::TexUnified1DU32S32:
+    return NVPTX::TEX_UNIFIED_1D_U32_S32;
+  case NVPTXISD::TexUnified1DU32Float:
+    return NVPTX::TEX_UNIFIED_1D_U32_F32;
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+    return NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+    return NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
+
+  // Unified 1D array.
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
+  case NVPTXISD::TexUnified1DArrayS32S32:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
+  case NVPTXISD::TexUnified1DArrayS32Float:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
+  case NVPTXISD::TexUnified1DArrayU32S32:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
+  case NVPTXISD::TexUnified1DArrayU32Float:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+    return NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
+
+  // Unified 2D.
+  case NVPTXISD::TexUnified2DFloatS32:
+    return NVPTX::TEX_UNIFIED_2D_F32_S32;
+  case NVPTXISD::TexUnified2DFloatFloat:
+    return NVPTX::TEX_UNIFIED_2D_F32_F32;
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+    return NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+    return NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
+  case NVPTXISD::TexUnified2DS32S32:
+    return NVPTX::TEX_UNIFIED_2D_S32_S32;
+  case NVPTXISD::TexUnified2DS32Float:
+    return NVPTX::TEX_UNIFIED_2D_S32_F32;
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+    return NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+    return NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
+  case NVPTXISD::TexUnified2DU32S32:
+    return NVPTX::TEX_UNIFIED_2D_U32_S32;
+  case NVPTXISD::TexUnified2DU32Float:
+    return NVPTX::TEX_UNIFIED_2D_U32_F32;
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+    return NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+    return NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
+
+  // Unified 2D array.
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
+  case NVPTXISD::TexUnified2DArrayS32S32:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
+  case NVPTXISD::TexUnified2DArrayS32Float:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
+  case NVPTXISD::TexUnified2DArrayU32S32:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
+  case NVPTXISD::TexUnified2DArrayU32Float:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+    return NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
+
+  // Unified 3D.
+  case NVPTXISD::TexUnified3DFloatS32:
+    return NVPTX::TEX_UNIFIED_3D_F32_S32;
+  case NVPTXISD::TexUnified3DFloatFloat:
+    return NVPTX::TEX_UNIFIED_3D_F32_F32;
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+    return NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+    return NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
+  case NVPTXISD::TexUnified3DS32S32:
+    return NVPTX::TEX_UNIFIED_3D_S32_S32;
+  case NVPTXISD::TexUnified3DS32Float:
+    return NVPTX::TEX_UNIFIED_3D_S32_F32;
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+    return NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+    return NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
+  case NVPTXISD::TexUnified3DU32S32:
+    return NVPTX::TEX_UNIFIED_3D_U32_S32;
+  case NVPTXISD::TexUnified3DU32Float:
+    return NVPTX::TEX_UNIFIED_3D_U32_F32;
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+    return NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+    return NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
+
+  // Unified cube and cube array.
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+    return NVPTX::TEX_UNIFIED_CUBE_F32_F32;
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+    return NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
+  case NVPTXISD::TexUnifiedCubeS32Float:
+    return NVPTX::TEX_UNIFIED_CUBE_S32_F32;
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+    return NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
+  case NVPTXISD::TexUnifiedCubeU32Float:
+    return NVPTX::TEX_UNIFIED_CUBE_U32_F32;
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+    return NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+    return NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+    return NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+    return NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+    return NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+    return NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+    return NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
+
+  // Unified tld4. As above, S64/U64 node names select the S32/U32 machine
+  // opcodes.
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+    return NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+    return NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+    return NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+    return NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+    return NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+    return NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+    return NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+    return NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+    return NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+    return NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+    return NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
+    return NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
+  }
+}
+
+/// Select the machine instruction for a texture or tld4 node.
+///
+/// The machine node receives operands 1..end of \p N in their original
+/// order, followed by operand 0 (the chain), and reuses the value-type list
+/// of \p N. Returns false without replacing \p N when its opcode has no
+/// entry in the mapping above.
+bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+
+  const unsigned Opc = getTextureMachineOpcode(N->getOpcode());
+  if (Opc == 0)
+    return false;
+
+  // Move the chain from the front of the operand list to the back.
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned Idx = 1; Idx < N->getNumOperands(); ++Idx)
+    Ops.push_back(N->getOperand(Idx));
+  Ops.push_back(Chain);
+
+  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue TexHandle = N->getOperand(1);
+ unsigned Opc = 0;
+ SmallVector<SDValue, 8> Ops;
+ switch (N->getOpcode()) {
+ default: return false;
+ case NVPTXISD::Suld1DI8Clamp:
+ Opc = NVPTX::SULD_1D_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI16Clamp:
+ Opc = NVPTX::SULD_1D_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI32Clamp:
+ Opc = NVPTX::SULD_1D_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI64Clamp:
+ Opc = NVPTX::SULD_1D_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I8Clamp:
+ Opc = NVPTX::SULD_1D_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I16Clamp:
+ Opc = NVPTX::SULD_1D_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I32Clamp:
+ Opc = NVPTX::SULD_1D_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I64Clamp:
+ Opc = NVPTX::SULD_1D_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I8Clamp:
+ Opc = NVPTX::SULD_1D_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I16Clamp:
+ Opc = NVPTX::SULD_1D_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I32Clamp:
+ Opc = NVPTX::SULD_1D_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI8Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI16Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI32Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI64Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I8Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I16Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I32Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I64Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I8Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I16Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I32Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI8Clamp:
+ Opc = NVPTX::SULD_2D_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI16Clamp:
+ Opc = NVPTX::SULD_2D_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI32Clamp:
+ Opc = NVPTX::SULD_2D_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI64Clamp:
+ Opc = NVPTX::SULD_2D_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I8Clamp:
+ Opc = NVPTX::SULD_2D_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I16Clamp:
+ Opc = NVPTX::SULD_2D_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I32Clamp:
+ Opc = NVPTX::SULD_2D_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I64Clamp:
+ Opc = NVPTX::SULD_2D_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I8Clamp:
+ Opc = NVPTX::SULD_2D_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I16Clamp:
+ Opc = NVPTX::SULD_2D_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I32Clamp:
+ Opc = NVPTX::SULD_2D_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI8Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI16Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI32Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI64Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I8Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I16Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I32Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I64Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I8Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I16Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I32Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI8Clamp:
+ Opc = NVPTX::SULD_3D_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI16Clamp:
+ Opc = NVPTX::SULD_3D_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI32Clamp:
+ Opc = NVPTX::SULD_3D_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI64Clamp:
+ Opc = NVPTX::SULD_3D_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I8Clamp:
+ Opc = NVPTX::SULD_3D_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I16Clamp:
+ Opc = NVPTX::SULD_3D_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I32Clamp:
+ Opc = NVPTX::SULD_3D_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I64Clamp:
+ Opc = NVPTX::SULD_3D_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I8Clamp:
+ Opc = NVPTX::SULD_3D_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I16Clamp:
+ Opc = NVPTX::SULD_3D_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I32Clamp:
+ Opc = NVPTX::SULD_3D_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI8Trap:
+ Opc = NVPTX::SULD_1D_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI16Trap:
+ Opc = NVPTX::SULD_1D_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI32Trap:
+ Opc = NVPTX::SULD_1D_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI64Trap:
+ Opc = NVPTX::SULD_1D_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I8Trap:
+ Opc = NVPTX::SULD_1D_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I16Trap:
+ Opc = NVPTX::SULD_1D_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I32Trap:
+ Opc = NVPTX::SULD_1D_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I64Trap:
+ Opc = NVPTX::SULD_1D_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I8Trap:
+ Opc = NVPTX::SULD_1D_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I16Trap:
+ Opc = NVPTX::SULD_1D_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I32Trap:
+ Opc = NVPTX::SULD_1D_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI8Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI16Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI32Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI64Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I8Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I16Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I32Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I64Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I8Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I16Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I32Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI8Trap:
+ Opc = NVPTX::SULD_2D_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI16Trap:
+ Opc = NVPTX::SULD_2D_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI32Trap:
+ Opc = NVPTX::SULD_2D_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI64Trap:
+ Opc = NVPTX::SULD_2D_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I8Trap:
+ Opc = NVPTX::SULD_2D_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I16Trap:
+ Opc = NVPTX::SULD_2D_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I32Trap:
+ Opc = NVPTX::SULD_2D_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I64Trap:
+ Opc = NVPTX::SULD_2D_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I8Trap:
+ Opc = NVPTX::SULD_2D_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I16Trap:
+ Opc = NVPTX::SULD_2D_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I32Trap:
+ Opc = NVPTX::SULD_2D_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI8Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI16Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI32Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI64Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I8Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I16Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I32Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I64Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I8Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I16Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I32Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI8Trap:
+ Opc = NVPTX::SULD_3D_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI16Trap:
+ Opc = NVPTX::SULD_3D_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI32Trap:
+ Opc = NVPTX::SULD_3D_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI64Trap:
+ Opc = NVPTX::SULD_3D_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I8Trap:
+ Opc = NVPTX::SULD_3D_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I16Trap:
+ Opc = NVPTX::SULD_3D_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I32Trap:
+ Opc = NVPTX::SULD_3D_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I64Trap:
+ Opc = NVPTX::SULD_3D_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I8Trap:
+ Opc = NVPTX::SULD_3D_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I16Trap:
+ Opc = NVPTX::SULD_3D_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I32Trap:
+ Opc = NVPTX::SULD_3D_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI8Zero:
+ Opc = NVPTX::SULD_1D_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI16Zero:
+ Opc = NVPTX::SULD_1D_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI32Zero:
+ Opc = NVPTX::SULD_1D_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI64Zero:
+ Opc = NVPTX::SULD_1D_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I8Zero:
+ Opc = NVPTX::SULD_1D_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I16Zero:
+ Opc = NVPTX::SULD_1D_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I32Zero:
+ Opc = NVPTX::SULD_1D_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I64Zero:
+ Opc = NVPTX::SULD_1D_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I8Zero:
+ Opc = NVPTX::SULD_1D_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I16Zero:
+ Opc = NVPTX::SULD_1D_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I32Zero:
+ Opc = NVPTX::SULD_1D_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI8Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI16Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI32Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI64Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I8Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I16Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I32Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I64Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I8Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I16Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I32Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI8Zero:
+ Opc = NVPTX::SULD_2D_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI16Zero:
+ Opc = NVPTX::SULD_2D_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI32Zero:
+ Opc = NVPTX::SULD_2D_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI64Zero:
+ Opc = NVPTX::SULD_2D_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I8Zero:
+ Opc = NVPTX::SULD_2D_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I16Zero:
+ Opc = NVPTX::SULD_2D_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I32Zero:
+ Opc = NVPTX::SULD_2D_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I64Zero:
+ Opc = NVPTX::SULD_2D_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I8Zero:
+ Opc = NVPTX::SULD_2D_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I16Zero:
+ Opc = NVPTX::SULD_2D_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I32Zero:
+ Opc = NVPTX::SULD_2D_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI8Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI16Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI32Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI64Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I8Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I16Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I32Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I64Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I8Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I16Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I32Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI8Zero:
+ Opc = NVPTX::SULD_3D_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI16Zero:
+ Opc = NVPTX::SULD_3D_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI32Zero:
+ Opc = NVPTX::SULD_3D_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI64Zero:
+ Opc = NVPTX::SULD_3D_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I8Zero:
+ Opc = NVPTX::SULD_3D_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I16Zero:
+ Opc = NVPTX::SULD_3D_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I32Zero:
+ Opc = NVPTX::SULD_3D_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I64Zero:
+ Opc = NVPTX::SULD_3D_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I8Zero:
+ Opc = NVPTX::SULD_3D_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I16Zero:
+ Opc = NVPTX::SULD_3D_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I32Zero:
+ Opc = NVPTX::SULD_3D_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ }
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
+ return true;
+}
+
+
+ /// tryBFE - Look for instruction sequences that can be made more efficient
+ /// by using the 'bfe' (bit-field extract) PTX instruction.
+ ///
+ /// Recognized shapes (every mask and shift amount must be a constant):
+ ///   (and (srl|sra val, start), mask)    mask is a low-bit mask
+ ///   (srl|sra (and val, mask), amt)      mask is a low-bit or shifted mask
+ ///   (srl|sra (shl val, inner), outer)   with outer >= inner
+ ///
+ /// Only i32 and i64 values are handled. The signed BFE variant is used only
+ /// for the (sra (shl ...)) shape; every other shape uses the unsigned one.
+ ///
+ /// \returns true if \p N was replaced by a BFE machine node, false if \p N
+ /// was not replaced.
+ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
+   SDLoc DL(N);
+   SDValue LHS = N->getOperand(0);
+   SDValue RHS = N->getOperand(1);
+   SDValue Len;
+   SDValue Start;
+   SDValue Val;
+   bool IsSigned = false;
+
+   if (N->getOpcode() == ISD::AND) {
+     // Canonicalize the operands
+     // We want 'and %val, %mask'
+     if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
+       std::swap(LHS, RHS);
+     }
+
+     ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
+     if (!Mask) {
+       // We need a constant mask on the RHS of the AND
+       return false;
+     }
+
+     // Extract the mask bits
+     uint64_t MaskVal = Mask->getZExtValue();
+     if (!isMask_64(MaskVal)) {
+       // We *could* handle shifted masks here, but doing so would require an
+       // 'and' operation to fix up the low-order bits so we would trade
+       // shr+and for bfe+and, which has the same throughput
+       return false;
+     }
+
+     // How many bits are in our mask? Kept signed so the comparison against
+     // GoodBits below cannot be defeated by unsigned wrap-around.
+     int64_t NumBits = countTrailingOnes(MaskVal);
+     Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
+
+     if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
+       // We have a 'srl/and' pair, extract the effective start bit and length
+       Val = LHS.getNode()->getOperand(0);
+       Start = LHS.getNode()->getOperand(1);
+       ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
+       if (StartConst) {
+         uint64_t StartVal = StartConst->getZExtValue();
+         // How many "good" bits do we have left? "good" is defined here as bits
+         // that exist in the original value, not shifted in.
+         //
+         // This must be measured against the width of the value being
+         // shifted, not the width of the shift-amount operand, and it must be
+         // computed as a signed quantity: with unsigned arithmetic a shift
+         // amount larger than the width used here wraps around to a huge
+         // number and the check below silently passes, which lets an 'sra'
+         // whose sign bits are still demanded be turned into an unsigned bfe.
+         int64_t GoodBits = static_cast<int64_t>(Val.getValueSizeInBits()) -
+                            static_cast<int64_t>(StartVal);
+         if (NumBits > GoodBits) {
+           // Do not handle the case where bits have been shifted in. In theory
+           // we could handle this, but the cost is likely higher than just
+           // emitting the srl/and pair.
+           return false;
+         }
+         Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
+       } else {
+         // Do not handle the case where the shift amount (can be zero if no srl
+         // was found) is not constant. We could handle this case, but it would
+         // require run-time logic that would be more expensive than just
+         // emitting the srl/and pair.
+         return false;
+       }
+     } else {
+       // Do not handle the case where the LHS of the and is not a shift. While
+       // it would be trivial to handle this case, it would just transform
+       // 'and' -> 'bfe', but 'and' has higher-throughput.
+       return false;
+     }
+   } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
+     if (LHS->getOpcode() == ISD::AND) {
+       ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
+       if (!ShiftCnst) {
+         // Shift amount must be constant
+         return false;
+       }
+
+       uint64_t ShiftAmt = ShiftCnst->getZExtValue();
+
+       SDValue AndLHS = LHS->getOperand(0);
+       SDValue AndRHS = LHS->getOperand(1);
+
+       // Canonicalize the AND to have the mask on the RHS
+       if (isa<ConstantSDNode>(AndLHS)) {
+         std::swap(AndLHS, AndRHS);
+       }
+
+       ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
+       if (!MaskCnst) {
+         // Mask must be constant
+         return false;
+       }
+
+       uint64_t MaskVal = MaskCnst->getZExtValue();
+       uint64_t NumZeros;
+       uint64_t NumBits;
+       if (isMask_64(MaskVal)) {
+         NumZeros = 0;
+         // The number of bits in the result bitfield will be the number of
+         // trailing ones (the AND) minus the number of bits we shift off
+         NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
+       } else if (isShiftedMask_64(MaskVal)) {
+         NumZeros = countTrailingZeros(MaskVal);
+         unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
+         // The number of bits in the result bitfield will be the number of
+         // trailing zeros plus the number of set bits in the mask minus the
+         // number of bits we shift off
+         NumBits = NumZeros + NumOnes - ShiftAmt;
+       } else {
+         // This is not a mask we can handle
+         return false;
+       }
+
+       if (ShiftAmt < NumZeros) {
+         // Handling this case would require extra logic that would make this
+         // transformation non-profitable
+         return false;
+       }
+
+       Val = AndLHS;
+       Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
+       Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
+     } else if (LHS->getOpcode() == ISD::SHL) {
+       // Here, we have a pattern like:
+       //
+       // (sra (shl val, NN), MM)
+       // or
+       // (srl (shl val, NN), MM)
+       //
+       // If MM >= NN, we can efficiently optimize this with bfe
+       Val = LHS->getOperand(0);
+
+       SDValue ShlRHS = LHS->getOperand(1);
+       ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
+       if (!ShlCnst) {
+         // Shift amount must be constant
+         return false;
+       }
+       uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
+
+       SDValue ShrRHS = RHS;
+       ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
+       if (!ShrCnst) {
+         // Shift amount must be constant
+         return false;
+       }
+       uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
+
+       // To avoid extra codegen and be profitable, we need Outer >= Inner
+       if (OuterShiftAmt < InnerShiftAmt) {
+         return false;
+       }
+
+       // If the outer shift is more than the type size, we have no bitfield to
+       // extract (since we also check that the inner shift is <= the outer shift
+       // then this also implies that the inner shift is < the type size)
+       if (OuterShiftAmt >= Val.getValueSizeInBits()) {
+         return false;
+       }
+
+       Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
+                                         MVT::i32);
+       Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
+                                       DL, MVT::i32);
+
+       if (N->getOpcode() == ISD::SRA) {
+         // If we have a arithmetic right shift, we need to use the signed bfe
+         // variant
+         IsSigned = true;
+       }
+     } else {
+       // No can do...
+       return false;
+     }
+   } else {
+     // No can do...
+     return false;
+   }
+
+
+   unsigned Opc;
+   // Pick the opcode from the width of the extracted value. The signed
+   // variants are used only when IsSigned was set above (the sra-of-shl
+   // shape); all other shapes use the unsigned variants.
+   if (Val.getValueType() == MVT::i32) {
+     if (IsSigned) {
+       Opc = NVPTX::BFE_S32rii;
+     } else {
+       Opc = NVPTX::BFE_U32rii;
+     }
+   } else if (Val.getValueType() == MVT::i64) {
+     if (IsSigned) {
+       Opc = NVPTX::BFE_S64rii;
+     } else {
+       Opc = NVPTX::BFE_U64rii;
+     }
+   } else {
+     // We cannot handle this type
+     return false;
+   }
+
+   SDValue Ops[] = {
+     Val, Start, Len
+   };
+
+   ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
+   return true;
+ }
+
+ // SelectDirectAddr - Match a direct address for DAG.
+ //
+ // Accepted forms are a target global address, a target external symbol, an
+ // NVPTXISD::Wrapper node (its first operand is returned), and a
+ // generic-to-param addrspacecast of an NVPTXISD::MoveParam, which is looked
+ // through recursively. Returns true on a match; Address is written only when
+ // the match succeeds.
+ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
+   switch (N.getOpcode()) {
+   case ISD::TargetGlobalAddress:
+   case ISD::TargetExternalSymbol:
+     // Already a TGA or ES: hand it back unchanged.
+     Address = N;
+     return true;
+   case NVPTXISD::Wrapper:
+     // Expose the node held by the wrapper.
+     Address = N.getOperand(0);
+     return true;
+   default:
+     break;
+   }
+
+   // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
+   auto *Cast = dyn_cast<AddrSpaceCastSDNode>(N);
+   if (!Cast)
+     return false;
+
+   const bool IsGenericToParam =
+       Cast->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
+       Cast->getDestAddressSpace() == ADDRESS_SPACE_PARAM;
+   if (!IsGenericToParam ||
+       Cast->getOperand(0).getOpcode() != NVPTXISD::MoveParam)
+     return false;
+
+   return SelectDirectAddr(Cast->getOperand(0).getOperand(0), Address);
+ }
+
+ // symbol+offset
+ //
+ // Matches (add <direct address>, <constant>). On success Base receives the
+ // address produced by SelectDirectAddr and Offset receives the constant as a
+ // target constant of type mvt, located at OpNode.
+ bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
+     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+   if (Addr.getOpcode() != ISD::ADD)
+     return false;
+
+   // The right-hand operand has to be a compile-time constant.
+   auto *OffsetNode = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+   if (!OffsetNode)
+     return false;
+
+   // The left-hand operand has to resolve to a direct address.
+   SDValue Symbol = Addr.getOperand(0);
+   if (!SelectDirectAddr(Symbol, Base))
+     return false;
+
+   Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue(),
+                                      SDLoc(OpNode), mvt);
+   return true;
+ }
+
+ // symbol+offset, with the offset emitted as an MVT::i32 target constant.
+ bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
+                                      SDValue &Base, SDValue &Offset) {
+   const MVT OffsetTy = MVT::i32;
+   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, OffsetTy);
+ }
+
+ // symbol+offset, with the offset emitted as an MVT::i64 target constant.
+ bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
+                                        SDValue &Base, SDValue &Offset) {
+   const MVT OffsetTy = MVT::i64;
+   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, OffsetTy);
+ }
+
+ // register+offset
+ //
+ // Matches either a bare frame index (offset 0) or (add <base>, <constant>)
+ // where <base> is not a direct address. A frame-index base is converted to a
+ // target frame index of type mvt; any other base is used as-is. Offset is a
+ // target constant of type mvt located at OpNode.
+ bool NVPTXDAGToDAGISel::SelectADDRri_imp(
+     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+   // A frame index on its own is addressed as frame-index + 0.
+   if (auto *FrameNode = dyn_cast<FrameIndexSDNode>(Addr)) {
+     Base = CurDAG->getTargetFrameIndex(FrameNode->getIndex(), mvt);
+     Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
+     return true;
+   }
+
+   // Only an ADD can match from here on. This also rejects target external
+   // symbols and target global addresses (direct calls).
+   if (Addr.getOpcode() != ISD::ADD)
+     return false;
+
+   // symbol+offset is not a register+offset address; the value matched by
+   // SelectDirectAddr is not needed here.
+   SDValue BaseOperand = Addr.getOperand(0);
+   SDValue Discarded;
+   if (SelectDirectAddr(BaseOperand, Discarded))
+     return false;
+
+   auto *OffsetNode = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+   if (!OffsetNode)
+     return false;
+
+   if (auto *FrameNode = dyn_cast<FrameIndexSDNode>(BaseOperand)) {
+     // Constant offset from frame ref.
+     Base = CurDAG->getTargetFrameIndex(FrameNode->getIndex(), mvt);
+   } else {
+     Base = BaseOperand;
+   }
+   Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue(),
+                                      SDLoc(OpNode), mvt);
+   return true;
+ }
+
+ // register+offset, with base and offset built using MVT::i32.
+ bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
+                                      SDValue &Base, SDValue &Offset) {
+   const MVT AddrTy = MVT::i32;
+   return SelectADDRri_imp(OpNode, Addr, Base, Offset, AddrTy);
+ }
+
+ // register+offset, with base and offset built using MVT::i64.
+ bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
+                                        SDValue &Base, SDValue &Offset) {
+   const MVT AddrTy = MVT::i64;
+   return SelectADDRri_imp(OpNode, Addr, Base, Offset, AddrTy);
+ }
+
+ // Returns true if the memory operand of N refers to address space spN.
+ //
+ // N must be a MemSDNode, otherwise the answer is false. A memory operand
+ // that carries a pseudo source value is accepted when spN is 0. Otherwise
+ // the IR value attached to the memory operand must have a pointer type whose
+ // address space equals spN.
+ bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
+                                                  unsigned int spN) const {
+   auto *MemNode = dyn_cast<MemSDNode>(N);
+   if (!MemNode)
+     return false;
+
+   if (spN == 0 && MemNode->getMemOperand()->getPseudoValue())
+     return true;
+
+   const Value *Src = MemNode->getMemOperand()->getValue();
+   if (!Src)
+     return false;
+
+   auto *PtrTy = dyn_cast<PointerType>(Src->getType());
+   return PtrTy && PtrTy->getAddressSpace() == spN;
+ }
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ ///
+ /// Only the 'm' (memory) constraint is handled. On a match, two operands
+ /// (base, offset) are appended to \p OutOps: a direct address is paired with
+ /// an i32 zero offset, otherwise the register+offset form from SelectADDRri
+ /// is used.
+ ///
+ /// \returns false when operands were appended to \p OutOps, true when the
+ /// operand could not be selected.
+ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
+     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+   // Every constraint other than plain memory is left unselected.
+   if (ConstraintID != InlineAsm::Constraint_m)
+     return true;
+
+   SDValue BaseOp, OffsetOp;
+
+   // Direct address: symbol + 0.
+   if (SelectDirectAddr(Op, BaseOp)) {
+     OutOps.push_back(BaseOp);
+     OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+     return false;
+   }
+
+   // Otherwise try register + constant offset.
+   if (SelectADDRri(Op.getNode(), Op, BaseOp, OffsetOp)) {
+     OutOps.push_back(BaseOp);
+     OutOps.push_back(OffsetOp);
+     return false;
+   }
+
+   return true;
+ }
+
+ /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
+ /// conversion from \p SrcTy to \p DestTy.
+ ///
+ /// Both types must be one of i8, i16, i32 or i64 and must differ from each
+ /// other; any other combination reaches llvm_unreachable. An i8 source only
+ /// widens (there is no i8 destination case for it). \p IsSigned selects the
+ /// signed (CVT_s*) rather than the unsigned (CVT_u*) opcode.
+ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
+                                              bool IsSigned) {
+   // Chooses between the signed and unsigned flavour of one conversion.
+   auto Pick = [IsSigned](unsigned SignedOpc, unsigned UnsignedOpc) {
+     return IsSigned ? SignedOpc : UnsignedOpc;
+   };
+
+   switch (SrcTy.SimpleTy) {
+   case MVT::i8:
+     switch (DestTy.SimpleTy) {
+     case MVT::i16:
+       return Pick(NVPTX::CVT_s16_s8, NVPTX::CVT_u16_u8);
+     case MVT::i32:
+       return Pick(NVPTX::CVT_s32_s8, NVPTX::CVT_u32_u8);
+     case MVT::i64:
+       return Pick(NVPTX::CVT_s64_s8, NVPTX::CVT_u64_u8);
+     default:
+       llvm_unreachable("Unhandled dest type");
+     }
+   case MVT::i16:
+     switch (DestTy.SimpleTy) {
+     case MVT::i8:
+       return Pick(NVPTX::CVT_s8_s16, NVPTX::CVT_u8_u16);
+     case MVT::i32:
+       return Pick(NVPTX::CVT_s32_s16, NVPTX::CVT_u32_u16);
+     case MVT::i64:
+       return Pick(NVPTX::CVT_s64_s16, NVPTX::CVT_u64_u16);
+     default:
+       llvm_unreachable("Unhandled dest type");
+     }
+   case MVT::i32:
+     switch (DestTy.SimpleTy) {
+     case MVT::i8:
+       return Pick(NVPTX::CVT_s8_s32, NVPTX::CVT_u8_u32);
+     case MVT::i16:
+       return Pick(NVPTX::CVT_s16_s32, NVPTX::CVT_u16_u32);
+     case MVT::i64:
+       return Pick(NVPTX::CVT_s64_s32, NVPTX::CVT_u64_u32);
+     default:
+       llvm_unreachable("Unhandled dest type");
+     }
+   case MVT::i64:
+     switch (DestTy.SimpleTy) {
+     case MVT::i8:
+       return Pick(NVPTX::CVT_s8_s64, NVPTX::CVT_u8_u64);
+     case MVT::i16:
+       return Pick(NVPTX::CVT_s16_s64, NVPTX::CVT_u16_u64);
+     case MVT::i32:
+       return Pick(NVPTX::CVT_s32_s64, NVPTX::CVT_u32_u64);
+     default:
+       llvm_unreachable("Unhandled dest type");
+     }
+   default:
+     llvm_unreachable("Unhandled source type");
+   }
+ }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
new file mode 100644
index 000000000000..0591035a6aa8
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -0,0 +1,100 @@
+//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXISELDAGTODAG_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXISELDAGTODAG_H
+
+#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+/// SelectionDAG-based instruction selector pass for the NVPTX target.
+class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
+ const NVPTXTargetMachine &TM;
+
+ // If true, generate mul.wide from sext and mul.
+ bool doMulWide;
+ // Policy queries defined out of line; their exact semantics are not visible here.
+ int getDivF32Level() const;
+ bool usePrecSqrtF32() const;
+ bool useF32FTZ() const;
+ bool allowFMA() const;
+
+public:
+ explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+ CodeGenOpt::Level OptLevel);
+
+ // Human-readable name of this pass.
+ StringRef getPassName() const override {
+ return "NVPTX DAG->DAG Pattern Instruction Selection";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ const NVPTXSubtarget *Subtarget;
+ // Inline-asm memory-operand hook: false after appending matched operands to OutOps, true otherwise.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+private:
+// Include the pieces autogenerated from the target description.
+#include "NVPTXGenDAGISel.inc"
+ // Selection entry point followed by per-node-kind helpers, all defined out of line.
+ void Select(SDNode *N) override;
+ bool tryIntrinsicNoChain(SDNode *N);
+ bool tryIntrinsicChain(SDNode *N);
+ void SelectTexSurfHandle(SDNode *N);
+ bool tryLoad(SDNode *N);
+ bool tryLoadVector(SDNode *N);
+ bool tryLDGLDU(SDNode *N);
+ bool tryStore(SDNode *N);
+ bool tryStoreVector(SDNode *N);
+ bool tryLoadParam(SDNode *N);
+ bool tryStoreRetval(SDNode *N);
+ bool tryStoreParam(SDNode *N);
+ void SelectAddrSpaceCast(SDNode *N);
+ bool tryTextureIntrinsic(SDNode *N);
+ bool trySurfaceIntrinsic(SDNode *N);
+ bool tryBFE(SDNode *N);
+ // Convenience wrapper: builds an i32 target constant holding Imm.
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+ }
+
+ // Match direct address complex pattern.
+ bool SelectDirectAddr(SDValue N, SDValue &Address);
+ // SelectADDRri*/SelectADDRsi* helpers; the *_imp forms additionally take an MVT.
+ bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset, MVT mvt);
+ bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+ bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+
+ bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset, MVT mvt);
+ bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+ bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+
+ bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
+ // Returns the CVT_ opcode converting SrcTy to DestTy (signed or unsigned form).
+ static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
new file mode 100644
index 000000000000..2e4764feff11
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -0,0 +1,4639 @@
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelLowering.h"
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-lower"
+
+using namespace llvm;
+
+// File-local counter starting at zero. Its users lie outside this excerpt;
+// presumably it hands out unique ids for call sites -- confirm at the use.
+static unsigned int uniqueCallSite = 0;
+
+// -nvptx-sched4reg: when set, the NVPTXTargetLowering constructor selects
+// Sched::RegPressure instead of the default Sched::Source scheduling.
+static cl::opt<bool> sched4reg(
+    "nvptx-sched4reg",
+    cl::desc("NVPTX Specific: schedule for register pressure"),
+    cl::init(false));
+
+// -nvptx-fma-level: hidden option holding the FMA contraction level
+// (0, 1 or 2, as spelled out in the help text); it defaults to 2.
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+                             " 1: do it 2: do it aggressively)"),
+                    cl::init(2));
+
+/// Returns true when \p VT is one of the vector types enumerated below: the
+/// 2- and 4-element vectors of i1, i8, i16, i32 and f32, plus v2i64 and v2f64.
+static bool IsPTXVectorType(MVT VT) {
+  static const MVT::SimpleValueType PTXVectorTypes[] = {
+      MVT::v2i1,  MVT::v4i1,  MVT::v2i8,  MVT::v4i8,  MVT::v2i16, MVT::v4i16,
+      MVT::v2i32, MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
+
+  for (MVT::SimpleValueType Candidate : PTXVectorTypes)
+    if (VT.SimpleTy == Candidate)
+      return true;
+  return false;
+}
+
+/// Flattened counterpart of ComputeValueVTs for PTX lowering.
+///
+/// Appends to \p ValueVTs the EVTs that make up \p Ty, expanding every vector
+/// EVT into one entry per element. When \p Offsets is non-null, the offset of
+/// each appended entry is recorded as well; the elements of a vector are
+/// placed at consecutive multiples of the element store size.
+/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
+/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
+/// LowerCall, and LowerReturn.
+static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
+                               uint64_t StartingOffset = 0) {
+  SmallVector<EVT, 16> AggregateVTs;
+  SmallVector<uint64_t, 16> AggregateOffsets;
+  ComputeValueVTs(TLI, DL, Ty, AggregateVTs, &AggregateOffsets, StartingOffset);
+
+  const unsigned NumParts = AggregateVTs.size();
+  for (unsigned Part = 0; Part != NumParts; ++Part) {
+    const EVT PartVT = AggregateVTs[Part];
+    const uint64_t PartOffset = AggregateOffsets[Part];
+
+    // Non-vector parts are forwarded unchanged.
+    if (!PartVT.isVector()) {
+      ValueVTs.push_back(PartVT);
+      if (Offsets)
+        Offsets->push_back(PartOffset);
+      continue;
+    }
+
+    // Vector parts contribute one entry per element.
+    const EVT EltVT = PartVT.getVectorElementType();
+    const unsigned NumElts = PartVT.getVectorNumElements();
+    for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
+      ValueVTs.push_back(EltVT);
+      if (Offsets)
+        Offsets->push_back(PartOffset + Elt * EltVT.getStoreSize());
+    }
+  }
+}
+
+// NVPTXTargetLowering constructor: sets up register classes and legalization actions.
+NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+ const NVPTXSubtarget &STI)
+ : TargetLowering(TM), nvTM(&TM), STI(STI) {
+
+ // Always lower memset, memcpy, and memmove intrinsics to load/store
+ // instructions, rather than generating calls to memset, memcpy or memmove:
+ // the store-count limits below are all set to 0xFFFFFFFF.
+ MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
+ MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
+ MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
+
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+ // condition branches.
+ setJumpIsExpensive(true);
+
+ // Wide divides are _very_ slow. Try to reduce the width of the divide if
+ // possible.
+ addBypassSlowDiv(64, 32);
+
+ // By default, use Source scheduling; the sched4reg option selects RegPressure.
+ if (sched4reg)
+ setSchedulingPreference(Sched::RegPressure);
+ else
+ setSchedulingPreference(Sched::Source);
+
+ addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+ addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+ addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+ addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+ addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+ addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+
+ // Operations not directly supported by NVPTX.
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i8, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ // Some SIGN_EXTEND_INREG can be done using cvt instruction.
+ // For others we will expand to a SHL/SRA pair.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
+
+ if (STI.hasROT64()) {
+ setOperationAction(ISD::ROTL, MVT::i64, Legal);
+ setOperationAction(ISD::ROTR, MVT::i64, Legal);
+ } else {
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+ }
+ if (STI.hasROT32()) {
+ setOperationAction(ISD::ROTL, MVT::i32, Legal);
+ setOperationAction(ISD::ROTR, MVT::i32, Legal);
+ } else {
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTR, MVT::i32, Expand);
+ }
+
+ setOperationAction(ISD::ROTL, MVT::i16, Expand);
+ setOperationAction(ISD::ROTR, MVT::i16, Expand);
+ setOperationAction(ISD::ROTL, MVT::i8, Expand);
+ setOperationAction(ISD::ROTR, MVT::i8, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+ // Indirect branch is not supported.
+ // This also disables Jump Table creation.
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+ // We want to legalize constant related memmove and memcpy
+ // intrinsics.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+ // Turn FP extload into load/fpextend
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+ // Turn FP truncstore into trunc + store.
+ // FIXME: vector types should also be expanded
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // PTX does not support load / store predicate registers
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setTruncStoreAction(VT, MVT::i1, Expand);
+ }
+
+ // This is legal in NVPTX
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+
+ // TRAP can be lowered to PTX trap
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ setOperationAction(ISD::ADDC, MVT::i64, Expand);
+ setOperationAction(ISD::ADDE, MVT::i64, Expand);
+
+ // Register custom handling for vector loads/stores
+ for (MVT VT : MVT::vector_valuetypes()) {
+ if (IsPTXVectorType(VT)) {
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+ }
+ }
+
+ // Custom handling for i8 intrinsics
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+ setOperationAction(ISD::CTLZ, MVT::i16, Legal);
+ setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+ setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+ setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i16, Legal);
+ setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+ setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+
+ // PTX does not directly support SELP of i1, so i1 SELECT is custom-lowered (promote to i32 first)
+ setOperationAction(ISD::SELECT, MVT::i1, Custom);
+
+ // PTX cannot multiply two i64s in a single instruction.
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
+ // We have some custom DAG combine patterns for these nodes
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SREM);
+ setTargetDAGCombine(ISD::UREM);
+
+ // Library functions. These default to Expand, but we have instructions
+ // for them.
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ setOperationAction(ISD::FROUND, MVT::f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+
+ // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
+ // No FPOW or FREM in PTX.
+
+ // Now deduce the register properties from the register classes and
+ // actions declared above; this has to come after all of them.
+ computeRegisterProperties(STI.getRegisterInfo());
+}
+
+const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch ((NVPTXISD::NodeType)Opcode) {
+ case NVPTXISD::FIRST_NUMBER:
+ break;
+ case NVPTXISD::CALL:
+ return "NVPTXISD::CALL";
+ case NVPTXISD::RET_FLAG:
+ return "NVPTXISD::RET_FLAG";
+ case NVPTXISD::LOAD_PARAM:
+ return "NVPTXISD::LOAD_PARAM";
+ case NVPTXISD::Wrapper:
+ return "NVPTXISD::Wrapper";
+ case NVPTXISD::DeclareParam:
+ return "NVPTXISD::DeclareParam";
+ case NVPTXISD::DeclareScalarParam:
+ return "NVPTXISD::DeclareScalarParam";
+ case NVPTXISD::DeclareRet:
+ return "NVPTXISD::DeclareRet";
+ case NVPTXISD::DeclareScalarRet:
+ return "NVPTXISD::DeclareScalarRet";
+ case NVPTXISD::DeclareRetParam:
+ return "NVPTXISD::DeclareRetParam";
+ case NVPTXISD::PrintCall:
+ return "NVPTXISD::PrintCall";
+ case NVPTXISD::PrintConvergentCall:
+ return "NVPTXISD::PrintConvergentCall";
+ case NVPTXISD::PrintCallUni:
+ return "NVPTXISD::PrintCallUni";
+ case NVPTXISD::PrintConvergentCallUni:
+ return "NVPTXISD::PrintConvergentCallUni";
+ case NVPTXISD::LoadParam:
+ return "NVPTXISD::LoadParam";
+ case NVPTXISD::LoadParamV2:
+ return "NVPTXISD::LoadParamV2";
+ case NVPTXISD::LoadParamV4:
+ return "NVPTXISD::LoadParamV4";
+ case NVPTXISD::StoreParam:
+ return "NVPTXISD::StoreParam";
+ case NVPTXISD::StoreParamV2:
+ return "NVPTXISD::StoreParamV2";
+ case NVPTXISD::StoreParamV4:
+ return "NVPTXISD::StoreParamV4";
+ case NVPTXISD::StoreParamS32:
+ return "NVPTXISD::StoreParamS32";
+ case NVPTXISD::StoreParamU32:
+ return "NVPTXISD::StoreParamU32";
+ case NVPTXISD::CallArgBegin:
+ return "NVPTXISD::CallArgBegin";
+ case NVPTXISD::CallArg:
+ return "NVPTXISD::CallArg";
+ case NVPTXISD::LastCallArg:
+ return "NVPTXISD::LastCallArg";
+ case NVPTXISD::CallArgEnd:
+ return "NVPTXISD::CallArgEnd";
+ case NVPTXISD::CallVoid:
+ return "NVPTXISD::CallVoid";
+ case NVPTXISD::CallVal:
+ return "NVPTXISD::CallVal";
+ case NVPTXISD::CallSymbol:
+ return "NVPTXISD::CallSymbol";
+ case NVPTXISD::Prototype:
+ return "NVPTXISD::Prototype";
+ case NVPTXISD::MoveParam:
+ return "NVPTXISD::MoveParam";
+ case NVPTXISD::StoreRetval:
+ return "NVPTXISD::StoreRetval";
+ case NVPTXISD::StoreRetvalV2:
+ return "NVPTXISD::StoreRetvalV2";
+ case NVPTXISD::StoreRetvalV4:
+ return "NVPTXISD::StoreRetvalV4";
+ case NVPTXISD::PseudoUseParam:
+ return "NVPTXISD::PseudoUseParam";
+ case NVPTXISD::RETURN:
+ return "NVPTXISD::RETURN";
+ case NVPTXISD::CallSeqBegin:
+ return "NVPTXISD::CallSeqBegin";
+ case NVPTXISD::CallSeqEnd:
+ return "NVPTXISD::CallSeqEnd";
+ case NVPTXISD::CallPrototype:
+ return "NVPTXISD::CallPrototype";
+ case NVPTXISD::LoadV2:
+ return "NVPTXISD::LoadV2";
+ case NVPTXISD::LoadV4:
+ return "NVPTXISD::LoadV4";
+ case NVPTXISD::LDGV2:
+ return "NVPTXISD::LDGV2";
+ case NVPTXISD::LDGV4:
+ return "NVPTXISD::LDGV4";
+ case NVPTXISD::LDUV2:
+ return "NVPTXISD::LDUV2";
+ case NVPTXISD::LDUV4:
+ return "NVPTXISD::LDUV4";
+ case NVPTXISD::StoreV2:
+ return "NVPTXISD::StoreV2";
+ case NVPTXISD::StoreV4:
+ return "NVPTXISD::StoreV4";
+ case NVPTXISD::FUN_SHFL_CLAMP:
+ return "NVPTXISD::FUN_SHFL_CLAMP";
+ case NVPTXISD::FUN_SHFR_CLAMP:
+ return "NVPTXISD::FUN_SHFR_CLAMP";
+ case NVPTXISD::IMAD:
+ return "NVPTXISD::IMAD";
+ case NVPTXISD::Dummy:
+ return "NVPTXISD::Dummy";
+ case NVPTXISD::MUL_WIDE_SIGNED:
+ return "NVPTXISD::MUL_WIDE_SIGNED";
+ case NVPTXISD::MUL_WIDE_UNSIGNED:
+ return "NVPTXISD::MUL_WIDE_UNSIGNED";
+ case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
+ case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
+ case NVPTXISD::Tex1DFloatFloatLevel:
+ return "NVPTXISD::Tex1DFloatFloatLevel";
+ case NVPTXISD::Tex1DFloatFloatGrad:
+ return "NVPTXISD::Tex1DFloatFloatGrad";
+ case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
+ case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
+ case NVPTXISD::Tex1DS32FloatLevel:
+ return "NVPTXISD::Tex1DS32FloatLevel";
+ case NVPTXISD::Tex1DS32FloatGrad:
+ return "NVPTXISD::Tex1DS32FloatGrad";
+ case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
+ case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
+ case NVPTXISD::Tex1DU32FloatLevel:
+ return "NVPTXISD::Tex1DU32FloatLevel";
+ case NVPTXISD::Tex1DU32FloatGrad:
+ return "NVPTXISD::Tex1DU32FloatGrad";
+ case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
+ case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
+ case NVPTXISD::Tex1DArrayFloatFloatLevel:
+ return "NVPTXISD::Tex1DArrayFloatFloatLevel";
+ case NVPTXISD::Tex1DArrayFloatFloatGrad:
+ return "NVPTXISD::Tex1DArrayFloatFloatGrad";
+ case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
+ case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
+ case NVPTXISD::Tex1DArrayS32FloatLevel:
+ return "NVPTXISD::Tex1DArrayS32FloatLevel";
+ case NVPTXISD::Tex1DArrayS32FloatGrad:
+ return "NVPTXISD::Tex1DArrayS32FloatGrad";
+ case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
+ case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
+ case NVPTXISD::Tex1DArrayU32FloatLevel:
+ return "NVPTXISD::Tex1DArrayU32FloatLevel";
+ case NVPTXISD::Tex1DArrayU32FloatGrad:
+ return "NVPTXISD::Tex1DArrayU32FloatGrad";
+ case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
+ case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
+ case NVPTXISD::Tex2DFloatFloatLevel:
+ return "NVPTXISD::Tex2DFloatFloatLevel";
+ case NVPTXISD::Tex2DFloatFloatGrad:
+ return "NVPTXISD::Tex2DFloatFloatGrad";
+ case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
+ case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
+ case NVPTXISD::Tex2DS32FloatLevel:
+ return "NVPTXISD::Tex2DS32FloatLevel";
+ case NVPTXISD::Tex2DS32FloatGrad:
+ return "NVPTXISD::Tex2DS32FloatGrad";
+ case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
+ case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
+ case NVPTXISD::Tex2DU32FloatLevel:
+ return "NVPTXISD::Tex2DU32FloatLevel";
+ case NVPTXISD::Tex2DU32FloatGrad:
+ return "NVPTXISD::Tex2DU32FloatGrad";
+ case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
+ case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+ case NVPTXISD::Tex2DArrayFloatFloatLevel:
+ return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+ case NVPTXISD::Tex2DArrayFloatFloatGrad:
+ return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+ case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
+ case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
+ case NVPTXISD::Tex2DArrayS32FloatLevel:
+ return "NVPTXISD::Tex2DArrayS32FloatLevel";
+ case NVPTXISD::Tex2DArrayS32FloatGrad:
+ return "NVPTXISD::Tex2DArrayS32FloatGrad";
+ case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
+ case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
+ case NVPTXISD::Tex2DArrayU32FloatLevel:
+ return "NVPTXISD::Tex2DArrayU32FloatLevel";
+ case NVPTXISD::Tex2DArrayU32FloatGrad:
+ return "NVPTXISD::Tex2DArrayU32FloatGrad";
+ case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
+ case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
+ case NVPTXISD::Tex3DFloatFloatLevel:
+ return "NVPTXISD::Tex3DFloatFloatLevel";
+ case NVPTXISD::Tex3DFloatFloatGrad:
+ return "NVPTXISD::Tex3DFloatFloatGrad";
+ case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
+ case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
+ case NVPTXISD::Tex3DS32FloatLevel:
+ return "NVPTXISD::Tex3DS32FloatLevel";
+ case NVPTXISD::Tex3DS32FloatGrad:
+ return "NVPTXISD::Tex3DS32FloatGrad";
+ case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
+ case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
+ case NVPTXISD::Tex3DU32FloatLevel:
+ return "NVPTXISD::Tex3DU32FloatLevel";
+ case NVPTXISD::Tex3DU32FloatGrad:
+ return "NVPTXISD::Tex3DU32FloatGrad";
+ case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
+ case NVPTXISD::TexCubeFloatFloatLevel:
+ return "NVPTXISD::TexCubeFloatFloatLevel";
+ case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
+ case NVPTXISD::TexCubeS32FloatLevel:
+ return "NVPTXISD::TexCubeS32FloatLevel";
+ case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
+ case NVPTXISD::TexCubeU32FloatLevel:
+ return "NVPTXISD::TexCubeU32FloatLevel";
+ case NVPTXISD::TexCubeArrayFloatFloat:
+ return "NVPTXISD::TexCubeArrayFloatFloat";
+ case NVPTXISD::TexCubeArrayFloatFloatLevel:
+ return "NVPTXISD::TexCubeArrayFloatFloatLevel";
+ case NVPTXISD::TexCubeArrayS32Float:
+ return "NVPTXISD::TexCubeArrayS32Float";
+ case NVPTXISD::TexCubeArrayS32FloatLevel:
+ return "NVPTXISD::TexCubeArrayS32FloatLevel";
+ case NVPTXISD::TexCubeArrayU32Float:
+ return "NVPTXISD::TexCubeArrayU32Float";
+ case NVPTXISD::TexCubeArrayU32FloatLevel:
+ return "NVPTXISD::TexCubeArrayU32FloatLevel";
+ case NVPTXISD::Tld4R2DFloatFloat:
+ return "NVPTXISD::Tld4R2DFloatFloat";
+ case NVPTXISD::Tld4G2DFloatFloat:
+ return "NVPTXISD::Tld4G2DFloatFloat";
+ case NVPTXISD::Tld4B2DFloatFloat:
+ return "NVPTXISD::Tld4B2DFloatFloat";
+ case NVPTXISD::Tld4A2DFloatFloat:
+ return "NVPTXISD::Tld4A2DFloatFloat";
+ case NVPTXISD::Tld4R2DS64Float:
+ return "NVPTXISD::Tld4R2DS64Float";
+ case NVPTXISD::Tld4G2DS64Float:
+ return "NVPTXISD::Tld4G2DS64Float";
+ case NVPTXISD::Tld4B2DS64Float:
+ return "NVPTXISD::Tld4B2DS64Float";
+ case NVPTXISD::Tld4A2DS64Float:
+ return "NVPTXISD::Tld4A2DS64Float";
+ case NVPTXISD::Tld4R2DU64Float:
+ return "NVPTXISD::Tld4R2DU64Float";
+ case NVPTXISD::Tld4G2DU64Float:
+ return "NVPTXISD::Tld4G2DU64Float";
+ case NVPTXISD::Tld4B2DU64Float:
+ return "NVPTXISD::Tld4B2DU64Float";
+ case NVPTXISD::Tld4A2DU64Float:
+ return "NVPTXISD::Tld4A2DU64Float";
+
+ case NVPTXISD::TexUnified1DFloatS32:
+ return "NVPTXISD::TexUnified1DFloatS32";
+ case NVPTXISD::TexUnified1DFloatFloat:
+ return "NVPTXISD::TexUnified1DFloatFloat";
+ case NVPTXISD::TexUnified1DFloatFloatLevel:
+ return "NVPTXISD::TexUnified1DFloatFloatLevel";
+ case NVPTXISD::TexUnified1DFloatFloatGrad:
+ return "NVPTXISD::TexUnified1DFloatFloatGrad";
+ case NVPTXISD::TexUnified1DS32S32:
+ return "NVPTXISD::TexUnified1DS32S32";
+ case NVPTXISD::TexUnified1DS32Float:
+ return "NVPTXISD::TexUnified1DS32Float";
+ case NVPTXISD::TexUnified1DS32FloatLevel:
+ return "NVPTXISD::TexUnified1DS32FloatLevel";
+ case NVPTXISD::TexUnified1DS32FloatGrad:
+ return "NVPTXISD::TexUnified1DS32FloatGrad";
+ case NVPTXISD::TexUnified1DU32S32:
+ return "NVPTXISD::TexUnified1DU32S32";
+ case NVPTXISD::TexUnified1DU32Float:
+ return "NVPTXISD::TexUnified1DU32Float";
+ case NVPTXISD::TexUnified1DU32FloatLevel:
+ return "NVPTXISD::TexUnified1DU32FloatLevel";
+ case NVPTXISD::TexUnified1DU32FloatGrad:
+ return "NVPTXISD::TexUnified1DU32FloatGrad";
+ case NVPTXISD::TexUnified1DArrayFloatS32:
+ return "NVPTXISD::TexUnified1DArrayFloatS32";
+ case NVPTXISD::TexUnified1DArrayFloatFloat:
+ return "NVPTXISD::TexUnified1DArrayFloatFloat";
+ case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+ return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
+ case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+ return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
+ case NVPTXISD::TexUnified1DArrayS32S32:
+ return "NVPTXISD::TexUnified1DArrayS32S32";
+ case NVPTXISD::TexUnified1DArrayS32Float:
+ return "NVPTXISD::TexUnified1DArrayS32Float";
+ case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+ return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
+ case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+ return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
+ case NVPTXISD::TexUnified1DArrayU32S32:
+ return "NVPTXISD::TexUnified1DArrayU32S32";
+ case NVPTXISD::TexUnified1DArrayU32Float:
+ return "NVPTXISD::TexUnified1DArrayU32Float";
+ case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+ return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
+ case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+ return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
+ case NVPTXISD::TexUnified2DFloatS32:
+ return "NVPTXISD::TexUnified2DFloatS32";
+ case NVPTXISD::TexUnified2DFloatFloat:
+ return "NVPTXISD::TexUnified2DFloatFloat";
+ case NVPTXISD::TexUnified2DFloatFloatLevel:
+ return "NVPTXISD::TexUnified2DFloatFloatLevel";
+ case NVPTXISD::TexUnified2DFloatFloatGrad:
+ return "NVPTXISD::TexUnified2DFloatFloatGrad";
+ case NVPTXISD::TexUnified2DS32S32:
+ return "NVPTXISD::TexUnified2DS32S32";
+ case NVPTXISD::TexUnified2DS32Float:
+ return "NVPTXISD::TexUnified2DS32Float";
+ case NVPTXISD::TexUnified2DS32FloatLevel:
+ return "NVPTXISD::TexUnified2DS32FloatLevel";
+ case NVPTXISD::TexUnified2DS32FloatGrad:
+ return "NVPTXISD::TexUnified2DS32FloatGrad";
+ case NVPTXISD::TexUnified2DU32S32:
+ return "NVPTXISD::TexUnified2DU32S32";
+ case NVPTXISD::TexUnified2DU32Float:
+ return "NVPTXISD::TexUnified2DU32Float";
+ case NVPTXISD::TexUnified2DU32FloatLevel:
+ return "NVPTXISD::TexUnified2DU32FloatLevel";
+ case NVPTXISD::TexUnified2DU32FloatGrad:
+ return "NVPTXISD::TexUnified2DU32FloatGrad";
+ case NVPTXISD::TexUnified2DArrayFloatS32:
+ return "NVPTXISD::TexUnified2DArrayFloatS32";
+ case NVPTXISD::TexUnified2DArrayFloatFloat:
+ return "NVPTXISD::TexUnified2DArrayFloatFloat";
+ case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+ return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
+ case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+ return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
+ case NVPTXISD::TexUnified2DArrayS32S32:
+ return "NVPTXISD::TexUnified2DArrayS32S32";
+ case NVPTXISD::TexUnified2DArrayS32Float:
+ return "NVPTXISD::TexUnified2DArrayS32Float";
+ case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+ return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
+ case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+ return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
+ case NVPTXISD::TexUnified2DArrayU32S32:
+ return "NVPTXISD::TexUnified2DArrayU32S32";
+ case NVPTXISD::TexUnified2DArrayU32Float:
+ return "NVPTXISD::TexUnified2DArrayU32Float";
+ case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+ return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
+ case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+ return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
+ case NVPTXISD::TexUnified3DFloatS32:
+ return "NVPTXISD::TexUnified3DFloatS32";
+ case NVPTXISD::TexUnified3DFloatFloat:
+ return "NVPTXISD::TexUnified3DFloatFloat";
+ case NVPTXISD::TexUnified3DFloatFloatLevel:
+ return "NVPTXISD::TexUnified3DFloatFloatLevel";
+ case NVPTXISD::TexUnified3DFloatFloatGrad:
+ return "NVPTXISD::TexUnified3DFloatFloatGrad";
+ case NVPTXISD::TexUnified3DS32S32:
+ return "NVPTXISD::TexUnified3DS32S32";
+ case NVPTXISD::TexUnified3DS32Float:
+ return "NVPTXISD::TexUnified3DS32Float";
+ case NVPTXISD::TexUnified3DS32FloatLevel:
+ return "NVPTXISD::TexUnified3DS32FloatLevel";
+ case NVPTXISD::TexUnified3DS32FloatGrad:
+ return "NVPTXISD::TexUnified3DS32FloatGrad";
+ case NVPTXISD::TexUnified3DU32S32:
+ return "NVPTXISD::TexUnified3DU32S32";
+ case NVPTXISD::TexUnified3DU32Float:
+ return "NVPTXISD::TexUnified3DU32Float";
+ case NVPTXISD::TexUnified3DU32FloatLevel:
+ return "NVPTXISD::TexUnified3DU32FloatLevel";
+ case NVPTXISD::TexUnified3DU32FloatGrad:
+ return "NVPTXISD::TexUnified3DU32FloatGrad";
+ case NVPTXISD::TexUnifiedCubeFloatFloat:
+ return "NVPTXISD::TexUnifiedCubeFloatFloat";
+ case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+ return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
+ case NVPTXISD::TexUnifiedCubeS32Float:
+ return "NVPTXISD::TexUnifiedCubeS32Float";
+ case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
+ case NVPTXISD::TexUnifiedCubeU32Float:
+ return "NVPTXISD::TexUnifiedCubeU32Float";
+ case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+ return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+ return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
+ case NVPTXISD::TexUnifiedCubeArrayS32Float:
+ return "NVPTXISD::TexUnifiedCubeArrayS32Float";
+ case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
+ case NVPTXISD::TexUnifiedCubeArrayU32Float:
+ return "NVPTXISD::TexUnifiedCubeArrayU32Float";
+ case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
+ case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedR2DS64Float:
+ return "NVPTXISD::Tld4UnifiedR2DS64Float";
+ case NVPTXISD::Tld4UnifiedG2DS64Float:
+ return "NVPTXISD::Tld4UnifiedG2DS64Float";
+ case NVPTXISD::Tld4UnifiedB2DS64Float:
+ return "NVPTXISD::Tld4UnifiedB2DS64Float";
+ case NVPTXISD::Tld4UnifiedA2DS64Float:
+ return "NVPTXISD::Tld4UnifiedA2DS64Float";
+ case NVPTXISD::Tld4UnifiedR2DU64Float:
+ return "NVPTXISD::Tld4UnifiedR2DU64Float";
+ case NVPTXISD::Tld4UnifiedG2DU64Float:
+ return "NVPTXISD::Tld4UnifiedG2DU64Float";
+ case NVPTXISD::Tld4UnifiedB2DU64Float:
+ return "NVPTXISD::Tld4UnifiedB2DU64Float";
+ case NVPTXISD::Tld4UnifiedA2DU64Float:
+ return "NVPTXISD::Tld4UnifiedA2DU64Float";
+
+ case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
+ case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
+ case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
+ case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
+ case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
+ case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
+ case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
+ case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
+ case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
+ case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
+ case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
+
+ case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
+ case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
+ case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
+ case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
+ case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
+ case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
+ case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
+ case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
+ case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
+ case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
+ case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
+
+ case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
+ case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
+ case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
+ case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
+ case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
+ case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
+ case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
+ case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
+ case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
+ case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
+ case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
+
+ case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
+ case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
+ case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
+ case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
+ case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
+ case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
+ case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
+ case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
+ case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
+ case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
+ case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
+
+ case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
+ case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
+ case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
+ case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
+ case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
+ case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
+ case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
+ case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
+ case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
+ case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
+ case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
+
+ case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
+ case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
+ case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
+ case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
+ case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
+ case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
+ case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
+ case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
+ case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
+ case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
+ case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
+
+ case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
+ case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
+ case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
+ case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
+ case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
+ case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
+ case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
+ case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
+ case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
+ case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
+ case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
+
+ case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
+ case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
+ case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
+ case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
+ case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
+ case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
+ case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
+ case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
+ case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
+ case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
+ case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
+
+ case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
+ case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
+ case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
+ case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
+ case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
+ case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
+ case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
+ case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
+ case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
+ case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
+ case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
+
+ case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
+ case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
+ case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
+ case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
+ case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
+ case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
+ case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
+ case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
+ case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
+ case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
+ case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
+
+ case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
+ case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
+ case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
+ case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
+ case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
+ case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
+ case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
+ case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
+ case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
+ case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
+ case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
+
+ case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
+ case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
+ case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
+ case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
+ case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
+ case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
+ case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
+ case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
+ case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
+ case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
+ case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
+
+ case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
+ case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
+ case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
+ case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
+ case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
+ case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
+ case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
+ case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
+ case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
+ case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
+ case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
+
+ case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
+ case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
+ case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
+ case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
+ case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
+ case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
+ case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
+ case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
+ case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
+ case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
+ case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
+
+ case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
+ case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
+ case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
+ case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
+ case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
+ case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
+ case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
+ case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
+ case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
+ case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
+ case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
+ }
+ return nullptr;
+}
+
+// Choose how the type legalizer should treat the vector type VT.
+//
+// Vectors of i1 with an element count other than one are split; every other
+// vector type is handed to the generic TargetLoweringBase implementation.
+TargetLoweringBase::LegalizeTypeAction
+NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
+  const bool SplitI1Vector =
+      VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1;
+
+  return SplitI1Vector ? TypeSplitVector
+                       : TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+// Lower a GlobalAddress node: rebuild it as a TargetGlobalAddress of pointer
+// type and wrap that in an NVPTXISD::Wrapper node, which is returned.
+SDValue
+NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  const GlobalValue *Global = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  auto PointerVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue TargetAddr = DAG.getTargetGlobalAddress(Global, Loc, PointerVT);
+  return DAG.getNode(NVPTXISD::Wrapper, Loc, PointerVT, TargetAddr);
+}
+
+// Build the PTX ".callprototype" declaration text for a call.
+//
+// The result has the shape
+//   prototype_<uniqueCallSite> : .callprototype (<return>) _ (<params>);
+// with "()" in place of the return part for a void return type.
+//
+//   DL           - data layout used for every size/alignment query here.
+//   retTy        - IR return type of the callee.
+//   Args         - IR-level argument list of the call.
+//   Outs         - lowered outgoing arguments; indexed separately from Args
+//                  because one aggregate/vector argument may occupy several
+//                  entries.
+//   retAlignment - alignment printed for a struct or vector return value.
+//   CS           - call site; its instruction is queried for per-argument
+//                  alignment of aggregate/vector arguments.
+//
+// Returns the prototype string, or an empty string when the ABI check fails
+// in a build without assertions.
+std::string NVPTXTargetLowering::getPrototype(
+    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
+    const ImmutableCallSite *CS) const {
+  auto PtrVT = getPointerTy(DL);
+
+  bool isABI = (STI.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return "";
+
+  std::stringstream O;
+  O << "prototype_" << uniqueCallSite << " : .callprototype ";
+
+  if (retTy->getTypeID() == Type::VoidTyID) {
+    O << "()";
+  } else {
+    O << "(";
+    if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
+      unsigned size = 0;
+      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
+        // Integer return values narrower than 32 bits are widened to 32.
+        size = ITy->getBitWidth();
+        if (size < 32)
+          size = 32;
+      } else {
+        assert(retTy->isFloatingPointTy() &&
+               "Floating point type expected here");
+        size = retTy->getPrimitiveSizeInBits();
+      }
+
+      O << ".param .b" << size << " _";
+    } else if (isa<PointerType>(retTy)) {
+      O << ".param .b" << PtrVT.getSizeInBits() << " _";
+    } else if ((retTy->getTypeID() == Type::StructTyID) ||
+               isa<VectorType>(retTy)) {
+      // Use the DataLayout supplied by the caller. Going through
+      // CS->getCalledFunction() is unsafe here: for an indirect call there is
+      // no directly called Function, so that pointer can be null.
+      O << ".param .align " << retAlignment << " .b8 _["
+        << DL.getTypeAllocSize(retTy) << "]";
+    } else {
+      llvm_unreachable("Unknown return type");
+    }
+    O << ") ";
+  }
+  O << "_ (";
+
+  bool first = true;
+
+  // OIdx walks Outs; it advances by more than one per IR argument when an
+  // aggregate/vector argument was split into several parts.
+  unsigned OIdx = 0;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+    Type *Ty = Args[i].Ty;
+    if (!first) {
+      O << ", ";
+    }
+    first = false;
+
+    if (!Outs[OIdx].Flags.isByVal()) {
+      if (Ty->isAggregateType() || Ty->isVectorTy()) {
+        unsigned align = 0;
+        const CallInst *CallI = cast<CallInst>(CS->getInstruction());
+        // +1 because index 0 is reserved for return type alignment
+        if (!llvm::getAlign(*CallI, i + 1, align))
+          align = DL.getABITypeAlignment(Ty);
+        unsigned sz = DL.getTypeAllocSize(Ty);
+        O << ".param .align " << align << " .b8 ";
+        O << "_";
+        O << "[" << sz << "]";
+        // update the index for Outs
+        SmallVector<EVT, 16> vtparts;
+        ComputeValueVTs(*this, DL, Ty, vtparts);
+        if (unsigned len = vtparts.size())
+          OIdx += len - 1;
+        continue;
+      }
+      // i8 types in IR will be i16 types in SDAG
+      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
+              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
+             "type mismatch between callee prototype and arguments");
+      // scalar type
+      unsigned sz = 0;
+      if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
+        // Integer parameters narrower than 32 bits are widened to 32.
+        sz = ITy->getBitWidth();
+        if (sz < 32)
+          sz = 32;
+      } else if (isa<PointerType>(Ty))
+        sz = PtrVT.getSizeInBits();
+      else
+        sz = Ty->getPrimitiveSizeInBits();
+      O << ".param .b" << sz << " ";
+      O << "_";
+      continue;
+    }
+    // byval argument: emitted as a byte array sized by the pointee type.
+    auto *PTy = dyn_cast<PointerType>(Ty);
+    assert(PTy && "Param with byval attribute should be a pointer type");
+    Type *ETy = PTy->getElementType();
+
+    unsigned align = Outs[OIdx].Flags.getByValAlign();
+    unsigned sz = DL.getTypeAllocSize(ETy);
+    O << ".param .align " << align << " .b8 ";
+    O << "_";
+    O << "[" << sz << "]";
+  }
+  O << ");";
+  return O.str();
+}
+
+// Determine the alignment to use for argument/return slot Idx of a call.
+//
+// Lookup order:
+//   1. With no call site, the ABI alignment of Ty.
+//   2. If the call site has no directly called function and its instruction
+//      is a CallInst, alignment info attached to that call instruction.
+//   3. Alignment info attached to the called Function, found either directly
+//      or by looking through constant cast expressions on the called value.
+//   4. Otherwise, the ABI alignment of Ty.
+unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
+                                                   const ImmutableCallSite *CS,
+                                                   Type *Ty, unsigned Idx,
+                                                   const DataLayout &DL) const {
+  // No call site available: only the type can tell us anything.
+  if (!CS)
+    return DL.getABITypeAlignment(Ty);
+
+  unsigned Align = 0;
+  const Value *Target = CS->getCalledFunction();
+
+  if (!Target) {
+    // No direct function symbol; this may just be due to constant cast
+    // expressions wrapped around the call target.
+    const Instruction *CallSiteInst = CS->getInstruction();
+    assert(CallSiteInst && "Call target is not a function or derived value?");
+
+    // With bitcast'd call targets, the instruction will be the call
+    if (const auto *Call = dyn_cast<CallInst>(CallSiteInst)) {
+      // Alignment info on the call instruction itself takes priority.
+      if (llvm::getAlign(*Call, Idx, Align))
+        return Align;
+
+      // Peel off constant cast expressions until something else is reached.
+      const Value *Peeled = Call->getCalledValue();
+      while (const auto *CE = dyn_cast<ConstantExpr>(Peeled)) {
+        if (!CE->isCast())
+          break;
+        Peeled = CE->getOperand(0);
+      }
+
+      // Only a Function is usable as the ultimate target.
+      if (isa<Function>(Peeled))
+        Target = Peeled;
+    }
+  }
+
+  // Consult alignment info recorded on the function, when one was found.
+  if (Target && llvm::getAlign(*cast<Function>(Target), Idx, Align))
+    return Align;
+
+  // Indirect call, or no alignment information: use the ABI type alignment.
+  return DL.getABITypeAlignment(Ty);
+}
+
+SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ ArgListTy &Args = CLI.getArgs();
+ Type *retTy = CLI.RetTy;
+ ImmutableCallSite *CS = CLI.CS;
+
+ bool isABI = (STI.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return Chain;
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *F = MF.getFunction();
+ auto &DL = MF.getDataLayout();
+
+ SDValue tempChain = Chain;
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getIntPtrConstant(uniqueCallSite, dl, true),
+ dl);
+ SDValue InFlag = Chain.getValue(1);
+
+ unsigned paramCount = 0;
+ // Args.size() and Outs.size() need not match.
+ // Outs.size() will be larger
+ // * if there is an aggregate argument with multiple fields (each field
+ // showing up separately in Outs)
+ // * if there is a vector argument with more than typical vector-length
+ // elements (generally if more than 4) where each vector element is
+ // individually present in Outs.
+ // So a different index should be used for indexing into Outs/OutVals.
+ // See similar issue in LowerFormalArguments.
+ unsigned OIdx = 0;
+ // Declare the .params or .reg need to pass values
+ // to the function
+ for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+ EVT VT = Outs[OIdx].VT;
+ Type *Ty = Args[i].Ty;
+
+ if (!Outs[OIdx].Flags.isByVal()) {
+ if (Ty->isAggregateType()) {
+ // aggregate
+ SmallVector<EVT, 16> vtparts;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
+ 0);
+
+ unsigned align =
+ getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+ // declare .param .align <align> .b8 .param<n>[<size>];
+ unsigned sz = DL.getTypeAllocSize(Ty);
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
+ MVT::i32),
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(sz, dl, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ InFlag = Chain.getValue(1);
+ for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+ EVT elemtype = vtparts[j];
+ unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
+ if (elemtype.isInteger() && (sz < 8))
+ sz = 8;
+ SDValue StVal = OutVals[OIdx];
+ if (elemtype.getSizeInBits() < 16) {
+ StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+ }
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(Offsets[j], dl, MVT::i32),
+ StVal, InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+ CopyParamVTs, CopyParamOps,
+ elemtype, MachinePointerInfo(),
+ ArgAlign);
+ InFlag = Chain.getValue(1);
+ ++OIdx;
+ }
+ if (vtparts.size() > 0)
+ --OIdx;
+ ++paramCount;
+ continue;
+ }
+ if (Ty->isVectorTy()) {
+ EVT ObjectVT = getValueType(DL, Ty);
+ unsigned align =
+ getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+ // declare .param .align <align> .b8 .param<n>[<size>];
+ unsigned sz = DL.getTypeAllocSize(Ty);
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareParamOps[] = { Chain,
+ DAG.getConstant(align, dl, MVT::i32),
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(sz, dl, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ InFlag = Chain.getValue(1);
+ unsigned NumElts = ObjectVT.getVectorNumElements();
+ EVT EltVT = ObjectVT.getVectorElementType();
+ EVT MemVT = EltVT;
+ bool NeedExtend = false;
+ if (EltVT.getSizeInBits() < 16) {
+ NeedExtend = true;
+ EltVT = MVT::i16;
+ }
+
+ // V1 store
+ if (NumElts == 1) {
+ SDValue Elt = OutVals[OIdx++];
+ if (NeedExtend)
+ Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
+
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), Elt,
+ InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+ CopyParamVTs, CopyParamOps,
+ MemVT, MachinePointerInfo());
+ InFlag = Chain.getValue(1);
+ } else if (NumElts == 2) {
+ SDValue Elt0 = OutVals[OIdx++];
+ SDValue Elt1 = OutVals[OIdx++];
+ if (NeedExtend) {
+ Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
+ Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
+ }
+
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), Elt0,
+ Elt1, InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
+ CopyParamVTs, CopyParamOps,
+ MemVT, MachinePointerInfo());
+ InFlag = Chain.getValue(1);
+ } else {
+ unsigned curOffset = 0;
+ // V4 stores
+ // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
+ // the
+ // vector will be expanded to a power of 2 elements, so we know we can
+ // always round up to the next multiple of 4 when creating the vector
+ // stores.
+ // e.g. 4 elem => 1 st.v4
+ // 6 elem => 2 st.v4
+ // 8 elem => 2 st.v4
+ // 11 elem => 3 st.v4
+ unsigned VecSize = 4;
+ if (EltVT.getSizeInBits() == 64)
+ VecSize = 2;
+
+ // This is potentially only part of a vector, so assume all elements
+ // are packed together.
+ unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
+
+ for (unsigned i = 0; i < NumElts; i += VecSize) {
+ // Get values
+ SDValue StoreVal;
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
+ Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
+
+ unsigned Opc = NVPTXISD::StoreParamV2;
+
+ StoreVal = OutVals[OIdx++];
+ if (NeedExtend)
+ StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ Ops.push_back(StoreVal);
+
+ if (i + 1 < NumElts) {
+ StoreVal = OutVals[OIdx++];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(EltVT);
+ }
+ Ops.push_back(StoreVal);
+
+ if (VecSize == 4) {
+ Opc = NVPTXISD::StoreParamV4;
+ if (i + 2 < NumElts) {
+ StoreVal = OutVals[OIdx++];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(EltVT);
+ }
+ Ops.push_back(StoreVal);
+
+ if (i + 3 < NumElts) {
+ StoreVal = OutVals[OIdx++];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(EltVT);
+ }
+ Ops.push_back(StoreVal);
+ }
+
+ Ops.push_back(InFlag);
+
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
+ MemVT, MachinePointerInfo());
+ InFlag = Chain.getValue(1);
+ curOffset += PerStoreOffset;
+ }
+ }
+ ++paramCount;
+ --OIdx;
+ continue;
+ }
+ // Plain scalar
+ // for ABI, declare .param .b<size> .param<n>;
+ unsigned sz = VT.getSizeInBits();
+ bool needExtend = false;
+ if (VT.isInteger()) {
+ if (sz < 16)
+ needExtend = true;
+ if (sz < 32)
+ sz = 32;
+ }
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareParamOps[] = { Chain,
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(sz, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ InFlag = Chain.getValue(1);
+ SDValue OutV = OutVals[OIdx];
+ if (needExtend) {
+ // zext/sext i1 to i16
+ unsigned opc = ISD::ZERO_EXTEND;
+ if (Outs[OIdx].Flags.isSExt())
+ opc = ISD::SIGN_EXTEND;
+ OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
+ }
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), OutV,
+ InFlag };
+
+ unsigned opcode = NVPTXISD::StoreParam;
+ if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32)
+ opcode = NVPTXISD::StoreParamU32;
+ else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32)
+ opcode = NVPTXISD::StoreParamS32;
+ Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
+ VT, MachinePointerInfo());
+
+ InFlag = Chain.getValue(1);
+ ++paramCount;
+ continue;
+ }
+ // struct or vector
+ SmallVector<EVT, 16> vtparts;
+ SmallVector<uint64_t, 16> Offsets;
+ auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
+ assert(PTy && "Type of a byval parameter should be pointer");
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
+ vtparts, &Offsets, 0);
+
+ // declare .param .align <align> .b8 .param<n>[<size>];
+ unsigned sz = Outs[OIdx].Flags.getByValSize();
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
+ // The ByValAlign in the Outs[OIdx].Flags is alway set at this point,
+ // so we don't need to worry about natural alignment or not.
+ // See TargetLowering::LowerCallTo().
+
+ // Enforce minumum alignment of 4 to work around ptxas miscompile
+ // for sm_50+. See corresponding alignment adjustment in
+ // emitFunctionParamList() for details.
+ if (ArgAlign < 4)
+ ArgAlign = 4;
+ SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(sz, dl, MVT::i32), InFlag};
+ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ InFlag = Chain.getValue(1);
+ for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+ EVT elemtype = vtparts[j];
+ int curOffset = Offsets[j];
+ unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
+ DAG.getConstant(curOffset, dl, PtrVT));
+ SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+ MachinePointerInfo(), PartAlign);
+ if (elemtype.getSizeInBits() < 16) {
+ theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
+ }
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(curOffset, dl, MVT::i32),
+ theVal, InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+ CopyParamOps, elemtype,
+ MachinePointerInfo());
+
+ InFlag = Chain.getValue(1);
+ }
+ ++paramCount;
+ }
+
+ GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+ unsigned retAlignment = 0;
+
+ // Handle Result
+ if (Ins.size() > 0) {
+ SmallVector<EVT, 16> resvtparts;
+ ComputeValueVTs(*this, DL, retTy, resvtparts);
+
+ // Declare
+ // .param .align 16 .b8 retval0[<size-in-bytes>], or
+ // .param .b<size-in-bits> retval0
+ unsigned resultsz = DL.getTypeAllocSizeInBits(retTy);
+ // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
+ // these three types to match the logic in
+ // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
+ // Plus, this behavior is consistent with nvcc's.
+ if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
+ retTy->isPointerTy()) {
+ // Scalar needs to be at least 32bit wide
+ if (resultsz < 32)
+ resultsz = 32;
+ SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(resultsz, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+ DeclareRetOps);
+ InFlag = Chain.getValue(1);
+ } else {
+ retAlignment = getArgumentAlignment(Callee, CS, retTy, 0, DL);
+ SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareRetOps[] = { Chain,
+ DAG.getConstant(retAlignment, dl, MVT::i32),
+ DAG.getConstant(resultsz / 8, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+ DeclareRetOps);
+ InFlag = Chain.getValue(1);
+ }
+ }
+
+ if (!Func) {
+ // This is indirect function call case : PTX requires a prototype of the
+ // form
+ // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
+ // to be emitted, and the label has to used as the last arg of call
+ // instruction.
+ // The prototype is embedded in a string and put as the operand for a
+ // CallPrototype SDNode which will print out to the value of the string.
+ SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ std::string Proto =
+ getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS);
+ const char *ProtoStr =
+ nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+ SDValue ProtoOps[] = {
+ Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
+ };
+ Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
+ InFlag = Chain.getValue(1);
+ }
+ // Op to just print "call"
+ SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue PrintCallOps[] = {
+ Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
+ };
+ // We model convergent calls as separate opcodes.
+ unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
+ if (CLI.IsConvergent)
+ Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
+ : NVPTXISD::PrintConvergentCall;
+ Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
+ InFlag = Chain.getValue(1);
+
+ // Ops to print out the function name
+ SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallVoidOps[] = { Chain, Callee, InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
+ InFlag = Chain.getValue(1);
+
+ // Ops to print out the param list
+ SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgBeginOps[] = { Chain, InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
+ CallArgBeginOps);
+ InFlag = Chain.getValue(1);
+
+ for (unsigned i = 0, e = paramCount; i != e; ++i) {
+ unsigned opcode;
+ if (i == (e - 1))
+ opcode = NVPTXISD::LastCallArg;
+ else
+ opcode = NVPTXISD::CallArg;
+ SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(i, dl, MVT::i32), InFlag };
+ Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
+ InFlag = Chain.getValue(1);
+ }
+ SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgEndOps[] = { Chain,
+ DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
+ InFlag = Chain.getValue(1);
+
+ if (!Func) {
+ SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue PrototypeOps[] = { Chain,
+ DAG.getConstant(uniqueCallSite, dl, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Generate loads from param memory/moves from registers for result
+ if (Ins.size() > 0) {
+ if (retTy && retTy->isVectorTy()) {
+ EVT ObjectVT = getValueType(DL, retTy);
+ unsigned NumElts = ObjectVT.getVectorNumElements();
+ EVT EltVT = ObjectVT.getVectorElementType();
+ assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
+ ObjectVT) == NumElts &&
+ "Vector was not scalarized");
+ unsigned sz = EltVT.getSizeInBits();
+ bool needTruncate = sz < 8;
+
+ if (NumElts == 1) {
+ // Just a simple load
+ SmallVector<EVT, 4> LoadRetVTs;
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading i1/i8 result, generate
+ // load.b8 i16
+ // if i1
+ // trunc i16 to i1
+ LoadRetVTs.push_back(MVT::i16);
+ } else
+ LoadRetVTs.push_back(EltVT);
+ LoadRetVTs.push_back(MVT::Other);
+ LoadRetVTs.push_back(MVT::Glue);
+ SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag};
+ SDValue retval = DAG.getMemIntrinsicNode(
+ NVPTXISD::LoadParam, dl,
+ DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
+ Chain = retval.getValue(1);
+ InFlag = retval.getValue(2);
+ SDValue Ret0 = retval;
+ if (needTruncate)
+ Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
+ InVals.push_back(Ret0);
+ } else if (NumElts == 2) {
+ // LoadV2
+ SmallVector<EVT, 4> LoadRetVTs;
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading i1/i8 result, generate
+ // load.b8 i16
+ // if i1
+ // trunc i16 to i1
+ LoadRetVTs.push_back(MVT::i16);
+ LoadRetVTs.push_back(MVT::i16);
+ } else {
+ LoadRetVTs.push_back(EltVT);
+ LoadRetVTs.push_back(EltVT);
+ }
+ LoadRetVTs.push_back(MVT::Other);
+ LoadRetVTs.push_back(MVT::Glue);
+ SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag};
+ SDValue retval = DAG.getMemIntrinsicNode(
+ NVPTXISD::LoadParamV2, dl,
+ DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
+ Chain = retval.getValue(2);
+ InFlag = retval.getValue(3);
+ SDValue Ret0 = retval.getValue(0);
+ SDValue Ret1 = retval.getValue(1);
+ if (needTruncate) {
+ Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
+ InVals.push_back(Ret0);
+ Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
+ InVals.push_back(Ret1);
+ } else {
+ InVals.push_back(Ret0);
+ InVals.push_back(Ret1);
+ }
+ } else {
+ // Split into N LoadV4
+ unsigned Ofst = 0;
+ unsigned VecSize = 4;
+ unsigned Opc = NVPTXISD::LoadParamV4;
+ if (EltVT.getSizeInBits() == 64) {
+ VecSize = 2;
+ Opc = NVPTXISD::LoadParamV2;
+ }
+ EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+ for (unsigned i = 0; i < NumElts; i += VecSize) {
+ SmallVector<EVT, 8> LoadRetVTs;
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading i1/i8 result, generate
+ // load.b8 i16
+ // if i1
+ // trunc i16 to i1
+ for (unsigned j = 0; j < VecSize; ++j)
+ LoadRetVTs.push_back(MVT::i16);
+ } else {
+ for (unsigned j = 0; j < VecSize; ++j)
+ LoadRetVTs.push_back(EltVT);
+ }
+ LoadRetVTs.push_back(MVT::Other);
+ LoadRetVTs.push_back(MVT::Glue);
+ SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(Ofst, dl, MVT::i32), InFlag};
+ SDValue retval = DAG.getMemIntrinsicNode(
+ Opc, dl, DAG.getVTList(LoadRetVTs),
+ LoadRetOps, EltVT, MachinePointerInfo());
+ if (VecSize == 2) {
+ Chain = retval.getValue(2);
+ InFlag = retval.getValue(3);
+ } else {
+ Chain = retval.getValue(4);
+ InFlag = retval.getValue(5);
+ }
+
+ for (unsigned j = 0; j < VecSize; ++j) {
+ if (i + j >= NumElts)
+ break;
+ SDValue Elt = retval.getValue(j);
+ if (needTruncate)
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ InVals.push_back(Elt);
+ }
+ Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+ }
+ }
+ } else {
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ auto &DL = DAG.getDataLayout();
+ ComputePTXValueVTs(*this, DL, retTy, VTs, &Offsets, 0);
+ assert(VTs.size() == Ins.size() && "Bad value decomposition");
+ unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0, DL);
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ unsigned sz = VTs[i].getSizeInBits();
+ unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
+ bool needTruncate = false;
+ if (VTs[i].isInteger() && sz < 8) {
+ sz = 8;
+ needTruncate = true;
+ }
+
+ SmallVector<EVT, 4> LoadRetVTs;
+ EVT TheLoadType = VTs[i];
+ if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) {
+ // This is for integer types only, and specifically not for
+ // aggregates.
+ LoadRetVTs.push_back(MVT::i32);
+ TheLoadType = MVT::i32;
+ needTruncate = true;
+ } else if (sz < 16) {
+ // If loading i1/i8 result, generate
+ // load i8 (-> i16)
+ // trunc i16 to i1/i8
+
+ // FIXME: Do we need to set needTruncate to true here, too? We could
+ // not figure out what this branch is for in D17872, so we left it
+ // alone. The comment above about loading i1/i8 may be wrong, as the
+ // branch above seems to cover integers of size < 32.
+ LoadRetVTs.push_back(MVT::i16);
+ } else
+ LoadRetVTs.push_back(Ins[i].VT);
+ LoadRetVTs.push_back(MVT::Other);
+ LoadRetVTs.push_back(MVT::Glue);
+
+ SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(Offsets[i], dl, MVT::i32),
+ InFlag};
+ SDValue retval = DAG.getMemIntrinsicNode(
+ NVPTXISD::LoadParam, dl,
+ DAG.getVTList(LoadRetVTs), LoadRetOps,
+ TheLoadType, MachinePointerInfo(), AlignI);
+ Chain = retval.getValue(1);
+ InFlag = retval.getValue(2);
+ SDValue Ret0 = retval.getValue(0);
+ if (needTruncate)
+ Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
+ InVals.push_back(Ret0);
+ }
+ }
+ }
+
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(uniqueCallSite, dl, true),
+ DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
+ true),
+ InFlag, dl);
+ uniqueCallSite++;
+
+ // set isTailCall to false for now, until we figure out how to express
+ // tail call optimization in PTX
+ isTailCall = false;
+ return Chain;
+}
+
+ // Custom lowering for CONCAT_VECTORS. The default lowering goes through
+ // ExpandVectorBuildThroughStack() (see LegalizeDAG.cpp), which is slow and
+ // uses local memory. Instead, every element of every operand is pulled out
+ // with EXTRACT_VECTOR_ELT and the result is rebuilt as one build vector,
+ // just as LegalizeOp() did in llvm 2.5.
+ SDValue
+ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *Concat = Op.getNode();
+   SDLoc Loc(Concat);
+   const unsigned PieceCount = Concat->getNumOperands();
+
+   // Scalar elements of the concatenated result, in operand order.
+   SmallVector<SDValue, 8> Scalars;
+   for (unsigned Piece = 0; Piece != PieceCount; ++Piece) {
+     SDValue Sub = Concat->getOperand(Piece);
+     EVT SubVT = Sub.getNode()->getValueType(0);
+     EVT ScalarVT = SubVT.getVectorElementType();
+     for (unsigned Lane = 0, LaneCount = SubVT.getVectorNumElements();
+          Lane != LaneCount; ++Lane) {
+       SDValue LaneIdx = DAG.getIntPtrConstant(Lane, Loc);
+       Scalars.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, ScalarVT,
+                                     Sub, LaneIdx));
+     }
+   }
+   return DAG.getBuildVector(Concat->getValueType(0), Loc, Scalars);
+ }
+
+ /// LowerShiftRightParts - Custom lowering for SRL_PARTS and SRA_PARTS.
+ /// The node takes a double-width value as its {Lo, Hi} parts plus a shift
+ /// amount and yields the shifted {Lo, Hi} parts. The parts are either
+ /// 1) two i32 values, or
+ /// 2) two i64 values.
+ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+   EVT PartVT = Op.getValueType();
+   unsigned PartBits = PartVT.getSizeInBits();
+   SDLoc Loc(Op);
+   SDValue InLo = Op.getOperand(0);
+   SDValue InHi = Op.getOperand(1);
+   SDValue Amt = Op.getOperand(2);
+
+   // The high part is shifted arithmetically for SRA_PARTS, logically for
+   // SRL_PARTS.
+   unsigned HiShiftOpc = ISD::SRL;
+   if (Op.getOpcode() == ISD::SRA_PARTS)
+     HiShiftOpc = ISD::SRA;
+
+   if (PartBits == 32 && STI.getSmVersion() >= 35) {
+     // 32-bit parts on sm35 and up can use the funnel shift 'shf'
+     // instruction:
+     //   {dHi, dLo} = {aHi, aLo} >> Amt
+     //   dHi = aHi >> Amt
+     //   dLo = shf.r.clamp aLo, aHi, Amt
+     SDValue OutHi = DAG.getNode(HiShiftOpc, Loc, PartVT, InHi, Amt);
+     SDValue OutLo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, Loc, PartVT, InLo,
+                                 InHi, Amt);
+
+     SDValue Parts[2] = { OutLo, OutHi };
+     return DAG.getMergeValues(Parts, Loc);
+   }
+
+   // Generic expansion:
+   //   {dHi, dLo} = {aHi, aLo} >> Amt
+   //   if (Amt >= size)
+   //     dLo = aHi >> (Amt - size)
+   //     dHi = aHi >> Amt            (either all 0 or all 1)
+   //   else
+   //     dLo = (aLo >>logic Amt) | (aHi << (size - Amt))
+   //     dHi = aHi >> Amt
+   SDValue Width = DAG.getConstant(PartBits, Loc, MVT::i32);
+   SDValue WidthMinusAmt = DAG.getNode(ISD::SUB, Loc, MVT::i32, Width, Amt);
+   SDValue LoShifted = DAG.getNode(ISD::SRL, Loc, PartVT, InLo, Amt);
+   SDValue AmtMinusWidth = DAG.getNode(ISD::SUB, Loc, MVT::i32, Amt, Width);
+   SDValue HiSpill = DAG.getNode(ISD::SHL, Loc, PartVT, InHi, WidthMinusAmt);
+   SDValue LoIfSmallAmt =
+       DAG.getNode(ISD::OR, Loc, PartVT, LoShifted, HiSpill);
+   SDValue LoIfLargeAmt =
+       DAG.getNode(HiShiftOpc, Loc, PartVT, InHi, AmtMinusWidth);
+
+   SDValue AmtIsLarge = DAG.getSetCC(Loc, MVT::i1, Amt, Width, ISD::SETGE);
+   SDValue OutHi = DAG.getNode(HiShiftOpc, Loc, PartVT, InHi, Amt);
+   SDValue OutLo = DAG.getNode(ISD::SELECT, Loc, PartVT, AmtIsLarge,
+                               LoIfLargeAmt, LoIfSmallAmt);
+
+   SDValue Parts[2] = { OutLo, OutHi };
+   return DAG.getMergeValues(Parts, Loc);
+ }
+
+ /// LowerShiftLeftParts - Custom lowering for SHL_PARTS.
+ /// The node takes a double-width value as its {Lo, Hi} parts plus a shift
+ /// amount and yields the shifted {Lo, Hi} parts. The parts are either
+ /// 1) two i32 values, or
+ /// 2) two i64 values.
+ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+   assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+   EVT PartVT = Op.getValueType();
+   unsigned PartBits = PartVT.getSizeInBits();
+   SDLoc Loc(Op);
+   SDValue InLo = Op.getOperand(0);
+   SDValue InHi = Op.getOperand(1);
+   SDValue Amt = Op.getOperand(2);
+
+   if (PartBits == 32 && STI.getSmVersion() >= 35) {
+     // 32-bit parts on sm35 and up can use the funnel shift 'shf'
+     // instruction:
+     //   {dHi, dLo} = {aHi, aLo} << Amt
+     //   dHi = shf.l.clamp aLo, aHi, Amt
+     //   dLo = aLo << Amt
+     SDValue OutHi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, Loc, PartVT, InLo,
+                                 InHi, Amt);
+     SDValue OutLo = DAG.getNode(ISD::SHL, Loc, PartVT, InLo, Amt);
+
+     SDValue Parts[2] = { OutLo, OutHi };
+     return DAG.getMergeValues(Parts, Loc);
+   }
+
+   // Generic expansion:
+   //   {dHi, dLo} = {aHi, aLo} << Amt
+   //   if (Amt >= size)
+   //     dLo = aLo << Amt            (all 0)
+   //     dHi = aLo << (Amt - size)
+   //   else
+   //     dLo = aLo << Amt
+   //     dHi = (aHi << Amt) | (aLo >> (size - Amt))
+   SDValue Width = DAG.getConstant(PartBits, Loc, MVT::i32);
+   SDValue WidthMinusAmt = DAG.getNode(ISD::SUB, Loc, MVT::i32, Width, Amt);
+   SDValue HiShifted = DAG.getNode(ISD::SHL, Loc, PartVT, InHi, Amt);
+   SDValue AmtMinusWidth = DAG.getNode(ISD::SUB, Loc, MVT::i32, Amt, Width);
+   SDValue LoSpill = DAG.getNode(ISD::SRL, Loc, PartVT, InLo, WidthMinusAmt);
+   SDValue HiIfSmallAmt =
+       DAG.getNode(ISD::OR, Loc, PartVT, HiShifted, LoSpill);
+   SDValue HiIfLargeAmt =
+       DAG.getNode(ISD::SHL, Loc, PartVT, InLo, AmtMinusWidth);
+
+   SDValue AmtIsLarge = DAG.getSetCC(Loc, MVT::i1, Amt, Width, ISD::SETGE);
+   SDValue OutLo = DAG.getNode(ISD::SHL, Loc, PartVT, InLo, Amt);
+   SDValue OutHi = DAG.getNode(ISD::SELECT, Loc, PartVT, AmtIsLarge,
+                               HiIfLargeAmt, HiIfSmallAmt);
+
+   SDValue Parts[2] = { OutLo, OutHi };
+   return DAG.getMergeValues(Parts, Loc);
+ }
+
+ // Entry point for custom lowering: routes the node to the handler for its
+ // opcode. Opcodes without a handler here hit llvm_unreachable.
+ SDValue
+ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+   switch (Op.getOpcode()) {
+   // These produce an empty SDValue.
+   case ISD::RETURNADDR:
+   case ISD::FRAMEADDR:
+     return SDValue();
+
+   // These are handed back unchanged.
+   case ISD::INTRINSIC_W_CHAIN:
+   case ISD::BUILD_VECTOR:
+   case ISD::EXTRACT_SUBVECTOR:
+     return Op;
+
+   // Everything else has a dedicated lowering routine.
+   case ISD::GlobalAddress:
+     return LowerGlobalAddress(Op, DAG);
+   case ISD::CONCAT_VECTORS:
+     return LowerCONCAT_VECTORS(Op, DAG);
+   case ISD::STORE:
+     return LowerSTORE(Op, DAG);
+   case ISD::LOAD:
+     return LowerLOAD(Op, DAG);
+   case ISD::SHL_PARTS:
+     return LowerShiftLeftParts(Op, DAG);
+   case ISD::SRA_PARTS:
+   case ISD::SRL_PARTS:
+     return LowerShiftRightParts(Op, DAG);
+   case ISD::SELECT:
+     return LowerSelect(Op, DAG);
+
+   default:
+     llvm_unreachable("Custom lowering not defined for operation");
+   }
+ }
+
+ // Custom lowering for a SELECT producing i1: both value operands are
+ // any-extended to i32, the select is done on the i32 values, and the chosen
+ // value is truncated back to i1.
+ SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
+   SDValue Cond = Op->getOperand(0);
+   SDValue IfTrue = Op->getOperand(1);
+   SDValue IfFalse = Op->getOperand(2);
+   SDLoc Loc(Op.getNode());
+
+   assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
+
+   SDValue WideTrue = DAG.getNode(ISD::ANY_EXTEND, Loc, MVT::i32, IfTrue);
+   SDValue WideFalse = DAG.getNode(ISD::ANY_EXTEND, Loc, MVT::i32, IfFalse);
+   SDValue WideSelect =
+       DAG.getNode(ISD::SELECT, Loc, MVT::i32, Cond, WideTrue, WideFalse);
+   return DAG.getNode(ISD::TRUNCATE, Loc, MVT::i1, WideSelect);
+ }
+
+ // Custom lowering for LOAD. Only loads whose result type is i1 are rewritten
+ // (via LowerLOADi1); for every other type an empty SDValue is returned.
+ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+   const bool LoadsI1 = Op.getValueType() == MVT::i1;
+   if (!LoadsI1)
+     return SDValue();
+   return LowerLOADi1(Op, DAG);
+ }
+
+ // Custom lowering for a non-extending load of i1:
+ //
+ // v = ld i1* addr
+ // =>
+ // v1 = zextload i8* addr (-> i16)
+ // v = trunc i16 to i1
+ //
+ // Returns a MergeValues node carrying {loaded i1 value, output chain of the
+ // new load}.
+ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *Node = Op.getNode();
+   LoadSDNode *LD = cast<LoadSDNode>(Node);
+   SDLoc dl(Node);
+   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
+   assert(Node->getValueType(0) == MVT::i1 &&
+          "Custom lowering for i1 load only");
+   // Read exactly one byte from memory and zero-extend it into an i16 value.
+   // A plain i16 load would access two bytes at an address that only holds
+   // the single-byte i1; this also mirrors the i8 truncating store emitted by
+   // LowerSTOREi1.
+   SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
+                                  LD->getBasePtr(), LD->getPointerInfo(),
+                                  MVT::i8, LD->getAlignment(),
+                                  LD->getMemOperand()->getFlags());
+   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
+   // The legalizer (the caller) is expecting two values from the legalized
+   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
+   // in LegalizeDAG.cpp which also uses MergeValues. The second value must be
+   // the new load's output chain (not the original load's input chain), so
+   // that users of the chain stay ordered after the memory access.
+   SDValue Ops[] = { result, newLD.getValue(1) };
+   return DAG.getMergeValues(Ops, dl);
+ }
+
+ // Custom lowering for STORE, selected by the type of the stored value
+ // (operand 1): i1 goes to LowerSTOREi1, vectors go to LowerSTOREVector, and
+ // for anything else an empty SDValue is returned.
+ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+   EVT StoredVT = Op.getOperand(1).getValueType();
+   if (StoredVT == MVT::i1)
+     return LowerSTOREi1(Op, DAG);
+   return StoredVT.isVector() ? LowerSTOREVector(Op, DAG) : SDValue();
+ }
+
+ // Custom lowering for a store of a vector value. A sufficiently aligned
+ // store of a "native" 2- or 4-element vector is turned into a single
+ // NVPTXISD::StoreV2 / StoreV4 node whose operands are the individual
+ // elements. In every other case an empty SDValue is returned.
+ SDValue
+ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *Store = Op.getNode();
+   SDValue Stored = Store->getOperand(1);
+   SDLoc Loc(Store);
+   EVT StoredVT = Stored.getValueType();
+
+   if (!StoredVT.isVector())
+     return SDValue();
+
+   // Only "native" vector sizes are handled for now, e.g. <4 x double> is not
+   // legal. Such a store can (and should) be split into 2 stores of
+   // <2 x double> here, but that is left as a TODO.
+   if (!StoredVT.isSimple())
+     return SDValue();
+   switch (StoredVT.getSimpleVT().SimpleTy) {
+   case MVT::v2i8:
+   case MVT::v2i16:
+   case MVT::v2i32:
+   case MVT::v2i64:
+   case MVT::v2f32:
+   case MVT::v2f64:
+   case MVT::v4i8:
+   case MVT::v4i16:
+   case MVT::v4i32:
+   case MVT::v4f32:
+     // A "native" vector type; carry on.
+     break;
+   default:
+     return SDValue();
+   }
+
+   MemSDNode *Mem = cast<MemSDNode>(Store);
+   const DataLayout &Layout = DAG.getDataLayout();
+
+   // An under-aligned store is rejected here so that it gets scalarized.
+   // Smaller vector stores may still be emitted: a <4 x float> store with an
+   // alignment of 8 fails this check, but the legalizer will try again with
+   // 2 x <2 x float>, which succeeds with an alignment of 8.
+   unsigned ActualAlign = Mem->getAlignment();
+   unsigned WantedAlign =
+       Layout.getPrefTypeAlignment(StoredVT.getTypeForEVT(*DAG.getContext()));
+   if (ActualAlign < WantedAlign)
+     return SDValue();
+
+   EVT ScalarVT = StoredVT.getVectorElementType();
+   unsigned LaneCount = StoredVT.getVectorNumElements();
+
+   // StoreV2/StoreV4 are target nodes, so DAG type legalization cannot be
+   // relied on and the operand types must already be legal. i1 and i8
+   // elements are stored as i16, with the "real" type propagated as the
+   // memory type.
+   const bool WidenLanes = ScalarVT.getSizeInBits() < 16;
+
+   unsigned StoreOpc;
+   if (LaneCount == 2)
+     StoreOpc = NVPTXISD::StoreV2;
+   else if (LaneCount == 4)
+     StoreOpc = NVPTXISD::StoreV4;
+   else
+     return SDValue();
+
+   // Operand list: chain, one operand per element, then whatever followed the
+   // stored value on the original node.
+   SmallVector<SDValue, 8> Operands;
+   Operands.push_back(Store->getOperand(0));
+   for (unsigned Lane = 0; Lane != LaneCount; ++Lane) {
+     SDValue LaneIdx = DAG.getIntPtrConstant(Lane, Loc);
+     SDValue LaneVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, ScalarVT,
+                                   Stored, LaneIdx);
+     if (WidenLanes)
+       LaneVal = DAG.getNode(ISD::ANY_EXTEND, Loc, MVT::i16, LaneVal);
+     Operands.push_back(LaneVal);
+   }
+   Operands.append(Store->op_begin() + 2, Store->op_end());
+
+   return DAG.getMemIntrinsicNode(StoreOpc, Loc, DAG.getVTList(MVT::Other),
+                                  Operands, Mem->getMemoryVT(),
+                                  Mem->getMemOperand());
+ }
+
+ // Custom lowering for a store of i1: the value is zero-extended to i16 and
+ // written with a truncating store whose memory type is i8.
+ //
+ // st i1 v, addr
+ // =>
+ // v1 = zxt v to i16
+ // st.u8 i16, addr
+ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *Node = Op.getNode();
+   SDLoc Loc(Node);
+   StoreSDNode *Store = cast<StoreSDNode>(Node);
+
+   SDValue Bit = Store->getValue();
+   assert(Bit.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
+   SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, Loc, MVT::i16, Bit);
+
+   return DAG.getTruncStore(Store->getChain(), Loc, Widened,
+                            Store->getBasePtr(), Store->getPointerInfo(),
+                            MVT::i8, Store->getAlignment(),
+                            Store->getMemOperand()->getFlags());
+ }
+
+ // Returns a TargetExternalSymbol of type v named
+ // "<machine function name>_param_<idx>". The name is handed to the target
+ // machine's managed string pool, and the symbol refers to the pooled copy.
+ SDValue
+ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
+   std::string Buffer;
+   raw_string_ostream OS(Buffer);
+   OS << DAG.getMachineFunction().getName() << "_param_" << idx;
+
+   // str() flushes the stream into Buffer before returning it.
+   const std::string &SymName = OS.str();
+   std::string *Pooled =
+       nvTM->getManagedStrPool()->getManagedString(SymName.c_str());
+   return DAG.getTargetExternalSymbol(Pooled->c_str(), v);
+ }
+
+ // Returns true when the value is a pointer to one of the named struct types
+ // used for image*_t / sampler_t kernel arguments. Returns false when no
+ // module is supplied, when the value is not a pointer, or when the pointee
+ // is not a named (non-literal) struct.
+ static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
+   static const char *const specialTypes[] = { "struct._image2d_t",
+                                               "struct._image3d_t",
+                                               "struct._sampler_t" };
+
+   auto *PtrTy = dyn_cast<PointerType>(arg->getType());
+   if (!PtrTy || !context)
+     return false;
+
+   auto *PointeeStruct = dyn_cast<StructType>(PtrTy->getElementType());
+   if (!PointeeStruct || PointeeStruct->isLiteral())
+     return false;
+
+   for (const char *Special : specialTypes)
+     if (Special == PointeeStruct->getName())
+       return true;
+   return false;
+ }
+
+ SDValue NVPTXTargetLowering::LowerFormalArguments( // Creates the values of the current function's incoming arguments.
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // Chain is returned unchanged; CallConv and isVarArg are not read.
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, // Ins: the pieces the arguments were split into; dl: location for new nodes.
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { // InVals (out): receives the created values, in Ins order.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const DataLayout &DL = DAG.getDataLayout();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ const Function *F = MF.getFunction();
+ const AttributeSet &PAL = F->getAttributes();
+ const TargetLowering *TLI = STI.getTargetLowering();
+
+ SDValue Root = DAG.getRoot(); // used as the chain operand of every param load created below
+ std::vector<SDValue> OutChains; // NOTE(review): nothing in this function appends to OutChains
+
+ bool isABI = (STI.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI) // with asserts compiled out, give up and hand back Chain without filling InVals
+ return Chain;
+
+ std::vector<Type *> argTypes;
+ std::vector<const Argument *> theArgs;
+ for (const Argument &I : F->args()) {
+ theArgs.push_back(&I);
+ argTypes.push_back(I.getType());
+ }
+ // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
+ // Ins.size() will be larger
+ // * if there is an aggregate argument with multiple fields (each field
+ // showing up separately in Ins)
+ // * if there is a vector argument with more than typical vector-length
+ // elements (generally if more than 4) where each vector element is
+ // individually present in Ins.
+ // So a different index should be used for indexing into Ins.
+ // See similar issue in LowerCall.
+ unsigned InsIdx = 0;
+
+ int idx = 0; // advances in lockstep with i below
+ for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { // ++InsIdx also runs on every 'continue'; multi-piece paths below undo one step first
+ Type *Ty = argTypes[i];
+
+ // If the kernel argument is image*_t or sampler_t, convert it to
+ // a i32 constant holding the parameter position. This can later be
+ // matched in the AsmPrinter to output the correct mangled name.
+ if (isImageOrSamplerVal(
+ theArgs[i],
+ (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
+ : nullptr))) {
+ assert(llvm::isKernelFunction(*F) &&
+ "Only kernels can have image/sampler params");
+ InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); // the position stored is 1-based
+ continue;
+ }
+
+ if (theArgs[i]->use_empty()) {
+ // argument is dead: emit one UNDEF per Ins piece instead of loading it
+ if (Ty->isAggregateType()) {
+ SmallVector<EVT, 16> vtparts;
+
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
+ assert(vtparts.size() > 0 && "empty aggregate type not expected");
+ for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+ ++parti) {
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+ ++InsIdx;
+ }
+ if (vtparts.size() > 0)
+ --InsIdx; // compensate for the ++InsIdx in the loop header
+ continue;
+ }
+ if (Ty->isVectorTy()) {
+ EVT ObjectVT = getValueType(DL, Ty);
+ unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
+ for (unsigned parti = 0; parti < NumRegs; ++parti) {
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+ ++InsIdx;
+ }
+ if (NumRegs > 0)
+ --InsIdx; // compensate for the ++InsIdx in the loop header
+ continue;
+ }
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+ continue;
+ }
+
+ // In the following cases, assign a node order of "idx+1"
+ // to newly created nodes. The SDNodes for params have to
+ // appear in the same order as their order of appearance
+ // in the original function. "idx+1" holds that order.
+ if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
+ if (Ty->isAggregateType()) {
+ SmallVector<EVT, 16> vtparts;
+ SmallVector<uint64_t, 16> offsets;
+
+ // NOTE: Here, we lose the ability to issue vector loads for vectors
+ // that are a part of a struct. This should be investigated in the
+ // future.
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets,
+ 0);
+ assert(vtparts.size() > 0 && "empty aggregate type not expected");
+ bool aggregateIsPacked = false;
+ if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
+ aggregateIsPacked = STy->isPacked();
+
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+ for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+ ++parti) { // one load per part, at that part's offset from the param symbol
+ EVT partVT = vtparts[parti];
+ Value *srcValue = Constant::getNullValue(
+ PointerType::get(partVT.getTypeForEVT(F->getContext()),
+ llvm::ADDRESS_SPACE_PARAM));
+ SDValue srcAddr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+ DAG.getConstant(offsets[parti], dl, PtrVT));
+ unsigned partAlign = aggregateIsPacked
+ ? 1
+ : DL.getABITypeAlignment(
+ partVT.getTypeForEVT(F->getContext()));
+ SDValue p;
+ if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) { // Ins piece is wider than the part: use an extending load
+ ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
+ ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
+ MachinePointerInfo(srcValue), partVT, partAlign);
+ } else {
+ p = DAG.getLoad(partVT, dl, Root, srcAddr,
+ MachinePointerInfo(srcValue), partAlign);
+ }
+ if (p.getNode())
+ p.getNode()->setIROrder(idx + 1);
+ InVals.push_back(p);
+ ++InsIdx;
+ }
+ if (vtparts.size() > 0)
+ --InsIdx; // compensate for the ++InsIdx in the loop header
+ continue;
+ }
+ if (Ty->isVectorTy()) {
+ EVT ObjectVT = getValueType(DL, Ty);
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+ unsigned NumElts = ObjectVT.getVectorNumElements();
+ assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
+ "Vector was not scalarized");
+ EVT EltVT = ObjectVT.getVectorElementType();
+
+ // V1 load
+ // f32 = load ...
+ if (NumElts == 1) {
+ // We only have one element, so just directly load it
+ Value *SrcValue = Constant::getNullValue(PointerType::get(
+ EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+ SDValue P = DAG.getLoad(
+ EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
+ DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())),
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+ if (P.getNode())
+ P.getNode()->setIROrder(idx + 1);
+
+ if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
+ P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
+ InVals.push_back(P);
+ ++InsIdx;
+ } else if (NumElts == 2) {
+ // V2 load
+ // f32,f32 = load ...
+ EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
+ Value *SrcValue = Constant::getNullValue(PointerType::get(
+ VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+ SDValue P = DAG.getLoad(
+ VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
+ DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+ if (P.getNode())
+ P.getNode()->setIROrder(idx + 1);
+
+ SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+ DAG.getIntPtrConstant(1, dl));
+
+ if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
+ Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
+ Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
+ }
+
+ InVals.push_back(Elt0);
+ InVals.push_back(Elt1);
+ InsIdx += 2;
+ } else {
+ // V4 loads
+ // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
+ // the vector will be expanded to a power of 2 elements, so we know we
+ // can always round up to the next multiple of 4 when creating the
+ // vector loads.
+ // e.g. 4 elem => 1 ld.v4
+ // 6 elem => 2 ld.v4
+ // 8 elem => 2 ld.v4
+ // 11 elem => 3 ld.v4
+ unsigned VecSize = 4;
+ if (EltVT.getSizeInBits() == 64) { // 64-bit elements are loaded two at a time instead of four
+ VecSize = 2;
+ }
+ EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+ unsigned Ofst = 0;
+ for (unsigned i = 0; i < NumElts; i += VecSize) { // NOTE: this 'i' shadows the argument index 'i' of the enclosing loop
+ Value *SrcValue = Constant::getNullValue(
+ PointerType::get(VecVT.getTypeForEVT(F->getContext()),
+ llvm::ADDRESS_SPACE_PARAM));
+ SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+ DAG.getConstant(Ofst, dl, PtrVT));
+ SDValue P = DAG.getLoad(
+ VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue),
+ DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+ if (P.getNode())
+ P.getNode()->setIROrder(idx + 1);
+
+ for (unsigned j = 0; j < VecSize; ++j) {
+ if (i + j >= NumElts) // the last chunk may extend past the real element count
+ break;
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+ DAG.getIntPtrConstant(j, dl));
+ if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) // InsIdx is not advanced per element; it is bumped by NumElts after the loop
+ Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
+ InVals.push_back(Elt);
+ }
+ Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+ }
+ InsIdx += NumElts;
+ }
+
+ if (NumElts > 0)
+ --InsIdx; // compensate for the ++InsIdx in the loop header
+ continue;
+ }
+ // A plain scalar.
+ EVT ObjectVT = getValueType(DL, Ty);
+ // If ABI, load from the param symbol
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+ Value *srcValue = Constant::getNullValue(PointerType::get(
+ ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+ SDValue p;
+ if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) { // Ins piece is wider than the IR type: use an extending load
+ ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
+ ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ p = DAG.getExtLoad(
+ ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
+ ObjectVT,
+ DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+ } else {
+ p = DAG.getLoad(
+ Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue),
+ DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+ }
+ if (p.getNode())
+ p.getNode()->setIROrder(idx + 1);
+ InVals.push_back(p);
+ continue;
+ }
+
+ // Param has ByVal attribute
+ // Return MoveParam(param symbol).
+ // Ideally, the param symbol can be returned directly,
+ // but when SDNode builder decides to use it in a CopyToReg(),
+ // machine instruction fails because TargetExternalSymbol
+ // (not lowered) is target dependent, and CopyToReg assumes
+ // the source is lowered.
+ EVT ObjectVT = getValueType(DL, Ty);
+ assert(ObjectVT == Ins[InsIdx].VT &&
+ "Ins type did not match function type");
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+ SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+ if (p.getNode())
+ p.getNode()->setIROrder(idx + 1);
+ InVals.push_back(p);
+ }
+
+ // Clang will check explicit VarArg and issue error if any. However, Clang
+ // will let code with
+ // implicit var arg like f() pass. See bug 617733.
+ // We treat this case as if the arg list is empty.
+ // if (F.isVarArg()) {
+ // assert(0 && "VarArg not supported yet!");
+ //}
+
+ if (!OutChains.empty()) // NOTE(review): OutChains is never filled above, so this branch looks unreachable
+ DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
+
+ return Chain;
+ }
+
+SDValue
+NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *F = MF.getFunction();
+ Type *RetTy = F->getReturnType();
+ const DataLayout &TD = DAG.getDataLayout();
+
+ bool isABI = (STI.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return Chain;
+
+ if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
+ // If we have a vector type, the OutVals array will be the scalarized
+ // components and we have to combine them into 1 or more vector stores.
+ unsigned NumElts = VTy->getNumElements();
+ assert(NumElts == Outs.size() && "Bad scalarization of return value");
+
+ // const_cast can be removed in later LLVM versions
+ EVT EltVT = getValueType(TD, RetTy).getVectorElementType();
+ bool NeedExtend = false;
+ if (EltVT.getSizeInBits() < 16)
+ NeedExtend = true;
+
+ // V1 store
+ if (NumElts == 1) {
+ SDValue StoreVal = OutVals[0];
+ // We only have one element, so just directly store it
+ if (NeedExtend)
+ StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ EltVT, MachinePointerInfo());
+
+ } else if (NumElts == 2) {
+ // V2 store
+ SDValue StoreVal0 = OutVals[0];
+ SDValue StoreVal1 = OutVals[1];
+
+ if (NeedExtend) {
+ StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
+ StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
+ }
+
+ SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0,
+ StoreVal1 };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ EltVT, MachinePointerInfo());
+ } else {
+ // V4 stores
+ // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
+ // vector will be expanded to a power of 2 elements, so we know we can
+ // always round up to the next multiple of 4 when creating the vector
+ // stores.
+ // e.g. 4 elem => 1 st.v4
+ // 6 elem => 2 st.v4
+ // 8 elem => 2 st.v4
+ // 11 elem => 3 st.v4
+
+ unsigned VecSize = 4;
+ if (OutVals[0].getValueSizeInBits() == 64)
+ VecSize = 2;
+
+ unsigned Offset = 0;
+
+ EVT VecVT =
+ EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+ unsigned PerStoreOffset =
+ TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+
+ for (unsigned i = 0; i < NumElts; i += VecSize) {
+ // Get values
+ SDValue StoreVal;
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32));
+ unsigned Opc = NVPTXISD::StoreRetvalV2;
+ EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
+
+ StoreVal = OutVals[i];
+ if (NeedExtend)
+ StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+ Ops.push_back(StoreVal);
+
+ if (i + 1 < NumElts) {
+ StoreVal = OutVals[i + 1];
+ if (NeedExtend)
+ StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(ExtendedVT);
+ }
+ Ops.push_back(StoreVal);
+
+ if (VecSize == 4) {
+ Opc = NVPTXISD::StoreRetvalV4;
+ if (i + 2 < NumElts) {
+ StoreVal = OutVals[i + 2];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(ExtendedVT);
+ }
+ Ops.push_back(StoreVal);
+
+ if (i + 3 < NumElts) {
+ StoreVal = OutVals[i + 3];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(ExtendedVT);
+ }
+ Ops.push_back(StoreVal);
+ }
+
+ // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
+ Chain =
+ DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
+ EltVT, MachinePointerInfo());
+ Offset += PerStoreOffset;
+ }
+ }
+ } else {
+ SmallVector<EVT, 16> ValVTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0);
+ assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
+
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ SDValue theVal = OutVals[i];
+ EVT TheValType = theVal.getValueType();
+ unsigned numElems = 1;
+ if (TheValType.isVector())
+ numElems = TheValType.getVectorNumElements();
+ for (unsigned j = 0, je = numElems; j != je; ++j) {
+ SDValue TmpVal = theVal;
+ if (TheValType.isVector())
+ TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ TheValType.getVectorElementType(), TmpVal,
+ DAG.getIntPtrConstant(j, dl));
+ EVT TheStoreType = ValVTs[i];
+ if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) {
+ // The following zero-extension is for integer types only, and
+ // specifically not for aggregates.
+ TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
+ TheStoreType = MVT::i32;
+ }
+ else if (TmpVal.getValueSizeInBits() < 16)
+ TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
+
+ SDValue Ops[] = {
+ Chain,
+ DAG.getConstant(Offsets[i], dl, MVT::i32),
+ TmpVal };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ TheStoreType,
+ MachinePointerInfo());
+ }
+ }
+ }
+
+ return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+
+// Constraints longer than one character are left untouched here (nothing is
+// appended to Ops); everything else is delegated to the generic
+// TargetLowering implementation.
+void NVPTXTargetLowering::LowerAsmOperandForConstraint(
+ SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ const bool IsMultiCharConstraint = Constraint.length() > 1;
+ if (!IsMultiCharConstraint)
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+// Map an nvvm tex/tld4 intrinsic ID to its NVPTXISD opcode; returns 0 for any other intrinsic.
+static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
+ switch (Intrinsic) {
+ default:
+ return 0;
+
+ case Intrinsic::nvvm_tex_1d_v4f32_s32:
+ return NVPTXISD::Tex1DFloatS32;
+ case Intrinsic::nvvm_tex_1d_v4f32_f32:
+ return NVPTXISD::Tex1DFloatFloat;
+ case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+ return NVPTXISD::Tex1DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+ return NVPTXISD::Tex1DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_1d_v4s32_s32:
+ return NVPTXISD::Tex1DS32S32;
+ case Intrinsic::nvvm_tex_1d_v4s32_f32:
+ return NVPTXISD::Tex1DS32Float;
+ case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+ return NVPTXISD::Tex1DS32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+ return NVPTXISD::Tex1DS32FloatGrad;
+ case Intrinsic::nvvm_tex_1d_v4u32_s32:
+ return NVPTXISD::Tex1DU32S32;
+ case Intrinsic::nvvm_tex_1d_v4u32_f32:
+ return NVPTXISD::Tex1DU32Float;
+ case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+ return NVPTXISD::Tex1DU32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+ return NVPTXISD::Tex1DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+ return NVPTXISD::Tex1DArrayFloatS32;
+ case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+ return NVPTXISD::Tex1DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+ return NVPTXISD::Tex1DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+ return NVPTXISD::Tex1DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+ return NVPTXISD::Tex1DArrayS32S32;
+ case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+ return NVPTXISD::Tex1DArrayS32Float;
+ case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+ return NVPTXISD::Tex1DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+ return NVPTXISD::Tex1DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+ return NVPTXISD::Tex1DArrayU32S32;
+ case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+ return NVPTXISD::Tex1DArrayU32Float;
+ case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+ return NVPTXISD::Tex1DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+ return NVPTXISD::Tex1DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_2d_v4f32_s32:
+ return NVPTXISD::Tex2DFloatS32;
+ case Intrinsic::nvvm_tex_2d_v4f32_f32:
+ return NVPTXISD::Tex2DFloatFloat;
+ case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+ return NVPTXISD::Tex2DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+ return NVPTXISD::Tex2DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_2d_v4s32_s32:
+ return NVPTXISD::Tex2DS32S32;
+ case Intrinsic::nvvm_tex_2d_v4s32_f32:
+ return NVPTXISD::Tex2DS32Float;
+ case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+ return NVPTXISD::Tex2DS32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+ return NVPTXISD::Tex2DS32FloatGrad;
+ case Intrinsic::nvvm_tex_2d_v4u32_s32:
+ return NVPTXISD::Tex2DU32S32;
+ case Intrinsic::nvvm_tex_2d_v4u32_f32:
+ return NVPTXISD::Tex2DU32Float;
+ case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+ return NVPTXISD::Tex2DU32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+ return NVPTXISD::Tex2DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+ return NVPTXISD::Tex2DArrayFloatS32;
+ case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+ return NVPTXISD::Tex2DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+ return NVPTXISD::Tex2DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+ return NVPTXISD::Tex2DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+ return NVPTXISD::Tex2DArrayS32S32;
+ case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+ return NVPTXISD::Tex2DArrayS32Float;
+ case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+ return NVPTXISD::Tex2DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+ return NVPTXISD::Tex2DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+ return NVPTXISD::Tex2DArrayU32S32;
+ case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+ return NVPTXISD::Tex2DArrayU32Float;
+ case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+ return NVPTXISD::Tex2DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+ return NVPTXISD::Tex2DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_3d_v4f32_s32:
+ return NVPTXISD::Tex3DFloatS32;
+ case Intrinsic::nvvm_tex_3d_v4f32_f32:
+ return NVPTXISD::Tex3DFloatFloat;
+ case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+ return NVPTXISD::Tex3DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+ return NVPTXISD::Tex3DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_3d_v4s32_s32:
+ return NVPTXISD::Tex3DS32S32;
+ case Intrinsic::nvvm_tex_3d_v4s32_f32:
+ return NVPTXISD::Tex3DS32Float;
+ case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+ return NVPTXISD::Tex3DS32FloatLevel;
+ case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+ return NVPTXISD::Tex3DS32FloatGrad;
+ case Intrinsic::nvvm_tex_3d_v4u32_s32:
+ return NVPTXISD::Tex3DU32S32;
+ case Intrinsic::nvvm_tex_3d_v4u32_f32:
+ return NVPTXISD::Tex3DU32Float;
+ case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+ return NVPTXISD::Tex3DU32FloatLevel;
+ case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+ return NVPTXISD::Tex3DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_cube_v4f32_f32:
+ return NVPTXISD::TexCubeFloatFloat;
+ case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+ return NVPTXISD::TexCubeFloatFloatLevel;
+ case Intrinsic::nvvm_tex_cube_v4s32_f32:
+ return NVPTXISD::TexCubeS32Float;
+ case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+ return NVPTXISD::TexCubeS32FloatLevel;
+ case Intrinsic::nvvm_tex_cube_v4u32_f32:
+ return NVPTXISD::TexCubeU32Float;
+ case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+ return NVPTXISD::TexCubeU32FloatLevel;
+
+ case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+ return NVPTXISD::TexCubeArrayFloatFloat;
+ case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+ return NVPTXISD::TexCubeArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+ return NVPTXISD::TexCubeArrayS32Float;
+ case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+ return NVPTXISD::TexCubeArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+ return NVPTXISD::TexCubeArrayU32Float;
+ case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+ return NVPTXISD::TexCubeArrayU32FloatLevel;
+ // NOTE(review): the v4s32/v4u32 tld4 intrinsics below map to opcodes named S64/U64 - confirm naming.
+ case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+ return NVPTXISD::Tld4R2DFloatFloat;
+ case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+ return NVPTXISD::Tld4G2DFloatFloat;
+ case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+ return NVPTXISD::Tld4B2DFloatFloat;
+ case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+ return NVPTXISD::Tld4A2DFloatFloat;
+ case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+ return NVPTXISD::Tld4R2DS64Float;
+ case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+ return NVPTXISD::Tld4G2DS64Float;
+ case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+ return NVPTXISD::Tld4B2DS64Float;
+ case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+ return NVPTXISD::Tld4A2DS64Float;
+ case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+ return NVPTXISD::Tld4R2DU64Float;
+ case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+ return NVPTXISD::Tld4G2DU64Float;
+ case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+ return NVPTXISD::Tld4B2DU64Float;
+ case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+ return NVPTXISD::Tld4A2DU64Float;
+
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+ return NVPTXISD::TexUnified1DFloatS32;
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+ return NVPTXISD::TexUnified1DFloatFloat;
+ case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+ return NVPTXISD::TexUnified1DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+ return NVPTXISD::TexUnified1DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+ return NVPTXISD::TexUnified1DS32S32;
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+ return NVPTXISD::TexUnified1DS32Float;
+ case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+ return NVPTXISD::TexUnified1DS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+ return NVPTXISD::TexUnified1DS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+ return NVPTXISD::TexUnified1DU32S32;
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+ return NVPTXISD::TexUnified1DU32Float;
+ case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+ return NVPTXISD::TexUnified1DU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+ return NVPTXISD::TexUnified1DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+ return NVPTXISD::TexUnified1DArrayFloatS32;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+ return NVPTXISD::TexUnified1DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+ return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+ return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+ return NVPTXISD::TexUnified1DArrayS32S32;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+ return NVPTXISD::TexUnified1DArrayS32Float;
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+ return NVPTXISD::TexUnified1DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+ return NVPTXISD::TexUnified1DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+ return NVPTXISD::TexUnified1DArrayU32S32;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+ return NVPTXISD::TexUnified1DArrayU32Float;
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+ return NVPTXISD::TexUnified1DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+ return NVPTXISD::TexUnified1DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+ return NVPTXISD::TexUnified2DFloatS32;
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+ return NVPTXISD::TexUnified2DFloatFloat;
+ case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+ return NVPTXISD::TexUnified2DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+ return NVPTXISD::TexUnified2DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+ return NVPTXISD::TexUnified2DS32S32;
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+ return NVPTXISD::TexUnified2DS32Float;
+ case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+ return NVPTXISD::TexUnified2DS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+ return NVPTXISD::TexUnified2DS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+ return NVPTXISD::TexUnified2DU32S32;
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+ return NVPTXISD::TexUnified2DU32Float;
+ case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+ return NVPTXISD::TexUnified2DU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+ return NVPTXISD::TexUnified2DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+ return NVPTXISD::TexUnified2DArrayFloatS32;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+ return NVPTXISD::TexUnified2DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+ return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+ return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+ return NVPTXISD::TexUnified2DArrayS32S32;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+ return NVPTXISD::TexUnified2DArrayS32Float;
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+ return NVPTXISD::TexUnified2DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+ return NVPTXISD::TexUnified2DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+ return NVPTXISD::TexUnified2DArrayU32S32;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+ return NVPTXISD::TexUnified2DArrayU32Float;
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+ return NVPTXISD::TexUnified2DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+ return NVPTXISD::TexUnified2DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+ return NVPTXISD::TexUnified3DFloatS32;
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+ return NVPTXISD::TexUnified3DFloatFloat;
+ case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+ return NVPTXISD::TexUnified3DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+ return NVPTXISD::TexUnified3DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+ return NVPTXISD::TexUnified3DS32S32;
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+ return NVPTXISD::TexUnified3DS32Float;
+ case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+ return NVPTXISD::TexUnified3DS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+ return NVPTXISD::TexUnified3DS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+ return NVPTXISD::TexUnified3DU32S32;
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+ return NVPTXISD::TexUnified3DU32Float;
+ case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+ return NVPTXISD::TexUnified3DU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+ return NVPTXISD::TexUnified3DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeFloatFloat;
+ case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeS32Float;
+ case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeU32Float;
+ case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeU32FloatLevel;
+
+ case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayS32Float;
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayU32Float;
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
+ // NOTE(review): as with the non-unified tld4 cases, v4s32/v4u32 map to S64/U64-named opcodes - confirm.
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedR2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedG2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedB2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedA2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedR2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedG2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedB2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedA2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedR2DU64Float;
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedG2DU64Float;
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedB2DU64Float;
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedA2DU64Float;
+ }
+}
+// Map an nvvm_suld_* surface intrinsic ID to its NVPTXISD opcode; returns 0 for any other intrinsic.
+static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
+ switch (Intrinsic) {
+ default:
+ return 0;
+ case Intrinsic::nvvm_suld_1d_i8_clamp:
+ return NVPTXISD::Suld1DI8Clamp;
+ case Intrinsic::nvvm_suld_1d_i16_clamp:
+ return NVPTXISD::Suld1DI16Clamp;
+ case Intrinsic::nvvm_suld_1d_i32_clamp:
+ return NVPTXISD::Suld1DI32Clamp;
+ case Intrinsic::nvvm_suld_1d_i64_clamp:
+ return NVPTXISD::Suld1DI64Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+ return NVPTXISD::Suld1DV2I8Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+ return NVPTXISD::Suld1DV2I16Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+ return NVPTXISD::Suld1DV2I32Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+ return NVPTXISD::Suld1DV2I64Clamp;
+ case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+ return NVPTXISD::Suld1DV4I8Clamp;
+ case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+ return NVPTXISD::Suld1DV4I16Clamp;
+ case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+ return NVPTXISD::Suld1DV4I32Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+ return NVPTXISD::Suld1DArrayI8Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+ return NVPTXISD::Suld1DArrayI16Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+ return NVPTXISD::Suld1DArrayI32Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+ return NVPTXISD::Suld1DArrayI64Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+ return NVPTXISD::Suld1DArrayV2I8Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+ return NVPTXISD::Suld1DArrayV2I16Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+ return NVPTXISD::Suld1DArrayV2I32Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+ return NVPTXISD::Suld1DArrayV2I64Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+ return NVPTXISD::Suld1DArrayV4I8Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+ return NVPTXISD::Suld1DArrayV4I16Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+ return NVPTXISD::Suld1DArrayV4I32Clamp;
+ case Intrinsic::nvvm_suld_2d_i8_clamp:
+ return NVPTXISD::Suld2DI8Clamp;
+ case Intrinsic::nvvm_suld_2d_i16_clamp:
+ return NVPTXISD::Suld2DI16Clamp;
+ case Intrinsic::nvvm_suld_2d_i32_clamp:
+ return NVPTXISD::Suld2DI32Clamp;
+ case Intrinsic::nvvm_suld_2d_i64_clamp:
+ return NVPTXISD::Suld2DI64Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+ return NVPTXISD::Suld2DV2I8Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+ return NVPTXISD::Suld2DV2I16Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+ return NVPTXISD::Suld2DV2I32Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+ return NVPTXISD::Suld2DV2I64Clamp;
+ case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+ return NVPTXISD::Suld2DV4I8Clamp;
+ case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+ return NVPTXISD::Suld2DV4I16Clamp;
+ case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+ return NVPTXISD::Suld2DV4I32Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+ return NVPTXISD::Suld2DArrayI8Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+ return NVPTXISD::Suld2DArrayI16Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+ return NVPTXISD::Suld2DArrayI32Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+ return NVPTXISD::Suld2DArrayI64Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+ return NVPTXISD::Suld2DArrayV2I8Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+ return NVPTXISD::Suld2DArrayV2I16Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+ return NVPTXISD::Suld2DArrayV2I32Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+ return NVPTXISD::Suld2DArrayV2I64Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+ return NVPTXISD::Suld2DArrayV4I8Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+ return NVPTXISD::Suld2DArrayV4I16Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+ return NVPTXISD::Suld2DArrayV4I32Clamp;
+ case Intrinsic::nvvm_suld_3d_i8_clamp:
+ return NVPTXISD::Suld3DI8Clamp;
+ case Intrinsic::nvvm_suld_3d_i16_clamp:
+ return NVPTXISD::Suld3DI16Clamp;
+ case Intrinsic::nvvm_suld_3d_i32_clamp:
+ return NVPTXISD::Suld3DI32Clamp;
+ case Intrinsic::nvvm_suld_3d_i64_clamp:
+ return NVPTXISD::Suld3DI64Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+ return NVPTXISD::Suld3DV2I8Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+ return NVPTXISD::Suld3DV2I16Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+ return NVPTXISD::Suld3DV2I32Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+ return NVPTXISD::Suld3DV2I64Clamp;
+ case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+ return NVPTXISD::Suld3DV4I8Clamp;
+ case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+ return NVPTXISD::Suld3DV4I16Clamp;
+ case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+ return NVPTXISD::Suld3DV4I32Clamp;
+ case Intrinsic::nvvm_suld_1d_i8_trap:
+ return NVPTXISD::Suld1DI8Trap;
+ case Intrinsic::nvvm_suld_1d_i16_trap:
+ return NVPTXISD::Suld1DI16Trap;
+ case Intrinsic::nvvm_suld_1d_i32_trap:
+ return NVPTXISD::Suld1DI32Trap;
+ case Intrinsic::nvvm_suld_1d_i64_trap:
+ return NVPTXISD::Suld1DI64Trap;
+ case Intrinsic::nvvm_suld_1d_v2i8_trap:
+ return NVPTXISD::Suld1DV2I8Trap;
+ case Intrinsic::nvvm_suld_1d_v2i16_trap:
+ return NVPTXISD::Suld1DV2I16Trap;
+ case Intrinsic::nvvm_suld_1d_v2i32_trap:
+ return NVPTXISD::Suld1DV2I32Trap;
+ case Intrinsic::nvvm_suld_1d_v2i64_trap:
+ return NVPTXISD::Suld1DV2I64Trap;
+ case Intrinsic::nvvm_suld_1d_v4i8_trap:
+ return NVPTXISD::Suld1DV4I8Trap;
+ case Intrinsic::nvvm_suld_1d_v4i16_trap:
+ return NVPTXISD::Suld1DV4I16Trap;
+ case Intrinsic::nvvm_suld_1d_v4i32_trap:
+ return NVPTXISD::Suld1DV4I32Trap;
+ case Intrinsic::nvvm_suld_1d_array_i8_trap:
+ return NVPTXISD::Suld1DArrayI8Trap;
+ case Intrinsic::nvvm_suld_1d_array_i16_trap:
+ return NVPTXISD::Suld1DArrayI16Trap;
+ case Intrinsic::nvvm_suld_1d_array_i32_trap:
+ return NVPTXISD::Suld1DArrayI32Trap;
+ case Intrinsic::nvvm_suld_1d_array_i64_trap:
+ return NVPTXISD::Suld1DArrayI64Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+ return NVPTXISD::Suld1DArrayV2I8Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+ return NVPTXISD::Suld1DArrayV2I16Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+ return NVPTXISD::Suld1DArrayV2I32Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+ return NVPTXISD::Suld1DArrayV2I64Trap;
+ case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+ return NVPTXISD::Suld1DArrayV4I8Trap;
+ case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+ return NVPTXISD::Suld1DArrayV4I16Trap;
+ case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+ return NVPTXISD::Suld1DArrayV4I32Trap;
+ case Intrinsic::nvvm_suld_2d_i8_trap:
+ return NVPTXISD::Suld2DI8Trap;
+ case Intrinsic::nvvm_suld_2d_i16_trap:
+ return NVPTXISD::Suld2DI16Trap;
+ case Intrinsic::nvvm_suld_2d_i32_trap:
+ return NVPTXISD::Suld2DI32Trap;
+ case Intrinsic::nvvm_suld_2d_i64_trap:
+ return NVPTXISD::Suld2DI64Trap;
+ case Intrinsic::nvvm_suld_2d_v2i8_trap:
+ return NVPTXISD::Suld2DV2I8Trap;
+ case Intrinsic::nvvm_suld_2d_v2i16_trap:
+ return NVPTXISD::Suld2DV2I16Trap;
+ case Intrinsic::nvvm_suld_2d_v2i32_trap:
+ return NVPTXISD::Suld2DV2I32Trap;
+ case Intrinsic::nvvm_suld_2d_v2i64_trap:
+ return NVPTXISD::Suld2DV2I64Trap;
+ case Intrinsic::nvvm_suld_2d_v4i8_trap:
+ return NVPTXISD::Suld2DV4I8Trap;
+ case Intrinsic::nvvm_suld_2d_v4i16_trap:
+ return NVPTXISD::Suld2DV4I16Trap;
+ case Intrinsic::nvvm_suld_2d_v4i32_trap:
+ return NVPTXISD::Suld2DV4I32Trap;
+ case Intrinsic::nvvm_suld_2d_array_i8_trap:
+ return NVPTXISD::Suld2DArrayI8Trap;
+ case Intrinsic::nvvm_suld_2d_array_i16_trap:
+ return NVPTXISD::Suld2DArrayI16Trap;
+ case Intrinsic::nvvm_suld_2d_array_i32_trap:
+ return NVPTXISD::Suld2DArrayI32Trap;
+ case Intrinsic::nvvm_suld_2d_array_i64_trap:
+ return NVPTXISD::Suld2DArrayI64Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+ return NVPTXISD::Suld2DArrayV2I8Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+ return NVPTXISD::Suld2DArrayV2I16Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+ return NVPTXISD::Suld2DArrayV2I32Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+ return NVPTXISD::Suld2DArrayV2I64Trap;
+ case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+ return NVPTXISD::Suld2DArrayV4I8Trap;
+ case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+ return NVPTXISD::Suld2DArrayV4I16Trap;
+ case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+ return NVPTXISD::Suld2DArrayV4I32Trap;
+ case Intrinsic::nvvm_suld_3d_i8_trap:
+ return NVPTXISD::Suld3DI8Trap;
+ case Intrinsic::nvvm_suld_3d_i16_trap:
+ return NVPTXISD::Suld3DI16Trap;
+ case Intrinsic::nvvm_suld_3d_i32_trap:
+ return NVPTXISD::Suld3DI32Trap;
+ case Intrinsic::nvvm_suld_3d_i64_trap:
+ return NVPTXISD::Suld3DI64Trap;
+ case Intrinsic::nvvm_suld_3d_v2i8_trap:
+ return NVPTXISD::Suld3DV2I8Trap;
+ case Intrinsic::nvvm_suld_3d_v2i16_trap:
+ return NVPTXISD::Suld3DV2I16Trap;
+ case Intrinsic::nvvm_suld_3d_v2i32_trap:
+ return NVPTXISD::Suld3DV2I32Trap;
+ case Intrinsic::nvvm_suld_3d_v2i64_trap:
+ return NVPTXISD::Suld3DV2I64Trap;
+ case Intrinsic::nvvm_suld_3d_v4i8_trap:
+ return NVPTXISD::Suld3DV4I8Trap;
+ case Intrinsic::nvvm_suld_3d_v4i16_trap:
+ return NVPTXISD::Suld3DV4I16Trap;
+ case Intrinsic::nvvm_suld_3d_v4i32_trap:
+ return NVPTXISD::Suld3DV4I32Trap;
+ case Intrinsic::nvvm_suld_1d_i8_zero:
+ return NVPTXISD::Suld1DI8Zero;
+ case Intrinsic::nvvm_suld_1d_i16_zero:
+ return NVPTXISD::Suld1DI16Zero;
+ case Intrinsic::nvvm_suld_1d_i32_zero:
+ return NVPTXISD::Suld1DI32Zero;
+ case Intrinsic::nvvm_suld_1d_i64_zero:
+ return NVPTXISD::Suld1DI64Zero;
+ case Intrinsic::nvvm_suld_1d_v2i8_zero:
+ return NVPTXISD::Suld1DV2I8Zero;
+ case Intrinsic::nvvm_suld_1d_v2i16_zero:
+ return NVPTXISD::Suld1DV2I16Zero;
+ case Intrinsic::nvvm_suld_1d_v2i32_zero:
+ return NVPTXISD::Suld1DV2I32Zero;
+ case Intrinsic::nvvm_suld_1d_v2i64_zero:
+ return NVPTXISD::Suld1DV2I64Zero;
+ case Intrinsic::nvvm_suld_1d_v4i8_zero:
+ return NVPTXISD::Suld1DV4I8Zero;
+ case Intrinsic::nvvm_suld_1d_v4i16_zero:
+ return NVPTXISD::Suld1DV4I16Zero;
+ case Intrinsic::nvvm_suld_1d_v4i32_zero:
+ return NVPTXISD::Suld1DV4I32Zero;
+ case Intrinsic::nvvm_suld_1d_array_i8_zero:
+ return NVPTXISD::Suld1DArrayI8Zero;
+ case Intrinsic::nvvm_suld_1d_array_i16_zero:
+ return NVPTXISD::Suld1DArrayI16Zero;
+ case Intrinsic::nvvm_suld_1d_array_i32_zero:
+ return NVPTXISD::Suld1DArrayI32Zero;
+ case Intrinsic::nvvm_suld_1d_array_i64_zero:
+ return NVPTXISD::Suld1DArrayI64Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+ return NVPTXISD::Suld1DArrayV2I8Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+ return NVPTXISD::Suld1DArrayV2I16Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+ return NVPTXISD::Suld1DArrayV2I32Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+ return NVPTXISD::Suld1DArrayV2I64Zero;
+ case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+ return NVPTXISD::Suld1DArrayV4I8Zero;
+ case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+ return NVPTXISD::Suld1DArrayV4I16Zero;
+ case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+ return NVPTXISD::Suld1DArrayV4I32Zero;
+ case Intrinsic::nvvm_suld_2d_i8_zero:
+ return NVPTXISD::Suld2DI8Zero;
+ case Intrinsic::nvvm_suld_2d_i16_zero:
+ return NVPTXISD::Suld2DI16Zero;
+ case Intrinsic::nvvm_suld_2d_i32_zero:
+ return NVPTXISD::Suld2DI32Zero;
+ case Intrinsic::nvvm_suld_2d_i64_zero:
+ return NVPTXISD::Suld2DI64Zero;
+ case Intrinsic::nvvm_suld_2d_v2i8_zero:
+ return NVPTXISD::Suld2DV2I8Zero;
+ case Intrinsic::nvvm_suld_2d_v2i16_zero:
+ return NVPTXISD::Suld2DV2I16Zero;
+ case Intrinsic::nvvm_suld_2d_v2i32_zero:
+ return NVPTXISD::Suld2DV2I32Zero;
+ case Intrinsic::nvvm_suld_2d_v2i64_zero:
+ return NVPTXISD::Suld2DV2I64Zero;
+ case Intrinsic::nvvm_suld_2d_v4i8_zero:
+ return NVPTXISD::Suld2DV4I8Zero;
+ case Intrinsic::nvvm_suld_2d_v4i16_zero:
+ return NVPTXISD::Suld2DV4I16Zero;
+ case Intrinsic::nvvm_suld_2d_v4i32_zero:
+ return NVPTXISD::Suld2DV4I32Zero;
+ case Intrinsic::nvvm_suld_2d_array_i8_zero:
+ return NVPTXISD::Suld2DArrayI8Zero;
+ case Intrinsic::nvvm_suld_2d_array_i16_zero:
+ return NVPTXISD::Suld2DArrayI16Zero;
+ case Intrinsic::nvvm_suld_2d_array_i32_zero:
+ return NVPTXISD::Suld2DArrayI32Zero;
+ case Intrinsic::nvvm_suld_2d_array_i64_zero:
+ return NVPTXISD::Suld2DArrayI64Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+ return NVPTXISD::Suld2DArrayV2I8Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+ return NVPTXISD::Suld2DArrayV2I16Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+ return NVPTXISD::Suld2DArrayV2I32Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+ return NVPTXISD::Suld2DArrayV2I64Zero;
+ case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+ return NVPTXISD::Suld2DArrayV4I8Zero;
+ case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+ return NVPTXISD::Suld2DArrayV4I16Zero;
+ case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+ return NVPTXISD::Suld2DArrayV4I32Zero;
+ case Intrinsic::nvvm_suld_3d_i8_zero:
+ return NVPTXISD::Suld3DI8Zero;
+ case Intrinsic::nvvm_suld_3d_i16_zero:
+ return NVPTXISD::Suld3DI16Zero;
+ case Intrinsic::nvvm_suld_3d_i32_zero:
+ return NVPTXISD::Suld3DI32Zero;
+ case Intrinsic::nvvm_suld_3d_i64_zero:
+ return NVPTXISD::Suld3DI64Zero;
+ case Intrinsic::nvvm_suld_3d_v2i8_zero:
+ return NVPTXISD::Suld3DV2I8Zero;
+ case Intrinsic::nvvm_suld_3d_v2i16_zero:
+ return NVPTXISD::Suld3DV2I16Zero;
+ case Intrinsic::nvvm_suld_3d_v2i32_zero:
+ return NVPTXISD::Suld3DV2I32Zero;
+ case Intrinsic::nvvm_suld_3d_v2i64_zero:
+ return NVPTXISD::Suld3DV2I64Zero;
+ case Intrinsic::nvvm_suld_3d_v4i8_zero:
+ return NVPTXISD::Suld3DV4I8Zero;
+ case Intrinsic::nvvm_suld_3d_v4i16_zero:
+ return NVPTXISD::Suld3DV4I16Zero;
+ case Intrinsic::nvvm_suld_3d_v4i32_zero:
+ return NVPTXISD::Suld3DV4I32Zero;
+ }
+}
+
+ // Fills Info with the memory-access description (opcode, memory VT, pointer,
+ // flags, alignment) of a target intrinsic call; returns false if unhandled.
+ // Original rationale: llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to
+ // be modeled as TgtMemIntrinsic because the address space information is only
+ // available in the "Value" type of the destination pointer.
+ bool NVPTXTargetLowering::getTgtMemIntrinsic(
+ IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
+ switch (Intrinsic) {
+ default:
+ return false;
+ // Atomic intrinsics: marked as both reading and writing through operand 0.
+ case Intrinsic::nvvm_atomic_load_add_f32:
+ case Intrinsic::nvvm_atomic_load_inc_32:
+ case Intrinsic::nvvm_atomic_load_dec_32:
+
+ case Intrinsic::nvvm_atomic_add_gen_f_cta:
+ case Intrinsic::nvvm_atomic_add_gen_f_sys:
+ case Intrinsic::nvvm_atomic_add_gen_i_cta:
+ case Intrinsic::nvvm_atomic_add_gen_i_sys:
+ case Intrinsic::nvvm_atomic_and_gen_i_cta:
+ case Intrinsic::nvvm_atomic_and_gen_i_sys:
+ case Intrinsic::nvvm_atomic_cas_gen_i_cta:
+ case Intrinsic::nvvm_atomic_cas_gen_i_sys:
+ case Intrinsic::nvvm_atomic_dec_gen_i_cta:
+ case Intrinsic::nvvm_atomic_dec_gen_i_sys:
+ case Intrinsic::nvvm_atomic_inc_gen_i_cta:
+ case Intrinsic::nvvm_atomic_inc_gen_i_sys:
+ case Intrinsic::nvvm_atomic_max_gen_i_cta:
+ case Intrinsic::nvvm_atomic_max_gen_i_sys:
+ case Intrinsic::nvvm_atomic_min_gen_i_cta:
+ case Intrinsic::nvvm_atomic_min_gen_i_sys:
+ case Intrinsic::nvvm_atomic_or_gen_i_cta:
+ case Intrinsic::nvvm_atomic_or_gen_i_sys:
+ case Intrinsic::nvvm_atomic_exch_gen_i_cta:
+ case Intrinsic::nvvm_atomic_exch_gen_i_sys:
+ case Intrinsic::nvvm_atomic_xor_gen_i_cta:
+ case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
+ auto &DL = I.getModule()->getDataLayout();
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = getValueType(DL, I.getType()); // memory type = call's result type
+ Info.ptrVal = I.getArgOperand(0); // the first argument is the address
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = true;
+ Info.align = 0;
+ return true;
+ }
+ // ldu loads: read-only; the alignment comes from the constant operand 1.
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p: {
+ auto &DL = I.getModule()->getDataLayout();
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
+ Info.memVT = getValueType(DL, I.getType());
+ else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
+ Info.memVT = getPointerTy(DL); // pointer variant uses the target pointer type
+ else
+ Info.memVT = getValueType(DL, I.getType()); // same as the _i branch above
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+
+ return true;
+ }
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p: {
+ auto &DL = I.getModule()->getDataLayout();
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
+ Info.memVT = getValueType(DL, I.getType());
+ else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
+ Info.memVT = getPointerTy(DL); // pointer variant uses the target pointer type
+ else
+ Info.memVT = getValueType(DL, I.getType()); // same as the _i branch above
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+
+ return true;
+ }
+ // Texture / tld4 reads producing v4f32: read-only, no pointer value recorded.
+ case Intrinsic::nvvm_tex_1d_v4f32_s32:
+ case Intrinsic::nvvm_tex_1d_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_v4f32_s32:
+ case Intrinsic::nvvm_tex_2d_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_3d_v4f32_s32:
+ case Intrinsic::nvvm_tex_3d_v4f32_f32:
+ case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
+ Info.opc = getOpcForTextureInstr(Intrinsic); // opcode chosen per intrinsic
+ Info.memVT = MVT::v4f32;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_tex_1d_v4s32_s32:
+ case Intrinsic::nvvm_tex_1d_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_v4s32_s32:
+ case Intrinsic::nvvm_tex_2d_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_3d_v4s32_s32:
+ case Intrinsic::nvvm_tex_3d_v4s32_f32:
+ case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_v4u32_f32:
+ case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_v4u32_s32:
+ case Intrinsic::nvvm_tex_1d_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_v4u32_s32:
+ case Intrinsic::nvvm_tex_2d_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_3d_v4u32_s32:
+ case Intrinsic::nvvm_tex_3d_v4u32_f32:
+ case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
+ Info.opc = getOpcForTextureInstr(Intrinsic);
+ Info.memVT = MVT::v4i32; // shared by the v4s32 and v4u32 result variants
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_suld_1d_i8_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+ case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+ case Intrinsic::nvvm_suld_2d_i8_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+ case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+ case Intrinsic::nvvm_suld_3d_i8_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+ case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+ case Intrinsic::nvvm_suld_1d_i8_trap:
+ case Intrinsic::nvvm_suld_1d_v2i8_trap:
+ case Intrinsic::nvvm_suld_1d_v4i8_trap:
+ case Intrinsic::nvvm_suld_1d_array_i8_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+ case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+ case Intrinsic::nvvm_suld_2d_i8_trap:
+ case Intrinsic::nvvm_suld_2d_v2i8_trap:
+ case Intrinsic::nvvm_suld_2d_v4i8_trap:
+ case Intrinsic::nvvm_suld_2d_array_i8_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+ case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+ case Intrinsic::nvvm_suld_3d_i8_trap:
+ case Intrinsic::nvvm_suld_3d_v2i8_trap:
+ case Intrinsic::nvvm_suld_3d_v4i8_trap:
+ case Intrinsic::nvvm_suld_1d_i8_zero:
+ case Intrinsic::nvvm_suld_1d_v2i8_zero:
+ case Intrinsic::nvvm_suld_1d_v4i8_zero:
+ case Intrinsic::nvvm_suld_1d_array_i8_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+ case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+ case Intrinsic::nvvm_suld_2d_i8_zero:
+ case Intrinsic::nvvm_suld_2d_v2i8_zero:
+ case Intrinsic::nvvm_suld_2d_v4i8_zero:
+ case Intrinsic::nvvm_suld_2d_array_i8_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+ case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+ case Intrinsic::nvvm_suld_3d_i8_zero:
+ case Intrinsic::nvvm_suld_3d_v2i8_zero:
+ case Intrinsic::nvvm_suld_3d_v4i8_zero: {
+ Info.opc = getOpcForSurfaceInstr(Intrinsic); // opcode chosen per intrinsic
+ Info.memVT = MVT::i8; // scalar, v2 and v4 variants all record the element type
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_suld_1d_i16_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+ case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+ case Intrinsic::nvvm_suld_2d_i16_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+ case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+ case Intrinsic::nvvm_suld_3d_i16_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+ case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+ case Intrinsic::nvvm_suld_1d_i16_trap:
+ case Intrinsic::nvvm_suld_1d_v2i16_trap:
+ case Intrinsic::nvvm_suld_1d_v4i16_trap:
+ case Intrinsic::nvvm_suld_1d_array_i16_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+ case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+ case Intrinsic::nvvm_suld_2d_i16_trap:
+ case Intrinsic::nvvm_suld_2d_v2i16_trap:
+ case Intrinsic::nvvm_suld_2d_v4i16_trap:
+ case Intrinsic::nvvm_suld_2d_array_i16_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+ case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+ case Intrinsic::nvvm_suld_3d_i16_trap:
+ case Intrinsic::nvvm_suld_3d_v2i16_trap:
+ case Intrinsic::nvvm_suld_3d_v4i16_trap:
+ case Intrinsic::nvvm_suld_1d_i16_zero:
+ case Intrinsic::nvvm_suld_1d_v2i16_zero:
+ case Intrinsic::nvvm_suld_1d_v4i16_zero:
+ case Intrinsic::nvvm_suld_1d_array_i16_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+ case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+ case Intrinsic::nvvm_suld_2d_i16_zero:
+ case Intrinsic::nvvm_suld_2d_v2i16_zero:
+ case Intrinsic::nvvm_suld_2d_v4i16_zero:
+ case Intrinsic::nvvm_suld_2d_array_i16_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+ case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+ case Intrinsic::nvvm_suld_3d_i16_zero:
+ case Intrinsic::nvvm_suld_3d_v2i16_zero:
+ case Intrinsic::nvvm_suld_3d_v4i16_zero: {
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i16; // element type for scalar, v2 and v4 variants
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_suld_1d_i32_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+ case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+ case Intrinsic::nvvm_suld_2d_i32_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+ case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+ case Intrinsic::nvvm_suld_3d_i32_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+ case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+ case Intrinsic::nvvm_suld_1d_i32_trap:
+ case Intrinsic::nvvm_suld_1d_v2i32_trap:
+ case Intrinsic::nvvm_suld_1d_v4i32_trap:
+ case Intrinsic::nvvm_suld_1d_array_i32_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+ case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+ case Intrinsic::nvvm_suld_2d_i32_trap:
+ case Intrinsic::nvvm_suld_2d_v2i32_trap:
+ case Intrinsic::nvvm_suld_2d_v4i32_trap:
+ case Intrinsic::nvvm_suld_2d_array_i32_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+ case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+ case Intrinsic::nvvm_suld_3d_i32_trap:
+ case Intrinsic::nvvm_suld_3d_v2i32_trap:
+ case Intrinsic::nvvm_suld_3d_v4i32_trap:
+ case Intrinsic::nvvm_suld_1d_i32_zero:
+ case Intrinsic::nvvm_suld_1d_v2i32_zero:
+ case Intrinsic::nvvm_suld_1d_v4i32_zero:
+ case Intrinsic::nvvm_suld_1d_array_i32_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+ case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+ case Intrinsic::nvvm_suld_2d_i32_zero:
+ case Intrinsic::nvvm_suld_2d_v2i32_zero:
+ case Intrinsic::nvvm_suld_2d_v4i32_zero:
+ case Intrinsic::nvvm_suld_2d_array_i32_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+ case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+ case Intrinsic::nvvm_suld_3d_i32_zero:
+ case Intrinsic::nvvm_suld_3d_v2i32_zero:
+ case Intrinsic::nvvm_suld_3d_v4i32_zero: {
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i32; // element type for scalar, v2 and v4 variants
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_suld_1d_i64_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+ case Intrinsic::nvvm_suld_2d_i64_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+ case Intrinsic::nvvm_suld_3d_i64_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+ case Intrinsic::nvvm_suld_1d_i64_trap:
+ case Intrinsic::nvvm_suld_1d_v2i64_trap:
+ case Intrinsic::nvvm_suld_1d_array_i64_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+ case Intrinsic::nvvm_suld_2d_i64_trap:
+ case Intrinsic::nvvm_suld_2d_v2i64_trap:
+ case Intrinsic::nvvm_suld_2d_array_i64_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+ case Intrinsic::nvvm_suld_3d_i64_trap:
+ case Intrinsic::nvvm_suld_3d_v2i64_trap:
+ case Intrinsic::nvvm_suld_1d_i64_zero:
+ case Intrinsic::nvvm_suld_1d_v2i64_zero:
+ case Intrinsic::nvvm_suld_1d_array_i64_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+ case Intrinsic::nvvm_suld_2d_i64_zero:
+ case Intrinsic::nvvm_suld_2d_v2i64_zero:
+ case Intrinsic::nvvm_suld_2d_array_i64_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+ case Intrinsic::nvvm_suld_3d_i64_zero:
+ case Intrinsic::nvvm_suld_3d_v2i64_zero: {
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i64; // element type for scalar and v2 variants
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ }
+ return false; // not reached: every path through the switch above returns
+ }
+
+ /// isLegalAddressingMode - Decide whether the addressing mode described by
+ /// AM can be used directly by a load/store on this target. Callers such as
+ /// LoopStrengthReduce and CodeGenPrepare use the answer to steer their
+ /// address computations.
+ ///
+ /// AM models BaseGV + BaseOffs + BaseReg + Scale*ScaleReg. The accepted
+ /// shapes are:
+ ///   - [avar]
+ ///   - [areg]
+ ///   - [areg+immoff]
+ ///   - [immAddr]
+ bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                 const AddrMode &AM, Type *Ty,
+                                                 unsigned AS) const {
+   // A global base must stand alone: no offset, no base register, no scale.
+   if (AM.BaseGV)
+     return AM.BaseOffs == 0 && !AM.HasBaseReg && AM.Scale == 0;
+
+   // Without a scaled register, "r", "r+i" and "i" are all acceptable.
+   if (AM.Scale == 0)
+     return true;
+
+   // A scale of one is just a second register; it is only fine when it is the
+   // sole register ("r+i"), because "r+r" and "r+r+i" are not supported.
+   if (AM.Scale == 1)
+     return !AM.HasBaseReg;
+
+   // Any other scale factor is rejected.
+   return false;
+ }
+
+//===----------------------------------------------------------------------===//
+// NVPTX Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+ /// getConstraintType - Classify an inline-asm constraint string. The
+ /// single-letter constraints listed below are reported as register-class
+ /// constraints; everything else is deferred to the generic TargetLowering
+ /// implementation.
+ NVPTXTargetLowering::ConstraintType
+ NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
+   // Only one-character constraints are handled here.
+   if (Constraint.size() != 1)
+     return TargetLowering::getConstraintType(Constraint);
+
+   switch (Constraint[0]) {
+   case 'b':
+   case 'r':
+   case 'h':
+   case 'c':
+   case 'l':
+   case 'f':
+   case 'd':
+   case '0':
+   case 'N':
+     return C_RegisterClass;
+   default:
+     return TargetLowering::getConstraintType(Constraint);
+   }
+ }
+
+ /// Map a single-letter inline-asm constraint to the NVPTX register class it
+ /// selects. The register number in the returned pair is always 0 for the
+ /// letters handled here; any other constraint is passed on to the generic
+ /// TargetLowering implementation.
+ std::pair<unsigned, const TargetRegisterClass *>
+ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                   StringRef Constraint,
+                                                   MVT VT) const {
+   const TargetRegisterClass *RC = nullptr;
+   if (Constraint.size() == 1) {
+     switch (Constraint[0]) {
+     case 'b':
+       RC = &NVPTX::Int1RegsRegClass;
+       break;
+     case 'c':
+     case 'h':
+       RC = &NVPTX::Int16RegsRegClass;
+       break;
+     case 'r':
+       RC = &NVPTX::Int32RegsRegClass;
+       break;
+     case 'l':
+     case 'N':
+       RC = &NVPTX::Int64RegsRegClass;
+       break;
+     case 'f':
+       RC = &NVPTX::Float32RegsRegClass;
+       break;
+     case 'd':
+       RC = &NVPTX::Float64RegsRegClass;
+       break;
+     }
+   }
+
+   if (RC)
+     return std::make_pair(0U, RC);
+   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ }
+
+//===----------------------------------------------------------------------===//
+// NVPTX DAG Combining
+//===----------------------------------------------------------------------===//
+
+ /// Decide whether multiply and add may be contracted into a fused
+ /// multiply-add for the given function. The checks are applied in priority
+ /// order: explicit command-line option, optimization level, target options,
+ /// then the function's "unsafe-fp-math" attribute.
+ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
+                                    CodeGenOpt::Level OptLevel) const {
+   // An explicit command-line setting always wins.
+   if (FMAContractLevelOpt.getNumOccurrences() > 0)
+     return FMAContractLevelOpt > 0;
+
+   // Never contract when the code is not being optimized.
+   if (OptLevel == 0)
+     return false;
+
+   // TargetOptions flags that explicitly permit fusion.
+   const TargetOptions &Opts = MF.getTarget().Options;
+   if (Opts.AllowFPOpFusion == FPOpFusion::Fast || Opts.UnsafeFPMath)
+     return true;
+
+   // Finally, accept unsafe-fp-math=true attached to the function (as Clang
+   // emits it). Without any such indication, fusion is assumed not allowed.
+   const Function *Fn = MF.getFunction();
+   return Fn->hasFnAttribute("unsafe-fp-math") &&
+          Fn->getFnAttribute("unsafe-fp-math").getValueAsString() == "true";
+ }
+
+ /// PerformADDCombineWithOperands - Attempt the add-related DAG combines with
+ /// the operands taken in the order (N0, N1). PerformADDCombine calls this
+ /// first with the node's own operand order and, if nothing was folded, once
+ /// more with the operands swapped. Returns an empty SDValue when no fold
+ /// applies.
+ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                              TargetLowering::DAGCombinerInfo &DCI,
+                                              const NVPTXSubtarget &Subtarget,
+                                              CodeGenOpt::Level OptLevel) {
+   SelectionDAG &DAG = DCI.DAG;
+
+   // Vector types are not handled.
+   const EVT VT = N0.getValueType();
+   if (VT.isVector())
+     return SDValue();
+
+   const unsigned MulOpc = N0.getOpcode();
+
+   // fold (add (mul a, b), c) -> (mad a, b, c)
+   if (MulOpc == ISD::MUL) {
+     assert(VT.isInteger());
+     // An integer multiply-add costs as much as a multiply but more than an
+     // add, so fusing only pays off when the add is the multiply's only user.
+     const bool Profitable = OptLevel != CodeGenOpt::None && VT == MVT::i32 &&
+                             N0.getNode()->hasOneUse();
+     if (!Profitable)
+       return SDValue();
+
+     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+                        N0.getOperand(0), N0.getOperand(1), N1);
+   }
+
+   // From here on only (fadd (fmul a, b), c) on f32/f64 is of interest.
+   if (MulOpc != ISD::FMUL || (VT != MVT::f32 && VT != MVT::f64))
+     return SDValue();
+
+   const auto *TLI = static_cast<const NVPTXTargetLowering *>(
+       &DAG.getTargetLoweringInfo());
+   if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
+     return SDValue();
+
+   // Fuse only when the multiply has fewer than 5 users. A user that is not
+   // an FADD cannot be turned into an FMA, so the multiply would have to stay
+   // anyway; and with more than 4 users, even if all are adds, fusing every
+   // one of them increases register pressure.
+   int TotalUses = 0;
+   int NonAddUses = 0;
+   for (const SDNode *User : N0.getNode()->uses()) {
+     ++TotalUses;
+     if (User->getOpcode() != ISD::FADD)
+       ++NonAddUses;
+   }
+   if (TotalUses >= 5)
+     return SDValue();
+
+   if (NonAddUses != 0) {
+     // Simple register-pressure heuristic: the IR-order difference stands in
+     // for the distance between the multiply (def) and this add (use); the
+     // fold is abandoned when that difference is below the threshold.
+     const int AddOrder = N->getIROrder();
+     const int MulOrder = N0.getNode()->getIROrder();
+     if (AddOrder - MulOrder < 500)
+       return SDValue();
+
+     // Require at least one FMUL operand to be live beyond N, which
+     // guarantees the FMA will not increase register pressure at N. A
+     // constant operand counts as live; otherwise look for a user of the
+     // operand whose IR order is later than N's.
+     const SDNode *MulLHS = N0.getOperand(0).getNode();
+     const SDNode *MulRHS = N0.getOperand(1).getNode();
+
+     auto HasUserAfterAdd = [AddOrder](const SDNode *Operand) {
+       for (SDNode::use_iterator UI = Operand->use_begin(),
+                                 UE = Operand->use_end();
+            UI != UE; ++UI) {
+         SDNode *User = *UI;
+         const int UserOrder = User->getIROrder();
+         if (UserOrder > AddOrder)
+           return true;
+       }
+       return false;
+     };
+
+     const bool OperandLive =
+         isa<ConstantSDNode>(MulLHS) || isa<ConstantSDNode>(MulRHS) ||
+         HasUserAfterAdd(MulLHS) || HasUserAfterAdd(MulRHS);
+     if (!OperandLive)
+       return SDValue();
+   }
+
+   return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                      N0.getOperand(0), N0.getOperand(1), N1);
+ }
+
+ /// PerformADDCombine - Target-specific DAG combine entry point for add
+ /// nodes. Tries the folds with the node's operands as written and, if that
+ /// yields nothing, with the two operands exchanged.
+ static SDValue PerformADDCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const NVPTXSubtarget &Subtarget,
+                                  CodeGenOpt::Level OptLevel) {
+   const SDValue LHS = N->getOperand(0);
+   const SDValue RHS = N->getOperand(1);
+
+   // Natural operand order first.
+   SDValue Folded =
+       PerformADDCombineWithOperands(N, LHS, RHS, DCI, Subtarget, OptLevel);
+
+   // Fall back to the commuted order when nothing was folded.
+   if (!Folded)
+     Folded =
+         PerformADDCombineWithOperands(N, RHS, LHS, DCI, Subtarget, OptLevel);
+
+   return Folded;
+ }
+
+ /// Remove the redundant AND that the type legalizer leaves behind after a
+ /// vector load of i8 values. Such a load becomes a zextload into i16
+ /// registers, is optionally ANY_EXTENDed (if the target type is integer),
+ /// and then has its high 8 bits masked off. Because the load is turned into
+ /// a target-specific node, the generic combiner cannot drop the AND, so it
+ /// is done here. Always returns an empty SDValue; the replacement itself is
+ /// performed through DCI.CombineTo.
+ static SDValue PerformANDCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+   SDValue Val = N->getOperand(0);
+   SDValue Mask = N->getOperand(1);
+
+   // Put the constant, if any, on the mask side.
+   if (isa<ConstantSDNode>(Val))
+     std::swap(Val, Mask);
+
+   // The usual chain is zextload -> IMOV16rr -> ANY_EXTEND -> and; peel the
+   // extend and the move off to reach the load.
+   SDValue AExt;
+   if (Val.getOpcode() == ISD::ANY_EXTEND) {
+     AExt = Val;
+     Val = Val->getOperand(0);
+   }
+   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr)
+     Val = Val->getOperand(0);
+
+   const unsigned LoadOpc = Val->getOpcode();
+   if (LoadOpc != NVPTXISD::LoadV2 && LoadOpc != NVPTXISD::LoadV4)
+     return SDValue();
+
+   // The mask must be a constant ...
+   ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+   if (!MaskCnst)
+     return SDValue();
+
+   // ... that chops off exactly the top 8 bits.
+   const uint64_t MaskVal = MaskCnst->getZExtValue();
+   if (MaskVal != 0xff)
+     return SDValue();
+
+   // The load has to be a memory node (not a MemSDNode?!? -> give up).
+   MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
+   if (!Mem)
+     return SDValue();
+
+   // Only the i8 vector cases are handled.
+   const EVT MemVT = Mem->getMemoryVT();
+   if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8)
+     return SDValue();
+
+   // The extension kind is stored in the load's last operand. If the load is
+   // for some reason a sextload, the AND is required to clear the high 8
+   // bits and must stay.
+   unsigned ExtType =
+       cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands() - 1))
+           ->getZExtValue();
+   if (ExtType == ISD::SEXTLOAD)
+     return SDValue();
+
+   // If an ANY_EXTEND was peeled off above, re-insert it as a zext.
+   const bool HadAnyExt = AExt.getNode() != nullptr;
+   if (HadAnyExt)
+     Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+                           AExt.getValueType(), Val);
+
+   // The AND is unnecessary at this point: replace it with the load.
+   DCI.CombineTo(N, Val, HadAnyExt);
+   return SDValue();
+ }
+
+ /// Recognise select-of-compare patterns that compute an integer minimum or
+ /// maximum and rewrite them to the PTX-specific min/max intrinsics so the
+ /// hardware support is used. Returns an empty SDValue when the pattern does
+ /// not match.
+ static SDValue PerformSELECTCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+   const SDValue Cond = N->getOperand(0);
+   if (Cond.getOpcode() != ISD::SETCC)
+     return SDValue();
+
+   const SDValue CmpLHS = Cond.getOperand(0);
+   const SDValue CmpRHS = Cond.getOperand(1);
+   const SDValue TrueVal = N->getOperand(1);
+   const SDValue FalseVal = N->getOperand(2);
+
+   // The selected values must be exactly the compared values, in either
+   // order.
+   const bool SameOrder = CmpLHS == TrueVal && CmpRHS == FalseVal;
+   const bool SwappedOrder = CmpLHS == FalseVal && CmpRHS == TrueVal;
+   if (!SameOrder && !SwappedOrder)
+     return SDValue();
+
+   const EVT VT = N->getValueType(0);
+   if (VT != MVT::i32 && VT != MVT::i64)
+     return SDValue();
+
+   // Work out which compare operand is the larger one when the condition
+   // holds.
+   const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+   bool LHSIsLarger;
+   switch (CC) {
+   case ISD::SETULT:
+   case ISD::SETULE:
+   case ISD::SETLT:
+   case ISD::SETLE:
+     LHSIsLarger = false;
+     break;
+
+   case ISD::SETGT:
+   case ISD::SETGE:
+   case ISD::SETUGT:
+   case ISD::SETUGE:
+     LHSIsLarger = true;
+     break;
+
+   default:
+     return SDValue();
+   }
+
+   // Selecting the larger operand on "true" makes this a max, otherwise min.
+   const bool IsMax = (LHSIsLarger ? CmpLHS : CmpRHS) == TrueVal;
+   const bool IsSigned = ISD::isSignedIntSetCC(CC);
+
+   unsigned IntrinsicId;
+   if (IsSigned) {
+     if (VT == MVT::i32) {
+       IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
+     } else {
+       assert(VT == MVT::i64);
+       IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
+     }
+   } else {
+     if (VT == MVT::i32) {
+       IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
+     } else {
+       assert(VT == MVT::i64);
+       IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
+     }
+   }
+
+   SDLoc DL(N);
+   return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                          DCI.DAG.getConstant(IntrinsicId, DL, VT), CmpLHS,
+                          CmpRHS);
+ }
+
+ /// PerformREMCombine - When the numerator of an SREM/UREM already feeds a
+ /// division of matching signedness by the same denominator, express the
+ /// remainder as Num - (Num / Den) * Den so the quotient can be shared.
+ ///
+ /// Returns an empty SDValue when no such division exists or when \p OptLevel
+ /// is below CodeGenOpt::Default.
+ static SDValue PerformREMCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  CodeGenOpt::Level OptLevel) {
+   assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
+
+   // Don't do anything at less than -O2.
+   if (OptLevel < CodeGenOpt::Default)
+     return SDValue();
+
+   const unsigned DivOpc = (N->getOpcode() == ISD::SREM) ? ISD::SDIV : ISD::UDIV;
+   const SDValue Num = N->getOperand(0);
+   const SDValue Den = N->getOperand(1);
+
+   for (const SDNode *User : Num->uses()) {
+     // Only a division of exactly Num by Den qualifies.
+     if (User->getOpcode() != DivOpc)
+       continue;
+     if (!(User->getOperand(0) == Num) || !(User->getOperand(1) == Den))
+       continue;
+
+     // Num % Den -> Num - (Num / Den) * Den
+     SelectionDAG &DAG = DCI.DAG;
+     SDLoc DL(N);
+     EVT VT = N->getValueType(0);
+     SDValue Quotient = DAG.getNode(DivOpc, DL, VT, Num, Den);
+     SDValue Product = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
+     return DAG.getNode(ISD::SUB, DL, VT, Num, Product);
+   }
+
+   return SDValue();
+ }
+
+ enum OperandSignedness { // How a mul.wide candidate operand was extended.
+ Signed = 0, // Operand comes from a sign extension.
+ Unsigned, // Operand comes from a zero extension.
+ Unknown // Signedness could not be determined.
+ };
+
+ /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+ /// that can be demoted to \p OptSize bits without loss of information. The
+ /// signedness of the operand, if determinable, is placed in \p S.
+ ///
+ /// \param Op      Candidate multiply operand.
+ /// \param OptSize Width, in bits, the operand would be narrowed to.
+ /// \param S       Set to Signed or Unsigned when true is returned, and to
+ ///                Unknown otherwise.
+ /// \returns true if \p Op is an extension from a type of at most \p OptSize
+ ///          bits.
+ static bool IsMulWideOperandDemotable(SDValue Op,
+                                       unsigned OptSize,
+                                       OperandSignedness &S) {
+   S = Unknown;
+
+   EVT OrigVT;
+   OperandSignedness Sign;
+   switch (Op.getOpcode()) {
+   case ISD::SIGN_EXTEND:
+     OrigVT = Op.getOperand(0).getValueType();
+     Sign = Signed;
+     break;
+   case ISD::SIGN_EXTEND_INREG:
+     // Operand 0 of SIGN_EXTEND_INREG has the same (wide) type as the result,
+     // so inspecting it could never satisfy the size check below. The type
+     // being extended from is carried by the VTSDNode in operand 1.
+     OrigVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+     Sign = Signed;
+     break;
+   case ISD::ZERO_EXTEND:
+     OrigVT = Op.getOperand(0).getValueType();
+     Sign = Unsigned;
+     break;
+   default:
+     return false;
+   }
+
+   // The pre-extension value must fit in the demoted width.
+   if (OrigVT.getSizeInBits() > OptSize)
+     return false;
+
+   S = Sign;
+   return true;
+ }
+
+ /// AreMulWideOperandsDemotable - Decide whether both multiply operands fit in
+ /// \p OptSize bits. \p LHS must be a demotable extension node; \p RHS may be
+ /// either a constant (callers put a constant operand on the right) or a
+ /// demotable extension node with the same signedness as \p LHS.
+ ///
+ /// \p IsSigned is written as soon as the signedness of \p LHS is known, so it
+ /// may be set even when the function goes on to return false.
+ static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+                                         unsigned OptSize,
+                                         bool &IsSigned) {
+   // The LHS must be demotable and must tell us the signedness.
+   OperandSignedness LHSKind;
+   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSKind) || LHSKind == Unknown)
+     return false;
+
+   IsSigned = (LHSKind == Signed);
+
+   // A constant RHS only has to be representable in OptSize bits under the
+   // signedness dictated by the LHS.
+   if (const auto *RHSConst = dyn_cast<ConstantSDNode>(RHS)) {
+     const APInt &Val = RHSConst->getAPIntValue();
+     return IsSigned ? Val.isSignedIntN(OptSize) : Val.isIntN(OptSize);
+   }
+
+   // Otherwise the RHS must be demotable too, with matching signedness.
+   OperandSignedness RHSKind;
+   return IsMulWideOperandDemotable(RHS, OptSize, RHSKind) &&
+          LHSKind == RHSKind;
+ }
+
+ /// TryMULWIDECombine - Try to turn an M-bit multiply whose operands both fit
+ /// in M/2 bits into an M/2-bit multiply producing an M-bit result (mul.wide).
+ /// Handles ISD::MUL nodes as well as ISD::SHL nodes with a constant shift
+ /// amount, which are treated as a multiply by a power of two.
+ ///
+ /// Returns the new MUL_WIDE_SIGNED / MUL_WIDE_UNSIGNED node, or an empty
+ /// SDValue when the transform does not apply.
+ static SDValue TryMULWIDECombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+   const EVT MulType = N->getValueType(0);
+   if (MulType != MVT::i32 && MulType != MVT::i64)
+     return SDValue();
+
+   SDLoc DL(N);
+   const unsigned OptSize = MulType.getSizeInBits() >> 1;
+   SDValue LHS = N->getOperand(0);
+   SDValue RHS = N->getOperand(1);
+
+   // Canonicalize the multiply so the constant (if any) is on the right.
+   if (N->getOpcode() == ISD::MUL && isa<ConstantSDNode>(LHS))
+     std::swap(LHS, RHS);
+
+   // For a shift, replace the shift amount by the equivalent multiplier.
+   if (N->getOpcode() == ISD::SHL) {
+     const auto *ShiftConst = dyn_cast<ConstantSDNode>(RHS);
+     if (!ShiftConst)
+       return SDValue();
+
+     const APInt &ShiftAmt = ShiftConst->getAPIntValue();
+     const unsigned BitWidth = MulType.getSizeInBits();
+     if (!(ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)))
+       return SDValue();
+
+     RHS = DCI.DAG.getConstant(APInt(BitWidth, 1) << ShiftAmt, DL, MulType);
+   }
+
+   // Verify that our operands are demotable.
+   bool IsSigned;
+   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, IsSigned))
+     return SDValue();
+
+   const EVT DemotedVT = (MulType == MVT::i32) ? MVT::i16 : MVT::i32;
+
+   // Truncate the operands to the correct size. Note that these are just for
+   // type consistency and will (likely) be eliminated in later phases.
+   SDValue NarrowLHS = DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
+   SDValue NarrowRHS = DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
+
+   const unsigned Opc =
+       IsSigned ? NVPTXISD::MUL_WIDE_SIGNED : NVPTXISD::MUL_WIDE_UNSIGNED;
+   return DCI.DAG.getNode(Opc, DL, MulType, NarrowLHS, NarrowRHS);
+ }
+
+ /// PerformMULCombine - PTX-specific DAG combines for MUL nodes. The only
+ /// combine attempted is mul.wide formation, and only when \p OptLevel > 0.
+ /// Returns an empty SDValue when nothing was changed.
+ static SDValue PerformMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  CodeGenOpt::Level OptLevel) {
+   // An unsuccessful attempt already yields an empty SDValue, so the result
+   // can be forwarded as-is.
+   return OptLevel > 0 ? TryMULWIDECombine(N, DCI) : SDValue();
+ }
+
+ /// PerformSHLCombine - PTX-specific DAG combines for SHL nodes. The only
+ /// combine attempted is mul.wide formation, and only when \p OptLevel > 0.
+ /// Returns an empty SDValue when nothing was changed.
+ static SDValue PerformSHLCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  CodeGenOpt::Level OptLevel) {
+   // An unsuccessful attempt already yields an empty SDValue, so the result
+   // can be forwarded as-is.
+   return OptLevel > 0 ? TryMULWIDECombine(N, DCI) : SDValue();
+ }
+
+ /// Dispatch \p N to the NVPTX combine routine for its opcode. Opcodes without
+ /// a dedicated routine produce an empty SDValue.
+ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+   const CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
+   const unsigned Opc = N->getOpcode();
+
+   switch (Opc) {
+   case ISD::ADD:
+   case ISD::FADD:
+     return PerformADDCombine(N, DCI, STI, OptLevel);
+
+   case ISD::MUL:
+     return PerformMULCombine(N, DCI, OptLevel);
+
+   case ISD::SHL:
+     return PerformSHLCombine(N, DCI, OptLevel);
+
+   case ISD::AND:
+     return PerformANDCombine(N, DCI);
+
+   case ISD::SELECT:
+     return PerformSELECTCombine(N, DCI);
+
+   case ISD::SREM:
+   case ISD::UREM:
+     return PerformREMCombine(N, DCI, OptLevel);
+
+   default:
+     break;
+   }
+
+   // No target-specific combine for this opcode.
+   return SDValue();
+ }
+
+ /// ReplaceLoadVector - Convert a vector load into a multi-output scalar load
+ /// (NVPTXISD::LoadV2 / LoadV4) followed by a BUILD_VECTOR of its results.
+ ///
+ /// On success the rebuilt vector and the new chain are appended to
+ /// \p Results. When the load cannot be handled (unsupported vector type,
+ /// insufficient alignment, unsupported element count) \p Results is left
+ /// untouched.
+ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &Results) {
+   const EVT VecVT = N->getValueType(0);
+   SDLoc DL(N);
+
+   assert(VecVT.isVector() && "Vector load must have vector type");
+
+   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+   // legal. We can (and should) split that into 2 loads of <2 x double> here
+   // but I'm leaving that as a TODO for now.
+   assert(VecVT.isSimple() && "Can only handle simple types");
+   switch (VecVT.getSimpleVT().SimpleTy) {
+   case MVT::v2i8:
+   case MVT::v2i16:
+   case MVT::v2i32:
+   case MVT::v2i64:
+   case MVT::v2f32:
+   case MVT::v2f64:
+   case MVT::v4i8:
+   case MVT::v4i16:
+   case MVT::v4i32:
+   case MVT::v4f32:
+     // This is a "native" vector type
+     break;
+   default:
+     return;
+   }
+
+   LoadSDNode *Load = cast<LoadSDNode>(N);
+
+   // An under-aligned load is left alone so that it gets scalarized. Note that
+   // smaller vector loads may still be emitted: a <4 x float> load with an
+   // alignment of 8 is rejected here, but the legalizer retries with
+   // 2 x <2 x float>, which passes this check.
+   auto &Layout = DAG.getDataLayout();
+   const unsigned PrefAlign =
+       Layout.getPrefTypeAlignment(VecVT.getTypeForEVT(*DAG.getContext()));
+   if (Load->getAlignment() < PrefAlign)
+     return;
+
+   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+   // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+   // loaded type to i16 and propagate the "real" type as the memory type.
+   const EVT ScalarVT = VecVT.getVectorElementType();
+   const bool NeedTrunc = ScalarVT.getSizeInBits() < 16;
+   const EVT EltVT = NeedTrunc ? EVT(MVT::i16) : ScalarVT;
+   const unsigned NumElts = VecVT.getVectorNumElements();
+
+   unsigned Opcode = 0;
+   SDVTList LdResVTs;
+   if (NumElts == 2) {
+     Opcode = NVPTXISD::LoadV2;
+     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+   } else if (NumElts == 4) {
+     Opcode = NVPTXISD::LoadV4;
+     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+     LdResVTs = DAG.getVTList(ListVTs);
+   } else {
+     return;
+   }
+
+   // Start from the load's own operands, then append the extension kind: the
+   // select routine does not have access to the LoadSDNode instance.
+   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
+   OtherOps.push_back(DAG.getIntPtrConstant(Load->getExtensionType(), DL));
+
+   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                           Load->getMemoryVT(),
+                                           Load->getMemOperand());
+
+   // Collect the per-element results, narrowing them back if they were widened.
+   SmallVector<SDValue, 4> ScalarRes;
+   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+     SDValue Elt = NewLD.getValue(Idx);
+     if (NeedTrunc)
+       Elt = DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, Elt);
+     ScalarRes.push_back(Elt);
+   }
+
+   // The chain is the result that follows the element values.
+   SDValue LoadChain = NewLD.getValue(NumElts);
+   SDValue BuildVec = DAG.getBuildVector(VecVT, DL, ScalarRes);
+
+   Results.push_back(BuildVec);
+   Results.push_back(LoadChain);
+ }
+
+ /// ReplaceINTRINSIC_W_CHAIN - Custom result legalization for the nvvm
+ /// ldg/ldu global-load intrinsics.
+ ///
+ /// Vector results with 2 or 4 elements become an LDGV2/LDGV4/LDUV2/LDUV4 node
+ /// whose scalar results are recombined with a BUILD_VECTOR. A scalar result
+ /// (expected to be i8) is reloaded as i16 with an i8 memory type and then
+ /// truncated. The replacement value and chain are appended to \p Results; any
+ /// other intrinsic or element count leaves \p Results untouched.
+ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &Results) {
+   SDValue Chain = N->getOperand(0);
+   SDValue Intrin = N->getOperand(1);
+   SDLoc DL(N);
+
+   // Classify the intrinsic once: LDG, LDU, or not handled here at all.
+   const unsigned IntrinNo =
+       cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
+   bool IsLDG;
+   switch (IntrinNo) {
+   case Intrinsic::nvvm_ldg_global_i:
+   case Intrinsic::nvvm_ldg_global_f:
+   case Intrinsic::nvvm_ldg_global_p:
+     IsLDG = true;
+     break;
+   case Intrinsic::nvvm_ldu_global_i:
+   case Intrinsic::nvvm_ldu_global_f:
+   case Intrinsic::nvvm_ldu_global_p:
+     IsLDG = false;
+     break;
+   default:
+     return;
+   }
+
+   const EVT ResVT = N->getValueType(0);
+
+   if (!ResVT.isVector()) {
+     // i8 LDG/LDU
+     assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
+            "Custom handling of non-i8 ldu/ldg?");
+
+     // Just copy all operands as-is
+     SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
+
+     // Force output to i16
+     SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
+
+     MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+     // We make sure the memory type is i8, which will be used during isel
+     // to select the proper instruction.
+     SDValue NewLD =
+         DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
+                                 MVT::i8, MemSD->getMemOperand());
+
+     Results.push_back(
+         DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, NewLD.getValue(0)));
+     Results.push_back(NewLD.getValue(1));
+     return;
+   }
+
+   // Vector LDG/LDU
+
+   // Since LDU/LDG are target nodes, we cannot rely on DAG type
+   // legalization.
+   // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+   // loaded type to i16 and propagate the "real" type as the memory type.
+   const unsigned NumElts = ResVT.getVectorNumElements();
+   const EVT ScalarVT = ResVT.getVectorElementType();
+   const bool NeedTrunc = ScalarVT.getSizeInBits() < 16;
+   const EVT EltVT = NeedTrunc ? EVT(MVT::i16) : ScalarVT;
+
+   unsigned Opcode = 0;
+   SDVTList LdResVTs;
+   if (NumElts == 2) {
+     Opcode = IsLDG ? NVPTXISD::LDGV2 : NVPTXISD::LDUV2;
+     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+   } else if (NumElts == 4) {
+     Opcode = IsLDG ? NVPTXISD::LDGV4 : NVPTXISD::LDUV4;
+     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+     LdResVTs = DAG.getVTList(ListVTs);
+   } else {
+     return;
+   }
+
+   // Operands of the new node: the chain, then everything after operand 1
+   // (the intrinsic ID, which is dropped).
+   SmallVector<SDValue, 8> OtherOps;
+   OtherOps.push_back(Chain);
+   OtherOps.append(N->op_begin() + 2, N->op_end());
+
+   MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                           MemSD->getMemoryVT(),
+                                           MemSD->getMemOperand());
+
+   // Collect the per-element results, narrowing them back if they were widened.
+   SmallVector<SDValue, 4> ScalarRes;
+   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+     SDValue Elt = NewLD.getValue(Idx);
+     if (NeedTrunc)
+       Elt = DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, Elt);
+     ScalarRes.push_back(Elt);
+   }
+
+   // The chain is the result that follows the element values.
+   SDValue LoadChain = NewLD.getValue(NumElts);
+   SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
+
+   Results.push_back(BuildVec);
+   Results.push_back(LoadChain);
+ }
+
+ /// Custom result legalization entry point. Loads and chained intrinsics are
+ /// forwarded to their helpers, which append any replacement values to
+ /// \p Results; every other opcode is a fatal error.
+ void NVPTXTargetLowering::ReplaceNodeResults(
+     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+   const unsigned Opc = N->getOpcode();
+
+   if (Opc == ISD::LOAD) {
+     ReplaceLoadVector(N, DAG, Results);
+     return;
+   }
+
+   if (Opc == ISD::INTRINSIC_W_CHAIN) {
+     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
+     return;
+   }
+
+   report_fatal_error("Unhandled custom legalization");
+ }
+
+ // Out-of-line definitions that pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
+ void NVPTXSection::anchor() {} // Intentionally empty.
+
+ NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { // Deletes each section member below, cast to NVPTXSection.
+ delete static_cast<NVPTXSection *>(TextSection);
+ delete static_cast<NVPTXSection *>(DataSection);
+ delete static_cast<NVPTXSection *>(BSSSection);
+ delete static_cast<NVPTXSection *>(ReadOnlySection);
+
+ delete static_cast<NVPTXSection *>(StaticCtorSection);
+ delete static_cast<NVPTXSection *>(StaticDtorSection);
+ delete static_cast<NVPTXSection *>(LSDASection);
+ delete static_cast<NVPTXSection *>(EHFrameSection);
+ delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
+ delete static_cast<NVPTXSection *>(DwarfInfoSection);
+ delete static_cast<NVPTXSection *>(DwarfLineSection);
+ delete static_cast<NVPTXSection *>(DwarfFrameSection);
+ delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
+ delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection); // The only member cast to a const-qualified pointer.
+ delete static_cast<NVPTXSection *>(DwarfStrSection);
+ delete static_cast<NVPTXSection *>(DwarfLocSection);
+ delete static_cast<NVPTXSection *>(DwarfARangesSection);
+ delete static_cast<NVPTXSection *>(DwarfRangesSection);
+ delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
+ }
+
+ MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( // Section choice for a global object.
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ return getDataSection(); // Always the data section; GO, Kind and TM are not consulted.
+ }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
new file mode 100644
index 000000000000..e433aed7781b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -0,0 +1,547 @@
+//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXISELLOWERING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXISELLOWERING_H
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+ namespace NVPTXISD { // NVPTX-specific SelectionDAG node opcodes.
+ enum NodeType : unsigned {
+ // Start the numbering from where ISD NodeType finishes.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ Wrapper,
+ CALL,
+ RET_FLAG,
+ LOAD_PARAM,
+ DeclareParam,
+ DeclareScalarParam,
+ DeclareRetParam,
+ DeclareRet,
+ DeclareScalarRet,
+ PrintCall,
+ PrintConvergentCall,
+ PrintCallUni,
+ PrintConvergentCallUni,
+ CallArgBegin,
+ CallArg,
+ LastCallArg,
+ CallArgEnd,
+ CallVoid,
+ CallVal,
+ CallSymbol,
+ Prototype,
+ MoveParam,
+ PseudoUseParam,
+ RETURN,
+ CallSeqBegin,
+ CallSeqEnd,
+ CallPrototype,
+ FUN_SHFL_CLAMP,
+ FUN_SHFR_CLAMP,
+ MUL_WIDE_SIGNED,
+ MUL_WIDE_UNSIGNED,
+ IMAD,
+ Dummy,
+
+ LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, // Enumerators from here on are numbered starting at ISD::FIRST_TARGET_MEMORY_OPCODE.
+ LoadV4,
+ LDGV2, // LDG.v2
+ LDGV4, // LDG.v4
+ LDUV2, // LDU.v2
+ LDUV4, // LDU.v4
+ StoreV2,
+ StoreV4,
+ LoadParam,
+ LoadParamV2,
+ LoadParamV4,
+ StoreParam,
+ StoreParamV2,
+ StoreParamV4,
+ StoreParamS32, // to sext and store a <32bit value, not used currently
+ StoreParamU32, // to zext and store a <32bit value, not used currently
+ StoreRetval,
+ StoreRetvalV2,
+ StoreRetvalV4,
+
+ // Texture intrinsics
+ Tex1DFloatS32,
+ Tex1DFloatFloat,
+ Tex1DFloatFloatLevel,
+ Tex1DFloatFloatGrad,
+ Tex1DS32S32,
+ Tex1DS32Float,
+ Tex1DS32FloatLevel,
+ Tex1DS32FloatGrad,
+ Tex1DU32S32,
+ Tex1DU32Float,
+ Tex1DU32FloatLevel,
+ Tex1DU32FloatGrad,
+ Tex1DArrayFloatS32,
+ Tex1DArrayFloatFloat,
+ Tex1DArrayFloatFloatLevel,
+ Tex1DArrayFloatFloatGrad,
+ Tex1DArrayS32S32,
+ Tex1DArrayS32Float,
+ Tex1DArrayS32FloatLevel,
+ Tex1DArrayS32FloatGrad,
+ Tex1DArrayU32S32,
+ Tex1DArrayU32Float,
+ Tex1DArrayU32FloatLevel,
+ Tex1DArrayU32FloatGrad,
+ Tex2DFloatS32,
+ Tex2DFloatFloat,
+ Tex2DFloatFloatLevel,
+ Tex2DFloatFloatGrad,
+ Tex2DS32S32,
+ Tex2DS32Float,
+ Tex2DS32FloatLevel,
+ Tex2DS32FloatGrad,
+ Tex2DU32S32,
+ Tex2DU32Float,
+ Tex2DU32FloatLevel,
+ Tex2DU32FloatGrad,
+ Tex2DArrayFloatS32,
+ Tex2DArrayFloatFloat,
+ Tex2DArrayFloatFloatLevel,
+ Tex2DArrayFloatFloatGrad,
+ Tex2DArrayS32S32,
+ Tex2DArrayS32Float,
+ Tex2DArrayS32FloatLevel,
+ Tex2DArrayS32FloatGrad,
+ Tex2DArrayU32S32,
+ Tex2DArrayU32Float,
+ Tex2DArrayU32FloatLevel,
+ Tex2DArrayU32FloatGrad,
+ Tex3DFloatS32,
+ Tex3DFloatFloat,
+ Tex3DFloatFloatLevel,
+ Tex3DFloatFloatGrad,
+ Tex3DS32S32,
+ Tex3DS32Float,
+ Tex3DS32FloatLevel,
+ Tex3DS32FloatGrad,
+ Tex3DU32S32,
+ Tex3DU32Float,
+ Tex3DU32FloatLevel,
+ Tex3DU32FloatGrad,
+ TexCubeFloatFloat,
+ TexCubeFloatFloatLevel,
+ TexCubeS32Float,
+ TexCubeS32FloatLevel,
+ TexCubeU32Float,
+ TexCubeU32FloatLevel,
+ TexCubeArrayFloatFloat,
+ TexCubeArrayFloatFloatLevel,
+ TexCubeArrayS32Float,
+ TexCubeArrayS32FloatLevel,
+ TexCubeArrayU32Float,
+ TexCubeArrayU32FloatLevel,
+ Tld4R2DFloatFloat,
+ Tld4G2DFloatFloat,
+ Tld4B2DFloatFloat,
+ Tld4A2DFloatFloat,
+ Tld4R2DS64Float,
+ Tld4G2DS64Float,
+ Tld4B2DS64Float,
+ Tld4A2DS64Float,
+ Tld4R2DU64Float,
+ Tld4G2DU64Float,
+ Tld4B2DU64Float,
+ Tld4A2DU64Float,
+ TexUnified1DFloatS32,
+ TexUnified1DFloatFloat,
+ TexUnified1DFloatFloatLevel,
+ TexUnified1DFloatFloatGrad,
+ TexUnified1DS32S32,
+ TexUnified1DS32Float,
+ TexUnified1DS32FloatLevel,
+ TexUnified1DS32FloatGrad,
+ TexUnified1DU32S32,
+ TexUnified1DU32Float,
+ TexUnified1DU32FloatLevel,
+ TexUnified1DU32FloatGrad,
+ TexUnified1DArrayFloatS32,
+ TexUnified1DArrayFloatFloat,
+ TexUnified1DArrayFloatFloatLevel,
+ TexUnified1DArrayFloatFloatGrad,
+ TexUnified1DArrayS32S32,
+ TexUnified1DArrayS32Float,
+ TexUnified1DArrayS32FloatLevel,
+ TexUnified1DArrayS32FloatGrad,
+ TexUnified1DArrayU32S32,
+ TexUnified1DArrayU32Float,
+ TexUnified1DArrayU32FloatLevel,
+ TexUnified1DArrayU32FloatGrad,
+ TexUnified2DFloatS32,
+ TexUnified2DFloatFloat,
+ TexUnified2DFloatFloatLevel,
+ TexUnified2DFloatFloatGrad,
+ TexUnified2DS32S32,
+ TexUnified2DS32Float,
+ TexUnified2DS32FloatLevel,
+ TexUnified2DS32FloatGrad,
+ TexUnified2DU32S32,
+ TexUnified2DU32Float,
+ TexUnified2DU32FloatLevel,
+ TexUnified2DU32FloatGrad,
+ TexUnified2DArrayFloatS32,
+ TexUnified2DArrayFloatFloat,
+ TexUnified2DArrayFloatFloatLevel,
+ TexUnified2DArrayFloatFloatGrad,
+ TexUnified2DArrayS32S32,
+ TexUnified2DArrayS32Float,
+ TexUnified2DArrayS32FloatLevel,
+ TexUnified2DArrayS32FloatGrad,
+ TexUnified2DArrayU32S32,
+ TexUnified2DArrayU32Float,
+ TexUnified2DArrayU32FloatLevel,
+ TexUnified2DArrayU32FloatGrad,
+ TexUnified3DFloatS32,
+ TexUnified3DFloatFloat,
+ TexUnified3DFloatFloatLevel,
+ TexUnified3DFloatFloatGrad,
+ TexUnified3DS32S32,
+ TexUnified3DS32Float,
+ TexUnified3DS32FloatLevel,
+ TexUnified3DS32FloatGrad,
+ TexUnified3DU32S32,
+ TexUnified3DU32Float,
+ TexUnified3DU32FloatLevel,
+ TexUnified3DU32FloatGrad,
+ TexUnifiedCubeFloatFloat,
+ TexUnifiedCubeFloatFloatLevel,
+ TexUnifiedCubeS32Float,
+ TexUnifiedCubeS32FloatLevel,
+ TexUnifiedCubeU32Float,
+ TexUnifiedCubeU32FloatLevel,
+ TexUnifiedCubeArrayFloatFloat,
+ TexUnifiedCubeArrayFloatFloatLevel,
+ TexUnifiedCubeArrayS32Float,
+ TexUnifiedCubeArrayS32FloatLevel,
+ TexUnifiedCubeArrayU32Float,
+ TexUnifiedCubeArrayU32FloatLevel,
+ Tld4UnifiedR2DFloatFloat,
+ Tld4UnifiedG2DFloatFloat,
+ Tld4UnifiedB2DFloatFloat,
+ Tld4UnifiedA2DFloatFloat,
+ Tld4UnifiedR2DS64Float,
+ Tld4UnifiedG2DS64Float,
+ Tld4UnifiedB2DS64Float,
+ Tld4UnifiedA2DS64Float,
+ Tld4UnifiedR2DU64Float,
+ Tld4UnifiedG2DU64Float,
+ Tld4UnifiedB2DU64Float,
+ Tld4UnifiedA2DU64Float,
+
+ // Surface intrinsics
+ Suld1DI8Clamp,
+ Suld1DI16Clamp,
+ Suld1DI32Clamp,
+ Suld1DI64Clamp,
+ Suld1DV2I8Clamp,
+ Suld1DV2I16Clamp,
+ Suld1DV2I32Clamp,
+ Suld1DV2I64Clamp,
+ Suld1DV4I8Clamp,
+ Suld1DV4I16Clamp,
+ Suld1DV4I32Clamp,
+
+ Suld1DArrayI8Clamp,
+ Suld1DArrayI16Clamp,
+ Suld1DArrayI32Clamp,
+ Suld1DArrayI64Clamp,
+ Suld1DArrayV2I8Clamp,
+ Suld1DArrayV2I16Clamp,
+ Suld1DArrayV2I32Clamp,
+ Suld1DArrayV2I64Clamp,
+ Suld1DArrayV4I8Clamp,
+ Suld1DArrayV4I16Clamp,
+ Suld1DArrayV4I32Clamp,
+
+ Suld2DI8Clamp,
+ Suld2DI16Clamp,
+ Suld2DI32Clamp,
+ Suld2DI64Clamp,
+ Suld2DV2I8Clamp,
+ Suld2DV2I16Clamp,
+ Suld2DV2I32Clamp,
+ Suld2DV2I64Clamp,
+ Suld2DV4I8Clamp,
+ Suld2DV4I16Clamp,
+ Suld2DV4I32Clamp,
+
+ Suld2DArrayI8Clamp,
+ Suld2DArrayI16Clamp,
+ Suld2DArrayI32Clamp,
+ Suld2DArrayI64Clamp,
+ Suld2DArrayV2I8Clamp,
+ Suld2DArrayV2I16Clamp,
+ Suld2DArrayV2I32Clamp,
+ Suld2DArrayV2I64Clamp,
+ Suld2DArrayV4I8Clamp,
+ Suld2DArrayV4I16Clamp,
+ Suld2DArrayV4I32Clamp,
+
+ Suld3DI8Clamp,
+ Suld3DI16Clamp,
+ Suld3DI32Clamp,
+ Suld3DI64Clamp,
+ Suld3DV2I8Clamp,
+ Suld3DV2I16Clamp,
+ Suld3DV2I32Clamp,
+ Suld3DV2I64Clamp,
+ Suld3DV4I8Clamp,
+ Suld3DV4I16Clamp,
+ Suld3DV4I32Clamp,
+
+ Suld1DI8Trap,
+ Suld1DI16Trap,
+ Suld1DI32Trap,
+ Suld1DI64Trap,
+ Suld1DV2I8Trap,
+ Suld1DV2I16Trap,
+ Suld1DV2I32Trap,
+ Suld1DV2I64Trap,
+ Suld1DV4I8Trap,
+ Suld1DV4I16Trap,
+ Suld1DV4I32Trap,
+
+ Suld1DArrayI8Trap,
+ Suld1DArrayI16Trap,
+ Suld1DArrayI32Trap,
+ Suld1DArrayI64Trap,
+ Suld1DArrayV2I8Trap,
+ Suld1DArrayV2I16Trap,
+ Suld1DArrayV2I32Trap,
+ Suld1DArrayV2I64Trap,
+ Suld1DArrayV4I8Trap,
+ Suld1DArrayV4I16Trap,
+ Suld1DArrayV4I32Trap,
+
+ Suld2DI8Trap,
+ Suld2DI16Trap,
+ Suld2DI32Trap,
+ Suld2DI64Trap,
+ Suld2DV2I8Trap,
+ Suld2DV2I16Trap,
+ Suld2DV2I32Trap,
+ Suld2DV2I64Trap,
+ Suld2DV4I8Trap,
+ Suld2DV4I16Trap,
+ Suld2DV4I32Trap,
+
+ Suld2DArrayI8Trap,
+ Suld2DArrayI16Trap,
+ Suld2DArrayI32Trap,
+ Suld2DArrayI64Trap,
+ Suld2DArrayV2I8Trap,
+ Suld2DArrayV2I16Trap,
+ Suld2DArrayV2I32Trap,
+ Suld2DArrayV2I64Trap,
+ Suld2DArrayV4I8Trap,
+ Suld2DArrayV4I16Trap,
+ Suld2DArrayV4I32Trap,
+
+ Suld3DI8Trap,
+ Suld3DI16Trap,
+ Suld3DI32Trap,
+ Suld3DI64Trap,
+ Suld3DV2I8Trap,
+ Suld3DV2I16Trap,
+ Suld3DV2I32Trap,
+ Suld3DV2I64Trap,
+ Suld3DV4I8Trap,
+ Suld3DV4I16Trap,
+ Suld3DV4I32Trap,
+
+ Suld1DI8Zero,
+ Suld1DI16Zero,
+ Suld1DI32Zero,
+ Suld1DI64Zero,
+ Suld1DV2I8Zero,
+ Suld1DV2I16Zero,
+ Suld1DV2I32Zero,
+ Suld1DV2I64Zero,
+ Suld1DV4I8Zero,
+ Suld1DV4I16Zero,
+ Suld1DV4I32Zero,
+
+ Suld1DArrayI8Zero,
+ Suld1DArrayI16Zero,
+ Suld1DArrayI32Zero,
+ Suld1DArrayI64Zero,
+ Suld1DArrayV2I8Zero,
+ Suld1DArrayV2I16Zero,
+ Suld1DArrayV2I32Zero,
+ Suld1DArrayV2I64Zero,
+ Suld1DArrayV4I8Zero,
+ Suld1DArrayV4I16Zero,
+ Suld1DArrayV4I32Zero,
+
+ Suld2DI8Zero,
+ Suld2DI16Zero,
+ Suld2DI32Zero,
+ Suld2DI64Zero,
+ Suld2DV2I8Zero,
+ Suld2DV2I16Zero,
+ Suld2DV2I32Zero,
+ Suld2DV2I64Zero,
+ Suld2DV4I8Zero,
+ Suld2DV4I16Zero,
+ Suld2DV4I32Zero,
+
+ Suld2DArrayI8Zero,
+ Suld2DArrayI16Zero,
+ Suld2DArrayI32Zero,
+ Suld2DArrayI64Zero,
+ Suld2DArrayV2I8Zero,
+ Suld2DArrayV2I16Zero,
+ Suld2DArrayV2I32Zero,
+ Suld2DArrayV2I64Zero,
+ Suld2DArrayV4I8Zero,
+ Suld2DArrayV4I16Zero,
+ Suld2DArrayV4I32Zero,
+
+ Suld3DI8Zero,
+ Suld3DI16Zero,
+ Suld3DI32Zero,
+ Suld3DI64Zero,
+ Suld3DV2I8Zero,
+ Suld3DV2I16Zero,
+ Suld3DV2I32Zero,
+ Suld3DV2I64Zero,
+ Suld3DV4I8Zero,
+ Suld3DV4I16Zero,
+ Suld3DV4I32Zero
+ };
+ } // end namespace NVPTXISD
+
+class NVPTXSubtarget;
+
+//===--------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===--------------------------------------------------------------------===//
+class NVPTXTargetLowering : public TargetLowering {
+public:
+ explicit NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+ const NVPTXSubtarget &STI);
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ unsigned Intrinsic) const override;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type
+ /// Used to guide target specific optimizations, like loop strength
+ /// reduction (LoopStrengthReduce.cpp) and memory optimization for
+ /// address mode (CodeGenPrepare.cpp)
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ bool isTruncateFree(Type *SrcTy, Type *DstTy) const override {
+ // Truncating 64-bit to 32-bit is free in SASS.
+ if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
+ return false;
+ return SrcTy->getPrimitiveSizeInBits() == 64 &&
+ DstTy->getPrimitiveSizeInBits() == 32;
+ }
+
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+ EVT VT) const override {
+ if (VT.isVector())
+ return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
+ return MVT::i1;
+ }
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &,
+ const SmallVectorImpl<ISD::OutputArg> &,
+ unsigned retAlignment,
+ const ImmutableCallSite *CS) const;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+ SelectionDAG &DAG) const override;
+
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ const NVPTXTargetMachine *nvTM;
+
+ // PTX always uses 32-bit shift amounts
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
+
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const override;
+
+ bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+
+ bool isFMAFasterThanFMulAndFAdd(EVT) const override { return true; }
+
+ bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+
+private:
+ const NVPTXSubtarget &STI; // cache the subtarget here
+ SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
+
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerSelect(SDValue Op, SelectionDAG &DAG) const;
+
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
+ Type *Ty, unsigned Idx,
+ const DataLayout &DL) const;
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
new file mode 100644
index 000000000000..8d00bbb5e9c2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -0,0 +1,181 @@
+//===-- NVPTXImageOptimizer.cpp - Image optimization pass -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR-level optimizations of image access code,
+// including:
+//
+// 1. Eliminate istypep intrinsics when image access qualifier is known
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXImageOptimizer : public FunctionPass { // Folds nvvm.istypep.* calls whose answer is statically known.
+private:
+ static char ID;
+ SmallVector<Instruction*, 4> InstrToDelete; // Filled by replaceWith; erased at the end of runOnFunction.
+
+public:
+ NVPTXImageOptimizer();
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ bool replaceIsTypePSampler(Instruction &I); // True if I was folded to a constant.
+ bool replaceIsTypePSurface(Instruction &I); // True if I was folded to a constant.
+ bool replaceIsTypePTexture(Instruction &I); // True if I was folded to a constant.
+ Value *cleanupValue(Value *V); // Looks through extractvalue instructions.
+ void replaceWith(Instruction *From, ConstantInt *To); // Replaces all uses of From with To and queues From for erasure.
+};
+} // end anonymous namespace
+
+char NVPTXImageOptimizer::ID = 0; // Handed to the FunctionPass base constructor below.
+
+NVPTXImageOptimizer::NVPTXImageOptimizer()
+ : FunctionPass(ID) {}
+
+bool NVPTXImageOptimizer::runOnFunction(Function &F) { // Returns true if any istypep call was folded.
+ if (skipFunction(F))
+ return false;
+
+ bool Changed = false;
+ InstrToDelete.clear(); // Drop anything left over from a previously processed function.
+
+ // Look for call instructions in the function
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;
+ ++BI) {
+ for (BasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+ I != E; ++I) {
+ Instruction &Instr = *I;
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Function *CalledF = CI->getCalledFunction();
+ if (CalledF && CalledF->isIntrinsic()) {
+ // This is an intrinsic function call, check if it's an istypep
+ switch (CalledF->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::nvvm_istypep_sampler:
+ Changed |= replaceIsTypePSampler(Instr);
+ break;
+ case Intrinsic::nvvm_istypep_surface:
+ Changed |= replaceIsTypePSurface(Instr);
+ break;
+ case Intrinsic::nvvm_istypep_texture:
+ Changed |= replaceIsTypePTexture(Instr);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // Delete any istypep instances we replaced in the IR (deferred until the scan above is done)
+ for (unsigned i = 0, e = InstrToDelete.size(); i != e; ++i)
+ InstrToDelete[i]->eraseFromParent();
+
+ return Changed;
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) { // Returns true if I was folded to a constant.
+ Value *TexHandle = cleanupValue(I.getOperand(0)); // Operand 0 with extractvalue wrappers removed.
+ if (isSampler(*TexHandle)) {
+ // This is an OpenCL sampler, so it must be a samplerref
+ replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+ return true;
+ } else if (isImageWriteOnly(*TexHandle) ||
+ isImageReadWrite(*TexHandle) ||
+ isImageReadOnly(*TexHandle)) {
+ // This is an OpenCL image, so it cannot be a samplerref
+ replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+ return true;
+ } else {
+ // The image type is unknown, so we cannot eliminate the intrinsic
+ return false;
+ }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSurface(Instruction &I) { // Returns true if I was folded to a constant.
+ Value *TexHandle = cleanupValue(I.getOperand(0)); // Operand 0 with extractvalue wrappers removed.
+ if (isImageReadWrite(*TexHandle) ||
+ isImageWriteOnly(*TexHandle)) {
+ // This is an OpenCL write-only/read-write image, so it must be a surfref
+ replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+ return true;
+ } else if (isImageReadOnly(*TexHandle) ||
+ isSampler(*TexHandle)) {
+ // This is an OpenCL read-only image or a sampler, so it cannot be
+ // a surfref
+ replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+ return true;
+ } else {
+ // The image type is unknown, so we cannot eliminate the intrinsic
+ return false;
+ }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) { // Returns true if I was folded to a constant.
+ Value *TexHandle = cleanupValue(I.getOperand(0)); // Operand 0 with extractvalue wrappers removed.
+ if (isImageReadOnly(*TexHandle)) {
+ // This is an OpenCL read-only image, so it must be a texref
+ replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+ return true;
+ } else if (isImageWriteOnly(*TexHandle) ||
+ isImageReadWrite(*TexHandle) ||
+ isSampler(*TexHandle)) {
+ // This is an OpenCL read-write/write-only image or a sampler, so it
+ // cannot be a texref
+ replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+ return true;
+ } else {
+ // The image type is unknown, so we cannot eliminate the intrinsic
+ return false;
+ }
+}
+
+void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
+ // We implement "poor man's DCE" here to make sure any code that is no longer
+ // live is actually unreachable and can be trivially eliminated by the
+ // unreachable block elimination pass.
+ for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE; ++UI) {
+ if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) { // NOTE(review): *UI is a Use; confirm this cast tests the user (UI->getUser()) rather than the used value From, otherwise this body never runs.
+ if (BI->isUnconditional()) continue;
+ BasicBlock *Dest;
+ if (To->isZero())
+ // Get false block
+ Dest = BI->getSuccessor(1);
+ else
+ // Get true block
+ Dest = BI->getSuccessor(0);
+ BranchInst::Create(Dest, BI); // Unconditional branch to Dest, created relative to BI; BI itself is queued for erasure.
+ InstrToDelete.push_back(BI);
+ }
+ }
+ From->replaceAllUsesWith(To);
+ InstrToDelete.push_back(From); // Erased later by runOnFunction.
+}
+
+Value *NVPTXImageOptimizer::cleanupValue(Value *V) { // Returns the value found beneath any chain of extractvalue instructions.
+ if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+ return cleanupValue(EVI->getAggregateOperand()); // Recurse into the aggregate being indexed.
+ }
+ return V; // Not an extractvalue: V is returned unchanged.
+}
+
+FunctionPass *llvm::createNVPTXImageOptimizerPass() { // Factory: returns a newly allocated NVPTXImageOptimizer.
+ return new NVPTXImageOptimizer();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
new file mode 100644
index 000000000000..f4940c937a2d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
@@ -0,0 +1,583 @@
+//===-- NVPTXInferAddressSpaces.cpp - --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (the so-called generic address space) for other instructions to use.
+//
+// For example, Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to their users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As the result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, and getelementptr) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-infer-addrspace"
+
+#include "NVPTX.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+namespace {
+const unsigned ADDRESS_SPACE_UNINITIALIZED = (unsigned)-1; // Sentinel for "nothing inferred yet" (top of the lattice in the file header).
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+
+/// \brief Function pass that rewrites generic address expressions to their inferred specific address spaces.
+class NVPTXInferAddressSpaces: public FunctionPass {
+public:
+ static char ID;
+
+ NVPTXInferAddressSpaces() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace);
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(const std::vector<Value *> &Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace);
+
+ // Changes the generic address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all generic address expressions in the use-def graph of function F.
+ bool
+ rewriteWithNewAddressSpaces(const std::vector<Value *> &Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace,
+ Function *F);
+};
+} // end anonymous namespace
+
+char NVPTXInferAddressSpaces::ID = 0; // Handed to the FunctionPass base constructor by the class's inline constructor.
+
+namespace llvm {
+void initializeNVPTXInferAddressSpacesPass(PassRegistry &); // Forward declaration; NOTE(review): presumably defined by INITIALIZE_PASS below - confirm.
+} // end namespace llvm
+INITIALIZE_PASS(NVPTXInferAddressSpaces, "nvptx-infer-addrspace",
+ "Infer address spaces",
+ false, false) // NOTE(review): the two flags are presumably CFG-only and is-analysis - confirm against PassSupport.h.
+
+// Returns true if V is an address expression, i.e. an Operator whose opcode is
+// phi, bitcast, addrspacecast, or getelementptr; returns false otherwise.
+// TODO: Currently, only these four opcodes are considered.
+static bool isAddressExpression(const Value &V) {
+ if (!isa<Operator>(V))
+ return false;
+
+ switch (cast<Operator>(V).getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Returns the pointer operands of V: every incoming value for a phi, and
+// operand 0 for a bitcast, addrspacecast, or getelementptr.
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+ assert(isAddressExpression(V));
+ const Operator& Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
+// If V is a generic address expression that is not yet in Visited, inserts it
+// into Visited and pushes (V, false) onto PostorderStack. V must be a pointer.
+static void appendsGenericAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) {
+ assert(V->getType()->isPointerTy());
+ if (isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() ==
+ AddressSpace::ADDRESS_SPACE_GENERIC) {
+ if (Visited->insert(V).second) // .second is true only when V was newly inserted.
+ PostorderStack->push_back(std::make_pair(V, false));
+ }
+}
+
+// Returns the generic address expressions in F that feed the pointer operand
+// of some load or store (directly or transitively), ordered in postorder.
+static std::vector<Value *> collectGenericAddressExpressions(Function &F) {
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ std::vector<std::pair<Value*, bool>> PostorderStack; // .second: have this entry's operands been pushed yet?
+ // The set of visited expressions.
+ DenseSet<Value*> Visited;
+ // We only explore address expressions that are reachable from loads and
+ // stores for now because we aim at generating faster loads and stores.
+ for (Instruction &I : instructions(F)) {
+ if (isa<LoadInst>(I)) {
+ appendsGenericAddressExpressionToPostorderStack(
+ I.getOperand(0), &PostorderStack, &Visited); // Operand 0 of a load is treated as its address.
+ } else if (isa<StoreInst>(I)) {
+ appendsGenericAddressExpressionToPostorderStack(
+ I.getOperand(1), &PostorderStack, &Visited); // Operand 1 of a store is treated as its address.
+ }
+ }
+
+ std::vector<Value *> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ // If the operands of the expression on the top are already explored,
+ // adds that expression to the resultant postorder.
+ if (PostorderStack.back().second) {
+ Postorder.push_back(PostorderStack.back().first);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().second = true; // Set before pushing: back() refers to a different entry afterwards.
+ for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) {
+ appendsGenericAddressExpressionToPostorderStack(
+ PtrOperand, &PostorderStack, &Visited);
+ }
+ }
+ return Postorder;
+}
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// records OperandUse in UndefUsesToFix and returns an undef placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+ const Use &OperandUse, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Value *Operand = OperandUse.get();
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+ return NewOperand;
+
+ UndefUsesToFix->push_back(&OperandUse); // Remembered so the caller can patch the placeholder later.
+ return UndefValue::get(
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace)); // Same pointee type as Operand, in NewAddrSpace.
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that caller can fix them later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
+static Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Type *NewPtrType =
+ I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace); // I's pointee type, in NewAddrSpace.
+
+ if (I->getOpcode() == Instruction::AddrSpaceCast) {
+ Value *Src = I->getOperand(0);
+ // Because `I` is generic, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space, according
+ // to our algorithm.
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType); // Same address space, different pointee type.
+ return Src; // Types already match: reuse the cast's source, no new instruction.
+ }
+
+ // Computes the converted pointer operands.
+ SmallVector<Value *, 4> NewPointerOperands;
+ for (const Use &OperandUse : I->operands()) {
+ if (!OperandUse.get()->getType()->isPointerTy())
+ NewPointerOperands.push_back(nullptr); // Placeholder so indices match I's operand numbers.
+ else
+ NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ return new BitCastInst(NewPointerOperands[0], NewPtrType);
+ case Instruction::PHI: {
+ assert(I->getType()->isPointerTy());
+ PHINode *PHI = cast<PHINode>(I);
+ PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+ for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+ unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+ NewPHI->addIncoming(NewPointerOperands[OperandNo],
+ PHI->getIncomingBlock(Index)); // Incoming blocks are carried over unchanged.
+ }
+ return NewPHI;
+ }
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP->getSourceElementType(), NewPointerOperands[0],
+ SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end())); // Index operands are reused as-is.
+ NewGEP->setIsInBounds(GEP->isInBounds()); // Preserve the inbounds flag.
+ return NewGEP;
+ }
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace. No undef placeholders are needed in this case.
+static Value *cloneConstantExprWithNewAddressSpace(
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace) {
+ Type *TargetType =
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace); // CE's pointee type, in NewAddrSpace.
+
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Because CE is generic, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space according
+ // to our algorithm.
+ assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+ NewAddrSpace);
+ return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+ }
+
+ // Computes the operands of the new constant expression.
+ SmallVector<Constant *, 4> NewOperands;
+ for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+ Constant *Operand = CE->getOperand(Index);
+ // If the address space of `Operand` needs to be modified, the new operand
+ // with the new address space should already be in ValueWithNewAddrSpace
+ // because (1) the constant expressions we consider (i.e. addrspacecast,
+ // bitcast, and getelementptr) do not incur cycles in the data flow graph
+ // and (2) this function is called on constant expressions in postorder.
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ } else {
+ // Otherwise, reuses the old operand.
+ NewOperands.push_back(Operand);
+ }
+ }
+
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Needs to specify the source type while constructing a getelementptr
+ // constant expression.
+ return CE->getWithOperands(
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType()); // Pointee type of the (possibly converted) base pointer.
+ }
+
+ return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every generic address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+static Value *
+cloneValueWithNewAddressSpace(Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ // All values in Postorder are generic address expressions.
+ assert(isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() ==
+ AddressSpace::ADDRESS_SPACE_GENERIC);
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Value *NewV = cloneInstructionWithNewAddressSpace(
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+ if (NewI->getParent() == nullptr) { // Only a clone that is not yet in any block is inserted and renamed.
+ NewI->insertBefore(I);
+ NewI->takeName(I);
+ }
+ }
+ return NewV;
+ }
+
+ return cloneConstantExprWithNewAddressSpace(
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace); // Non-instruction address expressions must be constant expressions.
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
+static unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) {
+ if (AS1 == AddressSpace::ADDRESS_SPACE_GENERIC ||
+ AS2 == AddressSpace::ADDRESS_SPACE_GENERIC)
+ return AddressSpace::ADDRESS_SPACE_GENERIC; // Generic absorbs everything.
+
+ if (AS1 == ADDRESS_SPACE_UNINITIALIZED)
+ return AS2; // Uninitialized contributes nothing to the join.
+ if (AS2 == ADDRESS_SPACE_UNINITIALIZED)
+ return AS1;
+
+ // The join of two different specific address spaces is generic.
+ return AS1 == AS2 ? AS1 : (unsigned)AddressSpace::ADDRESS_SPACE_GENERIC;
+}
+
+bool NVPTXInferAddressSpaces::runOnFunction(Function &F) { // Returns whatever rewriteWithNewAddressSpaces reports (false if the pass is skipped).
+ if (skipFunction(F))
+ return false;
+
+ // Collects all generic address expressions in postorder.
+ std::vector<Value *> Postorder = collectGenericAddressExpressions(F);
+
+ // Runs a data-flow analysis to refine the address spaces of every expression
+ // in Postorder.
+ ValueToAddrSpaceMapTy InferredAddrSpace;
+ inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+ // Changes the address spaces of the generic address expressions that are
+ // inferred to point to a specific address space.
+ return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
+}
+
+void NVPTXInferAddressSpaces::inferAddressSpaces(
+ const std::vector<Value *> &Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) {
+ SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+ // Initially, all expressions are in the uninitialized address space.
+ for (Value *V : Postorder)
+ (*InferredAddrSpace)[V] = ADDRESS_SPACE_UNINITIALIZED;
+
+ while (!Worklist.empty()) { // Runs until no value's inferred address space changes any more.
+ Value* V = Worklist.pop_back_val();
+
+ // Tries to update the address space of the stack top according to the
+ // address spaces of its operands.
+ DEBUG(dbgs() << "Updating the address space of\n"
+ << " " << *V << "\n");
+ Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+ if (!NewAS.hasValue())
+ continue; // Unchanged, so V's users need no re-evaluation.
+ // If any updates are made, grabs its users to the worklist because
+ // their address spaces can also be possibly updated.
+ DEBUG(dbgs() << " to " << NewAS.getValue() << "\n");
+ (*InferredAddrSpace)[V] = NewAS.getValue();
+
+ for (Value *User : V->users()) {
+ // Skip if User is already in the worklist.
+ if (Worklist.count(User))
+ continue;
+
+ auto Pos = InferredAddrSpace->find(User);
+ // Our algorithm only updates the address spaces of generic address
+ // expressions, which are those in InferredAddrSpace.
+ if (Pos == InferredAddrSpace->end())
+ continue;
+
+ // Function updateAddressSpace moves the address space down a lattice
+ // path. Therefore, nothing to do if User is already inferred as
+ // generic (the bottom element in the lattice).
+ if (Pos->second == AddressSpace::ADDRESS_SPACE_GENERIC)
+ continue;
+
+ Worklist.insert(User);
+ }
+ }
+}
+
+Optional<unsigned> NVPTXInferAddressSpaces::updateAddressSpace(
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) {
+ assert(InferredAddrSpace.count(&V));
+
+ // The new inferred address space equals the join of the address spaces
+ // of all its pointer operands.
+ unsigned NewAS = ADDRESS_SPACE_UNINITIALIZED;
+ for (Value *PtrOperand : getPointerOperands(V)) {
+ unsigned OperandAS;
+ if (InferredAddrSpace.count(PtrOperand))
+ OperandAS = InferredAddrSpace.lookup(PtrOperand); // Tracked operand: use its current inferred space.
+ else
+ OperandAS = PtrOperand->getType()->getPointerAddressSpace(); // Untracked operand: use the space of its type.
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ // join(generic, *) = generic. So we can break if NewAS is already generic.
+ if (NewAS == AddressSpace::ADDRESS_SPACE_GENERIC)
+ break;
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != AddressSpace::ADDRESS_SPACE_GENERIC); // inferAddressSpaces does not re-queue users already inferred as generic.
+ if (OldAS == NewAS)
+ return None; // No change.
+ return NewAS;
+}
+
+bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
+ const std::vector<Value *> &Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) {
+ // For each address expression to be modified, creates a clone of it with its
+ // pointer operands converted to the new address space. Since the pointer
+ // operands are converted, the clone is naturally in the new address space by
+ // construction.
+ ValueToValueMapTy ValueWithNewAddrSpace;
+ SmallVector<const Use *, 32> UndefUsesToFix;
+ for (Value* V : Postorder) {
+ unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+ if (V->getType()->getPointerAddressSpace() != NewAddrSpace) { // Only values whose inferred space differs from their type's are cloned.
+ ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ }
+ }
+
+ if (ValueWithNewAddrSpace.empty())
+ return false; // Nothing was cloned, so the function is unchanged.
+
+ // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+ for (const Use* UndefUse : UndefUsesToFix) {
+ User *V = UndefUse->getUser();
+ User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+ unsigned OperandNo = UndefUse->getOperandNo();
+ assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+ NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get())); // Swap the placeholder for the operand's clone.
+ }
+
+ // Replaces the uses of the old address expressions with the new ones.
+ for (Value *V : Postorder) {
+ Value *NewV = ValueWithNewAddrSpace.lookup(V);
+ if (NewV == nullptr)
+ continue; // V was not cloned; its uses stay as they are.
+
+ SmallVector<Use *, 4> Uses; // Snapshot of V's uses, taken before any of them is rewritten below.
+ for (Use &U : V->uses())
+ Uses.push_back(&U);
+ DEBUG(dbgs() << "Replacing the uses of " << *V << "\n to\n " << *NewV
+ << "\n");
+ for (Use *U : Uses) {
+ if (isa<LoadInst>(U->getUser()) ||
+ (isa<StoreInst>(U->getUser()) && U->getOperandNo() == 1)) {
+ // If V is used as the pointer operand of a load/store, sets the pointer
+ // operand to NewV. This replacement does not change the element type,
+ // so the resultant load/store is still valid.
+ U->set(NewV);
+ } else if (isa<Instruction>(U->getUser())) {
+ // Otherwise, replaces the use with generic(NewV).
+ // TODO: Some optimization opportunities are missed. For example, in
+ // %0 = icmp eq float* %p, %q
+ // if both p and q are inferred to be shared, we can rewrite %0 as
+ // %0 = icmp eq float addrspace(3)* %new_p, %new_q
+ // instead of currently
+ // %generic_p = addrspacecast float addrspace(3)* %new_p to float*
+ // %generic_q = addrspacecast float addrspace(3)* %new_q to float*
+ // %0 = icmp eq float* %generic_p, %generic_q
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ BasicBlock::iterator InsertPos = std::next(I->getIterator());
+ while (isa<PHINode>(InsertPos)) // Skip any PHI nodes that follow I.
+ ++InsertPos;
+ U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ } else {
+ U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
+ }
+ }
+ }
+ if (V->use_empty())
+ RecursivelyDeleteTriviallyDeadInstructions(V);
+ }
+
+ return true;
+}
+
+FunctionPass *llvm::createNVPTXInferAddressSpacesPass() { // Factory: returns a newly allocated NVPTXInferAddressSpaces.
+ return new NVPTXInferAddressSpaces();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
new file mode 100644
index 000000000000..ffcb5d5273a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
@@ -0,0 +1,59 @@
+//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe NVPTX instructions format
+//
+//===----------------------------------------------------------------------===//
+
+// Vector instruction type enum: wraps the 4-bit value that NVPTXInst stores in TSFlags{3-0}
+class VecInstTypeEnum<bits<4> val> {
+ bits<4> Value=val;
+}
+def VecNOP : VecInstTypeEnum<0>; // Default VecInstType of NVPTXInst.
+
+// Generic NVPTX Format
+
+class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<14> Inst;
+
+ let Namespace = "NVPTX";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+
+ // TSFlagFields (each is copied into a fixed TSFlags bit range below)
+ bits<4> VecInstType = VecNOP.Value;
+ bit IsSimpleMove = 0;
+ bit IsLoad = 0;
+ bit IsStore = 0;
+
+ bit IsTex = 0;
+ bit IsSust = 0;
+ bit IsSurfTexQuery = 0;
+ bit IsTexModeUnified = 0;
+
+ // The following field is encoded as log2 of the vector size, plus one,
+ // with 0 meaning the operation is not a surface instruction. For example,
+ // if IsSuld == 2, then the instruction is a suld instruction with vector size
+ // 2**(2-1) = 2.
+ bits<2> IsSuld = 0;
+
+ let TSFlags{3-0} = VecInstType;
+ let TSFlags{4-4} = IsSimpleMove;
+ let TSFlags{5-5} = IsLoad;
+ let TSFlags{6-6} = IsStore;
+ let TSFlags{7} = IsTex;
+ let TSFlags{9-8} = IsSuld;
+ let TSFlags{10} = IsSust;
+ let TSFlags{11} = IsSurfTexQuery;
+ let TSFlags{12} = IsTexModeUnified;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
new file mode 100644
index 000000000000..7f89742a3215
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -0,0 +1,248 @@
+//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "NVPTXGenInstrInfo.inc"
+
+// Pin the vtable to this file (anchor() is the out-of-line virtual method).
+void NVPTXInstrInfo::anchor() {}
+
+NVPTXInstrInfo::NVPTXInstrInfo() : NVPTXGenInstrInfo(), RegInfo() {}
+
+/// Emit one instruction before \p I in \p MBB that copies \p SrcReg into
+/// \p DestReg. The opcode is picked from the destination's register class;
+/// for the 32- and 64-bit integer/float classes a bit-convert opcode is used
+/// when the source register belongs to a different class. Aborts through
+/// report_fatal_error when the two classes differ in size.
+void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I,
+                                 const DebugLoc &DL, unsigned DestReg,
+                                 unsigned SrcReg, bool KillSrc) const {
+  const MachineRegisterInfo &FnRegInfo = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *DstClass = FnRegInfo.getRegClass(DestReg);
+  const TargetRegisterClass *SrcClass = FnRegInfo.getRegClass(SrcReg);
+
+  if (DstClass->getSize() != SrcClass->getSize())
+    report_fatal_error("Copy one register into another with a different width");
+
+  // Inside each branch below the destination class is already known, so
+  // "source is in that class too" is the same as "both classes are equal".
+  const bool SameClass = (SrcClass == DstClass);
+
+  unsigned Opcode;
+  if (DstClass == &NVPTX::Int1RegsRegClass)
+    Opcode = NVPTX::IMOV1rr;
+  else if (DstClass == &NVPTX::Int16RegsRegClass)
+    Opcode = NVPTX::IMOV16rr;
+  else if (DstClass == &NVPTX::Int32RegsRegClass)
+    Opcode = SameClass ? NVPTX::IMOV32rr : NVPTX::BITCONVERT_32_F2I;
+  else if (DstClass == &NVPTX::Int64RegsRegClass)
+    Opcode = SameClass ? NVPTX::IMOV64rr : NVPTX::BITCONVERT_64_F2I;
+  else if (DstClass == &NVPTX::Float32RegsRegClass)
+    Opcode = SameClass ? NVPTX::FMOV32rr : NVPTX::BITCONVERT_32_I2F;
+  else if (DstClass == &NVPTX::Float64RegsRegClass)
+    Opcode = SameClass ? NVPTX::FMOV64rr : NVPTX::BITCONVERT_64_I2F;
+  else
+    llvm_unreachable("Bad register copy");
+
+  BuildMI(MBB, I, DL, get(Opcode), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+/// Report whether \p MI has the simple-move field of its TSFlags set to 1.
+/// On a match, \p DestReg receives the register of operand 0 and \p SrcReg
+/// the register of operand 1, and true is returned. Otherwise both
+/// out-parameters are left untouched and false is returned.
+bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+                                 unsigned &DestReg) const {
+  // Look for the appropriate part of TSFlags.
+  const unsigned MoveField =
+      (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >> NVPTX::SimpleMoveShift;
+  if (MoveField != 1)
+    return false;
+
+  // Bind by reference; there is no need to copy the operands just to read
+  // their registers.
+  const MachineOperand &Dest = MI.getOperand(0);
+  const MachineOperand &Src = MI.getOperand(1);
+  assert(Dest.isReg() && "dest of a movrr is not a reg");
+  assert(Src.isReg() && "src of a movrr is not a reg");
+
+  SrcReg = Src.getReg();
+  DestReg = Dest.getReg();
+  return true;
+}
+
+/// Test the load field of \p MI's TSFlags. When it equals 1, \p AddrSpace
+/// receives getLdStCodeAddrSpace(MI) and the result is true; otherwise
+/// \p AddrSpace is left untouched and the result is false.
+bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
+                                 unsigned &AddrSpace) const {
+  const unsigned LoadField =
+      (MI.getDesc().TSFlags & NVPTX::isLoadMask) >> NVPTX::isLoadShift;
+  if (LoadField != 1)
+    return false;
+
+  AddrSpace = getLdStCodeAddrSpace(MI);
+  return true;
+}
+
+/// Test the store field of \p MI's TSFlags. When it equals 1, \p AddrSpace
+/// receives getLdStCodeAddrSpace(MI) and the result is true; otherwise
+/// \p AddrSpace is left untouched and the result is false.
+bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
+                                  unsigned &AddrSpace) const {
+  const unsigned StoreField =
+      (MI.getDesc().TSFlags & NVPTX::isStoreMask) >> NVPTX::isStoreShift;
+  if (StoreField != 1)
+    return false;
+
+  AddrSpace = getLdStCodeAddrSpace(MI);
+  return true;
+}
+
+/// analyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target). Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+/// just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+/// the destination block.
+/// 3. If this block ends with a conditional branch and it falls through to
+/// a successor block, it sets TBB to be the branch destination block and a
+/// list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+/// 4. If this block ends with a conditional branch and an unconditional
+/// branch, it returns the 'true' destination in TBB, the 'false' destination
+/// in FBB, and a list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+///
+/// Note that removeBranch and insertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool NVPTXInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I))
+ return false;
+
+ // Get the last instruction in the block (I was stepped back onto it above).
+ MachineInstr &LastInst = *I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ if (LastInst.getOpcode() == NVPTX::GOTO) {
+ TBB = LastInst.getOperand(0).getMBB();
+ return false;
+ } else if (LastInst.getOpcode() == NVPTX::CBranch) {
+ // Block ends with fall-through condbranch.
+ TBB = LastInst.getOperand(1).getMBB();
+ Cond.push_back(LastInst.getOperand(0));
+ return false;
+ }
+ // Otherwise, don't know what this is.
+ return true;
+ }
+
+ // The instruction before the last one is also a terminator; remember it.
+ MachineInstr &SecondLastInst = *I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (I != MBB.begin() && isUnpredicatedTerminator(*--I))
+ return true;
+
+ // If the block ends with NVPTX::CBranch followed by NVPTX::GOTO, handle it.
+ if (SecondLastInst.getOpcode() == NVPTX::CBranch &&
+ LastInst.getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst.getOperand(1).getMBB();
+ Cond.push_back(SecondLastInst.getOperand(0));
+ FBB = LastInst.getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two NVPTX::GOTOs, handle it. The second one is not
+ // executed, so remove it (only when AllowModify is set).
+ if (SecondLastInst.getOpcode() == NVPTX::GOTO &&
+ LastInst.getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst.getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+/// Erase the branch instructions at the end of \p MBB and return how many
+/// were erased (0, 1 or 2). The last instruction is erased when it is a GOTO
+/// or a CBranch; after that, the new last instruction is erased as well when
+/// it is a CBranch. \p BytesRemoved must be null (code size is not tracked).
+unsigned NVPTXInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                      int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator Tail = MBB.end();
+  if (Tail == MBB.begin())
+    return 0;
+  --Tail;
+
+  const unsigned TailOpcode = Tail->getOpcode();
+  const bool TailIsBranch =
+      TailOpcode == NVPTX::GOTO || TailOpcode == NVPTX::CBranch;
+  if (!TailIsBranch)
+    return 0;
+
+  // Remove the trailing branch.
+  Tail->eraseFromParent();
+
+  // Re-read the end of the block: the erased iterator is no longer usable.
+  Tail = MBB.end();
+  if (Tail == MBB.begin())
+    return 1;
+  --Tail;
+  if (Tail->getOpcode() != NVPTX::CBranch)
+    return 1;
+
+  // Remove the conditional branch that preceded it.
+  Tail->eraseFromParent();
+  return 2;
+}
+
+/// Append branch instructions to the end of \p MBB and return how many were
+/// added.
+///
+/// - \p FBB null, \p Cond empty: one GOTO to \p TBB (returns 1).
+/// - \p FBB null, \p Cond non-empty: one CBranch on Cond[0]'s register to
+///   \p TBB (returns 1).
+/// - \p FBB non-null: a CBranch on Cond[0]'s register to \p TBB followed by a
+///   GOTO to \p FBB (returns 2); \p Cond must not be empty in this case.
+///
+/// \p TBB must be non-null and \p BytesAdded must be null (code size is not
+/// tracked).
+unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                      MachineBasicBlock *TBB,
+                                      MachineBasicBlock *FBB,
+                                      ArrayRef<MachineOperand> Cond,
+                                      const DebugLoc &DL,
+                                      int *BytesAdded) const {
+  assert(!BytesAdded && "code size not handled");
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  // A condition is a single operand here (see the Cond[0] uses below).
+  assert(Cond.size() <= 1 &&
+         "NVPTX branch conditions have at most one component!");
+
+  // One-way branch.
+  if (!FBB) {
+    if (Cond.empty()) // Unconditional branch
+      BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
+    else // Conditional branch
+      BuildMI(&MBB, DL, get(NVPTX::CBranch))
+          .addReg(Cond[0].getReg())
+          .addMBB(TBB);
+    return 1;
+  }
+
+  // Two-way conditional branch: Cond[0] is read, so it must exist.
+  assert(!Cond.empty() && "two-way branch requires a condition");
+  BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg()).addMBB(TBB);
+  BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
+  return 2;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
new file mode 100644
index 000000000000..d284282e28c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -0,0 +1,79 @@
+//===- NVPTXInstrInfo.h - NVPTX Instruction Information----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXINSTRINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXINSTRINFO_H
+
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "NVPTXGenInstrInfo.inc"
+
+namespace llvm {
+
+class NVPTXInstrInfo : public NVPTXGenInstrInfo {
+ const NVPTXRegisterInfo RegInfo;
+ virtual void anchor();
+public:
+ explicit NVPTXInstrInfo();
+
+ const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
+
+ /* The following virtual functions are used in register allocation.
+ * They are not implemented because the existing interface and the logic
+ * at the caller side do not work for the elementized vector load and store.
+ *
+ * virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ * int &FrameIndex) const;
+ * virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ * int &FrameIndex) const;
+ * virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ * MachineBasicBlock::iterator MBBI,
+ * unsigned SrcReg, bool isKill, int FrameIndex,
+ * const TargetRegisterClass *RC) const;
+ * virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ * MachineBasicBlock::iterator MBBI,
+ * unsigned DestReg, int FrameIndex,
+ * const TargetRegisterClass *RC) const;
+ */
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &DestReg) const;
+ bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+ bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+
+ // Branch analysis and manipulation (base-class overrides).
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+ unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
+ return MI.getOperand(2).getImm(); // Operand 2 is read as an immediate.
+ }
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
new file mode 100644
index 000000000000..92a88c7f2506
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -0,0 +1,2807 @@
+//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "NVPTXInstrFormats.td"
+
+// A NOP instruction: no operands, empty asm string, no selection pattern.
+let hasSideEffects = 0 in {
+ def NOP : NVPTXInst<(outs), (ins), "", []>;
+}
+
+// List of vector specific properties (4-bit VecInstTypeEnum codes; 0 is VecNOP)
+def isVecLD : VecInstTypeEnum<1>;
+def isVecST : VecInstTypeEnum<2>;
+def isVecBuild : VecInstTypeEnum<3>;
+def isVecShuffle : VecInstTypeEnum<4>;
+def isVecExtract : VecInstTypeEnum<5>;
+def isVecInsert : VecInstTypeEnum<6>;
+def isVecDest : VecInstTypeEnum<7>;
+def isVecOther : VecInstTypeEnum<15>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+def brtarget : Operand<OtherVT>;
+
+// CVT conversion modes (*_FTZ leaves are the base value with 0x10 or'ed in).
+// These must match the enum in NVPTX.h
+def CvtNONE : PatLeaf<(i32 0x0)>;
+def CvtRNI : PatLeaf<(i32 0x1)>;
+def CvtRZI : PatLeaf<(i32 0x2)>;
+def CvtRMI : PatLeaf<(i32 0x3)>;
+def CvtRPI : PatLeaf<(i32 0x4)>;
+def CvtRN : PatLeaf<(i32 0x5)>;
+def CvtRZ : PatLeaf<(i32 0x6)>;
+def CvtRM : PatLeaf<(i32 0x7)>;
+def CvtRP : PatLeaf<(i32 0x8)>;
+
+def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
+def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
+def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
+def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
+def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
+def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
+def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
+def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
+def CvtRP_FTZ : PatLeaf<(i32 0x18)>;
+
+def CvtSAT : PatLeaf<(i32 0x20)>;
+def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
+
+def CvtMode : Operand<i32> {
+ let PrintMethod = "printCvtMode";
+}
+
+// Compare modes (*_FTZ leaves are the base value plus 0x100).
+// These must match the enum in NVPTX.h
+def CmpEQ : PatLeaf<(i32 0)>;
+def CmpNE : PatLeaf<(i32 1)>;
+def CmpLT : PatLeaf<(i32 2)>;
+def CmpLE : PatLeaf<(i32 3)>;
+def CmpGT : PatLeaf<(i32 4)>;
+def CmpGE : PatLeaf<(i32 5)>;
+def CmpEQU : PatLeaf<(i32 10)>;
+def CmpNEU : PatLeaf<(i32 11)>;
+def CmpLTU : PatLeaf<(i32 12)>;
+def CmpLEU : PatLeaf<(i32 13)>;
+def CmpGTU : PatLeaf<(i32 14)>;
+def CmpGEU : PatLeaf<(i32 15)>;
+def CmpNUM : PatLeaf<(i32 16)>;
+def CmpNAN : PatLeaf<(i32 17)>;
+
+def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
+def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
+def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
+def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
+def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
+def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
+def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
+def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
+def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
+def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
+def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
+def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
+def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
+def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
+
+def CmpMode : Operand<i32> {
+ let PrintMethod = "printCmpMode";
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+ Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+ Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
+def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
+def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
+def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
+def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
+def hasVote : Predicate<"Subtarget->hasVote()">;
+def hasDouble : Predicate<"Subtarget->hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
+def hasLDG : Predicate<"Subtarget->hasLDG()">;
+def hasLDU : Predicate<"Subtarget->hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"useF32FTZ()">;
+def doNoF32FTZ : Predicate<"!useF32FTZ()">;
+
+def doMulWide : Predicate<"doMulWide">; // Tests a name, not a call.
+
+def allowFMA : Predicate<"allowFMA()">;
+def noFMA : Predicate<"!allowFMA()">;
+
+def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
+def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
+
+def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
+def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
+
+def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
+
+def true : Predicate<"1">; // Condition string is the constant 1.
+
+def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
+
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Template for instructions which take three int64, int32, or int16 args, in
+// rr and ri forms. They are named "<OpcStr><Width>" (e.g. "add.s64").
+multiclass I3<string OpcStr, SDNode OpNode> {
+ def i64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def i64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def i16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+}
+
+// Template for instructions which take 3 int32 args (rr and ri forms). The
+// instructions are named "<OpcStr>.s32" (e.g. "addc.cc.s32").
+multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+}
+
+// Template for instructions which take three fp64 or fp32 args. The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions; those require the doF32FTZ predicate.
+//
+// This multiclass should be used for nodes that cannot be folded into FMAs.
+// For nodes that can be folded into FMAs (i.e. adds and muls), use
+// F3_fma_component.
+multiclass F3<string OpcStr, SDNode OpNode> {
+ def f64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+ def f64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+ def f32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+ def f32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+}
+
+// Template for instructions which take three fp64 or fp32 args. The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions; those also require doF32FTZ.
+//
+// This multiclass should be used for nodes that can be folded to make fma ops.
+// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
+// just like the non ".rn" op, but prevents ptxas from creating FMAs.
+multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
+ def f64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+ def f32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+
+ // .rn variants (noFMA). Strange names so we don't perturb existing mir tests.
+ def _rnf64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[noFMA]>;
+ def _rnf64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
+ def _rnf32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def _rnf32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def _rnf32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA]>;
+ def _rnf32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
+}
+
+// Template for operations which take two f32 or f64 operands. Provides three
+// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
+// subnormal inputs and results to zero; requires doF32FTZ).
+multiclass F2<string OpcStr, SDNode OpNode> {
+ def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+ !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+ def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+ Requires<[doF32FTZ]>;
+ def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instructions.
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Type Conversion
+//-----------------------------------
+
+let hasSideEffects = 0 in {
+ // Generate cvts to one type (printed from FromName; $dst is in RC) from all
+ // source types. Each instance takes a CvtMode immediate that defines the
+ // conversion mode to use. It can be CvtNONE to omit a conversion mode.
+ multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+ def _s8 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s8\t$dst, $src;"), []>;
+ def _u8 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u8\t$dst, $src;"), []>;
+ def _s16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s16\t$dst, $src;"), []>;
+ def _u16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u16\t$dst, $src;"), []>;
+ def _f16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f16\t$dst, $src;"), []>;
+ def _s32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s32\t$dst, $src;"), []>;
+ def _u32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u32\t$dst, $src;"), []>;
+ def _s64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s64\t$dst, $src;"), []>;
+ def _u64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u64\t$dst, $src;"), []>;
+ def _f32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f32\t$dst, $src;"), []>;
+ def _f64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f64\t$dst, $src;"), []>;
+ }
+
+ // Generate cvts from all types to all types (one defm per result type).
+ defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
+ defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
+ defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+ defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+ defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
+ defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+ defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+ defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+ defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+ defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+ defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
+
+ // These cvts are different from those above: The source and dest registers
+ // are of the same type, and they take no CvtMode operand.
+ def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.s16.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s16 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s16 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s32 \t$dst, $src;", []>;
+}
+
+//-----------------------------------
+// Integer Arithmetic
+//-----------------------------------
+
+// Template for xor masquerading as int1 arithmetic (rr and ri forms).
+multiclass ADD_SUB_i1<SDNode OpNode> {
+ def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+}
+
+// int1 addition and subtraction are both just xor.
+defm ADD_i1 : ADD_SUB_i1<add>;
+defm SUB_i1 : ADD_SUB_i1<sub>;
+
+// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
+// also use these for unsigned arithmetic.
+defm ADD : I3<"add.s", add>;
+defm SUB : I3<"sub.s", sub>;
+
+// int32 addition and subtraction with carry-out.
+// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
+defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
+defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+
+// int32 addition and subtraction with carry-in and carry-out.
+defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
+defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
+
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3<"mul.hi.s", mulhs>;
+defm MULTHU : I3<"mul.hi.u", mulhu>;
+
+defm SDIV : I3<"div.s", sdiv>;
+defm UDIV : I3<"div.u", udiv>;
+
+// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
+// will lower it.
+defm SREM : I3<"rem.s", srem>;
+defm UREM : I3<"rem.u", urem>;
+
+
+//
+// Wide multiplication (these defs carry no inline selection patterns)
+//
+def MULWIDES64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+// Matchers for signed, unsigned mul.wide ISD nodes.
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+ (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+ (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+// Predicates used for converting some patterns to mul.wide.
+def SInt32Const : PatLeaf<(imm), [{
+ // Accept immediates representable as a signed 32-bit integer.
+ return N->getAPIntValue().isSignedIntN(32);
+}]>;
+
+def UInt32Const : PatLeaf<(imm), [{
+ // Accept immediates representable as an unsigned 32-bit integer.
+ return N->getAPIntValue().isIntN(32);
+}]>;
+
+def SInt16Const : PatLeaf<(imm), [{
+ // Accept immediates representable as a signed 16-bit integer.
+ return N->getAPIntValue().isSignedIntN(16);
+}]>;
+
+def UInt16Const : PatLeaf<(imm), [{
+ // Accept immediates representable as an unsigned 16-bit integer.
+ return N->getAPIntValue().isIntN(16);
+}]>;
+
+// Shift amounts v for which the multiplier (1 << v) is a non-negative 32-bit
+// value. The name is kept so that existing patterns keep resolving.
+def Int5Const : PatLeaf<(imm), [{
+ // Check if 0 <= v < 31; only then is (1 << v) a positive int32. For v == 31
+ // SHL2MUL32 yields 0x80000000, which mul.wide.s32 reads as -2^31 and would
+ // therefore negate the product of (sext x) << 31.
+ const APInt &v = N->getAPIntValue();
+ return v.sge(0) && v.slt(31);
+}]>;
+
+// Shift amounts v for which the multiplier (1 << v) is a non-negative 16-bit
+// value. The name is kept so that existing patterns keep resolving.
+def Int4Const : PatLeaf<(imm), [{
+ // Check if 0 <= v < 15; only then is (1 << v) a positive int16. For v == 15
+ // SHL2MUL16 yields 0x8000, which mul.wide.s16 reads as -2^15 and would
+ // therefore negate the product of (sext x) << 15.
+ const APInt &v = N->getAPIntValue();
+ return v.sge(0) && v.slt(15);
+}]>;
+
+def SHL2MUL32 : SDNodeXForm<imm, [{
+ // Rewrite a shift amount as the equivalent multiplier: a 32-bit 1 shifted
+ // left by the immediate, emitted as an i32 target constant.
+ const APInt One(32, 1);
+ const APInt Multiplier = One.shl(N->getAPIntValue());
+ return CurDAG->getTargetConstant(Multiplier, SDLoc(N), MVT::i32);
+}]>;
+
+def SHL2MUL16 : SDNodeXForm<imm, [{
+ // Rewrite a shift amount as the equivalent multiplier: a 16-bit 1 shifted
+ // left by the immediate, emitted as an i16 target constant.
+ const APInt One(16, 1);
+ const APInt Multiplier = One.shl(N->getAPIntValue());
+ return CurDAG->getTargetConstant(Multiplier, SDLoc(N), MVT::i16);
+}]>;
+
+// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
+def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+ (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+ Requires<[doMulWide]>;
+def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+ (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+ Requires<[doMulWide]>;
+def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+ Requires<[doMulWide]>;
+
+// Convert "sign/zero-extend then multiply" to mul.wide.
+def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
+ (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
+ (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
+ (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
+ (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+//
+// Integer multiply-add
+//
+def SDTIMAD :
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
+def MAD16rrr :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
+def MAD16rri :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
+def MAD16rir :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
+def MAD16rii :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD32rrr :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
+def MAD32rri :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
+def MAD32rir :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
+def MAD32rii :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD64rrr :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
+def MAD64rri :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
+def MAD64rir :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
+def MAD64rii :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
+
+def INEG16 :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "neg.s16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "neg.s32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "neg.s64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+ return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
+ N->getValueAPF().convertToFloat() == 1.0f;
+}]>;
+// Constant 1.0 (double)
+def DoubleConst1 : PatLeaf<(fpimm), [{
+ return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
+ N->getValueAPF().convertToDouble() == 1.0;
+}]>;
+
+defm FADD : F3_fma_component<"add", fadd>;
+defm FSUB : F3_fma_component<"sub", fsub>;
+defm FMUL : F3_fma_component<"mul", fmul>;
+
+defm FMIN : F3<"min", fminnum>;
+defm FMAX : F3<"max", fmaxnum>;
+
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins f64imm:$a, Float64Regs:$b),
+ "rcp.rn.f64 \t$dst, $b;",
+ [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Semi-accurate reciprocal
+//
+// rcp.approx gives the same result as div.full(1.0f, b) and is faster.
+//
+def FDIV321r_approx_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+//
+// F32 Semi-accurate division
+//
+def FDIV32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+def FDIV32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+//
+// F32 Accurate reciprocal
+//
+def FDIV321r_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+//
+// F32 Accurate division
+//
+def FDIV32rr_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+def FDIV32ri_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[reqPTX20]>;
+
+//
+// F32 rsqrt
+//
+
+def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
+ "rsqrt.approx.f32 \t$dst, $b;", []>;
+
+// Convert 1.0f/sqrt(x) to rsqrt.approx.f32. (There is an rsqrt.approx.f64, but
+// it's emulated in software.)
+def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
+ (RSQRTF32approx1r Float32Regs:$b)>,
+ Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
+
+// Template for fused multiply-add. Defines rrr, rri, rir and rii variants of
+// OpcStr over register class RC; an 'i' operand is an ImmCls immediate matched
+// from fpimm. Operand $a is always a register. Every variant requires Pred.
+multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
+ def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, ImmCls:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+ def rir : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rii : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, ImmCls:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+}
+
+defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
+defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
+defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
+
+// sin/cos
+def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "sin.approx.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
+def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "cos.approx.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
+
+// Lower (frem x, y) into (sub x, (mul (trunc (div x, y)) y)),
+// i.e. "poor man's fmod()".
+//
+// The quotient must be rounded toward zero (CvtRZI), not toward negative
+// infinity: frem, like fmod, yields a result with the sign of the dividend.
+// Rounding the quotient down instead gives a result with the sign of the
+// divisor whenever x / y is negative and not an integer.
+
+// frem - f32 FTZ
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+ (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
+ (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
+ Float32Regs:$y))>,
+ Requires<[doF32FTZ]>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+ (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
+ (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
+ fpimm:$y))>,
+ Requires<[doF32FTZ]>;
+
+// frem - f32
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+ (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
+ (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
+ Float32Regs:$y))>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+ (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
+ (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
+ fpimm:$y))>;
+
+// frem - f64
+def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
+ (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
+ (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
+ Float64Regs:$y))>;
+def : Pat<(frem Float64Regs:$x, fpimm:$y),
+ (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
+ (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
+ fpimm:$y))>;
+
+//-----------------------------------
+// Bitwise operations
+//-----------------------------------
+
+// Template for three-arg bitwise operations. Takes three args, Creates .b16,
+// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
+multiclass BITWISE<string OpcStr, SDNode OpNode> {
+ def b1rr :
+ NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def b1ri :
+ NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+ def b16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def b16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+ def b32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def b32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def b64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def b64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+}
+
+defm OR : BITWISE<"or", or>;
+defm AND : BITWISE<"and", and>;
+defm XOR : BITWISE<"xor", xor>;
+
+def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+ "not.pred \t$dst, $src;",
+ [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
+def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "not.b16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
+def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "not.b32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
+def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "not.b64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
+
+// Template for left/right shifts. Takes three operands,
+// [dest (reg), src (reg), shift (reg or imm)].
+// dest and src may be int64, int32, or int16, but shift is always int32.
+//
+// This template also defines a 32-bit shift (imm, imm) instruction.
+multiclass SHIFT<string OpcStr, SDNode OpNode> {
+ def i64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
+ def i64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
+ def i32ii :
+ NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
+ def i16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
+ def i16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
+}
+
+defm SHL : SHIFT<"shl.b", shl>;
+defm SRA : SHIFT<"shr.s", sra>;
+defm SRL : SHIFT<"shr.u", srl>;
+
+//
+// Rotate: Use ptx shf instruction if available.
+//
+
+// 32 bit r2 = rotl r1, n
+// =>
+// r2 = shf.l r1, r1, n
+def ROTL32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTL32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+// 32 bit r2 = rotr r1, n
+// =>
+// r2 = shf.r r1, r1, n
+def ROTR32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
+def ROT32imm_sw :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ "shl.b32 \t%lhs, $src, $amt1;\n\t"
+ "shr.b32 \t%rhs, $src, $amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+ // Complementary shift amount within a 32-bit word: 32 - amt, as an i32
+ // target constant.
+ const uint64_t Complement = 32 - N->getZExtValue();
+ return CurDAG->getTargetConstant(Complement, SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]>;
+def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
+
+// 32-bit software rotate left by register.
+def ROTL32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shl.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shr.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+// 32-bit software rotate right by register.
+def ROTR32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shr.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shl.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
+def ROT64imm_sw :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ "shl.b64 \t%lhs, $src, $amt1;\n\t"
+ "shr.b64 \t%rhs, $src, $amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+ // Complementary shift amount within a 64-bit word: 64 - amt, as an i32
+ // target constant.
+ const uint64_t Complement = 64 - N->getZExtValue();
+ return CurDAG->getTargetConstant(Complement, SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+// 64-bit software rotate left by register.
+def ROTL64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "shl.b64 \t%lhs, $src, $amt;\n\t"
+ "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "shr.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+// 64-bit software rotate right by register.
+def ROTR64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "shr.b64 \t%lhs, $src, $amt;\n\t"
+ "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "shl.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+//
+// Funnel shift in clamp mode
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+def SDTIntShiftDOp :
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
+
+def FUNSHFRCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
+
+//
+// BFE - bit-field extract
+//
+
+// Template for BFE instructions. Takes four args,
+// [dest (reg), src (reg), start (reg or imm), length (reg or imm)].
+// Start may be an imm only if length is also an imm. FIXME: Is this a
+// restriction in PTX?
+//
+// dest and src may be int32 or int64, but start and length are always int32.
+multiclass BFE<string TyStr, RegisterClass RC> {
+ def rrr
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rri
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rii
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, i32imm:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+}
+
+let hasSideEffects = 0 in {
+ defm BFE_S32 : BFE<"s32", Int32Regs>;
+ defm BFE_U32 : BFE<"u32", Int32Regs>;
+ defm BFE_S64 : BFE<"s64", Int64Regs>;
+ defm BFE_U64 : BFE<"u64", Int64Regs>;
+}
+
+//-----------------------------------
+// Comparison instructions (setp, set)
+//-----------------------------------
+
+// FIXME: This doesn't cover versions of set and setp that combine with a
+// boolean predicate, e.g. setp.eq.and.b16.
+
+let hasSideEffects = 0 in {
+ multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr :
+ NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ "\t$dst, $a, $b;"), []>;
+ def ri :
+ NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ "\t$dst, $a, $b;"), []>;
+ def ir :
+ NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ "\t$dst, $a, $b;"), []>;
+ }
+}
+
+defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
+defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
+defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
+defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
+defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
+defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
+defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
+defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
+defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
+defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
+defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
+
+// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
+// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
+// reg, either u32, s32, or f32. Anyway these aren't used at the moment.
+
+let hasSideEffects = 0 in {
+ multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ def ri : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ def ir : NVPTXInst<(outs Int32Regs:$dst),
+ (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ }
+}
+
+defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
+defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
+defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
+defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
+defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
+defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
+defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
+defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
+defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
+defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
+defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
+
+//-----------------------------------
+// Selection instructions (selp)
+//-----------------------------------
+
+// FIXME: Missing slct
+
+// selp instructions that don't have any pattern matches; we explicitly use
+// them within this file.
+let hasSideEffects = 0 in {
+ multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ir : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ii : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ }
+
+ multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
+ SDNode ImmNode> {
+ def rr :
+ NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
+ def ri :
+ NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
+ def ir :
+ NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
+ def ii :
+ NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+ }
+}
+
+// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
+// good.
+defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
+defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
+defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
+defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
+defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
+defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
+defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
+defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
+defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
+defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
+defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
+
+//-----------------------------------
+// Data Movement (Load / Store, Move)
+//-----------------------------------
+
+// Complex address patterns producing two operands.  ADDRri is the i32 form
+// (selected by SelectADDRri) and ADDRri64 the i64 form (selected by
+// SelectADDRri64).  Both are rooted at frameindex nodes and ask for the root
+// node to be passed to the selection routine (SDNPWantRoot).
+def ADDRri :
+  ComplexPattern<i32, 2, "SelectADDRri", [frameindex], [SDNPWantRoot]>;
+def ADDRri64 :
+  ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], [SDNPWantRoot]>;
+
+// Memory operands made of a register plus an immediate, printed through
+// printMemOperand.  MEMri is the 32-bit form, MEMri64 the 64-bit form.
+def MEMri : Operand<i32> {
+  let MIOperandInfo = (ops Int32Regs, i32imm);
+  let PrintMethod = "printMemOperand";
+}
+def MEMri64 : Operand<i64> {
+  let MIOperandInfo = (ops Int64Regs, i64imm);
+  let PrintMethod = "printMemOperand";
+}
+
+// Pointer-typed operands printed with the generic printOperand method.
+def imem    : Operand<iPTR>    { let PrintMethod = "printOperand"; }
+def imemAny : Operand<iPTRAny> { let PrintMethod = "printOperand"; }
+
+// i32 operand used for the load/store modifier fields; printed with
+// printLdStCode.
+def LdStCode : Operand<i32> { let PrintMethod = "printLdStCode"; }
+
+// NVPTXISD::Wrapper: one result and one operand of the same type, and that
+// type is a pointer type.
+def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+
+// Load a memory address (a wrapped global address) into a u32 or u64
+// register.
+def MOV_ADDR :
+  NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
+            "mov.u32 \t$dst, $a;",
+            [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+def MOV_ADDR64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
+            "mov.u64 \t$dst, $a;",
+            [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+// Get pointer to local stack: moves the symbol __local_depot<num> into a
+// 32-bit or 64-bit register.  No selection pattern is attached.
+let hasSideEffects = 0 in {
+  def MOV_DEPOT_ADDR :
+    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+              "mov.u32 \t$d, __local_depot$num;", []>;
+  def MOV_DEPOT_ADDR_64 :
+    NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+              "mov.u64 \t$d, __local_depot$num;", []>;
+}
+
+
+// Register-to-register moves, one per register class.  They are flagged
+// IsSimpleMove, have no side effects and carry no selection pattern.
+// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp.
+let IsSimpleMove = 1, hasSideEffects = 0 in {
+  // Predicate and integer registers.
+  def IMOV1rr :
+    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+              "mov.pred \t$dst, $sss;", []>;
+  def IMOV16rr :
+    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+              "mov.u16 \t$dst, $sss;", []>;
+  def IMOV32rr :
+    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+              "mov.u32 \t$dst, $sss;", []>;
+  def IMOV64rr :
+    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+              "mov.u64 \t$dst, $sss;", []>;
+
+  // Floating-point registers.
+  def FMOV32rr :
+    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+              "mov.f32 \t$dst, $src;", []>;
+  def FMOV64rr :
+    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+              "mov.f64 \t$dst, $src;", []>;
+}
+
+// Immediate-to-register moves.  Each one is selected for a constant (imm)
+// materialized into the matching integer register class.
+def IMOV1ri :
+  NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+            "mov.pred \t$dst, $src;",
+            [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV16ri :
+  NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+            "mov.u16 \t$dst, $src;",
+            [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri :
+  NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+            "mov.u32 \t$dst, $src;",
+            [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i :
+  NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+            "mov.u64 \t$dst, $src;",
+            [(set Int64Regs:$dst, imm:$src)]>;
+
+// Floating-point immediates (fpimm) into f32 / f64 registers.
+def FMOV32ri :
+  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+            "mov.f32 \t$dst, $src;",
+            [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri :
+  NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+            "mov.f64 \t$dst, $src;",
+            [(set Float64Regs:$dst, fpimm:$src)]>;
+
+// A wrapped external symbol of type i32 is materialized with IMOV32ri.
+def : Pat<(i32 (Wrapper texternalsym:$dst)),
+          (IMOV32ri texternalsym:$dst)>;
+
+//---- Copy Frame Index ----
+// Selected for an ADDRri / ADDRri64 address; the memory operand is printed
+// with the "add" modifier as the source operands of an add instruction.
+def LEA_ADDRi :
+  NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+            "add.u32 \t$dst, ${addr:add};",
+            [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+            "add.u64 \t$dst, ${addr:add};",
+            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+
+//-----------------------------------
+// Comparison and Selection
+//-----------------------------------
+
+// Emits the selection patterns for one integer comparison.
+//
+//   OpNode  - comparison fragment to match (for example setgt).
+//   Mode    - comparison-mode leaf appended as the last operand of every
+//             selected instruction.
+//   setp_*  - instructions selected when the comparison yields an i1.
+//   set_*   - instructions selected when the comparison yields an i32.
+//
+// For every operand width (16/32/64 bits) three operand shapes are matched:
+// register/register (rr), register/immediate (ri), immediate/register (ir).
+multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
+    Instruction setp_16rr, Instruction setp_16ri, Instruction setp_16ir,
+    Instruction setp_32rr, Instruction setp_32ri, Instruction setp_32ir,
+    Instruction setp_64rr, Instruction setp_64ri, Instruction setp_64ir,
+    Instruction set_16rr, Instruction set_16ri, Instruction set_16ir,
+    Instruction set_32rr, Instruction set_32ri, Instruction set_32ir,
+    Instruction set_64rr, Instruction set_64ri, Instruction set_64ir> {
+  // i16 operands, i1 (predicate) result.
+  def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
+            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
+            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
+            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+  // i32 operands, i1 (predicate) result.
+  def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
+            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
+            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
+            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+  // i64 operands, i1 (predicate) result.
+  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
+            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
+            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
+            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+
+  // i16 operands, i32 result.
+  def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
+            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
+            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
+            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+  // i32 operands, i32 result.
+  def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
+            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
+            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
+            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+  // i64 operands, i32 result.
+  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
+            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
+            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
+            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
+}
+
+// ISET_FORMAT bound to the SETP_s* / SET_s* instruction records, so callers
+// only supply the comparison fragment and its mode.
+multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode> :
+    ISET_FORMAT<OpNode, Mode,
+                SETP_s16rr, SETP_s16ri, SETP_s16ir,
+                SETP_s32rr, SETP_s32ri, SETP_s32ir,
+                SETP_s64rr, SETP_s64ri, SETP_s64ir,
+                SET_s16rr, SET_s16ri, SET_s16ir,
+                SET_s32rr, SET_s32ri, SET_s32ir,
+                SET_s64rr, SET_s64ri, SET_s64ir> {
+  // Placeholder record: TableGen doesn't like empty multiclasses.
+  def : PatLeaf<(i32 0)>;
+}
+
+// ISET_FORMAT bound to the SETP_u* / SET_u* instruction records, so callers
+// only supply the comparison fragment and its mode.
+multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode> :
+    ISET_FORMAT<OpNode, Mode,
+                SETP_u16rr, SETP_u16ri, SETP_u16ir,
+                SETP_u32rr, SETP_u32ri, SETP_u32ir,
+                SETP_u64rr, SETP_u64ri, SETP_u64ir,
+                SET_u16rr, SET_u16ri, SET_u16ir,
+                SET_u32rr, SET_u32ri, SET_u32ir,
+                SET_u64rr, SET_u64ri, SET_u64ir> {
+  // Placeholder record: TableGen doesn't like empty multiclasses.
+  def : PatLeaf<(i32 0)>;
+}
+
+// Integer comparison patterns.  setgt/setlt/setge/setle/seteq/setne go to the
+// signed instruction set; the setu* fragments go to the unsigned set with the
+// same comparison modes.
+defm : ISET_FORMAT_SIGNED  <setgt,  CmpGT>;
+defm : ISET_FORMAT_SIGNED  <setlt,  CmpLT>;
+defm : ISET_FORMAT_SIGNED  <setge,  CmpGE>;
+defm : ISET_FORMAT_SIGNED  <setle,  CmpLE>;
+defm : ISET_FORMAT_SIGNED  <seteq,  CmpEQ>;
+defm : ISET_FORMAT_SIGNED  <setne,  CmpNE>;
+defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
+defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
+defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
+
+// i1 compares.  Inequality of two predicates is selected as XORb1rr;
+// equality is selected as NOT1 applied to that XORb1rr.
+def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
+def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+// i1 compare -> i32.  The predicate operand is the XORb1rr of $a and $b (the
+// same instruction the i1-valued setne patterns select), and SELP_u32ii
+// picks its first immediate when that predicate is set, otherwise its second:
+//   setne: XORb1rr set -> -1, clear ->  0
+//   seteq: XORb1rr set ->  0, clear -> -1
+// The second pattern previously matched setne again, which duplicated the
+// first source pattern with a contradictory result and left the i32-valued
+// seteq on predicates without any pattern.
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+
+
+// Emits the selection patterns for one floating-point comparison.
+//
+//   OpNode   - comparison fragment to match (for example setogt).
+//   Mode     - comparison-mode leaf used by the default patterns.
+//   ModeFTZ  - comparison-mode leaf used by the f32 patterns that are
+//              guarded by the doF32FTZ predicate.
+//
+// Every f32 shape gets two patterns: the doF32FTZ-guarded one using ModeFTZ,
+// followed by an unguarded one using Mode.  f64 shapes only use Mode.  Both
+// i1 (SETP_*) and i32 (SET_*) results are covered, each for the rr, ri and ir
+// operand shapes.
+multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
+  // f32 operands, i1 (predicate) result.
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 operands, i1 (predicate) result.
+  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+
+  // f32 operands, i32 result.
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 operands, i32 result.
+  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+}
+
+// Ordered comparisons (seto* fragments).
+defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
+defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
+defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
+defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
+defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
+defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
+
+// Unordered comparisons (setu* fragments) use the *U comparison modes.
+defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
+defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
+defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
+defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
+defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
+defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
+
+// The plain set* fragments reuse the same modes as the ordered forms.
+defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
+defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
+defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
+defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
+defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
+defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
+
+// seto / setuo map to the NUM / NAN comparison modes.
+defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
+defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
+
+// FIXME: What is this doing here? Can it be deleted?
+// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+// Type profiles for the parameter-passing and call-sequence nodes defined
+// below.  SDTypeProfile<R, O, C> describes a node with R results, O operands
+// and the type constraints C (indices count results first, then operands).
+
+// Parameter declarations: three integer operands, no result.
+def SDTDeclareParamProfile :
+  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTDeclareScalarParamProfile :
+  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+
+// Parameter loads: 1, 2 or 4 results and two integer operands.
+def SDTLoadParamProfile :
+  SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamV2Profile :
+  SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
+def SDTLoadParamV4Profile :
+  SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
+
+// Call printing: a single integer operand.
+def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+// Parameter stores: the first two operands are integers.
+def SDTStoreParamProfile :
+  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV2Profile :
+  SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV4Profile :
+  SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParam32Profile :
+  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+
+// Call-sequence pieces.
+def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
+def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
+def SDTCallValProfile : SDTypeProfile<1, 0, []>;
+def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
+
+// Return-value stores: the first operand is an integer.
+def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
+def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
+
+def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
+
+// Target-specific DAG nodes for parameter passing and the call sequence.
+// Most of them are chained, glued on both sides and marked as having side
+// effects; the exceptions are noted per group.
+
+// Parameter / return-value declarations.
+def DeclareParam :
+  SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareScalarParam :
+  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRetParam :
+  SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRet :
+  SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+
+// Parameter loads: marked SDNPMayLoad rather than SDNPSideEffect.
+def LoadParam :
+  SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV2 :
+  SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
+         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV4 :
+  SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
+         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+
+// Call printing (regular and convergent, plain and .uni).
+def PrintCall :
+  SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCall :
+  SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintCallUni :
+  SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCallUni :
+  SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+
+// Parameter stores.
+def StoreParam :
+  SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV2 :
+  SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV4 :
+  SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamU32 :
+  SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamS32 :
+  SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+
+// Call argument list, callee and prototype.
+def CallArgBegin :
+  SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArg :
+  SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LastCallArg :
+  SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgEnd :
+  SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVoid :
+  SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def Prototype :
+  SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVal :
+  SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+
+// MoveParam has no node properties at all.
+def MoveParam :
+  SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
+
+// Return-value stores: chained with side effects, but not glued.
+def StoreRetval :
+  SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+         [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV2 :
+  SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
+         [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV4 :
+  SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
+         [SDNPHasChain, SDNPSideEffect]>;
+
+def PseudoUseParam :
+  SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+
+// Function return: chained with side effects, not glued.
+def RETURNNode :
+  SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+         [SDNPHasChain, SDNPSideEffect]>;
+
+// Instruction classes that read 1, 2 or 4 values of class `regclass` from
+// [retval0+$b] with ld.param / ld.param.v2 / ld.param.v4.  `opstr` is the
+// type suffix appended to the mnemonic (for example ".b32").  None of them
+// carries a selection pattern.
+let mayLoad = 1 in {
+  class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+    NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+              !strconcat("ld.param", opstr, "\t$dst, [retval0+$b];"),
+              []>;
+
+  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
+    NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+              !strconcat("ld.param.v2", opstr,
+                         "\t{{$dst, $dst2}}, [retval0+$b];"),
+              []>;
+
+  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
+    NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
+                    regclass:$dst4),
+              (ins i32imm:$b),
+              !strconcat("ld.param.v4", opstr,
+                         "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
+              []>;
+}
+
+// Register flavour of a parameter load: a mov from retval<$b>, selected for
+// LoadParam whose first operand is the constant 0.  `opstr` is the type
+// suffix appended to "mov".
+class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
+  NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+            !strconcat("mov", opstr, "\t$dst, retval$b;"),
+            [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
+
+// Instruction classes that write 1, 2 or 4 values of class `regclass` with
+// st.param / st.param.v2 / st.param.v4.  `opstr` is the type suffix appended
+// to the mnemonic.  None of them carries a selection pattern.
+let mayStore = 1 in {
+  // Outgoing call parameters: destination is [param<$a>+$b].
+  class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+    NVPTXInst<(outs),
+              (ins regclass:$val, i32imm:$a, i32imm:$b),
+              !strconcat("st.param", opstr, "\t[param$a+$b], $val;"),
+              []>;
+
+  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
+    NVPTXInst<(outs),
+              (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
+              !strconcat("st.param.v2", opstr,
+                         "\t[param$a+$b], {{$val, $val2}};"),
+              []>;
+
+  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
+    NVPTXInst<(outs),
+              (ins regclass:$val, regclass:$val2, regclass:$val3,
+                   regclass:$val4, i32imm:$a, i32imm:$b),
+              !strconcat("st.param.v4", opstr,
+                         "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+              []>;
+
+  // Function return values: destination is [func_retval0+$a].
+  class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
+    NVPTXInst<(outs),
+              (ins regclass:$val, i32imm:$a),
+              !strconcat("st.param", opstr, "\t[func_retval0+$a], $val;"),
+              []>;
+
+  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
+    NVPTXInst<(outs),
+              (ins regclass:$val, regclass:$val2, i32imm:$a),
+              !strconcat("st.param.v2", opstr,
+                         "\t[func_retval0+$a], {{$val, $val2}};"),
+              []>;
+
+  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
+    NVPTXInst<(outs),
+              (ins regclass:$val, regclass:$val2, regclass:$val3,
+                   regclass:$val4, i32imm:$a),
+              !strconcat("st.param.v4", opstr,
+                         "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+              []>;
+}
+
+// Instructions that print the head of a call statement.  `OpcStr` is the
+// mnemonic ("call", "call.uni", ...) and `OpNode` the print-call node to
+// match.  The node's i32 operand (0 through 8) selects the variant, which
+// prints that many retvalN names in parentheses after the mnemonic; the
+// zero case prints only the mnemonic and a space.
+let isCall = 1 in {
+  multiclass CALL<string OpcStr, SDNode OpNode> {
+    def PrintCallNoRetInst :
+      NVPTXInst<(outs), (ins),
+                !strconcat(OpcStr, " "),
+                [(OpNode (i32 0))]>;
+    def PrintCallRetInst1 :
+      NVPTXInst<(outs), (ins),
+                !strconcat(OpcStr, " (retval0), "),
+                [(OpNode (i32 1))]>;
+    def PrintCallRetInst2 :
+      NVPTXInst<(outs), (ins),
+                !strconcat(OpcStr, " (retval0, retval1), "),
+                [(OpNode (i32 2))]>;
+    def PrintCallRetInst3 :
+      NVPTXInst<(outs), (ins),
+                !strconcat(OpcStr, " (retval0, retval1, retval2), "),
+                [(OpNode (i32 3))]>;
+    def PrintCallRetInst4 :
+      NVPTXInst<(outs), (ins),
+                !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
+                [(OpNode (i32 4))]>;
+    def PrintCallRetInst5 :
+      NVPTXInst<(outs), (ins),
+                !strconcat(OpcStr,
+                           " (retval0, retval1, retval2, retval3, retval4), "),
+                [(OpNode (i32 5))]>;
+    def PrintCallRetInst6 :
+      NVPTXInst<(outs), (ins),
+                !strconcat(OpcStr,
+                           " (retval0, retval1, retval2, retval3, retval4, "
+                           "retval5), "),
+                [(OpNode (i32 6))]>;
+    def PrintCallRetInst7 :
+      NVPTXInst<(outs), (ins),
+                !strconcat(OpcStr,
+                           " (retval0, retval1, retval2, retval3, retval4, "
+                           "retval5, retval6), "),
+                [(OpNode (i32 7))]>;
+    def PrintCallRetInst8 :
+      NVPTXInst<(outs), (ins),
+                !strconcat(OpcStr,
+                           " (retval0, retval1, retval2, retval3, retval4, "
+                           "retval5, retval6, retval7), "),
+                [(OpNode (i32 8))]>;
+  }
+}
+
+// Regular call heads.
+defm Call    : CALL<"call", PrintCall>;
+defm CallUni : CALL<"call.uni", PrintCallUni>;
+
+// Convergent call instructions. These are identical to regular calls, except
+// they have the isConvergent bit set.
+let isConvergent = 1 in {
+  defm ConvergentCall    : CALL<"call", PrintConvergentCall>;
+  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
+}
+
+// Concrete ld.param instructions.  The 8-bit variants use Int16Regs with the
+// ".b8" suffix.
+def LoadParamMemI64   : LoadParamMemInst  <Int64Regs, ".b64">;
+def LoadParamMemI32   : LoadParamMemInst  <Int32Regs, ".b32">;
+def LoadParamMemI16   : LoadParamMemInst  <Int16Regs, ".b16">;
+def LoadParamMemI8    : LoadParamMemInst  <Int16Regs, ".b8">;
+def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
+def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
+def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
+def LoadParamMemV2I8  : LoadParamV2MemInst<Int16Regs, ".b8">;
+def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
+def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
+def LoadParamMemV4I8  : LoadParamV4MemInst<Int16Regs, ".b8">;
+def LoadParamMemF32   : LoadParamMemInst  <Float32Regs, ".f32">;
+def LoadParamMemF64   : LoadParamMemInst  <Float64Regs, ".f64">;
+def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
+def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
+def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
+
+// Concrete st.param instructions for outgoing call parameters.  The 8-bit
+// variants use Int16Regs with the ".b8" suffix.
+def StoreParamI64   : StoreParamInst  <Int64Regs, ".b64">;
+def StoreParamI32   : StoreParamInst  <Int32Regs, ".b32">;
+def StoreParamI16   : StoreParamInst  <Int16Regs, ".b16">;
+def StoreParamI8    : StoreParamInst  <Int16Regs, ".b8">;
+
+def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
+def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
+def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
+def StoreParamV2I8  : StoreParamV2Inst<Int16Regs, ".b8">;
+
+def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
+def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
+def StoreParamV4I8  : StoreParamV4Inst<Int16Regs, ".b8">;
+
+def StoreParamF32   : StoreParamInst  <Float32Regs, ".f32">;
+def StoreParamF64   : StoreParamInst  <Float64Regs, ".f64">;
+def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
+def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
+def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
+
+// Concrete st.param instructions for function return values.  The 8-bit
+// variants use Int16Regs with the ".b8" suffix.
+def StoreRetvalI64   : StoreRetvalInst  <Int64Regs, ".b64">;
+def StoreRetvalI32   : StoreRetvalInst  <Int32Regs, ".b32">;
+def StoreRetvalI16   : StoreRetvalInst  <Int16Regs, ".b16">;
+def StoreRetvalI8    : StoreRetvalInst  <Int16Regs, ".b8">;
+def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
+def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
+def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
+def StoreRetvalV2I8  : StoreRetvalV2Inst<Int16Regs, ".b8">;
+def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
+def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
+def StoreRetvalV4I8  : StoreRetvalV4Inst<Int16Regs, ".b8">;
+
+def StoreRetvalF64   : StoreRetvalInst  <Float64Regs, ".f64">;
+def StoreRetvalF32   : StoreRetvalInst  <Float32Regs, ".f32">;
+def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
+def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
+def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
+
+// Punctuation of a call statement.  CallArgBegin prints "(".  CallArgEnd
+// prints ");" for operand 1 and ")" for operand 0.  RETURNNode prints "ret;".
+def CallArgBeginInst :
+  NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
+def CallArgEndInst1 :
+  NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
+def CallArgEndInst0 :
+  NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
+def RETURNInst :
+  NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
+
+// A register call argument followed by ", ".  Selected for CallArg whose
+// first operand is the constant 0.
+class CallArgInst<NVPTXRegClass regclass> :
+  NVPTXInst<(outs), (ins regclass:$a),
+            "$a, ",
+            [(CallArg (i32 0), regclass:$a)]>;
+
+// The final register call argument, printed without a trailing separator.
+// Selected for LastCallArg whose first operand is the constant 0.
+class LastCallArgInst<NVPTXRegClass regclass> :
+  NVPTXInst<(outs), (ins regclass:$a),
+            "$a",
+            [(LastCallArg (i32 0), regclass:$a)]>;
+
+def CallArgI64 : CallArgInst<Int64Regs>;
+def CallArgI32 : CallArgInst<Int32Regs>;
+def CallArgI16 : CallArgInst<Int16Regs>;
+def CallArgF64 : CallArgInst<Float64Regs>;
+def CallArgF32 : CallArgInst<Float32Regs>;
+
+def LastCallArgI64 : LastCallArgInst<Int64Regs>;
+def LastCallArgI32 : LastCallArgInst<Int32Regs>;
+def LastCallArgI16 : LastCallArgInst<Int16Regs>;
+def LastCallArgF64 : LastCallArgInst<Float64Regs>;
+def LastCallArgF32 : LastCallArgInst<Float32Regs>;
+
+// Immediate call arguments: first operand 0, the i32 immediate is printed
+// as-is.
+def CallArgI32imm :
+  NVPTXInst<(outs), (ins i32imm:$a),
+            "$a, ",
+            [(CallArg (i32 0), (i32 imm:$a))]>;
+def LastCallArgI32imm :
+  NVPTXInst<(outs), (ins i32imm:$a),
+            "$a",
+            [(LastCallArg (i32 0), (i32 imm:$a))]>;
+
+// Parameter call arguments: first operand 1, the i32 immediate is printed as
+// the numeric suffix of "param".
+def CallArgParam :
+  NVPTXInst<(outs), (ins i32imm:$a),
+            "param$a, ",
+            [(CallArg (i32 1), (i32 imm:$a))]>;
+def LastCallArgParam :
+  NVPTXInst<(outs), (ins i32imm:$a),
+            "param$a",
+            [(LastCallArg (i32 1), (i32 imm:$a))]>;
+
+// Callee of a call statement, followed by ", ": either a wrapped global
+// address or an address held in a 32-bit / 64-bit register.
+def CallVoidInst :
+  NVPTXInst<(outs), (ins imem:$addr),
+            "$addr, ",
+            [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+def CallVoidInstReg :
+  NVPTXInst<(outs), (ins Int32Regs:$addr),
+            "$addr, ",
+            [(CallVoid Int32Regs:$addr)]>;
+def CallVoidInstReg64 :
+  NVPTXInst<(outs), (ins Int64Regs:$addr),
+            "$addr, ",
+            [(CallVoid Int64Regs:$addr)]>;
+
+// Prints ", prototype_<val>;" for a Prototype node.
+def PrototypeInst :
+  NVPTXInst<(outs), (ins i32imm:$val),
+            ", prototype_$val;",
+            [(Prototype (i32 imm:$val))]>;
+
+// Declarations of the retval<num> objects.  DeclareRetParam yields an
+// aligned .b8 array.  DeclareRet yields a scalar .param when its first
+// operand is 1 and a .reg when it is 2.
+def DeclareRetMemInst :
+  NVPTXInst<(outs),
+            (ins i32imm:$align, i32imm:$size, i32imm:$num),
+            ".param .align $align .b8 retval$num[$size];",
+            [(DeclareRetParam (i32 imm:$align), (i32 imm:$size),
+                              (i32 imm:$num))]>;
+def DeclareRetScalarInst :
+  NVPTXInst<(outs),
+            (ins i32imm:$size, i32imm:$num),
+            ".param .b$size retval$num;",
+            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetRegInst :
+  NVPTXInst<(outs),
+            (ins i32imm:$size, i32imm:$num),
+            ".reg .b$size retval$num;",
+            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+
+// Declarations of the param<a> objects.  DeclareParam yields an aligned .b8
+// array.  DeclareScalarParam yields a scalar .param when its last operand is
+// 0 and a .reg when it is 1.
+def DeclareParamInst :
+  NVPTXInst<(outs),
+            (ins i32imm:$align, i32imm:$a, i32imm:$size),
+            ".param .align $align .b8 param$a[$size];",
+            [(DeclareParam (i32 imm:$align), (i32 imm:$a),
+                           (i32 imm:$size))]>;
+def DeclareScalarParamInst :
+  NVPTXInst<(outs),
+            (ins i32imm:$a, i32imm:$size),
+            ".param .b$size param$a;",
+            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+def DeclareScalarRegInst :
+  NVPTXInst<(outs),
+            (ins i32imm:$a, i32imm:$size),
+            ".reg .b$size param$a;",
+            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+
+// Selected for MoveParam: a typed mov between two registers of the same
+// class.  `asmstr` is the type suffix appended to "mov".
+class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
+  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+            !strconcat("mov", asmstr, "\t$dst, $src;"),
+            [(set regclass:$dst, (MoveParam regclass:$src))]>;
+
+def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
+def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
+// The 16-bit variant is spelled out by hand because it prints cvt.u16.u32
+// rather than a mov.
+def MoveParamI16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+            "cvt.u16.u32\t$dst, $src;",
+            [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
+def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
+
+// Selected for PseudoUseParam.  The asm text is only a "//" comment naming
+// the register that is being used.
+class PseudoUseParamInst<NVPTXRegClass regclass> :
+  NVPTXInst<(outs), (ins regclass:$src),
+            "// Pseudo use of $src",
+            [(PseudoUseParam regclass:$src)]>;
+
+def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
+def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
+def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
+def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
+def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
+
+
+//
+// Load / Store Handling
+//
+// Scalar load instructions writing a register of class `regclass`, one
+// variant per addressing form:
+//   _avar     - imem address
+//   _areg     - address in a 32-bit register
+//   _areg_64  - address in a 64-bit register
+//   _ari      - 32-bit register plus immediate offset
+//   _ari_64   - 64-bit register plus immediate offset
+//   _asi      - imem address plus immediate offset
+// The isVol/addsp/Vec/Sign operands are LdStCode fields printed as modifiers
+// of the mnemonic, followed by $fromWidth.  No selection pattern is attached.
+multiclass LD<NVPTXRegClass regclass> {
+  def _avar :
+    NVPTXInst<(outs regclass:$dst),
+              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+                   LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+              "\t$dst, [$addr];",
+              []>;
+  def _areg :
+    NVPTXInst<(outs regclass:$dst),
+              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+                   LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+              "\t$dst, [$addr];",
+              []>;
+  def _areg_64 :
+    NVPTXInst<(outs regclass:$dst),
+              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+                   LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+              "\t$dst, [$addr];",
+              []>;
+  def _ari :
+    NVPTXInst<(outs regclass:$dst),
+              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+                   LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+                   i32imm:$offset),
+              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+              "\t$dst, [$addr+$offset];",
+              []>;
+  def _ari_64 :
+    NVPTXInst<(outs regclass:$dst),
+              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+                   LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+                   i32imm:$offset),
+              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+              "\t$dst, [$addr+$offset];",
+              []>;
+  def _asi :
+    NVPTXInst<(outs regclass:$dst),
+              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+                   LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+                   i32imm:$offset),
+              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+              "\t$dst, [$addr+$offset];",
+              []>;
+}
+
+// Scalar load instantiations.  The i8 and i16 variants both use Int16Regs
+// as their destination class.
+let mayLoad = 1, hasSideEffects = 0 in {
+  defm LD_i8  : LD<Int16Regs>;
+  defm LD_i16 : LD<Int16Regs>;
+  defm LD_i32 : LD<Int32Regs>;
+  defm LD_i64 : LD<Int64Regs>;
+  defm LD_f32 : LD<Float32Regs>;
+  defm LD_f64 : LD<Float64Regs>;
+}
+
+// Scalar store instructions reading a register of class `regclass`, one
+// variant per addressing form:
+//   _avar     - imem address
+//   _areg     - address in a 32-bit register
+//   _areg_64  - address in a 64-bit register
+//   _ari      - 32-bit register plus immediate offset
+//   _ari_64   - 64-bit register plus immediate offset
+//   _asi      - imem address plus immediate offset
+// The isVol/addsp/Vec/Sign operands are LdStCode fields printed as modifiers
+// of the mnemonic, followed by $toWidth.  No selection pattern is attached.
+multiclass ST<NVPTXRegClass regclass> {
+  def _avar :
+    NVPTXInst<(outs),
+              (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+                   LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth,
+                   imem:$addr),
+              "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+              " \t[$addr], $src;",
+              []>;
+  def _areg :
+    NVPTXInst<(outs),
+              (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+                   LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth,
+                   Int32Regs:$addr),
+              "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+              " \t[$addr], $src;",
+              []>;
+  def _areg_64 :
+    NVPTXInst<(outs),
+              (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+                   LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth,
+                   Int64Regs:$addr),
+              "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+              " \t[$addr], $src;",
+              []>;
+  def _ari :
+    NVPTXInst<(outs),
+              (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+                   LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth,
+                   Int32Regs:$addr, i32imm:$offset),
+              "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+              " \t[$addr+$offset], $src;",
+              []>;
+  def _ari_64 :
+    NVPTXInst<(outs),
+              (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+                   LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth,
+                   Int64Regs:$addr, i32imm:$offset),
+              "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+              " \t[$addr+$offset], $src;",
+              []>;
+  def _asi :
+    NVPTXInst<(outs),
+              (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+                   LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth,
+                   imem:$addr, i32imm:$offset),
+              "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+              " \t[$addr+$offset], $src;",
+              []>;
+}
+
+// Scalar store instantiations.  The i8 and i16 variants both use Int16Regs
+// as their source class.
+let mayStore = 1, hasSideEffects = 0 in {
+  defm ST_i8  : ST<Int16Regs>;
+  defm ST_i16 : ST<Int16Regs>;
+  defm ST_i32 : ST<Int32Regs>;
+  defm ST_i64 : ST<Int64Regs>;
+  defm ST_f32 : ST<Float32Regs>;
+  defm ST_f64 : ST<Float64Regs>;
+}
+
+// The following is used only in and after vector elementizations. Vector
+// elementization happens at the machine instruction level, so the following
+// instructions never appear in the DAG.
+multiclass LD_VEC<NVPTXRegClass regclass> {
+ def _v2_avar : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_areg : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_areg_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_ari : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v2_ari_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v2_asi : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v4_avar : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_areg : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_areg_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_ari : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ def _v4_ari_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ def _v4_asi : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+}
+let mayLoad=1, hasSideEffects=0 in {
+ defm LDV_i8 : LD_VEC<Int16Regs>;
+ defm LDV_i16 : LD_VEC<Int16Regs>;
+ defm LDV_i32 : LD_VEC<Int32Regs>;
+ defm LDV_i64 : LD_VEC<Int64Regs>;
+ defm LDV_f32 : LD_VEC<Float32Regs>;
+ defm LDV_f64 : LD_VEC<Float64Regs>;
+}
+
+multiclass ST_VEC<NVPTXRegClass regclass> {
+ def _v2_avar : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_areg : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_areg_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_ari : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v2_ari_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v2_asi : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v4_avar : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_areg : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_areg_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_ari : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_ari_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_asi : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+}
+
+let mayStore=1, hasSideEffects=0 in {
+ defm STV_i8 : ST_VEC<Int16Regs>;
+ defm STV_i16 : ST_VEC<Int16Regs>;
+ defm STV_i32 : ST_VEC<Int32Regs>;
+ defm STV_i64 : ST_VEC<Int64Regs>;
+ defm STV_f32 : ST_VEC<Float32Regs>;
+ defm STV_f64 : ST_VEC<Float64Regs>;
+}
+
+
+//---- Conversion ----
+
+class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
+ NVPTXRegClass regclassOut> :
+ NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
+ !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
+ [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
+
+def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
+def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
+def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
+def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+
+// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
+// we cannot specify floating-point literals in isel patterns. Therefore, we
+// use an integer selp to select either 1 or 0 and then cvt to floating-point.
+
+// sint -> f32
+def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
+ (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
+ (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
+ (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
+ (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f32
+def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
+ (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
+ (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
+ (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
+ (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+
+// sint -> f64
+def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
+ (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
+ (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
+ (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
+ (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f64
+def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
+ (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
+ (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
+ (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
+ (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+
+
+// f32 -> sint
+def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
+ (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+ (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+ (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f32 -> uint
+def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
+ (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+ (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+ (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f64 -> sint
+def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
+ (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
+ (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
+ (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
+ (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+
+// f64 -> uint
+def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
+ (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
+ (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
+ (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
+ (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+
+// sext i1
+def : Pat<(i16 (sext Int1Regs:$a)),
+ (SELP_s16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (sext Int1Regs:$a)),
+ (SELP_s32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (sext Int1Regs:$a)),
+ (SELP_s64ii -1, 0, Int1Regs:$a)>;
+
+// zext i1
+def : Pat<(i16 (zext Int1Regs:$a)),
+ (SELP_u16ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (zext Int1Regs:$a)),
+ (SELP_u32ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (zext Int1Regs:$a)),
+ (SELP_u64ii 1, 0, Int1Regs:$a)>;
+
+// anyext i1
+def : Pat<(i16 (anyext Int1Regs:$a)),
+ (SELP_u16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (anyext Int1Regs:$a)),
+ (SELP_u32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (anyext Int1Regs:$a)),
+ (SELP_u64ii -1, 0, Int1Regs:$a)>;
+
+// sext i16
+def : Pat<(i32 (sext Int16Regs:$a)),
+ (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (sext Int16Regs:$a)),
+ (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+
+// zext i16
+def : Pat<(i32 (zext Int16Regs:$a)),
+ (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (zext Int16Regs:$a)),
+ (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// anyext i16
+def : Pat<(i32 (anyext Int16Regs:$a)),
+ (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (anyext Int16Regs:$a)),
+ (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// sext i32
+def : Pat<(i64 (sext Int32Regs:$a)),
+ (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+
+// zext i32
+def : Pat<(i64 (zext Int32Regs:$a)),
+ (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+// anyext i32
+def : Pat<(i64 (anyext Int32Regs:$a)),
+ (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+
+// truncate i64
+def : Pat<(i32 (trunc Int64Regs:$a)),
+ (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i16 (trunc Int64Regs:$a)),
+ (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int64Regs:$a)),
+ (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i32
+def : Pat<(i16 (trunc Int32Regs:$a)),
+ (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int32Regs:$a)),
+ (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i16
+def : Pat<(i1 (trunc Int16Regs:$a)),
+ (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
+
+// sext_inreg
+def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+
+
+// Select instructions with 32-bit predicates
+def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+ (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+ (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
+ (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
+ (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
+ (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+
+
+let hasSideEffects = 0 in {
+ // pack a set of smaller int registers to a larger int register
+ def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2,
+ Int16Regs:$s3, Int16Regs:$s4),
+ "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>;
+ def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2),
+ "mov.b32\t$d, {{$s1, $s2}};", []>;
+ def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int32Regs:$s1, Int32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};", []>;
+ def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+ (ins Float32Regs:$s1, Float32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};", []>;
+
+ // unpack a larger int register to a set of smaller int registers
+ def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+ Int16Regs:$d3, Int16Regs:$d4),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+ def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+ (ins Int32Regs:$s),
+ "mov.b32\t{{$d1, $d2}}, $s;", []>;
+ def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;", []>;
+ def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+ (ins Float64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;", []>;
+}
+
+// Count leading zeros
+let hasSideEffects = 0 in {
+ def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "clz.b32\t$d, $a;", []>;
+ def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "clz.b64\t$d, $a;", []>;
+}
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
+// than 16 bits to store). We also need to subtract 16 because the
+// high-order 16 zeros were counted.
+def : Pat<(ctlz Int16Regs:$a),
+ (SUBi16ri (CVT_u16_u32 (CLZr32
+ (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+ CvtNONE), 16)>;
+
+// Population count
+let hasSideEffects = 0 in {
+ def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "popc.b32\t$d, $a;", []>;
+ def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "popc.b64\t$d, $a;", []>;
+}
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
+// than 16 bits to store)
+def : Pat<(ctpop Int16Regs:$a),
+ (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+
+// fpround f64 -> f32
+def : Pat<(f32 (fpround Float64Regs:$a)),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpround Float64Regs:$a)),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+
+// fpextend f32 -> f64
+def : Pat<(f64 (fpextend Float32Regs:$a)),
+ (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fpextend Float32Regs:$a)),
+ (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+
+def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+// fceil, ffloor, fround, ftrunc.
+
+def : Pat<(fceil Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fceil Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fceil Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(ffloor Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ffloor Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ffloor Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+
+def : Pat<(fround Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fround Float32Regs:$a)),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(f64 (fround Float64Regs:$a)),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+def : Pat<(ftrunc Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ftrunc Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ftrunc Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+
+// nearbyint and rint are implemented as rounding to nearest even. This isn't
+// strictly correct, because it causes us to ignore the rounding mode. But it
+// matches what CUDA's "libm" does.
+
+def : Pat<(fnearbyint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fnearbyint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fnearbyint Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+def : Pat<(frint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(frint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(frint Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+
+//-----------------------------------
+// Control-flow
+//-----------------------------------
+
+let isTerminator=1 in {
+ let isReturn=1, isBarrier=1 in
+ def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+
+ let isBranch=1 in
+ def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+ "@$a bra \t$target;",
+ [(brcond Int1Regs:$a, bb:$target)]>;
+ let isBranch=1 in
+ def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+ "@!$a bra \t$target;", []>;
+
+ let isBranch=1, isBarrier=1 in
+ def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+ "bra.uni \t$target;", [(br bb:$target)]>;
+}
+
+def : Pat<(brcond Int32Regs:$a, bb:$target),
+ (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
+
+// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
+// conditional branch if the target block is the next block so that the code
+// can fall through to the target block. The inversion is done by 'xor
+// condition, 1', which will be translated to (setne condition, -1). Since ptx
+// supports '@!pred bra target', we should use it.
+def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+ (CBranchOther Int1Regs:$a, bb:$target)>;
+
+// Call
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPSideEffect]>;
+
+def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def calltarget : Operand<i32>;
+let isCall=1 in {
+ def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
+}
+
+def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : NVPTXInst<outs, ins, asmstr, pattern>;
+
+def Callseq_Start :
+ NVPTXInst<(outs), (ins i32imm:$amt),
+ "\\{ // callseq $amt\n"
+ "\t.reg .b32 temp_param_reg;",
+ [(callseq_start timm:$amt)]>;
+def Callseq_End :
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\} // callseq $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+// trap instruction
+def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
+
+// Call prototype wrapper
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def CallPrototype :
+ SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def ProtoIdent : Operand<i32> {
+ let PrintMethod = "printProtoIdent";
+}
+def CALL_PROTOTYPE :
+ NVPTXInst<(outs), (ins ProtoIdent:$ident),
+ "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+
+
+include "NVPTXIntrinsics.td"
+
+
+//-----------------------------------
+// Notes
+//-----------------------------------
+// BSWAP is currently expanded. The following would be more efficient:
+// - for < sm_20, use vector scalar mov, as Tesla has native 16-bit registers
+// - for sm_20, use prmt (use vector scalar mov to get the pack and
+// unpack). sm_20 supports native 32-bit registers, but not native 16-bit
+// registers.
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
new file mode 100644
index 000000000000..b0408f12f5b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -0,0 +1,7260 @@
+//===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def immFloat0 : PatLeaf<(fpimm), [{
+ // Matches an fpimm whose value, read as a float, compares equal to 0.0f.
+ return N->getValueAPF().convertToFloat() == 0.0f;
+}]>;
+
+def immFloat1 : PatLeaf<(fpimm), [{
+ // Matches an fpimm whose value, read as a float, compares equal to 1.0f.
+ return N->getValueAPF().convertToFloat() == 1.0f;
+}]>;
+
+def immDouble0 : PatLeaf<(fpimm), [{
+ // Matches an fpimm whose value, read as a double, compares equal to 0.0.
+ return N->getValueAPF().convertToDouble() == 0.0;
+}]>;
+
+def immDouble1 : PatLeaf<(fpimm), [{
+ // Matches an fpimm whose value, read as a double, compares equal to 1.0.
+ return N->getValueAPF().convertToDouble() == 1.0;
+}]>;
+
+
+
+//-----------------------------------
+// Synchronization and shuffle functions
+//-----------------------------------
+let isConvergent = 1 in {
+def INT_BARRIER0 : NVPTXInst<(outs), (ins),
+ "bar.sync \t0;",
+ [(int_nvvm_barrier0)]>;
+def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+ !strconcat("{{ \n\t", // single variadic !strconcat; same resulting string
+ ".reg .pred \t%p1; \n\t", // declare predicate %p1
+ "setp.ne.u32 \t%p1, $pred, 0; \n\t", // %p1 = ($pred != 0)
+ "bar.red.popc.u32 \t$dst, 0, %p1; \n\t", // $dst = bar.red.popc of %p1, barrier 0
+ "}}"),
+ [(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>;
+def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+ !strconcat("{{ \n\t", // single variadic !strconcat; same resulting string
+ ".reg .pred \t%p1; \n\t", // declare predicate %p1 (input)
+ ".reg .pred \t%p2; \n\t", // declare predicate %p2 (result)
+ "setp.ne.u32 \t%p1, $pred, 0; \n\t", // %p1 = ($pred != 0)
+ "bar.red.and.pred \t%p2, 0, %p1; \n\t", // %p2 = bar.red.and of %p1, barrier 0
+ "selp.u32 \t$dst, 1, 0, %p2; \n\t", // $dst = %p2 ? 1 : 0
+ "}}"),
+ [(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>;
+def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+ !strconcat("{{ \n\t", // single variadic !strconcat; same resulting string
+ ".reg .pred \t%p1; \n\t", // declare predicate %p1 (input)
+ ".reg .pred \t%p2; \n\t", // declare predicate %p2 (result)
+ "setp.ne.u32 \t%p1, $pred, 0; \n\t", // %p1 = ($pred != 0)
+ "bar.red.or.pred \t%p2, 0, %p1; \n\t", // %p2 = bar.red.or of %p1, barrier 0
+ "selp.u32 \t$dst, 1, 0, %p2; \n\t", // $dst = %p2 ? 1 : 0
+ "}}"),
+ [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
+
+def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+ [(int_nvvm_bar_sync imm:$i)]>;
+
+// shfl.{up,down,bfly,idx}.b32 -- one record per reg/imm form of $offset and $mask.
+multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
+ // The last two parameters to shfl can be regs or imms. ptxas is smart
+ // enough to inline constant registers, so strictly speaking we don't need to
+ // handle immediates here. But it's easy enough, and it makes our ptx more
+ // readable.
+ def reg : NVPTXInst< // $offset and $mask both in registers
+ (outs regclass:$dst),
+ (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>;
+
+ def imm1 : NVPTXInst< // $offset immediate, $mask in a register
+ (outs regclass:$dst),
+ (ins regclass:$src, i32imm:$offset, Int32Regs:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>;
+
+ def imm2 : NVPTXInst< // $offset in a register, $mask immediate
+ (outs regclass:$dst),
+ (ins regclass:$src, Int32Regs:$offset, i32imm:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>;
+
+ def imm3 : NVPTXInst< // $offset and $mask both immediates
+ (outs regclass:$dst),
+ (ins regclass:$src, i32imm:$offset, i32imm:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>;
+}
+
+defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>;
+defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>;
+defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>;
+defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>;
+defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>;
+defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
+defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
+defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
+
+} // isConvergent = 1
+
+
+//-----------------------------------
+// Explicit Memory Fence Functions
+//-----------------------------------
+class MEMBAR<string AsmText, Intrinsic Intr> : // no operands, no results
+ NVPTXInst<(outs), (ins), AsmText,
+ [(Intr)]>;
+
+def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
+def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
+def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
+
+
+//-----------------------------------
+// Math Functions
+//-----------------------------------
+
+// Map min(1.0, max(0.0, x)) to sat(x).
+// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x): when x is NaN,
+// max(0.0, min(x, 1.0)) evaluates to 1.0 while sat(x) evaluates to 0.0, so only
+// the min-of-max form is matched below.
+// Same story for fmax, fmin.
+
+def : Pat<(int_nvvm_fmin_f immFloat1,
+ (int_nvvm_fmax_f immFloat0, Float32Regs:$a)),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f immFloat1,
+ (int_nvvm_fmax_f Float32Regs:$a, immFloat0)),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+ (int_nvvm_fmax_f immFloat0, Float32Regs:$a), immFloat1),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+ (int_nvvm_fmax_f Float32Regs:$a, immFloat0), immFloat1),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+
+def : Pat<(int_nvvm_fmin_d immDouble1,
+ (int_nvvm_fmax_d immDouble0, Float64Regs:$a)),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d immDouble1,
+ (int_nvvm_fmax_d Float64Regs:$a, immDouble0)),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+ (int_nvvm_fmax_d immDouble0, Float64Regs:$a), immDouble1),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+ (int_nvvm_fmax_d Float64Regs:$a, immDouble0), immDouble1),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+
+
+// One-source intrinsic instruction. The asm string is passed in full (mnemonic
+// and operands) to accommodate cases like INT_PTX_RECIP.
+class F_MATH_1<string AsmStr, NVPTXRegClass DstRC, NVPTXRegClass SrcRC,
+ Intrinsic Intr>
+ : NVPTXInst<(outs DstRC:$dst),
+ (ins SrcRC:$src0), AsmStr,
+ [(set DstRC:$dst, (Intr SrcRC:$src0))]>;
+
+// Two-source intrinsic instruction. The asm string is passed in full (mnemonic
+// and operands) to accommodate cases like INT_PTX_NATIVE_POWR_F.
+class F_MATH_2<string AsmStr, NVPTXRegClass DstRC, NVPTXRegClass Src0RC,
+ NVPTXRegClass Src1RC, Intrinsic Intr>
+ : NVPTXInst<(outs DstRC:$dst),
+ (ins Src0RC:$src0, Src1RC:$src1),
+ AsmStr,
+ [(set DstRC:$dst, (Intr Src0RC:$src0, Src1RC:$src1))]>;
+
+// Three-source intrinsic instruction; the asm string is passed in full.
+class F_MATH_3<string AsmStr, NVPTXRegClass DstRC, NVPTXRegClass Src0RC,
+ NVPTXRegClass Src1RC, NVPTXRegClass Src2RC, Intrinsic Intr>
+ : NVPTXInst<(outs DstRC:$dst),
+ (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2),
+ AsmStr,
+ [(set DstRC:$dst,
+ (Intr Src0RC:$src0, Src1RC:$src1, Src2RC:$src2))]>;
+
+//
+// MISC
+//
+
+def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_clz_i>;
+def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_clz_ll>;
+
+def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_popc_i>;
+def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_popc_ll>;
+
+def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
+
+//
+// Min Max
+//
+
+def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_i>;
+def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_ui>;
+
+def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ll>;
+def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ull>;
+
+def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_i>;
+def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_ui>;
+
+def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ll>;
+def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ull>;
+
+def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_fmin_f>;
+def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
+
+def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_fmax_f>;
+def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>;
+
+def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
+ Float64Regs, Float64Regs, int_nvvm_fmin_d>;
+def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
+ Float64Regs, Float64Regs, int_nvvm_fmax_d>;
+
+//
+// Multiplication
+//
+
+def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_mulhi_i>;
+def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_mulhi_ui>;
+
+def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_mulhi_ll>;
+def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_mulhi_ull>;
+
+def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>;
+def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>;
+def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>;
+def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>;
+def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>;
+def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>;
+def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>;
+def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>;
+
+def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>;
+def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>;
+def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>;
+def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>;
+
+def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;",
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>;
+def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;",
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>;
+
+//
+// Div
+//
+
+def INT_NVVM_DIV_APPROX_FTZ_F
+ : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>;
+def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>;
+
+def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>;
+def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>;
+def INT_NVVM_DIV_RZ_FTZ_F : F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>;
+def INT_NVVM_DIV_RZ_F : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>;
+def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>;
+def INT_NVVM_DIV_RM_F : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>;
+def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>;
+def INT_NVVM_DIV_RP_F : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>;
+
+def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>;
+def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>;
+def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>;
+def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
+
+//
+// Brev
+//
+
+def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_brev32>;
+def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs,
+ int_nvvm_brev64>;
+
+//
+// Sad
+//
+
+def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;",
+ Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>;
+def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
+ Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>;
+
+//
+// Floor Ceil
+//
+
+def : Pat<(int_nvvm_floor_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_floor_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_floor_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+
+def : Pat<(int_nvvm_ceil_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_ceil_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI)>;
+def : Pat<(int_nvvm_ceil_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+
+//
+// Abs
+//
+
+def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_abs_i>;
+def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs,
+ int_nvvm_abs_ll>;
+
+def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_fabs_ftz_f>;
+def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_fabs_f>;
+
+def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_fabs_d>;
+
+//
+// Round
+//
+
+def : Pat<(int_nvvm_round_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_round_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_round_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+//
+// Trunc
+//
+
+def : Pat<(int_nvvm_trunc_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_trunc_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_trunc_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+
+//
+// Saturate
+//
+
+def : Pat<(int_nvvm_saturate_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT_FTZ)>;
+def : Pat<(int_nvvm_saturate_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_saturate_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+
+//
+// Exp2 Log2
+//
+
+def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>;
+def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>;
+def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
+
+def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
+def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>;
+def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>;
+
+//
+// Sin Cos
+//
+
+def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>;
+def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sin_approx_f>;
+
+def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>;
+def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;
+
+//
+// Fma
+//
+
+def INT_NVVM_FMA_RN_FTZ_F
+ : F_MATH_3<"fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_ftz_f>;
+def INT_NVVM_FMA_RN_F : F_MATH_3<"fma.rn.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_f>;
+def INT_NVVM_FMA_RZ_FTZ_F
+ : F_MATH_3<"fma.rz.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_ftz_f>;
+def INT_NVVM_FMA_RZ_F : F_MATH_3<"fma.rz.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_f>;
+def INT_NVVM_FMA_RM_FTZ_F
+ : F_MATH_3<"fma.rm.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_ftz_f>;
+def INT_NVVM_FMA_RM_F : F_MATH_3<"fma.rm.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_f>;
+def INT_NVVM_FMA_RP_FTZ_F
+ : F_MATH_3<"fma.rp.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_ftz_f>;
+def INT_NVVM_FMA_RP_F : F_MATH_3<"fma.rp.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_f>;
+
+def INT_NVVM_FMA_RN_D : F_MATH_3<"fma.rn.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rn_d>;
+def INT_NVVM_FMA_RZ_D : F_MATH_3<"fma.rz.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rz_d>;
+def INT_NVVM_FMA_RM_D : F_MATH_3<"fma.rm.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rm_d>;
+def INT_NVVM_FMA_RP_D : F_MATH_3<"fma.rp.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rp_d>;
+
+//
+// Rcp
+//
+
+def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>;
+def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>;
+def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>;
+def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>;
+def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>;
+def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>;
+def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>;
+def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>;
+
+def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rn_d>;
+def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rz_d>;
+def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rm_d>;
+def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rp_d>;
+
+def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>;
+
+//
+// Sqrt
+//
+
+def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>;
+def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rn_f>;
+def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>;
+def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rz_f>;
+def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>;
+def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rm_f>;
+def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>;
+def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rp_f>;
+def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>;
+def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>;
+
+def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rn_d>;
+def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rz_d>;
+def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rm_d>;
+def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rp_d>;
+
+// nvvm_sqrt intrinsic: select rn vs. approx and ftz vs. non-ftz via predicates.
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>;
+
+//
+// Rsqrt
+//
+
+def INT_NVVM_RSQRT_APPROX_FTZ_F
+ : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs,
+ int_nvvm_rsqrt_approx_ftz_f>;
+def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>;
+def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>;
+
+//
+// Add
+//
+
+def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>;
+def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>;
+def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>;
+def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>;
+def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>;
+def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>;
+def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>;
+def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>;
+
+def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>;
+def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>;
+def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>;
+def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
+
+//
+// Convert
+//
+
+def : Pat<(int_nvvm_d2f_rn_ftz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>;
+def : Pat<(int_nvvm_d2f_rn Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_d2f_rz_ftz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRZ_FTZ)>;
+def : Pat<(int_nvvm_d2f_rz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_d2f_rm_ftz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRM_FTZ)>;
+def : Pat<(int_nvvm_d2f_rm Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_d2f_rp_ftz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRP_FTZ)>;
+def : Pat<(int_nvvm_d2f_rp Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_d2i_rn Float64Regs:$a),
+ (CVT_s32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2i_rz Float64Regs:$a),
+ (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2i_rm Float64Regs:$a),
+ (CVT_s32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2i_rp Float64Regs:$a),
+ (CVT_s32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ui_rn Float64Regs:$a),
+ (CVT_u32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ui_rz Float64Regs:$a),
+ (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ui_rm Float64Regs:$a),
+ (CVT_u32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ui_rp Float64Regs:$a),
+ (CVT_u32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2d_rn Int32Regs:$a),
+ (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2d_rz Int32Regs:$a),
+ (CVT_f64_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2d_rm Int32Regs:$a),
+ (CVT_f64_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2d_rp Int32Regs:$a),
+ (CVT_f64_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2d_rn Int32Regs:$a),
+ (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2d_rz Int32Regs:$a),
+ (CVT_f64_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2d_rm Int32Regs:$a),
+ (CVT_f64_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2d_rp Int32Regs:$a),
+ (CVT_f64_u32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_f2i_rn_ftz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rn Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2i_rz_ftz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2i_rm_ftz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rm Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2i_rp_ftz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rp Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ui_rn_ftz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rn Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ui_rz_ftz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ui_rm_ftz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rm Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ui_rp_ftz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rp Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2f_rn Int32Regs:$a),
+ (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2f_rz Int32Regs:$a),
+ (CVT_f32_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2f_rm Int32Regs:$a),
+ (CVT_f32_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2f_rp Int32Regs:$a),
+ (CVT_f32_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2f_rn Int32Regs:$a),
+ (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2f_rz Int32Regs:$a),
+ (CVT_f32_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2f_rm Int32Regs:$a),
+ (CVT_f32_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a),
+ (CVT_f32_u32 Int32Regs:$a, CvtRP)>;
+
+def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
+ Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
+
+def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t",
+ ".reg .b32 %temp; \n\t", // scratch register for the half that is discarded
+ "mov.b64 \t{$dst, %temp}, $src0;\n\t", // $dst is the first destination element
+ "}}"),
+ Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
+def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t",
+ ".reg .b32 %temp; \n\t", // scratch register for the half that is discarded
+ "mov.b64 \t{%temp, $dst}, $src0;\n\t", // $dst is the second destination element
+ "}}"),
+ Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
+
+def : Pat<(int_nvvm_f2ll_rn_ftz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rn Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ll_rz_ftz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ll_rm_ftz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rm Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ll_rp_ftz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rp Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ull_rn_ftz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rn Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ull_rz_ftz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ull_rm_ftz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rm Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ull_rp_ftz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rp Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRPI)>;
+
+// f64 -> signed 64-bit: int_nvvm_d2ll_<mode> selects CVT_s64_f64 with the
+// Cvt<MODE>I operand of the same suffix (no _ftz variants are listed here).
+def : Pat<(int_nvvm_d2ll_rn Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ll_rz Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ll_rm Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ll_rp Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRPI)>;
+
+// f64 -> unsigned 64-bit: same mapping, selecting CVT_u64_f64.
+def : Pat<(int_nvvm_d2ull_rn Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ull_rz Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ull_rm Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ull_rp Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRPI)>;
+
+// Signed 64-bit -> f32: int_nvvm_ll2f_<mode> selects CVT_f32_s64 with the
+// Cvt<MODE> operand of the same suffix.
+def : Pat<(int_nvvm_ll2f_rn Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2f_rz Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2f_rm Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2f_rp Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRP)>;
+
+// Unsigned 64-bit -> f32: same mapping, selecting CVT_f32_u64.
+def : Pat<(int_nvvm_ull2f_rn Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2f_rz Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2f_rm Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2f_rp Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRP)>;
+
+// Signed 64-bit -> f64: int_nvvm_ll2d_<mode> selects CVT_f64_s64 with the
+// Cvt<MODE> operand of the same suffix.
+def : Pat<(int_nvvm_ll2d_rn Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2d_rz Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2d_rm Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2d_rp Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRP)>;
+
+// Unsigned 64-bit -> f64: same mapping, selecting CVT_f64_u64.
+def : Pat<(int_nvvm_ull2d_rn Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2d_rz Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2d_rm Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2d_rp Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRP)>;
+
+
+// FIXME: Ideally, we could use these patterns instead of the scope-creating
+// patterns, but ptxas does not like these since .s16 is not compatible with
+// .f16. The solution is to use .bXX for all integer register types, but we
+// are not there yet.
+//def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
+// (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
+//def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
+// (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+//
+//def : Pat<(int_nvvm_h2f Int16Regs:$a),
+// (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+
+// Scope-creating lowering for int_nvvm_f2h_rn_ftz: the asm declares a scratch
+// .b16 register %temp, runs cvt.rn.ftz.f16.f32 into it, then moves %temp to
+// $dst.
+def INT_NVVM_F2H_RN_FTZ
+  : F_MATH_1<!strconcat("{{\n\t",
+                        ".reg .b16 %temp;\n\t",
+                        "cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t",
+                        "mov.b16 \t$dst, %temp;\n",
+                        "}}"),
+             Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>;
+// Scope-creating lowering for int_nvvm_f2h_rn: the asm declares a scratch
+// .b16 register %temp, runs cvt.rn.f16.f32 into it, then moves %temp to $dst.
+def INT_NVVM_F2H_RN
+  : F_MATH_1<!strconcat("{{\n\t",
+                        ".reg .b16 %temp;\n\t",
+                        "cvt.rn.f16.f32 \t%temp, $src0;\n\t",
+                        "mov.b16 \t$dst, %temp;\n",
+                        "}}"),
+             Int16Regs, Float32Regs, int_nvvm_f2h_rn>;
+
+// Scope-creating lowering for int_nvvm_h2f: the asm copies $src0 into a
+// scratch .b16 register %temp and then runs cvt.f32.f16 from %temp into $dst.
+def INT_NVVM_H2F
+  : F_MATH_1<!strconcat("{{\n\t",
+                        ".reg .b16 %temp;\n\t",
+                        "mov.b16 \t%temp, $src0;\n\t",
+                        "cvt.f32.f16 \t$dst, %temp;\n\t",
+                        "}}"),
+             Float32Regs, Int16Regs, int_nvvm_h2f>;
+
+// Generic half <-> f32 conversion nodes. The two fp_to_f16 patterns match the
+// same DAG; the first is additionally guarded by doF32FTZ and uses CvtRN_FTZ,
+// the second is unguarded and uses CvtRN. Their relative order is kept.
+def : Pat<(f32 (f16_to_fp Int16Regs:$a)),
+          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
+          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>,
+      Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
+          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+
+// Generic half <-> f64 conversion nodes.
+def : Pat<(f64 (f16_to_fp Int16Regs:$a)),
+          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float64Regs:$a)),
+          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+
+//
+// Bitcast
+//
+
+// 32-bit bitcast intrinsics: both directions print a plain mov.b32.
+def INT_NVVM_BITCAST_F2I
+  : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs, Float32Regs,
+             int_nvvm_bitcast_f2i>;
+def INT_NVVM_BITCAST_I2F
+  : F_MATH_1<"mov.b32 \t$dst, $src0;", Float32Regs, Int32Regs,
+             int_nvvm_bitcast_i2f>;
+
+// 64-bit bitcast intrinsics: both directions print a plain mov.b64.
+def INT_NVVM_BITCAST_LL2D
+  : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs, Int64Regs,
+             int_nvvm_bitcast_ll2d>;
+def INT_NVVM_BITCAST_D2LL
+  : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs, Float64Regs,
+             int_nvvm_bitcast_d2ll>;
+
+//-----------------------------------
+// Atomic Functions
+//-----------------------------------
+
+// PatFrag whose predicate accepts node N only when
+// ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL) returns true.
+class ATOMIC_GLOBAL_CHK<dag ops, dag frag> : PatFrag<ops, frag, [{
+  return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+}]>;
+// PatFrag whose predicate accepts node N only when
+// ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED) returns true.
+class ATOMIC_SHARED_CHK<dag ops, dag frag> : PatFrag<ops, frag, [{
+  return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+}]>;
+// PatFrag whose predicate accepts node N only when
+// ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC) returns true.
+class ATOMIC_GENERIC_CHK<dag ops, dag frag> : PatFrag<ops, frag, [{
+  return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+}]>;
+
+// Two-operand atomic (address + one data operand) for a single pointer
+// register class. Produces:
+//   reg - $b comes from a register of class regclass
+//   imm - $b is an immediate (operand IMMType, matched through node IMM)
+// Both print "atom" followed by SpaceStr, OpcStr and TypeStr, then the
+// "$dst, [$addr], $b;" operand list, and both are guarded by Pred.
+multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+                          string SpaceStr, string TypeStr, string OpcStr,
+                          PatFrag IntOp, Operand IMMType, SDNode IMM,
+                          Predicate Pred> {
+  def reg : NVPTXInst<
+      (outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+      !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"),
+      [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+    Requires<[Pred]>;
+  def imm : NVPTXInst<
+      (outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
+      !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"),
+      [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
+    Requires<[Pred]>;
+}
+// Instantiates F_ATOMIC_2_imp twice: p32 with Int32Regs as the address
+// register class and p64 with Int64Regs. All other arguments are forwarded
+// unchanged.
+multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+                      string OpcStr, PatFrag IntOp, Operand IMMType,
+                      SDNode IMM, Predicate Pred> {
+  defm p32
+    : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr, IntOp,
+                     IMMType, IMM, Pred>;
+  defm p64
+    : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr, IntOp,
+                     IMMType, IMM, Pred>;
+}
+
+// Two-operand atomic whose data operand is negated before use. The asm opens
+// a {{ }} scope, declares a scratch register "temp" of type .s<TypeStr>,
+// writes neg.s<TypeStr> of $b into it, and then issues
+// atom<SpaceStr><OpcStr>.u<TypeStr> with temp as the data operand.
+// Only a register form is defined; IMMType is accepted but not referenced in
+// this body.
+multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+                              string SpaceStr, string TypeStr, string OpcStr,
+                              PatFrag IntOp, Operand IMMType, Predicate Pred> {
+  def reg : NVPTXInst<
+      (outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+      !strconcat("{{ \n\t",
+                 ".reg \t.s", TypeStr, " temp; \n\t",
+                 "neg.s", TypeStr, " \ttemp, $b; \n\t",
+                 "atom", SpaceStr, OpcStr, ".u", TypeStr,
+                 " \t$dst, [$addr], temp; \n\t",
+                 "}}"),
+      [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+    Requires<[Pred]>;
+}
+// Instantiates F_ATOMIC_2_NEG_imp twice: p32 with Int32Regs as the address
+// register class and p64 with Int64Regs. All other arguments are forwarded
+// unchanged.
+multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
+                          string TypeStr, string OpcStr, PatFrag IntOp,
+                          Operand IMMType, Predicate Pred> {
+  defm p32
+    : F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr, IntOp,
+                         IMMType, Pred>;
+  defm p64
+    : F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr, IntOp,
+                         IMMType, Pred>;
+}
+
+// Three-operand atomic (address + data operands $b and $c) for a single
+// pointer register class. Produces:
+//   reg  - $b and $c both in registers
+//   imm1 - $b immediate, $c register
+//   imm2 - $b register,  $c immediate
+//   imm3 - $b and $c both immediate
+// Immediates use operand IMMType and are matched with the generic `imm` node.
+// Every variant prints "atom" followed by SpaceStr, OpcStr and TypeStr, then
+// the "$dst, [$addr], $b, $c;" operand list, and is guarded by Pred.
+multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+                          string SpaceStr, string TypeStr, string OpcStr,
+                          PatFrag IntOp, Operand IMMType, Predicate Pred> {
+  def reg : NVPTXInst<
+      (outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, regclass:$c),
+      !strconcat("atom", SpaceStr, OpcStr, TypeStr,
+                 " \t$dst, [$addr], $b, $c;"),
+      [(set regclass:$dst,
+            (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
+    Requires<[Pred]>;
+  def imm1 : NVPTXInst<
+      (outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, regclass:$c),
+      !strconcat("atom", SpaceStr, OpcStr, TypeStr,
+                 " \t$dst, [$addr], $b, $c;"),
+      [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
+    Requires<[Pred]>;
+  def imm2 : NVPTXInst<
+      (outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, IMMType:$c),
+      !strconcat("atom", SpaceStr, OpcStr, TypeStr,
+                 " \t$dst, [$addr], $b, $c;"),
+      [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
+    Requires<[Pred]>;
+  def imm3 : NVPTXInst<
+      (outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, IMMType:$c),
+      !strconcat("atom", SpaceStr, OpcStr, TypeStr,
+                 " \t$dst, [$addr], $b, $c;"),
+      [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
+    Requires<[Pred]>;
+}
+// Instantiates F_ATOMIC_3_imp twice: p32 with Int32Regs as the address
+// register class and p64 with Int64Regs. All other arguments are forwarded
+// unchanged.
+multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+                      string OpcStr, PatFrag IntOp, Operand IMMType,
+                      Predicate Pred> {
+  defm p32
+    : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr, IntOp,
+                     IMMType, Pred>;
+  defm p64
+    : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr, IntOp,
+                     IMMType, Pred>;
+}
+
+// atom_add
+
+// Address-space-checked fragments for atomic add. The _g/_s/_gen suffix picks
+// the GLOBAL/SHARED/GENERIC check class; the integer fragments wrap
+// atomic_load_add_{32,64}, the f32 ones wrap int_nvvm_atomic_load_add_f32.
+def atomic_load_add_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_f32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+
+// atom.add instructions, 32-bit integer. The *_GEN_*_USE_G variants match the
+// generic-address fragment but print ".global" under their own predicate.
+defm INT_PTX_ATOM_ADD_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add", atomic_load_add_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_ADD_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add", atomic_load_add_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_ADD_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".u32", ".add", atomic_load_add_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_ADD_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add", atomic_load_add_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+
+// atom.add instructions, 64-bit integer.
+defm INT_PTX_ATOM_ADD_G_64
+  : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add", atomic_load_add_64_g,
+               i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_ADD_S_64
+  : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add", atomic_load_add_64_s,
+               i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_ADD_GEN_64
+  : F_ATOMIC_2<Int64Regs, "", ".u64", ".add", atomic_load_add_64_gen,
+               i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_ADD_GEN_64_USE_G
+  : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add", atomic_load_add_64_gen,
+               i64imm, imm, useAtomRedG64forGen64>;
+
+// atom.add instructions, f32; all three are guarded by hasAtomAddF32.
+defm INT_PTX_ATOM_ADD_G_F32
+  : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add", atomic_load_add_f32_g,
+               f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_S_F32
+  : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add", atomic_load_add_f32_s,
+               f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_GEN_F32
+  : F_ATOMIC_2<Float32Regs, "", ".f32", ".add", atomic_load_add_f32_gen,
+               f32imm, fpimm, hasAtomAddF32>;
+
+// atom_sub
+
+// Address-space-checked fragments wrapping atomic_load_sub_{32,64}; the
+// _g/_s/_gen suffix picks the GLOBAL/SHARED/GENERIC check class.
+def atomic_load_sub_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+
+// Atomic subtract, built on F_ATOMIC_2_NEG with OpcStr ".add": the operand is
+// negated and then atomically added. TypeStr is the bare width ("32"/"64")
+// because the multiclass supplies the .s/.u prefixes itself.
+defm INT_PTX_ATOM_SUB_G_32
+  : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add", atomic_load_sub_32_g,
+                   i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SUB_G_64
+  : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add", atomic_load_sub_64_g,
+                   i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SUB_GEN_32
+  : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add", atomic_load_sub_32_gen,
+                   i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SUB_GEN_32_USE_G
+  : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add", atomic_load_sub_32_gen,
+                   i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SUB_S_32
+  : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add", atomic_load_sub_32_s,
+                   i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SUB_S_64
+  : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add", atomic_load_sub_64_s,
+                   i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SUB_GEN_64
+  : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add", atomic_load_sub_64_gen,
+                   i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SUB_GEN_64_USE_G
+  : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add", atomic_load_sub_64_gen,
+                   i64imm, useAtomRedG64forGen64>;
+
+// atom_swap
+
+// Address-space-checked fragments wrapping atomic_swap_{32,64}; the
+// _g/_s/_gen suffix picks the GLOBAL/SHARED/GENERIC check class.
+def atomic_swap_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+
+// atom.exch instructions (untyped .b32/.b64) for the atomic swap fragments.
+defm INT_PTX_ATOM_SWAP_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch", atomic_swap_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SWAP_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch", atomic_swap_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SWAP_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch", atomic_swap_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SWAP_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch", atomic_swap_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SWAP_G_64
+  : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch", atomic_swap_64_g,
+               i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SWAP_S_64
+  : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch", atomic_swap_64_s,
+               i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SWAP_GEN_64
+  : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch", atomic_swap_64_gen,
+               i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SWAP_GEN_64_USE_G
+  : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch", atomic_swap_64_gen,
+               i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_max
+
+// Address-space-checked fragments wrapping atomic_load_max_{32,64} and
+// atomic_load_umax_{32,64}; the _g/_s/_gen suffix picks the
+// GLOBAL/SHARED/GENERIC check class.
+def atomic_load_max_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_umax_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_64 node:$a, node:$b)>;
+
+// atom.max instructions: the signed fragments print .s32/.s64, the unsigned
+// (UMAX) fragments print .u32/.u64.
+defm INT_PTX_ATOM_LOAD_MAX_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".max", atomic_load_max_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MAX_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".s32", ".max", atomic_load_max_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".s32", ".max", atomic_load_max_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".max", atomic_load_max_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_G_64
+  : F_ATOMIC_2<Int64Regs, ".global", ".s64", ".max", atomic_load_max_64_g,
+               i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MAX_S_64
+  : F_ATOMIC_2<Int64Regs, ".shared", ".s64", ".max", atomic_load_max_64_s,
+               i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64
+  : F_ATOMIC_2<Int64Regs, "", ".s64", ".max", atomic_load_max_64_gen,
+               i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G
+  : F_ATOMIC_2<Int64Regs, ".global", ".s64", ".max", atomic_load_max_64_gen,
+               i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".max", atomic_load_umax_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".max", atomic_load_umax_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".u32", ".max", atomic_load_umax_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".max", atomic_load_umax_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_64
+  : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".max", atomic_load_umax_64_g,
+               i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_64
+  : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".max", atomic_load_umax_64_s,
+               i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64
+  : F_ATOMIC_2<Int64Regs, "", ".u64", ".max", atomic_load_umax_64_gen,
+               i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G
+  : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".max", atomic_load_umax_64_gen,
+               i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_min
+
+// Address-space-checked fragments wrapping atomic_load_min_{32,64} and
+// atomic_load_umin_{32,64}; the _g/_s/_gen suffix picks the
+// GLOBAL/SHARED/GENERIC check class.
+def atomic_load_min_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_umin_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_64 node:$a, node:$b)>;
+
+// atom.min instructions: the signed fragments print .s32/.s64, the unsigned
+// (UMIN) fragments print .u32/.u64.
+defm INT_PTX_ATOM_LOAD_MIN_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".min", atomic_load_min_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MIN_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".s32", ".min", atomic_load_min_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".s32", ".min", atomic_load_min_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".min", atomic_load_min_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_G_64
+  : F_ATOMIC_2<Int64Regs, ".global", ".s64", ".min", atomic_load_min_64_g,
+               i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MIN_S_64
+  : F_ATOMIC_2<Int64Regs, ".shared", ".s64", ".min", atomic_load_min_64_s,
+               i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64
+  : F_ATOMIC_2<Int64Regs, "", ".s64", ".min", atomic_load_min_64_gen,
+               i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G
+  : F_ATOMIC_2<Int64Regs, ".global", ".s64", ".min", atomic_load_min_64_gen,
+               i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".min", atomic_load_umin_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".min", atomic_load_umin_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".u32", ".min", atomic_load_umin_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".min", atomic_load_umin_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_64
+  : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".min", atomic_load_umin_64_g,
+               i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_64
+  : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".min", atomic_load_umin_64_s,
+               i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64
+  : F_ATOMIC_2<Int64Regs, "", ".u64", ".min", atomic_load_umin_64_gen,
+               i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G
+  : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".min", atomic_load_umin_64_gen,
+               i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_inc atom_dec
+
+// Address-space-checked fragments wrapping the int_nvvm_atomic_load_inc_32
+// and int_nvvm_atomic_load_dec_32 intrinsics; only 32-bit forms are defined.
+def atomic_load_inc_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_dec_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+
+// atom.inc / atom.dec instructions; all use .u32.
+defm INT_PTX_ATOM_INC_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc", atomic_load_inc_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_INC_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc", atomic_load_inc_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_INC_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc", atomic_load_inc_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_INC_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc", atomic_load_inc_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_DEC_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec", atomic_load_dec_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_DEC_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec", atomic_load_dec_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_DEC_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec", atomic_load_dec_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_DEC_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec", atomic_load_dec_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_and
+
+// Address-space-checked fragments wrapping atomic_load_and_{32,64}; the
+// _g/_s/_gen suffix picks the GLOBAL/SHARED/GENERIC check class.
+def atomic_load_and_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_64 node:$a, node:$b)>;
+
+// atom.and instructions (untyped .b32/.b64).
+defm INT_PTX_ATOM_AND_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", atomic_load_and_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_AND_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and", atomic_load_and_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_AND_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".b32", ".and", atomic_load_and_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_AND_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", atomic_load_and_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_AND_G_64
+  : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and", atomic_load_and_64_g,
+               i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_AND_S_64
+  : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and", atomic_load_and_64_s,
+               i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_AND_GEN_64
+  : F_ATOMIC_2<Int64Regs, "", ".b64", ".and", atomic_load_and_64_gen,
+               i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_AND_GEN_64_USE_G
+  : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and", atomic_load_and_64_gen,
+               i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_or
+
+// Address-space-checked fragments wrapping atomic_load_or_{32,64}; the
+// _g/_s/_gen suffix picks the GLOBAL/SHARED/GENERIC check class.
+def atomic_load_or_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_64 node:$a, node:$b)>;
+
+// atom.or instructions (untyped .b32/.b64). Definition order is kept as in
+// the original: global, generic, generic-via-global, shared.
+defm INT_PTX_ATOM_OR_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", atomic_load_or_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_OR_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".b32", ".or", atomic_load_or_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_OR_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", atomic_load_or_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_OR_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or", atomic_load_or_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_OR_G_64
+  : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or", atomic_load_or_64_g,
+               i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_OR_GEN_64
+  : F_ATOMIC_2<Int64Regs, "", ".b64", ".or", atomic_load_or_64_gen,
+               i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_OR_GEN_64_USE_G
+  : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or", atomic_load_or_64_gen,
+               i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_OR_S_64
+  : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or", atomic_load_or_64_s,
+               i64imm, imm, hasAtomRedS64>;
+
+// atom_xor
+
+// Address-space-checked fragments wrapping atomic_load_xor_{32,64}; the
+// _g/_s/_gen suffix picks the GLOBAL/SHARED/GENERIC check class.
+def atomic_load_xor_32_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_64_g : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_s : ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_gen : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_64 node:$a, node:$b)>;
+
+// atom.xor instructions (untyped .b32/.b64).
+defm INT_PTX_ATOM_XOR_G_32
+  : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", atomic_load_xor_32_g,
+               i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_XOR_S_32
+  : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor", atomic_load_xor_32_s,
+               i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_XOR_GEN_32
+  : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor", atomic_load_xor_32_gen,
+               i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_XOR_GEN_32_USE_G
+  : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", atomic_load_xor_32_gen,
+               i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_XOR_G_64
+  : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor", atomic_load_xor_64_g,
+               i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_XOR_S_64
+  : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor", atomic_load_xor_64_s,
+               i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_XOR_GEN_64
+  : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor", atomic_load_xor_64_gen,
+               i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_XOR_GEN_64_USE_G
+  : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor", atomic_load_xor_64_gen,
+               i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_cas
+
+// Address-space-checked fragments wrapping atomic_cmp_swap_{32,64}. These
+// take three operands ($a, $b, $c); the _g/_s/_gen suffix picks the
+// GLOBAL/SHARED/GENERIC check class.
+def atomic_cmp_swap_32_g
+  : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+                      (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_s
+  : ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+                      (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_gen
+  : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+                       (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_g
+  : ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+                      (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_s
+  : ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+                      (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_gen
+  : ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+                       (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+
+// atom.cas instructions (untyped .b32/.b64), built on the three-operand
+// F_ATOMIC_3 multiclass.
+defm INT_PTX_ATOM_CAS_G_32
+  : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas", atomic_cmp_swap_32_g,
+               i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_CAS_S_32
+  : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas", atomic_cmp_swap_32_s,
+               i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_CAS_GEN_32
+  : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas", atomic_cmp_swap_32_gen,
+               i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_CAS_GEN_32_USE_G
+  : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas", atomic_cmp_swap_32_gen,
+               i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_CAS_G_64
+  : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas", atomic_cmp_swap_64_g,
+               i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_CAS_S_64
+  : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas", atomic_cmp_swap_64_s,
+               i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_CAS_GEN_64
+  : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas", atomic_cmp_swap_64_gen,
+               i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_CAS_GEN_64_USE_G
+  : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas", atomic_cmp_swap_64_gen,
+               i64imm, useAtomRedG64forGen64>;
+
+// Support for scoped atomic operations. Matches
+// int_nvvm_atomic_{op}_{space}_{type}_{scope}
+// and converts it into the appropriate instruction.
+// NOTE: not all possible combinations are implemented
+// 'space' is limited to generic as it's the only one needed to support CUDA.
+// 'scope' = 'gpu' is default and is handled by regular atomic instructions.
+// Common instruction shape for the scoped atomics: a single $result output in
+// regclass, caller-supplied input operand list `ins`, asm string AsmStr, and
+// a selection pattern that sets $result from the `Operands` dag. The
+// instruction is guarded by the predicate list Preds.
+class ATOM23_impl<string AsmStr, NVPTXRegClass regclass, list<Predicate> Preds,
+                  dag ins, dag Operands>
+  : NVPTXInst<(outs regclass:$result), ins, AsmStr,
+              [(set regclass:$result, Operands)]>,
+    Requires<Preds>;
+
+// Define instruction variants for all addressing modes.
+// Two-operand scoped atomic. Emits four anonymous ATOM23_impl instructions:
+// $b as a register (AddedComplexity = 1) and $b as an immediate, each with a
+// 32-bit and a 64-bit address register for $src.
+multiclass ATOM2P_impl<string AsmStr, Intrinsic Intr, NVPTXRegClass regclass,
+                       Operand ImmType, SDNode Imm, ValueType ImmTy,
+                       list<Predicate> Preds> {
+  // Register forms.
+  let AddedComplexity = 1 in {
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int32Regs:$src, regclass:$b),
+                      (Intr Int32Regs:$src, regclass:$b)>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int64Regs:$src, regclass:$b),
+                      (Intr Int64Regs:$src, regclass:$b)>;
+  }
+  // Immediate forms. tablegen cannot infer argument types from an Intrinsic
+  // (it can from an Instruction), so the immediate is given a specific type
+  // through an explicit cast to ImmTy.
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int32Regs:$src, ImmType:$b),
+                    (Intr Int32Regs:$src, (ImmTy Imm:$b))>;
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int64Regs:$src, ImmType:$b),
+                    (Intr Int64Regs:$src, (ImmTy Imm:$b))>;
+}
+
+// Three-operand atomic (address $src plus values $b and $c). Emits eight
+// anonymous instructions: {32-bit, 64-bit address} x the four
+// register/immediate combinations of $b and $c.
+multiclass ATOM3P_impl<string AsmStr, Intrinsic Intr,
+ NVPTXRegClass regclass, Operand ImmType,
+ SDNode Imm, ValueType ImmTy,
+ list<Predicate> Preds> {
+ // Variants for register/immediate permutations of $b and $c
+ // Both operands in registers: AddedComplexity = 2.
+ let AddedComplexity = 2 in {
+ def : ATOM23_impl<AsmStr, regclass, Preds,
+ (ins Int32Regs:$src, regclass:$b, regclass:$c),
+ (Intr Int32Regs:$src, regclass:$b, regclass:$c)>;
+ def : ATOM23_impl<AsmStr, regclass, Preds,
+ (ins Int64Regs:$src, regclass:$b, regclass:$c),
+ (Intr Int64Regs:$src, regclass:$b, regclass:$c)>;
+ }
+ // Exactly one of $b / $c is an immediate: AddedComplexity = 1.
+ let AddedComplexity = 1 in {
+ def : ATOM23_impl<AsmStr, regclass, Preds,
+ (ins Int32Regs:$src, ImmType:$b, regclass:$c),
+ (Intr Int32Regs:$src, (ImmTy Imm:$b), regclass:$c)>;
+ def : ATOM23_impl<AsmStr, regclass, Preds,
+ (ins Int64Regs:$src, ImmType:$b, regclass:$c),
+ (Intr Int64Regs:$src, (ImmTy Imm:$b), regclass:$c)>;
+ def : ATOM23_impl<AsmStr, regclass, Preds,
+ (ins Int32Regs:$src, regclass:$b, ImmType:$c),
+ (Intr Int32Regs:$src, regclass:$b, (ImmTy Imm:$c))>;
+ def : ATOM23_impl<AsmStr, regclass, Preds,
+ (ins Int64Regs:$src, regclass:$b, ImmType:$c),
+ (Intr Int64Regs:$src, regclass:$b, (ImmTy Imm:$c))>;
+ }
+ // Both operands are immediates: AddedComplexity is not set here.
+ def : ATOM23_impl<AsmStr, regclass, Preds,
+ (ins Int32Regs:$src, ImmType:$b, ImmType:$c),
+ (Intr Int32Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>;
+ def : ATOM23_impl<AsmStr, regclass, Preds,
+ (ins Int64Regs:$src, ImmType:$b, ImmType:$c),
+ (Intr Int64Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>;
+}
+
+// Constructs intrinsic name and instruction asm strings.
+// Builds, for a two-operand atomic, the asm string
+//   "atom[.<space>][.<scope>].<op>.<type> \t$result, [$src], $b;"
+// and looks up the intrinsic record named
+//   int_nvvm_atomic_<op>_<space>_<inttype>[_<scope>]
+// then forwards both to ATOM2P_impl. The space qualifier is omitted from the
+// asm string when SpaceStr is "gen".
+// NOTE(review): the asm string drops the scope suffix when ScopeStr is "gpu",
+// while the intrinsic name drops it when ScopeStr is "". ATOM2S_impl only
+// passes "cta" and "sys", so neither special case is exercised from there.
+multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr,
+ string ScopeStr, string SpaceStr,
+ NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+ ValueType ImmTy, list<Predicate> Preds> {
+ defm : ATOM2P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
+ # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
+ # "." # OpStr # "." # TypeStr
+ # " \t$result, [$src], $b;",
+ !cast<Intrinsic>(
+ "int_nvvm_atomic_" # OpStr
+ # "_" # SpaceStr # "_" # IntTypeStr
+ # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
+ regclass, ImmType, Imm, ImmTy, Preds>;
+}
+// Three-operand counterpart of the name/asm-string builder: produces
+//   "atom[.<space>][.<scope>].<op>.<type> \t$result, [$src], $b, $c;"
+// and the intrinsic int_nvvm_atomic_<op>_<space>_<inttype>[_<scope>], then
+// forwards both to ATOM3P_impl.
+// NOTE(review): as written, the asm string omits the scope when ScopeStr is
+// "gpu" but the intrinsic name omits it when ScopeStr is ""; ATOM3S_impl only
+// passes "cta" and "sys".
+multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
+ string ScopeStr, string SpaceStr,
+ NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+ ValueType ImmTy, list<Predicate> Preds> {
+ defm : ATOM3P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
+ # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
+ # "." # OpStr # "." # TypeStr
+ # " \t$result, [$src], $b, $c;",
+ !cast<Intrinsic>(
+ "int_nvvm_atomic_" # OpStr
+ # "_" # SpaceStr # "_" # IntTypeStr
+ # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
+ regclass, ImmType, Imm, ImmTy, Preds>;
+}
+
+// Constructs variants for different address spaces.
+// For now we only need variants for generic space pointers.
+// Two-operand form: instantiates ATOM2N_impl once, with SpaceStr fixed to
+// "gen"; all other parameters are forwarded unchanged.
+multiclass ATOM2A_impl<string OpStr, string IntTypeStr, string TypeStr,
+ string ScopeStr, NVPTXRegClass regclass, Operand ImmType,
+ SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
+ defm _gen_ : ATOM2N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
+ regclass, ImmType, Imm, ImmTy, Preds>;
+}
+// Three-operand form: instantiates ATOM3N_impl once, with SpaceStr fixed to
+// "gen"; all other parameters are forwarded unchanged.
+multiclass ATOM3A_impl<string OpStr, string IntTypeStr, string TypeStr,
+ string ScopeStr, NVPTXRegClass regclass, Operand ImmType,
+ SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
+ defm _gen_ : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
+ regclass, ImmType, Imm, ImmTy, Preds>;
+}
+
+// Constructs variants for different scopes of atomic op.
+// Two-operand form: one instantiation per explicit scope ("cta" and "sys").
+// hasAtomScope is appended to the caller's predicate list for both.
+multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr,
+ NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+ ValueType ImmTy, list<Predicate> Preds> {
+ // .gpu scope is default and is currently covered by existing
+ // atomics w/o explicitly specified scope.
+ defm _cta : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "cta",
+ regclass, ImmType, Imm, ImmTy,
+ !listconcat(Preds,[hasAtomScope])>;
+ defm _sys : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "sys",
+ regclass, ImmType, Imm, ImmTy,
+ !listconcat(Preds,[hasAtomScope])>;
+}
+// Three-operand form: one instantiation per explicit scope ("cta" and "sys").
+// hasAtomScope is appended to the caller's predicate list for both.
+multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr,
+ NVPTXRegClass regclass, Operand ImmType, SDNode Imm, ValueType ImmTy,
+ list<Predicate> Preds> {
+ // No need to define ".gpu"-scoped atomics. They do the same thing
+ // as the regular, non-scoped atomics defined elsewhere.
+ defm _cta : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "cta",
+ regclass, ImmType, Imm, ImmTy,
+ !listconcat(Preds,[hasAtomScope])>;
+ defm _sys : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "sys",
+ regclass, ImmType, Imm, ImmTy,
+ !listconcat(Preds,[hasAtomScope])>;
+}
+
+// atom.add
+// Scoped add for s32, u32, u64, f32 and f64. Integer variants pass "i" as the
+// intrinsic type string and no extra predicate; the floating-point variants
+// pass "f" and require hasAtomAddF32 / hasAtomAddF64 respectively.
+multiclass ATOM2_add_impl<string OpStr> {
+ defm _s32 : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>;
+ defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
+ defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64, []>;
+ defm _f32 : ATOM2S_impl<OpStr, "f", "f32", Float32Regs, f32imm, fpimm, f32,
+ [hasAtomAddF32]>;
+ defm _f64 : ATOM2S_impl<OpStr, "f", "f64", Float64Regs, f64imm, fpimm, f64,
+ [hasAtomAddF64]>;
+}
+
+// atom.{and,or,xor}
+// Scoped bitwise ops for b32 and b64; only the b64 variant carries an extra
+// predicate (hasAtomBitwise64).
+multiclass ATOM2_bitwise_impl<string OpStr> {
+ defm _b32 : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
+ defm _b64 : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64,
+ [hasAtomBitwise64]>;
+}
+
+// atom.exch
+// Scoped exchange for b32 and b64; neither variant adds a predicate beyond
+// what ATOM2S_impl appends.
+multiclass ATOM2_exch_impl<string OpStr> {
+ defm _b32 : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
+ defm _b64 : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>;
+}
+
+// atom.{min,max}
+// Scoped min/max for s32, u32, s64 and u64; the two 64-bit variants require
+// hasAtomMinMax64.
+multiclass ATOM2_minmax_impl<string OpStr> {
+ defm _s32 : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>;
+ defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
+ defm _s64 : ATOM2S_impl<OpStr, "i", "s64", Int64Regs, i64imm, imm, i64,
+ [hasAtomMinMax64]>;
+ defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64,
+ [hasAtomMinMax64]>;
+}
+
+// atom.{inc,dec}
+// Scoped increment/decrement; only a u32 variant is defined.
+multiclass ATOM2_incdec_impl<string OpStr> {
+ defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
+}
+
+// atom.cas
+// Scoped compare-and-swap for b32 and b64, built on the three-operand
+// ATOM3S_impl.
+multiclass ATOM3_cas_impl<string OpStr> {
+ defm _b32 : ATOM3S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
+ defm _b64 : ATOM3S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>;
+}
+
+// One instantiation per scoped atomic operation. The string argument is the
+// OpStr that ends up in both the asm mnemonic and the intrinsic name.
+defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">;
+defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">;
+defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">;
+defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">;
+defm INT_PTX_SATOM_EXCH: ATOM2_exch_impl<"exch">;
+defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">;
+defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">;
+defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">;
+defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">;
+defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
+
+//-----------------------------------
+// Support for ldu on sm_20 or later
+//-----------------------------------
+
+// Don't annotate ldu instructions as mayLoad, as they load from memory that is
+// read-only in a kernel.
+
+// Scalar
+
+// Scalar ldu.global instruction forms. TyStr is appended verbatim after
+// "ldu.global."; every form requires hasLDU and has an empty pattern list.
+multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
+  // Address operand: 32-bit register.
+  def areg : NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+                       "ldu.global." # TyStr, []>,
+             Requires<[hasLDU]>;
+  // Address operand: 64-bit register.
+  def areg64 : NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+                         "ldu.global." # TyStr, []>,
+               Requires<[hasLDU]>;
+  // Address operand: imemAny.
+  def avar : NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
+                       "ldu.global." # TyStr, []>,
+             Requires<[hasLDU]>;
+  // Address operand: MEMri.
+  def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+                      "ldu.global." # TyStr, []>,
+            Requires<[hasLDU]>;
+  // Address operand: MEMri64.
+  def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+                        "ldu.global." # TyStr, []>,
+              Requires<[hasLDU]>;
+}
+
+// Scalar ldu instantiations. The i8 variant uses the "u8" type string with
+// Int16Regs as its result class; the p32/p64 variants use the same strings and
+// register classes as i32/i64.
+defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+
+// vector
+
+// Elementized vector ldu
+// Two-result ldu.global forms ($dst1, $dst2), one per address operand kind.
+// TyStr is appended verbatim after "ldu.global."; no pattern and no Requires
+// clause is attached.
+multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+  def _areg32 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                          (ins Int32Regs:$src),
+                          "ldu.global." # TyStr, []>;
+  def _areg64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                          (ins Int64Regs:$src),
+                          "ldu.global." # TyStr, []>;
+  def _ari32 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                         (ins MEMri:$src),
+                         "ldu.global." # TyStr, []>;
+  def _ari64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                         (ins MEMri64:$src),
+                         "ldu.global." # TyStr, []>;
+  def _avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                        (ins imemAny:$src),
+                        "ldu.global." # TyStr, []>;
+}
+
+// Four-result ldu.global forms ($dst1..$dst4), one per address operand kind.
+// TyStr is appended verbatim after "ldu.global."; no pattern and no Requires
+// clause is attached.
+multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
+  def _areg32 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                               regclass:$dst3, regclass:$dst4),
+                          (ins Int32Regs:$src),
+                          "ldu.global." # TyStr, []>;
+  def _areg64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                               regclass:$dst3, regclass:$dst4),
+                          (ins Int64Regs:$src),
+                          "ldu.global." # TyStr, []>;
+  def _ari32 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                              regclass:$dst3, regclass:$dst4),
+                         (ins MEMri:$src),
+                         "ldu.global." # TyStr, []>;
+  def _ari64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                              regclass:$dst3, regclass:$dst4),
+                         (ins MEMri64:$src),
+                         "ldu.global." # TyStr, []>;
+  def _avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                             regclass:$dst3, regclass:$dst4),
+                        (ins imemAny:$src),
+                        "ldu.global." # TyStr, []>;
+}
+
+// Vector ldu instantiations. v2 forms cover u8/u16/u32/f32/u64/f64; v4 forms
+// cover u8/u16/u32/f32 only. The u8 variants use Int16Regs result registers.
+defm INT_PTX_LDU_G_v2i8_ELE
+ : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE
+ : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE
+ : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE
+ : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE
+ : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE
+ : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDU_G_v4i8_ELE
+ : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE
+ : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE
+ : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE
+ : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Float32Regs>;
+
+
+//-----------------------------------
+// Support for ldg on sm_35 or later
+//-----------------------------------
+
+// Don't annotate ld.global.nc as mayLoad, because these loads go through the
+// non-coherent texture cache, and therefore the values read must be read-only
+// during the lifetime of the kernel.
+
+// Scalar ld.global.nc instruction forms. TyStr is appended verbatim after
+// "ld.global.nc."; every form requires hasLDG and has an empty pattern list.
+multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
+  // Address operand: 32-bit register.
+  def areg : NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+                       "ld.global.nc." # TyStr, []>,
+             Requires<[hasLDG]>;
+  // Address operand: 64-bit register.
+  def areg64 : NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+                         "ld.global.nc." # TyStr, []>,
+               Requires<[hasLDG]>;
+  // Address operand: imemAny.
+  def avar : NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
+                       "ld.global.nc." # TyStr, []>,
+             Requires<[hasLDG]>;
+  // Address operand: MEMri.
+  def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+                      "ld.global.nc." # TyStr, []>,
+            Requires<[hasLDG]>;
+  // Address operand: MEMri64.
+  def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+                        "ld.global.nc." # TyStr, []>,
+              Requires<[hasLDG]>;
+}
+
+// Scalar ldg instantiations. The i8 variant uses the "u8" type string with
+// Int16Regs as its result class; the p32/p64 variants use the same strings and
+// register classes as i32/i64.
+defm INT_PTX_LDG_GLOBAL_i8
+ : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i16
+ : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i32
+ : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_i64
+ : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f32
+ : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDG_GLOBAL_f64
+ : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDG_GLOBAL_p32
+ : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_p64
+ : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+
+// vector
+
+// Elementized vector ldg
+// Two-result ld.global.nc forms ($dst1, $dst2), one per address operand kind.
+// TyStr is appended verbatim after "ld.global.nc."; no pattern and no
+// Requires clause is attached.
+multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+  def _areg32 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                          (ins Int32Regs:$src),
+                          "ld.global.nc." # TyStr, []>;
+  def _areg64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                          (ins Int64Regs:$src),
+                          "ld.global.nc." # TyStr, []>;
+  def _ari32 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                         (ins MEMri:$src),
+                         "ld.global.nc." # TyStr, []>;
+  def _ari64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                         (ins MEMri64:$src),
+                         "ld.global.nc." # TyStr, []>;
+  def _avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                        (ins imemAny:$src),
+                        "ld.global.nc." # TyStr, []>;
+}
+
+// Four-result ld.global.nc forms ($dst1..$dst4), one per address operand
+// kind. TyStr is appended verbatim after "ld.global.nc."; no pattern and no
+// Requires clause is attached.
+multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
+  def _areg32 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                               regclass:$dst3, regclass:$dst4),
+                          (ins Int32Regs:$src),
+                          "ld.global.nc." # TyStr, []>;
+  def _areg64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                               regclass:$dst3, regclass:$dst4),
+                          (ins Int64Regs:$src),
+                          "ld.global.nc." # TyStr, []>;
+  def _ari32 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                              regclass:$dst3, regclass:$dst4),
+                         (ins MEMri:$src),
+                         "ld.global.nc." # TyStr, []>;
+  def _ari64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                              regclass:$dst3, regclass:$dst4),
+                         (ins MEMri64:$src),
+                         "ld.global.nc." # TyStr, []>;
+  def _avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                             regclass:$dst3, regclass:$dst4),
+                        (ins imemAny:$src),
+                        "ld.global.nc." # TyStr, []>;
+}
+
+// Vector ldg instantiations. v2 forms cover u8/u16/u32/f32/u64/f64; v4 forms
+// cover u8/u16/u32/f32 only. The u8 variants use Int16Regs result registers.
+// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
+defm INT_PTX_LDG_G_v2i8_ELE
+ : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v2i16_ELE
+ : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v2i32_ELE
+ : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v2f32_ELE
+ : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDG_G_v2i64_ELE
+ : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDG_G_v2f64_ELE
+ : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDG_G_v4i8_ELE
+ : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i16_ELE
+ : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i32_ELE
+ : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v4f32_ELE
+ : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
+
+
+// Matches the intrinsic `Intrin` on 32-bit and 64-bit register operands.
+// The _yes forms emit "cvta.<Str>.u32/.u64" and require hasGenericLdSt; the
+// _no forms match the same intrinsic, emit a plain mov, and carry no
+// predicate.
+multiclass NG_TO_G<string Str, Intrinsic Intrin> {
+  def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+                       "cvta." # Str # ".u32 \t$result, $src;",
+                       [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+             Requires<[hasGenericLdSt]>;
+  def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+                          "cvta." # Str # ".u64 \t$result, $src;",
+                          [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+                Requires<[hasGenericLdSt]>;
+
+// @TODO: Are these actually needed? I believe global addresses will be copied
+// to register values anyway.
+ /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+ [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+ Requires<[hasGenericLdSt]>;
+ def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+ [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+ Requires<[hasGenericLdSt]>;*/
+
+  def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+                      "mov.u32 \t$result, $src;",
+                      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+  def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+                         "mov.u64 \t$result, $src;",
+                         [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+
+// @TODO: Are these actually needed? I believe global addresses will be copied
+// to register values anyway.
+ /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;
+ def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/
+}
+
+// Matches the intrinsic `Intrin` on 32-bit and 64-bit register operands.
+// The _yes forms emit "cvta.to.<Str>.u32/.u64" and require hasGenericLdSt;
+// the _no forms match the same intrinsic, emit a plain mov, and carry no
+// predicate.
+multiclass G_TO_NG<string Str, Intrinsic Intrin> {
+  def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+                       "cvta.to." # Str # ".u32 \t$result, $src;",
+                       [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+             Requires<[hasGenericLdSt]>;
+  def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+                          "cvta.to." # Str # ".u64 \t$result, $src;",
+                          [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+                Requires<[hasGenericLdSt]>;
+  def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+                      "mov.u32 \t$result, $src;",
+                      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+  def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+                         "mov.u64 \t$result, $src;",
+                         [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+}
+
+// <space> -> generic conversions, one per address space. Note that the "const"
+// string is paired with the *_constant_* intrinsic names.
+defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
+defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>;
+defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>;
+defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen>;
+
+// generic -> <space> conversions.
+defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>;
+defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>;
+defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>;
+defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant>;
+
+
+// nvvm.ptr.gen.to.param
+// Both widths are emitted as a plain register move (mov.u32 / mov.u64); no
+// predicate is attached.
+def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result),
+ (ins Int32Regs:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result,
+ (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>;
+def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result),
+ (ins Int64Regs:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result,
+ (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>;
+
+
+// nvvm.move intrinsics
+// Integer moves use the untyped .b16/.b32/.b64 mov forms.
+def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
+ "mov.b16 \t$r, $s;",
+ [(set Int16Regs:$r,
+ (int_nvvm_move_i16 Int16Regs:$s))]>;
+def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+ "mov.b32 \t$r, $s;",
+ [(set Int32Regs:$r,
+ (int_nvvm_move_i32 Int32Regs:$s))]>;
+def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+ "mov.b64 \t$r, $s;",
+ [(set Int64Regs:$r,
+ (int_nvvm_move_i64 Int64Regs:$s))]>;
+// Floating-point moves use mov.f32 / mov.f64.
+def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s),
+ "mov.f32 \t$r, $s;",
+ [(set Float32Regs:$r,
+ (int_nvvm_move_float Float32Regs:$s))]>;
+def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s),
+ "mov.f64 \t$r, $s;",
+ [(set Float64Regs:$r,
+ (int_nvvm_move_double Float64Regs:$s))]>;
+// Pointer moves: the same intrinsic (int_nvvm_move_ptr) is matched on 32-bit
+// and 64-bit registers and emitted as mov.u32 / mov.u64.
+def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+ "mov.u32 \t$r, $s;",
+ [(set Int32Regs:$r,
+ (int_nvvm_move_ptr Int32Regs:$s))]>;
+def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+ "mov.u64 \t$r, $s;",
+ [(set Int64Regs:$r,
+ (int_nvvm_move_ptr Int64Regs:$s))]>;
+
+// @TODO: Are these actually needed, or will we always just see symbols
+// copied to registers first?
+/*def nvvm_move_sym32 : NVPTXInst<(outs Int32Regs:$r), (ins imem:$s),
+ "mov.u32 \t$r, $s;",
+ [(set Int32Regs:$r,
+ (int_nvvm_move_ptr texternalsym:$s))]>;
+def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s),
+ "mov.u64 \t$r, $s;",
+ [(set Int64Regs:$r,
+ (int_nvvm_move_ptr texternalsym:$s))]>;*/
+
+
+// MoveParam %r1, param
+// ptr_local_to_gen %r2, %r1
+// ptr_gen_to_local %r3, %r2
+// ->
+// mov %r1, param
+
+// @TODO: Revisit this. There is a type
+// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym
+// instructions are not currently defined. However, we can use the ptr
+// variants and the asm printer will do the right thing.
+// 64-bit result: gen_to_local(local_to_gen(MoveParam sym)) becomes a single
+// nvvm_move_ptr64 of the symbol.
+def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+ (MoveParam texternalsym:$src)))),
+ (nvvm_move_ptr64 texternalsym:$src)>;
+// 32-bit result: same fold, using nvvm_move_ptr32.
+def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+ (MoveParam texternalsym:$src)))),
+ (nvvm_move_ptr32 texternalsym:$src)>;
+
+// Moves an imem operand into a 64-bit register with mov.u64. No selection
+// pattern is attached here.
+def texsurf_handles
+ : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+ "mov.u64 \t$result, $src;", []>;
+
+//-----------------------------------
+// Compiler Error Warn
+// - Just ignore them in codegen
+//-----------------------------------
+
+// Each instruction matches the intrinsic on a 32-bit or 64-bit register
+// operand, has no outputs, and its assembly text is only a "//" comment line.
+def INT_NVVM_COMPILER_WARN_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+ "// llvm.nvvm.compiler.warn()",
+ [(int_nvvm_compiler_warn Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_WARN_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+ "// llvm.nvvm.compiler.warn()",
+ [(int_nvvm_compiler_warn Int64Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+ "// llvm.nvvm.compiler.error()",
+ [(int_nvvm_compiler_error Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+ "// llvm.nvvm.compiler.error()",
+ [(int_nvvm_compiler_error Int64Regs:$a)]>;
+
+
+// isspacep
+// One instruction per address space (const, global, local, shared) and per
+// pointer width. Each writes an Int1Regs result $d from address $a. Only the
+// const variants carry a predicate (hasPTX31).
+
+def ISSPACEP_CONST_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.const \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>,
+ Requires<[hasPTX31]>;
+def ISSPACEP_CONST_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.const \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>,
+ Requires<[hasPTX31]>;
+def ISSPACEP_GLOBAL_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.global \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>;
+def ISSPACEP_GLOBAL_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.global \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>;
+def ISSPACEP_LOCAL_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.local \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>;
+def ISSPACEP_LOCAL_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.local \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>;
+def ISSPACEP_SHARED_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.shared \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>;
+def ISSPACEP_SHARED_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.shared \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>;
+
+
+// Special register reads
+// MOV_SPECIAL copies a SpecialRegs operand into a 32-bit register with
+// mov.b32 and has no pattern of its own; the Pat records below map each
+// int_nvvm_read_ptx_sreg_envregN intrinsic (N = 0..31) onto it with the
+// matching ENVREGN register.
+def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
+ (ins SpecialRegs:$r),
+ "mov.b32\t$d, $r;", []>;
+
+def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
+
+
+// rotate builtin support
+// 32-bit rotate. With hasHWROT32 the intrinsic maps to shf.l.wrap.b32 with
+// $src supplied as both data inputs; with noHWROT32 it is rewritten to the
+// ROT32imm_sw / ROTL32reg_sw instructions (defined outside this excerpt).
+
+def ROTATE_B32_HW_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]> ;
+
+def ROTATE_B32_HW_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]> ;
+
+// Software fallback, immediate amount: passes $amt and SUB_FRM_32 of $amt.
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]> ;
+
+// Software fallback, register amount.
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
+ (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]> ;
+
+// Helpers for splitting a 64-bit register into 32-bit halves and joining two
+// 32-bit registers into one 64-bit register. None has a selection pattern;
+// all are marked hasSideEffects = 0.
+let hasSideEffects = 0 in {
+  // $dst is the first element of the destination pair of the mov.b64; the
+  // second element goes to a scratch register declared in the same scope.
+  def GET_LO_INT64
+    : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+                "{{\n\t"
+                ".reg .b32 %dummy;\n\t"
+                "mov.b64 \t{$dst,%dummy}, $src;\n\t"
+                "}}",
+                []>;
+
+  // $dst is the second element of the destination pair of the mov.b64.
+  def GET_HI_INT64
+    : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+                "{{\n\t"
+                ".reg .b32 %dummy;\n\t"
+                "mov.b64 \t{%dummy,$dst}, $src;\n\t"
+                "}}",
+                []>;
+
+  // Packs the pair {$lo, $hi} into $dst with a single mov.b64.
+  def PACK_TWO_INT32
+    : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+                "mov.b64 \t$dst, {{$lo, $hi}};", []>;
+}
+
+// Swaps the 32-bit halves: the high half of $src is passed as PACK_TWO_INT32's
+// $lo operand and the low half as its $hi operand.
+def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
+ (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src))> ;
+
+// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so
+// no side effects.
+// Four pattern-less instructions: {left, right} x {immediate, register}
+// shift amount, each taking $lo, $hi and $amt and gated on hasHWROT32.
+let hasSideEffects = 0 in {
+ def SHF_L_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+ def SHF_L_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+ def SHF_R_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+ def SHF_R_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+}
+
+// HW version of rotate 64
+// The 64-bit source is split with GET_LO_INT64 / GET_HI_INT64, each result
+// half is produced by one 32-bit funnel shift over the two source halves, and
+// the halves are rejoined with PACK_TWO_INT32.
+// NOTE(review): each half comes from a single 32-bit funnel shift, so this
+// presumably only yields a true 64-bit rotate for amounts below 32 - confirm.
+
+// Rotate left, immediate amount.
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+// Rotate left, register amount.
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
+
+
+// Rotate right, immediate amount. The lo/hi operand order of each funnel
+// shift is the reverse of the rotate-left patterns above.
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+// Rotate right, register amount.
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
+
+// SW version of rotate 64 (selected when noHWROT32 holds).
+// The immediate forms hand ROT64imm_sw two amounts derived from $amt: the
+// amount itself and its complement. For a 64-bit rotate the complement must
+// be produced by SUB_FRM_64 (presumably 64 - amt; the transform is defined
+// outside this excerpt), not by the 32-bit SUB_FRM_32 used for ROT32imm_sw.
+// Rotate-left passes ($amt, complement); rotate-right passes the same two
+// values in the opposite order.
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>,
+      Requires<[noHWROT32]>;
+// Rotate left, register amount.
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+          (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]>;
+// Rotate right, immediate amount.
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
+      Requires<[noHWROT32]>;
+// Rotate right, register amount.
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+          (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]>;
+
+
+//-----------------------------------
+// Texture Intrinsics
+//-----------------------------------
+
+// NOTE: For Fermi support, any new texture/surface/sampler intrinsics must be
+// also defined in NVPTXReplaceImageHandles.cpp
+
+// texmode_independent
+let IsTex = 1, IsTexModeUnified = 0 in {
+// Texture fetch instructions using handles
+// 1D texture fetches returning four Float32Regs ($r, $g, $b, $a). $t and $s
+// are 64-bit register operands (presumably the texture and sampler handles);
+// $x is the coordinate, as s32 or f32. The _LEVEL form adds $lod and the _GRAD
+// form adds $gradx/$grady. No selection patterns are attached here.
+def TEX_1D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+ "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+ "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod),
+ "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], $lod;",
+ []>;
+def TEX_1D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_1D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+ "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+ "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], $lod;",
+ []>;
+def TEX_1D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_1D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+ "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+ "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], $lod;",
+ []>;
+def TEX_1D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+def TEX_1D_ARRAY_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_1D_ARRAY_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_1D_ARRAY_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_1D_ARRAY_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_1D_ARRAY_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_1D_ARRAY_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+def TEX_2D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_2D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_2D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_2D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_2D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_2D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+
+// 2D-array fetch printed as `tex.a2d.v4.f32.s32`: $l is an extra Int32Regs
+// operand that leads the coordinate vector (presumably the array layer
+// index), followed by the Int32Regs coordinates $x and $y.
+// NOTE(review): $y is printed twice so the coordinate vector has four
+// elements; this looks like deliberate padding for the a2d operand form
+// rather than a typo -- confirm against the PTX ISA before changing it.
+def TEX_2D_ARRAY_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_2D_ARRAY_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_2D_ARRAY_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_2D_ARRAY_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_2D_ARRAY_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_2D_ARRAY_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+
+// 3D fetch printed as `tex.3d.v4.f32.s32`: four Float32Regs results and
+// three Int32Regs coordinates ($x, $y, $z) alongside the $t/$s handles.
+// NOTE(review): $z is printed twice so the coordinate vector has four
+// elements; this looks like deliberate padding for the 3d operand form
+// rather than a typo -- confirm against the PTX ISA before changing it.
+def TEX_3D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_3D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+def TEX_3D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_3D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+def TEX_3D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_3D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+
+def TEX_CUBE_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_CUBE_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_CUBE_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_CUBE_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_CUBE_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_CUBE_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+
+def TEX_CUBE_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_CUBE_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+def TEX_CUBE_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_CUBE_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+def TEX_CUBE_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_CUBE_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+
+// 2D `tld4` fetch printed as `tld4.r.2d.v4.f32.f32`: four Float32Regs
+// results ($v0..$v3), the $t/$s handle operands, and Float32Regs coordinates
+// $x and $y. The sibling _G/_B/_A records differ only in the component
+// letter of the mnemonic (presumably the channel being gathered).
+def TLD4_R_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_G_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_B_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_A_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_R_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_G_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_B_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_A_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_R_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_G_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_B_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_A_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+}
+
+
+// texmode_unified
+let IsTex = 1, IsTexModeUnified = 1 in {
+// Texture fetch instructions using handles
+// Unified-mode (IsTexModeUnified = 1) counterpart of the 1D f32/s32 fetch:
+// same `tex.1d.v4.f32.s32` mnemonic and four Float32Regs results, but the
+// operand list carries only the single Int64Regs handle $t -- there is no
+// separate $s operand -- and the address is printed as [$t, {$x}].
+def TEX_UNIFIED_1D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x),
+ "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x),
+ "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod),
+ "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_UNIFIED_1D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x),
+ "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x),
+ "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_UNIFIED_1D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x),
+ "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x),
+ "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+def TEX_UNIFIED_1D_ARRAY_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+def TEX_UNIFIED_2D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_UNIFIED_2D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_UNIFIED_2D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+
+// 2D-array ("a2d") texture fetch. $l is an Int32Regs operand printed first
+// in the coordinate vector (presumably the array layer index), followed by
+// $x and $y.
+// $y is deliberately printed twice: there are only three coordinate
+// operands, but the brace group is emitted with four elements.
+// NOTE(review): presumably PTX requires a 4-element vector for a2d and
+// ignores the last element -- confirm against the PTX ISA before changing.
+def TEX_UNIFIED_2D_ARRAY_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+
+// 3D texture fetch with three Int32Regs coordinates and four Float32Regs
+// results. The coordinate group is printed with four elements by repeating
+// $z; the repeat is intentional padding, not a typo.
+// NOTE(review): presumably the fourth element is ignored by PTX for the 3d
+// geometry -- confirm against the PTX ISA.
+def TEX_UNIFIED_3D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+// "tex.grad" form of the 3D f32/f32 fetch. Takes three coordinates plus six
+// gradient operands ($gradx0..2, $grady0..2). All three brace groups are
+// printed with four elements by repeating the last operand of each group
+// ($z, $gradx2, $grady2).
+// NOTE(review): the repeated element is presumably padding that PTX
+// ignores -- confirm against the PTX ISA.
+def TEX_UNIFIED_3D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+def TEX_UNIFIED_3D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_3D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+def TEX_UNIFIED_3D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_3D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+
+// Cube-map texture fetch ("tex.cube") with three Float32Regs coordinates and
+// four Float32Regs results. As with the 3D records, $z is printed twice so
+// the coordinate group has four elements.
+def TEX_UNIFIED_CUBE_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_CUBE_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_CUBE_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+
+// Cube-map array fetch ("tex.acube"). The Int32Regs operand $l is printed
+// first, then the three Float32Regs coordinates. With four distinct operands
+// the coordinate group is already four elements wide, so nothing is
+// repeated here (unlike the a2d/3d/cube records).
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+
+// "tld4" 2D record: two Float32Regs coordinates in, four Float32Regs results
+// ($v0..$v3) out. The ".r" in the mnemonic is the only difference from the
+// sibling G/B/A records, which print ".g", ".b" and ".a" instead
+// (presumably the component selector -- confirm against the PTX ISA).
+def TLD4_UNIFIED_R_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_G_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_B_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_A_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_R_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_G_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_B_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_A_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_R_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_G_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_B_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_A_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+}
+
+
+
+//=== Surface load instructions
+// .clamp variant
+let IsSuld = 1 in {
+// Single-result 1D surface load ("suld.b.1d.b8") in .clamp mode. Operands are
+// the 64-bit handle $s and one Int32Regs coordinate $x.
+// The result register class is Int16Regs even though the mnemonic is b8; the
+// I8 records in every suld group do the same.
+// NOTE(review): presumably because no 8-bit register class is used for
+// these values -- confirm.
+// The enclosing scope sets IsSuld = 1; the two-result and four-result groups
+// that follow use 2 and 3 respectively.
+def SULD_1D_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b8.clamp \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b16.clamp \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b32.clamp \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b64.clamp \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b8.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b16.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b32.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b64.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b8.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b16.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b32.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b64.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+// Single-result 2D-array surface load ("suld.b.a2d.b8") in .clamp mode.
+// Coordinates are printed as {$l, $x, $y, $y}: $y is repeated so the group
+// has four elements, matching the a2d texture records.
+// NOTE(review): presumably padding ignored by PTX -- confirm against the
+// PTX ISA before changing.
+def SULD_2D_ARRAY_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b8.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b16.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b32.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b64.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+def SULD_3D_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b8.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b16.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b32.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b64.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 2 in {
+// Two-result (".v2") 1D surface load in .clamp mode: results $r and $g are
+// both Int16Regs for the b8 element size. Lives in the IsSuld = 2 scope.
+def SULD_1D_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b8.clamp \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b16.clamp \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b32.clamp \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b64.clamp \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+def SULD_3D_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 3 in {
+// Four-result (".v4") 1D surface load in .clamp mode: results $r, $g, $b, $a
+// are Int16Regs for the b8 element size. Lives in the IsSuld = 3 scope.
+// Note that this .clamp v4 group defines only I8, I16 and I32 records; unlike
+// the scalar and v2 groups it has no I64 record.
+def SULD_1D_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+
+def SULD_3D_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+
+// .trap variant
+let IsSuld = 1 in {
+// .trap counterpart of SULD_1D_I8_CLAMP: identical operands and register
+// classes, with ".trap" in the mnemonic in place of ".clamp".
+// NOTE(review): the suffix presumably selects the out-of-bounds handling
+// mode of the surface access -- confirm against the PTX ISA.
+def SULD_1D_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b64.trap \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b64.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b64.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b64.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+def SULD_3D_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b64.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 2 in { // .trap surface loads, .v2 forms: every def in this group has two results ($r, $g) and an empty pattern list. NOTE(review): IsSuld looks like it encodes the result-count class (scalar defs use 1, .v4 defs use 3) - confirm against its declaration.
+def SULD_1D_V2I8_TRAP // b8 results are declared in Int16Regs, the same register class the b16 forms use
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+// 1D array (a1d) forms: an extra Int32Regs operand $l precedes $x, both in the operand list and in the coordinate vector.
+def SULD_1D_ARRAY_V2I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+// 2D forms: the coordinate vector is {$x, $y}.
+def SULD_2D_V2I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+// 2D array (a2d) forms: the coordinate vector is written {$l, $x, $y, $y}, so $y is emitted twice. NOTE(review): presumably pads the vector to four elements as PTX expects - confirm against the PTX ISA.
+def SULD_2D_ARRAY_V2I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b64.trap \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+// 3D forms: the coordinate vector is written {$x, $y, $z, $z}, so $z is emitted twice (same four-entry shape as the a2d forms).
+def SULD_3D_V2I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 3 in { // .trap surface loads, .v4 forms: every def in this group has four results ($r, $g, $b, $a) and an empty pattern list; only b8/b16/b32 element sizes are defined here (no b64).
+def SULD_1D_V4I8_TRAP // b8 results are declared in Int16Regs, the same register class the b16 forms use
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+// 1D array (a1d) forms: an extra Int32Regs operand $l precedes $x, both in the operand list and in the coordinate vector.
+def SULD_1D_ARRAY_V4I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+// 2D forms: the coordinate vector is {$x, $y}.
+def SULD_2D_V4I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+// 2D array (a2d) forms: the coordinate vector is written {$l, $x, $y, $y}, so $y is emitted twice. NOTE(review): presumably pads the vector to four elements as PTX expects - confirm against the PTX ISA.
+def SULD_2D_ARRAY_V4I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+// 3D forms: the coordinate vector is written {$x, $y, $z, $z}, so $z is emitted twice (same four-entry shape as the a2d forms).
+def SULD_3D_V4I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+// .zero variant: the asm strings of the defs below end the opcode in ".zero", and the record names end in _ZERO.
+let IsSuld = 1 in { // scalar forms: every def in this group has a single result $r and an empty pattern list
+def SULD_1D_I8_ZERO // b8 results are declared in Int16Regs, the same register class the b16 forms use
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b8.zero \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b16.zero \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b32.zero \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b64.zero \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+// 1D array (a1d) forms: an extra Int32Regs operand $l precedes $x, both in the operand list and in the coordinate vector.
+def SULD_1D_ARRAY_I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b8.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b16.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b32.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b64.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+// 2D forms: the coordinate vector is {$x, $y}.
+def SULD_2D_I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b8.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b16.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b32.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b64.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+// 2D array (a2d) forms: the coordinate vector is written {$l, $x, $y, $y}, so $y is emitted twice. NOTE(review): presumably pads the vector to four elements as PTX expects - confirm against the PTX ISA.
+def SULD_2D_ARRAY_I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b8.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b16.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b32.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b64.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+// 3D forms: the coordinate vector is written {$x, $y, $z, $z}, so $z is emitted twice (same four-entry shape as the a2d forms).
+def SULD_3D_I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b8.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b16.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b32.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b64.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 2 in { // .zero surface loads, .v2 forms: every def in this group has two results ($r, $g) and an empty pattern list. NOTE(review): IsSuld looks like it encodes the result-count class (scalar defs use 1, .v4 defs use 3) - confirm against its declaration.
+def SULD_1D_V2I8_ZERO // b8 results are declared in Int16Regs, the same register class the b16 forms use
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+// 1D array (a1d) forms: an extra Int32Regs operand $l precedes $x, both in the operand list and in the coordinate vector.
+def SULD_1D_ARRAY_V2I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+// 2D forms: the coordinate vector is {$x, $y}.
+def SULD_2D_V2I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+// 2D array (a2d) forms: the coordinate vector is written {$l, $x, $y, $y}, so $y is emitted twice. NOTE(review): presumably pads the vector to four elements as PTX expects - confirm against the PTX ISA.
+def SULD_2D_ARRAY_V2I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b8.zero \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b16.zero \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b32.zero \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b64.zero \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+// 3D forms: the coordinate vector is written {$x, $y, $z, $z}, so $z is emitted twice (same four-entry shape as the a2d forms).
+def SULD_3D_V2I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 3 in { // .zero surface loads, .v4 forms: every def in this group has four results ($r, $g, $b, $a) and an empty pattern list; only b8/b16/b32 element sizes are defined here (no b64).
+def SULD_1D_V4I8_ZERO // b8 results are declared in Int16Regs, the same register class the b16 forms use
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+// 1D array (a1d) forms: an extra Int32Regs operand $l precedes $x, both in the operand list and in the coordinate vector.
+def SULD_1D_ARRAY_V4I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+// 2D forms: the coordinate vector is {$x, $y}.
+def SULD_2D_V4I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+// 2D array (a2d) forms: the coordinate vector is written {$l, $x, $y, $y}, so $y is emitted twice. NOTE(review): presumably pads the vector to four elements as PTX expects - confirm against the PTX ISA.
+def SULD_2D_ARRAY_V4I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+// 3D forms: the coordinate vector is written {$x, $y, $z, $z}, so $z is emitted twice (same four-entry shape as the a2d forms).
+def SULD_3D_V4I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+//-----------------------------------
+// Texture Query Intrinsics
+//-----------------------------------
+
+let IsSurfTexQuery = 1 in { // eight txq.* query instructions, all with the same shape: one Int64Regs operand $a (printed as [$a]) and one Int32Regs result $d
+def TXQ_CHANNEL_ORDER // pattern list is empty on every def here; the anonymous Pat records after this group map the int_nvvm_txq_* intrinsics onto them
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.channel_order.b32 \t$d, [$a];",
+ []>;
+def TXQ_CHANNEL_DATA_TYPE
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.channel_data_type.b32 \t$d, [$a];",
+ []>;
+def TXQ_WIDTH
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.width.b32 \t$d, [$a];",
+ []>;
+def TXQ_HEIGHT
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.height.b32 \t$d, [$a];",
+ []>;
+def TXQ_DEPTH
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.depth.b32 \t$d, [$a];",
+ []>;
+def TXQ_ARRAY_SIZE
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.array_size.b32 \t$d, [$a];",
+ []>;
+def TXQ_NUM_SAMPLES
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.num_samples.b32 \t$d, [$a];",
+ []>;
+def TXQ_NUM_MIPMAP_LEVELS
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.num_mipmap_levels.b32 \t$d, [$a];",
+ []>;
+}
+
+def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a), // selection patterns: each int_nvvm_txq_<query> intrinsic on an Int64Regs operand maps 1:1 to the TXQ_<QUERY> instruction, passing $a through unchanged
+ (TXQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a),
+ (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_width Int64Regs:$a),
+ (TXQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_height Int64Regs:$a),
+ (TXQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_depth Int64Regs:$a),
+ (TXQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_array_size Int64Regs:$a),
+ (TXQ_ARRAY_SIZE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a),
+ (TXQ_NUM_SAMPLES Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a),
+ (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>;
+
+
+//-----------------------------------
+// Surface Query Intrinsics
+//-----------------------------------
+
+let IsSurfTexQuery = 1 in { // six suq.* query instructions, all with the same shape: one Int64Regs operand $a (printed as [$a]) and one Int32Regs result $d; unlike the TXQ_* group there are no num_samples / num_mipmap_levels queries here
+def SUQ_CHANNEL_ORDER // pattern list is empty on every def here; the anonymous Pat records after this group map the int_nvvm_suq_* intrinsics onto them
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.channel_order.b32 \t$d, [$a];",
+ []>;
+def SUQ_CHANNEL_DATA_TYPE
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.channel_data_type.b32 \t$d, [$a];",
+ []>;
+def SUQ_WIDTH
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.width.b32 \t$d, [$a];",
+ []>;
+def SUQ_HEIGHT
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.height.b32 \t$d, [$a];",
+ []>;
+def SUQ_DEPTH
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.depth.b32 \t$d, [$a];",
+ []>;
+def SUQ_ARRAY_SIZE
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.array_size.b32 \t$d, [$a];",
+ []>;
+}
+
+def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a), // selection patterns: each int_nvvm_suq_<query> intrinsic on an Int64Regs operand maps 1:1 to the SUQ_<QUERY> instruction, passing $a through unchanged
+ (SUQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a),
+ (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_width Int64Regs:$a),
+ (SUQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_height Int64Regs:$a),
+ (SUQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_depth Int64Regs:$a),
+ (SUQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_array_size Int64Regs:$a),
+ (SUQ_ARRAY_SIZE Int64Regs:$a)>;
+
+
+//===- Handle Query -------------------------------------------------------===//
+
+// TODO: These intrinsics are not yet finalized, pending PTX ISA design work
+def ISTYPEP_SAMPLER // istypep.samplerref: Int64Regs operand $a (printed bare, not as [$a]), Int1Regs result $d; matched inline from int_nvvm_istypep_sampler rather than through a separate Pat
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "istypep.samplerref \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>;
+def ISTYPEP_SURFACE // istypep.surfref: same operand/result shape, matched inline from int_nvvm_istypep_surface
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "istypep.surfref \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>;
+def ISTYPEP_TEXTURE // istypep.texref: same operand/result shape, matched inline from int_nvvm_istypep_texture
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "istypep.texref \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>;
+
+//===- Surface Stores -----------------------------------------------------===//
+
+let IsSust = 1 in {
+// Unformatted
+// .clamp variant
+def SUST_B_1D_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b16.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.1d.b32.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.1d.b64.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ "sust.b.1d.v2.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ "sust.b.1d.v2.b64.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ "sust.b.1d.v4.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_1D_ARRAY_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.a1d.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.a1d.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.a1d.v2.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.a1d.v2.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b8.clamp \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b16.clamp \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a1d.v4.b32.clamp \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_2D_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ "sust.b.2d.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ "sust.b.2d.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.2d.v2.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.2d.v2.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b8.clamp \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b16.clamp \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.2d.v4.b32.clamp \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_2D_ARRAY_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r),
+ "sust.b.a2d.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r),
+ "sust.b.a2d.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.a2d.v2.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.a2d.v2.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a2d.v4.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.b.3d stores with the .clamp suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$x, $y, $z, $z}. $z is printed twice so the vector
+// has four elements; presumably the PTX 3d form requires a four-element
+// coordinate vector -- TODO confirm against the PTX ISA.
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_3D_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ "sust.b.3d.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ "sust.b.3d.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.3d.v2.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.3d.v2.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.3d.v4.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// .trap variant of the sust.b stores
+// sust.b.1d stores with the .trap suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the one-element coordinate vector {$x} (Int32Regs).
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_1D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.1d.b64.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ "sust.b.1d.v2.b64.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.b.a1d stores with the .trap suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$idx, $x} (both Int32Regs); $idx comes before $x.
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_1D_ARRAY_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.a1d.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.a1d.v2.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.b.2d stores with the .trap suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$x, $y} (both Int32Regs).
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_2D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ "sust.b.2d.b64.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.2d.v2.b64.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.b.a2d stores with the .trap suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$idx, $x, $y, $y}. $y is printed twice so the vector
+// has four elements; presumably the PTX a2d form requires a four-element
+// coordinate vector -- TODO confirm against the PTX ISA.
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_2D_ARRAY_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r),
+ "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r),
+ "sust.b.a2d.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.a2d.v2.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.b.3d stores with the .trap suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$x, $y, $z, $z}. $z is printed twice so the vector
+// has four elements; presumably the PTX 3d form requires a four-element
+// coordinate vector -- TODO confirm against the PTX ISA.
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_3D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ "sust.b.3d.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.3d.v2.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// .zero variant of the sust.b stores
+// sust.b.1d stores with the .zero suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the one-element coordinate vector {$x} (Int32Regs).
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_1D_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b8.zero \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b16.zero \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.1d.b32.zero \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.1d.b64.zero \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b8.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b16.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ "sust.b.1d.v2.b32.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ "sust.b.1d.v2.b64.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b8.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b16.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ "sust.b.1d.v4.b32.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.b.a1d stores with the .zero suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$idx, $x} (both Int32Regs); $idx comes before $x.
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_1D_ARRAY_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.a1d.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.a1d.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.a1d.v2.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.a1d.v2.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b8.zero \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b16.zero \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a1d.v4.b32.zero \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.b.2d stores with the .zero suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$x, $y} (both Int32Regs).
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_2D_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b8.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b16.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ "sust.b.2d.b32.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ "sust.b.2d.b64.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b8.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b16.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.2d.v2.b32.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.2d.v2.b64.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b8.zero \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b16.zero \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.2d.v4.b32.zero \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.b.a2d stores with the .zero suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$idx, $x, $y, $y}. $y is printed twice so the vector
+// has four elements; presumably the PTX a2d form requires a four-element
+// coordinate vector -- TODO confirm against the PTX ISA.
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_2D_ARRAY_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r),
+ "sust.b.a2d.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r),
+ "sust.b.a2d.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.a2d.v2.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.a2d.v2.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a2d.v4.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.b.3d stores with the .zero suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$x, $y, $z, $z}. $z is printed twice so the vector
+// has four elements; presumably the PTX 3d form requires a four-element
+// coordinate vector -- TODO confirm against the PTX ISA.
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs, b64 uses Int64Regs. There is no .v4.b64 record in this group.
+// Each record has no outputs and an empty pattern list.
+def SUST_B_3D_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ "sust.b.3d.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ "sust.b.3d.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.3d.v2.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.3d.v2.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.3d.v4.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+
+// Formatted surface stores (sust.p)
+
+// sust.p.1d stores with the .trap suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the one-element coordinate vector {$x} (Int32Regs).
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs. This group has no b64 records.
+// Each record has no outputs and an empty pattern list.
+def SUST_P_1D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.p.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.p.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ "sust.p.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.p.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.p.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ "sust.p.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.p.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_1D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.p.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_1D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ "sust.p.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.p.a1d stores with the .trap suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$idx, $x} (both Int32Regs); $idx comes before $x.
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs. This group has no b64 records.
+// Each record has no outputs and an empty pattern list.
+def SUST_P_1D_ARRAY_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.p.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_ARRAY_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.p.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_ARRAY_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+ "sust.p.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_ARRAY_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.p.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_ARRAY_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.p.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_ARRAY_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.p.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_ARRAY_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_1D_ARRAY_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_1D_ARRAY_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.p.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.p.2d stores with the .trap suffix, in scalar, .v2 and .v4 forms.
+// Operands: $s (Int64Regs) is printed first inside the brackets, followed by
+// the coordinate vector {$x, $y} (both Int32Regs).
+// Data operands ($r, $g, $b, $a): b8 and b16 use Int16Regs, b32 uses
+// Int32Regs. This group has no b64 records.
+// Each record has no outputs and an empty pattern list.
+def SUST_P_2D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.p.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.p.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ "sust.p.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.p.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_2D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.p.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_2D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.p.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_2D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_2D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_2D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.p.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_P_2D_ARRAY_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.p.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_ARRAY_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.p.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_ARRAY_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r),
+ "sust.p.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_ARRAY_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.p.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_P_2D_ARRAY_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.p.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_P_2D_ARRAY_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.p.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+// sust.p.a2d store of four b8 components, .trap variant.
+// Inputs: $s (Int64Regs), $idx/$x/$y (Int32Regs), values $r/$g/$b/$a
+// (Int16Regs -- b8 values are carried in a 16-bit register class).
+// Empty pattern list.
+// NOTE(review): $y is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_2D_ARRAY_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+// sust.p.a2d store of four b16 components, .trap variant.
+// Inputs: $s (Int64Regs), $idx/$x/$y (Int32Regs), values $r/$g/$b/$a
+// (Int16Regs). Empty pattern list.
+// NOTE(review): $y is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_2D_ARRAY_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+// sust.p.a2d store of four b32 components, .trap variant.
+// Inputs: $s (Int64Regs), $idx/$x/$y (Int32Regs), values $r/$g/$b/$a
+// (Int32Regs). Empty pattern list.
+// NOTE(review): $y is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_2D_ARRAY_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.p.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// sust.p.3d store of a single b8 component, .trap variant.
+// Inputs: $s (Int64Regs), coordinates $x/$y/$z (Int32Regs), value $r
+// (Int16Regs -- the b8 value is carried in a 16-bit register class).
+// Empty pattern list.
+// NOTE(review): the asm string emits a four-element coordinate vector with $z
+// repeated in the last slot; presumably padding for the vector width the
+// assembler requires -- confirm against the PTX ISA.
+def SUST_P_3D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.p.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+// sust.p.3d store of a single b16 component, .trap variant.
+// Inputs: $s (Int64Regs), coordinates $x/$y/$z (Int32Regs), value $r
+// (Int16Regs). Empty pattern list.
+// NOTE(review): $z is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_3D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.p.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+// sust.p.3d store of a single b32 component, .trap variant.
+// Inputs: $s (Int64Regs), coordinates $x/$y/$z (Int32Regs), value $r
+// (Int32Regs). Empty pattern list.
+// NOTE(review): $z is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_3D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ "sust.p.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+// sust.p.3d store of two b8 components, .trap variant.
+// Inputs: $s (Int64Regs), coordinates $x/$y/$z (Int32Regs), values $r/$g
+// (Int16Regs -- b8 values are carried in a 16-bit register class).
+// Empty pattern list.
+// NOTE(review): $z is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_3D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.p.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+// sust.p.3d store of two b16 components, .trap variant.
+// Inputs: $s (Int64Regs), coordinates $x/$y/$z (Int32Regs), values $r/$g
+// (Int16Regs). Empty pattern list.
+// NOTE(review): $z is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_3D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.p.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+// sust.p.3d store of two b32 components, .trap variant.
+// Inputs: $s (Int64Regs), coordinates $x/$y/$z (Int32Regs), values $r/$g
+// (Int32Regs). Empty pattern list.
+// NOTE(review): $z is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_3D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.p.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+// sust.p.3d store of four b8 components, .trap variant.
+// Inputs: $s (Int64Regs), coordinates $x/$y/$z (Int32Regs), values
+// $r/$g/$b/$a (Int16Regs -- b8 values are carried in a 16-bit register
+// class). Empty pattern list.
+// NOTE(review): $z is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_3D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+// sust.p.3d store of four b16 components, .trap variant.
+// Inputs: $s (Int64Regs), coordinates $x/$y/$z (Int32Regs), values
+// $r/$g/$b/$a (Int16Regs). Empty pattern list.
+// NOTE(review): $z is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_3D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+// sust.p.3d store of four b32 components, .trap variant.
+// Inputs: $s (Int64Regs), coordinates $x/$y/$z (Int32Regs), values
+// $r/$g/$b/$a (Int32Regs). Empty pattern list.
+// NOTE(review): $z is repeated as the fourth coordinate element in the asm
+// string; presumably vector padding -- confirm against the PTX ISA.
+def SUST_P_3D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+}
+
+// Surface store instruction patterns
+// I'm not sure why we can't just include these in the instruction definitions,
+// but TableGen complains of type errors :(
+
+// .clamp variant
+// 1D stores, .clamp: each int_nvvm_sust_b_1d_<ty>_clamp intrinsic is selected
+// to the correspondingly named SUST_B_1D_<TY>_CLAMP instruction. Operands are
+// forwarded unchanged and in order: $s (Int64Regs), $x (Int32Regs), then the
+// value registers. i8 and i16 values both use Int16Regs, i32 uses Int32Regs,
+// i64 uses Int64Regs. The four-component entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_1d_i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+// 1D-array stores, .clamp: each int_nvvm_sust_b_1d_array_<ty>_clamp intrinsic
+// is selected to the correspondingly named SUST_B_1D_ARRAY_<TY>_CLAMP
+// instruction. Operands are forwarded unchanged and in order: $s (Int64Regs),
+// $l and $x (Int32Regs; $l is presumably the array layer index -- confirm
+// against the intrinsic definition), then the value registers. i8 and i16
+// values both use Int16Regs, i32 uses Int32Regs, i64 uses Int64Regs. The
+// four-component entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_ARRAY_B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_ARRAY_B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+// 2D stores, .clamp: each int_nvvm_sust_b_2d_<ty>_clamp intrinsic is selected
+// to the correspondingly named SUST_B_2D_<TY>_CLAMP instruction. Operands are
+// forwarded unchanged and in order: $s (Int64Regs), $x and $y (Int32Regs),
+// then the value registers. i8 and i16 values both use Int16Regs, i32 uses
+// Int32Regs, i64 uses Int64Regs. The four-component entries cover
+// i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_2d_i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_2D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_2D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+// 2D-array stores, .clamp: each int_nvvm_sust_b_2d_array_<ty>_clamp intrinsic
+// is selected to the correspondingly named SUST_B_2D_ARRAY_<TY>_CLAMP
+// instruction. Operands are forwarded unchanged and in order: $s (Int64Regs),
+// $l, $x and $y (Int32Regs; $l is presumably the array layer index -- confirm
+// against the intrinsic definition), then the value registers. i8 and i16
+// values both use Int16Regs, i32 uses Int32Regs, i64 uses Int64Regs. The
+// four-component entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B8_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B16_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_ARRAY_B32_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_ARRAY_B64_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ (SUST_B_2D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ (SUST_B_2D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B8_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B16_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+// 3D stores, .clamp: each int_nvvm_sust_b_3d_<ty>_clamp intrinsic is selected
+// to the correspondingly named SUST_B_3D_<TY>_CLAMP instruction. Operands are
+// forwarded unchanged and in order: $s (Int64Regs), $x, $y and $z
+// (Int32Regs), then the value registers. i8 and i16 values both use
+// Int16Regs, i32 uses Int32Regs, i64 uses Int64Regs. The four-component
+// entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_3d_i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B8_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B16_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ (SUST_B_3D_B32_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ (SUST_B_3D_B64_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B8_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B16_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_3D_V2B32_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_3D_V2B64_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B8_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B16_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_3D_V4B32_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// .trap variant
+// 1D stores, .trap: each int_nvvm_sust_b_1d_<ty>_trap intrinsic is selected
+// to the correspondingly named SUST_B_1D_<TY>_TRAP instruction. Operands are
+// forwarded unchanged and in order: $s (Int64Regs), $x (Int32Regs), then the
+// value registers. i8 and i16 values both use Int16Regs, i32 uses Int32Regs,
+// i64 uses Int64Regs. The four-component entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_1d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+// 1D-array stores, .trap: each int_nvvm_sust_b_1d_array_<ty>_trap intrinsic
+// is selected to the correspondingly named SUST_B_1D_ARRAY_<TY>_TRAP
+// instruction. Operands are forwarded unchanged and in order: $s (Int64Regs),
+// $l and $x (Int32Regs; $l is presumably the array layer index -- confirm
+// against the intrinsic definition), then the value registers. i8 and i16
+// values both use Int16Regs, i32 uses Int32Regs, i64 uses Int64Regs. The
+// four-component entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_1d_array_i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_ARRAY_B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+// 2D stores, .trap: each int_nvvm_sust_b_2d_<ty>_trap intrinsic is selected
+// to the correspondingly named SUST_B_2D_<TY>_TRAP instruction. Operands are
+// forwarded unchanged and in order: $s (Int64Regs), $x and $y (Int32Regs),
+// then the value registers. i8 and i16 values both use Int16Regs, i32 uses
+// Int32Regs, i64 uses Int64Regs. The four-component entries cover
+// i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_2d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_2D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+// 2D-array stores, .trap: each int_nvvm_sust_b_2d_array_<ty>_trap intrinsic
+// is selected to the correspondingly named SUST_B_2D_ARRAY_<TY>_TRAP
+// instruction. Operands are forwarded unchanged and in order: $s (Int64Regs),
+// $l, $x and $y (Int32Regs; $l is presumably the array layer index -- confirm
+// against the intrinsic definition), then the value registers. i8 and i16
+// values both use Int16Regs, i32 uses Int32Regs, i64 uses Int64Regs. The
+// four-component entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_2d_array_i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B8_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B16_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_ARRAY_B32_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_ARRAY_B64_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ (SUST_B_2D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+// 3D stores, .trap: each int_nvvm_sust_b_3d_<ty>_trap intrinsic is selected
+// to the correspondingly named SUST_B_3D_<TY>_TRAP instruction. Operands are
+// forwarded unchanged and in order: $s (Int64Regs), $x, $y and $z
+// (Int32Regs), then the value registers. i8 and i16 values both use
+// Int16Regs, i32 uses Int32Regs, i64 uses Int64Regs. The four-component
+// entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_3d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ (SUST_B_3D_B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ (SUST_B_3D_B64_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_3D_V2B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_3D_V2B64_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_3D_V4B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// .zero variant
+// 1D stores, .zero: each int_nvvm_sust_b_1d_<ty>_zero intrinsic is selected
+// to the correspondingly named SUST_B_1D_<TY>_ZERO instruction. Operands are
+// forwarded unchanged and in order: $s (Int64Regs), $x (Int32Regs), then the
+// value registers. i8 and i16 values both use Int16Regs, i32 uses Int32Regs,
+// i64 uses Int64Regs. The four-component entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_1d_i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_zero
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_zero
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_zero
+ Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+// 1D-array stores, .zero: each int_nvvm_sust_b_1d_array_<ty>_zero intrinsic
+// is selected to the correspondingly named SUST_B_1D_ARRAY_<TY>_ZERO
+// instruction. Operands are forwarded unchanged and in order: $s (Int64Regs),
+// $l and $x (Int32Regs; $l is presumably the array layer index -- confirm
+// against the intrinsic definition), then the value registers. i8 and i16
+// values both use Int16Regs, i32 uses Int32Regs, i64 uses Int64Regs. The
+// four-component entries cover i8/i16/i32 only.
+def : Pat<(int_nvvm_sust_b_1d_array_i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_ARRAY_B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_ARRAY_B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+// sust_b 2d, _zero variants: each int_nvvm_sust_b_2d_*_zero intrinsic selects
+// SUST_B_2D_*_ZERO with operands ($s, $x, $y, data) unchanged; i8 data uses Int16Regs.
+def : Pat<(int_nvvm_sust_b_2d_i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_2D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_2D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+// sust_b 2d array, _zero variants: each int_nvvm_sust_b_2d_array_*_zero intrinsic selects
+// SUST_B_2D_ARRAY_*_ZERO with operands ($s, $l, $x, $y, data) unchanged; i8 data uses Int16Regs.
+def : Pat<(int_nvvm_sust_b_2d_array_i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B8_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B16_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_ARRAY_B32_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_ARRAY_B64_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ (SUST_B_2D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ (SUST_B_2D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B8_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B16_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+// sust_b 3d, _zero variants: each int_nvvm_sust_b_3d_*_zero intrinsic selects
+// SUST_B_3D_*_ZERO with operands ($s, $x, $y, $z, data) unchanged; i8 data uses Int16Regs.
+def : Pat<(int_nvvm_sust_b_3d_i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B8_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B16_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ (SUST_B_3D_B32_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ (SUST_B_3D_B64_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B8_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B16_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_3D_V2B32_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_3D_V2B64_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B8_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B16_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_3D_V4B32_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// sust_p 1d, _trap variants: each int_nvvm_sust_p_1d_*_trap intrinsic selects
+// SUST_P_1D_*_TRAP with operands ($s, $x, data) unchanged; i8 data uses Int16Regs.
+def : Pat<(int_nvvm_sust_p_1d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_P_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_P_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ (SUST_P_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_P_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+// sust_p 1d array, _trap variants: each int_nvvm_sust_p_1d_array_*_trap intrinsic selects
+// SUST_P_1D_ARRAY_*_TRAP with operands ($s, $l, $x, data) unchanged; i8 data uses Int16Regs.
+def : Pat<(int_nvvm_sust_p_1d_array_i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_P_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_P_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+ (SUST_P_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_P_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+// sust_p 2d, _trap variants: each int_nvvm_sust_p_2d_*_trap intrinsic selects
+// SUST_P_2D_*_TRAP with operands ($s, $x, $y, data) unchanged; i8 data uses Int16Regs.
+def : Pat<(int_nvvm_sust_p_2d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_P_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_P_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_P_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+ (SUST_P_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+// sust_p 2d array, _trap variants: each int_nvvm_sust_p_2d_array_*_trap intrinsic selects
+// SUST_P_2D_ARRAY_*_TRAP with operands ($s, $l, $x, $y, data) unchanged; i8 data uses Int16Regs.
+def : Pat<(int_nvvm_sust_p_2d_array_i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_P_2D_ARRAY_B8_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_P_2D_ARRAY_B16_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_P_2D_ARRAY_B32_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ (SUST_P_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+// sust_p 3d, _trap variants: each int_nvvm_sust_p_3d_*_trap intrinsic selects
+// SUST_P_3D_*_TRAP with operands ($s, $x, $y, $z, data) unchanged; i8 data uses Int16Regs.
+def : Pat<(int_nvvm_sust_p_3d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_P_3D_B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_P_3D_B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ (SUST_P_3D_B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_3D_V2B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_3D_V2B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ (SUST_P_3D_V2B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_3D_V4B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_3D_V4B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_3D_V4B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+//-----------------------------------
+// Read Special Registers
+//-----------------------------------
+
+// Instruction that emits "mov.u64 $d, %<regname>;" and is selected for the
+// operand-less intrinsic `intop`, producing its result in an Int64Regs register.
+class PTX_READ_SREG_R64<string regname, Intrinsic intop>
+ : NVPTXInst<(outs Int64Regs:$d), (ins),
+ !strconcat("mov.u64\t$d, %", regname, ";"),
+ [(set Int64Regs:$d, (intop))]>;
+
+// Instruction that emits "mov.u32 $d, %<regname>;" and is selected for the
+// operand-less intrinsic `intop`, producing its result in an Int32Regs register.
+class PTX_READ_SREG_R32<string regname, Intrinsic intop>
+ : NVPTXInst<(outs Int32Regs:$d), (ins),
+ !strconcat("mov.u32\t$d, %", regname, ";"),
+ [(set Int32Regs:$d, (intop))]>;
+
+// TODO Add read vector-version of special registers
+// One reader instruction per special register; first group: %tid.{x,y,z,w}.
+def INT_PTX_SREG_TID_X :
+ PTX_READ_SREG_R32<"tid.x", int_nvvm_read_ptx_sreg_tid_x>;
+def INT_PTX_SREG_TID_Y :
+ PTX_READ_SREG_R32<"tid.y", int_nvvm_read_ptx_sreg_tid_y>;
+def INT_PTX_SREG_TID_Z :
+ PTX_READ_SREG_R32<"tid.z", int_nvvm_read_ptx_sreg_tid_z>;
+def INT_PTX_SREG_TID_W :
+ PTX_READ_SREG_R32<"tid.w", int_nvvm_read_ptx_sreg_tid_w>;
+// Readers for %ntid.{x,y,z,w}.
+def INT_PTX_SREG_NTID_X :
+ PTX_READ_SREG_R32<"ntid.x", int_nvvm_read_ptx_sreg_ntid_x>;
+def INT_PTX_SREG_NTID_Y :
+ PTX_READ_SREG_R32<"ntid.y", int_nvvm_read_ptx_sreg_ntid_y>;
+def INT_PTX_SREG_NTID_Z :
+ PTX_READ_SREG_R32<"ntid.z", int_nvvm_read_ptx_sreg_ntid_z>;
+def INT_PTX_SREG_NTID_W :
+ PTX_READ_SREG_R32<"ntid.w", int_nvvm_read_ptx_sreg_ntid_w>;
+// Readers for %laneid, %warpid and %nwarpid.
+def INT_PTX_SREG_LANEID :
+ PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>;
+def INT_PTX_SREG_WARPID :
+ PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>;
+def INT_PTX_SREG_NWARPID :
+ PTX_READ_SREG_R32<"nwarpid", int_nvvm_read_ptx_sreg_nwarpid>;
+// Readers for %ctaid.{x,y,z,w}.
+def INT_PTX_SREG_CTAID_X :
+ PTX_READ_SREG_R32<"ctaid.x", int_nvvm_read_ptx_sreg_ctaid_x>;
+def INT_PTX_SREG_CTAID_Y :
+ PTX_READ_SREG_R32<"ctaid.y", int_nvvm_read_ptx_sreg_ctaid_y>;
+def INT_PTX_SREG_CTAID_Z :
+ PTX_READ_SREG_R32<"ctaid.z", int_nvvm_read_ptx_sreg_ctaid_z>;
+def INT_PTX_SREG_CTAID_W :
+ PTX_READ_SREG_R32<"ctaid.w", int_nvvm_read_ptx_sreg_ctaid_w>;
+// Readers for %nctaid.{x,y,z,w}.
+def INT_PTX_SREG_NCTAID_X :
+ PTX_READ_SREG_R32<"nctaid.x", int_nvvm_read_ptx_sreg_nctaid_x>;
+def INT_PTX_SREG_NCTAID_Y :
+ PTX_READ_SREG_R32<"nctaid.y", int_nvvm_read_ptx_sreg_nctaid_y>;
+def INT_PTX_SREG_NCTAID_Z :
+ PTX_READ_SREG_R32<"nctaid.z", int_nvvm_read_ptx_sreg_nctaid_z>;
+def INT_PTX_SREG_NCTAID_W :
+ PTX_READ_SREG_R32<"nctaid.w", int_nvvm_read_ptx_sreg_nctaid_w>;
+// Readers for %smid, %nsmid and %gridid.
+def INT_PTX_SREG_SMID :
+ PTX_READ_SREG_R32<"smid", int_nvvm_read_ptx_sreg_smid>;
+def INT_PTX_SREG_NSMID :
+ PTX_READ_SREG_R32<"nsmid", int_nvvm_read_ptx_sreg_nsmid>;
+def INT_PTX_SREG_GRIDID :
+ PTX_READ_SREG_R32<"gridid", int_nvvm_read_ptx_sreg_gridid>;
+// Readers for %lanemask_{eq,le,lt,ge,gt}.
+def INT_PTX_SREG_LANEMASK_EQ :
+ PTX_READ_SREG_R32<"lanemask_eq", int_nvvm_read_ptx_sreg_lanemask_eq>;
+def INT_PTX_SREG_LANEMASK_LE :
+ PTX_READ_SREG_R32<"lanemask_le", int_nvvm_read_ptx_sreg_lanemask_le>;
+def INT_PTX_SREG_LANEMASK_LT :
+ PTX_READ_SREG_R32<"lanemask_lt", int_nvvm_read_ptx_sreg_lanemask_lt>;
+def INT_PTX_SREG_LANEMASK_GE :
+ PTX_READ_SREG_R32<"lanemask_ge", int_nvvm_read_ptx_sreg_lanemask_ge>;
+def INT_PTX_SREG_LANEMASK_GT :
+ PTX_READ_SREG_R32<"lanemask_gt", int_nvvm_read_ptx_sreg_lanemask_gt>;
+// %clock uses the 32-bit reader class; %clock64 is the only user of the 64-bit one here.
+def INT_PTX_SREG_CLOCK :
+ PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
+def INT_PTX_SREG_CLOCK64 :
+ PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
+// Readers for %pm0 through %pm3.
+def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
+def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
+def INT_PTX_SREG_PM2 : PTX_READ_SREG_R32<"pm2", int_nvvm_read_ptx_sreg_pm2>;
+def INT_PTX_SREG_PM3 : PTX_READ_SREG_R32<"pm3", int_nvvm_read_ptx_sreg_pm3>;
+// WARP_SZ is written without the '%' prefix that the PTX_READ_SREG_* classes prepend.
+// TODO: It would be nice to use PTX_READ_SREG here, but it doesn't
+// handle the constant.
+def INT_PTX_SREG_WARPSIZE :
+ NVPTXInst<(outs Int32Regs:$dst), (ins), "mov.u32 \t$dst, WARP_SZ;",
+ [(set Int32Regs:$dst, (int_nvvm_read_ptx_sreg_warpsize))]>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
new file mode 100644
index 000000000000..b925b632ee4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -0,0 +1,349 @@
+//===- NVPTXLowerAggrCopies.cpp - ------------------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// Lower aggregate copies, memset, memcpy, memmove intrinsics into loops when
+// the size is large or is not a compile-time constant.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXLowerAggrCopies.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "nvptx"
+
+using namespace llvm;
+
+namespace {
+
+// Function pass implementing the lowering; the work is done in runOnFunction.
+struct NVPTXLowerAggrCopies : public FunctionPass {
+  // Pass identification; only its address is meaningful.
+  static char ID;
+
+  // Size threshold (compared against store sizes and mem* lengths in
+  // runOnFunction) at or above which a copy is rewritten as a loop.
+  static const unsigned MaxAggrCopySize = 128;
+
+  NVPTXLowerAggrCopies() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override {
+    return "Lower aggregate copies/intrinsics into loops";
+  }
+
+  // The StackProtector analysis is declared as preserved by this pass.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<StackProtector>();
+  }
+};
+
+char NVPTXLowerAggrCopies::ID = 0;
+
+// Lower memcpy to a byte-wise copy loop.
+//
+// The block containing ConvertedInst is split right before ConvertedInst and a
+// "loadstoreloop" block is placed between the two halves. The loop copies
+// CopyLen bytes from SrcAddr to DstAddr, one i8 per iteration, and is bypassed
+// entirely when CopyLen is zero. ConvertedInst itself is not removed here.
+//
+// SrcIsVolatile / DstIsVolatile mark the generated loads / stores volatile.
+// Context and F are used to create the new loop block.
+void convertMemCpyToLoop(Instruction *ConvertedInst, Value *SrcAddr,
+                         Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
+                         bool DstIsVolatile, LLVMContext &Context,
+                         Function &F) {
+  Type *TypeOfCopyLen = CopyLen->getType();
+
+  BasicBlock *OrigBB = ConvertedInst->getParent();
+  BasicBlock *NewBB = OrigBB->splitBasicBlock(ConvertedInst, "split");
+  BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
+
+  // Everything emitted through Builder lands in front of the unconditional
+  // branch that splitBasicBlock left at the end of OrigBB.
+  IRBuilder<> Builder(OrigBB->getTerminator());
+
+  // SrcAddr and DstAddr are expected to be pointer types,
+  // so no check is made here.
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  // Cast pointers to (char *)
+  SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
+  DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
+
+  // Skip the loop when CopyLen == 0. The loop below tests its exit condition
+  // only after the first copy, so entering it unconditionally would copy one
+  // byte even for a zero-length memcpy.
+  Builder.CreateCondBr(
+      Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+      LoopBB);
+  // The conditional branch supersedes the branch created by splitBasicBlock,
+  // which is still the last instruction of OrigBB at this point.
+  OrigBB->getTerminator()->eraseFromParent();
+
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+  // load from SrcAddr+LoopIndex
+  // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
+  // word-sized loads and stores.
+  Value *Element =
+      LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
+                                 LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
+                             SrcIsVolatile);
+  // store at DstAddr+LoopIndex
+  LoopBuilder.CreateStore(Element,
+                          LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
+                                                        DstAddr, LoopIndex),
+                          DstIsVolatile);
+
+  // The value for LoopIndex coming from backedge is (LoopIndex + 1)
+  Value *NewIndex =
+      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+  LoopIndex->addIncoming(NewIndex, LoopBB);
+
+  // Keep looping while the next index is still below CopyLen.
+  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+                           NewBB);
+}
+
+// Lower memmove to IR. memmove is required to correctly copy overlapping memory
+// regions; therefore, it has to check the relative positions of the source and
+// destination pointers and choose the copy direction accordingly.
+//
+// The code below is an IR rendition of this C function:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+//   unsigned char* d = dst;
+//   const unsigned char* s = src;
+//   if (s < d) {
+//     // copy backwards
+//     while (n--) {
+//       d[n] = s[n];
+//     }
+//   } else {
+//     // copy forward
+//     for (size_t i = 0; i < n; ++i) {
+//       d[i] = s[i];
+//     }
+//   }
+//   return dst;
+// }
+//
+// The generated code is inserted before ConvertedInst, which itself is not
+// removed here. SrcIsVolatile / DstIsVolatile mark the generated loads /
+// stores volatile in both copy directions. SrcAddr and DstAddr are indexed
+// directly (no cast is made), so the copy granularity is their pointee type.
+void convertMemMoveToLoop(Instruction *ConvertedInst, Value *SrcAddr,
+                          Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
+                          bool DstIsVolatile, LLVMContext &Context,
+                          Function &F) {
+  Type *TypeOfCopyLen = CopyLen->getType();
+  BasicBlock *OrigBB = ConvertedInst->getParent();
+
+  // Create a comparison of src and dst, based on which we jump to either
+  // the forward-copy part of the function (if src >= dst) or the backwards-copy
+  // part (if src < dst).
+  // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+  // structure. Its block terminators (unconditional branches) are replaced by
+  // the appropriate conditional branches when the loop is built.
+  ICmpInst *PtrCompare = new ICmpInst(ConvertedInst, ICmpInst::ICMP_ULT,
+                                      SrcAddr, DstAddr, "compare_src_dst");
+  TerminatorInst *ThenTerm, *ElseTerm;
+  SplitBlockAndInsertIfThenElse(PtrCompare, ConvertedInst, &ThenTerm,
+                                &ElseTerm);
+
+  // Each part of the function consists of two blocks:
+  //   copy_backwards:        used to skip the loop when n == 0
+  //   copy_backwards_loop:   the actual backwards loop BB
+  //   copy_forward:          used to skip the loop when n == 0
+  //   copy_forward_loop:     the actual forward loop BB
+  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+  CopyBackwardsBB->setName("copy_backwards");
+  BasicBlock *CopyForwardBB = ElseTerm->getParent();
+  CopyForwardBB->setName("copy_forward");
+  BasicBlock *ExitBB = ConvertedInst->getParent();
+  ExitBB->setName("memmove_done");
+
+  // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
+  // between both backwards and forward copy clauses.
+  ICmpInst *CompareN =
+      new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+                   ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+
+  // Copying backwards.
+  BasicBlock *LoopBB =
+      BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB);
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  Value *IndexPtr = LoopBuilder.CreateSub(
+      LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
+  // Honor the volatile flags; previously they were accepted but dropped.
+  Value *Element = LoopBuilder.CreateLoad(
+      LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), SrcIsVolatile,
+      "element");
+  LoopBuilder.CreateStore(Element,
+                          LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr),
+                          DstIsVolatile);
+  LoopBuilder.CreateCondBr(
+      LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
+      ExitBB, LoopBB);
+  LoopPhi->addIncoming(IndexPtr, LoopBB);
+  LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+  ThenTerm->eraseFromParent();
+
+  // Copying forward.
+  BasicBlock *FwdLoopBB =
+      BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB);
+  IRBuilder<> FwdLoopBuilder(FwdLoopBB);
+  PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
+  Value *FwdElement = FwdLoopBuilder.CreateLoad(
+      FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), SrcIsVolatile,
+      "element");
+  FwdLoopBuilder.CreateStore(
+      FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi),
+      DstIsVolatile);
+  Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
+      FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
+  FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
+                              ExitBB, FwdLoopBB);
+  FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
+  FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
+
+  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+  ElseTerm->eraseFromParent();
+}
+
+// Lower memset to a store loop.
+//
+// The block containing ConvertedInst is split right before ConvertedInst and a
+// "loadstoreloop" block is placed between the two halves. DstAddr is cast to a
+// pointer to SetValue's type, and each iteration stores SetValue at
+// DstAddr[index] for index = 0 .. CopyLen-1. The loop is bypassed entirely
+// when CopyLen is zero. The stores are always non-volatile. ConvertedInst
+// itself is not removed here.
+void convertMemSetToLoop(Instruction *ConvertedInst, Value *DstAddr,
+                         Value *CopyLen, Value *SetValue, LLVMContext &Context,
+                         Function &F) {
+  Type *TypeOfCopyLen = CopyLen->getType();
+  BasicBlock *OrigBB = ConvertedInst->getParent();
+  BasicBlock *NewBB = OrigBB->splitBasicBlock(ConvertedInst, "split");
+  BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
+
+  // Everything emitted through Builder lands in front of the unconditional
+  // branch that splitBasicBlock left at the end of OrigBB.
+  IRBuilder<> Builder(OrigBB->getTerminator());
+
+  // Cast pointer to the type of value getting stored
+  unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+  DstAddr = Builder.CreateBitCast(DstAddr,
+                                  PointerType::get(SetValue->getType(), dstAS));
+
+  // Skip the loop when CopyLen == 0. The loop below tests its exit condition
+  // only after the first store, so entering it unconditionally would write one
+  // element even for a zero-length memset.
+  Builder.CreateCondBr(
+      Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+      LoopBB);
+  // The conditional branch supersedes the branch created by splitBasicBlock,
+  // which is still the last instruction of OrigBB at this point.
+  OrigBB->getTerminator()->eraseFromParent();
+
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+  LoopBuilder.CreateStore(
+      SetValue,
+      LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
+      false);
+
+  // The value for LoopIndex coming from the backedge is (LoopIndex + 1).
+  Value *NewIndex =
+      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+  LoopIndex->addIncoming(NewIndex, LoopBB);
+
+  // Keep looping while the next index is still below CopyLen.
+  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+                           NewBB);
+}
+
+// Pass entry point: rewrites large aggregate load/store pairs and llvm.mem*
+// intrinsic calls in F into explicit loops.
+// Returns false when nothing in F qualified, true once anything was lowered.
+bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
+ SmallVector<LoadInst *, 4> AggrLoads;
+ SmallVector<MemIntrinsic *, 4> MemCalls;
+
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ LLVMContext &Context = F.getParent()->getContext();
+
+ // Collect all aggregate loads and mem* calls.
+ // Candidates are only recorded here; the IR is rewritten in a second phase
+ // below, after the scan over the function has finished.
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+ ++II) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(II)) {
+ // Only loads with exactly one user are considered.
+ if (!LI->hasOneUse())
+ continue;
+
+ // Loads whose store size is below the threshold are left alone.
+ if (DL.getTypeStoreSize(LI->getType()) < MaxAggrCopySize)
+ continue;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(LI->user_back())) {
+ // The load must be the value being stored (operand 0), not the address.
+ if (SI->getOperand(0) != LI)
+ continue;
+ AggrLoads.push_back(LI);
+ }
+ } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(II)) {
+ // Convert intrinsic calls with variable size or with constant size
+ // larger than the MaxAggrCopySize threshold.
+ if (ConstantInt *LenCI = dyn_cast<ConstantInt>(IntrCall->getLength())) {
+ if (LenCI->getZExtValue() >= MaxAggrCopySize) {
+ MemCalls.push_back(IntrCall);
+ }
+ } else {
+ MemCalls.push_back(IntrCall);
+ }
+ }
+ }
+ }
+
+ if (AggrLoads.size() == 0 && MemCalls.size() == 0) {
+ return false;
+ }
+
+ //
+ // Do the transformation of an aggr load/copy/set to a loop
+ //
+ for (LoadInst *LI : AggrLoads) {
+ // The collection phase only recorded loads whose single user is a
+ // StoreInst, so SI is expected to be non-null here.
+ StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin());
+ Value *SrcAddr = LI->getOperand(0);
+ Value *DstAddr = SI->getOperand(1);
+ // The copy length is the store size of the loaded type, as an i32 constant.
+ unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
+ Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
+
+ convertMemCpyToLoop(/* ConvertedInst */ SI,
+ /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+ /* CopyLen */ CopyLen,
+ /* SrcIsVolatile */ LI->isVolatile(),
+ /* DstIsVolatile */ SI->isVolatile(),
+ /* Context */ Context,
+ /* Function F */ F);
+
+ // Erase the store first: it is the only user of the load.
+ SI->eraseFromParent();
+ LI->eraseFromParent();
+ }
+
+ // Transform mem* intrinsic calls.
+ for (MemIntrinsic *MemCall : MemCalls) {
+ if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
+ // The intrinsic has a single volatile flag; it is used for both sides.
+ convertMemCpyToLoop(/* ConvertedInst */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+ /* Context */ Context,
+ /* Function F */ F);
+ } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
+ convertMemMoveToLoop(/* ConvertedInst */ Memmove,
+ /* SrcAddr */ Memmove->getRawSource(),
+ /* DstAddr */ Memmove->getRawDest(),
+ /* CopyLen */ Memmove->getLength(),
+ /* SrcIsVolatile */ Memmove->isVolatile(),
+ /* DstIsVolatile */ Memmove->isVolatile(),
+ /* Context */ Context,
+ /* Function F */ F);
+
+ } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
+ // Note: the memset's volatile flag is not forwarded to the helper.
+ convertMemSetToLoop(/* ConvertedInst */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* CopyLen */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* Context */ Context,
+ /* Function F */ F);
+ }
+ // The intrinsic call is removed whichever branch handled it.
+ MemCall->eraseFromParent();
+ }
+
+ return true;
+}
+
+} // namespace
+
+namespace llvm {
+void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(NVPTXLowerAggrCopies, "nvptx-lower-aggr-copies",
+ "Lower aggregate copies, and llvm.mem* intrinsics into loops",
+ false, false)
+
+// Factory: returns a newly allocated NVPTXLowerAggrCopies pass instance.
+FunctionPass *llvm::createLowerAggrCopies() {
+ return new NVPTXLowerAggrCopies();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
new file mode 100644
index 000000000000..3c39f53eb30a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -0,0 +1,24 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVIDIA specific lowering of
+// aggregate copies
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H
+
+namespace llvm {
+class FunctionPass;
+
+// Factory for the NVPTX pass that lowers aggregate copies and llvm.mem*
+// intrinsics into loops.
+FunctionPass *createLowerAggrCopies();
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
new file mode 100644
index 000000000000..e94c1914029d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -0,0 +1,118 @@
+//===-- NVPTXLowerAlloca.cpp - Make alloca to use local memory =====--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// For each alloca instruction, add a pair of casts to the local address space.
+// For example,
+//
+// %A = alloca i32
+// store i32 0, i32* %A ; emits st.u32
+//
+// will be transformed to
+//
+// %A = alloca i32
+// %Local = addrspacecast i32* %A to i32 addrspace(5)*
+// %Generic = addrspacecast i32 addrspace(5)* %Local to i32*
+// store i32 0, i32* %Generic ; emits st.local.u32
+//
+// And we will rely on NVPTXInferAddressSpaces to combine the last two
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXLowerAllocaPass(PassRegistry &);
+}
+
+namespace {
+// Basic-block pass that attaches a local/generic addrspacecast pair to every
+// alloca in the block and redirects eligible users to the generic cast; the
+// work is done in runOnBasicBlock.
+class NVPTXLowerAlloca : public BasicBlockPass {
+ bool runOnBasicBlock(BasicBlock &BB) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ NVPTXLowerAlloca() : BasicBlockPass(ID) {}
+ // Human-readable name reported for this pass.
+ StringRef getPassName() const override {
+ return "convert address space of alloca'ed memory to local";
+ }
+};
+} // namespace
+
+char NVPTXLowerAlloca::ID = 1;
+
+INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca",
+ "Lower Alloca", false, false)
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+// For every alloca in BB, inserts
+//   alloca -> addrspacecast to local -> addrspacecast back to generic
+// right after the alloca, and makes the alloca's non-volatile loads/stores,
+// GEPs and bitcasts use the generic cast instead of the alloca itself.
+// Returns true if BB contained at least one alloca.
+bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) {
+  if (skipBasicBlock(BB))
+    return false;
+
+  bool Changed = false;
+  for (auto &I : BB) {
+    auto *allocaInst = dyn_cast<AllocaInst>(&I);
+    if (!allocaInst)
+      continue;
+
+    Changed = true;
+    // The result of an alloca is always a pointer, so use cast<> (which
+    // asserts) rather than dereferencing an unchecked dyn_cast<> result.
+    Type *ETy = cast<PointerType>(allocaInst->getType())->getElementType();
+    auto *NewASCToLocal = new AddrSpaceCastInst(
+        allocaInst, PointerType::get(ETy, ADDRESS_SPACE_LOCAL), "");
+    auto *NewASCToGeneric = new AddrSpaceCastInst(
+        NewASCToLocal, PointerType::get(ETy, ADDRESS_SPACE_GENERIC), "");
+    NewASCToLocal->insertAfter(allocaInst);
+    NewASCToGeneric->insertAfter(NewASCToLocal);
+
+    // Check Load, Store, GEP, and BitCast uses of the alloca and make them
+    // use the converted generic address, in order to expose non-generic
+    // addrspacecast to NVPTXInferAddressSpaces. For other types of
+    // instructions this is unnecessary and may introduce redundant address
+    // casts.
+    for (Value::use_iterator UI = allocaInst->use_begin(),
+                             UE = allocaInst->use_end();
+         UI != UE;) {
+      // Step the iterator before touching the user: rewriting the operand
+      // removes the current use from the alloca's use list.
+      User *Usr = (*UI++).getUser();
+
+      if (auto *LI = dyn_cast<LoadInst>(Usr)) {
+        if (LI->getPointerOperand() == allocaInst && !LI->isVolatile())
+          LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric);
+      } else if (auto *SI = dyn_cast<StoreInst>(Usr)) {
+        // Only the address operand is redirected, never the stored value.
+        if (SI->getPointerOperand() == allocaInst && !SI->isVolatile())
+          SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
+      } else if (auto *GI = dyn_cast<GetElementPtrInst>(Usr)) {
+        if (GI->getPointerOperand() == allocaInst)
+          GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric);
+      } else if (auto *BI = dyn_cast<BitCastInst>(Usr)) {
+        if (BI->getOperand(0) == allocaInst)
+          BI->setOperand(0, NewASCToGeneric);
+      }
+    }
+  }
+  return Changed;
+}
+
+// Factory: returns a newly allocated NVPTXLowerAlloca pass instance.
+BasicBlockPass *llvm::createNVPTXLowerAllocaPass() {
+ return new NVPTXLowerAlloca();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
new file mode 100644
index 000000000000..3f0c7be7863d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -0,0 +1,253 @@
+//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Arguments to kernel and device functions are passed via param space,
+// which imposes certain restrictions:
+// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
+//
+// Kernel parameters are read-only and accessible only via ld.param
+// instruction, directly or via a pointer. Pointers to kernel
+// arguments can't be converted to generic address space.
+//
+// Device function parameters are directly accessible via
+// ld.param/st.param, but taking the address of one returns a pointer
+// to a copy created in local space which *can't* be used with
+// ld.param/st.param.
+//
+// Copying a byval struct into local memory in IR allows us to enforce
+// the param space restrictions, gives the rest of IR a pointer w/o
+// param space restrictions, and gives us an opportunity to eliminate
+// the copy.
+//
+// Pointer arguments to kernel functions need more work to be lowered:
+//
+// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
+// global address space. This allows later optimizations to emit
+// ld.global.*/st.global.* for accessing these pointer arguments. For
+// example,
+//
+// define void @foo(float* %input) {
+// %v = load float, float* %input, align 4
+// ...
+// }
+//
+// becomes
+//
+// define void @foo(float* %input) {
+// %input2 = addrspacecast float* %input to float addrspace(1)*
+// %input3 = addrspacecast float addrspace(1)* %input2 to float*
+// %v = load float, float* %input3, align 4
+// ...
+// }
+//
+// Later, NVPTXInferAddressSpaces will optimize it to
+//
+// define void @foo(float* %input) {
+// %input2 = addrspacecast float* %input to float addrspace(1)*
+// %v = load float, float addrspace(1)* %input2, align 4
+// ...
+// }
+//
+// 2. Convert pointers in a byval kernel parameter to pointers in the global
+// address space. As in #1, it allows NVPTX to emit more ld/st.global. E.g.,
+//
+// struct S {
+// int *x;
+// int *y;
+// };
+// __global__ void foo(S s) {
+// int *b = s.y;
+// // use b
+// }
+//
+// "b" points to the global address space. In the IR level,
+//
+// define void @foo({i32*, i32*}* byval %input) {
+// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
+// %b = load i32*, i32** %b_ptr
+// ; use %b
+// }
+//
+// becomes
+//
+// define void @foo({i32*, i32*}* byval %input) {
+// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
+// %b = load i32*, i32** %b_ptr
+// %b_global = addrspacecast i32* %b to i32 addrspace(1)*
+// %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32*
+// ; use %b_generic
+// }
+//
+// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
+// cancel the addrspacecast pair this pass emits.
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXLowerArgsPass(PassRegistry &);
+}
+
+namespace {
+// Function pass that lowers arguments: byval pointer arguments are copied into
+// an alloca, and, for kernels under the CUDA driver interface, pointer
+// arguments and pointers loaded from byval arguments are marked as global.
+class NVPTXLowerArgs : public FunctionPass {
+ // Dispatches to the kernel or device variant depending on isKernelFunction.
+ bool runOnFunction(Function &F) override;
+
+ bool runOnKernelFunction(Function &F);
+ bool runOnDeviceFunction(Function &F);
+
+ // handle byval parameters
+ void handleByValParam(Argument *Arg);
+ // Knowing Ptr must point to the global address space, this function
+ // addrspacecasts Ptr to global and then back to generic. This allows
+ // NVPTXInferAddressSpaces to fold the global-to-generic cast into
+ // loads/stores that appear later.
+ void markPointerAsGlobal(Value *Ptr);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ // TM may be null (the default); the CUDA-specific lowering is only applied
+ // when a target machine is supplied.
+ NVPTXLowerArgs(const NVPTXTargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM) {}
+ StringRef getPassName() const override {
+ return "Lower pointer arguments of CUDA kernels";
+ }
+
+private:
+ // Target machine used to query the driver interface; may be null.
+ const NVPTXTargetMachine *TM;
+};
+} // namespace
+
+char NVPTXLowerArgs::ID = 1;
+
+INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
+ "Lower arguments (NVPTX)", false, false)
+
+// =============================================================================
+// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
+// then add the following instructions to the first basic block:
+//
+// %temp = alloca %struct.x, align 8
+// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
+// %tv = load %struct.x addrspace(101)* %tempd
+// store %struct.x %tv, %struct.x* %temp, align 8
+//
+// The above code allocates some space in the stack and copies the incoming
+// struct from param space to local space.
+// Then replace all occurrences of %d by %temp.
+// =============================================================================
+void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
+  Function *ParentFn = Arg->getParent();
+  // Everything created here is inserted in front of the entry block's first
+  // instruction.
+  Instruction *EntryInst = &(ParentFn->getEntryBlock().front());
+
+  PointerType *ArgPtrTy = dyn_cast<PointerType>(Arg->getType());
+  assert(ArgPtrTy && "Expecting pointer type in handleByValParam");
+  Type *PointeeTy = ArgPtrTy->getElementType();
+
+  // Stack slot that takes the place of the byval argument. It inherits the
+  // argument's parameter alignment because the loads/stores that are about
+  // to be redirected to it were emitted assuming that alignment.
+  AllocaInst *LocalCopy = new AllocaInst(PointeeTy, Arg->getName(), EntryInst);
+  LocalCopy->setAlignment(ParentFn->getParamAlignment(Arg->getArgNo() + 1));
+
+  // Redirect the existing users before the param-space cast below is created,
+  // so that the cast keeps referring to the original argument.
+  Arg->replaceAllUsesWith(LocalCopy);
+
+  // Load the whole aggregate through a param-space pointer and store it into
+  // the stack slot.
+  PointerType *ParamPtrTy = PointerType::get(PointeeTy, ADDRESS_SPACE_PARAM);
+  Value *ArgInParam =
+      new AddrSpaceCastInst(Arg, ParamPtrTy, Arg->getName(), EntryInst);
+  LoadInst *ParamValue = new LoadInst(ArgInParam, Arg->getName(), EntryInst);
+  new StoreInst(ParamValue, LocalCopy, EntryInst);
+}
+
+void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
+  Type *GenericPtrTy = Ptr->getType();
+  // Nothing to do for a pointer that is already in the global address space.
+  if (GenericPtrTy->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
+    return;
+
+  // Pick the insertion point for the addrspacecast pair: the start of the
+  // entry block for an argument, otherwise directly after the defining
+  // instruction.
+  BasicBlock::iterator InsertPt;
+  if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
+    InsertPt = Arg->getParent()->getEntryBlock().begin();
+  } else {
+    InsertPt = ++cast<Instruction>(Ptr)->getIterator();
+    assert(InsertPt != InsertPt->getParent()->end() &&
+           "We don't call this function with Ptr being a terminator.");
+  }
+  Instruction *InsertBefore = &*InsertPt;
+
+  // generic -> global
+  PointerType *GlobalPtrTy = PointerType::get(
+      GenericPtrTy->getPointerElementType(), ADDRESS_SPACE_GLOBAL);
+  Instruction *PtrInGlobal =
+      new AddrSpaceCastInst(Ptr, GlobalPtrTy, Ptr->getName(), InsertBefore);
+  // global -> generic
+  Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, GenericPtrTy,
+                                              Ptr->getName(), InsertBefore);
+
+  // Route every use of Ptr through the round-tripped pointer. That also
+  // rewrites the operand of PtrInGlobal, so point it back at Ptr afterwards.
+  Ptr->replaceAllUsesWith(PtrInGeneric);
+  PtrInGlobal->setOperand(0, Ptr);
+}
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXLowerArgs::runOnKernelFunction(Function &F) {
+  // The global-address-space marking is only done for the CUDA driver
+  // interface, and only when a target machine was supplied.
+  const bool IsCUDA = TM && TM->getDrvInterface() == NVPTX::CUDA;
+
+  if (IsCUDA) {
+    // Mark pointers in byval structs as global.
+    for (auto &B : F) {
+      for (auto &I : B) {
+        LoadInst *LI = dyn_cast<LoadInst>(&I);
+        if (!LI || !LI->getType()->isPointerTy())
+          continue;
+
+        Value *UO = GetUnderlyingObject(LI->getPointerOperand(),
+                                        F.getParent()->getDataLayout());
+        Argument *Arg = dyn_cast<Argument>(UO);
+        // LI is a load of a pointer from within a byval kernel parameter.
+        if (Arg && Arg->hasByValAttr())
+          markPointerAsGlobal(LI);
+      }
+    }
+  }
+
+  // Pointer arguments: byval ones get copied to a local alloca, the rest are
+  // marked as global under CUDA.
+  for (Argument &Arg : F.args()) {
+    if (!Arg.getType()->isPointerTy())
+      continue;
+
+    if (Arg.hasByValAttr())
+      handleByValParam(&Arg);
+    else if (IsCUDA)
+      markPointerAsGlobal(&Arg);
+  }
+  return true;
+}
+
+// For device functions the only lowering needed is copying each byval pointer
+// argument into local memory. Always reports the function as modified.
+bool NVPTXLowerArgs::runOnDeviceFunction(Function &F) {
+  for (Argument &Arg : F.args()) {
+    const bool IsByValPointer =
+        Arg.getType()->isPointerTy() && Arg.hasByValAttr();
+    if (!IsByValPointer)
+      continue;
+    handleByValParam(&Arg);
+  }
+  return true;
+}
+
+// Pass entry point: kernels and device functions are lowered differently.
+bool NVPTXLowerArgs::runOnFunction(Function &F) {
+  if (isKernelFunction(F))
+    return runOnKernelFunction(F);
+  return runOnDeviceFunction(F);
+}
+
+// Factory: returns a newly allocated NVPTXLowerArgs pass bound to TM.
+FunctionPass *
+llvm::createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM) {
+ return new NVPTXLowerArgs(TM);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
new file mode 100644
index 000000000000..eab5ee80561e
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -0,0 +1,60 @@
+//===-- NVPTXMCExpr.cpp - NVPTX specific MC expression classes ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCExpr.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-mcexpr"
+
+// Creates an NVPTXFloatMCExpr of the given kind holding Flt, using the
+// MCContext placement-new overload.
+const NVPTXFloatMCExpr *
+NVPTXFloatMCExpr::create(VariantKind Kind, const APFloat &Flt, MCContext &Ctx) {
+ return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
+}
+
+// Prints the constant as a hexadecimal bit pattern: "0f" followed by 8 hex
+// digits for single precision, or "0d" followed by 16 hex digits for double
+// precision. The value is first converted to the matching IEEE format
+// (round-to-nearest-even) and the digits are left-padded with zeros.
+void NVPTXFloatMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  bool Ignored;
+  unsigned NumHex;
+  APFloat APF = getAPFloat();
+
+  switch (Kind) {
+  default: llvm_unreachable("Invalid kind!");
+  case VK_NVPTX_SINGLE_PREC_FLOAT:
+    OS << "0f";
+    NumHex = 8;
+    APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Ignored);
+    break;
+  case VK_NVPTX_DOUBLE_PREC_FLOAT:
+    OS << "0d";
+    NumHex = 16;
+    APF.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &Ignored);
+    break;
+  }
+
+  APInt API = APF.bitcastToAPInt();
+  std::string HexStr(utohexstr(API.getZExtValue()));
+  // utohexstr drops leading zeros; restore them to reach the fixed width.
+  if (HexStr.length() < NumHex)
+    OS << std::string(NumHex - HexStr.length(), '0');
+  // Reuse the digits computed above instead of formatting the value again.
+  OS << HexStr;
+}
+
+// Creates an NVPTXGenericMCSymbolRefExpr wrapping SymExpr, using the
+// MCContext placement-new overload.
+const NVPTXGenericMCSymbolRefExpr*
+NVPTXGenericMCSymbolRefExpr::create(const MCSymbolRefExpr *SymExpr,
+ MCContext &Ctx) {
+ return new (Ctx) NVPTXGenericMCSymbolRefExpr(SymExpr);
+}
+
+// Prints the wrapped symbol reference enclosed in "generic(" ... ")".
+void NVPTXGenericMCSymbolRefExpr::printImpl(raw_ostream &OS,
+ const MCAsmInfo *MAI) const {
+ OS << "generic(";
+ SymExpr->print(OS, MAI);
+ OS << ")";
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
new file mode 100644
index 000000000000..7f833c42fa8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -0,0 +1,125 @@
+//===-- NVPTXMCExpr.h - NVPTX specific MC expression classes ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Modeled after ARMMCExpr
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXMCEXPR_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXMCEXPR_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/MC/MCExpr.h"
+#include <utility>
+
+namespace llvm {
+
+/// Target-specific MC expression holding a floating-point constant together
+/// with the precision (single or double) it is to be printed in.
+class NVPTXFloatMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_NVPTX_None,
+ VK_NVPTX_SINGLE_PREC_FLOAT, // FP constant in single-precision
+ VK_NVPTX_DOUBLE_PREC_FLOAT // FP constant in double-precision
+ };
+
+private:
+ const VariantKind Kind;
+ const APFloat Flt;
+
+ // Private: instances are obtained through the create* factories below.
+ explicit NVPTXFloatMCExpr(VariantKind Kind, APFloat Flt)
+ : Kind(Kind), Flt(std::move(Flt)) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const NVPTXFloatMCExpr *create(VariantKind Kind, const APFloat &Flt,
+ MCContext &Ctx);
+
+ /// Convenience wrapper: create a single-precision constant expression.
+ static const NVPTXFloatMCExpr *createConstantFPSingle(const APFloat &Flt,
+ MCContext &Ctx) {
+ return create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
+ }
+
+ /// Convenience wrapper: create a double-precision constant expression.
+ static const NVPTXFloatMCExpr *createConstantFPDouble(const APFloat &Flt,
+ MCContext &Ctx) {
+ return create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
+ }
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getKind - Get the variant kind (precision) of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// getAPFloat - Get a copy of the floating-point constant held by this
+ /// expression.
+ APFloat getAPFloat() const { return Flt; }
+
+/// @}
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ // This expression is never evaluated as a relocatable value.
+ bool evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override {
+ return false;
+ }
+ void visitUsedExpr(MCStreamer &Streamer) const override {};
+ MCFragment *findAssociatedFragment() const override { return nullptr; }
+
+ // There are no TLS NVPTXMCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+ // Accepts any target expression; the check does not distinguish between the
+ // different NVPTX target expression classes.
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+};
+
+/// A wrapper for MCSymbolRefExpr that tells the assembly printer that the
+/// symbol should be enclosed by generic().
+class NVPTXGenericMCSymbolRefExpr : public MCTargetExpr {
+private:
+ // The wrapped symbol reference; stored as a plain pointer.
+ const MCSymbolRefExpr *SymExpr;
+
+ // Private: instances are obtained through create() below.
+ explicit NVPTXGenericMCSymbolRefExpr(const MCSymbolRefExpr *_SymExpr)
+ : SymExpr(_SymExpr) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const NVPTXGenericMCSymbolRefExpr
+ *create(const MCSymbolRefExpr *SymExpr, MCContext &Ctx);
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getSymbolExpr - Get the symbol reference wrapped by this expression.
+ const MCSymbolRefExpr *getSymbolExpr() const { return SymExpr; }
+
+ /// @}
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ // This expression is never evaluated as a relocatable value.
+ bool evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override {
+ return false;
+ }
+ void visitUsedExpr(MCStreamer &Streamer) const override {};
+ MCFragment *findAssociatedFragment() const override { return nullptr; }
+
+ // There are no TLS NVPTXMCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+ // Accepts any target expression; the check does not distinguish between the
+ // different NVPTX target expression classes.
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+ };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
new file mode 100644
index 000000000000..10f1135ad841
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -0,0 +1,51 @@
+//===-- NVPTXMachineFunctionInfo.h - NVPTX-specific Function Info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is attached to a MachineFunction instance and tracks target-
+// dependent information
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
+private:
+  /// Stores a mapping from index to symbol name for removing image handles
+  /// on Fermi.
+  SmallVector<std::string, 8> ImageHandleList;
+
+public:
+  NVPTXMachineFunctionInfo(MachineFunction &MF) {}
+
+  /// Returns the index for the symbol \p Symbol. If the symbol was previously
+  /// added, the same index is returned. Otherwise, the symbol is added and the
+  /// new index is returned.
+  unsigned getImageHandleSymbolIndex(const char *Symbol) {
+    // Is the symbol already present? std::string compares directly against a
+    // C string, so no temporary std::string is built per iteration.
+    for (unsigned i = 0, e = ImageHandleList.size(); i != e; ++i)
+      if (ImageHandleList[i] == Symbol)
+        return i;
+    // Nope, insert it
+    ImageHandleList.push_back(Symbol);
+    return ImageHandleList.size() - 1;
+  }
+
+  /// Returns the symbol name at the given index. The returned pointer refers
+  /// to storage owned by the stored string.
+  const char *getImageHandleSymbol(unsigned Idx) const {
+    assert(ImageHandleList.size() > Idx && "Bad index");
+    return ImageHandleList[Idx].c_str();
+  }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
new file mode 100644
index 000000000000..49e639793efc
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -0,0 +1,157 @@
+//===-- NVPTXPeephole.cpp - NVPTX Peephole Optimizations ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// In NVPTX, NVPTXFrameLowering will emit the following instructions at the
+// beginning of a MachineFunction.
+//
+// mov %SPL, %depot
+// cvta.local %SP, %SPL
+//
+// Because Frame Index is a generic address and alloca can only return generic
+// pointer, without this pass the instructions producing alloca'ed address will
+// be based on %SP. NVPTXLowerAlloca tends to help replace store and load on
+// this address with their .local versions, but this may introduce a lot of
+// cvta.to.local instructions. Performance can be improved if we avoid casting
+// address back and forth and directly calculate local address based on %SPL.
+// This peephole pass optimizes these cases, for example
+//
+// It will transform the following pattern
+// %vreg0<def> = LEA_ADDRi64 %VRFrame, 4
+// %vreg1<def> = cvta_to_local_yes_64 %vreg0
+//
+// into
+// %vreg1<def> = LEA_ADDRi64 %VRFrameLocal, 4
+//
+// %VRFrameLocal is the virtual register name of %SPL
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-peephole"
+
+namespace llvm {
+void initializeNVPTXPeepholePass(PassRegistry &);
+}
+
+namespace {
+// Machine-function pass that folds a cvta.to.local of a frame-based LEA_ADDRi
+// into a single LEA_ADDRi on %VRFrameLocal (see runOnMachineFunction).
+struct NVPTXPeephole : public MachineFunctionPass {
+ public:
+ static char ID;
+ // Registers the pass with the global PassRegistry on construction.
+ NVPTXPeephole() : MachineFunctionPass(ID) {
+ initializeNVPTXPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "NVPTX optimize redundant cvta.to.local instruction";
+ }
+
+ // Adds nothing beyond what MachineFunctionPass itself declares.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+char NVPTXPeephole::ID = 0;
+
+INITIALIZE_PASS(NVPTXPeephole, "nvptx-peephole", "NVPTX Peephole", false, false)
+
+// Returns true when Root is a cvta.to.local (32- or 64-bit) whose source is a
+// virtual register uniquely defined, in the same block, by a LEA_ADDRi /
+// LEA_ADDRi64 that is based on %VRFrame.
+static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
+  auto &MBB = *Root.getParent();
+  auto &MF = *MBB.getParent();
+
+  // Root itself has to be cvta.to.local.
+  const unsigned RootOpc = Root.getOpcode();
+  if (RootOpc != NVPTX::cvta_to_local_yes_64 &&
+      RootOpc != NVPTX::cvta_to_local_yes)
+    return false;
+
+  // Its source must be a virtual register with a unique definition.
+  auto &Op = Root.getOperand(1);
+  const auto &MRI = MF.getRegInfo();
+  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+    return false;
+  MachineInstr *GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
+  if (!GenericAddrDef)
+    return false;
+
+  // That definition must be a LEA_ADDRi in the same basic block.
+  if (GenericAddrDef->getParent() != &MBB)
+    return false;
+  const unsigned DefOpc = GenericAddrDef->getOpcode();
+  if (DefOpc != NVPTX::LEA_ADDRi64 && DefOpc != NVPTX::LEA_ADDRi)
+    return false;
+
+  // And the LEA_ADDRi base has to be the frame register.
+  auto &BaseAddrOp = GenericAddrDef->getOperand(1);
+  return BaseAddrOp.isReg() && BaseAddrOp.getReg() == NVPTX::VRFrame;
+}
+
+// Replaces Root (a cvta.to.local accepted by
+// isCVTAToLocalCombinationCandidate) with a copy of its defining LEA_ADDRi
+// that is based on %VRFrameLocal and writes Root's destination register.
+// The defining LEA_ADDRi is removed as well when Root was its only
+// non-debug user.
+static void CombineCVTAToLocal(MachineInstr &Root) {
+  auto &MBB = *Root.getParent();
+  auto &MF = *MBB.getParent();
+  const auto &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  // The LEA_ADDRi producing the generic address consumed by Root.
+  auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+
+  // Same opcode and offset operand as Prev, but relative to the local frame
+  // register and defining Root's result directly.
+  MachineInstrBuilder MIB =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
+              Root.getOperand(0).getReg());
+  MIB.addReg(NVPTX::VRFrameLocal);
+  MIB.addOperand(Prev.getOperand(2));
+
+  MBB.insert(MachineBasicBlock::iterator(&Root), MIB);
+
+  // Query the use count while Root is still in the function: Prev is dead
+  // only if Root is its single non-debug use.
+  const bool PrevOnlyFeedsRoot =
+      MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg());
+  if (PrevOnlyFeedsRoot)
+    Prev.eraseFromParentAndMarkDBGValuesForRemoval();
+  Root.eraseFromParentAndMarkDBGValuesForRemoval();
+}
+
+// Runs the peephole over every instruction of MF: each cvta.to.local that
+// qualifies is folded into its frame-based LEA_ADDRi. Afterwards, if %VRFrame
+// has no uses left, its unique definition is removed too.
+// Returns true if any instruction was inserted or erased.
+bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  bool Changed = false;
+  // Loop over all of the basic blocks.
+  for (auto &MBB : MF) {
+    // Traverse the basic block.
+    auto BlockIter = MBB.begin();
+
+    while (BlockIter != MBB.end()) {
+      // Advance before processing: CombineCVTAToLocal erases MI.
+      auto &MI = *BlockIter++;
+      if (isCVTAToLocalCombinationCandidate(MI)) {
+        CombineCVTAToLocal(MI);
+        Changed = true;
+      }
+    } // Instruction
+  } // Basic Block
+
+  // Remove unnecessary %VRFrame = cvta.local %VRFrameLocal
+  const auto &MRI = MF.getRegInfo();
+  if (MRI.use_empty(NVPTX::VRFrame)) {
+    if (auto MI = MRI.getUniqueVRegDef(NVPTX::VRFrame)) {
+      MI->eraseFromParentAndMarkDBGValuesForRemoval();
+      // Erasing the definition modifies the function even when no
+      // cvta.to.local was combined above, so it must be reported.
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+// Factory: returns a newly allocated NVPTXPeephole pass instance.
+MachineFunctionPass *llvm::createNVPTXPeephole() { return new NVPTXPeephole(); }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
new file mode 100644
index 000000000000..88288abe64f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -0,0 +1,227 @@
+//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified
+// to remove unneeded functionality and to handle virtual registers. Most code
+// here is a copy of PrologEpilogInserter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-prolog-epilog"
+
+namespace {
+/// Machine-function pass that assigns offsets to frame objects, rewrites
+/// frame-index operands, and asks the target frame lowering to emit the
+/// function prologue and epilogue.
+class NVPTXPrologEpilogPass : public MachineFunctionPass {
+public:
+ static char ID;
+ NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// Compute the offset of every frame object of \p Fn and record the
+ /// resulting stack size in the function's frame info.
+ void calculateFrameObjectOffsets(MachineFunction &Fn);
+};
+}
+
+/// Factory for the NVPTX prolog/epilog pass; returns a newly allocated
+/// instance.
+MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() {
+  MachineFunctionPass *Pass = new NVPTXPrologEpilogPass();
+  return Pass;
+}
+
+char NVPTXPrologEpilogPass::ID = 0;
+
+/// Lay out the stack frame of \p MF, replace every frame-index operand via
+/// the target's eliminateFrameIndex hook, then emit the prologue into the
+/// entry block and an epilogue into every return block.
+///
+/// \returns true if at least one frame-index operand was rewritten.
+bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
+  const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
+  const TargetFrameLowering &TFL = *Subtarget.getFrameLowering();
+  const TargetRegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
+
+  calculateFrameObjectOffsets(MF);
+
+  bool RewroteFrameIndex = false;
+  for (MachineBasicBlock &Block : MF)
+    for (MachineInstr &Instr : Block)
+      for (unsigned OpIdx = 0, NumOps = Instr.getNumOperands();
+           OpIdx != NumOps; ++OpIdx)
+        if (Instr.getOperand(OpIdx).isFI()) {
+          RegInfo.eliminateFrameIndex(Instr, 0, OpIdx, nullptr);
+          RewroteFrameIndex = true;
+        }
+
+  // Add function prolog/epilog
+  TFL.emitPrologue(MF, MF.front());
+
+  // Every block that ends in a return gets an epilogue.
+  for (MachineBasicBlock &Block : MF)
+    if (Block.isReturnBlock())
+      TFL.emitEpilogue(MF, Block);
+
+  return RewroteFrameIndex;
+}
+
+/// AdjustStackOffset - Place frame object \p FrameIdx at the next suitably
+/// aligned position and advance \p Offset past it.
+///
+/// \p Offset is the running distance from the frame start in the direction
+/// of stack growth; \p MaxAlign is raised to the object's alignment when
+/// that alignment is larger.
+static inline void
+AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+                  bool StackGrowsDown, int64_t &Offset,
+                  unsigned &MaxAlign) {
+  // For a downward-growing stack the object's lowest address lies past its
+  // size, so account for the size before aligning.
+  if (StackGrowsDown)
+    Offset += MFI.getObjectSize(FrameIdx);
+
+  const unsigned ObjAlign = MFI.getObjectAlignment(FrameIdx);
+
+  // The frame must be at least as aligned as its most aligned object.
+  MaxAlign = std::max(MaxAlign, ObjAlign);
+
+  // Round the running offset up to the object's alignment.
+  Offset = (Offset + ObjAlign - 1) / ObjAlign * ObjAlign;
+
+  const int64_t ObjOffset = StackGrowsDown ? -Offset : Offset;
+  DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << ObjOffset << "]\n");
+  MFI.setObjectOffset(FrameIdx, ObjOffset); // Set the computed offset
+
+  // For an upward-growing stack the size is consumed after placement.
+  if (!StackGrowsDown)
+    Offset += MFI.getObjectSize(FrameIdx);
+}
+
+/// Assign an offset to every live stack object of \p Fn and store the total
+/// frame size with MachineFrameInfo::setStackSize.
+///
+/// The layout proceeds in this order: skip past the local-area offset and
+/// any fixed objects, place the pre-allocated local block (when enabled),
+/// place the remaining objects one by one with AdjustStackOffset, and
+/// finally round the size up unless the target handles frame rounding
+/// itself.
+void
+NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
+
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo &MFI = Fn.getFrameInfo();
+
+ // Start at the beginning of the local area.
+ // The Offset is the distance from the stack top in the direction
+ // of stack growth -- so it's always nonnegative.
+ int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+ if (StackGrowsDown)
+ LocalAreaOffset = -LocalAreaOffset;
+ assert(LocalAreaOffset >= 0
+ && "Local area offset should be in direction of stack growth");
+ int64_t Offset = LocalAreaOffset;
+
+ // If there are fixed sized objects that are preallocated in the local area,
+ // non-fixed objects can't be allocated right at the start of local area.
+ // We currently don't support filling in holes in between fixed sized
+ // objects, so we adjust 'Offset' to point to the end of last fixed sized
+ // preallocated object.
+ // The loop runs from getObjectIndexBegin() up to (not including) index 0.
+ for (int i = MFI.getObjectIndexBegin(); i != 0; ++i) {
+ int64_t FixedOff;
+ if (StackGrowsDown) {
+ // The maximum distance from the stack pointer is at lower address of
+ // the object -- which is given by offset. For down growing stack
+ // the offset is negative, so we negate the offset to get the distance.
+ FixedOff = -MFI.getObjectOffset(i);
+ } else {
+ // The maximum distance from the start pointer is at the upper
+ // address of the object.
+ FixedOff = MFI.getObjectOffset(i) + MFI.getObjectSize(i);
+ }
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+
+ // NOTE: We do not have a call stack
+
+ unsigned MaxAlign = MFI.getMaxAlignment();
+
+ // No scavenger
+
+ // FIXME: Once this is working, then enable flag will change to a target
+ // check for whether the frame is large enough to want to use virtual
+ // frame index registers. Functions which don't want/need this optimization
+ // will continue to use the existing code path.
+ if (MFI.getUseLocalStackAllocationBlock()) {
+ unsigned Align = MFI.getLocalFrameMaxAlign();
+
+ // Adjust to alignment boundary.
+ Offset = (Offset + Align - 1) / Align * Align;
+
+ DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+ // Resolve offsets for objects in the local block.
+ for (unsigned i = 0, e = MFI.getLocalFrameObjectCount(); i != e; ++i) {
+ std::pair<int, int64_t> Entry = MFI.getLocalFrameObjectMap(i);
+ // Entry.first is the frame index; Entry.second is added to the block's
+ // base offset to obtain the object's final offset.
+ int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+ DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+ FIOffset << "]\n");
+ MFI.setObjectOffset(Entry.first, FIOffset);
+ }
+ // Allocate the local block
+ Offset += MFI.getLocalFrameSize();
+
+ MaxAlign = std::max(Align, MaxAlign);
+ }
+
+ // No stack protector
+
+ // Then assign frame offsets to the remaining stack objects, skipping dead
+ // objects and those already placed as part of the local block above.
+ for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+ if (MFI.isObjectPreAllocated(i) &&
+ MFI.getUseLocalStackAllocationBlock())
+ continue;
+ if (MFI.isDeadObjectIndex(i))
+ continue;
+
+ AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+ }
+
+ // No scavenger
+
+ if (!TFI.targetHandlesStackFrameRounding()) {
+ // If we have reserved argument space for call sites in the function
+ // immediately on entry to the current function, count it as part of the
+ // overall stack size.
+ if (MFI.adjustsStack() && TFI.hasReservedCallFrame(Fn))
+ Offset += MFI.getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+ // any calls or alloca's, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ unsigned StackAlign;
+ if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
+ (RegInfo->needsStackRealignment(Fn) && MFI.getObjectIndexEnd() != 0))
+ StackAlign = TFI.getStackAlignment();
+ else
+ StackAlign = TFI.getTransientStackAlignment();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ // NOTE(review): the mask arithmetic below assumes StackAlign is a power
+ // of two -- confirm against the target's alignment values.
+ unsigned AlignMask = StackAlign - 1;
+ Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+ }
+
+ // Update frame info to pretend that this is part of the stack...
+ int64_t StackSize = Offset - LocalAreaOffset;
+ MFI.setStackSize(StackSize);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
new file mode 100644
index 000000000000..6cbf0604d7ef
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -0,0 +1,128 @@
+//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXRegisterInfo.h"
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-reg-info"
+
+namespace llvm {
+/// Map register class \p RC to the PTX type suffix string for that class.
+///
+/// Returns "!Special!" for the special register class and "INTERNAL" for any
+/// class that is not listed here.
+std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
+  if (RC == &NVPTX::Float32RegsRegClass)
+    return ".f32";
+  if (RC == &NVPTX::Float64RegsRegClass)
+    return ".f64";
+
+  // We use untyped (.b) integer registers here as NVCC does.
+  // Correctness of generated code does not depend on register type,
+  // but using .s/.u registers runs into ptxas bug that prevents
+  // assembly of otherwise valid PTX into SASS. Despite PTX ISA
+  // specifying only argument size for fp16 instructions, ptxas does
+  // not allow using .s16 or .u16 arguments for .fp16
+  // instructions. At the same time it allows using .s32/.u32
+  // arguments for .fp16v2 instructions:
+  //
+  //   .reg .b16 rb16
+  //   .reg .s16 rs16
+  //   add.f16 rb16,rb16,rb16; // OK
+  //   add.f16 rs16,rs16,rs16; // Arguments mismatch for instruction 'add'
+  // but:
+  //   .reg .b32 rb32
+  //   .reg .s32 rs32
+  //   add.f16v2 rb32,rb32,rb32; // OK
+  //   add.f16v2 rs32,rs32,rs32; // OK
+  if (RC == &NVPTX::Int64RegsRegClass)
+    return ".b64";
+  if (RC == &NVPTX::Int32RegsRegClass)
+    return ".b32";
+  if (RC == &NVPTX::Int16RegsRegClass)
+    return ".b16";
+  if (RC == &NVPTX::Int1RegsRegClass)
+    return ".pred";
+  if (RC == &NVPTX::SpecialRegsRegClass)
+    return "!Special!";
+
+  return "INTERNAL";
+}
+
+/// Map register class \p RC to the register-name prefix string for that
+/// class (for example "%rd" for the 64-bit integer class).
+///
+/// Returns "!Special!" for the special register class and "INTERNAL" for any
+/// class that is not listed here.
+std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
+  if (RC == &NVPTX::Float32RegsRegClass)
+    return "%f";
+  if (RC == &NVPTX::Float64RegsRegClass)
+    return "%fd";
+  if (RC == &NVPTX::Int64RegsRegClass)
+    return "%rd";
+  if (RC == &NVPTX::Int32RegsRegClass)
+    return "%r";
+  if (RC == &NVPTX::Int16RegsRegClass)
+    return "%rs";
+  if (RC == &NVPTX::Int1RegsRegClass)
+    return "%p";
+  if (RC == &NVPTX::SpecialRegsRegClass)
+    return "!Special!";
+
+  return "INTERNAL";
+}
+}
+
+// Default constructor; forwards 0 to the TableGen-generated base class.
+// NOTE(review): presumably the 0 is the return-address register argument of
+// the generated constructor -- confirm against NVPTXGenRegisterInfo.inc.
+NVPTXRegisterInfo::NVPTXRegisterInfo() : NVPTXGenRegisterInfo(0) {}
+
+#define GET_REGINFO_TARGET_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+/// NVPTX Callee Saved Registers
+///
+/// Returns a static list whose only entry is 0 (presumably the list
+/// terminator, i.e. no callee-saved registers). The function argument is
+/// ignored.
+const MCPhysReg *
+NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const {
+  static const MCPhysReg NoCalleeSaved[] = {0};
+  return NoCalleeSaved;
+}
+
+/// Returns a bit vector with one bit per register and no bit set, i.e. no
+/// register is marked reserved. \p MF is not consulted.
+BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  return BitVector(getNumRegs());
+}
+
+/// Replace the frame-index operand at position \p FIOperandNum of the
+/// instruction at \p II with the VRFrame register, and fold the frame
+/// object's offset into the immediate operand that immediately follows it.
+///
+/// \param SPAdj must be 0 (asserted).
+/// \param RS unused.
+void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                            int SPAdj, unsigned FIOperandNum,
+                                            RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  // Keep the sum in 64 bits: the object offset and the immediate are both
+  // 64-bit values and ChangeToImmediate accepts a 64-bit value, so storing
+  // the sum in an int would silently truncate large offsets.
+  int64_t Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+                   MI.getOperand(FIOperandNum + 1).getImm();
+
+  // Using VRFrame as the frame pointer
+  MI.getOperand(FIOperandNum).ChangeToRegister(NVPTX::VRFrame, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+/// Returns the register used as the frame register, which is always
+/// NVPTX::VRFrame; \p MF is not consulted.
+unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return NVPTX::VRFrame;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
new file mode 100644
index 000000000000..c310a9c1ad0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -0,0 +1,65 @@
+//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXREGISTERINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXREGISTERINFO_H
+
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <sstream>
+
+#define GET_REGINFO_HEADER
+#include "NVPTXGenRegisterInfo.inc"
+
+namespace llvm {
+/// NVPTX implementation of the target register info, layered on the
+/// TableGen-generated NVPTXGenRegisterInfo. It also owns a string pool used
+/// to hand out register-name strings.
+class NVPTXRegisterInfo : public NVPTXGenRegisterInfo {
+private:
+ // Hold Strings that can be free'd all together with NVPTXRegisterInfo
+ ManagedStringPool ManagedStrPool;
+
+public:
+ NVPTXRegisterInfo();
+
+ //------------------------------------------------------
+ // Pure virtual functions from TargetRegisterInfo
+ //------------------------------------------------------
+
+ // NVPTX callee saved registers
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ // Returns the string pool as a mutable pointer; the const_cast allows
+ // const member functions such as getName() to add strings to it.
+ ManagedStringPool *getStrPool() const {
+ return const_cast<ManagedStringPool *>(&ManagedStrPool);
+ }
+
+ // Builds the string "reg<RegNo>", stores it in the string pool and returns
+ // the pooled C string.
+ const char *getName(unsigned RegNo) const {
+ std::stringstream O;
+ O << "reg" << RegNo;
+ return getStrPool()->getManagedString(O.str().c_str())->c_str();
+ }
+
+};
+
+std::string getNVPTXRegClassName(const TargetRegisterClass *RC);
+std::string getNVPTXRegClassStr(const TargetRegisterClass *RC);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
new file mode 100644
index 000000000000..ff6ccc457db7
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -0,0 +1,69 @@
+//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the PTX register file
+//===----------------------------------------------------------------------===//
+
+// Base class for NVPTX registers: forwards the name string 'n' to Register
+// and places the register in the "NVPTX" namespace.
+class NVPTXReg<string n> : Register<n> {
+ let Namespace = "NVPTX";
+}
+
+// Base class for NVPTX register classes: forwards the value types, alignment
+// and register list to RegisterClass with the namespace fixed to "NVPTX".
+class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
+ : RegisterClass <"NVPTX", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+// Special registers used as stack pointers, named "%SP" and "%SPL".
+def VRFrame : NVPTXReg<"%SP">;
+def VRFrameLocal : NVPTXReg<"%SPL">;
+
+// Special register used as the stack, named "%Depot".
+def VRDepot : NVPTXReg<"%Depot">;
+
+// We use virtual registers, but define a few physical registers here to keep
+// SDAG and the MachineInstr layers happy.
+// Five registers (indices 0 through 4) are defined for each kind below.
+foreach i = 0-4 in {
+ def P#i : NVPTXReg<"%p"#i>; // Predicate
+ def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
+ def R#i : NVPTXReg<"%r"#i>; // 32-bit
+ def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
+ def F#i : NVPTXReg<"%f"#i>; // 32-bit float
+ def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
+
+ // Arguments
+ def ia#i : NVPTXReg<"%ia"#i>;
+ def la#i : NVPTXReg<"%la"#i>;
+ def fa#i : NVPTXReg<"%fa"#i>;
+ def da#i : NVPTXReg<"%da"#i>;
+}
+
+// Thirty-two registers named "%envreg0" through "%envreg31".
+foreach i = 0-31 in {
+ def ENVREG#i : NVPTXReg<"%envreg"#i>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register classes
+//===----------------------------------------------------------------------===//
+// Each class lists its value type, its alignment and the five registers
+// (indices 0 through 4) that belong to it.
+def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
+def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
+def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
+def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
+def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
+def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
+def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
+def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 4))>;
+def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
+def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
+
+// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
+// This class groups VRFrame, VRFrameLocal, VRDepot and all ENVREG registers.
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
+ (sequence "ENVREG%u", 0, 31))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
new file mode 100644
index 000000000000..2022caca76ee
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -0,0 +1,191 @@
+//===-- NVPTXReplaceImageHandles.cpp - Replace image handles for Fermi ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// On Fermi, image handles are not supported. To work around this, we traverse
+// the machine code and replace image handles with concrete symbols. For this
+// to work reliably, inlining of all function calls must be performed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+/// Machine-function pass that replaces register operands holding image
+/// handles with immediate symbol indices and then erases the instructions
+/// that produced those handles.
+class NVPTXReplaceImageHandles : public MachineFunctionPass {
+private:
+ static char ID;
+ // Handle-defining instructions collected while processing a function;
+ // they are erased at the end of runOnMachineFunction.
+ DenseSet<MachineInstr *> InstrsToRemove;
+
+public:
+ NVPTXReplaceImageHandles();
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "NVPTX Replace Image Handles";
+ }
+private:
+ // Rewrites the handle operands of MI if it is a texture, surface or query
+ // instruction; returns true for such instructions.
+ bool processInstr(MachineInstr &MI);
+ // Turns Op into an immediate index when an index can be found for it.
+ void replaceImageHandle(MachineOperand &Op, MachineFunction &MF);
+ // Looks up the symbol index for the handle in Op; on success stores it in
+ // Idx and returns true.
+ bool findIndexForHandle(MachineOperand &Op, MachineFunction &MF,
+ unsigned &Idx);
+};
+}
+
+char NVPTXReplaceImageHandles::ID = 0;
+
+// Constructs the pass, passing the pass ID to MachineFunctionPass.
+NVPTXReplaceImageHandles::NVPTXReplaceImageHandles()
+ : MachineFunctionPass(ID) {}
+
+/// Process every instruction of \p MF with processInstr, then erase all
+/// handle-defining instructions that were collected along the way.
+///
+/// \returns true if processInstr reported a change for any instruction.
+bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  InstrsToRemove.clear();
+
+  for (MachineBasicBlock &MBB : MF)
+    for (MachineInstr &MI : MBB)
+      Changed |= processInstr(MI);
+
+  // Now clean up any handle-access instructions
+  // This is needed in debug mode when code cleanup passes are not executed,
+  // but we need the handle access to be eliminated because they are not
+  // valid instructions when image handles are disabled.
+  for (MachineInstr *HandleDef : InstrsToRemove)
+    HandleDef->eraseFromParent();
+
+  return Changed;
+}
+
+/// Replace the image-handle operands of \p MI according to the kind of
+/// instruction encoded in its target-specific flags.
+///
+/// \returns true if \p MI is a texture fetch, surface load, surface store or
+/// surface/texture query; false for every other instruction.
+bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const auto Flags = MI.getDesc().TSFlags;
+
+  if (Flags & NVPTXII::IsTexFlag) {
+    // This is a texture fetch, so operand 4 is a texref and operand 5 is
+    // a samplerref
+    replaceImageHandle(MI.getOperand(4), MF);
+    if (!(Flags & NVPTXII::IsTexModeUnifiedFlag))
+      replaceImageHandle(MI.getOperand(5), MF);
+    return true;
+  }
+
+  if (const auto SuldBits = Flags & NVPTXII::IsSuldMask) {
+    // For a surface load of vector size N, the Nth operand will be the surfref
+    const unsigned VecSize = 1 << ((SuldBits >> NVPTXII::IsSuldShift) - 1);
+    replaceImageHandle(MI.getOperand(VecSize), MF);
+    return true;
+  }
+
+  if (Flags & NVPTXII::IsSustFlag) {
+    // This is a surface store, so operand 0 is a surfref
+    replaceImageHandle(MI.getOperand(0), MF);
+    return true;
+  }
+
+  if (Flags & NVPTXII::IsSurfTexQueryFlag) {
+    // This is a query, so operand 1 is a surfref/texref
+    replaceImageHandle(MI.getOperand(1), MF);
+    return true;
+  }
+
+  return false;
+}
+
+/// Turn \p Op into an immediate holding the image-handle symbol index, if
+/// findIndexForHandle can determine one; otherwise leave \p Op untouched.
+void NVPTXReplaceImageHandles::
+replaceImageHandle(MachineOperand &Op, MachineFunction &MF) {
+  unsigned SymbolIdx;
+  if (!findIndexForHandle(Op, MF, SymbolIdx))
+    return;
+  Op.ChangeToImmediate(SymbolIdx);
+}
+
+/// Determine the image-handle symbol index for the handle held in register
+/// operand \p Op by inspecting the instruction that defines the register.
+///
+/// On success the index is stored in \p Idx, the defining instruction is
+/// queued in InstrsToRemove, and true is returned. Returns false, without
+/// writing \p Idx, for parameter loads when the driver interface is CUDA.
+/// Any defining opcode other than those handled below is treated as
+/// unreachable.
+bool NVPTXReplaceImageHandles::
+findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ NVPTXMachineFunctionInfo *MFI = MF.getInfo<NVPTXMachineFunctionInfo>();
+
+ assert(Op.isReg() && "Handle is not in a reg?");
+
+ // Which instruction defines the handle?
+ MachineInstr &TexHandleDef = *MRI.getVRegDef(Op.getReg());
+
+ switch (TexHandleDef.getOpcode()) {
+ case NVPTX::LD_i64_avar: {
+ // The handle is a parameter value being loaded, replace with the
+ // parameter symbol
+ const NVPTXTargetMachine &TM =
+ static_cast<const NVPTXTargetMachine &>(MF.getTarget());
+ if (TM.getDrvInterface() == NVPTX::CUDA) {
+ // For CUDA, we preserve the param loads coming from function arguments
+ return false;
+ }
+
+ assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
+ StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
+ std::string ParamBaseName = MF.getName();
+ ParamBaseName += "_param_";
+ assert(Sym.startswith(ParamBaseName) && "Invalid symbol reference");
+ // Parse the parameter number that follows "<function>_param_".
+ // NOTE(review): atoi needs NUL-terminated input; this relies on the
+ // symbol name being a C string -- confirm.
+ unsigned Param = atoi(Sym.data()+ParamBaseName.size());
+ std::string NewSym;
+ raw_string_ostream NewSymStr(NewSym);
+ NewSymStr << MF.getFunction()->getName() << "_param_" << Param;
+
+ InstrsToRemove.insert(&TexHandleDef);
+ Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str());
+ return true;
+ }
+ case NVPTX::texsurf_handles: {
+ // The handle is a global variable, replace with the global variable name
+ assert(TexHandleDef.getOperand(1).isGlobal() && "Load is not a global!");
+ const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal();
+ assert(GV->hasName() && "Global sampler must be named!");
+ InstrsToRemove.insert(&TexHandleDef);
+ Idx = MFI->getImageHandleSymbolIndex(GV->getName().data());
+ return true;
+ }
+ case NVPTX::nvvm_move_i64:
+ case TargetOpcode::COPY: {
+ // Moves and copies forward the handle: resolve their source operand
+ // recursively and remove this instruction only if that succeeded.
+ bool Res = findIndexForHandle(TexHandleDef.getOperand(1), MF, Idx);
+ if (Res) {
+ InstrsToRemove.insert(&TexHandleDef);
+ }
+ return Res;
+ }
+ default:
+ llvm_unreachable("Unknown instruction operating on handle");
+ }
+}
+
+/// Factory for the image-handle replacement pass; returns a newly allocated
+/// instance.
+MachineFunctionPass *llvm::createNVPTXReplaceImageHandlesPass() {
+  MachineFunctionPass *Pass = new NVPTXReplaceImageHandles();
+  return Pass;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
new file mode 100644
index 000000000000..cad4f5668fdf
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
@@ -0,0 +1,43 @@
+//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTXSection class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
+
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCSection.h"
+
+namespace llvm {
+/// Represents a section in PTX. PTX does not have sections; we create this
+/// class in order to use the ASMPrint interface.
+///
+class NVPTXSection final : public MCSection {
+ virtual void anchor();
+public:
+ NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {}
+ ~NVPTXSection() {}
+
+ /// Override this as NVPTX has its own way of printing switching
+ /// to a section. This override intentionally prints nothing.
+ void PrintSwitchToSection(const MCAsmInfo &MAI,
+ raw_ostream &OS,
+ const MCExpr *Subsection) const override {}
+
+ /// Base address of PTX sections is zero.
+ bool UseCodeAlign() const override { return false; }
+ bool isVirtualSection() const override { return false; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
new file mode 100644
index 000000000000..6e1f427ed021
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -0,0 +1,59 @@
+//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-subtarget"
+
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "NVPTXGenSubtargetInfo.inc"
+
+// Pin the vtable to this file.
+void NVPTXSubtarget::anchor() {}
+
+/// Select the target name, parse the subtarget feature string, and apply
+/// defaults for anything the features left unset.
+///
+/// \param CPU target name; "sm_20" is used when this is empty.
+/// \param FS  feature string handed to ParseSubtargetFeatures.
+/// \returns *this, so the call can be used inside a member initializer.
+NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                                StringRef FS) {
+  // Provide the default CPU if we don't have one.
+  TargetName = CPU.empty() ? "sm_20" : CPU;
+
+  // The constructor does not initialize HasAtomScope, so give it a defined
+  // value before feature parsing; otherwise hasAtomScope() reads an
+  // indeterminate bool whenever the feature string does not set it. This
+  // must happen here rather than in a member initializer, because that
+  // initializer would run after this function and overwrite the parsed
+  // value.
+  HasAtomScope = false;
+
+  ParseSubtargetFeatures(TargetName, FS);
+
+  // Set default to PTX 3.2 (CUDA 5.5)
+  if (PTXVersion == 0) {
+    PTXVersion = 32;
+  }
+
+  return *this;
+}
+
+// Constructs the subtarget. PTXVersion starts at 0 and SmVersion at 20;
+// feature parsing happens as a side effect of initializing TLInfo, which
+// calls initializeSubtargetDependencies(CPU, FS) to obtain its subtarget
+// argument.
+NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS,
+ const NVPTXTargetMachine &TM)
+ : NVPTXGenSubtargetInfo(TT, CPU, FS), PTXVersion(0), SmVersion(20), TM(TM),
+ InstrInfo(), TLInfo(TM, initializeSubtargetDependencies(CPU, FS)),
+ FrameLowering() {}
+
+/// Whether image handles are available: only with the CUDA driver interface
+/// on SM version 30 or newer (Kepler+, where CUDA supports indirect surfaces
+/// and textures). Disabled for every other driver interface.
+bool NVPTXSubtarget::hasImageHandles() const {
+  const bool IsCUDA = TM.getDrvInterface() == NVPTX::CUDA;
+  return IsCUDA && SmVersion >= 30;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
new file mode 100644
index 000000000000..da020a94bcdd
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -0,0 +1,116 @@
+//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
+
+#include "NVPTX.h"
+#include "NVPTXFrameLowering.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "NVPTXGenSubtargetInfo.inc"
+
+namespace llvm {
+
+/// NVPTX subtarget: holds the selected target name, the PTX and SM version
+/// numbers, and the per-subtarget instruction, lowering and frame-lowering
+/// objects. Most has*() queries below are simple thresholds on SmVersion.
+class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
+ virtual void anchor();
+ std::string TargetName;
+
+ // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned PTXVersion;
+
+ // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned int SmVersion;
+
+ // Members are initialized in declaration order; the constructor builds
+ // TLInfo from the result of initializeSubtargetDependencies(), so the
+ // fields above are set before that call runs.
+ const NVPTXTargetMachine &TM;
+ NVPTXInstrInfo InstrInfo;
+ NVPTXTargetLowering TLInfo;
+ SelectionDAGTargetInfo TSInfo;
+
+ // NVPTX does not have any call stack frame, but needs an NVPTX-specific
+ // FrameLowering class because TargetFrameLowering is abstract.
+ NVPTXFrameLowering FrameLowering;
+
+protected:
+ // Processor supports scoped atomic operations.
+ bool HasAtomScope;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified module.
+ ///
+ NVPTXSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const NVPTXTargetMachine &TM);
+
+ const TargetFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ // The register info object is owned by InstrInfo.
+ const NVPTXRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const NVPTXTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ // Feature queries. Except for hasAtomScope() and hasImageHandles(), each
+ // one compares SmVersion against a fixed threshold (encoded as 10*x+y).
+ bool hasBrkPt() const { return SmVersion >= 11; }
+ bool hasAtomRedG32() const { return SmVersion >= 11; }
+ bool hasAtomRedS32() const { return SmVersion >= 12; }
+ bool hasAtomRedG64() const { return SmVersion >= 12; }
+ bool hasAtomRedS64() const { return SmVersion >= 20; }
+ bool hasAtomRedGen32() const { return SmVersion >= 20; }
+ bool hasAtomRedGen64() const { return SmVersion >= 20; }
+ bool hasAtomAddF32() const { return SmVersion >= 20; }
+ bool hasAtomAddF64() const { return SmVersion >= 60; }
+ bool hasAtomScope() const { return HasAtomScope; }
+ bool hasAtomBitwise64() const { return SmVersion >= 32; }
+ bool hasAtomMinMax64() const { return SmVersion >= 32; }
+ bool hasVote() const { return SmVersion >= 12; }
+ bool hasDouble() const { return SmVersion >= 13; }
+ bool reqPTX20() const { return SmVersion >= 20; }
+ bool hasF32FTZ() const { return SmVersion >= 20; }
+ bool hasFMAF32() const { return SmVersion >= 20; }
+ bool hasFMAF64() const { return SmVersion >= 13; }
+ bool hasLDG() const { return SmVersion >= 32; }
+ // Only true for SM versions in the range [20, 30).
+ bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); }
+ bool hasGenericLdSt() const { return SmVersion >= 20; }
+ inline bool hasHWROT32() const { return SmVersion >= 32; }
+ // Only true for SM versions in the range [20, 32).
+ inline bool hasSWROT32() const {
+ return ((SmVersion >= 20) && (SmVersion < 32));
+ }
+ inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
+ inline bool hasROT64() const { return SmVersion >= 20; }
+ bool hasImageHandles() const;
+
+ unsigned int getSmVersion() const { return SmVersion; }
+ std::string getTargetName() const { return TargetName; }
+
+ unsigned getPTXVersion() const { return PTXVersion; }
+
+ NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
new file mode 100644
index 000000000000..6c68a2c9370d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -0,0 +1,371 @@
+//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetMachine.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXAllocaHoisting.h"
+#include "NVPTXLowerAggrCopies.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXTargetTransformInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+
+// LSV is still relatively new; this switch lets us turn it off in case we
+// encounter (or suspect) a bug. The flag defaults to false (LSV stays enabled), is marked cl::Hidden, and is checked in NVPTXPassConfig::addIRPasses().
+static cl::opt<bool>
+ DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
+ cl::desc("Disable load/store vectorizer"),
+ cl::init(false), cl::Hidden);
+
+namespace llvm { // Forward declarations of the pass initializers that LLVMInitializeNVPTXTarget() calls; their definitions are not in this file.
+void initializeNVVMIntrRangePass(PassRegistry&);
+void initializeNVVMReflectPass(PassRegistry&);
+void initializeGenericToNVVMPass(PassRegistry&);
+void initializeNVPTXAllocaHoistingPass(PassRegistry &);
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
+void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
+void initializeNVPTXLowerArgsPass(PassRegistry &);
+void initializeNVPTXLowerAllocaPass(PassRegistry &);
+} // end namespace llvm
+
+extern "C" void LLVMInitializeNVPTXTarget() { // C-linkage entry point: registers both NVPTX target machines, then initializes the NVPTX passes.
+ // Register the target.
+ RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32()); // 32-bit target machine.
+ RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64()); // 64-bit target machine.
+
+ // FIXME: This pass is really intended to be invoked during IR optimization,
+ // but it's very NVPTX-specific. NOTE(review): the FIXME does not say which pass it means; presumably NVVMReflect, the first one initialized -- confirm.
+ PassRegistry &PR = *PassRegistry::getPassRegistry(); // All initializers below register into this one registry.
+ initializeNVVMReflectPass(PR);
+ initializeNVVMIntrRangePass(PR);
+ initializeGenericToNVVMPass(PR);
+ initializeNVPTXAllocaHoistingPass(PR);
+ initializeNVPTXAssignValidGlobalNamesPass(PR);
+ initializeNVPTXInferAddressSpacesPass(PR);
+ initializeNVPTXLowerArgsPass(PR);
+ initializeNVPTXLowerAllocaPass(PR);
+ initializeNVPTXLowerAggrCopiesPass(PR);
+}
+
+// Returns the data layout string for the NVPTX target. The string is always
+// "e" followed by "-i64:64-v16:16-v32:32-n16:32:64"; the 32-bit target
+// additionally inserts an explicit "-p:32:32" pointer entry between the two.
+static std::string computeDataLayout(bool is64Bit) {
+ // Only the 32-bit variant spells out a pointer specification.
+ const char *PointerSpec = is64Bit ? "" : "-p:32:32";
+ return std::string("e") + PointerSpec + "-i64:64-v16:16-v32:32-n16:32:64";
+}
+
+// Common constructor behind the 32- and 64-bit target machines. `is64bit`
+// selects the data layout string and is stored for is64Bit().
+NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool is64bit)
+ // RM is intentionally unused: PIC is the only relocation model currently
+ // supported, so it is passed to the base class no matter what the client
+ // requested.
+ : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
+ Reloc::PIC_, CM, OL),
+ is64bit(is64bit), TLOF(make_unique<NVPTXTargetObjectFile>()),
+ Subtarget(TT, CPU, FS, *this) {
+ // The driver interface follows the triple's OS: NVCL when the OS is
+ // Triple::NVCL, CUDA for everything else.
+ const bool IsNVCL = TT.getOS() == Triple::NVCL;
+ drvInterface = IsNVCL ? NVPTX::NVCL : NVPTX::CUDA;
+ initAsmInfo();
+}
+
+// Out-of-line destructor; the members clean up after themselves.
+NVPTXTargetMachine::~NVPTXTargetMachine() = default;
+
+void NVPTXTargetMachine32::anchor() {} // Empty out-of-line definition of the virtual anchor(); NOTE(review): presumably the usual LLVM vtable anchor -- confirm.
+
+NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT, // 32-bit variant: forwards every argument unchanged to the base constructor.
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} // The trailing `false` is the base constructor's is64bit flag.
+
+void NVPTXTargetMachine64::anchor() {} // Empty out-of-line definition of the virtual anchor(); NOTE(review): presumably the usual LLVM vtable anchor -- confirm.
+
+NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, // 64-bit variant: forwards every argument unchanged to the base constructor.
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} // The trailing `true` is the base constructor's is64bit flag.
+
+namespace {
+class NVPTXPassConfig : public TargetPassConfig { // NVPTX-specific pass pipeline configuration; instances are created by NVPTXTargetMachine::createPassConfig().
+public:
+ NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ NVPTXTargetMachine &getNVPTXTargetMachine() const { // Typed accessor: the target machine as an NVPTXTargetMachine reference.
+ return getTM<NVPTXTargetMachine>();
+ }
+
+ void addIRPasses() override; // The overrides below are all defined later in this file.
+ bool addInstSelector() override;
+ void addPostRegAlloc() override;
+ void addMachineSSAOptimization() override;
+
+ FunctionPass *createTargetRegisterAllocator(bool) override; // The definition returns nullptr: NVPTX runs no register allocator.
+ void addFastRegAlloc(FunctionPass *RegAllocPass) override; // Both *RegAlloc hooks assert that RegAllocPass is null.
+ void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+
+private:
+ // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
+ // function is only called in opt mode.
+ void addEarlyCSEOrGVNPass();
+
+ // Add passes that propagate special memory spaces.
+ void addAddressSpaceInferencePasses();
+
+ // Add passes that perform straight-line scalar optimizations.
+ void addStraightLineScalarOptimizationPasses();
+};
+} // end anonymous namespace
+
+TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { // Builds the NVPTX pass configuration for this target machine and PM.
+ return new NVPTXPassConfig(this, PM); // Heap-allocated and returned as a raw pointer; NOTE(review): the caller presumably takes ownership -- confirm.
+}
+
+void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) { // Adds the two NVVM passes to PM, NVVMReflect first.
+ PM.add(createNVVMReflectPass());
+ PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion())); // The intrinsic-range pass is parameterized by this machine's SM version.
+}
+
+TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { // Returns an analysis that builds an NVPTX-specific TargetTransformInfo per function.
+ return TargetIRAnalysis([this](const Function &F) { // The lambda captures `this`, so it must not be invoked after this target machine is destroyed.
+ return TargetTransformInfo(NVPTXTTIImpl(this, F));
+ });
+}
+
+// Adds one redundancy-elimination pass: GVN when the codegen opt level is
+// Aggressive, EarlyCSE for every other level.
+void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
+ const bool UseGVN = getOptLevel() == CodeGenOpt::Aggressive;
+ addPass(UseGVN ? createGVNPass() : createEarlyCSEPass());
+}
+
+void NVPTXPassConfig::addAddressSpaceInferencePasses() { // Adds SROA, NVPTXLowerAlloca and NVPTXInferAddressSpaces, in that order.
+ // NVPTXLowerArgs emits alloca for byval parameters which can often
+ // be eliminated by SROA.
+ addPass(createSROAPass());
+ addPass(createNVPTXLowerAllocaPass());
+ addPass(createNVPTXInferAddressSpacesPass());
+}
+
+void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() { // Adds the straight-line scalar optimization sequence; the comments below explain the ordering.
+ addPass(createSeparateConstOffsetFromGEPPass());
+ addPass(createSpeculativeExecutionPass());
+ // ReassociateGEPs exposes more opportunities for SLSR. See
+ // the example in reassociate-geps-and-slsr.ll.
+ addPass(createStraightLineStrengthReducePass());
+ // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
+ // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
+ // for some of our benchmarks.
+ addEarlyCSEOrGVNPass();
+ // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+ addPass(createNaryReassociatePass());
+ // NaryReassociate on GEPs creates redundant common expressions, so run
+ // EarlyCSE after it.
+ addPass(createEarlyCSEPass());
+}
+
+void NVPTXPassConfig::addIRPasses() { // Disables unsupported generic passes, then adds the NVPTX IR passes around the generic IR pipeline.
+ // The following passes are known to not play well with virtual regs hanging
+ // around after register allocation (which in our case, is *all* registers).
+ // We explicitly disable them here. We do, however, need some functionality
+ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+ // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+ disablePass(&PrologEpilogCodeInserterID);
+ disablePass(&MachineCopyPropagationID);
+ disablePass(&TailDuplicateID);
+ disablePass(&StackMapLivenessID);
+ disablePass(&LiveDebugValuesID);
+ disablePass(&PostRASchedulerID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&PatchableFunctionID);
+
+ // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
+ // it here does nothing. But since we need it for correctness when lowering
+ // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
+ // call addEarlyAsPossiblePasses.
+ addPass(createNVVMReflectPass());
+
+ if (getOptLevel() != CodeGenOpt::None) // The image optimizer is skipped when the opt level is None.
+ addPass(createNVPTXImageOptimizerPass());
+ addPass(createNVPTXAssignValidGlobalNamesPass()); // These two passes are added at every opt level.
+ addPass(createGenericToNVVMPass());
+
+ // NVPTXLowerArgs is required for correctness and should be run right
+ // before the address space inference passes.
+ addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
+ if (getOptLevel() != CodeGenOpt::None) { // Optimization-only passes.
+ addAddressSpaceInferencePasses();
+ if (!DisableLoadStoreVectorizer) // Controlled by -disable-nvptx-load-store-vectorizer.
+ addPass(createLoadStoreVectorizerPass());
+ addStraightLineScalarOptimizationPasses();
+ }
+
+ // === LSR and other generic IR passes ===
+ TargetPassConfig::addIRPasses();
+ // EarlyCSE is not always strong enough to clean up what LSR produces. For
+ // example, GVN can combine
+ //
+ // %0 = add %a, %b
+ // %1 = add %b, %a
+ //
+ // and
+ //
+ // %0 = shl nsw %a, 2
+ // %1 = shl %a, 2
+ //
+ // but EarlyCSE can do neither of them.
+ if (getOptLevel() != CodeGenOpt::None)
+ addEarlyCSEOrGVNPass(); // Adds GVN only at the Aggressive level, EarlyCSE otherwise.
+}
+
+// Adds the passes that run immediately before instruction selection, the
+// NVPTX DAG instruction selector itself, and, when the subtarget reports no
+// image-handle support, the pass that replaces image handles. Always returns
+// false.
+bool NVPTXPassConfig::addInstSelector() {
+ NVPTXTargetMachine &NVTM = getNVPTXTargetMachine();
+ const NVPTXSubtarget *Subtarget = NVTM.getSubtargetImpl();
+
+ addPass(createLowerAggrCopies());
+ addPass(createAllocaHoisting());
+ addPass(createNVPTXISelDag(NVTM, getOptLevel()));
+
+ const bool NeedsHandleReplacement = !Subtarget->hasImageHandles();
+ if (NeedsHandleReplacement)
+ addPass(createNVPTXReplaceImageHandlesPass());
+
+ return false;
+}
+
+void NVPTXPassConfig::addPostRegAlloc() { // Adds the NVPTX prolog/epilog pass, and the NVPTX peephole pass when optimizing.
+ addPass(createNVPTXPrologEpilogPass(), false); // NOTE(review): the second argument is presumably addPass's verifyAfter flag -- confirm.
+ if (getOptLevel() != CodeGenOpt::None) {
+ // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
+ // indices with the VRFrame register. NVPTXPeephole needs to run after that
+ // and will replace VRFrame with VRFrameLocal when possible.
+ addPass(createNVPTXPeephole());
+ }
+}
+
+FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) { // The bool parameter is ignored.
+ return nullptr; // No reg alloc
+}
+
+void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { // Fast path: no allocator, only PHI elimination and two-address lowering.
+ assert(!RegAllocPass && "NVPTX uses no regalloc!"); // RegAllocPass must be null, matching createTargetRegisterAllocator().
+ addPass(&PHIEliminationID);
+ addPass(&TwoAddressInstructionPassID);
+}
+
+void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { // Optimized path: adds the passes listed below but never an allocator.
+ assert(!RegAllocPass && "NVPTX uses no regalloc!"); // RegAllocPass must be null, matching createTargetRegisterAllocator().
+
+ addPass(&ProcessImplicitDefsID);
+ addPass(&LiveVariablesID);
+ addPass(&MachineLoopInfoID);
+ addPass(&PHIEliminationID);
+
+ addPass(&TwoAddressInstructionPassID);
+ addPass(&RegisterCoalescerID);
+
+ // PreRA instruction scheduling.
+ if (addPass(&MachineSchedulerID)) // Print/verify only when addPass reports that the scheduler was added.
+ printAndVerify("After Machine Scheduling");
+
+
+ addPass(&StackSlotColoringID);
+
+ // FIXME: Needs physical registers
+ //addPass(&PostRAMachineLICMID);
+
+ printAndVerify("After StackSlotColoring");
+}
+
+void NVPTXPassConfig::addMachineSSAOptimization() { // Adds the machine-SSA optimization passes in the order below, with print/verify checkpoints between stages.
+ // Pre-ra tail duplication.
+ if (addPass(&EarlyTailDuplicateID)) // Print/verify only when addPass reports that the pass was added.
+ printAndVerify("After Pre-RegAlloc TailDuplicate");
+
+ // Optimize PHIs before DCE: removing dead PHI cycles may make more
+ // instructions dead.
+ addPass(&OptimizePHIsID);
+
+ // This pass merges large allocas. StackSlotColoring is a different pass
+ // which merges spill slots.
+ addPass(&StackColoringID);
+
+ // If the target requests it, assign local variables to stack slots relative
+ // to one another and simplify frame index references where possible.
+ addPass(&LocalStackSlotAllocationID);
+
+ // With optimization, dead code should already be eliminated. However
+ // there is one known exception: lowered code for arguments that are only
+ // used by tail calls, where the tail calls reuse the incoming stack
+ // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+ addPass(&DeadMachineInstructionElimID);
+ printAndVerify("After codegen DCE pass");
+
+ // Allow targets to insert passes that improve instruction level parallelism,
+ // like if-conversion. Such passes will typically need dominator trees and
+ // loop info, just like LICM and CSE below.
+ if (addILPOpts()) // NVPTXPassConfig does not override addILPOpts(), so this calls the inherited version.
+ printAndVerify("After ILP optimizations");
+
+ addPass(&MachineLICMID);
+ addPass(&MachineCSEID);
+
+ addPass(&MachineSinkingID);
+ printAndVerify("After Machine LICM, CSE and Sinking passes");
+
+ addPass(&PeepholeOptimizerID);
+ printAndVerify("After codegen peephole optimization pass");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
new file mode 100644
index 000000000000..78a053831772
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -0,0 +1,89 @@
+//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
+
+#include "ManagedStringPool.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// NVPTXTargetMachine - Common base class of the 32-bit and 64-bit NVPTX target machines.
+/// It holds the subtarget, the object-file lowering, the driver interface kind and a managed string pool.
+class NVPTXTargetMachine : public LLVMTargetMachine {
+ bool is64bit; // Set from the constructor's is64bit argument.
+ std::unique_ptr<TargetLoweringObjectFile> TLOF; // Owned; exposed through getObjFileLowering().
+ NVPTX::DrvInterface drvInterface; // Driver interface kind; exposed through getDrvInterface().
+ NVPTXSubtarget Subtarget; // The single subtarget returned for every function.
+
+ // Hold Strings that can be free'd all together with NVPTXTargetMachine
+ ManagedStringPool ManagedStrPool;
+
+public:
+ NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OP, bool is64bit);
+
+ ~NVPTXTargetMachine() override;
+ const NVPTXSubtarget *getSubtargetImpl(const Function &) const override { // The Function argument is ignored.
+ return &Subtarget;
+ }
+ const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; } // Function-independent overload returning the same subtarget.
+ bool is64Bit() const { return is64bit; }
+ NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
+ ManagedStringPool *getManagedStrPool() const { // Returns a mutable pool pointer even from a const target machine.
+ return const_cast<ManagedStringPool *>(&ManagedStrPool);
+ }
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ // Emission of machine code through MCJIT is not supported.
+ bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_pwrite_stream &,
+ bool = true) override {
+ return true; // Unconditional; NOTE(review): true presumably signals failure per the base-class contract -- confirm.
+ }
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get(); // Non-owning pointer; TLOF keeps ownership.
+ }
+
+ void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+}; // NVPTXTargetMachine.
+
+class NVPTXTargetMachine32 : public NVPTXTargetMachine { // 32-bit NVPTX target machine.
+ virtual void anchor(); // Defined out of line in the .cpp file.
+public:
+ NVPTXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU, // Same parameters as the base constructor minus the trailing is64bit flag.
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+class NVPTXTargetMachine64 : public NVPTXTargetMachine { // 64-bit NVPTX target machine.
+ virtual void anchor(); // Defined out of line in the .cpp file.
+public:
+ NVPTXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU, // Same parameters as the base constructor minus the trailing is64bit flag.
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
new file mode 100644
index 000000000000..dc367a90594a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -0,0 +1,105 @@
+//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
+
+#include "NVPTXSection.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+class GlobalVariable;
+class Module;
+
+class NVPTXTargetObjectFile : public TargetLoweringObjectFile { // Object-file lowering for NVPTX: every section is a placeholder NVPTXSection.
+
+public:
+ NVPTXTargetObjectFile() { // Starts with every section pointer null; Initialize() allocates them.
+ TextSection = nullptr;
+ DataSection = nullptr;
+ BSSSection = nullptr;
+ ReadOnlySection = nullptr;
+
+ StaticCtorSection = nullptr;
+ StaticDtorSection = nullptr;
+ LSDASection = nullptr;
+ EHFrameSection = nullptr;
+ DwarfAbbrevSection = nullptr;
+ DwarfInfoSection = nullptr;
+ DwarfLineSection = nullptr;
+ DwarfFrameSection = nullptr;
+ DwarfPubTypesSection = nullptr;
+ DwarfDebugInlineSection = nullptr;
+ DwarfStrSection = nullptr;
+ DwarfLocSection = nullptr;
+ DwarfARangesSection = nullptr;
+ DwarfRangesSection = nullptr;
+ DwarfMacinfoSection = nullptr;
+ }
+
+ virtual ~NVPTXTargetObjectFile(); // Defined out of line; NOTE(review): presumably deletes the sections allocated in Initialize() -- confirm.
+
+ void Initialize(MCContext &ctx, const TargetMachine &TM) override { // Runs the base initialization, then allocates a fresh NVPTXSection for each section pointer.
+ TargetLoweringObjectFile::Initialize(ctx, TM);
+ TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText()); // Raw `new`: earlier pointer values are overwritten without being freed here.
+ DataSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getData());
+ BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS());
+ ReadOnlySection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly());
+
+ StaticCtorSection = // All remaining sections use the Metadata section kind.
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ StaticDtorSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ LSDASection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ EHFrameSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfAbbrevSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfInfoSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfLineSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfFrameSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfPubTypesSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfDebugInlineSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfStrSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfLocSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfARangesSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfRangesSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfMacinfoSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ }
+
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, // Ignores all arguments and leaves Align unmodified.
+ const Constant *C,
+ unsigned &Align) const override {
+ return ReadOnlySection;
+ }
+
+ MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind, // Ignores all arguments.
+ const TargetMachine &TM) const override {
+ return DataSection;
+ }
+
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, // Declared only; defined out of line.
+ const TargetMachine &TM) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
new file mode 100644
index 000000000000..48928ee2d540
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -0,0 +1,154 @@
+//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetTransformInfo.h"
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "NVPTXtti"
+
+// Returns true iff II is one of the three intrinsics that read the tid.x,
+// tid.y or tid.z special register (threadIdx.x/y/z).
+static bool readsThreadIndex(const IntrinsicInst *II) {
+ const auto IID = II->getIntrinsicID();
+ return IID == Intrinsic::nvvm_read_ptx_sreg_tid_x ||
+ IID == Intrinsic::nvvm_read_ptx_sreg_tid_y ||
+ IID == Intrinsic::nvvm_read_ptx_sreg_tid_z;
+}
+
+static bool readsLaneId(const IntrinsicInst *II) { // Whether the given intrinsic is the laneid special-register read.
+ return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
+}
+
+// Whether the given intrinsic is an atomic instruction in PTX. Returns true exactly for the intrinsic IDs listed in the switch below.
+static bool isNVVMAtomic(const IntrinsicInst *II) {
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::nvvm_atomic_load_add_f32: // First group: the three nvvm_atomic_load_* intrinsics.
+ case Intrinsic::nvvm_atomic_load_inc_32:
+ case Intrinsic::nvvm_atomic_load_dec_32:
+
+ case Intrinsic::nvvm_atomic_add_gen_f_cta: // Second group: nvvm_atomic_<op>_gen_* intrinsics, each in a _cta and a _sys variant.
+ case Intrinsic::nvvm_atomic_add_gen_f_sys:
+ case Intrinsic::nvvm_atomic_add_gen_i_cta:
+ case Intrinsic::nvvm_atomic_add_gen_i_sys:
+ case Intrinsic::nvvm_atomic_and_gen_i_cta:
+ case Intrinsic::nvvm_atomic_and_gen_i_sys:
+ case Intrinsic::nvvm_atomic_cas_gen_i_cta:
+ case Intrinsic::nvvm_atomic_cas_gen_i_sys:
+ case Intrinsic::nvvm_atomic_dec_gen_i_cta:
+ case Intrinsic::nvvm_atomic_dec_gen_i_sys:
+ case Intrinsic::nvvm_atomic_inc_gen_i_cta:
+ case Intrinsic::nvvm_atomic_inc_gen_i_sys:
+ case Intrinsic::nvvm_atomic_max_gen_i_cta:
+ case Intrinsic::nvvm_atomic_max_gen_i_sys:
+ case Intrinsic::nvvm_atomic_min_gen_i_cta:
+ case Intrinsic::nvvm_atomic_min_gen_i_sys:
+ case Intrinsic::nvvm_atomic_or_gen_i_cta:
+ case Intrinsic::nvvm_atomic_or_gen_i_sys:
+ case Intrinsic::nvvm_atomic_exch_gen_i_cta:
+ case Intrinsic::nvvm_atomic_exch_gen_i_sys:
+ case Intrinsic::nvvm_atomic_xor_gen_i_cta:
+ case Intrinsic::nvvm_atomic_xor_gen_i_sys:
+ return true;
+ }
+}
+
+// Returns true when V must be treated as a source of divergence. Values that
+// are neither function arguments nor instructions are never divergent
+// sources.
+bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
+ // There is no inter-procedural analysis, so arguments of anything other
+ // than a kernel function are conservatively divergent.
+ if (const Argument *Arg = dyn_cast<Argument>(V))
+ return !isKernelFunction(*Arg->getParent());
+
+ const Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ return false;
+
+ // There is no pointer analysis either: a load is divergent exactly when it
+ // reads the generic or the local address space. Loads never reach the
+ // checks further down.
+ if (const LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
+ const unsigned AddrSpace = Load->getPointerAddressSpace();
+ return AddrSpace == ADDRESS_SPACE_GENERIC ||
+ AddrSpace == ADDRESS_SPACE_LOCAL;
+ }
+
+ // Atomic instructions execute sequentially across the threads of a warp, so
+ // a thread that runs earlier may observe different memory than one that
+ // runs later. With *a == 0 initially,
+ //
+ // atom.global.add.s32 d, [a], 1
+ //
+ // yields 0 in the first thread to enter the critical region and 1 in the
+ // second.
+ if (Inst->isAtomic())
+ return true;
+
+ // Intrinsics: thread-index and lane-id reads are divergent by definition,
+ // and so are the NVPTX atomic intrinsics that have no atomic IR
+ // instruction form.
+ if (const IntrinsicInst *Intrin = dyn_cast<IntrinsicInst>(Inst))
+ if (readsThreadIndex(Intrin) || readsLaneId(Intrin) || isNVVMAtomic(Intrin))
+ return true;
+
+ // Every remaining call is conservatively divergent; callees with bodies
+ // could be analyzed more precisely with inter-procedural analysis.
+ return isa<CallInst>(Inst);
+}
+
+// Cost of an arithmetic instruction. ADD, MUL, XOR, OR and AND whose type
+// legalizes to i64 cost twice the legalization cost; every other case is
+// answered by the basic TTI implementation.
+int NVPTXTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) {
+ // Legalization cost and legalized type of Ty.
+ std::pair<int, MVT> Legalized = TLI->getTypeLegalizationCost(DL, Ty);
+
+ switch (TLI->InstructionOpcodeToISD(Opcode)) {
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::XOR:
+ case ISD::OR:
+ case ISD::AND:
+ // The machine code (SASS) simulates an i64 with two i32, so these
+ // operations are estimated at twice the cost of ones that fit into a
+ // single machine register.
+ if (Legalized.second.SimpleTy == MVT::i64)
+ return 2 * Legalized.first;
+ break;
+ default:
+ break;
+ }
+
+ // Everything else is delegated to the basic TTI.
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+}
+
+// Starts from the base-class unrolling preferences, then turns on partial and
+// runtime unrolling with a partial threshold of one quarter of the full
+// threshold.
+void NVPTXTTIImpl::getUnrollingPreferences(Loop *L,
+ TTI::UnrollingPreferences &UP) {
+ BaseT::getUnrollingPreferences(L, UP);
+
+ // Small loops are often unrolled by the PTX-to-SASS compiler anyway, and
+ // unrolling them earlier can be beneficial; the reduced threshold keeps the
+ // partial unrolling limited to such loops.
+ UP.Runtime = true;
+ UP.Partial = true;
+ UP.PartialThreshold = UP.Threshold / 4;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
new file mode 100644
index 000000000000..d953aa8a7199
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -0,0 +1,64 @@
+//===-- NVPTXTargetTransformInfo.h - NVPTX specific TTI ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides a TargetTransformInfo::Concept conforming object specific to the
+/// NVPTX target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H
+
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+/// NVPTX-specific TargetTransformInfo implementation. It derives from
+/// BasicTTIImplBase instantiated on itself and overrides only the queries
+/// declared below.
+class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
+ typedef BasicTTIImplBase<NVPTXTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT; // Grants the base template access to the private members below.
+
+ const NVPTXSubtarget *ST; // Subtarget obtained from the target machine.
+ const NVPTXTargetLowering *TLI; // Target lowering obtained from ST.
+
+ // No stray ';' after these inline bodies (avoids the extra-semi warning).
+ const NVPTXSubtarget *getST() const { return ST; }
+ const NVPTXTargetLowering *getTLI() const { return TLI; }
+
+public:
+ /// Builds the TTI for function \p F from \p TM's subtarget, using the data
+ /// layout of the module that contains \p F.
+ explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
+ TLI(ST->getTargetLowering()) {}
+
+ /// Always true for this target.
+ bool hasBranchDivergence() { return true; }
+
+ /// Whether \p V must be treated as a source of divergence; defined in the
+ /// .cpp file.
+ bool isSourceOfDivergence(const Value *V);
+
+ // Increase the inlining cost threshold by a factor of 5, reflecting that
+ // calls are particularly expensive in NVPTX.
+ unsigned getInliningThresholdMultiplier() { return 5; }
+
+ /// Cost of an arithmetic instruction with opcode \p Opcode on type \p Ty;
+ /// defined in the .cpp file.
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+ /// Fills \p UP with the unrolling preferences for loop \p L; defined in the
+ /// .cpp file.
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
new file mode 100644
index 000000000000..e464f474b1d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -0,0 +1,317 @@
+//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MutexGuard.h"
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+namespace {
+// property name -> values; global -> its properties; module -> its globals.
+using key_val_pair_t = std::map<std::string, std::vector<unsigned>>;
+using global_val_annot_t = std::map<const GlobalValue *, key_val_pair_t>;
+using per_module_annot_t = std::map<const Module *, global_val_annot_t>;
+} // end anonymous namespace
+static ManagedStatic<per_module_annot_t> annotationCache;
+static sys::Mutex Lock; // held around every annotationCache access below
+
+void clearAnnotationCache(const Module *Mod) {
+  MutexGuard CacheLock(Lock); // serialize with the lookup/caching helpers
+  (*annotationCache).erase(Mod); // drop everything cached for this module
+}
+
+// Collects the (property, value) pairs of one annotation node into \p retval.
+// Operand 0 of \p md is the annotated global; the remaining operands alternate
+// between an MDString property name and a ConstantInt value. Values of a
+// property that occurs several times are appended in operand order.
+// NOTE(review): the callers in this file already hold Lock, so re-acquiring it
+// here presumably relies on sys::Mutex being recursive -- confirm.
+static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+  MutexGuard Guard(Lock);
+  assert(md && "Invalid mdnode for annotation");
+  assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+  // start index = 1, to skip the global variable key
+  // increment = 2, to step over the value of each property-value pair
+  for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+    // property
+    const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+    assert(prop && "Annotation property not a string");
+
+    // value
+    ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(md->getOperand(i + 1));
+    assert(Val && "Value operand not a constant int");
+
+    // operator[] creates the vector on first use, so a single lookup suffices.
+    retval[prop->getString().str()].push_back(Val->getZExtValue());
+  }
+}
+
+// Scans the module's "nvvm.annotations" named metadata for nodes whose first
+// operand is \p gv, merges their property/value pairs and, if any were found,
+// stores them in annotationCache[m][gv]. Nothing is stored when the module has
+// no such metadata or when no pair was collected for \p gv.
+static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+  MutexGuard Guard(Lock);
+  NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations");
+  if (!NMD)
+    return;
+  key_val_pair_t tmp;
+  for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+    const MDNode *elem = NMD->getOperand(i);
+
+    GlobalValue *entity =
+        mdconst::dyn_extract_or_null<GlobalValue>(elem->getOperand(0));
+    // entity may be null due to DCE
+    if (!entity)
+      continue;
+    if (entity != gv)
+      continue;
+
+    // accumulate annotations for entity in tmp
+    cacheAnnotationFromMD(elem, tmp);
+  }
+
+  if (tmp.empty()) // no annotations for this gv
+    return;
+
+  // operator[] creates the per-module map on demand, so the module does not
+  // have to be looked up separately before inserting.
+  (*annotationCache)[m][gv] = std::move(tmp);
+}
+
+bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
+                           unsigned &retval) {
+  // Returns true and sets retval to the first value of prop; false if absent.
+  MutexGuard Guard(Lock);
+  const Module *m = gv->getParent();
+  if (!annotationCache->count(m) || !(*annotationCache)[m].count(gv))
+    cacheAnnotationFromMD(m, gv); // gv not cached yet: parse the metadata
+  key_val_pair_t &Annots = (*annotationCache)[m][gv]; // empty if unannotated
+  auto PropIt = Annots.find(prop);
+  if (PropIt != Annots.end())
+    retval = PropIt->second[0];
+  return PropIt != Annots.end();
+}
+
+bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
+                           std::vector<unsigned> &retval) {
+  // Returns true and copies every value of prop to retval; false if absent.
+  MutexGuard Guard(Lock);
+  const Module *m = gv->getParent();
+  if (!annotationCache->count(m) || !(*annotationCache)[m].count(gv))
+    cacheAnnotationFromMD(m, gv); // gv not cached yet: parse the metadata
+  key_val_pair_t &Annots = (*annotationCache)[m][gv]; // empty if unannotated
+  auto PropIt = Annots.find(prop);
+  if (PropIt != Annots.end())
+    retval = PropIt->second;
+  return PropIt != Annots.end();
+}
+
+bool isTexture(const Value &val) {
+  // Only a global carrying a "texture" annotation is a texture symbol.
+  const GlobalValue *Global = dyn_cast<GlobalValue>(&val);
+  unsigned Flag;
+  if (!Global || !findOneNVVMAnnotation(Global, "texture", Flag))
+    return false;
+  // The annotation value is expected to be exactly 1.
+  assert((Flag == 1) && "Unexpected annotation on a texture symbol");
+  return true;
+}
+
+bool isSurface(const Value &val) {
+  // Only a global carrying a "surface" annotation is a surface symbol.
+  const GlobalValue *Global = dyn_cast<GlobalValue>(&val);
+  unsigned Flag;
+  if (!Global || !findOneNVVMAnnotation(Global, "surface", Flag))
+    return false;
+  // The annotation value is expected to be exactly 1.
+  assert((Flag == 1) && "Unexpected annotation on a surface symbol");
+  return true;
+}
+
+bool isSampler(const Value &val) {
+  // A sampler is either a global carrying a "sampler" annotation or a function
+  // argument whose index appears in the function's "sampler" annotation list.
+  const char *Key = "sampler";
+
+  const GlobalValue *Global = dyn_cast<GlobalValue>(&val);
+  unsigned Flag;
+  if (Global && findOneNVVMAnnotation(Global, Key, Flag)) {
+    assert((Flag == 1) && "Unexpected annotation on a sampler symbol");
+    return true;
+  }
+
+  const Argument *Param = dyn_cast<Argument>(&val);
+  if (!Param)
+    return false;
+
+  std::vector<unsigned> SamplerArgs;
+  return findAllNVVMAnnotation(Param->getParent(), Key, SamplerArgs) &&
+         is_contained(SamplerArgs, Param->getArgNo());
+}
+
+bool isImageReadOnly(const Value &val) {
+  // "rdoimage" is looked up on the enclosing function and lists argument
+  // indices, so only function arguments can qualify.
+  const Argument *Param = dyn_cast<Argument>(&val);
+  if (!Param)
+    return false;
+
+  std::vector<unsigned> Indices;
+  return findAllNVVMAnnotation(Param->getParent(), "rdoimage", Indices) &&
+         is_contained(Indices, Param->getArgNo());
+}
+
+bool isImageWriteOnly(const Value &val) {
+  // "wroimage" is looked up on the enclosing function and lists argument
+  // indices, so only function arguments can qualify.
+  const Argument *Param = dyn_cast<Argument>(&val);
+  if (!Param)
+    return false;
+
+  std::vector<unsigned> Indices;
+  return findAllNVVMAnnotation(Param->getParent(), "wroimage", Indices) &&
+         is_contained(Indices, Param->getArgNo());
+}
+
+bool isImageReadWrite(const Value &val) {
+  // "rdwrimage" is looked up on the enclosing function and lists argument
+  // indices, so only function arguments can qualify.
+  const Argument *Param = dyn_cast<Argument>(&val);
+  if (!Param)
+    return false;
+
+  std::vector<unsigned> Indices;
+  return findAllNVVMAnnotation(Param->getParent(), "rdwrimage", Indices) &&
+         is_contained(Indices, Param->getArgNo());
+}
+
+bool isImage(const Value &val) { // true for any of the three image kinds
+ return isImageReadOnly(val) || isImageWriteOnly(val) || isImageReadWrite(val);
+}
+
+bool isManaged(const Value &val) {
+  // Only a global carrying a "managed" annotation is a managed symbol.
+  const GlobalValue *Global = dyn_cast<GlobalValue>(&val);
+  unsigned Flag;
+  if (!Global || !findOneNVVMAnnotation(Global, "managed", Flag))
+    return false;
+  // The annotation value is expected to be exactly 1.
+  assert((Flag == 1) && "Unexpected annotation on a managed symbol");
+  return true;
+}
+
+std::string getTextureName(const Value &val) {
+  assert(val.hasName() && "Found texture variable with no name");
+  return val.getName().str(); // explicit StringRef -> std::string copy
+}
+
+std::string getSurfaceName(const Value &val) {
+  assert(val.hasName() && "Found surface variable with no name");
+  return val.getName().str(); // explicit StringRef -> std::string copy
+}
+
+std::string getSamplerName(const Value &val) {
+  assert(val.hasName() && "Found sampler variable with no name");
+  return val.getName().str(); // explicit StringRef -> std::string copy
+}
+
+bool getMaxNTIDx(const Function &F, unsigned &x) { // "maxntidx" of F
+ return findOneNVVMAnnotation(&F, "maxntidx", x); // x written only if found
+}
+
+bool getMaxNTIDy(const Function &F, unsigned &y) { // "maxntidy" of F
+ return findOneNVVMAnnotation(&F, "maxntidy", y); // y written only if found
+}
+
+bool getMaxNTIDz(const Function &F, unsigned &z) { // "maxntidz" of F
+ return findOneNVVMAnnotation(&F, "maxntidz", z); // z written only if found
+}
+
+bool getReqNTIDx(const Function &F, unsigned &x) { // "reqntidx" of F
+ return findOneNVVMAnnotation(&F, "reqntidx", x); // x written only if found
+}
+
+bool getReqNTIDy(const Function &F, unsigned &y) { // "reqntidy" of F
+ return findOneNVVMAnnotation(&F, "reqntidy", y); // y written only if found
+}
+
+bool getReqNTIDz(const Function &F, unsigned &z) { // "reqntidz" of F
+ return findOneNVVMAnnotation(&F, "reqntidz", z); // z written only if found
+}
+
+bool getMinCTASm(const Function &F, unsigned &x) { // "minctasm" of F
+ return findOneNVVMAnnotation(&F, "minctasm", x); // x written only if found
+}
+
+bool getMaxNReg(const Function &F, unsigned &x) { // "maxnreg" of F
+ return findOneNVVMAnnotation(&F, "maxnreg", x); // x written only if found
+}
+
+bool isKernelFunction(const Function &F) {
+  // An explicit "kernel" annotation decides (it must equal 1); the calling
+  // convention is consulted only when no such annotation exists.
+  unsigned KernelFlag = 0;
+  if (findOneNVVMAnnotation(&F, "kernel", KernelFlag))
+    return KernelFlag == 1;
+
+  return F.getCallingConv() == CallingConv::PTX_Kernel;
+}
+
+bool getAlign(const Function &F, unsigned index, unsigned &align) {
+  // Each "align" value packs (index << 16) | alignment; report the alignment
+  // of the first entry whose upper part equals \p index.
+  std::vector<unsigned> Packed;
+  if (!findAllNVVMAnnotation(&F, "align", Packed))
+    return false;
+  for (unsigned Entry : Packed) {
+    if ((Entry >> 16) != index)
+      continue;
+    align = Entry & 0xFFFF;
+    return true;
+  }
+  return false;
+}
+
+bool getAlign(const CallInst &I, unsigned index, unsigned &align) {
+  // Each "callalign" operand packs (index << 16) | alignment.
+  MDNode *Node = I.getMetadata("callalign");
+  for (int i = 0, n = Node ? Node->getNumOperands() : 0; i < n; i++) {
+    const ConstantInt *CI =
+        mdconst::dyn_extract<ConstantInt>(Node->getOperand(i));
+    if (!CI)
+      continue;
+    unsigned v = CI->getZExtValue();
+    if ((v >> 16) > index) // presumably sorted by index, so stop -- confirm
+      return false;
+    if ((v >> 16) == index) {
+      align = v & 0xFFFF;
+      return true;
+    }
+  }
+  return false;
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
new file mode 100644
index 000000000000..a0cc4e78ac21
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -0,0 +1,65 @@
+//===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVVM specific utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include <cstdarg>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace llvm {
+// Drops every annotation cached for the given module.
+void clearAnnotationCache(const Module *);
+// Annotation lookup: both return false when the global lacks the property.
+bool findOneNVVMAnnotation(const GlobalValue *, const std::string &,
+ unsigned &);
+bool findAllNVVMAnnotation(const GlobalValue *, const std::string &,
+ std::vector<unsigned> &);
+// Classification of values by their NVVM annotations.
+bool isTexture(const Value &);
+bool isSurface(const Value &);
+bool isSampler(const Value &);
+bool isImage(const Value &);
+bool isImageReadOnly(const Value &);
+bool isImageWriteOnly(const Value &);
+bool isImageReadWrite(const Value &);
+bool isManaged(const Value &);
+// Symbol names; each asserts that the value is named.
+std::string getTextureName(const Value &);
+std::string getSurfaceName(const Value &);
+std::string getSamplerName(const Value &);
+// "maxntid{x,y,z}" function annotations; return false when absent.
+bool getMaxNTIDx(const Function &, unsigned &);
+bool getMaxNTIDy(const Function &, unsigned &);
+bool getMaxNTIDz(const Function &, unsigned &);
+// "reqntid{x,y,z}" function annotations; return false when absent.
+bool getReqNTIDx(const Function &, unsigned &);
+bool getReqNTIDy(const Function &, unsigned &);
+bool getReqNTIDz(const Function &, unsigned &);
+// "minctasm" / "maxnreg" function annotations, and kernel detection.
+bool getMinCTASm(const Function &, unsigned &);
+bool getMaxNReg(const Function &, unsigned &);
+bool isKernelFunction(const Function &);
+// Alignment stored for slot `index` ("align" annotation / "callalign" MD).
+bool getAlign(const Function &, unsigned index, unsigned &);
+bool getAlign(const CallInst &, unsigned index, unsigned &);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
new file mode 100644
index 000000000000..e69bbba9f193
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
@@ -0,0 +1,1479 @@
+//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Vector Specific
+//-----------------------------------
+
+//
+// All vector instructions derive from NVPTXVecInst
+//
+
+class NVPTXVecInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+ NVPTXInst sInst=NOP>
+ : NVPTXInst<outs, ins, asmstr, pattern> {
+ NVPTXInst scalarInst=sInst; // the instruction passed as sInst (NOP by default)
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in {
+// Extract v2i16
+def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+ (ins V2I16Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int16Regs:$dst, (extractelt
+ (v2i16 V2I16Regs:$src), imm:$c))],
+ IMOV16rr>;
+
+// Extract v4i16
+def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+ (ins V4I16Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int16Regs:$dst, (extractelt
+ (v4i16 V4I16Regs:$src), imm:$c))],
+ IMOV16rr>;
+
+// Extract v2i8
+def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+ (ins V2I8Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int8Regs:$dst, (extractelt
+ (v2i8 V2I8Regs:$src), imm:$c))],
+ IMOV8rr>;
+
+// Extract v4i8
+def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+ (ins V4I8Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int8Regs:$dst, (extractelt
+ (v4i8 V4I8Regs:$src), imm:$c))],
+ IMOV8rr>;
+
+// Extract v2i32
+def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+ (ins V2I32Regs:$src, i8imm:$c),
+ "mov.u32 \t$dst, $src${c:vecelem};",
+ [(set Int32Regs:$dst, (extractelt
+ (v2i32 V2I32Regs:$src), imm:$c))],
+ IMOV32rr>;
+
+// Extract v2f32
+def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+ (ins V2F32Regs:$src, i8imm:$c),
+ "mov.f32 \t$dst, $src${c:vecelem};",
+ [(set Float32Regs:$dst, (extractelt
+ (v2f32 V2F32Regs:$src), imm:$c))],
+ FMOV32rr>;
+
+// Extract v2i64
+def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst),
+ (ins V2I64Regs:$src, i8imm:$c),
+ "mov.u64 \t$dst, $src${c:vecelem};",
+ [(set Int64Regs:$dst, (extractelt
+ (v2i64 V2I64Regs:$src), imm:$c))],
+ IMOV64rr>;
+
+// Extract v2f64
+def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst),
+ (ins V2F64Regs:$src, i8imm:$c),
+ "mov.f64 \t$dst, $src${c:vecelem};",
+ [(set Float64Regs:$dst, (extractelt
+ (v2f64 V2F64Regs:$src), imm:$c))],
+ FMOV64rr>;
+
+// Extract v4i32
+def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+ (ins V4I32Regs:$src, i8imm:$c),
+ "mov.u32 \t$dst, $src${c:vecelem};",
+ [(set Int32Regs:$dst, (extractelt
+ (v4i32 V4I32Regs:$src), imm:$c))],
+ IMOV32rr>;
+
+// Extract v4f32
+def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+ (ins V4F32Regs:$src, i8imm:$c),
+ "mov.f32 \t$dst, $src${c:vecelem};",
+ [(set Float32Regs:$dst, (extractelt
+ (v4f32 V4F32Regs:$src), imm:$c))],
+ FMOV32rr>;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecInsert.Value in {
+// Insert v2i8
+def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst),
+ (ins V2I8Regs:$src, Int8Regs:$val, i8imm:$c),
+ "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V2I8Regs:$dst,
+ (insertelt V2I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>;
+
+// Insert v4i8
+def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst),
+ (ins V4I8Regs:$src, Int8Regs:$val, i8imm:$c),
+ "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V4I8Regs:$dst,
+ (insertelt V4I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>;
+
+// Insert v2i16
+def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst),
+ (ins V2I16Regs:$src, Int16Regs:$val, i8imm:$c),
+ "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V2I16Regs:$dst,
+ (insertelt V2I16Regs:$src, Int16Regs:$val, imm:$c))],
+ IMOV16rr>;
+
+// Insert v4i16
+def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst),
+ (ins V4I16Regs:$src, Int16Regs:$val, i8imm:$c),
+ "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V4I16Regs:$dst,
+ (insertelt V4I16Regs:$src, Int16Regs:$val, imm:$c))],
+ IMOV16rr>;
+
+// Insert v2i32
+def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst),
+ (ins V2I32Regs:$src, Int32Regs:$val, i8imm:$c),
+ "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+ [(set V2I32Regs:$dst,
+ (insertelt V2I32Regs:$src, Int32Regs:$val, imm:$c))],
+ IMOV32rr>;
+
+// Insert v2f32
+def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst),
+ (ins V2F32Regs:$src, Float32Regs:$val, i8imm:$c),
+ "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+ [(set V2F32Regs:$dst,
+ (insertelt V2F32Regs:$src, Float32Regs:$val, imm:$c))],
+ FMOV32rr>;
+
+// Insert v2i64
+def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst),
+ (ins V2I64Regs:$src, Int64Regs:$val, i8imm:$c),
+ "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u64 \t$dst${c:vecelem}, $val;",
+ [(set V2I64Regs:$dst,
+ (insertelt V2I64Regs:$src, Int64Regs:$val, imm:$c))],
+ IMOV64rr>;
+
+// Insert v2f64
+def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst),
+ (ins V2F64Regs:$src, Float64Regs:$val, i8imm:$c),
+ "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.f64 \t$dst${c:vecelem}, $val;",
+ [(set V2F64Regs:$dst,
+ (insertelt V2F64Regs:$src, Float64Regs:$val, imm:$c))],
+ FMOV64rr>;
+
+// Insert v4i32
+def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst),
+ (ins V4I32Regs:$src, Int32Regs:$val, i8imm:$c),
+ "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+ [(set V4I32Regs:$dst,
+ (insertelt V4I32Regs:$src, Int32Regs:$val, imm:$c))],
+ IMOV32rr>;
+
+// Insert v4f32
+def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst),
+ (ins V4F32Regs:$src, Float32Regs:$val, i8imm:$c),
+ "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+ [(set V4F32Regs:$dst,
+ (insertelt V4F32Regs:$src, Float32Regs:$val, imm:$c))],
+ FMOV32rr>;
+}
+
+class BinOpAsmString<string c> { // wrapper so asm text can be passed as a class
+ string s = c; // the asm text itself, read by users as asmstr.s
+}
+
+// Asm text for a 4-lane binary op: one `opcode` line per lane, reading
+// ${a}_N / ${b}_N and writing ${dst}_N for N = 0..3.
+class V4AsmStr<string opcode> : BinOpAsmString<
+  !strconcat(opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t",
+             opcode, " \t${dst}_1, ${a}_1, ${b}_1;\n\t",
+             opcode, " \t${dst}_2, ${a}_2, ${b}_2;\n\t",
+             opcode, " \t${dst}_3, ${a}_3, ${b}_3;")>;
+
+// Two-lane binary op text: one `opcode` line per lane (suffixes _0 and _1).
+class V2AsmStr<string opcode> : BinOpAsmString<
+  !strconcat(opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t",
+             opcode, " \t${dst}_1, ${a}_1, ${b}_1;")>;
+
+// Asm text for a 4-lane three-input op: one `opcode` line per lane, reading
+// ${a}_N, ${b}_N, ${c}_N and writing ${dst}_N for N = 0..3.
+class V4MADStr<string opcode> : BinOpAsmString<
+  !strconcat(opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t",
+             opcode, " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;\n\t",
+             opcode, " \t${dst}_2, ${a}_2, ${b}_2, ${c}_2;\n\t",
+             opcode, " \t${dst}_3, ${a}_3, ${b}_3, ${c}_3;")>;
+
+// Two-lane three-input op text: one `opcode` line per lane (_0 and _1).
+class V2MADStr<string opcode> : BinOpAsmString<
+  !strconcat(opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t",
+             opcode, " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;")>;
+
+// Asm text for a 4-lane unary op: one `opcode` line per lane, reading
+// ${a}_N and writing ${dst}_N for N = 0..3.
+class V4UnaryStr<string opcode> : BinOpAsmString<
+  !strconcat(opcode, " \t${dst}_0, ${a}_0;\n\t",
+             opcode, " \t${dst}_1, ${a}_1;\n\t",
+             opcode, " \t${dst}_2, ${a}_2;\n\t",
+             opcode, " \t${dst}_3, ${a}_3;")>;
+
+// Two-lane unary op text: one `opcode` line per lane (suffixes _0 and _1).
+class V2UnaryStr<string opcode> : BinOpAsmString<
+  !strconcat(opcode, " \t${dst}_0, ${a}_0;\n\t",
+             opcode, " \t${dst}_1, ${a}_1;")>;
+
+class VecBinaryOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass,
+ NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a, regclass:$b),
+ asmstr.s, // asm text; operands are named $dst, $a and $b
+ [(set regclass:$dst, (OpNode regclass:$a, regclass:$b))],
+ sInst>; // forwarded to NVPTXVecInst as its sInst argument
+
+class VecShiftOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass1,
+ NVPTXRegClass regclass2, NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs regclass1:$dst), (ins regclass1:$a, regclass2:$b),
+ asmstr.s, // $dst and $a share regclass1; $b uses regclass2
+ [(set regclass1:$dst, (OpNode regclass1:$a, regclass2:$b))],
+ sInst>; // forwarded to NVPTXVecInst as its sInst argument
+
+class VecUnaryOp<BinOpAsmString asmstr, PatFrag OpNode, NVPTXRegClass regclass,
+ NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a),
+ asmstr.s, // asm text; operands are named $dst and $a
+ [(set regclass:$dst, (OpNode regclass:$a))], sInst>;
+
+multiclass IntBinVOp<string asmstr, SDNode OpNode,
+ NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, NVPTXInst
+ i16op=NOP, NVPTXInst i8op=NOP> {
+ def V2I64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "64")>, OpNode, V2I64Regs,
+ i64op>; // asmstr gets the element width appended as a suffix
+ def V4I32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "32")>, OpNode, V4I32Regs,
+ i32op>;
+ def V2I32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "32")>, OpNode, V2I32Regs,
+ i32op>;
+ def V4I16 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I16Regs,
+ i16op>;
+ def V2I16 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I16Regs,
+ i16op>;
+ def V4I8 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I8Regs,
+ i8op>; // note: i8 vectors use the "16" suffix, like the i16 variants
+ def V2I8 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I8Regs,
+ i8op>; // note: i8 vectors use the "16" suffix, like the i16 variants
+}
+
+multiclass FloatBinVOp<string asmstr, SDNode OpNode,
+ NVPTXInst f64=NOP, NVPTXInst f32=NOP,
+ NVPTXInst f32_ftz=NOP> {
+ def V2F64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f64")>, OpNode,
+ V2F64Regs, f64>;
+ def V4F32_ftz : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+ V4F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+ def V2F32_ftz : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+ V2F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+ def V4F32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+ V4F32Regs, f32>;
+ def V2F32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+ V2F32Regs, f32>;
+}
+
+multiclass IntUnaryVOp<string asmstr, PatFrag OpNode,
+ NVPTXInst i64op=NOP, NVPTXInst i32op=NOP,
+ NVPTXInst i16op=NOP, NVPTXInst i8op=NOP> {
+ def V2I64 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "64")>, OpNode,
+ V2I64Regs, i64op>; // asmstr gets the element width appended as a suffix
+ def V4I32 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+ V4I32Regs, i32op>;
+ def V2I32 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+ V2I32Regs, i32op>;
+ def V4I16 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V4I16Regs, i16op>;
+ def V2I16 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V2I16Regs, i16op>;
+ def V4I8 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V4I8Regs, i8op>; // note: i8 vectors use the "16" suffix
+ def V2I8 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V2I8Regs, i8op>; // note: i8 vectors use the "16" suffix
+}
+
+
+// Integer Arithmetic
+let VecInstType=isVecOther.Value in {
+defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, ADDi32rr, ADDi16rr, ADDi8rr>;
+defm VSub : IntBinVOp<"sub.s", sub, SUBi64rr, SUBi32rr, SUBi16rr, SUBi8rr>;
+
+def AddCCV4I32 : VecBinaryOp<V4AsmStr<"add.cc.s32">, addc, V4I32Regs,
+ ADDCCi32rr>;
+def AddCCV2I32 : VecBinaryOp<V2AsmStr<"add.cc.s32">, addc, V2I32Regs,
+ ADDCCi32rr>;
+def SubCCV4I32 : VecBinaryOp<V4AsmStr<"sub.cc.s32">, subc, V4I32Regs,
+ SUBCCi32rr>;
+def SubCCV2I32 : VecBinaryOp<V2AsmStr<"sub.cc.s32">, subc, V2I32Regs,
+ SUBCCi32rr>;
+def AddCCCV4I32 : VecBinaryOp<V4AsmStr<"addc.cc.s32">, adde, V4I32Regs,
+ ADDCCCi32rr>;
+def AddCCCV2I32 : VecBinaryOp<V2AsmStr<"addc.cc.s32">, adde, V2I32Regs,
+ ADDCCCi32rr>;
+def SubCCCV4I32 : VecBinaryOp<V4AsmStr<"subc.cc.s32">, sube, V4I32Regs,
+ SUBCCCi32rr>;
+def SubCCCV2I32 : VecBinaryOp<V2AsmStr<"subc.cc.s32">, sube, V2I32Regs,
+ SUBCCCi32rr>;
+
+def ShiftLV2I64 : VecShiftOp<V2AsmStr<"shl.b64">, shl, V2I64Regs, V2I32Regs,
+ SHLi64rr>;
+def ShiftLV2I32 : VecShiftOp<V2AsmStr<"shl.b32">, shl, V2I32Regs, V2I32Regs,
+ SHLi32rr>;
+def ShiftLV4I32 : VecShiftOp<V4AsmStr<"shl.b32">, shl, V4I32Regs, V4I32Regs,
+ SHLi32rr>;
+def ShiftLV2I16 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I16Regs, V2I32Regs,
+ SHLi16rr>;
+def ShiftLV4I16 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I16Regs, V4I32Regs,
+ SHLi16rr>;
+def ShiftLV2I8 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I8Regs, V2I32Regs,
+ SHLi8rr>;
+def ShiftLV4I8 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I8Regs, V4I32Regs,
+ SHLi8rr>;
+}
+
+// cvt to v*i32, helpers for shift
+class CVTtoVeci32<NVPTXRegClass inclass, NVPTXRegClass outclass, string asmstr,
+ NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs outclass:$d), (ins inclass:$s), asmstr, [], sInst>;
+
+// One conversion line of the form "<op>\t<dest>, <src>;".
+class VecCVTStrHelper<string op, string dest, string src> {
+  string s = !strconcat(op, "\t", dest, ", ", src, ";");
+}
+
+class Vec2CVTStr<string op> { // lanes 0 and 1, joined by "\n\t"
+  string s = !strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s, "\n\t",
+                        VecCVTStrHelper<op, "${d}_1", "${s}_1">.s);
+}
+
+// Four-lane conversion text: lanes 0..3 are each converted with `op`, and the
+// per-lane lines are joined by "\n\t".
+class Vec4CVTStr<string op> {
+  string s = !strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s, "\n\t",
+                        VecCVTStrHelper<op, "${d}_1", "${s}_1">.s, "\n\t",
+                        VecCVTStrHelper<op, "${d}_2", "${s}_2">.s, "\n\t",
+                        VecCVTStrHelper<op, "${d}_3", "${s}_3">.s);
+}
+
+let VecInstType=isVecOther.Value in {
+def CVTv2i8tov2i32 : CVTtoVeci32<V2I8Regs, V2I32Regs,
+ Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv2i16tov2i32 : CVTtoVeci32<V2I16Regs, V2I32Regs,
+ Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv4i8tov4i32 : CVTtoVeci32<V4I8Regs, V4I32Regs,
+ Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv4i16tov4i32 : CVTtoVeci32<V4I16Regs, V4I32Regs,
+ Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv2i64tov2i32 : CVTtoVeci32<V2I64Regs, V2I32Regs,
+ Vec2CVTStr<"cvt.u32.u64">.s, TRUNC_64to32>;
+}
+
+def : Pat<(shl V2I16Regs:$src1, V2I16Regs:$src2),
+ (ShiftLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(shl V2I8Regs:$src1, V2I8Regs:$src2),
+ (ShiftLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(shl V2I64Regs:$src1, V2I64Regs:$src2),
+ (ShiftLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(shl V4I16Regs:$src1, V4I16Regs:$src2),
+ (ShiftLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(shl V4I8Regs:$src1, V4I8Regs:$src2),
+ (ShiftLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+let VecInstType=isVecOther.Value in {
+def ShiftRAV2I64 : VecShiftOp<V2AsmStr<"shr.s64">, sra, V2I64Regs, V2I32Regs,
+ SRAi64rr>;
+def ShiftRAV2I32 : VecShiftOp<V2AsmStr<"shr.s32">, sra, V2I32Regs, V2I32Regs,
+ SRAi32rr>;
+def ShiftRAV4I32 : VecShiftOp<V4AsmStr<"shr.s32">, sra, V4I32Regs, V4I32Regs,
+ SRAi32rr>;
+def ShiftRAV2I16 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I16Regs, V2I32Regs,
+ SRAi16rr>;
+def ShiftRAV4I16 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I16Regs, V4I32Regs,
+ SRAi16rr>;
+def ShiftRAV2I8 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I8Regs, V2I32Regs,
+ SRAi8rr>;
+def ShiftRAV4I8 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I8Regs, V4I32Regs,
+ SRAi8rr>;
+
+def ShiftRLV2I64 : VecShiftOp<V2AsmStr<"shr.u64">, srl, V2I64Regs, V2I32Regs,
+ SRLi64rr>;
+def ShiftRLV2I32 : VecShiftOp<V2AsmStr<"shr.u32">, srl, V2I32Regs, V2I32Regs,
+ SRLi32rr>;
+def ShiftRLV4I32 : VecShiftOp<V4AsmStr<"shr.u32">, srl, V4I32Regs, V4I32Regs,
+ SRLi32rr>;
+def ShiftRLV2I16 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I16Regs, V2I32Regs,
+ SRLi16rr>;
+def ShiftRLV4I16 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I16Regs, V4I32Regs,
+ SRLi16rr>;
+def ShiftRLV2I8 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I8Regs, V2I32Regs,
+ SRLi8rr>;
+def ShiftRLV4I8 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I8Regs, V4I32Regs,
+ SRLi8rr>;
+
+defm VMult : IntBinVOp<"mul.lo.s", mul, MULTi64rr, MULTi32rr, MULTi16rr,
+ MULTi8rr>;
+defm VMultHS : IntBinVOp<"mul.hi.s", mulhs, MULTHSi64rr, MULTHSi32rr,
+ MULTHSi16rr,
+ MULTHSi8rr>;
+defm VMultHU : IntBinVOp<"mul.hi.u", mulhu, MULTHUi64rr, MULTHUi32rr,
+ MULTHUi16rr,
+ MULTHUi8rr>;
+defm VSDiv : IntBinVOp<"div.s", sdiv, SDIVi64rr, SDIVi32rr, SDIVi16rr,
+ SDIVi8rr>;
+defm VUDiv : IntBinVOp<"div.u", udiv, UDIVi64rr, UDIVi32rr, UDIVi16rr,
+ UDIVi8rr>;
+defm VSRem : IntBinVOp<"rem.s", srem, SREMi64rr, SREMi32rr, SREMi16rr,
+ SREMi8rr>;
+defm VURem : IntBinVOp<"rem.u", urem, UREMi64rr, UREMi32rr, UREMi16rr,
+ UREMi8rr>;
+}
+
+def : Pat<(sra V2I16Regs:$src1, V2I16Regs:$src2),
+ (ShiftRAV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(sra V2I8Regs:$src1, V2I8Regs:$src2),
+ (ShiftRAV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(sra V2I64Regs:$src1, V2I64Regs:$src2),
+ (ShiftRAV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(sra V4I16Regs:$src1, V4I16Regs:$src2),
+ (ShiftRAV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(sra V4I8Regs:$src1, V4I8Regs:$src2),
+ (ShiftRAV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+def : Pat<(srl V2I16Regs:$src1, V2I16Regs:$src2),
+ (ShiftRLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(srl V2I8Regs:$src1, V2I8Regs:$src2),
+ (ShiftRLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(srl V2I64Regs:$src1, V2I64Regs:$src2),
+ (ShiftRLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(srl V4I16Regs:$src1, V4I16Regs:$src2),
+ (ShiftRLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(srl V4I8Regs:$src1, V4I8Regs:$src2),
+ (ShiftRLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+multiclass VMAD<string asmstr, NVPTXRegClass regclassv4,
+ NVPTXRegClass regclassv2,
+ SDNode an=add, SDNode mn=mul, NVPTXInst sop=NOP,
+ Predicate Pred> {
+ def V4 : NVPTXVecInst<(outs regclassv4:$dst),
+ (ins regclassv4:$a, regclassv4:$b, regclassv4:$c),
+ V4MADStr<asmstr>.s,
+ [(set regclassv4:$dst,
+ (an (mn regclassv4:$a, regclassv4:$b), regclassv4:$c))],
+ sop>,
+ Requires<[Pred]>;
+ def V2 : NVPTXVecInst<(outs regclassv2:$dst),
+ (ins regclassv2:$a, regclassv2:$b, regclassv2:$c),
+ V2MADStr<asmstr>.s,
+ [(set regclassv2:$dst,
+ (an (mn regclassv2:$a, regclassv2:$b), regclassv2:$c))],
+ sop>,
+ Requires<[Pred]>;
+}
+
+multiclass VMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+ Predicate Pred> {
+ def V2 : NVPTXVecInst<(outs regclass:$dst),
+ (ins regclass:$a, regclass:$b, regclass:$c),
+ V2MADStr<asmstr>.s,
+ [(set regclass:$dst, (add
+ (mul regclass:$a, regclass:$b), regclass:$c))], sop>,
+ Requires<[Pred]>;
+}
+multiclass VFMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+ Predicate Pred> {
+ def V2 : NVPTXVecInst<(outs regclass:$dst),
+ (ins regclass:$a, regclass:$b, regclass:$c),
+ V2MADStr<asmstr>.s,
+ [(set regclass:$dst, (fadd
+ (fmul regclass:$a, regclass:$b), regclass:$c))], sop>,
+ Requires<[Pred]>;
+}
+
+let VecInstType=isVecOther.Value in {
+defm I8MAD : VMAD<"mad.lo.s16", V4I8Regs, V2I8Regs, add, mul, MAD8rrr, true>;
+defm I16MAD : VMAD<"mad.lo.s16", V4I16Regs, V2I16Regs, add, mul, MAD16rrr,
+ true>;
+defm I32MAD : VMAD<"mad.lo.s32", V4I32Regs, V2I32Regs, add, mul, MAD32rrr,
+ true>;
+defm I64MAD : VMADV2Only<"mad.lo.s64", V2I64Regs, MAD64rrr, true>;
+
+defm VNeg : IntUnaryVOp<"neg.s", ineg, INEG64, INEG32, INEG16, INEG8>;
+
+defm VAddf : FloatBinVOp<"add.", fadd, FADDf64rr, FADDf32rr, FADDf32rr_ftz>;
+defm VSubf : FloatBinVOp<"sub.", fsub, FSUBf64rr, FSUBf32rr, FSUBf32rr_ftz>;
+defm VMulf : FloatBinVOp<"mul.", fmul, FMULf64rr, FMULf32rr, FMULf32rr_ftz>;
+
+defm F32MAD_ftz : VMAD<"mad.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+ FMAD32_ftzrrr, doFMADF32_ftz>;
+defm F32FMA_ftz : VMAD<"fma.rn.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+ FMA32_ftzrrr, doFMAF32_ftz>;
+defm F32MAD : VMAD<"mad.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMAD32rrr,
+ doFMADF32>;
+defm F32FMA : VMAD<"fma.rn.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMA32rrr,
+ doFMAF32>;
+defm F64FMA : VFMADV2Only<"fma.rn.f64", V2F64Regs, FMA64rrr, doFMAF64>;
+}
+
+// Vector fdiv definitions. For f32 there are four flavours per vector width:
+//   div.rn.ftz   - requires doF32FTZ and reqPTX20
+//   div.rn       - requires reqPTX20
+//   div.full.ftz - requires doF32FTZ
+//   div.full     - no predicate
+// f64 only has the V2 "div.rn.f64" form.
+let VecInstType=isVecOther.Value in {
+def V4F32Div_prec_ftz : VecBinaryOp<V4AsmStr<"div.rn.ftz.f32">, fdiv, V4F32Regs,
+ FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V2F32Div_prec_ftz : VecBinaryOp<V2AsmStr<"div.rn.ftz.f32">, fdiv, V2F32Regs,
+ FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V4F32Div_prec : VecBinaryOp<V4AsmStr<"div.rn.f32">, fdiv, V4F32Regs,
+ FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_prec : VecBinaryOp<V2AsmStr<"div.rn.f32">, fdiv, V2F32Regs,
+ FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_ftz : VecBinaryOp<V2AsmStr<"div.full.ftz.f32">, fdiv, V2F32Regs,
+ FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V4F32Div_ftz : VecBinaryOp<V4AsmStr<"div.full.ftz.f32">, fdiv, V4F32Regs,
+ FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V2F32Div : VecBinaryOp<V2AsmStr<"div.full.f32">, fdiv, V2F32Regs, FDIV32rr>;
+def V4F32Div : VecBinaryOp<V4AsmStr<"div.full.f32">, fdiv, V4F32Regs, FDIV32rr>;
+def V2F64Div : VecBinaryOp<V2AsmStr<"div.rn.f64">, fdiv, V2F64Regs, FDIV64rr>;
+}
+
+// PatFrag wrapper around fneg, used as the operator of the VNegv* definitions.
+def fnegpat : PatFrag<(ops node:$in), (fneg node:$in)>;
+
+// Floating-point vector negate (ftz forms require doF32FTZ) followed by the
+// integer bitwise operations, all tagged VecInstType=isVecOther.
+let VecInstType=isVecOther.Value in {
+def VNegv2f32_ftz : VecUnaryOp<V2UnaryStr<"neg.ftz.f32">, fnegpat, V2F32Regs,
+ FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv4f32_ftz : VecUnaryOp<V4UnaryStr<"neg.ftz.f32">, fnegpat, V4F32Regs,
+ FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv2f32 : VecUnaryOp<V2UnaryStr<"neg.f32">, fnegpat, V2F32Regs, FNEGf32>;
+def VNegv4f32 : VecUnaryOp<V4UnaryStr<"neg.f32">, fnegpat, V4F32Regs, FNEGf32>;
+def VNegv2f64 : VecUnaryOp<V2UnaryStr<"neg.f64">, fnegpat, V2F64Regs, FNEGf64>;
+
+// Logical Arithmetic
+defm VAnd : IntBinVOp<"and.b", and, ANDb64rr, ANDb32rr, ANDb16rr, ANDb8rr>;
+defm VOr : IntBinVOp<"or.b", or, ORb64rr, ORb32rr, ORb16rr, ORb8rr>;
+defm VXor : IntBinVOp<"xor.b", xor, XORb64rr, XORb32rr, XORb16rr, XORb8rr>;
+
+defm VNot : IntUnaryVOp<"not.b", not, NOT64, NOT32, NOT16, NOT8>;
+}
+
+
+// Contraction patterns for v2f32 subtraction, enabled under `Pred`. `Inst` is
+// a three-operand multiply-add instruction:
+//   a - (b * c)  ==>  Inst(-b, c, a)
+//   (a * b) - c  ==>  Inst(a, b, -c)
+// The negation is materialised with VNegv2f32.
+multiclass V2FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub V2F32Regs:$a, (fmul V2F32Regs:$b, V2F32Regs:$c)),
+ (Inst (VNegv2f32 V2F32Regs:$b), V2F32Regs:$c, V2F32Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul V2F32Regs:$a, V2F32Regs:$b), V2F32Regs:$c),
+ (Inst V2F32Regs:$a, V2F32Regs:$b, (VNegv2f32 V2F32Regs:$c))>,
+ Requires<[Pred]>;
+}
+
+// Instantiate the v2f32 patterns for each FMA/MAD flavour; the FMA forms are
+// gated by the *AGG predicates, the MAD forms by doFMADF32[_ftz].
+defm V2FMAF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32FMA_ftzV2, doFMAF32AGG_ftz>;
+defm V2FMADF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32MAD_ftzV2, doFMADF32_ftz>;
+defm V2FMAF32ext : V2FPCONTRACT32_SUB_PAT<F32FMAV2, doFMAF32AGG>;
+defm V2FMADF32ext : V2FPCONTRACT32_SUB_PAT<F32MADV2, doFMADF32>;
+
+// Contraction patterns for v4f32 subtraction, enabled under `Pred`:
+//   a - (b * c)  ==>  Inst(-b, c, a)
+//   (a * b) - c  ==>  Inst(a, b, -c)
+// The negation is materialised with VNegv4f32.
+multiclass V4FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub V4F32Regs:$a, (fmul V4F32Regs:$b, V4F32Regs:$c)),
+ (Inst (VNegv4f32 V4F32Regs:$b), V4F32Regs:$c, V4F32Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul V4F32Regs:$a, V4F32Regs:$b), V4F32Regs:$c),
+ (Inst V4F32Regs:$a, V4F32Regs:$b, (VNegv4f32 V4F32Regs:$c))>,
+ Requires<[Pred]>;
+}
+
+// Instantiate the v4f32 patterns for each FMA/MAD flavour.
+defm V4FMAF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32FMA_ftzV4, doFMAF32AGG_ftz>;
+defm V4FMADF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32MAD_ftzV4, doFMADF32_ftz>;
+defm V4FMAF32ext : V4FPCONTRACT32_SUB_PAT<F32FMAV4, doFMAF32AGG>;
+defm V4FMADF32ext : V4FPCONTRACT32_SUB_PAT<F32MADV4, doFMADF32>;
+
+// Contraction patterns for v2f64 subtraction, enabled under `Pred`:
+//   a - (b * c)  ==>  Inst(-b, c, a)
+//   (a * b) - c  ==>  Inst(a, b, -c)
+// The negation is materialised with VNegv2f64.
+multiclass V2FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub V2F64Regs:$a, (fmul V2F64Regs:$b, V2F64Regs:$c)),
+ (Inst (VNegv2f64 V2F64Regs:$b), V2F64Regs:$c, V2F64Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul V2F64Regs:$a, V2F64Regs:$b), V2F64Regs:$c),
+ (Inst V2F64Regs:$a, V2F64Regs:$b, (VNegv2f64 V2F64Regs:$c))>,
+ Requires<[Pred]>;
+}
+
+// Only the fma.rn.f64 V2 instruction exists, so there is a single instantiation.
+defm V2FMAF64ext : V2FPCONTRACT64_SUB_PAT<F64FMAV2, doFMAF64AGG>;
+
+// Builds an operand-modifier reference of the form
+//   "${c<elem>:vecv<vecsize><extra><l>}"
+// and exposes it as field `s`. For example <"4", "0", "comm", "1"> yields
+// "${c0:vecv4comm1}". How the modifier is interpreted is decided by the
+// instruction printer, which is outside this excerpt.
+class VecModStr<string vecsize, string elem, string extra, string l="">
+{
+ string t1 = !strconcat("${c", elem);
+ string t2 = !strconcat(t1, ":vecv");
+ string t3 = !strconcat(t2, vecsize);
+ string t4 = !strconcat(t3, extra);
+ string t5 = !strconcat(t4, l);
+ string s = !strconcat(t5, "}");
+}
+// Builds, in field `s`, the two asm lines that produce destination element
+// `elem` of a shuffle. Each line has the shape
+//   <comm-modifier>mov.<type> \t${dst}_<elem>, $srcK<pos-modifier>;
+// once for $src1 (modifier suffix "comm1") and once for $src2 ("comm2"), with
+// the lines joined by ";\n\t". The "comm"/"pos" modifiers are produced with
+// VecModStr on mask operand c<elem>; presumably the printer uses them to
+// comment out the line whose source is not selected - TODO confirm in the
+// asm printer.
+class ShuffleOneLine<string vecsize, string elem, string type>
+{
+ string t1 = VecModStr<vecsize, elem, "comm", "1">.s;
+ string t2 = !strconcat(t1, "mov.");
+ string t3 = !strconcat(t2, type);
+ string t4 = !strconcat(t3, " \t${dst}_");
+ string t5 = !strconcat(t4, elem);
+ string t6 = !strconcat(t5, ", $src1");
+ string t7 = !strconcat(t6, VecModStr<vecsize, elem, "pos">.s);
+ string t8 = !strconcat(t7, ";\n\t");
+ string t9 = !strconcat(t8, VecModStr<vecsize, elem, "comm", "2">.s);
+ string t10 = !strconcat(t9, "mov.");
+ string t11 = !strconcat(t10, type);
+ string t12 = !strconcat(t11, " \t${dst}_");
+ string t13 = !strconcat(t12, elem);
+ string t14 = !strconcat(t13, ", $src2");
+ string t15 = !strconcat(t14, VecModStr<vecsize, elem, "pos">.s);
+ string s = !strconcat(t15, ";");
+}
+// Full asm text for a 2-element shuffle: the ShuffleOneLine strings for
+// elements 0 and 1, separated by "\n\t".
+class ShuffleAsmStr2<string type>
+{
+ string t1 = ShuffleOneLine<"2", "0", type>.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string s = !strconcat(t2, ShuffleOneLine<"2", "1", type>.s);
+}
+// Full asm text for a 4-element shuffle: the ShuffleOneLine strings for
+// elements 0 through 3, separated by "\n\t".
+class ShuffleAsmStr4<string type>
+{
+ string t1 = ShuffleOneLine<"4", "0", type>.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string t3 = !strconcat(t2, ShuffleOneLine<"4", "1", type>.s);
+ string t4 = !strconcat(t3, "\n\t");
+ string t5 = !strconcat(t4, ShuffleOneLine<"4", "2", type>.s);
+ string t6 = !strconcat(t5, "\n\t");
+ string s = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s);
+}
+
+// Vector shuffle instructions, one per vector type. Each takes two source
+// vectors plus one i8 immediate per destination element (the shuffle mask) and
+// has an empty pattern list; selection happens through the separate
+// `def : Pat` entries that use vec_shuf. The asm string starts with a
+// "//Mov ..." comment line followed by the per-element moves from
+// ShuffleAsmStr2/ShuffleAsmStr4.
+// Note: the i8 variants use the "u16" type string and there are no 4-element
+// variants for the 64-bit element types.
+let hasSideEffects=0, VecInstType=isVecShuffle.Value in {
+def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst),
+ (ins V4F32Regs:$src1, V4F32Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"f32">.s),
+ [], FMOV32rr>;
+
+def VecShuffle_v4i32 : NVPTXVecInst<(outs V4I32Regs:$dst),
+ (ins V4I32Regs:$src1, V4I32Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"u32">.s),
+ [], IMOV32rr>;
+
+def VecShuffle_v4i16 : NVPTXVecInst<(outs V4I16Regs:$dst),
+ (ins V4I16Regs:$src1, V4I16Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"u16">.s),
+ [], IMOV16rr>;
+
+def VecShuffle_v4i8 : NVPTXVecInst<(outs V4I8Regs:$dst),
+ (ins V4I8Regs:$src1, V4I8Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"u16">.s),
+ [], IMOV8rr>;
+
+def VecShuffle_v2f32 : NVPTXVecInst<(outs V2F32Regs:$dst),
+ (ins V2F32Regs:$src1, V2F32Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"f32">.s),
+ [], FMOV32rr>;
+
+def VecShuffle_v2i32 : NVPTXVecInst<(outs V2I32Regs:$dst),
+ (ins V2I32Regs:$src1, V2I32Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u32">.s),
+ [], IMOV32rr>;
+
+def VecShuffle_v2i8 : NVPTXVecInst<(outs V2I8Regs:$dst),
+ (ins V2I8Regs:$src1, V2I8Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u16">.s),
+ [], IMOV8rr>;
+
+def VecShuffle_v2i16 : NVPTXVecInst<(outs V2I16Regs:$dst),
+ (ins V2I16Regs:$src1, V2I16Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u16">.s),
+ [], IMOV16rr>;
+
+def VecShuffle_v2f64 : NVPTXVecInst<(outs V2F64Regs:$dst),
+ (ins V2F64Regs:$src1, V2F64Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"f64">.s),
+ [], FMOV64rr>;
+
+def VecShuffle_v2i64 : NVPTXVecInst<(outs V2I64Regs:$dst),
+ (ins V2I64Regs:$src1, V2I64Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u64">.s),
+ [], IMOV64rr>;
+}
+
+// ShuffleMaskK: node transforms that read mask element K of a vector_shuffle
+// node (via ShuffleVectorSDNode::getMaskElt) and return it as an i32 target
+// constant. One transform exists per mask position 0..3.
+def ShuffleMask0 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(0), SDLoc(N), MVT::i32);
+}]>;
+def ShuffleMask1 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(1), SDLoc(N), MVT::i32);
+}]>;
+def ShuffleMask2 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(2), SDLoc(N), MVT::i32);
+}]>;
+def ShuffleMask3 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(3), SDLoc(N), MVT::i32);
+}]>;
+
+// vec_shuf: PatFrag matching any vector_shuffle; its predicate always returns
+// true. The spurious call is here to silence a compiler warning about N being
+// unused.
+def vec_shuf : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs),
+ [{ N->getGluedNode(); return true; }]>;
+
+// Selection patterns for vector_shuffle. Each vector type maps to the
+// matching VecShuffle_* instruction; the mask elements are extracted from the
+// matched node ($op) with ShuffleMask0..3 and passed as the immediate
+// operands (two for 2-element vectors, four for 4-element vectors).
+def : Pat<(v2f64 (vec_shuf:$op V2F64Regs:$src1, V2F64Regs:$src2)),
+ (VecShuffle_v2f64 V2F64Regs:$src1, V2F64Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4f32 (vec_shuf:$op V4F32Regs:$src1, V4F32Regs:$src2)),
+ (VecShuffle_v4f32 V4F32Regs:$src1, V4F32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2f32 (vec_shuf:$op V2F32Regs:$src1, V2F32Regs:$src2)),
+ (VecShuffle_v2f32 V2F32Regs:$src1, V2F32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v2i64 (vec_shuf:$op V2I64Regs:$src1, V2I64Regs:$src2)),
+ (VecShuffle_v2i64 V2I64Regs:$src1, V2I64Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i32 (vec_shuf:$op V4I32Regs:$src1, V4I32Regs:$src2)),
+ (VecShuffle_v4i32 V4I32Regs:$src1, V4I32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i32 (vec_shuf:$op V2I32Regs:$src1, V2I32Regs:$src2)),
+ (VecShuffle_v2i32 V2I32Regs:$src1, V2I32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i16 (vec_shuf:$op V4I16Regs:$src1, V4I16Regs:$src2)),
+ (VecShuffle_v4i16 V4I16Regs:$src1, V4I16Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i16 (vec_shuf:$op V2I16Regs:$src1, V2I16Regs:$src2)),
+ (VecShuffle_v2i16 V2I16Regs:$src1, V2I16Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i8 (vec_shuf:$op V4I8Regs:$src1, V4I8Regs:$src2)),
+ (VecShuffle_v4i8 V4I8Regs:$src1, V4I8Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i8 (vec_shuf:$op V2I8Regs:$src1, V2I8Regs:$src2)),
+ (VecShuffle_v2i8 V2I8Regs:$src1, V2I8Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+// Build_Vector2: instruction that assembles a 2-element vector of class
+// `vclass` from two scalars of class `sclass`; matches build_vector.
+// Emits "<asmstr>\t${dst:vecfull}, {$a1, $a2};" ("{{"/"}}" are the escaped
+// braces of the asm-string syntax). `si` is forwarded to NVPTXVecInst.
+class Build_Vector2<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+ NVPTXInst si>
+ : NVPTXVecInst<(outs vclass:$dst),
+ (ins sclass:$a1, sclass:$a2),
+ !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2}};"),
+ [(set vclass:$dst, (build_vector sclass:$a1, sclass:$a2))],
+ si>;
+// Build_Vector4: instruction that assembles a 4-element vector of class
+// `vclass` from four scalars of class `sclass`; matches build_vector.
+// Emits "<asmstr>\t${dst:vecfull}, {$a1, $a2, $a3, $a4};". `si` is forwarded
+// to NVPTXVecInst.
+class Build_Vector4<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+ NVPTXInst si>
+ : NVPTXVecInst<(outs vclass:$dst),
+ (ins sclass:$a1, sclass:$a2, sclass:$a3, sclass:$a4),
+ !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2, $a3, $a4}};"),
+ [(set vclass:$dst,
+ (build_vector sclass:$a1, sclass:$a2,
+ sclass:$a3, sclass:$a4))], si>;
+
+// Concrete build_vector instructions for every supported vector type, marked
+// as cheap as a move and tagged VecInstType=isVecBuild. The i8 variants use
+// the u16 mnemonic ("mov.v2.u16" / "mov.v4.u16"). There are no 4-element
+// variants for the 64-bit element types.
+let isAsCheapAsAMove=1, VecInstType=isVecBuild.Value in {
+def Build_Vector2_f32 : Build_Vector2<"mov.v2.f32", V2F32Regs, Float32Regs,
+ FMOV32rr>;
+def Build_Vector2_f64 : Build_Vector2<"mov.v2.f64", V2F64Regs, Float64Regs,
+ FMOV64rr>;
+
+def Build_Vector2_i32 : Build_Vector2<"mov.v2.u32", V2I32Regs, Int32Regs,
+ IMOV32rr>;
+def Build_Vector2_i64 : Build_Vector2<"mov.v2.u64", V2I64Regs, Int64Regs,
+ IMOV64rr>;
+def Build_Vector2_i16 : Build_Vector2<"mov.v2.u16", V2I16Regs, Int16Regs,
+ IMOV16rr>;
+def Build_Vector2_i8 : Build_Vector2<"mov.v2.u16", V2I8Regs, Int8Regs,
+ IMOV8rr>;
+
+def Build_Vector4_f32 : Build_Vector4<"mov.v4.f32", V4F32Regs, Float32Regs,
+ FMOV32rr>;
+
+def Build_Vector4_i32 : Build_Vector4<"mov.v4.u32", V4I32Regs, Int32Regs,
+ IMOV32rr>;
+def Build_Vector4_i16 : Build_Vector4<"mov.v4.u16", V4I16Regs, Int16Regs,
+ IMOV16rr>;
+def Build_Vector4_i8 : Build_Vector4<"mov.v4.u16", V4I8Regs, Int8Regs,
+ IMOV8rr>;
+}
+
+// Vec_Move: whole-vector register-to-register move with no selection pattern.
+// Emits "<asmstr>\t${dst:vecfull}, ${src:vecfull};". `sop` defaults to NOP and
+// is forwarded to NVPTXVecInst.
+class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP>
+ : NVPTXVecInst<(outs vclass:$dst), (ins vclass:$src),
+ !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"),
+ [], sop>;
+
+// Concrete vector moves for every vector register class. All are flagged as
+// side-effect free, as cheap as a move, and IsSimpleMove. The i8 variants use
+// the u16 mnemonic.
+let isAsCheapAsAMove=1, hasSideEffects=0, IsSimpleMove=1,
+ VecInstType=isVecOther.Value in {
+def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>;
+def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>;
+
+def V4i32Mov : Vec_Move<"mov.v4.u32", V4I32Regs, IMOV32rr>;
+def V2i32Mov : Vec_Move<"mov.v2.u32", V2I32Regs, IMOV32rr>;
+
+def V4i16Mov : Vec_Move<"mov.v4.u16", V4I16Regs, IMOV16rr>;
+def V2i16Mov : Vec_Move<"mov.v2.u16", V2I16Regs, IMOV16rr>;
+
+def V4i8Mov : Vec_Move<"mov.v4.u16", V4I8Regs, IMOV8rr>;
+def V2i8Mov : Vec_Move<"mov.v2.u16", V2I8Regs, IMOV8rr>;
+
+def V2f64Mov : Vec_Move<"mov.v2.f64", V2F64Regs, FMOV64rr>;
+def V2i64Mov : Vec_Move<"mov.v2.u64", V2I64Regs, IMOV64rr>;
+}
+
+// extract subvector patterns
+// extract_subvec wraps ISD::EXTRACT_SUBVECTOR (one result, two operands, the
+// second constrained to pointer type). Only the two aligned halves of a
+// 4-element vector are handled: start index 0 yields elements {0,1} and start
+// index 2 yields elements {2,3}. Each half is rebuilt with Build_Vector2_*
+// from two scalar element extracts.
+def extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
+ SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>>;
+
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 0)),
+ (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 0),
+ (V4f32Extract V4F32Regs:$src, 1))>;
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 2)),
+ (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 2),
+ (V4f32Extract V4F32Regs:$src, 3))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 0)),
+ (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 0),
+ (V4i32Extract V4I32Regs:$src, 1))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 2)),
+ (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 2),
+ (V4i32Extract V4I32Regs:$src, 3))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 0)),
+ (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 0),
+ (V4i16Extract V4I16Regs:$src, 1))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 2)),
+ (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 2),
+ (V4i16Extract V4I16Regs:$src, 3))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 0)),
+ (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 0),
+ (V4i8Extract V4I8Regs:$src, 1))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 2)),
+ (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 2),
+ (V4i8Extract V4I8Regs:$src, 3))>;
+
+// Select instructions
+// Select_OneLine builds, in field `s`, one selp line for element `pos`:
+//   "selp.<type> \t${dst}_<pos>, ${src1}_<pos>, ${src2}_<pos>, $p;"
+class Select_OneLine<string type, string pos> {
+ string t1 = !strconcat("selp.", type);
+ string t2 = !strconcat(t1, " \t${dst}_");
+ string t3 = !strconcat(t2, pos);
+ string t4 = !strconcat(t3, ", ${src1}_");
+ string t5 = !strconcat(t4, pos);
+ string t6 = !strconcat(t5, ", ${src2}_");
+ string t7 = !strconcat(t6, pos);
+ string s = !strconcat(t7, ", $p;");
+}
+
+// Select_Str2: asm text for a 2-element vector select - the Select_OneLine
+// strings for elements 0 and 1 joined by "\n\t".
+class Select_Str2<string type> {
+ string t1 = Select_OneLine<type, "0">.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string s = !strconcat(t2, Select_OneLine<type, "1">.s);
+}
+
+// Select_Str4: asm text for a 4-element vector select - the Select_OneLine
+// strings for elements 0 through 3 joined by "\n\t".
+class Select_Str4<string type> {
+ string t1 = Select_OneLine<type, "0">.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string t3 = !strconcat(t2, Select_OneLine<type, "1">.s);
+ string t4 = !strconcat(t3, "\n\t");
+ string t5 = !strconcat(t4, Select_OneLine<type, "2">.s);
+ string t6 = !strconcat(t5, "\n\t");
+ string s = !strconcat(t6, Select_OneLine<type, "3">.s);
+
+}
+
+// Vec_Select: whole-vector select on a single i1 predicate $p. Matches
+// (select $p, $src1, $src2) on `vclass` and prints the supplied `asmstr`
+// (built with Select_Str2 / Select_Str4 by the definitions that use this
+// class). `sop` is forwarded to NVPTXVecInst.
+class Vec_Select<NVPTXRegClass vclass, string asmstr, NVPTXInst sop>
+ : NVPTXVecInst<(outs vclass:$dst),
+ (ins vclass:$src1, vclass:$src2, Int1Regs:$p),
+ asmstr,
+ [(set vclass:$dst, (select Int1Regs:$p, vclass:$src1,
+ vclass:$src2))],
+ sop>;
+
+// Concrete vector selects. Integer vectors use the untyped bNN selp forms
+// (i8 vectors use b16); floating-point vectors use f32/f64.
+let VecInstType=isVecOther.Value in {
+def V2I64_Select : Vec_Select<V2I64Regs, Select_Str2<"b64">.s, SELECTi64rr>;
+def V4I32_Select : Vec_Select<V4I32Regs, Select_Str4<"b32">.s, SELECTi32rr>;
+def V2I32_Select : Vec_Select<V2I32Regs, Select_Str2<"b32">.s, SELECTi32rr>;
+def V4I16_Select : Vec_Select<V4I16Regs, Select_Str4<"b16">.s, SELECTi16rr>;
+def V2I16_Select : Vec_Select<V2I16Regs, Select_Str2<"b16">.s, SELECTi16rr>;
+def V4I8_Select : Vec_Select<V4I8Regs, Select_Str4<"b16">.s, SELECTi8rr>;
+def V2I8_Select : Vec_Select<V2I8Regs, Select_Str2<"b16">.s, SELECTi8rr>;
+
+def V2F64_Select : Vec_Select<V2F64Regs, Select_Str2<"f64">.s, SELECTf64rr>;
+def V4F32_Select : Vec_Select<V4F32Regs, Select_Str4<"f32">.s, SELECTf32rr>;
+def V2F32_Select : Vec_Select<V2F32Regs, Select_Str2<"f32">.s, SELECTf32rr>;
+}
+
+// Comparison instructions
+
+// setcc convenience fragments.
+// One PatFrag per condition code: vset<cc> matches (setcc lhs, rhs, SET<CC>).
+// The list covers the ordered (SETO*), unordered (SETU*) and plain condition
+// codes, plus SETO / SETUO themselves.
+def vsetoeq : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOEQ)>;
+def vsetogt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOGT)>;
+def vsetoge : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOGE)>;
+def vsetolt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOLT)>;
+def vsetole : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOLE)>;
+def vsetone : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETONE)>;
+def vseto : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETO)>;
+def vsetuo : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUO)>;
+def vsetueq : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUEQ)>;
+def vsetugt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUGT)>;
+def vsetuge : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUGE)>;
+def vsetult : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETULT)>;
+def vsetule : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETULE)>;
+def vsetune : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUNE)>;
+def vseteq : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETEQ)>;
+def vsetgt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETGT)>;
+def vsetge : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETGE)>;
+def vsetlt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETLT)>;
+def vsetle : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETLE)>;
+def vsetne : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETNE)>;
+// Vec_Compare: vector comparison matching (op $a, $b) with inputs in
+// `inrclass` and the result in `outrclass`. The asm string is the literal
+// "Unsupported", so this record is presumably never printed directly and is
+// expected to be replaced using `sop` before emission - TODO confirm against
+// the pass that consumes NVPTXVecInst.
+class Vec_Compare<PatFrag op, NVPTXRegClass outrclass, NVPTXRegClass inrclass,
+ NVPTXInst sop>
+ : NVPTXVecInst<(outs outrclass:$dst),
+ (ins inrclass:$a, inrclass:$b),
+ "Unsupported",
+ [(set outrclass:$dst, (op inrclass:$a, inrclass:$b))],
+ sop>;
+
+// Vec_Compare_All: instantiates Vec_Compare for every integer vector type.
+// Input and result use the same register class, and the instN argument is the
+// instruction passed as `sop` for element width N. Only the 2-element form is
+// defined for i64.
+multiclass Vec_Compare_All<PatFrag op,
+ NVPTXInst inst8,
+ NVPTXInst inst16,
+ NVPTXInst inst32,
+ NVPTXInst inst64>
+{
+ def V2I8 : Vec_Compare<op, V2I8Regs, V2I8Regs, inst8>;
+ def V4I8 : Vec_Compare<op, V4I8Regs, V4I8Regs, inst8>;
+ def V2I16 : Vec_Compare<op, V2I16Regs, V2I16Regs, inst16>;
+ def V4I16 : Vec_Compare<op, V4I16Regs, V4I16Regs, inst16>;
+ def V2I32 : Vec_Compare<op, V2I32Regs, V2I32Regs, inst32>;
+ def V4I32 : Vec_Compare<op, V4I32Regs, V4I32Regs, inst32>;
+ def V2I64 : Vec_Compare<op, V2I64Regs, V2I64Regs, inst64>;
+}
+
+// Integer vector comparisons. Each defm pairs one setcc fragment with the
+// ISet<CC>iN rr_toiN instructions for 8/16/32/64-bit elements. The "S"
+// variants use the plain condition-code fragments (vsetgt, vsetlt, ...) and
+// the "U" variants use the vsetu* fragments.
+let VecInstType=isVecOther.Value in {
+ defm VecSGT : Vec_Compare_All<vsetgt, ISetSGTi8rr_toi8, ISetSGTi16rr_toi16,
+ ISetSGTi32rr_toi32, ISetSGTi64rr_toi64>;
+ defm VecUGT : Vec_Compare_All<vsetugt, ISetUGTi8rr_toi8, ISetUGTi16rr_toi16,
+ ISetUGTi32rr_toi32, ISetUGTi64rr_toi64>;
+ defm VecSLT : Vec_Compare_All<vsetlt, ISetSLTi8rr_toi8, ISetSLTi16rr_toi16,
+ ISetSLTi32rr_toi32, ISetSLTi64rr_toi64>;
+ defm VecULT : Vec_Compare_All<vsetult, ISetULTi8rr_toi8, ISetULTi16rr_toi16,
+ ISetULTi32rr_toi32, ISetULTi64rr_toi64>;
+ defm VecSGE : Vec_Compare_All<vsetge, ISetSGEi8rr_toi8, ISetSGEi16rr_toi16,
+ ISetSGEi32rr_toi32, ISetSGEi64rr_toi64>;
+ defm VecUGE : Vec_Compare_All<vsetuge, ISetUGEi8rr_toi8, ISetUGEi16rr_toi16,
+ ISetUGEi32rr_toi32, ISetUGEi64rr_toi64>;
+ defm VecSLE : Vec_Compare_All<vsetle, ISetSLEi8rr_toi8, ISetSLEi16rr_toi16,
+ ISetSLEi32rr_toi32, ISetSLEi64rr_toi64>;
+ defm VecULE : Vec_Compare_All<vsetule, ISetULEi8rr_toi8, ISetULEi16rr_toi16,
+ ISetULEi32rr_toi32, ISetULEi64rr_toi64>;
+ defm VecSEQ : Vec_Compare_All<vseteq, ISetSEQi8rr_toi8, ISetSEQi16rr_toi16,
+ ISetSEQi32rr_toi32, ISetSEQi64rr_toi64>;
+ defm VecUEQ : Vec_Compare_All<vsetueq, ISetUEQi8rr_toi8, ISetUEQi16rr_toi16,
+ ISetUEQi32rr_toi32, ISetUEQi64rr_toi64>;
+ defm VecSNE : Vec_Compare_All<vsetne, ISetSNEi8rr_toi8, ISetSNEi16rr_toi16,
+ ISetSNEi32rr_toi32, ISetSNEi64rr_toi64>;
+ defm VecUNE : Vec_Compare_All<vsetune, ISetUNEi8rr_toi8, ISetUNEi16rr_toi16,
+ ISetUNEi32rr_toi32, ISetUNEi64rr_toi64>;
+}
+
+// FVec_Compare_All: instantiates Vec_Compare for the floating-point vector
+// types. The result is an integer vector with the same element width as the
+// input (v2f32 -> V2I32Regs, v4f32 -> V4I32Regs, v2f64 -> V2I64Regs).
+multiclass FVec_Compare_All<PatFrag op,
+ NVPTXInst instf32,
+ NVPTXInst instf64>
+{
+ def V2F32 : Vec_Compare<op, V2I32Regs, V2F32Regs, instf32>;
+ def V4F32 : Vec_Compare<op, V4I32Regs, V4F32Regs, instf32>;
+ def V2F64 : Vec_Compare<op, V2I64Regs, V2F64Regs, instf64>;
+}
+
+// Floating-point vector comparisons, in three groups: ordered (vseto*
+// fragments), unordered (vsetu* fragments), and the ordered/unordered tests
+// themselves (vseto -> FSetNUM*, vsetuo -> FSetNAN*).
+let VecInstType=isVecOther.Value in {
+ defm FVecGT : FVec_Compare_All<vsetogt, FSetGTf32rr_toi32,
+ FSetGTf64rr_toi64>;
+ defm FVecLT : FVec_Compare_All<vsetolt, FSetLTf32rr_toi32,
+ FSetLTf64rr_toi64>;
+ defm FVecGE : FVec_Compare_All<vsetoge, FSetGEf32rr_toi32,
+ FSetGEf64rr_toi64>;
+ defm FVecLE : FVec_Compare_All<vsetole, FSetLEf32rr_toi32,
+ FSetLEf64rr_toi64>;
+ defm FVecEQ : FVec_Compare_All<vsetoeq, FSetEQf32rr_toi32,
+ FSetEQf64rr_toi64>;
+ defm FVecNE : FVec_Compare_All<vsetone, FSetNEf32rr_toi32,
+ FSetNEf64rr_toi64>;
+
+ defm FVecUGT : FVec_Compare_All<vsetugt, FSetUGTf32rr_toi32,
+ FSetUGTf64rr_toi64>;
+ defm FVecULT : FVec_Compare_All<vsetult, FSetULTf32rr_toi32,
+ FSetULTf64rr_toi64>;
+ defm FVecUGE : FVec_Compare_All<vsetuge, FSetUGEf32rr_toi32,
+ FSetUGEf64rr_toi64>;
+ defm FVecULE : FVec_Compare_All<vsetule, FSetULEf32rr_toi32,
+ FSetULEf64rr_toi64>;
+ defm FVecUEQ : FVec_Compare_All<vsetueq, FSetUEQf32rr_toi32,
+ FSetUEQf64rr_toi64>;
+ defm FVecUNE : FVec_Compare_All<vsetune, FSetUNEf32rr_toi32,
+ FSetUNEf64rr_toi64>;
+
+ defm FVecNUM : FVec_Compare_All<vseto, FSetNUMf32rr_toi32,
+ FSetNUMf64rr_toi64>;
+ defm FVecNAN : FVec_Compare_All<vsetuo, FSetNANf32rr_toi32,
+ FSetNANf64rr_toi64>;
+}
+
+// LoadParamScalar4Inst: loads four scalars of `regclass` with one vector
+// ld.param. Emits "ld.param<opstr>\t{$d1, $d2, $d3, $d4}, [retval0+$b];".
+// Operand $a is declared but does not appear in the asm string; the base is
+// always the literal "retval0". No selection pattern is attached.
+class LoadParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$d1, regclass:$d2, regclass:$d3, regclass:$d4),
+ (ins i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t{{$d1, $d2, $d3, $d4}}, [retval0+$b];"), []>;
+
+// LoadParamScalar2Inst: two-result form of the above. Emits
+// "ld.param<opstr>\t{$d1, $d2}, [retval0+$b];".
+class LoadParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$d1, regclass:$d2),
+ (ins i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t{{$d1, $d2}}, [retval0+$b];"), []>;
+
+
+// StoreParamScalar4Inst: stores four scalars of `regclass` with one vector
+// st.param. Emits "st.param<opstr>\t[param$a+$b], {$s1, $s2, $s3, $s4};",
+// so $a is spliced into the parameter name and $b is the offset. No selection
+// pattern is attached.
+class StoreParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+ i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[param$a+$b], {{$s1, $s2, $s3, $s4}};"), []>;
+
+// StoreParamScalar2Inst: two-value form of the above. Emits
+// "st.param<opstr>\t[param$a+$b], {$s1, $s2};".
+class StoreParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[param$a+$b], {{$s1, $s2}};"), []>;
+
+// StoreRetvalScalar4Inst: stores four scalars of `regclass` into the return
+// value slot. Emits
+// "st.param<opstr>\t[func_retval+$a], {$s1, $s2, $s3, $s4};" with $a as the
+// offset. No selection pattern is attached.
+class StoreRetvalScalar4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+ i32imm:$a),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[func_retval+$a], {{$s1, $s2, $s3, $s4}};"), []>;
+
+// StoreRetvalScalar2Inst: two-value form of the above. Emits
+// "st.param<opstr>\t[func_retval+$a], {$s1, $s2};".
+class StoreRetvalScalar2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, i32imm:$a),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[func_retval+$a], {{$s1, $s2}};"), []>;
+
+// Scalar-register forms of the vector ld.param instructions, one per element
+// type. The register class of each definition matches the element width named
+// in its suffix, mirroring the StoreParamScalar* / StoreRetvalScalar*
+// definitions (the 2-element i64/i16/i8 loads previously all used Int32Regs,
+// which disagreed with their .b64/.b16/.b8 suffixes).
+def LoadParamScalar4I32 : LoadParamScalar4Inst<Int32Regs, ".v4.b32">;
+def LoadParamScalar4I16 : LoadParamScalar4Inst<Int16Regs, ".v4.b16">;
+def LoadParamScalar4I8 : LoadParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def LoadParamScalar2I64 : LoadParamScalar2Inst<Int64Regs, ".v2.b64">;
+def LoadParamScalar2I32 : LoadParamScalar2Inst<Int32Regs, ".v2.b32">;
+def LoadParamScalar2I16 : LoadParamScalar2Inst<Int16Regs, ".v2.b16">;
+def LoadParamScalar2I8 : LoadParamScalar2Inst<Int8Regs, ".v2.b8">;
+
+def LoadParamScalar4F32 : LoadParamScalar4Inst<Float32Regs, ".v4.f32">;
+def LoadParamScalar2F32 : LoadParamScalar2Inst<Float32Regs, ".v2.f32">;
+def LoadParamScalar2F64 : LoadParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+// Scalar-register forms of the vector st.param instructions for outgoing call
+// parameters, one per element type; the register class matches the element
+// width in the suffix.
+def StoreParamScalar4I32 : StoreParamScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreParamScalar4I16 : StoreParamScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreParamScalar4I8 : StoreParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreParamScalar2I64 : StoreParamScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreParamScalar2I32 : StoreParamScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreParamScalar2I16 : StoreParamScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreParamScalar2I8 : StoreParamScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreParamScalar4F32 : StoreParamScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreParamScalar2F32 : StoreParamScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreParamScalar2F64 : StoreParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+// Same set for stores into the function return value (func_retval).
+def StoreRetvalScalar4I32 : StoreRetvalScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreRetvalScalar4I16 : StoreRetvalScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreRetvalScalar4I8 : StoreRetvalScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreRetvalScalar2I64 : StoreRetvalScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreRetvalScalar2I32 : StoreRetvalScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreRetvalScalar2I16 : StoreRetvalScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreRetvalScalar2I8 : StoreRetvalScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreRetvalScalar4F32 : StoreRetvalScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreRetvalScalar2F32 : StoreRetvalScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreRetvalScalar2F64 : StoreRetvalScalar2Inst<Float64Regs, ".v2.f64">;
+
+// LoadParamVecInst: vector-register form of a parameter load; matches
+// (LoadParam imm:$a, imm:$b). The asm string is a fixed placeholder
+// ("loadparam : ...") and the `opstr` argument is not used in it; `sop` is
+// forwarded to NVPTXVecInst.
+class LoadParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>:
+ NVPTXVecInst<(outs regclass:$dst), (ins i32imm:$a, i32imm:$b),
+ "loadparam : $dst <- [$a, $b]",
+ [(set regclass:$dst, (LoadParam (i32 imm:$a), (i32 imm:$b)))],
+ sop>;
+
+// StoreParamVecInst: vector-register form of a parameter store; matches
+// (StoreParam imm:$a, imm:$b, $val). Placeholder asm string; `opstr` unused.
+class StoreParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>
+ : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+ "storeparam : [$a, $b] <- $val",
+ [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)], sop>;
+
+// StoreRetvalVecInst: vector-register form of a return-value store; matches
+// (StoreRetval imm:$a, $val). Placeholder asm string; `opstr` unused.
+class StoreRetvalVecInst<NVPTXRegClass regclass, string opstr,
+ NVPTXInst sop=NOP>
+ : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a),
+ "storeretval : retval[$a] <- $val",
+ [(StoreRetval (i32 imm:$a), regclass:$val)], sop>;
+
+// Vector parameter loads, tagged VecInstType=isVecLD. Each one names the
+// LoadParamScalar* instruction of matching width and element type as its
+// `sop`.
+let VecInstType=isVecLD.Value in {
+def LoadParamV4I32 : LoadParamVecInst<V4I32Regs, ".v4.b32",
+ LoadParamScalar4I32>;
+def LoadParamV4I16 : LoadParamVecInst<V4I16Regs, ".v4.b16",
+ LoadParamScalar4I16>;
+def LoadParamV4I8 : LoadParamVecInst<V4I8Regs, ".v4.b8",
+ LoadParamScalar4I8>;
+
+def LoadParamV2I64 : LoadParamVecInst<V2I64Regs, ".v2.b64",
+ LoadParamScalar2I64>;
+def LoadParamV2I32 : LoadParamVecInst<V2I32Regs, ".v2.b32",
+ LoadParamScalar2I32>;
+def LoadParamV2I16 : LoadParamVecInst<V2I16Regs, ".v2.b16",
+ LoadParamScalar2I16>;
+def LoadParamV2I8 : LoadParamVecInst<V2I8Regs, ".v2.b8",
+ LoadParamScalar2I8>;
+
+def LoadParamV4F32 : LoadParamVecInst<V4F32Regs, ".v4.f32",
+ LoadParamScalar4F32>;
+def LoadParamV2F32 : LoadParamVecInst<V2F32Regs, ".v2.f32",
+ LoadParamScalar2F32>;
+def LoadParamV2F64 : LoadParamVecInst<V2F64Regs, ".v2.f64",
+ LoadParamScalar2F64>;
+}
+
+// Vector parameter stores and return-value stores, tagged
+// VecInstType=isVecST. Each one names the StoreParamScalar* /
+// StoreRetvalScalar* instruction of matching width and element type as its
+// `sop`.
+let VecInstType=isVecST.Value in {
+def StoreParamV4I32 : StoreParamVecInst<V4I32Regs, ".v4.b32",
+ StoreParamScalar4I32>;
+def StoreParamV4I16 : StoreParamVecInst<V4I16Regs, ".v4.b16",
+ StoreParamScalar4I16>;
+def StoreParamV4I8 : StoreParamVecInst<V4I8Regs, ".v4.b8",
+ StoreParamScalar4I8>;
+
+def StoreParamV2I64 : StoreParamVecInst<V2I64Regs, ".v2.b64",
+ StoreParamScalar2I64>;
+def StoreParamV2I32 : StoreParamVecInst<V2I32Regs, ".v2.b32",
+ StoreParamScalar2I32>;
+def StoreParamV2I16 : StoreParamVecInst<V2I16Regs, ".v2.b16",
+ StoreParamScalar2I16>;
+def StoreParamV2I8 : StoreParamVecInst<V2I8Regs, ".v2.b8",
+ StoreParamScalar2I8>;
+
+def StoreParamV4F32 : StoreParamVecInst<V4F32Regs, ".v4.f32",
+ StoreParamScalar4F32>;
+def StoreParamV2F32 : StoreParamVecInst<V2F32Regs, ".v2.f32",
+ StoreParamScalar2F32>;
+def StoreParamV2F64 : StoreParamVecInst<V2F64Regs, ".v2.f64",
+ StoreParamScalar2F64>;
+
+def StoreRetvalV4I32 : StoreRetvalVecInst<V4I32Regs, ".v4.b32",
+ StoreRetvalScalar4I32>;
+def StoreRetvalV4I16 : StoreRetvalVecInst<V4I16Regs, ".v4.b16",
+ StoreRetvalScalar4I16>;
+def StoreRetvalV4I8 : StoreRetvalVecInst<V4I8Regs, ".v4.b8",
+ StoreRetvalScalar4I8>;
+
+def StoreRetvalV2I64 : StoreRetvalVecInst<V2I64Regs, ".v2.b64",
+ StoreRetvalScalar2I64>;
+def StoreRetvalV2I32 : StoreRetvalVecInst<V2I32Regs, ".v2.b32",
+ StoreRetvalScalar2I32>;
+def StoreRetvalV2I16 : StoreRetvalVecInst<V2I16Regs, ".v2.b16",
+ StoreRetvalScalar2I16>;
+def StoreRetvalV2I8 : StoreRetvalVecInst<V2I8Regs, ".v2.b8",
+ StoreRetvalScalar2I8>;
+
+def StoreRetvalV4F32 : StoreRetvalVecInst<V4F32Regs, ".v4.f32",
+ StoreRetvalScalar4F32>;
+def StoreRetvalV2F32 : StoreRetvalVecInst<V2F32Regs, ".v2.f32",
+ StoreRetvalScalar2F32>;
+def StoreRetvalV2F64 : StoreRetvalVecInst<V2F64Regs, ".v2.f64",
+ StoreRetvalScalar2F64>;
+
+}
+
+
+// Int vector to int scalar bit convert
+// Each pattern extracts every element of the source vector and feeds them, in
+// element order, to the matching V<N>I<w>toI<W> packing instruction.
+// v4i8 -> i32
+def : Pat<(i32 (bitconvert V4I8Regs:$s)),
+ (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+ (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3))>;
+// v4i16 -> i64
+def : Pat<(i64 (bitconvert V4I16Regs:$s)),
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+ (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2),
+ (V4i16Extract V4I16Regs:$s,3))>;
+// v2i8 -> i16
+def : Pat<(i16 (bitconvert V2I8Regs:$s)),
+ (V2I8toI16 (V2i8Extract V2I8Regs:$s,0), (V2i8Extract V2I8Regs:$s,1))>;
+// v2i16 -> i32
+def : Pat<(i32 (bitconvert V2I16Regs:$s)),
+ (V2I16toI32 (V2i16Extract V2I16Regs:$s,0),
+ (V2i16Extract V2I16Regs:$s,1))>;
+// v2i32 -> i64
+def : Pat<(i64 (bitconvert V2I32Regs:$s)),
+ (V2I32toI64 (V2i32Extract V2I32Regs:$s,0),
+ (V2i32Extract V2I32Regs:$s,1))>;
+
+// Int scalar to int vector bit convert
+// These are vector-destination instructions (VecInstType=isVecDest) whose asm
+// string is the literal "Error!"; each names an I<W>toV<N>I<w> instruction as
+// its scalar op. Presumably they are always rewritten to that instruction
+// before printing - TODO confirm against the pass that consumes NVPTXVecInst.
+let VecInstType=isVecDest.Value in {
+// i32 -> v4i8
+def VecI32toV4I8 : NVPTXVecInst<(outs V4I8Regs:$d), (ins Int32Regs:$s),
+ "Error!",
+ [(set V4I8Regs:$d, (bitconvert Int32Regs:$s))],
+ I32toV4I8>;
+// i64 -> v4i16
+def VecI64toV4I16 : NVPTXVecInst<(outs V4I16Regs:$d), (ins Int64Regs:$s),
+ "Error!",
+ [(set V4I16Regs:$d, (bitconvert Int64Regs:$s))],
+ I64toV4I16>;
+// i16 -> v2i8
+def VecI16toV2I8 : NVPTXVecInst<(outs V2I8Regs:$d), (ins Int16Regs:$s),
+ "Error!",
+ [(set V2I8Regs:$d, (bitconvert Int16Regs:$s))],
+ I16toV2I8>;
+// i32 -> v2i16
+def VecI32toV2I16 : NVPTXVecInst<(outs V2I16Regs:$d), (ins Int32Regs:$s),
+ "Error!",
+ [(set V2I16Regs:$d, (bitconvert Int32Regs:$s))],
+ I32toV2I16>;
+// i64 -> v2i32
+def VecI64toV2I32 : NVPTXVecInst<(outs V2I32Regs:$d), (ins Int64Regs:$s),
+ "Error!",
+ [(set V2I32Regs:$d, (bitconvert Int64Regs:$s))],
+ I64toV2I32>;
+}
+
+// Int vector to int vector bit convert
+// The first four patterns go through a scalar: pack the source vector into
+// one integer (V<N>I<w>toI<W>), then split that integer into the destination
+// vector (VecI<W>toV<M>I<v>). The two 128-bit cases (v2i64 <-> v4i32) are
+// handled per 64-bit half and reassembled with Build_Vector*.
+// v4i8 -> v2i16
+def : Pat<(v2i16 (bitconvert V4I8Regs:$s)),
+ (VecI32toV2I16
+ (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+ (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> v2i32
+def : Pat<(v2i32 (bitconvert V4I16Regs:$s)),
+ (VecI64toV2I32
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> v4i8
+def : Pat<(v4i8 (bitconvert V2I16Regs:$s)),
+ (VecI32toV4I8
+ (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> v4i16
+def : Pat<(v4i16 (bitconvert V2I32Regs:$s)),
+ (VecI64toV4I16
+ (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+// v2i64 -> v4i32
+def : Pat<(v4i32 (bitconvert V2I64Regs:$s)),
+ (Build_Vector4_i32
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 0),
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 1),
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 0),
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 1))>;
+// v4i32 -> v2i64
+def : Pat<(v2i64 (bitconvert V4I32Regs:$s)),
+ (Build_Vector2_i64
+ (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), (V4i32Extract V4I32Regs:$s,1)),
+ (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), (V4i32Extract V4I32Regs:$s,3)))>;
+
+// Fp scalar to fp vector convert: defined as an instruction, in the same form as the integer scalar-to-vector defs.
+// f64 -> v2f32
+let VecInstType=isVecDest.Value in { // applies VecInstType=isVecDest.Value to the def in this scope
+def VecF64toV2F32 : NVPTXVecInst<(outs V2F32Regs:$d), (ins Float64Regs:$s),
+ "Error!", // asm string is the literal "Error!"; NOTE(review): presumably never printed as-is -- confirm
+ [(set V2F32Regs:$d, (bitconvert Float64Regs:$s))],
+ F64toV2F32>;
+}
+
+// Fp vector to fp scalar convert (selection pattern only)
+// v2f32 -> f64: both extracted elements are passed to V2F32toF64
+def : Pat<(f64 (bitconvert V2F32Regs:$s)),
+ (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1))>;
+
+// Fp scalar to int vector convert: the scalar first goes through BITCONVERT_*_F2I, then through the matching int scalar-to-vector instruction.
+// f32 -> v4i8
+def : Pat<(v4i8 (bitconvert Float32Regs:$s)),
+ (VecI32toV4I8 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f32 -> v2i16
+def : Pat<(v2i16 (bitconvert Float32Regs:$s)),
+ (VecI32toV2I16 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f64 -> v4i16
+def : Pat<(v4i16 (bitconvert Float64Regs:$s)),
+ (VecI64toV4I16 (BITCONVERT_64_F2I Float64Regs:$s))>;
+// f64 -> v2i32
+def : Pat<(v2i32 (bitconvert Float64Regs:$s)),
+ (VecI64toV2I32 (BITCONVERT_64_F2I Float64Regs:$s))>;
+
+// Int vector to fp scalar convert: the extracted elements are packed into an int scalar, which then goes through BITCONVERT_*_I2F.
+// v4i8 -> f32
+def : Pat<(f32 (bitconvert V4I8Regs:$s)),
+ (BITCONVERT_32_I2F
+ (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+ (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> f64
+def : Pat<(f64 (bitconvert V4I16Regs:$s)),
+ (BITCONVERT_64_I2F
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> f32
+def : Pat<(f32 (bitconvert V2I16Regs:$s)),
+ (BITCONVERT_32_I2F
+ (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> f64
+def : Pat<(f64 (bitconvert V2I32Regs:$s)),
+ (BITCONVERT_64_I2F
+ (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+
+// Int scalar to fp vector convert (selection pattern only)
+// i64 -> v2f32: BITCONVERT_64_I2F on the scalar, then VecF64toV2F32
+def : Pat<(v2f32 (bitconvert Int64Regs:$s)),
+ (VecF64toV2F32 (BITCONVERT_64_I2F Int64Regs:$s))>;
+
+// Fp vector to int scalar convert (selection pattern only)
+// v2f32 -> i64: both extracted elements go through V2F32toF64, then BITCONVERT_64_F2I
+def : Pat<(i64 (bitconvert V2F32Regs:$s)),
+ (BITCONVERT_64_F2I
+ (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1)))>;
+
+// Int vector to fp vector convert: results are assembled with Build_Vector*_f* (or VecF64toV2F32) from per-element BITCONVERT_*_I2F.
+// v2i64 -> v4f32: each i64 element is split with VecI64toV2I32, and each i32 half is converted separately
+def : Pat<(v4f32 (bitconvert V2I64Regs:$s)),
+ (Build_Vector4_f32
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 0)), 0)),
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 0)), 1)),
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 1)), 0)),
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 1)), 1)))>;
+// v2i64 -> v2f64: element-wise BITCONVERT_64_I2F
+def : Pat<(v2f64 (bitconvert V2I64Regs:$s)),
+ (Build_Vector2_f64
+ (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,0)),
+ (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,1)))>;
+// v2i32 -> v2f32: element-wise BITCONVERT_32_I2F
+def : Pat<(v2f32 (bitconvert V2I32Regs:$s)),
+ (Build_Vector2_f32
+ (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,0)),
+ (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,1)))>;
+// v4i32 -> v2f64: elements (0,1) and (2,3) are each combined with V2I32toI64 before BITCONVERT_64_I2F
+def : Pat<(v2f64 (bitconvert V4I32Regs:$s)),
+ (Build_Vector2_f64
+ (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,0),
+ (V4i32Extract V4I32Regs:$s,1))),
+ (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,2),
+ (V4i32Extract V4I32Regs:$s,3))))>;
+// v4i32 -> v4f32: element-wise BITCONVERT_32_I2F
+def : Pat<(v4f32 (bitconvert V4I32Regs:$s)),
+ (Build_Vector4_f32
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,0)),
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,1)),
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,2)),
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,3)))>;
+// v4i16 -> v2f32: the four elements go through V4I16toI64, BITCONVERT_64_I2F, then VecF64toV2F32
+def : Pat<(v2f32 (bitconvert V4I16Regs:$s)),
+ (VecF64toV2F32 (BITCONVERT_64_I2F
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+ (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2),
+ (V4i16Extract V4I16Regs:$s,3))))>;
+
+// Fp vector to int vector convert: results are assembled with Build_Vector*_i* (or VecI64toV4I16) from per-element BITCONVERT_*_F2I.
+// v2i64 <- v4f32: elements (0,1) and (2,3) are each combined with V2F32toF64 before BITCONVERT_64_F2I
+def : Pat<(v2i64 (bitconvert V4F32Regs:$s)),
+ (Build_Vector2_i64
+ (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,0),
+ (V4f32Extract V4F32Regs:$s,1))),
+ (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,2),
+ (V4f32Extract V4F32Regs:$s,3))))>;
+// v2i64 <- v2f64: element-wise BITCONVERT_64_F2I
+def : Pat<(v2i64 (bitconvert V2F64Regs:$s)),
+ (Build_Vector2_i64
+ (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,0)),
+ (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,1)))>;
+// v2i32 <- v2f32: element-wise BITCONVERT_32_F2I
+def : Pat<(v2i32 (bitconvert V2F32Regs:$s)),
+ (Build_Vector2_i32
+ (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,0)),
+ (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,1)))>;
+// v4i32 <- v2f64: each f64 element is split with VecF64toV2F32, and each f32 half is converted separately
+def : Pat<(v4i32 (bitconvert V2F64Regs:$s)),
+ (Build_Vector4_i32
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 0)), 0)),
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 0)), 1)),
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 1)), 0)),
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 1)), 1)))>;
+// v4i32 <- v4f32: element-wise BITCONVERT_32_F2I
+def : Pat<(v4i32 (bitconvert V4F32Regs:$s)),
+ (Build_Vector4_i32
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,0)),
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,1)),
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,2)),
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,3)))>;
+// v4i16 <- v2f32: both elements go through V2F32toF64, BITCONVERT_64_F2I, then VecI64toV4I16
+def : Pat<(v4i16 (bitconvert V2F32Regs:$s)),
+ (VecI64toV4I16 (BITCONVERT_64_F2I
+ (V2F32toF64 (V2f32Extract V2F32Regs:$s,0),
+ (V2f32Extract V2F32Regs:$s,1))))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
new file mode 100644
index 000000000000..9c71a2ee165b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -0,0 +1,152 @@
+//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass adds appropriate !range metadata for calls to NVVM
+// intrinsics that return a limited range of values.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvvm-intr-range"
+
+namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); } // forward declaration; called from the NVVMIntrRange constructor
+
+// Add !range metadata based on limits of given SM variant. Hidden option, default 20; read by the pass's default constructor.
+static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20),
+ cl::Hidden, cl::desc("SM variant"));
+
+namespace {
+/// Function pass that attaches !range metadata to calls of NVVM
+/// special-register intrinsics. The per-dimension block and grid limits it
+/// uses are fixed when the pass object is constructed.
+class NVVMIntrRange : public FunctionPass {
+  /// One limit per dimension.
+  struct {
+    unsigned x, y, z;
+  } MaxBlockSize, MaxGridSize;
+
+public:
+  static char ID;
+
+  /// Takes the SM version from the -nvvm-intr-range-sm option.
+  NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {}
+
+  /// \p SmVersion only affects the x grid limit: 0x7fffffff for 30 and above,
+  /// 0xffff below that. All other limits are the same for every version.
+  NVVMIntrRange(unsigned int SmVersion) : FunctionPass(ID) {
+    const bool HasLargeGridX = SmVersion >= 30;
+
+    MaxGridSize.x = HasLargeGridX ? 0x7fffffff : 0xffff;
+    MaxGridSize.y = 0xffff;
+    MaxGridSize.z = 0xffff;
+
+    MaxBlockSize.x = 1024;
+    MaxBlockSize.y = 1024;
+    MaxBlockSize.z = 64;
+
+    initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &) override;
+};
+}
+
+/// Creates a new NVVMIntrRange pass whose limits are chosen for \p SmVersion.
+/// The caller receives the newly allocated pass.
+FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) {
+  FunctionPass *RangePass = new NVVMIntrRange(SmVersion);
+  return RangePass;
+}
+
+char NVVMIntrRange::ID = 0; // handed to the FunctionPass base class by the NVVMIntrRange constructor
+INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range", // registers the pass under the name "nvvm-intr-range"
+ "Add !range metadata to NVVM intrinsics.", false, false)
+
+// Tags call instruction C with !range metadata for the half-open interval
+// [Low, High). Both bounds are emitted as i32 constants. Returns true when
+// metadata was attached, and false when C already carried !range metadata
+// (which is then left untouched).
+static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) {
+  const bool AlreadyTagged = C->getMetadata(LLVMContext::MD_range) != nullptr;
+  if (AlreadyTagged)
+    return false;
+
+  LLVMContext &Ctx = C->getParent()->getContext();
+  IntegerType *I32 = Type::getInt32Ty(Ctx);
+
+  Metadata *LowerBound = ConstantAsMetadata::get(ConstantInt::get(I32, Low));
+  Metadata *UpperBound = ConstantAsMetadata::get(ConstantInt::get(I32, High));
+  Metadata *Bounds[] = {LowerBound, UpperBound};
+
+  C->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, Bounds));
+  return true;
+}
+
+/// Walks every instruction of \p F and, for each direct call to one of the
+/// NVVM special-register intrinsics handled below, attaches a !range
+/// describing the values that register can hold. Calls that already have
+/// !range metadata are left alone. Returns true if any metadata was added.
+bool NVVMIntrRange::runOnFunction(Function &F) {
+  bool Modified = false;
+
+  for (Instruction &Inst : instructions(F)) {
+    auto *CI = dyn_cast<CallInst>(&Inst);
+    if (!CI)
+      continue;
+
+    Function *Target = CI->getCalledFunction();
+    if (!Target)
+      continue;
+
+    // Pick the half-open interval [Low, High) for this intrinsic; anything
+    // that is not listed is skipped.
+    uint64_t Low, High;
+    switch (Target->getIntrinsicID()) {
+    // Index within block: [0, max block size)
+    case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+      Low = 0, High = MaxBlockSize.x;
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+      Low = 0, High = MaxBlockSize.y;
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+      Low = 0, High = MaxBlockSize.z;
+      break;
+
+    // Block size: [1, max block size]
+    case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
+      Low = 1, High = MaxBlockSize.x + 1;
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
+      Low = 1, High = MaxBlockSize.y + 1;
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
+      Low = 1, High = MaxBlockSize.z + 1;
+      break;
+
+    // Index within grid: [0, max grid size)
+    case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
+      Low = 0, High = MaxGridSize.x;
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
+      Low = 0, High = MaxGridSize.y;
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
+      Low = 0, High = MaxGridSize.z;
+      break;
+
+    // Grid size: [1, max grid size]
+    case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
+      Low = 1, High = MaxGridSize.x + 1;
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
+      Low = 1, High = MaxGridSize.y + 1;
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
+      Low = 1, High = MaxGridSize.z + 1;
+      break;
+
+    // warp size is constant 32.
+    case Intrinsic::nvvm_read_ptx_sreg_warpsize:
+      Low = 32, High = 32 + 1;
+      break;
+
+    // Lane ID is [0..warpsize)
+    case Intrinsic::nvvm_read_ptx_sreg_laneid:
+      Low = 0, High = 32;
+      break;
+
+    default:
+      continue; // not a handled intrinsic: move on to the next instruction
+    }
+
+    Modified |= addRangeMetadata(Low, High, CI);
+  }
+
+  return Modified;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
new file mode 100644
index 000000000000..c639c4dc0683
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -0,0 +1,219 @@
+//===- NVVMReflect.cpp - NVVM Emulate conditional compilation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
+// with an integer.
+//
+// We choose the value we use by looking, in this order, at:
+//
+// * the -nvvm-reflect-list flag, which has the format "foo=1,bar=42",
+// * the StringMap passed to the pass's constructor, and
+// * metadata in the module itself.
+//
+// If we see an unknown string, we replace its call with 0.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <sstream>
+#include <string>
+#define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-reflect"
+
+namespace llvm { void initializeNVVMReflectPass(PassRegistry &); } // forward declaration; called from the NVVMReflect constructor
+
+namespace {
+/// Function pass that replaces __nvvm_reflect / llvm.nvvm.reflect calls with
+/// integer constants looked up by name in VarMap.
+class NVVMReflect : public FunctionPass {
+  /// Name -> value table consulted by runOnFunction.
+  StringMap<int> VarMap;
+
+  /// Merges the -nvvm-reflect-list command-line option into VarMap.
+  void setVarMap();
+
+public:
+  static char ID;
+
+  /// Starts from an empty mapping (command-line entries are still applied).
+  NVVMReflect() : NVVMReflect(StringMap<int>()) {}
+
+  /// Starts from a copy of \p Mapping, registers the pass, and then applies
+  /// the command-line entries on top of the copy.
+  NVVMReflect(const StringMap<int> &Mapping)
+      : FunctionPass(ID), VarMap(Mapping) {
+    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+    setVarMap();
+  }
+
+  bool runOnFunction(Function &) override;
+};
+}
+
+/// Creates an NVVMReflect pass with no caller-supplied name/value mapping.
+FunctionPass *llvm::createNVVMReflectPass() {
+  return new NVVMReflect();
+}
+
+/// Creates an NVVMReflect pass seeded with \p Mapping (copied by the pass).
+FunctionPass *llvm::createNVVMReflectPass(const StringMap<int> &Mapping) {
+  FunctionPass *ReflectPass = new NVVMReflect(Mapping);
+  return ReflectPass;
+}
+
+static cl::opt<bool> // hidden on/off switch; when false, runOnFunction returns without touching the function
+NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
+ cl::desc("NVVM reflection, enabled by default"));
+
+char NVVMReflect::ID = 0; // handed to the FunctionPass base class by the NVVMReflect constructor
+INITIALIZE_PASS(NVVMReflect, "nvvm-reflect", // registers the pass under the name "nvvm-reflect"
+ "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
+ false)
+
+static cl::list<std::string> // one element per occurrence of the option; parsed by NVVMReflect::setVarMap
+ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
+ cl::desc("A list of string=num assignments"),
+ cl::ValueRequired);
+
+/// Populates VarMap from the -nvvm-reflect-list command-line option.
+///
+/// The command line can look as follows :
+///   -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0
+/// The strings "a=1,b=2" and "c=3,d=0" are then available in the ReflectList
+/// vector. First, each ReflectList[i] is split using "," as the delimiter.
+/// Then each resulting part is split using "=" as the delimiter, giving a
+/// name and an integer value. An entry parsed here replaces an entry of the
+/// same name that was passed to the constructor.
+///
+/// The option text comes from the user, so malformed input is reported with
+/// report_fatal_error instead of assert: with asserts compiled out, a part
+/// without exactly one "=" would otherwise index past the split result, and
+/// a value that is not an integer would be stored without any diagnostic.
+///
+/// NOTE(review): the previous comment also listed "-R e=2" as a spelling; no
+/// such alias is declared in this file.
+void NVVMReflect::setVarMap() {
+  for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) {
+    DEBUG(dbgs() << "Option : " << ReflectList[i] << "\n");
+    SmallVector<StringRef, 4> NameValList;
+    StringRef(ReflectList[i]).split(NameValList, ',');
+    for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) {
+      SmallVector<StringRef, 2> NameValPair;
+      NameValList[j].split(NameValPair, '=');
+      if (NameValPair.size() != 2) {
+        std::string Msg = "nvvm-reflect-list: name=val expected in '";
+        Msg += NameValList[j].str();
+        Msg += "'";
+        report_fatal_error(Msg);
+      }
+      // Same parse as before (leading integer via stream extraction), but a
+      // failed extraction is now an error in every build mode.
+      std::stringstream ValStream(NameValPair[1].str());
+      int Val;
+      ValStream >> Val;
+      if (ValStream.fail()) {
+        std::string Msg = "nvvm-reflect-list: integer value expected in '";
+        Msg += NameValList[j].str();
+        Msg += "'";
+        report_fatal_error(Msg);
+      }
+      VarMap[NameValPair[0]] = Val;
+    }
+  }
+}
+
+/// Replaces every call to __nvvm_reflect or llvm.nvvm.reflect in \p F with an
+/// integer constant and erases the call. Returns true if any call was removed.
+///
+/// The constant is VarMap's entry for the queried string when there is one.
+/// Otherwise "__CUDA_FTZ" is taken from the "nvvm-reflect-ftz" module flag if
+/// that flag is present, and every other string yields 0. Nothing is done
+/// when -nvvm-reflect-enable is off, or when \p F is the __nvvm_reflect
+/// function itself.
+bool NVVMReflect::runOnFunction(Function &F) {
+  if (!NVVMReflectEnabled)
+    return false;
+
+  const bool IsReflectItself = F.getName() == NVVM_REFLECT_FUNCTION;
+  if (IsReflectItself) {
+    assert(F.isDeclaration() && "_reflect function should not have a body");
+    assert(F.getReturnType()->isIntegerTy() &&
+           "_reflect's return type should be integer");
+    return false;
+  }
+
+  // Replaced calls are collected here and erased once the walk is finished.
+  SmallVector<Instruction *, 4> DeadCalls;
+
+  // Every reflect call is expected to be a CallInst whose argument leads to a
+  // constant C string. That shape is validated first; the string is then
+  // looked up in VarMap, and the call's uses are replaced by the value found
+  // (or by 0 if the string is unknown).
+  //
+  // The IR for __nvvm_reflect calls differs between CUDA versions.
+  //
+  // CUDA 6.5 and earlier:
+  //    %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8
+  //      (i8 addrspace(4)* getelementptr inbounds
+  //        ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
+  //    %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
+  // Sym->getOperand(0) is then a Constant with a ConstantDataSequential
+  // operand, which converts directly to the lookup string.
+  //
+  // CUDA 7.0:
+  //    %reflect = call i32 @__nvvm_reflect(i8* addrspacecast
+  //      (i8 addrspace(1)* getelementptr inbounds
+  //        ([8 x i8], [8 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
+  // The Constant then has a GlobalVariable operand, and the string has to be
+  // fetched from that variable's initializer.
+  for (Instruction &Inst : instructions(F)) {
+    auto *ReflectCall = dyn_cast<CallInst>(&Inst);
+    if (!ReflectCall)
+      continue;
+
+    Function *Target = ReflectCall->getCalledFunction();
+    if (!Target)
+      continue;
+    const bool IsReflect =
+        Target->getName() == NVVM_REFLECT_FUNCTION ||
+        Target->getIntrinsicID() == Intrinsic::nvvm_reflect;
+    if (!IsReflect)
+      continue;
+
+    // FIXME: Improve error handling here and elsewhere in this pass.
+    assert(ReflectCall->getNumOperands() == 2 &&
+           "Wrong number of operands to __nvvm_reflect function");
+
+    // CUDA 6.5 and earlier wrap the string in a constant-to-generic
+    // conversion call; look through it.
+    const Value *Str = ReflectCall->getArgOperand(0);
+    if (const auto *Conversion = dyn_cast<CallInst>(Str)) {
+      // FIXME: Add assertions about ConvCall.
+      Str = Conversion->getArgOperand(0);
+    }
+    assert(isa<ConstantExpr>(Str) &&
+           "Format of __nvvm__reflect function not recognized");
+    const auto *GEP = cast<ConstantExpr>(Str);
+
+    const Value *Sym = GEP->getOperand(0);
+    assert(isa<Constant>(Sym) &&
+           "Format of __nvvm_reflect function not recognized");
+
+    const Value *StrData = cast<Constant>(Sym)->getOperand(0);
+    if (const auto *GV = dyn_cast<GlobalVariable>(StrData)) {
+      // CUDA 7.0 style: the string lives in the global's initializer.
+      assert(GV->hasInitializer() &&
+             "Format of _reflect function not recognized");
+      StrData = GV->getInitializer();
+    }
+
+    assert(isa<ConstantDataSequential>(StrData) &&
+           "Format of _reflect function not recognized");
+    assert(cast<ConstantDataSequential>(StrData)->isCString() &&
+           "Format of _reflect function not recognized");
+
+    // Drop the last character of the string data before the lookup.
+    StringRef Query = cast<ConstantDataSequential>(StrData)->getAsString();
+    Query = Query.substr(0, Query.size() - 1);
+    DEBUG(dbgs() << "Arg of _reflect : " << Query << "\n");
+
+    int Replacement = 0; // The default value is 0
+    auto Known = VarMap.find(Query);
+    if (Known != VarMap.end()) {
+      Replacement = Known->second;
+    } else if (Query == "__CUDA_FTZ") {
+      // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.
+      if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
+              F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
+        Replacement = Flag->getSExtValue();
+    }
+
+    ReflectCall->replaceAllUsesWith(
+        ConstantInt::get(ReflectCall->getType(), Replacement));
+    DeadCalls.push_back(ReflectCall);
+  }
+
+  for (Instruction *Dead : DeadCalls)
+    Dead->eraseFromParent();
+
+  return !DeadCalls.empty();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
new file mode 100644
index 000000000000..d44876abf729
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -0,0 +1,29 @@
+//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+/// Returns the single Target object used for the 32-bit NVPTX target. The
+/// object is a function-local static, so every call yields the same instance.
+Target &llvm::getTheNVPTXTarget32() {
+  static Target Instance32;
+  return Instance32;
+}
+/// Returns the single Target object used for the 64-bit NVPTX target. The
+/// object is a function-local static, so every call yields the same instance.
+Target &llvm::getTheNVPTXTarget64() {
+  static Target Instance64;
+  return Instance64;
+}
+
+/// C-linkage entry point that registers both NVPTX targets: "nvptx" for
+/// Triple::nvptx and "nvptx64" for Triple::nvptx64.
+extern "C" void LLVMInitializeNVPTXTargetInfo() {
+  RegisterTarget<Triple::nvptx> Register32(getTheNVPTXTarget32(), "nvptx",
+                                           "NVIDIA PTX 32-bit");
+  RegisterTarget<Triple::nvptx64> Register64(getTheNVPTXTarget64(), "nvptx64",
+                                             "NVIDIA PTX 64-bit");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
new file mode 100644
index 000000000000..02c5a94c3d03
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
@@ -0,0 +1,122 @@
+#ifndef CL_COMMON_DEFINES_H
+#define CL_COMMON_DEFINES_H
+// This file includes defines that are common to both kernel code and
+// the NVPTX back-end.
+
+//
+// Common defines for Image intrinsics
+// Channel order. CLK_xRGB, when enabled, has the same value (0x10B7) as CLK_ARGB.
+enum {
+ CLK_R = 0x10B0,
+ CLK_A = 0x10B1,
+ CLK_RG = 0x10B2,
+ CLK_RA = 0x10B3,
+ CLK_RGB = 0x10B4,
+ CLK_RGBA = 0x10B5,
+ CLK_BGRA = 0x10B6,
+ CLK_ARGB = 0x10B7,
+
+#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0) // only for version 1.0
+ CLK_xRGB = 0x10B7,
+#endif
+
+ CLK_INTENSITY = 0x10B8,
+ CLK_LUMINANCE = 0x10B9
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) // version 1.1 and later append three more orders
+ , // continues the enumerator list after CLK_LUMINANCE
+ CLK_Rx = 0x10BA,
+ CLK_RGx = 0x10BB,
+ CLK_RGBx = 0x10BC
+#endif
+};
+
+typedef enum clk_channel_type {
+ // valid formats for float return types
+ CLK_SNORM_INT8 = 0x10D0, // four channel RGBA snorm8
+ CLK_SNORM_INT16 = 0x10D1, // four channel RGBA snorm16
+ CLK_UNORM_INT8 = 0x10D2, // four channel RGBA unorm8
+ CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16
+ CLK_HALF_FLOAT = 0x10DD, // four channel RGBA half
+ CLK_FLOAT = 0x10DE, // four channel RGBA float
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) // packed formats, only for version 1.1 and later
+ CLK_UNORM_SHORT_565 = 0x10D4,
+ CLK_UNORM_SHORT_555 = 0x10D5,
+ CLK_UNORM_INT_101010 = 0x10D6,
+#endif
+
+ // valid only for integer return types
+ CLK_SIGNED_INT8 = 0x10D7,
+ CLK_SIGNED_INT16 = 0x10D8,
+ CLK_SIGNED_INT32 = 0x10D9,
+ CLK_UNSIGNED_INT8 = 0x10DA,
+ CLK_UNSIGNED_INT16 = 0x10DB,
+ CLK_UNSIGNED_INT32 = 0x10DC,
+
+ // CI SPI for CPU (no explicit values: these continue counting from CLK_UNSIGNED_INT32)
+ __CLK_UNORM_INT8888, // four channel ARGB unorm8
+ __CLK_UNORM_INT8888R, // four channel BGRA unorm8
+
+ __CLK_VALID_IMAGE_TYPE_COUNT, // implicit value: one more than __CLK_UNORM_INT8888R
+ __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT,
+ __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to
+ // represent any image type
+ __CLK_VALID_IMAGE_TYPE_MASK = (1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS) - 1 // low 4 bits set (0xF)
+} clk_channel_type;
+
+typedef enum clk_sampler_type { // bit-field layout: each group has a *_BASE shift, its values, a *_MASK and a *_BITS width
+ __CLK_ADDRESS_BASE = 0,
+ CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE,
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) // version 1.1 and later: alias for CLK_ADDRESS_MIRROR
+ CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
+#endif
+ __CLK_ADDRESS_MASK =
+ CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP_TO_EDGE |
+ CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+ __CLK_ADDRESS_BITS = 3, // number of bits required to
+ // represent address info
+
+ __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS, // starts right after the address bits
+ CLK_NORMALIZED_COORDS_FALSE = 0,
+ CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE,
+ __CLK_NORMALIZED_MASK =
+ CLK_NORMALIZED_COORDS_FALSE | CLK_NORMALIZED_COORDS_TRUE,
+ __CLK_NORMALIZED_BITS = 1, // number of bits required to
+ // represent normalization
+
+ __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS,
+ CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE,
+ CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE,
+ CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE,
+ __CLK_FILTER_MASK =
+ CLK_FILTER_NEAREST | CLK_FILTER_LINEAR | CLK_FILTER_ANISOTROPIC,
+ __CLK_FILTER_BITS = 2, // number of bits required to
+ // represent filter info
+
+ __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
+ CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE,
+ CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE,
+ CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE,
+ __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR | CLK_MIP_ANISOTROPIC,
+ __CLK_MIP_BITS = 2, // number of bits required to represent mip info
+
+ __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS, // total width of the four groups above
+ __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
+ __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+
+ __CLK_ANISOTROPIC_RATIO_BITS = 5,
+ __CLK_ANISOTROPIC_RATIO_MASK =
+ (int) 0x80000000 >> (__CLK_ANISOTROPIC_RATIO_BITS - 1) // NOTE(review): presumably sets the top 5 bits; relies on >> of a negative int being an arithmetic shift -- confirm
+} clk_sampler_type;
+
+// Memory synchronization flags: two distinct single-bit values (bit 0 and bit 1)
+#define CLK_LOCAL_MEM_FENCE (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+#endif // CL_COMMON_DEFINES_H
diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
new file mode 100644
index 000000000000..52432a5820fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -0,0 +1,1966 @@
+//===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static const MCPhysReg RRegs[32] = { // entry N is PPC::RN
+ PPC::R0, PPC::R1, PPC::R2, PPC::R3,
+ PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+ PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+ PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static const MCPhysReg RRegsNoR0[32] = { // same as RRegs except that entry 0 is PPC::ZERO instead of PPC::R0
+ PPC::ZERO,
+ PPC::R1, PPC::R2, PPC::R3,
+ PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+ PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+ PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static const MCPhysReg XRegs[32] = { // entry N is PPC::XN
+ PPC::X0, PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static const MCPhysReg XRegsNoX0[32] = { // same as XRegs except that entry 0 is PPC::ZERO8 instead of PPC::X0
+ PPC::ZERO8,
+ PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static const MCPhysReg FRegs[32] = { // entry N is PPC::FN
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31
+};
+static const MCPhysReg VFRegs[32] = { // entry N is PPC::VFN
+ PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+ PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+ PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+ PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+ PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+ PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+ PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+ PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+static const MCPhysReg VRegs[32] = { // entry N is PPC::VN
+ PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+ PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+ PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+ PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+static const MCPhysReg VSRegs[64] = { // entries 0-31 are PPC::VSL0-VSL31, entries 32-63 are PPC::V0-V31
+ PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3,
+ PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7,
+ PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11,
+ PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15,
+ PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19,
+ PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23,
+ PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
+ PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,
+
+ PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+ PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+ PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+ PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+static const MCPhysReg VSFRegs[64] = { // entries 0-31 are PPC::F0-F31, entries 32-63 are PPC::VF0-VF31
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+ PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+ PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+ PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+ PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+ PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+ PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+ PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+ PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+static const MCPhysReg VSSRegs[64] = { // same contents as VSFRegs: PPC::F0-F31 followed by PPC::VF0-VF31
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+ PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+ PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+ PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+ PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+ PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+ PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+ PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+ PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+// Entry N is PPC::QFN. Declared const and with element type MCPhysReg like
+// every other register table in this file; it was previously the only table
+// that was writable and typed as plain unsigned.
+static const MCPhysReg QFRegs[32] = {
+  PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3,
+  PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
+  PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11,
+  PPC::QF12, PPC::QF13, PPC::QF14, PPC::QF15,
+  PPC::QF16, PPC::QF17, PPC::QF18, PPC::QF19,
+  PPC::QF20, PPC::QF21, PPC::QF22, PPC::QF23,
+  PPC::QF24, PPC::QF25, PPC::QF26, PPC::QF27,
+  PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
+};
+static const MCPhysReg CRBITRegs[32] = { // four entries per CR0..CR7, in the order LT, GT, EQ, UN
+ PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
+ PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
+ PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+ PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+ PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+ PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN,
+ PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
+ PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
+};
+static const MCPhysReg CRRegs[8] = { // entry N is PPC::CRN
+ PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+ PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
+};
+
+// Folds an expression built from non-negative constants, the condition-bit
+// symbols lt/gt/eq/so/un, the condition-register-field symbols cr0..cr7, and
+// '+' or '*' over those, into a single number. Returns that number when it is
+// non-negative (0 is a valid result, e.g. for "lt" or "cr0"), and -1 for any
+// other expression or when a sub-result is negative.
+static int64_t
+EvaluateCRExpr(const MCExpr *E) {
+  switch (E->getKind()) {
+  // Target-specific and unary expressions are never accepted.
+  case MCExpr::Target:
+  case MCExpr::Unary:
+    return -1;
+
+  case MCExpr::Constant: {
+    const int64_t Value = cast<MCConstantExpr>(E)->getValue();
+    return Value >= 0 ? Value : -1;
+  }
+
+  case MCExpr::SymbolRef: {
+    StringRef Sym = cast<MCSymbolRefExpr>(E)->getSymbol().getName();
+    return StringSwitch<int64_t>(Sym)
+        // Bit position inside one CR field; "so" and "un" name the same bit.
+        .Case("lt", 0)
+        .Case("gt", 1)
+        .Case("eq", 2)
+        .Case("so", 3)
+        .Case("un", 3)
+        // CR field number.
+        .Case("cr0", 0)
+        .Case("cr1", 1)
+        .Case("cr2", 2)
+        .Case("cr3", 3)
+        .Case("cr4", 4)
+        .Case("cr5", 5)
+        .Case("cr6", 6)
+        .Case("cr7", 7)
+        .Default(-1);
+  }
+
+  case MCExpr::Binary: {
+    const auto *Bin = cast<MCBinaryExpr>(E);
+    const int64_t Left = EvaluateCRExpr(Bin->getLHS());
+    const int64_t Right = EvaluateCRExpr(Bin->getRHS());
+    if (Left < 0 || Right < 0)
+      return -1;
+
+    // Only addition and multiplication are supported.
+    int64_t Combined;
+    switch (Bin->getOpcode()) {
+    case MCBinaryExpr::Add:
+      Combined = Left + Right;
+      break;
+    case MCBinaryExpr::Mul:
+      Combined = Left * Right;
+      break;
+    default:
+      return -1;
+    }
+
+    return Combined >= 0 ? Combined : -1;
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+namespace {
+
+struct PPCOperand;
+
+/// PowerPC implementation of MCTargetAsmParser.  The constructor records
+/// whether the subtarget's triple is 64-bit and whether it is Mac OS X, and
+/// installs the available feature set derived from the subtarget.
+class PPCAsmParser : public MCTargetAsmParser {
+  const MCInstrInfo &MII;
+  bool IsPPC64;  // Triple architecture is ppc64 or ppc64le.
+  bool IsDarwin; // Triple reports isMacOSX().
+
+  /// Forward a warning to the generic parser.
+  void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); }
+
+  bool isPPC64() const { return IsPPC64; }
+  bool isDarwin() const { return IsDarwin; }
+
+  // --- Registers -----------------------------------------------------------
+  bool MatchRegisterName(unsigned &RegNo, int64_t &IntVal);
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  // --- Expressions and operands --------------------------------------------
+  const MCExpr *ExtractModifierFromExpr(const MCExpr *E,
+                                        PPCMCExpr::VariantKind &Variant);
+  const MCExpr *FixupVariantKind(const MCExpr *E);
+  bool ParseExpression(const MCExpr *&EVal);
+  bool ParseDarwinExpression(const MCExpr *&EVal);
+
+  bool ParseOperand(OperandVector &Operands);
+
+  // --- Directives ----------------------------------------------------------
+  bool ParseDirectiveWord(unsigned Size, AsmToken ID);
+  bool ParseDirectiveTC(unsigned Size, AsmToken ID);
+  bool ParseDirectiveMachine(SMLoc L);
+  bool ParseDarwinDirectiveMachine(SMLoc L);
+  bool ParseDirectiveAbiVersion(SMLoc L);
+  bool ParseDirectiveLocalEntry(SMLoc L);
+
+  // --- Matching and emission -----------------------------------------------
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  void ProcessInstruction(MCInst &Inst, const OperandVector &Ops);
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "PPCGenAsmMatcher.inc"
+
+  /// }
+
+
+public:
+  PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &,
+               const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, STI), MII(MII) {
+    const Triple &TT = STI.getTargetTriple();
+    const Triple::ArchType Arch = TT.getArch();
+
+    // 64-bit vs. 32-bit pointer mode: either endianness of ppc64 counts.
+    IsPPC64 = Arch == Triple::ppc64 || Arch == Triple::ppc64le;
+    IsDarwin = TT.isMacOSX();
+
+    // Tell the matcher which subtarget features are available.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  const MCExpr *applyModifierToExpr(const MCExpr *E,
+                                    MCSymbolRefExpr::VariantKind,
+                                    MCContext &Ctx) override;
+};
+
+/// PPCOperand - Instances of this class represent a single parsed operand of
+/// a PowerPC machine instruction (token, immediate, expression or TLS symbol).
+struct PPCOperand : public MCParsedAsmOperand {
+ enum KindTy { // Selects which member of the anonymous union below is active.
+ Token,
+ Immediate,
+ ContextImmediate, // Constant obtained by evaluating a PPCMCExpr.
+ Expression,
+ TLSRegister
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+ bool IsPPC64;
+
+ struct TokOp {
+ const char *Data; // Not NUL-terminated; see Length.
+ unsigned Length;
+ };
+
+ struct ImmOp {
+ int64_t Val;
+ };
+
+ struct ExprOp {
+ const MCExpr *Val;
+ int64_t CRVal; // Cached result of EvaluateCRExpr(Val)
+ };
+
+ struct TLSRegOp {
+ const MCSymbolRefExpr *Sym;
+ };
+
+ union {
+ struct TokOp Tok;
+ struct ImmOp Imm; // Used by both Immediate and ContextImmediate.
+ struct ExprOp Expr;
+ struct TLSRegOp TLSReg;
+ };
+
+ PPCOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+public:
+ PPCOperand(const PPCOperand &o) : MCParsedAsmOperand() {
+ Kind = o.Kind;
+ StartLoc = o.StartLoc;
+ EndLoc = o.EndLoc;
+ IsPPC64 = o.IsPPC64;
+ switch (Kind) { // Copy only the union member that Kind says is active.
+ case Token:
+ Tok = o.Tok; // Copies the pointer and length, not the characters.
+ break;
+ case Immediate:
+ case ContextImmediate:
+ Imm = o.Imm;
+ break;
+ case Expression:
+ Expr = o.Expr;
+ break;
+ case TLSRegister:
+ TLSReg = o.TLSReg;
+ break;
+ }
+ }
+
+ // Disable use of sized deallocation due to overallocation of PPCOperand
+ // objects in CreateTokenWithStringCopy.
+ void operator delete(void *p) { ::operator delete(p); }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ /// isPPC64 - True if this operand is for an instruction in 64-bit mode.
+ bool isPPC64() const { return IsPPC64; }
+
+ int64_t getImm() const { // Asserts for ContextImmediate; Immediate only.
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+ int64_t getImmS16Context() const {
+ assert((Kind == Immediate || Kind == ContextImmediate) &&
+ "Invalid access!");
+ if (Kind == Immediate)
+ return Imm.Val;
+ return static_cast<int16_t>(Imm.Val); // Keep low 16 bits, sign-extended.
+ }
+ int64_t getImmU16Context() const {
+ assert((Kind == Immediate || Kind == ContextImmediate) &&
+ "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getExpr() const {
+ assert(Kind == Expression && "Invalid access!");
+ return Expr.Val;
+ }
+
+ int64_t getExprCRVal() const {
+ assert(Kind == Expression && "Invalid access!");
+ return Expr.CRVal;
+ }
+
+ const MCExpr *getTLSReg() const {
+ assert(Kind == TLSRegister && "Invalid access!");
+ return TLSReg.Sym;
+ }
+
+ unsigned getReg() const override { // Returns a register number, 0-31.
+ assert(isRegNumber() && "Invalid access!");
+ return (unsigned) Imm.Val;
+ }
+
+ unsigned getVSReg() const { // Returns a register number, 0-63.
+ assert(isVSRegNumber() && "Invalid access!");
+ return (unsigned) Imm.Val;
+ }
+
+ unsigned getCCReg() const {
+ assert(isCCRegNumber() && "Invalid access!");
+ return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
+ }
+
+ unsigned getCRBit() const {
+ assert(isCRBitNumber() && "Invalid access!");
+ return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
+ }
+
+ unsigned getCRBitMask() const {
+ assert(isCRBitMask() && "Invalid access!");
+ return 7 - countTrailingZeros<uint64_t>(Imm.Val); // 0x80 -> 0, 0x01 -> 7.
+ }
+
+ bool isToken() const override { return Kind == Token; }
+ bool isImm() const override {
+ return Kind == Immediate || Kind == Expression;
+ }
+ bool isU1Imm() const { return Kind == Immediate && isUInt<1>(getImm()); }
+ bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); }
+ bool isU3Imm() const { return Kind == Immediate && isUInt<3>(getImm()); }
+ bool isU4Imm() const { return Kind == Immediate && isUInt<4>(getImm()); }
+ bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); }
+ bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); }
+ bool isU6Imm() const { return Kind == Immediate && isUInt<6>(getImm()); }
+ bool isU6ImmX2() const { return Kind == Immediate &&
+ isUInt<6>(getImm()) &&
+ (getImm() & 1) == 0; }
+ bool isU7Imm() const { return Kind == Immediate && isUInt<7>(getImm()); }
+ bool isU7ImmX4() const { return Kind == Immediate &&
+ isUInt<7>(getImm()) &&
+ (getImm() & 3) == 0; }
+ bool isU8Imm() const { return Kind == Immediate && isUInt<8>(getImm()); }
+ bool isU8ImmX8() const { return Kind == Immediate &&
+ isUInt<8>(getImm()) &&
+ (getImm() & 7) == 0; }
+
+ bool isU10Imm() const { return Kind == Immediate && isUInt<10>(getImm()); }
+ bool isU12Imm() const { return Kind == Immediate && isUInt<12>(getImm()); }
+ bool isU16Imm() const {
+ switch (Kind) {
+ case Expression: // Expressions are accepted without a range check here.
+ return true;
+ case Immediate:
+ case ContextImmediate:
+ return isUInt<16>(getImmU16Context());
+ default:
+ return false;
+ }
+ }
+ bool isS16Imm() const {
+ switch (Kind) {
+ case Expression:
+ return true;
+ case Immediate:
+ case ContextImmediate:
+ return isInt<16>(getImmS16Context());
+ default:
+ return false;
+ }
+ }
+ bool isS16ImmX4() const { return Kind == Expression ||
+ (Kind == Immediate && isInt<16>(getImm()) &&
+ (getImm() & 3) == 0); }
+ bool isS16ImmX16() const { return Kind == Expression ||
+ (Kind == Immediate && isInt<16>(getImm()) &&
+ (getImm() & 15) == 0); }
+ bool isS17Imm() const {
+ switch (Kind) {
+ case Expression:
+ return true;
+ case Immediate:
+ case ContextImmediate:
+ return isInt<17>(getImmS16Context());
+ default:
+ return false;
+ }
+ }
+ bool isTLSReg() const { return Kind == TLSRegister; }
+ bool isDirectBr() const {
+ if (Kind == Expression)
+ return true;
+ if (Kind != Immediate)
+ return false;
+ // Operand must be a multiple of 4 that fits in a signed 26-bit immediate.
+ if ((getImm() & 3) != 0)
+ return false;
+ if (isInt<26>(getImm()))
+ return true;
+ if (!IsPPC64) {
+ // In 32-bit mode, large 32-bit quantities wrap around.
+ if (isUInt<32>(getImm()) && isInt<26>(static_cast<int32_t>(getImm())))
+ return true;
+ }
+ return false;
+ }
+ bool isCondBr() const { return Kind == Expression ||
+ (Kind == Immediate && isInt<16>(getImm()) &&
+ (getImm() & 3) == 0); }
+ bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
+ bool isVSRegNumber() const {
+ return Kind == Immediate && isUInt<6>(getImm());
+ }
+ bool isCCRegNumber() const { return (Kind == Expression
+ && isUInt<3>(getExprCRVal())) ||
+ (Kind == Immediate
+ && isUInt<3>(getImm())); }
+ bool isCRBitNumber() const { return (Kind == Expression
+ && isUInt<5>(getExprCRVal())) ||
+ (Kind == Immediate
+ && isUInt<5>(getImm())); }
+ bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) &&
+ isPowerOf2_32(getImm()); }
+ bool isATBitsAsHint() const { return false; }
+ bool isMem() const override { return false; }
+ bool isReg() const override { return false; } // Registers are immediates.
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ llvm_unreachable("addRegOperands");
+ }
+
+ void addRegGPRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(RRegs[getReg()]));
+ }
+
+ void addRegGPRCNoR0Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(RRegsNoR0[getReg()]));
+ }
+
+ void addRegG8RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(XRegs[getReg()]));
+ }
+
+ void addRegG8RCNoX0Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(XRegsNoX0[getReg()]));
+ }
+
+ void addRegGxRCOperands(MCInst &Inst, unsigned N) const {
+ if (isPPC64()) // Pick the 64-bit or 32-bit register table by mode.
+ addRegG8RCOperands(Inst, N);
+ else
+ addRegGPRCOperands(Inst, N);
+ }
+
+ void addRegGxRCNoR0Operands(MCInst &Inst, unsigned N) const {
+ if (isPPC64())
+ addRegG8RCNoX0Operands(Inst, N);
+ else
+ addRegGPRCNoR0Operands(Inst, N);
+ }
+
+ void addRegF4RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(FRegs[getReg()]));
+ }
+
+ void addRegF8RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(FRegs[getReg()]));
+ }
+
+ void addRegVFRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(VFRegs[getReg()]));
+ }
+
+ void addRegVRRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(VRegs[getReg()]));
+ }
+
+ void addRegVSRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(VSRegs[getVSReg()]));
+ }
+
+ void addRegVSFRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(VSFRegs[getVSReg()]));
+ }
+
+ void addRegVSSRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(VSSRegs[getVSReg()]));
+ }
+
+ void addRegQFRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+ }
+
+ void addRegQSRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+ }
+
+ void addRegQBRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+ }
+
+ void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(CRBITRegs[getCRBit()]));
+ }
+
+ void addRegCRRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(CRRegs[getCCReg()]));
+ }
+
+ void addCRBitMaskOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(CRRegs[getCRBitMask()]));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (Kind == Immediate)
+ Inst.addOperand(MCOperand::createImm(getImm()));
+ else
+ Inst.addOperand(MCOperand::createExpr(getExpr()));
+ }
+
+ void addS16ImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ switch (Kind) {
+ case Immediate:
+ Inst.addOperand(MCOperand::createImm(getImm()));
+ break;
+ case ContextImmediate:
+ Inst.addOperand(MCOperand::createImm(getImmS16Context()));
+ break;
+ default:
+ Inst.addOperand(MCOperand::createExpr(getExpr()));
+ break;
+ }
+ }
+
+ void addU16ImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ switch (Kind) {
+ case Immediate:
+ Inst.addOperand(MCOperand::createImm(getImm()));
+ break;
+ case ContextImmediate:
+ Inst.addOperand(MCOperand::createImm(getImmU16Context()));
+ break;
+ default:
+ Inst.addOperand(MCOperand::createExpr(getExpr()));
+ break;
+ }
+ }
+
+ void addBranchTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (Kind == Immediate)
+ Inst.addOperand(MCOperand::createImm(getImm() / 4));
+ else
+ Inst.addOperand(MCOperand::createExpr(getExpr()));
+ }
+
+ void addTLSRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createExpr(getTLSReg()));
+ }
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ void print(raw_ostream &OS) const override;
+
+ static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S,
+ bool IsPPC64) {
+ auto Op = make_unique<PPCOperand>(Token);
+ Op->Tok.Data = Str.data(); // Borrowed: Str's storage must outlive Op.
+ Op->Tok.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ Op->IsPPC64 = IsPPC64;
+ return Op;
+ }
+
+ static std::unique_ptr<PPCOperand>
+ CreateTokenWithStringCopy(StringRef Str, SMLoc S, bool IsPPC64) {
+ // Allocate extra memory for the string and copy it.
+ // FIXME: This is incorrect, Operands are owned by unique_ptr with a default
+ // deleter which will destroy them by simply using "delete", not correctly
+ // calling operator delete on this extra memory after calling the dtor
+ // explicitly.
+ void *Mem = ::operator new(sizeof(PPCOperand) + Str.size());
+ std::unique_ptr<PPCOperand> Op(new (Mem) PPCOperand(Token));
+ Op->Tok.Data = reinterpret_cast<const char *>(Op.get() + 1); // Just past Op.
+ Op->Tok.Length = Str.size();
+ std::memcpy(const_cast<char *>(Op->Tok.Data), Str.data(), Str.size());
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ Op->IsPPC64 = IsPPC64;
+ return Op;
+ }
+
+ static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E,
+ bool IsPPC64) {
+ auto Op = make_unique<PPCOperand>(Immediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ Op->IsPPC64 = IsPPC64;
+ return Op;
+ }
+
+ static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S,
+ SMLoc E, bool IsPPC64) {
+ auto Op = make_unique<PPCOperand>(Expression);
+ Op->Expr.Val = Val;
+ Op->Expr.CRVal = EvaluateCRExpr(Val); // -1 if not a CR expression.
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ Op->IsPPC64 = IsPPC64;
+ return Op;
+ }
+
+ static std::unique_ptr<PPCOperand>
+ CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) {
+ auto Op = make_unique<PPCOperand>(TLSRegister);
+ Op->TLSReg.Sym = Sym;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ Op->IsPPC64 = IsPPC64;
+ return Op;
+ }
+
+ static std::unique_ptr<PPCOperand>
+ CreateContextImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) {
+ auto Op = make_unique<PPCOperand>(ContextImmediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ Op->IsPPC64 = IsPPC64;
+ return Op;
+ }
+
+ static std::unique_ptr<PPCOperand>
+ CreateFromMCExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) {
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Val))
+ return CreateImm(CE->getValue(), S, E, IsPPC64);
+
+ if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS)
+ return CreateTLSReg(SRE, S, E, IsPPC64);
+
+ if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
+ int64_t Res;
+ if (TE->evaluateAsConstant(Res))
+ return CreateContextImm(Res, S, E, IsPPC64);
+ }
+
+ return CreateExpr(Val, S, E, IsPPC64); // Anything else stays symbolic.
+ }
+};
+
+} // end anonymous namespace.
+
+/// Write a human-readable form of this operand to \p OS: the quoted text for
+/// a Token, the numeric value for Immediate and ContextImmediate, and the
+/// expression itself for Expression and TLSRegister.
+void PPCOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case Immediate:
+  case ContextImmediate:
+    // Read the payload directly.  getImm() asserts Kind == Immediate, so
+    // calling it here would fire the assertion for a ContextImmediate operand
+    // in builds with assertions enabled.
+    OS << Imm.Val;
+    break;
+  case Expression:
+    OS << *getExpr();
+    break;
+  case TLSRegister:
+    OS << *getTLSReg();
+    break;
+  }
+}
+
+/// Append the arithmetic negation of \p Op to \p Inst.  Immediates are negated
+/// directly.  For expressions, -(-x) is simplified to x and -(a - b) to
+/// (b - a); anything else is wrapped in a unary minus.
+static void
+addNegOperand(MCInst &Inst, MCOperand &Op, MCContext &Ctx) {
+  if (Op.isImm()) {
+    Inst.addOperand(MCOperand::createImm(-Op.getImm()));
+    return;
+  }
+
+  const MCExpr *Original = Op.getExpr();
+  const MCUnaryExpr *Unary = dyn_cast<MCUnaryExpr>(Original);
+  // An expression is never both unary and binary, so skip the second cast
+  // when the first one succeeded.
+  const MCBinaryExpr *Binary =
+      Unary ? nullptr : dyn_cast<MCBinaryExpr>(Original);
+
+  const MCExpr *Negated;
+  if (Unary && Unary->getOpcode() == MCUnaryExpr::Minus)
+    Negated = Unary->getSubExpr();
+  else if (Binary && Binary->getOpcode() == MCBinaryExpr::Sub)
+    Negated = MCBinaryExpr::createSub(Binary->getRHS(), Binary->getLHS(), Ctx);
+  else
+    Negated = MCUnaryExpr::createMinus(Original, Ctx);
+
+  Inst.addOperand(MCOperand::createExpr(Negated));
+}
+
+/// Post-process a successfully matched instruction.  Opcodes handled here
+/// are rewritten in place into another opcode with a rearranged or computed
+/// operand list; every other opcode leaves \p Inst untouched.
+/// \p Operands is not used.
+void PPCAsmParser::ProcessInstruction(MCInst &Inst,
+                                      const OperandVector &Operands) {
+  const int Opcode = Inst.getOpcode();
+
+  // Copy of operand Idx of the instruction as matched.
+  auto Opnd = [&Inst](unsigned Idx) { return Inst.getOperand(Idx); };
+  // Immediate value of operand Idx of the instruction as matched.
+  auto ImmAt = [&Inst](unsigned Idx) { return Inst.getOperand(Idx).getImm(); };
+  // Wrap a constant as an immediate operand.
+  auto MkImm = [](int64_t Value) { return MCOperand::createImm(Value); };
+  // Replace Inst with a new instruction NewOpc carrying exactly Ops.  The
+  // operands in Ops are copies, so they may come from the old Inst.
+  auto Rewrite = [&Inst](unsigned NewOpc,
+                         std::initializer_list<MCOperand> Ops) {
+    MCInst TmpInst;
+    TmpInst.setOpcode(NewOpc);
+    for (const MCOperand &Op : Ops)
+      TmpInst.addOperand(Op);
+    Inst = TmpInst;
+  };
+
+  switch (Opcode) {
+  case PPC::DCBTx:
+  case PPC::DCBTT:
+  case PPC::DCBTSTx:
+  case PPC::DCBTSTT: {
+    const bool IsTouch = Opcode == PPC::DCBTx || Opcode == PPC::DCBTT;
+    const bool IsPlain = Opcode == PPC::DCBTx || Opcode == PPC::DCBTSTx;
+    Rewrite(IsTouch ? PPC::DCBT : PPC::DCBTST,
+            {MkImm(IsPlain ? 0 : 16), Opnd(0), Opnd(1)});
+    break;
+  }
+  case PPC::DCBTCT:
+  case PPC::DCBTDS:
+    Rewrite(PPC::DCBT, {Opnd(2), Opnd(0), Opnd(1)});
+    break;
+  case PPC::DCBTSTCT:
+  case PPC::DCBTSTDS:
+    Rewrite(PPC::DCBTST, {Opnd(2), Opnd(0), Opnd(1)});
+    break;
+  case PPC::DCBFx:
+  case PPC::DCBFL:
+  case PPC::DCBFLP: {
+    const int L = Opcode == PPC::DCBFL ? 1 : (Opcode == PPC::DCBFLP ? 3 : 0);
+    Rewrite(PPC::DCBF, {MkImm(L), Opnd(0), Opnd(1)});
+    break;
+  }
+  case PPC::LAx:
+    Rewrite(PPC::LA, {Opnd(0), Opnd(2), Opnd(1)});
+    break;
+  case PPC::SUBI:
+  case PPC::SUBIS:
+  case PPC::SUBIC:
+  case PPC::SUBICo: {
+    // Subtract-immediate forms become the matching add-immediate opcode with
+    // the last operand negated.
+    unsigned AddOpc = PPC::ADDI;
+    if (Opcode == PPC::SUBIS)
+      AddOpc = PPC::ADDIS;
+    else if (Opcode == PPC::SUBIC)
+      AddOpc = PPC::ADDIC;
+    else if (Opcode == PPC::SUBICo)
+      AddOpc = PPC::ADDICo;
+
+    MCInst TmpInst;
+    TmpInst.setOpcode(AddOpc);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTLWI:
+  case PPC::EXTLWIo: {
+    const int64_t N = ImmAt(2);
+    const int64_t B = ImmAt(3);
+    Rewrite(Opcode == PPC::EXTLWI ? PPC::RLWINM : PPC::RLWINMo,
+            {Opnd(0), Opnd(1), MkImm(B), MkImm(0), MkImm(N - 1)});
+    break;
+  }
+  case PPC::EXTRWI:
+  case PPC::EXTRWIo: {
+    const int64_t N = ImmAt(2);
+    const int64_t B = ImmAt(3);
+    Rewrite(Opcode == PPC::EXTRWI ? PPC::RLWINM : PPC::RLWINMo,
+            {Opnd(0), Opnd(1), MkImm(B + N), MkImm(32 - N), MkImm(31)});
+    break;
+  }
+  case PPC::INSLWI:
+  case PPC::INSLWIo: {
+    const int64_t N = ImmAt(2);
+    const int64_t B = ImmAt(3);
+    // Operand 0 appears twice: once as the result, once as the tied input.
+    Rewrite(Opcode == PPC::INSLWI ? PPC::RLWIMI : PPC::RLWIMIo,
+            {Opnd(0), Opnd(0), Opnd(1), MkImm(32 - B), MkImm(B),
+             MkImm((B + N) - 1)});
+    break;
+  }
+  case PPC::INSRWI:
+  case PPC::INSRWIo: {
+    const int64_t N = ImmAt(2);
+    const int64_t B = ImmAt(3);
+    Rewrite(Opcode == PPC::INSRWI ? PPC::RLWIMI : PPC::RLWIMIo,
+            {Opnd(0), Opnd(0), Opnd(1), MkImm(32 - (B + N)), MkImm(B),
+             MkImm((B + N) - 1)});
+    break;
+  }
+  case PPC::ROTRWI:
+  case PPC::ROTRWIo: {
+    const int64_t N = ImmAt(2);
+    Rewrite(Opcode == PPC::ROTRWI ? PPC::RLWINM : PPC::RLWINMo,
+            {Opnd(0), Opnd(1), MkImm(32 - N), MkImm(0), MkImm(31)});
+    break;
+  }
+  case PPC::SLWI:
+  case PPC::SLWIo: {
+    const int64_t N = ImmAt(2);
+    Rewrite(Opcode == PPC::SLWI ? PPC::RLWINM : PPC::RLWINMo,
+            {Opnd(0), Opnd(1), MkImm(N), MkImm(0), MkImm(31 - N)});
+    break;
+  }
+  case PPC::SRWI:
+  case PPC::SRWIo: {
+    const int64_t N = ImmAt(2);
+    Rewrite(Opcode == PPC::SRWI ? PPC::RLWINM : PPC::RLWINMo,
+            {Opnd(0), Opnd(1), MkImm(32 - N), MkImm(N), MkImm(31)});
+    break;
+  }
+  case PPC::CLRRWI:
+  case PPC::CLRRWIo: {
+    const int64_t N = ImmAt(2);
+    Rewrite(Opcode == PPC::CLRRWI ? PPC::RLWINM : PPC::RLWINMo,
+            {Opnd(0), Opnd(1), MkImm(0), MkImm(0), MkImm(31 - N)});
+    break;
+  }
+  case PPC::CLRLSLWI:
+  case PPC::CLRLSLWIo: {
+    const int64_t B = ImmAt(2);
+    const int64_t N = ImmAt(3);
+    Rewrite(Opcode == PPC::CLRLSLWI ? PPC::RLWINM : PPC::RLWINMo,
+            {Opnd(0), Opnd(1), MkImm(N), MkImm(B - N), MkImm(31 - N)});
+    break;
+  }
+  case PPC::EXTLDI:
+  case PPC::EXTLDIo: {
+    const int64_t N = ImmAt(2);
+    const int64_t B = ImmAt(3);
+    Rewrite(Opcode == PPC::EXTLDI ? PPC::RLDICR : PPC::RLDICRo,
+            {Opnd(0), Opnd(1), MkImm(B), MkImm(N - 1)});
+    break;
+  }
+  case PPC::EXTRDI:
+  case PPC::EXTRDIo: {
+    const int64_t N = ImmAt(2);
+    const int64_t B = ImmAt(3);
+    Rewrite(Opcode == PPC::EXTRDI ? PPC::RLDICL : PPC::RLDICLo,
+            {Opnd(0), Opnd(1), MkImm(B + N), MkImm(64 - N)});
+    break;
+  }
+  case PPC::INSRDI:
+  case PPC::INSRDIo: {
+    const int64_t N = ImmAt(2);
+    const int64_t B = ImmAt(3);
+    Rewrite(Opcode == PPC::INSRDI ? PPC::RLDIMI : PPC::RLDIMIo,
+            {Opnd(0), Opnd(0), Opnd(1), MkImm(64 - (B + N)), MkImm(B)});
+    break;
+  }
+  case PPC::ROTRDI:
+  case PPC::ROTRDIo: {
+    const int64_t N = ImmAt(2);
+    Rewrite(Opcode == PPC::ROTRDI ? PPC::RLDICL : PPC::RLDICLo,
+            {Opnd(0), Opnd(1), MkImm(64 - N), MkImm(0)});
+    break;
+  }
+  case PPC::SLDI:
+  case PPC::SLDIo: {
+    const int64_t N = ImmAt(2);
+    Rewrite(Opcode == PPC::SLDI ? PPC::RLDICR : PPC::RLDICRo,
+            {Opnd(0), Opnd(1), MkImm(N), MkImm(63 - N)});
+    break;
+  }
+  case PPC::SRDI:
+  case PPC::SRDIo: {
+    const int64_t N = ImmAt(2);
+    Rewrite(Opcode == PPC::SRDI ? PPC::RLDICL : PPC::RLDICLo,
+            {Opnd(0), Opnd(1), MkImm(64 - N), MkImm(N)});
+    break;
+  }
+  case PPC::CLRRDI:
+  case PPC::CLRRDIo: {
+    const int64_t N = ImmAt(2);
+    Rewrite(Opcode == PPC::CLRRDI ? PPC::RLDICR : PPC::RLDICRo,
+            {Opnd(0), Opnd(1), MkImm(0), MkImm(63 - N)});
+    break;
+  }
+  case PPC::CLRLSLDI:
+  case PPC::CLRLSLDIo: {
+    const int64_t B = ImmAt(2);
+    const int64_t N = ImmAt(3);
+    Rewrite(Opcode == PPC::CLRLSLDI ? PPC::RLDIC : PPC::RLDICo,
+            {Opnd(0), Opnd(1), MkImm(N), MkImm(B - N)});
+    break;
+  }
+  case PPC::RLWINMbm:
+  case PPC::RLWINMobm: {
+    // Split the bitmask operand into begin/end positions; if it is not a
+    // single run of ones the instruction is left as matched.
+    unsigned MB, ME;
+    const int64_t BM = ImmAt(3);
+    if (!isRunOfOnes(BM, MB, ME))
+      break;
+    Rewrite(Opcode == PPC::RLWINMbm ? PPC::RLWINM : PPC::RLWINMo,
+            {Opnd(0), Opnd(1), Opnd(2), MkImm(MB), MkImm(ME)});
+    break;
+  }
+  case PPC::RLWIMIbm:
+  case PPC::RLWIMIobm: {
+    unsigned MB, ME;
+    const int64_t BM = ImmAt(3);
+    if (!isRunOfOnes(BM, MB, ME))
+      break;
+    // The second Opnd(0) is the tied operand.
+    Rewrite(Opcode == PPC::RLWIMIbm ? PPC::RLWIMI : PPC::RLWIMIo,
+            {Opnd(0), Opnd(0), Opnd(1), Opnd(2), MkImm(MB), MkImm(ME)});
+    break;
+  }
+  case PPC::RLWNMbm:
+  case PPC::RLWNMobm: {
+    unsigned MB, ME;
+    const int64_t BM = ImmAt(3);
+    if (!isRunOfOnes(BM, MB, ME))
+      break;
+    Rewrite(Opcode == PPC::RLWNMbm ? PPC::RLWNM : PPC::RLWNMo,
+            {Opnd(0), Opnd(1), Opnd(2), MkImm(MB), MkImm(ME)});
+    break;
+  }
+  case PPC::MFTB: {
+    // Only the opcode changes; the operands are kept as they are.
+    if (getSTI().getFeatureBits()[PPC::FeatureMFTB]) {
+      assert(Inst.getNumOperands() == 2 && "Expecting two operands");
+      Inst.setOpcode(PPC::MFSPR);
+    }
+    break;
+  }
+  case PPC::CP_COPYx:
+  case PPC::CP_COPY_FIRST:
+    Rewrite(PPC::CP_COPY,
+            {Opnd(0), Opnd(1), MkImm(Opcode == PPC::CP_COPYx ? 0 : 1)});
+    break;
+  case PPC::CP_PASTEx:
+  case PPC::CP_PASTE_LAST:
+    Rewrite(Opcode == PPC::CP_PASTEx ? PPC::CP_PASTE : PPC::CP_PASTEo,
+            {Opnd(0), Opnd(1), MkImm(Opcode == PPC::CP_PASTEx ? 0 : 1)});
+    break;
+  }
+}
+
+/// Match the parsed operands against the instruction tables.  On success the
+/// instruction is post-processed, stamped with \p IDLoc, emitted to \p Out,
+/// and false is returned.  On any match failure a diagnostic is reported
+/// through Error() and its result is returned.
+bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  MCInst Inst;
+  const unsigned Result =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+  if (Result == Match_Success) {
+    // Post-process instructions (typically extended mnemonics)
+    ProcessInstruction(Inst, Operands);
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, getSTI());
+    return false;
+  }
+
+  if (Result == Match_MissingFeature)
+    return Error(IDLoc, "instruction use requires an option to be enabled");
+
+  if (Result == Match_MnemonicFail)
+    return Error(IDLoc, "unrecognized instruction mnemonic");
+
+  if (Result == Match_InvalidOperand) {
+    // ErrorInfo of ~0ULL means no operand index is available; otherwise it
+    // is the index of the offending operand.
+    if (ErrorInfo == ~0ULL)
+      return Error(IDLoc, "invalid operand for instruction");
+    if (ErrorInfo >= Operands.size())
+      return Error(IDLoc, "too few operands for instruction");
+
+    SMLoc Where = static_cast<PPCOperand &>(*Operands[ErrorInfo]).getStartLoc();
+    if (Where == SMLoc())
+      Where = IDLoc; // Operand has no location; point at the mnemonic.
+    return Error(Where, "invalid operand for instruction");
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+/// Try to interpret the current token as a register name.
+/// On a match, \p RegNo and \p IntVal are set, the token is consumed and
+/// false is returned. If the current token is not an identifier, or names no
+/// register recognised here, true is returned and no token is consumed.
+bool PPCAsmParser::MatchRegisterName(unsigned &RegNo, int64_t &IntVal) {
+  if (!getParser().getTok().is(AsmToken::Identifier))
+    return true;
+
+  StringRef Name = getParser().getTok().getString();
+
+  // True when Name is Prefix (case-insensitively) followed by a decimal
+  // number smaller than Limit. A successfully parsed number is stored in
+  // IntVal even when it is not below Limit.
+  auto HasIndexBelow = [&](StringRef Prefix, int64_t Limit) {
+    return Name.startswith_lower(Prefix) &&
+           !Name.substr(Prefix.size()).getAsInteger(10, IntVal) &&
+           IntVal < Limit;
+  };
+
+  // The order of the tests is significant ("vs" is tried before "v").
+  if (Name.equals_lower("lr")) {
+    RegNo = isPPC64() ? PPC::LR8 : PPC::LR;
+    IntVal = 8;
+  } else if (Name.equals_lower("ctr")) {
+    RegNo = isPPC64() ? PPC::CTR8 : PPC::CTR;
+    IntVal = 9;
+  } else if (Name.equals_lower("vrsave")) {
+    RegNo = PPC::VRSAVE;
+    IntVal = 256;
+  } else if (HasIndexBelow("r", 32)) {
+    RegNo = isPPC64() ? XRegs[IntVal] : RRegs[IntVal];
+  } else if (HasIndexBelow("f", 32)) {
+    RegNo = FRegs[IntVal];
+  } else if (HasIndexBelow("vs", 64)) {
+    RegNo = VSRegs[IntVal];
+  } else if (HasIndexBelow("v", 32)) {
+    RegNo = VRegs[IntVal];
+  } else if (HasIndexBelow("q", 32)) {
+    RegNo = QFRegs[IntVal];
+  } else if (HasIndexBelow("cr", 8)) {
+    RegNo = CRRegs[IntVal];
+  } else {
+    return true;
+  }
+
+  getParser().Lex();
+  return false;
+}
+
+/// Parse a register name at the current token.
+/// \p StartLoc and \p EndLoc are set from the current token and \p RegNo is
+/// reset to 0 before matching. Returns false on a match; otherwise reports
+/// "invalid register name" via TokError() and returns its result.
+bool PPCAsmParser::
+ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  const AsmToken &Current = getParser().getTok();
+  StartLoc = Current.getLoc();
+  EndLoc = Current.getEndLoc();
+  RegNo = 0;
+
+  // The numeric value produced by MatchRegisterName is not needed here.
+  int64_t IgnoredValue;
+  if (!MatchRegisterName(RegNo, IgnoredValue))
+    return false;
+  return TokError("invalid register name");
+}
+
+/// Strip \code @l/@ha \endcode style modifiers out of an expression.
+/// The expression is scanned recursively for symbol references carrying one
+/// of the VK_PPC_LO/HI/HA/HIGHER/HIGHERA/HIGHEST/HIGHESTA variants. When all
+/// symbols that carry such a modifier agree on it, \p Variant receives the
+/// matching PPCMCExpr::VariantKind and the returned expression is a copy in
+/// which those symbols use the default variant. Otherwise nullptr is
+/// returned.
+const MCExpr *PPCAsmParser::
+ExtractModifierFromExpr(const MCExpr *E,
+                        PPCMCExpr::VariantKind &Variant) {
+  MCContext &Context = getParser().getContext();
+  Variant = PPCMCExpr::VK_PPC_None;
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return nullptr;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SymRef = cast<MCSymbolRefExpr>(E);
+
+    // Translate the symbol-level variant into its PPCMCExpr counterpart.
+    switch (SymRef->getKind()) {
+    case MCSymbolRefExpr::VK_PPC_LO:
+      Variant = PPCMCExpr::VK_PPC_LO;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HI:
+      Variant = PPCMCExpr::VK_PPC_HI;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HA:
+      Variant = PPCMCExpr::VK_PPC_HA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHER:
+      Variant = PPCMCExpr::VK_PPC_HIGHER;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHERA:
+      Variant = PPCMCExpr::VK_PPC_HIGHERA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHEST:
+      Variant = PPCMCExpr::VK_PPC_HIGHEST;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+      Variant = PPCMCExpr::VK_PPC_HIGHESTA;
+      break;
+    default:
+      return nullptr;
+    }
+
+    // Re-create the reference without the modifier.
+    return MCSymbolRefExpr::create(&SymRef->getSymbol(), Context);
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *Unary = cast<MCUnaryExpr>(E);
+    const MCExpr *Inner = ExtractModifierFromExpr(Unary->getSubExpr(), Variant);
+    return Inner ? MCUnaryExpr::create(Unary->getOpcode(), Inner, Context)
+                 : nullptr;
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *Binary = cast<MCBinaryExpr>(E);
+    PPCMCExpr::VariantKind LeftKind, RightKind;
+    const MCExpr *Left = ExtractModifierFromExpr(Binary->getLHS(), LeftKind);
+    const MCExpr *Right = ExtractModifierFromExpr(Binary->getRHS(), RightKind);
+
+    // Neither side carried a modifier.
+    if (!Left && !Right)
+      return nullptr;
+
+    // A side without a modifier is reused unchanged.
+    if (!Left)
+      Left = Binary->getLHS();
+    if (!Right)
+      Right = Binary->getRHS();
+
+    const bool LeftPlain = LeftKind == PPCMCExpr::VK_PPC_None;
+    const bool RightPlain = RightKind == PPCMCExpr::VK_PPC_None;
+    // Two different modifiers cannot be merged into one.
+    if (!LeftPlain && !RightPlain && LeftKind != RightKind)
+      return nullptr;
+    Variant = LeftPlain ? RightKind : LeftKind;
+
+    return MCBinaryExpr::create(Binary->getOpcode(), Left, Right, Context);
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// Rewrite every VK_TLSGD/VK_TLSLD symbol reference in \p E to use
+/// VK_PPC_TLSGD/VK_PPC_TLSLD instead. This is necessary to avoid having
+/// _GLOBAL_OFFSET_TABLE_ created via ELFObjectWriter::RelocNeedsGOT.
+/// Returns \p E itself when nothing needed rewriting, otherwise a newly
+/// created expression.
+/// FIXME: This is a hack.
+const MCExpr *PPCAsmParser::
+FixupVariantKind(const MCExpr *E) {
+  MCContext &Context = getParser().getContext();
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return E;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SymRef = cast<MCSymbolRefExpr>(E);
+
+    switch (SymRef->getKind()) {
+    case MCSymbolRefExpr::VK_TLSGD:
+      return MCSymbolRefExpr::create(&SymRef->getSymbol(),
+                                     MCSymbolRefExpr::VK_PPC_TLSGD, Context);
+    case MCSymbolRefExpr::VK_TLSLD:
+      return MCSymbolRefExpr::create(&SymRef->getSymbol(),
+                                     MCSymbolRefExpr::VK_PPC_TLSLD, Context);
+    default:
+      return E;
+    }
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *Unary = cast<MCUnaryExpr>(E);
+    const MCExpr *Inner = FixupVariantKind(Unary->getSubExpr());
+    // Only rebuild the node when the operand actually changed.
+    if (Inner != Unary->getSubExpr())
+      return MCUnaryExpr::create(Unary->getOpcode(), Inner, Context);
+    return E;
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *Binary = cast<MCBinaryExpr>(E);
+    const MCExpr *Left = FixupVariantKind(Binary->getLHS());
+    const MCExpr *Right = FixupVariantKind(Binary->getRHS());
+    // Only rebuild the node when at least one operand changed.
+    if (Left != Binary->getLHS() || Right != Binary->getRHS())
+      return MCBinaryExpr::create(Binary->getOpcode(), Left, Right, Context);
+    return E;
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// ParseExpression. Unlike the default "parseExpression", this also handles
+/// modifiers. On Darwin the work is delegated to ParseDarwinExpression.
+/// Returns true if the underlying expression parse failed, false otherwise;
+/// the result is stored in \p EVal.
+bool PPCAsmParser::
+ParseExpression(const MCExpr *&EVal) {
+  if (isDarwin())
+    return ParseDarwinExpression(EVal);
+
+  // (ELF Platforms)
+  // Handle \code @l/@ha \endcode
+  if (getParser().parseExpression(EVal))
+    return true;
+
+  EVal = FixupVariantKind(EVal);
+
+  // If a consistent modifier was found, wrap the stripped expression in a
+  // PPCMCExpr carrying that modifier.
+  PPCMCExpr::VariantKind Variant;
+  if (const MCExpr *Stripped = ExtractModifierFromExpr(EVal, Variant))
+    EVal = PPCMCExpr::create(Variant, Stripped, false,
+                             getParser().getContext());
+
+  return false;
+}
+
+/// ParseDarwinExpression. (MachO Platforms)
+/// This differs from the default "parseExpression" in that it detects the
+/// \code hi16(), ha16() and lo16() \endcode modifiers. At present,
+/// parseExpression() doesn't recognise the modifiers when in the Darwin/MachO
+/// syntax form so it is done here. TODO: Determine if there is merit in
+/// arranging for this to be done at a higher level.
+/// Returns true on error, false on success with the result in \p EVal.
+bool PPCAsmParser::
+ParseDarwinExpression(const MCExpr *&EVal) {
+  MCAsmParser &Parser = getParser();
+  PPCMCExpr::VariantKind Variant = PPCMCExpr::VK_PPC_None;
+
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    // Compiler-generated Darwin identifiers begin with L,l,_ or "; thus
+    // something starting with any other char should be part of the
+    // asm syntax. If handwritten asm includes an identifier like lo16,
+    // then all bets are off - but no-one would do that, right?
+    StringRef Word = Parser.getTok().getString();
+    if (Word.equals_lower("lo16"))
+      Variant = PPCMCExpr::VK_PPC_LO;
+    else if (Word.equals_lower("hi16"))
+      Variant = PPCMCExpr::VK_PPC_HI;
+    else if (Word.equals_lower("ha16"))
+      Variant = PPCMCExpr::VK_PPC_HA;
+
+    if (Variant != PPCMCExpr::VK_PPC_None) {
+      Parser.Lex(); // Eat the xx16
+      if (getLexer().isNot(AsmToken::LParen))
+        return Error(Parser.getTok().getLoc(), "expected '('");
+      Parser.Lex(); // Eat the '('
+    }
+  }
+
+  if (getParser().parseExpression(EVal))
+    return true;
+
+  // Without a modifier the plain expression is the result.
+  if (Variant == PPCMCExpr::VK_PPC_None)
+    return false;
+
+  if (getLexer().isNot(AsmToken::RParen))
+    return Error(Parser.getTok().getLoc(), "expected ')'");
+  Parser.Lex(); // Eat the ')'
+  EVal = PPCMCExpr::create(Variant, EVal, false, getParser().getContext());
+  return false;
+}
+
+/// ParseOperand
+/// This handles registers in the form 'NN', '%rNN' for ELF platforms and
+/// rNN for MachO.
+///
+/// Parses one operand starting at the current token and appends it to
+/// \p Operands. If the operand is followed by '(', the parenthesised part is
+/// parsed as well and appended as a further operand: a TLS symbol expression
+/// when the operand is a reference to "__tls_get_addr", otherwise the base
+/// register of a D-form memory operand.
+/// Returns false on success; every failure path returns Error(...) or true.
+bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc S = Parser.getTok().getLoc();
+ // E starts out one character before the current token's location.
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ const MCExpr *EVal;
+
+ // Attempt to parse the next token as an immediate
+ switch (getLexer().getKind()) {
+ // Special handling for register names. These are interpreted
+ // as immediates corresponding to the register number.
+ case AsmToken::Percent:
+ Parser.Lex(); // Eat the '%'.
+ unsigned RegNo;
+ int64_t IntVal;
+ if (MatchRegisterName(RegNo, IntVal))
+ return Error(S, "invalid register name");
+
+ // Only the numeric value is kept; RegNo is not used here.
+ Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+ return false;
+
+ case AsmToken::Identifier:
+ case AsmToken::LParen:
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ case AsmToken::Dot:
+ case AsmToken::Dollar:
+ case AsmToken::Exclaim:
+ case AsmToken::Tilde:
+ // Note that non-register-name identifiers from the compiler will begin
+ // with '_', 'L'/'l' or '"'. Of course, handwritten asm could include
+ // identifiers like r31foo - so we fall through in the event that parsing
+ // a register name fails.
+ if (isDarwin()) {
+ unsigned RegNo;
+ int64_t IntVal;
+ if (!MatchRegisterName(RegNo, IntVal)) {
+ Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+ return false;
+ }
+ }
+ // All other expressions
+
+ // ParseExpression returns false on success, which leaves the switch.
+ if (!ParseExpression(EVal))
+ break;
+ // Fall-through
+ LLVM_FALLTHROUGH;
+ default:
+ return Error(S, "unknown operand");
+ }
+
+ // Push the parsed operand into the list of operands
+ Operands.push_back(PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64()));
+
+ // Check whether this is a TLS call expression
+ bool TLSCall = false;
+ if (const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(EVal))
+ TLSCall = Ref->getSymbol().getName() == "__tls_get_addr";
+
+ if (TLSCall && getLexer().is(AsmToken::LParen)) {
+ const MCExpr *TLSSym;
+
+ Parser.Lex(); // Eat the '('.
+ S = Parser.getTok().getLoc();
+ if (ParseExpression(TLSSym))
+ return Error(S, "invalid TLS call expression");
+ if (getLexer().isNot(AsmToken::RParen))
+ return Error(Parser.getTok().getLoc(), "missing ')'");
+ E = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat the ')'.
+
+ Operands.push_back(PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64()));
+ }
+
+ // Otherwise, check for D-form memory operands
+ if (!TLSCall && getLexer().is(AsmToken::LParen)) {
+ Parser.Lex(); // Eat the '('.
+ S = Parser.getTok().getLoc();
+
+ // IntVal receives the base register number in every accepted case below.
+ int64_t IntVal;
+ switch (getLexer().getKind()) {
+ case AsmToken::Percent:
+ Parser.Lex(); // Eat the '%'.
+ unsigned RegNo;
+ if (MatchRegisterName(RegNo, IntVal))
+ return Error(S, "invalid register name");
+ break;
+
+ case AsmToken::Integer:
+ // A bare register number is rejected on Darwin and must be in 0..31
+ // elsewhere.
+ if (isDarwin())
+ return Error(S, "unexpected integer value");
+ else if (getParser().parseAbsoluteExpression(IntVal) || IntVal < 0 ||
+ IntVal > 31)
+ return Error(S, "invalid register number");
+ break;
+ case AsmToken::Identifier:
+ // A bare register name (no '%') is only accepted on Darwin.
+ if (isDarwin()) {
+ unsigned RegNo;
+ if (!MatchRegisterName(RegNo, IntVal)) {
+ break;
+ }
+ }
+ LLVM_FALLTHROUGH;
+
+ default:
+ return Error(S, "invalid memory operand");
+ }
+
+ E = Parser.getTok().getLoc();
+ if (parseToken(AsmToken::RParen, "missing ')'"))
+ return true;
+ Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+ }
+
+ return false;
+}
+
+/// Parse an instruction mnemonic followed by its operands.
+///
+/// \p Name is the mnemonic and \p NameLoc its location. The mnemonic token
+/// (plus a separate token for a '.' suffix, if any) and all parsed operands
+/// are appended to \p Operands.
+/// Returns false on success and true if a separator or an operand failed to
+/// parse.
+bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
+ // The first operand is the token for the instruction name.
+ // If the next character is a '+' or '-', we need to add it to the
+ // instruction name, to match what TableGen is doing.
+ // NewOpcode owns the extended name; Name is re-pointed at it below.
+ std::string NewOpcode;
+ if (parseOptionalToken(AsmToken::Plus)) {
+ NewOpcode = Name;
+ NewOpcode += '+';
+ Name = NewOpcode;
+ }
+ if (parseOptionalToken(AsmToken::Minus)) {
+ NewOpcode = Name;
+ NewOpcode += '-';
+ Name = NewOpcode;
+ }
+ // If the instruction ends in a '.', we need to create a separate
+ // token for it, to match what TableGen is doing.
+ size_t Dot = Name.find('.');
+ StringRef Mnemonic = Name.slice(0, Dot);
+ // When Name refers to the local NewOpcode, the token must copy the string
+ // because NewOpcode goes away when this function returns.
+ if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+ Operands.push_back(
+ PPCOperand::CreateTokenWithStringCopy(Mnemonic, NameLoc, isPPC64()));
+ else
+ Operands.push_back(PPCOperand::CreateToken(Mnemonic, NameLoc, isPPC64()));
+ if (Dot != StringRef::npos) {
+ // The suffix token covers everything from the first '.' to the end.
+ SMLoc DotLoc = SMLoc::getFromPointer(NameLoc.getPointer() + Dot);
+ StringRef DotStr = Name.slice(Dot, StringRef::npos);
+ if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+ Operands.push_back(
+ PPCOperand::CreateTokenWithStringCopy(DotStr, DotLoc, isPPC64()));
+ else
+ Operands.push_back(PPCOperand::CreateToken(DotStr, DotLoc, isPPC64()));
+ }
+
+ // If there are no more operands then finish
+ if (parseOptionalToken(AsmToken::EndOfStatement))
+ return false;
+
+ // Parse the first operand
+ if (ParseOperand(Operands))
+ return true;
+
+ // Each further operand must be preceded by a comma.
+ while (!parseOptionalToken(AsmToken::EndOfStatement)) {
+ if (parseToken(AsmToken::Comma) || ParseOperand(Operands))
+ return true;
+ }
+
+ // We'll now deal with an unfortunate special case: the syntax for the dcbt
+ // and dcbtst instructions differs for server vs. embedded cores.
+ // The syntax for dcbt is:
+ // dcbt ra, rb, th [server]
+ // dcbt th, ra, rb [embedded]
+ // where th can be omitted when it is 0. dcbtst is the same. We take the
+ // server form to be the default, so swap the operands if we're parsing for
+ // an embedded core (they'll be swapped again upon printing).
+ if (getSTI().getFeatureBits()[PPC::FeatureBookE] &&
+ Operands.size() == 4 &&
+ (Name == "dcbt" || Name == "dcbtst")) {
+ // Operands[0] is the mnemonic token; the two swaps rotate operands 1..3
+ // from (th, ra, rb) to (ra, rb, th).
+ std::swap(Operands[1], Operands[3]);
+ std::swap(Operands[2], Operands[1]);
+ }
+
+ return false;
+}
+
+/// ParseDirective parses the PPC specific directives.
+/// Returns true when \p DirectiveID is not a directive handled here and
+/// false otherwise; the return values of the individual directive parsers
+/// are not propagated.
+bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+
+  // Darwin only understands ".machine" here.
+  if (isDarwin()) {
+    if (IDVal != ".machine")
+      return true;
+    ParseDarwinDirectiveMachine(DirectiveID.getLoc());
+    return false;
+  }
+
+  if (IDVal == ".word") {
+    ParseDirectiveWord(2, DirectiveID);
+  } else if (IDVal == ".llong") {
+    ParseDirectiveWord(8, DirectiveID);
+  } else if (IDVal == ".tc") {
+    ParseDirectiveTC(isPPC64() ? 8 : 4, DirectiveID);
+  } else if (IDVal == ".machine") {
+    ParseDirectiveMachine(DirectiveID.getLoc());
+  } else if (IDVal == ".abiversion") {
+    ParseDirectiveAbiVersion(DirectiveID.getLoc());
+  } else if (IDVal == ".localentry") {
+    ParseDirectiveLocalEntry(DirectiveID.getLoc());
+  } else {
+    return true;
+  }
+  return false;
+}
+
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+/// Emits each listed expression as a value of \p Size bytes. Constant
+/// values that fit neither as an unsigned nor as a signed 8*Size-bit integer
+/// are rejected. Returns true on error.
+bool PPCAsmParser::ParseDirectiveWord(unsigned Size, AsmToken ID) {
+  auto EmitOneValue = [&]() -> bool {
+    SMLoc ExprLoc = getParser().getTok().getLoc();
+    const MCExpr *Value;
+    if (getParser().parseExpression(Value))
+      return true;
+
+    const auto *Constant = dyn_cast<MCConstantExpr>(Value);
+    if (!Constant) {
+      // Non-constant expressions are emitted as-is.
+      getStreamer().EmitValue(Value, Size, ExprLoc);
+      return false;
+    }
+
+    assert(Size <= 8 && "Invalid size");
+    uint64_t IntValue = Constant->getValue();
+    const bool Fits = isUIntN(8 * Size, IntValue) || isIntN(8 * Size, IntValue);
+    if (!Fits)
+      return Error(ExprLoc, "literal value out of range for '" +
+                                ID.getIdentifier() + "' directive");
+    getStreamer().EmitIntValue(IntValue, Size);
+    return false;
+  };
+
+  if (!parseMany(EmitOneValue))
+    return false;
+  return addErrorSuffix(" in '" + ID.getIdentifier() + "' directive");
+}
+
+/// ParseDirectiveTC
+///  ::= .tc [ symbol (, expression)* ]
+/// Skips the leading TC symbol, aligns the output to \p Size and then hands
+/// the remaining expressions to ParseDirectiveWord. Returns true on error.
+bool PPCAsmParser::ParseDirectiveTC(unsigned Size, AsmToken ID) {
+  MCAsmParser &Parser = getParser();
+
+  // Skip TC symbol, which is only used with XCOFF.
+  while (!(getLexer().is(AsmToken::EndOfStatement) ||
+           getLexer().is(AsmToken::Comma)))
+    Parser.Lex();
+
+  if (parseToken(AsmToken::Comma))
+    return addErrorSuffix(" in '.tc' directive");
+
+  // Align to word size.
+  Parser.getStreamer().EmitValueToAlignment(Size);
+
+  // Emit expressions.
+  return ParseDirectiveWord(Size, ID);
+}
+
+/// ParseDirectiveMachine (ELF platforms)
+///  ::= .machine [ cpu | "push" | "pop" ]
+/// Only "any", "push" and "pop" are accepted; the accepted value is passed
+/// on to the target streamer. Returns true on error.
+bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
+  const bool IsNameToken = Parser.getTok().is(AsmToken::Identifier) ||
+                           Parser.getTok().is(AsmToken::String);
+  if (!IsNameToken)
+    return Error(L, "unexpected token in '.machine' directive");
+
+  StringRef CPU = Parser.getTok().getIdentifier();
+
+  // FIXME: Right now, the parser always allows any available
+  // instruction, so the .machine directive is not useful.
+  // Implement ".machine any" (by doing nothing) for the benefit
+  // of existing assembler code.  Likewise, we can then implement
+  // ".machine push" and ".machine pop" as no-op.
+  const bool Accepted = CPU == "any" || CPU == "push" || CPU == "pop";
+  if (!Accepted)
+    return TokError("unrecognized machine type");
+
+  Parser.Lex();
+
+  if (parseToken(AsmToken::EndOfStatement))
+    return addErrorSuffix(" in '.machine' directive");
+
+  auto *Target = static_cast<PPCTargetStreamer *>(
+      getParser().getStreamer().getTargetStreamer());
+  Target->emitMachine(CPU);
+
+  return false;
+}
+
+/// ParseDarwinDirectiveMachine (Mach-o platforms)
+///  ::= .machine cpu-identifier
+/// Accepts "ppc7400", "ppc" and "ppc64", and rejects a cpu name that does
+/// not agree with isPPC64(). Returns true on error.
+bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
+  const bool IsNameToken = Parser.getTok().is(AsmToken::Identifier) ||
+                           Parser.getTok().is(AsmToken::String);
+  if (!IsNameToken)
+    return Error(L, "unexpected token in directive");
+
+  StringRef CPU = Parser.getTok().getIdentifier();
+  Parser.Lex();
+
+  const bool Names32Bit = CPU == "ppc7400" || CPU == "ppc";
+  const bool Names64Bit = CPU == "ppc64";
+
+  // FIXME: this is only the 'default' set of cpu variants.
+  // However we don't act on this information at present, this is simply
+  // allowing parsing to proceed with minimal sanity checking.
+  const bool Failed =
+      check(!Names32Bit && !Names64Bit, L, "unrecognized cpu type") ||
+      check(isPPC64() && Names32Bit, L,
+            "wrong cpu type specified for 64bit") ||
+      check(!isPPC64() && Names64Bit, L,
+            "wrong cpu type specified for 32bit") ||
+      parseToken(AsmToken::EndOfStatement);
+  if (Failed)
+    return addErrorSuffix(" in '.machine' directive");
+  return false;
+}
+
+/// ParseDirectiveAbiVersion
+///  ::= .abiversion constant-expression
+/// Parses the constant and forwards it to the target streamer. Returns true
+/// on error.
+bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) {
+  int64_t AbiVersion;
+  const bool Failed = check(getParser().parseAbsoluteExpression(AbiVersion), L,
+                            "expected constant expression") ||
+                      parseToken(AsmToken::EndOfStatement);
+  if (Failed)
+    return addErrorSuffix(" in '.abiversion' directive");
+
+  auto *Target = static_cast<PPCTargetStreamer *>(
+      getParser().getStreamer().getTargetStreamer());
+  Target->emitAbiVersion(AbiVersion);
+
+  return false;
+}
+
+/// ParseDirectiveLocalEntry
+///  ::= .localentry symbol, expression
+/// Looks up (or creates) the named symbol, parses the expression and passes
+/// both to the target streamer. Returns true on error.
+bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return Error(L, "expected identifier in '.localentry' directive");
+
+  MCSymbolELF *Sym = cast<MCSymbolELF>(getContext().getOrCreateSymbol(Name));
+
+  const MCExpr *Expr;
+  const bool Failed =
+      parseToken(AsmToken::Comma) ||
+      check(getParser().parseExpression(Expr), L, "expected expression") ||
+      parseToken(AsmToken::EndOfStatement);
+  if (Failed)
+    return addErrorSuffix(" in '.localentry' directive");
+
+  auto *Target = static_cast<PPCTargetStreamer *>(
+      getParser().getStreamer().getTargetStreamer());
+  Target->emitLocalEntry(Sym, Expr);
+
+  return false;
+}
+
+
+
+/// Force static initialization.
+/// Constructs one RegisterMCAsmParser<PPCAsmParser> object for each of the
+/// PPC32, PPC64 and PPC64LE targets.
+extern "C" void LLVMInitializePowerPCAsmParser() {
+  RegisterMCAsmParser<PPCAsmParser> ForPPC32(getThePPC32Target());
+  RegisterMCAsmParser<PPCAsmParser> ForPPC64(getThePPC64Target());
+  RegisterMCAsmParser<PPCAsmParser> ForPPC64LE(getThePPC64LETarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "PPCGenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+//
+// Returns Match_Success when Kind is one of MCK_0..MCK_7 and AsmOp is an
+// immediate with exactly that value; Match_InvalidOperand otherwise.
+unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                  unsigned Kind) {
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  int64_t Expected;
+  switch (Kind) {
+  case MCK_0:
+    Expected = 0;
+    break;
+  case MCK_1:
+    Expected = 1;
+    break;
+  case MCK_2:
+    Expected = 2;
+    break;
+  case MCK_3:
+    Expected = 3;
+    break;
+  case MCK_4:
+    Expected = 4;
+    break;
+  case MCK_5:
+    Expected = 5;
+    break;
+  case MCK_6:
+    Expected = 6;
+    break;
+  case MCK_7:
+    Expected = 7;
+    break;
+  default:
+    return Match_InvalidOperand;
+  }
+
+  PPCOperand &Operand = static_cast<PPCOperand &>(AsmOp);
+  if (!Operand.isImm() || Operand.getImm() != Expected)
+    return Match_InvalidOperand;
+
+  return Match_Success;
+}
+
+/// Wrap \p E in a PPCMCExpr for the PPC-specific modifier \p Variant.
+/// Handles VK_PPC_LO/HI/HA/HIGHER/HIGHERA/HIGHEST/HIGHESTA; any other
+/// variant yields nullptr.
+const MCExpr *
+PPCAsmParser::applyModifierToExpr(const MCExpr *E,
+                                  MCSymbolRefExpr::VariantKind Variant,
+                                  MCContext &Ctx) {
+  // Map the symbol-reference variant onto the PPCMCExpr variant of the same
+  // name.
+  PPCMCExpr::VariantKind Kind;
+  switch (Variant) {
+  case MCSymbolRefExpr::VK_PPC_LO:
+    Kind = PPCMCExpr::VK_PPC_LO;
+    break;
+  case MCSymbolRefExpr::VK_PPC_HI:
+    Kind = PPCMCExpr::VK_PPC_HI;
+    break;
+  case MCSymbolRefExpr::VK_PPC_HA:
+    Kind = PPCMCExpr::VK_PPC_HA;
+    break;
+  case MCSymbolRefExpr::VK_PPC_HIGHER:
+    Kind = PPCMCExpr::VK_PPC_HIGHER;
+    break;
+  case MCSymbolRefExpr::VK_PPC_HIGHERA:
+    Kind = PPCMCExpr::VK_PPC_HIGHERA;
+    break;
+  case MCSymbolRefExpr::VK_PPC_HIGHEST:
+    Kind = PPCMCExpr::VK_PPC_HIGHEST;
+    break;
+  case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+    Kind = PPCMCExpr::VK_PPC_HIGHESTA;
+    break;
+  default:
+    return nullptr;
+  }
+
+  return PPCMCExpr::create(Kind, E, false, Ctx);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
new file mode 100644
index 000000000000..12ffbfdeacc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -0,0 +1,440 @@
+//===------ PPCDisassembler.cpp - Disassembler for PowerPC ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+/// MCDisassembler subclass for PowerPC. The endianness flag is fixed at
+/// construction; getInstruction() is declared here and defined out of line.
+class PPCDisassembler : public MCDisassembler {
+ // Set once by the constructor.
+ bool IsLittleEndian;
+
+public:
+ /// Forwards \p STI and \p Ctx to MCDisassembler and records
+ /// \p IsLittleEndian.
+ PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ bool IsLittleEndian)
+ : MCDisassembler(STI, Ctx), IsLittleEndian(IsLittleEndian) {}
+
+ DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream,
+ raw_ostream &CStream) const override;
+};
+} // end anonymous namespace
+
+/// Factory for a big-endian PPCDisassembler (IsLittleEndian = false).
+/// Returns a heap-allocated object; \p T is unused.
+static MCDisassembler *createPPCDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/false);
+}
+
+/// Factory for a little-endian PPCDisassembler (IsLittleEndian = true).
+/// Returns a heap-allocated object; \p T is unused.
+static MCDisassembler *createPPCLEDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true);
+}
+
+/// Registers the disassembler factories with the TargetRegistry: the PPC32
+/// and PPC64 targets use createPPCDisassembler, the PPC64LE target uses
+/// createPPCLEDisassembler.
+extern "C" void LLVMInitializePowerPCDisassembler() {
+ // Register the disassembler for each target.
+ TargetRegistry::RegisterMCDisassembler(getThePPC32Target(),
+ createPPCDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getThePPC64Target(),
+ createPPCDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getThePPC64LETarget(),
+ createPPCLEDisassembler);
+}
+
+// FIXME: These can be generated by TableGen from the existing register
+// encoding values!
+
+// CR0-CR7, indexed by encoded register number. Used by
+// DecodeCRRCRegisterClass and DecodeCRRC0RegisterClass.
+static const unsigned CRRegs[] = {
+ PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+ PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
+};
+
+// 32 entries: the LT, GT, EQ and UN registers of CR0, then of CR1, and so on
+// up to CR7. Used by DecodeCRBITRCRegisterClass.
+static const unsigned CRBITRegs[] = {
+ PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
+ PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
+ PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+ PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+ PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+ PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN,
+ PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
+ PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
+};
+
+// F0-F31, indexed by encoded register number. Shared by
+// DecodeF4RCRegisterClass and DecodeF8RCRegisterClass.
+static const unsigned FRegs[] = {
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31
+};
+
+// VF0-VF31, indexed by encoded register number. Used by
+// DecodeVFRCRegisterClass.
+static const unsigned VFRegs[] = {
+ PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+ PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+ PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+ PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+ PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+ PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+ PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+ PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+
+// V0-V31, indexed by encoded register number. Used by
+// DecodeVRRCRegisterClass.
+static const unsigned VRegs[] = {
+ PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+ PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+ PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+ PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+// 64 entries: VSL0-VSL31 followed by V0-V31. Used by
+// DecodeVSRCRegisterClass.
+static const unsigned VSRegs[] = {
+ PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3,
+ PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7,
+ PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11,
+ PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15,
+ PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19,
+ PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23,
+ PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
+ PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,
+
+ PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+ PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+ PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+ PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+// 64 entries: F0-F31 followed by VF0-VF31. Used by
+// DecodeVSFRCRegisterClass.
+static const unsigned VSFRegs[] = {
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+ PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+ PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+ PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+ PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+ PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+ PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+ PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+ PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+
+// 64 entries: F0-F31 followed by VF0-VF31 (the same contents as VSFRegs).
+// Used by DecodeVSSRCRegisterClass.
+static const unsigned VSSRegs[] = {
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+ PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+ PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+ PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+ PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+ PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+ PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+ PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+ PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+
+// R0-R31, indexed by encoded register number. Used by
+// DecodeGPRCRegisterClass.
+static const unsigned GPRegs[] = {
+ PPC::R0, PPC::R1, PPC::R2, PPC::R3,
+ PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+ PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+ PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+
+// Like GPRegs, except that entry 0 is PPC::ZERO instead of PPC::R0. Used by
+// DecodeGPRC_NOR0RegisterClass and decodeMemRIOperands.
+static const unsigned GP0Regs[] = {
+ PPC::ZERO, PPC::R1, PPC::R2, PPC::R3,
+ PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+ PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+ PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+
+// X0-X31, indexed by encoded register number. Used by
+// DecodeG8RCRegisterClass.
+static const unsigned G8Regs[] = {
+ PPC::X0, PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+
+// QF0-QF31, indexed by encoded register number. Used by
+// DecodeQFRCRegisterClass, which the DecodeQSRCRegisterClass and
+// DecodeQBRCRegisterClass macros also expand to.
+static const unsigned QFRegs[] = {
+ PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3,
+ PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
+ PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11,
+ PPC::QF12, PPC::QF13, PPC::QF14, PPC::QF15,
+ PPC::QF16, PPC::QF17, PPC::QF18, PPC::QF19,
+ PPC::QF20, PPC::QF21, PPC::QF22, PPC::QF23,
+ PPC::QF24, PPC::QF25, PPC::QF26, PPC::QF27,
+ PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
+};
+
+/// Append the register Regs[RegNo] to \p Inst as a register operand.
+/// \p RegNo is only range-checked by an assert. Always returns
+/// MCDisassembler::Success.
+template <std::size_t N>
+static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                        const unsigned (&Regs)[N]) {
+  assert(RegNo < N && "Invalid register number");
+  const unsigned Reg = Regs[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+/// Decoder callback: appends CRRegs[RegNo] to \p Inst as a register operand.
+/// \p Address and \p Decoder are unused.
+static DecodeStatus DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, CRRegs);
+}
+
+/// Decoder callback: appends CRRegs[RegNo] to \p Inst as a register operand
+/// (same table as DecodeCRRCRegisterClass). \p Address and \p Decoder are
+/// unused.
+static DecodeStatus DecodeCRRC0RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, CRRegs);
+}
+
+/// Decoder callback: appends CRBITRegs[RegNo] to \p Inst as a register
+/// operand. \p Address and \p Decoder are unused.
+static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, CRBITRegs);
+}
+
+/// Decoder callback: appends FRegs[RegNo] to \p Inst as a register operand.
+/// \p Address and \p Decoder are unused.
+static DecodeStatus DecodeF4RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, FRegs);
+}
+
+/// Decoder callback: appends FRegs[RegNo] to \p Inst as a register operand
+/// (same table as DecodeF4RCRegisterClass). \p Address and \p Decoder are
+/// unused.
+static DecodeStatus DecodeF8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, FRegs);
+}
+
+/// Decoder callback: appends VFRegs[RegNo] to \p Inst as a register operand.
+/// \p Address and \p Decoder are unused.
+static DecodeStatus DecodeVFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, VFRegs);
+}
+
+/// Decoder callback: appends VRegs[RegNo] to \p Inst as a register operand.
+/// \p Address and \p Decoder are unused.
+static DecodeStatus DecodeVRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, VRegs);
+}
+
+/// Decoder callback: appends VSRegs[RegNo] to \p Inst as a register operand.
+/// \p Address and \p Decoder are unused.
+static DecodeStatus DecodeVSRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, VSRegs);
+}
+
+/// Decoder callback: appends VSFRegs[RegNo] to \p Inst as a register
+/// operand. \p Address and \p Decoder are unused.
+static DecodeStatus DecodeVSFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, VSFRegs);
+}
+
+/// Decoder callback: appends VSSRegs[RegNo] to \p Inst as a register
+/// operand. \p Address and \p Decoder are unused.
+static DecodeStatus DecodeVSSRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, VSSRegs);
+}
+
+/// Decoder callback: appends GPRegs[RegNo] to \p Inst as a register operand.
+/// \p Address and \p Decoder are unused.
+static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, GPRegs);
+}
+
+/// Decoder callback: appends GP0Regs[RegNo] to \p Inst as a register
+/// operand, so register number 0 yields PPC::ZERO. \p Address and
+/// \p Decoder are unused.
+static DecodeStatus DecodeGPRC_NOR0RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, GP0Regs);
+}
+
+/// Decoder callback: appends G8Regs[RegNo] to \p Inst as a register operand.
+/// \p Address and \p Decoder are unused.
+static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, G8Regs);
+}
+
+#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
+#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
+
+// Decoder callback for the QFRC register class: forwards RegNo to
+// decodeRegisterClass with the QFRegs table. Address and Decoder are unused.
+// The QSRC and QBRC decoder names are #defined to this function.
+static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, QFRegs);
+}
+
+#define DecodeQSRCRegisterClass DecodeQFRCRegisterClass
+#define DecodeQBRCRegisterClass DecodeQFRCRegisterClass
+
+// Appends an N-bit unsigned immediate field to Inst unchanged. The raw field
+// is asserted to fit in N bits. Address and Decoder are unused.
+template <unsigned N>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
+                                      int64_t Address, const void *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+
+  const MCOperand ImmOp = MCOperand::createImm(Imm);
+  Inst.addOperand(ImmOp);
+  return MCDisassembler::Success;
+}
+
+// Appends an N-bit signed immediate field to Inst. The raw (still unsigned)
+// field is asserted to fit in N bits and is then sign-extended from bit N-1.
+// Address and Decoder are unused.
+template <unsigned N>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
+                                      int64_t Address, const void *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+
+  const MCOperand ImmOp = MCOperand::createImm(SignExtend64<N>(Imm));
+  Inst.addOperand(ImmOp);
+  return MCDisassembler::Success;
+}
+
+// Decodes a memri (imm, reg) field: the low 16 bits are the displacement and
+// the bits above them are the base register number. Appends the sign-extended
+// displacement followed by the base register (looked up in GP0Regs). For the
+// update-form opcodes listed below the base register is additionally added as
+// an extra operand: appended first for loads, inserted at the front for
+// stores. Address and Decoder are unused.
+static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm,
+                                        int64_t Address, const void *Decoder) {
+  const uint64_t BaseIdx = Imm >> 16;
+  const uint64_t RawDisp = Imm & 0xFFFF;
+
+  assert(BaseIdx < 32 && "Invalid base register");
+  const MCOperand BaseOp = MCOperand::createReg(GP0Regs[BaseIdx]);
+
+  switch (Inst.getOpcode()) {
+  case PPC::LBZU:
+  case PPC::LHAU:
+  case PPC::LHZU:
+  case PPC::LWZU:
+  case PPC::LFSU:
+  case PPC::LFDU:
+    // Update-form load: add the tied output operand.
+    Inst.addOperand(BaseOp);
+    break;
+  case PPC::STBU:
+  case PPC::STHU:
+  case PPC::STWU:
+  case PPC::STFSU:
+  case PPC::STFDU:
+    // Update-form store: the extra base operand goes first.
+    Inst.insert(Inst.begin(), BaseOp);
+    break;
+  default:
+    break;
+  }
+
+  Inst.addOperand(MCOperand::createImm(SignExtend64<16>(RawDisp)));
+  Inst.addOperand(BaseOp);
+  return MCDisassembler::Success;
+}
+
+// Decodes a memrix (imm, reg) field: the low 14 bits are the displacement
+// (stored divided by 4) and the bits above them are the base register number.
+// Appends the displacement, shifted left by 2 and sign-extended from 16 bits,
+// followed by the base register from GP0Regs. LDU additionally gets the base
+// register appended first; STDU gets it inserted at the front.
+// Address and Decoder are unused.
+static DecodeStatus decodeMemRIXOperands(MCInst &Inst, uint64_t Imm,
+                                         int64_t Address, const void *Decoder) {
+  const uint64_t BaseIdx = Imm >> 14;
+  const uint64_t RawDisp = Imm & 0x3FFF;
+
+  assert(BaseIdx < 32 && "Invalid base register");
+  const MCOperand BaseOp = MCOperand::createReg(GP0Regs[BaseIdx]);
+
+  switch (Inst.getOpcode()) {
+  case PPC::LDU:
+    // Add the tied output operand.
+    Inst.addOperand(BaseOp);
+    break;
+  case PPC::STDU:
+    Inst.insert(Inst.begin(), BaseOp);
+    break;
+  default:
+    break;
+  }
+
+  Inst.addOperand(MCOperand::createImm(SignExtend64<16>(RawDisp << 2)));
+  Inst.addOperand(BaseOp);
+  return MCDisassembler::Success;
+}
+
+// Decodes a memrix16 (imm, reg) field: the low 12 bits are the displacement
+// (stored divided by 16) and the bits above them are the base register
+// number. Appends the displacement, shifted left by 4 and sign-extended from
+// 16 bits, followed by the base register from GP0Regs.
+// Address and Decoder are unused.
+static DecodeStatus decodeMemRIX16Operands(MCInst &Inst, uint64_t Imm,
+                                           int64_t Address,
+                                           const void *Decoder) {
+  const uint64_t BaseIdx = Imm >> 12;
+  const uint64_t RawDisp = Imm & 0xFFF;
+
+  assert(BaseIdx < 32 && "Invalid base register");
+
+  const int64_t ByteDisp = SignExtend64<16>(RawDisp << 4);
+  Inst.addOperand(MCOperand::createImm(ByteDisp));
+  Inst.addOperand(MCOperand::createReg(GP0Regs[BaseIdx]));
+  return MCDisassembler::Success;
+}
+
+// Decodes a one-hot CR field mask into the CR register it selects.
+// The cr bit encoding is 0x80 >> cr_reg_num, so the position of the lowest
+// set bit identifies the register: bit 0 selects CRRegs[7], bit 7 selects
+// CRRegs[0]. Returns Fail when no bit is set within the low 8 bits, since
+// such a value names no CR field. Address and Decoder are unused.
+static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm,
+                                        int64_t Address, const void *Decoder) {
+  unsigned Zeros = countTrailingZeros(Imm);
+
+  // Reject the field instead of relying on an assert: with assertions
+  // compiled out, an out-of-range Zeros would index CRRegs out of bounds on
+  // bytes that come straight from the disassembled input.
+  if (Zeros >= 8)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(CRRegs[7 - Zeros]));
+  return MCDisassembler::Success;
+}
+
+#include "PPCGenDisassemblerTables.inc"
+
+// Decodes one 4-byte instruction from Bytes into MI.
+// Size is set to 4, or to 0 (with a Fail result) when fewer than four bytes
+// are available. The word is read in the byte order selected by
+// IsLittleEndian. When the subtarget has FeatureQPX the QPX table is tried
+// first and its result is returned unless it is Fail; otherwise (or after a
+// QPX failure) the main 32-bit table decides. OS and CS are not used here.
+DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address, raw_ostream &OS,
+                                             raw_ostream &CS) const {
+  // Every instruction is exactly four bytes; bail out on a short buffer.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+  Size = 4;
+
+  // Read the instruction in the proper endianness.
+  uint32_t Inst;
+  if (IsLittleEndian)
+    Inst = support::endian::read32le(Bytes.data());
+  else
+    Inst = support::endian::read32be(Bytes.data());
+
+  const bool HasQPX = STI.getFeatureBits()[PPC::FeatureQPX];
+  if (HasQPX) {
+    const DecodeStatus QPXStatus =
+        decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI);
+    if (QPXStatus != MCDisassembler::Fail)
+      return QPXStatus;
+  }
+
+  return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
new file mode 100644
index 000000000000..609d959c6d08
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -0,0 +1,506 @@
+//===-- PPCInstPrinter.cpp - Convert PPC MCInst to assembly syntax --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a PPC MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstPrinter.h"
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// FIXME: Once the integrated assembler supports full register names, tie this
+// to the verbose-asm setting.
+static cl::opt<bool>
+FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false),
+ cl::desc("Use full register names when printing assembly"));
+
+// Useful for testing purposes. Prints vs{32-63} as v{0-31} respectively.
+static cl::opt<bool>
+ShowVSRNumsAsVR("ppc-vsr-nums-as-vr", cl::Hidden, cl::init(false),
+ cl::desc("Prints full register names with vs{31-63} as v{0-31}"));
+
+#define PRINT_ALIAS_INSTR
+#include "PPCGenAsmWriter.inc"
+
+/// Writes the name of register RegNo to OS. Names beginning with 'q' (QPX)
+/// are emitted with the leading 'q' replaced by 'f'.
+void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  const char *Name = getRegisterName(RegNo);
+  if (Name[0] != 'q') {
+    OS << Name;
+    return;
+  }
+
+  // The system toolchain on the BG/Q does not understand QPX register names
+  // in .cfi_* directives, so print the name of the floating-point
+  // subregister instead.
+  std::string FPName(Name);
+  FPName[0] = 'f';
+  OS << FPName;
+}
+
+/// Prints MI to O. Several opcodes get hand-written simplified mnemonics
+/// (slwi/srwi for RLWINM, mr for OR/OR8 with equal sources, sldi for RLDICR,
+/// the dcbt/dcbtst family and the dcbf/dcbfl/dcbflp forms); everything else
+/// goes through the tblgen-generated alias printer and, failing that, the
+/// generated instruction printer. Annot is emitted via printAnnotation on
+/// every path. STI is consulted only for FeatureBookE (dcbt operand order).
+void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ // Check for slwi/srwi mnemonics.
+ if (MI->getOpcode() == PPC::RLWINM) {
+ unsigned char SH = MI->getOperand(2).getImm();
+ unsigned char MB = MI->getOperand(3).getImm();
+ unsigned char ME = MI->getOperand(4).getImm();
+ bool useSubstituteMnemonic = false;
+ // rlwinm RA, RS, SH, 0, 31-SH == slwi RA, RS, SH
+ if (SH <= 31 && MB == 0 && ME == (31-SH)) {
+ O << "\tslwi "; useSubstituteMnemonic = true;
+ }
+ // rlwinm RA, RS, SH, 32-SH, 31 == srwi RA, RS, 32-SH
+ if (SH <= 31 && MB == (32-SH) && ME == 31) {
+ O << "\tsrwi "; useSubstituteMnemonic = true;
+ SH = 32-SH;
+ }
+ if (useSubstituteMnemonic) {
+ printOperand(MI, 0, O);
+ O << ", ";
+ printOperand(MI, 1, O);
+ O << ", " << (unsigned int)SH;
+
+ printAnnotation(O, Annot);
+ return;
+ }
+ }
+
+ // or RA, RS, RS == mr RA, RS
+ if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) &&
+ MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+ O << "\tmr ";
+ printOperand(MI, 0, O);
+ O << ", ";
+ printOperand(MI, 1, O);
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ if (MI->getOpcode() == PPC::RLDICR) {
+ unsigned char SH = MI->getOperand(2).getImm();
+ unsigned char ME = MI->getOperand(3).getImm();
+ // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
+ if (63-SH == ME) {
+ O << "\tsldi ";
+ printOperand(MI, 0, O);
+ O << ", ";
+ printOperand(MI, 1, O);
+ O << ", " << (unsigned int)SH;
+ printAnnotation(O, Annot);
+ return;
+ }
+ }
+
+ // dcbt[st] is printed manually here because:
+ // 1. The assembly syntax is different between embedded and server targets
+ // 2. We must print the short mnemonics for TH == 0 because the
+ // embedded/server syntax default will not be stable across assemblers
+ // The syntax for dcbt is:
+ // dcbt ra, rb, th [server]
+ // dcbt th, ra, rb [embedded]
+ // where th can be omitted when it is 0. dcbtst is the same.
+ if (MI->getOpcode() == PPC::DCBT || MI->getOpcode() == PPC::DCBTST) {
+ unsigned char TH = MI->getOperand(0).getImm();
+ O << "\tdcbt";
+ if (MI->getOpcode() == PPC::DCBTST)
+ O << "st";
+ // TH == 16 is folded into the mnemonic as a trailing 't' (dcbtt/dcbtstt),
+ // so it is never printed as an explicit operand below.
+ if (TH == 16)
+ O << "t";
+ O << " ";
+
+ bool IsBookE = STI.getFeatureBits()[PPC::FeatureBookE];
+ if (IsBookE && TH != 0 && TH != 16)
+ O << (unsigned int) TH << ", ";
+
+ printOperand(MI, 1, O);
+ O << ", ";
+ printOperand(MI, 2, O);
+
+ if (!IsBookE && TH != 0 && TH != 16)
+ O << ", " << (unsigned int) TH;
+
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ // dcbf with L == 0, 1 or 3 prints as dcbf, dcbfl or dcbflp respectively;
+ // any other L value falls through to the generic printers.
+ if (MI->getOpcode() == PPC::DCBF) {
+ unsigned char L = MI->getOperand(0).getImm();
+ if (!L || L == 1 || L == 3) {
+ O << "\tdcbf";
+ if (L == 1 || L == 3)
+ O << "l";
+ if (L == 3)
+ O << "p";
+ O << " ";
+
+ printOperand(MI, 1, O);
+ O << ", ";
+ printOperand(MI, 2, O);
+
+ printAnnotation(O, Annot);
+ return;
+ }
+ }
+
+ // No special case matched: try the generated aliases, then the plain form.
+ if (!printAliasInstr(MI, O))
+ printInstruction(MI, O);
+ printAnnotation(O, Annot);
+}
+
+
+/// Prints one facet of the predicate operand at OpNo, selected by Modifier:
+///   "cc"  - the condition mnemonic (lt, le, eq, ge, gt, ne, un, nu); the
+///           _MINUS/_PLUS variants print the same mnemonic as the plain one.
+///   "pm"  - the hint suffix: "-" for *_MINUS, "+" for *_PLUS, and nothing
+///           for the plain predicates.
+///   "reg" - the operand at OpNo+1, via printOperand.
+/// PRED_BIT_SET / PRED_BIT_UNSET are rejected for "cc" and "pm".
+void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O,
+ const char *Modifier) {
+ unsigned Code = MI->getOperand(OpNo).getImm();
+
+ if (StringRef(Modifier) == "cc") {
+ switch ((PPC::Predicate)Code) {
+ case PPC::PRED_LT_MINUS:
+ case PPC::PRED_LT_PLUS:
+ case PPC::PRED_LT:
+ O << "lt";
+ return;
+ case PPC::PRED_LE_MINUS:
+ case PPC::PRED_LE_PLUS:
+ case PPC::PRED_LE:
+ O << "le";
+ return;
+ case PPC::PRED_EQ_MINUS:
+ case PPC::PRED_EQ_PLUS:
+ case PPC::PRED_EQ:
+ O << "eq";
+ return;
+ case PPC::PRED_GE_MINUS:
+ case PPC::PRED_GE_PLUS:
+ case PPC::PRED_GE:
+ O << "ge";
+ return;
+ case PPC::PRED_GT_MINUS:
+ case PPC::PRED_GT_PLUS:
+ case PPC::PRED_GT:
+ O << "gt";
+ return;
+ case PPC::PRED_NE_MINUS:
+ case PPC::PRED_NE_PLUS:
+ case PPC::PRED_NE:
+ O << "ne";
+ return;
+ case PPC::PRED_UN_MINUS:
+ case PPC::PRED_UN_PLUS:
+ case PPC::PRED_UN:
+ O << "un";
+ return;
+ case PPC::PRED_NU_MINUS:
+ case PPC::PRED_NU_PLUS:
+ case PPC::PRED_NU:
+ O << "nu";
+ return;
+ case PPC::PRED_BIT_SET:
+ case PPC::PRED_BIT_UNSET:
+ llvm_unreachable("Invalid use of bit predicate code");
+ }
+ // Reached only when Code matches none of the enumerators above.
+ llvm_unreachable("Invalid predicate code");
+ }
+
+ if (StringRef(Modifier) == "pm") {
+ switch ((PPC::Predicate)Code) {
+ case PPC::PRED_LT:
+ case PPC::PRED_LE:
+ case PPC::PRED_EQ:
+ case PPC::PRED_GE:
+ case PPC::PRED_GT:
+ case PPC::PRED_NE:
+ case PPC::PRED_UN:
+ case PPC::PRED_NU:
+ // Plain predicates carry no hint suffix.
+ return;
+ case PPC::PRED_LT_MINUS:
+ case PPC::PRED_LE_MINUS:
+ case PPC::PRED_EQ_MINUS:
+ case PPC::PRED_GE_MINUS:
+ case PPC::PRED_GT_MINUS:
+ case PPC::PRED_NE_MINUS:
+ case PPC::PRED_UN_MINUS:
+ case PPC::PRED_NU_MINUS:
+ O << "-";
+ return;
+ case PPC::PRED_LT_PLUS:
+ case PPC::PRED_LE_PLUS:
+ case PPC::PRED_EQ_PLUS:
+ case PPC::PRED_GE_PLUS:
+ case PPC::PRED_GT_PLUS:
+ case PPC::PRED_NE_PLUS:
+ case PPC::PRED_UN_PLUS:
+ case PPC::PRED_NU_PLUS:
+ O << "+";
+ return;
+ case PPC::PRED_BIT_SET:
+ case PPC::PRED_BIT_UNSET:
+ llvm_unreachable("Invalid use of bit predicate code");
+ }
+ // Reached only when Code matches none of the enumerators above.
+ llvm_unreachable("Invalid predicate code");
+ }
+
+ // The only remaining modifier prints the operand that follows the code.
+ assert(StringRef(Modifier) == "reg" &&
+ "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!");
+ printOperand(MI, OpNo+1, O);
+}
+
+/// Prints the immediate at OpNo as a hint suffix: 2 prints "-", 3 prints
+/// "+", and every other value prints nothing.
+void PPCInstPrinter::printATBitsAsHint(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const unsigned Hint = MI->getOperand(OpNo).getImm();
+  switch (Hint) {
+  case 2:
+    O << "-";
+    break;
+  case 3:
+    O << "+";
+    break;
+  default:
+    break;
+  }
+}
+
+/// Prints the 1-bit unsigned immediate at OpNo; asserts it is at most 1.
+void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const unsigned Imm = MI->getOperand(OpNo).getImm();
+  assert(Imm <= 1 && "Invalid u1imm argument!");
+  O << Imm;
+}
+
+/// Prints the 2-bit unsigned immediate at OpNo; asserts it is at most 3.
+void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const unsigned Imm = MI->getOperand(OpNo).getImm();
+  assert(Imm <= 3 && "Invalid u2imm argument!");
+  O << Imm;
+}
+
+/// Prints the 3-bit unsigned immediate at OpNo; asserts it is at most 7,
+/// the largest value that fits in three bits.
+void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  // The previous bound of 8 let an out-of-range value through the check.
+  assert(Value <= 7 && "Invalid u3imm argument!");
+  O << (unsigned int)Value;
+}
+
+/// Prints the 4-bit unsigned immediate at OpNo; asserts it is at most 15.
+void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const unsigned Imm = MI->getOperand(OpNo).getImm();
+  assert(Imm <= 15 && "Invalid u4imm argument!");
+  O << Imm;
+}
+
+/// Prints the immediate at OpNo as a signed value after sign-extending it
+/// from its low 5 bits.
+void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const int Imm = SignExtend32<5>(MI->getOperand(OpNo).getImm());
+  O << Imm;
+}
+
+/// Prints the 5-bit unsigned immediate at OpNo; asserts it is at most 31.
+void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const unsigned Imm = MI->getOperand(OpNo).getImm();
+  assert(Imm <= 31 && "Invalid u5imm argument!");
+  O << Imm;
+}
+
+/// Prints the 6-bit unsigned immediate at OpNo; asserts it is at most 63.
+void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const unsigned Imm = MI->getOperand(OpNo).getImm();
+  assert(Imm <= 63 && "Invalid u6imm argument!");
+  O << Imm;
+}
+
+/// Prints the 7-bit unsigned immediate at OpNo; asserts it is at most 127.
+void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const unsigned Imm = MI->getOperand(OpNo).getImm();
+  assert(Imm <= 127 && "Invalid u7imm argument!");
+  O << Imm;
+}
+
+// Prints the immediate at OpNo truncated to its low 8 bits, as an unsigned
+// number. Operands of BUILD_VECTOR are signed, while the XXSPLTIB operands
+// printed through here are unsigned, so no range assert is applied: the
+// value is simply truncated.
+void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const unsigned char Byte = MI->getOperand(OpNo).getImm();
+  O << static_cast<unsigned int>(Byte);
+}
+
+/// Prints the 10-bit unsigned immediate at OpNo. The value is first
+/// truncated to unsigned short and then asserted to be at most 1023.
+void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  const unsigned short Imm = MI->getOperand(OpNo).getImm();
+  assert(Imm <= 1023 && "Invalid u10imm argument!");
+  O << Imm;
+}
+
+/// Prints the 12-bit unsigned immediate at OpNo. The value is first
+/// truncated to unsigned short and then asserted to be at most 4095.
+void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  const unsigned short Imm = MI->getOperand(OpNo).getImm();
+  assert(Imm <= 4095 && "Invalid u12imm argument!");
+  O << Imm;
+}
+
+/// Prints the operand at OpNo as a signed 16-bit value when it is an
+/// immediate (truncating to short); any other operand kind is handed to
+/// printOperand.
+void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (!Op.isImm()) {
+    printOperand(MI, OpNo, O);
+    return;
+  }
+  O << static_cast<short>(Op.getImm());
+}
+
+/// Prints the operand at OpNo as an unsigned 16-bit value when it is an
+/// immediate (truncating to unsigned short); any other operand kind is
+/// handed to printOperand.
+void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (!Op.isImm()) {
+    printOperand(MI, OpNo, O);
+    return;
+  }
+  O << static_cast<unsigned short>(Op.getImm());
+}
+
+/// Prints a PC-relative branch target. An immediate operand is printed as
+/// ".+" followed by the displacement produced by printAbsBranchOperand; any
+/// other operand kind is handed to printOperand.
+void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (MI->getOperand(OpNo).isImm()) {
+    // Branches can take an immediate operand. This is used by the branch
+    // selection pass to print .+8, an eight byte displacement from the PC.
+    O << ".+";
+    printAbsBranchOperand(MI, OpNo, O);
+    return;
+  }
+
+  printOperand(MI, OpNo, O);
+}
+
+/// Prints an absolute branch target. An immediate operand is truncated to
+/// 32 bits, shifted left by 2 and printed as a signed 32-bit number; any
+/// other operand kind is handed to printOperand.
+void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (!Op.isImm()) {
+    printOperand(MI, OpNo, O);
+    return;
+  }
+
+  const unsigned Scaled = static_cast<unsigned>(Op.getImm()) << 2;
+  O << SignExtend32<32>(Scaled);
+}
+
+
+/// Prints the one-hot mask for the CR register operand at OpNo: CRn prints
+/// as 0x80 >> n (in decimal). Any register other than CR0..CR7 is
+/// unreachable.
+void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  static const unsigned CRFields[] = {PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+                                      PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7};
+
+  const unsigned CCReg = MI->getOperand(OpNo).getReg();
+  for (unsigned FieldNo = 0; FieldNo != 8; ++FieldNo) {
+    if (CRFields[FieldNo] == CCReg) {
+      O << (0x80 >> FieldNo);
+      return;
+    }
+  }
+  llvm_unreachable("Unknown CR register");
+}
+
+/// Prints a displacement(base) memory operand: the operand at OpNo via
+/// printS16ImmOperand, then the register at OpNo+1 in parentheses. A base of
+/// PPC::R0 is printed as the literal "0".
+void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  printS16ImmOperand(MI, OpNo, O);
+
+  O << '(';
+  const bool BaseIsR0 = MI->getOperand(OpNo + 1).getReg() == PPC::R0;
+  if (BaseIsR0)
+    O << "0";
+  else
+    printOperand(MI, OpNo + 1, O);
+  O << ')';
+}
+
+/// Prints a "base, index" register pair taken from the operands at OpNo and
+/// OpNo+1. A base of PPC::R0 is printed as the literal "0".
+void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  // When used as the base register, r0 reads constant zero rather than
+  // the value contained in the register. For this reason, the darwin
+  // assembler requires that we print r0 as 0 (no r) when used as the base.
+  const bool BaseIsR0 = MI->getOperand(OpNo).getReg() == PPC::R0;
+  if (BaseIsR0)
+    O << "0";
+  else
+    printOperand(MI, OpNo, O);
+
+  O << ", ";
+  printOperand(MI, OpNo + 1, O);
+}
+
+/// Prints a TLS call target as "symbol(operand)[@variant]". The operand at
+/// OpNo must be an MCSymbolRefExpr; the operand at OpNo+1 goes inside the
+/// parentheses. The "@variant" suffix is emitted only when the reference's
+/// kind is not VK_None.
+void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCSymbolRefExpr &SymRef =
+      cast<MCSymbolRefExpr>(*MI->getOperand(OpNo).getExpr());
+
+  O << SymRef.getSymbol().getName();
+  O << '(';
+  printOperand(MI, OpNo + 1, O);
+  O << ')';
+
+  // On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must
+  // come at the _end_ of the expression.
+  if (SymRef.getKind() == MCSymbolRefExpr::VK_None)
+    return;
+  O << '@' << MCSymbolRefExpr::getVariantKindName(SymRef.getKind());
+}
+
+
+/// stripRegisterPrefix - Returns a pointer past the alphabetic prefix of a
+/// register name so that only the number is left. Names starting with 'r',
+/// 'f', 'q' or 'v' lose one character, or two when the second character is
+/// 's'; names starting with "cr" lose two. Anything else, and every name
+/// when -ppc-asm-full-reg-names or -ppc-vsr-nums-as-vr is set, is returned
+/// unchanged.
+static const char *stripRegisterPrefix(const char *RegName) {
+  if (FullRegNames || ShowVSRNumsAsVR)
+    return RegName;
+
+  const char First = RegName[0];
+  if (First == 'r' || First == 'f' || First == 'q' /* QPX */ || First == 'v')
+    return RegName + (RegName[1] == 's' ? 2 : 1);
+
+  if (First == 'c' && RegName[1] == 'r')
+    return RegName + 2;
+
+  return RegName;
+}
+
+/// Prints the operand at OpNo. Immediates are printed as signed numbers and
+/// expressions through MCExpr::print. Registers are printed by name, after
+/// remapping VR/VF registers to VSX numbering when the instruction is flagged
+/// UseVSXReg (unless -ppc-vsr-nums-as-vr is set), and with the name prefix
+/// stripped when not using Darwin syntax.
+void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+
+  if (!Op.isReg()) {
+    if (Op.isImm()) {
+      O << Op.getImm();
+      return;
+    }
+
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    Op.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  unsigned Reg = Op.getReg();
+
+  // There are VSX instructions that use VSX register numbering (vs0 - vs63)
+  // as well as those that use VMX register numbering (v0 - v31 which
+  // correspond to vs32 - vs63). If we have an instruction that uses VSX
+  // numbering, we need to convert the VMX registers to VSX registers.
+  // Namely, we print 32-63 when the instruction operates on one of the
+  // VMX registers.
+  // (Please synchronize with PPCAsmPrinter::printOperand)
+  const bool UsesVSXNumbering =
+      (MII.get(MI->getOpcode()).TSFlags & PPCII::UseVSXReg) != 0;
+  if (UsesVSXNumbering && !ShowVSRNumsAsVR) {
+    if (PPCInstrInfo::isVRRegister(Reg))
+      Reg = PPC::VSX32 + (Reg - PPC::V0);
+    else if (PPCInstrInfo::isVFRegister(Reg))
+      Reg = PPC::VSX32 + (Reg - PPC::VF0);
+  }
+
+  // The linux and AIX assembler does not take register prefixes.
+  const char *Name = getRegisterName(Reg);
+  O << (isDarwinSyntax() ? Name : stripRegisterPrefix(Name));
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
new file mode 100644
index 000000000000..9c79ffb1176c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -0,0 +1,74 @@
+//===- PPCInstPrinter.h - Convert PPC MCInst to assembly syntax -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a PPC MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
+#define LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+/// MCInstPrinter for PowerPC. Besides the tblgen-generated printers it
+/// declares one print method per custom operand kind.
+class PPCInstPrinter : public MCInstPrinter {
+ // Set from the constructor's isDarwin argument; exposed via isDarwinSyntax().
+ bool IsDarwin;
+public:
+ PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI, bool isDarwin)
+ : MCInstPrinter(MAI, MII, MRI), IsDarwin(isDarwin) {}
+
+ /// Returns the isDarwin flag this printer was constructed with.
+ bool isDarwinSyntax() const {
+ return IsDarwin;
+ }
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx,
+ raw_ostream &OS);
+
+ // Generic operand printer (register, immediate or expression).
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ // Modifier selects what is printed; it defaults to nullptr here.
+ void printPredicateOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier = nullptr);
+ void printATBitsAsHint(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+ // Immediate printers, named after the signedness (U/S) and bit width of
+ // the operand they print.
+ void printU1ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU3ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU7ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU10ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ // Branch targets and TLS call operands.
+ void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+ // Prints a CR register operand as its one-hot field mask.
+ void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+ // Memory operands: displacement(base) and "base, index" forms.
+ void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
new file mode 100644
index 000000000000..5847b3a52bfc
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -0,0 +1,241 @@
+//===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+/// Masks Value down to the bits a fixup of the given Kind is allowed to
+/// write. Data and "no fixup" kinds pass the value through unchanged; an
+/// unrecognised kind is unreachable.
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  // Plain data and the no-op fixup take the value as-is.
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+  case PPC::fixup_ppc_nofixup:
+    return Value;
+
+  // Full 16-bit field.
+  case PPC::fixup_ppc_half16:
+    return Value & 0xffff;
+
+  // 16-bit field with the two low bits cleared.
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_brcond14abs:
+  case PPC::fixup_ppc_half16ds:
+    return Value & 0xfffc;
+
+  // 26-bit field with the two low bits cleared.
+  case PPC::fixup_ppc_br24:
+  case PPC::fixup_ppc_br24abs:
+    return Value & 0x3fffffc;
+
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  }
+}
+
+/// Returns how many bytes of the fragment a fixup of the given Kind touches.
+/// An unrecognised kind is unreachable.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  case PPC::fixup_ppc_nofixup:
+    return 0;
+
+  case FK_Data_1:
+    return 1;
+
+  case FK_Data_2:
+  case PPC::fixup_ppc_half16:
+  case PPC::fixup_ppc_half16ds:
+    return 2;
+
+  case FK_Data_4:
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_brcond14abs:
+  case PPC::fixup_ppc_br24:
+  case PPC::fixup_ppc_br24abs:
+    return 4;
+
+  case FK_Data_8:
+    return 8;
+
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  }
+}
+
+namespace {
+
+/// Common PowerPC assembler backend: describes the target fixup kinds,
+/// applies resolved fixups to fragment data and writes nop padding. The
+/// object-writer factory is supplied by the subclasses.
+class PPCAsmBackend : public MCAsmBackend {
+  const Target &TheTarget;
+  bool IsLittleEndian;
+
+public:
+  PPCAsmBackend(const Target &T, bool isLittle)
+      : MCAsmBackend(), TheTarget(T), IsLittleEndian(isLittle) {}
+
+  unsigned getNumFixupKinds() const override {
+    return PPC::NumTargetFixupKinds;
+  }
+
+  /// Returns the descriptor for Kind. Generic kinds are delegated to
+  /// MCAsmBackend; target kinds come from the table that matches this
+  /// backend's endianness.
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo InfosBE[PPC::NumTargetFixupKinds] = {
+      // name                    offset  bits  flags
+      { "fixup_ppc_br24",        6,      24,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_brcond14",    16,     14,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_br24abs",     6,      24,   0 },
+      { "fixup_ppc_brcond14abs", 16,     14,   0 },
+      { "fixup_ppc_half16",       0,     16,   0 },
+      { "fixup_ppc_half16ds",     0,     14,   0 },
+      { "fixup_ppc_nofixup",      0,      0,   0 }
+    };
+    const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = {
+      // name                    offset  bits  flags
+      { "fixup_ppc_br24",        2,      24,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_brcond14",    2,      14,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_br24abs",     2,      24,   0 },
+      { "fixup_ppc_brcond14abs", 2,      14,   0 },
+      { "fixup_ppc_half16",      0,      16,   0 },
+      { "fixup_ppc_half16ds",    2,      14,   0 },
+      { "fixup_ppc_nofixup",     0,       0,   0 }
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+  }
+
+  /// ORs the (masked) fixup Value into Data at the fixup's offset, one byte
+  /// at a time in this backend's byte order. A value that masks to zero
+  /// leaves Data untouched. DataSize and IsPCRel are not consulted.
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override {
+    Value = adjustFixupValue(Fixup.getKind(), Value);
+    if (!Value) return;           // Doesn't change encoding.
+
+    unsigned Offset = Fixup.getOffset();
+    unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+
+    // For each byte of the fragment that the fixup touches, mask in the bits
+    // from the fixup value. The Value has been "split up" into the appropriate
+    // bitfields above.
+    for (unsigned i = 0; i != NumBytes; ++i) {
+      unsigned Idx = IsLittleEndian ? i : (NumBytes - 1 - i);
+      Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
+    }
+  }
+
+  /// Clears IsResolved for br24 / br24abs fixups whose target is an ELF
+  /// symbol with local-entry bits set in its st_other value; every other
+  /// fixup is left as it was. Value is not modified.
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override {
+    switch ((PPC::Fixups)Fixup.getKind()) {
+    default: break;
+    case PPC::fixup_ppc_br24:
+    case PPC::fixup_ppc_br24abs:
+      // If the target symbol has a local entry point we must not attempt
+      // to resolve the fixup directly.  Emit a relocation and leave
+      // resolution of the final target address to the linker.
+      if (const MCSymbolRefExpr *A = Target.getSymA()) {
+        if (const auto *S = dyn_cast<MCSymbolELF>(&A->getSymbol())) {
+          // The "other" values are stored in the last 6 bits of the second
+          // byte. The traditional defines for STO values assume the full byte
+          // and thus the shift to pack it.
+          unsigned Other = S->getOther() << 2;
+          if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
+            IsResolved = false;
+        }
+      }
+      break;
+    }
+  }
+
+  /// Relaxation is not implemented: no instruction is ever reported as
+  /// relaxable, so the two hooks below are never expected to run.
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
+    // FIXME.
+    return false;
+  }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    // FIXME.
+    // Name this hook in the diagnostic rather than relaxInstruction().
+    llvm_unreachable("fixupNeedsRelaxation() unimplemented");
+  }
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {
+    // FIXME.
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+  /// Emits Count bytes of padding: as many 4-byte 0x60000000 words as fit,
+  /// followed by zero bytes for the remainder. Always returns true.
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+    uint64_t NumNops = Count / 4;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->write32(0x60000000);
+
+    OW->WriteZeros(Count % 4);
+
+    return true;
+  }
+
+  /// Returns 8 for the "ppc64" and "ppc64le" targets and 4 otherwise (the
+  /// only other name expected is "ppc32").
+  unsigned getPointerSize() const {
+    StringRef Name = TheTarget.getName();
+    if (Name == "ppc64" || Name == "ppc64le") return 8;
+    assert(Name == "ppc32" && "Unknown target name!");
+    return 4;
+  }
+
+  bool isLittleEndian() const {
+    return IsLittleEndian;
+  }
+};
+} // end anonymous namespace
+
+
+// FIXME: This should be in a separate file.
+namespace {
+  /// Backend variant that creates Mach-O object writers. It is always
+  /// constructed big-endian; the CPU type and 64-bit flag follow
+  /// getPointerSize().
+  class DarwinPPCAsmBackend : public PPCAsmBackend {
+  public:
+    DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
+
+    MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+      const bool Is64Bit = getPointerSize() == 8;
+      const auto CPUType =
+          Is64Bit ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC;
+      return createPPCMachObjectWriter(OS, /*Is64Bit=*/Is64Bit, CPUType,
+                                       MachO::CPU_SUBTYPE_POWERPC_ALL);
+    }
+  };
+
+  /// Backend variant that creates ELF object writers, passing along the
+  /// OS ABI it was constructed with, the backend's endianness and whether
+  /// getPointerSize() reports 8.
+  class ELFPPCAsmBackend : public PPCAsmBackend {
+    uint8_t OSABI;
+
+  public:
+    ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI)
+        : PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { }
+
+    MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+      const bool Is64Bit = getPointerSize() == 8;
+      return createPPCELFObjectWriter(OS, Is64Bit, isLittleEndian(), OSABI);
+    }
+  };
+
+} // end anonymous namespace
+
+/// Creates the assembler backend for triple TT: a DarwinPPCAsmBackend on
+/// Darwin, otherwise an ELFPPCAsmBackend that is little-endian only for the
+/// ppc64le architecture and carries the OS ABI derived from the triple's OS.
+/// MRI, CPU and Options are not used. The caller receives a new-allocated
+/// object.
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        const Triple &TT, StringRef CPU,
+                                        const MCTargetOptions &Options) {
+  if (!TT.isOSDarwin()) {
+    const uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+    const bool IsLittleEndian = TT.getArch() == Triple::ppc64le;
+    return new ELFPPCAsmBackend(T, IsLittleEndian, OSABI);
+  }
+
+  return new DarwinPPCAsmBackend(T);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
new file mode 100644
index 000000000000..fd279c60f3f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -0,0 +1,425 @@
+//===-- PPCELFObjectWriter.cpp - PPC ELF Writer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+ class PPCELFObjectWriter : public MCELFObjectTargetWriter { // PPC-specific overrides of the ELF target-writer hooks.
+ public:
+ PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI);
+
+ protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target, // Maps a fixup kind plus symbol modifier to an ELF::R_PPC* relocation type.
+ const MCFixup &Fixup, bool IsPCRel) const override;
+
+ bool needsRelocateWithSymbol(const MCSymbol &Sym, // True when a relocation of the given Type has to keep its target symbol.
+ unsigned Type) const override;
+ };
+}
+
+PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) // Picks EM_PPC64 or EM_PPC from Is64Bit and always enables relocation addends.
+ : MCELFObjectTargetWriter(Is64Bit, OSABI,
+ Is64Bit ? ELF::EM_PPC64 : ELF::EM_PPC,
+ /*HasRelocationAddend*/ true) {}
+
+static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target, // Returns the symbol modifier to use when choosing a relocation for Fixup.
+ const MCFixup &Fixup) {
+ const MCExpr *Expr = Fixup.getValue();
+
+ if (Expr->getKind() != MCExpr::Target) // not a target-specific expression: take the modifier recorded on the MCValue
+ return Target.getAccessVariant();
+
+ switch (cast<PPCMCExpr>(Expr)->getKind()) { // each PPCMCExpr kind maps to the MCSymbolRefExpr kind of the same name
+ case PPCMCExpr::VK_PPC_None:
+ return MCSymbolRefExpr::VK_None;
+ case PPCMCExpr::VK_PPC_LO:
+ return MCSymbolRefExpr::VK_PPC_LO;
+ case PPCMCExpr::VK_PPC_HI:
+ return MCSymbolRefExpr::VK_PPC_HI;
+ case PPCMCExpr::VK_PPC_HA:
+ return MCSymbolRefExpr::VK_PPC_HA;
+ case PPCMCExpr::VK_PPC_HIGHERA:
+ return MCSymbolRefExpr::VK_PPC_HIGHERA;
+ case PPCMCExpr::VK_PPC_HIGHER:
+ return MCSymbolRefExpr::VK_PPC_HIGHER;
+ case PPCMCExpr::VK_PPC_HIGHEST:
+ return MCSymbolRefExpr::VK_PPC_HIGHEST;
+ case PPCMCExpr::VK_PPC_HIGHESTA:
+ return MCSymbolRefExpr::VK_PPC_HIGHESTA;
+ }
+ llvm_unreachable("unknown PPCMCExpr kind"); // reached only if the switch above misses a kind
+}
+
+unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, // Returns the ELF relocation type for Fixup; Ctx is not read in this body.
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup);
+
+ // Determine the relocation type: outer switches select on the fixup kind, inner switches on the symbol modifier.
+ unsigned Type;
+ if (IsPCRel) { // PC-relative relocations
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case PPC::fixup_ppc_br24:
+ case PPC::fixup_ppc_br24abs:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_PPC_REL24;
+ break;
+ case MCSymbolRefExpr::VK_PLT:
+ Type = ELF::R_PPC_PLTREL24;
+ break;
+ case MCSymbolRefExpr::VK_PPC_LOCAL:
+ Type = ELF::R_PPC_LOCAL24PC;
+ break;
+ }
+ break;
+ case PPC::fixup_ppc_brcond14:
+ case PPC::fixup_ppc_brcond14abs:
+ Type = ELF::R_PPC_REL14;
+ break;
+ case PPC::fixup_ppc_half16:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_PPC_REL16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_LO:
+ Type = ELF::R_PPC_REL16_LO;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HI:
+ Type = ELF::R_PPC_REL16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HA:
+ Type = ELF::R_PPC_REL16_HA;
+ break;
+ }
+ break;
+ case PPC::fixup_ppc_half16ds:
+ Target.print(errs());
+ errs() << '\n';
+ report_fatal_error("Invalid PC-relative half16ds relocation"); // no break follows; presumably this call does not return
+ case FK_Data_4:
+ case FK_PCRel_4:
+ Type = ELF::R_PPC_REL32;
+ break;
+ case FK_Data_8:
+ case FK_PCRel_8:
+ Type = ELF::R_PPC64_REL64;
+ break;
+ }
+ } else { // absolute (non-PC-relative) relocations
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+ case PPC::fixup_ppc_br24abs:
+ Type = ELF::R_PPC_ADDR24;
+ break;
+ case PPC::fixup_ppc_brcond14abs:
+ Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?
+ break;
+ case PPC::fixup_ppc_half16:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_PPC_ADDR16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_LO:
+ Type = ELF::R_PPC_ADDR16_LO;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HI:
+ Type = ELF::R_PPC_ADDR16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HA:
+ Type = ELF::R_PPC_ADDR16_HA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HIGHER:
+ Type = ELF::R_PPC64_ADDR16_HIGHER;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HIGHERA:
+ Type = ELF::R_PPC64_ADDR16_HIGHERA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HIGHEST:
+ Type = ELF::R_PPC64_ADDR16_HIGHEST;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+ Type = ELF::R_PPC64_ADDR16_HIGHESTA;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ Type = ELF::R_PPC_GOT16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_LO:
+ Type = ELF::R_PPC_GOT16_LO;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_HI:
+ Type = ELF::R_PPC_GOT16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_HA:
+ Type = ELF::R_PPC_GOT16_HA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TOC:
+ Type = ELF::R_PPC64_TOC16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TOC_LO:
+ Type = ELF::R_PPC64_TOC16_LO;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TOC_HI:
+ Type = ELF::R_PPC64_TOC16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TOC_HA:
+ Type = ELF::R_PPC64_TOC16_HA;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ Type = ELF::R_PPC_TPREL16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_LO:
+ Type = ELF::R_PPC_TPREL16_LO;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HI:
+ Type = ELF::R_PPC_TPREL16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HA:
+ Type = ELF::R_PPC_TPREL16_HA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER:
+ Type = ELF::R_PPC64_TPREL16_HIGHER;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGHERA:
+ Type = ELF::R_PPC64_TPREL16_HIGHERA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGHEST:
+ Type = ELF::R_PPC64_TPREL16_HIGHEST;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGHESTA:
+ Type = ELF::R_PPC64_TPREL16_HIGHESTA;
+ break;
+ case MCSymbolRefExpr::VK_DTPREL:
+ Type = ELF::R_PPC64_DTPREL16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
+ Type = ELF::R_PPC64_DTPREL16_LO;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HI:
+ Type = ELF::R_PPC64_DTPREL16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HA:
+ Type = ELF::R_PPC64_DTPREL16_HA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER:
+ Type = ELF::R_PPC64_DTPREL16_HIGHER;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHERA:
+ Type = ELF::R_PPC64_DTPREL16_HIGHERA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHEST:
+ Type = ELF::R_PPC64_DTPREL16_HIGHEST;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHESTA:
+ Type = ELF::R_PPC64_DTPREL16_HIGHESTA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSGD:
+ if (is64Bit())
+ Type = ELF::R_PPC64_GOT_TLSGD16;
+ else
+ Type = ELF::R_PPC_GOT_TLSGD16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO:
+ Type = ELF::R_PPC64_GOT_TLSGD16_LO;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HI:
+ Type = ELF::R_PPC64_GOT_TLSGD16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA:
+ Type = ELF::R_PPC64_GOT_TLSGD16_HA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSLD:
+ if (is64Bit())
+ Type = ELF::R_PPC64_GOT_TLSLD16;
+ else
+ Type = ELF::R_PPC_GOT_TLSLD16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO:
+ Type = ELF::R_PPC64_GOT_TLSLD16_LO;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HI:
+ Type = ELF::R_PPC64_GOT_TLSLD16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA:
+ Type = ELF::R_PPC64_GOT_TLSLD16_HA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TPREL:
+ /* We don't have R_PPC64_GOT_TPREL16, but since GOT offsets
+ are always 4-aligned, we can use R_PPC64_GOT_TPREL16_DS. */
+ Type = ELF::R_PPC64_GOT_TPREL16_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO:
+ /* We don't have R_PPC64_GOT_TPREL16_LO, but since GOT offsets
+ are always 4-aligned, we can use R_PPC64_GOT_TPREL16_LO_DS. */
+ Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HI:
+ Type = ELF::R_PPC64_GOT_TPREL16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_DTPREL:
+ /* We don't have R_PPC64_GOT_DTPREL16, but since GOT offsets
+ are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_DS. */
+ Type = ELF::R_PPC64_GOT_DTPREL16_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO:
+ /* We don't have R_PPC64_GOT_DTPREL16_LO, but since GOT offsets
+ are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_LO_DS. */
+ Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA:
+ Type = ELF::R_PPC64_GOT_TPREL16_HA;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HI:
+ Type = ELF::R_PPC64_GOT_DTPREL16_HI;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HA:
+ Type = ELF::R_PPC64_GOT_DTPREL16_HA;
+ break;
+ }
+ break;
+ case PPC::fixup_ppc_half16ds:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_PPC64_ADDR16_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_LO:
+ Type = ELF::R_PPC64_ADDR16_LO_DS;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ Type = ELF::R_PPC64_GOT16_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_LO:
+ Type = ELF::R_PPC64_GOT16_LO_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TOC:
+ Type = ELF::R_PPC64_TOC16_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TOC_LO:
+ Type = ELF::R_PPC64_TOC16_LO_DS;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ Type = ELF::R_PPC64_TPREL16_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_LO:
+ Type = ELF::R_PPC64_TPREL16_LO_DS;
+ break;
+ case MCSymbolRefExpr::VK_DTPREL:
+ Type = ELF::R_PPC64_DTPREL16_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
+ Type = ELF::R_PPC64_DTPREL16_LO_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TPREL:
+ Type = ELF::R_PPC64_GOT_TPREL16_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO:
+ Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_DTPREL:
+ Type = ELF::R_PPC64_GOT_DTPREL16_DS;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO:
+ Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
+ break;
+ }
+ break;
+ case PPC::fixup_ppc_nofixup:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_PPC_TLSGD:
+ if (is64Bit())
+ Type = ELF::R_PPC64_TLSGD;
+ else
+ Type = ELF::R_PPC_TLSGD;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TLSLD:
+ if (is64Bit())
+ Type = ELF::R_PPC64_TLSLD;
+ else
+ Type = ELF::R_PPC_TLSLD;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TLS:
+ if (is64Bit())
+ Type = ELF::R_PPC64_TLS;
+ else
+ Type = ELF::R_PPC_TLS;
+ break;
+ }
+ break;
+ case FK_Data_8:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_PPC_TOCBASE:
+ Type = ELF::R_PPC64_TOC;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_PPC64_ADDR64;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPMOD:
+ Type = ELF::R_PPC64_DTPMOD64;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ Type = ELF::R_PPC64_TPREL64;
+ break;
+ case MCSymbolRefExpr::VK_DTPREL:
+ Type = ELF::R_PPC64_DTPREL64;
+ break;
+ }
+ break;
+ case FK_Data_4:
+ Type = ELF::R_PPC_ADDR32; // the modifier is not consulted for 4-byte data
+ break;
+ case FK_Data_2:
+ Type = ELF::R_PPC_ADDR16; // the modifier is not consulted for 2-byte data
+ break;
+ }
+ }
+ return Type;
+}
+
+bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, // Decides whether a relocation of Type must keep referencing Sym.
+ unsigned Type) const {
+ switch (Type) {
+ default:
+ return false; // every type other than R_PPC_REL24 answers false
+
+ case ELF::R_PPC_REL24:
+ // If the target symbol has a local entry point, we must keep the
+ // target symbol to preserve that information for the linker.
+ // The "other" values are stored in the last 6 bits of the second byte.
+ // The traditional defines for STO values assume the full byte and thus
+ // the shift to pack it.
+ unsigned Other = cast<MCSymbolELF>(Sym).getOther() << 2;
+ return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0; // true when any bit under the local-entry mask is set
+ }
+}
+
+MCObjectWriter *llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS, // Factory: wraps a new PPCELFObjectWriter in an ELF object writer on OS.
+ bool Is64Bit,
+ bool IsLittleEndian,
+ uint8_t OSABI) {
+ MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI); // handed to createELFObjectWriter; presumably it takes ownership (not visible here)
+ return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
new file mode 100644
index 000000000000..ae43e59d3cb1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -0,0 +1,56 @@
+//===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCFIXUPKINDS_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+#undef PPC
+
+namespace llvm {
+namespace PPC {
+enum Fixups {
+ /// fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b'
+ /// and 'bl'.
+ fixup_ppc_br24 = FirstTargetFixupKind,
+
+ /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional
+ /// branches.
+ fixup_ppc_brcond14,
+
+ /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches
+ /// like 'ba' and 'bla'.
+ fixup_ppc_br24abs,
+
+ /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional
+ /// branches.
+ fixup_ppc_brcond14abs,
+
+ /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo)
+ /// or ha16(_foo) for instrs like 'li' or 'addis'.
+ fixup_ppc_half16,
+
+ /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with
+ /// implied 2 zero bits for instrs like 'std'.
+ fixup_ppc_half16ds,
+
+ /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call
+ /// to __tls_get_addr for the TLS general and local dynamic models,
+ /// or inserts the thread-pointer register number.
+ fixup_ppc_nofixup,
+
+ // Marker: one past the last PPC fixup kind; used below to compute the count.
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
new file mode 100644
index 000000000000..d8fab5b7c01a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -0,0 +1,83 @@
+//===-- PPCMCAsmInfo.cpp - PPC asm properties -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PPCMCAsmInfoDarwin and PPCELFMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void PPCMCAsmInfoDarwin::anchor() { } // Intentionally empty out-of-line definition of the virtual anchor().
+
+PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) { // Sets the Darwin PPC assembler properties; T is only used for the OS X version check.
+ if (is64Bit) {
+ PointerSize = CalleeSaveStackSlotSize = 8; // left at the inherited values when not 64-bit
+ }
+ IsLittleEndian = false; // Darwin PPC is configured big-endian unconditionally here
+
+ CommentString = ";";
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ if (!is64Bit)
+ Data64bitsDirective = nullptr; // We can't emit a 64-bit unit in PPC32 mode.
+
+ AssemblerDialect = 1; // New-Style mnemonics.
+ SupportsDebugInformation= true; // Debug information.
+
+ // The installed assembler for OSX < 10.6 lacks some directives.
+ // FIXME: this should really be a check on the assembler characteristics
+ // rather than OS version
+ if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+ HasWeakDefCanBeHiddenDirective = false;
+
+ UseIntegratedAssembler = true;
+}
+
+void PPCELFMCAsmInfo::anchor() { } // Intentionally empty out-of-line definition of the anchor() override.
+
+PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { // Sets the ELF PPC assembler properties; T is only used to detect ppc64le.
+ // FIXME: This is not always needed. For example, it is not needed in the
+ // v2 abi.
+ NeedsLocalForSize = true;
+
+ if (is64Bit) {
+ PointerSize = CalleeSaveStackSlotSize = 8; // left at the inherited values when not 64-bit
+ }
+ IsLittleEndian = T.getArch() == Triple::ppc64le;
+
+ // ".comm align is in bytes but .align is pow-2."
+ AlignmentIsInBytes = false;
+
+ CommentString = "#";
+
+ // Uses '.section' before '.bss' directive
+ UsesELFSectionDirectiveForBSS = true;
+
+ // Debug Information
+ SupportsDebugInformation = true;
+
+ DollarIsPC = true;
+
+ // Minimum instruction alignment (also listed with the DWARF settings)
+ MinInstAlignment = 4;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ ZeroDirective = "\t.space\t";
+ Data64bitsDirective = is64Bit ? "\t.quad\t" : nullptr; // no 64-bit data directive in 32-bit mode
+ AssemblerDialect = 1; // New-Style mnemonics.
+ LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
+
+ UseIntegratedAssembler = true;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
new file mode 100644
index 000000000000..e252ac944d40
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -0,0 +1,39 @@
+//===-- PPCMCAsmInfo.h - PPC asm properties --------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PPCMCAsmInfoDarwin and PPCELFMCAsmInfo classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { // Assembler properties for PPC on Darwin.
+ virtual void anchor();
+
+public:
+ explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple &); // is64Bit selects 64-bit settings; the Triple supplies OS details.
+};
+
+class PPCELFMCAsmInfo : public MCAsmInfoELF { // Assembler properties for PPC on ELF targets.
+ void anchor() override;
+
+public:
+ explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &); // is64Bit selects 64-bit settings; the Triple supplies the architecture.
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
new file mode 100644
index 000000000000..017d21af08a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -0,0 +1,389 @@
+//===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class PPCMCCodeEmitter : public MCCodeEmitter { // Encodes PPC MCInsts into bytes and records fixups for symbolic operands.
+ PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete; // non-copyable
+ void operator=(const PPCMCCodeEmitter &) = delete;
+
+ const MCInstrInfo &MCII;
+ const MCContext &CTX;
+ bool IsLittleEndian; // cached from the context's MCAsmInfo at construction
+
+public:
+ PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), CTX(ctx),
+ IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {}
+
+ ~PPCMCCodeEmitter() override {}
+
+ unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo, // Operand encoders follow: each returns the operand's field bits and may append fixups.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMachineOpValue - Return binary encoding of a register or immediate
+ /// operand. Any other operand kind trips an assertion in the definition.
+ unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS, // Writes MI's encoding to OS (0, 4 or 8 bytes) and appends its fixups to Fixups.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override {
+ verifyInstructionPredicates(MI,
+ computeAvailableFeatures(STI.getFeatureBits()));
+
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+
+ uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+
+ // Emit the encoding in the target's byte order; the size comes from the instruction descriptor.
+ unsigned Size = Desc.getSize();
+ switch (Size) {
+ case 0: // nothing to emit
+ break;
+ case 4:
+ if (IsLittleEndian) {
+ support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
+ } else {
+ support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
+ }
+ break;
+ case 8:
+ // If we emit a pair of instructions, the first one is
+ // always in the top 32 bits, even on little-endian.
+ if (IsLittleEndian) {
+ uint64_t Swapped = (Bits << 32) | (Bits >> 32); // exchange the two 32-bit halves so the first instruction is written first
+ support::endian::Writer<support::little>(OS).write<uint64_t>(Swapped);
+ } else {
+ support::endian::Writer<support::big>(OS).write<uint64_t>(Bits);
+ }
+ break;
+ default:
+ llvm_unreachable ("Invalid instruction size");
+ }
+
+ ++MCNumEmitted; // Keep track of the # of mi's emitted.
+ }
+
+private:
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; // declared here; no definition appears in this part of the file
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII, // Factory: returns a newly allocated PPCMCCodeEmitter.
+ const MCRegisterInfo &MRI, // not used in this body
+ MCContext &Ctx) {
+ return new PPCMCCodeEmitter(MCII, Ctx);
+}
+
+unsigned PPCMCCodeEmitter:: // Encodes the target operand of a direct branch.
+getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); // already concrete: encode directly
+
+ // Expression operand: record a fixup_ppc_br24 at offset 0 and leave the field zero.
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_br24));
+ return 0;
+}
+
+unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo, // Encodes the target operand of a conditional branch.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); // already concrete: encode directly
+
+ // Expression operand: record a fixup_ppc_brcond14 at offset 0 and leave the field zero.
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_brcond14));
+ return 0;
+}
+
+unsigned PPCMCCodeEmitter:: // Encodes the target operand of an absolute direct branch.
+getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); // already concrete: encode directly
+
+ // Expression operand: record a fixup_ppc_br24abs at offset 0 and leave the field zero.
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_br24abs));
+ return 0;
+}
+
+unsigned PPCMCCodeEmitter:: // Encodes the target operand of an absolute conditional branch.
+getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); // already concrete: encode directly
+
+ // Expression operand: record a fixup_ppc_brcond14abs at offset 0 and leave the field zero.
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_brcond14abs));
+ return 0;
+}
+
+unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, // Encodes a 16-bit immediate operand.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); // already concrete: encode directly
+
+ // Expression operand: record a fixup_ppc_half16 at offset 0 (little-endian) or 2 (big-endian); the field stays zero.
+ Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_half16));
+ return 0;
+}
+
+unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo, // Operand OpNo is the displacement, OpNo+1 the base register.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Encode (imm, reg) as a memri, which has the low 16-bits as the
+ // displacement and the next 5 bits as the register #.
+ assert(MI.getOperand(OpNo+1).isReg());
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 16;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm())
+ return (getMachineOpValue(MI, MO, Fixups, STI) & 0xFFFF) | RegBits; // immediate displacement: mask to 16 bits
+
+ // Symbolic displacement: record a fixup_ppc_half16 (offset 0 little-endian, 2 big-endian) and return only the register bits.
+ Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_half16));
+ return RegBits;
+}
+
+
+unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo, // Operand OpNo is the displacement, OpNo+1 the base register.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Encode (imm, reg) as a memrix, which has the low 14-bits as the
+ // displacement and the next 5 bits as the register #.
+ assert(MI.getOperand(OpNo+1).isReg());
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 14;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm())
+ return ((getMachineOpValue(MI, MO, Fixups, STI) >> 2) & 0x3FFF) | RegBits; // drop the two low bits of the immediate, keep 14
+
+ // Symbolic displacement: record a fixup_ppc_half16ds (offset 0 little-endian, 2 big-endian) and return only the register bits.
+ Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_half16ds));
+ return RegBits;
+}
+
+unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, // Operand OpNo is the displacement, OpNo+1 the base register.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Encode (imm, reg) as a memrix16, which has the low 12-bits as the
+ // displacement and the next 5 bits as the register #.
+ assert(MI.getOperand(OpNo+1).isReg());
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ assert(MO.isImm()); // only immediate displacements are handled; no fixup path exists here
+
+ return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits; // drop the four low bits of the immediate, keep 12
+}
+
+unsigned PPCMCCodeEmitter::getSPE8DisEncoding(const MCInst &MI, unsigned OpNo, // Operand OpNo is the displacement, OpNo+1 the base register.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI)
+ const {
+ // Encode (imm, reg) as a spe8dis, which has the low 5-bits of (imm / 8)
+ // as the displacement and the next 5 bits as the register #.
+ assert(MI.getOperand(OpNo+1).isReg());
+ uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 5;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ assert(MO.isImm()); // only immediate displacements are handled
+ uint32_t Imm = getMachineOpValue(MI, MO, Fixups, STI) >> 3; // imm / 8
+ return reverseBits(Imm | RegBits) >> 22; // presumably a 32-bit reversal, so the low 10 bits come out bit-reversed in a 10-bit result
+}
+
+
+unsigned PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo, // Operand OpNo is the displacement, OpNo+1 the base register.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI)
+ const {
+ // Encode (imm, reg) as a spe4dis, which has the low 5-bits of (imm / 4)
+ // as the displacement and the next 5 bits as the register #.
+ assert(MI.getOperand(OpNo+1).isReg());
+ uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 5;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ assert(MO.isImm()); // only immediate displacements are handled
+ uint32_t Imm = getMachineOpValue(MI, MO, Fixups, STI) >> 2; // imm / 4
+ return reverseBits(Imm | RegBits) >> 22; // presumably a 32-bit reversal, so the low 10 bits come out bit-reversed in a 10-bit result
+}
+
+
+unsigned PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo, // Operand OpNo is the displacement, OpNo+1 the base register.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI)
+ const {
+ // Encode (imm, reg) as a spe2dis, which has the low 5-bits of (imm / 2)
+ // as the displacement and the next 5 bits as the register #.
+ assert(MI.getOperand(OpNo+1).isReg());
+ uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 5;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ assert(MO.isImm()); // only immediate displacements are handled
+ uint32_t Imm = getMachineOpValue(MI, MO, Fixups, STI) >> 1; // imm / 2
+ return reverseBits(Imm | RegBits) >> 22; // presumably a 32-bit reversal, so the low 10 bits come out bit-reversed in a 10-bit result
+}
+
+
+unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, // Encodes a TLS register operand given as a register or as a symbol expression.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg()) return getMachineOpValue(MI, MO, Fixups, STI); // plain register: encode directly
+
+ // Add a fixup for the TLS register, which simply provides a relocation
+ // hint to the linker that this statement is part of a relocation sequence.
+ // Return the thread-pointer register's encoding.
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_nofixup));
+ const Triple &TT = STI.getTargetTriple();
+ bool isPPC64 = TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le;
+ return CTX.getRegisterInfo()->getEncodingValue(isPPC64 ? PPC::X13 : PPC::R2); // X13 on 64-bit targets, R2 otherwise
+}
+
+unsigned PPCMCCodeEmitter::getTLSCallEncoding(const MCInst &MI, unsigned OpNo, // Operand OpNo is the branch target, OpNo+1 the TLS symbol.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // For special TLS calls, we need two fixups; one for the branch target
+ // (__tls_get_addr), which we create via getDirectBrEncoding as usual,
+ // and one for the TLSGD or TLSLD symbol, which is emitted here.
+ const MCOperand &MO = MI.getOperand(OpNo+1);
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), // the TLS-symbol fixup is pushed before the branch fixup
+ (MCFixupKind)PPC::fixup_ppc_nofixup));
+ return getDirectBrEncoding(MI, OpNo, Fixups, STI);
+}
+
+unsigned PPCMCCodeEmitter:: // Encodes a CR-field operand as a single-bit mask.
+get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 || // only the MTOCRF/MFOCRF opcodes with a CR0..CR7 operand are expected
+ MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) &&
+ (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
+ return 0x80 >> CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); // encoding value 0 gives 0x80; each higher value shifts the bit right
+}
+
+unsigned PPCMCCodeEmitter:: // Returns the encoding of a register operand or the value of an immediate operand.
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg()) {
+ // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
+ // The GPR operand should come through here though.
+ assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 &&
+ MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) ||
+ MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
+ unsigned Reg = MO.getReg();
+ unsigned Encode = CTX.getRegisterInfo()->getEncodingValue(Reg);
+
+ if ((MCII.get(MI.getOpcode()).TSFlags & PPCII::UseVSXReg)) // instruction is flagged UseVSXReg
+ if (PPCInstrInfo::isVRRegister(Reg))
+ Encode += 32; // VR registers are offset by 32 for such instructions
+
+ return Encode;
+ }
+
+ assert(MO.isImm() &&
+ "Relocation required in an instruction that we cannot encode!");
+ return MO.getImm(); // narrowed to the unsigned return type
+}
+
+
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "PPCGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
new file mode 100644
index 000000000000..6b97d4c1456b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -0,0 +1,150 @@
+//===-- PPCMCExpr.cpp - PPC specific MC expression classes ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCFixupKinds.h"
+#include "PPCMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppcmcexpr"
+
+const PPCMCExpr* // Factory: builds a PPCMCExpr wrapping Expr with the given modifier Kind.
+PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+ bool isDarwin, MCContext &Ctx) { // isDarwin selects Darwin-style printing (see printImpl).
+ return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin); // Placement-new form taking the MCContext as the allocation argument.
+}
+
+void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { // Prints the sub-expression with its modifier in Darwin or ELF syntax.
+ if (isDarwinSyntax()) { // Darwin: prefix function form, e.g. lo16(expr).
+ switch (Kind) {
+ default: llvm_unreachable("Invalid kind!"); // Only LO/HI/HA are printable in Darwin syntax here.
+ case VK_PPC_LO: OS << "lo16"; break;
+ case VK_PPC_HI: OS << "hi16"; break;
+ case VK_PPC_HA: OS << "ha16"; break;
+ }
+
+ OS << '(';
+ getSubExpr()->print(OS, MAI);
+ OS << ')';
+ } else { // Non-Darwin: suffix form, e.g. expr@l.
+ getSubExpr()->print(OS, MAI);
+
+ switch (Kind) {
+ default: llvm_unreachable("Invalid kind!"); // Reached for VK_PPC_None, which has no suffix case.
+ case VK_PPC_LO: OS << "@l"; break;
+ case VK_PPC_HI: OS << "@h"; break;
+ case VK_PPC_HA: OS << "@ha"; break;
+ case VK_PPC_HIGHER: OS << "@higher"; break;
+ case VK_PPC_HIGHERA: OS << "@highera"; break;
+ case VK_PPC_HIGHEST: OS << "@highest"; break;
+ case VK_PPC_HIGHESTA: OS << "@highesta"; break;
+ }
+ }
+}
+
+bool // Returns true and sets Res if the sub-expression folds to an absolute constant.
+PPCMCExpr::evaluateAsConstant(int64_t &Res) const {
+ MCValue Value;
+
+ if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr)) // Evaluated without layout or fixup information.
+ return false;
+
+ if (!Value.isAbsolute()) // Symbol-relative values are not constants; Res is left untouched.
+ return false;
+
+ Res = evaluateAsInt64(Value.getConstant()); // Apply this expression's modifier to the constant.
+ return true;
+}
+
+int64_t // Applies the modifier selected by Kind to Value and returns the resulting 16-bit field (0..0xffff).
+PPCMCExpr::evaluateAsInt64(int64_t Value) const {
+ switch (Kind) {
+ case VK_PPC_LO: // Bits 0-15.
+ return Value & 0xffff;
+ case VK_PPC_HI: // Bits 16-31.
+ return (Value >> 16) & 0xffff;
+ case VK_PPC_HA: // Bits 16-31 after adding 0x8000; the add is done unsigned so it cannot overflow a signed int64_t.
+ return ((static_cast<uint64_t>(Value) + 0x8000) >> 16) & 0xffff;
+ case VK_PPC_HIGHER: // Bits 32-47.
+ return (Value >> 32) & 0xffff;
+ case VK_PPC_HIGHERA: // Bits 32-47 after adding 0x8000 (unsigned add, see VK_PPC_HA).
+ return ((static_cast<uint64_t>(Value) + 0x8000) >> 32) & 0xffff;
+ case VK_PPC_HIGHEST: // Bits 48-63.
+ return (Value >> 48) & 0xffff;
+ case VK_PPC_HIGHESTA: // Bits 48-63 after adding 0x8000 (unsigned add, see VK_PPC_HA).
+ return ((static_cast<uint64_t>(Value) + 0x8000) >> 48) & 0xffff;
+ case VK_PPC_None: // No modifier: falls out of the switch to the unreachable below.
+ break;
+ }
+ llvm_unreachable("Invalid kind!");
+}
+
+bool // Evaluates this expression into Res; returns false if it cannot be represented.
+PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout, // May be null; required only for symbolic values.
+ const MCFixup *Fixup) const { // May be null; consulted for the absolute-value range check.
+ MCValue Value;
+
+ if (!getSubExpr()->evaluateAsRelocatable(Value, Layout, Fixup))
+ return false;
+
+ if (Value.isAbsolute()) { // Constant operand: fold the modifier now.
+ int64_t Result = evaluateAsInt64(Value.getConstant());
+ if ((Fixup == nullptr || (unsigned)Fixup->getKind() != PPC::fixup_ppc_half16) && // Unless the fixup is fixup_ppc_half16...
+ (Result >= 0x8000)) // ...results of 0x8000 and above are rejected.
+ return false;
+ Res = MCValue::get(Result);
+ } else { // Symbolic operand: move the modifier onto the symbol reference.
+ if (!Layout)
+ return false;
+
+ MCContext &Context = Layout->getAssembler().getContext();
+ const MCSymbolRefExpr *Sym = Value.getSymA(); // NOTE(review): dereferenced below without a null check - confirm SymA is always set here.
+ MCSymbolRefExpr::VariantKind Modifier = Sym->getKind();
+ if (Modifier != MCSymbolRefExpr::VK_None) // A symbol that already carries a modifier cannot take another.
+ return false;
+ switch (Kind) { // Map the PPCMCExpr kind to the same-named MCSymbolRefExpr kind.
+ default:
+ llvm_unreachable("Invalid kind!");
+ case VK_PPC_LO:
+ Modifier = MCSymbolRefExpr::VK_PPC_LO;
+ break;
+ case VK_PPC_HI:
+ Modifier = MCSymbolRefExpr::VK_PPC_HI;
+ break;
+ case VK_PPC_HA:
+ Modifier = MCSymbolRefExpr::VK_PPC_HA;
+ break;
+ case VK_PPC_HIGHERA:
+ Modifier = MCSymbolRefExpr::VK_PPC_HIGHERA;
+ break;
+ case VK_PPC_HIGHER:
+ Modifier = MCSymbolRefExpr::VK_PPC_HIGHER;
+ break;
+ case VK_PPC_HIGHEST:
+ Modifier = MCSymbolRefExpr::VK_PPC_HIGHEST;
+ break;
+ case VK_PPC_HIGHESTA:
+ Modifier = MCSymbolRefExpr::VK_PPC_HIGHESTA;
+ break;
+ }
+ Sym = MCSymbolRefExpr::create(&Sym->getSymbol(), Modifier, Context); // Same symbol, now tagged with the modifier.
+ Res = MCValue::get(Sym, Value.getSymB(), Value.getConstant()); // SymB and the constant addend are carried over unchanged.
+ }
+
+ return true;
+}
+
+void PPCMCExpr::visitUsedExpr(MCStreamer &Streamer) const { // Forwards the visit to the wrapped sub-expression.
+ Streamer.visitUsedExpr(*getSubExpr());
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
new file mode 100644
index 000000000000..d42a111cc43e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -0,0 +1,100 @@
+//===-- PPCMCExpr.h - PPC specific MC expression classes --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
+
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class PPCMCExpr : public MCTargetExpr { // Target expression pairing a sub-expression with a PPC modifier kind.
+public:
+ enum VariantKind { // Modifier kinds; printed as @l, @h, @ha, ... by printImpl.
+ VK_PPC_None,
+ VK_PPC_LO,
+ VK_PPC_HI,
+ VK_PPC_HA,
+ VK_PPC_HIGHER,
+ VK_PPC_HIGHERA,
+ VK_PPC_HIGHEST,
+ VK_PPC_HIGHESTA
+ };
+
+private:
+ const VariantKind Kind; // Modifier applied to Expr.
+ const MCExpr *Expr; // Wrapped sub-expression (not owned by this class as far as visible here).
+ bool IsDarwin; // Selects Darwin-style printing.
+
+ int64_t evaluateAsInt64(int64_t Value) const; // Applies Kind to a constant.
+
+ explicit PPCMCExpr(VariantKind Kind, const MCExpr *Expr, bool IsDarwin) // Private: instances are made via create().
+ : Kind(Kind), Expr(Expr), IsDarwin(IsDarwin) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const PPCMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+ bool isDarwin, MCContext &Ctx);
+
+ static const PPCMCExpr *createLo(const MCExpr *Expr, // Shorthand for create(VK_PPC_LO, ...).
+ bool isDarwin, MCContext &Ctx) {
+ return create(VK_PPC_LO, Expr, isDarwin, Ctx);
+ }
+
+ static const PPCMCExpr *createHi(const MCExpr *Expr, // Shorthand for create(VK_PPC_HI, ...).
+ bool isDarwin, MCContext &Ctx) {
+ return create(VK_PPC_HI, Expr, isDarwin, Ctx);
+ }
+
+ static const PPCMCExpr *createHa(const MCExpr *Expr, // Shorthand for create(VK_PPC_HA, ...).
+ bool isDarwin, MCContext &Ctx) {
+ return create(VK_PPC_HA, Expr, isDarwin, Ctx);
+ }
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getKind - Get the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// getSubExpr - Get the child of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ /// isDarwinSyntax - True if expression is to be printed using Darwin syntax.
+ bool isDarwinSyntax() const { return IsDarwin; }
+
+
+ /// @}
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+ MCFragment *findAssociatedFragment() const override { // Delegates to the sub-expression.
+ return getSubExpr()->findAssociatedFragment();
+ }
+
+ // There are no TLS PPCMCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} // Intentionally a no-op.
+
+ bool evaluateAsConstant(int64_t &Res) const; // True and sets Res when the sub-expression is an absolute constant.
+
+ static bool classof(const MCExpr *E) { // Accepts any MCExpr whose kind is Target.
+ return E->getKind() == MCExpr::Target;
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
new file mode 100644
index 000000000000..bbd10e5b260f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -0,0 +1,264 @@
+//===-- PPCMCTargetDesc.cpp - PowerPC Target Descriptions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PowerPC specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCTargetDesc.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "PPCMCAsmInfo.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "PPCGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "PPCGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "PPCGenRegisterInfo.inc"
+
+// Pin the vtable to this file by defining the destructor and constructor out of line here.
+PPCTargetStreamer::~PPCTargetStreamer() {}
+PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} // Simply forwards the streamer to the base class.
+
+static MCInstrInfo *createPPCMCInstrInfo() { // Allocates an MCInstrInfo and fills it via InitPPCMCInstrInfo.
+ MCInstrInfo *X = new MCInstrInfo(); // Returned raw; the caller receives the pointer.
+ InitPPCMCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createPPCMCRegisterInfo(const Triple &TT) { // Builds register info for the 32- or 64-bit flavour chosen by TT.
+ bool isPPC64 =
+ (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
+ unsigned Flavour = isPPC64 ? 0 : 1; // Flavour 0 for 64-bit, 1 for 32-bit; passed twice to the init call below.
+ unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR; // LR8 on 64-bit, LR on 32-bit (named RA; presumably the return-address register).
+
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitPPCMCRegisterInfo(X, RA, Flavour, Flavour);
+ return X;
+}
+
+static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT, // Thin wrapper with the signature passed to the registry.
+ StringRef CPU, StringRef FS) {
+ return createPPCMCSubtargetInfoImpl(TT, CPU, FS); // Forwards all three arguments unchanged.
+}
+
+static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, // Creates the Darwin or ELF asm info and seeds its initial CFI state.
+ const Triple &TheTriple) {
+ bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+ TheTriple.getArch() == Triple::ppc64le);
+
+ MCAsmInfo *MAI;
+ if (TheTriple.isOSDarwin())
+ MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple);
+ else
+ MAI = new PPCELFMCAsmInfo(isPPC64, TheTriple); // Every non-Darwin triple gets the ELF flavour.
+
+ // Initial state of the frame pointer is R1.
+ unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1; // X1 on 64-bit, R1 on 32-bit.
+ MCCFIInstruction Inst =
+ MCCFIInstruction::createDefCfa(nullptr, MRI.getDwarfRegNum(Reg, true), 0); // DefCfa on that register with offset 0 and no label.
+ MAI->addInitialFrameState(Inst);
+
+ return MAI;
+}
+
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, // Picks a code model when none was requested; RM is unused.
+ CodeModel::Model &CM) { // CM is updated in place.
+ if (CM == CodeModel::Default) {
+ if (!TT.isOSDarwin() &&
+ (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
+ CM = CodeModel::Medium; // Non-Darwin 64-bit targets default to Medium; all others keep Default.
+ }
+}
+
+namespace {
+class PPCTargetAsmStreamer : public PPCTargetStreamer { // Target streamer that writes PPC directives as assembly text.
+ formatted_raw_ostream &OS; // Output stream; held by reference.
+
+public:
+ PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
+ : PPCTargetStreamer(S), OS(OS) {}
+ void emitTCEntry(const MCSymbol &S) override { // Emits "\t.tc <name>[TC],<name>\n".
+ OS << "\t.tc ";
+ OS << S.getName();
+ OS << "[TC],";
+ OS << S.getName();
+ OS << '\n';
+ }
+ void emitMachine(StringRef CPU) override { // Emits "\t.machine <CPU>\n".
+ OS << "\t.machine " << CPU << '\n';
+ }
+ void emitAbiVersion(int AbiVersion) override { // Emits "\t.abiversion <N>\n".
+ OS << "\t.abiversion " << AbiVersion << '\n';
+ }
+ void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { // Emits "\t.localentry\t<sym>, <expr>\n".
+ const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo(); // Used to print both the symbol and the expression.
+
+ OS << "\t.localentry\t";
+ S->print(OS, MAI);
+ OS << ", ";
+ LocalOffset->print(OS, MAI);
+ OS << '\n';
+ }
+};
+
+class PPCTargetELFStreamer : public PPCTargetStreamer { // Target streamer that applies PPC directives to an ELF object.
+public:
+ PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+ MCELFStreamer &getStreamer() { // Unchecked downcast; assumes the streamer really is an MCELFStreamer.
+ return static_cast<MCELFStreamer &>(Streamer);
+ }
+ void emitTCEntry(const MCSymbol &S) override {
+ // Creates a R_PPC64_TOC relocation
+ Streamer.EmitValueToAlignment(8); // Align to 8 bytes, then emit the symbol as an 8-byte value.
+ Streamer.EmitSymbolValue(&S, 8);
+ }
+ void emitMachine(StringRef CPU) override { // Currently a no-op for ELF output.
+ // FIXME: Is there anything to do in here or does this directive only
+ // limit the parser?
+ }
+ void emitAbiVersion(int AbiVersion) override { // Stores AbiVersion in the EF_PPC64_ABI bits of the ELF header e_flags.
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned Flags = MCA.getELFHeaderEFlags();
+ Flags &= ~ELF::EF_PPC64_ABI; // Clear the old ABI bits...
+ Flags |= (AbiVersion & ELF::EF_PPC64_ABI); // ...and insert the new value, masked to the field.
+ MCA.setELFHeaderEFlags(Flags);
+ }
+ void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { // Encodes the local-entry offset into the symbol's st_other bits.
+ MCAssembler &MCA = getStreamer().getAssembler();
+
+ int64_t Res;
+ if (!LocalOffset->evaluateAsAbsolute(Res, MCA)) // The offset must fold to a constant.
+ report_fatal_error(".localentry expression must be absolute.");
+
+ unsigned Encoded = ELF::encodePPC64LocalEntryOffset(Res);
+ if (Res != ELF::decodePPC64LocalEntryOffset(Encoded)) // Round-trip check: reject offsets the encoding cannot represent.
+ report_fatal_error(".localentry expression cannot be encoded.");
+
+ unsigned Other = S->getOther();
+ Other &= ~ELF::STO_PPC64_LOCAL_MASK; // Replace only the local-entry field; other bits are kept.
+ Other |= Encoded;
+ S->setOther(Other);
+
+ // For GAS compatibility, unless we already saw a .abiversion directive,
+ // set e_flags to indicate ELFv2 ABI.
+ unsigned Flags = MCA.getELFHeaderEFlags();
+ if ((Flags & ELF::EF_PPC64_ABI) == 0) // ABI field still zero, i.e. not set so far.
+ MCA.setELFHeaderEFlags(Flags | 2);
+ }
+ void emitAssignment(MCSymbol *S, const MCExpr *Value) override { // Propagates local-entry bits across "A = B" symbol assignments.
+ auto *Symbol = cast<MCSymbolELF>(S);
+ // When encoding an assignment to set symbol A to symbol B, also copy
+ // the st_other bits encoding the local entry point offset.
+ if (Value->getKind() != MCExpr::SymbolRef) // Only a bare symbol reference on the right-hand side qualifies.
+ return;
+ const auto &RhsSym = cast<MCSymbolELF>(
+ static_cast<const MCSymbolRefExpr *>(Value)->getSymbol());
+ unsigned Other = Symbol->getOther();
+ Other &= ~ELF::STO_PPC64_LOCAL_MASK; // Overwrite just the local-entry field with the right-hand symbol's.
+ Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK;
+ Symbol->setOther(Other);
+ }
+};
+
+class PPCTargetMachOStreamer : public PPCTargetStreamer { // Mach-O target streamer; only .machine is tolerated, and it is ignored.
+public:
+ PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+ void emitTCEntry(const MCSymbol &S) override { // .tc is treated as unreachable on Mach-O.
+ llvm_unreachable("Unknown pseudo-op: .tc");
+ }
+ void emitMachine(StringRef CPU) override { // Currently a no-op.
+ // FIXME: We should update the CPUType, CPUSubType in the Object file if
+ // the new values are different from the defaults.
+ }
+ void emitAbiVersion(int AbiVersion) override { // .abiversion is treated as unreachable on Mach-O.
+ llvm_unreachable("Unknown pseudo-op: .abiversion");
+ }
+ void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { // .localentry is treated as unreachable on Mach-O.
+ llvm_unreachable("Unknown pseudo-op: .localentry");
+ }
+};
+}
+
+static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, // Factory for the textual-assembly target streamer.
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint, // Unused here.
+ bool isVerboseAsm) { // Unused here.
+ return new PPCTargetAsmStreamer(S, OS);
+}
+
+static MCTargetStreamer * // Factory for the object-file target streamer, chosen by binary format.
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ const Triple &TT = STI.getTargetTriple();
+ if (TT.isOSBinFormatELF())
+ return new PPCTargetELFStreamer(S);
+ return new PPCTargetMachOStreamer(S); // Every non-ELF format gets the Mach-O streamer.
+}
+
+static MCInstPrinter *createPPCMCInstPrinter(const Triple &T, // Factory for the PPC instruction printer.
+ unsigned SyntaxVariant, // Unused here.
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ return new PPCInstPrinter(MAI, MII, MRI, T.isOSDarwin()); // Last argument is whether the triple's OS is Darwin.
+}
+
+extern "C" void LLVMInitializePowerPCTargetMC() { // Registers the MC-layer factories for all three PowerPC targets.
+ for (Target *T :
+ {&getThePPC32Target(), &getThePPC64Target(), &getThePPC64LETarget()}) { // The same registrations are applied to each target.
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo); // Registration is done by constructing this helper object.
+
+ // Register the MC codegen info.
+ TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(*T, createPPCMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(*T, createPPCMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(*T, createPPCMCSubtargetInfo);
+
+ // Register the MC Code Emitter
+ TargetRegistry::RegisterMCCodeEmitter(*T, createPPCMCCodeEmitter);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(*T, createPPCAsmBackend);
+
+ // Register the object target streamer.
+ TargetRegistry::RegisterObjectTargetStreamer(*T,
+ createObjectTargetStreamer);
+
+ // Register the asm target streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(*T, createPPCMCInstPrinter);
+ }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
new file mode 100644
index 000000000000..0989e0c8e268
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -0,0 +1,106 @@
+//===-- PPCMCTargetDesc.h - PowerPC Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PowerPC specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/MathExtras.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class Target;
+class Triple;
+class StringRef;
+class raw_pwrite_stream;
+class raw_ostream;
+
+Target &getThePPC32Target();
+Target &getThePPC64Target();
+Target &getThePPC64LETarget();
+
+MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+/// Construct a PPC ELF object writer.
+MCObjectWriter *createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+ bool IsLittleEndian, uint8_t OSABI);
+/// Construct a PPC Mach-O object writer.
+MCObjectWriter *createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
+
+/// Returns true iff Val consists of one contiguous run of 1s with any number of
+/// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so
+/// 0x0000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is not,
+/// since all 1s are not contiguous. On success MB and ME are set; on failure they are left unmodified.
+static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+ if (!Val) // Zero contains no run at all.
+ return false;
+
+ if (isShiftedMask_32(Val)) { // Non-wrapping run.
+ // look for the first non-zero bit
+ MB = countLeadingZeros(Val); // Bit positions are counted from the most significant bit.
+ // look for the first zero bit after the run of ones
+ ME = countLeadingZeros((Val - 1) ^ Val); // (Val - 1) ^ Val sets the lowest set bit of Val and everything below it.
+ return true;
+ } else {
+ Val = ~Val; // invert mask
+ if (isShiftedMask_32(Val)) { // Wrapping run: the zeros form one contiguous block.
+ // effectively look for the first zero bit
+ ME = countLeadingZeros(Val) - 1;
+ // effectively look for the first one bit after the run of zeros
+ MB = countLeadingZeros((Val - 1) ^ Val) + 1;
+ return true;
+ }
+ }
+ // no run present
+ return false;
+}
+
+} // End llvm namespace
+
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
+// Defines symbolic names for PowerPC registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "PPCGenRegisterInfo.inc"
+
+// Defines symbolic names for the PowerPC instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "PPCGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "PPCGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
new file mode 100644
index 000000000000..1f38a8c947e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -0,0 +1,383 @@
+//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+class PPCMachObjectWriter : public MCMachObjectTargetWriter { // Mach-O relocation recorder for PPC; only the 32-bit path is implemented.
+ bool recordScatteredRelocation(MachObjectWriter *Writer, // Emits a scattered relocation (plus a PAIR entry for SECTDIFF types).
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ unsigned Log2Size, uint64_t &FixedValue);
+
+ void RecordPPCRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, // 32-bit entry point; chooses scattered vs. plain relocation.
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue);
+
+public:
+ PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+ : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {} // All arguments are forwarded to the base class.
+
+ void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, // Dispatches on the writer's bitness.
+ const MCAsmLayout &Layout, const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) override {
+ if (Writer->is64Bit()) {
+ report_fatal_error("Relocation emission for MachO/PPC64 unimplemented."); // 64-bit Mach-O is a fatal error.
+ } else
+ RecordPPCRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ }
+};
+}
+
+/// computes the log2 of the size of the relocation,
+/// used for relocation_info::r_length. Unhandled fixup kinds are a fatal error.
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+ switch (Kind) {
+ default:
+ report_fatal_error("log2size(FixupKind): Unhandled fixup kind!");
+ case FK_PCRel_1:
+ case FK_Data_1:
+ return 0; // 1 byte.
+ case FK_PCRel_2:
+ case FK_Data_2:
+ return 1; // 2 bytes.
+ case FK_PCRel_4:
+ case PPC::fixup_ppc_brcond14: // The PPC fixups listed here are all sized as 4 bytes.
+ case PPC::fixup_ppc_half16:
+ case PPC::fixup_ppc_br24:
+ case FK_Data_4:
+ return 2; // 4 bytes.
+ case FK_PCRel_8:
+ case FK_Data_8:
+ return 3; // 8 bytes.
+ }
+ return 0; // Not reached: every switch path above returns or reports a fatal error.
+}
+
+/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum.
+/// Outline based on PPCELFObjectWriter::getRelocType().
+static unsigned getRelocType(const MCValue &Target,
+ const MCFixupKind FixupKind, // from
+ // Fixup.getKind()
+ const bool IsPCRel) {
+ const MCSymbolRefExpr::VariantKind Modifier =
+ Target.isAbsolute() ? MCSymbolRefExpr::VK_None // Absolute targets have no symbol, hence no modifier.
+ : Target.getSymA()->getKind();
+ // determine the type of the relocation
+ unsigned Type = MachO::GENERIC_RELOC_VANILLA; // Default; kept for FK_Data_4 / FK_Data_2 in the non-PC-relative case.
+ if (IsPCRel) { // relative to PC
+ switch ((unsigned)FixupKind) {
+ default:
+ report_fatal_error("Unimplemented fixup kind (relative)");
+ case PPC::fixup_ppc_br24:
+ Type = MachO::PPC_RELOC_BR24; // R_PPC_REL24
+ break;
+ case PPC::fixup_ppc_brcond14:
+ Type = MachO::PPC_RELOC_BR14;
+ break;
+ case PPC::fixup_ppc_half16: // PC-relative half16: the plain (non-SECTDIFF) HA/LO/HI types.
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unsupported modifier for half16 fixup");
+ case MCSymbolRefExpr::VK_PPC_HA:
+ Type = MachO::PPC_RELOC_HA16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_LO:
+ Type = MachO::PPC_RELOC_LO16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HI:
+ Type = MachO::PPC_RELOC_HI16;
+ break;
+ }
+ break;
+ }
+ } else { // not relative to PC
+ switch ((unsigned)FixupKind) {
+ default:
+ report_fatal_error("Unimplemented fixup kind (absolute)!");
+ case PPC::fixup_ppc_half16: // Non-PC-relative half16: the SECTDIFF variants.
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unsupported modifier for half16 fixup");
+ case MCSymbolRefExpr::VK_PPC_HA:
+ Type = MachO::PPC_RELOC_HA16_SECTDIFF;
+ break;
+ case MCSymbolRefExpr::VK_PPC_LO:
+ Type = MachO::PPC_RELOC_LO16_SECTDIFF;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HI:
+ Type = MachO::PPC_RELOC_HI16_SECTDIFF;
+ break;
+ }
+ break;
+ case FK_Data_4: // Plain data fixups keep the VANILLA type.
+ break;
+ case FK_Data_2:
+ break;
+ }
+ }
+ return Type;
+}
+
+static void makeRelocationInfo(MachO::any_relocation_info &MRE, // Packs a non-scattered relocation entry into MRE.
+ const uint32_t FixupOffset, const uint32_t Index,
+ const unsigned IsPCRel, const unsigned Log2Size,
+ const unsigned IsExtern, const unsigned Type) {
+ MRE.r_word0 = FixupOffset; // First word is the fixup offset.
+ // The bitfield offsets that work (as determined by trial-and-error)
+ // are different than what is documented in the mach-o manuals.
+ // This appears to be an endianness issue; reversing the order of the
+ // documented bitfields in <llvm/Support/MachO.h> fixes this (but
+ // breaks x86/ARM assembly).
+ MRE.r_word1 = ((Index << 8) | // was << 0
+ (IsPCRel << 7) | // was << 24
+ (Log2Size << 5) | // was << 25
+ (IsExtern << 4) | // was << 27
+ (Type << 0)); // was << 28
+}
+
+static void // Packs a scattered relocation entry into MRE.
+makeScatteredRelocationInfo(MachO::any_relocation_info &MRE,
+ const uint32_t Addr, const unsigned Type,
+ const unsigned Log2Size, const unsigned IsPCRel,
+ const uint32_t Value2) {
+ // For notes on bitfield positions and endianness, see:
+ // https://developer.apple.com/library/mac/documentation/developertools/conceptual/MachORuntime/Reference/reference.html#//apple_ref/doc/uid/20001298-scattered_relocation_entry
+ MRE.r_word0 = ((Addr << 0) | (Type << 24) | (Log2Size << 28) | // Addr is not masked here; callers check it fits in 24 bits.
+ (IsPCRel << 30) | MachO::R_SCATTERED);
+ MRE.r_word1 = Value2; // Second word carries the value verbatim.
+}
+
+/// Compute fixup offset (address): fragment offset plus the fixup's offset within the fragment.
+static uint32_t getFixupOffset(const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup) {
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ // On Mach-O, fixup_ppc_half16 relocations must refer to the
+ // start of the instruction, not the second halfword, as ELF does
+ if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16)
+ FixupOffset &= ~uint32_t(3); // Round down to a multiple of 4.
+ return FixupOffset;
+}
+
+/// \return false if falling back to using non-scattered relocation,
+/// otherwise true for normal scattered relocation.
+/// based on X86MachObjectWriter::recordScatteredRelocation
+/// and ARMMachObjectWriter::recordScatteredRelocation
+bool PPCMachObjectWriter::recordScatteredRelocation(
+ MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ unsigned Log2Size, uint64_t &FixedValue) {
+ // caller already computes these, can we just pass and reuse?
+ const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+ const MCFixupKind FK = Fixup.getKind();
+ const unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+ const unsigned Type = getRelocType(Target, FK, IsPCRel);
+
+ // Is this a local or SECTDIFF relocation entry?
+ // SECTDIFF relocation entries have symbol subtractions,
+ // and require two entries, the first for the add-symbol value,
+ // the second for the subtract-symbol value.
+
+ // See <reloc.h>.
+ const MCSymbol *A = &Target.getSymA()->getSymbol(); // The add-symbol.
+
+ if (!A->getFragment()) // A symbol with no fragment is treated as undefined.
+ report_fatal_error("symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+
+ uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+ uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+ FixedValue += SecAddr; // Add the address of A's section.
+ uint32_t Value2 = 0; // Stays 0 when there is no subtract-symbol.
+
+ if (const MCSymbolRefExpr *B = Target.getSymB()) { // The subtract-symbol, if present.
+ const MCSymbol *SB = &B->getSymbol();
+
+ if (!SB->getFragment())
+ report_fatal_error("symbol '" + B->getSymbol().getName() +
+ "' can not be undefined in a subtraction expression");
+
+ // FIXME: is Type correct? see include/llvm/Support/MachO.h
+ Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+ FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); // Subtract the address of B's section.
+ }
+ // FIXME: does FixedValue get used??
+
+ // Relocations are written out in reverse order, so the PAIR comes first.
+ if (Type == MachO::PPC_RELOC_SECTDIFF ||
+ Type == MachO::PPC_RELOC_HI16_SECTDIFF ||
+ Type == MachO::PPC_RELOC_LO16_SECTDIFF ||
+ Type == MachO::PPC_RELOC_HA16_SECTDIFF ||
+ Type == MachO::PPC_RELOC_LO14_SECTDIFF ||
+ Type == MachO::PPC_RELOC_LOCAL_SECTDIFF) {
+ // X86 had this piece, but ARM does not
+ // If the offset is too large to fit in a scattered relocation,
+ // we're hosed. It's an unfortunate limitation of the MachO format.
+ if (FixupOffset > 0xffffff) { // Scattered entries only have 24 bits for the address.
+ char Buffer[32];
+ format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+ Asm.getContext().reportError(Fixup.getLoc(),
+ Twine("Section too large, can't encode "
+ "r_address (") +
+ Buffer + ") into 24 bits of scattered "
+ "relocation entry.");
+ return false; // Error has been reported; no relocation is added.
+ }
+
+ // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()?
+ // see PPCMCExpr::evaluateAsRelocatableImpl()
+ uint32_t other_half = 0; // The 16 bits not encoded by this relocation; stored in the PAIR entry's address field.
+ switch (Type) {
+ case MachO::PPC_RELOC_LO16_SECTDIFF:
+ other_half = (FixedValue >> 16) & 0xffff;
+ // applyFixupOffset no longer extracts the high part because it now assumes
+ // this was already done.
+ // It looks like this is not true for the FixedValue needed with Mach-O
+ // relocs.
+ // So we need to adjust FixedValue again here.
+ FixedValue &= 0xffff;
+ break;
+ case MachO::PPC_RELOC_HA16_SECTDIFF:
+ other_half = FixedValue & 0xffff;
+ FixedValue =
+ ((FixedValue >> 16) + ((FixedValue & 0x8000) ? 1 : 0)) & 0xffff; // High half, plus one when bit 15 of the low half is set.
+ break;
+ case MachO::PPC_RELOC_HI16_SECTDIFF:
+ other_half = FixedValue & 0xffff;
+ FixedValue = (FixedValue >> 16) & 0xffff;
+ break;
+ default: // Reached by the SECTDIFF types admitted above that have no case here.
+ llvm_unreachable("Invalid PPC scattered relocation type.");
+ break;
+ }
+
+ MachO::any_relocation_info MRE;
+ makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR,
+ Log2Size, IsPCRel, Value2);
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE); // PAIR entry is added before the main entry below.
+ } else {
+ // If the offset is more than 24-bits, it won't fit in a scattered
+ // relocation offset field, so we fall back to using a non-scattered
+ // relocation. This is a bit risky, as if the offset reaches out of
+ // the block and the linker is doing scattered loading on this
+ // symbol, things can go badly.
+ //
+ // Required for 'as' compatibility.
+ if (FixupOffset > 0xffffff)
+ return false;
+ }
+ MachO::any_relocation_info MRE;
+ makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value); // Main entry: fixup offset plus the add-symbol's address.
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ return true;
+}
+
+// see PPCELFObjectWriter for a general outline of cases
+void PPCMachObjectWriter::RecordPPCRelocation(
+ MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ const MCFixupKind FK = Fixup.getKind(); // unsigned
+ const unsigned Log2Size = getFixupKindLog2Size(FK);
+ const bool IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+ const unsigned RelocType = getRelocType(Target, FK, IsPCRel);
+
+ // If this is a difference or a defined symbol plus an offset, then we need a
+ // scattered relocation entry. Differences always require scattered
+ // relocations.
+ if (Target.getSymB() &&
+ // Q: are branch targets ever scattered?
+ RelocType != MachO::PPC_RELOC_BR24 &&
+ RelocType != MachO::PPC_RELOC_BR14) {
+ recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, // NOTE(review): the bool result is ignored; a false return is not acted on here.
+ Log2Size, FixedValue);
+ return;
+ }
+
+ // this doesn't seem right for RIT_PPC_BR24
+ // Get the symbol data, if any.
+ const MCSymbol *A = nullptr;
+ if (Target.getSymA())
+ A = &Target.getSymA()->getSymbol();
+
+ // See <reloc.h>.
+ const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+ unsigned Index = 0; // Stays 0 for external relocations.
+ unsigned Type = RelocType;
+
+ const MCSymbol *RelSymbol = nullptr; // Set only when an external relocation is required.
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ //
+ // FIXME: Currently, these are never generated (see code below). I cannot
+ // find a case where they are actually emitted.
+ report_fatal_error("FIXME: relocations to absolute targets "
+ "not yet implemented");
+ // the above line stolen from ARM, not sure
+ } else {
+ // Resolve constant variables.
+ if (A->isVariable()) {
+ int64_t Res;
+ if (A->getVariableValue()->evaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res; // Variable folds to a constant: use it and emit no relocation.
+ return;
+ }
+ }
+
+ // Check whether we need an external or internal relocation.
+ if (Writer->doesSymbolRequireExternRelocation(*A)) {
+ RelSymbol = A;
+ // For external relocations, make sure to offset the fixup value to
+ // compensate for the addend of the symbol address, if it was
+ // undefined. This occurs with weak definitions, for example.
+ if (!A->isUndefined())
+ FixedValue -= Layout.getSymbolOffset(*A);
+ } else {
+ // The index is the section ordinal (1-based).
+ const MCSection &Sec = A->getSection();
+ Index = Sec.getOrdinal() + 1;
+ FixedValue += Writer->getSectionAddress(&Sec);
+ }
+ if (IsPCRel)
+ FixedValue -= Writer->getSectionAddress(Fragment->getParent()); // PC-relative: subtract the address of the fixup's own section.
+ }
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, false, Type); // IsExtern is always passed as false here; RelSymbol is handed to addRelocation instead.
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS, // Public factory for the PPC Mach-O object writer.
+ bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return createMachObjectWriter(
+ new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS, // Wraps a newly allocated target writer around OS.
+ /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
new file mode 100644
index 000000000000..c2987b641c04
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
@@ -0,0 +1,86 @@
+//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCPredicates.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+using namespace llvm;
+
+/// Map \p Opcode to its inverse: EQ <-> NE, LT <-> GE, GT <-> LE, UN <-> NU.
+///
+/// A hint suffix is flipped together with the condition, so a *_MINUS input
+/// yields a *_PLUS result and vice versa. The two single-bit predicates
+/// invert into each other.
+PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
+  // This is a member of namespace PPC, so the enumerators are visible here
+  // without qualification.
+  switch (Opcode) {
+  // Predicates without a hint suffix.
+  case PRED_EQ: return PRED_NE;
+  case PRED_NE: return PRED_EQ;
+  case PRED_LT: return PRED_GE;
+  case PRED_GE: return PRED_LT;
+  case PRED_GT: return PRED_LE;
+  case PRED_LE: return PRED_GT;
+  case PRED_NU: return PRED_UN;
+  case PRED_UN: return PRED_NU;
+
+  // *_MINUS predicates invert to the opposite condition with *_PLUS.
+  case PRED_EQ_MINUS: return PRED_NE_PLUS;
+  case PRED_NE_MINUS: return PRED_EQ_PLUS;
+  case PRED_LT_MINUS: return PRED_GE_PLUS;
+  case PRED_GE_MINUS: return PRED_LT_PLUS;
+  case PRED_GT_MINUS: return PRED_LE_PLUS;
+  case PRED_LE_MINUS: return PRED_GT_PLUS;
+  case PRED_NU_MINUS: return PRED_UN_PLUS;
+  case PRED_UN_MINUS: return PRED_NU_PLUS;
+
+  // *_PLUS predicates invert to the opposite condition with *_MINUS.
+  case PRED_EQ_PLUS: return PRED_NE_MINUS;
+  case PRED_NE_PLUS: return PRED_EQ_MINUS;
+  case PRED_LT_PLUS: return PRED_GE_MINUS;
+  case PRED_GE_PLUS: return PRED_LT_MINUS;
+  case PRED_GT_PLUS: return PRED_LE_MINUS;
+  case PRED_LE_PLUS: return PRED_GT_MINUS;
+  case PRED_NU_PLUS: return PRED_UN_MINUS;
+  case PRED_UN_PLUS: return PRED_NU_MINUS;
+
+  // Predicates on a single condition-register bit.
+  case PRED_BIT_SET: return PRED_BIT_UNSET;
+  case PRED_BIT_UNSET: return PRED_BIT_SET;
+  }
+  llvm_unreachable("Unknown PPC branch opcode!");
+}
+
+/// Return the predicate to use once the two operands of the compare that set
+/// the condition register have been exchanged.
+///
+/// EQ, NE, NU and UN (with or without a hint suffix) map to themselves.
+/// LT <-> GT and GE <-> LE trade places, keeping any hint suffix. The
+/// single-bit predicates are not valid inputs.
+PPC::Predicate PPC::getSwappedPredicate(PPC::Predicate Opcode) {
+  switch (Opcode) {
+  // These predicates are returned unchanged.
+  case PRED_EQ:
+  case PRED_NE:
+  case PRED_NU:
+  case PRED_UN:
+  case PRED_EQ_MINUS:
+  case PRED_NE_MINUS:
+  case PRED_NU_MINUS:
+  case PRED_UN_MINUS:
+  case PRED_EQ_PLUS:
+  case PRED_NE_PLUS:
+  case PRED_NU_PLUS:
+  case PRED_UN_PLUS:
+    return Opcode;
+
+  // Ordering predicates are mirrored; the hint suffix is preserved.
+  case PRED_LT: return PRED_GT;
+  case PRED_GT: return PRED_LT;
+  case PRED_GE: return PRED_LE;
+  case PRED_LE: return PRED_GE;
+  case PRED_LT_MINUS: return PRED_GT_MINUS;
+  case PRED_GT_MINUS: return PRED_LT_MINUS;
+  case PRED_GE_MINUS: return PRED_LE_MINUS;
+  case PRED_LE_MINUS: return PRED_GE_MINUS;
+  case PRED_LT_PLUS: return PRED_GT_PLUS;
+  case PRED_GT_PLUS: return PRED_LT_PLUS;
+  case PRED_GE_PLUS: return PRED_LE_PLUS;
+  case PRED_LE_PLUS: return PRED_GE_PLUS;
+
+  case PRED_BIT_SET:
+  case PRED_BIT_UNSET:
+    llvm_unreachable("Invalid use of bit predicate code");
+  }
+  llvm_unreachable("Unknown PPC branch opcode!");
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
new file mode 100644
index 000000000000..acea600fbb0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -0,0 +1,76 @@
+//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCPREDICATES_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCPREDICATES_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
+namespace llvm {
+namespace PPC {
+ /// Predicate - These are "(BI << 5) | BO" for various predicates.
+ ///
+ /// A predicate and its inverse share the upper field and differ only in the
+ /// low field (12 versus 4), e.g. PRED_LT / PRED_GE or PRED_EQ / PRED_NE.
+ /// The _MINUS and _PLUS variants are the base low field OR'ed with
+ /// BR_NONTAKEN_HINT (0x2) and BR_TAKEN_HINT (0x3) respectively:
+ /// 12 -> 14 / 15 and 4 -> 6 / 7.
+ enum Predicate {
+ // Predicates without a hint.
+ PRED_LT = (0 << 5) | 12,
+ PRED_LE = (1 << 5) | 4,
+ PRED_EQ = (2 << 5) | 12,
+ PRED_GE = (0 << 5) | 4,
+ PRED_GT = (1 << 5) | 12,
+ PRED_NE = (2 << 5) | 4,
+ PRED_UN = (3 << 5) | 12,
+ PRED_NU = (3 << 5) | 4,
+ // "Minus" variants: low field has BR_NONTAKEN_HINT (0x2) set.
+ PRED_LT_MINUS = (0 << 5) | 14,
+ PRED_LE_MINUS = (1 << 5) | 6,
+ PRED_EQ_MINUS = (2 << 5) | 14,
+ PRED_GE_MINUS = (0 << 5) | 6,
+ PRED_GT_MINUS = (1 << 5) | 14,
+ PRED_NE_MINUS = (2 << 5) | 6,
+ PRED_UN_MINUS = (3 << 5) | 14,
+ PRED_NU_MINUS = (3 << 5) | 6,
+ // "Plus" variants: low field has BR_TAKEN_HINT (0x3) set.
+ PRED_LT_PLUS = (0 << 5) | 15,
+ PRED_LE_PLUS = (1 << 5) | 7,
+ PRED_EQ_PLUS = (2 << 5) | 15,
+ PRED_GE_PLUS = (0 << 5) | 7,
+ PRED_GT_PLUS = (1 << 5) | 15,
+ PRED_NE_PLUS = (2 << 5) | 7,
+ PRED_UN_PLUS = (3 << 5) | 15,
+ PRED_NU_PLUS = (3 << 5) | 7,
+
+ // When dealing with individual condition-register bits, we have simple set
+ // and unset predicates. Their values (1024, 1025) lie above every encoded
+ // predicate in this enum; the largest of those is (3 << 5) | 15 = 111.
+ PRED_BIT_SET = 1024,
+ PRED_BIT_UNSET = 1025
+ };
+
+ // Bit for branch taken (plus) or not-taken (minus) hint.
+ // BR_HINT_MASK covers both hint values.
+ enum BranchHintBit {
+ BR_NO_HINT = 0x0,
+ BR_NONTAKEN_HINT = 0x2,
+ BR_TAKEN_HINT = 0x3,
+ BR_HINT_MASK = 0X3
+ };
+
+ /// Invert the specified predicate. != -> ==, < -> >=.
+ /// The definition in PPCPredicates.cpp also flips a hint suffix: a *_MINUS
+ /// predicate inverts to a *_PLUS predicate and vice versa.
+ Predicate InvertPredicate(Predicate Opcode);
+
+ /// Assume the condition register is set by MI(a,b), return the predicate if
+ /// we modify the instructions such that condition register is set by MI(b,a).
+ /// PRED_BIT_SET and PRED_BIT_UNSET are not accepted: the definition in
+ /// PPCPredicates.cpp reaches llvm_unreachable for them.
+ Predicate getSwappedPredicate(Predicate Opcode);
+} // end namespace PPC
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td b/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td
new file mode 100644
index 000000000000..aea022f88766
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -0,0 +1,808 @@
+//===- P9InstrResources.td - P9 Instruction Resource Defs -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the resources required by some P9 instructions. It is part
+// of the P9 processor model used for instruction scheduling. Not every
+// instruction is listed here. Instructions in this file belong to itinerary
+// classes that have instructions with different resource requirements.
+//
+//===----------------------------------------------------------------------===//
+
+
+def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C],
+ (instrs
+ VADDCUW,
+ VADDUBM,
+ VADDUDM,
+ VADDUHM,
+ VADDUWM,
+ VAND,
+ VANDC,
+ VCMPEQUB,
+ VCMPEQUBo,
+ VCMPEQUD,
+ VCMPEQUDo,
+ VCMPEQUH,
+ VCMPEQUHo,
+ VCMPEQUW,
+ VCMPEQUWo,
+ VCMPGTSB,
+ VCMPGTSBo,
+ VCMPGTSD,
+ VCMPGTSDo,
+ VCMPGTSH,
+ VCMPGTSHo,
+ VCMPGTSW,
+ VCMPGTSWo,
+ VCMPGTUB,
+ VCMPGTUBo,
+ VCMPGTUD,
+ VCMPGTUDo,
+ VCMPGTUH,
+ VCMPGTUHo,
+ VCMPGTUW,
+ VCMPGTUWo,
+ VCMPNEB,
+ VCMPNEBo,
+ VCMPNEH,
+ VCMPNEHo,
+ VCMPNEW,
+ VCMPNEWo,
+ VCMPNEZB,
+ VCMPNEZBo,
+ VCMPNEZH,
+ VCMPNEZHo,
+ VCMPNEZW,
+ VCMPNEZWo,
+ VEQV,
+ VEXTSB2D,
+ VEXTSB2W,
+ VEXTSH2D,
+ VEXTSH2W,
+ VEXTSW2D,
+ VMRGEW,
+ VMRGOW,
+ VNAND,
+ VNEGD,
+ VNEGW,
+ VNOR,
+ VOR,
+ VORC,
+ VPOPCNTB,
+ VPOPCNTH,
+ VPOPCNTW,
+ VSEL,
+ VSUBCUW,
+ VSUBUBM,
+ VSUBUDM,
+ VSUBUHM,
+ VSUBUWM,
+ VXOR,
+ V_SET0B,
+ V_SET0H,
+ V_SET0,
+ XVABSDP,
+ XVABSSP,
+ XVCPSGNDP,
+ XVCPSGNSP,
+ XVIEXPDP,
+ XVNABSDP,
+ XVNABSSP,
+ XVNEGDP,
+ XVNEGSP,
+ XVXEXPDP,
+ XXLAND,
+ XXLANDC,
+ XXLEQV,
+ XXLNAND,
+ XXLNOR,
+ XXLOR,
+ XXLORf,
+ XXLORC,
+ XXLXOR,
+ XXSEL
+)>;
+
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSABSQP,
+ XSCPSGNQP,
+ XSIEXPQP,
+ XSNABSQP,
+ XSNEGQP,
+ XSXEXPQP,
+ XSABSDP,
+ XSCPSGNDP,
+ XSIEXPDP,
+ XSNABSDP,
+ XSNEGDP,
+ XSXEXPDP
+)>;
+
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+ (instrs
+
+ VMINSB,
+ VMINSD,
+ VMINSH,
+ VMINSW,
+ VMINUB,
+ VMINUD,
+ VMINUH,
+ VMINUW,
+ VPOPCNTD,
+ VPRTYBD,
+ VPRTYBW,
+ VRLB,
+ VRLD,
+ VRLDMI,
+ VRLDNM,
+ VRLH,
+ VRLW,
+ VRLWMI,
+ VRLWNM,
+ VSHASIGMAD,
+ VSHASIGMAW,
+ VSLB,
+ VSLD,
+ VSLH,
+ VSLW,
+ VSRAB,
+ VSRAD,
+ VSRAH,
+ VSRAW,
+ VSRB,
+ VSRD,
+ VSRH,
+ VSRW,
+ VSUBSBS,
+ VSUBSHS,
+ VSUBSWS,
+ VSUBUBS,
+ VSUBUHS,
+ VSUBUWS,
+ XSCMPEQDP,
+ XSCMPEXPDP,
+ XSCMPGEDP,
+ XSCMPGTDP,
+ XSCMPODP,
+ XSCMPUDP,
+ XSCVSPDPN,
+ XSMAXCDP,
+ XSMAXDP,
+ XSMAXJDP,
+ XSMINCDP,
+ XSMINDP,
+ XSMINJDP,
+ XSTDIVDP,
+ XSTSQRTDP,
+ XSTSTDCDP,
+ XSTSTDCSP,
+ XSXSIGDP,
+ XVCMPEQDP,
+ XVCMPEQDPo,
+ XVCMPEQSP,
+ XVCMPEQSPo,
+ XVCMPGEDP,
+ XVCMPGEDPo,
+ XVCMPGESP,
+ XVCMPGESPo,
+ XVCMPGTDP,
+ XVCMPGTDPo,
+ XVCMPGTSP,
+ XVCMPGTSPo,
+ XVIEXPSP,
+ XVMAXDP,
+ XVMAXSP,
+ XVMINDP,
+ XVMINSP,
+ XVTDIVDP,
+ XVTDIVSP,
+ XVTSQRTDP,
+ XVTSQRTSP,
+ XVTSTDCDP,
+ XVTSTDCSP,
+ XVXEXPSP,
+ XVXSIGDP,
+ XVXSIGSP
+)>;
+
+def : InstRW<[P9_ALUE_4C, P9_ALUO_4C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+ (instrs
+ VABSDUB,
+ VABSDUH,
+ VABSDUW,
+ VADDSBS,
+ VADDSHS,
+ VADDSWS,
+ VADDUBS,
+ VADDUHS,
+ VADDUWS,
+ VAVGSB,
+ VAVGSH,
+ VAVGSW,
+ VAVGUB,
+ VAVGUH,
+ VAVGUW,
+ VBPERMD,
+ VCLZB,
+ VCLZD,
+ VCLZH,
+ VCLZW,
+ VCMPBFP,
+ VCMPBFPo,
+ VCMPGTFP,
+ VCMPGTFPo,
+ VCTZB,
+ VCTZD,
+ VCTZH,
+ VCTZW,
+ VMAXFP,
+ VMAXSB,
+ VMAXSD,
+ VMAXSH,
+ VMAXSW,
+ VMAXUB,
+ VMAXUD,
+ VMAXUH,
+ VMAXUW,
+ VMINFP,
+ VCMPEQFP,
+ VCMPEQFPo,
+ VCMPGEFP,
+ VCMPGEFPo
+)>;
+
+def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+ (instrs
+ VADDFP,
+ VCTSXS,
+ VCTSXS_0,
+ VCTUXS,
+ VCTUXS_0,
+ VEXPTEFP,
+ VLOGEFP,
+ VMADDFP,
+ VMHADDSHS,
+ VNMSUBFP,
+ VREFP,
+ VRFIM,
+ VRFIN,
+ VRFIP,
+ VRFIZ,
+ VRSQRTEFP,
+ VSUBFP,
+ XVADDDP,
+ XVADDSP,
+ XVCVDPSP,
+ XVCVDPSXDS,
+ XVCVDPSXWS,
+ XVCVDPUXDS,
+ XVCVDPUXWS,
+ XVCVHPSP,
+ XVCVSPDP,
+ XVCVSPHP,
+ XVCVSPSXDS,
+ XVCVSPSXWS,
+ XVCVSPUXDS,
+ XVCVSPUXWS,
+ XVCVSXDDP,
+ XVCVSXDSP,
+ XVCVSXWDP,
+ XVCVSXWSP,
+ XVCVUXDDP,
+ XVCVUXDSP,
+ XVCVUXWDP,
+ XVCVUXWSP,
+ XVMADDADP,
+ XVMADDASP,
+ XVMADDMDP,
+ XVMADDMSP,
+ XVMSUBADP,
+ XVMSUBASP,
+ XVMSUBMDP,
+ XVMSUBMSP,
+ XVMULDP,
+ XVMULSP,
+ XVNMADDADP,
+ XVNMADDASP,
+ XVNMADDMDP,
+ XVNMADDMSP,
+ XVNMSUBADP,
+ XVNMSUBASP,
+ XVNMSUBMDP,
+ XVNMSUBMSP,
+ XVRDPI,
+ XVRDPIC,
+ XVRDPIM,
+ XVRDPIP,
+ XVRDPIZ,
+ XVREDP,
+ XVRESP,
+ XVRSPI,
+ XVRSPIC,
+ XVRSPIM,
+ XVRSPIP,
+ XVRSPIZ,
+ XVRSQRTEDP,
+ XVRSQRTESP,
+ XVSUBDP,
+ XVSUBSP,
+ VCFSX,
+ VCFSX_0,
+ VCFUX,
+ VCFUX_0,
+ VMHRADDSHS,
+ VMLADDUHM,
+ VMSUMMBM,
+ VMSUMSHM,
+ VMSUMSHS,
+ VMSUMUBM,
+ VMSUMUHM,
+ VMSUMUHS,
+ VMULESB,
+ VMULESH,
+ VMULESW,
+ VMULEUB,
+ VMULEUH,
+ VMULEUW,
+ VMULOSB,
+ VMULOSH,
+ VMULOSW,
+ VMULOUB,
+ VMULOUH,
+ VMULOUW,
+ VMULUWM,
+ VSUM2SWS,
+ VSUM4SBS,
+ VSUM4SHS,
+ VSUM4UBS,
+ VSUMSWS
+)>;
+
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSMADDADP,
+ XSMADDASP,
+ XSMADDMDP,
+ XSMADDMSP,
+ XSMSUBADP,
+ XSMSUBASP,
+ XSMSUBMDP,
+ XSMSUBMSP,
+ XSMULDP,
+ XSMULSP,
+ XSNMADDADP,
+ XSNMADDASP,
+ XSNMADDMDP,
+ XSNMADDMSP,
+ XSNMSUBADP,
+ XSNMSUBASP,
+ XSNMSUBMDP,
+ XSNMSUBMSP
+)>;
+
+
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSADDDP,
+ XSADDSP,
+ XSCVDPHP,
+ XSCVDPSP,
+ XSCVDPSXDS,
+ XSCVDPSXWS,
+ XSCVDPUXDS,
+ XSCVDPUXWS,
+ XSCVHPDP,
+ XSCVSPDP,
+ XSCVSXDDP,
+ XSCVSXDSP,
+ XSCVUXDDP,
+ XSCVUXDSP,
+ XSRDPI,
+ XSRDPIC,
+ XSRDPIM,
+ XSRDPIP,
+ XSRDPIZ,
+ XSREDP,
+ XSRESP,
+ //XSRSP,
+ XSRSQRTEDP,
+ XSRSQRTESP,
+ XSSUBDP,
+ XSSUBSP,
+ XSCVDPSPN
+)>;
+
+def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
+ (instrs
+ VBPERMQ,
+ VCLZLSBB,
+ VCTZLSBB,
+ VEXTRACTD,
+ VEXTRACTUB,
+ VEXTRACTUH,
+ VEXTRACTUW,
+ VEXTUBLX,
+ VEXTUBRX,
+ VEXTUHLX,
+ VEXTUHRX,
+ VEXTUWLX,
+ VEXTUWRX,
+ VGBBD,
+ VINSERTB,
+ VINSERTD,
+ VINSERTH,
+ VINSERTW,
+ VMRGHB,
+ VMRGHH,
+ VMRGHW,
+ VMRGLB,
+ VMRGLH,
+ VMRGLW,
+ VPERM,
+ VPERMR,
+ VPERMXOR,
+ VPKPX,
+ VPKSDSS,
+ VPKSDUS,
+ VPKSHSS,
+ VPKSHUS,
+ VPKSWSS,
+ VPKSWUS,
+ VPKUDUM,
+ VPKUDUS,
+ VPKUHUM,
+ VPKUHUS,
+ VPKUWUM,
+ VPKUWUS,
+ VPRTYBQ,
+ VSL,
+ VSLDOI,
+ VSLO,
+ VSLV,
+ VSPLTB,
+ VSPLTH,
+ VSPLTISB,
+ VSPLTISH,
+ VSPLTISW,
+ VSPLTW,
+ VSR,
+ VSRO,
+ VSRV,
+ VUPKHPX,
+ VUPKHSB,
+ VUPKHSH,
+ VUPKHSW,
+ VUPKLPX,
+ VUPKLSB,
+ VUPKLSH,
+ VUPKLSW,
+ XXBRD,
+ XXBRH,
+ XXBRQ,
+ XXBRW,
+ XXEXTRACTUW,
+ XXINSERTW,
+ XXMRGHW,
+ XXMRGLW,
+ XXPERM,
+ XXPERMR,
+ XXSLDWI,
+ XXSPLTIB,
+ XXSPLTW,
+ VADDCUQ,
+ VADDECUQ,
+ VADDEUQM,
+ VADDUQM,
+ VMUL10CUQ,
+ VMUL10ECUQ,
+ VMUL10EUQ,
+ VMUL10UQ,
+ VSUBCUQ,
+ VSUBECUQ,
+ VSUBEUQM,
+ VSUBUQM,
+ XSCMPEXPQP,
+ XSCMPOQP,
+ XSCMPUQP,
+ XSTSTDCQP,
+ XSXSIGQP
+)>;
+
+def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSADDQP,
+ XSADDQPO,
+ XSCVDPQP,
+ XSCVQPDP,
+ XSCVQPDPO,
+ XSCVQPSDZ,
+ XSCVQPSWZ,
+ XSCVQPUDZ,
+ XSCVQPUWZ,
+ XSCVSDQP,
+ XSCVUDQP,
+ XSRQPI,
+ XSRQPXP,
+ XSSUBQP,
+ XSSUBQPO
+)>;
+
+def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSMADDQP,
+ XSMADDQPO,
+ XSMSUBQP,
+ XSMSUBQPO,
+ XSMULQP,
+ XSMULQPO,
+ XSNMADDQP,
+ XSNMADDQPO,
+ XSNMSUBQP,
+ XSNMSUBQPO
+)>;
+
+def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSDIVQP,
+ XSDIVQPO
+)>;
+
+def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSSQRTQP,
+ XSSQRTQPO
+)>;
+
+// Load Operation in IIC_LdStLFD
+
+def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
+ (instrs
+ LXSDX,
+ LXVD2X,
+ LXSIWZX,
+ LXV,
+ LXSD
+)>;
+
+def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LFIWZX,
+ LFDX,
+ LFD
+)>;
+
+def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LXSSPX,
+ LXSIWAX,
+ LXSSP
+)>;
+
+def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LFIWAX,
+ LFSX,
+ LFS
+)>;
+
+def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ LXVDSX,
+ LXVW4X
+)>;
+
+// Store Operations in IIC_LdStSTFD.
+
+def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ STFS,
+ STFD,
+ STFIWX,
+ STFSX,
+ STFDX,
+ STXSDX,
+ STXSSPX,
+ STXSIWX
+)>;
+
+def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C],
+ (instrs
+ STXVD2X,
+ STXVW4X
+)>;
+
+
+// Divide Operations in IIC_IntDivW, IIC_IntDivD.
+
+def : InstRW<[P9_DIV_16C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+ (instrs
+ DIVW,
+ DIVWU
+)>;
+
+def : InstRW<[P9_DIV_24C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+ (instrs
+ DIVWE,
+ DIVD,
+ DIVWEU,
+ DIVDU
+)>;
+
+def : InstRW<[P9_DIV_40C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+ (instrs
+ DIVDE,
+ DIVDEU
+)>;
+
+def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ DIVWEo,
+ DIVWEUo
+)>;
+
+def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ DIVDEo,
+ DIVDEUo
+)>;
+
+// Rotate Operations in IIC_IntRotateD, IIC_IntRotateDI
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ SLD,
+ SRD,
+ SRAD,
+ SRADI,
+ RLDIC
+)>;
+
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ RLDCL,
+ RLDCR,
+ RLDIMI,
+ RLDICL,
+ RLDICR,
+ RLDICL_32_64
+)>;
+
+// CR access instructions in IIC_BrMCR, IIC_BrMCRX.
+
+def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ MTOCRF,
+ MTOCRF8,
+ MTCRF,
+ MTCRF8
+)>;
+
+def : InstRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ MCRF,
+ MCRXRX
+)>;
+
+def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ MCRFS
+)>;
+
+// FP Div instructions in IIC_FPDivD and IIC_FPDivS.
+
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FDIV,
+ XSDIVDP
+)>;
+
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FDIVS,
+ XSDIVSP
+)>;
+
+def : InstRW<[P9_DP_24C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+ (instrs
+ XVDIVSP
+)>;
+
+def : InstRW<[P9_DP_33C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+ (instrs
+ XVDIVDP
+)>;
+
+// FP Instructions in IIC_FPGeneral, IIC_FPFused
+
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FRSP,
+ FRIND,
+ FRINS,
+ FRIPD,
+ FRIPS,
+ FRIZD,
+ FRIZS,
+ FRIMD,
+ FRIMS,
+ FRE,
+ FRES,
+ FRSQRTE,
+ FRSQRTES,
+ FMADDS,
+ FMADD,
+ FMSUBS,
+ FMSUB,
+ FNMADDS,
+ FNMADD,
+ FNMSUBS,
+ FNMSUB,
+ FSELD,
+ FSELS,
+ FADDS,
+ FMULS,
+ FMUL,
+ FSUBS,
+ FCFID,
+ FCTID,
+ FCTIDZ,
+ FCFIDU,
+ FCFIDS,
+ FCFIDUS,
+ FCTIDUZ,
+ FCTIWUZ,
+ FCTIW,
+ FCTIWZ
+)>;
+
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FMR,
+ FABSD,
+ FABSS,
+ FNABSD,
+ FNABSS,
+ FNEGD,
+ FNEGS,
+ FCPSGND,
+ FCPSGNS
+)>;
+
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FCMPUS,
+ FCMPUD
+)>;
+
+// Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX.
+
+def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C,
+ IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LFSU,
+ LFSUX
+)>;
+
+def : InstRW<[P9_LS_5C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LFDU,
+ LFDUX
+)>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h
new file mode 100644
index 000000000000..e01f49dce81e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.h
@@ -0,0 +1,104 @@
+//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// PowerPC back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPC_H
+#define LLVM_LIB_TARGET_POWERPC_PPC_H
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+ // Forward declarations; only pointers and references to these types are
+ // used in the declarations below.
+ class PPCTargetMachine;
+ class PassRegistry;
+ class FunctionPass;
+ class ImmutablePass;
+ class MachineInstr;
+ class AsmPrinter;
+ class MCInst;
+
+ // Entry points that each return a FunctionPass pointer. Some take the
+ // PPCTargetMachine; createPPCCTRLoopsVerify is declared only when NDEBUG is
+ // not defined.
+ FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM);
+#ifndef NDEBUG
+ FunctionPass *createPPCCTRLoopsVerify();
+#endif
+ FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM);
+ FunctionPass *createPPCTOCRegDepsPass();
+ FunctionPass *createPPCEarlyReturnPass();
+ FunctionPass *createPPCVSXCopyPass();
+ FunctionPass *createPPCVSXFMAMutatePass();
+ FunctionPass *createPPCVSXSwapRemovalPass();
+ FunctionPass *createPPCMIPeepholePass();
+ FunctionPass *createPPCBranchSelectionPass();
+ FunctionPass *createPPCQPXLoadSplatPass();
+ FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+ FunctionPass *createPPCTLSDynamicCallPass();
+ FunctionPass *createPPCBoolRetToIntPass();
+ // Takes a MachineInstr and an MCInst output parameter (OutMI).
+ // NOTE(review): presumably fills OutMI with the lowered form of MI; confirm
+ // against the definition.
+ void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+ AsmPrinter &AP, bool isDarwin);
+
+ // Pass-registry initialization hooks and the externally visible ID of the
+ // VSX FMA mutate pass.
+ void initializePPCVSXFMAMutatePass(PassRegistry&);
+ void initializePPCBoolRetToIntPass(PassRegistry&);
+ extern char &PPCVSXFMAMutateID;
+
+ namespace PPCII {
+
+ /// Target Operand Flag enum.
+ ///
+ /// The low four bits hold independent flag bits (1, 2, 4, 8). Bits 4-7
+ /// (MO_ACCESS_MASK) hold one of several mutually exclusive values.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // PPC Specific MachineOperand flags.
+ MO_NO_FLAG,
+
+ /// On a symbol operand "FOO", this indicates that the reference is actually
+ /// to "FOO@plt". This is used for calls and jumps to external functions and
+ /// for PIC calls on Linux and ELF systems.
+ MO_PLT = 1,
+
+ /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
+ /// the function's picbase, e.g. lo16(symbol-picbase).
+ MO_PIC_FLAG = 2,
+
+ /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
+ /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
+ MO_NLP_FLAG = 4,
+
+ /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
+ /// symbol with hidden visibility. This causes a different kind of
+ /// non-lazy-pointer to be generated.
+ MO_NLP_HIDDEN_FLAG = 8,
+
+ /// The next are not flags but distinct values.
+ /// Each is a small integer (1..8) shifted left by 4, so all of them fall
+ /// inside MO_ACCESS_MASK.
+ MO_ACCESS_MASK = 0xf0,
+
+ /// MO_LO, MO_HA - lo16(symbol) and ha16(symbol)
+ MO_LO = 1 << 4,
+ MO_HA = 2 << 4,
+
+ // NOTE(review): presumably the thread-pointer-relative counterparts of
+ // MO_LO / MO_HA; confirm. Note that LO is 4 and HA is 3, i.e. they are not
+ // listed in numeric order.
+ MO_TPREL_LO = 4 << 4,
+ MO_TPREL_HA = 3 << 4,
+
+ /// These values identify relocations on immediates folded
+ /// into memory operations.
+ MO_DTPREL_LO = 5 << 4,
+ MO_TLSLD_LO = 6 << 4,
+ MO_TOC_LO = 7 << 4,
+
+ // Symbol for VK_PPC_TLS fixup attached to an ADD instruction
+ MO_TLS = 8 << 4
+ };
+ } // end namespace PPCII
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td
new file mode 100644
index 000000000000..46502208b175
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.td
@@ -0,0 +1,468 @@
+//===-- PPC.td - Describe the PowerPC Target Machine -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC Subtarget features.
+//
+
+//===----------------------------------------------------------------------===//
+// CPU Directives //
+//===----------------------------------------------------------------------===//
+
+// Each record below sets the subtarget attribute DarwinDirective to one
+// PPC::DIR_* value. The feature name and description strings are left empty.
+//
+// NOTE(review): Directive604 and Directive620 both select PPC::DIR_603 rather
+// than a value of their own. Presumably this is intentional (no separate
+// DIR_604 / DIR_620 value is visible here); confirm against the definition
+// of the PPC::DIR_* enumerators.
+def Directive440 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_440", "">;
+def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">;
+def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">;
+def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">;
+def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">;
+def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
+def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
+def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
+def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
+ "PPC::DIR_E500mc", "">;
+def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective",
+ "PPC::DIR_E5500", "">;
+def DirectivePwr3: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR3", "">;
+def DirectivePwr4: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR4", "">;
+def DirectivePwr5: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5", "">;
+def DirectivePwr5x
+ : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">;
+def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
+def DirectivePwr6x
+ : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">;
+def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
+def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">;
+def DirectivePwr9: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR9", "">;
+
+def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true",
+ "Enable 64-bit instructions">;
+def FeatureHardFloat : SubtargetFeature<"hard-float", "HasHardFloat", "true",
+ "Enable floating-point instructions">;
+def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
+ "Enable 64-bit registers usage for ppc32 [beta]">;
+def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true",
+ "Use condition-register bits individually">;
+def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true",
+ "Enable Altivec instructions",
+ [FeatureHardFloat]>;
+def FeatureSPE : SubtargetFeature<"spe","HasSPE", "true",
+ "Enable SPE instructions",
+ [FeatureHardFloat]>;
+def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
+ "Enable the MFOCRF instruction">;
+def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
+ "Enable the fsqrt instruction",
+ [FeatureHardFloat]>;
+def FeatureFCPSGN : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
+ "Enable the fcpsgn instruction",
+ [FeatureHardFloat]>;
+def FeatureFRE : SubtargetFeature<"fre", "HasFRE", "true",
+ "Enable the fre instruction",
+ [FeatureHardFloat]>;
+def FeatureFRES : SubtargetFeature<"fres", "HasFRES", "true",
+ "Enable the fres instruction",
+ [FeatureHardFloat]>;
+def FeatureFRSQRTE : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
+ "Enable the frsqrte instruction",
+ [FeatureHardFloat]>;
+def FeatureFRSQRTES : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
+ "Enable the frsqrtes instruction",
+ [FeatureHardFloat]>;
+def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
+ "Assume higher precision reciprocal estimates">;
+def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
+ "Enable the stfiwx instruction",
+ [FeatureHardFloat]>;
+def FeatureLFIWAX : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
+ "Enable the lfiwax instruction",
+ [FeatureHardFloat]>;
+def FeatureFPRND : SubtargetFeature<"fprnd", "HasFPRND", "true",
+ "Enable the fri[mnpz] instructions",
+ [FeatureHardFloat]>;
+def FeatureFPCVT : SubtargetFeature<"fpcvt", "HasFPCVT", "true",
+ "Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions",
+ [FeatureHardFloat]>;
+def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true",
+ "Enable the isel instruction">;
+def FeatureBPERMD : SubtargetFeature<"bpermd", "HasBPERMD", "true",
+ "Enable the bpermd instruction">;
+def FeatureExtDiv : SubtargetFeature<"extdiv", "HasExtDiv", "true",
+ "Enable extended divide instructions">;
+def FeatureLDBRX : SubtargetFeature<"ldbrx","HasLDBRX", "true",
+ "Enable the ldbrx instruction">;
+def FeatureCMPB : SubtargetFeature<"cmpb", "HasCMPB", "true",
+ "Enable the cmpb instruction">;
+def FeatureICBT : SubtargetFeature<"icbt","HasICBT", "true",
+ "Enable icbt instruction">;
+def FeatureBookE : SubtargetFeature<"booke", "IsBookE", "true",
+ "Enable Book E instructions",
+ [FeatureICBT]>;
+def FeatureMSYNC : SubtargetFeature<"msync", "HasOnlyMSYNC", "true",
+ "Has only the msync instruction instead of sync",
+ [FeatureBookE]>;
+def FeatureE500 : SubtargetFeature<"e500", "IsE500", "true",
+ "Enable E500/E500mc instructions">;
+def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true",
+ "Enable PPC 4xx instructions">;
+def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true",
+ "Enable PPC 6xx instructions">;
+def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true",
+ "Enable QPX instructions",
+ [FeatureHardFloat]>;
+def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true",
+ "Enable VSX instructions",
+ [FeatureAltivec]>;
+def FeatureP8Altivec : SubtargetFeature<"power8-altivec", "HasP8Altivec", "true",
+ "Enable POWER8 Altivec instructions",
+ [FeatureAltivec]>;
+def FeatureP8Crypto : SubtargetFeature<"crypto", "HasP8Crypto", "true",
+ "Enable POWER8 Crypto instructions",
+ [FeatureP8Altivec]>;
+def FeatureP8Vector : SubtargetFeature<"power8-vector", "HasP8Vector", "true",
+ "Enable POWER8 vector instructions",
+ [FeatureVSX, FeatureP8Altivec]>;
+def FeatureDirectMove :
+ SubtargetFeature<"direct-move", "HasDirectMove", "true",
+ "Enable Power8 direct move instructions",
+ [FeatureVSX]>;
+def FeaturePartwordAtomic : SubtargetFeature<"partword-atomics",
+ "HasPartwordAtomics", "true",
+ "Enable l[bh]arx and st[bh]cx.">;
+def FeatureInvariantFunctionDescriptors :
+ SubtargetFeature<"invariant-function-descriptors",
+ "HasInvariantFunctionDescriptors", "true",
+ "Assume function descriptors are invariant">;
+def FeatureLongCall : SubtargetFeature<"longcall", "UseLongCalls", "true",
+ "Always use indirect calls">;
+def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true",
+ "Enable Hardware Transactional Memory instructions">;
+def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true",
+ "Implement mftb using the mfspr instruction">;
+def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true",
+ "Target supports add/load integer fusion.">;
+def FeatureFloat128 :
+ SubtargetFeature<"float128", "HasFloat128", "true",
+ "Enable the __float128 data type for IEEE-754R Binary128.",
+ [FeatureVSX]>;
+def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD",
+ "POPCNTD_Fast",
+ "Enable the popcnt[dw] instructions">;
+// Note that for the a2/a2q processor models we should not use popcnt[dw] by
+// default. These processors do support the instructions, but they're
+// microcoded, and the software emulation is about twice as fast.
+def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD",
+ "POPCNTD_Slow",
+ "Has slow popcnt[dw] instructions">;
+
+def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true",
+ "Treat vector data stream cache control instructions as deprecated">;
+
+def FeatureISA3_0 : SubtargetFeature<"isa-v30-instructions", "IsISA3_0",
+ "true",
+ "Enable instructions added in ISA 3.0.">;
+def FeatureP9Altivec : SubtargetFeature<"power9-altivec", "HasP9Altivec", "true",
+ "Enable POWER9 Altivec instructions",
+ [FeatureISA3_0, FeatureP8Altivec]>;
+def FeatureP9Vector : SubtargetFeature<"power9-vector", "HasP9Vector", "true",
+ "Enable POWER9 vector instructions",
+ [FeatureISA3_0, FeatureP8Vector,
+ FeatureP9Altivec]>;
+
+// Since new processors generally contain a superset of features of those that
+// came before them, the idea is to make implementations of new processors
+// less error prone and easier to read.
+// Namely:
+// list<SubtargetFeature> Power8FeatureList = ...
+// list<SubtargetFeature> FutureProcessorSpecificFeatureList =
+// [ features that Power8 does not support ]
+// list<SubtargetFeature> FutureProcessorFeatureList =
+// !listconcat(Power8FeatureList, FutureProcessorSpecificFeatureList)
+
+// This makes it explicit and obvious what is new in FutureProcessor vs. Power8,
+// and provides a single point of definition if the feature set is used
+// elsewhere.
+//
+// NOTE(review): because the lists are concatenated, a later list also carries
+// every earlier Directive* entry (Power9FeatureList contains DirectivePwr7,
+// DirectivePwr8 and DirectivePwr9). How the final DarwinDirective value is
+// chosen is not visible here; confirm.
+def ProcessorFeatures {
+ // Base list; Feature64BitRegs is deliberately left commented out.
+ list<SubtargetFeature> Power7FeatureList =
+ [DirectivePwr7, FeatureAltivec, FeatureVSX,
+ FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
+ FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+ FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+ FeatureFPRND, FeatureFPCVT, FeatureISEL,
+ FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX,
+ Feature64Bit /*, Feature64BitRegs */,
+ FeatureBPERMD, FeatureExtDiv,
+ FeatureMFTB, DeprecatedDST];
+ // Features added on top of Power7 for Power8.
+ list<SubtargetFeature> Power8SpecificFeatures =
+ [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto,
+ FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic,
+ FeatureFusion];
+ // Power8 = Power7 list followed by the Power8-specific list.
+ list<SubtargetFeature> Power8FeatureList =
+ !listconcat(Power7FeatureList, Power8SpecificFeatures);
+ // Features added on top of Power8 for Power9. FeatureISA3_0 is listed
+ // explicitly even though FeatureP9Altivec and FeatureP9Vector already name
+ // it in their implied-feature lists.
+ list<SubtargetFeature> Power9SpecificFeatures =
+ [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0];
+ // Power9 = Power8 list followed by the Power9-specific list.
+ list<SubtargetFeature> Power9FeatureList =
+ !listconcat(Power8FeatureList, Power9SpecificFeatures);
+}
+
+// Note: Future features to add when support is extended to more
+// recent ISA levels:
+//
+// DFP p6, p6x, p7 decimal floating-point instructions
+// POPCNTB p5 through p7 popcntb and related instructions
+
+//===----------------------------------------------------------------------===//
+// Classes used for relation maps.
+//===----------------------------------------------------------------------===//
+// RecFormRel - Filter class used to relate non-record-form instructions with
+// their record-form variants (see getRecordFormOpcode/getNonRecordFormOpcode).
+class RecFormRel;
+
+// AltVSXFMARel - Filter class used to relate the primary addend-killing VSX
+// FMA instruction forms with their corresponding factor-killing forms.
+class AltVSXFMARel {
+ bit IsVSXFMAAlt = 0; // 0 is the key column of getAltVSXFMAOpcode, 1 its value column
+}
+
+//===----------------------------------------------------------------------===//
+// Relation Map Definitions.
+//===----------------------------------------------------------------------===//
+
+def getRecordFormOpcode : InstrMapping {
+ let FilterClass = "RecFormRel";
+ // Instructions with the same BaseName and Interpretation64Bit values
+ // form a row.
+ let RowFields = ["BaseName", "Interpretation64Bit"];
+ // Instructions with the same RC value form a column.
+ let ColFields = ["RC"];
+ // The key column holds the non-record-form instructions (RC=0).
+ let KeyCol = ["0"];
+ // The value column holds the record-form instructions (RC=1).
+ let ValueCols = [["1"]];
+}
+
+def getNonRecordFormOpcode : InstrMapping {
+ let FilterClass = "RecFormRel";
+ // Instructions with the same BaseName and Interpretation64Bit values
+ // form a row.
+ let RowFields = ["BaseName", "Interpretation64Bit"];
+ // Instructions with the same RC value form a column.
+ let ColFields = ["RC"];
+ // The key column holds the record-form instructions (RC=1).
+ let KeyCol = ["1"];
+ // The value column holds the non-record-form instructions (RC=0).
+ let ValueCols = [["0"]];
+}
+
+def getAltVSXFMAOpcode : InstrMapping {
+ let FilterClass = "AltVSXFMARel";
+ // Instructions with the same BaseName value form a row.
+ // (Unlike the record-form maps, Interpretation64Bit is not a row field.)
+ let RowFields = ["BaseName"];
+ // Instructions with the same IsVSXFMAAlt value form a column.
+ let ColFields = ["IsVSXFMAAlt"];
+ // The key column holds the (default) addend-killing instructions.
+ let KeyCol = ["0"];
+ // The value column holds the IsVSXFMAAlt=1 instructions.
+ let ValueCols = [["1"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "PPCRegisterInfo.td"
+include "PPCSchedule.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC processors supported.
+//
+
+def : Processor<"generic", G3Itineraries, [Directive32, FeatureHardFloat,
+ FeatureMFTB]>;
+def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureICBT, FeatureBookE,
+ FeatureMSYNC, FeatureMFTB]>;
+def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL, // same model and features as "440"
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureICBT, FeatureBookE,
+ FeatureMSYNC, FeatureMFTB]>;
+def : Processor<"601", G3Itineraries, [Directive601, FeatureHardFloat]>; // only entry in this run without FeatureMFTB
+def : Processor<"602", G3Itineraries, [Directive602, FeatureHardFloat,
+ FeatureMFTB]>;
+def : Processor<"603", G3Itineraries, [Directive603,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"603e", G3Itineraries, [Directive603,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"603ev", G3Itineraries, [Directive603,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"604", G3Itineraries, [Directive604,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"604e", G3Itineraries, [Directive604,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"620", G3Itineraries, [Directive620,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"750", G4Itineraries, [Directive750, // NOTE(review): G4Itineraries here, G3Itineraries for "g3" below - confirm
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"g3", G3Itineraries, [Directive750,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec, // 7450 and g4+ reuse Directive7400
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureMFTB]>;
+
+def : ProcessorModel<"970", G5Model,
+ [Directive970, FeatureAltivec,
+ FeatureMFOCRF, FeatureFSqrt,
+ FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX,
+ Feature64Bit /*, Feature64BitRegs */,
+ FeatureMFTB]>;
+def : ProcessorModel<"g5", G5Model,
+ [Directive970, FeatureAltivec,
+ FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+ FeatureFRES, FeatureFRSQRTE,
+ Feature64Bit /*, Feature64BitRegs */,
+ FeatureMFTB, DeprecatedDST]>; // NOTE(review): "970" above omits DeprecatedDST - confirm intended
+def : ProcessorModel<"e500mc", PPCE500mcModel,
+ [DirectiveE500mc,
+ FeatureSTFIWX, FeatureICBT, FeatureBookE,
+ FeatureISEL, FeatureMFTB]>;
+def : ProcessorModel<"e5500", PPCE5500Model,
+ [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
+ FeatureSTFIWX, FeatureICBT, FeatureBookE,
+ FeatureISEL, FeatureMFTB]>;
+def : ProcessorModel<"a2", PPCA2Model,
+ [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
+ FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
+ FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+ FeatureSTFIWX, FeatureLFIWAX,
+ FeatureFPRND, FeatureFPCVT, FeatureISEL,
+ FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
+ Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>;
+def : ProcessorModel<"a2q", PPCA2Model,
+ [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
+ FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
+ FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+ FeatureSTFIWX, FeatureLFIWAX,
+ FeatureFPRND, FeatureFPCVT, FeatureISEL,
+ FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
+ Feature64Bit /*, Feature64BitRegs */, FeatureQPX, // a2q is the "a2" feature set plus FeatureQPX
+ FeatureMFTB]>;
+def : ProcessorModel<"pwr3", G5Model,
+ [DirectivePwr3, FeatureAltivec,
+ FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
+ FeatureSTFIWX, Feature64Bit]>; // no FeatureMFTB, unlike pwr4 through pwr6x below
+def : ProcessorModel<"pwr4", G5Model,
+ [DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
+ FeatureFSqrt, FeatureFRES, FeatureFRSQRTE,
+ FeatureSTFIWX, Feature64Bit, FeatureMFTB]>;
+def : ProcessorModel<"pwr5", G5Model,
+ [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
+ FeatureFSqrt, FeatureFRE, FeatureFRES,
+ FeatureFRSQRTE, FeatureFRSQRTES,
+ FeatureSTFIWX, Feature64Bit,
+ FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr5x", G5Model,
+ [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
+ FeatureFSqrt, FeatureFRE, FeatureFRES,
+ FeatureFRSQRTE, FeatureFRSQRTES,
+ FeatureSTFIWX, FeatureFPRND, Feature64Bit,
+ FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr6", G5Model,
+ [DirectivePwr6, FeatureAltivec,
+ FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
+ FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+ FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB,
+ FeatureFPRND, Feature64Bit /*, Feature64BitRegs */,
+ FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr6x", G5Model,
+ [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, // NOTE(review): pwr6x uses DirectivePwr5x - confirm this is intended
+ FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
+ FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+ FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB,
+ FeatureFPRND, Feature64Bit,
+ FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.Power7FeatureList>; // lists defined in ProcessorFeatures
+def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>;
+def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.Power9FeatureList>;
+def : Processor<"ppc", G3Itineraries, [Directive32, FeatureHardFloat, // "ppc" and "ppc32" are identical
+ FeatureMFTB]>;
+def : Processor<"ppc32", G3Itineraries, [Directive32, FeatureHardFloat,
+ FeatureMFTB]>;
+def : ProcessorModel<"ppc64", G5Model,
+ [Directive64, FeatureAltivec,
+ FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
+ FeatureFRSQRTE, FeatureSTFIWX,
+ Feature64Bit /*, Feature64BitRegs */,
+ FeatureMFTB]>;
+def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.Power8FeatureList>; // same model and features as "pwr8"
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "PPCCallingConv.td"
+
+def PPCInstrInfo : InstrInfo { // Instruction-set record referenced by the PPC target below.
+ let isLittleEndianEncoding = 1;
+
+ // FIXME: Unset this when no longer needed!
+ let decodePositionallyEncodedOperands = 1;
+
+ let noNamedPositionallyEncodedOperands = 1;
+}
+
+def PPCAsmParser : AsmParser { // Parser record referenced by the PPC target below.
+ let ShouldEmitMatchRegisterName = 0;
+}
+
+def PPCAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+
+ // We do not use hard coded registers in asm strings. However, some
+ // InstAlias definitions use immediate literals. Set RegisterPrefix
+ // so that those are not misinterpreted as registers.
+ string RegisterPrefix = "%";
+ string BreakCharacters = ".";
+}
+
+def PPC : Target {
+ // Information about the instructions.
+ let InstructionSet = PPCInstrInfo;
+ // Assembly parser and the single parser variant (Variant 0) defined above.
+ let AssemblyParsers = [PPCAsmParser];
+ let AssemblyParserVariants = [PPCAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
new file mode 100644
index 000000000000..f0e0ebc4946c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -0,0 +1,1471 @@
+//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PowerPC assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+// Documentation at http://developer.apple.com/documentation/DeveloperTools/
+// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asmprinter"
+
+namespace {
+
+class PPCAsmPrinter : public AsmPrinter { // Common base of the Linux and Darwin printers.
+protected:
+  MapVector<MCSymbol *, MCSymbol *> TOC;   // symbol -> label of its TOC entry
+  const PPCSubtarget *Subtarget = nullptr; // assigned in runOnMachineFunction
+  StackMaps SM;
+
+public:
+  explicit PPCAsmPrinter(TargetMachine &TM,
+                         std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
+
+  StringRef getPassName() const override { return "PowerPC Assembly Printer"; }
+
+  MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
+
+  bool doInitialization(Module &M) override {
+    // Start every module with an empty TOC map; clear() is a no-op when empty.
+    TOC.clear();
+    return AsmPrinter::doInitialization(M);
+  }
+
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+
+  void EmitEndOfAsmFile(Module &M) override;
+
+  void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI);
+  void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
+  void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    Subtarget = &MF.getSubtarget<PPCSubtarget>(); // refreshed for every function
+    return AsmPrinter::runOnMachineFunction(MF);
+  }
+};
+
+ /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux.
+ class PPCLinuxAsmPrinter : public PPCAsmPrinter {
+ public:
+ explicit PPCLinuxAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : PPCAsmPrinter(TM, std::move(Streamer)) {}
+
+ StringRef getPassName() const override {
+ return "Linux PPC Assembly Printer";
+ }
+ // Linux-specific overrides; only declared in this class body.
+ bool doFinalization(Module &M) override;
+ void EmitStartOfAsmFile(Module &M) override;
+
+ void EmitFunctionEntryLabel() override;
+
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+ };
+
+ /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
+ /// OS X.
+ class PPCDarwinAsmPrinter : public PPCAsmPrinter {
+ public:
+ explicit PPCDarwinAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : PPCAsmPrinter(TM, std::move(Streamer)) {}
+
+ StringRef getPassName() const override {
+ return "Darwin PPC Assembly Printer";
+ }
+ // Darwin-specific overrides; only declared in this class body.
+ bool doFinalization(Module &M) override;
+ void EmitStartOfAsmFile(Module &M) override;
+ };
+
+} // end anonymous namespace
+
+/// stripRegisterPrefix - Skip the leading register-class letters of a register
+/// name so that only the register number is left. Used for Linux asm output.
+static const char *stripRegisterPrefix(const char *RegName) {
+  const char Lead = RegName[0];
+
+  // Condition registers: drop "cr"; any other 'c' name is left untouched.
+  if (Lead == 'c')
+    return RegName[1] == 'r' ? RegName + 2 : RegName;
+
+  // Names outside the r/f/q(QPX)/v families are returned unchanged.
+  if (Lead != 'r' && Lead != 'f' && Lead != 'q' && Lead != 'v')
+    return RegName;
+
+  // Drop one letter, or two when the second letter is 's' (e.g. "vs12").
+  return RegName + (RegName[1] == 's' ? 2 : 1);
+}
+
+void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const DataLayout &DL = getDataLayout();
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ // Dispatch on the operand kind; every case prints to O and returns.
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register: {
+ unsigned Reg = MO.getReg();
+
+ // There are VSX instructions that use VSX register numbering (vs0 - vs63)
+ // as well as those that use VMX register numbering (v0 - v31 which
+ // correspond to vs32 - vs63). If we have an instruction that uses VSX
+ // numbering, we need to convert the VMX registers to VSX registers.
+ // Namely, we print 32-63 when the instruction operates on one of the
+ // VMX registers.
+ // (Please synchronize with PPCInstPrinter::printOperand)
+ if (MI->getDesc().TSFlags & PPCII::UseVSXReg) {
+ if (PPCInstrInfo::isVRRegister(Reg))
+ Reg = PPC::VSX32 + (Reg - PPC::V0);
+ else if (PPCInstrInfo::isVFRegister(Reg))
+ Reg = PPC::VSX32 + (Reg - PPC::VF0);
+ }
+ const char *RegName = PPCInstPrinter::getRegisterName(Reg);
+
+ // Linux assembler (Others?) does not take register mnemonics.
+ // FIXME - What about special registers used in mfspr/mtspr?
+ if (!Subtarget->isDarwin())
+ RegName = stripRegisterPrefix(RegName);
+ O << RegName;
+ return;
+ }
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ MO.getMBB()->getSymbol()->print(O, MAI);
+ return;
+ case MachineOperand::MO_ConstantPoolIndex: // <private prefix>CPI<function number>_<index>
+ O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+ return;
+ case MachineOperand::MO_BlockAddress:
+ GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
+ return;
+ case MachineOperand::MO_GlobalAddress: {
+ // Computing the address of a global symbol, not calling it.
+ const GlobalValue *GV = MO.getGlobal();
+ MCSymbol *SymToPrint;
+
+ // External or weakly linked global variables need non-lazily-resolved stubs
+ if (Subtarget->hasLazyResolverStub(GV)) {
+ SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(
+ SymToPrint);
+ if (!StubSym.getPointer()) // first reference: register the stub entry
+ StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+ !GV->hasInternalLinkage());
+ } else {
+ SymToPrint = getSymbol(GV);
+ }
+
+ SymToPrint->print(O, MAI);
+ // Follow the symbol with the operand's offset via printOffset.
+ printOffset(MO.getOffset(), O);
+ return;
+ }
+ // Any other operand kind is printed as a diagnostic placeholder.
+ default:
+ O << "<unknown operand type: " << (unsigned)MO.getType() << ">";
+ return;
+ }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+/// Returns true if the modifier or operand is rejected here, false otherwise.
+bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'c': // Don't print "$" before a global var name or constant.
+ break; // PPC never has a prefix.
+ case 'L': // Write second word of DImode reference.
+ // Verify that this operand has two consecutive registers.
+ if (!MI->getOperand(OpNo).isReg() ||
+ OpNo+1 == MI->getNumOperands() ||
+ !MI->getOperand(OpNo+1).isReg())
+ return true;
+ ++OpNo; // Print the following register operand instead.
+ break;
+ case 'I':
+ // Write 'i' if an integer constant, otherwise nothing. Used to print
+ // addi vs add, etc.
+ if (MI->getOperand(OpNo).isImm())
+ O << "i";
+ return false;
+ }
+ }
+ // No modifier, 'c', or 'L' after advancing OpNo: print the operand itself.
+ printOperand(MI, OpNo, O);
+ return false;
+}
+
+// At the moment, all inline asm memory operands are a single register.
+// In any case, the output of this routine should always be just one
+// assembler operand.
+// Returns true for an unknown modifier and false otherwise.
+bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'y': // A memory reference for an X-form instruction
+ {
+ const char *RegName = "r0"; // printed as "0" on non-Darwin targets
+ if (!Subtarget->isDarwin())
+ RegName = stripRegisterPrefix(RegName);
+ O << RegName << ", ";
+ printOperand(MI, OpNo, O);
+ return false;
+ }
+ case 'U': // Print 'u' for update form.
+ case 'X': // Print 'x' for indexed form.
+ {
+ // FIXME: Currently for PowerPC memory operands are always loaded
+ // into a register, so we never get an update or indexed form.
+ // This is bad even for offset forms, since even if we know we
+ // have a value in -16(r1), we will generate a load into r<n>
+ // and then load from 0(r<n>). Until that issue is fixed,
+ // tolerate 'U' and 'X' but don't output anything.
+ assert(MI->getOperand(OpNo).isReg());
+ return false;
+ }
+ }
+ }
+ // No modifier: print the register operand as a zero-displacement address.
+ assert(MI->getOperand(OpNo).isReg());
+ O << "0(";
+ printOperand(MI, OpNo, O);
+ O << ")";
+ return false;
+}
+
+/// lookUpOrCreateTOCEntry -- Return the symbol that labels the TOC entry for
+/// \p Sym. The first request for a given symbol creates a temporary "C"
+/// symbol and records it in the TOC map; later requests return the same one.
+MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
+  // operator[] inserts a null slot for a symbol that has no entry yet.
+  MCSymbol *&Slot = TOC[Sym];
+
+  return Slot ? Slot : (Slot = createTempSymbol("C"));
+}
+
+void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ SM.serializeToStackMapSection(); // write out what LowerSTACKMAP/LowerPATCHPOINT recorded in SM
+}
+
+void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) {
+ unsigned NumNOPBytes = MI.getOperand(1).getImm(); // requested shadow size in bytes
+
+ SM.recordStackMap(MI);
+ assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+ // Each following instruction is counted as 4 bytes of the requested shadow.
+ // Scan ahead to trim the shadow; stop at block end, calls, debug values and stackmap/patchpoint ops.
+ const MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::const_iterator MII(MI);
+ ++MII;
+ while (NumNOPBytes > 0) {
+ if (MII == MBB.end() || MII->isCall() ||
+ MII->getOpcode() == PPC::DBG_VALUE ||
+ MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MII->getOpcode() == TargetOpcode::STACKMAP)
+ break;
+ ++MII;
+ NumNOPBytes -= 4;
+ }
+
+ // Emit one NOP per 4 bytes of shadow that is still uncovered.
+ for (unsigned i = 0; i < NumNOPBytes; i += 4)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+}
+
+// Lower a patchpoint of the form [<def>], <id>, <numBytes>, <target>, <numArgs>
+// into a call sequence followed by NOP padding up to <numBytes>.
+void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
+  SM.recordPatchPoint(MI);
+  PatchPointOpers Opers(&MI);
+
+  unsigned EncodedBytes = 0; // Counts instructions until scaled to bytes below.
+  const MachineOperand &CalleeMO = Opers.getCallTarget();
+
+  if (CalleeMO.isImm()) {
+    int64_t CallTarget = CalleeMO.getImm();
+    if (CallTarget) {
+      assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+             "High 16 bits of call target should be zero.");
+      unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+      // Materialize the jump address (48 bits, 16 bits at a time):
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8)
+                                       .addReg(ScratchReg)
+                                       .addImm((CallTarget >> 32) & 0xFFFF));
+      ++EncodedBytes;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::RLDIC)
+                                       .addReg(ScratchReg)
+                                       .addReg(ScratchReg)
+                                       .addImm(32).addImm(16));
+      ++EncodedBytes;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORIS8)
+                                       .addReg(ScratchReg)
+                                       .addReg(ScratchReg)
+                                       .addImm((CallTarget >> 16) & 0xFFFF));
+      ++EncodedBytes;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORI8)
+                                       .addReg(ScratchReg)
+                                       .addReg(ScratchReg)
+                                       .addImm(CallTarget & 0xFFFF));
+      ++EncodedBytes; // The ORI8 must be counted too, or padding overshoots.
+
+      // Save the current TOC pointer before the remote call.
+      int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
+                                       .addReg(PPC::X2)
+                                       .addImm(TOCSaveOffset)
+                                       .addReg(PPC::X1));
+      ++EncodedBytes;
+
+      // If we're on ELFv1, then we need to load the actual function pointer
+      // from the function descriptor.
+      if (!Subtarget->isELFv2ABI()) {
+        // Load the new TOC pointer and the function address, but not r11
+        // (needing this is rare, and loading it here would prevent passing it
+        // via a 'nest' parameter).
+        EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
+                                         .addReg(PPC::X2)
+                                         .addImm(8)
+                                         .addReg(ScratchReg));
+        ++EncodedBytes;
+        EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
+                                         .addReg(ScratchReg)
+                                         .addImm(0)
+                                         .addReg(ScratchReg));
+        ++EncodedBytes;
+      }
+
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR8)
+                                       .addReg(ScratchReg));
+      ++EncodedBytes;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTRL8));
+      ++EncodedBytes;
+
+      // Restore the TOC pointer after the call.
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
+                                       .addReg(PPC::X2)
+                                       .addImm(TOCSaveOffset)
+                                       .addReg(PPC::X1));
+      ++EncodedBytes;
+    }
+  } else if (CalleeMO.isGlobal()) {
+    const GlobalValue *GValue = CalleeMO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext);
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL8_NOP)
+                                     .addExpr(SymVar));
+    EncodedBytes += 2; // BL8_NOP is counted as two instructions.
+  }
+
+  // Each instruction is 4 bytes.
+  EncodedBytes *= 4;
+
+  // Pad with NOPs up to the size requested by the patchpoint.
+  unsigned NumBytes = Opers.getNumPatchBytes();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+  assert((NumBytes - EncodedBytes) % 4 == 0 &&
+         "Invalid number of NOP bytes requested!");
+  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+}
+
+/// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a
+/// call to __tls_get_addr to the current output stream.
+void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
+ MCSymbolRefExpr::VariantKind VK) {
+ StringRef Name = "__tls_get_addr";
+ MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol(Name);
+ MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+ // Operands 0 and 1 must both be GPR3 (X3 on 64-bit targets, R3 otherwise).
+ assert(MI->getOperand(0).isReg() &&
+ ((Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::X3) ||
+ (!Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::R3)) &&
+ "GETtls[ld]ADDR[32] must define GPR3");
+ assert(MI->getOperand(1).isReg() &&
+ ((Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::X3) ||
+ (!Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::R3)) &&
+ "GETtls[ld]ADDR[32] must read GPR3");
+ // 32-bit, non-Darwin, position-independent code references the callee with VK_PLT.
+ if (!Subtarget->isPPC64() && !Subtarget->isDarwin() &&
+ isPositionIndependent())
+ Kind = MCSymbolRefExpr::VK_PLT;
+ const MCSymbolRefExpr *TlsRef =
+ MCSymbolRefExpr::create(TlsGetAddr, Kind, OutContext);
+ const MachineOperand &MO = MI->getOperand(2); // the global, referenced with variant kind VK
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, VK, OutContext);
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(Subtarget->isPPC64() ?
+ PPC::BL8_NOP_TLS : PPC::BL_TLS)
+ .addExpr(TlsRef)
+ .addExpr(SymVar));
+}
+
+/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
+/// the current output stream.
+///
+void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ MCInst TmpInst;
+ bool isPPC64 = Subtarget->isPPC64();
+ bool isDarwin = TM.getTargetTriple().isOSDarwin();
+ const Module *M = MF->getFunction()->getParent();
+ PICLevel::Level PL = M->getPICLevel();
+
+ // Lower multi-instruction pseudo operations.
+ switch (MI->getOpcode()) {
+ default: break;
+ case TargetOpcode::DBG_VALUE:
+ llvm_unreachable("Should be handled target independently");
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(SM, *MI);
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(SM, *MI);
+
+ case PPC::MoveGOTtoLR: {
+ // Transform %LR = MoveGOTtoLR
+ // Into this: bl _GLOBAL_OFFSET_TABLE_@local-4
+ // _GLOBAL_OFFSET_TABLE_@local-4 (instruction preceding
+ // _GLOBAL_OFFSET_TABLE_) has exactly one instruction:
+ // blrl
+ // This will return the pointer to _GLOBAL_OFFSET_TABLE_@local
+ MCSymbol *GOTSymbol =
+ OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+ const MCExpr *OffsExpr =
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol,
+ MCSymbolRefExpr::VK_PPC_LOCAL,
+ OutContext),
+ MCConstantExpr::create(4, OutContext),
+ OutContext);
+
+ // Emit the 'bl'.
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL).addExpr(OffsExpr));
+ return;
+ }
+ case PPC::MovePCtoLR:
+ case PPC::MovePCtoLR8: {
+ // Transform %LR = MovePCtoLR
+ // Into this, where the label is the PIC base:
+ // bl L1$pb
+ // L1$pb:
+ MCSymbol *PICBase = MF->getPICBaseSymbol();
+
+ // Emit the 'bl'.
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(PPC::BL)
+ // FIXME: We would like an efficient form for this, so we
+ // don't have to do a lot of extra uniquing.
+ .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
+ // Emit the label.
+ OutStreamer->EmitLabel(PICBase);
+ return;
+ }
+ case PPC::UpdateGBR: {
+ // Transform %Rd = UpdateGBR(%Rt, %Ri)
+ // Into: lwz %Rt, .L0$poff - .L0$pb(%Ri)
+ // add %Rd, %Rt, %Ri
+ // Get the offset from the GOT Base Register to the GOT
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ MCSymbol *PICOffset =
+ MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
+ TmpInst.setOpcode(PPC::LWZ);
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
+ const MCExpr *PB =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
+ MCSymbolRefExpr::VK_None,
+ OutContext);
+ const MCOperand TR = TmpInst.getOperand(1);
+ const MCOperand PICR = TmpInst.getOperand(0);
+
+ // Step 1: lwz %Rt, .L$poff - .L$pb(%Ri)
+ TmpInst.getOperand(1) =
+ MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext));
+ TmpInst.getOperand(0) = TR;
+ TmpInst.getOperand(2) = PICR;
+ EmitToStreamer(*OutStreamer, TmpInst);
+
+ TmpInst.setOpcode(PPC::ADD4);
+ TmpInst.getOperand(0) = PICR;
+ TmpInst.getOperand(1) = TR;
+ TmpInst.getOperand(2) = PICR;
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case PPC::LWZtoc: {
+ // Transform %R3 = LWZtoc <ga:@min1>, %R2
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+ // Change the opcode to LWZ, and the global address operand to be a
+ // reference to the GOT entry we will synthesize later.
+ TmpInst.setOpcode(PPC::LWZ);
+ const MachineOperand &MO = MI->getOperand(1);
+
+ // Map symbol -> label of TOC entry
+ assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
+ MCSymbol *MOSymbol = nullptr;
+ if (MO.isGlobal())
+ MOSymbol = getSymbol(MO.getGlobal());
+ else if (MO.isCPI())
+ MOSymbol = GetCPISymbol(MO.getIndex());
+ else if (MO.isJTI())
+ MOSymbol = GetJTISymbol(MO.getIndex());
+ else if (MO.isBlockAddress())
+ MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+
+ if (PL == PICLevel::SmallPIC) {
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_GOT,
+ OutContext);
+ TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+ } else {
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None,
+ OutContext);
+ const MCExpr *PB =
+ MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
+ OutContext);
+ Exp = MCBinaryExpr::createSub(Exp, PB, OutContext);
+ TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+ }
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case PPC::LDtocJTI:
+ case PPC::LDtocCPT:
+ case PPC::LDtocBA:
+ case PPC::LDtoc: {
+ // Transform %X3 = LDtoc <ga:@min1>, %X2
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+ // Change the opcode to LD, and the global address operand to be a
+ // reference to the TOC entry we will synthesize later.
+ TmpInst.setOpcode(PPC::LD);
+ const MachineOperand &MO = MI->getOperand(1);
+
+ // Map symbol -> label of TOC entry
+ assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
+ MCSymbol *MOSymbol = nullptr;
+ if (MO.isGlobal())
+ MOSymbol = getSymbol(MO.getGlobal());
+ else if (MO.isCPI())
+ MOSymbol = GetCPISymbol(MO.getIndex());
+ else if (MO.isJTI())
+ MOSymbol = GetJTISymbol(MO.getIndex());
+ else if (MO.isBlockAddress())
+ MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
+ OutContext);
+ TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+
+ case PPC::ADDIStocHA: {
+ // Transform %Xd = ADDIStocHA %X2, <ga:@sym>
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+ // Change the opcode to ADDIS8. If the global address is external, has
+ // common linkage, is a non-local function address, or is a jump table
+ // address, then generate a TOC entry and reference that. Otherwise
+ // reference the symbol directly.
+ TmpInst.setOpcode(PPC::ADDIS8);
+ const MachineOperand &MO = MI->getOperand(2);
+ assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
+ MO.isBlockAddress()) &&
+ "Invalid operand for ADDIStocHA!");
+ MCSymbol *MOSymbol = nullptr;
+ bool GlobalToc = false;
+
+ if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ MOSymbol = getSymbol(GV);
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG);
+ } else if (MO.isCPI()) {
+ MOSymbol = GetCPISymbol(MO.getIndex());
+ } else if (MO.isJTI()) {
+ MOSymbol = GetJTISymbol(MO.getIndex());
+ } else if (MO.isBlockAddress()) {
+ MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+ }
+
+ if (GlobalToc || MO.isJTI() || MO.isBlockAddress() ||
+ TM.getCodeModel() == CodeModel::Large)
+ MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
+ OutContext);
+
+ if (!MO.isJTI() && MO.getOffset())
+ Exp = MCBinaryExpr::createAdd(Exp,
+ MCConstantExpr::create(MO.getOffset(),
+ OutContext),
+ OutContext);
+
+ TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case PPC::LDtocL: {
+ // Transform %Xd = LDtocL <ga:@sym>, %Xs
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+ // Change the opcode to LD. If the global address is external, has
+ // common linkage, or is a jump table address, then reference the
+ // associated TOC entry. Otherwise reference the symbol directly.
+ TmpInst.setOpcode(PPC::LD);
+ const MachineOperand &MO = MI->getOperand(1);
+ assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
+ MO.isBlockAddress()) &&
+ "Invalid operand for LDtocL!");
+ MCSymbol *MOSymbol = nullptr;
+
+ if (MO.isJTI())
+ MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
+ else if (MO.isBlockAddress()) {
+ MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+ MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+ }
+ else if (MO.isCPI()) {
+ MOSymbol = GetCPISymbol(MO.getIndex());
+ if (TM.getCodeModel() == CodeModel::Large)
+ MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+ }
+ else if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ MOSymbol = getSymbol(GV);
+ DEBUG(
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ assert((GVFlags & PPCII::MO_NLP_FLAG) &&
+ "LDtocL used on symbol that could be accessed directly is "
+ "invalid. Must match ADDIStocHA."));
+ MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+ }
+
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
+ OutContext);
+ TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case PPC::ADDItocL: {
+ // Transform %Xd = ADDItocL %Xs, <ga:@sym>
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+ // Change the opcode to ADDI8. If the global address is external, then
+ // generate a TOC entry and reference that. Otherwise reference the
+ // symbol directly.
+ TmpInst.setOpcode(PPC::ADDI8);
+ const MachineOperand &MO = MI->getOperand(2);
+ assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
+ MCSymbol *MOSymbol = nullptr;
+
+ if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ DEBUG(
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ assert (
+ !(GVFlags & PPCII::MO_NLP_FLAG) &&
+ "Interposable definitions must use indirect access."));
+ MOSymbol = getSymbol(GV);
+ } else if (MO.isCPI()) {
+ MOSymbol = GetCPISymbol(MO.getIndex());
+ }
+
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
+ OutContext);
+ TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case PPC::ADDISgotTprelHA: {
+ // Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym>
+ // Into: %Xd = ADDIS8 %X2, sym@got@tprel@ha
+ assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
+ const MachineOperand &MO = MI->getOperand(2);
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymGotTprel =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
+ OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(SymGotTprel));
+ return;
+ }
+ case PPC::LDgotTprelL:
+ case PPC::LDgotTprelL32: {
+ // Transform %Xd = LDgotTprelL <ga:@sym>, %Xs
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+ // Change the opcode to LD.
+ TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ);
+ const MachineOperand &MO = MI->getOperand(1);
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
+ OutContext);
+ TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+
+ case PPC::PPC32PICGOT: {
+ MCSymbol *GOTSymbol = OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+ MCSymbol *GOTRef = OutContext.createTempSymbol();
+ MCSymbol *NextInstr = OutContext.createTempSymbol();
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL)
+ // FIXME: We would like an efficient form for this, so we don't have to do
+ // a lot of extra uniquing.
+ .addExpr(MCSymbolRefExpr::create(NextInstr, OutContext)));
+ const MCExpr *OffsExpr =
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol, OutContext),
+ MCSymbolRefExpr::create(GOTRef, OutContext),
+ OutContext);
+ OutStreamer->EmitLabel(GOTRef);
+ OutStreamer->EmitValue(OffsExpr, 4);
+ OutStreamer->EmitLabel(NextInstr);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR)
+ .addReg(MI->getOperand(0).getReg()));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LWZ)
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(0)
+ .addReg(MI->getOperand(0).getReg()));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD4)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addReg(MI->getOperand(0).getReg()));
+ return;
+ }
+ case PPC::PPC32GOT: {
+ MCSymbol *GOTSymbol =
+ OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+ const MCExpr *SymGotTlsL = MCSymbolRefExpr::create(
+ GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, OutContext);
+ const MCExpr *SymGotTlsHA = MCSymbolRefExpr::create(
+ GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI)
+ .addReg(MI->getOperand(0).getReg())
+ .addExpr(SymGotTlsL));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(0).getReg())
+ .addExpr(SymGotTlsHA));
+ return;
+ }
+ case PPC::ADDIStlsgdHA: {
+ // Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym>
+ // Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
+ assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
+ const MachineOperand &MO = MI->getOperand(2);
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymGotTlsGD =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
+ OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(SymGotTlsGD));
+ return;
+ }
+ case PPC::ADDItlsgdL:
+ // Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym>
+ // Into: %Xd = ADDI8 %Xs, sym@got@tlsgd@l
+ case PPC::ADDItlsgdL32: {
+ // Transform: %Rd = ADDItlsgdL32 %Rs, <ga:@sym>
+ // Into: %Rd = ADDI %Rs, sym@got@tlsgd
+ const MachineOperand &MO = MI->getOperand(2);
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymGotTlsGD = MCSymbolRefExpr::create(
+ MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO
+ : MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
+ OutContext);
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(SymGotTlsGD));
+ return;
+ }
+ case PPC::GETtlsADDR:
+ // Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
+ // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd)
+ case PPC::GETtlsADDR32: {
+ // Transform: %R3 = GETtlsADDR32 %R3, <ga:@sym>
+ // Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT
+ EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSGD);
+ return;
+ }
+ case PPC::ADDIStlsldHA: {
+ // Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym>
+ // Into: %Xd = ADDIS8 %X2, sym@got@tlsld@ha
+ assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
+ const MachineOperand &MO = MI->getOperand(2);
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymGotTlsLD =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
+ OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(SymGotTlsLD));
+ return;
+ }
+ case PPC::ADDItlsldL:
+ // Transform: %Xd = ADDItlsldL %Xs, <ga:@sym>
+ // Into: %Xd = ADDI8 %Xs, sym@got@tlsld@l
+ case PPC::ADDItlsldL32: {
+ // Transform: %Rd = ADDItlsldL32 %Rs, <ga:@sym>
+ // Into: %Rd = ADDI %Rs, sym@got@tlsld
+ const MachineOperand &MO = MI->getOperand(2);
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymGotTlsLD = MCSymbolRefExpr::create(
+ MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO
+ : MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
+ OutContext);
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(SymGotTlsLD));
+ return;
+ }
+ case PPC::GETtlsldADDR:
+ // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
+ // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld)
+ case PPC::GETtlsldADDR32: {
+ // Transform: %R3 = GETtlsldADDR32 %R3, <ga:@sym>
+ // Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT
+ EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSLD);
+ return;
+ }
+ case PPC::ADDISdtprelHA:
+ // Transform: %Xd = ADDISdtprelHA %Xs, <ga:@sym>
+ // Into: %Xd = ADDIS8 %Xs, sym@dtprel@ha
+ case PPC::ADDISdtprelHA32: {
+ // Transform: %Rd = ADDISdtprelHA32 %Rs, <ga:@sym>
+ // Into: %Rd = ADDIS %Rs, sym@dtprel@ha
+ const MachineOperand &MO = MI->getOperand(2);
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymDtprel =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
+ OutContext);
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDIS8 : PPC::ADDIS)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(SymDtprel));
+ return;
+ }
+ case PPC::ADDIdtprelL:
+ // Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym>
+ // Into: %Xd = ADDI8 %Xs, sym@dtprel@l
+ case PPC::ADDIdtprelL32: {
+ // Transform: %Rd = ADDIdtprelL32 %Rs, <ga:@sym>
+ // Into: %Rd = ADDI %Rs, sym@dtprel@l
+ const MachineOperand &MO = MI->getOperand(2);
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymDtprel =
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
+ OutContext);
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(SymDtprel));
+ return;
+ }
+ case PPC::MFOCRF:
+ case PPC::MFOCRF8:
+ if (!Subtarget->hasMFOCRF()) {
+ // Transform: %R3 = MFOCRF %CR7
+ // Into: %R3 = MFCR ;; cr7
+ unsigned NewOpcode =
+ MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8;
+ OutStreamer->AddComment(PPCInstPrinter::
+ getRegisterName(MI->getOperand(1).getReg()));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(NewOpcode)
+ .addReg(MI->getOperand(0).getReg()));
+ return;
+ }
+ break;
+ case PPC::MTOCRF:
+ case PPC::MTOCRF8:
+ if (!Subtarget->hasMFOCRF()) {
+ // Transform: %CR7 = MTOCRF %R3
+ // Into: MTCRF mask, %R3 ;; cr7
+ unsigned NewOpcode =
+ MI->getOpcode() == PPC::MTOCRF ? PPC::MTCRF : PPC::MTCRF8;
+ unsigned Mask = 0x80 >> OutContext.getRegisterInfo()
+ ->getEncodingValue(MI->getOperand(0).getReg());
+ OutStreamer->AddComment(PPCInstPrinter::
+ getRegisterName(MI->getOperand(0).getReg()));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(NewOpcode)
+ .addImm(Mask)
+ .addReg(MI->getOperand(1).getReg()));
+ return;
+ }
+ break;
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::LWA_32:
+ case PPC::LWA: {
+ // Verify alignment is legal, so we don't create relocations
+ // that can't be supported.
+ // FIXME: This test is currently disabled for Darwin. The test
+ // suite shows a handful of test cases that fail this check for
+ // Darwin. Those need to be investigated before this sanity test
+ // can be enabled for those subtargets.
+ if (!Subtarget->isDarwin()) {
+ unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+ llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+ }
+ // Now process the instruction normally.
+ break;
+ }
+ }
+
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  const PPCTargetMachine &PTM = static_cast<const PPCTargetMachine &>(TM);
+
+  // For the ELFv2 ABI, have the target streamer (when one is attached) emit
+  // ABI version 2.
+  if (PTM.isELFv2ABI()) {
+    PPCTargetStreamer *TargetS =
+        static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+    if (TargetS != nullptr)
+      TargetS->emitAbiVersion(2);
+  }
+
+  // The .got2/.LTOC setup below is only performed for 32-bit
+  // position-independent code whose PIC level is not SmallPIC. Every other
+  // configuration just runs the generic start-of-file handling.
+  const bool NeedsGot2Setup = !PTM.isPPC64() && isPositionIndependent() &&
+                              M.getPICLevel() != PICLevel::SmallPIC;
+  if (!NeedsGot2Setup) {
+    AsmPrinter::EmitStartOfAsmFile(M);
+    return;
+  }
+
+  MCSectionELF *Got2 = OutContext.getELFSection(
+      ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  OutStreamer->SwitchSection(Got2);
+
+  MCSymbol *TOCSym = OutContext.getOrCreateSymbol(Twine(".LTOC"));
+  MCSymbol *Here = OutContext.createTempSymbol();
+  OutStreamer->EmitLabel(Here);
+
+  // The GOT pointer points to the middle of the GOT, in order to reference the
+  // entire 64kB range. 0x8000 is the midpoint, so .LTOC = <here> + 0x8000.
+  const MCExpr *HereRef = MCSymbolRefExpr::create(Here, OutContext);
+  const MCExpr *Midpoint = MCConstantExpr::create(0x8000, OutContext);
+  OutStreamer->EmitAssignment(
+      TOCSym, MCBinaryExpr::createAdd(HereRef, Midpoint, OutContext));
+
+  // Leave the streamer positioned in the text section.
+  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+}
+
+void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
+  if (!Subtarget->isPPC64()) {
+    // linux/ppc32 - Normal entry label, unless this is (non-small) PIC code
+    // in a function that uses the PIC base.
+    if (!isPositionIndependent() ||
+        MF->getFunction()->getParent()->getPICLevel() == PICLevel::SmallPIC) {
+      AsmPrinter::EmitFunctionEntryLabel();
+      return;
+    }
+
+    const PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+    if (!FuncInfo->usesPICBase()) {
+      AsmPrinter::EmitFunctionEntryLabel();
+      return;
+    }
+
+    // Emit a 4-byte word holding (.LTOC - PIC base), labelled with the PIC
+    // offset symbol, directly in front of the function symbol.
+    MCSymbol *OffsetLabel = FuncInfo->getPICOffsetSymbol();
+    MCSymbol *PICBaseSym = MF->getPICBaseSymbol();
+    OutStreamer->EmitLabel(OffsetLabel);
+
+    MCSymbol *LTOCSym = OutContext.getOrCreateSymbol(Twine(".LTOC"));
+    const MCExpr *LTOCMinusBase = MCBinaryExpr::createSub(
+        MCSymbolRefExpr::create(LTOCSym, OutContext),
+        MCSymbolRefExpr::create(PICBaseSym, OutContext), OutContext);
+    OutStreamer->EmitValue(LTOCMinusBase, 4);
+    OutStreamer->EmitLabel(CurrentFnSym);
+    return;
+  }
+
+  // ELFv2 ABI - Normal entry label.
+  if (Subtarget->isELFv2ABI()) {
+    // In the Large code model, we allow arbitrary displacements between
+    // the text section and its associated TOC section. We place the
+    // full 8-byte offset to the TOC in memory immediately preceding
+    // the function global entry point. This is only done when the function
+    // actually uses X2.
+    const bool EmitTOCDelta = TM.getCodeModel() == CodeModel::Large &&
+                              !MF->getRegInfo().use_empty(PPC::X2);
+    if (EmitTOCDelta) {
+      const PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+
+      MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef(".TOC."));
+      MCSymbol *GlobalEntrySym = FuncInfo->getGlobalEPSymbol();
+      const MCExpr *TOCMinusEntry = MCBinaryExpr::createSub(
+          MCSymbolRefExpr::create(TOCBaseSym, OutContext),
+          MCSymbolRefExpr::create(GlobalEntrySym, OutContext), OutContext);
+
+      OutStreamer->EmitLabel(FuncInfo->getTOCOffsetSymbol());
+      OutStreamer->EmitValue(TOCMinusEntry, 8);
+    }
+    AsmPrinter::EmitFunctionEntryLabel();
+    return;
+  }
+
+  // Emit an official procedure descriptor into .opd, then switch back to the
+  // section that was current on entry.
+  MCSectionSubPair SavedSection = OutStreamer->getCurrentSection();
+  MCSectionELF *OpdSection = OutStreamer->getContext().getELFSection(
+      ".opd", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  OutStreamer->SwitchSection(OpdSection);
+  OutStreamer->EmitLabel(CurrentFnSym);
+  OutStreamer->EmitValueToAlignment(8);
+
+  // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
+  // entry point.
+  MCSymbol *EntrySym = CurrentFnSymForSize;
+  OutStreamer->EmitValue(MCSymbolRefExpr::create(EntrySym, OutContext),
+                         8 /*size*/);
+
+  // Generates a R_PPC64_TOC relocation for TOC base insertion.
+  MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef(".TOC."));
+  const MCExpr *TOCBaseRef = MCSymbolRefExpr::create(
+      TOCBaseSym, MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext);
+  OutStreamer->EmitValue(TOCBaseRef, 8 /*size*/);
+
+  // Emit a null environment pointer.
+  OutStreamer->EmitIntValue(0, 8 /* size */);
+  OutStreamer->SwitchSection(SavedSection.first, SavedSection.second);
+}
+
+/// Emit the accumulated TOC entries (if any) and then run the generic
+/// end-of-module handling.
+///
+/// On 64-bit targets the entries go into ".toc" and are emitted through the
+/// target streamer's emitTCEntry(); on 32-bit targets they go into ".got2" as
+/// 4-byte-aligned 4-byte symbol values. Returns whatever
+/// AsmPrinter::doFinalization() returns.
+bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
+  const DataLayout &DL = getDataLayout();
+
+  const bool isPPC64 = DL.getPointerSizeInBits() == 64;
+
+  if (!TOC.empty()) {
+    // Other emitters in this printer treat the target streamer as optional
+    // (they null-check it), so fetch it as a pointer and only where it can be
+    // needed, instead of unconditionally binding a reference to a possibly
+    // null pointee.
+    PPCTargetStreamer *TS =
+        static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+    MCSectionELF *Section = OutStreamer->getContext().getELFSection(
+        isPPC64 ? ".toc" : ".got2", ELF::SHT_PROGBITS,
+        ELF::SHF_WRITE | ELF::SHF_ALLOC);
+    OutStreamer->SwitchSection(Section);
+
+    // Each map entry is (referenced symbol -> label of its TOC entry).
+    for (const auto &Entry : TOC) {
+      OutStreamer->EmitLabel(Entry.second);
+      MCSymbol *S = Entry.first;
+      if (isPPC64) {
+        if (TS)
+          TS->emitTCEntry(*S);
+      } else {
+        OutStreamer->EmitValueToAlignment(4);
+        OutStreamer->EmitSymbolValue(S, 4);
+      }
+    }
+  }
+
+  return AsmPrinter::doFinalization(M);
+}
+
+/// EmitFunctionBodyStart - Emit a global entry point prefix for ELFv2.
+void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
+  // In the ELFv2 ABI, in functions that use the TOC register, we need to
+  // provide two entry points. The ABI guarantees that when calling the
+  // local entry point, r2 is set up by the caller to contain the TOC base
+  // for this function, and when calling the global entry point, r12 is set
+  // up by the caller to hold the address of the global entry point. We
+  // thus emit a prefix sequence along the following lines:
+  //
+  // func:
+  // .Lfunc_gepNN:
+  //         # global entry point
+  //         addis r2,r12,(.TOC.-.Lfunc_gepNN)@ha
+  //         addi  r2,r2,(.TOC.-.Lfunc_gepNN)@l
+  // .Lfunc_lepNN:
+  //         .localentry func, .Lfunc_lepNN-.Lfunc_gepNN
+  //         # local entry point, followed by function body
+  //
+  // For the Large code model, we create
+  //
+  // .Lfunc_tocNN:
+  //         .quad .TOC.-.Lfunc_gepNN      # done by EmitFunctionEntryLabel
+  // func:
+  // .Lfunc_gepNN:
+  //         # global entry point
+  //         ld    r2,.Lfunc_tocNN-.Lfunc_gepNN(r12)
+  //         add   r2,r2,r12
+  // .Lfunc_lepNN:
+  //         .localentry func, .Lfunc_lepNN-.Lfunc_gepNN
+  //         # local entry point, followed by function body
+  //
+  // This ensures we have r2 set up correctly while executing the function
+  // body, no matter which entry point is called.
+  if (!Subtarget->isELFv2ABI())
+    return;
+
+  // Only do all that if the function uses r2 in the first place.
+  if (MF->getRegInfo().use_empty(PPC::X2))
+    return;
+
+  // Note: The logic here must be synchronized with the code in the
+  // branch-selection pass which sets the offset of the first block in the
+  // function. This matters because it affects the alignment.
+  const PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+
+  MCSymbol *GEPLabel = FuncInfo->getGlobalEPSymbol();
+  OutStreamer->EmitLabel(GEPLabel);
+  const MCSymbolRefExpr *GEPRef =
+      MCSymbolRefExpr::create(GEPLabel, OutContext);
+
+  if (TM.getCodeModel() == CodeModel::Large) {
+    // Load the 8-byte TOC delta stored in front of the function, addressed
+    // relative to r12, then add r12 to it.
+    MCSymbol *TOCOffsetSym = FuncInfo->getTOCOffsetSymbol();
+    const MCExpr *OffsetFromGEP = MCBinaryExpr::createSub(
+        MCSymbolRefExpr::create(TOCOffsetSym, OutContext), GEPRef, OutContext);
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
+                                     .addReg(PPC::X2)
+                                     .addExpr(OffsetFromGEP)
+                                     .addReg(PPC::X12));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD8)
+                                     .addReg(PPC::X2)
+                                     .addReg(PPC::X2)
+                                     .addReg(PPC::X12));
+  } else {
+    // r2 = r12 + (.TOC. - global entry), split into @ha and @l halves.
+    MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef(".TOC."));
+    const MCExpr *TOCMinusGEP = MCBinaryExpr::createSub(
+        MCSymbolRefExpr::create(TOCBaseSym, OutContext), GEPRef, OutContext);
+
+    const MCExpr *DeltaHa = PPCMCExpr::createHa(TOCMinusGEP, false, OutContext);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
+                                     .addReg(PPC::X2)
+                                     .addReg(PPC::X12)
+                                     .addExpr(DeltaHa));
+
+    const MCExpr *DeltaLo = PPCMCExpr::createLo(TOCMinusGEP, false, OutContext);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
+                                     .addReg(PPC::X2)
+                                     .addReg(PPC::X2)
+                                     .addExpr(DeltaLo));
+  }
+
+  // Mark the local entry point and describe its distance from the global
+  // entry point to the target streamer (when one is attached).
+  MCSymbol *LEPLabel = FuncInfo->getLocalEPSymbol();
+  OutStreamer->EmitLabel(LEPLabel);
+  const MCSymbolRefExpr *LEPRef =
+      MCSymbolRefExpr::create(LEPLabel, OutContext);
+  const MCExpr *LEPMinusGEP =
+      MCBinaryExpr::createSub(LEPRef, GEPRef, OutContext);
+
+  PPCTargetStreamer *TargetS =
+      static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+  if (TargetS != nullptr)
+    TargetS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), LEPMinusGEP);
+}
+
+/// EmitFunctionBodyEnd - Print the traceback table before the .size
+/// directive.
+///
+void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
+  // Only the 64-bit target requires a traceback table.
+  if (!Subtarget->isPPC64())
+    return;
+
+  // For now, we only emit the word of zeroes that GDB requires to find
+  // the end of the function, and zeroes for the eight-byte
+  // mandatory fields.
+  // FIXME: We should fill in the eight-byte mandatory fields as described in
+  // the PPC64 ELF ABI (this is a low-priority item because GDB does not
+  // currently make use of these fields).
+  OutStreamer->EmitIntValue(0, 4 /*size*/);
+  OutStreamer->EmitIntValue(0, 8 /*size*/);
+}
+
+/// Emit the Darwin start-of-file preamble: a machine directive chosen from
+/// the subtargets of all functions in \p M, followed by "priming" section
+/// switches so that the text-like sections end up adjacent.
+void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  // Indexed by the Darwin directive value computed below.
+  static const char *const CPUDirectives[] = {
+    "",
+    "ppc",
+    "ppc440",
+    "ppc601",
+    "ppc602",
+    "ppc603",
+    "ppc7400",
+    "ppc750",
+    "ppc970",
+    "ppcA2",
+    "ppce500mc",
+    "ppce5500",
+    "power3",
+    "power4",
+    "power5",
+    "power5x",
+    "power6",
+    "power6x",
+    "power7",
+    // FIXME: why is power8 missing here?
+    "ppc64",
+    "ppc64le",
+    "power9"
+  };
+
+  // Get the numerically largest directive.
+  // FIXME: How should we merge darwin directives?
+  unsigned Directive = PPC::DIR_NONE;
+  for (const Function &F : M) {
+    const PPCSubtarget &STI = TM.getSubtarget<PPCSubtarget>(F);
+    const unsigned FDir = STI.getDarwinDirective();
+    // Keep the running maximum. The previous ternary selected FDir in both
+    // of its arms, so the value accumulated from earlier functions was simply
+    // overwritten by each later function.
+    if (FDir > Directive)
+      Directive = FDir;
+    // Raise the directive to the minimum level implied by subtarget features.
+    if (STI.hasMFOCRF() && Directive < PPC::DIR_970)
+      Directive = PPC::DIR_970;
+    if (STI.hasAltivec() && Directive < PPC::DIR_7400)
+      Directive = PPC::DIR_7400;
+    if (STI.isPPC64() && Directive < PPC::DIR_64)
+      Directive = PPC::DIR_64;
+  }
+
+  assert(Directive <= PPC::DIR_64 && "Directive out of range.");
+
+  assert(Directive < array_lengthof(CPUDirectives) &&
+         "CPUDirectives[] might not be up-to-date!");
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+  TStreamer.emitMachine(CPUDirectives[Directive]);
+
+  // Prime text sections so they are adjacent. This reduces the likelihood a
+  // large data or debug section causes a branch to exceed 16M limit.
+  const TargetLoweringObjectFileMachO &TLOFMacho =
+      static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+  OutStreamer->SwitchSection(TLOFMacho.getTextCoalSection());
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    OutStreamer->SwitchSection(
+        OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
+                                   MachO::S_SYMBOL_STUBS |
+                                   MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                   32, SectionKind::getText()));
+  } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
+    OutStreamer->SwitchSection(
+        OutContext.getMachOSection("__TEXT","__symbol_stub1",
+                                   MachO::S_SYMBOL_STUBS |
+                                   MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                   16, SectionKind::getText()));
+  }
+  // Finish in the regular text section.
+  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+}
+
+bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
+  const bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64;
+  // Size in bytes of one non-lazy pointer slot.
+  const unsigned PtrBytes = isPPC64 ? 8 : 4;
+
+  // Darwin/PPC always uses mach-o.
+  const TargetLoweringObjectFileMachO &TLOFMacho =
+      static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+  if (MMI) {
+    MachineModuleInfoMachO &MachOInfo =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    if (MAI->doesSupportExceptionHandling()) {
+      // Add the (possibly multiple) personalities to the set of global values.
+      // Only referenced functions get into the Personalities list.
+      for (const Function *Personality : MMI->getPersonalities()) {
+        if (!Personality)
+          continue;
+        MCSymbol *NLPSym =
+            getSymbolWithGlobalValueBase(Personality, "$non_lazy_ptr");
+        MachineModuleInfoImpl::StubValueTy &Slot =
+            MachOInfo.getGVStubEntry(NLPSym);
+        Slot = MachineModuleInfoImpl::StubValueTy(getSymbol(Personality), true);
+      }
+    }
+
+    // Output stubs for dynamically-linked functions.
+    MachineModuleInfoMachO::SymbolListTy Stubs = MachOInfo.GetGVStubList();
+
+    // Output macho stubs for external and common global variables.
+    if (!Stubs.empty()) {
+      // Switch with ".non_lazy_symbol_pointer" directive.
+      OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+      EmitAlignment(isPPC64 ? 3 : 2);
+
+      for (auto &Stub : Stubs) {
+        // L_foo$stub:
+        OutStreamer->EmitLabel(Stub.first);
+        // .indirect_symbol _foo
+        MachineModuleInfoImpl::StubValueTy &Target = Stub.second;
+        OutStreamer->EmitSymbolAttribute(Target.getPointer(),
+                                         MCSA_IndirectSymbol);
+
+        if (Target.getInt()) {
+          // External to current translation unit.
+          OutStreamer->EmitIntValue(0, PtrBytes /*size*/);
+        } else {
+          // Internal to current translation unit.
+          //
+          // When we place the LSDA into the TEXT section, the type info
+          // pointers need to be indirect and pc-rel. We accomplish this by
+          // using NLPs. However, sometimes the types are local to the file.
+          // So we need to fill in the value for the NLP in those cases.
+          OutStreamer->EmitValue(
+              MCSymbolRefExpr::create(Target.getPointer(), OutContext),
+              PtrBytes /*size*/);
+        }
+      }
+
+      Stubs.clear();
+      OutStreamer->AddBlankLine();
+    }
+  }
+
+  // Funny Darwin hack: This flag tells the linker that no global symbols
+  // contain code that falls through to other global symbols (e.g. the obvious
+  // implementation of multiple entry points). If this doesn't occur, the
+  // linker can safely perform dead code stripping. Since LLVM never generates
+  // code that does this, it is always safe to set.
+  OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+
+  return AsmPrinter::doFinalization(M);
+}
+
+/// createPPCAsmPrinterPass - Build the PPC assembly printer for the given
+/// target machine: the Darwin flavour when the target triple is Mac OS X, the
+/// Linux flavour otherwise. The printer takes over \p Streamer, and the
+/// caller receives the newly allocated printer.
+///
+static AsmPrinter *
+createPPCAsmPrinterPass(TargetMachine &tm,
+                        std::unique_ptr<MCStreamer> &&Streamer) {
+  AsmPrinter *Printer = nullptr;
+  if (tm.getTargetTriple().isMacOSX())
+    Printer = new PPCDarwinAsmPrinter(tm, std::move(Streamer));
+  else
+    Printer = new PPCLinuxAsmPrinter(tm, std::move(Streamer));
+  return Printer;
+}
+
+// Force static initialization: register the PPC assembly-printer factory for
+// the 32-bit, 64-bit and 64-bit little-endian PowerPC targets, in that order.
+extern "C" void LLVMInitializePowerPCAsmPrinter() {
+  Target *const PPCTargets[] = {&getThePPC32Target(), &getThePPC64Target(),
+                                &getThePPC64LETarget()};
+  for (Target *T : PPCTargets)
+    TargetRegistry::RegisterAsmPrinter(*T, createPPCAsmPrinterPass);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
new file mode 100644
index 000000000000..93c201d03869
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -0,0 +1,274 @@
+//===- PPCBoolRetToInt.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements converting i1 values to i32 if they could be more
+// profitably allocated as GPRs rather than CRs. This pass will become totally
+// unnecessary if Register Bank Allocation and Global Instruction Selection ever
+// go upstream.
+//
+// Presently, the pass converts i1 Constants, and Arguments to i32 if the
+// transitive closure of their uses includes only PHINodes, CallInsts, and
+// ReturnInsts. The rationale is that arguments are generally passed and returned
+// in GPRs rather than CRs, so casting them to i32 at the LLVM IR level will
+// actually save casts at the Machine Instruction level.
+//
+// It might be useful to expand this pass to add bit-wise operations to the list
+// of safe transitive closure types. Also, we miss some opportunities when LLVM
+// represents logical AND and OR operations with control flow rather than data
+// flow. For example by lowering the expression: return (A && B && C)
+//
+// as: return A ? true : B && C.
+//
+// There's code in SimplifyCFG that could be used to turn control flow into data
+// flow using SelectInsts. Selects are slow on some architectures (P7/P8), so
+// this probably isn't good in general, but for the special case of i1, the
+// Selects could be further lowered to bit operations that are fast everywhere.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Pass.h"
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "bool-ret-to-int"
+
+STATISTIC(NumBoolRetPromotion,
+ "Number of times a bool feeding a RetInst was promoted to an int");
+STATISTIC(NumBoolCallPromotion,
+ "Number of times a bool feeding a CallInst was promoted to an int");
+STATISTIC(NumBoolToIntPromotion,
+ "Total number of times a bool was promoted to an int");
+
+class PPCBoolRetToInt : public FunctionPass {
+  /// Collect V together with the values it transitively depends on through
+  /// operands. Every member of the returned set has to be translated to i32
+  /// for V itself to be promoted.
+  static SmallPtrSet<Value *, 8> findAllDefs(Value *V) {
+    SmallPtrSet<Value *, 8> Defs;
+    SmallVector<Value *, 8> WorkList;
+    WorkList.push_back(V);
+    Defs.insert(V);
+    while (!WorkList.empty()) {
+      Value *Curr = WorkList.back();
+      WorkList.pop_back();
+      auto *CurrUser = dyn_cast<User>(Curr);
+      // Operands of CallInst/Constant are skipped because they may not be Bool
+      // type. For CallInst, their positions are defined by ABI. A Constant
+      // (for example an i1 constant expression) is zero-extended as a whole
+      // by translate(), so its operands must not be visited individually.
+      if (CurrUser && !isa<CallInst>(Curr) && !isa<Constant>(Curr))
+        for (auto &Op : CurrUser->operands())
+          if (Defs.insert(Op).second)
+            WorkList.push_back(Op);
+    }
+    return Defs;
+  }
+
+  // Translate a i1 value to an equivalent i32 value:
+  //  - a Constant becomes a zext constant expression;
+  //  - a PHINode becomes a new i32 PHINode created before it, with placeholder
+  //    zero incoming values;
+  //  - an Argument or any other Instruction gets a ZExtInst, inserted at the
+  //    start of the entry block or right after the instruction respectively.
+  static Value *translate(Value *V) {
+    Type *Int32Ty = Type::getInt32Ty(V->getContext());
+    if (auto *C = dyn_cast<Constant>(V))
+      return ConstantExpr::getZExt(C, Int32Ty);
+    if (auto *P = dyn_cast<PHINode>(V)) {
+      // Temporarily set the operands to 0. We'll fix this later in
+      // runOnUse.
+      Value *Zero = Constant::getNullValue(Int32Ty);
+      PHINode *Q =
+          PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P);
+      for (unsigned i = 0; i < P->getNumOperands(); ++i)
+        Q->addIncoming(Zero, P->getIncomingBlock(i));
+      return Q;
+    }
+
+    auto *A = dyn_cast<Argument>(V);
+    auto *I = dyn_cast<Instruction>(V);
+    assert((A || I) && "Unknown value type");
+
+    auto InstPt =
+        A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode();
+    return new ZExtInst(V, Int32Ty, "", InstPt);
+  }
+
+  typedef SmallPtrSet<const PHINode *, 8> PHINodeSet;
+
+  // A PHINode is Promotable if:
+  // 1. Its type is i1 AND
+  // 2. All of its uses are ReturnInst, CallInst, PHINode, or DbgInfoIntrinsic
+  // AND
+  // 3. All of its operands are Constant or Argument or
+  // CallInst or PHINode AND
+  // 4. All of its PHINode uses are Promotable AND
+  // 5. All of its PHINode operands are Promotable
+  static PHINodeSet getPromotablePHINodes(const Function &F) {
+    PHINodeSet Promotable;
+    // Condition 1
+    for (auto &BB : F)
+      for (auto &I : BB)
+        if (const auto *P = dyn_cast<PHINode>(&I))
+          if (P->getType()->isIntegerTy(1))
+            Promotable.insert(P);
+
+    SmallVector<const PHINode *, 8> ToRemove;
+    for (const PHINode *P : Promotable) {
+      // Condition 2 and 3
+      auto IsValidUser = [] (const Value *V) -> bool {
+        return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) ||
+               isa<DbgInfoIntrinsic>(V);
+      };
+      auto IsValidOperand = [] (const Value *V) -> bool {
+        return isa<Constant>(V) || isa<Argument>(V) || isa<CallInst>(V) ||
+               isa<PHINode>(V);
+      };
+      const auto &Users = P->users();
+      const auto &Operands = P->operands();
+      if (!llvm::all_of(Users, IsValidUser) ||
+          !llvm::all_of(Operands, IsValidOperand))
+        ToRemove.push_back(P);
+    }
+
+    // Iterate to convergence: removing one PHINode may disqualify the
+    // PHINodes that use it or feed it.
+    auto IsPromotable = [&Promotable] (const Value *V) -> bool {
+      const auto *Phi = dyn_cast<PHINode>(V);
+      return !Phi || Promotable.count(Phi);
+    };
+    while (!ToRemove.empty()) {
+      for (auto &User : ToRemove)
+        Promotable.erase(User);
+      ToRemove.clear();
+
+      for (const PHINode *P : Promotable) {
+        // Condition 4 and 5
+        const auto &Users = P->users();
+        const auto &Operands = P->operands();
+        if (!llvm::all_of(Users, IsPromotable) ||
+            !llvm::all_of(Operands, IsPromotable))
+          ToRemove.push_back(P);
+      }
+    }
+
+    return Promotable;
+  }
+
+  typedef DenseMap<Value *, Value *> B2IMap;
+
+ public:
+  static char ID;
+
+  PPCBoolRetToInt() : FunctionPass(ID) {
+    initializePPCBoolRetToIntPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Try to promote the i1 operand of every return (when the function returns
+  /// i1) and every i1 operand of every call. Returns true if any use was
+  /// rewritten.
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    PHINodeSet PromotablePHINodes = getPromotablePHINodes(F);
+    B2IMap Bool2IntMap;
+    bool Changed = false;
+    for (auto &BB : F) {
+      for (auto &I : BB) {
+        if (auto *R = dyn_cast<ReturnInst>(&I))
+          if (F.getReturnType()->isIntegerTy(1))
+            Changed |=
+                runOnUse(R->getOperandUse(0), PromotablePHINodes, Bool2IntMap);
+
+        if (auto *CI = dyn_cast<CallInst>(&I))
+          for (auto &U : CI->operands())
+            if (U->getType()->isIntegerTy(1))
+              Changed |= runOnUse(U, PromotablePHINodes, Bool2IntMap);
+      }
+    }
+
+    return Changed;
+  }
+
+  /// Promote the i1 value feeding U to i32 if all of its definitions can be
+  /// handled, then truncate back to i1 right before the user. BoolToIntMap
+  /// caches translations across calls. Returns true if U was rewritten.
+  static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes,
+                       B2IMap &BoolToIntMap) {
+    auto Defs = findAllDefs(U);
+
+    // If the values are all Constants or Arguments, don't bother
+    if (llvm::none_of(Defs, isa<Instruction, Value *>))
+      return false;
+
+    // Presently, we only know how to handle PHINode, Constant, Arguments and
+    // CallInst. Potentially, bitwise operations (AND, OR, XOR, NOT) and sign
+    // extension could also be handled in the future.
+    for (Value *V : Defs)
+      if (!isa<PHINode>(V) && !isa<Constant>(V) &&
+          !isa<Argument>(V) && !isa<CallInst>(V))
+        return false;
+
+    for (Value *V : Defs)
+      if (const auto *P = dyn_cast<PHINode>(V))
+        if (!PromotablePHINodes.count(P))
+          return false;
+
+    if (isa<ReturnInst>(U.getUser()))
+      ++NumBoolRetPromotion;
+    if (isa<CallInst>(U.getUser()))
+      ++NumBoolCallPromotion;
+    ++NumBoolToIntPromotion;
+
+    for (Value *V : Defs)
+      if (!BoolToIntMap.count(V))
+        BoolToIntMap[V] = translate(V);
+
+    // Replace the operands of the translated instructions. They were set to
+    // zero in the translate function.
+    for (auto &Pair : BoolToIntMap) {
+      auto *First = dyn_cast<User>(Pair.first);
+      auto *Second = dyn_cast<User>(Pair.second);
+      assert((!First || Second) && "translated from user to non-user!?");
+      // Operands of CallInst/Constant are skipped because they may not be Bool
+      // type. For CallInst, their positions are defined by ABI. Constants were
+      // translated as a whole, so there is nothing to patch in them.
+      if (First && !isa<CallInst>(First) && !isa<Constant>(First))
+        for (unsigned i = 0; i < First->getNumOperands(); ++i)
+          Second->setOperand(i, BoolToIntMap[First->getOperand(i)]);
+    }
+
+    Value *IntRetVal = BoolToIntMap[U];
+    Type *Int1Ty = Type::getInt1Ty(U->getContext());
+    auto *I = cast<Instruction>(U.getUser());
+    Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I);
+    U.set(BackToBool);
+
+    return true;
+  }
+
+  // The pass only adds PHINodes and casts to existing blocks, so the
+  // dominator tree is declared preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char PPCBoolRetToInt::ID = 0;
+INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int",
+ "Convert i1 constants to i32 if they are returned",
+ false, false)
+
+/// Returns a newly allocated instance of the bool-ret-to-int pass.
+FunctionPass *llvm::createPPCBoolRetToIntPass() {
+  return new PPCBoolRetToInt();
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
new file mode 100644
index 000000000000..ae76386fdfb6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -0,0 +1,283 @@
+//===-- PPCBranchSelector.cpp - Emit long conditional branches ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 16 bits of displacement to reach their
+// target basic block. It does this in two passes; a calculation of basic block
+// positions pass, and a branch pseudo op to machine branch opcode pass. This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-branch-select"
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace llvm {
+ void initializePPCBSelPass(PassRegistry&);
+}
+
+namespace {
+ struct PPCBSel : public MachineFunctionPass {
+ static char ID; // Pass identification.
+ PPCBSel() : MachineFunctionPass(ID) {
+ initializePPCBSelPass(*PassRegistry::getPassRegistry());
+ }
+
+ // The sizes of the basic blocks in the function, indexed by block number
+ // (the first element of the pair); the second element of the pair is the
+ // amount of the size that is due to potential padding.
+ std::vector<std::pair<unsigned, unsigned>> BlockSizes;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs); // Requires NoVRegs.
+ }
+
+ StringRef getPassName() const override { return "PowerPC Branch Selector"; }
+ };
+ char PPCBSel::ID = 0;
+}
+
+INITIALIZE_PASS(PPCBSel, "ppc-branch-select", "PowerPC Branch Selector",
+ false, false)
+
+/// createPPCBranchSelectionPass - returns a newly allocated instance of the
+/// Branch Selection Pass (PPCBSel).
+///
+FunctionPass *llvm::createPPCBranchSelectionPass() {
+ return new PPCBSel();
+}
+
+/// Measure every basic block, then repeatedly rewrite each conditional branch
+/// whose destination does not fit in a signed 16-bit displacement into an
+/// inverted conditional branch over an unconditional branch, until no more
+/// branches need expanding.
+///
+/// \returns true if at least one branch was expanded, false otherwise.
+bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
+  const PPCInstrInfo *TII =
+      static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo());
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+
+  // Number of padding bytes that may precede MBB when it starts at Offset.
+  auto GetAlignmentAdjustment =
+      [](MachineBasicBlock &MBB, unsigned Offset) -> unsigned {
+    unsigned Align = MBB.getAlignment();
+    if (!Align)
+      return 0;
+
+    unsigned AlignAmt = 1 << Align;
+    unsigned ParentAlign = MBB.getParent()->getAlignment();
+
+    if (Align <= ParentAlign)
+      return OffsetToAlignment(Offset, AlignAmt);
+
+    // The alignment of this MBB is larger than the function's alignment, so we
+    // can't tell whether or not it will insert nops. Assume that it will.
+    return AlignAmt + OffsetToAlignment(Offset, AlignAmt);
+  };
+
+  // We need to be careful about the offset of the first block in the function
+  // because it might not have the function's alignment. This happens because,
+  // under the ELFv2 ABI, for functions which require a TOC pointer, we add a
+  // two-instruction sequence to the start of the function.
+  // Note: This needs to be synchronized with the check in
+  // PPCLinuxAsmPrinter::EmitFunctionBodyStart.
+  unsigned InitialOffset = 0;
+  if (Fn.getSubtarget<PPCSubtarget>().isELFv2ABI() &&
+      !Fn.getRegInfo().use_empty(PPC::X2))
+    InitialOffset = 8;
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = InitialOffset;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = &*MFI;
+
+    // The end of the previous block may have extra nops if this block has an
+    // alignment requirement.
+    if (MBB->getNumber() > 0) {
+      unsigned AlignExtra = GetAlignmentAdjustment(*MBB, FuncSize);
+
+      auto &BS = BlockSizes[MBB->getNumber()-1];
+      BS.first += AlignExtra;
+      BS.second = AlignExtra;
+
+      FuncSize += AlignExtra;
+    }
+
+    unsigned BlockSize = 0;
+    for (MachineInstr &MI : *MBB)
+      BlockSize += TII->getInstSizeInBytes(MI);
+
+    BlockSizes[MBB->getNumber()].first = BlockSize;
+    FuncSize += BlockSize;
+  }
+
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to expand any branches in this function. This is a
+  // common case.
+  if (FuncSize < (1 << 15)) {
+    BlockSizes.clear();
+    return false;
+  }
+
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+8
+  //     b MBB
+  //
+  bool MadeChange = true;
+  bool EverMadeChange = false;
+  while (MadeChange) {
+    // Iteratively expand branches until we reach a fixed point.
+    MadeChange = false;
+
+    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+         ++MFI) {
+      MachineBasicBlock &MBB = *MFI;
+      unsigned MBBStartOffset = 0;
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+           I != E; ++I) {
+        MachineBasicBlock *Dest = nullptr;
+        if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
+          Dest = I->getOperand(2).getMBB();
+        else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) &&
+                 !I->getOperand(1).isImm())
+          Dest = I->getOperand(1).getMBB();
+        else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ ||
+                  I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) &&
+                 !I->getOperand(0).isImm())
+          Dest = I->getOperand(0).getMBB();
+
+        if (!Dest) {
+          MBBStartOffset += TII->getInstSizeInBytes(*I);
+          continue;
+        }
+
+        // Determine the offset from the current branch to the destination
+        // block.
+        int BranchSize;
+        if (Dest->getNumber() <= MBB.getNumber()) {
+          // If this is a backwards branch, the delta is the offset from the
+          // start of this block to this branch, plus the sizes of all blocks
+          // from this block to the dest.
+          BranchSize = MBBStartOffset;
+
+          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i].first;
+        } else {
+          // Otherwise, add the size of the blocks between this block and the
+          // dest to the number of bytes left in this block.
+          BranchSize = -MBBStartOffset;
+
+          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i].first;
+        }
+
+        // If this branch is in range, ignore it.
+        if (isInt<16>(BranchSize)) {
+          MBBStartOffset += 4;
+          continue;
+        }
+
+        // Otherwise, we have to expand it to a long branch.
+        MachineInstr &OldBranch = *I;
+        DebugLoc dl = OldBranch.getDebugLoc();
+
+        if (I->getOpcode() == PPC::BCC) {
+          // The BCC operands are:
+          // 0. PPC branch predicate
+          // 1. CR register
+          // 2. Target MBB
+          PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+          unsigned CRReg = I->getOperand(1).getReg();
+
+          // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+          BuildMI(MBB, I, dl, TII->get(PPC::BCC))
+            .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+        } else if (I->getOpcode() == PPC::BC) {
+          unsigned CRBit = I->getOperand(0).getReg();
+          BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2);
+        } else if (I->getOpcode() == PPC::BCn) {
+          unsigned CRBit = I->getOperand(0).getReg();
+          BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2);
+        } else if (I->getOpcode() == PPC::BDNZ) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDNZ8) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDZ) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDZ8) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2);
+        } else {
+          llvm_unreachable("Unhandled branch type!");
+        }
+
+        // Uncond branch to the real destination.
+        I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
+
+        // Remove the old branch from the function.
+        OldBranch.eraseFromParent();
+
+        // Remember that this instruction is 8-bytes, increase the size of the
+        // block by 4, remember to iterate.
+        BlockSizes[MBB.getNumber()].first += 4;
+        MBBStartOffset += 8;
+        ++NumExpanded;
+        MadeChange = true;
+      }
+    }
+
+    if (MadeChange) {
+      // If we're going to iterate again, make sure we've updated our
+      // padding-based contributions to the block sizes.
+      unsigned Offset = InitialOffset;
+      for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+           ++MFI) {
+        MachineBasicBlock *MBB = &*MFI;
+
+        if (MBB->getNumber() > 0) {
+          auto &BS = BlockSizes[MBB->getNumber()-1];
+          BS.first -= BS.second;
+          Offset -= BS.second;
+
+          unsigned AlignExtra = GetAlignmentAdjustment(*MBB, Offset);
+
+          BS.first += AlignExtra;
+          BS.second = AlignExtra;
+
+          Offset += AlignExtra;
+        }
+
+        Offset += BlockSizes[MBB->getNumber()].first;
+      }
+    }
+
+    EverMadeChange |= MadeChange;
+  }
+
+  BlockSizes.clear();
+  // Report a modification only if some branch was actually expanded.
+  return EverMadeChange;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCCState.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCCState.cpp
new file mode 100644
index 000000000000..5510a95430f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCCState.cpp
@@ -0,0 +1,36 @@
+//===---- PPCCCState.cpp - CCState with PowerPC specific extensions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCCCState.h"
+#include "PPCSubtarget.h"
+#include "llvm/IR/Module.h"
+using namespace llvm;
+
+// For every outgoing call operand, in order, note whether the value it was
+// lowered from had type ppcf128.
+void PPCCCState::PreAnalyzeCallOperands(
+    const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  for (const auto &Out : Outs) {
+    const bool FromPPCF128 = Out.ArgVT == llvm::MVT::ppcf128;
+    OriginalArgWasPPCF128.push_back(FromPPCF128);
+  }
+}
+
+// For every incoming formal argument, in order, note whether the value it was
+// lowered from had type ppcf128.
+void PPCCCState::PreAnalyzeFormalArguments(
+    const SmallVectorImpl<ISD::InputArg> &Ins) {
+  for (const auto &In : Ins) {
+    const bool FromPPCF128 = In.ArgVT == llvm::MVT::ppcf128;
+    OriginalArgWasPPCF128.push_back(FromPPCF128);
+  }
+} \ No newline at end of file
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCCState.h b/contrib/llvm/lib/Target/PowerPC/PPCCCState.h
new file mode 100644
index 000000000000..9be9f11dbea3
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCCState.h
@@ -0,0 +1,42 @@
+//===---- PPCCCState.h - CCState with PowerPC specific extensions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCCCSTATE_H
+#define PPCCCSTATE_H
+
+#include "PPCISelLowering.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
+
+/// CCState that additionally records, per analyzed value, whether it was
+/// lowered from a ppcf128 argument.
+class PPCCCState : public CCState {
+public:
+
+  /// Appends one flag per element of Outs: true iff its ArgVT is ppcf128.
+  void
+  PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs);
+  /// Appends one flag per element of Ins: true iff its ArgVT is ppcf128.
+  void
+  PreAnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins);
+
+private:
+
+  // Records whether the value has been lowered from a ppcf128.
+  SmallVector<bool, 4> OriginalArgWasPPCF128;
+
+public:
+  PPCCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+             SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
+      : CCState(CC, isVarArg, MF, locs, C) {}
+
+  /// Returns the flag recorded at index ValNo. ValNo must be smaller than the
+  /// number of flags appended so far. Const-qualified because it only reads.
+  bool WasOriginalArgPPCF128(unsigned ValNo) const {
+    return OriginalArgWasPPCF128[ValNo];
+  }
+  /// Discards all recorded flags.
+  void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
new file mode 100644
index 000000000000..2c62a0f1d909
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -0,0 +1,728 @@
+//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the PPC branch instructions
+// that decrement and test the count register (CTR) (bdnz and friends).
+//
+// The pattern that defines the induction variable can change depending on
+// prior optimizations. For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+//
+// Criteria for CTR loops:
+// - Countable loops (w/ ind. var for a trip count)
+// - Try inner-most loops first
+// - No nested CTR loops.
+// - No function calls in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#ifndef NDEBUG
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ctrloops"
+
+#ifndef NDEBUG
+static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
+#endif
+
+STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
+
+namespace llvm {
+ void initializePPCCTRLoopsPass(PassRegistry&);
+#ifndef NDEBUG
+ void initializePPCCTRLoopsVerifyPass(PassRegistry&);
+#endif
+}
+
+namespace {
+ struct PPCCTRLoops : public FunctionPass {
+
+#ifndef NDEBUG
+ static int Counter; // Only present in builds without NDEBUG.
+#endif
+
+ public:
+ static char ID; // Pass identification.
+
+ PPCCTRLoops() : FunctionPass(ID), TM(nullptr) { // No target machine.
+ initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+ }
+ PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
+ initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ }
+
+ private:
+ bool mightUseCTR(const Triple &TT, BasicBlock *BB);
+ bool convertToCTRLoop(Loop *L);
+
+ private:
+ PPCTargetMachine *TM; // Null when default-constructed.
+ LoopInfo *LI; // Assigned at the start of runOnFunction.
+ ScalarEvolution *SE; // Assigned at the start of runOnFunction.
+ const DataLayout *DL; // Data layout of the function's parent module.
+ DominatorTree *DT; // Assigned at the start of runOnFunction.
+ const TargetLibraryInfo *LibInfo; // Null if the analysis is unavailable.
+ bool PreserveLCSSA; // Result of mustPreserveAnalysisID(LCSSAID).
+ };
+
+ char PPCCTRLoops::ID = 0;
+#ifndef NDEBUG
+ int PPCCTRLoops::Counter = 0;
+#endif
+
+#ifndef NDEBUG
+ struct PPCCTRLoopsVerify : public MachineFunctionPass {
+ public:
+ static char ID; // Pass identification.
+
+ PPCCTRLoopsVerify() : MachineFunctionPass(ID) {
+ initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>(); // Needs machine dominators.
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ MachineDominatorTree *MDT; // NOTE(review): presumably set in runOnMachineFunction - confirm.
+ };
+
+ char PPCCTRLoopsVerify::ID = 0;
+#endif // NDEBUG
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+ false, false)
+
+FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) {
+ return new PPCCTRLoops(TM); // The pass stores a pointer to TM.
+}
+
+#ifndef NDEBUG
+INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+ "PowerPC CTR Loops Verify", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+ "PowerPC CTR Loops Verify", false, false)
+
+FunctionPass *llvm::createPPCCTRLoopsVerify() {
+ return new PPCCTRLoopsVerify(); // Only compiled when NDEBUG is not defined.
+}
+#endif // NDEBUG
+
+/// Fetch the analyses the pass relies on, then attempt the CTR-loop
+/// conversion on each loop in LoopInfo's top-level list. Returns true if any
+/// conversion reported a change.
+bool PPCCTRLoops::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  DL = &F.getParent()->getDataLayout();
+
+  // TargetLibraryInfo is optional; LibInfo stays null without it.
+  LibInfo = nullptr;
+  if (auto *TLIWrapper = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>())
+    LibInfo = &TLIWrapper->getTLI();
+
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+  bool AnyConverted = false;
+  for (LoopInfo::iterator It = LI->begin(), End = LI->end(); It != End; ++It) {
+    Loop *Candidate = *It;
+    if (Candidate->getParentLoop())
+      continue;
+    if (convertToCTRLoop(Candidate))
+      AnyConverted = true;
+  }
+
+  return AnyConverted;
+}
+
+/// Returns true if Ty is an integer type wider than 32 bits (when Is32Bit is
+/// set) or wider than 64 bits (otherwise). Non-integer types yield false.
+static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
+  auto *IntTy = dyn_cast<IntegerType>(Ty);
+  if (!IntTy)
+    return false;
+
+  const unsigned WidthLimit = Is32Bit ? 32U : 64U;
+  return IntTy->getBitWidth() > WidthLimit;
+}
+
+// Computing the address of a thread-local variable turns into a function call
+// under some TLS models. Returns true if MemAddr is, or is a constant that
+// refers to, such a variable.
+static bool memAddrUsesCTR(const PPCTargetMachine *TM,
+                           const Value *MemAddr) {
+  if (const auto *Global = dyn_cast<GlobalValue>(MemAddr)) {
+    if (!Global->isThreadLocal())
+      return false;
+    // Without a target machine the TLS model cannot be queried; answer true.
+    if (!TM)
+      return true;
+    const TLSModel::Model Model = TM->getTLSModel(Global);
+    const bool IsDynamic =
+        Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
+    return IsDynamic;
+  }
+
+  // Not a global itself: recurse into the operands of a constant to find
+  // references to TLS global variables.
+  const auto *ConstAddr = dyn_cast<Constant>(MemAddr);
+  if (!ConstAddr)
+    return false;
+
+  for (const auto &Operand : ConstAddr->operands())
+    if (memAddrUsesCTR(TM, Operand))
+      return true;
+
+  return false;
+}
+
+bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
+ for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+ J != JE; ++J) {
+ if (CallInst *CI = dyn_cast<CallInst>(J)) {
+ if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+ // Inline ASM is okay, unless it clobbers the ctr register.
+ InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+ for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+ InlineAsm::ConstraintInfo &C = CIV[i];
+ if (C.Type != InlineAsm::isInput)
+ for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+ if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+ return true;
+ }
+
+ continue;
+ }
+
+ if (!TM)
+ return true;
+ const TargetLowering *TLI =
+ TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
+
+ if (Function *F = CI->getCalledFunction()) {
+ // Most intrinsics don't become function calls, but some might.
+ // sin, cos, exp and log are always calls.
+ unsigned Opcode = 0;
+ if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+ switch (F->getIntrinsicID()) {
+ default: continue;
+ // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
+ // we're definitely using CTR.
+ case Intrinsic::ppc_is_decremented_ctr_nonzero:
+ case Intrinsic::ppc_mtctr:
+ return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+ !defined(setjmp_undefined_for_msvc)
+# pragma push_macro("setjmp")
+# undef setjmp
+# define setjmp_undefined_for_msvc
+#endif
+
+ case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+# pragma pop_macro("setjmp")
+# undef setjmp_undefined_for_msvc
+#endif
+
+ case Intrinsic::longjmp:
+
+ // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+ // because, although it does clobber the counter register, the
+ // control can't then return to inside the loop unless there is also
+ // an eh_sjlj_setjmp.
+ case Intrinsic::eh_sjlj_setjmp:
+
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ case Intrinsic::powi:
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::pow:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ return true;
+ case Intrinsic::copysign:
+ if (CI->getArgOperand(0)->getType()->getScalarType()->
+ isPPC_FP128Ty())
+ return true;
+ else
+ continue; // ISD::FCOPYSIGN is never a library call.
+ case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
+ case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
+ case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
+ case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
+ case Intrinsic::rint: Opcode = ISD::FRINT; break;
+ case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+ case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
+ case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
+ }
+ }
+
+ // PowerPC does not use [US]DIVREM or other library calls for
+ // operations on regular types which are not otherwise library calls
+ // (i.e. soft float or atomics). If adapting for targets that do,
+ // additional care is required here.
+
+ LibFunc::Func Func;
+ if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+ LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->hasOptimizedCodeGen(Func)) {
+ // Non-read-only functions are never treated as intrinsics.
+ if (!CI->onlyReadsMemory())
+ return true;
+
+ // Conversion happens only for FP calls.
+ if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+ return true;
+
+ switch (Func) {
+ default: return true;
+ case LibFunc::copysign:
+ case LibFunc::copysignf:
+ continue; // ISD::FCOPYSIGN is never a library call.
+ case LibFunc::copysignl:
+ return true;
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ continue; // ISD::FABS is never a library call.
+ case LibFunc::sqrt:
+ case LibFunc::sqrtf:
+ case LibFunc::sqrtl:
+ Opcode = ISD::FSQRT; break;
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ Opcode = ISD::FFLOOR; break;
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ Opcode = ISD::FNEARBYINT; break;
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ Opcode = ISD::FCEIL; break;
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ Opcode = ISD::FRINT; break;
+ case LibFunc::round:
+ case LibFunc::roundf:
+ case LibFunc::roundl:
+ Opcode = ISD::FROUND; break;
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ Opcode = ISD::FTRUNC; break;
+ case LibFunc::fmin:
+ case LibFunc::fminf:
+ case LibFunc::fminl:
+ Opcode = ISD::FMINNUM; break;
+ case LibFunc::fmax:
+ case LibFunc::fmaxf:
+ case LibFunc::fmaxl:
+ Opcode = ISD::FMAXNUM; break;
+ }
+ }
+
+ if (Opcode) {
+ auto &DL = CI->getModule()->getDataLayout();
+ MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(),
+ true);
+ if (VTy == MVT::Other)
+ return true;
+
+ if (TLI->isOperationLegalOrCustom(Opcode, VTy))
+ continue;
+ else if (VTy.isVector() &&
+ TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType()))
+ continue;
+
+ return true;
+ }
+ }
+
+ return true;
+ } else if (isa<BinaryOperator>(J) &&
+ J->getType()->getScalarType()->isPPC_FP128Ty()) {
+ // Most operations on ppc_f128 values become calls.
+ return true;
+ } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+ isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+ CastInst *CI = cast<CastInst>(J);
+ if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+ CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+ isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) ||
+ isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType()))
+ return true;
+ } else if (isLargeIntegerTy(TT.isArch32Bit(),
+ J->getType()->getScalarType()) &&
+ (J->getOpcode() == Instruction::UDiv ||
+ J->getOpcode() == Instruction::SDiv ||
+ J->getOpcode() == Instruction::URem ||
+ J->getOpcode() == Instruction::SRem)) {
+ return true;
+ } else if (TT.isArch32Bit() &&
+ isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+ (J->getOpcode() == Instruction::Shl ||
+ J->getOpcode() == Instruction::AShr ||
+ J->getOpcode() == Instruction::LShr)) {
+ // Only on PPC32, for 128-bit integers (specifically not 64-bit
+ // integers), these might be runtime calls.
+ return true;
+ } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+ // On PowerPC, indirect jumps use the counter register.
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+ if (!TM)
+ return true;
+ const TargetLowering *TLI =
+ TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
+
+ if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+ return true;
+ }
+
+ if (TM->getSubtargetImpl(*BB->getParent())->getTargetLowering()->useSoftFloat()) {
+ switch(J->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FCmp:
+ return true;
+ }
+ }
+
+ for (Value *Operand : J->operands())
+ if (memAddrUsesCTR(TM, Operand))
+ return true;
+ }
+
+ return false;
+}
+
+bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
+ bool MadeChange = false;
+
+ const Triple TT =
+ Triple(L->getHeader()->getParent()->getParent()->getTargetTriple());
+ if (!TT.isArch32Bit() && !TT.isArch64Bit())
+ return MadeChange; // Unknown arch. type.
+
+ // Process nested loops first.
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+ MadeChange |= convertToCTRLoop(*I);
+ DEBUG(dbgs() << "Nested loop converted\n");
+ }
+
+ // If a nested loop has been converted, then we can't convert this loop.
+ if (MadeChange)
+ return MadeChange;
+
+#ifndef NDEBUG
+ // Stop trying after reaching the limit (if any).
+ int Limit = CTRLoopLimit;
+ if (Limit >= 0) {
+ if (Counter >= CTRLoopLimit)
+ return false;
+ Counter++;
+ }
+#endif
+
+ // We don't want to spill/restore the counter register, and so we don't
+ // want to use the counter register if the loop contains calls.
+ for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+ I != IE; ++I)
+ if (mightUseCTR(TT, *I))
+ return MadeChange;
+
+ SmallVector<BasicBlock*, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ BasicBlock *CountedExitBlock = nullptr;
+ const SCEV *ExitCount = nullptr;
+ BranchInst *CountedExitBranch = nullptr;
+ for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
+ IE = ExitingBlocks.end(); I != IE; ++I) {
+ const SCEV *EC = SE->getExitCount(L, *I);
+ DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
+ (*I)->getName() << ": " << *EC << "\n");
+ if (isa<SCEVCouldNotCompute>(EC))
+ continue;
+ if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
+ if (ConstEC->getValue()->isZero())
+ continue;
+ } else if (!SE->isLoopInvariant(EC, L))
+ continue;
+
+ if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32))
+ continue;
+
+ // We now have a loop-invariant count of loop iterations (which is not the
+ // constant zero) for which we know that this loop will not exit via this
+ // exiting block.
+
+ // We need to make sure that this block will run on every loop iteration.
+ // For this to be true, we must dominate all blocks with backedges. Such
+ // blocks are in-loop predecessors to the header block.
+ bool NotAlways = false;
+ for (pred_iterator PI = pred_begin(L->getHeader()),
+ PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
+ if (!L->contains(*PI))
+ continue;
+
+ if (!DT->dominates(*I, *PI)) {
+ NotAlways = true;
+ break;
+ }
+ }
+
+ if (NotAlways)
+ continue;
+
+ // Make sure this block ends with a conditional branch.
+ Instruction *TI = (*I)->getTerminator();
+ if (!TI)
+ continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (!BI->isConditional())
+ continue;
+
+ CountedExitBranch = BI;
+ } else
+ continue;
+
+ // Note that this block may not be the loop latch block, even if the loop
+ // has a latch block.
+ CountedExitBlock = *I;
+ ExitCount = EC;
+ break;
+ }
+
+ if (!CountedExitBlock)
+ return MadeChange;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+
+ // If we don't have a preheader, then insert one. If we already have a
+ // preheader, then we can use it (except if the preheader contains a use of
+ // the CTR register because some such uses might be reordered by the
+ // selection DAG after the mtctr instruction).
+ if (!Preheader || mightUseCTR(TT, Preheader))
+ Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+ if (!Preheader)
+ return MadeChange;
+
+ DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");
+
+ // Insert the count into the preheader and replace the condition used by the
+ // selected branch.
+ MadeChange = true;
+
+ SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt");
+ LLVMContext &C = SE->getContext();
+ Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) :
+ Type::getInt32Ty(C);
+ if (!ExitCount->getType()->isPointerTy() &&
+ ExitCount->getType() != CountType)
+ ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
+ ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
+ Value *ECValue =
+ SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
+
+ IRBuilder<> CountBuilder(Preheader->getTerminator());
+ Module *M = Preheader->getParent()->getParent();
+ Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr,
+ CountType);
+ CountBuilder.CreateCall(MTCTRFunc, ECValue);
+
+ IRBuilder<> CondBuilder(CountedExitBranch);
+ Value *DecFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
+ Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
+ Value *OldCond = CountedExitBranch->getCondition();
+ CountedExitBranch->setCondition(NewCond);
+
+ // The false branch must exit the loop.
+ if (!L->contains(CountedExitBranch->getSuccessor(0)))
+ CountedExitBranch->swapSuccessors();
+
+ // The old condition may be dead now, and may have even created a dead PHI
+ // (the original induction variable).
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ DeleteDeadPHIs(CountedExitBlock);
+
+ ++NumCTRLoops;
+ return MadeChange;
+}
+
+#ifndef NDEBUG
+static bool clobbersCTR(const MachineInstr &MI) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8))
+ return true;
+ } else if (MO.isRegMask()) {
+ if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool verifyCTRBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator BI = I;
+ SmallSet<MachineBasicBlock *, 16> Visited;
+ SmallVector<MachineBasicBlock *, 8> Preds;
+ bool CheckPreds;
+
+ if (I == MBB->begin()) {
+ Visited.insert(MBB);
+ goto queue_preds;
+ } else
+ --I;
+
+check_block:
+ Visited.insert(MBB);
+ if (I == MBB->end())
+ goto queue_preds;
+
+ CheckPreds = true;
+ for (MachineBasicBlock::iterator IE = MBB->begin();; --I) {
+ unsigned Opc = I->getOpcode();
+ if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) {
+ CheckPreds = false;
+ break;
+ }
+
+ if (I != BI && clobbersCTR(*I)) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" <<
+ MBB->getFullName() << ") instruction " << *I <<
+ " clobbers CTR, invalidating " << "BB#" <<
+ BI->getParent()->getNumber() << " (" <<
+ BI->getParent()->getFullName() << ") instruction " <<
+ *BI << "\n");
+ return false;
+ }
+
+ if (I == IE)
+ break;
+ }
+
+ if (!CheckPreds && Preds.empty())
+ return true;
+
+ if (CheckPreds) {
+queue_preds:
+ if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) {
+ DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" <<
+ BI->getParent()->getNumber() << " (" <<
+ BI->getParent()->getFullName() << ") instruction " <<
+ *BI << "\n");
+ return false;
+ }
+
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PIE = MBB->pred_end(); PI != PIE; ++PI)
+ Preds.push_back(*PI);
+ }
+
+ do {
+ MBB = Preds.pop_back_val();
+ if (!Visited.count(MBB)) {
+ I = MBB->getLastNonDebugInstr();
+ goto check_block;
+ }
+ } while (!Preds.empty());
+
+ return true;
+}
+
+bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
+ MDT = &getAnalysis<MachineDominatorTree>();
+
+ // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before
+ // any other instructions that might clobber the ctr register.
+ for (MachineFunction::iterator I = MF.begin(), IE = MF.end();
+ I != IE; ++I) {
+ MachineBasicBlock *MBB = &*I;
+ if (!MDT->isReachableFromEntry(MBB))
+ continue;
+
+ for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(),
+ MIIE = MBB->end(); MII != MIIE; ++MII) {
+ unsigned Opc = MII->getOpcode();
+ if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ ||
+ Opc == PPC::BDZ8 || Opc == PPC::BDZ)
+ if (!verifyCTRBranch(MBB, MII))
+ llvm_unreachable("Invalid PPC CTR loop!");
+ }
+ }
+
+ return false;
+}
+#endif // NDEBUG
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h
new file mode 100644
index 000000000000..eb904a858592
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h
@@ -0,0 +1,35 @@
+//=== PPCCallingConv.h - PPC Custom Calling Convention Routines -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the PPC Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_PPCCALLINGCONV_H
+#define LLVM_LIB_TARGET_PPC_PPCCALLINGCONV_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+inline bool CC_PPC_AnyReg_Error(unsigned &, MVT &, MVT &,
+ CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+ CCState &) {
+ llvm_unreachable("The AnyReg calling convention is only supported by the " \
+ "stackmap and patchpoint intrinsics.");
+ // Gracefully fall back to the PPC C calling convention on Release builds.
+ return false;
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
new file mode 100644
index 000000000000..a4f4c8688cc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -0,0 +1,284 @@
+//===- PPCCallingConv.td - Calling Conventions for PowerPC -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the PowerPC 32- and 64-bit
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("static_cast<const PPCSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).",
+ F),
+ A>;
+class CCIfNotSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("!static_cast<const PPCSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).",
+ F),
+ A>;
+class CCIfOrigArgWasNotPPCF128<CCAction A>
+ : CCIf<"!static_cast<PPCCCState *>(&State)->WasOriginalArgPPCF128(ValNo)",
+ A>;
+class CCIfOrigArgWasPPCF128<CCAction A>
+ : CCIf<"static_cast<PPCCCState *>(&State)->WasOriginalArgPPCF128(ValNo)",
+ A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// PPC64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fall back to the PPC C calling convention.
+def RetCC_PPC64_AnyReg : CallingConv<[
+ CCCustom<"CC_PPC_AnyReg_Error">
+]>;
+
+// Return-value convention for PowerPC
+def RetCC_PPC : CallingConv<[
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
+
+ // On PPC64, integer return values are always promoted to i64
+ CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+ CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>,
+
+ CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+ CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
+ CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+
+ // Floating point types returned as "direct" go into F1 .. F8; note that
+ // only the ELFv2 ABI fully utilizes all these registers.
+ CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+
+ // QPX vectors are returned in QF1 and QF2.
+ CCIfType<[v4f64, v4f32, v4i1],
+ CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
+
+ // Vector types returned as "direct" go into V2 .. V9; note that only the
+ // ELFv2 ABI fully utilizes all these registers.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
+ CCIfSubtarget<"hasAltivec()",
+ CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>
+]>;
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fall back to the PPC C calling convention.
+def CC_PPC64_AnyReg : CallingConv<[
+ CCCustom<"CC_PPC_AnyReg_Error">
+]>;
+
+// Note that we don't currently have calling conventions for 64-bit
+// PowerPC, but handle all the complexities of the ABI in the lowering
+// logic. FIXME: See if the logic can be simplified with use of CCs.
+// This may require some extensions to current table generation.
+
+// Simple calling convention for 64-bit ELF PowerPC fast isel.
+// Only handle ints and floats. All ints are promoted to i64.
+// Vector types and quadword ints are not handled.
+def CC_PPC64_ELF_FIS : CallingConv<[
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_PPC64_AnyReg>>,
+
+ CCIfType<[i1], CCPromoteToType<i64>>,
+ CCIfType<[i8], CCPromoteToType<i64>>,
+ CCIfType<[i16], CCPromoteToType<i64>>,
+ CCIfType<[i32], CCPromoteToType<i64>>,
+ CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+ CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>
+]>;
+
+// Simple return-value convention for 64-bit ELF PowerPC fast isel.
+// All small ints are promoted to i64. Vector types, quadword ints,
+// and multiple register returns are "supported" to avoid compile
+// errors, but none are handled by the fast selector.
+def RetCC_PPC64_ELF_FIS : CallingConv<[
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
+
+ CCIfType<[i1], CCPromoteToType<i64>>,
+ CCIfType<[i8], CCPromoteToType<i64>>,
+ CCIfType<[i16], CCPromoteToType<i64>>,
+ CCIfType<[i32], CCPromoteToType<i64>>,
+ CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
+ CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+ CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[v4f64, v4f32, v4i1],
+ CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
+ CCIfSubtarget<"hasAltivec()",
+ CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC System V Release 4 32-bit ABI
+//===----------------------------------------------------------------------===//
+
+def CC_PPC32_SVR4_Common : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i32>>,
+
+ // The ABI requires i64 to be passed in two adjacent registers with the first
+ // register having an odd register number.
+ CCIfType<[i32],
+ CCIfSplit<CCIfSubtarget<"useSoftFloat()",
+ CCIfOrigArgWasNotPPCF128<
+ CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>>>,
+
+ CCIfType<[i32],
+ CCIfSplit<CCIfNotSubtarget<"useSoftFloat()",
+ CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>>,
+ CCIfSplit<CCIfSubtarget<"useSoftFloat()",
+ CCIfOrigArgWasPPCF128<CCCustom<
+ "CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128">>>>,
+
+ // The 'nest' parameter, if any, is passed in R11.
+ CCIfNest<CCAssignToReg<[R11]>>,
+
+ // The first 8 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+
+ // Make sure the i64 words from a long double are either both passed in
+ // registers or both passed on the stack.
+ CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>,
+
+ // FP values are passed in F1 - F8.
+ CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+
+ // Split arguments have an alignment of 8 bytes on the stack.
+ CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>,
+
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+
+ // Floats are stored in double precision format, thus they have the same
+ // alignment and size as doubles.
+ CCIfType<[f32,f64], CCAssignToStack<8, 8>>,
+
+ // QPX vectors that are stored in double precision need 32-byte alignment.
+ CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>
+]>;
+
+// This calling convention puts vector arguments always on the stack. It is used
+// to assign vector arguments which belong to the variable portion of the
+// parameter list of a variable argument function.
+def CC_PPC32_SVR4_VarArg : CallingConv<[
+ CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;
+
+// In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to
+// put vector arguments in vector registers before putting them on the stack.
+def CC_PPC32_SVR4 : CallingConv<[
+ // QPX vectors mirror the scalar FP convention.
+ CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()",
+ CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>,
+
+ // The first 12 Vector arguments are passed in AltiVec registers.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
+ CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
+ V8, V9, V10, V11, V12, V13]>>>,
+
+ CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;
+
+// Helper "calling convention" to handle aggregate by value arguments.
+// Aggregate by value arguments are always placed in the local variable space
+// of the caller. This calling convention is only used to assign those stack
+// offsets in the caller's stack frame.
+//
+// Still, the address of the aggregate copy in the caller's stack frame is
+// passed in a GPR (or in the parameter list area if all GPRs are allocated)
+// from the caller to the callee. The location for the address argument is
+// assigned by the CC_PPC32_SVR4 calling convention.
+//
+// The only purpose of CC_PPC32_SVR4_Custom_Dummy is to skip arguments which are
+// not passed by value.
+
+def CC_PPC32_SVR4_ByVal : CallingConv<[
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ CCCustom<"CC_PPC32_SVR4_Custom_Dummy">
+]>;
+
+def CSR_Altivec : CalleeSavedRegs<(add V20, V21, V22, V23, V24, V25, V26, V27,
+ V28, V29, V30, V31)>;
+
+def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28,
+ R29, R30, R31, F14, F15, F16, F17, F18,
+ F19, F20, F21, F22, F23, F24, F25, F26,
+ F27, F28, F29, F30, F31, CR2, CR3, CR4
+ )>;
+
+def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>;
+
+def CSR_SVR432 : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28,
+ R29, R30, R31, F14, F15, F16, F17, F18,
+ F19, F20, F21, F22, F23, F24, F25, F26,
+ F27, F28, F29, F30, F31, CR2, CR3, CR4
+ )>;
+
+def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>;
+
+def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
+ X21, X22, X23, X24, X25, X26, X27, X28,
+ X29, X30, X31, F14, F15, F16, F17, F18,
+ F19, F20, F21, F22, F23, F24, F25, F26,
+ F27, F28, F29, F30, F31, CR2, CR3, CR4
+ )>;
+
+def CSR_Darwin64_Altivec : CalleeSavedRegs<(add CSR_Darwin64, CSR_Altivec)>;
+
+def CSR_SVR464 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
+ X21, X22, X23, X24, X25, X26, X27, X28,
+ X29, X30, X31, F14, F15, F16, F17, F18,
+ F19, F20, F21, F22, F23, F24, F25, F26,
+ F27, F28, F29, F30, F31, CR2, CR3, CR4
+ )>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_SRV464_TLS_PE : CalleeSavedRegs<(add)>;
+
+def CSR_SVR464_ViaCopy : CalleeSavedRegs<(add CSR_SVR464)>;
+
+def CSR_SVR464_Altivec : CalleeSavedRegs<(add CSR_SVR464, CSR_Altivec)>;
+
+def CSR_SVR464_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_Altivec)>;
+
+def CSR_SVR464_R2 : CalleeSavedRegs<(add CSR_SVR464, X2)>;
+
+def CSR_SVR464_R2_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2)>;
+
+def CSR_SVR464_R2_Altivec : CalleeSavedRegs<(add CSR_SVR464_Altivec, X2)>;
+
+def CSR_SVR464_R2_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2_Altivec)>;
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10),
+ (sequence "X%u", 14, 31),
+ (sequence "F%u", 0, 31),
+ (sequence "CR%u", 0, 7))>;
+
+def CSR_64_AllRegs_Altivec : CalleeSavedRegs<(add CSR_64_AllRegs,
+ (sequence "V%u", 0, 31))>;
+
+def CSR_64_AllRegs_VSX : CalleeSavedRegs<(add CSR_64_AllRegs_Altivec,
+ (sequence "VSL%u", 0, 31))>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
new file mode 100644
index 000000000000..6bd229625fc3
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -0,0 +1,213 @@
+//===------------- PPCEarlyReturn.cpp - Form Early Returns ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass that forms early (predicated) returns. If-conversion handles some of
+// this, but this pass picks up some remaining cases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-early-ret"
+STATISTIC(NumBCLR, "Number of early conditional returns");
+STATISTIC(NumBLR, "Number of early returns");
+
+namespace llvm {
+ void initializePPCEarlyReturnPass(PassRegistry&);
+}
+
+namespace {
+ // PPCEarlyReturn pass - For simple functions without epilogue code, move
+ // returns up, and create conditional returns, to avoid unnecessary
+ // branch-to-blr sequences.
+ struct PPCEarlyReturn : public MachineFunctionPass {
+ static char ID;
+ PPCEarlyReturn() : MachineFunctionPass(ID) {
+ initializePPCEarlyReturnPass(*PassRegistry::getPassRegistry());
+ }
+
+ const TargetInstrInfo *TII;
+
+protected:
+ bool processBlock(MachineBasicBlock &ReturnMBB) {
+ bool Changed = false;
+
+ MachineBasicBlock::iterator I = ReturnMBB.begin();
+ I = ReturnMBB.SkipPHIsLabelsAndDebug(I);
+
+ // The block must be essentially empty except for the blr.
+ if (I == ReturnMBB.end() ||
+ (I->getOpcode() != PPC::BLR && I->getOpcode() != PPC::BLR8) ||
+ I != ReturnMBB.getLastNonDebugInstr())
+ return Changed;
+
+ SmallVector<MachineBasicBlock*, 8> PredToRemove;
+ for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(),
+ PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) {
+ bool OtherReference = false, BlockChanged = false;
+
+ if ((*PI)->empty())
+ continue;
+
+ for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) {
+ if (J == (*PI)->end())
+ break;
+
+ if (J->getOpcode() == PPC::B) {
+ if (J->getOperand(0).getMBB() == &ReturnMBB) {
+ // This is an unconditional branch to the return. Replace the
+ // branch with a blr.
+ BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode()))
+ .copyImplicitOps(*I);
+ MachineBasicBlock::iterator K = J--;
+ K->eraseFromParent();
+ BlockChanged = true;
+ ++NumBLR;
+ continue;
+ }
+ } else if (J->getOpcode() == PPC::BCC) {
+ if (J->getOperand(2).getMBB() == &ReturnMBB) {
+ // This is a conditional branch to the return. Replace the branch
+ // with a bclr.
+ BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
+ .addImm(J->getOperand(0).getImm())
+ .addReg(J->getOperand(1).getReg())
+ .copyImplicitOps(*I);
+ MachineBasicBlock::iterator K = J--;
+ K->eraseFromParent();
+ BlockChanged = true;
+ ++NumBCLR;
+ continue;
+ }
+ } else if (J->getOpcode() == PPC::BC || J->getOpcode() == PPC::BCn) {
+ if (J->getOperand(1).getMBB() == &ReturnMBB) {
+ // This is a conditional branch to the return. Replace the branch
+ // with a bclr.
+ BuildMI(
+ **PI, J, J->getDebugLoc(),
+ TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn))
+ .addReg(J->getOperand(0).getReg())
+ .copyImplicitOps(*I);
+ MachineBasicBlock::iterator K = J--;
+ K->eraseFromParent();
+ BlockChanged = true;
+ ++NumBCLR;
+ continue;
+ }
+ } else if (J->isBranch()) {
+ if (J->isIndirectBranch()) {
+ if (ReturnMBB.hasAddressTaken())
+ OtherReference = true;
+ } else
+ for (unsigned i = 0; i < J->getNumOperands(); ++i)
+ if (J->getOperand(i).isMBB() &&
+ J->getOperand(i).getMBB() == &ReturnMBB)
+ OtherReference = true;
+ } else if (!J->isTerminator() && !J->isDebugValue())
+ break;
+
+ if (J == (*PI)->begin())
+ break;
+
+ --J;
+ }
+
+ if ((*PI)->canFallThrough() && (*PI)->isLayoutSuccessor(&ReturnMBB))
+ OtherReference = true;
+
+ // Predecessors are stored in a vector and can't be removed here.
+ if (!OtherReference && BlockChanged) {
+ PredToRemove.push_back(*PI);
+ }
+
+ if (BlockChanged)
+ Changed = true;
+ }
+
+ for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i)
+ PredToRemove[i]->removeSuccessor(&ReturnMBB, true);
+
+ if (Changed && !ReturnMBB.hasAddressTaken()) {
+ // We now might be able to merge this blr-only block into its
+ // by-layout predecessor.
+ if (ReturnMBB.pred_size() == 1) {
+ MachineBasicBlock &PrevMBB = **ReturnMBB.pred_begin();
+ if (PrevMBB.isLayoutSuccessor(&ReturnMBB) && PrevMBB.canFallThrough()) {
+ // Move the blr into the preceding block.
+ PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I);
+ PrevMBB.removeSuccessor(&ReturnMBB, true);
+ }
+ }
+
+ if (ReturnMBB.pred_empty())
+ ReturnMBB.eraseFromParent();
+ }
+
+ return Changed;
+ }
+
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ TII = MF.getSubtarget().getInstrInfo();
+
+ bool Changed = false;
+
+ // If the function does not have at least two blocks, then there is
+ // nothing to do.
+ if (MF.size() < 2)
+ return Changed;
+
+ for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+ MachineBasicBlock &B = *I++;
+ if (processBlock(B))
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE,
+ "PowerPC Early-Return Creation", false, false)
+
+char PPCEarlyReturn::ID = 0;
+FunctionPass*
+llvm::createPPCEarlyReturnPass() { return new PPCEarlyReturn(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
new file mode 100644
index 000000000000..9b91b9ab8f82
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -0,0 +1,2374 @@
+//===-- PPCFastISel.cpp - PowerPC FastISel implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// PPCGenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCCallingConv.h"
+#include "PPCCCState.h"
+#include "PPCISelLowering.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+//===----------------------------------------------------------------------===//
+//
+// TBD:
+// fastLowerArguments: Handle simple cases.
+// PPCMaterializeGV: Handle TLS.
+// SelectCall: Handle function pointers.
+// SelectCall: Handle multi-register return values.
+// SelectCall: Optimize away nops for local calls.
+// processCallArgs: Handle bit-converted arguments.
+// finishCall: Handle multi-register return values.
+// PPCComputeAddress: Handle parameter references as FrameIndex's.
+// PPCEmitCmp: Handle immediate as operand 1.
+// SelectCall: Handle small byval arguments.
+// SelectIntrinsicCall: Implement.
+// SelectSelect: Implement.
+// Consider factoring isTypeLegal into the base class.
+// Implement switches and jump tables.
+//
+//===----------------------------------------------------------------------===//
+using namespace llvm;
+
+#define DEBUG_TYPE "ppcfastisel"
+
+namespace {
+
+// Describes a memory operand while fast-isel is computing it: a base (either
+// a register or a frame index, selected by BaseType) plus a constant byte
+// offset.
+typedef struct Address {
+  // Selects which member of Base is meaningful.
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg; // Valid when BaseType == RegBase; 0 means "not set yet".
+    int FI;       // Valid when BaseType == FrameIndexBase.
+  } Base;
+
+  // Constant displacement from the base. A fixed-width 64-bit type is used so
+  // that offset arithmetic does not depend on the width of the host's `long`
+  // (which is only 32 bits on some hosts).
+  int64_t Offset;
+
+  // Innocuous defaults for our address: register base, no register, offset 0.
+  Address()
+    : BaseType(RegBase), Offset(0) {
+    Base.Reg = 0;
+  }
+} Address;
+
+// PowerPC implementation of the FastISel interface. The class declares the
+// FastISel hooks it overrides, one Select* routine per supported IR
+// instruction kind, and a set of PPC* helpers for emitting loads, stores,
+// compares, extensions and constant materialization.
+class PPCFastISel final : public FastISel {
+
+ // Cached per-function objects, all obtained from FuncInfo in the
+ // constructor below.
+ const TargetMachine &TM;
+ const PPCSubtarget *PPCSubTarget;
+ PPCFunctionInfo *PPCFuncInfo;
+ const TargetInstrInfo &TII;
+ const TargetLowering &TLI;
+ LLVMContext *Context;
+
+ public:
+ // Note: TII and TLI are initialized from PPCSubTarget, so PPCSubTarget must
+ // remain declared before them in the member list above.
+ explicit PPCFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo)
+ : FastISel(FuncInfo, LibInfo), TM(FuncInfo.MF->getTarget()),
+ PPCSubTarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()),
+ PPCFuncInfo(FuncInfo.MF->getInfo<PPCFunctionInfo>()),
+ TII(*PPCSubTarget->getInstrInfo()),
+ TLI(*PPCSubTarget->getTargetLowering()),
+ Context(&FuncInfo.Fn->getContext()) {}
+
+ // Backend specific FastISel code.
+ private:
+ bool fastSelectInstruction(const Instruction *I) override;
+ unsigned fastMaterializeConstant(const Constant *C) override;
+ unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+ bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) override;
+ bool fastLowerArguments() override;
+ unsigned fastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override;
+ // The three fastEmitInst_* declarations below are not marked `override`.
+ unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ uint64_t Imm);
+ unsigned fastEmitInst_r(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill);
+ unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+
+ bool fastLowerCall(CallLoweringInfo &CLI) override;
+
+ // Instruction selection routines.
+ // Each returns true if the instruction was selected and false if it could
+ // not be handled here.
+ private:
+ bool SelectLoad(const Instruction *I);
+ bool SelectStore(const Instruction *I);
+ bool SelectBranch(const Instruction *I);
+ bool SelectIndirectBr(const Instruction *I);
+ bool SelectFPExt(const Instruction *I);
+ bool SelectFPTrunc(const Instruction *I);
+ bool SelectIToFP(const Instruction *I, bool IsSigned);
+ bool SelectFPToI(const Instruction *I, bool IsSigned);
+ bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+ bool SelectRet(const Instruction *I);
+ bool SelectTrunc(const Instruction *I);
+ bool SelectIntExt(const Instruction *I);
+
+ // Utility routines.
+ private:
+ bool isTypeLegal(Type *Ty, MVT &VT);
+ bool isLoadTypeLegal(Type *Ty, MVT &VT);
+ bool isValueAvailable(const Value *V) const;
+ // True when RC is exactly the VSFRC register class.
+ bool isVSFRCRegClass(const TargetRegisterClass *RC) const {
+ return RC->getID() == PPC::VSFRCRegClassID;
+ }
+ // True when RC is exactly the VSSRC register class.
+ bool isVSSRCRegClass(const TargetRegisterClass *RC) const {
+ return RC->getID() == PPC::VSSRCRegClassID;
+ }
+ // Note: the out-of-line definition names these parameters SrcValue1,
+ // SrcValue2 and IsZExt.
+ bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
+ bool isZExt, unsigned DestReg);
+ bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ const TargetRegisterClass *RC, bool IsZExt = true,
+ unsigned FP64LoadOpc = PPC::LFD);
+ bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
+ bool PPCComputeAddress(const Value *Obj, Address &Addr);
+ void PPCSimplifyAddress(Address &Addr, bool &UseOffset,
+ unsigned &IndexReg);
+ bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ unsigned DestReg, bool IsZExt);
+ unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
+ unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
+ unsigned PPCMaterializeInt(const ConstantInt *CI, MVT VT,
+ bool UseSExt = true);
+ unsigned PPCMaterialize32BitInt(int64_t Imm,
+ const TargetRegisterClass *RC);
+ unsigned PPCMaterialize64BitInt(int64_t Imm,
+ const TargetRegisterClass *RC);
+ unsigned PPCMoveToIntReg(const Instruction *I, MVT VT,
+ unsigned SrcReg, bool IsSigned);
+ unsigned PPCMoveToFPReg(MVT VT, unsigned SrcReg, bool IsSigned);
+
+ // Call handling routines.
+ private:
+ bool processCallArgs(SmallVectorImpl<Value*> &Args,
+ SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+ SmallVectorImpl<unsigned> &RegArgs,
+ CallingConv::ID CC,
+ unsigned &NumBytes,
+ bool IsVarArg);
+ bool finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes);
+ LLVM_ATTRIBUTE_UNUSED CCAssignFn *usePPC32CCs(unsigned Flag);
+
+ // Tablegen-generated members are textually included into the class body.
+ private:
+ #include "PPCGenFastISel.inc"
+
+};
+
+} // end anonymous namespace
+
+#include "PPCGenCallingConv.inc"
+
+// Exists only to reference the calling-convention functions pulled in from
+// PPCGenCallingConv.inc, which silences unused-function warnings. Flag picks
+// which one is returned; any value other than 1, 2 or 3 yields RetCC_PPC.
+CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
+  switch (Flag) {
+  case 1:
+    return CC_PPC32_SVR4;
+  case 2:
+    return CC_PPC32_SVR4_ByVal;
+  case 3:
+    return CC_PPC32_SVR4_VarArg;
+  default:
+    return RetCC_PPC;
+  }
+}
+
+// Map an IR compare predicate to the PPC branch predicate that tests it.
+// Returns an empty Optional for predicates that this mapping does not
+// support (see the comments on the individual cases below).
+static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
+ switch (Pred) {
+ // These are not representable with any single compare.
+ case CmpInst::FCMP_FALSE:
+ case CmpInst::FCMP_TRUE:
+ // Major concern about the following 6 cases is NaN result. The comparison
+ // result consists of 4 bits, indicating lt, eq, gt and un (unordered),
+ // only one of which will be set. The result is generated by fcmpu
+ // instruction. However, bc instruction only inspects one of the first 3
+ // bits, so when un is set, bc instruction may jump to an undesired
+ // place.
+ //
+ // More specifically, if we expect an unordered comparison and un is set, we
+ // expect to always go to true branch; in such case UEQ, UGT and ULT still
+ // give false, which are undesired; but UNE, UGE, ULE happen to give true,
+ // since they are tested by inspecting !eq, !lt, !gt, respectively.
+ //
+ // Similarly, for ordered comparison, when un is set, we always expect the
+ // result to be false. In such case OGT, OLT and OEQ is good, since they are
+ // actually testing GT, LT, and EQ respectively, which are false. OGE, OLE
+ // and ONE are tested through !lt, !gt and !eq, and these are true.
+ case CmpInst::FCMP_UEQ:
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ONE:
+ default:
+ // All of the above, and any predicate not listed below, are unsupported.
+ return Optional<PPC::Predicate>();
+
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::ICMP_EQ:
+ return PPC::PRED_EQ;
+
+ // Signed and unsigned integer predicates share one PPC predicate here;
+ // the signedness is not encoded in the returned value.
+ case CmpInst::FCMP_OGT:
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_SGT:
+ return PPC::PRED_GT;
+
+ case CmpInst::FCMP_UGE:
+ case CmpInst::ICMP_UGE:
+ case CmpInst::ICMP_SGE:
+ return PPC::PRED_GE;
+
+ case CmpInst::FCMP_OLT:
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_SLT:
+ return PPC::PRED_LT;
+
+ case CmpInst::FCMP_ULE:
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_SLE:
+ return PPC::PRED_LE;
+
+ case CmpInst::FCMP_UNE:
+ case CmpInst::ICMP_NE:
+ return PPC::PRED_NE;
+
+ case CmpInst::FCMP_ORD:
+ return PPC::PRED_NU;
+
+ case CmpInst::FCMP_UNO:
+ return PPC::PRED_UN;
+ }
+}
+
+// Decide whether fast-isel can handle the IR type Ty. On success VT receives
+// the corresponding simple machine value type. VT is left untouched when the
+// type is unknown or not simple, and is still written when the type is simple
+// but not legal for the target.
+// FIXME: Copied directly from ARM -- factor into base class?
+bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT ValueTy = TLI.getValueType(DL, Ty, true);
+
+  // Bail out on anything that is not a known, simple value type.
+  const bool IsKnownSimple = ValueTy.isSimple() && !(ValueTy == MVT::Other);
+  if (!IsKnownSimple)
+    return false;
+
+  // A legal type is one that a register can hold directly.
+  VT = ValueTy.getSimpleVT();
+  return TLI.isTypeLegal(VT);
+}
+
+// Like isTypeLegal, but for values moved to or from memory: in addition to
+// the legal types, i8, i16 and i32 are accepted because they can be sign- or
+// zero-extended to a basic operation. VT receives the machine value type.
+bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT))
+    return true;
+
+  // Not directly legal: accept only the small integer types.
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// A value is "available" if it is not an instruction at all, or if it is an
+// instruction whose parent block maps to the machine block currently being
+// selected.
+bool PPCFastISel::isValueAvailable(const Value *V) const {
+  const auto *Inst = dyn_cast<Instruction>(V);
+  if (!Inst)
+    return true;
+
+  return FuncInfo.MBBMap[Inst->getParent()] == FuncInfo.MBB;
+}
+
+// Given a value Obj, create an Address object Addr that represents its
+// address. Return false if we can't handle it.
+//
+// The routine looks through bitcasts and no-op inttoptr/ptrtoint casts, folds
+// constant GEP indices into Addr.Offset, and turns static allocas into frame
+// index bases. Anything else is materialized into a base register.
+bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    // Only consult the static-alloca map with a pointer that really is an
+    // AllocaInst: down-casting an arbitrary instruction with static_cast is
+    // undefined behaviour, and a non-alloca can never be in the map anyway.
+    const AllocaInst *StaticAI = dyn_cast<AllocaInst>(I);
+    if ((StaticAI && FuncInfo.StaticAllocaMap.count(StaticAI)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+    default:
+      break;
+    case Instruction::BitCast:
+      // Look through bitcasts.
+      return PPCComputeAddress(U->getOperand(0), Addr);
+    case Instruction::IntToPtr:
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+          TLI.getPointerTy(DL))
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::PtrToInt:
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::GetElementPtr: {
+      // Remember the incoming state so it can be restored if the base of the
+      // GEP turns out not to be computable.
+      Address SavedAddr = Addr;
+      // Accumulate in the same type as Address::Offset so the two can never
+      // disagree in width.
+      auto TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end();
+           II != IE; ++II, ++GTI) {
+        const Value *Op = *II;
+        if (StructType *STy = GTI.getStructTypeOrNull()) {
+          // Struct field: the index is a constant selecting the field.
+          const StructLayout *SL = DL.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          // Sequential index: scale by the allocation size of the element.
+          uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+          for (;;) {
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+              break;
+            }
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
+              ConstantInt *CI =
+                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Iterate on the other operand.
+              Op = cast<AddOperator>(Op)->getOperand(0);
+              continue;
+            }
+            // Unsupported
+            goto unsupported_gep;
+          }
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (PPCComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
+      Addr = SavedAddr;
+
+      unsupported_gep:
+      break;
+    }
+    case Instruction::Alloca: {
+      const AllocaInst *AI = cast<AllocaInst>(Obj);
+      DenseMap<const AllocaInst*, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+      if (SI != FuncInfo.StaticAllocaMap.end()) {
+        Addr.BaseType = Address::FrameIndexBase;
+        Addr.Base.FI = SI->second;
+        return true;
+      }
+      break;
+    }
+  }
+
+  // FIXME: References to parameters fall through to the behavior
+  // below.  They should be able to reference a frame index since
+  // they are stored to the stack, so we can get "ld rx, offset(r1)"
+  // instead of "addi ry, r1, offset / ld rx, 0(ry)".  Obj will
+  // just contain the parameter.  Try to handle this with a FI.
+
+  // Try to get this in a register if nothing else has worked.
+  if (Addr.Base.Reg == 0)
+    Addr.Base.Reg = getRegForValue(Obj);
+
+  // Prevent assignment of base register to X0, which is inappropriate
+  // for loads and stores alike.
+  if (Addr.Base.Reg != 0)
+    MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  return Addr.Base.Reg != 0;
+}
+
+// Adjust an address that cannot be encoded directly. UseOffset is cleared
+// when the offset does not fit a signed 16-bit field. Whenever UseOffset ends
+// up false, a frame-index base is first moved into a register, and the offset
+// is then materialized into IndexReg so an indexed form can be used.
+void PPCFastISel::PPCSimplifyAddress(Address &Addr, bool &UseOffset,
+                                     unsigned &IndexReg) {
+
+  // The displacement form needs both the caller's consent and an offset that
+  // fits the instruction field.
+  UseOffset = UseOffset && isInt<16>(Addr.Offset);
+  if (UseOffset)
+    return;
+
+  // A frame index cannot be combined with an index register, so compute the
+  // slot's address into a register and switch to a register base. This should
+  // almost never happen.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    const unsigned FrameAddrReg =
+      createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
+            FrameAddrReg).addFrameIndex(Addr.Base.FI).addImm(0);
+    Addr.BaseType = Address::RegBase;
+    Addr.Base.Reg = FrameAddrReg;
+  }
+
+  // Put the offset into a register for the indexed form.
+  const ConstantInt *OffsetConst = ConstantInt::getSigned(
+      Type::getInt64Ty(*Context), static_cast<int64_t>(Addr.Offset));
+  IndexReg = PPCMaterializeInt(OffsetConst, MVT::i64);
+  assert(IndexReg && "Unexpected error in PPCMaterializeInt!");
+}
+
+// Emit a load instruction if possible, returning true if we succeeded,
+// otherwise false. See commentary below for how the register class of
+// the load is determined.
+//
+// VT is the type being loaded. ResultReg is both an input (non-zero means
+// "load into this register") and an output (set to the created register when
+// it was zero on entry). Addr may be rewritten by PPCSimplifyAddress. IsZExt
+// selects zero- versus sign-extending opcodes for i16/i32, and FP64LoadOpc is
+// the opcode used for f64 loads.
+bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ const TargetRegisterClass *RC,
+ bool IsZExt, unsigned FP64LoadOpc) {
+ unsigned Opc;
+ bool UseOffset = true;
+
+ // If ResultReg is given, it determines the register class of the load.
+ // Otherwise, RC is the register class to use. If the result of the
+ // load isn't anticipated in this block, both may be zero, in which
+ // case we must make a conservative guess. In particular, don't assign
+ // R0 or X0 to the result register, as the result may be used in a load,
+ // store, add-immediate, or isel that won't permit this. (Though
+ // perhaps the spill and reload of live-exit values would handle this?)
+ const TargetRegisterClass *UseRC =
+ (ResultReg ? MRI.getRegClass(ResultReg) :
+ (RC ? RC :
+ (VT == MVT::f64 ? &PPC::F8RCRegClass :
+ (VT == MVT::f32 ? &PPC::F4RCRegClass :
+ (VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+ &PPC::GPRC_and_GPRC_NOR0RegClass)))));
+
+ // Selects between the 32-bit and the "8"-suffixed opcode variants below.
+ bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+ switch (VT.SimpleTy) {
+ default: // e.g., vector types not handled
+ return false;
+ case MVT::i8:
+ // Byte loads ignore IsZExt: only the LBZ/LBZ8 opcodes are used.
+ Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8;
+ break;
+ case MVT::i16:
+ Opc = (IsZExt ? (Is32BitInt ? PPC::LHZ : PPC::LHZ8)
+ : (Is32BitInt ? PPC::LHA : PPC::LHA8));
+ break;
+ case MVT::i32:
+ Opc = (IsZExt ? (Is32BitInt ? PPC::LWZ : PPC::LWZ8)
+ : (Is32BitInt ? PPC::LWA_32 : PPC::LWA));
+ // LWA/LWA_32 are only used in displacement form when the offset is a
+ // multiple of 4; otherwise fall back to the indexed form.
+ if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0))
+ UseOffset = false;
+ break;
+ case MVT::i64:
+ Opc = PPC::LD;
+ assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) &&
+ "64-bit load with 32-bit target??");
+ // Same multiple-of-4 restriction on the displacement for LD.
+ UseOffset = ((Addr.Offset & 3) == 0);
+ break;
+ case MVT::f32:
+ Opc = PPC::LFS;
+ break;
+ case MVT::f64:
+ Opc = FP64LoadOpc;
+ break;
+ }
+
+ // If necessary, materialize the offset into a register and use
+ // the indexed form. Also handle stack pointers with special needs.
+ unsigned IndexReg = 0;
+ PPCSimplifyAddress(Addr, UseOffset, IndexReg);
+
+ // If this is a potential VSX load with an offset of 0, a VSX indexed load can
+ // be used.
+ bool IsVSSRC = isVSSRCRegClass(UseRC);
+ bool IsVSFRC = isVSFRCRegClass(UseRC);
+ bool Is32VSXLoad = IsVSSRC && Opc == PPC::LFS;
+ bool Is64VSXLoad = IsVSFRC && Opc == PPC::LFD;
+ if ((Is32VSXLoad || Is64VSXLoad) &&
+ (Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
+ (Addr.Offset == 0)) {
+ UseOffset = false;
+ }
+
+ if (ResultReg == 0)
+ ResultReg = createResultReg(UseRC);
+
+ // Note: If we still have a frame index here, we know the offset is
+ // in range, as otherwise PPCSimplifyAddress would have converted it
+ // into a RegBase.
+ if (Addr.BaseType == Address::FrameIndexBase) {
+ // VSX only provides an indexed load.
+ if (Is32VSXLoad || Is64VSXLoad) return false;
+
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI,
+ Addr.Offset),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
+ MFI.getObjectAlignment(Addr.Base.FI));
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+ // Base reg with offset in range.
+ } else if (UseOffset) {
+ // VSX only provides an indexed load.
+ if (Is32VSXLoad || Is64VSXLoad) return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+ // Indexed form.
+ } else {
+ // Get the RR opcode corresponding to the RI one. FIXME: It would be
+ // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+ // is hard to get at.
+ // NOTE(review): a caller-supplied FP64LoadOpc other than PPC::LFD has no
+ // case here and would reach the default; presumably such opcodes are
+ // already indexed forms -- confirm against the callers.
+ switch (Opc) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case PPC::LBZ: Opc = PPC::LBZX; break;
+ case PPC::LBZ8: Opc = PPC::LBZX8; break;
+ case PPC::LHZ: Opc = PPC::LHZX; break;
+ case PPC::LHZ8: Opc = PPC::LHZX8; break;
+ case PPC::LHA: Opc = PPC::LHAX; break;
+ case PPC::LHA8: Opc = PPC::LHAX8; break;
+ case PPC::LWZ: Opc = PPC::LWZX; break;
+ case PPC::LWZ8: Opc = PPC::LWZX8; break;
+ case PPC::LWA: Opc = PPC::LWAX; break;
+ case PPC::LWA_32: Opc = PPC::LWAX_32; break;
+ case PPC::LD: Opc = PPC::LDX; break;
+ case PPC::LFS: Opc = IsVSSRC ? PPC::LXSSPX : PPC::LFSX; break;
+ case PPC::LFD: Opc = IsVSFRC ? PPC::LXSDX : PPC::LFDX; break;
+ }
+
+ auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ // If we have an index register defined we use it in the load inst,
+ // otherwise we use X0 as base as it makes the vector instructions to
+ // use zero in the computation of the effective address regardless the
+ // content of the register.
+ if (IndexReg)
+ MIB.addReg(Addr.Base.Reg).addReg(IndexReg);
+ else
+ MIB.addReg(PPC::ZERO8).addReg(Addr.Base.Reg);
+ }
+
+ return true;
+}
+
+// Try to select an IR load. Gives up on atomic loads, on types that
+// isLoadTypeLegal rejects, and on addresses PPCComputeAddress cannot handle.
+bool PPCFastISel::SelectLoad(const Instruction *I) {
+  // FIXME: No atomic loads are supported.
+  const LoadInst *Load = cast<LoadInst>(I);
+  if (Load->isAtomic())
+    return false;
+
+  // The loaded type must be one we know how to load.
+  MVT LoadVT;
+  if (!isLoadTypeLegal(I->getType(), LoadVT))
+    return false;
+
+  // The pointer operand must be expressible as an Address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(0), Addr))
+    return false;
+
+  // If a register has already been assigned to this instruction, its class
+  // dictates the class of the load. This is necessary to constrain RA from
+  // using R0/X0 when this is not legal.
+  const TargetRegisterClass *RC = nullptr;
+  if (unsigned AssignedReg = FuncInfo.ValueMap[I])
+    RC = MRI.getRegClass(AssignedReg);
+
+  unsigned LoadedReg = 0;
+  if (!PPCEmitLoad(LoadVT, LoadedReg, Addr, RC))
+    return false;
+
+  updateValueMap(I, LoadedReg);
+  return true;
+}
+
+// Emit a store instruction to store SrcReg at Addr.
+// Returns false for value types that are not handled and for VSX stores that
+// would need a displacement form. Addr may be rewritten by
+// PPCSimplifyAddress.
+bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
+ assert(SrcReg && "Nothing to store!");
+ unsigned Opc;
+ bool UseOffset = true;
+
+ // The register class of the value being stored picks the opcode variant.
+ const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
+ bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+ switch (VT.SimpleTy) {
+ default: // e.g., vector types not handled
+ return false;
+ case MVT::i8:
+ Opc = Is32BitInt ? PPC::STB : PPC::STB8;
+ break;
+ case MVT::i16:
+ Opc = Is32BitInt ? PPC::STH : PPC::STH8;
+ break;
+ case MVT::i32:
+ assert(Is32BitInt && "Not GPRC for i32??");
+ Opc = PPC::STW;
+ break;
+ case MVT::i64:
+ Opc = PPC::STD;
+ // STD is only used in displacement form when the offset is a multiple
+ // of 4; otherwise the indexed form is used.
+ UseOffset = ((Addr.Offset & 3) == 0);
+ break;
+ case MVT::f32:
+ Opc = PPC::STFS;
+ break;
+ case MVT::f64:
+ Opc = PPC::STFD;
+ break;
+ }
+
+ // If necessary, materialize the offset into a register and use
+ // the indexed form. Also handle stack pointers with special needs.
+ unsigned IndexReg = 0;
+ PPCSimplifyAddress(Addr, UseOffset, IndexReg);
+
+ // If this is a potential VSX store with an offset of 0, a VSX indexed store
+ // can be used.
+ bool IsVSSRC = isVSSRCRegClass(RC);
+ bool IsVSFRC = isVSFRCRegClass(RC);
+ bool Is32VSXStore = IsVSSRC && Opc == PPC::STFS;
+ bool Is64VSXStore = IsVSFRC && Opc == PPC::STFD;
+ if ((Is32VSXStore || Is64VSXStore) &&
+ (Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
+ (Addr.Offset == 0)) {
+ UseOffset = false;
+ }
+
+ // Note: If we still have a frame index here, we know the offset is
+ // in range, as otherwise PPCSimplifyAddress would have converted it
+ // into a RegBase.
+ if (Addr.BaseType == Address::FrameIndexBase) {
+ // VSX only provides an indexed store.
+ if (Is32VSXStore || Is64VSXStore) return false;
+
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI,
+ Addr.Offset),
+ MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
+ MFI.getObjectAlignment(Addr.Base.FI));
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(SrcReg)
+ .addImm(Addr.Offset)
+ .addFrameIndex(Addr.Base.FI)
+ .addMemOperand(MMO);
+
+ // Base reg with offset in range.
+ } else if (UseOffset) {
+ // VSX only provides an indexed store.
+ if (Is32VSXStore || Is64VSXStore)
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+ // Indexed form.
+ } else {
+ // Get the RR opcode corresponding to the RI one. FIXME: It would be
+ // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+ // is hard to get at.
+ // Note: the first switch in this function never selects PPC::STW8, so
+ // that case is not reached from here.
+ switch (Opc) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case PPC::STB: Opc = PPC::STBX; break;
+ case PPC::STH : Opc = PPC::STHX; break;
+ case PPC::STW : Opc = PPC::STWX; break;
+ case PPC::STB8: Opc = PPC::STBX8; break;
+ case PPC::STH8: Opc = PPC::STHX8; break;
+ case PPC::STW8: Opc = PPC::STWX8; break;
+ case PPC::STD: Opc = PPC::STDX; break;
+ case PPC::STFS: Opc = IsVSSRC ? PPC::STXSSPX : PPC::STFSX; break;
+ case PPC::STFD: Opc = IsVSFRC ? PPC::STXSDX : PPC::STFDX; break;
+ }
+
+ auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(SrcReg);
+
+ // If we have an index register defined we use it in the store inst,
+ // otherwise we use X0 as base as it makes the vector instructions to
+ // use zero in the computation of the effective address regardless the
+ // content of the register.
+ if (IndexReg)
+ MIB.addReg(Addr.Base.Reg).addReg(IndexReg);
+ else
+ MIB.addReg(PPC::ZERO8).addReg(Addr.Base.Reg);
+ }
+
+ return true;
+}
+
+// Try to select an IR store. Operand 0 is the value being stored and
+// operand 1 is the destination pointer.
+bool PPCFastISel::SelectStore(const Instruction *I) {
+  // FIXME: No atomic stores are supported.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // The stored value's type must be one we know how to move to memory.
+  Value *StoredVal = I->getOperand(0);
+  MVT VT;
+  if (!isLoadTypeLegal(StoredVal->getType(), VT))
+    return false;
+
+  // The value has to be in a register before it can be stored.
+  const unsigned SrcReg = getRegForValue(StoredVal);
+  if (SrcReg == 0)
+    return false;
+
+  // The destination must be expressible as an Address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  return PPCEmitStore(VT, SrcReg, Addr);
+}
+
+// Try to select a two-way branch. Two shapes are handled: a condition that is
+// a constant integer, and a condition produced by a compare that is available
+// in the current block.
+bool PPCFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *Br = cast<BranchInst>(I);
+  MachineBasicBlock *BranchMBB = FuncInfo.MBB;
+  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[Br->getSuccessor(0)];
+  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[Br->getSuccessor(1)];
+  const Value *Cond = Br->getCondition();
+
+  // A constant condition becomes an unconditional branch to the taken side.
+  if (const auto *ConstCond = dyn_cast<ConstantInt>(Cond)) {
+    const uint64_t CondVal = ConstCond->getZExtValue();
+    fastEmitBranch(CondVal == 0 ? FalseMBB : TrueMBB, DbgLoc);
+    return true;
+  }
+
+  // For now, just try the simplest case where it's fed by a compare.
+  //
+  // FIXME: ARM looks for a case where the block containing the compare
+  // has been split from the block containing the branch.  If this happens,
+  // there is a vreg available containing the result of the compare.  I'm
+  // not sure we can do much, as we've lost the predicate information with
+  // the compare instruction -- we have a 4-bit CR but don't know which bit
+  // to test here.
+  const auto *Cmp = dyn_cast<CmpInst>(Cond);
+  if (!Cmp || !isValueAvailable(Cmp))
+    return false;
+
+  Optional<PPC::Predicate> MaybePred = getComparePred(Cmp->getPredicate());
+  if (!MaybePred)
+    return false;
+  PPC::Predicate BranchPred = MaybePred.getValue();
+
+  // If the true successor is next in layout, branch on the inverted
+  // predicate to the false successor instead and fall through.
+  if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+    std::swap(TrueMBB, FalseMBB);
+    BranchPred = PPC::InvertPredicate(BranchPred);
+  }
+
+  const unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
+  if (!PPCEmitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned(),
+                  CondReg))
+    return false;
+
+  BuildMI(*BranchMBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
+    .addImm(BranchPred).addReg(CondReg).addMBB(TrueMBB);
+  finishCondBranch(Br->getParent(), TrueMBB, FalseMBB);
+  return true;
+}
+
+// Attempt to emit a compare of the two source values.  Signed and unsigned
+// comparisons are supported.  Return false if we can't handle it.
+//
+// IsZExt selects the unsigned (logical) compare opcodes and zero-extension
+// of sub-word operands; otherwise the signed forms and sign-extension are
+// used. The compare result is written to DestReg.
+bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
+                             bool IsZExt, unsigned DestReg) {
+  Type *Ty = SrcValue1->getType();
+  EVT SrcEVT = TLI.getValueType(DL, Ty, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits())
+    return false;
+
+  // See if operand 2 is an immediate encodeable in the compare.
+  // FIXME: Operands are not in canonical order at -O0, so an immediate
+  // operand in position 1 is a lost opportunity for now.  We are
+  // similar to ARM in this regard.
+  //
+  // The immediate is kept in a fixed-width 64-bit integer. With a host `long`
+  // of only 32 bits, a 64-bit constant would be truncated before the range
+  // check below and could wrongly be accepted as a 16-bit immediate.
+  int64_t Imm = 0;
+  bool UseImm = false;
+
+  // Only 16-bit integer constants can be represented in compares for
+  // PowerPC.  Others will be materialized into a register.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) {
+    if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
+        SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = IsZExt ? static_cast<int64_t>(CIVal.getZExtValue())
+                   : CIVal.getSExtValue();
+      if ((IsZExt && isUInt<16>(Imm)) || (!IsZExt && isInt<16>(Imm)))
+        UseImm = true;
+    }
+  }
+
+  unsigned CmpOpc;
+  bool NeedsExt = false;
+  switch (SrcVT.SimpleTy) {
+    default: return false;
+    case MVT::f32:
+      CmpOpc = PPC::FCMPUS;
+      break;
+    case MVT::f64:
+      CmpOpc = PPC::FCMPUD;
+      break;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      // Sub-word operands are widened to i32 and compared as words.
+      NeedsExt = true;
+      // Intentional fall-through.
+    case MVT::i32:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLWI : PPC::CMPWI;
+      break;
+    case MVT::i64:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLD : PPC::CMPD;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLDI : PPC::CMPDI;
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(SrcValue1);
+  if (SrcReg1 == 0)
+    return false;
+
+  // The second operand only needs a register when it is not an immediate.
+  unsigned SrcReg2 = 0;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(SrcValue2);
+    if (SrcReg2 == 0)
+      return false;
+  }
+
+  if (NeedsExt) {
+    unsigned ExtReg1 = createResultReg(&PPC::GPRCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg1, IsZExt))
+      return false;
+    SrcReg1 = ExtReg1;
+
+    if (!UseImm) {
+      unsigned ExtReg2 = createResultReg(&PPC::GPRCRegClass);
+      if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg2, IsZExt))
+        return false;
+      SrcReg2 = ExtReg2;
+    }
+  }
+
+  if (!UseImm)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addReg(SrcReg2);
+  else
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addImm(Imm);
+
+  return true;
+}
+
+// Try to select a floating-point extend. Only f32 -> f64 is handled, and it
+// emits no instruction: the result is mapped to the source register.
+bool PPCFastISel::SelectFPExt(const Instruction *I) {
+  Value *Operand = I->getOperand(0);
+  EVT FromVT = TLI.getValueType(DL, Operand->getType(), true);
+  EVT ToVT = TLI.getValueType(DL, I->getType(), true);
+
+  const bool IsF32ToF64 = FromVT == MVT::f32 && ToVT == MVT::f64;
+  if (!IsF32ToF64)
+    return false;
+
+  // No code is generated for a FP extend; reuse the operand's register.
+  if (unsigned OperandReg = getRegForValue(Operand)) {
+    updateValueMap(I, OperandReg);
+    return true;
+  }
+  return false;
+}
+
+// Try to select a floating-point truncate. Only f64 -> f32 is handled, by
+// emitting an FRSP into a fresh F4RC register.
+bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
+  Value *Operand = I->getOperand(0);
+  EVT FromVT = TLI.getValueType(DL, Operand->getType(), true);
+  EVT ToVT = TLI.getValueType(DL, I->getType(), true);
+
+  const bool IsF64ToF32 = FromVT == MVT::f64 && ToVT == MVT::f32;
+  if (!IsF64ToF32)
+    return false;
+
+  const unsigned OperandReg = getRegForValue(Operand);
+  if (OperandReg == 0)
+    return false;
+
+  // Round the result to single precision.
+  const unsigned RoundedReg = createResultReg(&PPC::F4RCRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP),
+          RoundedReg)
+    .addReg(OperandReg);
+
+  updateValueMap(I, RoundedReg);
+  return true;
+}
+
+// Move an i32 or i64 value in a GPR to an f64 value in an FPR.
+// The value is stored to a fresh 8-byte stack slot and loaded back into an
+// F8RC register. Returns the result register, or 0 on failure.
+// FIXME: When direct register moves are implemented (see PowerISA 2.07),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+// FIXME: The code here is sloppy for the 4-byte case. Can use a 4-byte
+// stack slot and 4-byte store/load sequence. Or just sext the 4-byte
+// case to 8 bytes which produces tighter code but wastes stack space.
+unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
+ bool IsSigned) {
+
+ // If necessary, extend 32-bit int to 64-bit.
+ // The extension is a zero-extension for unsigned values and a
+ // sign-extension for signed ones.
+ if (SrcVT == MVT::i32) {
+ unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+ if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned))
+ return 0;
+ SrcReg = TmpReg;
+ }
+
+ // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+ Address Addr;
+ Addr.BaseType = Address::FrameIndexBase;
+ Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+ // Store the value from the GPR.
+ // The store is always a full 64-bit store at offset 0 of the slot.
+ if (!PPCEmitStore(MVT::i64, SrcReg, Addr))
+ return 0;
+
+ // Load the integer value into an FPR. The kind of load used depends
+ // on a number of conditions.
+ unsigned LoadOpc = PPC::LFD;
+
+ // For a 32-bit source, a word load may be used instead of LFD. The word is
+ // read at offset 0 of the slot on little-endian subtargets and at offset 4
+ // otherwise. A signed source on a subtarget without LFIWAX keeps the
+ // default LFD of the whole sign-extended slot.
+ if (SrcVT == MVT::i32) {
+ if (!IsSigned) {
+ LoadOpc = PPC::LFIWZX;
+ Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
+ } else if (PPCSubTarget->hasLFIWAX()) {
+ LoadOpc = PPC::LFIWAX;
+ Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
+ }
+ }
+
+ const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+ unsigned ResultReg = 0;
+ if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc))
+ return 0;
+
+ return ResultReg;
+}
+
+// Attempt to fast-select an integer-to-floating-point conversion.
+// Accepts i8/i16/i32/i64 sources and f32/f64 results.  IsSigned selects
+// between the signed and unsigned conversion.  Returns true when the
+// instruction was handled here.
+// FIXME: Once fast-isel has better support for VSX, conversions using
+// direct moves should be implemented.
+bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
+  MVT DstVT;
+  if (!isTypeLegal(I->getType(), DstVT))
+    return false;
+
+  const bool ToSingle = (DstVT == MVT::f32);
+  if (!ToSingle && DstVT != MVT::f64)
+    return false;
+
+  Value *Operand = I->getOperand(0);
+  EVT OperandEVT = TLI.getValueType(DL, Operand->getType(), true);
+  if (!OperandEVT.isSimple())
+    return false;
+
+  MVT SrcVT = OperandEVT.getSimpleVT();
+  const bool IsNarrow = (SrcVT == MVT::i8 || SrcVT == MVT::i16);
+  if (!IsNarrow && SrcVT != MVT::i32 && SrcVT != MVT::i64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Operand);
+  if (!SrcReg)
+    return false;
+
+  // Two situations need the newer floating-point conversion operations:
+  //  - any unsigned convert, and
+  //  - any convert to single precision.
+  // FIXME: For now we require those operations (which are present only on
+  // P7 and A2 server models) when converting to single-precision float.
+  // Otherwise we have to generate a lot of fiddly code to avoid double
+  // rounding.  If necessary, the fiddly code can be found in
+  // PPCTargetLowering::LowerINT_TO_FP().
+  if ((!IsSigned || ToSingle) && !PPCSubTarget->hasFPCVT())
+    return false;
+
+  // Widen i8/i16 inputs to i64 before the move.
+  if (IsNarrow) {
+    unsigned WideReg = createResultReg(&PPC::G8RCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, WideReg, !IsSigned))
+      return false;
+    SrcVT = MVT::i64;
+    SrcReg = WideReg;
+  }
+
+  // Move the integer value to an FPR.
+  unsigned FPReg = PPCMoveToFPReg(SrcVT, SrcReg, IsSigned);
+  if (!FPReg)
+    return false;
+
+  // Choose the conversion opcode from the signedness and result width.
+  unsigned DestReg = createResultReg(&PPC::F8RCRegClass);
+  unsigned Opc;
+  if (IsSigned)
+    Opc = ToSingle ? PPC::FCFIDS : PPC::FCFID;
+  else
+    Opc = ToSingle ? PPC::FCFIDUS : PPC::FCFIDU;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+      .addReg(FPReg);
+
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+// Move the floating-point value in SrcReg into an integer destination
+// register, and return the register (or zero if we can't handle it).
+// The value is stored to an 8-byte stack slot and reloaded as type VT.
+// FIXME: When direct register moves are implemented (see PowerISA 2.07),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
+                                      unsigned SrcReg, bool IsSigned) {
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  // Note that if we have STFIWX available, we could use a 4-byte stack
+  // slot for i32, but this being fast-isel we'll just go with the
+  // easiest code gen possible.
+  Address Slot;
+  Slot.BaseType = Address::FrameIndexBase;
+  Slot.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Spill the FPR value into the slot.
+  if (!PPCEmitStore(MVT::f64, SrcReg, Slot))
+    return 0;
+
+  // Reload it into a GPR.  If we want an i32 on big endian, modify the
+  // address to have a 4-byte offset so we load from the right place.
+  if (VT == MVT::i32)
+    Slot.Offset = PPCSubTarget->isLittleEndian() ? 0 : 4;
+
+  // If this instruction already has a register assigned, reuse its
+  // register class for the reload; otherwise pass no class.
+  const TargetRegisterClass *RC = nullptr;
+  if (unsigned AssignedReg = FuncInfo.ValueMap[I])
+    RC = MRI.getRegClass(AssignedReg);
+
+  unsigned IntReg = 0;
+  if (!PPCEmitLoad(VT, IntReg, Slot, RC, !IsSigned))
+    return 0;
+
+  return IntReg;
+}
+
+// Attempt to fast-select a floating-point-to-integer conversion.
+// Accepts f32/f64 sources and i32/i64 results.  IsSigned selects between
+// the signed and unsigned conversion.  Returns true when the instruction
+// was handled here.
+// FIXME: Once fast-isel has better support for VSX, conversions using
+// direct moves should be implemented.
+bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
+  MVT DstVT, SrcVT;
+  if (!isTypeLegal(I->getType(), DstVT))
+    return false;
+
+  const bool ToWord = (DstVT == MVT::i32);
+  if (!ToWord && DstVT != MVT::i64)
+    return false;
+
+  // If we don't have FCTIDUZ and we need it (unsigned 64-bit result),
+  // punt to SelectionDAG.
+  if (!ToWord && !IsSigned && !PPCSubTarget->hasFPCVT())
+    return false;
+
+  Value *Operand = I->getOperand(0);
+  if (!isTypeLegal(Operand->getType(), SrcVT))
+    return false;
+
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Operand);
+  if (!SrcReg)
+    return false;
+
+  // Convert f32 to f64 if necessary.  This is just a meaningless copy
+  // to get the register class right.
+  if (MRI.getRegClass(SrcReg) == &PPC::F4RCRegClass) {
+    unsigned WideReg = createResultReg(&PPC::F8RCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), WideReg)
+        .addReg(SrcReg);
+    SrcReg = WideReg;
+  }
+
+  // The conversion itself takes place entirely within FPRs.
+  unsigned ConvReg = createResultReg(&PPC::F8RCRegClass);
+
+  // Pick the opcode from the result width and signedness.  An unsigned
+  // 32-bit result uses FCTIWUZ when available and FCTIDZ otherwise.
+  unsigned Opc;
+  if (!ToWord)
+    Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+  else if (IsSigned)
+    Opc = PPC::FCTIWZ;
+  else
+    Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ConvReg)
+      .addReg(SrcReg);
+
+  // Now move the integer value from a float register to an integer register.
+  unsigned IntReg = PPCMoveToIntReg(I, DstVT, ConvReg, IsSigned);
+  if (!IntReg)
+    return false;
+
+  updateValueMap(I, IntReg);
+  return true;
+}
+
+// Attempt to fast-select a binary integer operation that isn't already
+// handled automatically.
+//
+// Only i8 and i16 results are handled, and only for ISD::ADD, ISD::OR
+// and ISD::SUB.  A constant right-hand operand that fits in a signed
+// 16-bit field is folded into an immediate-form instruction.  Returns
+// true when the instruction was handled here.
+bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  const EVT ResultVT = TLI.getValueType(DL, I->getType(), true);
+
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  const bool IsSmallInt = ResultVT == MVT::i16 || ResultVT == MVT::i8;
+  if (!IsSmallInt)
+    return false;
+
+  // Use the class of the register currently assigned to this instruction.
+  // If there is no register, make a conservative choice (don't assign R0).
+  const TargetRegisterClass *RC = &PPC::GPRC_and_GPRC_NOR0RegClass;
+  if (unsigned AssignedReg = FuncInfo.ValueMap[I])
+    RC = MRI.getRegClass(AssignedReg);
+  const bool Is32Bit = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  // Register-register opcode for the requested operation.
+  unsigned Opc;
+  switch (ISDOpcode) {
+  case ISD::ADD:
+    Opc = Is32Bit ? PPC::ADD4 : PPC::ADD8;
+    break;
+  case ISD::OR:
+    Opc = Is32Bit ? PPC::OR : PPC::OR8;
+    break;
+  case ISD::SUB:
+    Opc = Is32Bit ? PPC::SUBF : PPC::SUBF8;
+    break;
+  default:
+    return false;
+  }
+
+  unsigned ResultReg = createResultReg(RC);
+  unsigned LHSReg = getRegForValue(I->getOperand(0));
+  if (!LHSReg)
+    return false;
+
+  // Handle case of small immediate operand.
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+    int Imm = static_cast<int>(CI->getValue().getSExtValue());
+    if (isInt<16>(Imm)) {
+      // Subtraction is emitted as an add of the negated immediate.  That
+      // is skipped for -32768, which then takes the reg-reg path below.
+      bool UseImm = true;
+      switch (Opc) {
+      default:
+        llvm_unreachable("Missing case!");
+      case PPC::ADD4:
+        Opc = PPC::ADDI;
+        MRI.setRegClass(LHSReg, &PPC::GPRC_and_GPRC_NOR0RegClass);
+        break;
+      case PPC::ADD8:
+        Opc = PPC::ADDI8;
+        MRI.setRegClass(LHSReg, &PPC::G8RC_and_G8RC_NOX0RegClass);
+        break;
+      case PPC::OR:
+        Opc = PPC::ORI;
+        break;
+      case PPC::OR8:
+        Opc = PPC::ORI8;
+        break;
+      case PPC::SUBF:
+        UseImm = (Imm != -32768);
+        if (UseImm) {
+          Opc = PPC::ADDI;
+          MRI.setRegClass(LHSReg, &PPC::GPRC_and_GPRC_NOR0RegClass);
+          Imm = -Imm;
+        }
+        break;
+      case PPC::SUBF8:
+        UseImm = (Imm != -32768);
+        if (UseImm) {
+          Opc = PPC::ADDI8;
+          MRI.setRegClass(LHSReg, &PPC::G8RC_and_G8RC_NOX0RegClass);
+          Imm = -Imm;
+        }
+        break;
+      }
+
+      if (UseImm) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                ResultReg)
+            .addReg(LHSReg)
+            .addImm(Imm);
+        updateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+  }
+
+  // Reg-reg case.
+  unsigned RHSReg = getRegForValue(I->getOperand(1));
+  if (!RHSReg)
+    return false;
+
+  // Reverse operands for subtract-from.
+  if (ISDOpcode == ISD::SUB)
+    std::swap(LHSReg, RHSReg);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(LHSReg)
+      .addReg(RHSReg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Handle arguments to a call that we're attempting to fast-select.
+// Return false if the arguments are too complex for us at the moment.
+//
+// On success, CALLSEQ_START has been issued, every argument has been
+// copied into its physical register (each one appended to RegArgs), and
+// NumBytes holds the stack size reserved for the call.
+bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool IsVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, *Context);
+
+  // Reserve space for the linkage area on the stack.
+  unsigned LinkageSize = PPCSubTarget->getFrameLowering()->getLinkageSize();
+  CCInfo.AllocateStack(LinkageSize, 8);
+
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+
+  // Bail out if we can't handle any of the arguments.
+  for (const CCValAssign &VA : ArgLocs) {
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Skip vector arguments for now, as well as long double and
+    // uint128_t, and i1 values.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 || ArgVT == MVT::i1)
+      return false;
+
+    // Skip anything that isn't passed in a register.
+    if (!VA.isRegLoc() || VA.needsCustom())
+      return false;
+
+    // Skip bit-converted arguments for now.
+    if (VA.getLocInfo() == CCValAssign::BCvt)
+      return false;
+  }
+
+  // Get a count of how many bytes are to be pushed onto the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if its varargs.
+  // Because we cannot tell if this is needed on the caller side, we have to
+  // conservatively assume that it is needed.  As such, make sure we have at
+  // least enough stack space for the caller to store the 8 GPRs.
+  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+  NumBytes = std::max(NumBytes, LinkageSize + 64);
+
+  // Issue CALLSEQ_START.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TII.getCallFrameSetupOpcode()))
+      .addImm(NumBytes);
+
+  // Prepare to assign register arguments.  Every argument uses up a
+  // GPR protocol register even if it's passed in a floating-point
+  // register (unless we're using the fast calling convention).
+  unsigned NextGPR = PPC::X3;
+  unsigned NextFPR = PPC::F1;
+
+  // Process arguments.
+  for (CCValAssign &VA : ArgLocs) {
+    unsigned ValueReg = ArgRegs[VA.getValNo()];
+    MVT ValueVT = ArgVTs[VA.getValNo()];
+
+    // Handle argument promotion and bitcasts.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+    case CCValAssign::AExt:
+    case CCValAssign::ZExt: {
+      // SExt sign-extends; AExt and ZExt both zero-extend.
+      const bool IsZExt = VA.getLocInfo() != CCValAssign::SExt;
+      MVT LocVT = VA.getLocVT();
+      const TargetRegisterClass *RC =
+          (LocVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+      unsigned ExtReg = createResultReg(RC);
+      if (!PPCEmitIntExt(ValueVT, ValueReg, LocVT, ExtReg, IsZExt))
+        llvm_unreachable(IsZExt ? "Failed to emit a zext!"
+                                : "Failed to emit a sext!");
+      ValueVT = LocVT;
+      ValueReg = ExtReg;
+      break;
+    }
+    case CCValAssign::BCvt: {
+      // FIXME: Not yet handled.
+      llvm_unreachable("Should have bailed before getting here!");
+      break;
+    }
+    }
+
+    // Pick the physical register for this argument.
+    const bool IsFPArg = (ValueVT == MVT::f32 || ValueVT == MVT::f64);
+    unsigned PhysReg;
+    if (IsFPArg) {
+      PhysReg = NextFPR++;
+      if (CC != CallingConv::Fast)
+        ++NextGPR;
+    } else {
+      PhysReg = NextGPR++;
+    }
+
+    // Copy this argument to the appropriate register.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), PhysReg)
+        .addReg(ValueReg);
+    RegArgs.push_back(PhysReg);
+  }
+
+  return true;
+}
+
+// For a call that we've determined we can fast-select, finish the
+// call sequence and generate a copy to obtain the return value (if any).
+//
+// RetVT is the return type of the call (MVT::isVoid for none), CLI is the
+// call being lowered and receives the result register, and NumBytes is
+// the stack size passed to CALLSEQ_END.  Always returns true.
+bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+
+  // Issue CALLSEQ_END.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TII.getCallFrameDestroyOpcode()))
+      .addImm(NumBytes).addImm(0);
+
+  // Next, generate a copy to obtain the return value.
+  // FIXME: No multi-register return values yet, though I don't foresee
+  // any real difficulties there.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+
+    // Check the number of locations before indexing into RVLocs.
+    assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
+    CCValAssign &VA = RVLocs[0];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    MVT DestVT = VA.getValVT();
+    MVT CopyVT = DestVT;
+
+    // Ints smaller than a register still arrive in a full 64-bit
+    // register, so make sure we recognize this.
+    const bool IsSubRegInt =
+        RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32;
+    if (IsSubRegInt)
+      CopyVT = MVT::i64;
+
+    unsigned SourcePhysReg = VA.getLocReg();
+    unsigned ResultReg = 0;
+
+    if (RetVT == CopyVT) {
+      // The value already has the right type: a plain copy suffices.
+      const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
+      ResultReg = createResultReg(CpyRC);
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+          .addReg(SourcePhysReg);
+
+    // If necessary, round the floating result to single precision.
+    } else if (CopyVT == MVT::f64) {
+      ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP),
+              ResultReg).addReg(SourcePhysReg);
+
+    // If only the low half of a general register is needed, generate
+    // a GPRC copy instead of a G8RC copy.  (EXTRACT_SUBREG can't be
+    // used along the fast-isel path (not lowered), and downstream logic
+    // also doesn't like a direct subreg copy on a physical reg.)
+    } else if (IsSubRegInt) {
+      ResultReg = createResultReg(&PPC::GPRCRegClass);
+      // Convert physical register from G8RC to GPRC.
+      SourcePhysReg -= PPC::X0 - PPC::R0;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+          .addReg(SourcePhysReg);
+    }
+
+    assert(ResultReg && "ResultReg unset!");
+    CLI.InRegs.push_back(SourcePhysReg);
+    CLI.ResultReg = ResultReg;
+    CLI.NumResultRegs = 1;
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a call described by CLI.
+//
+// Handles direct, non-tail, non-vararg calls to a GlobalValue with at
+// most 8 register-passed arguments, plus patchpoints (for which a NOP
+// placeholder is emitted).  Returns false when the call has to be left
+// to SelectionDAG.
+bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  CallingConv::ID CC = CLI.CallConv;
+  bool IsTailCall = CLI.IsTailCall;
+  bool IsVarArg = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+  const MCSymbol *Symbol = CLI.Symbol;
+
+  if (!Callee && !Symbol)
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (IsTailCall)
+    return false;
+
+  // Let SDISel handle vararg functions.
+  if (IsVarArg)
+    return false;
+
+  // Handle simple calls for now, with legal return types and
+  // those that can be extended.
+  Type *RetTy = CLI.RetTy;
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8)
+    return false;
+  else if (RetVT == MVT::i1 && PPCSubTarget->useCRBits())
+    // We can't handle boolean returns when CR bits are in use.
+    return false;
+
+  // FIXME: No multi-register return values yet.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 &&
+      RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
+      RetVT != MVT::f64) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    if (RVLocs.size() > 1)
+      return false;
+  }
+
+  // Bail early if more than 8 arguments, as we only currently
+  // handle arguments passed in registers.
+  unsigned NumArgs = CLI.OutVals.size();
+  if (NumArgs > 8)
+    return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+
+  Args.reserve(NumArgs);
+  ArgRegs.reserve(NumArgs);
+  ArgVTs.reserve(NumArgs);
+  ArgFlags.reserve(NumArgs);
+
+  for (unsigned i = 0, ie = NumArgs; i != ie; ++i) {
+    // Only handle easy calls for now.  It would be reasonably easy
+    // to handle <= 8-byte structures passed ByVal in registers, but we
+    // have to ensure they are right-justified in the register.
+    ISD::ArgFlagsTy Flags = CLI.OutFlags[i];
+    if (Flags.isInReg() || Flags.isSRet() || Flags.isNest() || Flags.isByVal())
+      return false;
+
+    Value *ArgValue = CLI.OutVals[i];
+    Type *ArgTy = ArgValue->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
+      return false;
+
+    if (ArgVT.isVector())
+      return false;
+
+    unsigned Arg = getRegForValue(ArgValue);
+    if (Arg == 0)
+      return false;
+
+    Args.push_back(ArgValue);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Process the arguments.
+  SmallVector<unsigned, 8> RegArgs;
+  unsigned NumBytes;
+
+  if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, IsVarArg))
+    return false;
+
+  MachineInstrBuilder MIB;
+  // FIXME: No handling for function pointers yet.  This requires
+  // implementing the function descriptor (OPD) setup.
+  // Callee may be null when the call is described only by Symbol, so the
+  // null-tolerant cast is required here; such calls take the !GV path.
+  const GlobalValue *GV = dyn_cast_or_null<GlobalValue>(Callee);
+  if (!GV) {
+    // patchpoints are a special case; they always dispatch to a pointer value.
+    // However, we don't actually want to generate the indirect call sequence
+    // here (that will be generated, as necessary, during asm printing), and
+    // the call we generate here will be erased by FastISel::selectPatchpoint,
+    // so don't try very hard...
+    if (CLI.IsPatchPoint)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::NOP));
+    else
+      return false;
+  } else {
+    // Build direct call with NOP for TOC restore.
+    // FIXME: We can and should optimize away the NOP for local calls.
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                  TII.get(PPC::BL8_NOP));
+    // Add callee.
+    MIB.addGlobalAddress(GV);
+  }
+
+  // Add implicit physical register uses to the call.
+  for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
+    MIB.addReg(RegArgs[II], RegState::Implicit);
+
+  // Direct calls, in both the ELF V1 and V2 ABIs, need the TOC register live
+  // into the call.
+  PPCFuncInfo->setUsesTOCBasePtr();
+  MIB.addReg(PPC::X2, RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.  Proper
+  // defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+  CLI.Call = MIB;
+
+  // Finish off the call including any return values.
+  return finishCall(RetVT, CLI, NumBytes);
+}
+
+// Attempt to fast-select a return instruction.
+//
+// At most one return-value register is supported.  A constant integer
+// return value is materialized as an i64; any other value is extended to
+// the location type when needed and copied into the return register.
+// Returns true when the return was emitted here.
+bool PPCFastISel::SelectRet(const Instruction *I) {
+  if (!FuncInfo.CanLowerReturn || TLI.supportSplitCSR(FuncInfo.MF))
+    return false;
+
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+  CallingConv::ID CC = F.getCallingConv();
+
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, *Context);
+    CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS);
+    const Value *RetVal = Ret->getOperand(0);
+
+    // FIXME: Only one output register for now.
+    if (ValLocs.size() > 1)
+      return false;
+
+    // Special case for returning a constant integer of any size - materialize
+    // the constant as an i64 and copy it to the return register.
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(RetVal)) {
+      CCValAssign &VA = ValLocs[0];
+      unsigned RetReg = VA.getLocReg();
+
+      // We still need to worry about properly extending the sign.  For example,
+      // we could have only a single bit or a constant that needs zero
+      // extension rather than sign extension.  Make sure we pass the return
+      // value extension property to integer materialization.
+      const bool UseSExt = VA.getLocInfo() != CCValAssign::ZExt;
+      unsigned ConstReg = PPCMaterializeInt(CI, MVT::i64, UseSExt);
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), RetReg)
+          .addReg(ConstReg);
+
+      RetRegs.push_back(RetReg);
+
+    } else {
+      unsigned ValueReg = getRegForValue(RetVal);
+      if (!ValueReg)
+        return false;
+
+      // Copy the result values into the output registers.
+      for (CCValAssign &VA : ValLocs) {
+        assert(VA.isRegLoc() && "Can only return in registers!");
+        const unsigned RetReg = VA.getLocReg();
+        RetRegs.push_back(RetReg);
+        unsigned SrcReg = ValueReg + VA.getValNo();
+
+        EVT ValueEVT = TLI.getValueType(DL, RetVal->getType());
+        if (!ValueEVT.isSimple())
+          return false;
+        MVT ValueVT = ValueEVT.getSimpleVT();
+        MVT LocVT = VA.getLocVT();
+        const bool NeedsExt = (ValueVT != LocVT);
+
+        // A type mismatch is only acceptable for small integers, which
+        // are extended to the location type below.
+        if (NeedsExt && ValueVT != MVT::i8 && ValueVT != MVT::i16 &&
+            ValueVT != MVT::i32)
+          return false;
+
+        if (NeedsExt) {
+          switch (VA.getLocInfo()) {
+          default:
+            llvm_unreachable("Unknown loc info!");
+          case CCValAssign::Full:
+            llvm_unreachable("Full value assign but types don't match?");
+          case CCValAssign::AExt:
+          case CCValAssign::ZExt:
+          case CCValAssign::SExt: {
+            // SExt sign-extends; AExt and ZExt both zero-extend.
+            const bool IsZExt = VA.getLocInfo() != CCValAssign::SExt;
+            const TargetRegisterClass *RC =
+                (LocVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+            unsigned ExtReg = createResultReg(RC);
+            if (!PPCEmitIntExt(ValueVT, SrcReg, LocVT, ExtReg, IsZExt))
+              return false;
+            SrcReg = ExtReg;
+            break;
+          }
+          }
+        }
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(TargetOpcode::COPY), RetReg)
+            .addReg(SrcReg);
+      }
+    }
+  }
+
+  // Emit the return and mark every return register as implicitly used.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(PPC::BLR8));
+
+  for (unsigned RetReg : RetRegs)
+    MIB.addReg(RetReg, RegState::Implicit);
+
+  return true;
+}
+
+// Attempt to emit an integer extend of SrcReg into DestReg.  Both
+// signed and zero extensions are supported.  Return false if we
+// can't handle it.
+//
+// SrcVT must be i8, i16 or i32 and DestVT must be i32 or i64.  IsZExt
+// selects zero extension; otherwise the value is sign-extended.
+bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                unsigned DestReg, bool IsZExt) {
+  const bool To32 = (DestVT == MVT::i32);
+  if (!To32 && DestVT != MVT::i64)
+    return false;
+
+  const bool FromByte = (SrcVT == MVT::i8);
+  const bool FromHalf = (SrcVT == MVT::i16);
+  if (!FromByte && !FromHalf && SrcVT != MVT::i32)
+    return false;
+
+  // Signed extensions use EXTSB, EXTSH, EXTSW.
+  if (!IsZExt) {
+    unsigned Opc;
+    if (FromByte) {
+      Opc = To32 ? PPC::EXTSB : PPC::EXTSB8_32_64;
+    } else if (FromHalf) {
+      Opc = To32 ? PPC::EXTSH : PPC::EXTSH8_32_64;
+    } else {
+      assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??");
+      Opc = PPC::EXTSW_32_64;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+        .addReg(SrcReg);
+    return true;
+  }
+
+  // Unsigned 32-bit extensions use RLWINM.
+  if (To32) {
+    unsigned MaskBegin;
+    if (FromByte) {
+      MaskBegin = 24;
+    } else {
+      assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??");
+      MaskBegin = 16;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLWINM),
+            DestReg)
+        .addReg(SrcReg)
+        .addImm(/*SH=*/0)
+        .addImm(MaskBegin)
+        .addImm(/*ME=*/31);
+    return true;
+  }
+
+  // Unsigned 64-bit extensions use RLDICL (with a 32-bit source).
+  const unsigned MaskBegin = FromByte ? 56 : (FromHalf ? 48 : 32);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(PPC::RLDICL_32_64), DestReg)
+      .addReg(SrcReg)
+      .addImm(/*SH=*/0)
+      .addImm(MaskBegin);
+  return true;
+}
+
+// Attempt to fast-select an indirect branch instruction.
+//
+// The target address is moved into CTR (MTCTR8) and the branch is done
+// with BCTR8.  Every IR successor is added as a successor of the current
+// machine basic block.  Returns true when the branch was emitted.
+bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
+  const unsigned TargetReg = getRegForValue(I->getOperand(0));
+  if (!TargetReg)
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::MTCTR8))
+      .addReg(TargetReg);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8));
+
+  const IndirectBrInst *Branch = cast<IndirectBrInst>(I);
+  for (const BasicBlock *Succ : Branch->successors())
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]);
+
+  return true;
+}
+
+// Attempt to fast-select an integer truncate instruction.
+//
+// Accepts i64/i32/i16 sources and i32/i16/i8 results.  Returns true when
+// the instruction was handled here.
+bool PPCFastISel::SelectTrunc(const Instruction *I) {
+  Value *Operand = I->getOperand(0);
+  const EVT FromVT = TLI.getValueType(DL, Operand->getType(), true);
+  const EVT ToVT = TLI.getValueType(DL, I->getType(), true);
+
+  const bool FromOK =
+      FromVT == MVT::i64 || FromVT == MVT::i32 || FromVT == MVT::i16;
+  if (!FromOK)
+    return false;
+
+  const bool ToOK = ToVT == MVT::i32 || ToVT == MVT::i16 || ToVT == MVT::i8;
+  if (!ToOK)
+    return false;
+
+  unsigned ValueReg = getRegForValue(Operand);
+  if (ValueReg == 0)
+    return false;
+
+  // The only interesting case is when we need to switch register classes:
+  // a 64-bit source is copied to a 32-bit register through its sub_32
+  // subregister.  Otherwise the source register is reused as is.
+  if (FromVT == MVT::i64) {
+    unsigned LowReg = createResultReg(&PPC::GPRCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), LowReg)
+        .addReg(ValueReg, 0, PPC::sub_32);
+    ValueReg = LowReg;
+  }
+
+  updateValueMap(I, ValueReg);
+  return true;
+}
+
+// Attempt to fast-select an integer extend instruction.
+//
+// Handles both ZExt and SExt; the actual extension is emitted by
+// PPCEmitIntExt.  Returns true when the instruction was handled here.
+bool PPCFastISel::SelectIntExt(const Instruction *I) {
+  Value *Operand = I->getOperand(0);
+  const bool IsZExt = isa<ZExtInst>(I);
+
+  unsigned OperandReg = getRegForValue(Operand);
+  if (OperandReg == 0)
+    return false;
+
+  const EVT FromEVT = TLI.getValueType(DL, Operand->getType(), true);
+  const EVT ToEVT = TLI.getValueType(DL, I->getType(), true);
+  if (!FromEVT.isSimple() || !ToEVT.isSimple())
+    return false;
+
+  MVT FromVT = FromEVT.getSimpleVT();
+  MVT ToVT = ToEVT.getSimpleVT();
+
+  // If we know the register class needed for the result of this
+  // instruction, use it.  Otherwise pick the register class of the
+  // correct size that does not contain X0/R0, since we don't know
+  // whether downstream uses permit that assignment.
+  const TargetRegisterClass *RC;
+  if (unsigned AssignedReg = FuncInfo.ValueMap[I])
+    RC = MRI.getRegClass(AssignedReg);
+  else if (ToVT == MVT::i64)
+    RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+  else
+    RC = &PPC::GPRC_and_GPRC_NOR0RegClass;
+
+  unsigned ExtReg = createResultReg(RC);
+  if (!PPCEmitIntExt(FromVT, OperandReg, ToVT, ExtReg, IsZExt))
+    return false;
+
+  updateValueMap(I, ExtReg);
+  return true;
+}
+
+// Attempt to fast-select an instruction that wasn't handled by
+// the table-generated machinery.
+//
+// Dispatches on the IR opcode to the matching Select* routine and
+// returns its result; opcodes without a handler return false.
+bool PPCFastISel::fastSelectInstruction(const Instruction *I) {
+  const unsigned Opcode = I->getOpcode();
+
+  switch (Opcode) {
+  // Memory and control flow.
+  case Instruction::Load:
+    return SelectLoad(I);
+  case Instruction::Store:
+    return SelectStore(I);
+  case Instruction::Br:
+    return SelectBranch(I);
+  case Instruction::IndirectBr:
+    return SelectIndirectBr(I);
+  case Instruction::Call:
+    return selectCall(I);
+  case Instruction::Ret:
+    return SelectRet(I);
+
+  // Floating-point width changes.
+  case Instruction::FPExt:
+    return SelectFPExt(I);
+  case Instruction::FPTrunc:
+    return SelectFPTrunc(I);
+
+  // Integer <-> floating-point conversions.
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+    return SelectIToFP(I, /*IsSigned*/ Opcode == Instruction::SIToFP);
+  case Instruction::FPToSI:
+  case Instruction::FPToUI:
+    return SelectFPToI(I, /*IsSigned*/ Opcode == Instruction::FPToSI);
+
+  // Integer arithmetic on types the automated selector doesn't handle.
+  case Instruction::Add:
+    return SelectBinaryIntOp(I, ISD::ADD);
+  case Instruction::Or:
+    return SelectBinaryIntOp(I, ISD::OR);
+  case Instruction::Sub:
+    return SelectBinaryIntOp(I, ISD::SUB);
+
+  // Integer width changes.
+  case Instruction::Trunc:
+    return SelectTrunc(I);
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return SelectIntExt(I);
+
+  // Here add other flavors of Instruction::XXX that automated
+  // cases don't catch.  For example, switches are terminators
+  // that aren't yet handled.
+  default:
+    break;
+  }
+
+  return false;
+}
+
+// Materialize a floating-point constant into a register, and return
+// the register number (or zero if we failed to handle it).
+//
+// The constant is placed in the constant pool and loaded with LFS (f32)
+// or LFD (f64); how the pool address is formed depends on the code model.
+unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
+  // No plans to handle long double here.
+  const bool IsSingle = (VT == MVT::f32);
+  if (!IsSingle && VT != MVT::f64)
+    return 0;
+
+  // All FP constants are loaded from the constant pool.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  assert(Align > 0 && "Unexpectedly missing alignment information!");
+  unsigned PoolIdx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  const TargetRegisterClass *RC =
+      IsSingle ? &PPC::F4RCRegClass : &PPC::F8RCRegClass;
+  unsigned DestReg = createResultReg(RC);
+  CodeModel::Model CModel = TM.getCodeModel();
+
+  MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+      MachinePointerInfo::getConstantPool(*FuncInfo.MF),
+      MachineMemOperand::MOLoad, IsSingle ? 4 : 8, Align);
+
+  unsigned LoadOpc = IsSingle ? PPC::LFS : PPC::LFD;
+  unsigned AddrReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  PPCFuncInfo->setUsesTOCBasePtr();
+
+  // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
+  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT),
+            AddrReg)
+        .addConstantPoolIndex(PoolIdx)
+        .addReg(PPC::X2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LoadOpc),
+            DestReg)
+        .addImm(0)
+        .addReg(AddrReg)
+        .addMemOperand(MMO);
+    return DestReg;
+  }
+
+  // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)).
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
+          AddrReg)
+      .addReg(PPC::X2)
+      .addConstantPoolIndex(PoolIdx);
+
+  // But for large code model, we must generate a LDtocL followed
+  // by the LF[SD].
+  if (CModel == CodeModel::Large) {
+    unsigned EntryReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
+            EntryReg)
+        .addConstantPoolIndex(PoolIdx)
+        .addReg(AddrReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LoadOpc),
+            DestReg)
+        .addImm(0)
+        .addReg(EntryReg);
+    return DestReg;
+  }
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LoadOpc), DestReg)
+      .addConstantPoolIndex(PoolIdx, 0, PPCII::MO_TOC_LO)
+      .addReg(AddrReg)
+      .addMemOperand(MMO);
+
+  return DestReg;
+}
+
+// Materialize the address of a global value into a register, and return
+// the register number (zero is returned for thread-local globals).
+unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
+ assert(VT == MVT::i64 && "Non-address!");
+ const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+ unsigned DestReg = createResultReg(RC);
+
+ // Global values may be plain old object addresses, TLS object
+ // addresses, constant pool entries, or jump tables. How we generate
+ // code for these may depend on small, medium, or large code model.
+ CodeModel::Model CModel = TM.getCodeModel();
+
+ // FIXME: Jump tables are not yet required because fast-isel doesn't
+ // handle switches; if that changes, we need them as well. For now,
+ // what follows assumes everything's a generic (or TLS) global address.
+
+ // FIXME: We don't yet handle the complexity of TLS.
+ if (GV->isThreadLocal())
+ return 0;
+
+ PPCFuncInfo->setUsesTOCBasePtr();
+ // For small (and JIT-default) code model, generate a simple TOC load.
+ if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc),
+ DestReg)
+ .addGlobalAddress(GV)
+ .addReg(PPC::X2);
+ else {
+ // If the address is an externally defined symbol, a symbol with common
+ // or externally available linkage, a non-local function address, or a
+ // jump table address (not yet needed), or if we are generating code
+ // for large code model, we generate:
+ // LDtocL(GV, ADDIStocHA(%X2, GV))
+ // Otherwise we generate:
+ // ADDItocL(ADDIStocHA(%X2, GV), GV)
+ // Either way, start with the ADDIStocHA:
+ unsigned HighPartReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
+ HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
+
+ unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
+ if (GVFlags & PPCII::MO_NLP_FLAG) { // The subtarget's classification picks the LDtocL form.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
+ DestReg).addGlobalAddress(GV).addReg(HighPartReg);
+ } else {
+ // Otherwise generate the ADDItocL.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL),
+ DestReg).addReg(HighPartReg).addGlobalAddress(GV);
+ }
+ }
+
+ return DestReg;
+}
+
+// Materialize a 32-bit integer constant into a new register of class RC and
+// return that register. Emits LI, LIS, or LIS followed by ORI.
+unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm,
+ const TargetRegisterClass *RC) {
+ unsigned Lo = Imm & 0xFFFF; // Bits 0-15 of the constant.
+ unsigned Hi = (Imm >> 16) & 0xFFFF; // Bits 16-31 of the constant.
+
+ unsigned ResultReg = createResultReg(RC);
+ bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass); // Selects LI/LIS/ORI vs. the *8 opcodes.
+
+ if (isInt<16>(Imm))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(IsGPRC ? PPC::LI : PPC::LI8), ResultReg)
+ .addImm(Imm);
+ else if (Lo) {
+ // Lo is nonzero (Hi may still be zero): LIS the high half, then ORI the low.
+ unsigned TmpReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg)
+ .addImm(Hi);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(IsGPRC ? PPC::ORI : PPC::ORI8), ResultReg)
+ .addReg(TmpReg).addImm(Lo);
+ } else
+ // Just Hi bits.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), ResultReg)
+ .addImm(Hi);
+
+ return ResultReg;
+}
+
+// Materialize a 64-bit integer constant into a register of class RC and
+// return that register. Built from a 32-bit part, a shift, and ORIS8/ORI8.
+unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
+ const TargetRegisterClass *RC) {
+ unsigned Remainder = 0; // Low bits still to be OR'ed in after the shift.
+ unsigned Shift = 0; // Left shift applied to the 32-bit part (0 = none).
+
+ // If the value doesn't fit in 32 bits, see if we can shift it
+ // so that it fits in 32 bits.
+ if (!isInt<32>(Imm)) {
+ Shift = countTrailingZeros<uint64_t>(Imm);
+ int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+ if (isInt<32>(ImmSh))
+ Imm = ImmSh;
+ else {
+ Remainder = Imm; // Conversion to unsigned keeps only the low-order bits.
+ Shift = 32;
+ Imm >>= 32;
+ }
+ }
+
+ // Handle the high-order 32 bits (if shifted) or the whole 32 bits
+ // (if not shifted).
+ unsigned TmpReg1 = PPCMaterialize32BitInt(Imm, RC);
+ if (!Shift)
+ return TmpReg1;
+
+ // If upper 32 bits were not zero, we've built them and need to shift
+ // them into place.
+ unsigned TmpReg2;
+ if (Imm) {
+ TmpReg2 = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLDICR),
+ TmpReg2).addReg(TmpReg1).addImm(Shift).addImm(63 - Shift);
+ } else
+ TmpReg2 = TmpReg1;
+
+ unsigned TmpReg3, Hi, Lo;
+ if ((Hi = (Remainder >> 16) & 0xFFFF)) { // OR in bits 16-31 when nonzero.
+ TmpReg3 = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORIS8),
+ TmpReg3).addReg(TmpReg2).addImm(Hi);
+ } else
+ TmpReg3 = TmpReg2;
+
+ if ((Lo = Remainder & 0xFFFF)) { // OR in bits 0-15 when nonzero.
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORI8),
+ ResultReg).addReg(TmpReg3).addImm(Lo);
+ return ResultReg;
+ }
+
+ return TmpReg3;
+}
+
+// Materialize an integer constant of type VT into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
+ bool UseSExt) {
+ // If we're using CR bit registers for i1 values, handle that as a special
+ // case first.
+ if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
+ unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg);
+ return ImmReg;
+ }
+
+ if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 &&
+ VT != MVT::i1)
+ return 0;
+
+ const TargetRegisterClass *RC =
+ ((VT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass);
+ int64_t Imm = UseSExt ? CI->getSExtValue() : CI->getZExtValue(); // UseSExt picks sign vs. zero extension.
+
+ // If the constant is in range, use a load-immediate.
+ // Since LI will sign extend the constant we need to make sure that for
+ // our zeroext constants that the sign extended constant fits into 16-bits -
+ // a range of 0..0x7fff.
+ if (isInt<16>(Imm)) {
+ unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
+ unsigned ImmReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
+ .addImm(Imm);
+ return ImmReg;
+ }
+
+ // Construct the constant piecewise (only done for i64 and i32).
+ if (VT == MVT::i64)
+ return PPCMaterialize64BitInt(Imm, RC);
+ else if (VT == MVT::i32)
+ return PPCMaterialize32BitInt(Imm, RC);
+
+ return 0; // i16/i8/i1 value that did not fit the load-immediate above.
+}
+
+// Dispatch on the kind of constant (FP, global value, or integer) and
+// materialize it into a register. Returns zero for anything unhandled.
+unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) {
+  // Non-simple value types are not handled.
+  const EVT ConstEVT = TLI.getValueType(DL, C->getType(), true);
+  if (!ConstEVT.isSimple())
+    return 0;
+  const MVT SimpleVT = ConstEVT.getSimpleVT();
+
+  if (const ConstantFP *FPConst = dyn_cast<ConstantFP>(C))
+    return PPCMaterializeFP(FPConst, SimpleVT);
+  if (const GlobalValue *Global = dyn_cast<GlobalValue>(C))
+    return PPCMaterializeGV(Global, SimpleVT);
+
+  // Integers are zero extended (UseSExt == false):
+  // FunctionLoweringInfo::ComputePHILiveOutRegInfo assumes constant PHI
+  // operands are zero extended, and sign extending here would break PHI
+  // users in blocks that fall back to full SDAG instruction selection.
+  if (const ConstantInt *IntConst = dyn_cast<ConstantInt>(C))
+    return PPCMaterializeInt(IntConst, SimpleVT, false);
+
+  return 0;
+}
+
+// Materialize the address created by a static alloca into a register and
+// return that register. Returns zero for dynamic allocas and for alloca
+// types rejected by isLoadTypeLegal().
+unsigned PPCFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  // Only allocas recorded in the static alloca map are handled.
+  DenseMap<const AllocaInst*, int>::iterator Slot =
+    FuncInfo.StaticAllocaMap.find(AI);
+  if (Slot == FuncInfo.StaticAllocaMap.end())
+    return 0;
+
+  // The alloca's type must pass the load-legality check.
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT))
+    return 0;
+
+  // Emit ADDI8 of the recorded frame index with a zero immediate.
+  unsigned AddrReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
+          AddrReg).addFrameIndex(Slot->second).addImm(0);
+  return AddrReg;
+}
+
+// Fold loads into extends when possible. Returns true if MI was replaced.
+// FIXME: We can have multiple redundant extend/trunc instructions
+// following a load. The folding only picks up one. Extend this
+// to check subsequent instructions for the same pattern and remove
+// them. Thus ResultReg should be the def reg for the last redundant
+// instruction in a chain, and all intervening instructions can be
+// removed from parent. Change test/CodeGen/PowerPC/fast-isel-fold.ll
+// to add ELF64-NOT: rldicl to the appropriate tests when this works.
+bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) {
+ // Verify we have a legal type before going any further.
+ MVT VT;
+ if (!isLoadTypeLegal(LI->getType(), VT))
+ return false;
+
+ // Combine load followed by zero- or sign-extend.
+ bool IsZExt = false;
+ switch(MI->getOpcode()) {
+ default:
+ return false;
+
+ case PPC::RLDICL:
+ case PPC::RLDICL_32_64: {
+ IsZExt = true;
+ unsigned MB = MI->getOperand(3).getImm(); // NOTE(review): operand 2 is not inspected here - confirm it is always 0.
+ if ((VT == MVT::i8 && MB <= 56) ||
+ (VT == MVT::i16 && MB <= 48) ||
+ (VT == MVT::i32 && MB <= 32))
+ break;
+ return false;
+ }
+
+ case PPC::RLWINM:
+ case PPC::RLWINM8: {
+ IsZExt = true;
+ unsigned MB = MI->getOperand(3).getImm(); // NOTE(review): operands 2 and 4 are not inspected here - confirm.
+ if ((VT == MVT::i8 && MB <= 24) ||
+ (VT == MVT::i16 && MB <= 16))
+ break;
+ return false;
+ }
+
+ case PPC::EXTSB:
+ case PPC::EXTSB8:
+ case PPC::EXTSB8_32_64:
+ /* There is no sign-extending load-byte instruction. */
+ return false;
+
+ case PPC::EXTSH:
+ case PPC::EXTSH8:
+ case PPC::EXTSH8_32_64: {
+ if (VT != MVT::i16 && VT != MVT::i8)
+ return false;
+ break;
+ }
+
+ case PPC::EXTSW:
+ case PPC::EXTSW_32_64: {
+ if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
+ return false;
+ break;
+ }
+ }
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!PPCComputeAddress(LI->getOperand(0), Addr))
+ return false;
+
+ unsigned ResultReg = MI->getOperand(0).getReg(); // The load writes the extend's result register.
+
+ if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt))
+ return false;
+
+ MI->eraseFromParent(); // The emitted load replaces the extend instruction.
+ return true;
+}
+
+// Attempt to lower call arguments in a faster way than done by
+// the selection DAG code. Currently always declines by returning false.
+bool PPCFastISel::fastLowerArguments() {
+ // Defer to normal argument lowering for now. It's reasonably
+ // efficient. Consider doing something like ARM to handle the
+ // case where all args fit in registers, no varargs, no float
+ // or vector args.
+ return false;
+}
+
+// Handle materializing integer constants into a register. This is not
+// automatically generated for PowerPC, so must be explicitly created here.
+unsigned PPCFastISel::fastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
+
+ if (Opc != ISD::Constant) // Only ISD::Constant is handled; Ty is not used.
+ return 0;
+
+ // If we're using CR bit registers for i1 values, handle that as a special
+ // case first.
+ if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
+ unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg);
+ return ImmReg;
+ }
+
+ if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 &&
+ VT != MVT::i1)
+ return 0;
+
+ const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
+ &PPC::GPRCRegClass);
+ if (VT == MVT::i64)
+ return PPCMaterialize64BitInt(Imm, RC);
+ else
+ return PPCMaterialize32BitInt(Imm, RC); // Also used for i16, i8 and i1.
+}
+
+// Override for ADDI and ADDI8 so that their register operand gets a
+// register class that excludes R0/X0. The automatic infrastructure naively
+// assumes GPRC for i32 and G8RC for i64, losing the "no R0" concept. None of
+// the other automatically generated RI instructions need this at the
+// moment, but "isel" will need similar handling once SelectSelect exists.
+//
+// The output register class is narrowed conservatively as well, since any
+// GPRC/G8RC result could be used in ADDI, etc., where R0/X0 mean otherwise.
+unsigned PPCFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  // Constrain the input register of ADDI/ADDI8 to the no-R0/no-X0 class.
+  if (MachineInstOpcode == PPC::ADDI || MachineInstOpcode == PPC::ADDI8)
+    MRI.setRegClass(Op0, MachineInstOpcode == PPC::ADDI
+                             ? &PPC::GPRC_and_GPRC_NOR0RegClass
+                             : &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  const TargetRegisterClass *ResultRC = RC;
+  if (RC == &PPC::GPRCRegClass)
+    ResultRC = &PPC::GPRC_and_GPRC_NOR0RegClass;
+  else if (RC == &PPC::G8RCRegClass)
+    ResultRC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+
+  return FastISel::fastEmitInst_ri(MachineInstOpcode, ResultRC, Op0, Op0IsKill,
+                                   Imm);
+}
+
+// One-register-operand override: steer GPRC/G8RC results away from R0/X0,
+// because the automatic infrastructure has no knowledge of the context.
+unsigned PPCFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass* RC,
+                                     unsigned Op0, bool Op0IsKill) {
+  const TargetRegisterClass *ResultRC = RC;
+  if (RC == &PPC::GPRCRegClass)
+    ResultRC = &PPC::GPRC_and_GPRC_NOR0RegClass;
+  else if (RC == &PPC::G8RCRegClass)
+    ResultRC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+  return FastISel::fastEmitInst_r(MachineInstOpcode, ResultRC, Op0, Op0IsKill);
+}
+
+// Two-register-operand override: steer GPRC/G8RC results away from R0/X0,
+// because the automatic infrastructure has no knowledge of the context.
+unsigned PPCFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass* RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  const TargetRegisterClass *ResultRC = RC;
+  if (RC == &PPC::GPRCRegClass)
+    ResultRC = &PPC::GPRC_and_GPRC_NOR0RegClass;
+  else if (RC == &PPC::G8RCRegClass)
+    ResultRC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+  return FastISel::fastEmitInst_rr(MachineInstOpcode, ResultRC, Op0, Op0IsKill,
+                                   Op1, Op1IsKill);
+}
+
+namespace llvm {
+  // Create the fast instruction selector for PowerPC64 ELF.
+  FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                const TargetLibraryInfo *LibInfo) {
+    // Anything other than a 64-bit SVR4 subtarget gets no fast selector.
+    const PPCSubtarget &ST = FuncInfo.MF->getSubtarget<PPCSubtarget>();
+    if (!ST.isPPC64() || !ST.isSVR4ABI())
+      return nullptr;
+    return new PPCFastISel(FuncInfo, LibInfo);
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
new file mode 100644
index 000000000000..e786ef9aee0e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -0,0 +1,2164 @@
+//===-- PPCFrameLowering.cpp - PPC Frame Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PPC implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCFrameLowering.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+/// VRRegNo - Table mapping a vector register number to its enum value:
+/// entry i (0 through 31) holds PPC::V<i>.
+static const MCPhysReg VRRegNo[] = {
+ PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 ,
+ PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) {
+  // 64-bit targets use 16 under both the Darwin and the SVR4 ABI.
+  if (STI.isPPC64())
+    return 16;
+  return STI.isDarwinABI() ? 8 : 4; // 32-bit Darwin : 32-bit SVR4.
+}
+
+static unsigned computeTOCSaveOffset(const PPCSubtarget &STI) {
+ return STI.isELFv2ABI() ? 24 : 40; // 24 under the ELFv2 ABI, 40 otherwise.
+}
+
+static unsigned computeFramePointerSaveOffset(const PPCSubtarget &STI) {
+ // For the Darwin ABI:
+ // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area
+ // for saving the frame pointer (if needed.) While the published ABI has
+ // not used this slot since at least MacOSX 10.2, there is older code
+ // around that does use it, and that needs to continue to work.
+ if (STI.isDarwinABI())
+ return STI.isPPC64() ? -8U : -4U; // Negative offset stored in an unsigned.
+
+ // SVR4 ABI: First slot in the general register save area.
+ return STI.isPPC64() ? -8U : -4U; // Same values as the Darwin case above.
+}
+
+static unsigned computeLinkageSize(const PPCSubtarget &STI) {
+  // 32-bit non-Darwin (SVR4 ABI): fixed size of 8.
+  if (!STI.isDarwinABI() && !STI.isPPC64())
+    return 8;
+  // Otherwise 4 (ELFv2) or 6 slots, each 8 (64-bit) or 4 (32-bit) bytes.
+  return (STI.isELFv2ABI() ? 4 : 6) * (STI.isPPC64() ? 8 : 4);
+}
+
+static unsigned computeBasePointerSaveOffset(const PPCSubtarget &STI) {
+ if (STI.isDarwinABI())
+ return STI.isPPC64() ? -16U : -8U;
+
+ // SVR4 ABI: -16 on 64-bit; on 32-bit, -12 when position independent, else -8.
+ return STI.isPPC64()
+ ? -16U
+ : STI.getTargetMachine().isPositionIndependent() ? -12U : -8U;
+}
+
+PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+ STI.getPlatformStackAlignment(), 0),
+ Subtarget(STI), ReturnSaveOffset(computeReturnSaveOffset(Subtarget)),
+ TOCSaveOffset(computeTOCSaveOffset(Subtarget)),
+ FramePointerSaveOffset(computeFramePointerSaveOffset(Subtarget)),
+ LinkageSize(computeLinkageSize(Subtarget)),
+ BasePointerSaveOffset(computeBasePointerSaveOffset(STI)) {} // All save offsets are fixed per subtarget at construction.
+
+// Return the fixed-offset callee-saved spill slot table; NumEntries gets its length.
+const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
+ unsigned &NumEntries) const {
+ if (Subtarget.isDarwinABI()) {
+ NumEntries = 1; // Darwin: a single slot, for X31 or R31.
+ if (Subtarget.isPPC64()) {
+ static const SpillSlot darwin64Offsets = {PPC::X31, -8};
+ return &darwin64Offsets;
+ } else {
+ static const SpillSlot darwinOffsets = {PPC::R31, -4};
+ return &darwinOffsets;
+ }
+ }
+
+ // Early exit if not using the SVR4 ABI.
+ if (!Subtarget.isSVR4ABI()) {
+ NumEntries = 0;
+ return nullptr;
+ }
+
+ // Note that the offsets here overlap, but this is fixed up in
+ // processFunctionBeforeFrameFinalized.
+
+ static const SpillSlot Offsets[] = { // Table returned for 32-bit SVR4.
+ // Floating-point register save area offsets.
+ {PPC::F31, -8},
+ {PPC::F30, -16},
+ {PPC::F29, -24},
+ {PPC::F28, -32},
+ {PPC::F27, -40},
+ {PPC::F26, -48},
+ {PPC::F25, -56},
+ {PPC::F24, -64},
+ {PPC::F23, -72},
+ {PPC::F22, -80},
+ {PPC::F21, -88},
+ {PPC::F20, -96},
+ {PPC::F19, -104},
+ {PPC::F18, -112},
+ {PPC::F17, -120},
+ {PPC::F16, -128},
+ {PPC::F15, -136},
+ {PPC::F14, -144},
+
+ // General register save area offsets.
+ {PPC::R31, -4},
+ {PPC::R30, -8},
+ {PPC::R29, -12},
+ {PPC::R28, -16},
+ {PPC::R27, -20},
+ {PPC::R26, -24},
+ {PPC::R25, -28},
+ {PPC::R24, -32},
+ {PPC::R23, -36},
+ {PPC::R22, -40},
+ {PPC::R21, -44},
+ {PPC::R20, -48},
+ {PPC::R19, -52},
+ {PPC::R18, -56},
+ {PPC::R17, -60},
+ {PPC::R16, -64},
+ {PPC::R15, -68},
+ {PPC::R14, -72},
+
+ // CR save area offset. We map each of the nonvolatile CR fields
+ // to the slot for CR2, which is the first of the nonvolatile CR
+ // fields to be assigned, so that we only allocate one save slot.
+ // See PPCRegisterInfo::hasReservedSpillSlot() for more information.
+ {PPC::CR2, -4},
+
+ // VRSAVE save area offset.
+ {PPC::VRSAVE, -4},
+
+ // Vector register save area
+ {PPC::V31, -16},
+ {PPC::V30, -32},
+ {PPC::V29, -48},
+ {PPC::V28, -64},
+ {PPC::V27, -80},
+ {PPC::V26, -96},
+ {PPC::V25, -112},
+ {PPC::V24, -128},
+ {PPC::V23, -144},
+ {PPC::V22, -160},
+ {PPC::V21, -176},
+ {PPC::V20, -192}};
+
+ static const SpillSlot Offsets64[] = { // Table returned for 64-bit SVR4; it has no CR2 entry.
+ // Floating-point register save area offsets.
+ {PPC::F31, -8},
+ {PPC::F30, -16},
+ {PPC::F29, -24},
+ {PPC::F28, -32},
+ {PPC::F27, -40},
+ {PPC::F26, -48},
+ {PPC::F25, -56},
+ {PPC::F24, -64},
+ {PPC::F23, -72},
+ {PPC::F22, -80},
+ {PPC::F21, -88},
+ {PPC::F20, -96},
+ {PPC::F19, -104},
+ {PPC::F18, -112},
+ {PPC::F17, -120},
+ {PPC::F16, -128},
+ {PPC::F15, -136},
+ {PPC::F14, -144},
+
+ // General register save area offsets.
+ {PPC::X31, -8},
+ {PPC::X30, -16},
+ {PPC::X29, -24},
+ {PPC::X28, -32},
+ {PPC::X27, -40},
+ {PPC::X26, -48},
+ {PPC::X25, -56},
+ {PPC::X24, -64},
+ {PPC::X23, -72},
+ {PPC::X22, -80},
+ {PPC::X21, -88},
+ {PPC::X20, -96},
+ {PPC::X19, -104},
+ {PPC::X18, -112},
+ {PPC::X17, -120},
+ {PPC::X16, -128},
+ {PPC::X15, -136},
+ {PPC::X14, -144},
+
+ // VRSAVE save area offset.
+ {PPC::VRSAVE, -4},
+
+ // Vector register save area
+ {PPC::V31, -16},
+ {PPC::V30, -32},
+ {PPC::V29, -48},
+ {PPC::V28, -64},
+ {PPC::V27, -80},
+ {PPC::V26, -96},
+ {PPC::V25, -112},
+ {PPC::V24, -128},
+ {PPC::V23, -144},
+ {PPC::V22, -160},
+ {PPC::V21, -176},
+ {PPC::V20, -192}};
+
+ if (Subtarget.isPPC64()) {
+ NumEntries = array_lengthof(Offsets64);
+
+ return Offsets64;
+ } else {
+ NumEntries = array_lengthof(Offsets);
+
+ return Offsets;
+ }
+}
+
+/// RemoveVRSaveCode - We have found that this function does not need any code
+/// to manipulate the VRSAVE register, even though it uses vector registers.
+/// This can happen when the only registers used are known to be live in or out
+/// of the function. Remove all of the VRSAVE related code from the function.
+/// FIXME: The removal of the code results in a compile failure at -O0 when the
+/// function contains a function call, as the GPR containing original VRSAVE
+/// contents is spilled and reloaded around the call. Without the prolog code,
+/// the spill instruction refers to an undefined register. This code needs
+/// to account for all uses of that GPR.
+static void RemoveVRSaveCode(MachineInstr &MI) {
+ MachineBasicBlock *Entry = MI.getParent();
+ MachineFunction *MF = Entry->getParent();
+
+ // We know that the MTVRSAVE instruction immediately follows MI. Remove it.
+ MachineBasicBlock::iterator MBBI = MI;
+ ++MBBI;
+ assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE);
+ MBBI->eraseFromParent();
+
+ bool RemovedAllMTVRSAVEs = true;
+ // See if we can find and remove the MTVRSAVE instruction from all of the
+ // epilog blocks.
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
+ // In each return block, scan backwards and erase the last MTVRSAVE found.
+ if (I->isReturnBlock()) {
+ bool FoundIt = false;
+ for (MBBI = I->end(); MBBI != I->begin(); ) {
+ --MBBI;
+ if (MBBI->getOpcode() == PPC::MTVRSAVE) {
+ MBBI->eraseFromParent(); // remove it.
+ FoundIt = true;
+ break;
+ }
+ }
+ RemovedAllMTVRSAVEs &= FoundIt; // Stays true only if every return block had one.
+ }
+ }
+
+ // If we found and removed all MTVRSAVE instructions, remove the read of
+ // VRSAVE as well.
+ if (RemovedAllMTVRSAVEs) {
+ MBBI = MI;
+ assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?");
+ --MBBI;
+ assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?");
+ MBBI->eraseFromParent();
+ }
+
+ // Finally, nuke the UPDATE_VRSAVE.
+ MI.eraseFromParent();
+}
+
+// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the
+// instruction selector. Based on the vector registers that have been used,
+// transform this into the appropriate ORI/ORIS instruction(s), or remove it.
+static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) {
+  MachineFunction *MF = MI.getParent()->getParent();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned UsedRegMask = 0; // Bit (31-i) is set when V<i> is modified.
+  for (unsigned i = 0; i != 32; ++i)
+    if (MRI.isPhysRegModified(VRRegNo[i]))
+      UsedRegMask |= 1u << (31-i); // Unsigned: 1 << 31 overflows a signed int.
+
+  // Live in and live out values already must be in the mask, so don't bother
+  // marking them.
+  for (MachineRegisterInfo::livein_iterator
+       I = MF->getRegInfo().livein_begin(),
+       E = MF->getRegInfo().livein_end(); I != E; ++I) {
+    unsigned RegNo = TRI->getEncodingValue(I->first);
+    if (RegNo < array_lengthof(VRRegNo) && VRRegNo[RegNo] == I->first) // In range and really a vector reg.
+      UsedRegMask &= ~(1u << (31-RegNo));    // Doesn't need to be marked.
+  }
+
+  // Live out registers appear as use operands on return instructions.
+  for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end();
+       UsedRegMask != 0 && BI != BE; ++BI) {
+    const MachineBasicBlock &MBB = *BI;
+    if (!MBB.isReturnBlock())
+      continue;
+    const MachineInstr &Ret = MBB.back();
+    for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) {
+      const MachineOperand &MO = Ret.getOperand(I);
+      if (!MO.isReg() || !PPC::VRRCRegClass.contains(MO.getReg()))
+        continue;
+      unsigned RegNo = TRI->getEncodingValue(MO.getReg());
+      UsedRegMask &= ~(1u << (31-RegNo));
+    }
+  }
+
+  // If no bits are left to set, the VRSAVE update is not needed at all.
+  if (UsedRegMask == 0) {
+    // Remove all VRSAVE code.
+    RemoveVRSaveCode(MI);
+    return;
+  }
+
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  unsigned DstReg = MI.getOperand(0).getReg();
+
+  if ((UsedRegMask & 0xFFFF) == UsedRegMask) { // Only low-halfword bits: ORI.
+    if (DstReg != SrcReg)
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask);
+    else
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask);
+  } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) { // Only high bits: ORIS.
+    if (DstReg != SrcReg)
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask >> 16);
+  } else { // Bits in both halves: ORIS for the high half, then ORI for the low.
+    if (DstReg != SrcReg)
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask >> 16);
+
+    BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+      .addReg(DstReg, RegState::Kill)
+      .addImm(UsedRegMask & 0xFFFF);
+  }
+
+  // Remove the old UPDATE_VRSAVE instruction.
+  MI.eraseFromParent();
+}
+
+/// Returns PPCFunctionInfo::isCRSpilled() for MF.
+static bool spillsCR(const MachineFunction &MF) {
+  return MF.getInfo<PPCFunctionInfo>()->isCRSpilled();
+}
+
+/// Returns PPCFunctionInfo::isVRSAVESpilled() for MF.
+static bool spillsVRSAVE(const MachineFunction &MF) {
+  return MF.getInfo<PPCFunctionInfo>()->isVRSAVESpilled();
+}
+
+/// Returns PPCFunctionInfo::hasSpills() for MF.
+static bool hasSpills(const MachineFunction &MF) {
+  return MF.getInfo<PPCFunctionInfo>()->hasSpills();
+}
+
+/// Returns PPCFunctionInfo::hasNonRISpills() for MF.
+static bool hasNonRISpills(const MachineFunction &MF) {
+  return MF.getInfo<PPCFunctionInfo>()->hasNonRISpills();
+}
+
+/// MustSaveLR - Return true if LR has to be saved onto the stack in the
+/// prolog and restored in the epilog of this function.
+static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  const MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  // Any def of LR forces a save/restore; LR is defined by calls, including
+  // the PIC setup sequence. (LR comes in 32 and 64 bit versions.)
+  if (RegInfo.def_begin(LR) != RegInfo.def_end())
+    return true;
+  // Otherwise a save is still needed when something uses the LR stack slot
+  // (e.g. for builtin_return_address).
+  return FuncInfo->isLRStoreRequired();
+}
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size; both are written back to the frame info only when UpdateMF is set.
+unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
+ bool UpdateMF,
+ bool UseEstimate) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Get the number of bytes to allocate from the FrameInfo
+ unsigned FrameSize =
+ UseEstimate ? MFI.estimateStackSize(MF) : MFI.getStackSize();
+
+ // Get stack alignments. The frame must be aligned to the greatest of these:
+ unsigned TargetAlign = getStackAlignment(); // alignment required per the ABI
+ unsigned MaxAlign = MFI.getMaxAlignment(); // alignment required by data in frame
+ unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
+
+ const PPCRegisterInfo *RegInfo =
+ static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+
+ // If we are a leaf function, and use up to 224 bytes of stack space,
+ // don't have a frame pointer, calls, or dynamic alloca then we do not need
+ // to adjust the stack pointer (we fit in the Red Zone).
+ // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate
+ // stackless code if all local vars are reg-allocated.
+ bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+ unsigned LR = RegInfo->getRARegister();
+ if (!DisableRedZone &&
+ (Subtarget.isPPC64() || // 32-bit SVR4, no stack-
+ !Subtarget.isSVR4ABI() || // allocated locals.
+ FrameSize == 0) &&
+ FrameSize <= 224 && // Fits in red zone.
+ !MFI.hasVarSizedObjects() && // No dynamic alloca.
+ !MFI.adjustsStack() && // No calls.
+ !MustSaveLR(MF, LR) &&
+ !RegInfo->hasBasePointer(MF)) { // No special alignment.
+ // No need for frame
+ if (UpdateMF)
+ MFI.setStackSize(0);
+ return 0;
+ }
+
+ // Get the maximum call frame size of all the calls.
+ unsigned maxCallFrameSize = MFI.getMaxCallFrameSize();
+
+ // Maximum call frame needs to be at least big enough for linkage area.
+ unsigned minCallFrameSize = getLinkageSize();
+ maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
+
+ // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+ // that allocations will be aligned.
+ if (MFI.hasVarSizedObjects())
+ maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+
+ // Update maximum call frame size.
+ if (UpdateMF)
+ MFI.setMaxCallFrameSize(maxCallFrameSize);
+
+ // Include call frame size in total.
+ FrameSize += maxCallFrameSize;
+
+ // Make sure the frame is aligned (round up to a multiple of AlignMask + 1).
+ FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+
+ // Update frame info.
+ if (UpdateMF)
+ MFI.setStackSize(FrameSize);
+
+ return FrameSize;
+}
+
+// hasFP - Return true if the specified function has a nonzero stack size
+// and needs a dedicated frame pointer register.
+bool PPCFrameLowering::hasFP(const MachineFunction &MF) const {
+  // FIXME: Broken by design: hasFP() may be called before the stack layout
+  // is calculated, so the result can depend on the time of the call.
+  if (MF.getFrameInfo().getStackSize() == 0)
+    return false;
+  return needsFP(MF);
+}
+
+// needsFP - Return true if the specified function should have a dedicated
+// frame pointer register: frame pointer elimination is disabled, or the
+// function has variable sized allocas, a stack map, a patch point, or (with
+// GuaranteedTailCallOpt) hasFastCall() set. Never true for naked functions.
+bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
+  // Naked functions have no stack frame pushed, hence no frame pointer.
+  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+    return false;
+  if (MF.getTarget().Options.DisableFramePointerElim(MF))
+    return true;
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint())
+    return true;
+  return MF.getTarget().Options.GuaranteedTailCallOpt &&
+         MF.getInfo<PPCFunctionInfo>()->hasFastCall();
+}
+
+// Rewrite every FP/FP8/BP/BP8 placeholder register operand in MF. The frame
+// pointer maps to R31/X31 when needsFP(MF) holds and to R1/X1 otherwise.
+void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
+  bool is31 = needsFP(MF);
+  unsigned FPReg  = is31 ? PPC::R31 : PPC::R1;
+  unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
+
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  // Without a base pointer, BP and BP8 alias the matching frame pointer:
+  // BP -> FPReg and BP8 -> FP8Reg (the X-register form, not FPReg).
+  bool HasBP = RegInfo->hasBasePointer(MF);
+  unsigned BPReg  = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg;
+  unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FP8Reg;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI)
+    for (MachineBasicBlock::iterator MBBI = BI->end(); MBBI != BI->begin(); ) {
+      --MBBI;
+      for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
+        MachineOperand &MO = MBBI->getOperand(I);
+        if (!MO.isReg())
+          continue;
+        switch (MO.getReg()) {
+        case PPC::FP:
+          MO.setReg(FPReg);
+          break;
+        case PPC::FP8:
+          MO.setReg(FP8Reg);
+          break;
+        case PPC::BP:
+          MO.setReg(BPReg);
+          break;
+        case PPC::BP8:
+          MO.setReg(BP8Reg);
+          break;
+        }
+      }
+    }
+}
+
+/* This function will do the following:
+ - If MBB is an entry or exit block, set SR1 and SR2 to R0 and R12
+ respectively (defaults recommended by the ABI) and return true
+ - If MBB is neither an entry nor an exit block, initialize the register
+ scavenger and look for available registers.
+ - If the defaults (R0/R12) are available, return true
+ - If TwoUniqueRegsRequired is set to true, it looks for two unique
+ registers. Otherwise, look for a single available register.
+ - If the required registers are found, set SR1 and SR2 and return true.
+ - If the required registers are not found, set SR2 or both SR1 and SR2 to
+ PPC::NoRegister and return false.
+
+ Note that if both SR1 and SR2 are valid parameters and TwoUniqueRegsRequired
+ is not set, this function will attempt to find two different registers, but
+ still return true if only one register is available (and set SR1 == SR2).
+
+ Parameters:
+ - MBB: the block in which the scratch registers are going to be used.
+ - UseAtEnd: if true, the registers are needed at the end of the block: the
+ "exit block" shortcut applies when MBB is a return block, and otherwise
+ the scavenger is advanced to the first terminator (or to the last
+ instruction when there is no terminator) before liveness is queried.
+ If false, the shortcut applies when MBB is the first block of its
+ function, and liveness is queried at the start of the block.
+ - TwoUniqueRegsRequired: whether two distinct registers must be found for
+ the search to count as successful.
+ - SR1, SR2: optional outputs receiving the chosen registers; either may be
+ null, but SR2 must not be passed without SR1. On a 64-bit subtarget the
+ defaults are X0 and X12 rather than R0 and R12.
+
+ Callee-saved registers are never returned by the search.
+*/
+bool
+PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
+ bool UseAtEnd,
+ bool TwoUniqueRegsRequired,
+ unsigned *SR1,
+ unsigned *SR2) const {
+ RegScavenger RS;
+ unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0;
+ unsigned R12 = Subtarget.isPPC64() ? PPC::X12 : PPC::R12;
+
+ // Set the defaults for the two scratch registers.
+ if (SR1)
+ *SR1 = R0;
+
+ if (SR2) {
+ assert (SR1 && "Asking for the second scratch register but not the first?");
+ *SR2 = R12;
+ }
+
+ // If MBB is an entry or exit block, use R0 and R12 as the scratch registers.
+ if ((UseAtEnd && MBB->isReturnBlock()) ||
+ (!UseAtEnd && (&MBB->getParent()->front() == MBB)))
+ return true;
+
+ RS.enterBasicBlock(*MBB);
+
+ if (UseAtEnd && !MBB->empty()) {
+ // The scratch register will be used at the end of the block, so must
+ // consider all registers used within the block
+
+ MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator();
+ // If no terminator, back iterator up to previous instruction.
+ if (MBBI == MBB->end())
+ MBBI = std::prev(MBBI);
+
+ // The scavenger is only advanced when the target is not the first instruction.
+ if (MBBI != MBB->begin())
+ RS.forward(MBBI);
+ }
+
+ // If the two registers are available, we're all good.
+ // Note that we only return here if both R0 and R12 are available because
+ // although the function may not require two unique registers, it may benefit
+ // from having two so we should try to provide them.
+ if (!RS.isRegUsed(R0) && !RS.isRegUsed(R12))
+ return true;
+
+ // Get the zero-terminated list of callee-saved registers for the target.
+ const PPCRegisterInfo *RegInfo =
+ static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MBB->getParent());
+
+ // Get all the available registers in the block.
+ BitVector BV = RS.getRegsAvailable(Subtarget.isPPC64() ? &PPC::G8RCRegClass :
+ &PPC::GPRCRegClass);
+
+ // We shouldn't use callee-saved registers as scratch registers as they may be
+ // available when looking for a candidate block for shrink wrapping but not
+ // available when the actual prologue/epilogue is being emitted because they
+ // were added as live-in to the prologue block by PrologueEpilogueInserter.
+ for (int i = 0; CSRegs[i]; ++i)
+ BV.reset(CSRegs[i]);
+
+ // Set the first scratch register to the first available one.
+ if (SR1) {
+ int FirstScratchReg = BV.find_first();
+ *SR1 = FirstScratchReg == -1 ? (unsigned)PPC::NoRegister : FirstScratchReg;
+ }
+
+ // If there is another one available, set the second scratch register to that.
+ // Otherwise, set it to either PPC::NoRegister if this function requires two
+ // or to whatever SR1 is set to if this function doesn't require two.
+ if (SR2) {
+ int SecondScratchReg = BV.find_next(*SR1);
+ if (SecondScratchReg != -1)
+ *SR2 = SecondScratchReg;
+ else
+ *SR2 = TwoUniqueRegsRequired ? (unsigned)PPC::NoRegister : *SR1;
+ }
+
+ // Now that we've done our best to provide both registers, double check
+ // whether we were unable to provide enough.
+ if (BV.count() < (TwoUniqueRegsRequired ? 2U : 1U))
+ return false;
+
+ return true;
+}
+
+// The prologue uses a scratch register for spilling LR and one for spilling
+// CR. Two are used by default to hide latency, but a single register is enough
+// if the spill code is not overlapped. Two distinct registers become mandatory
+// only when the function has a base pointer (i.e. the stack is realigned) with
+// a maximum alignment above 1, and either the frame is too large for a 16-bit
+// immediate or there is no red zone.
+bool
+PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
+  const PPCRegisterInfo *PPCRegs =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  MachineFunction &Fn = *(MBB->getParent());
+
+  const bool UsesBasePtr = PPCRegs->hasBasePointer(Fn);
+  const unsigned FrameBytes = determineFrameLayout(Fn, false);
+  const int NegFrameBytes = -FrameBytes;
+  const bool FrameTooBigForImm = !isInt<16>(NegFrameBytes);
+
+  MachineFrameInfo &FrameInfo = Fn.getFrameInfo();
+  const unsigned StackAlign = FrameInfo.getMaxAlignment();
+  const bool RedZoneAvailable = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
+
+  // Without a realigned stack, one scratch register always suffices.
+  if (!UsesBasePtr || StackAlign <= 1)
+    return false;
+
+  return FrameTooBigForImm || !RedZoneAvailable;
+}
+
+// A block can hold the prologue only if the scratch registers needed at its
+// start (two distinct ones when twoUniqueScratchRegsRequired() says so) can be
+// found.
+bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
+  // The helpers below take a non-const block pointer.
+  MachineBasicBlock *Block = const_cast<MachineBasicBlock *>(&MBB);
+  const bool NeedTwoRegs = twoUniqueScratchRegsRequired(Block);
+
+  return findScratchRegister(Block, false, NeedTwoRegs);
+}
+
+// A block can hold the epilogue only if a scratch register can be found for
+// use at the end of the block.
+bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+  // findScratchRegister() takes a non-const block pointer.
+  MachineBasicBlock *Block = const_cast<MachineBasicBlock *>(&MBB);
+  const bool FoundScratch = findScratchRegister(Block, true);
+
+  return FoundScratch;
+}
+
+void PPCFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const { // Builds the prologue code at the start of MBB.
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+ const PPCRegisterInfo *RegInfo =
+ static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ DebugLoc dl;
+ bool needsCFI = MMI.hasDebugInfo() ||
+ MF.getFunction()->needsUnwindTableEntry();
+
+ // Get processor type.
+ bool isPPC64 = Subtarget.isPPC64();
+ // Get the ABI.
+ bool isSVR4ABI = Subtarget.isSVR4ABI();
+ bool isELFv2ABI = Subtarget.isELFv2ABI();
+ assert((Subtarget.isDarwinABI() || isSVR4ABI) &&
+ "Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
+
+ // Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it,
+ // process it. Only done when the ABI is not SVR4.
+ if (!isSVR4ABI)
+ for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
+ if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
+ HandleVRSaveUpdate(*MBBI, TII);
+ break;
+ }
+ }
+
+ // Move MBBI back to the beginning of the prologue block.
+ MBBI = MBB.begin();
+
+ // Work out frame sizes.
+ unsigned FrameSize = determineFrameLayout(MF);
+ int NegFrameSize = -FrameSize;
+ if (!isInt<32>(NegFrameSize))
+ llvm_unreachable("Unhandled stack size!");
+
+ if (MFI.isFrameAddressTaken())
+ replaceFPWithRealFP(MF);
+
+ // Check if the link register (LR) must be saved.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ bool MustSaveLR = FI->mustSaveLR();
+ const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+ bool MustSaveCR = !MustSaveCRs.empty();
+ // Do we have a frame pointer and/or base pointer for this function?
+ bool HasFP = hasFP(MF);
+ bool HasBP = RegInfo->hasBasePointer(MF);
+ bool HasRedZone = isPPC64 || !isSVR4ABI;
+
+ unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+ unsigned BPReg = RegInfo->getBaseRegister(MF);
+ unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
+ unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR;
+ unsigned ScratchReg = 0;
+ unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+ // ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.)
+ const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8
+ : PPC::MFLR );
+ const MCInstrDesc& StoreInst = TII.get(isPPC64 ? PPC::STD
+ : PPC::STW );
+ const MCInstrDesc& StoreUpdtInst = TII.get(isPPC64 ? PPC::STDU
+ : PPC::STWU );
+ const MCInstrDesc& StoreUpdtIdxInst = TII.get(isPPC64 ? PPC::STDUX
+ : PPC::STWUX);
+ const MCInstrDesc& LoadImmShiftedInst = TII.get(isPPC64 ? PPC::LIS8
+ : PPC::LIS );
+ const MCInstrDesc& OrImmInst = TII.get(isPPC64 ? PPC::ORI8
+ : PPC::ORI );
+ const MCInstrDesc& OrInst = TII.get(isPPC64 ? PPC::OR8
+ : PPC::OR );
+ const MCInstrDesc& SubtractCarryingInst = TII.get(isPPC64 ? PPC::SUBFC8
+ : PPC::SUBFC);
+ const MCInstrDesc& SubtractImmCarryingInst = TII.get(isPPC64 ? PPC::SUBFIC8
+ : PPC::SUBFIC);
+
+ // Regarding this assert: Even though LR is saved in the caller's frame (i.e.,
+ // LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no
+ // Red Zone, an asynchronous event (a form of "callee") could claim a frame &
+ // overwrite it, so PPC32 SVR4 must claim at least a minimal frame to save LR.
+ assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) &&
+ "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
+
+ // The flag is assigned twice: first the search result (for the assert), then whether both scratch registers coincide.
+ bool SingleScratchReg =
+ findScratchRegister(&MBB, false, twoUniqueScratchRegsRequired(&MBB),
+ &ScratchReg, &TempReg);
+ assert(SingleScratchReg &&
+ "Required number of registers not available in this block");
+
+ SingleScratchReg = ScratchReg == TempReg;
+
+ int LROffset = getReturnSaveOffset();
+
+ int FPOffset = 0;
+ if (HasFP) {
+ if (isSVR4ABI) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ int FPIndex = FI->getFramePointerSaveIndex();
+ assert(FPIndex && "No Frame Pointer Save Slot!");
+ FPOffset = MFI.getObjectOffset(FPIndex);
+ } else {
+ FPOffset = getFramePointerSaveOffset();
+ }
+ }
+
+ int BPOffset = 0;
+ if (HasBP) {
+ if (isSVR4ABI) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ int BPIndex = FI->getBasePointerSaveIndex();
+ assert(BPIndex && "No Base Pointer Save Slot!");
+ BPOffset = MFI.getObjectOffset(BPIndex);
+ } else {
+ BPOffset = getBasePointerSaveOffset();
+ }
+ }
+
+ int PBPOffset = 0;
+ if (FI->usesPICBase()) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ int PBPIndex = FI->getPICBasePointerSaveIndex();
+ assert(PBPIndex && "No PIC Base Pointer Save Slot!");
+ PBPOffset = MFI.getObjectOffset(PBPIndex);
+ }
+
+ // Get stack alignments.
+ unsigned MaxAlign = MFI.getMaxAlignment();
+ if (HasBP && MaxAlign > 1)
+ assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
+ "Invalid alignment!");
+
+ // Frames of 32KB & larger require special handling because they cannot be
+ // indexed into with a simple STDU/STWU/STD/STW immediate offset operand.
+ bool isLargeFrame = !isInt<16>(NegFrameSize);
+
+ assert((isPPC64 || !MustSaveCR) &&
+ "Prologue CR saving supported only in 64-bit mode");
+
+ // If we need to spill the CR and the LR but we don't have two separate
+ // registers available, we must spill them one at a time
+ if (MustSaveCR && SingleScratchReg && MustSaveLR) {
+ // In the ELFv2 ABI, we are not required to save all CR fields.
+ // If only one or two CR fields are clobbered, it is more efficient to use
+ // mfocrf to selectively save just those fields, because mfocrf has shorter
+ // latency compared to mfcr. NOTE: the code below only does so for one field.
+ unsigned MfcrOpcode = PPC::MFCR8;
+ unsigned CrState = RegState::ImplicitKill;
+ if (isELFv2ABI && MustSaveCRs.size() == 1) {
+ MfcrOpcode = PPC::MFOCRF8;
+ CrState = RegState::Kill;
+ }
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(MfcrOpcode), TempReg);
+ for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+ MIB.addReg(MustSaveCRs[i], CrState);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+ .addReg(TempReg, getKillRegState(true))
+ .addImm(8)
+ .addReg(SPReg);
+ }
+
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg);
+
+ if (MustSaveCR &&
+ !(SingleScratchReg && MustSaveLR)) { // will only occur for PPC64
+ // In the ELFv2 ABI, we are not required to save all CR fields.
+ // If only one or two CR fields are clobbered, it is more efficient to use
+ // mfocrf to selectively save just those fields, because mfocrf has shorter
+ // latency compared to mfcr. NOTE: the code below only does so for one field.
+ unsigned MfcrOpcode = PPC::MFCR8;
+ unsigned CrState = RegState::ImplicitKill;
+ if (isELFv2ABI && MustSaveCRs.size() == 1) {
+ MfcrOpcode = PPC::MFOCRF8;
+ CrState = RegState::Kill;
+ }
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(MfcrOpcode), TempReg);
+ for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+ MIB.addReg(MustSaveCRs[i], CrState);
+ }
+
+ if (HasRedZone) {
+ if (HasFP)
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(FPReg)
+ .addImm(FPOffset)
+ .addReg(SPReg);
+ if (FI->usesPICBase())
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(PPC::R30)
+ .addImm(PBPOffset)
+ .addReg(SPReg);
+ if (HasBP)
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(BPReg)
+ .addImm(BPOffset)
+ .addReg(SPReg);
+ }
+
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(ScratchReg, getKillRegState(true))
+ .addImm(LROffset)
+ .addReg(SPReg);
+
+ if (MustSaveCR &&
+ !(SingleScratchReg && MustSaveLR)) { // will only occur for PPC64
+ assert(HasRedZone && "A red zone is always available on PPC64");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+ .addReg(TempReg, getKillRegState(true))
+ .addImm(8)
+ .addReg(SPReg);
+ }
+
+ // Skip the rest if this is a leaf function & all spills fit in the Red Zone.
+ if (!FrameSize)
+ return;
+
+ // Adjust stack pointer: r1 += NegFrameSize.
+ // If there is a preferred stack alignment, align R1 now
+
+ if (HasBP && HasRedZone) {
+ // Save a copy of r1 as the base pointer.
+ BuildMI(MBB, MBBI, dl, OrInst, BPReg)
+ .addReg(SPReg)
+ .addReg(SPReg);
+ }
+
+ // Have we generated a STUX instruction to claim stack frame? If so,
+ // the negated frame size will be placed in ScratchReg.
+ bool HasSTUX = false;
+
+ // This condition must be kept in sync with canUseAsPrologue.
+ if (HasBP && MaxAlign > 1) {
+ if (isPPC64)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(64 - Log2_32(MaxAlign));
+ else // PPC32...
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(32 - Log2_32(MaxAlign))
+ .addImm(31);
+ if (!isLargeFrame) {
+ BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(NegFrameSize);
+ } else {
+ assert(!SingleScratchReg && "Only a single scratch reg available");
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg)
+ .addImm(NegFrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, OrImmInst, TempReg)
+ .addReg(TempReg, RegState::Kill)
+ .addImm(NegFrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addReg(TempReg, RegState::Kill);
+ }
+
+ BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+ .addReg(SPReg, RegState::Kill)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
+ HasSTUX = true;
+
+ } else if (!isLargeFrame) {
+ BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg)
+ .addReg(SPReg)
+ .addImm(NegFrameSize)
+ .addReg(SPReg);
+
+ } else {
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+ .addImm(NegFrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(NegFrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+ .addReg(SPReg, RegState::Kill)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
+ HasSTUX = true;
+ }
+
+ if (!HasRedZone) {
+ assert(!isPPC64 && "A red zone is always available on PPC64");
+ if (HasSTUX) {
+ // The negated frame size is in ScratchReg, and the SPReg has been
+ // decremented by the frame size: SPReg = old SPReg + ScratchReg.
+ // Since FPOffset, PBPOffset, etc. are relative to the beginning of
+ // the stack frame (i.e. the old SP), ideally, we would put the old
+ // SP into a register and use it as the base for the stores. The
+ // problem is that the only available register may be ScratchReg,
+ // which could be R0, and R0 cannot be used as a base address.
+
+ // First, set ScratchReg to the old SP. This may need to be modified
+ // later.
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBF), ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addReg(SPReg);
+
+ if (ScratchReg == PPC::R0) {
+ // R0 cannot be used as a base register, but it can be used as an
+ // index in a store-indexed.
+ int LastOffset = 0;
+ if (HasFP) {
+ // R0 += (FPOffset-LastOffset).
+ // Need addic, since addi treats R0 as 0.
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(FPOffset-LastOffset);
+ LastOffset = FPOffset;
+ // Store FP into *R0.
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STWX))
+ .addReg(FPReg, RegState::Kill) // Save FP.
+ .addReg(PPC::ZERO)
+ .addReg(ScratchReg); // This will be the index (R0 is ok here).
+ }
+ if (FI->usesPICBase()) {
+ // R0 += (PBPOffset-LastOffset).
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(PBPOffset-LastOffset);
+ LastOffset = PBPOffset;
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STWX))
+ .addReg(PPC::R30, RegState::Kill) // Save PIC base pointer.
+ .addReg(PPC::ZERO)
+ .addReg(ScratchReg); // This will be the index (R0 is ok here).
+ }
+ if (HasBP) {
+ // R0 += (BPOffset-LastOffset).
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(BPOffset-LastOffset);
+ LastOffset = BPOffset;
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STWX))
+ .addReg(BPReg, RegState::Kill) // Save BP.
+ .addReg(PPC::ZERO)
+ .addReg(ScratchReg); // This will be the index (R0 is ok here).
+ // BP = R0-LastOffset
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), BPReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(-LastOffset);
+ }
+ } else {
+ // ScratchReg is not R0, so use it as the base register. It is
+ // already set to the old SP, so we can use the offsets directly.
+
+ // Now that the stack frame has been allocated, save all the necessary
+ // registers using ScratchReg as the base address.
+ if (HasFP)
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(FPReg)
+ .addImm(FPOffset)
+ .addReg(ScratchReg);
+ if (FI->usesPICBase())
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(PPC::R30)
+ .addImm(PBPOffset)
+ .addReg(ScratchReg);
+ if (HasBP) {
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(BPReg)
+ .addImm(BPOffset)
+ .addReg(ScratchReg);
+ BuildMI(MBB, MBBI, dl, OrInst, BPReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addReg(ScratchReg);
+ }
+ }
+ } else {
+ // The frame size is a known 16-bit constant (fitting in the immediate
+ // field of STWU). To be here we have to be compiling for PPC32.
+ // Since the SPReg has been decreased by FrameSize, add it back to each
+ // offset.
+ if (HasFP)
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(FPReg)
+ .addImm(FrameSize + FPOffset)
+ .addReg(SPReg);
+ if (FI->usesPICBase())
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(PPC::R30)
+ .addImm(FrameSize + PBPOffset)
+ .addReg(SPReg);
+ if (HasBP) {
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(BPReg)
+ .addImm(FrameSize + BPOffset)
+ .addReg(SPReg);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), BPReg)
+ .addReg(SPReg)
+ .addImm(FrameSize);
+ }
+ }
+ }
+
+ // Add Call Frame Information for the instructions we generated above.
+ if (needsCFI) {
+ unsigned CFIIndex;
+
+ if (HasBP) {
+ // Define CFA in terms of BP. Do this in preference to using FP/SP,
+ // because if the stack needed aligning then CFA won't be at a fixed
+ // offset from FP/SP.
+ unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+ } else {
+ // Adjust the definition of CFA to account for the change in SP.
+ assert(NegFrameSize);
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize));
+ }
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ if (HasFP) {
+ // Describe where FP was saved, at a fixed offset from CFA.
+ unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, FPOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+
+ if (FI->usesPICBase()) {
+ // Describe where the PIC base pointer (R30) was saved, at a fixed offset from CFA.
+ unsigned Reg = MRI->getDwarfRegNum(PPC::R30, true);
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, PBPOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+
+ if (HasBP) {
+ // Describe where BP was saved, at a fixed offset from CFA.
+ unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, BPOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+
+ if (MustSaveLR) {
+ // Describe where LR was saved, at a fixed offset from CFA.
+ unsigned Reg = MRI->getDwarfRegNum(LRReg, true);
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, LROffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+
+ // If there is a frame pointer, copy R1 into R31
+ if (HasFP) {
+ BuildMI(MBB, MBBI, dl, OrInst, FPReg)
+ .addReg(SPReg)
+ .addReg(SPReg);
+
+ if (!HasBP && needsCFI) {
+ // Change the definition of CFA from SP+offset to FP+offset, because SP
+ // will change at every alloca.
+ unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+
+ if (needsCFI) {
+ // Describe where callee saved registers were saved, at fixed offsets from
+ // CFA.
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
+
+ // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just
+ // subregisters of CR2. We just need to emit a move of CR2.
+ if (PPC::CRBITRCRegClass.contains(Reg))
+ continue;
+
+ // For SVR4, don't emit a move for the CR spill slot if we haven't
+ // spilled CRs.
+ if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
+ && !MustSaveCR)
+ continue;
+
+ // For 64-bit SVR4 when we have spilled CRs, the spill location
+ // is SP+8, not a frame-relative slot.
+ if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+ // In the ELFv1 ABI, only CR2 is noted in CFI and stands in for
+ // the whole CR word. In the ELFv2 ABI, every CR that was
+ // actually saved gets its own CFI record.
+ unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2;
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(CRReg, true), 8));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ continue;
+ }
+
+ int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx());
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+}
+
+void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc dl;
+
+ if (MBBI != MBB.end())
+ dl = MBBI->getDebugLoc();
+
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+ const PPCRegisterInfo *RegInfo =
+ static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+
+ // Get alignment info so we know how to restore the SP.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Get the number of bytes allocated from the FrameInfo.
+ int FrameSize = MFI.getStackSize();
+
+ // Get processor type.
+ bool isPPC64 = Subtarget.isPPC64();
+ // Get the ABI.
+ bool isSVR4ABI = Subtarget.isSVR4ABI();
+
+ // Check if the link register (LR) has been saved.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ bool MustSaveLR = FI->mustSaveLR();
+ const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+ bool MustSaveCR = !MustSaveCRs.empty();
+ // Do we have a frame pointer and/or base pointer for this function?
+ bool HasFP = hasFP(MF);
+ bool HasBP = RegInfo->hasBasePointer(MF);
+ bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
+
+ unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+ unsigned BPReg = RegInfo->getBaseRegister(MF);
+ unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
+ unsigned ScratchReg = 0;
+ unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+ const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8
+ : PPC::MTLR );
+ const MCInstrDesc& LoadInst = TII.get( isPPC64 ? PPC::LD
+ : PPC::LWZ );
+ const MCInstrDesc& LoadImmShiftedInst = TII.get( isPPC64 ? PPC::LIS8
+ : PPC::LIS );
+ const MCInstrDesc& OrInst = TII.get(isPPC64 ? PPC::OR8
+ : PPC::OR );
+ const MCInstrDesc& OrImmInst = TII.get( isPPC64 ? PPC::ORI8
+ : PPC::ORI );
+ const MCInstrDesc& AddImmInst = TII.get( isPPC64 ? PPC::ADDI8
+ : PPC::ADDI );
+ const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
+ : PPC::ADD4 );
+
+ int LROffset = getReturnSaveOffset();
+
+ int FPOffset = 0;
+
+ // Using the same bool variable as below to suppress compiler warnings.
+ bool SingleScratchReg = findScratchRegister(&MBB, true, false, &ScratchReg,
+ &TempReg);
+ assert(SingleScratchReg &&
+ "Could not find an available scratch register");
+
+ SingleScratchReg = ScratchReg == TempReg;
+
+ if (HasFP) {
+ if (isSVR4ABI) {
+ int FPIndex = FI->getFramePointerSaveIndex();
+ assert(FPIndex && "No Frame Pointer Save Slot!");
+ FPOffset = MFI.getObjectOffset(FPIndex);
+ } else {
+ FPOffset = getFramePointerSaveOffset();
+ }
+ }
+
+ int BPOffset = 0;
+ if (HasBP) {
+ if (isSVR4ABI) {
+ int BPIndex = FI->getBasePointerSaveIndex();
+ assert(BPIndex && "No Base Pointer Save Slot!");
+ BPOffset = MFI.getObjectOffset(BPIndex);
+ } else {
+ BPOffset = getBasePointerSaveOffset();
+ }
+ }
+
+ int PBPOffset = 0;
+ if (FI->usesPICBase()) {
+ int PBPIndex = FI->getPICBasePointerSaveIndex();
+ assert(PBPIndex && "No PIC Base Pointer Save Slot!");
+ PBPOffset = MFI.getObjectOffset(PBPIndex);
+ }
+
+ bool IsReturnBlock = (MBBI != MBB.end() && MBBI->isReturn());
+
+ if (IsReturnBlock) {
+ unsigned RetOpcode = MBBI->getOpcode();
+ bool UsesTCRet = RetOpcode == PPC::TCRETURNri ||
+ RetOpcode == PPC::TCRETURNdi ||
+ RetOpcode == PPC::TCRETURNai ||
+ RetOpcode == PPC::TCRETURNri8 ||
+ RetOpcode == PPC::TCRETURNdi8 ||
+ RetOpcode == PPC::TCRETURNai8;
+
+ if (UsesTCRet) {
+ int MaxTCRetDelta = FI->getTailCallSPDelta();
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int Delta = StackAdj - MaxTCRetDelta;
+ assert((Delta >= 0) && "Delta must be positive");
+ if (MaxTCRetDelta>0)
+ FrameSize += (StackAdj +Delta);
+ else
+ FrameSize += StackAdj;
+ }
+ }
+
+ // Frames of 32KB & larger require special handling because they cannot be
+ // indexed into with a simple LD/LWZ immediate offset operand.
+ bool isLargeFrame = !isInt<16>(FrameSize);
+
+ // On targets without red zone, the SP needs to be restored last, so that
+ // all live contents of the stack frame are upwards of the SP. This means
+ // that we cannot restore SP just now, since there may be more registers
+ // to restore from the stack frame (e.g. R31). If the frame size is not
+ // a simple immediate value, we will need a spare register to hold the
+ // restored SP. If the frame size is known and small, we can simply adjust
+ // the offsets of the registers to be restored, and still use SP to restore
+ // them. In such case, the final update of SP will be to add the frame
+ // size to it.
+ // To simplify the code, set RBReg to the base register used to restore
+ // values from the stack, and set SPAdd to the value that needs to be added
+ // to the SP at the end. The default values are as if red zone was present.
+ unsigned RBReg = SPReg;
+ unsigned SPAdd = 0;
+
+ if (FrameSize) {
+ // In the prologue, the loaded (or persistent) stack pointer value is
+ // offset by the STDU/STDUX/STWU/STWUX instruction. For targets with red
+ // zone add this offset back now.
+
+ // If this function contained a fastcc call and GuaranteedTailCallOpt is
+ // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
+ // call which invalidates the stack pointer value in SP(0). So we use the
+ // value of R31 in this case.
+ if (FI->hasFastCall()) {
+ assert(HasFP && "Expecting a valid frame pointer.");
+ if (!HasRedZone)
+ RBReg = FPReg;
+ if (!isLargeFrame) {
+ BuildMI(MBB, MBBI, dl, AddImmInst, RBReg)
+ .addReg(FPReg).addImm(FrameSize);
+ } else {
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+ .addImm(FrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(FrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, AddInst)
+ .addReg(RBReg)
+ .addReg(FPReg)
+ .addReg(ScratchReg);
+ }
+ } else if (!isLargeFrame && !HasBP && !MFI.hasVarSizedObjects()) {
+ if (HasRedZone) {
+ BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+ .addReg(SPReg)
+ .addImm(FrameSize);
+ } else {
+ // Make sure that adding FrameSize will not overflow the max offset
+ // size.
+ assert(FPOffset <= 0 && BPOffset <= 0 && PBPOffset <= 0 &&
+ "Local offsets should be negative");
+ SPAdd = FrameSize;
+ FPOffset += FrameSize;
+ BPOffset += FrameSize;
+ PBPOffset += FrameSize;
+ }
+ } else {
+ // We don't want to use ScratchReg as a base register, because it
+ // could happen to be R0. Use FP instead, but make sure to preserve it.
+ if (!HasRedZone) {
+ // If FP is not saved, copy it to ScratchReg.
+ if (!HasFP)
+ BuildMI(MBB, MBBI, dl, OrInst, ScratchReg)
+ .addReg(FPReg)
+ .addReg(FPReg);
+ RBReg = FPReg;
+ }
+ BuildMI(MBB, MBBI, dl, LoadInst, RBReg)
+ .addImm(0)
+ .addReg(SPReg);
+ }
+ }
+ assert(RBReg != ScratchReg && "Should have avoided ScratchReg");
+ // If there is no red zone, ScratchReg may be needed for holding a useful
+ // value (although not the base register). Make sure it is not overwritten
+ // too early.
+
+ assert((isPPC64 || !MustSaveCR) &&
+ "Epilogue CR restoring supported only in 64-bit mode");
+
+ // If we need to restore both the LR and the CR and we only have one
+ // available scratch register, we must do them one at a time.
+ if (MustSaveCR && SingleScratchReg && MustSaveLR) {
+ // Here TempReg == ScratchReg, and in the absence of red zone ScratchReg
+ // is live here.
+ assert(HasRedZone && "Expecting red zone");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
+ .addImm(8)
+ .addReg(SPReg);
+ for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+ .addReg(TempReg, getKillRegState(i == e-1));
+ }
+
+ // Delay restoring of the LR if ScratchReg is needed. This is ok, since
+ // LR is stored in the caller's stack frame. ScratchReg will be needed
+ // if RBReg is anything other than SP. We shouldn't use ScratchReg as
+ // a base register anyway, because it may happen to be R0.
+ bool LoadedLR = false;
+ if (MustSaveLR && RBReg == SPReg && isInt<16>(LROffset+SPAdd)) {
+ BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+ .addImm(LROffset+SPAdd)
+ .addReg(RBReg);
+ LoadedLR = true;
+ }
+
+ if (MustSaveCR && !(SingleScratchReg && MustSaveLR)) {
+ // This will only occur for PPC64.
+ assert(isPPC64 && "Expecting 64-bit mode");
+ assert(RBReg == SPReg && "Should be using SP as a base register");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
+ .addImm(8)
+ .addReg(RBReg);
+ }
+
+ if (HasFP) {
+ // If there is red zone, restore FP directly, since SP has already been
+ // restored. Otherwise, restore the value of FP into ScratchReg.
+ if (HasRedZone || RBReg == SPReg)
+ BuildMI(MBB, MBBI, dl, LoadInst, FPReg)
+ .addImm(FPOffset)
+ .addReg(SPReg);
+ else
+ BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+ .addImm(FPOffset)
+ .addReg(RBReg);
+ }
+
+ if (FI->usesPICBase())
+ BuildMI(MBB, MBBI, dl, LoadInst)
+ .addReg(PPC::R30)
+ .addImm(PBPOffset)
+ .addReg(RBReg);
+
+ if (HasBP)
+ BuildMI(MBB, MBBI, dl, LoadInst, BPReg)
+ .addImm(BPOffset)
+ .addReg(RBReg);
+
+ // There is nothing more to be loaded from the stack, so now we can
+ // restore SP: SP = RBReg + SPAdd.
+ if (RBReg != SPReg || SPAdd != 0) {
+ assert(!HasRedZone && "This should not happen with red zone");
+ // If SPAdd is 0, generate a copy.
+ if (SPAdd == 0)
+ BuildMI(MBB, MBBI, dl, OrInst, SPReg)
+ .addReg(RBReg)
+ .addReg(RBReg);
+ else
+ BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+ .addReg(RBReg)
+ .addImm(SPAdd);
+
+ assert(RBReg != ScratchReg && "Should be using FP or SP as base register");
+ if (RBReg == FPReg)
+ BuildMI(MBB, MBBI, dl, OrInst, FPReg)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg);
+
+ // Now load the LR from the caller's stack frame.
+ if (MustSaveLR && !LoadedLR)
+ BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+ .addImm(LROffset)
+ .addReg(SPReg);
+ }
+
+ if (MustSaveCR &&
+ !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64
+ for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+ .addReg(TempReg, getKillRegState(i == e-1));
+
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg);
+
+ // Callee pop calling convention. Pop parameter/linkage area. Used for tail
+ // call optimization
+ if (IsReturnBlock) {
+ unsigned RetOpcode = MBBI->getOpcode();
+ if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+ (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) &&
+ MF.getFunction()->getCallingConv() == CallingConv::Fast) {
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ unsigned CallerAllocatedAmt = FI->getMinReservedArea();
+
+ if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) {
+ BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+ .addReg(SPReg).addImm(CallerAllocatedAmt);
+ } else {
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+ .addImm(CallerAllocatedAmt >> 16);
+ BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(CallerAllocatedAmt & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, AddInst)
+ .addReg(SPReg)
+ .addReg(FPReg)
+ .addReg(ScratchReg);
+ }
+ } else {
+ createTailCallBranchInstr(MBB);
+ }
+ }
+}
+
+/// Emit the real tail-call branch for a block terminated by a PPC::TCRETURN*
+/// pseudo instruction.
+///
+/// The opcode of the block's first terminator selects the branch that is
+/// built: TCRETURNdi/di8 produce TAILB/TAILB8 to the pseudo's global-address
+/// operand, TCRETURNri/ri8 produce TAILBCTR/TAILBCTR8, and TCRETURNai/ai8
+/// produce TAILBA/TAILBA8 to the pseudo's immediate operand. The branch is
+/// inserted in front of the last non-debug instruction of the block. Blocks
+/// whose first terminator is anything else are left untouched.
+///
+/// \param[in] MBB The block to inspect and, if needed, extend.
+void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+
+  // Without a terminator there is no TCRETURN* pseudo to lower, and the
+  // iterator must not be dereferenced.
+  if (MBBI == MBB.end())
+    return;
+
+  DebugLoc dl = MBBI->getDebugLoc();
+
+  const PPCInstrInfo &TII =
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+
+  // Create branch instruction for pseudo tail call return instruction
+  unsigned RetOpcode = MBBI->getOpcode();
+  switch (RetOpcode) {
+  case PPC::TCRETURNdi:
+  case PPC::TCRETURNdi8: {
+    // Direct tail call to a global address.
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl,
+            TII.get(RetOpcode == PPC::TCRETURNdi ? PPC::TAILB : PPC::TAILB8))
+        .addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+    break;
+  }
+  case PPC::TCRETURNri:
+  case PPC::TCRETURNri8: {
+    // Indirect tail call; the branch itself takes no explicit operand.
+    MBBI = MBB.getLastNonDebugInstr();
+    assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+    BuildMI(MBB, MBBI, dl,
+            TII.get(RetOpcode == PPC::TCRETURNri ? PPC::TAILBCTR
+                                                 : PPC::TAILBCTR8));
+    break;
+  }
+  case PPC::TCRETURNai:
+  case PPC::TCRETURNai8: {
+    // Tail call to an immediate target.
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl,
+            TII.get(RetOpcode == PPC::TCRETURNai ? PPC::TAILBA : PPC::TAILBA8))
+        .addImm(JumpTarget.getImm());
+    break;
+  }
+  default:
+    // Not a tail-call return pseudo: nothing to emit.
+    break;
+  }
+}
+
+/// Decide which callee-saved registers need spill slots and create the fixed
+/// stack objects that the prologue/epilogue code relies on.
+///
+/// On top of the generic TargetFrameLowering computation this records whether
+/// LR must be saved, creates the frame pointer, base pointer, PIC base,
+/// tail-call and (32-bit SVR4) CR save objects, and clears from \p SavedRegs
+/// the registers that are handled through those dedicated slots.
+void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                            BitVector &SavedRegs,
+                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  const PPCRegisterInfo *PPCRegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  // Save and clear the LR state.
+  unsigned LinkReg = PPCRegInfo->getRARegister();
+  FuncInfo->setMustSaveLR(MustSaveLR(MF, LinkReg));
+  SavedRegs.reset(LinkReg);
+
+  const bool Is64Bit = Subtarget.isPPC64();
+  const bool IsDarwin = Subtarget.isDarwinABI();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+  // Save R31 if necessary: create the frame pointer save slot unless its
+  // frame index has already been defined.
+  int FPSaveIdx = FuncInfo->getFramePointerSaveIndex();
+  if (!FPSaveIdx && needsFP(MF)) {
+    // Fixed offset of the frame pointer save area.
+    int FPSaveOffset = getFramePointerSaveOffset();
+    FPSaveIdx =
+        FrameInfo.CreateFixedObject(Is64Bit ? 8 : 4, FPSaveOffset, true);
+    FuncInfo->setFramePointerSaveIndex(FPSaveIdx);
+  }
+
+  // Same for the base pointer save area.
+  int BPSaveIdx = FuncInfo->getBasePointerSaveIndex();
+  if (!BPSaveIdx && PPCRegInfo->hasBasePointer(MF)) {
+    int BPSaveOffset = getBasePointerSaveOffset();
+    BPSaveIdx =
+        FrameInfo.CreateFixedObject(Is64Bit ? 8 : 4, BPSaveOffset, true);
+    FuncInfo->setBasePointerSaveIndex(BPSaveIdx);
+  }
+
+  // Reserve stack space for the PIC Base register (R30).
+  // Only used in SVR4 32-bit.
+  if (FuncInfo->usesPICBase()) {
+    int PICBaseSaveIdx = FrameInfo.CreateFixedObject(4, -8, true);
+    FuncInfo->setPICBasePointerSaveIndex(PICBaseSaveIdx);
+  }
+
+  // r31 must not be spilled explicitly (for example because some inline asm
+  // explicitly clobbers it) when we otherwise have a frame pointer and are
+  // using r31's spill slot for the prologue/epilogue code. The same applies
+  // to the base pointer and the PIC base register.
+  if (needsFP(MF))
+    SavedRegs.reset(Is64Bit ? PPC::X31 : PPC::R31);
+  if (PPCRegInfo->hasBasePointer(MF))
+    SavedRegs.reset(PPCRegInfo->getBaseRegister(MF));
+  if (FuncInfo->usesPICBase())
+    SavedRegs.reset(PPC::R30);
+
+  // Reserve stack space to move the linkage area to in case of a tail call.
+  if (MF.getTarget().Options.GuaranteedTailCallOpt) {
+    int TailCallDelta = FuncInfo->getTailCallSPDelta();
+    if (TailCallDelta < 0)
+      FrameInfo.CreateFixedObject(-1 * TailCallDelta, TailCallDelta, true);
+  }
+
+  // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the
+  // function uses CR 2, 3, or 4.
+  const bool UsesNonvolatileCR = SavedRegs.test(PPC::CR2) ||
+                                 SavedRegs.test(PPC::CR3) ||
+                                 SavedRegs.test(PPC::CR4);
+  if (!Is64Bit && !IsDarwin && UsesNonvolatileCR) {
+    int CRSaveIdx = FrameInfo.CreateFixedObject(static_cast<uint64_t>(4),
+                                                static_cast<int64_t>(-4), true);
+    FuncInfo->setCRSpillFrameIndex(CRSaveIdx);
+  }
+}
+
+void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const { // Assigns final offsets to the callee-saved spill slots (SVR4 only) and adds scavenging slots.
+ // Early exit if not using the SVR4 ABI: only the scavenging spill slot is added.
+ if (!Subtarget.isSVR4ABI()) {
+ addScavengingSpillSlot(MF, RS);
+ return;
+ }
+
+ // Get callee saved register information.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+ // If the function is shrink-wrapped, and if the function has a tail call, the
+ // tail call might not be in the new RestoreBlock, so real branch instruction
+ // won't be generated by emitEpilogue(), because shrink-wrap has chosen new
+ // RestoreBlock. So we handle this case here.
+ if (MFI.getSavePoint() && MFI.hasTailCall()) {
+ MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.isReturnBlock() && (&MBB) != RestoreBlock)
+ createTailCallBranchInstr(MBB);
+ }
+ }
+
+ // Early exit if no callee saved registers are modified and no frame pointer is needed!
+ if (CSI.empty() && !needsFP(MF)) {
+ addScavengingSpillSlot(MF, RS);
+ return;
+ }
+
+ unsigned MinGPR = PPC::R31; // Lowest GPRC register found in CSI so far.
+ unsigned MinG8R = PPC::X31; // Lowest G8RC register found in CSI so far.
+ unsigned MinFPR = PPC::F31; // Lowest F8RC register found in CSI so far.
+ unsigned MinVR = PPC::V31; // Lowest VRRC register found in CSI so far.
+
+ bool HasGPSaveArea = false;
+ bool HasG8SaveArea = false;
+ bool HasFPSaveArea = false;
+ bool HasVRSAVESaveArea = false;
+ bool HasVRSaveArea = false;
+
+ SmallVector<CalleeSavedInfo, 18> GPRegs;
+ SmallVector<CalleeSavedInfo, 18> G8Regs;
+ SmallVector<CalleeSavedInfo, 18> FPRegs;
+ SmallVector<CalleeSavedInfo, 18> VRegs;
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) { // Bucket the callee-saved registers by register class.
+ unsigned Reg = CSI[i].getReg();
+ if (PPC::GPRCRegClass.contains(Reg)) {
+ HasGPSaveArea = true;
+
+ GPRegs.push_back(CSI[i]);
+
+ if (Reg < MinGPR) {
+ MinGPR = Reg;
+ }
+ } else if (PPC::G8RCRegClass.contains(Reg)) {
+ HasG8SaveArea = true;
+
+ G8Regs.push_back(CSI[i]);
+
+ if (Reg < MinG8R) {
+ MinG8R = Reg;
+ }
+ } else if (PPC::F8RCRegClass.contains(Reg)) {
+ HasFPSaveArea = true;
+
+ FPRegs.push_back(CSI[i]);
+
+ if (Reg < MinFPR) {
+ MinFPR = Reg;
+ }
+ } else if (PPC::CRBITRCRegClass.contains(Reg) ||
+ PPC::CRRCRegClass.contains(Reg)) {
+ ; // do nothing, as we already know whether CRs are spilled
+ } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
+ HasVRSAVESaveArea = true;
+ } else if (PPC::VRRCRegClass.contains(Reg)) {
+ HasVRSaveArea = true;
+
+ VRegs.push_back(CSI[i]);
+
+ if (Reg < MinVR) {
+ MinVR = Reg;
+ }
+ } else {
+ llvm_unreachable("Unknown RegisterClass!");
+ }
+ }
+
+ PPCFunctionInfo *PFI = MF.getInfo<PPCFunctionInfo>();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ int64_t LowerBound = 0; // Offset added to the slots of the next save area; it only moves downward as areas are laid out.
+
+ // Take into account stack space reserved for tail calls.
+ int TCSPDelta = 0;
+ if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+ (TCSPDelta = PFI->getTailCallSPDelta()) < 0) {
+ LowerBound = TCSPDelta;
+ }
+
+ // The Floating-point register save area is right below the back chain word
+ // of the previous stack frame.
+ if (HasFPSaveArea) {
+ for (unsigned i = 0, e = FPRegs.size(); i != e; ++i) {
+ int FI = FPRegs[i].getFrameIdx();
+
+ MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ }
+
+ LowerBound -= (31 - TRI->getEncodingValue(MinFPR) + 1) * 8; // 8 bytes for each register from MinFPR through F31.
+ }
+
+ // Check whether the frame pointer register is allocated. If so, make sure it
+ // is spilled to the correct offset.
+ if (needsFP(MF)) {
+ HasGPSaveArea = true;
+
+ int FI = PFI->getFramePointerSaveIndex();
+ assert(FI && "No Frame Pointer Save Slot!");
+
+ MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ }
+
+ if (PFI->usesPICBase()) { // Same adjustment for the PIC base pointer save slot.
+ HasGPSaveArea = true;
+
+ int FI = PFI->getPICBasePointerSaveIndex();
+ assert(FI && "No PIC Base Pointer Save Slot!");
+
+ MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ }
+
+ const PPCRegisterInfo *RegInfo =
+ static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ if (RegInfo->hasBasePointer(MF)) { // Same adjustment for the base pointer save slot.
+ HasGPSaveArea = true;
+
+ int FI = PFI->getBasePointerSaveIndex();
+ assert(FI && "No Base Pointer Save Slot!");
+
+ MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ }
+
+ // General register save area starts right below the Floating-point
+ // register save area.
+ if (HasGPSaveArea || HasG8SaveArea) {
+ // Move general register save area spill slots down, taking into account
+ // the size of the Floating-point register save area.
+ for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) {
+ int FI = GPRegs[i].getFrameIdx();
+
+ MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ }
+
+ // Likewise move the G8RC (64-bit general register) spill slots down by
+ // the same amount.
+ for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) {
+ int FI = G8Regs[i].getFrameIdx();
+
+ MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ }
+
+ unsigned MinReg =
+ std::min<unsigned>(TRI->getEncodingValue(MinGPR),
+ TRI->getEncodingValue(MinG8R));
+
+ if (Subtarget.isPPC64()) { // The area spans MinReg through register 31: 8 bytes each on PPC64, 4 otherwise.
+ LowerBound -= (31 - MinReg + 1) * 8;
+ } else {
+ LowerBound -= (31 - MinReg + 1) * 4;
+ }
+ }
+
+ // For 32-bit only, the CR save area is below the general register
+ // save area. For 64-bit SVR4, the CR save area is addressed relative
+ // to the stack pointer and hence does not need an adjustment here.
+ // Only CR2 (the first nonvolatile spilled) has an associated frame
+ // index so that we have a single uniform save area.
+ if (spillsCR(MF) && !(Subtarget.isPPC64() && Subtarget.isSVR4ABI())) {
+ // Adjust the frame index of the CR spill slot.
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+
+ if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2)
+ // Leave Darwin logic as-is.
+ || (!Subtarget.isSVR4ABI() &&
+ (PPC::CRBITRCRegClass.contains(Reg) ||
+ PPC::CRRCRegClass.contains(Reg)))) {
+ int FI = CSI[i].getFrameIdx();
+
+ MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ }
+ }
+
+ LowerBound -= 4; // The CR save area is always 4 bytes long.
+ }
+
+ if (HasVRSAVESaveArea) {
+ // FIXME SVR4: Is it actually possible to have multiple elements in CSI
+ // which have the VRSAVE register class?
+ // Adjust the frame index of the VRSAVE spill slot.
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+
+ if (PPC::VRSAVERCRegClass.contains(Reg)) {
+ int FI = CSI[i].getFrameIdx();
+
+ MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ }
+ }
+
+ LowerBound -= 4; // The VRSAVE save area is always 4 bytes long.
+ }
+
+ if (HasVRSaveArea) {
+ // Insert alignment padding, we need 16-byte alignment (LowerBound is rounded downward to a multiple of 16).
+ LowerBound = (LowerBound - 15) & ~(15);
+
+ for (unsigned i = 0, e = VRegs.size(); i != e; ++i) {
+ int FI = VRegs[i].getFrameIdx();
+
+ MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ }
+ }
+
+ addScavengingSpillSlot(MF, RS);
+}
+
+/// Register emergency spill slots with the register scavenger when the
+/// function may need them.
+///
+/// A slot is reserved when the function has variable-sized objects, spills CR
+/// or VRSAVE, has non-RI spills, or has spills together with an estimated
+/// frame size that does not fit a signed 16-bit immediate. CR/VRSAVE spills
+/// and over-aligned dynamic allocations get a second slot because they might
+/// need two registers.
+void
+PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
+                                         RegScavenger *RS) const {
+  // The complete frame size is not known yet (callee-saved register spills
+  // and alignment padding have not been computed), so work from an estimate.
+  // The scavenger might be invoked whenever a frame offset does not fit into
+  // the 16-bit immediate.
+  unsigned EstimatedSize = determineFrameLayout(MF, false, true);
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+  const bool NeedsSlot =
+      FrameInfo.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) ||
+      hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(EstimatedSize));
+  if (!NeedsSlot)
+    return;
+
+  // The slot holds one general purpose register of the subtarget's width.
+  const TargetRegisterClass *Narrow = &PPC::GPRCRegClass;
+  const TargetRegisterClass *Wide = &PPC::G8RCRegClass;
+  const TargetRegisterClass *SlotRC = Subtarget.isPPC64() ? Wide : Narrow;
+
+  auto ReserveSlot = [&]() {
+    RS->addScavengingFrameIndex(FrameInfo.CreateStackObject(
+        SlotRC->getSize(), SlotRC->getAlignment(), false));
+  };
+
+  ReserveSlot();
+
+  // Might we have over-aligned allocas?
+  const bool OverAlignedAllocas =
+      FrameInfo.hasVarSizedObjects() &&
+      FrameInfo.getMaxAlignment() > getStackAlignment();
+
+  // These kinds of spills might need two registers.
+  if (spillsCR(MF) || spillsVRSAVE(MF) || OverAlignedAllocas)
+    ReserveSlot();
+}
+
+/// Emit the spill code for the callee-saved registers listed in \p CSI in
+/// front of \p MI.
+///
+/// Only the SVR4 32- and 64-bit ABIs are handled here.
+///
+/// \return false when the ABI is not SVR4 (so the pre-existing behavior is
+///         kept), true once all spills have been emitted.
+bool
+PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     const std::vector<CalleeSavedInfo> &CSI,
+                                     const TargetRegisterInfo *TRI) const {
+  if (!Subtarget.isSVR4ABI())
+    return false;
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII =
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+  DebugLoc DL;
+
+  // On 32-bit targets a single MFCR covers every spilled CR field; it is
+  // created for the first field and later fields are attached to it.
+  bool EmittedCRMove = false;
+  MachineInstrBuilder CRMove;
+
+  for (const CalleeSavedInfo &Info : CSI) {
+    unsigned Reg = Info.getReg();
+
+    // Only Darwin actually uses the VRSAVE register, but it can still appear
+    // here if, for example, @llvm.eh.unwind.init() is used. If we're not on
+    // Darwin, ignore it.
+    if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+      continue;
+
+    // CR2 through CR4 are the nonvolatile CR fields.
+    const bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
+
+    // Add the callee-saved register as live-in; it's killed at the spill.
+    MBB.addLiveIn(Reg);
+
+    if (!IsCRField) {
+      // Ordinary register: store it to its own stack slot.
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(MBB, MI, Reg, true, Info.getFrameIdx(), RC, TRI);
+      continue;
+    }
+
+    if (EmittedCRMove) {
+      // The MFCR already exists; just record that it kills this field too.
+      CRMove.addReg(Reg, RegState::ImplicitKill);
+      continue;
+    }
+
+    PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+    if (Subtarget.isPPC64()) {
+      // The actual spill will happen at the start of the prologue.
+      FuncInfo->addMustSaveCR(Reg);
+      continue;
+    }
+
+    EmittedCRMove = true;
+    FuncInfo->setSpillsCR();
+
+    // 32-bit: FP-relative. Note that we made sure CR2-CR4 all have
+    // the same frame index in PPCRegisterInfo::hasReservedSpillSlot.
+    CRMove = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12)
+                 .addReg(Reg, RegState::ImplicitKill);
+    MBB.insert(MI, CRMove);
+
+    MachineInstrBuilder StoreCR = BuildMI(*MF, DL, TII.get(PPC::STW))
+                                      .addReg(PPC::R12, getKillRegState(true));
+    MBB.insert(MI, addFrameReference(StoreCR, Info.getFrameIdx()));
+  }
+  return true;
+}
+
+/// Reload the saved CR word into R12 and move it back into each nonvolatile
+/// CR field that was spilled (32-bit targets only).
+///
+/// Nothing is emitted when \p isPPC64 is set, because 64-bit CR restoring is
+/// handled during epilogue generation. \p is31 is not used. The load reads
+/// from the frame index of \p CSI[CSIIndex], and the instructions are inserted
+/// in front of \p MI.
+static void
+restoreCRs(bool isPPC64, bool is31,
+           bool CR2Spilled, bool CR3Spilled, bool CR4Spilled,
+           MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+           const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
+  // This is handled during epilogue generation.
+  if (isPPC64)
+    return;
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII = *MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+  DebugLoc DL;
+
+  const unsigned MoveToCR = PPC::MTOCRF;
+  const unsigned Scratch = PPC::R12;
+
+  // 32-bit: FP-relative
+  MachineInstrBuilder Reload = BuildMI(*MF, DL, TII.get(PPC::LWZ), PPC::R12);
+  MBB.insert(MI, addFrameReference(Reload, CSI[CSIIndex].getFrameIdx()));
+
+  // The scratch register is killed by the last move that is emitted.
+  const bool FieldAfterCR2 = CR3Spilled || CR4Spilled;
+  const bool FieldAfterCR3 = CR4Spilled;
+
+  if (CR2Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(MoveToCR), PPC::CR2)
+                       .addReg(Scratch, getKillRegState(!FieldAfterCR2)));
+
+  if (CR3Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(MoveToCR), PPC::CR3)
+                       .addReg(Scratch, getKillRegState(!FieldAfterCR3)));
+
+  if (CR4Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(MoveToCR), PPC::CR4)
+                       .addReg(Scratch, getKillRegState(true)));
+}
+
+/// Remove an ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudo instruction.
+///
+/// With guaranteed tail call optimization enabled, an ADJCALLSTACKUP whose
+/// second operand is non-zero first gets a stack pointer adjustment by the
+/// negated amount emitted in front of it.
+///
+/// \return the iterator returned by erasing \p I from \p MBB.
+MachineBasicBlock::iterator PPCFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+
+  // Amount the callee popped on return; stays zero when no fix-up is needed.
+  int CalleeAmt = 0;
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      I->getOpcode() == PPC::ADJCALLSTACKUP)
+    CalleeAmt = I->getOperand(1).getImm();
+
+  if (CalleeAmt != 0) {
+    // Add (actually subtract) back the amount the callee popped on return.
+    const int Adjustment = -CalleeAmt;
+    const bool Is64 = Subtarget.isPPC64();
+    const unsigned SP = Is64 ? PPC::X1 : PPC::R1;
+    const unsigned Tmp = Is64 ? PPC::X0 : PPC::R0;
+    const DebugLoc &dl = I->getDebugLoc();
+
+    if (isInt<16>(Adjustment)) {
+      // The amount fits the immediate field of a single add-immediate.
+      BuildMI(MBB, I, dl, TII.get(Is64 ? PPC::ADDI8 : PPC::ADDI), SP)
+          .addReg(SP, RegState::Kill)
+          .addImm(Adjustment);
+    } else {
+      // Materialize the amount in the temporary register (high half, then
+      // low half) and add it to the stack pointer.
+      BuildMI(MBB, I, dl, TII.get(Is64 ? PPC::LIS8 : PPC::LIS), Tmp)
+          .addImm(Adjustment >> 16);
+      BuildMI(MBB, I, dl, TII.get(Is64 ? PPC::ORI8 : PPC::ORI), Tmp)
+          .addReg(Tmp, RegState::Kill)
+          .addImm(Adjustment & 0xFFFF);
+      BuildMI(MBB, I, dl, TII.get(Is64 ? PPC::ADD8 : PPC::ADD4), SP)
+          .addReg(SP, RegState::Kill)
+          .addReg(Tmp);
+    }
+  }
+
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  return MBB.erase(I);
+}
+
+bool
+PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const { // Emits reloads for the registers in CSI; returns false (nothing emitted) for non-SVR4 ABIs, true otherwise.
+
+ // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+ // Return false otherwise to maintain pre-existing behavior.
+ if (!Subtarget.isSVR4ABI())
+ return false;
+
+ MachineFunction *MF = MBB.getParent();
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+ bool CR2Spilled = false;
+ bool CR3Spilled = false;
+ bool CR4Spilled = false;
+ unsigned CSIIndex = 0; // Index in CSI of the CR2 entry, whose frame index is used for the CR reload.
+
+ // Initialize insertion-point logic; we will be restoring in reverse
+ // order of spill. BeforeI remembers the instruction preceding MI (if any).
+ MachineBasicBlock::iterator I = MI, BeforeI = I;
+ bool AtStart = I == MBB.begin();
+
+ if (!AtStart)
+ --BeforeI;
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+
+ // Only Darwin actually uses the VRSAVE register, but it can still appear
+ // here if, for example, @llvm.eh.unwind.init() is used. If we're not on
+ // Darwin, ignore it.
+ if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+ continue;
+
+ if (Reg == PPC::CR2) {
+ CR2Spilled = true;
+ // The spill slot is associated only with CR2, which is the
+ // first nonvolatile spilled. Save it here.
+ CSIIndex = i;
+ continue;
+ } else if (Reg == PPC::CR3) {
+ CR3Spilled = true;
+ continue;
+ } else if (Reg == PPC::CR4) {
+ CR4Spilled = true;
+ continue;
+ } else {
+ // When we first encounter a non-CR register after seeing at
+ // least one CR register, restore all spilled CRs together.
+ if ((CR2Spilled || CR3Spilled || CR4Spilled)
+ && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+ bool is31 = needsFP(*MF);
+ restoreCRs(Subtarget.isPPC64(), is31,
+ CR2Spilled, CR3Spilled, CR4Spilled,
+ MBB, I, CSI, CSIIndex);
+ CR2Spilled = CR3Spilled = CR4Spilled = false; // Reset so the same CRs are not restored again.
+ }
+
+ // Default behavior for non-CR saves.
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
+ RC, TRI);
+ assert(I != MBB.begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ }
+
+ // Insert in reverse order: reset I to just after BeforeI (or to the block start) so the next reload goes ahead of those already emitted.
+ if (AtStart)
+ I = MBB.begin();
+ else {
+ I = BeforeI;
+ ++I;
+ }
+ }
+
+ // If we haven't yet restored the CRs, do so now.
+ if (CR2Spilled || CR3Spilled || CR4Spilled) {
+ bool is31 = needsFP(*MF);
+ restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled,
+ MBB, I, CSI, CSIIndex);
+ }
+
+ return true;
+}
+
+/// Shrink wrapping is enabled only when the function's subtarget is both
+/// SVR4 ABI and PPC64.
+bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+  if (!MF.getSubtarget<PPCSubtarget>().isSVR4ABI())
+    return false;
+  return MF.getSubtarget<PPCSubtarget>().isPPC64();
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
new file mode 100644
index 000000000000..28b0c57f0ffb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -0,0 +1,149 @@
+//===-- PPCFrameLowering.h - Define frame lowering for PowerPC --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
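+//
+// This file declares the PPCFrameLowering class, the PowerPC implementation
+// of TargetFrameLowering.
+//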
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
+#define LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
+
+#include "PPC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class PPCSubtarget;
+
+class PPCFrameLowering: public TargetFrameLowering { // PowerPC-specific frame lowering: prologue/epilogue emission and callee-saved register handling.
+ const PPCSubtarget &Subtarget;
+ const unsigned ReturnSaveOffset; // Value returned by getReturnSaveOffset().
+ const unsigned TOCSaveOffset; // Value returned by getTOCSaveOffset().
+ const unsigned FramePointerSaveOffset; // Value returned by getFramePointerSaveOffset().
+ const unsigned LinkageSize; // Value returned by getLinkageSize().
+ const unsigned BasePointerSaveOffset; // Value returned by getBasePointerSaveOffset().
+
+ /**
+ * \brief Find register[s] that can be used in function prologue and epilogue
+ *
+ * Find register[s] that can be used as scratch register[s] in function
+ * prologue and epilogue to save various registers (Link Register, Base
+ * Pointer, etc.). Prefer R0/R12, if available. Otherwise choose whatever
+ * register[s] are available.
+ *
+ * This method will return true if it is able to find enough unique scratch
+ * registers (1 or 2 depending on the requirement). If it is unable to find
+ * enough available registers in the block, it will return false and set
+ * any passed output parameter that corresponds to a required unique register
+ * to PPC::NoRegister.
+ *
+ * \param[in] MBB The machine basic block to find an available register for
+ * \param[in] UseAtEnd Specify whether the scratch register will be used at
+ * the end of the basic block (i.e., will the scratch
+ * register kill a register defined in the basic block)
+ * \param[in] TwoUniqueRegsRequired Specify whether this basic block will
+ * require two unique scratch registers.
+ * \param[out] SR1 The scratch register to use
+ * \param[out] SR2 The second scratch register. If this pointer is not null
+ * the function will attempt to set it to an available
+ * register regardless of whether there is a hard requirement
+ * for two unique scratch registers.
+ * \return true if the required number of registers was found.
+ * false if the required number of scratch registers wasn't available.
+ * If either output parameter refers to a required scratch register
+ * that isn't available, it will be set to an invalid value.
+ */
+ bool findScratchRegister(MachineBasicBlock *MBB,
+ bool UseAtEnd,
+ bool TwoUniqueRegsRequired = false,
+ unsigned *SR1 = nullptr,
+ unsigned *SR2 = nullptr) const;
+ bool twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const; // NOTE(review): presumably supplies TwoUniqueRegsRequired for findScratchRegister - confirm in the .cpp.
+
+ /**
+ * \brief Create branch instruction for PPC::TCRETURN* (tail call return)
+ *
+ * \param[in] MBB The machine basic block that is terminated by PPC::TCRETURN*
+ */
+ void createTailCallBranchInstr(MachineBasicBlock &MBB) const;
+
+public:
+ PPCFrameLowering(const PPCSubtarget &STI);
+
+ unsigned determineFrameLayout(MachineFunction &MF,
+ bool UpdateMF = true,
+ bool UseEstimate = false) const;
+
+ /// emitPrologue/emitEpilogue - These methods insert prolog and epilog code
+ /// into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ bool hasFP(const MachineFunction &MF) const override;
+ bool needsFP(const MachineFunction &MF) const;
+ void replaceFPWithRealFP(MachineFunction &MF) const;
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS = nullptr) const override;
+ void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// targetHandlesStackFrameRounding - Returns true if the target is
+ /// responsible for rounding up the stack frame (probably at emitPrologue
+ /// time).
+ bool targetHandlesStackFrameRounding() const override { return true; }
+
+ /// getReturnSaveOffset - Return the previous frame offset to save the
+ /// return address.
+ unsigned getReturnSaveOffset() const { return ReturnSaveOffset; }
+
+ /// getTOCSaveOffset - Return the previous frame offset to save the
+ /// TOC register -- 64-bit SVR4 ABI only.
+ unsigned getTOCSaveOffset() const { return TOCSaveOffset; }
+
+ /// getFramePointerSaveOffset - Return the previous frame offset to save the
+ /// frame pointer.
+ unsigned getFramePointerSaveOffset() const { return FramePointerSaveOffset; }
+
+ /// getBasePointerSaveOffset - Return the previous frame offset to save the
+ /// base pointer.
+ unsigned getBasePointerSaveOffset() const { return BasePointerSaveOffset; }
+
+ /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
+ ///
+ unsigned getLinkageSize() const { return LinkageSize; }
+
+ const SpillSlot *
+ getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+
+ bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
+ /// Methods used by shrink wrapping to determine if MBB can be used for the
+ /// function prologue/epilogue.
+ bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
new file mode 100644
index 000000000000..f327396370f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -0,0 +1,436 @@
+//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCHazardRecognizers.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+ /// Returns true if \p SU is a load that has a memory-ordering (or barrier)
+ /// dependence on a store that is already a member of the current dispatch
+ /// group.  For historical reasons this also reports the "branch after CTR
+ /// set" hazard detected by isBCTRAfterSet().
+ bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) {
+   // FIXME: Move this.
+   if (isBCTRAfterSet(SU))
+     return true;
+
+   // Only instructions with a known descriptor that may load are candidates.
+   const MCInstrDesc *LoadDesc = DAG->getInstrDesc(SU);
+   if (!LoadDesc || !LoadDesc->mayLoad())
+     return false;
+
+   // Walk the predecessor edges looking for a store, ordered before this load
+   // by a memory or barrier dependence, that sits in the current group.
+   for (const SDep &Pred : SU->Preds) {
+     SUnit *PredSU = Pred.getSUnit();
+     const MCInstrDesc *PredDesc = DAG->getInstrDesc(PredSU);
+     if (!PredDesc || !PredDesc->mayStore())
+       continue;
+
+     if (!(Pred.isNormalMemory() || Pred.isBarrier()))
+       continue;
+
+     for (SUnit *Member : CurGroup)
+       if (Member == PredSU)
+         return true;
+   }
+
+   return false;
+ }
+
+ /// Returns true if \p SU is a branch with a non-control dependence on an
+ /// instruction of scheduling class IIC_SprMTSPR that is already a member of
+ /// the current dispatch group.
+ bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) {
+   const MCInstrDesc *BranchDesc = DAG->getInstrDesc(SU);
+   if (!BranchDesc || !BranchDesc->isBranch())
+     return false;
+
+   // Look through the predecessors for a move-to-SPR in this dispatch group
+   // that this branch depends on through something other than control flow.
+   for (const SDep &Pred : SU->Preds) {
+     SUnit *PredSU = Pred.getSUnit();
+     const MCInstrDesc *PredDesc = DAG->getInstrDesc(PredSU);
+     if (!PredDesc)
+       continue;
+     if (PredDesc->getSchedClass() != PPC::Sched::IIC_SprMTSPR)
+       continue;
+
+     if (Pred.isCtrl())
+       continue;
+
+     for (SUnit *Member : CurGroup)
+       if (Member == PredSU)
+         return true;
+   }
+
+   return false;
+ }
+
+// FIXME: Remove this when we don't need this:
+namespace llvm { namespace PPC { extern int getNonRecordFormOpcode(uint16_t); } }
+
+// FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific.
+
+ /// Determines how many dispatch slots the instruction described by \p MCID
+ /// occupies (written to \p NSlots) and returns true if it has to be the first
+ /// instruction of its dispatch group.
+ bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID,
+                                                        unsigned &NSlots) {
+   // FIXME: Indirectly, this information is contained in the itinerary, and
+   // we should derive it from there instead of separately specifying it
+   // here.
+   const unsigned SchedClass = MCID->getSchedClass();
+
+   // Step 1: slot count by itinerary class.
+   switch (SchedClass) {
+   case PPC::Sched::IIC_IntDivW:
+   case PPC::Sched::IIC_IntDivD:
+   case PPC::Sched::IIC_LdStLoadUpd:
+   case PPC::Sched::IIC_LdStLDU:
+   case PPC::Sched::IIC_LdStLFDU:
+   case PPC::Sched::IIC_LdStLFDUX:
+   case PPC::Sched::IIC_LdStLHA:
+   case PPC::Sched::IIC_LdStLHAU:
+   case PPC::Sched::IIC_LdStLWA:
+   case PPC::Sched::IIC_LdStSTDU:
+   case PPC::Sched::IIC_LdStSTFDU:
+     NSlots = 2;
+     break;
+   case PPC::Sched::IIC_LdStLoadUpdX:
+   case PPC::Sched::IIC_LdStLDUX:
+   case PPC::Sched::IIC_LdStLHAUX:
+   case PPC::Sched::IIC_LdStLWARX:
+   case PPC::Sched::IIC_LdStLDARX:
+   case PPC::Sched::IIC_LdStSTDUX:
+   case PPC::Sched::IIC_LdStSTDCX:
+   case PPC::Sched::IIC_LdStSTWCX:
+   case PPC::Sched::IIC_BrMCRX: // mtcr
+     // FIXME: Add sync/isync (here and in the itinerary).
+     NSlots = 4;
+     break;
+   default:
+     NSlots = 1;
+     break;
+   }
+
+   // Step 2: an otherwise single-slot instruction that has a non-record-form
+   // counterpart (i.e. is itself a record form) takes two slots.
+   // FIXME: record-form instructions need a different itinerary class.
+   if (NSlots == 1 && PPC::getNonRecordFormOpcode(MCID->getOpcode()) != -1)
+     NSlots = 2;
+
+   // Step 3: a few classes must lead the group regardless of their size.
+   switch (SchedClass) {
+   case PPC::Sched::IIC_BrCR: // cr logicals
+   case PPC::Sched::IIC_SprMFCR:
+   case PPC::Sched::IIC_SprMFCRF:
+   case PPC::Sched::IIC_SprMTSPR:
+     return true;
+   default:
+     // All multi-slot instructions must come first.
+     return NSlots > 1;
+   }
+ }
+
+ /// Reports NoopHazard when \p SU, considered with no stall cycles, would hit
+ /// the load-after-store check for the current dispatch group; every other
+ /// query is answered by the scoreboard base class.
+ ScheduleHazardRecognizer::HazardType
+ PPCDispatchGroupSBHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+   if (Stalls != 0 || !isLoadAfterStore(SU))
+     return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
+
+   return NoopHazard;
+ }
+
+ /// Asks the scheduler to prefer a different instruction when \p SU must be
+ /// first in its dispatch group but the current group already has occupied
+ /// slots; otherwise defers to the scoreboard base class.
+ bool PPCDispatchGroupSBHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+   if (const MCInstrDesc *Desc = DAG->getInstrDesc(SU)) {
+     unsigned SlotsNeeded; // Filled in by mustComeFirst; not needed here.
+     if (mustComeFirst(Desc, SlotsNeeded) && CurSlots != 0)
+       return true;
+   }
+
+   return ScoreboardHazardRecognizer::ShouldPreferAnother(SU);
+ }
+
+ /// Returns the number of no-ops to emit before \p SU so that a load hitting
+ /// the load-after-store check does not land in the current dispatch group.
+ unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) {
+   // We only need to fill out a maximum of 5 slots here: The 6th slot could
+   // only be a second branch, and otherwise the next instruction will start a
+   // new group.
+   if (!isLoadAfterStore(SU) || CurSlots >= 6)
+     return ScoreboardHazardRecognizer::PreEmitNoops(SU);
+
+   const unsigned Directive =
+       DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
+   switch (Directive) {
+   // If we're using a special group-terminating nop, then we need only one.
+   // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+   case PPC::DIR_PWR6:
+   case PPC::DIR_PWR7:
+   case PPC::DIR_PWR8:
+   case PPC::DIR_PWR9:
+     return 1;
+   default:
+     // Otherwise pad the group out with ordinary no-ops.
+     return 5 - CurSlots;
+   }
+ }
+
+ /// Updates the dispatch-group bookkeeping (CurGroup, CurSlots, CurBranches)
+ /// for the newly issued \p SU, then forwards to the scoreboard base class.
+ void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) {
+   if (const MCInstrDesc *Desc = DAG->getInstrDesc(SU)) {
+     const bool GroupIsFull = CurSlots == 5;
+     const bool IsSecondBranch = Desc->isBranch() && CurBranches == 1;
+
+     if (GroupIsFull || IsSecondBranch) {
+       // This instruction closes the current group; start with a clean slate.
+       CurGroup.clear();
+       CurSlots = CurBranches = 0;
+     } else {
+       DEBUG(dbgs() << "**** Adding to dispatch group: SU(" <<
+             SU->NodeNum << "): ");
+       DEBUG(DAG->dumpNode(SU));
+
+       unsigned SlotsNeeded;
+       const bool HasToLead = mustComeFirst(Desc, SlotsNeeded);
+
+       // If this instruction must come first, but does not, then it starts a
+       // new group.
+       if (HasToLead && CurSlots != 0) {
+         CurSlots = CurBranches = 0;
+         CurGroup.clear();
+       }
+
+       CurSlots += SlotsNeeded;
+       CurGroup.push_back(SU);
+
+       if (Desc->isBranch())
+         ++CurBranches;
+     }
+   }
+
+   ScoreboardHazardRecognizer::EmitInstruction(SU);
+ }
+
+ /// Advancing a cycle touches no dispatch-group state; it only forwards to the
+ /// scoreboard base class.
+ void PPCDispatchGroupSBHazardRecognizer::AdvanceCycle() {
+   ScoreboardHazardRecognizer::AdvanceCycle();
+ }
+
+ void PPCDispatchGroupSBHazardRecognizer::RecedeCycle() { // Bottom-up hook: must never be reached.
+ llvm_unreachable("Bottom-up scheduling not supported");
+ }
+
+ /// Discards the current dispatch group and its slot/branch counters, then
+ /// resets the scoreboard base class.
+ void PPCDispatchGroupSBHazardRecognizer::Reset() {
+   CurSlots = 0;
+   CurBranches = 0;
+   CurGroup.clear();
+   ScoreboardHazardRecognizer::Reset();
+ }
+
+ /// Accounts for an emitted no-op: either the no-op completes the current
+ /// dispatch group, or it takes one more slot in it.
+ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
+   const unsigned Directive =
+       DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
+
+   // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+   const bool HasGroupTerminatingNop =
+       Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
+       Directive == PPC::DIR_PWR8 || Directive == PPC::DIR_PWR9;
+
+   // If the group has now filled all of its slots, or if we're using a special
+   // group-terminating nop, the group is complete.
+   if (HasGroupTerminatingNop || CurSlots == 6) {
+     CurGroup.clear();
+     CurSlots = CurBranches = 0;
+     return;
+   }
+
+   // Otherwise the no-op is recorded as an anonymous (null) group member.
+   CurGroup.push_back(nullptr);
+   ++CurSlots;
+ }
+
+//===----------------------------------------------------------------------===//
+// PowerPC 970 Hazard Recognizer
+//
+// This models the dispatch group formation of the PPC970 processor. Dispatch
+// groups are bundles of up to five instructions that can contain various mixes
+// of instructions. The PPC970 can dispatch a peak of 4 non-branch and one
+// branch instruction per-cycle.
+//
+// There are a number of restrictions to dispatch group formation: some
+// instructions can only be issued in the first slot of a dispatch group, & some
+// instructions fill an entire dispatch group. Additionally, only branches can
+// issue in the 5th (last) slot.
+//
+// Finally, there are a number of "structural" hazards on the PPC970. These
+// conditions cause large performance penalties due to misprediction, recovery,
+// and replay logic that has to happen. These cases include setting a CTR and
+// branching through it in the same dispatch group, and storing to an address,
+// then loading from the same address within a dispatch group. To avoid these
+// conditions, we insert no-op instructions when appropriate.
+//
+// FIXME: This is missing some significant cases:
+// 1. Modeling of microcoded instructions.
+// 2. Handling of serialized operations.
+// 3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
+//
+
+ PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG)
+ : DAG(DAG) {
+ EndDispatchGroup(); // Initializes NumIssued, HasCTRSet and NumStores.
+ }
+
+ /// Resets all per-dispatch-group state so that the next instruction issued
+ /// begins a fresh group.
+ void PPCHazardRecognizer970::EndDispatchGroup() {
+   DEBUG(errs() << "=== Start of dispatch group\n");
+
+   // Structural hazard info.
+   NumStores = 0;
+   HasCTRSet = false;
+
+   // Nothing has been issued into the new group yet.
+   NumIssued = 0;
+ }
+
+
+ /// Looks up the instruction descriptor for \p Opcode and reports whether it
+ /// may load / may store, plus the PPC970_First, PPC970_Single and
+ /// PPC970_Cracked bits from its target-specific flags.  Returns the flags
+ /// masked with PPC970_Mask as the instruction's PPC970 unit.
+ PPCII::PPC970_Unit
+ PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
+                                      bool &isFirst, bool &isSingle,
+                                      bool &isCracked,
+                                      bool &isLoad, bool &isStore) {
+   const MCInstrDesc &Desc = DAG.TII->get(Opcode);
+   const uint64_t Flags = Desc.TSFlags;
+
+   // Memory behaviour comes straight from the descriptor.
+   isLoad = Desc.mayLoad();
+   isStore = Desc.mayStore();
+
+   // Dispatch restrictions are encoded in the target-specific flag bits.
+   isFirst = (Flags & PPCII::PPC970_First) != 0;
+   isSingle = (Flags & PPCII::PPC970_Single) != 0;
+   isCracked = (Flags & PPCII::PPC970_Cracked) != 0;
+
+   return static_cast<PPCII::PPC970_Unit>(Flags & PPCII::PPC970_Mask);
+ }
+
+ /// isLoadOfStoredAddress - Return true if the load described by
+ /// LoadValue/LoadOffset/LoadSize reads from a location written by one of the
+ /// stores recorded for the current dispatch group in
+ /// StoreValue/StoreOffset/StoreSize.
+ bool PPCHazardRecognizer970::
+ isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+                       const Value *LoadValue) const {
+   for (unsigned Idx = 0; Idx != NumStores; ++Idx) {
+     // A store through a different base value is never reported.
+     if (StoreValue[Idx] != LoadValue)
+       continue;
+
+     // Same base and same offset: exact match.
+     if (StoreOffset[Idx] == LoadOffset)
+       return true;
+
+     // Same base, different offsets: we have [c1+r] vs [c2+r].  Check whether
+     // the lower access runs into the start of the higher one (this happens
+     // during fp->int conversion for example).
+     if (StoreOffset[Idx] < LoadOffset) {
+       if (int64_t(StoreOffset[Idx] + StoreSize[Idx]) > LoadOffset)
+         return true;
+     } else if (int64_t(LoadOffset + LoadSize) > StoreOffset[Idx]) {
+       return true;
+     }
+   }
+
+   return false;
+ }
+
+ /// getHazardType - We return Hazard for any instruction that cannot be issued
+ /// into the dispatch group as it currently stands (it would have to start a
+ /// new group).  We return NoopHazard for any instruction that would fit in
+ /// the dispatch group but would cause a pipeline flush there: a BCTRL/BCTRL8
+ /// after the CTR register was set in this group, or a load from an address
+ /// that a store in this group wrote to.
+ ///
+ /// \param SU     the scheduling unit being considered for issue.
+ /// \param Stalls must be 0; scoreboard lookahead is not supported.
+ ScheduleHazardRecognizer::HazardType PPCHazardRecognizer970::
+ getHazardType(SUnit *SU, int Stalls) {
+   assert(Stalls == 0 && "PPC hazards don't support scoreboard lookahead");
+
+   MachineInstr *MI = SU->getInstr();
+
+   // Debug values are skipped; they never cause a hazard.
+   if (MI->isDebugValue())
+     return NoHazard;
+
+   unsigned Opcode = MI->getOpcode();
+   bool isFirst, isSingle, isCracked, isLoad, isStore;
+   PPCII::PPC970_Unit InstrType =
+     GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                  isLoad, isStore);
+   if (InstrType == PPCII::PPC970_Pseudo) return NoHazard;
+
+   // We can only issue a PPC970_First/PPC970_Single instruction (such as
+   // crand/mtspr/etc) if this is the first cycle of the dispatch group.
+   if (NumIssued != 0 && (isFirst || isSingle))
+     return Hazard;
+
+   // If this instruction is cracked into two ops by the decoder, we know that
+   // it is not a branch and that it cannot issue if 3 other instructions are
+   // already in the dispatch group.
+   if (isCracked && NumIssued > 2)
+     return Hazard;
+
+   switch (InstrType) {
+   default: llvm_unreachable("Unknown instruction type!");
+   case PPCII::PPC970_FXU:
+   case PPCII::PPC970_LSU:
+   case PPCII::PPC970_FPU:
+   case PPCII::PPC970_VALU:
+   case PPCII::PPC970_VPERM:
+     // We can only issue a branch as the last instruction in a group.
+     if (NumIssued == 4) return Hazard;
+     break;
+   case PPCII::PPC970_CRU:
+     // We can only issue a CR instruction in the first two slots.
+     if (NumIssued >= 2) return Hazard;
+     break;
+   case PPCII::PPC970_BRU:
+     break;
+   }
+
+   // Do not allow MTCTR and BCTRL to be in the same dispatch group.
+   // EmitInstruction() sets HasCTRSet for both MTCTR and MTCTR8, so the 64-bit
+   // BCTRL8 has to be checked here as well; otherwise the hazard would never
+   // be reported for 64-bit code.
+   if (HasCTRSet && (Opcode == PPC::BCTRL || Opcode == PPC::BCTRL8))
+     return NoopHazard;
+
+   // If this is a load following a store, make sure it's not to the same or
+   // overlapping address.  Only the first memory operand is inspected.
+   if (isLoad && NumStores && !MI->memoperands_empty()) {
+     MachineMemOperand *MO = *MI->memoperands_begin();
+     if (isLoadOfStoredAddress(MO->getSize(),
+                               MO->getOffset(), MO->getValue()))
+       return NoopHazard;
+   }
+
+   return NoHazard;
+ }
+
+ /// Records the effect of issuing \p SU on the current dispatch group: the CTR
+ /// flag, the store addresses seen, and the number of occupied slots.  Ends
+ /// the group once exactly five slots are accounted for.
+ void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
+   MachineInstr *Instr = SU->getInstr();
+
+   // Debug values are not tracked at all.
+   if (Instr->isDebugValue())
+     return;
+
+   const unsigned Opc = Instr->getOpcode();
+   bool isFirst, isSingle, isCracked, isLoad, isStore;
+   const PPCII::PPC970_Unit Unit =
+       GetInstrType(Opc, isFirst, isSingle, isCracked, isLoad, isStore);
+   if (Unit == PPCII::PPC970_Pseudo)
+     return;
+
+   // Update structural hazard information.
+   switch (Opc) {
+   case PPC::MTCTR:
+   case PPC::MTCTR8:
+     HasCTRSet = true;
+     break;
+   default:
+     break;
+   }
+
+   // Track the address stored to (first memory operand only, and at most four
+   // stores per group).
+   if (isStore && NumStores < 4 && !Instr->memoperands_empty()) {
+     MachineMemOperand *MemOp = *Instr->memoperands_begin();
+     const unsigned Slot = NumStores++;
+     StoreSize[Slot] = MemOp->getSize();
+     StoreOffset[Slot] = MemOp->getOffset();
+     StoreValue[Slot] = MemOp->getValue();
+   }
+
+   // A branch or a PPC970_Single instruction terminates the d-group: pretend
+   // four slots are already taken so this one lands in the last slot.
+   if (Unit == PPCII::PPC970_BRU || isSingle)
+     NumIssued = 4;
+
+   // If this instruction is cracked into two ops by the decoder, remember that
+   // we issued two pieces; otherwise it takes a single slot.
+   NumIssued += isCracked ? 2 : 1;
+
+   if (NumIssued == 5)
+     EndDispatchGroup();
+ }
+
+ /// An advanced cycle counts as one more issued slot in the current dispatch
+ /// group; the group ends when the fifth slot is reached.
+ void PPCHazardRecognizer970::AdvanceCycle() {
+   assert(NumIssued < 5 && "Illegal dispatch group!");
+   if (++NumIssued == 5)
+     EndDispatchGroup();
+ }
+
+ void PPCHazardRecognizer970::Reset() { // Resetting simply starts a fresh dispatch group.
+ EndDispatchGroup();
+ }
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
new file mode 100644
index 000000000000..4b502147ca63
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -0,0 +1,102 @@
+//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCHAZARDRECOGNIZERS_H
+#define LLVM_LIB_TARGET_POWERPC_PPCHAZARDRECOGNIZERS_H
+
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+
+/// PPCDispatchGroupSBHazardRecognizer - This class layers dispatch-group
+/// tracking for PPC ooo processors on top of the scoreboard hazard recognizer.
+class PPCDispatchGroupSBHazardRecognizer : public ScoreboardHazardRecognizer {
+ const ScheduleDAG *DAG;
+ SmallVector<SUnit *, 7> CurGroup; // Members of the current group; nullptr entries are no-ops.
+ unsigned CurSlots, CurBranches; // Slots used / branches seen in the current group.
+
+ bool isLoadAfterStore(SUnit *SU);
+ bool isBCTRAfterSet(SUnit *SU);
+ bool mustComeFirst(const MCInstrDesc *MCID, unsigned &NSlots);
+public:
+ PPCDispatchGroupSBHazardRecognizer(const InstrItineraryData *ItinData,
+ const ScheduleDAG *DAG_) :
+ ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_),
+ CurSlots(0), CurBranches(0) {}
+
+ HazardType getHazardType(SUnit *SU, int Stalls) override;
+ bool ShouldPreferAnother(SUnit* SU) override;
+ unsigned PreEmitNoops(SUnit *SU) override;
+ void EmitInstruction(SUnit *SU) override;
+ void AdvanceCycle() override;
+ void RecedeCycle() override;
+ void Reset() override;
+ void EmitNoop() override;
+};
+
+/// PPCHazardRecognizer970 - This class defines a finite state automaton that
+/// models the dispatch logic on the PowerPC 970 (aka G5) processor. This
+/// promotes good dispatch group formation and implements noop insertion to
+/// avoid structural hazards that cause significant performance penalties (e.g.
+/// setting the CTR register then branching through it within a dispatch group,
+/// or storing then loading from the same address within a dispatch group).
+class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
+ const ScheduleDAG &DAG;
+
+ unsigned NumIssued; // Number of insts issued, including advanced cycles.
+
+ // Various things that can cause a structural hazard.
+
+ // HasCTRSet - If the CTR register is set in this group, disallow BCTRL.
+ bool HasCTRSet;
+
+ // StoreValue/StoreOffset/StoreSize - Keep track of the address of any store.
+ // If we see a load from the same address (or one that overlaps it), disallow
+ // the load. We can have up to four stores in one dispatch group, hence we
+ // track up to 4.
+ //
+ // Only the first NumStores entries are meaningful; NumStores is zero when no
+ // store has been seen in the current dispatch group.
+ const Value *StoreValue[4];
+ int64_t StoreOffset[4];
+ uint64_t StoreSize[4];
+ unsigned NumStores;
+
+public:
+ PPCHazardRecognizer970(const ScheduleDAG &DAG);
+ HazardType getHazardType(SUnit *SU, int Stalls) override;
+ void EmitInstruction(SUnit *SU) override;
+ void AdvanceCycle() override;
+ void Reset() override;
+
+private:
+ /// EndDispatchGroup - Called when the current dispatch group is finished;
+ /// resets the per-group state so that a new group can begin.
+ void EndDispatchGroup();
+
+ /// GetInstrType - Classify the specified powerpc opcode according to its
+ /// pipeline.
+ PPCII::PPC970_Unit GetInstrType(unsigned Opcode,
+ bool &isFirst, bool &isSingle,bool &isCracked,
+ bool &isLoad, bool &isStore);
+
+ bool isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+ const Value *LoadValue) const;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
new file mode 100644
index 000000000000..1e51c1f651c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -0,0 +1,4529 @@
+//===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for PowerPC,
+// converting from a legalized dag to a PPC dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-codegen"
+
+// FIXME: Remove this option once the ANDI glue bug has been fixed!
+cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
+cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
+
+static cl::opt<bool>
+ UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true),
+ cl::desc("use aggressive ppc isel for bit permutations"),
+ cl::Hidden);
+static cl::opt<bool> BPermRewriterNoMasking(
+ "ppc-bit-perm-rewriter-stress-rotates",
+ cl::desc("stress rotate selection in aggressive ppc isel for "
+ "bit permutations"),
+ cl::Hidden);
+
+static cl::opt<bool> EnableBranchHint(
+ "ppc-use-branch-hint", cl::init(true),
+ cl::desc("Enable static hinting of branches on ppc"),
+ cl::Hidden);
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// PPCDAGToDAGISel - PPC specific code to select PPC machine
+ /// instructions for SelectionDAG operations.
+ ///
+ class PPCDAGToDAGISel : public SelectionDAGISel {
+ const PPCTargetMachine &TM;
+ const PPCSubtarget *PPCSubTarget; // Set per function in runOnMachineFunction.
+ const PPCTargetLowering *PPCLowering; // Set per function in runOnMachineFunction.
+ unsigned GlobalBaseReg; // 0 until getGlobalBaseReg() materializes the base register.
+ public:
+ explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
+ : SelectionDAGISel(tm), TM(tm) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Make sure we re-emit a set of the global base reg if necessary
+ GlobalBaseReg = 0;
+ PPCSubTarget = &MF.getSubtarget<PPCSubtarget>();
+ PPCLowering = PPCSubTarget->getTargetLowering();
+ SelectionDAGISel::runOnMachineFunction(MF);
+
+ if (!PPCSubTarget->isSVR4ABI()) // VRSAVE handling is only inserted for non-SVR4 ABIs.
+ InsertVRSaveCode(MF);
+
+ return true;
+ }
+
+ void PreprocessISelDAG() override;
+ void PostprocessISelDAG() override;
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+ }
+
+ /// getI64Imm - Return a target constant with the specified value, of type
+ /// i64.
+ inline SDValue getI64Imm(uint64_t Imm, const SDLoc &dl) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i64);
+ }
+
+ /// getSmallIPtrImm - Return a target constant of pointer type.
+ inline SDValue getSmallIPtrImm(unsigned Imm, const SDLoc &dl) {
+ return CurDAG->getTargetConstant(
+ Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout()));
+ }
+
+ /// isRotateAndMask - Returns true if Mask and Shift can be folded into a
+ /// single rotate-and-mask operation.
+ static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask,
+ unsigned &SH, unsigned &MB, unsigned &ME);
+
+ /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+ /// base register. Return the virtual register that holds this value.
+ SDNode *getGlobalBaseReg();
+
+ void selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0);
+
+ // Select - Convert the specified operand from a target-independent to a
+ // target-specific node if it hasn't already been changed.
+ void Select(SDNode *N) override;
+
+ bool tryBitfieldInsert(SDNode *N);
+ bool tryBitPermutation(SDNode *N);
+
+ /// SelectCC - Select a comparison of the specified values with the
+ /// specified condition code, returning the CR# of the expression.
+ SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ const SDLoc &dl);
+
+ /// SelectAddrImm - Returns true if the address N can be represented by
+ /// a base register plus a signed 16-bit displacement [r+imm].
+ bool SelectAddrImm(SDValue N, SDValue &Disp,
+ SDValue &Base) {
+ return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
+ }
+
+ /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
+ /// immediate field. Note that the operand at this point is already the
+ /// result of a prior SelectAddressRegImm call.
+ bool SelectAddrImmOffs(SDValue N, SDValue &Out) const {
+ if (N.getOpcode() == ISD::TargetConstant ||
+ N.getOpcode() == ISD::TargetGlobalAddress) {
+ Out = N;
+ return true;
+ }
+
+ return false;
+ }
+
+ /// SelectAddrIdx - Given the specified address, check to see if it can be
+ /// represented as an indexed [r+r] operation. Returns false if it can
+ /// be represented by [r+imm], which is preferred.
+ bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
+ return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG);
+ }
+
+ /// SelectAddrIdxOnly - Given the specified address, force it to be
+ /// represented as an indexed [r+r] operation.
+ bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
+ return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
+ }
+
+ /// SelectAddrImmX4 - Returns true if the address N can be represented by
+ /// a base register plus a signed 16-bit displacement that is a multiple of 4.
+ /// Suitable for use by STD and friends.
+ bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
+ }
+
+ // Select an address into a single register. This always succeeds: the
+ // address itself is used as the base.
+ bool SelectAddr(SDValue N, SDValue &Base) {
+ Base = N;
+ return true;
+ }
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions. It is always correct to compute the value into
+ /// a register. The case of adding a (possibly relocatable) constant to a
+ /// register can be improved, but it is wrong to substitute Reg+Reg for
+ /// Reg in an asm, because the load or store opcode would have to change.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override {
+
+ switch(ConstraintID) {
+ default:
+ errs() << "ConstraintID: " << ConstraintID << "\n";
+ llvm_unreachable("Unexpected asm memory constraint");
+ case InlineAsm::Constraint_es:
+ case InlineAsm::Constraint_i:
+ case InlineAsm::Constraint_m:
+ case InlineAsm::Constraint_o:
+ case InlineAsm::Constraint_Q:
+ case InlineAsm::Constraint_Z:
+ case InlineAsm::Constraint_Zy:
+ // We need to make sure that this one operand does not end up in r0
+ // (because we might end up lowering this as 0(%op)).
+ const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo();
+ const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1);
+ SDLoc dl(Op);
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
+ SDValue NewOp =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, Op.getValueType(),
+ Op, RC), 0);
+
+ OutOps.push_back(NewOp);
+ return false;
+ }
+ return true;
+ }
+
+ void InsertVRSaveCode(MachineFunction &MF);
+
+ StringRef getPassName() const override {
+ return "PowerPC DAG->DAG Pattern Instruction Selection";
+ }
+
+// Include the pieces autogenerated from the target description.
+#include "PPCGenDAGISel.inc"
+
+private:
+ bool trySETCC(SDNode *N);
+
+ void PeepholePPC64();
+ void PeepholePPC64ZExt();
+ void PeepholeCROps();
+
+ SDValue combineToCMPB(SDNode *N);
+ void foldBoolExts(SDValue &Res, SDNode *&N);
+
+ bool AllUsersSelectZero(SDNode *N);
+ void SwapAllSelectUsers(SDNode *N);
+
+ void transferMemOperands(SDNode *N, SDNode *Result);
+ };
+}
+
+/// InsertVRSaveCode - Runs after the whole function has been instruction
+/// selected.  If any virtual register of the vector register class exists,
+/// emit code that updates VRSAVE on entry and restores the incoming value
+/// before every return; otherwise do nothing.
+void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
+   // Check to see if this function uses vector registers, which means we have
+   // to save and restore the VRSAVE register and update it with the regs we
+   // use.
+   //
+   // In this case, there will be virtual registers of vector type created
+   // by the scheduler. Detect them now.
+   const unsigned NumVRegs = RegInfo->getNumVirtRegs();
+   unsigned VRegIdx = 0;
+   while (VRegIdx != NumVRegs &&
+          RegInfo->getRegClass(TargetRegisterInfo::index2VirtReg(VRegIdx)) !=
+              &PPC::VRRCRegClass)
+     ++VRegIdx;
+   if (VRegIdx == NumVRegs)
+     return; // No vector vreg found: nothing to do.
+
+   // If we have a vector register, we want to emit code into the entry and
+   // exit blocks to save and restore the VRSAVE register.  We do this here
+   // (instead of marking all vector instructions as clobbering VRSAVE) for two
+   // reasons:
+   //
+   // 1. This (trivially) reduces the load on the register allocator, by not
+   //    having to represent the live range of the VRSAVE register.
+   // 2. This (more significantly) allows us to create a temporary virtual
+   //    register to hold the saved VRSAVE value, allowing this temporary to be
+   //    register allocated, instead of forcing it to be spilled to the stack.
+
+   // Two vregs: the live-in VRSAVE value, and the value after having bits
+   // or'd into it.
+   unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+   unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+
+   const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
+   DebugLoc dl;
+
+   // Emit the following code at the top of the entry block:
+   //   InVRSAVE = MFVRSAVE
+   //   UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
+   //   MTVRSAVE UpdatedVRSAVE
+   MachineBasicBlock &EntryBB = *Fn.begin();
+   MachineBasicBlock::iterator EntryPt = EntryBB.begin();
+   BuildMI(EntryBB, EntryPt, dl, TII.get(PPC::MFVRSAVE), InVRSAVE);
+   BuildMI(EntryBB, EntryPt, dl, TII.get(PPC::UPDATE_VRSAVE),
+           UpdatedVRSAVE).addReg(InVRSAVE);
+   BuildMI(EntryBB, EntryPt, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);
+
+   // Find all return blocks, outputting a restore in each epilog.
+   for (MachineBasicBlock &MBB : Fn) {
+     if (!MBB.isReturnBlock())
+       continue;
+
+     // Start at the last instruction and walk backwards over the run of
+     // terminators that make up the return sequence; the restore goes in
+     // front of the earliest one.
+     MachineBasicBlock::iterator RestorePt = MBB.end();
+     --RestorePt;
+     MachineBasicBlock::iterator Scan = RestorePt;
+     while (Scan != MBB.begin() && (--Scan)->isTerminator())
+       RestorePt = Scan;
+
+     // Emit: MTVRSAVE InVRSave
+     BuildMI(MBB, RestorePt, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
+   }
+}
+
+
+/// getGlobalBaseReg - Return a register node for the base address used to
+/// access globals.  The first call in a function emits the instructions that
+/// materialize the base into the front of the function's first block; later
+/// calls reuse the register chosen then.
+///
+SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
+   const MVT PtrVT = PPCLowering->getPointerTy(CurDAG->getDataLayout());
+
+   if (!GlobalBaseReg) {
+     const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
+     // Insert the set of GlobalBaseReg into the first MBB of the function
+     MachineBasicBlock &FirstMBB = MF->front();
+     MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+     DebugLoc dl;
+
+     if (PtrVT != MVT::i32) {
+       // 64-bit pointers.
+       GlobalBaseReg =
+           RegInfo->createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
+       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
+       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
+     } else if (!PPCSubTarget->isTargetELF()) {
+       // 32-bit pointers, non-ELF target: any suitable virtual register.
+       GlobalBaseReg =
+           RegInfo->createVirtualRegister(&PPC::GPRC_and_GPRC_NOR0RegClass);
+       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
+       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+     } else {
+       // 32-bit pointers, ELF target: the base lives in R30.
+       const Module *M = MF->getFunction()->getParent();
+       const bool IsSmallPIC = M->getPICLevel() == PICLevel::SmallPIC;
+
+       GlobalBaseReg = PPC::R30;
+       BuildMI(FirstMBB, MBBI, dl,
+               TII.get(IsSmallPIC ? PPC::MoveGOTtoLR : PPC::MovePCtoLR));
+       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+       if (!IsSmallPIC) {
+         // Outside small PIC an extra UpdateGBR, with a scratch register, is
+         // emitted after the PC has been read.
+         unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+         BuildMI(FirstMBB, MBBI, dl,
+                 TII.get(PPC::UpdateGBR), GlobalBaseReg)
+             .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg);
+       }
+       MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
+     }
+   }
+
+   return CurDAG->getRegister(GlobalBaseReg, PtrVT).getNode();
+}
+
+/// isIntS16Immediate - Test whether the node is an ISD::Constant whose value
+/// survives truncation to 16 bits followed by sign extension.  An i32 node is
+/// compared as a 32-bit value, any other type as a 64-bit value.  Imm receives
+/// the truncated value whenever the node is a constant, even when the result
+/// is false.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+   if (N->getOpcode() != ISD::Constant)
+     return false;
+
+   const uint64_t RawValue = cast<ConstantSDNode>(N)->getZExtValue();
+   Imm = (short)RawValue;
+
+   const bool Is32Bit = N->getValueType(0) == MVT::i32;
+   return Is32Bit ? Imm == (int32_t)RawValue : Imm == (int64_t)RawValue;
+}
+
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+ return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+ if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+ Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ return true;
+ }
+ return false;
+}
+
+/// isInt64Immediate - This method tests to see if the node is a 64-bit constant
+/// operand. If so Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
+ if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
+ Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ return true;
+ }
+ return false;
+}
+
+// isInt32Immediate - This method tests to see if a constant operand.
+// If so Imm will receive the 32 bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+ return isInt32Immediate(N.getNode(), Imm);
+}
+
+static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
+ const SDValue &DestMBB) {
+ assert(isa<BasicBlockSDNode>(DestMBB));
+
+ if (!FuncInfo->BPI) return PPC::BR_NO_HINT;
+
+ const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
+ const TerminatorInst *BBTerm = BB->getTerminator();
+
+ if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT;
+
+ const BasicBlock *TBB = BBTerm->getSuccessor(0);
+ const BasicBlock *FBB = BBTerm->getSuccessor(1);
+
+ auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB);
+ auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB);
+
+ // We only want to handle cases which are easy to predict at static time, e.g.
+ // C++ throw statement, that is very likely not taken, or calling never
+ // returned function, e.g. stdlib exit(). So we set Threshold to filter
+ // unwanted cases.
+ //
+ // Below is LLVM branch weight table, we only want to handle case 1, 2
+ //
+ // Case Taken:Nontaken Example
+ // 1. Unreachable 1048575:1 C++ throw, stdlib exit(),
+ // 2. Invoke-terminating 1:1048575
+ // 3. Coldblock 4:64 __builtin_expect
+ // 4. Loop Branch 124:4 For loop
+ // 5. PH/ZH/FPH 20:12
+ const uint32_t Threshold = 10000;
+
+ if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb))
+ return PPC::BR_NO_HINT;
+
+ DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::"
+ << BB->getName() << "'\n"
+ << " -> " << TBB->getName() << ": " << TProb << "\n"
+ << " -> " << FBB->getName() << ": " << FProb << "\n");
+
+ const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB);
+
+ // If Dest BasicBlock is False-BasicBlock (FBB), swap branch probabilities,
+ // because we want 'TProb' stands for 'branch probability' to Dest BasicBlock
+ if (BBDN->getBasicBlock()->getBasicBlock() != TBB)
+ std::swap(TProb, FProb);
+
+ return (TProb > FProb) ? PPC::BR_TAKEN_HINT : PPC::BR_NONTAKEN_HINT;
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+ return N->getOpcode() == Opc
+ && isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+void PPCDAGToDAGISel::selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) {
+ SDLoc dl(SN);
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
+ unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
+ if (SN->hasOneUse())
+ CurDAG->SelectNodeTo(SN, Opc, N->getValueType(0), TFI,
+ getSmallIPtrImm(Offset, dl));
+ else
+ ReplaceNode(SN, CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
+ getSmallIPtrImm(Offset, dl)));
+}
+
+bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
+ bool isShiftMask, unsigned &SH,
+ unsigned &MB, unsigned &ME) {
+ // Don't even go down this path for i64, since different logic will be
+ // necessary for rldicl/rldicr/rldimi.
+ if (N->getValueType(0) != MVT::i32)
+ return false;
+
+ unsigned Shift = 32;
+ unsigned Indeterminant = ~0; // bit mask marking indeterminant results
+ unsigned Opcode = N->getOpcode();
+ if (N->getNumOperands() != 2 ||
+ !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31))
+ return false;
+
+ if (Opcode == ISD::SHL) {
+ // apply shift left to mask if it comes first
+ if (isShiftMask) Mask = Mask << Shift;
+ // determine which bits are made indeterminant by shift
+ Indeterminant = ~(0xFFFFFFFFu << Shift);
+ } else if (Opcode == ISD::SRL) {
+ // apply shift right to mask if it comes first
+ if (isShiftMask) Mask = Mask >> Shift;
+ // determine which bits are made indeterminant by shift
+ Indeterminant = ~(0xFFFFFFFFu >> Shift);
+ // adjust for the left rotate
+ Shift = 32 - Shift;
+ } else if (Opcode == ISD::ROTL) {
+ Indeterminant = 0;
+ } else {
+ return false;
+ }
+
+ // if the mask doesn't intersect any Indeterminant bits
+ if (Mask && !(Mask & Indeterminant)) {
+ SH = Shift & 31;
+ // make sure the mask is still a mask (wrap arounds may not be)
+ return isRunOfOnes(Mask, MB, ME);
+ }
+ return false;
+}
+
+/// Turn an or of two masked values into the rotate left word immediate then
+/// mask insert (rlwimi) instruction.
+bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ APInt LKZ, LKO, RKZ, RKO;
+ CurDAG->computeKnownBits(Op0, LKZ, LKO);
+ CurDAG->computeKnownBits(Op1, RKZ, RKO);
+
+ unsigned TargetMask = LKZ.getZExtValue();
+ unsigned InsertMask = RKZ.getZExtValue();
+
+ if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
+ unsigned Op0Opc = Op0.getOpcode();
+ unsigned Op1Opc = Op1.getOpcode();
+ unsigned Value, SH = 0;
+ TargetMask = ~TargetMask;
+ InsertMask = ~InsertMask;
+
+ // If the LHS has a foldable shift and the RHS does not, then swap it to the
+ // RHS so that we can fold the shift into the insert.
+ if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
+ if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
+ Op0.getOperand(0).getOpcode() == ISD::SRL) {
+ if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
+ Op1.getOperand(0).getOpcode() != ISD::SRL) {
+ std::swap(Op0, Op1);
+ std::swap(Op0Opc, Op1Opc);
+ std::swap(TargetMask, InsertMask);
+ }
+ }
+ } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
+ if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
+ Op1.getOperand(0).getOpcode() != ISD::SRL) {
+ std::swap(Op0, Op1);
+ std::swap(Op0Opc, Op1Opc);
+ std::swap(TargetMask, InsertMask);
+ }
+ }
+
+ unsigned MB, ME;
+ if (isRunOfOnes(InsertMask, MB, ME)) {
+ SDValue Tmp1, Tmp2;
+
+ if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
+ isInt32Immediate(Op1.getOperand(1), Value)) {
+ Op1 = Op1.getOperand(0);
+ SH = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
+ }
+ if (Op1Opc == ISD::AND) {
+ // The AND mask might not be a constant, and we need to make sure that
+ // if we're going to fold the masking with the insert, all bits not
+ // know to be zero in the mask are known to be one.
+ APInt MKZ, MKO;
+ CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO);
+ bool CanFoldMask = InsertMask == MKO.getZExtValue();
+
+ unsigned SHOpc = Op1.getOperand(0).getOpcode();
+ if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask &&
+ isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+ // Note that Value must be in range here (less than 32) because
+ // otherwise there would not be any bits set in InsertMask.
+ Op1 = Op1.getOperand(0).getOperand(0);
+ SH = (SHOpc == ISD::SHL) ? Value : 32 - Value;
+ }
+ }
+
+ SH &= 31;
+ SDValue Ops[] = { Op0, Op1, getI32Imm(SH, dl), getI32Imm(MB, dl),
+ getI32Imm(ME, dl) };
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
+ return true;
+ }
+ }
+ return false;
+}
+
+// Predict the number of instructions that would be generated by calling
+// getInt64(N).
+static unsigned getInt64CountDirect(int64_t Imm) {
+ // Assume no remaining bits.
+ unsigned Remainder = 0;
+ // Assume no shift required.
+ unsigned Shift = 0;
+
+ // If it can't be represented as a 32 bit value.
+ if (!isInt<32>(Imm)) {
+ Shift = countTrailingZeros<uint64_t>(Imm);
+ int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+ // If the shifted value fits 32 bits.
+ if (isInt<32>(ImmSh)) {
+ // Go with the shifted value.
+ Imm = ImmSh;
+ } else {
+ // Still stuck with a 64 bit value.
+ Remainder = Imm;
+ Shift = 32;
+ Imm >>= 32;
+ }
+ }
+
+ // Intermediate operand.
+ unsigned Result = 0;
+
+ // Handle first 32 bits.
+ unsigned Lo = Imm & 0xFFFF;
+
+ // Simple value.
+ if (isInt<16>(Imm)) {
+ // Just the Lo bits.
+ ++Result;
+ } else if (Lo) {
+ // Handle the Hi bits and Lo bits.
+ Result += 2;
+ } else {
+ // Just the Hi bits.
+ ++Result;
+ }
+
+ // If no shift, we're done.
+ if (!Shift) return Result;
+
+ // If Hi word == Lo word,
+ // we can use rldimi to insert the Lo word into Hi word.
+ if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) {
+ ++Result;
+ return Result;
+ }
+
+ // Shift for next step if the upper 32-bits were not zero.
+ if (Imm)
+ ++Result;
+
+ // Add in the last bits as required.
+ if ((Remainder >> 16) & 0xFFFF)
+ ++Result;
+ if (Remainder & 0xFFFF)
+ ++Result;
+
+ return Result;
+}
+
/// Rotate the 64-bit value Imm left by R bit positions.
///
/// The rotate count is reduced modulo 64, so R == 0 (and any multiple of 64)
/// returns Imm unchanged. Without this, the expression Imm >> (64 - R) would
/// shift by the full width of the type for R == 0, which is undefined
/// behaviour in C++; counts 1..63 produce exactly the same result as before.
static uint64_t Rot64(uint64_t Imm, unsigned R) {
  R &= 63;
  if (R == 0)
    return Imm;
  return (Imm << R) | (Imm >> (64 - R));
}
+
+static unsigned getInt64Count(int64_t Imm) {
+ unsigned Count = getInt64CountDirect(Imm);
+ if (Count == 1)
+ return Count;
+
+ for (unsigned r = 1; r < 63; ++r) {
+ uint64_t RImm = Rot64(Imm, r);
+ unsigned RCount = getInt64CountDirect(RImm) + 1;
+ Count = std::min(Count, RCount);
+
+ // See comments in getInt64 for an explanation of the logic below.
+ unsigned LS = findLastSet(RImm);
+ if (LS != r-1)
+ continue;
+
+ uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
+ uint64_t RImmWithOnes = RImm | OnesMask;
+
+ RCount = getInt64CountDirect(RImmWithOnes) + 1;
+ Count = std::min(Count, RCount);
+ }
+
+ return Count;
+}
+
+// Select a 64-bit constant. For cost-modeling purposes, getInt64Count
+// (above) needs to be kept in sync with this function.
+static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl,
+ int64_t Imm) {
+ // Assume no remaining bits.
+ unsigned Remainder = 0;
+ // Assume no shift required.
+ unsigned Shift = 0;
+
+ // If it can't be represented as a 32 bit value.
+ if (!isInt<32>(Imm)) {
+ Shift = countTrailingZeros<uint64_t>(Imm);
+ int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+ // If the shifted value fits 32 bits.
+ if (isInt<32>(ImmSh)) {
+ // Go with the shifted value.
+ Imm = ImmSh;
+ } else {
+ // Still stuck with a 64 bit value.
+ Remainder = Imm;
+ Shift = 32;
+ Imm >>= 32;
+ }
+ }
+
+ // Intermediate operand.
+ SDNode *Result;
+
+ // Handle first 32 bits.
+ unsigned Lo = Imm & 0xFFFF;
+ unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+ auto getI32Imm = [CurDAG, dl](unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+ };
+
+ // Simple value.
+ if (isInt<16>(Imm)) {
+ // Just the Lo bits.
+ Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+ } else if (Lo) {
+ // Handle the Hi bits.
+ unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
+ Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
+ // And Lo bits.
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Lo));
+ } else {
+ // Just the Hi bits.
+ Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
+ }
+
+ // If no shift, we're done.
+ if (!Shift) return Result;
+
+ // If Hi word == Lo word,
+ // we can use rldimi to insert the Lo word into Hi word.
+ if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) {
+ SDValue Ops[] =
+ { SDValue(Result, 0), SDValue(Result, 0), getI32Imm(Shift), getI32Imm(0)};
+ return CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops);
+ }
+
+ // Shift for next step if the upper 32-bits were not zero.
+ if (Imm) {
+ Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+ SDValue(Result, 0),
+ getI32Imm(Shift),
+ getI32Imm(63 - Shift));
+ }
+
+ // Add in the last bits as required.
+ if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+ Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Hi));
+ }
+ if ((Lo = Remainder & 0xFFFF)) {
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Lo));
+ }
+
+ return Result;
+}
+
+static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
+ unsigned Count = getInt64CountDirect(Imm);
+ if (Count == 1)
+ return getInt64Direct(CurDAG, dl, Imm);
+
+ unsigned RMin = 0;
+
+ int64_t MatImm;
+ unsigned MaskEnd;
+
+ for (unsigned r = 1; r < 63; ++r) {
+ uint64_t RImm = Rot64(Imm, r);
+ unsigned RCount = getInt64CountDirect(RImm) + 1;
+ if (RCount < Count) {
+ Count = RCount;
+ RMin = r;
+ MatImm = RImm;
+ MaskEnd = 63;
+ }
+
+ // If the immediate to generate has many trailing zeros, it might be
+ // worthwhile to generate a rotated value with too many leading ones
+ // (because that's free with li/lis's sign-extension semantics), and then
+ // mask them off after rotation.
+
+ unsigned LS = findLastSet(RImm);
+ // We're adding (63-LS) higher-order ones, and we expect to mask them off
+ // after performing the inverse rotation by (64-r). So we need that:
+ // 63-LS == 64-r => LS == r-1
+ if (LS != r-1)
+ continue;
+
+ uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
+ uint64_t RImmWithOnes = RImm | OnesMask;
+
+ RCount = getInt64CountDirect(RImmWithOnes) + 1;
+ if (RCount < Count) {
+ Count = RCount;
+ RMin = r;
+ MatImm = RImmWithOnes;
+ MaskEnd = LS;
+ }
+ }
+
+ if (!RMin)
+ return getInt64Direct(CurDAG, dl, Imm);
+
+ auto getI32Imm = [CurDAG, dl](unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+ };
+
+ SDValue Val = SDValue(getInt64Direct(CurDAG, dl, MatImm), 0);
+ return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val,
+ getI32Imm(64 - RMin), getI32Imm(MaskEnd));
+}
+
+// Select a 64-bit constant.
+static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) {
+ SDLoc dl(N);
+
+ // Get 64 bit value.
+ int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ return getInt64(CurDAG, dl, Imm);
+}
+
+namespace {
+class BitPermutationSelector {
+ struct ValueBit {
+ SDValue V;
+
+ // The bit number in the value, using a convention where bit 0 is the
+ // lowest-order bit.
+ unsigned Idx;
+
+ enum Kind {
+ ConstZero,
+ Variable
+ } K;
+
+ ValueBit(SDValue V, unsigned I, Kind K = Variable)
+ : V(V), Idx(I), K(K) {}
+ ValueBit(Kind K = Variable)
+ : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}
+
+ bool isZero() const {
+ return K == ConstZero;
+ }
+
+ bool hasValue() const {
+ return K == Variable;
+ }
+
+ SDValue getValue() const {
+ assert(hasValue() && "Cannot get the value of a constant bit");
+ return V;
+ }
+
+ unsigned getValueBitIndex() const {
+ assert(hasValue() && "Cannot get the value bit index of a constant bit");
+ return Idx;
+ }
+ };
+
+ // A bit group has the same underlying value and the same rotate factor.
+ struct BitGroup {
+ SDValue V;
+ unsigned RLAmt;
+ unsigned StartIdx, EndIdx;
+
+ // This rotation amount assumes that the lower 32 bits of the quantity are
+ // replicated in the high 32 bits by the rotation operator (which is done
+ // by rlwinm and friends in 64-bit mode).
+ bool Repl32;
+ // Did converting to Repl32 == true change the rotation factor? If it did,
+ // it decreased it by 32.
+ bool Repl32CR;
+ // Was this group coalesced after setting Repl32 to true?
+ bool Repl32Coalesced;
+
+ BitGroup(SDValue V, unsigned R, unsigned S, unsigned E)
+ : V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false),
+ Repl32Coalesced(false) {
+ DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R <<
+ " [" << S << ", " << E << "]\n");
+ }
+ };
+
+ // Information on each (Value, RLAmt) pair (like the number of groups
+ // associated with each) used to choose the lowering method.
+ struct ValueRotInfo {
+ SDValue V;
+ unsigned RLAmt;
+ unsigned NumGroups;
+ unsigned FirstGroupStartIdx;
+ bool Repl32;
+
+ ValueRotInfo()
+ : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX),
+ Repl32(false) {}
+
+ // For sorting (in reverse order) by NumGroups, and then by
+ // FirstGroupStartIdx.
+ bool operator < (const ValueRotInfo &Other) const {
+ // We need to sort so that the non-Repl32 come first because, when we're
+ // doing masking, the Repl32 bit groups might be subsumed into the 64-bit
+ // masking operation.
+ if (Repl32 < Other.Repl32)
+ return true;
+ else if (Repl32 > Other.Repl32)
+ return false;
+ else if (NumGroups > Other.NumGroups)
+ return true;
+ else if (NumGroups < Other.NumGroups)
+ return false;
+ else if (FirstGroupStartIdx < Other.FirstGroupStartIdx)
+ return true;
+ return false;
+ }
+ };
+
+ using ValueBitsMemoizedValue = std::pair<bool, SmallVector<ValueBit, 64>>;
+ using ValueBitsMemoizer =
+ DenseMap<SDValue, std::unique_ptr<ValueBitsMemoizedValue>>;
+ ValueBitsMemoizer Memoizer;
+
+ // Return a pair of bool and a SmallVector pointer to a memoization entry.
+ // The bool is true if something interesting was deduced, otherwise if we're
+ // providing only a generic representation of V (or something else likewise
+ // uninteresting for instruction selection) through the SmallVector.
+ std::pair<bool, SmallVector<ValueBit, 64> *> getValueBits(SDValue V,
+ unsigned NumBits) {
+ auto &ValueEntry = Memoizer[V];
+ if (ValueEntry)
+ return std::make_pair(ValueEntry->first, &ValueEntry->second);
+ ValueEntry.reset(new ValueBitsMemoizedValue());
+ bool &Interesting = ValueEntry->first;
+ SmallVector<ValueBit, 64> &Bits = ValueEntry->second;
+ Bits.resize(NumBits);
+
+ switch (V.getOpcode()) {
+ default: break;
+ case ISD::ROTL:
+ if (isa<ConstantSDNode>(V.getOperand(1))) {
+ unsigned RotAmt = V.getConstantOperandVal(1);
+
+ const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
+
+ for (unsigned i = 0; i < NumBits; ++i)
+ Bits[i] = LHSBits[i < RotAmt ? i + (NumBits - RotAmt) : i - RotAmt];
+
+ return std::make_pair(Interesting = true, &Bits);
+ }
+ break;
+ case ISD::SHL:
+ if (isa<ConstantSDNode>(V.getOperand(1))) {
+ unsigned ShiftAmt = V.getConstantOperandVal(1);
+
+ const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
+
+ for (unsigned i = ShiftAmt; i < NumBits; ++i)
+ Bits[i] = LHSBits[i - ShiftAmt];
+
+ for (unsigned i = 0; i < ShiftAmt; ++i)
+ Bits[i] = ValueBit(ValueBit::ConstZero);
+
+ return std::make_pair(Interesting = true, &Bits);
+ }
+ break;
+ case ISD::SRL:
+ if (isa<ConstantSDNode>(V.getOperand(1))) {
+ unsigned ShiftAmt = V.getConstantOperandVal(1);
+
+ const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
+
+ for (unsigned i = 0; i < NumBits - ShiftAmt; ++i)
+ Bits[i] = LHSBits[i + ShiftAmt];
+
+ for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i)
+ Bits[i] = ValueBit(ValueBit::ConstZero);
+
+ return std::make_pair(Interesting = true, &Bits);
+ }
+ break;
+ case ISD::AND:
+ if (isa<ConstantSDNode>(V.getOperand(1))) {
+ uint64_t Mask = V.getConstantOperandVal(1);
+
+ const SmallVector<ValueBit, 64> *LHSBits;
+ // Mark this as interesting, only if the LHS was also interesting. This
+ // prevents the overall procedure from matching a single immediate 'and'
+ // (which is non-optimal because such an and might be folded with other
+ // things if we don't select it here).
+ std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0), NumBits);
+
+ for (unsigned i = 0; i < NumBits; ++i)
+ if (((Mask >> i) & 1) == 1)
+ Bits[i] = (*LHSBits)[i];
+ else
+ Bits[i] = ValueBit(ValueBit::ConstZero);
+
+ return std::make_pair(Interesting, &Bits);
+ }
+ break;
+ case ISD::OR: {
+ const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
+ const auto &RHSBits = *getValueBits(V.getOperand(1), NumBits).second;
+
+ bool AllDisjoint = true;
+ for (unsigned i = 0; i < NumBits; ++i)
+ if (LHSBits[i].isZero())
+ Bits[i] = RHSBits[i];
+ else if (RHSBits[i].isZero())
+ Bits[i] = LHSBits[i];
+ else {
+ AllDisjoint = false;
+ break;
+ }
+
+ if (!AllDisjoint)
+ break;
+
+ return std::make_pair(Interesting = true, &Bits);
+ }
+ }
+
+ for (unsigned i = 0; i < NumBits; ++i)
+ Bits[i] = ValueBit(V, i);
+
+ return std::make_pair(Interesting = false, &Bits);
+ }
+
+ // For each value (except the constant ones), compute the left-rotate amount
+ // to get it from its original to final position.
+ void computeRotationAmounts() {
+ HasZeros = false;
+ RLAmt.resize(Bits.size());
+ for (unsigned i = 0; i < Bits.size(); ++i)
+ if (Bits[i].hasValue()) {
+ unsigned VBI = Bits[i].getValueBitIndex();
+ if (i >= VBI)
+ RLAmt[i] = i - VBI;
+ else
+ RLAmt[i] = Bits.size() - (VBI - i);
+ } else if (Bits[i].isZero()) {
+ HasZeros = true;
+ RLAmt[i] = UINT32_MAX;
+ } else {
+ llvm_unreachable("Unknown value bit type");
+ }
+ }
+
+ // Collect groups of consecutive bits with the same underlying value and
+ // rotation factor. If we're doing late masking, we ignore zeros, otherwise
+ // they break up groups.
+ void collectBitGroups(bool LateMask) {
+ BitGroups.clear();
+
+ unsigned LastRLAmt = RLAmt[0];
+ SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
+ unsigned LastGroupStartIdx = 0;
+ for (unsigned i = 1; i < Bits.size(); ++i) {
+ unsigned ThisRLAmt = RLAmt[i];
+ SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
+ if (LateMask && !ThisValue) {
+ ThisValue = LastValue;
+ ThisRLAmt = LastRLAmt;
+ // If we're doing late masking, then the first bit group always starts
+ // at zero (even if the first bits were zero).
+ if (BitGroups.empty())
+ LastGroupStartIdx = 0;
+ }
+
+ // If this bit has the same underlying value and the same rotate factor as
+ // the last one, then they're part of the same group.
+ if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
+ continue;
+
+ if (LastValue.getNode())
+ BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
+ i-1));
+ LastRLAmt = ThisRLAmt;
+ LastValue = ThisValue;
+ LastGroupStartIdx = i;
+ }
+ if (LastValue.getNode())
+ BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
+ Bits.size()-1));
+
+ if (BitGroups.empty())
+ return;
+
+ // We might be able to combine the first and last groups.
+ if (BitGroups.size() > 1) {
+ // If the first and last groups are the same, then remove the first group
+ // in favor of the last group, making the ending index of the last group
+ // equal to the ending index of the to-be-removed first group.
+ if (BitGroups[0].StartIdx == 0 &&
+ BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
+ BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
+ BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
+ DEBUG(dbgs() << "\tcombining final bit group with initial one\n");
+ BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
+ BitGroups.erase(BitGroups.begin());
+ }
+ }
+ }
+
+ // Take all (SDValue, RLAmt) pairs and sort them by the number of groups
+ // associated with each. If there is a degeneracy, pick the one that occurs
+ // first (in the final value).
+ void collectValueRotInfo() {
+ ValueRots.clear();
+
+ for (auto &BG : BitGroups) {
+ unsigned RLAmtKey = BG.RLAmt + (BG.Repl32 ? 64 : 0);
+ ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, RLAmtKey)];
+ VRI.V = BG.V;
+ VRI.RLAmt = BG.RLAmt;
+ VRI.Repl32 = BG.Repl32;
+ VRI.NumGroups += 1;
+ VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx);
+ }
+
+ // Now that we've collected the various ValueRotInfo instances, we need to
+ // sort them.
+ ValueRotsVec.clear();
+ for (auto &I : ValueRots) {
+ ValueRotsVec.push_back(I.second);
+ }
+ std::sort(ValueRotsVec.begin(), ValueRotsVec.end());
+ }
+
+ // In 64-bit mode, rlwinm and friends have a rotation operator that
+ // replicates the low-order 32 bits into the high-order 32-bits. The mask
+ // indices of these instructions can only be in the lower 32 bits, so they
+ // can only represent some 64-bit bit groups. However, when they can be used,
+ // the 32-bit replication can be used to represent, as a single bit group,
+ // otherwise separate bit groups. We'll convert to replicated-32-bit bit
+ // groups when possible. Returns true if any of the bit groups were
+ // converted.
+ void assignRepl32BitGroups() {
+ // If we have bits like this:
+ //
+ // Indices: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ // V bits: ... 7 6 5 4 3 2 1 0 31 30 29 28 27 26 25 24
+ // Groups: | RLAmt = 8 | RLAmt = 40 |
+ //
+ // But, making use of a 32-bit operation that replicates the low-order 32
+ // bits into the high-order 32 bits, this can be one bit group with a RLAmt
+ // of 8.
+
+ auto IsAllLow32 = [this](BitGroup & BG) {
+ if (BG.StartIdx <= BG.EndIdx) {
+ for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) {
+ if (!Bits[i].hasValue())
+ continue;
+ if (Bits[i].getValueBitIndex() >= 32)
+ return false;
+ }
+ } else {
+ for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) {
+ if (!Bits[i].hasValue())
+ continue;
+ if (Bits[i].getValueBitIndex() >= 32)
+ return false;
+ }
+ for (unsigned i = 0; i <= BG.EndIdx; ++i) {
+ if (!Bits[i].hasValue())
+ continue;
+ if (Bits[i].getValueBitIndex() >= 32)
+ return false;
+ }
+ }
+
+ return true;
+ };
+
+ for (auto &BG : BitGroups) {
+ if (BG.StartIdx < 32 && BG.EndIdx < 32) {
+ if (IsAllLow32(BG)) {
+ if (BG.RLAmt >= 32) {
+ BG.RLAmt -= 32;
+ BG.Repl32CR = true;
+ }
+
+ BG.Repl32 = true;
+
+ DEBUG(dbgs() << "\t32-bit replicated bit group for " <<
+ BG.V.getNode() << " RLAmt = " << BG.RLAmt <<
+ " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n");
+ }
+ }
+ }
+
+ // Now walk through the bit groups, consolidating where possible.
+ for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+ // We might want to remove this bit group by merging it with the previous
+ // group (which might be the ending group).
+ auto IP = (I == BitGroups.begin()) ?
+ std::prev(BitGroups.end()) : std::prev(I);
+ if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt &&
+ I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) {
+
+ DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " <<
+ I->V.getNode() << " RLAmt = " << I->RLAmt <<
+ " [" << I->StartIdx << ", " << I->EndIdx <<
+ "] with group with range [" <<
+ IP->StartIdx << ", " << IP->EndIdx << "]\n");
+
+ IP->EndIdx = I->EndIdx;
+ IP->Repl32CR = IP->Repl32CR || I->Repl32CR;
+ IP->Repl32Coalesced = true;
+ I = BitGroups.erase(I);
+ continue;
+ } else {
+ // There is a special case worth handling: If there is a single group
+ // covering the entire upper 32 bits, and it can be merged with both
+ // the next and previous groups (which might be the same group), then
+ // do so. If it is the same group (so there will be only one group in
+ // total), then we need to reverse the order of the range so that it
+ // covers the entire 64 bits.
+ if (I->StartIdx == 32 && I->EndIdx == 63) {
+ assert(std::next(I) == BitGroups.end() &&
+ "bit group ends at index 63 but there is another?");
+ auto IN = BitGroups.begin();
+
+ if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V &&
+ (I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt &&
+ IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP &&
+ IsAllLow32(*I)) {
+
+ DEBUG(dbgs() << "\tcombining bit group for " <<
+ I->V.getNode() << " RLAmt = " << I->RLAmt <<
+ " [" << I->StartIdx << ", " << I->EndIdx <<
+ "] with 32-bit replicated groups with ranges [" <<
+ IP->StartIdx << ", " << IP->EndIdx << "] and [" <<
+ IN->StartIdx << ", " << IN->EndIdx << "]\n");
+
+ if (IP == IN) {
+ // There is only one other group; change it to cover the whole
+ // range (backward, so that it can still be Repl32 but cover the
+ // whole 64-bit range).
+ IP->StartIdx = 31;
+ IP->EndIdx = 30;
+ IP->Repl32CR = IP->Repl32CR || I->RLAmt >= 32;
+ IP->Repl32Coalesced = true;
+ I = BitGroups.erase(I);
+ } else {
+ // There are two separate groups, one before this group and one
+ // after us (at the beginning). We're going to remove this group,
+ // but also the group at the very beginning.
+ IP->EndIdx = IN->EndIdx;
+ IP->Repl32CR = IP->Repl32CR || IN->Repl32CR || I->RLAmt >= 32;
+ IP->Repl32Coalesced = true;
+ I = BitGroups.erase(I);
+ BitGroups.erase(BitGroups.begin());
+ }
+
+ // This must be the last group in the vector (and we might have
+ // just invalidated the iterator above), so break here.
+ break;
+ }
+ }
+ }
+
+ ++I;
+ }
+ }
+
+ SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+ }
+
+ uint64_t getZerosMask() {
+ uint64_t Mask = 0;
+ for (unsigned i = 0; i < Bits.size(); ++i) {
+ if (Bits[i].hasValue())
+ continue;
+ Mask |= (UINT64_C(1) << i);
+ }
+
+ return ~Mask;
+ }
+
+ // Depending on the number of groups for a particular value, it might be
+ // better to rotate, mask explicitly (using andi/andis), and then or the
+ // result. Select this part of the result first.
+ void SelectAndParts32(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) {
+ if (BPermRewriterNoMasking)
+ return;
+
+ for (ValueRotInfo &VRI : ValueRotsVec) {
+ unsigned Mask = 0;
+ for (unsigned i = 0; i < Bits.size(); ++i) {
+ if (!Bits[i].hasValue() || Bits[i].getValue() != VRI.V)
+ continue;
+ if (RLAmt[i] != VRI.RLAmt)
+ continue;
+ Mask |= (1u << i);
+ }
+
+ // Compute the masks for andi/andis that would be necessary.
+ unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
+ assert((ANDIMask != 0 || ANDISMask != 0) &&
+ "No set bits in mask for value bit groups");
+ bool NeedsRotate = VRI.RLAmt != 0;
+
+ // We're trying to minimize the number of instructions. If we have one
+ // group, using one of andi/andis can break even. If we have three
+ // groups, we can use both andi and andis and break even (to use both
+ // andi and andis we also need to or the results together). We need four
+ // groups if we also need to rotate. To use andi/andis we need to do more
+ // than break even because rotate-and-mask instructions tend to be easier
+ // to schedule.
+
+ // FIXME: We've biased here against using andi/andis, which is right for
+ // POWER cores, but not optimal everywhere. For example, on the A2,
+ // andi/andis have single-cycle latency whereas the rotate-and-mask
+ // instructions take two cycles, and it would be better to bias toward
+ // andi/andis in break-even cases.
+
+ unsigned NumAndInsts = (unsigned) NeedsRotate +
+ (unsigned) (ANDIMask != 0) +
+ (unsigned) (ANDISMask != 0) +
+ (unsigned) (ANDIMask != 0 && ANDISMask != 0) +
+ (unsigned) (bool) Res;
+
+ DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
+ " RL: " << VRI.RLAmt << ":" <<
+ "\n\t\t\tisel using masking: " << NumAndInsts <<
+ " using rotates: " << VRI.NumGroups << "\n");
+
+ if (NumAndInsts >= VRI.NumGroups)
+ continue;
+
+ DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+
+ if (InstCnt) *InstCnt += NumAndInsts;
+
+ SDValue VRot;
+ if (VRI.RLAmt) {
+ SDValue Ops[] =
+ { VRI.V, getI32Imm(VRI.RLAmt, dl), getI32Imm(0, dl),
+ getI32Imm(31, dl) };
+ VRot = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+ Ops), 0);
+ } else {
+ VRot = VRI.V;
+ }
+
+ SDValue ANDIVal, ANDISVal;
+ if (ANDIMask != 0)
+ ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
+ VRot, getI32Imm(ANDIMask, dl)), 0);
+ if (ANDISMask != 0)
+ ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
+ VRot, getI32Imm(ANDISMask, dl)), 0);
+
+ SDValue TotalVal;
+ if (!ANDIVal)
+ TotalVal = ANDISVal;
+ else if (!ANDISVal)
+ TotalVal = ANDIVal;
+ else
+ TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+ ANDIVal, ANDISVal), 0);
+
+ if (!Res)
+ Res = TotalVal;
+ else
+ Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+ Res, TotalVal), 0);
+
+ // Now, remove all groups with this underlying value and rotation
+ // factor.
+ eraseMatchingBitGroups([VRI](const BitGroup &BG) {
+ return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt;
+ });
+ }
+ }
+
+ // 32-bit instruction selection. Emits the machine nodes that assemble the
+ // permuted value from the collected bit groups and returns the final node.
+ // When InstCnt is non-null it receives the number of instructions counted
+ // for the sequence.
+ SDNode *Select32(SDNode *N, bool LateMask, unsigned *InstCnt) {
+   SDLoc dl(N);
+   SDValue Res;
+
+   if (InstCnt)
+     *InstCnt = 0;
+
+   // Let the andi/andis-based selection claim the groups it handles best.
+   SelectAndParts32(dl, Res, InstCnt);
+
+   // With nothing selected so far, and no zeros that have to be filled in at
+   // this stage, seed the result with the first (Value, RLAmt) entry of
+   // ValueRotsVec, rotating the value only when its rotation is non-zero.
+   const bool MaySeedWithWholeValue = LateMask || !HasZeros;
+   if (MaySeedWithWholeValue && !Res) {
+     ValueRotInfo &Seed = ValueRotsVec[0];
+     if (!Seed.RLAmt) {
+       Res = Seed.V;
+     } else {
+       if (InstCnt)
+         ++*InstCnt;
+       SDValue RotOps[] = {Seed.V, getI32Imm(Seed.RLAmt, dl),
+                           getI32Imm(0, dl), getI32Imm(31, dl)};
+       Res = SDValue(
+           CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, RotOps), 0);
+     }
+
+     // Every group with the seed's value and rotation is now accounted for.
+     eraseMatchingBitGroups([Seed](const BitGroup &Group) {
+       return Group.V == Seed.V && Group.RLAmt == Seed.RLAmt;
+     });
+   }
+
+   // Each remaining group is counted as exactly one instruction.
+   if (InstCnt)
+     *InstCnt += BitGroups.size();
+
+   // Add the remaining groups one at a time: a rotate-and-mask for the very
+   // first value, a rotate-and-insert once a partial result exists.
+   for (auto &Group : BitGroups) {
+     SDValue Rot = getI32Imm(Group.RLAmt, dl);
+     SDValue MaskBegin = getI32Imm(Bits.size() - Group.EndIdx - 1, dl);
+     SDValue MaskEnd = getI32Imm(Bits.size() - Group.StartIdx - 1, dl);
+
+     if (Res) {
+       SDValue InsOps[] = {Res, Group.V, Rot, MaskBegin, MaskEnd};
+       Res = SDValue(
+           CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, InsOps), 0);
+     } else {
+       SDValue RotOps[] = {Group.V, Rot, MaskBegin, MaskEnd};
+       Res = SDValue(
+           CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, RotOps), 0);
+     }
+   }
+
+   if (!LateMask)
+     return Res.getNode();
+
+   // Late masking: apply the zeros mask at the very end, split into the
+   // 16-bit halves that andi/andis can encode.
+   unsigned Mask = static_cast<unsigned>(getZerosMask());
+   unsigned ANDIMask = Mask & UINT16_MAX;
+   unsigned ANDISMask = Mask >> 16;
+   assert((ANDIMask != 0 || ANDISMask != 0) &&
+          "No set bits in zeros mask?");
+
+   const bool NeedLow = ANDIMask != 0;
+   const bool NeedHigh = ANDISMask != 0;
+
+   // One instruction per non-empty half, plus an 'or' when both are used.
+   if (InstCnt)
+     *InstCnt += static_cast<unsigned>(NeedLow) +
+                 static_cast<unsigned>(NeedHigh) +
+                 static_cast<unsigned>(NeedLow && NeedHigh);
+
+   SDValue ANDIVal, ANDISVal;
+   if (NeedLow)
+     ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
+                                              Res, getI32Imm(ANDIMask, dl)),
+                       0);
+   if (NeedHigh)
+     ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
+                                               Res, getI32Imm(ANDISMask, dl)),
+                        0);
+
+   if (ANDIVal && ANDISVal)
+     Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+                                          ANDIVal, ANDISVal), 0);
+   else
+     Res = ANDIVal ? ANDIVal : ANDISVal;
+
+   return Res.getNode();
+ }
+
+ // Returns how many instructions (1 or 2) SelectRotMask64 (IsIns == false)
+ // or SelectRotMaskIns64 (IsIns == true) would emit for the given rotation
+ // amount and mask bounds, without creating any nodes.
+ unsigned SelectRotMask64Count(unsigned RLAmt, bool Repl32,
+                               unsigned MaskStart, unsigned MaskEnd,
+                               bool IsIns) {
+   // The 32-bit replicated forms always take a single instruction.
+   if (Repl32)
+     return 1;
+
+   // In the notation used by the instructions, 'start' and 'end' are
+   // reversed because bits are counted from high to low order.
+   const unsigned InstMaskStart = 63 - MaskEnd;
+   const unsigned InstMaskEnd = 63 - MaskStart;
+
+   // A mask that touches either end of the register has a single-instruction
+   // form, but only for a plain rotate-and-mask, not for an insert.
+   const bool EdgeMaskFits =
+       !IsIns && (InstMaskEnd == 63 || InstMaskStart == 0);
+   // A mask whose end is tied to the rotation amount has a single-instruction
+   // form for both the plain and the insert variant.
+   const bool RotationTiedMaskFits = InstMaskEnd == 63 - RLAmt;
+
+   return (EdgeMaskFits || RotationTiedMaskFits) ? 1 : 2;
+ }
+
+ // Emits a rotate-left of V by RLAmt that keeps only bits MaskStart through
+ // MaskEnd. For 64-bit values not every rotate/mask combination exists as a
+ // single instruction; a single one is used when available, otherwise two
+ // are emitted. *InstCnt (when non-null) is bumped once per instruction.
+ SDValue SelectRotMask64(SDValue V, const SDLoc &dl, unsigned RLAmt,
+                         bool Repl32, unsigned MaskStart, unsigned MaskEnd,
+                         unsigned *InstCnt = nullptr) {
+   // In the notation used by the instructions, 'start' and 'end' are
+   // reversed because bits are counted from high to low order.
+   unsigned InstMaskStart = 63 - MaskEnd;
+   unsigned InstMaskEnd = 63 - MaskStart;
+
+   if (InstCnt)
+     ++*InstCnt;
+
+   if (Repl32) {
+     // This rotation amount assumes that the lower 32 bits of the quantity
+     // are replicated in the high 32 bits by the rotation operator (which is
+     // done by rlwinm and friends).
+     assert(InstMaskStart >= 32 && "Mask cannot start out of range");
+     assert(InstMaskEnd >= 32 && "Mask cannot end out of range");
+     SDValue Ops[] = {V, getI32Imm(RLAmt, dl),
+                      getI32Imm(InstMaskStart - 32, dl),
+                      getI32Imm(InstMaskEnd - 32, dl)};
+     return SDValue(
+         CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64, Ops), 0);
+   }
+
+   // Pick the single-instruction form, if one applies. Each form takes the
+   // rotation amount and one mask bound as its immediates.
+   bool HaveSingleInst = true;
+   unsigned Opcode = PPC::RLDICL;
+   unsigned MaskBound = InstMaskStart;
+   if (InstMaskEnd == 63) {
+     // Mask runs to the low-order end: RLDICL with the mask start.
+   } else if (InstMaskStart == 0) {
+     // Mask starts at the high-order end: RLDICR with the mask end.
+     Opcode = PPC::RLDICR;
+     MaskBound = InstMaskEnd;
+   } else if (InstMaskEnd == 63 - RLAmt) {
+     // Mask end is tied to the rotation amount: RLDIC with the mask start.
+     Opcode = PPC::RLDIC;
+   } else {
+     HaveSingleInst = false;
+   }
+
+   if (HaveSingleInst) {
+     SDValue Ops[] = {V, getI32Imm(RLAmt, dl), getI32Imm(MaskBound, dl)};
+     return SDValue(CurDAG->getMachineNode(Opcode, dl, MVT::i64, Ops), 0);
+   }
+
+   // No single instruction fits, so two are used. We are not free to choose
+   // the rotation amount and both mask bounds independently: with arbitrary
+   // mask bounds the rotation amount is fixed. Rotation can be inverted,
+   // though, so an "inverse" rotation is applied first to compensate.
+   if (InstCnt)
+     ++*InstCnt;
+
+   // The second instruction must rotate by MaskStart ...
+   const unsigned SecondRot = MaskStart;
+   // ... and the first one supplies whatever is missing to reach RLAmt.
+   const unsigned FirstRot = (64 + RLAmt - SecondRot) % 64;
+   if (FirstRot)
+     V = SelectRotMask64(V, dl, FirstRot, false, 0, 63);
+   return SelectRotMask64(V, dl, SecondRot, false, MaskStart, MaskEnd);
+ }
+
+ // Emits a rotate-left of V by RLAmt whose bits MaskStart through MaskEnd
+ // are inserted into Base. For 64-bit values not every combination exists
+ // as a single rotate-mask-and-insert; one is used when available, otherwise
+ // a plain rotate is emitted first. *InstCnt (when non-null) is bumped once
+ // per instruction.
+ SDValue SelectRotMaskIns64(SDValue Base, SDValue V, const SDLoc &dl,
+                            unsigned RLAmt, bool Repl32, unsigned MaskStart,
+                            unsigned MaskEnd, unsigned *InstCnt = nullptr) {
+   // In the notation used by the instructions, 'start' and 'end' are
+   // reversed because bits are counted from high to low order.
+   unsigned InstMaskStart = 63 - MaskEnd;
+   unsigned InstMaskEnd = 63 - MaskStart;
+
+   if (InstCnt)
+     ++*InstCnt;
+
+   if (Repl32) {
+     // This rotation amount assumes that the lower 32 bits of the quantity
+     // are replicated in the high 32 bits by the rotation operator (which is
+     // done by rlwinm and friends).
+     assert(InstMaskStart >= 32 && "Mask cannot start out of range");
+     assert(InstMaskEnd >= 32 && "Mask cannot end out of range");
+     SDValue Ops[] = {Base, V, getI32Imm(RLAmt, dl),
+                      getI32Imm(InstMaskStart - 32, dl),
+                      getI32Imm(InstMaskEnd - 32, dl)};
+     return SDValue(
+         CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64, Ops), 0);
+   }
+
+   // The only 64-bit insert form requires the mask end to be tied to the
+   // rotation amount.
+   const bool HaveSingleInst = InstMaskEnd == 63 - RLAmt;
+   if (HaveSingleInst) {
+     SDValue Ops[] = {Base, V, getI32Imm(RLAmt, dl),
+                      getI32Imm(InstMaskStart, dl)};
+     return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops),
+                    0);
+   }
+
+   // No single instruction fits, so two are used. We are not free to choose
+   // the rotation amount and both mask bounds independently: with arbitrary
+   // mask bounds the rotation amount is fixed. Rotation can be inverted,
+   // though, so an "inverse" rotation is applied first to compensate.
+   if (InstCnt)
+     ++*InstCnt;
+
+   // The insert must rotate by MaskStart ...
+   const unsigned SecondRot = MaskStart;
+   // ... and the preceding plain rotate supplies the rest of RLAmt.
+   const unsigned FirstRot = (64 + RLAmt - SecondRot) % 64;
+   if (FirstRot)
+     V = SelectRotMask64(V, dl, FirstRot, false, 0, 63);
+   return SelectRotMaskIns64(Base, V, dl, SecondRot, false, MaskStart,
+                             MaskEnd);
+ }
+
+ // For each (value, rotation) entry in ValueRotsVec, decides whether the
+ // matching bit groups are better produced by one rotate followed by an AND
+ // mask than by one rotate-and-mask/insert per group. When masking wins, the
+ // masked value is OR'ed into Res (or becomes Res if it was empty), the
+ // instruction estimate is added to *InstCnt (when non-null), and the
+ // consumed groups are erased from BitGroups. Does nothing when
+ // BPermRewriterNoMasking is set.
+ void SelectAndParts64(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) {
+ if (BPermRewriterNoMasking)
+ return;
+
+ // The idea here is the same as in the 32-bit version, but with additional
+ // complications from the fact that Repl32 might be true. Because we
+ // aggressively convert bit groups to Repl32 form (which, for small
+ // rotation factors, involves no other change), and then coalesce, it might
+ // be the case that a single 64-bit masking operation could handle both
+ // some Repl32 groups and some non-Repl32 groups. If converting to Repl32
+ // form allowed coalescing, then we must use a 32-bit rotation in order to
+ // completely capture the new combined bit group.
+
+ for (ValueRotInfo &VRI : ValueRotsVec) {
+ uint64_t Mask = 0;
+
+ // We need to add to the mask all bits from the associated bit groups.
+ // If Repl32 is false, we need to add bits from bit groups that have
+ // Repl32 true, but are trivially convertible to Repl32 false. Such a
+ // group is trivially convertible if it overlaps only with the lower 32
+ // bits, and the group has not been coalesced.
+ auto MatchingBG = [VRI](const BitGroup &BG) {
+ if (VRI.V != BG.V)
+ return false;
+
+ unsigned EffRLAmt = BG.RLAmt;
+ if (!VRI.Repl32 && BG.Repl32) {
+ if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx <= BG.EndIdx &&
+ !BG.Repl32Coalesced) {
+ if (BG.Repl32CR)
+ EffRLAmt += 32;
+ } else {
+ return false;
+ }
+ } else if (VRI.Repl32 != BG.Repl32) {
+ return false;
+ }
+
+ return VRI.RLAmt == EffRLAmt;
+ };
+
+ // Build the AND mask covering every bit of every matching group. A group
+ // with StartIdx > EndIdx wraps around the top of the value.
+ for (auto &BG : BitGroups) {
+ if (!MatchingBG(BG))
+ continue;
+
+ if (BG.StartIdx <= BG.EndIdx) {
+ for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i)
+ Mask |= (UINT64_C(1) << i);
+ } else {
+ for (unsigned i = BG.StartIdx; i < Bits.size(); ++i)
+ Mask |= (UINT64_C(1) << i);
+ for (unsigned i = 0; i <= BG.EndIdx; ++i)
+ Mask |= (UINT64_C(1) << i);
+ }
+ }
+
+ // We can use the 32-bit andi/andis technique if the mask does not
+ // require any higher-order bits. This can save an instruction compared
+ // to always using the general 64-bit technique.
+ bool Use32BitInsts = isUInt<32>(Mask);
+ // Compute the masks for andi/andis that would be necessary.
+ unsigned ANDIMask = (Mask & UINT16_MAX),
+ ANDISMask = (Mask >> 16) & UINT16_MAX;
+
+ bool NeedsRotate = VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask));
+
+ // Cost of the masking approach: the optional rotate, the OR into an
+ // existing Res, and either the andi/andis pair (plus their OR) or the
+ // constant materialization plus one AND.
+ unsigned NumAndInsts = (unsigned) NeedsRotate +
+ (unsigned) (bool) Res;
+ if (Use32BitInsts)
+ NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) +
+ (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+ else
+ NumAndInsts += getInt64Count(Mask) + /* and */ 1;
+
+ // Cost of the rotate approach: the first matching group is counted as a
+ // plain rotate-and-mask, every later one as an insert (IsIns = !FirstBG).
+ // MoreBG records whether any group is left over for other entries.
+ unsigned NumRLInsts = 0;
+ bool FirstBG = true;
+ bool MoreBG = false;
+ for (auto &BG : BitGroups) {
+ if (!MatchingBG(BG)) {
+ MoreBG = true;
+ continue;
+ }
+ NumRLInsts +=
+ SelectRotMask64Count(BG.RLAmt, BG.Repl32, BG.StartIdx, BG.EndIdx,
+ !FirstBG);
+ FirstBG = false;
+ }
+
+ DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
+ " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") <<
+ "\n\t\t\tisel using masking: " << NumAndInsts <<
+ " using rotates: " << NumRLInsts << "\n");
+
+ // When we'd use andi/andis, we bias toward using the rotates (andi only
+ // has a record form, and is cracked on POWER cores). However, when using
+ // general 64-bit constant formation, bias toward the constant form,
+ // because that exposes more opportunities for CSE.
+ if (NumAndInsts > NumRLInsts)
+ continue;
+ // When merging multiple bit groups, instruction or is used.
+ // But when rotate is used, rldimi can insert the rotated value into any
+ // register, so instruction or can be avoided.
+ if ((Use32BitInsts || MoreBG) && NumAndInsts == NumRLInsts)
+ continue;
+
+ DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+
+ if (InstCnt) *InstCnt += NumAndInsts;
+
+ SDValue VRot;
+ // We actually need to generate a rotation if we have a non-zero rotation
+ // factor or, in the Repl32 case, if we care about any of the
+ // higher-order replicated bits. In the latter case, we generate a mask
+ // backward so that it actually includes the entire 64 bits.
+ // (This is the same condition as NeedsRotate above; InstCnt is not passed
+ // because the rotate is already included in NumAndInsts.)
+ if (VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask)))
+ VRot = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
+ VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63);
+ else
+ VRot = VRI.V;
+
+ SDValue TotalVal;
+ if (Use32BitInsts) {
+ assert((ANDIMask != 0 || ANDISMask != 0) &&
+ "No set bits in mask when using 32-bit ands for 64-bit value");
+
+ SDValue ANDIVal, ANDISVal;
+ if (ANDIMask != 0)
+ ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
+ VRot, getI32Imm(ANDIMask, dl)), 0);
+ if (ANDISMask != 0)
+ ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
+ VRot, getI32Imm(ANDISMask, dl)), 0);
+
+ // Combine the halves; only one may be present.
+ if (!ANDIVal)
+ TotalVal = ANDISVal;
+ else if (!ANDISVal)
+ TotalVal = ANDIVal;
+ else
+ TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+ ANDIVal, ANDISVal), 0);
+ } else {
+ // General case: materialize the 64-bit mask and AND with it.
+ TotalVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
+ TotalVal =
+ SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+ VRot, TotalVal), 0);
+ }
+
+ if (!Res)
+ Res = TotalVal;
+ else
+ Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+ Res, TotalVal), 0);
+
+ // Now, remove all groups with this underlying value and rotation
+ // factor.
+ eraseMatchingBitGroups(MatchingBG);
+ }
+ }
+
+ // Instruction selection for the 64-bit case.
+ //
+ // Emits the machine nodes that assemble the permuted 64-bit value from the
+ // collected bit groups and returns the final node. LateMask selects late
+ // masking: zeros are applied with a final AND instead of being respected at
+ // every step. When InstCnt is non-null it is reset to zero and then
+ // receives the number of instructions counted for the sequence.
+ SDNode *Select64(SDNode *N, bool LateMask, unsigned *InstCnt) {
+ SDLoc dl(N);
+ SDValue Res;
+
+ if (InstCnt) *InstCnt = 0;
+
+ // Take care of cases that should use andi/andis first.
+ SelectAndParts64(dl, Res, InstCnt);
+
+ // If we've not yet selected a 'starting' instruction, and we have no zeros
+ // to fill in, select the (Value, RLAmt) with the highest priority (largest
+ // number of groups), and start with this rotated value.
+ if ((!HasZeros || LateMask) && !Res) {
+ // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32
+ // groups will come first, and so the VRI representing the largest number
+ // of groups might not be first (it might be the first Repl32 groups).
+ unsigned MaxGroupsIdx = 0;
+ if (!ValueRotsVec[0].Repl32) {
+ // Only the first Repl32 entry is compared against entry 0; the scan
+ // stops there.
+ for (unsigned i = 0, ie = ValueRotsVec.size(); i < ie; ++i)
+ if (ValueRotsVec[i].Repl32) {
+ if (ValueRotsVec[i].NumGroups > ValueRotsVec[0].NumGroups)
+ MaxGroupsIdx = i;
+ break;
+ }
+ }
+
+ ValueRotInfo &VRI = ValueRotsVec[MaxGroupsIdx];
+ bool NeedsRotate = false;
+ if (VRI.RLAmt) {
+ NeedsRotate = true;
+ } else if (VRI.Repl32) {
+ for (auto &BG : BitGroups) {
+ if (BG.V != VRI.V || BG.RLAmt != VRI.RLAmt ||
+ BG.Repl32 != VRI.Repl32)
+ continue;
+
+ // We don't need a rotate if the bit group is confined to the lower
+ // 32 bits.
+ // NOTE(review): SelectAndParts64 tests StartIdx <= EndIdx for the same
+ // "confined to the lower 32 bits" check; with '<' here a single-bit
+ // group takes the rotate path. Confirm whether that is intended.
+ if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx < BG.EndIdx)
+ continue;
+
+ NeedsRotate = true;
+ break;
+ }
+ }
+
+ if (NeedsRotate)
+ Res = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
+ VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63,
+ InstCnt);
+ else
+ Res = VRI.V;
+
+ // Now, remove all groups with this underlying value and rotation factor.
+ if (Res)
+ eraseMatchingBitGroups([VRI](const BitGroup &BG) {
+ return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt &&
+ BG.Repl32 == VRI.Repl32;
+ });
+ }
+
+ // Because 64-bit rotates are more flexible than inserts, we might have a
+ // preference regarding which one we do first (to save one instruction).
+ // The first group that is cheaper as a plain rotate than as an insert is
+ // moved to the front of BitGroups so that it becomes the starting value.
+ if (!Res)
+ for (auto I = BitGroups.begin(), IE = BitGroups.end(); I != IE; ++I) {
+ if (SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
+ false) <
+ SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
+ true)) {
+ if (I != BitGroups.begin()) {
+ // Copy the group before erasing; the loop exits right after, so the
+ // iterators are not used again.
+ BitGroup BG = *I;
+ BitGroups.erase(I);
+ BitGroups.insert(BitGroups.begin(), BG);
+ }
+
+ break;
+ }
+ }
+
+ // Insert the other groups (one at a time).
+ for (auto &BG : BitGroups) {
+ if (!Res)
+ Res = SelectRotMask64(BG.V, dl, BG.RLAmt, BG.Repl32, BG.StartIdx,
+ BG.EndIdx, InstCnt);
+ else
+ Res = SelectRotMaskIns64(Res, BG.V, dl, BG.RLAmt, BG.Repl32,
+ BG.StartIdx, BG.EndIdx, InstCnt);
+ }
+
+ if (LateMask) {
+ uint64_t Mask = getZerosMask();
+
+ // We can use the 32-bit andi/andis technique if the mask does not
+ // require any higher-order bits. This can save an instruction compared
+ // to always using the general 64-bit technique.
+ bool Use32BitInsts = isUInt<32>(Mask);
+ // Compute the masks for andi/andis that would be necessary.
+ unsigned ANDIMask = (Mask & UINT16_MAX),
+ ANDISMask = (Mask >> 16) & UINT16_MAX;
+
+ if (Use32BitInsts) {
+ assert((ANDIMask != 0 || ANDISMask != 0) &&
+ "No set bits in mask when using 32-bit ands for 64-bit value");
+
+ // One instruction per non-empty half, plus an 'or' when both are used.
+ if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
+ (unsigned) (ANDISMask != 0) +
+ (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+
+ SDValue ANDIVal, ANDISVal;
+ if (ANDIMask != 0)
+ ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
+ Res, getI32Imm(ANDIMask, dl)), 0);
+ if (ANDISMask != 0)
+ ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
+ Res, getI32Imm(ANDISMask, dl)), 0);
+
+ if (!ANDIVal)
+ Res = ANDISVal;
+ else if (!ANDISVal)
+ Res = ANDIVal;
+ else
+ Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+ ANDIVal, ANDISVal), 0);
+ } else {
+ // General case: materialize the 64-bit mask and AND with it.
+ if (InstCnt) *InstCnt += getInt64Count(Mask) + /* and */ 1;
+
+ SDValue MaskVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
+ Res =
+ SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+ Res, MaskVal), 0);
+ }
+ }
+
+ return Res.getNode();
+ }
+
+ // Runs one complete selection attempt for N using either early masking
+ // (LateMask == false) or late masking (LateMask == true).
+ //
+ // Returns the final node of the emitted sequence, or nullptr when no bit
+ // groups were collected. When InstCnt is non-null it always receives the
+ // instruction count of the attempt (zero if nothing was selected), so
+ // callers may read it even after a nullptr return.
+ SDNode *Select(SDNode *N, bool LateMask, unsigned *InstCnt = nullptr) {
+   // Give the out-parameter a defined value on every path, including the
+   // early return below; Select32/Select64 reset it again themselves.
+   if (InstCnt)
+     *InstCnt = 0;
+
+   // Fill in BitGroups.
+   collectBitGroups(LateMask);
+   if (BitGroups.empty())
+     return nullptr;
+
+   // For 64-bit values, figure out when we can use 32-bit instructions.
+   if (Bits.size() == 64)
+     assignRepl32BitGroups();
+
+   // Fill in ValueRotsVec.
+   collectValueRotInfo();
+
+   if (Bits.size() == 32)
+     return Select32(N, LateMask, InstCnt);
+
+   assert(Bits.size() == 64 && "Not 64 bits here?");
+   return Select64(N, LateMask, InstCnt);
+ }
+
+ // Removes from BitGroups every group for which the predicate F is true.
+ void eraseMatchingBitGroups(function_ref<bool(const BitGroup &)> F) {
+   auto FirstRemoved = remove_if(BitGroups, F);
+   BitGroups.erase(FirstRemoved, BitGroups.end());
+ }
+
+ // Per-bit description of the value being selected, as produced by
+ // getValueBits; its size (32 or 64) picks the 32- or 64-bit selection path.
+ SmallVector<ValueBit, 64> Bits;
+
+ // Set by computeRotationAmounts; when false, selection skips the
+ // early/late masking comparison.
+ bool HasZeros;
+ // Filled in by computeRotationAmounts.
+ SmallVector<unsigned, 64> RLAmt;
+
+ // Groups still to be emitted; filled by collectBitGroups and consumed via
+ // eraseMatchingBitGroups as selection proceeds.
+ SmallVector<BitGroup, 16> BitGroups;
+
+ // Keyed by an (SDValue, unsigned) pair -- presumably the source value and
+ // its rotation amount; confirm against collectValueRotInfo.
+ DenseMap<std::pair<SDValue, unsigned>, ValueRotInfo> ValueRots;
+ // Filled by collectValueRotInfo; entry 0 is treated as the highest-priority
+ // (value, rotation) pair by the selection routines.
+ SmallVector<ValueRotInfo, 16> ValueRotsVec;
+
+ // DAG in which all machine nodes are created.
+ SelectionDAG *CurDAG;
+
+public:
+ // Creates a selector that emits its machine nodes into DAG. Marked explicit
+ // so that a SelectionDAG pointer never converts to a selector implicitly.
+ explicit BitPermutationSelector(SelectionDAG *DAG)
+   : CurDAG(DAG) {}
+
+ // Here we try to match complex bit permutations into a set of
+ // rotate-and-shift/shift/and/or instructions, using a set of heuristics
+ // known to produce optimal code for common cases (like i32 byte swapping).
+ //
+ // Returns the final node of the selected sequence, or nullptr when the bits
+ // of N cannot be determined or no bit groups could be formed.
+ SDNode *Select(SDNode *N) {
+   Memoizer.clear();
+   auto Result =
+     getValueBits(SDValue(N, 0), N->getValueType(0).getSizeInBits());
+   if (!Result.first)
+     return nullptr;
+   Bits = std::move(*Result.second);
+
+   DEBUG(dbgs() << "Considering bit-permutation-based instruction"
+                   " selection for: ");
+   DEBUG(N->dump(CurDAG));
+
+   // Fill in RLAmt and set HasZeros.
+   computeRotationAmounts();
+
+   if (!HasZeros)
+     return Select(N, false);
+
+   // We currently have two techniques for handling results with zeros: early
+   // masking (the default) and late masking. Late masking is sometimes more
+   // efficient, but because the structure of the bit groups is different, it
+   // is hard to tell without generating both and comparing the results. With
+   // late masking, we ignore zeros in the resulting value when inserting each
+   // set of bit groups, and then mask in the zeros at the end. With early
+   // masking, we only insert the non-zero parts of the result at every step.
+
+   // Both counters start at zero: the inner Select can return nullptr before
+   // it writes the count (no bit groups), and the counts are read below
+   // regardless.
+   unsigned InstCnt = 0, InstCntLateMask = 0;
+   DEBUG(dbgs() << "\tEarly masking:\n");
+   SDNode *RN = Select(N, false, &InstCnt);
+   DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
+
+   DEBUG(dbgs() << "\tLate masking:\n");
+   SDNode *RNLM = Select(N, true, &InstCntLateMask);
+   DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask <<
+                   " instructions\n");
+
+   // Ties go to early masking, the default technique.
+   if (InstCnt <= InstCntLateMask) {
+     DEBUG(dbgs() << "\tUsing early-masking for isel\n");
+     return RN;
+   }
+
+   DEBUG(dbgs() << "\tUsing late-masking for isel\n");
+   return RNLM;
+ }
+};
+} // anonymous namespace
+
+/// Try to select N as a bit permutation. Only i32/i64 ROTL, SHL, SRL, AND
+/// and OR nodes are considered, and only when UseBitPermRewriter is enabled.
+/// Returns true if N was replaced with the selected sequence.
+bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) {
+  const auto ResultVT = N->getValueType(0);
+  const bool IsI32OrI64 = !(ResultVT != MVT::i32 && ResultVT != MVT::i64);
+  if (!IsI32OrI64)
+    return false;
+
+  if (!UseBitPermRewriter)
+    return false;
+
+  // Anything other than the handled opcodes is left to the other selectors.
+  switch (N->getOpcode()) {
+  case ISD::ROTL:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::AND:
+  case ISD::OR:
+    break;
+  default:
+    return false;
+  }
+
+  BitPermutationSelector BPS(CurDAG);
+  SDNode *New = BPS.Select(N);
+  if (!New)
+    return false;
+
+  ReplaceNode(N, New);
+  return true;
+}
+
+/// SelectCC - Select a comparison of the specified values with the specified
+/// condition code, returning the CR# of the expression.
+///
+/// For i32 and i64 operands an immediate compare is emitted when RHS is a
+/// constant that fits the 16-bit immediate of the chosen compare; equality
+/// tests against wider constants are split into an xoris of the upper half
+/// followed by a logical compare of the lower half. Otherwise a
+/// register-register compare is emitted: logical for equality and unsigned
+/// conditions, signed for the rest. f32 uses FCMPUS; f64 uses XSCMPUDP when
+/// VSX is available and FCMPUD otherwise.
+SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ const SDLoc &dl) {
+ // Always select the LHS.
+ unsigned Opc;
+
+ if (LHS.getValueType() == MVT::i32) {
+ unsigned Imm;
+ if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+ if (isInt32Immediate(RHS, Imm)) {
+ // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+ if (isUInt<16>(Imm))
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+ getI32Imm(Imm & 0xFFFF, dl)),
+ 0);
+ // If this is a 16-bit signed immediate, fold it.
+ if (isInt<16>((int)Imm))
+ return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+ getI32Imm(Imm & 0xFFFF, dl)),
+ 0);
+
+ // For non-equality comparisons, the default code would materialize the
+ // constant, then compare against it, like this:
+ // lis r2, 4660
+ // ori r2, r2, 22136
+ // cmpw cr0, r3, r2
+ // Since we are just comparing for equality, we can emit this instead:
+ // xoris r0,r3,0x1234
+ // cmplwi cr0,r0,0x5678
+ // beq cr0,L6
+ SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS,
+ getI32Imm(Imm >> 16, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor,
+ getI32Imm(Imm & 0xFFFF, dl)), 0);
+ }
+ // Non-constant RHS: equality uses the logical register compare.
+ Opc = PPC::CMPLW;
+ } else if (ISD::isUnsignedIntSetCC(CC)) {
+ // Unsigned conditions: logical compare, immediate form if it fits.
+ if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm))
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+ getI32Imm(Imm & 0xFFFF, dl)), 0);
+ Opc = PPC::CMPLW;
+ } else {
+ // Signed conditions: arithmetic compare, immediate form if it fits.
+ short SImm;
+ if (isIntS16Immediate(RHS, SImm))
+ return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+ getI32Imm((int)SImm & 0xFFFF,
+ dl)),
+ 0);
+ Opc = PPC::CMPW;
+ }
+ } else if (LHS.getValueType() == MVT::i64) {
+ uint64_t Imm;
+ if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+ if (isInt64Immediate(RHS.getNode(), Imm)) {
+ // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+ if (isUInt<16>(Imm))
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+ getI32Imm(Imm & 0xFFFF, dl)),
+ 0);
+ // If this is a 16-bit signed immediate, fold it.
+ if (isInt<16>(Imm))
+ return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+ getI32Imm(Imm & 0xFFFF, dl)),
+ 0);
+
+ // For non-equality comparisons, the default code would materialize the
+ // constant, then compare against it, like this:
+ // lis r2, 4660
+ // ori r2, r2, 22136
+ // cmpd cr0, r3, r2
+ // Since we are just comparing for equality, we can emit this instead:
+ // xoris r0,r3,0x1234
+ // cmpldi cr0,r0,0x5678
+ // beq cr0,L6
+ // Only done when the constant fits in 32 bits; wider constants fall
+ // through to the register-register compare below.
+ if (isUInt<32>(Imm)) {
+ SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS,
+ getI64Imm(Imm >> 16, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor,
+ getI64Imm(Imm & 0xFFFF, dl)),
+ 0);
+ }
+ }
+ Opc = PPC::CMPLD;
+ } else if (ISD::isUnsignedIntSetCC(CC)) {
+ // Unsigned conditions: logical compare, immediate form if it fits.
+ if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm))
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+ getI64Imm(Imm & 0xFFFF, dl)), 0);
+ Opc = PPC::CMPLD;
+ } else {
+ // Signed conditions: arithmetic compare, immediate form if it fits.
+ short SImm;
+ if (isIntS16Immediate(RHS, SImm))
+ return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+ getI64Imm(SImm & 0xFFFF, dl)),
+ 0);
+ Opc = PPC::CMPD;
+ }
+ } else if (LHS.getValueType() == MVT::f32) {
+ Opc = PPC::FCMPUS;
+ } else {
+ assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
+ Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
+ }
+ // No immediate form applied: compare the two operands directly.
+ return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
+}
+
+/// Map a SetCC condition code to the PPC branch predicate that tests it.
+/// Ordered and plain forms of a comparison share a predicate; SETUNE, SETULE
+/// and SETUGE map to the same predicates as SETNE, SETLE and SETGE.
+static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETEQ:
+  case ISD::SETOEQ:
+    return PPC::PRED_EQ;
+  case ISD::SETNE:
+  case ISD::SETUNE:
+    return PPC::PRED_NE;
+  case ISD::SETLT:
+  case ISD::SETOLT:
+  case ISD::SETULT: // Invalid for floating point; assumed to be integer.
+    return PPC::PRED_LT;
+  case ISD::SETLE:
+  case ISD::SETULE:
+    return PPC::PRED_LE;
+  case ISD::SETGT:
+  case ISD::SETOGT:
+  case ISD::SETUGT: // Invalid for floating point; assumed to be integer.
+    return PPC::PRED_GT;
+  case ISD::SETGE:
+  case ISD::SETUGE:
+    return PPC::PRED_GE;
+  case ISD::SETO:
+    return PPC::PRED_NU;
+  case ISD::SETUO:
+    return PPC::PRED_UN;
+  // These conditions must not reach instruction selection.
+  case ISD::SETUEQ:
+  case ISD::SETONE:
+  case ISD::SETOLE:
+  case ISD::SETOGE:
+    llvm_unreachable("Should be lowered by legalize!");
+  default:
+    llvm_unreachable("Unknown condition!");
+  }
+}
+
+/// getCRIdxForSetCC - Map a SetCC condition code to the index of the
+/// condition register field bit that decides it. Invert is set to true when
+/// the condition holds if that bit is clear (e.g. lt = bit 0; ge = bit 0,
+/// inverted) and to false otherwise.
+static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) {
+  Invert = false;
+  switch (CC) {
+  // Bit #0: less-than.
+  case ISD::SETOLT:
+  case ISD::SETLT:
+  case ISD::SETULT: // Invalid for floating point; assumed to be integer.
+    return 0;
+  case ISD::SETUGE:
+  case ISD::SETGE:
+    Invert = true;
+    return 0;
+
+  // Bit #1: greater-than.
+  case ISD::SETOGT:
+  case ISD::SETGT:
+  case ISD::SETUGT: // Invalid for floating point; assumed to be integer.
+    return 1;
+  case ISD::SETULE:
+  case ISD::SETLE:
+    Invert = true;
+    return 1;
+
+  // Bit #2: equal.
+  case ISD::SETOEQ:
+  case ISD::SETEQ:
+    return 2;
+  case ISD::SETUNE:
+  case ISD::SETNE:
+    Invert = true;
+    return 2;
+
+  // Bit #3: unordered.
+  case ISD::SETUO:
+    return 3;
+  case ISD::SETO:
+    Invert = true;
+    return 3;
+
+  // These conditions must not reach instruction selection.
+  case ISD::SETUEQ:
+  case ISD::SETOGE:
+  case ISD::SETOLE:
+  case ISD::SETONE:
+    llvm_unreachable("Invalid branch code: should be expanded by legalize");
+  default:
+    llvm_unreachable("Unknown condition!");
+  }
+}
+
+// getVCmpInst: return the vector compare instruction for the specified
+// vector type and condition code. The integer types v16i8, v8i16, v4i32 and
+// v2i64 and the floating-point types v4f32 and v2f64 are handled; for v4f32
+// the VSX form is returned when HasVSX is true and the Altivec form
+// otherwise.
+//
+// Conditions without a direct instruction are rewritten in two steps: Swap
+// is set when the caller must exchange the two operands, and Negate is set
+// when the caller must invert the result of the returned instruction.
+static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
+ bool HasVSX, bool &Swap, bool &Negate) {
+ Swap = false;
+ Negate = false;
+
+ if (VecVT.isFloatingPoint()) {
+ /* Handle some cases by swapping input operands. */
+ switch (CC) {
+ case ISD::SETLE: CC = ISD::SETGE; Swap = true; break;
+ case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
+ case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break;
+ case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break;
+ case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
+ case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break;
+ default: break;
+ }
+ /* Handle some cases by negating the result. */
+ // This runs on the condition produced by the swap step above, so a single
+ // input condition can end up both swapped and negated.
+ switch (CC) {
+ case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
+ case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break;
+ case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break;
+ case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break;
+ default: break;
+ }
+ /* We have instructions implementing the remaining cases. */
+ switch (CC) {
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ if (VecVT == MVT::v4f32)
+ return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
+ else if (VecVT == MVT::v2f64)
+ return PPC::XVCMPEQDP;
+ break;
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ if (VecVT == MVT::v4f32)
+ return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
+ else if (VecVT == MVT::v2f64)
+ return PPC::XVCMPGTDP;
+ break;
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ if (VecVT == MVT::v4f32)
+ return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP;
+ else if (VecVT == MVT::v2f64)
+ return PPC::XVCMPGEDP;
+ break;
+ default:
+ break;
+ }
+ // Reached for an unhandled condition or an unhandled vector type.
+ llvm_unreachable("Invalid floating-point vector compare condition");
+ } else {
+ /* Handle some cases by swapping input operands. */
+ switch (CC) {
+ case ISD::SETGE: CC = ISD::SETLE; Swap = true; break;
+ case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
+ case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
+ case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break;
+ default: break;
+ }
+ /* Handle some cases by negating the result. */
+ // As above, this runs on the condition produced by the swap step.
+ switch (CC) {
+ case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
+ case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break;
+ case ISD::SETLE: CC = ISD::SETGT; Negate = true; break;
+ case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break;
+ default: break;
+ }
+ /* We have instructions implementing the remaining cases. */
+ switch (CC) {
+ case ISD::SETEQ:
+ case ISD::SETUEQ:
+ if (VecVT == MVT::v16i8)
+ return PPC::VCMPEQUB;
+ else if (VecVT == MVT::v8i16)
+ return PPC::VCMPEQUH;
+ else if (VecVT == MVT::v4i32)
+ return PPC::VCMPEQUW;
+ else if (VecVT == MVT::v2i64)
+ return PPC::VCMPEQUD;
+ break;
+ case ISD::SETGT:
+ if (VecVT == MVT::v16i8)
+ return PPC::VCMPGTSB;
+ else if (VecVT == MVT::v8i16)
+ return PPC::VCMPGTSH;
+ else if (VecVT == MVT::v4i32)
+ return PPC::VCMPGTSW;
+ else if (VecVT == MVT::v2i64)
+ return PPC::VCMPGTSD;
+ break;
+ case ISD::SETUGT:
+ if (VecVT == MVT::v16i8)
+ return PPC::VCMPGTUB;
+ else if (VecVT == MVT::v8i16)
+ return PPC::VCMPGTUH;
+ else if (VecVT == MVT::v4i32)
+ return PPC::VCMPGTUW;
+ else if (VecVT == MVT::v2i64)
+ return PPC::VCMPGTUD;
+ break;
+ default:
+ break;
+ }
+ // Reached for an unhandled condition or an unhandled vector type.
+ llvm_unreachable("Invalid integer vector compare condition");
+ }
+}
+
+bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
+ SDLoc dl(N);
+ unsigned Imm;
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ EVT PtrVT =
+ CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
+ bool isPPC64 = (PtrVT == MVT::i64);
+
+ if (!PPCSubTarget->useCRBits() &&
+ isInt32Immediate(N->getOperand(1), Imm)) {
+ // We can codegen setcc op, imm very efficiently compared to a brcond.
+ // Check for those cases here.
+ // setcc op, 0
+ if (Imm == 0) {
+ SDValue Op = N->getOperand(0);
+ switch (CC) {
+ default: break;
+ case ISD::SETEQ: {
+ Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
+ SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl),
+ getI32Imm(31, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
+ }
+ case ISD::SETNE: {
+ if (isPPC64) break;
+ SDValue AD =
+ SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+ Op, getI32Imm(~0U, dl)), 0);
+ CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, AD.getValue(1));
+ return true;
+ }
+ case ISD::SETLT: {
+ SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl),
+ getI32Imm(31, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
+ }
+ case ISD::SETGT: {
+ SDValue T =
+ SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0);
+ T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
+ SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl),
+ getI32Imm(31, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
+ }
+ }
+ } else if (Imm == ~0U) { // setcc op, -1
+ SDValue Op = N->getOperand(0);
+ switch (CC) {
+ default: break;
+ case ISD::SETEQ:
+ if (isPPC64) break;
+ Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+ Op, getI32Imm(1, dl)), 0);
+ CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+ SDValue(CurDAG->getMachineNode(PPC::LI, dl,
+ MVT::i32,
+ getI32Imm(0, dl)),
+ 0), Op.getValue(1));
+ return true;
+ case ISD::SETNE: {
+ if (isPPC64) break;
+ Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
+ SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+ Op, getI32Imm(~0U, dl));
+ CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), Op,
+ SDValue(AD, 1));
+ return true;
+ }
+ case ISD::SETLT: {
+ SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op,
+ getI32Imm(1, dl)), 0);
+ SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD,
+ Op), 0);
+ SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl),
+ getI32Imm(31, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
+ }
+ case ISD::SETGT: {
+ SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl),
+ getI32Imm(31, dl) };
+ Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
+ CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, getI32Imm(1, dl));
+ return true;
+ }
+ }
+ }
+ }
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Altivec Vector compare instructions do not set any CR register by default and
+ // vector compare operations return the same type as the operands.
+ if (LHS.getValueType().isVector()) {
+ if (PPCSubTarget->hasQPX())
+ return false;
+
+ EVT VecVT = LHS.getValueType();
+ bool Swap, Negate;
+ unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC,
+ PPCSubTarget->hasVSX(), Swap, Negate);
+ if (Swap)
+ std::swap(LHS, RHS);
+
+ EVT ResVT = VecVT.changeVectorElementTypeToInteger();
+ if (Negate) {
+ SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0);
+ CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR,
+ ResVT, VCmp, VCmp);
+ return true;
+ }
+
+ CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS);
+ return true;
+ }
+
+ if (PPCSubTarget->useCRBits())
+ return false;
+
+ bool Inv;
+ unsigned Idx = getCRIdxForSetCC(CC, Inv);
+ SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
+ SDValue IntCR;
+
+ // Force the ccreg into CR7.
+ SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
+
+ SDValue InFlag(nullptr, 0); // Null incoming flag value.
+ CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
+ InFlag).getValue(1);
+
+ IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
+ CCReg), 0);
+
+ SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl),
+ getI32Imm(31, dl), getI32Imm(31, dl) };
+ if (!Inv) {
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
+ }
+
+ // Get the specified bit.
+ SDValue Tmp =
+ SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
+ CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl));
+ return true;
+}
+
+ // Attach the memory operand of the memory node N to the machine node Result.
+ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
+   // A one-element memref array is requested from the MachineFunction.
+   MachineSDNode::mmo_iterator Refs = MF->allocateMemRefsArray(1);
+   *Refs = cast<MemSDNode>(N)->getMemOperand();
+
+   // Hand the [Refs, Refs + 1) range to the already-selected node.
+   MachineSDNode *Selected = cast<MachineSDNode>(Result);
+   Selected->setMemRefs(Refs, Refs + 1);
+ }
+
+
+ // Select - Convert N from a target-independent to a target-specific node if it
+ // hasn't already been selected; cases not matched by hand here go to SelectCode.
+ void PPCDAGToDAGISel::Select(SDNode *N) {
+ SDLoc dl(N);
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
+ return; // Already selected.
+ }
+
+ // In case any misguided DAG-level optimizations form an ADD with a
+ // TargetConstant operand, crash here instead of miscompiling (by selecting
+ // an r+r add instead of some kind of r+i add).
+ if (N->getOpcode() == ISD::ADD &&
+ N->getOperand(1).getOpcode() == ISD::TargetConstant)
+ llvm_unreachable("Invalid ADD with TargetConstant operand");
+
+ // Try matching complex bit permutations before doing anything else.
+ if (tryBitPermutation(N))
+ return;
+
+ switch (N->getOpcode()) {
+ default: break;
+ // i64 constants are built by getInt64(); other constants go to SelectCode.
+ case ISD::Constant: {
+ if (N->getValueType(0) == MVT::i64) {
+ ReplaceNode(N, getInt64(CurDAG, N));
+ return;
+ }
+ break;
+ }
+
+ case ISD::SETCC: {
+ if (trySETCC(N))
+ return;
+ break;
+ }
+ case PPCISD::GlobalBaseReg:
+ ReplaceNode(N, getGlobalBaseReg());
+ return;
+
+ case ISD::FrameIndex:
+ selectFrameIndex(N, N);
+ return;
+ // Re-emit the node as the PPC::MFOCRF machine instruction, keeping its glue input.
+ case PPCISD::MFOCRF: {
+ SDValue InFlag = N->getOperand(1);
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
+ N->getOperand(0), InFlag));
+ return;
+ }
+ // ReadTB yields two i32 values plus a chain (MVT::Other).
+ case PPCISD::READ_TIME_BASE: {
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32,
+ MVT::Other, N->getOperand(0)));
+ return;
+ }
+ // Selected as SRADI/SRAWI glued to ADDZE8/ADDZE, chosen by the result type.
+ case PPCISD::SRA_ADDZE: {
+ SDValue N0 = N->getOperand(0);
+ SDValue ShiftAmt =
+ CurDAG->getTargetConstant(*cast<ConstantSDNode>(N->getOperand(1))->
+ getConstantIntValue(), dl,
+ N->getValueType(0));
+ if (N->getValueType(0) == MVT::i64) {
+ SDNode *Op =
+ CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, MVT::Glue,
+ N0, ShiftAmt);
+ CurDAG->SelectNodeTo(N, PPC::ADDZE8, MVT::i64, SDValue(Op, 0),
+ SDValue(Op, 1));
+ return;
+ } else {
+ assert(N->getValueType(0) == MVT::i32 &&
+ "Expecting i64 or i32 in PPCISD::SRA_ADDZE");
+ SDNode *Op =
+ CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
+ N0, ShiftAmt);
+ CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, SDValue(Op, 0),
+ SDValue(Op, 1));
+ return;
+ }
+ }
+
+ case ISD::LOAD: {
+ // Handle preincrement loads.
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT LoadedVT = LD->getMemoryVT();
+
+ // Normal loads are handled by code generated from the .td file.
+ if (LD->getAddressingMode() != ISD::PRE_INC)
+ break;
+ // A constant or global-address offset selects the *U opcodes; anything else the *UX ones.
+ SDValue Offset = LD->getOffset();
+ if (Offset.getOpcode() == ISD::TargetConstant ||
+ Offset.getOpcode() == ISD::TargetGlobalAddress) {
+
+ unsigned Opcode;
+ bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+ if (LD->getValueType(0) != MVT::i64) {
+ // Handle PPC32 integer and normal FP loads.
+ assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid PPC load type!");
+ case MVT::f64: Opcode = PPC::LFDU; break;
+ case MVT::f32: Opcode = PPC::LFSU; break;
+ case MVT::i32: Opcode = PPC::LWZU; break;
+ case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break;
+ case MVT::i1:
+ case MVT::i8: Opcode = PPC::LBZU; break;
+ }
+ } else {
+ assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+ assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid PPC load type!");
+ case MVT::i64: Opcode = PPC::LDU; break;
+ case MVT::i32: Opcode = PPC::LWZU8; break;
+ case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
+ case MVT::i1:
+ case MVT::i8: Opcode = PPC::LBZU8; break;
+ }
+ }
+ // Operand order for these opcodes: offset, base, chain.
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[] = { Offset, Base, Chain };
+ SDNode *MN = CurDAG->getMachineNode(
+ Opcode, dl, LD->getValueType(0),
+ PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops);
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
+ } else {
+ unsigned Opcode;
+ bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+ if (LD->getValueType(0) != MVT::i64) {
+ // Handle PPC32 integer and normal FP loads.
+ assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid PPC load type!");
+ case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX
+ case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX
+ case MVT::f64: Opcode = PPC::LFDUX; break;
+ case MVT::f32: Opcode = PPC::LFSUX; break;
+ case MVT::i32: Opcode = PPC::LWZUX; break;
+ case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break;
+ case MVT::i1:
+ case MVT::i8: Opcode = PPC::LBZUX; break;
+ }
+ } else {
+ assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+ assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) &&
+ "Invalid sext update load");
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid PPC load type!");
+ case MVT::i64: Opcode = PPC::LDUX; break;
+ case MVT::i32: Opcode = isSExt ? PPC::LWAUX : PPC::LWZUX8; break;
+ case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break;
+ case MVT::i1:
+ case MVT::i8: Opcode = PPC::LBZUX8; break;
+ }
+ }
+ // Operand order for these opcodes: base, offset, chain.
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[] = { Base, Offset, Chain };
+ SDNode *MN = CurDAG->getMachineNode(
+ Opcode, dl, LD->getValueType(0),
+ PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops);
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
+ }
+ }
+
+ case ISD::AND: {
+ unsigned Imm, Imm2, SH, MB, ME;
+ uint64_t Imm64;
+
+ // If this is an and of a value rotated between 0 and 31 bits and then and'd
+ // with a mask, emit rlwinm
+ if (isInt32Immediate(N->getOperand(1), Imm) &&
+ isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
+ SDValue Val = N->getOperand(0).getOperand(0);
+ SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl),
+ getI32Imm(ME, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return;
+ }
+ // If this is just a masked value where the input is not handled above, and
+ // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
+ if (isInt32Immediate(N->getOperand(1), Imm) &&
+ isRunOfOnes(Imm, MB, ME) &&
+ N->getOperand(0).getOpcode() != ISD::ROTL) {
+ SDValue Val = N->getOperand(0);
+ SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl),
+ getI32Imm(ME, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return;
+ }
+ // If this is a 64-bit zero-extension mask, emit rldicl.
+ if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+ isMask_64(Imm64)) {
+ SDValue Val = N->getOperand(0);
+ MB = 64 - countTrailingOnes(Imm64);
+ SH = 0;
+ // Look through (any_extend (srl x, Imm)) when Imm <= MB:
+ if (Val.getOpcode() == ISD::ANY_EXTEND) {
+ auto Op0 = Val.getOperand(0);
+ if ( Op0.getOpcode() == ISD::SRL &&
+ isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) {
+ // x is inserted into an IMPLICIT_DEF of the extended type; the shift becomes SH.
+ auto ResultType = Val.getNode()->getValueType(0);
+ auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl,
+ ResultType);
+ SDValue IDVal (ImDef, 0);
+
+ Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl,
+ ResultType, IDVal, Op0.getOperand(0),
+ getI32Imm(1, dl)), 0);
+ SH = 64 - Imm;
+ }
+ }
+
+ // If the operand is a logical right shift, we can fold it into this
+ // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
+ // for n <= mb. The right shift is really a left rotate followed by a
+ // mask, and this mask is a more-restrictive sub-mask of the mask implied
+ // by the shift.
+ if (Val.getOpcode() == ISD::SRL &&
+ isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
+ assert(Imm < 64 && "Illegal shift amount");
+ Val = Val.getOperand(0);
+ SH = 64 - Imm;
+ }
+
+ SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
+ return;
+ }
+ // AND X, 0 -> 0, not "rlwinm 32".
+ if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
+ ReplaceUses(SDValue(N, 0), N->getOperand(1));
+ return;
+ }
+ // ISD::OR doesn't get all the bitfield insertion fun.
+ // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a
+ // bitfield insert.
+ if (isInt32Immediate(N->getOperand(1), Imm) &&
+ N->getOperand(0).getOpcode() == ISD::OR &&
+ isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
+ // The idea here is to check whether this is equivalent to:
+ // (c1 & m) | (x & ~m)
+ // where m is a run-of-ones mask. The logic here is that, for each bit in
+ // c1 and c2:
+ // - if both are 1, then the output will be 1.
+ // - if both are 0, then the output will be 0.
+ // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will
+ // come from x.
+ // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will
+ // be 0.
+ // If that last condition is never the case, then we can form m from the
+ // bits that are the same between c1 and c2.
+ unsigned MB, ME;
+ if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) {
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ N->getOperand(0).getOperand(1),
+ getI32Imm(0, dl), getI32Imm(MB, dl),
+ getI32Imm(ME, dl) };
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
+ return;
+ }
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+ case ISD::OR: {
+ if (N->getValueType(0) == MVT::i32)
+ if (tryBitfieldInsert(N))
+ return;
+ // OR of a FrameIndex with a signed 16-bit immediate:
+ short Imm;
+ if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
+ isIntS16Immediate(N->getOperand(1), Imm)) {
+ APInt LHSKnownZero, LHSKnownOne;
+ CurDAG->computeKnownBits(N->getOperand(0), LHSKnownZero, LHSKnownOne);
+
+ // If this is equivalent to an add, then we can fold it with the
+ // FrameIndex calculation.
+ if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) {
+ selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+ return;
+ }
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+ case ISD::ADD: {
+ short Imm;
+ if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
+ isIntS16Immediate(N->getOperand(1), Imm)) {
+ selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+ return;
+ }
+
+ break;
+ }
+ case ISD::SHL: {
+ unsigned Imm, SH, MB, ME;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+ isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ getI32Imm(SH, dl), getI32Imm(MB, dl),
+ getI32Imm(ME, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return;
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+ case ISD::SRL: {
+ unsigned Imm, SH, MB, ME;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+ isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ getI32Imm(SH, dl), getI32Imm(MB, dl),
+ getI32Imm(ME, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return;
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+ // FIXME: Remove this once the ANDI glue bug is fixed:
+ case PPCISD::ANDIo_1_EQ_BIT:
+ case PPCISD::ANDIo_1_GT_BIT: {
+ if (!ANDIGlueBug)
+ break;
+ // Workaround path: emit ANDIo/ANDIo8 with 1, then read CR0's EQ/GT bit via EXTRACT_SUBREG.
+ EVT InVT = N->getOperand(0).getValueType();
+ assert((InVT == MVT::i64 || InVT == MVT::i32) &&
+ "Invalid input type for ANDIo_1_EQ_BIT");
+
+ unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo;
+ SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue,
+ N->getOperand(0),
+ CurDAG->getTargetConstant(1, dl, InVT)),
+ 0);
+ SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
+ SDValue SRIdxVal =
+ CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ?
+ PPC::sub_eq : PPC::sub_gt, dl, MVT::i32);
+
+ CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1, CR0Reg,
+ SRIdxVal, SDValue(AndI.getNode(), 1) /* glue */);
+ return;
+ }
+ case ISD::SELECT_CC: {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+ EVT PtrVT =
+ CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
+ bool isPPC64 = (PtrVT == MVT::i64);
+
+ // If this is a select of i1 operands, we'll pattern match it.
+ if (PPCSubTarget->useCRBits() &&
+ N->getOperand(0).getValueType() == MVT::i1)
+ break;
+
+ // Handle the setcc cases here. select_cc lhs, 0, 1, 0, cc
+ if (!isPPC64)
+ if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+ if (N1C->isNullValue() && N3C->isNullValue() &&
+ N2C->getZExtValue() == 1ULL && CC == ISD::SETNE &&
+ // FIXME: Implement this optzn for PPC64.
+ N->getValueType(0) == MVT::i32) {
+ SDNode *Tmp =
+ CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+ N->getOperand(0), getI32Imm(~0U, dl));
+ CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(Tmp, 0),
+ N->getOperand(0), SDValue(Tmp, 1));
+ return;
+ }
+ // Otherwise emit the comparison itself (SelectCC) and select on its result.
+ SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+
+ if (N->getValueType(0) == MVT::i1) {
+ // An i1 select is: (c & t) | (!c & f).
+ bool Inv;
+ unsigned Idx = getCRIdxForSetCC(CC, Inv);
+
+ unsigned SRI;
+ switch (Idx) {
+ default: llvm_unreachable("Invalid CC index");
+ case 0: SRI = PPC::sub_lt; break;
+ case 1: SRI = PPC::sub_gt; break;
+ case 2: SRI = PPC::sub_eq; break;
+ case 3: SRI = PPC::sub_un; break;
+ }
+ // Extract the chosen CR bit and also compute its complement (CRNOR of the bit with itself).
+ SDValue CCBit = CurDAG->getTargetExtractSubreg(SRI, dl, MVT::i1, CCReg);
+
+ SDValue NotCCBit(CurDAG->getMachineNode(PPC::CRNOR, dl, MVT::i1,
+ CCBit, CCBit), 0);
+ SDValue C = Inv ? NotCCBit : CCBit,
+ NotC = Inv ? CCBit : NotCCBit;
+
+ SDValue CAndT(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
+ C, N->getOperand(2)), 0);
+ SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
+ NotC, N->getOperand(3)), 0);
+
+ CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF);
+ return;
+ }
+ // Non-i1 results use a SELECT_CC_* opcode picked by result type and subtarget features.
+ unsigned BROpc = getPredicateForSetCC(CC);
+
+ unsigned SelectCCOp;
+ if (N->getValueType(0) == MVT::i32)
+ SelectCCOp = PPC::SELECT_CC_I4;
+ else if (N->getValueType(0) == MVT::i64)
+ SelectCCOp = PPC::SELECT_CC_I8;
+ else if (N->getValueType(0) == MVT::f32)
+ if (PPCSubTarget->hasP8Vector())
+ SelectCCOp = PPC::SELECT_CC_VSSRC;
+ else
+ SelectCCOp = PPC::SELECT_CC_F4;
+ else if (N->getValueType(0) == MVT::f64)
+ if (PPCSubTarget->hasVSX())
+ SelectCCOp = PPC::SELECT_CC_VSFRC;
+ else
+ SelectCCOp = PPC::SELECT_CC_F8;
+ else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
+ SelectCCOp = PPC::SELECT_CC_QFRC;
+ else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
+ SelectCCOp = PPC::SELECT_CC_QSRC;
+ else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4i1)
+ SelectCCOp = PPC::SELECT_CC_QBRC;
+ else if (N->getValueType(0) == MVT::v2f64 ||
+ N->getValueType(0) == MVT::v2i64)
+ SelectCCOp = PPC::SELECT_CC_VSRC;
+ else
+ SelectCCOp = PPC::SELECT_CC_VRRC;
+
+ SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
+ getI32Imm(BROpc, dl) };
+ CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops);
+ return;
+ }
+ case ISD::VSELECT:
+ if (PPCSubTarget->hasVSX()) {
+ SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) };
+ CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
+ return;
+ }
+
+ break;
+ case ISD::VECTOR_SHUFFLE:
+ if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
+ N->getValueType(0) == MVT::v2i64)) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+
+ SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1),
+ Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1);
+ unsigned DM[2];
+ // DM[i] is 0 when mask element i is <= 0 or equal to 2, and 1 otherwise.
+ for (int i = 0; i < 2; ++i)
+ if (SVN->getMaskElt(i) <= 0 || SVN->getMaskElt(i) == 2)
+ DM[i] = 0;
+ else
+ DM[i] = 1;
+ // Both lanes taking element 0 of the same scalar_to_vector(load) can be selected as LXVDSX.
+ if (Op1 == Op2 && DM[0] == 0 && DM[1] == 0 &&
+ Op1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isa<LoadSDNode>(Op1.getOperand(0))) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0));
+ SDValue Base, Offset;
+
+ if (LD->isUnindexed() && LD->hasOneUse() && Op1.hasOneUse() &&
+ (LD->getMemoryVT() == MVT::f64 ||
+ LD->getMemoryVT() == MVT::i64) &&
+ SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) {
+ SDValue Chain = LD->getChain();
+ SDValue Ops[] = { Base, Offset, Chain };
+ CurDAG->SelectNodeTo(N, PPC::LXVDSX, N->getValueType(0), Ops);
+ return;
+ }
+ }
+
+ // For little endian, we must swap the input operands and adjust
+ // the mask elements (reverse and invert them).
+ if (PPCSubTarget->isLittleEndian()) {
+ std::swap(Op1, Op2);
+ unsigned tmp = DM[0];
+ DM[0] = 1 - DM[1];
+ DM[1] = 1 - tmp;
+ }
+
+ SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), dl,
+ MVT::i32);
+ SDValue Ops[] = { Op1, Op2, DMV };
+ CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops);
+ return;
+ }
+
+ break;
+ case PPCISD::BDNZ:
+ case PPCISD::BDZ: {
+ bool IsPPC64 = PPCSubTarget->isPPC64();
+ SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
+ CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ
+ ? (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
+ : (IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
+ MVT::Other, Ops);
+ return;
+ }
+ case PPCISD::COND_BRANCH: {
+ // Op #0 is the Chain.
+ // Op #1 is the PPC::PRED_* number.
+ // Op #2 is the CR#
+ // Op #3 is the Dest MBB
+ // Op #4 is the Flag.
+ // Prevent PPC::PRED_* from being selected into LI.
+ unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ if (EnableBranchHint)
+ PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3));
+
+ SDValue Pred = getI32Imm(PCC, dl);
+ SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
+ N->getOperand(0), N->getOperand(4) };
+ CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+ return;
+ }
+ case ISD::BR_CC: {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ unsigned PCC = getPredicateForSetCC(CC);
+ // i1 operands: compute the condition with a CR-bit logical op and branch with BC.
+ if (N->getOperand(2).getValueType() == MVT::i1) {
+ unsigned Opc;
+ bool Swap;
+ switch (PCC) {
+ default: llvm_unreachable("Unexpected Boolean-operand predicate");
+ case PPC::PRED_LT: Opc = PPC::CRANDC; Swap = true; break;
+ case PPC::PRED_LE: Opc = PPC::CRORC; Swap = true; break;
+ case PPC::PRED_EQ: Opc = PPC::CREQV; Swap = false; break;
+ case PPC::PRED_GE: Opc = PPC::CRORC; Swap = false; break;
+ case PPC::PRED_GT: Opc = PPC::CRANDC; Swap = false; break;
+ case PPC::PRED_NE: Opc = PPC::CRXOR; Swap = false; break;
+ }
+
+ SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1,
+ N->getOperand(Swap ? 3 : 2),
+ N->getOperand(Swap ? 2 : 3)), 0);
+ CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other, BitComp, N->getOperand(4),
+ N->getOperand(0));
+ return;
+ }
+
+ if (EnableBranchHint)
+ PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4));
+
+ SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
+ SDValue Ops[] = { getI32Imm(PCC, dl), CondCode,
+ N->getOperand(4), N->getOperand(0) };
+ CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+ return;
+ }
+ case ISD::BRIND: {
+ // FIXME: Should custom lower this. ('Reg' below actually holds an opcode.)
+ SDValue Chain = N->getOperand(0);
+ SDValue Target = N->getOperand(1);
+ unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
+ unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8;
+ Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target,
+ Chain), 0);
+ CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
+ return;
+ }
+ case PPCISD::TOC_ENTRY: {
+ assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) &&
+ "Only supported for 64-bit ABI and 32-bit SVR4");
+ if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) {
+ SDValue GA = N->getOperand(0);
+ SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
+ N->getOperand(1));
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
+ }
+
+ // For medium and large code model, we generate two instructions as
+ // described below. Otherwise we allow SelectCodeCommon to handle this,
+ // selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA.
+ CodeModel::Model CModel = TM.getCodeModel();
+ if (CModel != CodeModel::Medium && CModel != CodeModel::Large)
+ break;
+
+ // The first source operand is a TargetGlobalAddress or a TargetJumpTable.
+ // If it must be toc-referenced according to PPCSubTarget, we generate:
+ // LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
+ // Otherwise we generate:
+ // ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
+ SDValue GA = N->getOperand(0);
+ SDValue TOCbase = N->getOperand(1);
+ SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
+ TOCbase, GA);
+ // Jump tables, block addresses and the large code model always use LDtocL.
+ if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) ||
+ CModel == CodeModel::Large) {
+ SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+ SDValue(Tmp, 0));
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
+ }
+ // So do globals whose reference is classified with MO_NLP_FLAG.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
+ const GlobalValue *GV = G->getGlobal();
+ unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
+ if (GVFlags & PPCII::MO_NLP_FLAG) {
+ SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+ SDValue(Tmp, 0));
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
+ }
+ }
+ // Otherwise use ADDItocL on the ADDIStocHA result.
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
+ SDValue(Tmp, 0), GA));
+ return;
+ }
+ case PPCISD::PPC32_PICGOT: {
+ // Generate a PIC-safe GOT reference.
+ assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
+ "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
+ CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT,
+ PPCLowering->getPointerTy(CurDAG->getDataLayout()),
+ MVT::i32);
+ return;
+ }
+ case PPCISD::VADD_SPLAT: {
+ // This expands into one of three sequences, depending on whether
+ // the first operand is odd or even, positive or negative.
+ assert(isa<ConstantSDNode>(N->getOperand(0)) &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ "Invalid operand on VADD_SPLAT!");
+
+ int Elt = N->getConstantOperandVal(0);
+ int EltSize = N->getConstantOperandVal(1);
+ unsigned Opc1, Opc2, Opc3;
+ EVT VT;
+
+ if (EltSize == 1) {
+ Opc1 = PPC::VSPLTISB;
+ Opc2 = PPC::VADDUBM;
+ Opc3 = PPC::VSUBUBM;
+ VT = MVT::v16i8;
+ } else if (EltSize == 2) {
+ Opc1 = PPC::VSPLTISH;
+ Opc2 = PPC::VADDUHM;
+ Opc3 = PPC::VSUBUHM;
+ VT = MVT::v8i16;
+ } else {
+ assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!");
+ Opc1 = PPC::VSPLTISW;
+ Opc2 = PPC::VADDUWM;
+ Opc3 = PPC::VSUBUWM;
+ VT = MVT::v4i32;
+ }
+
+ if ((Elt & 1) == 0) {
+ // Elt is even, in the range [-32,-18] + [16,30].
+ //
+ // Convert: VADD_SPLAT elt, size
+ // Into: tmp = VSPLTIS[BHW] elt
+ // VADDU[BHW]M tmp, tmp
+ // Where: [BHW] = B for size = 1, H for size = 2, W for size = 4
+ SDValue EltVal = getI32Imm(Elt >> 1, dl);
+ SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+ SDValue TmpVal = SDValue(Tmp, 0);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal));
+ return;
+
+ } else if (Elt > 0) {
+ // Elt is odd and positive, in the range [17,31].
+ //
+ // Convert: VADD_SPLAT elt, size
+ // Into: tmp1 = VSPLTIS[BHW] elt-16
+ // tmp2 = VSPLTIS[BHW] -16
+ // VSUBU[BHW]M tmp1, tmp2
+ SDValue EltVal = getI32Imm(Elt - 16, dl);
+ SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+ EltVal = getI32Imm(-16, dl);
+ SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
+ SDValue(Tmp2, 0)));
+ return;
+
+ } else {
+ // Elt is odd and negative, in the range [-31,-17].
+ //
+ // Convert: VADD_SPLAT elt, size
+ // Into: tmp1 = VSPLTIS[BHW] elt+16
+ // tmp2 = VSPLTIS[BHW] -16
+ // VADDU[BHW]M tmp1, tmp2
+ SDValue EltVal = getI32Imm(Elt + 16, dl);
+ SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+ EltVal = getI32Imm(-16, dl);
+ SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
+ SDValue(Tmp2, 0)));
+ return;
+ }
+ }
+ }
+ // Not handled by hand above: defer to the autogenerated selector.
+ SelectCode(N);
+ }
+
+ // If the target supports the cmpb instruction, do the idiom recognition here.
+ // We don't do this as a DAG combine because we don't want to do it as nodes
+ // are being combined (because we might miss part of the eventual idiom). We
+ // don't want to do it during instruction selection because we want to reuse
+ // the logic for lowering the masking operations already part of the
+ // instruction selector.
+ //
+ // N must be an ISD::OR node whose operand tree consists only of further ORs
+ // and per-byte SELECT_CC nodes that all compare the same pair of values. On
+ // success the replacement value (a PPCISD::CMPB node, possibly combined with
+ // constant AND/XOR masks) is returned; otherwise a null SDValue is returned.
+ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
+   SDLoc dl(N);
+
+   assert(N->getOpcode() == ISD::OR &&
+          "Only OR nodes are supported for CMPB");
+
+   SDValue Res;
+   if (!PPCSubTarget->hasCMPB())
+     return Res;
+
+   if (N->getValueType(0) != MVT::i32 &&
+       N->getValueType(0) != MVT::i64)
+     return Res;
+
+   EVT VT = N->getValueType(0);
+
+   // The two values being compared, which byte positions have been matched so
+   // far, and the accumulated "bytes equal" (Mask) / "bytes differ" (Alt)
+   // result constants.
+   SDValue RHS, LHS;
+   bool BytesFound[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+   uint64_t Mask = 0, Alt = 0;
+
+   // Returns true if O is a SELECT_CC that tests a single byte of two values
+   // for equality. On success:
+   //   b        - index of the byte the select's constants are confined to,
+   //   Mask/Alt - OR-ed with the select's true/false constants,
+   //   LHS/RHS  - the two values whose byte is being compared.
+   // Note that Mask and Alt are updated as soon as the byte index is known,
+   // i.e. possibly before a later check rejects O.
+   auto IsByteSelectCC = [this](SDValue O, unsigned &b,
+                                uint64_t &Mask, uint64_t &Alt,
+                                SDValue &LHS, SDValue &RHS) {
+     if (O.getOpcode() != ISD::SELECT_CC)
+       return false;
+     ISD::CondCode CC = cast<CondCodeSDNode>(O.getOperand(4))->get();
+
+     if (!isa<ConstantSDNode>(O.getOperand(2)) ||
+         !isa<ConstantSDNode>(O.getOperand(3)))
+       return false;
+
+     // Find the byte that contains every set bit of both result constants;
+     // the true-value must be non-zero.
+     uint64_t PM = O.getConstantOperandVal(2);
+     uint64_t PAlt = O.getConstantOperandVal(3);
+     for (b = 0; b < 8; ++b) {
+       uint64_t ByteMask = UINT64_C(0xFF) << (8*b);
+       if (PM && (PM & ByteMask) == PM && (PAlt & ByteMask) == PAlt)
+         break;
+     }
+
+     if (b == 8)
+       return false;
+     Mask |= PM;
+     Alt |= PAlt;
+
+     // Forms in which the comparison's right-hand side is not the constant 0.
+     if (!isa<ConstantSDNode>(O.getOperand(1)) ||
+         O.getConstantOperandVal(1) != 0) {
+       SDValue Op0 = O.getOperand(0), Op1 = O.getOperand(1);
+       if (Op0.getOpcode() == ISD::TRUNCATE)
+         Op0 = Op0.getOperand(0);
+       if (Op1.getOpcode() == ISD::TRUNCATE)
+         Op1 = Op1.getOperand(0);
+
+       // (srl lhs, Bits-8) == (srl rhs, Bits-8): compares the top byte.
+       if (Op0.getOpcode() == ISD::SRL && Op1.getOpcode() == ISD::SRL &&
+           Op0.getOperand(1) == Op1.getOperand(1) && CC == ISD::SETEQ &&
+           isa<ConstantSDNode>(Op0.getOperand(1))) {
+
+         unsigned Bits = Op0.getValueSizeInBits();
+         if (b != Bits/8-1)
+           return false;
+         if (Op0.getConstantOperandVal(1) != Bits-8)
+           return false;
+
+         LHS = Op0.getOperand(0);
+         RHS = Op1.getOperand(0);
+         return true;
+       }
+
+       // When we have small integers (i16 to be specific), the form present
+       // post-legalization uses SETULT in the SELECT_CC for the
+       // higher-order byte, depending on the fact that the
+       // even-higher-order bytes are known to all be zero, for example:
+       //   select_cc (xor $lhs, $rhs), 256, 65280, 0, setult
+       // (so when the second byte is the same, because all higher-order
+       // bits from bytes 3 and 4 are known to be zero, the result of the
+       // xor can be at most 255)
+       if (Op0.getOpcode() == ISD::XOR && CC == ISD::SETULT &&
+           isa<ConstantSDNode>(O.getOperand(1))) {
+
+         uint64_t ULim = O.getConstantOperandVal(1);
+         if (ULim != (UINT64_C(1) << b*8))
+           return false;
+
+         // Now we need to make sure that the upper bytes are known to be
+         // zero.
+         unsigned Bits = Op0.getValueSizeInBits();
+         if (!CurDAG->MaskedValueIsZero(
+                 Op0, APInt::getHighBitsSet(Bits, Bits - (b + 1) * 8)))
+           return false;
+
+         LHS = Op0.getOperand(0);
+         RHS = Op0.getOperand(1);
+         return true;
+       }
+
+       return false;
+     }
+
+     // From here on the comparison is "<expr> == 0".
+     if (CC != ISD::SETEQ)
+       return false;
+
+     SDValue Op = O.getOperand(0);
+     if (Op.getOpcode() == ISD::AND) {
+       // (and (xor lhs, rhs), 0xFF << 8*b) == 0
+       if (!isa<ConstantSDNode>(Op.getOperand(1)))
+         return false;
+       if (Op.getConstantOperandVal(1) != (UINT64_C(0xFF) << (8*b)))
+         return false;
+
+       SDValue XOR = Op.getOperand(0);
+       if (XOR.getOpcode() == ISD::TRUNCATE)
+         XOR = XOR.getOperand(0);
+       if (XOR.getOpcode() != ISD::XOR)
+         return false;
+
+       LHS = XOR.getOperand(0);
+       RHS = XOR.getOperand(1);
+       return true;
+     } else if (Op.getOpcode() == ISD::SRL) {
+       // (srl (xor lhs, rhs), Bits-8) == 0: compares the top byte.
+       if (!isa<ConstantSDNode>(Op.getOperand(1)))
+         return false;
+       unsigned Bits = Op.getValueSizeInBits();
+       if (b != Bits/8-1)
+         return false;
+       if (Op.getConstantOperandVal(1) != Bits-8)
+         return false;
+
+       SDValue XOR = Op.getOperand(0);
+       if (XOR.getOpcode() == ISD::TRUNCATE)
+         XOR = XOR.getOperand(0);
+       if (XOR.getOpcode() != ISD::XOR)
+         return false;
+
+       LHS = XOR.getOperand(0);
+       RHS = XOR.getOperand(1);
+       return true;
+     }
+
+     return false;
+   };
+
+   // Walk the OR tree rooted at N. Every leaf must be a byte select comparing
+   // the same two values (in either order); anything else aborts the combine.
+   SmallVector<SDValue, 8> Queue(1, SDValue(N, 0));
+   while (!Queue.empty()) {
+     SDValue V = Queue.pop_back_val();
+
+     for (const SDValue &O : V.getNode()->ops()) {
+       unsigned b;
+       uint64_t M = 0, A = 0;
+       SDValue OLHS, ORHS;
+       if (O.getOpcode() == ISD::OR) {
+         Queue.push_back(O);
+       } else if (IsByteSelectCC(O, b, M, A, OLHS, ORHS)) {
+         if (!LHS) {
+           // The first leaf fixes the pair of values being compared.
+           LHS = OLHS;
+           RHS = ORHS;
+           BytesFound[b] = true;
+           Mask |= M;
+           Alt |= A;
+         } else if ((LHS == ORHS && RHS == OLHS) ||
+                    (RHS == ORHS && LHS == OLHS)) {
+           BytesFound[b] = true;
+           Mask |= M;
+           Alt |= A;
+         } else {
+           return Res;
+         }
+       } else {
+         return Res;
+       }
+     }
+   }
+
+   // Count the matched bytes and remember the highest one. The index tested
+   // must be the loop index i: testing BytesFound[LastB] instead made the
+   // result depend only on whether byte 0 had been matched.
+   unsigned LastB = 0, BCnt = 0;
+   for (unsigned i = 0; i < 8; ++i)
+     if (BytesFound[i]) {
+       ++BCnt;
+       LastB = i;
+     }
+
+   // Only form CMPB when at least two distinct bytes take part.
+   if (!LastB || BCnt < 2)
+     return Res;
+
+   // Because we'll be zero-extending the output anyway if we don't have a
+   // specific value for each input byte (via the Mask), we can 'anyext' the
+   // inputs.
+   if (LHS.getValueType() != VT) {
+     LHS = CurDAG->getAnyExtOrTrunc(LHS, dl, VT);
+     RHS = CurDAG->getAnyExtOrTrunc(RHS, dl, VT);
+   }
+
+   Res = CurDAG->getNode(PPCISD::CMPB, dl, VT, LHS, RHS);
+
+   bool NonTrivialMask = ((int64_t) Mask) != INT64_C(-1);
+   if (NonTrivialMask && !Alt) {
+     // Res = Mask & CMPB
+     Res = CurDAG->getNode(ISD::AND, dl, VT, Res,
+                           CurDAG->getConstant(Mask, dl, VT));
+   } else if (Alt) {
+     // Res = (CMPB & Mask) | (~CMPB & Alt)
+     // Which, as suggested here:
+     //   https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge
+     // can be written as:
+     // Res = Alt ^ ((Alt ^ Mask) & CMPB)
+     // useful because the (Alt ^ Mask) can be pre-computed.
+     Res = CurDAG->getNode(ISD::AND, dl, VT, Res,
+                           CurDAG->getConstant(Mask ^ Alt, dl, VT));
+     Res = CurDAG->getNode(ISD::XOR, dl, VT, Res,
+                           CurDAG->getConstant(Alt, dl, VT));
+   }
+
+   return Res;
+ }
+
+ // Fold single-use consumers of an i1 extension into the select that the
+ // extension turns into.
+ //
+ // When CR bit registers are enabled, an extension of an i1 value to an i32 or
+ // i64 value is lowered as a SELECT_I[48], which means materializing a 0, a 1
+ // (or -1 for a sign extension), or both. If the only user of the extension is
+ // a two-operand node that constant-folds for both possible inputs, and both
+ // folded constants can be materialized with a single instruction, the user
+ // is absorbed: Res is set to a select between the folded constants and N is
+ // advanced to the absorbed user. This repeats along the single-use chain.
+ //
+ // Res and N are only modified when at least one fold succeeds.
+ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
+   if (!PPCSubTarget->useCRBits())
+     return;
+
+   const unsigned ExtOpc = N->getOpcode();
+   const bool IsExtension = ExtOpc == ISD::ZERO_EXTEND ||
+                            ExtOpc == ISD::SIGN_EXTEND ||
+                            ExtOpc == ISD::ANY_EXTEND;
+   if (!IsExtension)
+     return;
+
+   // Only a single-use extension of an i1 value is of interest.
+   if (N->getOperand(0).getValueType() != MVT::i1)
+     return;
+   if (!N->hasOneUse())
+     return;
+
+   SDLoc dl(N);
+   EVT VT = N->getValueType(0);
+   SDValue Cond = N->getOperand(0);
+   // The values the extension yields for a true and for a false condition.
+   SDValue ConstTrue =
+       CurDAG->getConstant(ExtOpc == ISD::SIGN_EXTEND ? -1 : 1, dl, VT);
+   SDValue ConstFalse = CurDAG->getConstant(0, dl, VT);
+
+   do {
+     SDNode *User = *N->use_begin();
+     // Only two-operand users are candidates for folding.
+     if (User->getNumOperands() != 2)
+       break;
+
+     // Constant-fold User with each operand that refers to N replaced by Val.
+     // A null result is treated below as "could not fold".
+     auto FoldUserWith = [this, N, User, dl](SDValue Val) {
+       SDValue First = User->getOperand(0);
+       SDValue Second = User->getOperand(1);
+       if (First.getNode() == N)
+         First = Val;
+       if (Second.getNode() == N)
+         Second = Val;
+
+       return CurDAG->FoldConstantArithmetic(User->getOpcode(), dl,
+                                             User->getValueType(0),
+                                             First.getNode(), Second.getNode());
+     };
+
+     SDValue TrueRes = FoldUserWith(ConstTrue);
+     if (!TrueRes)
+       break;
+     SDValue FalseRes = FoldUserWith(ConstFalse);
+     if (!FalseRes)
+       break;
+
+     // To materialize each result with one instruction, it must be
+     // representable as a signed 16-bit integer.
+     uint64_t TrueImm = cast<ConstantSDNode>(TrueRes)->getZExtValue();
+     uint64_t FalseImm = cast<ConstantSDNode>(FalseRes)->getZExtValue();
+     if (!(isInt<16>(TrueImm) && isInt<16>(FalseImm)))
+       break;
+
+     // User becomes a select between the folded constants; loop again to see
+     // whether that select can in turn be folded with its own user.
+     Res = CurDAG->getSelect(dl, User->getValueType(0), Cond, TrueRes, FalseRes);
+     ConstTrue = TrueRes;
+     ConstFalse = FalseRes;
+     N = User;
+   } while (N->hasOneUse());
+ }
+
+ // Pre-selection rewrite pass: visit the DAG's node list from the root back
+ // towards the first node, replacing OR trees recognized by combineToCMPB and
+ // extension chains recognized by foldBoolExts. Dead nodes are removed at the
+ // end if anything was replaced.
+ void PPCDAGToDAGISel::PreprocessISelDAG() {
+   // Start one past the root; the pre-decrement in the loop visits the root
+   // first.
+   SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+   ++Position;
+
+   bool MadeChange = false;
+   while (Position != CurDAG->allnodes_begin()) {
+     SDNode *N = &*--Position;
+     // Nodes without users are skipped.
+     if (N->use_empty())
+       continue;
+
+     SDValue Res;
+     if (N->getOpcode() == ISD::OR)
+       Res = combineToCMPB(N);
+
+     // foldBoolExts may also advance N to the node that Res replaces.
+     if (!Res)
+       foldBoolExts(Res, N);
+
+     if (!Res)
+       continue;
+
+     DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld: ";
+           N->dump(CurDAG);
+           dbgs() << "\nNew: ";
+           Res.getNode()->dump(CurDAG);
+           dbgs() << "\n");
+
+     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+     MadeChange = true;
+   }
+
+   if (MadeChange)
+     CurDAG->RemoveDeadNodes();
+ }
+
+ /// PostprocessISelDAG - Run the late peephole optimizations over the DAG
+ /// representation. Nothing is done when the optimization level is None.
+ void PPCDAGToDAGISel::PostprocessISelDAG() {
+   // The peepholes only run when optimizing (they are skipped at -O0).
+   if (TM.getOptLevel() != CodeGenOpt::None) {
+     PeepholePPC64();
+     PeepholeCROps();
+     PeepholePPC64ZExt();
+   }
+ }
+
+ // Returns true when every user of N is a SELECT_I4/SELECT_I8 machine node
+ // whose operand 2 is an LI/LI8 of the constant zero. In that situation, if the
+ // condition computed by N can be negated, the caller can flip the selects'
+ // true and false operands so that the zero can be folded with the isel and no
+ // register holding zero has to be materialized.
+ //
+ // Always returns false when the subtarget has no isel. A node with no users
+ // trivially satisfies the check.
+ bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
+   // Without isel there is nothing to gain.
+   if (!PPCSubTarget->hasISEL())
+     return false;
+
+   for (SDNode *User : N->uses()) {
+     if (!User->isMachineOpcode())
+       return false;
+
+     const unsigned UserOpc = User->getMachineOpcode();
+     if (!(UserOpc == PPC::SELECT_I4 || UserOpc == PPC::SELECT_I8))
+       return false;
+
+     // Operand 2 of the select must be a load-immediate machine node...
+     SDNode *ZeroCandidate = User->getOperand(2).getNode();
+     if (!ZeroCandidate->isMachineOpcode())
+       return false;
+
+     const unsigned ImmOpc = ZeroCandidate->getMachineOpcode();
+     if (!(ImmOpc == PPC::LI || ImmOpc == PPC::LI8))
+       return false;
+
+     // ...whose immediate is the constant zero.
+     auto *Imm = dyn_cast<ConstantSDNode>(ZeroCandidate->getOperand(0));
+     if (!Imm || !Imm->isNullValue())
+       return false;
+   }
+
+   return true;
+ }
+
+ // Rebuild every user of N -- each of which must be a SELECT_I4 or SELECT_I8
+ // machine node -- with operands 1 and 2 exchanged, and redirect the uses of
+ // the old select to the new one.
+ void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
+   // Collect the users up front; the replacement loop below works from this
+   // snapshot rather than from N's live use list.
+   SmallVector<SDNode *, 4> Selects;
+   for (SDNode *User : N->uses()) {
+     assert((User->getMachineOpcode() == PPC::SELECT_I4 ||
+             User->getMachineOpcode() == PPC::SELECT_I8) &&
+            "Must have all select users");
+     Selects.push_back(User);
+   }
+
+   for (SDNode *OldSel : Selects) {
+     // Same opcode, type and condition; operands 1 and 2 trade places.
+     SDNode *NewSel =
+         CurDAG->getMachineNode(OldSel->getMachineOpcode(), SDLoc(OldSel),
+                                OldSel->getValueType(0), OldSel->getOperand(0),
+                                OldSel->getOperand(2),
+                                OldSel->getOperand(1));
+
+     DEBUG(dbgs() << "CR Peephole replacing:\nOld: ";
+           OldSel->dump(CurDAG);
+           dbgs() << "\nNew: ";
+           NewSel->dump(CurDAG);
+           dbgs() << "\n");
+
+     ReplaceUses(OldSel, NewSel);
+   }
+ }
+
+ // Peephole over the CR-bit logical machine nodes (CRAND, CRNAND, CROR, CRXOR,
+ // CRNOR, CREQV, CRANDC, CRORC) and over the SELECT_* / BC / BCn nodes that
+ // consume a CR bit. Inputs that are CRSET, CRUNSET, or a CRNOR of a value with
+ // itself (a negation) are folded into the consuming node using the boolean
+ // identity noted next to each case. The node list is rescanned, with dead
+ // nodes removed in between, until a full scan changes nothing.
+ void PPCDAGToDAGISel::PeepholeCROps() {
+ bool IsModified;
+ do {
+ IsModified = false;
+ for (SDNode &Node : CurDAG->allnodes()) {
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
+ if (!MachineNode || MachineNode->use_empty())
+ continue;
+ SDNode *ResNode = MachineNode;
+
+ // Note the naming: the Op1* flags describe getOperand(0) and the Op2*
+ // flags describe getOperand(1).
+ bool Op1Set = false, Op1Unset = false,
+ Op1Not = false,
+ Op2Set = false, Op2Unset = false,
+ Op2Not = false;
+
+ unsigned Opcode = MachineNode->getMachineOpcode();
+ // First switch: classify the inputs. The logical ops classify operand 1
+ // and then fall through so that operand 0 is classified by the code shared
+ // with the selects and branches.
+ switch (Opcode) {
+ default: break;
+ case PPC::CRAND:
+ case PPC::CRNAND:
+ case PPC::CROR:
+ case PPC::CRXOR:
+ case PPC::CRNOR:
+ case PPC::CREQV:
+ case PPC::CRANDC:
+ case PPC::CRORC: {
+ SDValue Op = MachineNode->getOperand(1);
+ if (Op.isMachineOpcode()) {
+ if (Op.getMachineOpcode() == PPC::CRSET)
+ Op2Set = true;
+ else if (Op.getMachineOpcode() == PPC::CRUNSET)
+ Op2Unset = true;
+ else if (Op.getMachineOpcode() == PPC::CRNOR &&
+ Op.getOperand(0) == Op.getOperand(1))
+ Op2Not = true;
+ }
+ LLVM_FALLTHROUGH;
+ }
+ case PPC::BC:
+ case PPC::BCn:
+ case PPC::SELECT_I4:
+ case PPC::SELECT_I8:
+ case PPC::SELECT_F4:
+ case PPC::SELECT_F8:
+ case PPC::SELECT_QFRC:
+ case PPC::SELECT_QSRC:
+ case PPC::SELECT_QBRC:
+ case PPC::SELECT_VRRC:
+ case PPC::SELECT_VSFRC:
+ case PPC::SELECT_VSSRC:
+ case PPC::SELECT_VSRC: {
+ SDValue Op = MachineNode->getOperand(0);
+ if (Op.isMachineOpcode()) {
+ if (Op.getMachineOpcode() == PPC::CRSET)
+ Op1Set = true;
+ else if (Op.getMachineOpcode() == PPC::CRUNSET)
+ Op1Unset = true;
+ else if (Op.getMachineOpcode() == PPC::CRNOR &&
+ Op.getOperand(0) == Op.getOperand(1))
+ Op1Not = true;
+ }
+ }
+ break;
+ }
+
+ // Second switch: pick the replacement. For each logical op the final
+ // AllUsersSelectZero branch emits the complementary operation and sets
+ // SelectSwap so that the selects using the result get their operands
+ // swapped to compensate.
+ bool SelectSwap = false;
+ switch (Opcode) {
+ default: break;
+ case PPC::CRAND:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // x & x = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Set)
+ // 1 & y = y
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op2Set)
+ // x & 1 = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Unset || Op2Unset)
+ // x & 0 = 0 & y = 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Not)
+ // ~x & y = andc(y, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(0).
+ getOperand(0));
+ else if (Op2Not)
+ // x & ~y = andc(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode)) {
+ ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1));
+ SelectSwap = true;
+ }
+ break;
+ case PPC::CRNAND:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // nand(x, x) -> nor(x, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Set)
+ // nand(1, y) -> nor(y, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op2Set)
+ // nand(x, 1) -> nor(x, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Unset || Op2Unset)
+ // nand(x, 0) = nand(0, y) = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Not)
+ // nand(~x, y) = ~(~x & y) = x | ~y = orc(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // nand(x, ~y) = ~x | y = orc(y, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1).
+ getOperand(0),
+ MachineNode->getOperand(0));
+ else if (AllUsersSelectZero(MachineNode)) {
+ ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1));
+ SelectSwap = true;
+ }
+ break;
+ case PPC::CROR:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // x | x = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Set || Op2Set)
+ // x | 1 = 1 | y = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Unset)
+ // 0 | y = y
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op2Unset)
+ // x | 0 = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Not)
+ // ~x | y = orc(y, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(0).
+ getOperand(0));
+ else if (Op2Not)
+ // x | ~y = orc(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode)) {
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1));
+ SelectSwap = true;
+ }
+ break;
+ case PPC::CRXOR:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // xor(x, x) = 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Set)
+ // xor(1, y) -> nor(y, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op2Set)
+ // xor(x, 1) -> nor(x, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Unset)
+ // xor(0, y) = y
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op2Unset)
+ // xor(x, 0) = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Not)
+ // xor(~x, y) = eqv(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // xor(x, ~y) = eqv(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode)) {
+ ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1));
+ SelectSwap = true;
+ }
+ break;
+ case PPC::CRNOR:
+ // Unlike the other cases there is no "both operands equal" check here:
+ // nor(x, x) is the negation form that the Op1Not/Op2Not flags recognize.
+ if (Op1Set || Op2Set)
+ // nor(1, y) -> 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Unset)
+ // nor(0, y) = ~y -> nor(y, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op2Unset)
+ // nor(x, 0) = ~x
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Not)
+ // nor(~x, y) = andc(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // nor(x, ~y) = andc(y, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1).
+ getOperand(0),
+ MachineNode->getOperand(0));
+ else if (AllUsersSelectZero(MachineNode)) {
+ ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1));
+ SelectSwap = true;
+ }
+ break;
+ case PPC::CREQV:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // eqv(x, x) = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Set)
+ // eqv(1, y) = y
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op2Set)
+ // eqv(x, 1) = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Unset)
+ // eqv(0, y) = ~y -> nor(y, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op2Unset)
+ // eqv(x, 0) = ~x
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Not)
+ // eqv(~x, y) = xor(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // eqv(x, ~y) = xor(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode)) {
+ ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1));
+ SelectSwap = true;
+ }
+ break;
+ case PPC::CRANDC:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // andc(x, x) = 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Set)
+ // andc(1, y) = ~y
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op1Unset || Op2Set)
+ // andc(0, y) = andc(x, 1) = 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op2Unset)
+ // andc(x, 0) = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Not)
+ // andc(~x, y) = ~(x | y) = nor(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // andc(x, ~y) = x & y
+ ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode)) {
+ // ~andc(x, y) = ~x | y = orc(y, x), hence the exchanged operands.
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(0));
+ SelectSwap = true;
+ }
+ break;
+ case PPC::CRORC:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // orc(x, x) = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Set || Op2Unset)
+ // orc(1, y) = orc(x, 0) = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op2Set)
+ // orc(x, 1) = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Unset)
+ // orc(0, y) = ~y
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op1Not)
+ // orc(~x, y) = ~(x & y) = nand(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // orc(x, ~y) = x | y
+ ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode)) {
+ // ~orc(x, y) = ~x & y = andc(y, x), hence the exchanged operands.
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(0));
+ SelectSwap = true;
+ }
+ break;
+ case PPC::SELECT_I4:
+ case PPC::SELECT_I8:
+ case PPC::SELECT_F4:
+ case PPC::SELECT_F8:
+ case PPC::SELECT_QFRC:
+ case PPC::SELECT_QSRC:
+ case PPC::SELECT_QBRC:
+ case PPC::SELECT_VRRC:
+ case PPC::SELECT_VSFRC:
+ case PPC::SELECT_VSSRC:
+ case PPC::SELECT_VSRC:
+ // A constant condition picks operand 1 (set) or operand 2 (unset); a
+ // negated condition is dropped by exchanging operands 1 and 2.
+ if (Op1Set)
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op1Unset)
+ ResNode = MachineNode->getOperand(2).getNode();
+ else if (Op1Not)
+ ResNode = CurDAG->getMachineNode(MachineNode->getMachineOpcode(),
+ SDLoc(MachineNode),
+ MachineNode->getValueType(0),
+ MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(2),
+ MachineNode->getOperand(1));
+ break;
+ case PPC::BC:
+ case PPC::BCn:
+ // A negated condition turns BC into BCn (and vice versa) on the
+ // un-negated value.
+ if (Op1Not)
+ ResNode = CurDAG->getMachineNode(Opcode == PPC::BC ? PPC::BCn :
+ PPC::BC,
+ SDLoc(MachineNode),
+ MVT::Other,
+ MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1),
+ MachineNode->getOperand(2));
+ // FIXME: Handle Op1Set, Op1Unset here too.
+ break;
+ }
+
+ // If we're inverting this node because it is used only by selects that
+ // we'd like to swap, then swap the selects before the node replacement.
+ if (SelectSwap)
+ SwapAllSelectUsers(MachineNode);
+
+ if (ResNode != MachineNode) {
+ DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
+ DEBUG(MachineNode->dump(CurDAG));
+ DEBUG(dbgs() << "\nNew: ");
+ DEBUG(ResNode->dump(CurDAG));
+ DEBUG(dbgs() << "\n");
+
+ ReplaceUses(MachineNode, ResNode);
+ IsModified = true;
+ }
+ }
+ if (IsModified)
+ CurDAG->RemoveDeadNodes();
+ } while (IsModified);
+ }
+
+ // Decide whether the 32-bit machine node Op32 is known to produce a value
+ // whose higher-order 32 bits are zero. On success the function returns true
+ // and adds to ToPromote Op32 itself together with every operand node that the
+ // answer relied on; on failure it returns false and leaves ToPromote
+ // untouched.
+ static bool PeepholePPC64ZExtGather(SDValue Op32,
+                                     SmallPtrSetImpl<SDNode *> &ToPromote) {
+   if (!Op32.isMachineOpcode())
+     return false;
+
+   switch (Op32.getMachineOpcode()) {
+   default:
+     return false;
+
+   // --- "Frontier" instructions: those that clear the higher-order 32 bits.
+
+   // RLWINM and RLWNM clear the higher-order bits as long as the mask does
+   // not wrap around.
+   case PPC::RLWINM:
+   case PPC::RLWNM:
+     if (Op32.getConstantOperandVal(2) > Op32.getConstantOperandVal(3))
+       return false;
+     ToPromote.insert(Op32.getNode());
+     return true;
+
+   // SLW and SRW always clear the higher-order bits, as do LHBRX and LWBRX.
+   // CNT[LT]ZW always produce a 64-bit value in [0,32], which is therefore
+   // zero extended.
+   case PPC::SLW:
+   case PPC::SRW:
+   case PPC::LHBRX:
+   case PPC::LWBRX:
+   case PPC::CNTLZW:
+   case PPC::CNTTZW:
+     ToPromote.insert(Op32.getNode());
+     return true;
+
+   // LI and LIS need a positive immediate (so that it is not sign extended).
+   case PPC::LI:
+   case PPC::LIS:
+     if (!isUInt<15>(Op32.getConstantOperandVal(0)))
+       return false;
+     ToPromote.insert(Op32.getNode());
+     return true;
+
+   // --- Instructions we can look through.
+
+   // RLWIMI: provided the mask does not wrap around, the higher-order bits
+   // are taken directly from the first operand.
+   case PPC::RLWIMI: {
+     if (Op32.getConstantOperandVal(3) > Op32.getConstantOperandVal(4))
+       return false;
+
+     SmallPtrSet<SDNode *, 16> FromOp0;
+     if (!PeepholePPC64ZExtGather(Op32.getOperand(0), FromOp0))
+       return false;
+
+     ToPromote.insert(Op32.getNode());
+     ToPromote.insert(FromOp0.begin(), FromOp0.end());
+     return true;
+   }
+
+   // OR: the higher-order bits are zero if that holds for both operands. The
+   // same goes for SELECT_I4, whose two value operands sit one position later.
+   case PPC::OR:
+   case PPC::SELECT_I4: {
+     const unsigned FirstVal =
+         Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0;
+
+     SmallPtrSet<SDNode *, 16> FromOperands;
+     if (!PeepholePPC64ZExtGather(Op32.getOperand(FirstVal + 0), FromOperands))
+       return false;
+     if (!PeepholePPC64ZExtGather(Op32.getOperand(FirstVal + 1), FromOperands))
+       return false;
+
+     ToPromote.insert(Op32.getNode());
+     ToPromote.insert(FromOperands.begin(), FromOperands.end());
+     return true;
+   }
+
+   // ORI and ORIS: the higher-order bits of the first operand must be zero,
+   // and the constant must be positive (so that it is not sign extended).
+   case PPC::ORI:
+   case PPC::ORIS: {
+     SmallPtrSet<SDNode *, 16> FromOp0;
+     if (!PeepholePPC64ZExtGather(Op32.getOperand(0), FromOp0))
+       return false;
+     if (!isUInt<15>(Op32.getConstantOperandVal(1)))
+       return false;
+
+     ToPromote.insert(Op32.getNode());
+     ToPromote.insert(FromOp0.begin(), FromOp0.end());
+     return true;
+   }
+
+   // AND: the higher-order bits are zero if that holds for at least one of
+   // the operands. Both operands are always examined.
+   case PPC::AND: {
+     SmallPtrSet<SDNode *, 16> FromOp0, FromOp1;
+     const bool Op0OK = PeepholePPC64ZExtGather(Op32.getOperand(0), FromOp0);
+     const bool Op1OK = PeepholePPC64ZExtGather(Op32.getOperand(1), FromOp1);
+     if (!(Op0OK || Op1OK))
+       return false;
+
+     ToPromote.insert(Op32.getNode());
+     if (Op0OK)
+       ToPromote.insert(FromOp0.begin(), FromOp0.end());
+     if (Op1OK)
+       ToPromote.insert(FromOp1.begin(), FromOp1.end());
+     return true;
+   }
+
+   // ANDI and ANDIS: the higher-order bits are zero if either that holds for
+   // the first operand, or the second operand is positive (so that it is not
+   // sign extended).
+   case PPC::ANDIo:
+   case PPC::ANDISo: {
+     SmallPtrSet<SDNode *, 16> FromOp0;
+     const bool Op0OK = PeepholePPC64ZExtGather(Op32.getOperand(0), FromOp0);
+     const bool ImmOK = isUInt<15>(Op32.getConstantOperandVal(1));
+     if (!(Op0OK || ImmOK))
+       return false;
+
+     ToPromote.insert(Op32.getNode());
+     if (Op0OK)
+       ToPromote.insert(FromOp0.begin(), FromOp0.end());
+     return true;
+   }
+   }
+ }
+
+ // Remove redundant i32 -> i64 zero extensions on PPC64. A zero extension is
+ // recognized as RLDICL(INSERT_SUBREG(IMPLICIT_DEF, x, sub_32), 0, 32); when
+ // PeepholePPC64ZExtGather shows that x already has zero higher-order bits and
+ // the contributing nodes have no other users, those nodes are morphed into
+ // their 64-bit variants and the RLDICL is replaced by the promoted value.
+ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
+ if (!PPCSubTarget->isPPC64())
+ return;
+
+ // When we zero-extend from i32 to i64, we use a pattern like this:
+ // def : Pat<(i64 (zext i32:$in)),
+ // (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
+ // 0, 32)>;
+ // There are several 32-bit shift/rotate instructions, however, that will
+ // clear the higher-order bits of their output, rendering the RLDICL
+ // unnecessary. When that happens, we remove it here, and redefine the
+ // relevant 32-bit operation to be a 64-bit operation.
+
+ // Iterate from the root back towards the first node of the DAG's node list.
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ if (N->getMachineOpcode() != PPC::RLDICL)
+ continue;
+
+ // Only the (…, 0, 32) form of RLDICL is the zero-extension pattern.
+ if (N->getConstantOperandVal(1) != 0 ||
+ N->getConstantOperandVal(2) != 32)
+ continue;
+
+ SDValue ISR = N->getOperand(0);
+ if (!ISR.isMachineOpcode() ||
+ ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG)
+ continue;
+
+ // The INSERT_SUBREG is bypassed below, so it must not feed anything else.
+ if (!ISR.hasOneUse())
+ continue;
+
+ if (ISR.getConstantOperandVal(2) != PPC::sub_32)
+ continue;
+
+ SDValue IDef = ISR.getOperand(0);
+ if (!IDef.isMachineOpcode() ||
+ IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF)
+ continue;
+
+ // We now know that we're looking at a canonical i32 -> i64 zext. See if we
+ // can get rid of it.
+
+ SDValue Op32 = ISR->getOperand(1);
+ if (!Op32.isMachineOpcode())
+ continue;
+
+ // There are some 32-bit instructions that always clear the high-order 32
+ // bits, there are also some instructions (like AND) that we can look
+ // through.
+ SmallPtrSet<SDNode *, 16> ToPromote;
+ if (!PeepholePPC64ZExtGather(Op32, ToPromote))
+ continue;
+
+ // If the ToPromote set contains nodes that have uses outside of the set
+ // (except for the original INSERT_SUBREG), then abort the transformation.
+ bool OutsideUse = false;
+ for (SDNode *PN : ToPromote) {
+ for (SDNode *UN : PN->uses()) {
+ if (!ToPromote.count(UN) && UN != ISR.getNode()) {
+ OutsideUse = true;
+ break;
+ }
+ }
+
+ if (OutsideUse)
+ break;
+ }
+ if (OutsideUse)
+ continue;
+
+ MadeChange = true;
+
+ // We now know that this zero extension can be removed by promoting to
+ // nodes in ToPromote to 64-bit operations, where for operations in the
+ // frontier of the set, we need to insert INSERT_SUBREGs for their
+ // operands.
+ for (SDNode *PN : ToPromote) {
+ // Every opcode PeepholePPC64ZExtGather can add to the set needs a 64-bit
+ // counterpart listed here.
+ unsigned NewOpcode;
+ switch (PN->getMachineOpcode()) {
+ default:
+ llvm_unreachable("Don't know the 64-bit variant of this instruction");
+ case PPC::RLWINM: NewOpcode = PPC::RLWINM8; break;
+ case PPC::RLWNM: NewOpcode = PPC::RLWNM8; break;
+ case PPC::SLW: NewOpcode = PPC::SLW8; break;
+ case PPC::SRW: NewOpcode = PPC::SRW8; break;
+ case PPC::LI: NewOpcode = PPC::LI8; break;
+ case PPC::LIS: NewOpcode = PPC::LIS8; break;
+ case PPC::LHBRX: NewOpcode = PPC::LHBRX8; break;
+ case PPC::LWBRX: NewOpcode = PPC::LWBRX8; break;
+ case PPC::CNTLZW: NewOpcode = PPC::CNTLZW8; break;
+ case PPC::CNTTZW: NewOpcode = PPC::CNTTZW8; break;
+ case PPC::RLWIMI: NewOpcode = PPC::RLWIMI8; break;
+ case PPC::OR: NewOpcode = PPC::OR8; break;
+ case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break;
+ case PPC::ORI: NewOpcode = PPC::ORI8; break;
+ case PPC::ORIS: NewOpcode = PPC::ORIS8; break;
+ case PPC::AND: NewOpcode = PPC::AND8; break;
+ case PPC::ANDIo: NewOpcode = PPC::ANDIo8; break;
+ case PPC::ANDISo: NewOpcode = PPC::ANDISo8; break;
+ }
+
+ // Note: During the replacement process, the nodes will be in an
+ // inconsistent state (some instructions will have operands with values
+ // of the wrong type). Once done, however, everything should be right
+ // again.
+
+ // Build the new operand list: a non-constant i32 operand that is not
+ // itself being promoted is wrapped in an INSERT_SUBREG built from the
+ // original one's IMPLICIT_DEF and subregister index; everything else is
+ // passed through unchanged.
+ SmallVector<SDValue, 4> Ops;
+ for (const SDValue &V : PN->ops()) {
+ if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 &&
+ !isa<ConstantSDNode>(V)) {
+ SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) };
+ SDNode *ReplOp =
+ CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V),
+ ISR.getNode()->getVTList(), ReplOpOps);
+ Ops.push_back(SDValue(ReplOp, 0));
+ } else {
+ Ops.push_back(V);
+ }
+ }
+
+ // Because all to-be-promoted nodes only have users that are other
+ // promoted nodes (or the original INSERT_SUBREG), we can safely replace
+ // the i32 result value type with i64.
+
+ SmallVector<EVT, 2> NewVTs;
+ SDVTList VTs = PN->getVTList();
+ for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i)
+ if (VTs.VTs[i] == MVT::i32)
+ NewVTs.push_back(MVT::i64);
+ else
+ NewVTs.push_back(VTs.VTs[i]);
+
+ DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
+ DEBUG(PN->dump(CurDAG));
+
+ // Morph PN in place to the 64-bit opcode with the new types and operands.
+ CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);
+
+ DEBUG(dbgs() << "\nNew: ");
+ DEBUG(PN->dump(CurDAG));
+ DEBUG(dbgs() << "\n");
+ }
+
+ // Now we replace the original zero extend and its associated INSERT_SUBREG
+ // with the value feeding the INSERT_SUBREG (which has now been promoted to
+ // return an i64).
+
+ DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
+ DEBUG(N->dump(CurDAG));
+ DEBUG(dbgs() << "\nNew: ");
+ DEBUG(Op32.getNode()->dump(CurDAG));
+ DEBUG(dbgs() << "\n");
+
+ ReplaceUses(N, Op32.getNode());
+ }
+
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
+ }
+
+ // Fold an add-immediate (ADDI/ADDI8, ADDIdtprelL, ADDItlsldL or ADDItocL) that
+ // produces the base address of a load or store into the memory operation
+ // itself: the add's immediate becomes the displacement and the add's own base
+ // becomes the memory operation's base. Relocation target flags are attached
+ // to the immediate where the add's opcode implied them. Skipped on Darwin and
+ // on non-PPC64 subtargets.
+ void PPCDAGToDAGISel::PeepholePPC64() {
+ // These optimizations are currently supported only for 64-bit SVR4.
+ if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
+ return;
+
+ // Iterate from the root back towards the first node of the DAG's node list.
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ // FirstOp is the index of the displacement operand; the base address
+ // operand follows it. It is 0 for loads and 1 for stores.
+ unsigned FirstOp;
+ unsigned StorageOpcode = N->getMachineOpcode();
+
+ switch (StorageOpcode) {
+ default: continue;
+
+ case PPC::LBZ:
+ case PPC::LBZ8:
+ case PPC::LD:
+ case PPC::LFD:
+ case PPC::LFS:
+ case PPC::LHA:
+ case PPC::LHA8:
+ case PPC::LHZ:
+ case PPC::LHZ8:
+ case PPC::LWA:
+ case PPC::LWZ:
+ case PPC::LWZ8:
+ FirstOp = 0;
+ break;
+
+ case PPC::STB:
+ case PPC::STB8:
+ case PPC::STD:
+ case PPC::STFD:
+ case PPC::STFS:
+ case PPC::STH:
+ case PPC::STH8:
+ case PPC::STW:
+ case PPC::STW8:
+ FirstOp = 1;
+ break;
+ }
+
+ // If this is a load or store with a zero offset, or within the alignment,
+ // we may be able to fold an add-immediate into the memory operation.
+ // The check against alignment is below, as it can't occur until we check
+ // the arguments to N
+ if (!isa<ConstantSDNode>(N->getOperand(FirstOp)))
+ continue;
+
+ SDValue Base = N->getOperand(FirstOp + 1);
+ if (!Base.isMachineOpcode())
+ continue;
+
+ // Flags holds the relocation target flags to attach to the immediate;
+ // ReplaceFlags is cleared when the immediate is to be copied as it is.
+ unsigned Flags = 0;
+ bool ReplaceFlags = true;
+
+ // When the feeding operation is an add-immediate of some sort,
+ // determine whether we need to add relocation information to the
+ // target flags on the immediate operand when we fold it into the
+ // load instruction.
+ //
+ // For something like ADDItocL, the relocation information is
+ // inferred from the opcode; when we process it in the AsmPrinter,
+ // we add the necessary relocation there. A load, though, can receive
+ // relocation from various flavors of ADDIxxx, so we need to carry
+ // the relocation information in the target flags.
+ switch (Base.getMachineOpcode()) {
+ default: continue;
+
+ case PPC::ADDI8:
+ case PPC::ADDI:
+ // In some cases (such as TLS) the relocation information
+ // is already in place on the operand, so copying the operand
+ // is sufficient.
+ ReplaceFlags = false;
+ // For these cases, the immediate may not be divisible by 4, in
+ // which case the fold is illegal for DS-form instructions. (The
+ // other cases provide aligned addresses and are always safe.)
+ if ((StorageOpcode == PPC::LWA ||
+ StorageOpcode == PPC::LD ||
+ StorageOpcode == PPC::STD) &&
+ (!isa<ConstantSDNode>(Base.getOperand(1)) ||
+ Base.getConstantOperandVal(1) % 4 != 0))
+ continue;
+ break;
+ case PPC::ADDIdtprelL:
+ Flags = PPCII::MO_DTPREL_LO;
+ break;
+ case PPC::ADDItlsldL:
+ Flags = PPCII::MO_TLSLD_LO;
+ break;
+ case PPC::ADDItocL:
+ Flags = PPCII::MO_TOC_LO;
+ break;
+ }
+
+ SDValue ImmOpnd = Base.getOperand(1);
+
+ // On PPC64, the TOC base pointer is guaranteed by the ABI only to have
+ // 8-byte alignment, and so we can only use offsets less than 8 (otherwise,
+ // we might have needed different @ha relocation values for the offset
+ // pointers).
+ // For a global, the limit is further reduced to its alignment minus one
+ // (which is negative, rejecting every offset, if getAlignment() is 0).
+ int MaxDisplacement = 7;
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+ const GlobalValue *GV = GA->getGlobal();
+ MaxDisplacement = std::min((int) GV->getAlignment() - 1, MaxDisplacement);
+ }
+
+ // HBase is the add-immediate's own base operand; UpdateHBase records that
+ // its immediate must be rewritten together with the memory operation's.
+ bool UpdateHBase = false;
+ SDValue HBase = Base.getOperand(0);
+
+ int Offset = N->getConstantOperandVal(FirstOp);
+ if (ReplaceFlags) {
+ if (Offset < 0 || Offset > MaxDisplacement) {
+ // If we have a addi(toc@l)/addis(toc@ha) pair, and the addis has only
+ // one use, then we can do this for any offset, we just need to also
+ // update the offset (i.e. the symbol addend) on the addis also.
+ if (Base.getMachineOpcode() != PPC::ADDItocL)
+ continue;
+
+ if (!HBase.isMachineOpcode() ||
+ HBase.getMachineOpcode() != PPC::ADDIStocHA)
+ continue;
+
+ if (!Base.hasOneUse() || !HBase.hasOneUse())
+ continue;
+
+ // Both halves of the pair must refer to the same immediate operand.
+ SDValue HImmOpnd = HBase.getOperand(1);
+ if (HImmOpnd != ImmOpnd)
+ continue;
+
+ UpdateHBase = true;
+ }
+ } else {
+ // If we're directly folding the addend from an addi instruction, then:
+ // 1. In general, the offset on the memory access must be zero.
+ // 2. If the addend is a constant, then it can be combined with a
+ // non-zero offset, but only if the result meets the encoding
+ // requirements.
+ if (auto *C = dyn_cast<ConstantSDNode>(ImmOpnd)) {
+ Offset += C->getSExtValue();
+
+ if ((StorageOpcode == PPC::LWA || StorageOpcode == PPC::LD ||
+ StorageOpcode == PPC::STD) && (Offset % 4) != 0)
+ continue;
+
+ if (!isInt<16>(Offset))
+ continue;
+
+ ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd),
+ ImmOpnd.getValueType());
+ } else if (Offset != 0) {
+ continue;
+ }
+ }
+
+ // We found an opportunity. Reverse the operands from the add
+ // immediate and substitute them into the load or store. If
+ // needed, update the target flags for the immediate operand to
+ // reflect the necessary relocation information.
+ DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
+ DEBUG(Base->dump(CurDAG));
+ DEBUG(dbgs() << "\nN: ");
+ DEBUG(N->dump(CurDAG));
+ DEBUG(dbgs() << "\n");
+
+ // If the relocation information isn't already present on the
+ // immediate operand, add it now.
+ if (ReplaceFlags) {
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+ SDLoc dl(GA);
+ const GlobalValue *GV = GA->getGlobal();
+ // We can't perform this optimization for data whose alignment
+ // is insufficient for the instruction encoding.
+ if (GV->getAlignment() < 4 &&
+ (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+ StorageOpcode == PPC::LWA || (Offset % 4) != 0)) {
+ DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ continue;
+ }
+ ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags);
+ } else if (ConstantPoolSDNode *CP =
+ dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
+ const Constant *C = CP->getConstVal();
+ ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
+ CP->getAlignment(),
+ Offset, Flags);
+ }
+ }
+
+ // Rewrite the memory operation: the displacement becomes ImmOpnd and the
+ // base becomes the add-immediate's base; the other operands are kept.
+ if (FirstOp == 1) // Store
+ (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd,
+ Base.getOperand(0), N->getOperand(3));
+ else // Load
+ (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0),
+ N->getOperand(2));
+
+ // Keep the addis of an addi/addis pair in sync with the new immediate.
+ if (UpdateHBase)
+ (void)CurDAG->UpdateNodeOperands(HBase.getNode(), HBase.getOperand(0),
+ ImmOpnd);
+
+ // The add-immediate may now be dead, in which case remove it.
+ if (Base.getNode()->use_empty())
+ CurDAG->RemoveDeadNode(Base.getNode());
+ }
+ }
+
+
+ /// createPPCISelDag - Factory for the PowerPC DAG-to-DAG instruction selector,
+ /// the pass that converts a legalized DAG into a PowerPC-specific DAG, ready
+ /// for instruction scheduling.
+ ///
+ FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
+   FunctionPass *ISel = new PPCDAGToDAGISel(TM);
+   return ISel;
+ }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
new file mode 100644
index 000000000000..aa3ffde24b99
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -0,0 +1,12771 @@
+//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCISelLowering.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCCallingConv.h"
+#include "PPCCCState.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPerfectShuffle.h"
+#include "PPCTargetMachine.h"
+#include "PPCTargetObjectFile.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <list>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-lowering"
+
+// Command-line switches local to this file. Each one is a hidden boolean
+// option; the description string states what it turns off.
+
+// Turns off pre-increment load/store generation.
+static cl::opt<bool> DisablePPCPreinc(
+    "disable-ppc-preinc", cl::Hidden,
+    cl::desc("disable preincrement load/store generation on PPC"));
+
+// Turns off selecting ILP as the node scheduling preference.
+static cl::opt<bool> DisableILPPref(
+    "disable-ppc-ilp-pref", cl::Hidden,
+    cl::desc("disable setting the node scheduling preference to ILP on PPC"));
+
+// Turns off unaligned load/store generation.
+static cl::opt<bool> DisablePPCUnaligned(
+    "disable-ppc-unaligned", cl::Hidden,
+    cl::desc("disable unaligned load/store generation on PPC"));
+
+// Turns off sibling call optimization.
+static cl::opt<bool> DisableSCO(
+    "disable-ppc-sco", cl::Hidden,
+    cl::desc("disable sibling call optimization on ppc"));
+
+// Counters reported under this file's DEBUG_TYPE.
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumSiblingCalls, "Number of sibling calls");
+
+// Option defined in another translation unit. The PPCTargetLowering
+// constructor checks it to decide whether TRUNCATE to i1 is custom lowered.
+// FIXME: Remove this once the bug has been fixed!
+extern cl::opt<bool> ANDIGlueBug;
+
+PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
+ const PPCSubtarget &STI)
+ : TargetLowering(TM), Subtarget(STI) {
+ // Use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+
+ // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
+ // arguments are at least 4/8 bytes aligned.
+ bool isPPC64 = Subtarget.isPPC64();
+ setMinStackArgumentAlignment(isPPC64 ? 8:4);
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
+ if (!useSoftFloat()) {
+ addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+ addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+ }
+
+ // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+ }
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // PowerPC has pre-inc loads and stores.
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
+
+ if (Subtarget.useCRBits()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ if (isPPC64 || Subtarget.hasFPCVT()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+ AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+ AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
+ } else {
+ setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
+ }
+
+ // PowerPC does not support direct load / store of condition registers
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+ // FIXME: Remove this once the ANDI glue bug is fixed:
+ if (ANDIGlueBug)
+ setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
+
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setTruncStoreAction(VT, MVT::i1, Expand);
+ }
+
+ addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
+ }
+
+ // This is used in the ppcf128->int sequence. Note it has different semantics
+ // from FP_ROUND: that rounds to nearest, this rounds to zero.
+ setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
+
+ // We do not currently implement these libm ops for PowerPC.
+ setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
+ setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
+ setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
+ setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
+
+ // PowerPC has no SREM/UREM instructions
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+ // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+
+ // We don't support sin/cos/sqrt/fmod/pow
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FREM , MVT::f64, Expand);
+ setOperationAction(ISD::FPOW , MVT::f64, Expand);
+ setOperationAction(ISD::FMA , MVT::f64, Legal);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FREM , MVT::f32, Expand);
+ setOperationAction(ISD::FPOW , MVT::f32, Expand);
+ setOperationAction(ISD::FMA , MVT::f32, Legal);
+
+ setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+
+ // If we're enabling GP optimizations, use hardware square root
+ if (!Subtarget.hasFSQRT() &&
+ !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
+ Subtarget.hasFRE()))
+ setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+
+ if (!Subtarget.hasFSQRT() &&
+ !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
+ Subtarget.hasFRES()))
+ setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+ if (Subtarget.hasFCPSGN()) {
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
+ } else {
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ }
+
+ if (Subtarget.hasFPRND()) {
+ setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::FROUND, MVT::f64, Legal);
+
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::f32, Legal);
+ }
+
+ // PowerPC does not have BSWAP
+ // CTPOP or CTTZ were introduced in P8/P9 respectively
+ setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
+ if (Subtarget.isISA3_0()) {
+ setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
+ setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
+ } else {
+ setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
+ }
+
+ if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
+ setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
+ setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
+ } else {
+ setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
+ setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
+ }
+
+ // PowerPC does not have ROTR
+ setOperationAction(ISD::ROTR, MVT::i32 , Expand);
+ setOperationAction(ISD::ROTR, MVT::i64 , Expand);
+
+ if (!Subtarget.useCRBits()) {
+ // PowerPC does not have Select
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ }
+
+ // PowerPC wants to turn select_cc of FP into fsel when possible.
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+ // PowerPC wants to optimize integer setcc a bit
+ if (!Subtarget.useCRBits())
+ setOperationAction(ISD::SETCC, MVT::i32, Custom);
+
+ // PowerPC does not have BRCOND which requires SetCC
+ if (!Subtarget.useCRBits())
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+ // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+ // PowerPC does not have [U|S]INT_TO_FP
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+ if (Subtarget.hasDirectMove() && isPPC64) {
+ setOperationAction(ISD::BITCAST, MVT::f32, Legal);
+ setOperationAction(ISD::BITCAST, MVT::i32, Legal);
+ setOperationAction(ISD::BITCAST, MVT::i64, Legal);
+ setOperationAction(ISD::BITCAST, MVT::f64, Legal);
+ } else {
+ setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+ setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+ }
+
+ // We cannot sextinreg(i1). Expand to shifts.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+ // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+ // support continuation, user-level threading, etc. As a result, no
+ // other SjLj exception interfaces are implemented and please don't build
+ // your own exception handling based on them.
+ // LLVM/Clang supports zero-cost DWARF exception handling.
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+ // We want to legalize GlobalAddress and ConstantPool nodes into the
+ // appropriate instructions to materialize the address.
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+
+ // TRAP is legal.
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // TRAMPOLINE is custom lowered.
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+
+ if (Subtarget.isSVR4ABI()) {
+ if (isPPC64) {
+ // VAARG always uses double-word chunks, so promote anything smaller.
+ setOperationAction(ISD::VAARG, MVT::i1, Promote);
+ AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
+ setOperationAction(ISD::VAARG, MVT::i8, Promote);
+ AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64);
+ setOperationAction(ISD::VAARG, MVT::i16, Promote);
+ AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64);
+ setOperationAction(ISD::VAARG, MVT::i32, Promote);
+ AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ } else {
+ // VAARG is custom lowered with the 32-bit SVR4 ABI.
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::i64, Custom);
+ }
+ } else
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+
+ if (Subtarget.isSVR4ABI() && !isPPC64)
+ // VACOPY is custom lowered with the 32-bit SVR4 ABI.
+ setOperationAction(ISD::VACOPY , MVT::Other, Custom);
+ else
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+
+ // Use the default implementation.
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
+ setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
+ setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
+ setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
+ setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ // To handle counter-based loop conditions.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
+
+ // Comparisons that require checking two conditions.
+ setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+
+ if (Subtarget.has64BitSupport()) {
+ // They also have instructions for converting between i64 and fp.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+ // This is just the low 32 bits of a (signed) fp->i64 conversion.
+ // We cannot do this with Promote because i64 is not a legal type.
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+
+ if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ } else {
+ // PowerPC does not have FP_TO_UINT on 32-bit implementations.
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ }
+
+ // With the instructions enabled under FPCVT, we can do everything.
+ if (Subtarget.hasFPCVT()) {
+ if (Subtarget.has64BitSupport()) {
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ }
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ }
+
+ if (Subtarget.use64BitRegs()) {
+ // 64-bit PowerPC implementations can support i64 types directly
+ addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
+ // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
+ setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+ // 64-bit PowerPC wants to expand i128 shifts itself.
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+ } else {
+ // 32-bit PowerPC wants to expand i64 shifts itself.
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ }
+
+ if (Subtarget.hasAltivec()) {
+ // First set operation action for all vector types to expand. Then we
+ // will selectively turn on ones that can be effectively codegen'd.
+ for (MVT VT : MVT::vector_valuetypes()) {
+ // add/sub are legal for all supported vector VT's.
+ setOperationAction(ISD::ADD, VT, Legal);
+ setOperationAction(ISD::SUB, VT, Legal);
+
+ // Vector instructions introduced in P8
+ if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ setOperationAction(ISD::CTLZ, VT, Legal);
+ }
+ else {
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ }
+
+ // Vector instructions introduced in P9
+ if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
+ setOperationAction(ISD::CTTZ, VT, Legal);
+ else
+ setOperationAction(ISD::CTTZ, VT, Expand);
+
+ // We promote all shuffles to v16i8.
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
+ AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
+
+ // We promote all non-typed operations to v4i32.
+ setOperationAction(ISD::AND , VT, Promote);
+ AddPromotedToType (ISD::AND , VT, MVT::v4i32);
+ setOperationAction(ISD::OR , VT, Promote);
+ AddPromotedToType (ISD::OR , VT, MVT::v4i32);
+ setOperationAction(ISD::XOR , VT, Promote);
+ AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
+ setOperationAction(ISD::LOAD , VT, Promote);
+ AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
+ setOperationAction(ISD::SELECT_CC, VT, Promote);
+ AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
+
+ // No other operations are legal.
+ setOperationAction(ISD::MUL , VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FNEG, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FABS, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(VT, InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ }
+ }
+
+ // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
+ // with merges, splats, etc.
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::AND , MVT::v4i32, Legal);
+ setOperationAction(ISD::OR , MVT::v4i32, Legal);
+ setOperationAction(ISD::XOR , MVT::v4i32, Legal);
+ setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
+ setOperationAction(ISD::SELECT, MVT::v4i32,
+ Subtarget.useCRBits() ? Legal : Expand);
+ setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+
+ addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
+ addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
+ addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
+ addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
+
+ setOperationAction(ISD::MUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+
+ if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ }
+
+ if (Subtarget.hasP8Altivec())
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+ else
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+ // Altivec does not contain unordered floating-point compare instructions
+ setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
+ setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
+
+ if (Subtarget.hasVSX()) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
+ if (Subtarget.hasP8Vector()) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
+ }
+ if (Subtarget.hasDirectMove() && isPPC64) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+ }
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8i16, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
+
+ // Share the Altivec comparison restrictions.
+ setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
+ setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
+
+ setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STORE, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
+
+ if (Subtarget.hasP8Vector())
+ addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);
+
+ addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
+
+ addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
+ addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
+ addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
+
+ if (Subtarget.hasP8Altivec()) {
+ setOperationAction(ISD::SHL, MVT::v2i64, Legal);
+ setOperationAction(ISD::SRA, MVT::v2i64, Legal);
+ setOperationAction(ISD::SRL, MVT::v2i64, Legal);
+
+ setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
+ }
+ else {
+ setOperationAction(ISD::SHL, MVT::v2i64, Expand);
+ setOperationAction(ISD::SRA, MVT::v2i64, Expand);
+ setOperationAction(ISD::SRL, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+
+ // VSX v2i64 only supports non-arithmetic operations.
+ setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+ setOperationAction(ISD::SUB, MVT::v2i64, Expand);
+ }
+
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+
+ // Vector operation legalization checks the result type of
+ // SIGN_EXTEND_INREG, overall legalization checks the inner type.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+
+ setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
+ setOperationAction(ISD::FABS, MVT::v4f32, Legal);
+ setOperationAction(ISD::FABS, MVT::v2f64, Legal);
+
+ if (Subtarget.hasDirectMove())
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+
+ addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
+ }
+
+ if (Subtarget.hasP8Altivec()) {
+ addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
+ addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
+ }
+
+ if (Subtarget.hasP9Vector()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+ }
+ }
+
+ if (Subtarget.hasQPX()) {
+ setOperationAction(ISD::FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::FREM, MVT::v4f64, Expand);
+
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
+ setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);
+
+ setOperationAction(ISD::LOAD , MVT::v4f64, Custom);
+ setOperationAction(ISD::STORE , MVT::v4f64, Custom);
+
+ setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);
+
+ if (!Subtarget.useCRBits())
+ setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
+ setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
+ setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);
+
+ setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
+ setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
+
+ setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
+ setOperationAction(ISD::FABS , MVT::v4f64, Legal);
+ setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
+ setOperationAction(ISD::FPOWI , MVT::v4f64, Expand);
+ setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
+ setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
+ setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
+ setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
+ setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
+ setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);
+
+ setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);
+
+ setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);
+
+ addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);
+
+ setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FREM, MVT::v4f32, Expand);
+
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
+ setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);
+
+ setOperationAction(ISD::LOAD , MVT::v4f32, Custom);
+ setOperationAction(ISD::STORE , MVT::v4f32, Custom);
+
+ if (!Subtarget.useCRBits())
+ setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
+ setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
+ setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);
+
+ setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
+ setOperationAction(ISD::FABS , MVT::v4f32, Legal);
+ setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
+ setOperationAction(ISD::FPOWI , MVT::v4f32, Expand);
+ setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
+ setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
+ setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
+ setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
+ setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
+ setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);
+
+ setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+
+ setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);
+
+ addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);
+
+ setOperationAction(ISD::AND , MVT::v4i1, Legal);
+ setOperationAction(ISD::OR , MVT::v4i1, Legal);
+ setOperationAction(ISD::XOR , MVT::v4i1, Legal);
+
+ if (!Subtarget.useCRBits())
+ setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);
+
+ setOperationAction(ISD::LOAD , MVT::v4i1, Custom);
+ setOperationAction(ISD::STORE , MVT::v4i1, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
+ setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
+
+ addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);
+
+ setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
+ setOperationAction(ISD::FROUND, MVT::v4f64, Legal);
+
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
+
+ // These need to set FE_INEXACT, and so cannot be vectorized here.
+ setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
+ setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+
+ if (TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ } else {
+ setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);
+
+ setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
+ }
+ }
+
+ if (Subtarget.has64BitSupport())
+ setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
+
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
+
+ if (!isPPC64) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+ }
+
+ setBooleanContents(ZeroOrOneBooleanContent);
+
+ if (Subtarget.hasAltivec()) {
+ // Altivec instructions set fields to all zeros or all ones.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+ }
+
+ if (!isPPC64) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ }
+
+ setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
+ if (Subtarget.hasFPCVT())
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::BR_CC);
+ if (Subtarget.useCRBits())
+ setTargetDAGCombine(ISD::BRCOND);
+ setTargetDAGCombine(ISD::BSWAP);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+
+ if (Subtarget.useCRBits()) {
+ setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::SELECT_CC);
+ }
+
+ // Use reciprocal estimates.
+ if (TM.Options.UnsafeFPMath) {
+ setTargetDAGCombine(ISD::FDIV);
+ setTargetDAGCombine(ISD::FSQRT);
+ }
+
+ // Darwin long double math library functions have $LDBL128 appended.
+ if (Subtarget.isDarwin()) {
+ setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
+ setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
+ setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
+ setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
+ setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
+ setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
+ setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
+ setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
+ setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
+ setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
+ }
+
+ // With 32 condition bits, we don't need to sink (and duplicate) compares
+ // aggressively in CodeGenPrep.
+ if (Subtarget.useCRBits()) {
+ setHasMultipleConditionRegisters();
+ setJumpIsExpensive();
+ }
+
+ setMinFunctionAlignment(2);
+ if (Subtarget.isDarwin())
+ setPrefFunctionAlignment(4);
+
+ switch (Subtarget.getDarwinDirective()) {
+ default: break;
+ case PPC::DIR_970:
+ case PPC::DIR_A2:
+ case PPC::DIR_E500mc:
+ case PPC::DIR_E5500:
+ case PPC::DIR_PWR4:
+ case PPC::DIR_PWR5:
+ case PPC::DIR_PWR5X:
+ case PPC::DIR_PWR6:
+ case PPC::DIR_PWR6X:
+ case PPC::DIR_PWR7:
+ case PPC::DIR_PWR8:
+ case PPC::DIR_PWR9:
+ setPrefFunctionAlignment(4);
+ setPrefLoopAlignment(4);
+ break;
+ }
+
+ if (Subtarget.enableMachineScheduler())
+ setSchedulingPreference(Sched::Source);
+ else
+ setSchedulingPreference(Sched::Hybrid);
+
+ computeRegisterProperties(STI.getRegisterInfo());
+
+ // The Freescale cores do better with aggressive inlining of memcpy and
+ // friends. GCC uses same threshold of 128 bytes (= 32 word stores).
+ if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
+ Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
+ MaxStoresPerMemset = 32;
+ MaxStoresPerMemsetOptSize = 16;
+ MaxStoresPerMemcpy = 32;
+ MaxStoresPerMemcpyOptSize = 8;
+ MaxStoresPerMemmove = 32;
+ MaxStoresPerMemmoveOptSize = 8;
+ } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
+ // The A2 also benefits from (very) aggressive inlining of memcpy and
+ // friends. The overhead of the function call, even when warm, can be
+ // over one hundred cycles.
+ MaxStoresPerMemset = 128;
+ MaxStoresPerMemcpy = 128;
+ MaxStoresPerMemmove = 128;
+ }
+}
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment. Raises \p MaxAlign to
+/// the alignment wanted by the vector types found in \p Ty.
+///
+/// A vector of at least 256 bits asks for 32 bytes when \p MaxMaxAlign is at
+/// least 32; otherwise a vector of at least 128 bits asks for 16 bytes.
+/// Arrays and structs are searched recursively through their element types.
+/// Every other type leaves \p MaxAlign untouched. The search stops as soon as
+/// \p MaxAlign equals \p MaxMaxAlign.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
+                             unsigned MaxMaxAlign) {
+  // Once the cap is reached there is nothing left to look for.
+  if (MaxAlign == MaxMaxAlign)
+    return;
+
+  if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
+    const unsigned NumBits = VecTy->getBitWidth();
+    if (MaxMaxAlign >= 32 && NumBits >= 256)
+      MaxAlign = 32;
+    else if (NumBits >= 128 && MaxAlign < 16)
+      MaxAlign = 16;
+    return;
+  }
+
+  if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) {
+    // All array elements share one type, so a single probe is enough.
+    unsigned ElemAlign = 0;
+    getMaxByValAlign(ArrTy->getElementType(), ElemAlign, MaxMaxAlign);
+    if (MaxAlign < ElemAlign)
+      MaxAlign = ElemAlign;
+    return;
+  }
+
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (!StructTy)
+    return;
+
+  for (Type *MemberTy : StructTy->elements()) {
+    unsigned MemberAlign = 0;
+    getMaxByValAlign(MemberTy, MemberAlign, MaxMaxAlign);
+    if (MaxAlign < MemberAlign)
+      MaxAlign = MemberAlign;
+    // No later member can ask for more than the cap.
+    if (MaxAlign == MaxMaxAlign)
+      return;
+  }
+}
+
+/// getByValTypeAlignment - Compute the alignment, in bytes, used for a ByVal
+/// aggregate function argument of type \p Ty in the caller parameter area.
+unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
+                                                  const DataLayout &DL) const {
+  // Darwin places every argument on a 4-byte boundary.
+  if (Subtarget.isDarwin())
+    return 4;
+
+  // Baseline: 8 bytes on PPC64 and 4 bytes on PPC32.
+  unsigned Alignment = Subtarget.isPPC64() ? 8 : 4;
+
+  // Without a vector unit the baseline is final.
+  if (!Subtarget.hasAltivec() && !Subtarget.hasQPX())
+    return Alignment;
+
+  // Vectors of 16 bytes and wider are passed on a 16-byte boundary; with QPX
+  // the search may go up to 32 bytes.
+  const unsigned Cap = Subtarget.hasQPX() ? 32 : 16;
+  getMaxByValAlign(Ty, Alignment, Cap);
+  return Alignment;
+}
+
+/// Forward the subtarget's soft-float setting.
+bool PPCTargetLowering::useSoftFloat() const {
+  const bool SoftFloat = Subtarget.useSoftFloat();
+  return SoftFloat;
+}
+
+const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { // Map a PPCISD opcode to its printable "PPCISD::<NAME>" string.
+ switch ((PPCISD::NodeType)Opcode) { // No default label: only the opcodes listed below have a name.
+ case PPCISD::FIRST_NUMBER: break; // No name; leaves the switch and reaches the nullptr return.
+ case PPCISD::FSEL: return "PPCISD::FSEL";
+ case PPCISD::FCFID: return "PPCISD::FCFID";
+ case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
+ case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
+ case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
+ case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
+ case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
+ case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
+ case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
+ case PPCISD::FRE: return "PPCISD::FRE";
+ case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
+ case PPCISD::STFIWX: return "PPCISD::STFIWX";
+ case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
+ case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
+ case PPCISD::VPERM: return "PPCISD::VPERM";
+ case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
+ case PPCISD::XXINSERT: return "PPCISD::XXINSERT";
+ case PPCISD::VECSHL: return "PPCISD::VECSHL";
+ case PPCISD::CMPB: return "PPCISD::CMPB";
+ case PPCISD::Hi: return "PPCISD::Hi";
+ case PPCISD::Lo: return "PPCISD::Lo";
+ case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
+ case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
+ case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
+ case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
+ case PPCISD::SRL: return "PPCISD::SRL";
+ case PPCISD::SRA: return "PPCISD::SRA";
+ case PPCISD::SHL: return "PPCISD::SHL";
+ case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
+ case PPCISD::CALL: return "PPCISD::CALL";
+ case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
+ case PPCISD::MTCTR: return "PPCISD::MTCTR";
+ case PPCISD::BCTRL: return "PPCISD::BCTRL";
+ case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
+ case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
+ case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
+ case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
+ case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
+ case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
+ case PPCISD::MFVSR: return "PPCISD::MFVSR";
+ case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
+ case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
+ case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
+ case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
+ case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT";
+ case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT";
+ case PPCISD::VCMP: return "PPCISD::VCMP";
+ case PPCISD::VCMPo: return "PPCISD::VCMPo";
+ case PPCISD::LBRX: return "PPCISD::LBRX";
+ case PPCISD::STBRX: return "PPCISD::STBRX";
+ case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
+ case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
+ case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
+ case PPCISD::STXSIX: return "PPCISD::STXSIX";
+ case PPCISD::VEXTS: return "PPCISD::VEXTS";
+ case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
+ case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
+ case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
+ case PPCISD::BDNZ: return "PPCISD::BDNZ";
+ case PPCISD::BDZ: return "PPCISD::BDZ";
+ case PPCISD::MFFS: return "PPCISD::MFFS";
+ case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
+ case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
+ case PPCISD::CR6SET: return "PPCISD::CR6SET";
+ case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
+ case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
+ case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";
+ case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
+ case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
+ case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
+ case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
+ case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
+ case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
+ case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
+ case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
+ case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
+ case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
+ case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
+ case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
+ case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
+ case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
+ case PPCISD::SC: return "PPCISD::SC";
+ case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
+ case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
+ case PPCISD::RFEBB: return "PPCISD::RFEBB";
+ case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
+ case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
+ case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
+ case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
+ case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
+ case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
+ case PPCISD::QBFLT: return "PPCISD::QBFLT";
+ case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
+ }
+ return nullptr; // Reached whenever no case above returned a name.
+}
+
+/// Choose the result type of a SETCC whose operands have type \p VT.
+///
+/// Scalars yield i1 when CR bits are in use and i32 otherwise. Vectors yield
+/// a vector of i1 with the same element count when QPX is available, and
+/// otherwise \p VT with its element type changed to integer.
+EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
+                                          EVT VT) const {
+  if (VT.isVector()) {
+    if (Subtarget.hasQPX())
+      return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
+    return VT.changeVectorElementTypeToInteger();
+  }
+
+  if (Subtarget.useCRBits())
+    return MVT::i1;
+  return MVT::i32;
+}
+
+/// Aggressive FMA fusion is always enabled. \p VT is only inspected by the
+/// assertion, which requires a floating-point type.
+bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
+  const bool FuseAggressively = true;
+  return FuseAggressively;
+}
+
+//===----------------------------------------------------------------------===//
+// Node matching predicates, for use by the tblgen matching code.
+//===----------------------------------------------------------------------===//
+
+/// isFloatingPointZero - Return true if \p Op is 0.0 or -0.0, either as a
+/// floating-point constant node or as a load whose operand 1 is a
+/// constant-pool entry holding such a constant.
+static bool isFloatingPointZero(SDValue Op) {
+  if (auto *FPConst = dyn_cast<ConstantFPSDNode>(Op))
+    return FPConst->getValueAPF().isZero();
+
+  SDNode *Node = Op.getNode();
+  if (!ISD::isEXTLoad(Node) && !ISD::isNON_EXTLoad(Node))
+    return false;
+
+  // The constant may already have been legalized into the constant pool.
+  auto *PoolEntry = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1));
+  if (!PoolEntry)
+    return false;
+
+  if (const auto *PoolFP = dyn_cast<ConstantFP>(PoolEntry->getConstVal()))
+    return PoolFP->getValueAPF().isZero();
+  return false;
+}
+
+/// isConstantOrUndef - Check a single shuffle-mask entry. A negative \p Op
+/// stands for an undef element and matches anything; a non-negative \p Op
+/// matches only when it equals \p Val.
+static bool isConstantOrUndef(int Op, int Val) {
+  if (Op < 0)
+    return true;
+  return Op == Val;
+}
+
+/// isVPKUHUMShuffleMask - Return true if the mask of \p N is the shuffle mask
+/// for a VPKUHUM instruction.
+///
+/// \p ShuffleKind selects the form being matched:
+///   - 0: big-endian operation with two different inputs;
+///   - 1: either-endian operation with two identical inputs;
+///   - 2: little-endian operation with two different inputs, for which the
+///        input operands are swapped (see PPCInstrAltivec.td).
+/// Kinds 0 and 2 are rejected when the data layout has the other endianness.
+/// Any other \p ShuffleKind performs no check and yields true.
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  const bool LittleEndian = DAG.getDataLayout().isLittleEndian();
+
+  switch (ShuffleKind) {
+  case 0:
+    if (LittleEndian)
+      return false;
+    // Result byte Idx must be input byte 2*Idx+1.
+    for (unsigned Idx = 0; Idx != 16; ++Idx)
+      if (!isConstantOrUndef(N->getMaskElt(Idx), Idx * 2 + 1))
+        return false;
+    return true;
+
+  case 2:
+    if (!LittleEndian)
+      return false;
+    // Result byte Idx must be input byte 2*Idx.
+    for (unsigned Idx = 0; Idx != 16; ++Idx)
+      if (!isConstantOrUndef(N->getMaskElt(Idx), Idx * 2))
+        return false;
+    return true;
+
+  case 1: {
+    // With a single input, both result halves select the same bytes.
+    const unsigned Parity = LittleEndian ? 0 : 1;
+    for (unsigned Idx = 0; Idx != 8; ++Idx) {
+      const unsigned Wanted = Idx * 2 + Parity;
+      if (!isConstantOrUndef(N->getMaskElt(Idx), Wanted) ||
+          !isConstantOrUndef(N->getMaskElt(Idx + 8), Wanted))
+        return false;
+    }
+    return true;
+  }
+
+  default:
+    return true;
+  }
+}
+
+/// isVPKUWUMShuffleMask - Return true if the mask of \p N is the shuffle mask
+/// for a VPKUWUM instruction.
+///
+/// \p ShuffleKind selects the form being matched:
+///   - 0: big-endian operation with two different inputs;
+///   - 1: either-endian operation with two identical inputs;
+///   - 2: little-endian operation with two different inputs, for which the
+///        input operands are swapped (see PPCInstrAltivec.td).
+/// Kinds 0 and 2 are rejected when the data layout has the other endianness.
+/// Any other \p ShuffleKind performs no check and yields true.
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  const bool LittleEndian = DAG.getDataLayout().isLittleEndian();
+
+  switch (ShuffleKind) {
+  case 0:
+  case 2: {
+    // Kind 0 needs big endian, kind 2 needs little endian.
+    if (LittleEndian != (ShuffleKind == 2))
+      return false;
+    // Each result byte pair is one half of an input word: the half at
+    // offset 2 for kind 0, the half at offset 0 for kind 2.
+    const unsigned HalfOffset = (ShuffleKind == 0) ? 2 : 0;
+    for (unsigned Idx = 0; Idx != 16; Idx += 2)
+      for (unsigned Byte = 0; Byte != 2; ++Byte)
+        if (!isConstantOrUndef(N->getMaskElt(Idx + Byte),
+                               Idx * 2 + HalfOffset + Byte))
+          return false;
+    return true;
+  }
+
+  case 1: {
+    // With a single input, both result halves select the same bytes.
+    const unsigned HalfOffset = LittleEndian ? 0 : 2;
+    for (unsigned Idx = 0; Idx != 8; Idx += 2)
+      for (unsigned Byte = 0; Byte != 2; ++Byte) {
+        const unsigned Wanted = Idx * 2 + HalfOffset + Byte;
+        if (!isConstantOrUndef(N->getMaskElt(Idx + Byte), Wanted) ||
+            !isConstantOrUndef(N->getMaskElt(Idx + 8 + Byte), Wanted))
+          return false;
+      }
+    return true;
+  }
+
+  default:
+    return true;
+  }
+}
+
+/// isVPKUDUMShuffleMask - Return true if the mask of \p N is the shuffle mask
+/// for a VPKUDUM instruction AND the subtarget has P8 vector support.
+///
+/// \p ShuffleKind selects the form being matched:
+///   - 0: big-endian operation with two different inputs;
+///   - 1: either-endian operation with two identical inputs;
+///   - 2: little-endian operation with two different inputs, for which the
+///        input operands are swapped (see PPCInstrAltivec.td).
+/// Kinds 0 and 2 are rejected when the data layout has the other endianness.
+/// Any other \p ShuffleKind performs no mask check and yields true.
+bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  const auto &ST = static_cast<const PPCSubtarget &>(DAG.getSubtarget());
+  if (!ST.hasP8Vector())
+    return false;
+
+  const bool LittleEndian = DAG.getDataLayout().isLittleEndian();
+
+  switch (ShuffleKind) {
+  case 0:
+  case 2: {
+    // Kind 0 needs big endian, kind 2 needs little endian.
+    if (LittleEndian != (ShuffleKind == 2))
+      return false;
+    // Each group of four result bytes is one half of an input doubleword:
+    // the half at offset 4 for kind 0, the half at offset 0 for kind 2.
+    const unsigned HalfOffset = (ShuffleKind == 0) ? 4 : 0;
+    for (unsigned Idx = 0; Idx != 16; Idx += 4)
+      for (unsigned Byte = 0; Byte != 4; ++Byte)
+        if (!isConstantOrUndef(N->getMaskElt(Idx + Byte),
+                               Idx * 2 + HalfOffset + Byte))
+          return false;
+    return true;
+  }
+
+  case 1: {
+    // With a single input, both result halves select the same bytes.
+    const unsigned HalfOffset = LittleEndian ? 0 : 4;
+    for (unsigned Idx = 0; Idx != 8; Idx += 4)
+      for (unsigned Byte = 0; Byte != 4; ++Byte) {
+        const unsigned Wanted = Idx * 2 + HalfOffset + Byte;
+        if (!isConstantOrUndef(N->getMaskElt(Idx + Byte), Wanted) ||
+            !isConstantOrUndef(N->getMaskElt(Idx + 8 + Byte), Wanted))
+          return false;
+      }
+    return true;
+  }
+
+  default:
+    return true;
+  }
+}
+
+/// isVMerge - Common matcher for the vmrg* shuffles.
+///
+/// Accepts only v16i8 shuffles. The result must interleave units of
+/// \p UnitSize bytes (1, 2 or 4): unit k of the left source, whose first
+/// byte index is \p LHSStart, followed by unit k of the right source, whose
+/// first byte index is \p RHSStart.
+static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
+                     unsigned LHSStart, unsigned RHSStart) {
+  if (N->getValueType(0) != MVT::v16i8)
+    return false;
+  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
+         "Unsupported merge size!");
+
+  const unsigned NumUnits = 8 / UnitSize;
+  for (unsigned Unit = 0; Unit != NumUnits; ++Unit) {
+    // Each source unit fills two unit-sized slots of the result.
+    const unsigned MaskBase = Unit * UnitSize * 2;
+    const unsigned SrcOffset = Unit * UnitSize;
+    for (unsigned Byte = 0; Byte != UnitSize; ++Byte) {
+      if (!isConstantOrUndef(N->getMaskElt(MaskBase + Byte),
+                             LHSStart + Byte + SrcOffset))
+        return false;
+      if (!isConstantOrUndef(N->getMaskElt(MaskBase + UnitSize + Byte),
+                             RHSStart + Byte + SrcOffset))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// isVMRGLShuffleMask - Return true if the mask of \p N is suitable for a
+/// VMRGL* instruction with the given unit size (1, 2 or 4 bytes).
+///
+/// \p ShuffleKind selects the form being matched:
+///   - 0: big-endian merge with two different inputs;
+///   - 1: either-endian merge with two identical inputs;
+///   - 2: little-endian merge with two different inputs, for which the input
+///        operands are swapped (see PPCInstrAltivec.td).
+/// A kind that does not fit the data layout's endianness yields false.
+bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             unsigned ShuffleKind, SelectionDAG &DAG) {
+  const bool LittleEndian = DAG.getDataLayout().isLittleEndian();
+
+  // The merge reads bytes from index 0 on little endian and from index 8 on
+  // big endian.
+  const unsigned LHSStart = LittleEndian ? 0 : 8;
+
+  if (ShuffleKind == 1) // unary
+    return isVMerge(N, UnitSize, LHSStart, LHSStart);
+
+  // Two-input form: "swapped" (2) on little endian, "normal" (0) on big
+  // endian. The right source starts 16 bytes further on.
+  const unsigned TwoInputKind = LittleEndian ? 2 : 0;
+  if (ShuffleKind == TwoInputKind)
+    return isVMerge(N, UnitSize, LHSStart, LHSStart + 16);
+
+  return false;
+}
+
+/// isVMRGHShuffleMask - Return true if the mask of \p N is suitable for a
+/// VMRGH* instruction with the given unit size (1, 2 or 4 bytes).
+///
+/// \p ShuffleKind selects the form being matched:
+///   - 0: big-endian merge with two different inputs;
+///   - 1: either-endian merge with two identical inputs;
+///   - 2: little-endian merge with two different inputs, for which the input
+///        operands are swapped (see PPCInstrAltivec.td).
+/// A kind that does not fit the data layout's endianness yields false.
+bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             unsigned ShuffleKind, SelectionDAG &DAG) {
+  const bool LittleEndian = DAG.getDataLayout().isLittleEndian();
+
+  // The merge reads bytes from index 8 on little endian and from index 0 on
+  // big endian.
+  const unsigned LHSStart = LittleEndian ? 8 : 0;
+
+  if (ShuffleKind == 1) // unary
+    return isVMerge(N, UnitSize, LHSStart, LHSStart);
+
+  // Two-input form: "swapped" (2) on little endian, "normal" (0) on big
+  // endian. The right source starts 16 bytes further on.
+  const unsigned TwoInputKind = LittleEndian ? 2 : 0;
+  if (ShuffleKind == TwoInputKind)
+    return isVMerge(N, UnitSize, LHSStart, LHSStart + 16);
+
+  return false;
+}
+
+/**
+ * \brief Shared matcher for the vmrgew and vmrgow word-merge shuffles.
+ *
+ * IndexOffset selects even or odd words in the shuffle mask. Which value
+ * means which depends on the endianness of the target machine:
+ *   - Little Endian:
+ *     - an offset of 0 checks for odd elements
+ *     - an offset of 4 checks for even elements
+ *   - Big Endian:
+ *     - an offset of 0 checks for even elements
+ *     - an offset of 4 checks for odd elements
+ * The vector element ordering for little endian and big endian is described
+ * in "Targeting your applications - what little endian and big endian IBM XL
+ * C/C++ compiler differences mean to you":
+ * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
+ *
+ * The shuffle mask lists, in array-access order, the indices of the elements
+ * taken from the two input vectors, numbering the first vector's elements
+ * first. Only v16i8 shuffles are accepted, so each input vector contributes
+ * 16 elements of size 8. See the shufflevector entry of the Language
+ * Reference:
+ * http://llvm.org/docs/LangRef.html#shufflevector-instruction
+ *
+ * RHSStartValue tells the matcher where the right-hand input starts:
+ *   - 0 when both inputs are the same vector (unary); the indices then range
+ *     from 0 to 15;
+ *   - 16 when the inputs are two different vectors; indices 0-15 then name
+ *     elements of the first vector and indices 16-31 elements of the second.
+ *
+ * \param[in] N The shuffle vector SD Node to analyze
+ * \param[in] IndexOffset Specifies whether to look for even or odd elements
+ * \param[in] RHSStartValue Specifies the starting index for the righthand input
+ * vector to the shuffle_vector instruction
+ * \return true iff this shuffle vector represents an even or odd word merge
+ */
+static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
+                     unsigned RHSStartValue) {
+  if (N->getValueType(0) != MVT::v16i8)
+    return false;
+
+  // Input 0 is the left source and input 1 the right source. Each supplies
+  // one word to the low half of the result and one to the high half.
+  for (unsigned Input = 0; Input != 2; ++Input) {
+    const unsigned SrcBase = Input * RHSStartValue + IndexOffset;
+    for (unsigned Byte = 0; Byte != 4; ++Byte) {
+      const unsigned MaskIdx = Input * 4 + Byte;
+      if (!isConstantOrUndef(N->getMaskElt(MaskIdx), SrcBase + Byte) ||
+          !isConstantOrUndef(N->getMaskElt(MaskIdx + 8), SrcBase + Byte + 8))
+        return false;
+    }
+  }
+  return true;
+}
+
+/**
+ * \brief Determine if the specified shuffle mask is suitable for the vmrgew or
+ * vmrgow instructions.
+ *
+ * \param[in] N The shuffle vector SD Node to analyze
+ * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
+ * \param[in] ShuffleKind Identify the type of merge:
+ *   - 0 = big-endian merge with two different inputs;
+ *   - 1 = either-endian merge with two identical inputs;
+ *   - 2 = little-endian merge with two different inputs (inputs are swapped for
+ *     little-endian merges).
+ * \param[in] DAG The current SelectionDAG
+ * \return true iff this shuffle mask is the requested even or odd word merge
+ * for the given ShuffleKind; a ShuffleKind that does not fit the data
+ * layout's endianness yields false
+ */
+bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
+                              unsigned ShuffleKind, SelectionDAG &DAG) {
+  const bool LittleEndian = DAG.getDataLayout().isLittleEndian();
+
+  // Offset 4 selects even words on little endian and odd words on big endian;
+  // offset 0 selects the opposite.
+  const unsigned IndexOffset = (CheckEven == LittleEndian) ? 4 : 0;
+
+  if (ShuffleKind == 1) // Unary
+    return isVMerge(N, IndexOffset, 0);
+
+  // Two-input form: "swapped" (2) on little endian, "normal" (0) on big
+  // endian.
+  const unsigned TwoInputKind = LittleEndian ? 2 : 0;
+  if (ShuffleKind == TwoInputKind)
+    return isVMerge(N, IndexOffset, 16);
+
+  return false;
+}
+
+/// isVSLDOIShuffleMask - If the mask of \p N is a vsldoi shuffle mask, return
+/// the shift amount; otherwise return -1.
+///
+/// \p ShuffleKind selects the form being matched:
+///   - 0: big-endian operation with two different inputs;
+///   - 1: either-endian operation with two identical inputs;
+///   - 2: little-endian operation with two different inputs, for which the
+///        input operands are swapped (see PPCInstrAltivec.td).
+/// Only v16i8 shuffles are accepted, and an all-undef mask is rejected. On
+/// little endian the returned amount is 16 minus the distance found in the
+/// mask.
+int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+                             SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::v16i8)
+    return -1;
+
+  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N);
+
+  // Locate the first mask entry that is not undef.
+  unsigned First = 0;
+  while (First != 16 && Shuffle->getMaskElt(First) < 0)
+    ++First;
+  if (First == 16)
+    return -1; // all undef.
+
+  // The distance between that entry's value and its position is the candidate
+  // shift; the value may not lie before the position.
+  const unsigned FirstValue = Shuffle->getMaskElt(First);
+  if (FirstValue < First)
+    return -1;
+  const unsigned ShiftAmt = FirstValue - First;
+
+  const bool LittleEndian = DAG.getDataLayout().isLittleEndian();
+  const bool TwoInputs = ShuffleKind == (LittleEndian ? 2u : 0u);
+  if (!TwoInputs && ShuffleKind != 1)
+    return -1;
+
+  // Every later entry must continue the consecutive numbering. With a single
+  // input the numbering wraps around within the 16 bytes.
+  for (unsigned Idx = First + 1; Idx != 16; ++Idx) {
+    unsigned Wanted = ShiftAmt + Idx;
+    if (!TwoInputs)
+      Wanted &= 15;
+    if (!isConstantOrUndef(Shuffle->getMaskElt(Idx), Wanted))
+      return -1;
+  }
+
+  return LittleEndian ? 16 - ShiftAmt : ShiftAmt;
+}
+
+/// isSplatShuffleMask - Return true if the VECTOR_SHUFFLE mask of \p N
+/// specifies a splat of a single element of \p EltSize bytes (1, 2 or 4)
+/// that is suitable for input to VSPLTB/VSPLTH/VSPLTW.
+///
+/// \p N must be a v16i8 shuffle (asserted). The splatted element must come
+/// from the first input vector and start on an element boundary.
+bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
+  assert(N->getValueType(0) == MVT::v16i8 &&
+         (EltSize == 1 || EltSize == 2 || EltSize == 4));
+
+  // The leading bytes must name one whole element rather than straddle two,
+  // so give up at once when the first index is not element-aligned.
+  if (N->getMaskElt(0) % EltSize != 0)
+    return false;
+
+  // A splat repeats one element and never references the second vector.
+  const unsigned ElementBase = N->getMaskElt(0);
+
+  // FIXME: Handle UNDEF elements too!
+  if (ElementBase >= 16)
+    return false;
+
+  // For a multi-byte element the first EltSize indices must be consecutive.
+  for (unsigned Byte = 1; Byte != EltSize; ++Byte) {
+    const int MaskByte = N->getMaskElt(Byte);
+    if (MaskByte < 0 || MaskByte != (int)(Byte + ElementBase))
+      return false;
+  }
+
+  // Every later element must repeat the first one, unless its leading byte
+  // is undef, in which case the whole element is skipped.
+  for (unsigned Elt = EltSize, End = 16; Elt != End; Elt += EltSize) {
+    if (N->getMaskElt(Elt) < 0)
+      continue;
+    for (unsigned Byte = 0; Byte != EltSize; ++Byte)
+      if (N->getMaskElt(Elt + Byte) != N->getMaskElt(Byte))
+        return false;
+  }
+  return true;
+}
+
+/// Decide whether the byte shuffle mask of \p N moves whole words and leaves
+/// three of the four result words in place while replacing the fourth.
+///
+/// On a match the function returns true and sets:
+///   - \p ShiftElts: a value looked up by the replacing word's index, from
+///     the little-endian or big-endian table according to \p IsLE;
+///   - \p InsertAtByte: 0, 4, 8 or 12, chosen by the position of the replaced
+///     word and by \p IsLE;
+///   - \p Swap: true when the replacing word comes from the first input and
+///     the three words kept in place come from the second.
+///
+/// If no two-input pattern matches and operand 1 of \p N is undef, \p ShiftElts
+/// is set to 0 and \p Swap to true before the single-input patterns are
+/// tried; both stay modified even when the function then returns false.
+bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+                          unsigned &InsertAtByte, bool &Swap, bool IsLE) {
+
+  // Check that the mask is shuffling words: each group of four mask bytes
+  // must start on a word boundary and be consecutive.
+  for (unsigned Word = 0; Word != 4; ++Word) {
+    const unsigned FirstByte = N->getMaskElt(Word * 4);
+    if (FirstByte % 4)
+      return false;
+    for (unsigned Byte = 1; Byte != 4; ++Byte) {
+      const unsigned MaskByte = N->getMaskElt(Word * 4 + Byte);
+      if (MaskByte != FirstByte + Byte)
+        return false;
+    }
+  }
+
+  // Word-granular view of the mask, taken from mask elements 0, 4, 8, 12.
+  unsigned WordIdx[4];
+  for (unsigned Word = 0; Word != 4; ++Word)
+    WordIdx[Word] = N->getMaskElt(Word * 4) / 4;
+
+  const unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
+  const unsigned BigEndianShifts[] = { 3, 0, 1, 2 };
+
+  // True when every word other than Skip holds its own position plus Bias.
+  // Bias 0 means "in place from the first input", 4 "from the second input".
+  auto OthersInPlace = [&WordIdx](unsigned Skip, unsigned Bias) {
+    for (unsigned Word = 0; Word != 4; ++Word)
+      if (Word != Skip && WordIdx[Word] != Word + Bias)
+        return false;
+    return true;
+  };
+
+  // Two-input patterns. Let H be a word index in the range [4,7] and L one in
+  // the range [0,3]. For each position in turn, accept H surrounded by
+  // 0,1,2,3 or L surrounded by 4,5,6,7.
+  for (unsigned Pos = 0; Pos != 4; ++Pos) {
+    const unsigned Src = WordIdx[Pos];
+    const bool FromSecond = Src > 3 && OthersInPlace(Pos, 0);
+    const bool FromFirst = Src < 4 && OthersInPlace(Pos, 4);
+    if (!FromSecond && !FromFirst)
+      continue;
+    ShiftElts = IsLE ? LittleEndianShifts[Src & 0x3] : BigEndianShifts[Src & 0x3];
+    InsertAtByte = IsLE ? 12 - 4 * Pos : 4 * Pos;
+    Swap = Src < 4;
+    return true;
+  }
+
+  // If both vector operands for the shuffle are the same vector, the mask will
+  // contain only elements from the first one and the second one will be undef.
+  if (!N->getOperand(1).isUndef())
+    return false;
+
+  ShiftElts = 0;
+  Swap = true;
+  const unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
+  for (unsigned Pos = 0; Pos != 4; ++Pos) {
+    if (WordIdx[Pos] != XXINSERTWSrcElem || !OthersInPlace(Pos, 0))
+      continue;
+    InsertAtByte = IsLE ? 12 - 4 * Pos : 4 * Pos;
+    return true;
+  }
+
+  return false;
+}
+
+/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+/// element selected by the isSplatShuffleMask-conforming VECTOR_SHUFFLE mask
+/// of \p N. On little endian the element index is counted from the other end
+/// of the vector.
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
+                                SelectionDAG &DAG) {
+  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N);
+  assert(isSplatShuffleMask(Shuffle, EltSize));
+
+  // Index of the splatted element, in units of EltSize bytes.
+  const unsigned EltIndex = Shuffle->getMaskElt(0) / EltSize;
+  if (!DAG.getDataLayout().isLittleEndian())
+    return EltIndex;
+  return (16 / EltSize) - 1 - EltIndex;
+}
+
+/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
+/// by using a vspltis[bhw] instruction of the specified element size, return
+/// the constant being splatted. The ByteSize field indicates the number of
+/// bytes of each element [124] -> [bhw]. An empty SDValue is returned when no such immediate exists.
+SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+ SDValue OpVal(nullptr, 0);
+
+ // If ByteSize of the splat is bigger than the element size of the
+ // build_vector, then we have a case where we are checking for a splat where
+ // multiple elements of the buildvector are folded together into a single
+ // logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
+ unsigned EltSize = 16/N->getNumOperands();
+ if (EltSize < ByteSize) {
+ unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
+ SDValue UniquedVals[4];
+ assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
+
+ // See if all of the elements in the buildvector agree across.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ if (N->getOperand(i).isUndef()) continue;
+ // If the element isn't a constant, bail fully out.
+ if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
+
+
+ if (!UniquedVals[i&(Multiple-1)].getNode()) // First constant seen for this slot within a chunk: remember it.
+ UniquedVals[i&(Multiple-1)] = N->getOperand(i);
+ else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
+ return SDValue(); // no match.
+ }
+
+ // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
+ // either constant or undef values that are identical for each chunk. See
+ // if these chunks can form into a larger vspltis*.
+
+ // Check to see if all of the leading entries are either 0 or -1. If
+ // neither, then this won't fit into the immediate field.
+ bool LeadingZero = true;
+ bool LeadingOnes = true;
+ for (unsigned i = 0; i != Multiple-1; ++i) {
+ if (!UniquedVals[i].getNode()) continue; // Must have been undefs.
+
+ LeadingZero &= isNullConstant(UniquedVals[i]);
+ LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
+ }
+ // Finally, check the least significant entry.
+ if (LeadingZero) {
+ if (!UniquedVals[Multiple-1].getNode())
+ return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef
+ int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
+ if (Val < 16) // 0,0,0,4 -> vspltisw(4)
+ return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
+ }
+ if (LeadingOnes) {
+ if (!UniquedVals[Multiple-1].getNode())
+ return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
+ int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
+ if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2). NOTE(review): only the lower bound is checked; confirm a non-negative Val cannot reach this point with constant -1 leading entries.
+ return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
+ }
+
+ return SDValue();
+ }
+
+ // Check to see if this buildvec has a single non-undef value in its elements.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ if (N->getOperand(i).isUndef()) continue;
+ if (!OpVal.getNode())
+ OpVal = N->getOperand(i);
+ else if (OpVal != N->getOperand(i))
+ return SDValue();
+ }
+
+ if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.
+
+ unsigned ValSizeInBytes = EltSize;
+ uint64_t Value = 0; // Stays 0 if OpVal is neither an integer nor an FP constant; the zero check below then rejects it.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+ Value = CN->getZExtValue();
+ } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
+ assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
+ Value = FloatToBits(CN->getValueAPF().convertToFloat());
+ }
+
+ // If the splat value is larger than the element value, then we can never do
+ // this splat. The only case that we could fit the replicated bits into our
+ // immediate field for would be zero, and we prefer to use vxor for it.
+ if (ValSizeInBytes < ByteSize) return SDValue();
+
+ // If the element value is larger than the splat value, check if it consists
+ // of a repeated bit pattern of size ByteSize.
+ if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
+ return SDValue();
+
+ // Properly sign extend the value.
+ int MaskVal = SignExtend32(Value, ByteSize * 8);
+
+ // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
+ if (MaskVal == 0) return SDValue();
+
+ // Finally, if this value fits in a 5 bit sext field, return it
+ if (SignExtend32<5>(MaskVal) == MaskVal)
+ return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);
+ return SDValue();
+}
+
+/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift
+/// amount, otherwise return -1. Only v4f64, v4f32 and v4i1 result types are accepted; an all-undef mask also yields -1.
+int PPC::isQVALIGNIShuffleMask(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
+ return -1;
+
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+
+ // Find the first non-undef value in the shuffle mask.
+ unsigned i;
+ for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
+ /*search*/;
+
+ if (i == 4) return -1; // all undef.
+
+ // Otherwise, check to see if the rest of the elements are consecutively
+ // numbered from this value.
+ unsigned ShiftAmt = SVOp->getMaskElt(i);
+ if (ShiftAmt < i) return -1; // The implied value for element 0 would be negative.
+ ShiftAmt -= i; // ShiftAmt is now the mask value that element 0 would have.
+
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != 4; ++i)
+ if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+ return -1;
+
+ return ShiftAmt;
+}
+
+//===----------------------------------------------------------------------===//
+// Addressing Mode Selection
+//===----------------------------------------------------------------------===//
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value. If so, this returns true and the
+/// immediate. Note that Imm is overwritten whenever N is a ConstantSDNode, even when the result is false.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+ if (!isa<ConstantSDNode>(N))
+ return false;
+
+ Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); // Truncate to 16 bits, then check below that widening it again reproduces the constant.
+ if (N->getValueType(0) == MVT::i32)
+ return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+ else // Any other value type is compared as a 64-bit quantity.
+ return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+static bool isIntS16Immediate(SDValue Op, short &Imm) { // SDValue convenience overload: forwards to the SDNode* version.
+ return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+/// SelectAddressRegReg - Given the specified address, check to see if it
+/// can be represented as an indexed [r+r] operation. Returns false if it
+/// can be more efficiently represented with [r+imm]. Base and Index are only written when true is returned.
+bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
+ SDValue &Index,
+ SelectionDAG &DAG) const {
+ short imm = 0; // Only used as an out-parameter for isIntS16Immediate; its value is not read here.
+ if (N.getOpcode() == ISD::ADD) {
+ if (isIntS16Immediate(N.getOperand(1), imm))
+ return false; // r+i
+ if (N.getOperand(1).getOpcode() == PPCISD::Lo)
+ return false; // r+i
+
+ Base = N.getOperand(0);
+ Index = N.getOperand(1);
+ return true;
+ } else if (N.getOpcode() == ISD::OR) {
+ if (isIntS16Immediate(N.getOperand(1), imm))
+ return false; // r+i can fold it if we can.
+
+ // If this is an or of disjoint bitfields, we can codegen this as an add
+ // (for better address arithmetic) if the LHS and RHS of the OR are provably
+ // disjoint.
+ APInt LHSKnownZero, LHSKnownOne;
+ APInt RHSKnownZero, RHSKnownOne;
+ DAG.computeKnownBits(N.getOperand(0),
+ LHSKnownZero, LHSKnownOne);
+
+ if (LHSKnownZero.getBoolValue()) { // Only analyse the RHS when at least one LHS bit is known zero.
+ DAG.computeKnownBits(N.getOperand(1),
+ RHSKnownZero, RHSKnownOne);
+ // If all of the bits are known zero on the LHS or RHS, the add won't
+ // carry.
+ if (~(LHSKnownZero | RHSKnownZero) == 0) {
+ Base = N.getOperand(0);
+ Index = N.getOperand(1);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+// If we happen to be doing an i64 load or store into a stack slot that has
+// less than a 4-byte alignment, then the frame-index elimination may need to
+// use an indexed load or store instruction (because the offset may not be a
+// multiple of 4). The extra register needed to hold the offset comes from the
+// register scavenger, and it is possible that the scavenger will need to use
+// an emergency spill slot. As a result, we need to make sure that a spill slot
+// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
+// stack slot. This helper does nothing unless VT is i64, FrameIdx is non-negative and the slot alignment is below 4.
+static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
+ // FIXME: This does not handle the LWA case.
+ if (VT != MVT::i64)
+ return;
+
+ // NOTE: We'll exclude negative FIs here, which come from argument
+ // lowering, because there are no known test cases triggering this problem
+ // using packed structures (or similar). We can remove this exclusion if
+ // we find such a test case. The reason why this is so test-case driven is
+ // because this entire 'fixup' is only to prevent crashes (from the
+ // register scavenger) on not-really-valid inputs. For example, if we have:
+ // %a = alloca i1
+ // %b = bitcast i1* %a to i64*
+ // store i64* a, i64 b
+ // then the store should really be marked as 'align 1', but is not. If it
+ // were marked as 'align 1' then the indexed form would have been
+ // instruction-selected initially, and the problem this 'fixup' is preventing
+ // won't happen regardless.
+ if (FrameIdx < 0)
+ return;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ unsigned Align = MFI.getObjectAlignment(FrameIdx);
+ if (Align >= 4) // Slots aligned to 4 bytes or more need no fixup.
+ return;
+
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ FuncInfo->setHasNonRISpills(); // Presumably this flag is what makes the emergency spill slot get reserved - confirm against the frame lowering code.
+}
+
+/// Returns true if the address N can be represented by a base register plus
+/// a signed 16-bit displacement [r+imm], and if it is not better
+/// represented as reg+reg. If Aligned is true, only accept displacements
+/// suitable for STD and friends, i.e. multiples of 4. When no better form matches, the address is emitted as [N + 0].
+bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
+ SDValue &Base,
+ SelectionDAG &DAG,
+ bool Aligned) const {
+ // FIXME dl should come from parent load or store, not from address
+ SDLoc dl(N);
+ // If this can be more profitably realized as r+r, fail.
+ if (SelectAddressRegReg(N, Disp, Base, DAG))
+ return false;
+
+ if (N.getOpcode() == ISD::ADD) {
+ short imm = 0;
+ if (isIntS16Immediate(N.getOperand(1), imm) &&
+ (!Aligned || (imm & 3) == 0)) {
+ Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+ } else {
+ Base = N.getOperand(0);
+ }
+ return true; // [r+i]
+ } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
+ // Match LOAD (ADD (X, Lo(G))).
+ assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+ && "Cannot handle constant offsets yet!");
+ Disp = N.getOperand(1).getOperand(0); // The global address.
+ assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+ Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
+ Disp.getOpcode() == ISD::TargetConstantPool ||
+ Disp.getOpcode() == ISD::TargetJumpTable);
+ Base = N.getOperand(0);
+ return true; // [&g+r]
+ }
+ } else if (N.getOpcode() == ISD::OR) {
+ short imm = 0;
+ if (isIntS16Immediate(N.getOperand(1), imm) &&
+ (!Aligned || (imm & 3) == 0)) {
+ // If this is an or of disjoint bitfields, we can codegen this as an add
+ // (for better address arithmetic) if the LHS and RHS of the OR are
+ // provably disjoint.
+ APInt LHSKnownZero, LHSKnownOne;
+ DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
+
+ if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // Every bit set in imm is known to be zero in the LHS.
+ // If all of the bits are known zero on the LHS or RHS, the add won't
+ // carry.
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+ } else {
+ Base = N.getOperand(0);
+ }
+ Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
+ return true;
+ }
+ }
+ } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+ // Loading from a constant address.
+
+ // If this address fits entirely in a 16-bit sext immediate field, codegen
+ // this as "d, 0"
+ short Imm;
+ if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) {
+ Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
+ Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ CN->getValueType(0));
+ return true;
+ }
+
+ // Handle 32-bit sext immediates with LIS + addr mode.
+ if ((CN->getValueType(0) == MVT::i32 ||
+ (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
+ (!Aligned || (CN->getZExtValue() & 3) == 0)) {
+ int Addr = (int)CN->getZExtValue();
+
+ // Otherwise, break this down into an LIS + disp.
+ Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);
+
+ Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl, // High half, adjusted for the sign of the low 16 bits used as Disp.
+ MVT::i32);
+ unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+ Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
+ return true;
+ }
+ }
+
+ Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); // Fallback: use the whole address as the base with a zero displacement.
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+ } else
+ Base = N;
+ return true; // [r+0]
+}
+
+/// SelectAddressRegRegOnly - Given the specified address, force it to be
+/// represented as an indexed [r+r] operation. Every path through this function returns true.
+bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
+ SDValue &Index,
+ SelectionDAG &DAG) const {
+ // Check to see if we can easily represent this as an [r+r] address. This
+ // will fail if it thinks that the address is more profitably represented as
+ // reg+imm, e.g. where imm = 0.
+ if (SelectAddressRegReg(N, Base, Index, DAG))
+ return true;
+
+ // If the operand is an addition, always emit this as [r+r], since this is
+ // better (for code size, and execution, as the memop does the add for free)
+ // than emitting an explicit add.
+ if (N.getOpcode() == ISD::ADD) {
+ Base = N.getOperand(0);
+ Index = N.getOperand(1);
+ return true;
+ }
+
+ // Otherwise, do it the hard way, using R0 as the base register.
+ Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ N.getValueType());
+ Index = N; // The whole address becomes the index operand.
+ return true;
+}
+
+/// getPreIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address. Only LoadSDNode and StoreSDNode inputs are handled; AM is always set to ISD::PRE_INC on success.
+bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ if (DisablePPCPreinc) return false;
+
+ bool isLoad = true;
+ SDValue Ptr;
+ EVT VT;
+ unsigned Alignment;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ Ptr = LD->getBasePtr();
+ VT = LD->getMemoryVT();
+ Alignment = LD->getAlignment();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ Ptr = ST->getBasePtr();
+ VT = ST->getMemoryVT();
+ Alignment = ST->getAlignment();
+ isLoad = false;
+ } else
+ return false;
+
+ // PowerPC doesn't have preinc load/store instructions for vectors (except
+ // for QPX, which does have preinc r+r forms).
+ if (VT.isVector()) {
+ if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) {
+ return false;
+ } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { // NOTE(review): Offset/Base are passed in the opposite order to SelectAddressRegRegOnly's (Base, Index) parameters - confirm this is intended.
+ AM = ISD::PRE_INC;
+ return true;
+ }
+ }
+
+ if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
+
+ // Common code will reject creating a pre-inc form if the base pointer
+ // is a frame index, or if N is a store and the base pointer is either
+ // the same as or a predecessor of the value being stored. Check for
+ // those situations here, and try with swapped Base/Offset instead.
+ bool Swap = false;
+
+ if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
+ Swap = true;
+ else if (!isLoad) {
+ SDValue Val = cast<StoreSDNode>(N)->getValue();
+ if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
+ Swap = true;
+ }
+
+ if (Swap)
+ std::swap(Base, Offset);
+
+ AM = ISD::PRE_INC;
+ return true;
+ }
+
+ // LDU/STU can only handle immediates that are a multiple of 4.
+ if (VT != MVT::i64) { // Non-i64 accesses accept any 16-bit displacement (Aligned = false).
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false))
+ return false;
+ } else {
+ // LDU/STU need an address with at least 4-byte alignment.
+ if (Alignment < 4)
+ return false;
+
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true))
+ return false;
+ }
+
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
+ // sext i32 to i64 when addr mode is r+i.
+ if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
+ LD->getExtensionType() == ISD::SEXTLOAD &&
+ isa<ConstantSDNode>(Offset))
+ return false;
+ }
+
+ AM = ISD::PRE_INC;
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// LowerOperation implementation
+//===----------------------------------------------------------------------===//
+
+/// Set HiOpFlags and LoOpFlags to the target MO flags used to reference a label: MO_HA / MO_LO, plus MO_PIC_FLAG when IsPIC is set.
+/// If GV is given and needs a lazy resolver stub, the non-lazy-pointer flags are added as well. Nothing is returned.
+static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
+ unsigned &HiOpFlags, unsigned &LoOpFlags,
+ const GlobalValue *GV = nullptr) {
+ HiOpFlags = PPCII::MO_HA;
+ LoOpFlags = PPCII::MO_LO;
+
+ // Don't use the pic base if not in PIC relocation model.
+ if (IsPIC) {
+ HiOpFlags |= PPCII::MO_PIC_FLAG;
+ LoOpFlags |= PPCII::MO_PIC_FLAG;
+ }
+
+ // If this is a reference to a global value that requires a non-lazy-ptr, make
+ // sure that instruction lowering adds it.
+ if (GV && Subtarget.hasLazyResolverStub(GV)) {
+ HiOpFlags |= PPCII::MO_NLP_FLAG;
+ LoOpFlags |= PPCII::MO_NLP_FLAG;
+
+ if (GV->hasHiddenVisibility()) { // Hidden globals additionally get the hidden non-lazy-pointer flag.
+ HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
+ LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
+ }
+ }
+}
+/// Build the address of a label as Hi(HiPart) + Lo(LoPart); when isPIC is set the global base register is first added to the high part.
+static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
+ SelectionDAG &DAG) {
+ SDLoc DL(HiPart);
+ EVT PtrVT = HiPart.getValueType();
+ SDValue Zero = DAG.getConstant(0, DL, PtrVT);
+
+ SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
+ SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);
+
+ // With PIC, the first instruction is actually "GR+hi(&G)".
+ if (isPIC)
+ Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);
+
+ // Generate non-pic code that has direct accesses to the constant pool.
+ // The address of the global is just (hi(&g)+lo(&g)).
+ return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+}
+/// Record on MF's PPCFunctionInfo that the function uses the TOC base pointer.
+static void setUsesTOCBasePtr(MachineFunction &MF) {
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ FuncInfo->setUsesTOCBasePtr();
+}
+/// Convenience overload: marks the machine function currently being built by DAG.
+static void setUsesTOCBasePtr(SelectionDAG &DAG) {
+ setUsesTOCBasePtr(DAG.getMachineFunction());
+}
+/// Build a PPCISD::TOC_ENTRY memory-intrinsic node for GA, relative to X2 when Is64Bit is set and to the global base register otherwise.
+static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit,
+ SDValue GA) {
+ EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
+ SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) :
+ DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
+
+ SDValue Ops[] = { GA, Reg };
+ return DAG.getMemIntrinsicNode(
+ PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, // The memory operand is described as a GOT access.
+ false, 0);
+}
+/// Lower a ConstantPool node: via a TOC entry for 64-bit SVR4 and for 32-bit SVR4 PIC, otherwise as a Hi/Lo label reference.
+SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT PtrVT = Op.getValueType();
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ const Constant *C = CP->getConstVal();
+
+ // 64-bit SVR4 ABI code is always position-independent.
+ // The actual address of the GlobalValue is stored in the TOC.
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+ setUsesTOCBasePtr(DAG);
+ SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+ return getTOCEntry(DAG, SDLoc(CP), true, GA);
+ }
+
+ unsigned MOHiFlag, MOLoFlag;
+ bool IsPIC = isPositionIndependent();
+ getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+
+ if (IsPIC && Subtarget.isSVR4ABI()) { // 32-bit SVR4 PIC (the 64-bit case returned above).
+ SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
+ PPCII::MO_PIC_FLAG);
+ return getTOCEntry(DAG, SDLoc(CP), false, GA);
+ }
+
+ SDValue CPIHi =
+ DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
+ SDValue CPILo =
+ DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
+ return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
+}
+
+// For 64-bit PowerPC, prefer the more compact relative encodings.
+// This trades 32 bits per jump table entry for one or two instructions
+// on the jump site. Whenever isJumpTableRelative() holds, EK_LabelDifference32 is used.
+unsigned PPCTargetLowering::getJumpTableEncoding() const {
+ if (isJumpTableRelative())
+ return MachineJumpTableInfo::EK_LabelDifference32;
+
+ return TargetLowering::getJumpTableEncoding(); // Otherwise defer to the generic choice.
+}
+/// Jump tables are always relative on PPC64; on other subtargets defer to the generic TargetLowering answer.
+bool PPCTargetLowering::isJumpTableRelative() const {
+ if (Subtarget.isPPC64())
+ return true;
+ return TargetLowering::isJumpTableRelative();
+}
+/// Return the relocation base for PIC jump tables: the generic choice unless this is PPC64 with a code model other than default/small/medium, in which case the global base register is used.
+SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (!Subtarget.isPPC64())
+ return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
+
+ switch (getTargetMachine().getCodeModel()) {
+ case CodeModel::Default:
+ case CodeModel::Small:
+ case CodeModel::Medium:
+ return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
+ default: // Any remaining code model.
+ return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()));
+ }
+}
+/// MCExpr counterpart of getPICJumpTableRelocBase: the generic expression unless this is PPC64 with a code model other than default/small/medium, in which case the function's PIC base symbol is referenced.
+const MCExpr *
+PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI,
+ MCContext &Ctx) const {
+ if (!Subtarget.isPPC64())
+ return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+ switch (getTargetMachine().getCodeModel()) {
+ case CodeModel::Default:
+ case CodeModel::Small:
+ case CodeModel::Medium:
+ return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+ default: // Any remaining code model.
+ return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+ }
+}
+/// Lower a JumpTable node: via a TOC entry for 64-bit SVR4 and for 32-bit SVR4 PIC, otherwise as a Hi/Lo label reference.
+SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+ EVT PtrVT = Op.getValueType();
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+ // 64-bit SVR4 ABI code is always position-independent.
+ // The actual address of the GlobalValue is stored in the TOC.
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+ setUsesTOCBasePtr(DAG);
+ SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+ return getTOCEntry(DAG, SDLoc(JT), true, GA);
+ }
+
+ unsigned MOHiFlag, MOLoFlag;
+ bool IsPIC = isPositionIndependent();
+ getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+
+ if (IsPIC && Subtarget.isSVR4ABI()) { // 32-bit SVR4 PIC (the 64-bit case returned above).
+ SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+ PPCII::MO_PIC_FLAG);
+ return getTOCEntry(DAG, SDLoc(GA), false, GA);
+ }
+
+ SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
+ SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
+ return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);
+}
+/// Lower a BlockAddress node: via a TOC entry for 64-bit SVR4, otherwise as a Hi/Lo label reference.
+SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT PtrVT = Op.getValueType();
+ BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
+ const BlockAddress *BA = BASDN->getBlockAddress();
+
+ // 64-bit SVR4 ABI code is always position-independent.
+ // The actual BlockAddress is stored in the TOC.
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+ setUsesTOCBasePtr(DAG);
+ SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
+ return getTOCEntry(DAG, SDLoc(BASDN), true, GA);
+ }
+
+ unsigned MOHiFlag, MOLoFlag;
+ bool IsPIC = isPositionIndependent();
+ getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+ SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); // NOTE(review): offset 0 is passed on this path, whereas the TOC path above uses BASDN->getOffset() - confirm.
+ SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
+ return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);
+}
+/// Lower a GlobalTLSAddress node according to the TLS model selected for the global, or through the emulated-TLS path when that option is enabled.
+SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ // FIXME: TLS addresses currently use medium model code sequences,
+ // which is the most useful form. Eventually support for small and
+ // large models could be added if users need it, at the cost of
+ // additional complexity.
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+
+ SDLoc dl(GA);
+ const GlobalValue *GV = GA->getGlobal();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ bool is64bit = Subtarget.isPPC64();
+ const Module *M = DAG.getMachineFunction().getFunction()->getParent();
+ PICLevel::Level picLevel = M->getPICLevel();
+
+ TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+
+ if (Model == TLSModel::LocalExec) { // Local-exec: Hi/Lo TPREL pair on top of TLSReg (X13 on 64-bit, R2 on 32-bit).
+ SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_TPREL_HA);
+ SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_TPREL_LO);
+ SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+ is64bit ? MVT::i64 : MVT::i32);
+ SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
+ return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
+ }
+
+ if (Model == TLSModel::InitialExec) { // Initial-exec: LD_GOT_TPREL_L through the GOT pointer, then ADD_TLS.
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+ SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_TLS);
+ SDValue GOTPtr;
+ if (is64bit) {
+ setUsesTOCBasePtr(DAG);
+ SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+ GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
+ PtrVT, GOTReg, TGA);
+ } else
+ GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
+ SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
+ PtrVT, TGA, GOTPtr);
+ return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
+ }
+
+ if (Model == TLSModel::GeneralDynamic) { // General-dynamic: a single ADDI_TLSGD_L_ADDR node on top of the GOT pointer.
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+ SDValue GOTPtr;
+ if (is64bit) {
+ setUsesTOCBasePtr(DAG);
+ SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+ GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
+ GOTReg, TGA);
+ } else {
+ if (picLevel == PICLevel::SmallPIC)
+ GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+ else
+ GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+ }
+ return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
+ GOTPtr, TGA, TGA);
+ }
+
+ if (Model == TLSModel::LocalDynamic) { // Local-dynamic: ADDI_TLSLD_L_ADDR followed by the DTPREL high and low adjustments.
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+ SDValue GOTPtr;
+ if (is64bit) {
+ setUsesTOCBasePtr(DAG);
+ SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+ GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
+ GOTReg, TGA);
+ } else {
+ if (picLevel == PICLevel::SmallPIC)
+ GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+ else
+ GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+ }
+ SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,
+ PtrVT, GOTPtr, TGA, TGA);
+ SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
+ PtrVT, TLSAddr, TGA);
+ return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
+ }
+
+ llvm_unreachable("Unknown TLS model!");
+}
+/// Lower a GlobalAddress node: via a TOC entry for 64-bit SVR4 and for 32-bit SVR4 PIC, otherwise as a Hi/Lo label reference with an extra load when the non-lazy-pointer flag is set.
+SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT PtrVT = Op.getValueType();
+ GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+ SDLoc DL(GSDN);
+ const GlobalValue *GV = GSDN->getGlobal();
+
+ // 64-bit SVR4 ABI code is always position-independent.
+ // The actual address of the GlobalValue is stored in the TOC.
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+ setUsesTOCBasePtr(DAG);
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
+ return getTOCEntry(DAG, DL, true, GA);
+ }
+
+ unsigned MOHiFlag, MOLoFlag;
+ bool IsPIC = isPositionIndependent();
+ getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);
+
+ if (IsPIC && Subtarget.isSVR4ABI()) { // 32-bit SVR4 PIC (the 64-bit case returned above).
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
+ GSDN->getOffset(),
+ PPCII::MO_PIC_FLAG);
+ return getTOCEntry(DAG, DL, false, GA);
+ }
+
+ SDValue GAHi =
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
+ SDValue GALo =
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
+
+ SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG);
+
+ // If the global reference is actually to a non-lazy-pointer, we have to do an
+ // extra load to get the address of the global.
+ if (MOHiFlag & PPCII::MO_NLP_FLAG)
+ Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+ return Ptr;
+}
+/// Custom lowering for SETCC. Returns a replacement node, Op itself when the node is kept as is, or an empty SDValue for the cases not handled here.
+SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDLoc dl(Op);
+
+ if (Op.getValueType() == MVT::v2i64) {
+ // When the operands themselves are v2i64 values, we need to do something
+ // special because VSX has no underlying comparison operations for these.
+ if (Op.getOperand(0).getValueType() == MVT::v2i64) {
+ // Equality can be handled by casting to the legal type for Altivec
+ // comparisons, everything else needs to be expanded.
+ if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+ DAG.getSetCC(dl, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
+ CC));
+ }
+
+ return SDValue();
+ }
+
+ // We handle most of these in the usual way.
+ return Op;
+ }
+
+ // If we're comparing for equality to zero, expose the fact that this is
+ // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
+ // fold the new nodes.
+ if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
+ return V;
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ // Leave comparisons against 0 and -1 alone for now, since they're usually
+ // optimized. FIXME: revisit this when we can custom lower all setcc
+ // optimizations.
+ if (C->isAllOnesValue() || C->isNullValue())
+ return SDValue();
+ }
+
+ // If we have an integer seteq/setne, turn it into a compare against zero
+ // by xor'ing the rhs with the lhs, which is faster than setting a
+ // condition register, reading it back out, and masking the correct bit. The
+ // normal approach here uses sub to do this instead of xor. Using xor exposes
+ // the result to other bit-twiddling opportunities.
+ EVT LHSVT = Op.getOperand(0).getValueType();
+ if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ EVT VT = Op.getValueType();
+ SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0), // Despite its name, Sub holds the XOR of the two operands.
+ Op.getOperand(1));
+ return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
+ }
+ return SDValue();
+}
+
+/// Lower a va_arg node for 32-bit PowerPC (asserted below).
+///
+/// The lowering reads four fields through the va_list pointer: a gpr index
+/// byte at offset 0, an fpr index byte at offset 1, the overflow area pointer
+/// at offset 4 and the register save area pointer at offset 8. It then picks
+/// either a slot in the register save area or the overflow area, writes the
+/// updated index and overflow pointer back, and returns a load of type VT
+/// from the chosen address.
+SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ EVT VT = Node->getValueType(0);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue InChain = Node->getOperand(0);
+ SDValue VAListPtr = Node->getOperand(1);
+ const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+ SDLoc dl(Node);
+
+ assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
+
+ // gpr_index: zero-extended byte loaded from the start of the va_list.
+ SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+ VAListPtr, MachinePointerInfo(SV), MVT::i8);
+ InChain = GprIndex.getValue(1);
+
+ if (VT == MVT::i64) {
+ // Check if GprIndex is even (CC64 is true when it is odd)
+ SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
+ DAG.getConstant(1, dl, MVT::i32));
+ SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
+ DAG.getConstant(0, dl, MVT::i32), ISD::SETNE);
+ SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
+ DAG.getConstant(1, dl, MVT::i32));
+ // Align GprIndex to be even if it isn't
+ GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
+ GprIndex);
+ }
+
+ // fpr index is 1 byte after gpr
+ SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ // fpr_index: zero-extended byte loaded from FprPtr.
+ SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+ FprPtr, MachinePointerInfo(SV), MVT::i8);
+ InChain = FprIndex.getValue(1);
+
+ // The register save area pointer is read from offset 8 and the overflow
+ // area pointer from offset 4 of the va_list.
+ SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+ DAG.getConstant(8, dl, MVT::i32));
+
+ SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+ DAG.getConstant(4, dl, MVT::i32));
+
+ // areas
+ SDValue OverflowArea =
+ DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());
+ InChain = OverflowArea.getValue(1);
+
+ SDValue RegSaveArea =
+ DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());
+ InChain = RegSaveArea.getValue(1);
+
+ // CC is true while the relevant index is still below 8, i.e. the argument
+ // is taken from the register save area; otherwise the overflow area is used.
+ SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
+ DAG.getConstant(8, dl, MVT::i32), ISD::SETLT);
+
+ // adjustment constant: gpr_index * 4 for integers, fpr_index * 8 otherwise
+ SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
+ VT.isInteger() ? GprIndex : FprIndex,
+ DAG.getConstant(VT.isInteger() ? 4 : 8, dl,
+ MVT::i32));
+
+ // OurReg = RegSaveArea + RegConstant
+ SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
+ RegConstant);
+
+ // Floating types are 32 bytes into RegSaveArea
+ if (VT.isFloatingPoint())
+ OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
+ DAG.getConstant(32, dl, MVT::i32));
+
+ // increase {f,g}pr_index by 1 (or 2 if VT is i64)
+ SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+ VT.isInteger() ? GprIndex : FprIndex,
+ DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl,
+ MVT::i32));
+
+ // Write the bumped index back as a single byte to the gpr or fpr field.
+ InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
+ VT.isInteger() ? VAListPtr : FprPtr,
+ MachinePointerInfo(SV), MVT::i8);
+
+ // determine if we should load from reg_save_area or overflow_area
+ SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);
+
+ // Advance overflow_area by 4 (integer) or 8 (otherwise) bytes, but only when
+ // CC is false, i.e. when the argument was taken from the overflow area.
+ SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
+ DAG.getConstant(VT.isInteger() ? 4 : 8,
+ dl, MVT::i32));
+
+ OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
+ OverflowAreaPlusN);
+
+ InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
+ MachinePointerInfo(), MVT::i32);
+
+ // Finally load the argument itself from the selected address.
+ return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
+}
+
+/// Lower a va_copy node on 32-bit targets (asserted below) into one memcpy
+/// node that duplicates the complete va_list structure.
+SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue Dst = Op.getOperand(1);
+  SDValue Src = Op.getOperand(2);
+
+  // The whole va_list struct has to be copied. Its size is
+  //   2*sizeof(char) + 2 bytes of alignment padding + 2*sizeof(char*)
+  // which adds up to 12 bytes.
+  SDValue VAListSize = DAG.getConstant(12, SDLoc(Op), MVT::i32);
+
+  return DAG.getMemcpy(Chain, Op, Dst, Src, VAListSize, 8, false, true, false,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// Lower a trampoline adjustment. Nothing is adjusted here: the node's first
+/// operand is returned unchanged.
+SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDValue Trmp = Op.getOperand(0);
+  return Trmp;
+}
+
+/// Lower a trampoline initialization into a call to the runtime routine
+///   __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
+/// and return the chain produced by that call.
+SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  bool Is64Bit = (PtrVT == MVT::i64);
+
+  // Every argument of the call is typed as a pointer-sized integer.
+  TargetLowering::ArgListEntry Arg;
+  Arg.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+
+  TargetLowering::ArgListTy CallArgs;
+
+  Arg.Node = Trmp;
+  CallArgs.push_back(Arg);
+
+  // TrampSize is 48 when pointers are 64 bits wide and 40 otherwise.
+  Arg.Node = Is64Bit ? DAG.getConstant(48, dl, MVT::i64)
+                     : DAG.getConstant(40, dl, MVT::i32);
+  CallArgs.push_back(Arg);
+
+  Arg.Node = FPtr;
+  CallArgs.push_back(Arg);
+
+  Arg.Node = Nest;
+  CallArgs.push_back(Arg);
+
+  // Build the call to __trampoline_setup with the C calling convention and a
+  // void return type.
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain).setCallee(
+      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+      DAG.getExternalSymbol("__trampoline_setup", PtrVT),
+      std::move(CallArgs));
+
+  // Only the output chain (second element of the pair) is of interest.
+  std::pair<SDValue, SDValue> Lowered = LowerCallTo(CLI);
+  return Lowered.second;
+}
+
+/// Lower a va_start node.
+///
+/// For the Darwin ABI and for 64-bit subtargets the address of the
+/// VarArgsFrameIndex slot is stored into the va_list memory operand. For the
+/// remaining case (32-bit SVR4) the four fields of the va_list struct shown
+/// below are initialised with a chain of four stores, and the last store is
+/// returned.
+SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+ SDLoc dl(Op);
+
+ if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
+ MachinePointerInfo(SV));
+ }
+
+ // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
+ // We suppose the given va_list is already allocated.
+ //
+ // typedef struct {
+ // char gpr; /* index into the array of 8 GPRs
+ // * stored in the register save area
+ // * gpr=0 corresponds to r3,
+ // * gpr=1 to r4, etc.
+ // */
+ // char fpr; /* index into the array of 8 FPRs
+ // * stored in the register save area
+ // * fpr=0 corresponds to f1,
+ // * fpr=1 to f2, etc.
+ // */
+ // char *overflow_arg_area;
+ // /* location on stack that holds
+ // * the next overflow argument
+ // */
+ // char *reg_save_area;
+ // /* where r3:r10 and f1:f8 (if saved)
+ // * are stored
+ // */
+ // } va_list[1];
+
+ // Values to be written into the four fields, in field order.
+ SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
+ SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
+ SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
+ PtrVT);
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ PtrVT);
+
+ // Step from the overflow_arg_area field to reg_save_area: one pointer.
+ uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
+ SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);
+
+ // Step from the fpr byte (offset 1) to overflow_arg_area: pointer size
+ // minus one byte, so that the running offset lands on the pointer size.
+ uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
+ SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);
+
+ // Step from the gpr byte to the fpr byte.
+ uint64_t FPROffset = 1;
+ SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);
+
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+ // Store first byte : number of int regs
+ SDValue firstStore =
+ DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
+ MachinePointerInfo(SV), MVT::i8);
+ uint64_t nextOffset = FPROffset;
+ SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
+ ConstFPROffset);
+
+ // Store second byte : number of float regs
+ SDValue secondStore =
+ DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
+ MachinePointerInfo(SV, nextOffset), MVT::i8);
+ nextOffset += StackOffset;
+ nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);
+
+ // Store second word : arguments given on stack
+ SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
+ MachinePointerInfo(SV, nextOffset));
+ nextOffset += FrameOffset;
+ nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
+
+ // Store third word : arguments given in registers
+ return DAG.getStore(thirdStore, dl, FR, nextPtr,
+ MachinePointerInfo(SV, nextOffset));
+}
+
+#include "PPCGenCallingConv.inc"
+
+// This function exists only to silence compiler warnings about unused
+// functions that are pulled in from PPCGenCallingConv.inc. A non-zero Flag
+// yields CC_PPC64_ELF_FIS, zero yields RetCC_PPC64_ELF_FIS.
+CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
+  if (Flag != 0)
+    return CC_PPC64_ELF_FIS;
+  return RetCC_PPC64_ELF_FIS;
+}
+
+/// Custom calling-convention hook that ignores all of its arguments, leaves
+/// State untouched and unconditionally returns true.
+bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  const bool Result = true;
+  return Result;
+}
+
+/// Custom calling-convention hook that makes the next free GPR argument
+/// register one with an odd register number (R3, R5, R7 or R9) by burning a
+/// single register when necessary. It never assigns the current argument and
+/// therefore always returns false.
+bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                             MVT &LocVT,
+                                             CCValAssign::LocInfo &LocInfo,
+                                             ISD::ArgFlagsTy &ArgFlags,
+                                             CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  const unsigned FirstFree = State.getFirstUnallocated(ArgRegs);
+
+  // FirstFree is an index into ArgRegs, not a register number: an odd index
+  // denotes an even-numbered register (R4, R6, ...). Such a register is
+  // skipped, provided that argument registers are still available at all.
+  const bool RegsRemain = FirstFree != NumArgRegs;
+  const bool IndexIsOdd = (FirstFree & 1u) != 0;
+  if (RegsRemain && IndexIsOdd)
+    State.AllocateReg(ArgRegs[FirstFree]);
+
+  // The current argument itself has not been allocated here.
+  return false;
+}
+
+/// Custom calling-convention hook for long double arguments, which need four
+/// GPRs in soft float mode. When fewer than four (but at least one) GPR
+/// argument registers remain, all of them are marked allocated so that the
+/// long double ends up on the stack. Always returns false.
+bool
+llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
+                                                  MVT &LocVT,
+                                                  CCValAssign::LocInfo &LocInfo,
+                                                  ISD::ArgFlagsTy &ArgFlags,
+                                                  CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  const unsigned FirstFree = State.getFirstUnallocated(ArgRegs);
+  const int Remaining = NumArgRegs - FirstFree;
+
+  // Nothing to do when no register is left or when enough are left.
+  if (FirstFree == NumArgRegs || Remaining >= 4)
+    return false;
+
+  // Consume every remaining argument register.
+  for (unsigned Idx = FirstFree; Idx < NumArgRegs; ++Idx)
+    State.AllocateReg(ArgRegs[Idx]);
+
+  return false;
+}
+
+/// Custom calling-convention hook that keeps the two f64 halves of a split
+/// ppc_fp128 value together: if F8 is the only floating-point argument
+/// register left, it is marked allocated so that both halves go on the stack.
+/// The current argument is never assigned here, so the result is always
+/// false.
+bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                               MVT &LocVT,
+                                               CCValAssign::LocInfo &LocInfo,
+                                               ISD::ArgFlagsTy &ArgFlags,
+                                               CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8
+  };
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  const unsigned FirstFree = State.getFirstUnallocated(ArgRegs);
+
+  // All floating-point argument registers are already taken.
+  if (FirstFree == NumArgRegs)
+    return false;
+
+  // Exactly one register (F8) remains: burn it.
+  if (ArgRegs[FirstFree] == PPC::F8)
+    State.AllocateReg(ArgRegs[FirstFree]);
+
+  return false;
+}
+
+/// FPR - The set of FP registers that should be allocated for arguments,
+/// on Darwin. The table lists F1 through F13 in order; it is also indexed by
+/// LowerFormalArguments_64SVR4 further down in this file.
+static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
+ PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
+ PPC::F11, PPC::F12, PPC::F13};
+
+/// QFPR - The set of QPX registers that should be allocated for arguments.
+/// The table lists QF1 through QF13 in order, parallel to FPR above.
+static const MCPhysReg QFPR[] = {
+ PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
+ PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13};
+
+/// CalculateStackSlotSize - Return the number of bytes reserved on the stack
+/// for an argument. The base size is the by-value size for byval arguments
+/// and the store size of ArgVT otherwise.
+static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
+                                       unsigned PtrByteSize) {
+  const unsigned StoreSize = ArgVT.getStoreSize();
+  const unsigned RawSize = Flags.isByVal() ? Flags.getByValSize() : StoreSize;
+
+  // Array members are always packed, so their size is used as-is.
+  if (Flags.isInConsecutiveRegs())
+    return RawSize;
+
+  // Everything else is rounded up to a whole number of pointer-sized slots.
+  const unsigned NumSlots = (RawSize + PtrByteSize - 1) / PtrByteSize;
+  return NumSlots * PtrByteSize;
+}
+
+/// CalculateStackSlotAlignment - Return the alignment, in bytes, of an
+/// argument's slot on the stack. The default is the pointer size; vector
+/// types, byval arguments and array members may override it.
+static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+                                            ISD::ArgFlagsTy Flags,
+                                            unsigned PtrByteSize) {
+  // Altivec parameters are padded to a 16 byte boundary.
+  const bool IsAltivecVT =
+      ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
+      ArgVT == MVT::v1i128;
+
+  unsigned Align = PtrByteSize;
+  if (IsAltivecVT) {
+    Align = 16;
+  } else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) {
+    // QPX vector types stored in double-precision are padded to a 32 byte
+    // boundary.
+    Align = 32;
+  }
+
+  // A byval parameter gets its requested alignment when that exceeds the
+  // pointer size; the request must then be a multiple of the pointer size.
+  if (Flags.isByVal()) {
+    const unsigned Requested = Flags.getByValAlign();
+    if (Requested > PtrByteSize) {
+      if (Requested % PtrByteSize != 0)
+        llvm_unreachable(
+            "ByVal alignment is not a multiple of the pointer size");
+
+      Align = Requested;
+    }
+  }
+
+  // Array members are always packed to their original alignment. When a
+  // member was split into multiple registers, the first piece is aligned to
+  // the size of the full type, except for ppcf128, which is only aligned as
+  // its f64 components.
+  if (Flags.isInConsecutiveRegs()) {
+    const bool UseFullType = Flags.isSplit() && OrigVT != MVT::ppcf128;
+    Align = UseFullType ? OrigVT.getStoreSize() : ArgVT.getStoreSize();
+  }
+
+  return Align;
+}
+
+/// CalculateStackSlotUsed - Tell whether an argument really occupies its
+/// stack slot rather than travelling in registers. ArgOffset, AvailableFPRs
+/// and AvailableVRs describe the current argument position on entry and are
+/// updated to account for this argument.
+static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
+                                   ISD::ArgFlagsTy Flags,
+                                   unsigned PtrByteSize,
+                                   unsigned LinkageSize,
+                                   unsigned ParamAreaSize,
+                                   unsigned &ArgOffset,
+                                   unsigned &AvailableFPRs,
+                                   unsigned &AvailableVRs, bool HasQPX) {
+  const unsigned ParamAreaEnd = LinkageSize + ParamAreaSize;
+
+  // Bring the offset up to the alignment this argument needs on the stack.
+  const unsigned Align =
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+  // Starting at or beyond the end of the argument save area means memory is
+  // required (this also covers zero-sized arguments).
+  bool UseMemory = ArgOffset >= ParamAreaEnd;
+
+  // Reserve the slot.
+  ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+  if (Flags.isInConsecutiveRegsLast())
+    ArgOffset = ((ArgOffset + PtrByteSize - 1) / PtrByteSize) * PtrByteSize;
+
+  // Running past the end of the save area means the argument is at least
+  // partially in memory.
+  if (ArgOffset > ParamAreaEnd)
+    UseMemory = true;
+
+  // Byval arguments never go to FPRs or VRs.
+  if (Flags.isByVal())
+    return UseMemory;
+
+  // An argument that actually lands in an FPR does not use memory after all.
+  // QPX registers overlap with the scalar FP registers.
+  const bool IsQPXVT = ArgVT == MVT::v4f32 || ArgVT == MVT::v4f64 ||
+                       ArgVT == MVT::v4i1;
+  const bool WantsFPR =
+      ArgVT == MVT::f32 || ArgVT == MVT::f64 || (HasQPX && IsQPXVT);
+  if (WantsFPR && AvailableFPRs > 0) {
+    --AvailableFPRs;
+    return false;
+  }
+
+  // The same holds for an argument that lands in a vector register.
+  const bool WantsVR =
+      ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
+      ArgVT == MVT::v1i128;
+  if (WantsVR && AvailableVRs > 0) {
+    --AvailableVRs;
+    return false;
+  }
+
+  return UseMemory;
+}
+
+/// EnsureStackAlignment - Round NumBytes up to the next multiple of the
+/// stack alignment reported by Lowering and return the result. The mask
+/// arithmetic below is only a true round-up for power-of-two alignments.
+static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
+                                     unsigned NumBytes) {
+  const unsigned Mask = Lowering->getStackAlignment() - 1;
+  return (NumBytes + Mask) & ~Mask;
+}
+
+/// Dispatch formal-argument lowering to the ABI-specific implementation:
+/// Darwin when the subtarget is not SVR4, otherwise 64-bit or 32-bit SVR4.
+SDValue PPCTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  if (!Subtarget.isSVR4ABI())
+    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
+                                       dl, DAG, InVals);
+
+  if (Subtarget.isPPC64())
+    return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
+                                       dl, DAG, InVals);
+
+  return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
+                                     dl, DAG, InVals);
+}
+
+/// Lower the incoming arguments of a function for the 32-bit SVR4 ABI.
+///
+/// Locations are assigned with CC_PPC32_SVR4. Register arguments are copied
+/// out of live-in virtual registers, memory arguments are loaded from fixed
+/// stack objects, and one value per location is appended to InVals. A second
+/// pass with CC_PPC32_SVR4_ByVal determines the minimum area reserved by the
+/// caller. For vararg functions the GPR and FPR argument registers are
+/// additionally spilled to a stack object for use by va_start. Returns the
+/// chain, token-factored with any spill stores that were created.
+SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+ // 32-bit SVR4 ABI Stack Frame Layout:
+ // +-----------------------------------+
+ // +--> | Back chain |
+ // | +-----------------------------------+
+ // | | Floating-point register save area |
+ // | +-----------------------------------+
+ // | | General register save area |
+ // | +-----------------------------------+
+ // | | CR save word |
+ // | +-----------------------------------+
+ // | | VRSAVE save word |
+ // | +-----------------------------------+
+ // | | Alignment padding |
+ // | +-----------------------------------+
+ // | | Vector register save area |
+ // | +-----------------------------------+
+ // | | Local variable space |
+ // | +-----------------------------------+
+ // | | Parameter list area |
+ // | +-----------------------------------+
+ // | | LR save word |
+ // | +-----------------------------------+
+ // SP--> +--- | Back chain |
+ // +-----------------------------------+
+ //
+ // Specifications:
+ // System V Application Binary Interface PowerPC Processor Supplement
+ // AltiVec Technology Programming Interface Manual
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
+ // Potential tail calls could cause overwriting of argument stack slots.
+ bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+ (CallConv == CallingConv::Fast));
+ unsigned PtrByteSize = 4;
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ // Reserve space for the linkage area on the stack.
+ unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+ CCInfo.AllocateStack(LinkageSize, PtrByteSize);
+ // In soft float mode the arguments are pre-analyzed before locations are
+ // assigned; the state recorded by that step is cleared again below.
+ if (useSoftFloat())
+ CCInfo.PreAnalyzeFormalArguments(Ins);
+
+ CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
+ CCInfo.clearWasPPCF128();
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ // Arguments stored in registers.
+ if (VA.isRegLoc()) {
+ const TargetRegisterClass *RC;
+ EVT ValVT = VA.getValVT();
+
+ // Pick the register class for the live-in from the value type and the
+ // subtarget features.
+ switch (ValVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("ValVT not supported by formal arguments Lowering");
+ case MVT::i1:
+ case MVT::i32:
+ RC = &PPC::GPRCRegClass;
+ break;
+ case MVT::f32:
+ if (Subtarget.hasP8Vector())
+ RC = &PPC::VSSRCRegClass;
+ else
+ RC = &PPC::F4RCRegClass;
+ break;
+ case MVT::f64:
+ if (Subtarget.hasVSX())
+ RC = &PPC::VSFRCRegClass;
+ else
+ RC = &PPC::F8RCRegClass;
+ break;
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ RC = &PPC::VRRCRegClass;
+ break;
+ case MVT::v4f32:
+ RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
+ break;
+ case MVT::v2f64:
+ case MVT::v2i64:
+ RC = &PPC::VRRCRegClass;
+ break;
+ case MVT::v4f64:
+ RC = &PPC::QFRCRegClass;
+ break;
+ case MVT::v4i1:
+ RC = &PPC::QBRCRegClass;
+ break;
+ }
+
+ // Transform the arguments stored in physical registers into virtual ones.
+ // An i1 value is copied out as i32 and truncated afterwards.
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
+ ValVT == MVT::i1 ? MVT::i32 : ValVT);
+
+ if (ValVT == MVT::i1)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
+
+ InVals.push_back(ArgValue);
+ } else {
+ // Argument stored in memory.
+ assert(VA.isMemLoc());
+
+ // The fixed object is sized by the location type and placed at the
+ // offset assigned by the calling convention.
+ unsigned ArgSize = VA.getLocVT().getStoreSize();
+ int FI = MFI.CreateFixedObject(ArgSize, VA.getLocMemOffset(),
+ isImmutable);
+
+ // Create load nodes to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(
+ DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
+ }
+ }
+
+ // Assign locations to all of the incoming aggregate by value arguments.
+ // Aggregates passed by value are stored in the local variable space of the
+ // caller's stack frame, right above the parameter list area.
+ SmallVector<CCValAssign, 16> ByValArgLocs;
+ CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ ByValArgLocs, *DAG.getContext());
+
+ // Reserve stack space for the allocations in CCInfo.
+ CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+ CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
+
+ // Area that is at least reserved in the caller of this function.
+ unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
+ MinReservedArea = std::max(MinReservedArea, LinkageSize);
+
+ // Set the size that is at least reserved in caller of this function. Tail
+ // call optimized function's reserved stack space needs to be aligned so that
+ // taking the difference between two stack areas will result in an aligned
+ // stack.
+ MinReservedArea =
+ EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+ FuncInfo->setMinReservedArea(MinReservedArea);
+
+ // Stores created while spilling vararg registers; merged into Chain below.
+ SmallVector<SDValue, 8> MemOps;
+
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+ static const MCPhysReg GPArgRegs[] = {
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+ };
+ const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
+
+ static const MCPhysReg FPArgRegs[] = {
+ PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8
+ };
+ unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+
+ // No FP argument registers are saved in soft float mode.
+ if (useSoftFloat())
+ NumFPArgRegs = 0;
+
+ // Record how many GPR/FPR argument registers the fixed arguments used.
+ FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
+ FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));
+
+ // Make room for NumGPArgRegs and NumFPArgRegs.
+ int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
+ NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
+
+ // Fixed object at the first stack offset not used by the fixed arguments.
+ FuncInfo->setVarArgsStackOffset(
+ MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
+ CCInfo.getNextStackOffset(), true));
+
+ FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false));
+ SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+ // The fixed integer arguments of a variadic function are stored to the
+ // VarArgsFrameIndex on the stack so that they may be loaded by
+ // dereferencing the result of va_next.
+ for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
+ // Get an existing live-in vreg, or add a new one.
+ unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
+ if (!VReg)
+ VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
+
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+ MemOps.push_back(Store);
+ // Increment the address by four for the next argument to store
+ SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+
+ // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
+ // is set.
+ // The double arguments are stored to the VarArgsFrameIndex
+ // on the stack.
+ for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
+ // Get an existing live-in vreg, or add a new one.
+ unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
+ if (!VReg)
+ VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
+
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+ MemOps.push_back(Store);
+ // Increment the address by eight for the next argument to store
+ SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
+ PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+ }
+
+ // Tie all register spill stores into the returned chain.
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+ return Chain;
+}
+
+// On PPC64 the narrow integer types i8, i16 and i32 travel in i64 registers.
+// Given such an i64 register value, record the extension promised by the
+// argument flags (if any) with an AssertSext/AssertZext node, then truncate
+// the value to ObjectVT.
+SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
+                                             EVT ObjectVT, SelectionDAG &DAG,
+                                             SDValue ArgVal,
+                                             const SDLoc &dl) const {
+  const bool IsSigned = Flags.isSExt();
+  if (IsSigned || Flags.isZExt()) {
+    // Sign extension takes precedence when both flags are set.
+    const unsigned AssertOpc = IsSigned ? ISD::AssertSext : ISD::AssertZext;
+    ArgVal = DAG.getNode(AssertOpc, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+  }
+
+  return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
+}
+
+SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ // TODO: add description of PPC stack frame format, or at least some docs.
+ //
+ bool isELFv2ABI = Subtarget.isELFv2ABI();
+ bool isLittleEndian = Subtarget.isLittleEndian();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+ assert(!(CallConv == CallingConv::Fast && isVarArg) &&
+ "fastcc not supported on varargs functions");
+
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
+ // Potential tail calls could cause overwriting of argument stack slots.
+ bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+ (CallConv == CallingConv::Fast));
+ unsigned PtrByteSize = 8;
+ unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+
+ static const MCPhysReg GPR[] = {
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+ static const MCPhysReg VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+
+ const unsigned Num_GPR_Regs = array_lengthof(GPR);
+ const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
+ const unsigned Num_VR_Regs = array_lengthof(VR);
+ const unsigned Num_QFPR_Regs = Num_FPR_Regs;
+
+ // Do a first pass over the arguments to determine whether the ABI
+ // guarantees that our caller has allocated the parameter save area
+ // on its stack frame. In the ELFv1 ABI, this is always the case;
+ // in the ELFv2 ABI, it is true if this is a vararg function or if
+ // any parameter is located in a stack slot.
+
+ bool HasParameterArea = !isELFv2ABI || isVarArg;
+ unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
+ unsigned NumBytes = LinkageSize;
+ unsigned AvailableFPRs = Num_FPR_Regs;
+ unsigned AvailableVRs = Num_VR_Regs;
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ if (Ins[i].Flags.isNest())
+ continue;
+
+ if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
+ PtrByteSize, LinkageSize, ParamAreaSize,
+ NumBytes, AvailableFPRs, AvailableVRs,
+ Subtarget.hasQPX()))
+ HasParameterArea = true;
+ }
+
+ // Add DAG nodes to load the arguments or copy them out of registers. On
+ // entry to a function on PPC, the arguments start after the linkage area,
+ // although the first ones are often in registers.
+
+ unsigned ArgOffset = LinkageSize;
+ unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+ unsigned &QFPR_idx = FPR_idx;
+ SmallVector<SDValue, 8> MemOps;
+ Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+ unsigned CurArgIdx = 0;
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+ SDValue ArgVal;
+ bool needsLoad = false;
+ EVT ObjectVT = Ins[ArgNo].VT;
+ EVT OrigVT = Ins[ArgNo].ArgVT;
+ unsigned ObjSize = ObjectVT.getStoreSize();
+ unsigned ArgSize = ObjSize;
+ ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+ if (Ins[ArgNo].isOrigArg()) {
+ std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
+ CurArgIdx = Ins[ArgNo].getOrigArgIndex();
+ }
+ // We re-align the argument offset for each argument, except when using the
+ // fast calling convention, when we need to make sure we do that only when
+ // we'll actually use a stack slot.
+ unsigned CurArgOffset, Align;
+ auto ComputeArgOffset = [&]() {
+ /* Respect alignment of argument on the stack. */
+ Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+ CurArgOffset = ArgOffset;
+ };
+
+ if (CallConv != CallingConv::Fast) {
+ ComputeArgOffset();
+
+ /* Compute GPR index associated with argument offset. */
+ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
+ }
+
+ // FIXME the codegen can be much improved in some cases.
+ // We do not have to keep everything in memory.
+ if (Flags.isByVal()) {
+ assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
+
+ if (CallConv == CallingConv::Fast)
+ ComputeArgOffset();
+
+ // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+ ObjSize = Flags.getByValSize();
+ ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ // Empty aggregate parameters do not take up registers. Examples:
+ // struct { } a;
+ // union { } b;
+ // int c[0];
+ // etc. However, we have to provide a place-holder in InVals, so
+ // pretend we have an 8-byte item at the current address for that
+ // purpose.
+ if (!ObjSize) {
+ int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(FIN);
+ continue;
+ }
+
+ // Create a stack object covering all stack doublewords occupied
+ // by the argument. If the argument is (fully or partially) on
+ // the stack, or if the argument is fully in registers but the
+ // caller has allocated the parameter save anyway, we can refer
+ // directly to the caller's stack frame. Otherwise, create a
+ // local copy in our own frame.
+ int FI;
+ if (HasParameterArea ||
+ ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
+ FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
+ else
+ FI = MFI.CreateStackObject(ArgSize, Align, false);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+
+ // Handle aggregates smaller than 8 bytes.
+ if (ObjSize < PtrByteSize) {
+ // The value of the object is its address, which differs from the
+ // address of the enclosing doubleword on big-endian systems.
+ SDValue Arg = FIN;
+ if (!isLittleEndian) {
+ SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
+ Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
+ }
+ InVals.push_back(Arg);
+
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store;
+
+ if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
+ EVT ObjType = (ObjSize == 1 ? MVT::i8 :
+ (ObjSize == 2 ? MVT::i16 : MVT::i32));
+ Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
+ MachinePointerInfo(&*FuncArg), ObjType);
+ } else {
+ // For sizes that don't fit a truncating store (3, 5, 6, 7),
+ // store the whole register as-is to the parameter save area
+ // slot.
+ Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(&*FuncArg));
+ }
+
+ MemOps.push_back(Store);
+ }
+ // Whether we copied from a register or not, advance the offset
+ // into the parameter save area by a full doubleword.
+ ArgOffset += PtrByteSize;
+ continue;
+ }
+
+ // The value of the object is its address, which is the address of
+ // its first stack doubleword.
+ InVals.push_back(FIN);
+
+ // Store whatever pieces of the object are in registers to memory.
+ for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+ if (GPR_idx == Num_GPR_Regs)
+ break;
+
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Addr = FIN;
+ if (j) {
+ SDValue Off = DAG.getConstant(j, dl, PtrVT);
+ Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
+ }
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
+ MachinePointerInfo(&*FuncArg, j));
+ MemOps.push_back(Store);
+ ++GPR_idx;
+ }
+ ArgOffset += ArgSize;
+ continue;
+ }
+
+ switch (ObjectVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled argument type!");
+ case MVT::i1:
+ case MVT::i32:
+ case MVT::i64:
+ if (Flags.isNest()) {
+ // The 'nest' parameter, if any, is passed in R11.
+ unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+ if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
+ ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+ break;
+ }
+
+ // These can be scalar arguments or elements of an integer array type
+ // passed directly. Clang may use those instead of "byval" aggregate
+ // types to avoid forcing arguments to memory unnecessarily.
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+ if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
+ // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+ // value to MVT::i64 and then truncate to the correct register size.
+ ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputeArgOffset();
+
+ needsLoad = true;
+ ArgSize = PtrByteSize;
+ }
+ if (CallConv != CallingConv::Fast || needsLoad)
+ ArgOffset += 8;
+ break;
+
+ case MVT::f32:
+ case MVT::f64:
+ // These can be scalar arguments or elements of a float array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // float aggregates.
+ if (FPR_idx != Num_FPR_Regs) {
+ unsigned VReg;
+
+ if (ObjectVT == MVT::f32)
+ VReg = MF.addLiveIn(FPR[FPR_idx],
+ Subtarget.hasP8Vector()
+ ? &PPC::VSSRCRegClass
+ : &PPC::F4RCRegClass);
+ else
+ VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
+ ? &PPC::VSFRCRegClass
+ : &PPC::F8RCRegClass);
+
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+ ++FPR_idx;
+ } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
+ // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
+ // once we support fp <-> gpr moves.
+
+ // This can only ever happen in the presence of f32 array types,
+ // since otherwise we never run out of FPRs before running out
+ // of GPRs.
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+ if (ObjectVT == MVT::f32) {
+ if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
+ ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
+ DAG.getConstant(32, dl, MVT::i32));
+ ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+ }
+
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputeArgOffset();
+
+ needsLoad = true;
+ }
+
+ // When passing an array of floats, the array occupies consecutive
+ // space in the argument area; only round up to the next doubleword
+ // at the end of the array. Otherwise, each float takes 8 bytes.
+ if (CallConv != CallingConv::Fast || needsLoad) {
+ ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
+ ArgOffset += ArgSize;
+ if (Flags.isInConsecutiveRegsLast())
+ ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ }
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ case MVT::v2f64:
+ case MVT::v2i64:
+ case MVT::v1i128:
+ if (!Subtarget.hasQPX()) {
+ // These can be scalar arguments or elements of a vector array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // vector aggregates.
+ if (VR_idx != Num_VR_Regs) {
+ unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+ ++VR_idx;
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputeArgOffset();
+
+ needsLoad = true;
+ }
+ if (CallConv != CallingConv::Fast || needsLoad)
+ ArgOffset += 16;
+ break;
+ } // not QPX
+
+ assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
+ "Invalid QPX parameter type");
+ /* fall through */
+
+ case MVT::v4f64:
+ case MVT::v4i1:
+ // QPX vectors are treated like their scalar floating-point subregisters
+ // (except that they're larger).
+ unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
+ if (QFPR_idx != Num_QFPR_Regs) {
+ const TargetRegisterClass *RC;
+ switch (ObjectVT.getSimpleVT().SimpleTy) {
+ case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
+ case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
+ default: RC = &PPC::QBRCRegClass; break;
+ }
+
+ unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+ ++QFPR_idx;
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputeArgOffset();
+ needsLoad = true;
+ }
+ if (CallConv != CallingConv::Fast || needsLoad)
+ ArgOffset += Sz;
+ break;
+ }
+
+ // We need to load the argument to a virtual register if we determined
+ // above that we ran out of physical registers of the appropriate type.
+ if (needsLoad) {
+ if (ObjSize < ArgSize && !isLittleEndian)
+ CurArgOffset += ArgSize - ObjSize;
+ int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
+ }
+
+ InVals.push_back(ArgVal);
+ }
+
+ // Area that is at least reserved in the caller of this function.
+ unsigned MinReservedArea;
+ if (HasParameterArea)
+ MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
+ else
+ MinReservedArea = LinkageSize;
+
+ // Set the size that is at least reserved in caller of this function. Tail
+ // call optimized functions' reserved stack space needs to be aligned so that
+ // taking the difference between two stack areas will result in an aligned
+ // stack.
+ MinReservedArea =
+ EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+ FuncInfo->setMinReservedArea(MinReservedArea);
+
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+ int Depth = ArgOffset;
+
+ FuncInfo->setVarArgsFrameIndex(
+ MFI.CreateFixedObject(PtrByteSize, Depth, true));
+ SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+ // If this function is vararg, store any remaining integer argument regs
+ // to their spots on the stack so that they may be loaded by dereferencing
+ // the result of va_next.
+ for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx < Num_GPR_Regs; ++GPR_idx) {
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+ MemOps.push_back(Store);
+ // Increment the address by four for the next argument to store
+ SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+ return Chain;
+}
+
+SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ // Lowers the incoming arguments described by Ins for the Darwin ABI: each value is copied out of its argument register or loaded from a fixed stack slot and appended to InVals; the (possibly updated) chain is returned.
+ // TODO: add description of PPC stack frame format, or at least some docs.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
+ bool isPPC64 = PtrVT == MVT::i64;
+ // Potential tail calls could cause overwriting of argument stack slots.
+ bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+ (CallConv == CallingConv::Fast));
+ unsigned PtrByteSize = isPPC64 ? 8 : 4;
+ unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+ unsigned ArgOffset = LinkageSize; // Running offset of the next argument; arguments start after the linkage area.
+ // Area that is at least reserved in caller of this function.
+ unsigned MinReservedArea = ArgOffset;
+
+ static const MCPhysReg GPR_32[] = { // 32-bit registers.
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+ };
+ static const MCPhysReg GPR_64[] = { // 64-bit registers.
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+ static const MCPhysReg VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+
+ const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
+ const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; // No FPRs are used for arguments under soft-float.
+ const unsigned Num_VR_Regs = array_lengthof( VR);
+
+ unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; // Next unused argument register of each kind.
+
+ const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+ // In 32-bit non-varargs functions, the stack space for vectors is after the
+ // stack space for non-vectors. We do not use this space unless we have
+ // too many vectors to fit in registers, something that only occurs in
+ // constructed examples:), but we have to walk the arglist to figure
+ // that out...for the pathological case, compute VecArgOffset as the
+ // start of the vector parameter area. Computing VecArgOffset is the
+ // entire point of the following loop.
+ unsigned VecArgOffset = ArgOffset;
+ if (!isVarArg && !isPPC64) {
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
+ ++ArgNo) {
+ EVT ObjectVT = Ins[ArgNo].VT;
+ ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+
+ if (Flags.isByVal()) {
+ // ObjSize is the true size, ArgSize rounded up to multiple of regs.
+ unsigned ObjSize = Flags.getByValSize();
+ unsigned ArgSize =
+ ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ VecArgOffset += ArgSize;
+ continue;
+ }
+
+ switch(ObjectVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled argument type!");
+ case MVT::i1:
+ case MVT::i32:
+ case MVT::f32:
+ VecArgOffset += 4;
+ break;
+ case MVT::i64: // PPC64
+ case MVT::f64:
+ // FIXME: We are guaranteed to be !isPPC64 at this point.
+ // Does MVT::i64 apply?
+ VecArgOffset += 8;
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ // Nothing to do, we're only looking at Nonvector args here.
+ break;
+ }
+ }
+ }
+ // We've found where the vector parameter area in memory is. Skip the
+ // first 12 parameters; these don't use that memory.
+ VecArgOffset = ((VecArgOffset+15)/16)*16;
+ VecArgOffset += 12*16;
+
+ // Add DAG nodes to load the arguments or copy them out of registers. On
+ // entry to a function on PPC, the arguments start after the linkage area,
+ // although the first ones are often in registers.
+
+ SmallVector<SDValue, 8> MemOps; // Stores of register-passed pieces; merged into the chain at the end.
+ unsigned nAltivecParamsAtEnd = 0; // Vector args counted only when !isVarArg && !isPPC64; their space is added to MinReservedArea after the loop.
+ Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+ unsigned CurArgIdx = 0;
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+ SDValue ArgVal;
+ bool needsLoad = false;
+ EVT ObjectVT = Ins[ArgNo].VT;
+ unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+ unsigned ArgSize = ObjSize;
+ ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+ if (Ins[ArgNo].isOrigArg()) {
+ std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
+ CurArgIdx = Ins[ArgNo].getOrigArgIndex();
+ }
+ unsigned CurArgOffset = ArgOffset;
+
+ // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
+ if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
+ ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+ if (isVarArg || isPPC64) {
+ MinReservedArea = ((MinReservedArea+15)/16)*16;
+ MinReservedArea += CalculateStackSlotSize(ObjectVT,
+ Flags,
+ PtrByteSize);
+ } else nAltivecParamsAtEnd++;
+ } else
+ // Calculate min reserved area.
+ MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
+ Flags,
+ PtrByteSize);
+
+ // FIXME the codegen can be much improved in some cases.
+ // We do not have to keep everything in memory.
+ if (Flags.isByVal()) {
+ assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
+
+ // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+ ObjSize = Flags.getByValSize();
+ ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ // Objects of size 1 and 2 are right justified, everything else is
+ // left justified. This means the memory address is adjusted forwards.
+ if (ObjSize==1 || ObjSize==2) {
+ CurArgOffset = CurArgOffset + (4 - ObjSize);
+ }
+ // The value of the object is its address.
+ int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(FIN);
+ if (ObjSize==1 || ObjSize==2) {
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg;
+ if (isPPC64)
+ VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ else
+ VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
+ SDValue Store =
+ DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(&*FuncArg), ObjType);
+ MemOps.push_back(Store);
+ ++GPR_idx;
+ }
+
+ ArgOffset += PtrByteSize;
+
+ continue;
+ }
+ for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+ // Store whatever pieces of the object are in registers
+ // to memory. ArgOffset will be the address of the beginning
+ // of the object.
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg;
+ if (isPPC64)
+ VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ else
+ VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+ int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(&*FuncArg, j));
+ MemOps.push_back(Store);
+ ++GPR_idx;
+ ArgOffset += PtrByteSize;
+ } else {
+ ArgOffset += ArgSize - (ArgOffset-CurArgOffset); // Out of GPRs: skip past the remainder of the object.
+ break;
+ }
+ }
+ continue;
+ }
+
+ switch (ObjectVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled argument type!");
+ case MVT::i1:
+ case MVT::i32:
+ if (!isPPC64) {
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+
+ if (ObjectVT == MVT::i1)
+ ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);
+
+ ++GPR_idx;
+ } else {
+ needsLoad = true;
+ ArgSize = PtrByteSize;
+ }
+ // All int arguments reserve stack space in the Darwin ABI.
+ ArgOffset += PtrByteSize;
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case MVT::i64: // PPC64
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+ if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
+ // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+ // value to MVT::i64 and then truncate to the correct register size.
+ ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+ ++GPR_idx;
+ } else {
+ needsLoad = true;
+ ArgSize = PtrByteSize;
+ }
+ // All int arguments reserve stack space in the Darwin ABI.
+ ArgOffset += 8;
+ break;
+
+ case MVT::f32:
+ case MVT::f64:
+ // FP arguments also use up GPRs while any remain: one GPR per argument,
+ // plus a second one for an 8-byte value when !isPPC64.
+ if (GPR_idx != Num_GPR_Regs) {
+ ++GPR_idx;
+ if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
+ ++GPR_idx;
+ }
+ if (FPR_idx != Num_FPR_Regs) {
+ unsigned VReg;
+
+ if (ObjectVT == MVT::f32)
+ VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+ else
+ VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+ ++FPR_idx;
+ } else {
+ needsLoad = true;
+ }
+
+ // All FP arguments reserve stack space in the Darwin ABI.
+ ArgOffset += isPPC64 ? 8 : ObjSize;
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ // Note that vector arguments in registers don't reserve stack space,
+ // except in varargs functions.
+ if (VR_idx != Num_VR_Regs) {
+ unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+ if (isVarArg) {
+ while ((ArgOffset % 16) != 0) {
+ ArgOffset += PtrByteSize;
+ if (GPR_idx != Num_GPR_Regs)
+ GPR_idx++;
+ }
+ ArgOffset += 16;
+ GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
+ }
+ ++VR_idx;
+ } else {
+ if (!isVarArg && !isPPC64) {
+ // Vectors go after all the nonvectors.
+ CurArgOffset = VecArgOffset;
+ VecArgOffset += 16;
+ } else {
+ // Vectors are aligned.
+ ArgOffset = ((ArgOffset+15)/16)*16;
+ CurArgOffset = ArgOffset;
+ ArgOffset += 16;
+ }
+ needsLoad = true;
+ }
+ break;
+ }
+
+ // We need to load the argument to a virtual register if we determined above
+ // that we ran out of physical registers of the appropriate type.
+ if (needsLoad) {
+ int FI = MFI.CreateFixedObject(ObjSize,
+ CurArgOffset + (ArgSize - ObjSize), // The value sits at the end of its ArgSize-byte slot.
+ isImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
+ }
+
+ InVals.push_back(ArgVal);
+ }
+
+ // Allow for Altivec parameters at the end, if needed.
+ if (nAltivecParamsAtEnd) {
+ MinReservedArea = ((MinReservedArea+15)/16)*16;
+ MinReservedArea += 16*nAltivecParamsAtEnd;
+ }
+
+ // Area that is at least reserved in the caller of this function.
+ MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
+
+ // Set the size that is at least reserved in caller of this function. Tail
+ // call optimized functions' reserved stack space needs to be aligned so that
+ // taking the difference between two stack areas will result in an aligned
+ // stack.
+ MinReservedArea =
+ EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+ FuncInfo->setMinReservedArea(MinReservedArea);
+
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+ int Depth = ArgOffset;
+
+ FuncInfo->setVarArgsFrameIndex(
+ MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
+ Depth, true));
+ SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+ // If this function is vararg, store any remaining integer argument regs
+ // to their spots on the stack so that they may be loaded by dereferencing
+ // the result of va_next.
+ for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+ unsigned VReg;
+
+ if (isPPC64)
+ VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ else
+ VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+ MemOps.push_back(Store);
+ // Increment the address by the pointer size for the next argument to store
+ SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+ return Chain;
+}
+
+/// CalculateTailCallSPDiff - Compute the stack pointer adjustment needed to
+/// accommodate the arguments of a tail call. Returns 0 when isTailCall is
+/// false. Otherwise returns the caller's minimum reserved area minus
+/// ParamSize, and records that value in PPCFunctionInfo when it is smaller
+/// than the delta stored there so far.
+static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
+                                   unsigned ParamSize) {
+  // Ordinary calls never move the stack pointer on behalf of the callee.
+  if (!isTailCall)
+    return 0;
+
+  PPCFunctionInfo *FuncInfo =
+      DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
+  const int ReservedByCaller = (int)FuncInfo->getMinReservedArea();
+  const int Delta = ReservedByCaller - (int)ParamSize;
+
+  // Remember the new adjustment only if it is below the recorded one.
+  if (FuncInfo->getTailCallSPDelta() > Delta)
+    FuncInfo->setTailCallSPDelta(Delta);
+
+  return Delta;
+}
+
+static bool isFunctionGlobalAddress(SDValue Callee);
+
+/// resideInSameModule - Return true if Callee is a global that is defined
+/// (not merely declared) and whose linkage and visibility let the call be
+/// treated as staying within the current module under relocation model
+/// RelMod. Returns false for anything that is not a GlobalAddressSDNode
+/// (e.g. an external symbol).
+static bool
+resideInSameModule(SDValue Callee, Reloc::Model RelMod) {
+  // If !G, Callee can be an external symbol.
+  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+  if (!G) return false;
+
+  const GlobalValue *GV = G->getGlobal();
+
+  // A declaration has no body in this module.
+  if (GV->isDeclaration()) return false;
+
+  switch(GV->getLinkage()) {
+  default: llvm_unreachable("unknow linkage type");
+  case GlobalValue::AvailableExternallyLinkage:
+  case GlobalValue::ExternalWeakLinkage:
+    return false;
+
+  // Callee with weak linkage is allowed if it has hidden or protected
+  // visibility
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:     // e.g. c++ template instantiation
+    if (GV->hasDefaultVisibility())
+      return false;
+    // Non-default visibility: treat like the strong linkages below.
+    LLVM_FALLTHROUGH;
+
+  case GlobalValue::ExternalLinkage:
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::PrivateLinkage:
+    break;
+  }
+
+  // With '-fPIC', calling default visibility function need insert 'nop' after
+  // function call, no matter that function resides in same module or not, so
+  // we treat it as in different module.
+  if (RelMod == Reloc::PIC_ && GV->hasDefaultVisibility())
+    return false;
+
+  return true;
+}
+
+/// needStackSlotPassParameters - Return true as soon as
+/// CalculateStackSlotUsed reports that one of the outgoing arguments in Outs
+/// uses a stack slot; 'nest' arguments are ignored. Asserts that Subtarget
+/// is the 64-bit SVR4 ABI.
+static bool
+needStackSlotPassParameters(const PPCSubtarget &Subtarget,
+                            const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());
+
+  // Argument registers; only the table sizes are used here.
+  static const MCPhysReg ArgGPRs[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg ArgVRs[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned PtrByteSize = 8;
+  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+  const unsigned GPRCount = array_lengthof(ArgGPRs);
+  const unsigned ParamAreaSize = GPRCount * PtrByteSize;
+
+  // Running state updated by CalculateStackSlotUsed for each argument.
+  unsigned NumBytes = LinkageSize;
+  unsigned AvailableFPRs = 13;
+  unsigned AvailableVRs = array_lengthof(ArgVRs);
+
+  for (const ISD::OutputArg &Out : Outs) {
+    if (!Out.Flags.isNest() &&
+        CalculateStackSlotUsed(Out.VT, Out.ArgVT, Out.Flags,
+                               PtrByteSize, LinkageSize, ParamAreaSize,
+                               NumBytes, AvailableFPRs, AvailableVRs,
+                               Subtarget.hasQPX()))
+      return true;
+  }
+  return false;
+}
+
+/// hasSameArgumentList - Return true if the call site CS passes exactly the
+/// formal arguments of CallerFn, in order. An argument position also matches
+/// when the call passes undef of the same type as the caller's formal.
+static bool
+hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
+  if (CallerFn->getArgumentList().size() != CS->arg_size())
+    return false;
+
+  Function::const_arg_iterator FormalIt = CallerFn->arg_begin();
+  for (ImmutableCallSite::arg_iterator ActualIt = CS->arg_begin(),
+                                       ActualEnd = CS->arg_end();
+       ActualIt != ActualEnd; ++ActualIt, ++FormalIt) {
+    const Value *Actual = *ActualIt;
+    const Value *Formal = &(*FormalIt);
+
+    // The caller's own argument is forwarded unchanged.
+    const bool IsForwarded = Actual == Formal;
+
+    // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
+    //        tail call @callee([4 x i64] undef, [4 x i64] %b)
+    //      }
+    // 1st argument of callee is undef and has the same type as caller.
+    const bool IsUndefOfSameType =
+        Actual->getType() == Formal->getType() && isa<UndefValue>(Actual);
+
+    if (!IsForwarded && !IsUndefOfSameType)
+      return false;
+  }
+
+  return true;
+}
+
+/// IsEligibleForTailCallOptimization_64SVR4 - Decide whether a call may be
+/// lowered as a tail call (TCO) or sibling call (SCO). The checks run in
+/// order and the first failing one rejects the call.
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
+                                    SDValue Callee,
+                                    CallingConv::ID CalleeCC,
+                                    ImmutableCallSite *CS,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SelectionDAG& DAG) const {
+  const bool GuaranteedTCO = getTargetMachine().Options.GuaranteedTailCallOpt;
+
+  // With SCO disabled, only guaranteed TCO could still apply.
+  if (DisableSCO && !GuaranteedTCO)
+    return false;
+
+  // Variadic argument functions are not supported.
+  if (isVarArg)
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has
+  // the same calling convention
+  if (MF.getFunction()->getCallingConv() != CalleeCC)
+    return false;
+
+  // Only the Fast and C calling conventions are handled.
+  const bool IsFastOrC =
+      CalleeCC == CallingConv::Fast || CalleeCC == CallingConv::C;
+  if (!IsFastOrC)
+    return false;
+
+  // Caller contains any byval parameter is not supported.
+  auto IsByValInput = [](const ISD::InputArg &In) {
+    return In.Flags.isByVal();
+  };
+  if (any_of(Ins, IsByValInput))
+    return false;
+
+  // Callee contains any byval parameter is not supported, too.
+  // Note: This is a quick work around, because in some cases, e.g.
+  // caller's stack size > callee's stack size, we are still able to apply
+  // sibling call optimization. See: https://reviews.llvm.org/D23441#513574
+  auto IsByValOutput = [](const ISD::OutputArg &Out) {
+    return Out.Flags.isByVal();
+  };
+  if (any_of(Outs, IsByValOutput))
+    return false;
+
+  // No TCO/SCO on indirect call because Caller have to restore its TOC
+  const bool IsDirectCall =
+      isFunctionGlobalAddress(Callee) || isa<ExternalSymbolSDNode>(Callee);
+  if (!IsDirectCall)
+    return false;
+
+  // Check if Callee resides in the same module, because for now, PPC64 SVR4 ABI
+  // (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another
+  // module.
+  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
+  if (!resideInSameModule(Callee, getTargetMachine().getRelocationModel()))
+    return false;
+
+  // TCO allows altering callee ABI, so we don't have to check further.
+  if (GuaranteedTCO && CalleeCC == CallingConv::Fast)
+    return true;
+
+  if (DisableSCO)
+    return false;
+
+  // If callee use the same argument list that caller is using, then we can
+  // apply SCO on this case. If it is not, then SCO is only possible when the
+  // callee needs no stack for passing arguments.
+  return hasSameArgumentList(MF.getFunction(), CS) ||
+         !needStackSlotPassParameters(Subtarget, Outs);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for guaranteed tail call optimization. Requires GuaranteedTailCallOpt, a
+/// non-variadic call, the Fast calling convention on both caller and callee,
+/// and no byval caller arguments. Under PIC the callee must additionally be
+/// a global with hidden or protected visibility.
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Variable argument functions are not supported.
+  if (isVarArg)
+    return false;
+
+  // Both sides of the call must use the Fast calling convention.
+  MachineFunction &MF = DAG.getMachineFunction();
+  const CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+  if (CalleeCC != CallingConv::Fast || CallerCC != CalleeCC)
+    return false;
+
+  // Functions containing by val parameters are not supported.
+  if (any_of(Ins, [](const ISD::InputArg &In) { return In.Flags.isByVal(); }))
+    return false;
+
+  // Non-PIC/GOT tail calls are supported.
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+    return true;
+
+  // At the moment we can only do local tail calls (in same module, hidden
+  // or protected) if we are generating PIC.
+  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee);
+  if (!GA)
+    return false;
+
+  const GlobalValue *Target = GA->getGlobal();
+  return Target->hasHiddenVisibility() || Target->hasProtectedVisibility();
+}
+
+/// isBLACompatibleAddress - If Op is a constant whose 32-bit value is
+/// representable in the immediate field of a BxA instruction, return a
+/// constant node (of pointer type) holding that value shifted right by two.
+/// Otherwise return nullptr.
+static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
+  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op);
+  if (!Const)
+    return nullptr;
+
+  const int Target = Const->getZExtValue();
+
+  // Low 2 bits are implicitly zero.
+  const bool IsWordAligned = (Target & 3) == 0;
+  // Top 6 bits have to be sext of immediate.
+  const bool FitsIn26Bits = SignExtend32<26>(Target) == Target;
+  if (!IsWordAligned || !FitsIn26Bits)
+    return nullptr;
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+  return DAG.getConstant(Target >> 2, SDLoc(Op), PtrVT).getNode();
+}
+
+namespace {
+
+/// One outgoing tail-call argument together with the stack slot it is
+/// stored to.
+struct TailCallArgumentInfo {
+  SDValue Arg;        // The argument value to store.
+  SDValue FrameIdxOp; // Frame-index node used as the store address.
+  int FrameIdx = 0;   // Frame index of the destination slot.
+
+  TailCallArgumentInfo() = default;
+};
+} // end anonymous namespace
+
+/// StoreTailCallArgumentsToStackSlot - For every entry of TailCallArgs, emit
+/// a store of the argument to its recorded frame index and append the store
+/// to MemOpChains. All stores use Chain as their input chain.
+static void StoreTailCallArgumentsToStackSlot(
+    SelectionDAG &DAG, SDValue Chain,
+    const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
+    SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
+  for (const TailCallArgumentInfo &Info : TailCallArgs) {
+    // Store relative to framepointer.
+    MachinePointerInfo SlotInfo = MachinePointerInfo::getFixedStack(
+        DAG.getMachineFunction(), Info.FrameIdx);
+    MemOpChains.push_back(
+        DAG.getStore(Chain, dl, Info.Arg, Info.FrameIdxOp, SlotInfo));
+  }
+}
+
+/// EmitTailCallStoreFPAndRetAddr - Store the saved return address (and, on
+/// the Darwin ABI, the saved frame pointer) into the slots they occupy once
+/// the stack has moved by SPDiff. Does nothing when SPDiff is zero. Returns
+/// the updated chain.
+static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
+                                             SDValue OldRetAddr, SDValue OldFP,
+                                             int SPDiff, const SDLoc &dl) {
+  // No stack adjustment means both values are already in place.
+  if (!SPDiff)
+    return Chain;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const PPCFrameLowering *FrameLowering = Subtarget.getFrameLowering();
+  const bool Is64Bit = Subtarget.isPPC64();
+  const int SlotBytes = Is64Bit ? 8 : 4;
+  EVT SlotVT = Is64Bit ? MVT::i64 : MVT::i32;
+
+  // Calculate the new stack slot for the return address.
+  const int RetAddrOffset = SPDiff + FrameLowering->getReturnSaveOffset();
+  const int RetAddrFI =
+      MF.getFrameInfo().CreateFixedObject(SlotBytes, RetAddrOffset, true);
+  SDValue RetAddrFIN = DAG.getFrameIndex(RetAddrFI, SlotVT);
+  Chain = DAG.getStore(Chain, dl, OldRetAddr, RetAddrFIN,
+                       MachinePointerInfo::getFixedStack(MF, RetAddrFI));
+
+  // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
+  // slot as the FP is never overwritten.
+  if (!Subtarget.isDarwinABI())
+    return Chain;
+
+  const int FPOffset = SPDiff + FrameLowering->getFramePointerSaveOffset();
+  const int FPFI =
+      MF.getFrameInfo().CreateFixedObject(SlotBytes, FPOffset, true);
+  SDValue FPFIN = DAG.getFrameIndex(FPFI, SlotVT);
+  return DAG.getStore(
+      Chain, dl, OldFP, FPFIN,
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FPFI));
+}
+
+/// CalculateTailCallArgDest - Create the fixed stack object a tail-call
+/// argument will be stored to (at ArgOffset + SPDiff) and append the
+/// argument together with that slot to TailCallArguments for later
+/// processing.
+static void
+CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
+                         SDValue Arg, int SPDiff, unsigned ArgOffset,
+                     SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
+  const int SlotOffset = ArgOffset + SPDiff;
+  // Size of the argument in bytes, rounded up from its size in bits.
+  const uint32_t SlotBytes = (Arg.getValueSizeInBits() + 7) / 8;
+  const int SlotFI =
+      MF.getFrameInfo().CreateFixedObject(SlotBytes, SlotOffset, true);
+  EVT PtrVT = isPPC64 ? MVT::i64 : MVT::i32;
+
+  TailCallArgumentInfo Entry;
+  Entry.Arg = Arg;
+  Entry.FrameIdxOp = DAG.getFrameIndex(SlotFI, PtrVT);
+  Entry.FrameIdx = SlotFI;
+  TailCallArguments.push_back(Entry);
+}
+
+/// EmitTailCallLoadFPAndRetAddr - Emit loads of the return address and, on
+/// the Darwin ABI, the frame pointer from their stack slots. The loaded
+/// values are returned in LROpOut/FPOpOut and the updated chain is the
+/// result. Nothing is emitted when SPDiff is zero. Used when tail calling.
+SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
+    SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
+    SDValue &FPOpOut, const SDLoc &dl) const {
+  // Without a stack adjustment there is nothing to reload.
+  if (!SPDiff)
+    return Chain;
+
+  // Load the LR and FP stack slot for later adjusting.
+  EVT SlotVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
+
+  LROpOut = getReturnAddrFrameIndex(DAG);
+  LROpOut = DAG.getLoad(SlotVT, dl, Chain, LROpOut, MachinePointerInfo());
+  Chain = SDValue(LROpOut.getNode(), 1);
+
+  // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
+  // slot as the FP is never overwritten.
+  if (!Subtarget.isDarwinABI())
+    return Chain;
+
+  FPOpOut = getFramePointerFrameIndex(DAG);
+  FPOpOut = DAG.getLoad(SlotVT, dl, Chain, FPOpOut, MachinePointerInfo());
+  return SDValue(FPOpOut.getNode(), 1);
+}
+
+/// CreateCopyOfByValArgument - Emit a memcpy of the byval aggregate at "Src"
+/// to "Dst". The number of bytes and the alignment both come from the byval
+/// information in Flags. The copy will be passed as a byval function
+/// parameter.
+/// Sometimes what we are copying is the end of a larger object, the part that
+/// does not fit in registers.
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+                                         SDValue Chain, ISD::ArgFlagsTy Flags,
+                                         SelectionDAG &DAG, const SDLoc &dl) {
+  SDValue ByteCount = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
+  const unsigned ByValAlign = Flags.getByValAlign();
+  return DAG.getMemcpy(Chain, dl, Dst, Src, ByteCount, ByValAlign,
+                       /*isVol=*/false, /*AlwaysInline=*/false,
+                       /*isTailCall=*/false, MachinePointerInfo(),
+                       MachinePointerInfo());
+}
+
+/// LowerMemOpCallTo - Place an outgoing argument in memory. For a normal call
+/// the store is emitted immediately and appended to MemOpChains; for a tail
+/// call the argument and its destination are only recorded in
+/// TailCallArguments.
+static void LowerMemOpCallTo(
+    SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
+    SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
+    bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
+    SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
+  if (isTailCall) {
+    // Calculate and remember argument location.
+    CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
+                             TailCallArguments);
+    return;
+  }
+
+  if (isVector) {
+    // For vectors the incoming PtrOff is discarded and the address is
+    // recomputed as stack pointer + ArgOffset.
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+    SDValue SP = isPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
+                         : DAG.getRegister(PPC::R1, MVT::i32);
+    SDValue Disp = DAG.getConstant(ArgOffset, dl, PtrVT);
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, SP, Disp);
+  }
+
+  SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+  MemOpChains.push_back(Store);
+}
+
+/// PrepareTailCall - Finish the argument set-up of a tail call: store the
+/// remembered arguments to their stack slots, store the return address (and
+/// frame pointer) via EmitTailCallStoreFPAndRetAddr, and close the call
+/// sequence. Chain and InFlag are updated in place.
+static void
+PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
+                const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
+                SDValue FPOp,
+                SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
+  // Do not flag preceding copytoreg stuff together with the following stuff.
+  InFlag = SDValue();
+
+  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
+  // might overwrite each other in case of tail call optimization.
+  SmallVector<SDValue, 8> ArgStoreChains;
+  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
+                                    ArgStoreChains, dl);
+  if (!ArgStoreChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArgStoreChains);
+
+  // Store the return address to the appropriate stack slot.
+  Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);
+
+  // Emit callseq_end just before tailcall node.
+  SDValue SeqBytes = DAG.getIntPtrConstant(NumBytes, dl, true);
+  SDValue ZeroBytes = DAG.getIntPtrConstant(0, dl, true);
+  Chain = DAG.getCALLSEQ_END(Chain, SeqBytes, ZeroBytes, InFlag, dl);
+  InFlag = Chain.getValue(1);
+}
+
+// Returns true when Callee is a global address node (TLS address nodes
+// excluded) whose global has function type, i.e. a function that can be
+// called by name rather than something that must hold a descriptor for an
+// indirect call.
+static bool isFunctionGlobalAddress(SDValue Callee) {
+  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee);
+  if (!GA)
+    return false;
+
+  // TLS address nodes are rejected even though they are global address nodes.
+  const unsigned Opc = Callee.getOpcode();
+  if (Opc == ISD::GlobalTLSAddress || Opc == ISD::TargetGlobalTLSAddress)
+    return false;
+
+  return GA->getGlobal()->getValueType()->isFunctionTy();
+}
+/* PrepareCall - Choose the call opcode and assemble the operand list (Ops)
+ * and result types (NodeTys) of the call node; the node itself is created by
+ * the caller of this helper (see FinishCall()).
+ *
+ * NodeTys is seeded with {MVT::Other, MVT::Glue}. The opcode starts out as
+ * PPCISD::CALL and the callee is assumed to need an indirect call until one of
+ * the following direct forms is recognised:
+ *  - a BLA-compatible absolute address (only tried when not on 64-bit SVR4),
+ *  - a function global address, rewritten to a TargetGlobalAddress,
+ *  - an external symbol, rewritten to a TargetExternalSymbol,
+ *  - a patchpoint, which is always emitted in direct form.
+ * Global addresses and external symbols get the MO_PLT flag when the callee
+ * is not assumed DSO-local and the target is 32-bit ELF.
+ *
+ * For an indirect call an MTCTR node is emitted (preceded, on 64-bit SVR4
+ * ELFv1, by loads of the function descriptor fields and glued copies into X2
+ * and, unless hasNest, X11), Callee is reset to a null node and the returned
+ * opcode is PPCISD::BCTRL.
+ *
+ * Ops then receives, in order: the chain (plus X11 / CTR uses for indirect
+ * calls, or the callee for direct calls), SPDiff for tail calls, every
+ * register in RegsToPass, and X2 on 64-bit SVR4 for non-patchpoint calls.
+ *
+ * Callee, InFlag and Chain are in/out parameters and are updated in place.
+ * Returns the selected call opcode.
+ */
+static unsigned
+PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
+ SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall,
+ bool isPatchPoint, bool hasNest,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
+ ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
+
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isSVR4ABI = Subtarget.isSVR4ABI();
+ bool isELFv2ABI = Subtarget.isELFv2ABI();
+
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ NodeTys.push_back(MVT::Other); // Returns a chain
+ NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use.
+
+ unsigned CallOpc = PPCISD::CALL; // becomes BCTRL below for indirect calls
+
+ bool needIndirectCall = true; // cleared by each direct-call form below
+ if (!isSVR4ABI || !isPPC64)
+ if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
+ // If this is an absolute destination address, use the munged value.
+ Callee = SDValue(Dest, 0);
+ needIndirectCall = false;
+ }
+
+ // PC-relative references to external symbols should go through $stub, unless
+ // we're building with the leopard linker or later, which automatically
+ // synthesizes these stubs.
+ const TargetMachine &TM = DAG.getTarget();
+ const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ const GlobalValue *GV = nullptr; // stays null unless Callee is a global address
+ if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ GV = G->getGlobal();
+ bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
+ bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64;
+
+ if (isFunctionGlobalAddress(Callee)) {
+ GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+ // A call to a TLS address is actually an indirect call to a
+ // thread-specific pointer.
+ unsigned OpFlags = 0;
+ if (UsePlt)
+ OpFlags = PPCII::MO_PLT;
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+ // every direct call is) turn it into a TargetGlobalAddress /
+ // TargetExternalSymbol node so that legalize doesn't hack it.
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+ Callee.getValueType(), 0, OpFlags);
+ needIndirectCall = false;
+ }
+
+ if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ unsigned char OpFlags = 0;
+
+ if (UsePlt)
+ OpFlags = PPCII::MO_PLT;
+
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
+ OpFlags);
+ needIndirectCall = false;
+ }
+
+ if (isPatchPoint) {
+ // We'll form an invalid direct call when lowering a patchpoint; the full
+ // sequence for an indirect call is complicated, and many of the
+ // instructions introduced might have side effects (and, thus, can't be
+ // removed later). The call itself will be removed as soon as the
+ // argument/return lowering is complete, so the fact that it has the wrong
+ // kind of operands should not really matter.
+ needIndirectCall = false;
+ }
+
+ if (needIndirectCall) {
+ // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair
+ // to do the call, we can't use PPCISD::CALL.
+ SDValue MTCTROps[] = {Chain, Callee, InFlag};
+
+ if (isSVR4ABI && isPPC64 && !isELFv2ABI) {
+ // Function pointers in the 64-bit SVR4 ABI do not point to the function
+ // entry point, but to the function descriptor (the function entry point
+ // address is part of the function descriptor though).
+ // The function descriptor is a three doubleword structure with the
+ // following fields: function entry point, TOC base address and
+ // environment pointer.
+ // Thus for a call through a function pointer, the following actions need
+ // to be performed:
+ // 1. Save the TOC of the caller in the TOC save area of its stack
+ // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
+ // 2. Load the address of the function entry point from the function
+ // descriptor.
+ // 3. Load the TOC of the callee from the function descriptor into r2.
+ // 4. Load the environment pointer from the function descriptor into
+ // r11.
+ // 5. Branch to the function entry point address.
+ // 6. On return of the callee, the TOC of the caller needs to be
+ // restored (this is done in FinishCall()).
+ //
+ // The loads are scheduled at the beginning of the call sequence, and the
+ // register copies are flagged together to ensure that no other
+ // operations can be scheduled in between. E.g. without flagging the
+ // copies together, a TOC access in the caller could be scheduled between
+ // the assignment of the callee TOC and the branch to the callee, which
+ // results in the TOC access going through the TOC of the callee instead
+ // of going through the TOC of the caller, which leads to incorrect code.
+
+ // Load the address of the function entry point from the function
+ // descriptor.
+ SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1);
+ if (LDChain.getValueType() == MVT::Glue) // last result is glue: use the one before it
+ LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2);
+
+ auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
+ ? (MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant)
+ : MachineMemOperand::MONone;
+
+ MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr);
+ SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI,
+ /* Alignment = */ 8, MMOFlags);
+
+ // Load environment pointer into r11.
+ SDValue PtrOff = DAG.getIntPtrConstant(16, dl);
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
+ SDValue LoadEnvPtr =
+ DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16),
+ /* Alignment = */ 8, MMOFlags);
+
+ SDValue TOCOff = DAG.getIntPtrConstant(8, dl); // TOC base is read at descriptor offset 8
+ SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
+ SDValue TOCPtr =
+ DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8),
+ /* Alignment = */ 8, MMOFlags);
+
+ setUsesTOCBasePtr(DAG);
+ SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr,
+ InFlag);
+ Chain = TOCVal.getValue(0);
+ InFlag = TOCVal.getValue(1);
+
+ // If the function call has an explicit 'nest' parameter, it takes the
+ // place of the environment pointer.
+ if (!hasNest) {
+ SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
+ InFlag);
+
+ Chain = EnvVal.getValue(0);
+ InFlag = EnvVal.getValue(1);
+ }
+
+ MTCTROps[0] = Chain;
+ MTCTROps[1] = LoadFuncPtr;
+ MTCTROps[2] = InFlag;
+ }
+
+ Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys,
+ makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); // glue operand only when InFlag is set
+ InFlag = Chain.getValue(1);
+
+ NodeTys.clear();
+ NodeTys.push_back(MVT::Other);
+ NodeTys.push_back(MVT::Glue);
+ Ops.push_back(Chain);
+ CallOpc = PPCISD::BCTRL;
+ Callee.setNode(nullptr); // skips the direct-call operand push below
+ // Add use of X11 (holding environment pointer)
+ if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest)
+ Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
+ // Add CTR register as callee so a bctr can be emitted later.
+ if (isTailCall)
+ Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
+ }
+
+ // If this is a direct call, pass the chain and the callee.
+ if (Callee.getNode()) {
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+ }
+ // If this is a tail call add stack pointer delta.
+ if (isTailCall)
+ Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live
+ // into the call.
+ if (isSVR4ABI && isPPC64 && !isPatchPoint) {
+ setUsesTOCBasePtr(DAG);
+ Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
+ }
+
+ return CallOpc;
+}
+
+/// isLocalCall - True when Callee is a global address node whose global is a
+/// strong definition for the linker; false for every other kind of callee.
+static bool isLocalCall(const SDValue &Callee) {
+  const auto *GA = dyn_cast<GlobalAddressSDNode>(Callee);
+  return GA && GA->getGlobal()->isStrongDefinitionForLinker();
+}
+
+/// LowerCallResult - Copy the values returned by a call out of their physical
+/// registers and append them to InVals. Result locations come from analysing
+/// Ins with RetCC_PPC. Any result not assigned with CCValAssign::Full is
+/// truncated to its value type; zero- and sign-extended results are first
+/// wrapped in AssertZext / AssertSext. Returns the chain after the last copy.
+SDValue PPCTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+  SmallVector<CCValAssign, 16> ResultLocs;
+  CCState RetState(CallConv, isVarArg, DAG.getMachineFunction(), ResultLocs,
+                   *DAG.getContext());
+  RetState.AnalyzeCallResult(Ins, RetCC_PPC);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (CCValAssign &Loc : ResultLocs) {
+    assert(Loc.isRegLoc() && "Can only return in registers!");
+
+    SDValue Result = DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(),
+                                        Loc.getLocVT(), InFlag);
+    // Results 1 and 2 of the copy are the new chain and glue.
+    Chain = Result.getValue(1);
+    InFlag = Result.getValue(2);
+
+    // Decide which fix-ups the location kind calls for.
+    bool NeedsAssert = false;
+    bool NeedsTruncate = true;
+    unsigned AssertOpc = ISD::AssertZext;
+    switch (Loc.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      NeedsTruncate = false;
+      break;
+    case CCValAssign::AExt:
+      break;
+    case CCValAssign::ZExt:
+      NeedsAssert = true;
+      break;
+    case CCValAssign::SExt:
+      NeedsAssert = true;
+      AssertOpc = ISD::AssertSext;
+      break;
+    }
+
+    if (NeedsAssert)
+      Result = DAG.getNode(AssertOpc, dl, Loc.getLocVT(), Result,
+                           DAG.getValueType(Loc.getValVT()));
+    if (NeedsTruncate)
+      Result = DAG.getNode(ISD::TRUNCATE, dl, Loc.getValVT(), Result);
+
+    InVals.push_back(Result);
+  }
+
+  return Chain;
+}
+/* FinishCall - Create the call node itself and lower its results.
+ *
+ * PrepareCall() supplies the opcode, the leading operands and the result
+ * types. This function then appends CR1EQ for 32-bit SVR4 vararg calls, the
+ * call-preserved register mask, and the incoming glue if there is one.
+ *
+ * Tail calls are emitted as a PPCISD::TC_RETURN node, which is returned
+ * directly. Otherwise, on 64-bit SVR4 (patchpoints excluded), BCTRL is turned
+ * into BCTRL_LOAD_TOC with the address of the TOC save slot inserted right
+ * after the chain operand, and CALL is turned into CALL_NOP when the callee is
+ * not local or the relocation model is PIC. The call node is followed by
+ * CALLSEQ_END and by LowerCallResult(), whose chain is the return value.
+ *
+ * NOTE(review): the tail-call assert compares the callee register with
+ * PPC::CTR only, while PrepareCall() pushes CTR8 on PPC64 and clears Callee
+ * for indirect calls - confirm that arm of the assert is still meaningful.
+ */
+SDValue PPCTargetLowering::FinishCall(
+ CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
+ bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
+ SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag,
+ SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
+ unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
+ SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const {
+
+ std::vector<EVT> NodeTys;
+ SmallVector<SDValue, 8> Ops;
+ unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
+ SPDiff, isTailCall, isPatchPoint, hasNest,
+ RegsToPass, Ops, NodeTys, CS, Subtarget);
+
+ // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+ if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
+ Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
+ // When performing tail call optimization the callee pops its arguments off
+ // the stack. Account for this here so these bytes can be pushed back on in
+ // PPCFrameLowering::eliminateCallFramePseudoInstr.
+ int BytesCalleePops =
+ (CallConv == CallingConv::Fast &&
+ getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
+
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *Mask =
+ TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode()) // glue goes last in the operand list
+ Ops.push_back(InFlag);
+
+ // Emit tail call.
+ if (isTailCall) {
+ assert(((Callee.getOpcode() == ISD::Register &&
+ cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
+ Callee.getOpcode() == ISD::TargetExternalSymbol ||
+ Callee.getOpcode() == ISD::TargetGlobalAddress ||
+ isa<ConstantSDNode>(Callee)) &&
+ "Expecting an global address, external symbol, absolute value or register");
+
+ DAG.getMachineFunction().getFrameInfo().setHasTailCall();
+ return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
+ }
+
+ // Add a NOP immediately after the branch instruction when using the 64-bit
+ // SVR4 ABI. At link time, if caller and callee are in a different module and
+ // thus have a different TOC, the call will be replaced with a call to a stub
+ // function which saves the current TOC, loads the TOC of the callee and
+ // branches to the callee. The NOP will be replaced with a load instruction
+ // which restores the TOC of the caller from the TOC save slot of the current
+ // stack frame. If caller and callee belong to the same module (and have the
+ // same TOC), the NOP will remain unchanged.
+
+ if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() && // isTailCall is always false here: tail calls returned above
+ !isPatchPoint) {
+ if (CallOpc == PPCISD::BCTRL) {
+ // This is a call through a function pointer.
+ // Restore the caller TOC from the save area into R2.
+ // See PrepareCall() for more information about calls through function
+ // pointers in the 64-bit SVR4 ABI.
+ // We are using a target-specific load with r2 hard coded, because the
+ // result of a target-independent load would never go directly into r2,
+ // since r2 is a reserved register (which prevents the register allocator
+ // from allocating it), resulting in an additional register being
+ // allocated and an unnecessary move instruction being generated.
+ CallOpc = PPCISD::BCTRL_LOAD_TOC;
+
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
+ unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+ SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
+ SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
+
+ // The address needs to go after the chain input but before the flag (or
+ // any other variadic arguments).
+ Ops.insert(std::next(Ops.begin()), AddTOC);
+ } else if ((CallOpc == PPCISD::CALL) &&
+ (!isLocalCall(Callee) ||
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_))
+ // Otherwise insert NOP for non-local calls.
+ CallOpc = PPCISD::CALL_NOP;
+ }
+
+ Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+ DAG.getIntPtrConstant(BytesCalleePops, dl, true),
+ InFlag, dl);
+ if (!Ins.empty()) // glue is only forwarded when there are results to copy out
+ InFlag = Chain.getValue(1);
+
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
+}
+/* LowerCall - Entry point for lowering a call described by CLI.
+ *
+ * First settles whether a requested tail call is actually performed: it is
+ * dropped when long calls are in use and the call site is not musttail,
+ * otherwise the decision is delegated to
+ * IsEligibleForTailCallOptimization_64SVR4() on 64-bit SVR4 and to
+ * IsEligibleForTailCallOptimization() elsewhere. isTailCall is a reference
+ * into CLI, so the decision is written back to CLI.IsTailCall. A musttail
+ * call site that ends up not being tail called is a fatal error.
+ *
+ * With long calls enabled, a global-address callee of a non-tail call is
+ * first lowered through LowerGlobalAddress(). The call is then handed to
+ * LowerCall_64SVR4(), LowerCall_32SVR4() or LowerCall_Darwin() depending on
+ * the subtarget, and that function's result is returned.
+ */
+SDValue
+PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall; // reference: updates below are visible through CLI
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
+ bool isPatchPoint = CLI.IsPatchPoint;
+ ImmutableCallSite *CS = CLI.CS;
+
+ if (isTailCall) {
+ if (Subtarget.useLongCalls() && !(CS && CS->isMustTailCall()))
+ isTailCall = false;
+ else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
+ isTailCall =
+ IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
+ isVarArg, Outs, Ins, DAG);
+ else
+ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+ Ins, DAG);
+ if (isTailCall) {
+ ++NumTailCalls;
+ if (!getTargetMachine().Options.GuaranteedTailCallOpt) // counted as a sibling call
+ ++NumSiblingCalls;
+
+ assert(isa<GlobalAddressSDNode>(Callee) &&
+ "Callee should be an llvm::Function object.");
+ DEBUG(
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ const unsigned Width = 80 - strlen("TCO caller: ")
+ - strlen(", callee linkage: 0, 0");
+ dbgs() << "TCO caller: "
+ << left_justify(DAG.getMachineFunction().getName(), Width)
+ << ", callee linkage: "
+ << GV->getVisibility() << ", " << GV->getLinkage() << "\n"
+ );
+ }
+ }
+
+ if (!isTailCall && CS && CS->isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+
+ // When long calls (i.e. indirect calls) are always used, calls are always
+ // made via function pointer. If we have a function name, first translate it
+ // into a pointer.
+ if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
+ !isTailCall)
+ Callee = LowerGlobalAddress(Callee, DAG);
+
+ if (Subtarget.isSVR4ABI()) {
+ if (Subtarget.isPPC64())
+ return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
+ dl, DAG, InVals, CS);
+ else
+ return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
+ dl, DAG, InVals, CS);
+ }
+
+ return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
+ dl, DAG, InVals, CS);
+}
+/* LowerCall_32SVR4 - Lower a call for the 32-bit SVR4 ABI.
+ *
+ * Only the C and Fast calling conventions are accepted (asserted). Steps:
+ *  1. Assign a location to every outgoing argument with a PPCCCState that
+ *     first reserves the linkage area. For vararg calls each operand is
+ *     analysed individually: fixed operands with CC_PPC32_SVR4, variable ones
+ *     with CC_PPC32_SVR4_VarArg.
+ *  2. Run a second CCState (CC_PPC32_SVR4_ByVal), starting after the space
+ *     used in step 1, to place copies of byval aggregates. Its final stack
+ *     offset is NumBytes, the size given to CALLSEQ_START/CALLSEQ_END.
+ *  3. Emit CALLSEQ_START and, for tail calls with a non-zero SPDiff, the
+ *     loads of the return address / frame pointer.
+ *  4. Walk the argument locations. Byval aggregates are copied with a memcpy
+ *     chained before CALLSEQ_START (which is re-created for that purpose)
+ *     and are then passed by address. Register arguments go to RegsToPass
+ *     (i1 is zero-extended to i32). Memory arguments are stored relative to
+ *     R1, or only recorded in TailCallArguments for tail calls.
+ *  5. Glue the copy-to-reg nodes together, emit CR6SET / CR6UNSET for vararg
+ *     calls, run PrepareTailCall() for tail calls, and hand over to
+ *     FinishCall(), whose result is returned.
+ */
+SDValue PPCTargetLowering::LowerCall_32SVR4(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const {
+ // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
+ // of the 32-bit SVR4 ABI stack frame layout.
+
+ assert((CallConv == CallingConv::C ||
+ CallConv == CallingConv::Fast) && "Unknown calling convention!");
+
+ unsigned PtrByteSize = 4;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Mark this function as potentially containing a function that contains a
+ // tail call. As a consequence the frame pointer will be used for dynamicalloc
+ // and restoring the callers stack pointer in this functions epilog. This is
+ // done because by tail calling the called function might overwrite the value
+ // in this function's (MF) stack pointer stack slot 0(SP).
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area, parameter list area and the part of the local variable space which
+ // contains copies of aggregates which are passed by value.
+
+ // Assign locations to all of the outgoing arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+ // Reserve space for the linkage area on the stack.
+ CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
+ PtrByteSize);
+ if (useSoftFloat())
+ CCInfo.PreAnalyzeCallOperands(Outs);
+
+ if (isVarArg) {
+ // Handle fixed and variable vector arguments differently.
+ // Fixed vector arguments go into registers as long as registers are
+ // available. Variable vector arguments always go into memory.
+ unsigned NumArgs = Outs.size();
+
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ArgVT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ bool Result;
+
+ if (Outs[i].IsFixed) {
+ Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
+ CCInfo);
+ } else {
+ Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
+ ArgFlags, CCInfo);
+ }
+
+ if (Result) { // a true result is treated as "operand could not be assigned"
+#ifndef NDEBUG
+ errs() << "Call operand #" << i << " has unhandled type "
+ << EVT(ArgVT).getEVTString() << "\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+ } else {
+ // All arguments are treated the same.
+ CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
+ }
+ CCInfo.clearWasPPCF128();
+
+ // Assign locations to all of the outgoing aggregate by value arguments.
+ SmallVector<CCValAssign, 16> ByValArgLocs;
+ CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());
+
+ // Reserve stack space for the allocations in CCInfo.
+ CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+ CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
+
+ // Size of the linkage area, parameter list area and the part of the local
+ // variable space where copies of aggregates which are passed by value are
+ // stored.
+ unsigned NumBytes = CCByValInfo.getNextStackOffset();
+
+ // Calculate by how many bytes the stack has to be adjusted in case of tail
+ // call optimization.
+ int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+ dl);
+ SDValue CallSeqStart = Chain;
+
+ // Load the return address and frame pointer so it can be moved somewhere else
+ // later.
+ SDValue LROp, FPOp;
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
+
+ // Set up a copy of the stack pointer for use loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ bool seenFloatArg = false;
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, j = 0, e = ArgLocs.size(); // j indexes ByValArgLocs
+ i != e;
+ ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+ if (Flags.isByVal()) {
+ // Argument is an aggregate which is passed by value, thus we need to
+ // create a copy of it in the local variable space of the current stack
+ // frame (which is the stack frame of the caller) and pass the address of
+ // this copy to the callee.
+ assert((j < ByValArgLocs.size()) && "Index out of bounds!");
+ CCValAssign &ByValVA = ByValArgLocs[j++];
+ assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+
+ // Memory reserved in the local variable space of the callers stack frame.
+ unsigned LocMemOffset = ByValVA.getLocMemOffset();
+
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
+ StackPtr, PtrOff);
+
+ // Create a copy of the argument in the local area of the current
+ // stack frame.
+ SDValue MemcpyCall =
+ CreateCopyOfByValArgument(Arg, PtrOff,
+ CallSeqStart.getNode()->getOperand(0),
+ Flags, DAG, dl);
+
+ // This must go outside the CALLSEQ_START..END.
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+ CallSeqStart.getNode()->getOperand(1),
+ SDLoc(MemcpyCall));
+ DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+ NewCallSeqStart.getNode());
+ Chain = CallSeqStart = NewCallSeqStart;
+
+ // Pass the address of the aggregate copy on the stack either in a
+ // physical register or in the parameter list area of the current stack
+ // frame to the callee.
+ Arg = PtrOff;
+ }
+
+ if (VA.isRegLoc()) {
+ if (Arg.getValueType() == MVT::i1)
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg);
+
+ seenFloatArg |= VA.getLocVT().isFloatingPoint();
+ // Put argument in a physical register.
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ // Put argument in the parameter list area of the current stack frame.
+ assert(VA.isMemLoc());
+ unsigned LocMemOffset = VA.getLocMemOffset();
+
+ if (!isTailCall) {
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
+ StackPtr, PtrOff);
+
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+ } else {
+ // Calculate and remember argument location.
+ CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
+ TailCallArguments);
+ }
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Set CR bit 6 to true if this is a vararg call with floating args passed in
+ // registers.
+ if (isVarArg) {
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, InFlag };
+
+ Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+ dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1)); // glue operand only when InFlag is set
+
+ InFlag = Chain.getValue(1);
+ }
+
+ if (isTailCall)
+ PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+ TailCallArguments);
+
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
+ /* unused except on PPC64 ELFv1 */ false, DAG,
+ RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+ NumBytes, Ins, InVals, CS);
+}
+
+// Copy a byval argument into memory at PtrOff while keeping the copy outside
+// the call sequence of the call the argument belongs to. The memcpy is chained
+// onto operand 0 of the existing CALLSEQ_START, a new CALLSEQ_START is created
+// after the memcpy, and all uses of the old node are redirected to the new
+// one, which is returned.
+SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
+    SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
+    SelectionDAG &DAG, const SDLoc &dl) const {
+  SDNode *OldStart = CallSeqStart.getNode();
+
+  SDValue Copy = CreateCopyOfByValArgument(Arg, PtrOff,
+                                           OldStart->getOperand(0), Flags, DAG,
+                                           dl);
+
+  // The MEMCPY must go outside the CALLSEQ_START..END.
+  SDValue FreshStart =
+      DAG.getCALLSEQ_START(Copy, OldStart->getOperand(1), SDLoc(Copy));
+  DAG.ReplaceAllUsesWith(OldStart, FreshStart.getNode());
+  return FreshStart;
+}
+
+SDValue PPCTargetLowering::LowerCall_64SVR4(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const {
+
+ bool isELFv2ABI = Subtarget.isELFv2ABI();
+ bool isLittleEndian = Subtarget.isLittleEndian();
+ unsigned NumOps = Outs.size();
+ bool hasNest = false;
+ bool IsSibCall = false;
+
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ unsigned PtrByteSize = 8;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
+ IsSibCall = true;
+
+ // Mark this function as potentially containing a function that contains a
+ // tail call. As a consequence the frame pointer will be used for dynamicalloc
+ // and restoring the callers stack pointer in this functions epilog. This is
+ // done because by tail calling the called function might overwrite the value
+ // in this function's (MF) stack pointer stack slot 0(SP).
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+ assert(!(CallConv == CallingConv::Fast && isVarArg) &&
+ "fastcc not supported on varargs functions");
+
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
+ // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
+ // area is 32 bytes reserved space for [SP][CR][LR][TOC].
+ unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+ unsigned NumBytes = LinkageSize;
+ unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+ unsigned &QFPR_idx = FPR_idx;
+
+ static const MCPhysReg GPR[] = {
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+ static const MCPhysReg VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+
+ const unsigned NumGPRs = array_lengthof(GPR);
+ const unsigned NumFPRs = 13;
+ const unsigned NumVRs = array_lengthof(VR);
+ const unsigned NumQFPRs = NumFPRs;
+
+ // When using the fast calling convention, we don't provide backing for
+ // arguments that will be in registers.
+ unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
+
+ // Add up all the space actually used.
+ for (unsigned i = 0; i != NumOps; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
+
+ if (Flags.isNest())
+ continue;
+
+ if (CallConv == CallingConv::Fast) {
+ if (Flags.isByVal())
+ NumGPRsUsed += (Flags.getByValSize()+7)/8;
+ else
+ switch (ArgVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unexpected ValueType for argument!");
+ case MVT::i1:
+ case MVT::i32:
+ case MVT::i64:
+ if (++NumGPRsUsed <= NumGPRs)
+ continue;
+ break;
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ case MVT::v2f64:
+ case MVT::v2i64:
+ case MVT::v1i128:
+ if (++NumVRsUsed <= NumVRs)
+ continue;
+ break;
+ case MVT::v4f32:
+ // When using QPX, this is handled like a FP register, otherwise, it
+ // is an Altivec register.
+ if (Subtarget.hasQPX()) {
+ if (++NumFPRsUsed <= NumFPRs)
+ continue;
+ } else {
+ if (++NumVRsUsed <= NumVRs)
+ continue;
+ }
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ case MVT::v4f64: // QPX
+ case MVT::v4i1: // QPX
+ if (++NumFPRsUsed <= NumFPRs)
+ continue;
+ break;
+ }
+ }
+
+ /* Respect alignment of argument on the stack. */
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+
+ NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ if (Flags.isInConsecutiveRegsLast())
+ NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ }
+
+ unsigned NumBytesActuallyUsed = NumBytes;
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if it is varargs.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+ // Tail call needs the stack to be aligned.
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
+
+ int SPDiff = 0;
+
+ // Calculate by how many bytes the stack has to be adjusted in case of tail
+ // call optimization.
+ if (!IsSibCall)
+ SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+ // To protect arguments on the stack from being clobbered in a tail call,
+ // force all the loads to happen before doing any other lowering.
+ if (isTailCall)
+ Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!IsSibCall)
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+ SDValue CallSeqStart = Chain;
+
+ // Load the return address and frame pointer so they can be moved somewhere else
+ // later.
+ SDValue LROp, FPOp;
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
+
+ // Set up a copy of the stack pointer for use loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+
+ // Figure out which arguments are going to go in registers, and which in
+ // memory. Also, if this is a vararg function, floating point operations
+ // must be stored to our stack, and loaded into integer regs as well, if
+ // any integer regs are available for argument passing.
+ unsigned ArgOffset = LinkageSize;
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+ SmallVector<SDValue, 8> MemOpChains;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
+
+ // PtrOff will be used to store the current argument to the stack if a
+ // register cannot be found for it.
+ SDValue PtrOff;
+
+ // We re-align the argument offset for each argument, except when using the
+ // fast calling convention, when we need to make sure we do that only when
+ // we'll actually use a stack slot.
+ auto ComputePtrOff = [&]() {
+ /* Respect alignment of argument on the stack. */
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+ PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
+
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+ };
+
+ if (CallConv != CallingConv::Fast) {
+ ComputePtrOff();
+
+ /* Compute GPR index associated with argument offset. */
+ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx = std::min(GPR_idx, NumGPRs);
+ }
+
+ // Promote integers to 64-bit values.
+ if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
+ // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+ unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+ }
+
+ // FIXME memcpy is used way more than necessary. Correctness first.
+ // Note: "by value" is code for passing a structure by value, not
+ // basic types.
+ if (Flags.isByVal()) {
+ // Note: Size includes alignment padding, so
+ // struct x { short a; char b; }
+ // will have Size = 4. With #pragma pack(1), it will have Size = 3.
+ // These are the proper values we need for right-justifying the
+ // aggregate in a parameter register.
+ unsigned Size = Flags.getByValSize();
+
+ // An empty aggregate parameter takes up no storage and no
+ // registers.
+ if (Size == 0)
+ continue;
+
+ if (CallConv == CallingConv::Fast)
+ ComputePtrOff();
+
+ // All aggregates smaller than 8 bytes must be passed right-justified.
+ if (Size==1 || Size==2 || Size==4) {
+ EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
+ if (GPR_idx != NumGPRs) {
+ SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+ MachinePointerInfo(), VT);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+ ArgOffset += PtrByteSize;
+ continue;
+ }
+ }
+
+ if (GPR_idx == NumGPRs && Size < 8) {
+ SDValue AddPtr = PtrOff;
+ if (!isLittleEndian) {
+ SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
+ PtrOff.getValueType());
+ AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ }
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+ CallSeqStart,
+ Flags, DAG, dl);
+ ArgOffset += PtrByteSize;
+ continue;
+ }
+ // Copy entire object into memory. There are cases where gcc-generated
+ // code assumes it is there, even if it could be put entirely into
+ // registers. (This is not what the doc says.)
+
+ // FIXME: The above statement is likely due to a misunderstanding of the
+ // documents. All arguments must be copied into the parameter area BY
+ // THE CALLEE in the event that the callee takes the address of any
+ // formal argument. That has not yet been implemented. However, it is
+ // reasonable to use the stack area as a staging area for the register
+ // load.
+
+ // Skip this for small aggregates, as we will use the same slot for a
+ // right-justified copy, below.
+ if (Size >= 8)
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+ CallSeqStart,
+ Flags, DAG, dl);
+
+ // When a register is available, pass a small aggregate right-justified.
+ if (Size < 8 && GPR_idx != NumGPRs) {
+ // The easiest way to get this right-justified in a register
+ // is to copy the structure into the rightmost portion of a
+ // local variable slot, then load the whole slot into the
+ // register.
+ // FIXME: The memcpy seems to produce pretty awful code for
+ // small aggregates, particularly for packed ones.
+ // FIXME: It would be preferable to use the slot in the
+ // parameter save area instead of a new local variable.
+ SDValue AddPtr = PtrOff;
+ if (!isLittleEndian) {
+ SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());
+ AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ }
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+ CallSeqStart,
+ Flags, DAG, dl);
+
+ // Load the slot into the register.
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+ // Done with this argument.
+ ArgOffset += PtrByteSize;
+ continue;
+ }
+
+ // For aggregates larger than PtrByteSize, copy the pieces of the
+ // object that fit into registers from the parameter save area.
+ for (unsigned j=0; j<Size; j+=PtrByteSize) {
+ SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
+ SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+ if (GPR_idx != NumGPRs) {
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ ArgOffset += PtrByteSize;
+ } else {
+ ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+ break;
+ }
+ }
+ continue;
+ }
+
+ switch (Arg.getSimpleValueType().SimpleTy) {
+ default: llvm_unreachable("Unexpected ValueType for argument!");
+ case MVT::i1:
+ case MVT::i32:
+ case MVT::i64:
+ if (Flags.isNest()) {
+ // The 'nest' parameter, if any, is passed in R11.
+ RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
+ hasNest = true;
+ break;
+ }
+
+ // These can be scalar arguments or elements of an integer array type
+ // passed directly. Clang may use those instead of "byval" aggregate
+ // types to avoid forcing arguments to memory unnecessarily.
+ if (GPR_idx != NumGPRs) {
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputePtrOff();
+
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ true, isTailCall, false, MemOpChains,
+ TailCallArguments, dl);
+ if (CallConv == CallingConv::Fast)
+ ArgOffset += PtrByteSize;
+ }
+ if (CallConv != CallingConv::Fast)
+ ArgOffset += PtrByteSize;
+ break;
+ case MVT::f32:
+ case MVT::f64: {
+ // These can be scalar arguments or elements of a float array type
+ // passed directly. The latter are used to implement ELFv2 homogeneous
+ // float aggregates.
+
+ // Named arguments go into FPRs first, and once they overflow, the
+ // remaining arguments go into GPRs and then the parameter save area.
+ // Unnamed arguments for vararg functions always go to GPRs and
+ // then the parameter save area. For now, put all arguments to vararg
+ // routines always in both locations (FPR *and* GPR or stack slot).
+ bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+ bool NeededLoad = false;
+
+ // First load the argument into the next available FPR.
+ if (FPR_idx != NumFPRs)
+ RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+ // Next, load the argument into GPR or stack slot if needed.
+ if (!NeedGPROrStack)
+ ;
+ else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) {
+ // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
+ // once we support fp <-> gpr moves.
+
+ // In the non-vararg case, this can only ever happen in the
+ // presence of f32 array types, since otherwise we never run
+ // out of FPRs before running out of GPRs.
+ SDValue ArgVal;
+
+ // Double values are always passed in a single GPR.
+ if (Arg.getValueType() != MVT::f32) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+
+ // Non-array float values are extended and passed in a GPR.
+ } else if (!Flags.isInConsecutiveRegs()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+
+ // If we have an array of floats, we collect every odd element
+ // together with its predecessor into one GPR.
+ } else if (ArgOffset % PtrByteSize != 0) {
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+ Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ if (!isLittleEndian)
+ std::swap(Lo, Hi);
+ ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+ // The final element, if even, goes into the first half of a GPR.
+ } else if (Flags.isInConsecutiveRegsLast()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+ if (!isLittleEndian)
+ ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+ DAG.getConstant(32, dl, MVT::i32));
+
+ // Non-final even elements are skipped; they will be handled
+ // together with the subsequent argument on the next go-around.
+ } else
+ ArgVal = SDValue();
+
+ if (ArgVal.getNode())
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputePtrOff();
+
+ // Single-precision floating-point values are mapped to the
+ // second (rightmost) word of the stack doubleword.
+ if (Arg.getValueType() == MVT::f32 &&
+ !isLittleEndian && !Flags.isInConsecutiveRegs()) {
+ SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+ }
+
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ true, isTailCall, false, MemOpChains,
+ TailCallArguments, dl);
+
+ NeededLoad = true;
+ }
+ // When passing an array of floats, the array occupies consecutive
+ // space in the argument area; only round up to the next doubleword
+ // at the end of the array. Otherwise, each float takes 8 bytes.
+ if (CallConv != CallingConv::Fast || NeededLoad) {
+ ArgOffset += (Arg.getValueType() == MVT::f32 &&
+ Flags.isInConsecutiveRegs()) ? 4 : 8;
+ if (Flags.isInConsecutiveRegsLast())
+ ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ }
+ break;
+ }
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ case MVT::v2f64:
+ case MVT::v2i64:
+ case MVT::v1i128:
+ if (!Subtarget.hasQPX()) {
+ // These can be scalar arguments or elements of a vector array type
+ // passed directly. The latter are used to implement ELFv2 homogeneous
+ // vector aggregates.
+
+ // For a varargs call, named arguments go into VRs or on the stack as
+ // usual; unnamed arguments always go to the stack or the corresponding
+ // GPRs when within range. For now, we always put the value in both
+ // locations (or even all three).
+ if (isVarArg) {
+ // We could elide this store in the case where the object fits
+ // entirely in R registers. Maybe later.
+ SDValue Store =
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Store);
+ if (VR_idx != NumVRs) {
+ SDValue Load =
+ DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+ }
+ ArgOffset += 16;
+ for (unsigned i=0; i<16; i+=PtrByteSize) {
+ if (GPR_idx == NumGPRs)
+ break;
+ SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+ DAG.getConstant(i, dl, PtrVT));
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ }
+ break;
+ }
+
+ // Non-varargs Altivec params go into VRs or on the stack.
+ if (VR_idx != NumVRs) {
+ RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputePtrOff();
+
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ true, isTailCall, true, MemOpChains,
+ TailCallArguments, dl);
+ if (CallConv == CallingConv::Fast)
+ ArgOffset += 16;
+ }
+
+ if (CallConv != CallingConv::Fast)
+ ArgOffset += 16;
+ break;
+ } // not QPX
+
+ assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
+ "Invalid QPX parameter type");
+
+ /* fall through */
+ case MVT::v4f64:
+ case MVT::v4i1: {
+ bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
+ if (isVarArg) {
+ // We could elide this store in the case where the object fits
+ // entirely in R registers. Maybe later.
+ SDValue Store =
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Store);
+ if (QFPR_idx != NumQFPRs) {
+ SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store,
+ PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
+ }
+ ArgOffset += (IsF32 ? 16 : 32);
+ for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) {
+ if (GPR_idx == NumGPRs)
+ break;
+ SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+ DAG.getConstant(i, dl, PtrVT));
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ }
+ break;
+ }
+
+ // Non-varargs QPX params go into registers or on the stack.
+ if (QFPR_idx != NumQFPRs) {
+ RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputePtrOff();
+
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ true, isTailCall, true, MemOpChains,
+ TailCallArguments, dl);
+ if (CallConv == CallingConv::Fast)
+ ArgOffset += (IsF32 ? 16 : 32);
+ }
+
+ if (CallConv != CallingConv::Fast)
+ ArgOffset += (IsF32 ? 16 : 32);
+ break;
+ }
+ }
+ }
+
+ assert(NumBytesActuallyUsed == ArgOffset);
+ (void)NumBytesActuallyUsed;
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ // Check if this is an indirect call (MTCTR/BCTRL).
+ // See PrepareCall() for more information about calls through function
+ // pointers in the 64-bit SVR4 ABI.
+ if (!isTailCall && !isPatchPoint &&
+ !isFunctionGlobalAddress(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) {
+ // Load r2 into a virtual register and store it to the TOC save area.
+ setUsesTOCBasePtr(DAG);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+ // TOC save area offset.
+ unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+ Chain = DAG.getStore(
+ Val.getValue(1), dl, Val, AddPtr,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
+ // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
+ // This does not mean the MTCTR instruction must use R12; it's easier
+ // to model this as an extra parameter, so do that.
+ if (isELFv2ABI && !isPatchPoint)
+ RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (isTailCall && !IsSibCall)
+ PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+ TailCallArguments);
+
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
+ DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
+ SPDiff, NumBytes, Ins, InVals, CS);
+}
+
+SDValue PPCTargetLowering::LowerCall_Darwin(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint, // isPatchPoint is only forwarded to FinishCall.
+ const SmallVectorImpl<ISD::OutputArg> &Outs, // Per-argument flags and value types.
+ const SmallVectorImpl<SDValue> &OutVals, // Argument values, indexed in parallel with Outs.
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, // Ins is only forwarded to FinishCall.
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, // InVals is only forwarded to FinishCall.
+ ImmutableCallSite *CS) const { // CS is only forwarded to FinishCall.
+ // Lowers an outgoing call for Darwin: sizes the outgoing argument area, assigns each argument to registers and/or stack slots, then hands the result to FinishCall.
+ unsigned NumOps = Outs.size();
+
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ bool isPPC64 = PtrVT == MVT::i64;
+ unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Mark this function (MF) as potentially containing a call that will be
+ // lowered as a tail call. As a consequence the frame pointer will be used for
+ // dynamic allocation and for restoring the caller's stack pointer in this
+ // function's epilog. This is done because the tail-called function might
+ // overwrite the value in this function's stack pointer stack slot 0(SP).
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area, and parameter passing area. We start with 24/48 bytes, which is
+ // prereserved space for [SP][CR][LR][3 x unused].
+ unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+ unsigned NumBytes = LinkageSize;
+
+ // Add up all the space actually used.
+ // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+ // they all go in registers, but we must reserve stack space for them for
+ // possible use by the caller. In varargs or 64-bit calls, parameters are
+ // assigned stack space in order, with padding so Altivec parameters are
+ // 16-byte aligned.
+ unsigned nAltivecParamsAtEnd = 0; // Altivec args deferred to the end (32-bit, non-varargs only).
+ for (unsigned i = 0; i != NumOps; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ // Varargs Altivec parameters are padded to a 16 byte boundary.
+ if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+ ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+ ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) { // NOTE(review): v2f64/v2i64 are counted here but have no case in the switch below.
+ if (!isVarArg && !isPPC64) {
+ // Non-varargs Altivec parameters go after all the non-Altivec
+ // parameters; handle those later so we know how much padding we need.
+ nAltivecParamsAtEnd++;
+ continue;
+ }
+ // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
+ NumBytes = ((NumBytes+15)/16)*16;
+ }
+ NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ }
+
+ // Allow for Altivec parameters at the end, if needed.
+ if (nAltivecParamsAtEnd) {
+ NumBytes = ((NumBytes+15)/16)*16;
+ NumBytes += 16*nAltivecParamsAtEnd;
+ }
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if it is varargs.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+ // Tail call needs the stack to be aligned.
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
+
+ // Calculate by how many bytes the stack has to be adjusted in case of tail
+ // call optimization.
+ int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+ // To protect arguments on the stack from being clobbered in a tail call,
+ // force all the loads to happen before doing any other lowering.
+ if (isTailCall)
+ Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+ dl);
+ SDValue CallSeqStart = Chain; // Updated below whenever a by-value memcpy is emitted.
+
+ // Load the return address and frame pointer so they can be moved somewhere else
+ // later.
+ SDValue LROp, FPOp;
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
+
+ // Set up a copy of the stack pointer for use loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ SDValue StackPtr;
+ if (isPPC64)
+ StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+ else
+ StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+ // Figure out which arguments are going to go in registers, and which in
+ // memory. Also, if this is a vararg function, floating point operations
+ // must be stored to our stack, and loaded into integer regs as well, if
+ // any integer regs are available for argument passing.
+ unsigned ArgOffset = LinkageSize;
+ unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+ static const MCPhysReg GPR_32[] = { // 32-bit registers.
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+ };
+ static const MCPhysReg GPR_64[] = { // 64-bit registers.
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+ static const MCPhysReg VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+ const unsigned NumGPRs = array_lengthof(GPR_32); // GPR_32 and GPR_64 have the same length.
+ const unsigned NumFPRs = 13; // Bounds FPR_idx; the FPR table itself is defined outside this function.
+ const unsigned NumVRs = array_lengthof(VR);
+
+ const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+ SmallVector<SDValue, 8> MemOpChains;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+ // PtrOff will be used to store the current argument to the stack if a
+ // register cannot be found for it.
+ SDValue PtrOff;
+
+ PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
+
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+ // On PPC64, promote integers to 64-bit values.
+ if (isPPC64 && Arg.getValueType() == MVT::i32) { // i1 is extended later, in the switch.
+ // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+ unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+ }
+
+ // FIXME memcpy is used way more than necessary. Correctness first.
+ // Note: "by value" is code for passing a structure by value, not
+ // basic types.
+ if (Flags.isByVal()) {
+ unsigned Size = Flags.getByValSize();
+ // Very small objects are passed right-justified. Everything else is
+ // passed left-justified.
+ if (Size==1 || Size==2) {
+ EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
+ if (GPR_idx != NumGPRs) {
+ SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+ MachinePointerInfo(), VT);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+ ArgOffset += PtrByteSize;
+ } else {
+ SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
+ PtrOff.getValueType());
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); // Right-justify within the slot.
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+ CallSeqStart,
+ Flags, DAG, dl);
+ ArgOffset += PtrByteSize;
+ }
+ continue;
+ }
+ // Copy entire object into memory. There are cases where gcc-generated
+ // code assumes it is there, even if it could be put entirely into
+ // registers. (This is not what the doc says.)
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+ CallSeqStart,
+ Flags, DAG, dl);
+
+ // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
+ // copy the pieces of the object that fit into registers from the
+ // parameter save area.
+ for (unsigned j=0; j<Size; j+=PtrByteSize) {
+ SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
+ SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); // Address is Arg + j, i.e. within the source object.
+ if (GPR_idx != NumGPRs) {
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ ArgOffset += PtrByteSize;
+ } else {
+ ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; // Skip the remainder, rounded up to whole slots.
+ break;
+ }
+ }
+ continue;
+ }
+
+ switch (Arg.getSimpleValueType().SimpleTy) {
+ default: llvm_unreachable("Unexpected ValueType for argument!");
+ case MVT::i1:
+ case MVT::i32:
+ case MVT::i64:
+ if (GPR_idx != NumGPRs) {
+ if (Arg.getValueType() == MVT::i1)
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);
+
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+ } else {
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ isPPC64, isTailCall, false, MemOpChains,
+ TailCallArguments, dl);
+ }
+ ArgOffset += PtrByteSize; // A slot is consumed whether or not a register was used.
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ if (FPR_idx != NumFPRs) {
+ RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+ if (isVarArg) {
+ SDValue Store =
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Store);
+
+ // Float varargs are always shadowed in available integer registers
+ if (GPR_idx != NumGPRs) {
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ }
+ if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){ // 32-bit f64: shadow the second word as well.
+ SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ }
+ } else {
+ // If we have any FPRs remaining, we may also have GPRs remaining.
+ // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
+ // GPRs.
+ if (GPR_idx != NumGPRs)
+ ++GPR_idx;
+ if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
+ !isPPC64) // PPC64 has 64-bit GPR's obviously :)
+ ++GPR_idx;
+ }
+ } else // No FPR left: the value goes through LowerMemOpCallTo instead.
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ isPPC64, isTailCall, false, MemOpChains,
+ TailCallArguments, dl);
+ if (isPPC64)
+ ArgOffset += 8;
+ else
+ ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ if (isVarArg) {
+ // These go aligned on the stack, or in the corresponding R registers
+ // when within range. The Darwin PPC ABI doc claims they also go in
+ // V registers; in fact gcc does this only for arguments that are
+ // prototyped, not for those that match the ... We do it for all
+ // arguments, seems to work.
+ while (ArgOffset % 16 !=0) { // Pad to 16 bytes, consuming a GPR per skipped slot.
+ ArgOffset += PtrByteSize;
+ if (GPR_idx != NumGPRs)
+ GPR_idx++;
+ }
+ // We could elide this store in the case where the object fits
+ // entirely in R registers. Maybe later.
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+ DAG.getConstant(ArgOffset, dl, PtrVT));
+ SDValue Store =
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Store);
+ if (VR_idx != NumVRs) {
+ SDValue Load =
+ DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+ }
+ ArgOffset += 16;
+ for (unsigned i=0; i<16; i+=PtrByteSize) { // This i is a byte offset and shadows the outer argument index.
+ if (GPR_idx == NumGPRs)
+ break;
+ SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+ DAG.getConstant(i, dl, PtrVT));
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ }
+ break;
+ }
+
+ // Non-varargs Altivec params generally go in registers, but have
+ // stack space allocated at the end.
+ if (VR_idx != NumVRs) {
+ // Doesn't have GPR space allocated.
+ RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+ } else if (nAltivecParamsAtEnd==0) {
+ // We are emitting Altivec params in order; otherwise any store is emitted after this loop.
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ isPPC64, isTailCall, true, MemOpChains,
+ TailCallArguments, dl);
+ ArgOffset += 16;
+ }
+ break;
+ }
+ }
+ // If all Altivec parameters fit in registers, as they usually do,
+ // they get stack space following the non-Altivec parameters. We
+ // don't track this here because nobody below needs it.
+ // If there are more Altivec parameters than fit in registers emit
+ // the stores here.
+ if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
+ unsigned j = 0; // Running count of Altivec arguments seen in this pass.
+ // Offset is aligned; skip 1st 12 params which go in V registers.
+ ArgOffset = ((ArgOffset+15)/16)*16;
+ ArgOffset += 12*16; // 12 equals NumVRs (the length of VR); each slot is 16 bytes.
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = OutVals[i];
+ EVT ArgType = Outs[i].VT;
+ if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
+ ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
+ if (++j > NumVRs) {
+ SDValue PtrOff; // Left empty. NOTE(review): presumably LowerMemOpCallTo derives the address from ArgOffset for vectors - confirm.
+ // We are emitting Altivec params in order.
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ isPPC64, isTailCall, true, MemOpChains,
+ TailCallArguments, dl);
+ ArgOffset += 16;
+ }
+ }
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ // On Darwin, R12 must contain the address of an indirect callee. This does
+ // not mean the MTCTR instruction must use R12; it's easier to model this as
+ // an extra parameter, so do that.
+ if (!isTailCall &&
+ !isFunctionGlobalAddress(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee) &&
+ !isBLACompatibleAddress(Callee, DAG))
+ RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
+ PPC::R12), Callee));
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (isTailCall)
+ PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+ TailCallArguments);
+ // The value produced by FinishCall is returned to the caller unchanged.
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
+ /* unused except on PPC64 ELFv1 */ false, DAG,
+ RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+ NumBytes, Ins, InVals, CS);
+}
+
+/// Returns the verdict of CCState::CheckReturn for \p Outs under the
+/// RetCC_PPC return convention. The location list built along the way is a
+/// scratch object and is discarded.
+bool PPCTargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> ReturnLocs;
+  CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
+  const bool Verdict = ReturnState.CheckReturn(Outs, RetCC_PPC);
+  return Verdict;
+}
+/* LowerReturn - Build the PPCISD::RET_FLAG node for a function return.
+ * Return locations come from CCState::AnalyzeReturn with RetCC_PPC. Each
+ * value in OutVals is extended as its location requests and copied into its
+ * assigned register; the copies are linked through Chain and Flag. RetOps
+ * holds the chain, one register operand per returned value, the registers
+ * reported by getCalleeSavedRegsViaCopy (if any), and finally the flag when
+ * one was produced.
+ */
+SDValue
+PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const {
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+
+ SDValue Flag;
+ SmallVector<SDValue, 4> RetOps(1, Chain); // slot 0 is rewritten with the final chain below
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ SDValue Arg = OutVals[i];
+
+ // Widen the value to the location type when the location asks for it.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+ Flag = Chain.getValue(1); // result 1 of the copy feeds the next copy
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) { // the list ends at the first zero entry
+
+ if (PPC::G8RCRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (PPC::F8RCRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+ else if (PPC::CRRCRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i1));
+ else if (PPC::VRRCRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::Other));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+/// Lower GET_DYNAMIC_AREA_OFFSET to a PPCISD::DYNAREAOFFSET node. Its operands
+/// are the incoming chain (operand 0 of \p Op) and the frame index returned by
+/// getFramePointerFrameIndex; its single result has the type of \p Op.
+SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(
+    SDValue Op, SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+
+  // Gather the two operands of the new node.
+  SDValue InChain = Op.getOperand(0);
+  SDValue FPSaveIdx = getFramePointerFrameIndex(DAG);
+  SDValue NodeOps[2] = {InChain, FPSaveIdx};
+
+  // The result keeps the integer type of the node being lowered.
+  SDVTList ResultTys = DAG.getVTList(Op.getValueType());
+  return DAG.getNode(PPCISD::DYNAREAOFFSET, Loc, ResultTys, NodeOps);
+}
+
+/// Lower STACKRESTORE. Popping a dynamic allocation has to keep the SP link
+/// intact, so the word currently addressed by the stack pointer is loaded, the
+/// stack pointer register is set to the saved value (operand 1 of \p Op), and
+/// the loaded word is stored back through the stack pointer. The returned
+/// value is that final store.
+SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  EVT PtrTy = getPointerTy(DAG.getDataLayout());
+
+  // X1 on 64-bit subtargets, R1 otherwise.
+  unsigned SPReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
+  SDValue SPNode = DAG.getRegister(SPReg, PtrTy);
+
+  SDValue InChain = Op.getOperand(0);
+  SDValue SavedSP = Op.getOperand(1);
+
+  // Read the link word before the stack pointer is changed.
+  SDValue LinkWord =
+      DAG.getLoad(PtrTy, Loc, InChain, SPNode, MachinePointerInfo());
+
+  // Chain the register update after the load, then write the link word back.
+  SDValue AfterRestore =
+      DAG.getCopyToReg(LinkWord.getValue(1), Loc, SPReg, SavedSP);
+  return DAG.getStore(AfterRestore, Loc, LinkWord, SPNode,
+                      MachinePointerInfo());
+}
+
+/// Return a frame-index node for the return-address save slot. When the index
+/// recorded in PPCFunctionInfo is still zero, a fixed stack object is created
+/// for the slot and its index is recorded before being used.
+SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  int SaveIdx = FuncInfo->getReturnAddrSaveIndex();
+  if (SaveIdx == 0) {
+    // Nothing recorded yet: create a slot of 8 bytes on PPC64 (4 otherwise)
+    // at the return save offset given by the frame lowering, and remember it.
+    const unsigned SlotSize = Subtarget.isPPC64() ? 8 : 4;
+    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
+    SaveIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, LROffset, false);
+    FuncInfo->setReturnAddrSaveIndex(SaveIdx);
+  }
+  return DAG.getFrameIndex(SaveIdx, PtrVT);
+}
+
+/// Return a frame-index node for the frame-pointer save slot. When the index
+/// recorded in PPCFunctionInfo is still zero, a fixed stack object is created
+/// for the slot and its index is recorded before being used. The existing
+/// notes name DYNALLOC instructions as the main users of this index.
+SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  int SaveIdx = FuncInfo->getFramePointerSaveIndex();
+  if (SaveIdx == 0) {
+    // Nothing recorded yet: create a slot of 8 bytes on PPC64 (4 otherwise)
+    // at the frame-pointer save offset given by the frame lowering. The last
+    // argument of CreateFixedObject is true here, unlike the return-address
+    // slot.
+    const unsigned SlotSize = Subtarget.isPPC64() ? 8 : 4;
+    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
+    SaveIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, FPOffset, true);
+    FuncInfo->setFramePointerSaveIndex(SaveIdx);
+  }
+  return DAG.getFrameIndex(SaveIdx, PtrVT);
+}
+
+/// Lower DYNAMIC_STACKALLOC to a PPCISD::DYNALLOC node taking the chain, the
+/// negated allocation size, and the frame index returned by
+/// getFramePointerFrameIndex. The node yields a pointer-typed value and an
+/// output chain.
+SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  EVT PtrTy = getPointerTy(DAG.getDataLayout());
+
+  SDValue InChain = Op.getOperand(0);
+  SDValue AllocSize = Op.getOperand(1);
+
+  // DYNALLOC is handed (0 - size) rather than size.
+  SDValue Zero = DAG.getConstant(0, Loc, PtrTy);
+  SDValue NegatedSize = DAG.getNode(ISD::SUB, Loc, PtrTy, Zero, AllocSize);
+
+  SDValue FPSaveIdx = getFramePointerFrameIndex(DAG);
+  SDValue NodeOps[3] = {InChain, NegatedSize, FPSaveIdx};
+  SDVTList ResultTys = DAG.getVTList(PtrTy, MVT::Other);
+  return DAG.getNode(PPCISD::DYNALLOC, Loc, ResultTys, NodeOps);
+}
+
+/// Lower EH_DWARF_CFA to the frame index of a newly created fixed stack object
+/// at offset 0, sized 8 bytes on PPC64 and 4 bytes otherwise.
+SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const unsigned SlotSize = Subtarget.isPPC64() ? 8 : 4;
+
+  int CFAIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, 0, false);
+  EVT PtrTy = getPointerTy(DAG.getDataLayout());
+  return DAG.getFrameIndex(CFAIdx, PtrTy);
+}
+
+/// Re-emit EH_SJLJ_SETJMP as the target node PPCISD::EH_SJLJ_SETJMP, forwarding
+/// operands 0 and 1 of \p Op and producing an i32 value plus a chain.
+SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, Loc, ResultTys, Op0, Op1);
+}
+
+/// Re-emit EH_SJLJ_LONGJMP as the target node PPCISD::EH_SJLJ_LONGJMP,
+/// forwarding operands 0 and 1 of \p Op; the only result is a chain.
+SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, Loc, MVT::Other, Op0, Op1);
+}
+
+/// Custom lowering for loads. Vector results are handed to LowerVectorLoad.
+/// Otherwise the result must be i1: one byte is loaded with an any-extending
+/// load into a pointer-width integer and then truncated to i1. The returned
+/// merge node carries the i1 value and the chain of the new load.
+SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  const EVT ResultVT = Op.getValueType();
+  if (ResultVT.isVector())
+    return LowerVectorLoad(Op, DAG);
+
+  assert(ResultVT == MVT::i1 &&
+         "Custom lowering only for i1 loads");
+
+  SDLoc Loc(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+
+  // Reuse the original chain, address and memory operand; only the memory
+  // type (i8) and the result width change.
+  EVT WideVT = getPointerTy(DAG.getDataLayout());
+  SDValue WideLoad =
+      DAG.getExtLoad(ISD::EXTLOAD, Loc, WideVT, Load->getChain(),
+                     Load->getBasePtr(), MVT::i8, Load->getMemOperand());
+  SDValue Bit = DAG.getNode(ISD::TRUNCATE, Loc, MVT::i1, WideLoad);
+
+  // Result 1 of the new load is its output chain.
+  SDValue Results[] = {Bit, WideLoad.getValue(1)};
+  return DAG.getMergeValues(Results, Loc);
+}
+
+/// Custom lowering for stores. Vector values are handed to LowerVectorStore.
+/// Otherwise the stored value must be i1: it is zero-extended to a
+/// pointer-width integer and written with a truncating store of one byte,
+/// reusing the original chain, address and memory operand.
+SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  const EVT StoredVT = Op.getOperand(1).getValueType();
+  if (StoredVT.isVector())
+    return LowerVectorStore(Op, DAG);
+
+  assert(StoredVT == MVT::i1 &&
+         "Custom lowering only for i1 stores");
+
+  SDLoc Loc(Op);
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+
+  EVT WideVT = getPointerTy(DAG.getDataLayout());
+  SDValue Widened =
+      DAG.getNode(ISD::ZERO_EXTEND, Loc, WideVT, Store->getValue());
+  return DAG.getTruncStore(Store->getChain(), Loc, Widened,
+                           Store->getBasePtr(), MVT::i8,
+                           Store->getMemOperand());
+}
+
+// FIXME: Remove this once the ANDI glue bug is fixed:
+/// Custom lowering for truncation to i1: the source operand is wrapped in a
+/// PPCISD::ANDIo_1_GT_BIT node with an i1 result.
+SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i1 &&
+         "Custom lowering only for i1 results");
+
+  SDLoc Loc(Op);
+  SDValue Wide = Op.getOperand(0);
+  return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, Loc, MVT::i1, Wide);
+}
+
+/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
+/// possible. Returns Op unchanged whenever no fsel form is produced.
+SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+ // Not FP? Not a fsel.
+ if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
+ !Op.getOperand(2).getValueType().isFloatingPoint())
+ return Op;
+
+ // We might be able to do better than this under some circumstances, but in
+ // general, fsel-based lowering of select is a finite-math-only optimization.
+ // For more information, see section F.3 of the 2.06 ISA specification.
+ if (!DAG.getTarget().Options.NoInfsFPMath ||
+ !DAG.getTarget().Options.NoNaNsFPMath)
+ return Op;
+ // TODO: Propagate flags from the select rather than global settings.
+ SDNodeFlags Flags; // attached to the FSUB nodes built below
+ Flags.setNoInfs(true);
+ Flags.setNoNaNs(true);
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+ EVT ResVT = Op.getValueType();
+ EVT CmpVT = Op.getOperand(0).getValueType();
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+ SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
+ SDLoc dl(Op);
+
+ // If the RHS of the comparison is a 0.0, we don't need to do the
+ // subtraction at all.
+ SDValue Sel1;
+ if (isFloatingPointZero(RHS))
+ switch (CC) {
+ default: break; // SETUO etc aren't handled by fsel.
+ case ISD::SETNE:
+ std::swap(TV, FV); // setne is seteq with the arms exchanged; falls through
+ case ISD::SETEQ:
+ if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+ Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
+ if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+ DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
+ case ISD::SETULT:
+ case ISD::SETLT:
+ std::swap(TV, FV); // fsel is natively setge, swap operands for setlt; falls through
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ std::swap(TV, FV); // fsel on the negated LHS is setle, swap operands for setgt; falls through
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+ DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
+ }
+
+ SDValue Cmp; // LHS-RHS or RHS-LHS, extended to f64 when the compare type is f32
+ switch (CC) {
+ default: break; // SETUO etc aren't handled by fsel.
+ case ISD::SETNE:
+ std::swap(TV, FV); // setne is seteq with the arms exchanged; falls through
+ case ISD::SETEQ:
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
+ if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+ Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+ if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+ DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
+ case ISD::SETULT:
+ case ISD::SETLT:
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
+ if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
+ if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags);
+ if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags);
+ if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+ }
+ return Op;
+}
+
+/// Convert the floating-point operand of \p Op to an integer held in a
+/// floating-point register, spill it to a fresh stack temporary, and record in
+/// \p RLI what a later load needs in order to fetch the integer result.
+///
+/// \param Op  FP_TO_SINT or FP_TO_UINT node with an i32 or i64 result.
+/// \param RLI Receives Chain, Ptr and MPI; no other field is written here.
+/// \param DAG DAG in which the new nodes are created.
+/// \param dl  Debug location used for the new nodes.
+void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
+                                               SelectionDAG &DAG,
+                                               const SDLoc &dl) const {
+  assert(Op.getOperand(0).getValueType().isFloatingPoint());
+  SDValue Src = Op.getOperand(0);
+  if (Src.getValueType() == MVT::f32)
+    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+
+  SDValue Tmp;
+  switch (Op.getSimpleValueType().SimpleTy) {
+  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
+  case MVT::i32:
+    // Unsigned i32 without FPCVT falls back to the doubleword conversion.
+    Tmp = DAG.getNode(
+        Op.getOpcode() == ISD::FP_TO_SINT
+            ? PPCISD::FCTIWZ
+            : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
+        dl, MVT::f64, Src);
+    break;
+  case MVT::i64:
+    assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
+           "i64 FP_TO_UINT is supported only with FPCVT");
+    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ,
+                      dl, MVT::f64, Src);
+    break;
+  }
+
+  // Convert the FP value to an int value through memory. A 4-byte slot is
+  // used only when the word can be stored directly with STFIWX; otherwise the
+  // whole doubleword is stored.
+  bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
+    (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
+  SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
+  int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
+  MachinePointerInfo MPI =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+  // Emit a store to the stack slot.
+  SDValue Chain;
+  if (i32Stack) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
+    SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
+    Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+              DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
+  } else
+    Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI);
+
+  // Result is a load from the stack slot. When a 4-byte result is read out of
+  // a doubleword slot, the wanted low word sits at byte offset 4 on big
+  // endian and at offset 0 on little endian. The address and the pointer info
+  // must use the same offset, so the bias is applied to both or to neither.
+  if (Op.getValueType() == MVT::i32 && !i32Stack) {
+    const unsigned LowWordOffset = Subtarget.isLittleEndian() ? 0 : 4;
+    if (LowWordOffset != 0)
+      FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
+                          DAG.getConstant(LowWordOffset, dl,
+                                          FIPtr.getValueType()));
+    MPI = MPI.getWithOffset(LowWordOffset);
+  }
+
+  RLI.Chain = Chain;
+  RLI.Ptr = FIPtr;
+  RLI.MPI = MPI;
+}
+
+/// \brief Lower FP_TO_SINT / FP_TO_UINT without a store/load pair: the value
+/// is converted in a floating-point register and the result is moved out with
+/// a PPCISD::MFVSR node (the direct move instructions of ISA 2.07).
+SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
+                                                    SelectionDAG &DAG,
+                                                    const SDLoc &dl) const {
+  assert(Op.getOperand(0).getValueType().isFloatingPoint());
+  SDValue Input = Op.getOperand(0);
+
+  // The conversion nodes are built on an f64 input, so widen f32 first.
+  if (Input.getValueType() == MVT::f32)
+    Input = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Input);
+
+  const MVT ResultVT = Op.getSimpleValueType();
+  const bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
+
+  // Pick the conversion opcode for the result width and signedness.
+  unsigned ConvOpc;
+  switch (ResultVT.SimpleTy) {
+  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
+  case MVT::i32:
+    if (IsSigned)
+      ConvOpc = PPCISD::FCTIWZ;
+    else if (Subtarget.hasFPCVT())
+      ConvOpc = PPCISD::FCTIWUZ;
+    else
+      ConvOpc = PPCISD::FCTIDZ;
+    break;
+  case MVT::i64:
+    assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
+           "i64 FP_TO_UINT is supported only with FPCVT");
+    ConvOpc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;
+    break;
+  }
+
+  // Convert, then move the register contents out as the integer result type.
+  SDValue Converted = DAG.getNode(ConvOpc, dl, MVT::f64, Input);
+  return DAG.getNode(PPCISD::MFVSR, dl, ResultVT, Converted);
+}
+
+/// Lower FP_TO_SINT / FP_TO_UINT. With direct moves on a 64-bit subtarget the
+/// work is delegated to LowerFP_TO_INTDirectMove. Otherwise the value goes
+/// through a stack slot prepared by LowerFP_TO_INTForReuse and the result is
+/// the load from that slot.
+SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                                          const SDLoc &dl) const {
+  const bool UseDirectMove = Subtarget.hasDirectMove() && Subtarget.isPPC64();
+  if (UseDirectMove)
+    return LowerFP_TO_INTDirectMove(Op, DAG, dl);
+
+  // Store the converted value, then read it back using the recorded info.
+  ReuseLoadInfo Slot;
+  LowerFP_TO_INTForReuse(Op, Slot, DAG, dl);
+
+  return DAG.getLoad(Op.getValueType(), dl, Slot.Chain, Slot.Ptr, Slot.MPI,
+                     Slot.Alignment, Slot.MMOFlags(), Slot.AAInfo,
+                     Slot.Ranges);
+}
+
+// We're trying to insert a regular store, S, and then a load, L. If the
+// incoming value, O, is a load, we might just be able to have our load use the
+// address used by O. However, we don't know if anything else will store to
+// that address before we can load from it. To prevent this situation, we need
+// to insert our load, L, into the chain as a peer of O. To do this, we give L
+// the same chain operand as O, we create a token factor from the chain results
+// of O and L, and we replace all uses of O's chain result with that token
+// factor (see spliceIntoChain below for this last part).
+bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
+ ReuseLoadInfo &RLI,
+ SelectionDAG &DAG,
+ ISD::LoadExtType ET) const {
+ SDLoc dl(Op);
+ if (ET == ISD::NON_EXTLOAD &&
+ (Op.getOpcode() == ISD::FP_TO_UINT ||
+ Op.getOpcode() == ISD::FP_TO_SINT) &&
+ isOperationLegalOrCustom(Op.getOpcode(),
+ Op.getOperand(0).getValueType())) {
+
+ LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); // fills RLI.Chain, RLI.Ptr and RLI.MPI
+ return true; // RLI.ResChain is not assigned on this path
+ }
+
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
+ if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
+ LD->isNonTemporal())
+ return false; // not a load, wrong extension kind, or volatile/non-temporal
+ if (LD->getMemoryVT() != MemVT)
+ return false; // the load's memory type must match the requested one
+
+ RLI.Ptr = LD->getBasePtr();
+ if (LD->isIndexed() && !LD->getOffset().isUndef()) {
+ assert(LD->getAddressingMode() == ISD::PRE_INC &&
+ "Non-pre-inc AM on PPC?");
+ RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
+ LD->getOffset()); // pre-increment: the accessed address is base + offset
+ }
+
+ RLI.Chain = LD->getChain();
+ RLI.MPI = LD->getPointerInfo();
+ RLI.IsDereferenceable = LD->isDereferenceable();
+ RLI.IsInvariant = LD->isInvariant();
+ RLI.Alignment = LD->getAlignment();
+ RLI.AAInfo = LD->getAAInfo();
+ RLI.Ranges = LD->getRanges();
+
+ RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1); // chain is result 2 of an indexed load, result 1 otherwise
+ return true;
+}
+
+// Given the head of the old chain, ResChain, insert a token factor containing
+// it and NewResChain, and make users of ResChain now be users of that token
+// factor. Does nothing when ResChain is null.
+void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
+ SDValue NewResChain,
+ SelectionDAG &DAG) const {
+ if (!ResChain)
+ return;
+
+ SDLoc dl(NewResChain);
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ NewResChain, DAG.getUNDEF(MVT::Other)); // built with an UNDEF operand; ResChain is installed by UpdateNodeOperands below
+ assert(TF.getNode() != NewResChain.getNode() &&
+ "A new TF really is required here");
+
+ DAG.ReplaceAllUsesOfValueWith(ResChain, TF); // TF does not use ResChain yet, so it is not rewritten to use itself
+ DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain); // now give TF its real operands
+}
+
+/// \brief Decide whether a direct move is worthwhile for the int-to-FP
+/// conversion \p Op. When the converted integer comes from a load whose value
+/// is used only by int-to-FP conversions, a floating-point load is preferred
+/// over an integer load plus a direct move, and this returns false.
+bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
+  SDNode *Producer = Op.getOperand(0).getNode();
+
+  // Anything other than a load has no cheaper alternative to consider.
+  if (Producer->getOpcode() != ISD::LOAD)
+    return true;
+
+  // If there is no LXSIBZX/LXSIHZX, like Power8,
+  // prefer direct move if the memory size is 1 or 2 bytes.
+  MachineMemOperand *MMO = cast<LoadSDNode>(Producer)->getMemOperand();
+  const bool HasNarrowFPLoads = Subtarget.hasP9Vector();
+  if (!HasNarrowFPLoads && MMO->getSize() <= 2)
+    return true;
+
+  // Any user of the loaded value that is not an int-to-FP conversion means
+  // the integer load is needed anyway.
+  for (SDNode::use_iterator U = Producer->use_begin(),
+                            End = Producer->use_end();
+       U != End; ++U) {
+    // Result 0 is the loaded value; skip users of the other results.
+    const bool UsesLoadedValue = U.getUse().get().getResNo() == 0;
+    if (!UsesLoadedValue)
+      continue;
+
+    const unsigned UserOpc = U->getOpcode();
+    const bool IsIntToFP =
+        UserOpc == ISD::SINT_TO_FP || UserOpc == ISD::UINT_TO_FP;
+    if (!IsIntToFP)
+      return true;
+  }
+
+  return false;
+}
+
+/// \brief Lower SINT_TO_FP / UINT_TO_FP without a store/load pair: the integer
+/// is moved into a floating-point register with MTVSRA or MTVSRZ (the direct
+/// move instructions of ISA 2.07) and then converted to f32 or f64 there.
+SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
+                                                    SelectionDAG &DAG,
+                                                    const SDLoc &dl) const {
+  assert((Op.getValueType() == MVT::f32 ||
+          Op.getValueType() == MVT::f64) &&
+         "Invalid floating point type as target of conversion");
+  assert(Subtarget.hasFPCVT() &&
+         "Int to FP conversions with direct moves require FPCVT");
+
+  SDValue Src = Op.getOperand(0);
+  const bool SinglePrec = Op.getValueType() == MVT::f32;
+  const bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
+  const bool Signed = Op.getOpcode() == ISD::SINT_TO_FP;
+
+  // Conversion opcode: signedness picks the family, precision the variant.
+  unsigned ConvOp;
+  if (Signed)
+    ConvOp = SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID;
+  else
+    ConvOp = SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU;
+
+  // Only an unsigned 32-bit source uses MTVSRZ; every other case uses MTVSRA.
+  unsigned MoveOp = PPCISD::MTVSRA;
+  if (WordInt && !Signed)
+    MoveOp = PPCISD::MTVSRZ;
+
+  SDValue Moved = DAG.getNode(MoveOp, dl, MVT::f64, Src);
+  return DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, Moved);
+}
+/* LowerINT_TO_FP - Custom lowering for SINT_TO_FP / UINT_TO_FP.
+ * Cases are tried in this order:
+ *  - QPX v4i1 source: QBFLT followed by an FMA with 0.5, rounded if needed.
+ *  - Results other than f32/f64: an empty SDValue is returned.
+ *  - i1 source: a SELECT between the constants 1.0 and 0.0.
+ *  - Direct moves when available and profitable: LowerINT_TO_FPDirectMove.
+ *  - i64 source: the bits are brought into an FPR (reusing an existing load
+ *    address where canReuseLoadAddress allows it) and converted.
+ *  - i32 source: the value goes through a stack slot and is converted.
+ * When FPCVT is absent and the result is f32, the conversion is done in f64
+ * and followed by an FP_ROUND.
+ */
+SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+
+ if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
+ if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
+ return SDValue();
+
+ SDValue Value = Op.getOperand(0);
+ // The values are now known to be -1 (false) or 1 (true). To convert this
+ // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+ // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+ Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+
+ SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
+
+ Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+
+ if (Op.getValueType() != MVT::v4f64)
+ Value = DAG.getNode(ISD::FP_ROUND, dl,
+ Op.getValueType(), Value,
+ DAG.getIntPtrConstant(1, dl));
+ return Value;
+ }
+
+ // Don't handle ppc_fp128 here; let it be lowered to a libcall.
+ if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+ return SDValue();
+
+ if (Op.getOperand(0).getValueType() == MVT::i1)
+ return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
+ DAG.getConstantFP(1.0, dl, Op.getValueType()),
+ DAG.getConstantFP(0.0, dl, Op.getValueType()));
+
+ // If we have direct moves, we can do all the conversion, skip the store/load
+ // however, without FPCVT we can't do most conversions.
+ if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
+ Subtarget.isPPC64() && Subtarget.hasFPCVT())
+ return LowerINT_TO_FPDirectMove(Op, DAG, dl);
+
+ assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+ "UINT_TO_FP is supported only with FPCVT");
+
+ // If we have FCFIDS, then use it when converting to single-precision.
+ // Otherwise, convert to double-precision and then round.
+ unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+ ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
+ : PPCISD::FCFIDS)
+ : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
+ : PPCISD::FCFID);
+ MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+ ? MVT::f32
+ : MVT::f64;
+
+ if (Op.getOperand(0).getValueType() == MVT::i64) {
+ SDValue SINT = Op.getOperand(0);
+ // When converting to single-precision, we actually need to convert
+ // to double-precision first and then round to single-precision.
+ // To avoid double-rounding effects during that operation, we have
+ // to prepare the input operand. Bits that might be truncated when
+ // converting to double-precision are replaced by a bit that won't
+ // be lost at this stage, but is below the single-precision rounding
+ // position.
+ //
+ // However, if -enable-unsafe-fp-math is in effect, accept double
+ // rounding to avoid the extra overhead.
+ if (Op.getValueType() == MVT::f32 &&
+ !Subtarget.hasFPCVT() &&
+ !DAG.getTarget().Options.UnsafeFPMath) {
+
+ // Twiddle input to make sure the low 11 bits are zero. (If this
+ // is the case, we are guaranteed the value will fit into the 53 bit
+ // mantissa of an IEEE double-precision value without rounding.)
+ // If any of those low 11 bits were not zero originally, make sure
+ // bit 12 (value 2048) is set instead, so that the final rounding
+ // to single-precision gets the correct result.
+ SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+ SINT, DAG.getConstant(2047, dl, MVT::i64));
+ Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
+ Round, DAG.getConstant(2047, dl, MVT::i64));
+ Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
+ Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+ Round, DAG.getConstant(-2048, dl, MVT::i64));
+
+ // However, we cannot use that value unconditionally: if the magnitude
+ // of the input value is small, the bit-twiddling we did above might
+ // end up visibly changing the output. Fortunately, in that case, we
+ // don't need to twiddle bits since the original input will convert
+ // exactly to double-precision floating-point already. Therefore,
+ // construct a conditional to use the original value if the top 11
+ // bits are all sign-bit copies, and use the rounded value computed
+ // above otherwise.
+ SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
+ SINT, DAG.getConstant(53, dl, MVT::i32));
+ Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
+ Cond, DAG.getConstant(1, dl, MVT::i64));
+ Cond = DAG.getSetCC(dl, MVT::i32,
+ Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);
+
+ SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
+ }
+
+ ReuseLoadInfo RLI;
+ SDValue Bits;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
+ Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,
+ RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
+ spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+ } else if (Subtarget.hasLFIWAX() &&
+ canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+ RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+ SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+ Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
+ DAG.getVTList(MVT::f64, MVT::Other),
+ Ops, MVT::i32, MMO);
+ spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+ } else if (Subtarget.hasFPCVT() &&
+ canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+ RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+ SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+ Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
+ DAG.getVTList(MVT::f64, MVT::Other),
+ Ops, MVT::i32, MMO);
+ spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+ } else if (((Subtarget.hasLFIWAX() &&
+ SINT.getOpcode() == ISD::SIGN_EXTEND) ||
+ (Subtarget.hasFPCVT() &&
+ SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
+ SINT.getOperand(0).getValueType() == MVT::i32) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ int FrameIdx = MFI.CreateStackObject(4, 4, false);
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FrameIdx));
+
+ assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+ "Expected an i32 store");
+
+ RLI.Ptr = FIdx;
+ RLI.Chain = Store;
+ RLI.MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+ RLI.Alignment = 4;
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+ RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+ SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+ Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
+ PPCISD::LFIWZX : PPCISD::LFIWAX,
+ dl, DAG.getVTList(MVT::f64, MVT::Other),
+ Ops, MVT::i32, MMO);
+ } else
+ Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+
+ SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
+
+ if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
+ FP = DAG.getNode(ISD::FP_ROUND, dl,
+ MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
+ return FP;
+ }
+
+ assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+ "Unhandled INT_TO_FP type in custom expander!");
+ // Since we only generate this in 64-bit mode, we can take advantage of
+ // 64-bit registers. In particular, sign extend the input value into the
+ // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
+ // then lfd it and fcfid it.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+ SDValue Ld;
+ if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
+ ReuseLoadInfo RLI;
+ bool ReusingLoad;
+ if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
+ DAG))) {
+ int FrameIdx = MFI.CreateStackObject(4, 4, false);
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FrameIdx));
+
+ assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+ "Expected an i32 store");
+
+ RLI.Ptr = FIdx;
+ RLI.Chain = Store;
+ RLI.MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+ RLI.Alignment = 4;
+ }
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+ RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+ SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+ Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
+ PPCISD::LFIWZX : PPCISD::LFIWAX,
+ dl, DAG.getVTList(MVT::f64, MVT::Other),
+ Ops, MVT::i32, MMO);
+ if (ReusingLoad)
+ spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
+ } else {
+ assert(Subtarget.isPPC64() &&
+ "i32->FP without LFIWAX supported only on PPC64");
+
+ int FrameIdx = MFI.CreateStackObject(8, 8, false);
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
+ Op.getOperand(0));
+
+ // STD the extended value into the stack slot.
+ SDValue Store = DAG.getStore(
+ DAG.getEntryNode(), dl, Ext64, FIdx,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
+
+ // Load the value as a double.
+ Ld = DAG.getLoad(
+ MVT::f64, dl, Store, FIdx,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
+ }
+
+ // FCFID it and return it.
+ SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
+ if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
+ FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
+ DAG.getIntPtrConstant(0, dl));
+ return FP;
+}
+
+/// Lower FLT_ROUNDS_: read the FPSCR with an MFFS node, spill the resulting
+/// f64 to an 8-byte stack slot, reload the 32-bit word holding the low half of
+/// that image, and remap its two rounding-mode bits to the FLT_ROUNDS
+/// encoding. The i32 result is then resized to the type of \p Op.
+SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  /*
+   The rounding mode is in bits 30:31 of FPSCR, and has the following
+   settings:
+     00 Round to nearest
+     01 Round to 0
+     10 Round to +inf
+     11 Round to -inf
+
+  FLT_ROUNDS, on the other hand, expects the following:
+    -1 Undefined
+     0 Round to 0
+     1 Round to nearest
+     2 Round to +inf
+     3 Round to -inf
+
+  To perform the conversion, we do:
+    ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
+  */
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT VT = Op.getValueType();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+  // Save FP Control Word to register
+  EVT NodeTys[] = {
+    MVT::f64,    // return register
+    MVT::Glue    // unused in this context
+  };
+  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);
+
+  // Save FP register to stack slot
+  int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot,
+                               MachinePointerInfo());
+
+  // Load FP Control Word from low 32 bits of stack slot. The low word of the
+  // stored doubleword is at byte offset 4 on big-endian subtargets and at
+  // offset 0 on little-endian ones, so the address is biased only for big
+  // endian.
+  SDValue Addr = StackSlot;
+  if (!Subtarget.isLittleEndian()) {
+    SDValue Four = DAG.getConstant(4, dl, PtrVT);
+    Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
+  }
+  SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo());
+
+  // Transform as necessary
+  SDValue CWD1 =
+    DAG.getNode(ISD::AND, dl, MVT::i32,
+                CWD, DAG.getConstant(3, dl, MVT::i32));
+  SDValue CWD2 =
+    DAG.getNode(ISD::SRL, dl, MVT::i32,
+                DAG.getNode(ISD::AND, dl, MVT::i32,
+                            DAG.getNode(ISD::XOR, dl, MVT::i32,
+                                        CWD, DAG.getConstant(3, dl, MVT::i32)),
+                            DAG.getConstant(3, dl, MVT::i32)),
+                DAG.getConstant(1, dl, MVT::i32));
+
+  SDValue RetVal =
+    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
+
+  // RetVal is an i32: truncate for any narrower result type, and zero-extend
+  // (a no-op for i32 itself) otherwise.
+  return DAG.getNode((VT.getSizeInBits() < 32 ?
+                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SHL!");
+
+  // Operands are (low half, high half, shift amount).
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  // The expansion is purely logical ops and relies on the PPC behavior for
+  // oversized shift amounts:
+  //   OutHi = (Hi << Amt) | (Lo >> (BitWidth - Amt)) | (Lo << (Amt - BitWidth))
+  //   OutLo = Lo << Amt
+  SDValue WidthMinusAmt =
+    DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt);
+  SDValue HiShifted = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
+  SDValue LoCarry = DAG.getNode(PPCISD::SRL, dl, VT, Lo, WidthMinusAmt);
+  SDValue HiWithCarry = DAG.getNode(ISD::OR , dl, VT, HiShifted, LoCarry);
+  SDValue AmtMinusWidth =
+    DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT));
+  SDValue LoOverflow = DAG.getNode(PPCISD::SHL, dl, VT, Lo, AmtMinusWidth);
+
+  SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, HiWithCarry, LoOverflow);
+  SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRL!");
+
+  // Operands are (low half, high half, shift amount).
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  // The expansion is purely logical ops and relies on the PPC behavior for
+  // oversized shift amounts:
+  //   OutLo = (Lo >> Amt) | (Hi << (BitWidth - Amt)) | (Hi >> (Amt - BitWidth))
+  //   OutHi = Hi >> Amt
+  SDValue WidthMinusAmt =
+    DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt);
+  SDValue LoShifted = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue HiCarry = DAG.getNode(PPCISD::SHL, dl, VT, Hi, WidthMinusAmt);
+  SDValue LoWithCarry = DAG.getNode(ISD::OR, dl, VT, LoShifted, HiCarry);
+  SDValue AmtMinusWidth =
+    DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT));
+  SDValue HiOverflow = DAG.getNode(PPCISD::SRL, dl, VT, Hi, AmtMinusWidth);
+
+  SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, LoWithCarry, HiOverflow);
+  SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRA!");
+
+  // Operands are (low half, high half, shift amount).
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  // Logical ops build both candidates for the low half; a select_cc on
+  // (Amt - BitWidth) <= 0 then picks between them:
+  //   SmallShiftLo = (Lo >> Amt) | (Hi << (BitWidth - Amt))
+  //   LargeShiftLo = Hi >>s (Amt - BitWidth)
+  SDValue WidthMinusAmt =
+    DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt);
+  SDValue LoShifted = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue HiCarry = DAG.getNode(PPCISD::SHL, dl, VT, Hi, WidthMinusAmt);
+  SDValue SmallShiftLo = DAG.getNode(ISD::OR, dl, VT, LoShifted, HiCarry);
+  SDValue AmtMinusWidth =
+    DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT));
+  SDValue LargeShiftLo = DAG.getNode(PPCISD::SRA, dl, VT, Hi, AmtMinusWidth);
+
+  SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
+  SDValue OutLo =
+    DAG.getSelectCC(dl, AmtMinusWidth, DAG.getConstant(0, dl, AmtVT),
+                    SmallShiftLo, LargeShiftLo, ISD::SETLE);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, dl);
+}
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering.
+//
+
+/// BuildSplatI - Build a constant splat of Val whose elements are SplatSize
+/// bytes wide, using the canonical vector type for that size, and bitcast the
+/// result to VT. Passing MVT::Other as VT selects the canonical type for
+/// SplatSize as the result type.
+static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
+                           SelectionDAG &DAG, const SDLoc &dl) {
+  assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
+
+  // Canonical VT for each element size, indexed by (size in bytes - 1).
+  static const MVT VTys[] = {
+    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
+  };
+
+  // The result type is decided from the size the caller asked for, before
+  // any canonicalization of the splat size below.
+  EVT ReqVT = VT;
+  if (ReqVT == MVT::Other)
+    ReqVT = VTys[SplatSize-1];
+
+  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
+  unsigned CanonicalSize = Val == -1 ? 1 : SplatSize;
+  EVT CanonicalVT = VTys[CanonicalSize-1];
+
+  // Build the canonical splat and cast it to the requested type.
+  SDValue Splat = DAG.getConstant(Val, dl, CanonicalVT);
+  return DAG.getBitcast(ReqVT, Splat);
+}
+
+/// BuildIntrinsicOp - Build a chainless intrinsic node for intrinsic IID
+/// applied to the single operand Op. When DestVT is MVT::Other the result
+/// takes the operand's type.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
+                                const SDLoc &dl, EVT DestVT = MVT::Other) {
+  EVT ResultVT = DestVT == MVT::Other ? Op.getValueType() : DestVT;
+  SDValue IntrinsicID = DAG.getConstant(IID, dl, MVT::i32);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResultVT, IntrinsicID, Op);
+}
+
+/// BuildIntrinsicOp - Build a chainless intrinsic node for intrinsic IID
+/// applied to the two operands LHS and RHS. When DestVT is MVT::Other the
+/// result takes the type of LHS.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
+                                SelectionDAG &DAG, const SDLoc &dl,
+                                EVT DestVT = MVT::Other) {
+  EVT ResultVT = DestVT == MVT::Other ? LHS.getValueType() : DestVT;
+  SDValue IntrinsicID = DAG.getConstant(IID, dl, MVT::i32);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResultVT, IntrinsicID,
+                     LHS, RHS);
+}
+
+/// BuildIntrinsicOp - Build a chainless intrinsic node for intrinsic IID
+/// applied to the three operands Op0, Op1 and Op2. When DestVT is MVT::Other
+/// the result takes the type of Op0.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
+                                SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
+                                EVT DestVT = MVT::Other) {
+  EVT ResultVT = DestVT == MVT::Other ? Op0.getValueType() : DestVT;
+  SDValue IntrinsicID = DAG.getConstant(IID, dl, MVT::i32);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResultVT, IntrinsicID,
+                     Op0, Op1, Op2);
+}
+
+/// BuildVSLDOI - Build the byte VECTOR_SHUFFLE that corresponds to a vsldoi
+/// by Amt: result byte i is byte (i + Amt) of the LHS:RHS concatenation. The
+/// shuffle is done in v16i8 and the result is bitcast to VT.
+static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
+                           SelectionDAG &DAG, const SDLoc &dl) {
+  // View both inputs as byte vectors.
+  SDValue LHSBytes = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
+  SDValue RHSBytes = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
+
+  int ByteMask[16];
+  for (unsigned Byte = 0; Byte != 16; ++Byte)
+    ByteMask[Byte] = Byte + Amt;
+
+  SDValue Shuffled =
+    DAG.getVectorShuffle(MVT::v16i8, dl, LHSBytes, RHSBytes, ByteMask);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffled);
+}
+
+/// Decide whether a BUILD_VECTOR should be kept as-is because a .td pattern
+/// selects it efficiently.
+///
+/// \param V - pointer to the BuildVectorSDNode being matched
+/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
+///
+/// Keeping the BUILD_VECTOR is the default answer. Expansion is preferred
+/// (and this returns false) when:
+///   - the vector type is not v2f64/v4f32, nor v2i64/v4i32 with direct moves
+///   - the node is built out of constants
+///   - any operand is undef
+///   - the node is a "load-and-splat"
+static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
+                                            bool HasDirectMove) {
+  EVT VecVT = V->getValueType(0);
+  bool IsFPVec = VecVT == MVT::v2f64 || VecVT == MVT::v4f32;
+  bool IsIntVec = VecVT == MVT::v2i64 || VecVT == MVT::v4i32;
+  if (!IsFPVec && !(HasDirectMove && IsIntVec))
+    return false;
+
+  SDValue FirstElt = V->getOperand(0);
+
+  // The caller has already established that the node is not a constant
+  // splat, so a constant BUILD_VECTOR here is built from differing constants.
+  if (V->isConstant())
+    return false;
+
+  // A load, or a load seen through a floating point truncation or a
+  // conversion to int, counts as a load for load-and-splat purposes.
+  auto IsLoadLike = [](SDValue Elt) {
+    unsigned Opc = Elt.getOpcode();
+    if (Opc == ISD::LOAD)
+      return true;
+    if (Opc != ISD::FP_ROUND && Opc != ISD::FP_TO_SINT &&
+        Opc != ISD::FP_TO_UINT)
+      return false;
+    return Elt.getOperand(0).getOpcode() == ISD::LOAD;
+  };
+
+  bool SawLoad = false;
+  bool AllSame = true;
+  for (unsigned Idx = 0, NumElts = V->getNumOperands(); Idx != NumElts; ++Idx) {
+    SDValue Elt = V->getOperand(Idx);
+    if (Elt.isUndef())
+      return false;
+
+    if (IsLoadLike(Elt))
+      SawLoad = true;
+
+    // It stays a splat only while every operand equals the first one and,
+    // unless a load has been seen, this node is the operand's only user.
+    bool StillSplat =
+      Elt == FirstElt && (SawLoad || V->isOnlyUserOf(Elt.getNode()));
+    if (!StillSplat)
+      AllSame = false;
+  }
+
+  // Everything except a load-and-splat is kept as a BUILD_VECTOR.
+  return !AllSame || !SawLoad;
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.  If we CAN select this case, and if it
+// selects to a single instruction, return Op.  Otherwise, if we can codegen
+// this case more efficiently than a constant pool load, lower it to the
+// sequence of ops that should be used.
+//
+// The order of attempts is: the QPX v4i1 special case, non-splat vectors
+// (kept for VSX patterns or expanded), then constant splats via
+// progressively longer sequences built around vsplti.
+SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+
+  if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
+    // We first build an i32 vector, load it into a QPX register,
+    // then convert it to a floating-point vector and compare it
+    // to a zero vector to get the boolean result.
+    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+    int FrameIdx = MFI.CreateStackObject(16, 16, false);
+    MachinePointerInfo PtrInfo =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+    assert(BVN->getNumOperands() == 4 &&
+      "BUILD_VECTOR for v4i1 does not have 4 operands");
+
+    // The vector is "constant" when every non-undef operand is a constant.
+    bool IsConst = true;
+    for (unsigned i = 0; i < 4; ++i) {
+      if (BVN->getOperand(i).isUndef()) continue;
+      if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
+        IsConst = false;
+        break;
+      }
+    }
+
+    if (IsConst) {
+      // Materialize the booleans as a float constant-pool vector: zero
+      // operands become -1.0, non-zero operands become 1.0.
+      Constant *One =
+        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
+      Constant *NegOne =
+        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);
+
+      Constant *CV[4];
+      for (unsigned i = 0; i < 4; ++i) {
+        if (BVN->getOperand(i).isUndef())
+          CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
+        else if (isNullConstant(BVN->getOperand(i)))
+          CV[i] = NegOne;
+        else
+          CV[i] = One;
+      }
+
+      Constant *CP = ConstantVector::get(CV);
+      SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
+                                          16 /* alignment */);
+
+      SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
+      SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
+      return DAG.getMemIntrinsicNode(
+          PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    }
+
+    // Non-constant case: store each defined operand as an i32 into its word
+    // of the 16-byte stack slot.
+    SmallVector<SDValue, 4> Stores;
+    for (unsigned i = 0; i < 4; ++i) {
+      if (BVN->getOperand(i).isUndef()) continue;
+
+      unsigned Offset = 4*i;
+      SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
+      Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+      unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
+      if (StoreSize > 4) {
+        Stores.push_back(
+            DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
+                              PtrInfo.getWithOffset(Offset), MVT::i32));
+      } else {
+        SDValue StoreValue = BVN->getOperand(i);
+        if (StoreSize < 4)
+          StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);
+
+        Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
+                                      PtrInfo.getWithOffset(Offset)));
+      }
+    }
+
+    SDValue StoreChain;
+    if (!Stores.empty())
+      StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+    else
+      StoreChain = DAG.getEntryNode();
+
+    // Now load from v4i32 into the QPX register; this will extend it to
+    // v4i64 but not yet convert it to a floating point. Nevertheless, this
+    // is typed as v4f64 because the QPX register integer states are not
+    // explicitly represented.
+
+    SDValue Ops[] = {StoreChain,
+                     DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
+                     FIdx};
+    SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});
+
+    SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
+      dl, VTs, Ops, MVT::v4i32, PtrInfo);
+    LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
+      DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
+      LoadedVect);
+
+    SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);
+
+    // NOTE(review): SETEQ against zero is true for lanes whose converted
+    // input is zero, while the constant path above maps zero operands to
+    // -1.0 -- confirm the two paths agree on polarity.
+    return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
+  }
+
+  // All other QPX vectors are handled by generic code.
+  if (Subtarget.hasQPX())
+    return SDValue();
+
+  // Check if this is a splat of a constant value.
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                             HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
+      SplatBitSize > 32) {
+    // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
+    // lowered to VSX instructions under certain conditions.
+    // Without VSX, there is no pattern more efficient than expanding the node.
+    if (Subtarget.hasVSX() &&
+        haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove()))
+      return Op;
+    return SDValue();
+  }
+
+  unsigned SplatBits = APSplatBits.getZExtValue();
+  unsigned SplatUndef = APSplatUndef.getZExtValue();
+  unsigned SplatSize = SplatBitSize / 8;
+
+  // First, handle single instruction cases.
+
+  // All zeros?
+  if (SplatBits == 0) {
+    // Canonicalize all zero vectors to be v4i32.
+    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
+      SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
+      Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
+    }
+    return Op;
+  }
+
+  // We have XXSPLTIB for constant splats one byte wide
+  if (Subtarget.hasP9Vector() && SplatSize == 1) {
+    // This is a splat of 1-byte elements with some elements potentially undef.
+    // Rather than trying to match undef in the SDAG patterns, ensure that all
+    // elements are the same constant.
+    if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) {
+      SmallVector<SDValue, 16> Ops(16, DAG.getConstant(SplatBits,
+                                                       dl, MVT::i32));
+      SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops);
+      if (Op.getValueType() != MVT::v16i8)
+        return DAG.getBitcast(Op.getValueType(), NewBV);
+      return NewBV;
+    }
+    return Op;
+  }
+
+  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
+  int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
+                    (32-SplatBitSize));
+  if (SextVal >= -16 && SextVal <= 15)
+    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
+
+  // Two instruction sequences.
+
+  // If this value is in the range [-32,30] and is even, use:
+  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
+  // If this value is in the range [17,31] and is odd, use:
+  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
+  // If this value is in the range [-31,-17] and is odd, use:
+  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
+  // Note the last two are three-instruction sequences.
+  if (SextVal >= -32 && SextVal <= 31) {
+    // To avoid having these optimizations undone by constant folding,
+    // we convert to a pseudo that will be expanded later into one of
+    // the above forms.
+    SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
+    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
+              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
+    SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
+    SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
+    if (VT == Op.getValueType())
+      return RetVal;
+    else
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
+  }
+
+  // If this is 0x7FFF_FFFF x 4, turn it into not(0x8000_0000), where
+  // 0x8000_0000 is formed with vspltisw + vslw.  This is important for
+  // fneg/fabs.
+  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
+    // Make -1 and vspltisw -1:
+    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
+
+    // Make the VSLW intrinsic, computing 0x8000_0000.
+    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
+                                   OnesV, DAG, dl);
+
+    // xor by OnesV to invert it.
+    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
+    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+  }
+
+  // Check to see if this is a wide variety of vsplti*, binop self cases.
+  static const signed char SplatCsts[] = {
+    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
+  };
+
+  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
+    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
+    // cases which are ambiguous (e.g. formation of 0x8000_0000).
+    int i = SplatCsts[idx];
+
+    // Figure out what shift amount will be used by altivec if shifted by i in
+    // this splat size.
+    unsigned TypeShiftAmt = i & (SplatBitSize-1);
+
+    // vsplti + shl self.
+    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
+        Intrinsic::ppc_altivec_vslw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + srl self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
+        Intrinsic::ppc_altivec_vsrw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // There is deliberately no "vsplti + sra self" form.  The previous one
+    // repeated the srl condition above and so could never be reached, and an
+    // arithmetic right shift of a value in [-16,15] stays in [-16,15], which
+    // the single-instruction vsplti case has already handled.
+
+    // vsplti + rol self.
+    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
+                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
+        Intrinsic::ppc_altivec_vrlw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // t = vsplti c, result = vsldoi t, t, 1
+    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
+      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 2
+    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
+      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 3
+    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
+      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
+    }
+  }
+
+  // Nothing matched: let the default expansion handle it.
+  return SDValue();
+}
+
+/// GeneratePerfectShuffle - Recursively expand one perfect-shuffle table
+/// entry into the DAG nodes that realize it. The entry names an operation
+/// and two sub-shuffles (by table index) that produce that operation's
+/// inputs.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      const SDLoc &dl) {
+  // Fields decoded here: 4-bit operation at bit 26, then two 13-bit table
+  // indices at bits 13 and 0.
+  const unsigned IDMask = (1 << 13)-1;
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & IDMask;
+  unsigned RHSID = PFEntry & IDMask;
+
+  enum {
+    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VMRGHW,
+    OP_VMRGLW,
+    OP_VSPLTISW0,
+    OP_VSPLTISW1,
+    OP_VSPLTISW2,
+    OP_VSPLTISW3,
+    OP_VSLDOI4,
+    OP_VSLDOI8,
+    OP_VSLDOI12
+  };
+
+  // Leaf: the entry is one of the two original inputs.
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  // Build both inputs of this operation first.
+  SDValue OpLHS =
+    GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  SDValue OpRHS =
+    GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+
+  int ShufIdxs[16];
+  switch (OpNum) {
+  default: llvm_unreachable("Unknown i32 permute!");
+  case OP_VSLDOI4:
+  case OP_VSLDOI8:
+  case OP_VSLDOI12: {
+    // Shift amounts 4, 8 and 12 follow the enumerator order.
+    unsigned ShiftBytes = 4 * (OpNum - OP_VSLDOI4 + 1);
+    return BuildVSLDOI(OpLHS, OpRHS, ShiftBytes, OpLHS.getValueType(), DAG, dl);
+  }
+  case OP_VMRGHW:
+  case OP_VMRGLW: {
+    // Interleave words of the two inputs: even result words come from the
+    // first input, odd ones from the second (byte indices 16..31). The high
+    // merge starts at byte 0 of each input, the low merge at byte 8.
+    unsigned FirstByte = OpNum == OP_VMRGHW ? 0 : 8;
+    for (unsigned i = 0; i != 16; ++i) {
+      unsigned Word = i / 4;
+      unsigned FromSecond = (Word & 1) ? 16 : 0;
+      ShufIdxs[i] = FirstByte + (Word / 2) * 4 + (i & 3) + FromSecond;
+    }
+    break;
+  }
+  case OP_VSPLTISW0:
+  case OP_VSPLTISW1:
+  case OP_VSPLTISW2:
+  case OP_VSPLTISW3: {
+    // Replicate word N of the first input into every result word.
+    unsigned WordBase = 4 * (OpNum - OP_VSPLTISW0);
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3) + WordBase;
+    break;
+  }
+  }
+
+  // Apply the byte mask in v16i8 and cast back to the inputs' type.
+  EVT VT = OpLHS.getValueType();
+  OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
+  OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
+  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
+  return DAG.getNode(ISD::BITCAST, dl, VT, T);
+}
+
+/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
+/// is a shuffle we can handle in a single instruction, return it (the P9/VSX
+/// and QPX forms are tried first). Otherwise, return the code it can be
+/// lowered into. Worst case, it can always be lowered into a vperm.
+SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ EVT VT = Op.getValueType();
+ bool isLittleEndian = Subtarget.isLittleEndian();
+
+ unsigned ShiftElts, InsertAtByte;
+ bool Swap;
+ if (Subtarget.hasP9Vector() &&
+ PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
+ isLittleEndian)) {
+ if (Swap) // exchange the inputs when the mask matcher reports Swap
+ std::swap(V1, V2);
+ SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+ SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
+ if (ShiftElts) { // shift the inserted vector into position before the insert
+ SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
+ DAG.getConstant(ShiftElts, dl, MVT::i32));
+ SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+ }
+ SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+ }
+
+ if (Subtarget.hasVSX()) {
+ if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
+ int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+
+ // If the source for the shuffle is a scalar_to_vector that came from a
+ // 32-bit load, it will have used LXVWSX so we don't need to splat again.
+ if (Subtarget.hasP9Vector() &&
+ ((isLittleEndian && SplatIdx == 3) ||
+ (!isLittleEndian && SplatIdx == 0))) {
+ SDValue Src = V1.getOperand(0);
+ if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ Src.getOperand(0).getOpcode() == ISD::LOAD &&
+ Src.getOperand(0).hasOneUse())
+ return V1;
+ }
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+ SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
+ DAG.getConstant(SplatIdx, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
+ }
+
+ // Left shifts of 8 bytes are actually swaps. Convert accordingly.
+ if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
+ SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
+ }
+
+ }
+
+ if (Subtarget.hasQPX()) {
+ if (VT.getVectorNumElements() != 4)
+ return SDValue();
+
+ if (V2.isUndef()) V2 = V1; // single-input shuffle: use V1 for both operands
+
+ int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
+ if (AlignIdx != -1) {
+ return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
+ DAG.getConstant(AlignIdx, dl, MVT::i32));
+ } else if (SVOp->isSplat()) {
+ int SplatIdx = SVOp->getSplatIndex();
+ if (SplatIdx >= 4) { // the splat element lives in the second input
+ std::swap(V1, V2);
+ SplatIdx -= 4;
+ }
+
+ return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
+ DAG.getConstant(SplatIdx, dl, MVT::i32));
+ }
+
+ // Lower this into a qvgpci/qvfperm pair.
+
+ // Compute the qvgpci literal
+ unsigned idx = 0;
+ for (unsigned i = 0; i < 4; ++i) {
+ int m = SVOp->getMaskElt(i);
+ unsigned mm = m >= 0 ? (unsigned) m : i; // undef lane: use the lane's own index
+ idx |= mm << (3-i)*3; // three bits per lane, lane 0 in the highest field
+ }
+
+ SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
+ DAG.getConstant(idx, dl, MVT::i32));
+ return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
+ }
+
+ // Cases that are handled by instructions that take permute immediates
+ // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
+ // selected by the instruction selector.
+ if (V2.isUndef()) {
+ if (PPC::isSplatShuffleMask(SVOp, 1) ||
+ PPC::isSplatShuffleMask(SVOp, 2) ||
+ PPC::isSplatShuffleMask(SVOp, 4) ||
+ PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
+ PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
+ PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
+ PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||
+ (Subtarget.hasP8Altivec() && (
+ PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||
+ PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||
+ PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
+ return Op;
+ }
+ }
+
+ // Altivec has a variety of "shuffle immediates" that take two vector inputs
+ // and produce a fixed permutation. If any of these match, do not lower to
+ // VPERM.
+ unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
+ if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+ PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+ PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
+ PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
+ (Subtarget.hasP8Altivec() && (
+ PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+ PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
+ PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
+ return Op;
+
+ // Check to see if this is a shuffle of 4-byte values. If so, we can use our
+ // perfect shuffle table to emit an optimal matching sequence.
+ ArrayRef<int> PermMask = SVOp->getMask();
+
+ unsigned PFIndexes[4];
+ bool isFourElementShuffle = true;
+ for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
+ unsigned EltNo = 8; // Start out undef.
+ for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
+ if (PermMask[i*4+j] < 0)
+ continue; // Undef, ignore it.
+
+ unsigned ByteSource = PermMask[i*4+j];
+ if ((ByteSource & 3) != j) { // byte is not at the same offset within its word
+ isFourElementShuffle = false;
+ break;
+ }
+
+ if (EltNo == 8) {
+ EltNo = ByteSource/4;
+ } else if (EltNo != ByteSource/4) { // bytes of this word come from different source words
+ isFourElementShuffle = false;
+ break;
+ }
+ }
+ PFIndexes[i] = EltNo;
+ }
+
+ // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
+ // perfect shuffle vector to determine if it is cost effective to do this as
+ // discrete instructions, or whether we should use a vperm.
+ // For now, we skip this for little endian until such time as we have a
+ // little-endian perfect shuffle table.
+ if (isFourElementShuffle && !isLittleEndian) {
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex =
+ PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30); // top two bits of the table entry
+
+ // Determining when to avoid vperm is tricky. Many things affect the cost
+ // of vperm, particularly how many times the perm mask needs to be computed.
+ // For example, if the perm mask can be hoisted out of a loop or is already
+ // used (perhaps because there are multiple permutes with the same shuffle
+ // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of
+ // the loop requires an extra register.
+ //
+ // As a compromise, we only emit discrete instructions if the shuffle can be
+ // generated in 3 or fewer operations. When we have loop information
+ // available, if this block is within a loop, we should avoid using vperm
+ // for 3-operation perms and use a constant pool load instead.
+ if (Cost < 3)
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+
+ // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
+ // vector that will get spilled to the constant pool.
+ if (V2.isUndef()) V2 = V1;
+
+ // The VECTOR_SHUFFLE mask is almost exactly what we want for vperm, except
+ // that it is in input element units, not in bytes. Convert now.
+
+ // For little endian, the order of the input vectors is reversed, and
+ // the permutation mask is complemented with respect to 31. This is
+ // necessary to produce proper semantics with the big-endian-biased vperm
+ // instruction.
+ EVT EltVT = V1.getValueType().getVectorElementType();
+ unsigned BytesPerElement = EltVT.getSizeInBits()/8;
+
+ SmallVector<SDValue, 16> ResultMask;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; // undef lanes read element 0
+
+ for (unsigned j = 0; j != BytesPerElement; ++j)
+ if (isLittleEndian)
+ ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j),
+ dl, MVT::i32));
+ else
+ ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl,
+ MVT::i32));
+ }
+
+ SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
+ if (isLittleEndian)
+ return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
+ V2, V1, VPermMask);
+ else
+ return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
+ V1, V2, VPermMask);
+}
+
+/// getVectorCompareInfo - Given an intrinsic node (operand 0 holds the intrinsic ID), return false if it is not a
+/// vector comparison handled here, or if Subtarget lacks the feature the comparison is gated on. If it is, return true and fill in CompareOpc/isDot with
+/// information about the intrinsic: CompareOpc gets the comparison's opcode number and isDot is set for the predicate ("_p") forms.
+static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
+ bool &isDot, const PPCSubtarget &Subtarget) {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
+ CompareOpc = -1; // Still -1 on every 'return false' path below.
+ isDot = false; // Likewise still false whenever false is returned.
+ switch (IntrinsicID) {
+ default: return false;
+ // Comparison predicates (the "_p" intrinsics); all of these set isDot.
+ case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpequd_p:
+ if (Subtarget.hasP8Altivec()) { // Doubleword compares are gated on P8 Altivec.
+ CompareOpc = 199;
+ isDot = 1;
+ } else
+ return false;
+
+ break;
+ case Intrinsic::ppc_altivec_vcmpneb_p:
+ case Intrinsic::ppc_altivec_vcmpneh_p:
+ case Intrinsic::ppc_altivec_vcmpnew_p:
+ case Intrinsic::ppc_altivec_vcmpnezb_p:
+ case Intrinsic::ppc_altivec_vcmpnezh_p:
+ case Intrinsic::ppc_altivec_vcmpnezw_p:
+ if (Subtarget.hasP9Altivec()) { // The not-equal compares are gated on P9 Altivec.
+ switch(IntrinsicID) {
+ default: llvm_unreachable("Unknown comparison intrinsic.");
+ case Intrinsic::ppc_altivec_vcmpneb_p: CompareOpc = 7; break;
+ case Intrinsic::ppc_altivec_vcmpneh_p: CompareOpc = 71; break;
+ case Intrinsic::ppc_altivec_vcmpnew_p: CompareOpc = 135; break;
+ case Intrinsic::ppc_altivec_vcmpnezb_p: CompareOpc = 263; break;
+ case Intrinsic::ppc_altivec_vcmpnezh_p: CompareOpc = 327; break;
+ case Intrinsic::ppc_altivec_vcmpnezw_p: CompareOpc = 391; break;
+ }
+ isDot = 1;
+ } else
+ return false;
+
+ break;
+ case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtsd_p:
+ if (Subtarget.hasP8Altivec()) {
+ CompareOpc = 967;
+ isDot = 1;
+ } else
+ return false;
+
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtud_p:
+ if (Subtarget.hasP8Altivec()) {
+ CompareOpc = 711;
+ isDot = 1;
+ } else
+ return false;
+
+ break;
+ // VSX predicate comparisons use the same infrastructure, but are only accepted when Subtarget.hasVSX().
+ case Intrinsic::ppc_vsx_xvcmpeqdp_p:
+ case Intrinsic::ppc_vsx_xvcmpgedp_p:
+ case Intrinsic::ppc_vsx_xvcmpgtdp_p:
+ case Intrinsic::ppc_vsx_xvcmpeqsp_p:
+ case Intrinsic::ppc_vsx_xvcmpgesp_p:
+ case Intrinsic::ppc_vsx_xvcmpgtsp_p:
+ if (Subtarget.hasVSX()) {
+ switch (IntrinsicID) { // No default: the outer case labels list exactly these six IDs.
+ case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break;
+ case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break;
+ case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break;
+ case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break;
+ case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break;
+ case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break;
+ }
+ isDot = 1;
+ }
+ else
+ return false;
+
+ break;
+
+ // Normal Comparisons: same opcode numbers as the predicate forms above, but isDot stays 0.
+ case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpequd:
+ if (Subtarget.hasP8Altivec()) {
+ CompareOpc = 199;
+ isDot = 0;
+ } else
+ return false;
+
+ break;
+ case Intrinsic::ppc_altivec_vcmpneb:
+ case Intrinsic::ppc_altivec_vcmpneh:
+ case Intrinsic::ppc_altivec_vcmpnew:
+ case Intrinsic::ppc_altivec_vcmpnezb:
+ case Intrinsic::ppc_altivec_vcmpnezh:
+ case Intrinsic::ppc_altivec_vcmpnezw:
+ if (Subtarget.hasP9Altivec()) {
+ switch (IntrinsicID) {
+ default: llvm_unreachable("Unknown comparison intrinsic.");
+ case Intrinsic::ppc_altivec_vcmpneb: CompareOpc = 7; break;
+ case Intrinsic::ppc_altivec_vcmpneh: CompareOpc = 71; break;
+ case Intrinsic::ppc_altivec_vcmpnew: CompareOpc = 135; break;
+ case Intrinsic::ppc_altivec_vcmpnezb: CompareOpc = 263; break;
+ case Intrinsic::ppc_altivec_vcmpnezh: CompareOpc = 327; break;
+ case Intrinsic::ppc_altivec_vcmpnezw: CompareOpc = 391; break;
+ }
+ isDot = 0;
+ } else
+ return false;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtsd:
+ if (Subtarget.hasP8Altivec()) {
+ CompareOpc = 967;
+ isDot = 0;
+ } else
+ return false;
+
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtud:
+ if (Subtarget.hasP8Altivec()) {
+ CompareOpc = 711;
+ isDot = 0;
+ } else
+ return false;
+
+ break;
+ }
+ return true; // Reached only by cases that assigned both CompareOpc and isDot.
+}
+
+/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
+/// lower (thread_pointer, or a vector comparison accepted by getVectorCompareInfo), do it, otherwise return a null SDValue.
+SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+ if (IntrinsicID == Intrinsic::thread_pointer) {
+ // Reads the thread pointer register, used for __builtin_thread_pointer: X13 as i64 on PPC64, R2 as i32 otherwise.
+ bool is64bit = Subtarget.isPPC64();
+ return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+ is64bit ? MVT::i64 : MVT::i32);
+ }
+
+ // If this is a lowered altivec predicate compare, CompareOpc is set to the
+ // opcode number of the comparison.
+ SDLoc dl(Op);
+ int CompareOpc;
+ bool isDot;
+ if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
+ return SDValue(); // Don't custom lower most intrinsics.
+
+ // If this is a non-dot comparison, make the VCMP node and we are done. Operands 1 and 2 are the inputs being compared.
+ if (!isDot) {
+ SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(CompareOpc, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp); // VCMP is typed like its input; cast to the intrinsic's result type.
+ }
+
+ // Create the PPCISD altivec 'dot' comparison node. For these, operand 1 is the result selector examined below and operands 2/3 are the inputs.
+ SDValue Ops[] = {
+ Op.getOperand(2), // LHS
+ Op.getOperand(3), // RHS
+ DAG.getConstant(CompareOpc, dl, MVT::i32)
+ };
+ EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; // Vector result plus glue consumed by the MFOCRF below.
+ SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
+
+ // Now that we have the comparison, emit a copy from the CR to a GPR.
+ // This is flagged (glued) to the above dot comparison.
+ SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
+ DAG.getRegister(PPC::CR6, MVT::i32),
+ CompNode.getValue(1));
+
+ // Unpack the result based on how the target uses it.
+ unsigned BitNo; // Bit # of CR6.
+ bool InvertBit; // Invert result?
+ switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
+ default: // Can't happen, don't crash on invalid number though (falls through to case 0).
+ case 0: // Return the value of the EQ bit of CR6.
+ BitNo = 0; InvertBit = false;
+ break;
+ case 1: // Return the inverted value of the EQ bit of CR6.
+ BitNo = 0; InvertBit = true;
+ break;
+ case 2: // Return the value of the LT bit of CR6.
+ BitNo = 2; InvertBit = false;
+ break;
+ case 3: // Return the inverted value of the LT bit of CR6.
+ BitNo = 2; InvertBit = true;
+ break;
+ }
+
+ // Shift the bit into the low position: the amount 8 - (3 - BitNo) is 5 for the EQ cases and 7 for the LT cases.
+ Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
+ DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
+ // Isolate the bit.
+ Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ // If we are supposed to, toggle the bit.
+ if (InvertBit)
+ Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
+ DAG.getConstant(1, dl, MVT::i32));
+ return Flags; // An i32 holding 0 or 1.
+}
+
+SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ // For v2i64 (VSX), we can pattern match the v2i32 case (using fp <-> int
+ // instructions), but for smaller types, we need to first extend up to v2i32
+ // before going any further.
+ if (Op.getValueType() == MVT::v2i64) {
+ EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); // The type being extended from.
+ if (ExtVT != MVT::v2i32) {
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)); // Step 1: view the input as v4i32.
+ Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op,
+ DAG.getValueType(EVT::getVectorVT(*DAG.getContext(),
+ ExtVT.getVectorElementType(), 4))); // Step 2: extend in-register from a 4-element vector of ExtVT's element type.
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op); // Step 3: back to v2i64.
+ Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op,
+ DAG.getValueType(MVT::v2i32)); // Step 4: finish with the v2i32 form mentioned above.
+ }
+
+ return Op; // When ExtVT is already v2i32 this is the incoming node, unchanged.
+ }
+
+ return SDValue(); // Result types other than v2i64 are not handled here.
+}
+
+SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ // Create a 16-byte stack slot that is 16-byte aligned; the scalar goes through memory to become a vector.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ // Store the input value (operand 0) at the start of the stack slot, chained to the entry node.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+ MachinePointerInfo());
+ // Load it back out from the same address as the full vector result type, chained after the store.
+ return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
+}
+
+SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ "Should only be called for ISD::INSERT_VECTOR_ELT");
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2)); // Operand 2 is the insertion index; C is null when it is not a constant.
+ // We have legal lowering for constant indices but not for variable ones.
+ if (C)
+ return Op; // Constant index: hand the node back unchanged.
+ return SDValue(); // Variable index: return a null SDValue instead.
+}
+
+SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDNode *N = Op.getNode();
+
+ assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
+ "Unknown extract_vector_elt type");
+
+ SDValue Value = N->getOperand(0); // The v4i1 vector being read from.
+
+ // The first part of this is like the store lowering except that we don't
+ // need to track the chain (the spill below is chained to the entry node).
+
+ // The values are now known to be -1 (false) or 1 (true). To convert this
+ // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+ // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+ Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+
+ // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
+ // understand how to form the extending load.
+ SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
+
+ Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+
+ // Now convert to an integer (qvfctiwu intrinsic) and store to a fresh stack slot (qvstfiw intrinsic).
+ Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
+ DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
+ Value);
+
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ SDValue StoreChain = DAG.getEntryNode();
+ SDValue Ops[] = {StoreChain,
+ DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
+ Value, FIdx};
+ SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
+
+ StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
+ dl, VTs, Ops, MVT::v4i32, PtrInfo);
+
+ // Extract the value requested: element i is read back as an i32 at byte offset 4*i of the stack slot.
+ unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); // cast<> means the index must be a constant node.
+ SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
+ Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+ SDValue IntVal =
+ DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));
+
+ if (!Subtarget.useCRBits())
+ return IntVal; // Without CR bits the i32 load is the result.
+
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); // With CR bits, narrow the loaded i32 to i1.
+}
+
+/// Lowering for QPX vector loads: an under-aligned v4f64/v4f32 load is split into four scalar loads, and a v4i1 load is rebuilt from four byte loads. Returns the merged (value, chain) results.
+SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+ SDValue LoadChain = LN->getChain();
+ SDValue BasePtr = LN->getBasePtr();
+
+ if (Op.getValueType() == MVT::v4f64 ||
+ Op.getValueType() == MVT::v4f32) {
+ EVT MemVT = LN->getMemoryVT();
+ unsigned Alignment = LN->getAlignment();
+
+ // If this load is properly aligned (to at least the memory type's store size), then it is legal.
+ if (Alignment >= MemVT.getStoreSize())
+ return Op;
+
+ EVT ScalarVT = Op.getValueType().getScalarType(),
+ ScalarMemVT = MemVT.getScalarType();
+ unsigned Stride = ScalarMemVT.getStoreSize(); // Byte distance between consecutive elements in memory.
+
+ SDValue Vals[4], LoadChains[4];
+ for (unsigned Idx = 0; Idx < 4; ++Idx) {
+ SDValue Load;
+ if (ScalarVT != ScalarMemVT) // Element is narrower in memory: keep the original load's extension kind.
+ Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
+ BasePtr,
+ LN->getPointerInfo().getWithOffset(Idx * Stride),
+ ScalarMemVT, MinAlign(Alignment, Idx * Stride),
+ LN->getMemOperand()->getFlags(), LN->getAAInfo());
+ else
+ Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
+ LN->getPointerInfo().getWithOffset(Idx * Stride),
+ MinAlign(Alignment, Idx * Stride),
+ LN->getMemOperand()->getFlags(), LN->getAAInfo());
+
+ if (Idx == 0 && LN->isIndexed()) { // Only the first scalar load carries the original pre-increment.
+ assert(LN->getAddressingMode() == ISD::PRE_INC &&
+ "Unknown addressing mode on vector load");
+ Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
+ LN->getAddressingMode());
+ }
+
+ Vals[Idx] = Load;
+ LoadChains[Idx] = Load.getValue(1); // NOTE(review): for the indexed load built above, result 1 is presumably the updated pointer rather than the chain - confirm.
+
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(Stride, dl,
+ BasePtr.getValueType())); // Advance to the next element's address.
+ }
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); // Join the four per-element chains.
+ SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);
+
+ if (LN->isIndexed()) {
+ SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; // Indexed form returns one extra result taken from the first load.
+ return DAG.getMergeValues(RetOps, dl);
+ }
+
+ SDValue RetOps[] = { Value, TF };
+ return DAG.getMergeValues(RetOps, dl);
+ }
+
+ assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
+ assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");
+
+ // To lower v4i1 from a byte array, we load the byte elements of the
+ // vector (one i8 per element, extended to i32) and then reuse the BUILD_VECTOR logic.
+
+ SDValue VectElmts[4], VectElmtChains[4];
+ for (unsigned i = 0; i < 4; ++i) {
+ SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
+ Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); // Address of byte i.
+
+ VectElmts[i] = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
+ LN->getPointerInfo().getWithOffset(i), MVT::i8,
+ /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
+ VectElmtChains[i] = VectElmts[i].getValue(1);
+ }
+
+ LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
+ SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);
+
+ SDValue RVals[] = { Value, LoadChain };
+ return DAG.getMergeValues(RVals, dl);
+}
+
+/// Lowering for QPX vector stores: an under-aligned v4f64/v4f32 store is split into four scalar stores, and a v4i1 store is written out as four bytes. Returns the resulting chain (merged with one extra result for indexed stores).
+SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
+ SDValue StoreChain = SN->getChain();
+ SDValue BasePtr = SN->getBasePtr();
+ SDValue Value = SN->getValue();
+
+ if (Value.getValueType() == MVT::v4f64 ||
+ Value.getValueType() == MVT::v4f32) {
+ EVT MemVT = SN->getMemoryVT();
+ unsigned Alignment = SN->getAlignment();
+
+ // If this store is properly aligned (to at least the memory type's store size), then it is legal.
+ if (Alignment >= MemVT.getStoreSize())
+ return Op;
+
+ EVT ScalarVT = Value.getValueType().getScalarType(),
+ ScalarMemVT = MemVT.getScalarType();
+ unsigned Stride = ScalarMemVT.getStoreSize(); // Byte distance between consecutive elements in memory.
+
+ SDValue Stores[4];
+ for (unsigned Idx = 0; Idx < 4; ++Idx) {
+ SDValue Ex = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
+ DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); // Element Idx of the vector being stored.
+ SDValue Store;
+ if (ScalarVT != ScalarMemVT) // Element is narrower in memory: use a truncating store.
+ Store =
+ DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
+ SN->getPointerInfo().getWithOffset(Idx * Stride),
+ ScalarMemVT, MinAlign(Alignment, Idx * Stride),
+ SN->getMemOperand()->getFlags(), SN->getAAInfo());
+ else
+ Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
+ SN->getPointerInfo().getWithOffset(Idx * Stride),
+ MinAlign(Alignment, Idx * Stride),
+ SN->getMemOperand()->getFlags(), SN->getAAInfo());
+
+ if (Idx == 0 && SN->isIndexed()) { // Only the first scalar store carries the original pre-increment.
+ assert(SN->getAddressingMode() == ISD::PRE_INC &&
+ "Unknown addressing mode on vector store");
+ Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
+ SN->getAddressingMode());
+ }
+
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(Stride, dl,
+ BasePtr.getValueType())); // Advance to the next element's address.
+ Stores[Idx] = Store;
+ }
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); // NOTE(review): for an indexed first store, result 0 is presumably the updated pointer rather than a chain - confirm.
+
+ if (SN->isIndexed()) {
+ SDValue RetOps[] = { TF, Stores[0].getValue(1) }; // Indexed form returns one extra result taken from the first store.
+ return DAG.getMergeValues(RetOps, dl);
+ }
+
+ return TF;
+ }
+
+ assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
+ assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");
+
+ // The values are now known to be -1 (false) or 1 (true). To convert this
+ // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+ // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+ Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+
+ // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
+ // understand how to form the extending load.
+ SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
+
+ Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+
+ // Now convert to an integer (qvfctiwu intrinsic) and store to a temporary stack slot (qvstfiw intrinsic).
+ Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
+ DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
+ Value);
+
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ SDValue Ops[] = {StoreChain,
+ DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
+ Value, FIdx}; // Unlike the extract lowering, this spill stays on the original store's chain.
+ SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
+
+ StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
+ dl, VTs, Ops, MVT::v4i32, PtrInfo);
+
+ // Move data into the byte array: first reload each element as an i32 from offset 4*i of the stack slot.
+ SDValue Loads[4], LoadChains[4];
+ for (unsigned i = 0; i < 4; ++i) {
+ unsigned Offset = 4*i;
+ SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
+ Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+ Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
+ PtrInfo.getWithOffset(Offset));
+ LoadChains[i] = Loads[i].getValue(1);
+ }
+
+ StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); // All four reloads precede the byte stores.
+
+ SDValue Stores[4];
+ for (unsigned i = 0; i < 4; ++i) {
+ SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
+ Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); // Destination address of byte i.
+
+ Stores[i] = DAG.getTruncStore(
+ StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
+ MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
+ SN->getAAInfo());
+ }
+
+ StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+ return StoreChain;
+}
+
+SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { // Custom lowering of vector multiplies for v4i32, v8i16 and v16i8; any other type is unreachable.
+ SDLoc dl(Op);
+ if (Op.getValueType() == MVT::v4i32) {
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+ SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl);
+ SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.
+
+ SDValue RHSSwap = // = vrlw RHS, 16
+ BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
+
+ // Shrinkify inputs to v8i16 (bitcasts only; no data is changed).
+ LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
+ RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
+ RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
+
+ // Low parts multiplied together, generating 32-bit results (we ignore the
+ // top parts).
+ SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+ LHS, RHS, DAG, dl, MVT::v4i32);
+
+ SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+ LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32); // Uses the halfword-swapped RHS with a zero addend.
+ // Shift the high parts up 16 bits.
+ HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
+ Neg16, DAG, dl);
+ return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
+ } else if (Op.getValueType() == MVT::v8i16) {
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+ SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
+
+ return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
+ LHS, RHS, Zero, DAG, dl); // Multiply-add with a zero addend, i.e. a plain multiply.
+ } else if (Op.getValueType() == MVT::v16i8) {
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+ bool isLittleEndian = Subtarget.isLittleEndian();
+
+ // Multiply the even 8-bit parts, producing 16-bit products.
+ SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
+ LHS, RHS, DAG, dl, MVT::v8i16);
+ EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
+
+ // Multiply the odd 8-bit parts, producing 16-bit products.
+ SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
+ LHS, RHS, DAG, dl, MVT::v8i16);
+ OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
+
+ // Merge the results together. Because vmuleub and vmuloub are
+ // instructions with a big-endian bias, we must reverse the
+ // element numbering and reverse the meaning of "odd" and "even"
+ // when generating little endian code.
+ int Ops[16]; // Shuffle mask: indices 0-15 pick from the first shuffle input, 16-31 from the second.
+ for (unsigned i = 0; i != 8; ++i) {
+ if (isLittleEndian) {
+ Ops[i*2 ] = 2*i;
+ Ops[i*2+1] = 2*i+16;
+ } else {
+ Ops[i*2 ] = 2*i+1;
+ Ops[i*2+1] = 2*i+1+16;
+ }
+ }
+ if (isLittleEndian)
+ return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
+ else
+ return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
+ } else {
+ llvm_unreachable("Unknown mul to lower!");
+ }
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+/// Dispatches on Op's opcode to the matching Lower* member; an opcode that is not listed below hits llvm_unreachable.
+SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Wasn't expecting to be able to lower this!");
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
+ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+
+ case ISD::VAARG:
+ return LowerVAARG(Op, DAG);
+
+ case ISD::VACOPY:
+ return LowerVACOPY(Op, DAG);
+
+ case ISD::STACKRESTORE:
+ return LowerSTACKRESTORE(Op, DAG);
+
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
+
+ case ISD::GET_DYNAMIC_AREA_OFFSET:
+ return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
+
+ case ISD::EH_DWARF_CFA:
+ return LowerEH_DWARF_CFA(Op, DAG);
+
+ case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
+ case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
+
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
+ case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::FP_TO_UINT: // Signed and unsigned FP-to-int conversions share one helper.
+ case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG,
+ SDLoc(Op));
+ case ISD::UINT_TO_FP: // Likewise for both int-to-FP conversions.
+ case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
+ case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+
+ // Lower 64-bit shifts.
+ case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
+ case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
+ case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
+
+ // Vector-related lowering.
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, DAG);
+
+ // For counter-based loop handling: no replacement is built here, a null SDValue is returned.
+ case ISD::INTRINSIC_W_CHAIN: return SDValue();
+
+ // Frame & Return address.
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ }
+}
+
+void PPCTargetLowering::ReplaceNodeResults(SDNode *N, // Node whose results need custom type legalization.
+ SmallVectorImpl<SDValue>&Results, // Out: replacement values are appended here; left empty when nothing is replaced.
+ SelectionDAG &DAG) const {
+ SDLoc dl(N);
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case ISD::READCYCLECOUNTER: {
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); // Two i32 results plus a chain.
+ SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
+
+ Results.push_back(RTB);
+ Results.push_back(RTB.getValue(1));
+ Results.push_back(RTB.getValue(2));
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
+ Intrinsic::ppc_is_decremented_ctr_nonzero)
+ break; // Any other chained intrinsic is left alone.
+
+ assert(N->getValueType(0) == MVT::i1 &&
+ "Unexpected result type for CTR decrement intrinsic");
+ EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ N->getValueType(0)); // Re-type the i1 result as the target's setcc result type.
+ SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
+ SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
+ N->getOperand(1)); // Same opcode, chain and intrinsic ID; only the result type differs.
+
+ Results.push_back(NewInt);
+ Results.push_back(NewInt.getValue(1));
+ break;
+ }
+ case ISD::VAARG: {
+ if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
+ return; // Only handled for the SVR4 ABI when not PPC64.
+
+ EVT VT = N->getValueType(0);
+
+ if (VT == MVT::i64) {
+ SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG); // NOTE(review): passes result number 1 of N, not 0 - confirm this is intended.
+
+ Results.push_back(NewNode);
+ Results.push_back(NewNode.getValue(1));
+ }
+ return;
+ }
+ case ISD::FP_ROUND_INREG: {
+ assert(N->getValueType(0) == MVT::ppcf128);
+ assert(N->getOperand(0).getValueType() == MVT::ppcf128);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ MVT::f64, N->getOperand(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ MVT::f64, N->getOperand(0),
+ DAG.getIntPtrConstant(1, dl));
+
+ // Add the two halves of the long double in round-to-zero mode.
+ SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
+
+ // We know the low half is about to be thrown away, so just use something
+ // convenient (the same sum is used for both halves of the pair).
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
+ FPreg, FPreg));
+ return;
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ // LowerFP_TO_INT() can only handle f32 and f64, so a ppcf128 source produces no replacement here.
+ if (N->getOperand(0).getValueType() == MVT::ppcf128)
+ return;
+ Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
+ return;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { // Emits a zero-argument call to intrinsic Id at Builder's insertion point and returns the call.
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent(); // Insert block -> its function -> its module.
+ Function *Func = Intrinsic::getDeclaration(M, Id); // Declaration of the intrinsic in that module.
+ return Builder.CreateCall(Func, {}); // Empty argument list.
+}
+
+// The mappings for emitLeading/TrailingFence are taken from
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ AtomicOrdering Ord, bool IsStore,
+ bool IsLoad) const { // IsStore and IsLoad are not consulted here; only Ord decides the fence.
+ if (Ord == AtomicOrdering::SequentiallyConsistent)
+ return callIntrinsic(Builder, Intrinsic::ppc_sync); // seq_cst: ppc_sync intrinsic.
+ if (isReleaseOrStronger(Ord))
+ return callIntrinsic(Builder, Intrinsic::ppc_lwsync); // Release or stronger (seq_cst was handled above): ppc_lwsync intrinsic.
+ return nullptr; // Weaker orderings get no leading fence.
+}
+
+Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ AtomicOrdering Ord, bool IsStore,
+ bool IsLoad) const { // IsStore is not consulted here.
+ if (IsLoad && isAcquireOrStronger(Ord))
+ return callIntrinsic(Builder, Intrinsic::ppc_lwsync); // Only loads with acquire or stronger ordering get a trailing fence.
+ // FIXME: this is too conservative, a dependent branch + isync is enough.
+ // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+ // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+ // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+ return nullptr; // Everything else: no trailing fence.
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned AtomicSize, // Operand size in bytes: 1, 2, 4 or 8.
+ unsigned BinOpcode, // Opcode combining incr with the loaded value; 0 for a swap.
+ unsigned CmpOpcode, // Non-zero selects the min/max form with a compare and early exit.
+ unsigned CmpPred) const { // Branch predicate for that early exit to exitMBB.
+ // Expands MI into an l*arx / st*cx. retry loop and returns the exit block. This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+ auto LoadMnemonic = PPC::LDARX; // Initial values only; every accepted size reassigns both below.
+ auto StoreMnemonic = PPC::STDCX;
+ switch (AtomicSize) {
+ default:
+ llvm_unreachable("Unexpected size of atomic entity");
+ case 1:
+ LoadMnemonic = PPC::LBARX;
+ StoreMnemonic = PPC::STBCX;
+ assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
+ break;
+ case 2:
+ LoadMnemonic = PPC::LHARX;
+ StoreMnemonic = PPC::STHCX;
+ assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
+ break;
+ case 4:
+ LoadMnemonic = PPC::LWARX;
+ StoreMnemonic = PPC::STWCX;
+ break;
+ case 8:
+ LoadMnemonic = PPC::LDARX;
+ StoreMnemonic = PPC::STDCX;
+ break;
+ }
+
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction *F = BB->getParent();
+ MachineFunction::iterator It = ++BB->getIterator(); // New blocks are inserted right after BB.
+
+ unsigned dest = MI.getOperand(0).getReg(); // Receives the loaded value.
+ unsigned ptrA = MI.getOperand(1).getReg(); // ptrA and ptrB together form the address operands of the load and store.
+ unsigned ptrB = MI.getOperand(2).getReg();
+ unsigned incr = MI.getOperand(3).getReg(); // The other input of the binary op / compare / swap.
+ DebugLoc dl = MI.getDebugLoc();
+
+ MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB =
+ CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; // Only the compare form needs a second loop block.
+ MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, loopMBB);
+ if (CmpOpcode)
+ F->insert(It, loop2MBB);
+ F->insert(It, exitMBB);
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end()); // Everything after MI moves into exitMBB.
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned TmpReg = (!BinOpcode) ? incr :
+ RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
+ : &PPC::GPRCRegClass); // With no BinOpcode, incr itself is what gets stored.
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loopMBB);
+
+ // loopMBB:
+ // l[wd]arx dest, ptr
+ // add r0, dest, incr
+ // st[wd]cx. r0, ptr
+ // bne- loopMBB
+ // fallthrough --> exitMBB
+
+ // For max/min...
+ // loopMBB:
+ // l[wd]arx dest, ptr
+ // cmpl?[wd] incr, dest
+ // bgt exitMBB
+ // loop2MBB:
+ // st[wd]cx. dest, ptr
+ // bne- loopMBB
+ // fallthrough --> exitMBB
+
+ BB = loopMBB;
+ BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
+ .addReg(ptrA).addReg(ptrB);
+ if (BinOpcode)
+ BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
+ if (CmpOpcode) {
+ // Signed comparisons of byte or halfword values must be sign-extended.
+ if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
+ unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+ BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
+ ExtReg).addReg(dest);
+ BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
+ .addReg(incr).addReg(ExtReg);
+ } else
+ BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
+ .addReg(incr).addReg(dest);
+
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); // Leave the loop without storing when CmpPred holds.
+ BB->addSuccessor(loop2MBB);
+ BB->addSuccessor(exitMBB);
+ BB = loop2MBB; // The store and retry branch below go into loop2MBB in this form.
+ }
+ BuildMI(BB, dl, TII->get(StoreMnemonic))
+ .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); // Branch back to loopMBB on PRED_NE of CR0 (the "bne- loopMBB" of the sketch above).
+ BB->addSuccessor(loopMBB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+ return BB;
+}
+
+/// Expand an 8- or 16-bit atomic read-modify-write pseudo instruction.
+///
+/// When the subtarget reports part-word atomics this simply forwards to
+/// EmitAtomicBinary with a size of 1 or 2 bytes. Otherwise the operation is
+/// performed with lwarx/stwcx. on the aligned word containing the value, using
+/// a shift amount and mask computed from the address to touch only the
+/// addressed byte or halfword.
+///
+/// \param MI Pseudo being expanded. Operand 0 is the result register, operands
+/// 1 and 2 form the address (ptrA, ptrB) and operand 3 is the value
+/// operand (incr).
+/// \param BB Block containing \p MI; everything after \p MI is moved into a
+/// newly created exit block.
+/// \param is8bit True for a byte access, false for a halfword access.
+/// \param BinOpcode Opcode combining the shifted incr with the loaded word, or
+/// 0 to use the shifted incr directly (swap and min/max forms).
+/// \param CmpOpcode If non-zero, a compare emitted before the store (min/max
+/// forms); PPC::CMPW selects the signed path below.
+/// \param CmpPred Predicate under which the compare skips the store and
+/// branches straight to the exit block.
+/// \returns the exit block.
+///
+/// NOTE(review): in 64-bit mode every temporary below is created in G8RC, yet
+/// several are defined or used by opcodes without the "8" suffix (RLWINM, SLW,
+/// LI, ORI, SRW, LWARX, EXTSB/EXTSH). Confirm that this register-class mix is
+/// intended.
+MachineBasicBlock *
+PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ bool is8bit, // operation
+ unsigned BinOpcode,
+ unsigned CmpOpcode,
+ unsigned CmpPred) const {
+ // If we support part-word atomic mnemonics, just use them
+ if (Subtarget.hasPartwordAtomics())
+ return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode,
+ CmpOpcode, CmpPred);
+
+ // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ // In 64 bit mode we have to use 64 bits for addresses, even though the
+ // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
+ // registers without caring whether they're 32 or 64, but here we're
+ // doing actual arithmetic on the addresses.
+ bool is64bit = Subtarget.isPPC64();
+ bool isLittleEndian = Subtarget.isLittleEndian();
+ unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction *F = BB->getParent();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned ptrA = MI.getOperand(1).getReg();
+ unsigned ptrB = MI.getOperand(2).getReg();
+ unsigned incr = MI.getOperand(3).getReg();
+ DebugLoc dl = MI.getDebugLoc();
+
+ // Create the new blocks and insert them right after BB, in layout order
+ // loopMBB, [loop2MBB], exitMBB. loop2MBB only exists for the min/max forms.
+ MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB =
+ CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
+ MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, loopMBB);
+ if (CmpOpcode)
+ F->insert(It, loop2MBB);
+ F->insert(It, exitMBB);
+ // Move everything after MI, together with BB's successor edges, to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+ : &PPC::GPRCRegClass;
+ unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+ unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+ // On little-endian targets the xori adjustment below is not emitted, so the
+ // raw shift register is used as the final shift amount.
+ unsigned ShiftReg =
+ isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC);
+ unsigned Incr2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+ unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+ unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+ unsigned Ptr1Reg;
+ // With no binary operation the shifted operand itself is the new value.
+ unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loopMBB);
+
+ // The 4-byte load must be aligned, while a char or short may be
+ // anywhere in the word. Hence all this nasty bookkeeping code.
+ // add ptr1, ptrA, ptrB [copy if ptrA==0]
+ // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+ // xori shift, shift1, 24 [16]
+ // rlwinm ptr, ptr1, 0, 0, 29
+ // slw incr2, incr, shift
+ // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+ // slw mask, mask2, shift
+ // loopMBB:
+ // lwarx tmpDest, ptr
+ // add tmp, tmpDest, incr2
+ // andc tmp2, tmpDest, mask
+ // and tmp3, tmp, mask
+ // or tmp4, tmp3, tmp2
+ // stwcx. tmp4, ptr
+ // bne- loopMBB
+ // fallthrough --> exitMBB
+ // srw dest, tmpDest, shift
+ // Form the full address in one register; if ptrA is the zero register,
+ // ptrB already is the address.
+ if (ptrA != ZeroReg) {
+ Ptr1Reg = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+ .addReg(ptrA).addReg(ptrB);
+ } else {
+ Ptr1Reg = ptrB;
+ }
+ // Derive the bit shift of the byte/halfword within its word from the low
+ // address bits (address rotated left by 3, then masked).
+ BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+ .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+ // Big-endian only: flip the shift amount (xor with 24 for bytes, 16 for
+ // halfwords).
+ if (!isLittleEndian)
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+ .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+ // Word-align the address for lwarx/stwcx.
+ if (is64bit)
+ BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+ .addReg(Ptr1Reg).addImm(0).addImm(61);
+ else
+ BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+ .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+ // Move the operand into the position of the addressed byte/halfword.
+ BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg)
+ .addReg(incr).addReg(ShiftReg);
+ // Build the value mask (0xFF or 0xFFFF) and shift it into position too.
+ if (is8bit)
+ BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+ else {
+ BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+ BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535);
+ }
+ BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+ .addReg(Mask2Reg).addReg(ShiftReg);
+
+ BB = loopMBB;
+ BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+ .addReg(ZeroReg).addReg(PtrReg);
+ if (BinOpcode)
+ BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
+ .addReg(Incr2Reg).addReg(TmpDestReg);
+ // Tmp2Reg: loaded word with the target field cleared.
+ // Tmp3Reg: new value restricted to the target field.
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg)
+ .addReg(TmpDestReg).addReg(MaskReg);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg)
+ .addReg(TmpReg).addReg(MaskReg);
+ if (CmpOpcode) {
+ // For unsigned comparisons, we can directly compare the shifted values.
+ // For signed comparisons we shift and sign extend.
+ unsigned SReg = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), SReg)
+ .addReg(TmpDestReg).addReg(MaskReg);
+ unsigned ValueReg = SReg;
+ unsigned CmpReg = Incr2Reg;
+ if (CmpOpcode == PPC::CMPW) {
+ // Signed path: bring the current value back down, sign-extend it, and
+ // compare it against the original (unshifted) operand.
+ ValueReg = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)
+ .addReg(SReg).addReg(ShiftReg);
+ unsigned ValueSReg = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)
+ .addReg(ValueReg);
+ ValueReg = ValueSReg;
+ CmpReg = incr;
+ }
+ BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
+ .addReg(CmpReg).addReg(ValueReg);
+ // When CmpPred holds, skip the store and leave the loop.
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB);
+ BB->addSuccessor(loop2MBB);
+ BB->addSuccessor(exitMBB);
+ BB = loop2MBB;
+ }
+ // Merge the new field into the untouched rest of the word and try to store
+ // it; branch back to loopMBB when CR0 reports "not equal" afterwards.
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg)
+ .addReg(Tmp3Reg).addReg(Tmp2Reg);
+ BuildMI(BB, dl, TII->get(PPC::STWCX))
+ .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+ BB->addSuccessor(loopMBB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+ // The result is the originally loaded word shifted back down; it is inserted
+ // at the very start of exitMBB, ahead of the instructions moved from BB.
+ BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg)
+ .addReg(ShiftReg);
+ return BB;
+}
+
+/// Expand the SjLj setjmp pseudo instruction \p MI.
+///
+/// Operand 0 of \p MI is the i32 result register and operand 1 is the register
+/// holding the jump-buffer address. The expansion splits \p MBB into thisMBB,
+/// mainMBB and sinkMBB; the result is 0 when reached through mainMBB and 1
+/// when reached through the restore path in thisMBB, merged by a PHI at the
+/// start of sinkMBB.
+///
+/// Pointer-sized buffer slots written here (zero-based index):
+/// 1 - return address (LabelOffset)
+/// 3 - TOC pointer X2, only on 64-bit SVR4 (TOCOffset)
+/// 4 - base pointer, or r1 for naked functions (BPOffset)
+///
+/// \p MI is erased. \returns sinkMBB, which holds the instructions that
+/// followed \p MI.
+llvm::MachineBasicBlock *
+PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = ++MBB->getIterator();
+
+ // Memory Reference
+ // (MI's memory operands are attached to each buffer store emitted below.)
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ // One result register per incoming path of the PHI in sinkMBB.
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+ // For v = setjmp(buf), we generate
+ //
+ // thisMBB:
+ // SjLjSetup mainMBB
+ // bl mainMBB
+ // v_restore = 1
+ // b sinkMBB
+ //
+ // mainMBB:
+ // buf[LabelOffset] = LR
+ // v_main = 0
+ //
+ // sinkMBB:
+ // v = phi(main, restore)
+ //
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // Note that the structure of the jmp_buf used here is not compatible
+ // with that used by libc, and is not designed to be. Specifically, it
+ // stores only those 'reserved' registers that LLVM does not otherwise
+ // understand how to spill. Also, by convention, by the time this
+ // intrinsic is called, Clang has already stored the frame address in the
+ // first slot of the buffer and stack address in the third. Following the
+ // X86 target code, we'll store the jump address in the second slot. We also
+ // need to save the TOC pointer (R2) to handle jumps between shared
+ // libraries, and that will be stored in the fourth slot. The thread
+ // identifier (R13) is not affected.
+ // The base pointer is additionally stored at BPOffset, i.e. the fifth slot.
+
+ // thisMBB:
+ const int64_t LabelOffset = 1 * PVT.getStoreSize();
+ const int64_t TOCOffset = 3 * PVT.getStoreSize();
+ const int64_t BPOffset = 4 * PVT.getStoreSize();
+
+ // LabelReg receives the link register value in mainMBB and is then stored
+ // to the buffer as the jump address.
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
+ unsigned BufReg = MI.getOperand(1).getReg();
+
+ // Save the TOC pointer (X2) on 64-bit SVR4.
+ if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
+ setUsesTOCBasePtr(*MBB->getParent());
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
+ .addReg(PPC::X2)
+ .addImm(TOCOffset)
+ .addReg(BufReg);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ }
+
+ // Naked functions never have a base pointer, and so we use r1. For all
+ // other functions, this decision must be delayed until during PEI.
+ unsigned BaseReg;
+ if (MF->getFunction()->hasFnAttribute(Attribute::Naked))
+ BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
+ else
+ BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
+
+ MIB = BuildMI(*thisMBB, MI, DL,
+ TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
+ .addReg(BaseReg)
+ .addImm(BPOffset)
+ .addReg(BufReg);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ // Setup
+ // Branch-and-link to mainMBB; the register mask from getNoPreservedMask()
+ // is attached to this instruction.
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
+ const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ MIB.addRegMask(TRI->getNoPreservedMask());
+
+ // Restore path: the result is 1.
+ BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
+
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
+ .addMBB(mainMBB);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);
+
+ // Edge probabilities: zero towards mainMBB, one towards sinkMBB.
+ thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
+ thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());
+
+ // mainMBB:
+ // LabelReg = LR
+ // buf[LabelOffset] = LabelReg
+ // mainDstReg = 0
+ MIB =
+ BuildMI(mainMBB, DL,
+ TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
+
+ // Store IP
+ if (Subtarget.isPPC64()) {
+ MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
+ .addReg(LabelReg)
+ .addImm(LabelOffset)
+ .addReg(BufReg);
+ } else {
+ MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
+ .addReg(LabelReg)
+ .addImm(LabelOffset)
+ .addReg(BufReg);
+ }
+
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // DstReg = phi [ mainDstReg, mainMBB ], [ restoreDstReg, thisMBB ]
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(PPC::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(restoreDstReg).addMBB(thisMBB);
+
+ MI.eraseFromParent();
+ return sinkMBB;
+}
+
+/// Expand the SjLj longjmp pseudo instruction \p MI in place.
+///
+/// Operand 0 of \p MI is the register holding the jump-buffer address. The
+/// expansion reloads, in this order, the frame pointer (slot 0), the jump
+/// target (slot 1), the stack pointer (slot 2), the base pointer (slot 4) and,
+/// on 64-bit SVR4 only, the TOC pointer X2 (slot 3); it then moves the jump
+/// target into CTR and branches through it. Each slot is pointer sized and
+/// every load carries \p MI's memory operands.
+///
+/// \p MI is erased. \returns \p MBB (no new blocks are created).
+MachineBasicBlock *
+PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
+                                     MachineBasicBlock *MBB) const {
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Memory operands of the pseudo, re-attached to every reload below.
+  MachineInstr::mmo_iterator MemRefsBegin = MI.memoperands_begin();
+  MachineInstr::mmo_iterator MemRefsEnd = MI.memoperands_end();
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+  const bool Is64 = (PVT == MVT::i64);
+
+  // Scratch register that carries the jump target from the buffer to CTR.
+  const TargetRegisterClass *PtrRC =
+      Is64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+  unsigned JumpTarget = MRI.createVirtualRegister(PtrRC);
+
+  // Physical registers to restore. The frame pointer is only written here,
+  // never read, so it is handled like any other GPR.
+  unsigned FrameReg;
+  unsigned StackReg;
+  unsigned BaseReg;
+  if (Is64) {
+    FrameReg = PPC::X31;
+    StackReg = PPC::X1;
+    BaseReg = PPC::X30;
+  } else {
+    FrameReg = PPC::R31;
+    StackReg = PPC::R1;
+    if (Subtarget.isSVR4ABI() && isPositionIndependent())
+      BaseReg = PPC::R29;
+    else
+      BaseReg = PPC::R30;
+  }
+
+  // Byte offsets of the buffer slots, in units of the pointer store size.
+  const int64_t SlotSize = PVT.getStoreSize();
+  const int64_t LabelOffset = 1 * SlotSize;
+  const int64_t SPOffset = 2 * SlotSize;
+  const int64_t TOCOffset = 3 * SlotSize;
+  const int64_t BPOffset = 4 * SlotSize;
+
+  unsigned BufReg = MI.getOperand(0).getReg();
+  const unsigned LoadOpc = Is64 ? PPC::LD : PPC::LWZ;
+
+  // Emit "DestReg = load Offset(BufReg)" immediately before MI.
+  auto ReloadSlot = [&](unsigned DestReg, int64_t Offset) {
+    MachineInstrBuilder Load =
+        BuildMI(*MBB, MI, DL, TII->get(LoadOpc), DestReg)
+            .addImm(Offset)
+            .addReg(BufReg);
+    Load.setMemRefs(MemRefsBegin, MemRefsEnd);
+  };
+
+  // Frame pointer first. The function being jumped to may not have used a
+  // frame pointer; in that case its r31 gets restored as necessary.
+  ReloadSlot(FrameReg, 0);
+
+  // Jump target (instruction pointer).
+  ReloadSlot(JumpTarget, LabelOffset);
+
+  // Stack pointer.
+  ReloadSlot(StackReg, SPOffset);
+
+  // Base pointer.
+  ReloadSlot(BaseReg, BPOffset);
+
+  // TOC pointer, 64-bit SVR4 only.
+  if (Is64 && Subtarget.isSVR4ABI()) {
+    setUsesTOCBasePtr(*MF);
+    ReloadSlot(PPC::X2, TOCOffset);
+  }
+
+  // Indirect branch through CTR to the reloaded target.
+  BuildMI(*MBB, MI, DL, TII->get(Is64 ? PPC::MTCTR8 : PPC::MTCTR))
+      .addReg(JumpTarget);
+  BuildMI(*MBB, MI, DL, TII->get(Is64 ? PPC::BCTR8 : PPC::BCTR));
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ if (MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+ if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() &&
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+ // Call lowering should have added an r2 operand to indicate a dependence
+ // on the TOC base pointer value. It can't however, because there is no
+ // way to mark the dependence as implicit there, and so the stackmap code
+ // will confuse it with a regular operand. Instead, add the dependence
+ // here.
+ setUsesTOCBasePtr(*BB->getParent());
+ MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
+ }
+
+ return emitPatchPoint(MI, BB);
+ }
+
+ if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
+ MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
+ return emitEHSjLjSetJmp(MI, BB);
+ } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
+ MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
+ return emitEHSjLjLongJmp(MI, BB);
+ }
+
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+ // To "insert" these instructions we actually have to insert their
+ // control-flow patterns.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ MachineFunction *F = BB->getParent();
+
+ if (Subtarget.hasISEL() &&
+ (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+ MI.getOpcode() == PPC::SELECT_CC_I8 ||
+ MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
+ SmallVector<MachineOperand, 2> Cond;
+ if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+ MI.getOpcode() == PPC::SELECT_CC_I8)
+ Cond.push_back(MI.getOperand(4));
+ else
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+ Cond.push_back(MI.getOperand(1));
+
+ DebugLoc dl = MI.getDebugLoc();
+ TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
+ MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
+ } else if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+ MI.getOpcode() == PPC::SELECT_CC_I8 ||
+ MI.getOpcode() == PPC::SELECT_CC_F4 ||
+ MI.getOpcode() == PPC::SELECT_CC_F8 ||
+ MI.getOpcode() == PPC::SELECT_CC_QFRC ||
+ MI.getOpcode() == PPC::SELECT_CC_QSRC ||
+ MI.getOpcode() == PPC::SELECT_CC_QBRC ||
+ MI.getOpcode() == PPC::SELECT_CC_VRRC ||
+ MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
+ MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
+ MI.getOpcode() == PPC::SELECT_CC_VSRC ||
+ MI.getOpcode() == PPC::SELECT_I4 ||
+ MI.getOpcode() == PPC::SELECT_I8 ||
+ MI.getOpcode() == PPC::SELECT_F4 ||
+ MI.getOpcode() == PPC::SELECT_F8 ||
+ MI.getOpcode() == PPC::SELECT_QFRC ||
+ MI.getOpcode() == PPC::SELECT_QSRC ||
+ MI.getOpcode() == PPC::SELECT_QBRC ||
+ MI.getOpcode() == PPC::SELECT_VRRC ||
+ MI.getOpcode() == PPC::SELECT_VSFRC ||
+ MI.getOpcode() == PPC::SELECT_VSSRC ||
+ MI.getOpcode() == PPC::SELECT_VSRC) {
+ // The incoming instruction knows the destination vreg to set, the
+ // condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ DebugLoc dl = MI.getDebugLoc();
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
+ MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
+ MI.getOpcode() == PPC::SELECT_QFRC ||
+ MI.getOpcode() == PPC::SELECT_QSRC ||
+ MI.getOpcode() == PPC::SELECT_QBRC ||
+ MI.getOpcode() == PPC::SELECT_VRRC ||
+ MI.getOpcode() == PPC::SELECT_VSFRC ||
+ MI.getOpcode() == PPC::SELECT_VSSRC ||
+ MI.getOpcode() == PPC::SELECT_VSRC) {
+ BuildMI(BB, dl, TII->get(PPC::BC))
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(sinkMBB);
+ } else {
+ unsigned SelectPred = MI.getOperand(4).getImm();
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(SelectPred)
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(sinkMBB);
+ }
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(3).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(thisMBB);
+ } else if (MI.getOpcode() == PPC::ReadTB) {
+ // To read the 64-bit time-base register on a 32-bit target, we read the
+ // two halves. Should the counter have wrapped while it was being read, we
+ // need to try again.
+ // ...
+ // readLoop:
+ // mfspr Rx,TBU # load from TBU
+ // mfspr Ry,TB # load from TB
+ // mfspr Rz,TBU # load from TBU
+ // cmpw crX,Rx,Rz # check if 'old'='new'
+ // bne readLoop # branch if they're not equal
+ // ...
+
+ MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ DebugLoc dl = MI.getDebugLoc();
+ F->insert(It, readMBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ BB->addSuccessor(readMBB);
+ BB = readMBB;
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+ unsigned LoReg = MI.getOperand(0).getReg();
+ unsigned HiReg = MI.getOperand(1).getReg();
+
+ BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
+ BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
+ BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
+
+ unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+
+ BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
+ .addReg(HiReg).addReg(ReadAgainReg);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB);
+
+ BB->addSuccessor(readMBB);
+ BB->addSuccessor(sinkMBB);
+ } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE);
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
+ else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
+ else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
+ BB = EmitAtomicBinary(MI, BB, 4, 0);
+ else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
+ BB = EmitAtomicBinary(MI, BB, 8, 0);
+
+ else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
+ MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
+ (Subtarget.hasPartwordAtomics() &&
+ MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
+ (Subtarget.hasPartwordAtomics() &&
+ MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
+ bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
+
+ auto LoadMnemonic = PPC::LDARX;
+ auto StoreMnemonic = PPC::STDCX;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Compare and swap of unknown size");
+ case PPC::ATOMIC_CMP_SWAP_I8:
+ LoadMnemonic = PPC::LBARX;
+ StoreMnemonic = PPC::STBCX;
+ assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+ break;
+ case PPC::ATOMIC_CMP_SWAP_I16:
+ LoadMnemonic = PPC::LHARX;
+ StoreMnemonic = PPC::STHCX;
+ assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+ break;
+ case PPC::ATOMIC_CMP_SWAP_I32:
+ LoadMnemonic = PPC::LWARX;
+ StoreMnemonic = PPC::STWCX;
+ break;
+ case PPC::ATOMIC_CMP_SWAP_I64:
+ LoadMnemonic = PPC::LDARX;
+ StoreMnemonic = PPC::STDCX;
+ break;
+ }
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned ptrA = MI.getOperand(1).getReg();
+ unsigned ptrB = MI.getOperand(2).getReg();
+ unsigned oldval = MI.getOperand(3).getReg();
+ unsigned newval = MI.getOperand(4).getReg();
+ DebugLoc dl = MI.getDebugLoc();
+
+ MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, loop1MBB);
+ F->insert(It, loop2MBB);
+ F->insert(It, midMBB);
+ F->insert(It, exitMBB);
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loop1MBB);
+
+ // loop1MBB:
+ // l[bhwd]arx dest, ptr
+ // cmp[wd] dest, oldval
+ // bne- midMBB
+ // loop2MBB:
+ // st[bhwd]cx. newval, ptr
+ // bne- loopMBB
+ // b exitBB
+ // midMBB:
+ // st[bhwd]cx. dest, ptr
+ // exitBB:
+ BB = loop1MBB;
+ BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
+ .addReg(ptrA).addReg(ptrB);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
+ .addReg(oldval).addReg(dest);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+ BB->addSuccessor(loop2MBB);
+ BB->addSuccessor(midMBB);
+
+ BB = loop2MBB;
+ BuildMI(BB, dl, TII->get(StoreMnemonic))
+ .addReg(newval).addReg(ptrA).addReg(ptrB);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+ BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+ BB->addSuccessor(loop1MBB);
+ BB->addSuccessor(exitMBB);
+
+ BB = midMBB;
+ BuildMI(BB, dl, TII->get(StoreMnemonic))
+ .addReg(dest).addReg(ptrA).addReg(ptrB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+ } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
+ MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
+ // We must use 64-bit registers for addresses when targeting 64-bit,
+ // since we're actually doing arithmetic on them. Other registers
+ // can be 32-bit.
+ bool is64bit = Subtarget.isPPC64();
+ bool isLittleEndian = Subtarget.isLittleEndian();
+ bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
+
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned ptrA = MI.getOperand(1).getReg();
+ unsigned ptrB = MI.getOperand(2).getReg();
+ unsigned oldval = MI.getOperand(3).getReg();
+ unsigned newval = MI.getOperand(4).getReg();
+ DebugLoc dl = MI.getDebugLoc();
+
+ MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, loop1MBB);
+ F->insert(It, loop2MBB);
+ F->insert(It, midMBB);
+ F->insert(It, exitMBB);
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+ : &PPC::GPRCRegClass;
+ unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+ unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+ unsigned ShiftReg =
+ isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC);
+ unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+ unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+ unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+ unsigned Ptr1Reg;
+ unsigned TmpReg = RegInfo.createVirtualRegister(RC);
+ unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loop1MBB);
+
+ // The 4-byte load must be aligned, while a char or short may be
+ // anywhere in the word. Hence all this nasty bookkeeping code.
+ // add ptr1, ptrA, ptrB [copy if ptrA==0]
+ // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+ // xori shift, shift1, 24 [16]
+ // rlwinm ptr, ptr1, 0, 0, 29
+ // slw newval2, newval, shift
+ // slw oldval2, oldval,shift
+ // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+ // slw mask, mask2, shift
+ // and newval3, newval2, mask
+ // and oldval3, oldval2, mask
+ // loop1MBB:
+ // lwarx tmpDest, ptr
+ // and tmp, tmpDest, mask
+ // cmpw tmp, oldval3
+ // bne- midMBB
+ // loop2MBB:
+ // andc tmp2, tmpDest, mask
+ // or tmp4, tmp2, newval3
+ // stwcx. tmp4, ptr
+ // bne- loop1MBB
+ // b exitBB
+ // midMBB:
+ // stwcx. tmpDest, ptr
+ // exitBB:
+ // srw dest, tmpDest, shift
+ if (ptrA != ZeroReg) {
+ Ptr1Reg = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+ .addReg(ptrA).addReg(ptrB);
+ } else {
+ Ptr1Reg = ptrB;
+ }
+ BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+ .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+ if (!isLittleEndian)
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+ .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+ if (is64bit)
+ BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+ .addReg(Ptr1Reg).addImm(0).addImm(61);
+ else
+ BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+ .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+ BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
+ .addReg(newval).addReg(ShiftReg);
+ BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
+ .addReg(oldval).addReg(ShiftReg);
+ if (is8bit)
+ BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+ else {
+ BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+ BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
+ .addReg(Mask3Reg).addImm(65535);
+ }
+ BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+ .addReg(Mask2Reg).addReg(ShiftReg);
+ BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
+ .addReg(NewVal2Reg).addReg(MaskReg);
+ BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
+ .addReg(OldVal2Reg).addReg(MaskReg);
+
+ BB = loop1MBB;
+ BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+ .addReg(ZeroReg).addReg(PtrReg);
+ BuildMI(BB, dl, TII->get(PPC::AND),TmpReg)
+ .addReg(TmpDestReg).addReg(MaskReg);
+ BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
+ .addReg(TmpReg).addReg(OldVal3Reg);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+ BB->addSuccessor(loop2MBB);
+ BB->addSuccessor(midMBB);
+
+ BB = loop2MBB;
+ BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg)
+ .addReg(TmpDestReg).addReg(MaskReg);
+ BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg)
+ .addReg(Tmp2Reg).addReg(NewVal3Reg);
+ BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg)
+ .addReg(ZeroReg).addReg(PtrReg);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+ BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+ BB->addSuccessor(loop1MBB);
+ BB->addSuccessor(exitMBB);
+
+ BB = midMBB;
+ BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg)
+ .addReg(ZeroReg).addReg(PtrReg);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+ BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg)
+ .addReg(ShiftReg);
+ } else if (MI.getOpcode() == PPC::FADDrtz) {
+ // This pseudo performs an FADD with rounding mode temporarily forced
+ // to round-to-zero. We emit this via custom inserter since the FPSCR
+ // is not modeled at the SelectionDAG level.
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Src1 = MI.getOperand(1).getReg();
+ unsigned Src2 = MI.getOperand(2).getReg();
+ DebugLoc dl = MI.getDebugLoc();
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+
+ // Save FPSCR value.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
+
+ // Set rounding mode to round-to-zero.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
+ BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
+
+ // Perform addition.
+ BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);
+
+ // Restore FPSCR value.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
+ } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+ MI.getOpcode() == PPC::ANDIo_1_GT_BIT ||
+ MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+ MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) {
+ unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+ MI.getOpcode() == PPC::ANDIo_1_GT_BIT8)
+ ? PPC::ANDIo8
+ : PPC::ANDIo;
+ bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+ MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8);
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ?
+ &PPC::GPRCRegClass :
+ &PPC::G8RCRegClass);
+
+ DebugLoc dl = MI.getDebugLoc();
+ BuildMI(*BB, MI, dl, TII->get(Opcode), Dest)
+ .addReg(MI.getOperand(1).getReg())
+ .addImm(1);
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT);
+ } else if (MI.getOpcode() == PPC::TCHECK_RET) {
+ DebugLoc Dl = MI.getDebugLoc();
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+ BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
+ return BB;
+ } else {
+ llvm_unreachable("Unexpected instr type to insert");
+ }
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+// Choose the default number of refinement iterations to apply to a hardware
+// reciprocal (FRE) or reciprocal-square-root (FRSQRTE) estimate of type VT.
+static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
+ // Convergence of the refinement is quadratic, so every iteration roughly
+ // doubles the number of correct bits. The architected minimum relative
+ // accuracy of both estimate instructions is 2^-5, improving to 2^-14 when
+ // hasRecipPrec() holds. An IEEE float has a 23-bit fraction and a double a
+ // 52-bit one, hence double precision gets one additional iteration.
+ const bool NeedsExtraStep = VT.getScalarType() == MVT::f64;
+ const int BaseSteps = Subtarget.hasRecipPrec() ? 1 : 3;
+ return NeedsExtraStep ? BaseSteps + 1 : BaseSteps;
+}
+
+/// Build a PPCISD::FRSQRTE estimate node for \p Operand when the subtarget
+/// provides a reciprocal-square-root estimate for its value type; otherwise
+/// return an empty SDValue. When an estimate is produced, \p UseOneConstNR is
+/// set to true and \p RefinementSteps is filled in with the target default if
+/// the caller left it as ReciprocalEstimate::Unspecified. \p Enabled and
+/// \p Reciprocal are not consulted here.
+SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
+ int Enabled, int &RefinementSteps,
+ bool &UseOneConstNR,
+ bool Reciprocal) const {
+ const EVT VT = Operand.getValueType();
+
+ // Scalar estimates depend on the dedicated FRSQRTE(S) features.
+ const bool HasScalarEstimate =
+ (VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
+ (VT == MVT::f64 && Subtarget.hasFRSQRTE());
+ // Vector estimates depend on which vector unit is available.
+ const bool HasVectorEstimate =
+ (VT == MVT::v4f32 && (Subtarget.hasAltivec() || Subtarget.hasQPX())) ||
+ (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+ (VT == MVT::v4f64 && Subtarget.hasQPX());
+ if (!HasScalarEstimate && !HasVectorEstimate)
+ return SDValue();
+
+ if (RefinementSteps == ReciprocalEstimate::Unspecified)
+ RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
+
+ UseOneConstNR = true;
+ return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
+}
+
+/// Build a PPCISD::FRE estimate node for \p Operand when the subtarget
+/// provides a reciprocal estimate for its value type; otherwise return an
+/// empty SDValue. \p RefinementSteps is filled in with the target default if
+/// the caller left it as ReciprocalEstimate::Unspecified. \p Enabled is not
+/// consulted here.
+SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
+ int Enabled,
+ int &RefinementSteps) const {
+ const EVT VT = Operand.getValueType();
+
+ // Scalar estimates depend on the dedicated FRE(S) features.
+ const bool HasScalarEstimate =
+ (VT == MVT::f32 && Subtarget.hasFRES()) ||
+ (VT == MVT::f64 && Subtarget.hasFRE());
+ // Vector estimates depend on which vector unit is available.
+ const bool HasVectorEstimate =
+ (VT == MVT::v4f32 && (Subtarget.hasAltivec() || Subtarget.hasQPX())) ||
+ (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+ (VT == MVT::v4f64 && Subtarget.hasQPX());
+ if (!HasScalarEstimate && !HasVectorEstimate)
+ return SDValue();
+
+ if (RefinementSteps == ReciprocalEstimate::Unspecified)
+ RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
+ return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
+}
+
+/// Return the minimum number of FDIVs sharing one divisor at which they
+/// should be rewritten as FMULs by a single reciprocal.
+unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
+ // Note: this is only used when unsafe-fp-math is enabled. On cores with
+ // reciprocal estimates (which unsafe-fp-math uses for division) it is
+ // redundant with the default combiner logic once the division ->
+ // reciprocal/multiply transformation has taken place, so it matters more
+ // for older cores than for newer ones.
+
+ // Embedded cores with only one FP pipeline benefit from two or more FDIVs
+ // by the same divisor; generic OOO cores need three or more.
+ const unsigned Directive = Subtarget.getDarwinDirective();
+ const bool HasSingleFPPipeline =
+ Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
+ Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500;
+ return HasSingleFPPipeline ? 2 : 3;
+}
+
+// Peel every "base + constant" layer off Loc, leaving the innermost base in
+// Base and adding each constant to Offset. isConsecutiveLSLoc has to work
+// before all adds have been collapsed, so chains of adds are looked through.
+// Base and Offset are left untouched when Loc is not of that form.
+static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
+ int64_t& Offset, SelectionDAG &DAG) {
+ while (DAG.isBaseWithConstantOffset(Loc)) {
+ Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+ Base = Loc.getOperand(0);
+
+ // The base may itself be a base plus an offset; keep unwrapping.
+ Loc = Base;
+ }
+}
+
+// Return true if the address Loc, accessed with memory type VT, lies exactly
+// Dist * Bytes bytes away from the address accessed by Base. VT must be
+// Bytes bytes wide. Three address shapes are recognised: a pair of frame
+// indices, a common base plus constant offsets, and a common global plus
+// constant offsets.
+static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
+ unsigned Bytes, int Dist,
+ SelectionDAG &DAG) {
+ if (VT.getSizeInBits() / 8 != Bytes)
+ return false;
+
+ // Compute the expected displacement in signed 64-bit arithmetic. Writing
+ // Dist*Bytes directly converts Dist to unsigned, so a negative Dist would
+ // wrap to a large positive 32-bit value before being added to the 64-bit
+ // offsets below, and the comparison could never succeed.
+ const int64_t ByteDist =
+ static_cast<int64_t>(Dist) * static_cast<int64_t>(Bytes);
+
+ SDValue BaseLoc = Base->getBasePtr();
+ if (Loc.getOpcode() == ISD::FrameIndex) {
+ if (BaseLoc.getOpcode() != ISD::FrameIndex)
+ return false;
+ const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
+ int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+ int FS = MFI.getObjectSize(FI);
+ int BFS = MFI.getObjectSize(BFI);
+ // Both stack objects must be exactly one access wide.
+ if (FS != BFS || FS != (int)Bytes) return false;
+ return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + ByteDist);
+ }
+
+ // Same base register/value with constant offsets that differ by ByteDist.
+ SDValue Base1 = Loc, Base2 = BaseLoc;
+ int64_t Offset1 = 0, Offset2 = 0;
+ getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
+ getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
+ if (Base1 == Base2 && Offset1 == (Offset2 + ByteDist))
+ return true;
+
+ // Same global with constant offsets that differ by ByteDist.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const GlobalValue *GV1 = nullptr;
+ const GlobalValue *GV2 = nullptr;
+ Offset1 = 0;
+ Offset2 = 0;
+ bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+ bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+ if (isGA1 && isGA2 && GV1 == GV2)
+ return Offset1 == (Offset2 + ByteDist);
+ return false;
+}
+
+// Counterpart of SelectionDAG::isConsecutiveLoad that also accepts stores and
+// the PPC load/store intrinsics listed below, and that does not require the
+// chain operands to be equal. Returns true when N accesses the location
+// Dist * Bytes bytes away from the one accessed by Base.
+static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
+ unsigned Bytes, int Dist,
+ SelectionDAG &DAG) {
+ // Ordinary loads and stores carry their memory type and address directly.
+ if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N))
+ return isConsecutiveLSLoc(LS->getBasePtr(), LS->getMemoryVT(), Base, Bytes,
+ Dist, DAG);
+
+ const unsigned Opc = N->getOpcode();
+ if (Opc != ISD::INTRINSIC_W_CHAIN && Opc != ISD::INTRINSIC_VOID)
+ return false;
+
+ // For intrinsic nodes operand 1 is the intrinsic ID. The memory type is
+ // derived from the ID; the address is operand 2 for the chained (load)
+ // intrinsics and operand 3 for the void (store) intrinsics.
+ const uint64_t IntrinsicID =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ EVT MemVT;
+ unsigned AddrIdx;
+
+ if (Opc == ISD::INTRINSIC_W_CHAIN) {
+ AddrIdx = 2;
+ switch (IntrinsicID) {
+ case Intrinsic::ppc_qpx_qvlfd:
+ case Intrinsic::ppc_qpx_qvlfda:
+ MemVT = MVT::v4f64;
+ break;
+ case Intrinsic::ppc_qpx_qvlfs:
+ case Intrinsic::ppc_qpx_qvlfsa:
+ MemVT = MVT::v4f32;
+ break;
+ case Intrinsic::ppc_qpx_qvlfcd:
+ case Intrinsic::ppc_qpx_qvlfcda:
+ case Intrinsic::ppc_vsx_lxvd2x:
+ case Intrinsic::ppc_vsx_lxvd2x_be:
+ MemVT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_qpx_qvlfcs:
+ case Intrinsic::ppc_qpx_qvlfcsa:
+ MemVT = MVT::v2f32;
+ break;
+ case Intrinsic::ppc_qpx_qvlfiwa:
+ case Intrinsic::ppc_qpx_qvlfiwz:
+ case Intrinsic::ppc_altivec_lvx:
+ case Intrinsic::ppc_altivec_lvxl:
+ case Intrinsic::ppc_vsx_lxvw4x:
+ case Intrinsic::ppc_vsx_lxvw4x_be:
+ MemVT = MVT::v4i32;
+ break;
+ case Intrinsic::ppc_altivec_lvebx:
+ MemVT = MVT::i8;
+ break;
+ case Intrinsic::ppc_altivec_lvehx:
+ MemVT = MVT::i16;
+ break;
+ case Intrinsic::ppc_altivec_lvewx:
+ MemVT = MVT::i32;
+ break;
+ default:
+ return false;
+ }
+ } else {
+ AddrIdx = 3;
+ switch (IntrinsicID) {
+ case Intrinsic::ppc_qpx_qvstfd:
+ case Intrinsic::ppc_qpx_qvstfda:
+ MemVT = MVT::v4f64;
+ break;
+ case Intrinsic::ppc_qpx_qvstfs:
+ case Intrinsic::ppc_qpx_qvstfsa:
+ MemVT = MVT::v4f32;
+ break;
+ case Intrinsic::ppc_qpx_qvstfcd:
+ case Intrinsic::ppc_qpx_qvstfcda:
+ case Intrinsic::ppc_vsx_stxvd2x:
+ case Intrinsic::ppc_vsx_stxvd2x_be:
+ MemVT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_qpx_qvstfcs:
+ case Intrinsic::ppc_qpx_qvstfcsa:
+ MemVT = MVT::v2f32;
+ break;
+ case Intrinsic::ppc_qpx_qvstfiw:
+ case Intrinsic::ppc_qpx_qvstfiwa:
+ case Intrinsic::ppc_altivec_stvx:
+ case Intrinsic::ppc_altivec_stvxl:
+ case Intrinsic::ppc_vsx_stxvw4x:
+ case Intrinsic::ppc_vsx_stxvw4x_be:
+ MemVT = MVT::v4i32;
+ break;
+ case Intrinsic::ppc_altivec_stvebx:
+ MemVT = MVT::i8;
+ break;
+ case Intrinsic::ppc_altivec_stvehx:
+ MemVT = MVT::i16;
+ break;
+ case Intrinsic::ppc_altivec_stvewx:
+ MemVT = MVT::i32;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ return isConsecutiveLSLoc(N->getOperand(AddrIdx), MemVT, Base, Bytes, Dist,
+ DAG);
+}
+
+// Return true if there is a nearby memory access that isConsecutiveLS reports
+// as consecutive to the load provided (regardless of alignment). We search up
+// and down the chain, looking through token factors and other memory nodes
+// (but nothing else). As a result, a true result indicates that it is safe to
+// create a new consecutive load adjacent to the load provided.
+static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
+ const EVT MemVT = LD->getMemoryVT();
+
+ SmallSet<SDNode *, 16> LoadRoots;
+ SmallSet<SDNode *, 16> Visited;
+ SmallVector<SDNode *, 8> Worklist;
+ Worklist.push_back(LD->getChain().getNode());
+
+ // Phase 1: walk up the chain, fanning out over every token-factor operand.
+ // A consecutive access ends the search immediately; otherwise remember each
+ // node sitting just above the topmost memory nodes and token factors.
+ while (!Worklist.empty()) {
+ SDNode *Cur = Worklist.pop_back_val();
+ if (!Visited.insert(Cur).second)
+ continue;
+
+ if (MemSDNode *Mem = dyn_cast<MemSDNode>(Cur)) {
+ if (isConsecutiveLS(Mem, LD, MemVT.getStoreSize(), 1, DAG))
+ return true;
+
+ SDNode *Above = Mem->getChain().getNode();
+ if (!Visited.count(Above))
+ Worklist.push_back(Above);
+ continue;
+ }
+
+ if (Cur->getOpcode() != ISD::TokenFactor) {
+ LoadRoots.insert(Cur);
+ continue;
+ }
+
+ for (const SDUse &Op : Cur->ops())
+ if (!Visited.count(Op.getNode()))
+ Worklist.push_back(Op.getNode());
+ }
+
+ // Phase 2: walk back down from the roots recorded above. Starting with
+ // their users, follow memory nodes (through their chain operand only) and
+ // token factors, looking for a consecutive access.
+ Visited.clear();
+ Worklist.clear();
+
+ for (SDNode *Root : LoadRoots) {
+ Worklist.push_back(Root);
+
+ while (!Worklist.empty()) {
+ SDNode *Cur = Worklist.pop_back_val();
+ if (!Visited.insert(Cur).second)
+ continue;
+
+ if (MemSDNode *Mem = dyn_cast<MemSDNode>(Cur))
+ if (isConsecutiveLS(Mem, LD, MemVT.getStoreSize(), 1, DAG))
+ return true;
+
+ for (SDNode::use_iterator UI = Cur->use_begin(), UE = Cur->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (Visited.count(User))
+ continue;
+
+ // Only follow a memory node when Cur is its chain, not its address or
+ // stored value.
+ const MemSDNode *UserMem = dyn_cast<MemSDNode>(User);
+ const bool ChainedMem =
+ UserMem && UserMem->getChain().getNode() == Cur;
+ if (ChainedMem || User->getOpcode() == ISD::TokenFactor)
+ Worklist.push_back(User);
+ }
+ }
+ }
+
+ return false;
+}
+
+
+/// Emit the subtraction-based replacement for the SETCC node \p N once the
+/// caller has established that the rewrite is valid, so that the comparison
+/// result lives in a GPR rather than a CR bit. \p Size is the bit width used
+/// for the shift amount, \p Swap exchanges the two compared operands before
+/// subtracting, and \p Complement inverts the low result bit. The returned
+/// value is an i1 truncation of the computed i64.
+static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
+ bool Swap, SDLoc &DL, SelectionDAG &DAG) {
+
+ assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
+
+ // Widen both compared values to i64 with zero extension; the caller only
+ // requests this for operands that are strictly narrower.
+ // NOTE(review): ZERO_EXTEND is built here with an extra constant operand
+ // carrying Size; confirm that the second operand is intended.
+ SDValue LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),
+ DAG.getConstant(Size, DL, MVT::i32));
+ SDValue RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),
+ DAG.getConstant(Size, DL, MVT::i32));
+
+ // Which operand is the minuend depends on the condition code.
+ if (Swap)
+ std::swap(LHS, RHS);
+
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, MVT::i64, LHS, RHS);
+
+ // Shift the sign bit of the difference down to bit 0, clearing everything
+ // else; bit 0 now holds the outcome of the original comparison.
+ SDValue Result = DAG.getNode(ISD::SRL, DL, MVT::i64, Diff,
+ DAG.getConstant(Size - 1, DL, MVT::i32));
+
+ // Some condition codes need the opposite answer.
+ if (Complement)
+ Result = DAG.getNode(ISD::XOR, DL, MVT::i64, Result,
+ DAG.getConstant(1, DL, MVT::i64));
+
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Result);
+}
+
+/// Try to replace the unsigned SETCC node \p N by an equivalent subtraction
+/// (see generateEquivalentSub). This is attempted only after vector-op
+/// legalization, only when every user of \p N is a ZERO_EXTEND, and only when
+/// the compared operands are narrower than the largest legal integer type.
+/// Returns an empty SDValue when the rewrite does not apply.
+SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
+
+ // The analysis below hinges on the operand width, so wait until all types
+ // are legal.
+ if (!DCI.isAfterLegalizeVectorOps())
+ return SDValue();
+
+ // The rewrite is only applied when every user zero-extends the result.
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ const unsigned LegalBits =
+ DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
+ if (N->getOperand(0).getValueSizeInBits() >= LegalBits)
+ return SDValue();
+
+ // Map each unsigned predicate onto the (Complement, Swap) pair expected by
+ // generateEquivalentSub; any other predicate is left alone.
+ bool Complement = false;
+ bool Swap = false;
+ switch (CC) {
+ case ISD::SETULT:
+ break;
+ case ISD::SETULE:
+ Complement = true;
+ Swap = true;
+ break;
+ case ISD::SETUGT:
+ Swap = true;
+ break;
+ case ISD::SETUGE:
+ Complement = true;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ return generateEquivalentSub(N, LegalBits, Complement, Swap, DL, DAG);
+}
+
+/// Combine for a TRUNCATE-to-i1, SETCC or SELECT_CC node \p N whose i32/i64
+/// operand(s) are computed by a cluster of and/or/xor/select/select_cc (and
+/// intermediate trunc/ext) nodes fed only by extensions from i1 or by
+/// constants. When that cluster is self-contained, every node in it is
+/// rebuilt with an i1 result type so the values can stay in CR bits.
+/// Returns an empty SDValue when the pattern does not apply, the truncation's
+/// (now i1) operand when \p N is a TRUNCATE, and \p N itself otherwise. For an
+/// unsigned SETCC whose high bits are not known zero, the work is delegated
+/// to ConvertSETCCToSubtract instead.
+SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+
+ assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
+ // If we're tracking CR bits, we need to be careful that we don't have:
+ // trunc(binary-ops(zext(x), zext(y)))
+ // or
+ // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
+ // such that we're unnecessarily moving things into GPRs when it would be
+ // better to keep them in CR bits.
+
+ // Note that trunc here can be an actual i1 trunc, or can be the effective
+ // truncation that comes from a setcc or select_cc.
+ if (N->getOpcode() == ISD::TRUNCATE &&
+ N->getValueType(0) != MVT::i1)
+ return SDValue();
+
+ // Only i32 and i64 source operands are handled.
+ if (N->getOperand(0).getValueType() != MVT::i32 &&
+ N->getOperand(0).getValueType() != MVT::i64)
+ return SDValue();
+
+ if (N->getOpcode() == ISD::SETCC ||
+ N->getOpcode() == ISD::SELECT_CC) {
+ // If we're looking at a comparison, then we need to make sure that the
+ // high bits (all except for the first) don't matter the result.
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(N->getOperand(
+ N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
+ unsigned OpBits = N->getOperand(0).getValueSizeInBits();
+
+ if (ISD::isSignedIntSetCC(CC)) {
+ // Signed compare: both operands must be pure sign-bit replications.
+ if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
+ DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
+ return SDValue();
+ } else if (ISD::isUnsignedIntSetCC(CC)) {
+ // Unsigned compare: all bits above bit 0 must be known zero. If not,
+ // a SETCC may still be rewritten as a subtraction.
+ if (!DAG.MaskedValueIsZero(N->getOperand(0),
+ APInt::getHighBitsSet(OpBits, OpBits-1)) ||
+ !DAG.MaskedValueIsZero(N->getOperand(1),
+ APInt::getHighBitsSet(OpBits, OpBits-1)))
+ return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
+ : SDValue());
+ } else {
+ // This is neither a signed nor an unsigned comparison, just make sure
+ // that the high bits are equal.
+ APInt Op1Zero, Op1One;
+ APInt Op2Zero, Op2One;
+ DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One);
+ DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One);
+
+ // We don't really care about what is known about the first bit (if
+ // anything), so clear it in all masks prior to comparing them.
+ Op1Zero.clearBit(0); Op1One.clearBit(0);
+ Op2Zero.clearBit(0); Op2One.clearBit(0);
+
+ if (Op1Zero != Op2Zero || Op1One != Op2One)
+ return SDValue();
+ }
+ }
+
+ // We now know that the higher-order bits are irrelevant, we just need to
+ // make sure that all of the intermediate operations are bit operations, and
+ // all inputs are extensions.
+ if (N->getOperand(0).getOpcode() != ISD::AND &&
+ N->getOperand(0).getOpcode() != ISD::OR &&
+ N->getOperand(0).getOpcode() != ISD::XOR &&
+ N->getOperand(0).getOpcode() != ISD::SELECT &&
+ N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
+ N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
+ N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
+ N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
+ return SDValue();
+
+ // For comparisons the second operand has to satisfy the same requirement.
+ if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
+ N->getOperand(1).getOpcode() != ISD::AND &&
+ N->getOperand(1).getOpcode() != ISD::OR &&
+ N->getOperand(1).getOpcode() != ISD::XOR &&
+ N->getOperand(1).getOpcode() != ISD::SELECT &&
+ N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
+ N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
+ N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
+ N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
+ return SDValue();
+
+ // Inputs: leaves of the cluster (extensions from i1, or constants).
+ // BinOps: worklist of interior nodes still to be examined.
+ // PromOps: interior nodes that will be rebuilt with an i1 result.
+ SmallVector<SDValue, 4> Inputs;
+ SmallVector<SDValue, 8> BinOps, PromOps;
+ SmallPtrSet<SDNode *, 16> Visited;
+
+ // Seed the worklist from N's operand(s): one for TRUNCATE, two for a
+ // comparison.
+ for (unsigned i = 0; i < 2; ++i) {
+ if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
+ N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
+ isa<ConstantSDNode>(N->getOperand(i)))
+ Inputs.push_back(N->getOperand(i));
+ else
+ BinOps.push_back(N->getOperand(i));
+
+ if (N->getOpcode() == ISD::TRUNCATE)
+ break;
+ }
+
+ // Visit all inputs, collect all binary operations (and, or, xor and
+ // select) that are all fed by extensions.
+ while (!BinOps.empty()) {
+ SDValue BinOp = BinOps.back();
+ BinOps.pop_back();
+
+ if (!Visited.insert(BinOp.getNode()).second)
+ continue;
+
+ PromOps.push_back(BinOp);
+
+ for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
+ // The condition of the select is not promoted.
+ if (BinOp.getOpcode() == ISD::SELECT && i == 0)
+ continue;
+ // For SELECT_CC only the two value operands (2 and 3) are promoted.
+ if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
+ continue;
+
+ if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+ BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+ BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
+ BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
+ isa<ConstantSDNode>(BinOp.getOperand(i))) {
+ Inputs.push_back(BinOp.getOperand(i));
+ } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
+ BinOp.getOperand(i).getOpcode() == ISD::OR ||
+ BinOp.getOperand(i).getOpcode() == ISD::XOR ||
+ BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
+ BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
+ BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
+ BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+ BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+ BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
+ BinOps.push_back(BinOp.getOperand(i));
+ } else {
+ // We have an input that is not an extension or another binary
+ // operation; we'll abort this transformation.
+ return SDValue();
+ }
+ }
+ }
+
+ // Make sure that this is a self-contained cluster of operations (which
+ // is not quite the same thing as saying that everything has only one
+ // use).
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+
+ for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
+ UE = Inputs[i].getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ // Any user outside the cluster (other than N) blocks the rewrite.
+ if (User != N && !Visited.count(User))
+ return SDValue();
+
+ // Make sure that we're not going to promote the non-output-value
+ // operand(s) or SELECT or SELECT_CC.
+ // FIXME: Although we could sometimes handle this, and it does occur in
+ // practice that one of the condition inputs to the select is also one of
+ // the outputs, we currently can't deal with this.
+ if (User->getOpcode() == ISD::SELECT) {
+ if (User->getOperand(0) == Inputs[i])
+ return SDValue();
+ } else if (User->getOpcode() == ISD::SELECT_CC) {
+ if (User->getOperand(0) == Inputs[i] ||
+ User->getOperand(1) == Inputs[i])
+ return SDValue();
+ }
+ }
+ }
+
+ // Apply the same self-containment check to the interior nodes.
+ for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
+ for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
+ UE = PromOps[i].getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User != N && !Visited.count(User))
+ return SDValue();
+
+ // Make sure that we're not going to promote the non-output-value
+ // operand(s) or SELECT or SELECT_CC.
+ // FIXME: Although we could sometimes handle this, and it does occur in
+ // practice that one of the condition inputs to the select is also one of
+ // the outputs, we currently can't deal with this.
+ if (User->getOpcode() == ISD::SELECT) {
+ if (User->getOperand(0) == PromOps[i])
+ return SDValue();
+ } else if (User->getOpcode() == ISD::SELECT_CC) {
+ if (User->getOperand(0) == PromOps[i] ||
+ User->getOperand(1) == PromOps[i])
+ return SDValue();
+ }
+ }
+ }
+
+ // Replace all inputs with the extension operand.
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ // Constants may have users outside the cluster of to-be-promoted nodes,
+ // and so we need to replace those as we do the promotions.
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+ else
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
+ }
+
+ // Hold the to-be-promoted values through HandleSDNode rather than as raw
+ // SDValues while the replacements below rewrite the DAG.
+ std::list<HandleSDNode> PromOpHandles;
+ for (auto &PromOp : PromOps)
+ PromOpHandles.emplace_back(PromOp);
+
+ // Replace all operations (these are all the same, but have a different
+ // (i1) return type). DAG.getNode will validate that the types of
+ // a binary operator match, so go through the list in reverse so that
+ // we've likely promoted both operands first. Any intermediate truncations or
+ // extensions disappear.
+ while (!PromOpHandles.empty()) {
+ SDValue PromOp = PromOpHandles.back().getValue();
+ PromOpHandles.pop_back();
+
+ if (PromOp.getOpcode() == ISD::TRUNCATE ||
+ PromOp.getOpcode() == ISD::SIGN_EXTEND ||
+ PromOp.getOpcode() == ISD::ZERO_EXTEND ||
+ PromOp.getOpcode() == ISD::ANY_EXTEND) {
+ if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
+ PromOp.getOperand(0).getValueType() != MVT::i1) {
+ // The operand is not yet ready (see comment below).
+ PromOpHandles.emplace_front(PromOp);
+ continue;
+ }
+
+ // The trunc/ext itself vanishes: its users now consume its (i1)
+ // operand, or an i1 truncation of a constant operand.
+ SDValue RepValue = PromOp.getOperand(0);
+ if (isa<ConstantSDNode>(RepValue))
+ RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);
+
+ DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
+ continue;
+ }
+
+ // C is the index of the first promoted value operand: 0 for and/or/xor,
+ // 1 for SELECT, 2 for SELECT_CC.
+ unsigned C;
+ switch (PromOp.getOpcode()) {
+ default: C = 0; break;
+ case ISD::SELECT: C = 1; break;
+ case ISD::SELECT_CC: C = 2; break;
+ }
+
+ if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
+ PromOp.getOperand(C).getValueType() != MVT::i1) ||
+ (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
+ PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
+ // The to-be-promoted operands of this node have not yet been
+ // promoted (this should be rare because we're going through the
+ // list backward, but if one of the operands has several users in
+ // this cluster of to-be-promoted nodes, it is possible).
+ PromOpHandles.emplace_front(PromOp);
+ continue;
+ }
+
+ SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
+ PromOp.getNode()->op_end());
+
+ // If there are any constant inputs, make sure they're replaced now.
+ for (unsigned i = 0; i < 2; ++i)
+ if (isa<ConstantSDNode>(Ops[C+i]))
+ Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);
+
+ DAG.ReplaceAllUsesOfValueWith(PromOp,
+ DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
+ }
+
+ // Now we're left with the initial truncation itself.
+ if (N->getOpcode() == ISD::TRUNCATE)
+ return N->getOperand(0);
+
+ // Otherwise, this is a comparison. The operands to be compared have just
+ // changed type (to i1), but everything else is the same.
+ return SDValue(N, 0);
+}
+
+/// Combine for an extension node \p N (PerformDAGCombine routes SIGN_EXTEND,
+/// ZERO_EXTEND and ANY_EXTEND here) whose operand is a tree of
+/// AND/OR/XOR/SELECT/SELECT_CC nodes fed only by TRUNCATE nodes and constants.
+/// When that tree is self-contained, it is rebuilt in N's (wider) result type
+/// so the truncate/extend pair disappears.
+///
+/// \returns an empty SDValue when the pattern does not match; otherwise the
+/// promoted operand, wrapped in an AND mask (zero-extend) or an SHL/SRA pair
+/// (sign-extend) when the inputs are not known to be suitably extended.
+SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+
+ // If we're tracking CR bits, we need to be careful that we don't have:
+ // zext(binary-ops(trunc(x), trunc(y)))
+ // or
+ // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
+ // such that we're unnecessarily moving things into CR bits that can more
+ // efficiently stay in GPRs. Note that if we're not certain that the high
+ // bits are set as required by the final extension, we still may need to do
+ // some masking to get the proper behavior.
+
+ // This same functionality is important on PPC64 when dealing with
+ // 32-to-64-bit extensions; these occur often when 32-bit values are used as
+ // the return values of functions. Because it is so similar, it is handled
+ // here as well.
+
+ // Only extensions producing i32 or i64 are handled.
+ if (N->getValueType(0) != MVT::i32 &&
+ N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ // The extended value must be an i1 (with CR-bit tracking enabled) or an i32
+ // on a 64-bit subtarget.
+ if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
+ (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
+ return SDValue();
+
+ // The root of the cluster must be one of the operations we know how to
+ // promote.
+ if (N->getOperand(0).getOpcode() != ISD::AND &&
+ N->getOperand(0).getOpcode() != ISD::OR &&
+ N->getOperand(0).getOpcode() != ISD::XOR &&
+ N->getOperand(0).getOpcode() != ISD::SELECT &&
+ N->getOperand(0).getOpcode() != ISD::SELECT_CC)
+ return SDValue();
+
+ // Inputs: the leaves of the cluster (TRUNCATE nodes and constants).
+ // BinOps: worklist of operations still to visit, seeded with N's operand.
+ // PromOps: every operation in the cluster, in visitation order.
+ SmallVector<SDValue, 4> Inputs;
+ SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
+ SmallPtrSet<SDNode *, 16> Visited;
+
+ // Visit all inputs, collect all binary operations (and, or, xor and
+ // select) that are all fed by truncations.
+ while (!BinOps.empty()) {
+ SDValue BinOp = BinOps.back();
+ BinOps.pop_back();
+
+ // Skip nodes that were already reached through another path.
+ if (!Visited.insert(BinOp.getNode()).second)
+ continue;
+
+ PromOps.push_back(BinOp);
+
+ for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
+ // The condition of the select is not promoted.
+ if (BinOp.getOpcode() == ISD::SELECT && i == 0)
+ continue;
+ // For SELECT_CC only operands 2 and 3 (the selected values) are walked.
+ if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
+ continue;
+
+ if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
+ isa<ConstantSDNode>(BinOp.getOperand(i))) {
+ Inputs.push_back(BinOp.getOperand(i));
+ } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
+ BinOp.getOperand(i).getOpcode() == ISD::OR ||
+ BinOp.getOperand(i).getOpcode() == ISD::XOR ||
+ BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
+ BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
+ BinOps.push_back(BinOp.getOperand(i));
+ } else {
+ // We have an input that is not a truncation or another binary
+ // operation; we'll abort this transformation.
+ return SDValue();
+ }
+ }
+ }
+
+ // The operands of a select that must be truncated when the select is
+ // promoted because the operand is actually part of the to-be-promoted set.
+ // Index 0/1 is the operand number on the select node; the mapped value is
+ // that operand's original (pre-promotion) type.
+ DenseMap<SDNode *, EVT> SelectTruncOp[2];
+
+ // Make sure that this is a self-contained cluster of operations (which
+ // is not quite the same thing as saying that everything has only one
+ // use).
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+
+ for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
+ UE = Inputs[i].getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ // Any user outside the cluster (other than N itself) blocks the combine.
+ if (User != N && !Visited.count(User))
+ return SDValue();
+
+ // If we're going to promote the non-output-value operand(s) of SELECT or
+ // SELECT_CC, record them for truncation.
+ if (User->getOpcode() == ISD::SELECT) {
+ if (User->getOperand(0) == Inputs[i])
+ SelectTruncOp[0].insert(std::make_pair(User,
+ User->getOperand(0).getValueType()));
+ } else if (User->getOpcode() == ISD::SELECT_CC) {
+ if (User->getOperand(0) == Inputs[i])
+ SelectTruncOp[0].insert(std::make_pair(User,
+ User->getOperand(0).getValueType()));
+ if (User->getOperand(1) == Inputs[i])
+ SelectTruncOp[1].insert(std::make_pair(User,
+ User->getOperand(1).getValueType()));
+ }
+ }
+ }
+
+ // Same self-containment check, this time for the interior operations.
+ for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
+ for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
+ UE = PromOps[i].getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User != N && !Visited.count(User))
+ return SDValue();
+
+ // If we're going to promote the non-output-value operand(s) of SELECT or
+ // SELECT_CC, record them for truncation.
+ if (User->getOpcode() == ISD::SELECT) {
+ if (User->getOperand(0) == PromOps[i])
+ SelectTruncOp[0].insert(std::make_pair(User,
+ User->getOperand(0).getValueType()));
+ } else if (User->getOpcode() == ISD::SELECT_CC) {
+ if (User->getOperand(0) == PromOps[i])
+ SelectTruncOp[0].insert(std::make_pair(User,
+ User->getOperand(0).getValueType()));
+ if (User->getOperand(1) == PromOps[i])
+ SelectTruncOp[1].insert(std::make_pair(User,
+ User->getOperand(1).getValueType()));
+ }
+ }
+ }
+
+ // PromBits is the width of the narrow value being extended by N.
+ unsigned PromBits = N->getOperand(0).getValueSizeInBits();
+ bool ReallyNeedsExt = false;
+ if (N->getOpcode() != ISD::ANY_EXTEND) {
+ // If all of the inputs are not already sign/zero extended, then
+ // we'll still need to do that at the end.
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+
+ unsigned OpBits =
+ Inputs[i].getOperand(0).getValueSizeInBits();
+ assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
+
+ // Zero-extend: the bits above PromBits in the truncation source must be
+ // known zero. Sign-extend: the source must have enough sign bits.
+ if ((N->getOpcode() == ISD::ZERO_EXTEND &&
+ !DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
+ APInt::getHighBitsSet(OpBits,
+ OpBits-PromBits))) ||
+ (N->getOpcode() == ISD::SIGN_EXTEND &&
+ DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
+ (OpBits-(PromBits-1)))) {
+ ReallyNeedsExt = true;
+ break;
+ }
+ }
+ }
+
+ // Replace all inputs, either with the truncation operand, or a
+ // truncation or extension to the final output type.
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ // Constant inputs need to be replaced with the to-be-promoted nodes that
+ // use them because they might have users outside of the cluster of
+ // promoted nodes.
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+
+ SDValue InSrc = Inputs[i].getOperand(0);
+ if (Inputs[i].getValueType() == N->getValueType(0))
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
+ else if (N->getOpcode() == ISD::SIGN_EXTEND)
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+ DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
+ else if (N->getOpcode() == ISD::ZERO_EXTEND)
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+ DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
+ else
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+ DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
+ }
+
+ // NOTE(review): the operations are wrapped in HandleSDNode, presumably so
+ // the entries keep tracking the right value across the
+ // ReplaceAllUsesOfValueWith calls below -- confirm against HandleSDNode docs.
+ std::list<HandleSDNode> PromOpHandles;
+ for (auto &PromOp : PromOps)
+ PromOpHandles.emplace_back(PromOp);
+
+ // Replace all operations (these are all the same, but have a different
+ // (promoted) return type). DAG.getNode will validate that the types of
+ // a binary operator match, so go through the list in reverse so that
+ // we've likely promoted both operands first.
+ while (!PromOpHandles.empty()) {
+ SDValue PromOp = PromOpHandles.back().getValue();
+ PromOpHandles.pop_back();
+
+ // C is the index of the first of the two value operands to promote:
+ // 0 for and/or/xor, 1 for SELECT, 2 for SELECT_CC.
+ unsigned C;
+ switch (PromOp.getOpcode()) {
+ default: C = 0; break;
+ case ISD::SELECT: C = 1; break;
+ case ISD::SELECT_CC: C = 2; break;
+ }
+
+ if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
+ PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
+ (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
+ PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
+ // The to-be-promoted operands of this node have not yet been
+ // promoted (this should be rare because we're going through the
+ // list backward, but if one of the operands has several users in
+ // this cluster of to-be-promoted nodes, it is possible).
+ PromOpHandles.emplace_front(PromOp);
+ continue;
+ }
+
+ // For SELECT and SELECT_CC nodes, we do a similar check for any
+ // to-be-promoted comparison inputs.
+ if (PromOp.getOpcode() == ISD::SELECT ||
+ PromOp.getOpcode() == ISD::SELECT_CC) {
+ if ((SelectTruncOp[0].count(PromOp.getNode()) &&
+ PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
+ (SelectTruncOp[1].count(PromOp.getNode()) &&
+ PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
+ PromOpHandles.emplace_front(PromOp);
+ continue;
+ }
+ }
+
+ SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
+ PromOp.getNode()->op_end());
+
+ // If this node has constant inputs, then they'll need to be promoted here.
+ for (unsigned i = 0; i < 2; ++i) {
+ if (!isa<ConstantSDNode>(Ops[C+i]))
+ continue;
+ if (Ops[C+i].getValueType() == N->getValueType(0))
+ continue;
+
+ if (N->getOpcode() == ISD::SIGN_EXTEND)
+ Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+ else if (N->getOpcode() == ISD::ZERO_EXTEND)
+ Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+ else
+ Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+ }
+
+ // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
+ // truncate them again to the original value type.
+ if (PromOp.getOpcode() == ISD::SELECT ||
+ PromOp.getOpcode() == ISD::SELECT_CC) {
+ auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
+ if (SI0 != SelectTruncOp[0].end())
+ Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
+ auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
+ if (SI1 != SelectTruncOp[1].end())
+ Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
+ }
+
+ // Rebuild the operation with the same opcode but N's (promoted) type.
+ DAG.ReplaceAllUsesOfValueWith(PromOp,
+ DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
+ }
+
+ // Now we're left with the initial extension itself.
+ if (!ReallyNeedsExt)
+ return N->getOperand(0);
+
+ // To zero extend, just mask off everything except for the first bit (in the
+ // i1 case).
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
+ DAG.getConstant(APInt::getLowBitsSet(
+ N->getValueSizeInBits(0), PromBits),
+ dl, N->getValueType(0)));
+
+ assert(N->getOpcode() == ISD::SIGN_EXTEND &&
+ "Invalid extension type");
+ // Sign extend by shifting the low PromBits to the top and arithmetically
+ // shifting them back down.
+ EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
+ SDValue ShiftCst =
+ DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
+ return DAG.getNode(
+ ISD::SRA, dl, N->getValueType(0),
+ DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
+ ShiftCst);
+}
+
+/// \brief Reduces the number of fp-to-int conversion when building a vector.
+///
+/// If this vector is built out of floating to integer conversions,
+/// transform it to a vector built out of floating point values followed by a
+/// single floating to integer conversion of the vector.
+/// Namely (build_vector (fptosi $A), (fptosi $B), ...)
+/// becomes (fptosi (build_vector ($A, $B, ...)))
+///
+/// For conversions to 32-bit integers the scalar sources are narrowed to f32
+/// with an FP_ROUND that is flagged as value-preserving. That is only sound
+/// when the source cannot carry extra precision, so the combine is restricted
+/// to sources that are f64 extending loads.
+///
+/// \param N the BUILD_VECTOR node; operand 0 must be a PPCISD::MFVSR.
+/// \returns the vector conversion node, or an empty SDValue when the pattern
+/// does not match (or the vector is a splat, which is left untouched).
+SDValue PPCTargetLowering::
+combineElementTruncationToVectorTruncation(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         "Should be called with a BUILD_VECTOR node");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  SDValue FirstInput = N->getOperand(0);
+  assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
+         "The input operand must be an fp-to-int conversion.");
+
+  // This combine happens after legalization so the fp_to_[su]i nodes are
+  // already converted to PPCISD nodes.
+  unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
+  if (FirstConversion != PPCISD::FCTIDZ &&
+      FirstConversion != PPCISD::FCTIDUZ &&
+      FirstConversion != PPCISD::FCTIWZ &&
+      FirstConversion != PPCISD::FCTIWUZ)
+    return SDValue();
+
+  bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
+                 FirstConversion == PPCISD::FCTIWUZ;
+  EVT TargetVT = N->getValueType(0);
+
+  // True when V is an extending load that yields an f64, i.e. a value that
+  // holds no more precision than its (narrower) in-memory type.
+  auto IsExtLoadToF64 = [](SDValue V) {
+    LoadSDNode *LD = dyn_cast<LoadSDNode>(V.getNode());
+    return LD && LD->getExtensionType() == ISD::EXTLOAD &&
+           V.getValueType() == MVT::f64;
+  };
+
+  // Every element must be the same kind of conversion moved out of a VSR.
+  bool IsSplat = true;
+  for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
+    SDValue NextOp = N->getOperand(i);
+    if (NextOp.getOpcode() != PPCISD::MFVSR)
+      return SDValue();
+    unsigned NextConversion = NextOp.getOperand(0).getOpcode();
+    if (NextConversion != FirstConversion)
+      return SDValue();
+    // Converting to 32-bit integers requires an FP_ROUND of the source to
+    // f32. Rounding an arbitrary double first could change the integer
+    // result, so only accept sources known to fit in the narrower type.
+    if (Is32Bit && !IsExtLoadToF64(NextOp.getOperand(0).getOperand(0)))
+      return SDValue();
+    if (NextOp != FirstInput)
+      IsSplat = false;
+  }
+
+  // If this is a splat, we leave it as-is since there will be only a single
+  // fp-to-int conversion followed by a splat of the integer. This is better
+  // for 32-bit and smaller ints and neutral for 64-bit ints.
+  if (IsSplat)
+    return SDValue();
+
+  // Now that we know we have the right type of node, get its operands.
+  // Each In below is one of the FCTI* nodes validated above, so it can never
+  // be an UNDEF node and needs no special handling.
+  SmallVector<SDValue, 4> Ops;
+  for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
+    SDValue In = N->getOperand(i).getOperand(0);
+    if (Is32Bit) {
+      // For 32-bit values, we need to add an FP_ROUND node. The flag of 1
+      // states the rounding does not change the value, which holds because
+      // the source was checked to be an extending load.
+      SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl,
+                                  MVT::f32, In.getOperand(0),
+                                  DAG.getIntPtrConstant(1, dl));
+      Ops.push_back(Trunc);
+    } else
+      Ops.push_back(In.getOperand(0));
+  }
+
+  // Signed conversions map to FP_TO_SINT, unsigned ones to FP_TO_UINT.
+  unsigned Opcode;
+  if (FirstConversion == PPCISD::FCTIDZ ||
+      FirstConversion == PPCISD::FCTIWZ)
+    Opcode = ISD::FP_TO_SINT;
+  else
+    Opcode = ISD::FP_TO_UINT;
+
+  EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
+  SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
+  return DAG.getNode(Opcode, dl, TargetVT, BV);
+}
+
+/// \brief Reduce the number of loads when building a vector.
+///
+/// Building a vector out of multiple loads can be converted to a load
+/// of the vector type if the loads are consecutive. If the loads are
+/// consecutive but in descending order, a shuffle is added at the end
+/// to reorder the vector.
+///
+/// The elements may either all be plain loads or all be fp_round(extload).
+///
+/// \param N the BUILD_VECTOR node to inspect.
+/// \returns the replacement vector load (or load + shuffle), or an empty
+/// SDValue when the operands are not consecutive loads.
+static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         "Should be called with a BUILD_VECTOR node");
+
+  SDLoc dl(N);
+
+  // With a single element there is nothing to merge, and the "consecutive"
+  // and "reverse consecutive" flags below would both trivially stay set.
+  if (N->getNumOperands() == 1)
+    return SDValue();
+
+  // Elements that are not a whole number of bytes cannot be laid out
+  // consecutively in memory (and would yield an element size of zero).
+  if (!N->getValueType(0).getVectorElementType().isByteSized())
+    return SDValue();
+
+  bool InputsAreConsecutiveLoads = true;
+  bool InputsAreReverseConsecutive = true;
+  unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8;
+  SDValue FirstInput = N->getOperand(0);
+  bool IsRoundOfExtLoad = false;
+
+  if (FirstInput.getOpcode() == ISD::FP_ROUND &&
+      FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
+    // The opcode was just checked, so this cast cannot fail.
+    LoadSDNode *LD = cast<LoadSDNode>(FirstInput.getOperand(0));
+    IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD;
+  }
+  // Not a build vector of (possibly fp_rounded) loads.
+  if (!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
+    // If any inputs are fp_round(extload), they all must be.
+    if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
+      return SDValue();
+
+    SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
+                                           N->getOperand(i);
+    if (NextInput.getOpcode() != ISD::LOAD)
+      return SDValue();
+
+    // The previous element was validated as a load either above (i == 1) or
+    // by the preceding iteration.
+    SDValue PreviousInput =
+      IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
+    LoadSDNode *LD1 = cast<LoadSDNode>(PreviousInput);
+    LoadSDNode *LD2 = cast<LoadSDNode>(NextInput);
+
+    // If any inputs are fp_round(extload), they all must be.
+    if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
+      return SDValue();
+
+    if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG))
+      InputsAreConsecutiveLoads = false;
+    if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG))
+      InputsAreReverseConsecutive = false;
+
+    // Exit early if the loads are neither consecutive nor reverse consecutive.
+    if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
+      return SDValue();
+  }
+
+  assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
+         "The loads cannot be both consecutive and reverse consecutive.");
+
+  if (InputsAreConsecutiveLoads) {
+    // Ascending order: a vector load from the first element's address.
+    SDValue FirstLoadOp =
+      IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput;
+    LoadSDNode *LD1 = cast<LoadSDNode>(FirstLoadOp);
+    return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(),
+                       LD1->getBasePtr(), LD1->getPointerInfo(),
+                       LD1->getAlignment());
+  }
+  if (InputsAreReverseConsecutive) {
+    // Descending order: load from the last element's address, then reverse
+    // the lanes with a shuffle.
+    SDValue LastLoadOp =
+      IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) :
+                         N->getOperand(N->getNumOperands()-1);
+    LoadSDNode *LDL = cast<LoadSDNode>(LastLoadOp);
+    SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(),
+                               LDL->getBasePtr(), LDL->getPointerInfo(),
+                               LDL->getAlignment());
+    SmallVector<int, 16> Ops;
+    for (int i = N->getNumOperands() - 1; i >= 0; i--)
+      Ops.push_back(i);
+
+    return DAG.getVectorShuffle(N->getValueType(0), dl, Load,
+                                DAG.getUNDEF(N->getValueType(0)), Ops);
+  }
+  return SDValue();
+}
+
+/// Target combine for BUILD_VECTOR nodes (VSX subtargets only). Tries, in
+/// order:
+///  1. build_vector of scalar fp-to-int conversions -> one vector conversion;
+///  2. build_vector of consecutive loads -> one vector load;
+///  3. v2f64 build_vector of [su]int_to_fp of elements {0,1} or {2,3}
+///     extracted from one v4i32 vector -> PPCISD::[SU]INT_VEC_TO_FP.
+///
+/// \returns the replacement value, or an empty SDValue if nothing applies.
+SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         "Should be called with a BUILD_VECTOR node");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  if (!Subtarget.hasVSX())
+    return SDValue();
+
+  // The target independent DAG combiner will leave a build_vector of
+  // float-to-int conversions intact. We can generate MUCH better code for
+  // a float-to-int conversion of a vector of floats.
+  SDValue FirstInput = N->getOperand(0);
+  if (FirstInput.getOpcode() == PPCISD::MFVSR) {
+    SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
+    if (Reduced)
+      return Reduced;
+  }
+
+  // If we're building a vector out of consecutive loads, just load that
+  // vector type.
+  SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
+  if (Reduced)
+    return Reduced;
+
+  if (N->getValueType(0) != MVT::v2f64)
+    return SDValue();
+
+  // Looking for:
+  // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
+  if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
+      FirstInput.getOpcode() != ISD::UINT_TO_FP)
+    return SDValue();
+  if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
+      N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
+    return SDValue();
+  // Both conversions must have the same signedness.
+  if (FirstInput.getOpcode() != N->getOperand(1).getOpcode())
+    return SDValue();
+
+  SDValue Ext1 = FirstInput.getOperand(0);
+  SDValue Ext2 = N->getOperand(1).getOperand(0);
+  if (Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
+  ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
+  if (!Ext1Op || !Ext2Op)
+    return SDValue();
+
+  // The extracted scalars must be i32 ...
+  if (Ext1.getValueType() != MVT::i32 ||
+      Ext2.getValueType() != MVT::i32)
+    return SDValue();
+  // ... and both must come out of the same v4i32 vector, because only that
+  // one vector is handed to the conversion node built below.
+  if (Ext1.getOperand(0).getValueType() != MVT::v4i32 ||
+      Ext1.getOperand(0) != Ext2.getOperand(0))
+    return SDValue();
+
+  // Elements {0,1} and {2,3} form the two halves of the source vector; which
+  // half index they map to depends on endianness.
+  int FirstElem = Ext1Op->getZExtValue();
+  int SecondElem = Ext2Op->getZExtValue();
+  int SubvecIdx;
+  if (FirstElem == 0 && SecondElem == 1)
+    SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
+  else if (FirstElem == 2 && SecondElem == 3)
+    SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
+  else
+    return SDValue();
+
+  SDValue SrcVec = Ext1.getOperand(0);
+  auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
+    PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
+  return DAG.getNode(NodeType, dl, MVT::v2f64,
+                     SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
+}
+
+/// Target combine for SINT_TO_FP / UINT_TO_FP nodes producing f32 or f64.
+///
+/// Two patterns are handled:
+///  - with P9 vector + P9 altivec support, a conversion whose integer operand
+///    is an i8/i16 load becomes LXSIZX (+ VEXTS when signed) feeding the
+///    FCFID* conversion;
+///  - int_to_fp(fp_to_int(x)) with an i64 intermediate becomes
+///    FCFID*(FCTID*Z(x)), avoiding a store/load pair.
+///
+/// \returns the replacement value, or an empty SDValue when no pattern
+/// applies (including soft-float, no 64-bit support, and ppc_fp128 results).
+SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::SINT_TO_FP ||
+          N->getOpcode() == ISD::UINT_TO_FP) &&
+         "Need an int -> FP conversion node here");
+
+  if (useSoftFloat() || !Subtarget.has64BitSupport())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Op(N, 0);
+
+  // Don't handle ppc_fp128 here. This must be checked before the sub-word
+  // load path below, which only knows how to produce f32 or f64 and would
+  // otherwise replace N with a value of the wrong type.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+
+  SDValue FirstOperand(Op.getOperand(0));
+  bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
+    (FirstOperand.getValueType() == MVT::i8 ||
+     FirstOperand.getValueType() == MVT::i16);
+  if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
+    bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
+    bool DstDouble = Op.getValueType() == MVT::f64;
+    unsigned ConvOp = Signed ?
+      (DstDouble ? PPCISD::FCFID  : PPCISD::FCFIDS) :
+      (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
+    // Width of the loaded integer in bytes: 1 for i8, 2 for i16.
+    SDValue WidthConst =
+      DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
+                            dl, false);
+    LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
+    SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
+    SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
+                                         DAG.getVTList(MVT::f64, MVT::Other),
+                                         Ops, MVT::i8, LDN->getMemOperand());
+
+    // For signed conversion, we need to sign-extend the value in the VSR
+    if (Signed) {
+      SDValue ExtOps[] = { Ld, WidthConst };
+      SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
+      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext);
+    } else
+      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
+  }
+
+  // Don't handle i1 conversions.
+  if (Op.getOperand(0).getValueType() == MVT::i1)
+    return SDValue();
+
+  // For i32 intermediate values, unfortunately, the conversion functions
+  // leave the upper 32 bits of the value undefined. Within the set of
+  // scalar instructions, we have no method for zero- or sign-extending the
+  // value. Thus, we cannot handle i32 intermediate values here.
+  if (Op.getOperand(0).getValueType() == MVT::i32)
+    return SDValue();
+
+  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+         "UINT_TO_FP is supported only with FPCVT");
+
+  // If we have FCFIDS, then use it when converting to single-precision.
+  // Otherwise, convert to double-precision and then round.
+  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
+                                                            : PPCISD::FCFIDS)
+                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
+                                                            : PPCISD::FCFID);
+  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                  ? MVT::f32
+                  : MVT::f64;
+
+  // If we're converting from a float, to an int, and back to a float again,
+  // then we don't need the store/load pair at all.
+  if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
+       Subtarget.hasFPCVT()) ||
+      (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
+    SDValue Src = Op.getOperand(0).getOperand(0);
+    if (Src.getValueType() == MVT::f32) {
+      // The FCTID* nodes below are built on f64, so widen an f32 source.
+      Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+      DCI.AddToWorklist(Src.getNode());
+    } else if (Src.getValueType() != MVT::f64) {
+      // Make sure that we don't pick up a ppc_fp128 source value.
+      return SDValue();
+    }
+
+    unsigned FCTOp =
+      Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ;
+
+    SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
+    SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);
+
+    // Without FPCVT the conversion above produced an f64; round it down when
+    // the caller asked for f32.
+    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
+      FP = DAG.getNode(ISD::FP_ROUND, dl,
+                       MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
+      DCI.AddToWorklist(FP.getNode());
+    }
+
+    return FP;
+  }
+
+  return SDValue();
+}
+
+// expandVSXLoadForLE - Rewrite a VSX vector load (an ISD::LOAD, or an
+// INTRINSIC_W_CHAIN coming from a builtin) as an LXVD2X followed by an
+// XXSWAPD, bitcasting back to the requested vector type when it is not
+// v2f64. Returns an empty SDValue for an ISD::LOAD whose memory operand
+// covers fewer than 16 bytes.
+SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue InChain;
+  SDValue Addr;
+  MachineMemOperand *MemOp;
+
+  const unsigned Opc = N->getOpcode();
+  if (Opc == ISD::LOAD) {
+    LoadSDNode *PlainLoad = cast<LoadSDNode>(N);
+    InChain = PlainLoad->getChain();
+    Addr = PlainLoad->getBasePtr();
+    MemOp = PlainLoad->getMemOperand();
+    // If the MMO suggests this isn't a load of a full vector, leave
+    // things alone. For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MemOp->getSize() < 16)
+      return SDValue();
+  } else if (Opc == ISD::INTRINSIC_W_CHAIN) {
+    MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(N);
+    InChain = MemIntr->getChain();
+    // As in the store expansion, getBasePtr() on the intrinsic node does not
+    // give the address we want; it is operand 2.
+    Addr = MemIntr->getOperand(2);
+    MemOp = MemIntr->getMemOperand();
+  } else {
+    llvm_unreachable("Unexpected opcode for little endian VSX load");
+  }
+
+  MVT ResultTy = N->getValueType(0).getSimpleVT();
+
+  // The load itself is always performed as v2f64.
+  SDValue LoadOperands[] = { InChain, Addr };
+  SDValue RawLoad =
+      DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
+                              DAG.getVTList(MVT::v2f64, MVT::Other),
+                              LoadOperands, MVT::v2f64, MemOp);
+  DCI.AddToWorklist(RawLoad.getNode());
+
+  // Swap the doublewords, threading the load's output chain through the swap.
+  SDValue Swapped =
+      DAG.getNode(PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other),
+                  RawLoad.getValue(1), RawLoad);
+  DCI.AddToWorklist(Swapped.getNode());
+
+  if (ResultTy == MVT::v2f64)
+    return Swapped;
+
+  // Add a bitcast if the resulting load type doesn't match v2f64.
+  SDValue Cast = DAG.getNode(ISD::BITCAST, dl, ResultTy, Swapped);
+  DCI.AddToWorklist(Cast.getNode());
+  // Package {bitcast value, swap's chain} to match Load's shape.
+  return DAG.getNode(ISD::MERGE_VALUES, dl,
+                     DAG.getVTList(ResultTy, MVT::Other),
+                     Cast, Swapped.getValue(1));
+}
+
+// expandVSXStoreForLE - Rewrite a VSX vector store (an ISD::STORE, or an
+// INTRINSIC_VOID coming from a builtin) as an XXSWAPD of the stored value
+// followed by an STXVD2X, bitcasting the value to v2f64 first when needed.
+// Returns an empty SDValue for an ISD::STORE whose memory operand covers
+// fewer than 16 bytes.
+SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue InChain;
+  SDValue Addr;
+  unsigned ValueIdx;
+  MachineMemOperand *MemOp;
+
+  const unsigned Opc = N->getOpcode();
+  if (Opc == ISD::STORE) {
+    StoreSDNode *PlainStore = cast<StoreSDNode>(N);
+    InChain = PlainStore->getChain();
+    Addr = PlainStore->getBasePtr();
+    MemOp = PlainStore->getMemOperand();
+    ValueIdx = 1;
+    // If the MMO suggests this isn't a store of a full vector, leave
+    // things alone. For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MemOp->getSize() < 16)
+      return SDValue();
+  } else if (Opc == ISD::INTRINSIC_VOID) {
+    MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(N);
+    InChain = MemIntr->getChain();
+    // Intrin->getBasePtr() oddly does not get what we want.
+    Addr = MemIntr->getOperand(3);
+    MemOp = MemIntr->getMemOperand();
+    ValueIdx = 2;
+  } else {
+    llvm_unreachable("Unexpected opcode for little endian VSX store");
+  }
+
+  SDValue Value = N->getOperand(ValueIdx);
+  MVT ValueTy = Value.getValueType().getSimpleVT();
+
+  // All stores are done as v2f64 and possible bit cast.
+  if (ValueTy != MVT::v2f64) {
+    Value = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Value);
+    DCI.AddToWorklist(Value.getNode());
+  }
+
+  // Swap the doublewords; the swap carries the incoming chain.
+  SDValue Swapped =
+      DAG.getNode(PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other),
+                  InChain, Value);
+  DCI.AddToWorklist(Swapped.getNode());
+
+  // The store consumes the swap's output chain and keeps the original value
+  // type as its memory type.
+  SDValue StoreOperands[] = { Swapped.getValue(1), Swapped, Addr };
+  SDValue NewStore =
+      DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl, DAG.getVTList(MVT::Other),
+                              StoreOperands, ValueTy, MemOp);
+  DCI.AddToWorklist(NewStore.getNode());
+  return NewStore;
+}
+
+SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ switch (N->getOpcode()) {
+ default: break;
+ case PPCISD::SHL:
+ if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
+ return N->getOperand(0);
+ break;
+ case PPCISD::SRL:
+ if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
+ return N->getOperand(0);
+ break;
+ case PPCISD::SRA:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ if (C->isNullValue() || // 0 >>s V -> 0.
+ C->isAllOnesValue()) // -1 >>s V -> -1.
+ return N->getOperand(0);
+ }
+ break;
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ return DAGCombineExtBoolTrunc(N, DCI);
+ case ISD::TRUNCATE:
+ case ISD::SETCC:
+ case ISD::SELECT_CC:
+ return DAGCombineTruncBoolExt(N, DCI);
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ return combineFPToIntToFP(N, DCI);
+ case ISD::STORE: {
+ EVT Op1VT = N->getOperand(1).getValueType();
+ bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32) ||
+ (Subtarget.hasP9Vector() && (Op1VT == MVT::i8 || Op1VT == MVT::i16));
+
+ // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
+ if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() &&
+ N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
+ ValidTypeForStoreFltAsInt &&
+ N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
+ SDValue Val = N->getOperand(1).getOperand(0);
+ if (Val.getValueType() == MVT::f32) {
+ Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+ }
+ Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+
+ if (Op1VT == MVT::i32) {
+ SDValue Ops[] = {
+ N->getOperand(0), Val, N->getOperand(2),
+ DAG.getValueType(N->getOperand(1).getValueType())
+ };
+
+ Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ cast<StoreSDNode>(N)->getMemoryVT(),
+ cast<StoreSDNode>(N)->getMemOperand());
+ } else {
+ unsigned WidthInBytes =
+ N->getOperand(1).getValueType() == MVT::i8 ? 1 : 2;
+ SDValue WidthConst = DAG.getIntPtrConstant(WidthInBytes, dl, false);
+
+ SDValue Ops[] = {
+ N->getOperand(0), Val, N->getOperand(2), WidthConst,
+ DAG.getValueType(N->getOperand(1).getValueType())
+ };
+ Val = DAG.getMemIntrinsicNode(PPCISD::STXSIX, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ cast<StoreSDNode>(N)->getMemoryVT(),
+ cast<StoreSDNode>(N)->getMemOperand());
+ }
+
+ DCI.AddToWorklist(Val.getNode());
+ return Val;
+ }
+
+ // Turn STORE (BSWAP) -> sthbrx/stwbrx.
+ if (cast<StoreSDNode>(N)->isUnindexed() &&
+ N->getOperand(1).getOpcode() == ISD::BSWAP &&
+ N->getOperand(1).getNode()->hasOneUse() &&
+ (N->getOperand(1).getValueType() == MVT::i32 ||
+ N->getOperand(1).getValueType() == MVT::i16 ||
+ (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
+ N->getOperand(1).getValueType() == MVT::i64))) {
+ SDValue BSwapOp = N->getOperand(1).getOperand(0);
+ // Do an any-extend to 32-bits if this is a half-word input.
+ if (BSwapOp.getValueType() == MVT::i16)
+ BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
+
+ SDValue Ops[] = {
+ N->getOperand(0), BSwapOp, N->getOperand(2),
+ DAG.getValueType(N->getOperand(1).getValueType())
+ };
+ return
+ DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
+ Ops, cast<StoreSDNode>(N)->getMemoryVT(),
+ cast<StoreSDNode>(N)->getMemOperand());
+ }
+
+ // For little endian, VSX stores require generating xxswapd/lxvd2x.
+ // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
+ EVT VT = N->getOperand(1).getValueType();
+ if (VT.isSimple()) {
+ MVT StoreVT = VT.getSimpleVT();
+ if (Subtarget.needsSwapsForVSXMemOps() &&
+ (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
+ StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
+ return expandVSXStoreForLE(N, DCI);
+ }
+ break;
+ }
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT VT = LD->getValueType(0);
+
+ // For little endian, VSX loads require generating lxvd2x/xxswapd.
+ // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
+ if (VT.isSimple()) {
+ MVT LoadVT = VT.getSimpleVT();
+ if (Subtarget.needsSwapsForVSXMemOps() &&
+ (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
+ LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
+ return expandVSXLoadForLE(N, DCI);
+ }
+
+ // We sometimes end up with a 64-bit integer load, from which we extract
+ // two single-precision floating-point numbers. This happens with
+ // std::complex<float>, and other similar structures, because of the way we
+ // canonicalize structure copies. However, if we lack direct moves,
+ // then the final bitcasts from the extracted integer values to the
+ // floating-point numbers turn into store/load pairs. Even with direct moves,
+ // just loading the two floating-point numbers is likely better.
+ auto ReplaceTwoFloatLoad = [&]() {
+ if (VT != MVT::i64)
+ return false;
+
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ LD->isVolatile())
+ return false;
+
+ // We're looking for a sequence like this:
+ // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
+ // t16: i64 = srl t13, Constant:i32<32>
+ // t17: i32 = truncate t16
+ // t18: f32 = bitcast t17
+ // t19: i32 = truncate t13
+ // t20: f32 = bitcast t19
+
+ if (!LD->hasNUsesOfValue(2, 0))
+ return false;
+
+ auto UI = LD->use_begin();
+ while (UI.getUse().getResNo() != 0) ++UI;
+ SDNode *Trunc = *UI++;
+ while (UI.getUse().getResNo() != 0) ++UI;
+ SDNode *RightShift = *UI;
+ if (Trunc->getOpcode() != ISD::TRUNCATE)
+ std::swap(Trunc, RightShift);
+
+ if (Trunc->getOpcode() != ISD::TRUNCATE ||
+ Trunc->getValueType(0) != MVT::i32 ||
+ !Trunc->hasOneUse())
+ return false;
+ if (RightShift->getOpcode() != ISD::SRL ||
+ !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
+ RightShift->getConstantOperandVal(1) != 32 ||
+ !RightShift->hasOneUse())
+ return false;
+
+ SDNode *Trunc2 = *RightShift->use_begin();
+ if (Trunc2->getOpcode() != ISD::TRUNCATE ||
+ Trunc2->getValueType(0) != MVT::i32 ||
+ !Trunc2->hasOneUse())
+ return false;
+
+ SDNode *Bitcast = *Trunc->use_begin();
+ SDNode *Bitcast2 = *Trunc2->use_begin();
+
+ if (Bitcast->getOpcode() != ISD::BITCAST ||
+ Bitcast->getValueType(0) != MVT::f32)
+ return false;
+ if (Bitcast2->getOpcode() != ISD::BITCAST ||
+ Bitcast2->getValueType(0) != MVT::f32)
+ return false;
+
+ if (Subtarget.isLittleEndian())
+ std::swap(Bitcast, Bitcast2);
+
+ // Bitcast has the second float (in memory-layout order) and Bitcast2
+ // has the first one.
+
+ SDValue BasePtr = LD->getBasePtr();
+ if (LD->isIndexed()) {
+ assert(LD->getAddressingMode() == ISD::PRE_INC &&
+ "Non-pre-inc AM on PPC?");
+ BasePtr =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ LD->getOffset());
+ }
+
+ auto MMOFlags =
+ LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
+ SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
+ LD->getPointerInfo(), LD->getAlignment(),
+ MMOFlags, LD->getAAInfo());
+ SDValue AddPtr =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+ BasePtr, DAG.getIntPtrConstant(4, dl));
+ SDValue FloatLoad2 = DAG.getLoad(
+ MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
+ LD->getPointerInfo().getWithOffset(4),
+ MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo());
+
+ if (LD->isIndexed()) {
+ // Note that DAGCombine should re-form any pre-increment load(s) from
+ // what is produced here if that makes sense.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
+ }
+
+ DCI.CombineTo(Bitcast2, FloatLoad);
+ DCI.CombineTo(Bitcast, FloatLoad2);
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
+ SDValue(FloatLoad2.getNode(), 1));
+ return true;
+ };
+
+ if (ReplaceTwoFloatLoad())
+ return SDValue(N, 0);
+
+ EVT MemVT = LD->getMemoryVT();
+ Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
+ Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
+ unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
+ if (LD->isUnindexed() && VT.isVector() &&
+ ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
+ // P8 and later hardware should just use LOAD.
+ !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 ||
+ VT == MVT::v4i32 || VT == MVT::v4f32)) ||
+ (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
+ LD->getAlignment() >= ScalarABIAlignment)) &&
+ LD->getAlignment() < ABIAlignment) {
+ // This is a type-legal unaligned Altivec or QPX load.
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ bool isLittleEndian = Subtarget.isLittleEndian();
+
+ // This implements the loading of unaligned vectors as described in
+ // the venerable Apple Velocity Engine overview. Specifically:
+ // https://developer.apple.com/hardwaredrivers/ve/alignment.html
+ // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
+ //
+ // The general idea is to expand a sequence of one or more unaligned
+ // loads into an alignment-based permutation-control instruction (lvsl
+ // or lvsr), a series of regular vector loads (which always truncate
+ // their input address to an aligned address), and a series of
+ // permutations. The results of these permutations are the requested
+ // loaded values. The trick is that the last "extra" load is not taken
+ // from the address you might suspect (sizeof(vector) bytes after the
+ // last requested load), but rather sizeof(vector) - 1 bytes after the
+ // last requested vector. The point of this is to avoid a page fault if
+ // the base address happened to be aligned. This works because if the
+ // base address is aligned, then adding less than a full vector length
+ // will cause the last vector in the sequence to be (re)loaded.
+ // Otherwise, the next vector will be fetched as you might suspect was
+ // necessary.
+
+ // We might be able to reuse the permutation generation from
+ // a different base address offset from this one by an aligned amount.
+ // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
+ // optimization later.
+ Intrinsic::ID Intr, IntrLD, IntrPerm;
+ MVT PermCntlTy, PermTy, LDTy;
+ if (Subtarget.hasAltivec()) {
+ Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
+ Intrinsic::ppc_altivec_lvsl;
+ IntrLD = Intrinsic::ppc_altivec_lvx;
+ IntrPerm = Intrinsic::ppc_altivec_vperm;
+ PermCntlTy = MVT::v16i8;
+ PermTy = MVT::v4i32;
+ LDTy = MVT::v4i32;
+ } else {
+ Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
+ Intrinsic::ppc_qpx_qvlpcls;
+ IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
+ Intrinsic::ppc_qpx_qvlfs;
+ IntrPerm = Intrinsic::ppc_qpx_qvfperm;
+ PermCntlTy = MVT::v4f64;
+ PermTy = MVT::v4f64;
+ LDTy = MemVT.getSimpleVT();
+ }
+
+ SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
+
+ // Create the new MMO for the new base load. It is like the original MMO,
+ // but represents an area in memory almost twice the vector size centered
+ // on the original address. If the address is unaligned, we might start
+ // reading up to (sizeof(vector)-1) bytes below the address of the
+ // original unaligned load.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *BaseMMO =
+ MF.getMachineMemOperand(LD->getMemOperand(),
+ -(long)MemVT.getStoreSize()+1,
+ 2*MemVT.getStoreSize()-1);
+
+ // Create the new base load.
+ SDValue LDXIntID =
+ DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
+ SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
+ SDValue BaseLoad =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+ DAG.getVTList(PermTy, MVT::Other),
+ BaseLoadOps, LDTy, BaseMMO);
+
+ // Note that the value of IncOffset (which is provided to the next
+ // load's pointer info offset value, and thus used to calculate the
+ // alignment), and the value of IncValue (which is actually used to
+ // increment the pointer value) are different! This is because we
+ // require the next load to appear to be aligned, even though it
+ // is actually offset from the base pointer by a lesser amount.
+ int IncOffset = VT.getSizeInBits() / 8;
+ int IncValue = IncOffset;
+
+ // Walk (both up and down) the chain looking for another load at the real
+ // (aligned) offset (the alignment of the other load does not matter in
+ // this case). If found, then do not use the offset reduction trick, as
+ // that will prevent the loads from being later combined (as they would
+ // otherwise be duplicates).
+ if (!findConsecutiveLoad(LD, DAG))
+ --IncValue;
+
+ SDValue Increment =
+ DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+
+ MachineMemOperand *ExtraMMO =
+ MF.getMachineMemOperand(LD->getMemOperand(),
+ 1, 2*MemVT.getStoreSize()-1);
+ SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
+ SDValue ExtraLoad =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+ DAG.getVTList(PermTy, MVT::Other),
+ ExtraLoadOps, LDTy, ExtraMMO);
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ BaseLoad.getValue(1), ExtraLoad.getValue(1));
+
+ // Because vperm has a big-endian bias, we must reverse the order
+ // of the input vectors and complement the permute control vector
+ // when generating little endian code. We have already handled the
+ // latter by using lvsr instead of lvsl, so just reverse BaseLoad
+ // and ExtraLoad here.
+ SDValue Perm;
+ if (isLittleEndian)
+ Perm = BuildIntrinsicOp(IntrPerm,
+ ExtraLoad, BaseLoad, PermCntl, DAG, dl);
+ else
+ Perm = BuildIntrinsicOp(IntrPerm,
+ BaseLoad, ExtraLoad, PermCntl, DAG, dl);
+
+ if (VT != PermTy)
+ Perm = Subtarget.hasAltivec() ?
+ DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
+ DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
+ DAG.getTargetConstant(1, dl, MVT::i64));
+ // second argument is 1 because this rounding
+ // is always exact.
+
+ // The output of the permutation is our loaded result, the TokenFactor is
+ // our new chain.
+ DCI.CombineTo(N, Perm, TF);
+ return SDValue(N, 0);
+ }
+ }
+ break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ bool isLittleEndian = Subtarget.isLittleEndian();
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
+ : Intrinsic::ppc_altivec_lvsl);
+ if ((IID == Intr ||
+ IID == Intrinsic::ppc_qpx_qvlpcld ||
+ IID == Intrinsic::ppc_qpx_qvlpcls) &&
+ N->getOperand(1)->getOpcode() == ISD::ADD) {
+ SDValue Add = N->getOperand(1);
+
+ int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
+ 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;
+
+ if (DAG.MaskedValueIsZero(Add->getOperand(1),
+ APInt::getAllOnesValue(Bits /* alignment */)
+ .zext(Add.getScalarValueSizeInBits()))) {
+ SDNode *BasePtr = Add->getOperand(0).getNode();
+ for (SDNode::use_iterator UI = BasePtr->use_begin(),
+ UE = BasePtr->use_end();
+ UI != UE; ++UI) {
+ if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
+ // We've found another LVSL/LVSR, and this address is an aligned
+ // multiple of that one. The results will be the same, so use the
+ // one we've just found instead.
+
+ return SDValue(*UI, 0);
+ }
+ }
+ }
+
+ if (isa<ConstantSDNode>(Add->getOperand(1))) {
+ SDNode *BasePtr = Add->getOperand(0).getNode();
+ for (SDNode::use_iterator UI = BasePtr->use_begin(),
+ UE = BasePtr->use_end(); UI != UE; ++UI) {
+ if (UI->getOpcode() == ISD::ADD &&
+ isa<ConstantSDNode>(UI->getOperand(1)) &&
+ (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
+ cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
+ (1ULL << Bits) == 0) {
+ SDNode *OtherAdd = *UI;
+ for (SDNode::use_iterator VI = OtherAdd->use_begin(),
+ VE = OtherAdd->use_end(); VI != VE; ++VI) {
+ if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
+ return SDValue(*VI, 0);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ break;
+ case ISD::INTRINSIC_W_CHAIN: {
+ // For little endian, VSX loads require generating lxvd2x/xxswapd.
+ // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
+ if (Subtarget.needsSwapsForVSXMemOps()) {
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ default:
+ break;
+ case Intrinsic::ppc_vsx_lxvw4x:
+ case Intrinsic::ppc_vsx_lxvd2x:
+ return expandVSXLoadForLE(N, DCI);
+ }
+ }
+ break;
+ }
+ case ISD::INTRINSIC_VOID: {
+ // For little endian, VSX stores require generating xxswapd/stxvd2x.
+ // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
+ if (Subtarget.needsSwapsForVSXMemOps()) {
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ default:
+ break;
+ case Intrinsic::ppc_vsx_stxvw4x:
+ case Intrinsic::ppc_vsx_stxvd2x:
+ return expandVSXStoreForLE(N, DCI);
+ }
+ }
+ break;
+ }
+ case ISD::BSWAP:
+ // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
+ if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
+ N->getOperand(0).hasOneUse() &&
+ (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
+ (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
+ N->getValueType(0) == MVT::i64))) {
+ SDValue Load = N->getOperand(0);
+ LoadSDNode *LD = cast<LoadSDNode>(Load);
+ // Create the byte-swapping load.
+ SDValue Ops[] = {
+ LD->getChain(), // Chain
+ LD->getBasePtr(), // Ptr
+ DAG.getValueType(N->getValueType(0)) // VT
+ };
+ SDValue BSLoad =
+ DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
+ DAG.getVTList(N->getValueType(0) == MVT::i64 ?
+ MVT::i64 : MVT::i32, MVT::Other),
+ Ops, LD->getMemoryVT(), LD->getMemOperand());
+
+ // If this is an i16 load, insert the truncate.
+ SDValue ResVal = BSLoad;
+ if (N->getValueType(0) == MVT::i16)
+ ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);
+
+ // First, combine the bswap away. This makes the value produced by the
+ // load dead.
+ DCI.CombineTo(N, ResVal);
+
+ // Next, combine the load away, we give it a bogus result value but a real
+ // chain result. The result value is dead because the bswap is dead.
+ DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
+
+ // Return N so it doesn't get rechecked!
+ return SDValue(N, 0);
+ }
+
+ break;
+ case PPCISD::VCMP: {
+ // If a VCMPo node already exists with exactly the same operands as this
+ // node, use its result instead of this node (VCMPo computes both a CR6 and
+ // a normal output).
+ //
+ if (!N->getOperand(0).hasOneUse() &&
+ !N->getOperand(1).hasOneUse() &&
+ !N->getOperand(2).hasOneUse()) {
+
+ // Scan all of the users of the LHS, looking for VCMPo's that match.
+ SDNode *VCMPoNode = nullptr;
+
+ SDNode *LHSN = N->getOperand(0).getNode();
+ for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
+ UI != E; ++UI)
+ if (UI->getOpcode() == PPCISD::VCMPo &&
+ UI->getOperand(1) == N->getOperand(1) &&
+ UI->getOperand(2) == N->getOperand(2) &&
+ UI->getOperand(0) == N->getOperand(0)) {
+ VCMPoNode = *UI;
+ break;
+ }
+
+ // If there is no VCMPo node, or if its flag value has no uses, don't
+ // transform this.
+ if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
+ break;
+
+ // Look at the (necessarily single) use of the flag value. If it has a
+ // chain, this transformation is more complex. Note that multiple things
+ // could use the value result, which we should ignore.
+ SDNode *FlagUser = nullptr;
+ for (SDNode::use_iterator UI = VCMPoNode->use_begin();
+ FlagUser == nullptr; ++UI) {
+ assert(UI != VCMPoNode->use_end() && "Didn't find user!");
+ SDNode *User = *UI;
+ for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+ if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
+ FlagUser = User;
+ break;
+ }
+ }
+ }
+
+ // If the user is a MFOCRF instruction, we know this is safe.
+ // Otherwise we give up for right now.
+ if (FlagUser->getOpcode() == PPCISD::MFOCRF)
+ return SDValue(VCMPoNode, 0);
+ }
+ break;
+ }
+ case ISD::BRCOND: {
+ SDValue Cond = N->getOperand(1);
+ SDValue Target = N->getOperand(2);
+
+ if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+ cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
+ Intrinsic::ppc_is_decremented_ctr_nonzero) {
+
+ // We now need to make the intrinsic dead (it cannot be instruction
+ // selected).
+ DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
+ assert(Cond.getNode()->hasOneUse() &&
+ "Counter decrement has more than one use");
+
+ return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
+ N->getOperand(0), Target);
+ }
+ }
+ break;
+ case ISD::BR_CC: {
+ // If this is a branch on an altivec predicate comparison, lower this so
+ // that we don't have to do a MFOCRF: instead, branch directly on CR6. This
+ // lowering is done pre-legalize, because the legalizer lowers the predicate
+ // compare down to code that is difficult to reassemble.
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
+
+ // Sometimes the promoted value of the intrinsic is ANDed by some non-zero
+ // value. If so, pass-through the AND to get to the intrinsic.
+ if (LHS.getOpcode() == ISD::AND &&
+ LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+ cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
+ Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ !isNullConstant(LHS.getOperand(1)))
+ LHS = LHS.getOperand(0);
+
+ if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
+ Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ isa<ConstantSDNode>(RHS)) {
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ "Counter decrement comparison is not EQ or NE");
+
+ unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+ bool isBDNZ = (CC == ISD::SETEQ && Val) ||
+ (CC == ISD::SETNE && !Val);
+
+ // We now need to make the intrinsic dead (it cannot be instruction
+ // selected).
+ DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
+ assert(LHS.getNode()->hasOneUse() &&
+ "Counter decrement has more than one use");
+
+ return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
+ N->getOperand(0), N->getOperand(4));
+ }
+
+ int CompareOpc;
+ bool isDot;
+
+ if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
+ assert(isDot && "Can't compare against a vector result!");
+
+ // If this is a comparison against something other than 0/1, then we know
+ // that the condition is never/always true.
+ unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+ if (Val != 0 && Val != 1) {
+ if (CC == ISD::SETEQ) // Cond never true, remove branch.
+ return N->getOperand(0);
+ // Always !=, turn it into an unconditional branch.
+ return DAG.getNode(ISD::BR, dl, MVT::Other,
+ N->getOperand(0), N->getOperand(4));
+ }
+
+ bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
+
+ // Create the PPCISD altivec 'dot' comparison node.
+ SDValue Ops[] = {
+ LHS.getOperand(2), // LHS of compare
+ LHS.getOperand(3), // RHS of compare
+ DAG.getConstant(CompareOpc, dl, MVT::i32)
+ };
+ EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
+ SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
+
+ // Unpack the result based on how the target uses it.
+ PPC::Predicate CompOpc;
+ switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
+ default: // Can't happen, don't crash on invalid number though.
+ case 0: // Branch on the value of the EQ bit of CR6.
+ CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
+ break;
+ case 1: // Branch on the inverted value of the EQ bit of CR6.
+ CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
+ break;
+ case 2: // Branch on the value of the LT bit of CR6.
+ CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
+ break;
+ case 3: // Branch on the inverted value of the LT bit of CR6.
+ CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
+ break;
+ }
+
+ return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
+ DAG.getConstant(CompOpc, dl, MVT::i32),
+ DAG.getRegister(PPC::CR6, MVT::i32),
+ N->getOperand(4), CompNode.getValue(1));
+ }
+ break;
+ }
+ case ISD::BUILD_VECTOR:
+ return DAGCombineBuildVector(N, DCI);
+ }
+
+ return SDValue();
+}
+
+/// Build the node sequence for a signed division of N's first operand by a
+/// constant that is either a power of two or the negation of one.
+///
+/// Returns an empty SDValue when the result type is neither i32 nor i64, when
+/// it is i64 on a subtarget that is not PPC64, or when Divisor does not have
+/// the required form. The SRA_ADDZE node and, when emitted, the negating SUB
+/// node are appended to Created if it is non-null.
+SDValue
+PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                 SelectionDAG &DAG,
+                                 std::vector<SDNode *> *Created) const {
+  // fold (sdiv X, pow2)
+  const EVT VT = N->getValueType(0);
+  const bool Is64Bit = VT == MVT::i64;
+  if (Is64Bit ? !Subtarget.isPPC64() : VT != MVT::i32)
+    return SDValue();
+
+  const bool IsNegPow2 = (-Divisor).isPowerOf2();
+  if (!IsNegPow2 && !Divisor.isPowerOf2())
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Remember a freshly built node for the caller, if it asked for that.
+  auto Record = [Created](const SDValue &V) {
+    if (Created)
+      Created->push_back(V.getNode());
+  };
+
+  // The shift amount is log2 of the divisor's magnitude.
+  const APInt Magnitude = IsNegPow2 ? -Divisor : Divisor;
+  const unsigned ShiftBits = Magnitude.countTrailingZeros();
+  SDValue ShiftAmt = DAG.getConstant(ShiftBits, DL, VT);
+
+  SDValue Result =
+      DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N->getOperand(0), ShiftAmt);
+  Record(Result);
+
+  // A negated power of two additionally needs the result negated (0 - x).
+  if (IsNegPow2) {
+    SDValue Zero = DAG.getConstant(0, DL, VT);
+    Result = DAG.getNode(ISD::SUB, DL, VT, Zero, Result);
+    Record(Result);
+  }
+
+  return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// Report the bits of Op that are known to be zero or one for the
+/// target-specific nodes handled here. Both masks are first reset to
+/// "nothing known" at the bit width of the incoming KnownZero; only
+/// KnownZero is ever refined afterwards. DAG and Depth are not consulted.
+void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
+  const unsigned BitWidth = KnownZero.getBitWidth();
+  KnownOne = APInt(BitWidth, 0);
+  KnownZero = KnownOne;
+
+  const unsigned Opcode = Op.getOpcode();
+
+  if (Opcode == PPCISD::LBRX) {
+    // lhbrx is known to have the top bits cleared out.
+    const bool IsHalfword =
+        cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16;
+    if (IsHalfword)
+      KnownZero = 0xFFFF0000;
+    return;
+  }
+
+  if (Opcode != ISD::INTRINSIC_WO_CHAIN)
+    return;
+
+  // The AltiVec predicate-compare intrinsics listed below only ever set the
+  // low bit of their result.
+  switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+  case Intrinsic::ppc_altivec_vcmpbfp_p:
+  case Intrinsic::ppc_altivec_vcmpeqfp_p:
+  case Intrinsic::ppc_altivec_vcmpequb_p:
+  case Intrinsic::ppc_altivec_vcmpequh_p:
+  case Intrinsic::ppc_altivec_vcmpequw_p:
+  case Intrinsic::ppc_altivec_vcmpequd_p:
+  case Intrinsic::ppc_altivec_vcmpgefp_p:
+  case Intrinsic::ppc_altivec_vcmpgtfp_p:
+  case Intrinsic::ppc_altivec_vcmpgtsb_p:
+  case Intrinsic::ppc_altivec_vcmpgtsh_p:
+  case Intrinsic::ppc_altivec_vcmpgtsw_p:
+  case Intrinsic::ppc_altivec_vcmpgtsd_p:
+  case Intrinsic::ppc_altivec_vcmpgtub_p:
+  case Intrinsic::ppc_altivec_vcmpgtuh_p:
+  case Intrinsic::ppc_altivec_vcmpgtuw_p:
+  case Intrinsic::ppc_altivec_vcmpgtud_p:
+    KnownZero = ~1U; // All bits but the low one are known to be zero.
+    break;
+  default:
+    break;
+  }
+}
+
+/// Return the preferred alignment for the loop ML.
+///
+/// For the CPU directives listed below, a loop whose instructions total more
+/// than 16 and at most 32 bytes gets the value 5 (presumably log2 of the
+/// 32-byte boundary the comment below refers to). Every other case, including
+/// a null ML, defers to TargetLowering::getPrefLoopAlignment.
+unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+  switch (Subtarget.getDarwinDirective()) {
+  default: break;
+  case PPC::DIR_970:
+  case PPC::DIR_PWR4:
+  case PPC::DIR_PWR5:
+  case PPC::DIR_PWR5X:
+  case PPC::DIR_PWR6:
+  case PPC::DIR_PWR6X:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8:
+  case PPC::DIR_PWR9: {
+    if (!ML)
+      break;
+
+    const PPCInstrInfo *TII = Subtarget.getInstrInfo();
+
+    // For small loops (between 5 and 8 instructions), align to a 32-byte
+    // boundary so that the entire loop fits in one instruction-cache line.
+    uint64_t LoopSize = 0;
+    // Stop scanning as soon as the loop is known to be too large: the size
+    // only grows, so nothing after that point can change the decision. The
+    // outer condition is needed because the inner 'break' leaves only the
+    // per-block loop.
+    for (auto I = ML->block_begin(), IE = ML->block_end();
+         I != IE && LoopSize <= 32; ++I)
+      for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
+        LoopSize += TII->getInstSizeInBytes(*J);
+        if (LoopSize > 32)
+          break;
+      }
+
+    if (LoopSize > 16 && LoopSize <= 32)
+      return 5;
+
+    break;
+  }
+  }
+
+  return TargetLowering::getPrefLoopAlignment(ML);
+}
+
+/// getConstraintType - Classify an inline-asm constraint string for this
+/// target. Constraints not recognized here are passed on to
+/// TargetLowering::getConstraintType.
+PPCTargetLowering::ConstraintType
+PPCTargetLowering::getConstraintType(StringRef Constraint) const {
+  // Multi-character constraints first.
+  if (Constraint.size() != 1) {
+    const bool IsCRBit = Constraint == "wc"; // individual CR bits.
+    const bool IsVSX = Constraint == "wa" || Constraint == "wd" ||
+                       Constraint == "wf" || Constraint == "ws";
+    if (IsCRBit || IsVSX)
+      return C_RegisterClass;
+    return TargetLowering::getConstraintType(Constraint);
+  }
+
+  switch (Constraint[0]) {
+  case 'b':
+  case 'r':
+  case 'f':
+  case 'd':
+  case 'v':
+  case 'y':
+    return C_RegisterClass;
+  case 'Z':
+    // FIXME: 'Z' is a memory constraint, but more precisely it denotes an
+    // r+r address (used together with the 'y' modifier in the replacement
+    // string). For now the asm printer forces the base register to r0
+    // (which is interpreted as zero) and the whole address is formed in the
+    // second register. This is suboptimal.
+    return C_Memory;
+  default:
+    break;
+  }
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Weigh how well a single constraint matches the operand described by info.
+/// info must already carry the operand and the currently selected
+/// alternative constraint. Constraints not handled here are weighed by
+/// TargetLowering::getSingleConstraintMatchWeight.
+TargetLowering::ConstraintWeight
+PPCTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  // Without a value there is nothing to match against; accept the
+  // constraint at the lowest weight.
+  Value *Operand = info.CallOperandVal;
+  if (!Operand)
+    return CW_Default;
+
+  Type *OperandTy = Operand->getType();
+  const StringRef Name(constraint);
+
+  // Two-letter constraints: "wc" is an individual CR bit; "wa", "wd" and
+  // "wf" take vectors; "ws" takes a double.
+  const bool MatchesCRBit = Name == "wc" && OperandTy->isIntegerTy(1);
+  const bool MatchesVSXVector =
+      (Name == "wa" || Name == "wd" || Name == "wf") &&
+      OperandTy->isVectorTy();
+  const bool MatchesVSXScalar = Name == "ws" && OperandTy->isDoubleTy();
+  if (MatchesCRBit || MatchesVSXVector || MatchesVSXScalar)
+    return CW_Register;
+
+  // Everything else is decided by the first letter. A type mismatch on a
+  // known letter yields CW_Invalid.
+  switch (constraint[0]) {
+  case 'b':
+    return OperandTy->isIntegerTy() ? CW_Register : CW_Invalid;
+  case 'f':
+    return OperandTy->isFloatTy() ? CW_Register : CW_Invalid;
+  case 'd':
+    return OperandTy->isDoubleTy() ? CW_Register : CW_Invalid;
+  case 'v':
+    return OperandTy->isVectorTy() ? CW_Register : CW_Invalid;
+  case 'y':
+    return CW_Register;
+  case 'Z':
+    return CW_Memory;
+  default:
+    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+  }
+}
+
+/// Map an inline-asm register constraint to a (register, register class)
+/// pair for a value of type VT.
+///
+/// The constraints handled directly return register number 0 together with
+/// the chosen class. Anything not matched here is resolved by
+/// TargetLowering::getRegForInlineAsmConstraint, after which two PPC-specific
+/// adjustments are applied: a 32-bit GPR selected for an i64 value on PPC64
+/// is replaced by its 64-bit super-register, and "{cc}" is accepted as an
+/// alias for CR0.
+std::pair<unsigned, const TargetRegisterClass *>
+PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                StringRef Constraint,
+                                                MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC RS6000 Constraint Letters
+    switch (Constraint[0]) {
+    case 'b':   // R1-R31
+      if (VT == MVT::i64 && Subtarget.isPPC64())
+        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
+      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
+    case 'r':   // R0-R31
+      if (VT == MVT::i64 && Subtarget.isPPC64())
+        return std::make_pair(0U, &PPC::G8RCRegClass);
+      return std::make_pair(0U, &PPC::GPRCRegClass);
+    // 'd' and 'f' constraints are both defined to be "the floating point
+    // registers", where one is for 32-bit and the other for 64-bit. We don't
+    // really care overly much here so just give them all the same reg classes.
+    case 'd':
+    case 'f':
+      if (VT == MVT::f32 || VT == MVT::i32)
+        return std::make_pair(0U, &PPC::F4RCRegClass);
+      if (VT == MVT::f64 || VT == MVT::i64)
+        return std::make_pair(0U, &PPC::F8RCRegClass);
+      if (VT == MVT::v4f64 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QFRCRegClass);
+      if (VT == MVT::v4f32 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QSRCRegClass);
+      break;
+    case 'v':
+      if (VT == MVT::v4f64 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QFRCRegClass);
+      if (VT == MVT::v4f32 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QSRCRegClass);
+      if (Subtarget.hasAltivec())
+        return std::make_pair(0U, &PPC::VRRCRegClass);
+      // No vector register class is available. Do not fall through into
+      // 'y', which would hand out a condition register for a vector
+      // constraint; let the generic handling below deal with it instead.
+      break;
+    case 'y':   // crrc
+      return std::make_pair(0U, &PPC::CRRCRegClass);
+    }
+  } else if (Constraint == "wc" && Subtarget.useCRBits()) {
+    // An individual CR bit.
+    return std::make_pair(0U, &PPC::CRBITRCRegClass);
+  } else if ((Constraint == "wa" || Constraint == "wd" ||
+              Constraint == "wf") && Subtarget.hasVSX()) {
+    return std::make_pair(0U, &PPC::VSRCRegClass);
+  } else if (Constraint == "ws" && Subtarget.hasVSX()) {
+    if (VT == MVT::f32 && Subtarget.hasP8Vector())
+      return std::make_pair(0U, &PPC::VSSRCRegClass);
+    else
+      return std::make_pair(0U, &PPC::VSFRCRegClass);
+  }
+
+  std::pair<unsigned, const TargetRegisterClass *> R =
+      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
+  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
+  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
+  // register.
+  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
+  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
+  if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
+      PPC::GPRCRegClass.contains(R.first))
+    return std::make_pair(TRI->getMatchingSuperReg(R.first,
+                            PPC::sub_32, &PPC::G8RCRegClass),
+                          &PPC::G8RCRegClass);
+
+  // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
+  if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
+    R.first = PPC::CR0;
+    R.second = &PPC::CRRCRegClass;
+  }
+
+  return R;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+///
+/// The immediate constraint letters 'I' through 'P' are handled here: the
+/// operand must be a constant, and it is pushed as an i64 target constant
+/// only when its sign-extended value satisfies the letter's range test.
+/// Constraints longer than one character are ignored; any other letter is
+/// handed to TargetLowering::LowerAsmOperandForConstraint.
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Only support length 1 constraints.
+  if (Constraint.length() > 1) return;
+
+  char Letter = Constraint[0];
+  switch (Letter) {
+  default: break;
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+  case 'O':
+  case 'P': {
+    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
+    if (!CST) return; // Must be an immediate to match.
+    SDLoc dl(Op);
+    int64_t Value = CST->getSExtValue();
+    EVT TCVT = MVT::i64;  // All constants taken to be 64 bits so that negative
+                          // numbers are printed as such.
+
+    // Decide whether the value satisfies the letter; the constant node is
+    // built in one place below.
+    bool Matches = false;
+    switch (Letter) {
+    default: llvm_unreachable("Unknown constraint letter!");
+    case 'I':  // "I" is a signed 16-bit constant.
+      Matches = isInt<16>(Value);
+      break;
+    case 'J':  // "J" is a constant with only the high-order 16 bits nonzero.
+      Matches = isShiftedUInt<16, 16>(Value);
+      break;
+    case 'L':  // "L" is a signed 16-bit constant shifted left 16 bits.
+      Matches = isShiftedInt<16, 16>(Value);
+      break;
+    case 'K':  // "K" is a constant with only the low-order 16 bits nonzero.
+      Matches = isUInt<16>(Value);
+      break;
+    case 'M':  // "M" is a constant that is greater than 31.
+      Matches = Value > 31;
+      break;
+    case 'N':  // "N" is a positive constant that is an exact power of two.
+      Matches = Value > 0 && isPowerOf2_64(Value);
+      break;
+    case 'O':  // "O" is the constant zero.
+      Matches = Value == 0;
+      break;
+    case 'P':  // "P" is a constant whose negation is a signed 16-bit constant.
+      // Same set of values as isInt<16>(-Value), written as a range test so
+      // that the most negative int64_t is never negated (signed overflow).
+      Matches = Value >= -32767 && Value <= 32768;
+      break;
+    }
+
+    if (Matches)
+      Result = DAG.getTargetConstant(Value, dl, TCVT);
+    break;
+  }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  // Handle standard constraint letters.
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// isLegalAddressingMode - Decide whether the addressing mode described by AM
+// may be used by a load/store of type Ty on this target. DL and AS are not
+// consulted.
+bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                              const AddrMode &AM, Type *Ty,
+                                              unsigned AS) const {
+  const bool HasOffset = AM.BaseOffs != 0;
+
+  // PPC does not allow r+i addressing modes for vectors!
+  if (HasOffset && Ty->isVectorTy())
+    return false;
+
+  // PPC allows a sign-extended 16-bit immediate field.
+  // NOTE(review): the bounds below only reject offsets <= -65536 or
+  // >= 65535, which is wider than a signed 16-bit range - confirm intent.
+  const bool OffsetTooLow = AM.BaseOffs <= -(1LL << 16);
+  const bool OffsetTooHigh = AM.BaseOffs >= (1LL << 16)-1;
+  if (OffsetTooLow || OffsetTooHigh)
+    return false;
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // Only a scale that can be expressed as r+r is supported.
+  switch (AM.Scale) {
+  case 0:
+    // "r+i" or just "i", depending on HasBaseReg.
+    return true;
+  case 1:
+    // r+r or r+i is fine, but "r+r+i" is not allowed.
+    return !(AM.HasBaseReg && HasOffset);
+  case 2:
+    // 2*r is allowed as r+r; 2*r+r and 2*r+i are not.
+    return !AM.HasBaseReg && !HasOffset;
+  default:
+    // No other scales are supported.
+    return false;
+  }
+}
+
+/// Lower a return-address query. Operand 0 of Op is the constant frame
+/// depth. Depth 0 loads from the return-address frame index; a greater depth
+/// loads from the frame address produced by LowerFRAMEADDR plus the
+/// return-save offset. Returns an empty SDValue when
+/// verifyReturnAddressArgumentIsConstant reports a problem.
+SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getFrameInfo().setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  SDLoc dl(Op);
+  const unsigned Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  // Make sure the function does not optimize away the store of the RA to
+  // the stack.
+  MF.getInfo<PPCFunctionInfo>()->setLRStoreRequired();
+
+  auto PtrVT = getPointerTy(MF.getDataLayout());
+
+  // Work out where the return address lives, then load it.
+  SDValue Addr;
+  if (Depth == 0) {
+    // Just load the return address off the stack.
+    Addr = getReturnAddrFrameIndex(DAG);
+  } else {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    const MVT OffsetVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
+    SDValue Offset = DAG.getConstant(
+        Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, OffsetVT);
+    Addr = DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset);
+  }
+
+  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr,
+                     MachinePointerInfo());
+}
+
+/// Lower a frame-address query. Operand 0 of Op is the constant frame depth:
+/// the frame register is read once and then dereferenced Depth times. Marks
+/// the function's frame address as taken.
+SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getFrameInfo().setFrameAddressIsTaken(true);
+
+  SDLoc dl(Op);
+  const unsigned Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+  const bool Is64BitPtr = PtrVT == MVT::i64;
+
+  // Naked functions never have a frame pointer, and so we use r1. For all
+  // other functions, this decision must be delayed until during PEI.
+  const bool IsNaked = MF.getFunction()->hasFnAttribute(Attribute::Naked);
+  const unsigned FrameReg = IsNaked ? (Is64BitPtr ? PPC::X1 : PPC::R1)
+                                    : (Is64BitPtr ? PPC::FP8 : PPC::FP);
+
+  SDValue FrameAddr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
+
+  // Each level up is one more load through the current frame address.
+  for (unsigned Level = 0; Level != Depth; ++Level)
+    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
+                            FrameAddr, MachinePointerInfo());
+
+  return FrameAddr;
+}
+
+// Resolve a register name used by a register global variable. Only "r1",
+// "r2" and "r13" are recognized, and "r2"/"r13" are rejected on some
+// ABI/word-size combinations. An unsupported type or name is a fatal error.
+//
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                              SelectionDAG &DAG) const {
+  const bool isPPC64 = Subtarget.isPPC64();
+  const bool isDarwinABI = Subtarget.isDarwinABI();
+
+  // i32 is always acceptable; i64 only on PPC64.
+  const bool TypeIsValid = VT == MVT::i32 || (isPPC64 && VT == MVT::i64);
+  if (!TypeIsValid)
+    report_fatal_error("Invalid register global variable type");
+
+  const bool is64Bit = isPPC64 && VT == MVT::i64;
+  const StringRef Name(RegName);
+
+  unsigned Reg = 0;
+  if (Name == "r1") {
+    Reg = is64Bit ? PPC::X1 : PPC::R1;
+  } else if (Name == "r2") {
+    // r2 is only handed out when neither Darwin nor PPC64 is in effect.
+    if (!isDarwinABI && !isPPC64)
+      Reg = PPC::R2;
+  } else if (Name == "r13") {
+    // r13 is refused only for 32-bit Darwin.
+    if (isPPC64 || !isDarwinABI)
+      Reg = is64Bit ? PPC::X13 : PPC::R13;
+  }
+
+  if (!Reg)
+    report_fatal_error("Invalid register name global variable");
+  return Reg;
+}
+
+bool
+PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The PowerPC target isn't yet aware of offsets, so this hook answers false unconditionally; GA is not inspected.
+ return false;
+}
+
+bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ unsigned Intrinsic) const { // Fills Info for the PPC load/store intrinsics listed below; returns false (Info untouched) for any other intrinsic.
+
+ switch (Intrinsic) {
+ case Intrinsic::ppc_qpx_qvlfd: // Group 1: loads described with a widened byte window (see offset/size below)
+ case Intrinsic::ppc_qpx_qvlfs:
+ case Intrinsic::ppc_qpx_qvlfcd:
+ case Intrinsic::ppc_qpx_qvlfcs:
+ case Intrinsic::ppc_qpx_qvlfiwa:
+ case Intrinsic::ppc_qpx_qvlfiwz:
+ case Intrinsic::ppc_altivec_lvx:
+ case Intrinsic::ppc_altivec_lvxl:
+ case Intrinsic::ppc_altivec_lvebx:
+ case Intrinsic::ppc_altivec_lvehx:
+ case Intrinsic::ppc_altivec_lvewx:
+ case Intrinsic::ppc_vsx_lxvd2x:
+ case Intrinsic::ppc_vsx_lxvw4x: {
+ EVT VT;
+ switch (Intrinsic) { // pick the memory type; intrinsics not listed here fall to the v4i32 default
+ case Intrinsic::ppc_altivec_lvebx:
+ VT = MVT::i8;
+ break;
+ case Intrinsic::ppc_altivec_lvehx:
+ VT = MVT::i16;
+ break;
+ case Intrinsic::ppc_altivec_lvewx:
+ VT = MVT::i32;
+ break;
+ case Intrinsic::ppc_vsx_lxvd2x:
+ VT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_qpx_qvlfd:
+ VT = MVT::v4f64;
+ break;
+ case Intrinsic::ppc_qpx_qvlfs:
+ VT = MVT::v4f32;
+ break;
+ case Intrinsic::ppc_qpx_qvlfcd:
+ VT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_qpx_qvlfcs:
+ VT = MVT::v2f32;
+ break;
+ default:
+ VT = MVT::v4i32;
+ break;
+ }
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = VT;
+ Info.ptrVal = I.getArgOperand(0); // the pointer is the first call argument for loads
+ Info.offset = -VT.getStoreSize()+1; // window starts (store size - 1) bytes before the pointer ...
+ Info.size = 2*VT.getStoreSize()-1; // ... and ends (store size - 1) bytes after it; NOTE(review): presumably covers address truncation by the instruction - confirm
+ Info.align = 1;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::ppc_qpx_qvlfda: // Group 2: QPX "a"-suffixed loads, described as exactly one store-size access at the pointer
+ case Intrinsic::ppc_qpx_qvlfsa:
+ case Intrinsic::ppc_qpx_qvlfcda:
+ case Intrinsic::ppc_qpx_qvlfcsa:
+ case Intrinsic::ppc_qpx_qvlfiwaa:
+ case Intrinsic::ppc_qpx_qvlfiwza: {
+ EVT VT;
+ switch (Intrinsic) {
+ case Intrinsic::ppc_qpx_qvlfda:
+ VT = MVT::v4f64;
+ break;
+ case Intrinsic::ppc_qpx_qvlfsa:
+ VT = MVT::v4f32;
+ break;
+ case Intrinsic::ppc_qpx_qvlfcda:
+ VT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_qpx_qvlfcsa:
+ VT = MVT::v2f32;
+ break;
+ default:
+ VT = MVT::v4i32;
+ break;
+ }
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = VT;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0; // no widening: the access starts at the pointer
+ Info.size = VT.getStoreSize();
+ Info.align = 1;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::ppc_qpx_qvstfd: // Group 3: stores described with the same widened window as group 1
+ case Intrinsic::ppc_qpx_qvstfs:
+ case Intrinsic::ppc_qpx_qvstfcd:
+ case Intrinsic::ppc_qpx_qvstfcs:
+ case Intrinsic::ppc_qpx_qvstfiw:
+ case Intrinsic::ppc_altivec_stvx:
+ case Intrinsic::ppc_altivec_stvxl:
+ case Intrinsic::ppc_altivec_stvebx:
+ case Intrinsic::ppc_altivec_stvehx:
+ case Intrinsic::ppc_altivec_stvewx:
+ case Intrinsic::ppc_vsx_stxvd2x:
+ case Intrinsic::ppc_vsx_stxvw4x: {
+ EVT VT;
+ switch (Intrinsic) {
+ case Intrinsic::ppc_altivec_stvebx:
+ VT = MVT::i8;
+ break;
+ case Intrinsic::ppc_altivec_stvehx:
+ VT = MVT::i16;
+ break;
+ case Intrinsic::ppc_altivec_stvewx:
+ VT = MVT::i32;
+ break;
+ case Intrinsic::ppc_vsx_stxvd2x:
+ VT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_qpx_qvstfd:
+ VT = MVT::v4f64;
+ break;
+ case Intrinsic::ppc_qpx_qvstfs:
+ VT = MVT::v4f32;
+ break;
+ case Intrinsic::ppc_qpx_qvstfcd:
+ VT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_qpx_qvstfcs:
+ VT = MVT::v2f32;
+ break;
+ default:
+ VT = MVT::v4i32;
+ break;
+ }
+
+ Info.opc = ISD::INTRINSIC_VOID; // stores are modelled as void intrinsics
+ Info.memVT = VT;
+ Info.ptrVal = I.getArgOperand(1); // for stores the pointer is the second call argument
+ Info.offset = -VT.getStoreSize()+1;
+ Info.size = 2*VT.getStoreSize()-1;
+ Info.align = 1;
+ Info.vol = false;
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ case Intrinsic::ppc_qpx_qvstfda: // Group 4: QPX "a"-suffixed stores, exactly one store-size access at the pointer
+ case Intrinsic::ppc_qpx_qvstfsa:
+ case Intrinsic::ppc_qpx_qvstfcda:
+ case Intrinsic::ppc_qpx_qvstfcsa:
+ case Intrinsic::ppc_qpx_qvstfiwa: {
+ EVT VT;
+ switch (Intrinsic) {
+ case Intrinsic::ppc_qpx_qvstfda:
+ VT = MVT::v4f64;
+ break;
+ case Intrinsic::ppc_qpx_qvstfsa:
+ VT = MVT::v4f32;
+ break;
+ case Intrinsic::ppc_qpx_qvstfcda:
+ VT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_qpx_qvstfcsa:
+ VT = MVT::v2f32;
+ break;
+ default:
+ VT = MVT::v4i32;
+ break;
+ }
+
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = VT;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.offset = 0;
+ Info.size = VT.getStoreSize();
+ Info.align = 1;
+ Info.vol = false;
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ default:
+ break;
+ }
+
+ return false; // not one of the intrinsics handled above
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero that means the destination
+/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+/// means there isn't a need to check it against alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic. (This implementation always returns a concrete type: v4f64, v4i32, i64 or i32.)
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const { // ZeroMemset and MemcpyStrSrc are not consulted below
+ if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { // vector types are only considered when optimizing
+ const Function *F = MF.getFunction();
+ // When expanding a memset, require at least two QPX instructions to cover
+ // the cost of loading the value to be stored from the constant pool.
+ if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
+ (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
+ !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ return MVT::v4f64;
+ }
+
+ // We should use Altivec/VSX loads and stores when available. For unaligned
+ // addresses, unaligned VSX loads are only fast starting with the P8.
+ if (Subtarget.hasAltivec() && Size >= 16 &&
+ (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) || // both sides 16-byte aligned (or unconstrained), or ...
+ ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector()))) // ... a memset with VSX, or any P8-vector subtarget
+ return MVT::v4i32;
+ }
+
+ if (Subtarget.isPPC64()) { // scalar fallback: the widest general-purpose integer type
+ return MVT::i64;
+ }
+
+ return MVT::i32;
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself. Here: true for any integer type of 1 to 64 bits; Imm itself is not examined.
+bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy()); // callers must pass an integer type
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ return !(BitSize == 0 || BitSize > 64); // i.e. 1 <= BitSize <= 64
+}
+
+bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { // IR-type overload: is truncating Ty1 to Ty2 free?
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false; // only integer-to-integer truncates are considered
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ return NumBits1 == 64 && NumBits2 == 32; // exactly the 64-bit -> 32-bit case is reported as free
+}
+
+bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { // EVT overload: is truncating VT1 to VT2 free?
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false; // only integer-to-integer truncates are considered
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ return NumBits1 == 64 && NumBits2 == 32; // exactly the 64-bit -> 32-bit case is reported as free
+}
+
+bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ // Generally speaking, zexts are not free, but they are free when they can be
+ // folded with other operations.
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
+ EVT MemVT = LD->getMemoryVT();
+ if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
+ (Subtarget.isPPC64() && MemVT == MVT::i32)) && // i32 memory type qualifies only on PPC64
+ (LD->getExtensionType() == ISD::NON_EXTLOAD ||
+ LD->getExtensionType() == ISD::ZEXTLOAD)) // sign-extending and any-extending loads are excluded
+ return true;
+ }
+
+ // FIXME: Add other cases...
+ // - 32-bit shifts with a zext to i64
+ // - zext after ctlz, bswap, etc.
+ // - zext after and by a constant mask
+
+ return TargetLowering::isZExtFree(Val, VT2); // everything else defers to the generic implementation
+}
+
+bool PPCTargetLowering::isFPExtFree(EVT VT) const { // Reports every floating-point extension as free.
+ assert(VT.isFloatingPoint()); // VT is only sanity-checked; it does not affect the result
+ return true;
+}
+
+bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { // Compare immediates: legal when Imm fits in 16 bits.
+ return isInt<16>(Imm) || isUInt<16>(Imm); // signed or unsigned 16-bit, i.e. -32768..65535
+}
+
+bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { // Add immediates: same 16-bit test as isLegalICmpImmediate.
+ return isInt<16>(Imm) || isUInt<16>(Imm); // signed or unsigned 16-bit, i.e. -32768..65535
+}
+
+bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned, // unnamed parameter: ignored
+ unsigned, // unnamed parameter: ignored
+ bool *Fast) const {
+ if (DisablePPCUnaligned) // defined outside this function; NOTE(review): presumably a command-line option - confirm
+ return false;
+
+ // PowerPC supports unaligned memory access for simple non-vector types.
+ // Although accessing unaligned addresses is not as efficient as accessing
+ // aligned addresses, it is generally more efficient than manual expansion,
+ // and generally only traps for software emulation when crossing page
+ // boundaries.
+
+ if (!VT.isSimple())
+ return false;
+
+ if (VT.getSimpleVT().isVector()) {
+ if (Subtarget.hasVSX()) { // with VSX, only the four vector types below are allowed
+ if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
+ VT != MVT::v4f32 && VT != MVT::v4i32)
+ return false;
+ } else {
+ return false; // without VSX, no vector type is allowed
+ }
+ }
+
+ if (VT == MVT::ppcf128)
+ return false;
+
+ if (Fast) // Fast is optional; when supplied, every allowed access is reported as fast
+ *Fast = true;
+
+ return true;
+}
+
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { // True when the scalar element type of VT is f32 or f64.
+ VT = VT.getScalarType(); // vectors are judged by their element type
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ default:
+ break;
+ }
+
+ return false; // any other element type
+}
+
+const MCPhysReg *
+PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { // the calling convention argument is unnamed and ignored
+ // LR is a callee-save register, but we must treat it as clobbered by any call
+ // site. Hence we include LR in the scratch registers, which are in turn added
+ // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
+ // to CTR, which is used by any indirect call.
+ static const MCPhysReg ScratchRegs[] = {
+ PPC::X12, PPC::LR8, PPC::CTR8, 0 // NOTE(review): the trailing 0 presumably terminates the list - confirm with callers
+ };
+
+ return ScratchRegs; // pointer to static storage, same array on every call
+}
+
+unsigned PPCTargetLowering::getExceptionPointerRegister(
+ const Constant *PersonalityFn) const { // PersonalityFn is not consulted
+ return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; // X3 on 64-bit subtargets, R3 otherwise
+}
+
+unsigned PPCTargetLowering::getExceptionSelectorRegister(
+ const Constant *PersonalityFn) const { // PersonalityFn is not consulted
+ return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; // X4 on 64-bit subtargets, R4 otherwise
+}
+
+bool
+PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
+ EVT VT , unsigned DefinedValues) const {
+ if (VT == MVT::v2i64) // v2i64 is decided solely by direct-move support
+ return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
+
+ if (Subtarget.hasVSX() || Subtarget.hasQPX()) // any other type: always expand with shuffles on VSX or QPX subtargets
+ return true;
+
+ return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); // otherwise use the generic policy
+}
+
+Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
+ if (DisableILPPref || Subtarget.enableMachineScheduler()) // DisableILPPref is defined outside this function
+ return TargetLowering::getSchedulingPreference(N); // generic preference when ILP is disabled or the machine scheduler is enabled
+
+ return Sched::ILP; // default for this target
+}
+
+// Create a fast isel object by forwarding both arguments to PPC::createFastISel.
+FastISel *
+PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo) const {
+ return PPC::createFastISel(FuncInfo, LibInfo);
+}
+
+void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Marks the function containing Entry as using split CSR.
+ if (Subtarget.isDarwinABI()) return; // nothing is done for the Darwin ABI
+ if (!Subtarget.isPPC64()) return; // nothing is done for 32-bit subtargets
+
+ // Update IsSplitCSR in PPCFunctionInfo
+ PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
+ PFI->setIsSplitCSR(true);
+}
+
+void PPCTargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const { // Saves each via-copy CSR into a virtual register in Entry and restores it in every block of Exits.
+ const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart) // no via-copy callee-saved registers: nothing to insert
+ return;
+
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin(); // all save copies are inserted before this position
+ for (const MCPhysReg *I = IStart; *I; ++I) { // the list is walked until a zero entry
+ const TargetRegisterClass *RC = nullptr;
+ if (PPC::G8RCRegClass.contains(*I)) // choose the register class of the virtual register that will hold the copy
+ RC = &PPC::G8RCRegClass;
+ else if (PPC::F8RCRegClass.contains(*I))
+ RC = &PPC::F8RCRegClass;
+ else if (PPC::CRRCRegClass.contains(*I))
+ RC = &PPC::CRRCRegClass;
+ else if (PPC::VRRCRegClass.contains(*I))
+ RC = &PPC::VRRCRegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ // FIXME: this currently does not emit CFI pseudo-instructions, it works
+ // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+ // nounwind. If we want to generalize this later, we may need to emit
+ // CFI pseudo-instructions.
+ assert(Entry->getParent()->getFunction()->hasFnAttribute(
+ Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
+ Entry->addLiveIn(*I); // the physical CSR is read by the COPY below, so it is added as a live-in of Entry
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions right before the terminator
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
+
+// Override to enable LOAD_STACK_GUARD lowering on Linux.
+bool PPCTargetLowering::useLoadStackGuardNode() const {
+ if (!Subtarget.isTargetLinux())
+ return TargetLowering::useLoadStackGuardNode(); // non-Linux targets keep the generic answer
+ return true; // always enabled on Linux
+}
+
+// Override to disable global variable loading on Linux: on Linux this function deliberately does nothing.
+void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
+ if (!Subtarget.isTargetLinux())
+ return TargetLowering::insertSSPDeclarations(M); // non-Linux targets use the generic declarations
+}
+
+bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // True only for +0.0 of type f32/f64/ppcf128 on a VSX subtarget.
+
+ if (!VT.isSimple() || !Subtarget.hasVSX()) // without VSX no FP immediate is reported as legal
+ return false;
+
+ switch(VT.getSimpleVT().SimpleTy) {
+ default:
+ // For FP types that are currently not supported by PPC backend, return
+ // false. Examples: f16, f80.
+ return false;
+ case MVT::f32:
+ case MVT::f64:
+ case MVT::ppcf128:
+ return Imm.isPosZero(); // positive zero only; any other value is rejected
+ }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
new file mode 100644
index 000000000000..d3c88482f092
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -0,0 +1,1031 @@
+//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PPC uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
+#define LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+ namespace PPCISD {
+ enum NodeType : unsigned {
+ // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// FSEL - Traditional three-operand fsel node.
+ ///
+ FSEL,
+
+ /// FCFID - The FCFID instruction, taking an f64 operand and producing
+ /// an f64 value containing the FP representation of the integer that
+ /// was temporarily in the f64 operand.
+ FCFID,
+
+ /// Newer FCFID[US] integer-to-floating-point conversion instructions for
+ /// unsigned integers and single-precision outputs.
+ FCFIDU, FCFIDS, FCFIDUS,
+
+ /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
+ /// operand, producing an f64 value containing the integer representation
+ /// of that FP value.
+ FCTIDZ, FCTIWZ,
+
+ /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for
+ /// unsigned integers.
+ FCTIDUZ, FCTIWUZ,
+
+ /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in
+ /// VSFRC that is sign-extended from ByteWidth to a 64-bit integer.
+ VEXTS,
+
+ /// Reciprocal estimate instructions (unary FP ops).
+ FRE, FRSQRTE,
+
+ // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
+ // three v4f32 operands and producing a v4f32 result.
+ VMADDFP, VNMSUBFP,
+
+ /// VPERM - The PPC VPERM Instruction.
+ ///
+ VPERM,
+
+ /// XXSPLT - The PPC VSX splat instructions
+ ///
+ XXSPLT,
+
+ /// XXINSERT - The PPC VSX insert instruction
+ ///
+ XXINSERT,
+
+ /// VECSHL - The PPC VSX shift left instruction
+ ///
+ VECSHL,
+
+ /// The CMPB instruction (takes two operands of i32 or i64).
+ CMPB,
+
+ /// Hi/Lo - These represent the high and low 16-bit parts of a global
+ /// address respectively. These nodes have two operands, the first of
+ /// which must be a TargetGlobalAddress, and the second of which must be a
+ /// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C',
+ /// though these are usually folded into other nodes.
+ Hi, Lo,
+
+ /// The following two target-specific nodes are used for calls through
+ /// function pointers in the 64-bit SVR4 ABI.
+
+ /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
+ /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+ /// compute an allocation on the stack.
+ DYNALLOC,
+
+ /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+ /// compute an offset from native SP to the address of the most recent
+ /// dynamic alloca.
+ DYNAREAOFFSET,
+
+ /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
+ /// shift amounts. These nodes are generated by the multi-precision shift
+ /// code.
+ SRL, SRA, SHL,
+
+ /// The combination of sra[wd]i and addze used to implement signed
+ /// integer division by a power of 2. The first operand is the dividend,
+ /// and the second is the constant shift amount (representing the
+ /// divisor).
+ SRA_ADDZE,
+
+ /// CALL - A direct function call.
+ /// CALL_NOP is a call with the special NOP which follows 64-bit
+ /// SVR4 calls.
+ CALL, CALL_NOP,
+
+ /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+ /// MTCTR instruction.
+ MTCTR,
+
+ /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+ /// BCTRL instruction.
+ BCTRL,
+
+ /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl
+ /// instruction and the TOC reload required on SVR4 PPC64.
+ BCTRL_LOAD_TOC,
+
+ /// Return with a flag operand, matched by 'blr'
+ RET_FLAG,
+
+ /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
+ /// This copies the bits corresponding to the specified CRREG into the
+ /// resultant GPR. Bits corresponding to other CR regs are undefined.
+ MFOCRF,
+
+ /// Direct move from a VSX register to a GPR
+ MFVSR,
+
+ /// Direct move from a GPR to a VSX register (algebraic)
+ MTVSRA,
+
+ /// Direct move from a GPR to a VSX register (zero)
+ MTVSRZ,
+
+ /// Extract a subvector from signed integer vector and convert to FP.
+ /// It is primarily used to convert a (widened) illegal integer vector
+ /// type to a legal floating point vector type.
+ /// For example v2i32 -> widened to v4i32 -> v2f64
+ SINT_VEC_TO_FP,
+
+ /// Extract a subvector from unsigned integer vector and convert to FP.
+ /// As with SINT_VEC_TO_FP, used for converting illegal types.
+ UINT_VEC_TO_FP,
+
+ // FIXME: Remove these once the ANDI glue bug is fixed:
+ /// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
+ /// eq or gt bit of CR0 after executing andi. x, 1. This is used to
+ /// implement truncation of i32 or i64 to i1.
+ ANDIo_1_EQ_BIT, ANDIo_1_GT_BIT,
+
+ // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit
+ // target (returns (Lo, Hi)). It takes a chain operand.
+ READ_TIME_BASE,
+
+ // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
+ /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
+ /// instructions. For lack of better number, we use the opcode number
+ /// encoding for the OPC field to identify the compare. For example, 838
+ /// is VCMPGTSH.
+ VCMP,
+
+ /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
+ /// altivec VCMP*o instructions. For lack of better number, we use the
+ /// opcode number encoding for the OPC field to identify the compare. For
+ /// example, 838 is VCMPGTSH.
+ VCMPo,
+
+ /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
+ /// corresponds to the COND_BRANCH pseudo instruction. CRRC is the
+ /// condition register to branch on, OPC is the branch opcode to use (e.g.
+ /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
+ /// an optional input flag argument.
+ COND_BRANCH,
+
+ /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
+ /// loops.
+ BDNZ, BDZ,
+
+ /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
+ /// towards zero. Used only as part of the long double-to-int
+ /// conversion sequence.
+ FADDRTZ,
+
+ /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register.
+ MFFS,
+
+ /// TC_RETURN - A tail call return.
+ /// operand #0 chain
+ /// operand #1 callee (register or absolute)
+ /// operand #2 stack adjustment
+ /// operand #3 optional in flag
+ TC_RETURN,
+
+ /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+ CR6SET,
+ CR6UNSET,
+
+ /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
+ /// on PPC32.
+ PPC32_GOT,
+
+ /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
+ /// local dynamic TLS on PPC32.
+ PPC32_PICGOT,
+
+ /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
+ /// TLS model, produces an ADDIS8 instruction that adds the GOT
+ /// base to sym\@got\@tprel\@ha.
+ ADDIS_GOT_TPREL_HA,
+
+ /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
+ /// TLS model, produces a LD instruction with base register G8RReg
+ /// and offset sym\@got\@tprel\@l. This completes the addition that
+ /// finds the offset of "sym" relative to the thread pointer.
+ LD_GOT_TPREL_L,
+
+ /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
+ /// model, produces an ADD instruction that adds the contents of
+ /// G8RReg to the thread pointer. Symbol contains a relocation
+ /// sym\@tls which is to be replaced by the thread pointer and
+ /// identifies to the linker that the instruction is part of a
+ /// TLS sequence.
+ ADD_TLS,
+
+ /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS
+ /// model, produces an ADDIS8 instruction that adds the GOT base
+ /// register to sym\@got\@tlsgd\@ha.
+ ADDIS_TLSGD_HA,
+
+ /// %X3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
+ /// model, produces an ADDI8 instruction that adds G8RReg to
+ /// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by
+ /// ADDI_TLSGD_L_ADDR until after register assignment.
+ ADDI_TLSGD_L,
+
+ /// %X3 = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS
+ /// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by
+ /// ADDI_TLSGD_L_ADDR until after register assignment.
+ GET_TLS_ADDR,
+
+ /// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that
+ /// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following
+ /// register assignment.
+ ADDI_TLSGD_L_ADDR,
+
+ /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS
+ /// model, produces an ADDIS8 instruction that adds the GOT base
+ /// register to sym\@got\@tlsld\@ha.
+ ADDIS_TLSLD_HA,
+
+ /// %X3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
+ /// model, produces an ADDI8 instruction that adds G8RReg to
+ /// sym\@got\@tlsld\@l and stores the result in X3. Hidden by
+ /// ADDI_TLSLD_L_ADDR until after register assignment.
+ ADDI_TLSLD_L,
+
+ /// %X3 = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS
+ /// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by
+ /// ADDI_TLSLD_L_ADDR until after register assignment.
+ GET_TLSLD_ADDR,
+
+ /// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that
+ /// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion
+ /// following register assignment.
+ ADDI_TLSLD_L_ADDR,
+
+ /// G8RC = ADDIS_DTPREL_HA %X3, Symbol - For the local-dynamic TLS
+ /// model, produces an ADDIS8 instruction that adds X3 to
+ /// sym\@dtprel\@ha.
+ ADDIS_DTPREL_HA,
+
+ /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
+ /// model, produces an ADDI8 instruction that adds G8RReg to
+ /// sym\@got\@dtprel\@l.
+ ADDI_DTPREL_L,
+
+ /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
+ /// during instruction selection to optimize a BUILD_VECTOR into
+ /// operations on splats. This is necessary to avoid losing these
+ /// optimizations due to constant folding.
+ VADD_SPLAT,
+
+ /// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned
+ /// operand identifies the operating system entry point.
+ SC,
+
+ /// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer.
+ CLRBHRB,
+
+ /// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch
+ /// history rolling buffer entry.
+ MFBHRBE,
+
+ /// CHAIN = RFEBB CHAIN, State - Return from event-based branch.
+ RFEBB,
+
+ /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
+ /// endian. Maps to an xxswapd instruction that corrects an lxvd2x
+ /// or stxvd2x instruction. The chain is necessary because the
+ /// sequence replaces a load and needs to provide the same number
+ /// of outputs.
+ XXSWAPD,
+
+ /// An SDNode for swaps that are not associated with any loads/stores
+ /// and thereby have no chain.
+ SWAP_NO_CHAIN,
+
+ /// QVFPERM = This corresponds to the QPX qvfperm instruction.
+ QVFPERM,
+
+ /// QVGPCI = This corresponds to the QPX qvgpci instruction.
+ QVGPCI,
+
+ /// QVALIGNI = This corresponds to the QPX qvaligni instruction.
+ QVALIGNI,
+
+ /// QVESPLATI = This corresponds to the QPX qvesplati instruction.
+ QVESPLATI,
+
+ /// QBFLT = Access the underlying QPX floating-point boolean
+ /// representation.
+ QBFLT,
+
+ /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
+ /// byte-swapping store instruction. It byte-swaps the low "Type" bits of
+ /// the GPRC input, then stores it through Ptr. Type can be either i16 or
+ /// i32.
+ STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE,
+
+ /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
+ /// byte-swapping load instruction. It loads "Type" bits, byte swaps it,
+ /// then puts it in the bottom bits of the GPRC. TYPE can be either i16
+ /// or i32.
+ LBRX,
+
+ /// STFIWX - The STFIWX instruction. The first operand is an input token
+ /// chain, then an f64 value to store, then an address to store it to.
+ STFIWX,
+
+ /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point
+ /// load which sign-extends from a 32-bit integer value into the
+ /// destination 64-bit register.
+ LFIWAX,
+
+ /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point
+ /// load which zero-extends from a 32-bit integer value into the
+ /// destination 64-bit register.
+ LFIWZX,
+
+ /// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an
+ /// integer smaller than 64 bits into a VSR. The integer is zero-extended.
+ /// This can be used for converting loaded integers to floating point.
+ LXSIZX,
+
+ /// STXSIX - The STXSI[bh]X instruction. The first operand is an input
+ /// chain, then an f64 value to store, then an address to store it to,
+ /// followed by a byte-width for the store.
+ STXSIX,
+
+ /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
+ /// Maps directly to an lxvd2x instruction that will be followed by
+ /// an xxswapd.
+ LXVD2X,
+
+ /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
+ /// Maps directly to an stxvd2x instruction that will be preceded by
+ /// an xxswapd.
+ STXVD2X,
+
+ /// QBRC, CHAIN = QVLFSb CHAIN, Ptr
+ /// The 4xf32 load used for v4i1 constants.
+ QVLFSb,
+
+ /// GPRC = TOC_ENTRY GA, TOC
+ /// Loads the entry for GA from the TOC, where the TOC base is given by
+ /// the last operand.
+ TOC_ENTRY
+ };
+ }
+
+ /// Define some predicates that are used for node matching.
+ namespace PPC {
+ /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+ /// VPKUHUM instruction.
+ bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+ SelectionDAG &DAG);
+
+ /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+ /// VPKUWUM instruction.
+ bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+ SelectionDAG &DAG);
+
+ /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
+ /// VPKUDUM instruction.
+ bool isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+ SelectionDAG &DAG);
+
+ /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+ /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
+ bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+ unsigned ShuffleKind, SelectionDAG &DAG);
+
+ /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+ /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
+ bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+ unsigned ShuffleKind, SelectionDAG &DAG);
+
+ /// isVMRGEOShuffleMask - Return true if this is a shuffle mask suitable for
+ /// a VMRGEW or VMRGOW instruction
+ bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
+ unsigned ShuffleKind, SelectionDAG &DAG);
+
+ /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
+ /// shift amount, otherwise return -1.
+ int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+ SelectionDAG &DAG);
+
+ /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a splat of a single element that is suitable for input to
+ /// VSPLTB/VSPLTH/VSPLTW.
+ bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
+
+ /// isXXINSERTWMask - Return true if this VECTOR_SHUFFLE can be handled by
+ /// the XXINSERTW instruction introduced in ISA 3.0. This is essentially any
+ /// shuffle of v4f32/v4i32 vectors that just inserts one element from one
+ /// vector into the other. This function will also set a couple of
+ /// output parameters for how much the source vector needs to be shifted and
+ /// what byte number needs to be specified for the instruction to put the
+ /// element in the desired location of the target vector.
+ bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ unsigned &InsertAtByte, bool &Swap, bool IsLE);
+
+ /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+ /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+ unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
+
+ /// get_VSPLTI_elt - If this is a build_vector of constants which can be
+ /// formed by using a vspltis[bhw] instruction of the specified element
+ /// size, return the constant being splatted. The ByteSize field indicates
+ /// the number of bytes of each element [124] -> [bhw].
+ SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+
+ /// If this is a qvaligni shuffle mask, return the shift
+ /// amount, otherwise return -1.
+ int isQVALIGNIShuffleMask(SDNode *N);
+ }
+
+ class PPCTargetLowering : public TargetLowering {
+ const PPCSubtarget &Subtarget;
+
+ public:
+ explicit PPCTargetLowering(const PPCTargetMachine &TM,
+ const PPCSubtarget &STI);
+
+ /// getTargetNodeName() - This method returns the name of a target specific
+ /// DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ /// getPreferredVectorAction - The code we generate when vector types are
+ /// legalized by promoting the integer element type is often much worse
+ /// than code we generate if we widen the type for applicable vector types.
+ /// The issue with promoting is that the vector is scalarized, individual
+ /// elements promoted and then the vector is rebuilt. So say we load a pair
+ /// of v4i8's and shuffle them. This will turn into a mess of 8 extending
+ /// loads, moves back into VSR's (or memory ops if we don't have moves) and
+ /// then the VPERM for the shuffle. All in all a very slow sequence.
+ TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+ const override {
+   // Widen whenever the scalar element is a whole number of bytes; every
+   // other type is left to the generic TargetLoweringBase policy.
+   const bool ByteSizedElts = (VT.getScalarSizeInBits() % 8) == 0;
+   return ByteSizedElts ? TypeWidenVector
+                        : TargetLoweringBase::getPreferredVectorAction(VT);
+ }
+ bool useSoftFloat() const override;
+
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
+
+ bool isCheapToSpeculateCttz() const override {
+ return true;
+ }
+
+ bool isCheapToSpeculateCtlz() const override {
+ return true;
+ }
+
+ bool isCtlzFast() const override {
+ return true;
+ }
+
+ bool hasAndNotCompare(SDValue) const override {
+ return true;
+ }
+
+ bool supportSplitCSR(MachineFunction *MF) const override {
+   // Split CSR handling is reported only for functions that use the
+   // CXX_FAST_TLS calling convention and carry the nounwind attribute.
+   const auto *Fn = MF->getFunction();
+   if (Fn->getCallingConv() != CallingConv::CXX_FAST_TLS)
+     return false;
+   return Fn->hasFnAttribute(Attribute::NoUnwind);
+ }
+
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ /// Return true if target always benefits from combining into FMA for a
+ /// given value type. This must typically return false on targets where FMA
+ /// takes more cycles to execute than FADD.
+ bool enableAggressiveFMAFusion(EVT VT) const override;
+
+ /// getPreIndexedAddressParts - returns true by value, base pointer and
+ /// offset pointer and addressing mode by reference if the node's address
+ /// can be legally represented as pre-indexed load / store address.
+ bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ /// SelectAddressRegReg - Given the specified address, check to see if it
+ /// can be represented as an indexed [r+r] operation. Returns false if it
+ /// can be more efficiently represented with [r+imm].
+ bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
+ SelectionDAG &DAG) const;
+
+ /// SelectAddressRegImm - Returns true if the address N can be represented
+ /// by a base register plus a signed 16-bit displacement [r+imm], and if it
+ /// is not better represented as reg+reg. If Aligned is true, only accept
+ /// displacements suitable for STD and friends, i.e. multiples of 4.
+ bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
+ SelectionDAG &DAG, bool Aligned) const;
+
+ /// SelectAddressRegRegOnly - Given the specified address, force it to be
+ /// represented as an indexed [r+r] operation.
+ bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
+ SelectionDAG &DAG) const;
+
+ Sched::Preference getSchedulingPreference(SDNode *N) const override;
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ ///
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// ReplaceNodeResults - Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
+
+ SDValue expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ std::vector<SDNode *> *Created) const override;
+
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return true;
+ }
+
+ Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+ bool IsStore, bool IsLoad) const override;
+ Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+ bool IsStore, bool IsLoad) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+ MachineBasicBlock *EmitAtomicBinary(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ unsigned AtomicSize,
+ unsigned BinOpcode,
+ unsigned CmpOpcode = 0,
+ unsigned CmpPred = 0) const;
+ MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ bool is8bit,
+ unsigned Opcode,
+ unsigned CmpOpcode = 0,
+ unsigned CmpPred = 0) const;
+
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const override;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. This is the actual
+ /// alignment, not its logarithm.
+ unsigned getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const override;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops.
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+   // Memory constraint codes recognised here map directly onto their
+   // InlineAsm::Constraint_* identifiers.
+   if (ConstraintCode == "es")
+     return InlineAsm::Constraint_es;
+   if (ConstraintCode == "o")
+     return InlineAsm::Constraint_o;
+   if (ConstraintCode == "Q")
+     return InlineAsm::Constraint_Q;
+   if (ConstraintCode == "Z")
+     return InlineAsm::Constraint_Z;
+   if (ConstraintCode == "Zy")
+     return InlineAsm::Constraint_Zy;
+
+   // Anything else is resolved by the generic TargetLowering implementation.
+   return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
+
+ /// isLegalICmpImmediate - Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+
+ /// isLegalAddImmediate - Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can
+ /// add a register and the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalAddImmediate(int64_t Imm) const override;
+
+ /// isTruncateFree - Return true if it's free to truncate a value of
+ /// type Ty1 to type Ty2. e.g. On PPC it's free to truncate a i64 value in
+ /// register X1 to i32 by referencing its sub-register R1.
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ bool isFPExtFree(EVT VT) const override;
+
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ unsigned Intrinsic) const override;
+
+ /// getOptimalMemOpType - Returns the target specific optimal type for load
+ /// and store operations as a result of memset, memcpy, and memmove
+ /// lowering. If DstAlign is zero that means the destination
+ /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+ /// means there isn't a need to check it against alignment requirement,
+ /// probably because the source does not need to be loaded. If 'IsMemset' is
+ /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+ /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+ /// source is constant so it does not need to be loaded.
+ /// It returns MVT::Other if the type should be determined using generic
+ /// target-independent logic.
+ EVT
+ getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ /// Is unaligned memory access allowed for the given type, and is it fast
+ /// relative to software emulation.
+ bool allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
+ unsigned Align = 1,
+ bool *Fast = nullptr) const override;
+
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+ // Should we expand the build vector with shuffles?
+ bool
+ shouldExpandBuildVectorWithShuffles(EVT VT,
+ unsigned DefinedValues) const override;
+
+ /// createFastISel - This method returns a target-specific FastISel object,
+ /// or null if the target does not support "fast" instruction selection.
+ FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo) const override;
+
+ /// \brief Returns true if an argument of type Ty needs to be passed in a
+ /// contiguous block of registers in calling convention CallConv.
+ bool functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
+ // We support any array type as "consecutive" block in the parameter
+ // save area. The element type defines the alignment requirement and
+ // whether the argument should go in GPRs, FPRs, or VRs if available.
+ //
+ // Note that clang uses this capability both to implement the ELFv2
+ // homogeneous float/vector aggregate ABI, and to avoid having to use
+ // "byval" when passing aggregates that might fully fit in registers.
+ return Ty->isArrayTy();
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+ /// Override to support customized stack guard loading.
+ bool useLoadStackGuardNode() const override;
+ void insertSSPDeclarations(Module &M) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ unsigned getJumpTableEncoding() const override;
+ bool isJumpTableRelative() const override;
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const override;
+ const MCExpr *getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI,
+ MCContext &Ctx) const override;
+
+ private:
+ /// Bundle of values filled in by canReuseLoadAddress(): the address and
+ /// chain values plus the memory-operand properties of the access.
+ struct ReuseLoadInfo {
+   SDValue Ptr;
+   SDValue Chain;
+   SDValue ResChain;
+   MachinePointerInfo MPI;
+   bool IsDereferenceable = false;
+   bool IsInvariant = false;
+   unsigned Alignment = 0;
+   AAMDNodes AAInfo;
+   const MDNode *Ranges = nullptr;
+
+   ReuseLoadInfo() = default;
+
+   /// Translate the boolean properties above into MachineMemOperand flags.
+   MachineMemOperand::Flags MMOFlags() const {
+     MachineMemOperand::Flags Result =
+         IsDereferenceable ? MachineMemOperand::MODereferenceable
+                           : MachineMemOperand::MONone;
+     if (IsInvariant)
+       Result |= MachineMemOperand::MOInvariant;
+     return Result;
+   }
+ };
+
+ bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI,
+ SelectionDAG &DAG,
+ ISD::LoadExtType ET = ISD::NON_EXTLOAD) const;
+ void spliceIntoChain(SDValue ResChain, SDValue NewResChain,
+ SelectionDAG &DAG) const;
+
+ void LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
+ SelectionDAG &DAG, const SDLoc &dl) const;
+ SDValue LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG,
+ const SDLoc &dl) const;
+
+ bool directMoveIsProfitable(const SDValue &Op) const;
+ SDValue LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG,
+ const SDLoc &dl) const;
+
+ SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
+ SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
+
+ bool
+ IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
+
+ bool
+ IsEligibleForTailCallOptimization_64SVR4(
+ SDValue Callee,
+ CallingConv::ID CalleeCC,
+ ImmutableCallSite *CS,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
+
+ SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG &DAG, int SPDiff,
+ SDValue Chain, SDValue &LROpOut,
+ SDValue &FPOpOut,
+ const SDLoc &dl) const;
+
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+ const SDLoc &dl) const;
+ SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue FinishCall(CallingConv::ID CallConv, const SDLoc &dl,
+ bool isTailCall, bool isVarArg, bool isPatchPoint,
+ bool hasNest, SelectionDAG &DAG,
+ SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
+ SDValue InFlag, SDValue Chain, SDValue CallSeqStart,
+ SDValue &Callee, int SPDiff, unsigned NumBytes,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const;
+
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ bool
+ CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ SDValue extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
+ SelectionDAG &DAG, SDValue ArgVal,
+ const SDLoc &dl) const;
+
+ SDValue LowerFormalArguments_Darwin(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerFormalArguments_64SVR4(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerFormalArguments_32SVR4(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+ SDValue CallSeqStart,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ const SDLoc &dl) const;
+
+ SDValue LowerCall_Darwin(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const;
+ SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const;
+ SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const;
+
+ SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
+ /// SETCC with integer subtraction when (1) there is a legal way of doing it
+ /// (2) keeping the result of comparison in GPR has performance benefit.
+ SDValue ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps, bool &UseOneConstNR,
+ bool Reciprocal) const override;
+ SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps) const override;
+ unsigned combineRepeatedFPDivisors() const override;
+
+ CCAssignFn *useFastISelCCs(unsigned Flag) const;
+
+ SDValue
+ combineElementTruncationToVectorTruncation(SDNode *N,
+ DAGCombinerInfo &DCI) const;
+ };
+
+ namespace PPC {
+ FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo);
+ }
+
+ bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+
+ bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+
+ bool
+ CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+
+ bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+}
+
+#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
new file mode 100644
index 000000000000..03b2257a88a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -0,0 +1,1301 @@
+//===-- PPCInstr64Bit.td - The PowerPC 64-bit Support ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC 64-bit instructions. These patterns are used
+// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands.
+//
+def s16imm64 : Operand<i64> {
+ let PrintMethod = "printS16ImmOperand";
+ let EncoderMethod = "getImm16Encoding";
+ let ParserMatchClass = PPCS16ImmAsmOperand;
+ let DecoderMethod = "decodeSImmOperand<16>";
+}
+def u16imm64 : Operand<i64> {
+ let PrintMethod = "printU16ImmOperand";
+ let EncoderMethod = "getImm16Encoding";
+ let ParserMatchClass = PPCU16ImmAsmOperand;
+ let DecoderMethod = "decodeUImmOperand<16>";
+}
+def s17imm64 : Operand<i64> {
+ // This operand type is used for addis/lis to allow the assembler parser
+ // to accept immediates in the range -65536..65535 for compatibility with
+ // the GNU assembler. The operand is treated as 16-bit otherwise.
+ let PrintMethod = "printS16ImmOperand";
+ let EncoderMethod = "getImm16Encoding";
+ let ParserMatchClass = PPCS17ImmAsmOperand;
+ let DecoderMethod = "decodeSImmOperand<16>";
+}
+def tocentry : Operand<iPTR> {
+ let MIOperandInfo = (ops i64imm:$imm);
+}
+def tlsreg : Operand<i64> {
+ let EncoderMethod = "getTLSRegEncoding";
+ let ParserMatchClass = PPCTLSRegOperand;
+}
+def tlsgd : Operand<i64> {}
+def tlscall : Operand<i64> {
+ let PrintMethod = "printTLSCall";
+ let MIOperandInfo = (ops calltarget:$func, tlsgd:$sym);
+ let EncoderMethod = "getTLSCallEncoding";
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit transformation functions.
+//
+
+def SHL64 : SDNodeXForm<imm, [{
+ // Transformation function: 63 - imm
+ // The immediate is read zero-extended and the result is produced as an
+ // i32 immediate at the original node's SDLoc.
+ return getI32Imm(63 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+def SRL64 : SDNodeXForm<imm, [{
+ // Transformation function: 64 - imm, except that a zero immediate is
+ // mapped to 0 rather than 64. The result is produced as an i32 immediate.
+ return N->getZExtValue() ? getI32Imm(64 - N->getZExtValue(), SDLoc(N))
+ : getI32Imm(0, SDLoc(N));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calls.
+//
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+ let isReturn = 1, Uses = [LR8, RM] in
+ def BLR8 : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
+ [(retflag)]>, Requires<[In64BitMode]>;
+ let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in {
+ def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+ []>,
+ Requires<[In64BitMode]>;
+ def BCCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
+ "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
+ []>,
+ Requires<[In64BitMode]>;
+
+ def BCCTR8 : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
+ "bcctr 12, $bi, 0", IIC_BrB, []>,
+ Requires<[In64BitMode]>;
+ def BCCTR8n : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
+ "bcctr 4, $bi, 0", IIC_BrB, []>,
+ Requires<[In64BitMode]>;
+ }
+}
+
+let Defs = [LR8] in
+ def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>,
+ PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+ let Defs = [CTR8], Uses = [CTR8] in {
+ def BDZ8 : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz $dst">;
+ def BDNZ8 : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz $dst">;
+ }
+
+ let isReturn = 1, Defs = [CTR8], Uses = [CTR8, LR8, RM] in {
+ def BDZLR8 : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
+ "bdzlr", IIC_BrB, []>;
+ def BDNZLR8 : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
+ "bdnzlr", IIC_BrB, []>;
+ }
+}
+
+
+
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
+ // Convenient aliases for call instructions
+ let Uses = [RM] in {
+ def BL8 : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+ "bl $func", IIC_BrB, []>; // See Pat patterns below.
+
+ def BL8_TLS : IForm<18, 0, 1, (outs), (ins tlscall:$func),
+ "bl $func", IIC_BrB, []>;
+
+ def BLA8 : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+ "bla $func", IIC_BrB, [(PPCcall (i64 imm:$func))]>;
+ }
+ let Uses = [RM], isCodeGenOnly = 1 in {
+ def BL8_NOP : IForm_and_DForm_4_zero<18, 0, 1, 24,
+ (outs), (ins calltarget:$func),
+ "bl $func\n\tnop", IIC_BrB, []>;
+
+ def BL8_NOP_TLS : IForm_and_DForm_4_zero<18, 0, 1, 24,
+ (outs), (ins tlscall:$func),
+ "bl $func\n\tnop", IIC_BrB, []>;
+
+ def BLA8_NOP : IForm_and_DForm_4_zero<18, 1, 1, 24,
+ (outs), (ins abscalltarget:$func),
+ "bla $func\n\tnop", IIC_BrB,
+ [(PPCcall_nop (i64 imm:$func))]>;
+ }
+ let Uses = [CTR8, RM] in {
+ def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+ "bctrl", IIC_BrB, [(PPCbctrl)]>,
+ Requires<[In64BitMode]>;
+
+ let isCodeGenOnly = 1 in {
+ def BCCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
+ "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
+ []>,
+ Requires<[In64BitMode]>;
+
+ def BCCTRL8 : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
+ "bcctrl 12, $bi, 0", IIC_BrB, []>,
+ Requires<[In64BitMode]>;
+ def BCCTRL8n : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
+ "bcctrl 4, $bi, 0", IIC_BrB, []>,
+ Requires<[In64BitMode]>;
+ }
+ }
+}
+
+let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
+ Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in {
+ def BCTRL8_LDinto_toc :
+ XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs),
+ (ins memrix:$src),
+ "bctrl\n\tld 2, $src", IIC_BrB,
+ [(PPCbctrl_load_toc ixaddr:$src)]>,
+ Requires<[In64BitMode]>;
+}
+
+} // Interpretation64Bit
+
+// FIXME: Duplicating this for the asm parser should be unnecessary, but the
+// previous definition must be marked as CodeGen only to prevent decoding
+// conflicts.
+let Interpretation64Bit = 1, isAsmParserOnly = 1 in
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8], Uses = [RM] in
+def BL8_TLS_ : IForm<18, 0, 1, (outs), (ins tlscall:$func),
+ "bl $func", IIC_BrB, []>;
+
+// Calls
+def : Pat<(PPCcall (i64 tglobaladdr:$dst)),
+ (BL8 tglobaladdr:$dst)>;
+def : Pat<(PPCcall_nop (i64 tglobaladdr:$dst)),
+ (BL8_NOP tglobaladdr:$dst)>;
+
+def : Pat<(PPCcall (i64 texternalsym:$dst)),
+ (BL8 texternalsym:$dst)>;
+def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
+ (BL8_NOP texternalsym:$dst)>;
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+ let Defs = [CR0] in {
+ def ATOMIC_LOAD_ADD_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64",
+ [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>;
+ def ATOMIC_LOAD_SUB_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64",
+ [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>;
+ def ATOMIC_LOAD_OR_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64",
+ [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>;
+ def ATOMIC_LOAD_XOR_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64",
+ [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>;
+ // 64-bit atomic fetch-and-AND pseudo. The marker string is spelled with an
+ // upper-case "I64" so that it matches the def name and the other
+ // ATOMIC_LOAD_*_I64 pseudos.
+ def ATOMIC_LOAD_AND_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_I64",
+ [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>;
+ def ATOMIC_LOAD_NAND_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64",
+ [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>;
+ def ATOMIC_LOAD_MIN_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MIN_I64",
+ [(set i64:$dst, (atomic_load_min_64 xoaddr:$ptr, i64:$incr))]>;
+ def ATOMIC_LOAD_MAX_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MAX_I64",
+ [(set i64:$dst, (atomic_load_max_64 xoaddr:$ptr, i64:$incr))]>;
+ def ATOMIC_LOAD_UMIN_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMIN_I64",
+ [(set i64:$dst, (atomic_load_umin_64 xoaddr:$ptr, i64:$incr))]>;
+ def ATOMIC_LOAD_UMAX_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMAX_I64",
+ [(set i64:$dst, (atomic_load_umax_64 xoaddr:$ptr, i64:$incr))]>;
+
+ def ATOMIC_CMP_SWAP_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64",
+ [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>;
+
+ def ATOMIC_SWAP_I64 : Pseudo<
+ (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64",
+ [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>;
+ }
+}
+
+// Instructions to support atomic operations
+// Everything in this scope is flagged mayLoad with hasSideEffects = 0, and
+// none of the definitions carries a selection pattern.
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDARX : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
+ "ldarx $rD, $ptr", IIC_LdStLDARX, []>;
+
+// Instruction to support lock versions of atomics
+// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
+// Uses the same XForm_1<31, 84> opcode pair as LDARX; the printed form differs
+// only by the trailing ", 1" operand.
+// NOTE(review): isDOT on a load looks unusual - confirm it is intentional.
+def LDARXL : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
+ "ldarx $rD, $ptr, 1", IIC_LdStLDARX, []>, isDOT;
+
+// ldat is gated on IsISA3_0 and takes its function code as a u5imm operand.
+let hasExtraDefRegAllocReq = 1 in
+def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
+ "ldat $rD, $rA, $FC", IIC_LdStLoad>, isPPC64,
+ Requires<[IsISA3_0]>;
+}
+
+// stdcx. is marked isDOT and declares CR0 as an implicit def; it has no
+// selection pattern.
+let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in
+def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
+
+// stdat is gated on IsISA3_0 and takes its function code as a u5imm operand.
+let mayStore = 1, hasSideEffects = 0 in
+def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
+ "stdat $rS, $rA, $FC", IIC_LdStStore>, isPPC64,
+ Requires<[IsISA3_0]>;
+
+// 64-bit tail-call definitions. Every record in this scope is flagged isCall,
+// isTerminator, isReturn and isBarrier, and lists RM among its Uses.
+// The TCRETURN*8 records are Pseudos that print only a '#'-prefixed placeholder
+// string; TAILBCTR8, TAILB8 and TAILBA8 use real instruction formats
+// (XLForm_2_ext / IForm) and real mnemonics.
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+// Direct call target; no inline pattern.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi8 :Pseudo< (outs),
+ (ins calltarget:$dst, i32imm:$offset),
+ "#TC_RETURNd8 $dst $offset",
+ []>;
+
+// Absolute call target; the only TCRETURN*8 form with an inline pattern
+// (PPCtc_return on an i64 immediate).
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai8 :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
+ "#TC_RETURNa8 $func $offset",
+ [(PPCtc_return (i64 imm:$func), imm:$offset)]>;
+
+// Target held in a CTRRC8 register operand; no inline pattern.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
+ "#TC_RETURNr8 $dst $offset",
+ []>;
+
+// Indirect tail branch: prints "bctr", reads CTR8 implicitly and is restricted
+// to In64BitMode.
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+ isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in
+def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+ []>,
+ Requires<[In64BitMode]>;
+
+// Tail branch to a calltarget operand; prints "b".
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+ isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB8 : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+ "b $dst", IIC_BrB,
+ []>;
+
+// Tail branch to an abscalltarget operand; prints "ba".
+// NOTE(review): the IForm arguments are identical to TAILB8 (18, 0, 0) -
+// confirm that the absolute-address bit is meant to be clear here.
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+ isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA8 : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
+ "ba $dst", IIC_BrB,
+ []>;
+} // Interpretation64Bit
+
+// Selection patterns for PPCtc_return. A global-address or external-symbol
+// target selects TCRETURNdi8; a CTRRC8 register target selects TCRETURNri8.
+// The immediate operand is forwarded unchanged in all three cases.
+def : Pat<(PPCtc_return (i64 tglobaladdr:$dst), imm:$imm),
+ (TCRETURNdi8 tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
+ (TCRETURNdi8 texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
+ (TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
+
+
+// 64-bit CR instructions
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+let hasSideEffects = 0 in {
+// mtocrf's input needs to be prepared by shifting by an amount dependent
+// on the cr register selected. Thus, post-ra anti-dep breaking must not
+// later change that register assignment.
+let hasExtraDefRegAllocReq = 1 in {
+def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST),
+ "mtocrf $FXM, $ST", IIC_BrMCRX>,
+ PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that
+// is dependent on the cr fields being set.
+def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS),
+ "mtcrf $FXM, $rS", IIC_BrMCRX>,
+ PPC970_MicroCode, PPC970_Unit_CRU;
+} // hasExtraDefRegAllocReq = 1
+
+// mfocrf's input needs to be prepared by shifting by an amount dependent
+// on the cr register selected. Thus, post-ra anti-dep breaking must not
+// later change that register assignment.
+let hasExtraSrcRegAllocReq = 1 in {
+def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
+ "mfocrf $rT, $FXM", IIC_SprMFCRF>,
+ PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that
+// is dependent on the cr fields being copied.
+def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins),
+ "mfcr $rT", IIC_SprMFCR>,
+ PPC970_MicroCode, PPC970_Unit_CRU;
+} // hasExtraSrcRegAllocReq = 1
+} // hasSideEffects = 0
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+ let Defs = [CTR8] in
+ def EH_SjLj_SetJmp64 : Pseudo<(outs gprc:$dst), (ins memr:$buf),
+ "#EH_SJLJ_SETJMP64",
+ [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
+ Requires<[In64BitMode]>;
+ let isTerminator = 1 in
+ def EH_SjLj_LongJmp64 : Pseudo<(outs), (ins memr:$buf),
+ "#EH_SJLJ_LONGJMP64",
+ [(PPCeh_sjlj_longjmp addr:$buf)]>,
+ Requires<[In64BitMode]>;
+}
+
+def MFSPR8 : XFXForm_1<31, 339, (outs g8rc:$RT), (ins i32imm:$SPR),
+ "mfspr $RT, $SPR", IIC_SprMFSPR>;
+def MTSPR8 : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, g8rc:$RT),
+ "mtspr $SPR, $RT", IIC_SprMTSPR>;
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit SPR manipulation instrs.
+
+let Uses = [CTR8] in {
+def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs g8rc:$rT), (ins),
+ "mfctr $rT", IIC_SprMFSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Pattern = [(PPCmtctr i64:$rS)], Defs = [CTR8] in {
+def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
+ "mtctr $rS", IIC_SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let hasSideEffects = 1, Defs = [CTR8] in {
+let Pattern = [(int_ppc_mtctr i64:$rS)] in
+def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
+ "mtctr $rS", IIC_SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Pattern = [(set i64:$rT, readcyclecounter)] in
+def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins),
+ "mfspr $rT, 268", IIC_SprMFTB>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+// Note that encoding mftb using mfspr is now the preferred form,
+// and has been since at least ISA v2.03. The mftb instruction has
+// now been phased out. Using mfspr, however, is known not to work on
+// the POWER3.
+
+let Defs = [X1], Uses = [X1] in
+def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8",
+ [(set i64:$result,
+ (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>;
+def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8",
+ [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
+
+let Defs = [LR8] in {
+def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS),
+ "mtlr $rS", IIC_SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR8] in {
+def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins),
+ "mflr $rT", IIC_SprMFSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+} // Interpretation64Bit
+
+//===----------------------------------------------------------------------===//
+// Fixed point instructions.
+//
+
+let PPC970_Unit = 1 in { // FXU Operations.
+let Interpretation64Bit = 1 in {
+let hasSideEffects = 0 in {
+let isCodeGenOnly = 1 in {
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+def LI8 : DForm_2_r0<14, (outs g8rc:$rD), (ins s16imm64:$imm),
+ "li $rD, $imm", IIC_IntSimple,
+ [(set i64:$rD, imm64SExt16:$imm)]>;
+def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins s17imm64:$imm),
+ "lis $rD, $imm", IIC_IntSimple,
+ [(set i64:$rD, imm16ShiftedSExt:$imm)]>;
+}
+
+// Logical ops.
+let isCommutable = 1 in {
+defm NAND8: XForm_6r<31, 476, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "nand", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i64:$rA, (not (and i64:$rS, i64:$rB)))]>;
+defm AND8 : XForm_6r<31, 28, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "and", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i64:$rA, (and i64:$rS, i64:$rB))]>;
+} // isCommutable
+defm ANDC8: XForm_6r<31, 60, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "andc", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i64:$rA, (and i64:$rS, (not i64:$rB)))]>;
+let isCommutable = 1 in {
+defm OR8 : XForm_6r<31, 444, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "or", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i64:$rA, (or i64:$rS, i64:$rB))]>;
+defm NOR8 : XForm_6r<31, 124, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "nor", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i64:$rA, (not (or i64:$rS, i64:$rB)))]>;
+} // isCommutable
+defm ORC8 : XForm_6r<31, 412, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "orc", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i64:$rA, (or i64:$rS, (not i64:$rB)))]>;
+let isCommutable = 1 in {
+defm EQV8 : XForm_6r<31, 284, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "eqv", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i64:$rA, (not (xor i64:$rS, i64:$rB)))]>;
+defm XOR8 : XForm_6r<31, 316, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "xor", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i64:$rA, (xor i64:$rS, i64:$rB))]>;
+} // let isCommutable = 1
+
+// Logical ops with immediate.
+let Defs = [CR0] in {
+def ANDIo8 : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "andi. $dst, $src1, $src2", IIC_IntGeneral,
+ [(set i64:$dst, (and i64:$src1, immZExt16:$src2))]>,
+ isDOT;
+def ANDISo8 : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "andis. $dst, $src1, $src2", IIC_IntGeneral,
+ [(set i64:$dst, (and i64:$src1, imm16ShiftedZExt:$src2))]>,
+ isDOT;
+}
+def ORI8 : DForm_4<24, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "ori $dst, $src1, $src2", IIC_IntSimple,
+ [(set i64:$dst, (or i64:$src1, immZExt16:$src2))]>;
+def ORIS8 : DForm_4<25, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "oris $dst, $src1, $src2", IIC_IntSimple,
+ [(set i64:$dst, (or i64:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI8 : DForm_4<26, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "xori $dst, $src1, $src2", IIC_IntSimple,
+ [(set i64:$dst, (xor i64:$src1, immZExt16:$src2))]>;
+def XORIS8 : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "xoris $dst, $src1, $src2", IIC_IntSimple,
+ [(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>;
+
+let isCommutable = 1 in
+defm ADD8 : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "add", "$rT, $rA, $rB", IIC_IntSimple,
+ [(set i64:$rT, (add i64:$rA, i64:$rB))]>;
+// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
+// initial-exec thread-local storage model. We need to forbid r0 here -
+// while it works for add just fine, the linker can relax this to local-exec
+// addi, which won't work for r0.
+def ADD8TLS : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc_nox0:$rA, tlsreg:$rB),
+ "add $rT, $rA, $rB", IIC_IntSimple,
+ [(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
+
+let isCommutable = 1 in
+defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "addc", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i64:$rT, (addc i64:$rA, i64:$rB))]>,
+ PPC970_DGroup_Cracked;
+
+let Defs = [CARRY] in
+def ADDIC8 : DForm_2<12, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+ "addic $rD, $rA, $imm", IIC_IntGeneral,
+ [(set i64:$rD, (addc i64:$rA, imm64SExt16:$imm))]>;
+def ADDI8 : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm),
+ "addi $rD, $rA, $imm", IIC_IntSimple,
+ [(set i64:$rD, (add i64:$rA, imm64SExt16:$imm))]>;
+def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm),
+ "addis $rD, $rA, $imm", IIC_IntSimple,
+ [(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>;
+
+let Defs = [CARRY] in {
+def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+ "subfic $rD, $rA, $imm", IIC_IntGeneral,
+ [(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>;
+}
+defm SUBFC8 : XOForm_1rc<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "subfc", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
+ PPC970_DGroup_Cracked;
+defm SUBF8 : XOForm_1r<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "subf", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
+defm NEG8 : XOForm_3r<31, 104, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+ "neg", "$rT, $rA", IIC_IntSimple,
+ [(set i64:$rT, (ineg i64:$rA))]>;
+let Uses = [CARRY] in {
+let isCommutable = 1 in
+defm ADDE8 : XOForm_1rc<31, 138, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "adde", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i64:$rT, (adde i64:$rA, i64:$rB))]>;
+defm ADDME8 : XOForm_3rc<31, 234, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+ "addme", "$rT, $rA", IIC_IntGeneral,
+ [(set i64:$rT, (adde i64:$rA, -1))]>;
+defm ADDZE8 : XOForm_3rc<31, 202, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+ "addze", "$rT, $rA", IIC_IntGeneral,
+ [(set i64:$rT, (adde i64:$rA, 0))]>;
+defm SUBFE8 : XOForm_1rc<31, 136, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "subfe", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i64:$rT, (sube i64:$rB, i64:$rA))]>;
+defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+ "subfme", "$rT, $rA", IIC_IntGeneral,
+ [(set i64:$rT, (sube -1, i64:$rA))]>;
+defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+ "subfze", "$rT, $rA", IIC_IntGeneral,
+ [(set i64:$rT, (sube 0, i64:$rA))]>;
+}
+} // isCodeGenOnly
+
+// FIXME: Duplicating this for the asm parser should be unnecessary, but the
+// previous definition must be marked as CodeGen only to prevent decoding
+// conflicts.
+let isAsmParserOnly = 1 in
+def ADD8TLS_ : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
+ "add $rT, $rA, $rB", IIC_IntSimple, []>;
+
+let isCommutable = 1 in {
+defm MULHD : XOForm_1r<31, 73, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "mulhd", "$rT, $rA, $rB", IIC_IntMulHW,
+ [(set i64:$rT, (mulhs i64:$rA, i64:$rB))]>;
+defm MULHDU : XOForm_1r<31, 9, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "mulhdu", "$rT, $rA, $rB", IIC_IntMulHWU,
+ [(set i64:$rT, (mulhu i64:$rA, i64:$rB))]>;
+} // isCommutable
+}
+} // Interpretation64Bit
+
+// 64-bit integer compares. Every record in this scope is isCompare with
+// hasSideEffects = 0 and produces its result in a crrc or crbitrc output
+// operand; none has a selection pattern attached here.
+let isCompare = 1, hasSideEffects = 0 in {
+ // Register-register forms (cmpd, cmpld).
+ def CMPD : XForm_16_ext<31, 0, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
+ "cmpd $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
+ def CMPLD : XForm_16_ext<31, 32, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
+ "cmpld $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
+ // Register-immediate forms: cmpdi takes an s16imm64 operand, cmpldi a
+ // u16imm64 operand.
+ def CMPDI : DForm_5_ext<11, (outs crrc:$crD), (ins g8rc:$rA, s16imm64:$imm),
+ "cmpdi $crD, $rA, $imm", IIC_IntCompare>, isPPC64;
+ def CMPLDI : DForm_6_ext<10, (outs crrc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "cmpldi $dst, $src1, $src2",
+ IIC_IntCompare>, isPPC64;
+ // cmprb and cmpeqb both require IsISA3_0. The brace-less let below applies
+ // to CMPRB8 only; CMPEQB is not marked isCodeGenOnly.
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+ def CMPRB8 : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
+ (ins u1imm:$L, g8rc:$rA, g8rc:$rB),
+ "cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
+ Requires<[IsISA3_0]>;
+ def CMPEQB : X_BF3_RS5_RS5<31, 224, (outs crbitrc:$BF),
+ (ins g8rc:$rA, g8rc:$rB), "cmpeqb $BF, $rA, $rB",
+ IIC_IntCompare, []>, Requires<[IsISA3_0]>;
+}
+
+let hasSideEffects = 0 in {
+defm SLD : XForm_6r<31, 27, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
+ "sld", "$rA, $rS, $rB", IIC_IntRotateD,
+ [(set i64:$rA, (PPCshl i64:$rS, i32:$rB))]>, isPPC64;
+defm SRD : XForm_6r<31, 539, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
+ "srd", "$rA, $rS, $rB", IIC_IntRotateD,
+ [(set i64:$rA, (PPCsrl i64:$rS, i32:$rB))]>, isPPC64;
+defm SRAD : XForm_6rc<31, 794, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
+ "srad", "$rA, $rS, $rB", IIC_IntRotateD,
+ [(set i64:$rA, (PPCsra i64:$rS, i32:$rB))]>, isPPC64;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+defm CNTLZW8 : XForm_11r<31, 26, (outs g8rc:$rA), (ins g8rc:$rS),
+ "cntlzw", "$rA, $rS", IIC_IntGeneral, []>;
+defm CNTTZW8 : XForm_11r<31, 538, (outs g8rc:$rA), (ins g8rc:$rS),
+ "cnttzw", "$rA, $rS", IIC_IntGeneral, []>,
+ Requires<[IsISA3_0]>;
+
+defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS),
+ "extsb", "$rA, $rS", IIC_IntSimple,
+ [(set i64:$rA, (sext_inreg i64:$rS, i8))]>;
+defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
+ "extsh", "$rA, $rS", IIC_IntSimple,
+ [(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
+
+defm SLW8 : XForm_6r<31, 24, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "slw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
+defm SRW8 : XForm_6r<31, 536, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "srw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
+} // Interpretation64Bit
+
+// For fast-isel:
+// Variants of extsb/extsh that read a 32-bit register class (gprc) and write a
+// 64-bit one (g8rc). They are isCodeGenOnly and have no selection patterns, so
+// they must be emitted explicitly.
+let isCodeGenOnly = 1 in {
+def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS),
+ "extsb $rA, $rS", IIC_IntSimple, []>, isPPC64;
+def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS),
+ "extsh $rA, $rS", IIC_IntSimple, []>, isPPC64;
+} // isCodeGenOnly for fast-isel
+
+defm EXTSW : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS),
+ "extsw", "$rA, $rS", IIC_IntSimple,
+ [(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
+ "extsw", "$rA, $rS", IIC_IntSimple,
+ [(set i64:$rA, (sext i32:$rS))]>, isPPC64;
+
+defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
+ "sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
+ [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
+defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
+ "cntlzd", "$rA, $rS", IIC_IntGeneral,
+ [(set i64:$rA, (ctlz i64:$rS))]>;
+defm CNTTZD : XForm_11r<31, 570, (outs g8rc:$rA), (ins g8rc:$rS),
+ "cnttzd", "$rA, $rS", IIC_IntGeneral,
+ [(set i64:$rA, (cttz i64:$rS))]>, Requires<[IsISA3_0]>;
+def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
+ "popcntd $rA, $rS", IIC_IntGeneral,
+ [(set i64:$rA, (ctpop i64:$rS))]>;
+def BPERMD : XForm_6<31, 252, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "bpermd $rA, $rS, $rB", IIC_IntGeneral,
+ [(set i64:$rA, (int_ppc_bpermd g8rc:$rS, g8rc:$rB))]>,
+ isPPC64, Requires<[HasBPERMD]>;
+
+let isCodeGenOnly = 1, isCommutable = 1 in
+def CMPB8 : XForm_6<31, 508, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "cmpb $rA, $rS, $rB", IIC_IntGeneral,
+ [(set i64:$rA, (PPCcmpb i64:$rS, i64:$rB))]>;
+
+// popcntw also does a population count on the high 32 bits (storing the
+// results in the high 32-bits of the output). We'll ignore that here (which is
+// safe because we never separately use the high part of the 64-bit registers).
+def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
+ "popcntw $rA, $rS", IIC_IntGeneral,
+ [(set i32:$rA, (ctpop i32:$rS))]>;
+
+defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "divd", "$rT, $rA, $rB", IIC_IntDivD,
+ [(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64;
+defm DIVDU : XOForm_1rcr<31, 457, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "divdu", "$rT, $rA, $rB", IIC_IntDivD,
+ [(set i64:$rT, (udiv i64:$rA, i64:$rB))]>, isPPC64;
+def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "divde $rT, $rA, $rB", IIC_IntDivD,
+ [(set i64:$rT, (int_ppc_divde g8rc:$rA, g8rc:$rB))]>,
+ isPPC64, Requires<[HasExtDiv]>;
+let Defs = [CR0] in
+def DIVDEo : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "divde. $rT, $rA, $rB", IIC_IntDivD,
+ []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
+ isPPC64, Requires<[HasExtDiv]>;
+def DIVDEU : XOForm_1<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "divdeu $rT, $rA, $rB", IIC_IntDivD,
+ [(set i64:$rT, (int_ppc_divdeu g8rc:$rA, g8rc:$rB))]>,
+ isPPC64, Requires<[HasExtDiv]>;
+let Defs = [CR0] in
+def DIVDEUo : XOForm_1<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "divdeu. $rT, $rA, $rB", IIC_IntDivD,
+ []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
+ isPPC64, Requires<[HasExtDiv]>;
+let isCommutable = 1 in
+defm MULLD : XOForm_1r<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "mulld", "$rT, $rA, $rB", IIC_IntMulHD,
+ [(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+ "mulli $rD, $rA, $imm", IIC_IntMulLI,
+ [(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>;
+}
+
+let hasSideEffects = 0 in {
+defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA),
+ (ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64, RegConstraint<"$rSi = $rA">,
+ NoEncode<"$rSi">;
+
+// Rotate instructions.
+defm RLDCL : MDSForm_1r<30, 8,
+ (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
+ "rldcl", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
+ []>, isPPC64;
+defm RLDCR : MDSForm_1r<30, 9,
+ (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
+ "rldcr", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
+ []>, isPPC64;
+defm RLDICL : MDForm_1r<30, 0,
+ (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def RLDICL_32_64 : MDForm_1<30, 0,
+ (outs g8rc:$rA),
+ (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
+// End fast-isel.
+defm RLDICR : MDForm_1r<30, 1,
+ (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
+defm RLDIC : MDForm_1r<30, 2,
+ (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA),
+ (ins g8rc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+ "rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
+ []>;
+
+defm RLWNM8 : MForm_2r<23, (outs g8rc:$rA),
+ (ins g8rc:$rS, g8rc:$rB, u5imm:$MB, u5imm:$ME),
+ "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
+ []>;
+
+// RLWIMI can be commuted if the rotate amount is zero.
+// $rSi is tied to the result $rA and excluded from the encoding. The enclosing
+// let scope already sets Interpretation64Bit and isCodeGenOnly, so they are
+// not repeated on this definition.
+defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
+ (ins g8rc:$rSi, g8rc:$rS, u5imm:$SH, u5imm:$MB,
+ u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
+ IIC_IntRotate, []>, PPC970_DGroup_Cracked,
+ RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
+
+let isSelect = 1 in
+def ISEL8 : AForm_4<31, 15,
+ (outs g8rc:$rT), (ins g8rc_nox0:$rA, g8rc:$rB, crbitrc:$cond),
+ "isel $rT, $rA, $rB, $cond", IIC_IntISEL,
+ []>;
+} // Interpretation64Bit
+} // hasSideEffects = 0
+} // End FXU Operations.
+
+
+//===----------------------------------------------------------------------===//
+// Load/Store instructions.
+//
+
+
+// Sign extending loads.
+let PPC970_Unit = 2 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src),
+ "lha $rD, $src", IIC_LdStLHA,
+ [(set i64:$rD, (sextloadi16 iaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
+ "lwa $rD, $src", IIC_LdStLWA,
+ [(set i64:$rD,
+ (aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
+ PPC970_DGroup_Cracked;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHAX8: XForm_1<31, 343, (outs g8rc:$rD), (ins memrr:$src),
+ "lhax $rD, $src", IIC_LdStLHA,
+ [(set i64:$rD, (sextloadi16 xaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
+ "lwax $rD, $src", IIC_LdStLHA,
+ [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+ PPC970_DGroup_Cracked;
+// For fast-isel:
+let isCodeGenOnly = 1, mayLoad = 1 in {
+def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
+ "lwa $rD, $src", IIC_LdStLWA, []>, isPPC64,
+ PPC970_DGroup_Cracked;
+def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
+ "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64,
+ PPC970_DGroup_Cracked;
+} // end fast-isel isCodeGenOnly
+
+// Update forms.
+let mayLoad = 1, hasSideEffects = 0 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memri:$addr),
+ "lhau $rD, $addr", IIC_LdStLHAU,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+// NO LWAU!
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHAUX8 : XForm_1<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lhaux $rD, $addr", IIC_LdStLHAUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lwaux $rD, $addr", IIC_LdStLHAUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">, isPPC64;
+}
+}
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+// Zero extending loads.
+let PPC970_Unit = 2 in {
+def LBZ8 : DForm_1<34, (outs g8rc:$rD), (ins memri:$src),
+ "lbz $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi8 iaddr:$src))]>;
+def LHZ8 : DForm_1<40, (outs g8rc:$rD), (ins memri:$src),
+ "lhz $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src),
+ "lwz $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
+
+def LBZX8 : XForm_1<31, 87, (outs g8rc:$rD), (ins memrr:$src),
+ "lbzx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1<31, 279, (outs g8rc:$rD), (ins memrr:$src),
+ "lhzx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1<31, 23, (outs g8rc:$rD), (ins memrr:$src),
+ "lwzx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi32 xaddr:$src))]>;
+
+
+// Update forms.
+let mayLoad = 1, hasSideEffects = 0 in {
+def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+ "lbzu $rD, $addr", IIC_LdStLoadUpd,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+ "lhzu $rD, $addr", IIC_LdStLoadUpd,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+ "lwzu $rD, $addr", IIC_LdStLoadUpd,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LBZUX8 : XForm_1<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lbzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LHZUX8 : XForm_1<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lhzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lwzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+}
+}
+} // Interpretation64Bit
+
+
+// Full 8-byte loads.
+let PPC970_Unit = 2 in {
+def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
+ "ld $rD, $src", IIC_LdStLD,
+ [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64;
+// The following four definitions are selected for small code model only.
+// Otherwise, we need to create two instructions to form a 32-bit offset,
+// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select().
+def LDtoc: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+ "#LDtoc",
+ [(set i64:$rD,
+ (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64;
+def LDtocJTI: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+ "#LDtocJTI",
+ [(set i64:$rD,
+ (PPCtoc_entry tjumptable:$disp, i64:$reg))]>, isPPC64;
+def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+ "#LDtocCPT",
+ [(set i64:$rD,
+ (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64;
+// TOC-entry load pseudo for a block-address (tblockaddress) displacement. It
+// prints its own placeholder name so that it can be told apart from LDtocCPT
+// in machine-instruction dumps.
+def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+ "#LDtocBA",
+ [(set i64:$rD,
+ (PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64;
+
+def LDX : XForm_1<31, 21, (outs g8rc:$rD), (ins memrr:$src),
+ "ldx $rD, $src", IIC_LdStLD,
+ [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
+def LDBRX : XForm_1<31, 532, (outs g8rc:$rD), (ins memrr:$src),
+ "ldbrx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
+
+let mayLoad = 1, hasSideEffects = 0, isCodeGenOnly = 1 in {
+def LHBRX8 : XForm_1<31, 790, (outs g8rc:$rD), (ins memrr:$src),
+ "lhbrx $rD, $src", IIC_LdStLoad, []>;
+def LWBRX8 : XForm_1<31, 534, (outs g8rc:$rD), (ins memrr:$src),
+ "lwbrx $rD, $src", IIC_LdStLoad, []>;
+}
+
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDU : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
+ "ldu $rD, $addr", IIC_LdStLDU,
+ []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
+ NoEncode<"$ea_result">;
+
+def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "ldux $rD, $addr", IIC_LdStLDUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">, isPPC64;
+
+def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src),
+ "ldmx $rD, $src", IIC_LdStLD, []>, isPPC64,
+ Requires<[IsISA3_0]>;
+}
+}
+
+// Support for medium and large code model.
+let hasSideEffects = 0 in {
+def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+ "#ADDIStocHA", []>, isPPC64;
+let mayLoad = 1 in
+def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
+ "#LDtocL", []>, isPPC64;
+def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+ "#ADDItocL", []>, isPPC64;
+}
+
+// Support for thread-local storage.
+def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+ "#ADDISgotTprelHA",
+ [(set i64:$rD,
+ (PPCaddisGotTprelHA i64:$reg,
+ tglobaltlsaddr:$disp))]>,
+ isPPC64;
+def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
+ "#LDgotTprelL",
+ [(set i64:$rD,
+ (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
+ isPPC64;
+def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
+ (ADD8TLS $in, tglobaltlsaddr:$g)>;
+def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+ "#ADDIStlsgdHA",
+ [(set i64:$rD,
+ (PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>,
+ isPPC64;
+def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+ "#ADDItlsgdL",
+ [(set i64:$rD,
+ (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
+ isPPC64;
+// LR8 is a true define, while the rest of the Defs are clobbers. X3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+ Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+ "#GETtlsADDR",
+ [(set i64:$rD,
+ (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+ isPPC64;
+// Combined op for ADDItlsgdL and GETtlsADDR, late expanded. X3 and LR8
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+ Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
+ in
+def ADDItlsgdLADDR : Pseudo<(outs g8rc:$rD),
+ (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
+ "#ADDItlsgdLADDR",
+ [(set i64:$rD,
+ (PPCaddiTlsgdLAddr i64:$reg,
+ tglobaltlsaddr:$disp,
+ tglobaltlsaddr:$sym))]>,
+ isPPC64;
+def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+ "#ADDIStlsldHA",
+ [(set i64:$rD,
+ (PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>,
+ isPPC64;
+def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+ "#ADDItlsldL",
+ [(set i64:$rD,
+ (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
+ isPPC64;
+// LR8 is a true define, while the rest of the Defs are clobbers. X3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+ Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+ "#GETtlsldADDR",
+ [(set i64:$rD,
+ (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+ isPPC64;
+// Combined op for ADDItlsldL and GETtlsldADDR, late expanded. X3 and LR8
+// are true defines, while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+ Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
+ in
+def ADDItlsldLADDR : Pseudo<(outs g8rc:$rD),
+ (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
+ "#ADDItlsldLADDR",
+ [(set i64:$rD,
+ (PPCaddiTlsldLAddr i64:$reg,
+ tglobaltlsaddr:$disp,
+ tglobaltlsaddr:$sym))]>,
+ isPPC64;
+def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+ "#ADDISdtprelHA",
+ [(set i64:$rD,
+ (PPCaddisDtprelHA i64:$reg,
+ tglobaltlsaddr:$disp))]>,
+ isPPC64;
+def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+ "#ADDIdtprelL",
+ [(set i64:$rD,
+ (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
+ isPPC64;
+
+let PPC970_Unit = 2 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+// Truncating stores.
+def STB8 : DForm_1<38, (outs), (ins g8rc:$rS, memri:$src),
+ "stb $rS, $src", IIC_LdStStore,
+ [(truncstorei8 i64:$rS, iaddr:$src)]>;
+def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src),
+ "sth $rS, $src", IIC_LdStStore,
+ [(truncstorei16 i64:$rS, iaddr:$src)]>;
+def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src),
+ "stw $rS, $src", IIC_LdStStore,
+ [(truncstorei32 i64:$rS, iaddr:$src)]>;
+def STBX8 : XForm_8<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stbx $rS, $dst", IIC_LdStStore,
+ [(truncstorei8 i64:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STHX8 : XForm_8<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
+ "sthx $rS, $dst", IIC_LdStStore,
+ [(truncstorei16 i64:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STWX8 : XForm_8<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stwx $rS, $dst", IIC_LdStStore,
+ [(truncstorei32 i64:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+} // Interpretation64Bit
+
+// Normal 8-byte stores.
+def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
+ "std $rS, $dst", IIC_LdStSTD,
+ [(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64;
+def STDX : XForm_8<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stdx $rS, $dst", IIC_LdStSTD,
+ [(store i64:$rS, xaddr:$dst)]>, isPPC64,
+ PPC970_DGroup_Cracked;
+def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stdbrx $rS, $dst", IIC_LdStStore,
+ [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
+ PPC970_DGroup_Cracked;
+}
+
+// Stores with Update (pre-inc).
+let PPC970_Unit = 2, mayStore = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
+ "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
+ "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
+ "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+
+def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+ "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+ "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+ "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+} // Interpretation64Bit
+
+def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst),
+ "stdu $rS, $dst", IIC_LdStSTDU, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
+ isPPC64;
+
+def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+ "stdux $rS, $dst", IIC_LdStSTDUX, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked, isPPC64;
+}
+
+// Patterns to match the pre-inc stores. We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+ (STBU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+ (STHU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+ (STWU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(aligned4pre_store i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+ (STDU $rS, iaddroff:$ptroff, $ptrreg)>;
+
+def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (STBUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (STHUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (STWUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (STDUX $rS, $ptrreg, $ptroff)>;
+
+
+//===----------------------------------------------------------------------===//
+// Floating point instructions.
+//
+
+
+let PPC970_Unit = 3, hasSideEffects = 0,
+ Uses = [RM] in { // FPU Operations.
+defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fcfid", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
+defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctid", "$frD, $frB", IIC_FPGeneral,
+ []>, isPPC64;
+defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctidz", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
+
+defm FCFIDU : XForm_26r<63, 974, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fcfidu", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64;
+defm FCFIDS : XForm_26r<59, 846, (outs f4rc:$frD), (ins f8rc:$frB),
+ "fcfids", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64;
+defm FCFIDUS : XForm_26r<59, 974, (outs f4rc:$frD), (ins f8rc:$frB),
+ "fcfidus", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64;
+defm FCTIDUZ : XForm_26r<63, 943, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctiduz", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64;
+defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctiwuz", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Patterns
+//
+
+// Extensions and truncates to/from 32-bit regs.
+def : Pat<(i64 (zext i32:$in)),
+ (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
+ 0, 32)>;
+def : Pat<(i64 (anyext i32:$in)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32)>;
+def : Pat<(i32 (trunc i64:$in)),
+ (EXTRACT_SUBREG $in, sub_32)>;
+
+// Implement the 'not' operation with the NOR instruction.
+// (we could use the default xori pattern, but nor has lower latency on some
+// cores (such as the A2)).
+def i64not : OutPatFrag<(ops node:$in),
+ (NOR8 $in, $in)>;
+def : Pat<(not i64:$in),
+ (i64not $in)>;
+
+// Extending loads with i64 targets.
+def : Pat<(zextloadi1 iaddr:$src),
+ (LBZ8 iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+ (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+ (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+ (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+ (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+ (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+ (LHZ8 iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+ (LHZX8 xaddr:$src)>;
+def : Pat<(extloadi32 iaddr:$src),
+ (LWZ8 iaddr:$src)>;
+def : Pat<(extloadi32 xaddr:$src),
+ (LWZX8 xaddr:$src)>;
+
+// Standard shifts. These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 6-bit and 7-bit shift
+// amounts.
+def : Pat<(sra i64:$rS, i32:$rB),
+ (SRAD $rS, $rB)>;
+def : Pat<(srl i64:$rS, i32:$rB),
+ (SRD $rS, $rB)>;
+def : Pat<(shl i64:$rS, i32:$rB),
+ (SLD $rS, $rB)>;
+
+// SHL/SRL
+def : Pat<(shl i64:$in, (i32 imm:$imm)),
+ (RLDICR $in, imm:$imm, (SHL64 imm:$imm))>;
+def : Pat<(srl i64:$in, (i32 imm:$imm)),
+ (RLDICL $in, (SRL64 imm:$imm), imm:$imm)>;
+
+// ROTL
+def : Pat<(rotl i64:$in, i32:$sh),
+ (RLDCL $in, $sh, 0)>;
+def : Pat<(rotl i64:$in, (i32 imm:$imm)),
+ (RLDICL $in, imm:$imm, 0)>;
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI8 tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in , 0), (LI8 tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in , 0), (LI8 tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI8 tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, i64:$in),
+ (ADDIS8 $in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, i64:$in),
+ (ADDI8 $in, tglobaltlsaddr:$g)>;
+def : Pat<(add i64:$in, (PPChi tglobaladdr:$g, 0)),
+ (ADDIS8 $in, tglobaladdr:$g)>;
+def : Pat<(add i64:$in, (PPChi tconstpool:$g, 0)),
+ (ADDIS8 $in, tconstpool:$g)>;
+def : Pat<(add i64:$in, (PPChi tjumptable:$g, 0)),
+ (ADDIS8 $in, tjumptable:$g)>;
+def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)),
+ (ADDIS8 $in, tblockaddress:$g)>;
+
+// Patterns to match r+r indexed loads and stores for
+// addresses without at least 4-byte alignment.
+def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)),
+ (LWAX xoaddr:$src)>;
+def : Pat<(i64 (unaligned4load xoaddr:$src)),
+ (LDX xoaddr:$src)>;
+def : Pat<(unaligned4store i64:$rS, xoaddr:$dst),
+ (STDX $rS, xoaddr:$dst)>;
+
+// 64-bit atomic loads and stores
+def : Pat<(atomic_load_64 ixaddr:$src), (LD memrix:$src)>;
+def : Pat<(atomic_load_64 xaddr:$src), (LDX memrr:$src)>;
+
+def : Pat<(atomic_store_64 ixaddr:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>;
+def : Pat<(atomic_store_64 xaddr:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>;
+
+let Predicates = [IsISA3_0] in {
+
+class X_L1_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty,
+ InstrItinClass itin, list<dag> pattern>
+ : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$rA, ty:$rB, u1imm:$L),
+ !strconcat(opc, " $rA, $rB, $L"), itin, pattern>;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+def CP_COPY8 : X_L1_RA5_RB5<31, 774, "copy" , g8rc, IIC_LdStCOPY, []>;
+def CP_PASTE8 : X_L1_RA5_RB5<31, 902, "paste" , g8rc, IIC_LdStPASTE, []>;
+def CP_PASTE8o : X_L1_RA5_RB5<31, 902, "paste.", g8rc, IIC_LdStPASTE, []>,isDOT;
+}
+
+// SLB Invalidate Entry Global
+def SLBIEG : XForm_26<31, 466, (outs), (ins gprc:$RS, gprc:$RB),
+ "slbieg $RS, $RB", IIC_SprSLBIEG, []>;
+// SLB Synchronize
+def SLBSYNC : XForm_0<31, 338, (outs), (ins), "slbsync", IIC_SprSLBSYNC, []>;
+
+} // IsISA3_0
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
new file mode 100644
index 000000000000..5c022749ad64
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -0,0 +1,1451 @@
+//===-- PPCInstrAltivec.td - The PowerPC Altivec Extension -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Altivec extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// *********************************** NOTE ***********************************
+// ** For POWER8 Little Endian, the VSX swap optimization relies on knowing **
+// ** which VMX and VSX instructions are lane-sensitive and which are not. **
+// ** A lane-sensitive instruction relies, implicitly or explicitly, on **
+// ** whether lanes are numbered from left to right. An instruction like **
+// ** VADDFP is not lane-sensitive, because each lane of the result vector **
+// ** relies only on the corresponding lane of the source vectors. However, **
+// ** an instruction like VMULESB is lane-sensitive, because "even" and **
+// ** "odd" lanes are different for big-endian and little-endian numbering. **
+// ** **
+// ** When adding new VMX and VSX instructions, please consider whether they **
+// ** are lane-sensitive. If so, they must be added to a switch statement **
+// ** in PPCVSXSwapRemoval::gatherVectorInstructions(). **
+// ****************************************************************************
+
+
+//===----------------------------------------------------------------------===//
+// Altivec transformation functions and pattern fragments.
+//
+
+// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be
+// of that type.
+def vnot_ppc : PatFrag<(ops node:$in),
+ (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>;
+
+def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
+}]>;
+def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
+}]>;
+def vpkudum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
+}]>;
+def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
+}]>;
+def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
+}]>;
+def vpkudum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
+}]>;
+
+// These fragments are provided for little-endian, where the inputs must be
+// swapped for correct semantics.
+def vpkuhum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
+def vpkuwum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
+def vpkudum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
+
+def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
+}]>;
+def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
+}]>;
+def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
+}]>;
+def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
+}]>;
+def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
+}]>;
+def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
+}]>;
+
+
+def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
+}]>;
+def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
+}]>;
+def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
+}]>;
+def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
+}]>;
+def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
+}]>;
+def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
+}]>;
+
+
+// These fragments are provided for little-endian, where the inputs must be
+// swapped for correct semantics.
+def vmrglb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
+}]>;
+def vmrglh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
+}]>;
+def vmrglw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
+}]>;
+def vmrghb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
+}]>;
+def vmrghh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
+}]>;
+def vmrghw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
+}]>;
+
+
+def vmrgew_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 0, *CurDAG);
+}]>;
+def vmrgow_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 0, *CurDAG);
+}]>;
+def vmrgew_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 1, *CurDAG);
+}]>;
+def vmrgow_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 1, *CurDAG);
+}]>;
+def vmrgew_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 2, *CurDAG);
+}]>;
+def vmrgow_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 2, *CurDAG);
+}]>;
+
+
+
+def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::isVSLDOIShuffleMask(N, 0, *CurDAG), SDLoc(N));
+}]>;
+def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVSLDOIShuffleMask(N, 0, *CurDAG) != -1;
+}], VSLDOI_get_imm>;
+
+
+/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
+/// vector_shuffle(X,undef,mask) by the dag combiner.
+def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::isVSLDOIShuffleMask(N, 1, *CurDAG), SDLoc(N));
+}]>;
+def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVSLDOIShuffleMask(N, 1, *CurDAG) != -1;
+}], VSLDOI_unary_get_imm>;
+
+
+/// VSLDOI_swapped* - Provided for little-endian, where the inputs must be
+/// swapped. Predicate and immediate xform both call isVSLDOIShuffleMask(N, 2).
+def VSLDOI_swapped_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::isVSLDOIShuffleMask(N, 2, *CurDAG), SDLoc(N));
+}]>;
+def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVSLDOIShuffleMask(N, 2, *CurDAG) != -1;
+}], VSLDOI_swapped_get_imm>;
+
+
+// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
+def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG), SDLoc(N));
+}]>;
+def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
+}], VSPLTB_get_imm>;
+def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG), SDLoc(N));
+}]>;
+def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
+}], VSPLTH_get_imm>;
+def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG), SDLoc(N));
+}]>;
+def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 4);
+}], VSPLTW_get_imm>;
+
+
+// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm.
+def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{
+ return PPC::get_VSPLTI_elt(N, 1, *CurDAG);
+}]>;
+def vecspltisb : PatLeaf<(build_vector), [{
+ return PPC::get_VSPLTI_elt(N, 1, *CurDAG).getNode() != nullptr;
+}], VSPLTISB_get_imm>;
+
+// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm.
+def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{
+ return PPC::get_VSPLTI_elt(N, 2, *CurDAG);
+}]>;
+def vecspltish : PatLeaf<(build_vector), [{
+ return PPC::get_VSPLTI_elt(N, 2, *CurDAG).getNode() != nullptr;
+}], VSPLTISH_get_imm>;
+
+// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm.
+def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{
+ return PPC::get_VSPLTI_elt(N, 4, *CurDAG);
+}]>;
+def vecspltisw : PatLeaf<(build_vector), [{
+ return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != nullptr;
+}], VSPLTISW_get_imm>;
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// VA1a_Int_Ty - A VAForm_1a intrinsic definition of specific type.
+class VA1a_Int_Ty<bits<6> xo, string opc, Intrinsic IntID, ValueType Ty>
+ : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+ !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
+ [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB, Ty:$vC))]>;
+
+// VA1a_Int_Ty2 - A VAForm_1a intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VA1a_Int_Ty2<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
+ ValueType InTy>
+ : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+ !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
+ [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB, InTy:$vC))]>;
+
+// VA1a_Int_Ty3 - A VAForm_1a intrinsic definition where there are two
+// input types and an output type.
+class VA1a_Int_Ty3<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
+ ValueType In1Ty, ValueType In2Ty>
+ : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+ !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
+ [(set OutTy:$vD,
+ (IntID In1Ty:$vA, In1Ty:$vB, In2Ty:$vC))]>;
+
+// VX1_Int_Ty - A VXForm_1 intrinsic definition of specific type.
+class VX1_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
+ [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB))]>;
+
+// VX1_Int_Ty2 - A VXForm_1 intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VX1_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+ ValueType InTy>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
+ [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB))]>;
+
+// VX1_Int_Ty3 - A VXForm_1 intrinsic definition where there are two
+// input types and an output type.
+class VX1_Int_Ty3<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+ ValueType In1Ty, ValueType In2Ty>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
+ [(set OutTy:$vD, (IntID In1Ty:$vA, In2Ty:$vB))]>;
+
+// VX2_Int_SP - A VXForm_2 intrinsic definition of vector single-precision type.
+class VX2_Int_SP<bits<11> xo, string opc, Intrinsic IntID>
+ : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
+ !strconcat(opc, " $vD, $vB"), IIC_VecFP,
+ [(set v4f32:$vD, (IntID v4f32:$vB))]>;
+
+// VX2_Int_Ty2 - A VXForm_2 intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+ ValueType InTy>
+ : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
+ !strconcat(opc, " $vD, $vB"), IIC_VecFP,
+ [(set OutTy:$vD, (IntID InTy:$vB))]>;
+
+class VXBX_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
+ : VXForm_BX<xo, (outs vrrc:$vD), (ins vrrc:$vA),
+ !strconcat(opc, " $vD, $vA"), IIC_VecFP,
+ [(set Ty:$vD, (IntID Ty:$vA))]>;
+
+class VXCR_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
+ : VXForm_CR<xo, (outs vrrc:$vD), (ins vrrc:$vA, u1imm:$ST, u4imm:$SIX),
+ !strconcat(opc, " $vD, $vA, $ST, $SIX"), IIC_VecFP,
+ [(set Ty:$vD, (IntID Ty:$vA, imm:$ST, imm:$SIX))]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">;
+let Predicates = [HasAltivec] in {
+
+def DSS : DSS_Form<0, 822, (outs), (ins u5imm:$STRM),
+ "dss $STRM", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dss imm:$STRM)]>,
+ Deprecated<DeprecatedDST> {
+ let A = 0;
+ let B = 0;
+}
+
+def DSSALL : DSS_Form<1, 822, (outs), (ins),
+ "dssall", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dssall)]>,
+ Deprecated<DeprecatedDST> {
+ let STRM = 0;
+ let A = 0;
+ let B = 0;
+}
+
+def DST : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+ "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+ [(int_ppc_altivec_dst i32:$rA, i32:$rB, imm:$STRM)]>,
+ Deprecated<DeprecatedDST>;
+
+def DSTT : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+ "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+ [(int_ppc_altivec_dstt i32:$rA, i32:$rB, imm:$STRM)]>,
+ Deprecated<DeprecatedDST>;
+
+def DSTST : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+ "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+ [(int_ppc_altivec_dstst i32:$rA, i32:$rB, imm:$STRM)]>,
+ Deprecated<DeprecatedDST>;
+
+def DSTSTT : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+ "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+ [(int_ppc_altivec_dststt i32:$rA, i32:$rB, imm:$STRM)]>,
+ Deprecated<DeprecatedDST>;
+
+let isCodeGenOnly = 1 in {
+ // The very same instructions as above, but formally matching 64bit registers.
+ def DST64 : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+ "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+ [(int_ppc_altivec_dst i64:$rA, i32:$rB, imm:$STRM)]>,
+ Deprecated<DeprecatedDST>;
+
+ def DSTT64 : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+ "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+ [(int_ppc_altivec_dstt i64:$rA, i32:$rB, imm:$STRM)]>,
+ Deprecated<DeprecatedDST>;
+
+ def DSTST64 : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+ "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+ [(int_ppc_altivec_dstst i64:$rA, i32:$rB,
+ imm:$STRM)]>,
+ Deprecated<DeprecatedDST>;
+
+ def DSTSTT64 : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+ "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+ [(int_ppc_altivec_dststt i64:$rA, i32:$rB,
+ imm:$STRM)]>,
+ Deprecated<DeprecatedDST>;
+}
+
+def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
+ "mfvscr $vD", IIC_LdStStore,
+ [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>;
+def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
+ "mtvscr $vB", IIC_LdStLoad,
+ [(int_ppc_altivec_mtvscr v4i32:$vB)]>;
+
+let PPC970_Unit = 2 in { // Loads.
+def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src),
+ "lvebx $vD, $src", IIC_LdStLoad,
+ [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
+def LVEHX: XForm_1<31, 39, (outs vrrc:$vD), (ins memrr:$src),
+ "lvehx $vD, $src", IIC_LdStLoad,
+ [(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
+def LVEWX: XForm_1<31, 71, (outs vrrc:$vD), (ins memrr:$src),
+ "lvewx $vD, $src", IIC_LdStLoad,
+ [(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
+def LVX : XForm_1<31, 103, (outs vrrc:$vD), (ins memrr:$src),
+ "lvx $vD, $src", IIC_LdStLoad,
+ [(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
+def LVXL : XForm_1<31, 359, (outs vrrc:$vD), (ins memrr:$src),
+ "lvxl $vD, $src", IIC_LdStLoad,
+ [(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
+}
+
+def LVSL : XForm_1<31, 6, (outs vrrc:$vD), (ins memrr:$src),
+ "lvsl $vD, $src", IIC_LdStLoad,
+ [(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
+ PPC970_Unit_LSU;
+def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src),
+ "lvsr $vD, $src", IIC_LdStLoad,
+ [(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
+ PPC970_Unit_LSU;
+
+let PPC970_Unit = 2 in { // Stores.
+def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
+ "stvebx $rS, $dst", IIC_LdStStore,
+ [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
+def STVEHX: XForm_8<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
+ "stvehx $rS, $dst", IIC_LdStStore,
+ [(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>;
+def STVEWX: XForm_8<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
+ "stvewx $rS, $dst", IIC_LdStStore,
+ [(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>;
+def STVX : XForm_8<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
+ "stvx $rS, $dst", IIC_LdStStore,
+ [(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>;
+def STVXL : XForm_8<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
+ "stvxl $rS, $dst", IIC_LdStStore,
+ [(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>;
+}
+
+let PPC970_Unit = 5 in { // VALU Operations.
+// VA-Form instructions. 3-input AltiVec ops.
+let isCommutable = 1 in {
+def VMADDFP : VAForm_1<46, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
+ "vmaddfp $vD, $vA, $vC, $vB", IIC_VecFP,
+ [(set v4f32:$vD,
+ (fma v4f32:$vA, v4f32:$vC, v4f32:$vB))]>;
+
+// FIXME: The fma+fneg pattern won't match because fneg is not legal.
+def VNMSUBFP: VAForm_1<47, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
+ "vnmsubfp $vD, $vA, $vC, $vB", IIC_VecFP,
+ [(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC,
+ (fneg v4f32:$vB))))]>;
+
+def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>;
+def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs,
+ v8i16>;
+def VMLADDUHM : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>;
+} // isCommutable
+
+def VPERM : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm,
+ v4i32, v4i32, v16i8>;
+def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>;
+
+// Shuffles.
+def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH),
+ "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP,
+ [(set v16i8:$vD,
+ (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>;
+
+// VX-Form instructions. AltiVec arithmetic ops.
+// Commutable additions. VADDFP/VADDUBM/VADDUHM/VADDUWM are selected from the
+// generic fadd/add nodes on v4f32/v16i8/v8i16/v4i32; the VX1_Int_Ty defs that
+// follow are selected only from their intrinsics.
+let isCommutable = 1 in {
+def VADDFP : VXForm_1<10, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vaddfp $vD, $vA, $vB", IIC_VecFP,
+ [(set v4f32:$vD, (fadd v4f32:$vA, v4f32:$vB))]>;
+
+def VADDUBM : VXForm_1<0, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vaddubm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v16i8:$vD, (add v16i8:$vA, v16i8:$vB))]>;
+def VADDUHM : VXForm_1<64, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vadduhm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v8i16:$vD, (add v8i16:$vA, v8i16:$vB))]>;
+def VADDUWM : VXForm_1<128, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vadduwm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (add v4i32:$vA, v4i32:$vB))]>;
+
+def VADDCUW : VX1_Int_Ty<384, "vaddcuw", int_ppc_altivec_vaddcuw, v4i32>;
+def VADDSBS : VX1_Int_Ty<768, "vaddsbs", int_ppc_altivec_vaddsbs, v16i8>;
+def VADDSHS : VX1_Int_Ty<832, "vaddshs", int_ppc_altivec_vaddshs, v8i16>;
+def VADDSWS : VX1_Int_Ty<896, "vaddsws", int_ppc_altivec_vaddsws, v4i32>;
+def VADDUBS : VX1_Int_Ty<512, "vaddubs", int_ppc_altivec_vaddubs, v16i8>;
+def VADDUHS : VX1_Int_Ty<576, "vadduhs", int_ppc_altivec_vadduhs, v8i16>;
+def VADDUWS : VX1_Int_Ty<640, "vadduws", int_ppc_altivec_vadduws, v4i32>;
+} // isCommutable
+
+// Bitwise AND and AND-with-complement, matched on v4i32. Only VAND is marked
+// commutable: the braceless "let" applies to the single def that follows it,
+// so VANDC (whose second operand is inverted) is not affected.
+let isCommutable = 1 in
+def VAND : VXForm_1<1028, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vand $vD, $vA, $vB", IIC_VecFP,
+ [(set v4i32:$vD, (and v4i32:$vA, v4i32:$vB))]>;
+def VANDC : VXForm_1<1092, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vandc $vD, $vA, $vB", IIC_VecFP,
+ [(set v4i32:$vD, (and v4i32:$vA,
+ (vnot_ppc v4i32:$vB)))]>;
+
+// Integer <-> float conversions taking a u5imm operand, selected from the
+// corresponding intrinsics. VCFSX/VCFUX produce v4f32 from v4i32; VCTSXS/VCTUXS
+// produce v4i32 from v4f32. The immediate is first in the ins list but is
+// printed last in the asm string.
+def VCFSX : VXForm_1<842, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+ "vcfsx $vD, $vB, $UIMM", IIC_VecFP,
+ [(set v4f32:$vD,
+ (int_ppc_altivec_vcfsx v4i32:$vB, imm:$UIMM))]>;
+def VCFUX : VXForm_1<778, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+ "vcfux $vD, $vB, $UIMM", IIC_VecFP,
+ [(set v4f32:$vD,
+ (int_ppc_altivec_vcfux v4i32:$vB, imm:$UIMM))]>;
+def VCTSXS : VXForm_1<970, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+ "vctsxs $vD, $vB, $UIMM", IIC_VecFP,
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vctsxs v4f32:$vB, imm:$UIMM))]>;
+def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+ "vctuxs $vD, $vB, $UIMM", IIC_VecFP,
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vctuxs v4f32:$vB, imm:$UIMM))]>;
+
+// Defines with the UIM field set to 0 for floating-point
+// to integer (fp_to_sint/fp_to_uint) conversions and integer
+// to floating-point (sint_to_fp/uint_to_fp) conversions.
+// Codegen-only variants of the four conversions: they reuse the same opcodes
+// but take no immediate operand. The VA field is forced to 0 and the asm
+// string hard-codes the trailing "0"; the patterns match the intrinsic with a
+// literal 0 second argument.
+let isCodeGenOnly = 1, VA = 0 in {
+def VCFSX_0 : VXForm_1<842, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vcfsx $vD, $vB, 0", IIC_VecFP,
+ [(set v4f32:$vD,
+ (int_ppc_altivec_vcfsx v4i32:$vB, 0))]>;
+def VCTUXS_0 : VXForm_1<906, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vctuxs $vD, $vB, 0", IIC_VecFP,
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vctuxs v4f32:$vB, 0))]>;
+def VCFUX_0 : VXForm_1<778, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vcfux $vD, $vB, 0", IIC_VecFP,
+ [(set v4f32:$vD,
+ (int_ppc_altivec_vcfux v4i32:$vB, 0))]>;
+def VCTSXS_0 : VXForm_1<970, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vctsxs $vD, $vB, 0", IIC_VecFP,
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vctsxs v4f32:$vB, 0))]>;
+}
+// Selected only from their intrinsics via VX2_Int_SP.
+// NOTE(review): VX2_Int_SP presumably fixes the operand type to v4f32 --
+// confirm against the class definition.
+def VEXPTEFP : VX2_Int_SP<394, "vexptefp", int_ppc_altivec_vexptefp>;
+def VLOGEFP : VX2_Int_SP<458, "vlogefp", int_ppc_altivec_vlogefp>;
+
+// Commutable average, maximum and minimum ops, one def per element type
+// (v16i8, v8i16, v4i32, plus v4f32 for VMAXFP/VMINFP). All are selected only
+// from their intrinsics.
+let isCommutable = 1 in {
+def VAVGSB : VX1_Int_Ty<1282, "vavgsb", int_ppc_altivec_vavgsb, v16i8>;
+def VAVGSH : VX1_Int_Ty<1346, "vavgsh", int_ppc_altivec_vavgsh, v8i16>;
+def VAVGSW : VX1_Int_Ty<1410, "vavgsw", int_ppc_altivec_vavgsw, v4i32>;
+def VAVGUB : VX1_Int_Ty<1026, "vavgub", int_ppc_altivec_vavgub, v16i8>;
+def VAVGUH : VX1_Int_Ty<1090, "vavguh", int_ppc_altivec_vavguh, v8i16>;
+def VAVGUW : VX1_Int_Ty<1154, "vavguw", int_ppc_altivec_vavguw, v4i32>;
+
+def VMAXFP : VX1_Int_Ty<1034, "vmaxfp", int_ppc_altivec_vmaxfp, v4f32>;
+def VMAXSB : VX1_Int_Ty< 258, "vmaxsb", int_ppc_altivec_vmaxsb, v16i8>;
+def VMAXSH : VX1_Int_Ty< 322, "vmaxsh", int_ppc_altivec_vmaxsh, v8i16>;
+def VMAXSW : VX1_Int_Ty< 386, "vmaxsw", int_ppc_altivec_vmaxsw, v4i32>;
+def VMAXUB : VX1_Int_Ty< 2, "vmaxub", int_ppc_altivec_vmaxub, v16i8>;
+def VMAXUH : VX1_Int_Ty< 66, "vmaxuh", int_ppc_altivec_vmaxuh, v8i16>;
+def VMAXUW : VX1_Int_Ty< 130, "vmaxuw", int_ppc_altivec_vmaxuw, v4i32>;
+def VMINFP : VX1_Int_Ty<1098, "vminfp", int_ppc_altivec_vminfp, v4f32>;
+def VMINSB : VX1_Int_Ty< 770, "vminsb", int_ppc_altivec_vminsb, v16i8>;
+def VMINSH : VX1_Int_Ty< 834, "vminsh", int_ppc_altivec_vminsh, v8i16>;
+def VMINSW : VX1_Int_Ty< 898, "vminsw", int_ppc_altivec_vminsw, v4i32>;
+def VMINUB : VX1_Int_Ty< 514, "vminub", int_ppc_altivec_vminub, v16i8>;
+def VMINUH : VX1_Int_Ty< 578, "vminuh", int_ppc_altivec_vminuh, v8i16>;
+def VMINUW : VX1_Int_Ty< 642, "vminuw", int_ppc_altivec_vminuw, v4i32>;
+} // isCommutable
+
+// Merge-high / merge-low instructions. Every pattern is typed v16i8 regardless
+// of the element width in the mnemonic; the width is distinguished only by
+// which vmrg*_shuffle fragment is matched.
+def VMRGHB : VXForm_1< 12, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmrghb $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD, (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGHH : VXForm_1< 76, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmrghh $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD, (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGHW : VXForm_1<140, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmrghw $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD, (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLB : VXForm_1<268, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmrglb $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD, (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLH : VXForm_1<332, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmrglh $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD, (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLW : VXForm_1<396, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmrglw $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD, (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>;
+
+// Multiply-sum ops, selected only from their intrinsics. Each lists v4i32 as
+// the first and last type argument with v16i8 or v8i16 in the middle.
+// NOTE(review): presumably result type, multiplicand type, accumulator type --
+// confirm against VA1a_Int_Ty3.
+def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm,
+ v4i32, v16i8, v4i32>;
+def VMSUMSHM : VA1a_Int_Ty3<40, "vmsumshm", int_ppc_altivec_vmsumshm,
+ v4i32, v8i16, v4i32>;
+def VMSUMSHS : VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs,
+ v4i32, v8i16, v4i32>;
+def VMSUMUBM : VA1a_Int_Ty3<36, "vmsumubm", int_ppc_altivec_vmsumubm,
+ v4i32, v16i8, v4i32>;
+def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm,
+ v4i32, v8i16, v4i32>;
+def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs,
+ v4i32, v8i16, v4i32>;
+
+// Commutable multiplies selected only from their intrinsics. The first type
+// argument has half as many elements, each twice as wide, as the second
+// (v8i16 with v16i8, v4i32 with v8i16).
+// NOTE(review): presumably (result type, source type) -- confirm against
+// VX1_Int_Ty2.
+let isCommutable = 1 in {
+def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb,
+ v8i16, v16i8>;
+def VMULESH : VX1_Int_Ty2<840, "vmulesh", int_ppc_altivec_vmulesh,
+ v4i32, v8i16>;
+def VMULEUB : VX1_Int_Ty2<520, "vmuleub", int_ppc_altivec_vmuleub,
+ v8i16, v16i8>;
+def VMULEUH : VX1_Int_Ty2<584, "vmuleuh", int_ppc_altivec_vmuleuh,
+ v4i32, v8i16>;
+def VMULOSB : VX1_Int_Ty2<264, "vmulosb", int_ppc_altivec_vmulosb,
+ v8i16, v16i8>;
+def VMULOSH : VX1_Int_Ty2<328, "vmulosh", int_ppc_altivec_vmulosh,
+ v4i32, v8i16>;
+def VMULOUB : VX1_Int_Ty2< 8, "vmuloub", int_ppc_altivec_vmuloub,
+ v8i16, v16i8>;
+def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh,
+ v4i32, v8i16>;
+} // isCommutable
+
+// Unary VX2_Int_SP ops selected from their intrinsics. Elsewhere in this file
+// VREFP/VRSQRTEFP are also used for PPCfre/PPCfrsqrte, and VRFIM/VRFIP/VRFIZ/
+// VRFIN for ffloor/fceil/ftrunc/fnearbyint.
+def VREFP : VX2_Int_SP<266, "vrefp", int_ppc_altivec_vrefp>;
+def VRFIM : VX2_Int_SP<714, "vrfim", int_ppc_altivec_vrfim>;
+def VRFIN : VX2_Int_SP<522, "vrfin", int_ppc_altivec_vrfin>;
+def VRFIP : VX2_Int_SP<650, "vrfip", int_ppc_altivec_vrfip>;
+def VRFIZ : VX2_Int_SP<586, "vrfiz", int_ppc_altivec_vrfiz>;
+def VRSQRTEFP : VX2_Int_SP<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>;
+
+// Subtraction and sum-across ops. None of these is marked commutable.
+def VSUBCUW : VX1_Int_Ty<1408, "vsubcuw", int_ppc_altivec_vsubcuw, v4i32>;
+
+// Selected from the generic fsub/sub nodes.
+// NOTE(review): VSUBFP uses the IIC_VecGeneral itinerary whereas VADDFP uses
+// IIC_VecFP -- confirm this difference is intentional.
+def VSUBFP : VXForm_1<74, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vsubfp $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4f32:$vD, (fsub v4f32:$vA, v4f32:$vB))]>;
+def VSUBUBM : VXForm_1<1024, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vsububm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v16i8:$vD, (sub v16i8:$vA, v16i8:$vB))]>;
+def VSUBUHM : VXForm_1<1088, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vsubuhm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v8i16:$vD, (sub v8i16:$vA, v8i16:$vB))]>;
+def VSUBUWM : VXForm_1<1152, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vsubuwm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (sub v4i32:$vA, v4i32:$vB))]>;
+
+// Remaining subtract forms: intrinsic-only.
+def VSUBSBS : VX1_Int_Ty<1792, "vsubsbs" , int_ppc_altivec_vsubsbs, v16i8>;
+def VSUBSHS : VX1_Int_Ty<1856, "vsubshs" , int_ppc_altivec_vsubshs, v8i16>;
+def VSUBSWS : VX1_Int_Ty<1920, "vsubsws" , int_ppc_altivec_vsubsws, v4i32>;
+def VSUBUBS : VX1_Int_Ty<1536, "vsububs" , int_ppc_altivec_vsububs, v16i8>;
+def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>;
+def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>;
+
+// Sum-across ops: intrinsic-only.
+def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>;
+def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>;
+
+def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs,
+ v4i32, v16i8, v4i32>;
+def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs,
+ v4i32, v8i16, v4i32>;
+def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs,
+ v4i32, v16i8, v4i32>;
+
+// Bitwise NOR / OR / XOR, matched on v4i32. VNOR sits outside the
+// isCommutable scope below; only VOR and VXOR are marked commutable.
+def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vnor $vD, $vA, $vB", IIC_VecFP,
+ [(set v4i32:$vD, (vnot_ppc (or v4i32:$vA,
+ v4i32:$vB)))]>;
+let isCommutable = 1 in {
+def VOR : VXForm_1<1156, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vor $vD, $vA, $vB", IIC_VecFP,
+ [(set v4i32:$vD, (or v4i32:$vA, v4i32:$vB))]>;
+def VXOR : VXForm_1<1220, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vxor $vD, $vA, $vB", IIC_VecFP,
+ [(set v4i32:$vD, (xor v4i32:$vA, v4i32:$vB))]>;
+} // isCommutable
+
+// Rotate-left and shift-left ops, defined here from their intrinsics only.
+// Elsewhere in this file VSLB/VSLH/VSLW are additionally selected from the
+// generic shl node via separate Pat records.
+def VRLB : VX1_Int_Ty< 4, "vrlb", int_ppc_altivec_vrlb, v16i8>;
+def VRLH : VX1_Int_Ty< 68, "vrlh", int_ppc_altivec_vrlh, v8i16>;
+def VRLW : VX1_Int_Ty< 132, "vrlw", int_ppc_altivec_vrlw, v4i32>;
+
+def VSL : VX1_Int_Ty< 452, "vsl" , int_ppc_altivec_vsl, v4i32 >;
+def VSLO : VX1_Int_Ty<1036, "vslo", int_ppc_altivec_vslo, v4i32>;
+
+def VSLB : VX1_Int_Ty< 260, "vslb", int_ppc_altivec_vslb, v16i8>;
+def VSLH : VX1_Int_Ty< 324, "vslh", int_ppc_altivec_vslh, v8i16>;
+def VSLW : VX1_Int_Ty< 388, "vslw", int_ppc_altivec_vslw, v4i32>;
+
+// Splat instructions. The patterns match a vsplt*_shuffle of $vB with an undef
+// second input; the matched fragment is bound to the u5imm operand $UIMM.
+def VSPLTB : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+ "vspltb $vD, $vB, $UIMM", IIC_VecPerm,
+ [(set v16i8:$vD,
+ (vspltb_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+def VSPLTH : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+ "vsplth $vD, $vB, $UIMM", IIC_VecPerm,
+ [(set v16i8:$vD,
+ (vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+ "vspltw $vD, $vB, $UIMM", IIC_VecPerm,
+ [(set v16i8:$vD,
+ (vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+// Codegen-only copies of VSPLTB/VSPLTH with the same opcode and asm string,
+// but taking $vB from the vfrc register class and carrying no pattern.
+let isCodeGenOnly = 1 in {
+ def VSPLTBs : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
+ "vspltb $vD, $vB, $UIMM", IIC_VecPerm, []>;
+ def VSPLTHs : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
+ "vsplth $vD, $vB, $UIMM", IIC_VecPerm, []>;
+}
+
+// Shift-right ops, defined here from their intrinsics only. Elsewhere in this
+// file VSRB/VSRH/VSRW and VSRAB/VSRAH/VSRAW are additionally selected from
+// the generic srl and sra nodes via separate Pat records.
+def VSR : VX1_Int_Ty< 708, "vsr" , int_ppc_altivec_vsr, v4i32>;
+def VSRO : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>;
+
+def VSRAB : VX1_Int_Ty< 772, "vsrab", int_ppc_altivec_vsrab, v16i8>;
+def VSRAH : VX1_Int_Ty< 836, "vsrah", int_ppc_altivec_vsrah, v8i16>;
+def VSRAW : VX1_Int_Ty< 900, "vsraw", int_ppc_altivec_vsraw, v4i32>;
+def VSRB : VX1_Int_Ty< 516, "vsrb" , int_ppc_altivec_vsrb , v16i8>;
+def VSRH : VX1_Int_Ty< 580, "vsrh" , int_ppc_altivec_vsrh , v8i16>;
+def VSRW : VX1_Int_Ty< 644, "vsrw" , int_ppc_altivec_vsrw , v4i32>;
+
+
+// Splat-immediate instructions: the only input is an s5imm operand $SIMM,
+// matched via the vecspltis{b,h,w} leaves for v16i8 / v8i16 / v4i32.
+def VSPLTISB : VXForm_3<780, (outs vrrc:$vD), (ins s5imm:$SIMM),
+ "vspltisb $vD, $SIMM", IIC_VecPerm,
+ [(set v16i8:$vD, (v16i8 vecspltisb:$SIMM))]>;
+def VSPLTISH : VXForm_3<844, (outs vrrc:$vD), (ins s5imm:$SIMM),
+ "vspltish $vD, $SIMM", IIC_VecPerm,
+ [(set v8i16:$vD, (v8i16 vecspltish:$SIMM))]>;
+def VSPLTISW : VXForm_3<908, (outs vrrc:$vD), (ins s5imm:$SIMM),
+ "vspltisw $vD, $SIMM", IIC_VecPerm,
+ [(set v4i32:$vD, (v4i32 vecspltisw:$SIMM))]>;
+
+// Vector Pack.
+// Most pack ops are selected from intrinsics via VX1_Int_Ty2. The two modulo
+// packs, VPKUHUM and VPKUWUM, are instead selected from v16i8 shuffle
+// fragments.
+def VPKPX : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx,
+ v8i16, v4i32>;
+def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss,
+ v16i8, v8i16>;
+def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
+ v16i8, v8i16>;
+def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
+ v8i16, v4i32>;
+def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
+ v8i16, v4i32>;
+def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vpkuhum $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD,
+ (vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus,
+ v16i8, v8i16>;
+def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vpkuwum $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD,
+ (vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus,
+ v8i16, v4i32>;
+
+// Vector Unpack.
+// Unpack ops, selected only from their intrinsics via VX2_Int_Ty2. The first
+// type argument is the wider-element type (v4i32 or v8i16) and the second the
+// narrower-element type (v8i16 or v16i8).
+def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx,
+ v4i32, v8i16>;
+def VUPKHSB : VX2_Int_Ty2<526, "vupkhsb", int_ppc_altivec_vupkhsb,
+ v8i16, v16i8>;
+def VUPKHSH : VX2_Int_Ty2<590, "vupkhsh", int_ppc_altivec_vupkhsh,
+ v4i32, v8i16>;
+def VUPKLPX : VX2_Int_Ty2<974, "vupklpx", int_ppc_altivec_vupklpx,
+ v4i32, v8i16>;
+def VUPKLSB : VX2_Int_Ty2<654, "vupklsb", int_ppc_altivec_vupklsb,
+ v8i16, v16i8>;
+def VUPKLSH : VX2_Int_Ty2<718, "vupklsh", int_ppc_altivec_vupklsh,
+ v4i32, v8i16>;
+
+
+// Altivec Comparisons.
+
+// Non-recording vector compare on element type Ty. The extended opcode xo is
+// used both for the instruction encoding and as the third operand of the
+// PPCvcmp node in the selection pattern.
+class VCMP<bits<10> xo, string asmstr, ValueType Ty>
+ : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+ IIC_VecFPCompare,
+ [(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>;
+// Recording ("dot") form of VCMP: same operands, but it selects PPCvcmp_o,
+// sets RC = 1 and lists CR6 as an implicit def.
+class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+ : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+ IIC_VecFPCompare,
+ [(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> {
+ let Defs = [CR6];
+ let RC = 1;
+}
+
+// f32 element comparisons.
+// Each comparison is defined as a pair sharing one extended opcode: the plain
+// form via VCMP and the "." form (trailing "o" in the def name) via VCMPo.
+def VCMPBFP : VCMP <966, "vcmpbfp $vD, $vA, $vB" , v4f32>;
+def VCMPBFPo : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPEQFP : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
+def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
+def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
+def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+
+// i8 element comparisons.
+def VCMPEQUB : VCMP < 6, "vcmpequb $vD, $vA, $vB" , v16i8>;
+def VCMPEQUBo : VCMPo< 6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
+def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
+def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPEQUH : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
+def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
+def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
+def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPEQUW : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
+def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
+def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
+def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
+
+// Codegen-only materialisation of constant vectors; none takes an input.
+let isCodeGenOnly = 1 in {
+// All-zeros for v16i8 / v8i16 / v4i32, emitted as "vxor $vD, $vD, $vD" using
+// the same opcode (1220) as VXOR.
+def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+ "vxor $vD, $vD, $vD", IIC_VecFP,
+ [(set v16i8:$vD, (v16i8 immAllZerosV))]>;
+def V_SET0H : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+ "vxor $vD, $vD, $vD", IIC_VecFP,
+ [(set v8i16:$vD, (v8i16 immAllZerosV))]>;
+def V_SET0 : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+ "vxor $vD, $vD, $vD", IIC_VecFP,
+ [(set v4i32:$vD, (v4i32 immAllZerosV))]>;
+
+// All-ones for v16i8 / v8i16 / v4i32, emitted as "vspltisw $vD, -1" using the
+// same opcode (908) as VSPLTISW with the IMM field forced to -1.
+let IMM=-1 in {
+def V_SETALLONESB : VXForm_3<908, (outs vrrc:$vD), (ins),
+ "vspltisw $vD, -1", IIC_VecFP,
+ [(set v16i8:$vD, (v16i8 immAllOnesV))]>;
+def V_SETALLONESH : VXForm_3<908, (outs vrrc:$vD), (ins),
+ "vspltisw $vD, -1", IIC_VecFP,
+ [(set v8i16:$vD, (v8i16 immAllOnesV))]>;
+def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins),
+ "vspltisw $vD, -1", IIC_VecFP,
+ [(set v4i32:$vD, (v4i32 immAllOnesV))]>;
+}
+}
+} // VALU Operations.
+
+//===----------------------------------------------------------------------===//
+// Additional Altivec Patterns
+//
+
+// Loads.
+// Generic v4i32 load through an xoaddr address selects LVX.
+def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
+
+// Stores.
+// Generic v4i32 store through an xoaddr address selects STVX.
+def : Pat<(store v4i32:$rS, xoaddr:$dst),
+ (STVX $rS, xoaddr:$dst)>;
+
+// Bit conversions.
+// A bitconvert between any two of the vector types v16i8, v8i16, v4i32, v4f32,
+// v2i64 and v1i128 held in VRRC emits no instruction: the result is the source
+// register retyped. One group of five patterns per destination type.
+def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v1i128 VRRC:$src))), (v16i8 VRRC:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v1i128 VRRC:$src))), (v8i16 VRRC:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v1i128 VRRC:$src))), (v4i32 VRRC:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v1i128 VRRC:$src))), (v4f32 VRRC:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v16i8 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v1i128 VRRC:$src))), (v2i64 VRRC:$src)>;
+
+def : Pat<(v1i128 (bitconvert (v16i8 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v8i16 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v4i32 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>;
+
+// Shuffles.
+
+// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x)
+// Unary forms: the single input $vA is passed as both source operands. For
+// VSLDOI the immediate is computed from the matched node by
+// VSLDOI_unary_get_imm.
+def:Pat<(vsldoi_unary_shuffle:$in v16i8:$vA, undef),
+ (VSLDOI $vA, $vA, (VSLDOI_unary_get_imm $in))>;
+def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef),
+ (VPKUWUM $vA, $vA)>;
+def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef),
+ (VPKUHUM $vA, $vA)>;
+
+// Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands.
+// These fragments are matched for little-endian, where the inputs must
+// be swapped for correct semantics.
+// Note that the emitted instruction takes ($vB, $vA), the reverse of the
+// order in the matched fragment.
+def:Pat<(vsldoi_swapped_shuffle:$in v16i8:$vA, v16i8:$vB),
+ (VSLDOI $vB, $vA, (VSLDOI_swapped_get_imm $in))>;
+def:Pat<(vpkuwum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VPKUWUM $vB, $vA)>;
+def:Pat<(vpkuhum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VPKUHUM $vB, $vA)>;
+
+// Match vmrg*(x,x)
+// Unary merges: the single input $vA is passed as both source operands.
+def:Pat<(vmrglb_unary_shuffle v16i8:$vA, undef),
+ (VMRGLB $vA, $vA)>;
+def:Pat<(vmrglh_unary_shuffle v16i8:$vA, undef),
+ (VMRGLH $vA, $vA)>;
+def:Pat<(vmrglw_unary_shuffle v16i8:$vA, undef),
+ (VMRGLW $vA, $vA)>;
+def:Pat<(vmrghb_unary_shuffle v16i8:$vA, undef),
+ (VMRGHB $vA, $vA)>;
+def:Pat<(vmrghh_unary_shuffle v16i8:$vA, undef),
+ (VMRGHH $vA, $vA)>;
+def:Pat<(vmrghw_unary_shuffle v16i8:$vA, undef),
+ (VMRGHW $vA, $vA)>;
+
+// Match vmrg*(y,x), i.e., swapped operands. These fragments
+// are matched for little-endian, where the inputs must be
+// swapped for correct semantics.
+// The emitted instruction takes ($vB, $vA), the reverse of the matched order.
+def:Pat<(vmrglb_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VMRGLB $vB, $vA)>;
+def:Pat<(vmrglh_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VMRGLH $vB, $vA)>;
+def:Pat<(vmrglw_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VMRGLW $vB, $vA)>;
+def:Pat<(vmrghb_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VMRGHB $vB, $vA)>;
+def:Pat<(vmrghh_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VMRGHH $vB, $vA)>;
+def:Pat<(vmrghw_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VMRGHW $vB, $vA)>;
+
+// Logical Operations
+// Vector NOT is emitted as VNOR of the operand with itself.
+def : Pat<(vnot_ppc v4i32:$vA), (VNOR $vA, $vA)>;
+
+def : Pat<(vnot_ppc (or v4i32:$A, v4i32:$B)),
+ (VNOR $A, $B)>;
+def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)),
+ (VANDC $A, $B)>;
+
+// There is no pattern for a plain v4f32 multiply instruction here; fmul is
+// emitted as VMADDFP with a constant addend built by shifting an all-ones
+// vector left by an all-ones vector with VSLW.
+// NOTE(review): presumably each word becomes 0x80000000 (i.e. -0.0), so that
+// adding it leaves the product, including the sign of a zero result,
+// unchanged -- confirm against the vslw definition in the ISA.
+def : Pat<(fmul v4f32:$vA, v4f32:$vB),
+ (VMADDFP $vA, $vB,
+ (v4i32 (VSLW (V_SETALLONES), (V_SETALLONES))))>;
+
+// Fused multiply-add and multiply-subtract for packed float. These are
+// represented separately from the real instructions above, for operations that
+// must have the additional precision, such as Newton-Raphson (used by divide
+// and sqrt).
+// The PPC-specific nodes and the AltiVec intrinsics both map straight onto
+// VMADDFP / VNMSUBFP with operands passed through in order ($A, $B, $C).
+def : Pat<(PPCvmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+ (VMADDFP $A, $B, $C)>;
+def : Pat<(PPCvnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+ (VNMSUBFP $A, $B, $C)>;
+
+def : Pat<(int_ppc_altivec_vmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+ (VMADDFP $A, $B, $C)>;
+def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+ (VNMSUBFP $A, $B, $C)>;
+
+// PPCvperm on three v16i8 operands selects VPERM.
+def : Pat<(PPCvperm v16i8:$vA, v16i8:$vB, v16i8:$vC),
+ (VPERM $vA, $vB, $vC)>;
+
+// v4f32 PPCfre / PPCfrsqrte select VREFP / VRSQRTEFP.
+def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
+def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
+
+// Vector shifts
+// Generic shl / srl / sra with a vector shift amount of the same type select
+// the per-element-size VSL* / VSR* / VSRA* instructions for v16i8, v8i16 and
+// v4i32.
+def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSLB $vA, $vB))>;
+def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSLH $vA, $vB))>;
+def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSLW $vA, $vB))>;
+
+def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRB $vA, $vB))>;
+def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRH $vA, $vB))>;
+def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRW $vA, $vB))>;
+
+def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRAB $vA, $vB))>;
+def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRAH $vA, $vB))>;
+def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRAW $vA, $vB))>;
+
+// Float to integer and integer to float conversions
+// Generic conversions select the codegen-only "_0" instruction variants, which
+// take no immediate operand.
+def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
+ (VCTSXS_0 $vA)>;
+def : Pat<(v4i32 (fp_to_uint v4f32:$vA)),
+ (VCTUXS_0 $vA)>;
+def : Pat<(v4f32 (sint_to_fp v4i32:$vA)),
+ (VCFSX_0 $vA)>;
+def : Pat<(v4f32 (uint_to_fp v4i32:$vA)),
+ (VCFUX_0 $vA)>;
+
+// Floating-point rounding
+// ffloor -> VRFIM, fceil -> VRFIP, ftrunc -> VRFIZ, fnearbyint -> VRFIN.
+def : Pat<(v4f32 (ffloor v4f32:$vA)),
+ (VRFIM $vA)>;
+def : Pat<(v4f32 (fceil v4f32:$vA)),
+ (VRFIP $vA)>;
+def : Pat<(v4f32 (ftrunc v4f32:$vA)),
+ (VRFIZ $vA)>;
+def : Pat<(v4f32 (fnearbyint v4f32:$vA)),
+ (VRFIN $vA)>;
+
+} // end HasAltivec
+
+// Selection predicates evaluated against PPCSubTarget: hasP8Altivec() and
+// hasP8Crypto() respectively.
+def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">;
+def HasP8Crypto : Predicate<"PPCSubTarget->hasP8Crypto()">;
+let Predicates = [HasP8Altivec] in {
+
+// Commutable multiplies, maxima and minima. VMULUWM is selected from the
+// generic v4i32 mul node; everything else here is intrinsic-only.
+let isCommutable = 1 in {
+def VMULESW : VX1_Int_Ty2<904, "vmulesw", int_ppc_altivec_vmulesw,
+ v2i64, v4i32>;
+def VMULEUW : VX1_Int_Ty2<648, "vmuleuw", int_ppc_altivec_vmuleuw,
+ v2i64, v4i32>;
+def VMULOSW : VX1_Int_Ty2<392, "vmulosw", int_ppc_altivec_vmulosw,
+ v2i64, v4i32>;
+def VMULOUW : VX1_Int_Ty2<136, "vmulouw", int_ppc_altivec_vmulouw,
+ v2i64, v4i32>;
+def VMULUWM : VXForm_1<137, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmuluwm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (mul v4i32:$vA, v4i32:$vB))]>;
+def VMAXSD : VX1_Int_Ty<450, "vmaxsd", int_ppc_altivec_vmaxsd, v2i64>;
+def VMAXUD : VX1_Int_Ty<194, "vmaxud", int_ppc_altivec_vmaxud, v2i64>;
+def VMINSD : VX1_Int_Ty<962, "vminsd", int_ppc_altivec_vminsd, v2i64>;
+def VMINUD : VX1_Int_Ty<706, "vminud", int_ppc_altivec_vminud, v2i64>;
+} // isCommutable
+
+// Vector merge
+// Merge even / odd word, selected from v16i8-typed shuffle fragments.
+def VMRGEW : VXForm_1<1932, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmrgew $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD, (vmrgew_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGOW : VXForm_1<1676, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmrgow $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD, (vmrgow_shuffle v16i8:$vA, v16i8:$vB))]>;
+
+// Match vmrgew(x,x) and vmrgow(x,x)
+// The single input $vA is passed as both source operands.
+def:Pat<(vmrgew_unary_shuffle v16i8:$vA, undef),
+ (VMRGEW $vA, $vA)>;
+def:Pat<(vmrgow_unary_shuffle v16i8:$vA, undef),
+ (VMRGOW $vA, $vA)>;
+
+// Match vmrgew(y,x) and vmrgow(y,x), i.e., swapped operands. These fragments
+// are matched for little-endian, where the inputs must be swapped for correct
+// semantics.
+// The emitted instruction takes ($vB, $vA), the reverse of the matched order.
+def:Pat<(vmrgew_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VMRGEW $vB, $vA)>;
+def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VMRGOW $vB, $vA)>;
+
+
+// Vector shifts
+// v2i64 rotate and shifts. VRLD is intrinsic-only; VSLD / VSRD / VSRAD carry
+// their generic shl / srl / sra patterns directly in the instruction def.
+def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>;
+def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vsld $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>;
+def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vsrd $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>;
+def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vsrad $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>;
+
+// Vector Integer Arithmetic Instructions
+// Commutable v2i64 and v1i128 adds, selected from the generic add node.
+let isCommutable = 1 in {
+def VADDUDM : VXForm_1<192, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vaddudm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (add v2i64:$vA, v2i64:$vB))]>;
+def VADDUQM : VXForm_1<256, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vadduqm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (add v1i128:$vA, v1i128:$vB))]>;
+} // isCommutable
+
+// Vector Quadword Add
+// Intrinsic-only v1i128 forms.
+def VADDEUQM : VA1a_Int_Ty<60, "vaddeuqm", int_ppc_altivec_vaddeuqm, v1i128>;
+def VADDCUQ : VX1_Int_Ty<320, "vaddcuq", int_ppc_altivec_vaddcuq, v1i128>;
+def VADDECUQ : VA1a_Int_Ty<61, "vaddecuq", int_ppc_altivec_vaddecuq, v1i128>;
+
+// Vector Doubleword Subtract
+def VSUBUDM : VXForm_1<1216, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vsubudm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (sub v2i64:$vA, v2i64:$vB))]>;
+
+// Vector Quadword Subtract
+// VSUBUQM is selected from the generic v1i128 sub node; the rest are
+// intrinsic-only.
+def VSUBUQM : VXForm_1<1280, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vsubuqm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (sub v1i128:$vA, v1i128:$vB))]>;
+def VSUBEUQM : VA1a_Int_Ty<62, "vsubeuqm", int_ppc_altivec_vsubeuqm, v1i128>;
+def VSUBCUQ : VX1_Int_Ty<1344, "vsubcuq", int_ppc_altivec_vsubcuq, v1i128>;
+def VSUBECUQ : VA1a_Int_Ty<63, "vsubecuq", int_ppc_altivec_vsubecuq, v1i128>;
+
+// Count Leading Zeros
+// Unary count-leading-zeros, selected from the generic ctlz node, one def per
+// element type (v16i8, v8i16, v4i32, v2i64).
+def VCLZB : VXForm_2<1794, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vclzb $vD, $vB", IIC_VecGeneral,
+ [(set v16i8:$vD, (ctlz v16i8:$vB))]>;
+def VCLZH : VXForm_2<1858, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vclzh $vD, $vB", IIC_VecGeneral,
+ [(set v8i16:$vD, (ctlz v8i16:$vB))]>;
+def VCLZW : VXForm_2<1922, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vclzw $vD, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (ctlz v4i32:$vB))]>;
+def VCLZD : VXForm_2<1986, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vclzd $vD, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (ctlz v2i64:$vB))]>;
+
+// Population Count
+// Selected from the generic ctpop node, one def per element type.
+def VPOPCNTB : VXForm_2<1795, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vpopcntb $vD, $vB", IIC_VecGeneral,
+ [(set v16i8:$vD, (ctpop v16i8:$vB))]>;
+def VPOPCNTH : VXForm_2<1859, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vpopcnth $vD, $vB", IIC_VecGeneral,
+ [(set v8i16:$vD, (ctpop v8i16:$vB))]>;
+def VPOPCNTW : VXForm_2<1923, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vpopcntw $vD, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (ctpop v4i32:$vB))]>;
+def VPOPCNTD : VXForm_2<1987, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vpopcntd $vD, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (ctpop v2i64:$vB))]>;
+
+// Complemented logical ops on v4i32. VEQV (not-xor) and VNAND (not-and) are
+// commutable; VORC complements only its second operand and is defined outside
+// the isCommutable scope.
+// NOTE(review): the FIXME below refers to AddedComplexity, but no
+// AddedComplexity is set on these defs in the lines shown -- confirm whether
+// the comment is stale.
+let isCommutable = 1 in {
+// FIXME: Use AddedComplexity > 400 to ensure these patterns match before the
+// VSX equivalents. We need to fix this up at some point. Two possible
+// solutions for this problem:
+// 1. Disable Altivec patterns that compete with VSX patterns using the
+// !HasVSX predicate. This essentially favours VSX over Altivec, in
+// hopes of reducing register pressure (larger register set using VSX
+// instructions than VMX instructions)
+// 2. Employ a more disciplined use of AddedComplexity, which would provide
+// more fine-grained control than option 1. This would be beneficial
+// if we find situations where Altivec is really preferred over VSX.
+def VEQV : VXForm_1<1668, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "veqv $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (vnot_ppc (xor v4i32:$vA, v4i32:$vB)))]>;
+def VNAND : VXForm_1<1412, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vnand $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (vnot_ppc (and v4i32:$vA, v4i32:$vB)))]>;
+} // isCommutable
+
+def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vorc $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (or v4i32:$vA,
+ (vnot_ppc v4i32:$vB)))]>;
+
+// i64 element comparisons.
+// v2i64 compares, each defined as a plain (VCMP) and "." (VCMPo) pair that
+// share one extended opcode.
+def VCMPEQUD : VCMP <199, "vcmpequd $vD, $vA, $vB" , v2i64>;
+def VCMPEQUDo : VCMPo<199, "vcmpequd. $vD, $vA, $vB", v2i64>;
+def VCMPGTSD : VCMP <967, "vcmpgtsd $vD, $vA, $vB" , v2i64>;
+def VCMPGTSDo : VCMPo<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>;
+def VCMPGTUD : VCMP <711, "vcmpgtud $vD, $vA, $vB" , v2i64>;
+def VCMPGTUDo : VCMPo<711, "vcmpgtud. $vD, $vA, $vB", v2i64>;
+
+// The cryptography instructions that do not require Category:Vector.Crypto
+// Selected from int_ppc_altivec_crypto_* intrinsics but defined inside the
+// HasP8Altivec scope rather than the HasP8Crypto scope.
+def VPMSUMB : VX1_Int_Ty<1032, "vpmsumb",
+ int_ppc_altivec_crypto_vpmsumb, v16i8>;
+def VPMSUMH : VX1_Int_Ty<1096, "vpmsumh",
+ int_ppc_altivec_crypto_vpmsumh, v8i16>;
+def VPMSUMW : VX1_Int_Ty<1160, "vpmsumw",
+ int_ppc_altivec_crypto_vpmsumw, v4i32>;
+def VPMSUMD : VX1_Int_Ty<1224, "vpmsumd",
+ int_ppc_altivec_crypto_vpmsumd, v2i64>;
+def VPERMXOR : VA1a_Int_Ty<45, "vpermxor",
+ int_ppc_altivec_crypto_vpermxor, v16i8>;
+
+// Vector doubleword integer pack and unpack.
+// Doubleword pack / unpack. VPKUDUM is selected from a v16i8 shuffle fragment;
+// the other pack and unpack defs are intrinsic-only.
+def VPKSDSS : VX1_Int_Ty2<1486, "vpksdss", int_ppc_altivec_vpksdss,
+ v4i32, v2i64>;
+def VPKSDUS : VX1_Int_Ty2<1358, "vpksdus", int_ppc_altivec_vpksdus,
+ v4i32, v2i64>;
+def VPKUDUM : VXForm_1<1102, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vpkudum $vD, $vA, $vB", IIC_VecFP,
+ [(set v16i8:$vD,
+ (vpkudum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUDUS : VX1_Int_Ty2<1230, "vpkudus", int_ppc_altivec_vpkudus,
+ v4i32, v2i64>;
+def VUPKHSW : VX2_Int_Ty2<1614, "vupkhsw", int_ppc_altivec_vupkhsw,
+ v2i64, v4i32>;
+def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw,
+ v2i64, v4i32>;
+
+// Shuffle patterns for unary and swapped (LE) vector pack modulo.
+// Unary passes $vA twice; swapped emits ($vB, $vA).
+def:Pat<(vpkudum_unary_shuffle v16i8:$vA, undef),
+ (VPKUDUM $vA, $vA)>;
+def:Pat<(vpkudum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+ (VPKUDUM $vB, $vA)>;
+
+// Intrinsic-only defs: VGBBD is v16i8 -> v16i8; VBPERMQ pairs v2i64 with
+// v16i8.
+def VGBBD : VX2_Int_Ty2<1292, "vgbbd", int_ppc_altivec_vgbbd, v16i8, v16i8>;
+def VBPERMQ : VX1_Int_Ty2<1356, "vbpermq", int_ppc_altivec_vbpermq,
+ v2i64, v16i8>;
+} // end HasP8Altivec
+
+// Crypto instructions (from builtins)
+// Instructions gated on HasP8Crypto. All are selected only from
+// int_ppc_altivec_crypto_* intrinsics; apart from VSHASIGMAW (v4i32) every def
+// is typed v2i64.
+let Predicates = [HasP8Crypto] in {
+def VSHASIGMAW : VXCR_Int_Ty<1666, "vshasigmaw",
+ int_ppc_altivec_crypto_vshasigmaw, v4i32>;
+def VSHASIGMAD : VXCR_Int_Ty<1730, "vshasigmad",
+ int_ppc_altivec_crypto_vshasigmad, v2i64>;
+def VCIPHER : VX1_Int_Ty<1288, "vcipher", int_ppc_altivec_crypto_vcipher,
+ v2i64>;
+def VCIPHERLAST : VX1_Int_Ty<1289, "vcipherlast",
+ int_ppc_altivec_crypto_vcipherlast, v2i64>;
+def VNCIPHER : VX1_Int_Ty<1352, "vncipher",
+ int_ppc_altivec_crypto_vncipher, v2i64>;
+def VNCIPHERLAST : VX1_Int_Ty<1353, "vncipherlast",
+ int_ppc_altivec_crypto_vncipherlast, v2i64>;
+def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>;
+} // HasP8Crypto
+
+// The following altivec instructions were introduced in Power ISA 3.0
+def HasP9Altivec : Predicate<"PPCSubTarget->hasP9Altivec()">;
+let Predicates = [HasP9Altivec] in {
+
+// i8 element comparisons.
+def VCMPNEB : VCMP < 7, "vcmpneb $vD, $vA, $vB" , v16i8>;
+def VCMPNEBo : VCMPo < 7, "vcmpneb. $vD, $vA, $vB" , v16i8>;
+def VCMPNEZB : VCMP <263, "vcmpnezb $vD, $vA, $vB" , v16i8>;
+def VCMPNEZBo : VCMPo<263, "vcmpnezb. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPNEH : VCMP < 71, "vcmpneh $vD, $vA, $vB" , v8i16>;
+def VCMPNEHo : VCMPo< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>;
+def VCMPNEZH : VCMP <327, "vcmpnezh $vD, $vA, $vB" , v8i16>;
+def VCMPNEZHo : VCMPo<327, "vcmpnezh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPNEW : VCMP <135, "vcmpnew $vD, $vA, $vB" , v4i32>;
+def VCMPNEWo : VCMPo<135, "vcmpnew. $vD, $vA, $vB" , v4i32>;
+def VCMPNEZW : VCMP <391, "vcmpnezw $vD, $vA, $vB" , v4i32>;
+def VCMPNEZWo : VCMPo<391, "vcmpnezw. $vD, $vA, $vB", v4i32>;
+
+// VX-Form: [PO VRT / UIM VRB XO].
+// Implemented on top of VXForm_1: the immediate operand $UIMM is listed where
+// VXForm_1 takes its "VRA" (5 bit) operand, standing in for "/ UIM" (1 + 4 bit).
+class VX1_VT5_UIM5_VB5<bits<11> xo, string opc, list<dag> pattern>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins u4imm:$UIMM, vrrc:$vB),
+ !strconcat(opc, " $vD, $vB, $UIMM"), IIC_VecGeneral, pattern>;
+
+class VX1_RT5_RA5_VB5<bits<11> xo, string opc, list<dag> pattern>
+ : VXForm_1<xo, (outs g8rc:$rD), (ins g8rc:$rA, vrrc:$vB), // g8rc out/in, vrrc in
+ !strconcat(opc, " $rD, $rA, $vB"), IIC_VecGeneral, pattern>;
+
+// Vector Extract Unsigned
+def VEXTRACTUB : VX1_VT5_UIM5_VB5<525, "vextractub", []>;
+def VEXTRACTUH : VX1_VT5_UIM5_VB5<589, "vextractuh", []>;
+def VEXTRACTUW : VX1_VT5_UIM5_VB5<653, "vextractuw", []>;
+def VEXTRACTD : VX1_VT5_UIM5_VB5<717, "vextractd" , []>;
+
+// Vector Extract Unsigned Byte/Halfword/Word Left/Right-Indexed
+def VEXTUBLX : VX1_RT5_RA5_VB5<1549, "vextublx", []>;
+def VEXTUBRX : VX1_RT5_RA5_VB5<1805, "vextubrx", []>;
+def VEXTUHLX : VX1_RT5_RA5_VB5<1613, "vextuhlx", []>;
+def VEXTUHRX : VX1_RT5_RA5_VB5<1869, "vextuhrx", []>;
+def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>;
+def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>;
+
+// Vector Insert Element Instructions
+def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>;
+def VINSERTH : VX1_VT5_UIM5_VB5<845, "vinserth", []>;
+def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>;
+def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
+
+class VX_VT5_EO5_VB5<bits<11> xo, bits<5> eo, string opc, list<dag> pattern>
+ : VXForm_RD5_XO5_RS5<xo, eo, (outs vrrc:$vD), (ins vrrc:$vB), // one vrrc in, one out
+ !strconcat(opc, " $vD, $vB"), IIC_VecGeneral, pattern>;
+class VX_VT5_EO5_VB5s<bits<11> xo, bits<5> eo, string opc, list<dag> pattern>
+ : VXForm_RD5_XO5_RS5<xo, eo, (outs vfrc:$vD), (ins vfrc:$vB), // vfrc variant of VX_VT5_EO5_VB5
+ !strconcat(opc, " $vD, $vB"), IIC_VecGeneral, pattern>;
+
+// Vector Count Leading/Trailing Zero LSB. Result is placed into GPR[rD]
+def VCLZLSBB : VXForm_RD5_XO5_RS5<1538, 0, (outs gprc:$rD), (ins vrrc:$vB),
+ "vclzlsbb $rD, $vB", IIC_VecGeneral,
+ [(set i32:$rD, (int_ppc_altivec_vclzlsbb
+ v16i8:$vB))]>;
+def VCTZLSBB : VXForm_RD5_XO5_RS5<1538, 1, (outs gprc:$rD), (ins vrrc:$vB),
+ "vctzlsbb $rD, $vB", IIC_VecGeneral,
+ [(set i32:$rD, (int_ppc_altivec_vctzlsbb
+ v16i8:$vB))]>;
+// Vector Count Trailing Zeros
+def VCTZB : VX_VT5_EO5_VB5<1538, 28, "vctzb",
+ [(set v16i8:$vD, (cttz v16i8:$vB))]>;
+def VCTZH : VX_VT5_EO5_VB5<1538, 29, "vctzh",
+ [(set v8i16:$vD, (cttz v8i16:$vB))]>;
+def VCTZW : VX_VT5_EO5_VB5<1538, 30, "vctzw",
+ [(set v4i32:$vD, (cttz v4i32:$vB))]>;
+def VCTZD : VX_VT5_EO5_VB5<1538, 31, "vctzd",
+ [(set v2i64:$vD, (cttz v2i64:$vB))]>;
+
+// Vector Extend Sign
+def VEXTSB2W : VX_VT5_EO5_VB5<1538, 16, "vextsb2w", []>;
+def VEXTSH2W : VX_VT5_EO5_VB5<1538, 17, "vextsh2w", []>;
+def VEXTSB2D : VX_VT5_EO5_VB5<1538, 24, "vextsb2d", []>;
+def VEXTSH2D : VX_VT5_EO5_VB5<1538, 25, "vextsh2d", []>;
+def VEXTSW2D : VX_VT5_EO5_VB5<1538, 26, "vextsw2d", []>;
+let isCodeGenOnly = 1 in {
+ def VEXTSB2Ws : VX_VT5_EO5_VB5s<1538, 16, "vextsb2w", []>;
+ def VEXTSH2Ws : VX_VT5_EO5_VB5s<1538, 17, "vextsh2w", []>;
+ def VEXTSB2Ds : VX_VT5_EO5_VB5s<1538, 24, "vextsb2d", []>;
+ def VEXTSH2Ds : VX_VT5_EO5_VB5s<1538, 25, "vextsh2d", []>;
+ def VEXTSW2Ds : VX_VT5_EO5_VB5s<1538, 26, "vextsw2d", []>;
+}
+
+// Vector Integer Negate
+def VNEGW : VX_VT5_EO5_VB5<1538, 6, "vnegw",
+ [(set v4i32:$vD,
+ (sub (v4i32 immAllZerosV), v4i32:$vB))]>;
+
+def VNEGD : VX_VT5_EO5_VB5<1538, 7, "vnegd",
+ [(set v2i64:$vD,
+ (sub (v2i64 (bitconvert (v4i32 immAllZerosV))),
+ v2i64:$vB))]>;
+
+// Vector Parity Byte
+def VPRTYBW : VX_VT5_EO5_VB5<1538, 8, "vprtybw", [(set v4i32:$vD,
+ (int_ppc_altivec_vprtybw v4i32:$vB))]>;
+def VPRTYBD : VX_VT5_EO5_VB5<1538, 9, "vprtybd", [(set v2i64:$vD,
+ (int_ppc_altivec_vprtybd v2i64:$vB))]>;
+def VPRTYBQ : VX_VT5_EO5_VB5<1538, 10, "vprtybq", [(set v1i128:$vD,
+ (int_ppc_altivec_vprtybq v1i128:$vB))]>;
+
+// Vector (Bit) Permute (Right-indexed)
+def VBPERMD : VXForm_1<1484, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vbpermd $vD, $vA, $vB", IIC_VecFP, []>;
+def VPERMR : VAForm_1a<59, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+ "vpermr $vD, $vA, $vB, $vC", IIC_VecFP, []>;
+
+class VX1_VT5_VA5_VB5<bits<11> xo, string opc, list<dag> pattern>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), // three vrrc operands
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern>; // itinerary fixed to IIC_VecFP
+
+// Vector Rotate Left Mask/Mask-Insert
+def VRLWNM : VX1_VT5_VA5_VB5<389, "vrlwnm",
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vrlwnm v4i32:$vA,
+ v4i32:$vB))]>;
+def VRLWMI : VXForm_1<133, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
+ "vrlwmi $vD, $vA, $vB", IIC_VecFP, // $vDi does not appear in the asm string
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vrlwmi v4i32:$vA, v4i32:$vB,
+ v4i32:$vDi))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; // extra input tied to $vD, not encoded
+def VRLDNM : VX1_VT5_VA5_VB5<453, "vrldnm",
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vrldnm v2i64:$vA,
+ v2i64:$vB))]>;
+def VRLDMI : VXForm_1<197, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
+ "vrldmi $vD, $vA, $vB", IIC_VecFP, // $vDi does not appear in the asm string
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vrldmi v2i64:$vA, v2i64:$vB,
+ v2i64:$vDi))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; // extra input tied to $vD, not encoded
+
+// Vector Shift Left/Right
+def VSLV : VX1_VT5_VA5_VB5<1860, "vslv",
+ [(set v16i8 : $vD, (int_ppc_altivec_vslv v16i8 : $vA, v16i8 : $vB))]>;
+def VSRV : VX1_VT5_VA5_VB5<1796, "vsrv",
+ [(set v16i8 : $vD, (int_ppc_altivec_vsrv v16i8 : $vA, v16i8 : $vB))]>;
+
+// Vector Multiply-by-10 (& Write Carry) Unsigned Quadword
+def VMUL10UQ : VXForm_BX<513, (outs vrrc:$vD), (ins vrrc:$vA),
+ "vmul10uq $vD, $vA", IIC_VecFP, []>;
+def VMUL10CUQ : VXForm_BX< 1, (outs vrrc:$vD), (ins vrrc:$vA),
+ "vmul10cuq $vD, $vA", IIC_VecFP, []>;
+
+// Vector Multiply-by-10 Extended (& Write Carry) Unsigned Quadword
+def VMUL10EUQ : VX1_VT5_VA5_VB5<577, "vmul10euq" , []>;
+def VMUL10ECUQ : VX1_VT5_VA5_VB5< 65, "vmul10ecuq", []>;
+
+// Decimal Integer Format Conversion Instructions
+
+// [PO VRT EO VRB 1 PS XO], "_o" means CR6 is set. PS is an explicit operand.
+class VX_VT5_EO5_VB5_PS1_XO9_o<bits<5> eo, bits<9> xo, string opc,
+ list<dag> pattern>
+ : VX_RD5_EO5_RS5_PS1_XO9<eo, xo, (outs vrrc:$vD), (ins vrrc:$vB, u1imm:$PS),
+ !strconcat(opc, " $vD, $vB, $PS"), IIC_VecFP, pattern> {
+ let Defs = [CR6]; // implicit definition that the "_o" suffix refers to
+}
+
+// [PO VRT EO VRB 1 / XO]; same base format, but PS is not an operand here.
+class VX_VT5_EO5_VB5_XO9_o<bits<5> eo, bits<9> xo, string opc,
+ list<dag> pattern>
+ : VX_RD5_EO5_RS5_PS1_XO9<eo, xo, (outs vrrc:$vD), (ins vrrc:$vB),
+ !strconcat(opc, " $vD, $vB"), IIC_VecFP, pattern> {
+ let Defs = [CR6]; // implicit definition that the "_o" suffix refers to
+ let PS = 0; // the PS field of the base format is fixed to 0
+}
+
+// Decimal Convert From/to National/Zoned/Signed-QWord
+def BCDCFNo : VX_VT5_EO5_VB5_PS1_XO9_o<7, 385, "bcdcfn." , []>;
+def BCDCFZo : VX_VT5_EO5_VB5_PS1_XO9_o<6, 385, "bcdcfz." , []>;
+def BCDCTNo : VX_VT5_EO5_VB5_XO9_o <5, 385, "bcdctn." , []>;
+def BCDCTZo : VX_VT5_EO5_VB5_PS1_XO9_o<4, 385, "bcdctz." , []>;
+def BCDCFSQo : VX_VT5_EO5_VB5_PS1_XO9_o<2, 385, "bcdcfsq.", []>;
+def BCDCTSQo : VX_VT5_EO5_VB5_XO9_o <0, 385, "bcdctsq.", []>;
+
+// Decimal Copy-Sign/Set-Sign
+let Defs = [CR6] in
+def BCDCPSGNo : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>;
+
+def BCDSETSGNo : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>;
+
+// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set. PS is an explicit operand.
+class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
+ : VX_RD5_RSp5_PS1_XO9<xo,
+ (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS),
+ !strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> {
+ let Defs = [CR6]; // implicit definition that the "_o" suffix refers to
+}
+
+// [PO VRT VRA VRB 1 / XO]; same base format, but PS is not an operand here.
+class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern>
+ : VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> {
+ let Defs = [CR6]; // implicit definition that the "_o" suffix refers to
+ let PS = 0; // the PS field of the base format is fixed to 0
+}
+
+// Decimal Shift/Unsigned-Shift/Shift-and-Round
+def BCDSo : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>;
+def BCDUSo : VX_VT5_VA5_VB5_XO9_o <129, "bcdus.", []>;
+def BCDSRo : VX_VT5_VA5_VB5_PS1_XO9_o<449, "bcdsr.", []>;
+
+// Decimal (Unsigned) Truncate
+def BCDTRUNCo : VX_VT5_VA5_VB5_PS1_XO9_o<257, "bcdtrunc." , []>;
+def BCDUTRUNCo : VX_VT5_VA5_VB5_XO9_o <321, "bcdutrunc.", []>;
+
+// Absolute Difference
+def VABSDUB : VXForm_1<1027, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vabsdub $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v16i8:$vD, (int_ppc_altivec_vabsdub v16i8:$vA, v16i8:$vB))]>;
+def VABSDUH : VXForm_1<1091, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vabsduh $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v8i16:$vD, (int_ppc_altivec_vabsduh v8i16:$vA, v8i16:$vB))]>;
+def VABSDUW : VXForm_1<1155, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vabsduw $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>;
+} // end HasP9Altivec
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
new file mode 100644
index 000000000000..cf71b1c59869
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -0,0 +1,43 @@
+//===-- PPCInstrBuilder.h - Aides for building PPC insts --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRBUILDER_H
+#define LLVM_LIB_TARGET_POWERPC_PPCINSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - Append a reference to a stack-frame object to the
+/// instruction being built through MIB. The object is identified by its frame
+/// index FI, which stands in for the base until it is resolved, together with
+/// a constant Offset. When mem is true the offset immediate is added before
+/// the frame index; otherwise the frame index is added first.
+static inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+                  bool mem = true) {
+  if (!mem) // Non-memory form: frame index first, then the offset.
+    return MIB.addFrameIndex(FI).addImm(Offset);
+  // Memory form: the offset immediate precedes the frame index.
+  return MIB.addImm(Offset).addFrameIndex(FI);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
new file mode 100644
index 000000000000..99689f656c2d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -0,0 +1,1992 @@
+//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// PowerPC instruction formats
+
+class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+ : Instruction {
+ field bits<32> Inst; // 32-bit encoding; format subclasses assign the remaining bits
+ field bits<32> SoftFail = 0;
+ let Size = 4; // consistent with the 32-bit Inst field above
+
+ bit PPC64 = 0; // Default value, override with isPPC64
+
+ let Namespace = "PPC";
+ let Inst{0-5} = opcode; // the 6-bit opcode parameter fills bits 0-5
+ let OutOperandList = OOL;
+ let InOperandList = IOL;
+ let AsmString = asmstr;
+ let Itinerary = itin;
+
+ bits<1> PPC970_First = 0; // PPC970_* fields default to 0; packed into TSFlags below
+ bits<1> PPC970_Single = 0;
+ bits<1> PPC970_Cracked = 0;
+ bits<3> PPC970_Unit = 0;
+
+ /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to
+ /// these must be reflected there! See comments there for what these are.
+ let TSFlags{0} = PPC970_First;
+ let TSFlags{1} = PPC970_Single;
+ let TSFlags{2} = PPC970_Cracked;
+ let TSFlags{5-3} = PPC970_Unit; // 3-bit unit code (see the PPC970_Unit_* classes)
+
+ /// Indicate that the VSX instruction is to use VSX numbering/encoding.
+ /// Since ISA 3.0, there are scalar instructions that use the upper
+ /// half of the VSX register set only. Rather than adding further complexity
+ /// to the register class set, the VSX registers just include the Altivec
+ /// registers and this flag decides the numbering to be used for them.
+ bits<1> UseVSXReg = 0;
+ let TSFlags{6} = UseVSXReg;
+
+ // Fields used for relation models.
+ string BaseName = "";
+
+ // For cases where multiple instruction definitions really represent the
+ // same underlying instruction but with one definition for 64-bit arguments
+ // and one for 32-bit arguments, this bit breaks the degeneracy between
+ // the two forms and allows TableGen to generate mapping tables.
+ bit Interpretation64Bit = 0;
+}
+
+class PPC970_DGroup_First { bits<1> PPC970_First = 1; } // feeds TSFlags{0} in I/I2
+class PPC970_DGroup_Single { bits<1> PPC970_Single = 1; } // feeds TSFlags{1}
+class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; } // feeds TSFlags{2}
+class PPC970_MicroCode; // declaration only; sets no field here
+
+class PPC970_Unit_Pseudo { bits<3> PPC970_Unit = 0; } // same value as the default in I
+class PPC970_Unit_FXU { bits<3> PPC970_Unit = 1; } // unit codes 1-7 feed TSFlags{5-3}
+class PPC970_Unit_LSU { bits<3> PPC970_Unit = 2; }
+class PPC970_Unit_FPU { bits<3> PPC970_Unit = 3; }
+class PPC970_Unit_CRU { bits<3> PPC970_Unit = 4; }
+class PPC970_Unit_VALU { bits<3> PPC970_Unit = 5; }
+class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; }
+class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; }
+
+class UseVSXReg { bits<1> UseVSXReg = 1; } // feeds TSFlags{6} in class I
+
+// Two joined instructions; used to emit two adjacent instructions as one.
+// The itinerary from the first instruction is used for scheduling and
+// classification.
+class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : Instruction {
+ field bits<64> Inst; // 64-bit encoding: one 32-bit half per joined instruction
+ field bits<64> SoftFail = 0;
+ let Size = 8; // twice the Size used by class I
+
+ bit PPC64 = 0; // Default value, override with isPPC64
+
+ let Namespace = "PPC";
+ let Inst{0-5} = opcode1; // opcode of the first instruction
+ let Inst{32-37} = opcode2; // opcode of the second instruction, 32 bits further on
+ let OutOperandList = OOL;
+ let InOperandList = IOL;
+ let AsmString = asmstr;
+ let Itinerary = itin;
+
+ bits<1> PPC970_First = 0; // PPC970_* fields default to 0; packed into TSFlags below
+ bits<1> PPC970_Single = 0;
+ bits<1> PPC970_Cracked = 0;
+ bits<3> PPC970_Unit = 0;
+
+ /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to
+ /// these must be reflected there! See comments there for what these are.
+ let TSFlags{0} = PPC970_First;
+ let TSFlags{1} = PPC970_Single;
+ let TSFlags{2} = PPC970_Cracked;
+ let TSFlags{5-3} = PPC970_Unit; // unlike class I, no UseVSXReg / TSFlags{6} here
+
+ // Fields used for relation models.
+ string BaseName = "";
+ bit Interpretation64Bit = 0;
+}
+
+// 1.7.1 I-Form
+class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ let Pattern = pattern;
+ bits<24> LI; // 24-bit operand field, encoded in bits 6-29
+
+ let Inst{6-29} = LI;
+ let Inst{30} = aa; // aa and lk are fixed per definition through class parameters
+ let Inst{31} = lk;
+}
+
+// 1.7.2 B-Form
+class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
+ : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+ bits<7> BIBO; // 2 bits of BI and 5 bits of BO.
+ bits<3> CR;
+ bits<14> BD;
+
+ bits<5> BI; // assembled below from BIBO{5-6} and the 3-bit CR field
+ let BI{0-1} = BIBO{5-6};
+ let BI{2-4} = CR{0-2};
+
+ let Inst{6-10} = BIBO{4-0}; // the BO part of BIBO
+ let Inst{11-15} = BI;
+ let Inst{16-29} = BD;
+ let Inst{30} = aa;
+ let Inst{31} = lk;
+}
+
+class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
+ string asmstr>
+ : BForm<opcode, aa, lk, OOL, IOL, asmstr> {
+ let BIBO{4-0} = bo; // BO fixed by the class parameter
+ let BIBO{6-5} = 0; // together with CR = 0 below, every bit of BI becomes 0
+ let CR = 0;
+}
+
+class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
+ dag OOL, dag IOL, string asmstr>
+ : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+ bits<14> BD; // the only operand-supplied field; bo and bi are class parameters
+
+ let Inst{6-10} = bo;
+ let Inst{11-15} = bi;
+ let Inst{16-29} = BD;
+ let Inst{30} = aa;
+ let Inst{31} = lk;
+}
+
+class BForm_3<bits<6> opcode, bit aa, bit lk,
+ dag OOL, dag IOL, string asmstr>
+ : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+ bits<5> BO; // BO, BI and BD are all operand fields in this variant
+ bits<5> BI;
+ bits<14> BD;
+
+ let Inst{6-10} = BO;
+ let Inst{11-15} = BI;
+ let Inst{16-29} = BD;
+ let Inst{30} = aa;
+ let Inst{31} = lk;
+}
+
+class BForm_3_at<bits<6> opcode, bit aa, bit lk,
+ dag OOL, dag IOL, string asmstr>
+ : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+ bits<5> BO;
+ bits<2> at;
+ bits<5> BI;
+ bits<14> BD;
+
+ let Inst{6-8} = BO{4-2}; // only BO{4-2} is encoded; BO{1-0} is not used
+ let Inst{9-10} = at; // the 2-bit 'at' field takes bits 9-10 instead
+ let Inst{11-15} = BI;
+ let Inst{16-29} = BD;
+ let Inst{30} = aa;
+ let Inst{31} = lk;
+}
+
+class BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk,
+ dag OOL, dag IOL, string asmstr>
+ : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+ bits<5> BI; // BI stays an operand field
+ bits<14> BD;
+
+ let Inst{6-10} = bo; // BO fixed by the class parameter
+ let Inst{11-15} = BI;
+ let Inst{16-29} = BD;
+ let Inst{30} = aa;
+ let Inst{31} = lk;
+}
+
+// 1.7.3 SC-Form
+class SCForm<bits<6> opcode, bits<1> xo,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<7> LEV; // 7-bit operand field
+
+ let Pattern = pattern;
+
+ let Inst{20-26} = LEV;
+ let Inst{30} = xo; // bits 6-19, 27-29 and 31 are not assigned by this class
+}
+
+// 1.7.4 D-Form
+class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A; // A, B, C are declared in encoding order (bits 6-10, 11-15, 16-31)
+ bits<5> B;
+ bits<16> C;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = A;
+ let Inst{11-15} = B;
+ let Inst{16-31} = C;
+}
+
+class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<21> Addr; // one 21-bit field: 5-bit base register above a 16-bit displacement
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = A;
+ let Inst{11-15} = Addr{20-16}; // Base Reg
+ let Inst{16-31} = Addr{15-0}; // Displacement
+}
+
+class DForm_1a<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<16> C; // declared before B (DForm_base declares B first); bit layout is the same
+ bits<5> B;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = A;
+ let Inst{11-15} = B;
+ let Inst{16-31} = C;
+}
+
+
+class DForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : DForm_base<opcode, OOL, IOL, asmstr, itin, pattern> {
+
+ // Even though ADDICo does not really have an RC bit, provide
+ // the declaration of one here so that isDOT has something to set.
+ bit RC = 0; // not wired into Inst: DForm_base already assigns bits 6-31
+}
+
+class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<16> B; // here B is the 16-bit field (DForm_base calls that field C)
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = A;
+ let Inst{11-15} = 0; // bits 11-15 are hard-wired to 0 instead of taking an operand
+ let Inst{16-31} = B;
+}
+
+class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> B; // B is declared before A; the encoding still puts A at bits 6-10
+ bits<5> A;
+ bits<16> C;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = A;
+ let Inst{11-15} = B;
+ let Inst{16-31} = C;
+}
+
+class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : DForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+ let A = 0; // both DForm_1 fields zeroed, so bits 6-31 are all 0
+ let Addr = 0;
+}
+
+class DForm_4_fixedreg_zero<bits<6> opcode, bits<5> R, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : DForm_4<opcode, OOL, IOL, asmstr, itin, pattern> {
+ let A = R; // both 5-bit fields take the fixed value R
+ let B = R;
+ let C = 0; // 16-bit field fixed to 0
+}
+
+class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
+ dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<21> Addr;
+
+ let Pattern = pattern;
+ bits<24> LI;
+
+ let Inst{6-29} = LI; // first half: same layout as IForm (LI, aa, lk)
+ let Inst{30} = aa;
+ let Inst{31} = lk;
+
+ let Inst{38-42} = A; // second half: DForm_1 layout shifted up by 32 bits
+ let Inst{43-47} = Addr{20-16}; // Base Reg
+ let Inst{48-63} = Addr{15-0}; // Displacement
+}
+
+// This is used to emit BL8+NOP.
+class IForm_and_DForm_4_zero<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
+ dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : IForm_and_DForm_1<opcode1, aa, lk, opcode2,
+ OOL, IOL, asmstr, itin, pattern> {
+ let A = 0; // zero every operand field of the second (D-form) half
+ let Addr = 0;
+}
+
+class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<1> L;
+ bits<5> RA;
+ bits<16> I;
+
+ let Inst{6-8} = BF;
+ let Inst{9} = 0; // fixed 0 bit between BF and L
+ let Inst{10} = L;
+ let Inst{11-15} = RA;
+ let Inst{16-31} = I; // 16-bit field named I (distinct from the base class I)
+}
+
+class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : DForm_5<opcode, OOL, IOL, asmstr, itin> {
+ let L = PPC64; // L is no longer an operand: it follows the PPC64 bit from class I
+}
+
+class DForm_6<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : DForm_5<opcode, OOL, IOL, asmstr, itin>; // adds nothing to DForm_5
+
+class DForm_6_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : DForm_6<opcode, OOL, IOL, asmstr, itin> {
+ let L = PPC64; // L is no longer an operand: it follows the PPC64 bit from class I
+}
+
+
+// 1.7.5 DS-Form
+class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RST;
+ bits<19> DS_RA; // one 19-bit field: 5-bit register number above a 14-bit displacement
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RST;
+ let Inst{11-15} = DS_RA{18-14}; // Register #
+ let Inst{16-29} = DS_RA{13-0}; // Displacement.
+ let Inst{30-31} = xo; // the 2-bit xo parameter fills the last two bits
+}
+
+// DQ-Form: [PO T RA DQ TX XO] or [PO S RA DQ SX XO]
+class DQ_RD6_RS5_DQ12<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT; // 6-bit field, split below: low five bits at 6-10, top bit at 28
+ bits<17> DS_RA; // one 17-bit field: 5-bit register number above a 12-bit displacement
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = DS_RA{16-12}; // Register #
+ let Inst{16-27} = DS_RA{11-0}; // Displacement.
+ let Inst{28} = XT{5};
+ let Inst{29-31} = xo;
+}
+
+// 1.7.6 X-Form
+class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RST; // three 5-bit fields, encoded at bits 6-10, 11-15 and 16-20
+ bits<5> A;
+ bits<5> B;
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RST;
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo; // 10-bit xo parameter
+ let Inst{31} = RC;
+}
+
+class XForm_tlb<bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin> : XForm_base_r3xo<31, xo, OOL, IOL, asmstr, itin, []> {
+ let RST = 0; // opcode fixed to 31, empty pattern, and the RST field zeroed
+}
+
+class XForm_attn<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ let Inst{21-30} = xo; // only xo is set; bits 6-20 and 31 are not assigned here
+}
+
+// This is the same as XForm_base_r3xo, but the first two operands are swapped
+// when code is emitted.
+class XForm_base_r3xo_swapped
+ <bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A; // declared before RST, the reverse of XForm_base_r3xo
+ bits<5> RST;
+ bits<5> B;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RST; // bit positions are unchanged from XForm_base_r3xo
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo;
+ let Inst{31} = RC;
+}
+
+
+class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>; // adds nothing to the base
+
+class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let RST = 0; // bits 6-10 fixed to 0; A and B remain operand fields
+}
+
+class XForm_rs<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let A = 0; // only RST remains an operand field; bits 11-20 are 0
+ let B = 0;
+}
+
+class XForm_tlbws<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RST;
+ bits<5> A;
+ bits<1> WS;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RST;
+ let Inst{11-15} = A;
+ let Inst{20} = WS; // single WS bit; bits 16-19 are not assigned by this class
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+ let Pattern = pattern; // the swapped base takes no pattern, so it is set here
+}
+
+class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>; // adds nothing to the base
+
+class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+ let Pattern = pattern; // same body as XForm_6: swapped base plus the pattern
+}
+
+class XForm_11<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+ let B = 0; // bits 16-20 fixed to 0; A and RST remain operand fields
+ let Pattern = pattern;
+}
+
+class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<1> L;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Inst{6-8} = BF; // bits 6-15 use the same BF / 0 / L / RA layout as DForm_5
+ let Inst{9} = 0;
+ let Inst{10} = L;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_icbt<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<4> CT; // 4-bit field at bits 7-10; bit 6 in front of it is fixed to 0
+ bits<5> RA;
+ bits<5> RB;
+
+ let Inst{6} = 0;
+ let Inst{7-10} = CT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_sr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RS;
+ bits<4> SR; // 4-bit field at bits 12-15
+
+ let Inst{6-10} = RS;
+ let Inst{12-15} = SR; // bits 11, 16-20 and 31 are not assigned by this class
+ let Inst{21-30} = xo;
+}
+
+class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> MO; // the only operand field
+
+ let Inst{6-10} = MO; // bits 11-20 and 31 are not assigned by this class
+ let Inst{21-30} = xo;
+}
+
+class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RS;
+ bits<5> RB;
+
+ let Inst{6-10} = RS;
+ let Inst{16-20} = RB; // bits 11-15 and 31 are not assigned by this class
+ let Inst{21-30} = xo;
+}
+
+class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RS;
+ bits<1> L;
+
+ let Inst{6-10} = RS;
+ let Inst{15} = L; // single L bit; bits 11-14, 16-20 and 31 are not assigned here
+ let Inst{21-30} = xo;
+}
+
+class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
+ let L = PPC64; // L is no longer an operand: it follows the PPC64 bit from class I
+}
+
+class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<5> FRA;
+ bits<5> FRB;
+
+ let Inst{6-8} = BF;
+ let Inst{9-10} = 0; // no L bit in this form: both bits after BF are fixed to 0
+ let Inst{11-15} = FRA;
+ let Inst{16-20} = FRB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+// Used for QPX
+class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> FRT; // three 5-bit fields at bits 6-10, 11-15 and 16-20
+ bits<5> FRA;
+ bits<5> FRB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = FRT;
+ let Inst{11-15} = FRA;
+ let Inst{16-20} = FRB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0; // last bit fixed to 0; this class declares no RC field
+}
+
+class XForm_19<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_18<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let FRA = 0; // bits 11-15 fixed to 0; FRT and FRB remain operand fields
+}
+
+class XForm_20<bits<6> opcode, bits<6> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> FRT;
+ bits<5> FRA;
+ bits<5> FRB;
+ bits<4> tttt; // 4-bit operand placed at bits 21-24
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = FRT;
+ let Inst{11-15} = FRA;
+ let Inst{16-20} = FRB;
+ let Inst{21-24} = tttt;
+ let Inst{25-30} = xo; // xo is only 6 bits in this form (bits 25-30)
+ let Inst{31} = 0;
+}
+
+class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ let Pattern = pattern;
+ let Inst{6-10} = 31; // no operand fields: bits 6-20 are the constants 31, 0, 0
+ let Inst{11-15} = 0;
+ let Inst{16-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<2> L; // the only operand field: 2 bits at positions 9-10
+
+ let Pattern = pattern;
+ let Inst{6-8} = 0;
+ let Inst{9-10} = L;
+ let Inst{11-15} = 0;
+ let Inst{16-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_24_eieio<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_24_sync<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let L = 0; // XForm_24_sync with its only operand field fixed to 0
+}
+
+class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+} // empty body: adds nothing to the base
+
+class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let A = 0; // bits 11-15 fixed to 0; RST and B remain operand fields
+}
+
+class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+} // empty body: adds nothing to the base
+
+// This is used for MFFS, MTFSB0, MTFSB1. 42 is arbitrary; this series of
+// numbers presumably relates to some document, but I haven't found it.
+class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let Pattern = pattern; // restates what XForm_base_r3xo already sets
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RST; // restates the base class assignment
+ let Inst{11-20} = 0; // the one real change: the A and B bit positions are zeroed
+ let Inst{21-30} = xo; // restates the base class assignment
+ let Inst{31} = RC; // restates the base class assignment
+}
+class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let Pattern = pattern; // restates what XForm_base_r3xo already sets
+ bits<5> FM;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = FM; // replaces the base class's RST at bits 6-10
+ let Inst{11-20} = 0; // the A and B bit positions are zeroed
+ let Inst{21-30} = xo; // restates the base class assignment
+ let Inst{31} = RC; // restates the base class assignment
+}
+
+class XForm_0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let RST = 0; // no operand fields left: bits 6-20 are all 0
+ let A = 0;
+ let B = 0;
+}
+
+class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let RST = 0; // only B (bits 16-20) remains an operand field
+ let A = 0;
+}
+
+class XForm_htm0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // NOTE(review): 'pattern' is never assigned to Pattern here — confirm
+ bit R; // single-bit operand at bit 10
+
+ bit RC = 1; // defaults to 1 in this class
+
+ let Inst{6-9} = 0;
+ let Inst{10} = R;
+ let Inst{11-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = RC;
+}
+
+class XForm_htm1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // HTM X-form with a single 1-bit operand A and the record bit always set.
+ bit A; // operand encoded in Inst{6}
+ let Pattern = pattern; // forward the selection pattern, as the other format classes taking 'pattern' do
+ bit RC = 1; // record bit is hard-wired on for this form
+ // All bits other than A, xo and RC are fixed to zero.
+ let Inst{6} = A;
+ let Inst{7-20} = 0;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = RC;
+}
+
+class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // HTM X-form with a single 1-bit operand L.
+ bit L; // operand encoded in Inst{10}
+ let Pattern = pattern; // forward the selection pattern, as the other format classes taking 'pattern' do
+ bit RC = 0; // set by isDOT
+ // Note: Inst{6} is intentionally left unassigned by this class; only Inst{7-9} are forced to zero.
+ let Inst{7-9} = 0;
+ let Inst{10} = L;
+ let Inst{11-20} = 0;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = RC; // record bit
+}
+
+class XForm_htm3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // HTM X-form with a single 3-bit operand BF.
+ bits<3> BF; // operand encoded in Inst{6-8}
+ let Pattern = pattern; // forward the selection pattern, as the other format classes taking 'pattern' do
+ bit RC = 0; // record bit defaults to off
+ // All bits other than BF, xo and RC are fixed to zero.
+ let Inst{6-8} = BF;
+ let Inst{9-20} = 0;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = RC;
+}
+
+// [PO BF / L RA RB XO /] - 3-bit BF, one zero bit, 1-bit L, then two 5-bit registers.
+class X_BF3_L1_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF; // Inst{6-8}
+ bits<1> L; // Inst{10}
+ bits<5> RA; // Inst{11-15}
+ bits<5> RB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9} = 0; // unused bit between BF and L
+ let Inst{10} = L;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0; // no record bit in this form
+}
+
+// Same as XForm_17 but with GPRs and the new naming convention: [PO BF // RA RB XO /].
+class X_BF3_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF; // Inst{6-8}
+ bits<5> RA; // Inst{11-15}
+ bits<5> RB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9-10} = 0; // two unused bits after BF
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0; // no record bit in this form
+}
+
+// e.g. [PO VRT XO VRB XO /] or [PO VRT XO VRB XO RO]
+class X_RD5_XO5_RS5<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let A = xo2; // the base class's A field carries the secondary 5-bit opcode instead of an operand
+}
+
+class X_BF3_DCMX7_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // X-form: 3-bit BF, 7-bit DCMX, 5-bit VB.
+ bits<3> BF; // Inst{6-8}
+ bits<7> DCMX; // Inst{9-15}
+ bits<5> VB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9-15} = DCMX;
+ let Inst{16-20} = VB;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0; // last bit is always zero in this form
+}
+
+class X_RD6_IMM8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // X-form: 6-bit split target XT plus an 8-bit immediate.
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<8> IMM8; // Inst{13-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-12} = 0; // unused bits
+ let Inst{13-20} = IMM8;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = XT{5};
+}
+
+// XForm_base_r3xo for instructions such as P9 atomics where we don't want
+// to specify an SDAG pattern for matching: the pattern list is always empty.
+class X_RD5_RS5_IM5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, []> {
+}
+
+class X_BF3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : XForm_17<opcode, xo, OOL, IOL, asmstr, itin> { // XForm_17 with both FRA and FRB fixed to zero.
+ let FRA = 0; // FRA/FRB are fields of XForm_17 (declared outside this chunk)
+ let FRB = 0;
+}
+
+// [PO /// L RA RB XO /] - XForm_16 with the BF field zeroed.
+class X_L1_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
+ let BF = 0; // BF is a field of XForm_16 (declared outside this chunk)
+ let Pattern = pattern; // XForm_16 is not given the pattern, so it is set here
+
+ bit RC = 0; // record bit, off by default
+ let Inst{31} = RC;
+}
+
+// XX*-Form (VSX). XX1: one 6-bit split register XT plus two 5-bit fields A and B.
+class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<5> A; // Inst{11-15}
+ bits<5> B; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = XT{5};
+}
+
+class XX1_RS6_RD5_XO<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XX1Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> { // XX1Form with only XT and A as operands.
+ let B = 0; // Inst{16-20} is fixed to zero
+}
+
+class XX2Form<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX2: two 6-bit split registers (XT, XB) and a 9-bit extended opcode.
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = 0; // unused field
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX2Form_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX2 variant: a 3-bit CR field replaces the target register.
+ bits<3> CR; // Inst{6-8}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = CR;
+ let Inst{9-15} = 0; // unused bits
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = 0; // no split target bit in this variant
+}
+
+class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX2 variant with an extra 2-bit field D.
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+ bits<2> D; // Inst{14-15}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-13} = 0; // unused bits ahead of D
+ let Inst{14-15} = D;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX2_RD6_UIM5_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX2 variant with a 5-bit field UIM5 in Inst{11-15}.
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+ bits<5> UIM5; // Inst{11-15}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = UIM5;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+// [PO T XO B XO BX /] - 5-bit target, secondary opcode xo2, 6-bit split source.
+class XX2_RD5_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT; // Inst{6-10}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = xo2; // secondary opcode
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = 0; // target is only 5 bits wide, so there is no TX bit
+}
+
+// [PO T XO B XO BX TX] - 6-bit split target, secondary opcode xo2, 6-bit split source.
+class XX2_RD6_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = xo2; // secondary opcode
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX2_BF3_DCMX7_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX2 layout: 3-bit BF, contiguous 7-bit DCMX, 6-bit split XB.
+ bits<3> BF; // Inst{6-8}
+ bits<7> DCMX; // Inst{9-15}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9-15} = DCMX;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = 0; // last bit is always zero in this form
+}
+
+class XX2_RD6_DCMX7_RS6<bits<6> opcode, bits<4> xo1, bits<3> xo2,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX2 layout whose 7-bit DCMX is scattered around two opcode pieces (xo1, xo2).
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<7> DCMX; // split three ways: DCMX{4-0} in Inst{11-15}, DCMX{5} in Inst{25}, DCMX{6} in Inst{29}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = DCMX{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-24} = xo1; // first opcode piece
+ let Inst{25} = DCMX{5};
+ let Inst{26-28} = xo2; // second opcode piece
+ let Inst{29} = DCMX{6};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX3: three 6-bit split registers and an 8-bit extended opcode.
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<6> XA; // split: XA{4-0} in Inst{11-15}, XA{5} in Inst{29}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-28} = xo;
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX3Form_Zero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> { // XX3Form where both sources are encoded as the target register.
+ let XA = XT; // source A reuses the XT encoding
+ let XB = XT; // source B reuses the XT encoding
+}
+
+class XX3Form_SetZero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> { // Applies the same two overrides as XX3Form_Zero (XA = XB = XT).
+ let XB = XT; // source B reuses the XT encoding
+ let XA = XT; // source A reuses the XT encoding
+}
+
+class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX3 variant: a 3-bit CR field replaces the target register.
+ bits<3> CR; // Inst{6-8}
+ bits<6> XA; // split: XA{4-0} in Inst{11-15}, XA{5} in Inst{29}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = CR;
+ let Inst{9-10} = 0; // unused bits after CR
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-28} = xo;
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = 0; // no split target bit in this variant
+}
+
+class XX3Form_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX3 variant with a 2-bit field D and a shorter 5-bit extended opcode.
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<6> XA; // split: XA{4-0} in Inst{11-15}, XA{5} in Inst{29}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+ bits<2> D; // Inst{22-23}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21} = 0; // fixed zero bit ahead of D
+ let Inst{22-23} = D;
+ let Inst{24-28} = xo;
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX3 variant with a record bit in Inst{21} and a 7-bit extended opcode.
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<6> XA; // split: XA{4-0} in Inst{11-15}, XA{5} in Inst{29}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21} = RC; // record bit sits before the opcode here, not in Inst{31}
+ let Inst{22-28} = xo;
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX4Form<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XX4: four 6-bit split registers and a 2-bit extended opcode.
+ bits<6> XT; // split: XT{4-0} in Inst{6-10}, XT{5} in Inst{31}
+ bits<6> XA; // split: XA{4-0} in Inst{11-15}, XA{5} in Inst{29}
+ bits<6> XB; // split: XB{4-0} in Inst{16-20}, XB{5} in Inst{30}
+ bits<6> XC; // split: XC{4-0} in Inst{21-25}, XC{5} in Inst{28}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-25} = XC{4-0};
+ let Inst{26-27} = xo;
+ let Inst{28} = XC{5};
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+// DCB_Form - Form X instruction, used for dcb* instructions. Primary opcode is fixed at 31.
+class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<31, OOL, IOL, asmstr, itin> {
+ bits<5> A; // Inst{11-15}
+ bits<5> B; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = immfield; // constant chosen per instruction via the class argument, not an operand
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class DCB_Form_hint<bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<31, OOL, IOL, asmstr, itin> { // Like DCB_Form, but Inst{6-10} is an operand (TH) rather than a class constant.
+ bits<5> TH; // Inst{6-10}
+ bits<5> A; // Inst{11-15}
+ bits<5> B; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = TH;
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+// DSS_Form - Form X instruction, used for altivec dss* instructions. Primary opcode is fixed at 31.
+class DSS_Form<bits<1> T, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<31, OOL, IOL, asmstr, itin> {
+ bits<2> STRM; // Inst{9-10}
+ bits<5> A; // Inst{11-15}
+ bits<5> B; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6} = T; // per-instruction constant supplied as a class argument
+ let Inst{7-8} = 0; // unused bits
+ let Inst{9-10} = STRM;
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+// 1.7.7 XL-Form. XLForm_1: three 5-bit fields CRD, CRA, CRB.
+class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> CRD; // Inst{6-10}
+ bits<5> CRA; // Inst{11-15}
+ bits<5> CRB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = CRD;
+ let Inst{11-15} = CRA;
+ let Inst{16-20} = CRB;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class XLForm_1_np<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XLForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { // XLForm_1 with no operands: all three fields are zero.
+ let CRD = 0;
+ let CRA = 0;
+ let CRB = 0;
+}
+
+class XLForm_1_gen<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XLForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { // XLForm_1 re-exposed with operands named RT and RB.
+ bits<5> RT; // routed into the CRD slot (Inst{6-10})
+ bits<5> RB; // routed into the CRB slot (Inst{16-20})
+
+ let CRD = RT;
+ let CRA = 0; // middle field unused
+ let CRB = RB;
+}
+
+class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XL-form with one operand replicated into all three 5-bit fields.
+ bits<5> CRD; // written to Inst{6-10}, Inst{11-15} and Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = CRD;
+ let Inst{11-15} = CRD;
+ let Inst{16-20} = CRD;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XL-form with BO, BI and BH fields; 'lk' is a per-instruction constant.
+ bits<5> BO; // Inst{6-10}
+ bits<5> BI; // Inst{11-15}
+ bits<2> BH; // Inst{19-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = BO;
+ let Inst{11-15} = BI;
+ let Inst{16-18} = 0; // unused bits ahead of BH
+ let Inst{19-20} = BH;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = lk;
+}
+
+class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> { // XLForm_2 whose BO/BI are assembled from a packed BIBO value and a CR field.
+ bits<7> BIBO; // 2 bits of BI and 5 bits of BO.
+ bits<3> CR; // supplies the remaining three bits of BI
+
+ let BO = BIBO{4-0}; // low five bits of BIBO
+ let BI{0-1} = BIBO{5-6}; // top two bits of BIBO
+ let BI{2-4} = CR{0-2};
+ let BH = 0;
+}
+
+class XLForm_2_br2<bits<6> opcode, bits<10> xo, bits<5> bo, bit lk,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> { // XLForm_2 with BO fixed by the class argument; BI remains an operand.
+ let BO = bo;
+ let BH = 0;
+}
+
+class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo, bits<5> bi, bit lk,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> { // XLForm_2 with BO and BI both fixed by class arguments (no encoded operands left).
+ let BO = bo;
+ let BI = bi;
+ let BH = 0;
+}
+
+class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XL-form with two 3-bit fields BF and BFA; takes no pattern argument.
+ bits<3> BF; // Inst{6-8}
+ bits<3> BFA; // Inst{11-13}
+
+ let Inst{6-8} = BF;
+ let Inst{9-10} = 0; // unused bits after BF
+ let Inst{11-13} = BFA;
+ let Inst{14-15} = 0; // unused bits after BFA
+ let Inst{16-20} = 0; // third field unused
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class XLForm_4<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> { // Form with a 3-bit BF, a 1-bit W and a 4-bit U field, plus a record bit.
+ bits<3> BF; // Inst{6-8}
+ bit W; // Inst{15}
+ bits<4> U; // Inst{16-19}
+
+ bit RC = 0; // record bit, off by default
+
+ let Inst{6-8} = BF;
+ let Inst{9-10} = 0; // unused bits after BF
+ let Inst{11-14} = 0; // unused bits ahead of W
+ let Inst{15} = W;
+ let Inst{16-19} = U;
+ let Inst{20} = 0; // unused bit after U
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = RC;
+}
+
+class XLForm_S<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XL-form whose only operand is the single bit S.
+ bits<1> S; // Inst{20}
+
+ let Pattern = pattern;
+
+ let Inst{6-19} = 0; // everything ahead of S is zero
+ let Inst{20} = S;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class XLForm_2_and_DSForm_1<bits<6> opcode1, bits<10> xo1, bit lk,
+ bits<6> opcode2, bits<2> xo2,
+ dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> { // Two-instruction encoding: an XLForm_2 layout in Inst{6-31}, then a DS-style layout in Inst{38-63}.
+ bits<5> BO; // first word, Inst{6-10}
+ bits<5> BI; // first word, Inst{11-15}
+ bits<2> BH; // first word, Inst{19-20}
+
+ bits<5> RST; // second word, Inst{38-42}
+ bits<19> DS_RA; // packed operand: 5-bit register number above a 14-bit displacement
+
+ let Pattern = pattern;
+ // First word: same field placement as XLForm_2.
+ let Inst{6-10} = BO;
+ let Inst{11-15} = BI;
+ let Inst{16-18} = 0;
+ let Inst{19-20} = BH;
+ let Inst{21-30} = xo1;
+ let Inst{31} = lk;
+ // Second word; its opcode bits are presumably filled in by I2 from opcode2 (I2 is defined outside this chunk).
+ let Inst{38-42} = RST;
+ let Inst{43-47} = DS_RA{18-14}; // Register #
+ let Inst{48-61} = DS_RA{13-0}; // Displacement.
+ let Inst{62-63} = xo2;
+}
+
+class XLForm_2_ext_and_DSForm_1<bits<6> opcode1, bits<10> xo1,
+ bits<5> bo, bits<5> bi, bit lk,
+ bits<6> opcode2, bits<2> xo2,
+ dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XLForm_2_and_DSForm_1<opcode1, xo1, lk, opcode2, xo2,
+ OOL, IOL, asmstr, itin, pattern> { // Same two-word encoding with BO and BI fixed by class arguments.
+ let BO = bo;
+ let BI = bi;
+ let BH = 0;
+}
+
+// 1.7.8 XFX-Form. XFXForm_1: 5-bit RT plus a 10-bit SPR whose two 5-bit halves are swapped in the encoding.
+class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT; // Inst{6-10}
+ bits<10> SPR; // low half SPR{4-0} goes to Inst{11-15}, high half SPR{9-5} to Inst{16-20}
+
+ let Inst{6-10} = RT;
+ let Inst{11} = SPR{4}; // low half, written out bit by bit
+ let Inst{12} = SPR{3};
+ let Inst{13} = SPR{2};
+ let Inst{14} = SPR{1};
+ let Inst{15} = SPR{0};
+ let Inst{16} = SPR{9}; // high half
+ let Inst{17} = SPR{8};
+ let Inst{18} = SPR{7};
+ let Inst{19} = SPR{6};
+ let Inst{20} = SPR{5};
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+ : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin> { // XFXForm_1 with the SPR number fixed by a class argument.
+ let SPR = spr;
+}
+
+class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XFX-form with only a 5-bit RT operand.
+ bits<5> RT; // Inst{6-10}
+
+ let Inst{6-10} = RT;
+ let Inst{11-20} = 0; // the 10-bit field after RT is unused
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class XFXForm_3p<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XFX-form with RT and a contiguous 10-bit Entry field; accepts a pattern.
+ bits<5> RT; // Inst{6-10}
+ bits<10> Entry; // Inst{11-20}, stored without the half-swap used by XFXForm_1
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-20} = Entry;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XFX-form with an 8-bit FXM mask and a 5-bit rS; Inst{11} is 0.
+ bits<8> FXM; // Inst{12-19}; declared before rS
+ bits<5> rS; // Inst{6-10}
+
+ let Inst{6-10} = rS;
+ let Inst{11} = 0; // distinguishes this form from XFXForm_5a, which sets the bit
+ let Inst{12-19} = FXM;
+ let Inst{20} = 0;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> { // Same layout as XFXForm_5 but with Inst{11} set to 1 and ST declared first.
+ bits<5> ST; // Inst{6-10}
+ bits<8> FXM; // Inst{12-19}
+
+ let Inst{6-10} = ST;
+ let Inst{11} = 1; // the bit XFXForm_5 leaves at 0
+ let Inst{12-19} = FXM;
+ let Inst{20} = 0;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = 0;
+}
+
+class XFXForm_7<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin>; // Pure alias: identical encoding to XFXForm_1 under a different name.
+
+class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+ : XFXForm_7<opcode, xo, OOL, IOL, asmstr, itin> { // XFXForm_7 with the SPR number fixed by a class argument.
+ let SPR = spr;
+}
+
+// XFL-Form - MTFSF
+// This is probably section 1.7.9, but the reference document that uses this
+// numbering scheme was not available to the original author.
+class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag>pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<8> FM; // Inst{7-14}
+ bits<5> rT; // Inst{16-20}
+
+ bit RC = 0; // set by isDOT
+ let Pattern = pattern;
+
+ let Inst{6} = 0; // fixed zero; XFLForm_1 makes this bit an operand (L)
+ let Inst{7-14} = FM;
+ let Inst{15} = 0; // fixed zero; XFLForm_1 makes this bit an operand (W)
+ let Inst{16-20} = rT;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = RC; // record bit
+}
+
+class XFLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag>pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // XFL layout where the two bits XFLForm zeroes (Inst{6}, Inst{15}) are operands L and W.
+ bit L; // Inst{6}
+ bits<8> FLM; // Inst{7-14}
+ bit W; // Inst{15}
+ bits<5> FRB; // Inst{16-20}
+
+ bit RC = 0; // set by isDOT
+ let Pattern = pattern;
+
+ let Inst{6} = L;
+ let Inst{7-14} = FLM;
+ let Inst{15} = W;
+ let Inst{16-20} = FRB;
+ let Inst{21-30} = xo; // extended opcode
+ let Inst{31} = RC; // record bit
+}
+
+// 1.7.10 XS-Form - SRADI. The 6-bit shift amount is split around a 9-bit extended opcode.
+class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A; // Inst{11-15}; declared before RS
+ bits<5> RS; // Inst{6-10}
+ bits<6> SH; // split: SH{4..0} in Inst{16-20}, SH{5} in Inst{30}
+
+ bit RC = 0; // set by isDOT
+ let Pattern = pattern;
+
+ let Inst{6-10} = RS;
+ let Inst{11-15} = A;
+ let Inst{16-20} = SH{4,3,2,1,0};
+ let Inst{21-29} = xo;
+ let Inst{30} = SH{5};
+ let Inst{31} = RC; // record bit
+}
+
+// 1.7.11 XO-Form. Three 5-bit registers, an 'oe' bit supplied per instruction, and a 9-bit extended opcode.
+class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT; // Inst{6-10}
+ bits<5> RA; // Inst{11-15}
+ bits<5> RB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21} = oe; // class argument, not an operand
+ let Inst{22-30} = xo;
+ let Inst{31} = RC; // record bit
+}
+
+class XOForm_3<bits<6> opcode, bits<9> xo, bit oe,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XOForm_1<opcode, xo, oe, OOL, IOL, asmstr, itin, pattern> { // XOForm_1 with only RT and RA as operands.
+ let RB = 0; // Inst{16-20} is fixed to zero
+}
+
+// 1.7.12 A-Form. Four 5-bit registers and a 5-bit extended opcode.
+class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> FRT; // Inst{6-10}
+ bits<5> FRA; // Inst{11-15}
+ bits<5> FRC; // Inst{21-25}; declared before FRB but encoded after it
+ bits<5> FRB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = FRT;
+ let Inst{11-15} = FRA;
+ let Inst{16-20} = FRB;
+ let Inst{21-25} = FRC;
+ let Inst{26-30} = xo;
+ let Inst{31} = RC; // record bit
+}
+
+class AForm_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { // AForm_1 without the FRC operand.
+ let FRC = 0; // Inst{21-25} is fixed to zero
+}
+
+class AForm_3<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { // AForm_1 without the FRB operand.
+ let FRB = 0; // Inst{16-20} is fixed to zero
+}
+
+class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // A-form layout with three registers and a 5-bit COND field; no record bit.
+ bits<5> RT; // Inst{6-10}
+ bits<5> RA; // Inst{11-15}
+ bits<5> RB; // Inst{16-20}
+ bits<5> COND; // Inst{21-25}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-25} = COND;
+ let Inst{26-30} = xo;
+ let Inst{31} = 0;
+}
+
+// Used for QPX: AForm_1 with only FRT and FRB left as operands.
+class AForm_4a<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let FRA = 0; // Inst{11-15} fixed to zero
+ let FRC = 0; // Inst{21-25} fixed to zero
+}
+
+// 1.7.13 M-Form. Three 5-bit registers plus 5-bit MB and ME fields; no extended opcode.
+class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RA; // Inst{11-15}; declared before RS
+ bits<5> RS; // Inst{6-10}
+ bits<5> RB; // Inst{16-20}
+ bits<5> MB; // Inst{21-25}
+ bits<5> ME; // Inst{26-30}
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RS;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-25} = MB;
+ let Inst{26-30} = ME;
+ let Inst{31} = RC; // record bit
+}
+
+class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : MForm_1<opcode, OOL, IOL, asmstr, itin, pattern> { // Pure alias: identical encoding to MForm_1 under a different name.
+}
+
+// 1.7.14 MD-Form. Two registers, a split 6-bit shift SH and a rotated 6-bit mask field MBE.
+class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RA; // Inst{11-15}; declared before RS
+ bits<5> RS; // Inst{6-10}
+ bits<6> SH; // split: SH{4..0} in Inst{16-20}, SH{5} in Inst{30}
+ bits<6> MBE; // Inst{21-26}, stored as bits 4..0 followed by bit 5
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RS;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = SH{4,3,2,1,0};
+ let Inst{21-26} = MBE{4,3,2,1,0,5};
+ let Inst{27-29} = xo;
+ let Inst{30} = SH{5};
+ let Inst{31} = RC; // record bit
+}
+
+class MDSForm_1<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // Like MDForm_1 but with a register RB instead of the split SH, and a 4-bit xo.
+ bits<5> RA; // Inst{11-15}; declared before RS
+ bits<5> RS; // Inst{6-10}
+ bits<5> RB; // Inst{16-20}
+ bits<6> MBE; // Inst{21-26}, stored as bits 4..0 followed by bit 5
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RS;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-26} = MBE{4,3,2,1,0,5};
+ let Inst{27-30} = xo;
+ let Inst{31} = RC; // record bit
+}
+
+
+// E-1 VA-Form. Primary opcode is fixed at 4; the 6-bit xo occupies Inst{26-31}.
+
+// VAForm_1 - DACB ordering: fields are declared VD, VA, VC, VB (VC before VB).
+class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD; // Inst{6-10}
+ bits<5> VA; // Inst{11-15}
+ bits<5> VC; // Inst{21-25}
+ bits<5> VB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21-25} = VC;
+ let Inst{26-31} = xo;
+}
+
+// VAForm_1a - DABC ordering: same bit layout as VAForm_1, but VB is declared before VC.
+class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD; // Inst{6-10}
+ bits<5> VA; // Inst{11-15}
+ bits<5> VB; // Inst{16-20}
+ bits<5> VC; // Inst{21-25}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21-25} = VC;
+ let Inst{26-31} = xo;
+}
+
+class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> { // VA-form with a 4-bit SH field in place of the fourth register.
+ bits<5> VD; // Inst{6-10}
+ bits<5> VA; // Inst{11-15}
+ bits<5> VB; // Inst{16-20}
+ bits<4> SH; // Inst{22-25}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21} = 0; // fixed zero bit ahead of SH
+ let Inst{22-25} = SH;
+ let Inst{26-31} = xo;
+}
+
+// E-2 VX-Form. Primary opcode is fixed at 4; the 11-bit xo occupies Inst{21-31}.
+class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD; // Inst{6-10}
+ bits<5> VA; // Inst{11-15}
+ bits<5> VB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
+class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : VXForm_1<xo, OOL, IOL, asmstr, itin, pattern> { // VXForm_1 where both sources are encoded as the destination register.
+ let VA = VD; // source A reuses the VD encoding
+ let VB = VD; // source B reuses the VD encoding
+}
+
+
+class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> { // VX-form with VD and VB only; the middle field is zero.
+ bits<5> VD; // Inst{6-10}
+ bits<5> VB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = 0; // unused field
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
+class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> { // VX-form with VD and a 5-bit immediate; the last register field is zero.
+ bits<5> VD; // Inst{6-10}
+ bits<5> IMM; // Inst{11-15}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = IMM;
+ let Inst{16-20} = 0; // unused field
+ let Inst{21-31} = xo;
+}
+
+/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr.
+class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD; // Inst{6-10}, the only operand
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = 0; // unused field
+ let Inst{16-20} = 0; // unused field
+ let Inst{21-31} = xo;
+}
+
+/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr.
+class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VB; // Inst{16-20}, the only operand
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = 0; // unused field
+ let Inst{11-15} = 0; // unused field
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
+// e.g. [PO VRT EO VRB XO] - the middle 5-bit field holds a secondary opcode 'eo'.
+class VXForm_RD5_XO5_RS5<bits<11> xo, bits<5> eo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> RD; // Inst{6-10}
+ bits<5> VB; // Inst{16-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RD;
+ let Inst{11-15} = eo; // class argument, not an operand
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
+/// VXForm_CR - VX crypto instructions with "VRT, VRA, ST, SIX"
+class VXForm_CR<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD; // Inst{6-10}
+ bits<5> VA; // Inst{11-15}
+ bits<1> ST; // Inst{16}
+ bits<4> SIX; // Inst{17-20}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16} = ST; // ST and SIX together fill the slot a third register would use
+ let Inst{17-20} = SIX;
+ let Inst{21-31} = xo;
+}
+
+/// VXForm_BX - VX crypto instructions with "VRT, VRA, 0" register fields, like vsbox.
+class VXForm_BX<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD; // Inst{6-10}
+ bits<5> VA; // Inst{11-15}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = 0; // unused field
+ let Inst{21-31} = xo;
+}
+
+// E-4 VXR-Form. Like VXForm_1, but Inst{21} is a record bit and xo is 10 bits.
+class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD; // Inst{6-10}
+ bits<5> VA; // Inst{11-15}
+ bits<5> VB; // Inst{16-20}
+ bit RC = 0; // record bit, off by default
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21} = RC; // record bit precedes the opcode in this form
+ let Inst{22-31} = xo;
+}
+
+// VX-Form: [PO VRT EO VRB 1 PS XO] - secondary opcode 'eo', a fixed 1 bit, a PS bit and a 9-bit xo.
+class VX_RD5_EO5_RS5_PS1_XO9<bits<5> eo, bits<9> xo,
+ dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD; // Inst{6-10}
+ bits<5> VB; // Inst{16-20}
+ bit PS; // Inst{22}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = eo; // class argument, not an operand
+ let Inst{16-20} = VB;
+ let Inst{21} = 1; // constant 1 in this form
+ let Inst{22} = PS;
+ let Inst{23-31} = xo;
+}
+
+// VX-Form: [PO VRT VRA VRB 1 PS XO] or [PO VRT VRA VRB 1 / XO] - three registers, fixed 1 bit, PS bit, 9-bit xo.
+class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD; // Inst{6-10}
+ bits<5> VA; // Inst{11-15}
+ bits<5> VB; // Inst{16-20}
+ bit PS; // Inst{22}
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21} = 1; // constant 1 in this form
+ let Inst{22} = PS;
+ let Inst{23-31} = xo;
+}
+
+// Z23-Form (used by QPX). Three 5-bit registers, a 2-bit idx and an 8-bit extended opcode.
+class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> FRT; // Inst{6-10}
+ bits<5> FRA; // Inst{11-15}
+ bits<5> FRB; // Inst{16-20}
+ bits<2> idx; // Inst{21-22}
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = FRT;
+ let Inst{11-15} = FRA;
+ let Inst{16-20} = FRB;
+ let Inst{21-22} = idx;
+ let Inst{23-30} = xo;
+ let Inst{31} = RC; // record bit
+}
+
+class Z23Form_2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : Z23Form_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { // Z23Form_1 without the FRB operand.
+ let FRB = 0; // Inst{16-20} is fixed to zero
+}
+
+class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> { // Z23 layout with FRT and a single 12-bit idx spanning Inst{11-22}.
+ bits<5> FRT; // Inst{6-10}
+ bits<12> idx; // Inst{11-22}
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = FRT;
+ let Inst{11-22} = idx;
+ let Inst{23-30} = xo;
+ let Inst{31} = RC; // record bit
+}
+
+//===----------------------------------------------------------------------===//
+class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+ : I<0, OOL, IOL, asmstr, NoItinerary> { // Pseudo-instruction base: opcode 0, no itinerary, all-zero encoding.
+ let isCodeGenOnly = 1; // flagged as code-generation only
+ let PPC64 = 0;
+ let Pattern = pattern;
+ let Inst{31-0} = 0; // every encoding bit is zero
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td
new file mode 100644
index 000000000000..6c4e2129087c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td
@@ -0,0 +1,172 @@
+//===-- PPCInstrHTM.td - The PowerPC Hardware Transactional Memory -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hardware Transactional Memory extension to the
+// PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+def HasHTM : Predicate<"PPCSubTarget->hasHTM()">; // guards the `let Predicates = [HasHTM]` group in this file
+
+def HTM_get_imm : SDNodeXForm<imm, [{
+ return getI32Imm (N->getZExtValue(), SDLoc(N)); // rebuild the immediate from its zero-extended value
+}]>;
+
+let hasSideEffects = 1, usesCustomInserter = 1 in { // expanded by a custom inserter, not encoded directly
+def TCHECK_RET : Pseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>; // target of the int_ppc_tcheck pattern
+}
+
+
+let Predicates = [HasHTM] in { // everything up to the matching brace requires HasHTM
+
+def TBEGIN : XForm_htm0 <31, 654,
+ (outs crrc0:$ret), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR, []>;
+
+def TEND : XForm_htm1 <31, 686,
+ (outs crrc0:$ret), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR, []>;
+
+def TABORT : XForm_base_r3xo <31, 910,
+ (outs crrc0:$ret), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR,
+ []>, isDOT {
+ let RST = 0; // only $A is an operand; the other two fields are zeroed
+ let B = 0;
+}
+
+def TABORTWC : XForm_base_r3xo <31, 782,
+ (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B),
+ "tabortwc. $RTS, $A, $B", IIC_SprMTSPR, []>,
+ isDOT;
+
+def TABORTWCI : XForm_base_r3xo <31, 846,
+ (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
+ "tabortwci. $RTS, $A, $B", IIC_SprMTSPR, []>,
+ isDOT;
+
+def TABORTDC : XForm_base_r3xo <31, 814,
+ (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B),
+ "tabortdc. $RTS, $A, $B", IIC_SprMTSPR, []>,
+ isDOT;
+
+def TABORTDCI : XForm_base_r3xo <31, 878,
+ (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
+ "tabortdci. $RTS, $A, $B", IIC_SprMTSPR, []>,
+ isDOT;
+
+def TSR : XForm_htm2 <31, 750,
+ (outs crrc0:$ret), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>,
+ isDOT;
+
+def TCHECK : XForm_htm3 <31, 718,
+ (outs), (ins crrc:$BF), "tcheck $BF", IIC_SprMTSPR, []>; // no outs: the CR field is listed as an input
+
+
+def TRECLAIM : XForm_base_r3xo <31, 942,
+ (outs crrc:$ret), (ins gprc:$A), "treclaim. $A",
+ IIC_SprMTSPR, []>,
+ isDOT {
+ let RST = 0; // only $A is an operand; the other two fields are zeroed
+ let B = 0;
+}
+
+def TRECHKPT : XForm_base_r3xo <31, 1006,
+ (outs crrc:$ret), (ins), "trechkpt.", IIC_SprMTSPR, []>,
+ isDOT {
+ let RST = 0; // no operands at all: all three fields are zeroed
+ let A = 0;
+ let B = 0;
+}
+
+// Builtins
+
+// All HTM instructions, with the exception of tcheck, set CR0 with the
+// value of the MSR Transaction State (TS) bits that exist before the
+// instruction is executed. For tbegin., the EQ bit in CR0 can be used
+// to determine whether the transaction was successfully started (0) or
+// failed (1). We use an XORI pattern to 'flip' the bit to match the
+// tbegin builtin API which defines a return value of 1 as success.
+
+def : Pat<(int_ppc_tbegin i32:$R),
+ (XORI
+ (EXTRACT_SUBREG (
+ TBEGIN (HTM_get_imm imm:$R)), sub_eq),
+ 1)>;
+
+def : Pat<(int_ppc_tend i32:$R),
+ (TEND (HTM_get_imm imm:$R))>;
+
+
+def : Pat<(int_ppc_tabort i32:$R),
+ (TABORT $R)>;
+
+def : Pat<(int_ppc_tabortwc i32:$TO, i32:$RA, i32:$RB),
+ (TABORTWC (HTM_get_imm imm:$TO), $RA, $RB)>;
+
+def : Pat<(int_ppc_tabortwci i32:$TO, i32:$RA, i32:$SI),
+ (TABORTWCI (HTM_get_imm imm:$TO), $RA, (HTM_get_imm imm:$SI))>;
+
+def : Pat<(int_ppc_tabortdc i32:$TO, i32:$RA, i32:$RB),
+ (TABORTDC (HTM_get_imm imm:$TO), $RA, $RB)>;
+
+def : Pat<(int_ppc_tabortdci i32:$TO, i32:$RA, i32:$SI),
+ (TABORTDCI (HTM_get_imm imm:$TO), $RA, (HTM_get_imm imm:$SI))>;
+
+def : Pat<(int_ppc_tcheck),
+ (TCHECK_RET)>; // selects the TCHECK_RET pseudo, not TCHECK itself
+
+def : Pat<(int_ppc_treclaim i32:$RA),
+ (TRECLAIM $RA)>;
+
+def : Pat<(int_ppc_trechkpt),
+ (TRECHKPT)>;
+
+def : Pat<(int_ppc_tsr i32:$L),
+ (TSR (HTM_get_imm imm:$L))>;
+
+def : Pat<(int_ppc_get_texasr),
+ (MFSPR8 130)>; // texasr is read through SPR number 130
+
+def : Pat<(int_ppc_get_texasru),
+ (MFSPR8 131)>; // texasru: SPR 131
+
+def : Pat<(int_ppc_get_tfhar),
+ (MFSPR8 128)>; // tfhar: SPR 128
+
+def : Pat<(int_ppc_get_tfiar),
+ (MFSPR8 129)>; // tfiar: SPR 129
+
+
+def : Pat<(int_ppc_set_texasr i64:$V),
+ (MTSPR8 130, $V)>; // setters use the same SPR numbers as the getters
+
+def : Pat<(int_ppc_set_texasru i64:$V),
+ (MTSPR8 131, $V)>;
+
+def : Pat<(int_ppc_set_tfhar i64:$V),
+ (MTSPR8 128, $V)>;
+
+def : Pat<(int_ppc_set_tfiar i64:$V),
+ (MTSPR8 129, $V)>;
+
+
+// Extended mnemonics: fixed-operand forms of the instructions defined above.
+def : Pat<(int_ppc_tendall),
+ (TEND 1)>;
+
+def : Pat<(int_ppc_tresume),
+ (TSR 1)>;
+
+def : Pat<(int_ppc_tsuspend),
+ (TSR 0)>;
+
+def : Pat<(i64 (int_ppc_ttest)),
+ (RLDICL (i64 (COPY (TABORTWCI 0, ZERO, 0))), 36, 28)>; // TABORTWCI with all-zero operands, result copied then RLDICL 36, 28
+
+} // [HasHTM]
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
new file mode 100644
index 000000000000..2e0b9355f82b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -0,0 +1,1933 @@
+//===-- PPCInstrInfo.cpp - PowerPC Instruction Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCHazardRecognizers.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-instr-info"
+
+#define GET_INSTRMAP_INFO
+#define GET_INSTRINFO_CTOR_DTOR
+#include "PPCGenInstrInfo.inc"
+
+static cl::
+opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
+ cl::desc("Disable analysis for CTR loops")); // makes analyzeBranch give up on BDNZ/BDZ branches
+
+static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
+cl::desc("Disable compare instruction optimization"), cl::Hidden);
+
+static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy",
+cl::desc("Causes the backend to crash instead of generating a nop VSX copy"),
+cl::Hidden); // checked in copyPhysReg
+
+static cl::opt<bool>
+UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
+ cl::desc("Use the old (incorrect) instruction latency calculation")); // makes getInstrLatency defer to the generated base class
+
+// Pin the vtable to this file: the body is intentionally empty.
+void PPCInstrInfo::anchor() {}
+
+PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
+ : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), // opcodes handed to the generated base class
+ Subtarget(STI), RI(STI.getTargetMachine()) {} // register info is built from the subtarget's target machine
+
+/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+/// this target when scheduling the DAG.
+ScheduleHazardRecognizer *
+PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+ const ScheduleDAG *DAG) const {
+ unsigned Directive =
+ static_cast<const PPCSubtarget *>(STI)->getDarwinDirective();
+ if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
+ Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) { // these four get a scoreboard recognizer
+ const InstrItineraryData *II =
+ static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData();
+ return new ScoreboardHazardRecognizer(II, DAG);
+ }
+
+ return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG); // every other directive: base-class default
+}
+
+/// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer
+/// to use for this target when scheduling the DAG.
+ScheduleHazardRecognizer *
+PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const {
+ unsigned Directive =
+ DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
+
+ // FIXME: Leaving this as-is until we have POWER9 scheduling info
+ if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8)
+ return new PPCDispatchGroupSBHazardRecognizer(II, DAG);
+
+ // Most subtargets use a PPC970 recognizer: everything except the four
+ // directives tested below, which fall through to the scoreboard recognizer.
+ if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
+ Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
+ assert(DAG->TII && "No InstrInfo?");
+
+ return new PPCHazardRecognizer970(*DAG);
+ }
+
+ return new ScoreboardHazardRecognizer(II, DAG); // 440, A2, E500mc and E5500
+}
+
+unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &MI,
+ unsigned *PredCost) const {
+ if (!ItinData || UseOldLatencyCalc) // PredCost is only used on this fallback path
+ return PPCGenInstrInfo::getInstrLatency(ItinData, MI, PredCost);
+
+ // The default implementation of getInstrLatency calls getStageLatency, but
+ // getStageLatency does not do the right thing for us. While we have
+ // itinerary, most cores are fully pipelined, and so the itineraries only
+ // express the first part of the pipeline, not every stage. Instead, we need
+ // to use the listed output operand cycle numbers: the largest one among the
+ // explicit register defs, and never less than 1.
+
+ unsigned Latency = 1;
+ unsigned DefClass = MI.getDesc().getSchedClass();
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isDef() || MO.isImplicit()) // only explicit register defs count
+ continue;
+
+ int Cycle = ItinData->getOperandCycle(DefClass, i);
+ if (Cycle < 0) // negative cycle values are ignored
+ continue;
+
+ Latency = std::max(Latency, (unsigned) Cycle);
+ }
+
+ return Latency;
+}
+
+int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
+ int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
+ UseMI, UseIdx); // start from the generated base-class answer
+
+ if (!DefMI.getParent()) // the register-class lookup below goes through the parent block
+ return Latency;
+
+ const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
+ unsigned Reg = DefMO.getReg();
+
+ bool IsRegCR; // true when Reg belongs to the CRRC or CRBITRC register class
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ const MachineRegisterInfo *MRI =
+ &DefMI.getParent()->getParent()->getRegInfo();
+ IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
+ MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass);
+ } else {
+ IsRegCR = PPC::CRRCRegClass.contains(Reg) ||
+ PPC::CRBITRCRegClass.contains(Reg);
+ }
+
+ if (UseMI.isBranch() && IsRegCR) {
+ if (Latency < 0) // negative base result: fall back to the def's instruction latency
+ Latency = getInstrLatency(ItinData, DefMI);
+
+ // On some cores, there is an additional delay between writing to a condition
+ // register, and using it from a branch.
+ unsigned Directive = Subtarget.getDarwinDirective();
+ switch (Directive) {
+ default: break;
+ case PPC::DIR_7400:
+ case PPC::DIR_750:
+ case PPC::DIR_970:
+ case PPC::DIR_E5500:
+ case PPC::DIR_PWR4:
+ case PPC::DIR_PWR5:
+ case PPC::DIR_PWR5X:
+ case PPC::DIR_PWR6:
+ case PPC::DIR_PWR6X:
+ case PPC::DIR_PWR7:
+ case PPC::DIR_PWR8:
+ // FIXME: Is this needed for POWER9?
+ Latency += 2; // the extra CR-to-branch delay applied for the directives listed above
+ break;
+ }
+ }
+
+ return Latency;
+}
+
+// This function does not list all associative and commutative operations, but
+// only those worth feeding through the machine combiner in an attempt to
+// reduce the critical path. Mostly, this means floating-point operations,
+// because they have high latencies (compared to other operations, such as
+// and/or, which are also associative and commutative, but have low latencies).
+bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ // FP Add:
+ case PPC::FADD:
+ case PPC::FADDS:
+ // FP Multiply:
+ case PPC::FMUL:
+ case PPC::FMULS:
+ // Altivec Add:
+ case PPC::VADDFP:
+ // VSX Add:
+ case PPC::XSADDDP:
+ case PPC::XVADDDP:
+ case PPC::XVADDSP:
+ case PPC::XSADDSP:
+ // VSX Multiply:
+ case PPC::XSMULDP:
+ case PPC::XVMULDP:
+ case PPC::XVMULSP:
+ case PPC::XSMULSP:
+ // QPX Add:
+ case PPC::QVFADD:
+ case PPC::QVFADDS:
+ case PPC::QVFADDSs:
+ // QPX Multiply:
+ case PPC::QVFMUL:
+ case PPC::QVFMULS:
+ case PPC::QVFMULSs:
+ return true; // every opcode listed above shares this result
+ default:
+ return false;
+ }
+}
+
+bool PPCInstrInfo::getMachineCombinerPatterns(
+ MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ // Using the machine combiner in this way is potentially expensive, so
+ // restrict to when aggressive optimizations are desired.
+ if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive)
+ return false;
+
+ // FP reassociation is only legal when we don't need strict IEEE semantics.
+ if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
+ return false;
+
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); // both gates passed: use the generic matcher
+}
+
+// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
+bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const {
+ switch (MI.getOpcode()) {
+ default: return false; // the out-parameters are left untouched in this case
+ case PPC::EXTSW:
+ case PPC::EXTSW_32_64:
+ SrcReg = MI.getOperand(1).getReg(); // operand 1 is reported as the source
+ DstReg = MI.getOperand(0).getReg(); // operand 0 is reported as the destination
+ SubIdx = PPC::sub_32;
+ return true;
+ }
+}
+
+unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ // Note: This list must be kept consistent with LoadRegFromStackSlot.
+ switch (MI.getOpcode()) {
+ default: break;
+ case PPC::LD:
+ case PPC::LWZ:
+ case PPC::LFS:
+ case PPC::LFD:
+ case PPC::RESTORE_CR:
+ case PPC::RESTORE_CRBIT:
+ case PPC::LVX:
+ case PPC::LXVD2X:
+ case PPC::LXVX:
+ case PPC::QVLFDX:
+ case PPC::QVLFSXs:
+ case PPC::QVLFDXb:
+ case PPC::RESTORE_VRSAVE:
+ // Check for the operands added by addFrameReference (the immediate is the
+ // offset which defaults to 0).
+ if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
+ MI.getOperand(2).isFI()) { // zero immediate in operand 1, frame index in operand 2
+ FrameIndex = MI.getOperand(2).getIndex();
+ return MI.getOperand(0).getReg(); // operand 0 is the register reported to the caller
+ }
+ break;
+ }
+ return 0; // not a recognized stack-slot load; FrameIndex is not written
+}
+
+unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ // Note: This list must be kept consistent with StoreRegToStackSlot.
+ switch (MI.getOpcode()) {
+ default: break;
+ case PPC::STD:
+ case PPC::STW:
+ case PPC::STFS:
+ case PPC::STFD:
+ case PPC::SPILL_CR:
+ case PPC::SPILL_CRBIT:
+ case PPC::STVX:
+ case PPC::STXVD2X:
+ case PPC::STXVX:
+ case PPC::QVSTFDX:
+ case PPC::QVSTFSXs:
+ case PPC::QVSTFDXb:
+ case PPC::SPILL_VRSAVE:
+ // Check for the operands added by addFrameReference (the immediate is the
+ // offset which defaults to 0).
+ if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
+ MI.getOperand(2).isFI()) { // zero immediate in operand 1, frame index in operand 2
+ FrameIndex = MI.getOperand(2).getIndex();
+ return MI.getOperand(0).getReg(); // operand 0 is the register reported to the caller
+ }
+ break;
+ }
+ return 0; // not a recognized stack-slot store; FrameIndex is not written
+}
+
+MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const { // returns the commuted instruction, or nullptr if it cannot be commuted
+ MachineFunction &MF = *MI.getParent()->getParent();
+
+ // Normal instructions can be commuted the obvious way.
+ if (MI.getOpcode() != PPC::RLWIMI && MI.getOpcode() != PPC::RLWIMIo)
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
+ // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
+ // changing the relative order of the mask operands might change what happens
+ // to the high-bits of the mask (and, thus, the result).
+
+ // Cannot commute if it has a non-zero rotate count.
+ if (MI.getOperand(3).getImm() != 0)
+ return nullptr;
+
+ // If we have a zero rotate count, we have:
+ // M = mask(MB,ME)
+ // Op0 = (Op1 & ~M) | (Op2 & M)
+ // Change this to:
+ // M = mask((ME+1)&31, (MB-1)&31)
+ // Op0 = (Op2 & ~M) | (Op1 & M)
+
+ // Swap op1/op2
+ assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) &&
+ "Only the operands 1 and 2 can be swapped in RLWIMI/RLWIMIo.");
+ unsigned Reg0 = MI.getOperand(0).getReg();
+ unsigned Reg1 = MI.getOperand(1).getReg();
+ unsigned Reg2 = MI.getOperand(2).getReg();
+ unsigned SubReg1 = MI.getOperand(1).getSubReg();
+ unsigned SubReg2 = MI.getOperand(2).getSubReg();
+ bool Reg1IsKill = MI.getOperand(1).isKill();
+ bool Reg2IsKill = MI.getOperand(2).isKill();
+ bool ChangeReg0 = false;
+ // If machine instrs are no longer in two-address forms, update
+ // destination register as well.
+ if (Reg0 == Reg1) {
+ // Must be two address instruction!
+ assert(MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
+ "Expecting a two-address instruction!");
+ assert(MI.getOperand(0).getSubReg() == SubReg1 && "Tied subreg mismatch");
+ Reg2IsKill = false; // Reg2 becomes the destination below, so its kill flag is cleared
+ ChangeReg0 = true;
+ }
+
+ // Masks.
+ unsigned MB = MI.getOperand(4).getImm();
+ unsigned ME = MI.getOperand(5).getImm();
+
+ // We can't commute a trivial mask (there is no way to represent an all-zero
+ // mask).
+ if (MB == 0 && ME == 31)
+ return nullptr;
+
+ if (NewMI) {
+ // Create a new instruction; the destination follows the swap when tied.
+ unsigned NewReg0 = ChangeReg0 ? Reg2 : Reg0;
+ bool Reg0IsDead = MI.getOperand(0).isDead();
+ return BuildMI(MF, MI.getDebugLoc(), MI.getDesc())
+ .addReg(NewReg0, RegState::Define | getDeadRegState(Reg0IsDead))
+ .addReg(Reg2, getKillRegState(Reg2IsKill))
+ .addReg(Reg1, getKillRegState(Reg1IsKill))
+ .addImm((ME + 1) & 31)
+ .addImm((MB - 1) & 31);
+ }
+
+ if (ChangeReg0) { // in-place path: retarget the tied destination first
+ MI.getOperand(0).setReg(Reg2);
+ MI.getOperand(0).setSubReg(SubReg2);
+ }
+ MI.getOperand(2).setReg(Reg1);
+ MI.getOperand(1).setReg(Reg2);
+ MI.getOperand(2).setSubReg(SubReg1);
+ MI.getOperand(1).setSubReg(SubReg2);
+ MI.getOperand(2).setIsKill(Reg1IsKill);
+ MI.getOperand(1).setIsKill(Reg2IsKill);
+
+ // Swap the mask around.
+ MI.getOperand(4).setImm((ME + 1) & 31);
+ MI.getOperand(5).setImm((MB - 1) & 31);
+ return &MI;
+}
+
+bool PPCInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ // For VSX A-Type FMA instructions, it is the first two operands that can be
+ // commuted, however, because the non-encoded tied input operand is listed
+ // first, the operands to swap are actually the second and third.
+
+ int AltOpc = PPC::getAltVSXFMAOpcode(MI.getOpcode());
+ if (AltOpc == -1) // -1: no alternate VSX FMA opcode, so use the generic logic
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+
+ // The commutable operand indices are 2 and 3. Return them in SrcOpIdx1
+ // and SrcOpIdx2.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
+}
+
+void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ // This function is used for scheduling, and the nop wanted here is the type
+ // that terminates dispatch groups on the POWER cores.
+ unsigned Directive = Subtarget.getDarwinDirective();
+ unsigned Opcode;
+ switch (Directive) {
+ default: Opcode = PPC::NOP; break; // plain nop for every directive not listed below
+ case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break;
+ case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break;
+ case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break; /* FIXME: Update when P8 InstrScheduling model is ready */
+ // FIXME: Update when POWER9 scheduling model is ready.
+ case PPC::DIR_PWR9: Opcode = PPC::NOP_GT_PWR7; break;
+ }
+
+ DebugLoc DL; // default-constructed debug location
+ BuildMI(MBB, MI, DL, get(Opcode)); // inserted before MI in MBB
+}
+
+/// getNoopForMachoTarget - Set NopInst's opcode to PPC::NOP.
+void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+ NopInst.setOpcode(PPC::NOP);
+}
+
+// Branch analysis. Returns false when the terminators were understood and true otherwise.
+// Note: If the condition register is set to CTR or CTR8 then this is a
+// BDNZ (imm == 1) or BDZ (imm == 0) branch.
+bool PPCInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ bool isPPC64 = Subtarget.isPPC64();
+
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return false;
+
+ if (!isUnpredicatedTerminator(*I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr &LastInst = *I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ if (LastInst.getOpcode() == PPC::B) {
+ if (!LastInst.getOperand(0).isMBB()) // target is not a basic block: give up
+ return true;
+ TBB = LastInst.getOperand(0).getMBB();
+ return false;
+ } else if (LastInst.getOpcode() == PPC::BCC) {
+ if (!LastInst.getOperand(2).isMBB())
+ return true;
+ // Block ends with fall-through condbranch.
+ TBB = LastInst.getOperand(2).getMBB();
+ Cond.push_back(LastInst.getOperand(0)); // Cond[0]: first BCC operand
+ Cond.push_back(LastInst.getOperand(1)); // Cond[1]: second BCC operand
+ return false;
+ } else if (LastInst.getOpcode() == PPC::BC) {
+ if (!LastInst.getOperand(1).isMBB())
+ return true;
+ // Block ends with fall-through condbranch.
+ TBB = LastInst.getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); // BC is recorded as PRED_BIT_SET
+ Cond.push_back(LastInst.getOperand(0));
+ return false;
+ } else if (LastInst.getOpcode() == PPC::BCn) {
+ if (!LastInst.getOperand(1).isMBB())
+ return true;
+ // Block ends with fall-through condbranch.
+ TBB = LastInst.getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET)); // BCn is recorded as PRED_BIT_UNSET
+ Cond.push_back(LastInst.getOperand(0));
+ return false;
+ } else if (LastInst.getOpcode() == PPC::BDNZ8 ||
+ LastInst.getOpcode() == PPC::BDNZ) {
+ if (!LastInst.getOperand(0).isMBB())
+ return true;
+ if (DisableCTRLoopAnal) // command-line switch turns CTR branch analysis off
+ return true;
+ TBB = LastInst.getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(1)); // imm == 1 marks BDNZ
+ Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+ true));
+ return false;
+ } else if (LastInst.getOpcode() == PPC::BDZ8 ||
+ LastInst.getOpcode() == PPC::BDZ) {
+ if (!LastInst.getOperand(0).isMBB())
+ return true;
+ if (DisableCTRLoopAnal)
+ return true;
+ TBB = LastInst.getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(0)); // imm == 0 marks BDZ
+ Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+ true));
+ return false;
+ }
+
+ // Otherwise, don't know what this is.
+ return true;
+ }
+
+ // Get the instruction before it if it's a terminator.
+ MachineInstr &SecondLastInst = *I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (I != MBB.begin() && isUnpredicatedTerminator(*--I))
+ return true;
+
+ // If the block ends with PPC::B and PPC::BCC, handle it.
+ if (SecondLastInst.getOpcode() == PPC::BCC &&
+ LastInst.getOpcode() == PPC::B) {
+ if (!SecondLastInst.getOperand(2).isMBB() ||
+ !LastInst.getOperand(0).isMBB())
+ return true;
+ TBB = SecondLastInst.getOperand(2).getMBB();
+ Cond.push_back(SecondLastInst.getOperand(0));
+ Cond.push_back(SecondLastInst.getOperand(1));
+ FBB = LastInst.getOperand(0).getMBB(); // the trailing B supplies the false target
+ return false;
+ } else if (SecondLastInst.getOpcode() == PPC::BC &&
+ LastInst.getOpcode() == PPC::B) {
+ if (!SecondLastInst.getOperand(1).isMBB() ||
+ !LastInst.getOperand(0).isMBB())
+ return true;
+ TBB = SecondLastInst.getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+ Cond.push_back(SecondLastInst.getOperand(0));
+ FBB = LastInst.getOperand(0).getMBB();
+ return false;
+ } else if (SecondLastInst.getOpcode() == PPC::BCn &&
+ LastInst.getOpcode() == PPC::B) {
+ if (!SecondLastInst.getOperand(1).isMBB() ||
+ !LastInst.getOperand(0).isMBB())
+ return true;
+ TBB = SecondLastInst.getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+ Cond.push_back(SecondLastInst.getOperand(0));
+ FBB = LastInst.getOperand(0).getMBB();
+ return false;
+ } else if ((SecondLastInst.getOpcode() == PPC::BDNZ8 ||
+ SecondLastInst.getOpcode() == PPC::BDNZ) &&
+ LastInst.getOpcode() == PPC::B) {
+ if (!SecondLastInst.getOperand(0).isMBB() ||
+ !LastInst.getOperand(0).isMBB())
+ return true;
+ if (DisableCTRLoopAnal)
+ return true;
+ TBB = SecondLastInst.getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(1));
+ Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+ true));
+ FBB = LastInst.getOperand(0).getMBB();
+ return false;
+ } else if ((SecondLastInst.getOpcode() == PPC::BDZ8 ||
+ SecondLastInst.getOpcode() == PPC::BDZ) &&
+ LastInst.getOpcode() == PPC::B) {
+ if (!SecondLastInst.getOperand(0).isMBB() ||
+ !LastInst.getOperand(0).isMBB())
+ return true;
+ if (DisableCTRLoopAnal)
+ return true;
+ TBB = SecondLastInst.getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(0));
+ Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+ true));
+ FBB = LastInst.getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two PPC::Bs, handle it. The second one is not
+ // executed, so remove it (only when AllowModify is set).
+ if (SecondLastInst.getOpcode() == PPC::B && LastInst.getOpcode() == PPC::B) {
+ if (!SecondLastInst.getOperand(0).isMBB())
+ return true;
+ TBB = SecondLastInst.getOperand(0).getMBB();
+ I = LastInst; // re-point the iterator at the second branch
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned PPCInstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const { // returns how many branches were erased: 0, 1 or 2
+ assert(!BytesRemoved && "code size not handled");
+
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return 0;
+
+ if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC &&
+ I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
+ I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+ I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ)
+ return 0; // last instruction is not one of the branch opcodes handled here
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end(); // NOTE(review): this second lookup does not skip debug instructions, unlike the first
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (I->getOpcode() != PPC::BCC &&
+ I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
+ I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+ I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ)
+ return 1; // an unconditional PPC::B is not removed as the second branch
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded) const { // returns the number of branches appended: 1 or 2
+ // Shouldn't be a fall through.
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "PPC branch conditions have two components!");
+ assert(!BytesAdded && "code size not handled");
+
+ bool isPPC64 = Subtarget.isPPC64();
+
+ // One-way branch.
+ if (!FBB) {
+ if (Cond.empty()) // Unconditional branch
+ BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB);
+ else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) // CTR condition: Cond[0] picks BDNZ (non-zero) or BDZ (zero)
+ BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+ (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+ (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
+ else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
+ BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+ else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
+ BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+ else // Conditional branch
+ BuildMI(&MBB, DL, get(PPC::BCC))
+ .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+ return 1;
+ }
+
+ // Two-way Conditional Branch: the same opcode selection as above, followed by an unconditional B to FBB.
+ if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+ BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+ (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+ (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
+ else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
+ BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+ else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
+ BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+ else
+ BuildMI(&MBB, DL, get(PPC::BCC))
+ .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+ BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB);
+ return 2;
+}
+
+// Select analysis: decides whether insertSelect may be used for this condition.
+bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+ if (!Subtarget.hasISEL())
+ return false;
+
+ if (Cond.size() != 2)
+ return false;
+
+ // If this is really a bdnz-like condition, then it cannot be turned into a
+ // select.
+ if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+ return false;
+
+ // Check register classes.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+ if (!RC) // the two inputs share no common register sub-class
+ return false;
+
+ // isel is for regular integer GPRs only.
+ if (!PPC::GPRCRegClass.hasSubClassEq(RC) &&
+ !PPC::GPRC_NOR0RegClass.hasSubClassEq(RC) &&
+ !PPC::G8RCRegClass.hasSubClassEq(RC) &&
+ !PPC::G8RC_NOX0RegClass.hasSubClassEq(RC))
+ return false;
+
+ // FIXME: These numbers are for the A2, how well they work for other cores is
+ // an open question. On the A2, the isel instruction has a 2-cycle latency
+ // but single-cycle throughput. These numbers are used in combination with
+ // the MispredictPenalty setting from the active SchedMachineModel.
+ CondCycles = 1; // the cycle outputs are written only on this success path
+ TrueCycles = 1;
+ FalseCycles = 1;
+
+ return true;
+}
+
+void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &dl, unsigned DestReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const {
+ assert(Cond.size() == 2 &&
+ "PPC branch conditions have two components!");
+
+ assert(Subtarget.hasISEL() &&
+ "Cannot insert select on target without ISEL support");
+
+ // Get the register classes.
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+ assert(RC && "TrueReg and FalseReg must have overlapping register classes");
+
+ bool Is64Bit = PPC::G8RCRegClass.hasSubClassEq(RC) ||
+ PPC::G8RC_NOX0RegClass.hasSubClassEq(RC);
+ assert((Is64Bit ||
+ PPC::GPRCRegClass.hasSubClassEq(RC) ||
+ PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) &&
+ "isel is for regular integer GPRs only");
+
+ unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL;
+ auto SelectPred = static_cast<PPC::Predicate>(Cond[0].getImm());
+
+ unsigned SubIdx = 0; // sub-register of the condition register that isel tests
+ bool SwapOps = false; // set for negated predicates, which swap the two inputs
+ switch (SelectPred) {
+ case PPC::PRED_EQ:
+ case PPC::PRED_EQ_MINUS:
+ case PPC::PRED_EQ_PLUS:
+ SubIdx = PPC::sub_eq; SwapOps = false; break;
+ case PPC::PRED_NE:
+ case PPC::PRED_NE_MINUS:
+ case PPC::PRED_NE_PLUS:
+ SubIdx = PPC::sub_eq; SwapOps = true; break;
+ case PPC::PRED_LT:
+ case PPC::PRED_LT_MINUS:
+ case PPC::PRED_LT_PLUS:
+ SubIdx = PPC::sub_lt; SwapOps = false; break;
+ case PPC::PRED_GE:
+ case PPC::PRED_GE_MINUS:
+ case PPC::PRED_GE_PLUS:
+ SubIdx = PPC::sub_lt; SwapOps = true; break;
+ case PPC::PRED_GT:
+ case PPC::PRED_GT_MINUS:
+ case PPC::PRED_GT_PLUS:
+ SubIdx = PPC::sub_gt; SwapOps = false; break;
+ case PPC::PRED_LE:
+ case PPC::PRED_LE_MINUS:
+ case PPC::PRED_LE_PLUS:
+ SubIdx = PPC::sub_gt; SwapOps = true; break;
+ case PPC::PRED_UN:
+ case PPC::PRED_UN_MINUS:
+ case PPC::PRED_UN_PLUS:
+ SubIdx = PPC::sub_un; SwapOps = false; break;
+ case PPC::PRED_NU:
+ case PPC::PRED_NU_MINUS:
+ case PPC::PRED_NU_PLUS:
+ SubIdx = PPC::sub_un; SwapOps = true; break;
+ case PPC::PRED_BIT_SET: SubIdx = 0; SwapOps = false; break; // SubIdx stays 0 for the two bit predicates
+ case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break;
+ }
+
+ unsigned FirstReg = SwapOps ? FalseReg : TrueReg,
+ SecondReg = SwapOps ? TrueReg : FalseReg;
+
+ // The first input register of isel cannot be r0. If it is a member
+ // of a register class that can be r0, then copy it first (the
+ // register allocator should eliminate the copy).
+ if (MRI.getRegClass(FirstReg)->contains(PPC::R0) ||
+ MRI.getRegClass(FirstReg)->contains(PPC::X0)) {
+ const TargetRegisterClass *FirstRC =
+ MRI.getRegClass(FirstReg)->contains(PPC::X0) ?
+ &PPC::G8RC_NOX0RegClass : &PPC::GPRC_NOR0RegClass;
+ unsigned OldFirstReg = FirstReg;
+ FirstReg = MRI.createVirtualRegister(FirstRC); // fresh vreg in the class without r0/x0
+ BuildMI(MBB, MI, dl, get(TargetOpcode::COPY), FirstReg)
+ .addReg(OldFirstReg);
+ }
+
+ BuildMI(MBB, MI, dl, get(OpCode), DestReg)
+ .addReg(FirstReg).addReg(SecondReg)
+ .addReg(Cond[1].getReg(), 0, SubIdx); // Cond[1]'s register with the chosen sub-register index
+}
+
+/// Translate a condition-register bit register into a small index:
+/// UN -> 0, EQ -> 1, GT -> 2, LT -> 3 (for any of CR0..CR7).
+///
+/// Asserts if \p CRBit is not one of those bit registers; in builds without
+/// assertions the value 4 is returned for such input.
+static unsigned getCRBitValue(unsigned CRBit) {
+  // Row V lists every bit register that maps to the value V.
+  static const unsigned BitsByValue[4][8] = {
+    { PPC::CR0UN, PPC::CR1UN, PPC::CR2UN, PPC::CR3UN,
+      PPC::CR4UN, PPC::CR5UN, PPC::CR6UN, PPC::CR7UN },
+    { PPC::CR0EQ, PPC::CR1EQ, PPC::CR2EQ, PPC::CR3EQ,
+      PPC::CR4EQ, PPC::CR5EQ, PPC::CR6EQ, PPC::CR7EQ },
+    { PPC::CR0GT, PPC::CR1GT, PPC::CR2GT, PPC::CR3GT,
+      PPC::CR4GT, PPC::CR5GT, PPC::CR6GT, PPC::CR7GT },
+    { PPC::CR0LT, PPC::CR1LT, PPC::CR2LT, PPC::CR3LT,
+      PPC::CR4LT, PPC::CR5LT, PPC::CR6LT, PPC::CR7LT },
+  };
+
+  unsigned Ret = 4;
+  // Walk the rows from LT down to UN so the probing order matches the
+  // sequence of checks this table replaces.
+  for (unsigned Value = 4; Value-- != 0;)
+    for (unsigned Candidate : BitsByValue[Value])
+      if (CRBit == Candidate)
+        Ret = Value;
+
+  assert(Ret != 4 && "Invalid CR bit register");
+  return Ret;
+}
+
+/// Emit, immediately before \p I in \p MBB, the instruction(s) that copy the
+/// physical register \p SrcReg into \p DestReg.
+///
+/// \param DL      Debug location attached to every emitted instruction.
+/// \param KillSrc True when the copy is the last use of \p SrcReg; the kill
+///                flag is placed on the operand that reads \p SrcReg.
+///
+/// Copies between an F8RC register and a VSRC register are first rewritten so
+/// that both sides are VSRC registers. Copies from a CR bit or CR field into a
+/// GPR are expanded to MFOCRF (plus RLWINM for a single bit). Every other
+/// supported pair maps to a single move-like opcode; an unsupported pair hits
+/// llvm_unreachable.
+void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const DebugLoc &DL, unsigned DestReg,
+                               unsigned SrcReg, bool KillSrc) const {
+  // We can end up with self copies and similar things as a result of VSX copy
+  // legalization. Promote them here.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  if (PPC::F8RCRegClass.contains(DestReg) &&
+      PPC::VSRCRegClass.contains(SrcReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && SrcReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    DestReg = SuperReg;
+  } else if (PPC::F8RCRegClass.contains(SrcReg) &&
+             PPC::VSRCRegClass.contains(DestReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && DestReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    SrcReg = SuperReg;
+  }
+
+  // Different class register copy
+  if (PPC::CRBITRCRegClass.contains(SrcReg) &&
+      PPC::GPRCRegClass.contains(DestReg)) {
+    unsigned CRReg = getCRFromCRBit(SrcReg);
+    // KillSrc only describes the single bit SrcReg. The operand read here is
+    // the whole enclosing CR field, so no kill flag is attached to it.
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(CRReg);
+    // Rotate the CR bit in the CR fields to be the least significant bit and
+    // then mask with 0x1 (MB = ME = 31).
+    BuildMI(MBB, I, DL, get(PPC::RLWINM), DestReg)
+      .addReg(DestReg, RegState::Kill)
+      .addImm(TRI->getEncodingValue(CRReg) * 4 + (4 - getCRBitValue(SrcReg)))
+      .addImm(31)
+      .addImm(31);
+    return;
+  } else if (PPC::CRRCRegClass.contains(SrcReg) &&
+             PPC::G8RCRegClass.contains(DestReg)) {
+    // The kill state must be passed to addReg; computing it as a separate
+    // statement has no effect.
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  } else if (PPC::CRRCRegClass.contains(SrcReg) &&
+             PPC::GPRCRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  unsigned Opc;
+  if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR;
+  else if (PPC::G8RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR8;
+  else if (PPC::F4RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::FMR;
+  else if (PPC::CRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::MCRF;
+  else if (PPC::VRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::VOR;
+  else if (PPC::VSRCRegClass.contains(DestReg, SrcReg))
+    // There are two different ways this can be done:
+    //   1. xxlor : This has lower latency (on the P7), 2 cycles, but can only
+    //      issue in VSU pipeline 0.
+    //   2. xmovdp/xmovsp: This has higher latency (on the P7), 6 cycles, but
+    //      can go to either pipeline.
+    // We'll always use xxlor here, because in practically all cases where
+    // copies are generated, they are close enough to some use that the
+    // lower-latency form is preferable.
+    Opc = PPC::XXLOR;
+  else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) ||
+           PPC::VSSRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::XXLORf;
+  else if (PPC::QFRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMR;
+  else if (PPC::QSRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMRs;
+  else if (PPC::QBRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMRb;
+  else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::CROR;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  // Opcodes described with three operands take the source twice; only the
+  // final read carries the kill flag.
+  const MCInstrDesc &MCID = get(Opc);
+  if (MCID.getNumOperands() == 3)
+    BuildMI(MBB, I, DL, MCID, DestReg)
+      .addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc));
+  else
+    BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+// Builds (without inserting it into a block) the store that spills SrcReg of
+// class RC to frame index FrameIdx and appends it to NewMIs. NonRI and
+// SpillsVRS are set to true for the register classes flagged below and are
+// otherwise left untouched.
+// This function returns true if a CR spill is necessary and false otherwise.
+bool
+PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
+                                  unsigned SrcReg, bool isKill,
+                                  int FrameIdx,
+                                  const TargetRegisterClass *RC,
+                                  SmallVectorImpl<MachineInstr*> &NewMIs,
+                                  bool &NonRI, bool &SpillsVRS) const{
+  // Note: If additional store instructions are added here,
+  // update isStoreToStackSlot.
+
+  // First pick the store opcode (and record the side flags) for RC; the
+  // instruction itself is built once at the end.
+  unsigned StoreOpc;
+  bool NeedsCRSpill = false;
+
+  if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+      PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::STW;
+  } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+             PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::STD;
+  } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::STFD;
+  } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::STFS;
+  } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::SPILL_CR;
+    NeedsCRSpill = true;
+  } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::SPILL_CRBIT;
+    NeedsCRSpill = true;
+  } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::STVX;
+    NonRI = true;
+  } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = Subtarget.hasP9Vector() ? PPC::STXVX : PPC::STXVD2X;
+    NonRI = true;
+  } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = Subtarget.hasP9Vector() ? PPC::DFSTOREf64 : PPC::STXSDX;
+    NonRI = true;
+  } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = Subtarget.hasP9Vector() ? PPC::DFSTOREf32 : PPC::STXSSPX;
+    NonRI = true;
+  } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+    assert(Subtarget.isDarwin() &&
+           "VRSAVE only needs spill/restore on Darwin");
+    StoreOpc = PPC::SPILL_VRSAVE;
+    SpillsVRS = true;
+  } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::QVSTFDX;
+    NonRI = true;
+  } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::QVSTFSXs;
+    NonRI = true;
+  } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+    StoreOpc = PPC::QVSTFDXb;
+    NonRI = true;
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+
+  // The spill carries an empty debug location.
+  DebugLoc DL;
+  MachineInstrBuilder Spill =
+      BuildMI(MF, DL, get(StoreOpc)).addReg(SrcReg, getKillRegState(isKill));
+  NewMIs.push_back(addFrameReference(Spill, FrameIdx));
+
+  return NeedsCRSpill;
+}
+
+/// Spill \p SrcReg to stack slot \p FrameIdx by inserting the store(s) built
+/// by StoreRegToStackSlot before \p MI, recording the spill on the function's
+/// PPCFunctionInfo, and attaching a store memory operand for the slot to the
+/// last inserted instruction.
+void
+PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  unsigned SrcReg, bool isKill, int FrameIdx,
+                                  const TargetRegisterClass *RC,
+                                  const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  RC = updatedRC(RC);
+
+  SmallVector<MachineInstr*, 4> NewMIs;
+  bool NonRI = false, SpillsVRS = false;
+  const bool SpillsCR = StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC,
+                                            NewMIs, NonRI, SpillsVRS);
+
+  // Mirror what the helper reported onto the per-function info.
+  if (SpillsCR)
+    FuncInfo->setSpillsCR();
+  if (SpillsVRS)
+    FuncInfo->setSpillsVRSAVE();
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
+
+  for (MachineInstr *NewMI : NewMIs)
+    MBB.insert(MI, NewMI);
+
+  // Describe the stack-slot access on the final store.
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FrameIdx),
+      MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+      MFI.getObjectAlignment(FrameIdx));
+  NewMIs.back()->addMemOperand(MF, MMO);
+}
+
+/// Build (without inserting it into a block) the load that reloads
+/// \p DestReg of class \p RC from frame index \p FrameIdx and append it to
+/// \p NewMIs. \p NonRI and \p SpillsVRS are set to true for the register
+/// classes flagged below and are otherwise left untouched.
+///
+/// \returns true for the CR and CR-bit classes (a CR restore is needed),
+/// false otherwise.
+bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        SmallVectorImpl<MachineInstr *> &NewMIs,
+                                        bool &NonRI, bool &SpillsVRS) const {
+  // Note: If additional load instructions are added here,
+  // update isLoadFromStackSlot.
+
+  // First pick the load opcode (and record the side flags) for RC; the
+  // instruction itself is built once at the end.
+  unsigned LoadOpc;
+  bool NeedsCRRestore = false;
+
+  if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+      PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::LWZ;
+  } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+             PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::LD;
+  } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::LFD;
+  } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::LFS;
+  } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::RESTORE_CR;
+    NeedsCRRestore = true;
+  } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::RESTORE_CRBIT;
+    NeedsCRRestore = true;
+  } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::LVX;
+    NonRI = true;
+  } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = Subtarget.hasP9Vector() ? PPC::LXVX : PPC::LXVD2X;
+    NonRI = true;
+  } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = Subtarget.hasP9Vector() ? PPC::DFLOADf64 : PPC::LXSDX;
+    NonRI = true;
+  } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = Subtarget.hasP9Vector() ? PPC::DFLOADf32 : PPC::LXSSPX;
+    NonRI = true;
+  } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+    assert(Subtarget.isDarwin() &&
+           "VRSAVE only needs spill/restore on Darwin");
+    LoadOpc = PPC::RESTORE_VRSAVE;
+    SpillsVRS = true;
+  } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::QVLFDX;
+    NonRI = true;
+  } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::QVLFSXs;
+    NonRI = true;
+  } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+    LoadOpc = PPC::QVLFDXb;
+    NonRI = true;
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+
+  MachineInstrBuilder Reload = BuildMI(MF, DL, get(LoadOpc), DestReg);
+  NewMIs.push_back(addFrameReference(Reload, FrameIdx));
+
+  return NeedsCRRestore;
+}
+
+/// Reload \p DestReg from stack slot \p FrameIdx by inserting the load(s)
+/// built by LoadRegFromStackSlot before \p MI, recording the reload on the
+/// function's PPCFunctionInfo, and attaching a load memory operand for the
+/// slot to the last inserted instruction. The new instructions take the
+/// debug location of \p MI when \p MI is not the end of the block.
+void
+PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, int FrameIdx,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+
+  DebugLoc DL;
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  const bool ReloadAsVSX = Subtarget.hasVSX() && RC == &PPC::VRRCRegClass;
+  if (ReloadAsVSX)
+    RC = &PPC::VSRCRegClass;
+
+  SmallVector<MachineInstr*, 4> NewMIs;
+  bool NonRI = false, SpillsVRS = false;
+  const bool RestoresCR = LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC,
+                                               NewMIs, NonRI, SpillsVRS);
+
+  // Mirror what the helper reported onto the per-function info.
+  if (RestoresCR)
+    FuncInfo->setSpillsCR();
+  if (SpillsVRS)
+    FuncInfo->setSpillsVRSAVE();
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
+
+  for (MachineInstr *NewMI : NewMIs)
+    MBB.insert(MI, NewMI);
+
+  // Describe the stack-slot access on the final load.
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FrameIdx),
+      MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+      MFI.getObjectAlignment(FrameIdx));
+  NewMIs.back()->addMemOperand(MF, MMO);
+}
+
+/// Invert the two-operand branch condition \p Cond in place.
+///
+/// Cond[0] is the predicate immediate and Cond[1] the register it tests.
+/// When that register is CTR/CTR8 the immediate is toggled between 0 and 1;
+/// otherwise the predicate is replaced by its inverse and the register is
+/// left alone. Always returns false.
+bool PPCInstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
+
+  const unsigned TestedReg = Cond[1].getReg();
+  const bool IsCTRBranch = TestedReg == PPC::CTR8 || TestedReg == PPC::CTR;
+
+  if (IsCTRBranch) {
+    const bool WasZero = Cond[0].getImm() == 0;
+    Cond[0].setImm(WasZero ? 1 : 0);
+    return false;
+  }
+
+  // Leave the CR# the same, but invert the condition.
+  const PPC::Predicate Current = (PPC::Predicate)Cond[0].getImm();
+  Cond[0].setImm(PPC::InvertPredicate(Current));
+  return false;
+}
+
+/// Try to fold a zero produced by \p DefMI (an LI/LI8 of 0) into the operand
+/// of \p UseMI that reads \p Reg, by rewriting that operand to ZERO/ZERO8.
+///
+/// The fold is only performed when the operand's description requires a
+/// GPRC_NOR0/G8RC_NOX0 register (or pointer class kind 1) and carries no
+/// operand constraints. \p DefMI is erased when \p Reg had a single
+/// non-debug use.
+///
+/// \returns true if \p UseMI was rewritten, false if nothing changed.
+bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+                                 unsigned Reg, MachineRegisterInfo *MRI) const {
+  // For some instructions, it is legal to fold ZERO into the RA register field.
+  // A zero immediate should always be loaded with a single li.
+  const unsigned DefOpc = DefMI.getOpcode();
+  const bool IsLoadImmediate = DefOpc == PPC::LI || DefOpc == PPC::LI8;
+  if (!IsLoadImmediate)
+    return false;
+
+  const MachineOperand &ImmOp = DefMI.getOperand(1);
+  if (!ImmOp.isImm() || ImmOp.getImm() != 0)
+    return false;
+
+  // Note that we cannot here invert the arguments of an isel in order to fold
+  // a ZERO into what is presented as the second argument. All we have here
+  // is the condition bit, and that might come from a CR-logical bit operation.
+
+  // Only fold into real machine instructions.
+  const MCInstrDesc &UseMCID = UseMI.getDesc();
+  if (UseMCID.isPseudo())
+    return false;
+
+  // Locate the first operand of UseMI that refers to Reg.
+  unsigned UseIdx = 0;
+  while (UseIdx < UseMI.getNumOperands()) {
+    const MachineOperand &Candidate = UseMI.getOperand(UseIdx);
+    if (Candidate.isReg() && Candidate.getReg() == Reg)
+      break;
+    ++UseIdx;
+  }
+
+  assert(UseIdx < UseMI.getNumOperands() && "Cannot find Reg in UseMI");
+  assert(UseIdx < UseMCID.getNumOperands() && "No operand description for Reg");
+
+  const MCOperandInfo &UseInfo = UseMCID.OpInfo[UseIdx];
+  const bool IsPtrClass = UseInfo.isLookupPtrRegClass();
+
+  // We can fold the zero if this register requires a GPRC_NOR0/G8RC_NOX0
+  // register (which might also be specified as a pointer class kind).
+  const bool AcceptsZero =
+      IsPtrClass ? UseInfo.RegClass /* Kind */ == 1
+                 : (UseInfo.RegClass == PPC::GPRC_NOR0RegClassID ||
+                    UseInfo.RegClass == PPC::G8RC_NOX0RegClassID);
+  if (!AcceptsZero)
+    return false;
+
+  // Make sure this is not tied to an output register (or otherwise
+  // constrained). This is true for ST?UX registers, for example, which
+  // are tied to their output registers.
+  if (UseInfo.Constraints != 0)
+    return false;
+
+  // Pointer-class operands follow the subtarget width; otherwise the width
+  // comes from the operand's own register class.
+  const bool Use64BitZero =
+      IsPtrClass ? Subtarget.isPPC64()
+                 : UseInfo.RegClass == PPC::G8RC_NOX0RegClassID;
+  const unsigned ZeroReg = Use64BitZero ? PPC::ZERO8 : PPC::ZERO;
+
+  // Sample the use count before the operand is rewritten.
+  const bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+  UseMI.getOperand(UseIdx).setReg(ZeroReg);
+
+  if (DeleteDef)
+    DefMI.eraseFromParent();
+
+  return true;
+}
+
+/// Return true if any instruction in \p MBB defines CTR or CTR8.
+static bool MBBDefinesCTR(MachineBasicBlock &MBB) {
+  for (const MachineInstr &Instr : MBB) {
+    if (Instr.definesRegister(PPC::CTR) || Instr.definesRegister(PPC::CTR8))
+      return true;
+  }
+  return false;
+}
+
+// Diamond if-conversion predicates both sides of a condition. A
+// counter-decrement-based branch can be predicated, but only its branching is
+// predicated, not the counter decrement itself. If both sides define the
+// counter register, merging them into one predicated block would decrement
+// the counter twice, so that case is reported as unprofitable. Every other
+// combination is accepted.
+bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                       unsigned NumT, unsigned ExtraT,
+                                       MachineBasicBlock &FMBB,
+                                       unsigned NumF, unsigned ExtraF,
+                                       BranchProbability Probability) const {
+  // Fine as long as at least one side leaves the counter alone.
+  return !MBBDefinesCTR(TMBB) || !MBBDefinesCTR(FMBB);
+}
+
+
+bool PPCInstrInfo::isPredicated(const MachineInstr &MI) const {
+ // The predicated branches are identified by their type, not really by the
+ // explicit presence of a predicate. Furthermore, some of them can be
+ // predicated more than once. Because if-conversion won't try to predicate
+ // any instruction which already claims to be predicated (by returning true
+ // here), always return false. In doing so, we let isPredicable() be the
+ // final word on whether or not the instruction can be (further) predicated.
+
+ return false;
+}
+
+/// Return true if \p MI is a terminator that counts as unpredicated:
+/// either a branch that is not a barrier, or any terminator for which
+/// isPredicated() is false. Non-terminators yield false.
+bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+  if (!MI.isTerminator())
+    return false;
+
+  // Conditional branch is a special case.
+  const bool IsConditionalBranch = MI.isBranch() && !MI.isBarrier();
+  return IsConditionalBranch || !isPredicated(MI);
+}
+
+/// Rewrite the unconditional branch/return \p MI into its conditional form
+/// governed by \p Pred (Pred[0] = predicate immediate, Pred[1] = tested
+/// register).
+///
+/// Handles BLR/BLR8, B and BCTR/BCTR8/BCTRL/BCTRL8:
+///  - a CTR/CTR8 predicate turns BLR and B into the matching BDZ*/BDNZ*
+///    opcode without touching the operands (bctr[l] cannot be predicated on
+///    the counter);
+///  - PRED_BIT_SET / PRED_BIT_UNSET select the CR-bit forms, which take only
+///    the register;
+///  - any other predicate selects the BCC-style forms, which take the
+///    predicate immediate followed by the register.
+/// For B the target block operand is moved behind the new operands.
+///
+/// \returns true if \p MI was rewritten, false for any other opcode.
+bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
+                                        ArrayRef<MachineOperand> Pred) const {
+  const unsigned OpC = MI.getOpcode();
+  const bool IsReturn = OpC == PPC::BLR || OpC == PPC::BLR8;
+  const bool IsDirect = OpC == PPC::B;
+  const bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
+  const bool IsIndirect = setLR || OpC == PPC::BCTR || OpC == PPC::BCTR8;
+
+  if (!IsReturn && !IsDirect && !IsIndirect)
+    return false;
+
+  const unsigned PredReg = Pred[1].getReg();
+  const bool OnCounter = PredReg == PPC::CTR8 || PredReg == PPC::CTR;
+  const bool isPPC64 = Subtarget.isPPC64();
+
+  if (OnCounter) {
+    if (IsIndirect)
+      llvm_unreachable("Cannot predicate bctr[l] on the ctr register");
+
+    // Counter-based forms only need a new opcode; operands stay as they are.
+    const bool BranchOnNonZero = Pred[0].getImm() != 0;
+    unsigned CounterOpc;
+    if (IsReturn)
+      CounterOpc = BranchOnNonZero ? (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR)
+                                   : (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR);
+    else
+      CounterOpc = BranchOnNonZero ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
+                                   : (isPPC64 ? PPC::BDZ8 : PPC::BDZ);
+    MI.setDesc(get(CounterOpc));
+    return true;
+  }
+
+  const int64_t PredCode = Pred[0].getImm();
+  const bool OnBitSet = PredCode == PPC::PRED_BIT_SET;
+  const bool OnBitUnset = !OnBitSet && PredCode == PPC::PRED_BIT_UNSET;
+
+  // Choose the conditional opcode for this (branch kind, predicate kind).
+  unsigned NewOpc;
+  if (IsReturn) {
+    NewOpc = OnBitSet ? PPC::BCLR : (OnBitUnset ? PPC::BCLRn : PPC::BCCLR);
+  } else if (IsDirect) {
+    NewOpc = OnBitSet ? PPC::BC : (OnBitUnset ? PPC::BCn : PPC::BCC);
+  } else if (OnBitSet) {
+    NewOpc = isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8)
+                     : (setLR ? PPC::BCCTRL : PPC::BCCTR);
+  } else if (OnBitUnset) {
+    NewOpc = isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n)
+                     : (setLR ? PPC::BCCTRLn : PPC::BCCTRn);
+  } else {
+    NewOpc = isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8)
+                     : (setLR ? PPC::BCCCTRL : PPC::BCCCTR);
+  }
+
+  // A direct branch keeps its target, but the block operand has to follow
+  // the predicate operands, so detach it first.
+  MachineBasicBlock *Target = nullptr;
+  if (IsDirect) {
+    Target = MI.getOperand(0).getMBB();
+    MI.RemoveOperand(0);
+  }
+
+  MI.setDesc(get(NewOpc));
+
+  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+  if (!OnBitSet && !OnBitUnset)
+    MIB.addImm(PredCode);
+  MIB.addReg(PredReg);
+  if (IsDirect)
+    MIB.addMBB(Target);
+
+  return true;
+}
+
+/// Return true if predicate \p Pred1 subsumes \p Pred2.
+///
+/// Each predicate is a (predicate immediate, tested register) pair. The
+/// answer is false whenever either one tests CTR/CTR8 or the two test
+/// different registers. Otherwise it is true when the predicate codes are
+/// equal, when Pred1 is LE and Pred2 is LT or EQ, or when Pred1 is GE and
+/// Pred2 is GT or EQ.
+bool PPCInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+                                     ArrayRef<MachineOperand> Pred2) const {
+  assert(Pred1.size() == 2 && "Invalid PPC first predicate");
+  assert(Pred2.size() == 2 && "Invalid PPC second predicate");
+
+  auto TestsCounter = [](const MachineOperand &RegOp) {
+    return RegOp.getReg() == PPC::CTR8 || RegOp.getReg() == PPC::CTR;
+  };
+  if (TestsCounter(Pred1[1]) || TestsCounter(Pred2[1]))
+    return false;
+
+  // P1 can only subsume P2 if they test the same condition register.
+  if (Pred1[1].getReg() != Pred2[1].getReg())
+    return false;
+
+  const PPC::Predicate P1 = (PPC::Predicate) Pred1[0].getImm();
+  const PPC::Predicate P2 = (PPC::Predicate) Pred2[0].getImm();
+
+  if (P1 == P2)
+    return true;
+
+  // Does P1 subsume P2, e.g. GE subsumes GT.
+  switch (P1) {
+  case PPC::PRED_LE:
+    return P2 == PPC::PRED_LT || P2 == PPC::PRED_EQ;
+  case PPC::PRED_GE:
+    return P2 == PPC::PRED_GT || P2 == PPC::PRED_EQ;
+  default:
+    return false;
+  }
+}
+
+/// Return true if \p MI defines, or clobbers through a register mask, a
+/// register in one of the CR, CR-bit, CTR or CTR8 classes. The operand that
+/// does so is appended to \p Pred; scanning stops at the first operand and
+/// class that match.
+bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
+                                    std::vector<MachineOperand> &Pred) const {
+  // Note: At the present time, the contents of Pred from this function is
+  // unused by IfConversion. This implementation follows ARM by pushing the
+  // CR-defining operand. Because the 'DZ' and 'DNZ' count as types of
+  // predicate, instructions defining CTR or CTR8 are also included as
+  // predicate-defining instructions.
+
+  const TargetRegisterClass *RCs[] =
+    { &PPC::CRRCRegClass, &PPC::CRBITRCRegClass,
+      &PPC::CTRRCRegClass, &PPC::CTRRC8RegClass };
+
+  for (const MachineOperand &MO : MI.operands()) {
+    for (const TargetRegisterClass *RC : RCs) {
+      bool Matched = false;
+
+      if (MO.isReg()) {
+        if (MO.isDef() && RC->contains(MO.getReg())) {
+          Pred.push_back(MO);
+          Matched = true;
+        }
+      } else if (MO.isRegMask()) {
+        // The mask is recorded once per register of the class it clobbers.
+        for (unsigned PhysReg : *RC) {
+          if (MO.clobbersPhysReg(PhysReg)) {
+            Pred.push_back(MO);
+            Matched = true;
+          }
+        }
+      }
+
+      // Nothing further is examined after the first match.
+      if (Matched)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// Return true if \p MI is one of the opcodes that can be predicated:
+/// B, BLR/BLR8, BCTR/BCTR8 or BCTRL/BCTRL8.
+bool PPCInstrInfo::isPredicable(MachineInstr &MI) const {
+  const unsigned OpC = MI.getOpcode();
+
+  const bool IsDirectBranch = OpC == PPC::B;
+  const bool IsReturn = OpC == PPC::BLR || OpC == PPC::BLR8;
+  const bool IsCounterBranch = OpC == PPC::BCTR || OpC == PPC::BCTR8 ||
+                               OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
+
+  return IsDirectBranch || IsReturn || IsCounterBranch;
+}
+
+/// Decompose the compare instruction \p MI.
+///
+/// For the immediate compares (CMPWI, CMPLWI, CMPDI, CMPLDI) \p SrcReg is
+/// operand 1, \p SrcReg2 is 0, \p Value is the immediate in operand 2 and
+/// \p Mask is 0xFFFF. For the register compares (CMPW, CMPLW, CMPD, CMPLD,
+/// FCMPUS, FCMPUD) \p SrcReg and \p SrcReg2 are operands 1 and 2, and
+/// \p Mask and \p Value are not written.
+///
+/// \returns true if \p MI is one of those compares, false otherwise (no
+/// output is written in that case).
+bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                  unsigned &SrcReg2, int &Mask,
+                                  int &Value) const {
+  const unsigned Opc = MI.getOpcode();
+
+  const bool IsImmCompare = Opc == PPC::CMPWI || Opc == PPC::CMPLWI ||
+                            Opc == PPC::CMPDI || Opc == PPC::CMPLDI;
+  const bool IsRegCompare = Opc == PPC::CMPW || Opc == PPC::CMPLW ||
+                            Opc == PPC::CMPD || Opc == PPC::CMPLD ||
+                            Opc == PPC::FCMPUS || Opc == PPC::FCMPUD;
+
+  if (!IsImmCompare && !IsRegCompare)
+    return false;
+
+  SrcReg = MI.getOperand(1).getReg();
+  if (IsImmCompare) {
+    SrcReg2 = 0;
+    Value = MI.getOperand(2).getImm();
+    Mask = 0xFFFF;
+  } else {
+    SrcReg2 = MI.getOperand(2).getReg();
+  }
+  return true;
+}
+
+bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int Mask, int Value,
+ const MachineRegisterInfo *MRI) const {
+ if (DisableCmpOpt)
+ return false;
+
+ int OpC = CmpInstr.getOpcode();
+ unsigned CRReg = CmpInstr.getOperand(0).getReg();
+
+ // FP record forms set CR1 based on the exception status bits, not a
+ // comparison with zero.
+ if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD)
+ return false;
+
+ // The record forms set the condition register based on a signed comparison
+ // with zero (so says the ISA manual). This is not as straightforward as it
+ // seems, however, because this is always a 64-bit comparison on PPC64, even
+ // for instructions that are 32-bit in nature (like slw for example).
+ // So, on PPC32, for unsigned comparisons, we can use the record forms only
+ // for equality checks (as those don't depend on the sign). On PPC64,
+ // we are restricted to equality for unsigned 64-bit comparisons and for
+ // signed 32-bit comparisons the applicability is more restricted.
+ bool isPPC64 = Subtarget.isPPC64();
+ bool is32BitSignedCompare = OpC == PPC::CMPWI || OpC == PPC::CMPW;
+ bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW;
+ bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD;
+
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI) return false;
+ int MIOpC = MI->getOpcode();
+
+ bool equalityOnly = false;
+ bool noSub = false;
+ if (isPPC64) {
+ if (is32BitSignedCompare) {
+ // We can perform this optimization only if MI is sign-extending.
+ if (MIOpC == PPC::SRAW || MIOpC == PPC::SRAWo ||
+ MIOpC == PPC::SRAWI || MIOpC == PPC::SRAWIo ||
+ MIOpC == PPC::EXTSB || MIOpC == PPC::EXTSBo ||
+ MIOpC == PPC::EXTSH || MIOpC == PPC::EXTSHo ||
+ MIOpC == PPC::EXTSW || MIOpC == PPC::EXTSWo) {
+ noSub = true;
+ } else
+ return false;
+ } else if (is32BitUnsignedCompare) {
+ // 32-bit rotate and mask instructions are zero extending only if MB <= ME
+ bool isZeroExtendingRotate =
+ (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINMo ||
+ MIOpC == PPC::RLWNM || MIOpC == PPC::RLWNMo)
+ && MI->getOperand(3).getImm() <= MI->getOperand(4).getImm();
+
+ // We can perform this optimization, equality only, if MI is
+ // zero-extending.
+ if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo ||
+ MIOpC == PPC::SLW || MIOpC == PPC::SLWo ||
+ MIOpC == PPC::SRW || MIOpC == PPC::SRWo ||
+ isZeroExtendingRotate) {
+ noSub = true;
+ equalityOnly = true;
+ } else
+ return false;
+ } else
+ equalityOnly = is64BitUnsignedCompare;
+ } else
+ equalityOnly = is32BitUnsignedCompare;
+
+ if (equalityOnly) {
+ // We need to check the uses of the condition register in order to reject
+ // non-equality comparisons.
+ for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg),
+ IE = MRI->use_instr_end(); I != IE; ++I) {
+ MachineInstr *UseMI = &*I;
+ if (UseMI->getOpcode() == PPC::BCC) {
+ unsigned Pred = UseMI->getOperand(0).getImm();
+ if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE)
+ return false;
+ } else if (UseMI->getOpcode() == PPC::ISEL ||
+ UseMI->getOpcode() == PPC::ISEL8) {
+ unsigned SubIdx = UseMI->getOperand(3).getSubReg();
+ if (SubIdx != PPC::sub_eq)
+ return false;
+ } else
+ return false;
+ }
+ }
+
+ MachineBasicBlock::iterator I = CmpInstr;
+
+ // Scan forward to find the first use of the compare.
+ for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL;
+ ++I) {
+ bool FoundUse = false;
+ for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg),
+ JE = MRI->use_instr_end(); J != JE; ++J)
+ if (&*J == &*I) {
+ FoundUse = true;
+ break;
+ }
+
+ if (FoundUse)
+ break;
+ }
+
+ // There are two possible candidates which can be changed to set CR[01].
+ // One is MI, the other is a SUB instruction.
+ // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+ MachineInstr *Sub = nullptr;
+ if (SrcReg2 != 0)
+ // MI is not a candidate for CMPrr.
+ MI = nullptr;
+ // FIXME: Conservatively refuse to convert an instruction which isn't in the
+ // same BB as the comparison. This is to allow the check below to avoid calls
+ // (and other explicit clobbers); instead we should really check for these
+ // more explicitly (in at least a few predecessors).
+ else if (MI->getParent() != CmpInstr.getParent() || Value != 0) {
+ // PPC does not have a record-form SUBri.
+ return false;
+ }
+
+ // Search for Sub.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ --I;
+
+ // Get ready to iterate backward from CmpInstr.
+ MachineBasicBlock::iterator E = MI, B = CmpInstr.getParent()->begin();
+
+ for (; I != E && !noSub; --I) {
+ const MachineInstr &Instr = *I;
+ unsigned IOpC = Instr.getOpcode();
+
+ if (&*I != &CmpInstr && (Instr.modifiesRegister(PPC::CR0, TRI) ||
+ Instr.readsRegister(PPC::CR0, TRI)))
+ // This instruction modifies or uses the record condition register after
+ // the one we want to change. While we could do this transformation, it
+ // would likely not be profitable. This transformation removes one
+ // instruction, and so even forcing RA to generate one move probably
+ // makes it unprofitable.
+ return false;
+
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if ((OpC == PPC::CMPW || OpC == PPC::CMPLW ||
+ OpC == PPC::CMPD || OpC == PPC::CMPLD) &&
+ (IOpC == PPC::SUBF || IOpC == PPC::SUBF8) &&
+ ((Instr.getOperand(1).getReg() == SrcReg &&
+ Instr.getOperand(2).getReg() == SrcReg2) ||
+ (Instr.getOperand(1).getReg() == SrcReg2 &&
+ Instr.getOperand(2).getReg() == SrcReg))) {
+ Sub = &*I;
+ break;
+ }
+
+ if (I == B)
+ // The 'and' is below the comparison instruction.
+ return false;
+ }
+
+ // Return false if no candidates exist.
+ if (!MI && !Sub)
+ return false;
+
+ // The single candidate is called MI.
+ if (!MI) MI = Sub;
+
+ int NewOpC = -1;
+ MIOpC = MI->getOpcode();
+ if (MIOpC == PPC::ANDIo || MIOpC == PPC::ANDIo8)
+ NewOpC = MIOpC;
+ else {
+ NewOpC = PPC::getRecordFormOpcode(MIOpC);
+ if (NewOpC == -1 && PPC::getNonRecordFormOpcode(MIOpC) != -1)
+ NewOpC = MIOpC;
+ }
+
+ // FIXME: On the non-embedded POWER architectures, only some of the record
+ // forms are fast, and we should use only the fast ones.
+
+ // The defining instruction has a record form (or is already a record
+ // form). It is possible, however, that we'll need to reverse the condition
+ // code of the users.
+ if (NewOpC == -1)
+ return false;
+
+ SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate;
+ SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate;
+
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP
+ // needs to be updated to be based on SUB. Push the condition code
+ // operands to OperandsToUpdate. If it is safe to remove CmpInstr, the
+ // condition code of these operands will be modified.
+ bool ShouldSwap = false;
+ if (Sub) {
+ ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg;
+
+ // The operands to subf are the opposite of sub, so only in the fixed-point
+ // case, invert the order.
+ ShouldSwap = !ShouldSwap;
+ }
+
+ if (ShouldSwap)
+ for (MachineRegisterInfo::use_instr_iterator
+ I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end();
+ I != IE; ++I) {
+ MachineInstr *UseMI = &*I;
+ if (UseMI->getOpcode() == PPC::BCC) {
+ PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm();
+ assert((!equalityOnly ||
+ Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE) &&
+ "Invalid predicate for equality-only optimization");
+ PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+ PPC::getSwappedPredicate(Pred)));
+ } else if (UseMI->getOpcode() == PPC::ISEL ||
+ UseMI->getOpcode() == PPC::ISEL8) {
+ unsigned NewSubReg = UseMI->getOperand(3).getSubReg();
+ assert((!equalityOnly || NewSubReg == PPC::sub_eq) &&
+ "Invalid CR bit for equality-only optimization");
+
+ if (NewSubReg == PPC::sub_lt)
+ NewSubReg = PPC::sub_gt;
+ else if (NewSubReg == PPC::sub_gt)
+ NewSubReg = PPC::sub_lt;
+
+ SubRegsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(3)),
+ NewSubReg));
+ } else // We need to abort on a user we don't understand.
+ return false;
+ }
+
+ // Create a new virtual register to hold the value of the CR set by the
+ // record-form instruction. If the instruction was not previously in
+ // record form, then set the kill flag on the CR.
+ CmpInstr.eraseFromParent();
+
+ MachineBasicBlock::iterator MII = MI;
+ BuildMI(*MI->getParent(), std::next(MII), MI->getDebugLoc(),
+ get(TargetOpcode::COPY), CRReg)
+ .addReg(PPC::CR0, MIOpC != NewOpC ? RegState::Kill : 0);
+
+ // Even if CR0 register were dead before, it is alive now since the
+ // instruction we just built uses it.
+ MI->clearRegisterDeads(PPC::CR0);
+
+ if (MIOpC != NewOpC) {
+ // We need to be careful here: we're replacing one instruction with
+ // another, and we need to make sure that we get all of the right
+ // implicit uses and defs. On the other hand, the caller may be holding
+ // an iterator to this instruction, and so we can't delete it (this is
+ // specifically the case if this is the instruction directly after the
+ // compare).
+
+ const MCInstrDesc &NewDesc = get(NewOpC);
+ MI->setDesc(NewDesc);
+
+ if (NewDesc.ImplicitDefs)
+ for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs();
+ *ImpDefs; ++ImpDefs)
+ if (!MI->definesRegister(*ImpDefs))
+ MI->addOperand(*MI->getParent()->getParent(),
+ MachineOperand::CreateReg(*ImpDefs, true, true));
+ if (NewDesc.ImplicitUses)
+ for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses();
+ *ImpUses; ++ImpUses)
+ if (!MI->readsRegister(*ImpUses))
+ MI->addOperand(*MI->getParent()->getParent(),
+ MachineOperand::CreateReg(*ImpUses, false, true));
+ }
+ assert(MI->definesRegister(PPC::CR0) &&
+ "Record-form instruction does not define cr0?");
+
+ // Modify the condition code of operands in OperandsToUpdate.
+ // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+ // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ for (unsigned i = 0, e = PredsToUpdate.size(); i < e; i++)
+ PredsToUpdate[i].first->setImm(PredsToUpdate[i].second);
+
+ for (unsigned i = 0, e = SubRegsToUpdate.size(); i < e; i++)
+ SubRegsToUpdate[i].first->setSubReg(SubRegsToUpdate[i].second);
+
+ return true;
+}
+
+/// Report the maximum number of bytes of code that \p MI may occupy.
+/// Inline asm is measured from its asm string; stackmaps and patchpoints
+/// report their patch-byte count; all other opcodes use the descriptor size.
+unsigned PPCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  case PPC::INLINEASM: {
+    const char *AsmText = MI.getOperand(0).getSymbolName();
+    const MachineFunction &Fn = *MI.getParent()->getParent();
+    return getInlineAsmLength(AsmText, *Fn.getTarget().getMCAsmInfo());
+  }
+  case TargetOpcode::STACKMAP:
+    return StackMapOpers(&MI).getNumPatchBytes();
+  case TargetOpcode::PATCHPOINT:
+    return PatchPointOpers(&MI).getNumPatchBytes();
+  default:
+    break;
+  }
+  // Ordinary instruction: the size is recorded in its MCInstrDesc.
+  return get(Opc).getSize();
+}
+
+std::pair<unsigned, unsigned> // {TF & MO_ACCESS_MASK, all remaining bits of TF}
+PPCInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+  const unsigned AccessMask = PPCII::MO_ACCESS_MASK;
+  return {TF & AccessMask, TF & ~AccessMask};
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+PPCInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+  // Each direct operand target flag paired with its textual name.
+  static const std::pair<unsigned, const char *> DirectFlagNames[] = {
+      {PPCII::MO_LO, "ppc-lo"},
+      {PPCII::MO_HA, "ppc-ha"},
+      {PPCII::MO_TPREL_LO, "ppc-tprel-lo"},
+      {PPCII::MO_TPREL_HA, "ppc-tprel-ha"},
+      {PPCII::MO_DTPREL_LO, "ppc-dtprel-lo"},
+      {PPCII::MO_TLSLD_LO, "ppc-tlsld-lo"},
+      {PPCII::MO_TOC_LO, "ppc-toc-lo"},
+      {PPCII::MO_TLS, "ppc-tls"}};
+  return makeArrayRef(DirectFlagNames);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+  // Each bitmask operand target flag paired with its textual name.
+  static const std::pair<unsigned, const char *> BitmaskFlagNames[] = {
+      {PPCII::MO_PLT, "ppc-plt"},
+      {PPCII::MO_PIC_FLAG, "ppc-pic"},
+      {PPCII::MO_NLP_FLAG, "ppc-nlp"},
+      {PPCII::MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}};
+  return makeArrayRef(BitmaskFlagNames);
+}
+
+bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  // Rewrites the pseudos handled below in place and returns true; every
+  // other opcode is left untouched and false is returned.
+  const unsigned PseudoOpc = MI.getOpcode();
+  if (PseudoOpc == TargetOpcode::LOAD_STACK_GUARD) {
+    assert(Subtarget.isTargetLinux() &&
+           "Only Linux target is expected to contain LOAD_STACK_GUARD");
+    // Turn the pseudo into LD (64-bit) or LWZ (32-bit) and append the fixed
+    // offset and base register (X13 or R2) as its operands.
+    const bool Is64Bit = Subtarget.isPPC64();
+    const int64_t GuardOffset = Is64Bit ? -0x7010 : -0x7008;
+    const unsigned BaseReg = Is64Bit ? PPC::X13 : PPC::R2;
+    MI.setDesc(get(Is64Bit ? PPC::LD : PPC::LWZ));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+        .addImm(GuardOffset)
+        .addReg(BaseReg);
+    return true;
+  }
+
+  // D-form load/store pseudos: look up the two real opcodes each one may
+  // become. Any opcode not listed here is not expanded.
+  unsigned UpperOpc, LowerOpc;
+  switch (PseudoOpc) {
+  default:
+    return false;
+  case PPC::DFLOADf32:
+    UpperOpc = PPC::LXSSP;
+    LowerOpc = PPC::LFS;
+    break;
+  case PPC::DFLOADf64:
+    UpperOpc = PPC::LXSD;
+    LowerOpc = PPC::LFD;
+    break;
+  case PPC::DFSTOREf32:
+    UpperOpc = PPC::STXSSP;
+    LowerOpc = PPC::STFS;
+    break;
+  case PPC::DFSTOREf64:
+    UpperOpc = PPC::STXSD;
+    LowerOpc = PPC::STFD;
+    break;
+  }
+  assert(Subtarget.hasP9Vector() &&
+         "Invalid D-Form Pseudo-ops on non-P9 target.");
+  // Operand 0 in F0-F31 or VSL0-VSL31 selects the lower opcode.
+  const unsigned Reg0 = MI.getOperand(0).getReg();
+  const bool UseLower = (Reg0 >= PPC::F0 && Reg0 <= PPC::F31) ||
+                        (Reg0 >= PPC::VSL0 && Reg0 <= PPC::VSL31);
+  MI.setDesc(get(UseLower ? LowerOpc : UpperOpc));
+  return true;
+}
+
+const TargetRegisterClass * // VRRC is replaced by VSRC when VSX is available
+PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const {
+  const bool UseVSRC =
+      RC == &PPC::VRRCRegClass && Subtarget.hasVSX();
+  return UseVSRC ? &PPC::VSRCRegClass : RC;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
new file mode 100644
index 000000000000..32b2f009a3f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -0,0 +1,297 @@
+//===-- PPCInstrInfo.h - PowerPC Instruction Information --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H
+
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "PPCGenInstrInfo.inc"
+
+namespace llvm {
+
+/// PPCII - This namespace holds all of the PowerPC target-specific
+/// per-instruction flags. These must match the corresponding definitions in
+/// PPC.td and PPCInstrFormats.td.
+namespace PPCII {
+enum {
+ // PPC970 Instruction Flags. These flags describe the characteristics of the
+ // PowerPC 970 (aka G5) dispatch groups and how they are formed out of
+ // raw machine instructions.
+
+ /// PPC970_First - This instruction starts a new dispatch group, so it will
+ /// always be the first one in the group.
+ PPC970_First = 0x1,
+
+ /// PPC970_Single - This instruction starts a new dispatch group and
+ /// terminates it, so it will be the sole instruction in the group.
+ PPC970_Single = 0x2,
+
+ /// PPC970_Cracked - This instruction is cracked into two pieces, requiring
+ /// two dispatch pipes to be available to issue.
+ PPC970_Cracked = 0x4,
+
+ /// PPC970_Shift/Mask - Position and bitmask of the 3-bit field (bits 3-5)
+ /// that selects the pipeline type an instruction is issued to.
+ PPC970_Shift = 3,
+ PPC970_Mask = 0x07 << PPC970_Shift
+};
+enum PPC970_Unit {
+ /// These are the various PPC970 execution unit pipelines, already shifted
+ /// into the PPC970_Mask field. Each instruction is one of these.
+ PPC970_Pseudo = 0 << PPC970_Shift, // Pseudo instruction
+ PPC970_FXU = 1 << PPC970_Shift, // Fixed Point (aka Integer/ALU) Unit
+ PPC970_LSU = 2 << PPC970_Shift, // Load Store Unit
+ PPC970_FPU = 3 << PPC970_Shift, // Floating Point Unit
+ PPC970_CRU = 4 << PPC970_Shift, // Control Register Unit
+ PPC970_VALU = 5 << PPC970_Shift, // Vector ALU
+ PPC970_VPERM = 6 << PPC970_Shift, // Vector Permute Unit
+ PPC970_BRU = 7 << PPC970_Shift // Branch Unit
+};
+
+enum {
+ /// Shift count to bypass the PPC970 flags, which occupy the low six bits.
+ NewDef_Shift = 6,
+
+ /// Marks a VSX instruction that uses VSX registers (vs0-vs63) instead of
+ /// VMX registers (v0-v31).
+ UseVSXReg = 0x1 << NewDef_Shift
+};
+} // end namespace PPCII
+
+class PPCSubtarget;
+class PPCInstrInfo : public PPCGenInstrInfo {
+ PPCSubtarget &Subtarget;
+ const PPCRegisterInfo RI;
+
+ bool StoreRegToStackSlot(MachineFunction &MF,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs,
+ bool &NonRI, bool &SpillsVRS) const;
+ bool LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr *> &NewMIs,
+ bool &NonRI, bool &SpillsVRS) const;
+ virtual void anchor();
+
+protected:
+ /// Commutes the operands in the given instruction.
+ /// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+ ///
+ /// Do not call this method for a non-commutable instruction or for
+ /// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+ /// Even though the instruction is commutable, the method may still
+ /// fail to commute the operands, null pointer is returned in such cases.
+ ///
+ /// For example, we can commute rlwimi instructions, but only if the
+ /// rotate amt is zero. We also have to munge the immediates a bit.
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
+
+public:
+ explicit PPCInstrInfo(PPCSubtarget &STI);
+
+ /// getRegisterInfo - Return the PPCRegisterInfo instance owned by this
+ /// object (the RI member), so that any client holding the instruction info
+ /// can always get at the register info as well.
+ ///
+ const PPCRegisterInfo &getRegisterInfo() const { return RI; }
+
+ ScheduleHazardRecognizer *
+ CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+ const ScheduleDAG *DAG) const override;
+ ScheduleHazardRecognizer *
+ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const override;
+
+ unsigned getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &MI,
+ unsigned *PredCost = nullptr) const override;
+
+ int getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const override;
+ int getOperandLatency(const InstrItineraryData *ItinData,
+ SDNode *DefNode, unsigned DefIdx,
+ SDNode *UseNode, unsigned UseIdx) const override {
+ return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx,
+ UseNode, UseIdx);
+ }
+
+ bool hasLowDefLatency(const TargetSchedModel &SchedModel,
+ const MachineInstr &DefMI,
+ unsigned DefIdx) const override {
+ // Machine LICM should hoist all instructions in low-register-pressure
+ // situations; none are sufficiently free to justify leaving in a loop
+ // body.
+ return false;
+ }
+
+ bool useMachineCombiner() const override {
+ return true;
+ }
+
+ /// Return true when there is potentially a faster code sequence
+ /// for an instruction chain ending in \p Root. All potential patterns are
+ /// output in \p P.
+ bool getMachineCombinerPatterns(
+ MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &P) const override;
+
+ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
+ bool isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const override;
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
+
+ void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+
+ // Branch analysis.
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+ // Select analysis.
+ bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
+ unsigned, unsigned, int &, int &, int &) const override;
+ void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ MachineRegisterInfo *MRI) const override;
+
+ // If conversion by predication (only supported by some branch instructions).
+ // The inline profitability checks below return true (predicated branches
+ // are considered always profitable), except isProfitableToUnpredicate.
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB,
+ unsigned NumCycles, unsigned ExtraPredCycles,
+ BranchProbability Probability) const override {
+ return true;
+ }
+
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumT, unsigned ExtraT,
+ MachineBasicBlock &FMBB,
+ unsigned NumF, unsigned ExtraF,
+ BranchProbability Probability) const override;
+
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ BranchProbability Probability) const override {
+ return true;
+ }
+
+ bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+ MachineBasicBlock &FMBB) const override {
+ return false;
+ }
+
+ // Predication support.
+ bool isPredicated(const MachineInstr &MI) const override;
+
+ bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+
+ bool PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const override;
+
+ bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+ ArrayRef<MachineOperand> Pred2) const override;
+
+ bool DefinesPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred) const override;
+
+ bool isPredicable(MachineInstr &MI) const override;
+
+ // Comparison optimization.
+
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask, int &Value) const override;
+
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int Mask, int Value,
+ const MachineRegisterInfo *MRI) const override;
+
+ /// getInstSizeInBytes - Return the number of bytes of code the specified
+ /// instruction may be. This returns the maximum number of bytes.
+ ///
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableBitmaskMachineOperandTargetFlags() const override;
+
+ // Lower pseudo instructions after register allocation.
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ static bool isVFRegister(unsigned Reg) { // true for VF0..VF31
+ return Reg >= PPC::VF0 && Reg <= PPC::VF31;
+ }
+ static bool isVRRegister(unsigned Reg) { // true for V0..V31
+ return Reg >= PPC::V0 && Reg <= PPC::V31;
+ }
+ const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
new file mode 100644
index 000000000000..a7231bd2e2c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -0,0 +1,4403 @@
+//===-- PPCInstrInfo.td - The PowerPC Instruction Set ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the subset of the 32-bit PowerPC instruction set, as used
+// by the PowerPC instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+include "PPCInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific type constraints.
+//
+def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
+ SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPClfiwx : SDTypeProfile<1, 1, [ // lfiw[az]x
+ SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCLxsizx : SDTypeProfile<1, 2, [
+ SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
+def SDT_PPCstxsix : SDTypeProfile<0, 3, [
+ SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
+def SDT_PPCVexts : SDTypeProfile<1, 2, [
+ SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
+]>;
+
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+def SDT_PPCvperm : SDTypeProfile<1, 3, [
+ SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
+]>;
+
+def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisInt<2>
+]>;
+
+def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+]>;
+
+def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+]>;
+
+def SDT_PPCvcmp : SDTypeProfile<1, 3, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
+]>;
+
+def SDT_PPCcondbr : SDTypeProfile<0, 3, [
+ SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPClbrx : SDTypeProfile<1, 2, [
+ SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+def SDT_PPCstbrx : SDTypeProfile<0, 3, [
+ SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
+ SDTCisPtrTy<0>, SDTCisVT<1, i32>
+]>;
+
+def tocentry32 : Operand<iPTR> {
+ let MIOperandInfo = (ops i32imm:$imm);
+}
+
+def SDT_PPCqvfperm : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVec<3>
+]>;
+def SDT_PPCqvgpci : SDTypeProfile<1, 1, [
+ SDTCisVec<0>, SDTCisInt<1>
+]>;
+def SDT_PPCqvaligni : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>
+]>;
+def SDT_PPCqvesplati : SDTypeProfile<1, 2, [
+ SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>
+]>;
+
+def SDT_PPCqbflt : SDTypeProfile<1, 1, [
+ SDTCisVec<0>, SDTCisVec<1>
+]>;
+
+def SDT_PPCqvlfsb : SDTypeProfile<1, 1, [
+ SDTCisVec<0>, SDTCisPtrTy<1>
+]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific DAG Nodes.
+//
+
+def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>;
+def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
+
+def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>;
+def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>;
+def PPCfcfids : SDNode<"PPCISD::FCFIDS", SDTFPRoundOp, []>;
+def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>;
+def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
+def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
+def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
+def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
+ [SDNPHasChain, SDNPMayStore]>;
+def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
+ [SDNPHasChain, SDNPMayLoad]>;
+def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx,
+ [SDNPHasChain, SDNPMayLoad]>;
+def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx,
+ [SDNPHasChain, SDNPMayLoad]>;
+def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix,
+ [SDNPHasChain, SDNPMayStore]>;
+def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>;
+
+// Extract FPSCR (not modeled at the DAG level).
+def PPCmffs : SDNode<"PPCISD::MFFS",
+ SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, []>;
+
+// Perform FADD in round-to-zero mode.
+def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>;
+
+
+def PPCfsel : SDNode<"PPCISD::FSEL",
+ // Type constraint for fsel.
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
+ SDTCisFP<0>, SDTCisVT<1, f64>]>, []>;
+
+def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
+def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
+def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp,
+ [SDNPMayLoad, SDNPMemOperand]>;
+def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
+def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
+
+def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>;
+
+def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>;
+def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp,
+ [SDNPMayLoad]>;
+def PPCaddTls : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>;
+def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>;
+def PPCaddiTlsgdL : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>;
+def PPCgetTlsAddr : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>;
+def PPCaddiTlsgdLAddr : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR",
+ SDTypeProfile<1, 3, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
+def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
+def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
+def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;
+def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR",
+ SDTypeProfile<1, 3, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
+def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>;
+def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
+
+def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
+def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
+def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
+def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
+
+def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
+def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>;
+def PPCqvaligni : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>;
+def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>;
+
+def PPCqbflt : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>;
+
+def PPCqvlfsb : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb,
+ [SDNPHasChain, SDNPMayLoad]>;
+
+def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>;
+
+// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
+// amounts. These nodes are generated by the multi-precision shift code.
+def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>;
+def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>;
+def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def PPCcall : SDNode<"PPCISD::CALL", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def PPCcall_nop : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC",
+ SDTypeProfile<0, 1, []>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
+def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def PPCeh_sjlj_setjmp : SDNode<"PPCISD::EH_SJLJ_SETJMP",
+ SDTypeProfile<1, 1, [SDTCisInt<0>,
+ SDTCisPtrTy<1>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP",
+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def PPCsc : SDNode<"PPCISD::SC", SDT_PPCsc,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def PPCclrbhrb : SDNode<"PPCISD::CLRBHRB", SDTNone,
+ [SDNPHasChain, SDNPSideEffect]>;
+def PPCmfbhrbe : SDNode<"PPCISD::MFBHRBE", SDTIntBinOp, [SDNPHasChain]>;
+def PPCrfebb : SDNode<"PPCISD::RFEBB", SDT_PPCsc,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
+def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
+
+def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
+ [SDNPHasChain, SDNPMayLoad]>;
+def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
+ [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to set/unset CR bit 6 for SVR4 vararg calls
+def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+// Instructions to support dynamic alloca.
+def SDTDynOp : SDTypeProfile<1, 2, []>;
+def SDTDynAreaOp : SDTypeProfile<1, 1, []>;
+def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
+def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific transformation functions and pattern fragments.
+//
+
+def SHL32 : SDNodeXForm<imm, [{
+ // Transformation function: 31 - imm
+ return getI32Imm(31 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+def SRL32 : SDNodeXForm<imm, [{
+ // Transformation function: 32 - imm (an imm of 0 maps to 0, not 32)
+ return N->getZExtValue() ? getI32Imm(32 - N->getZExtValue(), SDLoc(N))
+ : getI32Imm(0, SDLoc(N));
+}]>;
+
+def LO16 : SDNodeXForm<imm, [{
+ // Transformation function: get the low 16 bits.
+ return getI32Imm((unsigned short)N->getZExtValue(), SDLoc(N));
+}]>;
+
+def HI16 : SDNodeXForm<imm, [{
+ // Transformation function: shift the immediate value down into the low bits.
+ return getI32Imm((unsigned)N->getZExtValue() >> 16, SDLoc(N));
+}]>;
+
+def HA16 : SDNodeXForm<imm, [{
+ // Transformation function: high 16 bits, adjusted for the sign of the low 16.
+ int Val = N->getZExtValue();
+ return getI32Imm((Val - (signed short)Val) >> 16, SDLoc(N));
+}]>;
+def MB : SDNodeXForm<imm, [{
+ // Transformation function: get the start bit of a mask
+ unsigned mb = 0, me;
+ (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+ return getI32Imm(mb, SDLoc(N));
+}]>;
+
+def ME : SDNodeXForm<imm, [{
+ // Transformation function: get the end bit of a mask
+ unsigned mb, me = 0;
+ (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+ return getI32Imm(me, SDLoc(N));
+}]>;
+def maskimm32 : PatLeaf<(imm), [{
+  // maskimm32 predicate - True only for an i32 immediate that is a run of
+  // ones; the mask bounds computed by isRunOfOnes are discarded.
+  if (N->getValueType(0) != MVT::i32)
+    return false;
+  unsigned MaskBegin, MaskEnd;
+  return isRunOfOnes((unsigned)N->getZExtValue(), MaskBegin, MaskEnd);
+}]>;
+
+def imm32SExt16 : Operand<i32>, ImmLeaf<i32, [{
+ // imm32SExt16 predicate - True if the i32 immediate fits in a 16-bit
+ // sign extended field. Used by instructions like 'addi'.
+ return (int32_t)Imm == (short)Imm;
+}]>;
+def imm64SExt16 : Operand<i64>, ImmLeaf<i64, [{
+ // imm64SExt16 predicate - True if the i64 immediate fits in a 16-bit
+ // sign extended field. Used by instructions like 'addi'.
+ return (int64_t)Imm == (short)Imm;
+}]>;
+def immZExt16 : PatLeaf<(imm), [{
+ // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+ // field. Used by instructions like 'ori'.
+ return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+def immAnyExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm) || isUInt<8>(Imm); }]>;
+def immSExt5NonZero : ImmLeaf<i32, [{ return Imm && isInt<5>(Imm); }]>;
+
+// imm16Shifted* - These match immediates where the low 16-bits are zero. There
+// are two forms: imm16ShiftedSExt and imm16ShiftedZExt. These two forms are
+// identical in 32-bit mode, but in 64-bit mode, they return true if the
+// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
+// clear).
+def imm16ShiftedZExt : PatLeaf<(imm), [{
+ // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the
+ // immediate are set. Used by instructions like 'xoris'.
+ return (N->getZExtValue() & ~uint64_t(0xFFFF0000)) == 0;
+}], HI16>;
+
+def imm16ShiftedSExt : PatLeaf<(imm), [{
+ // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
+ // immediate are set. Used by instructions like 'addis'. Identical to
+ // imm16ShiftedZExt in 32-bit mode.
+ if (N->getZExtValue() & 0xFFFF) return false;
+ if (N->getValueType(0) == MVT::i32)
+ return true;
+ // For 64-bit, make sure it is sext right.
+ return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
+}], HI16>;
+
+def imm64ZExt32 : Operand<i64>, ImmLeaf<i64, [{
+ // imm64ZExt32 predicate - True if the i64 immediate fits in a 32-bit
+ // zero extended field.
+ return isUInt<32>(Imm);
+}]>;
+
+// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
+// restricted memrix (4-aligned) constants are alignment sensitive. If these
+// offsets are hidden behind TOC entries then the values of the lower-order
+// bits cannot be checked directly. As a result, we need to also incorporate
+// an alignment check into the relevant patterns.
+
+// Load/store fragments that match only when the node reports an alignment of
+// at least 4.
+def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  const auto *Ld = cast<LoadSDNode>(N);
+  return Ld->getAlignment() >= 4;
+}]>;
+def aligned4store : PatFrag<(ops node:$val, node:$ptr),
+                            (store node:$val, node:$ptr), [{
+  const auto *St = cast<StoreSDNode>(N);
+  return St->getAlignment() >= 4;
+}]>;
+def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  const auto *Ld = cast<LoadSDNode>(N);
+  return Ld->getAlignment() >= 4;
+}]>;
+def aligned4pre_store : PatFrag<
+    (ops node:$val, node:$base, node:$offset),
+    (pre_store node:$val, node:$base, node:$offset), [{
+  const auto *St = cast<StoreSDNode>(N);
+  return St->getAlignment() >= 4;
+}]>;
+
+// Load/store fragments that match only when the node reports an alignment
+// below 4.
+def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  const auto *Ld = cast<LoadSDNode>(N);
+  return Ld->getAlignment() < 4;
+}]>;
+def unaligned4store : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  const auto *St = cast<StoreSDNode>(N);
+  return St->getAlignment() < 4;
+}]>;
+def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  const auto *Ld = cast<LoadSDNode>(N);
+  return Ld->getAlignment() < 4;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Flag Definitions.
+
+// Mix-in class that sets the PPC64 bit on a record.
+class isPPC64 { bit PPC64 = 1; }
+// Mix-in class that sets the RC bit on a record.
+class isDOT { bit RC = 1; }
+
+// Mix-in class that sets the Constraints string to C.
+class RegConstraint<string C> {
+ string Constraints = C;
+}
+// Mix-in class that sets the DisableEncoding string to E.
+class NoEncode<string E> {
+ string DisableEncoding = E;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Operand Definitions.
+
+// In the default PowerPC assembler syntax, registers are specified simply
+// by number, so they cannot be distinguished from immediate values (without
+// looking at the opcode). This means that the default operand matching logic
+// for the asm parser does not work, and we need to specify custom matchers.
+// Since those can only be specified with RegisterOperand classes and not
+// directly on the RegisterClass, all instruction patterns used by the asm
+// parser need to use a RegisterOperand (instead of a RegisterClass) for
+// all their register operands.
+// For this purpose, we define one RegisterOperand for each RegisterClass,
+// using the same name as the class, just in lower case.
+
+// Each register class gets an AsmOperandClass (name + parser predicate) and a
+// RegisterOperand that selects it through ParserMatchClass.
+def PPCRegGPRCAsmOperand : AsmOperandClass {
+  let Name = "RegGPRC";
+  let PredicateMethod = "isRegNumber";
+}
+def gprc : RegisterOperand<GPRC> {
+  let ParserMatchClass = PPCRegGPRCAsmOperand;
+}
+
+def PPCRegG8RCAsmOperand : AsmOperandClass {
+  let Name = "RegG8RC";
+  let PredicateMethod = "isRegNumber";
+}
+def g8rc : RegisterOperand<G8RC> {
+  let ParserMatchClass = PPCRegG8RCAsmOperand;
+}
+
+def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
+  let Name = "RegGPRCNoR0";
+  let PredicateMethod = "isRegNumber";
+}
+def gprc_nor0 : RegisterOperand<GPRC_NOR0> {
+  let ParserMatchClass = PPCRegGPRCNoR0AsmOperand;
+}
+
+def PPCRegG8RCNoX0AsmOperand : AsmOperandClass {
+  let Name = "RegG8RCNoX0";
+  let PredicateMethod = "isRegNumber";
+}
+def g8rc_nox0 : RegisterOperand<G8RC_NOX0> {
+  let ParserMatchClass = PPCRegG8RCNoX0AsmOperand;
+}
+
+def PPCRegF8RCAsmOperand : AsmOperandClass {
+  let Name = "RegF8RC";
+  let PredicateMethod = "isRegNumber";
+}
+def f8rc : RegisterOperand<F8RC> {
+  let ParserMatchClass = PPCRegF8RCAsmOperand;
+}
+
+def PPCRegF4RCAsmOperand : AsmOperandClass {
+  let Name = "RegF4RC";
+  let PredicateMethod = "isRegNumber";
+}
+def f4rc : RegisterOperand<F4RC> {
+  let ParserMatchClass = PPCRegF4RCAsmOperand;
+}
+
+def PPCRegVRRCAsmOperand : AsmOperandClass {
+  let Name = "RegVRRC";
+  let PredicateMethod = "isRegNumber";
+}
+def vrrc : RegisterOperand<VRRC> {
+  let ParserMatchClass = PPCRegVRRCAsmOperand;
+}
+
+def PPCRegVFRCAsmOperand : AsmOperandClass {
+  let Name = "RegVFRC";
+  let PredicateMethod = "isRegNumber";
+}
+def vfrc : RegisterOperand<VFRC> {
+  let ParserMatchClass = PPCRegVFRCAsmOperand;
+}
+
+// Condition-register bits and fields use their own parser predicates.
+def PPCRegCRBITRCAsmOperand : AsmOperandClass {
+  let Name = "RegCRBITRC";
+  let PredicateMethod = "isCRBitNumber";
+}
+def crbitrc : RegisterOperand<CRBITRC> {
+  let ParserMatchClass = PPCRegCRBITRCAsmOperand;
+}
+
+def PPCRegCRRCAsmOperand : AsmOperandClass {
+  let Name = "RegCRRC";
+  let PredicateMethod = "isCCRegNumber";
+}
+def crrc : RegisterOperand<CRRC> {
+  let ParserMatchClass = PPCRegCRRCAsmOperand;
+}
+// crrc0 shares the CRRC matcher but is based on the CRRC0 register class.
+def crrc0 : RegisterOperand<CRRC0> {
+  let ParserMatchClass = PPCRegCRRCAsmOperand;
+}
+
+// Immediate operands. Each AsmOperandClass names the parser predicate and
+// render method; the matching Operand<i32> wires up the printer and, where
+// given, the encoder and decoder hooks.
+def PPCU1ImmAsmOperand : AsmOperandClass {
+  let Name = "U1Imm";
+  let PredicateMethod = "isU1Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u1imm : Operand<i32> {
+  let ParserMatchClass = PPCU1ImmAsmOperand;
+  let PrintMethod = "printU1ImmOperand";
+}
+
+def PPCU2ImmAsmOperand : AsmOperandClass {
+  let Name = "U2Imm";
+  let PredicateMethod = "isU2Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u2imm : Operand<i32> {
+  let ParserMatchClass = PPCU2ImmAsmOperand;
+  let PrintMethod = "printU2ImmOperand";
+}
+
+def PPCATBitsAsHintAsmOperand : AsmOperandClass {
+  let Name = "ATBitsAsHint";
+  let PredicateMethod = "isATBitsAsHint";
+  let RenderMethod = "addImmOperands"; // Irrelevant, predicate always fails.
+}
+def atimm : Operand<i32> {
+  let ParserMatchClass = PPCATBitsAsHintAsmOperand;
+  let PrintMethod = "printATBitsAsHint";
+}
+
+def PPCU3ImmAsmOperand : AsmOperandClass {
+  let Name = "U3Imm";
+  let PredicateMethod = "isU3Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u3imm : Operand<i32> {
+  let ParserMatchClass = PPCU3ImmAsmOperand;
+  let PrintMethod = "printU3ImmOperand";
+}
+
+def PPCU4ImmAsmOperand : AsmOperandClass {
+  let Name = "U4Imm";
+  let PredicateMethod = "isU4Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u4imm : Operand<i32> {
+  let ParserMatchClass = PPCU4ImmAsmOperand;
+  let PrintMethod = "printU4ImmOperand";
+}
+
+def PPCS5ImmAsmOperand : AsmOperandClass {
+  let Name = "S5Imm";
+  let PredicateMethod = "isS5Imm";
+  let RenderMethod = "addImmOperands";
+}
+def s5imm : Operand<i32> {
+  let ParserMatchClass = PPCS5ImmAsmOperand;
+  let PrintMethod = "printS5ImmOperand";
+  let DecoderMethod = "decodeSImmOperand<5>";
+}
+
+def PPCU5ImmAsmOperand : AsmOperandClass {
+  let Name = "U5Imm";
+  let PredicateMethod = "isU5Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u5imm : Operand<i32> {
+  let ParserMatchClass = PPCU5ImmAsmOperand;
+  let PrintMethod = "printU5ImmOperand";
+  let DecoderMethod = "decodeUImmOperand<5>";
+}
+
+def PPCU6ImmAsmOperand : AsmOperandClass {
+  let Name = "U6Imm";
+  let PredicateMethod = "isU6Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u6imm : Operand<i32> {
+  let ParserMatchClass = PPCU6ImmAsmOperand;
+  let PrintMethod = "printU6ImmOperand";
+  let DecoderMethod = "decodeUImmOperand<6>";
+}
+
+def PPCU7ImmAsmOperand : AsmOperandClass {
+  let Name = "U7Imm";
+  let PredicateMethod = "isU7Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u7imm : Operand<i32> {
+  let ParserMatchClass = PPCU7ImmAsmOperand;
+  let PrintMethod = "printU7ImmOperand";
+  let DecoderMethod = "decodeUImmOperand<7>";
+}
+
+def PPCU8ImmAsmOperand : AsmOperandClass {
+  let Name = "U8Imm";
+  let PredicateMethod = "isU8Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u8imm : Operand<i32> {
+  let ParserMatchClass = PPCU8ImmAsmOperand;
+  let PrintMethod = "printU8ImmOperand";
+  let DecoderMethod = "decodeUImmOperand<8>";
+}
+
+def PPCU10ImmAsmOperand : AsmOperandClass {
+  let Name = "U10Imm";
+  let PredicateMethod = "isU10Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u10imm : Operand<i32> {
+  let ParserMatchClass = PPCU10ImmAsmOperand;
+  let PrintMethod = "printU10ImmOperand";
+  let DecoderMethod = "decodeUImmOperand<10>";
+}
+
+def PPCU12ImmAsmOperand : AsmOperandClass {
+  let Name = "U12Imm";
+  let PredicateMethod = "isU12Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u12imm : Operand<i32> {
+  let ParserMatchClass = PPCU12ImmAsmOperand;
+  let PrintMethod = "printU12ImmOperand";
+  let DecoderMethod = "decodeUImmOperand<12>";
+}
+
+// The 16-bit immediates use dedicated render methods and share the
+// getImm16Encoding encoder.
+def PPCS16ImmAsmOperand : AsmOperandClass {
+  let Name = "S16Imm";
+  let PredicateMethod = "isS16Imm";
+  let RenderMethod = "addS16ImmOperands";
+}
+def s16imm : Operand<i32> {
+  let ParserMatchClass = PPCS16ImmAsmOperand;
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+
+def PPCU16ImmAsmOperand : AsmOperandClass {
+  let Name = "U16Imm";
+  let PredicateMethod = "isU16Imm";
+  let RenderMethod = "addU16ImmOperands";
+}
+def u16imm : Operand<i32> {
+  let ParserMatchClass = PPCU16ImmAsmOperand;
+  let PrintMethod = "printU16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let DecoderMethod = "decodeUImmOperand<16>";
+}
+
+def PPCS17ImmAsmOperand : AsmOperandClass {
+  let Name = "S17Imm";
+  let PredicateMethod = "isS17Imm";
+  let RenderMethod = "addS16ImmOperands";
+}
+// s17imm is used for addis/lis so that the assembler parser accepts
+// immediates in the range -65536..65535, for compatibility with the GNU
+// assembler. In every other respect the operand is handled as 16-bit.
+def s17imm : Operand<i32> {
+  let ParserMatchClass = PPCS17ImmAsmOperand;
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+
+// fpimm0 - Floating-point immediate leaf that matches only the value +0.0
+// (tested with isExactlyValue).
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+
+// Branch and call target operands. The abs* variants differ from their
+// counterparts only in the print and encoder methods they name.
+def PPCDirectBrAsmOperand : AsmOperandClass {
+  let Name = "DirectBr";
+  let PredicateMethod = "isDirectBr";
+  let RenderMethod = "addBranchTargetOperands";
+}
+def directbrtarget : Operand<OtherVT> {
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getDirectBrEncoding";
+}
+def absdirectbrtarget : Operand<OtherVT> {
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+}
+
+def PPCCondBrAsmOperand : AsmOperandClass {
+  let Name = "CondBr";
+  let PredicateMethod = "isCondBr";
+  let RenderMethod = "addBranchTargetOperands";
+}
+def condbrtarget : Operand<OtherVT> {
+  let ParserMatchClass = PPCCondBrAsmOperand;
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getCondBrEncoding";
+}
+def abscondbrtarget : Operand<OtherVT> {
+  let ParserMatchClass = PPCCondBrAsmOperand;
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsCondBrEncoding";
+}
+
+// Call targets reuse the direct-branch matcher and encoders but are typed
+// iPTR rather than OtherVT.
+def calltarget : Operand<iPTR> {
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getDirectBrEncoding";
+}
+def abscalltarget : Operand<iPTR> {
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+}
+// Condition-register bit-mask operand (i8) with its own print, encode and
+// decode hooks.
+def PPCCRBitMaskOperand : AsmOperandClass {
+  let Name = "CRBitMask";
+  let PredicateMethod = "isCRBitMask";
+}
+def crbitm : Operand<i8> {
+  let ParserMatchClass = PPCCRBitMaskOperand;
+  let PrintMethod = "printcrbitm";
+  let EncoderMethod = "get_crbitm_encoding";
+  let DecoderMethod = "decodeCRBitMOperand";
+}
+// Address operands
+
+// ptr_rc_nor0 - A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
+def PPCRegGxRCNoR0Operand : AsmOperandClass {
+  let Name = "RegGxRCNoR0";
+  let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> {
+  let ParserMatchClass = PPCRegGxRCNoR0Operand;
+}
+
+// ptr_rc_idx - A version of ptr_rc usable with the asm parser.
+def PPCRegGxRCOperand : AsmOperandClass {
+  let Name = "RegGxRC";
+  let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> {
+  let ParserMatchClass = PPCRegGxRCOperand;
+}
+
+// Displacement operands. They differ only in the parser predicate (and, for
+// dispRI, the render method) attached through the AsmOperandClass.
+def PPCDispRIOperand : AsmOperandClass {
+  let Name = "DispRI";
+  let PredicateMethod = "isS16Imm";
+  let RenderMethod = "addS16ImmOperands";
+}
+def dispRI : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIOperand;
+}
+
+def PPCDispRIXOperand : AsmOperandClass {
+  let Name = "DispRIX";
+  let PredicateMethod = "isS16ImmX4";
+  let RenderMethod = "addImmOperands";
+}
+def dispRIX : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIXOperand;
+}
+
+def PPCDispRIX16Operand : AsmOperandClass {
+  let Name = "DispRIX16";
+  let PredicateMethod = "isS16ImmX16";
+  let RenderMethod = "addImmOperands";
+}
+def dispRIX16 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIX16Operand;
+}
+
+def PPCDispSPE8Operand : AsmOperandClass {
+  let Name = "DispSPE8";
+  let PredicateMethod = "isU8ImmX8";
+  let RenderMethod = "addImmOperands";
+}
+def dispSPE8 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE8Operand;
+}
+
+def PPCDispSPE4Operand : AsmOperandClass {
+  let Name = "DispSPE4";
+  let PredicateMethod = "isU7ImmX4";
+  let RenderMethod = "addImmOperands";
+}
+def dispSPE4 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE4Operand;
+}
+
+def PPCDispSPE2Operand : AsmOperandClass {
+  let Name = "DispSPE2";
+  let PredicateMethod = "isU6ImmX2";
+  let RenderMethod = "addImmOperands";
+}
+def dispSPE2 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE2Operand;
+}
+
+// memri - displacement (dispRI) plus base register (ptr_rc_nor0).
+def memri : Operand<iPTR> {
+  let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
+  let PrintMethod = "printMemRegImm";
+  let EncoderMethod = "getMemRIEncoding";
+  let DecoderMethod = "decodeMemRIOperands";
+}
+// memrr - base register (ptr_rc_nor0) plus index register (ptr_rc_idx).
+def memrr : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
+  let PrintMethod = "printMemRegReg";
+}
+// memrix - memri where the imm is 4-aligned.
+def memrix : Operand<iPTR> {
+  let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
+  let PrintMethod = "printMemRegImm";
+  let EncoderMethod = "getMemRIXEncoding";
+  let DecoderMethod = "decodeMemRIXOperands";
+}
+// memrix16 - memri, imm is 16-aligned, 12-bit, Inst{16:27}
+def memrix16 : Operand<iPTR> {
+  let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg);
+  let PrintMethod = "printMemRegImm";
+  let EncoderMethod = "getMemRIX16Encoding";
+  let DecoderMethod = "decodeMemRIX16Operands";
+}
+// spe8dis - SPE displacement where the imm is 8-aligned.
+def spe8dis : Operand<iPTR> {
+  let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
+  let PrintMethod = "printMemRegImm";
+  let EncoderMethod = "getSPE8DisEncoding";
+}
+// spe4dis - SPE displacement where the imm is 4-aligned.
+def spe4dis : Operand<iPTR> {
+  let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
+  let PrintMethod = "printMemRegImm";
+  let EncoderMethod = "getSPE4DisEncoding";
+}
+// spe2dis - SPE displacement where the imm is 2-aligned.
+def spe2dis : Operand<iPTR> {
+  let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
+  let PrintMethod = "printMemRegImm";
+  let EncoderMethod = "getSPE2DisEncoding";
+}
+
+// memr - A single-register address, used with the SjLj pseudo-instructions.
+def memr : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc:$ptrreg);
+}
+
+// TLS operands for 32-bit code.
+def PPCTLSRegOperand : AsmOperandClass {
+  let Name = "TLSReg";
+  let PredicateMethod = "isTLSReg";
+  let RenderMethod = "addTLSRegOperands";
+}
+def tlsreg32 : Operand<i32> {
+  let ParserMatchClass = PPCTLSRegOperand;
+  let EncoderMethod = "getTLSRegEncoding";
+}
+def tlsgd32 : Operand<i32> {}
+// tlscall32 bundles a call target with a tlsgd32 symbol operand.
+def tlscall32 : Operand<i32> {
+  let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym);
+  let PrintMethod = "printTLSCall";
+  let EncoderMethod = "getTLSCallEncoding";
+}
+
+// PowerPC Predicate operand.
+// Composite operand made of an i32 immediate ($bibo) and a condition
+// register ($reg), printed through printPredicateOperand.
+def pred : Operand<OtherVT> {
+ let PrintMethod = "printPredicateOperand";
+ let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg);
+}
+
+// Define PowerPC specific addressing mode.
+// Each ComplexPattern names the selector function that performs the match;
+// the integer is the number of operands the match produces.
+def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>;
+def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>;
+def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
+
+// The address in a single register. This is used with the SjLj
+// pseudo-instructions.
+def addr : ComplexPattern<iPTR, 1, "SelectAddr",[], []>;
+
+/// This is just the offset part of iaddr, used for preinc.
+def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Predicate Definitions.
+// Each Predicate wraps a C++ condition string; instructions reference these
+// records through Requires<[...]>. Most query PPCSubTarget, two query the
+// target-machine options.
+def In32BitMode : Predicate<"!PPCSubTarget->isPPC64()">;
+def In64BitMode : Predicate<"PPCSubTarget->isPPC64()">;
+def IsBookE : Predicate<"PPCSubTarget->isBookE()">;
+def IsNotBookE : Predicate<"!PPCSubTarget->isBookE()">;
+def HasOnlyMSYNC : Predicate<"PPCSubTarget->hasOnlyMSYNC()">;
+def HasSYNC : Predicate<"!PPCSubTarget->hasOnlyMSYNC()">;
+def IsPPC4xx : Predicate<"PPCSubTarget->isPPC4xx()">;
+def IsPPC6xx : Predicate<"PPCSubTarget->isPPC6xx()">;
+def IsE500 : Predicate<"PPCSubTarget->isE500()">;
+// Calls the hasSPE() accessor, spelled like the other has*/is* subtarget
+// queries in this list ("HasSPE" is the field-style spelling, not a method).
+def HasSPE : Predicate<"PPCSubTarget->hasSPE()">;
+def HasICBT : Predicate<"PPCSubTarget->hasICBT()">;
+def HasPartwordAtomics : Predicate<"PPCSubTarget->hasPartwordAtomics()">;
+def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
+def NaNsFPMath : Predicate<"!TM.Options.NoNaNsFPMath">;
+def HasBPERMD : Predicate<"PPCSubTarget->hasBPERMD()">;
+def HasExtDiv : Predicate<"PPCSubTarget->hasExtDiv()">;
+def IsISA3_0 : Predicate<"PPCSubTarget->isISA3_0()">;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Multiclass Definitions.
+
+// XForm_6r - Defines an XForm_6 instruction plus its dot-suffixed variant
+// `o`, which is marked isDOT, defines CR0 and carries no pattern.
+multiclass XForm_6r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR0] in
+    def o : XForm_6<opcode, xo, OOL, IOL,
+                    !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XForm_6rc - Like XForm_6r, but both variants define CARRY; the
+// dot-suffixed variant `o` defines CR0 as well.
+multiclass XForm_6rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o : XForm_6<opcode, xo, OOL, IOL,
+                    !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XForm_10rc - XForm_10 instruction that defines CARRY, plus its
+// dot-suffixed variant `o` (isDOT) that defines CARRY and CR0.
+multiclass XForm_10rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XForm_10<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o : XForm_10<opcode, xo, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XForm_11r - XForm_11 instruction plus its dot-suffixed variant `o`
+// (isDOT), which defines CR0 and carries no pattern.
+multiclass XForm_11r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_11<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR0] in
+    def o : XForm_11<opcode, xo, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XOForm_1r - XOForm_1 instruction plus its dot-suffixed variant `o`
+// (isDOT), which defines CR0 and carries no pattern.
+multiclass XOForm_1r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR0] in
+    def o : XOForm_1<opcode, xo, oe, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XOForm_1rcr - For instructions whose non-record form is not cracked but
+// whose record form is (i.e. divw, mullw, etc.). Only the dot-suffixed
+// variant `o` gets PPC970_DGroup_First and PPC970_DGroup_Cracked.
+multiclass XOForm_1rcr<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                       string asmbase, string asmstr, InstrItinClass itin,
+                       list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR0] in
+    def o : XOForm_1<opcode, xo, oe, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel, PPC970_DGroup_First, PPC970_DGroup_Cracked;
+  }
+}
+
+// XOForm_1rc - XOForm_1 instruction that defines CARRY, plus its
+// dot-suffixed variant `o` (isDOT) that defines CARRY and CR0.
+multiclass XOForm_1rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o : XOForm_1<opcode, xo, oe, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XOForm_3r - XOForm_3 instruction plus its dot-suffixed variant `o`
+// (isDOT), which defines CR0 and carries no pattern.
+multiclass XOForm_3r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR0] in
+    def o : XOForm_3<opcode, xo, oe, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XOForm_3rc - XOForm_3 instruction that defines CARRY, plus its
+// dot-suffixed variant `o` (isDOT) that defines CARRY and CR0.
+multiclass XOForm_3rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o : XOForm_3<opcode, xo, oe, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// MForm_2r - MForm_2 instruction plus its dot-suffixed variant `o` (isDOT),
+// which defines CR0 and carries no pattern.
+multiclass MForm_2r<bits<6> opcode, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : MForm_2<opcode, OOL, IOL,
+                       !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR0] in
+    def o : MForm_2<opcode, OOL, IOL,
+                    !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// MDForm_1r - MDForm_1 instruction plus its dot-suffixed variant `o`
+// (isDOT), which defines CR0 and carries no pattern.
+multiclass MDForm_1r<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : MDForm_1<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR0] in
+    def o : MDForm_1<opcode, xo, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// MDSForm_1r - MDSForm_1 instruction plus its dot-suffixed variant `o`
+// (isDOT), which defines CR0 and carries no pattern.
+multiclass MDSForm_1r<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : MDSForm_1<opcode, xo, OOL, IOL,
+                         !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR0] in
+    def o : MDSForm_1<opcode, xo, OOL, IOL,
+                      !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XSForm_1rc - XSForm_1 instruction that defines CARRY, plus its
+// dot-suffixed variant `o` (isDOT) that defines CARRY and CR0.
+multiclass XSForm_1rc<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XSForm_1<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o : XSForm_1<opcode, xo, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XForm_26r - XForm_26 instruction plus its dot-suffixed variant `o`
+// (isDOT), which defines CR1 (not CR0) and carries no pattern.
+multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_26<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR1] in
+    def o : XForm_26<opcode, xo, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// XForm_28r - XForm_28 instruction plus its dot-suffixed variant `o`
+// (isDOT), which defines CR1 and carries no pattern.
+multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_28<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR1] in
+    def o : XForm_28<opcode, xo, OOL, IOL,
+                     !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// AForm_1r - AForm_1 instruction plus its dot-suffixed variant `o` (isDOT),
+// which defines CR1 and carries no pattern.
+multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : AForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR1] in
+    def o : AForm_1<opcode, xo, OOL, IOL,
+                    !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// AForm_2r - AForm_2 instruction plus its dot-suffixed variant `o` (isDOT),
+// which defines CR1 and carries no pattern.
+multiclass AForm_2r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : AForm_2<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR1] in
+    def o : AForm_2<opcode, xo, OOL, IOL,
+                    !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+// AForm_3r - AForm_3 instruction plus its dot-suffixed variant `o` (isDOT),
+// which defines CR1 and carries no pattern.
+multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : AForm_3<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, " ", asmstr), itin, pattern>,
+               RecFormRel;
+    let Defs = [CR1] in
+    def o : AForm_3<opcode, xo, OOL, IOL,
+                    !strconcat(asmbase, ". ", asmstr), itin, []>,
+            isDOT, RecFormRel;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Definitions.
+
+// Pseudo-instructions:
+
+// Pseudos with hasCtrlDep set. The two call-stack adjustment pseudos are
+// selected from callseq_start / callseq_end and both define and use R1.
+let hasCtrlDep = 1 in {
+let Defs = [R1], Uses = [R1] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
+ [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+// UPDATE_VRSAVE sits outside the R1 Defs/Uses scope and has no pattern.
+def UPDATE_VRSAVE : Pseudo<(outs gprc:$rD), (ins gprc:$rS),
+ "UPDATE_VRSAVE $rD, $rS", []>;
+}
+
+// DYNALLOC - Selected from PPCdynalloc; defines and uses R1. The brace-less
+// 'let' applies to DYNALLOC only.
+let Defs = [R1], Uses = [R1] in
+def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
+ [(set i32:$result,
+ (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
+// DYNAREAOFFSET - Selected from PPCdynareaoffset; not covered by the R1
+// Defs/Uses above.
+def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET",
+ [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1, // Expanded after instruction selection.
+ PPC970_Single = 1 in {
+ // Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes
+ // because either operand might become the first operand in an isel, and
+ // that operand cannot be r0.
+ // The SELECT_CC_* pseudos take a CR field ($cond) plus a branch opcode
+ // immediate ($BROPC) and carry no selection pattern.
+ def SELECT_CC_I4 : Pseudo<(outs gprc:$dst), (ins crrc:$cond,
+ gprc_nor0:$T, gprc_nor0:$F,
+ i32imm:$BROPC), "#SELECT_CC_I4",
+ []>;
+ def SELECT_CC_I8 : Pseudo<(outs g8rc:$dst), (ins crrc:$cond,
+ g8rc_nox0:$T, g8rc_nox0:$F,
+ i32imm:$BROPC), "#SELECT_CC_I8",
+ []>;
+ // The floating-point and vector variants use the ordinary register classes.
+ def SELECT_CC_F4 : Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F,
+ i32imm:$BROPC), "#SELECT_CC_F4",
+ []>;
+ def SELECT_CC_F8 : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F,
+ i32imm:$BROPC), "#SELECT_CC_F8",
+ []>;
+ def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
+ i32imm:$BROPC), "#SELECT_CC_VRRC",
+ []>;
+
+ // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
+ // register bit directly.
+ // Unlike SELECT_CC_*, each of these has a pattern matching 'select' on an
+ // i1 condition for its value type.
+ def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond,
+ gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4",
+ [(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>;
+ def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
+ g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8",
+ [(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>;
+ def SELECT_F4 : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
+ f4rc:$T, f4rc:$F), "#SELECT_F4",
+ [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
+ def SELECT_F8 : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
+ f8rc:$T, f8rc:$F), "#SELECT_F8",
+ [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
+ def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
+ vrrc:$T, vrrc:$F), "#SELECT_VRRC",
+ [(set v4i32:$dst,
+ (select i1:$cond, v4i32:$T, v4i32:$F))]>;
+}
+
+// SPILL_CR / SPILL_CRBIT - Mark that a CR register (or a single CR bit) is
+// being dumped to memory, which means a register has to be scavenged for it.
+// Both are flagged mayStore.
+let mayStore = 1 in
+def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F), "#SPILL_CR", []>;
+let mayStore = 1 in
+def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F),
+                         "#SPILL_CRBIT", []>;
+
+// RESTORE_CR / RESTORE_CRBIT - Mark that a previously spilled CR register (or
+// CR bit) is being restored, which means a register has to be scavenged for
+// it. Both are flagged mayLoad.
+let mayLoad = 1 in
+def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F), "#RESTORE_CR", []>;
+let mayLoad = 1 in
+def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F),
+                           "#RESTORE_CRBIT", []>;
+
+// Terminators that are also barriers: the return through LR and the indirect
+// branches through CTR.
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+ // BLR is selected from retflag and is only available in 32-bit mode here.
+ let isReturn = 1, Uses = [LR, RM] in
+ def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
+ [(retflag)]>, Requires<[In32BitMode]>;
+ let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in {
+ def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+ []>;
+
+ // Conditional forms; isCodeGenOnly, so they are not exposed to the asm
+ // matcher. BCCCTR takes a 'pred' operand, BCCTR/BCCTRn a single CR bit.
+ let isCodeGenOnly = 1 in {
+ def BCCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
+ "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
+ []>;
+
+ def BCCTR : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
+ "bcctr 12, $bi, 0", IIC_BrB, []>;
+ def BCCTRn : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
+ "bcctr 4, $bi, 0", IIC_BrB, []>;
+ }
+ }
+}
+
+// Pattern-less pseudos that define LR; both are tagged PPC970_Unit_BRU.
+let Defs = [LR] in {
+  def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>, PPC970_Unit_BRU;
+  def MoveGOTtoLR : Pseudo<(outs), (ins), "#MoveGOTtoLR", []>, PPC970_Unit_BRU;
+}
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+ // Unconditional branches; both are barriers. Only B carries a selection
+ // pattern (br); BA takes an absdirectbrtarget operand and has none.
+ let isBarrier = 1 in {
+ def B : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst),
+ "b $dst", IIC_BrB,
+ [(br bb:$dst)]>;
+ def BA : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst),
+ "ba $dst", IIC_BrB, []>;
+ }
+
+ // BCC represents an arbitrary conditional branch on a predicate.
+ // FIXME: should be able to write a pattern for PPCcondbranch, but can't use
+ // a two-value operand where a dag node expects two operands. :(
+ // All three definitions are isCodeGenOnly and take the composite 'pred'
+ // operand; the asm string prints its parts via the cc/pm/reg modifiers.
+ let isCodeGenOnly = 1 in {
+ def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
+ "b${cond:cc}${cond:pm} ${cond:reg}, $dst"
+ /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
+ def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst),
+ "b${cond:cc}a${cond:pm} ${cond:reg}, $dst">;
+
+ // Conditional return: additionally marked isReturn and reads LR and RM.
+ let isReturn = 1, Uses = [LR, RM] in
+ def BCCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond),
+ "b${cond:cc}lr${cond:pm} ${cond:reg}", IIC_BrB, []>;
+ }
+
+ // Conditional branches keyed directly on a single CR bit; all are
+ // isCodeGenOnly. BC/BCn are selected from brcond on the bit and on its
+ // negation; BCLR/BCLRn are the matching 'bclr' forms.
+ let isCodeGenOnly = 1 in {
+   let Pattern = [(brcond i1:$bi, bb:$dst)] in
+   def BC : BForm_4<16, 12, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
+                    "bc 12, $bi, $dst">;
+
+   let Pattern = [(brcond (not i1:$bi), bb:$dst)] in
+   def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
+                     "bc 4, $bi, $dst">;
+
+   // Both polarities of 'bclr' are conditional returns that read LR and RM.
+   // The braces are required: a brace-less 'let ... in' would bind to BCLR
+   // alone and leave BCLRn without isReturn and without its LR/RM uses.
+   let isReturn = 1, Uses = [LR, RM] in {
+     def BCLR : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi),
+                             "bclr 12, $bi, 0", IIC_BrB, []>;
+     def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi),
+                              "bclr 4, $bi, 0", IIC_BrB, []>;
+   }
+ }
+
+ // bdzlr/bdnzlr family: returns that both read and write CTR and read LR and
+ // RM. The 'p' and 'm' suffixed defs are the "+" and "-" mnemonic variants
+ // (presumably branch-prediction hints - confirm against the ISA).
+ let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in {
+ def BDZLR : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
+ "bdzlr", IIC_BrB, []>;
+ def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
+ "bdnzlr", IIC_BrB, []>;
+ def BDZLRp : XLForm_2_ext<19, 16, 27, 0, 0, (outs), (ins),
+ "bdzlr+", IIC_BrB, []>;
+ def BDNZLRp: XLForm_2_ext<19, 16, 25, 0, 0, (outs), (ins),
+ "bdnzlr+", IIC_BrB, []>;
+ def BDZLRm : XLForm_2_ext<19, 16, 26, 0, 0, (outs), (ins),
+ "bdzlr-", IIC_BrB, []>;
+ def BDNZLRm: XLForm_2_ext<19, 16, 24, 0, 0, (outs), (ins),
+ "bdnzlr-", IIC_BrB, []>;
+ }
+
+ let Defs = [CTR], Uses = [CTR] in {
+ def BDZ : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz $dst">;
+ def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz $dst">;
+ def BDZA : BForm_1<16, 18, 1, 0, (outs), (ins abscondbrtarget:$dst),
+ "bdza $dst">;
+ def BDNZA : BForm_1<16, 16, 1, 0, (outs), (ins abscondbrtarget:$dst),
+ "bdnza $dst">;
+ def BDZp : BForm_1<16, 27, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz+ $dst">;
+ def BDNZp: BForm_1<16, 25, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz+ $dst">;
+ def BDZAp : BForm_1<16, 27, 1, 0, (outs), (ins abscondbrtarget:$dst),
+ "bdza+ $dst">;
+ def BDNZAp: BForm_1<16, 25, 1, 0, (outs), (ins abscondbrtarget:$dst),
+ "bdnza+ $dst">;
+ def BDZm : BForm_1<16, 26, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz- $dst">;
+ def BDNZm: BForm_1<16, 24, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz- $dst">;
+ def BDZAm : BForm_1<16, 26, 1, 0, (outs), (ins abscondbrtarget:$dst),
+ "bdza- $dst">;
+ def BDNZAm: BForm_1<16, 24, 1, 0, (outs), (ins abscondbrtarget:$dst),
+ "bdnza- $dst">;
+ }
+}
+
+// The unconditional BCL used by the SjLj setjmp code.
+let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7 in {
+ let Defs = [LR], Uses = [RM] in {
+ def BCLalways : BForm_2<16, 20, 31, 0, 1, (outs), (ins condbrtarget:$dst),
+ "bcl 20, 31, $dst">;
+ }
+}
+
+let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
+ // Convenient aliases for call instructions
+ let Uses = [RM] in {
+ def BL : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+ "bl $func", IIC_BrB, []>; // See Pat patterns below.
+ def BLA : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+ "bla $func", IIC_BrB, [(PPCcall (i32 imm:$func))]>;
+
+ let isCodeGenOnly = 1 in {
+ def BL_TLS : IForm<18, 0, 1, (outs), (ins tlscall32:$func),
+ "bl $func", IIC_BrB, []>;
+ def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst),
+ "b${cond:cc}l${cond:pm} ${cond:reg}, $dst">;
+ def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst),
+ "b${cond:cc}la${cond:pm} ${cond:reg}, $dst">;
+
+ def BCL : BForm_4<16, 12, 0, 1, (outs),
+ (ins crbitrc:$bi, condbrtarget:$dst),
+ "bcl 12, $bi, $dst">;
+ def BCLn : BForm_4<16, 4, 0, 1, (outs),
+ (ins crbitrc:$bi, condbrtarget:$dst),
+ "bcl 4, $bi, $dst">;
+ }
+ }
+ let Uses = [CTR, RM] in {
+ def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+ "bctrl", IIC_BrB, [(PPCbctrl)]>,
+ Requires<[In32BitMode]>;
+
+ let isCodeGenOnly = 1 in {
+ def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
+ "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
+ []>;
+
+ def BCCTRL : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
+ "bcctrl 12, $bi, 0", IIC_BrB, []>;
+ def BCCTRLn : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
+ "bcctrl 4, $bi, 0", IIC_BrB, []>;
+ }
+ }
+ let Uses = [LR, RM] in {
+ def BLRL : XLForm_2_ext<19, 16, 20, 0, 1, (outs), (ins),
+ "blrl", IIC_BrB, []>;
+
+ let isCodeGenOnly = 1 in {
+ def BCCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond),
+ "b${cond:cc}lrl${cond:pm} ${cond:reg}", IIC_BrB,
+ []>;
+
+ def BCLRL : XLForm_2_br2<19, 16, 12, 1, (outs), (ins crbitrc:$bi),
+ "bclrl 12, $bi, 0", IIC_BrB, []>;
+ def BCLRLn : XLForm_2_br2<19, 16, 4, 1, (outs), (ins crbitrc:$bi),
+ "bclrl 4, $bi, 0", IIC_BrB, []>;
+ }
+ }
+ let Defs = [CTR], Uses = [CTR, RM] in {
+ def BDZL : BForm_1<16, 18, 0, 1, (outs), (ins condbrtarget:$dst),
+ "bdzl $dst">;
+ def BDNZL : BForm_1<16, 16, 0, 1, (outs), (ins condbrtarget:$dst),
+ "bdnzl $dst">;
+ def BDZLA : BForm_1<16, 18, 1, 1, (outs), (ins abscondbrtarget:$dst),
+ "bdzla $dst">;
+ def BDNZLA : BForm_1<16, 16, 1, 1, (outs), (ins abscondbrtarget:$dst),
+ "bdnzla $dst">;
+ def BDZLp : BForm_1<16, 27, 0, 1, (outs), (ins condbrtarget:$dst),
+ "bdzl+ $dst">;
+ def BDNZLp: BForm_1<16, 25, 0, 1, (outs), (ins condbrtarget:$dst),
+ "bdnzl+ $dst">;
+ def BDZLAp : BForm_1<16, 27, 1, 1, (outs), (ins abscondbrtarget:$dst),
+ "bdzla+ $dst">;
+ def BDNZLAp: BForm_1<16, 25, 1, 1, (outs), (ins abscondbrtarget:$dst),
+ "bdnzla+ $dst">;
+ def BDZLm : BForm_1<16, 26, 0, 1, (outs), (ins condbrtarget:$dst),
+ "bdzl- $dst">;
+ def BDNZLm: BForm_1<16, 24, 0, 1, (outs), (ins condbrtarget:$dst),
+ "bdnzl- $dst">;
+ def BDZLAm : BForm_1<16, 26, 1, 1, (outs), (ins abscondbrtarget:$dst),
+ "bdzla- $dst">;
+ def BDNZLAm: BForm_1<16, 24, 1, 1, (outs), (ins abscondbrtarget:$dst),
+ "bdnzla- $dst">;
+ }
+ let Defs = [CTR], Uses = [CTR, LR, RM] in {
+ def BDZLRL : XLForm_2_ext<19, 16, 18, 0, 1, (outs), (ins),
+ "bdzlrl", IIC_BrB, []>;
+ def BDNZLRL : XLForm_2_ext<19, 16, 16, 0, 1, (outs), (ins),
+ "bdnzlrl", IIC_BrB, []>;
+ def BDZLRLp : XLForm_2_ext<19, 16, 27, 0, 1, (outs), (ins),
+ "bdzlrl+", IIC_BrB, []>;
+ def BDNZLRLp: XLForm_2_ext<19, 16, 25, 0, 1, (outs), (ins),
+ "bdnzlrl+", IIC_BrB, []>;
+ def BDZLRLm : XLForm_2_ext<19, 16, 26, 0, 1, (outs), (ins),
+ "bdzlrl-", IIC_BrB, []>;
+ def BDNZLRLm: XLForm_2_ext<19, 16, 24, 0, 1, (outs), (ins),
+ "bdnzlrl-", IIC_BrB, []>;
+ }
+}
+
+// Tail-call return pseudos. All three are flagged as call, terminator, return
+// and barrier and read RM; they differ only in the kind of target operand
+// (calltarget, abscalltarget, CTRRC). Each also carries an i32imm $offset.
+// Only TCRETURNai has a selection pattern attached here.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi :Pseudo< (outs),
+ (ins calltarget:$dst, i32imm:$offset),
+ "#TC_RETURNd $dst $offset",
+ []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
+ "#TC_RETURNa $func $offset",
+ [(PPCtc_return (i32 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset),
+ "#TC_RETURNr $dst $offset",
+ []>;
+
+
+let isCodeGenOnly = 1 in {
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+ isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
+def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+ []>, Requires<[In32BitMode]>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+ isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+ "b $dst", IIC_BrB,
+ []>;
+
+// Tail call to an absolute address. The second IForm argument is 1 here, as
+// it is for BA and BLA, so the encoding agrees with the "ba" mnemonic and the
+// abscalltarget operand instead of duplicating TAILB's relative form.
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+ isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA : IForm<18, 1, 0, (outs), (ins abscalltarget:$dst),
+ "ba $dst", IIC_BrB,
+ []>;
+
+}
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+ let Defs = [CTR] in
+ def EH_SjLj_SetJmp32 : Pseudo<(outs gprc:$dst), (ins memr:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
+ Requires<[In32BitMode]>;
+ let isTerminator = 1 in
+ def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(PPCeh_sjlj_longjmp addr:$buf)]>,
+ Requires<[In32BitMode]>;
+}
+
+// This pseudo is never removed from the function, as it serves as
+// a terminator. Size is set to 0 to prevent the builtin assembler
+// from emitting it.
+let isBranch = 1, isTerminator = 1, Size = 0 in {
+ def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst),
+ "#EH_SjLj_Setup\t$dst", []>;
+}
+
+// System call.
+let PPC970_Unit = 7 in {
+ def SC : SCForm<17, 1, (outs), (ins i32imm:$lev),
+ "sc $lev", IIC_BrB, [(PPCsc (i32 imm:$lev))]>;
+}
+
+// Branch history rolling buffer.
+def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB,
+ [(PPCclrbhrb)]>,
+ PPC970_DGroup_Single;
+// The $dmy argument used for MFBHRBE is not needed; however, including
+// it avoids automatic generation of PPCFastISel::fastEmit_i(), which
+// interferes with necessary special handling (see PPCFastISel.cpp).
+def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$rD),
+ (ins u10imm:$imm, u10imm:$dmy),
+ "mfbhrbe $rD, $imm", IIC_BrB,
+ [(set i32:$rD,
+ (PPCmfbhrbe imm:$imm, imm:$dmy))]>,
+ PPC970_DGroup_First;
+
+def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$imm), "rfebb $imm",
+ IIC_BrB, [(PPCrfebb (i32 imm:$imm))]>,
+ PPC970_DGroup_Single;
+
+// DCB* instructions.
+def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst), "dcba $dst",
+ IIC_LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBI : DCB_Form<470, 0, (outs), (ins memrr:$dst), "dcbi $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBST : DCB_Form<54, 0, (outs), (ins memrr:$dst), "dcbst $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBZ : DCB_Form<1014, 0, (outs), (ins memrr:$dst), "dcbz $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+
+def DCBF : DCB_Form_hint<86, (outs), (ins u5imm:$TH, memrr:$dst),
+ "dcbf $dst, $TH", IIC_LdStDCBF, []>,
+ PPC970_DGroup_Single;
+
+// DCBT / DCBTST: DCB_Form_hint records taking a $TH immediate plus an r+r
+// address. Both are modelled as possibly loading and storing with no other
+// side effects. Neither has an inline pattern; separate Pat records in this
+// file map the int_ppc_dcbt / int_ppc_dcbtst intrinsics and prefetch onto them.
+let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in {
+def DCBT : DCB_Form_hint<278, (outs), (ins u5imm:$TH, memrr:$dst),
+ "dcbt $dst, $TH", IIC_LdStDCBF, []>,
+ PPC970_DGroup_Single;
+def DCBTST : DCB_Form_hint<246, (outs), (ins u5imm:$TH, memrr:$dst),
+ "dcbtst $dst, $TH", IIC_LdStDCBF, []>,
+ PPC970_DGroup_Single;
+} // hasSideEffects = 0
+
+def ICBT : XForm_icbt<31, 22, (outs), (ins u4imm:$CT, memrr:$src),
+ "icbt $CT, $src", IIC_LdStLoad>, Requires<[HasICBT]>;
+
+def : Pat<(int_ppc_dcbt xoaddr:$dst),
+ (DCBT 0, xoaddr:$dst)>;
+def : Pat<(int_ppc_dcbtst xoaddr:$dst),
+ (DCBTST 0, xoaddr:$dst)>;
+def : Pat<(int_ppc_dcbf xoaddr:$dst),
+ (DCBF 0, xoaddr:$dst)>;
+
+def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
+ (DCBT 0, xoaddr:$dst)>; // data prefetch for loads
+def : Pat<(prefetch xoaddr:$dst, (i32 1), imm, (i32 1)),
+ (DCBTST 0, xoaddr:$dst)>; // data prefetch for stores
+def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)),
+ (ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read)
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+ let Defs = [CR0] in {
+ def ATOMIC_LOAD_ADD_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
+ [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_SUB_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
+ [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_AND_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
+ [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_OR_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
+ [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
+ // 8-bit atomic XOR pseudo. The asm string starts with '#', as for the other
+ // atomic pseudos in this group.
+ def ATOMIC_LOAD_XOR_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I8",
+ [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_NAND_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
+ [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_MIN_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8",
+ [(set i32:$dst, (atomic_load_min_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_MAX_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8",
+ [(set i32:$dst, (atomic_load_max_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_UMIN_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8",
+ [(set i32:$dst, (atomic_load_umin_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_UMAX_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8",
+ [(set i32:$dst, (atomic_load_umax_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_ADD_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
+ [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_SUB_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
+ [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_AND_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
+ [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_OR_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
+ [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_XOR_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
+ [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_NAND_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
+ [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_MIN_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16",
+ [(set i32:$dst, (atomic_load_min_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_MAX_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16",
+ [(set i32:$dst, (atomic_load_max_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_UMIN_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16",
+ [(set i32:$dst, (atomic_load_umin_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_UMAX_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16",
+ [(set i32:$dst, (atomic_load_umax_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_ADD_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
+ [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_SUB_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
+ [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_AND_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
+ [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_OR_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
+ [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_XOR_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
+ [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_NAND_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
+ [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_MIN_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32",
+ [(set i32:$dst, (atomic_load_min_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_MAX_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32",
+ [(set i32:$dst, (atomic_load_max_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_UMIN_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32",
+ [(set i32:$dst, (atomic_load_umin_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_UMAX_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32",
+ [(set i32:$dst, (atomic_load_umax_32 xoaddr:$ptr, i32:$incr))]>;
+
+ def ATOMIC_CMP_SWAP_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
+ [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
+ def ATOMIC_CMP_SWAP_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
+ [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
+ def ATOMIC_CMP_SWAP_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
+ [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;
+
+ // 8-bit atomic swap pseudo; the asm string is spelled to match the record
+ // name, as for the I16 and I32 variants.
+ def ATOMIC_SWAP_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I8",
+ [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
+ def ATOMIC_SWAP_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
+ [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
+ def ATOMIC_SWAP_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
+ [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
+ }
+}
+
+// Instructions to support atomic operations
+let mayLoad = 1, hasSideEffects = 0 in {
+def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
+ "lbarx $rD, $src", IIC_LdStLWARX, []>,
+ Requires<[HasPartwordAtomics]>;
+
+def LHARX : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
+ "lharx $rD, $src", IIC_LdStLWARX, []>,
+ Requires<[HasPartwordAtomics]>;
+
+def LWARX : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
+ "lwarx $rD, $src", IIC_LdStLWARX, []>;
+
+// Instructions to support lock versions of atomics
+// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
+def LBARXL : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
+ "lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
+ Requires<[HasPartwordAtomics]>;
+
+def LHARXL : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
+ "lharx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
+ Requires<[HasPartwordAtomics]>;
+
+def LWARXL : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
+ "lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT;
+
+// The atomic instructions use the destination register as well as the next one
+// or two registers in order (modulo 31).
+let hasExtraSrcRegAllocReq = 1 in
+def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC),
+ "lwat $rD, $rA, $FC", IIC_LdStLoad>,
+ Requires<[IsISA3_0]>;
+}
+
+let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in {
+def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
+ "stbcx. $rS, $dst", IIC_LdStSTWCX, []>,
+ isDOT, Requires<[HasPartwordAtomics]>;
+
+def STHCX : XForm_1<31, 726, (outs), (ins gprc:$rS, memrr:$dst),
+ "sthcx. $rS, $dst", IIC_LdStSTWCX, []>,
+ isDOT, Requires<[HasPartwordAtomics]>;
+
+def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
+ "stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT;
+}
+
+let mayStore = 1, hasSideEffects = 0 in
+def STWAT : X_RD5_RS5_IM5<31, 710, (outs), (ins gprc:$rS, gprc:$rA, u5imm:$FC),
+ "stwat $rS, $rA, $FC", IIC_LdStStore>,
+ Requires<[IsISA3_0]>;
+
+// Operand-less trap selected for the generic `trap` node. It is marked as a
+// terminator and barrier with a control dependency, and uses the same <31, 4>
+// arguments as the TW record that follows.
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+def TRAP : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>;
+
+def TWI : DForm_base<3, (outs), (ins u5imm:$to, gprc:$rA, s16imm:$imm),
+ "twi $to, $rA, $imm", IIC_IntTrapW, []>;
+def TW : XForm_1<31, 4, (outs), (ins u5imm:$to, gprc:$rA, gprc:$rB),
+ "tw $to, $rA, $rB", IIC_IntTrapW, []>;
+def TDI : DForm_base<2, (outs), (ins u5imm:$to, g8rc:$rA, s16imm:$imm),
+ "tdi $to, $rA, $imm", IIC_IntTrapD, []>;
+def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
+ "td $to, $rA, $rB", IIC_IntTrapD, []>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Load Instructions.
+//
+
+// Unindexed (r+i) Loads.
+let PPC970_Unit = 2 in {
+def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src),
+ "lbz $rD, $src", IIC_LdStLoad,
+ [(set i32:$rD, (zextloadi8 iaddr:$src))]>;
+def LHA : DForm_1<42, (outs gprc:$rD), (ins memri:$src),
+ "lha $rD, $src", IIC_LdStLHA,
+ [(set i32:$rD, (sextloadi16 iaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LHZ : DForm_1<40, (outs gprc:$rD), (ins memri:$src),
+ "lhz $rD, $src", IIC_LdStLoad,
+ [(set i32:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src),
+ "lwz $rD, $src", IIC_LdStLoad,
+ [(set i32:$rD, (load iaddr:$src))]>;
+
+def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src),
+ "lfs $rD, $src", IIC_LdStLFD,
+ [(set f32:$rD, (load iaddr:$src))]>;
+def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
+ "lfd $rD, $src", IIC_LdStLFD,
+ [(set f64:$rD, (load iaddr:$src))]>;
+
+
+// Unindexed (r+i) Loads with Update (preinc).
+let mayLoad = 1, hasSideEffects = 0 in {
+def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+ "lbzu $rD, $addr", IIC_LdStLoadUpd,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LHAU : DForm_1<43, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+ "lhau $rD, $addr", IIC_LdStLHAU,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LHZU : DForm_1<41, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+ "lhzu $rD, $addr", IIC_LdStLoadUpd,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+ "lwzu $rD, $addr", IIC_LdStLoadUpd,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+ "lfsu $rD, $addr", IIC_LdStLFDU,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+ "lfdu $rD, $addr", IIC_LdStLFDU,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+
+// Indexed (r+r) Loads with Update (preinc).
+def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lbzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lhaux $rD, $addr", IIC_LdStLHAUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lhzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lwzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lfsux $rD, $addr", IIC_LdStLFDUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lfdux $rD, $addr", IIC_LdStLFDUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+}
+}
+
+// Indexed (r+r) Loads.
+//
+let PPC970_Unit = 2 in {
+def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src),
+ "lbzx $rD, $src", IIC_LdStLoad,
+ [(set i32:$rD, (zextloadi8 xaddr:$src))]>;
+def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src),
+ "lhax $rD, $src", IIC_LdStLHA,
+ [(set i32:$rD, (sextloadi16 xaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
+ "lhzx $rD, $src", IIC_LdStLoad,
+ [(set i32:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src),
+ "lwzx $rD, $src", IIC_LdStLoad,
+ [(set i32:$rD, (load xaddr:$src))]>;
+
+
+def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
+ "lhbrx $rD, $src", IIC_LdStLoad,
+ [(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
+def LWBRX : XForm_1<31, 534, (outs gprc:$rD), (ins memrr:$src),
+ "lwbrx $rD, $src", IIC_LdStLoad,
+ [(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;
+
+def LFSX : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src),
+ "lfsx $frD, $src", IIC_LdStLFD,
+ [(set f32:$frD, (load xaddr:$src))]>;
+def LFDX : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src),
+ "lfdx $frD, $src", IIC_LdStLFD,
+ [(set f64:$frD, (load xaddr:$src))]>;
+
+def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src),
+ "lfiwax $frD, $src", IIC_LdStLFD,
+ [(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
+def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src),
+ "lfiwzx $frD, $src", IIC_LdStLFD,
+ [(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
+}
+
+// Load Multiple
+def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
+ "lmw $rD, $src", IIC_LdStLMW, []>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Store Instructions.
+//
+
+// Unindexed (r+i) Stores.
+let PPC970_Unit = 2 in {
+def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src),
+ "stb $rS, $src", IIC_LdStStore,
+ [(truncstorei8 i32:$rS, iaddr:$src)]>;
+def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$src),
+ "sth $rS, $src", IIC_LdStStore,
+ [(truncstorei16 i32:$rS, iaddr:$src)]>;
+def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$src),
+ "stw $rS, $src", IIC_LdStStore,
+ [(store i32:$rS, iaddr:$src)]>;
+def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
+ "stfs $rS, $dst", IIC_LdStSTFD,
+ [(store f32:$rS, iaddr:$dst)]>;
+def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
+ "stfd $rS, $dst", IIC_LdStSTFD,
+ [(store f64:$rS, iaddr:$dst)]>;
+}
+
+// Unindexed (r+i) Stores with Update (preinc).
+let PPC970_Unit = 2, mayStore = 1 in {
+def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
+ "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
+ "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
+ "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst),
+ "stfsu $rS, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst),
+ "stfdu $rS, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+}
+
+// Patterns to match the pre-inc stores. We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+ (STBU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+ (STHU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+ (STWU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+ (STFSU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+ (STFDU $rS, iaddroff:$ptroff, $ptrreg)>;
+
+// Indexed (r+r) Stores.
+let PPC970_Unit = 2 in {
+def STBX : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
+ "stbx $rS, $dst", IIC_LdStStore,
+ [(truncstorei8 i32:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STHX : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
+ "sthx $rS, $dst", IIC_LdStStore,
+ [(truncstorei16 i32:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STWX : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
+ "stwx $rS, $dst", IIC_LdStStore,
+ [(store i32:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+
+def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
+ "sthbrx $rS, $dst", IIC_LdStStore,
+ [(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
+ PPC970_DGroup_Cracked;
+def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
+ "stwbrx $rS, $dst", IIC_LdStStore,
+ [(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
+ PPC970_DGroup_Cracked;
+
+def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
+ "stfiwx $frS, $dst", IIC_LdStSTFD,
+ [(PPCstfiwx f64:$frS, xoaddr:$dst)]>;
+
+def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
+ "stfsx $frS, $dst", IIC_LdStSTFD,
+ [(store f32:$frS, xaddr:$dst)]>;
+def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
+ "stfdx $frS, $dst", IIC_LdStSTFD,
+ [(store f64:$frS, xaddr:$dst)]>;
+}
+
+// Indexed (r+r) Stores with Update (preinc).
+let PPC970_Unit = 2, mayStore = 1 in {
+def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
+ "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
+ "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
+ "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst),
+ "stfsux $rS, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst),
+ "stfdux $rS, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+}
+
+// Patterns to match the pre-inc stores. We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (STBUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (STHUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (STWUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (STFSUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (STFDUX $rS, $ptrreg, $ptroff)>;
+
+// Store Multiple
+def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
+ "stmw $rS, $dst", IIC_LdStLMW, []>;
+
+// SYNC takes its L field as an immediate operand; the patterns below pass 0
+// for int_ppc_sync and 1 for int_ppc_lwsync.
+def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L),
+ "sync $L", IIC_LdStSync, []>;
+
+// MSYNC reuses SYNC's <31, 598> arguments with L pinned to 0. It is
+// codegen-only and takes no operands.
+let isCodeGenOnly = 1 in {
+ def MSYNC : XForm_24_sync<31, 598, (outs), (ins),
+ "msync", IIC_LdStSync, []> {
+ let L = 0;
+ }
+}
+
+// Intrinsic selection: HasSYNC subtargets use SYNC with the matching L value;
+// HasOnlyMSYNC subtargets use MSYNC for both intrinsics.
+def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(int_ppc_lwsync), (SYNC 1)>, Requires<[HasSYNC]>;
+def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+def : Pat<(int_ppc_lwsync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Arithmetic Instructions.
+//
+
+let PPC970_Unit = 1 in { // FXU Operations.
+def ADDI : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm),
+ "addi $rD, $rA, $imm", IIC_IntSimple,
+ [(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>;
+let BaseName = "addic" in {
+let Defs = [CARRY] in
+def ADDIC : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+ "addic $rD, $rA, $imm", IIC_IntGeneral,
+ [(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>,
+ RecFormRel, PPC970_DGroup_Cracked;
+let Defs = [CARRY, CR0] in
+def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+ "addic. $rD, $rA, $imm", IIC_IntGeneral,
+ []>, isDOT, RecFormRel;
+}
+def ADDIS : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm),
+ "addis $rD, $rA, $imm", IIC_IntSimple,
+ [(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>;
+let isCodeGenOnly = 1 in
+def LA : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym),
+ "la $rD, $sym($rA)", IIC_IntGeneral,
+ [(set i32:$rD, (add i32:$rA,
+ (PPClo tglobaladdr:$sym, 0)))]>;
+def MULLI : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+ "mulli $rD, $rA, $imm", IIC_IntMulLI,
+ [(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>;
+let Defs = [CARRY] in
+def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+ "subfic $rD, $rA, $imm", IIC_IntGeneral,
+ [(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+ def LI : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm),
+ "li $rD, $imm", IIC_IntSimple,
+ [(set i32:$rD, imm32SExt16:$imm)]>;
+ def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s17imm:$imm),
+ "lis $rD, $imm", IIC_IntSimple,
+ [(set i32:$rD, imm16ShiftedSExt:$imm)]>;
+}
+}
+
+let PPC970_Unit = 1 in { // FXU Operations.
+let Defs = [CR0] in {
+def ANDIo : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+ "andi. $dst, $src1, $src2", IIC_IntGeneral,
+ [(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>,
+ isDOT;
+def ANDISo : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+ "andis. $dst, $src1, $src2", IIC_IntGeneral,
+ [(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>,
+ isDOT;
+}
+def ORI : DForm_4<24, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+ "ori $dst, $src1, $src2", IIC_IntSimple,
+ [(set i32:$dst, (or i32:$src1, immZExt16:$src2))]>;
+def ORIS : DForm_4<25, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+ "oris $dst, $src1, $src2", IIC_IntSimple,
+ [(set i32:$dst, (or i32:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+ "xori $dst, $src1, $src2", IIC_IntSimple,
+ [(set i32:$dst, (xor i32:$src1, immZExt16:$src2))]>;
+def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+ "xoris $dst, $src1, $src2", IIC_IntSimple,
+ [(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>;
+
+def NOP : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple,
+ []>;
+let isCodeGenOnly = 1 in {
+// The POWER6 and POWER7 have special group-terminating nops.
+def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins),
+ "ori 1, 1, 0", IIC_IntSimple, []>;
+def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins),
+ "ori 2, 2, 0", IIC_IntSimple, []>;
+}
+
+let isCompare = 1, hasSideEffects = 0 in {
+ def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm),
+ "cmpwi $crD, $rA, $imm", IIC_IntCompare>;
+ def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2),
+ "cmplwi $dst, $src1, $src2", IIC_IntCompare>;
+ def CMPRB : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
+ (ins u1imm:$L, g8rc:$rA, g8rc:$rB),
+ "cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
+ Requires<[IsISA3_0]>;
+}
+}
+
+let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
+let isCommutable = 1 in {
+defm NAND : XForm_6r<31, 476, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "nand", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i32:$rA, (not (and i32:$rS, i32:$rB)))]>;
+defm AND : XForm_6r<31, 28, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "and", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i32:$rA, (and i32:$rS, i32:$rB))]>;
+} // isCommutable
+defm ANDC : XForm_6r<31, 60, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "andc", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i32:$rA, (and i32:$rS, (not i32:$rB)))]>;
+let isCommutable = 1 in {
+defm OR : XForm_6r<31, 444, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "or", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i32:$rA, (or i32:$rS, i32:$rB))]>;
+defm NOR : XForm_6r<31, 124, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "nor", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i32:$rA, (not (or i32:$rS, i32:$rB)))]>;
+} // isCommutable
+defm ORC : XForm_6r<31, 412, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "orc", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i32:$rA, (or i32:$rS, (not i32:$rB)))]>;
+let isCommutable = 1 in {
+defm EQV : XForm_6r<31, 284, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "eqv", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i32:$rA, (not (xor i32:$rS, i32:$rB)))]>;
+defm XOR : XForm_6r<31, 316, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "xor", "$rA, $rS, $rB", IIC_IntSimple,
+ [(set i32:$rA, (xor i32:$rS, i32:$rB))]>;
+} // isCommutable
+defm SLW : XForm_6r<31, 24, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "slw", "$rA, $rS, $rB", IIC_IntGeneral,
+ [(set i32:$rA, (PPCshl i32:$rS, i32:$rB))]>;
+defm SRW : XForm_6r<31, 536, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "srw", "$rA, $rS, $rB", IIC_IntGeneral,
+ [(set i32:$rA, (PPCsrl i32:$rS, i32:$rB))]>;
+defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "sraw", "$rA, $rS, $rB", IIC_IntShift,
+ [(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>;
+}
+
+let PPC970_Unit = 1 in { // FXU Operations.
+let hasSideEffects = 0 in {
+defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH),
+ "srawi", "$rA, $rS, $SH", IIC_IntShift,
+ [(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>;
+defm CNTLZW : XForm_11r<31, 26, (outs gprc:$rA), (ins gprc:$rS),
+ "cntlzw", "$rA, $rS", IIC_IntGeneral,
+ [(set i32:$rA, (ctlz i32:$rS))]>;
+defm CNTTZW : XForm_11r<31, 538, (outs gprc:$rA), (ins gprc:$rS),
+ "cnttzw", "$rA, $rS", IIC_IntGeneral,
+ [(set i32:$rA, (cttz i32:$rS))]>, Requires<[IsISA3_0]>;
+defm EXTSB : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS),
+ "extsb", "$rA, $rS", IIC_IntSimple,
+ [(set i32:$rA, (sext_inreg i32:$rS, i8))]>;
+defm EXTSH : XForm_11r<31, 922, (outs gprc:$rA), (ins gprc:$rS),
+ "extsh", "$rA, $rS", IIC_IntSimple,
+ [(set i32:$rA, (sext_inreg i32:$rS, i16))]>;
+
+let isCommutable = 1 in
+def CMPB : XForm_6<31, 508, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+ "cmpb $rA, $rS, $rB", IIC_IntGeneral,
+ [(set i32:$rA, (PPCcmpb i32:$rS, i32:$rB))]>;
+}
+let isCompare = 1, hasSideEffects = 0 in {
+ def CMPW : XForm_16_ext<31, 0, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
+ "cmpw $crD, $rA, $rB", IIC_IntCompare>;
+ def CMPLW : XForm_16_ext<31, 32, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
+ "cmplw $crD, $rA, $rB", IIC_IntCompare>;
+}
+}
+let PPC970_Unit = 3 in { // FPU Operations.
+//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
+// "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
+let isCompare = 1, hasSideEffects = 0 in {
+ def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB),
+ "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+ def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
+ "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
+}
+
+let Uses = [RM] in {
+ let hasSideEffects = 0 in {
+ defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctiw", "$frD, $frB", IIC_FPGeneral,
+ []>;
+ defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctiwz", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfctiwz f64:$frB))]>;
+
+ defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
+ "frsp", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fpround f64:$frB))]>;
+
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+ defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
+ "frin", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fround f64:$frB))]>;
+ defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
+ "frin", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fround f32:$frB))]>;
+ }
+
+ let hasSideEffects = 0 in {
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+ defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB),
+ "frip", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fceil f64:$frB))]>;
+ defm FRIPS : XForm_26r<63, 456, (outs f4rc:$frD), (ins f4rc:$frB),
+ "frip", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fceil f32:$frB))]>;
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+ defm FRIZD : XForm_26r<63, 424, (outs f8rc:$frD), (ins f8rc:$frB),
+ "friz", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (ftrunc f64:$frB))]>;
+ defm FRIZS : XForm_26r<63, 424, (outs f4rc:$frD), (ins f4rc:$frB),
+ "friz", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (ftrunc f32:$frB))]>;
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+ defm FRIMD : XForm_26r<63, 488, (outs f8rc:$frD), (ins f8rc:$frB),
+ "frim", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (ffloor f64:$frB))]>;
+ defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB),
+ "frim", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (ffloor f32:$frB))]>;
+
+ defm FSQRT : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fsqrt", "$frD, $frB", IIC_FPSqrtD,
+ [(set f64:$frD, (fsqrt f64:$frB))]>;
+ defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fsqrts", "$frD, $frB", IIC_FPSqrtS,
+ [(set f32:$frD, (fsqrt f32:$frB))]>;
+ }
+ }
+}
+
+/// Note that FMR is defined as a pseudo-op on the PPC970 because it is
+/// often coalesced away and we don't want the dispatch group builder to think
+/// that it will fill a slot (which could cause the load of an LSU reject to
+/// sneak into a d-group with a store).
+let hasSideEffects = 0 in
+defm FMR : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fmr", "$frD, $frB", IIC_FPGeneral,
+ []>, // (set f32:$frD, f32:$frB)
+ PPC970_Unit_Pseudo;
+
+let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
+// These are artificially split into two different forms, for 4/8 byte FP.
+defm FABSS : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fabs", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fabs f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FABSD : XForm_26r<63, 264, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fabs", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fabs f64:$frB))]>;
+defm FNABSS : XForm_26r<63, 136, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fnabs", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fneg (fabs f32:$frB)))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNABSD : XForm_26r<63, 136, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fnabs", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fneg (fabs f64:$frB)))]>;
+defm FNEGS : XForm_26r<63, 40, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fneg", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fneg f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNEGD : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fneg", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fneg f64:$frB))]>;
+
+defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
+ "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
+ "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
+
+// Reciprocal estimates.
+defm FRE : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fre", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfre f64:$frB))]>;
+defm FRES : XForm_26r<59, 24, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fres", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (PPCfre f32:$frB))]>;
+defm FRSQRTE : XForm_26r<63, 26, (outs f8rc:$frD), (ins f8rc:$frB),
+ "frsqrte", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
+defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
+ "frsqrtes", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
+}
+
+// XL-Form instructions. Condition register logical ops.
+//
+let hasSideEffects = 0 in
+def MCRF : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA),
+ "mcrf $BF, $BFA", IIC_BrMCR>,
+ PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// FIXME: According to the ISA (section 2.5.1 of version 2.06), the
+// condition-register logical instructions have preferred forms. Specifically,
+// it is preferred that the bit specified by the BT field be in the same
+// condition register as that specified by the bit BB. We might want to account
+// for this via hinting the register allocator and anti-dep breakers, or we
+// could constrain the register class to force this constraint and then loosen
+// it during register allocation via convertToThreeAddress or some similar
+// mechanism.
+
+let isCommutable = 1 in {
+def CRAND : XLForm_1<19, 257, (outs crbitrc:$CRD),
+ (ins crbitrc:$CRA, crbitrc:$CRB),
+ "crand $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (and i1:$CRA, i1:$CRB))]>;
+
+def CRNAND : XLForm_1<19, 225, (outs crbitrc:$CRD),
+ (ins crbitrc:$CRA, crbitrc:$CRB),
+ "crnand $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (not (and i1:$CRA, i1:$CRB)))]>;
+
+def CROR : XLForm_1<19, 449, (outs crbitrc:$CRD),
+ (ins crbitrc:$CRA, crbitrc:$CRB),
+ "cror $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (or i1:$CRA, i1:$CRB))]>;
+
+def CRXOR : XLForm_1<19, 193, (outs crbitrc:$CRD),
+ (ins crbitrc:$CRA, crbitrc:$CRB),
+ "crxor $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (xor i1:$CRA, i1:$CRB))]>;
+
+def CRNOR : XLForm_1<19, 33, (outs crbitrc:$CRD),
+ (ins crbitrc:$CRA, crbitrc:$CRB),
+ "crnor $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (not (or i1:$CRA, i1:$CRB)))]>;
+
+def CREQV : XLForm_1<19, 289, (outs crbitrc:$CRD),
+ (ins crbitrc:$CRA, crbitrc:$CRB),
+ "creqv $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (not (xor i1:$CRA, i1:$CRB)))]>;
+} // isCommutable
+
+def CRANDC : XLForm_1<19, 129, (outs crbitrc:$CRD),
+ (ins crbitrc:$CRA, crbitrc:$CRB),
+ "crandc $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (and i1:$CRA, (not i1:$CRB)))]>;
+
+def CRORC : XLForm_1<19, 417, (outs crbitrc:$CRD),
+ (ins crbitrc:$CRA, crbitrc:$CRB),
+ "crorc $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>;
+
+let isCodeGenOnly = 1 in {
+def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins),
+ "creqv $dst, $dst, $dst", IIC_BrCR,
+ [(set i1:$dst, 1)]>;
+
+def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins),
+ "crxor $dst, $dst, $dst", IIC_BrCR,
+ [(set i1:$dst, 0)]>;
+
+let Defs = [CR1EQ], CRD = 6 in {
+def CR6SET : XLForm_1_ext<19, 289, (outs), (ins),
+ "creqv 6, 6, 6", IIC_BrCR,
+ [(PPCcr6set)]>;
+
+def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
+ "crxor 6, 6, 6", IIC_BrCR,
+ [(PPCcr6unset)]>;
+}
+}
+
+// XFX-Form instructions. Instructions that deal with SPRs.
+//
+
+def MFSPR : XFXForm_1<31, 339, (outs gprc:$RT), (ins i32imm:$SPR),
+ "mfspr $RT, $SPR", IIC_SprMFSPR>;
+def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
+ "mtspr $SPR, $RT", IIC_SprMTSPR>;
+
+def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
+ "mftb $RT, $SPR", IIC_SprMFTB>;
+
+// A pseudo-instruction used to implement the read of the 64-bit cycle counter
+// on a 32-bit target.
+let hasSideEffects = 1, usesCustomInserter = 1 in
+def ReadTB : Pseudo<(outs gprc:$lo, gprc:$hi), (ins),
+ "#ReadTB", []>;
+
+let Uses = [CTR] in {
+def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
+ "mfctr $rT", IIC_SprMFSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Defs = [CTR], Pattern = [(PPCmtctr i32:$rS)] in {
+def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
+ "mtctr $rS", IIC_SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
+let Pattern = [(int_ppc_mtctr i32:$rS)] in
+def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
+ "mtctr $rS", IIC_SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Defs = [LR] in {
+def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS),
+ "mtlr $rS", IIC_SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR] in {
+def MFLR : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins),
+ "mflr $rT", IIC_SprMFSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let isCodeGenOnly = 1 in {
+ // Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed
+ // like a GPR on the PPC970. As such, copies in and out have the same
+ // performance characteristics as an OR instruction.
+ def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS),
+ "mtspr 256, $rS", IIC_IntGeneral>,
+ PPC970_DGroup_Single, PPC970_Unit_FXU;
+ def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins),
+ "mfspr $rT, 256", IIC_IntGeneral>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+
+ def MTVRSAVEv : XFXForm_7_ext<31, 467, 256,
+ (outs VRSAVERC:$reg), (ins gprc:$rS),
+ "mtspr 256, $rS", IIC_IntGeneral>,
+ PPC970_DGroup_Single, PPC970_Unit_FXU;
+ def MFVRSAVEv : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT),
+ (ins VRSAVERC:$reg),
+ "mfspr $rT, 256", IIC_IntGeneral>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+// Aliases for mtvrsave/mfvrsave to mtspr/mfspr.
+def : InstAlias<"mtvrsave $rS", (MTVRSAVE gprc:$rS)>;
+def : InstAlias<"mfvrsave $rS", (MFVRSAVE gprc:$rS)>;
+
+// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register,
+// so we'll need to scavenge a register for it.
+let mayStore = 1 in
+def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F),
+ "#SPILL_VRSAVE", []>;
+
+// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously
+// spilled), so we'll need to scavenge a register for it.
+let mayLoad = 1 in
+def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
+ "#RESTORE_VRSAVE", []>;
+
+let hasSideEffects = 0 in {
+// mtocrf's input needs to be prepared by shifting by an amount dependent
+// on the cr register selected. Thus, post-ra anti-dep breaking must not
+// later change that register assignment.
+let hasExtraDefRegAllocReq = 1 in {
+def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST),
+ "mtocrf $FXM, $ST", IIC_BrMCRX>,
+ PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that
+// is dependent on the cr fields being set.
+def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS),
+ "mtcrf $FXM, $rS", IIC_BrMCRX>,
+ PPC970_MicroCode, PPC970_Unit_CRU;
+} // hasExtraDefRegAllocReq = 1
+
+// mfocrf's input needs to be prepared by shifting by an amount dependent
+// on the cr register selected. Thus, post-ra anti-dep breaking must not
+// later change that register assignment.
+let hasExtraSrcRegAllocReq = 1 in {
+def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
+ "mfocrf $rT, $FXM", IIC_SprMFCRF>,
+ PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that
+// is dependent on the cr fields being copied.
+def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins),
+ "mfcr $rT", IIC_SprMFCR>,
+ PPC970_MicroCode, PPC970_Unit_CRU;
+} // hasExtraSrcRegAllocReq = 1
+
+def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins),
+ "mcrxrx $BF", IIC_BrMCRX>, Requires<[IsISA3_0]>;
+} // hasSideEffects = 0
+
+// Pseudo instruction to perform FADD in round-to-zero mode.
+let usesCustomInserter = 1, Uses = [RM] in {
+ def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
+ [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>;
+}
+
+// The above pseudo gets expanded to make use of the following instructions
+// to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level.
+let Uses = [RM], Defs = [RM] in {
+ def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
+ "mtfsb0 $FM", IIC_IntMTFSB0, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+ def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
+ "mtfsb1 $FM", IIC_IntMTFSB0, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+ let isCodeGenOnly = 1 in
+ def MTFSFb : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
+ "mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+let Uses = [RM] in {
+ def MFFS : XForm_42<63, 583, (outs f8rc:$rT), (ins),
+ "mffs $rT", IIC_IntMFFS,
+ [(set f64:$rT, (PPCmffs))]>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+ let Defs = [CR1] in
+ def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins),
+ "mffs. $rT", IIC_IntMFFS, []>, isDOT;
+}
+
+
+let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
+// XO-Form instructions. Arithmetic instructions that can set the overflow bit.
+let isCommutable = 1 in
+defm ADD4 : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "add", "$rT, $rA, $rB", IIC_IntSimple,
+ [(set i32:$rT, (add i32:$rA, i32:$rB))]>;
+let isCodeGenOnly = 1 in
+def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, tlsreg32:$rB),
+ "add $rT, $rA, $rB", IIC_IntSimple,
+ [(set i32:$rT, (add i32:$rA, tglobaltlsaddr:$rB))]>;
+let isCommutable = 1 in
+defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "addc", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i32:$rT, (addc i32:$rA, i32:$rB))]>,
+ PPC970_DGroup_Cracked;
+
+defm DIVW : XOForm_1rcr<31, 491, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "divw", "$rT, $rA, $rB", IIC_IntDivW,
+ [(set i32:$rT, (sdiv i32:$rA, i32:$rB))]>;
+defm DIVWU : XOForm_1rcr<31, 459, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "divwu", "$rT, $rA, $rB", IIC_IntDivW,
+ [(set i32:$rT, (udiv i32:$rA, i32:$rB))]>;
+def DIVWE : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "divwe $rT, $rA, $rB", IIC_IntDivW,
+ [(set i32:$rT, (int_ppc_divwe gprc:$rA, gprc:$rB))]>,
+ Requires<[HasExtDiv]>;
+let Defs = [CR0] in
+def DIVWEo : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "divwe. $rT, $rA, $rB", IIC_IntDivW,
+ []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
+ Requires<[HasExtDiv]>;
+def DIVWEU : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "divweu $rT, $rA, $rB", IIC_IntDivW,
+ [(set i32:$rT, (int_ppc_divweu gprc:$rA, gprc:$rB))]>,
+ Requires<[HasExtDiv]>;
+let Defs = [CR0] in
+def DIVWEUo : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "divweu. $rT, $rA, $rB", IIC_IntDivW,
+ []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
+ Requires<[HasExtDiv]>;
+let isCommutable = 1 in {
+defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "mulhw", "$rT, $rA, $rB", IIC_IntMulHW,
+ [(set i32:$rT, (mulhs i32:$rA, i32:$rB))]>;
+defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU,
+ [(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>;
+defm MULLW : XOForm_1r<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "mullw", "$rT, $rA, $rB", IIC_IntMulHW,
+ [(set i32:$rT, (mul i32:$rA, i32:$rB))]>;
+} // isCommutable
+defm SUBF : XOForm_1r<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "subf", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i32:$rT, (sub i32:$rB, i32:$rA))]>;
+defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "subfc", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i32:$rT, (subc i32:$rB, i32:$rA))]>,
+ PPC970_DGroup_Cracked;
+defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$rT), (ins gprc:$rA),
+ "neg", "$rT, $rA", IIC_IntSimple,
+ [(set i32:$rT, (ineg i32:$rA))]>;
+let Uses = [CARRY] in {
+let isCommutable = 1 in
+defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "adde", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i32:$rT, (adde i32:$rA, i32:$rB))]>;
+defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$rT), (ins gprc:$rA),
+ "addme", "$rT, $rA", IIC_IntGeneral,
+ [(set i32:$rT, (adde i32:$rA, -1))]>;
+defm ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$rT), (ins gprc:$rA),
+ "addze", "$rT, $rA", IIC_IntGeneral,
+ [(set i32:$rT, (adde i32:$rA, 0))]>;
+defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "subfe", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i32:$rT, (sube i32:$rB, i32:$rA))]>;
+defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$rT), (ins gprc:$rA),
+ "subfme", "$rT, $rA", IIC_IntGeneral,
+ [(set i32:$rT, (sube -1, i32:$rA))]>;
+defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
+ "subfze", "$rT, $rA", IIC_IntGeneral,
+ [(set i32:$rT, (sube 0, i32:$rA))]>;
+}
+}
+
+// A-Form instructions. Most of the instructions executed in the FPU are of
+// this type.
+//
+let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
+let Uses = [RM] in {
+let isCommutable = 1 in {
+ defm FMADD : AForm_1r<63, 29,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+ "fmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
+ defm FMADDS : AForm_1r<59, 29,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+ "fmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
+ defm FMSUB : AForm_1r<63, 28,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+ "fmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set f64:$FRT,
+ (fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
+ defm FMSUBS : AForm_1r<59, 28,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+ "fmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT,
+ (fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
+ defm FNMADD : AForm_1r<63, 31,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+ "fnmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set f64:$FRT,
+ (fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
+ defm FNMADDS : AForm_1r<59, 31,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+ "fnmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT,
+ (fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
+ defm FNMSUB : AForm_1r<63, 30,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+ "fnmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
+ (fneg f64:$FRB))))]>;
+ defm FNMSUBS : AForm_1r<59, 30,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+ "fnmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
+ (fneg f32:$FRB))))]>;
+} // isCommutable
+}
+// FSEL is artificially split into 4 and 8-byte forms for the result. To avoid
+// having 4 of these, force the comparison to always be an 8-byte double (code
+// should use an FMRSD if the input comparison value really wants to be a float)
+// and provide 4/8-byte forms for the result and operand types.
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FSELD : AForm_1r<63, 23,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+ "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f64:$FRT, (PPCfsel f64:$FRA, f64:$FRC, f64:$FRB))]>;
+defm FSELS : AForm_1r<63, 23,
+ (outs f4rc:$FRT), (ins f8rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+ "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>;
+let Uses = [RM] in {
+ let isCommutable = 1 in {
+ defm FADD : AForm_2r<63, 21,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
+ "fadd", "$FRT, $FRA, $FRB", IIC_FPAddSub,
+ [(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>;
+ defm FADDS : AForm_2r<59, 21,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
+ "fadds", "$FRT, $FRA, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>;
+ } // isCommutable
+ defm FDIV : AForm_2r<63, 18,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
+ "fdiv", "$FRT, $FRA, $FRB", IIC_FPDivD,
+ [(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>;
+ defm FDIVS : AForm_2r<59, 18,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
+ "fdivs", "$FRT, $FRA, $FRB", IIC_FPDivS,
+ [(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>;
+ let isCommutable = 1 in {
+ defm FMUL : AForm_3r<63, 25,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC),
+ "fmul", "$FRT, $FRA, $FRC", IIC_FPFused,
+ [(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>;
+ defm FMULS : AForm_3r<59, 25,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC),
+ "fmuls", "$FRT, $FRA, $FRC", IIC_FPGeneral,
+ [(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>;
+ } // isCommutable
+ defm FSUB : AForm_2r<63, 20,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
+ "fsub", "$FRT, $FRA, $FRB", IIC_FPAddSub,
+ [(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>;
+ defm FSUBS : AForm_2r<59, 20,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
+ "fsubs", "$FRT, $FRA, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>;
+ }
+}
+
+let hasSideEffects = 0 in {
+let PPC970_Unit = 1 in { // FXU Operations.
+ let isSelect = 1 in
+ def ISEL : AForm_4<31, 15,
+ (outs gprc:$rT), (ins gprc_nor0:$rA, gprc:$rB, crbitrc:$cond),
+ "isel $rT, $rA, $rB, $cond", IIC_IntISEL,
+ []>;
+}
+
+let PPC970_Unit = 1 in { // FXU Operations.
+// M-Form instructions. Rotate and mask instructions.
+//
+let isCommutable = 1 in {
+// RLWIMI can be commuted if the rotate amount is zero.
+defm RLWIMI : MForm_2r<20, (outs gprc:$rA),
+ (ins gprc:$rSi, gprc:$rS, u5imm:$SH, u5imm:$MB,
+ u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
+ IIC_IntRotate, []>, PPC970_DGroup_Cracked,
+ RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
+}
+let BaseName = "rlwinm" in {
+def RLWINM : MForm_2<21,
+ (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+ "rlwinm $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
+ []>, RecFormRel;
+let Defs = [CR0] in
+def RLWINMo : MForm_2<21,
+ (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+ "rlwinm. $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
+ []>, isDOT, RecFormRel, PPC970_DGroup_Cracked;
+}
+defm RLWNM : MForm_2r<23, (outs gprc:$rA),
+ (ins gprc:$rS, gprc:$rB, u5imm:$MB, u5imm:$ME),
+ "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
+ []>;
+}
+} // hasSideEffects = 0
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Patterns
+//
+
+// Arbitrary immediate support. Implement in terms of LIS/ORI.
+def : Pat<(i32 imm:$imm),
+ (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Implement the 'not' operation with the NOR instruction.
+def i32not : OutPatFrag<(ops node:$in),
+ (NOR $in, $in)>;
+def : Pat<(not i32:$in),
+ (i32not $in)>;
+
+// ADD an arbitrary immediate.
+def : Pat<(add i32:$in, imm:$imm),
+ (ADDIS (ADDI $in, (LO16 imm:$imm)), (HA16 imm:$imm))>;
+// OR an arbitrary immediate.
+def : Pat<(or i32:$in, imm:$imm),
+ (ORIS (ORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// XOR an arbitrary immediate.
+def : Pat<(xor i32:$in, imm:$imm),
+ (XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// SUBFIC
+def : Pat<(sub imm32SExt16:$imm, i32:$in),
+ (SUBFIC $in, imm:$imm)>;
+
+// SHL/SRL
+def : Pat<(shl i32:$in, (i32 imm:$imm)),
+ (RLWINM $in, imm:$imm, 0, (SHL32 imm:$imm))>;
+def : Pat<(srl i32:$in, (i32 imm:$imm)),
+ (RLWINM $in, (SRL32 imm:$imm), imm:$imm, 31)>;
+
+// ROTL
+def : Pat<(rotl i32:$in, i32:$sh),
+ (RLWNM $in, $sh, 0, 31)>;
+def : Pat<(rotl i32:$in, (i32 imm:$imm)),
+ (RLWINM $in, imm:$imm, 0, 31)>;
+
+// RLWNM
+def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm),
+ (RLWNM $in, $sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>;
+
+// Calls
+def : Pat<(PPCcall (i32 tglobaladdr:$dst)),
+ (BL tglobaladdr:$dst)>;
+def : Pat<(PPCcall (i32 texternalsym:$dst)),
+ (BL texternalsym:$dst)>;
+
+def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm),
+ (TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm),
+ (TCRETURNdi texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
+ (TCRETURNri CTRRC:$dst, imm:$imm)>;
+
+
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, i32:$in),
+ (ADDIS $in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, i32:$in),
+ (ADDI $in, tglobaltlsaddr:$g)>;
+def : Pat<(add i32:$in, (PPChi tglobaladdr:$g, 0)),
+ (ADDIS $in, tglobaladdr:$g)>;
+def : Pat<(add i32:$in, (PPChi tconstpool:$g, 0)),
+ (ADDIS $in, tconstpool:$g)>;
+def : Pat<(add i32:$in, (PPChi tjumptable:$g, 0)),
+ (ADDIS $in, tjumptable:$g)>;
+def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)),
+ (ADDIS $in, tblockaddress:$g)>;
+
+// Support for thread-local storage.
+def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT",
+ [(set i32:$rD, (PPCppc32GOT))]>;
+
+// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
+// This uses two output registers, the first as the real output, the second as a
+// temporary register, used internally in code generation.
+def PPC32PICGOT: Pseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT",
+ []>, NoEncode<"$rT">;
+
+def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
+ "#LDgotTprelL32",
+ [(set i32:$rD,
+ (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
+def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g),
+ (ADD4TLS $in, tglobaltlsaddr:$g)>;
+
+def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+ "#ADDItlsgdL32",
+ [(set i32:$rD,
+ (PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>;
+// LR is a true define, while the rest of the Defs are clobbers. R3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+ Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+ "GETtlsADDR32",
+ [(set i32:$rD,
+ (PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>;
+// Combined op for ADDItlsgdL32 and GETtlsADDR32, late expanded. R3 and LR
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+ Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def ADDItlsgdLADDR32 : Pseudo<(outs gprc:$rD),
+ (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
+ "#ADDItlsgdLADDR32",
+ [(set i32:$rD,
+ (PPCaddiTlsgdLAddr i32:$reg,
+ tglobaltlsaddr:$disp,
+ tglobaltlsaddr:$sym))]>;
+def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+ "#ADDItlsldL32",
+ [(set i32:$rD,
+ (PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>;
+// LR is a true define, while the rest of the Defs are clobbers. R3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+ Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+ "GETtlsldADDR32",
+ [(set i32:$rD,
+ (PPCgetTlsldAddr i32:$reg,
+ tglobaltlsaddr:$sym))]>;
+// Combined op for ADDItlsldL32 and GETtlsldADDR32, late expanded. R3 and LR
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+ Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def ADDItlsldLADDR32 : Pseudo<(outs gprc:$rD),
+ (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
+ "#ADDItlsldLADDR32",
+ [(set i32:$rD,
+ (PPCaddiTlsldLAddr i32:$reg,
+ tglobaltlsaddr:$disp,
+ tglobaltlsaddr:$sym))]>;
+def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+ "#ADDIdtprelL32",
+ [(set i32:$rD,
+ (PPCaddiDtprelL i32:$reg, tglobaltlsaddr:$disp))]>;
+def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+ "#ADDISdtprelHA32",
+ [(set i32:$rD,
+ (PPCaddisDtprelHA i32:$reg,
+ tglobaltlsaddr:$disp))]>;
+
+// Support for Position-independent code
+def LWZtoc : Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
+ "#LWZtoc",
+ [(set i32:$rD,
+ (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
+// Get Global (GOT) Base Register offset, from the word immediately preceding
+// the function label.
+def UpdateGBR : Pseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>;
+
+
+// Standard shifts. These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
+// amounts.
+def : Pat<(sra i32:$rS, i32:$rB),
+ (SRAW $rS, $rB)>;
+def : Pat<(srl i32:$rS, i32:$rB),
+ (SRW $rS, $rB)>;
+def : Pat<(shl i32:$rS, i32:$rB),
+ (SLW $rS, $rB)>;
+
+def : Pat<(zextloadi1 iaddr:$src),
+ (LBZ iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+ (LBZX xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+ (LBZ iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+ (LBZX xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+ (LBZ iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+ (LBZX xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+ (LHZ iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+ (LHZX xaddr:$src)>;
+def : Pat<(f64 (extloadf32 iaddr:$src)),
+ (COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>;
+def : Pat<(f64 (extloadf32 xaddr:$src)),
+ (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>;
+
+def : Pat<(f64 (fpextend f32:$src)),
+ (COPY_TO_REGCLASS $src, F8RC)>;
+
+// Only seq_cst fences require the heavyweight sync (SYNC 0).
+// All others can use the lightweight sync (SYNC 1).
+// source: http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+// The rule for seq_cst is duplicated to work with both 64 bits and 32 bits
+// versions of Power.
+def : Pat<(atomic_fence (i64 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(atomic_fence (i32 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(atomic_fence (imm), (imm)), (SYNC 1)>, Requires<[HasSYNC]>;
+def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+
+// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+ (FNMSUB $A, $C, $B)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+ (FNMSUB $A, $C, $B)>;
+def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
+ (FNMSUBS $A, $C, $B)>;
+def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
+ (FNMSUBS $A, $C, $B)>;
+
+// FCOPYSIGN's operand types need not agree.
+def : Pat<(fcopysign f64:$frB, f32:$frA),
+ (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
+def : Pat<(fcopysign f32:$frB, f64:$frA),
+ (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
+
+include "PPCInstrAltivec.td"
+include "PPCInstrSPE.td"
+include "PPCInstr64Bit.td"
+include "PPCInstrVSX.td"
+include "PPCInstrQPX.td"
+include "PPCInstrHTM.td"
+
+// crnot - Output fragment that inverts a single CR bit: NOR-ing a bit with
+// itself yields its complement.
+def crnot : OutPatFrag<(ops node:$in),
+ (CRNOR $in, $in)>;
+// Select a logical not of an i1 value through the crnot fragment.
+def : Pat<(not i1:$in),
+ (crnot $in)>;
+
+// Patterns for arithmetic i1 operations.
+// On a single bit, arithmetic is modulo 2: addition and subtraction both
+// reduce to XOR, and multiplication reduces to AND.
+def : Pat<(add i1:$a, i1:$b),
+ (CRXOR $a, $b)>;
+def : Pat<(sub i1:$a, i1:$b),
+ (CRXOR $a, $b)>;
+def : Pat<(mul i1:$a, i1:$b),
+ (CRAND $a, $b)>;
+
+// We're sometimes asked to materialize i1 -1, which is just 1 in this case
+// (-1 is used to mean all bits set).
+def : Pat<(i1 -1), (CRSET)>;
+
+// i1 extensions, implemented in terms of isel.
+def : Pat<(i32 (zext i1:$in)),
+ (SELECT_I4 $in, (LI 1), (LI 0))>;
+def : Pat<(i32 (sext i1:$in)),
+ (SELECT_I4 $in, (LI -1), (LI 0))>;
+
+def : Pat<(i64 (zext i1:$in)),
+ (SELECT_I8 $in, (LI8 1), (LI8 0))>;
+def : Pat<(i64 (sext i1:$in)),
+ (SELECT_I8 $in, (LI8 -1), (LI8 0))>;
+
+// FIXME: We should choose either a zext or a sext based on other constants
+// already around.
+def : Pat<(i32 (anyext i1:$in)),
+ (SELECT_I4 $in, (LI 1), (LI 0))>;
+def : Pat<(i64 (anyext i1:$in)),
+ (SELECT_I8 $in, (LI8 1), (LI8 0))>;
+
+// match setcc on i1 variables.
+// CRANDC is:
+// 1 1 : F
+// 1 0 : T
+// 0 1 : F
+// 0 0 : F
+//
+// LT is:
+// -1 -1 : F
+// -1 0 : T
+// 0 -1 : F
+// 0 0 : F
+//
+// ULT is:
+// 1 1 : F
+// 1 0 : F
+// 0 1 : T
+// 0 0 : F
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLT)),
+ (CRANDC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULT)),
+ (CRANDC $s2, $s1)>;
+// CRORC is:
+// 1 1 : T
+// 1 0 : T
+// 0 1 : F
+// 0 0 : T
+//
+// LE is:
+// -1 -1 : T
+// -1 0 : T
+// 0 -1 : F
+// 0 0 : T
+//
+// ULE is:
+// 1 1 : T
+// 1 0 : F
+// 0 1 : T
+// 0 0 : T
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLE)),
+ (CRORC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULE)),
+ (CRORC $s2, $s1)>;
+
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETEQ)),
+ (CREQV $s1, $s2)>;
+
+// GE is:
+// -1 -1 : T
+// -1 0 : F
+// 0 -1 : T
+// 0 0 : T
+//
+// UGE is:
+// 1 1 : T
+// 1 0 : T
+// 0 1 : F
+// 0 0 : T
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGE)),
+ (CRORC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGE)),
+ (CRORC $s1, $s2)>;
+
+// GT is:
+// -1 -1 : F
+// -1 0 : F
+// 0 -1 : T
+// 0 0 : F
+//
+// UGT is:
+// 1 1 : F
+// 1 0 : T
+// 0 1 : F
+// 0 0 : F
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGT)),
+ (CRANDC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGT)),
+ (CRANDC $s1, $s2)>;
+
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETNE)),
+ (CRXOR $s1, $s2)>;
+
+// match setcc on non-i1 (non-vector) variables. Note that SETUEQ, SETOGE,
+// SETOLE, SETONE, SETULT and SETUGT should be expanded by legalize for
+// floating-point types.
+
+// CRNotPat - Patterns for a comparison whose value is the complement of an
+// available CR bit.
+//   pattern - the i1-producing input dag to match.
+//   result  - an output dag computing the *inverse* of `pattern`.
+// Besides the plain and negated forms, the multiclass emits zext/sext/anyext
+// patterns in which the inversion is absorbed by swapping the select operands.
+multiclass CRNotPat<dag pattern, dag result> {
+ // `pattern` is the complement of `result`, and `not pattern` is `result`
+ // itself.
+ def : Pat<pattern, (crnot result)>;
+ def : Pat<(not pattern), result>;
+
+ // We can also fold the crnot into an i32 extension (the select's true and
+ // false values are swapped relative to the plain i1 extension patterns):
+ def : Pat<(i32 (zext pattern)),
+ (SELECT_I4 result, (LI 0), (LI 1))>;
+ def : Pat<(i32 (sext pattern)),
+ (SELECT_I4 result, (LI 0), (LI -1))>;
+
+ // Likewise, fold the crnot into an i64 extension:
+ def : Pat<(i64 (zext pattern)),
+ (SELECT_I8 result, (LI8 0), (LI8 1))>;
+ def : Pat<(i64 (sext pattern)),
+ (SELECT_I8 result, (LI8 0), (LI8 -1))>;
+
+ // FIXME: We should choose either a zext or a sext based on other constants
+ // already around.
+ def : Pat<(i32 (anyext pattern)),
+ (SELECT_I4 result, (LI 0), (LI 1))>;
+
+ def : Pat<(i64 (anyext pattern)),
+ (SELECT_I8 result, (LI8 0), (LI8 1))>;
+}
+
+// FIXME: Because of what seems like a bug in TableGen's type-inference code,
+// we need to write imm:$imm in the output patterns below, not just $imm, or
+// else the resulting matcher will not correctly add the immediate operand
+// (making it a register operand instead).
+
+// extended SETCC.
+// ExtSetCCPat - Patterns for a zero- or any-extended setcc of one operand
+// against a constant, computed directly in a GPR.
+//   cc     - the condition code to match.
+//   pfrag  - input fragment taking (operand, cc) and producing the i1 setcc.
+//   rfrag  - output fragment producing the result from an i32 operand.
+//   rfrag8 - output fragment producing the result from an i64 operand.
+// When the operand and result widths differ, the result is moved between
+// widths with INSERT_SUBREG / EXTRACT_SUBREG on sub_32.
+multiclass ExtSetCCPat<CondCode cc, PatFrag pfrag,
+ OutPatFrag rfrag, OutPatFrag rfrag8> {
+ // zext forms: same-width i32 and i64, then i32->i64 and i64->i32.
+ def : Pat<(i32 (zext (i1 (pfrag i32:$s1, cc)))),
+ (rfrag $s1)>;
+ def : Pat<(i64 (zext (i1 (pfrag i64:$s1, cc)))),
+ (rfrag8 $s1)>;
+ def : Pat<(i64 (zext (i1 (pfrag i32:$s1, cc)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
+ def : Pat<(i32 (zext (i1 (pfrag i64:$s1, cc)))),
+ (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
+
+ // anyext forms: selected exactly like the corresponding zext forms.
+ def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, cc)))),
+ (rfrag $s1)>;
+ def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, cc)))),
+ (rfrag8 $s1)>;
+ def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, cc)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
+ def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, cc)))),
+ (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
+}
+
+// Note that we do all inversions below with i(32|64)not, instead of using
+// (xori x, 1) because on the A2 nor has single-cycle latency while xori
+// has 2-cycle latency.
+
+// x == 0: the leading-zero count equals the register width (32 or 64) only
+// when x is zero, so bit 5 (resp. bit 6) of the count is the answer. The
+// rotate-and-mask moves that bit into the least significant position.
+defm : ExtSetCCPat<SETEQ,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (CNTLZW $in), 27, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (CNTLZD $in), 58, 63)> >;
+
+// x != 0: same as the == 0 case, but the count is complemented first so the
+// extracted bit is inverted.
+defm : ExtSetCCPat<SETNE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (i32not (CNTLZW $in)), 27, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (i64not (CNTLZD $in)), 58, 63)> >;
+
+// x < 0: rotating left by one and keeping only the low bit extracts the sign
+// bit.
+defm : ExtSetCCPat<SETLT,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM $in, 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL $in, 1, 63)> >;
+
+// x >= 0: the sign bit of ~x.
+defm : ExtSetCCPat<SETGE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (i32not $in), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (i64not $in), 1, 63)> >;
+
+// x > 0: the sign bit of (-x & ~x), which is set only when -x is negative and
+// x is non-negative.
+defm : ExtSetCCPat<SETGT,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (ANDC (NEG $in), $in), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (ANDC8 (NEG8 $in), $in), 1, 63)> >;
+
+// x <= 0: the sign bit of (x | ~(-x)), the complement of the x > 0 test.
+defm : ExtSetCCPat<SETLE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (ORC $in, (NEG $in)), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (ORC8 $in, (NEG8 $in)), 1, 63)> >;
+
+// x < -1: the sign bit of (x & (x + 1)), set only when both x and x + 1 are
+// negative.
+defm : ExtSetCCPat<SETLT,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, -1, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (AND $in, (ADDI $in, 1)), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (AND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
+
+// x >= -1: the complement of the x < -1 test, using NAND instead of AND.
+defm : ExtSetCCPat<SETGE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, -1, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (NAND $in, (ADDI $in, 1)), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (NAND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
+
+// x > -1 is x >= 0: the sign bit of ~x.
+defm : ExtSetCCPat<SETGT,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, -1, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (i32not $in), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (i64not $in), 1, 63)> >;
+
+// x <= -1 is x < 0: the sign bit of x.
+defm : ExtSetCCPat<SETLE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, -1, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM $in, 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL $in, 1, 63)> >;
+
+// An extended SETCC with shift amount.
+// ExtSetCCShiftPat - Like ExtSetCCPat, but the matched fragment and the
+// output fragments take an additional i32 shift-amount operand ($sa).
+//   cc     - the condition code to match.
+//   pfrag  - input fragment taking (operand, shift amount, cc).
+//   rfrag  - output fragment for an i32 operand, taking (operand, $sa).
+//   rfrag8 - output fragment for an i64 operand, taking (operand, $sa).
+multiclass ExtSetCCShiftPat<CondCode cc, PatFrag pfrag,
+ OutPatFrag rfrag, OutPatFrag rfrag8> {
+ // zext forms: same-width i32 and i64, then i32->i64 and i64->i32 via sub_32.
+ def : Pat<(i32 (zext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
+ (rfrag $s1, $sa)>;
+ def : Pat<(i64 (zext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
+ (rfrag8 $s1, $sa)>;
+ def : Pat<(i64 (zext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1, $sa), sub_32)>;
+ def : Pat<(i32 (zext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
+ (EXTRACT_SUBREG (rfrag8 $s1, $sa), sub_32)>;
+
+ // anyext forms: selected exactly like the corresponding zext forms.
+ def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
+ (rfrag $s1, $sa)>;
+ def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
+ (rfrag8 $s1, $sa)>;
+ def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1, $sa), sub_32)>;
+ def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
+ (EXTRACT_SUBREG (rfrag8 $s1, $sa), sub_32)>;
+}
+
+// (x & (1 << sa)) != 0, i.e. "bit sa of x is set": rotating left by
+// (width - sa) is a rotate right by sa, which brings bit sa into the least
+// significant position, where the mask keeps it alone.
+defm : ExtSetCCShiftPat<SETNE,
+ PatFrag<(ops node:$in, node:$sa, node:$cc),
+ (setcc (and $in, (shl 1, $sa)), 0, $cc)>,
+ OutPatFrag<(ops node:$in, node:$sa),
+ (RLWNM $in, (SUBFIC $sa, 32), 31, 31)>,
+ OutPatFrag<(ops node:$in, node:$sa),
+ (RLDCL $in, (SUBFIC $sa, 64), 63)> >;
+
+// (x & (1 << sa)) == 0, i.e. "bit sa of x is clear": the same extraction
+// applied to ~x.
+defm : ExtSetCCShiftPat<SETEQ,
+ PatFrag<(ops node:$in, node:$sa, node:$cc),
+ (setcc (and $in, (shl 1, $sa)), 0, $cc)>,
+ OutPatFrag<(ops node:$in, node:$sa),
+ (RLWNM (i32not $in),
+ (SUBFIC $sa, 32), 31, 31)>,
+ OutPatFrag<(ops node:$in, node:$sa),
+ (RLDCL (i64not $in),
+ (SUBFIC $sa, 64), 63)> >;
+
+// SETCC for i32.
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULT)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLT)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGT)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGT)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+// For comparisons against an immediate that does not fit in 16 bits, the
+// default code would materialize the constant, then compare against it, like
+// this:
+// lis r2, 4660
+// ori r2, r2, 22136
+// cmpw cr0, r3, r2
+// beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+// xoris r0,r3,0x1234
+// cmplwi cr0,r0,0x5678
+// beq cr0,L6
+
+def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETUGT)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+// SETCC for i64.
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLT)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGT)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGT)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+// For comparisons against an immediate that does not fit in 16 bits, the
+// default code would materialize the constant, then compare against it, like
+// this:
+// lis r2, 4660
+// ori r2, r2, 22136
+// cmpd cr0, r3, r2
+// beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+// xoris r0,r3,0x1234
+// cmpldi cr0,r0,0x5678
+// beq cr0,L6
+
+def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULE)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLE)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETULE)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+// SETCC for f32.
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+
+// SETCC for f64.
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+
+// match select on i1 variables:
+// The result is built from CR-bit logic as (cond & tval) | (!cond & fval).
+def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)),
+ (CROR (CRAND $cond , $tval),
+ (CRAND (crnot $cond), $fval))>;
+
+// match selectcc on i1 variables:
+// select (lhs == rhs), tval, fval is:
+// ((lhs == rhs) & tval) | (!(lhs == rhs) & fval)
+//
+// For every condition code the first CRAND gates $tval with the CR-logical
+// form of the comparison and the second gates $fval with its complement.
+// The complement of (CRANDC a, b) is (CRORC b, a), and the complement of
+// (CREQV a, b) is (CRXOR a, b). Signed and unsigned orderings differ only in
+// which operand plays which role, because a set i1 is -1 when signed but 1
+// when unsigned.
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLT)),
+ (CROR (CRAND (CRANDC $lhs, $rhs), $tval),
+ (CRAND (CRORC $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULT)),
+ (CROR (CRAND (CRANDC $rhs, $lhs), $tval),
+ (CRAND (CRORC $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLE)),
+ (CROR (CRAND (CRORC $lhs, $rhs), $tval),
+ (CRAND (CRANDC $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULE)),
+ (CROR (CRAND (CRORC $rhs, $lhs), $tval),
+ (CRAND (CRANDC $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETEQ)),
+ (CROR (CRAND (CREQV $lhs, $rhs), $tval),
+ (CRAND (CRXOR $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGE)),
+ (CROR (CRAND (CRORC $rhs, $lhs), $tval),
+ (CRAND (CRANDC $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGE)),
+ (CROR (CRAND (CRORC $lhs, $rhs), $tval),
+ (CRAND (CRANDC $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGT)),
+ (CROR (CRAND (CRANDC $rhs, $lhs), $tval),
+ (CRAND (CRORC $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGT)),
+ (CROR (CRAND (CRANDC $lhs, $rhs), $tval),
+ (CRAND (CRORC $rhs, $lhs), $fval))>;
+// SETNE reuses the SETEQ terms with $tval and $fval exchanged.
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETNE)),
+ (CROR (CRAND (CREQV $lhs, $rhs), $fval),
+ (CRAND (CRXOR $lhs, $rhs), $tval))>;
+
+// match selectcc on i1 variables with non-i1 output.
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLT)),
+ (SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETULT)),
+ (SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLE)),
+ (SELECT_I4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETULE)),
+ (SELECT_I4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETEQ)),
+ (SELECT_I4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGE)),
+ (SELECT_I4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETUGE)),
+ (SELECT_I4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGT)),
+ (SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETUGT)),
+ (SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETNE)),
+ (SELECT_I4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLT)),
+ (SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETULT)),
+ (SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLE)),
+ (SELECT_I8 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETULE)),
+ (SELECT_I8 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETEQ)),
+ (SELECT_I8 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGE)),
+ (SELECT_I8 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGE)),
+ (SELECT_I8 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGT)),
+ (SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGT)),
+ (SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETNE)),
+ (SELECT_I8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+ (SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
+ (SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
+ (SELECT_F4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
+ (SELECT_F4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
+ (SELECT_F4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
+ (SELECT_F4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
+ (SELECT_F4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
+ (SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
+ (SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
+ (SELECT_F4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+ (SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
+ (SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+ (SELECT_F8 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
+ (SELECT_F8 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+ (SELECT_F8 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+ (SELECT_F8 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
+ (SELECT_F8 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+ (SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
+ (SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+ (SELECT_F8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)),
+ (SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULT)),
+ (SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLE)),
+ (SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULE)),
+ (SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETEQ)),
+ (SELECT_VRRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGE)),
+ (SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGE)),
+ (SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGT)),
+ (SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGT)),
+ (SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)),
+ (SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+// Pseudo instructions that truncate a GPR value to an i1 held in a CR bit.
+// They set usesCustomInserter, so their expansion is not defined in this file.
+// The *_GT_BIT forms select (trunc x) and the *_EQ_BIT forms select
+// (trunc (not x)); the "8" suffix marks the variants taking a 64-bit (g8rc)
+// input.
+// NOTE(review): the names suggest an expansion to "andi. x, 1" followed by a
+// read of the gt or eq bit of the resulting CR field -- confirm against the
+// custom inserter.
+let usesCustomInserter = 1 in {
+def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+ "#ANDIo_1_EQ_BIT",
+ [(set i1:$dst, (trunc (not i32:$in)))]>;
+def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+ "#ANDIo_1_GT_BIT",
+ [(set i1:$dst, (trunc i32:$in))]>;
+
+def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+ "#ANDIo_1_EQ_BIT8",
+ [(set i1:$dst, (trunc (not i64:$in)))]>;
+def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+ "#ANDIo_1_GT_BIT8",
+ [(set i1:$dst, (trunc i64:$in))]>;
+}
+
+// (not (trunc x)) has the same low bit as (trunc (not x)), so the negation
+// applied after the truncate is routed to the same EQ_BIT pseudos.
+def : Pat<(i1 (not (trunc i32:$in))),
+ (ANDIo_1_EQ_BIT $in)>;
+def : Pat<(i1 (not (trunc i64:$in))),
+ (ANDIo_1_EQ_BIT8 $in)>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instructions used for assembler/disassembler only
+//
+
+// FIXME: For B=0 or B > 8, the registers following RT are used.
+// WARNING: Do not add patterns for this instruction without fixing this.
+def LSWI : XForm_base_r3xo<31, 597, (outs gprc:$RT), (ins gprc:$A, u5imm:$B),
+ "lswi $RT, $A, $B", IIC_LdStLoad, []>;
+
+// FIXME: For B=0 or B > 8, the registers following RT are used.
+// WARNING: Do not add patterns for this instruction without fixing this.
+def STSWI : XForm_base_r3xo<31, 725, (outs), (ins gprc:$RT, gprc:$A, u5imm:$B),
+ "stswi $RT, $A, $B", IIC_LdStLoad, []>;
+
+def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
+ "isync", IIC_SprISYNC, []>;
+
+def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
+ "icbi $src", IIC_LdStICBI, []>;
+
+// This def used to be named EIEIO, but E[0-9A-Z] is a reserved name.
+def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
+ "eieio", IIC_LdStLoad, []>;
+
+def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
+ "wait $L", IIC_LdStLoad, []>;
+
+def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO),
+ "mbar $MO", IIC_LdStLoad>, Requires<[IsBookE]>;
+
+def MTSR: XForm_sr<31, 210, (outs), (ins gprc:$RS, u4imm:$SR),
+ "mtsr $SR, $RS", IIC_SprMTSR>;
+
+def MFSR: XForm_sr<31, 595, (outs gprc:$RS), (ins u4imm:$SR),
+ "mfsr $RS, $SR", IIC_SprMFSR>;
+
+def MTSRIN: XForm_srin<31, 242, (outs), (ins gprc:$RS, gprc:$RB),
+ "mtsrin $RS, $RB", IIC_SprMTSR>;
+
+def MFSRIN: XForm_srin<31, 659, (outs gprc:$RS), (ins gprc:$RB),
+ "mfsrin $RS, $RB", IIC_SprMFSR>;
+
+def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
+ "mtmsr $RS, $L", IIC_SprMTMSR>;
+
+def WRTEE: XForm_mtmsr<31, 131, (outs), (ins gprc:$RS),
+ "wrtee $RS", IIC_SprMTMSR>, Requires<[IsBookE]> {
+ let L = 0;
+}
+
+def WRTEEI: I<31, (outs), (ins i1imm:$E), "wrteei $E", IIC_SprMTMSR>,
+ Requires<[IsBookE]> {
+ bits<1> E;
+
+ let Inst{16} = E;
+ let Inst{21-30} = 163;
+}
+
+def DCCCI : XForm_tlb<454, (outs), (ins gprc:$A, gprc:$B),
+ "dccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;
+def ICCCI : XForm_tlb<966, (outs), (ins gprc:$A, gprc:$B),
+ "iccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"dci 0", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"dccci", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"ici 0", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"iccci", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
+
+def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
+ "mfmsr $RT", IIC_SprMFMSR, []>;
+
+def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
+ "mtmsrd $RS, $L", IIC_SprMTMSRD>;
+
+def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA),
+ "mcrfs $BF, $BFA", IIC_BrMCR>;
+
+def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
+ "mtfsfi $BF, $U, $W", IIC_IntMFFS>;
+
+def MTFSFIo : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
+ "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isDOT;
+
+def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>;
+def : InstAlias<"mtfsfi. $BF, $U", (MTFSFIo crrc:$BF, i32imm:$U, 0)>;
+
+def MTFSF : XFLForm_1<63, 711, (outs),
+ (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
+ "mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>;
+def MTFSFo : XFLForm_1<63, 711, (outs),
+ (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
+ "mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isDOT;
+
+def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>;
+def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSFo i32imm:$FLM, f8rc:$FRB, 0, 0)>;
+
+def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
+ "slbie $RB", IIC_SprSLBIE, []>;
+
+def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB),
+ "slbmte $RS, $RB", IIC_SprSLBMTE, []>;
+
+def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB),
+ "slbmfee $RT, $RB", IIC_SprSLBMFEE, []>;
+
+def SLBMFEV : XLForm_1_gen<31, 851, (outs gprc:$RT), (ins gprc:$RB),
+ "slbmfev $RT, $RB", IIC_SprSLBMFEV, []>;
+
+def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>;
+
+def TLBIA : XForm_0<31, 370, (outs), (ins),
+ "tlbia", IIC_SprTLBIA, []>;
+
+def TLBSYNC : XForm_0<31, 566, (outs), (ins),
+ "tlbsync", IIC_SprTLBSYNC, []>;
+
+def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
+ "tlbiel $RB", IIC_SprTLBIEL, []>;
+
+def TLBLD : XForm_16b<31, 978, (outs), (ins gprc:$RB),
+ "tlbld $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
+def TLBLI : XForm_16b<31, 1010, (outs), (ins gprc:$RB),
+ "tlbli $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
+
+def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
+ "tlbie $RB,$RS", IIC_SprTLBIE, []>;
+
+def TLBSX : XForm_tlb<914, (outs), (ins gprc:$A, gprc:$B), "tlbsx $A, $B",
+ IIC_LdStLoad>, Requires<[IsBookE]>;
+
+def TLBIVAX : XForm_tlb<786, (outs), (ins gprc:$A, gprc:$B), "tlbivax $A, $B",
+ IIC_LdStLoad>, Requires<[IsBookE]>;
+
+def TLBRE : XForm_24_eieio<31, 946, (outs), (ins),
+ "tlbre", IIC_LdStLoad, []>, Requires<[IsBookE]>;
+
+def TLBWE : XForm_24_eieio<31, 978, (outs), (ins),
+ "tlbwe", IIC_LdStLoad, []>, Requires<[IsBookE]>;
+
+def TLBRE2 : XForm_tlbws<31, 946, (outs gprc:$RS), (ins gprc:$A, i1imm:$WS),
+ "tlbre $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;
+
+def TLBWE2 : XForm_tlbws<31, 978, (outs), (ins gprc:$RS, gprc:$A, i1imm:$WS),
+ "tlbwe $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;
+
+def TLBSX2 : XForm_base_r3xo<31, 914, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+ "tlbsx $RST, $A, $B", IIC_LdStLoad, []>,
+ Requires<[IsPPC4xx]>;
+def TLBSX2D : XForm_base_r3xo<31, 914, (outs),
+ (ins gprc:$RST, gprc:$A, gprc:$B),
+ "tlbsx. $RST, $A, $B", IIC_LdStLoad, []>,
+ Requires<[IsPPC4xx]>, isDOT;
+
+def RFID : XForm_0<19, 18, (outs), (ins), "rfid", IIC_IntRFID, []>;
+
+def RFI : XForm_0<19, 50, (outs), (ins), "rfi", IIC_SprRFI, []>,
+ Requires<[IsBookE]>;
+def RFCI : XForm_0<19, 51, (outs), (ins), "rfci", IIC_BrB, []>,
+ Requires<[IsBookE]>;
+
+def RFDI : XForm_0<19, 39, (outs), (ins), "rfdi", IIC_BrB, []>,
+ Requires<[IsE500]>;
+def RFMCI : XForm_0<19, 38, (outs), (ins), "rfmci", IIC_BrB, []>,
+ Requires<[IsE500]>;
+
+def MFDCR : XFXForm_1<31, 323, (outs gprc:$RT), (ins i32imm:$SPR),
+ "mfdcr $RT, $SPR", IIC_SprMFSPR>, Requires<[IsPPC4xx]>;
+def MTDCR : XFXForm_1<31, 451, (outs), (ins gprc:$RT, i32imm:$SPR),
+ "mtdcr $SPR, $RT", IIC_SprMTSPR>, Requires<[IsPPC4xx]>;
+
+def HRFID : XLForm_1_np<19, 274, (outs), (ins), "hrfid", IIC_BrB, []>;
+def NAP : XLForm_1_np<19, 434, (outs), (ins), "nap", IIC_BrB, []>;
+
+def ATTN : XForm_attn<0, 256, (outs), (ins), "attn", IIC_BrB>;
+
+def LBZCIX : XForm_base_r3xo<31, 853, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+ "lbzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LHZCIX : XForm_base_r3xo<31, 821, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+ "lhzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LWZCIX : XForm_base_r3xo<31, 789, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+ "lwzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LDCIX : XForm_base_r3xo<31, 885, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+ "ldcix $RST, $A, $B", IIC_LdStLoad, []>;
+
+def STBCIX : XForm_base_r3xo<31, 981, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+ "stbcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STHCIX : XForm_base_r3xo<31, 949, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+ "sthcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+ "stwcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+ "stdcix $RST, $A, $B", IIC_LdStLoad, []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Assembler Instruction Aliases
+//
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class PPCAsmPseudo<string asm, dag iops> // asm: assembly string; iops: input operand dag
+ : Instruction {
+ let Namespace = "PPC";
+ bit PPC64 = 0; // Default value, override with isPPC64
+
+ let OutOperandList = (outs); // no outputs: every operand is supplied through iops
+ let InOperandList = iops;
+ let Pattern = []; // no selection pattern is attached
+ let AsmString = asm;
+ let isAsmParserOnly = 1; // marked assembler-parser-only
+ let isPseudo = 1; // marked as a pseudo-instruction
+}
+
+def : InstAlias<"sc", (SC 0)>;
+
+def : InstAlias<"sync", (SYNC 0)>, Requires<[HasSYNC]>;
+def : InstAlias<"msync", (SYNC 0), 0>, Requires<[HasSYNC]>;
+def : InstAlias<"lwsync", (SYNC 1)>, Requires<[HasSYNC]>;
+def : InstAlias<"ptesync", (SYNC 2)>, Requires<[HasSYNC]>;
+
+def : InstAlias<"wait", (WAIT 0)>;
+def : InstAlias<"waitrsv", (WAIT 1)>;
+def : InstAlias<"waitimpl", (WAIT 2)>;
+
+def : InstAlias<"mbar", (MBAR 0)>, Requires<[IsBookE]>;
+
+def DCBTx : PPCAsmPseudo<"dcbt $dst", (ins memrr:$dst)>;
+def DCBTSTx : PPCAsmPseudo<"dcbtst $dst", (ins memrr:$dst)>;
+
+def DCBTCT : PPCAsmPseudo<"dcbtct $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
+def DCBTDS : PPCAsmPseudo<"dcbtds $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
+def DCBTT : PPCAsmPseudo<"dcbtt $dst", (ins memrr:$dst)>;
+
+def DCBTSTCT : PPCAsmPseudo<"dcbtstct $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
+def DCBTSTDS : PPCAsmPseudo<"dcbtstds $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
+def DCBTSTT : PPCAsmPseudo<"dcbtstt $dst", (ins memrr:$dst)>;
+
+def DCBFx : PPCAsmPseudo<"dcbf $dst", (ins memrr:$dst)>;
+def DCBFL : PPCAsmPseudo<"dcbfl $dst", (ins memrr:$dst)>;
+def DCBFLP : PPCAsmPseudo<"dcbflp $dst", (ins memrr:$dst)>;
+
+def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
+def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
+def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
+def : InstAlias<"crnot $bx, $by", (CRNOR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
+
+def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
+def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;
+
+def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>;
+def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>;
+
+def : InstAlias<"mtdscr $Rx", (MTSPR 17, gprc:$Rx)>;
+def : InstAlias<"mfdscr $Rx", (MFSPR gprc:$Rx, 17)>;
+
+def : InstAlias<"mtdsisr $Rx", (MTSPR 18, gprc:$Rx)>;
+def : InstAlias<"mfdsisr $Rx", (MFSPR gprc:$Rx, 18)>;
+
+def : InstAlias<"mtdar $Rx", (MTSPR 19, gprc:$Rx)>;
+def : InstAlias<"mfdar $Rx", (MFSPR gprc:$Rx, 19)>;
+
+def : InstAlias<"mtdec $Rx", (MTSPR 22, gprc:$Rx)>;
+def : InstAlias<"mfdec $Rx", (MFSPR gprc:$Rx, 22)>;
+
+def : InstAlias<"mtsdr1 $Rx", (MTSPR 25, gprc:$Rx)>;
+def : InstAlias<"mfsdr1 $Rx", (MFSPR gprc:$Rx, 25)>;
+
+def : InstAlias<"mtsrr0 $Rx", (MTSPR 26, gprc:$Rx)>;
+def : InstAlias<"mfsrr0 $Rx", (MFSPR gprc:$Rx, 26)>;
+
+def : InstAlias<"mtsrr1 $Rx", (MTSPR 27, gprc:$Rx)>;
+def : InstAlias<"mfsrr1 $Rx", (MFSPR gprc:$Rx, 27)>;
+
+def : InstAlias<"mtsrr2 $Rx", (MTSPR 990, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfsrr2 $Rx", (MFSPR gprc:$Rx, 990)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtsrr3 $Rx", (MTSPR 991, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfsrr3 $Rx", (MFSPR gprc:$Rx, 991)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtcfar $Rx", (MTSPR 28, gprc:$Rx)>;
+def : InstAlias<"mfcfar $Rx", (MFSPR gprc:$Rx, 28)>;
+
+def : InstAlias<"mtamr $Rx", (MTSPR 29, gprc:$Rx)>;
+def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;
+
+def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>;
+def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>;
+
+def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>;
+def : InstAlias<"mftbl $Rx", (MFTB gprc:$Rx, 268)>;
+def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>;
+
+def : InstAlias<"mttbl $Rx", (MTSPR 284, gprc:$Rx)>;
+def : InstAlias<"mttbu $Rx", (MTSPR 285, gprc:$Rx)>;
+
+def : InstAlias<"mftblo $Rx", (MFSPR gprc:$Rx, 989)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mttblo $Rx", (MTSPR 989, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mftbhi $Rx", (MFSPR gprc:$Rx, 988)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mttbhi $Rx", (MTSPR 988, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"xnop", (XORI R0, R0, 0)>;
+
+def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"mr. $rA, $rB", (OR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"not. $rA, $rB", (NOR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>;
+
+foreach BATR = 0-3 in { // SPR number = base + 2*BATR (BATR is added twice); all aliases require IsPPC6xx.
+ def : InstAlias<"mtdbatu "#BATR#", $Rx",
+ (MTSPR !add(BATR, !add(BATR, 536)), gprc:$Rx)>, // dbatu base: 536
+ Requires<[IsPPC6xx]>;
+ def : InstAlias<"mfdbatu $Rx, "#BATR,
+ (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 536)))>,
+ Requires<[IsPPC6xx]>;
+ def : InstAlias<"mtdbatl "#BATR#", $Rx",
+ (MTSPR !add(BATR, !add(BATR, 537)), gprc:$Rx)>, // dbatl base: 537
+ Requires<[IsPPC6xx]>;
+ def : InstAlias<"mfdbatl $Rx, "#BATR,
+ (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 537)))>,
+ Requires<[IsPPC6xx]>;
+ def : InstAlias<"mtibatu "#BATR#", $Rx",
+ (MTSPR !add(BATR, !add(BATR, 528)), gprc:$Rx)>, // ibatu base: 528
+ Requires<[IsPPC6xx]>;
+ def : InstAlias<"mfibatu $Rx, "#BATR,
+ (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 528)))>,
+ Requires<[IsPPC6xx]>;
+ def : InstAlias<"mtibatl "#BATR#", $Rx",
+ (MTSPR !add(BATR, !add(BATR, 529)), gprc:$Rx)>, // ibatl base: 529
+ Requires<[IsPPC6xx]>;
+ def : InstAlias<"mfibatl $Rx, "#BATR,
+ (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 529)))>,
+ Requires<[IsPPC6xx]>;
+}
+
+foreach BR = 0-7 in { // mfbrN/mtbrN aliases: DCR number = 0x80 + BR; both require IsPPC4xx.
+ def : InstAlias<"mfbr"#BR#" $Rx",
+ (MFDCR gprc:$Rx, !add(BR, 0x80))>,
+ Requires<[IsPPC4xx]>;
+ def : InstAlias<"mtbr"#BR#" $Rx",
+ (MTDCR gprc:$Rx, !add(BR, 0x80))>, // register first, matching MTDCR's (ins gprc:$RT, i32imm:$SPR)
+ Requires<[IsPPC4xx]>;
+}
+
+def : InstAlias<"mtdccr $Rx", (MTSPR 1018, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfdccr $Rx", (MFSPR gprc:$Rx, 1018)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mticcr $Rx", (MTSPR 1019, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtdear $Rx", (MTSPR 981, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfdear $Rx", (MFSPR gprc:$Rx, 981)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtesr $Rx", (MTSPR 980, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfesr $Rx", (MFSPR gprc:$Rx, 980)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mfspefscr $Rx", (MFSPR gprc:$Rx, 512)>;
+def : InstAlias<"mtspefscr $Rx", (MTSPR 512, gprc:$Rx)>;
+
+def : InstAlias<"mttcr $Rx", (MTSPR 986, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mftcr $Rx", (MFSPR gprc:$Rx, 986)>, Requires<[IsPPC4xx]>;
+
+def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>;
+
+def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm",
+ (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm",
+ (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm",
+ (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBICo : PPCAsmPseudo<"subic. $rA, $rB, $imm",
+ (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+
+def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+
+def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
+def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
+
+def : InstAlias<"mfasr $RT", (MFSPR gprc:$RT, 280)>;
+def : InstAlias<"mtasr $RT", (MTSPR 280, gprc:$RT)>;
+
+foreach SPRG = 0-3 in { // SPR number = 272 + SPRG; each alias has a "mXsprg N" and a "mXsprgN" spelling.
+ def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 272))>;
+ def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 272))>;
+ def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
+ def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
+}
+foreach SPRG = 4-7 in { // SPR number = 256 + SPRG (260-263); these aliases additionally require IsBookE.
+ def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>,
+ Requires<[IsBookE]>;
+ def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 256))>,
+ Requires<[IsBookE]>;
+ def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
+ Requires<[IsBookE]>;
+ def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
+ Requires<[IsBookE]>;
+}
+
+def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;
+
+def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>;
+def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>;
+
+def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;
+
+def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>;
+def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>;
+
+def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>;
+def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>;
+def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>;
+def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>;
+
+def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
+
+def : InstAlias<"tlbrehi $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 0)>,
+ Requires<[IsPPC4xx]>;
+def : InstAlias<"tlbrelo $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 1)>,
+ Requires<[IsPPC4xx]>;
+def : InstAlias<"tlbwehi $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 0)>,
+ Requires<[IsPPC4xx]>;
+def : InstAlias<"tlbwelo $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 1)>,
+ Requires<[IsPPC4xx]>;
+
+def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTRWI : PPCAsmPseudo<"extrwi $rA, $rS, $n, $b",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTRWIo : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSLWI : PPCAsmPseudo<"inslwi $rA, $rS, $n, $b",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSLWIo : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSRWI : PPCAsmPseudo<"insrwi $rA, $rS, $n, $b",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSRWIo : PPCAsmPseudo<"insrwi. $rA, $rS, $n, $b",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def ROTRWI : PPCAsmPseudo<"rotrwi $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def ROTRWIo : PPCAsmPseudo<"rotrwi. $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SLWI : PPCAsmPseudo<"slwi $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SLWIo : PPCAsmPseudo<"slwi. $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SRWI : PPCAsmPseudo<"srwi $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SRWIo : PPCAsmPseudo<"srwi. $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRRWI : PPCAsmPseudo<"clrrwi $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRRWIo : PPCAsmPseudo<"clrrwi. $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRLSLWI : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
+def CLRLSLWIo : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
+
+def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
+def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
+def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
+def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
+
+def : InstAlias<"cntlzw $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>;
+def : InstAlias<"cntlzw. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>;
+// The POWER variant
+def : MnemonicAlias<"cntlz", "cntlzw">;
+def : MnemonicAlias<"cntlz.", "cntlzw.">;
+
+def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTRDI : PPCAsmPseudo<"extrdi $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTRDIo : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def INSRDI : PPCAsmPseudo<"insrdi $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def INSRDIo : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def ROTRDI : PPCAsmPseudo<"rotrdi $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def ROTRDIo : PPCAsmPseudo<"rotrdi. $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SLDI : PPCAsmPseudo<"sldi $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SLDIo : PPCAsmPseudo<"sldi. $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SRDI : PPCAsmPseudo<"srdi $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SRDIo : PPCAsmPseudo<"srdi. $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRRDI : PPCAsmPseudo<"clrrdi $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRRDIo : PPCAsmPseudo<"clrrdi. $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
+def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
+
+def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
+def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
+def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
+def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
+def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+
+def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWINMobm : PPCAsmPseudo<"rlwinm. $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWIMIbm : PPCAsmPseudo<"rlwimi $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWIMIobm : PPCAsmPseudo<"rlwimi. $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWNMbm : PPCAsmPseudo<"rlwnm $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWNMobm : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b",
+ (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+
+// These generic branch instruction forms are used for the assembler parser only.
+// Defs and Uses are conservative, since we don't know the BO value.
+let PPC970_Unit = 7 in {
+ let Defs = [CTR], Uses = [CTR, RM] in { // bc / bca: condbrtarget vs abscondbrtarget destination operand
+ def gBC : BForm_3<16, 0, 0, (outs),
+ (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
+ "bc $bo, $bi, $dst">;
+ def gBCA : BForm_3<16, 1, 0, (outs),
+ (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
+ "bca $bo, $bi, $dst">;
+ let isAsmParserOnly = 1 in { // variants with an extra atimm:$at operand appended to the mnemonic
+ def gBCat : BForm_3_at<16, 0, 0, (outs),
+ (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
+ condbrtarget:$dst),
+ "bc$at $bo, $bi, $dst">;
+ def gBCAat : BForm_3_at<16, 1, 0, (outs),
+ (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
+ abscondbrtarget:$dst),
+ "bca$at $bo, $bi, $dst">;
+ } // isAsmParserOnly = 1
+ }
+ let Defs = [LR, CTR], Uses = [CTR, RM] in { // bcl / bcla: same shapes as above, but LR is also defined
+ def gBCL : BForm_3<16, 0, 1, (outs),
+ (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
+ "bcl $bo, $bi, $dst">;
+ def gBCLA : BForm_3<16, 1, 1, (outs),
+ (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
+ "bcla $bo, $bi, $dst">;
+ let isAsmParserOnly = 1 in {
+ def gBCLat : BForm_3_at<16, 0, 1, (outs),
+ (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
+ condbrtarget:$dst),
+ "bcl$at $bo, $bi, $dst">;
+ def gBCLAat : BForm_3_at<16, 1, 1, (outs),
+ (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
+ abscondbrtarget:$dst),
+ "bcla$at $bo, $bi, $dst">;
+ } // isAsmParserOnly = 1
+ }
+ let Defs = [CTR], Uses = [CTR, LR, RM] in // bclr: takes a $bh operand instead of a destination
+ def gBCLR : XLForm_2<19, 16, 0, (outs),
+ (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+ "bclr $bo, $bi, $bh", IIC_BrB, []>;
+ let Defs = [LR, CTR], Uses = [CTR, LR, RM] in // bclrl: as bclr, with LR also defined
+ def gBCLRL : XLForm_2<19, 16, 1, (outs),
+ (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+ "bclrl $bo, $bi, $bh", IIC_BrB, []>;
+ let Defs = [CTR], Uses = [CTR, LR, RM] in // bcctr
+ def gBCCTR : XLForm_2<19, 528, 0, (outs),
+ (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+ "bcctr $bo, $bi, $bh", IIC_BrB, []>;
+ let Defs = [LR, CTR], Uses = [CTR, LR, RM] in // bcctrl: as bcctr, with LR also defined
+ def gBCCTRL : XLForm_2<19, 528, 1, (outs),
+ (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+ "bcctrl $bo, $bi, $bh", IIC_BrB, []>;
+}
+
+multiclass BranchSimpleMnemonicAT<string pm, int at> { // pm: mnemonic suffix; at: literal bound to the atimm operand
+ def : InstAlias<"bc"#pm#" $bo, $bi, $dst", (gBCat u5imm:$bo, at, crbitrc:$bi,
+ condbrtarget:$dst)>;
+ def : InstAlias<"bca"#pm#" $bo, $bi, $dst", (gBCAat u5imm:$bo, at, crbitrc:$bi,
+ condbrtarget:$dst)>; // NOTE(review): gBCAat declares abscondbrtarget:$dst - confirm condbrtarget is intended here
+ def : InstAlias<"bcl"#pm#" $bo, $bi, $dst", (gBCLat u5imm:$bo, at, crbitrc:$bi,
+ condbrtarget:$dst)>;
+ def : InstAlias<"bcla"#pm#" $bo, $bi, $dst", (gBCLAat u5imm:$bo, at, crbitrc:$bi,
+ condbrtarget:$dst)>; // NOTE(review): gBCLAat declares abscondbrtarget:$dst - confirm condbrtarget is intended here
+}
+defm : BranchSimpleMnemonicAT<"+", 3>; // "+" suffix binds at = 3
+defm : BranchSimpleMnemonicAT<"-", 2>; // "-" suffix binds at = 2
+
+def : InstAlias<"bclr $bo, $bi", (gBCLR u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bclrl $bo, $bi", (gBCLRL u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bcctr $bo, $bi", (gBCCTR u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bcctrl $bo, $bi", (gBCCTRL u5imm:$bo, crbitrc:$bi, 0)>;
+
+multiclass BranchSimpleMnemonic1<string name, string pm, int bo> { // Emits b<name>{,a,lr,l,la,lrl}<pm> aliases with a fixed bo literal.
+ def : InstAlias<"b"#name#pm#" $bi, $dst", (gBC bo, crbitrc:$bi, condbrtarget:$dst)>;
+ def : InstAlias<"b"#name#"a"#pm#" $bi, $dst", (gBCA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
+ def : InstAlias<"b"#name#"lr"#pm#" $bi", (gBCLR bo, crbitrc:$bi, 0)>; // last operand ($bh) fixed to 0
+ def : InstAlias<"b"#name#"l"#pm#" $bi, $dst", (gBCL bo, crbitrc:$bi, condbrtarget:$dst)>;
+ def : InstAlias<"b"#name#"la"#pm#" $bi, $dst", (gBCLA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
+ def : InstAlias<"b"#name#"lrl"#pm#" $bi", (gBCLRL bo, crbitrc:$bi, 0)>; // last operand ($bh) fixed to 0
+}
+multiclass BranchSimpleMnemonic2<string name, string pm, int bo> // Everything from BranchSimpleMnemonic1 plus the ctr/ctrl forms.
+ : BranchSimpleMnemonic1<name, pm, bo> {
+ def : InstAlias<"b"#name#"ctr"#pm#" $bi", (gBCCTR bo, crbitrc:$bi, 0)>; // last operand ($bh) fixed to 0
+ def : InstAlias<"b"#name#"ctrl"#pm#" $bi", (gBCCTRL bo, crbitrc:$bi, 0)>; // last operand ($bh) fixed to 0
+}
+defm : BranchSimpleMnemonic2<"t", "", 12>; // Relative to the unsuffixed bo, the "-" form adds 2 and the "+" form adds 3.
+defm : BranchSimpleMnemonic2<"f", "", 4>;
+defm : BranchSimpleMnemonic2<"t", "-", 14>;
+defm : BranchSimpleMnemonic2<"f", "-", 6>;
+defm : BranchSimpleMnemonic2<"t", "+", 15>;
+defm : BranchSimpleMnemonic2<"f", "+", 7>;
+defm : BranchSimpleMnemonic1<"dnzt", "", 8>; // The dnz/dz forms use BranchSimpleMnemonic1, so they get no ctr/ctrl aliases.
+defm : BranchSimpleMnemonic1<"dnzf", "", 0>;
+defm : BranchSimpleMnemonic1<"dzt", "", 10>;
+defm : BranchSimpleMnemonic1<"dzf", "", 2>;
+
+multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> { // Each form comes in two spellings: explicit $cc, and $cc omitted (CR0 is used).
+ def : InstAlias<"b"#name#pm#" $cc, $dst",
+ (BCC bibo, crrc:$cc, condbrtarget:$dst)>;
+ def : InstAlias<"b"#name#pm#" $dst",
+ (BCC bibo, CR0, condbrtarget:$dst)>;
+
+ def : InstAlias<"b"#name#"a"#pm#" $cc, $dst",
+ (BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+ def : InstAlias<"b"#name#"a"#pm#" $dst",
+ (BCCA bibo, CR0, abscondbrtarget:$dst)>;
+
+ def : InstAlias<"b"#name#"lr"#pm#" $cc",
+ (BCCLR bibo, crrc:$cc)>;
+ def : InstAlias<"b"#name#"lr"#pm,
+ (BCCLR bibo, CR0)>;
+
+ def : InstAlias<"b"#name#"ctr"#pm#" $cc",
+ (BCCCTR bibo, crrc:$cc)>;
+ def : InstAlias<"b"#name#"ctr"#pm,
+ (BCCCTR bibo, CR0)>;
+
+ def : InstAlias<"b"#name#"l"#pm#" $cc, $dst",
+ (BCCL bibo, crrc:$cc, condbrtarget:$dst)>;
+ def : InstAlias<"b"#name#"l"#pm#" $dst",
+ (BCCL bibo, CR0, condbrtarget:$dst)>;
+
+ def : InstAlias<"b"#name#"la"#pm#" $cc, $dst",
+ (BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+ def : InstAlias<"b"#name#"la"#pm#" $dst",
+ (BCCLA bibo, CR0, abscondbrtarget:$dst)>;
+
+ def : InstAlias<"b"#name#"lrl"#pm#" $cc",
+ (BCCLRL bibo, crrc:$cc)>;
+ def : InstAlias<"b"#name#"lrl"#pm,
+ (BCCLRL bibo, CR0)>;
+
+ def : InstAlias<"b"#name#"ctrl"#pm#" $cc",
+ (BCCCTRL bibo, crrc:$cc)>;
+ def : InstAlias<"b"#name#"ctrl"#pm,
+ (BCCCTRL bibo, CR0)>;
+}
+multiclass BranchExtendedMnemonic<string name, int bibo> { // Instantiates the unsuffixed, "-" and "+" alias families for one condition.
+ defm : BranchExtendedMnemonicPM<name, "", bibo>;
+ defm : BranchExtendedMnemonicPM<name, "-", !add(bibo, 2)>; // "-" suffix: bibo + 2
+ defm : BranchExtendedMnemonicPM<name, "+", !add(bibo, 3)>; // "+" suffix: bibo + 3
+}
+defm : BranchExtendedMnemonic<"lt", 12>;
+defm : BranchExtendedMnemonic<"gt", 44>;
+defm : BranchExtendedMnemonic<"eq", 76>;
+defm : BranchExtendedMnemonic<"un", 108>; // "un" and "so" share the value 108
+defm : BranchExtendedMnemonic<"so", 108>;
+defm : BranchExtendedMnemonic<"ge", 4>; // "ge" and "nl" share the value 4
+defm : BranchExtendedMnemonic<"nl", 4>;
+defm : BranchExtendedMnemonic<"le", 36>; // "le" and "ng" share the value 36
+defm : BranchExtendedMnemonic<"ng", 36>;
+defm : BranchExtendedMnemonic<"ne", 68>;
+defm : BranchExtendedMnemonic<"nu", 100>; // "nu" and "ns" share the value 100
+defm : BranchExtendedMnemonic<"ns", 100>;
+
+def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm64:$imm)>;
+def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm64:$imm)>;
+def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;
+
+def : InstAlias<"cmpi $bf, 0, $rA, $imm", (CMPWI crrc:$bf, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmp $bf, 0, $rA, $rB", (CMPW crrc:$bf, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpli $bf, 0, $rA, $imm", (CMPLWI crrc:$bf, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpl $bf, 0, $rA, $rB", (CMPLW crrc:$bf, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm64:$imm)>;
+def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm64:$imm)>;
+def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
+
+multiclass TrapExtendedMnemonic<string name, int to> { // Emits td<name>i, td<name>, tw<name>i and tw<name> with a fixed first operand `to`.
+ def : InstAlias<"td"#name#"i $rA, $imm", (TDI to, g8rc:$rA, s16imm:$imm)>; // td* forms take g8rc registers
+ def : InstAlias<"td"#name#" $rA, $rB", (TD to, g8rc:$rA, g8rc:$rB)>;
+ def : InstAlias<"tw"#name#"i $rA, $imm", (TWI to, gprc:$rA, s16imm:$imm)>; // tw* forms take gprc registers
+ def : InstAlias<"tw"#name#" $rA, $rB", (TW to, gprc:$rA, gprc:$rB)>;
+}
+defm : TrapExtendedMnemonic<"lt", 16>;
+defm : TrapExtendedMnemonic<"le", 20>;
+defm : TrapExtendedMnemonic<"eq", 4>;
+defm : TrapExtendedMnemonic<"ge", 12>;
+defm : TrapExtendedMnemonic<"gt", 8>;
+defm : TrapExtendedMnemonic<"nl", 12>; // same value as "ge"
+defm : TrapExtendedMnemonic<"ne", 24>;
+defm : TrapExtendedMnemonic<"ng", 20>; // same value as "le"
+defm : TrapExtendedMnemonic<"llt", 2>;
+defm : TrapExtendedMnemonic<"lle", 6>;
+defm : TrapExtendedMnemonic<"lge", 5>;
+defm : TrapExtendedMnemonic<"lgt", 1>;
+defm : TrapExtendedMnemonic<"lnl", 5>; // same value as "lge"
+defm : TrapExtendedMnemonic<"lng", 6>; // same value as "lle"
+defm : TrapExtendedMnemonic<"u", 31>;
+
+// Atomic loads: 8/16/32-bit atomic loads select the LBZ/LHZ/LWZ loads.
+def : Pat<(atomic_load_8 iaddr:$src), (LBZ memri:$src)>; // iaddr addresses use the memri forms
+def : Pat<(atomic_load_16 iaddr:$src), (LHZ memri:$src)>;
+def : Pat<(atomic_load_32 iaddr:$src), (LWZ memri:$src)>;
+def : Pat<(atomic_load_8 xaddr:$src), (LBZX memrr:$src)>; // xaddr addresses use the memrr (X-suffixed) forms
+def : Pat<(atomic_load_16 xaddr:$src), (LHZX memrr:$src)>;
+def : Pat<(atomic_load_32 xaddr:$src), (LWZX memrr:$src)>;
+
+// Atomic stores: 8/16/32-bit atomic stores of an i32 value select the STB/STH/STW stores.
+def : Pat<(atomic_store_8 iaddr:$ptr, i32:$val), (STB gprc:$val, memri:$ptr)>; // pattern is (ptr, val); the instruction takes (val, ptr)
+def : Pat<(atomic_store_16 iaddr:$ptr, i32:$val), (STH gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_32 iaddr:$ptr, i32:$val), (STW gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_8 xaddr:$ptr, i32:$val), (STBX gprc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_16 xaddr:$ptr, i32:$val), (STHX gprc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_32 xaddr:$ptr, i32:$val), (STWX gprc:$val, memrr:$ptr)>;
+
+let Predicates = [IsISA3_0] in {
+
+// Copy-Paste Facility
+// COPY is prefixed with 'CP' because of a name conflict in Target.td; PASTE
+// gets the same prefix for naming consistency.
+let mayLoad = 1 in
+def CP_COPY : X_L1_RA5_RB5<31, 774, "copy" , gprc, IIC_LdStCOPY, []>;
+
+let mayStore = 1 in
+def CP_PASTE : X_L1_RA5_RB5<31, 902, "paste" , gprc, IIC_LdStPASTE, []>;
+
+let mayStore = 1, Defs = [CR0] in // dot form: same opcode values as CP_PASTE, also defines CR0
+def CP_PASTEo : X_L1_RA5_RB5<31, 902, "paste.", gprc, IIC_LdStPASTE, []>, isDOT;
+
+def CP_COPYx : PPCAsmPseudo<"copy $rA, $rB" , (ins gprc:$rA, gprc:$rB)>; // two-register assembler pseudos
+def CP_PASTEx : PPCAsmPseudo<"paste $rA, $rB", (ins gprc:$rA, gprc:$rB)>;
+def CP_COPY_FIRST : PPCAsmPseudo<"copy_first $rA, $rB",
+ (ins gprc:$rA, gprc:$rB)>;
+def CP_PASTE_LAST : PPCAsmPseudo<"paste_last $rA, $rB",
+ (ins gprc:$rA, gprc:$rB)>;
+def CP_ABORT : XForm_0<31, 838, (outs), (ins), "cp_abort", IIC_SprABORT, []>;
+
+// Message Synchronize
+def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>;
+
+// Power-Saving Mode Instruction:
+def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>;
+
+} // IsISA3_0
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td
new file mode 100644
index 000000000000..4940c77c7ae5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td
@@ -0,0 +1,1216 @@
+//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the QPX extension to the PowerPC instruction set.
+// Reference:
+// Book Q: QPX Architecture Definition. IBM (as updated in) 2011.
+//
+//===----------------------------------------------------------------------===//
+
+// Assembler operand class and register operand for the QFRC register class.
+// The parser accepts the operand when isRegNumber holds.
+def PPCRegQFRCAsmOperand : AsmOperandClass {
+ let Name = "RegQFRC";
+ let PredicateMethod = "isRegNumber";
+}
+def qfrc : RegisterOperand<QFRC> {
+ let ParserMatchClass = PPCRegQFRCAsmOperand;
+}
+
+// Same pairing for the QSRC register class.
+def PPCRegQSRCAsmOperand : AsmOperandClass {
+ let Name = "RegQSRC";
+ let PredicateMethod = "isRegNumber";
+}
+def qsrc : RegisterOperand<QSRC> {
+ let ParserMatchClass = PPCRegQSRCAsmOperand;
+}
+
+// Same pairing for the QBRC register class.
+def PPCRegQBRCAsmOperand : AsmOperandClass {
+ let Name = "RegQBRC";
+ let PredicateMethod = "isRegNumber";
+}
+def qbrc : RegisterOperand<QBRC> {
+ let ParserMatchClass = PPCRegQBRCAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// Each helper below wraps one instruction format, fixes all operands to the
+// qfrc operand / v4f64 type, builds the asm string from the mnemonic `opc`,
+// and attaches a pattern that matches the intrinsic `IntID`.
+//
+// QPXA1_Int - An AForm_1 intrinsic definition.
+// The asm string prints $FRC before $FRB, whereas the (ins) list and the
+// intrinsic operands are ordered $FRA, $FRB, $FRC.
+class QPXA1_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+ : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_FPFused,
+ [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+// QPXA1s_Int - An AForm_1 intrinsic definition (simple instructions).
+// Identical to QPXA1_Int except that the itinerary is IIC_VecPerm.
+class QPXA1s_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+ : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_VecPerm,
+ [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+// QPXA2_Int - An AForm_2 intrinsic definition (operands $FRA, $FRB).
+class QPXA2_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+ : AForm_2<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+ !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPGeneral,
+ [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
+// QPXA3_Int - An AForm_3 intrinsic definition (operands $FRA, $FRC).
+class QPXA3_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+ : AForm_3<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC),
+ !strconcat(opc, " $FRT, $FRA, $FRC"), IIC_FPGeneral,
+ [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRC))]>;
+// QPXA4_Int - An AForm_4a intrinsic definition (single operand $FRB).
+class QPXA4_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+ : AForm_4a<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
+ !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
+ [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
+// QPXX18_Int - An XForm_18 intrinsic definition (operands $FRA, $FRB; uses
+// the IIC_FPCompare itinerary and a 10-bit extended opcode).
+class QPXX18_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
+ : XForm_18<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+ !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPCompare,
+ [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
+// QPXX19_Int - An XForm_19 intrinsic definition (single operand $FRB; 10-bit
+// extended opcode).
+class QPXX19_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
+ : XForm_19<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
+ !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
+ [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
+
+//===----------------------------------------------------------------------===//
+// Pattern Frags.
+
+// Extending load whose in-memory type is v4f32.
+def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+ const LoadSDNode *Load = cast<LoadSDNode>(N);
+ return Load->getMemoryVT() == MVT::v4f32;
+}]>;
+
+// Truncating store whose in-memory type is v4f32.
+def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr), [{
+ const StoreSDNode *Store = cast<StoreSDNode>(N);
+ return Store->getMemoryVT() == MVT::v4f32;
+}]>;
+// Pre-indexed truncating store whose in-memory type is v4f32.
+def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset),
+ (pre_truncst node:$val,
+ node:$base, node:$offset), [{
+ const StoreSDNode *Store = cast<StoreSDNode>(N);
+ return Store->getMemoryVT() == MVT::v4f32;
+}]>;
+
+// fpround node whose constant second operand is 0.
+// NOTE(review): presumably this is the FP_ROUND "value-preserving" flag, so 0
+// means the rounding may change the value -- confirm against ISDOpcodes.h.
+def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{
+ const ConstantSDNode *Flag = cast<ConstantSDNode>(N->getOperand(1));
+ return Flag->getZExtValue() == 0;
+}]>;
+
+// fpround node whose constant second operand is 1.
+def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{
+ const ConstantSDNode *Flag = cast<ConstantSDNode>(N->getOperand(1));
+ return Flag->getZExtValue() == 1;
+}]>;
+
+// u12 - i32 immediate leaf accepted only when masking with 0xFFF leaves the
+// value unchanged, i.e. the value lies in [0, 4095].
+let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs.
+ def u12 : ImmLeaf<i32, [{ return (Imm & 0xFFF) == Imm; }]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+// Selection predicate: true when the subtarget reports QPX support.
+def HasQPX : Predicate<"PPCSubTarget->hasQPX()">;
+let Predicates = [HasQPX] in {
+let DecoderNamespace = "QPX" in {
+let hasSideEffects = 0 in { // QPX instructions don't have side effects.
+let Uses = [RM] in {
+ // Add Instructions
+ // Naming used by this group: the unsuffixed def is the v4f64 form (primary
+ // opcode 4); the "S" def is the single-precision encoding (primary opcode 0)
+ // matched from its intrinsic on v4f64/qfrc and marked isCodeGenOnly; the "Ss"
+ // def is the same encoding on qsrc registers, matched from the generic v4f32
+ // node.
+ let isCommutable = 1 in {
+ def QVFADD : AForm_2<4, 21,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+ "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral,
+ [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>;
+ def QVFADDSs : AForm_2<0, 21,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+ "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral,
+ [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>;
+ }
+ // Subtract: same three-variant scheme as add, but not commutable.
+ def QVFSUB : AForm_2<4, 20,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+ "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral,
+ [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>;
+ def QVFSUBSs : AForm_2<0, 20,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+ "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral,
+ [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>;
+
+ // Estimate Instructions
+ // Reciprocal estimate (PPCfre) and reciprocal square-root estimate
+ // (PPCfrsqrte). Unlike the add/sub group, here the intrinsic-matching "S"
+ // def is left visible to the assembler and the qsrc/v4f32 "Ss" def is the
+ // one marked isCodeGenOnly.
+ def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfre $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>;
+ def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>;
+ let isCodeGenOnly = 1 in
+ def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfres $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>;
+
+ def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfrsqrte $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>;
+ def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>;
+ let isCodeGenOnly = 1 in
+ def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>;
+
+ // Multiply Instructions
+ // The multiply forms take their second source in $FRC (AForm_3), not $FRB.
+ // QVFMUL matches v4f64 fmul, QVFMULS matches the intrinsic (isCodeGenOnly),
+ // QVFMULSs matches v4f32 fmul on qsrc registers.
+ let isCommutable = 1 in {
+ def QVFMUL : AForm_3<4, 25,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC),
+ "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral,
+ [(set v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>;
+ let isCodeGenOnly = 1 in
+ def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>;
+ def QVFMULSs : AForm_3<0, 25,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC),
+ "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral,
+ [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>;
+ }
+ // qvfxmul / qvfxmuls are only reachable through their intrinsics and are not
+ // marked commutable.
+ def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>;
+ def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>;
+
+ // Multiply-add instructions
+ // All forms are AForm_1 with (ins $FRA, $FRB, $FRC) but print and compute as
+ // "$FRT, $FRA, $FRC, $FRB": the product is $FRA * $FRC and $FRB is the
+ // addend. Selected DAG shapes:
+ //   madd  = fma(A, C, B)          nmadd = fneg(fma(A, C, B))
+ //   msub  = fma(A, C, fneg(B))    nmsub = fneg(fma(A, C, fneg(B)))
+ // For each operation: the unsuffixed def is v4f64 (opcode 4), the "S" def
+ // matches the single-precision intrinsic (isCodeGenOnly), and the "Ss" def
+ // matches the generic v4f32 DAG on qsrc registers.
+ def QVFMADD : AForm_1<4, 29,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>;
+ def QVFMADDSs : AForm_1<0, 29,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+ "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>;
+ def QVFNMADD : AForm_1<4, 31,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
+ v4f64:$FRB)))]>;
+ let isCodeGenOnly = 1 in
+ def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>;
+ def QVFNMADDSs : AForm_1<0, 31,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+ "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
+ v4f32:$FRB)))]>;
+ def QVFMSUB : AForm_1<4, 28,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC,
+ (fneg v4f64:$FRB)))]>;
+ let isCodeGenOnly = 1 in
+ def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>;
+ def QVFMSUBSs : AForm_1<0, 28,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+ "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC,
+ (fneg v4f32:$FRB)))]>;
+ def QVFNMSUB : AForm_1<4, 30,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
+ (fneg v4f64:$FRB))))]>;
+ let isCodeGenOnly = 1 in
+ def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>;
+ def QVFNMSUBSs : AForm_1<0, 30,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+ "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
+ (fneg v4f32:$FRB))))]>;
+ // The remaining multiply-add variants are only reachable through their
+ // intrinsics; opcode 4 is the v4f64 form and opcode 0 the "s" form.
+ def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>;
+ def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>;
+ def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>;
+ def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>;
+ def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>;
+ def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>;
+ def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>;
+ def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>;
+
+ // Select Instruction
+ // Four defs share the qvfsel encoding (4, 23) and differ only in register
+ // classes. QVFSEL matches the intrinsic with every operand in qfrc. The "b"
+ // defs take the condition $FRA in qbrc (v4i1) and select vselect: the result
+ // is $FRC where the condition is true and $FRB otherwise, for v4f64 (b),
+ // v4f32 (bs) and v4i1 (bb) data. Only QVFSELb is visible to the assembler.
+ let isCodeGenOnly = 1 in
+ def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>;
+ def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT),
+ (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
+ [(set v4f64:$FRT, (vselect v4i1:$FRA,
+ v4f64:$FRC, v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT),
+ (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+ "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
+ [(set v4f32:$FRT, (vselect v4i1:$FRA,
+ v4f32:$FRC, v4f32:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT),
+ (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC),
+ "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
+ [(set v4i1:$FRT, (vselect v4i1:$FRA,
+ v4i1:$FRC, v4i1:$FRB))]>;
+
+ // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
+ // instruction selection into a branch sequence.
+ // These pseudos take a condition register ($cond), the true/false values and
+ // the branch opcode as an immediate ($BROPC); they have no selection pattern
+ // of their own. One pseudo exists per QPX register class.
+ let usesCustomInserter = 1 in {
+ def SELECT_CC_QFRC: Pseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F,
+ i32imm:$BROPC), "#SELECT_CC_QFRC",
+ []>;
+ def SELECT_CC_QSRC: Pseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F,
+ i32imm:$BROPC), "#SELECT_CC_QSRC",
+ []>;
+ def SELECT_CC_QBRC: Pseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F,
+ i32imm:$BROPC), "#SELECT_CC_QBRC",
+ []>;
+
+ // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
+ // register bit directly.
+ // These are matched from a scalar-i1 `select` over v4f64 / v4f32 / v4i1.
+ def SELECT_QFRC: Pseudo<(outs qfrc:$dst), (ins crbitrc:$cond,
+ qfrc:$T, qfrc:$F), "#SELECT_QFRC",
+ [(set v4f64:$dst,
+ (select i1:$cond, v4f64:$T, v4f64:$F))]>;
+ def SELECT_QSRC: Pseudo<(outs qsrc:$dst), (ins crbitrc:$cond,
+ qsrc:$T, qsrc:$F), "#SELECT_QSRC",
+ [(set v4f32:$dst,
+ (select i1:$cond, v4f32:$T, v4f32:$F))]>;
+ def SELECT_QBRC: Pseudo<(outs qbrc:$dst), (ins crbitrc:$cond,
+ qbrc:$T, qbrc:$F), "#SELECT_QBRC",
+ [(set v4i1:$dst,
+ (select i1:$cond, v4i1:$T, v4i1:$F))]>;
+ }
+
+ // Convert and Round Instructions
+ // Every conversion here is matched from its intrinsic via QPXX19_Int. The
+ // "b" defs (QVFCTIDb, QVFCFIDb) reuse the qvfctid / qvfcfid encodings on
+ // qbrc registers, carry no pattern, and are isCodeGenOnly.
+ // NOTE(review): presumably the "b" defs are emitted by custom lowering of
+ // v4i1 values -- confirm against the C++ users.
+ def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>;
+ let isCodeGenOnly = 1 in
+ def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB),
+ "qvfctid $FRT, $FRB", IIC_FPGeneral, []>;
+
+ def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>;
+ def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>;
+ def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>;
+ def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>;
+ def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>;
+ def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>;
+ def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>;
+ def QVFCFID : QPXX19_Int<4, 846, "qvfcfid", int_ppc_qpx_qvfcfid>;
+ let isCodeGenOnly = 1 in
+ def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB),
+ "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>;
+
+ def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>;
+ def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>;
+ def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>;
+
+ // Round to single precision. QVFRSP matches the intrinsic (v4f64 in, v4f64
+ // out) and is isCodeGenOnly; QVFRSPs takes a qfrc source, produces a qsrc
+ // result and is selected for fround_inexact (v4f64 -> v4f32).
+ let isCodeGenOnly = 1 in
+ def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>;
+ def QVFRSPs : XForm_19<4, 12,
+ (outs qsrc:$FRT), (ins qfrc:$FRB),
+ "qvfrsp $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>;
+
+ // Round-to-integral instructions. Each has a v4f64 def and an isCodeGenOnly
+ // "s" def with the same encoding on qsrc/v4f32:
+ //   qvfriz <- ftrunc, qvfrin <- fround, qvfrip <- fceil, qvfrim <- ffloor.
+ def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfriz $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfriz $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>;
+
+ def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfrin $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f64:$FRT, (fround v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfrin $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f32:$FRT, (fround v4f32:$FRB))]>;
+
+ def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfrip $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f64:$FRT, (fceil v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfrip $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f32:$FRT, (fceil v4f32:$FRB))]>;
+
+ def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfrim $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfrim $FRT, $FRB", IIC_FPGeneral,
+ [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>;
+
+ // Move Instructions
+ // Register move. The selection patterns are intentionally left commented
+ // out, so these defs are never chosen by pattern matching. The "s" and "b"
+ // defs are isCodeGenOnly copies of the same encoding for qsrc and qbrc.
+ def QVFMR : XForm_19<4, 72,
+ (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfmr $FRT, $FRB", IIC_VecPerm,
+ [/* (set v4f64:$FRT, v4f64:$FRB) */]>;
+ let isCodeGenOnly = 1 in {
+ def QVFMRs : XForm_19<4, 72,
+ (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfmr $FRT, $FRB", IIC_VecPerm,
+ [/* (set v4f32:$FRT, v4f32:$FRB) */]>;
+ def QVFMRb : XForm_19<4, 72,
+ (outs qbrc:$FRT), (ins qbrc:$FRB),
+ "qvfmr $FRT, $FRB", IIC_VecPerm,
+ [/* (set v4i1:$FRT, v4i1:$FRB) */]>;
+ }
+ // Sign manipulation: qvfneg <- fneg, qvfabs <- fabs, qvfnabs <- fneg(fabs).
+ // Each has a v4f64 def plus an isCodeGenOnly "s" def for v4f32 on qsrc.
+ def QVFNEG : XForm_19<4, 40,
+ (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfneg $FRT, $FRB", IIC_VecPerm,
+ [(set v4f64:$FRT, (fneg v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFNEGs : XForm_19<4, 40,
+ (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfneg $FRT, $FRB", IIC_VecPerm,
+ [(set v4f32:$FRT, (fneg v4f32:$FRB))]>;
+ def QVFABS : XForm_19<4, 264,
+ (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfabs $FRT, $FRB", IIC_VecPerm,
+ [(set v4f64:$FRT, (fabs v4f64:$FRB))]>;
+ let isCodeGenOnly = 1 in
+ def QVFABSs : XForm_19<4, 264,
+ (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfabs $FRT, $FRB", IIC_VecPerm,
+ [(set v4f32:$FRT, (fabs v4f32:$FRB))]>;
+ def QVFNABS : XForm_19<4, 136,
+ (outs qfrc:$FRT), (ins qfrc:$FRB),
+ "qvfnabs $FRT, $FRB", IIC_VecPerm,
+ [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>;
+ let isCodeGenOnly = 1 in
+ def QVFNABSs : XForm_19<4, 136,
+ (outs qsrc:$FRT), (ins qsrc:$FRB),
+ "qvfnabs $FRT, $FRB", IIC_VecPerm,
+ [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>;
+ // Copy sign. The pattern is (fcopysign $FRB, $FRA): $FRB is the first
+ // fcopysign operand and $FRA the second, i.e. the operands are swapped
+ // relative to the instruction's operand order.
+ def QVFCPSGN : XForm_18<4, 8,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+ "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm,
+ [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>;
+ let isCodeGenOnly = 1 in
+ def QVFCPSGNs : XForm_18<4, 8,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+ "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm,
+ [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>;
+
+ // Align-immediate: selected for the PPCqvaligni node with a 2-bit immediate
+ // index. The "s" and "b" defs are isCodeGenOnly copies for v4f32 (qsrc) and
+ // v4i1 (qbrc).
+ def QVALIGNI : Z23Form_1<4, 5,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx),
+ "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
+ [(set v4f64:$FRT,
+ (PPCqvaligni v4f64:$FRA, v4f64:$FRB,
+ (i32 imm:$idx)))]>;
+ let isCodeGenOnly = 1 in
+ def QVALIGNIs : Z23Form_1<4, 5,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx),
+ "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
+ [(set v4f32:$FRT,
+ (PPCqvaligni v4f32:$FRA, v4f32:$FRB,
+ (i32 imm:$idx)))]>;
+ let isCodeGenOnly = 1 in
+ def QVALIGNIb : Z23Form_1<4, 5,
+ (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx),
+ "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
+ [(set v4i1:$FRT,
+ (PPCqvaligni v4i1:$FRA, v4i1:$FRB,
+ (i32 imm:$idx)))]>;
+
+ // Element splat-immediate: selected for the PPCqvesplati node with a 2-bit
+ // immediate index; same v4f64 / v4f32 / v4i1 triple as above.
+ def QVESPLATI : Z23Form_2<4, 37,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx),
+ "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
+ [(set v4f64:$FRT,
+ (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>;
+ let isCodeGenOnly = 1 in
+ def QVESPLATIs : Z23Form_2<4, 37,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx),
+ "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
+ [(set v4f32:$FRT,
+ (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>;
+ let isCodeGenOnly = 1 in
+ def QVESPLATIb : Z23Form_2<4, 37,
+ (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx),
+ "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
+ [(set v4i1:$FRT,
+ (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>;
+
+ // Permute: unlike the multiply-add forms, the asm string prints the operands
+ // in $FRA, $FRB, $FRC order. In the v4f32 def the data operands are qsrc but
+ // the third operand ($FRC) stays a qfrc / v4f64 value.
+ def QVFPERM : AForm_1<4, 6,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm,
+ [(set v4f64:$FRT,
+ (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+ let isCodeGenOnly = 1 in
+ def QVFPERMs : AForm_1<4, 6,
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC),
+ "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm,
+ [(set v4f32:$FRT,
+ (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>;
+
+ // Builds a v4f64 value from a 12-bit immediate (PPCqvgpci node). It depends
+ // only on the immediate, so it is marked rematerializable and as cheap as a
+ // move.
+ let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+ def QVGPCI : Z23Form_3<4, 133,
+ (outs qfrc:$FRT), (ins u12imm:$idx),
+ "qvgpci $FRT, $idx", IIC_VecPerm,
+ [(set v4f64:$FRT, (PPCqvgpci (u12:$idx)))]>;
+
+ // Compare Instruction
+ // Each comparison has three defs sharing one encoding:
+ //   <name>   - matches the intrinsic, v4f64 result in qfrc (isCodeGenOnly);
+ //   <name>b  - v4f64 inputs, v4i1 result in qbrc, matched from setcc; this
+ //              is the def visible to the assembler;
+ //   <name>bs - v4f32 inputs (qsrc), v4i1 result, matched from setcc
+ //              (isCodeGenOnly).
+ // Condition codes: qvftstnan <- SETUO, qvfcmplt <- SETOLT,
+ // qvfcmpgt <- SETOGT, qvfcmpeq <- SETOEQ.
+ let isCodeGenOnly = 1 in
+ def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>;
+ def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+ "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare,
+ [(set v4i1:$FRT,
+ (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>;
+ let isCodeGenOnly = 1 in
+ def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+ "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare,
+ [(set v4i1:$FRT,
+ (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>;
+ let isCodeGenOnly = 1 in
+ def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>;
+ def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+ "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare,
+ [(set v4i1:$FRT,
+ (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>;
+ let isCodeGenOnly = 1 in
+ def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+ "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare,
+ [(set v4i1:$FRT,
+ (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>;
+ let isCodeGenOnly = 1 in
+ def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>;
+ def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+ "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare,
+ [(set v4i1:$FRT,
+ (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>;
+ let isCodeGenOnly = 1 in
+ def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+ "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare,
+ [(set v4i1:$FRT,
+ (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>;
+ let isCodeGenOnly = 1 in
+ def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>;
+ def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+ "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare,
+ [(set v4i1:$FRT,
+ (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>;
+ let isCodeGenOnly = 1 in
+ def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+ "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare,
+ [(set v4i1:$FRT,
+ (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>;
+
+ // Logical instruction: two register sources plus a 12-bit immediate operand
+ // $tttt that selects the function. None of the defs has a pattern.
+ // QVFLOGICALb (qbrc) is the only def visible to the assembler.
+ let isCodeGenOnly = 1 in
+ def QVFLOGICAL : XForm_20<4, 4,
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt),
+ "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
+ def QVFLOGICALb : XForm_20<4, 4,
+ (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt),
+ "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
+ // NOTE(review): the "s" suffix elsewhere in this file denotes qsrc operands,
+ // but this def uses qbrc and is therefore operand-for-operand identical to
+ // QVFLOGICALb. Possibly intended to be qsrc -- confirm before changing.
+ let isCodeGenOnly = 1 in
+ def QVFLOGICALs : XForm_20<4, 4,
+ (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt),
+ "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
+
+ // Load indexed instructions
+ // Conventions used throughout this group:
+ //  * all loads use register+register (memrr) addressing;
+ //  * a def named <X>A is declared under `let RC = 1`, reuses the opcode pair
+ //    of <X> and prints the mnemonic with an extra trailing "a";
+ //  * defs with an empty pattern list are never chosen by pattern matching;
+ //  * "b" / "s" suffixed defs are isCodeGenOnly copies for qbrc / qsrc.
+ let mayLoad = 1 in {
+ // Plain v4f64 load.
+ def QVLFDX : XForm_1<31, 583,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfdx $FRT, $src", IIC_LdStLFD,
+ [(set v4f64:$FRT, (load xoaddr:$src))]>;
+ let isCodeGenOnly = 1 in
+ def QVLFDXb : XForm_1<31, 583,
+ (outs qbrc:$FRT), (ins memrr:$src),
+ "qvlfdx $FRT, $src", IIC_LdStLFD, []>;
+
+ let RC = 1 in
+ def QVLFDXA : XForm_1<31, 583,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfdxa $FRT, $src", IIC_LdStLFD, []>;
+
+ // Update form: the extra $ea_result output is tied to the pointer register
+ // of $src and is excluded from the encoding.
+ def QVLFDUX : XForm_1<31, 615,
+ (outs qfrc:$FRT, ptr_rc_nor0:$ea_result),
+ (ins memrr:$src),
+ "qvlfdux $FRT, $src", IIC_LdStLFDU, []>,
+ RegConstraint<"$src.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+ // NOTE(review): this and the other *UXA / QVLFC*UX update forms below do not
+ // declare an $ea_result output -- presumably they are assembler-only;
+ // confirm before using them from code generation.
+ let RC = 1 in
+ def QVLFDUXA : XForm_1<31, 615,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfduxa $FRT, $src", IIC_LdStLFD, []>;
+
+ // Single-precision load: as a v4f32 -> v4f64 extending load (QVLFSX), as a
+ // v4i1 load through the PPCqvlfsb node (QVLFSXb), or as a plain v4f32 load
+ // (QVLFSXs).
+ def QVLFSX : XForm_1<31, 519,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfsx $FRT, $src", IIC_LdStLFD,
+ [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>;
+
+ let isCodeGenOnly = 1 in
+ def QVLFSXb : XForm_1<31, 519,
+ (outs qbrc:$FRT), (ins memrr:$src),
+ "qvlfsx $FRT, $src", IIC_LdStLFD,
+ [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>;
+ let isCodeGenOnly = 1 in
+ def QVLFSXs : XForm_1<31, 519,
+ (outs qsrc:$FRT), (ins memrr:$src),
+ "qvlfsx $FRT, $src", IIC_LdStLFD,
+ [(set v4f32:$FRT, (load xoaddr:$src))]>;
+
+ let RC = 1 in
+ def QVLFSXA : XForm_1<31, 519,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfsxa $FRT, $src", IIC_LdStLFD, []>;
+
+ // NOTE(review): the result class here is qsrc although the def has no "s"
+ // suffix and QVLFSX above produces qfrc -- confirm this is intended.
+ def QVLFSUX : XForm_1<31, 551,
+ (outs qsrc:$FRT, ptr_rc_nor0:$ea_result),
+ (ins memrr:$src),
+ "qvlfsux $FRT, $src", IIC_LdStLFDU, []>,
+ RegConstraint<"$src.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+ let RC = 1 in
+ def QVLFSUXA : XForm_1<31, 551,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>;
+
+ // qvlfcd* / qvlfcs* loads (no selection patterns).
+ def QVLFCDX : XForm_1<31, 71,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfcdx $FRT, $src", IIC_LdStLFD, []>;
+ let RC = 1 in
+ def QVLFCDXA : XForm_1<31, 71,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>;
+
+ def QVLFCDUX : XForm_1<31, 103,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfcdux $FRT, $src", IIC_LdStLFD, []>;
+ let RC = 1 in
+ def QVLFCDUXA : XForm_1<31, 103,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>;
+
+ def QVLFCSX : XForm_1<31, 7,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfcsx $FRT, $src", IIC_LdStLFD, []>;
+ let isCodeGenOnly = 1 in
+ def QVLFCSXs : XForm_1<31, 7,
+ (outs qsrc:$FRT), (ins memrr:$src),
+ "qvlfcsx $FRT, $src", IIC_LdStLFD, []>;
+
+ let RC = 1 in
+ def QVLFCSXA : XForm_1<31, 7,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>;
+
+ def QVLFCSUX : XForm_1<31, 39,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfcsux $FRT, $src", IIC_LdStLFD, []>;
+ let RC = 1 in
+ def QVLFCSUXA : XForm_1<31, 39,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>;
+
+ // qvlfiwax / qvlfiwzx loads (no selection patterns).
+ def QVLFIWAX : XForm_1<31, 871,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfiwax $FRT, $src", IIC_LdStLFD, []>;
+ let RC = 1 in
+ def QVLFIWAXA : XForm_1<31, 871,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>;
+
+ def QVLFIWZX : XForm_1<31, 839,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>;
+ let RC = 1 in
+ def QVLFIWZXA : XForm_1<31, 839,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>;
+ }
+
+
+ // qvlpc* instructions. They take a memrr address operand but are declared
+ // outside the `mayLoad` scope, so they are not modelled as memory reads.
+ // NOTE(review): presumably they derive a permute-control value from the
+ // address itself -- confirm against the QPX architecture definition.
+ // QVLPCLSXint is an isCodeGenOnly variant taking a single G8RC register and
+ // printing a literal 0 as the first address component.
+ def QVLPCLDX : XForm_1<31, 582,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlpcldx $FRT, $src", IIC_LdStLFD, []>;
+ def QVLPCLSX : XForm_1<31, 518,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlpclsx $FRT, $src", IIC_LdStLFD, []>;
+ let isCodeGenOnly = 1 in
+ def QVLPCLSXint : XForm_11<31, 518,
+ (outs qfrc:$FRT), (ins G8RC:$src),
+ "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>;
+ def QVLPCRDX : XForm_1<31, 70,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>;
+ def QVLPCRSX : XForm_1<31, 6,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>;
+
+ // Store indexed instructions
+ // Conventions used throughout this group:
+ //  * all stores use register+register (memrr) addressing;
+ //  * a def named <X>A is declared under `let RC = 1`, reuses the opcode pair
+ //    of <X> and prints the mnemonic with an extra trailing "a";
+ //  * defs with an empty pattern list are never chosen by pattern matching;
+ //  * "b" / "s" suffixed defs are isCodeGenOnly copies for other register
+ //    classes.
+ let mayStore = 1 in {
+ // Plain v4f64 store.
+ def QVSTFDX : XForm_8<31, 711,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfdx $FRT, $dst", IIC_LdStSTFD,
+ [(store qfrc:$FRT, xoaddr:$dst)]>;
+ let isCodeGenOnly = 1 in
+ def QVSTFDXb : XForm_8<31, 711,
+ (outs), (ins qbrc:$FRT, memrr:$dst),
+ "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>;
+
+ let RC = 1 in
+ def QVSTFDXA : XForm_8<31, 711,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+ // Update form: the $ea_res output is tied to the pointer register of $dst
+ // and is excluded from the encoding.
+ def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res),
+ (ins qfrc:$FRT, memrr:$dst),
+ "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">;
+
+ // NOTE(review): this and the other update forms below that have `(outs)`
+ // empty do not model the updated base register -- presumably they are
+ // assembler-only; confirm before using them from code generation.
+ let RC = 1 in
+ def QVSTFDUXA : XForm_8<31, 743,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFDXI : XForm_8<31, 709,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFDXIA : XForm_8<31, 709,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFDUXI : XForm_8<31, 741,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFDUXIA : XForm_8<31, 741,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+ // Single-precision store: QVSTFSX is the v4f64 -> v4f32 truncating store,
+ // QVSTFSXs the plain v4f32 store from a qsrc register.
+ def QVSTFSX : XForm_8<31, 647,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfsx $FRT, $dst", IIC_LdStSTFD,
+ [(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>;
+ let isCodeGenOnly = 1 in
+ def QVSTFSXs : XForm_8<31, 647,
+ (outs), (ins qsrc:$FRT, memrr:$dst),
+ "qvstfsx $FRT, $dst", IIC_LdStSTFD,
+ [(store qsrc:$FRT, xoaddr:$dst)]>;
+
+ let RC = 1 in
+ def QVSTFSXA : XForm_8<31, 647,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+ // NOTE(review): the register classes of this pair look swapped relative to
+ // QVSTFSX / QVSTFSXs above: the unsuffixed def takes qsrc and the "s" def
+ // takes qfrc. Confirm against the C++ users before changing either one.
+ def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res),
+ (ins qsrc:$FRT, memrr:$dst),
+ "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">;
+ let isCodeGenOnly = 1 in
+ def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res),
+ (ins qfrc:$FRT, memrr:$dst),
+ "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">;
+
+ let RC = 1 in
+ def QVSTFSUXA : XForm_8<31, 679,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFSXI : XForm_8<31, 645,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFSXIA : XForm_8<31, 645,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFSUXI : XForm_8<31, 677,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFSUXIA : XForm_8<31, 677,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+ // qvstfcd* / qvstfcs* stores (no selection patterns).
+ def QVSTFCDX : XForm_8<31, 199,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFCDXA : XForm_8<31, 199,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFCSX : XForm_8<31, 135,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>;
+ let isCodeGenOnly = 1 in
+ def QVSTFCSXs : XForm_8<31, 135,
+ (outs), (ins qsrc:$FRT, memrr:$dst),
+ "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>;
+
+ let RC = 1 in
+ def QVSTFCSXA : XForm_8<31, 135,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFCDUX : XForm_8<31, 231,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFCDUXA : XForm_8<31, 231,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFCSUX : XForm_8<31, 167,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcsux $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFCSUXA : XForm_8<31, 167,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFCDXI : XForm_8<31, 197,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFCDXIA : XForm_8<31, 197,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFCSXI : XForm_8<31, 133,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFCSXIA : XForm_8<31, 133,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFCDUXI : XForm_8<31, 229,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFCDUXIA : XForm_8<31, 229,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+ def QVSTFCSUXI : XForm_8<31, 165,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFCSUXIA : XForm_8<31, 165,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+ // qvstfiwx store (no selection pattern).
+ def QVSTFIWX : XForm_8<31, 967,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>;
+ let RC = 1 in
+ def QVSTFIWXA : XForm_8<31, 967,
+ (outs), (ins qfrc:$FRT, memrr:$dst),
+ "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>;
+ }
+}
+
+} // hasSideEffects = 0
+}
+
+// Assembler aliases for qvflogical. Each alias maps a mnemonic onto
+// QVFLOGICALb (qbrc operands) with a fixed value for the immediate:
+//   0 qvfclr   1 qvfand   4 qvfandc   5 qvfctfb   6 qvfxor    7 qvfor
+//   8 qvfnor   9 qvfequ  10 qvfnot   13 qvforc   14 qvfnand  15 qvfset
+// The one- and two-operand aliases (qvfclr, qvfset, qvfctfb, qvfnot) repeat a
+// register to fill the unused source slots.
+def : InstAlias<"qvfclr $FRT",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>;
+def : InstAlias<"qvfand $FRT, $FRA, $FRB",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>;
+def : InstAlias<"qvfandc $FRT, $FRA, $FRB",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>;
+def : InstAlias<"qvfctfb $FRT, $FRA",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>;
+def : InstAlias<"qvfxor $FRT, $FRA, $FRB",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>;
+def : InstAlias<"qvfor $FRT, $FRA, $FRB",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>;
+def : InstAlias<"qvfnor $FRT, $FRA, $FRB",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>;
+def : InstAlias<"qvfequ $FRT, $FRA, $FRB",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>;
+def : InstAlias<"qvfnot $FRT, $FRA",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>;
+def : InstAlias<"qvforc $FRT, $FRA, $FRB",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>;
+def : InstAlias<"qvfnand $FRT, $FRA, $FRB",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>;
+def : InstAlias<"qvfset $FRT",
+ (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>;
+
+//===----------------------------------------------------------------------===//
+// Additional QPX Patterns
+//
+
+// scalar_to_vector: insert the scalar into the sub_64 subregister of an
+// otherwise undefined vector register.
+def : Pat<(v4f64 (scalar_to_vector f64:$A)),
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>;
+def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>;
+
+// Element 0 is read directly as the sub_64 subregister.
+def : Pat<(f64 (extractelt v4f64:$S, 0)),
+ (EXTRACT_SUBREG $S, sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, 0)),
+ (EXTRACT_SUBREG $S, sub_64)>;
+
+// Constant elements 1..3: splat the requested element with qvesplati, then
+// read sub_64 of the result.
+def : Pat<(f64 (extractelt v4f64:$S, 1)),
+ (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>;
+def : Pat<(f64 (extractelt v4f64:$S, 2)),
+ (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>;
+def : Pat<(f64 (extractelt v4f64:$S, 3)),
+ (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>;
+
+def : Pat<(f32 (extractelt v4f32:$S, 1)),
+ (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, 2)),
+ (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, 3)),
+ (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>;
+
+// Variable i64 index: RLDICR $F, 2, 61 feeds QVLPCLSXint, whose result is the
+// third operand of a qvfperm of $S with itself; sub_64 of that is the result.
+// NOTE(review): presumably RLDICR scales the index by 4 and QVLPCLSXint
+// turns it into a permute control that brings the element into position 0 --
+// confirm against the QPX architecture definition.
+def : Pat<(f64 (extractelt v4f64:$S, i64:$F)),
+ (EXTRACT_SUBREG (QVFPERM $S, $S,
+ (QVLPCLSXint (RLDICR $F, 2,
+ /* 63-2 = */ 61))),
+ sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, i64:$F)),
+ (EXTRACT_SUBREG (QVFPERMs $S, $S,
+ (QVLPCLSXint (RLDICR $F, 2,
+ /* 63-2 = */ 61))),
+ sub_64)>;
+// Intrinsic patterns: each int_ppc_qpx_* call below maps onto the like-named QVF* instruction with operands in order.
+def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C),
+ (QVFPERM $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B),
+ (QVFCPSGN $A, $B)>;
+
+// FCOPYSIGN's operand types need not agree; the sign source ($frA) is first
+// copied into the register class of the result before QVFCPSGN/QVFCPSGNs.
+def : Pat<(fcopysign v4f64:$frB, v4f32:$frA),
+ (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>;
+def : Pat<(fcopysign QSRC:$frB, QFRC:$frA),
+ (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>;
+
+def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>;
+def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>;
+def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>;
+
+def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>;
+def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>;
+def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>;
+def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>;
+
+def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>;
+def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>;
+
+def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B),
+ (QVFADD $A, $B)>;
+def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B),
+ (QVFSUB $A, $B)>;
+def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B),
+ (QVFMUL $A, $B)>;
+
+// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b)
+// The fneg may sit on either multiplicand; both forms emit the same instruction.
+def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B),
+ (QVFNMSUB $A, $B, $C)>;
+def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B),
+ (QVFNMSUB $A, $B, $C)>;
+def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
+ (QVFNMSUBSs $A, $B, $C)>;
+def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
+ (QVFNMSUBSs $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C),
+ (QVFMADD $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C),
+ (QVFNMADD $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C),
+ (QVFMSUB $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C),
+ (QVFNMSUB $A, $B, $C)>;
+// Load intrinsics -> QVL*X instructions addressed through xoaddr; an "a" intrinsic suffix selects the *XA instruction.
+def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src),
+ (QVLFDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src),
+ (QVLFDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src),
+ (QVLFSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src),
+ (QVLFSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src),
+ (QVLFCDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src),
+ (QVLFCDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src),
+ (QVLFCSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src),
+ (QVLFCSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), // NOTE(review): repeats the qvlfda pattern already given earlier in this list
+ (QVLFDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src),
+ (QVLFIWAXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src),
+ (QVLFIWAX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src),
+ (QVLFIWZXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src),
+ (QVLFIWZX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), // NOTE(review): repeats the qvlfsa pattern already given earlier in this list
+ (QVLFSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src),
+ (QVLPCLDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src),
+ (QVLPCLSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src),
+ (QVLPCRDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src),
+ (QVLPCRSX xoaddr:$src)>;
+// Store intrinsics -> QVSTF*X instructions addressed through xoaddr; every intrinsic takes the stored value as v4f64.
+def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst),
+ (QVSTFDX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst),
+ (QVSTFSX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst),
+ (QVSTFCDXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst),
+ (QVSTFCDX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst),
+ (QVSTFCSXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst),
+ (QVSTFCSX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst),
+ (QVSTFDXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst),
+ (QVSTFIWXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst),
+ (QVSTFIWX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst),
+ (QVSTFSXA $T, xoaddr:$dst)>;
+// pre_store / pre_truncstv4f32 forms: pointer register and offset are forwarded as two separate operands.
+def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (QVSTFDUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (QVSTFSUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+ (QVSTFSUXs $rS, $ptrreg, $ptroff)>;
+// Immediate-operand intrinsics: the constant $idx is forwarded unchanged as the instruction's immediate.
+def : Pat<(int_ppc_qpx_qvflogical v4f64:$A, v4f64:$B, (i32 imm:$idx)),
+ (QVFLOGICAL $A, $B, imm:$idx)>;
+def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)),
+ (QVGPCI imm:$idx)>;
+// v4f64 setcc: ordered/unordered predicates combine one compare with QVFTSTNANb through a QVFLOGICALb immediate.
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE),
+ (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+ (QVFTSTNANb $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE),
+ (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+ (QVFTSTNANb $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE),
+ (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
+ (QVFTSTNANb $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO),
+ (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+ (QVFTSTNANb $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ),
+ (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
+ (QVFTSTNANb $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT),
+ (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+ (QVFTSTNANb $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE),
+ (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+ (QVFCMPLTb $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT),
+ (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+ (QVFTSTNANb $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULE),
+ (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+ (QVFCMPGTb $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE),
+ (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+ (QVFCMPEQb $FRA, $FRB), (i32 13))>;
+// Predicates with no O/U qualifier: EQ/GT/LT use the bare compare; GE/LE/NE apply imm 10 (the qvfnot alias form) to LT/GT/EQ.
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ),
+ (QVFCMPEQb $FRA, $FRB)>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT),
+ (QVFCMPGTb $FRA, $FRB)>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE),
+ (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+ (QVFCMPLTb $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT),
+ (QVFCMPLTb $FRA, $FRB)>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE),
+ (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+ (QVFCMPGTb $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE),
+ (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
+ (QVFCMPEQb $FRA, $FRB), (i32 10))>;
+// v4f32 setcc: same predicate table as for v4f64, using the bs-suffixed compare and NaN-test instructions.
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE),
+ (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+ (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE),
+ (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+ (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE),
+ (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
+ (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO),
+ (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+ (QVFTSTNANbs $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ),
+ (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
+ (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT),
+ (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+ (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE),
+ (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+ (QVFCMPLTbs $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT),
+ (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+ (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE),
+ (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+ (QVFCMPGTbs $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE),
+ (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+ (QVFCMPEQbs $FRA, $FRB), (i32 13))>;
+// Predicates with no O/U qualifier: EQ/GT/LT use the bare compare; GE/LE/NE apply imm 10 (the qvfnot alias form) to LT/GT/EQ.
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ),
+ (QVFCMPEQbs $FRA, $FRB)>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT),
+ (QVFCMPGTbs $FRA, $FRB)>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE),
+ (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+ (QVFCMPLTbs $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT),
+ (QVFCMPLTbs $FRA, $FRB)>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE),
+ (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+ (QVFCMPGTbs $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE),
+ (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
+ (QVFCMPEQbs $FRA, $FRB), (i32 10))>;
+// v4i1 boolean ops -> QVFLOGICALb immediates; compound forms (and-not, nor, xnor, or-not, nand) are listed first.
+def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)),
+ (QVFLOGICALb $FRA, $FRB, (i32 4))>;
+def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)),
+ (QVFLOGICALb $FRA, $FRB, (i32 8))>;
+def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)),
+ (QVFLOGICALb $FRA, $FRB, (i32 9))>;
+def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)),
+ (QVFLOGICALb $FRA, $FRB, (i32 13))>;
+def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)),
+ (QVFLOGICALb $FRA, $FRB, (i32 14))>;
+// Plain and/or/xor/not; the unary not passes $FRA as both sources.
+def : Pat<(and v4i1:$FRA, v4i1:$FRB),
+ (QVFLOGICALb $FRA, $FRB, (i32 1))>;
+def : Pat<(or v4i1:$FRA, v4i1:$FRB),
+ (QVFLOGICALb $FRA, $FRB, (i32 7))>;
+def : Pat<(xor v4i1:$FRA, v4i1:$FRB),
+ (QVFLOGICALb $FRA, $FRB, (i32 6))>;
+def : Pat<(not v4i1:$FRA),
+ (QVFLOGICALb $FRA, $FRA, (i32 10))>;
+// fpextend / fround_exact between v4f32 and v4f64 are emitted as a bare COPY_TO_REGCLASS (QFRC or QSRC).
+def : Pat<(v4f64 (fpextend v4f32:$src)),
+ (COPY_TO_REGCLASS $src, QFRC)>;
+
+def : Pat<(v4f32 (fround_exact v4f64:$src)),
+ (COPY_TO_REGCLASS $src, QSRC)>;
+
+// Extract the underlying floating-point values from the
+// QPX (-1.0, 1.0) boolean representation.
+def : Pat<(v4f64 (PPCqbflt v4i1:$src)),
+ (COPY_TO_REGCLASS $src, QFRC)>;
+// selectcc on two i1 operands: the predicate becomes one CR-bit op (CRANDC/CRORC/CREQV/CRXOR) feeding SELECT_QFRC.
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)),
+ (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULT)),
+ (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)),
+ (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULE)),
+ (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)),
+ (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)),
+ (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGE)),
+ (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)),
+ (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGT)),
+ (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)),
+ (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+// Same predicate-to-CR-op table for v4f32 values, selecting through SELECT_QSRC.
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)),
+ (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULT)),
+ (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)),
+ (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULE)),
+ (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETEQ)),
+ (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)),
+ (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGE)),
+ (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)),
+ (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGT)),
+ (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)),
+ (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+// Same predicate-to-CR-op table for v4i1 values, selecting through SELECT_QBRC.
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)),
+ (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULT)),
+ (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)),
+ (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULE)),
+ (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)),
+ (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)),
+ (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGE)),
+ (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)),
+ (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGT)),
+ (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)),
+ (SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+} // end HasQPX
+// Under NoNaNsFPMath, fminnum/fmaxnum are one LT/GT compare feeding QVFSELb (v4f64) or QVFSELbs (v4f32).
+let Predicates = [HasQPX, NoNaNsFPMath] in {
+def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
+ (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
+ (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>;
+
+def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
+ (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
+ (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>;
+}
+
+let Predicates = [HasQPX, NaNsFPMath] in {
+// When either of these operands is NaN, we should return the other operand.
+// QVFCMPLT/QVFCMPGT return false if either operand is NaN, which means we need
+// to explicitly OR the compare result with a NaN test on the second operand.
+def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
+ (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+ (QVFTSTNANb $FRB, $FRB), (i32 7)),
+ $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
+ (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+ (QVFTSTNANb $FRB, $FRB), (i32 7)),
+ $FRB, $FRA)>;
+
+def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
+ (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+ (QVFTSTNANbs $FRB, $FRB), (i32 7)),
+ $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
+ (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+ (QVFTSTNANbs $FRB, $FRB), (i32 7)),
+ $FRB, $FRA)>;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td
new file mode 100644
index 000000000000..cc3a4d20a9b2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td
@@ -0,0 +1,447 @@
+//=======-- PPCInstrSPE.td - The PowerPC SPE Extension -*- tablegen -*-=======//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Signal Processing Engine extension to
+// the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+// EVXForm_1: three 5-bit fields RT/RA/RB in Inst{6-10}/{11-15}/{16-20}, with the 11-bit xo in Inst{21-31}.
+class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Pattern = []; // no selection pattern is attached by this form
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-31} = xo;
+}
+// EVXForm_2: EVXForm_1 with the RB field fixed to 0, for defs that name only RT and RA.
+class EVXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin> : EVXForm_1<xo, OOL, IOL, asmstr, itin> {
+ let RB = 0;
+}
+// EVXForm_3: 3-bit crD in Inst{6-8}, Inst{9-10} fixed to 0, then RA/RB/xo in the same positions as EVXForm_1.
+class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+ bits<3> crD;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Pattern = []; // no selection pattern is attached by this form
+
+ let Inst{6-8} = crD;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-31} = xo;
+}
+// EVXForm_D: RT in Inst{6-10} plus a combined operand D, of which only bits D{0..9} are placed into Inst{11..20}.
+class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<21> D; // declared 21 bits wide; bits 10-20 are not referenced in this class
+
+ let Pattern = []; // no selection pattern is attached by this form
+
+ let Inst{6-10} = RT;
+ let Inst{20} = D{0};
+ let Inst{19} = D{1};
+ let Inst{18} = D{2};
+ let Inst{17} = D{3};
+ let Inst{16} = D{4};
+ let Inst{15} = D{5};
+ let Inst{14} = D{6};
+ let Inst{13} = D{7};
+ let Inst{12} = D{8};
+ let Inst{11} = D{9};
+ let Inst{11-20} = D{0-9}; // NOTE(review): re-assigns the same ten Inst bits as the per-bit lets above; positionally this pairs Inst{11} with D{0}, whereas they pair Inst{20} with D{0} - TODO confirm which mapping is intended
+ let Inst{21-31} = xo;
+}
+
+let Predicates = [HasSPE], isAsmParserOnly = 1 in {
+// Displacement-form loads (EVXForm_D): one gprc result, address supplied as an spe8dis/spe4dis/spe2dis operand.
+def EVLDD : EVXForm_D<769, (outs gprc:$RT), (ins spe8dis:$dst),
+ "evldd $RT, $dst", IIC_VecFP>;
+def EVLDW : EVXForm_D<771, (outs gprc:$RT), (ins spe8dis:$dst),
+ "evldw $RT, $dst", IIC_VecFP>;
+def EVLDH : EVXForm_D<773, (outs gprc:$RT), (ins spe8dis:$dst),
+ "evldh $RT, $dst", IIC_VecFP>;
+def EVLHHESPLAT : EVXForm_D<777, (outs gprc:$RT), (ins spe2dis:$dst),
+ "evlhhesplat $RT, $dst", IIC_VecFP>;
+def EVLHHOUSPLAT : EVXForm_D<781, (outs gprc:$RT), (ins spe2dis:$dst),
+ "evlhhousplat $RT, $dst", IIC_VecFP>;
+def EVLHHOSSPLAT : EVXForm_D<783, (outs gprc:$RT), (ins spe2dis:$dst),
+ "evlhhossplat $RT, $dst", IIC_VecFP>;
+def EVLWHE : EVXForm_D<785, (outs gprc:$RT), (ins spe4dis:$dst),
+ "evlwhe $RT, $dst", IIC_VecFP>;
+def EVLWHOU : EVXForm_D<789, (outs gprc:$RT), (ins spe4dis:$dst),
+ "evlwhou $RT, $dst", IIC_VecFP>;
+def EVLWHOS : EVXForm_D<791, (outs gprc:$RT), (ins spe4dis:$dst),
+ "evlwhos $RT, $dst", IIC_VecFP>;
+def EVLWWSPLAT : EVXForm_D<793, (outs gprc:$RT), (ins spe4dis:$dst),
+ "evlwwsplat $RT, $dst", IIC_VecFP>;
+def EVLWHSPLAT : EVXForm_D<797, (outs gprc:$RT), (ins spe4dis:$dst),
+ "evlwhsplat $RT, $dst", IIC_VecFP>;
+// Displacement-form stores (EVXForm_D): no outputs; the stored register $RT and the address operand are both inputs.
+def EVSTDD : EVXForm_D<801, (outs), (ins gprc:$RT, spe8dis:$dst),
+ "evstdd $RT, $dst", IIC_VecFP>;
+def EVSTDH : EVXForm_D<805, (outs), (ins gprc:$RT, spe8dis:$dst),
+ "evstdh $RT, $dst", IIC_VecFP>;
+def EVSTDW : EVXForm_D<803, (outs), (ins gprc:$RT, spe8dis:$dst),
+ "evstdw $RT, $dst", IIC_VecFP>;
+def EVSTWHE : EVXForm_D<817, (outs), (ins gprc:$RT, spe4dis:$dst),
+ "evstwhe $RT, $dst", IIC_VecFP>;
+def EVSTWHO : EVXForm_D<821, (outs), (ins gprc:$RT, spe4dis:$dst),
+ "evstwho $RT, $dst", IIC_VecFP>;
+def EVSTWWE : EVXForm_D<825, (outs), (ins gprc:$RT, spe4dis:$dst),
+ "evstwwe $RT, $dst", IIC_VecFP>;
+def EVSTWWO : EVXForm_D<829, (outs), (ins gprc:$RT, spe4dis:$dst),
+ "evstwwo $RT, $dst", IIC_VecFP>;
+// EVMRA: two-register def built on EVXForm_1, with RB forced to 0 in the def body rather than via EVXForm_2.
+def EVMRA : EVXForm_1<1220, (outs gprc:$RT), (ins gprc:$RA),
+ "evmra $RT, $RA", IIC_VecFP> {
+ let RB = 0;
+}
+// Arithmetic and logical defs: EVXForm_1 entries take $RA and $RB, EVXForm_2 entries take only $RA.
+def BRINC : EVXForm_1<527, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "brinc $RT, $RA, $RB", IIC_VecFP>;
+def EVABS : EVXForm_2<520, (outs gprc:$RT), (ins gprc:$RA),
+ "evabs $RT, $RA", IIC_VecFP>;
+// EVADDIW carries a u5imm in the RB slot and prints it before $RA in the asm string.
+def EVADDIW : EVXForm_1<514, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+ "evaddiw $RT, $RB, $RA", IIC_VecFP>;
+def EVADDSMIAAW : EVXForm_2<1225, (outs gprc:$RT), (ins gprc:$RA),
+ "evaddsmiaaw $RT, $RA", IIC_VecFP>;
+def EVADDSSIAAW : EVXForm_2<1217, (outs gprc:$RT), (ins gprc:$RA),
+ "evaddssiaaw $RT, $RA", IIC_VecFP>;
+def EVADDUSIAAW : EVXForm_2<1216, (outs gprc:$RT), (ins gprc:$RA),
+ "evaddusiaaw $RT, $RA", IIC_VecFP>;
+def EVADDUMIAAW : EVXForm_2<1224, (outs gprc:$RT), (ins gprc:$RA),
+ "evaddumiaaw $RT, $RA", IIC_VecFP>;
+def EVADDW : EVXForm_1<512, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evaddw $RT, $RA, $RB", IIC_VecFP>;
+
+def EVAND : EVXForm_1<529, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evand $RT, $RA, $RB", IIC_VecFP>;
+def EVANDC : EVXForm_1<530, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evandc $RT, $RA, $RB", IIC_VecFP>;
+// Compares (EVXForm_3): each def has a crrc:$crD output instead of a gprc result.
+def EVCMPEQ : EVXForm_3<564, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+ "evcmpeq $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPGTS : EVXForm_3<561, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+ "evcmpgts $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPGTU : EVXForm_3<560, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+ "evcmpgtu $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPLTS : EVXForm_3<563, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+ "evcmplts $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPLTU : EVXForm_3<562, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+ "evcmpltu $crD, $RA, $RB", IIC_VecFP>;
+// Two-register EVXForm_2 defs (evcnt*, evexts*) and three-register EVXForm_1 defs (evdivw*, eveqv).
+def EVCNTLSW : EVXForm_2<526, (outs gprc:$RT), (ins gprc:$RA),
+ "evcntlsw $RT, $RA", IIC_VecFP>;
+def EVCNTLZW : EVXForm_2<525, (outs gprc:$RT), (ins gprc:$RA),
+ "evcntlzw $RT, $RA", IIC_VecFP>;
+
+def EVDIVWS : EVXForm_1<1222, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evdivws $RT, $RA, $RB", IIC_VecFP>;
+def EVDIVWU : EVXForm_1<1223, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evdivwu $RT, $RA, $RB", IIC_VecFP>;
+
+def EVEQV : EVXForm_1<537, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "eveqv $RT, $RA, $RB", IIC_VecFP>;
+
+def EVEXTSB : EVXForm_2<522, (outs gprc:$RT), (ins gprc:$RA),
+ "evextsb $RT, $RA", IIC_VecFP>;
+def EVEXTSH : EVXForm_2<523, (outs gprc:$RT), (ins gprc:$RA),
+ "evextsh $RT, $RA", IIC_VecFP>;
+// X-suffixed loads (EVXForm_1): take $RA and $RB registers; each xo is one less than the same-named displacement load.
+def EVLDDX : EVXForm_1<768, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evlddx $RT, $RA, $RB", IIC_VecFP>;
+def EVLDWX : EVXForm_1<770, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evldwx $RT, $RA, $RB", IIC_VecFP>;
+def EVLDHX : EVXForm_1<772, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evldhx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHESPLATX : EVXForm_1<776, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evlhhesplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHOUSPLATX : EVXForm_1<780, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evlhhousplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHOSSPLATX : EVXForm_1<782, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evlhhossplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHEX : EVXForm_1<784, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evlwhex $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHOUX : EVXForm_1<788, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evlwhoux $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHOSX : EVXForm_1<790, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evlwhosx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWWSPLATX : EVXForm_1<792, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evlwwsplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHSPLATX : EVXForm_1<796, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evlwhsplatx $RT, $RA, $RB", IIC_VecFP>;
+// evmerge* family: three-register EVXForm_1 defs occupying xo 556-559.
+def EVMERGEHI : EVXForm_1<556, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmergehi $RT, $RA, $RB", IIC_VecFP>;
+def EVMERGELO : EVXForm_1<557, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmergelo $RT, $RA, $RB", IIC_VecFP>;
+def EVMERGEHILO : EVXForm_1<558, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmergehilo $RT, $RA, $RB", IIC_VecFP>;
+def EVMERGELOHI : EVXForm_1<559, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmergelohi $RT, $RA, $RB", IIC_VecFP>;
+// evmhe*/evmho* family: every def is a three-register EVXForm_1 with IIC_VecFP; only the mnemonic and xo differ.
+def EVMHEGSMFAA : EVXForm_1<1323, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhegsmfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGSMFAN : EVXForm_1<1451, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhegsmfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGSMIAA : EVXForm_1<1321, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhegsmiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGSMIAN : EVXForm_1<1449, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhegsmian $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGUMIAA : EVXForm_1<1320, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhegumiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGUMIAN : EVXForm_1<1448, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhegumian $RT, $RA, $RB", IIC_VecFP>;
+
+def EVMHESMF : EVXForm_1<1035, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhesmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMFA : EVXForm_1<1067, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhesmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMFAAW : EVXForm_1<1291, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhesmfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMFANW : EVXForm_1<1419, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhesmfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMI : EVXForm_1<1033, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhesmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMIA : EVXForm_1<1065, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhesmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMIAAW : EVXForm_1<1289, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhesmiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMIANW : EVXForm_1<1417, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhesmianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSF : EVXForm_1<1027, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhessf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSFA : EVXForm_1<1059, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhessfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSFAAW : EVXForm_1<1283, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhessfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSFANW : EVXForm_1<1411, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhessfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSIAAW : EVXForm_1<1281, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhessiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSIANW : EVXForm_1<1409, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhessianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMI : EVXForm_1<1032, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmheumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMIA : EVXForm_1<1064, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmheumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMIAAW : EVXForm_1<1288, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmheumiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMIANW : EVXForm_1<1416, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmheumianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUSIAAW : EVXForm_1<1280, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmheusiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUSIANW : EVXForm_1<1408, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmheusianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMFAA : EVXForm_1<1327, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhogsmfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMFAN : EVXForm_1<1455, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhogsmfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMIAA : EVXForm_1<1325, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhogsmiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMIAN : EVXForm_1<1453, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhogsmian $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGUMIAA : EVXForm_1<1324, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhogumiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGUMIAN : EVXForm_1<1452, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhogumian $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMF : EVXForm_1<1039, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhosmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMFA : EVXForm_1<1071, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhosmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMFAAW : EVXForm_1<1295, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhosmfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMFANW : EVXForm_1<1423, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhosmfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMI : EVXForm_1<1037, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhosmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMIA : EVXForm_1<1069, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhosmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMIAAW : EVXForm_1<1293, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhosmiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMIANW : EVXForm_1<1421, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhosmianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSF : EVXForm_1<1031, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhossf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSFA : EVXForm_1<1063, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhossfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSFAAW : EVXForm_1<1287, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhossfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSFANW : EVXForm_1<1415, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhossfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSIAAW : EVXForm_1<1285, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhossiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSIANW : EVXForm_1<1413, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhossianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMI : EVXForm_1<1036, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhoumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMIA : EVXForm_1<1068, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhoumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMIAAW : EVXForm_1<1292, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhoumiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMIANW : EVXForm_1<1420, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhoumianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUSIAAW : EVXForm_1<1284, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhousiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUSIANW : EVXForm_1<1412, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmhousianw $RT, $RA, $RB", IIC_VecFP>;
+
+// evmwh*/evmwl*/evmws*/evmwu* family: every def is a three-register EVXForm_1 with IIC_VecFP; only mnemonic and xo differ.
+def EVMWHSMF : EVXForm_1<1103, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwhsmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSMFA : EVXForm_1<1135, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwhsmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSMI : EVXForm_1<1101, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwhsmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSMIA : EVXForm_1<1133, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwhsmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSSF : EVXForm_1<1095, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwhssf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSSFA : EVXForm_1<1127, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwhssfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHUMI : EVXForm_1<1100, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwhumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHUMIA : EVXForm_1<1132, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwhumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSMIAAW : EVXForm_1<1353, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlsmiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSMIANW : EVXForm_1<1481, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlsmianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSSIAAW : EVXForm_1<1345, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlssiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSSIANW : EVXForm_1<1473, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlssianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMI : EVXForm_1<1096, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMIA : EVXForm_1<1128, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMIAAW : EVXForm_1<1352, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlumiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMIANW : EVXForm_1<1480, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlumianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUSIAAW : EVXForm_1<1344, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlusiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUSIANW : EVXForm_1<1472, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwlusianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMF : EVXForm_1<1115, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwsmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMFA : EVXForm_1<1147, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwsmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMFAA : EVXForm_1<1371, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwsmfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMFAN : EVXForm_1<1499, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwsmfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMI : EVXForm_1<1113, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwsmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMIA : EVXForm_1<1145, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwsmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMIAA : EVXForm_1<1369, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwsmiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMIAN : EVXForm_1<1497, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwsmian $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSF : EVXForm_1<1107, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwssf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSFA : EVXForm_1<1139, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwssfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSFAA : EVXForm_1<1363, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwssfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSFAN : EVXForm_1<1491, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwssfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMI : EVXForm_1<1112, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMIA : EVXForm_1<1144, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMIAA : EVXForm_1<1368, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwumiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMIAN : EVXForm_1<1496, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evmwumian $RT, $RA, $RB", IIC_VecFP>;
+
+
+def EVNAND : EVXForm_1<542, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), // three-register form: $RT, $RA, $RB are all gprc
+ "evnand $RT, $RA, $RB", IIC_VecFP>;
+
+def EVNEG : EVXForm_2<521, (outs gprc:$RT), (ins gprc:$RA), // EVXForm_2: single source $RA, no $RB operand
+ "evneg $RT, $RA", IIC_VecFP>;
+
+def EVNOR : EVXForm_1<536, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evnor $RT, $RA, $RB", IIC_VecFP>;
+def EVOR : EVXForm_1<535, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evor $RT, $RA, $RB", IIC_VecFP>;
+def EVORC : EVXForm_1<539, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evorc $RT, $RA, $RB", IIC_VecFP>;
+
+def EVRLWI : EVXForm_1<554, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB), // immediate form: $RB is a u5imm operand, not a gprc register
+ "evrlwi $RT, $RA, $RB", IIC_VecFP>;
+def EVRLW : EVXForm_1<552, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), // register form of the def above
+ "evrlw $RT, $RA, $RB", IIC_VecFP>;
+
+def EVRNDW : EVXForm_2<524, (outs gprc:$RT), (ins gprc:$RA),
+ "evrndw $RT, $RA", IIC_VecFP>;
+
+def EVSLWI : EVXForm_1<550, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB), // immediate form: $RB is a u5imm operand
+ "evslwi $RT, $RA, $RB", IIC_VecFP>;
+def EVSLW : EVXForm_1<548, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evslw $RT, $RA, $RB", IIC_VecFP>;
+
+def EVSPLATFI : EVXForm_2<555, (outs gprc:$RT), (ins i32imm:$RA), // $RA is an i32imm operand here, not a register
+ "evsplatfi $RT, $RA", IIC_VecFP>;
+def EVSPLATI : EVXForm_2<553, (outs gprc:$RT), (ins i32imm:$RA), // $RA is an i32imm operand here, not a register
+ "evsplati $RT, $RA", IIC_VecFP>;
+
+def EVSRWIS : EVXForm_1<547, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB), // immediate form: $RB is a u5imm operand
+ "evsrwis $RT, $RA, $RB", IIC_VecFP>;
+def EVSRWIU : EVXForm_1<546, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB), // immediate form: $RB is a u5imm operand
+ "evsrwiu $RT, $RA, $RB", IIC_VecFP>;
+def EVSRWS : EVXForm_1<545, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evsrws $RT, $RA, $RB", IIC_VecFP>;
+def EVSRWU : EVXForm_1<544, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evsrwu $RT, $RA, $RB", IIC_VecFP>;
+
+def EVSTDDX : EVXForm_1<800, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB), // store forms: outs is empty, so $RT is an input alongside $RA/$RB
+ "evstddx $RT, $RA, $RB", IIC_VecFP>; // NOTE(review): no mayStore is set in the visible lines - confirm an enclosing let or the form class supplies it
+def EVSTDHX : EVXForm_1<804, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+ "evstdhx $RT, $RA, $RB", IIC_VecFP>;
+def EVSTDWX : EVXForm_1<802, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+ "evstdwx $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWHEX : EVXForm_1<816, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+ "evstwhex $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWHOX : EVXForm_1<820, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+ "evstwhox $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWWEX : EVXForm_1<824, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+ "evstwwex $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWWOX : EVXForm_1<828, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+ "evstwwox $RT, $RA, $RB", IIC_VecFP>;
+
+def EVSUBFSSIAAW : EVXForm_2<1219, (outs gprc:$RT), (ins gprc:$RA), // the four EVXForm_2 defs here take a single gprc source ($RA)
+ "evsubfssiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFSMIAAW : EVXForm_2<1227, (outs gprc:$RT), (ins gprc:$RA),
+ "evsubfsmiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFUMIAAW : EVXForm_2<1226, (outs gprc:$RT), (ins gprc:$RA),
+ "evsubfumiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFUSIAAW : EVXForm_2<1218, (outs gprc:$RT), (ins gprc:$RA),
+ "evsubfusiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFW : EVXForm_1<516, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), // register-register form
+ "evsubfw $RT, $RA, $RB", IIC_VecFP>;
+def EVSUBIFW : EVXForm_1<518, (outs gprc:$RT), (ins u5imm:$RA, gprc:$RB), // here $RA (not $RB) is the u5imm immediate; $RB is the register
+ "evsubifw $RT, $RA, $RB", IIC_VecFP>;
+def EVXOR : EVXForm_1<534, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+ "evxor $RT, $RA, $RB", IIC_VecFP>;
+
+} // HasSPE
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
new file mode 100644
index 000000000000..0d9e3459f47e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -0,0 +1,2924 @@
+//===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the VSX extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// *********************************** NOTE ***********************************
+// ** For POWER8 Little Endian, the VSX swap optimization relies on knowing **
+// ** which VMX and VSX instructions are lane-sensitive and which are not. **
+// ** A lane-sensitive instruction relies, implicitly or explicitly, on **
+// ** whether lanes are numbered from left to right. An instruction like **
+// ** VADDFP is not lane-sensitive, because each lane of the result vector **
+// ** relies only on the corresponding lane of the source vectors. However, **
+// ** an instruction like VMULESB is lane-sensitive, because "even" and **
+// ** "odd" lanes are different for big-endian and little-endian numbering. **
+// ** **
+// ** When adding new VMX and VSX instructions, please consider whether they **
+// ** are lane-sensitive. If so, they must be added to a switch statement **
+// ** in PPCVSXSwapRemoval::gatherVectorInstructions(). **
+// ****************************************************************************
+
+def PPCRegVSRCAsmOperand : AsmOperandClass { // asm-parser operand class paired with the vsrc register operand
+ let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber"; // all three VSX operand classes here share the isVSRegNumber predicate
+}
+def vsrc : RegisterOperand<VSRC> { // operand over register class VSRC; used as vsrc:$X in the instruction defs
+ let ParserMatchClass = PPCRegVSRCAsmOperand;
+}
+
+def PPCRegVSFRCAsmOperand : AsmOperandClass { // asm-parser operand class paired with the vsfrc register operand
+ let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vsfrc : RegisterOperand<VSFRC> { // operand over register class VSFRC; the defs in this file use it with f64 patterns
+ let ParserMatchClass = PPCRegVSFRCAsmOperand;
+}
+
+def PPCRegVSSRCAsmOperand : AsmOperandClass { // asm-parser operand class paired with the vssrc register operand
+ let Name = "RegVSSRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vssrc : RegisterOperand<VSSRC> { // operand over register class VSSRC; the defs in this file use it with f32 patterns
+ let ParserMatchClass = PPCRegVSSRCAsmOperand;
+}
+
+// Little-endian-specific nodes.
+def SDT_PPClxvd2x : SDTypeProfile<1, 1, [ // 1 result, 1 operand
+ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> // result (index 0) is v2f64; operand (index 1) is pointer-typed
+]>;
+def SDT_PPCstxvd2x : SDTypeProfile<0, 2, [ // no results, 2 operands
+ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> // operand 0 is v2f64; operand 1 is pointer-typed
+]>;
+def SDT_PPCxxswapd : SDTypeProfile<1, 1, [ // 1 result, 1 operand
+ SDTCisSameAs<0, 1> // the operand has the same type as the result
+]>;
+def SDTVecConv : SDTypeProfile<1, 2, [ // 1 result, 2 operands
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2> // result and first operand are vectors; second operand is pointer-typed
+]>;
+
+def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, // DAG node for opcode PPCISD::LXVD2X
+ [SDNPHasChain, SDNPMayLoad]>; // chained, may load
+def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, // DAG node for opcode PPCISD::STXVD2X
+ [SDNPHasChain, SDNPMayStore]>; // chained, may store
+def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; // swap node that carries a chain
+def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; // unary, no node properties
+def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; // unary, no node properties
+def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; // unary, no node properties
+def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>; // uses the SDTVecConv profile
+def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>; // uses the SDTVecConv profile
+def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; // same type profile as PPCxxswapd, but declared without SDNPHasChain
+
+multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase, // each defm yields two records: NAME and NAME#o (the "." form)
+ string asmstr, InstrItinClass itin, Intrinsic Int, // Int: intrinsic matched by the plain form
+ ValueType OutTy, ValueType InTy> { // OutTy/InTy: result and source value types used in the patterns
+ let BaseName = asmbase in { // both records share the mnemonic as BaseName
+ def NAME : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ !strconcat(asmbase, !strconcat(" ", asmstr)), itin, // asm string: "<asmbase> <asmstr>"
+ [(set OutTy:$XT, (Int InTy:$XA, InTy:$XB))]>; // selected via the intrinsic; result typed OutTy
+ let Defs = [CR6] in // the dot form additionally defines CR6
+ def o : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ !strconcat(asmbase, !strconcat(". ", asmstr)), itin, // asm string: "<asmbase>. <asmstr>"
+ [(set InTy:$XT, // NOTE(review): result is typed InTy here but OutTy in NAME - confirm intended
+ (InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>, // matched via PPCvcmp_o with xo as its third operand
+ isDOT;
+ }
+}
+
+// Instruction form with a single input register for instructions such as
+// XXPERMDI. The reason for defining this is that specifying multiple chained
+// operands (such as loads) to an instruction will perform both chained
+// operations rather than coalescing them into a single register - even though
+// the source memory location is the same. This simply forces the instruction
+// to use the same register for both inputs.
+// For example, an output DAG such as this:
+// (XXPERMDI (LXSIBZX xoaddr:$src), (LXSIBZX xoaddr:$src), 0)
+// would result in two load instructions emitted and used as separate inputs
+// to the XXPERMDI instruction.
+class XX3Form_2s<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XX3Form_2<opcode, xo, OOL, IOL, asmstr, itin, pattern> { // all arguments are forwarded unchanged to XX3Form_2
+ let XB = XA; // the only override: the XB field takes XA's value, so both inputs name one register
+}
+
+def HasVSX : Predicate<"PPCSubTarget->hasVSX()">; // subtarget reports VSX
+def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">; // subtarget is little-endian
+def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">; // exact negation of IsLittleEndian
+def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">; // true when the subtarget does NOT report P9 vector support
+
+let Predicates = [HasVSX] in {
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+let UseVSXReg = 1 in {
+let hasSideEffects = 0 in { // VSX instructions don't have side effects.
+let Uses = [RM] in {
+
+ // Load indexed instructions
+ let mayLoad = 1 in { // applies to the four load defs in this brace
+ let CodeSize = 3 in // brace-less let: applies to LXSDX only
+ def LXSDX : XX1Form<31, 588,
+ (outs vsfrc:$XT), (ins memrr:$src),
+ "lxsdx $XT, $src", IIC_LdStLFD,
+ [(set f64:$XT, (load xoaddr:$src))]>; // selected for a plain f64 load from an xoaddr address
+
+ let Predicates = [HasVSX, HasOnlySwappingMemOps] in // brace-less let: replaces the predicate list for LXVD2X only
+ def LXVD2X : XX1Form<31, 844,
+ (outs vsrc:$XT), (ins memrr:$src),
+ "lxvd2x $XT, $src", IIC_LdStLFD,
+ [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>; // matched through the lxvd2x intrinsic, not a generic load
+
+ def LXVDSX : XX1Form<31, 332,
+ (outs vsrc:$XT), (ins memrr:$src),
+ "lxvdsx $XT, $src", IIC_LdStLFD, []>; // empty pattern list
+
+ let Predicates = [HasVSX, HasOnlySwappingMemOps] in // brace-less let: replaces the predicate list for LXVW4X only
+ def LXVW4X : XX1Form<31, 780,
+ (outs vsrc:$XT), (ins memrr:$src),
+ "lxvw4x $XT, $src", IIC_LdStLFD,
+ [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>; // matched through the lxvw4x intrinsic, not a generic load
+ } // mayLoad
+
+ // Store indexed instructions
+ let mayStore = 1 in { // applies to the three store defs in this brace
+ let CodeSize = 3 in // brace-less let: applies to STXSDX only
+ def STXSDX : XX1Form<31, 716,
+ (outs), (ins vsfrc:$XT, memrr:$dst), // stores have no outs; $XT is the value operand
+ "stxsdx $XT, $dst", IIC_LdStSTFD,
+ [(store f64:$XT, xoaddr:$dst)]>; // selected for a plain f64 store to an xoaddr address
+
+ let Predicates = [HasVSX, HasOnlySwappingMemOps] in { // covers STXVD2X and STXVW4X
+ // The behaviour of this instruction is endianness-specific so we provide no
+ // pattern to match it without considering endianness.
+ def STXVD2X : XX1Form<31, 972,
+ (outs), (ins vsrc:$XT, memrr:$dst),
+ "stxvd2x $XT, $dst", IIC_LdStSTFD,
+ []>; // empty pattern list, as the comment above explains
+
+ def STXVW4X : XX1Form<31, 908,
+ (outs), (ins vsrc:$XT, memrr:$dst),
+ "stxvw4x $XT, $dst", IIC_LdStSTFD,
+ [(store v4i32:$XT, xoaddr:$dst)]>; // NOTE(review): unlike STXVD2X this def carries a generic v4i32 store pattern - confirm that is intended
+ }
+ } // mayStore
+
+ // Add/Mul Instructions
+ let isCommutable = 1 in { // marks all six add/mul defs in this brace as commutable
+ def XSADDDP : XX3Form<60, 32,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsadddp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fadd f64:$XA, f64:$XB))]>; // f64 fadd
+ def XSMULDP : XX3Form<60, 48,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsmuldp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fmul f64:$XA, f64:$XB))]>; // f64 fmul
+
+ def XVADDDP : XX3Form<60, 96,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvadddp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fadd v2f64:$XA, v2f64:$XB))]>; // v2f64 fadd
+
+ def XVADDSP : XX3Form<60, 64,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvaddsp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fadd v4f32:$XA, v4f32:$XB))]>; // v4f32 fadd
+
+ def XVMULDP : XX3Form<60, 112,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmuldp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fmul v2f64:$XA, v2f64:$XB))]>; // v2f64 fmul
+
+ def XVMULSP : XX3Form<60, 80,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmulsp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fmul v4f32:$XA, v4f32:$XB))]>; // v4f32 fmul
+ }
+
+ // Subtract Instructions
+ def XSSUBDP : XX3Form<60, 40, // not wrapped in isCommutable, unlike the add/mul defs
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xssubdp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fsub f64:$XA, f64:$XB))]>; // f64 fsub: $XA - $XB
+
+ def XVSUBDP : XX3Form<60, 104,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvsubdp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fsub v2f64:$XA, v2f64:$XB))]>; // v2f64 fsub
+ def XVSUBSP : XX3Form<60, 72,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvsubsp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fsub v4f32:$XA, v4f32:$XB))]>; // v4f32 fsub
+
+ // FMA Instructions
+ let BaseName = "XSMADDADP" in { // the A-form and M-form below share this BaseName and both mix in AltVSXFMARel
+ let isCommutable = 1 in // brace-less let: applies to the A-form only
+ def XSMADDADP : XX3Form<60, 33,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), // $XTi: extra input tied to the result register
+ "xsmaddadp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fma f64:$XA, f64:$XB, f64:$XTi))]>, // $XTi is the addend of the fma
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, // tie $XTi to $XT and leave it out of the encoding
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in // marks the M-form as the alternative; it has an empty pattern list
+ def XSMADDMDP : XX3Form<60, 41,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XSMSUBADP" in {
+ let isCommutable = 1 in
+ def XSMSUBADP : XX3Form<60, 49,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsmsubadp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>, // fma with a negated addend
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSMSUBMDP : XX3Form<60, 57,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XSNMADDADP" in {
+ let isCommutable = 1 in
+ def XSNMADDADP : XX3Form<60, 161,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsnmaddadp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, f64:$XTi)))]>, // negation of the whole fma
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSNMADDMDP : XX3Form<60, 169,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XSNMSUBADP" in {
+ let isCommutable = 1 in
+ def XSNMSUBADP : XX3Form<60, 177,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsnmsubadp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>, // negation of an fma with a negated addend
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSNMSUBMDP : XX3Form<60, 185,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVMADDADP" in { // vector FMA pairs: the A-form carries the pattern, the M-form is flagged IsVSXFMAAlt
+ let isCommutable = 1 in
+ def XVMADDADP : XX3Form<60, 97,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), // $XTi: extra input tied to the result register
+ "xvmaddadp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>, // v2f64 fma; $XTi is the addend
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, // tie $XTi to $XT and leave it out of the encoding
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVMADDMDP : XX3Form<60, 105,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVMADDASP" in {
+ let isCommutable = 1 in
+ def XVMADDASP : XX3Form<60, 65,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmaddasp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>, // v4f32 fma
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVMADDMSP : XX3Form<60, 73,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVMSUBADP" in {
+ let isCommutable = 1 in
+ def XVMSUBADP : XX3Form<60, 113,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmsubadp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>, // v2f64 fma with a negated addend
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVMSUBMDP : XX3Form<60, 121,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVMSUBASP" in {
+ let isCommutable = 1 in
+ def XVMSUBASP : XX3Form<60, 81,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmsubasp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>, // v4f32 fma with a negated addend
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVMSUBMSP : XX3Form<60, 89,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVNMADDADP" in { // negated vector FMA pairs: the A-form carries the pattern, the M-form is flagged IsVSXFMAAlt
+ let isCommutable = 1 in
+ def XVNMADDADP : XX3Form<60, 225,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), // $XTi: extra input tied to the result register
+ "xvnmaddadp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>, // negation of the whole v2f64 fma
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, // tie $XTi to $XT and leave it out of the encoding
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVNMADDMDP : XX3Form<60, 233,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVNMADDASP" in {
+ let isCommutable = 1 in
+ def XVNMADDASP : XX3Form<60, 193,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmaddasp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>, // negation of the whole v4f32 fma
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVNMADDMSP : XX3Form<60, 201,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVNMSUBADP" in {
+ let isCommutable = 1 in
+ def XVNMSUBADP : XX3Form<60, 241,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmsubadp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>, // negation of a v2f64 fma with a negated addend
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVNMSUBMDP : XX3Form<60, 249,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVNMSUBASP" in {
+ let isCommutable = 1 in
+ def XVNMSUBASP : XX3Form<60, 209,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmsubasp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>, // negation of a v4f32 fma with a negated addend
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVNMSUBMSP : XX3Form<60, 217,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ // Division Instructions
+ def XSDIVDP : XX3Form<60, 56,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsdivdp $XT, $XA, $XB", IIC_FPDivD,
+ [(set f64:$XT, (fdiv f64:$XA, f64:$XB))]>; // f64 fdiv: $XA / $XB
+ def XSSQRTDP : XX2Form<60, 75,
+ (outs vsfrc:$XT), (ins vsfrc:$XB), // XX2Form: single source $XB
+ "xssqrtdp $XT, $XB", IIC_FPSqrtD,
+ [(set f64:$XT, (fsqrt f64:$XB))]>; // f64 fsqrt
+
+ def XSREDP : XX2Form<60, 90,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsredp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfre f64:$XB))]>; // matched via the PPCfre node
+ def XSRSQRTEDP : XX2Form<60, 74,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrsqrtedp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfrsqrte f64:$XB))]>; // matched via the PPCfrsqrte node
+
+ def XSTDIVDP : XX3Form_1<60, 61,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), // result is a crrc operand ($crD), not a VSX register
+ "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>; // empty pattern list
+ def XSTSQRTDP : XX2Form_1<60, 106,
+ (outs crrc:$crD), (ins vsfrc:$XB),
+ "xstsqrtdp $crD, $XB", IIC_FPCompare, []>; // empty pattern list
+
+ def XVDIVDP : XX3Form<60, 120,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvdivdp $XT, $XA, $XB", IIC_FPDivD,
+ [(set v2f64:$XT, (fdiv v2f64:$XA, v2f64:$XB))]>; // v2f64 fdiv
+ def XVDIVSP : XX3Form<60, 88,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvdivsp $XT, $XA, $XB", IIC_FPDivS,
+ [(set v4f32:$XT, (fdiv v4f32:$XA, v4f32:$XB))]>; // v4f32 fdiv
+
+ def XVSQRTDP : XX2Form<60, 203,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvsqrtdp $XT, $XB", IIC_FPSqrtD,
+ [(set v2f64:$XT, (fsqrt v2f64:$XB))]>; // v2f64 fsqrt
+ def XVSQRTSP : XX2Form<60, 139,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvsqrtsp $XT, $XB", IIC_FPSqrtS,
+ [(set v4f32:$XT, (fsqrt v4f32:$XB))]>; // v4f32 fsqrt
+
+ def XVTDIVDP : XX3Form_1<60, 125,
+ (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), // result is a crrc operand ($crD)
+ "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
+ def XVTDIVSP : XX3Form_1<60, 93,
+ (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
+ "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+ def XVTSQRTDP : XX2Form_1<60, 234,
+ (outs crrc:$crD), (ins vsrc:$XB),
+ "xvtsqrtdp $crD, $XB", IIC_FPCompare, []>;
+ def XVTSQRTSP : XX2Form_1<60, 170,
+ (outs crrc:$crD), (ins vsrc:$XB),
+ "xvtsqrtsp $crD, $XB", IIC_FPCompare, []>;
+
+ def XVREDP : XX2Form<60, 218,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvredp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (PPCfre v2f64:$XB))]>; // PPCfre on v2f64
+ def XVRESP : XX2Form<60, 154,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvresp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (PPCfre v4f32:$XB))]>; // PPCfre on v4f32
+
+ def XVRSQRTEDP : XX2Form<60, 202,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrsqrtedp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (PPCfrsqrte v2f64:$XB))]>; // PPCfrsqrte on v2f64
+ def XVRSQRTESP : XX2Form<60, 138,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrsqrtesp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (PPCfrsqrte v4f32:$XB))]>; // PPCfrsqrte on v4f32
+
+ // Compare Instructions
+ def XSCMPODP : XX3Form_1<60, 43,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), // scalar compares produce a crrc operand ($crD)
+ "xscmpodp $crD, $XA, $XB", IIC_FPCompare, []>; // empty pattern list
+ def XSCMPUDP : XX3Form_1<60, 35,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>; // empty pattern list
+
+ defm XVCMPEQDP : XX3Form_Rcr<60, 99, // each defm expands XX3Form_Rcr into a plain and an "o" (dot) record
+ "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare,
+ int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>; // OutTy = v2i64, InTy = v2f64
+ defm XVCMPEQSP : XX3Form_Rcr<60, 67,
+ "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare,
+ int_ppc_vsx_xvcmpeqsp, v4i32, v4f32>; // OutTy = v4i32, InTy = v4f32
+ defm XVCMPGEDP : XX3Form_Rcr<60, 115,
+ "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare,
+ int_ppc_vsx_xvcmpgedp, v2i64, v2f64>;
+ defm XVCMPGESP : XX3Form_Rcr<60, 83,
+ "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare,
+ int_ppc_vsx_xvcmpgesp, v4i32, v4f32>;
+ defm XVCMPGTDP : XX3Form_Rcr<60, 107,
+ "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare,
+ int_ppc_vsx_xvcmpgtdp, v2i64, v2f64>;
+ defm XVCMPGTSP : XX3Form_Rcr<60, 75,
+ "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare,
+ int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>;
+
+ // Move Instructions
+ def XSABSDP : XX2Form<60, 345,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsabsdp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fabs f64:$XB))]>; // f64 fabs
+ def XSNABSDP : XX2Form<60, 361,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsnabsdp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fneg (fabs f64:$XB)))]>; // f64 fneg of fabs
+ def XSNEGDP : XX2Form<60, 377,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsnegdp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fneg f64:$XB))]>; // f64 fneg
+ def XSCPSGNDP : XX3Form<60, 176,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscpsgndp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fcopysign f64:$XB, f64:$XA))]>; // note the order: fcopysign takes $XB first, then $XA
+
+ def XVABSDP : XX2Form<60, 473,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvabsdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fabs v2f64:$XB))]>; // v2f64 fabs
+
+ def XVABSSP : XX2Form<60, 409,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvabssp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fabs v4f32:$XB))]>; // v4f32 fabs
+
+ def XVCPSGNDP : XX3Form<60, 240,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcpsgndp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fcopysign v2f64:$XB, v2f64:$XA))]>; // same operand order as XSCPSGNDP: $XB first, then $XA
+ def XVCPSGNSP : XX3Form<60, 208,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcpsgnsp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fcopysign v4f32:$XB, v4f32:$XA))]>; // same operand order: $XB first, then $XA
+
+ def XVNABSDP : XX2Form<60, 489,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvnabsdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fneg (fabs v2f64:$XB)))]>; // v2f64 fneg of fabs
+ def XVNABSSP : XX2Form<60, 425,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvnabssp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fneg (fabs v4f32:$XB)))]>; // v4f32 fneg of fabs
+
+ def XVNEGDP : XX2Form<60, 505,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvnegdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fneg v2f64:$XB))]>; // v2f64 fneg
+ def XVNEGSP : XX2Form<60, 441,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvnegsp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fneg v4f32:$XB))]>; // v4f32 fneg
+
+ // Conversion Instructions
+ def XSCVDPSP : XX2Form<60, 265,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpsp $XT, $XB", IIC_VecFP, []>; // empty pattern list
+ def XSCVDPSXDS : XX2Form<60, 344,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpsxds $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfctidz f64:$XB))]>; // matched via PPCfctidz on f64
+ let isCodeGenOnly = 1 in // the "s" twin reuses opcode 344 and the same asm string, but on vssrc/f32
+ def XSCVDPSXDSs : XX2Form<60, 344,
+ (outs vssrc:$XT), (ins vssrc:$XB),
+ "xscvdpsxds $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfctidz f32:$XB))]>;
+ def XSCVDPSXWS : XX2Form<60, 88,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpsxws $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfctiwz f64:$XB))]>; // matched via PPCfctiwz on f64
+ let isCodeGenOnly = 1 in // f32/vssrc twin of XSCVDPSXWS (same opcode 88)
+ def XSCVDPSXWSs : XX2Form<60, 88,
+ (outs vssrc:$XT), (ins vssrc:$XB),
+ "xscvdpsxws $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfctiwz f32:$XB))]>;
+ def XSCVDPUXDS : XX2Form<60, 328,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpuxds $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfctiduz f64:$XB))]>; // matched via PPCfctiduz on f64
+ let isCodeGenOnly = 1 in // f32/vssrc twin of XSCVDPUXDS (same opcode 328)
+ def XSCVDPUXDSs : XX2Form<60, 328,
+ (outs vssrc:$XT), (ins vssrc:$XB),
+ "xscvdpuxds $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfctiduz f32:$XB))]>;
+ def XSCVDPUXWS : XX2Form<60, 72,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpuxws $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfctiwuz f64:$XB))]>; // matched via PPCfctiwuz on f64
+ let isCodeGenOnly = 1 in // f32/vssrc twin of XSCVDPUXWS (same opcode 72)
+ def XSCVDPUXWSs : XX2Form<60, 72,
+ (outs vssrc:$XT), (ins vssrc:$XB),
+ "xscvdpuxws $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfctiwuz f32:$XB))]>;
+ def XSCVSPDP : XX2Form<60, 329,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvspdp $XT, $XB", IIC_VecFP, []>; // empty pattern list
+ def XSCVSXDDP : XX2Form<60, 376,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvsxddp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfcfid f64:$XB))]>; // matched via PPCfcfid
+ def XSCVUXDDP : XX2Form<60, 360,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvuxddp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfcfidu f64:$XB))]>; // matched via PPCfcfidu
+
+ def XVCVDPSP : XX2Form<60, 393, // vector conversions: same-lane-count cases use generic nodes, the rest use intrinsics
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpsp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (int_ppc_vsx_xvcvdpsp v2f64:$XB))]>; // intrinsic: v2f64 -> v4f32
+ def XVCVDPSXDS : XX2Form<60, 472,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpsxds $XT, $XB", IIC_VecFP,
+ [(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>; // generic fp_to_sint: v2f64 -> v2i64
+ def XVCVDPSXWS : XX2Form<60, 216,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpsxws $XT, $XB", IIC_VecFP,
+ [(set v4i32:$XT, (int_ppc_vsx_xvcvdpsxws v2f64:$XB))]>; // intrinsic: v2f64 -> v4i32
+ def XVCVDPUXDS : XX2Form<60, 456,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpuxds $XT, $XB", IIC_VecFP,
+ [(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>; // generic fp_to_uint: v2f64 -> v2i64
+ def XVCVDPUXWS : XX2Form<60, 200,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpuxws $XT, $XB", IIC_VecFP,
+ [(set v4i32:$XT, (int_ppc_vsx_xvcvdpuxws v2f64:$XB))]>; // intrinsic: v2f64 -> v4i32
+
+ def XVCVSPDP : XX2Form<60, 457,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (int_ppc_vsx_xvcvspdp v4f32:$XB))]>; // intrinsic: v4f32 -> v2f64
+ def XVCVSPSXDS : XX2Form<60, 408,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspsxds $XT, $XB", IIC_VecFP, []>; // empty pattern list
+ def XVCVSPSXWS : XX2Form<60, 152,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspsxws $XT, $XB", IIC_VecFP,
+ [(set v4i32:$XT, (fp_to_sint v4f32:$XB))]>; // generic fp_to_sint: v4f32 -> v4i32
+ def XVCVSPUXDS : XX2Form<60, 392,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspuxds $XT, $XB", IIC_VecFP, []>; // empty pattern list
+ def XVCVSPUXWS : XX2Form<60, 136,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspuxws $XT, $XB", IIC_VecFP,
+ [(set v4i32:$XT, (fp_to_uint v4f32:$XB))]>; // generic fp_to_uint: v4f32 -> v4i32
+ def XVCVSXDDP : XX2Form<60, 504,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvsxddp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>; // generic sint_to_fp: v2i64 -> v2f64
+ def XVCVSXDSP : XX2Form<60, 440,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvsxdsp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (int_ppc_vsx_xvcvsxdsp v2i64:$XB))]>; // intrinsic: v2i64 -> v4f32
+ def XVCVSXWDP : XX2Form<60, 248,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvsxwdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (int_ppc_vsx_xvcvsxwdp v4i32:$XB))]>; // intrinsic: v4i32 -> v2f64
+ def XVCVSXWSP : XX2Form<60, 184,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvsxwsp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (sint_to_fp v4i32:$XB))]>; // generic sint_to_fp: v4i32 -> v4f32
+ def XVCVUXDDP : XX2Form<60, 488,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvuxddp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>; // generic uint_to_fp: v2i64 -> v2f64
+ def XVCVUXDSP : XX2Form<60, 424,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvuxdsp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (int_ppc_vsx_xvcvuxdsp v2i64:$XB))]>; // intrinsic: v2i64 -> v4f32
+ def XVCVUXWDP : XX2Form<60, 232,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvuxwdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (int_ppc_vsx_xvcvuxwdp v4i32:$XB))]>; // intrinsic: v4i32 -> v2f64
+ def XVCVUXWSP : XX2Form<60, 168,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvuxwsp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (uint_to_fp v4i32:$XB))]>; // generic uint_to_fp: v4i32 -> v4f32
+
+ // Rounding Instructions
+ def XSRDPI : XX2Form<60, 73,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpi $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fround f64:$XB))]>;
+ def XSRDPIC : XX2Form<60, 107,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpic $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fnearbyint f64:$XB))]>;
+ def XSRDPIM : XX2Form<60, 121,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpim $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (ffloor f64:$XB))]>;
+ def XSRDPIP : XX2Form<60, 105,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpip $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fceil f64:$XB))]>;
+ def XSRDPIZ : XX2Form<60, 89,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpiz $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (ftrunc f64:$XB))]>;
+
+ // Vector double-precision rounding (v2f64 in a vsrc register). The suffix
+ // selects the matched node: (none) -> fround, C -> fnearbyint, M -> ffloor,
+ // P -> fceil, Z -> ftrunc.
+ def XVRDPI : XX2Form<60, 201,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpi $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fround v2f64:$XB))]>;
+ def XVRDPIC : XX2Form<60, 235,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpic $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
+ def XVRDPIM : XX2Form<60, 249,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpim $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (ffloor v2f64:$XB))]>;
+ def XVRDPIP : XX2Form<60, 233,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpip $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fceil v2f64:$XB))]>;
+ def XVRDPIZ : XX2Form<60, 217,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpiz $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (ftrunc v2f64:$XB))]>;
+
+ // Vector single-precision rounding (v4f32 in a vsrc register). The suffix
+ // selects the matched node: (none) -> fround, C -> fnearbyint, M -> ffloor,
+ // P -> fceil, Z -> ftrunc.
+ def XVRSPI : XX2Form<60, 137,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspi $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fround v4f32:$XB))]>;
+ def XVRSPIC : XX2Form<60, 171,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspic $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
+ def XVRSPIM : XX2Form<60, 185,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspim $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (ffloor v4f32:$XB))]>;
+ def XVRSPIP : XX2Form<60, 169,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspip $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fceil v4f32:$XB))]>;
+ def XVRSPIZ : XX2Form<60, 153,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspiz $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (ftrunc v4f32:$XB))]>;
+
+ // Max/Min Instructions
+ // All six definitions are marked commutable and are selected only through
+ // their int_ppc_vsx_* intrinsics. The patterns are written against register
+ // classes (vsfrc for the scalar forms, vsrc for the vector forms) rather
+ // than against explicit value types.
+ let isCommutable = 1 in {
+ def XSMAXDP : XX3Form<60, 160,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsmaxdp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsfrc:$XT,
+ (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>;
+ def XSMINDP : XX3Form<60, 168,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsmindp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsfrc:$XT,
+ (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>;
+
+ def XVMAXDP : XX3Form<60, 224,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmaxdp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsrc:$XT,
+ (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>;
+ def XVMINDP : XX3Form<60, 232,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmindp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsrc:$XT,
+ (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>;
+
+ def XVMAXSP : XX3Form<60, 192,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmaxsp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsrc:$XT,
+ (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>;
+ def XVMINSP : XX3Form<60, 200,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvminsp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsrc:$XT,
+ (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>;
+ } // isCommutable
+} // Uses = [RM]
+
+ // Logical Instructions
+ // All selection patterns here are written for v4i32. XXLAND is commutable;
+ // XXLANDC is deliberately left outside the commutable scopes because its
+ // pattern complements only the second operand.
+ let isCommutable = 1 in
+ def XXLAND : XX3Form<60, 130,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxland $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (and v4i32:$XA, v4i32:$XB))]>;
+ def XXLANDC : XX3Form<60, 138,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlandc $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (and v4i32:$XA,
+ (vnot_ppc v4i32:$XB)))]>;
+ let isCommutable = 1 in {
+ def XXLNOR : XX3Form<60, 162,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlnor $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (vnot_ppc (or v4i32:$XA,
+ v4i32:$XB)))]>;
+ def XXLOR : XX3Form<60, 146,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlor $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (or v4i32:$XA, v4i32:$XB))]>;
+ // Codegen-only twin of XXLOR (same opcode and assembly string) that
+ // operates on vsfrc registers and has no selection pattern. The brace-less
+ // 'let' applies to this one definition only.
+ let isCodeGenOnly = 1 in
+ def XXLORf: XX3Form<60, 146,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xxlor $XT, $XA, $XB", IIC_VecGeneral, []>;
+ def XXLXOR : XX3Form<60, 154,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlxor $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>;
+ } // isCommutable
+ // Codegen-only zeroing forms. They reuse the XXLXOR opcode, take no inputs
+ // and print $XT for all three operands. XXLXORz materializes an all-zeros
+ // v4i32; XXLXORdpz / XXLXORspz materialize fpimm0 as f64 (vsfrc) and f32
+ // (vssrc) respectively.
+ let isCodeGenOnly = 1 in
+ def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins),
+ "xxlxor $XT, $XT, $XT", IIC_VecGeneral,
+ [(set v4i32:$XT, (v4i32 immAllZerosV))]>;
+
+ let isCodeGenOnly = 1 in {
+ def XXLXORdpz : XX3Form_SetZero<60, 154,
+ (outs vsfrc:$XT), (ins),
+ "xxlxor $XT, $XT, $XT", IIC_VecGeneral,
+ [(set f64:$XT, (fpimm0))]>;
+ def XXLXORspz : XX3Form_SetZero<60, 154,
+ (outs vssrc:$XT), (ins),
+ "xxlxor $XT, $XT, $XT", IIC_VecGeneral,
+ [(set f32:$XT, (fpimm0))]>;
+ }
+
+ // Permutation Instructions
+ // XXMRGHW, XXMRGLW, XXPERMDI and XXSEL carry empty pattern lists here, so
+ // they are only reachable from separate Pat<> definitions or by name.
+ def XXMRGHW : XX3Form<60, 18,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>;
+ def XXMRGLW : XX3Form<60, 50,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>;
+
+ def XXPERMDI : XX3Form_2<60, 10,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
+ "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>;
+ // Codegen-only single-source form: takes one vsfrc input and prints it as
+ // both source operands of xxpermdi.
+ let isCodeGenOnly = 1 in
+ def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM),
+ "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>;
+ def XXSEL : XX4Form<60, 3,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC),
+ "xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>;
+
+ // XXSLDWI and XXSPLTW are selected for the PPCvecshl and PPCxxsplt nodes on
+ // v4i32. The instruction operand is declared u2imm while the pattern matches
+ // the immediate as imm32SExt16.
+ def XXSLDWI : XX3Form_2<60, 2,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW),
+ "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm,
+ [(set v4i32:$XT, (PPCvecshl v4i32:$XA, v4i32:$XB,
+ imm32SExt16:$SHW))]>;
+ def XXSPLTW : XX2Form_2<60, 164,
+ (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
+ "xxspltw $XT, $XB, $UIM", IIC_VecPerm,
+ [(set v4i32:$XT,
+ (PPCxxsplt v4i32:$XB, imm32SExt16:$UIM))]>;
+ // Codegen-only variant of XXSPLTW whose source is a vfrc register; it has
+ // no selection pattern of its own.
+ let isCodeGenOnly = 1 in
+ def XXSPLTWs : XX2Form_2<60, 164,
+ (outs vsrc:$XT), (ins vfrc:$XB, u2imm:$UIM),
+ "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
+} // hasSideEffects
+} // UseVSXReg = 1
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
+// instruction selection into a branch sequence.
+//
+// Two pseudos are defined per register class:
+// - SELECT_CC_<RC> takes a crrc condition register plus an i32 $BROPC
+// immediate and has no selection pattern.
+// - SELECT_<RC> takes a single crbitrc condition bit and is selected for
+// (select i1:$cond, $T, $F) on v2f64, f64 and f32 respectively.
+// Note that the VSFRC and VSSRC variants are declared with f8rc and f4rc
+// operands.
+let usesCustomInserter = 1, // Expanded after instruction selection.
+ PPC970_Single = 1 in {
+
+ def SELECT_CC_VSRC: Pseudo<(outs vsrc:$dst),
+ (ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC),
+ "#SELECT_CC_VSRC",
+ []>;
+ def SELECT_VSRC: Pseudo<(outs vsrc:$dst),
+ (ins crbitrc:$cond, vsrc:$T, vsrc:$F),
+ "#SELECT_VSRC",
+ [(set v2f64:$dst,
+ (select i1:$cond, v2f64:$T, v2f64:$F))]>;
+ def SELECT_CC_VSFRC: Pseudo<(outs f8rc:$dst),
+ (ins crrc:$cond, f8rc:$T, f8rc:$F,
+ i32imm:$BROPC), "#SELECT_CC_VSFRC",
+ []>;
+ def SELECT_VSFRC: Pseudo<(outs f8rc:$dst),
+ (ins crbitrc:$cond, f8rc:$T, f8rc:$F),
+ "#SELECT_VSFRC",
+ [(set f64:$dst,
+ (select i1:$cond, f64:$T, f64:$F))]>;
+ def SELECT_CC_VSSRC: Pseudo<(outs f4rc:$dst),
+ (ins crrc:$cond, f4rc:$T, f4rc:$F,
+ i32imm:$BROPC), "#SELECT_CC_VSSRC",
+ []>;
+ def SELECT_VSSRC: Pseudo<(outs f4rc:$dst),
+ (ins crbitrc:$cond, f4rc:$T, f4rc:$F),
+ "#SELECT_VSSRC",
+ [(set f32:$dst,
+ (select i1:$cond, f32:$T, f32:$F))]>;
+} // usesCustomInserter
+} // AddedComplexity
+
+// Extended mnemonics.
+// xvmovdp / xvmovsp are the copy-sign instructions with the same register
+// supplied for both sources.
+def : InstAlias<"xvmovdp $XT, $XB",
+ (XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+def : InstAlias<"xvmovsp $XT, $XB",
+ (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+
+// The remaining aliases are XXPERMDI with a fixed $DM immediate:
+// xxspltd ..., 0 -> both sources $XB, DM = 0
+// xxspltd ..., 1 -> both sources $XB, DM = 3
+// xxmrghd -> DM = 0
+// xxmrgld -> DM = 3
+// xxswapd -> both sources $XB, DM = 2
+def : InstAlias<"xxspltd $XT, $XB, 0",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>;
+def : InstAlias<"xxmrghd $XT, $XA, $XB",
+ (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>;
+def : InstAlias<"xxmrgld $XT, $XA, $XB",
+ (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
+// Same single-source aliases repeated for the codegen-only XXPERMDIs form,
+// whose source operand is a vsfrc register.
+def : InstAlias<"xxspltd $XT, $XB, 0",
+ (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+ (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+ (XXPERMDIs vsrc:$XT, vsfrc:$XB, 2)>;
+
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+
+def : Pat<(v4i32 (vnot_ppc v4i32:$A)),
+ (v4i32 (XXLNOR $A, $A))>;
+// f64 <-> v2f64 element access. The two endiannesses are mirror images: the
+// element that lives in sub_64 is reached with a plain subregister operation,
+// the other one needs an XXPERMDI $S, $S, 2 first.
+// Big endian: element 0 is sub_64.
+let Predicates = [IsBigEndian] in {
+def : Pat<(v2f64 (scalar_to_vector f64:$A)),
+ (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
+
+def : Pat<(f64 (extractelt v2f64:$S, 0)),
+ (f64 (EXTRACT_SUBREG $S, sub_64))>;
+def : Pat<(f64 (extractelt v2f64:$S, 1)),
+ (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+}
+
+// Little endian: element 1 is sub_64. scalar_to_vector additionally runs the
+// widened value through XXPERMDI with itself and DM = 0.
+let Predicates = [IsLittleEndian] in {
+def : Pat<(v2f64 (scalar_to_vector f64:$A)),
+ (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64),
+ (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>;
+
+def : Pat<(f64 (extractelt v2f64:$S, 0)),
+ (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+def : Pat<(f64 (extractelt v2f64:$S, 1)),
+ (f64 (EXTRACT_SUBREG $S, sub_64))>;
+}
+
+// Additional fnmsub patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+ (XSNMSUBADP $B, $C, $A)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+ (XSNMSUBADP $B, $C, $A)>;
+
+def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B),
+ (XVNMSUBADP $B, $C, $A)>;
+def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B),
+ (XVNMSUBADP $B, $C, $A)>;
+
+def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
+ (XVNMSUBASP $B, $C, $A)>;
+def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
+ (XVNMSUBASP $B, $C, $A)>;
+
+def : Pat<(v2f64 (bitconvert v4f32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v4i32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v8i16:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v16i8:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+
+def : Pat<(v4f32 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+
+def : Pat<(v2i64 (bitconvert v4f32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v4i32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v8i16:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v16i8:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+
+def : Pat<(v4f32 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+
+// Bitconverts between v2f64 and v2i64 / v1i128 are plain register-class
+// copies.
+// NOTE(review): the other bitconvert patterns in this file that produce
+// v2f64 or v2i64 copy to VSRC, whereas these four copy to VRRC. Confirm that
+// restricting the result to VRRC is intended.
+def : Pat<(v2f64 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v2i64 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+
+def : Pat<(v2f64 (bitconvert v1i128:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v1i128 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+
+// sign extension patterns
+// To extend "in place" from v2i32 to v2i64, we have input data like:
+// | undef | i32 | undef | i32 |
+// but xvcvsxwdp expects the input in big-endian format:
+// | i32 | undef | i32 | undef |
+// so we need to shift everything to the left by one i32 (word) before
+// the conversion.
+def : Pat<(sext_inreg v2i64:$C, v2i32),
+ (XVCVDPSXDS (XVCVSXWDP (XXSLDWI $C, $C, 1)))>;
+def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
+ (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
+
+def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)),
+ (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)),
+ (v2f64 (XVCVSXWDP (v2i64 (XXMRGLW $C, $C))))>;
+
+def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
+ (v2f64 (XVCVUXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
+ (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
+
+// Loads.
+// Patterns valid for either endianness when the subtarget has VSX and only
+// the swapping memory operations: the PPClxvd2x / PPCstxvd2x nodes and the
+// stxvd2x / stxvw4x store intrinsics (including their _be variants) map
+// directly onto LXVD2X / STXVD2X / STXVW4X.
+let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
+ def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+
+ // Stores.
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+ (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+ (STXVW4X $rS, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
+ (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
+ (STXVW4X $rS, xoaddr:$dst)>;
+ def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+}
+// Generic load/store nodes are matched to these instructions only on
+// big-endian targets.
+// NOTE(review): a v4i32 load is matched here but there is no corresponding
+// v4i32 store pattern in this group; confirm it is handled elsewhere.
+let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
+ def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+ def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+ def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+ def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+}
+
+// Permutes.
+def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+
+// Selects.
+// selectcc on two i1 operands producing v2f64: the comparison is folded into
+// one CR-bit logical instruction whose result feeds SELECT_VSRC.
+// LT/GT use CRANDC, LE/GE use CRORC, EQ uses CREQV and NE uses CRXOR.
+// Each signed condition and its unsigned counterpart use the same CR op with
+// the operands swapped; this is consistent with an i1 'true' being -1 when
+// read as signed and 1 when read as unsigned.
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
+ (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULT)),
+ (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLE)),
+ (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULE)),
+ (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETEQ)),
+ (SELECT_VSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGE)),
+ (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGE)),
+ (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGT)),
+ (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGT)),
+ (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETNE)),
+ (SELECT_VSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+// selectcc on two i1 operands producing f64: same condition-to-CR-op table
+// as the v2f64 patterns (CRANDC for LT/GT, CRORC for LE/GE, CREQV for EQ,
+// CRXOR for NE, operands swapped between signed and unsigned forms), but the
+// result feeds SELECT_VSFRC.
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+ (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
+ (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+ (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
+ (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+ (SELECT_VSFRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+ (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
+ (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+ (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
+ (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+ (SELECT_VSFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+// Divides.
+def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B),
+ (XVDIVSP $A, $B)>;
+def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B),
+ (XVDIVDP $A, $B)>;
+
+// Reciprocal estimate
+def : Pat<(int_ppc_vsx_xvresp v4f32:$A),
+ (XVRESP $A)>;
+def : Pat<(int_ppc_vsx_xvredp v2f64:$A),
+ (XVREDP $A)>;
+
+// Recip. square root estimate
+def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A),
+ (XVRSQRTESP $A)>;
+def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A),
+ (XVRSQRTEDP $A)>;
+
+// Integer-to-f64 conversion of a v2i64 element. The matched DAG extracts the
+// element and moves it to a VSR (PPCmtvsra) before PPCfcfid / PPCfcfidu; the
+// replacement converts directly from the vector register with XSCVSXDDP /
+// XSCVUXDDP, so no extract or move appears in the output.
+// Little endian: element 0 needs XXPERMDI $S, $S, 2 first; element 1 is
+// used through register-class copies only.
+let Predicates = [IsLittleEndian] in {
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+} // IsLittleEndian
+
+// Big endian: the roles are reversed. Element 0 is used through a
+// register-class copy only; element 1 needs XXPERMDI $S, $S, 2 first.
+let Predicates = [IsBigEndian] in {
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+} // IsBigEndian
+
+} // AddedComplexity
+} // HasVSX
+
+// Named dags for scalar integer loads, for reuse as pattern fragments
+// (referenced as ScalarLoads.<field>). Every dag addresses memory through
+// xoaddr:$src, so a pattern using one must refer to the address as $src.
+// Naming: L* = any-extending load, ZEL* = zero-extending, SEL* = sign-
+// extending; an i64 suffix means the result type is i64 instead of i32.
+def ScalarLoads {
+ dag Li8 = (i32 (extloadi8 xoaddr:$src));
+ dag ZELi8 = (i32 (zextloadi8 xoaddr:$src));
+ dag ZELi8i64 = (i64 (zextloadi8 xoaddr:$src));
+ // Byte sign extension is written as sext_inreg of an any-extending load,
+ // unlike the halfword case below which uses sextloadi16 directly
+ // (presumably because there is no sign-extending byte load to match --
+ // TODO confirm).
+ dag SELi8 = (i32 (sext_inreg (extloadi8 xoaddr:$src), i8));
+ dag SELi8i64 = (i64 (sext_inreg (extloadi8 xoaddr:$src), i8));
+
+ dag Li16 = (i32 (extloadi16 xoaddr:$src));
+ dag ZELi16 = (i32 (zextloadi16 xoaddr:$src));
+ dag ZELi16i64 = (i64 (zextloadi16 xoaddr:$src));
+ dag SELi16 = (i32 (sextloadi16 xoaddr:$src));
+ dag SELi16i64 = (i64 (sextloadi16 xoaddr:$src));
+
+ // Plain (non-extending) 32-bit load.
+ dag Li32 = (i32 (load xoaddr:$src));
+}
+
+// The following VSX instructions were introduced in Power ISA 2.07
+/* FIXME: If the operands are v2i64, these patterns will not match.
+ We should define new patterns or otherwise match the same patterns
+ when the elements are larger than i32.
+*/
+def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">;
+def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">;
+let Predicates = [HasP8Vector] in {
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+ let isCommutable = 1, UseVSXReg = 1 in {
+ def XXLEQV : XX3Form<60, 186,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxleqv $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (vnot_ppc (xor v4i32:$XA, v4i32:$XB)))]>;
+ def XXLNAND : XX3Form<60, 178,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlnand $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (vnot_ppc (and v4i32:$XA,
+ v4i32:$XB)))]>;
+ } // isCommutable, UseVSXReg
+
+ def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
+ (XXLEQV $A, $B)>;
+
+ let UseVSXReg = 1 in {
+ def XXLORC : XX3Form<60, 170,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlorc $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>;
+
+ // VSX scalar loads introduced in ISA 2.07
+ let mayLoad = 1 in {
+ let CodeSize = 3 in
+ def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
+ "lxsspx $XT, $src", IIC_LdStLFD,
+ [(set f32:$XT, (load xoaddr:$src))]>;
+ def LXSIWAX : XX1Form<31, 76, (outs vsfrc:$XT), (ins memrr:$src),
+ "lxsiwax $XT, $src", IIC_LdStLFD,
+ [(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
+ def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
+ "lxsiwzx $XT, $src", IIC_LdStLFD,
+ [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
+ } // mayLoad
+
+ // VSX scalar stores introduced in ISA 2.07
+ let mayStore = 1 in {
+ let CodeSize = 3 in
+ def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
+ "stxsspx $XT, $dst", IIC_LdStSTFD,
+ [(store f32:$XT, xoaddr:$dst)]>;
+ def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
+ "stxsiwx $XT, $dst", IIC_LdStSTFD,
+ [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
+ } // mayStore
+ } // UseVSXReg = 1
+
+ def : Pat<(f64 (extloadf32 xoaddr:$src)),
+ (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>;
+ def : Pat<(f32 (fpround (extloadf32 xoaddr:$src))),
+ (f32 (LXSSPX xoaddr:$src))>;
+ def : Pat<(f64 (fpextend f32:$src)),
+ (COPY_TO_REGCLASS $src, VSFRC)>;
+
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+ (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
+ (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
+ (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
+ (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
+ (SELECT_VSSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
+ (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
+ (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
+ (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
+ (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+ def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
+ (SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+ let UseVSXReg = 1 in {
+ // VSX Elementary Scalar FP arithmetic (SP)
+ let isCommutable = 1 in {
+ def XSADDSP : XX3Form<60, 0,
+ (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
+ "xsaddsp $XT, $XA, $XB", IIC_VecFP,
+ [(set f32:$XT, (fadd f32:$XA, f32:$XB))]>;
+ def XSMULSP : XX3Form<60, 16,
+ (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
+ "xsmulsp $XT, $XA, $XB", IIC_VecFP,
+ [(set f32:$XT, (fmul f32:$XA, f32:$XB))]>;
+ } // isCommutable
+
+ def XSDIVSP : XX3Form<60, 24,
+ (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
+ "xsdivsp $XT, $XA, $XB", IIC_FPDivS,
+ [(set f32:$XT, (fdiv f32:$XA, f32:$XB))]>;
+ def XSRESP : XX2Form<60, 26,
+ (outs vssrc:$XT), (ins vssrc:$XB),
+ "xsresp $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfre f32:$XB))]>;
+ def XSSQRTSP : XX2Form<60, 11,
+ (outs vssrc:$XT), (ins vssrc:$XB),
+ "xssqrtsp $XT, $XB", IIC_FPSqrtS,
+ [(set f32:$XT, (fsqrt f32:$XB))]>;
+ def XSRSQRTESP : XX2Form<60, 10,
+ (outs vssrc:$XT), (ins vssrc:$XB),
+ "xsrsqrtesp $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfrsqrte f32:$XB))]>;
+ def XSSUBSP : XX3Form<60, 8,
+ (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
+ "xssubsp $XT, $XA, $XB", IIC_VecFP,
+ [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>;
+
+ // FMA Instructions
+ let BaseName = "XSMADDASP" in {
+ let isCommutable = 1 in
+ def XSMADDASP : XX3Form<60, 1,
+ (outs vssrc:$XT),
+ (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+ "xsmaddasp $XT, $XA, $XB", IIC_VecFP,
+ [(set f32:$XT, (fma f32:$XA, f32:$XB, f32:$XTi))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSMADDMSP : XX3Form<60, 9,
+ (outs vssrc:$XT),
+ (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+ "xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XSMSUBASP" in {
+ let isCommutable = 1 in
+ def XSMSUBASP : XX3Form<60, 17,
+ (outs vssrc:$XT),
+ (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+ "xsmsubasp $XT, $XA, $XB", IIC_VecFP,
+ [(set f32:$XT, (fma f32:$XA, f32:$XB,
+ (fneg f32:$XTi)))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSMSUBMSP : XX3Form<60, 25,
+ (outs vssrc:$XT),
+ (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+ "xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XSNMADDASP" in {
+ let isCommutable = 1 in
+ def XSNMADDASP : XX3Form<60, 129,
+ (outs vssrc:$XT),
+ (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+ "xsnmaddasp $XT, $XA, $XB", IIC_VecFP,
+ [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
+ f32:$XTi)))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSNMADDMSP : XX3Form<60, 137,
+ (outs vssrc:$XT),
+ (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+ "xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XSNMSUBASP" in {
+ let isCommutable = 1 in
+ def XSNMSUBASP : XX3Form<60, 145,
+ (outs vssrc:$XT),
+ (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+ "xsnmsubasp $XT, $XA, $XB", IIC_VecFP,
+ [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
+ (fneg f32:$XTi))))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSNMSUBMSP : XX3Form<60, 153,
+ (outs vssrc:$XT),
+ (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+ "xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ // Single Precision Conversions (FP <-> INT)
+ def XSCVSXDSP : XX2Form<60, 312,
+ (outs vssrc:$XT), (ins vsfrc:$XB),
+ "xscvsxdsp $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfcfids f64:$XB))]>;
+ def XSCVUXDSP : XX2Form<60, 296,
+ (outs vssrc:$XT), (ins vsfrc:$XB),
+ "xscvuxdsp $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfcfidus f64:$XB))]>;
+
+ // Conversions between vector and scalar single precision
+ def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB),
+ "xscvdpspn $XT, $XB", IIC_VecFP, []>;
+ def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
+ "xscvspdpn $XT, $XB", IIC_VecFP, []>;
+ } // UseVSXReg = 1
+
+ let Predicates = [IsLittleEndian] in {
+ def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+ }
+
+ let Predicates = [IsBigEndian] in {
+ def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
+ def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+ }
+ def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)),
+ (v4i32 (XXSPLTWs (LXSIWAX xoaddr:$src), 1))>;
+} // AddedComplexity = 400
+} // HasP8Vector
+
+// Direct moves between general-purpose registers and VSX registers.
+let UseVSXReg = 1, AddedComplexity = 400 in {
+let Predicates = [HasDirectMove] in {
+ // VSX direct move instructions
+ // VSR -> GPR: both are selected for PPCmfvsr of an f64; MFVSRD produces an
+ // i64 in g8rc (64-bit mode only), MFVSRWZ an i32 in gprc.
+ def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT),
+ "mfvsrd $rA, $XT", IIC_VecGeneral,
+ [(set i64:$rA, (PPCmfvsr f64:$XT))]>,
+ Requires<[In64BitMode]>;
+ def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT),
+ "mfvsrwz $rA, $XT", IIC_VecGeneral,
+ [(set i32:$rA, (PPCmfvsr f64:$XT))]>;
+ // GPR -> VSR: MTVSRD (64-bit mode only) and MTVSRWA are selected for
+ // PPCmtvsra of an i64 / i32; MTVSRWZ is selected for PPCmtvsrz of an i32.
+ def MTVSRD : XX1_RS6_RD5_XO<31, 179, (outs vsfrc:$XT), (ins g8rc:$rA),
+ "mtvsrd $XT, $rA", IIC_VecGeneral,
+ [(set f64:$XT, (PPCmtvsra i64:$rA))]>,
+ Requires<[In64BitMode]>;
+ def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$rA),
+ "mtvsrwa $XT, $rA", IIC_VecGeneral,
+ [(set f64:$XT, (PPCmtvsra i32:$rA))]>;
+ def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA),
+ "mtvsrwz $XT, $rA", IIC_VecGeneral,
+ [(set f64:$XT, (PPCmtvsrz i32:$rA))]>;
+} // HasDirectMove
+
+// Additional direct moves gated on IsISA3_0. They operate on full vsrc
+// registers and carry no selection patterns here; the two forms that use
+// g8rc operands are restricted to 64-bit mode.
+let Predicates = [IsISA3_0, HasDirectMove] in {
+ def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
+ "mtvsrws $XT, $rA", IIC_VecGeneral, []>;
+
+ def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
+ []>, Requires<[In64BitMode]>;
+
+ def MFVSRLD: XX1_RS6_RD5_XO<31, 307, (outs g8rc:$rA), (ins vsrc:$XT),
+ "mfvsrld $rA, $XT", IIC_VecGeneral,
+ []>, Requires<[In64BitMode]>;
+
+} // IsISA3_0, HasDirectMove
+} // UseVSXReg = 1
+
+/* Direct moves of various widths from GPRs into VSRs. Each move lines
+ the value up into element 0 (both BE and LE). Namely, entities smaller than
+ a doubleword are shifted left and moved for BE. For LE, they're moved, then
+ swapped to go into the least significant element of the VSR.
+*/
+// Output dags that move the GPR value $A into a VSX register. Patterns that
+// use these fields must name the source operand $A.
+def MovesToVSR {
+ // Big endian, sub-doubleword values: the 32-bit $A is placed in the low
+ // half of an otherwise undefined i64, run through RLDICR (operands 56/7 for
+ // a byte, 48/15 for a halfword, 32/31 for a word) and then moved with
+ // MTVSRD.
+ dag BE_BYTE_0 =
+ (MTVSRD
+ (RLDICR
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7));
+ dag BE_HALF_0 =
+ (MTVSRD
+ (RLDICR
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15));
+ dag BE_WORD_0 =
+ (MTVSRD
+ (RLDICR
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31));
+ // A doubleword needs no adjustment before the move.
+ dag BE_DWORD_0 = (MTVSRD $A);
+
+ // Little endian: the 32-bit value is moved unshifted (LE_MTVSRW), placed in
+ // sub_64 of an otherwise undefined v2i64 (LE_WORD_1), and the doublewords
+ // are then swapped with XXPERMDI ..., 2 (LE_WORD_0).
+ dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32));
+ dag LE_WORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ LE_MTVSRW, sub_64));
+ dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2);
+ // Same construction for a doubleword, starting from the plain MTVSRD.
+ dag LE_DWORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ BE_DWORD_0, sub_64));
+ dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2);
+}
+
+/* Patterns for extracting elements out of vectors. Integer elements are
+ extracted using direct move operations. Patterns for extracting elements
+ whose indices are not available at compile time are also provided with
+ various _VARIABLE_ patterns.
+ The numbering for the DAGs is for LE, but when used on BE, the correct
+ LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13).
+*/
+def VectorExtractions {
+ // Doubleword extraction
+ dag LE_DWORD_0 =
+ (MFVSRD
+ (EXTRACT_SUBREG
+ (XXPERMDI (COPY_TO_REGCLASS $S, VSRC),
+ (COPY_TO_REGCLASS $S, VSRC), 2), sub_64));
+ dag LE_DWORD_1 = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
+
+ // Word extraction
+ dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64));
+ dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64));
+ dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
+ dag LE_WORD_3 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 3), sub_64));
+
+ // Halfword extraction
+ dag LE_HALF_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 48), sub_32));
+ dag LE_HALF_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 48), sub_32));
+ dag LE_HALF_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 48), sub_32));
+ dag LE_HALF_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 48), sub_32));
+ dag LE_HALF_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 48), sub_32));
+ dag LE_HALF_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 48), sub_32));
+ dag LE_HALF_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 48), sub_32));
+ dag LE_HALF_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 48), sub_32));
+
+ // Byte extraction
+ dag LE_BYTE_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 56), sub_32));
+ dag LE_BYTE_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 56, 56), sub_32));
+ dag LE_BYTE_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 56), sub_32));
+ dag LE_BYTE_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 40, 56), sub_32));
+ dag LE_BYTE_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 56), sub_32));
+ dag LE_BYTE_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 24, 56), sub_32));
+ dag LE_BYTE_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 56), sub_32));
+ dag LE_BYTE_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 8, 56), sub_32));
+ dag LE_BYTE_8 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 56), sub_32));
+ dag LE_BYTE_9 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 56, 56), sub_32));
+ dag LE_BYTE_10 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 56), sub_32));
+ dag LE_BYTE_11 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 40, 56), sub_32));
+ dag LE_BYTE_12 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 56), sub_32));
+ dag LE_BYTE_13 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 24, 56), sub_32));
+ dag LE_BYTE_14 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 56), sub_32));
+ dag LE_BYTE_15 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 8, 56), sub_32));
+
+ /* Variable element number (BE and LE patterns must be specified separately)
+ This is a rather involved process.
+
+ Conceptually, this is how the move is accomplished:
+ 1. Identify which doubleword contains the element
+ 2. Shift in the VMX register so that the correct doubleword is correctly
+ lined up for the MFVSRD
+ 3. Perform the move so that the element (along with some extra stuff)
+ is in the GPR
+ 4. Right shift within the GPR so that the element is right-justified
+
+ Of course, the index is an element number which has a different meaning
+ on LE/BE so the patterns have to be specified separately.
+
+ Note: The final result will be the element right-justified with high
+ order bits being arbitrarily defined (namely, whatever was in the
+ vector register to the left of the value originally).
+ */
+
+ /* LE variable byte
+ Number 1. above:
+ - For elements 0-7, we shift left by 8 bytes since they're on the right
+ - For elements 8-15, we need not shift (shift left by zero bytes)
+ This is accomplished by inverting the bits of the index and AND-ing
+ with 0x8 (i.e. clearing all bits of the index and inverting bit 60).
+ */
+ dag LE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDC8 (LI8 8), $Idx));
+
+ // Number 2. above:
+ // - Now that we set up the shift amount, we shift in the VMX register
+ dag LE_VBYTE_PERMUTE = (VPERM $S, $S, LE_VBYTE_PERM_VEC);
+
+ // Number 3. above:
+ // - The doubleword containing our element is moved to a GPR
+ dag LE_MV_VBYTE = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS LE_VBYTE_PERMUTE, VSRC)),
+ sub_64));
+
+ /* Number 4. above:
+ - Truncate the element number to the range 0-7 (8-15 are symmetrical
+ and out of range values are truncated accordingly)
+ - Multiply by 8 as we need to shift right by the number of bits, not bytes
+ - Shift right in the GPR by the calculated value
+ */
+ dag LE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 7), $Idx), 3, 60),
+ sub_32);
+ dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT),
+ sub_32);
+
+ /* LE variable halfword
+ Number 1. above:
+ - For elements 0-3, we shift left by 8 since they're on the right
+ - For elements 4-7, we need not shift (shift left by zero bytes)
+ Similarly to the byte pattern, we invert the bits of the index, but we
+ AND with 0x4 (i.e. clear all bits of the index and invert bit 61).
+ Of course, the shift is still by 8 bytes, so we must multiply by 2.
+ */
+ dag LE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62));
+
+ // Number 2. above:
+ // - Now that we set up the shift amount, we shift in the VMX register
+ dag LE_VHALF_PERMUTE = (VPERM $S, $S, LE_VHALF_PERM_VEC);
+
+ // Number 3. above:
+ // - The doubleword containing our element is moved to a GPR
+ dag LE_MV_VHALF = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS LE_VHALF_PERMUTE, VSRC)),
+ sub_64));
+
+ /* Number 4. above:
+ - Truncate the element number to the range 0-3 (4-7 are symmetrical
+ and out of range values are truncated accordingly)
+ - Multiply by 16 as we need to shift right by the number of bits
+ - Shift right in the GPR by the calculated value
+ */
+ dag LE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 3), $Idx), 4, 59),
+ sub_32);
+ dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT),
+ sub_32);
+
+ /* LE variable word
+ Number 1. above:
+ - For elements 0-1, we shift left by 8 since they're on the right
+ - For elements 2-3, we need not shift
+ */
+ dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61));
+
+ // Number 2. above:
+ // - Now that we set up the shift amount, we shift in the VMX register
+ dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC);
+
+ // Number 3. above:
+ // - The doubleword containing our element is moved to a GPR
+ dag LE_MV_VWORD = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)),
+ sub_64));
+
+ /* Number 4. above:
+ - Truncate the element number to the range 0-1 (2-3 are symmetrical
+ and out of range values are truncated accordingly)
+ - Multiply by 32 as we need to shift right by the number of bits
+ - Shift right in the GPR by the calculated value
+ */
+ dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58),
+ sub_32);
+ dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT),
+ sub_32);
+
+ /* LE variable doubleword
+ Number 1. above:
+ - For element 0, we shift left by 8 since it's on the right
+ - For element 1, we need not shift
+ */
+ dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60));
+
+ // Number 2. above:
+ // - Now that we set up the shift amount, we shift in the VMX register
+ dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC);
+
+ // Number 3. above:
+ // - The doubleword containing our element is moved to a GPR
+ // - Number 4. is not needed for the doubleword as the value is 64-bits
+ dag LE_VARIABLE_DWORD =
+ (MFVSRD (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)),
+ sub_64));
+
+ /* LE variable float
+ - Shift the vector to line up the desired element to BE Word 0
+ - Convert 32-bit float to a 64-bit single precision float
+ */
+ dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61));
+ dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC);
+ dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE);
+
+ /* LE variable double
+ Same as the LE doubleword except there is no move.
+ */
+ dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
+ (COPY_TO_REGCLASS $S, VRRC),
+ LE_VDWORD_PERM_VEC);
+ dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC);
+
+ /* BE variable byte
+ The algorithm here is the same as the LE variable byte except:
+ - The shift in the VMX register is by 0/8 for opposite element numbers so
+ we simply AND the element number with 0x8
+ - The order of elements after the move to GPR is reversed, so we invert
+ the bits of the index prior to truncating to the range 0-7
+ */
+ dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8));
+ dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC);
+ dag BE_MV_VBYTE = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)),
+ sub_64));
+ dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60),
+ sub_32);
+ dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT),
+ sub_32);
+
+ /* BE variable halfword
+ The algorithm here is the same as the LE variable halfword except:
+ - The shift in the VMX register is by 0/8 for opposite element numbers so
+ we simply AND the element number with 0x4 and multiply by 2
+ - The order of elements after the move to GPR is reversed, so we invert
+ the bits of the index prior to truncating to the range 0-3
+ */
+ dag BE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62));
+ dag BE_VHALF_PERMUTE = (VPERM $S, $S, BE_VHALF_PERM_VEC);
+ dag BE_MV_VHALF = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)),
+ sub_64));
+ dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59),
+ sub_32);
+ dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT),
+ sub_32);
+
+ /* BE variable word
+ The algorithm is the same as the LE variable word except:
+ - The shift in the VMX register happens for opposite element numbers
+ - The order of elements after the move to GPR is reversed, so we invert
+ the bits of the index prior to truncating to the range 0-1
+ */
+ dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61));
+ dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC);
+ dag BE_MV_VWORD = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)),
+ sub_64));
+ dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58),
+ sub_32);
+ dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT),
+ sub_32);
+
+ /* BE variable doubleword
+ Same as the LE doubleword except we shift in the VMX register for opposite
+ element indices.
+ */
+ dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60));
+ dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC);
+ dag BE_VARIABLE_DWORD =
+ (MFVSRD (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)),
+ sub_64));
+
+ /* BE variable float
+ - Shift the vector to line up the desired element to BE Word 0
+ - Convert 32-bit float to a 64-bit single precision float
+ */
+ dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61));
+ dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC);
+ dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE);
+
+ /* BE variable double
+ Same as the BE doubleword except there is no move.
+ */
+ dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
+ (COPY_TO_REGCLASS $S, VRRC),
+ BE_VDWORD_PERM_VEC);
+ dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
+}
+
+let AddedComplexity = 400 in {
+/* v4f32 scalar <-> vector conversions (BE)
+   scalar_to_vector is a single XSCVDPSPN. Every constant-index extract ends
+   in XSCVSPDPN: index 0 converts $S as-is, indices 1 and 3 first apply
+   XXSLDWI by 1 and 3, and index 2 first applies XXPERMDI with immediate 2.
+   A non-constant i64 index uses VectorExtractions.BE_VARIABLE_FLOAT. */
+let Predicates = [IsBigEndian, HasP8Vector] in {
+ def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+ (v4f32 (XSCVDPSPN $A))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+ (f32 (XSCVSPDPN $S))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+ (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
+ (f32 VectorExtractions.BE_VARIABLE_FLOAT)>;
+} // IsBigEndian, HasP8Vector
+
+/* Variable index vector_extract for v2f64 does not require P8Vector: the
+   pattern is predicated on IsBigEndian and HasVSX only, and expands to
+   VectorExtractions.BE_VARIABLE_DOUBLE. */
+let Predicates = [IsBigEndian, HasVSX] in
+ def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
+ (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>;
+
+let Predicates = [IsBigEndian, HasDirectMove] in {
+ /* v16i8 scalar <-> vector conversions (BE)
+ The four scalar_to_vector patterns (v16i8, v8i16, v4i32, v2i64) wrap the
+ matching MovesToVSR.BE_* dag in SUBREG_TO_REG with sub_64.
+ Fixed-index byte extracts reuse the LE_BYTE_n dags with the index
+ mirrored: BE element i selects LE_BYTE_(15 - i). A non-constant i64 index
+ uses BE_VARIABLE_BYTE.
+ */
+ def : Pat<(v16i8 (scalar_to_vector i32:$A)),
+ (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>;
+ def : Pat<(v8i16 (scalar_to_vector i32:$A)),
+ (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>;
+ def : Pat<(v4i32 (scalar_to_vector i32:$A)),
+ (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
+ def : Pat<(v2i64 (scalar_to_vector i64:$A)),
+ (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 VectorExtractions.LE_BYTE_15)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 VectorExtractions.LE_BYTE_14)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 VectorExtractions.LE_BYTE_13)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 VectorExtractions.LE_BYTE_12)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 VectorExtractions.LE_BYTE_11)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 VectorExtractions.LE_BYTE_10)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 VectorExtractions.LE_BYTE_9)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 VectorExtractions.LE_BYTE_8)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 VectorExtractions.LE_BYTE_7)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 VectorExtractions.LE_BYTE_6)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 VectorExtractions.LE_BYTE_5)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 VectorExtractions.LE_BYTE_4)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 VectorExtractions.LE_BYTE_3)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 VectorExtractions.LE_BYTE_2)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 VectorExtractions.LE_BYTE_1)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 VectorExtractions.LE_BYTE_0)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 VectorExtractions.BE_VARIABLE_BYTE)>;
+
+ // v8i16 scalar <-> vector conversions (BE): element i selects LE_HALF_(7 - i)
+ def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 VectorExtractions.LE_HALF_7)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 VectorExtractions.LE_HALF_6)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 VectorExtractions.LE_HALF_5)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 VectorExtractions.LE_HALF_4)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 VectorExtractions.LE_HALF_3)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 VectorExtractions.LE_HALF_2)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 VectorExtractions.LE_HALF_1)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+ (i32 VectorExtractions.LE_HALF_0)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 VectorExtractions.BE_VARIABLE_HALF)>;
+
+ // v4i32 scalar <-> vector conversions (BE): element i selects LE_WORD_(3 - i)
+ def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 VectorExtractions.LE_WORD_3)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 VectorExtractions.LE_WORD_1)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 VectorExtractions.LE_WORD_0)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 VectorExtractions.BE_VARIABLE_WORD)>;
+
+ // v2i64 scalar <-> vector conversions (BE): element i selects LE_DWORD_(1 - i)
+ def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+ (i64 VectorExtractions.LE_DWORD_1)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+ (i64 VectorExtractions.LE_DWORD_0)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+ (i64 VectorExtractions.BE_VARIABLE_DWORD)>;
+} // IsBigEndian, HasDirectMove
+
+/* v4f32 scalar <-> vector conversions (LE)
+   scalar_to_vector applies XXSLDWI by 1 to two copies of XSCVDPSPN $A.
+   Constant-index extracts use the same three permutes as the BE patterns but
+   in reverse index order: index 3 converts $S as-is, index 0 uses XXSLDWI by
+   3, index 1 uses XXPERMDI with 2, index 2 uses XXSLDWI by 1. A non-constant
+   i64 index uses VectorExtractions.LE_VARIABLE_FLOAT. */
+let Predicates = [IsLittleEndian, HasP8Vector] in {
+ def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+ (v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+ (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+ (f32 (XSCVSPDPN $S))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
+ (f32 VectorExtractions.LE_VARIABLE_FLOAT)>;
+} // IsLittleEndian, HasP8Vector
+
+/* Variable index vector_extract for v2f64 does not require P8Vector: the
+   pattern is predicated on IsLittleEndian and HasVSX only, and expands to
+   VectorExtractions.LE_VARIABLE_DOUBLE.
+   NOTE(review): the `let ... in` below has no braces, so it appears to cover
+   only the first def. The two *_be load intrinsic patterns after the blank
+   line would then carry no endianness predicate despite sharing the
+   indentation - confirm this is intended. */
+let Predicates = [IsLittleEndian, HasVSX] in
+ def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
+ (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
+
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+
+let Predicates = [IsLittleEndian, HasDirectMove] in {
+ /* v16i8 scalar <-> vector conversions (LE)
+ scalar_to_vector for v16i8 and v8i16 both reuse MovesToVSR.LE_WORD_0 and
+ copy it to VSRC; v4i32 uses LE_WORD_0 directly and v2i64 uses LE_DWORD_0.
+ Fixed-index byte extracts map element i straight to LE_BYTE_i (no index
+ mirroring, unlike the BE patterns). A non-constant i64 index uses
+ LE_VARIABLE_BYTE.
+ */
+ def : Pat<(v16i8 (scalar_to_vector i32:$A)),
+ (v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
+ def : Pat<(v8i16 (scalar_to_vector i32:$A)),
+ (v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
+ def : Pat<(v4i32 (scalar_to_vector i32:$A)),
+ (v4i32 MovesToVSR.LE_WORD_0)>;
+ def : Pat<(v2i64 (scalar_to_vector i64:$A)),
+ (v2i64 MovesToVSR.LE_DWORD_0)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 VectorExtractions.LE_BYTE_0)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 VectorExtractions.LE_BYTE_1)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 VectorExtractions.LE_BYTE_2)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 VectorExtractions.LE_BYTE_3)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 VectorExtractions.LE_BYTE_4)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 VectorExtractions.LE_BYTE_5)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 VectorExtractions.LE_BYTE_6)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 VectorExtractions.LE_BYTE_7)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 VectorExtractions.LE_BYTE_8)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 VectorExtractions.LE_BYTE_9)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 VectorExtractions.LE_BYTE_10)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 VectorExtractions.LE_BYTE_11)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 VectorExtractions.LE_BYTE_12)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 VectorExtractions.LE_BYTE_13)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 VectorExtractions.LE_BYTE_14)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 VectorExtractions.LE_BYTE_15)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 VectorExtractions.LE_VARIABLE_BYTE)>;
+
+ // v8i16 scalar <-> vector conversions (LE): element i selects LE_HALF_i
+ def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 VectorExtractions.LE_HALF_0)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 VectorExtractions.LE_HALF_1)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 VectorExtractions.LE_HALF_2)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 VectorExtractions.LE_HALF_3)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 VectorExtractions.LE_HALF_4)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 VectorExtractions.LE_HALF_5)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 VectorExtractions.LE_HALF_6)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+ (i32 VectorExtractions.LE_HALF_7)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 VectorExtractions.LE_VARIABLE_HALF)>;
+
+ // v4i32 scalar <-> vector conversions (LE): element i selects LE_WORD_i
+ def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 VectorExtractions.LE_WORD_0)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 VectorExtractions.LE_WORD_1)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 VectorExtractions.LE_WORD_3)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 VectorExtractions.LE_VARIABLE_WORD)>;
+
+ // v2i64 scalar <-> vector conversions (LE): element i selects LE_DWORD_i
+ def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+ (i64 VectorExtractions.LE_DWORD_0)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+ (i64 VectorExtractions.LE_DWORD_1)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+ (i64 VectorExtractions.LE_VARIABLE_DWORD)>;
+} // IsLittleEndian, HasDirectMove
+
+let Predicates = [HasDirectMove, HasVSX] in {
+// bitconvert f32 -> i32 (all four bitconvert patterns here need both predicates)
+// (convert to 32-bit fp single, shift right 1 word, move to GPR)
+// The shift is written as XXSLDWI of the converted value with itself by 3.
+def : Pat<(i32 (bitconvert f32:$S)),
+ (i32 (MFVSRWZ (EXTRACT_SUBREG
+ (XXSLDWI (XSCVDPSPN $S),(XSCVDPSPN $S), 3),
+ sub_64)))>;
+// bitconvert i32 -> f32
+// (move to FPR, shift left 1 word, convert to 64-bit fp single)
+// The move is MovesToVSR.LE_WORD_1; the shift is XXSLDWI with itself by 1.
+def : Pat<(f32 (bitconvert i32:$A)),
+ (f32 (XSCVSPDPN
+ (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>;
+
+// bitconvert f64 -> i64
+// (move to GPR, nothing else needed): a single MFVSRD
+def : Pat<(i64 (bitconvert f64:$S)),
+ (i64 (MFVSRD $S))>;
+
+// bitconvert i64 -> f64
+// (move to FPR, nothing else needed): a single MTVSRD
+def : Pat<(f64 (bitconvert i64:$S)),
+ (f64 (MTVSRD $S))>;
+}
+
+/* Materialize a zero-vector of long long: v2i64 immAllZerosV selects to
+   XXLXORz. The pattern sits outside every Predicates block of this scope;
+   the '}' after it closes the `let AddedComplexity = 400 in {` scope. */
+def : Pat<(v2i64 immAllZerosV),
+ (v2i64 (XXLXORz))>;
+}
+
+def AlignValues { // Reusable dags that place scalar $B in a vector register (names suggest BE word 1 - TODO confirm).
+ dag F32_TO_BE_WORD1 = (v4f32 (XXSLDWI (XSCVDPSPN $B), (XSCVDPSPN $B), 3)); // f32: XSCVDPSPN, then XXSLDWI with itself by 3
+ dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC); // i32: MTVSRWZ, result copied to VSRC
+}
+
+/* The following VSX instructions were introduced in Power ISA 3.0
+   HasP9Vector tests PPCSubTarget->hasP9Vector(). The `let` below applies that
+   predicate and AddedComplexity = 400 to everything inside its braces. */
+def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">;
+let AddedComplexity = 400, Predicates = [HasP9Vector] in {
+
+ /* [PO VRT XO VRB XO /]
+ One vrrc source ($vB) and one vrrc result ($vT), printed as
+ "<opc> $vT, $vB"; built on X_RD5_XO5_RS5 with itinerary IIC_VecFP. */
+ class X_VT5_XO5_VB5<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vrrc:$vB),
+ !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+ /* [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
+ Identical to X_VT5_XO5_VB5 except that it also derives from isDOT. */
+ class X_VT5_XO5_VB5_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_XO5_VB5<opcode, xo2, xo, opc, pattern>, isDOT;
+
+ // [PO VRT XO VRB XO /], but only the left 64 bits (or less) of VRB are used,
+ // so the operand class of $vB is supplied by the caller through `vbtype`.
+ class X_VT5_XO5_VB5_TyVB<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ RegisterOperand vbtype, list<dag> pattern>
+ : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vbtype:$vB),
+ !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+ let UseVSXReg = 1 in {
+ /* [PO T XO B XO BX /]
+ GPR result from a VSX scalar source: (outs g8rc:$rT), (ins vsfrc:$XB),
+ printed as "<opc> $rT, $XB". */
+ class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
+ list<dag> pattern>
+ : XX2_RD5_XO5_RS6<opcode, xo2, xo, (outs g8rc:$rT), (ins vsfrc:$XB),
+ !strconcat(opc, " $rT, $XB"), IIC_VecFP, pattern>;
+
+ /* [PO T XO B XO BX TX]
+ Source and result share the operand class passed as `vtype`; printed as
+ "<opc> $XT, $XB". */
+ class XX2_XT6_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
+ RegisterOperand vtype, list<dag> pattern>
+ : XX2_RD6_XO5_RS6<opcode, xo2, xo, (outs vtype:$XT), (ins vtype:$XB),
+ !strconcat(opc, " $XT, $XB"), IIC_VecFP, pattern>;
+
+ /* [PO T A B XO AX BX TX], src and dest register use different operand class
+ xty, aty and bty give the classes of $XT, $XA and $XB respectively; the
+ itinerary is also a parameter (itin) rather than fixed. */
+ class XX3_XT5_XA5_XB5<bits<6> opcode, bits<8> xo, string opc,
+ RegisterOperand xty, RegisterOperand aty, RegisterOperand bty,
+ InstrItinClass itin, list<dag> pattern>
+ : XX3Form<opcode, xo, (outs xty:$XT), (ins aty:$XA, bty:$XB),
+ !strconcat(opc, " $XT, $XA, $XB"), itin, pattern>;
+ } // UseVSXReg = 1
+
+ /* [PO VRT VRA VRB XO /]
+ Two vrrc sources ($vA, $vB) and one vrrc result ($vT), printed as
+ "<opc> $vT, $vA, $vB"; built on XForm_1 with itinerary IIC_VecFP. */
+ class X_VT5_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>;
+
+ /* [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
+ Identical to X_VT5_VA5_VB5 except that it also derives from isDOT. */
+ class X_VT5_VA5_VB5_Ro<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_VA5_VB5<opcode, xo, opc, pattern>, isDOT;
+
+ //===--------------------------------------------------------------------===//
+ // Quad-Precision Scalar Move Instructions:
+ // (all four are declared with an empty selection-pattern list)
+
+ // Copy Sign (two-source form, primary opcode 63, xo 100)
+ def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp", []>;
+
+ // Absolute/Negative-Absolute/Negate (share xo 804; xo2 = 0 / 8 / 16 selects the operation)
+ def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp" , []>;
+ def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp", []>;
+ def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp" , []>;
+
+ //===--------------------------------------------------------------------===//
+ // Quad-Precision Scalar Floating-Point Arithmetic Instructions:
+ // Each operation comes as a pair: the plain form and an "o" form that uses
+ // the same opcode/xo numbers through the corresponding *_Ro class.
+
+ // Add/Divide/Multiply/Subtract
+ def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp" , []>;
+ def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo", []>;
+ def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp" , []>;
+ def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo", []>;
+ def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp" , []>;
+ def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo", []>;
+ def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" , []>;
+ def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo", []>;
+
+ // Square-Root (single-source form: xo2 = 27, xo = 804)
+ def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp" , []>;
+ def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo", []>;
+
+ // (Negative) Multiply-{Add/Subtract}
+ def XSMADDQP : X_VT5_VA5_VB5 <63, 388, "xsmaddqp" , []>;
+ def XSMADDQPO : X_VT5_VA5_VB5_Ro<63, 388, "xsmaddqpo" , []>;
+ def XSMSUBQP : X_VT5_VA5_VB5 <63, 420, "xsmsubqp" , []>;
+ def XSMSUBQPO : X_VT5_VA5_VB5_Ro<63, 420, "xsmsubqpo" , []>;
+ def XSNMADDQP : X_VT5_VA5_VB5 <63, 452, "xsnmaddqp" , []>;
+ def XSNMADDQPO: X_VT5_VA5_VB5_Ro<63, 452, "xsnmaddqpo", []>;
+ def XSNMSUBQP : X_VT5_VA5_VB5 <63, 484, "xsnmsubqp" , []>;
+ def XSNMSUBQPO: X_VT5_VA5_VB5_Ro<63, 484, "xsnmsubqpo", []>;
+
+ //===--------------------------------------------------------------------===//
+ // Quad/Double-Precision Compare Instructions:
+
+ /* [PO BF // VRA VRB XO /]
+ Compare form: two vrrc sources ($VA, $VB) produce a condition-register
+ result (crrc:$crD), itinerary IIC_FPCompare. XForm_17 is instantiated
+ here without a pattern argument, so the pattern is assigned in the body. */
+ class X_BF3_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : XForm_17<opcode, xo, (outs crrc:$crD), (ins vrrc:$VA, vrrc:$VB),
+ !strconcat(opc, " $crD, $VA, $VB"), IIC_FPCompare> {
+ let Pattern = pattern;
+ }
+
+ // QP Compare Ordered/Unordered (vrrc operands, result in a CR field)
+ def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>;
+ def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>;
+
+ // DP/QP Compare Exponents
+ // The DP variant is spelled out on XX3Form_1 because its operands are vsfrc
+ // (with UseVSXReg); the QP variant reuses X_BF3_VA5_VB5.
+ def XSCMPEXPDP : XX3Form_1<60, 59,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>,
+ UseVSXReg;
+ def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>;
+
+ // DP Compare ==, >=, >, !=
+ // Use vsrc for XT, because the entire register of XT is set.
+ // XT.dword[1] = 0x0000_0000_0000_0000
+ // Sources stay vsfrc; all four use itinerary IIC_FPCompare and no pattern.
+ def XSCMPEQDP : XX3_XT5_XA5_XB5<60, 3, "xscmpeqdp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ def XSCMPGEDP : XX3_XT5_XA5_XB5<60, 19, "xscmpgedp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ def XSCMPNEDP : XX3_XT5_XA5_XB5<60, 27, "xscmpnedp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ let UseVSXReg = 1 in {
+ /* Vector Compare Not Equal
+ Each comparison has two defs with the same opcode/xo: the plain one and
+ an "o" one whose mnemonic ends in '.', which adds isDOT and lists CR6 in
+ Defs. */
+ def XVCMPNEDP : XX3Form_Rc<60, 123,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpnedp $XT, $XA, $XB", IIC_VecFPCompare, []>;
+ let Defs = [CR6] in
+ def XVCMPNEDPo : XX3Form_Rc<60, 123,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpnedp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
+ isDOT;
+ def XVCMPNESP : XX3Form_Rc<60, 91,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpnesp $XT, $XA, $XB", IIC_VecFPCompare, []>;
+ let Defs = [CR6] in
+ def XVCMPNESPo : XX3Form_Rc<60, 91,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpnesp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
+ isDOT;
+ } // UseVSXReg = 1
+
+ //===--------------------------------------------------------------------===//
+ // Quad-Precision Floating-Point Conversion Instructions:
+ // All share xo 836; xo2 selects the conversion. None carries a pattern.
+
+ // Convert DP -> QP (source operand class is vfrc via the _TyVB form)
+ def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc, []>;
+
+ // Round & Convert QP -> DP (dword[1] is set to zero)
+ def XSCVQPDP : X_VT5_XO5_VB5 <63, 20, 836, "xscvqpdp" , []>;
+ def XSCVQPDPO : X_VT5_XO5_VB5_Ro<63, 20, 836, "xscvqpdpo", []>;
+
+ // Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero)
+ def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>;
+ def XSCVQPSWZ : X_VT5_XO5_VB5<63, 9, 836, "xscvqpswz", []>;
+ def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>;
+ def XSCVQPUWZ : X_VT5_XO5_VB5<63, 1, 836, "xscvqpuwz", []>;
+
+ // Convert (Un)Signed DWord -> QP (source operand class is vfrc)
+ def XSCVSDQP : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vfrc, []>;
+ def XSCVUDQP : X_VT5_XO5_VB5_TyVB<63, 2, 836, "xscvudqp", vfrc, []>;
+
+ let UseVSXReg = 1 in {
+ //===--------------------------------------------------------------------===//
+ // Round to Floating-Point Integer Instructions
+ // NOTE(review): the defs in this group are half-precision conversions, not
+ // rounding instructions - the heading above looks stale; confirm.
+
+ // (Round &) Convert DP <-> HP
+ // Note! xscvdphp's src and dest register both use the left 64 bits, so we use
+ // vsfrc for src and dest register. xscvhpdp's src only uses the left 16 bits,
+ // but we still use vsfrc for it.
+ def XSCVDPHP : XX2_XT6_XO5_XB6<60, 17, 347, "xscvdphp", vsfrc, []>;
+ def XSCVHPDP : XX2_XT6_XO5_XB6<60, 16, 347, "xscvhpdp", vsfrc, []>;
+
+ // Vector HP <-> SP. Only XVCVSPHP carries a pattern here (the xvcvsphp
+ // intrinsic on v4f32); XVCVHPSP is declared with an empty pattern list.
+ def XVCVHPSP : XX2_XT6_XO5_XB6<60, 24, 475, "xvcvhpsp", vsrc, []>;
+ def XVCVSPHP : XX2_XT6_XO5_XB6<60, 25, 475, "xvcvsphp", vsrc,
+ [(set v4f32:$XT,
+ (int_ppc_vsx_xvcvsphp v4f32:$XB))]>;
+
+ } // UseVSXReg = 1
+
+ // Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
+ // separate pattern so that it can convert the input register class from
+ // VRRC(v8i16) to VSRC.
+ def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)),
+ (v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>;
+
+ class Z23_VT5_R1_VB5_RMC2_EX1<bits<6> opcode, bits<8> xo, bit ex, string opc, // Z23-form: vrrc result/source plus u1imm $r and u2imm $rmc
+ list<dag> pattern>
+ : Z23Form_1<opcode, xo,
+ (outs vrrc:$vT), (ins u1imm:$r, vrrc:$vB, u2imm:$rmc),
+ !strconcat(opc, " $r, $vT, $vB, $rmc"), IIC_VecFP, pattern> {
+ let RC = ex; // the `ex` template bit is stored in the RC field
+ }
+
+ // Round to Quad-Precision Integer [with Inexact]
+ // The two defs share opcode 63 / xo 5 and differ only in the ex bit (0 vs 1).
+ def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>;
+ def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>;
+
+ // Round Quad-Precision to Double-Extended Precision (fp80): xo 37, ex = 0
+ def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>;
+
+ //===--------------------------------------------------------------------===//
+ // Insert/Extract Instructions
+
+ // Insert Exponent DP/QP
+ // XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU
+ // The DP form takes both inputs from GPRs (g8rc) and writes a vsrc result.
+ def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>, UseVSXReg;
+ // vB NOTE: only vB.dword[0] is used, that's why we don't use
+ // X_VT5_VA5_VB5 form
+ def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB),
+ "xsiexpqp $vT, $vA, $vB", IIC_VecFP, []>;
+
+ // Extract Exponent/Significand DP/QP
+ // The DP forms return the result in a GPR (XX2_RT5_XO5_XB6); the QP forms
+ // below return it in a vrrc register (X_VT5_XO5_VB5).
+ def XSXEXPDP : XX2_RT5_XO5_XB6<60, 0, 347, "xsxexpdp", []>;
+ def XSXSIGDP : XX2_RT5_XO5_XB6<60, 1, 347, "xsxsigdp", []>;
+
+ def XSXEXPQP : X_VT5_XO5_VB5 <63, 2, 804, "xsxexpqp", []>;
+ def XSXSIGQP : X_VT5_XO5_VB5 <63, 18, 804, "xsxsigqp", []>;
+
+ // Vector Insert Word
+ let UseVSXReg = 1 in {
+ // XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB.
+ // $XTi is the incoming value of the destination: it is tied to $XT by the
+ // RegConstraint and excluded from encoding (NoEncode), so the asm string
+ // names only $XT, $XB and $UIM.
+ def XXINSERTW :
+ XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
+ (ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM),
+ "xxinsertw $XT, $XB, $UIM", IIC_VecFP,
+ [(set v4i32:$XT, (PPCxxinsert v4i32:$XTi, v4i32:$XB,
+ imm32SExt16:$UIM))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+
+ // Vector Extract Unsigned Word (vsfrc result, vsrc source, no pattern here)
+ def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165,
+ (outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM),
+ "xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>;
+ } // UseVSXReg = 1
+
+ // Vector Insert Exponent DP/SP
+ // Selected from the xviexpdp/xviexpsp intrinsics: integer vector inputs
+ // (v2i64 / v4i32) produce a floating-point vector result (v2f64 / v4f32).
+ def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc,
+ IIC_VecFP, [(set v2f64: $XT,(int_ppc_vsx_xviexpdp v2i64:$XA, v2i64:$XB))]>;
+ def XVIEXPSP : XX3_XT5_XA5_XB5<60, 216, "xviexpsp", vsrc, vsrc, vsrc,
+ IIC_VecFP, [(set v4f32: $XT,(int_ppc_vsx_xviexpsp v4i32:$XA, v4i32:$XB))]>;
+
+ // Vector Extract Exponent/Significand DP/SP
+ // Selected from the matching intrinsics: floating-point vector input
+ // (v2f64 / v4f32) produces an integer vector result (v2i64 / v4i32).
+ def XVXEXPDP : XX2_XT6_XO5_XB6<60, 0, 475, "xvxexpdp", vsrc,
+ [(set v2i64: $XT,
+ (int_ppc_vsx_xvxexpdp v2f64:$XB))]>;
+ def XVXEXPSP : XX2_XT6_XO5_XB6<60, 8, 475, "xvxexpsp", vsrc,
+ [(set v4i32: $XT,
+ (int_ppc_vsx_xvxexpsp v4f32:$XB))]>;
+ def XVXSIGDP : XX2_XT6_XO5_XB6<60, 1, 475, "xvxsigdp", vsrc,
+ [(set v2i64: $XT,
+ (int_ppc_vsx_xvxsigdp v2f64:$XB))]>;
+ def XVXSIGSP : XX2_XT6_XO5_XB6<60, 9, 475, "xvxsigsp", vsrc,
+ [(set v4i32: $XT,
+ (int_ppc_vsx_xvxsigsp v4f32:$XB))]>;
+
+ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
+ /* Extra patterns expanding to vector Extract Word/Insert Word
+ They map the xxinsertw/xxextractuw intrinsics onto XXINSERTW and
+ XXEXTRACTUW; the extract result is copied to VSRC to form a v2i64.
+ NOTE(review): this `let` repeats the AddedComplexity and Predicates of the
+ enclosing HasP9Vector scope, so it looks redundant - confirm. */
+ def : Pat<(v4i32 (int_ppc_vsx_xxinsertw v4i32:$A, v2i64:$B, imm:$IMM)),
+ (v4i32 (XXINSERTW $A, $B, imm:$IMM))>;
+ def : Pat<(v2i64 (int_ppc_vsx_xxextractuw v2i64:$A, imm:$IMM)),
+ (v2i64 (COPY_TO_REGCLASS (XXEXTRACTUW $A, imm:$IMM), VSRC))>;
+ } // AddedComplexity = 400, HasP9Vector
+
+ //===--------------------------------------------------------------------===//
+
+ // Test Data Class SP/DP/QP
+ // Each takes a 7-bit immediate ($DCMX) and one register, and writes a
+ // condition-register field ($BF). The ins list puts $DCMX first, while the
+ // assembly string prints the register before $DCMX.
+ let UseVSXReg = 1 in {
+ def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298,
+ (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
+ "xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>;
+ def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362,
+ (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
+ "xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>;
+ } // UseVSXReg = 1
+ def XSTSTDCQP : X_BF3_DCMX7_RS5 <63, 708,
+ (outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB),
+ "xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>;
+
+ // Vector Test Data Class SP/DP
+ // Unlike the scalar forms these write a vsrc result and are selected from
+ // the xvtstdcsp/xvtstdcdp intrinsics (v4f32 -> v4i32, v2f64 -> v2i64).
+ let UseVSXReg = 1 in {
+ def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5,
+ (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
+ "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP,
+ [(set v4i32: $XT,
+ (int_ppc_vsx_xvtstdcsp v4f32:$XB, imm:$DCMX))]>;
+ def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5,
+ (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
+ "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP,
+ [(set v2i64: $XT,
+ (int_ppc_vsx_xvtstdcdp v2f64:$XB, imm:$DCMX))]>;
+ } // UseVSXReg = 1
+
+ //===--------------------------------------------------------------------===//
+
+ // Maximum/Minimum Type-C/Type-J DP
+ // XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU, so we use vsrc for XT
+ // Sources are vsfrc; all four are declared with an empty pattern list.
+ def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+ def XSMAXJDP : XX3_XT5_XA5_XB5<60, 144, "xsmaxjdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+ def XSMINCDP : XX3_XT5_XA5_XB5<60, 136, "xsmincdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+ def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+
+ //===--------------------------------------------------------------------===//
+
+ // Vector Byte-Reverse Halfword/Word/Doubleword/Quadword
+ def XXBRH : XX2_XT6_XO5_XB6<60, 7, 475, "xxbrh", vsrc, []>;
+ def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc, []>;
+ def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, []>;
+ def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>;
+
+ // Vector Permute
+ def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc,
+ IIC_VecPerm, []>;
+ def XXPERMR : XX3_XT5_XA5_XB5<60, 58, "xxpermr", vsrc, vsrc, vsrc,
+ IIC_VecPerm, []>;
+
+ // Vector Splat Immediate Byte
+ def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
+ "xxspltib $XT, $IMM8", IIC_VecPerm, []>, UseVSXReg;
+
+ //===--------------------------------------------------------------------===//
+ // Vector/Scalar Load/Store Instructions
+
+ // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
+ // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
+ let mayLoad = 1 in {
+ // Load Vector
+ def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
+ "lxv $XT, $src", IIC_LdStLFD, []>, UseVSXReg;
+ // Load DWord
+ def LXSD : DSForm_1<57, 2, (outs vfrc:$vD), (ins memrix:$src),
+ "lxsd $vD, $src", IIC_LdStLFD, []>;
+ // Load SP from src, convert it to DP, and place in dword[0]
+ def LXSSP : DSForm_1<57, 3, (outs vfrc:$vD), (ins memrix:$src),
+ "lxssp $vD, $src", IIC_LdStLFD, []>;
+
+ // Indexed load form [PO T RA RB XO TX]. Almost the same as the store form
+ // [PO S RA RB XO SX], except that $XT is an output and memrr:$src the input.
+ class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
+ RegisterOperand vtype, list<dag> pattern>
+ : XX1Form<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
+ !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>, UseVSXReg;
+
+ // Load as Integer Byte/Halfword & Zero Indexed
+ def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc,
+ [(set f64:$XT, (PPClxsizx xoaddr:$src, 1))]>;
+ def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc,
+ [(set f64:$XT, (PPClxsizx xoaddr:$src, 2))]>;
+
+ // Load Vector Halfword*8/Byte*16 Indexed
+ def LXVH8X : X_XT6_RA5_RB5<31, 812, "lxvh8x" , vsrc, []>;
+ def LXVB16X : X_XT6_RA5_RB5<31, 876, "lxvb16x", vsrc, []>;
+
+ // Load Vector Indexed
+ def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
+ [(set v2f64:$XT, (load xoaddr:$src))]>;
+
+ // Load Vector (Left-justified) with Length
+ def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+ "lxvl $XT, $src, $rB", IIC_LdStLoad,
+ [(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>,
+ UseVSXReg;
+ def LXVLL : XX1Form<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+ "lxvll $XT, $src, $rB", IIC_LdStLoad,
+ [(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>,
+ UseVSXReg;
+
+ // Load Vector Word & Splat Indexed
+ def LXVWSX : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>;
+ } // mayLoad
+
+ // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
+ // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
+ let mayStore = 1 in {
+ // Store Vector
+ def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
+ "stxv $XT, $dst", IIC_LdStSTFD, []>, UseVSXReg;
+ // Store DWord
+ def STXSD : DSForm_1<61, 2, (outs), (ins vfrc:$vS, memrix:$dst),
+ "stxsd $vS, $dst", IIC_LdStSTFD, []>;
+ // Convert DP of dword[0] to SP, and Store to dst
+ def STXSSP : DSForm_1<61, 3, (outs), (ins vfrc:$vS, memrix:$dst),
+ "stxssp $vS, $dst", IIC_LdStSTFD, []>;
+
+ // Indexed store form [PO S RA RB XO SX]: no outputs; takes $XT and memrr:$dst.
+ class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
+ RegisterOperand vtype, list<dag> pattern>
+ : XX1Form<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
+ !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>, UseVSXReg;
+
+ // Store as Integer Byte/Halfword Indexed
+ def STXSIBX : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsfrc,
+ [(PPCstxsix f64:$XT, xoaddr:$dst, 1)]>;
+ def STXSIHX : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsfrc,
+ [(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>;
+ let isCodeGenOnly = 1 in {
+ def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vrrc, []>;
+ def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vrrc, []>;
+ }
+
+ // Store Vector Halfword*8/Byte*16 Indexed
+ def STXVH8X : X_XS6_RA5_RB5<31, 940, "stxvh8x" , vsrc, []>;
+ def STXVB16X : X_XS6_RA5_RB5<31, 1004, "stxvb16x", vsrc, []>;
+
+ // Store Vector Indexed
+ def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc,
+ [(store v2f64:$XT, xoaddr:$dst)]>;
+
+ // Store Vector (Left-justified) with Length
+ def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+ "stxvl $XT, $dst, $rB", IIC_LdStLoad,
+ [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst, i64:$rB)]>,
+ UseVSXReg;
+ def STXVLL : XX1Form<31, 429, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+ "stxvll $XT, $dst, $rB", IIC_LdStLoad,
+ [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst, i64:$rB)]>,
+ UseVSXReg;
+ } // mayStore
+
+ // Patterns for which instructions from ISA 3.0 are a better match
+ let Predicates = [IsLittleEndian, HasP9Vector] in {
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
+ } // IsLittleEndian, HasP9Vector
+
+ let Predicates = [IsBigEndian, HasP9Vector] in { // element N <-> offset 4*N
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
+ } // IsBigEndian, HasP9Vector
+
+ def : Pat<(v2f64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v2i64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v4f32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v4i32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+
+ def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
+ (v4i32 (LXVWSX xoaddr:$src))>;
+ def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
+ (v4f32 (LXVWSX xoaddr:$src))>;
+ def : Pat<(v4f32 (scalar_to_vector (f32 (fpround (extloadf32 xoaddr:$src))))),
+ (v4f32 (LXVWSX xoaddr:$src))>;
+
+ // Build vectors from i8 loads
+ def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)),
+ (v16i8 (VSPLTBs 7, (LXSIBZX xoaddr:$src)))>;
+ def : Pat<(v8i16 (scalar_to_vector ScalarLoads.ZELi8)),
+ (v8i16 (VSPLTHs 3, (LXSIBZX xoaddr:$src)))>;
+ def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi8)),
+ (v4i32 (XXSPLTWs (LXSIBZX xoaddr:$src), 1))>;
+ def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi8i64)),
+ (v2i64 (XXPERMDIs (LXSIBZX xoaddr:$src), 0))>;
+ def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi8)),
+ (v4i32 (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1))>;
+ def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi8i64)),
+ (v2i64 (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0))>;
+
+ // Build vectors from i16 loads
+ def : Pat<(v8i16 (scalar_to_vector ScalarLoads.Li16)),
+ (v8i16 (VSPLTHs 3, (LXSIHZX xoaddr:$src)))>;
+ def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi16)),
+ (v4i32 (XXSPLTWs (LXSIHZX xoaddr:$src), 1))>;
+ def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi16i64)),
+ (v2i64 (XXPERMDIs (LXSIHZX xoaddr:$src), 0))>;
+ def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi16)),
+ (v4i32 (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1))>;
+ def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi16i64)),
+ (v2i64 (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0))>;
+
+ let Predicates = [IsBigEndian, HasP9Vector] in {
+ // Scalar stores of i8 (element 7 is stored as-is; others go through VSLDOI)
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
+ (STXSIBXv $S, xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+
+ // Scalar stores of i16 (element 3 is stored as-is; others go through VSLDOI)
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
+ (STXSIHXv $S, xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ } // IsBigEndian, HasP9Vector
+
+ let Predicates = [IsLittleEndian, HasP9Vector] in {
+ // Scalar stores of i8 (element 8 is stored as-is; others go through VSLDOI)
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
+ (STXSIBXv $S, xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
+ (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+
+ // Scalar stores of i16 (element 4 is stored as-is; others go through VSLDOI)
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
+ (STXSIHXv $S, xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
+ (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ } // IsLittleEndian, HasP9Vector
+
+
+ // Vector sign extensions
+ def : Pat<(f64 (PPCVexts f64:$A, 1)),
+ (f64 (COPY_TO_REGCLASS (VEXTSB2Ds $A), VSFRC))>;
+ def : Pat<(f64 (PPCVexts f64:$A, 2)),
+ (f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>;
+
+ let isPseudo = 1 in { // f32/f64 load/store pseudos matched on iaddr addresses
+ def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src),
+ "#DFLOADf32",
+ [(set f32:$XT, (load iaddr:$src))]>;
+ def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src),
+ "#DFLOADf64",
+ [(set f64:$XT, (load iaddr:$src))]>;
+ def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst),
+ "#DFSTOREf32",
+ [(store f32:$XT, iaddr:$dst)]>;
+ def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
+ "#DFSTOREf64",
+ [(store f64:$XT, iaddr:$dst)]>;
+ } // isPseudo = 1
+ def : Pat<(f64 (extloadf32 iaddr:$src)),
+ (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>;
+ def : Pat<(f32 (fpround (extloadf32 iaddr:$src))),
+ (f32 (DFLOADf32 iaddr:$src))>;
+} // end HasP9Vector, AddedComplexity
+
+// Any-extend helper dags 32 -> 64: INSERT_SUBREG into an IMPLICIT_DEF i64.
+def AnyExts {
+ dag A = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32);
+ dag B = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $B, sub_32);
+ dag C = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $C, sub_32);
+ dag D = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $D, sub_32);
+}
+
+def DblToFlt { // elements 0/1 of v2f64 $A/$B, each fpround-ed to f32
+ dag A0 = (f32 (fpround (f64 (extractelt v2f64:$A, 0))));
+ dag A1 = (f32 (fpround (f64 (extractelt v2f64:$A, 1))));
+ dag B0 = (f32 (fpround (f64 (extractelt v2f64:$B, 0))));
+ dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1))));
+}
+def FltToIntLoad { // extloadf32 -> PPCfctiwz -> PPCmfvsr as i32
+ dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 xoaddr:$A)))));
+}
+def FltToUIntLoad { // extloadf32 -> PPCfctiwuz -> PPCmfvsr as i32
+ dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (extloadf32 xoaddr:$A)))));
+}
+def FltToLongLoad { // extloadf32 -> PPCfctidz -> PPCmfvsr as i64
+ dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A)))));
+}
+def FltToULongLoad { // extloadf32 -> PPCfctiduz -> PPCmfvsr as i64
+ dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A)))));
+}
+def FltToLong { // fpextend f32:$A -> PPCfctidz -> PPCmfvsr as i64
+ dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A))));
+}
+def FltToULong { // fpextend f32:$A -> PPCfctiduz -> PPCmfvsr as i64
+ dag A = (i64 (PPCmfvsr (PPCfctiduz (fpextend f32:$A))));
+}
+def DblToInt { // f64:$A -> PPCfctiwz -> PPCmfvsr as i32
+ dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A))));
+}
+def DblToUInt { // f64:$A -> PPCfctiwuz -> PPCmfvsr as i32
+ dag A = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$A))));
+}
+def DblToLong { // f64:$A -> PPCfctidz -> PPCmfvsr as i64
+ dag A = (i64 (PPCmfvsr (f64 (PPCfctidz f64:$A))));
+}
+def DblToULong { // f64:$A -> PPCfctiduz -> PPCmfvsr as i64
+ dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz f64:$A))));
+}
+def DblToIntLoad { // f64 load -> PPCfctiwz -> PPCmfvsr as i32
+ dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A)))));
+}
+def DblToUIntLoad { // f64 load -> PPCfctiwuz -> PPCmfvsr as i32
+ dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A)))));
+}
+def DblToLongLoad { // f64 load -> PPCfctidz -> PPCmfvsr as i64
+ dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A)))));
+}
+def DblToULongLoad { // f64 load -> PPCfctiduz -> PPCmfvsr as i64
+ dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A)))));
+}
+
+// FP merge dags (f32 -> v4f32): XXPERMDI two inputs, then XVCVDPSP the pair.
+def MrgFP {
+ dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC),
+ (COPY_TO_REGCLASS $C, VSRC), 0));
+ dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC),
+ (COPY_TO_REGCLASS $D, VSRC), 0));
+ dag ABhToFlt = (XVCVDPSP (XXPERMDI $A, $B, 0));
+ dag ABlToFlt = (XVCVDPSP (XXPERMDI $A, $B, 3));
+ dag BAhToFlt = (XVCVDPSP (XXPERMDI $B, $A, 0));
+ dag BAlToFlt = (XVCVDPSP (XXPERMDI $B, $A, 3));
+}
+
+// Patterns for BUILD_VECTOR nodes.
+def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
+let AddedComplexity = 400 in {
+
+ let Predicates = [HasVSX] in {
+ // Build vectors of floating point converted to i32.
+ def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.A,
+ DblToInt.A, DblToInt.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS $A), VSRC), 1))>;
+ def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.A,
+ DblToUInt.A, DblToUInt.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS $A), VSRC), 1))>;
+ def : Pat<(v2i64 (build_vector DblToLong.A, DblToLong.A)),
+ (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC),
+ (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), 0))>;
+ def : Pat<(v2i64 (build_vector DblToULong.A, DblToULong.A)),
+ (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC),
+ (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>;
+ def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+ (XSCVDPSXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>;
+ def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+ (XSCVDPUXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>;
+ def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
+ (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+
+ // Build vectors of floating point converted to i64.
+ def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
+ (v2i64 (XXPERMDIs
+ (COPY_TO_REGCLASS (XSCVDPSXDSs $A), VSFRC), 0))>;
+ def : Pat<(v2i64 (build_vector FltToULong.A, FltToULong.A)),
+ (v2i64 (XXPERMDIs
+ (COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>;
+ def : Pat<(v2i64 (scalar_to_vector DblToLongLoad.A)),
+ (v2i64 (XVCVDPSXDS (LXVDSX xoaddr:$A)))>;
+ def : Pat<(v2i64 (scalar_to_vector DblToULongLoad.A)),
+ (v2i64 (XVCVDPUXDS (LXVDSX xoaddr:$A)))>;
+ }
+
+ let Predicates = [HasVSX, NoP9Vector] in {
+ // Load-and-splat with fp-to-int conversion (using X-Form VSX loads).
+ def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+ (XSCVDPSXWS (LXSDX xoaddr:$A)), VSRC), 1))>;
+ def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+ (XSCVDPUXWS (LXSDX xoaddr:$A)), VSRC), 1))>;
+ def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)),
+ (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
+ (LXSSPX xoaddr:$A), VSFRC)), 0))>;
+ def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)),
+ (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
+ (LXSSPX xoaddr:$A), VSFRC)), 0))>;
+ }
+
+ // Big endian, available on all targets with VSX
+ let Predicates = [IsBigEndian, HasVSX] in {
+ def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
+ (v2f64 (XXPERMDI
+ (COPY_TO_REGCLASS $A, VSRC),
+ (COPY_TO_REGCLASS $B, VSRC), 0))>;
+
+ def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)),
+ (VMRGEW MrgFP.AC, MrgFP.BD)>;
+ def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
+ DblToFlt.B0, DblToFlt.B1)),
+ (v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>;
+ }
+
+ let Predicates = [IsLittleEndian, HasVSX] in {
+ // Little endian, available on all targets with VSX
+ def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
+ (v2f64 (XXPERMDI
+ (COPY_TO_REGCLASS $B, VSRC),
+ (COPY_TO_REGCLASS $A, VSRC), 0))>;
+
+ def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)),
+ (VMRGEW MrgFP.AC, MrgFP.BD)>;
+ def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
+ DblToFlt.B0, DblToFlt.B1)),
+ (v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>;
+ }
+
+ let Predicates = [HasDirectMove] in {
+ // Endianness-neutral constant splat on P8 and newer targets. The reason
+ // for this pattern is that on targets with direct moves, we don't expand
+ // BUILD_VECTOR nodes for v4i32.
+ def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
+ immSExt5NonZero:$A, immSExt5NonZero:$A)),
+ (v4i32 (VSPLTISW imm:$A))>;
+ }
+
+ let Predicates = [IsBigEndian, HasDirectMove, NoP9Vector] in {
+ // Big endian integer vectors using direct moves.
+ def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
+ (v2i64 (XXPERMDI
+ (COPY_TO_REGCLASS (MTVSRD $A), VSRC),
+ (COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>;
+ def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+ (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC),
+ (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC), 0),
+ (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC),
+ (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC), 0))>;
+ def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+ (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
+ }
+
+ let Predicates = [IsLittleEndian, HasDirectMove, NoP9Vector] in {
+ // Little endian integer vectors using direct moves.
+ def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
+ (v2i64 (XXPERMDI
+ (COPY_TO_REGCLASS (MTVSRD $B), VSRC),
+ (COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>;
+ def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+ (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC),
+ (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC), 0),
+ (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC),
+ (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 0))>;
+ def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+ (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
+ }
+
+ let Predicates = [HasP9Vector] in {
+ // Endianness-neutral patterns for const splats with ISA 3.0 instructions.
+ def : Pat<(v4i32 (scalar_to_vector i32:$A)),
+ (v4i32 (MTVSRWS $A))>;
+ def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+ (v4i32 (MTVSRWS $A))>;
+ def : Pat<(v16i8 (build_vector immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+ immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+ immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+ immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+ immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+ immAnyExt8:$A)),
+ (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
+ def : Pat<(v16i8 immAllOnesV),
+ (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
+ def : Pat<(v8i16 immAllOnesV),
+ (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
+ def : Pat<(v4i32 immAllOnesV),
+ (v4i32 (XXSPLTIB 255))>;
+ def : Pat<(v2i64 immAllOnesV),
+ (v2i64 (XXSPLTIB 255))>;
+ def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
+ (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>;
+ def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
+ (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>;
+ def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+ (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>;
+ def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+ (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>;
+ def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)),
+ (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
+ (DFLOADf32 iaddr:$A),
+ VSFRC)), 0))>;
+ def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)),
+ (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
+ (DFLOADf32 iaddr:$A),
+ VSFRC)), 0))>;
+ }
+
+ let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
+ def : Pat<(i64 (extractelt v2i64:$A, 1)), // element 1 is read with MFVSRLD
+ (i64 (MFVSRLD $A))>;
+ // Big endian: with MTVSRDD, build_vector operands are used in source order.
+ def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
+ (v2i64 (MTVSRDD $rB, $rA))>;
+ def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+ (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC),
+ (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC))>;
+ } // IsISA3_0, HasDirectMove, IsBigEndian
+
+ let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
+ def : Pat<(i64 (extractelt v2i64:$A, 0)), // element 0 is read with MFVSRLD
+ (i64 (MFVSRLD $A))>;
+ // Little endian: with MTVSRDD, build_vector operands are passed reversed.
+ def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
+ (v2i64 (MTVSRDD $rB, $rA))>;
+ def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+ (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC),
+ (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC))>;
+ } // IsISA3_0, HasDirectMove, IsLittleEndian
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
new file mode 100644
index 000000000000..2c3e75523e8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -0,0 +1,452 @@
+//===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to prepare loops for pre-increment addressing
+// modes. Additional PHIs are created for loop induction variables used by
+// load/store instructions so that the pre-increment forms can be used.
+// Generically, this means transforming loops like this:
+// for (int i = 0; i < n; ++i)
+// array[i] = c;
+// to look like this:
+// T *p = &array[-1];
+// for (int i = 0; i < n; ++i)
+// *++p = c;
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-loop-preinc-prep"
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+// Cap on the number of address buckets (one candidate PHI each) per loop; the
+// default of 16 is a little over half of the allocatable register set.
+static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars",
+ cl::Hidden, cl::init(16),
+ cl::desc("Potential PHI threshold for PPC preinc loop prep"));
+
+namespace llvm {
+
+ void initializePPCLoopPreIncPrepPass(PassRegistry&); // called from both PPCLoopPreIncPrep constructors
+
+} // end namespace llvm
+
+namespace {
+
+  /// Function pass that prepares innermost loops for pre-increment addressing
+  /// by introducing pointer PHIs for the addresses used by loads, stores and
+  /// prefetches (the work itself is done in runOnLoop).
+  class PPCLoopPreIncPrep : public FunctionPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+
+    /// Constructs the pass without a target machine; runOnLoop then skips
+    /// its subtarget-dependent checks because TM stays null.
+    PPCLoopPreIncPrep() : FunctionPass(ID) {
+      initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// Constructs the pass for a specific PowerPC target machine. Marked
+    /// explicit so a PPCTargetMachine never converts to a pass implicitly.
+    explicit PPCLoopPreIncPrep(PPCTargetMachine &TM)
+        : FunctionPass(ID), TM(&TM) {
+      initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// Requires LoopInfo and ScalarEvolution; preserves LoopInfo and the
+    /// dominator tree.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    bool runOnLoop(Loop *L);
+    // NOTE(review): the two members below are declared but have no definition
+    // in this file; they look like leftovers and are kept only to leave the
+    // class interface untouched.
+    void simplifyLoopLatch(Loop *L);
+    bool rotateLoop(Loop *L);
+
+  private:
+    // All analysis handles start out null/false and are (re)assigned at the
+    // top of every runOnFunction call, so no stale or indeterminate value is
+    // ever observable.
+    PPCTargetMachine *TM = nullptr;
+    DominatorTree *DT = nullptr;
+    LoopInfo *LI = nullptr;
+    ScalarEvolution *SE = nullptr;
+    bool PreserveLCSSA = false;
+  };
+
+} // end anonymous namespace
+
+char PPCLoopPreIncPrep::ID = 0; // definition of the static pass ID declared in the class
+static const char *name = "Prepare loop for pre-inc. addressing modes"; // description string passed to the registration macros
+INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) // matches addRequired in getAnalysisUsage
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) // matches addRequired in getAnalysisUsage
+INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false)
+
+/// Creates a new, heap-allocated PPCLoopPreIncPrep pass bound to \p TM.
+FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) {
+  FunctionPass *Prep = new PPCLoopPreIncPrep(TM);
+  return Prep;
+}
+
+namespace {
+
+  /// A memory instruction together with its constant distance from the base
+  /// address of the bucket it belongs to. Offset is null when no distance was
+  /// recorded (the element that seeded the bucket).
+  struct BucketElement {
+    BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {}
+    BucketElement(Instruction *I) : BucketElement(nullptr, I) {}
+
+    const SCEVConstant *Offset;
+    Instruction *Instr;
+  };
+
+  /// A group of memory instructions whose addresses differ from BaseSCEV only
+  /// by a constant. A bucket is created from its first instruction.
+  struct Bucket {
+    Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B) {
+      Elements.push_back(BucketElement(I));
+    }
+
+    const SCEV *BaseSCEV;
+    SmallVector<BucketElement, 16> Elements;
+  };
+
+} // end anonymous namespace
+
+/// Returns true when \p BasePtr, after peeling off any chain of bitcast
+/// instructions, is a getelementptr instruction marked inbounds. Anything
+/// else yields false.
+static bool IsPtrInBounds(Value *BasePtr) {
+  Value *Cur = BasePtr;
+  for (;;) {
+    auto *Cast = dyn_cast<BitCastInst>(Cur);
+    if (!Cast)
+      break;
+    Cur = Cast->getOperand(0);
+  }
+
+  auto *GEP = dyn_cast<GetElementPtrInst>(Cur);
+  return GEP && GEP->isInBounds();
+}
+
+/// Returns the address operand of \p MemI when it is a load, a store or a
+/// prefetch intrinsic call; returns null for every other value.
+static Value *GetPointerOperand(Value *MemI) {
+  if (auto *Load = dyn_cast<LoadInst>(MemI))
+    return Load->getPointerOperand();
+
+  if (auto *Store = dyn_cast<StoreInst>(MemI))
+    return Store->getPointerOperand();
+
+  // Of all intrinsics only prefetch is treated as a memory access here; its
+  // address is the first call argument.
+  auto *Call = dyn_cast<IntrinsicInst>(MemI);
+  const bool IsPrefetch =
+      Call && Call->getIntrinsicID() == Intrinsic::prefetch;
+  return IsPrefetch ? Call->getArgOperand(0) : nullptr;
+}
+
+/// Pass entry point: caches the analyses for \p F and then runs runOnLoop on
+/// every loop of the function, visiting each top-level loop's nest depth
+/// first. Returns true if any loop was changed.
+bool PPCLoopPreIncPrep::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+  // The dominator tree is optional: use it only if some earlier pass left it
+  // available.
+  DT = nullptr;
+  if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+    DT = &DTWP->getDomTree();
+
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+  bool Changed = false;
+  for (Loop *TopLevel : *LI)
+    for (Loop *Nested : depth_first(TopLevel))
+      Changed |= runOnLoop(Nested);
+
+  return Changed;
+}
+
+/// Prepares a single loop for pre-increment addressing.
+///
+/// Only innermost loops are processed. The function
+///   1. groups the loads, stores and prefetches of the loop into buckets whose
+///      addresses differ only by a constant (giving up once MaxVars buckets
+///      exist and another one would be needed),
+///   2. makes sure the loop has a usable predecessor block, inserting a
+///      preheader when necessary,
+///   3. for each bucket creates an i8* PHI in the header that starts one step
+///      before the original start address, a GEP that advances it by the
+///      constant step, and rewrites the bucket's pointers as constant offsets
+///      from that incremented pointer,
+///   4. deletes PHIs that became dead in the blocks it touched.
+///
+/// \param L the loop to transform.
+/// \returns true if the IR was modified.
+bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
+  bool MadeChange = false;
+
+  // Only prep. the inner-most loop
+  if (!L->empty())
+    return MadeChange;
+
+  DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
+
+  BasicBlock *Header = L->getHeader();
+
+  // The subtarget is only known when the pass was created with a target
+  // machine.
+  const PPCSubtarget *ST =
+    TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr;
+
+  unsigned HeaderLoopPredCount =
+    std::distance(pred_begin(Header), pred_end(Header));
+
+  // Collect buckets of comparable addresses used by loads and stores.
+  SmallVector<Bucket, 16> Buckets;
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I) {
+    for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
+        J != JE; ++J) {
+      Value *PtrValue;
+      Instruction *MemI;
+
+      if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) {
+        MemI = LMemI;
+        PtrValue = LMemI->getPointerOperand();
+      } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) {
+        MemI = SMemI;
+        PtrValue = SMemI->getPointerOperand();
+      } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(J)) {
+        if (IMemI->getIntrinsicID() == Intrinsic::prefetch) {
+          MemI = IMemI;
+          PtrValue = IMemI->getArgOperand(0);
+        } else continue;
+      } else continue;
+
+      // Only the default address space is handled.
+      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+      if (PtrAddrSpace)
+        continue;
+
+      // There are no update forms for Altivec vector load/stores.
+      if (ST && ST->hasAltivec() &&
+          PtrValue->getType()->getPointerElementType()->isVectorTy())
+        continue;
+
+      if (L->isLoopInvariant(PtrValue))
+        continue;
+
+      // The address must be an add-recurrence of exactly this loop.
+      const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L);
+      if (const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV)) {
+        if (LARSCEV->getLoop() != L)
+          continue;
+      } else {
+        continue;
+      }
+
+      // Join the first bucket whose base is a constant distance away.
+      bool FoundBucket = false;
+      for (auto &B : Buckets) {
+        const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
+        if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) {
+          B.Elements.push_back(BucketElement(CDiff, MemI));
+          FoundBucket = true;
+          break;
+        }
+      }
+
+      if (!FoundBucket) {
+        // Too many distinct address streams: leave the loop untouched.
+        if (Buckets.size() == MaxVars)
+          return MadeChange;
+        Buckets.push_back(Bucket(LSCEV, MemI));
+      }
+    }
+  }
+
+  if (Buckets.empty())
+    return MadeChange;
+
+  BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+  // If there is no loop predecessor, or the loop predecessor's terminator
+  // returns a value (which might contribute to determining the loop's
+  // iteration space), insert a new preheader for the loop.
+  if (!LoopPredecessor ||
+      !LoopPredecessor->getTerminator()->getType()->isVoidTy()) {
+    LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+    if (LoopPredecessor)
+      MadeChange = true;
+  }
+  if (!LoopPredecessor)
+    return MadeChange;
+
+  DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n");
+
+  SmallSet<BasicBlock *, 16> BBChanged;
+  for (unsigned i = 0, e = Buckets.size(); i != e; ++i) {
+    // The base address of each bucket is transformed into a phi and the others
+    // are rewritten as offsets of that variable.
+
+    // We have a choice now of which instruction's memory operand we use as the
+    // base for the generated PHI. Always picking the first instruction in each
+    // bucket does not work well, specifically because that instruction might
+    // be a prefetch (and there are no pre-increment dcbt variants). Otherwise,
+    // the choice is somewhat arbitrary, because the backend will happily
+    // generate direct offsets from both the pre-incremented and
+    // post-incremented pointer values. Thus, we'll pick the first non-prefetch
+    // instruction in each bucket, and adjust the recurrence and other offsets
+    // accordingly.
+    for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) {
+      if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr))
+        if (II->getIntrinsicID() == Intrinsic::prefetch)
+          continue;
+
+      // If we'd otherwise pick the first element anyway, there's nothing to do.
+      if (j == 0)
+        break;
+
+      // If our chosen element has no offset from the base pointer, there's
+      // nothing to do.
+      if (!Buckets[i].Elements[j].Offset ||
+          Buckets[i].Elements[j].Offset->isZero())
+        break;
+
+      // Rebase the bucket on the chosen element: shift the base by its offset
+      // and every recorded offset by the opposite amount.
+      const SCEV *Offset = Buckets[i].Elements[j].Offset;
+      Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset);
+      for (auto &E : Buckets[i].Elements) {
+        if (E.Offset)
+          E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
+        else
+          E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
+      }
+
+      std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]);
+      break;
+    }
+
+    const SCEVAddRecExpr *BasePtrSCEV =
+      cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV);
+    if (!BasePtrSCEV->isAffine())
+      continue;
+
+    DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
+    assert(BasePtrSCEV->getLoop() == L &&
+           "AddRec for the wrong loop?");
+
+    // The instruction corresponding to the Bucket's BaseSCEV must be the first
+    // in the vector of elements.
+    Instruction *MemI = Buckets[i].Elements.begin()->Instr;
+    Value *BasePtr = GetPointerOperand(MemI);
+    assert(BasePtr && "No pointer operand");
+
+    Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext());
+    Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(),
+      BasePtr->getType()->getPointerAddressSpace());
+
+    const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart();
+    if (!SE->isLoopInvariant(BasePtrStartSCEV, L))
+      continue;
+
+    // Only constant strides are supported.
+    const SCEVConstant *BasePtrIncSCEV =
+      dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE));
+    if (!BasePtrIncSCEV)
+      continue;
+    // The PHI starts one step before the first accessed address, so that the
+    // increment at the top of the header yields the original address.
+    BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV);
+    if (!isSafeToExpand(BasePtrStartSCEV, *SE))
+      continue;
+
+    DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
+
+    PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount,
+      MemI->hasName() ? MemI->getName() + ".phi" : "",
+      Header->getFirstNonPHI());
+
+    SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart");
+    Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
+      LoopPredecessor->getTerminator());
+
+    // Note that LoopPredecessor might occur in the predecessor list multiple
+    // times, and we need to add it the right number of times.
+    for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+         PI != PE; ++PI) {
+      if (*PI != LoopPredecessor)
+        continue;
+
+      NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
+    }
+
+    Instruction *InsPoint = &*Header->getFirstInsertionPt();
+    GetElementPtrInst *PtrInc = GetElementPtrInst::Create(
+        I8Ty, NewPHI, BasePtrIncSCEV->getValue(),
+        MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint);
+    PtrInc->setIsInBounds(IsPtrInBounds(BasePtr));
+    // Every remaining predecessor of the header feeds the incremented pointer
+    // back into the PHI.
+    for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+         PI != PE; ++PI) {
+      if (*PI == LoopPredecessor)
+        continue;
+
+      NewPHI->addIncoming(PtrInc, *PI);
+    }
+
+    Instruction *NewBasePtr;
+    if (PtrInc->getType() != BasePtr->getType())
+      NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(),
+        PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint);
+    else
+      NewBasePtr = PtrInc;
+
+    if (Instruction *IDel = dyn_cast<Instruction>(BasePtr))
+      BBChanged.insert(IDel->getParent());
+    BasePtr->replaceAllUsesWith(NewBasePtr);
+    RecursivelyDeleteTriviallyDeadInstructions(BasePtr);
+
+    // Keep track of the replacement pointer values we've inserted so that we
+    // don't generate more pointer values than necessary.
+    SmallPtrSet<Value *, 16> NewPtrs;
+    NewPtrs.insert( NewBasePtr);
+
+    for (auto I = std::next(Buckets[i].Elements.begin()),
+         IE = Buckets[i].Elements.end(); I != IE; ++I) {
+      Value *Ptr = GetPointerOperand(I->Instr);
+      assert(Ptr && "No pointer operand");
+      if (NewPtrs.count(Ptr))
+        continue;
+
+      Instruction *RealNewPtr;
+      if (!I->Offset || I->Offset->getValue()->isZero()) {
+        RealNewPtr = NewBasePtr;
+      } else {
+        // Choose where the offset GEP goes:
+        //  - same block as the new base pointer: right after PtrInc (signalled
+        //    by a null insertion point),
+        //  - old pointer is a PHI: first insertion point of its block,
+        //  - old pointer is not an instruction: before the memory instruction,
+        //  - otherwise: before the old pointer's defining instruction.
+        Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
+        if (PtrIP && isa<Instruction>(NewBasePtr) &&
+            cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
+          PtrIP = nullptr;
+        // isa<> must not be applied to a null pointer, so test PtrIP first;
+        // a null PtrIP is handled by the final branch.
+        else if (PtrIP && isa<PHINode>(PtrIP))
+          PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
+        else if (!PtrIP)
+          PtrIP = I->Instr;
+
+        GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
+            I8Ty, PtrInc, I->Offset->getValue(),
+            I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP);
+        if (!PtrIP)
+          NewPtr->insertAfter(cast<Instruction>(PtrInc));
+        NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
+        RealNewPtr = NewPtr;
+      }
+
+      if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
+        BBChanged.insert(IDel->getParent());
+
+      Instruction *ReplNewPtr;
+      if (Ptr->getType() != RealNewPtr->getType()) {
+        ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
+          Ptr->hasName() ? Ptr->getName() + ".cast" : "");
+        ReplNewPtr->insertAfter(RealNewPtr);
+      } else
+        ReplNewPtr = RealNewPtr;
+
+      Ptr->replaceAllUsesWith(ReplNewPtr);
+      RecursivelyDeleteTriviallyDeadInstructions(Ptr);
+
+      NewPtrs.insert(RealNewPtr);
+    }
+
+    MadeChange = true;
+  }
+
+  // Replacing the old pointers may have left dead PHIs behind in the blocks
+  // that defined them.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I) {
+    if (BBChanged.count(*I))
+      DeleteDeadPHIs(*I);
+  }
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
new file mode 100644
index 000000000000..e527b018d4fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -0,0 +1,187 @@
+//===-- PPCMCInstLower.cpp - Convert PPC MachineInstr to an MCInst --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower PPC MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+using namespace llvm;
+
+/// Fetches the Mach-O specific module info object from the printer's
+/// MachineModuleInfo.
+static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
+  MachineModuleInfoMachO &MachOInfo =
+      AP.MMI->getObjFileInfo<MachineModuleInfoMachO>();
+  return MachOInfo;
+}
+
+/// Builds (or looks up) the MCSymbol named by a global-address or
+/// external-symbol operand.
+///
+/// When the operand carries MO_NLP_FLAG the symbol refers to a non-lazy
+/// pointer stub: the name gets the private global prefix and a
+/// "$non_lazy_ptr" suffix, and a stub entry for the underlying global is
+/// recorded in the Mach-O module info if none exists yet.
+static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO,
+                                      AsmPrinter &AP) {
+  const TargetMachine &TM = AP.TM;
+  Mangler &Mang = TM.getObjFileLowering()->getMangler();
+  const DataLayout &DL = AP.getDataLayout();
+  MCContext &Ctx = AP.OutContext;
+
+  const bool IsNonLazyPtr = MO.getTargetFlags() & PPCII::MO_NLP_FLAG;
+
+  SmallString<128> Name;
+  StringRef Suffix;
+  if (IsNonLazyPtr) {
+    Suffix = "$non_lazy_ptr";
+    Name += DL.getPrivateGlobalPrefix();
+  }
+
+  if (MO.isGlobal()) {
+    const GlobalValue *GV = MO.getGlobal();
+    TM.getNameWithPrefix(Name, GV, Mang);
+  } else {
+    assert(MO.isSymbol() && "Isn't a symbol reference");
+    Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
+  }
+
+  Name += Suffix;
+  MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
+
+  // A plain reference needs no stub bookkeeping.
+  if (!IsNonLazyPtr)
+    return Sym;
+
+  // The reference is to a non_lazy_ptr, not to the symbol itself: make sure
+  // the stub table knows which global the stub stands for.
+  MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI(AP).getGVStubEntry(Sym);
+  if (!StubSym.getPointer()) {
+    assert(MO.isGlobal() && "Extern symbol not handled yet");
+    StubSym = MachineModuleInfoImpl::
+                StubValueTy(AP.getSymbol(MO.getGlobal()),
+                            !MO.getGlobal()->hasInternalLinkage());
+  }
+
+  return Sym;
+}
+
+/// Wraps \p Symbol in an MCExpr operand that reflects the target flags of
+/// \p MO: the relocation variant, the operand's constant offset (except for
+/// jump-table operands), an optional subtraction of the PIC base, and the
+/// lo16()/ha16() markers.
+static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+                              AsmPrinter &Printer, bool isDarwin) {
+  MCContext &Ctx = Printer.OutContext;
+  const unsigned Flags = MO.getTargetFlags();
+  const unsigned AccessKind = Flags & PPCII::MO_ACCESS_MASK;
+
+  // Map the access kind to a relocation variant; kinds not listed keep
+  // VK_None.
+  MCSymbolRefExpr::VariantKind Variant;
+  switch (AccessKind) {
+  case PPCII::MO_TPREL_LO:
+    Variant = MCSymbolRefExpr::VK_PPC_TPREL_LO;
+    break;
+  case PPCII::MO_TPREL_HA:
+    Variant = MCSymbolRefExpr::VK_PPC_TPREL_HA;
+    break;
+  case PPCII::MO_DTPREL_LO:
+    Variant = MCSymbolRefExpr::VK_PPC_DTPREL_LO;
+    break;
+  case PPCII::MO_TLSLD_LO:
+    Variant = MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO;
+    break;
+  case PPCII::MO_TOC_LO:
+    Variant = MCSymbolRefExpr::VK_PPC_TOC_LO;
+    break;
+  case PPCII::MO_TLS:
+    Variant = MCSymbolRefExpr::VK_PPC_TLS;
+    break;
+  default:
+    Variant = MCSymbolRefExpr::VK_None;
+    break;
+  }
+
+  // A flag word that is exactly MO_PLT overrides whatever was chosen above.
+  if (Flags == PPCII::MO_PLT)
+    Variant = MCSymbolRefExpr::VK_PLT;
+
+  const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Variant, Ctx);
+
+  // The offset is only queried for operands that are not jump-table indices.
+  if (!MO.isJTI()) {
+    if (MO.getOffset()) {
+      const MCExpr *Addend = MCConstantExpr::create(MO.getOffset(), Ctx);
+      Expr = MCBinaryExpr::createAdd(Expr, Addend, Ctx);
+    }
+  }
+
+  // Subtract off the PIC base if required.
+  if (Flags & PPCII::MO_PIC_FLAG) {
+    const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+    const MCExpr *PICBase =
+        MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+    Expr = MCBinaryExpr::createSub(Expr, PICBase, Ctx);
+  }
+
+  // Add ha16() / lo16() markers if required.
+  if (AccessKind == PPCII::MO_LO)
+    Expr = PPCMCExpr::createLo(Expr, isDarwin, Ctx);
+  else if (AccessKind == PPCII::MO_HA)
+    Expr = PPCMCExpr::createHa(Expr, isDarwin, Ctx);
+
+  return MCOperand::createExpr(Expr);
+}
+
+/// Lowers \p MI into \p OutMI: the opcode is copied and every operand is
+/// converted to its MCOperand counterpart in order. Register-mask operands
+/// are dropped; an unrecognised operand kind dumps the instruction and hits
+/// llvm_unreachable.
+void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        AsmPrinter &AP, bool isDarwin) {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (const MachineOperand &MO : MI->operands()) {
+    MCOperand Lowered;
+
+    switch (MO.getType()) {
+    case MachineOperand::MO_RegisterMask:
+      // Register masks have no MC-level representation.
+      continue;
+    case MachineOperand::MO_Register:
+      assert(!MO.getSubReg() && "Subregs should be eliminated!");
+      assert(MO.getReg() > PPC::NoRegister &&
+             MO.getReg() < PPC::NUM_TARGET_REGS &&
+             "Invalid register for this target!");
+      Lowered = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      Lowered = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      Lowered = MCOperand::createExpr(MCSymbolRefExpr::create(
+                                        MO.getMBB()->getSymbol(), AP.OutContext));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      Lowered = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin);
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      Lowered = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin);
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      Lowered = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin);
+      break;
+    case MachineOperand::MO_BlockAddress:
+      Lowered = GetSymbolRef(MO,
+                             AP.GetBlockAddressSymbol(MO.getBlockAddress()),
+                             AP, isDarwin);
+      break;
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    }
+
+    OutMI.addOperand(Lowered);
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
new file mode 100644
index 000000000000..2413af3f7042
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -0,0 +1,392 @@
+//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass performs peephole optimizations to clean up ugly code
+// sequences at the MachineInstruction layer. It runs at the end of
+// the SSA phases, following VSX swap removal. A pass of dead code
+// elimination follows this one for quick clean-up of any dead
+// instructions introduced here. Although we could do this as callbacks
+// from the generic peephole pass, this would have a couple of bad
+// effects: it might remove optimization opportunities for VSX swap
+// removal, and it would miss cleanups made possible following VSX
+// swap removal.
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-mi-peepholes"
+
+namespace llvm {
+ void initializePPCMIPeepholePass(PassRegistry&); // called from the PPCMIPeephole constructor
+}
+
+namespace {
+
+/// Machine-function pass that applies the PowerPC MI-level peepholes
+/// implemented in simplifyCode().
+struct PPCMIPeephole : public MachineFunctionPass {
+
+  static char ID;
+  // Per-function state. It is assigned by initialize() at the start of every
+  // run; the null defaults make sure the pointers are never indeterminate
+  // before that.
+  const PPCInstrInfo *TII = nullptr;
+  MachineFunction *MF = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+
+  PPCMIPeephole() : MachineFunctionPass(ID) {
+    initializePPCMIPeepholePass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize class variables.
+  void initialize(MachineFunction &MFParm);
+
+  // Perform peepholes.
+  bool simplifyCode(void);
+
+  // Find the "true" register represented by SrcReg (following chains
+  // of copies and subreg_to_reg operations).
+  unsigned lookThruCopyLike(unsigned SrcReg);
+
+public:
+  // Main entry point for this pass. Returns true if any instruction was
+  // simplified; functions rejected by skipFunction() are left untouched.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(*MF.getFunction()))
+      return false;
+    initialize(MF);
+    return simplifyCode();
+  }
+};
+
+// Caches the function, its register info and the PowerPC instruction info
+// for the current run, then dumps the function when debug output is on.
+void PPCMIPeephole::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  MRI = &MFParm.getRegInfo();
+  TII = MFParm.getSubtarget<PPCSubtarget>().getInstrInfo();
+  DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
+  DEBUG(MFParm.dump());
+}
+
+// Perform peephole optimizations.
+//
+// Walks every instruction of the current function and applies per-opcode
+// rewrites:
+//   * XXPERMDI  - redundant 2x64 splats/swaps become copies or are folded
+//                 into the feeding instruction,
+//   * VSPLTB / VSPLTH / XXSPLTW - a splat of an already splatted value
+//                 becomes a copy; an XXSPLTW fed by a self-shift (XXSLDWI
+//                 with equal inputs) splats from the shift's input instead,
+//   * XVCVDPSP  - FRSPs that only feed the vector being converted are removed.
+//
+// An instruction that must go away is not erased while it is the current
+// loop element; it is remembered in ToErase and removed when the next
+// instruction is visited (or after the block's last instruction).
+//
+// Returns true if anything was changed.
+bool PPCMIPeephole::simplifyCode(void) {
+  bool Simplified = false;
+  MachineInstr* ToErase = nullptr;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+
+      // If the previous instruction was marked for elimination,
+      // remove it now.
+      if (ToErase) {
+        ToErase->eraseFromParent();
+        ToErase = nullptr;
+      }
+
+      // Ignore debug instructions.
+      if (MI.isDebugValue())
+        continue;
+
+      // Per-opcode peepholes.
+      switch (MI.getOpcode()) {
+
+      default:
+        break;
+
+      case PPC::XXPERMDI: {
+        // Perform simplifications of 2x64 vector swaps and splats.
+        // A swap is identified by an immediate value of 2, and a splat
+        // is identified by an immediate value of 0 or 3.
+        int Immed = MI.getOperand(3).getImm();
+
+        if (Immed != 1) {
+
+          // For each of these simplifications, we need the two source
+          // regs to match.  Unfortunately, MachineCSE ignores COPY and
+          // SUBREG_TO_REG, so for example we can see
+          //   XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed.
+          // We have to look through chains of COPY and SUBREG_TO_REG
+          // to find the real source values for comparison.
+          unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg());
+          unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg());
+
+          if (TrueReg1 == TrueReg2
+              && TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
+            MachineInstr *DefMI = MRI->getVRegDef(TrueReg1);
+            unsigned DefOpc = DefMI ? DefMI->getOpcode() : 0;
+
+            // If this is a splat fed by a splatting load, the splat is
+            // redundant. Replace with a copy. This doesn't happen directly due
+            // to code in PPCDAGToDAGISel.cpp, but it can happen when converting
+            // a load of a double to a vector of 64-bit integers.
+            auto isConversionOfLoadAndSplat = [=]() -> bool {
+              if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS)
+                return false;
+              unsigned DefReg = lookThruCopyLike(DefMI->getOperand(1).getReg());
+              if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
+                MachineInstr *LoadMI = MRI->getVRegDef(DefReg);
+                if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX)
+                  return true;
+              }
+              return false;
+            };
+            if (DefMI && (Immed == 0 || Immed == 3)) {
+              if (DefOpc == PPC::LXVDSX || isConversionOfLoadAndSplat()) {
+                DEBUG(dbgs()
+                      << "Optimizing load-and-splat/splat "
+                      "to load-and-splat/copy: ");
+                DEBUG(MI.dump());
+                BuildMI(MBB, &MI, MI.getDebugLoc(),
+                        TII->get(PPC::COPY), MI.getOperand(0).getReg())
+                  .addOperand(MI.getOperand(1));
+                ToErase = &MI;
+                Simplified = true;
+              }
+            }
+
+            // If this is a splat or a swap fed by another splat, we
+            // can replace it with a copy.
+            if (DefOpc == PPC::XXPERMDI) {
+              unsigned FeedImmed = DefMI->getOperand(3).getImm();
+              unsigned FeedReg1
+                = lookThruCopyLike(DefMI->getOperand(1).getReg());
+              unsigned FeedReg2
+                = lookThruCopyLike(DefMI->getOperand(2).getReg());
+
+              if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) {
+                DEBUG(dbgs()
+                      << "Optimizing splat/swap or splat/splat "
+                      "to splat/copy: ");
+                DEBUG(MI.dump());
+                BuildMI(MBB, &MI, MI.getDebugLoc(),
+                        TII->get(PPC::COPY), MI.getOperand(0).getReg())
+                  .addOperand(MI.getOperand(1));
+                ToErase = &MI;
+                Simplified = true;
+              }
+
+              // If this is a splat fed by a swap, we can simplify modify
+              // the splat to splat the other value from the swap's input
+              // parameter.
+              else if ((Immed == 0 || Immed == 3)
+                       && FeedImmed == 2 && FeedReg1 == FeedReg2) {
+                DEBUG(dbgs() << "Optimizing swap/splat => splat: ");
+                DEBUG(MI.dump());
+                MI.getOperand(1).setReg(DefMI->getOperand(1).getReg());
+                MI.getOperand(2).setReg(DefMI->getOperand(2).getReg());
+                MI.getOperand(3).setImm(3 - Immed);
+                Simplified = true;
+              }
+
+              // If this is a swap fed by a swap, we can replace it
+              // with a copy from the first swap's input.
+              else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
+                DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
+                DEBUG(MI.dump());
+                BuildMI(MBB, &MI, MI.getDebugLoc(),
+                        TII->get(PPC::COPY), MI.getOperand(0).getReg())
+                  .addOperand(DefMI->getOperand(1));
+                ToErase = &MI;
+                Simplified = true;
+              }
+            } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs &&
+                       (DefMI->getOperand(2).getImm() == 0 ||
+                        DefMI->getOperand(2).getImm() == 3)) {
+              // Splat fed by another splat - switch the output of the first
+              // and remove the second.
+              DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
+              ToErase = &MI;
+              Simplified = true;
+              DEBUG(dbgs() << "Removing redundant splat: ");
+              DEBUG(MI.dump());
+            }
+          }
+        }
+        break;
+      }
+      case PPC::VSPLTB:
+      case PPC::VSPLTH:
+      case PPC::XXSPLTW: {
+        unsigned MyOpcode = MI.getOpcode();
+        // Index of the register operand being splatted: 1 for XXSPLTW, 2 for
+        // the two VSPLT* forms.
+        unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
+        unsigned TrueReg = lookThruCopyLike(MI.getOperand(OpNo).getReg());
+        if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
+          break;
+        MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
+        if (!DefMI)
+          break;
+        unsigned DefOpcode = DefMI->getOpcode();
+        // True when the feeding instruction is a word conversion whose own
+        // input was produced by a splatting load or a word splat.
+        auto isConvertOfSplat = [=]() -> bool {
+          if (DefOpcode != PPC::XVCVSPSXWS && DefOpcode != PPC::XVCVSPUXWS)
+            return false;
+          unsigned ConvReg = DefMI->getOperand(1).getReg();
+          if (!TargetRegisterInfo::isVirtualRegister(ConvReg))
+            return false;
+          MachineInstr *Splt = MRI->getVRegDef(ConvReg);
+          return Splt && (Splt->getOpcode() == PPC::LXVWSX ||
+            Splt->getOpcode() == PPC::XXSPLTW);
+        };
+        bool AlreadySplat = (MyOpcode == DefOpcode) ||
+          (MyOpcode == PPC::VSPLTB && DefOpcode == PPC::VSPLTBs) ||
+          (MyOpcode == PPC::VSPLTH && DefOpcode == PPC::VSPLTHs) ||
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::XXSPLTWs) ||
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::LXVWSX) ||
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::MTVSRWS)||
+          (MyOpcode == PPC::XXSPLTW && isConvertOfSplat());
+        // If the instruction[s] that feed this splat have already splat
+        // the value, this splat is redundant.
+        if (AlreadySplat) {
+          DEBUG(dbgs() << "Changing redundant splat to a copy: ");
+          DEBUG(MI.dump());
+          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+                  MI.getOperand(0).getReg())
+              .addOperand(MI.getOperand(OpNo));
+          ToErase = &MI;
+          Simplified = true;
+        }
+        // Splat fed by a shift. Usually when we align value to splat into
+        // vector element zero.
+        //
+        // This rewrite is restricted to XXSPLTW: it reads the splat index
+        // from operand 2 and rewrites the register in operand 1, and it
+        // computes the new index modulo four word elements. For VSPLTB and
+        // VSPLTH operand 2 is the register (see OpNo above), so applying the
+        // same accessors to them would touch operands of the wrong kind.
+        if (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::XXSLDWI) {
+          unsigned ShiftRes = DefMI->getOperand(0).getReg();
+          unsigned ShiftOp1 = DefMI->getOperand(1).getReg();
+          unsigned ShiftOp2 = DefMI->getOperand(2).getReg();
+          unsigned ShiftImm = DefMI->getOperand(3).getImm();
+          unsigned SplatImm = MI.getOperand(2).getImm();
+          if (ShiftOp1 == ShiftOp2) {
+            unsigned NewElem = (SplatImm + ShiftImm) & 0x3;
+            // The shift itself can go once this splat was its only user.
+            if (MRI->hasOneNonDBGUse(ShiftRes)) {
+              DEBUG(dbgs() << "Removing redundant shift: ");
+              DEBUG(DefMI->dump());
+              ToErase = DefMI;
+            }
+            Simplified = true;
+            DEBUG(dbgs() << "Changing splat immediate from " << SplatImm <<
+                  " to " << NewElem << " in instruction: ");
+            DEBUG(MI.dump());
+            MI.getOperand(1).setReg(ShiftOp1);
+            MI.getOperand(2).setImm(NewElem);
+          }
+        }
+        break;
+      }
+      case PPC::XVCVDPSP: {
+        // If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant.
+        unsigned TrueReg = lookThruCopyLike(MI.getOperand(1).getReg());
+        if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
+          break;
+        MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
+
+        // This can occur when building a vector of single precision or integer
+        // values.
+        if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
+          unsigned DefsReg1 = lookThruCopyLike(DefMI->getOperand(1).getReg());
+          unsigned DefsReg2 = lookThruCopyLike(DefMI->getOperand(2).getReg());
+          if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) ||
+              !TargetRegisterInfo::isVirtualRegister(DefsReg2))
+            break;
+          MachineInstr *P1 = MRI->getVRegDef(DefsReg1);
+          MachineInstr *P2 = MRI->getVRegDef(DefsReg2);
+
+          if (!P1 || !P2)
+            break;
+
+          // Remove the passed FRSP instruction if it only feeds this MI and
+          // set any uses of that FRSP (in this MI) to the source of the FRSP.
+          auto removeFRSPIfPossible = [&](MachineInstr *RoundInstr) {
+            if (RoundInstr->getOpcode() == PPC::FRSP &&
+                MRI->hasOneNonDBGUse(RoundInstr->getOperand(0).getReg())) {
+              Simplified = true;
+              unsigned ConvReg1 = RoundInstr->getOperand(1).getReg();
+              unsigned FRSPDefines = RoundInstr->getOperand(0).getReg();
+              MachineInstr &Use = *(MRI->use_instr_begin(FRSPDefines));
+              for (int i = 0, e = Use.getNumOperands(); i < e; ++i)
+                if (Use.getOperand(i).isReg() &&
+                    Use.getOperand(i).getReg() == FRSPDefines)
+                  Use.getOperand(i).setReg(ConvReg1);
+              DEBUG(dbgs() << "Removing redundant FRSP:\n");
+              DEBUG(RoundInstr->dump());
+              DEBUG(dbgs() << "As it feeds instruction:\n");
+              DEBUG(MI.dump());
+              DEBUG(dbgs() << "Through instruction:\n");
+              DEBUG(DefMI->dump());
+              RoundInstr->eraseFromParent();
+            }
+          };
+
+          // If the input to XVCVDPSP is a vector that was built (even
+          // partially) out of FRSP's, the FRSP(s) can safely be removed
+          // since this instruction performs the same operation.
+          if (P1 != P2) {
+            removeFRSPIfPossible(P1);
+            removeFRSPIfPossible(P2);
+            break;
+          }
+          removeFRSPIfPossible(P1);
+        }
+        break;
+      }
+      }
+    }
+    // If the last instruction was marked for elimination,
+    // remove it now.
+    if (ToErase) {
+      ToErase->eraseFromParent();
+      ToErase = nullptr;
+    }
+  }
+
+  return Simplified;
+}
+
+// This is used to find the "true" source register for an
+// XXPERMDI instruction, since MachineCSE does not handle the
+// "copy-like" operations (Copy and SubregToReg).  Returns
+// the original SrcReg unless it is the target of a copy-like
+// operation, in which case we chain backwards through all
+// such operations to the ultimate source register.  If a
+// physical register is encountered, we stop the search.
+// The search also stops, returning the register reached so far,
+// when no defining instruction can be found for it.
+unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) {
+
+  while (true) {
+
+    MachineInstr *MI = MRI->getVRegDef(SrcReg);
+    // getVRegDef() yields null when the register has no definition; treat
+    // that like any other non-copy producer instead of dereferencing it.
+    if (!MI || !MI->isCopyLike())
+      return SrcReg;
+
+    unsigned CopySrcReg;
+    if (MI->isCopy())
+      CopySrcReg = MI->getOperand(1).getReg();
+    else {
+      assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
+      CopySrcReg = MI->getOperand(2).getReg();
+    }
+
+    if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
+      return CopySrcReg;
+
+    SrcReg = CopySrcReg;
+  }
+}
+
+} // end default namespace
+
+INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE,
+ "PowerPC MI Peephole Optimization", false, false)
+INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE,
+ "PowerPC MI Peephole Optimization", false, false) // no pass dependencies are listed between BEGIN and END
+
+char PPCMIPeephole::ID = 0; // definition of the static member declared in PPCMIPeephole
+/// Creates a new, heap-allocated PPCMIPeephole pass.
+FunctionPass *llvm::createPPCMIPeepholePass() {
+  FunctionPass *Peephole = new PPCMIPeephole();
+  return Peephole;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..9d91e31165de
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -0,0 +1,46 @@
+//===-- PPCMachineFunctionInfo.cpp - Private data used for PowerPC --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMachineFunctionInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void PPCFunctionInfo::anchor() { } // intentionally empty out-of-line definition; NOTE(review): presumably the usual LLVM vtable anchor -- confirm
+
+MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const {
+  // Symbol name: <private global prefix><function number>$poff
+  return MF.getContext().getOrCreateSymbol(
+      Twine(MF.getDataLayout().getPrivateGlobalPrefix()) +
+      Twine(MF.getFunctionNumber()) + "$poff");
+}
+
+MCSymbol *PPCFunctionInfo::getGlobalEPSymbol() const {
+  // Symbol name: <private global prefix>func_gep<function number>
+  return MF.getContext().getOrCreateSymbol(
+      Twine(MF.getDataLayout().getPrivateGlobalPrefix()) + "func_gep" +
+      Twine(MF.getFunctionNumber()));
+}
+
+MCSymbol *PPCFunctionInfo::getLocalEPSymbol() const {
+  // Symbol name: <private global prefix>func_lep<function number>
+  return MF.getContext().getOrCreateSymbol(
+      Twine(MF.getDataLayout().getPrivateGlobalPrefix()) + "func_lep" +
+      Twine(MF.getFunctionNumber()));
+}
+
+MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const {
+  // Symbol name: <private global prefix>func_toc<function number>
+  return MF.getContext().getOrCreateSymbol(
+      Twine(MF.getDataLayout().getPrivateGlobalPrefix()) + "func_toc" +
+      Twine(MF.getFunctionNumber()));
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
new file mode 100644
index 000000000000..4c29aa06f048
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -0,0 +1,217 @@
+//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// PPCFunctionInfo - This class is derived from MachineFunctionInfo and holds
+/// private PowerPC target-specific information for each MachineFunction.
+class PPCFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// FramePointerSaveIndex - Frame index of where the old frame pointer is
+  /// stored. Also used as an anchor for instructions that need to be altered
+  /// when using frame pointers (dyna_add, dyna_sub.)
+  int FramePointerSaveIndex;
+
+  /// ReturnAddrSaveIndex - Frame index of where the return address is stored.
+  int ReturnAddrSaveIndex;
+
+  /// Frame index where the old base pointer is stored.
+  int BasePointerSaveIndex;
+
+  /// Frame index where the old PIC base pointer is stored.
+  int PICBasePointerSaveIndex;
+
+  /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
+  /// function. This is only meaningful after the initial scan of the function
+  /// by PEI; the constructor starts it out as false.
+  bool MustSaveLR;
+
+  /// Does this function have any stack spills.
+  bool HasSpills;
+
+  /// Does this function spill using instructions with only r+r (not r+i)
+  /// forms.
+  bool HasNonRISpills;
+
+  /// SpillsCR - Indicates whether CR is spilled in the current function.
+  bool SpillsCR;
+
+  /// Indicates whether VRSAVE is spilled in the current function.
+  bool SpillsVRSAVE;
+
+  /// LRStoreRequired - The bool indicates whether there is some explicit use of
+  /// the LR/LR8 stack slot that is not obvious from scanning the code. This
+  /// requires that the code generator produce a store of LR to the stack on
+  /// entry, even though LR may otherwise apparently not be used.
+  bool LRStoreRequired;
+
+  /// This function makes use of the PPC64 ELF TOC base pointer (register r2).
+  bool UsesTOCBasePtr;
+
+  /// MinReservedArea - This is the frame size that is at least reserved in a
+  /// potential caller (parameter+linkage area).
+  unsigned MinReservedArea;
+
+  /// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum
+  /// amount the stack pointer is adjusted to make the frame bigger for tail
+  /// calls. Used for creating an area before the register spill area.
+  int TailCallSPDelta;
+
+  /// HasFastCall - Does this function contain a fast call. Used to determine
+  /// how the caller's stack pointer should be calculated (epilog/dynamicalloc).
+  bool HasFastCall;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+  /// VarArgsStackOffset - StackOffset for start of stack arguments.
+  int VarArgsStackOffset;
+  /// VarArgsNumGPR - Index of the first unused integer
+  /// register for parameter passing.
+  unsigned VarArgsNumGPR;
+  /// VarArgsNumFPR - Index of the first unused double
+  /// register for parameter passing.
+  unsigned VarArgsNumFPR;
+
+  /// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4.
+  int CRSpillFrameIndex;
+
+  /// If any of CR[2-4] need to be saved in the prologue and restored in the
+  /// epilogue then they are added to this array. This is used for the
+  /// 64-bit SVR4 ABI.
+  SmallVector<unsigned, 3> MustSaveCRs;
+
+  /// Hold onto our MachineFunction context.
+  MachineFunction &MF;
+
+  /// Whether this uses the PIC Base register or not.
+  bool UsesPICBase;
+
+  /// True if this function has a subset of CSRs that is handled explicitly via
+  /// copies
+  bool IsSplitCSR;
+
+public:
+  /// Every index/size/counter starts at 0 and every flag at false; MF is kept.
+  explicit PPCFunctionInfo(MachineFunction &MF)
+    : FramePointerSaveIndex(0),
+      ReturnAddrSaveIndex(0),
+      BasePointerSaveIndex(0),
+      PICBasePointerSaveIndex(0),
+      MustSaveLR(false),
+      HasSpills(false),
+      HasNonRISpills(false),
+      SpillsCR(false),
+      SpillsVRSAVE(false),
+      LRStoreRequired(false),
+      UsesTOCBasePtr(false),
+      MinReservedArea(0),
+      TailCallSPDelta(0),
+      HasFastCall(false),
+      VarArgsFrameIndex(0),
+      VarArgsStackOffset(0),
+      VarArgsNumGPR(0),
+      VarArgsNumFPR(0),
+      CRSpillFrameIndex(0),
+      MF(MF),
+      UsesPICBase(false),
+      IsSplitCSR(false) {}
+
+  int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+  void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+
+  int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; }
+  void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; }
+
+  int getBasePointerSaveIndex() const { return BasePointerSaveIndex; }
+  void setBasePointerSaveIndex(int Idx) { BasePointerSaveIndex = Idx; }
+
+  int getPICBasePointerSaveIndex() const { return PICBasePointerSaveIndex; }
+  void setPICBasePointerSaveIndex(int Idx) { PICBasePointerSaveIndex = Idx; }
+
+  unsigned getMinReservedArea() const { return MinReservedArea; }
+  void setMinReservedArea(unsigned size) { MinReservedArea = size; }
+
+  int getTailCallSPDelta() const { return TailCallSPDelta; }
+  void setTailCallSPDelta(int size) { TailCallSPDelta = size; }
+
+  /// MustSaveLR - This is set when the prolog/epilog inserter does its initial
+  /// scan of the function. It is true if the LR/LR8 register is ever explicitly
+  /// defined/clobbered in the machine function (e.g. by calls and movpctolr,
+  /// which is used in PIC generation), or if the LR stack slot is explicitly
+  /// referenced by builtin_return_address.
+  void setMustSaveLR(bool U) { MustSaveLR = U; }
+  bool mustSaveLR() const { return MustSaveLR; }
+
+  void setHasSpills() { HasSpills = true; }
+  bool hasSpills() const { return HasSpills; }
+
+  void setHasNonRISpills() { HasNonRISpills = true; }
+  bool hasNonRISpills() const { return HasNonRISpills; }
+
+  void setSpillsCR() { SpillsCR = true; }
+  bool isCRSpilled() const { return SpillsCR; }
+
+  void setSpillsVRSAVE() { SpillsVRSAVE = true; }
+  bool isVRSAVESpilled() const { return SpillsVRSAVE; }
+
+  void setLRStoreRequired() { LRStoreRequired = true; }
+  bool isLRStoreRequired() const { return LRStoreRequired; }
+
+  void setUsesTOCBasePtr() { UsesTOCBasePtr = true; }
+  bool usesTOCBasePtr() const { return UsesTOCBasePtr; }
+
+  void setHasFastCall() { HasFastCall = true; }
+  bool hasFastCall() const { return HasFastCall; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  int getVarArgsStackOffset() const { return VarArgsStackOffset; }
+  void setVarArgsStackOffset(int Offset) { VarArgsStackOffset = Offset; }
+
+  unsigned getVarArgsNumGPR() const { return VarArgsNumGPR; }
+  void setVarArgsNumGPR(unsigned Num) { VarArgsNumGPR = Num; }
+
+  unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; }
+  void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; }
+
+  int getCRSpillFrameIndex() const { return CRSpillFrameIndex; }
+  void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; }
+
+  const SmallVectorImpl<unsigned> &
+    getMustSaveCRs() const { return MustSaveCRs; }
+  void addMustSaveCR(unsigned Reg) { MustSaveCRs.push_back(Reg); }
+
+  void setUsesPICBase(bool uses) { UsesPICBase = uses; }
+  bool usesPICBase() const { return UsesPICBase; }
+
+  bool isSplitCSR() const { return IsSplitCSR; }
+  void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+  MCSymbol *getPICOffsetSymbol() const;
+
+  MCSymbol *getGlobalEPSymbol() const;
+  MCSymbol *getLocalEPSymbol() const;
+  MCSymbol *getTOCOffsetSymbol() const;
+};
+
+} // end of namespace llvm
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h
new file mode 100644
index 000000000000..8a1d68011c5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h
@@ -0,0 +1,6591 @@
+//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle without using vperm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCPERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCPERFECTSHUFFLE_H
+
+// 31 entries have cost 0
+// 292 entries have cost 1
+// 1384 entries have cost 2
+// 3061 entries have cost 3
+// 1733 entries have cost 4
+// 60 entries have cost 5
+
+// This table is declared with 6561+1 entries: (6561+1)*4 = 26248 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+ 202162278U, // <0,0,0,0>: Cost 1 vspltisw0 LHS
+ 1140850790U, // <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS
+ 2617247181U, // <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0>
+ 2635163787U, // <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0>
+ 1543507254U, // <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS
+ 2281701705U, // <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5>
+ 2617250133U, // <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0>
+ 2659054575U, // <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0>
+ 202162278U, // <0,0,0,u>: Cost 1 vspltisw0 LHS
+ 1141686282U, // <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1>
+ 67944550U, // <0,0,1,1>: Cost 1 vmrghw LHS, LHS
+ 1685241958U, // <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+ 2215870716U, // <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+ 1141727570U, // <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+ 2215428562U, // <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7>
+ 2215428589U, // <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7>
+ 2659062768U, // <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1>
+ 67945117U, // <0,0,1,u>: Cost 1 vmrghw LHS, LHS
+ 2684356045U, // <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0>
+ 2216009830U, // <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+ 2216009901U, // <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2>
+ 2698290853U, // <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0>
+ 3289751890U, // <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5>
+ 3758098275U, // <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1>
+ 2684356538U, // <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7>
+ 3758098410U, // <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1>
+ 2216010397U, // <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS
+ 2702272651U, // <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0>
+ 2216656998U, // <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+ 3844669704U, // <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3>
+ 2216657148U, // <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0>
+ 2684357122U, // <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6>
+ 3732820066U, // <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0>
+ 3778005624U, // <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7>
+ 3374713464U, // <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7>
+ 2216657565U, // <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+ 2217361408U, // <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0>
+ 1143619686U, // <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+ 3291103405U, // <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2>
+ 3827269988U, // <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5>
+ 1143619922U, // <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+ 1610616118U, // <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+ 3758099833U, // <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2>
+ 3854107016U, // <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5>
+ 1143620253U, // <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+ 2284396544U, // <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0>
+ 2218025062U, // <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS
+ 3758100203U, // <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3>
+ 3395966100U, // <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3>
+ 3804549052U, // <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5>
+ 2302314964U, // <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5>
+ 2785821138U, // <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+ 3395966428U, // <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7>
+ 2787148260U, // <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7>
+ 2684358997U, // <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0>
+ 2218631270U, // <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+ 2684359162U, // <0,0,6,2>: Cost 3 vsldoi8 <0,0,0,0>, <6,2,7,3>
+ 3758101042U, // <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5>
+ 3732843830U, // <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS
+ 3758101227U, // <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1>
+ 2684359480U, // <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6>
+ 2724836173U, // <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0>
+ 2725499806U, // <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0>
+ 2726163439U, // <0,0,7,0>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+ 2219311206U, // <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS
+ 3868557900U, // <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3>
+ 3377400112U, // <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3>
+ 2684360038U, // <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6>
+ 3732852834U, // <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0>
+ 3871507060U, // <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7>
+ 2303658616U, // <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7>
+ 2726163439U, // <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+ 202162278U, // <0,0,u,0>: Cost 1 vspltisw0 LHS
+ 72589414U, // <0,0,u,1>: Cost 1 vmrghw LHS, LHS
+ 1685242525U, // <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+ 2220073212U, // <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+ 1146331474U, // <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+ 1610619034U, // <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+ 2785821138U, // <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+ 2659120119U, // <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u>
+ 72589981U, // <0,0,u,u>: Cost 1 vmrghw LHS, LHS
+ 2698297344U, // <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0>
+ 1624555622U, // <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+ 2758984428U, // <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1>
+ 2635237524U, // <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0>
+ 2693652818U, // <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5>
+ 2281701714U, // <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5>
+ 2698297846U, // <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7>
+ 2659128312U, // <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0>
+ 1624556189U, // <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+ 1543585802U, // <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1>
+ 1141728052U, // <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+ 1141728150U, // <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+ 2295644334U, // <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3>
+ 1543589174U, // <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS
+ 2290999634U, // <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5>
+ 2617332135U, // <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1>
+ 2617332720U, // <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1>
+ 1142171004U, // <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0>
+ 1561509990U, // <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS
+ 2623308516U, // <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2>
+ 2698298984U, // <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2>
+ 835584U, // <0,1,2,3>: Cost 0 copy LHS
+ 1561513270U, // <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS
+ 2647199304U, // <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2>
+ 2698299322U, // <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7>
+ 1585402874U, // <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2>
+ 835584U, // <0,1,2,u>: Cost 0 copy LHS
+ 2698299540U, // <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0>
+ 3290399540U, // <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1>
+ 2698299720U, // <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0>
+ 2698299804U, // <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3>
+ 2698299906U, // <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6>
+ 3832726521U, // <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0>
+ 2724842160U, // <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0>
+ 2706926275U, // <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1>
+ 2698300190U, // <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2>
+ 2635268198U, // <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS
+ 2217362228U, // <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1>
+ 2217362326U, // <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0>
+ 2635270296U, // <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4>
+ 2635271478U, // <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS
+ 1624558902U, // <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+ 2659160910U, // <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1>
+ 2659161084U, // <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4>
+ 1624559145U, // <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+ 3832726639U, // <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1>
+ 2714889871U, // <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1>
+ 2302314646U, // <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+ 3834717321U, // <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0>
+ 3832726679U, // <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5>
+ 2717544403U, // <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1>
+ 2718208036U, // <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1>
+ 3792613493U, // <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1>
+ 2719535302U, // <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1>
+ 2659172454U, // <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS
+ 3832726735U, // <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7>
+ 2724844026U, // <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3>
+ 3775361608U, // <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0>
+ 2659175734U, // <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS
+ 3832726771U, // <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7>
+ 2724844344U, // <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6>
+ 1651102542U, // <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1>
+ 1651766175U, // <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1>
+ 2724844536U, // <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0>
+ 3377397770U, // <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1>
+ 2698302636U, // <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0>
+ 2728162531U, // <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1>
+ 2724844902U, // <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6>
+ 3377398098U, // <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5>
+ 2724845076U, // <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0>
+ 2724845164U, // <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7>
+ 2724845186U, // <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2>
+ 1561559142U, // <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS
+ 1146331956U, // <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+ 1146332054U, // <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+ 835584U, // <0,1,u,3>: Cost 0 copy LHS
+ 1561562422U, // <0,1,u,4>: Cost 2 vsldoi4 <3,0,1,u>, RHS
+ 1624561818U, // <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+ 2220074191U, // <0,1,u,6>: Cost 3 vmrghw LHS, <1,6,1,7>
+ 1585452032U, // <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u>
+ 835584U, // <0,1,u,u>: Cost 0 copy LHS
+ 2214593997U, // <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0>
+ 2214675999U, // <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1>
+ 2214594152U, // <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2>
+ 1207959654U, // <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+ 3709054262U, // <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS
+ 3375350836U, // <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5>
+ 2214594490U, // <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7>
+ 3288336362U, // <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1>
+ 1207959659U, // <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS
+ 2215871994U, // <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+ 2215470623U, // <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+ 1141728872U, // <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+ 1141728934U, // <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+ 2215872323U, // <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+ 2215872405U, // <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+ 1141729210U, // <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+ 2215430122U, // <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+ 1141729368U, // <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3>
+ 3289736698U, // <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0>
+ 3289744927U, // <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1>
+ 2216011368U, // <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2>
+ 2216019622U, // <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1>
+ 3289769795U, // <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5>
+ 3289778069U, // <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6>
+ 2216044474U, // <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7>
+ 3732960259U, // <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2>
+ 2216061016U, // <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3>
+ 2758985382U, // <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1>
+ 2758985392U, // <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2>
+ 3290400360U, // <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2>
+ 2758985408U, // <0,2,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0>
+ 2758985422U, // <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5>
+ 2785822424U, // <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6>
+ 3290400698U, // <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7>
+ 2765915876U, // <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0>
+ 2758985453U, // <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0>
+ 3291104762U, // <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0>
+ 2217362979U, // <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+ 2217363048U, // <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2>
+ 2217363110U, // <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1>
+ 3291105087U, // <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1>
+ 3291105173U, // <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6>
+ 2217363386U, // <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7>
+ 3788639688U, // <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0>
+ 2217363515U, // <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1>
+ 3376054371U, // <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0>
+ 3788639888U, // <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2>
+ 3376055912U, // <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2>
+ 2302312550U, // <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS
+ 3376054375U, // <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4>
+ 3374728244U, // <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5>
+ 3805229154U, // <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0>
+ 3376055512U, // <0,2,5,7>: Cost 4 vmrglw <3,4,0,5>, <1,6,2,7>
+ 2302312555U, // <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS
+ 3709100134U, // <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS
+ 3709100950U, // <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0>
+ 3709102010U, // <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7>
+ 2758985658U, // <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7>
+ 3709103414U, // <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS
+ 3732992098U, // <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0>
+ 3292374970U, // <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7>
+ 3798594383U, // <0,2,6,7>: Cost 4 vsldoi8 <6,7,0,2>, <6,7,0,2>
+ 2758985703U, // <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7>
+ 3788641274U, // <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2>
+ 3377398508U, // <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1>
+ 3377398590U, // <0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2>
+ 2303656038U, // <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+ 3709111606U, // <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS
+ 3377398836U, // <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5>
+ 3803903447U, // <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2>
+ 3293054954U, // <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1>
+ 2303656043U, // <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+ 2220074490U, // <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+ 2220074527U, // <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+ 1146332776U, // <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+ 1146332838U, // <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+ 2220074819U, // <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+ 2220074901U, // <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+ 1146333114U, // <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+ 2220074986U, // <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+ 1146333243U, // <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1>
+ 2629410816U, // <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0>
+ 2753530006U, // <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2>
+ 2629412301U, // <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0>
+ 2214594972U, // <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3>
+ 2758985908U, // <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5>
+ 3733016674U, // <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0>
+ 3777364488U, // <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7>
+ 2281703354U, // <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7>
+ 2758985941U, // <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2>
+ 1141729430U, // <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+ 2215471334U, // <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+ 2215471425U, // <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+ 1141729692U, // <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+ 1141729794U, // <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+ 2215430738U, // <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5>
+ 2215430776U, // <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7>
+ 2295646138U, // <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7>
+ 1141730078U, // <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+ 2758986032U, // <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3>
+ 3709141910U, // <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0>
+ 3289753921U, // <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2>
+ 2770929992U, // <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0>
+ 3289754114U, // <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6>
+ 3362095460U, // <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5>
+ 3832727910U, // <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3>
+ 3365414842U, // <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7>
+ 2771298677U, // <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0>
+ 2216659094U, // <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2>
+ 3290409190U, // <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, <3,1,1,1>
+ 2703624496U, // <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3>
+ 2216683932U, // <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3>
+ 2216692226U, // <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6>
+ 3733041250U, // <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0>
+ 3832727988U, // <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0>
+ 3374712762U, // <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7>
+ 2216725278U, // <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2>
+ 2217363606U, // <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2>
+ 3291105510U, // <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1>
+ 3291105601U, // <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2>
+ 2217363868U, // <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3>
+ 2217363970U, // <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6>
+ 2758986242U, // <0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6>
+ 3727077685U, // <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4>
+ 3364767674U, // <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7>
+ 2217364254U, // <0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2>
+ 3832728102U, // <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6>
+ 3405916003U, // <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1>
+ 3376055840U, // <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2>
+ 3376055679U, // <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3>
+ 3376055194U, // <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4>
+ 3859565138U, // <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5>
+ 2727514210U, // <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+ 3376056250U, // <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7>
+ 2727514210U, // <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+ 2758986360U, // <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+ 3709174678U, // <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0>
+ 3795284411U, // <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3>
+ 3709175980U, // <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6>
+ 3833096860U, // <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7>
+ 3376728235U, // <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5>
+ 3859565229U, // <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6>
+ 2773879472U, // <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0>
+ 2758986360U, // <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+ 2303656854U, // <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0>
+ 3807229018U, // <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u>
+ 2727515284U, // <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3>
+ 3377399410U, // <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3>
+ 3377398682U, // <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4>
+ 3801257409U, // <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7>
+ 3377399980U, // <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6>
+ 3375409082U, // <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7>
+ 2731497082U, // <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3>
+ 1146333334U, // <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+ 2220075238U, // <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+ 2220075329U, // <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+ 1146333596U, // <0,3,u,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+ 1146333698U, // <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+ 2758986566U, // <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6>
+ 2803739472U, // <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7>
+ 2295703482U, // <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7>
+ 1146333982U, // <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+ 2214595473U, // <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0>
+ 2693677158U, // <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS
+ 3839437689U, // <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3>
+ 3709200559U, // <0,4,0,3>: Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0>
+ 2693677394U, // <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5>
+ 1140854070U, // <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+ 3767419409U, // <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7>
+ 3854109604U, // <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1>
+ 1140854313U, // <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS
+ 1141689234U, // <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+ 2215431114U, // <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3>
+ 2215431221U, // <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+ 2635466928U, // <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1>
+ 1141689552U, // <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+ 67947830U, // <0,4,1,5>: Cost 1 vmrghw LHS, RHS
+ 2215431545U, // <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+ 2659357716U, // <0,4,1,7>: Cost 3 vsldoi4 <7,0,4,1>, <7,0,4,1>
+ 67948073U, // <0,4,1,u>: Cost 1 vmrghw LHS, RHS
+ 3767420369U, // <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4>
+ 3767420451U, // <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5>
+ 3767420520U, // <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2>
+ 2698323625U, // <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4>
+ 3709218102U, // <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS
+ 2216013110U, // <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+ 3767420858U, // <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7>
+ 3774719981U, // <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4>
+ 2216013353U, // <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS
+ 3767421078U, // <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2>
+ 3776710880U, // <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4>
+ 3833097325U, // <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4>
+ 3767421340U, // <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3>
+ 3767421442U, // <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6>
+ 2216660278U, // <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+ 3833097361U, // <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4>
+ 3780692678U, // <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4>
+ 2216660521U, // <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS
+ 2617573416U, // <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4>
+ 2217364450U, // <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0>
+ 3691316771U, // <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5>
+ 3709233331U, // <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4>
+ 2785823952U, // <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4>
+ 1143622966U, // <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+ 3691319723U, // <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5>
+ 3854109932U, // <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5>
+ 1143623209U, // <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS
+ 2635497574U, // <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS
+ 2635498390U, // <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0>
+ 3709240936U, // <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2>
+ 2635499700U, // <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5>
+ 2635500854U, // <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS
+ 2785824044U, // <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6>
+ 1685245238U, // <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 2659390488U, // <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5>
+ 1685245256U, // <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 3839438161U, // <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7>
+ 3798610347U, // <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5>
+ 3798610426U, // <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3>
+ 3795956237U, // <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4>
+ 3733138742U, // <0,4,6,4>: Cost 4 vsldoi4 <7,0,4,6>, RHS
+ 2218634550U, // <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS
+ 3798610744U, // <0,4,6,6>: Cost 4 vsldoi8 <6,7,0,4>, <6,6,6,6>
+ 2724868945U, // <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4>
+ 2725532578U, // <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4>
+ 3383371465U, // <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0>
+ 3800601668U, // <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4>
+ 3775386826U, // <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3>
+ 3801928934U, // <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4>
+ 3721202998U, // <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS
+ 2780368328U, // <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+ 3383372686U, // <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6>
+ 3854110170U, // <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0>
+ 2780368328U, // <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+ 1146334098U, // <0,4,u,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+ 2220076002U, // <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0>
+ 2220076085U, // <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+ 2635524279U, // <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u>
+ 1146334416U, // <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+ 72592694U, // <0,4,u,5>: Cost 1 vmrghw LHS, RHS
+ 1685245481U, // <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 2659415067U, // <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u>
+ 72592937U, // <0,4,u,u>: Cost 1 vmrghw LHS, RHS
+ 2281704337U, // <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0>
+ 2704965734U, // <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+ 3778707666U, // <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3>
+ 3778707708U, // <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0>
+ 2687050057U, // <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5>
+ 2214596612U, // <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5>
+ 2785824372U, // <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1>
+ 3854110332U, // <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0>
+ 2704966301U, // <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+ 1567768678U, // <0,5,1,0>: Cost 2 vsldoi4 <4,0,5,1>, LHS
+ 2312236570U, // <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1>
+ 2215431915U, // <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+ 2641512598U, // <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2>
+ 1567771538U, // <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1>
+ 1141690372U, // <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+ 1141690466U, // <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+ 2641515514U, // <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2>
+ 1141690615U, // <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5>
+ 3772736973U, // <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0>
+ 3778709024U, // <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2>
+ 3778709096U, // <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2>
+ 3778709158U, // <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1>
+ 3772737275U, // <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5>
+ 3859566351U, // <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3>
+ 3778709434U, // <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7>
+ 3805251562U, // <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1>
+ 3775391807U, // <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5>
+ 2704967830U, // <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2>
+ 3776719073U, // <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5>
+ 3777382706U, // <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5>
+ 3778709887U, // <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1>
+ 2704968148U, // <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5>
+ 3857428317U, // <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0>
+ 3364096514U, // <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6>
+ 3780700871U, // <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5>
+ 2707622680U, // <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, <3,u,0,5>
+ 2728856466U, // <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1>
+ 3697361674U, // <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4>
+ 3697362601U, // <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4>
+ 3364766635U, // <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3>
+ 2217365428U, // <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, <5,4,5,6>
+ 2704969014U, // <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+ 2785824700U, // <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5>
+ 3364766963U, // <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7>
+ 2704969257U, // <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+ 3846148050U, // <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0>
+ 2326203282U, // <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1>
+ 3291746027U, // <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3>
+ 3376054482U, // <0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3>
+ 3790655366U, // <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5>
+ 2785824772U, // <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5>
+ 2724876386U, // <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0>
+ 3858903057U, // <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0>
+ 2736820484U, // <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0>
+ 2659467366U, // <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS
+ 3859566643U, // <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7>
+ 3798618618U, // <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3>
+ 3852857410U, // <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4>
+ 2659470646U, // <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS
+ 2659471458U, // <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+ 3832729696U, // <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7>
+ 1712083042U, // <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0>
+ 1712156779U, // <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0>
+ 2731512826U, // <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2>
+ 3859566717U, // <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0>
+ 3798619284U, // <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3>
+ 3778712803U, // <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1>
+ 2728858936U, // <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5>
+ 3859566753U, // <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0>
+ 3377398135U, // <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6>
+ 3798619686U, // <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0>
+ 2731513468U, // <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, <7,u,0,5>
+ 1567826022U, // <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS
+ 2704971566U, // <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+ 2220076779U, // <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+ 2641569942U, // <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2>
+ 1567828889U, // <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u>
+ 1146335236U, // <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+ 1146335330U, // <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+ 1713410308U, // <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0>
+ 1713484045U, // <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0>
+ 2214596949U, // <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0>
+ 2214678951U, // <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1>
+ 2214597114U, // <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3>
+ 3852857653U, // <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4>
+ 3832729919U, // <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5>
+ 3721293427U, // <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0>
+ 2214597432U, // <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6>
+ 1207962934U, // <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+ 1207962935U, // <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS
+ 2215432481U, // <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2>
+ 2215432615U, // <0,6,1,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+ 1141690874U, // <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+ 2215432754U, // <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+ 2215432817U, // <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5>
+ 2215432939U, // <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1>
+ 1141691192U, // <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+ 1221905718U, // <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+ 1221905719U, // <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS
+ 3852857787U, // <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3>
+ 3289764265U, // <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3>
+ 3289690618U, // <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3>
+ 3862589907U, // <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0>
+ 3733253430U, // <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS
+ 3733254242U, // <0,6,2,5>: Cost 4 vsldoi4 <7,0,6,2>, <5,6,7,0>
+ 3777390522U, // <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7>
+ 2785825274U, // <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3>
+ 2785825283U, // <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3>
+ 3777390742U, // <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2>
+ 3863106066U, // <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0>
+ 3777390899U, // <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6>
+ 3290436146U, // <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5>
+ 3779381762U, // <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6>
+ 3779381798U, // <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6>
+ 3733262920U, // <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0>
+ 2300972342U, // <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+ 2300972343U, // <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS
+ 3802606482U, // <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1>
+ 2217365931U, // <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+ 2217366010U, // <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3>
+ 3291107890U, // <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5>
+ 3291099805U, // <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4>
+ 3777391926U, // <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS
+ 2217366328U, // <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6>
+ 2291027254U, // <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+ 2291027255U, // <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS
+ 3852858033U, // <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6>
+ 3395964532U, // <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1>
+ 3864507069U, // <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0>
+ 3376056678U, // <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3>
+ 3721334070U, // <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS
+ 3395964860U, // <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5>
+ 3864802017U, // <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0>
+ 2302315830U, // <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+ 2302315831U, // <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS
+ 3852858108U, // <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,6,0,0>
+ 3398624745U, // <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1>
+ 2218668538U, // <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3>
+ 3292418610U, // <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5>
+ 3733286198U, // <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS
+ 3797299889U, // <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6>
+ 2785825592U, // <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6>
+ 2785825602U, // <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7>
+ 2785825611U, // <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7>
+ 2785825614U, // <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1>
+ 2758988632U, // <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2>
+ 3377400084U, // <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2>
+ 2792166248U, // <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0>
+ 2785825654U, // <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5>
+ 2785825664U, // <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+ 3859567493U, // <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2>
+ 2303659318U, // <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+ 2303659319U, // <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS
+ 2785825695U, // <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1>
+ 2220077479U, // <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+ 1146335738U, // <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+ 2792829881U, // <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0>
+ 2785825735U, // <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5>
+ 2785825664U, // <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+ 1146336056U, // <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+ 1221963062U, // <0,6,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+ 1221963063U, // <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS
+ 2653593600U, // <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0>
+ 2706309222U, // <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+ 3709421498U, // <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7>
+ 2281705978U, // <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3>
+ 2785825816U, // <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5>
+ 2785825826U, // <0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6>
+ 2653598037U, // <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0>
+ 2214598252U, // <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7>
+ 2706309789U, // <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+ 1141691386U, // <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+ 2215433290U, // <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1>
+ 2706310038U, // <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0>
+ 2322190842U, // <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3>
+ 1141691750U, // <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+ 2215433654U, // <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5>
+ 2653606230U, // <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1>
+ 1141692012U, // <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+ 1141692034U, // <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+ 2785825940U, // <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3>
+ 3768108576U, // <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2>
+ 3780052584U, // <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2>
+ 2794820780U, // <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0>
+ 3859641528U, // <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3>
+ 3733327970U, // <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0>
+ 3778062266U, // <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7>
+ 3733328944U, // <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2>
+ 2795189465U, // <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0>
+ 2324861026U, // <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0>
+ 3780053233U, // <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3>
+ 3780053296U, // <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3>
+ 3778062725U, // <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7>
+ 3780053506U, // <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6>
+ 3803941469U, // <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7>
+ 2706311800U, // <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7>
+ 3398603586U, // <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7>
+ 2707639066U, // <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7>
+ 2217366522U, // <0,7,4,0>: Cost 3 vmrghw <0,4,1,5>, <7,0,1,2>
+ 3727369110U, // <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0>
+ 3291108500U, // <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3>
+ 3727370872U, // <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7>
+ 2217366886U, // <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6>
+ 2706312502U, // <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+ 3786026321U, // <0,7,4,6>: Cost 4 vsldoi8 <4,6,0,7>, <4,6,0,7>
+ 2217367148U, // <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7>
+ 2706312745U, // <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+ 2322223202U, // <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0>
+ 3399946987U, // <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1>
+ 3291780244U, // <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3>
+ 3727378582U, // <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2>
+ 3727379766U, // <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS
+ 3859568054U, // <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5>
+ 2785826241U, // <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7>
+ 3395965762U, // <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7>
+ 2787153363U, // <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7>
+ 2785826268U, // <0,7,6,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,6,0,7>
+ 3780055420U, // <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3>
+ 3859568110U, // <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7>
+ 3874534903U, // <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7>
+ 3859641856U, // <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7>
+ 3733360738U, // <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0>
+ 3859568145U, // <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6>
+ 2797770260U, // <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0>
+ 2797843997U, // <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0>
+ 2785826342U, // <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0>
+ 3727393686U, // <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0>
+ 3868563003U, // <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3>
+ 3377397988U, // <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3>
+ 2219349350U, // <0,7,7,4>: Cost 3 vmrghw <0,7,1,4>, <7,4,5,6>
+ 3859568217U, // <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6>
+ 2730202588U, // <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7>
+ 2785826412U, // <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7>
+ 2731529854U, // <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7>
+ 1146336250U, // <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+ 2706315054U, // <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+ 2653660845U, // <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u>
+ 2322248186U, // <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3>
+ 1146336614U, // <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+ 2706315418U, // <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+ 2653663581U, // <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u>
+ 1146336876U, // <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+ 1146336898U, // <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+ 202162278U, // <0,u,0,0>: Cost 1 vspltisw0 LHS
+ 1624612966U, // <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS
+ 2629780986U, // <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0>
+ 1207959708U, // <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+ 1544097078U, // <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS
+ 1140856986U, // <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+ 2698355253U, // <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7>
+ 1207962952U, // <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+ 202162278U, // <0,u,0,u>: Cost 1 vspltisw0 LHS
+ 1142134483U, // <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2>
+ 67950382U, // <0,u,1,1>: Cost 1 vmrghw LHS, LHS
+ 1142175624U, // <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+ 1142175676U, // <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1>
+ 1142134847U, // <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+ 67950746U, // <0,u,1,5>: Cost 1 vmrghw LHS, RHS
+ 1142175952U, // <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+ 1221905736U, // <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+ 67950949U, // <0,u,1,u>: Cost 1 vmrghw LHS, LHS
+ 1562026086U, // <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS
+ 2216015662U, // <0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+ 2698356328U, // <0,u,2,2>: Cost 3 vsldoi8 <2,3,0,u>, <2,2,2,2>
+ 835584U, // <0,u,2,3>: Cost 0 copy LHS
+ 1562029366U, // <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS
+ 2216016026U, // <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+ 2698356666U, // <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7>
+ 1585919033U, // <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2>
+ 835584U, // <0,u,2,u>: Cost 0 copy LHS
+ 2758989756U, // <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1>
+ 2216662830U, // <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+ 2703665461U, // <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u>
+ 2758989782U, // <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0>
+ 2758989796U, // <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5>
+ 2216663194U, // <0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+ 2706319993U, // <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u>
+ 2300972360U, // <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+ 2216663397U, // <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+ 2217367251U, // <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2>
+ 1143625518U, // <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+ 2217367432U, // <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3>
+ 2217367484U, // <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1>
+ 1143619922U, // <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+ 1143625882U, // <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+ 2217367760U, // <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7>
+ 2291027272U, // <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+ 1143626085U, // <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+ 2635792486U, // <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS
+ 2635793302U, // <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0>
+ 2302314646U, // <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+ 2635794648U, // <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5>
+ 2635795766U, // <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS
+ 2717601754U, // <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u>
+ 1685248154U, // <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 2302315848U, // <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+ 1685248172U, // <0,u,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 2759358645U, // <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7>
+ 2218637102U, // <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+ 2724901370U, // <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3>
+ 2758990032U, // <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7>
+ 2659691830U, // <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS
+ 2659471458U, // <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+ 2724901688U, // <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6>
+ 1651159893U, // <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u>
+ 1651823526U, // <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u>
+ 2785827072U, // <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1>
+ 2803964168U, // <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0>
+ 2727556249U, // <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u>
+ 2303656092U, // <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+ 2785827112U, // <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5>
+ 2785827122U, // <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6>
+ 2730210781U, // <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u>
+ 2303659336U, // <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+ 2303656097U, // <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+ 202162278U, // <0,u,u,0>: Cost 1 vspltisw0 LHS
+ 72595246U, // <0,u,u,1>: Cost 1 vmrghw LHS, LHS
+ 1146337160U, // <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+ 835584U, // <0,u,u,3>: Cost 0 copy LHS
+ 1146337343U, // <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+ 72595610U, // <0,u,u,5>: Cost 1 vmrghw LHS, RHS
+ 1146337488U, // <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+ 1221963080U, // <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+ 835584U, // <0,u,u,u>: Cost 0 copy LHS
+ 2756853760U, // <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0>
+ 1677803530U, // <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1>
+ 3759497387U, // <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0>
+ 2686419196U, // <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0>
+ 2751766565U, // <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1>
+ 2687746462U, // <1,0,0,5>: Cost 3 vsldoi8 <0,5,1,0>, <0,5,1,0>
+ 3776086518U, // <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7>
+ 2689073728U, // <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0>
+ 1678319689U, // <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1>
+ 2287091712U, // <1,0,1,0>: Cost 3 vmrglw <0,u,1,1>, <0,0,0,0>
+ 1147568230U, // <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS
+ 1683112038U, // <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 3294970108U, // <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0>
+ 2623892790U, // <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS
+ 2647781007U, // <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1>
+ 2791948430U, // <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+ 3721524218U, // <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2>
+ 1683112092U, // <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 2222112768U, // <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0>
+ 1148371046U, // <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+ 3356862524U, // <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2>
+ 2702345894U, // <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1>
+ 2222113106U, // <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5>
+ 2299709908U, // <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+ 3760162746U, // <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7>
+ 3369470584U, // <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7>
+ 1148371613U, // <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+ 2686421142U, // <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2>
+ 2283128486U, // <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1>
+ 3296305326U, // <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3>
+ 3760163199U, // <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1>
+ 3760163330U, // <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6>
+ 3779406377U, // <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0>
+ 3865690416U, // <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7>
+ 3366824568U, // <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7>
+ 2707655452U, // <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0>
+ 2734861202U, // <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1>
+ 2756854098U, // <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5>
+ 3830595931U, // <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5>
+ 3296968960U, // <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4>
+ 3830595949U, // <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5>
+ 2686422326U, // <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+ 3297378806U, // <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7>
+ 3810594248U, // <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0>
+ 2686422569U, // <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+ 2284470272U, // <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0>
+ 2284471974U, // <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1>
+ 3809267435U, // <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3>
+ 3297968384U, // <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4>
+ 2284471977U, // <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4>
+ 3721555603U, // <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5>
+ 3792679010U, // <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0>
+ 3792679037U, // <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0>
+ 2284471981U, // <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u>
+ 3356893184U, // <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0>
+ 2224676966U, // <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS
+ 3298295985U, // <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6>
+ 3298345212U, // <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0>
+ 2224972114U, // <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+ 3808604907U, // <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1>
+ 3799978808U, // <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6>
+ 2726237006U, // <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1>
+ 2224677522U, // <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1>
+ 2726237176U, // <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0>
+ 2285815462U, // <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1>
+ 3805951193U, // <1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0>
+ 3807941859U, // <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1>
+ 3799979366U, // <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6>
+ 3803297165U, // <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0>
+ 3799979540U, // <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0>
+ 3799979628U, // <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7>
+ 2731546240U, // <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0>
+ 2284494848U, // <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0>
+ 1683112594U, // <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1>
+ 1683112605U, // <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 2734200772U, // <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0>
+ 2757075629U, // <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1>
+ 2686425242U, // <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+ 2791948430U, // <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+ 2736855304U, // <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0>
+ 1683112659U, // <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 1610694666U, // <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1>
+ 1616003174U, // <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS
+ 2283767958U, // <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2>
+ 3357507596U, // <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3>
+ 2689745234U, // <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5>
+ 3357507922U, // <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5>
+ 3294397647U, // <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7>
+ 3373433334U, // <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7>
+ 1616003730U, // <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1>
+ 1550221414U, // <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+ 269271142U, // <1,1,1,1>: Cost 1 vspltisw1 LHS
+ 2287093910U, // <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2>
+ 2287092615U, // <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3>
+ 1550224694U, // <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+ 2287092050U, // <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5>
+ 2689746127U, // <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7>
+ 2659800138U, // <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1>
+ 269271142U, // <1,1,1,u>: Cost 1 vspltisw1 LHS
+ 2222113516U, // <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, <1,0,2,1>
+ 2756854663U, // <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3>
+ 1148371862U, // <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 2689746598U, // <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1>
+ 2618002742U, // <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS
+ 2299707730U, // <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5>
+ 2689746874U, // <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7>
+ 3361506511U, // <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7>
+ 1148371862U, // <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 2689747094U, // <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2>
+ 2691074278U, // <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1>
+ 3356870806U, // <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2>
+ 2283126958U, // <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3>
+ 2689747458U, // <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6>
+ 3356868946U, // <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5>
+ 3811265144U, // <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7>
+ 3362841807U, // <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7>
+ 2689747742U, // <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2>
+ 2623987814U, // <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS
+ 2758181931U, // <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5>
+ 2223408022U, // <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+ 3697731734U, // <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2>
+ 2283798784U, // <1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4>
+ 1616006454U, // <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+ 3297379535U, // <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7>
+ 3373466102U, // <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7>
+ 1616006697U, // <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+ 2760762479U, // <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1>
+ 2284470282U, // <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1>
+ 2284472470U, // <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2>
+ 3358212270U, // <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3>
+ 2284470285U, // <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4>
+ 1210728786U, // <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+ 2737524834U, // <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0>
+ 3360867535U, // <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7>
+ 1210728786U, // <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+ 3697746022U, // <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS
+ 2756854991U, // <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7>
+ 2737525242U, // <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3>
+ 3839149281U, // <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7>
+ 3697749302U, // <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS
+ 3356893522U, // <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5>
+ 2283151537U, // <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6>
+ 2791949566U, // <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0>
+ 2792613127U, // <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0>
+ 2737525754U, // <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2>
+ 2291786386U, // <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1>
+ 3365528292U, // <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2>
+ 3365528455U, // <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3>
+ 2737526118U, // <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6>
+ 3365527890U, // <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5>
+ 3365528377U, // <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6>
+ 2291786959U, // <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7>
+ 2737526402U, // <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2>
+ 1550221414U, // <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+ 269271142U, // <1,1,u,1>: Cost 1 vspltisw1 LHS
+ 1148371862U, // <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 2689750972U, // <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1>
+ 1550224694U, // <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+ 1616009370U, // <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+ 2689751248U, // <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7>
+ 2736863497U, // <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1>
+ 269271142U, // <1,1,u,u>: Cost 1 vspltisw1 LHS
+ 2702360576U, // <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, <0,0,0,0>
+ 1628618854U, // <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+ 2685771949U, // <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2>
+ 2283765862U, // <1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+ 2702360914U, // <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5>
+ 3788046813U, // <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0>
+ 2688426481U, // <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2>
+ 2726249024U, // <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+ 1628619421U, // <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+ 2690417380U, // <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2>
+ 2702361396U, // <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1>
+ 2287093352U, // <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2>
+ 1213349990U, // <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+ 3764159522U, // <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5>
+ 3295053672U, // <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6>
+ 2221311930U, // <1,2,1,6>: Cost 3 vmrghw <1,1,1,1>, <2,6,3,7>
+ 3799991593U, // <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7>
+ 1213349995U, // <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS
+ 2624045158U, // <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS
+ 2702362144U, // <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2>
+ 2283120232U, // <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2>
+ 1225965670U, // <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+ 2624048438U, // <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS
+ 3356860763U, // <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5>
+ 2222114746U, // <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7>
+ 2299708632U, // <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7>
+ 1225965675U, // <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS
+ 470597734U, // <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+ 1544340276U, // <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 1544341096U, // <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+ 1544341916U, // <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3>
+ 470601014U, // <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+ 1592119300U, // <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+ 1592119802U, // <1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+ 1592120314U, // <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 470603566U, // <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS
+ 2708335471U, // <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2>
+ 3838043908U, // <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5>
+ 3357541992U, // <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2>
+ 2283798630U, // <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+ 2726251728U, // <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4>
+ 1628622134U, // <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+ 3297077178U, // <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7>
+ 2726251976U, // <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+ 1628622377U, // <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+ 2714308168U, // <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2>
+ 3297633827U, // <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5>
+ 2284471912U, // <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2>
+ 1210728550U, // <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+ 3776106420U, // <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6>
+ 2726252548U, // <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5>
+ 2726252642U, // <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0>
+ 3799994538U, // <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0>
+ 1210728555U, // <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+ 2720280865U, // <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2>
+ 2702365096U, // <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2>
+ 2726253050U, // <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3>
+ 2283151462U, // <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+ 3697823030U, // <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS
+ 3298715497U, // <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7>
+ 2726253368U, // <1,2,6,6>: Cost 3 vsldoi8 <7,0,1,2>, <6,6,6,6>
+ 2724926296U, // <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2>
+ 2283151467U, // <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+ 1652511738U, // <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2>
+ 3371500916U, // <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1>
+ 3365529192U, // <1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2>
+ 2291785830U, // <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+ 2726253926U, // <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6>
+ 3788051845U, // <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1>
+ 3794023894U, // <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1>
+ 2726254119U, // <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1>
+ 1657820802U, // <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2>
+ 470638699U, // <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+ 1544381236U, // <1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 1544382056U, // <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+ 1544382614U, // <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+ 470641974U, // <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+ 1628625050U, // <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+ 1592160762U, // <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+ 1592161274U, // <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 470644526U, // <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS
+ 2769389708U, // <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1>
+ 2685780070U, // <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+ 2685780142U, // <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3>
+ 2686443775U, // <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3>
+ 2769684656U, // <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1>
+ 3357507940U, // <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5>
+ 3759522294U, // <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7>
+ 3357509562U, // <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7>
+ 2685780637U, // <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+ 2287092630U, // <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0>
+ 2221312230U, // <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1>
+ 2691752839U, // <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3>
+ 2287093362U, // <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3>
+ 2287092634U, // <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4>
+ 3360835107U, // <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5>
+ 3759523041U, // <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7>
+ 2287093690U, // <1,3,1,7>: Cost 3 vmrglw <0,u,1,1>, <2,6,3,7>
+ 2287092638U, // <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u>
+ 2222114966U, // <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2>
+ 2222115057U, // <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3>
+ 2630092320U, // <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2>
+ 2685781670U, // <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1>
+ 2222115330U, // <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6>
+ 3373449572U, // <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5>
+ 2222115448U, // <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+ 2299709370U, // <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7>
+ 2222115614U, // <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2>
+ 2771380607U, // <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1>
+ 3356874468U, // <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1>
+ 3759524168U, // <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0>
+ 2283792796U, // <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3>
+ 3356869530U, // <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4>
+ 3721760428U, // <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3>
+ 3296496248U, // <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7>
+ 3356870586U, // <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7>
+ 2771970503U, // <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1>
+ 2772044240U, // <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1>
+ 3362186135U, // <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1>
+ 3297151280U, // <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3>
+ 3357542002U, // <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3>
+ 3357540626U, // <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4>
+ 2685783350U, // <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+ 3357546622U, // <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6>
+ 3357542330U, // <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7>
+ 2685783593U, // <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+ 2284471190U, // <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0>
+ 3358213015U, // <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1>
+ 2630116899U, // <1,3,5,2>: Cost 3 vsldoi4 <2,1,3,5>, <2,1,3,5>
+ 2284471922U, // <1,3,5,3>: Cost 3 vmrglw <0,4,1,5>, <2,2,3,3>
+ 2284471194U, // <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4>
+ 2284471843U, // <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5>
+ 3358218366U, // <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6>
+ 2284472250U, // <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7>
+ 2284471198U, // <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u>
+ 2224752790U, // <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2>
+ 3832736385U, // <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7>
+ 3703866916U, // <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6>
+ 3356894834U, // <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3>
+ 3356894106U, // <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4>
+ 3356894755U, // <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5>
+ 3356899130U, // <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6>
+ 2283153338U, // <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+ 2283153338U, // <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+ 2774035139U, // <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1>
+ 3703874767U, // <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7>
+ 3703875109U, // <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7>
+ 3365529202U, // <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3>
+ 3365528474U, // <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4>
+ 3789387159U, // <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1>
+ 3865692927U, // <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7>
+ 3363538874U, // <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7>
+ 2774625035U, // <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1>
+ 2284495766U, // <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0>
+ 2685785902U, // <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+ 2630141478U, // <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u>
+ 2283169880U, // <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3>
+ 2284495770U, // <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4>
+ 2685786266U, // <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+ 2222115448U, // <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+ 2284496826U, // <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7>
+ 2685786469U, // <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+ 2684461069U, // <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4>
+ 2686451814U, // <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+ 3759530159U, // <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4>
+ 2686451968U, // <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+ 2684461394U, // <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5>
+ 1701989266U, // <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1>
+ 3776119286U, // <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7>
+ 2689106500U, // <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4>
+ 1702210477U, // <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1>
+ 2221312914U, // <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1>
+ 2691097399U, // <1,4,1,1>: Cost 3 vsldoi8 <1,1,1,4>, <1,1,1,4>
+ 3760194454U, // <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0>
+ 3766166489U, // <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4>
+ 2334870736U, // <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4>
+ 1147571510U, // <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+ 3760194794U, // <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7>
+ 3867315188U, // <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0>
+ 1147571753U, // <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS
+ 2222115730U, // <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1>
+ 2222115812U, // <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+ 3760195176U, // <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2>
+ 2702378662U, // <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1>
+ 2323598544U, // <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4>
+ 1148374326U, // <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+ 3760195514U, // <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7>
+ 3373451932U, // <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7>
+ 1148374569U, // <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS
+ 2702379160U, // <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4>
+ 3760195840U, // <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0>
+ 3776121160U, // <1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0>
+ 3760195996U, // <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3>
+ 2686454274U, // <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6>
+ 3356870350U, // <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5>
+ 3800009392U, // <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0>
+ 3366824604U, // <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7>
+ 2707688224U, // <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4>
+ 2775731368U, // <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0>
+ 3830820018U, // <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1>
+ 3691980454U, // <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1>
+ 3357541282U, // <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3>
+ 2781039824U, // <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4>
+ 2686455094U, // <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+ 3357541528U, // <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6>
+ 3810627020U, // <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4>
+ 2686455337U, // <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+ 2624217190U, // <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS
+ 2284470309U, // <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1>
+ 2618246822U, // <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1>
+ 3358212297U, // <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3>
+ 2284470312U, // <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4>
+ 2284470637U, // <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5>
+ 1683115318U, // <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 3721851898U, // <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2>
+ 1683115336U, // <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 3794039075U, // <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4>
+ 3830820186U, // <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7>
+ 3800011258U, // <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3>
+ 3807973938U, // <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5>
+ 3298716880U, // <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4>
+ 2224680246U, // <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+ 3800011576U, // <1,4,6,6>: Cost 4 vsldoi8 <7,0,1,4>, <6,6,6,6>
+ 2726269774U, // <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1>
+ 2224680489U, // <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS
+ 2726269948U, // <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4>
+ 3383444141U, // <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1>
+ 3805983961U, // <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0>
+ 3807974667U, // <1,4,7,3>: Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5>
+ 2736887142U, // <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6>
+ 3365528403U, // <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5>
+ 3800012308U, // <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0>
+ 3800012396U, // <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7>
+ 2731579012U, // <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4>
+ 2624241766U, // <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS
+ 2686457646U, // <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+ 2618271398U, // <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1>
+ 2734233544U, // <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4>
+ 2689775679U, // <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6>
+ 1152355638U, // <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS
+ 1683115561U, // <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 2736888076U, // <1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4>
+ 1683115579U, // <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 2687123456U, // <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0>
+ 1613381734U, // <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+ 3759538352U, // <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5>
+ 3760865532U, // <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0>
+ 1613381970U, // <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+ 2687787427U, // <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5>
+ 2781777524U, // <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1>
+ 3733828717U, // <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0>
+ 1613382301U, // <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+ 2781040271U, // <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1>
+ 2687124276U, // <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1>
+ 2687124374U, // <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0>
+ 3760866297U, // <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0>
+ 2693096491U, // <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5>
+ 2687124591U, // <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1>
+ 2687124723U, // <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7>
+ 3360834803U, // <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7>
+ 2687124860U, // <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0>
+ 2323598792U, // <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0>
+ 2687125027U, // <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5>
+ 2687125096U, // <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2>
+ 2687125158U, // <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1>
+ 2642185188U, // <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2>
+ 2323598554U, // <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5>
+ 2687125434U, // <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7>
+ 3373450483U, // <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7>
+ 2687125563U, // <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1>
+ 2687125654U, // <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2>
+ 2312990234U, // <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1>
+ 3760867649U, // <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2>
+ 2687125916U, // <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3>
+ 2687126018U, // <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6>
+ 3386731738U, // <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5>
+ 3356871170U, // <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6>
+ 3808643779U, // <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1>
+ 2687126302U, // <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2>
+ 2642198630U, // <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS
+ 2687126498U, // <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0>
+ 3715941923U, // <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5>
+ 3709970701U, // <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4>
+ 2687126736U, // <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, <4,4,4,4>
+ 1613385014U, // <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+ 2283801090U, // <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+ 3733861489U, // <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4>
+ 1613385257U, // <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+ 2624290918U, // <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS
+ 2624291676U, // <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5>
+ 3698034211U, // <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5>
+ 2284471211U, // <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3>
+ 2624294198U, // <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS
+ 2284471132U, // <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5>
+ 2284472834U, // <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6>
+ 2284471539U, // <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7>
+ 2284471216U, // <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,u>
+ 2785316900U, // <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1>
+ 2781040691U, // <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7>
+ 2734903802U, // <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3>
+ 3848736834U, // <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4>
+ 3298717620U, // <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6>
+ 3298717700U, // <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5>
+ 2734904120U, // <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6>
+ 2781040738U, // <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0>
+ 2781040747U, // <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0>
+ 2734904314U, // <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2>
+ 2315677210U, // <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1>
+ 3808646292U, // <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3>
+ 3808646371U, // <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1>
+ 2734904678U, // <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6>
+ 3389418714U, // <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5>
+ 3365528656U, // <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6>
+ 2734904940U, // <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7>
+ 2734904962U, // <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2>
+ 2687129299U, // <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2>
+ 1613387566U, // <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+ 2687129480U, // <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3>
+ 2687129532U, // <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1>
+ 1661163546U, // <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5>
+ 1613387930U, // <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+ 2687129808U, // <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7>
+ 2781040900U, // <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0>
+ 1613388133U, // <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+ 3759546368U, // <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0>
+ 2685804646U, // <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+ 2685804721U, // <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6>
+ 3861270834U, // <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1>
+ 3759546706U, // <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5>
+ 2687795620U, // <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6>
+ 2688459253U, // <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6>
+ 2283769142U, // <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+ 2685805213U, // <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+ 3698073702U, // <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS
+ 3759547188U, // <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1>
+ 2221314554U, // <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3>
+ 3759547401U, // <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7>
+ 3698076982U, // <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS
+ 3767510141U, // <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6>
+ 2334872376U, // <1,6,1,6>: Cost 3 vmrglw <u,u,1,1>, <6,6,6,6>
+ 1213353270U, // <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+ 1213353271U, // <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS
+ 3704053862U, // <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS
+ 3759547961U, // <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0>
+ 2222117370U, // <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3>
+ 3759548070U, // <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1>
+ 3704057142U, // <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS
+ 3373451057U, // <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, <2,4,6,5>
+ 2685806522U, // <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7>
+ 1225968950U, // <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+ 1225968951U, // <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS
+ 3759548566U, // <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2>
+ 3842912793U, // <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, <6,3,1,7>
+ 3759548774U, // <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3>
+ 3759548828U, // <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3>
+ 3759548930U, // <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6>
+ 3809315421U, // <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7>
+ 3386733368U, // <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6>
+ 2283130166U, // <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS
+ 2283130167U, // <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS
+ 3704070246U, // <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS
+ 3862229608U, // <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5>
+ 3704071741U, // <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4>
+ 3721988610U, // <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6>
+ 3704073526U, // <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS
+ 2685807926U, // <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+ 3865621141U, // <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5>
+ 2283801910U, // <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+ 2685808169U, // <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+ 3710050406U, // <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS
+ 3710051571U, // <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7>
+ 3405989597U, // <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2>
+ 3358214502U, // <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3>
+ 3710053686U, // <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS
+ 3721998025U, // <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5>
+ 2332250936U, // <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6>
+ 1210731830U, // <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+ 1210731831U, // <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS
+ 2791289597U, // <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1>
+ 3698115430U, // <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, <1,1,6,6>
+ 3698116538U, // <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7>
+ 3356894132U, // <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3>
+ 3698117942U, // <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS
+ 3722006218U, // <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6>
+ 2781041464U, // <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6>
+ 2283154742U, // <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+ 2283154743U, // <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS
+ 1718211406U, // <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1>
+ 2792026967U, // <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1>
+ 2765411170U, // <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3>
+ 3854783336U, // <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0>
+ 2781041526U, // <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5>
+ 3365528664U, // <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5>
+ 2791953290U, // <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7>
+ 2291789110U, // <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+ 1718801302U, // <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1>
+ 1718875039U, // <1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1>
+ 2685810478U, // <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+ 2792764337U, // <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1>
+ 3759552444U, // <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1>
+ 2781041607U, // <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5>
+ 2685810842U, // <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+ 2689792208U, // <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7>
+ 1210756406U, // <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+ 1210756407U, // <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS
+ 2793280496U, // <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1>
+ 2694439014U, // <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+ 3393343912U, // <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2>
+ 3397325306U, // <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3>
+ 2793575444U, // <1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1>
+ 3722030797U, // <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0>
+ 2688467446U, // <1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7>
+ 2689131079U, // <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7>
+ 2694439570U, // <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1>
+ 2654265354U, // <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1>
+ 2794017866U, // <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1>
+ 3768181639U, // <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3>
+ 2334872058U, // <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3>
+ 2654268726U, // <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS
+ 3792069797U, // <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1>
+ 2694440143U, // <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7>
+ 2334872386U, // <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7>
+ 2695767409U, // <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7>
+ 2654273638U, // <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+ 2222117973U, // <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3>
+ 2299711912U, // <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+ 2654275734U, // <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2>
+ 2654276918U, // <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS
+ 3385397675U, // <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5>
+ 2654278056U, // <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2>
+ 2323599627U, // <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7>
+ 2654279470U, // <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+ 2795271395U, // <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1>
+ 3768183059U, // <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1>
+ 3728025254U, // <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1>
+ 3768183196U, // <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3>
+ 3768183298U, // <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6>
+ 3792071255U, // <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1>
+ 3780127361U, // <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7>
+ 3847779617U, // <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0>
+ 2795861291U, // <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1>
+ 2795935028U, // <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1>
+ 3728032975U, // <1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7>
+ 3839153480U, // <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3>
+ 3397358074U, // <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3>
+ 3854783835U, // <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4>
+ 2694442294U, // <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+ 3786100058U, // <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7>
+ 3722065254U, // <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6>
+ 2694442537U, // <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+ 2654298214U, // <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS
+ 3854783893U, // <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u>
+ 3710126010U, // <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7>
+ 2332250618U, // <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3>
+ 2654301494U, // <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS
+ 2284474795U, // <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5>
+ 2718330931U, // <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7>
+ 2332250946U, // <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7>
+ 2719658197U, // <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7>
+ 2332921954U, // <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0>
+ 3768185254U, // <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0>
+ 3710134202U, // <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7>
+ 3710134561U, // <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6>
+ 3710135606U, // <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS
+ 3864884745U, // <1,7,6,5>: Cost 4 vsldoi12 <6,5,7,1>, <7,6,5,7>
+ 3854784017U, // <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6>
+ 2791953940U, // <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0>
+ 2792617501U, // <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0>
+ 2797925927U, // <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1>
+ 3365528426U, // <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1>
+ 3728058022U, // <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1>
+ 3365528509U, // <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3>
+ 3854784079U, // <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5>
+ 3722088148U, // <1,7,7,5>: Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7>
+ 3728060845U, // <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7>
+ 2781042284U, // <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7>
+ 2798515823U, // <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1>
+ 2654322705U, // <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u>
+ 2694444846U, // <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+ 2299711912U, // <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+ 2323649018U, // <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3>
+ 2654326070U, // <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS
+ 2694445210U, // <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+ 2654327214U, // <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u>
+ 2323649346U, // <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7>
+ 2694445413U, // <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+ 1610752017U, // <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u>
+ 1613406310U, // <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+ 2685821107U, // <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u>
+ 2283765916U, // <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+ 1613406549U, // <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u>
+ 1725880054U, // <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1>
+ 2688475639U, // <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u>
+ 2283769160U, // <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+ 1613406877U, // <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+ 1550221414U, // <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+ 269271142U, // <1,u,1,1>: Cost 1 vspltisw1 LHS
+ 1683117870U, // <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 1213350044U, // <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+ 1550224694U, // <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+ 1147574426U, // <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+ 2687149326U, // <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7>
+ 1213353288U, // <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+ 269271142U, // <1,u,1,u>: Cost 1 vspltisw1 LHS
+ 2222118611U, // <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2>
+ 1148376878U, // <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+ 1148371862U, // <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 1225965724U, // <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+ 2222118975U, // <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6>
+ 1148377242U, // <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+ 2687150010U, // <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7>
+ 1225968968U, // <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+ 1148377445U, // <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+ 471040156U, // <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+ 1544782644U, // <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 1544783464U, // <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+ 1544784022U, // <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+ 471043382U, // <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+ 1592561668U, // <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+ 1592562170U, // <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+ 1592562682U, // <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 471045934U, // <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS
+ 2708384629U, // <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, <4,0,1,u>
+ 2687151101U, // <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0>
+ 2223408022U, // <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+ 2283798684U, // <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+ 2642422785U, // <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4>
+ 1613409590U, // <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+ 2283801090U, // <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+ 2283801928U, // <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+ 1613409833U, // <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+ 2284471235U, // <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0>
+ 2284472046U, // <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1>
+ 2284472533U, // <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2>
+ 1210728604U, // <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+ 2284471239U, // <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4>
+ 1210728786U, // <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+ 1683118234U, // <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 1210731848U, // <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+ 1210728609U, // <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+ 2720330023U, // <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u>
+ 2757376190U, // <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7>
+ 2726302202U, // <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3>
+ 2283151516U, // <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+ 2224972114U, // <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+ 2224683162U, // <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+ 2726302520U, // <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6>
+ 2283154760U, // <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+ 2283151521U, // <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+ 1652560896U, // <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u>
+ 2333590225U, // <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1>
+ 2765412628U, // <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3>
+ 2291785884U, // <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+ 2781042984U, // <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5>
+ 3365527953U, // <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5>
+ 2791954748U, // <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7>
+ 2291789128U, // <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+ 1657869960U, // <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u>
+ 471081121U, // <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+ 269271142U, // <1,u,u,1>: Cost 1 vspltisw1 LHS
+ 1544824424U, // <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+ 1544824982U, // <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+ 471084342U, // <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+ 1613412506U, // <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+ 1683118477U, // <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 1210756424U, // <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+ 471086894U, // <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS
+ 2226757632U, // <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0>
+ 2226757734U, // <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS
+ 3826622483U, // <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1>
+ 3843211292U, // <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1>
+ 3300499794U, // <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, <0,4,1,5>
+ 3356256724U, // <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5>
+ 3825664056U, // <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2>
+ 3762889289U, // <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0>
+ 2226758301U, // <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS
+ 2227429386U, // <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1>
+ 2227429478U, // <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS
+ 1691156582U, // <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+ 2666358997U, // <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2>
+ 2227462482U, // <2,0,1,4>: Cost 3 vmrghw <2,1,3,5>, <0,4,1,5>
+ 3722186464U, // <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1>
+ 3867099278U, // <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7>
+ 3366881912U, // <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7>
+ 1691156636U, // <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+ 2228027392U, // <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0>
+ 1154285670U, // <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+ 2228027565U, // <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2>
+ 3301769468U, // <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0>
+ 2228027730U, // <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5>
+ 3301769635U, // <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5>
+ 3780806586U, // <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7>
+ 3368880760U, // <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7>
+ 1154286237U, // <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS
+ 1213440000U, // <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+ 1213441702U, // <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+ 2228535470U, // <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3>
+ 2636515632U, // <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3>
+ 2287182962U, // <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4>
+ 2660405346U, // <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0>
+ 2228535798U, // <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+ 2660406420U, // <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3>
+ 1213441709U, // <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+ 3368894464U, // <2,0,4,0>: Cost 4 vmrglw <2,2,2,4>, <0,0,0,0>
+ 2764898642U, // <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5>
+ 3826622811U, // <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5>
+ 3843211620U, // <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5>
+ 3838640493U, // <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5>
+ 2732944694U, // <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS
+ 3797396857U, // <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2>
+ 3867099528U, // <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5>
+ 2764898705U, // <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5>
+ 3364257792U, // <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0>
+ 2230124646U, // <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+ 3304235184U, // <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5>
+ 3364260144U, // <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3>
+ 3303817554U, // <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5>
+ 3364260146U, // <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5>
+ 3867099602U, // <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7>
+ 3364260472U, // <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7>
+ 2230125213U, // <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS
+ 2230796288U, // <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0>
+ 1157054566U, // <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+ 2230796465U, // <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6>
+ 3304538364U, // <2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0>
+ 2230796626U, // <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5>
+ 3797398205U, // <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0>
+ 3304538614U, // <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7>
+ 3798725471U, // <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0>
+ 1157055133U, // <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+ 3371573248U, // <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0>
+ 2231189606U, // <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS
+ 3801380003U, // <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0>
+ 3802043636U, // <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0>
+ 3806688614U, // <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6>
+ 3356317308U, // <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5>
+ 3804034535U, // <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, <7,6,2,0>
+ 3806688876U, // <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7>
+ 2231190173U, // <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS
+ 1208836096U, // <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+ 1208837798U, // <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+ 1691157149U, // <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+ 2636556597U, // <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u>
+ 2282579625U, // <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+ 2660446306U, // <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0>
+ 2228535798U, // <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+ 2660447385U, // <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u>
+ 1208837805U, // <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+ 3692388523U, // <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0>
+ 2757526244U, // <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2>
+ 2330290974U, // <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2>
+ 3843212020U, // <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0>
+ 3692391734U, // <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS
+ 3300533362U, // <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4>
+ 3794084337U, // <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2>
+ 3374170614U, // <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7>
+ 2758042403U, // <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2>
+ 2690482924U, // <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1>
+ 2764899124U, // <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1>
+ 2695791510U, // <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0>
+ 3362235271U, // <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3>
+ 3692399926U, // <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS
+ 3832226649U, // <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2>
+ 3301205235U, // <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7>
+ 3768870179U, // <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1>
+ 2695791988U, // <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1>
+ 2618663085U, // <2,1,2,0>: Cost 3 vsldoi4 <0,2,1,2>, <0,2,1,2>
+ 2228028212U, // <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1>
+ 2618664552U, // <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2>
+ 2759000984U, // <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2>
+ 2618666294U, // <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS
+ 2295136594U, // <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5>
+ 3769534376U, // <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7>
+ 2793358266U, // <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0>
+ 2618668846U, // <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS
+ 2282536969U, // <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+ 1208795146U, // <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+ 1213442198U, // <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+ 2287181998U, // <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+ 2618674486U, // <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS
+ 1208795474U, // <2,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+ 2287182001U, // <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+ 2287183055U, // <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+ 1208795153U, // <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+ 3692421295U, // <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4>
+ 3838641195U, // <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5>
+ 2330323742U, // <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2>
+ 3692423318U, // <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2>
+ 3692424502U, // <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS
+ 2695793974U, // <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+ 3799395705U, // <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2>
+ 3368895695U, // <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7>
+ 2695794217U, // <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+ 3692429488U, // <2,1,5,0>: Cost 4 vsldoi4 <0,2,1,5>, <0,2,1,5>
+ 3364257802U, // <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1>
+ 3692431253U, // <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6>
+ 3692431874U, // <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6>
+ 3692432694U, // <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS
+ 3364258130U, // <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, <0,4,1,5>
+ 3303875827U, // <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7>
+ 3867100333U, // <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0>
+ 3692435246U, // <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS
+ 2618695857U, // <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6>
+ 2230797108U, // <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1>
+ 2618697658U, // <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7>
+ 3692439702U, // <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2>
+ 2618699062U, // <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS
+ 3364929874U, // <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5>
+ 3692442424U, // <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6>
+ 3798733664U, // <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1>
+ 2618701614U, // <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS
+ 3799397370U, // <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2>
+ 3371573258U, // <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1>
+ 2330351234U, // <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+ 3799397658U, // <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2>
+ 3799397734U, // <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6>
+ 3371573586U, // <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5>
+ 3799397870U, // <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7>
+ 3799397956U, // <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3>
+ 2330351234U, // <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+ 2282577929U, // <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+ 1208836106U, // <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+ 1208838294U, // <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+ 2282578094U, // <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+ 2282577933U, // <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+ 1208836434U, // <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+ 2282578097U, // <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+ 2287224015U, // <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+ 1208836113U, // <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+ 2226759117U, // <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0>
+ 1624047718U, // <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+ 2697789613U, // <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2>
+ 2226767526U, // <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1>
+ 2697789778U, // <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5>
+ 3300657000U, // <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6>
+ 2226988986U, // <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7>
+ 3734271139U, // <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0>
+ 1624048285U, // <2,2,0,u>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+ 3831268868U, // <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1>
+ 2293138804U, // <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1>
+ 2697790358U, // <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0>
+ 2293137510U, // <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+ 3771532331U, // <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5>
+ 3767551106U, // <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2>
+ 3301173178U, // <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7>
+ 3372853169U, // <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7>
+ 2293137515U, // <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS
+ 1556938854U, // <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+ 2295137733U, // <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1>
+ 336380006U, // <2,2,2,2>: Cost 1 vspltisw2 LHS
+ 1221394534U, // <2,2,2,3>: Cost 2 vmrglw <2,2,2,2>, LHS
+ 1556942134U, // <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+ 2295138061U, // <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+ 2228029370U, // <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7>
+ 2660545701U, // <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2>
+ 336380006U, // <2,2,2,u>: Cost 1 vspltisw2 LHS
+ 2697791638U, // <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2>
+ 2765489840U, // <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2>
+ 1213441640U, // <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+ 135053414U, // <2,2,3,3>: Cost 1 vmrglw LHS, LHS
+ 2697792002U, // <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6>
+ 2330313780U, // <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5>
+ 2287183549U, // <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6>
+ 2660553894U, // <2,2,3,7>: Cost 3 vsldoi4 <7,2,2,3>, <7,2,2,3>
+ 135053419U, // <2,2,3,u>: Cost 1 vmrglw LHS, LHS
+ 2630697062U, // <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS
+ 3771534282U, // <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3>
+ 2764900109U, // <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5>
+ 2295152742U, // <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS
+ 2295154282U, // <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4>
+ 1624050998U, // <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+ 2229675962U, // <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7>
+ 3368896433U, // <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7>
+ 1624051241U, // <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+ 3771534920U, // <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2>
+ 3364258540U, // <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1>
+ 2296489576U, // <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2>
+ 2290516070U, // <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+ 3771535284U, // <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6>
+ 2290517044U, // <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5>
+ 2697793634U, // <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0>
+ 3370231729U, // <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7>
+ 2290516075U, // <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS
+ 2230797801U, // <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1>
+ 3304539679U, // <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1>
+ 2764900273U, // <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7>
+ 2764900282U, // <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7>
+ 2230798129U, // <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5>
+ 3304540008U, // <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6>
+ 1157056442U, // <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+ 2725000033U, // <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2>
+ 1157056442U, // <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+ 2793359338U, // <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1>
+ 3371574725U, // <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1>
+ 2297833064U, // <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2>
+ 2297831526U, // <2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+ 2697794918U, // <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6>
+ 3371575053U, // <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5>
+ 3304933297U, // <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7>
+ 2297833393U, // <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7>
+ 2297831531U, // <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS
+ 1556938854U, // <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+ 1624053550U, // <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+ 336380006U, // <2,2,u,2>: Cost 1 vspltisw2 LHS
+ 135094374U, // <2,2,u,3>: Cost 1 vmrglw LHS, LHS
+ 1556942134U, // <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+ 1624053914U, // <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+ 1157056442U, // <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+ 2660594859U, // <2,2,u,7>: Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u>
+ 135094379U, // <2,2,u,u>: Cost 1 vmrglw LHS, LHS
+ 1611448320U, // <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+ 537706598U, // <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+ 2689835181U, // <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+ 2689835260U, // <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+ 1611448658U, // <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+ 2732966354U, // <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+ 2732966390U, // <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+ 2660603052U, // <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0>
+ 537707165U, // <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+ 2689835748U, // <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+ 1611449140U, // <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+ 1611449238U, // <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+ 3763577805U, // <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1>
+ 2689836112U, // <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+ 2689836143U, // <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+ 2689836239U, // <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+ 3366881210U, // <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7>
+ 1616094588U, // <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+ 2689836493U, // <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0>
+ 2685191711U, // <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+ 1611449960U, // <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+ 1611450022U, // <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+ 2689836822U, // <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5>
+ 2689836904U, // <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6>
+ 1611450298U, // <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+ 2295138234U, // <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7>
+ 1611450456U, // <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3>
+ 1213440918U, // <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+ 2282538527U, // <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1>
+ 1557022322U, // <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3>
+ 1208796786U, // <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+ 1213440922U, // <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+ 2282538531U, // <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+ 2287188094U, // <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+ 1213441978U, // <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+ 1208796791U, // <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u>
+ 1551056998U, // <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS
+ 1551057818U, // <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4>
+ 2624800360U, // <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2>
+ 2624800918U, // <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2>
+ 1551060278U, // <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS
+ 537709878U, // <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+ 2732969337U, // <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+ 2660635824U, // <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4>
+ 537710121U, // <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+ 2689838664U, // <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+ 2732969615U, // <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+ 2732969707U, // <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3>
+ 3763580721U, // <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1>
+ 2689839028U, // <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+ 1659228164U, // <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+ 1659228258U, // <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+ 3364259770U, // <2,3,5,7>: Cost 4 vmrglw <1,4,2,5>, <2,6,3,7>
+ 1659228420U, // <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+ 2230798486U, // <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2>
+ 2732970407U, // <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+ 1659228666U, // <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+ 2230798748U, // <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3>
+ 2230798850U, // <2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6>
+ 2732970731U, // <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+ 1659228984U, // <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+ 1659229006U, // <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+ 1659229087U, // <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1>
+ 1659229178U, // <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+ 2726999125U, // <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3>
+ 2727662758U, // <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3>
+ 2732971235U, // <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1>
+ 1659229542U, // <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+ 2732971446U, // <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+ 2732971484U, // <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7>
+ 1659229804U, // <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+ 1659229826U, // <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+ 1208837014U, // <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+ 537712430U, // <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+ 1616099205U, // <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0>
+ 1208837746U, // <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+ 1208837018U, // <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+ 537712794U, // <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+ 1616099536U, // <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+ 1208838074U, // <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+ 537712997U, // <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+ 3771547648U, // <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0>
+ 2697805926U, // <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS
+ 3770884269U, // <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2>
+ 3806716164U, // <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u>
+ 3771547986U, // <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, <0,4,1,5>
+ 2226761014U, // <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+ 3853462427U, // <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1>
+ 3867102116U, // <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1>
+ 2226761257U, // <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS
+ 3849186231U, // <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2>
+ 3301207010U, // <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0>
+ 3766240150U, // <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0>
+ 3766240226U, // <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4>
+ 3301207248U, // <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4>
+ 2227432758U, // <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS
+ 3758941400U, // <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7>
+ 3768894758U, // <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4>
+ 2227433001U, // <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS
+ 2228030354U, // <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1>
+ 3770885657U, // <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4>
+ 2697807466U, // <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4>
+ 3368880468U, // <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3>
+ 2228030672U, // <2,4,2,4>: Cost 3 vmrghw <2,2,2,2>, <4,4,4,4>
+ 1154288950U, // <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+ 3771549617U, // <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7>
+ 3368880796U, // <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7>
+ 1154289193U, // <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS
+ 2636808294U, // <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS
+ 2287181861U, // <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1>
+ 2228866102U, // <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3>
+ 2636810580U, // <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3>
+ 1256574160U, // <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+ 1213441742U, // <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+ 2228866430U, // <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7>
+ 2660701368U, // <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3>
+ 1213441745U, // <2,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+ 3704586342U, // <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, LHS
+ 3782831051U, // <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4>
+ 3704587900U, // <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4>
+ 3368896123U, // <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3>
+ 2793360592U, // <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4>
+ 2697809206U, // <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+ 3303198078U, // <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7>
+ 3867102444U, // <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5>
+ 2697809449U, // <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+ 2630852710U, // <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS
+ 2624881572U, // <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5>
+ 2630854269U, // <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5>
+ 2666686677U, // <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2>
+ 2630855990U, // <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS
+ 2230127926U, // <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS
+ 1691159862U, // <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 3867102520U, // <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0>
+ 1691159880U, // <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 2230799250U, // <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1>
+ 3304541130U, // <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3>
+ 2230799417U, // <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6>
+ 3304541323U, // <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7>
+ 2230799568U, // <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4>
+ 1157057846U, // <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+ 3304541566U, // <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7>
+ 3798758243U, // <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4>
+ 1157058089U, // <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS
+ 3806721018U, // <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2>
+ 3853831590U, // <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2>
+ 3801412775U, // <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4>
+ 3802076408U, // <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4>
+ 3401436368U, // <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4>
+ 2793360840U, // <2,4,7,5>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,5,0>
+ 3804067307U, // <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4>
+ 3867102682U, // <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0>
+ 2793360867U, // <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0>
+ 2630877286U, // <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS
+ 2282580144U, // <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+ 2630878848U, // <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u>
+ 2636851545U, // <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u>
+ 1256615120U, // <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+ 1208837838U, // <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+ 1691160105U, // <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 2660742333U, // <2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u>
+ 1208837841U, // <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+ 3766910976U, // <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0>
+ 2693169254U, // <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+ 3760939181U, // <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2>
+ 3843214936U, // <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0>
+ 3760939355U, // <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5>
+ 3867102827U, // <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1>
+ 3867102836U, // <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1>
+ 3867102844U, // <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0>
+ 2693169821U, // <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+ 3766911724U, // <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1>
+ 3766911796U, // <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, <1,1,1,1>
+ 2693170070U, // <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0>
+ 3384798262U, // <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3>
+ 2693170228U, // <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5>
+ 3301208068U, // <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5>
+ 3366879607U, // <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6>
+ 3867102925U, // <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0>
+ 2695824760U, // <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5>
+ 2642845798U, // <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS
+ 2295139218U, // <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1>
+ 2699142760U, // <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2>
+ 3766912678U, // <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1>
+ 2699142925U, // <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5>
+ 2228031492U, // <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5>
+ 2295138818U, // <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6>
+ 3368879347U, // <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7>
+ 2295138820U, // <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u>
+ 2287184866U, // <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+ 1256573842U, // <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+ 2642855630U, // <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5>
+ 2287182763U, // <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+ 2287184870U, // <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+ 1256574170U, // <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+ 1213442562U, // <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+ 2287183091U, // <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+ 1213442564U, // <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+ 3716604006U, // <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS
+ 3716604822U, // <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0>
+ 3766914099U, // <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0>
+ 3368895403U, // <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3>
+ 3716607031U, // <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4>
+ 2693172534U, // <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+ 3363588610U, // <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6>
+ 3368895731U, // <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7>
+ 2693172777U, // <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+ 3704668262U, // <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS
+ 3704669078U, // <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0>
+ 3704669830U, // <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5>
+ 3364259460U, // <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3>
+ 3704671542U, // <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS
+ 2793361412U, // <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+ 3364258167U, // <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6>
+ 3867103249U, // <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0>
+ 2793361412U, // <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+ 2642878566U, // <2,5,6,0>: Cost 3 vsldoi4 <4,2,5,6>, LHS
+ 3386166810U, // <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1>
+ 2723033594U, // <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3>
+ 3848523842U, // <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4>
+ 2723033713U, // <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5>
+ 2230800388U, // <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5>
+ 2230800482U, // <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0>
+ 2785841252U, // <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2>
+ 2785914989U, // <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2>
+ 3796775930U, // <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2>
+ 3800757335U, // <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5>
+ 3853463689U, // <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3>
+ 3796776218U, // <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2>
+ 3796776294U, // <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6>
+ 3803411867U, // <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5>
+ 3371575081U, // <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6>
+ 3796776516U, // <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3>
+ 3371575083U, // <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u>
+ 2287225826U, // <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+ 1256614802U, // <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+ 2642896590U, // <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5>
+ 2287223723U, // <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+ 2287225830U, // <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+ 1256615130U, // <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+ 1208838658U, // <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+ 2287224051U, // <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+ 1208838660U, // <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+ 3772227584U, // <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0>
+ 2698485862U, // <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+ 3759620282U, // <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6>
+ 3710675299U, // <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0>
+ 3767583058U, // <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5>
+ 3378153265U, // <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5>
+ 3865186637U, // <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1>
+ 2330291510U, // <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS
+ 2698486429U, // <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+ 3734569062U, // <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS
+ 3764929346U, // <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6>
+ 3772228502U, // <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0>
+ 3734571158U, // <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2>
+ 3734572342U, // <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS
+ 3767583878U, // <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6>
+ 3768247511U, // <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6>
+ 2293140790U, // <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+ 2293140791U, // <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS
+ 3704717414U, // <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS
+ 3395424589U, // <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1>
+ 2228031993U, // <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2>
+ 2698487485U, // <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6>
+ 3704720694U, // <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS
+ 3773556575U, // <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6>
+ 2698487738U, // <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7>
+ 1221397814U, // <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+ 1221397815U, // <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS
+ 2636955750U, // <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS
+ 2330314217U, // <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+ 2636957626U, // <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7>
+ 2287184230U, // <2,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+ 2636959030U, // <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS
+ 2648903448U, // <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3>
+ 1256575800U, // <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+ 135056694U, // <2,6,3,7>: Cost 1 vmrglw LHS, RHS
+ 135056695U, // <2,6,3,u>: Cost 1 vmrglw LHS, RHS
+ 3710705766U, // <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS
+ 3698762677U, // <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4>
+ 3710707389U, // <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6>
+ 3710708071U, // <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4>
+ 3710709046U, // <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS
+ 2698489142U, // <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+ 3796782457U, // <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2>
+ 2295156022U, // <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+ 2295156023U, // <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS
+ 3303870753U, // <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2>
+ 3788820134U, // <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6>
+ 3779530520U, // <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3>
+ 3303871026U, // <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5>
+ 3303871117U, // <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6>
+ 3791474666U, // <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6>
+ 3792138299U, // <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6>
+ 2290519350U, // <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+ 2290519351U, // <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS
+ 2631008358U, // <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS
+ 3372893673U, // <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1>
+ 2791445264U, // <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2>
+ 2230800968U, // <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0>
+ 2631011638U, // <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS
+ 3372894001U, // <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5>
+ 2793362232U, // <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6>
+ 2295835958U, // <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS
+ 2295835959U, // <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS
+ 2793362254U, // <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1>
+ 2792035160U, // <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2>
+ 2792108897U, // <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2>
+ 2769474408U, // <2,6,7,3>: Cost 3 vsldoi12 <3,0,1,2>, <6,7,3,0>
+ 2793362294U, // <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5>
+ 3371575089U, // <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5>
+ 2792403845U, // <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2>
+ 2297834806U, // <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS
+ 2297834807U, // <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS
+ 2636996710U, // <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS
+ 2698491694U, // <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+ 2636998631U, // <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7>
+ 2282580326U, // <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+ 2636999990U, // <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS
+ 2698492058U, // <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+ 1256616760U, // <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+ 135097654U, // <2,6,u,7>: Cost 1 vmrglw LHS, RHS
+ 135097655U, // <2,6,u,u>: Cost 1 vmrglw LHS, RHS
+ 2666864742U, // <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS
+ 1719620602U, // <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2>
+ 3768254637U, // <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2>
+ 3393417722U, // <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3>
+ 2666868022U, // <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS
+ 3867104290U, // <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6>
+ 3728667127U, // <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0>
+ 2666869817U, // <2,7,0,7>: Cost 3 vsldoi4 <u,2,7,0>, <7,0,u,2>
+ 1720136761U, // <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2>
+ 3728670822U, // <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS
+ 3774227252U, // <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1>
+ 3774227350U, // <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0>
+ 2323001850U, // <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+ 3728674102U, // <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS
+ 3774227567U, // <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1>
+ 2694513880U, // <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7>
+ 3396744002U, // <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7>
+ 2323001850U, // <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+ 2654937190U, // <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS
+ 3728679732U, // <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1>
+ 2700486248U, // <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2>
+ 2321682938U, // <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3>
+ 2654940470U, // <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS
+ 3859584196U, // <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6>
+ 2700486577U, // <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7>
+ 2228033132U, // <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7>
+ 2701813843U, // <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7>
+ 1581203558U, // <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+ 2654946100U, // <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1>
+ 2637031354U, // <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7>
+ 1256575482U, // <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+ 1581206838U, // <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS
+ 2654949380U, // <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5>
+ 1581208058U, // <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3>
+ 1256575810U, // <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+ 1581209390U, // <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+ 3728695398U, // <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS
+ 3869758782U, // <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2>
+ 3728696936U, // <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2>
+ 3393450490U, // <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3>
+ 3728698678U, // <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS
+ 2700487990U, // <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+ 3728699899U, // <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4>
+ 3867104626U, // <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0>
+ 2700488233U, // <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+ 3855160709U, // <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1>
+ 3728704406U, // <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0>
+ 3370233956U, // <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2>
+ 2320380410U, // <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+ 3728706870U, // <2,7,5,4>: Cost 4 vsldoi4 <6,2,7,5>, RHS
+ 3867104694U, // <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5>
+ 3792146492U, // <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7>
+ 3394122562U, // <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7>
+ 2320380410U, // <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+ 2230801402U, // <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2>
+ 3768258984U, // <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2>
+ 2730349050U, // <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+ 3372894575U, // <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3>
+ 2230801766U, // <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6>
+ 3304543670U, // <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5>
+ 3728716285U, // <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6>
+ 2230802028U, // <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7>
+ 2730349050U, // <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+ 2793362983U, // <2,7,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,0,1>
+ 3728721112U, // <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7>
+ 3371574933U, // <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2>
+ 2327695866U, // <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3>
+ 3728723254U, // <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS
+ 3371574855U, // <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5>
+ 2730350062U, // <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7>
+ 2793363052U, // <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7>
+ 2798671471U, // <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1>
+ 1581244518U, // <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+ 1724929666U, // <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2>
+ 2637072314U, // <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7>
+ 1256616442U, // <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+ 1581247798U, // <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS
+ 2700490906U, // <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+ 1581249023U, // <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u>
+ 1256616770U, // <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+ 1581250350U, // <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+ 1611489280U, // <2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+ 537747563U, // <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+ 2685231277U, // <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+ 2685231356U, // <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+ 1611489618U, // <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+ 2226763930U, // <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+ 2733007350U, // <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+ 2660971737U, // <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0>
+ 537748125U, // <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+ 2689876708U, // <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+ 1611490100U, // <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+ 1611490198U, // <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+ 2293137564U, // <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+ 2689877072U, // <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+ 2689877103U, // <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+ 2689877199U, // <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+ 2293140808U, // <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+ 1616135548U, // <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+ 1556938854U, // <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+ 1154291502U, // <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+ 336380006U, // <2,u,2,2>: Cost 1 vspltisw2 LHS
+ 1611490982U, // <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+ 1556942134U, // <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+ 1154291866U, // <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+ 1611491258U, // <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+ 1221397832U, // <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+ 336380006U, // <2,u,2,u>: Cost 1 vspltisw2 LHS
+ 1611491478U, // <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2>
+ 1213440073U, // <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1>
+ 1213442261U, // <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+ 135053468U, // <2,u,3,3>: Cost 1 vmrglw LHS, LHS
+ 1611491842U, // <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6>
+ 1213440401U, // <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5>
+ 1213442589U, // <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+ 135056712U, // <2,u,3,7>: Cost 1 vmrglw LHS, RHS
+ 135053473U, // <2,u,3,u>: Cost 1 vmrglw LHS, LHS
+ 1551425638U, // <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS
+ 1551426503U, // <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4>
+ 2625169000U, // <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2>
+ 2625169558U, // <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2>
+ 1551428918U, // <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS
+ 537750838U, // <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+ 2733010297U, // <2,u,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+ 2295156040U, // <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+ 537751081U, // <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+ 2689879624U, // <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+ 2230130478U, // <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+ 2631149217U, // <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5>
+ 2290516124U, // <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+ 2689879988U, // <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+ 1659269124U, // <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+ 1691162778U, // <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 2290519368U, // <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+ 1691162796U, // <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 2230802131U, // <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2>
+ 1157060398U, // <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+ 1659269626U, // <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+ 2764904656U, // <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7>
+ 2230802495U, // <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6>
+ 1157060762U, // <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+ 1659269944U, // <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+ 1659269966U, // <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+ 1157060965U, // <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+ 1659270138U, // <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+ 2727040090U, // <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u>
+ 2727703723U, // <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u>
+ 2297831580U, // <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+ 1659270502U, // <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+ 2733012406U, // <2,u,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+ 2730358255U, // <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u>
+ 1659270764U, // <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+ 1659270786U, // <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+ 1213481923U, // <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0>
+ 537753390U, // <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+ 336380006U, // <2,u,u,2>: Cost 1 vspltisw2 LHS
+ 135094428U, // <2,u,u,3>: Cost 1 vmrglw LHS, LHS
+ 1213481927U, // <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4>
+ 537753754U, // <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+ 1208838685U, // <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+ 135097672U, // <2,u,u,7>: Cost 1 vmrglw LHS, RHS
+ 135094433U, // <2,u,u,u>: Cost 1 vmrglw LHS, LHS
+ 1678557184U, // <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+ 1678557194U, // <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+ 2631181989U, // <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0>
+ 2289223984U, // <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3>
+ 2756943909U, // <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1>
+ 3362965729U, // <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5>
+ 3362966054U, // <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6>
+ 2289224312U, // <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7>
+ 1683202121U, // <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1>
+ 1557446758U, // <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS
+ 2752741467U, // <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1>
+ 604815462U, // <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+ 2631190676U, // <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0>
+ 1557450038U, // <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS
+ 2667024388U, // <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5>
+ 2800074894U, // <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7>
+ 2661053667U, // <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1>
+ 604815516U, // <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+ 2696521165U, // <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0>
+ 2752741549U, // <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2>
+ 2691876456U, // <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2>
+ 2691876518U, // <3,0,2,3>: Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1>
+ 3830685895U, // <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1>
+ 3765618536U, // <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6>
+ 2691876794U, // <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7>
+ 2701166596U, // <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0>
+ 2756944108U, // <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2>
+ 2691877014U, // <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2>
+ 1161003110U, // <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+ 2691877168U, // <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3>
+ 2691877246U, // <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0>
+ 2691877378U, // <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6>
+ 3765619238U, // <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6>
+ 2691877496U, // <3,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+ 3368962680U, // <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7>
+ 1161003677U, // <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS
+ 2289254400U, // <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0>
+ 1678557522U, // <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+ 2631214761U, // <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4>
+ 2235580672U, // <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+ 2756944237U, // <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5>
+ 1618136374U, // <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+ 3309322742U, // <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7>
+ 3362998904U, // <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7>
+ 1683202449U, // <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+ 3765620296U, // <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2>
+ 2752299427U, // <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5>
+ 3789508346U, // <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0>
+ 3403486842U, // <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3>
+ 3765620660U, // <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6>
+ 2733682692U, // <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5>
+ 2800075218U, // <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7>
+ 3873817044U, // <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0>
+ 2800075234U, // <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5>
+ 2752299501U, // <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7>
+ 2236547174U, // <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS
+ 2733683194U, // <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3>
+ 3844473352U, // <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7>
+ 3310289234U, // <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5>
+ 3873817114U, // <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7>
+ 2733683512U, // <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6>
+ 2725057384U, // <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0>
+ 2236547741U, // <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS
+ 2297905152U, // <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0>
+ 2297906854U, // <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1>
+ 2727711916U, // <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0>
+ 3371649328U, // <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3>
+ 2733684070U, // <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6>
+ 3734843490U, // <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0>
+ 3798799895U, // <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3>
+ 2733684332U, // <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7>
+ 2297906861U, // <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u>
+ 1557504102U, // <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS
+ 1678557842U, // <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1>
+ 604816029U, // <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+ 2691880892U, // <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1>
+ 1557507382U, // <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS
+ 1618139290U, // <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+ 2691881168U, // <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7>
+ 2661111018U, // <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u>
+ 604816083U, // <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+ 2619310332U, // <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0>
+ 2756944612U, // <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2>
+ 2289221724U, // <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2>
+ 2619312278U, // <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2>
+ 2619313462U, // <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS
+ 2289221970U, // <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5>
+ 2232599768U, // <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7>
+ 3362964687U, // <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7>
+ 2619316014U, // <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS
+ 2756944683U, // <3,1,1,0>: Cost 3 vsldoi12 LHS, <1,1,0,1>
+ 1678558004U, // <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+ 2691883927U, // <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1>
+ 3826631496U, // <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3>
+ 2756944723U, // <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5>
+ 2756944732U, // <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+ 3830686561U, // <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1>
+ 3734869228U, // <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1>
+ 1678558004U, // <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+ 2696529358U, // <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1>
+ 2756944775U, // <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+ 2294548630U, // <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2>
+ 1678558102U, // <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0>
+ 2631273782U, // <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS
+ 2756944811U, // <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+ 3830686644U, // <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3>
+ 2800075706U, // <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0>
+ 1679000515U, // <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0>
+ 2619334911U, // <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3>
+ 2295218186U, // <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1>
+ 2293229718U, // <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2>
+ 2619337116U, // <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3>
+ 2619338038U, // <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS
+ 2295218514U, // <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5>
+ 3830686729U, // <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7>
+ 3368961231U, // <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7>
+ 2619340590U, // <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS
+ 2619343104U, // <3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4>
+ 2289254410U, // <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1>
+ 2289256598U, // <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2>
+ 2619345410U, // <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6>
+ 2619346230U, // <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS
+ 2756944976U, // <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6>
+ 3362996401U, // <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6>
+ 3362997455U, // <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7>
+ 2619348782U, // <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS
+ 2756945007U, // <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1>
+ 3830686840U, // <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1>
+ 3358361750U, // <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2>
+ 3830686857U, // <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0>
+ 2756945047U, // <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5>
+ 2294571346U, // <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5>
+ 3806105698U, // <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0>
+ 3873817774U, // <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1>
+ 2756945079U, // <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1>
+ 3830686912U, // <3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1>
+ 2756945103U, // <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+ 2236547990U, // <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0>
+ 3826631905U, // <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7>
+ 3830686952U, // <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5>
+ 2756945139U, // <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+ 3830686972U, // <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7>
+ 2800076030U, // <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0>
+ 2756945166U, // <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7>
+ 3699081318U, // <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS
+ 2297905162U, // <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1>
+ 2297907350U, // <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2>
+ 3365675182U, // <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3>
+ 3699084598U, // <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS
+ 2297905490U, // <3,1,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,1,5>
+ 2297905329U, // <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+ 3368330447U, // <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7>
+ 2297905169U, // <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u>
+ 2619375876U, // <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u>
+ 1678558004U, // <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+ 2289289366U, // <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2>
+ 1679000956U, // <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0>
+ 2619378998U, // <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS
+ 2756945297U, // <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3>
+ 2297905329U, // <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+ 2800076192U, // <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0>
+ 1683203497U, // <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0>
+ 3362964203U, // <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0>
+ 2289222380U, // <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1>
+ 2289222462U, // <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2>
+ 1215479910U, // <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+ 3362964207U, // <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4>
+ 2289222708U, // <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+ 2232600506U, // <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7>
+ 3396142296U, // <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7>
+ 1215479915U, // <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS
+ 3699105894U, // <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS
+ 3765633844U, // <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1>
+ 2691892120U, // <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2>
+ 2752300575U, // <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1>
+ 3699109174U, // <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS
+ 3830687280U, // <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0>
+ 3830687289U, // <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0>
+ 3874260548U, // <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2>
+ 2752742988U, // <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1>
+ 2631344230U, // <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS
+ 2697201184U, // <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, <2,1,3,2>
+ 1678558824U, // <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+ 1678558834U, // <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+ 2631347510U, // <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS
+ 3368953613U, // <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5>
+ 2234304442U, // <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7>
+ 3368953777U, // <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7>
+ 1679001247U, // <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3>
+ 1678558886U, // <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1>
+ 2752300719U, // <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1>
+ 2752300729U, // <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2>
+ 1221476454U, // <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS
+ 1678558926U, // <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5>
+ 2800076503U, // <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5>
+ 2234746810U, // <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7>
+ 2800076516U, // <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0>
+ 1678558958U, // <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1>
+ 3699130470U, // <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS
+ 3362996972U, // <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1>
+ 2289256040U, // <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2>
+ 1215512678U, // <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+ 3362998676U, // <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4>
+ 2691894582U, // <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+ 2235582394U, // <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7>
+ 3734967544U, // <3,2,4,7>: Cost 4 vsldoi4 <7,3,2,4>, <7,3,2,4>
+ 1215512683U, // <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS
+ 3705110630U, // <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS
+ 3368313985U, // <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1>
+ 3368314472U, // <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2>
+ 2756945768U, // <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6>
+ 3705113910U, // <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS
+ 3310061416U, // <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6>
+ 3310135226U, // <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7>
+ 3370305457U, // <3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7>
+ 2752743317U, // <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6>
+ 2631376998U, // <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS
+ 3705119540U, // <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1>
+ 2631378621U, // <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6>
+ 1678559162U, // <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+ 2631380278U, // <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS
+ 3370976956U, // <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5>
+ 2237065146U, // <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7>
+ 3798815594U, // <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2>
+ 1679001575U, // <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+ 2800076778U, // <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1>
+ 3371647724U, // <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1>
+ 2297906792U, // <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2>
+ 1224163430U, // <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+ 3705130294U, // <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS
+ 3371648052U, // <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5>
+ 2297906877U, // <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6>
+ 3371648702U, // <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7>
+ 1224163435U, // <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+ 1679001659U, // <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1>
+ 2752743492U, // <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1>
+ 1678558824U, // <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+ 1678559320U, // <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3>
+ 1679001699U, // <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5>
+ 2691897498U, // <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+ 2237908922U, // <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7>
+ 2800519289U, // <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0>
+ 1679001731U, // <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1>
+ 1215480726U, // <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+ 1678559382U, // <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2>
+ 2631403200U, // <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0>
+ 2289223282U, // <3,3,0,3>: Cost 3 vmrglw <1,2,3,0>, <2,2,3,3>
+ 2752301232U, // <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1>
+ 3362965027U, // <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5>
+ 3362965352U, // <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6>
+ 2289223610U, // <3,3,0,7>: Cost 3 vmrglw <1,2,3,0>, <2,6,3,7>
+ 1678559445U, // <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2>
+ 3830687964U, // <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0>
+ 2752301286U, // <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1>
+ 2752301297U, // <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3>
+ 2305157532U, // <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3>
+ 3830688000U, // <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0>
+ 3830688009U, // <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0>
+ 3830688019U, // <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1>
+ 3362973626U, // <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7>
+ 2752743719U, // <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3>
+ 2631417958U, // <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS
+ 3826043193U, // <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3>
+ 1624131186U, // <3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3>
+ 2752301384U, // <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0>
+ 2631421238U, // <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS
+ 3826485602U, // <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u>
+ 2752301414U, // <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3>
+ 2771249519U, // <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3>
+ 1628112984U, // <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3>
+ 1563656294U, // <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS
+ 2301855911U, // <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1>
+ 2697873730U, // <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3>
+ 403488870U, // <3,3,3,3>: Cost 1 vspltisw3 LHS
+ 1563659574U, // <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+ 2301856239U, // <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5>
+ 2697874067U, // <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7>
+ 2295220154U, // <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7>
+ 403488870U, // <3,3,3,u>: Cost 1 vspltisw3 LHS
+ 2289255318U, // <3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0>
+ 2631435162U, // <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4>
+ 2631435972U, // <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4>
+ 2289256050U, // <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3>
+ 1215513498U, // <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+ 1679002114U, // <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6>
+ 3362998120U, // <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6>
+ 2289256378U, // <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7>
+ 1679002141U, // <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6>
+ 3831130657U, // <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1>
+ 3376277671U, // <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1>
+ 3771617012U, // <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3>
+ 2302536092U, // <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3>
+ 3831130697U, // <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5>
+ 2294572579U, // <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5>
+ 2800519773U, // <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7>
+ 3368314810U, // <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7>
+ 2800519791U, // <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7>
+ 2800077432U, // <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7>
+ 3310291185U, // <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3>
+ 2789165706U, // <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7>
+ 2764982931U, // <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7>
+ 2800077468U, // <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7>
+ 3873819301U, // <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7>
+ 2297235304U, // <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6>
+ 2725081963U, // <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3>
+ 2725745596U, // <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3>
+ 2631458918U, // <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS
+ 3705201460U, // <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1>
+ 2631460551U, // <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7>
+ 2297906802U, // <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3>
+ 2631462198U, // <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS
+ 3371648547U, // <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5>
+ 3371648548U, // <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6>
+ 1224165306U, // <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+ 1224165306U, // <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+ 1215480726U, // <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+ 1679002398U, // <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2>
+ 1659967368U, // <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3>
+ 403488870U, // <3,3,u,3>: Cost 1 vspltisw3 LHS
+ 1563659574U, // <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+ 1679002438U, // <3,3,u,5>: Cost 2 vsldoi12 LHS, <3,u,5,6>
+ 2756946764U, // <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3>
+ 1224165306U, // <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+ 403488870U, // <3,3,u,u>: Cost 1 vspltisw3 LHS
+ 2691907584U, // <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0>
+ 1618165862U, // <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+ 2631476937U, // <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0>
+ 2232601732U, // <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0>
+ 2691907922U, // <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5>
+ 1158860086U, // <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+ 3306343806U, // <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7>
+ 3366947484U, // <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7>
+ 1618166429U, // <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+ 2631483494U, // <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS
+ 2691908404U, // <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1>
+ 1618166682U, // <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4>
+ 3765650393U, // <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4>
+ 2631486774U, // <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS
+ 2756946914U, // <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0>
+ 3765650639U, // <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7>
+ 3735090439U, // <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1>
+ 1622148480U, // <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4>
+ 3765650893U, // <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0>
+ 3831131154U, // <3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3>
+ 2691909224U, // <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2>
+ 2691909286U, // <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1>
+ 2699208469U, // <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4>
+ 2233863478U, // <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS
+ 2691909562U, // <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7>
+ 2701199368U, // <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4>
+ 2691909691U, // <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1>
+ 2691909782U, // <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2>
+ 3765651686U, // <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1>
+ 2691909972U, // <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3>
+ 2691910044U, // <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3>
+ 2691910096U, // <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1>
+ 1161006390U, // <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+ 2691910300U, // <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+ 3368962716U, // <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7>
+ 1161006633U, // <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS
+ 2631508070U, // <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS
+ 2631508890U, // <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4>
+ 2631509709U, // <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4>
+ 2289256788U, // <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3>
+ 1726336208U, // <3,4,4,4>: Cost 2 vsldoi12 LHS, <4,4,4,4>
+ 1618169142U, // <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+ 3362998858U, // <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6>
+ 2289257116U, // <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7>
+ 1618169385U, // <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+ 1557774438U, // <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS
+ 2631516980U, // <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1>
+ 1557776078U, // <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5>
+ 2631518358U, // <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2>
+ 1557777718U, // <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS
+ 2296563406U, // <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5>
+ 604818742U, // <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+ 2661381387U, // <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5>
+ 604818760U, // <3,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+ 3705266278U, // <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS
+ 3831131482U, // <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7>
+ 2733715962U, // <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3>
+ 3844771180U, // <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7>
+ 2800078197U, // <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7>
+ 2236550454U, // <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+ 2733716280U, // <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6>
+ 2725090156U, // <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4>
+ 2236550697U, // <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS
+ 2733716474U, // <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2>
+ 3371647013U, // <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1>
+ 2727744688U, // <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4>
+ 3371649364U, // <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3>
+ 2733716838U, // <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6>
+ 2297906894U, // <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5>
+ 3371647180U, // <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6>
+ 2733717100U, // <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7>
+ 2297906897U, // <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u>
+ 1557799014U, // <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS
+ 1618171694U, // <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+ 1557800657U, // <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u>
+ 2691913660U, // <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1>
+ 1557802294U, // <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS
+ 1618172058U, // <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+ 604818985U, // <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+ 2661405966U, // <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u>
+ 604819003U, // <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+ 2643492966U, // <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS
+ 2756947528U, // <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2>
+ 2331029019U, // <3,5,0,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+ 2643495062U, // <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2>
+ 2756947554U, // <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1>
+ 2800078443U, // <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1>
+ 2289224194U, // <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6>
+ 3362964723U, // <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7>
+ 2756947590U, // <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1>
+ 2800078479U, // <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1>
+ 2333027218U, // <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1>
+ 2691916699U, // <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5>
+ 3832901294U, // <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5>
+ 2800078519U, // <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5>
+ 3830689467U, // <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0>
+ 3830689481U, // <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5>
+ 3873820365U, // <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0>
+ 2800078551U, // <3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1>
+ 3770967487U, // <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4>
+ 2697225763U, // <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5>
+ 3830689523U, // <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2>
+ 2699216590U, // <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5>
+ 2699216662U, // <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5>
+ 2783047439U, // <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3>
+ 2783121176U, // <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3>
+ 3856936737U, // <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3>
+ 2701871194U, // <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5>
+ 2643517542U, // <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS
+ 2331052946U, // <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1>
+ 3699345010U, // <3,5,3,2>: Cost 4 vsldoi4 <1,3,5,3>, <2,2,3,3>
+ 2705189276U, // <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3>
+ 2705189359U, // <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5>
+ 2331053274U, // <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5>
+ 2295220738U, // <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6>
+ 3368961267U, // <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7>
+ 2295220740U, // <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u>
+ 2643525734U, // <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS
+ 2331061138U, // <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1>
+ 2235584280U, // <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3>
+ 2643528194U, // <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6>
+ 2735713498U, // <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5>
+ 2756947892U, // <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6>
+ 2289256962U, // <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6>
+ 3362997491U, // <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7>
+ 2756947919U, // <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6>
+ 2800078803U, // <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1>
+ 2800078812U, // <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1>
+ 2631591639U, // <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5>
+ 3832901616U, // <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3>
+ 2800078843U, // <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5>
+ 1726337028U, // <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+ 2800078862U, // <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6>
+ 3368314099U, // <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7>
+ 1726337028U, // <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+ 2800078884U, // <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1>
+ 2800078899U, // <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7>
+ 2631599832U, // <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6>
+ 2800078914U, // <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4>
+ 2800078924U, // <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5>
+ 2800078935U, // <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7>
+ 2297235970U, // <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6>
+ 1726337122U, // <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0>
+ 1726337131U, // <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0>
+ 3699376230U, // <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS
+ 2333739922U, // <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1>
+ 3699378106U, // <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7>
+ 3371647915U, // <3,5,7,3>: Cost 4 vmrglw <2,6,3,7>, <1,2,5,3>
+ 3699379510U, // <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS
+ 2333740250U, // <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5>
+ 2297907714U, // <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6>
+ 3370984691U, // <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7>
+ 2297907716U, // <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u>
+ 2800079046U, // <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1>
+ 2756948176U, // <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2>
+ 2331029019U, // <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+ 2800079076U, // <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4>
+ 2800079085U, // <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4>
+ 1726337028U, // <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+ 2289289730U, // <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6>
+ 1726337284U, // <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0>
+ 1726337293U, // <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0>
+ 3773628416U, // <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0>
+ 2699886694U, // <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+ 2789167401U, // <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1>
+ 3362965862U, // <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3>
+ 3773628754U, // <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5>
+ 3723284326U, // <3,6,0,5>: Cost 4 vsldoi4 <5,3,6,0>, <5,3,6,0>
+ 2800079181U, // <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1>
+ 1215483190U, // <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+ 1215483191U, // <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS
+ 3873821032U, // <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1>
+ 3773629236U, // <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1>
+ 2691924892U, // <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6>
+ 3830690184U, // <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6>
+ 3873821072U, // <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5>
+ 3873821082U, // <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6>
+ 3403453240U, // <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6>
+ 2289233206U, // <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+ 2289233207U, // <3,6,1,u>: Cost 3 vmrglw <1,2,3,1>, RHS
+ 2661498982U, // <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS
+ 3770975780U, // <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6>
+ 2631640797U, // <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2>
+ 3771639485U, // <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6>
+ 2661502262U, // <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS
+ 2699888488U, // <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6>
+ 2661503482U, // <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3>
+ 1715425786U, // <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3>
+ 1715499523U, // <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3>
+ 3773630614U, // <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2>
+ 3372942825U, // <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1>
+ 2234749434U, // <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3>
+ 3368962406U, // <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3>
+ 2699889154U, // <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6>
+ 3773631068U, // <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6>
+ 2331054904U, // <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6>
+ 1221479734U, // <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+ 1221479735U, // <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS
+ 2235584801U, // <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2>
+ 3717342106U, // <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4>
+ 2789167729U, // <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5>
+ 2235585074U, // <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5>
+ 2235585165U, // <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6>
+ 2699889974U, // <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+ 2800079509U, // <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5>
+ 1215515958U, // <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+ 1215515959U, // <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS
+ 3873821356U, // <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1>
+ 3372959209U, // <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1>
+ 3862909629U, // <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0>
+ 3773632358U, // <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0>
+ 3873821396U, // <3,6,5,4>: Cost 4 vsldoi12 LHS, <6,5,4,5>
+ 3873821405U, // <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5>
+ 3862909672U, // <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7>
+ 2294574390U, // <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+ 2294574391U, // <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS
+ 2800079613U, // <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1>
+ 3873821446U, // <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1>
+ 2789167888U, // <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2>
+ 3844920090U, // <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3>
+ 2800079653U, // <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5>
+ 3723333484U, // <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6>
+ 1726337848U, // <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+ 1726337858U, // <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7>
+ 1726337867U, // <3,6,6,u>: Cost 2 vsldoi12 LHS, <6,6,u,7>
+ 1726337870U, // <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1>
+ 2297906665U, // <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1>
+ 2792117090U, // <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3>
+ 2297907558U, // <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3>
+ 1726337910U, // <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5>
+ 2297906993U, // <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5>
+ 2297906832U, // <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6>
+ 1224166710U, // <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+ 1224166711U, // <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS
+ 1726337951U, // <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1>
+ 2699892526U, // <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+ 2789168049U, // <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1>
+ 2792854460U, // <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3>
+ 1726337991U, // <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5>
+ 2699892890U, // <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+ 1726337848U, // <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+ 1215548726U, // <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+ 1215548727U, // <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS
+ 2700558336U, // <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0>
+ 1626816614U, // <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+ 2700558513U, // <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6>
+ 2331030010U, // <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3>
+ 2700558674U, // <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5>
+ 2800079906U, // <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6>
+ 2655588936U, // <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0>
+ 2800079919U, // <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1>
+ 1626817181U, // <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+ 3774300899U, // <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1>
+ 2700559156U, // <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1>
+ 2700559254U, // <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0>
+ 3774301148U, // <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7>
+ 3774301227U, // <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5>
+ 3774301295U, // <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1>
+ 3768329441U, // <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7>
+ 3403453250U, // <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7>
+ 2700559740U, // <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0>
+ 2700559849U, // <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1>
+ 3770983973U, // <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7>
+ 2700559976U, // <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2>
+ 2698569415U, // <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7>
+ 2700560177U, // <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5>
+ 3773638505U, // <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7>
+ 1626818490U, // <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7>
+ 2795140307U, // <3,7,2,7>: Cost 3 vsldoi12 <7,2,7,3>, <7,2,7,3>
+ 1628145756U, // <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7>
+ 2700560534U, // <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2>
+ 3774302438U, // <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1>
+ 2700560742U, // <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3>
+ 2700560796U, // <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3>
+ 2700560898U, // <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6>
+ 3774302821U, // <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6>
+ 2700561079U, // <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7>
+ 2700561091U, // <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1>
+ 2700561182U, // <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2>
+ 2655617126U, // <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS
+ 3774303178U, // <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, <4,1,2,3>
+ 2655619002U, // <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7>
+ 2331062778U, // <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3>
+ 2655620406U, // <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS
+ 1626819894U, // <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+ 2655621708U, // <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4>
+ 2800080247U, // <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5>
+ 1626820137U, // <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+ 3774303816U, // <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2>
+ 3873822093U, // <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0>
+ 3774303998U, // <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4>
+ 3862910368U, // <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1>
+ 3774304180U, // <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6>
+ 2800080310U, // <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5>
+ 2800080321U, // <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7>
+ 3873822147U, // <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0>
+ 2800080339U, // <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7>
+ 2800080348U, // <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7>
+ 3873822181U, // <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7>
+ 2789168622U, // <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7>
+ 2700563016U, // <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0>
+ 2800080384U, // <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7>
+ 3862910472U, // <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6>
+ 2700563256U, // <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6>
+ 2800080404U, // <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0>
+ 2793149988U, // <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7>
+ 2637725798U, // <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, LHS
+ 3371649227U, // <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1>
+ 2637727674U, // <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7>
+ 2297907567U, // <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3>
+ 2637729078U, // <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS
+ 3371649312U, // <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5>
+ 2655646287U, // <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7>
+ 1726338668U, // <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+ 1726338668U, // <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+ 2700564179U, // <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2>
+ 1626822446U, // <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+ 2700564357U, // <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0>
+ 2700564412U, // <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1>
+ 2700564543U, // <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6>
+ 1626822810U, // <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+ 1662654672U, // <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7>
+ 1726338668U, // <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+ 1626823013U, // <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+ 1678557184U, // <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+ 1679005395U, // <3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2>
+ 2289221787U, // <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2>
+ 1215479964U, // <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+ 2752747245U, // <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1>
+ 1158863002U, // <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+ 2289224221U, // <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6>
+ 1215483208U, // <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+ 1679005458U, // <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2>
+ 1558036582U, // <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS
+ 1678558004U, // <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+ 604821294U, // <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+ 2752747317U, // <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1>
+ 1558039862U, // <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS
+ 2756949830U, // <3,u,1,5>: Cost 3 vsldoi12 LHS, <u,1,5,0>
+ 2800080726U, // <3,u,1,6>: Cost 3 vsldoi12 LHS, <u,1,6,7>
+ 2289233224U, // <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+ 604821348U, // <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+ 2696586709U, // <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u>
+ 2757392246U, // <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3>
+ 1624172151U, // <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u>
+ 1679005576U, // <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3>
+ 2631789878U, // <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS
+ 2699904874U, // <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u>
+ 1626826683U, // <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u>
+ 1726338988U, // <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3>
+ 1683208117U, // <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3>
+ 1679005628U, // <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1>
+ 1161008942U, // <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+ 2752747471U, // <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2>
+ 403488870U, // <3,u,3,3>: Cost 1 vspltisw3 LHS
+ 1679005668U, // <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5>
+ 1161009306U, // <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+ 2691943104U, // <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7>
+ 1221479752U, // <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+ 403488870U, // <3,u,3,u>: Cost 1 vspltisw3 LHS
+ 2289255363U, // <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0>
+ 1161844526U, // <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS
+ 2289256661U, // <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2>
+ 1215512732U, // <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+ 1215513498U, // <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+ 1679005759U, // <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6>
+ 2289256989U, // <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6>
+ 1215515976U, // <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+ 1679005786U, // <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6>
+ 1558069350U, // <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS
+ 2631811892U, // <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1>
+ 1558071026U, // <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5>
+ 2752747646U, // <3,u,5,3>: Cost 3 vsldoi12 LHS, <u,5,3,6>
+ 1558072630U, // <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS
+ 1726337028U, // <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+ 604821658U, // <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+ 2294574408U, // <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+ 604821676U, // <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+ 2631819366U, // <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS
+ 2757392574U, // <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7>
+ 2631821043U, // <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6>
+ 1679005904U, // <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+ 2631822646U, // <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS
+ 2236553370U, // <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+ 1726337848U, // <3,u,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+ 1726339309U, // <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0>
+ 1683208445U, // <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7>
+ 1726339328U, // <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1>
+ 2297905225U, // <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1>
+ 2631829236U, // <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7>
+ 1224163484U, // <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+ 1726339368U, // <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5>
+ 2297905553U, // <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5>
+ 2297905392U, // <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6>
+ 1224166728U, // <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+ 1224163489U, // <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+ 1683208529U, // <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1>
+ 1679006043U, // <3,u,u,1>: Cost 2 vsldoi12 LHS, <u,u,1,2>
+ 604821861U, // <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+ 403488870U, // <3,u,u,3>: Cost 1 vspltisw3 LHS
+ 1683208569U, // <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5>
+ 1679006083U, // <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6>
+ 604821901U, // <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+ 1215548744U, // <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+ 604821915U, // <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS
+ 2759016448U, // <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0>
+ 1165115494U, // <4,0,0,1>: Cost 2 vmrghw <4,0,5,1>, LHS
+ 3717531337U, // <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0>
+ 3369675785U, // <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3>
+ 2751791144U, // <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4>
+ 2238857630U, // <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0>
+ 3312591341U, // <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7>
+ 3369676113U, // <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7>
+ 1165116061U, // <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS
+ 2637824102U, // <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS
+ 2637824922U, // <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4>
+ 1685274726U, // <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 2637826512U, // <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1>
+ 2637827382U, // <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS
+ 2661716070U, // <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4>
+ 3729486427U, // <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1>
+ 2661717300U, // <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1>
+ 1685274780U, // <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 3711574118U, // <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS
+ 2240200806U, // <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+ 3771663992U, // <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0>
+ 2698585801U, // <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0>
+ 3373672105U, // <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4>
+ 3810813795U, // <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1>
+ 3772327866U, // <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7>
+ 3386280568U, // <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7>
+ 2701903966U, // <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0>
+ 3699638374U, // <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS
+ 2753560832U, // <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+ 3772328276U, // <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3>
+ 3827302674U, // <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4>
+ 3699641654U, // <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS
+ 3779627588U, // <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0>
+ 3772328604U, // <4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7>
+ 3780954854U, // <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0>
+ 2753560832U, // <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+ 2725129106U, // <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1>
+ 1167720550U, // <4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+ 3839172953U, // <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3>
+ 3772329051U, // <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4>
+ 2241462610U, // <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5>
+ 2698587446U, // <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+ 3772329297U, // <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7>
+ 3735483703U, // <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4>
+ 1167721117U, // <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS
+ 1168556032U, // <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+ 94814310U, // <4,0,5,1>: Cost 1 vmrghw RHS, LHS
+ 2242298029U, // <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+ 2637859284U, // <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5>
+ 1168556370U, // <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+ 2242306530U, // <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5>
+ 2242298358U, // <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+ 2661750072U, // <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5>
+ 94814877U, // <4,0,5,u>: Cost 1 vmrghw RHS, LHS
+ 3316580362U, // <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1>
+ 2242846822U, // <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS
+ 3798872570U, // <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3>
+ 3796218413U, // <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0>
+ 3834528273U, // <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7>
+ 3798872811U, // <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1>
+ 3316621876U, // <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6>
+ 2725131121U, // <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0>
+ 2242847389U, // <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS
+ 3377692672U, // <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0>
+ 2243493990U, // <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+ 3775648970U, // <4,0,7,2>: Cost 5 vsldoi8 <2,u,4,0>, <7,2,6,3>
+ 3802191110U, // <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0>
+ 3317236050U, // <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5>
+ 3803518376U, // <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0>
+ 3317236214U, // <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7>
+ 3798873708U, // <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7>
+ 2243494557U, // <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS
+ 1170546688U, // <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+ 96804966U, // <4,0,u,1>: Cost 1 vmrghw RHS, LHS
+ 1685275293U, // <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 2637883863U, // <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u>
+ 1170547026U, // <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+ 2698590362U, // <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+ 2244289014U, // <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+ 2661774651U, // <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u>
+ 96805533U, // <4,0,u,u>: Cost 1 vmrghw RHS, LHS
+ 2667749478U, // <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS
+ 2689966182U, // <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS
+ 2238571418U, // <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4>
+ 3711633880U, // <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0>
+ 2689966418U, // <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5>
+ 3361046866U, // <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5>
+ 3741495802U, // <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3>
+ 3741496314U, // <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2>
+ 2689966765U, // <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1>
+ 3764372222U, // <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1>
+ 2758206263U, // <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4>
+ 2698593178U, // <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4>
+ 3361057810U, // <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3>
+ 3827303250U, // <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,1,4,4>
+ 2287313234U, // <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5>
+ 3763709171U, // <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7>
+ 3361058138U, // <4,1,1,7>: Cost 4 vmrglw <0,u,4,1>, <4,6,1,7>
+ 2239759744U, // <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4>
+ 2637906022U, // <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS
+ 2637906842U, // <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4>
+ 3763709544U, // <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2>
+ 1685275546U, // <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4>
+ 2637909302U, // <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS
+ 3361063250U, // <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5>
+ 3763709882U, // <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7>
+ 3735541054U, // <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2>
+ 1685644231U, // <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4>
+ 2702575792U, // <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1>
+ 3832759257U, // <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4>
+ 3833349090U, // <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4>
+ 3763710364U, // <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3>
+ 2707884546U, // <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6>
+ 3361071442U, // <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5>
+ 3772336796U, // <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7>
+ 3775654595U, // <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1>
+ 2707884856U, // <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1>
+ 2667782246U, // <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS
+ 2241463092U, // <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1>
+ 2241553306U, // <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4>
+ 3827303484U, // <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4>
+ 2667785424U, // <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4>
+ 2689969462U, // <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+ 3763711322U, // <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7>
+ 3867116636U, // <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0>
+ 2689969705U, // <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+ 1546273106U, // <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5>
+ 1168556852U, // <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+ 1168556950U, // <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+ 2620016790U, // <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2>
+ 1546276150U, // <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS
+ 2620018692U, // <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5>
+ 2242299087U, // <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+ 2667795450U, // <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2>
+ 1546278702U, // <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS
+ 3781628193U, // <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2>
+ 3832759503U, // <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7>
+ 3316261786U, // <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4>
+ 3781628466U, // <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5>
+ 3827303658U, // <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7>
+ 3361096018U, // <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5>
+ 3788264248U, // <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6>
+ 3788264270U, // <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1>
+ 3832759566U, // <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7>
+ 2726466580U, // <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1>
+ 3377692682U, // <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1>
+ 3377694870U, // <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2>
+ 3802199303U, // <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1>
+ 2731775334U, // <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6>
+ 3377693010U, // <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5>
+ 3365749804U, // <4,1,7,6>: Cost 5 vmrglw <1,6,4,7>, <1,4,1,6>
+ 3788265068U, // <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7>
+ 2731775644U, // <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1>
+ 1546297685U, // <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u>
+ 1170547508U, // <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+ 1170547606U, // <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+ 1689257344U, // <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4>
+ 1546300726U, // <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS
+ 2284716370U, // <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5>
+ 2244289743U, // <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+ 2667820026U, // <4,1,u,7>: Cost 3 vsldoi4 <u,4,1,u>, <7,0,1,2>
+ 1546303278U, // <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS
+ 3729621094U, // <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS
+ 3763716198U, // <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS
+ 2238858856U, // <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2>
+ 2295930982U, // <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+ 3763716434U, // <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5>
+ 2238859107U, // <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1>
+ 2238859194U, // <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7>
+ 3312601066U, // <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1>
+ 2295930987U, // <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS
+ 3699769446U, // <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS
+ 3313255971U, // <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5>
+ 3361056360U, // <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2>
+ 2287312998U, // <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+ 3788932148U, // <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5>
+ 3313256290U, // <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0>
+ 3838289469U, // <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4>
+ 3369682865U, // <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7>
+ 2287313003U, // <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS
+ 3838658133U, // <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1>
+ 3711722394U, // <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4>
+ 2759018088U, // <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2>
+ 2759018098U, // <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3>
+ 3838658168U, // <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0>
+ 3369027341U, // <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5>
+ 2240227258U, // <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7>
+ 3735614791U, // <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2>
+ 2759018143U, // <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3>
+ 2759018150U, // <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1>
+ 3831948975U, // <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1>
+ 3832759993U, // <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2>
+ 2759018180U, // <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4>
+ 2759018185U, // <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0>
+ 3839542998U, // <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4>
+ 3314640826U, // <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7>
+ 2765948648U, // <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4>
+ 2759018222U, // <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1>
+ 3838658295U, // <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1>
+ 3315205667U, // <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5>
+ 2241463912U, // <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2>
+ 1234829414U, // <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+ 2241464085U, // <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4>
+ 2241546087U, // <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5>
+ 2241464250U, // <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7>
+ 3741602873U, // <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2>
+ 1234829419U, // <4,2,4,u>: Cost 2 vmrglw <4,4,4,4>, LHS
+ 2626060390U, // <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS
+ 2626061364U, // <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5>
+ 1168557672U, // <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+ 1222230118U, // <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+ 2626063670U, // <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS
+ 2242299752U, // <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+ 1168558010U, // <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+ 2242299882U, // <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+ 1222230123U, // <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS
+ 3711754342U, // <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS
+ 3711755162U, // <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4>
+ 3838658481U, // <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7>
+ 2759018426U, // <4,2,6,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7>
+ 3838658499U, // <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7>
+ 3735646310U, // <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4>
+ 3316590522U, // <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7>
+ 3798889331U, // <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, <6,7,4,2>
+ 2759018471U, // <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7>
+ 3874564074U, // <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1>
+ 3800880230U, // <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2>
+ 3371722344U, // <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2>
+ 2303950950U, // <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+ 3371722346U, // <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4>
+ 3371722509U, // <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5>
+ 3317237690U, // <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7>
+ 3317237738U, // <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1>
+ 2303950955U, // <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+ 2759018555U, // <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1>
+ 2626085943U, // <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u>
+ 1170548328U, // <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+ 1222254694U, // <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+ 2759018595U, // <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5>
+ 2244290408U, // <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+ 1170548666U, // <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+ 2769266813U, // <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4>
+ 1222254699U, // <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS
+ 2238859414U, // <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2>
+ 2759018646U, // <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2>
+ 3312314708U, // <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3>
+ 2238859676U, // <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3>
+ 2295931802U, // <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4>
+ 3735670886U, // <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4>
+ 3312315036U, // <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7>
+ 3369674682U, // <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7>
+ 2759018709U, // <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2>
+ 3361055638U, // <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0>
+ 3831949542U, // <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1>
+ 2703917978U, // <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+ 3361056370U, // <4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3>
+ 2295939994U, // <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4>
+ 3361056291U, // <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5>
+ 3378972520U, // <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6>
+ 3361056698U, // <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7>
+ 2703917978U, // <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+ 3832760624U, // <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3>
+ 3711796122U, // <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4>
+ 3832760641U, // <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2>
+ 2770962764U, // <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4>
+ 2759018836U, // <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3>
+ 3827304802U, // <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u>
+ 3832760678U, // <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3>
+ 3859597679U, // <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3>
+ 2771331449U, // <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4>
+ 2240841878U, // <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2>
+ 3776997635U, // <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3>
+ 2703919444U, // <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3>
+ 2759018908U, // <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3>
+ 2759018918U, // <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,4,4>
+ 3386951446U, // <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5>
+ 3777661596U, // <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7>
+ 3375007674U, // <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7>
+ 2707901242U, // <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3>
+ 2759018960U, // <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1>
+ 2759018970U, // <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2>
+ 2632099605U, // <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4>
+ 2241464732U, // <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3>
+ 2759019000U, // <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5>
+ 2753563138U, // <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6>
+ 3777662316U, // <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, <4,6,3,7>
+ 2308573114U, // <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7>
+ 2759019032U, // <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1>
+ 1168558230U, // <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+ 2242300134U, // <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1>
+ 2632107798U, // <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5>
+ 1168558492U, // <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+ 1168558594U, // <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+ 2295973654U, // <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5>
+ 2242300536U, // <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+ 2295973818U, // <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7>
+ 1168558878U, // <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+ 3832760952U, // <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7>
+ 3711828890U, // <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4>
+ 3316484436U, // <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3>
+ 3711830512U, // <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6>
+ 2759019164U, // <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+ 3361097251U, // <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5>
+ 3316624045U, // <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6>
+ 2773912244U, // <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4>
+ 2759019164U, // <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+ 3377693590U, // <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0>
+ 3365751680U, // <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1>
+ 2727810232U, // <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3>
+ 3377694322U, // <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3>
+ 2303951770U, // <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4>
+ 3741700198U, // <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4>
+ 3377695216U, // <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6>
+ 3375703994U, // <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7>
+ 2731792030U, // <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3>
+ 1170548886U, // <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+ 2759019294U, // <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2>
+ 2632132377U, // <4,3,u,2>: Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u>
+ 1170549148U, // <4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+ 1170549250U, // <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+ 2759019334U, // <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6>
+ 2244291192U, // <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+ 2295998394U, // <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7>
+ 1170549534U, // <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+ 1165118354U, // <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+ 1637482598U, // <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+ 3711854285U, // <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4>
+ 3827305344U, // <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1>
+ 2711224658U, // <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5>
+ 1165118774U, // <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+ 3312602489U, // <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2>
+ 3369675420U, // <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7>
+ 1165119017U, // <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS
+ 3369682633U, // <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0>
+ 2287313581U, // <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1>
+ 2759019466U, // <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3>
+ 3369683284U, // <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3>
+ 2311204048U, // <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4>
+ 2239319350U, // <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS
+ 3784967411U, // <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7>
+ 3369683612U, // <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7>
+ 2763000832U, // <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3>
+ 3711869030U, // <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS
+ 3711869850U, // <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4>
+ 2240203830U, // <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3>
+ 2698618573U, // <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4>
+ 2711226133U, // <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4>
+ 2240204086U, // <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+ 2711226298U, // <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7>
+ 3832761416U, // <4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3>
+ 2701936738U, // <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4>
+ 2711226518U, // <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2>
+ 3777005828U, // <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4>
+ 3832761453U, // <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4>
+ 2301266260U, // <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+ 2705254903U, // <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4>
+ 2240843062U, // <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+ 3832761489U, // <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4>
+ 3375008412U, // <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7>
+ 2301266260U, // <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+ 1570373734U, // <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS
+ 2308574089U, // <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1>
+ 2644117096U, // <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2>
+ 2638146039U, // <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4>
+ 229035318U, // <4,4,4,4>: Cost 1 vspltisw0 RHS
+ 1167723830U, // <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS
+ 2644120058U, // <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3>
+ 2662036827U, // <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4>
+ 229035318U, // <4,4,4,u>: Cost 1 vspltisw0 RHS
+ 1168558994U, // <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+ 2638152602U, // <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4>
+ 2242300981U, // <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+ 2638154232U, // <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5>
+ 1168559322U, // <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5>
+ 94817590U, // <4,4,5,5>: Cost 1 vmrghw RHS, RHS
+ 1685278006U, // <4,4,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+ 2242309576U, // <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+ 94817833U, // <4,4,5,u>: Cost 1 vmrghw RHS, RHS
+ 3316591506U, // <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1>
+ 3758428587U, // <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5>
+ 2711228922U, // <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3>
+ 3796251185U, // <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, <6,3,4,4>
+ 2711229085U, // <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4>
+ 2242850102U, // <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+ 2242850169U, // <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2>
+ 2725163893U, // <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4>
+ 2242850345U, // <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS
+ 2711229434U, // <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2>
+ 3377694410U, // <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1>
+ 3868593584U, // <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3>
+ 3377695060U, // <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3>
+ 2729145691U, // <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4>
+ 2243497270U, // <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+ 3871542744U, // <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7>
+ 2303953564U, // <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7>
+ 2243497513U, // <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS
+ 1170549650U, // <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+ 1637488430U, // <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+ 2244291637U, // <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+ 2638178811U, // <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u>
+ 229035318U, // <4,4,u,4>: Cost 1 vspltisw0 RHS
+ 96808246U, // <4,4,u,5>: Cost 1 vmrghw RHS, RHS
+ 1685278249U, // <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+ 2244292040U, // <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+ 96808489U, // <4,4,u,u>: Cost 1 vmrghw RHS, RHS
+ 2698625024U, // <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0>
+ 1624883302U, // <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+ 2638186190U, // <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5>
+ 2638187004U, // <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0>
+ 2687345005U, // <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5>
+ 2238861316U, // <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5>
+ 2662077302U, // <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5>
+ 2662077792U, // <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0>
+ 1624883869U, // <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+ 3361057762U, // <4,5,1,0>: Cost 4 vmrglw <0,u,4,1>, <4,1,5,0>
+ 2691326803U, // <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5>
+ 2698625942U, // <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0>
+ 3361055659U, // <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3>
+ 3761087567U, // <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5>
+ 2693981335U, // <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5>
+ 2305231362U, // <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+ 3361055987U, // <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7>
+ 2695972234U, // <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5>
+ 2638200934U, // <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS
+ 3761088035U, // <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5>
+ 2697963133U, // <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5>
+ 1624884942U, // <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5>
+ 2698626838U, // <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5>
+ 3772368744U, // <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6>
+ 2698627002U, // <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7>
+ 3775023122U, // <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5>
+ 1628203107U, // <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5>
+ 2698627222U, // <4,5,3,0>: Cost 3 vsldoi8 <2,3,4,5>, <3,0,1,2>
+ 3765070057U, // <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4>
+ 2698627404U, // <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4>
+ 2698627484U, // <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3>
+ 2698627580U, // <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0>
+ 3779668553U, // <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5>
+ 2725169844U, // <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4>
+ 2707253995U, // <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5>
+ 2698627870U, // <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2>
+ 2638217318U, // <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS
+ 2308574098U, // <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1>
+ 2698628150U, // <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3>
+ 2638219776U, // <4,5,4,3>: Cost 3 vsldoi4 <3,4,5,4>, <3,4,5,4>
+ 2698628314U, // <4,5,4,4>: Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5>
+ 1624886582U, // <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+ 2698628478U, // <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7>
+ 2662110564U, // <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4>
+ 1624886825U, // <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+ 1570455654U, // <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS
+ 2312564250U, // <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1>
+ 2644199118U, // <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5>
+ 2295974966U, // <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3>
+ 1570458842U, // <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5>
+ 1168568324U, // <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5>
+ 1168568418U, // <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+ 2295975294U, // <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7>
+ 1168716036U, // <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0>
+ 1564491878U, // <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS
+ 2626290768U, // <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6>
+ 2632263465U, // <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6>
+ 1564494338U, // <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6>
+ 1564495158U, // <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS
+ 2638237464U, // <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3>
+ 2656154253U, // <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6>
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2725172218U, // <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2>
+ 3859599489U, // <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4>
+ 2698630320U, // <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4>
+ 2728490251U, // <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5>
+ 2725172576U, // <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0>
+ 3317239812U, // <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5>
+ 2725172760U, // <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4>
+ 2725172844U, // <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7>
+ 2725172866U, // <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2>
+ 1564508262U, // <4,5,u,0>: Cost 2 vsldoi4 <3,4,5,u>, LHS
+ 1624889134U, // <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+ 2698631045U, // <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0>
+ 1564510724U, // <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u>
+ 1564511542U, // <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS
+ 1624889498U, // <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+ 1170550882U, // <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 3312595285U, // <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0>
+ 3763748966U, // <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS
+ 2238861818U, // <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3>
+ 3767730432U, // <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4>
+ 3763749202U, // <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5>
+ 2238862059U, // <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1>
+ 2238862136U, // <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6>
+ 2295934262U, // <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+ 2295934263U, // <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS
+ 3378973999U, // <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0>
+ 3378974648U, // <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1>
+ 3779675034U, // <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4>
+ 3378974002U, // <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3>
+ 3378974003U, // <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4>
+ 3767731352U, // <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6>
+ 3378974734U, // <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6>
+ 2287316278U, // <4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+ 2287316279U, // <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS
+ 3735904358U, // <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS
+ 3763750435U, // <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5>
+ 3313938937U, // <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2>
+ 3772376782U, // <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5>
+ 3852890591U, // <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3>
+ 3735908454U, // <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4>
+ 3801573306U, // <4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7>
+ 2785858042U, // <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3>
+ 2785858051U, // <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3>
+ 3863065101U, // <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4>
+ 3314586024U, // <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2>
+ 3863212575U, // <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4>
+ 3863286312U, // <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4>
+ 3767732738U, // <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6>
+ 3779676746U, // <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6>
+ 3398898488U, // <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6>
+ 2301267254U, // <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+ 2301267255U, // <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS
+ 3852890715U, // <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1>
+ 3315208615U, // <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1>
+ 2241466874U, // <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3>
+ 3852890745U, // <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4>
+ 2241467037U, // <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4>
+ 2241549039U, // <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5>
+ 2241467192U, // <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6>
+ 1234832694U, // <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+ 1234832695U, // <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS
+ 2242302241U, // <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+ 2242310567U, // <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+ 1168568826U, // <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+ 2242302514U, // <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+ 2242302605U, // <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+ 2242310891U, // <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+ 1168569144U, // <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+ 1222233398U, // <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+ 1222233399U, // <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS
+ 3316576545U, // <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2>
+ 3316584871U, // <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1>
+ 2242851322U, // <4,6,6,2>: Cost 3 vmrghw <4,6,5,2>, <6,2,7,3>
+ 3316601394U, // <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5>
+ 3852890916U, // <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4>
+ 3316617963U, // <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1>
+ 2242884408U, // <4,6,6,6>: Cost 3 vmrghw <4,6,5,6>, <6,6,6,6>
+ 2785858370U, // <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7>
+ 2785858379U, // <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7>
+ 2785858382U, // <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1>
+ 3859600215U, // <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1>
+ 3317240314U, // <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3>
+ 2792199020U, // <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4>
+ 2785858422U, // <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5>
+ 3856651132U, // <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2>
+ 3317240632U, // <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6>
+ 2303954230U, // <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+ 2303954231U, // <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS
+ 2244292897U, // <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+ 2244293031U, // <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+ 1170551290U, // <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+ 2244293170U, // <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+ 2244293261U, // <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+ 2244293355U, // <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+ 1170551608U, // <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+ 1222257974U, // <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS
+ 1222257975U, // <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS
+ 2238862330U, // <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2>
+ 2706604134U, // <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+ 3312604308U, // <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3>
+ 3768402176U, // <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4>
+ 2238862648U, // <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5>
+ 3859600418U, // <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6>
+ 3729994393U, // <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0>
+ 2238862956U, // <4,7,0,7>: Cost 3 vmrghw <4,0,5,1>, <7,7,7,7>
+ 2706604701U, // <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+ 3385610338U, // <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0>
+ 3780346676U, // <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1>
+ 2706604954U, // <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+ 3385610746U, // <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3>
+ 3385610342U, // <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4>
+ 3385610667U, // <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5>
+ 3768403178U, // <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7>
+ 3385611074U, // <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7>
+ 2706604954U, // <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+ 3859600532U, // <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3>
+ 3712091034U, // <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4>
+ 3774375528U, // <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2>
+ 2794853552U, // <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4>
+ 2785858744U, // <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3>
+ 3735982182U, // <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4>
+ 3774375875U, // <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7>
+ 3735983476U, // <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2>
+ 2795222237U, // <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4>
+ 3780348054U, // <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2>
+ 3730015130U, // <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4>
+ 3780348244U, // <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3>
+ 3778357673U, // <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7>
+ 2325155942U, // <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4>
+ 3779684939U, // <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7>
+ 2706606748U, // <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7>
+ 3398898498U, // <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7>
+ 2707934014U, // <4,7,3,u>: Cost 3 vsldoi8 <3,u,4,7>, <3,u,4,7>
+ 2785858868U, // <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1>
+ 3780348874U, // <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3>
+ 3780349000U, // <4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3>
+ 2308575738U, // <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3>
+ 2656283856U, // <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4>
+ 2706607414U, // <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+ 2656285341U, // <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4>
+ 2241468012U, // <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7>
+ 2706607657U, // <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+ 1168569338U, // <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+ 2242311242U, // <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1>
+ 2242303178U, // <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3>
+ 2242311395U, // <4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+ 1168569702U, // <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+ 2242311606U, // <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5>
+ 2242311662U, // <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+ 1168569964U, // <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+ 1168569986U, // <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+ 3316593658U, // <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2>
+ 3316593738U, // <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1>
+ 3316634800U, // <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4>
+ 3386978810U, // <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3>
+ 2785859072U, // <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7>
+ 3736014950U, // <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4>
+ 3316594158U, // <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7>
+ 2797803032U, // <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4>
+ 2797876769U, // <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4>
+ 2243499002U, // <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2>
+ 3718103962U, // <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4>
+ 3317257418U, // <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3>
+ 3377695816U, // <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3>
+ 2243532134U, // <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6>
+ 3317282230U, // <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5>
+ 2730497536U, // <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7>
+ 2243556972U, // <4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7>
+ 2243565186U, // <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2>
+ 1170551802U, // <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+ 2706609966U, // <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+ 2244293797U, // <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2>
+ 2244293859U, // <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+ 1170552166U, // <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+ 2706610330U, // <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+ 2244294126U, // <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+ 1170552428U, // <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+ 1170552450U, // <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+ 1165118354U, // <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+ 1624907878U, // <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+ 2638407377U, // <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u>
+ 2295931036U, // <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+ 2687369584U, // <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u>
+ 1165121690U, // <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+ 2662298489U, // <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u>
+ 2295934280U, // <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+ 1624908445U, // <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+ 2638413926U, // <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS
+ 2691351382U, // <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u>
+ 1685280558U, // <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 2287313052U, // <4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+ 2299257799U, // <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4>
+ 2694005914U, // <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u>
+ 2305231362U, // <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+ 2287316296U, // <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+ 1685280612U, // <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 2638422118U, // <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS
+ 2240206638U, // <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+ 2697987712U, // <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u>
+ 1624909521U, // <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u>
+ 2759391121U, // <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3>
+ 2240207002U, // <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+ 2698651578U, // <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7>
+ 2785859500U, // <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3>
+ 1628227686U, // <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u>
+ 2759022524U, // <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1>
+ 2801342408U, // <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4>
+ 2703960409U, // <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u>
+ 2759022554U, // <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4>
+ 2759022564U, // <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5>
+ 2240845978U, // <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+ 2706614941U, // <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u>
+ 2301267272U, // <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+ 2759022596U, // <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1>
+ 1570668646U, // <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS
+ 1167726382U, // <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+ 2698652753U, // <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3>
+ 1234829468U, // <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+ 229035318U, // <4,u,4,4>: Cost 1 vspltisw0 RHS
+ 1624911158U, // <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS
+ 2698653081U, // <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7>
+ 1234832712U, // <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+ 229035318U, // <4,u,4,u>: Cost 1 vspltisw0 RHS
+ 1168561875U, // <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2>
+ 94820142U, // <4,u,5,1>: Cost 1 vmrghw RHS, LHS
+ 1168562053U, // <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0>
+ 1222230172U, // <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+ 1168562239U, // <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6>
+ 94820506U, // <4,u,5,5>: Cost 1 vmrghw RHS, RHS
+ 1685280922U, // <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+ 1222233416U, // <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+ 94820709U, // <4,u,5,u>: Cost 1 vmrghw RHS, LHS
+ 1564713062U, // <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS
+ 2626511979U, // <4,u,6,1>: Cost 3 vsldoi4 <1,4,u,6>, <1,4,u,6>
+ 2632484676U, // <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6>
+ 1564715549U, // <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6>
+ 1564716342U, // <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS
+ 2242853018U, // <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+ 2656375464U, // <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6>
+ 27705344U, // <4,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,6,u>: Cost 0 copy RHS
+ 2785859840U, // <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1>
+ 2243499822U, // <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+ 2727851197U, // <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u>
+ 2303951004U, // <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+ 2785859880U, // <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5>
+ 2243500186U, // <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+ 2730505729U, // <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u>
+ 2303954248U, // <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+ 2303951009U, // <4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+ 1564729446U, // <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS
+ 96810798U, // <4,u,u,1>: Cost 1 vmrghw RHS, LHS
+ 1685281125U, // <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 1222254748U, // <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+ 229035318U, // <4,u,u,4>: Cost 1 vspltisw0 RHS
+ 96811162U, // <4,u,u,5>: Cost 1 vmrghw RHS, RHS
+ 1685281165U, // <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+ 27705344U, // <4,u,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,u,u>: Cost 0 copy RHS
+ 2754232320U, // <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0>
+ 2754232330U, // <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1>
+ 3718194894U, // <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5>
+ 3376385762U, // <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3>
+ 2754232357U, // <5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1>
+ 3845816370U, // <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5>
+ 3782353389U, // <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7>
+ 3376386090U, // <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7>
+ 2757402697U, // <5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1>
+ 2626543718U, // <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS
+ 2626544751U, // <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1>
+ 1680490598U, // <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 3766428665U, // <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0>
+ 2626546998U, // <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS
+ 2650435539U, // <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1>
+ 3783017715U, // <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7>
+ 3385019000U, // <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7>
+ 1680490652U, // <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 3376398336U, // <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0>
+ 2245877862U, // <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+ 3773064808U, // <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2>
+ 2705295054U, // <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+ 3827974343U, // <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1>
+ 3845816530U, // <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3>
+ 3779037114U, // <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7>
+ 3810887658U, // <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1>
+ 2245878429U, // <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS
+ 2710603926U, // <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2>
+ 3827974396U, // <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0>
+ 3779037516U, // <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4>
+ 3779037596U, // <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3>
+ 2705295868U, // <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0>
+ 3379726804U, // <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5>
+ 3802925748U, // <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4>
+ 3363138168U, // <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7>
+ 2707950400U, // <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0>
+ 2626568294U, // <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS
+ 1680490834U, // <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+ 3828048219U, // <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5>
+ 2710604932U, // <5,0,4,3>: Cost 3 vsldoi8 <4,3,5,0>, <4,3,5,0>
+ 2754232685U, // <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5>
+ 2705296694U, // <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+ 3779038590U, // <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7>
+ 2713259464U, // <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+ 1680490834U, // <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+ 2311307264U, // <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0>
+ 1174437990U, // <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+ 3779038946U, // <5,0,5,2>: Cost 4 vsldoi8 <3,4,5,0>, <5,2,0,3>
+ 3845816752U, // <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0>
+ 2248180050U, // <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5>
+ 2248180194U, // <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5>
+ 3779039274U, // <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7>
+ 3385051768U, // <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7>
+ 1174438557U, // <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS
+ 2302689280U, // <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0>
+ 1175208038U, // <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+ 3787002362U, // <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3>
+ 3376432160U, // <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3>
+ 2248950098U, // <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5>
+ 2248950180U, // <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+ 3376433702U, // <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6>
+ 2729186166U, // <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5>
+ 1175208605U, // <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS
+ 2713261050U, // <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2>
+ 3365823599U, // <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1>
+ 3808900317U, // <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4>
+ 3784348899U, // <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1>
+ 2729186656U, // <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0>
+ 3787003268U, // <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0>
+ 3802928664U, // <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4>
+ 3787003431U, // <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, <7,7,0,1>
+ 2731841188U, // <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0>
+ 2626601062U, // <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS
+ 1683145366U, // <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5>
+ 1680491165U, // <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 2705295054U, // <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+ 2754233005U, // <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1>
+ 2705299610U, // <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+ 3779041488U, // <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7>
+ 2737150252U, // <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0>
+ 1680491219U, // <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 2713927680U, // <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0>
+ 1640185958U, // <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+ 2310607866U, // <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+ 3787669756U, // <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0>
+ 2713928018U, // <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5>
+ 2306621778U, // <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5>
+ 3787670006U, // <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7>
+ 3736188301U, // <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0>
+ 1640186525U, // <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+ 2650505318U, // <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS
+ 2754233140U, // <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1>
+ 2311276694U, // <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2>
+ 2311278315U, // <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3>
+ 2758435667U, // <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5>
+ 2754233180U, // <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5>
+ 3385016497U, // <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6>
+ 2311278643U, // <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7>
+ 2758730615U, // <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5>
+ 3700367462U, // <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS
+ 3830629255U, // <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3>
+ 2713929320U, // <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2>
+ 2754233238U, // <5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0>
+ 2759099300U, // <5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5>
+ 2754233259U, // <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3>
+ 2713929658U, // <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7>
+ 3872359354U, // <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0>
+ 2754233283U, // <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0>
+ 2713929878U, // <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2>
+ 3363135498U, // <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1>
+ 3363137686U, // <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2>
+ 2713930140U, // <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3>
+ 2713930242U, // <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6>
+ 2289394002U, // <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5>
+ 3787672184U, // <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7>
+ 3787672259U, // <5,1,3,7>: Cost 4 vsldoi8 <4,u,5,1>, <3,7,0,1>
+ 2713930526U, // <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2>
+ 1634880402U, // <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1>
+ 2760205355U, // <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5>
+ 2760279092U, // <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5>
+ 3787672708U, // <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0>
+ 2713930960U, // <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4>
+ 1640189238U, // <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+ 3786345848U, // <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1>
+ 3787009481U, // <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1>
+ 1640189466U, // <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1>
+ 2754233455U, // <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1>
+ 2713931407U, // <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1>
+ 2713931499U, // <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3>
+ 3827975305U, // <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0>
+ 2754233495U, // <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5>
+ 2288746834U, // <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5>
+ 2713931827U, // <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7>
+ 3787673725U, // <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0>
+ 2754233527U, // <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1>
+ 2668462182U, // <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS
+ 2290746002U, // <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1>
+ 2302691478U, // <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2>
+ 3364488071U, // <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3>
+ 2302689536U, // <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4>
+ 2754233587U, // <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7>
+ 2713932600U, // <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6>
+ 2713932622U, // <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1>
+ 2302689297U, // <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u>
+ 2713932794U, // <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2>
+ 3365822474U, // <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1>
+ 3365824662U, // <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2>
+ 3787674851U, // <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1>
+ 2713933158U, // <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6>
+ 2292080978U, // <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5>
+ 3365823613U, // <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6>
+ 2713933420U, // <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7>
+ 2713933442U, // <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2>
+ 1658771190U, // <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1>
+ 1640191790U, // <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+ 2762933624U, // <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5>
+ 2754233724U, // <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0>
+ 2763081098U, // <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5>
+ 1640192154U, // <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+ 2713934032U, // <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7>
+ 2713934080U, // <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1>
+ 1640192357U, // <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+ 3779051520U, // <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0>
+ 2705309798U, // <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+ 3838813637U, // <5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1>
+ 2302640230U, // <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+ 3765117266U, // <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5>
+ 3381027892U, // <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5>
+ 3842794985U, // <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1>
+ 3408232554U, // <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7>
+ 2302640235U, // <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, LHS
+ 3700432998U, // <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS
+ 3765117785U, // <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2>
+ 2311276136U, // <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2>
+ 1237532774U, // <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+ 3700436278U, // <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS
+ 3381036084U, // <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5>
+ 3385018045U, // <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6>
+ 3385017560U, // <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7>
+ 1237532779U, // <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS
+ 3700441190U, // <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS
+ 3700442242U, // <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2>
+ 2754233960U, // <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2>
+ 2754233970U, // <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3>
+ 2765071997U, // <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5>
+ 3834021508U, // <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3>
+ 3842795152U, // <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6>
+ 3376402492U, // <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7>
+ 2754234015U, // <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3>
+ 2754234022U, // <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1>
+ 3827975855U, // <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1>
+ 2644625102U, // <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+ 2289393766U, // <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+ 1691993806U, // <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5>
+ 2785052375U, // <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5>
+ 3854812897U, // <5,2,3,6>: Cost 4 vsldoi12 <4,u,5,5>, <2,3,6,6>
+ 3802942187U, // <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5>
+ 1692288754U, // <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5>
+ 3839846139U, // <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5>
+ 2709294052U, // <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2>
+ 2766251789U, // <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5>
+ 2765735702U, // <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5>
+ 3840141087U, // <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5>
+ 2705313078U, // <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+ 2712612217U, // <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2>
+ 3787017674U, // <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2>
+ 2765735747U, // <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5>
+ 3834021704U, // <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1>
+ 3834021714U, // <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2>
+ 2311308904U, // <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2>
+ 1237565542U, // <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+ 3834021744U, // <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5>
+ 3369124916U, // <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5>
+ 2248181690U, // <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7>
+ 3786354825U, // <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3>
+ 1237565547U, // <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS
+ 3700473958U, // <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS
+ 3700475014U, // <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6>
+ 2296718952U, // <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2>
+ 1228947558U, // <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+ 3700477238U, // <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS
+ 3834021836U, // <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7>
+ 2248951738U, // <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7>
+ 3370461105U, // <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7>
+ 1228947563U, // <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+ 3786355706U, // <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2>
+ 3783038037U, // <5,2,7,1>: Cost 4 vsldoi8 <4,1,5,2>, <7,1,2,3>
+ 3365824104U, // <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2>
+ 2292080742U, // <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS
+ 3842131986U, // <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5>
+ 3371795508U, // <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5>
+ 3786356206U, // <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7>
+ 3786356332U, // <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7>
+ 2292080747U, // <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS
+ 2754234427U, // <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1>
+ 2705315630U, // <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+ 2296735336U, // <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2>
+ 1228963942U, // <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+ 1695311971U, // <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5>
+ 2705315994U, // <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+ 2769201269U, // <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5>
+ 3370477489U, // <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7>
+ 1695606919U, // <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5>
+ 3827976331U, // <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0>
+ 2754234518U, // <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2>
+ 3706472290U, // <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0>
+ 3700500630U, // <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2>
+ 2754234544U, // <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1>
+ 3376383766U, // <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5>
+ 3769770513U, // <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7>
+ 3376383930U, // <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7>
+ 2754234581U, // <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2>
+ 2311275414U, // <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0>
+ 2305967971U, // <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1>
+ 2692047787U, // <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3>
+ 2311276146U, // <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3>
+ 2311275418U, // <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4>
+ 3765789807U, // <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1>
+ 3765789939U, // <5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7>
+ 2311276474U, // <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7>
+ 2696029585U, // <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3>
+ 2311288709U, // <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+ 3765790243U, // <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5>
+ 3827976513U, // <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2>
+ 2765736268U, // <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4>
+ 2246248962U, // <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6>
+ 3765790563U, // <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1>
+ 3827976550U, // <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3>
+ 3842795887U, // <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3>
+ 2769054073U, // <5,3,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <3,2,u,4>
+ 3827976575U, // <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1>
+ 3765790963U, // <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5>
+ 3839478162U, // <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2>
+ 2754234780U, // <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3>
+ 2771708327U, // <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5>
+ 3363137059U, // <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5>
+ 3375081320U, // <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6>
+ 3363137466U, // <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7>
+ 2772003275U, // <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5>
+ 2772077012U, // <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5>
+ 3765791714U, // <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0>
+ 2709965878U, // <5,3,4,2>: Cost 3 vsldoi8 <4,2,5,3>, <4,2,5,3>
+ 2772298223U, // <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5>
+ 2772371960U, // <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5>
+ 2754234882U, // <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6>
+ 3839478282U, // <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5>
+ 3376416698U, // <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7>
+ 2754234909U, // <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6>
+ 2311308182U, // <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,0>
+ 3765792421U, // <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5>
+ 2715938575U, // <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3>
+ 2311308914U, // <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3>
+ 2311308186U, // <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4>
+ 2248182354U, // <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5>
+ 3765792837U, // <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7>
+ 2311309242U, // <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7>
+ 2311308190U, // <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u>
+ 2632777830U, // <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+ 3706520372U, // <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1>
+ 2632779624U, // <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6>
+ 2632780290U, // <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6>
+ 2632781110U, // <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS
+ 2248952413U, // <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7>
+ 2302691176U, // <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+ 2302691258U, // <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7>
+ 2632783662U, // <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+ 3365823382U, // <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0>
+ 3706529011U, // <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7>
+ 3706529641U, // <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7>
+ 3365824114U, // <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3>
+ 2774362859U, // <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5>
+ 3365824035U, // <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5>
+ 3383740183U, // <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6>
+ 3363833786U, // <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7>
+ 2774657807U, // <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5>
+ 2632794214U, // <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS
+ 2754235166U, // <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2>
+ 2632796010U, // <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u>
+ 2632796676U, // <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u>
+ 2632797494U, // <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS
+ 2754235206U, // <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6>
+ 2302691176U, // <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+ 2302707642U, // <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7>
+ 2754235229U, // <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2>
+ 3765133325U, // <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, <0,0,1,4>
+ 2705326182U, // <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+ 3718489806U, // <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5>
+ 3718490624U, // <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4>
+ 2709307730U, // <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5>
+ 2302641870U, // <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5>
+ 3376383695U, // <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6>
+ 3384351018U, // <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7>
+ 2705326749U, // <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+ 2305971057U, // <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0>
+ 3765134171U, // <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4>
+ 3766461338U, // <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4>
+ 3766461437U, // <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4>
+ 2311277776U, // <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4>
+ 2754235362U, // <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0>
+ 3783050483U, // <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7>
+ 3385019036U, // <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7>
+ 2311276241U, // <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u>
+ 3718504550U, // <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS
+ 3783050787U, // <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5>
+ 3773097576U, // <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2>
+ 2705327822U, // <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+ 3773097767U, // <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4>
+ 2765737014U, // <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3>
+ 3779069882U, // <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7>
+ 3376401052U, // <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7>
+ 2245881370U, // <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1>
+ 3779070102U, // <5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2>
+ 3363135525U, // <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1>
+ 3779070284U, // <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4>
+ 3779070364U, // <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3>
+ 2705328640U, // <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4>
+ 2307311310U, // <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5>
+ 3866021012U, // <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7>
+ 3363138204U, // <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7>
+ 2707983172U, // <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4>
+ 2708646805U, // <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4>
+ 2709310438U, // <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4>
+ 3779071030U, // <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3>
+ 2710637704U, // <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4>
+ 2754235600U, // <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4>
+ 1704676570U, // <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5>
+ 3779071358U, // <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7>
+ 2713292236U, // <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4>
+ 1704897781U, // <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5>
+ 2626871398U, // <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS
+ 2626872471U, // <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5>
+ 2765737230U, // <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3>
+ 3700615318U, // <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2>
+ 2626874678U, // <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS
+ 1174441270U, // <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS
+ 1680493878U, // <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 3385051804U, // <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7>
+ 1680493896U, // <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 2248952722U, // <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1>
+ 2302692152U, // <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+ 3382406107U, // <5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2>
+ 3700623874U, // <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6>
+ 2248953040U, // <5,4,6,4>: Cost 3 vmrghw <5,6,7,0>, <4,4,4,4>
+ 1175211318U, // <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+ 3376432280U, // <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6>
+ 2729218934U, // <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5>
+ 1175211561U, // <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS
+ 3787035642U, // <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2>
+ 3365822501U, // <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1>
+ 3808933085U, // <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4>
+ 3784381707U, // <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5>
+ 2713294182U, // <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6>
+ 2309998286U, // <5,4,7,5>: Cost 3 vmrglw <4,6,5,7>, <2,3,4,5>
+ 3383740111U, // <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6>
+ 3787036239U, // <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5>
+ 2731873960U, // <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4>
+ 2626895974U, // <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS
+ 2626897050U, // <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u>
+ 2644813518U, // <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5>
+ 2705327822U, // <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+ 2626899254U, // <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS
+ 1707331102U, // <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5>
+ 1680494121U, // <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 2737183024U, // <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4>
+ 1680494139U, // <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 2302642684U, // <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0>
+ 1640218726U, // <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+ 3376384510U, // <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2>
+ 3376385078U, // <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3>
+ 2754236002U, // <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1>
+ 2717942242U, // <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5>
+ 2244907106U, // <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+ 3376385406U, // <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7>
+ 1640219293U, // <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+ 2305969365U, // <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0>
+ 1237536282U, // <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+ 2713961366U, // <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0>
+ 3766469630U, // <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5>
+ 2782326455U, // <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5>
+ 2311277786U, // <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5>
+ 2311277058U, // <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6>
+ 3385017587U, // <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7>
+ 1237536282U, // <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+ 3376400892U, // <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0>
+ 3827977963U, // <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3>
+ 2302659070U, // <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2>
+ 2765737726U, // <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4>
+ 3839479558U, // <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3>
+ 2781073167U, // <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3>
+ 2713962426U, // <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7>
+ 3376401790U, // <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7>
+ 2769055531U, // <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4>
+ 2713962646U, // <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2>
+ 3765143786U, // <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5>
+ 3839479621U, // <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3>
+ 2289394603U, // <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, <1,2,5,3>
+ 2713963010U, // <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6>
+ 2313285150U, // <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5>
+ 3363138050U, // <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6>
+ 3363136755U, // <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7>
+ 2713963294U, // <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2>
+ 2713963410U, // <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1>
+ 3827978127U, // <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5>
+ 3839479704U, // <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5>
+ 3376417846U, // <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, <4,2,5,3>
+ 1637567706U, // <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5>
+ 1640222006U, // <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS
+ 2310640998U, // <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6>
+ 3376418174U, // <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7>
+ 1640222238U, // <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5>
+ 1577091174U, // <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+ 2311310226U, // <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1>
+ 2713964303U, // <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3>
+ 2311311119U, // <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3>
+ 1577094454U, // <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+ 296144182U, // <5,5,5,5>: Cost 1 vspltisw1 RHS
+ 2311309826U, // <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6>
+ 2311311447U, // <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7>
+ 296144182U, // <5,5,5,u>: Cost 1 vspltisw1 RHS
+ 2248953460U, // <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1>
+ 2326580114U, // <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1>
+ 2713965050U, // <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3>
+ 3700697602U, // <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6>
+ 2785644620U, // <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5>
+ 2781073495U, // <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7>
+ 1228950018U, // <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 2713965390U, // <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1>
+ 1228950018U, // <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 2713965562U, // <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2>
+ 3383741330U, // <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1>
+ 3718620878U, // <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5>
+ 3365823403U, // <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3>
+ 2713965926U, // <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6>
+ 2717947318U, // <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5>
+ 3365825026U, // <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6>
+ 2292081907U, // <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7>
+ 2713966210U, // <5,5,7,u>: Cost 3 vsldoi8 <4,u,5,5>, <7,u,1,2>
+ 1577091174U, // <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+ 1640224558U, // <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+ 2713966469U, // <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0>
+ 2713966524U, // <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1>
+ 1577094454U, // <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+ 296144182U, // <5,5,u,5>: Cost 1 vspltisw1 RHS
+ 1228950018U, // <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 2713966848U, // <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1>
+ 296144182U, // <5,5,u,u>: Cost 1 vspltisw1 RHS
+ 2705342464U, // <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0>
+ 1631600742U, // <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+ 3773112493U, // <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2>
+ 2705342720U, // <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4>
+ 2705342802U, // <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5>
+ 3779084708U, // <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6>
+ 3779084790U, // <5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7>
+ 2302643510U, // <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+ 1631601309U, // <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+ 3767141092U, // <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2>
+ 2705343284U, // <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1>
+ 2705343382U, // <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0>
+ 3779085282U, // <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4>
+ 2693399632U, // <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6>
+ 3767805089U, // <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6>
+ 2311279416U, // <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6>
+ 1237536054U, // <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+ 1237536055U, // <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS
+ 3773113789U, // <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, <2,0,1,2>
+ 3779085855U, // <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1>
+ 2699372136U, // <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2>
+ 2705344166U, // <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1>
+ 2699372329U, // <5,6,2,4>: Cost 3 vsldoi8 <2,4,5,6>, <2,4,5,6>
+ 2705344360U, // <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6>
+ 2705344442U, // <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7>
+ 2302659894U, // <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+ 2702026861U, // <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6>
+ 2705344662U, // <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2>
+ 3767142661U, // <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5>
+ 3773114689U, // <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2>
+ 2705344924U, // <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3>
+ 1631603202U, // <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6>
+ 3842945597U, // <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7>
+ 3779086962U, // <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1>
+ 2289397046U, // <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+ 1634257734U, // <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6>
+ 2644926566U, // <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS
+ 3779087306U, // <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3>
+ 2790142577U, // <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5>
+ 2644929026U, // <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6>
+ 2711317723U, // <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6>
+ 1631604022U, // <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+ 2712644989U, // <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6>
+ 2302676278U, // <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+ 1631604265U, // <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+ 3842945708U, // <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1>
+ 3767144133U, // <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1>
+ 2705346328U, // <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3>
+ 3779088207U, // <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4>
+ 2717290420U, // <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6>
+ 2705346574U, // <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6>
+ 2705346596U, // <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1>
+ 1237568822U, // <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+ 1237568823U, // <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS
+ 2650914918U, // <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS
+ 3364490949U, // <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1>
+ 2248954362U, // <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3>
+ 2302693144U, // <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3>
+ 2650918198U, // <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS
+ 2650918926U, // <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6>
+ 2302693390U, // <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6>
+ 1228950838U, // <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+ 1228950839U, // <5,6,6,u>: Cost 2 vmrglw <3,4,5,6>, RHS
+ 497467494U, // <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+ 1571210036U, // <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+ 1571210856U, // <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1571211414U, // <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 497470774U, // <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+ 1571213316U, // <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+ 1571213818U, // <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+ 1571214956U, // <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7>
+ 497473326U, // <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS
+ 497475686U, // <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+ 1631606574U, // <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+ 1571219048U, // <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1571219606U, // <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 497478967U, // <5,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+ 1631606938U, // <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+ 1571222010U, // <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+ 1228967222U, // <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+ 497481518U, // <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS
+ 3768475648U, // <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0>
+ 2694733926U, // <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+ 3718711395U, // <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5>
+ 3384349178U, // <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3>
+ 2694734162U, // <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5>
+ 3384347884U, // <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5>
+ 3730658026U, // <5,7,0,6>: Cost 4 vsldoi4 <6,5,7,0>, <6,5,7,0>
+ 3718714362U, // <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2>
+ 2694734493U, // <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+ 2311278690U, // <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0>
+ 2305970923U, // <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1>
+ 3768476566U, // <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0>
+ 2311279098U, // <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3>
+ 2311278694U, // <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4>
+ 3768476783U, // <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1>
+ 2694735091U, // <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7>
+ 2311279426U, // <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7>
+ 2696062357U, // <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7>
+ 3383701602U, // <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0>
+ 3768477219U, // <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5>
+ 3768477288U, // <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2>
+ 2309960186U, // <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+ 3383701606U, // <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4>
+ 3768477545U, // <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7>
+ 3766486970U, // <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7>
+ 3383702338U, // <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7>
+ 2309960186U, // <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+ 3768477846U, // <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2>
+ 3768477975U, // <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5>
+ 3786393932U, // <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4>
+ 3768478108U, // <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3>
+ 2795599115U, // <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5>
+ 3385037470U, // <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5>
+ 3780422309U, // <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7>
+ 3848107301U, // <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4>
+ 2795894063U, // <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5>
+ 2795967800U, // <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5>
+ 3768478690U, // <5,7,4,1>: Cost 4 vsldoi8 <1,6,5,7>, <4,1,5,0>
+ 3718744163U, // <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, <2,u,4,5>
+ 3784404107U, // <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7>
+ 2796262748U, // <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5>
+ 2694737206U, // <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+ 2712653182U, // <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7>
+ 2713316815U, // <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7>
+ 2694737449U, // <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+ 2311311458U, // <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0>
+ 3768479433U, // <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5>
+ 3768479521U, // <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3>
+ 2311311866U, // <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3>
+ 2311311462U, // <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4>
+ 2248185270U, // <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5>
+ 2718625879U, // <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7>
+ 2311312194U, // <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7>
+ 2311311466U, // <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u>
+ 2248954874U, // <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2>
+ 3322696778U, // <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1>
+ 2248955028U, // <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+ 2656963074U, // <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6>
+ 2248955238U, // <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6>
+ 2248955329U, // <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7>
+ 2656965360U, // <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6>
+ 2248955500U, // <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7>
+ 2248955522U, // <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2>
+ 3718766694U, // <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS
+ 3724739827U, // <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7>
+ 3718768739U, // <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5>
+ 3365826337U, // <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3>
+ 2798253647U, // <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5>
+ 3365826258U, // <5,7,7,5>: Cost 4 vmrglw <1,6,5,7>, <5,1,7,5>
+ 3730715377U, // <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7>
+ 2310665836U, // <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7>
+ 2798548595U, // <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5>
+ 2311336034U, // <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0>
+ 2694739758U, // <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+ 2248955028U, // <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+ 2311336442U, // <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3>
+ 2311336038U, // <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4>
+ 2694740122U, // <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+ 2656981746U, // <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u>
+ 2311336770U, // <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7>
+ 2694740325U, // <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+ 2705358848U, // <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0>
+ 1631617126U, // <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+ 2310607866U, // <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+ 2302640284U, // <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+ 2754238189U, // <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1>
+ 2305296114U, // <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5>
+ 2244907106U, // <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+ 2302643528U, // <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+ 1631617693U, // <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+ 2627133542U, // <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS
+ 1237536282U, // <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+ 1680496430U, // <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 1237532828U, // <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+ 2693416018U, // <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u>
+ 2756892486U, // <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0>
+ 2694743284U, // <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u>
+ 1237536072U, // <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+ 1680496484U, // <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 2311288709U, // <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+ 2245883694U, // <5,u,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+ 2699388520U, // <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2>
+ 2754238344U, // <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3>
+ 2699388715U, // <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u>
+ 2757408666U, // <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3>
+ 2705360826U, // <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7>
+ 2302659912U, // <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+ 2754238389U, // <5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3>
+ 2754238396U, // <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1>
+ 3827980229U, // <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1>
+ 2644625102U, // <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+ 2289393820U, // <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+ 1631619588U, // <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u>
+ 2785056749U, // <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5>
+ 3363138077U, // <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6>
+ 2289397064U, // <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+ 1634274120U, // <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u>
+ 1634937753U, // <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u>
+ 1728272410U, // <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5>
+ 2710006843U, // <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u>
+ 2765740076U, // <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5>
+ 1637592285U, // <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u>
+ 1631620406U, // <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+ 2712661375U, // <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u>
+ 2302676296U, // <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+ 1631620649U, // <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+ 1577091174U, // <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+ 1174443822U, // <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+ 2766035058U, // <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3>
+ 1237565596U, // <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+ 1577094454U, // <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+ 296144182U, // <5,u,5,5>: Cost 1 vspltisw1 RHS
+ 1680496794U, // <5,u,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 1237568840U, // <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+ 296144182U, // <5,u,5,u>: Cost 1 vspltisw1 RHS
+ 2633146470U, // <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS
+ 1175213870U, // <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+ 2633148309U, // <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6>
+ 1228947612U, // <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+ 2633149750U, // <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS
+ 1175214234U, // <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+ 1228950018U, // <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 1228950856U, // <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+ 1228947617U, // <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+ 497614950U, // <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+ 1571357492U, // <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+ 1571358312U, // <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1571358870U, // <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 497618248U, // <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+ 1571360772U, // <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+ 1571361274U, // <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+ 1571361786U, // <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2>
+ 497620782U, // <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS
+ 497623142U, // <5,u,u,0>: Cost 1 vsldoi4 RHS, LHS
+ 1631622958U, // <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+ 1680496997U, // <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 1228963996U, // <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+ 497626441U, // <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS
+ 296144182U, // <5,u,u,5>: Cost 1 vspltisw1 RHS
+ 1680497037U, // <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 1228967240U, // <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+ 497628974U, // <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS
+ 2772451328U, // <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0>
+ 2772451338U, // <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1>
+ 3771146417U, // <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6>
+ 3383095739U, // <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3>
+ 3846193189U, // <6,0,0,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,0,4,1>
+ 3724832803U, // <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0>
+ 3383095985U, // <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6>
+ 3383096067U, // <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7>
+ 2772451401U, // <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1>
+ 2651095142U, // <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS
+ 2251612262U, // <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS
+ 1698709606U, // <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2651097602U, // <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6>
+ 2651098422U, // <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS
+ 2651099172U, // <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1>
+ 2657071869U, // <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1>
+ 3724841978U, // <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2>
+ 1698709660U, // <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2252292096U, // <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0>
+ 1178550374U, // <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+ 3826655418U, // <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6>
+ 3777783485U, // <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6>
+ 2252292434U, // <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5>
+ 3785746280U, // <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6>
+ 2252292593U, // <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2>
+ 3736794583U, // <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2>
+ 1178550941U, // <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS
+ 3375153152U, // <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0>
+ 2772451584U, // <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4>
+ 3777784163U, // <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0>
+ 3846193426U, // <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4>
+ 2712005122U, // <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6>
+ 3724857382U, // <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3>
+ 3802335864U, // <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7>
+ 3801672410U, // <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6>
+ 2772451647U, // <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4>
+ 3383123968U, // <6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0>
+ 2772451666U, // <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5>
+ 3773803577U, // <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6>
+ 3724864002U, // <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6>
+ 3846193517U, // <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5>
+ 2712005935U, // <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0>
+ 3327009265U, // <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2>
+ 3383126648U, // <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7>
+ 2772451729U, // <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5>
+ 3373178880U, // <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0>
+ 2254266470U, // <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS
+ 3785748248U, // <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3>
+ 3790393190U, // <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0>
+ 3328000338U, // <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5>
+ 3785748494U, // <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6>
+ 3785748516U, // <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1>
+ 3379153528U, // <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7>
+ 2254267037U, // <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS
+ 2254897152U, // <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0>
+ 1181155430U, // <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+ 3785748923U, // <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3>
+ 3785749042U, // <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5>
+ 2254897490U, // <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5>
+ 3785749169U, // <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6>
+ 2724614962U, // <6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0>
+ 3787739982U, // <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1>
+ 1181155997U, // <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS
+ 1235664896U, // <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+ 1235666598U, // <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+ 3712943720U, // <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2>
+ 2639202936U, // <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7>
+ 2639203638U, // <6,0,7,4>: Cost 3 vsldoi4 <3,6,0,7>, RHS
+ 2309409236U, // <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+ 3712946517U, // <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0>
+ 2309409400U, // <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+ 1235666605U, // <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+ 1235673088U, // <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+ 1235674790U, // <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+ 1698710173U, // <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2639211129U, // <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u>
+ 2639211830U, // <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS
+ 2712008858U, // <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS
+ 2657129220U, // <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u>
+ 2309417592U, // <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+ 1698710227U, // <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 3775799296U, // <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0>
+ 2702057574U, // <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+ 3373143763U, // <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2>
+ 3695045122U, // <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6>
+ 3775799634U, // <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5>
+ 3383091538U, // <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5>
+ 3368493233U, // <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6>
+ 3362522319U, // <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7>
+ 2702058141U, // <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+ 3834250027U, // <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1>
+ 2772452148U, // <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+ 3832038210U, // <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6>
+ 3373150660U, // <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3>
+ 3834250067U, // <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5>
+ 3373146450U, // <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5>
+ 3826656102U, // <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6>
+ 3362530511U, // <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7>
+ 2772452148U, // <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+ 2669092966U, // <6,1,2,0>: Cost 3 vsldoi4 <u,6,1,2>, LHS
+ 2252292916U, // <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1>
+ 2252293014U, // <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0>
+ 2772452246U, // <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0>
+ 2669096246U, // <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS
+ 3846194091U, // <6,1,2,5>: Cost 4 vsldoi12 <3,4,5,6>, <1,2,5,3>
+ 2702059450U, // <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7>
+ 3870081978U, // <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0>
+ 2702059633U, // <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1>
+ 3775801494U, // <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2>
+ 3777128723U, // <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1>
+ 3775801702U, // <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3>
+ 3775801756U, // <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3>
+ 3775801858U, // <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6>
+ 3375153490U, // <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5>
+ 3826656265U, // <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7>
+ 3775802051U, // <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1>
+ 3775802142U, // <6,1,3,u>: Cost 4 vsldoi8 <2,u,6,1>, <3,u,1,2>
+ 3846194206U, // <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1>
+ 3846194219U, // <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5>
+ 3846194228U, // <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5>
+ 3846194236U, // <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4>
+ 3846194246U, // <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5>
+ 2760508496U, // <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6>
+ 3368526001U, // <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6>
+ 3870082144U, // <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4>
+ 2760729707U, // <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6>
+ 2714668660U, // <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1>
+ 3834619005U, // <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6>
+ 3834692742U, // <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6>
+ 3846194317U, // <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4>
+ 3834840216U, // <6,1,5,4>: Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6>
+ 3834913953U, // <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6>
+ 2719977570U, // <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0>
+ 3367208143U, // <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7>
+ 2719977724U, // <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1>
+ 2669125734U, // <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS
+ 2254897972U, // <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1>
+ 2254898070U, // <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0>
+ 3775803929U, // <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7>
+ 2669129014U, // <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS
+ 2322006354U, // <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5>
+ 2725950264U, // <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6>
+ 3793720142U, // <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1>
+ 2254898556U, // <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0>
+ 2627330150U, // <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS
+ 1235664906U, // <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+ 1235667094U, // <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+ 2309406894U, // <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+ 2627333430U, // <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS
+ 1235665234U, // <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+ 2309406897U, // <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+ 2309407222U, // <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+ 1235664913U, // <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+ 2627338342U, // <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS
+ 1235673098U, // <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+ 1235675286U, // <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+ 2772452732U, // <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0>
+ 2627341622U, // <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS
+ 1235673426U, // <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+ 2309415089U, // <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+ 2309415414U, // <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+ 1235673105U, // <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+ 3324683725U, // <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0>
+ 2725290086U, // <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS
+ 3771162801U, // <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6>
+ 2309349478U, // <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+ 3730951478U, // <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS
+ 3840738784U, // <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1>
+ 3842655721U, // <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1>
+ 3736925671U, // <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0>
+ 2309349483U, // <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS
+ 3367840468U, // <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0>
+ 3325355551U, // <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1>
+ 3373147752U, // <6,2,1,2>: Cost 4 vmrglw <2,u,6,1>, <2,2,2,2>
+ 2299404390U, // <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+ 3701099830U, // <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS
+ 3767846054U, // <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2>
+ 3826656825U, // <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0>
+ 3373147838U, // <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7>
+ 2299404395U, // <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS
+ 2657222758U, // <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS
+ 3771164219U, // <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2>
+ 2766481000U, // <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2>
+ 2772452978U, // <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3>
+ 2657226038U, // <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS
+ 3790407528U, // <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6>
+ 2252294074U, // <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7>
+ 2252294148U, // <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0>
+ 2772453023U, // <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3>
+ 2772453030U, // <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1>
+ 3834250930U, // <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4>
+ 2765596349U, // <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6>
+ 2301411430U, // <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS
+ 2772453070U, // <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5>
+ 2765817560U, // <6,2,3,5>: Cost 3 vsldoi12 <2,3,5,6>, <2,3,5,6>
+ 2252933050U, // <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7>
+ 2796340968U, // <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4>
+ 2766038771U, // <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6>
+ 3725008998U, // <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS
+ 3368530217U, // <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1>
+ 3840222989U, // <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5>
+ 2309382246U, // <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+ 3725012278U, // <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS
+ 2766481193U, // <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6>
+ 3842656049U, // <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5>
+ 3327010820U, // <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0>
+ 2766702404U, // <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6>
+ 3713073254U, // <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS
+ 3789082310U, // <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2>
+ 3840665439U, // <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6>
+ 2766997352U, // <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6>
+ 3713076534U, // <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS
+ 3791736842U, // <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2>
+ 3373180605U, // <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6>
+ 3793064108U, // <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2>
+ 2767366037U, // <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6>
+ 3701137510U, // <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS
+ 3701138647U, // <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6>
+ 2254898792U, // <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2>
+ 1248264294U, // <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+ 3701140790U, // <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS
+ 3725029435U, // <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6>
+ 2254899130U, // <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7>
+ 2725294981U, // <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2>
+ 1248264299U, // <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS
+ 2633375846U, // <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS
+ 2309407468U, // <6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+ 1235666536U, // <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+ 161923174U, // <6,2,7,3>: Cost 1 vmrglw RHS, LHS
+ 2633379126U, // <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS
+ 2309407796U, // <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+ 2309408445U, // <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+ 2309407960U, // <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+ 161923179U, // <6,2,7,u>: Cost 1 vmrglw RHS, LHS
+ 2633384038U, // <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS
+ 2309415660U, // <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+ 1235674728U, // <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+ 161931366U, // <6,2,u,3>: Cost 1 vmrglw RHS, LHS
+ 2633387318U, // <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS
+ 2769135725U, // <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6>
+ 2309416637U, // <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+ 2309416152U, // <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+ 161931371U, // <6,2,u,u>: Cost 1 vmrglw RHS, LHS
+ 3777806336U, // <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0>
+ 2704064614U, // <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+ 3765862577U, // <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6>
+ 3843393708U, // <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6>
+ 2250516994U, // <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6>
+ 3725054014U, // <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0>
+ 3383093096U, // <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6>
+ 3368495034U, // <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7>
+ 2704065181U, // <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+ 2251622550U, // <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+ 3777807156U, // <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1>
+ 3765863348U, // <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3>
+ 3373147762U, // <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3>
+ 3834251525U, // <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5>
+ 3373147683U, // <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5>
+ 3391727545U, // <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6>
+ 2299406266U, // <6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7>
+ 2251622550U, // <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+ 2252294294U, // <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2>
+ 3326036198U, // <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1>
+ 3771836045U, // <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3>
+ 2252294556U, // <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3>
+ 2252294658U, // <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6>
+ 3840739677U, // <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3>
+ 2704066490U, // <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7>
+ 3368511418U, // <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7>
+ 2252294942U, // <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2>
+ 3707158630U, // <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS
+ 3765864692U, // <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6>
+ 2704066918U, // <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3>
+ 2772453788U, // <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3>
+ 2772453799U, // <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,4,5>
+ 3789752888U, // <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6>
+ 3840739770U, // <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6>
+ 2301413306U, // <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7>
+ 2775108043U, // <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5>
+ 2651340902U, // <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS
+ 3846195674U, // <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2>
+ 3845974503U, // <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6>
+ 2651343362U, // <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6>
+ 2651344182U, // <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS
+ 1698712066U, // <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6>
+ 3383125864U, // <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6>
+ 3368527802U, // <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, <2,6,3,7>
+ 1698933277U, // <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6>
+ 3373179798U, // <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0>
+ 3707176179U, // <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7>
+ 2716012312U, // <6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3>
+ 3373180530U, // <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3>
+ 2254309890U, // <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6>
+ 3785773070U, // <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6>
+ 3840739932U, // <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6>
+ 2299439034U, // <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7>
+ 2719994110U, // <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3>
+ 2254899350U, // <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2>
+ 3328641254U, // <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1>
+ 2633443257U, // <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6>
+ 2254899612U, // <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3>
+ 2254899714U, // <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6>
+ 3785773772U, // <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6>
+ 2725966648U, // <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6>
+ 2322007994U, // <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7>
+ 2254899998U, // <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2>
+ 1559707750U, // <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+ 2633450292U, // <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1>
+ 1559709626U, // <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7>
+ 1235666546U, // <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+ 1559711030U, // <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS
+ 2309408291U, // <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+ 2633454152U, // <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0>
+ 1235666874U, // <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+ 1559713582U, // <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+ 1559715942U, // <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+ 2633458484U, // <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1>
+ 1559717819U, // <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u>
+ 1235674738U, // <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+ 1559719222U, // <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS
+ 1701366598U, // <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6>
+ 2633462353U, // <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0>
+ 1235675066U, // <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+ 1559721774U, // <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+ 3785777152U, // <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0>
+ 2712035430U, // <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+ 3771179185U, // <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6>
+ 3846196096U, // <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1>
+ 3785777490U, // <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5>
+ 2250517814U, // <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS
+ 3324259703U, // <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0>
+ 3383092458U, // <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7>
+ 2712035997U, // <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+ 3325356946U, // <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1>
+ 3785777972U, // <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1>
+ 3846196170U, // <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3>
+ 3325365380U, // <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0>
+ 3852168155U, // <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2>
+ 2251615542U, // <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS
+ 3325357432U, // <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1>
+ 3870084088U, // <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4>
+ 2251615785U, // <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS
+ 2252295058U, // <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1>
+ 3771180605U, // <6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4>
+ 3785778792U, // <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2>
+ 3777816253U, // <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6>
+ 2252295376U, // <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4>
+ 1178553654U, // <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+ 2252295545U, // <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2>
+ 3326037448U, // <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0>
+ 1178553897U, // <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS
+ 3785779350U, // <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2>
+ 3383118648U, // <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1>
+ 3777816935U, // <6,4,3,2>: Cost 4 vsldoi8 <3,2,6,4>, <3,2,6,4>
+ 3785779612U, // <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3>
+ 2712037890U, // <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6>
+ 2252754230U, // <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS
+ 3784452764U, // <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7>
+ 3801705178U, // <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6>
+ 2252754473U, // <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS
+ 3787770770U, // <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1>
+ 3383126840U, // <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1>
+ 3327380534U, // <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3>
+ 3784453265U, // <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4>
+ 2253630672U, // <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4>
+ 2778426587U, // <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6>
+ 3383128789U, // <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6>
+ 3381799580U, // <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7>
+ 2778647798U, // <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6>
+ 2651422822U, // <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS
+ 3701277928U, // <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5>
+ 3701278650U, // <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7>
+ 2651425282U, // <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6>
+ 2651426102U, // <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS
+ 2651426892U, // <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5>
+ 1698712886U, // <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 3725169658U, // <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2>
+ 1698712904U, // <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 2254900114U, // <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1>
+ 3389115192U, // <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1>
+ 3785781727U, // <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3>
+ 3785781810U, // <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5>
+ 2254900432U, // <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4>
+ 1181158710U, // <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+ 2254900605U, // <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6>
+ 3787772750U, // <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1>
+ 1181158953U, // <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS
+ 2639495270U, // <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS
+ 2639496090U, // <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4>
+ 3707267011U, // <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7>
+ 2639497884U, // <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7>
+ 1237658832U, // <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+ 1235666638U, // <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+ 3713241753U, // <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0>
+ 2309409436U, // <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+ 1235666641U, // <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+ 2639503462U, // <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS
+ 2639504282U, // <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4>
+ 3701303226U, // <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7>
+ 2639506077U, // <6,4,u,3>: Cost 3 vsldoi4 <3,6,4,u>, <3,6,4,u>
+ 1235676368U, // <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+ 1235674830U, // <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+ 1698713129U, // <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 2309417628U, // <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+ 1698713147U, // <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 3775832064U, // <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0>
+ 2702090342U, // <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+ 3775832241U, // <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6>
+ 3719227906U, // <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6>
+ 3775832402U, // <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5>
+ 3385085146U, // <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5>
+ 2309351938U, // <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6>
+ 3376459134U, // <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7>
+ 2702090909U, // <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+ 3719233546U, // <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1>
+ 3775832884U, // <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1>
+ 3775832982U, // <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0>
+ 3846196909U, // <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4>
+ 3719236984U, // <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1>
+ 3856150209U, // <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6>
+ 3834252997U, // <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1>
+ 3870084817U, // <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4>
+ 3769861532U, // <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5>
+ 2645500006U, // <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS
+ 3719242548U, // <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1>
+ 3775833704U, // <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2>
+ 3775833766U, // <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1>
+ 2645503353U, // <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2>
+ 2252296196U, // <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5>
+ 2702092218U, // <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7>
+ 3719246842U, // <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2>
+ 2702092405U, // <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5>
+ 3775834262U, // <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2>
+ 3777161495U, // <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5>
+ 3775834470U, // <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3>
+ 3775834524U, // <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3>
+ 3775834626U, // <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6>
+ 3385109722U, // <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5>
+ 2309376514U, // <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+ 3775834819U, // <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1>
+ 2309376514U, // <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+ 3719258214U, // <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS
+ 3385117586U, // <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1>
+ 3327242008U, // <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3>
+ 3719260674U, // <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6>
+ 3719261563U, // <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4>
+ 2702093622U, // <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+ 2309384706U, // <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6>
+ 3870085060U, // <6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4>
+ 2702093865U, // <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+ 3719266406U, // <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS
+ 3789106889U, // <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5>
+ 3785789208U, // <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3>
+ 3373183950U, // <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3>
+ 2717355964U, // <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5>
+ 2791772164U, // <6,5,5,5>: Cost 3 vsldoi12 <6,6,6,6>, <5,5,5,5>
+ 2772455438U, // <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6>
+ 3373183549U, // <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7>
+ 2720010496U, // <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5>
+ 2772455460U, // <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1>
+ 2322008978U, // <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1>
+ 3840225335U, // <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2>
+ 2772455490U, // <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4>
+ 2772455500U, // <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5>
+ 2254901252U, // <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5>
+ 2772455520U, // <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7>
+ 2785874024U, // <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6>
+ 2772455532U, // <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1>
+ 2627625062U, // <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS
+ 1235667858U, // <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+ 2309409278U, // <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+ 2309407659U, // <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+ 2627628342U, // <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS
+ 1235668186U, // <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+ 1235667458U, // <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+ 2309407987U, // <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+ 1235667460U, // <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+ 2627633254U, // <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS
+ 1235676050U, // <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+ 2309417470U, // <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+ 2309415851U, // <6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+ 2627636534U, // <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS
+ 1235676378U, // <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+ 1235675650U, // <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+ 2309416179U, // <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+ 1235675652U, // <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+ 2309352751U, // <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0>
+ 1650917478U, // <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+ 2250584570U, // <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3>
+ 3846197554U, // <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1>
+ 2724659538U, // <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5>
+ 3725275225U, // <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0>
+ 2791772493U, // <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1>
+ 2309352758U, // <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+ 1650918045U, // <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+ 3325358368U, // <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1>
+ 2299406449U, // <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1>
+ 2724660118U, // <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0>
+ 3373148518U, // <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3>
+ 3834253712U, // <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5>
+ 3373147953U, // <6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5>
+ 2323297080U, // <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6>
+ 2299407670U, // <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+ 2299407671U, // <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS
+ 2252296489U, // <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1>
+ 3326038394U, // <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1>
+ 1178554874U, // <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+ 2724660902U, // <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1>
+ 2252296817U, // <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5>
+ 3840741864U, // <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3>
+ 2252296976U, // <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2>
+ 2785874426U, // <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3>
+ 1178554874U, // <6,6,2,u>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+ 2724661398U, // <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2>
+ 3375154665U, // <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1>
+ 3375154909U, // <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2>
+ 2301413734U, // <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3>
+ 2772455986U, // <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5>
+ 3375154993U, // <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5>
+ 2323313464U, // <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6>
+ 2301414710U, // <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+ 2301414711U, // <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS
+ 2724662162U, // <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1>
+ 3326939559U, // <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1>
+ 2253271546U, // <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3>
+ 3383127346U, // <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3>
+ 2309385523U, // <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4>
+ 1650920758U, // <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+ 2724662653U, // <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6>
+ 2309385526U, // <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+ 1650921001U, // <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+ 3725312102U, // <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS
+ 3373180393U, // <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1>
+ 3791769368U, // <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3>
+ 3373181286U, // <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3>
+ 3725315382U, // <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS
+ 2299439221U, // <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5>
+ 2724663394U, // <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0>
+ 2299440438U, // <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+ 2299440439U, // <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS
+ 1583808614U, // <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+ 2322010445U, // <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+ 2254574074U, // <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3>
+ 2322010609U, // <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3>
+ 1583811894U, // <6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+ 2322010773U, // <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5>
+ 363253046U, // <6,6,6,6>: Cost 1 vspltisw2 RHS
+ 1248267574U, // <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS
+ 363253046U, // <6,6,6,u>: Cost 1 vspltisw2 RHS
+ 2309410095U, // <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0>
+ 2309408233U, // <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1>
+ 2311402373U, // <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2>
+ 2309409126U, // <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+ 2309410099U, // <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4>
+ 2309408561U, // <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5>
+ 1237660472U, // <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+ 161926454U, // <6,6,7,7>: Cost 1 vmrglw RHS, RHS
+ 161926455U, // <6,6,7,u>: Cost 1 vmrglw RHS, RHS
+ 1583808614U, // <6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+ 1650923310U, // <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+ 1178554874U, // <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+ 2309417318U, // <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+ 1583811894U, // <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+ 1650923674U, // <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+ 363253046U, // <6,6,u,6>: Cost 1 vspltisw2 RHS
+ 161934646U, // <6,6,u,7>: Cost 1 vmrglw RHS, RHS
+ 161934647U, // <6,6,u,u>: Cost 1 vmrglw RHS, RHS
+ 1638318080U, // <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+ 564576358U, // <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+ 2712060077U, // <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+ 2712060156U, // <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+ 1638318418U, // <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+ 1577865314U, // <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0>
+ 2712060406U, // <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+ 2651608058U, // <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2>
+ 564576925U, // <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+ 2712060643U, // <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+ 1638318900U, // <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+ 1638318998U, // <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+ 3766559753U, // <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7>
+ 2712060971U, // <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+ 2712061039U, // <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+ 2712061135U, // <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+ 3373148612U, // <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7>
+ 1638319484U, // <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+ 2712061373U, // <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+ 2712061471U, // <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+ 1638319720U, // <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+ 1638319782U, // <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+ 2712061709U, // <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+ 2712061800U, // <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+ 1638320058U, // <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+ 2252297836U, // <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7>
+ 1638320187U, // <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+ 1638320278U, // <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+ 2712062182U, // <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+ 2712062256U, // <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3>
+ 1638320540U, // <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+ 1638320642U, // <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+ 2712062546U, // <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+ 2712062584U, // <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+ 2712062659U, // <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1>
+ 1638320926U, // <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+ 1638321042U, // <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+ 2712062922U, // <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+ 2712063029U, // <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+ 2712063108U, // <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+ 1638321360U, // <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+ 564579638U, // <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+ 2712063357U, // <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6>
+ 2712063439U, // <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7>
+ 564579881U, // <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+ 2712063560U, // <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+ 2714054287U, // <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+ 2712063742U, // <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+ 3373181295U, // <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3>
+ 2712063924U, // <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+ 1638322180U, // <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+ 1638322274U, // <6,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+ 3373181380U, // <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7>
+ 1640313092U, // <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+ 2712064289U, // <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+ 2712064423U, // <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+ 1638322682U, // <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+ 2712064562U, // <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+ 2712064653U, // <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+ 2712064747U, // <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+ 1638323000U, // <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+ 1638323022U, // <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+ 1638323168U, // <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3>
+ 1237659746U, // <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+ 2309411158U, // <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1>
+ 2639718330U, // <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7>
+ 1235669498U, // <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+ 1237659750U, // <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+ 2309411243U, // <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5>
+ 1583895362U, // <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7>
+ 1235669826U, // <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+ 1235669503U, // <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u>
+ 1638323923U, // <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+ 564582190U, // <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+ 1638324101U, // <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+ 1638324156U, // <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+ 1638324287U, // <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+ 564582554U, // <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+ 1638324432U, // <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+ 1235678018U, // <6,7,u,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+ 564582757U, // <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+ 1638326272U, // <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+ 564584550U, // <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS
+ 2712068269U, // <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+ 2309349532U, // <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+ 1638326610U, // <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+ 1577939051U, // <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0>
+ 2712068598U, // <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+ 2309352776U, // <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+ 564585117U, // <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS
+ 2712068835U, // <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+ 1638327092U, // <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+ 1698715438U, // <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2299404444U, // <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+ 2712069163U, // <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+ 2712069231U, // <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+ 2712069327U, // <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+ 2299407688U, // <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+ 1698715492U, // <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2712069565U, // <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+ 1178556206U, // <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+ 1638327912U, // <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+ 1638327974U, // <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+ 2712069901U, // <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+ 1178556570U, // <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+ 1638328250U, // <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+ 2252298496U, // <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1>
+ 1638328379U, // <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+ 1638328470U, // <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+ 2712070374U, // <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+ 2704107883U, // <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u>
+ 1638328732U, // <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+ 1638328834U, // <6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+ 2712070738U, // <6,u,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+ 2712070776U, // <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+ 2301414728U, // <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+ 1638329118U, // <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+ 1638329234U, // <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+ 2712071114U, // <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+ 2712071221U, // <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+ 2309382300U, // <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+ 1638329552U, // <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+ 564587831U, // <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS
+ 2712071545U, // <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+ 2309385544U, // <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+ 564588073U, // <6,u,4,u>: Cost 1 vsldoi8 RHS, RHS
+ 2712071752U, // <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+ 2714062479U, // <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+ 2712071934U, // <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+ 2299437212U, // <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS
+ 2712072116U, // <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+ 1638330372U, // <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+ 1698715802U, // <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 2299440456U, // <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+ 1698715820U, // <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 1583808614U, // <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+ 1181161262U, // <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+ 1638330874U, // <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+ 1248264348U, // <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+ 1583811894U, // <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+ 1181161626U, // <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+ 363253046U, // <6,u,6,6>: Cost 1 vspltisw2 RHS
+ 1638331214U, // <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+ 363253046U, // <6,u,6,u>: Cost 1 vspltisw2 RHS
+ 1560076390U, // <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS
+ 1235664969U, // <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+ 1560078311U, // <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7>
+ 161923228U, // <6,u,7,3>: Cost 1 vmrglw RHS, LHS
+ 1560079670U, // <6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS
+ 1235665297U, // <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+ 1235667485U, // <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+ 161926472U, // <6,u,7,7>: Cost 1 vmrglw RHS, RHS
+ 161923233U, // <6,u,7,u>: Cost 1 vmrglw RHS, LHS
+ 1560084582U, // <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS
+ 564590382U, // <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS
+ 1560086504U, // <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u>
+ 161931420U, // <6,u,u,3>: Cost 1 vmrglw RHS, LHS
+ 1560087862U, // <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS
+ 564590746U, // <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS
+ 363253046U, // <6,u,u,6>: Cost 1 vspltisw2 RHS
+ 161934664U, // <6,u,u,7>: Cost 1 vmrglw RHS, RHS
+ 161931425U, // <6,u,u,u>: Cost 1 vmrglw RHS, LHS
+ 1705426944U, // <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+ 1705426954U, // <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1>
+ 3713550266U, // <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7>
+ 2316063892U, // <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3>
+ 2779168805U, // <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1>
+ 2663698530U, // <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+ 2657727309U, // <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0>
+ 2316064220U, // <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7>
+ 1705427017U, // <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1>
+ 1583988838U, // <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS
+ 2779168859U, // <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1>
+ 631685222U, // <7,0,1,2>: Cost 1 vsldoi12 RHS, LHS
+ 2639817411U, // <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1>
+ 1583992118U, // <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS
+ 2657734660U, // <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5>
+ 1583993678U, // <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1>
+ 2657735672U, // <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0>
+ 631685276U, // <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS
+ 2779168933U, // <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3>
+ 2767667377U, // <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6>
+ 2718713448U, // <7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2>
+ 2718713510U, // <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1>
+ 3841409228U, // <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6>
+ 3852910802U, // <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3>
+ 2718713786U, // <7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7>
+ 3847160036U, // <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3>
+ 2767667440U, // <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6>
+ 2718714006U, // <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2>
+ 2779169020U, // <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0>
+ 3852910853U, // <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0>
+ 2718714268U, // <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3>
+ 2718714370U, // <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6>
+ 2718714461U, // <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7>
+ 2706770608U, // <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0>
+ 3847160114U, // <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0>
+ 2779169083U, // <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0>
+ 2718714770U, // <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1>
+ 1705427282U, // <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5>
+ 3713583034U, // <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7>
+ 3713583814U, // <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4>
+ 2779169133U, // <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5>
+ 1644973366U, // <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+ 2657760081U, // <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4>
+ 2259468868U, // <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4>
+ 1705427345U, // <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5>
+ 2718715508U, // <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1>
+ 2260123750U, // <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS
+ 3792457451U, // <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3>
+ 3852911024U, // <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0>
+ 2718715836U, // <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5>
+ 2718715908U, // <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5>
+ 1644974178U, // <7,0,5,6>: Cost 2 vsldoi8 <5,6,7,0>, <5,6,7,0>
+ 3792457853U, // <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0>
+ 1646301444U, // <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0>
+ 2720706901U, // <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0>
+ 2779169270U, // <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7>
+ 2718716410U, // <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3>
+ 2722697800U, // <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0>
+ 3852911121U, // <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7>
+ 3852911130U, // <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7>
+ 2718716728U, // <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6>
+ 2718716750U, // <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1>
+ 2779169333U, // <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7>
+ 2718716922U, // <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2>
+ 1187872870U, // <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+ 2718717076U, // <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3>
+ 3847160408U, // <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6>
+ 2718717286U, // <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6>
+ 2718717377U, // <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7>
+ 2718717404U, // <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7>
+ 2718717478U, // <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0>
+ 1187873437U, // <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS
+ 1584046182U, // <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS
+ 1705427602U, // <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1>
+ 631685789U, // <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS
+ 2639874762U, // <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u>
+ 1584049462U, // <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS
+ 1644976282U, // <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+ 1584051029U, // <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u>
+ 2718718208U, // <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1>
+ 631685843U, // <7,0,u,u>: Cost 1 vsldoi12 RHS, LHS
+ 2721374218U, // <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1>
+ 2779169507U, // <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1>
+ 2779169516U, // <7,1,0,2>: Cost 3 vsldoi12 RHS, <1,0,2,1>
+ 3852911348U, // <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0>
+ 2669743414U, // <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS
+ 2316058962U, // <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5>
+ 2316059044U, // <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6>
+ 2669745146U, // <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2>
+ 2779169570U, // <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1>
+ 2779169579U, // <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1>
+ 1705427764U, // <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+ 2779169598U, // <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2>
+ 3713632972U, // <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1>
+ 2779169619U, // <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5>
+ 2779169628U, // <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5>
+ 2657809239U, // <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1>
+ 3835290474U, // <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1>
+ 1705427764U, // <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+ 2779169660U, // <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1>
+ 2779169671U, // <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3>
+ 2779169680U, // <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3>
+ 1705427862U, // <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0>
+ 2779169700U, // <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5>
+ 2779169707U, // <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3>
+ 2657817432U, // <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2>
+ 2803057594U, // <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0>
+ 1705427907U, // <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0>
+ 3776538827U, // <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1>
+ 2319400970U, // <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1>
+ 2316085398U, // <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2>
+ 3852911591U, // <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0>
+ 3852911600U, // <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0>
+ 2319401298U, // <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5>
+ 3833668617U, // <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7>
+ 3367265487U, // <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7>
+ 2319400977U, // <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u>
+ 2724031378U, // <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1>
+ 2779169835U, // <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5>
+ 2779169844U, // <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5>
+ 3852911672U, // <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0>
+ 2669776182U, // <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS
+ 2779169872U, // <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6>
+ 3835290712U, // <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5>
+ 2669778278U, // <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6>
+ 2779169898U, // <7,1,4,u>: Cost 3 vsldoi12 RHS, <1,4,u,5>
+ 2779169903U, // <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1>
+ 3835585661U, // <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6>
+ 3841410182U, // <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6>
+ 3852911753U, // <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0>
+ 2779169943U, // <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5>
+ 2318754130U, // <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5>
+ 2718724195U, // <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1>
+ 3859178670U, // <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1>
+ 2779169975U, // <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1>
+ 2720715094U, // <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1>
+ 2761549007U, // <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7>
+ 2779170008U, // <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7>
+ 3835438305U, // <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7>
+ 3835512042U, // <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7>
+ 2761843955U, // <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7>
+ 3835659516U, // <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7>
+ 2803057918U, // <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0>
+ 2762065166U, // <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7>
+ 2669797478U, // <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS
+ 2322087946U, // <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1>
+ 2317448186U, // <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2>
+ 3395829934U, // <7,1,7,3>: Cost 4 vmrglw <6,6,7,7>, <0,2,1,3>
+ 2669800758U, // <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS
+ 2322088274U, // <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5>
+ 3375923377U, // <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6>
+ 2731996780U, // <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7>
+ 2322087953U, // <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u>
+ 2779170146U, // <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1>
+ 1705427764U, // <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+ 2779170164U, // <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1>
+ 1705428348U, // <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0>
+ 2779170186U, // <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5>
+ 2763171221U, // <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7>
+ 2657866590U, // <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u>
+ 2803058080U, // <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0>
+ 1705428393U, // <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0>
+ 3713695846U, // <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS
+ 2779170237U, // <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2>
+ 2779170245U, // <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1>
+ 1242316902U, // <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+ 3713699126U, // <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS
+ 3852912096U, // <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1>
+ 2767668713U, // <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1>
+ 2256488426U, // <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1>
+ 1242316907U, // <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS
+ 3852912132U, // <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1>
+ 3852912141U, // <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1>
+ 3852912149U, // <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0>
+ 2779170335U, // <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1>
+ 3852912172U, // <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5>
+ 3840747062U, // <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6>
+ 3841410617U, // <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0>
+ 3795125538U, // <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0>
+ 2779170380U, // <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1>
+ 2779170389U, // <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1>
+ 3852912222U, // <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1>
+ 1705428584U, // <7,2,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+ 1705428594U, // <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3>
+ 2779170429U, // <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5>
+ 3852912259U, // <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2>
+ 2767668880U, // <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6>
+ 3841336981U, // <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2>
+ 1705428639U, // <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3>
+ 1705428646U, // <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1>
+ 2779170479U, // <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1>
+ 2767668925U, // <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6>
+ 1245659238U, // <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+ 1705428686U, // <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5>
+ 2779170519U, // <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5>
+ 2657899362U, // <7,2,3,6>: Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3>
+ 2319406574U, // <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7>
+ 1705428718U, // <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1>
+ 3713728614U, // <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS
+ 3852912388U, // <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5>
+ 2779170573U, // <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5>
+ 1242349670U, // <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+ 3713731894U, // <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS
+ 2779170601U, // <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6>
+ 2767669041U, // <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5>
+ 3389834456U, // <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7>
+ 1242349675U, // <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS
+ 3852912456U, // <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1>
+ 3852912466U, // <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2>
+ 3852912475U, // <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2>
+ 2779170664U, // <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6>
+ 3852912496U, // <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5>
+ 3792474116U, // <7,2,5,5>: Cost 4 vsldoi8 <5,6,7,2>, <5,5,5,5>
+ 2718732388U, // <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2>
+ 3841337228U, // <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6>
+ 2779170709U, // <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6>
+ 2640003174U, // <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS
+ 2721386920U, // <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2>
+ 2767595441U, // <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7>
+ 1693927354U, // <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7>
+ 2640006454U, // <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS
+ 3841558476U, // <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7>
+ 2657923941U, // <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6>
+ 3841337310U, // <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7>
+ 1694296039U, // <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7>
+ 2803058666U, // <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1>
+ 3852912632U, // <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6>
+ 2322089576U, // <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2>
+ 1248346214U, // <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+ 3841337362U, // <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5>
+ 3395830836U, // <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5>
+ 2261616570U, // <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7>
+ 3371943857U, // <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7>
+ 1248346219U, // <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS
+ 1705429051U, // <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1>
+ 2779170884U, // <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1>
+ 1705428584U, // <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+ 1695254620U, // <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7>
+ 1705429091U, // <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5>
+ 2779170924U, // <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5>
+ 2767669361U, // <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1>
+ 2803058809U, // <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0>
+ 1695623305U, // <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7>
+ 2779170955U, // <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0>
+ 1705429142U, // <7,3,0,1>: Cost 2 vsldoi12 RHS, <3,0,1,2>
+ 2634057732U, // <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0>
+ 2779170983U, // <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1>
+ 2779170992U, // <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1>
+ 3852912829U, // <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5>
+ 2657948520U, // <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0>
+ 2316060602U, // <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7>
+ 1705429205U, // <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2>
+ 3852912860U, // <7,3,1,0>: Cost 4 vsldoi12 RHS, <3,1,0,0>
+ 2779171046U, // <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1>
+ 2779171057U, // <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3>
+ 3852912887U, // <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0>
+ 3852912896U, // <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0>
+ 3852912905U, // <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0>
+ 3835291923U, // <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1>
+ 3841411356U, // <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1>
+ 2779171111U, // <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3>
+ 2779171120U, // <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3>
+ 3852912952U, // <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2>
+ 2779171137U, // <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2>
+ 2779171144U, // <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0>
+ 2779171156U, // <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3>
+ 3852912989U, // <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3>
+ 2767669606U, // <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3>
+ 2767669615U, // <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3>
+ 2779171189U, // <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0>
+ 2779171198U, // <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0>
+ 3852913032U, // <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1>
+ 2704140655U, // <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3>
+ 1705429404U, // <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+ 2779171238U, // <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4>
+ 3852913070U, // <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3>
+ 2657973099U, // <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3>
+ 2767669700U, // <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7>
+ 1705429404U, // <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+ 2779171280U, // <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1>
+ 2779171290U, // <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2>
+ 2634090504U, // <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4>
+ 2779171311U, // <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5>
+ 2779171319U, // <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4>
+ 1705429506U, // <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6>
+ 2722057593U, // <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2>
+ 2316093370U, // <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7>
+ 1705429533U, // <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6>
+ 3852913185U, // <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1>
+ 3795799695U, // <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1>
+ 3852913203U, // <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1>
+ 3852913214U, // <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3>
+ 3852913225U, // <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5>
+ 2779171410U, // <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5>
+ 2718740581U, // <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3>
+ 3841411685U, // <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6>
+ 2720067847U, // <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3>
+ 2773420664U, // <7,3,6,0>: Cost 3 vsldoi12 <3,6,0,7>, <3,6,0,7>
+ 3847236225U, // <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7>
+ 1648316922U, // <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3>
+ 2773641875U, // <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7>
+ 2773715612U, // <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7>
+ 3847531173U, // <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7>
+ 2722059024U, // <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2>
+ 2767669943U, // <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7>
+ 1652298720U, // <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3>
+ 2767669955U, // <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1>
+ 3841411788U, // <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1>
+ 2767669978U, // <7,3,7,2>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,2,6>
+ 2722059546U, // <7,3,7,3>: Cost 3 vsldoi8 <6,2,7,3>, <7,3,6,2>
+ 2767669995U, // <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5>
+ 3852913396U, // <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5>
+ 2722059758U, // <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7>
+ 2302183354U, // <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7>
+ 2767670027U, // <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1>
+ 2774747930U, // <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7>
+ 1705429790U, // <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2>
+ 1660262316U, // <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3>
+ 1705429404U, // <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+ 2775042878U, // <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7>
+ 1705429830U, // <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6>
+ 2779171660U, // <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3>
+ 2767670101U, // <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3>
+ 1705429853U, // <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2>
+ 2718744576U, // <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0>
+ 1645002854U, // <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+ 3852913527U, // <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1>
+ 3852913536U, // <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1>
+ 2316061904U, // <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4>
+ 1705429906U, // <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+ 2658022257U, // <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0>
+ 2256489928U, // <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+ 1707420589U, // <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1>
+ 3852913590U, // <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1>
+ 2718745396U, // <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1>
+ 2779171786U, // <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3>
+ 3852913616U, // <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0>
+ 3852913627U, // <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2>
+ 2779171810U, // <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0>
+ 3792487631U, // <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7>
+ 3394456220U, // <7,4,1,7>: Cost 4 vmrglw <6,4,7,1>, <3,6,4,7>
+ 2779171837U, // <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0>
+ 3852913673U, // <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3>
+ 3852913682U, // <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3>
+ 2718746216U, // <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2>
+ 2718746278U, // <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1>
+ 2779171885U, // <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3>
+ 2779171893U, // <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2>
+ 2718746554U, // <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7>
+ 3847457864U, // <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3>
+ 2779171921U, // <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3>
+ 2718746774U, // <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2>
+ 3852913762U, // <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2>
+ 3852913772U, // <7,4,3,2>: Cost 4 vsldoi12 RHS, <4,3,2,3>
+ 2718747036U, // <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3>
+ 2718747138U, // <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6>
+ 2779171972U, // <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0>
+ 2706803380U, // <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4>
+ 3847457946U, // <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4>
+ 2781162655U, // <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0>
+ 2718747538U, // <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1>
+ 3852913842U, // <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1>
+ 3852913852U, // <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2>
+ 2316096696U, // <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3>
+ 1705430224U, // <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+ 1705430234U, // <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5>
+ 2658055029U, // <7,4,4,6>: Cost 3 vsldoi4 <6,7,4,4>, <6,7,4,4>
+ 2316097024U, // <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7>
+ 1707420917U, // <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5>
+ 1584316518U, // <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS
+ 2658059060U, // <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1>
+ 2640144314U, // <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7>
+ 2640145131U, // <7,4,5,3>: Cost 3 vsldoi4 <3,7,4,5>, <3,7,4,5>
+ 1584319798U, // <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS
+ 2779172134U, // <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0>
+ 631688502U, // <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS
+ 2658063354U, // <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2>
+ 631688520U, // <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS
+ 3852914001U, // <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7>
+ 3852914010U, // <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7>
+ 2718749178U, // <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3>
+ 2722730572U, // <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4>
+ 2723394205U, // <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4>
+ 2779172221U, // <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6>
+ 2718749496U, // <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6>
+ 2718749518U, // <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1>
+ 2779172249U, // <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7>
+ 2718749690U, // <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2>
+ 3847458214U, // <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2>
+ 2718749880U, // <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3>
+ 3847458236U, // <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6>
+ 2718750004U, // <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1>
+ 1187876150U, // <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+ 2718750208U, // <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7>
+ 2718750286U, // <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4>
+ 1187876393U, // <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS
+ 1584341094U, // <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS
+ 1645008686U, // <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+ 2640168890U, // <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7>
+ 2640169710U, // <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u>
+ 1584344374U, // <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS
+ 1705430554U, // <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1>
+ 631688745U, // <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS
+ 2718750976U, // <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1>
+ 631688763U, // <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS
+ 2646147174U, // <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS
+ 2779172424U, // <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2>
+ 3852914258U, // <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3>
+ 3852914268U, // <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4>
+ 2779172450U, // <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1>
+ 2316061914U, // <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5>
+ 2316061186U, // <7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6>
+ 2646152186U, // <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2>
+ 2779172486U, // <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1>
+ 2781163151U, // <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1>
+ 2321378194U, // <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1>
+ 3852914339U, // <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3>
+ 3852914350U, // <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5>
+ 2781163191U, // <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5>
+ 3852914363U, // <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0>
+ 3835588297U, // <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5>
+ 3835588306U, // <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5>
+ 2781163223U, // <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1>
+ 3852914400U, // <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1>
+ 2781163243U, // <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3>
+ 3852914419U, // <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2>
+ 2779172606U, // <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4>
+ 3780552497U, // <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5>
+ 2781163279U, // <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+ 2779172632U, // <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3>
+ 3835588385U, // <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3>
+ 2779172650U, // <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3>
+ 3852914481U, // <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1>
+ 2319403922U, // <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1>
+ 2319404409U, // <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+ 3852914510U, // <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3>
+ 3779226131U, // <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5>
+ 2319404250U, // <7,5,3,5>: Cost 3 vmrglw <6,2,7,3>, <4,4,5,5>
+ 2319403522U, // <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6>
+ 3852914547U, // <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4>
+ 2319403524U, // <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u>
+ 2646179942U, // <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS
+ 2316094354U, // <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1>
+ 3852914582U, // <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3>
+ 3852914592U, // <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4>
+ 2646183372U, // <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4>
+ 2779172788U, // <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6>
+ 2316093954U, // <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6>
+ 2646185318U, // <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6>
+ 2779172815U, // <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6>
+ 2781163475U, // <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1>
+ 2781163484U, // <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1>
+ 3852914662U, // <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2>
+ 3852914672U, // <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3>
+ 2781163515U, // <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5>
+ 1705431044U, // <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+ 2779172878U, // <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6>
+ 3835588632U, // <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7>
+ 1705431044U, // <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+ 2779172900U, // <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1>
+ 2781163571U, // <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7>
+ 3852914743U, // <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2>
+ 2779172930U, // <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4>
+ 2779172940U, // <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5>
+ 2781163607U, // <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+ 2779172960U, // <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7>
+ 1705431138U, // <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0>
+ 1705578603U, // <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0>
+ 2646204518U, // <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+ 2322090898U, // <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1>
+ 3719947880U, // <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2>
+ 3719948438U, // <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2>
+ 2646207951U, // <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7>
+ 2322091226U, // <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5>
+ 2322090498U, // <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6>
+ 2646210156U, // <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7>
+ 2646210350U, // <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+ 2779173062U, // <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1>
+ 2779173072U, // <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2>
+ 2319404409U, // <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+ 2779173092U, // <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4>
+ 2779173101U, // <7,5,u,4>: Cost 3 vsldoi12 RHS, <5,u,4,4>
+ 1705431044U, // <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+ 2779173118U, // <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3>
+ 1705578756U, // <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0>
+ 1707421965U, // <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0>
+ 3852914966U, // <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0>
+ 2779173153U, // <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2>
+ 2256491002U, // <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3>
+ 3852914994U, // <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1>
+ 3852915003U, // <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1>
+ 2316062652U, // <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+ 2316063544U, // <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6>
+ 1242320182U, // <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+ 1242320183U, // <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS
+ 3852915048U, // <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1>
+ 3377866217U, // <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1>
+ 3852915068U, // <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3>
+ 3833672072U, // <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6>
+ 3852915088U, // <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5>
+ 3395122056U, // <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5>
+ 3389813560U, // <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6>
+ 2779173287U, // <7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1>
+ 2779320752U, // <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1>
+ 2658181222U, // <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS
+ 3852915140U, // <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3>
+ 2257973754U, // <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3>
+ 3841413589U, // <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2>
+ 2658184502U, // <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS
+ 3852915176U, // <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3>
+ 2658186117U, // <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2>
+ 1705431546U, // <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3>
+ 1705579011U, // <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3>
+ 3714015334U, // <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS
+ 3777243425U, // <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6>
+ 2319405957U, // <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2>
+ 3375229286U, // <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3>
+ 2779173426U, // <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5>
+ 3375228721U, // <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5>
+ 2319405880U, // <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6>
+ 1245662518U, // <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+ 1245662519U, // <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS
+ 3852915291U, // <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1>
+ 3389834729U, // <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1>
+ 2259472890U, // <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3>
+ 3852915321U, // <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4>
+ 3852915330U, // <7,6,4,4>: Cost 4 vsldoi12 RHS, <6,4,4,4>
+ 2779173517U, // <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6>
+ 2316096312U, // <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6>
+ 1242352950U, // <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+ 1242352951U, // <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS
+ 3852915372U, // <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1>
+ 3835294392U, // <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4>
+ 3852915395U, // <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6>
+ 3852915404U, // <7,6,5,3>: Cost 4 vsldoi12 RHS, <6,5,3,6>
+ 3852915412U, // <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5>
+ 3377899313U, // <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5>
+ 2718765160U, // <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6>
+ 2779173611U, // <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1>
+ 2779321076U, // <7,6,5,u>: Cost 3 vsldoi12 RHS, <6,5,u,1>
+ 2658213990U, // <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS
+ 3852915462U, // <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1>
+ 2718765562U, // <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3>
+ 3714042622U, // <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6>
+ 2658217270U, // <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS
+ 2724074224U, // <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6>
+ 1705431864U, // <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+ 1705431874U, // <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7>
+ 1705579339U, // <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7>
+ 1705431886U, // <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1>
+ 2779173719U, // <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1>
+ 2779173729U, // <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2>
+ 2779173736U, // <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0>
+ 1705431926U, // <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5>
+ 2779173759U, // <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5>
+ 2779173765U, // <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2>
+ 1248349494U, // <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS
+ 1705431958U, // <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1>
+ 1705579423U, // <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1>
+ 2779173801U, // <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2>
+ 2779321266U, // <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2>
+ 2779321273U, // <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0>
+ 1705579463U, // <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5>
+ 2779173841U, // <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6>
+ 1705431864U, // <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+ 1705432032U, // <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3>
+ 1705579495U, // <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1>
+ 1242320994U, // <7,7,0,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+ 1705432058U, // <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2>
+ 3841414146U, // <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1>
+ 2316063226U, // <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3>
+ 2779173908U, // <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1>
+ 2658242658U, // <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0>
+ 2658243468U, // <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0>
+ 2316063554U, // <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7>
+ 1705432121U, // <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2>
+ 3852915777U, // <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1>
+ 2779173962U, // <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1>
+ 2779173973U, // <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3>
+ 3389813242U, // <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3>
+ 3852915813U, // <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1>
+ 3852915821U, // <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0>
+ 3835294839U, // <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1>
+ 2329343596U, // <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7>
+ 2779174027U, // <7,7,1,u>: Cost 3 vsldoi12 RHS, <7,1,u,3>
+ 2803061908U, // <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3>
+ 3852915869U, // <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3>
+ 2779174053U, // <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2>
+ 2779174060U, // <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0>
+ 2803061944U, // <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3>
+ 3852915905U, // <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3>
+ 2767672522U, // <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3>
+ 2791855315U, // <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3>
+ 2768999644U, // <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3>
+ 2779174115U, // <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1>
+ 3852915948U, // <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1>
+ 3841414394U, // <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6>
+ 1245663738U, // <7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+ 2779174155U, // <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5>
+ 3852915988U, // <7,7,3,5>: Cost 4 vsldoi12 RHS, <7,3,5,5>
+ 2706827959U, // <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7>
+ 2319405890U, // <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7>
+ 1245663738U, // <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+ 2779174200U, // <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5>
+ 3852916030U, // <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2>
+ 3714099130U, // <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7>
+ 2316095994U, // <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3>
+ 1242353766U, // <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+ 1705432422U, // <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6>
+ 2658276240U, // <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4>
+ 2316096322U, // <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7>
+ 1705432449U, // <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6>
+ 3852916101U, // <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1>
+ 3854906765U, // <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0>
+ 3852916121U, // <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3>
+ 3389846010U, // <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3>
+ 3852916141U, // <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5>
+ 2779174326U, // <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5>
+ 2779174337U, // <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7>
+ 2329376364U, // <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7>
+ 2779321811U, // <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7>
+ 2658287718U, // <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS
+ 3852916197U, // <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7>
+ 2779174382U, // <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7>
+ 2316112378U, // <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3>
+ 2658290998U, // <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS
+ 3852916233U, // <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7>
+ 1651004226U, // <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7>
+ 2779174420U, // <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0>
+ 1652331492U, // <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7>
+ 1590526054U, // <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS
+ 2328728623U, // <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1>
+ 2724746451U, // <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3>
+ 2322092538U, // <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3>
+ 1590529334U, // <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS
+ 2328728951U, // <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5>
+ 2724746770U, // <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7>
+ 430361910U, // <7,7,7,7>: Cost 1 vspltisw3 RHS
+ 430361910U, // <7,7,7,u>: Cost 1 vspltisw3 RHS
+ 1242320994U, // <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+ 1705580162U, // <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2>
+ 2779321996U, // <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3>
+ 1245663738U, // <7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+ 1242353766U, // <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+ 1705580202U, // <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6>
+ 1662949620U, // <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7>
+ 430361910U, // <7,7,u,7>: Cost 1 vspltisw3 RHS
+ 430361910U, // <7,7,u,u>: Cost 1 vspltisw3 RHS
+ 1705426944U, // <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+ 1705432787U, // <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2>
+ 2316060885U, // <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2>
+ 1242316956U, // <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+ 2779174637U, // <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1>
+ 1182750874U, // <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS
+ 2316061213U, // <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6>
+ 1242320200U, // <7,u,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+ 1705432850U, // <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2>
+ 1584578662U, // <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS
+ 1705427764U, // <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+ 631691054U, // <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS
+ 2640407307U, // <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1>
+ 1584581942U, // <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS
+ 2779174726U, // <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0>
+ 1584583574U, // <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1>
+ 2779322201U, // <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1>
+ 631691108U, // <7,u,1,u>: Cost 1 vsldoi12 RHS, LHS
+ 2779174763U, // <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1>
+ 2779174774U, // <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3>
+ 1705428584U, // <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+ 1705432965U, // <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0>
+ 2779174801U, // <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3>
+ 2779174810U, // <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3>
+ 2767673251U, // <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3>
+ 1705580460U, // <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3>
+ 1705433010U, // <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0>
+ 1705433020U, // <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1>
+ 2779174853U, // <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1>
+ 2767673299U, // <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6>
+ 1245659292U, // <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+ 1705433060U, // <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5>
+ 2779174893U, // <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5>
+ 2706836152U, // <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u>
+ 1245662536U, // <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+ 1705433092U, // <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1>
+ 2779174925U, // <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1>
+ 1185732398U, // <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS
+ 2316093653U, // <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2>
+ 1242349724U, // <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+ 1705430224U, // <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+ 1705433151U, // <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6>
+ 2316093981U, // <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6>
+ 1242352968U, // <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+ 1705433178U, // <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6>
+ 1584611430U, // <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS
+ 2781165670U, // <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0>
+ 2640439226U, // <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7>
+ 2640440079U, // <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5>
+ 1584614710U, // <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS
+ 1705431044U, // <7,u,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+ 631691418U, // <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS
+ 2779322525U, // <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1>
+ 631691436U, // <7,u,5,u>: Cost 1 vsldoi12 RHS, RHS
+ 2779175087U, // <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1>
+ 2779175102U, // <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7>
+ 1648357887U, // <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u>
+ 1705433296U, // <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7>
+ 2779175127U, // <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5>
+ 2779175138U, // <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7>
+ 1651012419U, // <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u>
+ 1705580788U, // <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7>
+ 1705433341U, // <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7>
+ 1705580800U, // <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1>
+ 1187878702U, // <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+ 2768042263U, // <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6>
+ 1248346268U, // <7,u,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+ 1705580840U, // <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5>
+ 1187879066U, // <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+ 2779322679U, // <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2>
+ 430361910U, // <7,u,7,7>: Cost 1 vspltisw3 RHS
+ 430361910U, // <7,u,7,u>: Cost 1 vspltisw3 RHS
+ 1705433425U, // <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1>
+ 1705433435U, // <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2>
+ 631691621U, // <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS
+ 1705433451U, // <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0>
+ 1705433465U, // <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5>
+ 1705433475U, // <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6>
+ 631691661U, // <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS
+ 430361910U, // <7,u,u,7>: Cost 1 vspltisw3 RHS
+ 631691675U, // <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS
+ 202162278U, // <u,0,0,0>: Cost 1 vspltisw0 LHS
+ 1678598154U, // <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+ 2634500154U, // <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0>
+ 2289596269U, // <u,0,0,3>: Cost 3 vmrglw <1,2,u,0>, <u,2,0,3>
+ 1548815670U, // <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS
+ 2663698530U, // <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+ 2658390942U, // <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0>
+ 2289596597U, // <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7>
+ 202162278U, // <u,0,0,u>: Cost 1 vspltisw0 LHS
+ 1560764518U, // <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS
+ 115720294U, // <u,0,1,1>: Cost 1 vmrghw LHS, LHS
+ 604856427U, // <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+ 2634508438U, // <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2>
+ 1560767798U, // <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS
+ 2652426438U, // <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1>
+ 1584657311U, // <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1>
+ 2658399226U, // <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2>
+ 604856476U, // <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+ 2696889850U, // <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0>
+ 1190174822U, // <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+ 2692245096U, // <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2>
+ 2692245158U, // <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1>
+ 2263916882U, // <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5>
+ 2299709908U, // <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+ 2692245434U, // <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7>
+ 2701535281U, // <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0>
+ 1190175389U, // <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS
+ 1209237504U, // <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+ 1209239206U, // <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+ 2704189813U, // <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0>
+ 2692245916U, // <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3>
+ 2282981033U, // <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+ 2664386658U, // <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0>
+ 2691877496U, // <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+ 2664388218U, // <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3>
+ 1209239213U, // <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+ 2289623040U, // <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0>
+ 1678598482U, // <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+ 2634532926U, // <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4>
+ 2235580672U, // <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+ 1143619922U, // <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+ 1618505014U, // <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+ 2658423714U, // <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4>
+ 2713259464U, // <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+ 1683243409U, // <u,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+ 1192443904U, // <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+ 118702182U, // <u,0,5,1>: Cost 1 vmrghw RHS, LHS
+ 2266185901U, // <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+ 2640513816U, // <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5>
+ 1192444242U, // <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+ 2718789636U, // <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5>
+ 1645047915U, // <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0>
+ 2664404604U, // <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5>
+ 118702749U, // <u,0,5,u>: Cost 1 vmrghw RHS, LHS
+ 2302910464U, // <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0>
+ 1192886374U, // <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+ 2718790138U, // <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3>
+ 2722771537U, // <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0>
+ 2266628434U, // <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5>
+ 2248950180U, // <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+ 2718790456U, // <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6>
+ 2718790478U, // <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1>
+ 1192886941U, // <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS
+ 1235812352U, // <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+ 1235814054U, // <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+ 2728080601U, // <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0>
+ 2640530202U, // <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7>
+ 2640530742U, // <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS
+ 2309556692U, // <u,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+ 2730735133U, // <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0>
+ 2309556856U, // <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+ 1235814061U, // <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+ 202162278U, // <u,0,u,0>: Cost 1 vspltisw0 LHS
+ 120365158U, // <u,0,u,1>: Cost 1 vmrghw LHS, LHS
+ 604856989U, // <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+ 2692249532U, // <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1>
+ 1560825142U, // <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS
+ 1618507930U, // <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+ 1584714662U, // <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u>
+ 2309565048U, // <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+ 604857043U, // <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+ 1611210825U, // <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1>
+ 1616519270U, // <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS
+ 2287605459U, // <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2>
+ 2640546588U, // <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0>
+ 2622631222U, // <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS
+ 2289590610U, // <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5>
+ 2664436630U, // <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1>
+ 2664437376U, // <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0>
+ 1616519889U, // <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1>
+ 1548894866U, // <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1>
+ 269271142U, // <u,1,1,1>: Cost 1 vspltisw1 LHS
+ 1189462934U, // <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+ 2622638230U, // <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2>
+ 1548897590U, // <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS
+ 2756985692U, // <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+ 2658472872U, // <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1>
+ 2287614142U, // <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7>
+ 269271142U, // <u,1,1,u>: Cost 1 vspltisw1 LHS
+ 1566818406U, // <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS
+ 2756985735U, // <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+ 1148371862U, // <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 835584U, // <u,1,2,3>: Cost 0 copy LHS
+ 1566821686U, // <u,1,2,4>: Cost 2 vsldoi4 <3,u,1,2>, RHS
+ 2756985771U, // <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+ 2690262970U, // <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7>
+ 1590711938U, // <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2>
+ 835584U, // <u,1,2,u>: Cost 0 copy LHS
+ 2282979337U, // <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+ 1209237514U, // <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+ 1209239702U, // <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+ 2282979502U, // <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+ 2282979341U, // <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+ 1209237842U, // <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+ 2282979505U, // <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+ 2287625423U, // <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+ 1209237521U, // <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+ 1635101613U, // <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1>
+ 2289623050U, // <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1>
+ 2289625238U, // <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2>
+ 2640579360U, // <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4>
+ 2622663990U, // <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS
+ 1616522550U, // <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+ 2664469398U, // <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1>
+ 2664470148U, // <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4>
+ 1616522793U, // <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+ 1548927638U, // <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5>
+ 1192444724U, // <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+ 1192444822U, // <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+ 2622670998U, // <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2>
+ 1548930358U, // <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS
+ 1210728786U, // <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+ 2714153058U, // <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0>
+ 2670449658U, // <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2>
+ 1548932910U, // <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS
+ 2622677655U, // <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6>
+ 2756986063U, // <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+ 2302912662U, // <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2>
+ 3696421014U, // <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2>
+ 2622680374U, // <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS
+ 2756986099U, // <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+ 2714153784U, // <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6>
+ 1651692438U, // <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1>
+ 1652356071U, // <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1>
+ 2628657254U, // <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS
+ 1235812362U, // <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+ 1235814550U, // <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+ 2309554350U, // <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+ 2628660534U, // <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS
+ 1235812690U, // <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+ 2309554353U, // <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+ 2309554678U, // <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+ 1235812369U, // <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+ 1548952217U, // <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u>
+ 269271142U, // <u,1,u,1>: Cost 1 vspltisw1 LHS
+ 1209280662U, // <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+ 835584U, // <u,1,u,3>: Cost 0 copy LHS
+ 1548954934U, // <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS
+ 1209278802U, // <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+ 2283020465U, // <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+ 1590761096U, // <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u>
+ 835584U, // <u,1,u,u>: Cost 0 copy LHS
+ 2702876672U, // <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0>
+ 1629134950U, // <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS
+ 2289591912U, // <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2>
+ 1215848550U, // <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+ 2702877010U, // <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5>
+ 2289222708U, // <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+ 2779178473U, // <u,2,0,6>: Cost 3 vsldoi12 RHS, <2,0,6,1>
+ 2726249024U, // <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+ 1215848555U, // <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS
+ 2690933539U, // <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2>
+ 2628683124U, // <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1>
+ 1189463656U, // <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+ 1213866086U, // <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+ 2628685110U, // <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS
+ 2263205736U, // <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6>
+ 1189463994U, // <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+ 2263205866U, // <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+ 1213866091U, // <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS
+ 1556938854U, // <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+ 2697569869U, // <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2>
+ 336380006U, // <u,2,2,2>: Cost 1 vspltisw2 LHS
+ 1678599794U, // <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+ 1556942134U, // <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+ 2295138061U, // <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+ 2702878650U, // <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7>
+ 2300229831U, // <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7>
+ 336380006U, // <u,2,2,u>: Cost 1 vspltisw2 LHS
+ 475243165U, // <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+ 1548985140U, // <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 1209239144U, // <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+ 135495782U, // <u,2,3,3>: Cost 1 vmrglw LHS, LHS
+ 475245878U, // <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+ 1596764164U, // <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+ 1596764666U, // <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+ 1596765178U, // <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 135495787U, // <u,2,3,u>: Cost 1 vmrglw LHS, LHS
+ 2708851630U, // <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2>
+ 2217362979U, // <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+ 2289624680U, // <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2>
+ 1215881318U, // <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+ 2726767824U, // <u,2,4,4>: Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4>
+ 1629138230U, // <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+ 2779178801U, // <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5>
+ 2726251976U, // <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+ 1215881323U, // <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS
+ 2628714598U, // <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS
+ 2628715896U, // <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5>
+ 1192445544U, // <u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+ 1213898854U, // <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+ 2628717878U, // <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS
+ 2726768644U, // <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5>
+ 1192445882U, // <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+ 2266187754U, // <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+ 1213898859U, // <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS
+ 2634694758U, // <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS
+ 2721460657U, // <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2>
+ 2296940136U, // <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2>
+ 1678600122U, // <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+ 2634698038U, // <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS
+ 3370682125U, // <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5>
+ 1157056442U, // <u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+ 2725442455U, // <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2>
+ 1678600167U, // <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+ 1653027897U, // <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2>
+ 2309554924U, // <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+ 1235813992U, // <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+ 162070630U, // <u,2,7,3>: Cost 1 vmrglw RHS, LHS
+ 2634706230U, // <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS
+ 2309555252U, // <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+ 2309555901U, // <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+ 2309555416U, // <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+ 162070635U, // <u,2,7,u>: Cost 1 vmrglw RHS, LHS
+ 475284130U, // <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+ 1549026100U, // <u,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 336380006U, // <u,2,u,2>: Cost 1 vspltisw2 LHS
+ 135536742U, // <u,2,u,3>: Cost 1 vmrglw LHS, LHS
+ 475286838U, // <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+ 1629141146U, // <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+ 1194108858U, // <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+ 1596806138U, // <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 135536747U, // <u,2,u,u>: Cost 1 vmrglw LHS, LHS
+ 1611890688U, // <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+ 538149020U, // <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+ 2685632685U, // <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+ 2685632764U, // <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+ 1611891026U, // <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+ 2733408722U, // <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+ 2658612153U, // <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0>
+ 2289592250U, // <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7>
+ 538149533U, // <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+ 1189464214U, // <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+ 1611891508U, // <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+ 1611891606U, // <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+ 1189464476U, // <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+ 1189464578U, // <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+ 2690278511U, // <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+ 2690278607U, // <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+ 2287609786U, // <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7>
+ 1611892092U, // <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+ 2685634042U, // <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0>
+ 2685634079U, // <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+ 1611892328U, // <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+ 1611892390U, // <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+ 2685634371U, // <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5>
+ 2685634453U, // <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6>
+ 1611892666U, // <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+ 2300225466U, // <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7>
+ 1611892795U, // <u,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,0,1>
+ 1209238422U, // <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+ 2282980247U, // <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1>
+ 1561004120U, // <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3>
+ 403488870U, // <u,3,3,3>: Cost 1 vspltisw3 LHS
+ 1209238426U, // <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+ 2282980899U, // <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+ 2282985598U, // <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+ 1209239482U, // <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+ 403488870U, // <u,3,3,u>: Cost 1 vspltisw3 LHS
+ 1555038310U, // <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS
+ 1555039616U, // <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4>
+ 2628781672U, // <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2>
+ 2289624690U, // <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3>
+ 1555041590U, // <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS
+ 538152246U, // <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+ 2658644925U, // <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4>
+ 2289625018U, // <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7>
+ 538152489U, // <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+ 1192446102U, // <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+ 2733411983U, // <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+ 2634762330U, // <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5>
+ 1192446364U, // <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+ 1192446466U, // <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+ 1659670532U, // <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+ 1659670626U, // <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+ 2287642554U, // <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7>
+ 1659670788U, // <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+ 2634768486U, // <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS
+ 2733412775U, // <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+ 1648390659U, // <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3>
+ 2634770973U, // <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6>
+ 2634771766U, // <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS
+ 2733413099U, // <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+ 1659671352U, // <u,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+ 1659671374U, // <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+ 1652372457U, // <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3>
+ 1561034854U, // <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+ 2634777396U, // <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1>
+ 1561036892U, // <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7>
+ 1235814002U, // <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+ 1561038134U, // <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS
+ 2309555747U, // <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+ 2309556072U, // <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6>
+ 1235814330U, // <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+ 1561040686U, // <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+ 1611896531U, // <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2>
+ 538154798U, // <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+ 1611896712U, // <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3>
+ 403488870U, // <u,3,u,3>: Cost 1 vspltisw3 LHS
+ 1611896895U, // <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6>
+ 538155162U, // <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+ 1611897040U, // <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+ 1209280442U, // <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+ 538155365U, // <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+ 1165118354U, // <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+ 1618534502U, // <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+ 2634795102U, // <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0>
+ 2686451968U, // <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+ 2692276562U, // <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5>
+ 1705438098U, // <u,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+ 2658685890U, // <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0>
+ 2256489928U, // <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+ 1618535069U, // <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+ 1189464978U, // <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+ 2692277044U, // <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1>
+ 1618535367U, // <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4>
+ 2640775992U, // <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, <3,u,4,1>
+ 1189465296U, // <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+ 115723574U, // <u,4,1,5>: Cost 1 vmrghw LHS, RHS
+ 2263207289U, // <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+ 2664666780U, // <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1>
+ 115723817U, // <u,4,1,u>: Cost 1 vmrghw LHS, RHS
+ 2263919506U, // <u,4,2,0>: Cost 3 vmrghw <u,2,3,0>, <4,0,5,1>
+ 2222115812U, // <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+ 2692277864U, // <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2>
+ 2692277926U, // <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1>
+ 2324114640U, // <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4>
+ 1190178102U, // <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+ 2692278202U, // <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7>
+ 2701568053U, // <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4>
+ 1190178345U, // <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS
+ 2692278422U, // <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2>
+ 2282981552U, // <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+ 2704222585U, // <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4>
+ 2692278684U, // <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3>
+ 1257016528U, // <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+ 1209239246U, // <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+ 2691910300U, // <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+ 2664683166U, // <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3>
+ 1209239249U, // <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+ 1573027942U, // <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS
+ 2634826695U, // <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4>
+ 2634827874U, // <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4>
+ 2289629073U, // <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3>
+ 229035318U, // <u,4,4,4>: Cost 1 vspltisw0 RHS
+ 1618537782U, // <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS
+ 2658718662U, // <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4>
+ 2289629401U, // <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7>
+ 229035318U, // <u,4,4,u>: Cost 1 vspltisw0 RHS
+ 1561092198U, // <u,4,5,0>: Cost 2 vsldoi4 <2,u,4,5>, LHS
+ 2628863370U, // <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5>
+ 1561094243U, // <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5>
+ 2634836118U, // <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2>
+ 1561095478U, // <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS
+ 118705462U, // <u,4,5,5>: Cost 1 vmrghw RHS, RHS
+ 604859702U, // <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+ 2658726906U, // <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2>
+ 604859720U, // <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+ 2266631058U, // <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1>
+ 2302692152U, // <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+ 2718822906U, // <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3>
+ 2722804309U, // <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4>
+ 2723467942U, // <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4>
+ 1192889654U, // <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+ 2718823224U, // <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6>
+ 2718823246U, // <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1>
+ 1192889897U, // <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS
+ 2640822374U, // <u,4,7,0>: Cost 3 vsldoi4 <3,u,4,7>, LHS
+ 2640823194U, // <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4>
+ 2728113373U, // <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4>
+ 2640825150U, // <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7>
+ 1235815632U, // <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+ 1235814094U, // <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+ 2730767905U, // <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4>
+ 2309556892U, // <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+ 1235814097U, // <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+ 1561116774U, // <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS
+ 1618540334U, // <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+ 1561118822U, // <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u>
+ 2692282300U, // <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1>
+ 229035318U, // <u,4,u,4>: Cost 1 vspltisw0 RHS
+ 120368438U, // <u,4,u,5>: Cost 1 vmrghw LHS, RHS
+ 604859945U, // <u,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+ 2309565084U, // <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+ 604859963U, // <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+ 2690293760U, // <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0>
+ 1616552038U, // <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+ 2640840434U, // <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5>
+ 2640841536U, // <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0>
+ 1613381970U, // <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+ 2316135642U, // <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5>
+ 2289592834U, // <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6>
+ 2664732324U, // <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0>
+ 1616552661U, // <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5>
+ 1573077094U, // <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+ 1237536282U, // <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+ 2690294678U, // <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0>
+ 2646821014U, // <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2>
+ 1573080602U, // <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1>
+ 1189466116U, // <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+ 1189466210U, // <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+ 2646823930U, // <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2>
+ 1573082926U, // <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+ 2640855142U, // <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS
+ 2697594448U, // <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5>
+ 2690295400U, // <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2>
+ 1625179890U, // <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5>
+ 2699585347U, // <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5>
+ 2781171471U, // <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+ 2690295738U, // <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7>
+ 3775318070U, // <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5>
+ 1628498055U, // <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5>
+ 2287627234U, // <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+ 1257016210U, // <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+ 2646836942U, // <u,5,3,2>: Cost 3 vsldoi4 <4,u,5,3>, <2,3,4,5>
+ 2287625131U, // <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+ 2287627238U, // <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+ 1257016538U, // <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+ 1209240066U, // <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+ 2287625459U, // <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+ 1209240068U, // <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+ 2640871526U, // <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS
+ 2316168082U, // <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1>
+ 2640873202U, // <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5>
+ 2640874308U, // <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4>
+ 1637788917U, // <u,5,4,4>: Cost 2 vsldoi8 <4,4,u,5>, <4,4,u,5>
+ 1616555318U, // <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+ 2287638591U, // <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6>
+ 2664765096U, // <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4>
+ 1616555561U, // <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+ 1573109862U, // <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS
+ 2646852404U, // <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1>
+ 2646853224U, // <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2>
+ 2287646618U, // <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3>
+ 1573113374U, // <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5>
+ 296144182U, // <u,5,5,5>: Cost 1 vspltisw1 RHS
+ 1192448098U, // <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+ 2287646946U, // <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7>
+ 296144182U, // <u,5,5,u>: Cost 1 vspltisw1 RHS
+ 1567146086U, // <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS
+ 2628945300U, // <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6>
+ 2634917997U, // <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6>
+ 1567148870U, // <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6>
+ 1567149366U, // <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS
+ 2781171799U, // <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+ 1228950018U, // <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 27705344U, // <u,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,6,u>: Cost 0 copy RHS
+ 2628952166U, // <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS
+ 1235815314U, // <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+ 2309556734U, // <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+ 2309555115U, // <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+ 2628955446U, // <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS
+ 1235815642U, // <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+ 1235814914U, // <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+ 2309555443U, // <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+ 1235814916U, // <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+ 1567162470U, // <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS
+ 1616557870U, // <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+ 2690299781U, // <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0>
+ 1567165256U, // <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u>
+ 1567165750U, // <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS
+ 296144182U, // <u,5,u,5>: Cost 1 vspltisw1 RHS
+ 1209281026U, // <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+ 27705344U, // <u,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,u,u>: Cost 0 copy RHS
+ 2705563648U, // <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0>
+ 1631821926U, // <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+ 2262462970U, // <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3>
+ 2646886941U, // <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6>
+ 2705563986U, // <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5>
+ 2316062652U, // <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+ 2316137272U, // <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6>
+ 1215851830U, // <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+ 1215851831U, // <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS
+ 2634948710U, // <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS
+ 2705564468U, // <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1>
+ 1189466618U, // <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+ 2263208498U, // <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+ 2693620843U, // <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6>
+ 2652868860U, // <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1>
+ 1189466936U, // <u,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+ 1213869366U, // <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+ 1213869367U, // <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS
+ 2658844774U, // <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS
+ 3771344465U, // <u,6,2,1>: Cost 4 vsldoi8 <2,1,u,6>, <2,1,u,6>
+ 1178554874U, // <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+ 2698929907U, // <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6>
+ 2699593540U, // <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6>
+ 2700257173U, // <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6>
+ 2705565626U, // <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7>
+ 1226485046U, // <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+ 1226485047U, // <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS
+ 2705565846U, // <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2>
+ 2330756585U, // <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+ 2330756829U, // <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2>
+ 2282981734U, // <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+ 1631824413U, // <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6>
+ 2652885246U, // <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3>
+ 1257018168U, // <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+ 135499062U, // <u,6,3,7>: Cost 1 vmrglw LHS, RHS
+ 135499063U, // <u,6,3,u>: Cost 1 vmrglw LHS, RHS
+ 2646917222U, // <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS
+ 2217365931U, // <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+ 2790167156U, // <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u>
+ 2646919709U, // <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6>
+ 2711538934U, // <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6>
+ 1631825206U, // <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+ 2316170040U, // <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6>
+ 1215884598U, // <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+ 1215884599U, // <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS
+ 2634981478U, // <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS
+ 2266190247U, // <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+ 1192448506U, // <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+ 2266190386U, // <u,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+ 2634984758U, // <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS
+ 2652901632U, // <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5>
+ 1192448824U, // <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+ 1213902134U, // <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+ 1213902135U, // <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS
+ 1583808614U, // <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+ 2322010445U, // <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+ 2718839290U, // <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3>
+ 2670823965U, // <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6>
+ 1583811894U, // <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+ 2724147961U, // <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6>
+ 363253046U, // <u,6,6,6>: Cost 1 vspltisw2 RHS
+ 1229172022U, // <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS
+ 363253046U, // <u,6,6,u>: Cost 1 vspltisw2 RHS
+ 499458150U, // <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+ 1573200692U, // <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+ 1573201512U, // <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1573202070U, // <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 499461673U, // <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+ 1573203972U, // <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+ 1235817272U, // <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+ 162073910U, // <u,6,7,7>: Cost 1 vmrglw RHS, RHS
+ 162073911U, // <u,6,7,u>: Cost 1 vmrglw RHS, RHS
+ 499466342U, // <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+ 1631827758U, // <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+ 1573209704U, // <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1573210262U, // <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 499469866U, // <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+ 1631828122U, // <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+ 363253046U, // <u,6,u,6>: Cost 1 vspltisw2 RHS
+ 135540022U, // <u,6,u,7>: Cost 1 vmrglw LHS, RHS
+ 135540023U, // <u,6,u,u>: Cost 1 vmrglw LHS, RHS
+ 1638465536U, // <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+ 564723814U, // <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+ 2712207533U, // <u,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+ 2712207612U, // <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+ 1638465874U, // <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+ 1579192580U, // <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0>
+ 2712207862U, // <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+ 2316137282U, // <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7>
+ 564724381U, // <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+ 1189467130U, // <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+ 1638466356U, // <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+ 1638466454U, // <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+ 2311500282U, // <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3>
+ 1189467494U, // <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+ 2712208495U, // <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+ 2694956302U, // <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7>
+ 1189467756U, // <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+ 1638466940U, // <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+ 2712208829U, // <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+ 2712208927U, // <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+ 1638467176U, // <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+ 1638467238U, // <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+ 2712209165U, // <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+ 2712209256U, // <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+ 1627187175U, // <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7>
+ 2324116290U, // <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7>
+ 1628514441U, // <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7>
+ 1638467734U, // <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+ 2712209638U, // <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+ 2700929387U, // <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u>
+ 1638467996U, // <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+ 1638468098U, // <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+ 2712210002U, // <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+ 1585189856U, // <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3>
+ 1257018178U, // <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+ 1638468382U, // <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+ 1638468498U, // <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+ 2712210378U, // <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+ 2712210485U, // <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+ 2712210564U, // <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+ 1638468816U, // <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+ 564727112U, // <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+ 2712210809U, // <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+ 2712210888U, // <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0>
+ 564727337U, // <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+ 1192449018U, // <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+ 2714201743U, // <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+ 2712211198U, // <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+ 2311533050U, // <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3>
+ 1192449382U, // <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+ 1638469636U, // <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+ 1638469730U, // <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+ 1192449644U, // <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+ 1638469892U, // <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+ 2712211745U, // <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+ 2712211879U, // <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+ 1638470138U, // <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+ 2712212018U, // <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+ 2712212109U, // <u,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+ 2712212203U, // <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+ 1638470456U, // <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+ 1638470478U, // <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+ 1638470559U, // <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1>
+ 1235816546U, // <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+ 2309558371U, // <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1>
+ 2641045434U, // <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7>
+ 1235816954U, // <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+ 1235816550U, // <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+ 2309558375U, // <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5>
+ 1585222628U, // <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7>
+ 430361910U, // <u,7,7,7>: Cost 1 vspltisw3 RHS
+ 430361910U, // <u,7,7,u>: Cost 1 vspltisw3 RHS
+ 1638471379U, // <u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+ 564729646U, // <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+ 1638471557U, // <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+ 1638471612U, // <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+ 1638471743U, // <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+ 564730010U, // <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+ 1638471888U, // <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+ 430361910U, // <u,7,u,7>: Cost 1 vspltisw3 RHS
+ 564730213U, // <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+ 202162278U, // <u,u,0,0>: Cost 1 vspltisw0 LHS
+ 538189985U, // <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+ 2685673645U, // <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+ 1215848604U, // <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+ 1611931986U, // <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+ 1579266317U, // <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0>
+ 2289592861U, // <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6>
+ 1215851848U, // <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+ 538190493U, // <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+ 1549411025U, // <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1>
+ 115726126U, // <u,u,1,1>: Cost 1 vmrghw LHS, LHS
+ 604862254U, // <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+ 1213866140U, // <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+ 1549413686U, // <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS
+ 115726490U, // <u,u,1,5>: Cost 1 vmrghw LHS, RHS
+ 1585247207U, // <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1>
+ 1213869384U, // <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+ 604862308U, // <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+ 1567334502U, // <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS
+ 1190180654U, // <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+ 336380006U, // <u,u,2,2>: Cost 1 vspltisw2 LHS
+ 835584U, // <u,u,2,3>: Cost 0 copy LHS
+ 1567337782U, // <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS
+ 1190181018U, // <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+ 1611933626U, // <u,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+ 1226485064U, // <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+ 835584U, // <u,u,2,u>: Cost 0 copy LHS
+ 475685587U, // <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+ 1209239278U, // <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1>
+ 1209239765U, // <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+ 135495836U, // <u,u,3,3>: Cost 1 vmrglw LHS, LHS
+ 475688246U, // <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+ 1209239282U, // <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5>
+ 1209240093U, // <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+ 135499080U, // <u,u,3,7>: Cost 1 vmrglw LHS, RHS
+ 135495841U, // <u,u,3,u>: Cost 1 vmrglw LHS, LHS
+ 1555406950U, // <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS
+ 1555408301U, // <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4>
+ 2289625301U, // <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2>
+ 1215881372U, // <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+ 229035318U, // <u,u,4,4>: Cost 1 vspltisw0 RHS
+ 538193206U, // <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+ 2289625629U, // <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6>
+ 1215884616U, // <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+ 538193449U, // <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+ 1549443797U, // <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5>
+ 118708014U, // <u,u,5,1>: Cost 1 vmrghw RHS, LHS
+ 1561389191U, // <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5>
+ 1213898908U, // <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+ 1549446454U, // <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS
+ 118708378U, // <u,u,5,5>: Cost 1 vmrghw RHS, RHS
+ 604862618U, // <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+ 1213902152U, // <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+ 604862636U, // <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+ 1567367270U, // <u,u,6,0>: Cost 2 vsldoi4 <3,u,u,6>, LHS
+ 1192892206U, // <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+ 1638478330U, // <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+ 1679046864U, // <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+ 1567370550U, // <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS
+ 1192892570U, // <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+ 363253046U, // <u,u,6,6>: Cost 1 vspltisw2 RHS
+ 27705344U, // <u,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,u,6,u>: Cost 0 copy RHS
+ 499605606U, // <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+ 1235812425U, // <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+ 1561405577U, // <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7>
+ 162070684U, // <u,u,7,3>: Cost 1 vmrglw RHS, LHS
+ 499609147U, // <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+ 1235812753U, // <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+ 1235814941U, // <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+ 162073928U, // <u,u,7,7>: Cost 1 vmrglw RHS, RHS
+ 162070689U, // <u,u,7,u>: Cost 1 vmrglw RHS, LHS
+ 475726552U, // <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+ 538195758U, // <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+ 604862821U, // <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+ 835584U, // <u,u,u,3>: Cost 0 copy LHS
+ 475729206U, // <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+ 538196122U, // <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+ 604862861U, // <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+ 27705344U, // <u,u,u,7>: Cost 0 copy RHS
+ 835584U, // <u,u,u,u>: Cost 0 copy LHS
+ 0
+};
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/contrib/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
new file mode 100644
index 000000000000..8a18ab9e0e9a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -0,0 +1,166 @@
+//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The QPX vector registers overlay the scalar floating-point registers, and
+// any scalar floating-point loads splat their value across all vector lanes.
+// Thus, if we have a scalar load followed by a splat, we can remove the splat
+// (i.e. replace the load with a load-and-splat pseudo instruction).
+//
+// This pass must run after anything that might do store-to-load forwarding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-qpx-load-splat"
+
+STATISTIC(NumSimplified, "Number of QPX load splats simplified");
+
+namespace llvm {
+ void initializePPCQPXLoadSplatPass(PassRegistry&); // Forward declaration; called from the PPCQPXLoadSplat constructor below.
+}
+
+namespace {
+ struct PPCQPXLoadSplat : public MachineFunctionPass { // Machine-function pass that removes QPX splats fed by a scalar load.
+ static char ID; // Pass identifier handed to the MachineFunctionPass constructor.
+ PPCQPXLoadSplat() : MachineFunctionPass(ID) {
+ initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry()); // Initialize this pass against the global PassRegistry.
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override; // Defined out of line below (the parameter is named MF there).
+
+ StringRef getPassName() const override { // Human-readable name; same text as the INITIALIZE_PASS description.
+ return "PowerPC QPX Load Splat Simplification";
+ }
+ };
+ char PPCQPXLoadSplat::ID = 0;
+}
+
+INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat", // Pass class and its argument name (same string as DEBUG_TYPE).
+ "PowerPC QPX Load Splat Simplification", // Description string.
+ false, false) // presumably the cfg-only / is-analysis flags — confirm against the INITIALIZE_PASS macro.
+
+FunctionPass *llvm::createPPCQPXLoadSplatPass() { // Factory: returns a newly allocated PPCQPXLoadSplat pass.
+ return new PPCQPXLoadSplat();
+}
+
+bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) { // Returns true iff at least one splat was erased.
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ bool MadeChange = false;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) {
+ MachineBasicBlock *MBB = &*MFI;
+ SmallVector<MachineInstr *, 4> Splats; // Candidate splats seen so far in this block; a fresh list per block.
+
+ for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) { // Walk the block bottom-up, so splats are seen before their loads.
+ MachineInstr *MI = &*MBBI;
+
+ if (MI->hasUnmodeledSideEffects() || MI->isCall()) { // Drop every pending candidate at calls and unmodeled side effects.
+ Splats.clear();
+ continue;
+ }
+
+ // We're looking for a sequence like this:
+ // %F0<def> = LFD 0, %X3<kill>, %QF0<imp-def>; mem:LD8[%a](tbaa=!2)
+ // %QF1<def> = QVESPLATI %QF0<kill>, 0, %RM<imp-use>
+
+ for (auto SI = Splats.begin(); SI != Splats.end();) { // No increment here: each path either erases (SI = erase) or does ++SI.
+ MachineInstr *SMI = *SI;
+ unsigned SplatReg = SMI->getOperand(0).getReg(); // Register defined by the splat.
+ unsigned SrcReg = SMI->getOperand(1).getReg(); // Register the splat reads.
+
+ if (MI->modifiesRegister(SrcReg, TRI)) {
+ switch (MI->getOpcode()) {
+ default: // SrcReg is written by something other than a recognized load: drop the candidate.
+ SI = Splats.erase(SI);
+ continue;
+ case PPC::LFS:
+ case PPC::LFD:
+ case PPC::LFSU:
+ case PPC::LFDU:
+ case PPC::LFSUX:
+ case PPC::LFDUX:
+ case PPC::LFSX:
+ case PPC::LFDX:
+ case PPC::LFIWAX:
+ case PPC::LFIWZX:
+ if (SplatReg != SrcReg) {
+ // We need to change the load to define the scalar subregister of
+ // the QPX splat source register.
+ unsigned SubRegIndex =
+ TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg());
+ unsigned SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
+
+ // Substitute both the explicit defined register, and also the
+ // implicit def of the containing QPX register.
+ MI->getOperand(0).setReg(SplatSubReg);
+ MI->substituteRegister(SrcReg, SplatReg, 0, *TRI);
+ }
+
+ SI = Splats.erase(SI); // The candidate is resolved; remove it from the pending list before erasing the instruction.
+
+ // If SMI is directly after MI, then MBBI's base iterator is
+ // pointing at SMI. Adjust MBBI around the call to erase SMI to
+ // avoid invalidating MBBI.
+ ++MBBI;
+ SMI->eraseFromParent();
+ --MBBI;
+
+ ++NumSimplified;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ // If this instruction defines the splat register, then we cannot move
+ // the previous definition above it. If it reads from the splat
+ // register, then it must already be alive from some previous
+ // definition, and if the splat register is different from the source
+ // register, then this definition must not be the load for which we're
+ // searching.
+ if (MI->modifiesRegister(SplatReg, TRI) ||
+ (SrcReg != SplatReg &&
+ MI->readsRegister(SplatReg, TRI))) {
+ SI = Splats.erase(SI);
+ continue;
+ }
+
+ ++SI;
+ }
+
+ if (MI->getOpcode() != PPC::QVESPLATI && // Only the three QVESPLATI opcode variants can become candidates.
+ MI->getOpcode() != PPC::QVESPLATIs &&
+ MI->getOpcode() != PPC::QVESPLATIb)
+ continue;
+ if (MI->getOperand(2).getImm() != 0) // Only immediate 0 qualifies (presumably the splatted lane index — confirm).
+ continue;
+
+ // If there are other uses of the scalar value after this, replacing
+ // those uses might be non-trivial.
+ if (!MI->getOperand(1).isKill())
+ continue;
+
+ Splats.push_back(MI); // Remember this splat; instructions above it are checked against it on later iterations.
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
new file mode 100644
index 000000000000..e49201402861
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -0,0 +1,1078 @@
+//===-- PPCRegisterInfo.cpp - PowerPC Register Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCRegisterInfo.h"
+#include "PPC.h"
+#include "PPCFrameLowering.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "reginfo"
+
+#define GET_REGINFO_TARGET_DESC
+#include "PPCGenRegisterInfo.inc"
+
+static cl::opt<bool> // Hidden command-line flag "ppc-use-base-pointer"; defaults to true.
+EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
+ cl::desc("Enable use of a base pointer for complex stack frames"));
+
+static cl::opt<bool> // Hidden command-line flag "ppc-always-use-base-pointer"; defaults to false.
+AlwaysBasePointer("ppc-always-use-base-pointer", cl::Hidden, cl::init(false),
+ cl::desc("Force the use of a base pointer in every function"));
+
+PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM) // Chooses 64- vs 32-bit base-class arguments and fills ImmToIdxMap.
+ : PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR, // Link register: LR8 on PPC64, LR otherwise.
+ TM.isPPC64() ? 0 : 1, // presumably the DWARF register-numbering flavour — confirm against PPCGenRegisterInfo.
+ TM.isPPC64() ? 0 : 1), // presumably the EH register-numbering flavour — confirm against PPCGenRegisterInfo.
+ TM(TM) {
+ ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX; // Each entry maps an opcode to its 'X'-suffixed indexed counterpart.
+ ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX;
+ ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX;
+ ImmToIdxMap[PPC::LWZ] = PPC::LWZX; ImmToIdxMap[PPC::LWA] = PPC::LWAX;
+ ImmToIdxMap[PPC::LFS] = PPC::LFSX; ImmToIdxMap[PPC::LFD] = PPC::LFDX;
+ ImmToIdxMap[PPC::STH] = PPC::STHX; ImmToIdxMap[PPC::STW] = PPC::STWX;
+ ImmToIdxMap[PPC::STFS] = PPC::STFSX; ImmToIdxMap[PPC::STFD] = PPC::STFDX;
+ ImmToIdxMap[PPC::ADDI] = PPC::ADD4; // ADDI is the one non-memory opcode in this group.
+ ImmToIdxMap[PPC::LWA_32] = PPC::LWAX_32;
+
+ // 64-bit opcodes (the '8'-suffixed forms, plus STDU).
+ ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
+ ImmToIdxMap[PPC::LHZ8] = PPC::LHZX8; ImmToIdxMap[PPC::LWZ8] = PPC::LWZX8;
+ ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8;
+ ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX;
+ ImmToIdxMap[PPC::ADDI8] = PPC::ADD8;
+
+ // VSX loads and stores; note that several keys share one indexed opcode.
+ ImmToIdxMap[PPC::DFLOADf32] = PPC::LXSSPX;
+ ImmToIdxMap[PPC::DFLOADf64] = PPC::LXSDX;
+ ImmToIdxMap[PPC::DFSTOREf32] = PPC::STXSSPX;
+ ImmToIdxMap[PPC::DFSTOREf64] = PPC::STXSDX;
+ ImmToIdxMap[PPC::LXV] = PPC::LXVX;
+ ImmToIdxMap[PPC::LXSD] = PPC::LXSDX;
+ ImmToIdxMap[PPC::LXSSP] = PPC::LXSSPX;
+ ImmToIdxMap[PPC::STXV] = PPC::STXVX;
+ ImmToIdxMap[PPC::STXSD] = PPC::STXSDX;
+ ImmToIdxMap[PPC::STXSSP] = PPC::STXSSPX;
+}
+
+/// getPointerRegClass - Return the register class to use to hold pointers (used
+/// for addressing modes). Kind == 1 selects the NOX0/NOR0 variant of the class.
+const TargetRegisterClass *
+PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
+ // Note that PPCInstrInfo::FoldImmediate also directly uses this Kind value
+ // when it checks for ZERO folding.
+ if (Kind == 1) {
+ if (TM.isPPC64())
+ return &PPC::G8RC_NOX0RegClass;
+ return &PPC::GPRC_NOR0RegClass;
+ }
+
+ if (TM.isPPC64()) // Any other Kind: the full 64-bit or 32-bit general-purpose class.
+ return &PPC::G8RCRegClass;
+ return &PPC::GPRCRegClass;
+}
+
+const MCPhysReg* // Returns the callee-saved register list chosen by calling convention, ABI, word size and vector features.
+PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
+ if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) { // AnyReg is checked first and uses the "AllRegs" lists.
+ if (Subtarget.hasVSX())
+ return CSR_64_AllRegs_VSX_SaveList;
+ if (Subtarget.hasAltivec())
+ return CSR_64_AllRegs_Altivec_SaveList;
+ return CSR_64_AllRegs_SaveList;
+ }
+
+ if (Subtarget.isDarwinABI()) // Darwin: four lists keyed by 64-/32-bit and Altivec availability.
+ return TM.isPPC64()
+ ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_SaveList
+ : CSR_Darwin64_SaveList)
+ : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_SaveList
+ : CSR_Darwin32_SaveList);
+
+ if (TM.isPPC64() && MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
+ return CSR_SRV464_TLS_PE_SaveList; // NOTE(review): spelled "SRV", unlike the "SVR" lists below; presumably matches the generated name — confirm.
+
+ // On PPC64, we might need to save r2 (but only if it is not reserved).
+ bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
+
+ return TM.isPPC64() // SaveR2 only affects the 64-bit choices.
+ ? (Subtarget.hasAltivec()
+ ? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
+ : CSR_SVR464_Altivec_SaveList)
+ : (SaveR2 ? CSR_SVR464_R2_SaveList : CSR_SVR464_SaveList))
+ : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_SaveList
+ : CSR_SVR432_SaveList);
+}
+
+const MCPhysReg * // Returns a "ViaCopy" save list, or nullptr unless every guard below is passed.
+PPCRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+ assert(MF && "Invalid MachineFunction pointer.");
+ const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
+ if (Subtarget.isDarwinABI()) // Guard 1: not the Darwin ABI.
+ return nullptr;
+ if (!TM.isPPC64()) // Guard 2: 64-bit only.
+ return nullptr;
+ if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS) // Guard 3: CXX_FAST_TLS calling convention only.
+ return nullptr;
+ if (!MF->getInfo<PPCFunctionInfo>()->isSplitCSR()) // Guard 4: the function must be marked split-CSR.
+ return nullptr;
+
+ // On PPC64, we might need to save r2 (but only if it is not reserved).
+ bool SaveR2 = !getReservedRegs(*MF).test(PPC::X2);
+ if (Subtarget.hasAltivec())
+ return SaveR2
+ ? CSR_SVR464_R2_Altivec_ViaCopy_SaveList
+ : CSR_SVR464_Altivec_ViaCopy_SaveList;
+ else
+ return SaveR2
+ ? CSR_SVR464_R2_ViaCopy_SaveList
+ : CSR_SVR464_ViaCopy_SaveList;
+}
+
+const uint32_t * // Returns the call-preserved register mask for calling convention CC on this subtarget.
+PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ if (CC == CallingConv::AnyReg) { // AnyReg is checked first and uses the "AllRegs" masks.
+ if (Subtarget.hasVSX())
+ return CSR_64_AllRegs_VSX_RegMask;
+ if (Subtarget.hasAltivec())
+ return CSR_64_AllRegs_Altivec_RegMask;
+ return CSR_64_AllRegs_RegMask;
+ }
+
+ if (Subtarget.isDarwinABI()) // Darwin: keyed by 64-/32-bit and Altivec availability.
+ return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_RegMask
+ : CSR_Darwin64_RegMask)
+ : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask
+ : CSR_Darwin32_RegMask);
+
+ return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask // Otherwise the SVR4 masks; no R2 or split-CSR variants are considered here.
+ : CSR_SVR464_RegMask)
+ : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask
+ : CSR_SVR432_RegMask);
+}
+
+const uint32_t* // Returns the CSR_NoRegs register mask.
+PPCRegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
+void PPCRegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Clears the ZERO, ZERO8 and RM bits in Mask, in place.
+ for (unsigned PseudoReg : {PPC::ZERO, PPC::ZERO8, PPC::RM})
+ Mask[PseudoReg / 32] &= ~(1u << (PseudoReg % 32)); // Mask holds 32 registers per word: word = reg / 32, bit = reg % 32.
+}
+
+BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Builds the set of reserved registers for MF; a set bit means reserved.
+ BitVector Reserved(getNumRegs());
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const PPCFrameLowering *TFI = getFrameLowering(MF);
+
+ // The ZERO register is not really a register, but the representation of r0
+ // when used in instructions that treat r0 as the constant 0.
+ Reserved.set(PPC::ZERO);
+ Reserved.set(PPC::ZERO8);
+
+ // The FP register is also not really a register, but is the representation
+ // of the frame pointer register used by ISD::FRAMEADDR.
+ Reserved.set(PPC::FP);
+ Reserved.set(PPC::FP8);
+
+ // The BP register is also not really a register, but is the representation
+ // of the base pointer register used by setjmp.
+ Reserved.set(PPC::BP);
+ Reserved.set(PPC::BP8);
+
+ // The counter registers must be reserved so that counter-based loops can
+ // be correctly formed (and the mtctr instructions are not DCE'd).
+ Reserved.set(PPC::CTR);
+ Reserved.set(PPC::CTR8);
+
+ Reserved.set(PPC::R1); // R1, LR, LR8 and RM are reserved unconditionally.
+ Reserved.set(PPC::LR);
+ Reserved.set(PPC::LR8);
+ Reserved.set(PPC::RM);
+
+ if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec()) // VRSAVE stays unreserved only for Darwin with Altivec.
+ Reserved.set(PPC::VRSAVE);
+
+ // The SVR4 ABI reserves r2 and r13
+ if (Subtarget.isSVR4ABI()) {
+ Reserved.set(PPC::R2); // System-reserved register
+ Reserved.set(PPC::R13); // Small Data Area pointer register
+ }
+
+ // On PPC64, r13 is the thread pointer. Never allocate this register.
+ if (TM.isPPC64()) {
+ Reserved.set(PPC::R13); // Already set above when the ABI is SVR4.
+
+ Reserved.set(PPC::X1);
+ Reserved.set(PPC::X13);
+
+ if (TFI->needsFP(MF))
+ Reserved.set(PPC::X31);
+
+ if (hasBasePointer(MF))
+ Reserved.set(PPC::X30);
+
+ // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
+ if (Subtarget.isSVR4ABI()) {
+ // We only reserve r2 if we need to use the TOC pointer. If we have no
+ // explicit uses of the TOC pointer (meaning we're a leaf function with
+ // no constant-pool loads, etc.) and we have no potential uses inside an
+ // inline asm block, then we can treat r2 as an ordinary callee-saved
+ // register.
+ const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ if (FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm())
+ Reserved.set(PPC::X2);
+ else
+ Reserved.reset(PPC::R2); // Undo the SVR4 reservation of R2 made above.
+ }
+ }
+
+ if (TFI->needsFP(MF))
+ Reserved.set(PPC::R31);
+
+ bool IsPositionIndependent = TM.isPositionIndependent();
+ if (hasBasePointer(MF)) {
+ if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent) // 32-bit SVR4 PIC: the base pointer is R29, since R30 is reserved below.
+ Reserved.set(PPC::R29);
+ else
+ Reserved.set(PPC::R30);
+ }
+
+ if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent) // 32-bit SVR4 PIC always reserves R30 (presumably the PIC base register — confirm).
+ Reserved.set(PPC::R30);
+
+ // Reserve Altivec registers when Altivec is unavailable.
+ if (!Subtarget.hasAltivec())
+ for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(),
+ IE = PPC::VRRCRegClass.end(); I != IE; ++I)
+ Reserved.set(*I);
+
+ return Reserved;
+}
+
+unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, // Returns a register-pressure limit for RC; 0 for classes not listed.
+ MachineFunction &MF) const {
+ const PPCFrameLowering *TFI = getFrameLowering(MF);
+ const unsigned DefaultSafety = 1; // Subtracted from every class size below.
+
+ switch (RC->getID()) {
+ default:
+ return 0;
+ case PPC::G8RC_NOX0RegClassID:
+ case PPC::GPRC_NOR0RegClassID:
+ case PPC::G8RCRegClassID:
+ case PPC::GPRCRegClassID: {
+ unsigned FP = TFI->hasFP(MF) ? 1 : 0; // One fewer register when the function has a frame pointer.
+ return 32 - FP - DefaultSafety;
+ }
+ case PPC::F8RCRegClassID:
+ case PPC::F4RCRegClassID:
+ case PPC::QFRCRegClassID:
+ case PPC::QSRCRegClassID:
+ case PPC::QBRCRegClassID:
+ case PPC::VRRCRegClassID:
+ case PPC::VFRCRegClassID:
+ case PPC::VSLRCRegClassID:
+ return 32 - DefaultSafety; // These classes are treated as 32 registers wide.
+ case PPC::VSRCRegClassID:
+ case PPC::VSFRCRegClassID:
+ case PPC::VSSRCRegClassID:
+ return 64 - DefaultSafety; // These VSX classes are treated as 64 registers wide.
+ case PPC::CRRCRegClassID:
+ return 8 - DefaultSafety; // The condition-register class is treated as 8 registers wide.
+ }
+}
+
+const TargetRegisterClass * // With VSX, widens F8RC/VRRC/F4RC to a VSX class; otherwise defers to the base implementation.
+PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const {
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ if (Subtarget.hasVSX()) {
+ // With VSX, we can inflate various sub-register classes to the full VSX
+ // register set.
+
+ if (RC == &PPC::F8RCRegClass)
+ return &PPC::VSFRCRegClass;
+ else if (RC == &PPC::VRRCRegClass)
+ return &PPC::VSRCRegClass;
+ else if (RC == &PPC::F4RCRegClass && Subtarget.hasP8Vector()) // F4RC is widened only when P8Vector is also available.
+ return &PPC::VSSRCRegClass;
+ }
+
+ return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+/// lowerDynamicAlloc - Generate the code for allocating an object in the
+/// current frame. The sequence of code will be in the general form
+///
+/// addi R0, SP, \#frameSize ; get the address of the previous frame
+/// stwux R0, SP, Rnegsize ; add and update the SP with the negated size
+/// addi Rnew, SP, \#maxCallFrameSize ; get the top of the allocation
+///
+void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
+ // Get the instruction.
+ MachineInstr &MI = *II;
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ // Get the basic block's function.
+ MachineFunction &MF = *MBB.getParent();
+ // Get the frame info.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ // Get the instruction info.
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ // Determine whether 64-bit pointers are used.
+ bool LP64 = TM.isPPC64();
+ DebugLoc dl = MI.getDebugLoc();
+
+ // Get the maximum call stack size.
+ unsigned maxCallFrameSize = MFI.getMaxCallFrameSize();
+ // Get the total frame size.
+ unsigned FrameSize = MFI.getStackSize();
+
+ // Get stack alignments.
+ const PPCFrameLowering *TFI = getFrameLowering(MF);
+ unsigned TargetAlign = TFI->getStackAlignment();
+ unsigned MaxAlign = MFI.getMaxAlignment();
+ assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
+ "Maximum call-frame size not sufficiently aligned");
+
+ // Determine the previous frame's address. If FrameSize can't be
+ // represented as 16 bits or we need special alignment, then we load the
+ // previous frame's address from 0(SP). Why not do an addis of the hi?
+ // Because R0 is our only safe tmp register and addi/addis treat R0 as zero.
+ // Constructing the constant and adding would take 3 instructions.
+ // Fortunately, a frame greater than 32K is rare.
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); // Receives the previous frame's address.
+
+ if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
+ .addReg(PPC::R31) // NOTE(review): ADDI/R31 are used even when LP64 is true and Reg is a G8RC register — confirm intended.
+ .addImm(FrameSize);
+ } else if (LP64) {
+ BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
+ .addImm(0)
+ .addReg(PPC::X1);
+ } else {
+ BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
+ .addImm(0)
+ .addReg(PPC::R1);
+ }
+
+ bool KillNegSizeReg = MI.getOperand(1).isKill();
+ unsigned NegSizeReg = MI.getOperand(1).getReg(); // Operand 1 of the pseudo: the negated allocation size.
+
+ // Grow the stack and update the stack pointer link, then determine the
+ // address of new allocated space.
+ if (LP64) {
+ if (MaxAlign > TargetAlign) { // Over-aligned: AND the negated size with ~(MaxAlign-1) before using it.
+ unsigned UnalNegSizeReg = NegSizeReg;
+ NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+
+ // Unfortunately, there is no andi, only andi., and we can't insert that
+ // here because we might clobber cr0 while it is live.
+ BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg)
+ .addImm(~(MaxAlign-1));
+
+ unsigned NegSizeReg1 = NegSizeReg; // Holds the alignment mask just loaded.
+ NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+ BuildMI(MBB, II, dl, TII.get(PPC::AND8), NegSizeReg)
+ .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+ .addReg(NegSizeReg1, RegState::Kill);
+ KillNegSizeReg = true; // NegSizeReg is now a fresh temporary, so it can always be killed at its use.
+ }
+
+ BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
+ .addReg(Reg, RegState::Kill)
+ .addReg(PPC::X1)
+ .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+ .addReg(PPC::X1)
+ .addImm(maxCallFrameSize);
+ } else {
+ if (MaxAlign > TargetAlign) { // 32-bit mirror of the over-aligned case above.
+ unsigned UnalNegSizeReg = NegSizeReg;
+ NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+
+ // Unfortunately, there is no andi, only andi., and we can't insert that
+ // here because we might clobber cr0 while it is live.
+ BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg)
+ .addImm(~(MaxAlign-1));
+
+ unsigned NegSizeReg1 = NegSizeReg;
+ NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+ BuildMI(MBB, II, dl, TII.get(PPC::AND), NegSizeReg)
+ .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+ .addReg(NegSizeReg1, RegState::Kill);
+ KillNegSizeReg = true;
+ }
+
+ BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
+ .addReg(Reg, RegState::Kill)
+ .addReg(PPC::R1)
+ .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+ .addReg(PPC::R1)
+ .addImm(maxCallFrameSize);
+ }
+
+ // Discard the DYNALLOC instruction.
+ MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerDynamicAreaOffset( // Replaces the pseudo at II with a load-immediate of the maximum call-frame size.
+ MachineBasicBlock::iterator II) const {
+ // Get the instruction.
+ MachineInstr &MI = *II;
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ // Get the basic block's function.
+ MachineFunction &MF = *MBB.getParent();
+ // Get the frame info.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ // Get the instruction info.
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+
+ unsigned maxCallFrameSize = MFI.getMaxCallFrameSize();
+ DebugLoc dl = MI.getDebugLoc();
+ BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) // NOTE(review): always PPC::LI, with no 64-bit check unlike lowerDynamicAlloc — confirm.
+ .addImm(maxCallFrameSize);
+ MBB.erase(II); // Discard the pseudo instruction.
+}
+
+/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of
+/// reserving a whole register (R0), we scrounge for one here. This generates
+/// code like this:
+///
+/// mfcr rA ; Move the conditional register into GPR rA.
+/// rlwinm rA, rA, SB, 0, 31 ; Shift the bits left so they are in CR0's slot.
+/// stw rA, FI ; Store rA to the frame.
+///
+void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const {
+ // Get the instruction.
+ MachineInstr &MI = *II; // ; SPILL_CR <SrcReg>, <offset>
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ bool LP64 = TM.isPPC64();
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); // Scratch GPR; a virtual register rather than a fixed R0.
+ unsigned SrcReg = MI.getOperand(0).getReg();
+
+ // We need to store the CR in the low 4-bits of the saved value. First, issue
+ // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg.
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
+ .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
+ // If the saved register wasn't CR0, shift the bits left so that they are in
+ // CR0's slot.
+ if (SrcReg != PPC::CR0) {
+ unsigned Reg1 = Reg;
+ Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+ // rlwinm rA, rA, ShiftBits, 0, 31.
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+ .addReg(Reg1, RegState::Kill)
+ .addImm(getEncodingValue(SrcReg) * 4) // Rotate amount: 4 bits per CR field index.
+ .addImm(0)
+ .addImm(31);
+ }
+
+ addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW)) // A word store (STW8/STW) in both modes.
+ .addReg(Reg, RegState::Kill),
+ FrameIndex);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II, // Expands RESTORE_CR: load the word, rotate it to DestReg's slot, move it into the CR.
+ unsigned FrameIndex) const {
+ // Get the instruction.
+ MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CR <offset>
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ bool LP64 = TM.isPPC64();
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); // Scratch GPR that receives the reloaded word.
+ unsigned DestReg = MI.getOperand(0).getReg();
+ assert(MI.definesRegister(DestReg) &&
+ "RESTORE_CR does not define its destination");
+
+ addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
+ Reg), FrameIndex);
+
+ // If the reloaded register isn't CR0, shift the bits right so that they are
+ // in the right CR's slot.
+ if (DestReg != PPC::CR0) {
+ unsigned Reg1 = Reg;
+ Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+ unsigned ShiftBits = getEncodingValue(DestReg)*4; // Same 4-bits-per-field amount that lowerCRSpilling rotates left by.
+ // rlwinm r11, r11, 32-ShiftBits, 0, 31.
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+ .addReg(Reg1, RegState::Kill).addImm(32-ShiftBits).addImm(0)
+ .addImm(31);
+ }
+
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), DestReg)
+ .addReg(Reg, RegState::Kill);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, // Expands SPILL_CRBIT: read the containing CR field, isolate the bit, store it.
+ unsigned FrameIndex) const {
+ // Get the instruction.
+ MachineInstr &MI = *II; // ; SPILL_CRBIT <SrcReg>, <offset>
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ bool LP64 = TM.isPPC64();
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ unsigned SrcReg = MI.getOperand(0).getReg();
+
+ BuildMI(MBB, II, dl, TII.get(TargetOpcode::KILL), // KILL pseudo defining the containing CR field from the bit, carrying over the bit's kill flag.
+ getCRFromCRBit(SrcReg))
+ .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
+ .addReg(getCRFromCRBit(SrcReg));
+
+ // If the saved register wasn't CR0LT, shift the bits left so that the bit to
+ // store is the first one. Mask all but that bit. (The rlwinm below is emitted unconditionally.)
+ unsigned Reg1 = Reg;
+ Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+ // rlwinm rA, rA, ShiftBits, 0, 0.
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+ .addReg(Reg1, RegState::Kill)
+ .addImm(getEncodingValue(SrcReg)) // Rotate amount: the CR bit's encoding value.
+ .addImm(0).addImm(0);
+
+ addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
+ .addReg(Reg, RegState::Kill),
+ FrameIndex);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, // Expands RESTORE_CRBIT: merge the reloaded bit into the current CR field and write it back.
+ unsigned FrameIndex) const {
+ // Get the instruction.
+ MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CRBIT <offset>
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ bool LP64 = TM.isPPC64();
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); // Receives the word reloaded from the spill slot.
+ unsigned DestReg = MI.getOperand(0).getReg();
+ assert(MI.definesRegister(DestReg) &&
+ "RESTORE_CRBIT does not define its destination");
+
+ addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
+ Reg), FrameIndex);
+
+ BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg); // Give DestReg a definition before its CR field is read below.
+
+ unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); // Receives the current contents of the containing CR field.
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), RegO)
+ .addReg(getCRFromCRBit(DestReg));
+
+ unsigned ShiftBits = getEncodingValue(DestReg);
+ // rlwimi r11, r10, 32-ShiftBits, ..., ... (mask begin and end are both ShiftBits, i.e. a single bit)
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO)
+ .addReg(RegO, RegState::Kill)
+ .addReg(Reg, RegState::Kill)
+ .addImm(ShiftBits ? 32 - ShiftBits : 0) // Uses 0 rather than 32 when ShiftBits is 0.
+ .addImm(ShiftBits)
+ .addImm(ShiftBits);
+
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF),
+ getCRFromCRBit(DestReg))
+ .addReg(RegO, RegState::Kill)
+ // Make sure we have a use dependency all the way through this
+ // sequence of instructions. We can't have the other bits in the CR
+ // modified in between the mfocrf and the mtocrf.
+ .addReg(getCRFromCRBit(DestReg), RegState::Implicit);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, // Expands SPILL_VRSAVE into MFVRSAVEv followed by STW to the frame slot.
+ unsigned FrameIndex) const {
+ // Get the instruction.
+ MachineInstr &MI = *II; // ; SPILL_VRSAVE <SrcReg>, <offset>
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; // Always the 32-bit GPR class here; there is no LP64 variant in this function.
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+ unsigned SrcReg = MI.getOperand(0).getReg();
+
+ BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg)
+ .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
+ addFrameReference(
+ BuildMI(MBB, II, dl, TII.get(PPC::STW)).addReg(Reg, RegState::Kill),
+ FrameIndex);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, // Expands RESTORE_VRSAVE into LWZ from the frame slot followed by MTVRSAVEv.
+ unsigned FrameIndex) const {
+ // Get the instruction.
+ MachineInstr &MI = *II; // ; <DestReg> = RESTORE_VRSAVE <offset>
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; // Always the 32-bit GPR class, matching lowerVRSAVESpilling.
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+ unsigned DestReg = MI.getOperand(0).getReg();
+ assert(MI.definesRegister(DestReg) &&
+ "RESTORE_VRSAVE does not define its destination");
+
+ addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::LWZ),
+ Reg), FrameIndex);
+
+ BuildMI(MBB, II, dl, TII.get(PPC::MTVRSAVEv), DestReg)
+ .addReg(Reg, RegState::Kill);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
+bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, // Returns true (and sets FrameIdx) only for CR2..CR4 under SVR4.
+ unsigned Reg, int &FrameIdx) const {
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
+ // ABI, return true to prevent allocating an additional frame slot.
+ // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0
+ // is arbitrary and will be subsequently ignored. For 32-bit, we have
+ // previously created the stack slot if needed, so return its FrameIdx.
+ if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) { // Range test relies on CR2, CR3, CR4 having consecutive enum values.
+ if (TM.isPPC64())
+ FrameIdx = 0;
+ else {
+ const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ FrameIdx = FI->getCRSpillFrameIndex();
+ }
+ return true;
+ }
+ return false; // FrameIdx is left untouched on this path.
+}
+
+// Tell whether MI is one of the opcodes (LWA, LWA_32, LD, STD) whose offset
+// must be a multiple of 4.
+static bool usesIXAddr(const MachineInstr &MI) {
+  const unsigned Opcode = MI.getOpcode();
+  return Opcode == PPC::LWA || Opcode == PPC::LWA_32 || Opcode == PPC::LD ||
+         Opcode == PPC::STD;
+}
+
+// Given the operand number of the frame index in MI, return the operand
+// number that holds the matching offset.
+static unsigned getOffsetONFromFION(const MachineInstr &MI,
+                                    unsigned FIOperandNum) {
+  // Inline asm: the offset sits directly before the frame index.
+  if (MI.isInlineAsm())
+    return FIOperandNum - 1;
+
+  // STACKMAP / PATCHPOINT: the offset sits directly after the frame index.
+  const unsigned Opcode = MI.getOpcode();
+  if (Opcode == TargetOpcode::STACKMAP || Opcode == TargetOpcode::PATCHPOINT)
+    return FIOperandNum + 1;
+
+  // Everything else is an add or a memory instruction: the offset is operand
+  // 1 when the frame index is operand 2, and operand 2 otherwise.
+  return FIOperandNum == 2 ? 1 : 2;
+}
+
+// Replace the frame-index operand FIOperandNum of the instruction at II with
+// a base register and a concrete offset.
+//
+// DYNAREAOFFSET, DYNALLOC (when it refers to the frame-pointer save slot) and
+// the CR / CR-bit / VRSAVE spill and restore pseudos are handed off to their
+// dedicated lowering helpers. For every other instruction the final offset is
+// written into the instruction's immediate operand when it can be encoded.
+// Otherwise the offset is built in a virtual register with LIS + ORI and the
+// operands are rewritten to register+register form, switching the opcode
+// through ImmToIdxMap when the instruction had an immediate form.
+//
+// SPAdj must be 0 (asserted). RS is not referenced in this body.
+void
+PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ // Get the instruction.
+ MachineInstr &MI = *II;
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ // Get the basic block's function.
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ // Get the instruction info.
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ // Get the frame info.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ // Operand number of the offset that belongs to this frame index.
+ unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
+
+ // Get the frame index.
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+ // Get the frame pointer save index. Users of this index are primarily
+ // DYNALLOC instructions.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ int FPSI = FI->getFramePointerSaveIndex();
+ // Get the instruction opcode.
+ unsigned OpC = MI.getOpcode();
+
+ // Dynamic-area-offset pseudos are lowered separately, whatever the index.
+ if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) {
+ lowerDynamicAreaOffset(II);
+ return;
+ }
+
+ // Special case for dynamic alloca.
+ if (FPSI && FrameIndex == FPSI &&
+ (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
+ lowerDynamicAlloc(II);
+ return;
+ }
+
+ // Special case for pseudo-ops SPILL_CR and RESTORE_CR, etc.
+ if (OpC == PPC::SPILL_CR) {
+ lowerCRSpilling(II, FrameIndex);
+ return;
+ } else if (OpC == PPC::RESTORE_CR) {
+ lowerCRRestore(II, FrameIndex);
+ return;
+ } else if (OpC == PPC::SPILL_CRBIT) {
+ lowerCRBitSpilling(II, FrameIndex);
+ return;
+ } else if (OpC == PPC::RESTORE_CRBIT) {
+ lowerCRBitRestore(II, FrameIndex);
+ return;
+ } else if (OpC == PPC::SPILL_VRSAVE) {
+ lowerVRSAVESpilling(II, FrameIndex);
+ return;
+ } else if (OpC == PPC::RESTORE_VRSAVE) {
+ lowerVRSAVERestore(II, FrameIndex);
+ return;
+ }
+
+ // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
+ // Negative frame indices use getBaseRegister(), all others getFrameRegister().
+ MI.getOperand(FIOperandNum).ChangeToRegister(
+ FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false);
+
+ // Figure out if the offset in the instruction is shifted right two bits.
+ bool isIXAddr = usesIXAddr(MI);
+
+ // If the instruction is not present in ImmToIdxMap, then it has no immediate
+ // form (and must be r+r).
+ bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
+ OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC);
+
+ // Now add the frame object offset to the offset from r1.
+ int Offset = MFI.getObjectOffset(FrameIndex);
+ Offset += MI.getOperand(OffsetOperandNo).getImm();
+
+ // If we're not using a Frame Pointer that has been set to the value of the
+ // SP before having the stack size subtracted from it, then add the stack size
+ // to Offset to get the correct offset.
+ // Naked functions have stack size 0, although getStackSize may not reflect
+ // that because we didn't call all the pieces that compute it for naked
+ // functions.
+ if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) {
+ if (!(hasBasePointer(MF) && FrameIndex < 0))
+ Offset += MFI.getStackSize();
+ }
+
+ // If we can, encode the offset directly into the instruction. If this is a
+ // normal PPC "ri" instruction, any 16-bit value can be safely encoded. If
+ // this is a PPC64 "ix" instruction, only a 16-bit value with the low two bits
+ // clear can be encoded. This is extremely uncommon, because normally you
+ // only "std" to a stack slot that is at least 4-byte aligned, but it can
+ // happen in invalid code.
+ assert(OpC != PPC::DBG_VALUE &&
+ "This should be handled in a target-independent way");
+ // STACKMAP and PATCHPOINT always take the immediate path, whatever the
+ // magnitude of Offset.
+ if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) ||
+ OpC == TargetOpcode::STACKMAP ||
+ OpC == TargetOpcode::PATCHPOINT)) {
+ MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+ return;
+ }
+
+ // The offset doesn't fit into a single register, scavenge one to build the
+ // offset in.
+
+ bool is64Bit = TM.isPPC64();
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+ const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC;
+ unsigned SRegHi = MF.getRegInfo().createVirtualRegister(RC),
+ SReg = MF.getRegInfo().createVirtualRegister(RC);
+
+ // Insert a set of rA with the full offset value before the ld, st, or add
+ // SRegHi receives Offset >> 16 via LIS; ORI then combines it with Offset
+ // into SReg.
+ BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
+ .addImm(Offset >> 16);
+ BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
+ .addReg(SRegHi, RegState::Kill)
+ .addImm(Offset);
+
+ // Convert into indexed form of the instruction:
+ //
+ // sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
+ // addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0
+ unsigned OperandBase;
+
+ // OperandBase is the first of the two operands rewritten below: operand 1
+ // for regular instructions, the offset operand for inline asm.
+ if (noImmForm)
+ OperandBase = 1;
+ else if (OpC != TargetOpcode::INLINEASM) {
+ assert(ImmToIdxMap.count(OpC) &&
+ "No indexed form of load or store available!");
+ unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
+ MI.setDesc(TII.get(NewOpcode));
+ OperandBase = 1;
+ } else {
+ OperandBase = OffsetOperandNo;
+ }
+
+ // Operand OperandBase becomes the base register chosen above and the next
+ // operand becomes the register holding the offset.
+ // NOTE(review): the trailing `true` appears to be ChangeToRegister's isKill
+ // flag -- confirm against MachineOperand.h.
+ unsigned StackReg = MI.getOperand(FIOperandNum).getReg();
+ MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
+ MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true);
+}
+
+/// Return the register used to address the frame: R31 / X31 when frame
+/// lowering reports a frame pointer for \p MF, R1 / X1 otherwise. The X
+/// registers are returned when the target machine is PPC64.
+unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const bool HasFP = getFrameLowering(MF)->hasFP(MF);
+  if (TM.isPPC64())
+    return HasFP ? PPC::X31 : PPC::X1;
+  return HasFP ? PPC::R31 : PPC::R1;
+}
+
+/// Return the base-pointer register for \p MF. Without a base pointer this is
+/// simply the frame register. Otherwise it is X30 on PPC64, R29 for 32-bit
+/// SVR4 position-independent code, and R30 in the remaining 32-bit cases.
+unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
+  if (!hasBasePointer(MF))
+    return getFrameRegister(MF);
+
+  if (TM.isPPC64())
+    return PPC::X30;
+
+  const bool IsSVR4PIC = MF.getSubtarget<PPCSubtarget>().isSVR4ABI() &&
+                         TM.isPositionIndependent();
+  return IsSVR4PIC ? PPC::R29 : PPC::R30;
+}
+
+/// Tell whether \p MF uses a base pointer. EnableBasePointer gates the
+/// feature as a whole and AlwaysBasePointer forces it on; otherwise a base
+/// pointer is used exactly when the stack has to be realigned.
+bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  if (EnableBasePointer) {
+    // When the stack must be realigned, the stack pointer can no longer serve
+    // as an offset into the caller's stack space, hence the base pointer.
+    return AlwaysBasePointer || needsStackRealignment(MF);
+  }
+  return false;
+}
+
+/// Decide whether the frame-index reference in \p MI would be better served
+/// by a base register other than FP or SP. LocalStackFrameAllocation uses the
+/// answer to pick the references it creates new base registers for.
+///
+/// This runs before register allocation, so the result is an estimate built
+/// from the size of the local frame and conservative assumptions about the
+/// rest of the stack frame: it is true when the offset is likely to be out of
+/// range of the instruction's immediate field.
+bool PPCRegisterInfo::
+needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  assert(Offset < 0 && "Local offset must be negative");
+
+  const unsigned Opcode = MI->getOpcode();
+
+  // Virtual base registers are only generated for loads and stores that have
+  // an r+i form; everything else is rejected.
+  if (ImmToIdxMap.count(Opcode) == 0)
+    return false;
+
+  // A new virtual base register just to add zero to it is pointless.
+  const bool IsAddImm = Opcode == PPC::ADDI || Opcode == PPC::ADDI8;
+  if (IsAddImm && MI->getOperand(2).getImm() == 0)
+    return false;
+
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const unsigned StackEst =
+      getFrameLowering(MF)->determineFrameLayout(MF, false, true);
+
+  // No stack frame likely means no virtual base register is needed either.
+  if (StackEst == 0)
+    return false;
+
+  // The incoming offset is relative to the SP at function entry, while the
+  // access happens relative to the SP after local allocation, so shift the
+  // offset by the estimated allocation size.
+  const int64_t SPRelativeOffset = Offset + StackEst;
+
+  // The frame pointer will point to the end of the stack, so the estimate is
+  // the distance between the object offset and the FP location.
+  return !isFrameOffsetLegal(MI, getBaseRegister(MF), SPRelativeOffset);
+}
+
+/// Emit, at the beginning of \p MBB, an ADDI / ADDI8 that defines \p BaseReg
+/// as frame index \p FrameIdx plus \p Offset. \p BaseReg is first constrained
+/// to the register class the add expects for its result operand.
+void PPCRegisterInfo::
+materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                             unsigned BaseReg, int FrameIdx,
+                             int64_t Offset) const {
+  MachineFunction &Func = *MBB->getParent();
+  const TargetInstrInfo &InstrInfo =
+      *Func.getSubtarget<PPCSubtarget>().getInstrInfo();
+  const MCInstrDesc &AddImmDesc =
+      InstrInfo.get(TM.isPPC64() ? PPC::ADDI8 : PPC::ADDI);
+
+  // Insert at the top of the block and borrow the debug location of the
+  // first instruction when there is one.
+  MachineBasicBlock::iterator InsertPt = MBB->begin();
+  DebugLoc Loc; // Defaults to "unknown"
+  if (InsertPt != MBB->end())
+    Loc = InsertPt->getDebugLoc();
+
+  Func.getRegInfo().constrainRegClass(
+      BaseReg, InstrInfo.getRegClass(AddImmDesc, 0, this, Func));
+
+  BuildMI(*MBB, InsertPt, Loc, AddImmDesc, BaseReg)
+      .addFrameIndex(FrameIdx)
+      .addImm(Offset);
+}
+
+/// Rewrite the frame-index reference in \p MI to use \p BaseReg: the frame
+/// index operand becomes \p BaseReg and \p Offset is added to the existing
+/// immediate. \p BaseReg is then constrained to the register class required
+/// by that operand.
+void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                        int64_t Offset) const {
+  // Find the frame-index operand.
+  unsigned FIOperandNum = 0;
+  while (!MI.getOperand(FIOperandNum).isFI()) {
+    ++FIOperandNum;
+    assert(FIOperandNum < MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  const unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
+  MachineOperand &FrameIdxOp = MI.getOperand(FIOperandNum);
+  MachineOperand &OffsetOp = MI.getOperand(OffsetOperandNo);
+
+  FrameIdxOp.ChangeToRegister(BaseReg, false);
+  OffsetOp.ChangeToImmediate(Offset + OffsetOp.getImm());
+
+  MachineFunction &Func = *MI.getParent()->getParent();
+  const TargetInstrInfo &InstrInfo =
+      *Func.getSubtarget<PPCSubtarget>().getInstrInfo();
+  Func.getRegInfo().constrainRegClass(
+      BaseReg, InstrInfo.getRegClass(MI.getDesc(), FIOperandNum, this, Func));
+}
+
+/// Tell whether \p Offset, added to the immediate already present in \p MI,
+/// can be encoded directly. DBG_VALUE, STACKMAP and PATCHPOINT always
+/// qualify. Other instructions need a signed 16-bit total which, for the
+/// opcodes flagged by usesIXAddr, must also be a multiple of 4. \p BaseReg
+/// does not influence the answer.
+bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                         unsigned BaseReg,
+                                         int64_t Offset) const {
+  // Find the frame-index operand.
+  unsigned FIOperandNum = 0;
+  while (!MI->getOperand(FIOperandNum).isFI()) {
+    ++FIOperandNum;
+    assert(FIOperandNum < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  const unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum);
+  const int64_t Total = Offset + MI->getOperand(OffsetOperandNo).getImm();
+
+  const unsigned Opcode = MI->getOpcode();
+  if (Opcode == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
+      Opcode == TargetOpcode::STACKMAP || Opcode == TargetOpcode::PATCHPOINT)
+    return true;
+
+  if (!isInt<16>(Total))
+    return false;
+  return !usesIXAddr(*MI) || (Total & 3) == 0;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
new file mode 100644
index 000000000000..4a96327fe552
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -0,0 +1,145 @@
+//===-- PPCRegisterInfo.h - PowerPC Register Information Impl ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H
+
+#include "PPC.h"
+#include "llvm/ADT/DenseMap.h"
+
+#define GET_REGINFO_HEADER
+#include "PPCGenRegisterInfo.inc"
+
+namespace llvm {
+
+/// Map a condition-register bit register (CRnLT, CRnGT, CRnEQ or CRnUN) to
+/// the condition register CRn it belongs to. Any other input trips the
+/// assertion; with assertions disabled, 0 is returned.
+inline static unsigned getCRFromCRBit(unsigned SrcReg) {
+  unsigned Reg = 0;
+  switch (SrcReg) {
+  case PPC::CR0LT: case PPC::CR0GT: case PPC::CR0EQ: case PPC::CR0UN:
+    Reg = PPC::CR0;
+    break;
+  case PPC::CR1LT: case PPC::CR1GT: case PPC::CR1EQ: case PPC::CR1UN:
+    Reg = PPC::CR1;
+    break;
+  case PPC::CR2LT: case PPC::CR2GT: case PPC::CR2EQ: case PPC::CR2UN:
+    Reg = PPC::CR2;
+    break;
+  case PPC::CR3LT: case PPC::CR3GT: case PPC::CR3EQ: case PPC::CR3UN:
+    Reg = PPC::CR3;
+    break;
+  case PPC::CR4LT: case PPC::CR4GT: case PPC::CR4EQ: case PPC::CR4UN:
+    Reg = PPC::CR4;
+    break;
+  case PPC::CR5LT: case PPC::CR5GT: case PPC::CR5EQ: case PPC::CR5UN:
+    Reg = PPC::CR5;
+    break;
+  case PPC::CR6LT: case PPC::CR6GT: case PPC::CR6EQ: case PPC::CR6UN:
+    Reg = PPC::CR6;
+    break;
+  case PPC::CR7LT: case PPC::CR7GT: case PPC::CR7EQ: case PPC::CR7UN:
+    Reg = PPC::CR7;
+    break;
+  default:
+    break;
+  }
+
+  assert(Reg != 0 && "Invalid CR bit register");
+  return Reg;
+}
+
+/// PowerPC implementation of the TargetRegisterInfo class, layered on the
+/// generated PPCGenRegisterInfo. Besides the usual register queries it
+/// declares the frame-index elimination entry point, the helpers that lower
+/// the spill/restore and dynamic-allocation pseudos, and the hooks for
+/// virtual base registers and the base pointer.
+class PPCRegisterInfo : public PPCGenRegisterInfo {
+ /// Opcode-to-opcode table. NOTE(review): presumably maps each opcode that
+ /// has an immediate form to its indexed (r+r) counterpart -- confirm against
+ /// the constructor in PPCRegisterInfo.cpp.
+ DenseMap<unsigned, unsigned> ImmToIdxMap;
+ /// The target machine this register info was created for.
+ const PPCTargetMachine &TM;
+
+public:
+ PPCRegisterInfo(const PPCTargetMachine &TM);
+
+ /// getPointerRegClass - Return the register class to use to hold pointers.
+ /// This is used for addressing modes.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override;
+
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const override;
+
+ /// Code Generation virtual methods...
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const override;
+ const uint32_t *getNoPreservedMask() const override;
+
+ void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ /// We require the register scavenger.
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ /// Always true for this target.
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ /// Always true for this target.
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ /// Always true for this target; see the virtual base register hooks below.
+ bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ // Lowering helpers for pseudo instructions that reference a frame index.
+ // Each takes an iterator to the pseudo; the spill/restore variants also take
+ // the frame index being accessed.
+ void lowerDynamicAlloc(MachineBasicBlock::iterator II) const;
+ void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const;
+ void lowerCRSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+ void lowerCRRestore(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+ void lowerCRBitSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+ void lowerCRBitRestore(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+ void lowerVRSAVESpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+ void lowerVRSAVERestore(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+
+ /// Reports whether Reg already has a reserved spill slot; when it does,
+ /// FrameIdx receives the slot's frame index.
+ bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const override;
+ /// Rewrites operand FIOperandNum of the instruction at II from a frame index
+ /// into a register plus offset.
+ void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ // Support for virtual base registers.
+ bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ unsigned BaseReg, int FrameIdx,
+ int64_t Offset) const override;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
+ bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ int64_t Offset) const override;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ // Base pointer (stack realignment) support.
+ unsigned getBaseRegister(const MachineFunction &MF) const;
+ bool hasBasePointer(const MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
new file mode 100644
index 000000000000..896cec7e4f6e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -0,0 +1,352 @@
+//===-- PPCRegisterInfo.td - The PowerPC Register File -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Sub-register indices of the PPC namespace.
+// NOTE(review): the template arguments are presumably <size in bits, bit
+// offset (default 0)> as declared for SubRegIndex in Target.td -- confirm.
+let Namespace = "PPC" in {
+// The four indices listed as SubRegIndices of the condition registers
+// CR0-CR7, paired there with the CRnLT, CRnGT, CRnEQ and CRnUN bits.
+def sub_lt : SubRegIndex<1>;
+def sub_gt : SubRegIndex<1, 1>;
+def sub_eq : SubRegIndex<1, 2>;
+def sub_un : SubRegIndex<1, 3>;
+// Used by GP8 to reach the GPR sub-register of a 64-bit register.
+def sub_32 : SubRegIndex<32>;
+// Used by QFPR, VR and VSRL to reach their 64-bit sub-register.
+def sub_64 : SubRegIndex<64>;
+}
+
+
+class PPCReg<string n> : Register<n> {
+ let Namespace = "PPC";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 32-bit general-purpose registers
+class GPR<bits<5> num, string n> : PPCReg<n> {
+ let HWEncoding{4-0} = num;
+}
+
+// GP8 - One of the 32 64-bit general-purpose registers
+class GP8<GPR SubReg, string n> : PPCReg<n> {
+ let HWEncoding = SubReg.HWEncoding;
+ let SubRegs = [SubReg];
+ let SubRegIndices = [sub_32];
+}
+
+// SPR - One of the 32-bit special-purpose registers
+class SPR<bits<10> num, string n> : PPCReg<n> {
+ let HWEncoding{9-0} = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : PPCReg<n> {
+ let HWEncoding{4-0} = num;
+}
+
+// QFPR - One of the 32 256-bit floating-point vector registers (used for QPX)
+class QFPR<FPR SubReg, string n> : PPCReg<n> {
+ let HWEncoding = SubReg.HWEncoding;
+ let SubRegs = [SubReg];
+ let SubRegIndices = [sub_64];
+}
+
+// VF - One of the 32 64-bit floating-point subregisters of the vector
+// registers (used by VSX).
+class VF<bits<5> num, string n> : PPCReg<n> {
+ let HWEncoding{4-0} = num;
+ let HWEncoding{5} = 1;
+}
+
+// VR - One of the 32 128-bit vector registers
+class VR<VF SubReg, string n> : PPCReg<n> {
+ let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
+ let HWEncoding{5} = 0;
+ let SubRegs = [SubReg];
+ let SubRegIndices = [sub_64];
+}
+
+// VSRL - One of the 32 128-bit VSX registers that overlap with the scalar
+// floating-point registers.
+class VSRL<FPR SubReg, string n> : PPCReg<n> {
+ let HWEncoding = SubReg.HWEncoding;
+ let SubRegs = [SubReg];
+ let SubRegIndices = [sub_64];
+}
+
+// CR - One of the 8 4-bit condition registers
+class CR<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+ let HWEncoding{2-0} = num;
+ let SubRegs = subregs;
+}
+
+// CRBIT - One of the 32 1-bit condition register fields
+class CRBIT<bits<5> num, string n> : PPCReg<n> {
+ let HWEncoding{4-0} = num;
+}
+
+// General-purpose registers
+foreach Index = 0-31 in {
+ def R#Index : GPR<Index, "r"#Index>, DwarfRegNum<[-2, Index]>;
+}
+
+// 64-bit General-purpose registers
+foreach Index = 0-31 in {
+ def X#Index : GP8<!cast<GPR>("R"#Index), "r"#Index>,
+ DwarfRegNum<[Index, -2]>;
+}
+
+// Floating-point registers
+foreach Index = 0-31 in {
+ def F#Index : FPR<Index, "f"#Index>,
+ DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
+}
+
+// 64-bit Floating-point subregisters of Altivec registers
+// Note: the register names are v0-v31 or vs32-vs63 depending on the use.
+// Custom C++ code is used to produce the correct name and encoding.
+foreach Index = 0-31 in {
+ def VF#Index : VF<Index, "v" #Index>,
+ DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
+}
+
+// QPX Floating-point registers
+foreach Index = 0-31 in {
+ def QF#Index : QFPR<!cast<FPR>("F"#Index), "q"#Index>,
+ DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
+}
+
+// Vector registers
+foreach Index = 0-31 in {
+ def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
+ DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
+}
+
+// VSX registers
+foreach Index = 0-31 in {
+ def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
+ DwarfRegAlias<!cast<FPR>("F"#Index)>;
+}
+
+// Dummy VSX registers, this defines string: "vs32"-"vs63", and is only used for
+// asm printing.
+foreach Index = 32-63 in {
+ def VSX#Index : PPCReg<"vs"#Index>;
+}
+
+// The representation of r0 when treated as the constant 0.
+def ZERO : GPR<0, "0">, DwarfRegAlias<R0>;
+def ZERO8 : GP8<ZERO, "0">, DwarfRegAlias<X0>;
+
+// Representations of the frame pointer used by ISD::FRAMEADDR.
+def FP : GPR<0 /* arbitrary */, "**FRAME POINTER**">;
+def FP8 : GP8<FP, "**FRAME POINTER**">;
+
+// Representations of the base pointer used by setjmp.
+def BP : GPR<0 /* arbitrary */, "**BASE POINTER**">;
+def BP8 : GP8<BP, "**BASE POINTER**">;
+
+// Condition register bits
+def CR0LT : CRBIT< 0, "0">;
+def CR0GT : CRBIT< 1, "1">;
+def CR0EQ : CRBIT< 2, "2">;
+def CR0UN : CRBIT< 3, "3">;
+def CR1LT : CRBIT< 4, "4">;
+def CR1GT : CRBIT< 5, "5">;
+def CR1EQ : CRBIT< 6, "6">;
+def CR1UN : CRBIT< 7, "7">;
+def CR2LT : CRBIT< 8, "8">;
+def CR2GT : CRBIT< 9, "9">;
+def CR2EQ : CRBIT<10, "10">;
+def CR2UN : CRBIT<11, "11">;
+def CR3LT : CRBIT<12, "12">;
+def CR3GT : CRBIT<13, "13">;
+def CR3EQ : CRBIT<14, "14">;
+def CR3UN : CRBIT<15, "15">;
+def CR4LT : CRBIT<16, "16">;
+def CR4GT : CRBIT<17, "17">;
+def CR4EQ : CRBIT<18, "18">;
+def CR4UN : CRBIT<19, "19">;
+def CR5LT : CRBIT<20, "20">;
+def CR5GT : CRBIT<21, "21">;
+def CR5EQ : CRBIT<22, "22">;
+def CR5UN : CRBIT<23, "23">;
+def CR6LT : CRBIT<24, "24">;
+def CR6GT : CRBIT<25, "25">;
+def CR6EQ : CRBIT<26, "26">;
+def CR6UN : CRBIT<27, "27">;
+def CR7LT : CRBIT<28, "28">;
+def CR7GT : CRBIT<29, "29">;
+def CR7EQ : CRBIT<30, "30">;
+def CR7UN : CRBIT<31, "31">;
+
+// Condition registers
+let SubRegIndices = [sub_lt, sub_gt, sub_eq, sub_un] in {
+def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68, 68]>;
+def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69, 69]>;
+def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70, 70]>;
+def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71, 71]>;
+def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72, 72]>;
+def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73, 73]>;
+def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
+def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
+}
+
+// Link register
+def LR : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
+//let Aliases = [LR] in
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>;
+
+// Count register
+def CTR : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
+
+// VRsave register
+def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
+
+// Carry bit. In the architecture this is really bit 0 of the XER register
+// (which really is SPR register 1); this is the only bit interesting to a
+// compiler.
+def CARRY: SPR<1, "ca">, DwarfRegNum<[76]>;
+
+// FP rounding mode: bits 30 and 31 of the FP status and control register
+// This is not allocated as a normal register; it appears only in
+// Uses and Defs. The ABI says it needs to be preserved by a function,
+// but this is not achieved by saving and restoring it as with
+// most registers, it has to be done in code; to make this work all the
+// return and call instructions are described as Uses of RM, so instructions
+// that do nothing but change RM will not get deleted.
+def RM: PPCReg<"**ROUNDING MODE**">;
+
+/// Register classes
+// Allocate volatiles first
+// then nonvolatiles in reverse order since stmw/lmw save from rN to r31
+def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
+ (sequence "R%u", 30, 13),
+ R31, R0, R1, FP, BP)> {
+ // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+ // put it at the end of the list.
+ let AltOrders = [(add (sub GPRC, R2), R2)];
+ let AltOrderSelect = [{
+ const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+ return S.isPPC64() && S.isSVR4ABI();
+ }];
+}
+
+def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
+ (sequence "X%u", 30, 14),
+ X31, X13, X0, X1, FP8, BP8)> {
+ // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+ // put it at the end of the list.
+ let AltOrders = [(add (sub G8RC, X2), X2)];
+ let AltOrderSelect = [{
+ const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+ return S.isPPC64() && S.isSVR4ABI();
+ }];
+}
+
+// For some instructions r0 is special (representing the value 0 instead of
+// the value in the r0 register), and we use these register subclasses to
+// prevent r0 from being allocated for use by those instructions.
+def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)> {
+ // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+ // put it at the end of the list.
+ let AltOrders = [(add (sub GPRC_NOR0, R2), R2)];
+ let AltOrderSelect = [{
+ const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+ return S.isPPC64() && S.isSVR4ABI();
+ }];
+}
+
+def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> {
+ // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+ // put it at the end of the list.
+ let AltOrders = [(add (sub G8RC_NOX0, X2), X2)];
+ let AltOrderSelect = [{
+ const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+ return S.isPPC64() && S.isSVR4ABI();
+ }];
+}
+
+// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
+// ABI the size of the Floating-point register save area is determined by the
+// allocated non-volatile register with the lowest register number, as FP
+// register N is spilled to offset 8 * (32 - N) below the back chain word of the
+// previous stack frame. By allocating non-volatiles in reverse order we make
+// sure that the Floating-point register save area is always as small as
+// possible because there aren't any unused spill slots.
+def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
+ (sequence "F%u", 31, 14))>;
+def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
+
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32,v2f64], 128,
+ (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
+ V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
+ V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
+
+// VSX register classes (the allocation order mirrors that of the corresponding
+// subregister classes).
+def VSLRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+ (add (sequence "VSL%u", 0, 13),
+ (sequence "VSL%u", 31, 14))>;
+def VSRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+ (add VSLRC, VRRC)>;
+
+// Register classes for the 64-bit "scalar" VSX subregisters.
+def VFRC : RegisterClass<"PPC", [f64], 64,
+ (add VF2, VF3, VF4, VF5, VF0, VF1, VF6, VF7,
+ VF8, VF9, VF10, VF11, VF12, VF13, VF14,
+ VF15, VF16, VF17, VF18, VF19, VF31, VF30,
+ VF29, VF28, VF27, VF26, VF25, VF24, VF23,
+ VF22, VF21, VF20)>;
+def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>;
+
+// Register class for single precision scalars in VSX registers
+def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>;
+
+// For QPX
+def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13),
+ (sequence "QF%u", 31, 14))>;
+def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>;
+def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> {
+ // These are actually stored as floating-point values where a positive
+ // number is true and anything else (including NaN) is false.
+ let Size = 256;
+}
+
+def CRBITRC : RegisterClass<"PPC", [i1], 32,
+ (add CR2LT, CR2GT, CR2EQ, CR2UN,
+ CR3LT, CR3GT, CR3EQ, CR3UN,
+ CR4LT, CR4GT, CR4EQ, CR4UN,
+ CR5LT, CR5GT, CR5EQ, CR5UN,
+ CR6LT, CR6GT, CR6EQ, CR6UN,
+ CR7LT, CR7GT, CR7EQ, CR7UN,
+ CR1LT, CR1GT, CR1EQ, CR1UN,
+ CR0LT, CR0GT, CR0EQ, CR0UN)> {
+ let Size = 32;
+}
+
+def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
+ CR7, CR2, CR3, CR4)>;
+
+def CRRC0 : RegisterClass<"PPC", [i32], 32, (add CR0)>;
+
+// The CTR registers are not allocatable because they're used by the
+// decrement-and-branch instructions, and thus need to stay live across
+// multiple basic blocks.
+def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> {
+ let isAllocatable = 0;
+}
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
+ let isAllocatable = 0;
+}
+
+// Single-register classes that hold only the VRSAVE and CARRY special-purpose
+// registers respectively.
+def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
+def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
+ // NOTE(review): per Target.td a negative CopyCost appears to mark copies
+ // within this class as extremely expensive or impossible -- confirm.
+ let CopyCost = -1;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
new file mode 100644
index 000000000000..edabe7748673
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
@@ -0,0 +1,135 @@
+//===-- PPCSchedule.td - PowerPC Scheduling Definitions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for PowerPC
+//
+def IIC_IntSimple : InstrItinClass;
+def IIC_IntGeneral : InstrItinClass;
+def IIC_IntCompare : InstrItinClass;
+def IIC_IntISEL : InstrItinClass;
+def IIC_IntDivD : InstrItinClass;
+def IIC_IntDivW : InstrItinClass;
+def IIC_IntMFFS : InstrItinClass;
+def IIC_IntMFVSCR : InstrItinClass;
+def IIC_IntMTFSB0 : InstrItinClass;
+def IIC_IntMTSRD : InstrItinClass;
+def IIC_IntMulHD : InstrItinClass;
+def IIC_IntMulHW : InstrItinClass;
+def IIC_IntMulHWU : InstrItinClass;
+def IIC_IntMulLI : InstrItinClass;
+def IIC_IntRFID : InstrItinClass;
+def IIC_IntRotateD : InstrItinClass;
+def IIC_IntRotateDI : InstrItinClass;
+def IIC_IntRotate : InstrItinClass;
+def IIC_IntShift : InstrItinClass;
+def IIC_IntTrapD : InstrItinClass;
+def IIC_IntTrapW : InstrItinClass;
+def IIC_BrB : InstrItinClass;
+def IIC_BrCR : InstrItinClass;
+def IIC_BrMCR : InstrItinClass;
+def IIC_BrMCRX : InstrItinClass;
+def IIC_LdStDCBA : InstrItinClass;
+def IIC_LdStDCBF : InstrItinClass;
+def IIC_LdStDCBI : InstrItinClass;
+def IIC_LdStLoad : InstrItinClass;
+def IIC_LdStLoadUpd : InstrItinClass;
+def IIC_LdStLoadUpdX : InstrItinClass;
+def IIC_LdStStore : InstrItinClass;
+def IIC_LdStStoreUpd : InstrItinClass;
+def IIC_LdStDSS : InstrItinClass;
+def IIC_LdStICBI : InstrItinClass;
+def IIC_LdStLD : InstrItinClass;
+def IIC_LdStLDU : InstrItinClass;
+def IIC_LdStLDUX : InstrItinClass;
+def IIC_LdStLDARX : InstrItinClass;
+def IIC_LdStLFD : InstrItinClass;
+def IIC_LdStLFDU : InstrItinClass;
+def IIC_LdStLFDUX : InstrItinClass;
+def IIC_LdStLHA : InstrItinClass;
+def IIC_LdStLHAU : InstrItinClass;
+def IIC_LdStLHAUX : InstrItinClass;
+def IIC_LdStLMW : InstrItinClass;
+def IIC_LdStLVecX : InstrItinClass;
+def IIC_LdStLWA : InstrItinClass;
+def IIC_LdStLWARX : InstrItinClass;
+def IIC_LdStSLBIA : InstrItinClass;
+def IIC_LdStSLBIE : InstrItinClass;
+def IIC_LdStSTD : InstrItinClass;
+def IIC_LdStSTDCX : InstrItinClass;
+def IIC_LdStSTDU : InstrItinClass;
+def IIC_LdStSTDUX : InstrItinClass;
+def IIC_LdStSTFD : InstrItinClass;
+def IIC_LdStSTFDU : InstrItinClass;
+def IIC_LdStSTVEBX : InstrItinClass;
+def IIC_LdStSTWCX : InstrItinClass;
+def IIC_LdStSync : InstrItinClass;
+def IIC_LdStCOPY : InstrItinClass;
+def IIC_LdStPASTE : InstrItinClass;
+def IIC_SprISYNC : InstrItinClass;
+def IIC_SprMFSR : InstrItinClass;
+def IIC_SprMTMSR : InstrItinClass;
+def IIC_SprMTSR : InstrItinClass;
+def IIC_SprTLBSYNC : InstrItinClass;
+def IIC_SprMFCR : InstrItinClass;
+def IIC_SprMFCRF : InstrItinClass;
+def IIC_SprMFMSR : InstrItinClass;
+def IIC_SprMFSPR : InstrItinClass;
+def IIC_SprMFTB : InstrItinClass;
+def IIC_SprMTSPR : InstrItinClass;
+def IIC_SprMTSRIN : InstrItinClass;
+def IIC_SprRFI : InstrItinClass;
+def IIC_SprSC : InstrItinClass;
+def IIC_FPGeneral : InstrItinClass;
+def IIC_FPAddSub : InstrItinClass;
+def IIC_FPCompare : InstrItinClass;
+def IIC_FPDivD : InstrItinClass;
+def IIC_FPDivS : InstrItinClass;
+def IIC_FPFused : InstrItinClass;
+def IIC_FPRes : InstrItinClass;
+def IIC_FPSqrtD : InstrItinClass;
+def IIC_FPSqrtS : InstrItinClass;
+def IIC_VecGeneral : InstrItinClass;
+def IIC_VecFP : InstrItinClass;
+def IIC_VecFPCompare : InstrItinClass;
+def IIC_VecComplex : InstrItinClass;
+def IIC_VecPerm : InstrItinClass;
+def IIC_VecFPRound : InstrItinClass;
+def IIC_VecVSL : InstrItinClass;
+def IIC_VecVSR : InstrItinClass;
+def IIC_SprMTMSRD : InstrItinClass;
+def IIC_SprSLIE : InstrItinClass;
+def IIC_SprSLBIE : InstrItinClass;
+def IIC_SprSLBIEG : InstrItinClass;
+def IIC_SprSLBMTE : InstrItinClass;
+def IIC_SprSLBMFEE : InstrItinClass;
+def IIC_SprSLBMFEV : InstrItinClass;
+def IIC_SprSLBIA : InstrItinClass;
+def IIC_SprSLBSYNC : InstrItinClass;
+def IIC_SprTLBIA : InstrItinClass;
+def IIC_SprTLBIEL : InstrItinClass;
+def IIC_SprTLBIE : InstrItinClass;
+def IIC_SprABORT : InstrItinClass;
+def IIC_SprMSGSYNC : InstrItinClass;
+def IIC_SprSTOP : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "PPCScheduleG3.td"
+include "PPCSchedule440.td"
+include "PPCScheduleG4.td"
+include "PPCScheduleG4Plus.td"
+include "PPCScheduleG5.td"
+include "PPCScheduleP7.td"
+include "PPCScheduleP8.td"
+include "PPCScheduleP9.td"
+include "PPCScheduleA2.td"
+include "PPCScheduleE500mc.td"
+include "PPCScheduleE5500.td"
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td
new file mode 100644
index 000000000000..2455e5e52de5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td
@@ -0,0 +1,608 @@
+//===-- PPCSchedule440.td - PPC 440 Scheduling Definitions -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// PowerPC 440x6 Embedded Processor Core User's Manual.
+// IBM (as updated in) 2010.
+
+// The basic PPC 440 does not include a floating-point unit; the pipeline
+// timings here are constructed to match the FP2 unit shipped with the
+// PPC-440- and PPC-450-based Blue Gene (L and P) supercomputers.
+// References:
+// S. Chatterjee, et al. Design and exploitation of a high-performance
+// SIMD floating-point unit for Blue Gene/L.
+// IBM J. Res. & Dev. 49 (2/3) March/May 2005.
+// also:
+// Carlos Sosa and Brant Knudson. IBM System Blue Gene Solution:
+// Blue Gene/P Application Development.
+// IBM (as updated in) 2009.
+
+//===----------------------------------------------------------------------===//
+// Functional units on the PowerPC 440/450 chip sets
+//
+def P440_DISS1 : FuncUnit; // Issue unit 1
+def P440_DISS2 : FuncUnit; // Issue unit 2
+def P440_LRACC : FuncUnit; // Register access and dispatch for
+ // the simple integer (J-pipe) and
+ // load/store (L-pipe) pipelines
+def P440_IRACC : FuncUnit; // Register access and dispatch for
+ // the complex integer (I-pipe) pipeline
+def P440_FRACC : FuncUnit; // Register access and dispatch for
+ // the floating-point execution (F-pipe) pipeline
+def P440_IEXE1 : FuncUnit; // Execution stage 1 for the I pipeline
+def P440_IEXE2 : FuncUnit; // Execution stage 2 for the I pipeline
+def P440_IWB : FuncUnit; // Write-back unit for the I pipeline
+def P440_JEXE1 : FuncUnit; // Execution stage 1 for the J pipeline
+def P440_JEXE2 : FuncUnit; // Execution stage 2 for the J pipeline
+def P440_JWB : FuncUnit; // Write-back unit for the J pipeline
+def P440_AGEN : FuncUnit; // Address generation for the L pipeline
+def P440_CRD : FuncUnit; // D-cache access for the L pipeline
+def P440_LWB : FuncUnit; // Write-back unit for the L pipeline
+def P440_FEXE1 : FuncUnit; // Execution stage 1 for the F pipeline
+def P440_FEXE2 : FuncUnit; // Execution stage 2 for the F pipeline
+def P440_FEXE3 : FuncUnit; // Execution stage 3 for the F pipeline
+def P440_FEXE4 : FuncUnit; // Execution stage 4 for the F pipeline
+def P440_FEXE5 : FuncUnit; // Execution stage 5 for the F pipeline
+def P440_FEXE6 : FuncUnit; // Execution stage 6 for the F pipeline
+def P440_FWB : FuncUnit; // Write-back unit for the F pipeline
+
+def P440_LWARX_Hold : FuncUnit; // This is a pseudo-unit which is used
+ // to make sure that no lwarx/stwcx.
+ // instructions are issued while another
+ // lwarx/stwcx. is in the L pipe.
+
+def P440_GPR_Bypass : Bypass; // The bypass for general-purpose regs.
+def P440_FPR_Bypass : Bypass; // The bypass for floating-point regs.
+
+// Notes:
+// Instructions are held in the FRACC, LRACC and IRACC pipeline
+// stages until their source operands become ready. Exceptions:
+// - Store instructions will hold in the AGEN stage
+// - The integer multiply-accumulate instruction will hold in
+// the IEXE1 stage
+//
+// For most I-pipe operations, the result is available at the end of
+// the IEXE1 stage. Operations such as multiply and divide must
+// continue to execute in IEXE2 and IWB. Divide resides in IWB for
+// 33 cycles (multiply also calculates its result in IWB). For all
+// J-pipe instructions, the result is available
+// at the end of the JEXE1 stage. Loads have a 3-cycle latency
+// (data is not available until after the LWB stage).
+//
+// The L1 cache hit latency is four cycles for floating point loads
+// and three cycles for integer loads.
+//
+// The stwcx. instruction requires both the LRACC and the IRACC
+// dispatch stages. It must be issued from DISS0 (P440_DISS1 below).
+//
+// All lwarx/stwcx. instructions hold in LRACC if another
+// uncommitted lwarx/stwcx. is in AGEN, CRD, or LWB.
+//
+// msync (a.k.a. sync) and mbar will hold in LWB until all load/store
+// resources are empty. AGEN and CRD are held empty until the msync/mbar
+// commits.
+//
+// Most floating-point instructions, computational and move,
+// have a 5-cycle latency. Divide takes longer (30 cycles). Instructions that
+// update the CR take 2 cycles. Stores take 3 cycles and, as mentioned above,
+// loads take 4 cycles (for L1 hit).
+
+//
+// This file defines the itinerary class data for the PPC 440 processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def PPC440Itineraries : ProcessorItineraries<
+ [P440_DISS1, P440_DISS2, P440_FRACC, P440_IRACC, P440_IEXE1, P440_IEXE2,
+ P440_IWB, P440_LRACC, P440_JEXE1, P440_JEXE2, P440_JWB, P440_AGEN, P440_CRD,
+ P440_LWB, P440_FEXE1, P440_FEXE2, P440_FEXE3, P440_FEXE4, P440_FEXE5,
+ P440_FEXE6, P440_FWB, P440_LWARX_Hold],
+ [P440_GPR_Bypass, P440_FPR_Bypass], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass, NoBypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<33, [P440_IWB]>],
+ [36, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMFFS, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [3, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [3, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [2, 0],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [5, 1, 1],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [5, 2, 1, 1],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [5, 2, 1, 1],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [1, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [2, 1, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [1, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [2, 1, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [5, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [5, 2, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [5, 2, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P440_DISS1]>,
+ InstrStage<1, [P440_IRACC], 0>,
+ InstrStage<4, [P440_LWARX_Hold], 0>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTD, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [2, 1, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [2, 1, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [P440_DISS1]>,
+ InstrStage<1, [P440_IRACC], 0>,
+ InstrStage<4, [P440_LWARX_Hold], 0>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [P440_DISS1]>,
+ InstrStage<1, [P440_IRACC], 0>,
+ InstrStage<4, [P440_LWARX_Hold], 0>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<3, [P440_AGEN], 1>,
+ InstrStage<2, [P440_CRD], 1>,
+ InstrStage<1, [P440_LWB]>]>,
+ InstrItinData<IIC_SprISYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC], 0>,
+ InstrStage<1, [P440_LRACC], 0>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_FEXE1], 0>,
+ InstrStage<1, [P440_AGEN], 0>,
+ InstrStage<1, [P440_JEXE1], 0>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_FEXE2], 0>,
+ InstrStage<1, [P440_CRD], 0>,
+ InstrStage<1, [P440_JEXE2], 0>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<6, [P440_FEXE3], 0>,
+ InstrStage<6, [P440_LWB], 0>,
+ InstrStage<6, [P440_JWB], 0>,
+ InstrStage<6, [P440_IWB]>]>,
+ InstrItinData<IIC_SprMFSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [2, 0],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [2, 0],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [5, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [3, 0],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [6, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [6, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [6, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [6, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprRFI, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprSC, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0, 0],
+ [P440_FPR_Bypass,
+ P440_FPR_Bypass, P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0, 0],
+ [P440_FPR_Bypass,
+ P440_FPR_Bypass, P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0, 0],
+ [P440_FPR_Bypass, P440_FPR_Bypass,
+ P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<25, [P440_FWB]>],
+ [31, 0, 0],
+ [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<13, [P440_FWB]>],
+ [19, 0, 0],
+ [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0, 0, 0],
+ [P440_FPR_Bypass,
+ P440_FPR_Bypass, P440_FPR_Bypass,
+ P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0],
+ [P440_FPR_Bypass, P440_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// PPC440 machine model for scheduling and other instruction cost heuristics.
+
+def PPC440Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 instructions are dispatched per cycle.
+ let LoadLatency = 5; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let CompleteModel = 0;
+
+ let Itineraries = PPC440Itineraries; // Stage/latency table defined above.
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
new file mode 100644
index 000000000000..54cfae5d74b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
@@ -0,0 +1,172 @@
+//===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// A2 Processor User's Manual.
+// IBM (as updated in) 2010.
+
+//===----------------------------------------------------------------------===//
+// Functional units on the PowerPC A2 chip sets
+//
+def A2_XU : FuncUnit; // A2_XU pipeline (all non-FP itinerary classes below)
+def A2_FU : FuncUnit; // A2_FU pipeline (the IIC_FP* itinerary classes below)
+
+//
+// This file defines the itinerary class data for the PPC A2 processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def PPCA2Itineraries : ProcessorItineraries<
+ [A2_XU, A2_FU], [], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0, 0]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [A2_XU]>],
+ [39, 0, 0]>,
+ InstrItinData<IIC_IntDivD, [InstrStage<1, [A2_XU]>],
+ [71, 0, 0]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [A2_XU]>],
+ [5, 0, 0]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [A2_XU]>],
+ [5, 0, 0]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntRotateD, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntRotateDI, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [A2_XU]>],
+ [2, 0]>,
+ InstrItinData<IIC_IntTrapD, [InstrStage<1, [A2_XU]>],
+ [2, 0]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [A2_XU]>],
+ [5, 0, 0]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [A2_XU]>],
+ [6, 8, 0, 0]>,
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [A2_XU]>],
+ [6, 8, 0, 0]>,
+ InstrItinData<IIC_LdStLDU, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_LdStLDUX, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [A2_XU]>],
+ [0, 0, 0]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [A2_XU]>],
+ [2, 0, 0, 0]>,
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [A2_XU]>],
+ [16, 0, 0]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [A2_XU]>],
+ [0, 0, 0]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0, 0]>,
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [A2_XU]>],
+ [7, 0, 0]>,
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [A2_XU]>],
+ [7, 9, 0, 0]>,
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [A2_XU]>],
+ [7, 9, 0, 0]>,
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [A2_XU]>],
+ [6, 8, 0, 0]>,
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [A2_XU]>],
+ [6, 8, 0, 0]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [A2_XU]>],
+ [82, 0, 0]>, // L2 latency
+ InstrItinData<IIC_LdStSTD, [InstrStage<1, [A2_XU]>],
+ [0, 0, 0]>,
+ InstrItinData<IIC_LdStSTDU, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0, 0]>,
+ InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0, 0]>,
+ InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [A2_XU]>],
+ [82, 0, 0]>, // L2 latency
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [A2_XU]>],
+ [82, 0, 0]>, // L2 latency
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [A2_XU]>],
+ [6]>,
+ InstrItinData<IIC_SprISYNC, [InstrStage<1, [A2_XU]>],
+ [16]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [A2_XU]>],
+ [16, 0]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [A2_XU]>],
+ [6, 0]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [A2_XU]>],
+ [1, 0]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [A2_XU]>],
+ [4, 0]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [A2_XU]>],
+ [6, 0]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [A2_XU]>],
+ [4, 0]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [A2_XU]>],
+ [6, 0]>,
+ InstrItinData<IIC_SprRFI, [InstrStage<1, [A2_XU]>],
+ [16]>,
+ InstrItinData<IIC_SprSC, [InstrStage<1, [A2_XU]>],
+ [16]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [A2_FU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [A2_FU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [A2_FU]>],
+ [5, 0, 0]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [A2_FU]>],
+ [72, 0, 0]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [A2_FU]>],
+ [59, 0, 0]>,
+ InstrItinData<IIC_FPSqrtD, [InstrStage<1, [A2_FU]>],
+ [69, 0, 0]>,
+ InstrItinData<IIC_FPSqrtS, [InstrStage<1, [A2_FU]>],
+ [65, 0, 0]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [A2_FU]>],
+ [6, 0, 0, 0]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [A2_FU]>],
+ [6, 0]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// A2 machine model for scheduling and other instruction cost heuristics.
+
+def PPCA2Model : SchedMachineModel {
+ let IssueWidth = 1; // 1 instruction is dispatched per cycle.
+ let LoadLatency = 6; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 13;
+
+ let CompleteModel = 0;
+
+ let Itineraries = PPCA2Itineraries; // Stage/latency table defined above.
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
new file mode 100644
index 000000000000..f687d326b52d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -0,0 +1,321 @@
+//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500mc 32-bit
+// Power processor.
+//
+// All information is derived from the "e500mc Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e500mc core:
+//
+// * Decode & Dispatch
+// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def E500_DIS0 : FuncUnit; // Dispatch stage - 1st instruction of the cycle
+def E500_DIS1 : FuncUnit; // Dispatch stage - 2nd instruction of the cycle
+
+// * Execute
+// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+// Some instructions can only execute in SFX0 but not SFX1.
+// The CFX has a bypass path, allowing non-divide instructions to execute
+// while a divide instruction is being executed.
+def E500_SFX0 : FuncUnit; // Simple unit 0
+def E500_SFX1 : FuncUnit; // Simple unit 1
+def E500_BU : FuncUnit; // Branch unit
+def E500_CFX_DivBypass
+ : FuncUnit; // CFX divide bypass path
+def E500_CFX_0 : FuncUnit; // CFX pipeline
+def E500_LSU_0 : FuncUnit; // LSU pipeline
+def E500_FPU_0 : FuncUnit; // FPU pipeline
+
+def E500_GPR_Bypass : Bypass; // Bypass attached to GPR operands in the itineraries
+def E500_FPR_Bypass : Bypass; // Bypass attached to FPR operands in the itineraries
+def E500_CR_Bypass : Bypass; // Bypass attached to CR operands in the itineraries
+
+def PPCE500mcItineraries : ProcessorItineraries< // e500mc per-class stages, operand cycles and bypasses
+ [E500_DIS0, E500_DIS1, E500_SFX0, E500_SFX1, E500_BU, E500_CFX_DivBypass,
+ E500_CFX_0, E500_LSU_0, E500_FPU_0], // Functional units
+ [E500_CR_Bypass, E500_GPR_Bypass, E500_FPR_Bypass], [ // Bypasses, then itinerary data
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass,
+ E500_CR_Bypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [5, 1, 1], // Latency = 1 or 2
+ [E500_CR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_CFX_0], 0>,
+ InstrStage<14, [E500_CFX_DivBypass]>],
+ [17, 1, 1], // Latency = 4..35, Repeat rate = 4..35
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMFFS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<8, [E500_FPU_0]>],
+ [11], // Latency = 8
+ [E500_FPR_Bypass]>,
+ InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<8, [E500_FPU_0]>],
+ [11, 1, 1], // Latency = 8
+ [NoBypass, NoBypass, NoBypass]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_SFX0]>],
+ [5, 1], // Latency = 2, Repeat rate = 2
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1], // Latency = 1
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1, 1], // Latency = 1
+ [E500_CR_Bypass,
+ E500_CR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1], // Latency = 1
+ [E500_CR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>, // NOTE(review): 3 operand cycles but only 2 bypasses listed - confirm
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3, Repeat rate = 1
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1, 1], // Latency = 3
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1, 1], // Latency = 3
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [E500_FPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [E500_FPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [E500_FPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>, // NOTE(review): other update-form entries here are marked 2 micro-ops - confirm
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>, // NOTE(review): other update-form entries here are marked 2 micro-ops - confirm
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1], // Latency = r+3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<3, [E500_LSU_0]>],
+ [6, 1, 1], // Latency = 3, Repeat rate = 3
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>]>,
+ InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SFX0]>],
+ [7, 1], // Same stage/cycle data as IIC_SprMFMSR in this table
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_SFX0, E500_SFX1]>],
+ [5, 1], // Latency = 2, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0]>],
+ [5, 1],
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0], 0>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<5, [E500_SFX0]>],
+ [8, 1],
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<5, [E500_SFX0]>],
+ [8, 1],
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0]>],
+ [4, 1],
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_FPU_0]>],
+ [13, 1, 1], // Latency = 10, Repeat rate = 4
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [E500_CR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<68, [E500_FPU_0]>],
+ [71, 1, 1], // Latency = 68, Repeat rate = 68
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<38, [E500_FPU_0]>],
+ [41, 1, 1], // Latency = 38, Repeat rate = 38
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_FPU_0]>],
+ [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass,
+ E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<38, [E500_FPU_0]>],
+ [41, 1], // Latency = 38, Repeat rate = 38
+ [E500_FPR_Bypass, E500_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500mc machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500mcModel : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let LoadLatency = 5; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let CompleteModel = 0; // Not marked complete: some instructions may lack itinerary data.
+
+ let Itineraries = PPCE500mcItineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
new file mode 100644
index 000000000000..5db886cf8f94
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -0,0 +1,381 @@
+//===-- PPCScheduleE5500.td - e5500 Scheduling Defs --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e5500 64-bit
+// Power processor.
+//
+// All information is derived from the "e5500 Core Reference Manual",
+// Freescale Document Number e5500RM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e5500 core
+// (These are the same as for the e500mc)
+//
+// * Decode & Dispatch
+// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def E5500_DIS0 : FuncUnit; // Dispatch stage - 1st instruction of the cycle
+def E5500_DIS1 : FuncUnit; // Dispatch stage - 2nd instruction of the cycle
+
+// * Execute
+// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+// The CFX has a bypass path, allowing non-divide instructions to execute
+// while a divide instruction is being executed.
+def E5500_SFX0 : FuncUnit; // Simple unit 0
+def E5500_SFX1 : FuncUnit; // Simple unit 1
+def E5500_BU : FuncUnit; // Branch unit
+def E5500_CFX_DivBypass
+ : FuncUnit; // CFX divide bypass path
+def E5500_CFX_0 : FuncUnit; // CFX pipeline stage 0
+
+def E5500_CFX_1 : FuncUnit; // CFX pipeline stage 1
+
+def E5500_LSU_0 : FuncUnit; // LSU pipeline
+def E5500_FPU_0 : FuncUnit; // FPU pipeline
+
+def E5500_GPR_Bypass : Bypass; // Bypass attached to GPR operands in the itineraries
+def E5500_FPR_Bypass : Bypass; // Bypass attached to FPR operands in the itineraries
+def E5500_CR_Bypass : Bypass; // Bypass attached to CR operands in the itineraries
+
+def PPCE5500Itineraries : ProcessorItineraries< // e5500 per-class stages, operand cycles and bypasses
+ [E5500_DIS0, E5500_DIS1, E5500_SFX0, E5500_SFX1, E5500_BU,
+ E5500_CFX_DivBypass, E5500_CFX_0, E5500_CFX_1,
+ E5500_LSU_0, E5500_FPU_0], // Functional units
+ [E5500_CR_Bypass, E5500_GPR_Bypass, E5500_FPR_Bypass], [ // Bypasses, then itinerary data
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5, 2, 2, 2], // Latency = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass,
+ E5500_CR_Bypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [6, 2, 2], // Latency = 1 or 2
+ [E5500_CR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<26, [E5500_CFX_DivBypass]>],
+ [30, 2, 2], // Latency = 4..26, Repeat rate = 4..26
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<16, [E5500_CFX_DivBypass]>],
+ [20, 2, 2], // Latency = 4..16, Repeat rate = 4..16
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMFFS, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11], // Latency = 7, Repeat rate = 1
+ [E5500_FPR_Bypass]>,
+ InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<7, [E5500_FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 7
+ [NoBypass, NoBypass, NoBypass]>,
+ InstrItinData<IIC_IntMulHD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<2, [E5500_CFX_1]>],
+ [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<1, [E5500_CFX_1]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<1, [E5500_CFX_1]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<2, [E5500_CFX_1]>],
+ [8, 2, 2], // Latency = 4 or 5, Repeat = 2
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotateD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_SFX0, E5500_SFX1]>],
+ [6, 2, 2], // Latency = 2, Repeat rate = 2
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotateDI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5, 2, 2], // Latency = 1, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_SFX0, E5500_SFX1]>],
+ [6, 2, 2], // Latency = 2, Repeat rate = 2
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_SFX0]>],
+ [6, 2], // Latency = 2, Repeat rate = 2
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_BU]>],
+ [5, 2], // Latency = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_BU]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_CR_Bypass,
+ E5500_CR_Bypass, E5500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_BU]>],
+ [5, 2], // Latency = 1
+ [E5500_CR_Bypass, E5500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_CR_Bypass, E5500_GPR_Bypass]>, // NOTE(review): 3 operand cycles but only 2 bypasses listed - confirm
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLDARX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<3, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 3
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops; NOTE(review): no SFX stage and other non-update loads here omit this count - confirm
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<4, [E5500_LSU_0]>],
+ [8, 2], // Latency = r+3, Repeat rate = r+3
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<3, [E5500_LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 3
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_CFX_0]>],
+ [6, 2], // Latency = 2, Repeat rate = 4
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0], 0>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<5, [E5500_CFX_0]>],
+ [9, 2], // Latency = 5, Repeat rate = 5
+ [E5500_GPR_Bypass, E5500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<5, [E5500_CFX_0]>],
+ [9, 2], // Latency = 5, Repeat rate = 5
+ [E5500_GPR_Bypass, E5500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<4, [E5500_SFX0]>],
+ [8, 2], // Latency = 4, Repeat rate = 4
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0]>],
+ [5], // Latency = 1, Repeat rate = 1
+ [E5500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<4, [E5500_CFX_0]>],
+ [8, 2], // Latency = 4, Repeat rate = 4
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5], // Latency = 1, Repeat rate = 1
+ [E5500_GPR_Bypass]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [E5500_CR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<31, [E5500_FPU_0]>],
+ [39, 2, 2], // Latency = 35, Repeat rate = 31
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<16, [E5500_FPU_0]>],
+ [24, 2, 2], // Latency = 20, Repeat rate = 16
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass,
+ E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_FPU_0]>],
+ [12, 2], // Latency = 8, Repeat rate = 2
+ [E5500_FPR_Bypass, E5500_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE5500Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let LoadLatency = 6; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let CompleteModel = 0; // Not marked complete: some instructions may lack itinerary data.
+
+ let Itineraries = PPCE5500Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
new file mode 100644
index 000000000000..21efd8f8f6c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
@@ -0,0 +1,80 @@
+//===-- PPCScheduleG3.td - PPC G3 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G3 (750) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G3_BPU : FuncUnit; // Branch unit
+def G3_SLU : FuncUnit; // Store/load unit
+def G3_SRU : FuncUnit; // Special register unit
+def G3_IU1 : FuncUnit; // Integer unit 1 (simple) - NOTE(review): the G3 itineraries route mul/div to IU1 only; confirm label
+def G3_IU2 : FuncUnit; // Integer unit 2 (complex) - NOTE(review): only ever listed alongside IU1 in the G3 itineraries; confirm label
+def G3_FPU1 : FuncUnit; // Floating point unit 1
+
+def G3Itineraries : ProcessorItineraries< // G3 table: one stage per class, no operand cycles
+ [G3_IU1, G3_IU2, G3_FPU1, G3_BPU, G3_SRU, G3_SLU], [], [ // Functional units; empty bypass list; itinerary data
+ InstrItinData<IIC_IntSimple , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntGeneral , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntDivW , [InstrStage<19, [G3_IU1]>]>,
+ InstrItinData<IIC_IntMFFS , [InstrStage<1, [G3_FPU1]>]>,
+ InstrItinData<IIC_IntMTFSB0 , [InstrStage<3, [G3_FPU1]>]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<5, [G3_IU1]>]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<6, [G3_IU1]>]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<3, [G3_IU1]>]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntShift , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<2, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [G3_BPU]>]>,
+ InstrItinData<IIC_BrCR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_LdStDCBA , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStDCBI , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStICBI , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<34, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<8, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStSync , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_SprISYNC , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFSR , [InstrStage<3, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMTMSR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMTSR , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFCR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFMSR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFTB , [InstrStage<3, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprRFI , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprSC , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_FPGeneral , [InstrStage<1, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<1, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<1, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<31, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<17, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPFused , [InstrStage<2, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPRes , [InstrStage<10, [G3_FPU1]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td
new file mode 100644
index 000000000000..340773ef7876
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td
@@ -0,0 +1,96 @@
+//===-- PPCScheduleG4.td - PPC G4 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4 (7400) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4_BPU : FuncUnit; // Branch unit
+def G4_SLU : FuncUnit; // Store/load unit
+def G4_SRU : FuncUnit; // Special register unit
+def G4_IU1 : FuncUnit; // Integer unit 1 (simple)
+def G4_IU2 : FuncUnit; // Integer unit 2 (complex)
+def G4_FPU1 : FuncUnit; // Floating point unit 1
+def G4_VPU : FuncUnit; // Vector permutation unit
+def G4_VIU1 : FuncUnit; // Vector integer unit 1 (simple)
+def G4_VIU2 : FuncUnit; // Vector integer unit 2 (complex)
+def G4_VFPU : FuncUnit; // Vector floating point unit
+
+def G4Itineraries : ProcessorItineraries< // Args: functional units, bypasses (none here), per-class itinerary data.
+ [G4_IU1, G4_IU2, G4_SLU, G4_SRU, G4_BPU, G4_FPU1,
+ G4_VIU1, G4_VIU2, G4_VPU, G4_VFPU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<1, [G4_IU1, G4_IU2]>]>, // InstrStage<cycles, [candidate units]>
+ InstrItinData<IIC_IntGeneral , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_IntDivW , [InstrStage<19, [G4_IU1]>]>,
+ InstrItinData<IIC_IntMFFS , [InstrStage<3, [G4_FPU1]>]>,
+ InstrItinData<IIC_IntMFVSCR , [InstrStage<1, [G4_VIU1]>]>,
+ InstrItinData<IIC_IntMTFSB0 , [InstrStage<3, [G4_FPU1]>]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<5, [G4_IU1]>]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<6, [G4_IU1]>]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<3, [G4_IU1]>]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_IntShift , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<2, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [G4_BPU]>]>,
+ InstrItinData<IIC_BrCR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_LdStDCBF , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStDCBI , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStDSS , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStICBI , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<34, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLVecX , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSTVEBX , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<5, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSync , [InstrStage<8, [G4_SLU]>]>,
+ InstrItinData<IIC_SprISYNC , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFSR , [InstrStage<3, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMTMSR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMTSR , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprTLBSYNC , [InstrStage<8, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFCR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFMSR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFTB , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprRFI , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprSC , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_FPGeneral , [InstrStage<1, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<1, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<1, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<31, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<17, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPFused , [InstrStage<1, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPRes , [InstrStage<10, [G4_FPU1]>]>,
+ InstrItinData<IIC_VecGeneral , [InstrStage<1, [G4_VIU1]>]>,
+ InstrItinData<IIC_VecFP , [InstrStage<4, [G4_VFPU]>]>,
+ InstrItinData<IIC_VecFPCompare, [InstrStage<1, [G4_VIU1]>]>,
+ InstrItinData<IIC_VecComplex , [InstrStage<3, [G4_VIU2]>]>,
+ InstrItinData<IIC_VecPerm , [InstrStage<1, [G4_VPU]>]>,
+ InstrItinData<IIC_VecFPRound , [InstrStage<4, [G4_VFPU]>]>,
+ InstrItinData<IIC_VecVSL , [InstrStage<1, [G4_VIU1]>]>,
+ InstrItinData<IIC_VecVSR , [InstrStage<1, [G4_VIU1]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td
new file mode 100644
index 000000000000..1d9f13fcb850
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -0,0 +1,112 @@
+//===-- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4+ (7450) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4P_BPU : FuncUnit; // Branch unit
+def G4P_SLU : FuncUnit; // Store/load unit
+def G4P_SRU : FuncUnit; // Special register unit (not referenced by G4PlusItineraries)
+def G4P_IU1 : FuncUnit; // Integer unit 1 (simple)
+def G4P_IU2 : FuncUnit; // Integer unit 2 (complex)
+def G4P_IU3 : FuncUnit; // Integer unit 3 (simple)
+def G4P_IU4 : FuncUnit; // Integer unit 4 (simple)
+def G4P_FPU1 : FuncUnit; // Floating point unit 1
+def G4P_VPU : FuncUnit; // Vector permutation unit
+def G4P_VIU1 : FuncUnit; // Vector integer unit 1 (simple)
+def G4P_VIU2 : FuncUnit; // Vector integer unit 2 (complex)
+def G4P_VFPU : FuncUnit; // Vector floating point unit
+
+def G4PlusItineraries : ProcessorItineraries< // Args: functional units, bypasses (none here), per-class itinerary data.
+ [G4P_IU1, G4P_IU2, G4P_IU3, G4P_IU4, G4P_BPU, G4P_SLU, G4P_FPU1,
+ G4P_VFPU, G4P_VIU1, G4P_VIU2, G4P_VPU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>, // InstrStage<cycles, [candidate units]>
+ InstrItinData<IIC_IntGeneral , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_IntDivW , [InstrStage<23, [G4P_IU2]>]>,
+ InstrItinData<IIC_IntMFFS , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_IntMFVSCR , [InstrStage<2, [G4P_VFPU]>]>,
+ InstrItinData<IIC_IntMTFSB0 , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<4, [G4P_IU2]>]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<4, [G4P_IU2]>]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<3, [G4P_IU2]>]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_IntShift , [InstrStage<2, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<2, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [G4P_BPU]>]>,
+ InstrItinData<IIC_BrCR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStDCBI , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStDSS , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStICBI , [InstrStage<3, [G4P_IU2]>]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<4, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<4, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<4, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<37, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLVecX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLWA , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTD , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTDCX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTDU , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTDUX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTVEBX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSync , [InstrStage<35, [G4P_SLU]>]>,
+ InstrItinData<IIC_SprISYNC , [InstrStage<0, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>, // NOTE(review): zero-cycle stage; confirm this is intentional.
+ InstrItinData<IIC_SprMFSR , [InstrStage<4, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMTMSR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMTSR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_SprMFCR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMFMSR , [InstrStage<3, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMFSPR , [InstrStage<4, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMFTB , [InstrStage<5, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprRFI , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_SprSC , [InstrStage<0, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>, // NOTE(review): zero-cycle stage; confirm this is intentional.
+ InstrItinData<IIC_FPGeneral , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<35, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<21, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPFused , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPRes , [InstrStage<14, [G4P_FPU1]>]>,
+ InstrItinData<IIC_VecGeneral , [InstrStage<1, [G4P_VIU1]>]>,
+ InstrItinData<IIC_VecFP , [InstrStage<4, [G4P_VFPU]>]>,
+ InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G4P_VFPU]>]>,
+ InstrItinData<IIC_VecComplex , [InstrStage<4, [G4P_VIU2]>]>,
+ InstrItinData<IIC_VecPerm , [InstrStage<2, [G4P_VPU]>]>,
+ InstrItinData<IIC_VecFPRound , [InstrStage<4, [G4P_VIU1]>]>,
+ InstrItinData<IIC_VecVSL , [InstrStage<2, [G4P_VPU]>]>,
+ InstrItinData<IIC_VecVSR , [InstrStage<2, [G4P_VPU]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td
new file mode 100644
index 000000000000..b5a9f96d45ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td
@@ -0,0 +1,130 @@
+//===-- PPCScheduleG5.td - PPC G5 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G5 (970) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G5_BPU : FuncUnit; // Branch unit
+def G5_SLU : FuncUnit; // Store/load unit
+def G5_SRU : FuncUnit; // Special register unit (not referenced by G5Itineraries)
+def G5_IU1 : FuncUnit; // Integer unit 1 (simple)
+def G5_IU2 : FuncUnit; // Integer unit 2 (complex)
+def G5_FPU1 : FuncUnit; // Floating point unit 1
+def G5_FPU2 : FuncUnit; // Floating point unit 2
+def G5_VPU : FuncUnit; // Vector permutation unit
+def G5_VIU1 : FuncUnit; // Vector integer unit 1 (simple)
+def G5_VIU2 : FuncUnit; // Vector integer unit 2 (complex)
+def G5_VFPU : FuncUnit; // Vector floating point unit
+
+def G5Itineraries : ProcessorItineraries< // Args: functional units, bypasses (none here), per-class itinerary data.
+ [G5_IU1, G5_IU2, G5_SLU, G5_BPU, G5_FPU1, G5_FPU2,
+ G5_VFPU, G5_VIU1, G5_VIU2, G5_VPU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<2, [G5_IU1, G5_IU2]>]>, // InstrStage<cycles, [candidate units]>
+ InstrItinData<IIC_IntGeneral , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<3, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntDivD , [InstrStage<68, [G5_IU1]>]>,
+ InstrItinData<IIC_IntDivW , [InstrStage<36, [G5_IU1]>]>,
+ InstrItinData<IIC_IntMFFS , [InstrStage<6, [G5_IU2]>]>,
+ InstrItinData<IIC_IntMFVSCR , [InstrStage<1, [G5_VFPU]>]>,
+ InstrItinData<IIC_IntMTFSB0 , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_IntMulHD , [InstrStage<7, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<5, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<5, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntRFID , [InstrStage<1, [G5_IU2]>]>,
+ InstrItinData<IIC_IntRotateD , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntRotateDI , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntShift , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntTrapD , [InstrStage<1, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<1, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [G5_BPU]>]>,
+ InstrItinData<IIC_BrCR , [InstrStage<4, [G5_BPU]>]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<2, [G5_BPU]>]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<3, [G5_BPU]>]>,
+ InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStDSS , [InstrStage<10, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStICBI , [InstrStage<40, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<4, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<4, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLD , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLDU , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLDUX , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLDARX , [InstrStage<11, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<64, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLVecX , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLWA , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLWARX , [InstrStage<11, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSLBIA , [InstrStage<40, [G5_SLU]>]>, // needs work
+ InstrItinData<IIC_LdStSLBIE , [InstrStage<2, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTD , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTDU , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTDUX , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTDCX , [InstrStage<11, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTVEBX , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<11, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSync , [InstrStage<35, [G5_SLU]>]>,
+ InstrItinData<IIC_SprISYNC , [InstrStage<40, [G5_SLU]>]>, // needs work
+ InstrItinData<IIC_SprMFSR , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_SprMTMSR , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_SprMTSR , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_SprMFCR , [InstrStage<2, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMFCRF , [InstrStage<2, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMFMSR , [InstrStage<3, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMFTB , [InstrStage<10, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<8, [G5_IU2]>]>,
+ InstrItinData<IIC_SprSC , [InstrStage<1, [G5_IU2]>]>,
+ InstrItinData<IIC_FPGeneral , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<8, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPFused , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPRes , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPSqrtD , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPSqrtS , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_VecGeneral , [InstrStage<2, [G5_VIU1]>]>,
+ InstrItinData<IIC_VecFP , [InstrStage<8, [G5_VFPU]>]>,
+ InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G5_VFPU]>]>,
+ InstrItinData<IIC_VecComplex , [InstrStage<5, [G5_VIU2]>]>,
+ InstrItinData<IIC_VecPerm , [InstrStage<3, [G5_VPU]>]>,
+ InstrItinData<IIC_VecFPRound , [InstrStage<8, [G5_VFPU]>]>,
+ InstrItinData<IIC_VecVSL , [InstrStage<2, [G5_VIU1]>]>,
+ InstrItinData<IIC_VecVSR , [InstrStage<3, [G5_VPU]>]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// G5 machine model for scheduling and other instruction cost heuristics.
+
+def G5Model : SchedMachineModel {
+ let IssueWidth = 4; // 4 (non-branch) instructions are dispatched per cycle.
+ let LoadLatency = 3; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 16;
+
+ let CompleteModel = 0;
+
+ let Itineraries = G5Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td
new file mode 100644
index 000000000000..a8678f56900e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td
@@ -0,0 +1,397 @@
+//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER7 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// IBM POWER7 multicore server processor
+// B. Sinharoy, et al.
+// IBM J. Res. & Dev. (55) 3. May/June 2011.
+
+// Scheduling for the P7 involves tracking two types of resources:
+// 1. The dispatch bundle slots
+// 2. The functional unit resources
+
+// Dispatch units:
+def P7_DU1 : FuncUnit; // DU1-DU4: slots used by the non-branch itineraries in P7Itineraries
+def P7_DU2 : FuncUnit;
+def P7_DU3 : FuncUnit;
+def P7_DU4 : FuncUnit;
+def P7_DU5 : FuncUnit; // DU5/DU6: slots used by IIC_BrB and IIC_BrMCR in P7Itineraries
+def P7_DU6 : FuncUnit;
+
+def P7_LS1 : FuncUnit; // Load/Store pipeline 1
+def P7_LS2 : FuncUnit; // Load/Store pipeline 2
+
+def P7_FX1 : FuncUnit; // FX pipeline 1
+def P7_FX2 : FuncUnit; // FX pipeline 2
+
+// VS pipeline 1 (vector integer ops. always here)
+def P7_VS1 : FuncUnit; // VS pipeline 1
+// VS pipeline 2 (128-bit stores and perms. here)
+def P7_VS2 : FuncUnit; // VS pipeline 2
+
+def P7_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
+def P7_BRU : FuncUnit; // BR unit
+
+// Notes:
+// Each LSU pipeline can also execute FX add and logical instructions.
+// Each LSU pipeline can complete a load or store in one cycle.
+//
+// Each store is broken into two parts, AGEN goes to the LSU while a
+// "data steering" op. goes to the FXU or VSU.
+//
+// FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
+// VSU loads have a three cycle load-to-use latency (so two "bubble" cycles).
+//
+// Frequent FX ops. take only one cycle and results can be used again in the
+// next cycle (there is a self-bypass). Getting results from the other FX
+// pipeline takes an additional cycle.
+//
+// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
+// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
+// Dispatch of an instruction to VS1 that uses four single prec. inputs
+// (either to a float or XC op.) prevents dispatch in that cycle to VS2 of any
+// floating point instruction.
+//
+// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
+// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
+// (unlike on the POWER6).
+//
+// FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
+// share the same write-back, and have a 5-cycle latency difference, so the
+// IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP
+// op. has been dispatched to VS1.
+//
+// Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
+//
+// Instruction dispatch groups have (at most) four non-branch instructions, and
+// two branches. Unlike on the POWER4/5, a branch does not automatically
+// end the dispatch group, but a second branch must be the last in the group.
+
+def P7Itineraries : ProcessorItineraries< // Args: functional units, bypasses (none here), per-class itinerary data.
+ [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
+ P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>, // Trailing 0 is the time increment: the next stage starts in the same cycle.
+ InstrStage<1, [P7_FX1, P7_FX2,
+ P7_LS1, P7_LS2]>],
+ [1, 1, 1]>, // Operand cycles, one entry per operand.
+ InstrItinData<IIC_IntGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2], 0>,
+ InstrStage<1, [P7_BRU]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ // FIXME: Add record-form itinerary data.
+ InstrItinData<IIC_IntDivW , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<36, [P7_FX1, P7_FX2]>],
+ [36, 1, 1]>,
+ InstrItinData<IIC_IntDivD , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<68, [P7_FX1, P7_FX2]>],
+ [68, 1, 1]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntRotateD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntShift , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1]>,
+ InstrItinData<IIC_IntTrapD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+ InstrStage<1, [P7_BRU]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_BrCR , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_CRU]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+ InstrStage<1, [P7_BRU]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+ InstrStage<1, [P7_BRU]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [2, 2, 1, 1]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_LdStLDU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [2, 2, 1, 1]>,
+ InstrItinData<IIC_LdStLDUX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLVecX , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 4, 1, 1]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 4, 1, 1]>,
+ InstrItinData<IIC_LdStLWA , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLDARX, [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTDU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_LdStSTDUX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_LdStSTVEBX , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_VS2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTDCX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU1], 0>, // NOTE(review): second IIC_BrMCRX entry in this list (the first uses DU5/DU6 + BRU); confirm which one is intended.
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_CRU]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 1]>, // mtcr
+ InstrItinData<IIC_SprMFCR , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_CRU]>],
+ [6, 1]>,
+ InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_CRU]>],
+ [3, 1]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_FX1]>],
+ [4, 1]>, // mtctr
+ InstrItinData<IIC_FPGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [8, 1, 1]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [33, 1, 1]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [27, 1, 1]>,
+ InstrItinData<IIC_FPSqrtD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [44, 1, 1]>,
+ InstrItinData<IIC_FPSqrtS , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [32, 1, 1]>,
+ InstrItinData<IIC_FPFused , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [5, 1, 1, 1]>,
+ InstrItinData<IIC_FPRes , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_VecGeneral , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VecVSL , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VecVSR , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VecFP , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [6, 1, 1]>,
+ InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [6, 1, 1]>,
+ InstrItinData<IIC_VecFPRound , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [6, 1, 1]>,
+ InstrItinData<IIC_VecComplex , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1]>],
+ [7, 1, 1]>,
+ InstrItinData<IIC_VecPerm , [InstrStage<1, [P7_DU1, P7_DU2], 0>,
+ InstrStage<1, [P7_VS2]>],
+ [3, 1, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// P7 machine model for scheduling and other instruction cost heuristics.
+
+def P7Model : SchedMachineModel {
+ let IssueWidth = 6; // 4 (non-branch) instructions are dispatched per cycle.
+ // Note that the dispatch bundle size is 6 (including
+ // branches), but the total internal issue bandwidth per
+ // cycle (from all queues) is 8.
+
+ let LoadLatency = 3; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 16;
+
+ // Try to make sure we have at least 10 dispatch groups in a loop.
+ let LoopMicroOpBufferSize = 40;
+
+ let CompleteModel = 0;
+
+ let Itineraries = P7Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td
new file mode 100644
index 000000000000..8e52da583a0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td
@@ -0,0 +1,406 @@
+//===-- PPCScheduleP8.td - PPC P8 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER8 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Scheduling for the P8 involves tracking two types of resources:
+// 1. The dispatch bundle slots
+// 2. The functional unit resources
+
+// Dispatch units:
+def P8_DU1 : FuncUnit;
+def P8_DU2 : FuncUnit;
+def P8_DU3 : FuncUnit;
+def P8_DU4 : FuncUnit;
+def P8_DU5 : FuncUnit;
+def P8_DU6 : FuncUnit;
+def P8_DU7 : FuncUnit; // Only branch instructions will use DU7,DU8
+def P8_DU8 : FuncUnit;
+
+// 10 insns per cycle (2-LU, 2-LSU, 2-FXU, 2-FPU, 1-CRU, 1-BRU).
+
+def P8_LU1 : FuncUnit; // Loads or fixed-point operations 1
+def P8_LU2 : FuncUnit; // Loads or fixed-point operations 2
+
+// Load/Store pipelines can handle Stores, fixed-point loads, and simple
+// fixed-point operations.
+def P8_LSU1 : FuncUnit; // Load/Store pipeline 1
+def P8_LSU2 : FuncUnit; // Load/Store pipeline 2
+
+// Fixed Point unit
+def P8_FXU1 : FuncUnit; // FX pipeline 1
+def P8_FXU2 : FuncUnit; // FX pipeline 2
+
+// The Floating-Point Unit (FPU) and Vector Media Extension (VMX) units
+// are combined on P7 and newer into a Vector Scalar Unit (VSU).
+// The P8 instruction latency documents still refer to the unit as the
+// FPU, so keep in mind that FPU==VSU.
+// In contrast to the P7, the VMX units on P8 are symmetric, so no need to
+// split vector integer ops or 128-bit load/store/perms to the specific units.
+def P8_FPU1 : FuncUnit; // VS pipeline 1
+def P8_FPU2 : FuncUnit; // VS pipeline 2
+
+def P8_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
+def P8_BRU : FuncUnit; // BR unit
+
+def P8Itineraries : ProcessorItineraries<
+ [P8_DU1, P8_DU2, P8_DU3, P8_DU4, P8_DU5, P8_DU6, P8_DU7, P8_DU8,
+ P8_LU1, P8_LU2, P8_LSU1, P8_LSU2, P8_FXU1, P8_FXU2,
+ P8_FPU1, P8_FPU2, P8_CRU, P8_BRU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2,
+ P8_LU1, P8_LU2,
+ P8_LSU1, P8_LSU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntGeneral , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2, P8_LU1,
+ P8_LU2, P8_LSU1, P8_LSU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2], 0>,
+ InstrStage<1, [P8_BRU]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntDivW , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<15, [P8_FXU1, P8_FXU2]>],
+ [15, 1, 1]>,
+ InstrItinData<IIC_IntDivD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<23, [P8_FXU1, P8_FXU2]>],
+ [23, 1, 1]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntRotateD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntShift , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [1, 1]>,
+ InstrItinData<IIC_IntTrapD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [1, 1]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [P8_DU7, P8_DU8], 0>,
+ InstrStage<1, [P8_BRU]>],
+ [3, 1, 1]>,
+ // FIXME - the Br* groups below are not branch related, so should probably
+ // be renamed.
+ // IIC_BrCR consists of the cr* instructions. (crand,crnor,creqv, etc).
+ // and should be 'First' in dispatch.
+ InstrItinData<IIC_BrCR , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_CRU]>],
+ [3, 1, 1]>,
+ // IIC_BrMCR consists of the mcrf instruction.
+ InstrItinData<IIC_BrMCR , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_CRU]>],
+ [3, 1, 1]>,
+ // IIC_BrMCRX consists of mcrxr (obsolete instruction) and mtcrf, which
+ // should be first in the dispatch group.
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [P8_DU1], 0>, // NOTE(review): repeats the IIC_BrMCRX key of the preceding entry with a different operand-cycle list; confirm which entry (or which itinerary class) is intended.
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [3, 1]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2 ], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [2, 2, 1, 1]>,
+ // Update-Indexed form loads/stores are no longer first and last in the
+ // dispatch group. They are simply cracked, so require DU1,DU2.
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_LdStLDU , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [2, 2, 1, 1]>,
+ InstrItinData<IIC_LdStLDUX , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_LU1, P8_LU2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLVecX , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_LU1, P8_LU2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LU1, P8_LU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LU1, P8_LU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2,
+ P8_LU1, P8_LU2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [4, 4, 1, 1]>,
+ // first+last in dispatch group.
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_DU3], 0>,
+ InstrStage<1, [P8_DU4], 0>,
+ InstrStage<1, [P8_DU5], 0>,
+ InstrStage<1, [P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [4, 4, 1, 1]>,
+ InstrItinData<IIC_LdStLWA , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2]>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_DU3], 0>,
+ InstrStage<1, [P8_DU4], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2]>],
+ [3, 1, 1]>,
+ // first+last
+ InstrItinData<IIC_LdStLDARX, [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_DU3], 0>,
+ InstrStage<1, [P8_DU4], 0>,
+ InstrStage<1, [P8_DU5], 0>,
+ InstrStage<1, [P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2,
+ P8_LU1, P8_LU2]>],
+ [2, 1, 1]>,
+// Stores are dual-issued from the issue queue, so may only take up one
+// dispatch slot. The instruction will be broken into two IOPS. The agen
+// op is issued to the LSU, and the data op (register fetch) is issued
+// to either the LU (GPR store) or the VSU (FPR store).
+ InstrItinData<IIC_LdStStore , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2]>,
+ InstrStage<1, [P8_LU1, P8_LU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_LU1, P8_LU2,
+ P8_LSU1, P8_LSU2]>], // comma separates the stage list from the operand cycles; without it TableGen reads "[...][1, 1, 1]" as a list slice
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTDU , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LU1, P8_LU2,
+ P8_LSU1, P8_LSU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [2, 1, 1, 1]>,
+ // First+last
+ InstrItinData<IIC_LdStSTDUX , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_DU3], 0>,
+ InstrStage<1, [P8_DU4], 0>,
+ InstrStage<1, [P8_DU5], 0>,
+ InstrStage<1, [P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_LdStSTVEBX , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTDCX , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_DU3], 0>,
+ InstrStage<1, [P8_DU4], 0>,
+ InstrStage<1, [P8_DU5], 0>,
+ InstrStage<1, [P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+ InstrStage<1, [P8_LU1, P8_LU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_DU2], 0>,
+ InstrStage<1, [P8_DU3], 0>,
+ InstrStage<1, [P8_DU4], 0>,
+ InstrStage<1, [P8_DU5], 0>,
+ InstrStage<1, [P8_DU6], 0>,
+ InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+ InstrStage<1, [P8_LU1, P8_LU2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_SprMFCR , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_CRU]>],
+ [6, 1]>,
+ InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_CRU]>],
+ [3, 1]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FXU1, P8_FXU2]>],
+ [4, 1]>, // mtctr
+ InstrItinData<IIC_FPGeneral , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [8, 1, 1]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [33, 1, 1]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [27, 1, 1]>,
+ InstrItinData<IIC_FPSqrtD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [44, 1, 1]>,
+ InstrItinData<IIC_FPSqrtS , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [32, 1, 1]>,
+ InstrItinData<IIC_FPFused , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [5, 1, 1, 1]>,
+ InstrItinData<IIC_FPRes , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_VecGeneral , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VecVSL , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VecVSR , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VecFP , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [6, 1, 1]>,
+ InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [6, 1, 1]>,
+ InstrItinData<IIC_VecFPRound , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [6, 1, 1]>,
+ InstrItinData<IIC_VecComplex , [InstrStage<1, [P8_DU1], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [7, 1, 1]>,
+ InstrItinData<IIC_VecPerm , [InstrStage<1, [P8_DU1, P8_DU2], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>], // either VS pipeline may be used, as in the other vector entries of this table
+ [3, 1, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// P8 machine model for scheduling and other instruction cost heuristics.
+// P8 has an 8 insn dispatch group (6 non-branch, 2 branch) and can issue up
+// to 10 insns per cycle (2-LU, 2-LSU, 2-FXU, 2-FPU, 1-CRU, 1-BRU).
+
+def P8Model : SchedMachineModel {
+ let IssueWidth = 8; // up to 8 instructions dispatched per cycle.
+ // up to six non-branch instructions.
+ // up to two branches in a dispatch group.
+
+ let LoadLatency = 3; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 16;
+
+ // Try to make sure we have at least 10 dispatch groups in a loop.
+ let LoopMicroOpBufferSize = 60;
+
+ let CompleteModel = 0;
+
+ let Itineraries = P8Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
new file mode 100644
index 000000000000..a9c1bd78b05e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
@@ -0,0 +1,335 @@
+//===-- PPCScheduleP9.td - PPC P9 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER9 processor.
+//
+//===----------------------------------------------------------------------===//
+include "PPCInstrInfo.td"
+
+def P9Model : SchedMachineModel {
+ let IssueWidth = 8;
+
+ let LoadLatency = 5;
+
+ let MispredictPenalty = 16;
+
+ // Try to make sure we have at least 10 dispatch groups in a loop.
+ let LoopMicroOpBufferSize = 60;
+
+ let CompleteModel = 0;
+ // No Itineraries list is attached; itinerary classes are mapped with ItinRW under "let SchedModel = P9Model".
+}
+
+let SchedModel = P9Model in {
+
+ // ***************** Processor Resources *****************
+
+ //Dispatcher:
+ def DISPATCHER : ProcResource<12>;
+
+ // Issue Ports
+ def IP_AGEN : ProcResource<4>;
+ def IP_EXEC : ProcResource<4>;
+ def IP_EXECE : ProcResource<2> {
+ //Even Exec Ports
+ let Super = IP_EXEC;
+ }
+ def IP_EXECO : ProcResource<2> {
+ //Odd Exec Ports
+ let Super = IP_EXEC;
+ }
+
+ // Pipeline Groups
+ def ALU : ProcResource<4>;
+ def ALUE : ProcResource<2> {
+ //Even ALU pipelines
+ let Super = ALU;
+ }
+ def ALUO : ProcResource<2> {
+ //Odd ALU pipelines
+ let Super = ALU;
+ }
+ def DIV : ProcResource<2>;
+ def DP : ProcResource<4>;
+ def DPE : ProcResource<2> {
+ //Even DP pipelines
+ let Super = DP;
+ }
+ def DPO : ProcResource<2> {
+ //Odd DP pipelines
+ let Super = DP;
+ }
+ def LS : ProcResource<4>;
+ def PM : ProcResource<2>;
+ def DFU : ProcResource<1>;
+
+ def TestGroup : ProcResGroup<[ALU, DP]>;
+
+ // ***************** SchedWriteRes Definitions *****************
+
+ //Dispatcher
+ def DISP_1C : SchedWriteRes<[DISPATCHER]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+
+ // Issue Ports
+ def IP_AGEN_1C : SchedWriteRes<[IP_AGEN]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+
+ def IP_EXEC_1C : SchedWriteRes<[IP_EXEC]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+
+ def IP_EXECE_1C : SchedWriteRes<[IP_EXECE]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+
+ def IP_EXECO_1C : SchedWriteRes<[IP_EXECO]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+
+ //Pipeline Groups
+ def P9_ALU_2C : SchedWriteRes<[ALU]> {
+ let Latency = 2;
+ }
+
+ def P9_ALUE_2C : SchedWriteRes<[ALUE]> {
+ let Latency = 2;
+ }
+
+ def P9_ALUO_2C : SchedWriteRes<[ALUO]> {
+ let Latency = 2;
+ }
+
+ def P9_ALU_3C : SchedWriteRes<[ALU]> {
+ let Latency = 3;
+ }
+
+ def P9_ALUE_3C : SchedWriteRes<[ALUE]> {
+ let Latency = 3;
+ }
+
+ def P9_ALUO_3C : SchedWriteRes<[ALUO]> {
+ let Latency = 3;
+ }
+
+ def P9_ALU_4C : SchedWriteRes<[ALU]> {
+ let Latency = 4;
+ }
+
+ def P9_ALUE_4C : SchedWriteRes<[ALUE]> {
+ let Latency = 4;
+ }
+
+ def P9_ALUO_4C : SchedWriteRes<[ALUO]> {
+ let Latency = 4;
+ }
+
+ def P9_ALU_5C : SchedWriteRes<[ALU]> {
+ let Latency = 5;
+ }
+
+ def P9_ALU_6C : SchedWriteRes<[ALU]> {
+ let Latency = 6;
+ }
+
+ def P9_DIV_16C_8 : SchedWriteRes<[DIV]> {
+ let ResourceCycles = [8];
+ let Latency = 16;
+ }
+
+ def P9_DIV_24C_8 : SchedWriteRes<[DIV]> {
+ let ResourceCycles = [8];
+ let Latency = 24;
+ }
+
+ def P9_DIV_40C_8 : SchedWriteRes<[DIV]> {
+ let ResourceCycles = [8];
+ let Latency = 40;
+ }
+
+ def P9_DP_2C : SchedWriteRes<[DP]> {
+ let Latency = 2;
+ }
+
+ def P9_DP_5C : SchedWriteRes<[DP]> {
+ let Latency = 5;
+ }
+
+ def P9_DP_7C : SchedWriteRes<[DP]> {
+ let Latency = 7;
+ }
+
+ def P9_DPE_7C : SchedWriteRes<[DPE]> {
+ let Latency = 7;
+ }
+
+ def P9_DPO_7C : SchedWriteRes<[DPO]> {
+ let Latency = 7;
+ }
+
+ def P9_DP_22C_5 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [5];
+ let Latency = 22;
+ }
+
+ def P9_DP_24C_8 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [8];
+ let Latency = 24;
+ }
+
+ def P9_DP_26C_5 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [5];
+ let Latency = 26; // latency is the value encoded in the def name (<unit>_<latency>C_<resource cycles>)
+ }
+
+ def P9_DP_27C_7 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [7];
+ let Latency = 27;
+ }
+
+ def P9_DP_33C_8 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [8];
+ let Latency = 33;
+ }
+
+ def P9_DP_36C_10 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [10];
+ let Latency = 36;
+ }
+
+ def P9_PM_3C : SchedWriteRes<[PM]> {
+ let Latency = 3;
+ }
+
+ def P9_PM_7C : SchedWriteRes<[PM]> {
+ let Latency = 7; // latency is the value encoded in the def name; P9_PM_3C is the 3-cycle variant
+ }
+
+ def P9_LS_1C : SchedWriteRes<[LS]> {
+ let Latency = 1;
+ }
+
+ def P9_LS_4C : SchedWriteRes<[LS]> {
+ let Latency = 4;
+ }
+
+ def P9_LS_5C : SchedWriteRes<[LS]> {
+ let Latency = 5;
+ }
+
+ def P9_DFU_12C : SchedWriteRes<[DFU]> {
+ let Latency = 12;
+ }
+
+ def P9_DFU_24C : SchedWriteRes<[DFU]> {
+ let Latency = 24;
+ let ResourceCycles = [12];
+ }
+
+ def P9_DFU_58C : SchedWriteRes<[DFU]> {
+ let Latency = 58;
+ let ResourceCycles = [44];
+ }
+
+ def P9_DFU_76C : SchedWriteRes<[TestGroup, DFU]> {
+ let Latency = 76;
+ let ResourceCycles = [62]; // NOTE(review): one cycle count for two listed resources, and the other P9_DFU_* defs list only DFU; confirm TestGroup is intended here.
+ }
+ // ***************** WriteSeq Definitions *****************
+
+ def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>;
+ def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>;
+ def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>;
+ def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>;
+ def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>;
+ def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>;
+ def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>;
+
+ // ***************** Defining Itinerary Class Resources *****************
+
+ def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntSimple,
+ IIC_IntGeneral]>; // simple/general integer ops use the 2-cycle ALU write, like IIC_IntCompare, not the 76-cycle DFU write
+
+ def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>;
+
+ def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntCompare]>;
+
+ def : ItinRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI]>;
+
+ def : ItinRW<[P9_LS_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_LdStLoad, IIC_LdStLD]>;
+
+ def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_LdStLoadUpd, IIC_LdStLDU]>;
+
+ def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_LdStLoadUpdX, IIC_LdStLDUX]>;
+
+ def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_LdStSTFDU]>;
+
+ def : ItinRW<[P9_LoadAndALUOp_6C,
+ IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_LdStLHA, IIC_LdStLWA]>;
+
+ def : ItinRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
+ IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_LdStLHAU, IIC_LdStLHAUX]>;
+
+ // IIC_LdStLMW contains two microcoded insns. This is not accurate, but
+ // those insns are not used that much, if at all.
+ def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_LdStLWARX, IIC_LdStLDARX, IIC_LdStLMW]>;
+
+ def : ItinRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_LdStSTFD, IIC_LdStSTD, IIC_LdStStore]>;
+
+ def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_LdStSTDU, IIC_LdStSTDUX]>;
+
+ def : ItinRW<[P9_StoreAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_LdStSTDCX, IIC_LdStSTWCX]>;
+
+ def : ItinRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_BrCR, IIC_IntMTFSB0]>;
+
+ def : ItinRW<[P9_ALUOpAndALUOp_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
+ IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C, DISP_1C, DISP_1C], [IIC_SprMFCR, IIC_SprMFCRF]>;
+
+ // This class should be broken down to instruction level, once some missing
+ // info is obtained.
+ def : ItinRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C], [IIC_SprMTSPR]>;
+
+ def : ItinRW<[P9_DP_7C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C], [IIC_FPGeneral, IIC_FPAddSub]>;
+
+ def : ItinRW<[P9_DP_36C_10, IP_EXEC_1C], [IIC_FPSqrtD]>;
+ def : ItinRW<[P9_DP_26C_5, P9_DP_26C_5, IP_EXEC_1C, IP_EXEC_1C], [IIC_FPSqrtS]>;
+
+ include "P9InstrResources.td"
+
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
new file mode 100644
index 000000000000..e8a87e7f4437
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -0,0 +1,252 @@
+//===-- PowerPCSubtarget.cpp - PPC Subtarget Information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCSubtarget.h"
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "PPCGenSubtargetInfo.inc"
+
+static cl::opt<bool> UseSubRegLiveness("ppc-track-subreg-liveness",
+cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden);
+
+static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned",
+ cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"),
+ cl::Hidden);
+
+PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ initializeEnvironment(); // reset every feature field to its default first
+ initSubtargetFeatures(CPU, FS); // then apply the CPU name and feature string
+ return *this; // returned so the constructor can hand it straight to FrameLowering
+}
+
+PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const PPCTargetMachine &TM)
+ : PPCGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT),
+ IsPPC64(TargetTriple.getArch() == Triple::ppc64 || // 64-bit iff the triple arch is ppc64 or ppc64le
+ TargetTriple.getArch() == Triple::ppc64le),
+ TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)), // the feature fields are set up as a side effect of evaluating this argument
+ InstrInfo(*this), TLInfo(TM, *this) {}
+
+void PPCSubtarget::initializeEnvironment() { // puts every subtarget field into its default state
+ StackAlignment = 16; // default; initSubtargetFeatures later overwrites it with getPlatformStackAlignment()
+ DarwinDirective = PPC::DIR_NONE;
+ HasMFOCRF = false;
+ Has64BitSupport = false;
+ Use64BitRegs = false;
+ UseCRBits = false;
+ HasHardFloat = false;
+ HasAltivec = false;
+ HasSPE = false;
+ HasQPX = false;
+ HasVSX = false;
+ HasP8Vector = false;
+ HasP8Altivec = false;
+ HasP8Crypto = false;
+ HasP9Vector = false;
+ HasP9Altivec = false;
+ HasFCPSGN = false;
+ HasFSQRT = false;
+ HasFRE = false;
+ HasFRES = false;
+ HasFRSQRTE = false;
+ HasFRSQRTES = false;
+ HasRecipPrec = false;
+ HasSTFIWX = false;
+ HasLFIWAX = false;
+ HasFPRND = false;
+ HasFPCVT = false;
+ HasISEL = false;
+ HasBPERMD = false;
+ HasExtDiv = false;
+ HasCMPB = false;
+ HasLDBRX = false;
+ IsBookE = false;
+ HasOnlyMSYNC = false;
+ IsPPC4xx = false;
+ IsPPC6xx = false;
+ IsE500 = false;
+ FeatureMFTB = false;
+ DeprecatedDST = false;
+ HasLazyResolverStubs = false;
+ HasICBT = false;
+ HasInvariantFunctionDescriptors = false;
+ HasPartwordAtomics = false;
+ HasDirectMove = false;
+ IsQPXStackUnaligned = false;
+ HasHTM = false;
+ HasFusion = false;
+ HasFloat128 = false;
+ IsISA3_0 = false;
+ UseLongCalls = false;
+ // HasPOPCNTD takes a POPCNTDKind enumerator rather than true/false.
+ HasPOPCNTD = POPCNTD_Unavailable;
+}
+
+void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+ // Pick the CPU name: an empty or "generic" CPU falls back to a default.
+ std::string CPUName = CPU;
+ if (CPUName.empty() || CPU == "generic") {
+ // No explicit CPU (e.g. -march=ppc64le without -mcpu): default by arch.
+ if (TargetTriple.getArch() == Triple::ppc64le)
+ CPUName = "ppc64le";
+ else
+ CPUName = "generic";
+ }
+
+ // Initialize scheduling itinerary for the specified CPU.
+ InstrItins = getInstrItineraryForCPU(CPUName);
+
+ // Parse features string.
+ ParseSubtargetFeatures(CPUName, FS);
+
+ // Use 64-bit registers only when the triple is 64-bit and the selected
+ // CPU reports 64-bit support; otherwise Use64BitRegs is left untouched.
+ if (IsPPC64 && has64BitSupport())
+ Use64BitRegs = true;
+
+ // Set up darwin-specific properties.
+ if (isDarwin())
+ HasLazyResolverStubs = true;
+
+ // QPX requires a 32-byte aligned stack. Note that we need to do this if
+ // we're compiling for a BG/Q system regardless of whether or not QPX
+ // is enabled because external functions will assume this alignment.
+ IsQPXStackUnaligned = QPXStackUnaligned;
+ StackAlignment = getPlatformStackAlignment();
+
+ // Determine endianness: only the ppc64le arch is treated as little-endian.
+ // FIXME: Part of the TargetMachine.
+ IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le);
+}
+
+/// Return true if accesses to the specified global have to go through a dyld
+/// lazy resolution stub. This means that an extra load is required to get the
+/// address of the global.
+bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
+  // Subtargets without lazy resolver stubs never route a global through one.
+  if (!HasLazyResolverStubs)
+    return false;
+  // A global that cannot be assumed DSO-local always needs the stub. 32-bit
+  // Mach-O also has no relocation for a-b when a is undefined, even if b is in
+  // the section being relocated, so declarations and common symbols need the
+  // extra load even when they are known to be local to the DSO.
+  const bool MayBeNonLocal = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
+  return MayBeNonLocal || GV->isDeclarationForLinker() ||
+         GV->hasCommonLinkage();
+}
+
+// Embedded cores need aggressive scheduling (and some others also benefit).
+static bool needsAggressiveScheduling(unsigned Directive) {
+  switch (Directive) {
+  case PPC::DIR_440:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8:
+  // FIXME: Same as P8 until POWER9 scheduling info is available
+  case PPC::DIR_PWR9:
+    return true;
+  default: return false; // every directive not listed above
+  }
+}
+
+bool PPCSubtarget::enableMachineScheduler() const {
+ // Enable MI scheduling for the cores selected by needsAggressiveScheduling
+ // (the embedded cores plus POWER7/8/9). FIXME: Enable this for all cores
+ // (some additional modeling may be necessary).
+ return needsAggressiveScheduling(DarwinDirective);
+}
+
+// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+bool PPCSubtarget::enablePostRAScheduler() const { return true; }
+
+PPCGenSubtargetInfo::AntiDepBreakMode PPCSubtarget::getAntiDepBreakMode() const {
+ return TargetSubtargetInfo::ANTIDEP_ALL;
+}
+
+void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
+ CriticalPathRCs.clear(); // discard whatever the caller passed in
+ CriticalPathRCs.push_back(isPPC64() ? // exactly one class is reported:
+ &PPC::G8RCRegClass : &PPC::GPRCRegClass); // G8RC when isPPC64(), otherwise GPRC
+}
+
+void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const { // NumRegionInstrs is not consulted
+ if (needsAggressiveScheduling(DarwinDirective)) { // for these cores, clear both single-direction restrictions
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+ }
+
+ // Spilling is generally expensive on all PPC cores, so always enable
+ // register-pressure tracking.
+ Policy.ShouldTrackPressure = true;
+}
+
+bool PPCSubtarget::useAA() const {
+ // Use AA during code generation for the cores selected by needsAggressiveScheduling.
+ return needsAggressiveScheduling(DarwinDirective);
+}
+
+bool PPCSubtarget::enableSubRegLiveness() const {
+ return UseSubRegLiveness;
+}
+
+unsigned char PPCSubtarget::classifyGlobalReference(
+    const GlobalValue *GV) const {
+  // Only PIC references are produced here; supporting non-PIC references
+  // would require extending this function.
+  const unsigned char Direct = PPCII::MO_PIC_FLAG;
+  const unsigned char ViaNLP = PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG;
+
+  // With the large code model even local symbols are accessed via the TOC.
+  if (TM.getCodeModel() == CodeModel::Large)
+    return ViaNLP;
+
+  // Interposition is only a concern when the relocation model is PIC. A
+  // symbol with local linkage or non-default visibility cannot be
+  // interposed, so only the remaining symbols get the NLP flag.
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    const bool Interposable =
+        !GV->hasLocalLinkage() && GV->hasDefaultVisibility();
+    return Interposable ? ViaNLP : Direct;
+  }
+
+  // Outside PIC a looser rule applies: a strong definition for the linker is
+  // referenced directly and everything else gets the NLP flag.
+  if (GV->isStrongDefinitionForLinker())
+    return Direct;
+
+  return ViaNLP;
+}
+
+bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
+bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
new file mode 100644
index 000000000000..7fd907990ceb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -0,0 +1,322 @@
+//===-- PPCSubtarget.h - Define Subtarget for the PPC ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H
+#define LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H
+
+#include "PPCFrameLowering.h"
+#include "PPCISelLowering.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "PPCGenSubtargetInfo.inc"
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+class StringRef;
+
+namespace PPC {
+ // -m directive values.
+ enum {
+ DIR_NONE,
+ DIR_32,
+ DIR_440,
+ DIR_601,
+ DIR_602,
+ DIR_603,
+ DIR_7400,
+ DIR_750,
+ DIR_970,
+ DIR_A2,
+ DIR_E500mc,
+ DIR_E5500,
+ DIR_PWR3,
+ DIR_PWR4,
+ DIR_PWR5,
+ DIR_PWR5X,
+ DIR_PWR6,
+ DIR_PWR6X,
+ DIR_PWR7,
+ DIR_PWR8,
+ DIR_PWR9,
+ DIR_64
+ };
+}
+
+class GlobalValue;
+class TargetMachine;
+
+class PPCSubtarget : public PPCGenSubtargetInfo {
+public:
+ enum POPCNTDKind {
+ POPCNTD_Unavailable,
+ POPCNTD_Slow,
+ POPCNTD_Fast
+ };
+
+protected:
+ /// TargetTriple - What processor and OS we're targeting.
+ Triple TargetTriple;
+
+ /// stackAlignment - The minimum alignment known to hold of the stack frame on
+ /// entry to the function and which must be maintained by every function.
+ unsigned StackAlignment;
+
+ /// Selected instruction itineraries (one entry per itinerary class.)
+ InstrItineraryData InstrItins;
+
+ /// Which cpu directive was used.
+ unsigned DarwinDirective;
+
+ /// Used by the ISel to turn on optimizations for POWER4-derived architectures
+ bool HasMFOCRF;
+ bool Has64BitSupport;
+ bool Use64BitRegs;
+ bool UseCRBits;
+ bool HasHardFloat;
+ bool IsPPC64;
+ bool HasAltivec;
+ bool HasSPE;
+ bool HasQPX;
+ bool HasVSX;
+ bool HasP8Vector;
+ bool HasP8Altivec;
+ bool HasP8Crypto;
+ bool HasP9Vector;
+ bool HasP9Altivec;
+ bool HasFCPSGN;
+ bool HasFSQRT;
+ bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
+ bool HasRecipPrec;
+ bool HasSTFIWX;
+ bool HasLFIWAX;
+ bool HasFPRND;
+ bool HasFPCVT;
+ bool HasISEL;
+ bool HasBPERMD;
+ bool HasExtDiv;
+ bool HasCMPB;
+ bool HasLDBRX;
+ bool IsBookE;
+ bool HasOnlyMSYNC;
+ bool IsE500;
+ bool IsPPC4xx;
+ bool IsPPC6xx;
+ bool FeatureMFTB;
+ bool DeprecatedDST;
+ bool HasLazyResolverStubs;
+ bool IsLittleEndian;
+ bool HasICBT;
+ bool HasInvariantFunctionDescriptors;
+ bool HasPartwordAtomics;
+ bool HasDirectMove;
+ bool HasHTM;
+ bool HasFusion;
+ bool HasFloat128;
+ bool IsISA3_0;
+ bool UseLongCalls;
+
+ POPCNTDKind HasPOPCNTD;
+
+ /// When targeting QPX running a stock PPC64 Linux kernel where the stack
+ /// alignment has not been changed, we need to keep the 16-byte alignment
+ /// of the stack.
+ bool IsQPXStackUnaligned;
+
+ const PPCTargetMachine &TM;
+ PPCFrameLowering FrameLowering;
+ PPCInstrInfo InstrInfo;
+ PPCTargetLowering TLInfo;
+ SelectionDAGTargetInfo TSInfo;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ ///
+ PPCSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+ const PPCTargetMachine &TM);
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ /// getStackAlignment - Returns the minimum alignment known to hold of the
+ /// stack frame on entry to the function and which must be maintained by every
+ /// function for this subtarget.
+ unsigned getStackAlignment() const { return StackAlignment; }
+
+ /// getDarwinDirective - Returns the -m directive specified for the cpu.
+ ///
+ unsigned getDarwinDirective() const { return DarwinDirective; }
+
+ /// getInstrItineraryData - Return the instruction itineraries based on
+ /// subtarget selection.
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ const PPCFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const PPCTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const PPCRegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
+ }
+ const PPCTargetMachine &getTargetMachine() const { return TM; }
+
+ /// initializeSubtargetDependencies - Initializes using a CPU and feature string
+ /// so that we can use initializer lists for subtarget initialization.
+ PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+private:
+ void initializeEnvironment();
+ void initSubtargetFeatures(StringRef CPU, StringRef FS);
+
+public:
+ /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
+ ///
+ bool isPPC64() const;
+
+ /// has64BitSupport - Return true if the selected CPU supports 64-bit
+ /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
+ bool has64BitSupport() const { return Has64BitSupport; }
+ // useSoftFloat - Return true if soft-float option is turned on.
+ bool useSoftFloat() const { return !HasHardFloat; }
+
+ /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
+ /// registers in 32-bit mode when possible. This can only be true if
+ /// has64BitSupport() returns true.
+ bool use64BitRegs() const { return Use64BitRegs; }
+
+ /// useCRBits - Return true if we should store and manipulate i1 values in
+ /// the individual condition register bits.
+ bool useCRBits() const { return UseCRBits; }
+
+ /// hasLazyResolverStub - Return true if accesses to the specified global have
+ /// to go through a dyld lazy resolution stub. This means that an extra load
+ /// is required to get the address of the global.
+ bool hasLazyResolverStub(const GlobalValue *GV) const;
+
+ // isLittleEndian - True if generating little-endian code
+ bool isLittleEndian() const { return IsLittleEndian; }
+
+ // Specific obvious features.
+ bool hasFCPSGN() const { return HasFCPSGN; }
+ bool hasFSQRT() const { return HasFSQRT; }
+ bool hasFRE() const { return HasFRE; }
+ bool hasFRES() const { return HasFRES; }
+ bool hasFRSQRTE() const { return HasFRSQRTE; }
+ bool hasFRSQRTES() const { return HasFRSQRTES; }
+ bool hasRecipPrec() const { return HasRecipPrec; }
+ bool hasSTFIWX() const { return HasSTFIWX; }
+ bool hasLFIWAX() const { return HasLFIWAX; }
+ bool hasFPRND() const { return HasFPRND; }
+ bool hasFPCVT() const { return HasFPCVT; }
+ bool hasAltivec() const { return HasAltivec; }
+ bool hasSPE() const { return HasSPE; }
+ bool hasQPX() const { return HasQPX; }
+ bool hasVSX() const { return HasVSX; }
+ bool hasP8Vector() const { return HasP8Vector; }
+ bool hasP8Altivec() const { return HasP8Altivec; }
+ bool hasP8Crypto() const { return HasP8Crypto; }
+ bool hasP9Vector() const { return HasP9Vector; }
+ bool hasP9Altivec() const { return HasP9Altivec; }
+ bool hasMFOCRF() const { return HasMFOCRF; }
+ bool hasISEL() const { return HasISEL; }
+ bool hasBPERMD() const { return HasBPERMD; }
+ bool hasExtDiv() const { return HasExtDiv; }
+ bool hasCMPB() const { return HasCMPB; }
+ bool hasLDBRX() const { return HasLDBRX; }
+ bool isBookE() const { return IsBookE; }
+ bool hasOnlyMSYNC() const { return HasOnlyMSYNC; }
+ bool isPPC4xx() const { return IsPPC4xx; }
+ bool isPPC6xx() const { return IsPPC6xx; }
+ bool isE500() const { return IsE500; }
+ bool isFeatureMFTB() const { return FeatureMFTB; }
+ bool isDeprecatedDST() const { return DeprecatedDST; }
+ bool hasICBT() const { return HasICBT; }
+ bool hasInvariantFunctionDescriptors() const {
+ return HasInvariantFunctionDescriptors;
+ }
+ bool hasPartwordAtomics() const { return HasPartwordAtomics; }
+ bool hasDirectMove() const { return HasDirectMove; }
+
+ bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; }
+ unsigned getPlatformStackAlignment() const {
+ if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned())
+ return 32;
+
+ return 16;
+ }
+ bool hasHTM() const { return HasHTM; }
+ bool hasFusion() const { return HasFusion; }
+ bool hasFloat128() const { return HasFloat128; }
+ bool isISA3_0() const { return IsISA3_0; }
+ bool useLongCalls() const { return UseLongCalls; }
+ bool needsSwapsForVSXMemOps() const {
+ return hasVSX() && isLittleEndian() && !hasP9Vector();
+ }
+
+ POPCNTDKind hasPOPCNTD() const { return HasPOPCNTD; }
+
+ const Triple &getTargetTriple() const { return TargetTriple; }
+
+ /// isDarwin - True if this is any darwin platform.
+ bool isDarwin() const { return TargetTriple.isMacOSX(); }
+ /// isBGQ - True if this is a BG/Q platform.
+ bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+ bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+
+ bool isDarwinABI() const { return isTargetMachO() || isDarwin(); }
+ bool isSVR4ABI() const { return !isDarwinABI(); }
+ bool isELFv2ABI() const;
+
+ bool enableEarlyIfConversion() const override { return hasISEL(); }
+
+ // Scheduling customization.
+ bool enableMachineScheduler() const override;
+ // This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+ bool enablePostRAScheduler() const override;
+ AntiDepBreakMode getAntiDepBreakMode() const override;
+ void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
+
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const override;
+ bool useAA() const override;
+
+ bool enableSubRegLiveness() const override;
+
+ /// classifyGlobalReference - Classify a global variable reference for the
+ /// current subtarget according to how we should reference it.
+ unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
new file mode 100644
index 000000000000..0c1260a2965b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -0,0 +1,174 @@
+//===---------- PPCTLSDynamicCall.cpp - TLS Dynamic Call Fixup ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands ADDItls{ld,gd}LADDR[32] machine instructions into
+// separate ADDItls[gd]L[32] and GETtlsADDR[32] instructions, both of
+// which define GPR3. A copy is added from GPR3 to the target virtual
+// register of the original instruction. The GETtlsADDR[32] is really
+// a call instruction, so its target register is constrained to be GPR3.
+// This is not true of ADDItls[gd]L[32], but there is a legacy linker
+// optimization bug that requires the target register of the addi of
+// a local- or general-dynamic TLS access sequence to be GPR3.
+//
+// This is done in a late pass so that TLS variable accesses can be
+// fully commoned by MachineCSE.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-tls-dynamic-call"
+
+namespace llvm {
+ void initializePPCTLSDynamicCallPass(PassRegistry&);
+}
+
+namespace {
+ struct PPCTLSDynamicCall : public MachineFunctionPass {
+ static char ID;
+ PPCTLSDynamicCall() : MachineFunctionPass(ID) {
+ initializePPCTLSDynamicCallPass(*PassRegistry::getPassRegistry());
+ }
+
+ const PPCInstrInfo *TII;
+ LiveIntervals *LIS;
+
+protected:
+ bool processBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64();
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+ I != IE;) {
+ MachineInstr &MI = *I;
+
+ if (MI.getOpcode() != PPC::ADDItlsgdLADDR &&
+ MI.getOpcode() != PPC::ADDItlsldLADDR &&
+ MI.getOpcode() != PPC::ADDItlsgdLADDR32 &&
+ MI.getOpcode() != PPC::ADDItlsldLADDR32) {
+ ++I;
+ continue;
+ }
+
+ DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI);
+
+ unsigned OutReg = MI.getOperand(0).getReg();
+ unsigned InReg = MI.getOperand(1).getReg();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
+ unsigned Opc1, Opc2;
+ const unsigned OrigRegs[] = {OutReg, InReg, GPR3};
+
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Opcode inconsistency error");
+ case PPC::ADDItlsgdLADDR:
+ Opc1 = PPC::ADDItlsgdL;
+ Opc2 = PPC::GETtlsADDR;
+ break;
+ case PPC::ADDItlsldLADDR:
+ Opc1 = PPC::ADDItlsldL;
+ Opc2 = PPC::GETtlsldADDR;
+ break;
+ case PPC::ADDItlsgdLADDR32:
+ Opc1 = PPC::ADDItlsgdL32;
+ Opc2 = PPC::GETtlsADDR32;
+ break;
+ case PPC::ADDItlsldLADDR32:
+ Opc1 = PPC::ADDItlsldL32;
+ Opc2 = PPC::GETtlsldADDR32;
+ break;
+ }
+
+ // Don't really need to save data to the stack - the clobbered
+ // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
+ // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR).
+ BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0);
+
+ // Expand into two ops built prior to the existing instruction.
+ MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3)
+ .addReg(InReg);
+ Addi->addOperand(MI.getOperand(2));
+
+ // The ADDItls* instruction is the first instruction in the
+ // repair range.
+ MachineBasicBlock::iterator First = I;
+ --First;
+
+ MachineInstr *Call = (BuildMI(MBB, I, DL, TII->get(Opc2), GPR3)
+ .addReg(GPR3));
+ Call->addOperand(MI.getOperand(3));
+
+ BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0);
+
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg)
+ .addReg(GPR3);
+
+ // The COPY is the last instruction in the repair range.
+ MachineBasicBlock::iterator Last = I;
+ --Last;
+
+ // Move past the original instruction and remove it.
+ ++I;
+ MI.removeFromParent();
+
+ // Repair the live intervals.
+ LIS->repairIntervalsInRange(&MBB, First, Last, OrigRegs);
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+
+ bool Changed = false;
+
+ for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+ MachineBasicBlock &B = *I++;
+ if (processBlock(B))
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+INITIALIZE_PASS_BEGIN(PPCTLSDynamicCall, DEBUG_TYPE,
+ "PowerPC TLS Dynamic Call Fixup", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(PPCTLSDynamicCall, DEBUG_TYPE,
+ "PowerPC TLS Dynamic Call Fixup", false, false)
+
+char PPCTLSDynamicCall::ID = 0;
+FunctionPass*
+llvm::createPPCTLSDynamicCallPass() { return new PPCTLSDynamicCall(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
new file mode 100644
index 000000000000..7c53a5601790
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
@@ -0,0 +1,155 @@
+//===-- PPCTOCRegDeps.cpp - Add Extra TOC Register Dependencies -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When resolving an address using the ELF ABI TOC pointer, two relocations are
+// generally required: one for the high part and one for the low part. Only
+// the high part generally explicitly depends on r2 (the TOC pointer). And, so,
+// we might produce code like this:
+//
+// .Ltmp526:
+// addis 3, 2, .LC12@toc@ha
+// .Ltmp1628:
+// std 2, 40(1)
+// ld 5, 0(27)
+// ld 2, 8(27)
+// ld 11, 16(27)
+// ld 3, .LC12@toc@l(3)
+// rldicl 4, 4, 0, 32
+// mtctr 5
+// bctrl
+// ld 2, 40(1)
+//
+// And there is nothing wrong with this code, as such, but there is a linker bug
+// in binutils (https://sourceware.org/bugzilla/show_bug.cgi?id=18414) that will
+// misoptimize this code sequence to this:
+// nop
+// std r2,40(r1)
+// ld r5,0(r27)
+// ld r2,8(r27)
+// ld r11,16(r27)
+// ld r3,-32472(r2)
+// clrldi r4,r4,32
+// mtctr r5
+// bctrl
+// ld r2,40(r1)
+// because the linker does not know (and does not check) that the value in r2
+// changed in between the instruction using the .LC12@toc@ha (TOC-relative)
+// relocation and the instruction using the .LC12@toc@l(3) relocation.
+// Because it finds these instructions using the relocations (and not by
+// scanning the instructions), it has been asserted that there is no good way
+// to detect the change of r2 in between. As a result, this bug may never be
+// fixed (i.e. it may become part of the definition of the ABI). GCC was
+// updated to add extra dependencies on r2 to instructions using the @toc@l
+// relocations to avoid this problem, and we'll do the same here.
+//
+// This is done as a separate pass because:
+// 1. These extra r2 dependencies are not really properties of the
+// instructions, but rather due to a linker bug, and maybe one day we'll be
+// able to get rid of them when targeting linkers without this bug (and,
+// thus, keeping the logic centralized here will make that
+// straightforward).
+// 2. There are ISel-level peephole optimizations that propagate the @toc@l
+// relocations to some user instructions, and so the extra dependencies do
+// not apply only to a fixed set of instructions (without undesirable
+// definition replication).
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-toc-reg-deps"
+
+namespace llvm {
+ void initializePPCTOCRegDepsPass(PassRegistry&);
+}
+
+namespace {
+ // PPCTOCRegDeps pass - Add an implicit use of the TOC pointer register (X2)
+ // to every instruction that uses a @toc@l relocation, working around the
+ // linker bug described at the top of this file.
+ struct PPCTOCRegDeps : public MachineFunctionPass {
+ static char ID;
+ PPCTOCRegDeps() : MachineFunctionPass(ID) {
+ initializePPCTOCRegDepsPass(*PassRegistry::getPassRegistry());
+ }
+
+protected:
+ bool hasTOCLoReloc(const MachineInstr &MI) {
+ if (MI.getOpcode() == PPC::LDtocL ||
+ MI.getOpcode() == PPC::ADDItocL)
+ return true;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if ((MO.getTargetFlags() & PPCII::MO_ACCESS_MASK) == PPCII::MO_TOC_LO)
+ return true;
+ }
+
+ return false;
+ }
+
+ bool processBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+
+ for (auto &MI : MBB) {
+ if (!hasTOCLoReloc(MI))
+ continue;
+
+ MI.addOperand(MachineOperand::CreateReg(PPC::X2,
+ false /*IsDef*/,
+ true /*IsImp*/));
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ bool Changed = false;
+
+ for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+ MachineBasicBlock &B = *I++;
+ if (processBlock(B))
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+INITIALIZE_PASS(PPCTOCRegDeps, DEBUG_TYPE,
+ "PowerPC TOC Register Dependencies", false, false)
+
+char PPCTOCRegDeps::ID = 0;
+FunctionPass*
+llvm::createPPCTOCRegDepsPass() { return new PPCTOCRegDeps(); }
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
new file mode 100644
index 000000000000..91b1d24b2e41
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -0,0 +1,429 @@
+//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetMachine.h"
+#include "PPC.h"
+#include "PPCTargetObjectFile.h"
+#include "PPCTargetTransformInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::
+opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
+ cl::desc("Disable CTR loops for PPC"));
+
+static cl::
+opt<bool> DisablePreIncPrep("disable-ppc-preinc-prep", cl::Hidden,
+ cl::desc("Disable PPC loop preinc prep"));
+
+static cl::opt<bool>
+VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early",
+ cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early"));
+
+static cl::
+opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
+ cl::desc("Disable VSX Swap Removal for PPC"));
+
+static cl::
+opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden,
+ cl::desc("Disable QPX load splat simplification"));
+
+static cl::
+opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
+ cl::desc("Disable machine peepholes for PPC"));
+
+static cl::opt<bool>
+EnableGEPOpt("ppc-gep-opt", cl::Hidden,
+ cl::desc("Enable optimizations on complex GEPs"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnablePrefetch("enable-ppc-prefetching",
+ cl::desc("enable software prefetching on PPC"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+EnableExtraTOCRegDeps("enable-ppc-extra-toc-reg-deps",
+ cl::desc("Add extra TOC register dependencies"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableMachineCombinerPass("ppc-machine-combiner",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
+
+extern "C" void LLVMInitializePowerPCTarget() {
+ // Register the targets
+ RegisterTargetMachine<PPC32TargetMachine> A(getThePPC32Target());
+ RegisterTargetMachine<PPC64TargetMachine> B(getThePPC64Target());
+ RegisterTargetMachine<PPC64TargetMachine> C(getThePPC64LETarget());
+
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializePPCBoolRetToIntPass(PR);
+}
+
+/// Return the datalayout string of a subtarget.
+static std::string getDataLayoutString(const Triple &T) {
+ bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
+ std::string Ret;
+
+ // Most PPC* platforms are big endian, PPC64LE is little endian.
+ if (T.getArch() == Triple::ppc64le)
+ Ret = "e";
+ else
+ Ret = "E";
+
+ Ret += DataLayout::getManglingComponent(T);
+
+ // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
+ // pointers.
+ if (!is64Bit || T.getOS() == Triple::Lv2)
+ Ret += "-p:32:32";
+
+ // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+ // documentation are wrong; these are correct (i.e. "what gcc does").
+ if (is64Bit || !T.isOSDarwin())
+ Ret += "-i64:64";
+ else
+ Ret += "-f64:32:64";
+
+ // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
+ if (is64Bit)
+ Ret += "-n32:64";
+ else
+ Ret += "-n32";
+
+ return Ret;
+}
+
+static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
+ const Triple &TT) {
+ std::string FullFS = FS;
+
+ // Make sure 64-bit features are available when CPUname is generic
+ if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) {
+ if (!FullFS.empty())
+ FullFS = "+64bit," + FullFS;
+ else
+ FullFS = "+64bit";
+ }
+
+ if (OL >= CodeGenOpt::Default) {
+ if (!FullFS.empty())
+ FullFS = "+crbits," + FullFS;
+ else
+ FullFS = "+crbits";
+ }
+
+ if (OL != CodeGenOpt::None) {
+ if (!FullFS.empty())
+ FullFS = "+invariant-function-descriptors," + FullFS;
+ else
+ FullFS = "+invariant-function-descriptors";
+ }
+
+ return FullFS;
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ // If it isn't a Mach-O file then it's going to be a linux ELF
+ // object file.
+ if (TT.isOSDarwin())
+ return make_unique<TargetLoweringObjectFileMachO>();
+
+ return make_unique<PPC64LinuxTargetObjectFile>();
+}
+
+static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
+ const TargetOptions &Options) {
+ if (Options.MCOptions.getABIName().startswith("elfv1"))
+ return PPCTargetMachine::PPC_ABI_ELFv1;
+ else if (Options.MCOptions.getABIName().startswith("elfv2"))
+ return PPCTargetMachine::PPC_ABI_ELFv2;
+
+ assert(Options.MCOptions.getABIName().empty() &&
+ "Unknown target-abi option!");
+
+ if (!TT.isMacOSX()) {
+ switch (TT.getArch()) {
+ case Triple::ppc64le:
+ return PPCTargetMachine::PPC_ABI_ELFv2;
+ case Triple::ppc64:
+ return PPCTargetMachine::PPC_ABI_ELFv1;
+ default:
+ // Fallthrough.
+ ;
+ }
+ }
+ return PPCTargetMachine::PPC_ABI_UNKNOWN;
+}
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ Optional<Reloc::Model> RM) {
+ if (!RM.hasValue()) {
+ if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) {
+ if (!TT.isOSBinFormatMachO() && !TT.isMacOSX())
+ return Reloc::PIC_;
+ }
+ if (TT.isOSDarwin())
+ return Reloc::DynamicNoPIC;
+ return Reloc::Static;
+ }
+ return *RM;
+}
+
+// The FeatureString here is a little subtle. We are modifying the feature
+// string with what are (currently) non-function specific overrides as it goes
+// into the LLVMTargetMachine constructor and then using the stored value in the
+// Subtarget constructor below it.
+PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
+ computeFSAdditions(FS, OL, TT), Options,
+ getEffectiveRelocModel(TT, RM), CM, OL),
+ TLOF(createTLOF(getTargetTriple())),
+ TargetABI(computeTargetABI(TT, Options)),
+ Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
+
+ initAsmInfo();
+}
+
+PPCTargetMachine::~PPCTargetMachine() {}
+
+void PPC32TargetMachine::anchor() { }
+
+PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+void PPC64TargetMachine::anchor() { }
+
+PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const PPCSubtarget *
+PPCTargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString().str()
+ : TargetCPU;
+ std::string FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString().str()
+ : TargetFS;
+
+ // FIXME: This is related to the code below to reset the target options,
+ // we need to know whether or not the soft float flag is set on the
+ // function before we can generate a subtarget. We also need to use
+ // it as a key for the subtarget since that can be the only difference
+ // between two functions.
+ bool SoftFloat =
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ // If the soft float attribute is set on the function turn on the soft float
+ // subtarget feature.
+ if (SoftFloat)
+ FS += FS.empty() ? "-hard-float" : ",-hard-float";
+
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<PPCSubtarget>(
+ TargetTriple, CPU,
+ // FIXME: It would be good to have the subtarget additions here
+ // not necessary. Anything that turns them on/off (overrides) ends
+ // up being put at the end of the feature string, but the defaults
+ // shouldn't require adding them. Fixing this means pulling Feature64Bit
+ // out of most of the target cpus in the .td file and making it set only
+ // as part of initialization via the TargetTriple.
+ computeFSAdditions(FS, getOptLevel(), getTargetTriple()), *this);
+ }
+ return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// PPC Code Generator Pass Configuration Options.
+class PPCPassConfig : public TargetPassConfig {
+public:
+ PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ PPCTargetMachine &getPPCTargetMachine() const {
+ return getTM<PPCTargetMachine>();
+ }
+
+ void addIRPasses() override;
+ bool addPreISel() override;
+ bool addILPOpts() override;
+ bool addInstSelector() override;
+ void addMachineSSAOptimization() override;
+ void addPreRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new PPCPassConfig(this, PM);
+}
+
+void PPCPassConfig::addIRPasses() {
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCBoolRetToIntPass());
+ addPass(createAtomicExpandPass(&getPPCTargetMachine()));
+
+ // For the BG/Q (or if explicitly requested), add explicit data prefetch
+ // intrinsics.
+ bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ &&
+ getOptLevel() != CodeGenOpt::None;
+ if (EnablePrefetch.getNumOccurrences() > 0)
+ UsePrefetching = EnablePrefetch;
+ if (UsePrefetching)
+ addPass(createLoopDataPrefetchPass());
+
+ if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) {
+ // Call SeparateConstOffsetFromGEP pass to extract constants within indices
+ // and lower a GEP with multiple indices to either arithmetic operations or
+ // multiple GEPs with single index.
+ addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+ // Call EarlyCSE pass to find and remove subexpressions in the lowered
+ // result.
+ addPass(createEarlyCSEPass());
+ // Do loop invariant code motion in case part of the lowered result is
+ // invariant.
+ addPass(createLICMPass());
+ }
+
+ TargetPassConfig::addIRPasses();
+}
+
+bool PPCPassConfig::addPreISel() {
+ if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
+
+ if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCCTRLoops(getPPCTargetMachine()));
+
+ return false;
+}
+
+bool PPCPassConfig::addILPOpts() {
+ addPass(&EarlyIfConverterID);
+
+ if (EnableMachineCombinerPass)
+ addPass(&MachineCombinerID);
+
+ return true;
+}
+
+bool PPCPassConfig::addInstSelector() {
+ // Install an instruction selector.
+ addPass(createPPCISelDag(getPPCTargetMachine()));
+
+#ifndef NDEBUG
+ if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCCTRLoopsVerify());
+#endif
+
+ addPass(createPPCVSXCopyPass());
+ return false;
+}
+
+void PPCPassConfig::addMachineSSAOptimization() {
+ TargetPassConfig::addMachineSSAOptimization();
+ // For little endian, remove where possible the vector swap instructions
+ // introduced at code generation to normalize vector element order.
+ if (TM->getTargetTriple().getArch() == Triple::ppc64le &&
+ !DisableVSXSwapRemoval)
+ addPass(createPPCVSXSwapRemovalPass());
+ // Target-specific peephole cleanups performed after instruction
+ // selection.
+ if (!DisableMIPeephole) {
+ addPass(createPPCMIPeepholePass());
+ addPass(&DeadMachineInstructionElimID);
+ }
+}
+
+void PPCPassConfig::addPreRegAlloc() {
+ if (getOptLevel() != CodeGenOpt::None) {
+ initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+ insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
+ &PPCVSXFMAMutateID);
+ }
+
+ // FIXME: We probably don't need to run these for -fPIE.
+ if (getPPCTargetMachine().isPositionIndependent()) {
+ // FIXME: LiveVariables should not be necessary here!
+ // PPCTLSDynamicCallPass uses LiveIntervals which previously depended on
+ // LiveVariables. This (unnecessary) dependency has been removed now,
+ // however a stage-2 clang build fails without LiveVariables computed here.
+ addPass(&LiveVariablesID, false);
+ addPass(createPPCTLSDynamicCallPass());
+ }
+ if (EnableExtraTOCRegDeps)
+ addPass(createPPCTOCRegDepsPass());
+}
+
+void PPCPassConfig::addPreSched2() {
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(&IfConverterID);
+
+ // This optimization must happen after anything that might do store-to-load
+ // forwarding. Here we're after RA (and, thus, when spills are inserted)
+ // but before post-RA scheduling.
+ if (!DisableQPXLoadSplat)
+ addPass(createPPCQPXLoadSplatPass());
+ }
+}
+
+void PPCPassConfig::addPreEmitPass() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCEarlyReturnPass(), false);
+ // Must run branch selection immediately preceding the asm printer.
+ addPass(createPPCBranchSelectionPass(), false);
+}
+
+TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(PPCTTIImpl(this, F));
+ });
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
new file mode 100644
index 000000000000..59b4f1e30c0e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
@@ -0,0 +1,85 @@
+//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETMACHINE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETMACHINE_H
+
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// Common code between 32-bit and 64-bit PowerPC targets.
+///
+class PPCTargetMachine : public LLVMTargetMachine {
+public:
+  /// ABI variants this target machine distinguishes between.
+  enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 };
+private:
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  PPCABI TargetABI;
+  PPCSubtarget Subtarget;
+
+  mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap;
+
+public:
+  PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                   StringRef FS, const TargetOptions &Options,
+                   Optional<Reloc::Model> RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL);
+
+  ~PPCTargetMachine() override;
+
+  const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  /// Returns the object-file lowering owned by this target machine.
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  /// Returns true when the recorded target ABI is ELFv2.
+  bool isELFv2ABI() const { return TargetABI == PPC_ABI_ELFv2; }
+
+  /// Returns true when the target triple's architecture is ppc64 or ppc64le.
+  bool isPPC64() const {
+    const Triple &TT = getTargetTriple();
+    return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
+  }
+};
+
+/// PowerPC 32-bit target machine.
+///
+class PPC32TargetMachine : public PPCTargetMachine {
+ virtual void anchor();
+public:
+ PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+/// PowerPC 64-bit target machine.
+///
+class PPC64TargetMachine : public PPCTargetMachine {
+ virtual void anchor();
+public:
+ PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
new file mode 100644
index 000000000000..a049dc3fda93
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -0,0 +1,59 @@
+//===-- PPCTargetObjectFile.cpp - PPC Object Info -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetObjectFile.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+void
+PPC64LinuxTargetObjectFile::
+Initialize(MCContext &Ctx, const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ // Here we override ReadOnlySection to DataRelROSection for PPC64 SVR4 ABI
+ // when we have a constant that contains global relocations. This is
+ // necessary because of this ABI's handling of pointers to functions in
+ // a shared library. The address of a function is actually the address
+ // of a function descriptor, which resides in the .opd section. Generated
+ // code uses the descriptor directly rather than going via the GOT as some
+ // other ABIs do, which means that initialized function pointers must
+ // reference the descriptor. The linker must convert copy relocs of
+ // pointers to functions in shared libraries into dynamic relocations,
+ // because of an ordering problem with initialization of copy relocs and
+ // PLT entries. The dynamic relocation will be initialized by the dynamic
+ // linker, so we must use DataRelROSection instead of ReadOnlySection.
+ // For more information, see the description of ELIMINATE_COPY_RELOCS in
+ // GNU ld.
+ if (Kind.isReadOnly()) {
+ const auto *GVar = dyn_cast<GlobalVariable>(GO);
+
+ if (GVar && GVar->isConstant() && GVar->getInitializer()->needsRelocation())
+ Kind = SectionKind::getReadOnlyWithRel();
+ }
+
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+const MCExpr *PPC64LinuxTargetObjectFile::
+getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPREL, getContext());
+ return MCBinaryExpr::createAdd(Expr,
+ MCConstantExpr::create(0x8000, getContext()),
+ getContext());
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h
new file mode 100644
index 000000000000..c8b9b2e9790b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -0,0 +1,34 @@
+//===-- PPCTargetObjectFile.h - PPC Object Info -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+ /// PPC64LinuxTargetObjectFile - This implementation is used for
+ /// 64-bit PowerPC Linux.
+ class PPC64LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
+
+ /// \brief Describe a TLS variable address within debug info.
+ const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
new file mode 100644
index 000000000000..dbe7617d3542
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -0,0 +1,27 @@
+//===-- PPCTargetStreamer.h - PPC Target Streamer ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class PPCTargetStreamer : public MCTargetStreamer {
+public:
+ PPCTargetStreamer(MCStreamer &S);
+ ~PPCTargetStreamer() override;
+ virtual void emitTCEntry(const MCSymbol &S) = 0;
+ virtual void emitMachine(StringRef CPU) = 0;
+ virtual void emitAbiVersion(int AbiVersion) = 0;
+ virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
new file mode 100644
index 000000000000..f7785342b364
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -0,0 +1,443 @@
+//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppctti"
+
+static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
+cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
+
+// This is currently only used for the data prefetch pass which is only enabled
+// for BG/Q by default.
+static cl::opt<unsigned>
+CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
+ cl::desc("The loop prefetch cache line size"));
+
+//===----------------------------------------------------------------------===//
+//
+// PPC cost model.
+//
+//===----------------------------------------------------------------------===//
+
+TargetTransformInfo::PopcntSupportKind
+PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+
+  // Population count falls back to software when the subtarget reports no
+  // POPCNTD support or when the type is wider than 64 bits.
+  if (ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Unavailable || TyWidth > 64)
+    return TTI::PSK_Software;
+
+  if (ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow)
+    return TTI::PSK_SlowHardware;
+  return TTI::PSK_FastHardware;
+}
+
+int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+  // With PPC-specific constant hoisting disabled, defer to the base class.
+  if (DisablePPCConstHoist)
+    return BaseT::getIntImmCost(Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  if (Imm == 0)
+    return TTI::TCC_Free;
+
+  // Immediates wider than 64 bits always get the most expensive estimate.
+  if (Imm.getBitWidth() > 64)
+    return 4 * TTI::TCC_Basic;
+
+  const int64_t SignedValue = Imm.getSExtValue();
+
+  // Fits in a signed 16-bit immediate field.
+  if (isInt<16>(SignedValue))
+    return TTI::TCC_Basic;
+
+  // Does not fit in 32 signed bits either.
+  if (!isInt<32>(SignedValue))
+    return 4 * TTI::TCC_Basic;
+
+  // A 32-bit constant whose low half is zero can be materialized using lis
+  // alone; otherwise charge for two instructions.
+  const bool LowHalfIsZero = (Imm.getZExtValue() & 0xFFFF) == 0;
+  if (LowHalfIsZero)
+    return TTI::TCC_Basic;
+  return 2 * TTI::TCC_Basic;
+}
+
+int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
+ if (DisablePPCConstHoist)
+ return BaseT::getIntImmCost(IID, Idx, Imm, Ty);
+
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+ switch (IID) {
+ default:
+ return TTI::TCC_Free;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
+ return TTI::TCC_Free;
+ break;
+ case Intrinsic::experimental_stackmap:
+ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TTI::TCC_Free;
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TTI::TCC_Free;
+ break;
+ }
+ return PPCTTIImpl::getIntImmCost(Imm, Ty);
+}
+
+int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
+ if (DisablePPCConstHoist)
+ return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);
+
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+ unsigned ImmIdx = ~0U;
+ bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
+ ZeroFree = false;
+ switch (Opcode) {
+ default:
+ return TTI::TCC_Free;
+ case Instruction::GetElementPtr:
+ // Always hoist the base address of a GetElementPtr. This prevents the
+ // creation of new constants for every base constant that gets constant
+ // folded with the offset.
+ if (Idx == 0)
+ return 2 * TTI::TCC_Basic;
+ return TTI::TCC_Free;
+ case Instruction::And:
+ RunFree = true; // (for the rotate-and-mask instructions)
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::Or:
+ case Instruction::Xor:
+ ShiftedFree = true;
+ LLVM_FALLTHROUGH;
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ ImmIdx = 1;
+ break;
+ case Instruction::ICmp:
+ UnsignedFree = true;
+ ImmIdx = 1;
+ // Zero comparisons can use record-form instructions.
+ LLVM_FALLTHROUGH;
+ case Instruction::Select:
+ ZeroFree = true;
+ break;
+ case Instruction::PHI:
+ case Instruction::Call:
+ case Instruction::Ret:
+ case Instruction::Load:
+ case Instruction::Store:
+ break;
+ }
+
+ if (ZeroFree && Imm == 0)
+ return TTI::TCC_Free;
+
+ if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
+ if (isInt<16>(Imm.getSExtValue()))
+ return TTI::TCC_Free;
+
+ if (RunFree) {
+ if (Imm.getBitWidth() <= 32 &&
+ (isShiftedMask_32(Imm.getZExtValue()) ||
+ isShiftedMask_32(~Imm.getZExtValue())))
+ return TTI::TCC_Free;
+
+ if (ST->isPPC64() &&
+ (isShiftedMask_64(Imm.getZExtValue()) ||
+ isShiftedMask_64(~Imm.getZExtValue())))
+ return TTI::TCC_Free;
+ }
+
+ if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
+ return TTI::TCC_Free;
+
+ if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
+ return TTI::TCC_Free;
+ }
+
+ return PPCTTIImpl::getIntImmCost(Imm, Ty);
+}
+
+void PPCTTIImpl::getUnrollingPreferences(Loop *L,
+                                         TTI::UnrollingPreferences &UP) {
+  const bool IsA2 = ST->getDarwinDirective() == PPC::DIR_A2;
+  if (IsA2) {
+    // The A2 is in-order with a deep pipeline, and concatenation unrolling
+    // helps expose latency-hiding opportunities to the instruction scheduler.
+    UP.Runtime = true;
+    UP.Partial = true;
+
+    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
+    // often outweigh the cost of a division to compute the trip count.
+    UP.AllowExpensiveTripCount = true;
+  }
+
+  // Let the base implementation adjust the preferences as well.
+  BaseT::getUnrollingPreferences(L, UP);
+}
+
+bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
+  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
+  // on combining the loads generated for consecutive accesses, and failure to
+  // do so is particularly expensive. This makes it much more likely (compared
+  // to only using concatenation unrolling). Every other directive simply
+  // follows the caller's reduction hint.
+  const bool IsA2 = ST->getDarwinDirective() == PPC::DIR_A2;
+  return IsA2 || LoopHasReductions;
+}
+
+bool PPCTTIImpl::enableInterleavedAccessVectorization() {
+ return true;
+}
+
+unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
+  // No vector registers are reported unless Altivec or QPX is available.
+  if (Vector) {
+    const bool HasVectorUnit = ST->hasAltivec() || ST->hasQPX();
+    if (!HasVectorUnit)
+      return 0;
+  }
+
+  if (ST->hasVSX())
+    return 64;
+  return 32;
+}
+
+unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
+  // Scalar registers: width follows the 32-/64-bit mode of the subtarget.
+  if (!Vector)
+    return ST->isPPC64() ? 64 : 32;
+
+  // Vector registers: QPX takes precedence over Altivec; 0 when neither
+  // feature is present.
+  if (ST->hasQPX())
+    return 256;
+  return ST->hasAltivec() ? 128 : 0;
+}
+
+unsigned PPCTTIImpl::getCacheLineSize() {
+ // This is currently only used for the data prefetch pass which is only
+ // enabled for BG/Q by default.
+ return CacheLineSize;
+}
+
+unsigned PPCTTIImpl::getPrefetchDistance() {
+ // This seems like a reasonable default for the BG/Q (this pass is enabled, by
+ // default, only on the BG/Q).
+ return 300;
+}
+
+unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  switch (ST->getDarwinDirective()) {
+  case PPC::DIR_440:
+    // The 440 has no SIMD support, but floating-point instructions
+    // have a 5-cycle latency, so unroll by 5x for latency hiding.
+    return 5;
+
+  case PPC::DIR_A2:
+    // The A2 has no SIMD support, but floating-point instructions
+    // have a 6-cycle latency, so unroll by 6x for latency hiding.
+    return 6;
+
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+    // FIXME: For lack of any better information, do no harm...
+    return 1;
+
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8:
+  case PPC::DIR_PWR9:
+    // For P7 and P8, floating-point instructions have a 6-cycle latency and
+    // there are two execution units, so unroll by 12x for latency hiding.
+    // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+    return 12;
+
+  default:
+    // For most things, modern systems have two execution units (and
+    // out-of-order execution).
+    return 2;
+  }
+}
+
+int PPCTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+ TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) {
+ assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+ // Fallback to the default implementation.
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo);
+}
+
+int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
+ // (at least in the sense that there need only be one non-loop-invariant
+ // instruction). We need one such shuffle instruction for each actual
+ // register (this is not true for arbitrary shuffles, but is true for the
+ // structured types of shuffles covered by TTI::ShuffleKind).
+ return LT.first;
+}
+
+int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+ assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+ return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
+int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  const int ISDOpc = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISDOpc && "Invalid opcode");
+
+  // With VSX, double-precision scalars are already located in index #0; with
+  // QPX the same holds for all floating-point scalars. Accessing that element
+  // is free, and any other index gets the generic cost with no penalty.
+  Type *EltTy = Val->getScalarType();
+  const bool ScalarLivesInElt0 =
+      (ST->hasVSX() && EltTy->isDoubleTy()) ||
+      (ST->hasQPX() && EltTy->isFloatingPointTy());
+  if (ScalarLivesInElt0)
+    return Index == 0 ? 0 : BaseT::getVectorInstrCost(Opcode, Val, Index);
+
+  // Estimated cost of a load-hit-store delay. This was obtained
+  // experimentally as a minimum needed to prevent unprofitable
+  // vectorization for the paq8p benchmark. It may need to be
+  // raised further if other unprofitable cases remain.
+  const unsigned LHSPenalty = 2;
+  const unsigned InsertExtraPenalty = 7;
+
+  const int BaseCost = BaseT::getVectorInstrCost(Opcode, Val, Index);
+
+  // Vector element insert/extract with Altivec is very expensive,
+  // because they require store and reload with the attendant
+  // processor stall for load-hit-store. Until VSX is available,
+  // these need to be estimated as very costly.
+  switch (ISDOpc) {
+  case ISD::INSERT_VECTOR_ELT:
+    return (LHSPenalty + InsertExtraPenalty) + BaseCost;
+  case ISD::EXTRACT_VECTOR_ELT:
+    return LHSPenalty + BaseCost;
+  default:
+    return BaseCost;
+  }
+}
+
+/// Estimate the cost of a load or store (\p Opcode) of type \p Src with the
+/// given \p Alignment in \p AddressSpace.
+///
+/// The estimate starts from the generic BaseT cost and is then adjusted for
+/// accesses whose alignment is smaller than the store size of the legalized
+/// type.
+int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                unsigned AddressSpace) {
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+
+  bool IsAltivecType = ST->hasAltivec() &&
+                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
+                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
+  bool IsVSXType = ST->hasVSX() &&
+                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
+  bool IsQPXType = ST->hasQPX() &&
+                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
+
+  // VSX has 32b/64b load instructions. Legalization can handle loading of
+  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
+  // PPCTargetLowering can't compute the cost appropriately. So here we
+  // explicitly check this case.
+  // Note: this is a size in bits (it is compared against 64 and 32 below).
+  unsigned MemBits = Src->getPrimitiveSizeInBits();
+  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
+      (MemBits == 64 || (ST->hasP8Vector() && MemBits == 32)))
+    return 1;
+
+  // Aligned loads and stores are easy.
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
+    return Cost;
+
+  // If we can use the permutation-based load sequence, then this is also
+  // relatively cheap (not counting loop-invariant instructions): one load plus
+  // one permute (the last load in a series has extra cost, but we're
+  // neglecting that here). Note that on the P7, we could do unaligned loads
+  // for Altivec types using the VSX instructions, but that's more expensive
+  // than using the permutation-based load sequence. On the P8, that's no
+  // longer true.
+  if (Opcode == Instruction::Load &&
+      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+      Alignment >= LT.second.getScalarType().getStoreSize())
+    return Cost + LT.first; // Add the cost of the permutations.
+
+  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
+  // P7, unaligned vector loads are more expensive than the permutation-based
+  // load sequence, so that might be used instead, but regardless, the net cost
+  // is about the same (not counting loop-invariant instructions).
+  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
+    return Cost;
+
+  // PPC in general does not support unaligned loads and stores. They'll need
+  // to be decomposed based on the alignment factor.
+
+  // Add the cost of each scalar load or store. Alignment is known to be
+  // non-zero and smaller than SrcBytes here because of the early return above.
+  Cost += LT.first * (SrcBytes / Alignment - 1);
+
+  // For a vector type, there is also scalarization overhead (only for
+  // stores, loads are expanded using the vector-load + permutation sequence,
+  // which is much less expensive).
+  if (Src->isVectorTy() && Opcode == Instruction::Store)
+    for (unsigned i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+
+  return Cost;
+}
+
+int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) &&
+         "Expect a vector type for interleaved memory op");
+
+  // Legalize the type.
+  const std::pair<int, MVT> Legalized =
+      TLI->getTypeLegalizationCost(DL, VecTy);
+
+  // Start from the cost of the plain load/store operation.
+  const int MemOpCost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
+
+  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
+  // (at least in the sense that there need only be one non-loop-invariant
+  // instruction). For each result vector, we need one shuffle per incoming
+  // vector (except that the first shuffle can take two incoming vectors
+  // because it does not need to take itself).
+  return MemOpCost + Factor * (Legalized.first - 1);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
new file mode 100644
index 000000000000..8308086ccfaa
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -0,0 +1,92 @@
+//===-- PPCTargetTransformInfo.h - PPC specific TTI -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfo::Concept conforming object specific
+/// to the PPC target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
+
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
+ typedef BasicTTIImplBase<PPCTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ const PPCSubtarget *ST;
+ const PPCTargetLowering *TLI;
+
+ const PPCSubtarget *getST() const { return ST; }
+ const PPCTargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ /// \name Scalar TTI Implementations
+ /// @{
+
+ using BaseT::getIntImmCost;
+ int getIntImmCost(const APInt &Imm, Type *Ty);
+
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
+
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ bool enableAggressiveInterleaving(bool LoopHasReductions);
+ bool enableInterleavedAccessVectorization();
+ unsigned getNumberOfRegisters(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector);
+ unsigned getCacheLineSize();
+ unsigned getPrefetchDistance();
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace);
+
+ /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
new file mode 100644
index 000000000000..3b5d8f094fd0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -0,0 +1,177 @@
+//===-------------- PPCVSXCopy.cpp - VSX Copy Legalization ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass which deals with the complexity of generating legal VSX register
+// copies to/from register classes which partially overlap with the VSX
+// register file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCHazardRecognizers.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-vsx-copy"
+
+namespace llvm {
+ void initializePPCVSXCopyPass(PassRegistry&);
+}
+
+namespace {
+ // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers
+ // (Altivec and scalar floating-point registers), we need to transform the
+ // copies into subregister copies with other restrictions.
+ struct PPCVSXCopy : public MachineFunctionPass {
+ static char ID;
+ PPCVSXCopy() : MachineFunctionPass(ID) {
+ initializePPCVSXCopyPass(*PassRegistry::getPassRegistry());
+ }
+
+ const TargetInstrInfo *TII;
+
+ bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
+ MachineRegisterInfo &MRI) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ return RC->hasSubClassEq(MRI.getRegClass(Reg));
+ } else if (RC->contains(Reg)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
+ }
+
+ bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
+ }
+
+ bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
+ }
+
+ bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI);
+ }
+
+ bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI);
+ }
+
+protected:
+ bool processBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isFullCopy())
+ continue;
+
+ MachineOperand &DstMO = MI.getOperand(0);
+ MachineOperand &SrcMO = MI.getOperand(1);
+
+ if ( IsVSReg(DstMO.getReg(), MRI) &&
+ !IsVSReg(SrcMO.getReg(), MRI)) {
+ // This is a copy *to* a VSX register from a non-VSX register.
+ Changed = true;
+
+ const TargetRegisterClass *SrcRC = &PPC::VSLRCRegClass;
+ assert((IsF8Reg(SrcMO.getReg(), MRI) ||
+ IsVSSReg(SrcMO.getReg(), MRI) ||
+ IsVSFReg(SrcMO.getReg(), MRI)) &&
+ "Unknown source for a VSX copy");
+
+ unsigned NewVReg = MRI.createVirtualRegister(SrcRC);
+ BuildMI(MBB, MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
+ .addImm(1) // add 1, not 0, because there is no implicit clearing
+ // of the high bits.
+ .addOperand(SrcMO)
+ .addImm(PPC::sub_64);
+
+ // The source of the original copy is now the new virtual register.
+ SrcMO.setReg(NewVReg);
+ } else if (!IsVSReg(DstMO.getReg(), MRI) &&
+ IsVSReg(SrcMO.getReg(), MRI)) {
+ // This is a copy *from* a VSX register to a non-VSX register.
+ Changed = true;
+
+ const TargetRegisterClass *DstRC = &PPC::VSLRCRegClass;
+ assert((IsF8Reg(DstMO.getReg(), MRI) ||
+ IsVSFReg(DstMO.getReg(), MRI) ||
+ IsVSSReg(DstMO.getReg(), MRI)) &&
+ "Unknown destination for a VSX copy");
+
+ // Copy the VSX value into a new VSX register of the correct subclass.
+ unsigned NewVReg = MRI.createVirtualRegister(DstRC);
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+ NewVReg)
+ .addOperand(SrcMO);
+
+ // Transform the original copy into a subregister extraction copy.
+ SrcMO.setReg(NewVReg);
+ SrcMO.setSubReg(PPC::sub_64);
+ }
+ }
+
+ return Changed;
+ }
+
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // If we don't have VSX on the subtarget, don't do anything.
+ const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+ if (!STI.hasVSX())
+ return false;
+ TII = STI.getInstrInfo();
+
+ bool Changed = false;
+
+ for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+ MachineBasicBlock &B = *I++;
+ if (processBlock(B))
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE,
+ "PowerPC VSX Copy Legalization", false, false)
+
+char PPCVSXCopy::ID = 0;
+FunctionPass*
+llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); }
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
new file mode 100644
index 000000000000..f6d20ced15a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -0,0 +1,398 @@
+//===--------------- PPCVSXFMAMutate.cpp - VSX FMA Mutation ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass mutates the form of VSX FMA instructions to avoid unnecessary
+// copies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Temporarily disable FMA mutation by default, since it doesn't handle
+// cross-basic-block intervals well.
+// See: http://lists.llvm.org/pipermail/llvm-dev/2016-February/095669.html
+// http://reviews.llvm.org/D17087
+static cl::opt<bool> DisableVSXFMAMutate(
+ "disable-ppc-vsx-fma-mutation",
+ cl::desc("Disable VSX FMA instruction mutation"), cl::init(true),
+ cl::Hidden);
+
+#define DEBUG_TYPE "ppc-vsx-fma-mutate"
+
+namespace llvm { namespace PPC {
+ int getAltVSXFMAOpcode(uint16_t Opcode);
+} }
+
+namespace {
+ // PPCVSXFMAMutate pass - Rewrites A-type VSX FMA instructions into their
+ // M-type form when one of the product operands is killed by the FMA, so
+ // that the copy feeding the addend can be removed.
+ struct PPCVSXFMAMutate : public MachineFunctionPass {
+ static char ID;
+ PPCVSXFMAMutate() : MachineFunctionPass(ID) {
+ initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+ }
+
+ LiveIntervals *LIS;
+ const PPCInstrInfo *TII;
+
+protected:
+ bool processBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+ for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+ I != IE; ++I) {
+ MachineInstr &MI = *I;
+
+ // The default (A-type) VSX FMA form kills the addend (it is taken from
+ // the target register, which is then updated to reflect the result of
+ // the FMA). If the instruction, however, kills one of the registers
+ // used for the product, then we can use the M-form instruction (which
+ // will take that value from the to-be-defined register).
+
+ int AltOpc = PPC::getAltVSXFMAOpcode(MI.getOpcode());
+ if (AltOpc == -1)
+ continue;
+
+ // This pass is run after register coalescing, and so we're looking for
+ // a situation like this:
+ // ...
+ // %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+ // %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+ // %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+ // ...
+ // %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
+ // %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
+ // ...
+ // Where we can eliminate the copy by changing from the A-type to the
+ // M-type instruction. Specifically, for this example, this means:
+ // %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+ // %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+ // is replaced by:
+ // %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg17, %vreg9,
+ // %RM<imp-use>; VSLRC:%vreg16,%vreg17,%vreg9
+ // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+
+ SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
+
+ VNInfo *AddendValNo =
+ LIS->getInterval(MI.getOperand(1).getReg()).Query(FMAIdx).valueIn();
+
+ // This can be null if the register is undef.
+ if (!AddendValNo)
+ continue;
+
+ MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
+
+ // The addend and this instruction must be in the same block.
+
+ if (!AddendMI || AddendMI->getParent() != MI.getParent())
+ continue;
+
+ // The addend must be a full copy within the same register class.
+
+ if (!AddendMI->isFullCopy())
+ continue;
+
+ unsigned AddendSrcReg = AddendMI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) {
+ if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
+ MRI.getRegClass(AddendSrcReg))
+ continue;
+ } else {
+ // If AddendSrcReg is a physical register, make sure the destination
+ // register class contains it.
+ if (!MRI.getRegClass(AddendMI->getOperand(0).getReg())
+ ->contains(AddendSrcReg))
+ continue;
+ }
+
+ // In theory, there could be other uses of the addend copy before this
+ // fma. We could deal with this, but that would require additional
+ // logic below and I suspect it will not occur in any relevant
+ // situations. Additionally, check whether the copy source is killed
+ // prior to the fma. In order to replace the addend here with the
+ // source of the copy, it must still be live here. We can't use
+ // interval testing for a physical register, so as long as we're
+ // walking the MIs we may as well test liveness here.
+ //
+ // FIXME: There is a case that occurs in practice, like this:
+ // %vreg9<def> = COPY %F1; VSSRC:%vreg9
+ // ...
+ // %vreg6<def> = COPY %vreg9; VSSRC:%vreg6,%vreg9
+ // %vreg7<def> = COPY %vreg9; VSSRC:%vreg7,%vreg9
+ // %vreg9<def,tied1> = XSMADDASP %vreg9<tied0>, %vreg1, %vreg4; VSSRC:
+ // %vreg6<def,tied1> = XSMADDASP %vreg6<tied0>, %vreg1, %vreg2; VSSRC:
+ // %vreg7<def,tied1> = XSMADDASP %vreg7<tied0>, %vreg1, %vreg3; VSSRC:
+ // which prevents an otherwise-profitable transformation.
+ bool OtherUsers = false, KillsAddendSrc = false;
+ for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
+ J != JE; --J) {
+ if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) {
+ OtherUsers = true;
+ break;
+ }
+ if (J->modifiesRegister(AddendSrcReg, TRI) ||
+ J->killsRegister(AddendSrcReg, TRI)) {
+ KillsAddendSrc = true;
+ break;
+ }
+ }
+
+ if (OtherUsers || KillsAddendSrc)
+ continue;
+
+
+ // The transformation doesn't work well with things like:
+ // %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
+ // unless vreg11 is also a kill, so skip when it is not,
+ // and check operand 3 to see it is also a kill to handle the case:
+ // %vreg5 = A-form-op %vreg5, %vreg5, %vreg11;
+ // where vreg5 and vreg11 are both kills. This case would be skipped
+ // otherwise.
+ unsigned OldFMAReg = MI.getOperand(0).getReg();
+
+ // Find one of the product operands that is killed by this instruction.
+ unsigned KilledProdOp = 0, OtherProdOp = 0;
+ unsigned Reg2 = MI.getOperand(2).getReg();
+ unsigned Reg3 = MI.getOperand(3).getReg();
+ if (LIS->getInterval(Reg2).Query(FMAIdx).isKill()
+ && Reg2 != OldFMAReg) {
+ KilledProdOp = 2;
+ OtherProdOp = 3;
+ } else if (LIS->getInterval(Reg3).Query(FMAIdx).isKill()
+ && Reg3 != OldFMAReg) {
+ KilledProdOp = 3;
+ OtherProdOp = 2;
+ }
+
+ // If there are no usable killed product operands, then this
+ // transformation is likely not profitable.
+ if (!KilledProdOp)
+ continue;
+
+ // If the addend copy is used only by this MI, then the addend source
+ // register is likely not live here. This could be fixed (based on the
+ // legality checks above, the live range for the addend source register
+ // could be extended), but it seems likely that such a trivial copy can
+ // be coalesced away later, and thus is not worth the effort.
+ if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) &&
+ !LIS->getInterval(AddendSrcReg).liveAt(FMAIdx))
+ continue;
+
+ // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
+
+ unsigned KilledProdReg = MI.getOperand(KilledProdOp).getReg();
+ unsigned OtherProdReg = MI.getOperand(OtherProdOp).getReg();
+
+ unsigned AddSubReg = AddendMI->getOperand(1).getSubReg();
+ unsigned KilledProdSubReg = MI.getOperand(KilledProdOp).getSubReg();
+ unsigned OtherProdSubReg = MI.getOperand(OtherProdOp).getSubReg();
+
+ bool AddRegKill = AddendMI->getOperand(1).isKill();
+ bool KilledProdRegKill = MI.getOperand(KilledProdOp).isKill();
+ bool OtherProdRegKill = MI.getOperand(OtherProdOp).isKill();
+
+ bool AddRegUndef = AddendMI->getOperand(1).isUndef();
+ bool KilledProdRegUndef = MI.getOperand(KilledProdOp).isUndef();
+ bool OtherProdRegUndef = MI.getOperand(OtherProdOp).isUndef();
+
+ // If there isn't a class that fits, we can't perform the transform.
+ // This is needed for correctness with a mixture of VSX and Altivec
+ // instructions to make sure that a low VSX register is not assigned to
+ // the Altivec instruction.
+ if (!MRI.constrainRegClass(KilledProdReg,
+ MRI.getRegClass(OldFMAReg)))
+ continue;
+
+ assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
+ "Addend copy not tied to old FMA output!");
+
+ DEBUG(dbgs() << "VSX FMA Mutation:\n " << MI);
+
+ MI.getOperand(0).setReg(KilledProdReg);
+ MI.getOperand(1).setReg(KilledProdReg);
+ MI.getOperand(3).setReg(AddendSrcReg);
+
+ MI.getOperand(0).setSubReg(KilledProdSubReg);
+ MI.getOperand(1).setSubReg(KilledProdSubReg);
+ MI.getOperand(3).setSubReg(AddSubReg);
+
+ MI.getOperand(1).setIsKill(KilledProdRegKill);
+ MI.getOperand(3).setIsKill(AddRegKill);
+
+ MI.getOperand(1).setIsUndef(KilledProdRegUndef);
+ MI.getOperand(3).setIsUndef(AddRegUndef);
+
+ MI.setDesc(TII->get(AltOpc));
+
+ // If the addend is also a multiplicand, replace it with the addend
+ // source in both places.
+ if (OtherProdReg == AddendMI->getOperand(0).getReg()) {
+ MI.getOperand(2).setReg(AddendSrcReg);
+ MI.getOperand(2).setSubReg(AddSubReg);
+ MI.getOperand(2).setIsKill(AddRegKill);
+ MI.getOperand(2).setIsUndef(AddRegUndef);
+ } else {
+ MI.getOperand(2).setReg(OtherProdReg);
+ MI.getOperand(2).setSubReg(OtherProdSubReg);
+ MI.getOperand(2).setIsKill(OtherProdRegKill);
+ MI.getOperand(2).setIsUndef(OtherProdRegUndef);
+ }
+
+ DEBUG(dbgs() << " -> " << MI);
+
+ // The killed product operand was killed here, so we can reuse it now
+ // for the result of the fma.
+
+ LiveInterval &FMAInt = LIS->getInterval(OldFMAReg);
+ VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot());
+ for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end();
+ UI != UE;) {
+ MachineOperand &UseMO = *UI;
+ MachineInstr *UseMI = UseMO.getParent();
+ ++UI;
+
+ // Don't replace the result register of the copy we're about to erase.
+ if (UseMI == AddendMI)
+ continue;
+
+ UseMO.substVirtReg(KilledProdReg, KilledProdSubReg, *TRI);
+ }
+
+ // Extend the live intervals of the killed product operand to hold the
+ // fma result.
+
+ LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg);
+ for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end();
+ AI != AE; ++AI) {
+ // Don't add the segment that corresponds to the original copy.
+ if (AI->valno == AddendValNo)
+ continue;
+
+ VNInfo *NewFMAValNo =
+ NewFMAInt.getNextValue(AI->start,
+ LIS->getVNInfoAllocator());
+
+ NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
+ NewFMAValNo));
+ }
+ DEBUG(dbgs() << " extended: " << NewFMAInt << '\n');
+
+ // Extend the live interval of the addend source (it might end at the
+ // copy to be removed, or somewhere in between there and here). This
+ // is necessary only if it is a physical register.
+ if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg))
+ for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid();
+ ++Units) {
+ unsigned Unit = *Units;
+
+ LiveRange &AddendSrcRange = LIS->getRegUnit(Unit);
+ AddendSrcRange.extendInBlock(LIS->getMBBStartIdx(&MBB),
+ FMAIdx.getRegSlot());
+ DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n');
+ }
+
+ FMAInt.removeValNo(FMAValNo);
+ DEBUG(dbgs() << " trimmed: " << FMAInt << '\n');
+
+ // Remove the (now unused) copy.
+
+ DEBUG(dbgs() << " removing: " << *AddendMI << '\n');
+ LIS->RemoveMachineInstrFromMaps(*AddendMI);
+ AddendMI->eraseFromParent();
+
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ // If we don't have VSX then go ahead and return without doing
+ // anything.
+ const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+ if (!STI.hasVSX())
+ return false;
+
+ LIS = &getAnalysis<LiveIntervals>();
+
+ TII = STI.getInstrInfo();
+
+ bool Changed = false;
+
+ if (DisableVSXFMAMutate)
+ return Changed;
+
+ for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+ MachineBasicBlock &B = *I++;
+ if (processBlock(B))
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
+ "PowerPC VSX FMA Mutation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
+ "PowerPC VSX FMA Mutation", false, false)
+
+char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID;
+
+char PPCVSXFMAMutate::ID = 0;
+FunctionPass *llvm::createPPCVSXFMAMutatePass() {
+ return new PPCVSXFMAMutate();
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
new file mode 100644
index 000000000000..8197285b7b1f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -0,0 +1,1035 @@
+//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass analyzes vector computations and removes unnecessary
+// doubleword swaps (xxswapd instructions). This pass is performed
+// only for little-endian VSX code generation.
+//
+// For this specific case, loads and stores of v4i32, v4f32, v2i64,
+// and v2f64 vectors are inefficient. These are implemented using
+// the lxvd2x and stxvd2x instructions, which invert the order of
+// doublewords in a vector register. Thus code generation inserts
+// an xxswapd after each such load, and prior to each such store.
+//
+// The extra xxswapd instructions reduce performance. The purpose
+// of this pass is to reduce the number of xxswapd instructions
+// required for correctness.
+//
+// The primary insight is that much code that operates on vectors
+// does not care about the relative order of elements in a register,
+// so long as the correct memory order is preserved. If we have a
+// computation where all input values are provided by lxvd2x/xxswapd,
+// all outputs are stored using xxswapd/stxvd2x, and all intermediate
+// computations are lane-insensitive (independent of element order),
+// then all the xxswapd instructions associated with the loads and
+// stores may be removed without changing observable semantics.
+//
+// This pass uses standard equivalence class infrastructure to create
+// maximal webs of computations fitting the above description. Each
+// such web is then optimized by removing its unnecessary xxswapd
+// instructions.
+//
+// There are some lane-sensitive operations for which we can still
+// permit the optimization, provided we modify those operations
+// accordingly. Such operations are identified as using "special
+// handling" within this module.
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-vsx-swaps"
+
+namespace llvm {
+ void initializePPCVSXSwapRemovalPass(PassRegistry&);
+}
+
+namespace {
+
+// A PPCVSXSwapEntry is created for each machine instruction that
+// is relevant to a vector computation.
+struct PPCVSXSwapEntry {
+ // Pointer to the instruction.
+ MachineInstr *VSEMI;
+
+ // Unique ID (position in the swap vector).
+ int VSEId;
+
+ // Attributes of this node.
+ unsigned int IsLoad : 1;
+ unsigned int IsStore : 1;
+ unsigned int IsSwap : 1;
+ unsigned int MentionsPhysVR : 1;
+ unsigned int IsSwappable : 1;
+ unsigned int MentionsPartialVR : 1;
+ unsigned int SpecialHandling : 3;
+ unsigned int WebRejected : 1;
+ unsigned int WillRemove : 1;
+};
+
+enum SHValues {
+ SH_NONE = 0,
+ SH_EXTRACT,
+ SH_INSERT,
+ SH_NOSWAP_LD,
+ SH_NOSWAP_ST,
+ SH_SPLAT,
+ SH_XXPERMDI,
+ SH_COPYWIDEN
+};
+
+struct PPCVSXSwapRemoval : public MachineFunctionPass {
+
+ static char ID;
+ const PPCInstrInfo *TII;
+ MachineFunction *MF;
+ MachineRegisterInfo *MRI;
+
+ // Swap entries are allocated in a vector for better performance.
+ std::vector<PPCVSXSwapEntry> SwapVector;
+
+ // A mapping is maintained between machine instructions and
+ // their swap entries. The key is the address of the MI.
+ DenseMap<MachineInstr*, int> SwapMap;
+
+ // Equivalence classes are used to gather webs of related computation.
+ // Swap entries are represented by their VSEId fields.
+ EquivalenceClasses<int> *EC;
+
+ PPCVSXSwapRemoval() : MachineFunctionPass(ID) {
+ initializePPCVSXSwapRemovalPass(*PassRegistry::getPassRegistry());
+ }
+
+private:
+ // Initialize data structures.
+ void initialize(MachineFunction &MFParm);
+
+ // Walk the machine instructions to gather vector usage information.
+ // Return true iff vector mentions are present.
+ bool gatherVectorInstructions();
+
+ // Add an entry to the swap vector and swap map.
+ int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry);
+
+ // Hunt backwards through COPY and SUBREG_TO_REG chains for a
+ // source register. VecIdx indicates the swap vector entry to
+ // mark as mentioning a physical register if the search leads
+ // to one.
+ unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx);
+
+ // Generate equivalence classes for related computations (webs).
+ void formWebs();
+
+ // Analyze webs and determine those that cannot be optimized.
+ void recordUnoptimizableWebs();
+
+ // Record which swap instructions can be safely removed.
+ void markSwapsForRemoval();
+
+ // Remove swaps and update other instructions requiring special
+ // handling. Return true iff any changes are made.
+ bool removeSwaps();
+
+ // Insert a swap instruction from SrcReg to DstReg at the given
+ // InsertPoint.
+ void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
+ unsigned DstReg, unsigned SrcReg);
+
+ // Update instructions requiring special handling.
+ void handleSpecialSwappables(int EntryIdx);
+
+ // Dump a description of the entries in the swap vector.
+ void dumpSwapVector();
+
+ // Return true iff the given register is in the given class.
+ bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return RC->hasSubClassEq(MRI->getRegClass(Reg));
+ return RC->contains(Reg);
+ }
+
+ // Return true iff the given register is a full vector register.
+ bool isVecReg(unsigned Reg) {
+ return (isRegInClass(Reg, &PPC::VSRCRegClass) ||
+ isRegInClass(Reg, &PPC::VRRCRegClass));
+ }
+
+ // Return true iff the given register is a partial vector register.
+ bool isScalarVecReg(unsigned Reg) {
+ return (isRegInClass(Reg, &PPC::VSFRCRegClass) ||
+ isRegInClass(Reg, &PPC::VSSRCRegClass));
+ }
+
+ // Return true iff the given register mentions all or part of a
+ // vector register. Also sets Partial to true if the mention
+ // is for just the floating-point register overlap of the register.
+ bool isAnyVecReg(unsigned Reg, bool &Partial) {
+ if (isScalarVecReg(Reg))
+ Partial = true;
+ return isScalarVecReg(Reg) || isVecReg(Reg);
+ }
+
+public:
+ // Main entry point for this pass.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ // If we don't have VSX on the subtarget, don't do anything.
+ const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+ if (!STI.hasVSX())
+ return false;
+
+ bool Changed = false;
+ initialize(MF);
+
+ if (gatherVectorInstructions()) {
+ formWebs();
+ recordUnoptimizableWebs();
+ markSwapsForRemoval();
+ Changed = removeSwaps();
+ }
+
+ // FIXME: See the allocation of EC in initialize().
+ delete EC;
+ return Changed;
+ }
+};
+
+// Initialize data structures for this pass. In particular, clear the
+// swap vector and allocate the equivalence class mapping before
+// processing each function.
+void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) {
+ MF = &MFParm;
+ MRI = &MF->getRegInfo();
+ TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+
+ // An initial vector size of 256 appears to work well in practice.
+ // Small/medium functions with vector content tend not to incur a
+ // reallocation at this size. Three of the vector tests in
+ // projects/test-suite reallocate, which seems like a reasonable rate.
+ const int InitialVectorSize(256);
+ SwapVector.clear();
+ SwapVector.reserve(InitialVectorSize);
+
+ // FIXME: Currently we allocate EC each time because we don't have
+ // access to the set representation on which to call clear(). Should
+ // consider adding a clear() method to the EquivalenceClasses class.
+ EC = new EquivalenceClasses<int>;
+}
+
+// Create an entry in the swap vector for each instruction that mentions
+// all or part of a vector register, recording various characteristics
+// of the instructions there.
+bool PPCVSXSwapRemoval::gatherVectorInstructions() {
+ bool RelevantFunction = false;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+
+ if (MI.isDebugValue())
+ continue;
+
+ bool RelevantInstr = false;
+ bool Partial = false;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (isAnyVecReg(Reg, Partial)) {
+ RelevantInstr = true;
+ break;
+ }
+ }
+
+ if (!RelevantInstr)
+ continue;
+
+ RelevantFunction = true;
+
+ // Create a SwapEntry initialized to zeros, then fill in the
+ // instruction and ID fields before pushing it to the back
+ // of the swap vector.
+ PPCVSXSwapEntry SwapEntry{};
+ int VecIdx = addSwapEntry(&MI, SwapEntry);
+
+ switch(MI.getOpcode()) {
+ default:
+ // Unless noted otherwise, an instruction is considered
+ // safe for the optimization. There are a large number of
+ // such true-SIMD instructions (all vector math, logical,
+ // select, compare, etc.). However, if the instruction
+ // mentions a partial vector register and does not have
+ // special handling defined, it is not swappable.
+ if (Partial)
+ SwapVector[VecIdx].MentionsPartialVR = 1;
+ else
+ SwapVector[VecIdx].IsSwappable = 1;
+ break;
+ case PPC::XXPERMDI: {
+ // This is a swap if it is of the form XXPERMDI t, s, s, 2.
+ // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we
+ // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2,
+ // for example. We have to look through chains of COPY and
+ // SUBREG_TO_REG to find the real source value for comparison.
+ // If the real source value is a physical register, then mark the
+ // XXPERMDI as mentioning a physical register.
+ int immed = MI.getOperand(3).getImm();
+ if (immed == 2) {
+ unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
+ VecIdx);
+ unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
+ VecIdx);
+ if (trueReg1 == trueReg2)
+ SwapVector[VecIdx].IsSwap = 1;
+ else {
+ // We can still handle these if the two registers are not
+ // identical, by adjusting the form of the XXPERMDI.
+ SwapVector[VecIdx].IsSwappable = 1;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
+ }
+ // This is a doubleword splat if it is of the form
+ // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we
+ // must look through chains of copy-likes to find the source
+ // register. We turn off the marking for mention of a physical
+ // register, because splatting it is safe; the optimization
+ // will not swap the value in the physical register. Whether
+ // or not the two input registers are identical, we can handle
+ // these by adjusting the form of the XXPERMDI.
+ } else if (immed == 0 || immed == 3) {
+
+ SwapVector[VecIdx].IsSwappable = 1;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
+
+ unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
+ VecIdx);
+ unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
+ VecIdx);
+ if (trueReg1 == trueReg2)
+ SwapVector[VecIdx].MentionsPhysVR = 0;
+
+ } else {
+ // We can still handle these by adjusting the form of the XXPERMDI.
+ SwapVector[VecIdx].IsSwappable = 1;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
+ }
+ break;
+ }
+ case PPC::LVX:
+ // Non-permuting loads are currently unsafe. We can use special
+ // handling for this in the future. By not marking these as
+ // IsSwap, we ensure computations containing them will be rejected
+ // for now.
+ SwapVector[VecIdx].IsLoad = 1;
+ break;
+ case PPC::LXVD2X:
+ case PPC::LXVW4X:
+ // Permuting loads are marked as both load and swap, and are
+ // safe for optimization.
+ SwapVector[VecIdx].IsLoad = 1;
+ SwapVector[VecIdx].IsSwap = 1;
+ break;
+ case PPC::LXSDX:
+ case PPC::LXSSPX:
+ // A load of a floating-point value into the high-order half of
+ // a vector register is safe, provided that we introduce a swap
+ // following the load, which will be done by the SUBREG_TO_REG
+ // support. So just mark these as safe.
+ SwapVector[VecIdx].IsLoad = 1;
+ SwapVector[VecIdx].IsSwappable = 1;
+ break;
+ case PPC::STVX:
+ // Non-permuting stores are currently unsafe. We can use special
+ // handling for this in the future. By not marking these as
+ // IsSwap, we ensure computations containing them will be rejected
+ // for now.
+ SwapVector[VecIdx].IsStore = 1;
+ break;
+ case PPC::STXVD2X:
+ case PPC::STXVW4X:
+ // Permuting stores are marked as both store and swap, and are
+ // safe for optimization.
+ SwapVector[VecIdx].IsStore = 1;
+ SwapVector[VecIdx].IsSwap = 1;
+ break;
+ case PPC::COPY:
+ // These are fine provided they are moving between full vector
+ // register classes.
+ if (isVecReg(MI.getOperand(0).getReg()) &&
+ isVecReg(MI.getOperand(1).getReg()))
+ SwapVector[VecIdx].IsSwappable = 1;
+ // If we have a copy from one scalar floating-point register
+ // to another, we can accept this even if it is a physical
+ // register. The only way this gets involved is if it feeds
+ // a SUBREG_TO_REG, which is handled by introducing a swap.
+ else if (isScalarVecReg(MI.getOperand(0).getReg()) &&
+ isScalarVecReg(MI.getOperand(1).getReg()))
+ SwapVector[VecIdx].IsSwappable = 1;
+ break;
+ case PPC::SUBREG_TO_REG: {
+ // These are fine provided they are moving between full vector
+ // register classes. If they are moving from a scalar
+ // floating-point class to a vector class, we can handle those
+ // as well, provided we introduce a swap. It is generally the
+ // case that we will introduce fewer swaps than we remove, but
+ // (FIXME) a cost model could be used. However, introduced
+ // swaps could potentially be CSEd, so this is not trivial.
+ if (isVecReg(MI.getOperand(0).getReg()) &&
+ isVecReg(MI.getOperand(2).getReg()))
+ SwapVector[VecIdx].IsSwappable = 1;
+ else if (isVecReg(MI.getOperand(0).getReg()) &&
+ isScalarVecReg(MI.getOperand(2).getReg())) {
+ SwapVector[VecIdx].IsSwappable = 1;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
+ }
+ break;
+ }
+ case PPC::VSPLTB:
+ case PPC::VSPLTH:
+ case PPC::VSPLTW:
+ case PPC::XXSPLTW:
+ // Splats are lane-sensitive, but we can use special handling
+ // to adjust the source lane for the splat.
+ SwapVector[VecIdx].IsSwappable = 1;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
+ break;
+ // The presence of the following lane-sensitive operations in a
+ // web will kill the optimization, at least for now. For these
+ // we do nothing, causing the optimization to fail.
+ // FIXME: Some of these could be permitted with special handling,
+ // and will be phased in as time permits.
+ // FIXME: There is no simple and maintainable way to express a set
+ // of opcodes having a common attribute in TableGen. Should this
+ // change, this is a prime candidate to use such a mechanism.
+ case PPC::INLINEASM:
+ case PPC::EXTRACT_SUBREG:
+ case PPC::INSERT_SUBREG:
+ case PPC::COPY_TO_REGCLASS:
+ case PPC::LVEBX:
+ case PPC::LVEHX:
+ case PPC::LVEWX:
+ case PPC::LVSL:
+ case PPC::LVSR:
+ case PPC::LVXL:
+ case PPC::STVEBX:
+ case PPC::STVEHX:
+ case PPC::STVEWX:
+ case PPC::STVXL:
+ // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
+ // by adding special handling for narrowing copies as well as
+ // widening ones. However, I've experimented with this, and in
+ // practice we currently do not appear to use STXSDX fed by
+ // a narrowing copy from a full vector register. Since I can't
+ // generate any useful test cases, I've left this alone for now.
+ case PPC::STXSDX:
+ case PPC::STXSSPX:
+ case PPC::VCIPHER:
+ case PPC::VCIPHERLAST:
+ case PPC::VMRGHB:
+ case PPC::VMRGHH:
+ case PPC::VMRGHW:
+ case PPC::VMRGLB:
+ case PPC::VMRGLH:
+ case PPC::VMRGLW:
+ case PPC::VMULESB:
+ case PPC::VMULESH:
+ case PPC::VMULESW:
+ case PPC::VMULEUB:
+ case PPC::VMULEUH:
+ case PPC::VMULEUW:
+ case PPC::VMULOSB:
+ case PPC::VMULOSH:
+ case PPC::VMULOSW:
+ case PPC::VMULOUB:
+ case PPC::VMULOUH:
+ case PPC::VMULOUW:
+ case PPC::VNCIPHER:
+ case PPC::VNCIPHERLAST:
+ case PPC::VPERM:
+ case PPC::VPERMXOR:
+ case PPC::VPKPX:
+ case PPC::VPKSHSS:
+ case PPC::VPKSHUS:
+ case PPC::VPKSDSS:
+ case PPC::VPKSDUS:
+ case PPC::VPKSWSS:
+ case PPC::VPKSWUS:
+ case PPC::VPKUDUM:
+ case PPC::VPKUDUS:
+ case PPC::VPKUHUM:
+ case PPC::VPKUHUS:
+ case PPC::VPKUWUM:
+ case PPC::VPKUWUS:
+ case PPC::VPMSUMB:
+ case PPC::VPMSUMD:
+ case PPC::VPMSUMH:
+ case PPC::VPMSUMW:
+ case PPC::VRLB:
+ case PPC::VRLD:
+ case PPC::VRLH:
+ case PPC::VRLW:
+ case PPC::VSBOX:
+ case PPC::VSHASIGMAD:
+ case PPC::VSHASIGMAW:
+ case PPC::VSL:
+ case PPC::VSLDOI:
+ case PPC::VSLO:
+ case PPC::VSR:
+ case PPC::VSRO:
+ case PPC::VSUM2SWS:
+ case PPC::VSUM4SBS:
+ case PPC::VSUM4SHS:
+ case PPC::VSUM4UBS:
+ case PPC::VSUMSWS:
+ case PPC::VUPKHPX:
+ case PPC::VUPKHSB:
+ case PPC::VUPKHSH:
+ case PPC::VUPKHSW:
+ case PPC::VUPKLPX:
+ case PPC::VUPKLSB:
+ case PPC::VUPKLSH:
+ case PPC::VUPKLSW:
+ case PPC::XXMRGHW:
+ case PPC::XXMRGLW:
+ // XXSLDWI could be replaced by a general permute with one of three
+ // permute control vectors (for shift values 1, 2, 3). However,
+ // VPERM has a more restrictive register class.
+ case PPC::XXSLDWI:
+ break;
+ }
+ }
+ }
+
+ if (RelevantFunction) {
+ DEBUG(dbgs() << "Swap vector when first built\n\n");
+ dumpSwapVector();
+ }
+
+ return RelevantFunction;
+}
+
+// Add an entry for MI to the swap vector and swap map, and make a singleton
+// equivalence class for it. Sets SwapEntry's VSEMI/VSEId; returns the new id.
+int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI,
+ PPCVSXSwapEntry& SwapEntry) {
+ SwapEntry.VSEMI = MI;
+ SwapEntry.VSEId = SwapVector.size(); // id == index the entry is about to occupy
+ SwapVector.push_back(SwapEntry);
+ EC->insert(SwapEntry.VSEId); // the new id starts out in its own class
+ SwapMap[MI] = SwapEntry.VSEId; // lets later passes go from MI to its entry
+ return SwapEntry.VSEId;
+}
+
+// This is used to find the "true" source register for an
+// XXPERMDI instruction, since MachineCSE does not handle the
+// "copy-like" operations (Copy and SubregToReg). Returns
+// the original SrcReg unless it is the target of a copy-like
+// operation, in which case we chain backwards (recursively) through
+// all such operations to the ultimate source register. If a
+// non-virtual register is encountered, we stop the search, return it,
+// and, unless isScalarVecReg() accepts it, flag the swap entry indicated
+// by VecIdx (the original XXPERMDI) as mentioning a physical register.
+unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
+ unsigned VecIdx) {
+ MachineInstr *MI = MRI->getVRegDef(SrcReg);
+ if (!MI->isCopyLike())
+ return SrcReg; // not defined by a copy-like op: nothing to look through
+
+ unsigned CopySrcReg;
+ if (MI->isCopy())
+ CopySrcReg = MI->getOperand(1).getReg(); // COPY: source read from operand 1
+ else {
+ assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
+ CopySrcReg = MI->getOperand(2).getReg(); // SUBREG_TO_REG: source read from operand 2
+ }
+
+ if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) {
+ if (!isScalarVecReg(CopySrcReg))
+ SwapVector[VecIdx].MentionsPhysVR = 1;
+ return CopySrcReg; // a non-virtual register ends the search
+ }
+
+ return lookThruCopyLike(CopySrcReg, VecIdx); // keep chaining backwards
+}
+
+// Generate equivalence classes for related computations (webs) by
+// def-use relationships of virtual registers. Mention of a physical
+// register terminates the generation of equivalence classes as this
+// indicates a use of a parameter, definition of a return value, use
+// of a value returned from a call, or definition of a parameter to a
+// call. Computations with physical register mentions are flagged
+// (MentionsPhysVR) so their containing webs will not be optimized.
+void PPCVSXSwapRemoval::formWebs() {
+
+ DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
+
+ for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+
+ DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
+ DEBUG(MI->dump());
+
+ // It's sufficient to walk vector uses and join them to their unique
+ // definitions. In addition, check full vector register operands
+ // for physical regs. We exclude partial-vector register operands
+ // because we can handle them if copied to a full vector.
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+ if (!isVecReg(Reg) && !isScalarVecReg(Reg))
+ continue; // operand is neither a vector nor a scalar-vector register
+
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!(MI->isCopy() && isScalarVecReg(Reg))) // a COPY touching a scalar-vec reg is not flagged
+ SwapVector[EntryIdx].MentionsPhysVR = 1;
+ continue; // non-virtual registers are never unioned
+ }
+
+ if (!MO.isUse())
+ continue; // only uses are joined to their definitions
+
+ MachineInstr* DefMI = MRI->getVRegDef(Reg);
+ assert(SwapMap.find(DefMI) != SwapMap.end() &&
+ "Inconsistency: def of vector reg not found in swap map!");
+ int DefIdx = SwapMap[DefMI];
+ (void)EC->unionSets(SwapVector[DefIdx].VSEId, // merge the def's class with this use's class
+ SwapVector[EntryIdx].VSEId);
+
+ DEBUG(dbgs() << format("Unioning %d with %d\n", SwapVector[DefIdx].VSEId,
+ SwapVector[EntryIdx].VSEId));
+ DEBUG(dbgs() << " Def: ");
+ DEBUG(DefMI->dump());
+ }
+ }
+}
+
+// Walk the swap vector entries looking for conditions that prevent their
+// containing computations from being optimized. When such conditions are
+// found, mark the representative (equivalence-class leader) of the
+// computation's web as rejected by setting its WebRejected flag.
+void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
+
+ DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");
+
+ for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+ int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+ // If representative is already rejected, don't waste further time.
+ if (SwapVector[Repr].WebRejected)
+ continue;
+
+ // Reject webs containing mentions of physical or partial registers, or
+ // containing operations that we don't know how to handle in a lane-
+ // permuted region.
+ if (SwapVector[EntryIdx].MentionsPhysVR ||
+ SwapVector[EntryIdx].MentionsPartialVR ||
+ !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {
+
+ SwapVector[Repr].WebRejected = 1;
+
+ DEBUG(dbgs() <<
+ format("Web %d rejected for physreg, partial reg, or not "
+ "swap[pable]\n", Repr));
+ DEBUG(dbgs() << " in " << EntryIdx << ": ");
+ DEBUG(SwapVector[EntryIdx].VSEMI->dump());
+ DEBUG(dbgs() << "\n");
+ }
+
+ // Reject webs that contain swapping loads that feed something other
+ // than a swap instruction.
+ else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+ unsigned DefReg = MI->getOperand(0).getReg(); // register defined by the load
+
+ // We skip debug instructions in the analysis. (Note that debug
+ // location information is still maintained by this optimization
+ // because it remains on the LXVD2X and STXVD2X instructions after
+ // the XXPERMDIs are removed.)
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+ int UseIdx = SwapMap[&UseMI];
+
+ // A qualifying user is a swap that is not itself a load or a store.
+ if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad ||
+ SwapVector[UseIdx].IsStore) {
+
+ SwapVector[Repr].WebRejected = 1;
+
+ DEBUG(dbgs() <<
+ format("Web %d rejected for load not feeding swap\n", Repr));
+ DEBUG(dbgs() << " def " << EntryIdx << ": ");
+ DEBUG(MI->dump());
+ DEBUG(dbgs() << " use " << UseIdx << ": ");
+ DEBUG(UseMI.dump());
+ DEBUG(dbgs() << "\n");
+ }
+ }
+
+ // Reject webs that contain swapping stores that are fed by something
+ // other than a swap instruction.
+ } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+ unsigned UseReg = MI->getOperand(0).getReg(); // register the store reads
+ MachineInstr *DefMI = MRI->getVRegDef(UseReg);
+ unsigned DefReg = DefMI->getOperand(0).getReg();
+ int DefIdx = SwapMap[DefMI];
+
+ // The feeding instruction must be a swap that is not a load or a store.
+ if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
+ SwapVector[DefIdx].IsStore) {
+
+ SwapVector[Repr].WebRejected = 1;
+
+ DEBUG(dbgs() <<
+ format("Web %d rejected for store not fed by swap\n", Repr));
+ DEBUG(dbgs() << " def " << DefIdx << ": ");
+ DEBUG(DefMI->dump());
+ DEBUG(dbgs() << " use " << EntryIdx << ": ");
+ DEBUG(MI->dump());
+ DEBUG(dbgs() << "\n");
+ }
+
+ // Ensure all uses of the register defined by DefMI feed store
+ // instructions (every user must have the same opcode as this store).
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+ int UseIdx = SwapMap[&UseMI];
+
+ if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
+ SwapVector[Repr].WebRejected = 1;
+
+ DEBUG(dbgs() <<
+ format("Web %d rejected for swap not feeding only stores\n",
+ Repr));
+ DEBUG(dbgs() << " def " << DefIdx << ": "); // print the def's index, like the traces above
+ DEBUG(DefMI->dump());
+ DEBUG(dbgs() << " use " << UseIdx << ": ");
+ DEBUG(SwapVector[UseIdx].VSEMI->dump());
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
+ dumpSwapVector();
+}
+
+// Walk the swap vector looking for swaps fed by permuting loads and swaps
+// that feed permuting stores. If the containing web has not been rejected,
+// mark each such swap for removal; swappable entries with special handling
+// are fixed up via handleSpecialSwappables(). (Removal is delayed in case
+// optimization has disturbed the pattern, e.g. multiple loads feed one swap.)
+void PPCVSXSwapRemoval::markSwapsForRemoval() {
+
+ DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
+
+ for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+
+ if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
+ int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+ if (!SwapVector[Repr].WebRejected) {
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+ unsigned DefReg = MI->getOperand(0).getReg(); // register defined by the load
+
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+ int UseIdx = SwapMap[&UseMI];
+ SwapVector[UseIdx].WillRemove = 1; // every non-debug user of the load result is marked
+
+ DEBUG(dbgs() << "Marking swap fed by load for removal: ");
+ DEBUG(UseMI.dump());
+ }
+ }
+
+ } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
+ int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+ if (!SwapVector[Repr].WebRejected) {
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+ unsigned UseReg = MI->getOperand(0).getReg(); // register the store reads
+ MachineInstr *DefMI = MRI->getVRegDef(UseReg);
+ int DefIdx = SwapMap[DefMI];
+ SwapVector[DefIdx].WillRemove = 1; // the instruction feeding the store is marked
+
+ DEBUG(dbgs() << "Marking swap feeding store for removal: ");
+ DEBUG(DefMI->dump());
+ }
+
+ } else if (SwapVector[EntryIdx].IsSwappable &&
+ SwapVector[EntryIdx].SpecialHandling != 0) {
+ int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+ if (!SwapVector[Repr].WebRejected)
+ handleSpecialSwappables(EntryIdx); // rewrites the instruction in place
+ }
+ }
+}
+
+// Create an xxswapd instruction (built as DstReg = XXPERMDI SrcReg, SrcReg, 2)
+// and insert it prior to InsertPoint. MI supplies the basic block and debug loc.
+// FIXME: When inserting a swap, we should check whether SrcReg is
+// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so,
+// then instead we should generate a copy from Reg to DstReg.
+void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
+ MachineBasicBlock::iterator InsertPoint,
+ unsigned DstReg, unsigned SrcReg) {
+ BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+ TII->get(PPC::XXPERMDI), DstReg)
+ .addReg(SrcReg) // both inputs are the same register
+ .addReg(SrcReg)
+ .addImm(2); // selector 2: the swap form shown in the FIXME above
+}
+
+// The identified swap entry requires special handling to allow its
+// containing computation to be optimized. Perform that handling
+// here. Handles SH_SPLAT, SH_XXPERMDI and SH_COPYWIDEN; others are unreachable.
+// FIXME: Additional opportunities will be phased in with subsequent
+// patches.
+void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
+ switch (SwapVector[EntryIdx].SpecialHandling) {
+
+ default:
+ llvm_unreachable("Unexpected special handling type");
+
+ // For splats based on an index into a vector, add N/2 modulo N
+ // to the index, where N is the number of vector elements.
+ case SHValues::SH_SPLAT: {
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+ unsigned NElts;
+
+ DEBUG(dbgs() << "Changing splat: ");
+ DEBUG(MI->dump());
+
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected splat opcode");
+ case PPC::VSPLTB: NElts = 16; break;
+ case PPC::VSPLTH: NElts = 8; break;
+ case PPC::VSPLTW:
+ case PPC::XXSPLTW: NElts = 4; break;
+ }
+
+ unsigned EltNo;
+ if (MI->getOpcode() == PPC::XXSPLTW)
+ EltNo = MI->getOperand(2).getImm(); // XXSPLTW: index read from operand 2
+ else
+ EltNo = MI->getOperand(1).getImm(); // VSPLT*: index read from operand 1
+
+ EltNo = (EltNo + NElts / 2) % NElts; // rotate the index by half the element count
+ if (MI->getOpcode() == PPC::XXSPLTW)
+ MI->getOperand(2).setImm(EltNo);
+ else
+ MI->getOperand(1).setImm(EltNo);
+
+ DEBUG(dbgs() << " Into: ");
+ DEBUG(MI->dump());
+ break;
+ }
+
+ // For an XXPERMDI that isn't handled otherwise, we need to
+ // reverse the order of the operands. If the selector operand
+ // has a value of 0 or 3, we need to change it to 3 or 0,
+ // respectively. Otherwise we should leave it alone. (This
+ // is equivalent to reversing the two bits of the selector
+ // operand and complementing the result.)
+ case SHValues::SH_XXPERMDI: {
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+
+ DEBUG(dbgs() << "Changing XXPERMDI: ");
+ DEBUG(MI->dump());
+
+ unsigned Selector = MI->getOperand(3).getImm();
+ if (Selector == 0 || Selector == 3)
+ Selector = 3 - Selector; // 0 <-> 3; other values stay unchanged
+ MI->getOperand(3).setImm(Selector);
+
+ unsigned Reg1 = MI->getOperand(1).getReg();
+ unsigned Reg2 = MI->getOperand(2).getReg();
+ MI->getOperand(1).setReg(Reg2); // exchange the two register inputs
+ MI->getOperand(2).setReg(Reg1);
+
+ DEBUG(dbgs() << " Into: ");
+ DEBUG(MI->dump());
+ break;
+ }
+
+ // For a copy from a scalar floating-point register to a vector
+ // register, removing swaps will leave the copied value in the
+ // wrong lane. Insert a swap following the copy to fix this.
+ case SHValues::SH_COPYWIDEN: {
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+
+ DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
+ DEBUG(MI->dump());
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+ unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+
+ MI->getOperand(0).setReg(NewVReg); // MI now defines NewVReg; DstReg is re-defined below
+ DEBUG(dbgs() << " Into: ");
+ DEBUG(MI->dump());
+
+ auto InsertPoint = ++MachineBasicBlock::iterator(MI); // position right after MI
+
+ // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
+ // is copying to a VRRC, we need to be careful to avoid a register
+ // assignment problem. In this case we must copy from VRRC to VSRC
+ // prior to the swap, and from VSRC to VRRC following the swap.
+ // Coalescing will usually remove all this mess.
+ if (DstRC == &PPC::VRRCRegClass) {
+ unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
+ unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
+
+ BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+ TII->get(PPC::COPY), VSRCTmp1)
+ .addReg(NewVReg);
+ DEBUG(std::prev(InsertPoint)->dump()); // dumps the instruction just inserted
+
+ insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
+ DEBUG(std::prev(InsertPoint)->dump());
+
+ BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+ TII->get(PPC::COPY), DstReg)
+ .addReg(VSRCTmp2);
+ DEBUG(std::prev(InsertPoint)->dump());
+
+ } else {
+ insertSwap(MI, InsertPoint, DstReg, NewVReg); // swap straight from NewVReg into DstReg
+ DEBUG(std::prev(InsertPoint)->dump());
+ }
+ break;
+ }
+ }
+}
+
+// Walk the swap vector and replace each entry marked for removal (WillRemove)
+// with a copy operation. Returns true if at least one entry was replaced.
+bool PPCVSXSwapRemoval::removeSwaps() {
+
+ DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
+
+ bool Changed = false;
+
+ for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+ if (SwapVector[EntryIdx].WillRemove) {
+ Changed = true;
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+ MachineBasicBlock *MBB = MI->getParent();
+ BuildMI(*MBB, MI, MI->getDebugLoc(), // the COPY is inserted before MI
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addOperand(MI->getOperand(1)); // copy from MI's operand 1 into MI's operand 0
+
+ DEBUG(dbgs() << format("Replaced %d with copy: ",
+ SwapVector[EntryIdx].VSEId));
+ DEBUG(MI->dump());
+
+ MI->eraseFromParent(); // the original instruction is deleted once the COPY exists
+ }
+ }
+
+ return Changed;
+}
+
+// For debug purposes, dump the contents of the swap vector, one line per entry.
+void PPCVSXSwapRemoval::dumpSwapVector() {
+
+ for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+ int ID = SwapVector[EntryIdx].VSEId;
+
+ DEBUG(dbgs() << format("%6d", ID)); // entry id
+ DEBUG(dbgs() << format("%6d", EC->getLeaderValue(ID))); // leader of its equivalence class
+ DEBUG(dbgs() << format(" BB#%3d", MI->getParent()->getNumber())); // containing block number
+ DEBUG(dbgs() << format(" %14s ",
+ TII->getName(MI->getOpcode()).str().c_str())); // opcode name
+
+ if (SwapVector[EntryIdx].IsLoad)
+ DEBUG(dbgs() << "load ");
+ if (SwapVector[EntryIdx].IsStore)
+ DEBUG(dbgs() << "store ");
+ if (SwapVector[EntryIdx].IsSwap)
+ DEBUG(dbgs() << "swap ");
+ if (SwapVector[EntryIdx].MentionsPhysVR)
+ DEBUG(dbgs() << "physreg ");
+ if (SwapVector[EntryIdx].MentionsPartialVR)
+ DEBUG(dbgs() << "partialreg ");
+
+ if (SwapVector[EntryIdx].IsSwappable) {
+ DEBUG(dbgs() << "swappable ");
+ switch(SwapVector[EntryIdx].SpecialHandling) { // the special-handling kind is only printed for swappable entries
+ default:
+ DEBUG(dbgs() << "special:**unknown**");
+ break;
+ case SH_NONE:
+ break;
+ case SH_EXTRACT:
+ DEBUG(dbgs() << "special:extract ");
+ break;
+ case SH_INSERT:
+ DEBUG(dbgs() << "special:insert ");
+ break;
+ case SH_NOSWAP_LD:
+ DEBUG(dbgs() << "special:load ");
+ break;
+ case SH_NOSWAP_ST:
+ DEBUG(dbgs() << "special:store ");
+ break;
+ case SH_SPLAT:
+ DEBUG(dbgs() << "special:splat ");
+ break;
+ case SH_XXPERMDI:
+ DEBUG(dbgs() << "special:xxpermdi ");
+ break;
+ case SH_COPYWIDEN:
+ DEBUG(dbgs() << "special:copywiden ");
+ break;
+ }
+ }
+
+ if (SwapVector[EntryIdx].WebRejected)
+ DEBUG(dbgs() << "rejected ");
+ if (SwapVector[EntryIdx].WillRemove)
+ DEBUG(dbgs() << "remove ");
+
+ DEBUG(dbgs() << "\n");
+
+ // For no-asserts builds: MI and ID are otherwise referenced only inside DEBUG().
+ (void)MI;
+ (void)ID;
+ }
+
+ DEBUG(dbgs() << "\n");
+}
+
+} // end default namespace
+
+INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE,
+ "PowerPC VSX Swap Removal", false, false)
+INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE,
+ "PowerPC VSX Swap Removal", false, false) // nothing is declared between BEGIN and END
+
+char PPCVSXSwapRemoval::ID = 0; // definition of the pass's static ID member
+FunctionPass*
+llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); } // factory: returns a newly allocated pass object
diff --git a/contrib/llvm/lib/Target/PowerPC/README_P9.txt b/contrib/llvm/lib/Target/PowerPC/README_P9.txt
new file mode 100644
index 000000000000..d56f7cca7b21
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/README_P9.txt
@@ -0,0 +1,605 @@
+//===- README_P9.txt - Notes for improving Power9 code gen ----------------===//
+
+TODO: Instructions That Need Intrinsics Implemented or a Mapping to LLVM IR
+
+Altivec:
+- Vector Compare Not Equal (Zero):
+ vcmpneb(.) vcmpneh(.) vcmpnew(.)
+ vcmpnezb(.) vcmpnezh(.) vcmpnezw(.)
+ . Same as other VCMP*, use VCMP/VCMPo form (support intrinsic)
+
+- Vector Extract Unsigned: vextractub vextractuh vextractuw vextractd
+ . Don't use llvm extractelement because they have different semantics
+ . Use intrinsics:
+ (set v2i64:$vD, (int_ppc_altivec_vextractub v16i8:$vA, imm:$UIMM))
+ (set v2i64:$vD, (int_ppc_altivec_vextractuh v8i16:$vA, imm:$UIMM))
+ (set v2i64:$vD, (int_ppc_altivec_vextractuw v4i32:$vA, imm:$UIMM))
+ (set v2i64:$vD, (int_ppc_altivec_vextractd v2i64:$vA, imm:$UIMM))
+
+- Vector Extract Unsigned Byte Left/Right-Indexed:
+ vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx
+ . Use intrinsics:
+ // Left-Indexed
+ (set i64:$rD, (int_ppc_altivec_vextublx i64:$rA, v16i8:$vB))
+ (set i64:$rD, (int_ppc_altivec_vextuhlx i64:$rA, v8i16:$vB))
+ (set i64:$rD, (int_ppc_altivec_vextuwlx i64:$rA, v4i32:$vB))
+
+ // Right-Indexed
+ (set i64:$rD, (int_ppc_altivec_vextubrx i64:$rA, v16i8:$vB))
+ (set i64:$rD, (int_ppc_altivec_vextuhrx i64:$rA, v8i16:$vB))
+ (set i64:$rD, (int_ppc_altivec_vextuwrx i64:$rA, v4i32:$vB))
+
+- Vector Insert Element Instructions: vinsertb vinsertd vinserth vinsertw
+ (set v16i8:$vD, (int_ppc_altivec_vinsertb v16i8:$vA, imm:$UIMM))
+ (set v2i64:$vD, (int_ppc_altivec_vinsertd v2i64:$vA, imm:$UIMM))
+ (set v8i16:$vD, (int_ppc_altivec_vinserth v8i16:$vA, imm:$UIMM))
+ (set v4i32:$vD, (int_ppc_altivec_vinsertw v4i32:$vA, imm:$UIMM))
+
+- Vector Count Leading/Trailing Zero LSB. Result is placed into GPR[rD]:
+ vclzlsbb vctzlsbb
+ . Use intrinsic:
+ (set i64:$rD, (int_ppc_altivec_vclzlsbb v16i8:$vB))
+ (set i64:$rD, (int_ppc_altivec_vctzlsbb v16i8:$vB))
+
+- Vector Count Trailing Zeros: vctzb vctzh vctzw vctzd
+ . Map to llvm cttz
+ (set v16i8:$vD, (cttz v16i8:$vB)) // vctzb
+ (set v8i16:$vD, (cttz v8i16:$vB)) // vctzh
+ (set v4i32:$vD, (cttz v4i32:$vB)) // vctzw
+ (set v2i64:$vD, (cttz v2i64:$vB)) // vctzd
+
+- Vector Extend Sign: vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
+ . vextsb2w:
+ (set v4i32:$vD, (sext v4i8:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 3
+ VR[VRT].word[i] ← EXTS32(VR[VRB].word[i].byte[3])
+ end
+
+ . vextsh2w:
+ (set v4i32:$vD, (sext v4i16:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 3
+ VR[VRT].word[i] ← EXTS32(VR[VRB].word[i].hword[1])
+ end
+
+ . vextsb2d
+ (set v2i64:$vD, (sext v2i8:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 1
+ VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].byte[7])
+ end
+
+ . vextsh2d
+ (set v2i64:$vD, (sext v2i16:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 1
+ VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].hword[3])
+ end
+
+ . vextsw2d
+ (set v2i64:$vD, (sext v2i32:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 1
+ VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].word[1])
+ end
+
+- Vector Integer Negate: vnegw vnegd
+ . Map to llvm ineg
+ (set v4i32:$rT, (ineg v4i32:$rA)) // vnegw
+ (set v2i64:$rT, (ineg v2i64:$rA)) // vnegd
+
+- Vector Parity Byte: vprtybw vprtybd vprtybq
+ . Use intrinsic:
+ (set v4i32:$rD, (int_ppc_altivec_vprtybw v4i32:$vB))
+ (set v2i64:$rD, (int_ppc_altivec_vprtybd v2i64:$vB))
+ (set v1i128:$rD, (int_ppc_altivec_vprtybq v1i128:$vB))
+
+- Vector (Bit) Permute (Right-indexed):
+ . vbpermd: Same as "vbpermq", use VX1_Int_Ty2:
+ VX1_Int_Ty2<1484, "vbpermd", int_ppc_altivec_vbpermd, v2i64, v2i64>;
+
+ . vpermr: use VA1a_Int_Ty3
+ VA1a_Int_Ty3<59, "vpermr", int_ppc_altivec_vpermr, v16i8, v16i8, v16i8>;
+
+- Vector Rotate Left Mask/Mask-Insert: vrlwnm vrlwmi vrldnm vrldmi
+ . Use intrinsic:
+ VX1_Int_Ty<389, "vrlwnm", int_ppc_altivec_vrlwnm, v4i32>;
+ VX1_Int_Ty<133, "vrlwmi", int_ppc_altivec_vrlwmi, v4i32>;
+ VX1_Int_Ty<453, "vrldnm", int_ppc_altivec_vrldnm, v2i64>;
+ VX1_Int_Ty<197, "vrldmi", int_ppc_altivec_vrldmi, v2i64>;
+
+- Vector Shift Left/Right: vslv vsrv
+ . Use intrinsic, don't map to llvm shl and lshr, because they have different
+ semantics, e.g. vslv:
+
+ do i = 0 to 15
+ sh ← VR[VRB].byte[i].bit[5:7]
+ VR[VRT].byte[i] ← src.byte[i:i+1].bit[sh:sh+7]
+ end
+
+ VR[VRT].byte[i] is composed of 2 bytes from src.byte[i:i+1]
+
+ . VX1_Int_Ty<1860, "vslv", int_ppc_altivec_vslv, v16i8>;
+ VX1_Int_Ty<1796, "vsrv", int_ppc_altivec_vsrv, v16i8>;
+
+- Vector Multiply-by-10 (& Write Carry) Unsigned Quadword:
+ vmul10uq vmul10cuq
+ . Use intrinsic:
+ VX1_Int_Ty<513, "vmul10uq", int_ppc_altivec_vmul10uq, v1i128>;
+ VX1_Int_Ty< 1, "vmul10cuq", int_ppc_altivec_vmul10cuq, v1i128>;
+
+- Vector Multiply-by-10 Extended (& Write Carry) Unsigned Quadword:
+ vmul10euq vmul10ecuq
+ . Use intrinsic:
+ VX1_Int_Ty<577, "vmul10euq", int_ppc_altivec_vmul10euq, v1i128>;
+ VX1_Int_Ty< 65, "vmul10ecuq", int_ppc_altivec_vmul10ecuq, v1i128>;
+
+- Decimal Convert From/to National/Zoned/Signed-QWord:
+ bcdcfn. bcdcfz. bcdctn. bcdctz. bcdcfsq. bcdctsq.
+ . Use intrinsics:
+ (set v1i128:$vD, (int_ppc_altivec_bcdcfno v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcdcfzo v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcdctno v1i128:$vB))
+ (set v1i128:$vD, (int_ppc_altivec_bcdctzo v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcdcfsqo v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcdctsqo v1i128:$vB))
+
+- Decimal Copy-Sign/Set-Sign: bcdcpsgn. bcdsetsgn.
+ . Use intrinsics:
+ (set v1i128:$vD, (int_ppc_altivec_bcdcpsgno v1i128:$vA, v1i128:$vB))
+ (set v1i128:$vD, (int_ppc_altivec_bcdsetsgno v1i128:$vB, i1:$PS))
+
+- Decimal Shift/Unsigned-Shift/Shift-and-Round: bcds. bcdus. bcdsr.
+ . Use intrinsics:
+ (set v1i128:$vD, (int_ppc_altivec_bcdso v1i128:$vA, v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcduso v1i128:$vA, v1i128:$vB))
+ (set v1i128:$vD, (int_ppc_altivec_bcdsro v1i128:$vA, v1i128:$vB, i1:$PS))
+
+ . Note! Only 1 byte of their VA is accessed, i.e. VA.byte[7]
+
+- Decimal (Unsigned) Truncate: bcdtrunc. bcdutrunc.
+ . Use intrinsics:
+ (set v1i128:$vD, (int_ppc_altivec_bcdtrunco v1i128:$vA, v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcdutrunco v1i128:$vA, v1i128:$vB))
+
+ . Note! Only 2 bytes of their VA are accessed, i.e. VA.hword[3] (VA.bit[48:63])
+
+VSX:
+- QP Copy Sign: xscpsgnqp
+ . Similar to xscpsgndp
+ . (set f128:$vT, (fcopysign f128:$vB, f128:$vA))
+
+- QP Absolute/Negative-Absolute/Negate: xsabsqp xsnabsqp xsnegqp
+ . Similar to xsabsdp/xsnabsdp/xsnegdp
+ . (set f128:$vT, (fabs f128:$vB)) // xsabsqp
+ (set f128:$vT, (fneg (fabs f128:$vB))) // xsnabsqp
+ (set f128:$vT, (fneg f128:$vB)) // xsnegqp
+
+- QP Add/Divide/Multiply/Subtract/Square-Root:
+ xsaddqp xsdivqp xsmulqp xssubqp xssqrtqp
+ . Similar to xsadddp
+ . isCommutable = 1
+ (set f128:$vT, (fadd f128:$vA, f128:$vB)) // xsaddqp
+ (set f128:$vT, (fmul f128:$vA, f128:$vB)) // xsmulqp
+
+ . isCommutable = 0
+ (set f128:$vT, (fdiv f128:$vA, f128:$vB)) // xsdivqp
+ (set f128:$vT, (fsub f128:$vA, f128:$vB)) // xssubqp
+ (set f128:$vT, (fsqrt f128:$vB)) // xssqrtqp
+
+- Round to Odd of QP Add/Divide/Multiply/Subtract/Square-Root:
+ xsaddqpo xsdivqpo xsmulqpo xssubqpo xssqrtqpo
+ . Similar to xsrsqrtedp??
+ def XSRSQRTEDP : XX2Form<60, 74,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrsqrtedp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfrsqrte f64:$XB))]>;
+
+ . Define DAG Node in PPCInstrInfo.td:
+ def PPCfaddrto: SDNode<"PPCISD::FADDRTO", SDTFPBinOp, []>;
+ def PPCfdivrto: SDNode<"PPCISD::FDIVRTO", SDTFPBinOp, []>;
+ def PPCfmulrto: SDNode<"PPCISD::FMULRTO", SDTFPBinOp, []>;
+ def PPCfsubrto: SDNode<"PPCISD::FSUBRTO", SDTFPBinOp, []>;
+ def PPCfsqrtrto: SDNode<"PPCISD::FSQRTRTO", SDTFPUnaryOp, []>;
+
+ DAG patterns of each instruction (PPCInstrVSX.td):
+ . isCommutable = 1
+ (set f128:$vT, (PPCfaddrto f128:$vA, f128:$vB)) // xsaddqpo
+ (set f128:$vT, (PPCfmulrto f128:$vA, f128:$vB)) // xsmulqpo
+
+ . isCommutable = 0
+ (set f128:$vT, (PPCfdivrto f128:$vA, f128:$vB)) // xsdivqpo
+ (set f128:$vT, (PPCfsubrto f128:$vA, f128:$vB)) // xssubqpo
+ (set f128:$vT, (PPCfsqrtrto f128:$vB)) // xssqrtqpo
+
+- QP (Negative) Multiply-{Add/Subtract}: xsmaddqp xsmsubqp xsnmaddqp xsnmsubqp
+ . Ref: xsmaddadp/xsmsubadp/xsnmaddadp/xsnmsubadp
+
+ . isCommutable = 1
+ // xsmaddqp
+ [(set f128:$vT, (fma f128:$vA, f128:$vB, f128:$vTi))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsmsubqp
+ [(set f128:$vT, (fma f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsnmaddqp
+ [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, f128:$vTi)))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsnmsubqp
+ [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+- Round to Odd of QP (Negative) Multiply-{Add/Subtract}:
+ xsmaddqpo xsmsubqpo xsnmaddqpo xsnmsubqpo
+ . Similar to xsrsqrtedp??
+
+ . Define DAG Node in PPCInstrInfo.td:
+ def PPCfmarto: SDNode<"PPCISD::FMARTO", SDTFPTernaryOp, []>;
+
+ It looks like we only need to define "PPCfmarto" for these instructions,
+ because according to PowerISA_V3.0, these instructions perform RTO on
+ fma's result:
+ xsmaddqp(o)
+ v ← bfp_MULTIPLY_ADD(src1, src3, src2)
+ rnd ← bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v)
+ result ← bfp_CONVERT_TO_BFP128(rnd)
+
+ xsmsubqp(o)
+ v ← bfp_MULTIPLY_ADD(src1, src3, bfp_NEGATE(src2))
+ rnd ← bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v)
+ result ← bfp_CONVERT_TO_BFP128(rnd)
+
+ xsnmaddqp(o)
+ v ← bfp_MULTIPLY_ADD(src1,src3,src2)
+ rnd ← bfp_NEGATE(bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v))
+ result ← bfp_CONVERT_TO_BFP128(rnd)
+
+ xsnmsubqp(o)
+ v ← bfp_MULTIPLY_ADD(src1, src3, bfp_NEGATE(src2))
+ rnd ← bfp_NEGATE(bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v))
+ result ← bfp_CONVERT_TO_BFP128(rnd)
+
+ DAG patterns of each instruction (PPCInstrVSX.td):
+ . isCommutable = 1
+ // xsmaddqpo
+ [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, f128:$vTi))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsmsubqpo
+ [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsnmaddqpo
+ [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, f128:$vTi)))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsnmsubqpo
+ [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+- QP Compare Ordered/Unordered: xscmpoqp xscmpuqp
+ . ref: XSCMPUDP
+ def XSCMPUDP : XX3Form_1<60, 35,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+ . No SDAG, intrinsic, builtin are required??
+ Or llvm fcmp order/unorder compare??
+
+- DP/QP Compare Exponents: xscmpexpdp xscmpexpqp
+ . No SDAG, intrinsic, builtin are required?
+
+- DP Compare ==, >=, >, !=: xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
+ . I checked existing instruction "XSCMPUDP". They are different in target
+ register. "XSCMPUDP" write to CR field, xscmp*dp write to VSX register
+
+ . Use intrinsic:
+ (set i128:$XT, (int_ppc_vsx_xscmpeqdp f64:$XA, f64:$XB))
+ (set i128:$XT, (int_ppc_vsx_xscmpgedp f64:$XA, f64:$XB))
+ (set i128:$XT, (int_ppc_vsx_xscmpgtdp f64:$XA, f64:$XB))
+ (set i128:$XT, (int_ppc_vsx_xscmpnedp f64:$XA, f64:$XB))
+
+- Vector Compare Not Equal: xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
+ . Similar to xvcmpeqdp:
+ defm XVCMPEQDP : XX3Form_Rcr<60, 99,
+ "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare,
+ int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>;
+
+ . So we should use "XX3Form_Rcr" to implement the intrinsic
+
+- Convert DP -> QP: xscvdpqp
+ . Similar to XSCVDPSP:
+ def XSCVDPSP : XX2Form<60, 265,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpsp $XT, $XB", IIC_VecFP, []>;
+ . So, No SDAG, intrinsic, builtin are required??
+
+- Round & Convert QP -> DP (dword[1] is set to zero): xscvqpdp xscvqpdpo
+ . Similar to XSCVDPSP
+ . No SDAG, intrinsic, builtin are required??
+
+- Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero):
+ xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
+ . According to PowerISA_V3.0, these are similar to "XSCVDPSXDS", "XSCVDPSXWS",
+ "XSCVDPUXDS", "XSCVDPUXWS"
+
+ . DAG patterns:
+ (set f128:$XT, (PPCfctidz f128:$XB)) // xscvqpsdz
+ (set f128:$XT, (PPCfctiwz f128:$XB)) // xscvqpswz
+ (set f128:$XT, (PPCfctiduz f128:$XB)) // xscvqpudz
+ (set f128:$XT, (PPCfctiwuz f128:$XB)) // xscvqpuwz
+
+- Convert (Un)Signed DWord -> QP: xscvsdqp xscvudqp
+ . Similar to XSCVSXDSP
+ . (set f128:$XT, (PPCfcfids f64:$XB)) // xscvsdqp
+ (set f128:$XT, (PPCfcfidus f64:$XB)) // xscvudqp
+
+- (Round &) Convert DP <-> HP: xscvdphp xscvhpdp
+ . Similar to XSCVDPSP
+ . No SDAG, intrinsic, builtin are required??
+
+- Vector HP <-> SP: xvcvhpsp xvcvsphp
+ . Similar to XVCVDPSP:
+ def XVCVDPSP : XX2Form<60, 393,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpsp $XT, $XB", IIC_VecFP, []>;
+ . No SDAG, intrinsic, builtin are required??
+
+- Round to Quad-Precision Integer: xsrqpi xsrqpix
+ . These are combinations of "XSRDPI", "XSRDPIC", "XSRDPIM", .., because you
+ need to specify the rounding mode in the instruction
+ . Provide builtin?
+ (set f128:$vT, (int_ppc_vsx_xsrqpi f128:$vB))
+ (set f128:$vT, (int_ppc_vsx_xsrqpix f128:$vB))
+
+- Round Quad-Precision to Double-Extended Precision (fp80): xsrqpxp
+ . Provide builtin?
+ (set f128:$vT, (int_ppc_vsx_xsrqpxp f128:$vB))
+
+Fixed Point Facility:
+
+- Exploit cmprb and cmpeqb (perhaps for something like
+ isalpha/isdigit/isupper/islower and isspace respectively). This can
+ perhaps be done through a builtin.
+
+- Provide testing for cnttz[dw]
+- Insert Exponent DP/QP: xsiexpdp xsiexpqp
+ . Use intrinsic?
+ . xsiexpdp:
+ // Note: rA and rB are the unsigned integer value.
+ (set f128:$XT, (int_ppc_vsx_xsiexpdp i64:$rA, i64:$rB))
+
+ . xsiexpqp:
+ (set f128:$vT, (int_ppc_vsx_xsiexpqp f128:$vA, f64:$vB))
+
+- Extract Exponent/Significand DP/QP: xsxexpdp xsxsigdp xsxexpqp xsxsigqp
+ . Use intrinsic?
+ . (set i64:$rT, (int_ppc_vsx_xsxexpdp f64:$XB)) // xsxexpdp
+ (set i64:$rT, (int_ppc_vsx_xsxsigdp f64:$XB)) // xsxsigdp
+ (set f128:$vT, (int_ppc_vsx_xsxexpqp f128:$vB)) // xsxexpqp
+ (set f128:$vT, (int_ppc_vsx_xsxsigqp f128:$vB)) // xsxsigqp
+
+- Vector Insert Word: xxinsertw
+ - Useful for inserting f32/i32 elements into vectors (the element to be
+ inserted needs to be prepared)
+ . Note: llvm has insertelem in "Vector Operations"
+ ; yields <n x <ty>>
+ <result> = insertelement <n x <ty>> <val>, <ty> <elt>, <ty2> <idx>
+
+ But how to map to it??
+ [(set v1f128:$XT, (insertelement v1f128:$XTi, f128:$XB, i4:$UIMM))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+
+ . Or use intrinsic?
+ (set v1f128:$XT, (int_ppc_vsx_xxinsertw v1f128:$XTi, f128:$XB, i4:$UIMM))
+
+- Vector Extract Unsigned Word: xxextractuw
+ - Not useful for extraction of f32 from v4f32 (the current pattern is better -
+ shift->convert)
+ - It is useful for (uint_to_fp (vector_extract v4i32, N))
+ - Unfortunately, it can't be used for (sint_to_fp (vector_extract v4i32, N))
+ . Note: llvm has extractelement in "Vector Operations"
+ ; yields <ty>
+ <result> = extractelement <n x <ty>> <val>, <ty2> <idx>
+
+ How to map to it??
+ [(set f128:$XT, (extractelement v1f128:$XB, i4:$UIMM))]
+
+ . Or use intrinsic?
+ (set f128:$XT, (int_ppc_vsx_xxextractuw v1f128:$XB, i4:$UIMM))
+
+- Vector Insert Exponent DP/SP: xviexpdp xviexpsp
+ . Use intrinsic
+ (set v2f64:$XT, (int_ppc_vsx_xviexpdp v2f64:$XA, v2f64:$XB))
+ (set v4f32:$XT, (int_ppc_vsx_xviexpsp v4f32:$XA, v4f32:$XB))
+
+- Vector Extract Exponent/Significand DP/SP: xvxexpdp xvxexpsp xvxsigdp xvxsigsp
+ . Use intrinsic
+ (set v2f64:$XT, (int_ppc_vsx_xvxexpdp v2f64:$XB))
+ (set v4f32:$XT, (int_ppc_vsx_xvxexpsp v4f32:$XB))
+ (set v2f64:$XT, (int_ppc_vsx_xvxsigdp v2f64:$XB))
+ (set v4f32:$XT, (int_ppc_vsx_xvxsigsp v4f32:$XB))
+
+- Test Data Class SP/DP/QP: xststdcsp xststdcdp xststdcqp
+ . No SDAG, intrinsic, builtin are required?
+ Because it seems that we have no way to map BF field?
+
+ Instruction Form: [PO T XO B XO BX TX]
+ Asm: xststd* BF,XB,DCMX
+
+ BF is an index to CR register field.
+
+- Vector Test Data Class SP/DP: xvtstdcsp xvtstdcdp
+ . Use intrinsic
+ (set v4f32:$XT, (int_ppc_vsx_xvtstdcsp v4f32:$XB, i7:$DCMX))
+ (set v2f64:$XT, (int_ppc_vsx_xvtstdcdp v2f64:$XB, i7:$DCMX))
+
+- Maximum/Minimum Type-C/Type-J DP: xsmaxcdp xsmaxjdp xsmincdp xsminjdp
+ . PowerISA_V3.0:
+ "xsmaxcdp can be used to implement the C/C++/Java conditional operation
+ (x>y)?x:y for single-precision and double-precision arguments."
+
+ Note! c type and j type have different behavior when:
+ 1. Either input is NaN
+ 2. Both inputs are +-Infinity, +-Zero
+
+ . ctype maps to llvm fmaxnum/fminnum
+ jtype uses an intrinsic
+
+ . xsmaxcdp xsmincdp
+ (set f64:$XT, (fmaxnum f64:$XA, f64:$XB))
+ (set f64:$XT, (fminnum f64:$XA, f64:$XB))
+
+ . xsmaxjdp xsminjdp
+ (set f64:$XT, (int_ppc_vsx_xsmaxjdp f64:$XA, f64:$XB))
+ (set f64:$XT, (int_ppc_vsx_xsminjdp f64:$XA, f64:$XB))
+
+- Vector Byte-Reverse H/W/D/Q Word: xxbrh xxbrw xxbrd xxbrq
+ . Use intrinsic
+ (set v8i16:$XT, (int_ppc_vsx_xxbrh v8i16:$XB))
+ (set v4i32:$XT, (int_ppc_vsx_xxbrw v4i32:$XB))
+ (set v2i64:$XT, (int_ppc_vsx_xxbrd v2i64:$XB))
+ (set v1i128:$XT, (int_ppc_vsx_xxbrq v1i128:$XB))
+
+- Vector Permute: xxperm xxpermr
+ . I have checked "PPCxxswapd" in PPCInstrVSX.td, but they are different
+ . Use intrinsic
+ (set v16i8:$XT, (int_ppc_vsx_xxperm v16i8:$XA, v16i8:$XB))
+ (set v16i8:$XT, (int_ppc_vsx_xxpermr v16i8:$XA, v16i8:$XB))
+
+- Vector Splat Immediate Byte: xxspltib
+ . Similar to XXSPLTW:
+ def XXSPLTW : XX2Form_2<60, 164,
+ (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
+ "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
+
+ . No SDAG, intrinsic, builtin are required?
+
+- Load/Store Vector: lxv stxv
+ . Has likely SDAG match:
+ (set v?:$XT, (load ix16addr:$src))
+ (set v?:$XT, (store ix16addr:$dst))
+
+ . Need to define ix16addr in PPCInstrInfo.td
+ ix16addr: 16-byte aligned, see "def memrix16" in PPCInstrInfo.td
+
+- Load/Store Vector Indexed: lxvx stxvx
+ . Has likely SDAG match:
+ (set v?:$XT, (load xoaddr:$src))
+ (set v?:$XT, (store xoaddr:$dst))
+
+- Load/Store DWord: lxsd stxsd
+ . Similar to lxsdx/stxsdx:
+ def LXSDX : XX1Form<31, 588,
+ (outs vsfrc:$XT), (ins memrr:$src),
+ "lxsdx $XT, $src", IIC_LdStLFD,
+ [(set f64:$XT, (load xoaddr:$src))]>;
+
+ . (set f64:$XT, (load ixaddr:$src))
+ (set f64:$XT, (store ixaddr:$dst))
+
+- Load/Store SP, with conversion from/to DP: lxssp stxssp
+ . Similar to lxsspx/stxsspx:
+ def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
+ "lxsspx $XT, $src", IIC_LdStLFD,
+ [(set f32:$XT, (load xoaddr:$src))]>;
+
+ . (set f32:$XT, (load ixaddr:$src))
+ (set f32:$XT, (store ixaddr:$dst))
+
+- Load as Integer Byte/Halfword & Zero Indexed: lxsibzx lxsihzx
+ . Similar to lxsiwzx:
+ def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
+ "lxsiwzx $XT, $src", IIC_LdStLFD,
+ [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
+
+ . (set f64:$XT, (PPClfiwzx xoaddr:$src))
+
+- Store as Integer Byte/Halfword Indexed: stxsibx stxsihx
+ . Similar to stxsiwx:
+ def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
+ "stxsiwx $XT, $dst", IIC_LdStSTFD,
+ [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
+
+ . (PPCstfiwx f64:$XT, xoaddr:$dst)
+
+- Load Vector Halfword*8/Byte*16 Indexed: lxvh8x lxvb16x
+ . Similar to lxvd2x/lxvw4x:
+ def LXVD2X : XX1Form<31, 844,
+ (outs vsrc:$XT), (ins memrr:$src),
+ "lxvd2x $XT, $src", IIC_LdStLFD,
+ [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
+
+ . (set v8i16:$XT, (int_ppc_vsx_lxvh8x xoaddr:$src))
+ (set v16i8:$XT, (int_ppc_vsx_lxvb16x xoaddr:$src))
+
+- Store Vector Halfword*8/Byte*16 Indexed: stxvh8x stxvb16x
+ . Similar to stxvd2x/stxvw4x:
+ def STXVD2X : XX1Form<31, 972,
+ (outs), (ins vsrc:$XT, memrr:$dst),
+ "stxvd2x $XT, $dst", IIC_LdStSTFD,
+ [(store v2f64:$XT, xoaddr:$dst)]>;
+
+ . (store v8i16:$XT, xoaddr:$dst)
+ (store v16i8:$XT, xoaddr:$dst)
+
+- Load/Store Vector (Left-justified) with Length: lxvl lxvll stxvl stxvll
+ . Likely needs an intrinsic
+ . (set v?:$XT, (int_ppc_vsx_lxvl xoaddr:$src))
+ (set v?:$XT, (int_ppc_vsx_lxvll xoaddr:$src))
+
+ . (int_ppc_vsx_stxvl xoaddr:$dst))
+ (int_ppc_vsx_stxvll xoaddr:$dst))
+
+- Load Vector Word & Splat Indexed: lxvwsx
+ . Likely needs an intrinsic
+ . (set v?:$XT, (int_ppc_vsx_lxvwsx xoaddr:$src))
+
+Atomic operations (l[dw]at, st[dw]at):
+- Provide custom lowering for common atomic operations to use these
+ instructions with the correct Function Code
+- Ensure the operands are in the correct register (i.e. RT+1, RT+2)
+- Provide builtins since not all FC's necessarily have an existing LLVM
+ atomic operation
+
+Load Doubleword Monitored (ldmx):
+- Investigate whether there are any uses for this. It seems to be related to
+ Garbage Collection so it isn't likely to be all that useful for most
+ languages we deal with.
+
+Move to CR from XER Extended (mcrxrx):
+- Is there a use for this in LLVM?
+
+Fixed Point Facility:
+
+- Copy-Paste Facility: copy copy_first cp_abort paste paste. paste_last
+ . Use intrinsics:
+ (int_ppc_copy_first i32:$rA, i32:$rB)
+ (int_ppc_copy i32:$rA, i32:$rB)
+
+ (int_ppc_paste i32:$rA, i32:$rB)
+ (int_ppc_paste_last i32:$rA, i32:$rB)
+
+ (int_cp_abort)
+
+- Message Synchronize: msgsync
+- SLB*: slbieg slbsync
+- stop
+ . No intrinsics
diff --git a/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
new file mode 100644
index 000000000000..a637dd11f810
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -0,0 +1,37 @@
+//===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getThePPC32Target() {
+ static Target ThePPC32Target;
+ return ThePPC32Target;
+}
+Target &llvm::getThePPC64Target() {
+ static Target ThePPC64Target;
+ return ThePPC64Target;
+}
+Target &llvm::getThePPC64LETarget() {
+ static Target ThePPC64LETarget;
+ return ThePPC64LETarget;
+}
+
+extern "C" void LLVMInitializePowerPCTargetInfo() {
+ RegisterTarget<Triple::ppc, /*HasJIT=*/true> X(getThePPC32Target(), "ppc32",
+ "PowerPC 32");
+
+ RegisterTarget<Triple::ppc64, /*HasJIT=*/true> Y(getThePPC64Target(), "ppc64",
+ "PowerPC 64");
+
+ RegisterTarget<Triple::ppc64le, /*HasJIT=*/true> Z(
+ getThePPC64LETarget(), "ppc64le", "PowerPC 64 LE");
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/p9-instrs.txt b/contrib/llvm/lib/Target/PowerPC/p9-instrs.txt
new file mode 100644
index 000000000000..a70582aca398
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/p9-instrs.txt
@@ -0,0 +1,442 @@
+Content:
+========
+. Remaining Instructions (Total 56 Instructions, including 2 unknown instructions)
+. Done (Total 155 Instructions: 101 VSX, 54 Altivec)
+
+//------------------------------------------------------------------------------
+//. Remaining Instructions
+//------------------------------------------------------------------------------
+GCC reference: https://sourceware.org/ml/binutils/2015-11/msg00071.html
+
+// Add PC Immediate Shifted DX-form p69
+[PO RT d1 d0 XO d2] addpcis RT,D
+ subpcis Rx,value = addpcis Rx,-value
+
+// 6.17.2 Decimal Integer Format Conversion Instructions
+
+// Decimal Convert From National VX-form p352
+[PO VRT EO VRB 1 PS XO] bcdcfn. VRT,VRB,PS
+
+// Decimal Convert From Zoned VX-form p353
+[PO VRT EO VRB 1 PS XO] bcdcfz. VRT,VRB,PS
+
+// Decimal Convert To National VX-form p354
+[PO VRT EO VRB 1 / XO] bcdctn. VRT,VRB
+
+// Decimal Convert To Zoned VX-form p355
+[PO VRT EO VRB 1 PS XO] bcdctz. VRT,VRB,PS
+
+// Decimal Convert From Signed Quadword VX-form p356
+[PO VRT EO VRB 1 PS XO] bcdcfsq. VRT,VRB,PS
+
+// Decimal Convert To Signed Quadword VX-form p356
+[PO VRT EO VRB 1 / XO] bcdctsq. VRT,VRB
+
+// 6.17.3 Decimal Integer Sign Manipulation Instructions
+
+// Decimal Copy Sign VX-form p358
+[PO VRT VRA VRB XO] bcdcpsgn. VRT,VRA,VRB
+
+// Decimal Set Sign VX-form p358
+[PO VRT EO VRB 1 PS XO] bcdsetsgn. VRT,VRB,PS
+
+// Decimal Shift VX-form p359
+[PO VRT VRA VRB 1 PS XO] bcds. VRT,VRA,VRB,PS
+
+// Decimal Unsigned Shift VX-form p360
+[PO VRT VRA VRB 1 / XO] bcdus. VRT,VRA,VRB
+
+// Decimal Shift and Round VX-form p361
+[PO VRT VRA VRB 1 PS XO] bcdsr. VRT,VRA,VRB,PS
+
+// 6.17.5 Decimal Integer Truncate Instructions
+
+// Decimal Truncate VX-form p362
+[PO VRT VRA VRB 1 PS XO] bcdtrunc. VRT,VRA,VRB,PS
+
+// Decimal Unsigned Truncate VX-form p363
+[PO VRT VRA VRB 1 / XO] bcdutrunc. VRT,VRA,VRB
+
+// 3.3.10.1 Character-Type Compare Instructions
+
+// Compare Ranged Byte X-form p87
+[PO BF / L RA RB XO /] cmprb BF,L,RA,RB
+
+// Compare Equal Byte X-form p88
+[PO BF // RA RB XO /] cmpeqb BF,RA,RB
+
+// 3.3.13 Fixed-Point Logical Instructions
+
+// Count Trailing Zeros Word X-form p95
+[PO RS RA /// XO Rc] cnttzw(.) RA,RS
+
+// 3.3.13.1 64-bit Fixed-Point Logical Instructions
+
+// Count Trailing Zeros Doubleword X-form p98
+[PO RS RA /// XO Rc] cnttzd(.) RA,RS
+
+// 4.4 Copy-Paste Facility
+
+// Copy X-form p858
+[PO /// L RA RB XO /] copy RA,RB,L
+ copy_first = copy RA, RB, 1
+// CP_Abort p860
+[PO /// /// /// XO /] cp_abort
+
+// Paste p859
+[PO /// L RA RB XO Rc] paste(.) RA,RB,L
+ paste_last = paste RA,RB,1
+
+// 3.3.9 Fixed-Point Arithmetic Instructions
+
+// Deliver A Random Number X-form p79
+[PO RT /// L /// XO /] darn RT,L
+
+// Multiply-Add High Doubleword VA-form p81
+[PO RT RA RB RC XO] maddhd RT,RA,RB,RC
+
+// Multiply-Add High Doubleword Unsigned VA-form p81
+[PO RT RA RB RC XO] maddhdu RT,RA,RB,RC
+
+// Multiply-Add Low Doubleword VA-form p81
+[PO RT RA RB RC XO] maddld RT,RA,RB,RC
+
+// Modulo Signed Word X-form p76
+[PO RT RA RB XO /] modsw RT,RA,RB
+
+// Modulo Unsigned Word X-form p76
+[PO RT RA RB XO /] moduw RT,RA,RB
+
+// Modulo Signed Doubleword X-form p84
+[PO RT RA RB XO /] modsd RT,RA,RB
+
+// Modulo Unsigned Doubleword X-form p84
+[PO RT RA RB XO /] modud RT,RA,RB
+
+
+// DFP Test Significance Immediate [Quad] X-form p204
+[PO BF / UIM FRB XO /] dtstsfi BF,UIM,FRB
+[PO BF / UIM FRBp XO /] dtstsfiq BF,UIM,FRBp
+
+// 3.3.14.2.1 64-bit Fixed-Point Shift Instructions
+
+// Extend-Sign Word and Shift Left Immediate XS-form p109
+[PO RS RA sh XO sh Rc] extswsli(.) RA,RS,SH
+
+// 4.5.1 Load Atomic
+
+// Load Word Atomic X-form p864
+[PO RT RA FC XO /] lwat RT,RA,FC
+
+// Load Doubleword Atomic X-form p864
+[PO RT RA FC XO /] ldat RT,RA,FC
+
+// 4.5.2 Store Atomic
+
+// Store Word Atomic X-form p866
+[PO RS RA FC XO /] stwat RS,RA,FC
+
+// Store Doubleword Atomic X-form p866
+[PO RS RA FC XO /] stdat RS,RA,FC
+
+// 3.3.2.1 64-bit Fixed-Point Load Instructions
+
+// Load Doubleword Monitored Indexed X-form p54
+[PO RT RA RB XO /] ldmx RT,RA,RB
+
+// 3.3.16 Move To/From Vector-Scalar Register Instructions
+
+// Move From VSR Lower Doubleword XX1-form p111
+[PO S RA /// XO SX] mfvsrld RA,XS
+
+// Move To VSR Double Doubleword XX1-form p114
+[PO T RA RB XO TX] mtvsrdd XT,RA,RB
+
+// Move To VSR Word & Splat XX1-form p115
+[PO T RA /// XO TX] mtvsrws XT,RA
+
+// Move to CR from XER Extended X-form p119
+[PO BF // /// /// XO /] mcrxrx BF
+
+// Set Boolean X-form p121
+[PO RT BFA // /// XO /] setb RT,BFA
+
+// Message Synchronize X-form p1126
+[PO /// /// /// XO /] msgsync
+
+// SLB Invalidate Entry Global X-form p1026
+[PO RS /// RB XO /] slbieg RS,RB
+
+// SLB Synchronize X-form p1031
+[PO /// /// /// XO /] slbsync
+
+// 3.3.2.1 Power-Saving Mode Instruction
+
+// stop XL-form p957
+[PO /// /// /// XO /] stop
+
+// 4.6.4 Wait Instruction
+// Wait X-form p880
+[PO /// WC /// /// XO /] wait
+
+// Unknown Instructions:
+urfid
+- gcc's implementation:
+ {"urfid", XL(19,306), 0xffffffff, POWER9, PPCNONE, {0}},
+ (4c 00 02 64|64 02 00 4c) urfid
+
+rmieg
+- gcc's implementation:
+ {"rmieg", X(31,882), XRTRA_MASK, POWER9, PPCNONE, {RB}},
+ (7c 00 f6 e4|e4 f6 00 7c) rmieg r30
+
+//------------------------------------------------------------------------------
+//. Done:
+//------------------------------------------------------------------------------
+
+//======================================
+"vsx instructions"
+
+//--------------------------------------
+"7.6.1.2.1 VSX Scalar Move Instructions"
+// VSX Scalar Quad-Precision Move Instructions
+
+// VSX Scalar Copy Sign Quad-Precision X-form p.553
+[PO VRT VRA VRB XO /] xscpsgnqp
+
+// VSX Scalar Absolute Quad-Precision X-form 531
+// VSX Scalar Negate Quad-Precision X-form 627
+// VSX Scalar Negative Absolute Quad-Precision X-form 626
+[PO VRT XO VRB XO /] xsabsqp xsnegqp xsnabsqp
+
+//--------------------------------------
+"7.6.1.3 VSX Floating-Point Arithmetic Instructions"
+
+// VSX Scalar Quad-Precision Elementary Arithmetic
+
+// VSX Scalar Add Quad-Precision [using round to Odd] X-form 539
+// VSX Scalar Divide Quad-Precision [using round to Odd] X-form 584
+// VSX Scalar Multiply Quad-Precision [using round to Odd] X-form 622
+[PO VRT VRA VRB XO RO] xsaddqp xsaddqpo xsdivqp xsdivqpo xsmulqp xsmulqpo
+
+// VSX Scalar Square Root Quad-Precision [using round to Odd] X-form 662
+// VSX Scalar Subtract Quad-Precision [using round to Odd] X-form 667
+ xssubqp xssubqpo
+
+[PO VRT XO VRB XO RO] xssqrtqp xssqrtqpo
+
+// VSX Scalar Quad-Precision Multiply-Add Arithmetic Instructions
+
+// VSX Scalar Multiply-Add Quad-Precision [using round to Odd] X-form 596
+// VSX Scalar Multiply-Subtract Quad-Precision [using round to Odd] X-form 617
+// VSX Scalar Negative Multiply-Add Quad-Precision [using round to Odd] X-form 636
+// VSX Scalar Negative Multiply-Subtract Quad-Precision [using round to Odd]
+// X-form 645
+[PO VRT VRA VRB XO RO] xsmaddqp xsmaddqpo xsmsubqp xsmsubqpo
+ xsnmaddqp xsnmaddqpo xsnmsubqp xsnmsubqpo
+
+22
+//--------------------------------------
+"7.6.1.4 VSX Floating-Point Compare Instructions"
+
+// VSX Scalar Quad-Precision Compare Instructions
+
+// VSX Scalar Compare Ordered Quad-Precision X-form 549
+// VSX Scalar Compare Unordered Quad-Precision X-form 552
+[PO BF // VRA VRB XO /] xscmpoqp xscmpuqp
+
+"7.6.1.8 VSX Scalar Floating-Point Support Instructions"
+// VSX Scalar Compare Exponents Quad-Precision X-form p. 541 542
+[PO BF // A B XO AX BX /] xscmpexpdp
+[PO BF // VRA VRB XO /] xscmpexpqp
+
+// VSX Scalar Compare DP, XX3-form, p.543 544 545
+// VSX Scalar Compare Equal Double-Precision,
+[PO T A B XO AX BX TX] xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
+
+// VSX Vector Compare Not Equal Double-Precision XX3-form 691
+[PO T A B Rc XO AX BX TX] xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
+
+//--------------------------------------
+"7.6.1.5 VSX FP-FP Conversion Instructions"
+// VSX Scalar Quad-Precision Floating-Point Conversion Instructions
+
+// VSX Scalar round & Convert Quad-Precision format to Double-Precision format
+// [using round to Odd] X-form 567
+[PO VRT XO VRB XO /] xscvqpdp xscvqpdpo (actually [PO VRT XO VRB XO RO])
+[PO VRT XO VRB XO /] xscvdpqp
+
+// VSX Scalar Quad-Precision Convert to Integer Instructions
+
+// VSX Scalar truncate & Convert Quad-Precision format to Signed Doubleword format
+// 568 570 572 574
+[PO VRT XO VRB XO /] xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
+576 = 580 xscvsdqp xscvudqp
+
+"7.6.1.7 VSX Round to Floating-Point Integer Instructions"
+// VSX Scalar round & Convert Double-Precision format to Half-Precision format
+// XX2-form 554 566
+[PO T XO B XO BX TX] xscvdphp xscvhpdp
+
+// VSX Vector Convert Half-Precision format to Single-Precision format
+// XX2-form 703 705
+[PO T XO B XO BX TX] xvcvhpsp xvcvsphp
+
+// VSX Scalar Round to Quad-Precision Integer [with Inexact] Z23-form 654
+[PO VRT /// R VRB RMC XO EX] xsrqpi xsrqpix
+
+// VSX Scalar Round Quad-Precision to Double-Extended Precision Z23-form 656
+[PO VRT /// R VRB RMC XO /] xsrqpxp
+def XSRQPXP : Z23Form_1<63, 37,
+ (outs vrrc:$vT), (ins u5imm:$R, vrrc:$vB, u2imm:$RMC),
+ "xsrqpxp $vT, $R, $vB, $RMC", IIC_VecFP, []>;
+
+27~28
+//--------------------------------------
+// VSX Scalar Insert Exponent Double-Precision X-form 588
+// VSX Scalar Insert Exponent Quad-Precision X-form 589
+[PO VT rA rB XO /] xsiexpdp
+[PO VRT VRA VRB XO /] xsiexpqp
+
+// VSX Vector Insert Exponent Double-Precision XX3-form 722
+[PO T A B XO AX BX TX] xviexpdp xviexpsp
+
+// VSX Vector Extract Unsigned Word XX2-form 788
+// VSX Vector Insert Word XX2-form
+[PO T / UIM B XO BX TX] xxextractuw xxinsertw
+
+// VSX Scalar Extract Exponent Double-Precision XX2-form 676
+[PO BF DCMX B XO BX /]
+[PO T XO B XO BX /] xsxexpdp xsxsigdp
+// X-form
+[PO VRT XO VRB XO /] xsxexpqp xsxsigqp
+
+// VSX Vector Extract Exponent Double-Precision XX2-form 784
+[PO T XO B XO BX TX] xvxexpdp xvxexpsp
+
+// VSX Vector Extract Significand Double-Precision XX2-form 785
+[PO T XO B XO BX TX] xvxsigdp xvxsigsp
+
+//--------------------------------------
+// VSX Scalar Test Data Class Double-Precision XX2-form p673
+// VSX Scalar Test Data Class Quad-Precision X-form 674
+// VSX Scalar Test Data Class Single-Precision XX2-form 675
+[PO BF DCMX B XO BX /] xststdcdp xststdcsp
+[PO BF DCMX VRB XO /] xststdcqp
+
+// VSX Vector Test Data Class Double-Precision XX2-form 782 783
+[PO T dx B XO dc XO dm BX TX] xvtstdcdp xvtstdcsp
+
+//--------------------------------------
+// VSX Scalar Maximum Type-C Double-Precision XX3-form 601 ~ 609
+[PO T A B XO AX BX TX] xsmaxcdp xsmaxjdp xsmincdp xsminjdp
+
+//--------------------------------------
+// VSX Vector Byte-Reverse Doubleword XX2-form 786 787
+[PO T XO B XO BX TX] xxbrd xxbrh xxbrq xxbrw
+
+// VSX Vector Permute XX3-form 794
+[PO T A B XO AX BX TX] xxperm xxpermr
+
+// VSX Vector Splat Immediate Byte 796 x-form
+[PO T EO IMM8 XO TX] xxspltib <= signed or unsigned?
+
+30
+//--------------------------------------
+// Load VSX Vector DQ-form 511
+[PO T RA DQ TX XO] lxv
+
+// Store VSX Vector DQ-form 526
+[PO S RA DQ SX XO] stxv
+
+// Load VSX Scalar Doubleword DS-form 499
+// Load VSX Scalar Single DS-form 504
+[PO VRT RA DS XO] lxsd lxssp
+
+// Store VSX Scalar Doubleword DS-form 517
+// Store VSX Scalar Single DS-form 520
+[PO VRT RA DS XO] stxsd stxssp
+
+
+// Load VSX Vector Indexed X-form 511
+// Load VSX Scalar as Integer Byte & Zero Indexed X-form 501
+// Load VSX Vector Byte*16 Indexed X-form 506
+// Load VSX Vector with Length X-form 508
+// Load VSX Vector Left-justified with Length X-form 510
+// Load VSX Vector Halfword*8 Indexed X-form 514
+// Load VSX Vector Word & Splat Indexed X-form 516
+[PO T RA RB XO TX] lxvx lxsibzx lxsihzx lxvb16x lxvl lxvll lxvh8x lxvwsx
+
+// Store VSX Scalar as Integer Byte Indexed X-form 518
+// Store VSX Scalar as Integer Halfword Indexed X-form 518
+// Store VSX Vector Byte*16 Indexed X-form 522
+// Store VSX Vector Halfword*8 Indexed X-form 524
+// Store VSX Vector with Length X-form 526
+// Store VSX Vector Left-justified with Length X-form 528
+// Store VSX Vector Indexed X-form 529
+[PO S RA RB XO SX] stxsibx stxsihx stxvb16x stxvh8x stxvl stxvll stxvx
+
+21
+
+//--------------------------------------
+". vector instructions"
+
+[1] PowerISA-v3.0 p.933 - Table 1, and Chapter 6. Vector Facility (altivec)
+[2] https://sourceware.org/ml/binutils/2015-11/msg00071.html
+
+//--------------------------------------
+New patch:
+// vector bit, p.367, 6.16 Vector Bit Permute Instruction
+[PO VRT VRA VRB XO] vbpermd, (existing: vbpermq)
+
+// vector permute, p.280
+[PO VRT VRA VRB VRC XO] vpermr
+
+// vector rotate left, p.341
+[PO VRT VRA VRB XO] vrlwnm vrlwmi vrldnm vrldmi
+
+// vector shift, p.285
+[PO VRT VRA VRB XO] vslv vsrv
+
+// vector multiply-by-10, p.375
+[PO VRT VRA /// XO] vmul10cuq vmul10uq
+[PO VRT VRA VRB XO] vmul10ecuq vmul10euq
+
+12
+//--------------------------------------
+http://reviews.llvm.org/D15887 + ext + neg + prty - vbpermd
+// vector count leading/trailing zero
+. new vx-form: p.31, 1.6.14 VX-FORM
+[PO RT EO VRB XO] vclzlsbb vctzlsbb (p.363)
+
+// Vector Count Trailing Zeros Instructions, 362
+[PO VRT EO VRB XO] vctzb vctzh vctzw vctzd (v16i8 v8i16 v4i32 v2i64)
+
+// vector extend sign (p.314)
+[PO VRT EO VRB XO] vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
+
+// vector negate, p.313
+[PO VRT EO VRB XO] vnegd vnegw
+
+// vector parity, p.335
+[PO VRT EO VRB XO] vprtybd vprtybq vprtybw
+
+16
+//--------------------------------------
+// vector compare, p.330
+[PO VRT VRA VRB RC XO] vcmpneb vcmpneb. vcmpneh vcmpneh. vcmpnew vcmpnew.
+ vcmpnezb vcmpnezb. vcmpnezh vcmpnezh. vcmpnezw vcmpnezw.
+12
+//--------------------------------------
+http://reviews.llvm.org/D15917 + insert
+// vector extract (p.287) ref: vspltb (v2.07, p.227)
+// vector insert, p.288
+[PO VRT / UIM VRB XO] vinsertb vinsertd vinserth vinsertw
+
+// Vector Extract Unsigned
+[PO VRT / UIM VRB XO] vextractub vextractuh vextractuw vextractd
+
+// p.364: Vector Extract Unsigned Left/Right-Indexed
+[PO RT RA VRB XO] vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx
+
+14
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
new file mode 100644
index 000000000000..f8ef142255c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -0,0 +1,91 @@
+//===-- RISCVAsmBackend.cpp - RISCV Assembler Backend ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class RISCVAsmBackend : public MCAsmBackend {
+ uint8_t OSABI;
+ bool Is64Bit;
+
+public:
+ RISCVAsmBackend(uint8_t OSABI, bool Is64Bit)
+ : MCAsmBackend(), OSABI(OSABI), Is64Bit(Is64Bit) {}
+ ~RISCVAsmBackend() override {}
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ return false;
+ }
+
+ unsigned getNumFixupKinds() const override { return 1; }
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {
+
+ llvm_unreachable("RISCVAsmBackend::relaxInstruction() unimplemented");
+ }
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ // Once support for the compressed instruction set is added, we will be able
+ // to conditionally support 16-bit NOPs
+ if ((Count % 4) != 0)
+ return false;
+
+ // The canonical nop on RISC-V is addi x0, x0, 0
+ for (uint64_t i = 0; i < Count; i += 4)
+ OW->write32(0x13);
+
+ return true;
+}
+
+void RISCVAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
+ return;
+}
+
+MCObjectWriter *
+RISCVAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+ return createRISCVELFObjectWriter(OS, OSABI, Is64Bit);
+}
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createRISCVAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options) {
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+ return new RISCVAsmBackend(OSABI, TT.isArch64Bit());
+}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
new file mode 100644
index 000000000000..4f085d31a267
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -0,0 +1,47 @@
+//===-- RISCVELFObjectWriter.cpp - RISCV ELF Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class RISCVELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ RISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit);
+
+ ~RISCVELFObjectWriter() override;
+
+protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+}
+
+RISCVELFObjectWriter::RISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit)
+ : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_RISCV,
+ /*HasRelocationAddend*/ false) {}
+
+RISCVELFObjectWriter::~RISCVELFObjectWriter() {}
+
+unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ llvm_unreachable("invalid fixup kind!");
+}
+
+MCObjectWriter *llvm::createRISCVELFObjectWriter(raw_pwrite_stream &OS,
+ uint8_t OSABI, bool Is64Bit) {
+ MCELFObjectTargetWriter *MOTW = new RISCVELFObjectWriter(OSABI, Is64Bit);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian*/ true);
+}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
new file mode 100644
index 000000000000..b164df8b595a
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -0,0 +1,25 @@
+//===-- RISCVMCAsmInfo.cpp - RISCV Asm properties -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the RISCVMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+using namespace llvm;
+
+void RISCVMCAsmInfo::anchor() {}
+
+RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) {
+ PointerSize = CalleeSaveStackSlotSize = TT.isArch64Bit() ? 8 : 4;
+ CommentString = "#";
+ AlignmentIsInBytes = false;
+ SupportsDebugInformation = true;
+}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
new file mode 100644
index 000000000000..901a1eba8af2
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- RISCVMCAsmInfo.h - RISCV Asm Info ----------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the RISCVMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class RISCVMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit RISCVMCAsmInfo(const Triple &TargetTriple);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
new file mode 100644
index 000000000000..b2ed13758d41
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -0,0 +1,91 @@
+//===-- RISCVMCCodeEmitter.cpp - Convert RISCV code to machine code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RISCVMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class RISCVMCCodeEmitter : public MCCodeEmitter {
+ RISCVMCCodeEmitter(const RISCVMCCodeEmitter &) = delete;
+ void operator=(const RISCVMCCodeEmitter &) = delete;
+ MCContext &Ctx;
+
+public:
+ RISCVMCCodeEmitter(MCContext &ctx) : Ctx(ctx) {}
+
+ ~RISCVMCCodeEmitter() override {}
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ /// TableGen'erated function for getting the binary encoding for an
+ /// instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Return binary encoding of operand. If the machine operand requires
+ /// relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new RISCVMCCodeEmitter(Ctx);
+}
+
+void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // For now, we only support RISC-V instructions with 32-bit length
+ uint32_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+ support::endian::Writer<support::little>(OS).write(Bits);
+ ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+unsigned
+RISCVMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ if (MO.isReg())
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+ if (MO.isImm())
+ return static_cast<unsigned>(MO.getImm());
+
+ llvm_unreachable("Unhandled expression!");
+ return 0;
+}
+
+#include "RISCVGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
new file mode 100644
index 000000000000..4fc69a7fcaba
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -0,0 +1,59 @@
+//===-- RISCVMCTargetDesc.cpp - RISCV Target Descriptions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file provides RISCV-specific target descriptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "RISCVMCTargetDesc.h"
+#include "RISCVMCAsmInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "RISCVGenInstrInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "RISCVGenRegisterInfo.inc"
+
+using namespace llvm;
+
+/// Allocate an MCInstrInfo and populate it with InitRISCVMCInstrInfo (made
+/// available by the RISCVGenInstrInfo.inc include above). The raw pointer is
+/// handed to the caller.
+static MCInstrInfo *createRISCVMCInstrInfo() {
+  MCInstrInfo *InstrInfo = new MCInstrInfo();
+  InitRISCVMCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+/// Allocate an MCRegisterInfo and populate it with InitRISCVMCRegisterInfo.
+/// \p TT is not consulted. RISCV::X1_32 (x1, ABI name "ra") is passed as the
+/// second argument -- presumably the return-address register; confirm
+/// against the generated initializer.
+static MCRegisterInfo *createRISCVMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  InitRISCVMCRegisterInfo(RegInfo, RISCV::X1_32);
+  return RegInfo;
+}
+
+/// Create the RISC-V MCAsmInfo for triple \p TT. \p MRI is not used. The
+/// object is allocated with new and returned as a raw pointer.
+static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
+                                       const Triple &TT) {
+  return new RISCVMCAsmInfo(TT);
+}
+
+/// Register the MC-layer factories (asm info, instruction info, register
+/// info, asm backend, code emitter) for both the RV32 and RV64 targets.
+extern "C" void LLVMInitializeRISCVTargetMC() {
+  Target *const RISCVTargets[] = {&getTheRISCV32Target(),
+                                  &getTheRISCV64Target()};
+
+  for (Target *TheTarget : RISCVTargets) {
+    // The asm-info factory is registered through a helper object.
+    RegisterMCAsmInfoFn RegisterAsmInfo(*TheTarget, createRISCVMCAsmInfo);
+
+    TargetRegistry::RegisterMCInstrInfo(*TheTarget, createRISCVMCInstrInfo);
+    TargetRegistry::RegisterMCRegInfo(*TheTarget, createRISCVMCRegisterInfo);
+    TargetRegistry::RegisterMCAsmBackend(*TheTarget, createRISCVAsmBackend);
+    TargetRegistry::RegisterMCCodeEmitter(*TheTarget,
+                                          createRISCVMCCodeEmitter);
+  }
+}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
new file mode 100644
index 000000000000..ddc3bf350452
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -0,0 +1,58 @@
+//===-- RISCVMCTargetDesc.h - RISCV Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides RISCV specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCTARGETDESC_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCTARGETDESC_H
+
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+// Forward declarations so this header does not need the full MC headers.
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+// Accessors for the Target objects of the 32-bit and 64-bit RISC-V targets.
+Target &getTheRISCV32Target();
+Target &getTheRISCV64Target();
+
+// Creates the RISC-V machine code emitter.
+MCCodeEmitter *createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+// Creates the RISC-V assembler backend (defined outside this header).
+MCAsmBackend *createRISCVAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+// Creates the RISC-V ELF object writer (defined outside this header).
+MCObjectWriter *createRISCVELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
+ bool Is64Bit);
+} // end namespace llvm
+
+// Defines symbolic names for RISC-V registers.
+#define GET_REGINFO_ENUM
+#include "RISCVGenRegisterInfo.inc"
+
+// Defines symbolic names for RISC-V instructions.
+#define GET_INSTRINFO_ENUM
+#include "RISCVGenInstrInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/RISCV/RISCV.td b/contrib/llvm/lib/Target/RISCV/RISCV.td
new file mode 100644
index 000000000000..14838309a1bf
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCV.td
@@ -0,0 +1,27 @@
+//===-- RISCV.td - Describe the RISCV Target Machine -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "RISCVRegisterInfo.td"
+include "RISCVInstrInfo.td"
+
+
+// Instruction-info record; used as the InstructionSet of the Target below.
+def RISCVInstrInfo : InstrInfo;
+
+// Subtarget feature "64bit": sets the HasRV64 attribute to "true".
+def Feature64Bit : SubtargetFeature<"64bit", "HasRV64", "true",
+ "Implements RV64">;
+
+// Generic RV32 processor: no scheduling model, no features.
+def : ProcessorModel<"generic-rv32", NoSchedModel, []>;
+
+// Generic RV64 processor: same as above but with Feature64Bit enabled.
+def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>;
+
+// The target record itself.
+def RISCV : Target {
+ let InstructionSet = RISCVInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
new file mode 100644
index 000000000000..1e9bc3bf9bc5
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -0,0 +1,152 @@
+//===-- RISCVInstrFormats.td - RISCV Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// These instruction format definitions are structured to match the
+// description in the RISC-V User-Level ISA specification as closely as
+// possible. For instance, the specification describes instructions with the
+// MSB (31st bit) on the left and the LSB (0th bit) on the right. This is
+// reflected in the order of parameters to each instruction class.
+//
+// One area of divergence is in the description of immediates. The
+// specification describes immediate encoding in terms of bit-slicing
+// operations on the logical value represented. The immediate argument to
+// these instruction formats instead represents the bit sequence that will be
+// inserted into the instruction. e.g. although JAL's immediate is logically
+// a 21-bit value (where the LSB is always zero), we describe it as an imm20
+// to match how it is encoded.
+//
+//===----------------------------------------------------------------------===//
+
+// Base class for every RISC-V instruction record: a 32-bit encoding field
+// (Size = 4 bytes) whose low seven bits, Inst{6-0}, hold the opcode. The
+// format classes below fill in Inst{31-7} and override Opcode.
+class RISCVInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<32> Inst;
+ let Size = 4;
+
+ // Defaults to 0; format classes replace it with 'let Opcode = opcode'.
+ bits<7> Opcode = 0;
+
+ let Inst{6-0} = Opcode;
+
+ let Namespace = "RISCV";
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+}
+
+// Pseudo instructions
+// A RISCVInst flagged with isPseudo = 1. No encoding bits are assigned
+// beyond the base-class default (Opcode = 0 in Inst{6-0}).
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : RISCVInst<outs, ins, asmstr, pattern> {
+ let isPseudo = 1;
+}
+
+// Three-register format. Bit layout, MSB first:
+//   31-25 funct7 | 24-20 rs2 | 19-15 rs1 | 14-12 funct3 | 11-7 rd | 6-0 opcode
+class FR<bits<7> funct7, bits<3> funct3, bits<7> opcode, dag outs, dag ins,
+ string asmstr, list<dag> pattern> : RISCVInst<outs, ins, asmstr, pattern>
+{
+ bits<5> rs2;
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31-25} = funct7;
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+}
+
+// Register + 12-bit immediate format. Bit layout, MSB first:
+//   31-20 imm12 | 19-15 rs1 | 14-12 funct3 | 11-7 rd | 6-0 opcode
+class FI<bits<3> funct3, bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : RISCVInst<outs, ins, asmstr, pattern>
+{
+ bits<12> imm12;
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31-20} = imm12;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+}
+
+// Shift-by-immediate format with a 5-bit shift amount. Bit layout, MSB first:
+//   31 zero | 30 arithshift | 29-25 zero | 24-20 shamt | 19-15 rs1 |
+//   14-12 funct3 | 11-7 rd | 6-0 opcode
+// The arithshift parameter is the only variable bit in Inst{31-25}.
+class FI32Shift<bit arithshift, bits<3> funct3, bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : RISCVInst<outs, ins, asmstr, pattern>
+{
+ bits<5> shamt;
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31} = 0;
+ let Inst{30} = arithshift;
+ let Inst{29-25} = 0;
+ let Inst{24-20} = shamt;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+}
+
+// Two-register format with a 12-bit immediate split across two fields.
+// Bit layout, MSB first:
+//   31-25 imm12{11-5} | 24-20 rs2 | 19-15 rs1 | 14-12 funct3 |
+//   11-7 imm12{4-0} | 6-0 opcode
+class FS<bits<3> funct3, bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : RISCVInst<outs, ins, asmstr, pattern>
+{
+ bits<12> imm12;
+ bits<5> rs2;
+ bits<5> rs1;
+
+ let Inst{31-25} = imm12{11-5};
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = imm12{4-0};
+ let Opcode = opcode;
+}
+
+// Two-register format with a scrambled 12-bit immediate. imm12 is the bit
+// sequence that is stored, not the logical value (see the file header).
+// Bit layout, MSB first:
+//   31 imm12{11} | 30-25 imm12{9-4} | 24-20 rs2 | 19-15 rs1 | 14-12 funct3 |
+//   11-8 imm12{3-0} | 7 imm12{10} | 6-0 opcode
+class FSB<bits<3> funct3, bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : RISCVInst<outs, ins, asmstr, pattern>
+{
+ bits<12> imm12;
+ bits<5> rs2;
+ bits<5> rs1;
+
+ let Inst{31} = imm12{11};
+ let Inst{30-25} = imm12{9-4};
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-8} = imm12{3-0};
+ let Inst{7} = imm12{10};
+ let Opcode = opcode;
+}
+
+// Destination register + 20-bit immediate format. Bit layout, MSB first:
+//   31-12 imm20 | 11-7 rd | 6-0 opcode
+class FU<bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : RISCVInst<outs, ins, asmstr, pattern>
+{
+ bits<20> imm20;
+ bits<5> rd;
+
+ let Inst{31-12} = imm20;
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+}
+
+// Destination register + scrambled 20-bit immediate format. imm20 is the
+// stored bit sequence (the file header uses JAL as its example).
+// Bit layout, MSB first:
+//   31 imm20{19} | 30-21 imm20{9-0} | 20 imm20{10} | 19-12 imm20{18-11} |
+//   11-7 rd | 6-0 opcode
+class FUJ<bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : RISCVInst<outs, ins, asmstr, pattern>
+{
+ bits<20> imm20;
+ bits<5> rd;
+
+ let Inst{31} = imm20{19};
+ let Inst{30-21} = imm20{9-0};
+ let Inst{20} = imm20{10};
+ let Inst{19-12} = imm20{18-11};
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td
new file mode 100644
index 000000000000..52530c2f136c
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -0,0 +1,55 @@
+//===-- RISCVInstrInfo.td - Target Description for RISCV ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the RISC-V instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "RISCVInstrFormats.td"
+
+// Operand type used for the $imm12 field of ALU_ri. Declared as a plain i32
+// operand: no encoder, parser or range check is attached here (the name
+// suggests a signed 12-bit immediate -- not enforced by this definition).
+def simm12 : Operand<i32>;
+
+// As noted in RISCVRegisterInfo.td, the hope is that support for
+// variable-sized register classes will mean that instruction definitions do
+// not need to be duplicated for 32-bit and 64-bit register classes. For now
+// we use 'GPR', which is 32-bit. When codegen for both RV32 and RV64 is
+// added, we will need to duplicate instruction definitions unless a proposal
+// like <http://lists.llvm.org/pipermail/llvm-dev/2016-September/105027.html>
+// is adopted.
+
+// Register-immediate ALU instruction: FI format with opcode 0b0010011,
+// operands (outs GPR:$rd), (ins GPR:$rs1, simm12:$imm12), assembly syntax
+// "<OpcodeStr>\t$rd, $rs1, $imm12", and no selection patterns.
+class ALU_ri<bits<3> funct3, string OpcodeStr> :
+ FI<funct3, 0b0010011, (outs GPR:$rd), (ins GPR:$rs1, simm12:$imm12),
+ OpcodeStr#"\t$rd, $rs1, $imm12", []>
+{
+}
+
+// Register-immediate ALU instructions; they differ only in funct3.
+def ADDI : ALU_ri<0b000, "addi">;
+def SLTI : ALU_ri<0b010, "slti">;
+def SLTIU : ALU_ri<0b011, "sltiu">;
+def XORI : ALU_ri<0b100, "xori">;
+def ORI : ALU_ri<0b110, "ori">;
+def ANDI : ALU_ri<0b111, "andi">;
+
+// Register-register ALU instruction: FR format with opcode 0b0110011,
+// operands (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2), assembly syntax
+// "<OpcodeStr>\t$rd, $rs1, $rs2", and no selection patterns.
+class ALU_rr<bits<7> funct7, bits<3> funct3, string OpcodeStr> :
+ FR<funct7, funct3, 0b0110011, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
+ OpcodeStr#"\t$rd, $rs1, $rs2", []>
+{
+}
+
+// Register-register ALU instructions, selected by (funct7, funct3). SUB and
+// SRA share funct3 with ADD and SRL and are distinguished by
+// funct7 = 0b0100000.
+def ADD : ALU_rr<0b0000000, 0b000, "add">;
+def SUB : ALU_rr<0b0100000, 0b000, "sub">;
+def SLL : ALU_rr<0b0000000, 0b001, "sll">;
+def SLT : ALU_rr<0b0000000, 0b010, "slt">;
+def SLTU : ALU_rr<0b0000000, 0b011, "sltu">;
+def XOR : ALU_rr<0b0000000, 0b100, "xor">;
+def SRL : ALU_rr<0b0000000, 0b101, "srl">;
+def SRA : ALU_rr<0b0100000, 0b101, "sra">;
+def OR : ALU_rr<0b0000000, 0b110, "or">;
+def AND : ALU_rr<0b0000000, 0b111, "and">;
+
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
new file mode 100644
index 000000000000..f04de217bf0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -0,0 +1,90 @@
+//===-- RISCVRegisterInfo.td - RISC-V Register defs --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the RISC-V register file
+//===----------------------------------------------------------------------===//
+
+let Namespace = "RISCV" in {
+ // Subregister index of size 32; used by RISCVReg64 to refer to its
+ // 32-bit subregister.
+ def sub_32 : SubRegIndex<32>;
+
+ // A 32-bit register named n with a 5-bit hardware encoding Enc and
+ // optional alternative names.
+ class RISCVReg32<bits<5> Enc, string n, list<string> alt = []> : Register<n> {
+ let HWEncoding{4-0} = Enc;
+ let AltNames = alt;
+ }
+
+ // RISCV64 registers don't define an AsmName or AltName. If they specified
+ // names aliasing the RISCVReg32 registers, the generation of the default
+ // MatchRegisterName/MatchRegisterAltName would fail. When necessary,
+ // RISCVAsmParser will need to convert a register number from a RISCVReg32
+ // to the equivalent RISCVReg64.
+ // The encoding is copied from subreg, which becomes the sub_32 subregister.
+ class RISCVReg64<RISCVReg32 subreg> : Register<""> {
+ let HWEncoding{4-0} = subreg.HWEncoding{4-0};
+ let SubRegs = [subreg];
+ let SubRegIndices = [sub_32];
+ }
+
+ // Alternative-name index; applied to the integer register definitions via
+ // RegAltNameIndices.
+ def ABIRegAltName : RegAltNameIndex;
+}
+
+// Integer registers
+// The 32 integer registers x0-x31. Each record gives the hardware encoding,
+// the "xN" name, one alternative name registered under ABIRegAltName, and a
+// DWARF register number equal to N.
+let RegAltNameIndices = [ABIRegAltName] in {
+ def X0_32 : RISCVReg32<0, "x0", ["zero"]>, DwarfRegNum<[0]>;
+ def X1_32 : RISCVReg32<1, "x1", ["ra"]>, DwarfRegNum<[1]>;
+ def X2_32 : RISCVReg32<2, "x2", ["sp"]>, DwarfRegNum<[2]>;
+ def X3_32 : RISCVReg32<3, "x3", ["gp"]>, DwarfRegNum<[3]>;
+ def X4_32 : RISCVReg32<4, "x4", ["tp"]>, DwarfRegNum<[4]>;
+ def X5_32 : RISCVReg32<5, "x5", ["t0"]>, DwarfRegNum<[5]>;
+ def X6_32 : RISCVReg32<6, "x6", ["t1"]>, DwarfRegNum<[6]>;
+ def X7_32 : RISCVReg32<7, "x7", ["t2"]>, DwarfRegNum<[7]>;
+ def X8_32 : RISCVReg32<8, "x8", ["s0"]>, DwarfRegNum<[8]>;
+ def X9_32 : RISCVReg32<9, "x9", ["s1"]>, DwarfRegNum<[9]>;
+ def X10_32 : RISCVReg32<10,"x10", ["a0"]>, DwarfRegNum<[10]>;
+ def X11_32 : RISCVReg32<11,"x11", ["a1"]>, DwarfRegNum<[11]>;
+ def X12_32 : RISCVReg32<12,"x12", ["a2"]>, DwarfRegNum<[12]>;
+ def X13_32 : RISCVReg32<13,"x13", ["a3"]>, DwarfRegNum<[13]>;
+ def X14_32 : RISCVReg32<14,"x14", ["a4"]>, DwarfRegNum<[14]>;
+ def X15_32 : RISCVReg32<15,"x15", ["a5"]>, DwarfRegNum<[15]>;
+ def X16_32 : RISCVReg32<16,"x16", ["a6"]>, DwarfRegNum<[16]>;
+ def X17_32 : RISCVReg32<17,"x17", ["a7"]>, DwarfRegNum<[17]>;
+ def X18_32 : RISCVReg32<18,"x18", ["s2"]>, DwarfRegNum<[18]>;
+ def X19_32 : RISCVReg32<19,"x19", ["s3"]>, DwarfRegNum<[19]>;
+ def X20_32 : RISCVReg32<20,"x20", ["s4"]>, DwarfRegNum<[20]>;
+ def X21_32 : RISCVReg32<21,"x21", ["s5"]>, DwarfRegNum<[21]>;
+ def X22_32 : RISCVReg32<22,"x22", ["s6"]>, DwarfRegNum<[22]>;
+ def X23_32 : RISCVReg32<23,"x23", ["s7"]>, DwarfRegNum<[23]>;
+ def X24_32 : RISCVReg32<24,"x24", ["s8"]>, DwarfRegNum<[24]>;
+ def X25_32 : RISCVReg32<25,"x25", ["s9"]>, DwarfRegNum<[25]>;
+ def X26_32 : RISCVReg32<26,"x26", ["s10"]>, DwarfRegNum<[26]>;
+ def X27_32 : RISCVReg32<27,"x27", ["s11"]>, DwarfRegNum<[27]>;
+ def X28_32 : RISCVReg32<28,"x28", ["t3"]>, DwarfRegNum<[28]>;
+ def X29_32 : RISCVReg32<29,"x29", ["t4"]>, DwarfRegNum<[29]>;
+ def X30_32 : RISCVReg32<30,"x30", ["t5"]>, DwarfRegNum<[30]>;
+ def X31_32 : RISCVReg32<31,"x31", ["t6"]>, DwarfRegNum<[31]>;
+}
+
+// Define X0_64..X31_64. Each wraps the like-numbered X<Index>_32 register
+// (looked up by name with !cast) and reuses Index as its DWARF number.
+foreach Index = 0-31 in {
+ def X#Index#_64 : RISCVReg64<!cast<RISCVReg32>("X"#Index#"_32")>, DwarfRegNum<[Index]>;
+}
+
+// We currently define separate register classes for the 32-bit and 64-bit
+// GPRs. Once variable-sized register classes
+// <http://lists.llvm.org/pipermail/llvm-dev/2016-September/105027.html> or
+// similar are implemented, we can just use one 'GPR' class for most
+// instruction definitions.
+
+// TODO: once codegen is implemented, registers should be listed in an order
+// reflecting the preferred register allocation sequence.
+// 32-bit general-purpose register class: value type i32, alignment 32,
+// members X0_32..X31_32 in numeric order.
+def GPR : RegisterClass<"RISCV", [i32], 32, (add
+ (sequence "X%u_32", 0, 31)
+)>;
+
+// 64-bit general-purpose register class: value type i64, alignment 64,
+// members X0_64..X31_64 in numeric order.
+def GPR64 : RegisterClass<"RISCV", [i64], 64, (add
+ (sequence "X%u_64", 0, 31)
+)>;
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
new file mode 100644
index 000000000000..afbbe004186e
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -0,0 +1,58 @@
+//===-- RISCVTargetMachine.cpp - Define TargetMachine for RISCV -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about RISCV target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+/// Register RISCVTargetMachine as the target machine of both RISC-V targets
+/// (RV32 first, then RV64).
+extern "C" void LLVMInitializeRISCVTarget() {
+  RegisterTargetMachine<RISCVTargetMachine> RegisterRV32(
+      getTheRISCV32Target());
+  RegisterTargetMachine<RISCVTargetMachine> RegisterRV64(
+      getTheRISCV64Target());
+}
+
+/// Return the data layout string for triple \p TT.
+///
+/// Both layouts are little-endian ELF with 64-bit-aligned i64 and a 128-bit
+/// stack alignment. The RV32 string names 32-bit pointers explicitly
+/// ("p:32:32"): a layout string without a pointer specification falls back
+/// to the DataLayout default of 64-bit pointers, which is wrong for a 32-bit
+/// target. The RV64 string relies on that default.
+static std::string computeDataLayout(const Triple &TT) {
+  if (TT.isArch64Bit())
+    return "e-m:e-i64:64-n32:64-S128";
+
+  assert(TT.isArch32Bit() && "only RV32 and RV64 are currently supported");
+  return "e-m:e-p:32:32-i64:64-n32-S128";
+}
+
+/// Resolve the relocation model: the requested model when \p RM holds a
+/// value, Reloc::Static otherwise. \p TT is not consulted.
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  return RM.hasValue() ? *RM : Reloc::Static;
+}
+
+/// Build the target machine. The data layout string comes from
+/// computeDataLayout(TT), an unset relocation model is resolved through
+/// getEffectiveRelocModel, and TLOF is set to a TargetLoweringObjectFileELF.
+RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+ getEffectiveRelocModel(TT, RM), CM, OL),
+ TLOF(make_unique<TargetLoweringObjectFileELF>()) {}
+
+/// Return a newly allocated, unspecialized TargetPassConfig for this target
+/// machine and \p PM; no RISC-V specific pass configuration is added here.
+TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new TargetPassConfig(this, PM);
+}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.h
new file mode 100644
index 000000000000..d13e574c9bf8
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -0,0 +1,40 @@
+//===-- RISCVTargetMachine.h - Define TargetMachine for RISCV ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the RISCV specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETMACHINE_H
+#define LLVM_LIB_TARGET_RISCV_RISCVTARGETMACHINE_H
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+/// LLVMTargetMachine subclass for RISC-V. It owns the object-file lowering
+/// and exposes it through getObjFileLowering().
+class RISCVTargetMachine : public LLVMTargetMachine {
+ // Owned object-file lowering; set up by the constructor.
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+
+public:
+ RISCVTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ // Non-owning access to TLOF; the pointer stays owned by this object.
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp b/contrib/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
new file mode 100644
index 000000000000..34932c259156
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
@@ -0,0 +1,30 @@
+//===-- RISCVTargetInfo.cpp - RISCV Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+namespace llvm {
+/// Return the Target object for 32-bit RISC-V, a function-local static that
+/// is created on first call.
+Target &getTheRISCV32Target() {
+ static Target TheRISCV32Target;
+ return TheRISCV32Target;
+}
+
+/// Return the Target object for 64-bit RISC-V, a function-local static that
+/// is created on first call.
+Target &getTheRISCV64Target() {
+ static Target TheRISCV64Target;
+ return TheRISCV64Target;
+}
+} // end namespace llvm
+
+/// Register the "riscv32" and "riscv64" targets under their Triple
+/// architectures, together with their descriptions.
+extern "C" void LLVMInitializeRISCVTargetInfo() {
+  RegisterTarget<Triple::riscv32> RegisterRV32(getTheRISCV32Target(),
+                                               "riscv32", "32-bit RISC-V");
+  RegisterTarget<Triple::riscv64> RegisterRV64(getTheRISCV64Target(),
+                                               "riscv64", "64-bit RISC-V");
+}
diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
new file mode 100644
index 000000000000..e775aa607b53
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -0,0 +1,1300 @@
+//===-- SparcAsmParser.cpp - Parse Sparc assembly to MCInst instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// The generated AsmMatcher (SparcGenAsmMatcher) uses "Sparc" as the target
+// namespace, but the SPARC backend uses "SP" as its namespace.
+namespace llvm {
+ namespace Sparc {
+ // Make every SP:: name reachable as Sparc:: as well.
+ using namespace SP;
+ }
+}
+
+namespace {
+class SparcOperand;
+/// Target assembly parser for SPARC. Declares the MCTargetAsmParser overrides
+/// and the Sparc-specific operand parsing helpers; only is64Bit() and the
+/// constructor are defined inline in the class body.
+class SparcAsmParser : public MCTargetAsmParser {
+
+ // The generic assembly parser passed to the constructor.
+ MCAsmParser &Parser;
+
+ /// @name Auto-generated Match Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "SparcGenAsmMatcher.inc"
+
+ /// }
+
+ // public interface of the MCTargetAsmParser.
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+ bool ParseDirective(AsmToken DirectiveID) override;
+
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
+
+ // Custom parse functions for Sparc specific operands.
+ OperandMatchResultTy parseMEMOperand(OperandVector &Operands);
+
+ OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name);
+
+ OperandMatchResultTy
+ parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Operand,
+ bool isCall = false);
+
+ OperandMatchResultTy parseBranchModifiers(OperandVector &Operands);
+
+ // Helper function for dealing with %lo / %hi in PIC mode.
+ const SparcMCExpr *adjustPICRelocation(SparcMCExpr::VariantKind VK,
+ const MCExpr *subExpr);
+
+ // Returns true if Tok names a register; the register is returned in RegNo
+ // (and, presumably, its register kind in RegKind).
+ bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo,
+ unsigned &RegKind);
+
+ bool matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc);
+ bool parseDirectiveWord(unsigned Size, SMLoc L);
+
+ // True when the subtarget's triple architecture is Triple::sparcv9.
+ bool is64Bit() const {
+ return getSTI().getTargetTriple().getArch() == Triple::sparcv9;
+ }
+
+ bool expandSET(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+
+public:
+ // \p MII is accepted but not used by this constructor.
+ SparcAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
+ const MCInstrInfo &MII,
+ const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, sti), Parser(parser) {
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ }
+
+};
+
+ // Integer registers in bank order: G0-G7, O0-O7, L0-L7, I0-I7.
+ static const MCPhysReg IntRegs[32] = {
+ Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3,
+ Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7,
+ Sparc::O0, Sparc::O1, Sparc::O2, Sparc::O3,
+ Sparc::O4, Sparc::O5, Sparc::O6, Sparc::O7,
+ Sparc::L0, Sparc::L1, Sparc::L2, Sparc::L3,
+ Sparc::L4, Sparc::L5, Sparc::L6, Sparc::L7,
+ Sparc::I0, Sparc::I1, Sparc::I2, Sparc::I3,
+ Sparc::I4, Sparc::I5, Sparc::I6, Sparc::I7 };
+
+ // Single-precision floating-point registers F0-F31.
+ static const MCPhysReg FloatRegs[32] = {
+ Sparc::F0, Sparc::F1, Sparc::F2, Sparc::F3,
+ Sparc::F4, Sparc::F5, Sparc::F6, Sparc::F7,
+ Sparc::F8, Sparc::F9, Sparc::F10, Sparc::F11,
+ Sparc::F12, Sparc::F13, Sparc::F14, Sparc::F15,
+ Sparc::F16, Sparc::F17, Sparc::F18, Sparc::F19,
+ Sparc::F20, Sparc::F21, Sparc::F22, Sparc::F23,
+ Sparc::F24, Sparc::F25, Sparc::F26, Sparc::F27,
+ Sparc::F28, Sparc::F29, Sparc::F30, Sparc::F31 };
+
+ // Double-precision registers D0-D31.
+ static const MCPhysReg DoubleRegs[32] = {
+ Sparc::D0, Sparc::D1, Sparc::D2, Sparc::D3,
+ Sparc::D4, Sparc::D5, Sparc::D6, Sparc::D7,
+ Sparc::D8, Sparc::D9, Sparc::D10, Sparc::D11,
+ Sparc::D12, Sparc::D13, Sparc::D14, Sparc::D15,
+ Sparc::D16, Sparc::D17, Sparc::D18, Sparc::D19,
+ Sparc::D20, Sparc::D21, Sparc::D22, Sparc::D23,
+ Sparc::D24, Sparc::D25, Sparc::D26, Sparc::D27,
+ Sparc::D28, Sparc::D29, Sparc::D30, Sparc::D31 };
+
+ // Quad-precision registers Q0-Q15. NOTE: the array is declared with 32
+ // elements but only 16 are listed; elements 16-31 are zero-initialized.
+ static const MCPhysReg QuadFPRegs[32] = {
+ Sparc::Q0, Sparc::Q1, Sparc::Q2, Sparc::Q3,
+ Sparc::Q4, Sparc::Q5, Sparc::Q6, Sparc::Q7,
+ Sparc::Q8, Sparc::Q9, Sparc::Q10, Sparc::Q11,
+ Sparc::Q12, Sparc::Q13, Sparc::Q14, Sparc::Q15 };
+
+ // Y followed by ASR1-ASR31. Spelled with SP:: directly; Sparc:: would name
+ // the same entities because namespace Sparc imports SP.
+ static const MCPhysReg ASRRegs[32] = {
+ SP::Y, SP::ASR1, SP::ASR2, SP::ASR3,
+ SP::ASR4, SP::ASR5, SP::ASR6, SP::ASR7,
+ SP::ASR8, SP::ASR9, SP::ASR10, SP::ASR11,
+ SP::ASR12, SP::ASR13, SP::ASR14, SP::ASR15,
+ SP::ASR16, SP::ASR17, SP::ASR18, SP::ASR19,
+ SP::ASR20, SP::ASR21, SP::ASR22, SP::ASR23,
+ SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27,
+ SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31};
+
+ // Even/odd integer register pairs, in the same bank order as IntRegs;
+ // entry i pairs IntRegs[2*i] with IntRegs[2*i+1].
+ static const MCPhysReg IntPairRegs[] = {
+ Sparc::G0_G1, Sparc::G2_G3, Sparc::G4_G5, Sparc::G6_G7,
+ Sparc::O0_O1, Sparc::O2_O3, Sparc::O4_O5, Sparc::O6_O7,
+ Sparc::L0_L1, Sparc::L2_L3, Sparc::L4_L5, Sparc::L6_L7,
+ Sparc::I0_I1, Sparc::I2_I3, Sparc::I4_I5, Sparc::I6_I7};
+
+ // Coprocessor registers C0-C31.
+ static const MCPhysReg CoprocRegs[32] = {
+ Sparc::C0, Sparc::C1, Sparc::C2, Sparc::C3,
+ Sparc::C4, Sparc::C5, Sparc::C6, Sparc::C7,
+ Sparc::C8, Sparc::C9, Sparc::C10, Sparc::C11,
+ Sparc::C12, Sparc::C13, Sparc::C14, Sparc::C15,
+ Sparc::C16, Sparc::C17, Sparc::C18, Sparc::C19,
+ Sparc::C20, Sparc::C21, Sparc::C22, Sparc::C23,
+ Sparc::C24, Sparc::C25, Sparc::C26, Sparc::C27,
+ Sparc::C28, Sparc::C29, Sparc::C30, Sparc::C31 };
+
+ // Even/odd coprocessor register pairs; entry i pairs CoprocRegs[2*i] with
+ // CoprocRegs[2*i+1].
+ static const MCPhysReg CoprocPairRegs[] = {
+ Sparc::C0_C1, Sparc::C2_C3, Sparc::C4_C5, Sparc::C6_C7,
+ Sparc::C8_C9, Sparc::C10_C11, Sparc::C12_C13, Sparc::C14_C15,
+ Sparc::C16_C17, Sparc::C18_C19, Sparc::C20_C21, Sparc::C22_C23,
+ Sparc::C24_C25, Sparc::C26_C27, Sparc::C28_C29, Sparc::C30_C31};
+
+/// SparcOperand - Instances of this class represent a single parsed operand
+/// of a Sparc machine instruction.
+class SparcOperand : public MCParsedAsmOperand {
+public:
+ // Classification of a register operand. The Morph* helpers below change
+ // an operand from one kind to another (e.g. rk_IntReg -> rk_IntPairReg).
+ enum RegisterKind {
+ rk_None,
+ rk_IntReg,
+ rk_IntPairReg,
+ rk_FloatReg,
+ rk_DoubleReg,
+ rk_QuadReg,
+ rk_CoprocReg,
+ rk_CoprocPairReg,
+ rk_Special,
+ };
+
+private:
+ // Discriminator for the payload union below: Token -> Tok, Register -> Reg,
+ // Immediate -> Imm, MemoryReg / MemoryImm -> Mem.
+ enum KindTy {
+ k_Token,
+ k_Register,
+ k_Immediate,
+ k_MemoryReg,
+ k_MemoryImm
+ } Kind;
+
+ // Source range of the operand; see getStartLoc() / getEndLoc().
+ SMLoc StartLoc, EndLoc;
+
+ // Token text as pointer + length; the characters are not copied.
+ struct Token {
+ const char *Data;
+ unsigned Length;
+ };
+
+ // Register number together with its RegisterKind.
+ struct RegOp {
+ unsigned RegNum;
+ RegisterKind Kind;
+ };
+
+ // Immediate expression; addExpr() treats a null pointer as the value 0.
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ // Memory operand: base register plus either an offset register
+ // (k_MemoryReg) or an offset expression (k_MemoryImm).
+ struct MemOp {
+ unsigned Base;
+ unsigned OffsetReg;
+ const MCExpr *Off;
+ };
+
+ // Payload; which member is meaningful is determined by Kind.
+ union {
+ struct Token Tok;
+ struct RegOp Reg;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ };
+public:
+ // Sets only the kind; the payload and locations are filled in by the
+ // Create* factory functions.
+ SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+ // Kind predicates. isMem() is true for both memory forms.
+ bool isToken() const override { return Kind == k_Token; }
+ bool isReg() const override { return Kind == k_Register; }
+ bool isImm() const override { return Kind == k_Immediate; }
+ bool isMem() const override { return isMEMrr() || isMEMri(); }
+ bool isMEMrr() const { return Kind == k_MemoryReg; }
+ bool isMEMri() const { return Kind == k_MemoryImm; }
+
+  // Register-kind predicates: each is true only for a register operand whose
+  // RegisterKind matches.
+  bool isIntReg() const {
+    if (Kind != k_Register)
+      return false;
+    return Reg.Kind == rk_IntReg;
+  }
+
+  bool isFloatReg() const {
+    if (Kind != k_Register)
+      return false;
+    return Reg.Kind == rk_FloatReg;
+  }
+
+  bool isFloatOrDoubleReg() const {
+    if (Kind != k_Register)
+      return false;
+    return Reg.Kind == rk_FloatReg || Reg.Kind == rk_DoubleReg;
+  }
+
+  bool isCoprocReg() const {
+    if (Kind != k_Register)
+      return false;
+    return Reg.Kind == rk_CoprocReg;
+  }
+
+ // Payload accessors. Each asserts that the operand has the matching kind
+ // before reading the corresponding union member.
+ StringRef getToken() const {
+ assert(Kind == k_Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getReg() const override {
+ assert((Kind == k_Register) && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert((Kind == k_Immediate) && "Invalid access!");
+ return Imm.Val;
+ }
+
+ // Valid for both memory forms (register offset and immediate offset).
+ unsigned getMemBase() const {
+ assert((Kind == k_MemoryReg || Kind == k_MemoryImm) && "Invalid access!");
+ return Mem.Base;
+ }
+
+ unsigned getMemOffsetReg() const {
+ assert((Kind == k_MemoryReg) && "Invalid access!");
+ return Mem.OffsetReg;
+ }
+
+ const MCExpr *getMemOff() const {
+ assert((Kind == k_MemoryImm) && "Invalid access!");
+ return Mem.Off;
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const override {
+ return StartLoc;
+ }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const override {
+ return EndLoc;
+ }
+
+  /// Write a one-line, human-readable description of this operand to \p OS.
+  void print(raw_ostream &OS) const override {
+    switch (Kind) {
+    case k_Token:
+      OS << "Token: " << getToken() << "\n";
+      break;
+    case k_Register:
+      OS << "Reg: #" << getReg() << "\n";
+      break;
+    case k_Immediate:
+      // Print the expression itself, as the k_MemoryImm case does; streaming
+      // the bare pointer would only show an address. A null expression
+      // stands for 0 (see addExpr), so print it as such instead of
+      // dereferencing it.
+      OS << "Imm: ";
+      if (const MCExpr *Val = getImm())
+        OS << *Val;
+      else
+        OS << "0";
+      OS << "\n";
+      break;
+    case k_MemoryReg:
+      OS << "Mem: " << getMemBase() << "+" << getMemOffsetReg() << "\n";
+      break;
+    case k_MemoryImm:
+      assert(getMemOff() != nullptr);
+      OS << "Mem: " << getMemBase() << "+" << *getMemOff() << "\n";
+      break;
+    }
+  }
+
+ // Append this operand's register to Inst as a single register operand.
+ // N is only checked by the assertion.
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+
+  // Append this operand's immediate to Inst via addExpr. N is only checked
+  // by the assertion.
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  // Append Expr to Inst, preferring a plain immediate operand: a null
+  // expression becomes 0, a constant expression becomes its value, and
+  // anything else is added as an expression operand.
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    if (!Expr) {
+      Inst.addOperand(MCOperand::createImm(0));
+      return;
+    }
+
+    if (const MCConstantExpr *Constant = dyn_cast<MCConstantExpr>(Expr)) {
+      Inst.addOperand(MCOperand::createImm(Constant->getValue()));
+      return;
+    }
+
+    Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  // Append a register+register memory operand to Inst: the base register
+  // first, then the offset register (asserted to be non-zero).
+  void addMEMrrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    const MCOperand BaseOp = MCOperand::createReg(getMemBase());
+    Inst.addOperand(BaseOp);
+
+    assert(getMemOffsetReg() != 0 && "Invalid offset");
+    const MCOperand OffsetOp = MCOperand::createReg(getMemOffsetReg());
+    Inst.addOperand(OffsetOp);
+  }
+
+  // Append a register+immediate memory operand to Inst: the base register
+  // first, then the offset expression via addExpr.
+  void addMEMriOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(getMemBase()));
+    addExpr(Inst, getMemOff());
+  }
+
+  // Build a token operand located at S (start and end are both S). Only the
+  // pointer and length of Str are stored, so the characters it refers to
+  // must outlive the operand.
+  static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) {
+    std::unique_ptr<SparcOperand> TokOp = make_unique<SparcOperand>(k_Token);
+    TokOp->StartLoc = S;
+    TokOp->EndLoc = S;
+    TokOp->Tok.Data = Str.data();
+    TokOp->Tok.Length = Str.size();
+    return TokOp;
+  }
+
+  // Build a register operand spanning [S, E]. Kind is an unsigned value
+  // that is converted to RegisterKind.
+  static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind,
+                                                 SMLoc S, SMLoc E) {
+    std::unique_ptr<SparcOperand> RegOperand =
+        make_unique<SparcOperand>(k_Register);
+    RegOperand->StartLoc = S;
+    RegOperand->EndLoc = E;
+    RegOperand->Reg.RegNum = RegNum;
+    RegOperand->Reg.Kind = static_cast<SparcOperand::RegisterKind>(Kind);
+    return RegOperand;
+  }
+
+ /// Build an immediate operand holding expression \p Val and spanning
+ /// [\p S, \p E].
+ static std::unique_ptr<SparcOperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                                SMLoc E) {
+   std::unique_ptr<SparcOperand> ImmOp = make_unique<SparcOperand>(k_Immediate);
+   ImmOp->StartLoc = S;
+   ImmOp->EndLoc = E;
+   ImmOp->Imm.Val = Val;
+   return ImmOp;
+ }
+
+ /// Try to turn the integer register operand \p Op into the corresponding
+ /// integer register pair. Only even-numbered registers (counting %g as
+ /// 0-7, %o as 8-15, %l as 16-23 and %i as 24-31) can be morphed. Returns
+ /// true and rewrites \p Op on success; returns false and leaves it
+ /// untouched otherwise.
+ static bool MorphToIntPairReg(SparcOperand &Op) {
+   const unsigned Reg = Op.getReg();
+   assert(Op.Reg.Kind == rk_IntReg);
+
+   // 32 is out of range and marks "not one of the 32 integer registers".
+   unsigned Index = 32;
+   if (Reg >= Sparc::G0 && Reg <= Sparc::G7)
+     Index = Reg - Sparc::G0;
+   else if (Reg >= Sparc::O0 && Reg <= Sparc::O7)
+     Index = 8 + (Reg - Sparc::O0);
+   else if (Reg >= Sparc::L0 && Reg <= Sparc::L7)
+     Index = 16 + (Reg - Sparc::L0);
+   else if (Reg >= Sparc::I0 && Reg <= Sparc::I7)
+     Index = 24 + (Reg - Sparc::I0);
+
+   const bool IsOdd = (Index % 2) != 0;
+   if (IsOdd || Index > 31)
+     return false;
+
+   Op.Reg.Kind = rk_IntPairReg;
+   Op.Reg.RegNum = IntPairRegs[Index / 2];
+   return true;
+ }
+
+ /// Try to turn the single-precision register operand \p Op into the
+ /// double-precision register that starts at the same index. Succeeds only
+ /// for an even index no greater than 31 relative to Sparc::F0. Returns
+ /// true and rewrites \p Op on success, false otherwise.
+ static bool MorphToDoubleReg(SparcOperand &Op) {
+   const unsigned Reg = Op.getReg();
+   assert(Op.Reg.Kind == rk_FloatReg);
+
+   const unsigned Index = Reg - Sparc::F0;
+   const bool IsOdd = (Index % 2) != 0;
+   if (IsOdd || Index > 31)
+     return false;
+
+   Op.Reg.Kind = rk_DoubleReg;
+   Op.Reg.RegNum = DoubleRegs[Index / 2];
+   return true;
+ }
+
+ /// Try to turn the float or double register operand \p Op into a
+ /// quad-precision register. A float register index (relative to
+ /// Sparc::F0) must be a multiple of 4; a double register index (relative
+ /// to Sparc::D0) must be a multiple of 2; either must be at most 31.
+ /// Returns true and rewrites \p Op on success, false otherwise. Any other
+ /// register kind is a programming error.
+ static bool MorphToQuadReg(SparcOperand &Op) {
+   const unsigned Reg = Op.getReg();
+
+   // Index of the register within its own file, and how many registers of
+   // that file make up one quad register.
+   unsigned Index = 0;
+   unsigned Stride = 0;
+   switch (Op.Reg.Kind) {
+   case rk_FloatReg:
+     Index = Reg - Sparc::F0;
+     Stride = 4;
+     break;
+   case rk_DoubleReg:
+     Index = Reg - Sparc::D0;
+     Stride = 2;
+     break;
+   default:
+     llvm_unreachable("Unexpected register kind!");
+   }
+
+   if (Index % Stride || Index > 31)
+     return false;
+
+   Op.Reg.RegNum = QuadFPRegs[Index / Stride];
+   Op.Reg.Kind = rk_QuadReg;
+   return true;
+ }
+
+ /// Try to turn the coprocessor register operand \p Op into the matching
+ /// coprocessor register pair. Only even-numbered registers in
+ /// Sparc::C0..Sparc::C31 qualify. Returns true and rewrites \p Op on
+ /// success, false otherwise.
+ static bool MorphToCoprocPairReg(SparcOperand &Op) {
+   const unsigned Reg = Op.getReg();
+   assert(Op.Reg.Kind == rk_CoprocReg);
+
+   // 32 is out of range and marks "not a coprocessor register".
+   const bool InRange = Reg >= Sparc::C0 && Reg <= Sparc::C31;
+   const unsigned Index = InRange ? Reg - Sparc::C0 : 32;
+
+   const bool IsOdd = (Index % 2) != 0;
+   if (IsOdd || Index > 31)
+     return false;
+
+   Op.Reg.Kind = rk_CoprocPairReg;
+   Op.Reg.RegNum = CoprocPairRegs[Index / 2];
+   return true;
+ }
+
+ /// Convert the register operand \p Op in place into a register+register
+ /// memory operand whose base is \p Base and whose offset register is the
+ /// register \p Op used to hold. Ownership of the operand is passed back to
+ /// the caller.
+ static std::unique_ptr<SparcOperand>
+ MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) {
+   // Read the register before the operand is re-tagged and its memory
+   // fields are written.
+   const unsigned OffsetRegister = Op->getReg();
+
+   Op->Kind = k_MemoryReg;
+   Op->Mem.Base = Base;
+   Op->Mem.OffsetReg = OffsetRegister;
+   Op->Mem.Off = nullptr;
+   return Op;
+ }
+
+ /// Build a memory operand addressing [\p Base] with no explicit offset,
+ /// spanning [\p S, \p E]. It is represented as a register+register operand
+ /// whose offset register is Sparc::G0.
+ static std::unique_ptr<SparcOperand>
+ CreateMEMr(unsigned Base, SMLoc S, SMLoc E) {
+   std::unique_ptr<SparcOperand> MemOp = make_unique<SparcOperand>(k_MemoryReg);
+   MemOp->StartLoc = S;
+   MemOp->EndLoc = E;
+   MemOp->Mem.Base = Base;
+   MemOp->Mem.OffsetReg = Sparc::G0; // always 0
+   MemOp->Mem.Off = nullptr;
+   return MemOp;
+ }
+
+ /// Convert the immediate operand \p Op in place into a register+immediate
+ /// memory operand whose base is \p Base and whose offset is the expression
+ /// \p Op used to hold. Ownership of the operand is passed back to the
+ /// caller.
+ static std::unique_ptr<SparcOperand>
+ MorphToMEMri(unsigned Base, std::unique_ptr<SparcOperand> Op) {
+   // Read the expression before the operand is re-tagged and its memory
+   // fields are written.
+   const MCExpr *Offset = Op->getImm();
+
+   Op->Kind = k_MemoryImm;
+   Op->Mem.Base = Base;
+   Op->Mem.OffsetReg = 0;
+   Op->Mem.Off = Offset;
+   return Op;
+ }
+};
+
+} // end namespace
+
+// Expand the synthetic 'set' instruction in Inst (operand 0: destination
+// register, operand 1: immediate or expression) into a 'sethi' and/or an 'or'
+// appended to Instructions, each carrying IDLoc as its location.
+// An immediate outside [-2147483648, 4294967295] is diagnosed at IDLoc and the
+// result of Error() is returned; otherwise the function returns false.
+bool SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ MCOperand MCRegOp = Inst.getOperand(0);
+ MCOperand MCValOp = Inst.getOperand(1);
+ assert(MCRegOp.isReg());
+ assert(MCValOp.isImm() || MCValOp.isExpr());
+
+ // the imm operand can be either an expression or an immediate.
+ bool IsImm = Inst.getOperand(1).isImm();
+ // For a non-immediate operand the raw value is 0, so the range check below
+ // never rejects an expression.
+ int64_t RawImmValue = IsImm ? MCValOp.getImm() : 0;
+
+ // Allow either a signed or unsigned 32-bit immediate.
+ if (RawImmValue < -2147483648LL || RawImmValue > 4294967295LL) {
+ return Error(IDLoc,
+ "set: argument must be between -2147483648 and 4294967295");
+ }
+
+ // If the value was expressed as a large unsigned number, that's ok.
+ // We want to see if it "looks like" a small signed number.
+ int32_t ImmValue = RawImmValue;
+ // For 'set' you can't use 'or' with a negative operand on V9 because
+ // that would splat the sign bit across the upper half of the destination
+ // register, whereas 'set' is defined to zero the high 32 bits.
+ bool IsEffectivelyImm13 =
+ IsImm && ((is64Bit() ? 0 : -4096) <= ImmValue && ImmValue < 4096);
+ const MCExpr *ValExpr;
+ if (IsImm)
+ ValExpr = MCConstantExpr::create(ImmValue, getContext());
+ else
+ ValExpr = MCValOp.getExpr();
+
+ // Source register of the 'or': %g0 unless a 'sethi' is emitted first, in
+ // which case it becomes the destination register.
+ MCOperand PrevReg = MCOperand::createReg(Sparc::G0);
+
+ // If not just a signed imm13 value, then either we use a 'sethi' with a
+ // following 'or', or a 'sethi' by itself if there are no more 1 bits.
+ // In either case, start with the 'sethi'.
+ if (!IsEffectivelyImm13) {
+ MCInst TmpInst;
+ const MCExpr *Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_HI, ValExpr);
+ TmpInst.setLoc(IDLoc);
+ TmpInst.setOpcode(SP::SETHIi);
+ TmpInst.addOperand(MCRegOp);
+ TmpInst.addOperand(MCOperand::createExpr(Expr));
+ Instructions.push_back(TmpInst);
+ PrevReg = MCRegOp;
+ }
+
+ // The low bits require touching in 3 cases:
+ // * A non-immediate value will always require both instructions.
+ // * An effectively imm13 value needs only an 'or' instruction.
+ // * Otherwise, an immediate that is not effectively imm13 requires the
+ // 'or' only if bits remain after clearing the 22 bits that 'sethi' set.
+ // If the low bits are known zeros, there's nothing to do.
+ // In the second case, and only in that case, must we NOT clear
+ // bits of the immediate value via the %lo() assembler function.
+ // Note also, the 'or' instruction doesn't mind a large value in the case
+ // where the operand to 'set' was 0xFFFFFzzz - it does exactly what you mean.
+ if (!IsImm || IsEffectivelyImm13 || (ImmValue & 0x3ff)) {
+ MCInst TmpInst;
+ const MCExpr *Expr;
+ if (IsEffectivelyImm13)
+ Expr = ValExpr;
+ else
+ Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_LO, ValExpr);
+ TmpInst.setLoc(IDLoc);
+ TmpInst.setOpcode(SP::ORri);
+ TmpInst.addOperand(MCRegOp);
+ TmpInst.addOperand(PrevReg);
+ TmpInst.addOperand(MCOperand::createExpr(Expr));
+ Instructions.push_back(TmpInst);
+ }
+ return false;
+}
+
+// Run the generated matcher over Operands and, on a successful match, emit
+// the resulting instruction(s) to Out and return false. A matched SP::SET is
+// first expanded by expandSET(); every other opcode is emitted as-is with its
+// location set to IDLoc. For each failing match result a diagnostic is issued
+// through Error() and its result is returned.
+bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+ SmallVector<MCInst, 8> Instructions;
+ unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm);
+ switch (MatchResult) {
+ case Match_Success: {
+ switch (Inst.getOpcode()) {
+ default:
+ Inst.setLoc(IDLoc);
+ Instructions.push_back(Inst);
+ break;
+ case SP::SET:
+ // expandSET() has already reported the problem when it returns true.
+ if (expandSET(Inst, IDLoc, Instructions))
+ return true;
+ break;
+ }
+
+ for (const MCInst &I : Instructions) {
+ Out.EmitInstruction(I, getSTI());
+ }
+ return false;
+ }
+
+ case Match_MissingFeature:
+ return Error(IDLoc,
+ "instruction requires a CPU feature not currently enabled");
+
+ case Match_InvalidOperand: {
+ // Point the diagnostic at the offending operand when ErrorInfo names one;
+ // otherwise (or if that operand has no location) fall back to IDLoc.
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((SparcOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ case Match_MnemonicFail:
+ return Error(IDLoc, "invalid instruction mnemonic");
+ }
+ llvm_unreachable("Implement any new match types added!");
+}
+
+// Parse a register written as '%' followed by a register name.
+// StartLoc and EndLoc are taken from the token that is current on entry and
+// RegNo is reset to 0. If that token is not '%', nothing is consumed and the
+// function returns false with RegNo still 0. Otherwise the '%' is consumed;
+// a recognised name is consumed too and false is returned, while an
+// unrecognised name produces an "invalid register name" diagnostic.
+bool SparcAsmParser::
+ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc)
+{
+ // NOTE(review): Tok is a reference obtained before Lex() yet is passed to
+ // matchRegisterName() after it; this presumably relies on getTok() returning
+ // a reference to the parser's current token -- confirm.
+ const AsmToken &Tok = Parser.getTok();
+ StartLoc = Tok.getLoc();
+ EndLoc = Tok.getEndLoc();
+ RegNo = 0;
+ if (getLexer().getKind() != AsmToken::Percent)
+ return false;
+ Parser.Lex();
+ unsigned regKind = SparcOperand::rk_None;
+ if (matchRegisterName(Tok, RegNo, regKind)) {
+ Parser.Lex();
+ return false;
+ }
+
+ return Error(StartLoc, "invalid register name");
+}
+
+static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features,
+ unsigned VariantID);
+
+// Parse one instruction statement whose mnemonic Name (at NameLoc) has already
+// been read. The mnemonic token and every parsed operand are appended to
+// Operands. Returns false once the end-of-statement token has been consumed;
+// any operand that fails to parse, or any trailing token, is reported as
+// "unexpected token" and the result of Error() is returned.
+bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
+
+ // First operand in MCInst is instruction mnemonic.
+ // The token is created from the spelling as written, before aliases are
+ // applied to Name below.
+ Operands.push_back(SparcOperand::CreateToken(Name, NameLoc));
+
+ // apply mnemonic aliases, if any, so that we can parse operands correctly.
+ applyMnemonicAliases(Name, getAvailableFeatures(), 0);
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ // A comma directly after the mnemonic introduces branch modifiers.
+ if (getLexer().is(AsmToken::Comma)) {
+ if (parseBranchModifiers(Operands) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token");
+ }
+ }
+ if (parseOperand(Operands, Name) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token");
+ }
+
+ while (getLexer().is(AsmToken::Comma) || getLexer().is(AsmToken::Plus)) {
+ if (getLexer().is(AsmToken::Plus)) {
+ // Plus tokens are significant in software_traps (p83, sparcv8.pdf). We must capture them.
+ Operands.push_back(SparcOperand::CreateToken("+", Parser.getTok().getLoc()));
+ }
+ Parser.Lex(); // Eat the comma or plus.
+ // Parse and remember the operand.
+ if (parseOperand(Operands, Name) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token");
+ }
+ }
+ }
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token");
+ }
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+// Handle the SPARC-specific assembler directives. Returns true when the
+// directive is not one of ours so that the generic MC layer can deal with it;
+// otherwise returns the result of handling it.
+bool SparcAsmParser::
+ParseDirective(AsmToken DirectiveID)
+{
+  const StringRef Directive = DirectiveID.getString();
+
+  // Data directives: work out the element width in bytes; 0 means "not a
+  // data directive". ".xword" is only recognised in 64-bit mode.
+  unsigned Width = 0;
+  if (Directive == ".byte")
+    Width = 1;
+  else if (Directive == ".half")
+    Width = 2;
+  else if (Directive == ".word")
+    Width = 4;
+  else if (Directive == ".nword")
+    Width = is64Bit() ? 8 : 4;
+  else if (is64Bit() && Directive == ".xword")
+    Width = 8;
+
+  if (Width != 0)
+    return parseDirectiveWord(Width, DirectiveID.getLoc());
+
+  // ".register" is ignored for now; ".proc" is accepted for compatibility
+  // only (it is supposed to be an "optimization" in the Sun assembler).
+  if (Directive == ".register" || Directive == ".proc") {
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  // Let the MC layer handle all other directives.
+  return true;
+}
+
+// Parse the comma-separated expression list of a data directive and emit each
+// value with a width of Size bytes. L is the location used for diagnostics.
+// Returns true if an expression fails to parse, the result of Error() if the
+// list is malformed, and false once the end of statement has been consumed.
+bool SparcAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+  bool AtEnd = getLexer().is(AsmToken::EndOfStatement);
+  while (!AtEnd) {
+    const MCExpr *Value;
+    if (getParser().parseExpression(Value))
+      return true;
+
+    getParser().getStreamer().EmitValue(Value, Size);
+
+    if (getLexer().is(AsmToken::EndOfStatement)) {
+      AtEnd = true;
+    } else if (getLexer().isNot(AsmToken::Comma)) {
+      // FIXME: Improve diagnostic.
+      return Error(L, "unexpected token in directive");
+    } else {
+      Parser.Lex(); // Eat the comma.
+    }
+  }
+
+  Parser.Lex(); // Eat the end of statement.
+  return false;
+}
+
+// Parse the inside of a memory operand: a base register optionally followed by
+// '+' or '-' and an offset (register or immediate). On success the resulting
+// memory operand is appended to Operands and MatchOperand_Success is returned;
+// every failure path returns MatchOperand_NoMatch.
+OperandMatchResultTy
+SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
+
+ SMLoc S, E;
+ unsigned BaseReg = 0;
+
+ if (ParseRegister(BaseReg, S, E)) {
+ return MatchOperand_NoMatch;
+ }
+
+ switch (getLexer().getKind()) {
+ default: return MatchOperand_NoMatch;
+
+ // Nothing follows the base register: address is just [base].
+ case AsmToken::Comma:
+ case AsmToken::RBrac:
+ case AsmToken::EndOfStatement:
+ Operands.push_back(SparcOperand::CreateMEMr(BaseReg, S, E));
+ return MatchOperand_Success;
+
+ case AsmToken:: Plus:
+ Parser.Lex(); // Eat the '+'
+ break;
+ // The '-' is left in the stream so that it is parsed as part of the offset.
+ case AsmToken::Minus:
+ break;
+ }
+
+ std::unique_ptr<SparcOperand> Offset;
+ OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset);
+ if (ResTy != MatchOperand_Success || !Offset)
+ return MatchOperand_NoMatch;
+
+ // An immediate offset yields a reg+imm operand; anything else is treated as
+ // a register offset.
+ Operands.push_back(
+ Offset->isImm() ? SparcOperand::MorphToMEMri(BaseReg, std::move(Offset))
+ : SparcOperand::MorphToMEMrr(BaseReg, std::move(Offset)));
+
+ return MatchOperand_Success;
+}
+
+// Parse a single operand of the instruction Mnemonic and append the resulting
+// operand(s) to Operands. The generated custom operand parsers are tried
+// first; otherwise a '[' starts a bracketed memory operand (emitted as a "["
+// token, the address, a "]" token and an optional integer address-space
+// identifier), and anything else is handed to parseSparcAsmOperand().
+OperandMatchResultTy
+SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+ // If there wasn't a custom match, try the generic matcher below. Otherwise,
+ // there was a match, but an error occurred, in which case, just return that
+ // the operand parsing failed.
+ if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
+ return ResTy;
+
+ if (getLexer().is(AsmToken::LBrac)) {
+ // Memory operand
+ Operands.push_back(SparcOperand::CreateToken("[",
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the [
+
+ // For the compare-and-swap mnemonics the bracketed address is a single
+ // register, pushed as a plain register operand.
+ if (Mnemonic == "cas" || Mnemonic == "casx" || Mnemonic == "casa") {
+ SMLoc S = Parser.getTok().getLoc();
+ if (getLexer().getKind() != AsmToken::Percent)
+ return MatchOperand_NoMatch;
+ Parser.Lex(); // eat %
+
+ unsigned RegNo, RegKind;
+ if (!matchRegisterName(Parser.getTok(), RegNo, RegKind))
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat the identifier token.
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer()-1);
+ Operands.push_back(SparcOperand::CreateReg(RegNo, RegKind, S, E));
+ ResTy = MatchOperand_Success;
+ } else {
+ ResTy = parseMEMOperand(Operands);
+ }
+
+ if (ResTy != MatchOperand_Success)
+ return ResTy;
+
+ if (!getLexer().is(AsmToken::RBrac))
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(SparcOperand::CreateToken("]",
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the ]
+
+ // Parse an optional address-space identifier after the address.
+ if (getLexer().is(AsmToken::Integer)) {
+ std::unique_ptr<SparcOperand> Op;
+ ResTy = parseSparcAsmOperand(Op, false);
+ if (ResTy != MatchOperand_Success || !Op)
+ return MatchOperand_ParseFail;
+ Operands.push_back(std::move(Op));
+ }
+ return MatchOperand_Success;
+ }
+
+ std::unique_ptr<SparcOperand> Op;
+
+ // Only for "call" is the isCall flag set when parsing the operand.
+ ResTy = parseSparcAsmOperand(Op, (Mnemonic == "call"));
+ if (ResTy != MatchOperand_Success || !Op)
+ return MatchOperand_ParseFail;
+
+ // Push the parsed operand into the list of operands
+ Operands.push_back(std::move(Op));
+
+ return MatchOperand_Success;
+}
+
+// Parse one non-memory operand into Op: a '%'-prefixed register or
+// %modifier(...) expression, a numeric expression, or a symbol reference.
+// Op is reset to null first; MatchOperand_Success is returned only if an
+// operand was created, MatchOperand_ParseFail otherwise. When isCall is set
+// and the output is position independent, a bare symbol is wrapped in a
+// VK_Sparc_WPLT30 expression.
+OperandMatchResultTy
+SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
+ bool isCall) {
+
+ SMLoc S = Parser.getTok().getLoc();
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ const MCExpr *EVal;
+
+ Op = nullptr;
+ switch (getLexer().getKind()) {
+ default: break;
+
+ case AsmToken::Percent:
+ Parser.Lex(); // Eat the '%'.
+ unsigned RegNo;
+ unsigned RegKind;
+ if (matchRegisterName(Parser.getTok(), RegNo, RegKind)) {
+ StringRef name = Parser.getTok().getString();
+ Parser.Lex(); // Eat the identifier token.
+ E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ // The registers listed below are turned into literal tokens rather than
+ // register operands; every other register becomes a register operand.
+ switch (RegNo) {
+ default:
+ Op = SparcOperand::CreateReg(RegNo, RegKind, S, E);
+ break;
+ case Sparc::PSR:
+ Op = SparcOperand::CreateToken("%psr", S);
+ break;
+ case Sparc::FSR:
+ Op = SparcOperand::CreateToken("%fsr", S);
+ break;
+ case Sparc::FQ:
+ Op = SparcOperand::CreateToken("%fq", S);
+ break;
+ case Sparc::CPSR:
+ Op = SparcOperand::CreateToken("%csr", S);
+ break;
+ case Sparc::CPQ:
+ Op = SparcOperand::CreateToken("%cq", S);
+ break;
+ case Sparc::WIM:
+ Op = SparcOperand::CreateToken("%wim", S);
+ break;
+ case Sparc::TBR:
+ Op = SparcOperand::CreateToken("%tbr", S);
+ break;
+ case Sparc::ICC:
+ // Both "icc" and "xcc" map to Sparc::ICC in matchRegisterName(), so the
+ // spelling decides which token is produced.
+ if (name == "xcc")
+ Op = SparcOperand::CreateToken("%xcc", S);
+ else
+ Op = SparcOperand::CreateToken("%icc", S);
+ break;
+ }
+ break;
+ }
+ // Not a register name: try a %modifier(expression) form.
+ if (matchSparcAsmModifiers(EVal, E)) {
+ E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Op = SparcOperand::CreateImm(EVal, S, E);
+ }
+ break;
+
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ case AsmToken::LParen:
+ case AsmToken::Dot:
+ if (!getParser().parseExpression(EVal, E))
+ Op = SparcOperand::CreateImm(EVal, S, E);
+ break;
+
+ case AsmToken::Identifier: {
+ StringRef Identifier;
+ if (!getParser().parseIdentifier(Identifier)) {
+ E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+
+ const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
+ getContext());
+ if (isCall && getContext().getObjectFileInfo()->isPositionIndependent())
+ Res = SparcMCExpr::create(SparcMCExpr::VK_Sparc_WPLT30, Res,
+ getContext());
+ Op = SparcOperand::CreateImm(Res, S, E);
+ }
+ break;
+ }
+ }
+ return (Op) ? MatchOperand_Success : MatchOperand_ParseFail;
+}
+
+// Parse the comma-separated branch modifiers that follow a mnemonic, pushing a
+// token operand for each of "a", "pn" and "pt". Returns MatchOperand_ParseFail
+// if a comma is not followed by an identifier, MatchOperand_Success otherwise.
+OperandMatchResultTy
+SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
+
+ // parse (,a|,pn|,pt)+
+
+ while (getLexer().is(AsmToken::Comma)) {
+
+ Parser.Lex(); // Eat the comma
+
+ if (!getLexer().is(AsmToken::Identifier))
+ return MatchOperand_ParseFail;
+ StringRef modName = Parser.getTok().getString();
+ // NOTE(review): an identifier other than a/pn/pt is neither consumed nor
+ // diagnosed here; the loop then ends (the current token is not a comma) and
+ // Success is returned with the comma already eaten -- confirm intended.
+ if (modName == "a" || modName == "pn" || modName == "pt") {
+ Operands.push_back(SparcOperand::CreateToken(modName,
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // eat the identifier.
+ }
+ }
+ return MatchOperand_Success;
+}
+
+// Translate the register name held in Tok (without its leading '%') into a
+// register number and a SparcOperand register kind.
+// Returns true and fills in RegNo/RegKind when Tok is an identifier naming a
+// known register; otherwise returns false with RegNo == 0 and
+// RegKind == SparcOperand::rk_None. No token is consumed.
+bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
+                                       unsigned &RegNo,
+                                       unsigned &RegKind)
+{
+  RegNo = 0;
+  RegKind = SparcOperand::rk_None;
+  if (!Tok.is(AsmToken::Identifier))
+    return false;
+
+  StringRef name = Tok.getString();
+  int64_t intVal = 0;
+
+  // Registers with a fixed, case-sensitive spelling. None of these names can
+  // also match one of the "<letter(s)><number>" forms handled further down,
+  // so looking them up first does not change which register a name maps to.
+  static const struct {
+    const char *Name;
+    unsigned Reg;
+    unsigned Kind;
+  } NamedRegs[] = {
+    {"fp", Sparc::I6, SparcOperand::rk_IntReg},  // %fp
+    {"sp", Sparc::O6, SparcOperand::rk_IntReg},  // %sp
+    {"y", Sparc::Y, SparcOperand::rk_Special},
+    {"icc", Sparc::ICC, SparcOperand::rk_Special},
+    {"psr", Sparc::PSR, SparcOperand::rk_Special},
+    {"fsr", Sparc::FSR, SparcOperand::rk_Special},
+    {"fq", Sparc::FQ, SparcOperand::rk_Special},
+    {"csr", Sparc::CPSR, SparcOperand::rk_Special},
+    {"cq", Sparc::CPQ, SparcOperand::rk_Special},
+    {"wim", Sparc::WIM, SparcOperand::rk_Special},
+    {"tbr", Sparc::TBR, SparcOperand::rk_Special},
+    // FIXME:: check 64bit.
+    {"xcc", Sparc::ICC, SparcOperand::rk_Special},
+    {"tpc", Sparc::TPC, SparcOperand::rk_Special},
+    {"tnpc", Sparc::TNPC, SparcOperand::rk_Special},
+    {"tstate", Sparc::TSTATE, SparcOperand::rk_Special},
+    {"tt", Sparc::TT, SparcOperand::rk_Special},
+    {"tick", Sparc::TICK, SparcOperand::rk_Special},
+    {"tba", Sparc::TBA, SparcOperand::rk_Special},
+    {"pstate", Sparc::PSTATE, SparcOperand::rk_Special},
+    {"tl", Sparc::TL, SparcOperand::rk_Special},
+    {"pil", Sparc::PIL, SparcOperand::rk_Special},
+    {"cwp", Sparc::CWP, SparcOperand::rk_Special},
+    {"cansave", Sparc::CANSAVE, SparcOperand::rk_Special},
+    {"canrestore", Sparc::CANRESTORE, SparcOperand::rk_Special},
+    {"cleanwin", Sparc::CLEANWIN, SparcOperand::rk_Special},
+    {"otherwin", Sparc::OTHERWIN, SparcOperand::rk_Special},
+    {"wstate", Sparc::WSTATE, SparcOperand::rk_Special},
+  };
+  for (const auto &Entry : NamedRegs) {
+    if (name.equals(Entry.Name)) {
+      RegNo = Entry.Reg;
+      RegKind = Entry.Kind;
+      return true;
+    }
+  }
+
+  // %fprs is an alias of %asr6.
+  if (name.equals("fprs")) {
+    RegNo = ASRRegs[6];
+    RegKind = SparcOperand::rk_Special;
+    return true;
+  }
+
+  // %asr1 - %asr31
+  if (name.substr(0, 3).equals_lower("asr")
+      && !name.substr(3).getAsInteger(10, intVal)
+      && intVal > 0 && intVal < 32) {
+    RegNo = ASRRegs[intVal];
+    RegKind = SparcOperand::rk_Special;
+    return true;
+  }
+
+  // %fcc0 - %fcc3
+  if (name.substr(0, 3).equals_lower("fcc")
+      && !name.substr(3).getAsInteger(10, intVal)
+      && intVal < 4) {
+    // FIXME: check 64bit and handle %fcc1 - %fcc3
+    RegNo = Sparc::FCC0 + intVal;
+    RegKind = SparcOperand::rk_Special;
+    return true;
+  }
+
+  // %g0 - %g7
+  if (name.substr(0, 1).equals_lower("g")
+      && !name.substr(1).getAsInteger(10, intVal)
+      && intVal < 8) {
+    RegNo = IntRegs[intVal];
+    RegKind = SparcOperand::rk_IntReg;
+    return true;
+  }
+  // %o0 - %o7
+  if (name.substr(0, 1).equals_lower("o")
+      && !name.substr(1).getAsInteger(10, intVal)
+      && intVal < 8) {
+    RegNo = IntRegs[8 + intVal];
+    RegKind = SparcOperand::rk_IntReg;
+    return true;
+  }
+  // %l0 - %l7
+  if (name.substr(0, 1).equals_lower("l")
+      && !name.substr(1).getAsInteger(10, intVal)
+      && intVal < 8) {
+    RegNo = IntRegs[16 + intVal];
+    RegKind = SparcOperand::rk_IntReg;
+    return true;
+  }
+  // %i0 - %i7
+  if (name.substr(0, 1).equals_lower("i")
+      && !name.substr(1).getAsInteger(10, intVal)
+      && intVal < 8) {
+    RegNo = IntRegs[24 + intVal];
+    RegKind = SparcOperand::rk_IntReg;
+    return true;
+  }
+  // %f0 - %f31
+  if (name.substr(0, 1).equals_lower("f")
+      && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 32) {
+    RegNo = FloatRegs[intVal];
+    RegKind = SparcOperand::rk_FloatReg;
+    return true;
+  }
+  // %f32 - %f62
+  if (name.substr(0, 1).equals_lower("f")
+      && !name.substr(1, 2).getAsInteger(10, intVal)
+      && intVal >= 32 && intVal <= 62 && (intVal % 2 == 0)) {
+    // FIXME: Check V9
+    RegNo = DoubleRegs[intVal/2];
+    RegKind = SparcOperand::rk_DoubleReg;
+    return true;
+  }
+
+  // %r0 - %r31
+  // The upper bound is 32 so that %r31 is accepted; the previous bound of 31
+  // wrongly rejected it.
+  if (name.substr(0, 1).equals_lower("r")
+      && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 32) {
+    RegNo = IntRegs[intVal];
+    RegKind = SparcOperand::rk_IntReg;
+    return true;
+  }
+
+  // %c0 - %c31
+  if (name.substr(0, 1).equals_lower("c")
+      && !name.substr(1).getAsInteger(10, intVal)
+      && intVal < 32) {
+    RegNo = CoprocRegs[intVal];
+    RegKind = SparcOperand::rk_CoprocReg;
+    return true;
+  }
+
+  return false;
+}
+
+// Report whether Expr, or any sub-expression reachable from it, is a
+// reference to the symbol named "_GLOBAL_OFFSET_TABLE_".
+static bool hasGOTReference(const MCExpr *Expr) {
+  switch (Expr->getKind()) {
+  case MCExpr::Constant:
+    return false;
+
+  case MCExpr::SymbolRef:
+    return cast<MCSymbolRefExpr>(Expr)->getSymbol().getName() ==
+           "_GLOBAL_OFFSET_TABLE_";
+
+  case MCExpr::Unary:
+    return hasGOTReference(cast<MCUnaryExpr>(Expr)->getSubExpr());
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *Bin = cast<MCBinaryExpr>(Expr);
+    if (hasGOTReference(Bin->getLHS()))
+      return true;
+    return hasGOTReference(Bin->getRHS());
+  }
+
+  case MCExpr::Target: {
+    // Only Sparc target expressions are looked into.
+    const SparcMCExpr *Wrapped = dyn_cast<SparcMCExpr>(Expr);
+    return Wrapped && hasGOTReference(Wrapped->getSubExpr());
+  }
+  }
+  return false;
+}
+
+// Wrap subExpr in a SparcMCExpr of kind VK, first adjusting VK for
+// position-independent output.
+//
+// When in PIC mode, "%lo(...)" and "%hi(...)" behave differently: if the
+// expression contains a reference to _GLOBAL_OFFSET_TABLE_, they are really
+// %pc10 / %pc22 relocations; otherwise they are interpreted as %got10 /
+// %got22 relocations. Every other variant kind is left unchanged.
+const SparcMCExpr *
+SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
+                                    const MCExpr *subExpr)
+{
+  const bool IsPIC = getContext().getObjectFileInfo()->isPositionIndependent();
+
+  if (IsPIC && VK == SparcMCExpr::VK_Sparc_LO) {
+    if (hasGOTReference(subExpr))
+      VK = SparcMCExpr::VK_Sparc_PC10;
+    else
+      VK = SparcMCExpr::VK_Sparc_GOT10;
+  } else if (IsPIC && VK == SparcMCExpr::VK_Sparc_HI) {
+    if (hasGOTReference(subExpr))
+      VK = SparcMCExpr::VK_Sparc_PC22;
+    else
+      VK = SparcMCExpr::VK_Sparc_GOT22;
+  }
+
+  return SparcMCExpr::create(VK, subExpr, getContext());
+}
+
+// Try to parse "<modifier>(<expression>)" where <modifier> is a name accepted
+// by SparcMCExpr::parseVariantKind (the caller has already consumed the
+// leading '%'). On success EVal receives the wrapped expression (adjusted by
+// adjustPICRelocation()), EndLoc is set by parseParenExpression(), and true is
+// returned. Returns false, consuming nothing, if the current token is not an
+// identifier or not a known modifier name.
+bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
+ SMLoc &EndLoc)
+{
+ AsmToken Tok = Parser.getTok();
+ if (!Tok.is(AsmToken::Identifier))
+ return false;
+
+ StringRef name = Tok.getString();
+
+ SparcMCExpr::VariantKind VK = SparcMCExpr::parseVariantKind(name);
+
+ if (VK == SparcMCExpr::VK_Sparc_None)
+ return false;
+
+ // From here on tokens have been consumed; the remaining failure paths
+ // return false without putting them back.
+ Parser.Lex(); // Eat the identifier.
+ if (Parser.getTok().getKind() != AsmToken::LParen)
+ return false;
+
+ Parser.Lex(); // Eat the LParen token.
+ const MCExpr *subExpr;
+ if (Parser.parseParenExpression(subExpr, EndLoc))
+ return false;
+
+ EVal = adjustPICRelocation(VK, subExpr);
+ return true;
+}
+
+// Register SparcAsmParser as the assembly parser for the Sparc, SparcV9 and
+// Sparcel targets.
+extern "C" void LLVMInitializeSparcAsmParser() {
+  RegisterMCAsmParser<SparcAsmParser> RegisterSparc(getTheSparcTarget());
+  RegisterMCAsmParser<SparcAsmParser> RegisterSparcV9(getTheSparcV9Target());
+  RegisterMCAsmParser<SparcAsmParser> RegisterSparcel(getTheSparcelTarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "SparcGenAsmMatcher.inc"
+
+// Give a parsed register operand a second chance to match operand class Kind
+// by morphing it into a wider register: float/double registers into double or
+// quad registers, integer registers into integer pairs, and coprocessor
+// registers into coprocessor pairs. Returns Match_Success if the operand fits
+// (possibly after being rewritten in place) and Match_InvalidOperand
+// otherwise.
+unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp,
+                                                    unsigned Kind) {
+  SparcOperand &Operand = (SparcOperand &)GOp;
+
+  bool Accepted = false;
+  if (Operand.isFloatOrDoubleReg()) {
+    if (Kind == MCK_DFPRegs) {
+      // A double register is accepted as-is; a float register must morph.
+      Accepted =
+          !Operand.isFloatReg() || SparcOperand::MorphToDoubleReg(Operand);
+    } else if (Kind == MCK_QFPRegs) {
+      Accepted = SparcOperand::MorphToQuadReg(Operand);
+    }
+  }
+  if (Accepted)
+    return MCTargetAsmParser::Match_Success;
+
+  if (Operand.isIntReg() && Kind == MCK_IntPair &&
+      SparcOperand::MorphToIntPairReg(Operand))
+    return MCTargetAsmParser::Match_Success;
+
+  if (Operand.isCoprocReg() && Kind == MCK_CoprocPair &&
+      SparcOperand::MorphToCoprocPairReg(Operand))
+    return MCTargetAsmParser::Match_Success;
+
+  return Match_InvalidOperand;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
new file mode 100644
index 000000000000..6f9cc314e376
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -0,0 +1,512 @@
+//===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a simple local pass that attempts to fill delay slots with useful
+// instructions. If no instructions can be moved into the delay slot, then a
+// NOP is placed.
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+static cl::opt<bool> DisableDelaySlotFiller(
+ "disable-sparc-delay-filler",
+ cl::init(false),
+ cl::desc("Disable the Sparc delay slot filler."),
+ cl::Hidden);
+
+namespace {
+  /// Machine-function pass that fills the delay slot of every delayed
+  /// instruction, either with an instruction moved from earlier in the same
+  /// basic block or with a NOP.
+  struct Filler : public MachineFunctionPass {
+    /// Subtarget of the function currently being processed.
+    const SparcSubtarget *Subtarget;
+
+    static char ID;
+    Filler() : MachineFunctionPass(ID) {}
+
+    StringRef getPassName() const override { return "SPARC Delay Slot Filler"; }
+
+    /// Fill the delay slots of one basic block; returns true if it changed.
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+    /// Process every basic block of \p F; returns true if any block changed.
+    bool runOnMachineFunction(MachineFunction &F) override {
+      Subtarget = &F.getSubtarget<SparcSubtarget>();
+
+      // Reordering instructions to fill delay slots invalidates the liveness
+      // information, so drop it up front.
+      F.getRegInfo().invalidateLiveness();
+
+      bool Modified = false;
+      for (MachineBasicBlock &Block : F)
+        Modified |= runOnMachineBasicBlock(Block);
+      return Modified;
+    }
+
+    /// This pass must run after all virtual registers have been eliminated.
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    void insertCallDefsUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegDefs,
+                            SmallSet<unsigned, 32>& RegUses);
+
+    void insertDefsUses(MachineBasicBlock::iterator MI,
+                        SmallSet<unsigned, 32>& RegDefs,
+                        SmallSet<unsigned, 32>& RegUses);
+
+    bool IsRegInSet(SmallSet<unsigned, 32>& RegSet,
+                    unsigned Reg);
+
+    bool delayHasHazard(MachineBasicBlock::iterator candidate,
+                        bool &sawLoad, bool &sawStore,
+                        SmallSet<unsigned, 32> &RegDefs,
+                        SmallSet<unsigned, 32> &RegUses);
+
+    MachineBasicBlock::iterator
+    findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot);
+
+    bool needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize);
+
+    bool tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI);
+
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcDelaySlotFillerPass - Create a new instance of the pass that
+/// fills delay slots in Sparc MachineFunctions. The \p tm argument is not
+/// used. The pass is allocated with new and returned as a raw pointer.
+///
+FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
+  return new Filler();
+}
+
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// We assume there is only one delay slot per delayed instruction.
+/// Each delayed instruction is bundled with its filler (and with a trailing
+/// UNIMP when needsUnimp() asks for one). Returns true if the block changed.
+///
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ Subtarget = &MBB.getParent()->getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ // I always points just past MI, i.e. at the position of MI's delay slot.
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+ MachineBasicBlock::iterator MI = I;
+ ++I;
+
+ // If MI is restore, try combining it with previous inst.
+ if (!DisableDelaySlotFiller &&
+ (MI->getOpcode() == SP::RESTORErr
+ || MI->getOpcode() == SP::RESTOREri)) {
+ Changed |= tryCombineRestoreWithPrevInst(MBB, MI);
+ continue;
+ }
+
+ // TODO: If we ever want to support v7, this needs to be extended
+ // to cover all floating point operations.
+ // On non-V9 subtargets a NOP is inserted right after each FP compare.
+ if (!Subtarget->isV9() &&
+ (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD
+ || MI->getOpcode() == SP::FCMPQ)) {
+ BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+ Changed = true;
+ continue;
+ }
+
+ // If MI has no delay slot, skip.
+ if (!MI->hasDelaySlot())
+ continue;
+
+ // MBB.end() means "no suitable instruction found; use a NOP".
+ MachineBasicBlock::iterator D = MBB.end();
+
+ if (!DisableDelaySlotFiller)
+ D = findDelayInstr(MBB, MI);
+
+ // The statistic is bumped whether the slot receives a moved instruction or
+ // a NOP.
+ ++FilledSlots;
+ Changed = true;
+
+ if (D == MBB.end())
+ BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+ else
+ MBB.splice(I, &MBB, D);
+
+ unsigned structSize = 0;
+ if (needsUnimp(MI, structSize)) {
+ MachineBasicBlock::iterator J = MI;
+ ++J; // skip the delay filler.
+ assert (J != MBB.end() && "MI needs a delay instruction.");
+ BuildMI(MBB, ++J, MI->getDebugLoc(),
+ TII->get(SP::UNIMP)).addImm(structSize);
+ // Bundle the delay filler and unimp with the instruction.
+ MIBundleBuilder(MBB, MachineBasicBlock::iterator(MI), J);
+ } else {
+ MIBundleBuilder(MBB, MachineBasicBlock::iterator(MI), I);
+ }
+ }
+ return Changed;
+}
+
+// Search backwards from slot, within MBB, for an instruction that can be moved
+// into slot's delay slot. Returns an iterator to that instruction, or
+// MBB.end() if there is none. As a special case, a RETL directly preceded by
+// a RESTORE is rewritten to RET and the RESTORE is returned as the filler.
+MachineBasicBlock::iterator
+Filler::findDelayInstr(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator slot)
+{
+ // Registers defined / used by slot and by every instruction that has been
+ // stepped over so far.
+ SmallSet<unsigned, 32> RegDefs;
+ SmallSet<unsigned, 32> RegUses;
+ bool sawLoad = false;
+ bool sawStore = false;
+
+ // Nothing precedes slot, so there is nothing to move.
+ if (slot == MBB.begin())
+ return MBB.end();
+
+ // No candidate is ever searched for behind RET or TLS_CALL.
+ if (slot->getOpcode() == SP::RET || slot->getOpcode() == SP::TLS_CALL)
+ return MBB.end();
+
+ if (slot->getOpcode() == SP::RETL) {
+ MachineBasicBlock::iterator J = slot;
+ --J;
+
+ if (J->getOpcode() == SP::RESTORErr
+ || J->getOpcode() == SP::RESTOREri) {
+ // change retl to ret.
+ slot->setDesc(Subtarget->getInstrInfo()->get(SP::RET));
+ return J;
+ }
+ }
+
+ // Call's delay filler can def some of call's uses.
+ if (slot->isCall())
+ insertCallDefsUses(slot, RegDefs, RegUses);
+ else
+ insertDefsUses(slot, RegDefs, RegUses);
+
+ bool done = false;
+
+ MachineBasicBlock::iterator I = slot;
+
+ // NOTE(review): once I has reached MBB.begin(), the final iteration does not
+ // step back and therefore examines that first instruction a second time.
+ while (!done) {
+ done = (I == MBB.begin());
+
+ if (!done)
+ --I;
+
+ // skip debug value
+ if (I->isDebugValue())
+ continue;
+
+ // Stop searching at the first instruction of any of these kinds.
+ if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isPosition() ||
+ I->hasDelaySlot() || I->isBundledWithSucc())
+ break;
+
+ // A hazardous candidate is skipped, but its own defs and uses now also
+ // constrain the candidates further up.
+ if (delayHasHazard(I, sawLoad, sawStore, RegDefs, RegUses)) {
+ insertDefsUses(I, RegDefs, RegUses);
+ continue;
+ }
+
+ return I;
+ }
+ return MBB.end();
+}
+
+// Returns true when 'candidate' must not be moved into the delay slot.
+// sawLoad / sawStore accumulate the memory behaviour of the candidates seen
+// so far; RegDefs / RegUses hold the registers defined / used by the
+// instructions between the candidate and the slot.
+bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
+                            bool &sawLoad,
+                            bool &sawStore,
+                            SmallSet<unsigned, 32> &RegDefs,
+                            SmallSet<unsigned, 32> &RegUses)
+{
+  if (candidate->isImplicitDef() || candidate->isKill())
+    return true;
+
+  // A load is a hazard once a store has been seen.
+  if (candidate->mayLoad()) {
+    sawLoad = true;
+    if (sawStore)
+      return true;
+  }
+
+  // A store is a hazard once any load or store has been seen.
+  if (candidate->mayStore()) {
+    const bool Conflict = sawStore || sawLoad;
+    sawStore = true;
+    if (Conflict)
+      return true;
+  }
+
+  for (unsigned Idx = 0, End = candidate->getNumOperands(); Idx != End; ++Idx) {
+    const MachineOperand &Op = candidate->getOperand(Idx);
+    if (!Op.isReg())
+      continue;
+
+    const unsigned Reg = Op.getReg();
+
+    // Neither a def nor a use may touch a register defined before the slot.
+    if (IsRegInSet(RegDefs, Reg))
+      return true;
+
+    // A def additionally must not clobber a register used before the slot.
+    if (Op.isDef() && IsRegInSet(RegUses, Reg))
+      return true;
+  }
+
+  const unsigned Opc = candidate->getOpcode();
+
+  // LD and LDD may have NOPs inserted afterwards in the case of some LEON
+  // processors, so we can't use the delay slot if this feature is switched-on.
+  // (The test relies on these opcodes being contiguous in the SP enum.)
+  if (Subtarget->insertNOPLoad() && Opc >= SP::LDDArr && Opc <= SP::LDrr)
+    return true;
+
+  // Same as above for FDIV and FSQRT on some LEON processors.
+  return Subtarget->fixAllFDIVSQRT() && Opc >= SP::FDIVD &&
+         Opc <= SP::FSQRTD;
+}
+
+
+// Record the registers a call instruction defines and uses: O7 is always a
+// def; the register-form calls additionally use their address operand(s).
+void Filler::insertCallDefsUses(MachineBasicBlock::iterator MI,
+                                SmallSet<unsigned, 32>& RegDefs,
+                                SmallSet<unsigned, 32>& RegUses)
+{
+  // Call defines o7, which is visible to the instruction in delay slot.
+  RegDefs.insert(SP::O7);
+
+  const unsigned Opc = MI->getOpcode();
+
+  // A plain CALL contributes nothing beyond the O7 def.
+  if (Opc == SP::CALL)
+    return;
+
+  if (Opc != SP::CALLrr && Opc != SP::CALLri)
+    llvm_unreachable("Unknown opcode.");
+
+  assert(MI->getNumOperands() >= 2);
+  const MachineOperand &Reg = MI->getOperand(0);
+  assert(Reg.isReg() && "CALL first operand is not a register.");
+  assert(Reg.isUse() && "CALL first operand is not a use.");
+  RegUses.insert(Reg.getReg());
+
+  // The second operand is only a register use when it is neither an
+  // immediate nor a global.
+  const MachineOperand &Operand1 = MI->getOperand(1);
+  if (Operand1.isImm() || Operand1.isGlobal())
+    return;
+  assert(Operand1.isReg() && "CALLrr second operand is not a register.");
+  assert(Operand1.isUse() && "CALLrr second operand is not a use.");
+  RegUses.insert(Operand1.getReg());
+}
+
+// Add every register MI defines to RegDefs and every register it uses to
+// RegUses.  Register number 0 is ignored.
+void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegDefs,
+                            SmallSet<unsigned, 32>& RegUses)
+{
+  const bool IsRetl = MI->getOpcode() == SP::RETL;
+
+  for (unsigned Idx = 0, End = MI->getNumOperands(); Idx != End; ++Idx) {
+    const MachineOperand &Op = MI->getOperand(Idx);
+    if (!Op.isReg())
+      continue;
+
+    const unsigned Reg = Op.getReg();
+    if (Reg == 0)
+      continue;
+
+    if (Op.isDef())
+      RegDefs.insert(Reg);
+
+    if (!Op.isUse())
+      continue;
+
+    // Implicit register uses of retl are return values and
+    // retl does not use them.
+    if (IsRetl && Op.isImplicit())
+      continue;
+
+    RegUses.insert(Reg);
+  }
+}
+
+// Returns true if Reg, or any register aliasing it, is a member of RegSet.
+bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
+{
+  // Walk Reg and all aliased registers.
+  MCRegAliasIterator Alias(Reg, Subtarget->getRegisterInfo(), true);
+  while (Alias.isValid()) {
+    if (RegSet.count(*Alias))
+      return true;
+    ++Alias;
+  }
+  return false;
+}
+
+// Returns true, storing the value in StructSize, when call instruction I
+// carries an immediate struct-size operand.  Non-calls and TLS_CALL never do.
+bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
+{
+  if (!I->isCall())
+    return false;
+
+  // Index of the struct-size operand for each call form.
+  unsigned SizeOpIdx = 0;
+  switch (I->getOpcode()) {
+  case SP::TLS_CALL:
+    return false;
+  case SP::CALL:
+    SizeOpIdx = 1;
+    break;
+  case SP::CALLrr:
+  case SP::CALLri:
+    SizeOpIdx = 2;
+    break;
+  default:
+    llvm_unreachable("Unknown call opcode.");
+  }
+
+  const MachineOperand &SizeOp = I->getOperand(SizeOpIdx);
+  if (SizeOp.isImm()) {
+    StructSize = SizeOp.getImm();
+    return true;
+  }
+  return false;
+}
+
+// Fold an ADD into the RESTORE that follows it.
+//
+//   Before: add <op0>, <op1>, %i[0-7]
+//           restore %g0, %g0, %g0
+//
+//   After : restore <op0>, <op1>, %o[0-7]
+//
+// Returns false, changing nothing, unless the ADD writes one of %i0-%i7.
+static bool combineRestoreADD(MachineBasicBlock::iterator RestoreMI,
+                              MachineBasicBlock::iterator AddMI,
+                              const TargetInstrInfo *TII)
+{
+  const unsigned DstReg = AddMI->getOperand(0).getReg();
+  const bool WritesInReg = DstReg >= SP::I0 && DstReg <= SP::I7;
+  if (!WritesInReg)
+    return false;
+
+  // The separate RESTORE goes away ...
+  RestoreMI->eraseFromParent();
+
+  // ... and the ADD becomes the RESTORE of the matching form.
+  const unsigned NewOpc =
+      (AddMI->getOpcode() == SP::ADDrr) ? SP::RESTORErr : SP::RESTOREri;
+  AddMI->setDesc(TII->get(NewOpc));
+
+  // Map the destination from %iN to %oN.
+  AddMI->getOperand(0).setReg(DstReg - SP::I0 + SP::O0);
+
+  return true;
+}
+
+// Fold a register-copying OR into the RESTORE that follows it.
+//
+//   Before: or <op0>, <op1>, %i[0-7]
+//           restore %g0, %g0, %g0
+//       where <op0> is %g0, or <op1> is %g0 (ORrr) / the immediate 0 (ORri)
+//
+//   After : restore <op0>, <op1>, %o[0-7]
+//
+// Returns false, changing nothing, when the OR does not fit that pattern.
+static bool combineRestoreOR(MachineBasicBlock::iterator RestoreMI,
+                             MachineBasicBlock::iterator OrMI,
+                             const TargetInstrInfo *TII)
+{
+  const unsigned DstReg = OrMI->getOperand(0).getReg();
+  if (DstReg < SP::I0 || DstReg > SP::I7)
+    return false;
+
+  const bool IsRegReg = OrMI->getOpcode() == SP::ORrr;
+
+  // check whether it is a copy.
+  if (IsRegReg) {
+    if (OrMI->getOperand(1).getReg() != SP::G0 &&
+        OrMI->getOperand(2).getReg() != SP::G0)
+      return false;
+  } else if (OrMI->getOpcode() == SP::ORri) {
+    if (OrMI->getOperand(1).getReg() != SP::G0) {
+      const MachineOperand &Rhs = OrMI->getOperand(2);
+      if (!Rhs.isImm() || Rhs.getImm() != 0)
+        return false;
+    }
+  }
+
+  // The separate RESTORE goes away ...
+  RestoreMI->eraseFromParent();
+
+  // ... and the OR becomes the RESTORE of the matching form.
+  OrMI->setDesc(TII->get(IsRegReg ? SP::RESTORErr : SP::RESTOREri));
+
+  // Map the destination from %iN to %oN.
+  OrMI->getOperand(0).setReg(DstReg - SP::I0 + SP::O0);
+
+  return true;
+}
+
+// Fold a small SETHI into the RESTORE that follows it.
+//
+//   Before: sethi imm3, %i[0-7]
+//           restore %g0, %g0, %g0
+//
+//   After : restore %g0, (imm3<<10), %o[0-7]
+//
+// Unlike the ADD/OR cases, the RESTORE is the instruction that is kept and
+// rewritten; the SETHI is erased.  Returns false, changing nothing, when the
+// SETHI does not write %i0-%i7 or its operand is not a 3-bit immediate.
+static bool combineRestoreSETHIi(MachineBasicBlock::iterator RestoreMI,
+                                 MachineBasicBlock::iterator SetHiMI,
+                                 const TargetInstrInfo *TII)
+{
+  const unsigned DstReg = SetHiMI->getOperand(0).getReg();
+  if (DstReg < SP::I0 || DstReg > SP::I7)
+    return false;
+
+  const MachineOperand &HiOp = SetHiMI->getOperand(1);
+  if (!HiOp.isImm())
+    return false;
+
+  // Is it a 3 bit immediate?
+  const int64_t Hi = HiOp.getImm();
+  if (!isInt<3>(Hi))
+    return false;
+
+  // Make it a 13 bit immediate.
+  const int64_t Simm13 = (Hi << 10) & 0x1FFF;
+
+  assert(RestoreMI->getOpcode() == SP::RESTORErr);
+
+  RestoreMI->setDesc(TII->get(SP::RESTOREri));
+
+  RestoreMI->getOperand(0).setReg(DstReg - SP::I0 + SP::O0);
+  RestoreMI->getOperand(1).setReg(SP::G0);
+  RestoreMI->getOperand(2).ChangeToImmediate(Simm13);
+
+  // Erase the original SETHI.
+  SetHiMI->eraseFromParent();
+
+  return true;
+}
+
+// Try to merge the "restore %g0, %g0, %g0" at MBBI with the instruction
+// immediately before it (ADD, OR or SETHI).  Returns true when a merge
+// happened.
+bool Filler::tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MBBI)
+{
+  // No previous instruction.
+  if (MBBI == MBB.begin())
+    return false;
+
+  // assert that MBBI is a "restore %g0, %g0, %g0".
+  assert(MBBI->getOpcode() == SP::RESTORErr
+         && MBBI->getOperand(0).getReg() == SP::G0
+         && MBBI->getOperand(1).getReg() == SP::G0
+         && MBBI->getOperand(2).getReg() == SP::G0);
+
+  MachineBasicBlock::iterator Prev = std::prev(MBBI);
+
+  // It cannot be combined with a bundled instruction.
+  if (Prev->isBundledWithSucc())
+    return false;
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  const unsigned PrevOpc = Prev->getOpcode();
+
+  if (PrevOpc == SP::ADDrr || PrevOpc == SP::ADDri)
+    return combineRestoreADD(MBBI, Prev, TII);
+
+  if (PrevOpc == SP::ORrr || PrevOpc == SP::ORri)
+    return combineRestoreOR(MBBI, Prev, TII);
+
+  if (PrevOpc == SP::SETHIi)
+    return combineRestoreSETHIi(MBBI, Prev, TII);
+
+  // It cannot combine with the previous instruction.
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
new file mode 100644
index 000000000000..da7e0b737e78
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -0,0 +1,670 @@
+//===- SparcDisassembler.cpp - Disassembler for Sparc -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Sparc Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcRegisterInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sparc-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// MCDisassembler implementation for the Sparc targets; all of the work is
+/// done in getInstruction().
+class SparcDisassembler : public MCDisassembler {
+public:
+  SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+  ~SparcDisassembler() override = default;
+
+  /// Decode a single instruction from \p Bytes into \p Instr.
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+
+} // end anonymous namespace
+
+namespace llvm { // Declarations only: accessors for the three Sparc Target objects registered below.
+Target &getTheSparcTarget();
+Target &getTheSparcV9Target();
+Target &getTheSparcelTarget();
+}
+
+/// Factory handed to the TargetRegistry: builds a SparcDisassembler for the
+/// given subtarget and context.  \p T is not used.
+static MCDisassembler *createSparcDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  MCDisassembler *Dis = new SparcDisassembler(STI, Ctx);
+  return Dis;
+}
+
+
+/// Register the disassembler factory for the Sparc, SparcV9 and Sparcel
+/// targets.
+extern "C" void LLVMInitializeSparcDisassembler() {
+  // All three targets share the same factory.
+  Target *const SparcTargets[] = {&getTheSparcTarget(),
+                                  &getTheSparcV9Target(),
+                                  &getTheSparcelTarget()};
+  for (Target *T : SparcTargets)
+    TargetRegistry::RegisterMCDisassembler(*T, createSparcDisassembler);
+}
+
+static const unsigned IntRegDecoderTable[] = { // Indexed by the 5-bit register field (0-31): G, O, L, then I registers.
+ SP::G0, SP::G1, SP::G2, SP::G3,
+ SP::G4, SP::G5, SP::G6, SP::G7,
+ SP::O0, SP::O1, SP::O2, SP::O3,
+ SP::O4, SP::O5, SP::O6, SP::O7,
+ SP::L0, SP::L1, SP::L2, SP::L3,
+ SP::L4, SP::L5, SP::L6, SP::L7,
+ SP::I0, SP::I1, SP::I2, SP::I3,
+ SP::I4, SP::I5, SP::I6, SP::I7 };
+
+static const unsigned FPRegDecoderTable[] = { // Indexed by the 5-bit register field: F0-F31 in order.
+ SP::F0, SP::F1, SP::F2, SP::F3,
+ SP::F4, SP::F5, SP::F6, SP::F7,
+ SP::F8, SP::F9, SP::F10, SP::F11,
+ SP::F12, SP::F13, SP::F14, SP::F15,
+ SP::F16, SP::F17, SP::F18, SP::F19,
+ SP::F20, SP::F21, SP::F22, SP::F23,
+ SP::F24, SP::F25, SP::F26, SP::F27,
+ SP::F28, SP::F29, SP::F30, SP::F31 };
+
+static const unsigned DFPRegDecoderTable[] = { // Even encodings map to D0-D15, odd encodings to D16-D31.
+ SP::D0, SP::D16, SP::D1, SP::D17,
+ SP::D2, SP::D18, SP::D3, SP::D19,
+ SP::D4, SP::D20, SP::D5, SP::D21,
+ SP::D6, SP::D22, SP::D7, SP::D23,
+ SP::D8, SP::D24, SP::D9, SP::D25,
+ SP::D10, SP::D26, SP::D11, SP::D27,
+ SP::D12, SP::D28, SP::D13, SP::D29,
+ SP::D14, SP::D30, SP::D15, SP::D31 };
+
+static const unsigned QFPRegDecoderTable[] = { // ~0U marks encodings with no quad register; DecodeQFPRegsRegisterClass rejects them.
+ SP::Q0, SP::Q8, ~0U, ~0U,
+ SP::Q1, SP::Q9, ~0U, ~0U,
+ SP::Q2, SP::Q10, ~0U, ~0U,
+ SP::Q3, SP::Q11, ~0U, ~0U,
+ SP::Q4, SP::Q12, ~0U, ~0U,
+ SP::Q5, SP::Q13, ~0U, ~0U,
+ SP::Q6, SP::Q14, ~0U, ~0U,
+ SP::Q7, SP::Q15, ~0U, ~0U } ;
+
+static const unsigned FCCRegDecoderTable[] = { // Indexed by a 2-bit field (0-3).
+ SP::FCC0, SP::FCC1, SP::FCC2, SP::FCC3 };
+
+static const unsigned ASRRegDecoderTable[] = { // Indexed by the 5-bit register field; entry 0 is Y rather than an ASRn register.
+ SP::Y, SP::ASR1, SP::ASR2, SP::ASR3,
+ SP::ASR4, SP::ASR5, SP::ASR6, SP::ASR7,
+ SP::ASR8, SP::ASR9, SP::ASR10, SP::ASR11,
+ SP::ASR12, SP::ASR13, SP::ASR14, SP::ASR15,
+ SP::ASR16, SP::ASR17, SP::ASR18, SP::ASR19,
+ SP::ASR20, SP::ASR21, SP::ASR22, SP::ASR23,
+ SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27,
+ SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31};
+
+static const unsigned PRRegDecoderTable[] = { // 15 entries; DecodePRRegsRegisterClass bounds-checks against this array's length.
+ SP::TPC, SP::TNPC, SP::TSTATE, SP::TT, SP::TICK, SP::TBA, SP::PSTATE,
+ SP::TL, SP::PIL, SP::CWP, SP::CANSAVE, SP::CANRESTORE, SP::CLEANWIN,
+ SP::OTHERWIN, SP::WSTATE
+};
+
+static const uint16_t IntPairDecoderTable[] = { // 16 pairs; DecodeIntPairRegisterClass indexes it with RegNo/2.
+ SP::G0_G1, SP::G2_G3, SP::G4_G5, SP::G6_G7,
+ SP::O0_O1, SP::O2_O3, SP::O4_O5, SP::O6_O7,
+ SP::L0_L1, SP::L2_L3, SP::L4_L5, SP::L6_L7,
+ SP::I0_I1, SP::I2_I3, SP::I4_I5, SP::I6_I7,
+};
+
+static const unsigned CPRegDecoderTable[] = { // Indexed by the 5-bit register field: C0-C31 in order.
+ SP::C0, SP::C1, SP::C2, SP::C3,
+ SP::C4, SP::C5, SP::C6, SP::C7,
+ SP::C8, SP::C9, SP::C10, SP::C11,
+ SP::C12, SP::C13, SP::C14, SP::C15,
+ SP::C16, SP::C17, SP::C18, SP::C19,
+ SP::C20, SP::C21, SP::C22, SP::C23,
+ SP::C24, SP::C25, SP::C26, SP::C27,
+ SP::C28, SP::C29, SP::C30, SP::C31
+};
+
+
+static const uint16_t CPPairDecoderTable[] = { // 16 pairs; DecodeCPPairRegisterClass indexes it with RegNo/2.
+ SP::C0_C1, SP::C2_C3, SP::C4_C5, SP::C6_C7,
+ SP::C8_C9, SP::C10_C11, SP::C12_C13, SP::C14_C15,
+ SP::C16_C17, SP::C18_C19, SP::C20_C21, SP::C22_C23,
+ SP::C24_C25, SP::C26_C27, SP::C28_C29, SP::C30_C31
+};
+
+/// Append the IntRegs register with encoding \p RegNo to \p Inst.
+/// Fails for encodings above 31.
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= array_lengthof(IntRegDecoderTable))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(IntRegDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+/// Append the I64Regs register with encoding \p RegNo to \p Inst.  The
+/// encoding-to-register mapping is the same as for IntRegs.
+static DecodeStatus DecodeI64RegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return DecodeIntRegsRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+
+/// Append the single-precision FP register with encoding \p RegNo to
+/// \p Inst.  Fails for encodings above 31.
+static DecodeStatus DecodeFPRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo >= array_lengthof(FPRegDecoderTable))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(FPRegDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+
+/// Append the double-precision FP register with encoding \p RegNo to
+/// \p Inst.  Fails for encodings above 31.
+static DecodeStatus DecodeDFPRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= array_lengthof(DFPRegDecoderTable))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(DFPRegDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+
+/// Append the quad-precision FP register with encoding \p RegNo to \p Inst.
+/// Fails for encodings above 31 and for encodings the table marks invalid.
+static DecodeStatus DecodeQFPRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= array_lengthof(QFPRegDecoderTable))
+    return MCDisassembler::Fail;
+
+  const unsigned QuadReg = QFPRegDecoderTable[RegNo];
+  // ~0U is the table's marker for "no quad register at this encoding".
+  if (QuadReg != ~0U) {
+    Inst.addOperand(MCOperand::createReg(QuadReg));
+    return MCDisassembler::Success;
+  }
+  return MCDisassembler::Fail;
+}
+
+/// Append the coprocessor register with encoding \p RegNo to \p Inst.
+/// Fails for encodings above 31.
+static DecodeStatus DecodeCPRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo >= array_lengthof(CPRegDecoderTable))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(CPRegDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+/// Append the FCC register with encoding \p RegNo to \p Inst.
+/// Fails for encodings above 3.
+static DecodeStatus DecodeFCCRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= array_lengthof(FCCRegDecoderTable))
+    return MCDisassembler::Fail;
+
+  const unsigned Reg = FCCRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+/// Append the ASR-class register with encoding \p RegNo to \p Inst.
+/// Fails for encodings above 31.
+static DecodeStatus DecodeASRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= array_lengthof(ASRRegDecoderTable))
+    return MCDisassembler::Fail;
+
+  const unsigned Reg = ASRRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+/// Append the PR-class register with encoding \p RegNo to \p Inst.  Fails
+/// for encodings beyond the end of PRRegDecoderTable.
+static DecodeStatus DecodePRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  const bool InRange = RegNo < array_lengthof(PRRegDecoderTable);
+  if (!InRange)
+    return MCDisassembler::Fail;
+
+  const unsigned Reg = PRRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+/// Append the integer register pair selected by \p RegNo to \p Inst.
+///
+/// Fails for encodings above 31.  An odd encoding still yields the pair
+/// RegNo/2 but is reported as SoftFail.
+static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(IntPairDecoderTable[RegNo / 2]));
+
+  const bool IsOdd = (RegNo & 1) != 0;
+  return IsOdd ? MCDisassembler::SoftFail : MCDisassembler::Success;
+}
+
+/// Append the coprocessor register pair selected by \p RegNo to \p Inst.
+/// Fails for encodings above 31; otherwise the pair is entry RegNo/2.
+static DecodeStatus DecodeCPPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo <= 31) {
+    Inst.addOperand(MCOperand::createReg(CPPairDecoderTable[RegNo / 2]));
+    return MCDisassembler::Success;
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder); // Forward declarations of the custom decoders defined later in this file; presumably needed by the generated tables included below (confirm).
+static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadCP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreCPPair(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeJMPL(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSWAP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeTRAP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+
+#include "SparcGenDisassemblerTables.inc"
+
+/// Read four bytes from the ArrayRef and return 32 bit word.
+///
+/// On success \p Insn receives the word assembled in the byte order selected
+/// by \p IsLittleEndian.  If fewer than four bytes are available, \p Size is
+/// set to 0 and Fail is returned.  \p Address is not used.
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsLittleEndian) {
+  // We want to read exactly 4 Bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Widen each byte to uint32_t before shifting.  A bare uint8_t promotes to
+  // signed int, so shifting a byte >= 0x80 left by 24 would move a bit into
+  // the sign position.
+  const uint32_t B0 = Bytes[0];
+  const uint32_t B1 = Bytes[1];
+  const uint32_t B2 = Bytes[2];
+  const uint32_t B3 = Bytes[3];
+
+  Insn = IsLittleEndian
+             ? (B0 << 0) | (B1 << 8) | (B2 << 16) | (B3 << 24)
+             : (B3 << 0) | (B2 << 8) | (B1 << 16) | (B0 << 24);
+
+  return MCDisassembler::Success;
+}
+
+/// Decode one 32-bit instruction from \p Bytes into \p Instr.
+///
+/// The word is read in the byte order reported by the MCAsmInfo, then tried
+/// against the V9 or V8 specific decoder table (chosen by the subtarget's
+/// FeatureV9 bit) and, if that fails, against the common Sparc table.
+/// Whenever a table produces a result other than Fail, \p Size is set to 4
+/// and that result is returned; otherwise Fail is returned.
+DecodeStatus SparcDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                               ArrayRef<uint8_t> Bytes,
+                                               uint64_t Address,
+                                               raw_ostream &VStream,
+                                               raw_ostream &CStream) const {
+  uint32_t Insn;
+  bool isLittleEndian = getContext().getAsmInfo()->isLittleEndian();
+  DecodeStatus Result =
+      readInstruction32(Bytes, Address, Size, Insn, isLittleEndian);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Calling the auto-generated decoder function: ISA-specific table first.
+  if (STI.getFeatureBits()[Sparc::FeatureV9])
+    Result = decodeInstruction(DecoderTableSparcV932, Instr, Insn, Address,
+                               this, STI);
+  else
+    Result = decodeInstruction(DecoderTableSparcV832, Instr, Insn, Address,
+                               this, STI);
+
+  // Fall back to the table shared by all Sparc variants.
+  if (Result == MCDisassembler::Fail)
+    Result =
+        decodeInstruction(DecoderTableSparc32, Instr, Insn, Address, this, STI);
+
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Every successfully decoded instruction is one 4-byte word.  This used to
+  // be skipped when the V8/V9 table matched, leaving Size unset for callers.
+  Size = 4;
+  return Result;
+}
+
+
+typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder); // Callback type: DecodeMem receives one of these as DecodeRD and calls it on the rd field.
+
+/// Shared decoder for the load and store instruction formats.
+///
+/// Operands are appended in this order: rd (loads only), rs1, then either
+/// the sign-extended simm13 or rs2, then the ASI when instruction bit 23 is
+/// set, and finally rd (stores only).  \p DecodeRD decodes the rd field into
+/// whichever register class the instruction uses.  The first non-Success
+/// status from a register decoder is returned unchanged.
+static DecodeStatus DecodeMem(MCInst &MI, unsigned insn, uint64_t Address,
+                              const void *Decoder,
+                              bool isLoad, DecodeFunc DecodeRD) {
+  unsigned rd = fieldFromInstruction(insn, 25, 5);
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  bool isImm = fieldFromInstruction(insn, 13, 1);
+  bool hasAsi = fieldFromInstruction(insn, 23, 1); // (in op3 field)
+  unsigned asi = fieldFromInstruction(insn, 5, 8);
+  unsigned rs2 = 0;
+  // Keep the immediate signed.  Held in an unsigned, a negative offset such
+  // as -8 would become 4294967288 once widened to the 64-bit operand value.
+  int simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  DecodeStatus status;
+  if (isLoad) {
+    status = DecodeRD(MI, rd, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+
+  // Decode rs1.
+  status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode imm|rs2.
+  if (isImm)
+    MI.addOperand(MCOperand::createImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+
+  if (hasAsi)
+    MI.addOperand(MCOperand::createImm(asi));
+
+  if (!isLoad) {
+    status = DecodeRD(MI, rd, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
+
+/// Decode a load whose destination is an IntRegs register.
+static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  const bool IsLoad = true;
+  DecodeFunc DecodeRD = DecodeIntRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a load whose destination is an integer register pair.
+static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn,
+                                      uint64_t Address, const void *Decoder) {
+  const bool IsLoad = true;
+  DecodeFunc DecodeRD = DecodeIntPairRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a load whose destination is a single-precision FP register.
+static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                 const void *Decoder) {
+  const bool IsLoad = true;
+  DecodeFunc DecodeRD = DecodeFPRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a load whose destination is a double-precision FP register.
+static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  const bool IsLoad = true;
+  DecodeFunc DecodeRD = DecodeDFPRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a load whose destination is a quad-precision FP register.
+static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  const bool IsLoad = true;
+  DecodeFunc DecodeRD = DecodeQFPRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a load whose destination is a coprocessor register.
+static DecodeStatus DecodeLoadCP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                 const void *Decoder) {
+  const bool IsLoad = true;
+  DecodeFunc DecodeRD = DecodeCPRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a load whose destination is a coprocessor register pair.
+static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn,
+                                     uint64_t Address, const void *Decoder) {
+  const bool IsLoad = true;
+  DecodeFunc DecodeRD = DecodeCPPairRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a store whose source is an IntRegs register.
+static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  const bool IsLoad = false;
+  DecodeFunc DecodeRD = DecodeIntRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a store whose source is an integer register pair.
+static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn,
+                                       uint64_t Address, const void *Decoder) {
+  const bool IsLoad = false;
+  DecodeFunc DecodeRD = DecodeIntPairRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a store whose source is a single-precision FP register.
+static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  const bool IsLoad = false;
+  DecodeFunc DecodeRD = DecodeFPRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a store whose source is a double-precision FP register.
+static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  const bool IsLoad = false;
+  DecodeFunc DecodeRD = DecodeDFPRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a store whose source is a quad-precision FP register.
+static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  const bool IsLoad = false;
+  DecodeFunc DecodeRD = DecodeQFPRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a store whose source is a coprocessor register.
+static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn,
+                                  uint64_t Address, const void *Decoder) {
+  const bool IsLoad = false;
+  DecodeFunc DecodeRD = DecodeCPRegsRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Decode a store whose source is a coprocessor register pair.
+static DecodeStatus DecodeStoreCPPair(MCInst &Inst, unsigned insn,
+                                      uint64_t Address, const void *Decoder) {
+  const bool IsLoad = false;
+  DecodeFunc DecodeRD = DecodeCPPairRegisterClass;
+  return DecodeMem(Inst, insn, Address, Decoder, IsLoad, DecodeRD);
+}
+
+/// Forward to MCDisassembler::tryAddingSymbolicOperand on the disassembler
+/// passed as the opaque \p Decoder pointer, and return its result.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const void *Decoder) {
+  const MCDisassembler *Disassembler =
+      static_cast<const MCDisassembler *>(Decoder);
+  const bool Added = Disassembler->tryAddingSymbolicOperand(
+      MI, Value, Address, isBranch, Offset, Width);
+  return Added;
+}
+
+/// Decode the 30-bit displacement of a CALL instruction.
+///
+/// The field is a signed word displacement, so it is sign-extended and
+/// scaled by 4 to get a byte offset.  The offset plus \p Address is offered
+/// to the symbolizer; if no symbolic operand is added, the byte offset is
+/// appended as an immediate.  Always returns Success.
+static DecodeStatus DecodeCall(MCInst &MI, unsigned insn,
+                               uint64_t Address, const void *Decoder) {
+  // Sign-extend before scaling.  Treating the field as unsigned made a
+  // backward call look like a jump of almost 4 GiB forward when added to a
+  // 64-bit address.
+  int64_t disp = SignExtend32<30>(fieldFromInstruction(insn, 0, 30));
+  disp *= 4;
+  if (!tryAddingSymbolicOperand(Address + disp, false, Address,
+                                0, 30, MI, Decoder))
+    MI.addOperand(MCOperand::createImm(disp));
+  return MCDisassembler::Success;
+}
+
+/// Append the sign-extended low 13 bits of \p insn to \p MI as an immediate.
+/// Always returns Success.
+static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn,
+                                 uint64_t Address, const void *Decoder) {
+  // Keep the value signed.  Held in an unsigned, -1 would become 4294967295
+  // once widened to the 64-bit operand value.
+  int64_t tgt = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  MI.addOperand(MCOperand::createImm(tgt));
+  return MCDisassembler::Success;
+}
+
+/// Decode a JMPL instruction: operands are appended as rd, rs1, then either
+/// the sign-extended simm13 (instruction bit 13 set) or rs2.  The first
+/// non-Success status from a register decoder is returned unchanged.
+static DecodeStatus DecodeJMPL(MCInst &MI, unsigned insn, uint64_t Address,
+                               const void *Decoder) {
+
+  unsigned rd = fieldFromInstruction(insn, 25, 5);
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned rs2 = 0;
+  // Keep the immediate signed.  Held in an unsigned, a negative offset would
+  // become a large positive value once widened to the 64-bit operand value.
+  int simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RD.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rd, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1.
+  status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS2 | SIMM13.
+  if (isImm)
+    MI.addOperand(MCOperand::createImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
+
+/// Decode a RETURN instruction: operands are appended as rs1, then either
+/// the sign-extended simm13 (instruction bit 13 set) or rs2.  The first
+/// non-Success status from a register decoder is returned unchanged.
+static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address,
+                                 const void *Decoder) {
+
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned rs2 = 0;
+  // Keep the immediate signed.  Held in an unsigned, a negative offset would
+  // become a large positive value once widened to the 64-bit operand value.
+  int simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RS1.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS2 | SIMM13.
+  if (isImm)
+    MI.addOperand(MCOperand::createImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
+
+/// Decode a SWAP instruction: operands are appended as rd, rs1, then either
+/// the sign-extended simm13 (instruction bit 13 set) or rs2, and finally the
+/// ASI when instruction bit 23 is set.  The first non-Success status from a
+/// register decoder is returned unchanged.
+static DecodeStatus DecodeSWAP(MCInst &MI, unsigned insn, uint64_t Address,
+                               const void *Decoder) {
+
+  unsigned rd = fieldFromInstruction(insn, 25, 5);
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  bool hasAsi = fieldFromInstruction(insn, 23, 1); // (in op3 field)
+  unsigned asi = fieldFromInstruction(insn, 5, 8);
+  unsigned rs2 = 0;
+  // Keep the immediate signed.  Held in an unsigned, a negative offset would
+  // become a large positive value once widened to the 64-bit operand value.
+  int simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RD.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rd, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1.
+  status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS2 | SIMM13.
+  if (isImm)
+    MI.addOperand(MCOperand::createImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+
+  if (hasAsi)
+    MI.addOperand(MCOperand::createImm(asi));
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a trap instruction: operands are appended as rs1, then either the
+/// 7-bit immediate (instruction bit 13 set) or rs2, and finally the 4-bit
+/// condition code taken from bits 25-28.  The first non-Success status from
+/// a register decoder is returned unchanged.
+static DecodeStatus DecodeTRAP(MCInst &MI, unsigned insn, uint64_t Address,
+                               const void *Decoder) {
+  const unsigned Rs1 = fieldFromInstruction(insn, 14, 5);
+  const bool UsesImm = fieldFromInstruction(insn, 13, 1) != 0;
+  const unsigned CondCode = fieldFromInstruction(insn, 25, 4);
+
+  // Decode RS1.
+  DecodeStatus St = DecodeIntRegsRegisterClass(MI, Rs1, Address, Decoder);
+  if (St != MCDisassembler::Success)
+    return St;
+
+  // Decode RS2 | IMM7.
+  if (UsesImm) {
+    const unsigned Imm7 = fieldFromInstruction(insn, 0, 7);
+    MI.addOperand(MCOperand::createImm(Imm7));
+  } else {
+    const unsigned Rs2 = fieldFromInstruction(insn, 0, 5);
+    St = DecodeIntRegsRegisterClass(MI, Rs2, Address, Decoder);
+    if (St != MCDisassembler::Success)
+      return St;
+  }
+
+  // Decode CC
+  MI.addOperand(MCOperand::createImm(CondCode));
+
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
new file mode 100644
index 000000000000..4981deae6af6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
@@ -0,0 +1,197 @@
+//===-- SparcInstPrinter.cpp - Convert Sparc MCInst to assembly syntax -----==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Sparc MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstPrinter.h"
+#include "Sparc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+ // The generated asm writer (SparcGenAsmWriter.inc) uses "Sparc" as the target
+ // namespace, but the SPARC backend uses "SP" as its namespace.
+ namespace llvm {
+ namespace Sparc { // lets code that spells names as Sparc::X find the SP::X declarations
+ using namespace SP;
+ }
+ }
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "SparcGenAsmWriter.inc"
+
+ bool SparcInstPrinter::isV9(const MCSubtargetInfo &STI) const { // True when STI's feature bits include Sparc::FeatureV9.
+ return (STI.getFeatureBits()[Sparc::FeatureV9]) != 0;
+ }
+
+ void SparcInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const // Writes '%' followed by the lower-cased name of register RegNo to OS.
+ {
+ OS << '%' << StringRef(getRegisterName(RegNo)).lower();
+ }
+
+ void SparcInstPrinter::printInst(const MCInst *MI, raw_ostream &O, // Prints MI to O, then the annotation text Annot.
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ if (!printAliasInstr(MI, STI, O) && !printSparcAliasInstr(MI, STI, O)) // try the generated aliases first, then the hand-written ones
+ printInstruction(MI, STI, O); // neither alias printer handled MI
+ printAnnotation(O, Annot);
+ }
+
+ bool SparcInstPrinter::printSparcAliasInstr(const MCInst *MI, // Hand-written alias printing; returns true only if MI was printed here.
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case SP::JMPLrr:
+ case SP::JMPLri: {
+ if (MI->getNumOperands() != 3)
+ return false;
+ if (!MI->getOperand(0).isReg())
+ return false;
+ switch (MI->getOperand(0).getReg()) { // the alias is chosen by the register in operand 0
+ default: return false;
+ case SP::G0: // jmp $addr | ret | retl
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 8) { // immediate 8 with %i7 / %o7 in operand 1 prints as ret / retl
+ switch(MI->getOperand(1).getReg()) {
+ default: break;
+ case SP::I7: O << "\tret"; return true;
+ case SP::O7: O << "\tretl"; return true;
+ }
+ }
+ O << "\tjmp "; printMemOperand(MI, 1, STI, O);
+ return true;
+ case SP::O7: // call $addr
+ O << "\tcall "; printMemOperand(MI, 1, STI, O);
+ return true;
+ }
+ }
+ case SP::V9FCMPS: case SP::V9FCMPD: case SP::V9FCMPQ:
+ case SP::V9FCMPES: case SP::V9FCMPED: case SP::V9FCMPEQ: {
+ if (isV9(STI) // on V9 no alias is printed here
+ || (MI->getNumOperands() != 3)
+ || (!MI->getOperand(0).isReg())
+ || (MI->getOperand(0).getReg() != SP::FCC0))
+ return false;
+ // if V8, skip printing %fcc0.
+ switch(MI->getOpcode()) {
+ default: // shares the fcmps mnemonic below
+ case SP::V9FCMPS: O << "\tfcmps "; break;
+ case SP::V9FCMPD: O << "\tfcmpd "; break;
+ case SP::V9FCMPQ: O << "\tfcmpq "; break;
+ case SP::V9FCMPES: O << "\tfcmpes "; break;
+ case SP::V9FCMPED: O << "\tfcmped "; break;
+ case SP::V9FCMPEQ: O << "\tfcmpeq "; break;
+ }
+ printOperand(MI, 1, STI, O); // operand 0 (%fcc0) is deliberately not printed
+ O << ", ";
+ printOperand(MI, 2, STI, O);
+ return true;
+ }
+ }
+ }
+
+ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum, // Prints operand opNum of MI as a register name, an integer, or an expression.
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand (opNum);
+
+ if (MO.isReg()) {
+ printRegName(O, MO.getReg());
+ return ;
+ }
+
+ if (MO.isImm()) {
+ switch (MI->getOpcode()) {
+ default:
+ O << (int)MO.getImm(); // immediates are printed as signed int
+ return;
+
+ case SP::TICCri: // Fall through
+ case SP::TICCrr: // Fall through
+ case SP::TRAPri: // Fall through
+ case SP::TRAPrr: // Fall through
+ case SP::TXCCri: // Fall through
+ case SP::TXCCrr: // Fall through
+ // Only seven-bit values up to 127.
+ O << ((int) MO.getImm() & 0x7f); // mask keeps the low seven bits
+ return;
+ }
+ }
+
+ assert(MO.isExpr() && "Unknown operand kind in printOperand");
+ MO.getExpr()->print(O, &MAI);
+ }
+
+ void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum, // Prints the operand pair (opNum, opNum+1) as "a+b", or "a, b" when Modifier is "arith".
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, const char *Modifier) {
+ printOperand(MI, opNum, STI, O); // the first operand of the pair is always printed
+
+ // If this is an ADD operand, emit it like normal operands.
+ if (Modifier && !strcmp(Modifier, "arith")) {
+ O << ", ";
+ printOperand(MI, opNum+1, STI, O);
+ return;
+ }
+ const MCOperand &MO = MI->getOperand(opNum+1);
+
+ if (MO.isReg() && MO.getReg() == SP::G0)
+ return; // don't print "+%g0"
+ if (MO.isImm() && MO.getImm() == 0)
+ return; // don't print "+0"
+
+ O << "+";
+
+ printOperand(MI, opNum+1, STI, O);
+ }
+
+ void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum, // Prints the condition-code operand opNum of MI as its mnemonic string.
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ int CC = (int)MI->getOperand(opNum).getImm();
+ switch (MI->getOpcode()) {
+ default: break; // all other opcodes use CC unchanged
+ case SP::FBCOND:
+ case SP::FBCONDA:
+ case SP::BPFCC:
+ case SP::BPFCCA:
+ case SP::BPFCCNT:
+ case SP::BPFCCANT:
+ case SP::MOVFCCrr: case SP::V9MOVFCCrr:
+ case SP::MOVFCCri: case SP::V9MOVFCCri:
+ case SP::FMOVS_FCC: case SP::V9FMOVS_FCC:
+ case SP::FMOVD_FCC: case SP::V9FMOVD_FCC:
+ case SP::FMOVQ_FCC: case SP::V9FMOVQ_FCC:
+ // Make sure CC is a fp conditional flag.
+ CC = (CC < 16) ? (CC + 16) : CC; // values below 16 are shifted up by 16
+ break;
+ case SP::CBCOND:
+ case SP::CBCONDA:
+ // Make sure CC is a cp conditional flag.
+ CC = (CC < 32) ? (CC + 32) : CC; // values below 32 are shifted up by 32
+ break;
+ }
+ O << SPARCCondCodeToString((SPCC::CondCodes)CC);
+ }
+
+ bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum, // Not implemented: reaching this function hits llvm_unreachable.
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX.");
+ return true; // follows the llvm_unreachable above
+ }
diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
new file mode 100644
index 000000000000..6f06d1ddae32
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
@@ -0,0 +1,55 @@
+//===-- SparcInstPrinter.h - Convert Sparc MCInst to assembly syntax ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Sparc MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H
+#define LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+ class SparcInstPrinter : public MCInstPrinter { // Prints Sparc MCInsts as assembly text.
+ public:
+ SparcInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {} // forwards all three arguments to the base class
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ bool printSparcAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, // hand-written alias printer
+ raw_ostream &OS);
+ bool isV9(const MCSubtargetInfo &STI) const;
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, // operand printers
+ raw_ostream &OS);
+ void printMemOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
+ raw_ostream &OS, const char *Modifier = nullptr);
+ void printCCOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
+ raw_ostream &OS);
+ bool printGetPCX(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &OS);
+ };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/LeonFeatures.td b/contrib/llvm/lib/Target/Sparc/LeonFeatures.td
new file mode 100755
index 000000000000..d06e734b5a7b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/LeonFeatures.td
@@ -0,0 +1,82 @@
+//===-- LeonFeatures.td - Describe the Leon Features -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// UMAC and SMAC support for LEON3 and LEON4 processors.
+//===----------------------------------------------------------------------===//
+
+ // Support for the UMAC and SMAC instructions on LEON3 and LEON4 processors
+ def UMACSMACSupport : SubtargetFeature<
+ "hasumacsmac",
+ "HasUmacSmac",
+ "true",
+ "Enable UMAC and SMAC for LEON3 and LEON4 processors"
+ >;
+
+
+//===----------------------------------------------------------------------===//
+// CASA Support differs between LEON3-FT GR712RC and LEON3-FT UT699
+// We need to have the option to switch this on and off.
+//===----------------------------------------------------------------------===//
+
+ // Support for the CASA instruction on LEON3 and LEON4 processors
+ def LeonCASA : SubtargetFeature<
+ "hasleoncasa",
+ "HasLeonCasa",
+ "true",
+ "Enable CASA instruction for LEON3 and LEON4 processors"
+ >;
+
+
+ def ReplaceSDIV : SubtargetFeature< // AT697E erratum switch; arguments presumably are feature name, subtarget attribute, value, description - confirm in Target.td
+ "replacesdiv",
+ "PerformSDIVReplace",
+ "true",
+ "AT697E erratum fix: Do not emit SDIV, emit SDIVCC instead"
+ >;
+
+ def InsertNOPLoad: SubtargetFeature< // LEON3 erratum switch; arguments presumably are feature name, subtarget attribute, value, description - confirm in Target.td
+ "insertnopload",
+ "InsertNOPLoad",
+ "true",
+ "LEON3 erratum fix: Insert a NOP instruction after every single-cycle load instruction when the next instruction is another load/store instruction"
+ >;
+
+ def FixFSMULD : SubtargetFeature< // LEON erratum switch; arguments presumably are feature name, subtarget attribute, value, description - confirm in Target.td
+ "fixfsmuld",
+ "FixFSMULD",
+ "true",
+ "LEON erratum fix: Do not use FSMULD"
+ >;
+
+ def ReplaceFMULS : SubtargetFeature< // LEON erratum switch; arguments presumably are feature name, subtarget attribute, value, description - confirm in Target.td
+ "replacefmuls",
+ "ReplaceFMULS",
+ "true",
+ "LEON erratum fix: Replace FMULS instruction with FMULD and relevant conversion instructions"
+ >;
+
+ def DetectRoundChange : SubtargetFeature< // LEON3 erratum detection switch; the description below is one string split over two adjacent literals
+ "detectroundchange",
+ "DetectRoundChange",
+ "true",
+ "LEON3 erratum detection: Detects any rounding mode change "
+ "request: use only the round-to-nearest rounding mode"
+ >;
+
+ def FixAllFDIVSQRT : SubtargetFeature< // LEON erratum switch; arguments presumably are feature name, subtarget attribute, value, description - confirm in Target.td
+ "fixallfdivsqrt",
+ "FixAllFDIVSQRT",
+ "true",
+ "LEON erratum fix: Fix FDIVS/FDIVD/FSQRTS/FSQRTD instructions with NOPs and floating-point store"
+ >;
diff --git a/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp b/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp
new file mode 100755
index 000000000000..0acc2875daa8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp
@@ -0,0 +1,374 @@
+//===------ LeonPasses.cpp - Define passes specific to LEON ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "LeonPasses.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+ LEONMachineFunctionPass::LEONMachineFunctionPass(TargetMachine &tm, char &ID) // tm is accepted but not used; only ID is forwarded
+ : MachineFunctionPass(ID) {}
+
+ LEONMachineFunctionPass::LEONMachineFunctionPass(char &ID) // forwards ID to the MachineFunctionPass base
+ : MachineFunctionPass(ID) {}
+
+ int LEONMachineFunctionPass::GetRegIndexForOperand(MachineInstr &MI, // Returns the register of operand OperandIndex, or a fresh negative value if there is none.
+ int OperandIndex) {
+ if (MI.getNumOperands() > 0) {
+ if (OperandIndex == LAST_OPERAND) { // LAST_OPERAND selects the final operand of MI
+ OperandIndex = MI.getNumOperands() - 1;
+ }
+
+ if (MI.getNumOperands() > (unsigned)OperandIndex && // in range and a register operand
+ MI.getOperand(OperandIndex).isReg()) {
+ return (int)MI.getOperand(OperandIndex).getReg();
+ }
+ }
+
+ static int NotFoundIndex = -10; // function-local static: persists across calls
+ // Return a different number each time to avoid any comparisons between the
+ // values returned.
+ NotFoundIndex -= 10; // so the first failure returns -20, the next -30, ...
+ return NotFoundIndex;
+ }
+
+ // Finds a free FP register in the range SP::F0..SP::F31, or returns -1.
+ // Registers listed in the UsedRegisters vector are skipped as well.
+ int LEONMachineFunctionPass::getUnusedFPRegister(MachineRegisterInfo &MRI) {
+ for (int RegisterIndex = SP::F0; RegisterIndex <= SP::F31; ++RegisterIndex) {
+ if (!MRI.isPhysRegUsed(RegisterIndex) &&
+ !is_contained(UsedRegisters, RegisterIndex)) {
+ return RegisterIndex; // first candidate that passes both checks
+ }
+ }
+
+ return -1; // no candidate found
+ }
+
+//*****************************************************************************
+//**** InsertNOPLoad pass
+//*****************************************************************************
+ // This pass fixes the incorrectly working Load instructions that exist for
+ // some earlier versions of the LEON processor line. NOP instructions must
+ // be inserted after the load instruction to ensure that the Load instruction
+ // behaves as expected for these processors.
+//
+// This pass inserts a NOP after any LD or LDF instruction.
+//
+ char InsertNOPLoad::ID = 0; // definition of the static ID member; passed by reference to the base constructor below
+
+ InsertNOPLoad::InsertNOPLoad(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+ bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) { // Inserts a NOP after each instruction whose opcode lies in [SP::LDDArr, SP::LDrr]; returns true if any was added.
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc(); // default-constructed debug location for the inserted NOPs
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { // this E shadows the outer loop's E
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode >= SP::LDDArr && Opcode <= SP::LDrr) { // NOTE(review): relies on the load opcodes being contiguous in the SP opcode enum - confirm
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP)); // inserted before NMBBI, i.e. immediately after MI
+ Modified = true;
+ }
+ }
+ }
+
+ return Modified;
+ }
+
+//*****************************************************************************
+//**** FixFSMULD pass
+//*****************************************************************************
+// This pass fixes the incorrectly working FSMULD instruction that exists for
+// some earlier versions of the LEON processor line.
+//
+// The pass should convert the FSMULD operands to double precision in scratch
+// registers, then calculate the result with the FMULD instruction. Therefore,
+// the pass should replace operations of the form:
+// fsmuld %f20,%f21,%f8
+// with the sequence:
+// fstod %f20,%f0
+// fstod %f21,%f2
+// fmuld %f0,%f2,%f8
+//
+ char FixFSMULD::ID = 0; // definition of the static ID member; passed by reference to the base constructor below
+
+ FixFSMULD::FixFSMULD(TargetMachine &tm) : LEONMachineFunctionPass(tm, ID) {}
+
+ bool FixFSMULD::runOnMachineFunction(MachineFunction &MF) { // Replaces each 3-operand FSMULD with FSTOD, FSTOD, FMULD; returns true if anything changed.
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc(); // default-constructed debug location for the inserted instructions
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { // this E shadows the outer loop's E
+
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+
+ const int UNASSIGNED_INDEX = -1; // same value getUnusedFPRegister returns on failure
+ int Reg1Index = UNASSIGNED_INDEX;
+ int Reg2Index = UNASSIGNED_INDEX;
+ int Reg3Index = UNASSIGNED_INDEX;
+
+ if (Opcode == SP::FSMULD && MI.getNumOperands() == 3) {
+ // take the registers from fsmuld %f20,%f21,%f8. NOTE(review): operands are read in index order 0,1,2 - confirm operand 0 is a source and not the destination
+ Reg1Index = MI.getOperand(0).getReg();
+ Reg2Index = MI.getOperand(1).getReg();
+ Reg3Index = MI.getOperand(2).getReg();
+ }
+
+ if (Reg1Index != UNASSIGNED_INDEX && Reg2Index != UNASSIGNED_INDEX &&
+ Reg3Index != UNASSIGNED_INDEX) {
+ clearUsedRegisterList();
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI); // position just after MI; may be MBB.end()
+ // Whatever Reg3Index is hasn't been used yet, so we need to reserve it.
+ markRegisterUsed(Reg3Index);
+ const int ScratchReg1Index = getUnusedFPRegister(MF.getRegInfo());
+ markRegisterUsed(ScratchReg1Index); // keeps the second lookup from returning the same register
+ const int ScratchReg2Index = getUnusedFPRegister(MF.getRegInfo());
+ markRegisterUsed(ScratchReg2Index);
+
+ if (ScratchReg1Index == UNASSIGNED_INDEX ||
+ ScratchReg2Index == UNASSIGNED_INDEX) {
+ errs() << "Cannot allocate free scratch registers for the FixFSMULD "
+ "pass."
+ << "\n"; // the FSMULD is left in place in this case
+ } else {
+ // create fstod %f20,%f0
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+ .addReg(ScratchReg1Index)
+ .addReg(Reg1Index);
+
+ // create fstod %f21,%f2
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+ .addReg(ScratchReg2Index)
+ .addReg(Reg2Index);
+
+ // create fmuld %f0,%f2,%f8
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FMULD))
+ .addReg(Reg3Index)
+ .addReg(ScratchReg1Index)
+ .addReg(ScratchReg2Index);
+
+ MI.eraseFromParent(); // MBBI pointed at MI and is reassigned on the next line
+ MBBI = std::prev(NMBBI); // resume at the new FMULD so the loop's ++MBBI lands on NMBBI; never skips an instruction or steps past end()
+
+ Modified = true;
+ }
+ }
+ }
+ }
+
+ return Modified;
+ }
+
+//*****************************************************************************
+//**** ReplaceFMULS pass
+//*****************************************************************************
+// This pass fixes the incorrectly working FMULS instruction that exists for
+// some earlier versions of the LEON processor line.
+//
+// This pass converts the FMULS operands to double precision in scratch
+// registers, then calculates the result with the FMULD instruction.
+// The pass should replace operations of the form:
+// fmuls %f20,%f21,%f8
+// with the sequence:
+// fstod %f20,%f0
+// fstod %f21,%f2
+// fmuld %f0,%f2,%f8
+//
+ char ReplaceFMULS::ID = 0; // definition of the static ID member; passed by reference to the base constructor below
+
+ ReplaceFMULS::ReplaceFMULS(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+ bool ReplaceFMULS::runOnMachineFunction(MachineFunction &MF) { // Replaces each 3-operand FMULS with FSTOD, FSTOD, FMULD; returns true if anything changed.
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc(); // default-constructed debug location for the inserted instructions
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { // this E shadows the outer loop's E
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+
+ const int UNASSIGNED_INDEX = -1; // same value getUnusedFPRegister returns on failure
+ int Reg1Index = UNASSIGNED_INDEX;
+ int Reg2Index = UNASSIGNED_INDEX;
+ int Reg3Index = UNASSIGNED_INDEX;
+
+ if (Opcode == SP::FMULS && MI.getNumOperands() == 3) {
+ // take the registers from fmuls %f20,%f21,%f8. NOTE(review): operands are read in index order 0,1,2 - confirm operand 0 is a source and not the destination
+ Reg1Index = MI.getOperand(0).getReg();
+ Reg2Index = MI.getOperand(1).getReg();
+ Reg3Index = MI.getOperand(2).getReg();
+ }
+
+ if (Reg1Index != UNASSIGNED_INDEX && Reg2Index != UNASSIGNED_INDEX &&
+ Reg3Index != UNASSIGNED_INDEX) {
+ clearUsedRegisterList();
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI); // position just after MI; may be MBB.end()
+ // Whatever Reg3Index is hasn't been used yet, so we need to reserve it.
+ markRegisterUsed(Reg3Index);
+ const int ScratchReg1Index = getUnusedFPRegister(MF.getRegInfo());
+ markRegisterUsed(ScratchReg1Index); // keeps the second lookup from returning the same register
+ const int ScratchReg2Index = getUnusedFPRegister(MF.getRegInfo());
+ markRegisterUsed(ScratchReg2Index);
+
+ if (ScratchReg1Index == UNASSIGNED_INDEX ||
+ ScratchReg2Index == UNASSIGNED_INDEX) {
+ errs() << "Cannot allocate free scratch registers for the "
+ "ReplaceFMULS pass."
+ << "\n"; // the FMULS is left in place in this case
+ } else {
+ // create fstod %f20,%f0
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+ .addReg(ScratchReg1Index)
+ .addReg(Reg1Index);
+
+ // create fstod %f21,%f2
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+ .addReg(ScratchReg2Index)
+ .addReg(Reg2Index);
+
+ // create fmuld %f0,%f2,%f8
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FMULD))
+ .addReg(Reg3Index)
+ .addReg(ScratchReg1Index)
+ .addReg(ScratchReg2Index);
+
+ MI.eraseFromParent(); // MBBI pointed at MI and is reassigned on the next line
+ MBBI = std::prev(NMBBI); // resume at the new FMULD so the loop's ++MBBI lands on NMBBI; never skips an instruction or steps past end()
+
+ Modified = true;
+ }
+ }
+ }
+ }
+
+ return Modified;
+ }
+
+
+//*****************************************************************************
+//**** DetectRoundChange pass
+//*****************************************************************************
+// To prevent any explicit change of the default rounding mode, this pass
+// detects any call of the fesetround function.
+// A warning is generated to ensure the user knows this has happened.
+//
+// Detects an erratum in UT699 LEON 3 processor
+
+ char DetectRoundChange::ID = 0; // definition of the static ID member; passed by reference to the base constructor below
+
+ DetectRoundChange::DetectRoundChange(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+ bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) { // Reports every SP::CALL whose global target is named "fesetround"; never modifies MF.
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+
+ bool Modified = false; // never set to true below, so this pass always returns false
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { // this E shadows the outer loop's E
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == SP::CALL && MI.getNumOperands() > 0) {
+ MachineOperand &MO = MI.getOperand(0); // operand 0 is inspected as the call target
+
+ if (MO.isGlobal()) {
+ StringRef FuncName = MO.getGlobal()->getName();
+ if (FuncName.compare_lower("fesetround") == 0) { // case-insensitive name match
+ errs() << "Error: You are using the detectroundchange "
+ "option to detect rounding changes that will "
+ "cause LEON errata. The only way to fix this "
+ "is to remove the call to fesetround from "
+ "the source code.\n"; // written to errs(); processing continues
+ }
+ }
+ }
+ }
+ }
+
+ return Modified;
+ }
+
+//*****************************************************************************
+//**** FixAllFDIVSQRT pass
+//*****************************************************************************
+// This pass fixes the incorrectly working FDIVx and FSQRTx instructions that
+// exist for some earlier versions of the LEON processor line. Five NOP
+// instructions need to be inserted after these instructions to ensure the
+// correct result is placed in the destination registers before they are used.
+//
+// This pass implements two fixes:
+// 1) fixing the FSQRTS and FSQRTD instructions.
+// 2) fixing the FDIVS and FDIVD instructions.
+//
+ // FSQRTS and FDIVS are converted to FSQRTD and FDIVD respectively earlier in
+ // the pipeline when this option is enabled, so this pass needs only to deal
+ // with the changes that still need implementing for the "double" versions
+ // of these instructions.
+//
+ char FixAllFDIVSQRT::ID = 0; // definition of the static ID member; passed by reference to the base constructor below
+
+ FixAllFDIVSQRT::FixAllFDIVSQRT(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+ bool FixAllFDIVSQRT::runOnMachineFunction(MachineFunction &MF) { // Surrounds every FSQRTD / FDIVD with NOPs (5 before, 28 after); returns true if any were added.
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc(); // default-constructed debug location for the inserted NOPs
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { // this E shadows the outer loop's E
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+
+ // Note: FDIVS and FSQRTS cannot be generated when this erratum fix is
+ // switched on so we don't need to check for them here. They will
+ // already have been converted to FSQRTD or FDIVD earlier in the
+ // pipeline.
+ if (Opcode == SP::FSQRTD || Opcode == SP::FDIVD) {
+ for (int InsertedCount = 0; InsertedCount < 5; InsertedCount++)
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP)); // inserted before MI
+
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ for (int InsertedCount = 0; InsertedCount < 28; InsertedCount++)
+ BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP)); // inserted after MI. NOTE(review): counts 5 / 28 are not explained here - confirm against the erratum
+
+ Modified = true;
+ }
+ }
+ }
+
+ return Modified;
+ }
diff --git a/contrib/llvm/lib/Target/Sparc/LeonPasses.h b/contrib/llvm/lib/Target/Sparc/LeonPasses.h
new file mode 100755
index 000000000000..2158cb636bfc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/LeonPasses.h
@@ -0,0 +1,115 @@
+//===------- LeonPasses.h - Define passes specific to LEON ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
+#define LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+
+namespace llvm {
+ class LLVM_LIBRARY_VISIBILITY LEONMachineFunctionPass // Shared base for the LEON passes: subtarget pointer plus used-register bookkeeping.
+ : public MachineFunctionPass {
+ protected:
+ const SparcSubtarget *Subtarget; // not set by the constructors declared below
+ const int LAST_OPERAND = -1; // sentinel value for GetRegIndexForOperand's OperandIndex
+
+ // this vector holds the register indices recorded through markRegisterUsed();
+ // it is emptied by clearUsedRegisterList()
+ std::vector<int> UsedRegisters;
+
+ protected:
+ LEONMachineFunctionPass(TargetMachine &tm, char &ID);
+ LEONMachineFunctionPass(char &ID);
+
+ int GetRegIndexForOperand(MachineInstr &MI, int OperandIndex);
+ void clearUsedRegisterList() { UsedRegisters.clear(); }
+
+ void markRegisterUsed(int registerIndex) { // appends without checking for duplicates
+ UsedRegisters.push_back(registerIndex);
+ }
+ int getUnusedFPRegister(MachineRegisterInfo &MRI);
+ };
+
+ class LLVM_LIBRARY_VISIBILITY InsertNOPLoad : public LEONMachineFunctionPass { // Pass declaration for erratum fix LBR35 (see getPassName).
+ public:
+ static char ID;
+
+ InsertNOPLoad(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { // returns a fixed descriptive string
+ return "InsertNOPLoad: Erratum Fix LBR35: insert a NOP instruction after "
+ "every single-cycle load instruction when the next instruction is "
+ "another load/store instruction";
+ }
+ };
+
+ class LLVM_LIBRARY_VISIBILITY FixFSMULD : public LEONMachineFunctionPass { // Pass declaration for erratum fix LBR31 (see getPassName).
+ public:
+ static char ID;
+
+ FixFSMULD(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { // returns a fixed descriptive string
+ return "FixFSMULD: Erratum Fix LBR31: do not select FSMULD";
+ }
+ };
+
+ class LLVM_LIBRARY_VISIBILITY ReplaceFMULS : public LEONMachineFunctionPass { // Pass declaration for erratum fix LBR32 (see getPassName).
+ public:
+ static char ID;
+
+ ReplaceFMULS(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { // returns a fixed descriptive string
+ return "ReplaceFMULS: Erratum Fix LBR32: replace FMULS instruction with a "
+ "routine using conversions/double precision operations to replace "
+ "FMULS";
+ }
+ };
+
+ class LLVM_LIBRARY_VISIBILITY DetectRoundChange // Pass declaration for the LEON rounding-mode-change detection (see getPassName).
+ : public LEONMachineFunctionPass {
+ public:
+ static char ID;
+
+ DetectRoundChange(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { // returns a fixed descriptive string
+ return "DetectRoundChange: Leon erratum detection: detect any rounding "
+ "mode change request: use only the round-to-nearest rounding mode";
+ }
+ };
+
+ class LLVM_LIBRARY_VISIBILITY FixAllFDIVSQRT : public LEONMachineFunctionPass { // Pass declaration for erratum fix LBR34 (see getPassName).
+ public:
+ static char ID;
+
+ FixAllFDIVSQRT(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { // returns a fixed descriptive string
+ return "FixAllFDIVSQRT: Erratum Fix LBR34: fix FDIVS/FDIVD/FSQRTS/FSQRTD "
+ "instructions with NOPs and floating-point store";
+ }
+ };
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
new file mode 100644
index 000000000000..6106a6c32dc8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -0,0 +1,306 @@
+//===-- SparcAsmBackend.cpp - Sparc Assembler Backend ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { // Reduce Value to the bit-field that fixup Kind patches.
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ return Value; // data is passed through; note the `unsigned` return type narrows it
+
+ case Sparc::fixup_sparc_wplt30:
+ case Sparc::fixup_sparc_call30:
+ return (Value >> 2) & 0x3fffffff; // Value / 4, low 30 bits
+
+ case Sparc::fixup_sparc_br22:
+ return (Value >> 2) & 0x3fffff; // Value / 4, low 22 bits
+
+ case Sparc::fixup_sparc_br19:
+ return (Value >> 2) & 0x7ffff; // Value / 4, low 19 bits
+
+ case Sparc::fixup_sparc_br16_2:
+ return (Value >> 2) & 0xc000; // bits 15:14 of Value / 4, kept at their position (not shifted down)
+
+ case Sparc::fixup_sparc_br16_14:
+ return (Value >> 2) & 0x3fff; // low 14 bits of Value / 4
+
+ case Sparc::fixup_sparc_pc22:
+ case Sparc::fixup_sparc_got22:
+ case Sparc::fixup_sparc_tls_gd_hi22:
+ case Sparc::fixup_sparc_tls_ldm_hi22:
+ case Sparc::fixup_sparc_tls_ie_hi22:
+ case Sparc::fixup_sparc_hi22:
+ return (Value >> 10) & 0x3fffff; // bits 31:10 of Value
+
+ case Sparc::fixup_sparc_pc10:
+ case Sparc::fixup_sparc_got10:
+ case Sparc::fixup_sparc_tls_gd_lo10:
+ case Sparc::fixup_sparc_tls_ldm_lo10:
+ case Sparc::fixup_sparc_tls_ie_lo10:
+ case Sparc::fixup_sparc_lo10:
+ return Value & 0x3ff; // bits 9:0 of Value
+
+ case Sparc::fixup_sparc_tls_ldo_hix22:
+ case Sparc::fixup_sparc_tls_le_hix22:
+ return (~Value >> 10) & 0x3fffff; // bits 31:10 of the complemented Value
+
+ case Sparc::fixup_sparc_tls_ldo_lox10:
+ case Sparc::fixup_sparc_tls_le_lox10:
+ return (~(~Value & 0x3ff)) & 0x1fff; // equals (Value & 0x3ff) | 0x1c00: low 10 bits with bits 12:10 set
+
+ case Sparc::fixup_sparc_h44:
+ return (Value >> 22) & 0x3fffff; // bits 43:22 of Value
+
+ case Sparc::fixup_sparc_m44:
+ return (Value >> 12) & 0x3ff; // bits 21:12 of Value
+
+ case Sparc::fixup_sparc_l44:
+ return Value & 0xfff; // bits 11:0 of Value
+
+ case Sparc::fixup_sparc_hh:
+ return (Value >> 42) & 0x3fffff; // bits 63:42 of Value
+
+ case Sparc::fixup_sparc_hm:
+ return (Value >> 32) & 0x3ff; // bits 41:32 of Value
+
+ case Sparc::fixup_sparc_tls_gd_add:
+ case Sparc::fixup_sparc_tls_gd_call:
+ case Sparc::fixup_sparc_tls_ldm_add:
+ case Sparc::fixup_sparc_tls_ldm_call:
+ case Sparc::fixup_sparc_tls_ldo_add:
+ case Sparc::fixup_sparc_tls_ie_ld:
+ case Sparc::fixup_sparc_tls_ie_ldx:
+ case Sparc::fixup_sparc_tls_ie_add:
+ return 0; // these kinds contribute no bits to the encoding
+ }
+}
+
+namespace {
+ class SparcAsmBackend : public MCAsmBackend { // Shared Sparc backend: fixup descriptors, relaxation stubs, NOP padding.
+ protected:
+ const Target &TheTarget;
+ bool IsLittleEndian;
+ bool Is64Bit;
+
+ public:
+ SparcAsmBackend(const Target &T)
+ : MCAsmBackend(), TheTarget(T),
+ IsLittleEndian(StringRef(TheTarget.getName()) == "sparcel"), // decided purely by the target's name
+ Is64Bit(StringRef(TheTarget.getName()) == "sparcv9") {} // decided purely by the target's name
+
+ unsigned getNumFixupKinds() const override {
+ return Sparc::NumTargetFixupKinds;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { // Descriptor (name, bit offset, width, flags) for Kind.
+ const static MCFixupKindInfo InfosBE[Sparc::NumTargetFixupKinds] = { // big-endian table, indexed by Kind - FirstTargetFixupKind
+ // name offset bits flags
+ { "fixup_sparc_call30", 2, 30, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br22", 10, 22, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br19", 13, 19, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br16_2", 10, 2, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br16_14", 18, 14, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_hi22", 10, 22, 0 },
+ { "fixup_sparc_lo10", 22, 10, 0 },
+ { "fixup_sparc_h44", 10, 22, 0 },
+ { "fixup_sparc_m44", 22, 10, 0 },
+ { "fixup_sparc_l44", 20, 12, 0 },
+ { "fixup_sparc_hh", 10, 22, 0 },
+ { "fixup_sparc_hm", 22, 10, 0 },
+ { "fixup_sparc_pc22", 10, 22, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_pc10", 22, 10, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_got22", 10, 22, 0 },
+ { "fixup_sparc_got10", 22, 10, 0 },
+ { "fixup_sparc_wplt30", 2, 30, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_tls_gd_hi22", 10, 22, 0 },
+ { "fixup_sparc_tls_gd_lo10", 22, 10, 0 },
+ { "fixup_sparc_tls_gd_add", 0, 0, 0 },
+ { "fixup_sparc_tls_gd_call", 0, 0, 0 },
+ { "fixup_sparc_tls_ldm_hi22", 10, 22, 0 },
+ { "fixup_sparc_tls_ldm_lo10", 22, 10, 0 },
+ { "fixup_sparc_tls_ldm_add", 0, 0, 0 },
+ { "fixup_sparc_tls_ldm_call", 0, 0, 0 },
+ { "fixup_sparc_tls_ldo_hix22", 10, 22, 0 },
+ { "fixup_sparc_tls_ldo_lox10", 22, 10, 0 },
+ { "fixup_sparc_tls_ldo_add", 0, 0, 0 },
+ { "fixup_sparc_tls_ie_hi22", 10, 22, 0 },
+ { "fixup_sparc_tls_ie_lo10", 22, 10, 0 },
+ { "fixup_sparc_tls_ie_ld", 0, 0, 0 },
+ { "fixup_sparc_tls_ie_ldx", 0, 0, 0 },
+ { "fixup_sparc_tls_ie_add", 0, 0, 0 },
+ { "fixup_sparc_tls_le_hix22", 0, 0, 0 },
+ { "fixup_sparc_tls_le_lox10", 0, 0, 0 }
+ };
+
+ const static MCFixupKindInfo InfosLE[Sparc::NumTargetFixupKinds] = { // little-endian table, same order as InfosBE
+ // name offset bits flags
+ { "fixup_sparc_call30", 0, 30, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br22", 0, 22, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br19", 0, 19, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br16_2", 20, 2, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br16_14", 0, 14, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_hi22", 0, 22, 0 },
+ { "fixup_sparc_lo10", 0, 10, 0 },
+ { "fixup_sparc_h44", 0, 22, 0 },
+ { "fixup_sparc_m44", 0, 10, 0 },
+ { "fixup_sparc_l44", 0, 12, 0 },
+ { "fixup_sparc_hh", 0, 22, 0 },
+ { "fixup_sparc_hm", 0, 10, 0 },
+ { "fixup_sparc_pc22", 0, 22, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_pc10", 0, 10, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_got22", 0, 22, 0 },
+ { "fixup_sparc_got10", 0, 10, 0 },
+ { "fixup_sparc_wplt30", 0, 30, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_tls_gd_hi22", 0, 22, 0 },
+ { "fixup_sparc_tls_gd_lo10", 0, 10, 0 },
+ { "fixup_sparc_tls_gd_add", 0, 0, 0 },
+ { "fixup_sparc_tls_gd_call", 0, 0, 0 },
+ { "fixup_sparc_tls_ldm_hi22", 0, 22, 0 },
+ { "fixup_sparc_tls_ldm_lo10", 0, 10, 0 },
+ { "fixup_sparc_tls_ldm_add", 0, 0, 0 },
+ { "fixup_sparc_tls_ldm_call", 0, 0, 0 },
+ { "fixup_sparc_tls_ldo_hix22", 0, 22, 0 },
+ { "fixup_sparc_tls_ldo_lox10", 0, 10, 0 },
+ { "fixup_sparc_tls_ldo_add", 0, 0, 0 },
+ { "fixup_sparc_tls_ie_hi22", 0, 22, 0 },
+ { "fixup_sparc_tls_ie_lo10", 0, 10, 0 },
+ { "fixup_sparc_tls_ie_ld", 0, 0, 0 },
+ { "fixup_sparc_tls_ie_ldx", 0, 0, 0 },
+ { "fixup_sparc_tls_ie_add", 0, 0, 0 },
+ { "fixup_sparc_tls_le_hix22", 0, 0, 0 },
+ { "fixup_sparc_tls_le_lox10", 0, 0, 0 }
+ };
+
+ if (Kind < FirstTargetFixupKind) // generic kinds are described by the base class
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ if (IsLittleEndian)
+ return InfosLE[Kind - FirstTargetFixupKind];
+
+ return InfosBE[Kind - FirstTargetFixupKind];
+ }
+
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override { // Clears IsResolved for the fixup kinds listed below; Value is not modified.
+ switch ((Sparc::Fixups)Fixup.getKind()) {
+ default: break;
+ case Sparc::fixup_sparc_wplt30:
+ if (Target.getSymA()->getSymbol().isTemporary())
+ return; // temporary symbol: leave IsResolved as the caller set it
+ case Sparc::fixup_sparc_tls_gd_hi22: // wplt30 on a non-temporary symbol falls through to here (no break above)
+ case Sparc::fixup_sparc_tls_gd_lo10:
+ case Sparc::fixup_sparc_tls_gd_add:
+ case Sparc::fixup_sparc_tls_gd_call:
+ case Sparc::fixup_sparc_tls_ldm_hi22:
+ case Sparc::fixup_sparc_tls_ldm_lo10:
+ case Sparc::fixup_sparc_tls_ldm_add:
+ case Sparc::fixup_sparc_tls_ldm_call:
+ case Sparc::fixup_sparc_tls_ldo_hix22:
+ case Sparc::fixup_sparc_tls_ldo_lox10:
+ case Sparc::fixup_sparc_tls_ldo_add:
+ case Sparc::fixup_sparc_tls_ie_hi22:
+ case Sparc::fixup_sparc_tls_ie_lo10:
+ case Sparc::fixup_sparc_tls_ie_ld:
+ case Sparc::fixup_sparc_tls_ie_ldx:
+ case Sparc::fixup_sparc_tls_ie_add:
+ case Sparc::fixup_sparc_tls_le_hix22:
+ case Sparc::fixup_sparc_tls_le_lox10: IsResolved = false; break;
+ }
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override {
+ // FIXME: relaxation is not implemented; always reports "no".
+ return false;
+ }
+
+ /// fixupNeedsRelaxation - Target specific predicate for whether a given
+ /// fixup requires the associated instruction to be relaxed.
+ bool fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ // FIXME: unimplemented; marked unreachable.
+ llvm_unreachable("fixupNeedsRelaxation() unimplemented");
+ return false;
+ }
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {
+ // FIXME: unimplemented; marked unreachable.
+ llvm_unreachable("relaxInstruction() unimplemented");
+ }
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { // Emit Count bytes of NOPs; false if Count is not a multiple of 4.
+ // Cannot emit NOP with size not multiple of 32 bits.
+ if (Count % 4 != 0)
+ return false;
+
+ uint64_t NumNops = Count / 4;
+ for (uint64_t i = 0; i != NumNops; ++i)
+ OW->write32(0x01000000); // one 32-bit word per NOP slot
+
+ return true;
+ }
+ };
+
+ /// ELF flavour of the Sparc assembler backend. It patches resolved fixup
+ /// values into the encoded bytes and creates the ELF object writer.
+ class ELFSparcAsmBackend : public SparcAsmBackend {
+   Triple::OSType OSType;
+ public:
+   ELFSparcAsmBackend(const Target &T, Triple::OSType OSType) :
+     SparcAsmBackend(T), OSType(OSType) { }
+
+   /// OR the adjusted value of \p Fixup into \p Data at the fixup's offset.
+   ///
+   /// \param Fixup    supplies the fixup kind and the byte offset into \p Data.
+   /// \param Data     fragment contents being patched.
+   /// \param DataSize number of bytes available in \p Data.
+   /// \param Value    resolved fixup value before field extraction.
+   /// \param IsPCRel  unused here.
+   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                   uint64_t Value, bool IsPCRel) const override {
+     const unsigned Kind = Fixup.getKind();
+
+     // Plain data fixups cover exactly as many bytes as their kind names;
+     // every other kind handled by adjustFixupValue() patches one 32-bit
+     // word. Always touching 4 bytes would write past 1- and 2-byte data
+     // (and misplace the bytes on big-endian targets) and would patch the
+     // wrong half of 8-byte data.
+     unsigned NumBytes = 4;
+     switch (Kind) {
+     default: break;
+     case FK_Data_1: NumBytes = 1; break;
+     case FK_Data_2: NumBytes = 2; break;
+     case FK_Data_8: NumBytes = 8; break;
+     }
+
+     // adjustFixupValue() returns `unsigned`, so routing an 8-byte datum
+     // through it would drop the upper word. It passes data fixups through
+     // unchanged otherwise, so bypassing it for FK_Data_8 loses nothing.
+     if (Kind != FK_Data_8)
+       Value = adjustFixupValue(Kind, Value);
+     if (!Value) return; // Doesn't change encoding.
+
+     const unsigned Offset = Fixup.getOffset();
+     assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+     // For each byte of the fragment that the fixup touches, mask in the bits
+     // from the fixup value. The Value has been "split up" into the
+     // appropriate bitfields above.
+     for (unsigned i = 0; i != NumBytes; ++i) {
+       unsigned Idx = IsLittleEndian ? i : NumBytes - 1 - i;
+       Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+     }
+   }
+
+   /// Create the Sparc ELF object writer for \p OS using this backend's
+   /// word size, endianness and the OS ABI derived from the stored OS type.
+   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+     uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
+     return createSparcELFObjectWriter(OS, Is64Bit, IsLittleEndian, OSABI);
+   }
+ };
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createSparcAsmBackend(const Target &T, // Factory: returns a new ELFSparcAsmBackend.
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options) {
+ return new ELFSparcAsmBackend(T, TT.getOS()); // only T and the triple's OS are used; MRI, CPU, Options are ignored
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
new file mode 100644
index 000000000000..d35e45e03466
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -0,0 +1,140 @@
+//===-- SparcELFObjectWriter.cpp - Sparc ELF Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+ class SparcELFObjectWriter : public MCELFObjectTargetWriter { // Maps Sparc fixups to ELF relocation types.
+ public:
+ SparcELFObjectWriter(bool Is64Bit, uint8_t OSABI)
+ : MCELFObjectTargetWriter(Is64Bit, OSABI,
+ Is64Bit ? ELF::EM_SPARCV9 : ELF::EM_SPARC, // machine type follows the word size
+ /*HasRelocationAddend*/ true) {}
+
+ ~SparcELFObjectWriter() override {}
+
+ protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override; // defined below the class
+
+ bool needsRelocateWithSymbol(const MCSymbol &Sym,
+ unsigned Type) const override; // defined below the class
+
+ };
+}
+
+unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, // Choose the ELF relocation type for Fixup; Ctx and Target are unused.
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+
+ if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Fixup.getValue())) { // an explicit R_DISP32 expression wins over everything below
+ if (SExpr->getKind() == SparcMCExpr::VK_Sparc_R_DISP32)
+ return ELF::R_SPARC_DISP32;
+ }
+
+ if (IsPCRel) { // PC-relative fixups
+ switch((unsigned)Fixup.getKind()) {
+ default:
+ llvm_unreachable("Unimplemented fixup -> relocation");
+ case FK_Data_1: return ELF::R_SPARC_DISP8;
+ case FK_Data_2: return ELF::R_SPARC_DISP16;
+ case FK_Data_4: return ELF::R_SPARC_DISP32;
+ case FK_Data_8: return ELF::R_SPARC_DISP64;
+ case Sparc::fixup_sparc_call30: return ELF::R_SPARC_WDISP30;
+ case Sparc::fixup_sparc_br22: return ELF::R_SPARC_WDISP22;
+ case Sparc::fixup_sparc_br19: return ELF::R_SPARC_WDISP19;
+ case Sparc::fixup_sparc_pc22: return ELF::R_SPARC_PC22;
+ case Sparc::fixup_sparc_pc10: return ELF::R_SPARC_PC10;
+ case Sparc::fixup_sparc_wplt30: return ELF::R_SPARC_WPLT30;
+ }
+ }
+
+ switch((unsigned)Fixup.getKind()) { // non-PC-relative fixups
+ default:
+ llvm_unreachable("Unimplemented fixup -> relocation");
+ case FK_Data_1: return ELF::R_SPARC_8;
+ case FK_Data_2: return ((Fixup.getOffset() % 2) // offset not a multiple of 2 -> UA16
+ ? ELF::R_SPARC_UA16
+ : ELF::R_SPARC_16);
+ case FK_Data_4: return ((Fixup.getOffset() % 4) // offset not a multiple of 4 -> UA32
+ ? ELF::R_SPARC_UA32
+ : ELF::R_SPARC_32);
+ case FK_Data_8: return ((Fixup.getOffset() % 8) // offset not a multiple of 8 -> UA64
+ ? ELF::R_SPARC_UA64
+ : ELF::R_SPARC_64);
+ case Sparc::fixup_sparc_hi22: return ELF::R_SPARC_HI22;
+ case Sparc::fixup_sparc_lo10: return ELF::R_SPARC_LO10;
+ case Sparc::fixup_sparc_h44: return ELF::R_SPARC_H44;
+ case Sparc::fixup_sparc_m44: return ELF::R_SPARC_M44;
+ case Sparc::fixup_sparc_l44: return ELF::R_SPARC_L44;
+ case Sparc::fixup_sparc_hh: return ELF::R_SPARC_HH22;
+ case Sparc::fixup_sparc_hm: return ELF::R_SPARC_HM10;
+ case Sparc::fixup_sparc_got22: return ELF::R_SPARC_GOT22;
+ case Sparc::fixup_sparc_got10: return ELF::R_SPARC_GOT10;
+ case Sparc::fixup_sparc_tls_gd_hi22: return ELF::R_SPARC_TLS_GD_HI22;
+ case Sparc::fixup_sparc_tls_gd_lo10: return ELF::R_SPARC_TLS_GD_LO10;
+ case Sparc::fixup_sparc_tls_gd_add: return ELF::R_SPARC_TLS_GD_ADD;
+ case Sparc::fixup_sparc_tls_gd_call: return ELF::R_SPARC_TLS_GD_CALL;
+ case Sparc::fixup_sparc_tls_ldm_hi22: return ELF::R_SPARC_TLS_LDM_HI22;
+ case Sparc::fixup_sparc_tls_ldm_lo10: return ELF::R_SPARC_TLS_LDM_LO10;
+ case Sparc::fixup_sparc_tls_ldm_add: return ELF::R_SPARC_TLS_LDM_ADD;
+ case Sparc::fixup_sparc_tls_ldm_call: return ELF::R_SPARC_TLS_LDM_CALL;
+ case Sparc::fixup_sparc_tls_ldo_hix22: return ELF::R_SPARC_TLS_LDO_HIX22;
+ case Sparc::fixup_sparc_tls_ldo_lox10: return ELF::R_SPARC_TLS_LDO_LOX10;
+ case Sparc::fixup_sparc_tls_ldo_add: return ELF::R_SPARC_TLS_LDO_ADD;
+ case Sparc::fixup_sparc_tls_ie_hi22: return ELF::R_SPARC_TLS_IE_HI22;
+ case Sparc::fixup_sparc_tls_ie_lo10: return ELF::R_SPARC_TLS_IE_LO10;
+ case Sparc::fixup_sparc_tls_ie_ld: return ELF::R_SPARC_TLS_IE_LD;
+ case Sparc::fixup_sparc_tls_ie_ldx: return ELF::R_SPARC_TLS_IE_LDX;
+ case Sparc::fixup_sparc_tls_ie_add: return ELF::R_SPARC_TLS_IE_ADD;
+ case Sparc::fixup_sparc_tls_le_hix22: return ELF::R_SPARC_TLS_LE_HIX22;
+ case Sparc::fixup_sparc_tls_le_lox10: return ELF::R_SPARC_TLS_LE_LOX10;
+ }
+
+ return ELF::R_SPARC_NONE; // not reached: every path through the switch above returns or is marked unreachable
+}
+
+bool SparcELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, // True only for the GOT relocation types listed; Sym is unused.
+ unsigned Type) const {
+ switch (Type) {
+ default:
+ return false;
+
+ // All relocations that use a GOT need a symbol, not an offset, as
+ // the offset of the symbol within the section is irrelevant to
+ // where the GOT entry is. Don't need to list all the TLS entries,
+ // as they're all marked as requiring a symbol anyway.
+ case ELF::R_SPARC_GOT10:
+ case ELF::R_SPARC_GOT13:
+ case ELF::R_SPARC_GOT22:
+ case ELF::R_SPARC_GOTDATA_HIX22:
+ case ELF::R_SPARC_GOTDATA_LOX10:
+ case ELF::R_SPARC_GOTDATA_OP_HIX22:
+ case ELF::R_SPARC_GOTDATA_OP_LOX10:
+ return true;
+ }
+}
+
+MCObjectWriter *llvm::createSparcELFObjectWriter(raw_pwrite_stream &OS, // Factory: Sparc target writer wrapped in a generic ELF object writer.
+ bool Is64Bit,
+ bool IsLittleEndian,
+ uint8_t OSABI) {
+ MCELFObjectTargetWriter *MOTW = new SparcELFObjectWriter(Is64Bit, OSABI);
+ return createELFObjectWriter(MOTW, OS, IsLittleEndian); // MOTW is handed to the generic writer (presumably it takes ownership; confirm)
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
new file mode 100644
index 000000000000..8d79396d936e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
@@ -0,0 +1,97 @@
+//===-- SparcFixupKinds.h - Sparc Specific Fixup Entries --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCFIXUPKINDS_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+ namespace Sparc {
+ enum Fixups { // NOTE: order must match the InfosBE/InfosLE tables in SparcAsmBackend.cpp, which are indexed by (kind - FirstTargetFixupKind)
+ /// fixup_sparc_call30 - 30-bit PC relative relocation for call
+ fixup_sparc_call30 = FirstTargetFixupKind,
+
+ /// fixup_sparc_br22 - 22-bit PC relative relocation for
+ /// branches
+ fixup_sparc_br22,
+
+ /// fixup_sparc_br19 - 19-bit PC relative relocation for
+ /// branches on icc/xcc
+ fixup_sparc_br19,
+
+ /// fixup_sparc_br16_2 / fixup_sparc_br16_14 - the 2-bit and 14-bit pieces of the 16-bit fixup for bpr
+ fixup_sparc_br16_2,
+ fixup_sparc_br16_14,
+
+ /// fixup_sparc_hi22 - 22-bit fixup corresponding to %hi(foo)
+ /// for sethi
+ fixup_sparc_hi22,
+
+ /// fixup_sparc_lo10 - 10-bit fixup corresponding to %lo(foo)
+ fixup_sparc_lo10,
+
+ /// fixup_sparc_h44 - 22-bit fixup corresponding to %h44(foo)
+ fixup_sparc_h44,
+
+ /// fixup_sparc_m44 - 10-bit fixup corresponding to %m44(foo)
+ fixup_sparc_m44,
+
+ /// fixup_sparc_l44 - 12-bit fixup corresponding to %l44(foo)
+ fixup_sparc_l44,
+
+ /// fixup_sparc_hh - 22-bit fixup corresponding to %hh(foo)
+ fixup_sparc_hh,
+
+ /// fixup_sparc_hm - 10-bit fixup corresponding to %hm(foo)
+ fixup_sparc_hm,
+
+ /// fixup_sparc_pc22 - 22-bit fixup corresponding to %pc22(foo)
+ fixup_sparc_pc22,
+
+ /// fixup_sparc_pc10 - 10-bit fixup corresponding to %pc10(foo)
+ fixup_sparc_pc10,
+
+ /// fixup_sparc_got22 - 22-bit fixup corresponding to %got22(foo)
+ fixup_sparc_got22,
+
+ /// fixup_sparc_got10 - 10-bit fixup corresponding to %got10(foo)
+ fixup_sparc_got10,
+
+ /// fixup_sparc_wplt30 - 30-bit PC relative fixup used for call targets marked WPLT30
+ fixup_sparc_wplt30,
+
+ /// fixups for Thread Local Storage
+ fixup_sparc_tls_gd_hi22,
+ fixup_sparc_tls_gd_lo10,
+ fixup_sparc_tls_gd_add,
+ fixup_sparc_tls_gd_call,
+ fixup_sparc_tls_ldm_hi22,
+ fixup_sparc_tls_ldm_lo10,
+ fixup_sparc_tls_ldm_add,
+ fixup_sparc_tls_ldm_call,
+ fixup_sparc_tls_ldo_hix22,
+ fixup_sparc_tls_ldo_lox10,
+ fixup_sparc_tls_ldo_add,
+ fixup_sparc_tls_ie_hi22,
+ fixup_sparc_tls_ie_lo10,
+ fixup_sparc_tls_ie_ld,
+ fixup_sparc_tls_ie_ldx,
+ fixup_sparc_tls_ie_add,
+ fixup_sparc_tls_le_hix22,
+ fixup_sparc_tls_le_lox10,
+
+ // Marker: one past the last target fixup; NumTargetFixupKinds is the count.
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+ };
+ }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
new file mode 100644
index 000000000000..280c6d7937b2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -0,0 +1,70 @@
+//===-- SparcMCAsmInfo.cpp - Sparc asm properties -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SparcMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCAsmInfo.h"
+#include "SparcMCExpr.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+void SparcELFMCAsmInfo::anchor() {} // empty out-of-line definition of the virtual anchor() (presumably the usual LLVM vtable anchor)
+
+SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Triple &TheTriple) { // Set assembler syntax and ABI defaults from the triple's architecture.
+ bool isV9 = (TheTriple.getArch() == Triple::sparcv9);
+ IsLittleEndian = (TheTriple.getArch() == Triple::sparcel);
+
+ if (isV9) {
+ PointerSize = CalleeSaveStackSlotSize = 8; // sparcv9: 8-byte pointers and callee-save slots
+ }
+
+ Data16bitsDirective = "\t.half\t";
+ Data32bitsDirective = "\t.word\t";
+ // .xword is only supported by V9.
+ Data64bitsDirective = (isV9) ? "\t.xword\t" : nullptr;
+ ZeroDirective = "\t.skip\t";
+ CommentString = "!";
+ SupportsDebugInformation = true;
+
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ SunStyleELFSectionSwitchSyntax = true;
+ UsesELFSectionDirectiveForBSS = true;
+
+ UseIntegratedAssembler = true;
+}
+
+const MCExpr*
+SparcELFMCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym, // Expression used to reference the personality symbol Sym.
+ unsigned Encoding,
+ MCStreamer &Streamer) const {
+ if (Encoding & dwarf::DW_EH_PE_pcrel) { // pc-relative encoding: wrap the symbol reference in a VK_Sparc_R_DISP32 expression
+ MCContext &Ctx = Streamer.getContext();
+ return SparcMCExpr::create(SparcMCExpr::VK_Sparc_R_DISP32,
+ MCSymbolRefExpr::create(Sym, Ctx), Ctx);
+ }
+
+ return MCAsmInfo::getExprForPersonalitySymbol(Sym, Encoding, Streamer); // otherwise use the generic implementation
+}
+
+const MCExpr*
+SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym, // Expression used to reference Sym from an FDE.
+ unsigned Encoding,
+ MCStreamer &Streamer) const {
+ if (Encoding & dwarf::DW_EH_PE_pcrel) { // pc-relative encoding: wrap the symbol reference in a VK_Sparc_R_DISP32 expression
+ MCContext &Ctx = Streamer.getContext();
+ return SparcMCExpr::create(SparcMCExpr::VK_Sparc_R_DISP32,
+ MCSymbolRefExpr::create(Sym, Ctx), Ctx);
+ }
+ return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer); // otherwise use the generic implementation
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
new file mode 100644
index 000000000000..ad441227600e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -0,0 +1,38 @@
+//===-- SparcMCAsmInfo.h - Sparc asm properties ----------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class SparcELFMCAsmInfo : public MCAsmInfoELF { // Sparc-specific ELF assembler properties.
+ void anchor() override;
+
+public:
+ explicit SparcELFMCAsmInfo(const Triple &TheTriple);
+ const MCExpr*
+ getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, // override of the MCAsmInfo hook; declared only here
+ MCStreamer &Streamer) const override;
+ const MCExpr* getExprForFDESymbol(const MCSymbol *Sym, // override of the MCAsmInfo hook; declared only here
+ unsigned Encoding,
+ MCStreamer &Streamer) const override;
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
new file mode 100644
index 000000000000..86341c61d1e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -0,0 +1,229 @@
+//===-- SparcMCCodeEmitter.cpp - Convert Sparc code to machine code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SparcMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCExpr.h"
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "SparcMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class SparcMCCodeEmitter : public MCCodeEmitter { // Encodes Sparc MCInsts to bytes and records the fixups they need.
+ SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete; // non-copyable
+ void operator=(const SparcMCCodeEmitter &) = delete; // non-assignable
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+
+public:
+ SparcMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {}
+
+ ~SparcMCCodeEmitter() override {}
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMachineOpValue - Return binary encoding of operand. If the machine
+ /// operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getCallTargetOpValue(const MCInst &MI, unsigned OpNo, // operand encoder for call targets
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo, // operand encoder recording fixup_sparc_br22
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo, // operand encoder recording fixup_sparc_br19
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo, // operand encoder recording fixup_sparc_br16_2 and br16_14
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+private:
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; // no body in this file's visible text; presumably from the generated .inc
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII, // Factory: returns a new SparcMCCodeEmitter.
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new SparcMCCodeEmitter(MCII, Ctx); // MRI is not used
+}
+
+void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, // Write MI's 32-bit encoding to OS and collect its fixups.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ verifyInstructionPredicates(MI,
+ computeAvailableFeatures(STI.getFeatureBits()));
+
+ unsigned Bits = getBinaryCodeForInstr(MI, Fixups, STI); // 64-bit result narrowed to unsigned; 32 bits are written below
+
+ if (Ctx.getAsmInfo()->isLittleEndian()) {
+ // Output the bits in little-endian byte order.
+ support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
+ } else {
+ // Output the bits in big-endian byte order.
+ support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
+ }
+ unsigned tlsOpNo = 0; // index of an extra operand to process for TLS opcodes; 0 means none
+ switch (MI.getOpcode()) {
+ default: break;
+ case SP::TLS_CALL: tlsOpNo = 1; break;
+ case SP::TLS_ADDrr:
+ case SP::TLS_ADDXrr:
+ case SP::TLS_LDrr:
+ case SP::TLS_LDXrr: tlsOpNo = 3; break;
+ }
+ if (tlsOpNo != 0) {
+ const MCOperand &MO = MI.getOperand(tlsOpNo);
+ uint64_t op = getMachineOpValue(MI, MO, Fixups, STI); // may append a fixup; the returned value must be 0
+ assert(op == 0 && "Unexpected operand value!");
+ (void)op; // suppress warning.
+ }
+
+ ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+
+unsigned SparcMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO, // Encode one operand: register number, immediate, or 0 plus a fixup.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ if (MO.isReg())
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+ if (MO.isImm())
+ return MO.getImm();
+
+ assert(MO.isExpr());
+ const MCExpr *Expr = MO.getExpr();
+ if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) { // Sparc-modified expression: the fixup kind comes from the expression
+ MCFixupKind Kind = (MCFixupKind)SExpr->getFixupKind();
+ Fixups.push_back(MCFixup::create(0, Expr, Kind)); // fixup at offset 0 of this instruction
+ return 0;
+ }
+
+ int64_t Res;
+ if (Expr->evaluateAsAbsolute(Res)) // any other expression must fold to a constant
+ return Res;
+
+ llvm_unreachable("Unhandled expression!");
+ return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getCallTargetOpValue(const MCInst &MI, unsigned OpNo, // Encode a call target; expression targets get a call30/wplt30 fixup.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm())
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ if (MI.getOpcode() == SP::TLS_CALL) {
+ // No fixups for __tls_get_addr. The fixups for tls_symbol are emitted in
+ // encodeInstruction.
+#ifndef NDEBUG
+ // Verify that the callee is actually __tls_get_addr.
+ const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr());
+ assert(SExpr && SExpr->getSubExpr()->getKind() == MCExpr::SymbolRef &&
+ "Unexpected expression in TLS_CALL");
+ const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(SExpr->getSubExpr());
+ assert(SymExpr->getSymbol().getName() == "__tls_get_addr" &&
+ "Unexpected function for TLS_CALL");
+#endif
+ return 0;
+ }
+
+ MCFixupKind fixupKind = (MCFixupKind)Sparc::fixup_sparc_call30; // default kind
+
+ if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr())) {
+ if (SExpr->getKind() == SparcMCExpr::VK_Sparc_WPLT30) // WPLT30-marked target uses the wplt30 fixup instead
+ fixupKind = (MCFixupKind)Sparc::fixup_sparc_wplt30;
+ }
+
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), fixupKind));
+
+ return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpNo, // Encode a branch target; expression targets get a br22 fixup.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm()) // already concrete: encode directly
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ (MCFixupKind)Sparc::fixup_sparc_br22));
+ return 0; // field is left zero in the encoding; the fixup carries the target
+}
+
+unsigned SparcMCCodeEmitter::
+getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo, // Encode a branch target; expression targets get a br19 fixup.
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm()) // already concrete: encode directly
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ (MCFixupKind)Sparc::fixup_sparc_br19));
+ return 0; // field is left zero in the encoding; the fixup carries the target
+}
+unsigned SparcMCCodeEmitter::
+getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo, // Encode a branch target; expression targets get two fixups (br16_2, br16_14).
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm()) // already concrete: encode directly
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), // both fixups refer to the same expression at offset 0
+ (MCFixupKind)Sparc::fixup_sparc_br16_2));
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ (MCFixupKind)Sparc::fixup_sparc_br16_14));
+
+ return 0;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "SparcGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
new file mode 100644
index 000000000000..e85a8cd5e339
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -0,0 +1,221 @@
+//===-- SparcMCExpr.cpp - Sparc specific MC expression classes --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the Sparc architecture (e.g. "%hi", "%lo", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Object/ELF.h"
+
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sparcmcexpr"
+
+// Build a SparcMCExpr wrapping Expr, allocated via placement new on Ctx.
+const SparcMCExpr *SparcMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+                                       MCContext &Ctx) {
+  return new (Ctx) SparcMCExpr(Kind, Expr);
+}
+
+// Emit the modifier prefix, then the sub-expression, then ')' if one is owed.
+void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  // printVariantKind reports whether it opened a parenthesis.
+  const bool NeedsCloseParen = printVariantKind(OS, Kind);
+
+  getSubExpr()->print(OS, MAI);
+  if (!NeedsCloseParen)
+    return;
+  OS << ')';
+}
+
+// Write Kind's "%name(" prefix to OS; return true when ')' must follow.
+bool SparcMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind) {
+  const char *Prefix = nullptr;
+  switch (Kind) {
+  case VK_Sparc_None: case VK_Sparc_WPLT30: return false; // printed bare
+  case VK_Sparc_LO:            Prefix = "%lo("; break;
+  case VK_Sparc_HI:            Prefix = "%hi("; break;
+  case VK_Sparc_H44:           Prefix = "%h44("; break;
+  case VK_Sparc_M44:           Prefix = "%m44("; break;
+  case VK_Sparc_L44:           Prefix = "%l44("; break;
+  case VK_Sparc_HH:            Prefix = "%hh("; break;
+  case VK_Sparc_HM:            Prefix = "%hm("; break;
+  // FIXME: use %pc22/%pc10, if system assembler supports them.
+  case VK_Sparc_PC22:          Prefix = "%hi("; break;
+  case VK_Sparc_PC10:          Prefix = "%lo("; break;
+  // FIXME: use %got22/%got10, if system assembler supports them.
+  case VK_Sparc_GOT22:         Prefix = "%hi("; break;
+  case VK_Sparc_GOT10:         Prefix = "%lo("; break;
+  case VK_Sparc_R_DISP32:      Prefix = "%r_disp32("; break;
+  case VK_Sparc_TLS_GD_HI22:   Prefix = "%tgd_hi22("; break;
+  case VK_Sparc_TLS_GD_LO10:   Prefix = "%tgd_lo10("; break;
+  case VK_Sparc_TLS_GD_ADD:    Prefix = "%tgd_add("; break;
+  case VK_Sparc_TLS_GD_CALL:   Prefix = "%tgd_call("; break;
+  case VK_Sparc_TLS_LDM_HI22:  Prefix = "%tldm_hi22("; break;
+  case VK_Sparc_TLS_LDM_LO10:  Prefix = "%tldm_lo10("; break;
+  case VK_Sparc_TLS_LDM_ADD:   Prefix = "%tldm_add("; break;
+  case VK_Sparc_TLS_LDM_CALL:  Prefix = "%tldm_call("; break;
+  case VK_Sparc_TLS_LDO_HIX22: Prefix = "%tldo_hix22("; break;
+  case VK_Sparc_TLS_LDO_LOX10: Prefix = "%tldo_lox10("; break;
+  case VK_Sparc_TLS_LDO_ADD:   Prefix = "%tldo_add("; break;
+  case VK_Sparc_TLS_IE_HI22:   Prefix = "%tie_hi22("; break;
+  case VK_Sparc_TLS_IE_LO10:   Prefix = "%tie_lo10("; break;
+  case VK_Sparc_TLS_IE_LD:     Prefix = "%tie_ld("; break;
+  case VK_Sparc_TLS_IE_LDX:    Prefix = "%tie_ldx("; break;
+  case VK_Sparc_TLS_IE_ADD:    Prefix = "%tie_add("; break;
+  case VK_Sparc_TLS_LE_HIX22:  Prefix = "%tle_hix22("; break;
+  case VK_Sparc_TLS_LE_LOX10:  Prefix = "%tle_lox10("; break;
+  }
+  if (Prefix) OS << Prefix; // stays null only for values outside the enum
+  return true;
+}
+
+SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name)
+{ // Maps a modifier name, spelled without the leading '%', to its kind.
+ return StringSwitch<SparcMCExpr::VariantKind>(name)
+ .Case("lo", VK_Sparc_LO)
+ .Case("hi", VK_Sparc_HI)
+ .Case("h44", VK_Sparc_H44)
+ .Case("m44", VK_Sparc_M44)
+ .Case("l44", VK_Sparc_L44)
+ .Case("hh", VK_Sparc_HH)
+ .Case("hm", VK_Sparc_HM)
+ .Case("pc22", VK_Sparc_PC22)
+ .Case("pc10", VK_Sparc_PC10)
+ .Case("got22", VK_Sparc_GOT22)
+ .Case("got10", VK_Sparc_GOT10)
+ .Case("r_disp32", VK_Sparc_R_DISP32)
+ .Case("tgd_hi22", VK_Sparc_TLS_GD_HI22)
+ .Case("tgd_lo10", VK_Sparc_TLS_GD_LO10)
+ .Case("tgd_add", VK_Sparc_TLS_GD_ADD)
+ .Case("tgd_call", VK_Sparc_TLS_GD_CALL)
+ .Case("tldm_hi22", VK_Sparc_TLS_LDM_HI22)
+ .Case("tldm_lo10", VK_Sparc_TLS_LDM_LO10)
+ .Case("tldm_add", VK_Sparc_TLS_LDM_ADD)
+ .Case("tldm_call", VK_Sparc_TLS_LDM_CALL)
+ .Case("tldo_hix22", VK_Sparc_TLS_LDO_HIX22)
+ .Case("tldo_lox10", VK_Sparc_TLS_LDO_LOX10)
+ .Case("tldo_add", VK_Sparc_TLS_LDO_ADD)
+ .Case("tie_hi22", VK_Sparc_TLS_IE_HI22)
+ .Case("tie_lo10", VK_Sparc_TLS_IE_LO10)
+ .Case("tie_ld", VK_Sparc_TLS_IE_LD)
+ .Case("tie_ldx", VK_Sparc_TLS_IE_LDX)
+ .Case("tie_add", VK_Sparc_TLS_IE_ADD)
+ .Case("tle_hix22", VK_Sparc_TLS_LE_HIX22)
+ .Case("tle_lox10", VK_Sparc_TLS_LE_LOX10)
+ .Default(VK_Sparc_None); // unrecognized names yield VK_Sparc_None
+}
+
+Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
+ switch (Kind) { // one-to-one mapping from modifier kind to Sparc fixup
+ default: llvm_unreachable("Unhandled SparcMCExpr::VariantKind"); // e.g. VK_Sparc_None, VK_Sparc_R_DISP32 have no case
+ case VK_Sparc_LO: return Sparc::fixup_sparc_lo10;
+ case VK_Sparc_HI: return Sparc::fixup_sparc_hi22;
+ case VK_Sparc_H44: return Sparc::fixup_sparc_h44;
+ case VK_Sparc_M44: return Sparc::fixup_sparc_m44;
+ case VK_Sparc_L44: return Sparc::fixup_sparc_l44;
+ case VK_Sparc_HH: return Sparc::fixup_sparc_hh;
+ case VK_Sparc_HM: return Sparc::fixup_sparc_hm;
+ case VK_Sparc_PC22: return Sparc::fixup_sparc_pc22;
+ case VK_Sparc_PC10: return Sparc::fixup_sparc_pc10;
+ case VK_Sparc_GOT22: return Sparc::fixup_sparc_got22;
+ case VK_Sparc_GOT10: return Sparc::fixup_sparc_got10;
+ case VK_Sparc_WPLT30: return Sparc::fixup_sparc_wplt30;
+ case VK_Sparc_TLS_GD_HI22: return Sparc::fixup_sparc_tls_gd_hi22;
+ case VK_Sparc_TLS_GD_LO10: return Sparc::fixup_sparc_tls_gd_lo10;
+ case VK_Sparc_TLS_GD_ADD: return Sparc::fixup_sparc_tls_gd_add;
+ case VK_Sparc_TLS_GD_CALL: return Sparc::fixup_sparc_tls_gd_call;
+ case VK_Sparc_TLS_LDM_HI22: return Sparc::fixup_sparc_tls_ldm_hi22;
+ case VK_Sparc_TLS_LDM_LO10: return Sparc::fixup_sparc_tls_ldm_lo10;
+ case VK_Sparc_TLS_LDM_ADD: return Sparc::fixup_sparc_tls_ldm_add;
+ case VK_Sparc_TLS_LDM_CALL: return Sparc::fixup_sparc_tls_ldm_call;
+ case VK_Sparc_TLS_LDO_HIX22: return Sparc::fixup_sparc_tls_ldo_hix22;
+ case VK_Sparc_TLS_LDO_LOX10: return Sparc::fixup_sparc_tls_ldo_lox10;
+ case VK_Sparc_TLS_LDO_ADD: return Sparc::fixup_sparc_tls_ldo_add;
+ case VK_Sparc_TLS_IE_HI22: return Sparc::fixup_sparc_tls_ie_hi22;
+ case VK_Sparc_TLS_IE_LO10: return Sparc::fixup_sparc_tls_ie_lo10;
+ case VK_Sparc_TLS_IE_LD: return Sparc::fixup_sparc_tls_ie_ld;
+ case VK_Sparc_TLS_IE_LDX: return Sparc::fixup_sparc_tls_ie_ldx;
+ case VK_Sparc_TLS_IE_ADD: return Sparc::fixup_sparc_tls_ie_add;
+ case VK_Sparc_TLS_LE_HIX22: return Sparc::fixup_sparc_tls_le_hix22;
+ case VK_Sparc_TLS_LE_LOX10: return Sparc::fixup_sparc_tls_le_lox10;
+ }
+}
+
+bool
+SparcMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup); // pure delegation; Kind is not consulted here
+}
+
+// Recursively retag every ELF symbol referenced by Expr as STT_TLS.
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Constant:
+    return; // no symbols inside a constant
+  case MCExpr::SymbolRef: {
+    const auto *Ref = cast<MCSymbolRefExpr>(Expr);
+    cast<MCSymbolELF>(Ref->getSymbol()).setType(ELF::STT_TLS);
+    return;
+  }
+
+  case MCExpr::Unary: {
+    const auto *Op = cast<MCUnaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(Op->getSubExpr(), Asm);
+    return;
+  }
+
+  case MCExpr::Binary: {
+    const auto *Op = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(Op->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(Op->getRHS(), Asm);
+    return;
+  }
+
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expr!");
+  }
+}
+
+void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch (getKind()) { // only the TLS modifiers retag their symbols
+  default: return;     // every other kind is left untouched
+  case VK_Sparc_TLS_GD_HI22:
+  case VK_Sparc_TLS_GD_LO10:
+  case VK_Sparc_TLS_GD_ADD:
+  case VK_Sparc_TLS_GD_CALL:
+  case VK_Sparc_TLS_LDM_HI22:
+  case VK_Sparc_TLS_LDM_LO10:
+  case VK_Sparc_TLS_LDM_ADD:
+  case VK_Sparc_TLS_LDM_CALL:
+  case VK_Sparc_TLS_LDO_HIX22:
+  case VK_Sparc_TLS_LDO_LOX10:
+  case VK_Sparc_TLS_LDO_ADD:
+  case VK_Sparc_TLS_IE_HI22:
+  case VK_Sparc_TLS_IE_LO10:
+  case VK_Sparc_TLS_IE_LD:
+  case VK_Sparc_TLS_IE_LDX:
+  case VK_Sparc_TLS_IE_ADD:
+  case VK_Sparc_TLS_LE_HIX22:
+  case VK_Sparc_TLS_LE_LOX10:
+    fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+  }
+}
+
+void SparcMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr()); // forward the visit to the wrapped sub-expression
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
new file mode 100644
index 000000000000..13f08195c764
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -0,0 +1,112 @@
+//====- SparcMCExpr.h - Sparc specific MC expression classes --*- C++ -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Sparc-specific MCExprs, used for modifiers like
+// "%hi" or "%lo", etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
+
+#include "SparcFixupKinds.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class StringRef;
+class SparcMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_Sparc_None,
+ VK_Sparc_LO,
+ VK_Sparc_HI,
+ VK_Sparc_H44,
+ VK_Sparc_M44,
+ VK_Sparc_L44,
+ VK_Sparc_HH,
+ VK_Sparc_HM,
+ VK_Sparc_PC22,
+ VK_Sparc_PC10,
+ VK_Sparc_GOT22,
+ VK_Sparc_GOT10,
+ VK_Sparc_WPLT30,
+ VK_Sparc_R_DISP32,
+ VK_Sparc_TLS_GD_HI22,
+ VK_Sparc_TLS_GD_LO10,
+ VK_Sparc_TLS_GD_ADD,
+ VK_Sparc_TLS_GD_CALL,
+ VK_Sparc_TLS_LDM_HI22,
+ VK_Sparc_TLS_LDM_LO10,
+ VK_Sparc_TLS_LDM_ADD,
+ VK_Sparc_TLS_LDM_CALL,
+ VK_Sparc_TLS_LDO_HIX22,
+ VK_Sparc_TLS_LDO_LOX10,
+ VK_Sparc_TLS_LDO_ADD,
+ VK_Sparc_TLS_IE_HI22,
+ VK_Sparc_TLS_IE_LO10,
+ VK_Sparc_TLS_IE_LD,
+ VK_Sparc_TLS_IE_LDX,
+ VK_Sparc_TLS_IE_ADD,
+ VK_Sparc_TLS_LE_HIX22,
+ VK_Sparc_TLS_LE_LOX10
+ };
+
+private:
+ const VariantKind Kind;
+ const MCExpr *Expr;
+
+ explicit SparcMCExpr(VariantKind Kind, const MCExpr *Expr)
+ : Kind(Kind), Expr(Expr) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const SparcMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getKind - Get the modifier kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// getSubExpr - Get the child of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ /// getFixupKind - Get the fixup kind of this expression.
+ Sparc::Fixups getFixupKind() const { return getFixupKind(Kind); }
+
+ /// @}
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
+ }
+
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target; // NOTE(review): matches any target expr, not only SparcMCExpr - confirm no other kind can appear
+ }
+
+ static bool classof(const SparcMCExpr *) { return true; }
+
+ static VariantKind parseVariantKind(StringRef name);
+ static bool printVariantKind(raw_ostream &OS, VariantKind Kind);
+ static Sparc::Fixups getFixupKind(VariantKind Kind);
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
new file mode 100644
index 000000000000..889e2fd19ba9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -0,0 +1,170 @@
+//===-- SparcMCTargetDesc.cpp - Sparc Target Descriptions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCTargetDesc.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "SparcMCAsmInfo.h"
+#include "SparcTargetStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "SparcGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "SparcGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "SparcGenRegisterInfo.inc"
+
+static MCAsmInfo *createSparcMCAsmInfo(const MCRegisterInfo &MRI,
+                                       const Triple &TT) {
+  MCAsmInfo *AsmInfo = new SparcELFMCAsmInfo(TT);
+  const unsigned CfaReg = MRI.getDwarfRegNum(SP::O6, true);
+  // The initial frame state defines the CFA as SP::O6 with offset 0.
+  AsmInfo->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, CfaReg, 0));
+  return AsmInfo;
+}
+
+static MCAsmInfo *createSparcV9MCAsmInfo(const MCRegisterInfo &MRI,
+                                         const Triple &TT) {
+  MCAsmInfo *AsmInfo = new SparcELFMCAsmInfo(TT);
+  const unsigned CfaReg = MRI.getDwarfRegNum(SP::O6, true);
+  // The initial frame state defines the CFA as SP::O6 with offset 2047.
+  AsmInfo->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, CfaReg, 2047));
+  return AsmInfo;
+}
+
+static MCInstrInfo *createSparcMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitSparcMCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createSparcMCRegisterInfo(const Triple &TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitSparcMCRegisterInfo(X, SP::O7);
+ return X;
+}
+
+static MCSubtargetInfo *
+createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  const bool IsV9 = TT.getArch() == Triple::sparcv9; // picks the default CPU
+  const StringRef DefaultCPU = IsV9 ? "v9" : "v8";
+  return createSparcMCSubtargetInfoImpl(TT, CPU.empty() ? DefaultCPU : CPU, FS);
+}
+
+// Code models. Some only make sense for 64-bit code.
+//
+// SunCC Reloc CodeModel Constraints
+// abs32 Static Small text+data+bss linked below 2^32 bytes
+// abs44 Static Medium text+data+bss linked below 2^44 bytes
+// abs64 Static Large text smaller than 2^31 bytes
+// pic13 PIC_ Small GOT < 2^13 bytes
+// pic32 PIC_ Medium GOT < 2^32 bytes
+//
+// All code models require that the text segment is smaller than 2GB.
+
+// 32-bit defaults: abs32/pic32 normally and abs32 for JIT, i.e. always Small.
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  const bool IsUnset =
+      CM == CodeModel::Default || CM == CodeModel::JITDefault;
+  if (!IsUnset)
+    return; // an explicitly chosen model is kept as-is
+
+  CM = CodeModel::Small;
+}
+
+// 64-bit defaults: abs44/pic32 normally, abs64 for JIT.
+static void adjustCodeGenOptsV9(const Triple &TT, Reloc::Model RM,
+                                CodeModel::Model &CM) {
+  if (CM == CodeModel::JITDefault) {
+    CM = CodeModel::Large;
+    return;
+  }
+  if (CM != CodeModel::Default)
+    return; // an explicitly chosen model is kept as-is
+  if (RM == Reloc::PIC_)
+    CM = CodeModel::Small;
+  else
+    CM = CodeModel::Medium;
+}
+
+static MCTargetStreamer *
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ return new SparcTargetELFStreamer(S);
+}
+
+static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new SparcTargetAsmStreamer(S, OS);
+}
+
+static MCInstPrinter *createSparcMCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ return new SparcInstPrinter(MAI, MII, MRI);
+}
+
+extern "C" void LLVMInitializeSparcTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(getTheSparcTarget(), createSparcMCAsmInfo);
+ RegisterMCAsmInfoFn Y(getTheSparcV9Target(), createSparcV9MCAsmInfo);
+ RegisterMCAsmInfoFn Z(getTheSparcelTarget(), createSparcMCAsmInfo);
+
+ for (Target *T :
+ {&getTheSparcTarget(), &getTheSparcV9Target(), &getTheSparcelTarget()}) {
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(*T, createSparcMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(*T, createSparcMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(*T, createSparcMCSubtargetInfo);
+
+ // Register the MC Code Emitter.
+ TargetRegistry::RegisterMCCodeEmitter(*T, createSparcMCCodeEmitter);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(*T, createSparcAsmBackend);
+
+ // Register the object target streamer.
+ TargetRegistry::RegisterObjectTargetStreamer(*T,
+ createObjectTargetStreamer);
+
+ // Register the asm streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createTargetAsmStreamer);
+
+ // Register the MCInstPrinter
+ TargetRegistry::RegisterMCInstPrinter(*T, createSparcMCInstPrinter);
+ }
+
+ // Register the MC codegen info.
+ TargetRegistry::registerMCAdjustCodeGenOpts(getTheSparcTarget(),
+ adjustCodeGenOpts);
+ TargetRegistry::registerMCAdjustCodeGenOpts(getTheSparcV9Target(),
+ adjustCodeGenOptsV9);
+ TargetRegistry::registerMCAdjustCodeGenOpts(getTheSparcelTarget(),
+ adjustCodeGenOpts);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
new file mode 100644
index 000000000000..4e754c132d11
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -0,0 +1,62 @@
+//===-- SparcMCTargetDesc.h - Sparc Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCTARGETDESC_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class Target;
+class Triple;
+class StringRef;
+class raw_pwrite_stream;
+class raw_ostream;
+
+Target &getTheSparcTarget();
+Target &getTheSparcV9Target();
+Target &getTheSparcelTarget();
+
+MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+MCAsmBackend *createSparcAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+MCObjectWriter *createSparcELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+ bool IsLIttleEndian, uint8_t OSABI);
+} // End llvm namespace
+
+// Defines symbolic names for Sparc registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "SparcGenRegisterInfo.inc"
+
+// Defines symbolic names for the Sparc instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "SparcGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "SparcGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
new file mode 100644
index 000000000000..94af791e0e75
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
@@ -0,0 +1,46 @@
+//===-- SparcTargetStreamer.cpp - Sparc Target Streamer Methods -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetStreamer.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+SparcTargetStreamer::SparcTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+void SparcTargetStreamer::anchor() {}
+
+SparcTargetAsmStreamer::SparcTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : SparcTargetStreamer(S), OS(OS) {}
+
+void SparcTargetAsmStreamer::emitSparcRegisterIgnore(unsigned reg) {
+  // The register name is printed in lower case after the '%' sigil.
+  const auto Name = StringRef(SparcInstPrinter::getRegisterName(reg)).lower();
+  OS << "\t.register " << "%" << Name << ", #ignore\n";
+}
+
+void SparcTargetAsmStreamer::emitSparcRegisterScratch(unsigned reg) {
+  // The register name is printed in lower case after the '%' sigil.
+  const auto Name = StringRef(SparcInstPrinter::getRegisterName(reg)).lower();
+  OS << "\t.register " << "%" << Name << ", #scratch\n";
+}
+
+SparcTargetELFStreamer::SparcTargetELFStreamer(MCStreamer &S)
+ : SparcTargetStreamer(S) {}
+
+MCELFStreamer &SparcTargetELFStreamer::getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.h b/contrib/llvm/lib/Target/Sparc/Sparc.h
new file mode 100644
index 000000000000..0a8272d89297
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.h
@@ -0,0 +1,167 @@
+//===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Sparc back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARC_H
+#define LLVM_LIB_TARGET_SPARC_SPARC_H
+
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class FunctionPass;
+ class SparcTargetMachine;
+ class formatted_raw_ostream;
+ class AsmPrinter;
+ class MCInst;
+ class MachineInstr;
+
+ FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
+ FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+
+ void LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
+ MCInst &OutMI,
+ AsmPrinter &AP);
+} // end namespace llvm;
+
+namespace llvm {
+ // Enums corresponding to Sparc condition codes, both icc's and fcc's. These
+ // values must be kept in sync with the ones in the .td file.
+ namespace SPCC {
+ enum CondCodes {
+ ICC_A = 8 , // Always
+ ICC_N = 0 , // Never
+ ICC_NE = 9 , // Not Equal
+ ICC_E = 1 , // Equal
+ ICC_G = 10 , // Greater
+ ICC_LE = 2 , // Less or Equal
+ ICC_GE = 11 , // Greater or Equal
+ ICC_L = 3 , // Less
+ ICC_GU = 12 , // Greater Unsigned
+ ICC_LEU = 4 , // Less or Equal Unsigned
+ ICC_CC = 13 , // Carry Clear/Greater or Equal Unsigned
+ ICC_CS = 5 , // Carry Set/Less Unsigned
+ ICC_POS = 14 , // Positive
+ ICC_NEG = 6 , // Negative
+ ICC_VC = 15 , // Overflow Clear
+ ICC_VS = 7 , // Overflow Set
+
+ FCC_A = 8+16, // Always
+ FCC_N = 0+16, // Never
+ FCC_U = 7+16, // Unordered
+ FCC_G = 6+16, // Greater
+ FCC_UG = 5+16, // Unordered or Greater
+ FCC_L = 4+16, // Less
+ FCC_UL = 3+16, // Unordered or Less
+ FCC_LG = 2+16, // Less or Greater
+ FCC_NE = 1+16, // Not Equal
+ FCC_E = 9+16, // Equal
+ FCC_UE = 10+16, // Unordered or Equal
+ FCC_GE = 11+16, // Greater or Equal
+ FCC_UGE = 12+16, // Unordered or Greater or Equal
+ FCC_LE = 13+16, // Less or Equal
+ FCC_ULE = 14+16, // Unordered or Less or Equal
+ FCC_O = 15+16, // Ordered
+
+ CPCC_A = 8+32, // Always
+ CPCC_N = 0+32, // Never
+ CPCC_3 = 7+32,
+ CPCC_2 = 6+32,
+ CPCC_23 = 5+32,
+ CPCC_1 = 4+32,
+ CPCC_13 = 3+32,
+ CPCC_12 = 2+32,
+ CPCC_123 = 1+32,
+ CPCC_0 = 9+32,
+ CPCC_03 = 10+32,
+ CPCC_02 = 11+32,
+ CPCC_023 = 12+32,
+ CPCC_01 = 13+32,
+ CPCC_013 = 14+32,
+ CPCC_012 = 15+32
+ };
+ }
+
+ inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
+ switch (CC) {
+ case SPCC::ICC_A: return "a";
+ case SPCC::ICC_N: return "n";
+ case SPCC::ICC_NE: return "ne";
+ case SPCC::ICC_E: return "e";
+ case SPCC::ICC_G: return "g";
+ case SPCC::ICC_LE: return "le";
+ case SPCC::ICC_GE: return "ge";
+ case SPCC::ICC_L: return "l";
+ case SPCC::ICC_GU: return "gu";
+ case SPCC::ICC_LEU: return "leu";
+ case SPCC::ICC_CC: return "cc";
+ case SPCC::ICC_CS: return "cs";
+ case SPCC::ICC_POS: return "pos";
+ case SPCC::ICC_NEG: return "neg";
+ case SPCC::ICC_VC: return "vc";
+ case SPCC::ICC_VS: return "vs";
+ case SPCC::FCC_A: return "a";
+ case SPCC::FCC_N: return "n";
+ case SPCC::FCC_U: return "u";
+ case SPCC::FCC_G: return "g";
+ case SPCC::FCC_UG: return "ug";
+ case SPCC::FCC_L: return "l";
+ case SPCC::FCC_UL: return "ul";
+ case SPCC::FCC_LG: return "lg";
+ case SPCC::FCC_NE: return "ne";
+ case SPCC::FCC_E: return "e";
+ case SPCC::FCC_UE: return "ue";
+ case SPCC::FCC_GE: return "ge";
+ case SPCC::FCC_UGE: return "uge";
+ case SPCC::FCC_LE: return "le";
+ case SPCC::FCC_ULE: return "ule";
+ case SPCC::FCC_O: return "o";
+ case SPCC::CPCC_A: return "a";
+ case SPCC::CPCC_N: return "n";
+ case SPCC::CPCC_3: return "3";
+ case SPCC::CPCC_2: return "2";
+ case SPCC::CPCC_23: return "23";
+ case SPCC::CPCC_1: return "1";
+ case SPCC::CPCC_13: return "13";
+ case SPCC::CPCC_12: return "12";
+ case SPCC::CPCC_123: return "123";
+ case SPCC::CPCC_0: return "0";
+ case SPCC::CPCC_03: return "03";
+ case SPCC::CPCC_02: return "02";
+ case SPCC::CPCC_023: return "023";
+ case SPCC::CPCC_01: return "01";
+ case SPCC::CPCC_013: return "013";
+ case SPCC::CPCC_012: return "012";
+ }
+ llvm_unreachable("Invalid cond code");
+ }
+
+ inline static unsigned HI22(int64_t imm) {
+ return (unsigned)((imm >> 10) & ((1 << 22)-1)); // bits 10..31 of imm, as a 22-bit value
+ }
+
+ inline static unsigned LO10(int64_t imm) {
+ return (unsigned)(imm & 0x3FF); // low 10 bits of imm
+ }
+
+ inline static unsigned HIX22(int64_t imm) {
+ return HI22(~imm); // HI22 of the bitwise complement of imm
+ }
+
+ inline static unsigned LOX10(int64_t imm) {
+ return ~LO10(~imm); // low 10 bits equal imm's; every higher bit of the result is set
+ }
+
+} // end namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.td b/contrib/llvm/lib/Target/Sparc/Sparc.td
new file mode 100644
index 000000000000..11004c5a952f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.td
@@ -0,0 +1,159 @@
+//===-- Sparc.td - Describe the Sparc Target Machine -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SPARC Subtarget features.
+//
+
+def FeatureV9
+ : SubtargetFeature<"v9", "IsV9", "true",
+ "Enable SPARC-V9 instructions">;
+def FeatureV8Deprecated
+ : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true",
+ "Enable deprecated V8 instructions in V9 mode">;
+def FeatureVIS
+ : SubtargetFeature<"vis", "IsVIS", "true",
+ "Enable UltraSPARC Visual Instruction Set extensions">;
+def FeatureVIS2
+ : SubtargetFeature<"vis2", "IsVIS2", "true",
+ "Enable Visual Instruction Set extensions II">;
+def FeatureVIS3
+ : SubtargetFeature<"vis3", "IsVIS3", "true",
+ "Enable Visual Instruction Set extensions III">;
+def FeatureLeon
+ : SubtargetFeature<"leon", "IsLeon", "true",
+ "Enable LEON extensions">;
+
+def FeatureHardQuad
+ : SubtargetFeature<"hard-quad-float", "HasHardQuad", "true",
+ "Enable quad-word floating point instructions">;
+
+def UsePopc : SubtargetFeature<"popc", "UsePopc", "true",
+ "Use the popc (population count) instruction">;
+
+def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+ "Use software emulation for floating point">;
+
+//==== Features added predominantly for LEON subtarget support
+include "LeonFeatures.td"
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "SparcRegisterInfo.td"
+include "SparcCallingConv.td"
+include "SparcSchedule.td"
+include "SparcInstrInfo.td"
+
+def SparcInstrInfo : InstrInfo;
+
+def SparcAsmParser : AsmParser {
+ bit ShouldEmitMatchRegisterName = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+def : Proc<"v7", []>;
+def : Proc<"v8", []>;
+def : Proc<"supersparc", []>;
+def : Proc<"sparclite", []>;
+def : Proc<"f934", []>;
+def : Proc<"hypersparc", []>;
+def : Proc<"sparclite86x", []>;
+def : Proc<"sparclet", []>;
+def : Proc<"tsc701", []>;
+def : Proc<"myriad2", [FeatureLeon, LeonCASA]>;
+def : Proc<"myriad2.1", [FeatureLeon, LeonCASA]>;
+def : Proc<"myriad2.2", [FeatureLeon, LeonCASA]>;
+def : Proc<"ma2100", [FeatureLeon, LeonCASA]>;
+def : Proc<"ma2150", [FeatureLeon, LeonCASA]>;
+def : Proc<"ma2450", [FeatureLeon, LeonCASA]>;
+def : Proc<"v9", [FeatureV9]>;
+def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
+ FeatureVIS2]>;
+def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
+ FeatureVIS2]>;
+def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc,
+ FeatureVIS, FeatureVIS2]>;
+def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc,
+ FeatureVIS, FeatureVIS2]>;
+def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc,
+ FeatureVIS, FeatureVIS2, FeatureVIS3]>;
+
+// LEON 2 FT generic
+def : Processor<"leon2", LEON2Itineraries,
+ [FeatureLeon]>;
+
+// LEON 2 FT (AT697E)
+// TO DO: Place-holder: Processor specific features will be added *very* soon here.
+def : Processor<"at697e", LEON2Itineraries,
+ [FeatureLeon, ReplaceSDIV, InsertNOPLoad]>;
+
+// LEON 2 FT (AT697F)
+// TO DO: Place-holder: Processor specific features will be added *very* soon here.
+def : Processor<"at697f", LEON2Itineraries,
+ [FeatureLeon, InsertNOPLoad]>;
+
+
+// LEON 3 FT generic
+def : Processor<"leon3", LEON3Itineraries,
+ [FeatureLeon, UMACSMACSupport]>;
+
+// LEON 3 FT (UT699). Provides features for the UT699 processor
+// - covers all the erratum fixes for LEON3, but does not support the CASA instruction.
+def : Processor<"ut699", LEON3Itineraries,
+ [FeatureLeon, InsertNOPLoad, FixFSMULD, ReplaceFMULS, FixAllFDIVSQRT]>;
+
+// LEON3 FT (GR712RC). Provides features for the GR712RC processor.
+// - covers all the erratum fixes for LEON3 and adds support for the CASA instruction.
+def : Processor<"gr712rc", LEON3Itineraries,
+ [FeatureLeon, LeonCASA]>;
+
+// LEON 4 FT generic
+def : Processor<"leon4", LEON4Itineraries,
+ [FeatureLeon, UMACSMACSupport, LeonCASA]>;
+
+// LEON 4 FT (GR740)
+// TO DO: Place-holder: Processor specific features will be added *very* soon here.
+def : Processor<"gr740", LEON4Itineraries,
+ [FeatureLeon, UMACSMACSupport, LeonCASA]>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def SparcAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ int PassSubtarget = 1;
+ int Variant = 0;
+}
+
+def Sparc : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = SparcInstrInfo;
+ let AssemblyParsers = [SparcAsmParser];
+ let AssemblyWriters = [SparcAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
new file mode 100644
index 000000000000..31a128a5f271
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -0,0 +1,449 @@
+//===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format SPARC assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "SparcInstrInfo.h"
+#include "SparcTargetMachine.h"
+#include "SparcTargetStreamer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+ class SparcAsmPrinter : public AsmPrinter {
+ SparcTargetStreamer &getTargetStreamer() { // downcasts the target streamer attached to OutStreamer
+ return static_cast<SparcTargetStreamer &>(
+ *OutStreamer->getTargetStreamer());
+ }
+ public:
+ explicit SparcAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+ StringRef getPassName() const override { return "Sparc Assembly Printer"; }
+ // Helpers that write one operand (or an operand pair) of MI as text to OS.
+ void printOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
+ void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
+ const char *Modifier = nullptr);
+ // AsmPrinter hooks overridden for Sparc.
+ void EmitFunctionBodyStart() override;
+ void EmitInstruction(const MachineInstr *MI) override;
+
+ static const char *getRegisterName(unsigned RegNo) {
+ return SparcInstPrinter::getRegisterName(RegNo); // forwards to the instruction printer
+ }
+ // Inline-asm operand printing hooks.
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ // Handles the SP::GETPCX opcode; called from EmitInstruction.
+ void LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI);
+
+ };
+} // end of anonymous namespace
+// Wrap a reference to Sym in a SparcMCExpr of the given Kind and return it as an expression operand.
+static MCOperand createSparcMCOperand(SparcMCExpr::VariantKind Kind,
+ MCSymbol *Sym, MCContext &OutContext) {
+ const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym,
+ OutContext);
+ const SparcMCExpr *expr = SparcMCExpr::create(Kind, MCSym, OutContext);
+ return MCOperand::createExpr(expr);
+
+}
+static MCOperand createPCXCallOP(MCSymbol *Label,
+ MCContext &OutContext) { // builds the call-target operand used by the GETPCX expansion
+ return createSparcMCOperand(SparcMCExpr::VK_Sparc_None, Label, OutContext); // Label wrapped with kind VK_Sparc_None
+}
+// Build the expression operand Kind(GOTLabel + (CurLabel - StartLabel)).
+static MCOperand createPCXRelExprOp(SparcMCExpr::VariantKind Kind,
+ MCSymbol *GOTLabel, MCSymbol *StartLabel,
+ MCSymbol *CurLabel,
+ MCContext &OutContext)
+{
+ const MCSymbolRefExpr *GOT = MCSymbolRefExpr::create(GOTLabel, OutContext);
+ const MCSymbolRefExpr *Start = MCSymbolRefExpr::create(StartLabel,
+ OutContext);
+ const MCSymbolRefExpr *Cur = MCSymbolRefExpr::create(CurLabel,
+ OutContext);
+
+ const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext); // CurLabel - StartLabel
+ const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext); // GOTLabel + that difference
+ const SparcMCExpr *expr = SparcMCExpr::create(Kind,
+ Add, OutContext);
+ return MCOperand::createExpr(expr);
+}
+// Emit an SP::CALL MCInst whose only operand is Callee.
+static void EmitCall(MCStreamer &OutStreamer,
+ MCOperand &Callee,
+ const MCSubtargetInfo &STI)
+{
+ MCInst CallInst;
+ CallInst.setOpcode(SP::CALL);
+ CallInst.addOperand(Callee);
+ OutStreamer.EmitInstruction(CallInst, STI);
+}
+// Emit an SP::SETHIi MCInst; operands are added destination first (RD, then Imm).
+static void EmitSETHI(MCStreamer &OutStreamer,
+ MCOperand &Imm, MCOperand &RD,
+ const MCSubtargetInfo &STI)
+{
+ MCInst SETHIInst;
+ SETHIInst.setOpcode(SP::SETHIi);
+ SETHIInst.addOperand(RD);
+ SETHIInst.addOperand(Imm);
+ OutStreamer.EmitInstruction(SETHIInst, STI);
+}
+// Emit a three-operand MCInst with the given Opcode; operand order is RD, RS1, Src2.
+static void EmitBinary(MCStreamer &OutStreamer, unsigned Opcode,
+ MCOperand &RS1, MCOperand &Src2, MCOperand &RD,
+ const MCSubtargetInfo &STI)
+{
+ MCInst Inst;
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(RD);
+ Inst.addOperand(RS1);
+ Inst.addOperand(Src2);
+ OutStreamer.EmitInstruction(Inst, STI);
+}
+// Emit SP::ORri with destination RD and sources RS1 and Imm.
+static void EmitOR(MCStreamer &OutStreamer,
+ MCOperand &RS1, MCOperand &Imm, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ EmitBinary(OutStreamer, SP::ORri, RS1, Imm, RD, STI);
+}
+// Emit SP::ADDrr with destination RD and sources RS1 and RS2.
+static void EmitADD(MCStreamer &OutStreamer,
+ MCOperand &RS1, MCOperand &RS2, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ EmitBinary(OutStreamer, SP::ADDrr, RS1, RS2, RD, STI);
+}
+// Emit SP::SLLri with destination RD and sources RS1 and Imm.
+static void EmitSHL(MCStreamer &OutStreamer,
+ MCOperand &RS1, MCOperand &Imm, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ EmitBinary(OutStreamer, SP::SLLri, RS1, Imm, RD, STI);
+}
+
+// Emit a sethi of HiKind(GOTSym) into RD followed by an or of RD with LoKind(GOTSym) into RD.
+static void EmitHiLo(MCStreamer &OutStreamer, MCSymbol *GOTSym,
+ SparcMCExpr::VariantKind HiKind,
+ SparcMCExpr::VariantKind LoKind,
+ MCOperand &RD,
+ MCContext &OutContext,
+ const MCSubtargetInfo &STI) {
+
+ MCOperand hi = createSparcMCOperand(HiKind, GOTSym, OutContext);
+ MCOperand lo = createSparcMCOperand(LoKind, GOTSym, OutContext);
+ EmitSETHI(OutStreamer, hi, RD, STI);
+ EmitOR(OutStreamer, RD, lo, RD, STI);
+}
+// Expand GETPCX: emit MCInsts that compute a _GLOBAL_OFFSET_TABLE_ based address into operand 0 of MI.
+void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI)
+{
+ MCSymbol *GOTLabel =
+ OutContext.getOrCreateSymbol(Twine("_GLOBAL_OFFSET_TABLE_"));
+
+ const MachineOperand &MO = MI->getOperand(0);
+ assert(MO.getReg() != SP::O7 && // %o7 is read as a source/scratch register below, so it cannot be the destination
+ "%o7 is assigned as destination for getpcx!");
+
+ MCOperand MCRegOP = MCOperand::createReg(MO.getReg());
+
+ // Not position independent: build the absolute address with a code-model specific sequence.
+ if (!isPositionIndependent()) {
+ // Just load the address of GOT to MCRegOP.
+ switch(TM.getCodeModel()) {
+ default:
+ llvm_unreachable("Unsupported absolute code model");
+ case CodeModel::Small:
+ EmitHiLo(*OutStreamer, GOTLabel,
+ SparcMCExpr::VK_Sparc_HI, SparcMCExpr::VK_Sparc_LO,
+ MCRegOP, OutContext, STI);
+ break;
+ case CodeModel::Medium: {
+ EmitHiLo(*OutStreamer, GOTLabel,
+ SparcMCExpr::VK_Sparc_H44, SparcMCExpr::VK_Sparc_M44,
+ MCRegOP, OutContext, STI);
+ MCOperand imm = MCOperand::createExpr(MCConstantExpr::create(12,
+ OutContext));
+ EmitSHL(*OutStreamer, MCRegOP, imm, MCRegOP, STI); // shift the H44/M44 part left by 12
+ MCOperand lo = createSparcMCOperand(SparcMCExpr::VK_Sparc_L44,
+ GOTLabel, OutContext);
+ EmitOR(*OutStreamer, MCRegOP, lo, MCRegOP, STI); // then or in the L44 part
+ break;
+ }
+ case CodeModel::Large: {
+ EmitHiLo(*OutStreamer, GOTLabel,
+ SparcMCExpr::VK_Sparc_HH, SparcMCExpr::VK_Sparc_HM,
+ MCRegOP, OutContext, STI);
+ MCOperand imm = MCOperand::createExpr(MCConstantExpr::create(32,
+ OutContext));
+ EmitSHL(*OutStreamer, MCRegOP, imm, MCRegOP, STI); // move the HH/HM part into the upper 32 bits
+ // Use register %o7 to load the lower 32 bits.
+ MCOperand RegO7 = MCOperand::createReg(SP::O7);
+ EmitHiLo(*OutStreamer, GOTLabel,
+ SparcMCExpr::VK_Sparc_HI, SparcMCExpr::VK_Sparc_LO,
+ RegO7, OutContext, STI);
+ EmitADD(*OutStreamer, MCRegOP, RegO7, MCRegOP, STI);
+ } // last case: leaves the switch without an explicit break
+ }
+ return;
+ }
+ // Position independent: emit the call/sethi/or/add sequence shown below.
+ MCSymbol *StartLabel = OutContext.createTempSymbol();
+ MCSymbol *EndLabel = OutContext.createTempSymbol();
+ MCSymbol *SethiLabel = OutContext.createTempSymbol();
+
+ MCOperand RegO7 = MCOperand::createReg(SP::O7);
+
+ // <StartLabel>:
+ // call <EndLabel>
+ // <SethiLabel>:
+ // sethi %hi(_GLOBAL_OFFSET_TABLE_+(<SethiLabel>-<StartLabel>)), <MO>
+ // <EndLabel>:
+ // or <MO>, %lo(_GLOBAL_OFFSET_TABLE_+(<EndLabel>-<StartLabel>)), <MO>
+ // add <MO>, %o7, <MO>
+
+ OutStreamer->EmitLabel(StartLabel);
+ MCOperand Callee = createPCXCallOP(EndLabel, OutContext);
+ EmitCall(*OutStreamer, Callee, STI);
+ OutStreamer->EmitLabel(SethiLabel);
+ MCOperand hiImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC22,
+ GOTLabel, StartLabel, SethiLabel,
+ OutContext);
+ EmitSETHI(*OutStreamer, hiImm, MCRegOP, STI);
+ OutStreamer->EmitLabel(EndLabel);
+ MCOperand loImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC10,
+ GOTLabel, StartLabel, EndLabel,
+ OutContext);
+ EmitOR(*OutStreamer, MCRegOP, loImm, MCRegOP, STI);
+ EmitADD(*OutStreamer, MCRegOP, RegO7, MCRegOP, STI);
+}
+// Emit MI: DBG_VALUE emits nothing, GETPCX is expanded, anything else is lowered to an MCInst.
+void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI)
+{
+
+ switch (MI->getOpcode()) {
+ default: break;
+ case TargetOpcode::DBG_VALUE:
+ // FIXME: Debug Value.
+ return;
+ case SP::GETPCX:
+ LowerGETPCXAndEmitMCInsts(MI, getSubtargetInfo());
+ return;
+ }
+ MachineBasicBlock::const_instr_iterator I = MI->getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+ do { // lower MI itself, then every following instruction that is inside the same bundle
+ MCInst TmpInst;
+ LowerSparcMachineInstrToMCInst(&*I, TmpInst, *this);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ } while ((++I != E) && I->isInsideBundle()); // Delay slot check.
+}
+// On 64-bit subtargets, notify the target streamer about each of G2, G3, G6, G7 that has uses.
+void SparcAsmPrinter::EmitFunctionBodyStart() {
+ if (!MF->getSubtarget<SparcSubtarget>().is64Bit())
+ return;
+
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned globalRegs[] = { SP::G2, SP::G3, SP::G6, SP::G7, 0 }; // 0 terminates the list
+ for (unsigned i = 0; globalRegs[i] != 0; ++i) {
+ unsigned reg = globalRegs[i];
+ if (MRI.use_empty(reg)) // nothing to report for a register without uses
+ continue;
+
+ if (reg == SP::G6 || reg == SP::G7)
+ getTargetStreamer().emitSparcRegisterIgnore(reg);
+ else
+ getTargetStreamer().emitSparcRegisterScratch(reg);
+ }
+}
+// Print operand opNum of MI to O, preceded by whatever printVariantKind writes for its target flags.
+void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O) {
+ const DataLayout &DL = getDataLayout();
+ const MachineOperand &MO = MI->getOperand (opNum);
+ SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags();
+
+#ifndef NDEBUG
+ // Verify the target flags.
+ if (MO.isGlobal() || MO.isSymbol() || MO.isCPI()) {
+ if (MI->getOpcode() == SP::CALL)
+ assert(TF == SparcMCExpr::VK_Sparc_None &&
+ "Cannot handle target flags on call address");
+ else if (MI->getOpcode() == SP::SETHIi || MI->getOpcode() == SP::SETHIXi)
+ assert((TF == SparcMCExpr::VK_Sparc_HI
+ || TF == SparcMCExpr::VK_Sparc_H44
+ || TF == SparcMCExpr::VK_Sparc_HH
+ || TF == SparcMCExpr::VK_Sparc_TLS_GD_HI22
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDM_HI22
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDO_HIX22
+ || TF == SparcMCExpr::VK_Sparc_TLS_IE_HI22
+ || TF == SparcMCExpr::VK_Sparc_TLS_LE_HIX22) &&
+ "Invalid target flags for address operand on sethi");
+ else if (MI->getOpcode() == SP::TLS_CALL)
+ assert((TF == SparcMCExpr::VK_Sparc_None
+ || TF == SparcMCExpr::VK_Sparc_TLS_GD_CALL
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDM_CALL) &&
+ "Cannot handle target flags on tls call address");
+ else if (MI->getOpcode() == SP::TLS_ADDrr)
+ assert((TF == SparcMCExpr::VK_Sparc_TLS_GD_ADD
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDM_ADD
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDO_ADD
+ || TF == SparcMCExpr::VK_Sparc_TLS_IE_ADD) &&
+ "Cannot handle target flags on add for TLS");
+ else if (MI->getOpcode() == SP::TLS_LDrr)
+ assert(TF == SparcMCExpr::VK_Sparc_TLS_IE_LD &&
+ "Cannot handle target flags on ld for TLS");
+ else if (MI->getOpcode() == SP::TLS_LDXrr)
+ assert(TF == SparcMCExpr::VK_Sparc_TLS_IE_LDX &&
+ "Cannot handle target flags on ldx for TLS");
+ else if (MI->getOpcode() == SP::XORri || MI->getOpcode() == SP::XORXri)
+ assert((TF == SparcMCExpr::VK_Sparc_TLS_LDO_LOX10
+ || TF == SparcMCExpr::VK_Sparc_TLS_LE_LOX10) &&
+ "Cannot handle target flags on xor for TLS");
+ else
+ assert((TF == SparcMCExpr::VK_Sparc_LO
+ || TF == SparcMCExpr::VK_Sparc_M44
+ || TF == SparcMCExpr::VK_Sparc_L44
+ || TF == SparcMCExpr::VK_Sparc_HM
+ || TF == SparcMCExpr::VK_Sparc_TLS_GD_LO10
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDM_LO10
+ || TF == SparcMCExpr::VK_Sparc_TLS_IE_LO10 ) &&
+ "Invalid target flags for small address operand");
+ }
+#endif
+
+ // printVariantKind writes any prefix; CloseParen records whether a closing paren is owed.
+ bool CloseParen = SparcMCExpr::printVariantKind(O, TF);
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ O << "%" << StringRef(getRegisterName(MO.getReg())).lower();
+ break;
+
+ case MachineOperand::MO_Immediate:
+ O << (int)MO.getImm();
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MO.getMBB()->getSymbol()->print(O, MAI);
+ return; // NOTE(review): unlike the other cases this bypasses the CloseParen handling below
+ case MachineOperand::MO_GlobalAddress:
+ getSymbol(MO.getGlobal())->print(O, MAI);
+ break;
+ case MachineOperand::MO_BlockAddress:
+ O << GetBlockAddressSymbol(MO.getBlockAddress())->getName();
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ O << MO.getSymbolName();
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+ << MO.getIndex();
+ break;
+ case MachineOperand::MO_Metadata:
+ MO.getMetadata()->printAsOperand(O, MMI->getModule());
+ break;
+ default:
+ llvm_unreachable("<unknown operand type>");
+ }
+ if (CloseParen) O << ")";
+}
+// Print operand opNum, then operand opNum+1 joined by ", " (Modifier arith) or by "+" (otherwise).
+void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O, const char *Modifier) {
+ printOperand(MI, opNum, O);
+
+ // If this is an ADD operand, emit it like normal operands.
+ if (Modifier && !strcmp(Modifier, "arith")) {
+ O << ", ";
+ printOperand(MI, opNum+1, O);
+ return;
+ }
+ // A second operand of %g0 or of immediate 0 is omitted entirely.
+ if (MI->getOperand(opNum+1).isReg() &&
+ MI->getOperand(opNum+1).getReg() == SP::G0)
+ return; // don't print "+%g0"
+ if (MI->getOperand(opNum+1).isImm() &&
+ MI->getOperand(opNum+1).getImm() == 0)
+ return; // don't print "+0"
+
+ O << "+";
+ printOperand(MI, opNum+1, O);
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+/// Returns true for a multi-character modifier and false after printing the operand.
+bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'f':
+ case 'r':
+ break; // both modifiers fall through to the plain operand printer below
+ }
+ }
+
+ printOperand(MI, OpNo, O);
+
+ return false;
+}
+// Print an inline asm memory operand between square brackets; any modifier makes this return true.
+bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier
+
+ O << '[';
+ printMemOperand(MI, OpNo, O);
+ O << ']';
+
+ return false;
+}
+
+// Force static initialization: registers SparcAsmPrinter for the Sparc, SparcV9 and Sparcel targets.
+extern "C" void LLVMInitializeSparcAsmPrinter() {
+ RegisterAsmPrinter<SparcAsmPrinter> X(getTheSparcTarget());
+ RegisterAsmPrinter<SparcAsmPrinter> Y(getTheSparcV9Target());
+ RegisterAsmPrinter<SparcAsmPrinter> Z(getTheSparcelTarget());
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td
new file mode 100644
index 000000000000..0aa29d186dc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td
@@ -0,0 +1,144 @@
+//===-- SparcCallingConv.td - Calling Conventions Sparc ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Sparc architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SPARC v8 32-bit.
+//===----------------------------------------------------------------------===//
+
+def CC_Sparc32 : CallingConv<[
+ // Custom assign SRet to [sp+64].
+ CCIfSRet<CCCustom<"CC_Sparc_Assign_SRet">>,
+ // i32 f32 arguments get passed in integer registers if there is space.
+ CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+ // f64 arguments are split and passed through registers or through stack.
+ CCIfType<[f64], CCCustom<"CC_Sparc_Assign_Split_64">>,
+ // As are v2i32 arguments (this would be the default behavior for
+ // v2i32 if it wasn't allocated to the IntPair register-class)
+ CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Split_64">>,
+
+
+ // Alternatively, they are assigned to the stack in 4-byte aligned units.
+ CCAssignToStack<4, 4>
+]>;
+
+def RetCC_Sparc32 : CallingConv<[
+ CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+ CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1]>>,
+ CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Ret_Split_64">>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// SPARC v9 64-bit.
+//===----------------------------------------------------------------------===//
+//
+// The 64-bit ABI conceptually assigns all function arguments to a parameter
+// array starting at [%fp+BIAS+128] in the callee's stack frame. All arguments
+// occupy a multiple of 8 bytes in the array. Integer arguments are extended to
+// 64 bits by the caller. Floats are right-aligned in their 8-byte slot, the
+// first 4 bytes in the slot are undefined.
+//
+// The integer registers %i0 to %i5 shadow the first 48 bytes of the parameter
+// array at fixed offsets. Integer arguments are promoted to registers when
+// possible.
+//
+// The floating point registers %f0 to %f31 shadow the first 128 bytes of the
+// parameter array at fixed offsets. Float and double parameters are promoted
+// to these registers when possible.
+//
+// Structs up to 16 bytes in size are passed by value. They are right-aligned
+// in one or two 8-byte slots in the parameter array. Struct members are
+// promoted to both floating point and integer registers when possible. A
+// struct containing two floats would thus be passed in %f0 and %f1, while two
+// float function arguments would occupy 8 bytes each, and be passed in %f1 and
+// %f3.
+//
+// When a struct { int, float } is passed by value, the int goes in the high
+// bits of an integer register while the float goes in a floating point
+// register.
+//
+// The difference is encoded in LLVM IR using the inreg attribute on function
+// arguments:
+//
+// C: void f(float, float);
+// IR: declare void f(float %f1, float %f3)
+//
+// C: void f(struct { float f0, f1; });
+// IR: declare void f(float inreg %f0, float inreg %f1)
+//
+// C: void f(int, float);
+// IR: declare void f(int signext %i0, float %f3)
+//
+// C: void f(struct { int i0high; float f1; });
+// IR: declare void f(i32 inreg %i0high, float inreg %f1)
+//
+// Two ints in a struct are simply coerced to i64:
+//
+// C: void f(struct { int i0high, i0low; });
+// IR: declare void f(i64 %i0.coerced)
+//
+// The frontend and backend divide the task of producing ABI compliant code for
+// C functions. The C frontend will:
+//
+// - Annotate integer arguments with zeroext or signext attributes.
+//
+// - Split structs into one or two 64-bit sized chunks, or 32-bit chunks with
+// inreg attributes.
+//
+// - Pass structs larger than 16 bytes indirectly with an explicit pointer
+// argument. The byval attribute is not used.
+//
+// The backend will:
+//
+// - Assign all arguments to 64-bit aligned stack slots, 32-bits for inreg.
+//
+// - Promote to integer or floating point registers depending on type.
+//
+// Function return values are passed exactly like function arguments, except a
+// struct up to 32 bytes in size can be returned in registers.
+
+// Function arguments AND most return values.
+def CC_Sparc64 : CallingConv<[
+ // The frontend uses the inreg flag to indicate i32 and float arguments from
+ // structs. These arguments are not promoted to 64 bits, but they can still
+ // be assigned to integer and float registers.
+ CCIfInReg<CCIfType<[i32, f32], CCCustom<"CC_Sparc64_Half">>>,
+
+ // All integers are promoted to i64 by the caller.
+ CCIfType<[i32], CCPromoteToType<i64>>,
+
+ // Custom assignment is required because stack space is reserved for all
+ // arguments whether they are passed in registers or not.
+ CCCustom<"CC_Sparc64_Full">
+]>;
+
+def RetCC_Sparc64 : CallingConv<[
+ // A single f32 return value always goes in %f0. The ABI doesn't specify what
+ // happens to multiple f32 return values outside a struct.
+ CCIfType<[f32], CCCustom<"CC_Sparc64_Half">>,
+
+ // Otherwise, return values are passed exactly like arguments.
+ CCDelegateTo<CC_Sparc64>
+]>;
+
+// Callee-saved registers are handled by the register window mechanism.
+def CSR : CalleeSavedRegs<(add)> {
+ let OtherPreserved = (add (sequence "I%u", 0, 7),
+ (sequence "L%u", 0, 7));
+}
+
+// Callee-saved registers for calls with ReturnsTwice attribute.
+def RTCSR : CalleeSavedRegs<(add)> {
+ let OtherPreserved = (add I6, I7);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
new file mode 100644
index 000000000000..122f830e0dc5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -0,0 +1,368 @@
+//===-- SparcFrameLowering.cpp - Sparc Frame Information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcFrameLowering.h"
+#include "SparcInstrInfo.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+// Hidden option, off by default; when set, determineCalleeSaves skips the leaf procedure handling.
+static cl::opt<bool>
+DisableLeafProc("disable-sparc-leaf-proc",
+ cl::init(false),
+ cl::desc("Disable Sparc leaf procedure optimization."),
+ cl::Hidden);
+// Stack grows down; passes 16 (64-bit) or 8 as the second and fourth base-class arguments, 0 as the third.
+SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+ ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8) {}
+// Insert, before MBBI, code that adds NumBytes to %o6 using opcode ADDri (small values) or ADDrr.
+void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ int NumBytes,
+ unsigned ADDrr,
+ unsigned ADDri) const {
+
+ DebugLoc dl;
+ const SparcInstrInfo &TII =
+ *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ if (NumBytes >= -4096 && NumBytes < 4096) { // small enough for the single-instruction immediate form
+ BuildMI(MBB, MBBI, dl, TII.get(ADDri), SP::O6)
+ .addReg(SP::O6).addImm(NumBytes);
+ return;
+ }
+
+ // Emit this the hard way. This clobbers G1 which we always know is
+ // available here.
+ if (NumBytes >= 0) {
+ // Emit nonnegative numbers with sethi + or.
+ // sethi %hi(NumBytes), %g1
+ // or %g1, %lo(NumBytes), %g1
+ // add %sp, %g1, %sp
+ BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+ .addImm(HI22(NumBytes));
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+ .addReg(SP::G1).addImm(LO10(NumBytes));
+ BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+ .addReg(SP::O6).addReg(SP::G1);
+ return ;
+ }
+
+ // Emit negative numbers with sethi + xor.
+ // sethi %hix(NumBytes), %g1
+ // xor %g1, %lox(NumBytes), %g1
+ // add %sp, %g1, %sp
+ BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+ .addImm(HIX22(NumBytes));
+ BuildMI(MBB, MBBI, dl, TII.get(SP::XORri), SP::G1)
+ .addReg(SP::G1).addImm(LOX10(NumBytes));
+ BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+ .addReg(SP::O6).addReg(SP::G1);
+}
+// Compute the final frame size, emit the stack adjustment at the start of MBB, then the CFI records.
+void SparcFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SparcInstrInfo &TII =
+ *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SparcRegisterInfo &RegInfo =
+ *static_cast<const SparcRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc dl;
+ bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF);
+
+ // FIXME: unfortunately, returning false from canRealignStack
+ // actually just causes needsStackRealignment to return false,
+ // rather than reporting an error, as would be sensible. This is
+ // poor, but fixing that bogosity is going to be a large project.
+ // For now, just see if it's lied, and report an error here.
+ if (!NeedsStackRealignment && MFI.getMaxAlignment() > getStackAlignment())
+ report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required "
+ "stack re-alignment, but LLVM couldn't handle it "
+ "(probably because it has a dynamic alloca).");
+
+ // Get the number of bytes to allocate from the FrameInfo
+ int NumBytes = (int) MFI.getStackSize();
+
+ unsigned SAVEri = SP::SAVEri;
+ unsigned SAVErr = SP::SAVErr;
+ if (FuncInfo->isLeafProc()) {
+ if (NumBytes == 0) // leaf procedure without stack objects: no prologue code at all
+ return;
+ SAVEri = SP::ADDri; // leaf procedures adjust the stack pointer with add instead of save
+ SAVErr = SP::ADDrr;
+ }
+
+ // The SPARC ABI is a bit odd in that it requires a reserved 92-byte
+ // (128 in v9) area in the user's stack, starting at %sp. Thus, the
+ // first part of the stack that can actually be used is located at
+ // %sp + 92.
+ //
+ // We therefore need to add that offset to the total stack size
+ // after all the stack objects are placed by PrologEpilogInserter's
+ // calculateFrameObjectOffsets. However, since the stack needs to be
+ // aligned *after* the extra size is added, we need to disable
+ // calculateFrameObjectOffsets's built-in stack alignment, by having
+ // targetHandlesStackFrameRounding return true.
+
+
+ // Add the extra call frame stack size, if needed. (This is the same
+ // code as in PrologEpilogInserter, but also gets disabled by
+ // targetHandlesStackFrameRounding)
+ if (MFI.adjustsStack() && hasReservedCallFrame(MF))
+ NumBytes += MFI.getMaxCallFrameSize();
+
+ // Adds the SPARC subtarget-specific spill area to the stack
+ // size. Also ensures target-required alignment.
+ NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes);
+
+ // Finally, ensure that the size is sufficiently aligned for the
+ // data on the stack.
+ if (MFI.getMaxAlignment() > 0) {
+ NumBytes = alignTo(NumBytes, MFI.getMaxAlignment());
+ }
+
+ // Update stack size with corrected value.
+ MFI.setStackSize(NumBytes);
+
+ emitSPAdjustment(MF, MBB, MBBI, -NumBytes, SAVErr, SAVEri); // negative: the stack grows down
+
+ unsigned regFP = RegInfo.getDwarfRegNum(SP::I6, true);
+
+ // Emit ".cfi_def_cfa_register 30".
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Emit ".cfi_window_save".
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createWindowSave(nullptr));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ unsigned regInRA = RegInfo.getDwarfRegNum(SP::I7, true);
+ unsigned regOutRA = RegInfo.getDwarfRegNum(SP::O7, true);
+ // Emit ".cfi_register 15, 31".
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ if (NeedsStackRealignment) {
+ // andn %o6, MaxAlign-1, %o6
+ int MaxAlign = MFI.getMaxAlignment();
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1);
+ }
+}
+// Erase the call-frame pseudo at I; without a reserved call frame, first adjust %o6 by its size.
+MachineBasicBlock::iterator SparcFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+ MachineInstr &MI = *I;
+ int Size = MI.getOperand(0).getImm();
+ if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
+ Size = -Size; // ADJCALLSTACKDOWN subtracts the size; any other opcode adds it back
+
+ if (Size)
+ emitSPAdjustment(MF, MBB, I, Size, SP::ADDrr, SP::ADDri);
+ }
+ return MBB.erase(I);
+}
+
+// Insert the epilogue before the final RETL: a RESTORErr, or for leaf procedures an add to %o6.
+void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ const SparcInstrInfo &TII =
+ *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ DebugLoc dl = MBBI->getDebugLoc();
+ assert(MBBI->getOpcode() == SP::RETL &&
+ "Can only put epilog before 'retl' instruction!");
+ if (!FuncInfo->isLeafProc()) {
+ BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
+ .addReg(SP::G0);
+ return;
+ }
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ int NumBytes = (int) MFI.getStackSize();
+ if (NumBytes == 0) // the leaf prologue emitted nothing in this case, so there is nothing to undo
+ return;
+
+ emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
+}
+// Returns true exactly when the function has no variable sized stack objects.
+bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ // Reserve call frame if there are no variable sized objects on the stack.
+ return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if frame pointer elimination is disabled, stack
+// realignment is needed, there are variable sized allocas, or the frame address is taken.
+bool SparcFrameLowering::hasFP(const MachineFunction &MF) const {
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ RegInfo->needsStackRealignment(MF) ||
+ MFI.hasVarSizedObjects() ||
+ MFI.isFrameAddressTaken();
+}
+
+// Return the offset of frame index FI and set FrameReg to the register it is relative to.
+int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
+ const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SparcRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+ bool isFixed = MFI.isFixedObjectIndex(FI);
+
+ // Addressable stack objects are accessed using neg. offsets from
+ // %fp, or positive offsets from %sp.
+ bool UseFP;
+
+ // Sparc uses FP-based references in general, even when "hasFP" is
+ // false. That function is rather a misnomer, because %fp is
+ // actually always available, unless isLeafProc.
+ if (FuncInfo->isLeafProc()) {
+ // If there's a leaf proc, all offsets need to be %sp-based,
+ // because we haven't caused %fp to actually point to our frame.
+ UseFP = false;
+ } else if (isFixed) {
+ // Otherwise, argument access should always use %fp.
+ UseFP = true;
+ } else if (RegInfo->needsStackRealignment(MF)) {
+ // If there is dynamic stack realignment, all local object
+ // references need to be via %sp, to take account of the
+ // re-alignment.
+ UseFP = false;
+ } else {
+ // Finally, default to using %fp.
+ UseFP = true;
+ }
+
+ int64_t FrameOffset = MF.getFrameInfo().getObjectOffset(FI) +
+ Subtarget.getStackPointerBias();
+
+ if (UseFP) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FrameOffset; // NOTE(review): int64_t value is narrowed to the int return type
+ } else {
+ FrameReg = SP::O6; // %sp
+ return FrameOffset + MF.getFrameInfo().getStackSize(); // %sp-relative: add the whole frame size
+ }
+}
+// Returns true only if reg_nodbg_empty holds for every register in I0..I7 and L0..L7.
+static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
+{
+
+ for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
+ if (!MRI->reg_nodbg_empty(reg))
+ return false;
+
+ for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
+ if (!MRI->reg_nodbg_empty(reg))
+ return false;
+
+ return true;
+}
+// Leaf procedure test: no calls, L0 and O6 unreferenced (reg_nodbg_empty), and hasFP(MF) false.
+bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
+{
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ return !(MFI.hasCalls() // has calls
+ || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed
+ || !MRI.reg_nodbg_empty(SP::O6) // %SP is used
+ || hasFP(MF)); // need %FP
+}
+// Rewrite every referenced I register (and I register pair) and block live-in to its O counterpart.
+void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Remap %i[0-7] to %o[0-7].
+ for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
+ if (MRI.reg_nodbg_empty(reg))
+ continue;
+
+ unsigned mapped_reg = reg - SP::I0 + SP::O0;
+ assert(MRI.reg_nodbg_empty(mapped_reg));
+
+ // Replace I register with O register.
+ MRI.replaceRegWith(reg, mapped_reg);
+
+ // Also replace register pair super-registers.
+ if ((reg - SP::I0) % 2 == 0) { // only the even-numbered register of each pair triggers the pair rewrite
+ unsigned preg = (reg - SP::I0) / 2 + SP::I0_I1;
+ unsigned mapped_preg = preg - SP::I0_I1 + SP::O0_O1;
+ MRI.replaceRegWith(preg, mapped_preg);
+ }
+ }
+
+ // Rewrite MBB's Live-ins.
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB) {
+ for (unsigned reg = SP::I0_I1; reg <= SP::I6_I7; ++reg) {
+ if (!MBB->isLiveIn(reg))
+ continue;
+ MBB->removeLiveIn(reg);
+ MBB->addLiveIn(reg - SP::I0_I1 + SP::O0_O1);
+ }
+ for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
+ if (!MBB->isLiveIn(reg))
+ continue;
+ MBB->removeLiveIn(reg);
+ MBB->addLiveIn(reg - SP::I0 + SP::O0);
+ }
+ }
+
+ assert(verifyLeafProcRegUse(&MRI));
+#ifdef EXPENSIVE_CHECKS
+ MF.verify(0, "After LeafProc Remapping");
+#endif
+}
+// Run the base-class computation, then mark and remap leaf procedures unless DisableLeafProc is set.
+void SparcFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+ if (!DisableLeafProc && isLeafProc(MF)) {
+ SparcMachineFunctionInfo *MFI = MF.getInfo<SparcMachineFunctionInfo>();
+ MFI->setLeafProc(true); // recorded so that prologue, epilogue and frame-index code can query it
+
+ remapRegsForLeafProc(MF);
+ }
+
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h
new file mode 100644
index 000000000000..ac0e69ccde1e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h
@@ -0,0 +1,68 @@
+//===-- SparcFrameLowering.h - Define frame lowering for Sparc --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares SparcFrameLowering, the Sparc implementation of
+// TargetFrameLowering.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H
+#define LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H
+
+#include "Sparc.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class SparcSubtarget;
+class SparcFrameLowering : public TargetFrameLowering {
+public:
+ explicit SparcFrameLowering(const SparcSubtarget &ST);
+
+ /// emitPrologue/emitEpilogue - These methods insert prolog and epilog code
+ /// into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool hasFP(const MachineFunction &MF) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override; // also applies the leaf-procedure register remapping when enabled
+
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+
+ /// targetHandlesStackFrameRounding - Returns true if the target is
+ /// responsible for rounding up the stack frame (probably at emitPrologue
+ /// time).
+ bool targetHandlesStackFrameRounding() const override { return true; }
+
+private:
+ // Remap input registers (%i) to output registers (%o) for a leaf procedure.
+ void remapRegsForLeafProc(MachineFunction &MF) const;
+
+ // Returns true if MF is a leaf procedure.
+ bool isLeafProc(MachineFunction &MF) const;
+
+
+ // Emits code for adjusting SP in function prologue/epilogue.
+ void emitSPAdjustment(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ int NumBytes, unsigned ADDrr, unsigned ADDri) const; // ADDrr/ADDri: presumably the reg+reg and reg+imm add opcodes to use — confirm in the .cpp
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
new file mode 100644
index 000000000000..c36e75d1b076
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -0,0 +1,405 @@
+//===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SPARC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetMachine.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===--------------------------------------------------------------------===//
+/// SparcDAGToDAGISel - SPARC specific code to select SPARC machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+/// SelectionDAG instruction selector for SPARC. Select() is the per-node
+/// entry point; the SelectADDR* members implement the complex addressing
+/// patterns.
+class SparcDAGToDAGISel : public SelectionDAGISel {
+  /// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  /// It is null until runOnMachineFunction has been entered, and is refreshed
+  /// for every function that is selected.
+  const SparcSubtarget *Subtarget = nullptr;
+public:
+  explicit SparcDAGToDAGISel(SparcTargetMachine &tm) : SelectionDAGISel(tm) {}
+
+  /// Cache the subtarget of MF, then run the generic selection driver.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    Subtarget = &MF.getSubtarget<SparcSubtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
+
+  /// Select machine instructions for node N.
+  void Select(SDNode *N) override;
+
+  // Complex Pattern Selectors.
+  // SelectADDRrr matches a register+register address, SelectADDRri a
+  // register+immediate address; both return true on a match.
+  bool SelectADDRrr(SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDRri(SDValue N, SDValue &Base, SDValue &Offset);
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+
+  StringRef getPassName() const override {
+    return "SPARC DAG->DAG Pattern Instruction Selection";
+  }
+
+  // Include the pieces autogenerated from the target description.
+#include "SparcGenDAGISel.inc"
+
+private:
+  /// Returns the register node that holds the global base register.
+  SDNode* getGlobalBaseReg();
+  /// Rewrites an INLINEASM node to use register pairs where required;
+  /// returns true if N was replaced.
+  bool tryInlineAsm(SDNode *N);
+};
+}  // end anonymous namespace
+
+SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
+  // Ask the instruction info for the register that holds the global base,
+  // then wrap it in a pointer-typed register node.
+  const unsigned BaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF);
+  const auto PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
+  SDValue RegNode = CurDAG->getRegister(BaseReg, PtrVT);
+  return RegNode.getNode();
+}
+
+bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  // A bare frame index becomes <target frame index> + 0.
+  if (auto *FI = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(
+        FI->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout()));
+    Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+    return true;
+  }
+
+  switch (Addr.getOpcode()) {
+  case ISD::TargetExternalSymbol:
+  case ISD::TargetGlobalAddress:
+  case ISD::TargetGlobalTLSAddress:
+    return false;  // direct calls.
+
+  case ISD::ADD: {
+    SDValue LHS = Addr.getOperand(0);
+    SDValue RHS = Addr.getOperand(1);
+
+    // base + simm13: fold the constant into the offset field.
+    auto *Imm = dyn_cast<ConstantSDNode>(RHS);
+    if (Imm && isInt<13>(Imm->getSExtValue())) {
+      if (auto *FI = dyn_cast<FrameIndexSDNode>(LHS))
+        // Constant offset from frame ref.
+        Base = CurDAG->getTargetFrameIndex(
+            FI->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout()));
+      else
+        Base = LHS;
+      Offset = CurDAG->getTargetConstant(Imm->getZExtValue(), SDLoc(Addr),
+                                         MVT::i32);
+      return true;
+    }
+
+    // One side is a SPISD::Lo node: its operand becomes the offset and the
+    // other side becomes the base.
+    if (LHS.getOpcode() == SPISD::Lo) {
+      Base = RHS;
+      Offset = LHS.getOperand(0);
+      return true;
+    }
+    if (RHS.getOpcode() == SPISD::Lo) {
+      Base = LHS;
+      Offset = RHS.getOperand(0);
+      return true;
+    }
+    break;
+  }
+
+  default:
+    break;
+  }
+
+  // Anything else is used as the base with a zero offset.
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+  return true;
+}
+
+bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
+  switch (Addr.getOpcode()) {
+  case ISD::FrameIndex:
+    return false;
+  case ISD::TargetExternalSymbol:
+  case ISD::TargetGlobalAddress:
+  case ISD::TargetGlobalTLSAddress:
+    return false;  // direct calls.
+
+  case ISD::ADD: {
+    SDValue LHS = Addr.getOperand(0);
+    SDValue RHS = Addr.getOperand(1);
+
+    // Let the reg+imm pattern catch a simm13 addend or a SPISD::Lo operand.
+    if (auto *Imm = dyn_cast<ConstantSDNode>(RHS))
+      if (isInt<13>(Imm->getSExtValue()))
+        return false;
+    if (LHS.getOpcode() == SPISD::Lo || RHS.getOpcode() == SPISD::Lo)
+      return false;
+
+    R1 = LHS;
+    R2 = RHS;
+    return true;
+  }
+
+  default:
+    break;
+  }
+
+  // Not an add: pair the address with %g0 as the second register.
+  R1 = Addr;
+  R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy(CurDAG->getDataLayout()));
+  return true;
+}
+
+
+// Re-assemble i64 arguments split up in SelectionDAGBuilder's
+// visitInlineAsm / GetRegistersForValue functions.
+// Returns true if N was replaced by a rewritten INLINEASM node.
+// Note: This function was copied from, and is essentially identical
+// to ARMISelDAGToDAG::SelectInlineAsm. It is very unfortunate that
+// such hacking-up is necessary; a rethink of how inline asm operands
+// are handled may be in order to make doing this more sane.
+//
+// TODO: fix inline asm support so I can simply tell it that 'i64'
+// inputs to asm need to be allocated to the IntPair register type,
+// and have that work. Then, delete this function.
+bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
+ std::vector<SDValue> AsmNodeOperands; // operand list for the replacement node
+ unsigned Flag, Kind; // flag word and kind of the operand group being visited
+ bool Changed = false; // set once any operand group has been rewritten
+ unsigned NumOps = N->getNumOperands();
+
+ // Normally, i64 data is bound to two arbitrary GPRs for "%r"
+ // constraint. However, some instructions (e.g. ldd/std) require
+ // (even/even+1) GPRs.
+
+ // So, here, we check for this case, and mutate the inlineasm to use
+ // a single IntPair register instead, which guarantees such even/odd
+ // placement.
+
+ SDLoc dl(N);
+ SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
+ : SDValue(nullptr,0);
+
+ SmallVector<bool, 8> OpChanged; // one entry per flag word that has registers
+ // The glue operand, if present, is appended after the loop.
+ for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) {
+ SDValue op = N->getOperand(i);
+ AsmNodeOperands.push_back(op);
+
+ if (i < InlineAsm::Op_FirstOperand) // leading fixed operands: copy as-is
+ continue;
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
+ Flag = C->getZExtValue();
+ Kind = InlineAsm::getKind(Flag);
+ }
+ else
+ continue; // not a constant, so not treated as a flag word
+
+ // Immediate operands to inline asm in the SelectionDAG are modeled with
+ // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+ // the second is a constant with the value of the immediate. If we get here
+ // and we have a Kind_Imm, skip the next operand, and continue.
+ if (Kind == InlineAsm::Kind_Imm) {
+ SDValue op = N->getOperand(++i);
+ AsmNodeOperands.push_back(op);
+ continue;
+ }
+
+ unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+ if (NumRegs)
+ OpChanged.push_back(false); // flipped to true below if rewritten
+
+ unsigned DefIdx = 0;
+ bool IsTiedToChangedOp = false;
+ // If it's a use that is tied with a previous def, it has no
+ // reg class constraint.
+ if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
+ IsTiedToChangedOp = OpChanged[DefIdx];
+
+ if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
+ && Kind != InlineAsm::Kind_RegDefEarlyClobber)
+ continue;
+
+ unsigned RC;
+ bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
+ if ((!IsTiedToChangedOp && (!HasRC || RC != SP::IntRegsRegClassID))
+ || NumRegs != 2)
+ continue; // only two-register IntRegs groups (or uses tied to one) are rewritten
+
+ assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
+ SDValue V0 = N->getOperand(i+1);
+ SDValue V1 = N->getOperand(i+2);
+ unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
+ unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
+ SDValue PairedReg;
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ if (Kind == InlineAsm::Kind_RegDef ||
+ Kind == InlineAsm::Kind_RegDefEarlyClobber) {
+ // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
+ // the original GPRs.
+
+ unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
+ PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
+ SDValue Chain = SDValue(N,0);
+
+ SDNode *GU = N->getGluedUser(); // NOTE(review): used below without a null check — confirm a glued user always exists here
+ SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::v2i32,
+ Chain.getValue(1));
+
+ // Extract values from a GPRPair reg and copy to the original GPR reg.
+ SDValue Sub0 = CurDAG->getTargetExtractSubreg(SP::sub_even, dl, MVT::i32,
+ RegCopy);
+ SDValue Sub1 = CurDAG->getTargetExtractSubreg(SP::sub_odd, dl, MVT::i32,
+ RegCopy);
+ SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0,
+ RegCopy.getValue(1));
+ SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1));
+
+ // Update the original glue user: keep all its operands except the last and
+ // append the glue result of the final copy instead.
+ std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
+ Ops.push_back(T1.getValue(1));
+ CurDAG->UpdateNodeOperands(GU, Ops);
+ }
+ else {
+ // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a
+ // GPRPair and then pass the GPRPair to the inline asm.
+ SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain];
+
+ // As REG_SEQ doesn't take RegisterSDNode, we copy them first.
+ SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32,
+ Chain.getValue(1));
+ SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32,
+ T0.getValue(1));
+ SDValue Pair = SDValue(
+ CurDAG->getMachineNode(
+ TargetOpcode::REG_SEQUENCE, dl, MVT::v2i32,
+ {
+ CurDAG->getTargetConstant(SP::IntPairRegClassID, dl,
+ MVT::i32),
+ T0,
+ CurDAG->getTargetConstant(SP::sub_even, dl, MVT::i32),
+ T1,
+ CurDAG->getTargetConstant(SP::sub_odd, dl, MVT::i32),
+ }),
+ 0);
+
+ // Copy REG_SEQ into a GPRPair-typed VR and replace the original two
+ // i32 VRs of inline asm with it.
+ unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
+ PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
+ Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
+
+ // The new copy becomes the input chain and glue of the rewritten asm node.
+ AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+ Glue = Chain.getValue(1);
+ }
+
+ Changed = true;
+
+ if(PairedReg.getNode()) {
+ OpChanged[OpChanged.size() -1 ] = true;
+ Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
+ if (IsTiedToChangedOp)
+ Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
+ else
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, SP::IntPairRegClassID);
+ // Replace the current flag (the last operand pushed so far).
+ AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
+ Flag, dl, MVT::i32);
+ // Add the new register node and skip the original two GPRs.
+ AsmNodeOperands.push_back(PairedReg);
+ // Skip the next two GPRs.
+ i += 2;
+ }
+ }
+
+ if (Glue.getNode())
+ AsmNodeOperands.push_back(Glue);
+ if (!Changed)
+ return false; // nothing was rewritten; N is left untouched
+
+ SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
+ CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
+ New->setNodeId(-1);
+ ReplaceNode(N, New.getNode());
+ return true;
+}
+
+/// Select - Pick machine instructions for node N.
+///
+/// Inline asm, the global base register and 32-bit SDIV/UDIV are handled by
+/// hand here; every other node is passed to the autogenerated SelectCode().
+void SparcDAGToDAGISel::Select(SDNode *N) {
+  SDLoc dl(N);
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return;   // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::INLINEASM: {
+    // Falls through to SelectCode() when the node did not need rewriting.
+    if (tryInlineAsm(N))
+      return;
+    break;
+  }
+  case SPISD::GLOBAL_BASE_REG:
+    ReplaceNode(N, getGlobalBaseReg());
+    return;
+
+  case ISD::SDIV:
+  case ISD::UDIV: {
+    // sdivx / udivx handle 64-bit divides.
+    if (N->getValueType(0) == MVT::i64)
+      break;
+    // FIXME: should use a custom expander to expose the SRA to the dag.
+    SDValue DivLHS = N->getOperand(0);
+    SDValue DivRHS = N->getOperand(1);
+
+    // Set the Y register to the high-part: the dividend shifted right
+    // arithmetically by 31 for SDIV, %g0 for UDIV.
+    SDValue TopPart;
+    if (N->getOpcode() == ISD::SDIV) {
+      TopPart = SDValue(CurDAG->getMachineNode(SP::SRAri, dl, MVT::i32, DivLHS,
+                                   CurDAG->getTargetConstant(31, dl, MVT::i32)),
+                        0);
+    } else {
+      TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
+    }
+    // Only the glue result of the copy is kept, so the divide stays attached
+    // to the write of %y.
+    TopPart = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SP::Y, TopPart,
+                                   SDValue())
+                  .getValue(1);
+
+    // FIXME: Handle div by immediate.
+    unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
+    // SDIV is a hardware erratum on some LEON2 processors. Replace it with
+    // SDIVcc here. Query the subtarget cached for the current function rather
+    // than the target machine's default one, so the decision follows the
+    // function being compiled.
+    if (Opcode == SP::SDIVrr && Subtarget->performSDIVReplace())
+      Opcode = SP::SDIVCCrr;
+    CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS, TopPart);
+    return;
+  }
+  }
+
+  SelectCode(N);
+}
+
+
+/// SelectInlineAsmMemoryOperand - Pick an addressing mode for a memory
+/// operand of an inline asm expression. Returns true for constraints this
+/// target does not handle; otherwise appends the two address operands to
+/// OutOps and returns false.
+bool
+SparcDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                unsigned ConstraintID,
+                                                std::vector<SDValue> &OutOps) {
+  const bool IsHandled = ConstraintID == InlineAsm::Constraint_i ||
+                         ConstraintID == InlineAsm::Constraint_o ||
+                         ConstraintID == InlineAsm::Constraint_m; // memory
+  if (!IsHandled)
+    return true;
+
+  // Prefer reg+reg; fall back to reg+imm when that does not match.
+  SDValue First, Second;
+  if (!SelectADDRrr(Op, First, Second))
+    SelectADDRri(Op, First, Second);
+
+  OutOps.push_back(First);
+  OutOps.push_back(Second);
+  return false;
+}
+
+/// createSparcISelDag - Factory for the pass that turns a legalized DAG into
+/// a SPARC-specific DAG, ready for instruction scheduling. The caller
+/// receives a newly allocated pass object.
+///
+FunctionPass *llvm::createSparcISelDag(SparcTargetMachine &TM) {
+  FunctionPass *ISelPass = new SparcDAGToDAGISel(TM);
+  return ISelPass;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
new file mode 100644
index 000000000000..2ac9aae2471b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -0,0 +1,3574 @@
+//===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcISelLowering.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcRegisterInfo.h"
+#include "SparcTargetMachine.h"
+#include "SparcTargetObjectFile.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
+                                 MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                 ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  // This handler must only be invoked for sret arguments.
+  assert (ArgFlags.isSRet());
+
+  // Record the sret argument as a custom memory location at offset 0.
+  const unsigned SRetOffset = 0;
+  CCValAssign SRetLoc =
+      CCValAssign::getCustomMem(ValNo, ValVT, SRetOffset, LocVT, LocInfo);
+  State.addLoc(SRetLoc);
+  return true;
+}
+
+static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT,
+                                     MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                     ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  static const MCPhysReg RegList[] = {
+    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+  };
+
+  // First half: without a free register, the whole value goes on the stack
+  // (8 bytes, 4-byte aligned) and we are done.
+  unsigned FirstReg = State.AllocateReg(RegList);
+  if (!FirstReg) {
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8,4),
+                                           LocVT, LocInfo));
+    return true;
+  }
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, FirstReg, LocVT,
+                                         LocInfo));
+
+  // Second half: use another register if one is left, otherwise a 4-byte
+  // stack slot.
+  unsigned SecondReg = State.AllocateReg(RegList);
+  if (SecondReg)
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, SecondReg, LocVT,
+                                           LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4,4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT,
+                                         MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                         ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  static const MCPhysReg RegList[] = {
+    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+  };
+
+  // A split 64-bit return value occupies two registers. Report failure as
+  // soon as one half cannot be given a register.
+  for (unsigned Half = 0; Half != 2; ++Half) {
+    unsigned Reg = State.AllocateReg(RegList);
+    if (!Reg)
+      return false;
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  }
+
+  return true;
+}
+
+// Allocate a full-sized argument for the 64-bit ABI.
+static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
+                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert((LocVT == MVT::f32 || LocVT == MVT::f128
+          || LocVT.getSizeInBits() == 64) &&
+         "Can't handle non-64 bits locations");
+
+  // Stack space is allocated for all arguments starting from [%fp+BIAS+128].
+  // An f128 takes a 16-byte, 16-byte-aligned slot; everything else takes an
+  // 8-byte, 8-byte-aligned slot.
+  const unsigned SlotBytes = (LocVT == MVT::f128) ? 16 : 8;
+  unsigned Offset = State.AllocateStack(SlotBytes, SlotBytes);
+
+  // Derive the register, if any, from the type and the slot offset.
+  unsigned Reg = 0;
+  switch (LocVT.SimpleTy) {
+  case MVT::i64:
+    // Promote integers to %i0-%i5.
+    if (Offset < 6*8)
+      Reg = SP::I0 + Offset/8;
+    break;
+  case MVT::f64:
+    // Promote doubles to %d0-%d30. (Which LLVM calls D0-D15).
+    if (Offset < 16*8)
+      Reg = SP::D0 + Offset/8;
+    break;
+  case MVT::f32:
+    // Promote floats to %f1, %f3, ...
+    if (Offset < 16*8)
+      Reg = SP::F1 + Offset/4;
+    break;
+  case MVT::f128:
+    // Promote long doubles to %q0-%q28. (Which LLVM calls Q0-Q7).
+    if (Offset < 16*8)
+      Reg = SP::Q0 + Offset/16;
+    break;
+  default:
+    break;
+  }
+
+  if (!Reg) {
+    // This argument goes on the stack in an 8-byte slot.
+    // When passing floats, LocVT is smaller than 8 bytes. Adjust the offset
+    // to the right-aligned float. The first 4 bytes of the stack slot are
+    // undefined.
+    if (LocVT == MVT::f32)
+      Offset += 4;
+
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return true;
+  }
+
+  // Promote to the register chosen above.
+  State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  return true;
+}
+
+// Allocate a half-sized argument for the 64-bit ABI.
+//
+// This is used when passing { float, int } structs by value in registers.
+static bool CC_Sparc64_Half(unsigned &ValNo, MVT &ValVT,
+                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(LocVT.getSizeInBits() == 32 && "Can't handle non-32 bits locations");
+  const unsigned Offset = State.AllocateStack(4, 4);
+
+  if (LocVT == MVT::f32 && Offset < 16*8) {
+    // Promote floats to %f0-%f31.
+    const unsigned FloatReg = SP::F0 + Offset/4;
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, FloatReg, LocVT, LocInfo));
+  } else if (LocVT == MVT::i32 && Offset < 6*8) {
+    // Promote integers to %i0-%i5, using half the register.
+    const unsigned IntReg = SP::I0 + Offset/8;
+    LocVT = MVT::i64;
+    LocInfo = CCValAssign::AExt;
+
+    // Set the Custom bit if this i32 goes in the high bits of a register.
+    const bool InHighBits = Offset % 8 == 0;
+    State.addLoc(InHighBits
+                     ? CCValAssign::getCustomReg(ValNo, ValVT, IntReg, LocVT,
+                                                 LocInfo)
+                     : CCValAssign::getReg(ValNo, ValVT, IntReg, LocVT,
+                                           LocInfo));
+  } else {
+    // No register applies: the value stays in its stack slot.
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  }
+  return true;
+}
+
+#include "SparcGenCallingConv.inc"
+
+// SparcCallingConv.td describes the calling conventions from the point of
+// view of the callee's register window. Map a register to the %o register
+// the caller uses for it; registers outside %i0-%i7 are returned unchanged.
+static unsigned toCallerWindow(unsigned Reg) {
+  static_assert(SP::I0 + 7 == SP::I7 && SP::O0 + 7 == SP::O7,
+                "Unexpected enum");
+  const bool IsInputReg = Reg >= SP::I0 && Reg <= SP::I7;
+  return IsInputReg ? Reg - SP::I0 + SP::O0 : Reg;
+}
+
+SDValue
+SparcTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                 bool IsVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 const SDLoc &DL, SelectionDAG &DAG) const {
+  // Forward to the ABI-specific implementation for this subtarget.
+  return Subtarget->is64Bit()
+             ? LowerReturn_64(Chain, CallConv, IsVarArg, Outs, OutVals, DL,
+                              DAG)
+             : LowerReturn_32(Chain, CallConv, IsVarArg, Outs, OutVals, DL,
+                              DAG);
+}
+
+SDValue
+SparcTargetLowering::LowerReturn_32(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ // Lower a return for the 32-bit ABI into a SPISD::RET_FLAG node.
+ // CCValAssign - represent the assignment of the return value to locations.
+ SmallVector<CCValAssign, 16> RVLocs;
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+
+ // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
+
+ SDValue Flag;
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+ // Make room for the return address offset (filled in as RetOps[1] below).
+ RetOps.push_back(SDValue());
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0, realRVLocIdx = 0; // i indexes RVLocs, realRVLocIdx indexes OutVals
+ i != RVLocs.size();
+ ++i, ++realRVLocIdx) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ SDValue Arg = OutVals[realRVLocIdx];
+
+ if (VA.needsCustom()) {
+ assert(VA.getLocVT() == MVT::v2i32);
+ // Legalize ret v2i32 -> ret 2 x i32 (Basically: do what would
+ // happen by default if this wasn't a legal type)
+ // One OutVals entry is spread over two consecutive RVLocs entries here.
+ SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ Arg,
+ DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ Arg,
+ DAG.getConstant(1, DL, getVectorIdxTy(DAG.getDataLayout())));
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part0, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ VA = RVLocs[++i]; // skip ahead to next loc. NOTE(review): VA is a reference, so this copies the next loc over the current RVLocs entry
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part1,
+ Flag);
+ } else
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+
+ // Guarantee that all emitted copies are stuck together with flags.
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ unsigned RetAddrOffset = 8; // Call Inst + Delay Slot
+ // If the function returns a struct, copy the SRetReturnReg to I0
+ if (MF.getFunction()->hasStructRetAttr()) {
+ SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
+ unsigned Reg = SFI->getSRetReturnReg();
+ if (!Reg)
+ llvm_unreachable("sret virtual register not created in the entry block");
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, PtrVT);
+ Chain = DAG.getCopyToReg(Chain, DL, SP::I0, Val, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(SP::I0, PtrVT));
+ RetAddrOffset = 12; // CallInst + Delay Slot + Unimp
+ }
+
+ RetOps[0] = Chain; // Update chain.
+ RetOps[1] = DAG.getConstant(RetAddrOffset, DL, MVT::i32); // fill the slot reserved above
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+// Lower return values for the 64-bit ABI.
+// Return values are passed in exactly the same way as function arguments.
+SDValue
+SparcTargetLowering::LowerReturn_64(SDValue Chain, CallingConv::ID CallConv,
+                                    bool IsVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SDLoc &DL, SelectionDAG &DAG) const {
+  // Let the calling-convention tables assign a location to every return
+  // value.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_Sparc64);
+
+  // Operand 0 is the chain (patched after the copies are built). Operand 1 is
+  // the return address offset; the return address is always %i7+8 with the
+  // 64-bit ABI.
+  SmallVector<SDValue, 4> RetOps;
+  RetOps.push_back(Chain);
+  RetOps.push_back(DAG.getConstant(8, DL, MVT::i32));
+
+  // Copy the result values into the output registers.
+  SDValue Glue;
+  for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
+    CCValAssign &VA = RVLocs[Idx];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    const unsigned RetReg = VA.getLocReg();
+    SDValue Val = OutVals[Idx];
+
+    // Integer return values must be sign or zero extended by the callee.
+    // DELETED_NODE stands for "no extension needed".
+    unsigned ExtOpc = ISD::DELETED_NODE;
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      ExtOpc = ISD::SIGN_EXTEND;
+      break;
+    case CCValAssign::ZExt:
+      ExtOpc = ISD::ZERO_EXTEND;
+      break;
+    case CCValAssign::AExt:
+      ExtOpc = ISD::ANY_EXTEND;
+      break;
+    default:
+      llvm_unreachable("Unknown loc info!");
+    }
+    if (ExtOpc != ISD::DELETED_NODE)
+      Val = DAG.getNode(ExtOpc, DL, VA.getLocVT(), Val);
+
+    // The custom bit on an i32 return value indicates that it should be
+    // passed in the high bits of the register.
+    if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
+      SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i32);
+      Val = DAG.getNode(ISD::SHL, DL, MVT::i64, Val, ShiftAmt);
+
+      // The next value may go in the low bits of the same register.
+      // Handle both at once.
+      const unsigned Next = Idx + 1;
+      if (Next < NumLocs && RVLocs[Next].getLocReg() == RetReg) {
+        SDValue LowBits =
+            DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, OutVals[Next]);
+        Val = DAG.getNode(ISD::OR, DL, MVT::i64, Val, LowBits);
+        // Skip the next value, it's already done.
+        Idx = Next;
+      }
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, RetReg, Val, Glue);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(RetReg, VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Glue.getNode())
+    RetOps.push_back(Glue);
+
+  return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+SDValue SparcTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  // Forward to the ABI-specific implementation for this subtarget.
+  return Subtarget->is64Bit()
+             ? LowerFormalArguments_64(Chain, CallConv, IsVarArg, Ins, DL, DAG,
+                                       InVals)
+             : LowerFormalArguments_32(Chain, CallConv, IsVarArg, Ins, DL, DAG,
+                                       InVals);
+}
+
+/// LowerFormalArguments32 - V8 uses a very simple ABI, where all values are
+/// passed in either one or two GPRs, including FP values. TODO: we should
+/// pass FP values in FP registers for fastcc functions.
+SDValue SparcTargetLowering::LowerFormalArguments_32(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
+
+ const unsigned StackOffset = 92;
+ bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
+
+ unsigned InIdx = 0;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) {
+ CCValAssign &VA = ArgLocs[i];
+
+ if (Ins[InIdx].Flags.isSRet()) {
+ if (InIdx != 0)
+ report_fatal_error("sparc only supports sret on the first parameter");
+ // Get SRet from [%fp+64].
+ int FrameIdx = MF.getFrameInfo().CreateFixedObject(4, 64, true);
+ SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+ SDValue Arg =
+ DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
+ InVals.push_back(Arg);
+ continue;
+ }
+
+ if (VA.isRegLoc()) {
+ if (VA.needsCustom()) {
+ assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
+
+ unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi);
+ SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
+
+ assert(i+1 < e);
+ CCValAssign &NextVA = ArgLocs[++i];
+
+ SDValue LoVal;
+ if (NextVA.isMemLoc()) {
+ int FrameIdx = MF.getFrameInfo().
+ CreateFixedObject(4, StackOffset+NextVA.getLocMemOffset(),true);
+ SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+ LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
+ } else {
+ unsigned loReg = MF.addLiveIn(NextVA.getLocReg(),
+ &SP::IntRegsRegClass);
+ LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32);
+ }
+
+ if (IsLittleEndian)
+ std::swap(LoVal, HiVal);
+
+ SDValue WholeValue =
+ DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+ WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), WholeValue);
+ InVals.push_back(WholeValue);
+ continue;
+ }
+ unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ MF.getRegInfo().addLiveIn(VA.getLocReg(), VReg);
+ SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+ if (VA.getLocVT() == MVT::f32)
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Arg);
+ else if (VA.getLocVT() != MVT::i32) {
+ Arg = DAG.getNode(ISD::AssertSext, dl, MVT::i32, Arg,
+ DAG.getValueType(VA.getLocVT()));
+ Arg = DAG.getNode(ISD::TRUNCATE, dl, VA.getLocVT(), Arg);
+ }
+ InVals.push_back(Arg);
+ continue;
+ }
+
+ assert(VA.isMemLoc());
+
+ unsigned Offset = VA.getLocMemOffset()+StackOffset;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (VA.needsCustom()) {
+ assert(VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::v2i32);
+ // If it is double-word aligned, just load.
+ if (Offset % 8 == 0) {
+ int FI = MF.getFrameInfo().CreateFixedObject(8,
+ Offset,
+ true);
+ SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Load =
+ DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
+ InVals.push_back(Load);
+ continue;
+ }
+
+ int FI = MF.getFrameInfo().CreateFixedObject(4,
+ Offset,
+ true);
+ SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
+ SDValue HiVal =
+ DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
+ int FI2 = MF.getFrameInfo().CreateFixedObject(4,
+ Offset+4,
+ true);
+ SDValue FIPtr2 = DAG.getFrameIndex(FI2, PtrVT);
+
+ SDValue LoVal =
+ DAG.getLoad(MVT::i32, dl, Chain, FIPtr2, MachinePointerInfo());
+
+ if (IsLittleEndian)
+ std::swap(LoVal, HiVal);
+
+ SDValue WholeValue =
+ DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+ WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), WholeValue);
+ InVals.push_back(WholeValue);
+ continue;
+ }
+
+ int FI = MF.getFrameInfo().CreateFixedObject(4,
+ Offset,
+ true);
+ SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Load ;
+ if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) {
+ Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
+ } else if (VA.getValVT() == MVT::f128) {
+ report_fatal_error("SPARCv8 does not handle f128 in calls; "
+ "pass indirectly");
+ } else {
+ // We shouldn't see any other value types here.
+ llvm_unreachable("Unexpected ValVT encountered in frame lowering.");
+ }
+ InVals.push_back(Load);
+ }
+
+ if (MF.getFunction()->hasStructRetAttr()) {
+ // Copy the SRet Argument to SRetReturnReg.
+ SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
+ unsigned Reg = SFI->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(&SP::IntRegsRegClass);
+ SFI->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ }
+
+ // Store remaining ArgRegs to the stack if this is a varargs function.
+ if (isVarArg) {
+ static const MCPhysReg ArgRegs[] = {
+ SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+ };
+ unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs);
+ const MCPhysReg *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6;
+ unsigned ArgOffset = CCInfo.getNextStackOffset();
+ if (NumAllocated == 6)
+ ArgOffset += StackOffset;
+ else {
+ assert(!ArgOffset);
+ ArgOffset = 68+4*NumAllocated;
+ }
+
+ // Remember the vararg offset for the va_start implementation.
+ FuncInfo->setVarArgsFrameOffset(ArgOffset);
+
+ std::vector<SDValue> OutChains;
+
+ for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
+ unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
+ SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32);
+
+ int FrameIdx = MF.getFrameInfo().CreateFixedObject(4, ArgOffset,
+ true);
+ SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+
+ OutChains.push_back(
+ DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr, MachinePointerInfo()));
+ ArgOffset += 4;
+ }
+
+ if (!OutChains.empty()) {
+ OutChains.push_back(Chain);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ }
+ }
+
+ return Chain;
+}
+
+/* Lower formal arguments for the 64 bit ABI.
+ *
+ * Assigns a location to every incoming argument with CC_Sparc64 and appends
+ * one SDValue per location to InVals. Register arguments are read through a
+ * new live-in virtual register and then shifted, asserted and truncated as
+ * their location requires; stack arguments are loaded from fixed frame objects
+ * placed at the assigned offset plus ArgArea. For variadic functions the
+ * va_start frame offset is recorded and the argument registers from the next
+ * free slot up to the sixth are stored into their slots in the argument area.
+ * Returns the chain; for variadic functions it is replaced by a TokenFactor of
+ * those stores when any were emitted.
+ */
+SDValue SparcTargetLowering::LowerFormalArguments_64(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Analyze arguments according to CC_Sparc64.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc64);
+
+ // The argument array begins at %fp+BIAS+128, after the register save area.
+ const unsigned ArgArea = 128;
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc()) {
+ // This argument is passed in a register.
+ // All integer register arguments are promoted by the caller to i64.
+
+ // Create a virtual register for the promoted live-in value.
+ unsigned VReg = MF.addLiveIn(VA.getLocReg(),
+ getRegClassFor(VA.getLocVT()));
+ SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
+
+ // Get the high bits for i32 struct elements.
+ if (VA.getValVT() == MVT::i32 && VA.needsCustom())
+ Arg = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Arg,
+ DAG.getConstant(32, DL, MVT::i32));
+
+ // The caller promoted the argument, so insert an Assert?ext SDNode so we
+ // won't promote the value again in this function.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
+ DAG.getValueType(VA.getValVT()));
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
+ DAG.getValueType(VA.getValVT()));
+ break;
+ default:
+ break;
+ }
+
+ // Truncate the register down to the argument type.
+ if (VA.isExtInLoc())
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
+
+ InVals.push_back(Arg);
+ continue;
+ }
+
+ // The registers are exhausted. This argument was passed on the stack.
+ assert(VA.isMemLoc());
+ // The CC_Sparc64_Full/Half functions compute stack offsets relative to the
+ // beginning of the arguments area at %fp+BIAS+128.
+ unsigned Offset = VA.getLocMemOffset() + ArgArea;
+ unsigned ValSize = VA.getValVT().getSizeInBits() / 8; // value size in bytes
+ // Adjust offset for extended arguments, SPARC is big-endian.
+ // The caller will have written the full slot with extended bytes, but we
+ // prefer our own extending loads.
+ if (VA.isExtInLoc())
+ Offset += 8 - ValSize; // address the last ValSize bytes of the 8-byte slot
+ int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
+ InVals.push_back(
+ DAG.getLoad(VA.getValVT(), DL, Chain,
+ DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
+ MachinePointerInfo::getFixedStack(MF, FI)));
+ }
+
+ if (!IsVarArg)
+ return Chain; // nothing further is needed for a non-variadic function
+
+ // This function takes variable arguments, some of which may have been passed
+ // in registers %i0-%i5. Variable floating point arguments are never passed
+ // in floating point registers. They go on %i0-%i5 or on the stack like
+ // integer arguments.
+ //
+ // The va_start intrinsic needs to know the offset to the first variable
+ // argument.
+ unsigned ArgOffset = CCInfo.getNextStackOffset();
+ SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+ // Skip the 128 bytes of register save area.
+ FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgArea +
+ Subtarget->getStackPointerBias());
+
+ // Save the variable arguments that were passed in registers.
+ // The caller is required to reserve stack space for 6 arguments regardless
+ // of how many arguments were actually passed.
+ SmallVector<SDValue, 8> OutChains;
+ for (; ArgOffset < 6*8; ArgOffset += 8) { // one 8-byte slot per register
+ unsigned VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass);
+ SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+ int FI = MF.getFrameInfo().CreateFixedObject(8, ArgOffset + ArgArea, true);
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ OutChains.push_back(
+ DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT),
+ MachinePointerInfo::getFixedStack(MF, FI)));
+ }
+
+ if (!OutChains.empty()) // merge the register stores into a single chain
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+
+ return Chain;
+}
+
+// Entry point for call lowering: hands the call to the 64-bit routine when
+// the subtarget reports is64Bit(), and to the 32-bit routine otherwise.
+SDValue
+SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                               SmallVectorImpl<SDValue> &InVals) const {
+  return Subtarget->is64Bit() ? LowerCall_64(CLI, InVals)
+                              : LowerCall_32(CLI, InVals);
+}
+
+// Reports whether the call target carries the ReturnsTwice attribute.
+// When a call site is supplied the answer is taken from it directly.
+// Otherwise the callee is resolved from the Callee node: a global address
+// yields its global (if it is a function), an external symbol is looked up by
+// name in the module of the function being compiled.
+static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
+                                ImmutableCallSite *CS) {
+  if (CS)
+    return CS->hasFnAttr(Attribute::ReturnsTwice);
+
+  const Function *Target = nullptr;
+  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Callee))
+    Target = dyn_cast<Function>(GA->getGlobal());
+  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Target = DAG.getMachineFunction().getFunction()->getParent()->getFunction(
+        ES->getSymbol());
+
+  // With no resolvable function there is no attribute to report.
+  return Target && Target->hasFnAttribute(Attribute::ReturnsTwice);
+}
+
+/* Lower a call for the 32-bit ABI.
+ *
+ * Assigns locations to the outgoing operands with CC_Sparc32, copies byval
+ * arguments into fresh stack objects, then emits CALLSEQ_START, the argument
+ * stores and register copies, the SPISD::CALL node and CALLSEQ_END. Results
+ * are located with RetCC_Sparc32, copied out of their registers and appended
+ * to InVals. Tail calls are always disabled (CLI.IsTailCall is cleared).
+ * Returns the chain that follows the result copies.
+ */
+SDValue
+SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
+
+ // Sparc target does not yet support tail call optimization.
+ isTailCall = false;
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
+
+ // Get the size of the outgoing arguments stack space requirement.
+ unsigned ArgsSize = CCInfo.getNextStackOffset();
+
+ // Keep stack frames 8-byte aligned.
+ ArgsSize = (ArgsSize+7) & ~7;
+
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+ // Create local copies for byval args.
+ SmallVector<SDValue, 8> ByValArgs;
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (!Flags.isByVal())
+ continue;
+
+ SDValue Arg = OutVals[i];
+ unsigned Size = Flags.getByValSize();
+ unsigned Align = Flags.getByValAlign();
+
+ if (Size > 0U) {
+ int FI = MFI.CreateStackObject(Size, Align, false);
+ SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32);
+
+ Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
+ false, // isVolatile,
+ (Size <= 32), // AlwaysInline if size <= 32,
+ false, // isTailCall
+ MachinePointerInfo(), MachinePointerInfo());
+ ByValArgs.push_back(FIPtr);
+ }
+ else {
+ SDValue nullVal; // empty SDValue marks a zero-sized byval; skipped below
+ ByValArgs.push_back(nullVal);
+ }
+ }
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
+ dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ const unsigned StackOffset = 92; // added to every assigned stack offset below
+ bool hasStructRetAttr = false;
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, realArgIdx = 0, byvalArgIdx = 0, e = ArgLocs.size();
+ i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+ // Use local copy if it is a byval arg.
+ if (Flags.isByVal()) {
+ Arg = ByValArgs[byvalArgIdx++];
+ if (!Arg) {
+ continue;
+ }
+ }
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (Flags.isSRet()) {
+ assert(VA.needsCustom());
+ // store SRet argument in %sp+64
+ SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+ SDValue PtrOff = DAG.getIntPtrConstant(64, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+ hasStructRetAttr = true;
+ continue;
+ }
+
+ if (VA.needsCustom()) {
+ assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
+
+ if (VA.isMemLoc()) {
+ unsigned Offset = VA.getLocMemOffset() + StackOffset;
+ // if it is double-word aligned, just store.
+ if (Offset % 8 == 0) {
+ SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+ continue;
+ }
+ }
+
+ if (VA.getLocVT() == MVT::f64) {
+ // Move the float value from float registers into the
+ // integer registers.
+
+ // TODO: The f64 -> v2i32 conversion is super-inefficient for
+ // constants: it sticks them in the constant pool, then loads
+ // to a fp register, then stores to temp memory, then loads to
+ // integer registers.
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Arg);
+ }
+
+ SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ Arg,
+ DAG.getConstant(0, dl, getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ Arg,
+ DAG.getConstant(1, dl, getVectorIdxTy(DAG.getDataLayout())));
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Part0));
+ assert(i+1 != e);
+ CCValAssign &NextVA = ArgLocs[++i]; // second half has its own location
+ if (NextVA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Part1));
+ } else {
+ // Store the second part in stack.
+ unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
+ SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
+ }
+ } else {
+ unsigned Offset = VA.getLocMemOffset() + StackOffset;
+ // Store the first part.
+ SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Part0, PtrOff, MachinePointerInfo()));
+ // Store the second part.
+ PtrOff = DAG.getIntPtrConstant(Offset + 4, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
+ }
+ continue;
+ }
+
+ // Arguments that can be passed in a register are collected in the
+ // RegsToPass vector.
+ if (VA.isRegLoc()) {
+ if (VA.getLocVT() != MVT::f32) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ continue;
+ }
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); // f32 is passed as i32 bits
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ continue;
+ }
+
+ assert(VA.isMemLoc());
+
+ // Create a store off the stack pointer for this argument.
+ SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+ SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + StackOffset,
+ dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+ }
+
+
+ // Emit all stores, make sure they occur before any copies into physregs.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token
+ // chain and flag operands which copy the outgoing args into registers.
+ // The InFlag is necessary since all emitted instructions must be
+ // stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ unsigned Reg = toCallerWindow(RegsToPass[i].first);
+ Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ unsigned SRetArgSize = (hasStructRetAttr)? getSRetArgSize(DAG, Callee):0;
+ bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32, 0, TF);
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32, TF);
+
+ // Returns a chain & a flag for retval copy to use
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+ if (hasStructRetAttr) // the sret size operand is only present for sret calls
+ Ops.push_back(DAG.getTargetConstant(SRetArgSize, dl, MVT::i32));
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(toCallerWindow(RegsToPass[i].first),
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask =
+ ((hasReturnsTwice)
+ ? TRI->getRTCallPreservedMask(CallConv)
+ : TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv));
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ InFlag = Chain.getValue(1);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+
+ RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ if (RVLocs[i].getLocVT() == MVT::v2i32) {
+ SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2i32);
+ SDValue Lo = DAG.getCopyFromReg(
+ Chain, dl, toCallerWindow(RVLocs[i++].getLocReg()), MVT::i32, InFlag); // i++: a v2i32 result uses two location entries
+ Chain = Lo.getValue(1);
+ InFlag = Lo.getValue(2);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Lo,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Hi = DAG.getCopyFromReg(
+ Chain, dl, toCallerWindow(RVLocs[i].getLocReg()), MVT::i32, InFlag);
+ Chain = Hi.getValue(1);
+ InFlag = Hi.getValue(2);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Hi,
+ DAG.getConstant(1, dl, MVT::i32));
+ InVals.push_back(Vec);
+ } else {
+ Chain =
+ DAG.getCopyFromReg(Chain, dl, toCallerWindow(RVLocs[i].getLocReg()),
+ RVLocs[i].getValVT(), InFlag)
+ .getValue(1);
+ InFlag = Chain.getValue(2);
+ InVals.push_back(Chain.getValue(0));
+ }
+ }
+
+ return Chain;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+//
+// Translates a register name ("i0".."i7", "o0".."o7", "l0".."l7",
+// "g0".."g7") into the matching SP:: register. Any other name is reported as
+// a fatal error. The VT and DAG parameters are not consulted.
+unsigned SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                                SelectionDAG &DAG) const {
+  const unsigned Found = StringSwitch<unsigned>(RegName)
+    // In registers.
+    .Case("i0", SP::I0).Case("i1", SP::I1)
+    .Case("i2", SP::I2).Case("i3", SP::I3)
+    .Case("i4", SP::I4).Case("i5", SP::I5)
+    .Case("i6", SP::I6).Case("i7", SP::I7)
+    // Out registers.
+    .Case("o0", SP::O0).Case("o1", SP::O1)
+    .Case("o2", SP::O2).Case("o3", SP::O3)
+    .Case("o4", SP::O4).Case("o5", SP::O5)
+    .Case("o6", SP::O6).Case("o7", SP::O7)
+    // Local registers.
+    .Case("l0", SP::L0).Case("l1", SP::L1)
+    .Case("l2", SP::L2).Case("l3", SP::L3)
+    .Case("l4", SP::L4).Case("l5", SP::L5)
+    .Case("l6", SP::L6).Case("l7", SP::L7)
+    // Global registers.
+    .Case("g0", SP::G0).Case("g1", SP::G1)
+    .Case("g2", SP::G2).Case("g3", SP::G3)
+    .Case("g4", SP::G4).Case("g5", SP::G5)
+    .Case("g6", SP::G6).Case("g7", SP::G7)
+    .Default(0);
+
+  // Zero is the "no match" default of the switch above.
+  if (!Found)
+    report_fatal_error("Invalid register name global variable");
+
+  return Found;
+}
+
+// Returns true when CalleeName is one of the ABI helper routines that return
+// a long double (fp128).
+static bool isFP128ABICall(const char *CalleeName)
+{
+  static const char *const FP128Helpers[] = {
+    "_Q_add", "_Q_sub", "_Q_mul", "_Q_div",
+    "_Q_sqrt", "_Q_neg",
+    "_Q_itoq", "_Q_stoq", "_Q_dtoq", "_Q_utoq",
+    "_Q_lltoq", "_Q_ulltoq"
+  };
+
+  // Exact, case-sensitive match against every known helper name.
+  for (const char *Helper : FP128Helpers)
+    if (strcmp(CalleeName, Helper) == 0)
+      return true;
+  return false;
+}
+
+// Computes the byte size of the object addressed by the callee's first
+// parameter, for use as the struct-return size operand of a call.
+//
+// The callee is resolved from a global address node, or from an external
+// symbol looked up by name in the current module. An external symbol that
+// does not resolve but names an fp128 ABI helper yields 16. Returns 0 when the
+// callee cannot be resolved, or when the resolved function has no parameters
+// or a non-pointer first parameter.
+unsigned
+SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
+{
+  const Function *CalleeFn = nullptr;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    CalleeFn = dyn_cast<Function>(G->getGlobal());
+  } else if (ExternalSymbolSDNode *E =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const Function *Fn = DAG.getMachineFunction().getFunction();
+    const Module *M = Fn->getParent();
+    const char *CalleeName = E->getSymbol();
+    CalleeFn = M->getFunction(CalleeName);
+    if (!CalleeFn && isFP128ABICall(CalleeName))
+      return 16; // Return sizeof(fp128)
+  }
+
+  if (!CalleeFn)
+    return 0;
+
+  // It would be nice to check for the sret attribute on CalleeFn here,
+  // but since it is not part of the function type, any check will misfire.
+
+  // The prototype used at the call site need not match the resolved function,
+  // so do not assume a leading pointer parameter exists. Fall back to the
+  // same "unknown" size used for unresolved callees instead of dereferencing
+  // an end iterator or performing an invalid cast.
+  if (CalleeFn->arg_empty())
+    return 0;
+  PointerType *Ty = dyn_cast<PointerType>(CalleeFn->arg_begin()->getType());
+  if (!Ty)
+    return 0;
+
+  Type *ElementTy = Ty->getElementType();
+  return DAG.getDataLayout().getTypeAllocSize(ElementTy);
+}
+
+
+// Reassign floating point arguments that belong to the variadic part of a
+// varargs call.
+//
+// Under the SPARC v9 ABI, floating point values passed through the "..." of
+// a varargs callee are treated like integers. Arguments that are part of the
+// callee's prototype are unaffected.
+//
+// Operates in place on the CCValAssign array that AnalyzeCallOperands()
+// produced; Outs supplies the IsFixed flag for each value number.
+static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
+                                   ArrayRef<ISD::OutputArg> Outs) {
+  for (CCValAssign &Loc : ArgLocs) {
+    const MVT ValTy = Loc.getLocVT();
+    const bool IsDouble = ValTy == MVT::f64;
+
+    // Only f64/f128 values that were assigned to a register are candidates.
+    // FIXME: What about f32 arguments? C promotes them to f64 when calling
+    // varargs functions.
+    if (!Loc.isRegLoc() || !(IsDouble || ValTy == MVT::f128))
+      continue;
+
+    // The fixed arguments to a varargs function still go in FP registers.
+    if (Outs[Loc.getValNo()].IsFixed)
+      continue;
+
+    // Convert the FP register index into a byte offset within the argument
+    // array.
+    const unsigned BaseReg = IsDouble ? SP::D0 : SP::Q0;
+    const unsigned SlotBytes = IsDouble ? 8 : 16;
+    const unsigned Offset = SlotBytes * (Loc.getLocReg() - BaseReg);
+    assert(Offset < 16*8 && "Offset out of range, bad register enum?");
+
+    // Beyond the sixth slot there are no integer registers left: the value
+    // goes to memory at that offset.
+    if (Offset >= 6*8) {
+      Loc = CCValAssign::getMem(Loc.getValNo(), Loc.getValVT(),
+                                Offset, Loc.getLocVT(), Loc.getLocInfo());
+      continue;
+    }
+
+    // Otherwise the value travels in %i0-%i5.
+    const unsigned IReg = SP::I0 + Offset/8;
+    if (IsDouble) {
+      // Full register, just bitconvert into i64.
+      Loc = CCValAssign::getReg(Loc.getValNo(), Loc.getValVT(),
+                                IReg, MVT::i64, CCValAssign::BCvt);
+      continue;
+    }
+
+    assert(ValTy == MVT::f128 && "Unexpected type!");
+    // Full register, just bitconvert into i128 -- We will lower this into
+    // two i64s in LowerCall_64.
+    Loc = CCValAssign::getCustomReg(Loc.getValNo(), Loc.getValVT(),
+                                    IReg, MVT::i128, CCValAssign::BCvt);
+  }
+}
+
+// Lower a call for the 64-bit ABI.
+SDValue
+SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc DL = CLI.DL;
+ SDValue Chain = CLI.Chain;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // Sparc target does not yet support tail call optimization.
+ CLI.IsTailCall = false;
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallOperands(CLI.Outs, CC_Sparc64);
+
+ // Get the size of the outgoing arguments stack space requirement.
+ // The stack offset computed by CC_Sparc64 includes all arguments.
+ // Called functions expect 6 argument words to exist in the stack frame, used
+ // or not.
+ unsigned ArgsSize = std::max(6*8u, CCInfo.getNextStackOffset());
+
+ // Keep stack frames 16-byte aligned.
+ ArgsSize = alignTo(ArgsSize, 16);
+
+ // Varargs calls require special treatment.
+ if (CLI.IsVarArg)
+ fixupVariableFloatArgs(ArgLocs, CLI.Outs);
+
+ // Adjust the stack pointer to make room for the arguments.
+ // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
+ // with more than 6 arguments.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
+ DL);
+
+ // Collect the set of registers to pass to the function and their values.
+ // This will be emitted as a sequence of CopyToReg nodes glued to the call
+ // instruction.
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ // Collect chains from all the memory opeations that copy arguments to the
+ // stack. They must follow the stack pointer adjustment above and precede the
+ // call instruction itself.
+ SmallVector<SDValue, 8> MemOpChains;
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ const CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = CLI.OutVals[i];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown location info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::BCvt:
+ // fixupVariableFloatArgs() may create bitcasts from f128 to i128. But
+ // SPARC does not support i128 natively. Lower it into two i64, see below.
+ if (!VA.needsCustom() || VA.getValVT() != MVT::f128
+ || VA.getLocVT() != MVT::i128)
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ if (VA.needsCustom() && VA.getValVT() == MVT::f128
+ && VA.getLocVT() == MVT::i128) {
+ // Store and reload into the integer register reg and reg+1.
+ unsigned Offset = 8 * (VA.getLocReg() - SP::I0);
+ unsigned StackOffset = Offset + Subtarget->getStackPointerBias() + 128;
+ SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
+ SDValue HiPtrOff = DAG.getIntPtrConstant(StackOffset, DL);
+ HiPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, HiPtrOff);
+ SDValue LoPtrOff = DAG.getIntPtrConstant(StackOffset + 8, DL);
+ LoPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, LoPtrOff);
+
+ // Store to %sp+BIAS+128+Offset
+ SDValue Store =
+ DAG.getStore(Chain, DL, Arg, HiPtrOff, MachinePointerInfo());
+ // Load into Reg and Reg+1
+ SDValue Hi64 =
+ DAG.getLoad(MVT::i64, DL, Store, HiPtrOff, MachinePointerInfo());
+ SDValue Lo64 =
+ DAG.getLoad(MVT::i64, DL, Store, LoPtrOff, MachinePointerInfo());
+ RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()),
+ Hi64));
+ RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()+1),
+ Lo64));
+ continue;
+ }
+
+ // The custom bit on an i32 return value indicates that it should be
+ // passed in the high bits of the register.
+ if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
+ Arg = DAG.getNode(ISD::SHL, DL, MVT::i64, Arg,
+ DAG.getConstant(32, DL, MVT::i32));
+
+ // The next value may go in the low bits of the same register.
+ // Handle both at once.
+ if (i+1 < ArgLocs.size() && ArgLocs[i+1].isRegLoc() &&
+ ArgLocs[i+1].getLocReg() == VA.getLocReg()) {
+ SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64,
+ CLI.OutVals[i+1]);
+ Arg = DAG.getNode(ISD::OR, DL, MVT::i64, Arg, NV);
+ // Skip the next value, it's already done.
+ ++i;
+ }
+ }
+ RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()), Arg));
+ continue;
+ }
+
+ assert(VA.isMemLoc());
+
+ // Create a store off the stack pointer for this argument.
+ SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
+ // The argument area starts at %fp+BIAS+128 in the callee frame,
+ // %sp+BIAS+128 in ours.
+ SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() +
+ Subtarget->getStackPointerBias() +
+ 128, DL);
+ PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
+ }
+
+ // Emit all stores, make sure they occur before the call.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Build a sequence of CopyToReg nodes glued together with token chain and
+ // glue operands which copy the outgoing args into registers. The InGlue is
+ // necessary since all emitted instructions must be stuck together in order
+ // to pass the live physical registers.
+ SDValue InGlue;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, DL,
+ RegsToPass[i].first, RegsToPass[i].second, InGlue);
+ InGlue = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ SDValue Callee = CLI.Callee;
+ bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
+ unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF);
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, TF);
+
+ // Build the operands for the call instruction itself.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask =
+ ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv)
+ : TRI->getCallPreservedMask(DAG.getMachineFunction(),
+ CLI.CallConv));
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ // Make sure the CopyToReg nodes are glued to the call instruction which
+ // consumes the registers.
+ if (InGlue.getNode())
+ Ops.push_back(InGlue);
+
+ // Now the call itself.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, Ops);
+ InGlue = Chain.getValue(1);
+
+ // Revert the stack pointer immediately after the call.
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
+ DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
+ InGlue = Chain.getValue(1);
+
+ // Now extract the return values. This is more or less the same as
+ // LowerFormalArguments_64.
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+
+ // Set inreg flag manually for codegen generated library calls that
+ // return float.
+ if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && CLI.CS == nullptr)
+ CLI.Ins[0].Flags.setInReg();
+
+ RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_Sparc64);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ unsigned Reg = toCallerWindow(VA.getLocReg());
+
+ // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
+ // reside in the same register in the high and low bits. Reuse the
+ // CopyFromReg previous node to avoid duplicate copies.
+ SDValue RV;
+ if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
+ if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
+ RV = Chain.getValue(0);
+
+ // But usually we'll create a new CopyFromReg for a different register.
+ if (!RV.getNode()) {
+ RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
+ Chain = RV.getValue(1);
+ InGlue = Chain.getValue(2);
+ }
+
+ // Get the high bits for i32 struct elements.
+ if (VA.getValVT() == MVT::i32 && VA.needsCustom())
+ RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
+ DAG.getConstant(32, DL, MVT::i32));
+
+ // The callee promoted the return value, so insert an Assert?ext SDNode so
+ // we won't promote the value again in this function.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::SExt:
+ RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
+ DAG.getValueType(VA.getValVT()));
+ break;
+ case CCValAssign::ZExt:
+ RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
+ DAG.getValueType(VA.getValVT()));
+ break;
+ default:
+ break;
+ }
+
+ // Truncate the register down to the return value type.
+ if (VA.isExtInLoc())
+ RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
+
+ InVals.push_back(RV);
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+/// Choose the IR-level expansion strategy for an atomicrmw instruction.
+/// A 32-bit exchange is left as-is (AtomicExpansionKind::None); every other
+/// atomicrmw is expanded through cmpxchg (AtomicExpansionKind::CmpXChg).
+TargetLowering::AtomicExpansionKind
+SparcTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  // The result width is only inspected when the operation is an exchange.
+  const bool Is32BitXchg =
+      AI->getOperation() == AtomicRMWInst::Xchg &&
+      AI->getType()->getPrimitiveSizeInBits() == 32;
+
+  // Uses xchg instruction
+  return Is32BitXchg ? AtomicExpansionKind::None
+                     : AtomicExpansionKind::CmpXChg;
+}
+
+/// IntCondCCodeToICC - Map a DAG integer condition code onto the matching
+/// SPARC integer condition code (ICC). Any condition code without an entry
+/// below is treated as unreachable.
+static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
+  switch (CC) {
+  // Equality tests.
+  case ISD::SETEQ:
+    return SPCC::ICC_E;
+  case ISD::SETNE:
+    return SPCC::ICC_NE;
+
+  // Signed orderings.
+  case ISD::SETLT:
+    return SPCC::ICC_L;
+  case ISD::SETGT:
+    return SPCC::ICC_G;
+  case ISD::SETLE:
+    return SPCC::ICC_LE;
+  case ISD::SETGE:
+    return SPCC::ICC_GE;
+
+  // Unsigned orderings.
+  case ISD::SETULT:
+    return SPCC::ICC_CS;
+  case ISD::SETULE:
+    return SPCC::ICC_LEU;
+  case ISD::SETUGT:
+    return SPCC::ICC_GU;
+  case ISD::SETUGE:
+    return SPCC::ICC_CC;
+
+  default:
+    llvm_unreachable("Unknown integer condition code!");
+  }
+}
+
+/// FPCondCCodeToFCC - Map a DAG floating point condition code onto the
+/// matching SPARC floating point condition code (FCC). The "don't care"
+/// forms (SETEQ, SETNE, SETLT, ...) share the mapping of their ordered
+/// counterparts, except SETNE which shares the mapping of SETUNE. Any
+/// condition code without an entry below is treated as unreachable.
+static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETEQ:
+  case ISD::SETOEQ:
+    return SPCC::FCC_E;
+
+  case ISD::SETNE:
+  case ISD::SETUNE:
+    return SPCC::FCC_NE;
+
+  case ISD::SETLT:
+  case ISD::SETOLT:
+    return SPCC::FCC_L;
+
+  case ISD::SETGT:
+  case ISD::SETOGT:
+    return SPCC::FCC_G;
+
+  case ISD::SETLE:
+  case ISD::SETOLE:
+    return SPCC::FCC_LE;
+
+  case ISD::SETGE:
+  case ISD::SETOGE:
+    return SPCC::FCC_GE;
+
+  case ISD::SETULT:
+    return SPCC::FCC_UL;
+  case ISD::SETULE:
+    return SPCC::FCC_ULE;
+  case ISD::SETUGT:
+    return SPCC::FCC_UG;
+  case ISD::SETUGE:
+    return SPCC::FCC_UGE;
+  case ISD::SETUO:
+    return SPCC::FCC_U;
+  case ISD::SETO:
+    return SPCC::FCC_O;
+  case ISD::SETONE:
+    return SPCC::FCC_LG;
+  case ISD::SETUEQ:
+    return SPCC::FCC_UE;
+
+  default:
+    llvm_unreachable("Unknown fp condition code!");
+  }
+}
+
+/// Construct the SPARC lowering object.
+///
+/// Registers the register classes used for each value type and then declares,
+/// per operation and value type, whether the node is Legal, Custom lowered,
+/// Promoted or Expanded. The choices depend on what the subtarget reports
+/// (64-bit mode, V9, soft-float, hard-quad, popc, LEON CASA and the LEON
+/// errata workarounds). The constructor also configures libcall names, the
+/// supported atomic widths, the stack pointer register and the minimum
+/// function alignment, and finally computes the register properties.
+///
+/// Statement order matters: several later calls deliberately override a
+/// default installed earlier (for example the v2i32 "expand everything" loop
+/// followed by the Legal overrides for LOAD/STORE).
+///
+/// \param TM  Target machine; used here for the pointer size.
+/// \param STI Subtarget whose feature queries drive the configuration.
+SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
+ const SparcSubtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
+ // Integer type exactly as wide as a pointer (pointer size in bytes * 8).
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+
+ // Instructions which use registers as conditionals examine all the
+ // bits (as does the pseudo SELECT_CC expansion). I don't think it
+ // matters much whether it's ZeroOrOneBooleanContent, or
+ // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
+ // former.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent);
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
+ // Floating point register classes are only registered for hard-float.
+ if (!Subtarget->useSoftFloat()) {
+ addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
+ addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
+ addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
+ }
+ if (Subtarget->is64Bit()) {
+ addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
+ } else {
+ // On 32bit sparc, we define a double-register 32bit register
+ // class, as well. This is modeled in LLVM as a 2-vector of i32.
+ addRegisterClass(MVT::v2i32, &SP::IntPairRegClass);
+
+ // ...but almost all operations must be expanded, so set that as
+ // the default.
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+ setOperationAction(Op, MVT::v2i32, Expand);
+ }
+ // Truncating/extending stores/loads are also not supported.
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, VT, Expand);
+
+ setTruncStoreAction(VT, MVT::v2i32, Expand);
+ setTruncStoreAction(MVT::v2i32, VT, Expand);
+ }
+ // However, load and store *are* legal.
+ setOperationAction(ISD::LOAD, MVT::v2i32, Legal);
+ setOperationAction(ISD::STORE, MVT::v2i32, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal);
+
+ // And we need to promote i64 loads/stores into vector load/store
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
+
+ // Sadly, this doesn't work:
+ // AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
+ // AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+ }
+
+ // Turn FP extload into load/fpextend
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+ }
+
+ // Sparc doesn't have i1 sign extending load
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+ // Turn FP truncstore into trunc + store.
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+
+ // Custom legalize GlobalAddress nodes into LO/HI parts.
+ setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
+ setOperationAction(ISD::ConstantPool, PtrVT, Custom);
+ setOperationAction(ISD::BlockAddress, PtrVT, Custom);
+
+ // Sparc doesn't have sext_inreg, replace them with shl/sra
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+ // Sparc has no REM or DIVREM operations.
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+ // ... nor does SparcV9.
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ }
+
+ // Custom expand fp<->sint
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+
+ // Custom Expand fp<->uint
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+
+ // 32-bit bitcasts between f32 and i32 are expanded.
+ setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+
+ // Sparc has no select or setcc: expand to SELECT_CC.
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::f128, Expand);
+
+ setOperationAction(ISD::SETCC, MVT::i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f64, Expand);
+ setOperationAction(ISD::SETCC, MVT::f128, Expand);
+
+ // Sparc doesn't have BRCOND either, it has BR_CC.
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+
+ // setjmp/longjmp style exception handling nodes are custom lowered.
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+ // i64 counterparts of the settings above; only applied in 64-bit mode.
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ADDC, MVT::i64, Custom);
+ setOperationAction(ISD::ADDE, MVT::i64, Custom);
+ setOperationAction(ISD::SUBC, MVT::i64, Custom);
+ setOperationAction(ISD::SUBE, MVT::i64, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::i64, Expand);
+ setOperationAction(ISD::SETCC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+
+ // Population count is legal only when the subtarget opts in via usePopc().
+ setOperationAction(ISD::CTPOP, MVT::i64,
+ Subtarget->usePopc() ? Legal : Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ , MVT::i64, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+ setOperationAction(ISD::ROTL , MVT::i64, Expand);
+ setOperationAction(ISD::ROTR , MVT::i64, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+ }
+
+ // ATOMICs.
+ // Atomics are supported on SparcV9. 32-bit atomics are also
+ // supported by some Leon SparcV8 variants. Otherwise, atomics
+ // are unsupported.
+ if (Subtarget->isV9())
+ setMaxAtomicSizeInBitsSupported(64);
+ else if (Subtarget->hasLeonCasa())
+ setMaxAtomicSizeInBitsSupported(32);
+ else
+ setMaxAtomicSizeInBitsSupported(0);
+
+ // The narrowest compare-and-exchange declared here is 32 bits wide.
+ setMinCmpXchgSizeInBits(32);
+
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Legal);
+
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal);
+
+ // Custom Lower Atomic LOAD/STORE
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Legal);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Legal);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Custom);
+ }
+
+ if (!Subtarget->is64Bit()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ }
+
+ if (!Subtarget->isV9()) {
+ // SparcV8 does not have FNEGD and FABSD.
+ setOperationAction(ISD::FNEG, MVT::f64, Custom);
+ setOperationAction(ISD::FABS, MVT::f64, Custom);
+ }
+
+ // Operations that are expanded unconditionally for the listed types.
+ setOperationAction(ISD::FSIN , MVT::f128, Expand);
+ setOperationAction(ISD::FCOS , MVT::f128, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FREM , MVT::f128, Expand);
+ setOperationAction(ISD::FMA , MVT::f128, Expand);
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FREM , MVT::f64, Expand);
+ setOperationAction(ISD::FMA , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FREM , MVT::f32, Expand);
+ setOperationAction(ISD::FMA , MVT::f32, Expand);
+ setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+ setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+ setOperationAction(ISD::ROTL , MVT::i32, Expand);
+ setOperationAction(ISD::ROTR , MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW , MVT::f128, Expand);
+ setOperationAction(ISD::FPOW , MVT::f64, Expand);
+ setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+ // Expands to [SU]MUL_LOHI.
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+ setOperationAction(ISD::MUL, MVT::i32, Expand);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::i64, Expand);
+ setOperationAction(ISD::MULHS, MVT::i64, Expand);
+
+ setOperationAction(ISD::UMULO, MVT::i64, Custom);
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+ }
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ // VAARG needs to be lowered to not do unaligned accesses for doubles.
+ setOperationAction(ISD::VAARG , MVT::Other, Custom);
+
+ setOperationAction(ISD::TRAP , MVT::Other, Legal);
+
+ // Use the default implementation.
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
+
+ // %o6 is the register saved/restored by STACKSAVE/STACKRESTORE.
+ setStackPointerRegisterToSaveRestore(SP::O6);
+
+ setOperationAction(ISD::CTPOP, MVT::i32,
+ Subtarget->usePopc() ? Legal : Expand);
+
+ // f128 loads and stores are Legal only with both V9 and hard-quad support;
+ // otherwise they are custom lowered.
+ if (Subtarget->isV9() && Subtarget->hasHardQuad()) {
+ setOperationAction(ISD::LOAD, MVT::f128, Legal);
+ setOperationAction(ISD::STORE, MVT::f128, Legal);
+ } else {
+ setOperationAction(ISD::LOAD, MVT::f128, Custom);
+ setOperationAction(ISD::STORE, MVT::f128, Custom);
+ }
+
+ // f128 arithmetic: Legal with hard-quad support, custom lowered otherwise.
+ if (Subtarget->hasHardQuad()) {
+ setOperationAction(ISD::FADD, MVT::f128, Legal);
+ setOperationAction(ISD::FSUB, MVT::f128, Legal);
+ setOperationAction(ISD::FMUL, MVT::f128, Legal);
+ setOperationAction(ISD::FDIV, MVT::f128, Legal);
+ setOperationAction(ISD::FSQRT, MVT::f128, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
+ if (Subtarget->isV9()) {
+ setOperationAction(ISD::FNEG, MVT::f128, Legal);
+ setOperationAction(ISD::FABS, MVT::f128, Legal);
+ } else {
+ setOperationAction(ISD::FNEG, MVT::f128, Custom);
+ setOperationAction(ISD::FABS, MVT::f128, Custom);
+ }
+
+ // In 32-bit mode the f128 <-> i64 conversions use the _Q_* routines.
+ if (!Subtarget->is64Bit()) {
+ setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+ setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+ setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+ setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+ }
+
+ } else {
+ // Custom legalize f128 operations.
+
+ setOperationAction(ISD::FADD, MVT::f128, Custom);
+ setOperationAction(ISD::FSUB, MVT::f128, Custom);
+ setOperationAction(ISD::FMUL, MVT::f128, Custom);
+ setOperationAction(ISD::FDIV, MVT::f128, Custom);
+ setOperationAction(ISD::FSQRT, MVT::f128, Custom);
+ setOperationAction(ISD::FNEG, MVT::f128, Custom);
+ setOperationAction(ISD::FABS, MVT::f128, Custom);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+
+ // Setup Runtime library names.
+ // 64-bit hard-float uses the _Qp_* names, 32-bit hard-float the _Q_*
+ // names; with soft-float neither set is installed here.
+ if (Subtarget->is64Bit() && !Subtarget->useSoftFloat()) {
+ setLibcallName(RTLIB::ADD_F128, "_Qp_add");
+ setLibcallName(RTLIB::SUB_F128, "_Qp_sub");
+ setLibcallName(RTLIB::MUL_F128, "_Qp_mul");
+ setLibcallName(RTLIB::DIV_F128, "_Qp_div");
+ setLibcallName(RTLIB::SQRT_F128, "_Qp_sqrt");
+ setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Qp_qtoi");
+ setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Qp_qtoui");
+ setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Qp_itoq");
+ setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Qp_uitoq");
+ setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Qp_qtox");
+ setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Qp_qtoux");
+ setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Qp_xtoq");
+ setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Qp_uxtoq");
+ setLibcallName(RTLIB::FPEXT_F32_F128, "_Qp_stoq");
+ setLibcallName(RTLIB::FPEXT_F64_F128, "_Qp_dtoq");
+ setLibcallName(RTLIB::FPROUND_F128_F32, "_Qp_qtos");
+ setLibcallName(RTLIB::FPROUND_F128_F64, "_Qp_qtod");
+ } else if (!Subtarget->useSoftFloat()) {
+ setLibcallName(RTLIB::ADD_F128, "_Q_add");
+ setLibcallName(RTLIB::SUB_F128, "_Q_sub");
+ setLibcallName(RTLIB::MUL_F128, "_Q_mul");
+ setLibcallName(RTLIB::DIV_F128, "_Q_div");
+ setLibcallName(RTLIB::SQRT_F128, "_Q_sqrt");
+ setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Q_qtoi");
+ setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Q_qtou");
+ setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Q_itoq");
+ setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Q_utoq");
+ setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+ setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+ setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+ setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+ setLibcallName(RTLIB::FPEXT_F32_F128, "_Q_stoq");
+ setLibcallName(RTLIB::FPEXT_F64_F128, "_Q_dtoq");
+ setLibcallName(RTLIB::FPROUND_F128_F32, "_Q_qtos");
+ setLibcallName(RTLIB::FPROUND_F128_F64, "_Q_qtod");
+ }
+ }
+
+ if (Subtarget->fixAllFDIVSQRT()) {
+ // Promote FDIVS and FSQRTS to FDIVD and FSQRTD instructions instead as
+ // the former instructions generate errata on LEON processors.
+ setOperationAction(ISD::FDIV, MVT::f32, Promote);
+ setOperationAction(ISD::FSQRT, MVT::f32, Promote);
+ }
+
+ if (Subtarget->replaceFMULS()) {
+ // Promote FMULS to FMULD instructions instead as
+ // the former instructions generate errata on LEON processors.
+ setOperationAction(ISD::FMUL, MVT::f32, Promote);
+ }
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ // NOTE(review): the argument is presumably a log2 value (i.e. 4-byte
+ // alignment) in this LLVM version -- confirm against TargetLoweringBase.
+ setMinFunctionAlignment(2);
+
+ // Last step: compute register properties from the subtarget's register info.
+ computeRegisterProperties(Subtarget->getRegisterInfo());
+}
+
+/// Report whether soft-float is in use; the answer is taken directly from
+/// the subtarget.
+bool SparcTargetLowering::useSoftFloat() const {
+  const bool SoftFloat = Subtarget->useSoftFloat();
+  return SoftFloat;
+}
+
+/// Return a printable name for a SPARC-specific DAG node opcode, or nullptr
+/// when the opcode is SPISD::FIRST_NUMBER or is not one of the node types
+/// listed below.
+const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  const char *Name = nullptr;
+
+  switch (static_cast<SPISD::NodeType>(Opcode)) {
+  case SPISD::FIRST_NUMBER:
+    // Marker value only; it has no name.
+    break;
+  case SPISD::CMPICC:
+    Name = "SPISD::CMPICC";
+    break;
+  case SPISD::CMPFCC:
+    Name = "SPISD::CMPFCC";
+    break;
+  case SPISD::BRICC:
+    Name = "SPISD::BRICC";
+    break;
+  case SPISD::BRXCC:
+    Name = "SPISD::BRXCC";
+    break;
+  case SPISD::BRFCC:
+    Name = "SPISD::BRFCC";
+    break;
+  case SPISD::SELECT_ICC:
+    Name = "SPISD::SELECT_ICC";
+    break;
+  case SPISD::SELECT_XCC:
+    Name = "SPISD::SELECT_XCC";
+    break;
+  case SPISD::SELECT_FCC:
+    Name = "SPISD::SELECT_FCC";
+    break;
+  case SPISD::EH_SJLJ_SETJMP:
+    Name = "SPISD::EH_SJLJ_SETJMP";
+    break;
+  case SPISD::EH_SJLJ_LONGJMP:
+    Name = "SPISD::EH_SJLJ_LONGJMP";
+    break;
+  case SPISD::Hi:
+    Name = "SPISD::Hi";
+    break;
+  case SPISD::Lo:
+    Name = "SPISD::Lo";
+    break;
+  case SPISD::FTOI:
+    Name = "SPISD::FTOI";
+    break;
+  case SPISD::ITOF:
+    Name = "SPISD::ITOF";
+    break;
+  case SPISD::FTOX:
+    Name = "SPISD::FTOX";
+    break;
+  case SPISD::XTOF:
+    Name = "SPISD::XTOF";
+    break;
+  case SPISD::CALL:
+    Name = "SPISD::CALL";
+    break;
+  case SPISD::RET_FLAG:
+    Name = "SPISD::RET_FLAG";
+    break;
+  case SPISD::GLOBAL_BASE_REG:
+    Name = "SPISD::GLOBAL_BASE_REG";
+    break;
+  case SPISD::FLUSHW:
+    Name = "SPISD::FLUSHW";
+    break;
+  case SPISD::TLS_ADD:
+    Name = "SPISD::TLS_ADD";
+    break;
+  case SPISD::TLS_LD:
+    Name = "SPISD::TLS_LD";
+    break;
+  case SPISD::TLS_CALL:
+    Name = "SPISD::TLS_CALL";
+    break;
+  }
+
+  return Name;
+}
+
+/// Return the value type produced by a SETCC on operands of type VT:
+/// a vector VT yields the same vector shape with integer elements, any
+/// other VT yields i32.
+EVT SparcTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+                                            EVT VT) const {
+  if (VT.isVector())
+    return VT.changeVectorElementTypeToInteger();
+
+  return MVT::i32;
+}
+
+/// computeKnownBitsForTargetNode - Determine which bits of the target
+/// specific node Op are known to be zero or one, returning them in KnownZero
+/// and KnownOne. Both outputs are first reset to "nothing known"; only the
+/// SPARC select nodes (SELECT_ICC / SELECT_XCC / SELECT_FCC) report anything
+/// further, namely the bits on which operands 0 and 1 agree.
+void SparcTargetLowering::computeKnownBitsForTargetNode
+                                (const SDValue Op,
+                                 APInt &KnownZero,
+                                 APInt &KnownOne,
+                                 const SelectionDAG &DAG,
+                                 unsigned Depth) const {
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+
+  const unsigned Opc = Op.getOpcode();
+  const bool IsSelect = Opc == SPISD::SELECT_ICC ||
+                        Opc == SPISD::SELECT_XCC ||
+                        Opc == SPISD::SELECT_FCC;
+  if (!IsSelect)
+    return;
+
+  // Operand 1 goes straight into the outputs, operand 0 into temporaries.
+  APInt OtherZero, OtherOne;
+  DAG.computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+  DAG.computeKnownBits(Op.getOperand(0), OtherZero, OtherOne, Depth+1);
+
+  // A bit is only known if both operands agree on it.
+  KnownOne &= OtherOne;
+  KnownZero &= OtherZero;
+}
+
+// Recognize the pattern "(select_cc 1, 0, cond, cmp) != 0", i.e. a lowered
+// setcc that is being compared against zero again. When it matches, LHS and
+// RHS are replaced by the operands of the underlying compare node and SPCC
+// receives the condition code stored in the select. Otherwise nothing is
+// modified.
+static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
+                             ISD::CondCode CC, unsigned &SPCC) {
+  // Only "x != 0" is of interest.
+  if (!isNullConstant(RHS) || CC != ISD::SETNE)
+    return;
+
+  // x must be one of the SPARC select nodes...
+  const unsigned SelectOpc = LHS.getOpcode();
+  const bool IsIntSelect = SelectOpc == SPISD::SELECT_ICC ||
+                           SelectOpc == SPISD::SELECT_XCC;
+  const bool IsFPSelect = SelectOpc == SPISD::SELECT_FCC;
+  if (!IsIntSelect && !IsFPSelect)
+    return;
+
+  // ...fed by the compare node of the matching kind (operand 3)...
+  const unsigned WantedCmpOpc = IsIntSelect ? SPISD::CMPICC : SPISD::CMPFCC;
+  SDValue Compare = LHS.getOperand(3);
+  if (Compare.getOpcode() != WantedCmpOpc)
+    return;
+
+  // ...and selecting between the constants 1 and 0.
+  if (!isOneConstant(LHS.getOperand(0)))
+    return;
+  if (!isNullConstant(LHS.getOperand(1)))
+    return;
+
+  // Read the condition before LHS is overwritten.
+  SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
+  LHS = Compare.getOperand(0);
+  RHS = Compare.getOperand(1);
+}
+
+// Rebuild the address node Op as its Target* counterpart carrying the target
+// flags TF. Global addresses, constant pool entries, block addresses and
+// external symbols are handled; any other node kind is unreachable.
+SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
+                                             SelectionDAG &DAG) const {
+  if (const auto *Global = dyn_cast<GlobalAddressSDNode>(Op)) {
+    SDLoc GlobalDL(Global);
+    return DAG.getTargetGlobalAddress(Global->getGlobal(), GlobalDL,
+                                      Global->getValueType(0),
+                                      Global->getOffset(), TF);
+  }
+
+  if (const auto *PoolEntry = dyn_cast<ConstantPoolSDNode>(Op)) {
+    return DAG.getTargetConstantPool(PoolEntry->getConstVal(),
+                                     PoolEntry->getValueType(0),
+                                     PoolEntry->getAlignment(),
+                                     PoolEntry->getOffset(), TF);
+  }
+
+  if (const auto *Block = dyn_cast<BlockAddressSDNode>(Op)) {
+    // The block address is always rebuilt with an offset of zero.
+    return DAG.getTargetBlockAddress(Block->getBlockAddress(),
+                                     Op.getValueType(), 0, TF);
+  }
+
+  if (const auto *Symbol = dyn_cast<ExternalSymbolSDNode>(Op)) {
+    return DAG.getTargetExternalSymbol(Symbol->getSymbol(),
+                                       Symbol->getValueType(0), TF);
+  }
+
+  llvm_unreachable("Unhandled address SDNode");
+}
+
+// Produce "Hi(Op with HiTF) + Lo(Op with LoTF)": the address Op is tagged
+// once with each set of target flags, wrapped in SPISD::Hi / SPISD::Lo
+// respectively, and the two halves are combined with an ADD node.
+SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
+                                          unsigned HiTF, unsigned LoTF,
+                                          SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT AddrVT = Op.getValueType();
+
+  // High part first, then low part.
+  SDValue HiOperand = withTargetFlags(Op, HiTF, DAG);
+  SDValue HiPart = DAG.getNode(SPISD::Hi, DL, AddrVT, HiOperand);
+
+  SDValue LoOperand = withTargetFlags(Op, LoTF, DAG);
+  SDValue LoPart = DAG.getNode(SPISD::Lo, DL, AddrVT, LoOperand);
+
+  return DAG.getNode(ISD::ADD, DL, AddrVT, HiPart, LoPart);
+}
+
+// Build the SDNodes that compute the address described by Op (a
+// GlobalAddress, ConstantPool, or ExternalSymbol SDNode).
+//
+// Non-PIC code materializes the absolute address according to the code
+// model (Small: abs32, Medium: abs44, Large: abs64). PIC code instead loads
+// the address from the GOT.
+SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  if (!isPositionIndependent()) {
+    // This is one of the absolute code models.
+    switch (getTargetMachine().getCodeModel()) {
+    case CodeModel::Small:
+      // abs32.
+      return makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
+                          SparcMCExpr::VK_Sparc_LO, DAG);
+
+    case CodeModel::Medium: {
+      // abs44: ((H44 + M44) << 12) + L44.
+      SDValue Upper = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_H44,
+                                   SparcMCExpr::VK_Sparc_M44, DAG);
+      SDValue ShiftBy12 = DAG.getConstant(12, DL, MVT::i32);
+      Upper = DAG.getNode(ISD::SHL, DL, PtrVT, Upper, ShiftBy12);
+
+      SDValue Lower = withTargetFlags(Op, SparcMCExpr::VK_Sparc_L44, DAG);
+      Lower = DAG.getNode(SPISD::Lo, DL, PtrVT, Lower);
+      return DAG.getNode(ISD::ADD, DL, PtrVT, Upper, Lower);
+    }
+
+    case CodeModel::Large: {
+      // abs64: ((HH + HM) << 32) + (HI + LO).
+      SDValue Upper = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HH,
+                                   SparcMCExpr::VK_Sparc_HM, DAG);
+      SDValue ShiftBy32 = DAG.getConstant(32, DL, MVT::i32);
+      Upper = DAG.getNode(ISD::SHL, DL, PtrVT, Upper, ShiftBy32);
+
+      SDValue Lower = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
+                                   SparcMCExpr::VK_Sparc_LO, DAG);
+      return DAG.getNode(ISD::ADD, DL, PtrVT, Upper, Lower);
+    }
+
+    default:
+      llvm_unreachable("Unsupported absolute code model");
+    }
+  }
+
+  // PIC mode. SPARC needs a got load for every variable!
+  // This is the pic32 code model, the GOT is known to be smaller than 4GB.
+  SDValue GotOffset = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
+                                   SparcMCExpr::VK_Sparc_GOT10, DAG);
+  SDValue GotBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+  SDValue GotSlot = DAG.getNode(ISD::ADD, DL, PtrVT, GotBase, GotOffset);
+
+  // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+  // function has calls.
+  auto &MF = DAG.getMachineFunction();
+  MF.getFrameInfo().setHasCalls(true);
+
+  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GotSlot,
+                     MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+}
+
+/// Custom lowering for GlobalAddress nodes; the work is done by makeAddress.
+SDValue SparcTargetLowering::LowerGlobalAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDValue Address = makeAddress(Op, DAG);
+  return Address;
+}
+
+/// Custom lowering for ConstantPool nodes; the work is done by makeAddress.
+SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Address = makeAddress(Op, DAG);
+  return Address;
+}
+
+/// Custom lowering for BlockAddress nodes; the work is done by makeAddress.
+SDValue SparcTargetLowering::LowerBlockAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Address = makeAddress(Op, DAG);
+  return Address;
+}
+
+/// Lower a GlobalTLSAddress node to the node sequence that computes the
+/// address of the thread-local global.
+///
+/// The sequence depends on the TLS model the target machine selects for the
+/// global:
+///  - emulated TLS (when enabled in the target options) is delegated to
+///    LowerToTLSEmulatedModel;
+///  - general dynamic and local dynamic call __tls_get_addr, with the
+///    argument and result passed in %o0;
+///  - initial exec loads an offset via SPISD::TLS_LD and adds it to %g7;
+///  - local exec adds a relocated constant offset to %g7.
+///
+/// \param Op  The GlobalAddressSDNode to lower.
+/// \param DAG The selection DAG in which the new nodes are created.
+/// \returns the value holding the address of the TLS variable.
+SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+
+ SDLoc DL(GA);
+ const GlobalValue *GV = GA->getGlobal();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+ if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+ // Both dynamic models share one code shape; only the relocation kinds
+ // (GD_* versus LDM_*) differ.
+ unsigned HiTF = ((model == TLSModel::GeneralDynamic)
+ ? SparcMCExpr::VK_Sparc_TLS_GD_HI22
+ : SparcMCExpr::VK_Sparc_TLS_LDM_HI22);
+ unsigned LoTF = ((model == TLSModel::GeneralDynamic)
+ ? SparcMCExpr::VK_Sparc_TLS_GD_LO10
+ : SparcMCExpr::VK_Sparc_TLS_LDM_LO10);
+ unsigned addTF = ((model == TLSModel::GeneralDynamic)
+ ? SparcMCExpr::VK_Sparc_TLS_GD_ADD
+ : SparcMCExpr::VK_Sparc_TLS_LDM_ADD);
+ unsigned callTF = ((model == TLSModel::GeneralDynamic)
+ ? SparcMCExpr::VK_Sparc_TLS_GD_CALL
+ : SparcMCExpr::VK_Sparc_TLS_LDM_CALL);
+
+ // Argument for the call: TLS_ADD(global base register, hi + lo).
+ SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
+ SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+ SDValue Argument = DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Base, HiLo,
+ withTargetFlags(Op, addTF, DAG));
+
+ SDValue Chain = DAG.getEntryNode();
+ SDValue InFlag;
+
+ // Open a call sequence and copy the argument into %o0; the glue value
+ // ties the copy to the call node created below.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, DL, true), DL);
+ Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
+ InFlag = Chain.getValue(1);
+ SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
+ SDValue Symbol = withTargetFlags(Op, callTF, DAG);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ // The call uses the C calling convention's call-preserved register mask.
+ const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
+ DAG.getMachineFunction(), CallingConv::C);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ SDValue Ops[] = {Chain,
+ Callee,
+ Symbol,
+ DAG.getRegister(SP::O0, PtrVT),
+ DAG.getRegisterMask(Mask),
+ InFlag};
+ Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+ // Close the call sequence and read the result back from %o0.
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, DL, true),
+ DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InFlag);
+
+ // For general dynamic the call result is already the final value.
+ if (model != TLSModel::LocalDynamic)
+ return Ret;
+
+ // Local dynamic: combine the call result with (hix22 ^ lox10) of the LDO
+ // relocations through another TLS_ADD.
+ SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_HIX22, DAG));
+ SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_LOX10, DAG));
+ HiLo = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+ return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Ret, HiLo,
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_ADD, DAG));
+ }
+
+ if (model == TLSModel::InitialExec) {
+ // The load relocation kind depends on the pointer width (LDX for i64).
+ unsigned ldTF = ((PtrVT == MVT::i64)? SparcMCExpr::VK_Sparc_TLS_IE_LDX
+ : SparcMCExpr::VK_Sparc_TLS_IE_LD);
+
+ SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+
+ // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+ // function has calls.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setHasCalls(true);
+
+ // Load the offset from (global base + hi/lo pair), then add it to %g7
+ // (presumably the thread pointer -- confirm against the SPARC TLS ABI).
+ SDValue TGA = makeHiLoPair(Op,
+ SparcMCExpr::VK_Sparc_TLS_IE_HI22,
+ SparcMCExpr::VK_Sparc_TLS_IE_LO10, DAG);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, TGA);
+ SDValue Offset = DAG.getNode(SPISD::TLS_LD,
+ DL, PtrVT, Ptr,
+ withTargetFlags(Op, ldTF, DAG));
+ return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT,
+ DAG.getRegister(SP::G7, PtrVT), Offset,
+ withTargetFlags(Op,
+ SparcMCExpr::VK_Sparc_TLS_IE_ADD, DAG));
+ }
+
+ // Local exec: the offset is (hix22 ^ lox10) of the LE relocations, added
+ // to %g7 with a plain ADD.
+ assert(model == TLSModel::LocalExec);
+ SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_HIX22, DAG));
+ SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_LOX10, DAG));
+ SDValue Offset = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+
+ return DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getRegister(SP::G7, PtrVT), Offset);
+}
+
+/// Append Arg to the libcall argument list Args.
+///
+/// Arguments that are not fp128 are appended unchanged. An fp128 argument is
+/// stored to a fresh 16-byte, 8-byte-aligned stack object and a pointer to
+/// that object is appended instead.
+///
+/// \returns the chain to continue from: the incoming Chain, or the chain of
+/// the store when one was emitted.
+SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain,
+                                                  ArgListTy &Args, SDValue Arg,
+                                                  const SDLoc &DL,
+                                                  SelectionDAG &DAG) const {
+  Type *ArgIRTy = Arg.getValueType().getTypeForEVT(*DAG.getContext());
+
+  ArgListEntry NewArg;
+  NewArg.Node = Arg;
+  NewArg.Ty = ArgIRTy;
+
+  // Anything but fp128 is passed through as it is.
+  if (!ArgIRTy->isFP128Ty()) {
+    Args.push_back(NewArg);
+    return Chain;
+  }
+
+  // Create a stack object and pass the pointer to the library function.
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  int SpillFI = MFI.CreateStackObject(16, 8, false);
+  SDValue SpillAddr =
+      DAG.getFrameIndex(SpillFI, getPointerTy(DAG.getDataLayout()));
+  Chain = DAG.getStore(Chain, DL, Arg, SpillAddr, MachinePointerInfo(),
+                       /* Alignment = */ 8);
+
+  NewArg.Node = SpillAddr;
+  NewArg.Ty = PointerType::getUnqual(ArgIRTy);
+  Args.push_back(NewArg);
+  return Chain;
+}
+
+/// Lower Op to a call of the runtime routine LibFuncName, passing the first
+/// numArgs operands of Op as arguments.
+///
+/// When Op produces an fp128 value the routine is called with a void return
+/// type and an extra leading pointer argument that designates a 16-byte
+/// stack object receiving the result (flagged sret on 32-bit targets); the
+/// value is then loaded back from that object. Otherwise the call's own
+/// result is returned.
+SDValue
+SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
+                                 const char *LibFuncName,
+                                 unsigned numArgs) const {
+  SDLoc DL(Op);
+  auto &Ctx = *DAG.getContext();
+
+  ArgListTy CallArgs;
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue Callee = DAG.getExternalSymbol(LibFuncName, PtrVT);
+  Type *ResultTy = Op.getValueType().getTypeForEVT(Ctx);
+  Type *ABIResultTy = ResultTy;
+  SDValue Chain = DAG.getEntryNode();
+  SDValue ResultSlot;
+
+  if (ResultTy->isFP128Ty()) {
+    // Create a Stack Object to receive the return value of type f128.
+    int SlotFI = MFI.CreateStackObject(16, 8, false);
+    ResultSlot = DAG.getFrameIndex(SlotFI, PtrVT);
+
+    ArgListEntry SlotArg;
+    SlotArg.Node = ResultSlot;
+    SlotArg.Ty = PointerType::getUnqual(ResultTy);
+    if (!Subtarget->is64Bit())
+      SlotArg.isSRet = true;
+    SlotArg.isReturned = false;
+    CallArgs.push_back(SlotArg);
+
+    // The callee writes through the pointer and returns nothing itself.
+    ABIResultTy = Type::getVoidTy(Ctx);
+  }
+
+  assert(Op->getNumOperands() >= numArgs && "Not enough operands!");
+  for (unsigned Idx = 0; Idx < numArgs; ++Idx)
+    Chain = LowerF128_LibCallArg(Chain, CallArgs, Op.getOperand(Idx), DL, DAG);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL);
+  CLI.setChain(Chain);
+  CLI.setCallee(CallingConv::C, ABIResultTy, Callee, std::move(CallArgs));
+
+  // first = returned value, second = output chain.
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  // No result slot was needed: hand back the call's value directly.
+  if (ABIResultTy == ResultTy)
+    return CallResult.first;
+
+  assert (ResultTy->isFP128Ty() && "Unexpected return type!");
+
+  Chain = CallResult.second;
+
+  // Load the result slot to get the return value.
+  return DAG.getLoad(Op.getValueType(), DL, Chain, ResultSlot,
+                     MachinePointerInfo(), /* Alignment = */ 8);
+}
+
+// Lowers an f128 comparison to a call into the quad-precision support library
+// followed by an integer compare (SPISD::CMPICC) of the call's result.
+//
+// On entry SPCC is the floating-point condition to evaluate; on return it has
+// been replaced by the integer condition code that the caller must apply to
+// the returned glue value.
+//
+// E/NE/L/G/LE/GE use the dedicated _Q_f*/_Qp_f* predicates, whose result is
+// tested against zero.  Every other condition goes through _Q_cmp/_Qp_cmp;
+// the tests below decode its result as 0 = equal, 1 = less, 2 = greater,
+// 3 = unordered.
+SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
+                                              unsigned &SPCC, const SDLoc &DL,
+                                              SelectionDAG &DAG) const {
+
+  const char *LibCall = nullptr;
+  bool is64Bit = Subtarget->is64Bit();
+  switch(SPCC) {
+  default: llvm_unreachable("Unhandled conditional code!");
+  case SPCC::FCC_E  : LibCall = is64Bit? "_Qp_feq" : "_Q_feq"; break;
+  case SPCC::FCC_NE : LibCall = is64Bit? "_Qp_fne" : "_Q_fne"; break;
+  case SPCC::FCC_L  : LibCall = is64Bit? "_Qp_flt" : "_Q_flt"; break;
+  case SPCC::FCC_G  : LibCall = is64Bit? "_Qp_fgt" : "_Q_fgt"; break;
+  case SPCC::FCC_LE : LibCall = is64Bit? "_Qp_fle" : "_Q_fle"; break;
+  case SPCC::FCC_GE : LibCall = is64Bit? "_Qp_fge" : "_Q_fge"; break;
+  case SPCC::FCC_UL :
+  case SPCC::FCC_ULE:
+  case SPCC::FCC_UG :
+  case SPCC::FCC_UGE:
+  case SPCC::FCC_U  :
+  case SPCC::FCC_O  :
+  case SPCC::FCC_LG :
+  case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break;
+  }
+
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Callee = DAG.getExternalSymbol(LibCall, PtrVT);
+  Type *RetTy = Type::getInt32Ty(*DAG.getContext());
+  ArgListTy Args;
+  SDValue Chain = DAG.getEntryNode();
+  Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG);
+  Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(Chain)
+    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // result is in first, and chain is in second result.
+  SDValue Result = CallInfo.first;
+  EVT ResVT = Result.getValueType();
+
+  // The operands built here feed generic nodes (AND/ADD) and CMPICC, so they
+  // are ordinary constants rather than target constants: only ordinary
+  // constants are visible to immediate-operand selection patterns.
+  auto Const = [&](uint64_t V) { return DAG.getConstant(V, DL, ResVT); };
+
+  // Records the integer condition for the caller and emits the compare.
+  auto EmitCmp = [&](SDValue Val, SDValue Against, unsigned ICC) {
+    SPCC = ICC;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Val, Against);
+  };
+
+  // (Result + 1) & 2 is non-zero exactly for less (1) and greater (2), and
+  // zero for equal (0) and unordered (3).
+  auto LessOrGreaterBit = [&]() {
+    SDValue Inc = DAG.getNode(ISD::ADD, DL, ResVT, Result, Const(1));
+    return DAG.getNode(ISD::AND, DL, ResVT, Inc, Const(2));
+  };
+
+  switch(SPCC) {
+  default:
+    // Dedicated predicate routine: non-zero means the condition holds.
+    return EmitCmp(Result, Const(0), SPCC::ICC_NE);
+  case SPCC::FCC_UL :
+    // Less (1) or unordered (3): bit 0 is set.
+    return EmitCmp(DAG.getNode(ISD::AND, DL, ResVT, Result, Const(1)),
+                   Const(0), SPCC::ICC_NE);
+  case SPCC::FCC_ULE:
+    // Anything but greater (2).
+    return EmitCmp(Result, Const(2), SPCC::ICC_NE);
+  case SPCC::FCC_UG :
+    // Greater (2) or unordered (3).
+    return EmitCmp(Result, Const(1), SPCC::ICC_G);
+  case SPCC::FCC_UGE:
+    // Anything but less (1).
+    return EmitCmp(Result, Const(1), SPCC::ICC_NE);
+  case SPCC::FCC_U  :
+    return EmitCmp(Result, Const(3), SPCC::ICC_E);
+  case SPCC::FCC_O  :
+    return EmitCmp(Result, Const(3), SPCC::ICC_NE);
+  case SPCC::FCC_LG :
+    // Less or greater only.  Masking Result with 3 would also accept
+    // unordered (3), which LG must reject.
+    return EmitCmp(LessOrGreaterBit(), Const(0), SPCC::ICC_NE);
+  case SPCC::FCC_UE :
+    // Unordered or equal.  Masking Result with 3 would accept only equal
+    // (0) and wrongly reject unordered (3).
+    return EmitCmp(LessOrGreaterBit(), Const(0), SPCC::ICC_E);
+  }
+}
+
+// Lowers an FP_EXTEND to f128 into the matching extension library call.
+// Only f64 and f32 sources are expected; anything else is unreachable.
+static SDValue
+LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG,
+                   const SparcTargetLowering &TLI) {
+  const EVT SrcVT = Op.getOperand(0).getValueType();
+
+  RTLIB::Libcall LC;
+  if (SrcVT == MVT::f64)
+    LC = RTLIB::FPEXT_F64_F128;
+  else if (SrcVT == MVT::f32)
+    LC = RTLIB::FPEXT_F32_F128;
+  else
+    llvm_unreachable("fpextend with non-float operand!");
+
+  return TLI.LowerF128Op(Op, DAG, TLI.getLibcallName(LC), 1);
+}
+
+// Lowers an FP_ROUND whose source is f128 into the matching rounding library
+// call.  Rounds from any other source type are returned untouched.
+static SDValue
+LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG,
+                  const SparcTargetLowering &TLI) {
+  // FP_ROUND on f64 and f32 are legal.
+  const bool SourceIsQuad = Op.getOperand(0).getValueType() == MVT::f128;
+  if (!SourceIsQuad)
+    return Op;
+
+  const EVT DstVT = Op.getValueType();
+
+  RTLIB::Libcall LC;
+  if (DstVT == MVT::f64)
+    LC = RTLIB::FPROUND_F128_F64;
+  else if (DstVT == MVT::f32)
+    LC = RTLIB::FPROUND_F128_F32;
+  else
+    llvm_unreachable("fpround to non-float!");
+
+  return TLI.LowerF128Op(Op, DAG, TLI.getLibcallName(LC), 1);
+}
+/* Custom lowering of FP_TO_SINT producing i32 or i64.
+ *
+ * - f128 source, and either no hardware quad support or an illegal result
+ *   type: becomes the FPTOSINT_F128_I32 / FPTOSINT_F128_I64 library call.
+ * - otherwise, illegal result type: returns an empty SDValue.
+ * - otherwise: converts inside an FP register (SPISD::FTOI yielding f32 for
+ *   i32, SPISD::FTOX yielding f64 for i64) and bitcasts to the integer type.
+ */
+static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::i32 || VT == MVT::i64);
+
+ // Expand f128 operations to fp128 abi calls.
+ if (Op.getOperand(0).getValueType() == MVT::f128
+ && (!hasHardQuad || !TLI.isTypeLegal(VT))) {
+ const char *libName = TLI.getLibcallName(VT == MVT::i32
+ ? RTLIB::FPTOSINT_F128_I32
+ : RTLIB::FPTOSINT_F128_I64);
+ return TLI.LowerF128Op(Op, DAG, libName, 1);
+ }
+
+ // Expand if the resulting type is illegal.
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
+ // Otherwise, Convert the fp value to integer in an FP register.
+ if (VT == MVT::i32)
+ Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
+ else
+ Op = DAG.getNode(SPISD::FTOX, dl, MVT::f64, Op.getOperand(0));
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+}
+/* Custom lowering of SINT_TO_FP from an i32 or i64 operand.
+ *
+ * - f128 result, and either no hardware quad support or an illegal operand
+ *   type: becomes the SINTTOFP_I32_F128 / SINTTOFP_I64_F128 library call.
+ * - otherwise, illegal operand type: returns an empty SDValue.
+ * - otherwise: bitcasts the integer into an FP value of the same width (f32
+ *   for i32, f64 for i64) and converts it with SPISD::ITOF / SPISD::XTOF.
+ */
+static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
+ SDLoc dl(Op);
+ EVT OpVT = Op.getOperand(0).getValueType();
+ assert(OpVT == MVT::i32 || (OpVT == MVT::i64));
+
+ EVT floatVT = (OpVT == MVT::i32) ? MVT::f32 : MVT::f64;
+
+ // Expand f128 operations to fp128 ABI calls.
+ if (Op.getValueType() == MVT::f128
+ && (!hasHardQuad || !TLI.isTypeLegal(OpVT))) {
+ const char *libName = TLI.getLibcallName(OpVT == MVT::i32
+ ? RTLIB::SINTTOFP_I32_F128
+ : RTLIB::SINTTOFP_I64_F128);
+ return TLI.LowerF128Op(Op, DAG, libName, 1);
+ }
+
+ // Expand if the operand type is illegal.
+ if (!TLI.isTypeLegal(OpVT))
+ return SDValue();
+
+ // Otherwise, Convert the int value to FP in an FP register.
+ SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0));
+ unsigned opcode = (OpVT == MVT::i32)? SPISD::ITOF : SPISD::XTOF;
+ return DAG.getNode(opcode, dl, Op.getValueType(), Tmp);
+}
+/* Custom lowering of FP_TO_UINT.
+ *
+ * Only an f128 source is handled here, and only when hardware quad support
+ * is missing or the result type is illegal; that case becomes the
+ * FPTOUINT_F128_I32 / FPTOUINT_F128_I64 library call.  Every other case
+ * returns an empty SDValue.
+ */
+static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
+ SDLoc dl(Op); // NOTE(review): not referenced anywhere below
+ EVT VT = Op.getValueType();
+
+ // Expand if it does not involve f128 or the target has support for
+ // quad floating point instructions and the resulting type is legal.
+ if (Op.getOperand(0).getValueType() != MVT::f128 ||
+ (hasHardQuad && TLI.isTypeLegal(VT)))
+ return SDValue();
+
+ assert(VT == MVT::i32 || VT == MVT::i64);
+
+ return TLI.LowerF128Op(Op, DAG,
+ TLI.getLibcallName(VT == MVT::i32
+ ? RTLIB::FPTOUINT_F128_I32
+ : RTLIB::FPTOUINT_F128_I64),
+ 1);
+}
+/* Custom lowering of UINT_TO_FP from an i32 or i64 operand.
+ *
+ * Only an f128 result is handled here, and only when hardware quad support
+ * is missing or the operand type is illegal; that case becomes the
+ * UINTTOFP_I32_F128 / UINTTOFP_I64_F128 library call.  Every other case
+ * returns an empty SDValue.
+ */
+static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
+ SDLoc dl(Op); // NOTE(review): not referenced anywhere below
+ EVT OpVT = Op.getOperand(0).getValueType();
+ assert(OpVT == MVT::i32 || OpVT == MVT::i64);
+
+ // Expand if it does not involve f128 or the target has support for
+ // quad floating point instructions and the operand type is legal.
+ if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
+ return SDValue();
+
+ return TLI.LowerF128Op(Op, DAG,
+ TLI.getLibcallName(OpVT == MVT::i32
+ ? RTLIB::UINTTOFP_I32_F128
+ : RTLIB::UINTTOFP_I64_F128),
+ 1);
+}
+/* Custom lowering of BR_CC.
+ *
+ * Operands read from Op: 0 = chain, 1 = condition code, 2 = LHS, 3 = RHS,
+ * 4 = branch destination.  A compare node producing glue is built and fed,
+ * together with the SPARC condition code, to the branch node:
+ *   - integer operands: CMPICC with BRICC (i32) or BRXCC (other widths);
+ *   - f128 operands without hardware quad support: LowerF128Compare, which
+ *     rewrites SPCC to an integer condition, with BRICC;
+ *   - other FP operands: CMPFCC with BRFCC.
+ * SPCC starts as ~0U ("not set"); LookThroughSetCC may fill it in, otherwise
+ * it is derived from CC.
+ */
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ SDLoc dl(Op);
+ unsigned Opc, SPCC = ~0U;
+
+ // If this is a br_cc of a "setcc", and if the setcc got lowered into
+ // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+ LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+ // Get the condition flag.
+ SDValue CompareFlag;
+ if (LHS.getValueType().isInteger()) {
+ CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
+ if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+ // 32-bit compares use the icc flags, 64-bit uses the xcc flags.
+ Opc = LHS.getValueType() == MVT::i32 ? SPISD::BRICC : SPISD::BRXCC;
+ } else {
+ if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG); // updates SPCC
+ Opc = SPISD::BRICC;
+ } else {
+ CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ Opc = SPISD::BRFCC;
+ }
+ }
+ return DAG.getNode(Opc, dl, MVT::Other, Chain, Dest,
+ DAG.getConstant(SPCC, dl, MVT::i32), CompareFlag);
+}
+/* Custom lowering of SELECT_CC.
+ *
+ * Operands read from Op: 0 = LHS, 1 = RHS, 2 = value if true, 3 = value if
+ * false, 4 = condition code.  A compare node producing glue is built and
+ * fed, together with the SPARC condition code, to the select node:
+ *   - integer operands: CMPICC with SELECT_ICC (i32) or SELECT_XCC (other
+ *     widths);
+ *   - f128 operands without hardware quad support: LowerF128Compare, which
+ *     rewrites SPCC to an integer condition, with SELECT_ICC;
+ *   - other FP operands: CMPFCC with SELECT_FCC.
+ * The result has the type of the true value.
+ */
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+ SDLoc dl(Op);
+ unsigned Opc, SPCC = ~0U;
+
+ // If this is a select_cc of a "setcc", and if the setcc got lowered into
+ // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+ LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+ SDValue CompareFlag;
+ if (LHS.getValueType().isInteger()) {
+ CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
+ Opc = LHS.getValueType() == MVT::i32 ?
+ SPISD::SELECT_ICC : SPISD::SELECT_XCC;
+ if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+ } else {
+ if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG); // updates SPCC
+ Opc = SPISD::SELECT_ICC;
+ } else {
+ CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+ Opc = SPISD::SELECT_FCC;
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ }
+ }
+ return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
+ DAG.getConstant(SPCC, dl, MVT::i32), CompareFlag);
+}
+
+// Rewrites the generic setjmp node as SPISD::EH_SJLJ_SETJMP, forwarding its
+// two operands unchanged.  The new node yields an i32 value plus a chain.
+// TLI is not used.
+SDValue SparcTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG,
+                                                 const SparcTargetLowering &TLI) const {
+  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
+  SDValue First = Op.getOperand(0);
+  SDValue Second = Op.getOperand(1);
+  return DAG.getNode(SPISD::EH_SJLJ_SETJMP, SDLoc(Op), ResultTys, First,
+                     Second);
+}
+
+// Rewrites the generic longjmp node as SPISD::EH_SJLJ_LONGJMP, forwarding its
+// two operands unchanged.  The new node yields only a chain.  TLI is not
+// used.
+SDValue SparcTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG,
+                                                  const SparcTargetLowering &TLI) const {
+  SDValue First = Op.getOperand(0);
+  SDValue Second = Op.getOperand(1);
+  return DAG.getNode(SPISD::EH_SJLJ_LONGJMP, SDLoc(Op), MVT::Other, First,
+                     Second);
+}
+/* Custom lowering of VASTART.
+ *
+ * Computes SP::I6 + VarArgsFrameOffset (taken from the function's
+ * SparcMachineFunctionInfo) and stores that address to the memory location
+ * given by operand 1, chained on operand 0.  Operand 2 supplies the source
+ * value recorded in the store's pointer info.  The function is also marked
+ * as having its frame address taken.
+ */
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+
+ // Need frame address to find the address of VarArgsFrameIndex.
+ MF.getFrameInfo().setFrameAddressIsTaken(true);
+
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDLoc DL(Op);
+ SDValue Offset =
+ DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(SP::I6, PtrVT),
+ DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+/* Custom lowering of VAARG.
+ *
+ * Operands read from the node: 0 = chain, 1 = pointer to the va_list,
+ * 2 = source value for the pointer info.  The current argument pointer is
+ * loaded from the va_list, advanced by the size of the requested type in
+ * bytes and stored back; the argument itself is then loaded from the old
+ * pointer with an alignment of min(pointer size, value size) in bytes.
+ */
+static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ EVT VT = Node->getValueType(0);
+ SDValue InChain = Node->getOperand(0);
+ SDValue VAListPtr = Node->getOperand(1);
+ EVT PtrVT = VAListPtr.getValueType();
+ const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+ SDLoc DL(Node);
+ SDValue VAList =
+ DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
+ // Increment the pointer, VAList, to the next vaarg.
+ SDValue NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getIntPtrConstant(VT.getSizeInBits()/8,
+ DL));
+ // Store the incremented VAList to the legalized pointer.
+ InChain = DAG.getStore(VAList.getValue(1), DL, NextPtr, VAListPtr,
+ MachinePointerInfo(SV));
+ // Load the actual argument out of the pointer VAList.
+ // We can't count on greater alignment than the word size.
+ return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
+ std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
+}
+/* Custom lowering of DYNAMIC_STACKALLOC.
+ *
+ * Operands read from Op: 0 = chain, 1 = allocation size, 2 = requested
+ * alignment (a constant).  A requested alignment above the stack alignment
+ * is a fatal error.  Otherwise SP::O6 is decremented by the size (plus 8 on
+ * 32-bit subtargets) and the returned pointer is the new stack pointer plus
+ * the register spill area (128 on 64-bit, 96 on 32-bit) plus the stack
+ * pointer bias.  Returns the merged pair { pointer, output chain }.
+ */
+static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+ const SparcSubtarget *Subtarget) {
+ SDValue Chain = Op.getOperand(0); // Legalize the chain.
+ SDValue Size = Op.getOperand(1); // Legalize the size.
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned StackAlign = Subtarget->getFrameLowering()->getStackAlignment();
+ EVT VT = Size->getValueType(0);
+ SDLoc dl(Op);
+
+ // TODO: implement over-aligned alloca. (Note: also implies
+ // supporting overaligned function frames + dynamic
+ // allocations, at all, which currently isn't supported)
+ if (Align > StackAlign) {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ report_fatal_error("Function \"" + Twine(MF.getName()) + "\": "
+ "over-aligned dynamic alloca not supported.");
+ }
+
+ // The resultant pointer needs to be above the register spill area
+ // at the bottom of the stack.
+ unsigned regSpillArea;
+ if (Subtarget->is64Bit()) {
+ regSpillArea = 128;
+ } else {
+ // On Sparc32, the size of the spill area is 92. Unfortunately,
+ // that's only 4-byte aligned, not 8-byte aligned (the stack
+ // pointer is 8-byte aligned). So, if the user asked for an 8-byte
+ // aligned dynamic allocation, we actually need to add 96 to the
+ // bottom of the stack, instead of 92, to ensure 8-byte alignment.
+
+ // That also means adding 4 to the size of the allocation --
+ // before applying the 8-byte rounding. Unfortunately, the
+ // value we get here has already had rounding applied. So, we need
+ // to add 8, instead, wasting a bit more memory.
+
+ // Further, this only actually needs to be done if the required
+ // alignment is > 4, but, we've lost that info by this point, too,
+ // so we always apply it.
+
+ // (An alternative approach would be to always reserve 96 bytes
+ // instead of the required 92, but then we'd waste 4 extra bytes
+ // in every frame, not just those with dynamic stack allocations)
+
+ // TODO: modify code in SelectionDAGBuilder to make this less sad.
+
+ Size = DAG.getNode(ISD::ADD, dl, VT, Size,
+ DAG.getConstant(8, dl, VT));
+ regSpillArea = 96;
+ }
+
+ unsigned SPReg = SP::O6;
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ SDValue NewSP = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ Chain = DAG.getCopyToReg(SP.getValue(1), dl, SPReg, NewSP); // Output chain
+
+ regSpillArea += Subtarget->getStackPointerBias();
+
+ SDValue NewVal = DAG.getNode(ISD::ADD, dl, VT, NewSP,
+ DAG.getConstant(regSpillArea, dl, VT));
+ SDValue Ops[2] = { NewVal, Chain };
+ return DAG.getMergeValues(Ops, dl);
+}
+
+
+// Builds an SPISD::FLUSHW node hanging off the DAG entry node and returns it;
+// the node's only result is a chain.  Op supplies the debug location.
+static SDValue getFLUSHW(SDValue Op, SelectionDAG &DAG) {
+  return DAG.getNode(SPISD::FLUSHW, SDLoc(Op), MVT::Other,
+                     DAG.getEntryNode());
+}
+/* Returns the frame address `depth` frames above the current function.
+ *
+ * depth == 0: a copy of SP::I6, with the stack pointer bias added on 64-bit
+ * subtargets.
+ *
+ * depth > 0: a FLUSHW node is emitted first, then the saved frame pointer is
+ * loaded `depth` times from the current frame address plus a fixed offset
+ * (56 on 32-bit, stack bias + 112 on 64-bit); on 64-bit subtargets the bias
+ * is added to the final value.
+ *
+ * The function is marked as having its frame address taken in either case.
+ */
+static SDValue getFRAMEADDR(uint64_t depth, SDValue Op, SelectionDAG &DAG,
+ const SparcSubtarget *Subtarget) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ unsigned FrameReg = SP::I6;
+ unsigned stackBias = Subtarget->getStackPointerBias();
+
+ SDValue FrameAddr;
+
+ if (depth == 0) {
+ FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+ if (Subtarget->is64Bit())
+ FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+ DAG.getIntPtrConstant(stackBias, dl));
+ return FrameAddr;
+ }
+
+ // flush first to make sure the windowed registers' values are in stack
+ SDValue Chain = getFLUSHW(Op, DAG);
+ FrameAddr = DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
+
+ unsigned Offset = (Subtarget->is64Bit()) ? (stackBias + 112) : 56;
+
+ while (depth--) { // one load per frame walked
+ SDValue Ptr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+ DAG.getIntPtrConstant(Offset, dl));
+ FrameAddr = DAG.getLoad(VT, dl, Chain, Ptr, MachinePointerInfo());
+ }
+ if (Subtarget->is64Bit())
+ FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+ DAG.getIntPtrConstant(stackBias, dl));
+ return FrameAddr;
+}
+
+
+// Custom lowering of FRAMEADDR: operand 0 is the constant frame depth, and
+// the actual work is delegated to getFRAMEADDR.
+static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
+                              const SparcSubtarget *Subtarget) {
+  return getFRAMEADDR(Op.getConstantOperandVal(0), Op, DAG, Subtarget);
+}
+/* Custom lowering of RETURNADDR.
+ *
+ * Marks the function as having its return address taken, then returns an
+ * empty SDValue if verifyReturnAddressArgumentIsConstant reports a problem.
+ *
+ * depth == 0: SP::I7 is added as a live-in and copied out.
+ *
+ * depth > 0: the frame address of frame depth - 1 is obtained from
+ * getFRAMEADDR and the return address is loaded from it at a fixed offset
+ * (120 on 64-bit, 60 on 32-bit).
+ */
+static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ const SparcSubtarget *Subtarget) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setReturnAddressIsTaken(true);
+
+ if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ uint64_t depth = Op.getConstantOperandVal(0);
+
+ SDValue RetAddr;
+ if (depth == 0) {
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ unsigned RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT));
+ RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT);
+ return RetAddr;
+ }
+
+ // Need frame address to find return address of the caller.
+ SDValue FrameAddr = getFRAMEADDR(depth - 1, Op, DAG, Subtarget);
+
+ unsigned Offset = (Subtarget->is64Bit()) ? 120 : 60;
+ SDValue Ptr = DAG.getNode(ISD::ADD,
+ dl, VT,
+ FrameAddr,
+ DAG.getIntPtrConstant(Offset, dl));
+ RetAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+
+ return RetAddr;
+}
+/* Applies FNEG or FABS to an f64 value by operating on one f32 half.
+ *
+ * SrcReg64 is split into its sub_even and sub_odd f32 sub-registers, the
+ * operation is applied to the half holding the sign bit (sub_even on
+ * big-endian, sub_odd on little-endian), and the two halves are inserted
+ * into a fresh IMPLICIT_DEF f64 value, which is returned.
+ */
+static SDValue LowerF64Op(SDValue SrcReg64, const SDLoc &dl, SelectionDAG &DAG,
+ unsigned opcode) {
+ assert(SrcReg64.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
+ assert(opcode == ISD::FNEG || opcode == ISD::FABS);
+
+ // Lower fneg/fabs on f64 to fneg/fabs on f32.
+ // fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
+ // fabs f64 => fabs f32:sub_even, fmov f32:sub_odd.
+
+ // Note: in little-endian, the halves of the floating-point value are
+ // stored in the registers in the opposite order, so the subreg with the
+ // sign bit is the highest-numbered (odd), rather than the
+ // lowest-numbered (even).
+
+ SDValue Hi32 = DAG.getTargetExtractSubreg(SP::sub_even, dl, MVT::f32,
+ SrcReg64);
+ SDValue Lo32 = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
+ SrcReg64);
+
+ if (DAG.getDataLayout().isLittleEndian())
+ Lo32 = DAG.getNode(opcode, dl, MVT::f32, Lo32);
+ else
+ Hi32 = DAG.getNode(opcode, dl, MVT::f32, Hi32);
+
+ SDValue DstReg64 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ dl, MVT::f64), 0);
+ DstReg64 = DAG.getTargetInsertSubreg(SP::sub_even, dl, MVT::f64,
+ DstReg64, Hi32);
+ DstReg64 = DAG.getTargetInsertSubreg(SP::sub_odd, dl, MVT::f64,
+ DstReg64, Lo32);
+ return DstReg64;
+}
+
+// Lower a f128 load into two f64 loads.
+//
+// The first f64 is loaded from the base pointer and the second from base + 8;
+// both hang off the original load's chain.  The halves are inserted into an
+// IMPLICIT_DEF f128 as sub_even64 / sub_odd64.  Returns the merged pair
+// { f128 value, token factor of the two load chains }.
+static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
+{
+  SDLoc dl(Op);
+  // cast<> already asserts on a node of the wrong kind, so only the offset
+  // needs an explicit check.
+  LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
+  assert(LdNode->getOffset().isUndef() && "Unexpected node type");
+
+  // Each half is an 8-byte access, so never claim more than 8-byte alignment.
+  unsigned alignment = LdNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  SDValue Hi64 =
+      DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LdNode->getBasePtr(),
+                  LdNode->getPointerInfo(), alignment);
+  EVT addrVT = LdNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              LdNode->getBasePtr(),
+                              DAG.getConstant(8, dl, addrVT));
+  // The second half reads bytes 8..15 of the object, so its pointer info must
+  // carry that offset; reusing the base info would describe the wrong bytes
+  // to anything that reasons about memory overlap.
+  SDValue Lo64 = DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LoPtr,
+                             LdNode->getPointerInfo().getWithOffset(8),
+                             alignment);
+
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);
+
+  SDNode *InFP128 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                       dl, MVT::f128);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Hi64,
+                               SubRegEven);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Lo64,
+                               SubRegOdd);
+  SDValue OutChains[2] = { SDValue(Hi64.getNode(), 1),
+                           SDValue(Lo64.getNode(), 1) };
+  SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+  SDValue Ops[2] = {SDValue(InFP128,0), OutChain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+// Custom lowering of LOAD: a load whose memory type is f128 is handed to
+// LowerF128Load; every other load is returned unchanged.
+static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG)
+{
+  const bool IsQuadLoad =
+      cast<LoadSDNode>(Op.getNode())->getMemoryVT() == MVT::f128;
+  return IsQuadLoad ? LowerF128Load(Op, DAG) : Op;
+}
+
+// Lower a f128 store into two f64 stores.
+//
+// The stored value is split into its sub_even64 / sub_odd64 halves, which are
+// written to the base pointer and to base + 8 respectively; both stores hang
+// off the original store's chain.  Returns the token factor of the two store
+// chains.
+static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  // cast<> already asserts on a node of the wrong kind, so only the offset
+  // needs an explicit check.
+  StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
+  assert(StNode->getOffset().isUndef() && "Unexpected node type");
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);
+
+  SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegEven);
+  SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegOdd);
+
+  // Each half is an 8-byte access, so never claim more than 8-byte alignment.
+  unsigned alignment = StNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  // Carry the original store's pointer info over to both halves (offset by 8
+  // for the second one) instead of discarding it, mirroring LowerF128Load.
+  SDValue OutChains[2];
+  OutChains[0] =
+      DAG.getStore(StNode->getChain(), dl, SDValue(Hi64, 0),
+                   StNode->getBasePtr(), StNode->getPointerInfo(), alignment);
+  EVT addrVT = StNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              StNode->getBasePtr(),
+                              DAG.getConstant(8, dl, addrVT));
+  OutChains[1] = DAG.getStore(StNode->getChain(), dl, SDValue(Lo64, 0), LoPtr,
+                              StNode->getPointerInfo().getWithOffset(8),
+                              alignment);
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+/* Custom lowering of STORE.
+ *
+ * - memory type f128: handed to LowerF128Store.
+ * - memory type i64: the value is bitcast to v2i32 and stored with the
+ *   original base pointer, pointer info, alignment, memory-operand flags and
+ *   AA info.
+ * - anything else: returns an empty SDValue.
+ */
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG)
+{
+ SDLoc dl(Op);
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+
+ EVT MemVT = St->getMemoryVT();
+ if (MemVT == MVT::f128)
+ return LowerF128Store(Op, DAG);
+
+ if (MemVT == MVT::i64) {
+ // Custom handling for i64 stores: turn it into a bitcast and a
+ // v2i32 store.
+ SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue());
+ SDValue Chain = DAG.getStore(
+ St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags(), St->getAAInfo());
+ return Chain;
+ }
+
+ return SDValue();
+}
+/* Custom lowering of FNEG and FABS.
+ *
+ * - f64: delegated to LowerF64Op.
+ * - types other than f64 and f128: returned unchanged.
+ * - f128: the value is split into its sub_even64 / sub_odd64 f64 halves and
+ *   the operation is applied to one of them (sub_even64 on big-endian,
+ *   sub_odd64 on little-endian).  With isV9 the f64 operation is emitted
+ *   directly; otherwise it goes through LowerF64Op.  The halves are then
+ *   inserted into a fresh IMPLICIT_DEF f128 value, which is returned.
+ */
+static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
+ assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS)
+ && "invalid opcode");
+
+ SDLoc dl(Op);
+
+ if (Op.getValueType() == MVT::f64)
+ return LowerF64Op(Op.getOperand(0), dl, DAG, Op.getOpcode());
+ if (Op.getValueType() != MVT::f128)
+ return Op;
+
+ // Lower fabs/fneg on f128 to fabs/fneg on f64
+ // fabs/fneg f128 => fabs/fneg f64:sub_even64, fmov f64:sub_odd64
+ // (As with LowerF64Op, on little-endian, we need to negate the odd
+ // subreg)
+
+ SDValue SrcReg128 = Op.getOperand(0);
+ SDValue Hi64 = DAG.getTargetExtractSubreg(SP::sub_even64, dl, MVT::f64,
+ SrcReg128);
+ SDValue Lo64 = DAG.getTargetExtractSubreg(SP::sub_odd64, dl, MVT::f64,
+ SrcReg128);
+
+ if (DAG.getDataLayout().isLittleEndian()) {
+ if (isV9)
+ Lo64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Lo64);
+ else
+ Lo64 = LowerF64Op(Lo64, dl, DAG, Op.getOpcode());
+ } else {
+ if (isV9)
+ Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
+ else
+ Hi64 = LowerF64Op(Hi64, dl, DAG, Op.getOpcode());
+ }
+
+ SDValue DstReg128 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ dl, MVT::f128), 0);
+ DstReg128 = DAG.getTargetInsertSubreg(SP::sub_even64, dl, MVT::f128,
+ DstReg128, Hi64);
+ DstReg128 = DAG.getTargetInsertSubreg(SP::sub_odd64, dl, MVT::f128,
+ DstReg128, Lo64);
+ return DstReg128;
+}
+/* Custom lowering of ADDC / ADDE / SUBC / SUBE.
+ *
+ * Nodes whose result type is not i64 are returned unchanged.  For i64, both
+ * operands are split into 32-bit halves (truncate, and shift right by 32 then
+ * truncate).  The low halves are combined with the original opcode, passing
+ * operand 2 along for ADDE / SUBE; the high halves are combined with ADDE or
+ * SUBE, consuming the glue produced by the low-half node.  The two i32
+ * results are zero-extended and recombined into an i64.  Returns the merged
+ * pair { i64 result, glue of the high-half node }.
+ */
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+
+ if (Op.getValueType() != MVT::i64)
+ return Op;
+
+ SDLoc dl(Op);
+ SDValue Src1 = Op.getOperand(0);
+ SDValue Src1Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1);
+ SDValue Src1Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src1,
+ DAG.getConstant(32, dl, MVT::i64));
+ Src1Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1Hi);
+
+ SDValue Src2 = Op.getOperand(1);
+ SDValue Src2Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2);
+ SDValue Src2Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src2,
+ DAG.getConstant(32, dl, MVT::i64));
+ Src2Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2Hi);
+
+
+ bool hasChain = false; // set for ADDE/SUBE, which forward operand 2 below
+ unsigned hiOpc = Op.getOpcode();
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case ISD::ADDC: hiOpc = ISD::ADDE; break;
+ case ISD::ADDE: hasChain = true; break;
+ case ISD::SUBC: hiOpc = ISD::SUBE; break;
+ case ISD::SUBE: hasChain = true; break;
+ }
+ SDValue Lo;
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
+ if (hasChain) {
+ Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo,
+ Op.getOperand(2));
+ } else {
+ Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo);
+ }
+ SDValue Hi = DAG.getNode(hiOpc, dl, VTs, Src1Hi, Src2Hi, Lo.getValue(1));
+ SDValue Carry = Hi.getValue(1);
+
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo);
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi);
+ Hi = DAG.getNode(ISD::SHL, dl, MVT::i64, Hi,
+ DAG.getConstant(32, dl, MVT::i64));
+
+ SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo);
+ SDValue Ops[2] = { Dst, Carry };
+ return DAG.getMergeValues(Ops, dl);
+}
+
+// Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode()
+// in LegalizeDAG.cpp except the order of arguments to the library function.
+//
+// Only i64 operands are handled; other types are returned unchanged.  Both
+// operands are widened to 128 bits as (high word, low word) pairs and
+// multiplied with the MUL_I128 library call.  The low 64 bits of the product
+// are the result; the overflow flag is derived from the high 64 bits.
+// Returns the merged pair { i64 product, i32 overflow flag }.
+static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
+                                const SparcTargetLowering &TLI)
+{
+  unsigned opcode = Op.getOpcode();
+  assert((opcode == ISD::UMULO || opcode == ISD::SMULO) && "Invalid Opcode.");
+
+  bool isSigned = (opcode == ISD::SMULO);
+  EVT VT = MVT::i64;
+  EVT WideVT = MVT::i128;
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+
+  if (LHS.getValueType() != VT)
+    return Op;
+
+  SDValue ShiftAmt = DAG.getConstant(63, dl, VT);
+
+  SDValue RHS = Op.getOperand(1);
+
+  // Widen each operand to 128 bits.  Signed operands are sign-extended (the
+  // high word is the sign bit replicated); unsigned operands must be
+  // zero-extended, otherwise an operand with bit 63 set is treated as
+  // negative and e.g. 2^63 * 1 is reported as an overflow.
+  SDValue HiLHS, HiRHS;
+  if (isSigned) {
+    HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt);
+    HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
+  } else {
+    HiLHS = DAG.getConstant(0, dl, VT);
+    HiRHS = DAG.getConstant(0, dl, MVT::i64);
+  }
+  SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
+
+  SDValue MulResult = TLI.makeLibCall(DAG,
+                                      RTLIB::MUL_I128, WideVT,
+                                      Args, isSigned, dl).first;
+  SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
+                                   MulResult, DAG.getIntPtrConstant(0, dl));
+  SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
+                                MulResult, DAG.getIntPtrConstant(1, dl));
+  if (isSigned) {
+    // No signed overflow iff the top half is the sign extension of the
+    // bottom half.
+    SDValue Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
+    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, Tmp1, ISD::SETNE);
+  } else {
+    // No unsigned overflow iff the top half is zero.
+    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, DAG.getConstant(0, dl, VT),
+                           ISD::SETNE);
+  }
+  // MulResult is a node with an illegal type. Because such things are not
+  // generally permitted during this phase of legalization, ensure that
+  // nothing is left using the node. The above EXTRACT_ELEMENT nodes should have
+  // been folded.
+  assert(MulResult->use_empty() && "Illegally typed node still in use!");
+
+  SDValue Ops[2] = { BottomHalf, TopHalf } ;
+  return DAG.getMergeValues(Ops, dl);
+}
+
+// Custom lowering of ATOMIC_LOAD / ATOMIC_STORE.  Monotonic load/stores are
+// legal and returned as-is; an ordering stronger than monotonic yields an
+// empty SDValue so that the access is expanded with a fence.
+static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
+  const auto Ordering = cast<AtomicSDNode>(Op)->getOrdering();
+  return isStrongerThanMonotonic(Ordering) ? SDValue() : Op;
+}
+/* Custom lowering of INTRINSIC_WO_CHAIN.
+ *
+ * Operand 0 holds the intrinsic ID.  Intrinsic::thread_pointer is lowered to
+ * a read of register SP::G7 at pointer width; every other intrinsic returns
+ * an empty SDValue.
+ */
+SDValue SparcTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc dl(Op); // NOTE(review): not referenced anywhere below
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::thread_pointer: {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ return DAG.getRegister(SP::G7, PtrVT);
+ }
+ }
+}
+/* Entry point for custom lowering: dispatches on the opcode of Op to the
+ * matching Lower* routine.
+ *
+ * FADD / FSUB / FMUL / FDIV / FSQRT go straight to LowerF128Op with the
+ * corresponding *_F128 library call name and operand count.  An opcode that
+ * is not listed hits llvm_unreachable.
+ */
+SDValue SparcTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+
+ bool hasHardQuad = Subtarget->hasHardQuad();
+ bool isV9 = Subtarget->isV9();
+
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Should not custom lower this!");
+
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG, *this,
+ Subtarget);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG,
+ Subtarget);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG, *this);
+ case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG, *this);
+ case ISD::VASTART: return LowerVASTART(Op, DAG, *this);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
+ Subtarget);
+
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
+ case ISD::FADD: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::ADD_F128), 2);
+ case ISD::FSUB: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::SUB_F128), 2);
+ case ISD::FMUL: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::MUL_F128), 2);
+ case ISD::FDIV: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::DIV_F128), 2);
+ case ISD::FSQRT: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::SQRT_F128),1);
+ case ISD::FABS:
+ case ISD::FNEG: return LowerFNEGorFABS(Op, DAG, isV9);
+ case ISD::FP_EXTEND: return LowerF128_FPEXTEND(Op, DAG, *this);
+ case ISD::FP_ROUND: return LowerF128_FPROUND(Op, DAG, *this);
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::UMULO:
+ case ISD::SMULO: return LowerUMULO_SMULO(Op, DAG, *this);
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE: return LowerATOMIC_LOAD_STORE(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ }
+}
+
+/// Dispatch a pseudo instruction that needs custom insertion to the helper
+/// that expands it: the SELECT_CC pseudos go to expandSelectCC (with the
+/// branch opcode matching the condition-code kind), and the SjLj
+/// setjmp/longjmp pseudos go to their emit helpers. Returns whatever block
+/// the selected helper returns. Any other opcode is a programming error.
+MachineBasicBlock *
+SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                 MachineBasicBlock *BB) const {
+  const unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  // Selects on the integer condition codes are expanded with BCOND.
+  case SP::SELECT_CC_Int_ICC:
+  case SP::SELECT_CC_FP_ICC:
+  case SP::SELECT_CC_DFP_ICC:
+  case SP::SELECT_CC_QFP_ICC:
+    return expandSelectCC(MI, BB, SP::BCOND);
+
+  // Selects on the floating-point condition codes are expanded with FBCOND.
+  case SP::SELECT_CC_Int_FCC:
+  case SP::SELECT_CC_FP_FCC:
+  case SP::SELECT_CC_DFP_FCC:
+  case SP::SELECT_CC_QFP_FCC:
+    return expandSelectCC(MI, BB, SP::FBCOND);
+
+  // Builtin setjmp pseudos.
+  case SP::EH_SJLJ_SETJMP32ri:
+  case SP::EH_SJLJ_SETJMP32rr:
+    return emitEHSjLjSetJmp(MI, BB);
+
+  // Builtin longjmp pseudos.
+  case SP::EH_SJLJ_LONGJMP32rr:
+  case SP::EH_SJLJ_LONGJMP32ri:
+    return emitEHSjLjLongJmp(MI, BB);
+
+  default:
+    break;
+  }
+  llvm_unreachable("Unknown SELECT_CC!");
+}
+
+/// Expand a SELECT_CC pseudo into an explicit diamond of basic blocks.
+///
+/// The block holding MI is split after MI: everything that followed MI, plus
+/// the block's successor edges, moves into a new sinkMBB. The original block
+/// then ends in a BROpcode branch to sinkMBB using the condition code taken
+/// from operand 3, and otherwise falls through an empty copy0MBB. A PHI at the
+/// top of sinkMBB defines operand 0 from operand 1 (arriving from the original
+/// block) or operand 2 (arriving from copy0MBB). MI is erased.
+///
+/// \param MI       the SELECT_CC pseudo to expand.
+/// \param BB       the block containing MI.
+/// \param BROpcode the conditional branch opcode to emit.
+/// \returns sinkMBB, the block that now holds the remainder of BB.
+MachineBasicBlock *
+SparcTargetLowering::expandSelectCC(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned BROpcode) const {
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+ unsigned CC = (SPCC::CondCodes)MI.getOperand(3).getImm();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // [f]bCC sinkMBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ // Both new blocks are inserted right after BB, in the order copy0MBB, sinkMBB.
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // Conditional branch straight to sinkMBB; the fallthrough path is copy0MBB.
+ BuildMI(BB, dl, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(*BB, BB->begin(), dl, TII.get(SP::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+/// Expand an EH_SJLJ_LONGJMP pseudo in place (32-bit pointers only).
+///
+/// Operand 0 of MI is the register holding the jump buffer address. The
+/// sequence emitted before MI is:
+///   - a TRAPri with immediate 3 and condition ICC_A,
+///   - I6 <- buf[0],
+///   - JmpLoc <- buf[RegSize],
+///   - O6 <- buf[2 * RegSize],
+///   - I7 <- buf[3 * RegSize],
+///   - a JMPLrr through JmpLoc.
+/// MI is then erased.
+///
+/// \param MI  the longjmp pseudo to expand.
+/// \param MBB the block containing MI.
+/// \returns MBB; no new blocks are created.
+MachineBasicBlock *
+SparcTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
+                                       MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  unsigned RegSize = PVT.getStoreSize();
+  assert(PVT == MVT::i32 && "Invalid Pointer Size!");
+
+  unsigned Buf = MI.getOperand(0).getReg();
+  unsigned JmpLoc = MRI.createVirtualRegister(&SP::IntRegsRegClass);
+
+  // Named so that they do not shadow the SP:: namespace used for opcodes and
+  // registers throughout this function.
+  const unsigned FramePtrReg = SP::I6;
+  const unsigned StackPtrReg = SP::O6;
+  const unsigned RetAddrReg = SP::I7;
+
+  // TO DO: If we do 64-bit handling, this perhaps should be FLUSHW, not TA 3
+  BuildMI(*MBB, MI, DL, TII->get(SP::TRAPri), SP::G0)
+      .addImm(3)
+      .addImm(SPCC::ICC_A);
+
+  // Instruction to restore FP. The first LDri operand is the load's result,
+  // so it is flagged as a definition, exactly as for JmpLoc below.
+  BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+      .addReg(FramePtrReg, RegState::Define)
+      .addReg(Buf)
+      .addImm(0);
+
+  // Instruction to load jmp location
+  BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+      .addReg(JmpLoc, RegState::Define)
+      .addReg(Buf)
+      .addImm(RegSize);
+
+  // Instruction to restore SP
+  BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+      .addReg(StackPtrReg, RegState::Define)
+      .addReg(Buf)
+      .addImm(2 * RegSize);
+
+  // Instruction to restore I7. This is the last read of Buf, hence the kill.
+  BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+      .addReg(RetAddrReg, RegState::Define)
+      .addReg(Buf, RegState::Kill)
+      .addImm(3 * RegSize);
+
+  // Jump to JmpLoc
+  BuildMI(*MBB, MI, DL, TII->get(SP::JMPLrr))
+      .addReg(SP::G0)
+      .addReg(JmpLoc, RegState::Kill)
+      .addReg(SP::G0);
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+/// Expand an EH_SJLJ_SETJMP pseudo (32-bit pointers only).
+///
+/// Operand 0 of MI is the result register and operand 1 is the register
+/// holding the jump buffer address. The block containing MI is split into
+/// four blocks (thisMBB, mainMBB, restoreMBB, sinkMBB) as laid out in the
+/// diagram inside the body. The stores fill the buffer with I6, the address
+/// of restoreMBB, O6 and I7 at offsets 0, RegSize, 2 * RegSize and
+/// 3 * RegSize. The result is 0 when reached through mainMBB and 1 when
+/// reached through restoreMBB. MI is erased.
+///
+/// \param MI  the setjmp pseudo to expand.
+/// \param MBB the block containing MI.
+/// \returns sinkMBB, which receives the instructions that followed MI.
+MachineBasicBlock *
+SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB;
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ unsigned RegSize = PVT.getStoreSize();
+ assert(PVT == MVT::i32 && "Invalid Pointer Size!");
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ // One virtual register per incoming edge of the final PHI.
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+ // For v = setjmp(buf), we generate
+ //
+ // thisMBB:
+ // buf[0] = FP
+ // buf[RegSize] = restoreMBB <-- takes address of restoreMBB
+ // buf[RegSize * 2] = O6
+ // buf[RegSize * 3] = I7
+ // Ensure restoreMBB remains in the relocations list (done using a bn instruction)
+ // b mainMBB
+ //
+ // mainMBB:
+ // v_main = 0
+ // b sinkMBB
+ //
+ // restoreMBB:
+ // v_restore = 1
+ // --fall through--
+ //
+ // sinkMBB:
+ // v = phi(main, restore)
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator It = ++MBB->getIterator();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+
+ // The new blocks follow MBB in the order mainMBB, restoreMBB, sinkMBB, so
+ // restoreMBB can fall through into sinkMBB.
+ MF->insert(It, mainMBB);
+ MF->insert(It, restoreMBB);
+ MF->insert(It, sinkMBB);
+ restoreMBB->setHasAddressTaken();
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)),
+ MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ unsigned LabelReg = MRI.createVirtualRegister(&SP::IntRegsRegClass);
+ unsigned LabelReg2 = MRI.createVirtualRegister(&SP::IntRegsRegClass);
+ unsigned BufReg = MI.getOperand(1).getReg();
+
+ // Instruction to store FP
+ const unsigned FP = SP::I6;
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+ .addReg(BufReg)
+ .addImm(0)
+ .addReg(FP);
+
+ // Instructions to store jmp location
+ // The address of restoreMBB is built in two steps: SETHIi with the HI part,
+ // then ORri with the LO part.
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::SETHIi))
+ .addReg(LabelReg, RegState::Define)
+ .addMBB(restoreMBB, SparcMCExpr::VK_Sparc_HI);
+
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::ORri))
+ .addReg(LabelReg2, RegState::Define)
+ .addReg(LabelReg, RegState::Kill)
+ .addMBB(restoreMBB, SparcMCExpr::VK_Sparc_LO);
+
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+ .addReg(BufReg)
+ .addImm(RegSize)
+ .addReg(LabelReg2, RegState::Kill);
+
+ // Instruction to store SP
+ const unsigned SP = SP::O6;
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+ .addReg(BufReg)
+ .addImm(2 * RegSize)
+ .addReg(SP);
+
+ // Instruction to store I7
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+ .addReg(BufReg)
+ .addImm(3 * RegSize)
+ .addReg(SP::I7);
+
+
+ // FIXME: This next instruction ensures that the restoreMBB block address remains
+ // valid through optimization passes and serves no other purpose. The ICC_N ensures
+ // that the branch is never taken. This commented-out code here was an alternative
+ // attempt to achieve this which brought myriad problems.
+ //MIB = BuildMI(thisMBB, DL, TII->get(SP::EH_SjLj_Setup)).addMBB(restoreMBB, SparcMCExpr::VK_Sparc_None);
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::BCOND))
+ .addMBB(restoreMBB)
+ .addImm(SPCC::ICC_N);
+
+ // Branch with ICC_A to mainMBB.
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::BCOND))
+ .addMBB(mainMBB)
+ .addImm(SPCC::ICC_A);
+
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(restoreMBB);
+
+
+ // mainMBB:
+ // mainDstReg = G0 | G0, i.e. the 0 result shown as v_main in the diagram.
+ MIB = BuildMI(mainMBB, DL, TII->get(SP::ORrr))
+ .addReg(mainDstReg, RegState::Define)
+ .addReg(SP::G0)
+ .addReg(SP::G0);
+ MIB = BuildMI(mainMBB, DL, TII->get(SP::BCOND)).addMBB(sinkMBB).addImm(SPCC::ICC_A);
+
+ mainMBB->addSuccessor(sinkMBB);
+
+
+ // restoreMBB:
+ // restoreDstReg = G0 | 1, i.e. the 1 result shown as v_restore in the diagram.
+ MIB = BuildMI(restoreMBB, DL, TII->get(SP::ORri))
+ .addReg(restoreDstReg, RegState::Define)
+ .addReg(SP::G0)
+ .addImm(1);
+ //MIB = BuildMI(restoreMBB, DL, TII->get(SP::BCOND)).addMBB(sinkMBB).addImm(SPCC::ICC_A);
+ restoreMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ MIB = BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(SP::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(restoreDstReg).addMBB(restoreMBB);
+
+ MI.eraseFromParent();
+ return sinkMBB;
+}
+
+//===----------------------------------------------------------------------===//
+// Sparc Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Classify an inline-asm constraint for this target.
+/// The single-letter constraint 'r' is a register class and 'I' (SIMM13) is
+/// C_Other; every other constraint is classified by TargetLowering.
+SparcTargetLowering::ConstraintType
+SparcTargetLowering::getConstraintType(StringRef Constraint) const {
+  // Only single-letter constraints get target-specific treatment.
+  if (Constraint.size() != 1)
+    return TargetLowering::getConstraintType(Constraint);
+
+  const char Letter = Constraint[0];
+  if (Letter == 'r')
+    return C_RegisterClass;
+  if (Letter == 'I') // SIMM13
+    return C_Other;
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Weigh how well the operand described by \p info matches the single
+/// constraint letter that \p constraint points at. 'I' (SIMM13) yields
+/// CW_Constant for a ConstantInt that fits in 13 signed bits and CW_Invalid
+/// otherwise; all other letters are weighed by TargetLowering.
+TargetLowering::ConstraintWeight SparcTargetLowering::
+getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                               const char *constraint) const {
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!info.CallOperandVal)
+    return CW_Default;
+
+  // Anything but 'I' is left to the generic implementation.
+  if (*constraint != 'I')
+    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+
+  // 'I': SIMM13
+  ConstantInt *Imm = dyn_cast<ConstantInt>(info.CallOperandVal);
+  if (Imm && isInt<13>(Imm->getSExtValue()))
+    return CW_Constant;
+  return CW_Invalid;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+///
+/// Only the 'I' (SIMM13) constraint is handled here: a constant operand that
+/// fits in 13 signed bits is pushed as a target constant, and a constant that
+/// does not fit adds nothing. Everything else is passed on to TargetLowering.
+void SparcTargetLowering::
+LowerAsmOperandForConstraint(SDValue Op,
+                             std::string &Constraint,
+                             std::vector<SDValue> &Ops,
+                             SelectionDAG &DAG) const {
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1)
+    return;
+
+  if (Constraint[0] == 'I') {
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      // An out-of-range constant is invalid for 'I': leave Ops untouched.
+      if (!isInt<13>(C->getSExtValue()))
+        return;
+
+      Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
+                                          Op.getValueType()));
+      return;
+    }
+    // A non-constant operand falls through to the generic handling.
+  }
+
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+/// Resolve an inline-asm register constraint.
+///
+/// The single letter 'r' selects IntPairRegClass for v2i32 and IntRegsRegClass
+/// for every other type. A braced name of the form {r<N>} with N <= 31 is
+/// rewritten to the corresponding g/o/l/i register name before being resolved
+/// by TargetLowering. All remaining constraints go to TargetLowering as-is.
+std::pair<unsigned, const TargetRegisterClass *>
+SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                  StringRef Constraint,
+                                                  MVT VT) const {
+  const size_t Len = Constraint.size();
+
+  if (Len == 1) {
+    if (Constraint[0] == 'r') {
+      if (VT == MVT::v2i32)
+        return std::make_pair(0U, &SP::IntPairRegClass);
+      return std::make_pair(0U, &SP::IntRegsRegClass);
+    }
+  } else if (Len != 0 && Len <= 5 && Constraint[0] == '{' &&
+             Constraint[Len - 1] == '}') {
+    // constraint = '{r<d>}'
+    // Strip the surrounding braces to get the bare register name.
+    StringRef RegName = Constraint.substr(1, Len - 2);
+
+    // Handle register aliases:
+    //       r0-r7   -> g0-g7
+    //       r8-r15  -> o0-o7
+    //       r16-r23 -> l0-l7
+    //       r24-r31 -> i0-i7
+    uint64_t RegNo = 0;
+    if (RegName.startswith("r") &&
+        !RegName.substr(1).getAsInteger(10, RegNo) && RegNo <= 31) {
+      const char BankLetters[] = { 'g', 'o', 'l', 'i' };
+      std::string Alias;
+      Alias += '{';
+      Alias += BankLetters[RegNo / 8];
+      Alias += static_cast<char>('0' + (RegNo % 8));
+      Alias += '}';
+      return TargetLowering::getRegForInlineAsmConstraint(TRI, Alias, VT);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// Report whether an offset may be folded into the given global address node.
+/// This implementation ignores \p GA and always answers no.
+bool
+SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The Sparc target isn't yet aware of offsets.
+ return false;
+}
+
+/// Custom type legalization for node \p N. Replacement values are appended to
+/// \p Results; leaving \p Results untouched means no custom lowering was
+/// done here.
+///
+/// Handled nodes:
+///   - FP_TO_SINT / FP_TO_UINT from f128 to i64: lowered through LowerF128Op
+///     with the matching FPTO[SU]INT_F128_I64 libcall.
+///   - SINT_TO_FP / UINT_TO_FP from i64 to f128: lowered through LowerF128Op
+///     with the matching [SU]INTTOFP_I64_F128 libcall.
+///   - LOAD of an i64 value with i64 memory type: rewritten as a v2i32 load
+///     followed by a bitcast to i64; the load's chain is pushed as the
+///     second result.
+/// Any other opcode is a programming error.
+void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>& Results,
+                                             SelectionDAG &DAG) const {
+
+  SDLoc dl(N);
+
+  RTLIB::Libcall libCall = RTLIB::UNKNOWN_LIBCALL;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    // Custom lower only the f128 -> i64 conversion (both types must match).
+    if (N->getOperand(0).getValueType() != MVT::f128
+        || N->getValueType(0) != MVT::i64)
+      return;
+    libCall = ((N->getOpcode() == ISD::FP_TO_SINT)
+               ? RTLIB::FPTOSINT_F128_I64
+               : RTLIB::FPTOUINT_F128_I64);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    // Custom lower only the i64 -> f128 conversion (both types must match).
+    if (N->getValueType(0) != MVT::f128
+        || N->getOperand(0).getValueType() != MVT::i64)
+      return;
+
+    libCall = ((N->getOpcode() == ISD::SINT_TO_FP)
+               ? RTLIB::SINTTOFP_I64_F128
+               : RTLIB::UINTTOFP_I64_F128);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+  case ISD::LOAD: {
+    LoadSDNode *Ld = cast<LoadSDNode>(N);
+    // Custom handling only for i64: turn i64 load into a v2i32 load,
+    // and a bitcast.
+    if (Ld->getValueType(0) != MVT::i64 || Ld->getMemoryVT() != MVT::i64)
+      return;
+
+    // Uses the function-level dl; a second SDLoc(N) here would only shadow it.
+    SDValue LoadRes = DAG.getExtLoad(
+        Ld->getExtensionType(), dl, MVT::v2i32, Ld->getChain(),
+        Ld->getBasePtr(), Ld->getPointerInfo(), MVT::v2i32, Ld->getAlignment(),
+        Ld->getMemOperand()->getFlags(), Ld->getAAInfo());
+
+    SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes);
+    Results.push_back(Res);
+    // Result 1 of the new load replaces the original load's second result.
+    Results.push_back(LoadRes.getValue(1));
+    return;
+  }
+  }
+}
+
+// Override to enable LOAD_STACK_GUARD lowering on Linux: always true there,
+// otherwise whatever the TargetLowering default says.
+bool SparcTargetLowering::useLoadStackGuardNode() const {
+  return Subtarget->isTargetLinux() ||
+         TargetLowering::useLoadStackGuardNode();
+}
+
+// Override to disable global variable loading on Linux: nothing is inserted
+// there, while every other target gets the TargetLowering default.
+void SparcTargetLowering::insertSSPDeclarations(Module &M) const {
+  if (Subtarget->isTargetLinux())
+    return;
+
+  TargetLowering::insertSSPDeclarations(M);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
new file mode 100644
index 000000000000..e0a421b83712
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -0,0 +1,223 @@
+//===-- SparcISelLowering.h - Sparc DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCISELLOWERING_H
+#define LLVM_LIB_TARGET_SPARC_SPARCISELLOWERING_H
+
+#include "Sparc.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+ class SparcSubtarget;
+
+ // Sparc-specific SelectionDAG node opcodes. Numbering starts at
+ // ISD::BUILTIN_OP_END (FIRST_NUMBER), so the first real opcode, CMPICC,
+ // takes the value right after it.
+ namespace SPISD {
+ enum NodeType : unsigned {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ CMPICC, // Compare two GPR operands, set icc+xcc.
+ CMPFCC, // Compare two FP operands, set fcc.
+ BRICC, // Branch to dest on icc condition
+ BRXCC, // Branch to dest on xcc condition (64-bit only).
+ BRFCC, // Branch to dest on fcc condition
+ SELECT_ICC, // Select between two values using the current ICC flags.
+ SELECT_XCC, // Select between two values using the current XCC flags.
+ SELECT_FCC, // Select between two values using the current FCC flags.
+
+ EH_SJLJ_SETJMP, // builtin setjmp operation
+ EH_SJLJ_LONGJMP, // builtin longjmp operation
+
+ Hi, Lo, // Hi/Lo operations, typically on a global address.
+
+ FTOI, // FP to Int within a FP register.
+ ITOF, // Int to FP within a FP register.
+ FTOX, // FP to Int64 within a FP register.
+ XTOF, // Int64 to FP within a FP register.
+
+ CALL, // A call instruction.
+ RET_FLAG, // Return with a flag operand.
+ GLOBAL_BASE_REG, // Global base reg for PIC.
+ FLUSHW, // FLUSH register windows to stack.
+
+ TLS_ADD, // For Thread Local Storage (TLS).
+ TLS_LD,
+ TLS_CALL
+ };
+ }
+
+ /// Sparc implementation of the TargetLowering interface. It keeps a pointer
+ /// to the SparcSubtarget and declares the TargetLowering overrides together
+ /// with the Sparc-specific lowering helpers they rely on.
+ class SparcTargetLowering : public TargetLowering {
+ const SparcSubtarget *Subtarget;
+ public:
+ SparcTargetLowering(const TargetMachine &TM, const SparcSubtarget &STI);
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ bool useSoftFloat() const override;
+
+ /// computeKnownBitsForTargetNode - Determine which of the bits specified
+ /// in Mask are known to be either zero or one and return them in the
+ /// KnownZero/KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ /// Expand a pseudo instruction that requires custom insertion.
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ // Inline assembly constraint handling.
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ /// Map the "o" memory constraint to InlineAsm::Constraint_o; every other
+ /// constraint code is resolved by TargetLowering.
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+ /// Shift amounts are always i32, whatever the shifted type is.
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
+
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return SP::I0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return SP::I1;
+ }
+
+ /// Override to support customized stack guard loading.
+ bool useLoadStackGuardNode() const override;
+ void insertSSPDeclarations(Module &M) const override;
+
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ // Formal-argument lowering. NOTE(review): the _32/_64 suffixed helpers are
+ // presumably the 32-bit and 64-bit variants the override chooses between;
+ // their definitions are not in this header -- confirm in the .cpp file.
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerFormalArguments_32(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerFormalArguments_64(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ // Call lowering, with the same override + _32/_64 helper layout.
+ SDValue
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ // Return lowering, with the same override + _32/_64 helper layout.
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+ SDValue LowerReturn_32(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const;
+ SDValue LowerReturn_64(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const;
+
+ // Custom lowering of address-like nodes.
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+
+ // Custom lowering of the builtin setjmp/longjmp nodes.
+ SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI) const ;
+ SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI) const ;
+
+ unsigned getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const;
+ SDValue withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const;
+ SDValue makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
+ SelectionDAG &DAG) const;
+ SDValue makeAddress(SDValue Op, SelectionDAG &DAG) const;
+
+ // Helpers for f128 operations. LowerF128Op takes the name of the library
+ // function to call and the number of arguments of the operation.
+ SDValue LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args, SDValue Arg,
+ const SDLoc &DL, SelectionDAG &DAG) const;
+ SDValue LowerF128Op(SDValue Op, SelectionDAG &DAG,
+ const char *LibFuncName,
+ unsigned numArgs) const;
+ SDValue LowerF128Compare(SDValue LHS, SDValue RHS, unsigned &SPCC,
+ const SDLoc &DL, SelectionDAG &DAG) const;
+
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+
+ bool ShouldShrinkFPConstant(EVT VT) const override {
+ // Do not shrink FP constpool if VT == MVT::f128.
+ // (ldd, call _Q_fdtoq) is more expensive than two ldds.
+ return VT != MVT::f128;
+ }
+
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ // FIXME: We insert fences for each atomics and generate
+ // sub-optimal code for PSO/TSO. (Approximately nobody uses any
+ // mode but TSO, which makes this even more silly)
+ return true;
+ }
+
+ AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+ void ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>& Results,
+ SelectionDAG &DAG) const override;
+
+ // Expansion helpers for the pseudo instructions routed through
+ // EmitInstrWithCustomInserter.
+ MachineBasicBlock *expandSelectCC(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned BROpcode) const;
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+ };
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPARC_SPARCISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td
new file mode 100644
index 000000000000..f6518c936ebc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td
@@ -0,0 +1,541 @@
+//===-- SparcInstr64Bit.td - 64-bit instructions for Sparc Target ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction definitions and patterns needed for 64-bit
+// code generation on SPARC v9.
+//
+// Some SPARC v9 instructions are defined in SparcInstrInfo.td because they can
+// also be used in 32-bit code running on a SPARC v9 CPU.
+//
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+// The same integer registers are used for i32 and i64 values.
+// When registers hold i32 values, the high bits are don't care.
+// This gives us free trunc and anyext.
+def : Pat<(i64 (anyext i32:$val)), (COPY_TO_REGCLASS $val, I64Regs)>;
+def : Pat<(i32 (trunc i64:$val)), (COPY_TO_REGCLASS $val, IntRegs)>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Shift Instructions.
+//===----------------------------------------------------------------------===//
+//
+// The 32-bit shift instructions are still available. The left shift sll
+// instruction shifts all 64 bits, but it only accepts a 5-bit shift amount.
+//
+// The srl instructions only shift the low 32 bits and clear the high 32 bits.
+// Finally, sra shifts the low 32 bits and sign-extends to 64 bits.
+
+let Predicates = [Is64Bit] in {
+
+def : Pat<(i64 (zext i32:$val)), (SRLri $val, 0)>;
+def : Pat<(i64 (sext i32:$val)), (SRAri $val, 0)>;
+
+def : Pat<(i64 (and i64:$val, 0xffffffff)), (SRLri $val, 0)>;
+def : Pat<(i64 (sext_inreg i64:$val, i32)), (SRAri $val, 0)>;
+
+defm SLLX : F3_S<"sllx", 0b100101, 1, shl, i64, I64Regs>;
+defm SRLX : F3_S<"srlx", 0b100110, 1, srl, i64, I64Regs>;
+defm SRAX : F3_S<"srax", 0b100111, 1, sra, i64, I64Regs>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Immediates.
+//===----------------------------------------------------------------------===//
+//
+// All 32-bit immediates can be materialized with sethi+or, but 64-bit
+// immediates may require more code. There may be a point where it is
+// preferable to use a constant pool load instead, depending on the
+// microarchitecture.
+
+// Single-instruction patterns.
+
+// The ALU instructions want their simm13 operands as i32 immediates.
+def as_i32imm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+def : Pat<(i64 simm13:$val), (ORri (i64 G0), (as_i32imm $val))>;
+def : Pat<(i64 SETHIimm:$val), (SETHIi (HI22 $val))>;
+
+// Double-instruction patterns.
+
+// All unsigned i32 immediates can be handled by sethi+or.
+def uimm32 : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>;
+def : Pat<(i64 uimm32:$val), (ORri (SETHIi (HI22 $val)), (LO10 $val))>,
+ Requires<[Is64Bit]>;
+
+// All negative i33 immediates can be handled by sethi+xor.
+def nimm33 : PatLeaf<(imm), [{
+ int64_t Imm = N->getSExtValue();
+ return Imm < 0 && isInt<33>(Imm);
+}]>;
+// Bits 10-31 inverted. Same as assembler's %hix.
+def HIX22 : SDNodeXForm<imm, [{
+ uint64_t Val = (~N->getZExtValue() >> 10) & ((1u << 22) - 1);
+ return CurDAG->getTargetConstant(Val, SDLoc(N), MVT::i32);
+}]>;
+// Bits 0-9 with ones in bits 10-31. Same as assembler's %lox.
+def LOX10 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(~(~N->getZExtValue() & 0x3ff), SDLoc(N),
+ MVT::i32);
+}]>;
+def : Pat<(i64 nimm33:$val), (XORri (SETHIi (HIX22 $val)), (LOX10 $val))>,
+ Requires<[Is64Bit]>;
+
+// More possible patterns:
+//
+// (sllx sethi, n)
+// (sllx simm13, n)
+//
+// 3 instrs:
+//
+// (xor (sllx sethi), simm13)
+// (sllx (xor sethi, simm13))
+//
+// 4 instrs:
+//
+// (or sethi, (sllx sethi))
+// (xnor sethi, (sllx sethi))
+//
+// 5 instrs:
+//
+// (or (sllx sethi), (or sethi, simm13))
+// (xnor (sllx sethi), (or sethi, simm13))
+// (or (sllx sethi), (sllx sethi))
+// (xnor (sllx sethi), (sllx sethi))
+//
+// Worst case is 6 instrs:
+//
+// (or (sllx (or sethi, simm13)), (or sethi, simm13))
+
+// Bits 42-63, same as assembler's %hh.
+def HH22 : SDNodeXForm<imm, [{
+ uint64_t Val = (N->getZExtValue() >> 42) & ((1u << 22) - 1);
+ return CurDAG->getTargetConstant(Val, SDLoc(N), MVT::i32);
+}]>;
+// Bits 32-41, same as assembler's %hm.
+def HM10 : SDNodeXForm<imm, [{
+ uint64_t Val = (N->getZExtValue() >> 32) & ((1u << 10) - 1);
+ return CurDAG->getTargetConstant(Val, SDLoc(N), MVT::i32);
+}]>;
+def : Pat<(i64 imm:$val),
+ (ORrr (SLLXri (ORri (SETHIi (HH22 $val)), (HM10 $val)), (i32 32)),
+ (ORri (SETHIi (HI22 $val)), (LO10 $val)))>,
+ Requires<[Is64Bit]>;
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Integer Arithmetic and Logic.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+// Register-register instructions.
+let isCodeGenOnly = 1 in {
+defm ANDX : F3_12<"and", 0b000001, and, I64Regs, i64, i64imm>;
+defm ORX : F3_12<"or", 0b000010, or, I64Regs, i64, i64imm>;
+defm XORX : F3_12<"xor", 0b000011, xor, I64Regs, i64, i64imm>;
+
+def ANDXNrr : F3_1<2, 0b000101,
+ (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+ "andn $b, $c, $dst",
+ [(set i64:$dst, (and i64:$b, (not i64:$c)))]>;
+def ORXNrr : F3_1<2, 0b000110,
+ (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+ "orn $b, $c, $dst",
+ [(set i64:$dst, (or i64:$b, (not i64:$c)))]>;
+def XNORXrr : F3_1<2, 0b000111,
+ (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+ "xnor $b, $c, $dst",
+ [(set i64:$dst, (not (xor i64:$b, i64:$c)))]>;
+
+defm ADDX : F3_12<"add", 0b000000, add, I64Regs, i64, i64imm>;
+defm SUBX : F3_12<"sub", 0b000100, sub, I64Regs, i64, i64imm>;
+
+def TLS_ADDXrr : F3_1<2, 0b000000, (outs I64Regs:$rd),
+ (ins I64Regs:$rs1, I64Regs:$rs2, TLSSym:$sym),
+ "add $rs1, $rs2, $rd, $sym",
+ [(set i64:$rd,
+ (tlsadd i64:$rs1, i64:$rs2, tglobaltlsaddr:$sym))]>;
+
+// "LEA" form of add
+def LEAX_ADDri : F3_2<2, 0b000000,
+ (outs I64Regs:$dst), (ins MEMri:$addr),
+ "add ${addr:arith}, $dst",
+ [(set iPTR:$dst, ADDRri:$addr)]>;
+}
+
+def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
+def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
+def : Pat<(ctpop i64:$src), (POPCrr $src)>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Integer Multiply and Divide.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+def MULXrr : F3_1<2, 0b001001,
+ (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
+ "mulx $rs1, $rs2, $rd",
+ [(set i64:$rd, (mul i64:$rs1, i64:$rs2))]>;
+def MULXri : F3_2<2, 0b001001,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+ "mulx $rs1, $simm13, $rd",
+ [(set i64:$rd, (mul i64:$rs1, (i64 simm13:$simm13)))]>;
+
+// Division can trap.
+let hasSideEffects = 1 in {
+def SDIVXrr : F3_1<2, 0b101101,
+ (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
+ "sdivx $rs1, $rs2, $rd",
+ [(set i64:$rd, (sdiv i64:$rs1, i64:$rs2))]>;
+def SDIVXri : F3_2<2, 0b101101,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+ "sdivx $rs1, $simm13, $rd",
+ [(set i64:$rd, (sdiv i64:$rs1, (i64 simm13:$simm13)))]>;
+
+def UDIVXrr : F3_1<2, 0b001101,
+ (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
+ "udivx $rs1, $rs2, $rd",
+ [(set i64:$rd, (udiv i64:$rs1, i64:$rs2))]>;
+def UDIVXri : F3_2<2, 0b001101,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+ "udivx $rs1, $simm13, $rd",
+ [(set i64:$rd, (udiv i64:$rs1, (i64 simm13:$simm13)))]>;
+} // hasSideEffects = 1
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Loads and Stores.
+//===----------------------------------------------------------------------===//
+//
+// All the 32-bit loads and stores are available. The extending loads are sign
+// or zero-extending to 64 bits. The LDrr and LDri instructions load 32 bits
+// zero-extended to i64. Their mnemonic is lduw in SPARC v9 (Load Unsigned
+// Word).
+//
+// SPARC v9 adds 64-bit loads as well as a sign-extending i32 load (ldsw).
+
+let Predicates = [Is64Bit] in {
+
+// 64-bit loads.
+let DecoderMethod = "DecodeLoadInt" in
+ defm LDX : Load<"ldx", 0b001011, load, I64Regs, i64>;
+
+let mayLoad = 1, isCodeGenOnly = 1, isAsmParserOnly = 1 in
+ def TLS_LDXrr : F3_1<3, 0b001011,
+ (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+ "ldx [$addr], $dst, $sym",
+ [(set i64:$dst,
+ (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
+
+// Extending loads to i64.
+def : Pat<(i64 (zextloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi8 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi8 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi8 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi8 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (sextloadi8 ADDRrr:$addr)), (LDSBrr ADDRrr:$addr)>;
+def : Pat<(i64 (sextloadi8 ADDRri:$addr)), (LDSBri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi16 ADDRrr:$addr)), (LDUHrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi16 ADDRri:$addr)), (LDUHri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi16 ADDRrr:$addr)), (LDUHrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi16 ADDRri:$addr)), (LDUHri ADDRri:$addr)>;
+def : Pat<(i64 (sextloadi16 ADDRrr:$addr)), (LDSHrr ADDRrr:$addr)>;
+def : Pat<(i64 (sextloadi16 ADDRri:$addr)), (LDSHri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi32 ADDRrr:$addr)), (LDrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi32 ADDRri:$addr)), (LDri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi32 ADDRrr:$addr)), (LDrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi32 ADDRri:$addr)), (LDri ADDRri:$addr)>;
+
+// Sign-extending load of i32 into i64 is a new SPARC v9 instruction.
+let DecoderMethod = "DecodeLoadInt" in
+ defm LDSW : Load<"ldsw", 0b001000, sextloadi32, I64Regs, i64>;
+
+// 64-bit stores.
+let DecoderMethod = "DecodeStoreInt" in
+ defm STX : Store<"stx", 0b001110, store, I64Regs, i64>;
+
+// Truncating stores from i64 are identical to the i32 stores.
+def : Pat<(truncstorei8 i64:$src, ADDRrr:$addr), (STBrr ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei8 i64:$src, ADDRri:$addr), (STBri ADDRri:$addr, $src)>;
+def : Pat<(truncstorei16 i64:$src, ADDRrr:$addr), (STHrr ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei16 i64:$src, ADDRri:$addr), (STHri ADDRri:$addr, $src)>;
+def : Pat<(truncstorei32 i64:$src, ADDRrr:$addr), (STrr ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei32 i64:$src, ADDRri:$addr), (STri ADDRri:$addr, $src)>;
+
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i64 0), ADDRrr:$dst), (STXrr ADDRrr:$dst, (i64 G0))>;
+def : Pat<(store (i64 0), ADDRri:$dst), (STXri ADDRri:$dst, (i64 G0))>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Conditionals.
+//===----------------------------------------------------------------------===//
+
+//
+// Flag-setting instructions like subcc and addcc set both icc and xcc flags.
+// The icc flags correspond to the 32-bit result, and the xcc are for the
+// full 64-bit result.
+//
+// We reuse CMPICC SDNodes for compares, but use new BRXCC branch nodes for
+// 64-bit compares. See LowerBR_CC.
+
+let Predicates = [Is64Bit] in {
+
+let Uses = [ICC], cc = 0b10 in
+ defm BPX : IPredBranch<"%xcc", [(SPbrxcc bb:$imm19, imm:$cond)]>;
+
+// Conditional moves on %xcc.
+let Uses = [ICC], Constraints = "$f = $rd" in {
+let intcc = 1, cc = 0b10 in {
+def MOVXCCrr : F4_1<0b101100, (outs IntRegs:$rd),
+ (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+ "mov$cond %xcc, $rs2, $rd",
+ [(set i32:$rd,
+ (SPselectxcc i32:$rs2, i32:$f, imm:$cond))]>;
+def MOVXCCri : F4_2<0b101100, (outs IntRegs:$rd),
+ (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+ "mov$cond %xcc, $simm11, $rd",
+ [(set i32:$rd,
+ (SPselectxcc simm11:$simm11, i32:$f, imm:$cond))]>;
+} // cc
+
+let intcc = 1, opf_cc = 0b10 in {
+def FMOVS_XCC : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+ (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+ "fmovs$cond %xcc, $rs2, $rd",
+ [(set f32:$rd,
+ (SPselectxcc f32:$rs2, f32:$f, imm:$cond))]>;
+def FMOVD_XCC : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+ (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+ "fmovd$cond %xcc, $rs2, $rd",
+ [(set f64:$rd,
+ (SPselectxcc f64:$rs2, f64:$f, imm:$cond))]>;
+def FMOVQ_XCC : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+ (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+ "fmovq$cond %xcc, $rs2, $rd",
+ [(set f128:$rd,
+ (SPselectxcc f128:$rs2, f128:$f, imm:$cond))]>;
+} // opf_cc
+} // Uses, Constraints
+
+// Branch on integer register with prediction (BPr): one def per annul (,a) / predict-not-taken (,pn) combination.
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in
+multiclass BranchOnReg<bits<3> cond, string OpcStr> {
+  def napt : F2_4<cond, 0, 1, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+                  OpcStr # " $rs1, $imm16", []>;
+  def apt  : F2_4<cond, 1, 1, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+                  OpcStr # ",a $rs1, $imm16", []>;
+  def napn : F2_4<cond, 0, 0, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+                  OpcStr # ",pn $rs1, $imm16", []>;
+  def apn  : F2_4<cond, 1, 0, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+                  OpcStr # ",a,pn $rs1, $imm16", []>;
+}
+
+// Accept an explicit ",pt" suffix as another spelling of the given BPr defs (emit flag 0).
+multiclass bpr_alias<string OpcStr, Instruction NAPT, Instruction APT> {
+  def : InstAlias<OpcStr # ",pt $rs1, $imm16",
+                  (NAPT I64Regs:$rs1, bprtarget16:$imm16), 0>;
+  def : InstAlias<OpcStr # ",a,pt $rs1, $imm16",
+                  (APT I64Regs:$rs1, bprtarget16:$imm16), 0>;
+}
+
+defm BPZ : BranchOnReg<0b001, "brz">;
+defm BPLEZ : BranchOnReg<0b010, "brlez">;
+defm BPLZ : BranchOnReg<0b011, "brlz">;
+defm BPNZ : BranchOnReg<0b101, "brnz">;
+defm BPGZ : BranchOnReg<0b110, "brgz">;
+defm BPGEZ : BranchOnReg<0b111, "brgez">;
+
+defm : bpr_alias<"brz", BPZnapt, BPZapt >;
+defm : bpr_alias<"brlez", BPLEZnapt, BPLEZapt>;
+defm : bpr_alias<"brlz", BPLZnapt, BPLZapt >;
+defm : bpr_alias<"brnz", BPNZnapt, BPNZapt >;
+defm : bpr_alias<"brgz", BPGZnapt, BPGZapt >;
+defm : bpr_alias<"brgez", BPGEZnapt, BPGEZapt>;
+
+// MOVr: move an integer register (rr) or a simm10 immediate (ri) into $rd, conditional on register $rs1.
+multiclass MOVR< bits<3> rcond, string OpcStr> {
+  def rr : F4_4r<0b101111, 0b00000, rcond,
+                 (outs I64Regs:$rd), (ins I64Regs:$rs1, IntRegs:$rs2),
+                 OpcStr # " $rs1, $rs2, $rd", []>;
+
+  def ri : F4_4i<0b101111, rcond,
+                 (outs I64Regs:$rd), (ins I64Regs:$rs1, i64imm:$simm10),
+                 OpcStr # " $rs1, $simm10, $rd", []>;
+}
+
+defm MOVRRZ : MOVR<0b001, "movrz">;
+defm MOVRLEZ : MOVR<0b010, "movrlez">;
+defm MOVRLZ : MOVR<0b011, "movrlz">;
+defm MOVRNZ : MOVR<0b101, "movrnz">;
+defm MOVRGZ : MOVR<0b110, "movrgz">;
+defm MOVRGEZ : MOVR<0b111, "movrgez">;
+
+// Move FP register on integer register condition (FMOVr). The S, D and Q defs
+// move single, double and quad FP registers, so each uses its own register class.
+multiclass FMOVR<bits<3> rcond, string OpcStr> {
+  def S : F4_4r<0b110101, 0b00101, rcond,
+                (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+                !strconcat("fmovrs", OpcStr, " $rs1, $rs2, $rd"), []>;
+  // fmovrd operates on double-precision registers: DFPRegs, not FPRegs.
+  def D : F4_4r<0b110101, 0b00110, rcond,
+                (outs DFPRegs:$rd), (ins I64Regs:$rs1, DFPRegs:$rs2),
+                !strconcat("fmovrd", OpcStr, " $rs1, $rs2, $rd"), []>;
+  // fmovrq operates on quad-precision registers: QFPRegs, not FPRegs.
+  def Q : F4_4r<0b110101, 0b00111, rcond,
+                (outs QFPRegs:$rd), (ins I64Regs:$rs1, QFPRegs:$rs2),
+                !strconcat("fmovrq", OpcStr, " $rs1, $rs2, $rd"), []>,
+                Requires<[HasHardQuad]>;
+}
+
+let Predicates = [HasV9] in {
+ defm FMOVRZ : FMOVR<0b001, "z">;
+ defm FMOVRLEZ : FMOVR<0b010, "lez">;
+ defm FMOVRLZ : FMOVR<0b011, "lz">;
+ defm FMOVRNZ : FMOVR<0b101, "nz">;
+ defm FMOVRGZ : FMOVR<0b110, "gz">;
+ defm FMOVRGEZ : FMOVR<0b111, "gez">;
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit Floating Point Conversions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+def FXTOS : F3_3u<2, 0b110100, 0b010000100, // SPxtof: source in a double FP register, single FP result
+ (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+ "fxtos $rs2, $rd",
+ [(set FPRegs:$rd, (SPxtof DFPRegs:$rs2))]>;
+def FXTOD : F3_3u<2, 0b110100, 0b010001000, // SPxtof: double FP register source and result
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fxtod $rs2, $rd",
+ [(set DFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>;
+def FXTOQ : F3_3u<2, 0b110100, 0b010001100, // SPxtof: double FP register source, quad FP result
+ (outs QFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fxtoq $rs2, $rd",
+ [(set QFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+def FSTOX : F3_3u<2, 0b110100, 0b010000001, // SPftox: single FP source, result in a double FP register
+ (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+ "fstox $rs2, $rd",
+ [(set DFPRegs:$rd, (SPftox FPRegs:$rs2))]>;
+def FDTOX : F3_3u<2, 0b110100, 0b010000010, // SPftox: double FP source, result in a double FP register
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fdtox $rs2, $rd",
+ [(set DFPRegs:$rd, (SPftox DFPRegs:$rs2))]>;
+def FQTOX : F3_3u<2, 0b110100, 0b010000011, // SPftox: quad FP source, result in a double FP register
+ (outs DFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fqtox $rs2, $rd",
+ [(set DFPRegs:$rd, (SPftox QFPRegs:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+} // Predicates = [Is64Bit]
+
+def : Pat<(SPselectxcc i64:$t, i64:$f, imm:$cond),
+ (MOVXCCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselectxcc (i64 simm11:$t), i64:$f, imm:$cond),
+ (MOVXCCri (as_i32imm $t), $f, imm:$cond)>;
+
+def : Pat<(SPselecticc i64:$t, i64:$f, imm:$cond),
+ (MOVICCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselecticc (i64 simm11:$t), i64:$f, imm:$cond),
+ (MOVICCri (as_i32imm $t), $f, imm:$cond)>;
+
+def : Pat<(SPselectfcc i64:$t, i64:$f, imm:$cond),
+ (MOVFCCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselectfcc (i64 simm11:$t), i64:$f, imm:$cond),
+ (MOVFCCri (as_i32imm $t), $f, imm:$cond)>;
+
+} // Predicates = [Is64Bit]
+
+
+// 64 bit SETHI
+let Predicates = [Is64Bit], isCodeGenOnly = 1 in {
+def SETHIXi : F2_1<0b100,
+ (outs IntRegs:$rd), (ins i64imm:$imm22),
+ "sethi $imm22, $rd",
+ [(set i64:$rd, SETHIimm:$imm22)]>;
+}
+
+// Atomics: 64-bit compare-and-swap (casx). $swap is tied to $rd, so the swap operand and the result share a register.
+let Predicates = [Is64Bit], Constraints = "$swap = $rd", asi = 0b10000000 in {
+ def CASXrr: F3_1_asi<3, 0b111110,
+ (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2,
+ I64Regs:$swap),
+ "casx [$rs1], $rs2, $rd", // $swap does not appear in the asm string; it is $rd via the constraint
+ [(set i64:$rd,
+ (atomic_cmp_swap_64 i64:$rs1, i64:$rs2, i64:$swap))]>;
+
+} // Predicates = [Is64Bit], Constraints = ...
+
+let Predicates = [Is64Bit] in {
+
+def : Pat<(atomic_fence imm, imm), (MEMBARi 0xf)>;
+
+// atomic_load_64 addr -> load addr
+def : Pat<(i64 (atomic_load_64 ADDRrr:$src)), (LDXrr ADDRrr:$src)>;
+def : Pat<(i64 (atomic_load_64 ADDRri:$src)), (LDXri ADDRri:$src)>;
+
+// atomic_store_64 val, addr -> store val, addr
+def : Pat<(atomic_store_64 ADDRrr:$dst, i64:$val), (STXrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_64 ADDRri:$dst, i64:$val), (STXri ADDRri:$dst, $val)>;
+
+} // Predicates = [Is64Bit]
+
+let Predicates = [Is64Bit], hasSideEffects = 1, Uses = [ICC], cc = 0b10 in
+ defm TXCC : TRAP<"%xcc">;
+
+// Global addresses, constant pool entries
+let Predicates = [Is64Bit] in {
+
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORXri (i64 G0), tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORXri (i64 G0), tconstpool:$in)>;
+
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORXri (i64 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+ (ADDXri (SETHIXi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+ (XORXri (SETHIXi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
+// Blockaddress
+def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
+def : Pat<(SPlo tblockaddress:$in), (ORXri (i64 G0), tblockaddress:$in)>;
+
+// Add reg, lo. This is used when taking the addr of a global/constpool entry.
+def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDXri $r, tglobaladdr:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)), (ADDXri $r, tconstpool:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
+ (ADDXri $r, tblockaddress:$in)>;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td
new file mode 100644
index 000000000000..df570cea8da8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td
@@ -0,0 +1,506 @@
+//===-- SparcInstrAliases.td - Instruction Aliases for Sparc Target -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction aliases for Sparc.
+//===----------------------------------------------------------------------===//
+
+// Instruction aliases for conditional moves.
+
+// Conditional-move aliases on the integer condition codes.
+// For one condition (mnemonic suffix cond, encoding condVal) and one
+// condition-code register spelling (ccreg, passed as " %icc" or " %xcc"),
+// define the mov (register and simm11 source), fmovs and fmovd spellings on
+// top of the supplied instructions, with the condition operand fixed.
+multiclass intcond_mov_alias<string cond, int condVal, string ccreg,
+                             Instruction movrr, Instruction movri,
+                             Instruction fmovs, Instruction fmovd> {
+
+  // Integer move, register source: mov<cond> (%icc|%xcc), rs2, rd
+  def : InstAlias<!strconcat("mov", cond, ccreg, ", $rs2, $rd"),
+                  (movrr IntRegs:$rd, IntRegs:$rs2, condVal)>;
+
+  // Integer move, immediate source: mov<cond> (%icc|%xcc), simm11, rd
+  def : InstAlias<!strconcat("mov", cond, ccreg, ", $simm11, $rd"),
+                  (movri IntRegs:$rd, i32imm:$simm11, condVal)>;
+
+  // Single-precision FP move: fmovs<cond> (%icc|%xcc), $rs2, $rd
+  def : InstAlias<!strconcat("fmovs", cond, ccreg, ", $rs2, $rd"),
+                  (fmovs FPRegs:$rd, FPRegs:$rs2, condVal)>;
+
+  // Double-precision FP move: fmovd<cond> (%icc|%xcc), $rs2, $rd
+  def : InstAlias<!strconcat("fmovd", cond, ccreg, ", $rs2, $rd"),
+                  (fmovd DFPRegs:$rd, DFPRegs:$rs2, condVal)>;
+}
+
+// Conditional-move aliases on a floating-point condition-code register ($cc).
+multiclass fpcond_mov_alias<string cond, int condVal,
+                            Instruction movrr, Instruction movri,
+                            Instruction fmovs, Instruction fmovd> {
+
+  // Integer move, register source: mov<cond> %fcc[0-3], rs2, rd
+  def : InstAlias<!strconcat("mov", cond, " $cc, $rs2, $rd"),
+                  (movrr IntRegs:$rd, FCCRegs:$cc, IntRegs:$rs2, condVal)>;
+
+  // Integer move, immediate source: mov<cond> %fcc[0-3], simm11, rd
+  def : InstAlias<!strconcat("mov", cond, " $cc, $simm11, $rd"),
+                  (movri IntRegs:$rd, FCCRegs:$cc, i32imm:$simm11, condVal)>;
+
+  // Single-precision FP move: fmovs<cond> %fcc[0-3], $rs2, $rd
+  def : InstAlias<!strconcat("fmovs", cond, " $cc, $rs2, $rd"),
+                  (fmovs FPRegs:$rd, FCCRegs:$cc, FPRegs:$rs2, condVal)>;
+
+  // Double-precision FP move: fmovd<cond> %fcc[0-3], $rs2, $rd
+  def : InstAlias<!strconcat("fmovd", cond, " $cc, $rs2, $rd"),
+                  (fmovd DFPRegs:$rd, FCCRegs:$cc, DFPRegs:$rs2, condVal)>;
+}
+
+// Instruction aliases for integer conditional branches, moves and traps, for one condition (suffix cond, encoding condVal).
+multiclass int_cond_alias<string cond, int condVal> {
+
+ // b<cond> $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), " $imm"),
+ (BCOND brtarget:$imm, condVal)>;
+
+ // b<cond>,a $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a $imm"),
+ (BCONDA brtarget:$imm, condVal)>;
+
+ // b<cond> %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), " %icc, $imm"),
+ (BPICC brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,pt %icc, $imm (same instruction as the form without ,pt)
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",pt %icc, $imm"),
+ (BPICC brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,a %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a %icc, $imm"),
+ (BPICCA brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,a,pt %icc, $imm (same instruction as b<cond>,a %icc)
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pt %icc, $imm"),
+ (BPICCA brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,pn %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",pn %icc, $imm"),
+ (BPICCNT brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,a,pn %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pn %icc, $imm"),
+ (BPICCANT brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond> %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), " %xcc, $imm"),
+ (BPXCC brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,pt %xcc, $imm (same instruction as the form without ,pt)
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",pt %xcc, $imm"),
+ (BPXCC brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,a %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a %xcc, $imm"),
+ (BPXCCA brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,a,pt %xcc, $imm (same instruction as b<cond>,a %xcc)
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pt %xcc, $imm"),
+ (BPXCCA brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,pn %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",pn %xcc, $imm"),
+ (BPXCCNT brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,a,pn %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pn %xcc, $imm"),
+ (BPXCCANT brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+
+ defm : intcond_mov_alias<cond, condVal, " %icc",
+ MOVICCrr, MOVICCri,
+ FMOVS_ICC, FMOVD_ICC>, Requires<[HasV9]>;
+
+ defm : intcond_mov_alias<cond, condVal, " %xcc",
+ MOVXCCrr, MOVXCCri,
+ FMOVS_XCC, FMOVD_XCC>, Requires<[Is64Bit]>;
+
+ // fmovq<cond> (%icc|%xcc), $rs2, $rd
+ def : InstAlias<!strconcat(!strconcat("fmovq", cond), " %icc, $rs2, $rd"),
+ (FMOVQ_ICC QFPRegs:$rd, QFPRegs:$rs2, condVal)>,
+ Requires<[HasV9, HasHardQuad]>;
+ def : InstAlias<!strconcat(!strconcat("fmovq", cond), " %xcc, $rs2, $rd"),
+ (FMOVQ_XCC QFPRegs:$rd, QFPRegs:$rs2, condVal)>,
+ Requires<[Is64Bit, HasHardQuad]>;
+
+ // t<cond> %icc, rs => t<cond> %icc, G0 + rs
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs2"),
+ (TICCrr G0, IntRegs:$rs2, condVal)>,
+ Requires<[HasV9]>;
+ // t<cond> %icc, rs1 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $rs2"),
+ (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+ Requires<[HasV9]>;
+
+
+ // t<cond> %xcc, rs => t<cond> %xcc, G0 + rs
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs2"),
+ (TXCCrr G0, IntRegs:$rs2, condVal)>,
+ Requires<[HasV9]>;
+ // t<cond> %xcc, rs1 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $rs2"),
+ (TXCCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+ Requires<[HasV9]>;
+
+
+ // t<cond> rs => t<cond> %icc, G0 + rs (alias currently commented out)
+ //def : InstAlias<!strconcat(!strconcat("t", cond), " $rs2"),
+ // (TICCrr G0, IntRegs:$rs2, condVal)>,
+ // Requires<[HasV9]>;
+
+ // t<cond> rs1 + rs2 => t<cond> %icc, rs1 + rs2 (alias currently commented out)
+ //def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $rs2"),
+ // (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+ // Requires<[HasV9]>;
+
+ // t<cond> %icc, imm => t<cond> %icc, G0 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $imm"),
+ (TICCri G0, i32imm:$imm, condVal)>,
+ Requires<[HasV9]>;
+ // t<cond> %icc, rs1 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $imm"),
+ (TICCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+ Requires<[HasV9]>;
+ // t<cond> %xcc, imm => t<cond> %xcc, G0 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $imm"),
+ (TXCCri G0, i32imm:$imm, condVal)>,
+ Requires<[HasV9]>;
+ // t<cond> %xcc, rs1 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $imm"),
+ (TXCCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+ Requires<[HasV9]>;
+
+ // t<cond> imm => t<cond> G0 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $imm"),
+ (TRAPri G0, i32imm:$imm, condVal)>;
+
+ // t<cond> rs1 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $imm"),
+ (TRAPri IntRegs:$rs1, i32imm:$imm, condVal)>;
+
+ // t<cond> rs1 => t<cond> G0 + rs1
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1"),
+ (TRAPrr G0, IntRegs:$rs1, condVal)>;
+
+ // t<cond> rs1 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $rs2"),
+ (TRAPrr IntRegs:$rs1, IntRegs:$rs2, condVal)>;
+}
+
+
+// Instruction aliases for floating point conditional branches and moves, for one condition (suffix cond, encoding condVal).
+multiclass fp_cond_alias<string cond, int condVal> {
+
+ // fb<cond> $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), " $imm"),
+ (FBCOND brtarget:$imm, condVal), 0>;
+
+ // fb<cond>,a $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",a $imm"),
+ (FBCONDA brtarget:$imm, condVal), 0>;
+
+ // fb<cond> %fcc[0-3], $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), " $cc, $imm"),
+ (BPFCC brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,pt %fcc[0-3], $imm (same instruction as the form without ,pt)
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",pt $cc, $imm"),
+ (BPFCC brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,a %fcc[0-3], $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",a $cc, $imm"),
+ (BPFCCA brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,a,pt %fcc[0-3], $imm (same instruction as fb<cond>,a)
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",a,pt $cc, $imm"),
+ (BPFCCA brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,pn %fcc[0-3], $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",pn $cc, $imm"),
+ (BPFCCNT brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,a,pn %fcc[0-3], $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",a,pn $cc, $imm"),
+ (BPFCCANT brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ defm : fpcond_mov_alias<cond, condVal,
+ V9MOVFCCrr, V9MOVFCCri,
+ V9FMOVS_FCC, V9FMOVD_FCC>, Requires<[HasV9]>;
+
+ // fmovq<cond> %fcc[0-3], $rs2, $rd
+ def : InstAlias<!strconcat(!strconcat("fmovq", cond), " $cc, $rs2, $rd"),
+ (V9FMOVQ_FCC QFPRegs:$rd, FCCRegs:$cc, QFPRegs:$rs2,
+ condVal)>,
+ Requires<[HasV9, HasHardQuad]>;
+}
+
+
+// Co-processor conditional branch aliases for one condition (suffix cond, encoding condVal).
+multiclass cp_cond_alias<string cond, int condVal> {
+
+  // Plain form: cb<cond> $imm
+  def : InstAlias<!strconcat("cb", cond, " $imm"),
+                  (CBCOND brtarget:$imm, condVal), 0>;
+
+  // Annulled form: cb<cond>,a $imm
+  def : InstAlias<!strconcat("cb", cond, ",a $imm"),
+                  (CBCONDA brtarget:$imm, condVal), 0>;
+}
+
+defm : int_cond_alias<"a", 0b1000>;
+defm : int_cond_alias<"n", 0b0000>;
+defm : int_cond_alias<"ne", 0b1001>;
+defm : int_cond_alias<"e", 0b0001>;
+defm : int_cond_alias<"g", 0b1010>;
+defm : int_cond_alias<"le", 0b0010>;
+defm : int_cond_alias<"ge", 0b1011>;
+defm : int_cond_alias<"l", 0b0011>;
+defm : int_cond_alias<"gu", 0b1100>;
+defm : int_cond_alias<"leu", 0b0100>;
+defm : int_cond_alias<"cc", 0b1101>;
+defm : int_cond_alias<"cs", 0b0101>;
+defm : int_cond_alias<"pos", 0b1110>;
+defm : int_cond_alias<"neg", 0b0110>;
+defm : int_cond_alias<"vc", 0b1111>;
+defm : int_cond_alias<"vs", 0b0111>;
+let EmitPriority = 0 in
+{
+ defm : int_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
+ defm : int_cond_alias<"nz", 0b1001>; // same as ne
+ defm : int_cond_alias<"eq", 0b0001>; // same as e
+ defm : int_cond_alias<"z", 0b0001>; // same as e
+ defm : int_cond_alias<"geu", 0b1101>; // same as cc
+ defm : int_cond_alias<"lu", 0b0101>; // same as cs
+}
+defm : fp_cond_alias<"a", 0b1000>;
+defm : fp_cond_alias<"n", 0b0000>;
+defm : fp_cond_alias<"u", 0b0111>;
+defm : fp_cond_alias<"g", 0b0110>;
+defm : fp_cond_alias<"ug", 0b0101>;
+defm : fp_cond_alias<"l", 0b0100>;
+defm : fp_cond_alias<"ul", 0b0011>;
+defm : fp_cond_alias<"lg", 0b0010>;
+defm : fp_cond_alias<"ne", 0b0001>;
+defm : fp_cond_alias<"e", 0b1001>;
+defm : fp_cond_alias<"ue", 0b1010>;
+defm : fp_cond_alias<"ge", 0b1011>;
+defm : fp_cond_alias<"uge", 0b1100>;
+defm : fp_cond_alias<"le", 0b1101>;
+defm : fp_cond_alias<"ule", 0b1110>;
+defm : fp_cond_alias<"o", 0b1111>;
+let EmitPriority = 0 in
+{
+ defm : fp_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
+ defm : fp_cond_alias<"nz", 0b0001>; // same as ne
+ defm : fp_cond_alias<"z", 0b1001>; // same as e
+}
+
+defm : cp_cond_alias<"a", 0b1000>;
+defm : cp_cond_alias<"n", 0b0000>;
+defm : cp_cond_alias<"3", 0b0111>;
+defm : cp_cond_alias<"2", 0b0110>;
+defm : cp_cond_alias<"23", 0b0101>;
+defm : cp_cond_alias<"1", 0b0100>;
+defm : cp_cond_alias<"13", 0b0011>;
+defm : cp_cond_alias<"12", 0b0010>;
+defm : cp_cond_alias<"123", 0b0001>;
+defm : cp_cond_alias<"0", 0b1001>;
+defm : cp_cond_alias<"03", 0b1010>;
+defm : cp_cond_alias<"02", 0b1011>;
+defm : cp_cond_alias<"023", 0b1100>;
+defm : cp_cond_alias<"01", 0b1101>;
+defm : cp_cond_alias<"013", 0b1110>;
+defm : cp_cond_alias<"012", 0b1111>;
+let EmitPriority = 0 in defm : cp_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
+
+// Section A.3 Synthetic Instructions
+
+// Most are marked as Emit=0, so that they are not used for disassembly. This is
+// an aesthetic issue, but the chosen policy is to typically prefer using the
+// non-alias form, except for the most obvious and clarifying aliases: cmp, jmp,
+// call, tst, ret, retl.
+
+// Note: cmp is handled in SparcInstrInfo.
+// jmp/call/ret/retl have special case handling for output in
+// SparcInstPrinter.cpp
+
+// jmp addr -> jmpl addr, %g0
+def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr), 0>;
+def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr), 0>;
+
+// call addr -> jmpl addr, %o7
+def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr), 0>;
+def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr), 0>;
+
+// tst reg -> orcc %g0, reg, %g0
+def : InstAlias<"tst $rs2", (ORCCrr G0, IntRegs:$rs2, G0)>;
+
+// ret -> jmpl %i7+8, %g0 (aka RET 8)
+def : InstAlias<"ret", (RET 8)>;
+
+// retl -> jmpl %o7+8, %g0 (aka RETL 8)
+def : InstAlias<"retl", (RETL 8)>;
+
+// restore -> restore %g0, %g0, %g0
+def : InstAlias<"restore", (RESTORErr G0, G0, G0)>;
+
+// save -> save %g0, %g0, %g0
+def : InstAlias<"save", (SAVErr G0, G0, G0)>;
+
+// set value, rd
+// (turns into a sequence of sethi+or, depending on the value)
+// def : InstAlias<"set $val, $rd", (ORri IntRegs:$rd, (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
+def SET : AsmPseudoInst<(outs IntRegs:$rd), (ins i32imm:$val), "set $val, $rd">;
+
+// not rd -> xnor rd, %g0, rd
+def : InstAlias<"not $rd", (XNORrr IntRegs:$rd, IntRegs:$rd, G0), 0>;
+
+// not reg, rd -> xnor reg, %g0, rd
+def : InstAlias<"not $rs1, $rd", (XNORrr IntRegs:$rd, IntRegs:$rs1, G0), 0>;
+
+// neg rd -> sub %g0, rd, rd
+def : InstAlias<"neg $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rd), 0>;
+
+// neg reg, rd -> sub %g0, reg, rd
+def : InstAlias<"neg $rs2, $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rs2), 0>;
+
+// inc rd -> add rd, 1, rd
+def : InstAlias<"inc $rd", (ADDri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// inc simm13, rd -> add rd, simm13, rd
+def : InstAlias<"inc $simm13, $rd", (ADDri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// inccc rd -> addcc rd, 1, rd
+def : InstAlias<"inccc $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// inccc simm13, rd -> addcc rd, simm13, rd
+def : InstAlias<"inccc $simm13, $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// dec rd -> sub rd, 1, rd
+def : InstAlias<"dec $rd", (SUBri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// dec simm13, rd -> sub rd, simm13, rd
+def : InstAlias<"dec $simm13, $rd", (SUBri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// deccc rd -> subcc rd, 1, rd
+def : InstAlias<"deccc $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// deccc simm13, rd -> subcc rd, simm13, rd
+def : InstAlias<"deccc $simm13, $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// btst reg_or_imm, reg -> andcc reg,reg_or_imm,%g0
+def : InstAlias<"btst $rs2, $rs1", (ANDCCrr G0, IntRegs:$rs1, IntRegs:$rs2), 0>;
+def : InstAlias<"btst $simm13, $rs1", (ANDCCri G0, IntRegs:$rs1, i32imm:$simm13), 0>;
+
+// bset reg_or_imm, rd -> or rd,reg_or_imm,rd
+def : InstAlias<"bset $rs2, $rd", (ORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>;
+def : InstAlias<"bset $simm13, $rd", (ORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// bclr reg_or_imm, rd -> andn rd,reg_or_imm,rd
+def : InstAlias<"bclr $rs2, $rd", (ANDNrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>;
+def : InstAlias<"bclr $simm13, $rd", (ANDNri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// btog reg_or_imm, rd -> xor rd,reg_or_imm,rd
+def : InstAlias<"btog $rs2, $rd", (XORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>;
+def : InstAlias<"btog $simm13, $rd", (XORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+
+// clr rd -> or %g0, %g0, rd
+def : InstAlias<"clr $rd", (ORrr IntRegs:$rd, G0, G0), 0>;
+
+// clr{b,h,} [addr] -> st{b,h,} %g0, [addr]
+def : InstAlias<"clrb [$addr]", (STBrr MEMrr:$addr, G0), 0>;
+def : InstAlias<"clrb [$addr]", (STBri MEMri:$addr, G0), 0>;
+def : InstAlias<"clrh [$addr]", (STHrr MEMrr:$addr, G0), 0>;
+def : InstAlias<"clrh [$addr]", (STHri MEMri:$addr, G0), 0>;
+def : InstAlias<"clr [$addr]", (STrr MEMrr:$addr, G0), 0>;
+def : InstAlias<"clr [$addr]", (STri MEMri:$addr, G0), 0>;
+
+
+// mov reg_or_imm, rd -> or %g0, reg_or_imm, rd
+def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>;
+def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>;
+
+// mov specialreg, rd -> rd specialreg, rd
+def : InstAlias<"mov $asr, $rd", (RDASR IntRegs:$rd, ASRRegs:$asr), 0>;
+def : InstAlias<"mov %psr, $rd", (RDPSR IntRegs:$rd), 0>;
+def : InstAlias<"mov %wim, $rd", (RDWIM IntRegs:$rd), 0>;
+def : InstAlias<"mov %tbr, $rd", (RDTBR IntRegs:$rd), 0>;
+
+// mov reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg
+def : InstAlias<"mov $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>;
+def : InstAlias<"mov $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>;
+def : InstAlias<"mov $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>;
+def : InstAlias<"mov $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>;
+
+// End of Section A.3
+
+// wr reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg
+// (aka: omit the first arg when it's g0. This is not in the manual, but is
+// supported by gnu and solaris as)
+def : InstAlias<"wr $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>;
+def : InstAlias<"wr $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>;
+def : InstAlias<"wr $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>;
+def : InstAlias<"wr $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>;
+
+
+// flush -> flush %g0
+def : InstAlias<"flush", (FLUSH), 0>;
+
+
+def : MnemonicAlias<"lduw", "ld">, Requires<[HasV9]>;
+def : MnemonicAlias<"lduwa", "lda">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"addc", "addx">, Requires<[HasV9]>;
+def : MnemonicAlias<"addccc", "addxcc">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"subc", "subx">, Requires<[HasV9]>;
+def : MnemonicAlias<"subccc", "subxcc">, Requires<[HasV9]>;
+
+
+def : InstAlias<"fcmps $rs1, $rs2", (V9FCMPS FCC0, FPRegs:$rs1, FPRegs:$rs2)>;
+def : InstAlias<"fcmpd $rs1, $rs2", (V9FCMPD FCC0, DFPRegs:$rs1, DFPRegs:$rs2)>;
+def : InstAlias<"fcmpq $rs1, $rs2", (V9FCMPQ FCC0, QFPRegs:$rs1, QFPRegs:$rs2)>,
+ Requires<[HasHardQuad]>;
+
+def : InstAlias<"fcmpes $rs1, $rs2", (V9FCMPES FCC0, FPRegs:$rs1, FPRegs:$rs2)>;
+def : InstAlias<"fcmped $rs1, $rs2", (V9FCMPED FCC0, DFPRegs:$rs1,
+ DFPRegs:$rs2)>;
+def : InstAlias<"fcmpeq $rs1, $rs2", (V9FCMPEQ FCC0, QFPRegs:$rs1,
+ QFPRegs:$rs2)>,
+ Requires<[HasHardQuad]>;
+
+// signx rd -> sra rd, %g0, rd
+def : InstAlias<"signx $rd", (SRArr IntRegs:$rd, IntRegs:$rd, G0), 0>, Requires<[HasV9]>;
+
+// signx reg, rd -> sra reg, %g0, rd
+def : InstAlias<"signx $rs1, $rd", (SRArr IntRegs:$rd, IntRegs:$rs1, G0), 0>, Requires<[HasV9]>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td b/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td
new file mode 100644
index 000000000000..76366c6695f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td
@@ -0,0 +1,369 @@
+//===-- SparcInstrFormats.td - Sparc Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// InstSP - Root class of every Sparc instruction record. It declares the
+// 32-bit encoding (Size = 4), puts the record in the "SP" namespace and wires
+// the 2-bit 'op' field into the top of the encoding. Sub-classes assign the
+// remaining bits of Inst.
+class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : Instruction {
+ field bits<32> Inst;
+
+ let Namespace = "SP";
+ let Size = 4;
+
+ bits<2> op;
+ let Inst{31-30} = op; // Top two bits are the 'op' field
+
+ // Operand lists, assembly string and selection pattern are taken directly
+ // from the template arguments.
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+
+ let DecoderNamespace = "Sparc";
+ // All-zero mask: no encoding bit is marked as soft-fail.
+ field bits<32> SoftFail = 0;
+
+ let Itinerary = itin;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #2 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+// Format 2 instructions
+// Common base: op is fixed to 0, op2 sits in bits 24-22 and a 22-bit
+// immediate fills bits 21-0. Bits 29-25 are assigned by the sub-classes.
+class F2<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
+ bits<3> op2;
+ bits<22> imm22;
+ let op = 0; // op = 0
+ let Inst{24-22} = op2;
+ let Inst{21-0} = imm22;
+}
+
+// Specific F2 classes: SparcV8 manual, page 44
+//
+// F2_1 - Format 2 with a destination register: rd occupies bits 29-25 and
+// op2 is supplied by the op2Val template argument.
+class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F2<outs, ins, asmstr, pattern, itin> {
+ bits<5> rd;
+
+ let op2 = op2Val;
+
+ let Inst{29-25} = rd;
+}
+
+// F2_2 - Format 2 with an annul bit (bit 29) and a 4-bit condition field
+// (bits 28-25) in place of rd; op2 comes from op2Val.
+class F2_2<bits<3> op2Val, bit annul, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F2<outs, ins, asmstr, pattern, itin> {
+ bits<4> cond;
+ let op2 = op2Val;
+
+ let Inst{29} = annul;
+ let Inst{28-25} = cond;
+}
+
+// F2_3 - Derives from InstSP rather than F2 because bits 21-0 are split up
+// differently: a 2-bit cc selector (21-20), a prediction bit (19) and a
+// 19-bit immediate (18-0). annul/cond/op2 occupy the same positions as in
+// F2_2.
+class F2_3<bits<3> op2Val, bit annul, bit pred,
+ dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
+ bits<2> cc;
+ bits<4> cond;
+ bits<19> imm19;
+
+ let op = 0; // op = 0
+
+ let Inst{29} = annul;
+ let Inst{28-25} = cond;
+ let Inst{24-22} = op2Val;
+ let Inst{21-20} = cc;
+ let Inst{19} = pred;
+ let Inst{18-0} = imm19;
+}
+
+// F2_4 - op = 0 layout with a 3-bit condition (bits 27-25, bit 28 forced to
+// 0), a register operand rs1 (bits 18-14) and a 16-bit immediate stored in
+// two pieces: imm16{15-14} in bits 21-20 and imm16{13-0} in bits 13-0.
+// Bits 24-22 are hard-coded to 0b011.
+// NOTE(review): this looks like the V9 branch-on-register-contents format -
+// confirm against the architecture manual.
+class F2_4<bits<3> cond, bit annul, bit pred, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
+ bits<16> imm16;
+ bits<5> rs1;
+
+ let op = 0; // op = 0
+
+ let Inst{29} = annul;
+ let Inst{28} = 0;
+ let Inst{27-25} = cond;
+ let Inst{24-22} = 0b011;
+ let Inst{21-20} = imm16{15-14};
+ let Inst{19} = pred;
+ let Inst{18-14} = rs1;
+ let Inst{13-0} = imm16{13-0};
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format #3 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+// F3 - Common base of the format 3 classes. Only the high bit of op is fixed
+// here (so op is 2 or 3); sub-classes set the full value. rd, op3 and rs1
+// occupy bits 29-25, 24-19 and 18-14; bits 13-0 are left to the sub-classes.
+class F3<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
+ bits<5> rd;
+ bits<6> op3;
+ bits<5> rs1;
+ let op{1} = 1; // Op = 2 or 3
+ let Inst{29-25} = rd;
+ let Inst{24-19} = op3;
+ let Inst{18-14} = rs1;
+}
+
+// Specific F3 classes: SparcV8 manual, page 44
+//
+// F3_1_asi - Register-register format 3 (i = 0) that also carries an 8-bit
+// address space identifier in bits 12-5; rs2 is in bits 4-0.
+class F3_1_asi<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
+ bits<8> asi;
+ bits<5> rs2;
+
+ let op = opVal;
+ let op3 = op3val;
+
+ let Inst{13} = 0; // i field = 0
+ let Inst{12-5} = asi; // address space identifier
+ let Inst{4-0} = rs2;
+}
+
+// F3_1 - F3_1_asi with the asi field forced to 0. Note that the default
+// itinerary here is IIC_iu_instr, not NoItinerary.
+class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+ : F3_1_asi<opVal, op3val, outs, ins, asmstr, pattern, itin> {
+ let asi = 0;
+}
+
+// F3_2 - Register-immediate format 3 (i = 1): a 13-bit immediate, simm13,
+// fills bits 12-0. Default itinerary is IIC_iu_instr.
+class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+ : F3<outs, ins, asmstr, pattern, itin> {
+ bits<13> simm13;
+
+ let op = opVal;
+ let op3 = op3val;
+
+ let Inst{13} = 1; // i field = 1
+ let Inst{12-0} = simm13;
+}
+
+// floating-point
+// F3_3 - Format 3 where bits 13-5 hold a fixed 9-bit opf value (opfval) and
+// bits 4-0 hold rs2; rd and rs1 are inherited from F3.
+class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
+ bits<5> rs2;
+
+ let op = opVal;
+ let op3 = op3val;
+
+ let Inst{13-5} = opfval; // fp opcode
+ let Inst{4-0} = rs2;
+}
+
+// floating-point unary operations.
+// Same layout as F3_3, except that the rs1 field is forced to 0 because
+// there is only one source operand (rs2).
+class F3_3u<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
+ bits<5> rs2;
+
+ let op = opVal;
+ let op3 = op3val;
+ let rs1 = 0;
+
+ let Inst{13-5} = opfval; // fp opcode
+ let Inst{4-0} = rs2;
+}
+
+// floating-point compares.
+// The field assignments are the same as in F3_3; this class does not
+// constrain the inherited rd field, so records using it must do so.
+class F3_3c<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
+ bits<5> rs2;
+
+ let op = opVal;
+ let op3 = op3val;
+
+ let Inst{13-5} = opfval; // fp opcode
+ let Inst{4-0} = rs2;
+}
+
+// Shift by register rs2.
+// Bit 13 (i) is 0, bit 12 carries the x flag and rs2 is in bits 4-0. Bits
+// 11-5 are not assigned by this class.
+class F3_Sr<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+ : F3<outs, ins, asmstr, pattern, itin> {
+ bit x = xVal; // 1 for 64-bit shifts.
+ bits<5> rs2;
+
+ let op = opVal;
+ let op3 = op3val;
+
+ let Inst{13} = 0; // i field = 0
+ let Inst{12} = x; // extended registers.
+ let Inst{4-0} = rs2;
+}
+
+// Shift by immediate.
+// Bit 13 (i) is 1, bit 12 carries the x flag and the 6-bit shift count is in
+// bits 5-0. Bits 11-6 are not assigned by this class.
+class F3_Si<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+ : F3<outs, ins, asmstr, pattern, itin> {
+ bit x = xVal; // 1 for 64-bit shifts.
+ bits<6> shcnt; // shcnt32 / shcnt64.
+
+ let op = opVal;
+ let op3 = op3val;
+
+ let Inst{13} = 1; // i field = 1
+ let Inst{12} = x; // extended registers.
+ let Inst{5-0} = shcnt;
+}
+
+// Define rr and ri shift instructions with patterns.
+// Both variants use op = 2 and share Op3Val/XVal. The shifted value and the
+// result live in register class RC with value type VT; the shift amount is
+// either an IntRegs register (rr) or an i32 immediate (ri). OpNode is the
+// SelectionDAG node matched by the patterns.
+multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
+ ValueType VT, RegisterClass RC,
+ InstrItinClass itin = IIC_iu_instr> {
+ def rr : F3_Sr<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, IntRegs:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"),
+ [(set VT:$rd, (OpNode VT:$rs1, i32:$rs2))],
+ itin>;
+ def ri : F3_Si<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, i32imm:$shcnt),
+ !strconcat(OpcStr, " $rs1, $shcnt, $rd"),
+ [(set VT:$rd, (OpNode VT:$rs1, (i32 imm:$shcnt)))],
+ itin>;
+}
+
+// F4 - Common base of the format 4 classes: op is fixed to 2, rd is in bits
+// 29-25 and op3 (a template argument here, not a field) in bits 24-19.
+// Bits 18-0 are assigned by the sub-classes.
+class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
+ bits<5> rd;
+
+ let op = 2;
+ let Inst{29-25} = rd;
+ let Inst{24-19} = op3;
+}
+
+
+// F4_1 - Format 4, register source form (bit 13 = 0): intcc in bit 18, cond
+// in bits 17-14, cc in bits 12-11 and rs2 in bits 4-0. Bits 10-5 are not
+// assigned by this class.
+class F4_1<bits<6> op3, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
+ bit intcc;
+ bits<2> cc;
+ bits<4> cond;
+ bits<5> rs2;
+
+ let Inst{4-0} = rs2;
+ let Inst{12-11} = cc;
+ let Inst{13} = 0;
+ let Inst{17-14} = cond;
+ let Inst{18} = intcc;
+}
+
+// F4_2 - Format 4, immediate source form (bit 13 = 1): same intcc/cond/cc
+// placement as F4_1, with an 11-bit immediate in bits 10-0 instead of rs2.
+class F4_2<bits<6> op3, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
+ bit intcc;
+ bits<2> cc;
+ bits<4> cond;
+ bits<11> simm11;
+
+ let Inst{10-0} = simm11;
+ let Inst{12-11} = cc;
+ let Inst{13} = 1;
+ let Inst{17-14} = cond;
+ let Inst{18} = intcc;
+}
+
+// F4_3 - Format 4 variant with an opf-style sub-opcode: bit 18 is 0, cond is
+// in bits 17-14, intcc in bit 13, opf_cc in bits 12-11, the fixed opf_low
+// value in bits 10-5 and rs2 in bits 4-0.
+class F4_3<bits<6> op3, bits<6> opf_low, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
+ bits<4> cond;
+ bit intcc;
+ bits<2> opf_cc;
+ bits<5> rs2;
+
+ let Inst{18} = 0;
+ let Inst{17-14} = cond;
+ let Inst{13} = intcc;
+ let Inst{12-11} = opf_cc;
+ let Inst{10-5} = opf_low;
+ let Inst{4-0} = rs2;
+}
+
+// F4_4r - Format 4 with a register condition, register source form: rs1 in
+// bits 18-14, bit 13 = 0, the fixed rcond value in bits 12-10, the fixed
+// opf_low value in bits 9-5 and rs2 in bits 4-0.
+class F4_4r<bits<6> op3, bits<5> opf_low, bits<3> rcond, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
+ bits <5> rs1;
+ bits <5> rs2;
+ let Inst{18-14} = rs1;
+ let Inst{13} = 0; // IsImm
+ let Inst{12-10} = rcond;
+ let Inst{9-5} = opf_low;
+ let Inst{4-0} = rs2;
+}
+
+
+// F4_4i - Immediate counterpart of F4_4r: bit 13 = 1 and a 10-bit immediate
+// in bits 9-0 takes the place of opf_low/rs2.
+class F4_4i<bits<6> op3, bits<3> rcond, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
+ bits<5> rs1;
+ bits<10> simm10;
+ let Inst{18-14} = rs1;
+ let Inst{13} = 1; // IsImm
+ let Inst{12-10} = rcond;
+ let Inst{9-0} = simm10;
+}
+
+
+// TRAPSP - Base of the trap instruction classes, built on F3 with op = 0b10.
+// The inherited rd field is reused for the condition: rd{4} is 0 and rd{3-0}
+// holds cond. Bit 13 selects the immediate form (isimm) and bits 12-11 hold
+// cc. Bits 10-0 are assigned by TRAPSPrr / TRAPSPri.
+class TRAPSP<bits<6> op3Val, bit isimm, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
+ bits<4> cond;
+ bits<2> cc;
+
+ let op = 0b10;
+ let rd{4} = 0;
+ let rd{3-0} = cond;
+ let op3 = op3Val;
+ let Inst{13} = isimm;
+ let Inst{12-11} = cc;
+
+}
+
+// TRAPSPrr - Register form of TRAPSP (isimm = 0): bits 10-5 are zero and rs2
+// is in bits 4-0.
+class TRAPSPrr<bits<6> op3Val, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : TRAPSP<op3Val, 0, outs, ins, asmstr, pattern, itin> {
+ bits<5> rs2;
+
+ let Inst{10-5} = 0;
+ let Inst{4-0} = rs2;
+}
+
+// TRAPSPri - Immediate form of TRAPSP (isimm = 1): bits 10-8 are zero and an
+// 8-bit immediate is in bits 7-0.
+class TRAPSPri<bits<6> op3Val, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : TRAPSP<op3Val, 1, outs, ins, asmstr, pattern, itin> {
+ bits<8> imm;
+
+ let Inst{10-8} = 0;
+ let Inst{7-0} = imm;
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+// The record has an empty selection pattern and is flagged isPseudo.
+class AsmPseudoInst<dag outs, dag ins, string asm>
+ : InstSP<outs, ins, asm, []> {
+ let isPseudo = 1;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
new file mode 100644
index 000000000000..ea8ed830bafc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -0,0 +1,510 @@
+//===-- SparcInstrInfo.cpp - Sparc Instruction Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstrInfo.h"
+#include "Sparc.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "SparcGenInstrInfo.inc"
+
+// Pin the vtable to this file: anchor() is the class's out-of-line virtual
+// method, defined here with an intentionally empty body.
+void SparcInstrInfo::anchor() {}
+
+/// Construct the instruction info for subtarget ST. ADJCALLSTACKDOWN and
+/// ADJCALLSTACKUP are passed to the generated base class as the call-frame
+/// setup/destroy pseudo opcodes; the register info member is
+/// default-constructed and a reference to ST is kept.
+SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
+ : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(),
+ Subtarget(ST) {}
+
+/// isLoadFromStackSlot - Recognise a plain reload from a stack slot.
+/// MI qualifies when it is one of the register+immediate loads (LDri, LDXri,
+/// LDFri, LDDFri, LDQFri) whose base operand is a frame index and whose
+/// offset is the immediate 0. On a match FrameIndex receives the slot and the
+/// destination register (virtual or physical) is returned; otherwise 0 is
+/// returned and FrameIndex is left untouched.
+unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                             int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  case SP::LDri:
+  case SP::LDXri:
+  case SP::LDFri:
+  case SP::LDDFri:
+  case SP::LDQFri:
+    break;
+  default:
+    return 0; // Not one of the recognised loads.
+  }
+
+  // Operand 1 is the base address; it has to be a frame index.
+  if (!MI.getOperand(1).isFI())
+    return 0;
+
+  // Operand 2 is the offset; only a literal zero counts as a direct access.
+  const MachineOperand &Offset = MI.getOperand(2);
+  if (!Offset.isImm() || Offset.getImm() != 0)
+    return 0;
+
+  FrameIndex = MI.getOperand(1).getIndex();
+  return MI.getOperand(0).getReg();
+}
+
+/// isStoreToStackSlot - Recognise a plain spill to a stack slot.
+/// MI qualifies when it is one of the register+immediate stores (STri, STXri,
+/// STFri, STDFri, STQFri) whose base operand is a frame index and whose
+/// offset is the immediate 0. On a match FrameIndex receives the stored-to
+/// slot and the source register (virtual or physical) is returned; otherwise
+/// 0 is returned and FrameIndex is left untouched.
+unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                            int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  case SP::STri:
+  case SP::STXri:
+  case SP::STFri:
+  case SP::STDFri:
+  case SP::STQFri:
+    break;
+  default:
+    return 0; // Not one of the recognised stores.
+  }
+
+  // Stores list the address first: operand 0 is the base, operand 1 the
+  // offset and operand 2 the value being stored.
+  if (!MI.getOperand(0).isFI())
+    return 0;
+
+  const MachineOperand &Offset = MI.getOperand(1);
+  if (!Offset.isImm() || Offset.getImm() != 0)
+    return 0;
+
+  FrameIndex = MI.getOperand(0).getIndex();
+  return MI.getOperand(2).getReg();
+}
+
+/// Returns true when CC is numerically no greater than SPCC::ICC_VC, which
+/// callers in this file use to tell integer condition codes from the rest.
+static bool IsIntegerCC(unsigned CC) {
+  const unsigned LastIntegerCC = SPCC::ICC_VC;
+  return CC <= LastIntegerCC;
+}
+
+/// Return the condition code that tests the inverse of CC.
+/// Integer (ICC_*) and floating-point (FCC_*) codes map to their logical
+/// opposite; for the FP codes an ordered test maps to the complementary
+/// unordered-or test (e.g. FCC_G <-> FCC_ULE). Of the co-processor codes only
+/// CPCC_A/CPCC_N can be inverted; every other CPCC_* value reaches
+/// llvm_unreachable.
+static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
+{
+ switch(CC) {
+ // Integer condition codes.
+ case SPCC::ICC_A: return SPCC::ICC_N;
+ case SPCC::ICC_N: return SPCC::ICC_A;
+ case SPCC::ICC_NE: return SPCC::ICC_E;
+ case SPCC::ICC_E: return SPCC::ICC_NE;
+ case SPCC::ICC_G: return SPCC::ICC_LE;
+ case SPCC::ICC_LE: return SPCC::ICC_G;
+ case SPCC::ICC_GE: return SPCC::ICC_L;
+ case SPCC::ICC_L: return SPCC::ICC_GE;
+ case SPCC::ICC_GU: return SPCC::ICC_LEU;
+ case SPCC::ICC_LEU: return SPCC::ICC_GU;
+ case SPCC::ICC_CC: return SPCC::ICC_CS;
+ case SPCC::ICC_CS: return SPCC::ICC_CC;
+ case SPCC::ICC_POS: return SPCC::ICC_NEG;
+ case SPCC::ICC_NEG: return SPCC::ICC_POS;
+ case SPCC::ICC_VC: return SPCC::ICC_VS;
+ case SPCC::ICC_VS: return SPCC::ICC_VC;
+
+ // Floating-point condition codes.
+ case SPCC::FCC_A: return SPCC::FCC_N;
+ case SPCC::FCC_N: return SPCC::FCC_A;
+ case SPCC::FCC_U: return SPCC::FCC_O;
+ case SPCC::FCC_O: return SPCC::FCC_U;
+ case SPCC::FCC_G: return SPCC::FCC_ULE;
+ case SPCC::FCC_LE: return SPCC::FCC_UG;
+ case SPCC::FCC_UG: return SPCC::FCC_LE;
+ case SPCC::FCC_ULE: return SPCC::FCC_G;
+ case SPCC::FCC_L: return SPCC::FCC_UGE;
+ case SPCC::FCC_GE: return SPCC::FCC_UL;
+ case SPCC::FCC_UL: return SPCC::FCC_GE;
+ case SPCC::FCC_UGE: return SPCC::FCC_L;
+ case SPCC::FCC_LG: return SPCC::FCC_UE;
+ case SPCC::FCC_UE: return SPCC::FCC_LG;
+ case SPCC::FCC_NE: return SPCC::FCC_E;
+ case SPCC::FCC_E: return SPCC::FCC_NE;
+
+ // Co-processor condition codes: only always/never have an opposite.
+ case SPCC::CPCC_A: return SPCC::CPCC_N;
+ case SPCC::CPCC_N: return SPCC::CPCC_A;
+ case SPCC::CPCC_3: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_2: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_23: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_1: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_13: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_12: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_123: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_0: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_03: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_02: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_023: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_01: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_013: LLVM_FALLTHROUGH;
+ case SPCC::CPCC_012:
+ // "Opposite" code is not meaningful, as we don't know
+ // what the CoProc condition means here. The cond-code will
+ // only be used in inline assembler, so this code should
+ // not be reached in a normal compilation pass.
+ llvm_unreachable("Meaningless inversion of co-processor cond code");
+ }
+ // Reached only when CC is not one of the enumerators handled above.
+ llvm_unreachable("Invalid cond code");
+}
+
+/// True when Opc is the unconditional direct branch SP::BA.
+static bool isUncondBranchOpcode(int Opc) {
+  return SP::BA == Opc;
+}
+
+/// True when Opc is one of the two conditional branches handled in this
+/// file: SP::BCOND or SP::FBCOND.
+static bool isCondBranchOpcode(int Opc) {
+  switch (Opc) {
+  case SP::BCOND:
+  case SP::FBCOND:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// True when Opc is an indirect branch, in either its register+register
+/// (SP::BINDrr) or register+immediate (SP::BINDri) form.
+static bool isIndirectBranchOpcode(int Opc) {
+  switch (Opc) {
+  case SP::BINDrr:
+  case SP::BINDri:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Decompose the conditional branch LastInst: append its condition code
+/// (operand 1) to Cond as an immediate operand and store its destination
+/// block (operand 0) in Target.
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+                            SmallVectorImpl<MachineOperand> &Cond) {
+  const int64_t CondCode = LastInst->getOperand(1).getImm();
+  Cond.push_back(MachineOperand::CreateImm(CondCode));
+
+  Target = LastInst->getOperand(0).getMBB();
+}
+
+/// analyzeBranch - Inspect the terminators at the end of MBB.
+/// Returns false when the terminator sequence was understood: no terminator
+/// at all leaves the outputs untouched, a lone BA sets TBB, a lone
+/// conditional branch sets TBB and Cond, and a conditional branch followed by
+/// BA sets TBB, Cond and FBB. Returns true when the sequence cannot be
+/// described that way, e.g. an indirect branch or three or more terminators.
+/// With AllowModify set, unconditional branches that can never execute are
+/// erased from the block.
+bool SparcInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // An empty block (ignoring debug instructions) simply falls through.
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return false;
+
+ // So does a block whose last instruction is not a terminator.
+ if (!isUnpredicatedTerminator(*I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = &*I;
+ unsigned LastOpc = LastInst->getOpcode();
+
+ // If there is only one terminator instruction, process it.
+ // Note that the condition steps I back by one as a side effect.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ if (isUncondBranchOpcode(LastOpc)) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ if (isCondBranchOpcode(LastOpc)) {
+ // Block ends with fall-through condbranch.
+ parseCondBranch(LastInst, TBB, Cond);
+ return false;
+ }
+ return true; // Can't handle indirect branch.
+ }
+
+ // Get the instruction before it if it is a terminator.
+ MachineInstr *SecondLastInst = &*I;
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+ // If AllowModify is true and the block ends with two or more unconditional
+ // branches, delete all but the first unconditional branch.
+ if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+ while (isUncondBranchOpcode(SecondLastOpc)) {
+ LastInst->eraseFromParent();
+ LastInst = SecondLastInst;
+ LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ // Return now the only terminator is an unconditional branch.
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else {
+ SecondLastInst = &*I;
+ SecondLastOpc = SecondLastInst->getOpcode();
+ }
+ }
+ }
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
+ return true;
+
+ // If the block ends with a B and a Bcc, handle it.
+ if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ parseCondBranch(SecondLastInst, TBB, Cond);
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed.
+ if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // ...likewise if it ends with an indirect branch followed by an unconditional
+ // branch.
+ // The trailing branch is removed when modification is allowed, but the block
+ // is still reported as not analyzable.
+ if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+/// insertBranch - Append branch instructions to the end of MBB.
+/// With an empty Cond a single BA to TBB is emitted. Otherwise Cond[0] holds
+/// the condition code: a BCOND (integer codes) or FBCOND (all other codes) to
+/// TBB is emitted, followed by a BA to FBB when FBB is non-null. Returns the
+/// number of instructions added (1 or 2). BytesAdded must be null; code size
+/// reporting is not implemented.
+unsigned SparcInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                      MachineBasicBlock *TBB,
+                                      MachineBasicBlock *FBB,
+                                      ArrayRef<MachineOperand> Cond,
+                                      const DebugLoc &DL,
+                                      int *BytesAdded) const {
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "Sparc branch conditions should have one component!");
+  assert(!BytesAdded && "code size not handled");
+
+  const bool IsConditional = !Cond.empty();
+  if (!IsConditional) {
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(SP::BA)).addMBB(TBB);
+    return 1;
+  }
+
+  // Pick the branch opcode that matches the kind of condition code.
+  const unsigned CC = Cond[0].getImm();
+  const unsigned BranchOpc = IsIntegerCC(CC) ? SP::BCOND : SP::FBCOND;
+  BuildMI(&MBB, DL, get(BranchOpc)).addMBB(TBB).addImm(CC);
+
+  // A two-way branch needs an explicit jump to the false destination.
+  if (FBB) {
+    BuildMI(&MBB, DL, get(SP::BA)).addMBB(FBB);
+    return 2;
+  }
+  return 1;
+}
+
+/// removeBranch - Erase the BA / BCOND / FBCOND instructions found at the end
+/// of MBB, skipping over debug values, and stop at the first other
+/// instruction. Returns how many branches were erased. BytesRemoved must be
+/// null; code size reporting is not implemented.
+unsigned SparcInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                      int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  unsigned NumErased = 0;
+  for (MachineBasicBlock::iterator It = MBB.end(); It != MBB.begin();) {
+    --It;
+
+    if (It->isDebugValue())
+      continue;
+
+    const unsigned Opc = It->getOpcode();
+    const bool IsBranch =
+        Opc == SP::BA || Opc == SP::BCOND || Opc == SP::FBCOND;
+    if (!IsBranch)
+      break; // Not a branch
+
+    // Erasing invalidates the iterator, so restart the scan from the end.
+    It->eraseFromParent();
+    It = MBB.end();
+    ++NumErased;
+  }
+  return NumErased;
+}
+
+/// reverseBranchCondition - Replace the single condition code stored in Cond
+/// with its opposite, as computed by GetOppositeBranchCondition. Always
+/// returns false.
+bool SparcInstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1);
+  MachineOperand &CCOperand = Cond[0];
+  const SPCC::CondCodes Current =
+      static_cast<SPCC::CondCodes>(CCOperand.getImm());
+  CCOperand.setImm(GetOppositeBranchCondition(Current));
+  return false;
+}
+
+/// copyPhysReg - Emit, before I, the instruction(s) that copy SrcReg into
+/// DestReg.
+/// Copies that one instruction can perform are emitted directly: ORrr with
+/// %g0 for IntRegs, FMOVS for FPRegs, FMOVD/FMOVQ for DFPRegs/QFPRegs when
+/// the subtarget allows it, and WRASRrr / RDASR between IntRegs and ASRRegs.
+/// The remaining cases (IntPair, and wide FP registers the subtarget cannot
+/// move at once) are split into one move per sub-register. Any other
+/// register combination reaches llvm_unreachable.
+void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ // State describing a split copy; left at these defaults when a single
+ // instruction was enough.
+ unsigned numSubRegs = 0;
+ unsigned movOpc = 0;
+ const unsigned *subRegIdx = nullptr;
+ bool ExtraG0 = false;
+
+ // Sub-register index lists for each kind of split copy.
+ const unsigned DW_SubRegsIdx[] = { SP::sub_even, SP::sub_odd };
+ const unsigned DFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd };
+ const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 };
+ const unsigned QFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd,
+ SP::sub_odd64_then_sub_even,
+ SP::sub_odd64_then_sub_odd };
+
+ if (SP::IntRegsRegClass.contains(DestReg, SrcReg))
+ BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (SP::IntPairRegClass.contains(DestReg, SrcReg)) {
+ // Two ORrr moves; each one needs %g0 as its first source operand.
+ subRegIdx = DW_SubRegsIdx;
+ numSubRegs = 2;
+ movOpc = SP::ORrr;
+ ExtraG0 = true;
+ } else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
+ BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) {
+ if (Subtarget.isV9()) {
+ BuildMI(MBB, I, DL, get(SP::FMOVD), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ // Use two FMOVS instructions.
+ subRegIdx = DFP_FP_SubRegsIdx;
+ numSubRegs = 2;
+ movOpc = SP::FMOVS;
+ }
+ } else if (SP::QFPRegsRegClass.contains(DestReg, SrcReg)) {
+ if (Subtarget.isV9()) {
+ if (Subtarget.hasHardQuad()) {
+ BuildMI(MBB, I, DL, get(SP::FMOVQ), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ // Use two FMOVD instructions.
+ subRegIdx = QFP_DFP_SubRegsIdx;
+ numSubRegs = 2;
+ movOpc = SP::FMOVD;
+ }
+ } else {
+ // Use four FMOVS instructions.
+ subRegIdx = QFP_FP_SubRegsIdx;
+ numSubRegs = 4;
+ movOpc = SP::FMOVS;
+ }
+ } else if (SP::ASRRegsRegClass.contains(DestReg) &&
+ SP::IntRegsRegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(SP::WRASRrr), DestReg)
+ .addReg(SP::G0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else if (SP::IntRegsRegClass.contains(DestReg) &&
+ SP::ASRRegsRegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(SP::RDASR), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else
+ llvm_unreachable("Impossible reg-to-reg copy");
+
+ // Nothing more to do when the copy was emitted as a single instruction.
+ if (numSubRegs == 0 || subRegIdx == nullptr || movOpc == 0)
+ return;
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineInstr *MovMI = nullptr;
+
+ // Emit one move per sub-register, in the order given by subRegIdx.
+ for (unsigned i = 0; i != numSubRegs; ++i) {
+ unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
+ unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]);
+ assert(Dst && Src && "Bad sub-register");
+
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst);
+ if (ExtraG0)
+ MIB.addReg(SP::G0);
+ MIB.addReg(Src);
+ MovMI = MIB.getInstr();
+ }
+ // Add implicit super-register defs and kills to the last MovMI.
+ MovMI->addRegisterDefined(DestReg, TRI);
+ if (KillSrc)
+ MovMI->addRegisterKilled(SrcReg, TRI);
+}
+
+/// storeRegToStackSlot - Insert, before I, a store of SrcReg to the stack
+/// slot FI at offset 0. The store opcode is chosen from the register class
+/// RC; a class with no matching store reaches llvm_unreachable. The
+/// instruction carries a memory operand describing the fixed-stack access,
+/// and SrcReg is marked killed when isKill is set.
+void SparcInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  // Borrow the debug location of the insertion point, if there is one.
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  MachineFunction &Fn = *MBB.getParent();
+  const MachineFrameInfo &FrameInfo = Fn.getFrameInfo();
+  MachineMemOperand *MemOp = Fn.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(Fn, FI), MachineMemOperand::MOStore,
+      FrameInfo.getObjectSize(FI), FrameInfo.getObjectAlignment(FI));
+
+  // Map the register class onto the store opcode.
+  unsigned StoreOpc = 0;
+  if (RC == &SP::I64RegsRegClass)
+    StoreOpc = SP::STXri;
+  else if (RC == &SP::IntRegsRegClass)
+    StoreOpc = SP::STri;
+  else if (RC == &SP::IntPairRegClass)
+    StoreOpc = SP::STDri;
+  else if (RC == &SP::FPRegsRegClass)
+    StoreOpc = SP::STFri;
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
+    StoreOpc = SP::STDFri;
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use STQFri irrespective of its legality. If STQ is not legal, it will be
+    // lowered into two STDs in eliminateFrameIndex.
+    StoreOpc = SP::STQFri;
+  else
+    llvm_unreachable("Can't store this register to stack slot");
+
+  // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
+  BuildMI(MBB, I, DL, get(StoreOpc))
+      .addFrameIndex(FI)
+      .addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addMemOperand(MemOp);
+}
+
+/// loadRegFromStackSlot - Insert, before I, a load of DestReg from the stack
+/// slot FI at offset 0. The load opcode is chosen from the register class
+/// RC; a class with no matching load reaches llvm_unreachable. The
+/// instruction carries a memory operand describing the fixed-stack access.
+void SparcInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  // Borrow the debug location of the insertion point, if there is one.
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  MachineFunction &Fn = *MBB.getParent();
+  const MachineFrameInfo &FrameInfo = Fn.getFrameInfo();
+  MachineMemOperand *MemOp = Fn.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(Fn, FI), MachineMemOperand::MOLoad,
+      FrameInfo.getObjectSize(FI), FrameInfo.getObjectAlignment(FI));
+
+  // Map the register class onto the load opcode.
+  unsigned LoadOpc = 0;
+  if (RC == &SP::I64RegsRegClass)
+    LoadOpc = SP::LDXri;
+  else if (RC == &SP::IntRegsRegClass)
+    LoadOpc = SP::LDri;
+  else if (RC == &SP::IntPairRegClass)
+    LoadOpc = SP::LDDri;
+  else if (RC == &SP::FPRegsRegClass)
+    LoadOpc = SP::LDFri;
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
+    LoadOpc = SP::LDDFri;
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use LDQFri irrespective of its legality. If LDQ is not legal, it will be
+    // lowered into two LDDs in eliminateFrameIndex.
+    LoadOpc = SP::LDQFri;
+  else
+    llvm_unreachable("Can't load this register from stack slot");
+
+  BuildMI(MBB, I, DL, get(LoadOpc), DestReg)
+      .addFrameIndex(FI)
+      .addImm(0)
+      .addMemOperand(MemOp);
+}
+
+/// getGlobalBaseReg - Return the virtual register that holds the global base
+/// for MF, creating it on first use. Creation inserts a GETPCX defining the
+/// new register at the very start of the function's first block and records
+/// the register in the SparcMachineFunctionInfo so later calls reuse it.
+unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  SparcMachineFunctionInfo *FuncInfo = MF->getInfo<SparcMachineFunctionInfo>();
+
+  // Reuse the register if one has already been set up for this function.
+  if (unsigned Existing = FuncInfo->getGlobalBaseReg())
+    return Existing;
+
+  // Insert the set of GlobalBaseReg into the first MBB of the function
+  MachineBasicBlock &EntryMBB = MF->front();
+  MachineBasicBlock::iterator InsertPt = EntryMBB.begin();
+
+  // The register class follows the pointer width of the subtarget.
+  const TargetRegisterClass *PtrRC =
+      Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
+  const unsigned BaseReg = MF->getRegInfo().createVirtualRegister(PtrRC);
+
+  BuildMI(EntryMBB, InsertPt, DebugLoc(), get(SP::GETPCX), BaseReg);
+  FuncInfo->setGlobalBaseReg(BaseReg);
+  return BaseReg;
+}
+
+/// expandPostRAPseudo - Lower target-independent pseudo instructions after
+/// register allocation. Only LOAD_STACK_GUARD is handled: MI is rewritten in
+/// place into a load (LDXri on 64-bit subtargets, LDri otherwise) from %g7
+/// plus a fixed offset. Returns true when MI was expanded, false when it was
+/// left alone.
+bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
+    return false;
+
+  assert(Subtarget.isTargetLinux() &&
+         "Only Linux target is expected to contain LOAD_STACK_GUARD");
+
+  const bool Is64 = Subtarget.is64Bit();
+  // offsetof(tcbhead_t, stack_guard) from sysdeps/sparc/nptl/tls.h in glibc.
+  const int64_t Offset = Is64 ? 0x28 : 0x14;
+
+  // Turn the pseudo into the real load, then append base and offset operands.
+  MI.setDesc(get(Is64 ? SP::LDXri : SP::LDri));
+  MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+      .addReg(SP::G7)
+      .addImm(Offset);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h
new file mode 100644
index 000000000000..c053cc4c475b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h
@@ -0,0 +1,108 @@
+//===-- SparcInstrInfo.h - Sparc Instruction Information --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCINSTRINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCINSTRINFO_H
+
+#include "SparcRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "SparcGenInstrInfo.inc"
+
+namespace llvm {
+
+class SparcSubtarget;
+
+/// SPII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+/// Each enumerator is a distinct single-bit mask, so the flags can be
+/// combined with bitwise OR.
+namespace SPII {
+ enum {
+ Pseudo = (1<<0),
+ Load = (1<<1),
+ Store = (1<<2),
+ DelaySlot = (1<<3)
+ };
+}
+
+/// SparcInstrInfo - Sparc implementation of the instruction info interface,
+/// layered on the TableGen-generated SparcGenInstrInfo. It owns the target's
+/// register info object and keeps a reference to the subtarget it was
+/// created for.
+class SparcInstrInfo : public SparcGenInstrInfo {
+ const SparcRegisterInfo RI;
+ const SparcSubtarget& Subtarget;
+ virtual void anchor();
+public:
+ explicit SparcInstrInfo(SparcSubtarget &ST);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const SparcRegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source reg along with the FrameIndex of the stored-to stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ /// analyzeBranch - Describe the terminators of MBB through TBB, FBB and
+ /// Cond. Returns false when they were understood and true when they were
+ /// not. AllowModify permits erasing branches that can never execute.
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+
+ /// removeBranch - Erase the trailing BA/BCOND/FBCOND instructions of MBB
+ /// and return how many were erased. BytesRemoved must be null.
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ /// insertBranch - Append a branch to TBB (conditional when Cond is
+ /// non-empty), plus an unconditional branch to FBB when it is given.
+ /// Returns the number of instructions added. BytesAdded must be null.
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+ /// reverseBranchCondition - Invert the single condition code held in Cond.
+ /// Always returns false.
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ /// copyPhysReg - Emit the instruction(s) copying SrcReg into DestReg before
+ /// I, splitting the copy per sub-register where one move is not enough.
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ /// storeRegToStackSlot - Insert a store of SrcReg to stack slot FrameIndex
+ /// before MBBI, using the store opcode that matches register class RC.
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// loadRegFromStackSlot - Insert a load of DestReg from stack slot
+ /// FrameIndex before MBBI, using the load opcode that matches register
+ /// class RC.
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// getGlobalBaseReg - Return the virtual register holding the global base
+ /// for MF, creating it (and the GETPCX that defines it) on first use.
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+ // Lower pseudo instructions after register allocation.
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
new file mode 100644
index 000000000000..5a19c624abb5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -0,0 +1,1692 @@
+//===-- SparcInstrInfo.td - Target Description for Sparc Target -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Sparc instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "SparcInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+// True when generating 32-bit code.
+def Is32Bit : Predicate<"!Subtarget->is64Bit()">;
+
+// True when generating 64-bit code. This also implies HasV9.
+def Is64Bit : Predicate<"Subtarget->is64Bit()">;
+
+// HasV9 - This predicate is true when the target processor supports V9
+// instructions. Note that the machine may be running in 32-bit mode.
+def HasV9 : Predicate<"Subtarget->isV9()">,
+ AssemblerPredicate<"FeatureV9">;
+
+// HasNoV9 - This predicate is true when the target doesn't have V9
+// instructions. Use of this is just a hack for the isel not having proper
+// costs for V8 instructions that are more expensive than their V9 ones.
+def HasNoV9 : Predicate<"!Subtarget->isV9()">;
+
+// HasVIS - This is true when the target processor has VIS extensions.
+def HasVIS : Predicate<"Subtarget->isVIS()">,
+ AssemblerPredicate<"FeatureVIS">;
+def HasVIS2 : Predicate<"Subtarget->isVIS2()">,
+ AssemblerPredicate<"FeatureVIS2">;
+def HasVIS3 : Predicate<"Subtarget->isVIS3()">,
+ AssemblerPredicate<"FeatureVIS3">;
+
+// HasHardQuad - This is true when the target processor supports quad floating
+// point instructions.
+def HasHardQuad : Predicate<"Subtarget->hasHardQuad()">;
+
+// HasLeonCASA - This is true when the target processor supports the CASA
+// instruction
+def HasLeonCASA : Predicate<"Subtarget->hasLeonCasa()">;
+
+// HasUMAC_SMAC - This is true when the target processor supports the
+// UMAC and SMAC instructions
+def HasUMAC_SMAC : Predicate<"Subtarget->hasUmacSmac()">;
+
+def HasNoFdivSqrtFix : Predicate<"!Subtarget->fixAllFDIVSQRT()">;
+def HasNoFmulsFix : Predicate<"!Subtarget->replaceFMULS()">;
+def HasNoFsmuldFix : Predicate<"!Subtarget->fixFSMULD()">;
+
+// UseDeprecatedInsts - This predicate is true when the target processor is a
+// V8, or when it is V9 but the V8 deprecated instructions are efficient enough
+// to use when appropriate. In either of these cases, the instruction selector
+// will pick deprecated instructions.
+def UseDeprecatedInsts : Predicate<"Subtarget->useDeprecatedV8Instructions()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def simm11 : PatLeaf<(imm), [{ return isInt<11>(N->getSExtValue()); }]>;
+
+def simm13 : PatLeaf<(imm), [{ return isInt<13>(N->getSExtValue()); }]>;
+
+def LO10 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((unsigned)N->getZExtValue() & 1023, SDLoc(N),
+ MVT::i32);
+}]>;
+
+def HI22 : SDNodeXForm<imm, [{
+ // Transformation function: shift the immediate value down into the low bits.
+ return CurDAG->getTargetConstant((unsigned)N->getZExtValue() >> 10, SDLoc(N),
+ MVT::i32);
+}]>;
+
+def SETHIimm : PatLeaf<(imm), [{
+ return isShiftedUInt<22, 10>(N->getZExtValue());
+}], HI22>;
+
+// Addressing modes.
+def ADDRrr : ComplexPattern<iPTR, 2, "SelectADDRrr", [], []>;
+def ADDRri : ComplexPattern<iPTR, 2, "SelectADDRri", [frameindex], []>;
+
+// Address operands
+def SparcMEMrrAsmOperand : AsmOperandClass {
+ let Name = "MEMrr";
+ let ParserMethod = "parseMEMOperand";
+}
+
+def SparcMEMriAsmOperand : AsmOperandClass {
+ let Name = "MEMri";
+ let ParserMethod = "parseMEMOperand";
+}
+
+def MEMrr : Operand<iPTR> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_rc, ptr_rc);
+ let ParserMatchClass = SparcMEMrrAsmOperand;
+}
+def MEMri : Operand<iPTR> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops ptr_rc, i32imm);
+ let ParserMatchClass = SparcMEMriAsmOperand;
+}
+
+def TLSSym : Operand<iPTR>;
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValue";
+}
+
+def bprtarget : Operand<OtherVT> {
+ let EncoderMethod = "getBranchPredTargetOpValue";
+}
+
+def bprtarget16 : Operand<OtherVT> {
+ let EncoderMethod = "getBranchOnRegTargetOpValue";
+}
+
+def calltarget : Operand<i32> {
+ let EncoderMethod = "getCallTargetOpValue";
+ let DecoderMethod = "DecodeCall";
+}
+
+def simm13Op : Operand<i32> {
+ let DecoderMethod = "DecodeSIMM13";
+}
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+ def CCOp : Operand<i32>;
+
+def SDTSPcmpicc :
+SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPcmpfcc :
+SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPbrcc :
+SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+def SDTSPselectcc :
+SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>;
+def SDTSPFTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDTSPITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDTSPFTOX :
+SDTypeProfile<1, 1, [SDTCisVT<0, f64>, SDTCisFP<1>]>;
+def SDTSPXTOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f64>]>;
+
+def SDTSPtlsadd :
+SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDTSPtlsld :
+SDTypeProfile<1, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+
+def SDTSPeh_sjlj_setjmp : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
+def SDTSPeh_sjlj_longjmp: SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SPcmpicc : SDNode<"SPISD::CMPICC", SDTSPcmpicc, [SDNPOutGlue]>;
+def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
+def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+def SPbrxcc : SDNode<"SPISD::BRXCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+def SPbrfcc : SDNode<"SPISD::BRFCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+
+def SPhi : SDNode<"SPISD::Hi", SDTIntUnaryOp>;
+def SPlo : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
+
+def SPftoi : SDNode<"SPISD::FTOI", SDTSPFTOI>;
+def SPitof : SDNode<"SPISD::ITOF", SDTSPITOF>;
+def SPftox : SDNode<"SPISD::FTOX", SDTSPFTOX>;
+def SPxtof : SDNode<"SPISD::XTOF", SDTSPXTOF>;
+
+def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInGlue]>;
+def SPselectxcc : SDNode<"SPISD::SELECT_XCC", SDTSPselectcc, [SDNPInGlue]>;
+def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInGlue]>;
+
+def SPsjlj_setjmp: SDNode<"SPISD::EH_SJLJ_SETJMP",
+ SDTSPeh_sjlj_setjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
+def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP",
+ SDTSPeh_sjlj_longjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_SPCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+def call : SDNode<"SPISD::CALL", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
+def SDT_SPRet : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def retflag : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def flushw : SDNode<"SPISD::FLUSHW", SDTNone,
+ [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+
+def tlsadd : SDNode<"SPISD::TLS_ADD", SDTSPtlsadd>;
+def tlsld : SDNode<"SPISD::TLS_LD", SDTSPtlsld>;
+def tlscall : SDNode<"SPISD::TLS_CALL", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
+def getPCX : Operand<iPTR> {
+ let PrintMethod = "printGetPCX";
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC Flag Conditions
+//===----------------------------------------------------------------------===//
+
+// Note that these values must be kept in sync with the CCOp::CondCode enum
+// values.
+class ICC_VAL<int N> : PatLeaf<(i32 N)>;
+def ICC_NE : ICC_VAL< 9>; // Not Equal
+def ICC_E : ICC_VAL< 1>; // Equal
+def ICC_G : ICC_VAL<10>; // Greater
+def ICC_LE : ICC_VAL< 2>; // Less or Equal
+def ICC_GE : ICC_VAL<11>; // Greater or Equal
+def ICC_L : ICC_VAL< 3>; // Less
+def ICC_GU : ICC_VAL<12>; // Greater Unsigned
+def ICC_LEU : ICC_VAL< 4>; // Less or Equal Unsigned
+def ICC_CC : ICC_VAL<13>; // Carry Clear/Greater or Equal Unsigned
+def ICC_CS : ICC_VAL< 5>; // Carry Set/Less Unsigned
+def ICC_POS : ICC_VAL<14>; // Positive
+def ICC_NEG : ICC_VAL< 6>; // Negative
+def ICC_VC : ICC_VAL<15>; // Overflow Clear
+def ICC_VS : ICC_VAL< 7>; // Overflow Set
+
+class FCC_VAL<int N> : PatLeaf<(i32 N)>;
+def FCC_U : FCC_VAL<23>; // Unordered
+def FCC_G : FCC_VAL<22>; // Greater
+def FCC_UG : FCC_VAL<21>; // Unordered or Greater
+def FCC_L : FCC_VAL<20>; // Less
+def FCC_UL : FCC_VAL<19>; // Unordered or Less
+def FCC_LG : FCC_VAL<18>; // Less or Greater
+def FCC_NE : FCC_VAL<17>; // Not Equal
+def FCC_E : FCC_VAL<25>; // Equal
+def FCC_UE : FCC_VAL<26>; // Unordered or Equal
+def FCC_GE : FCC_VAL<27>; // Greater or Equal
+def FCC_UGE : FCC_VAL<28>; // Unordered or Greater or Equal
+def FCC_LE : FCC_VAL<29>; // Less or Equal
+def FCC_ULE : FCC_VAL<30>; // Unordered or Less or Equal
+def FCC_O : FCC_VAL<31>; // Ordered
+
+class CPCC_VAL<int N> : PatLeaf<(i32 N)>;
+def CPCC_3 : CPCC_VAL<39>; // 3
+def CPCC_2 : CPCC_VAL<38>; // 2
+def CPCC_23 : CPCC_VAL<37>; // 2 or 3
+def CPCC_1 : CPCC_VAL<36>; // 1
+def CPCC_13 : CPCC_VAL<35>; // 1 or 3
+def CPCC_12 : CPCC_VAL<34>; // 1 or 2
+def CPCC_123 : CPCC_VAL<33>; // 1 or 2 or 3
+def CPCC_0 : CPCC_VAL<41>; // 0
+def CPCC_03 : CPCC_VAL<42>; // 0 or 3
+def CPCC_02 : CPCC_VAL<43>; // 0 or 2
+def CPCC_023 : CPCC_VAL<44>; // 0 or 2 or 3
+def CPCC_01 : CPCC_VAL<45>; // 0 or 1
+def CPCC_013 : CPCC_VAL<46>; // 0 or 1 or 3
+def CPCC_012 : CPCC_VAL<47>; // 0 or 1 or 2
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
+multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode,
+ RegisterClass RC, ValueType Ty, Operand immOp,
+ InstrItinClass itin = IIC_iu_instr> {
+ def rr : F3_1<2, Op3Val,
+ (outs RC:$rd), (ins RC:$rs1, RC:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"),
+ [(set Ty:$rd, (OpNode Ty:$rs1, Ty:$rs2))],
+ itin>;
+ def ri : F3_2<2, Op3Val,
+ (outs RC:$rd), (ins RC:$rs1, immOp:$simm13),
+ !strconcat(OpcStr, " $rs1, $simm13, $rd"),
+ [(set Ty:$rd, (OpNode Ty:$rs1, (Ty simm13:$simm13)))],
+ itin>;
+}
+
+/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
+/// pattern.
+multiclass F3_12np<string OpcStr, bits<6> Op3Val, InstrItinClass itin = IIC_iu_instr> {
+ def rr : F3_1<2, Op3Val,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"), [],
+ itin>;
+ def ri : F3_2<2, Op3Val,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ !strconcat(OpcStr, " $rs1, $simm13, $rd"), [],
+ itin>;
+}
+
+// Load multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
+multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+ RegisterClass RC, ValueType Ty, InstrItinClass itin = IIC_iu_instr> {
+ def rr : F3_1<3, Op3Val,
+ (outs RC:$dst), (ins MEMrr:$addr),
+ !strconcat(OpcStr, " [$addr], $dst"),
+ [(set Ty:$dst, (OpNode ADDRrr:$addr))],
+ itin>;
+ def ri : F3_2<3, Op3Val,
+ (outs RC:$dst), (ins MEMri:$addr),
+ !strconcat(OpcStr, " [$addr], $dst"),
+ [(set Ty:$dst, (OpNode ADDRri:$addr))],
+ itin>;
+}
+
+// TODO: Instructions of the LoadASI class are currently asm only; hooking up
+// CodeGen's address spaces to use these is a future task.
+class LoadASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+ RegisterClass RC, ValueType Ty, InstrItinClass itin = NoItinerary> :
+ F3_1_asi<3, Op3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi),
+ !strconcat(OpcStr, "a [$addr] $asi, $dst"),
+ [], itin>; // Forward the itinerary instead of dropping it (cf. StoreASI).
+
+// LoadA multiclass - As Load, plus the alternate address space (Arr) variant.
+multiclass LoadA<string OpcStr, bits<6> Op3Val, bits<6> LoadAOp3Val,
+ SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+ InstrItinClass itin = NoItinerary> :
+ Load<OpcStr, Op3Val, OpNode, RC, Ty, itin> {
+ def Arr : LoadASI<OpcStr, LoadAOp3Val, OpNode, RC, Ty, itin>;
+}
+
+// The LDSTUB instruction is supported for asm only.
+// It is unlikely that general-purpose code could make use of it.
+// CAS is preferred for sparc v9.
+def LDSTUBrr : F3_1<3, 0b001101, (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "ldstub [$addr], $dst", []>;
+def LDSTUBri : F3_2<3, 0b001101, (outs IntRegs:$dst), (ins MEMri:$addr),
+ "ldstub [$addr], $dst", []>;
+def LDSTUBArr : F3_1_asi<3, 0b011101, (outs IntRegs:$dst),
+ (ins MEMrr:$addr, i8imm:$asi),
+ "ldstuba [$addr] $asi, $dst", []>;
+
+// Store multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
+multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+ RegisterClass RC, ValueType Ty, InstrItinClass itin = IIC_st> {
+ def rr : F3_1<3, Op3Val,
+ (outs), (ins MEMrr:$addr, RC:$rd),
+ !strconcat(OpcStr, " $rd, [$addr]"),
+ [(OpNode Ty:$rd, ADDRrr:$addr)],
+ itin>;
+ def ri : F3_2<3, Op3Val,
+ (outs), (ins MEMri:$addr, RC:$rd),
+ !strconcat(OpcStr, " $rd, [$addr]"),
+ [(OpNode Ty:$rd, ADDRri:$addr)],
+ itin>;
+}
+
+// TODO: Instructions of the StoreASI class are currently asm only; hooking up
+// CodeGen's address spaces to use these is a future task.
+class StoreASI<string OpcStr, bits<6> Op3Val,
+ SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+ InstrItinClass itin = IIC_st> :
+ F3_1_asi<3, Op3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi),
+ !strconcat(OpcStr, "a $rd, [$addr] $asi"),
+ [],
+ itin>;
+
+// StoreA multiclass - As Store, plus the alternate address space (Arr) variant.
+multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val,
+ SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+ InstrItinClass itin = IIC_st> : Store<OpcStr, Op3Val, OpNode, RC, Ty, itin> {
+ def Arr : StoreASI<OpcStr, StoreAOp3Val, OpNode, RC, Ty, itin>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSP<outs, ins, asmstr, pattern> {
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+}
+
+// GETPCX for PIC
+let Defs = [O7] in {
+ def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >;
+}
+
+let Defs = [O6], Uses = [O6] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ "!ADJCALLSTACKDOWN $amt",
+ [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "!ADJCALLSTACKUP $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let hasSideEffects = 1, mayStore = 1 in {
+ let rd = 0, rs1 = 0, rs2 = 0 in
+ def FLUSHW : F3_1<0b10, 0b101011, (outs), (ins),
+ "flushw",
+ [(flushw)]>, Requires<[HasV9]>;
+ let rd = 0, rs1 = 1, simm13 = 3 in
+ def TA3 : F3_2<0b10, 0b111010, (outs), (ins),
+ "ta 3",
+ [(flushw)]>;
+}
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
+// instruction selection into a branch sequence (these pseudos set
+// usesCustomInserter). This has to handle all permutations of selection
+// between i32/f32/f64/f128 on ICC and FCC.
+let Uses = [ICC], usesCustomInserter = 1 in {
+ def SELECT_CC_Int_ICC
+ : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_Int_ICC PSEUDO!",
+ [(set i32:$dst, (SPselecticc i32:$T, i32:$F, imm:$Cond))]>;
+ def SELECT_CC_FP_ICC
+ : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_FP_ICC PSEUDO!",
+ [(set f32:$dst, (SPselecticc f32:$T, f32:$F, imm:$Cond))]>;
+
+ def SELECT_CC_DFP_ICC
+ : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_DFP_ICC PSEUDO!",
+ [(set f64:$dst, (SPselecticc f64:$T, f64:$F, imm:$Cond))]>;
+
+ def SELECT_CC_QFP_ICC
+ : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_QFP_ICC PSEUDO!",
+ [(set f128:$dst, (SPselecticc f128:$T, f128:$F, imm:$Cond))]>;
+}
+
+let usesCustomInserter = 1, Uses = [FCC0] in {
+
+ def SELECT_CC_Int_FCC
+ : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_Int_FCC PSEUDO!",
+ [(set i32:$dst, (SPselectfcc i32:$T, i32:$F, imm:$Cond))]>;
+
+ def SELECT_CC_FP_FCC
+ : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_FP_FCC PSEUDO!",
+ [(set f32:$dst, (SPselectfcc f32:$T, f32:$F, imm:$Cond))]>;
+ def SELECT_CC_DFP_FCC
+ : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_DFP_FCC PSEUDO!",
+ [(set f64:$dst, (SPselectfcc f64:$T, f64:$F, imm:$Cond))]>;
+ def SELECT_CC_QFP_FCC
+ : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_QFP_FCC PSEUDO!",
+ [(set f128:$dst, (SPselectfcc f128:$T, f128:$F, imm:$Cond))]>;
+}
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+ let Defs = [WIM] in
+ def EH_SJLJ_SETJMP32ri : Pseudo<(outs IntRegs:$dst), (ins MEMri:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set i32:$dst, (SPsjlj_setjmp ADDRri:$buf))]>,
+ Requires<[Is32Bit]>;
+ def EH_SJLJ_SETJMP32rr : Pseudo<(outs IntRegs:$dst), (ins MEMrr:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set i32:$dst, (SPsjlj_setjmp ADDRrr:$buf))]>,
+ Requires<[Is32Bit]>;
+ let isTerminator = 1 in
+ def EH_SJLJ_LONGJMP32ri : Pseudo<(outs), (ins MEMri:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(SPsjlj_longjmp ADDRri:$buf)]>,
+ Requires<[Is32Bit]>;
+ def EH_SJLJ_LONGJMP32rr : Pseudo<(outs), (ins MEMrr:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(SPsjlj_longjmp ADDRrr:$buf)]>,
+ Requires<[Is32Bit]>;
+}
+
+// Section B.1 - Load Integer Instructions, p. 90
+let DecoderMethod = "DecodeLoadInt" in {
+ defm LDSB : LoadA<"ldsb", 0b001001, 0b011001, sextloadi8, IntRegs, i32>;
+ defm LDSH : LoadA<"ldsh", 0b001010, 0b011010, sextloadi16, IntRegs, i32>;
+ defm LDUB : LoadA<"ldub", 0b000001, 0b010001, zextloadi8, IntRegs, i32>;
+ defm LDUH : LoadA<"lduh", 0b000010, 0b010010, zextloadi16, IntRegs, i32>;
+ defm LD : LoadA<"ld", 0b000000, 0b010000, load, IntRegs, i32>;
+}
+
+let DecoderMethod = "DecodeLoadIntPair" in
+ defm LDD : LoadA<"ldd", 0b000011, 0b010011, load, IntPair, v2i32, IIC_ldd>;
+
+// Section B.2 - Load Floating-point Instructions, p. 92
+let DecoderMethod = "DecodeLoadFP" in {
+ defm LDF : Load<"ld", 0b100000, load, FPRegs, f32, IIC_iu_or_fpu_instr>;
+ def LDFArr : LoadASI<"ld", 0b110000, load, FPRegs, f32, IIC_iu_or_fpu_instr>,
+ Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeLoadDFP" in {
+ defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64, IIC_ldd>;
+ def LDDFArr : LoadASI<"ldd", 0b110011, load, DFPRegs, f64>,
+ Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeLoadQFP" in
+ defm LDQF : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>,
+ Requires<[HasV9, HasHardQuad]>;
+
+let DecoderMethod = "DecodeLoadCP" in
+ defm LDC : Load<"ld", 0b110000, load, CoprocRegs, i32>;
+let DecoderMethod = "DecodeLoadCPPair" in
+ defm LDDC : Load<"ldd", 0b110011, load, CoprocPair, v2i32, IIC_ldd>;
+
+let DecoderMethod = "DecodeLoadCP", Defs = [CPSR] in {
+ let rd = 0 in {
+ def LDCSRrr : F3_1<3, 0b110001, (outs), (ins MEMrr:$addr),
+ "ld [$addr], %csr", []>;
+ def LDCSRri : F3_2<3, 0b110001, (outs), (ins MEMri:$addr),
+ "ld [$addr], %csr", []>;
+ }
+}
+
+let DecoderMethod = "DecodeLoadFP" in
+ let Defs = [FSR] in {
+ let rd = 0 in {
+ def LDFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr),
+ "ld [$addr], %fsr", [], IIC_iu_or_fpu_instr>;
+ def LDFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr),
+ "ld [$addr], %fsr", [], IIC_iu_or_fpu_instr>;
+ }
+ let rd = 1 in {
+ def LDXFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr),
+ "ldx [$addr], %fsr", []>, Requires<[HasV9]>;
+ def LDXFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr),
+ "ldx [$addr], %fsr", []>, Requires<[HasV9]>;
+ }
+ }
+
+// Section B.4 - Store Integer Instructions, p. 95
+let DecoderMethod = "DecodeStoreInt" in {
+ defm STB : StoreA<"stb", 0b000101, 0b010101, truncstorei8, IntRegs, i32>;
+ defm STH : StoreA<"sth", 0b000110, 0b010110, truncstorei16, IntRegs, i32>;
+ defm ST : StoreA<"st", 0b000100, 0b010100, store, IntRegs, i32>;
+}
+
+let DecoderMethod = "DecodeStoreIntPair" in
+ defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32, IIC_std>;
+
+// Section B.5 - Store Floating-point Instructions, p. 97
+let DecoderMethod = "DecodeStoreFP" in {
+ defm STF : Store<"st", 0b100100, store, FPRegs, f32>;
+ def STFArr : StoreASI<"st", 0b110100, store, FPRegs, f32>,
+ Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeStoreDFP" in {
+ defm STDF : Store<"std", 0b100111, store, DFPRegs, f64, IIC_std>;
+ def STDFArr : StoreASI<"std", 0b110111, store, DFPRegs, f64>,
+ Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeStoreQFP" in
+ defm STQF : StoreA<"stq", 0b100110, 0b110110, store, QFPRegs, f128>,
+ Requires<[HasV9, HasHardQuad]>;
+
+let DecoderMethod = "DecodeStoreCP" in
+ defm STC : Store<"st", 0b110100, store, CoprocRegs, i32>;
+
+let DecoderMethod = "DecodeStoreCPPair" in
+ defm STDC : Store<"std", 0b110111, store, CoprocPair, v2i32, IIC_std>;
+
+let DecoderMethod = "DecodeStoreCP", rd = 0 in {
+ let Defs = [CPSR] in {
+ def STCSRrr : F3_1<3, 0b110101, (outs MEMrr:$addr), (ins),
+ "st %csr, [$addr]", [], IIC_st>;
+ def STCSRri : F3_2<3, 0b110101, (outs MEMri:$addr), (ins),
+ "st %csr, [$addr]", [], IIC_st>;
+ }
+ let Defs = [CPQ] in {
+ def STDCQrr : F3_1<3, 0b110110, (outs MEMrr:$addr), (ins),
+ "std %cq, [$addr]", [], IIC_std>;
+ def STDCQri : F3_2<3, 0b110110, (outs MEMri:$addr), (ins),
+ "std %cq, [$addr]", [], IIC_std>;
+ }
+}
+
+let DecoderMethod = "DecodeStoreFP" in {
+ let rd = 0 in {
+ let Defs = [FSR] in {
+ def STFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins),
+ "st %fsr, [$addr]", [], IIC_st>;
+ def STFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins),
+ "st %fsr, [$addr]", [], IIC_st>;
+ }
+ let Defs = [FQ] in {
+ def STDFQrr : F3_1<3, 0b100110, (outs MEMrr:$addr), (ins),
+ "std %fq, [$addr]", [], IIC_std>;
+ def STDFQri : F3_2<3, 0b100110, (outs MEMri:$addr), (ins),
+ "std %fq, [$addr]", [], IIC_std>;
+ }
+ }
+ let rd = 1, Defs = [FSR] in {
+ def STXFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins),
+ "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
+ def STXFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins),
+ "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
+ }
+}
+
+// Section B.8 - SWAP Register with Memory Instruction
+// (Atomic swap)
+let Constraints = "$val = $dst", DecoderMethod = "DecodeSWAP" in {
+ def SWAPrr : F3_1<3, 0b001111,
+ (outs IntRegs:$dst), (ins MEMrr:$addr, IntRegs:$val),
+ "swap [$addr], $dst",
+ [(set i32:$dst, (atomic_swap_32 ADDRrr:$addr, i32:$val))]>;
+ def SWAPri : F3_2<3, 0b001111,
+ (outs IntRegs:$dst), (ins MEMri:$addr, IntRegs:$val),
+ "swap [$addr], $dst",
+ [(set i32:$dst, (atomic_swap_32 ADDRri:$addr, i32:$val))]>;
+ def SWAPArr : F3_1_asi<3, 0b011111,
+ (outs IntRegs:$dst), (ins MEMrr:$addr, i8imm:$asi, IntRegs:$val),
+ "swapa [$addr] $asi, $dst",
+ [/*FIXME: pattern?*/]>;
+}
+
+
+// Section B.9 - SETHI Instruction, p. 104
+def SETHIi: F2_1<0b100,
+ (outs IntRegs:$rd), (ins i32imm:$imm22),
+ "sethi $imm22, $rd",
+ [(set i32:$rd, SETHIimm:$imm22)],
+ IIC_iu_instr>;
+
+// Section B.10 - NOP Instruction, p. 105
+// (It's a special case of SETHI)
+let rd = 0, imm22 = 0 in
+ def NOP : F2_1<0b100, (outs), (ins), "nop", []>;
+
+// Section B.11 - Logical Instructions, p. 106
+defm AND : F3_12<"and", 0b000001, and, IntRegs, i32, simm13Op>;
+
+def ANDNrr : F3_1<2, 0b000101,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "andn $rs1, $rs2, $rd",
+ [(set i32:$rd, (and i32:$rs1, (not i32:$rs2)))]>;
+def ANDNri : F3_2<2, 0b000101,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "andn $rs1, $simm13, $rd", []>;
+
+defm OR : F3_12<"or", 0b000010, or, IntRegs, i32, simm13Op>;
+
+def ORNrr : F3_1<2, 0b000110,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "orn $rs1, $rs2, $rd",
+ [(set i32:$rd, (or i32:$rs1, (not i32:$rs2)))]>;
+def ORNri : F3_2<2, 0b000110,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "orn $rs1, $simm13, $rd", []>;
+defm XOR : F3_12<"xor", 0b000011, xor, IntRegs, i32, simm13Op>;
+
+def XNORrr : F3_1<2, 0b000111,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "xnor $rs1, $rs2, $rd",
+ [(set i32:$rd, (not (xor i32:$rs1, i32:$rs2)))]>;
+def XNORri : F3_2<2, 0b000111,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "xnor $rs1, $simm13, $rd", []>;
+
+let Defs = [ICC] in {
+ defm ANDCC : F3_12np<"andcc", 0b010001>;
+ defm ANDNCC : F3_12np<"andncc", 0b010101>;
+ defm ORCC : F3_12np<"orcc", 0b010010>;
+ defm ORNCC : F3_12np<"orncc", 0b010110>;
+ defm XORCC : F3_12np<"xorcc", 0b010011>;
+ defm XNORCC : F3_12np<"xnorcc", 0b010111>;
+}
+
+// Section B.12 - Shift Instructions, p. 107
+defm SLL : F3_12<"sll", 0b100101, shl, IntRegs, i32, simm13Op>;
+defm SRL : F3_12<"srl", 0b100110, srl, IntRegs, i32, simm13Op>;
+defm SRA : F3_12<"sra", 0b100111, sra, IntRegs, i32, simm13Op>;
+
+// Section B.13 - Add Instructions, p. 108
+defm ADD : F3_12<"add", 0b000000, add, IntRegs, i32, simm13Op>;
+
+// "LEA" forms of add (patterns to make tblgen happy)
+let Predicates = [Is32Bit], isCodeGenOnly = 1 in
+ def LEA_ADDri : F3_2<2, 0b000000,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "add ${addr:arith}, $dst",
+ [(set iPTR:$dst, ADDRri:$addr)]>;
+
+let Defs = [ICC] in
+ defm ADDCC : F3_12<"addcc", 0b010000, addc, IntRegs, i32, simm13Op>;
+
+let Uses = [ICC] in
+ defm ADDC : F3_12np<"addx", 0b001000>;
+
+let Uses = [ICC], Defs = [ICC] in
+ defm ADDE : F3_12<"addxcc", 0b011000, adde, IntRegs, i32, simm13Op>;
+
+// Section B.15 - Subtract Instructions, p. 110
+defm SUB : F3_12 <"sub" , 0b000100, sub, IntRegs, i32, simm13Op>;
+let Uses = [ICC], Defs = [ICC] in
+ defm SUBE : F3_12 <"subxcc" , 0b011100, sube, IntRegs, i32, simm13Op>;
+
+let Defs = [ICC] in
+ defm SUBCC : F3_12 <"subcc", 0b010100, subc, IntRegs, i32, simm13Op>;
+
+let Uses = [ICC] in
+ defm SUBC : F3_12np <"subx", 0b001100>;
+
+// cmp (from Section A.3) is a specialized alias for subcc
+let Defs = [ICC], rd = 0 in {
+ def CMPrr : F3_1<2, 0b010100,
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "cmp $rs1, $rs2",
+ [(SPcmpicc i32:$rs1, i32:$rs2)]>;
+ def CMPri : F3_2<2, 0b010100,
+ (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "cmp $rs1, $simm13",
+ [(SPcmpicc i32:$rs1, (i32 simm13:$simm13))]>;
+}
+
+// Section B.18 - Multiply Instructions, p. 113
+let Defs = [Y] in {
+ defm UMUL : F3_12<"umul", 0b001010, umullohi, IntRegs, i32, simm13Op, IIC_iu_umul>;
+ defm SMUL : F3_12<"smul", 0b001011, smullohi, IntRegs, i32, simm13Op, IIC_iu_smul>;
+}
+
+let Defs = [Y, ICC] in {
+ defm UMULCC : F3_12np<"umulcc", 0b011010, IIC_iu_umul>;
+ defm SMULCC : F3_12np<"smulcc", 0b011011, IIC_iu_smul>;
+}
+
+let Defs = [Y, ICC], Uses = [Y, ICC] in {
+ defm MULSCC : F3_12np<"mulscc", 0b100100>;
+}
+
+// Section B.19 - Divide Instructions, p. 115
+let Uses = [Y], Defs = [Y] in {
+ defm UDIV : F3_12np<"udiv", 0b001110, IIC_iu_div>;
+ defm SDIV : F3_12np<"sdiv", 0b001111, IIC_iu_div>;
+}
+
+let Uses = [Y], Defs = [Y, ICC] in {
+ defm UDIVCC : F3_12np<"udivcc", 0b011110, IIC_iu_div>;
+ defm SDIVCC : F3_12np<"sdivcc", 0b011111, IIC_iu_div>;
+}
+
+// Section B.20 - SAVE and RESTORE, p. 117
+defm SAVE : F3_12np<"save" , 0b111100>;
+defm RESTORE : F3_12np<"restore", 0b111101>;
+
+// Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
+
+// unconditional branch class.
+class BranchAlways<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, 0, (outs), ins, asmstr, pattern> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 1;
+ let isBarrier = 1;
+}
+
+let cond = 8 in
+ def BA : BranchAlways<(ins brtarget:$imm22), "ba $imm22", [(br bb:$imm22)]>;
+
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
+// conditional branch class:
+class BranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, 0, (outs), ins, asmstr, pattern, IIC_iu_instr>;
+
+// conditional branch with annul class:
+class BranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, 1, (outs), ins, asmstr, pattern, IIC_iu_instr>;
+
+// Conditional branch class on %icc|%xcc with predication:
+multiclass IPredBranch<string regstr, list<dag> CCPattern> {
+ def CC : F2_3<0b001, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+ !strconcat("b$cond ", !strconcat(regstr, ", $imm19")),
+ CCPattern,
+ IIC_iu_instr>;
+ def CCA : F2_3<0b001, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+ !strconcat("b$cond,a ", !strconcat(regstr, ", $imm19")),
+ [],
+ IIC_iu_instr>;
+ def CCNT : F2_3<0b001, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+ !strconcat("b$cond,pn ", !strconcat(regstr, ", $imm19")),
+ [],
+ IIC_iu_instr>;
+ def CCANT : F2_3<0b001, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+ !strconcat("b$cond,a,pn ", !strconcat(regstr, ", $imm19")),
+ [],
+ IIC_iu_instr>;
+}
+
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+
+// Indirect branch instructions.
+let isTerminator = 1, isBarrier = 1, hasDelaySlot = 1, isBranch =1,
+ isIndirectBranch = 1, rd = 0, isCodeGenOnly = 1 in {
+ def BINDrr : F3_1<2, 0b111000,
+ (outs), (ins MEMrr:$ptr),
+ "jmp $ptr",
+ [(brind ADDRrr:$ptr)]>;
+ def BINDri : F3_2<2, 0b111000,
+ (outs), (ins MEMri:$ptr),
+ "jmp $ptr",
+ [(brind ADDRri:$ptr)]>;
+}
+
+let Uses = [ICC] in {
+ def BCOND : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
+ "b$cond $imm22",
+ [(SPbricc bb:$imm22, imm:$cond)]>;
+ def BCONDA : BranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+ "b$cond,a $imm22", []>;
+
+ let Predicates = [HasV9], cc = 0b00 in
+ defm BPI : IPredBranch<"%icc", []>;
+}
+
+// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
+// floating-point conditional branch class:
+class FPBranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, 0, (outs), ins, asmstr, pattern, IIC_fpu_normal_instr>;
+
+// floating-point conditional branch with annul class:
+class FPBranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, 1, (outs), ins, asmstr, pattern, IIC_fpu_normal_instr>;
+
+// Conditional branch class on %fcc0-%fcc3 with predication:
+multiclass FPredBranch {
+ def CC : F2_3<0b101, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+ FCCRegs:$cc),
+ "fb$cond $cc, $imm19", [], IIC_fpu_normal_instr>;
+ def CCA : F2_3<0b101, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+ FCCRegs:$cc),
+ "fb$cond,a $cc, $imm19", [], IIC_fpu_normal_instr>;
+ def CCNT : F2_3<0b101, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+ FCCRegs:$cc),
+ "fb$cond,pn $cc, $imm19", [], IIC_fpu_normal_instr>;
+ def CCANT : F2_3<0b101, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+ FCCRegs:$cc),
+ "fb$cond,a,pn $cc, $imm19", [], IIC_fpu_normal_instr>;
+}
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+let Uses = [FCC0] in {
+ def FBCOND : FPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
+ "fb$cond $imm22",
+ [(SPbrfcc bb:$imm22, imm:$cond)]>;
+ def FBCONDA : FPBranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+ "fb$cond,a $imm22", []>;
+}
+
+let Predicates = [HasV9] in
+ defm BPF : FPredBranch;
+
+// Section B.23 - Branch on Co-processor Condition Codes Instructions, p. 123
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
+// co-processor conditional branch class:
+class CPBranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b111, 0, (outs), ins, asmstr, pattern>;
+
+// co-processor conditional branch with annul class:
+class CPBranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b111, 1, (outs), ins, asmstr, pattern>;
+
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+def CBCOND : CPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
+ "cb$cond $imm22",
+ [(SPbrfcc bb:$imm22, imm:$cond)]>;
+def CBCONDA : CPBranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+ "cb$cond,a $imm22", []>;
+
+// Section B.24 - Call and Link Instruction, p. 125
+// This is the only Format 1 instruction
+let Uses = [O6],
+ hasDelaySlot = 1, isCall = 1 in {
+ def CALL : InstSP<(outs), (ins calltarget:$disp, variable_ops),
+ "call $disp",
+ [],
+ IIC_jmp_or_call> {
+ bits<30> disp;
+ let op = 1;
+ let Inst{29-0} = disp;
+ }
+
+ // indirect calls: special cases of JMPL.
+ let isCodeGenOnly = 1, rd = 15 in {
+ def CALLrr : F3_1<2, 0b111000,
+ (outs), (ins MEMrr:$ptr, variable_ops),
+ "call $ptr",
+ [(call ADDRrr:$ptr)],
+ IIC_jmp_or_call>;
+ def CALLri : F3_2<2, 0b111000,
+ (outs), (ins MEMri:$ptr, variable_ops),
+ "call $ptr",
+ [(call ADDRri:$ptr)],
+ IIC_jmp_or_call>;
+ }
+}
+
+// Section B.25 - Jump and Link Instruction
+
+// JMPL Instruction.
+let isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+ DecoderMethod = "DecodeJMPL" in {
+ def JMPLrr: F3_1<2, 0b111000,
+ (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "jmpl $addr, $dst",
+ [],
+ IIC_jmp_or_call>;
+ def JMPLri: F3_2<2, 0b111000,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "jmpl $addr, $dst",
+ [],
+ IIC_jmp_or_call>;
+}
+
+// Section A.3 - Synthetic Instructions, p. 85
+// special cases of JMPL:
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+ isCodeGenOnly = 1 in {
+ let rd = 0, rs1 = 15 in
+ def RETL: F3_2<2, 0b111000,
+ (outs), (ins i32imm:$val),
+ "jmp %o7+$val",
+ [(retflag simm13:$val)],
+ IIC_jmp_or_call>;
+
+ let rd = 0, rs1 = 31 in
+ def RET: F3_2<2, 0b111000,
+ (outs), (ins i32imm:$val),
+ "jmp %i7+$val",
+ [],
+ IIC_jmp_or_call>;
+}
+
+// Section B.26 - Return from Trap Instruction
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1,
+ isBarrier = 1, rd = 0, DecoderMethod = "DecodeReturn" in {
+ def RETTrr : F3_1<2, 0b111001,
+ (outs), (ins MEMrr:$addr),
+ "rett $addr",
+ [],
+ IIC_jmp_or_call>;
+ def RETTri : F3_2<2, 0b111001,
+ (outs), (ins MEMri:$addr),
+ "rett $addr",
+ [],
+ IIC_jmp_or_call>;
+}
+
+
+// Section B.27 - Trap on Integer Condition Codes Instruction
+// conditional trap instructions:
+let DecoderNamespace = "SparcV8", DecoderMethod = "DecodeTRAP", hasSideEffects = 1, Uses = [ICC], cc = 0b00 in
+{
+ def TRAPrr : TRAPSPrr<0b111010,
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2, CCOp:$cond),
+ "t$cond $rs1 + $rs2",
+ []>;
+ def TRAPri : TRAPSPri<0b111010,
+ (outs), (ins IntRegs:$rs1, i32imm:$imm, CCOp:$cond),
+ "t$cond $rs1 + $imm",
+ []>;
+}
+
+multiclass TRAP<string regStr> {
+ def rr : TRAPSPrr<0b111010,
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2, CCOp:$cond),
+ !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $rs2"),
+ []>;
+ def ri : TRAPSPri<0b111010,
+ (outs), (ins IntRegs:$rs1, i32imm:$imm, CCOp:$cond),
+ !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $imm"),
+ []>;
+}
+
+let DecoderNamespace = "SparcV9", DecoderMethod = "DecodeTRAP", Predicates = [HasV9], hasSideEffects = 1, Uses = [ICC], cc = 0b00 in
+ defm TICC : TRAP<"%icc">;
+
+
+let isBarrier = 1, isTerminator = 1, rd = 0b01000, rs1 = 0, simm13 = 5 in
+ def TA5 : F3_2<0b10, 0b111010, (outs), (ins), "ta 5", [(trap)]>;
+
+// Section B.28 - Read State Register Instructions
+let rs2 = 0 in
+ def RDASR : F3_1<2, 0b101000,
+ (outs IntRegs:$rd), (ins ASRRegs:$rs1),
+ "rd $rs1, $rd", []>;
+
+// PSR, WIM, and TBR don't exist on the SparcV9, only the V8.
+let Predicates = [HasNoV9] in {
+ let rs2 = 0, rs1 = 0, Uses=[PSR] in
+ def RDPSR : F3_1<2, 0b101001,
+ (outs IntRegs:$rd), (ins),
+ "rd %psr, $rd", []>;
+
+ let rs2 = 0, rs1 = 0, Uses=[WIM] in
+ def RDWIM : F3_1<2, 0b101010,
+ (outs IntRegs:$rd), (ins),
+ "rd %wim, $rd", []>;
+
+ let rs2 = 0, rs1 = 0, Uses=[TBR] in
+ def RDTBR : F3_1<2, 0b101011,
+ (outs IntRegs:$rd), (ins),
+ "rd %tbr, $rd", []>;
+}
+
+// Section B.29 - Write State Register Instructions
+def WRASRrr : F3_1<2, 0b110000,
+ (outs ASRRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "wr $rs1, $rs2, $rd", []>;
+def WRASRri : F3_2<2, 0b110000,
+ (outs ASRRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "wr $rs1, $simm13, $rd", []>;
+
+// PSR, WIM, and TBR don't exist on the SparcV9, only the V8.
+let Predicates = [HasNoV9] in {
+ let Defs = [PSR], rd=0 in {
+ def WRPSRrr : F3_1<2, 0b110001,
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "wr $rs1, $rs2, %psr", []>;
+ def WRPSRri : F3_2<2, 0b110001,
+ (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "wr $rs1, $simm13, %psr", []>;
+ }
+
+ let Defs = [WIM], rd=0 in {
+ def WRWIMrr : F3_1<2, 0b110010,
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "wr $rs1, $rs2, %wim", []>;
+ def WRWIMri : F3_2<2, 0b110010,
+ (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "wr $rs1, $simm13, %wim", []>;
+ }
+
+ let Defs = [TBR], rd=0 in {
+ def WRTBRrr : F3_1<2, 0b110011,
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "wr $rs1, $rs2, %tbr", []>;
+ def WRTBRri : F3_2<2, 0b110011,
+ (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "wr $rs1, $simm13, %tbr", []>;
+ }
+}
+
+// Section B.30 - STBAR Instruction
+let hasSideEffects = 1, rd = 0, rs1 = 0b01111, rs2 = 0 in
+ def STBAR : F3_1<2, 0b101000, (outs), (ins), "stbar", []>;
+
+
+// Section B.31 - Unimplemented Instruction
+let rd = 0 in
+ def UNIMP : F2_1<0b000, (outs), (ins i32imm:$imm22),
+ "unimp $imm22", []>;
+
+// Section B.32 - Flush Instruction Memory
+let rd = 0 in {
+ def FLUSHrr : F3_1<2, 0b111011, (outs), (ins MEMrr:$addr),
+ "flush $addr", []>;
+ def FLUSHri : F3_2<2, 0b111011, (outs), (ins MEMri:$addr),
+ "flush $addr", []>;
+
+ // The no-arg FLUSH is only here for the benefit of the InstAlias
+ // "flush", which cannot seem to use FLUSHrr, due to the inability
+ // to construct a MEMrr with fixed G0 registers.
+ let rs1 = 0, rs2 = 0 in
+ def FLUSH : F3_1<2, 0b111011, (outs), (ins), "flush %g0", []>;
+}
+
+// Section B.33 - Floating-point Operate (FPop) Instructions
+
+// Convert Integer to Floating-point Instructions, p. 141
+def FITOS : F3_3u<2, 0b110100, 0b011000100,
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fitos $rs2, $rd",
+ [(set FPRegs:$rd, (SPitof FPRegs:$rs2))],
+ IIC_fpu_fast_instr>;
+def FITOD : F3_3u<2, 0b110100, 0b011001000,
+ (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+ "fitod $rs2, $rd",
+ [(set DFPRegs:$rd, (SPitof FPRegs:$rs2))],
+ IIC_fpu_fast_instr>;
+def FITOQ : F3_3u<2, 0b110100, 0b011001100,
+ (outs QFPRegs:$rd), (ins FPRegs:$rs2),
+ "fitoq $rs2, $rd",
+ [(set QFPRegs:$rd, (SPitof FPRegs:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+// Convert Floating-point to Integer Instructions, p. 142
+def FSTOI : F3_3u<2, 0b110100, 0b011010001,
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fstoi $rs2, $rd",
+ [(set FPRegs:$rd, (SPftoi FPRegs:$rs2))],
+ IIC_fpu_fast_instr>;
+def FDTOI : F3_3u<2, 0b110100, 0b011010010,
+ (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+ "fdtoi $rs2, $rd",
+ [(set FPRegs:$rd, (SPftoi DFPRegs:$rs2))],
+ IIC_fpu_fast_instr>;
+def FQTOI : F3_3u<2, 0b110100, 0b011010011,
+ (outs FPRegs:$rd), (ins QFPRegs:$rs2),
+ "fqtoi $rs2, $rd",
+ [(set FPRegs:$rd, (SPftoi QFPRegs:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+// Convert between Floating-point Formats Instructions, p. 143
+def FSTOD : F3_3u<2, 0b110100, 0b011001001,
+ (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+ "fstod $rs2, $rd",
+ [(set f64:$rd, (fpextend f32:$rs2))],
+ IIC_fpu_stod>;
+def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
+ (outs QFPRegs:$rd), (ins FPRegs:$rs2),
+ "fstoq $rs2, $rd",
+ [(set f128:$rd, (fpextend f32:$rs2))]>,
+ Requires<[HasHardQuad]>;
+def FDTOS : F3_3u<2, 0b110100, 0b011000110,
+ (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+ "fdtos $rs2, $rd",
+ [(set f32:$rd, (fpround f64:$rs2))],
+ IIC_fpu_fast_instr>;
+def FDTOQ : F3_3u<2, 0b110100, 0b011001110,
+ (outs QFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fdtoq $rs2, $rd",
+ [(set f128:$rd, (fpextend f64:$rs2))]>,
+ Requires<[HasHardQuad]>;
+def FQTOS : F3_3u<2, 0b110100, 0b011000111,
+ (outs FPRegs:$rd), (ins QFPRegs:$rs2),
+ "fqtos $rs2, $rd",
+ [(set f32:$rd, (fpround f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+def FQTOD : F3_3u<2, 0b110100, 0b011001011,
+ (outs DFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fqtod $rs2, $rd",
+ [(set f64:$rd, (fpround f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+// Floating-point Move Instructions, p. 144
+def FMOVS : F3_3u<2, 0b110100, 0b000000001,
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fmovs $rs2, $rd", []>;
+def FNEGS : F3_3u<2, 0b110100, 0b000000101,
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fnegs $rs2, $rd",
+ [(set f32:$rd, (fneg f32:$rs2))],
+ IIC_fpu_negs>;
+def FABSS : F3_3u<2, 0b110100, 0b000001001,
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fabss $rs2, $rd",
+ [(set f32:$rd, (fabs f32:$rs2))],
+ IIC_fpu_abs>;
+
+
+// Floating-point Square Root Instructions, p.145
+// FSQRTS generates an erratum on LEON processors, so by disabling this instruction
+// this will be promoted to use FSQRTD with doubles instead.
+let Predicates = [HasNoFdivSqrtFix] in
+def FSQRTS : F3_3u<2, 0b110100, 0b000101001,
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fsqrts $rs2, $rd",
+ [(set f32:$rd, (fsqrt f32:$rs2))],
+ IIC_fpu_sqrts>;
+def FSQRTD : F3_3u<2, 0b110100, 0b000101010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fsqrtd $rs2, $rd",
+ [(set f64:$rd, (fsqrt f64:$rs2))],
+ IIC_fpu_sqrtd>;
+def FSQRTQ : F3_3u<2, 0b110100, 0b000101011,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fsqrtq $rs2, $rd",
+ [(set f128:$rd, (fsqrt f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+
+
+// Floating-point Add and Subtract Instructions, p. 146
+def FADDS : F3_3<2, 0b110100, 0b001000001,
+ (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fadds $rs1, $rs2, $rd",
+ [(set f32:$rd, (fadd f32:$rs1, f32:$rs2))],
+ IIC_fpu_fast_instr>;
+def FADDD : F3_3<2, 0b110100, 0b001000010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "faddd $rs1, $rs2, $rd",
+ [(set f64:$rd, (fadd f64:$rs1, f64:$rs2))],
+ IIC_fpu_fast_instr>;
+def FADDQ : F3_3<2, 0b110100, 0b001000011,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "faddq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fadd f128:$rs1, f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+def FSUBS : F3_3<2, 0b110100, 0b001000101,
+ (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fsubs $rs1, $rs2, $rd",
+ [(set f32:$rd, (fsub f32:$rs1, f32:$rs2))],
+ IIC_fpu_fast_instr>;
+def FSUBD : F3_3<2, 0b110100, 0b001000110,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fsubd $rs1, $rs2, $rd",
+ [(set f64:$rd, (fsub f64:$rs1, f64:$rs2))],
+ IIC_fpu_fast_instr>;
+def FSUBQ : F3_3<2, 0b110100, 0b001000111,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fsubq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fsub f128:$rs1, f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+
+// Floating-point Multiply and Divide Instructions, p. 147
+// FMULS generates an erratum on LEON processors, so by disabling this instruction
+// this will be promoted to use FMULD with doubles instead.
+let Predicates = [HasNoFmulsFix] in
+def FMULS : F3_3<2, 0b110100, 0b001001001,
+ (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fmuls $rs1, $rs2, $rd",
+ [(set f32:$rd, (fmul f32:$rs1, f32:$rs2))],
+ IIC_fpu_muls>;
+def FMULD : F3_3<2, 0b110100, 0b001001010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fmuld $rs1, $rs2, $rd",
+ [(set f64:$rd, (fmul f64:$rs1, f64:$rs2))],
+ IIC_fpu_muld>;
+def FMULQ : F3_3<2, 0b110100, 0b001001011,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fmulq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fmul f128:$rs1, f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+let Predicates = [HasNoFsmuldFix] in
+def FSMULD : F3_3<2, 0b110100, 0b001101001,
+ (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fsmuld $rs1, $rs2, $rd",
+ [(set f64:$rd, (fmul (fpextend f32:$rs1),
+ (fpextend f32:$rs2)))],
+ IIC_fpu_muld>;
+def FDMULQ : F3_3<2, 0b110100, 0b001101110,
+ (outs QFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fdmulq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fmul (fpextend f64:$rs1),
+ (fpextend f64:$rs2)))]>,
+ Requires<[HasHardQuad]>;
+
+// FDIVS generates an erratum on LEON processors and is meant to be promoted to
+// FDIVD with doubles. NOTE(review): unlike FSQRTS, no Predicates guard here.
+def FDIVS : F3_3<2, 0b110100, 0b001001101,
+ (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fdivs $rs1, $rs2, $rd",
+ [(set f32:$rd, (fdiv f32:$rs1, f32:$rs2))],
+ IIC_fpu_divs>;
+def FDIVD : F3_3<2, 0b110100, 0b001001110,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fdivd $rs1, $rs2, $rd",
+ [(set f64:$rd, (fdiv f64:$rs1, f64:$rs2))],
+ IIC_fpu_divd>;
+def FDIVQ : F3_3<2, 0b110100, 0b001001111,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fdivq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fdiv f128:$rs1, f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+
+// Floating-point Compare Instructions, p. 148
+// Note: the 2nd template arg is different for these guys.
+// Note 2: the result of a FCMP is not available until the 2nd cycle
+// after the instr is retired, but there is no interlock in Sparc V8.
+// This behavior is modeled with a forced noop after the instruction in
+// DelaySlotFiller.
+
+let Defs = [FCC0], rd = 0, isCodeGenOnly = 1 in {
+ def FCMPS : F3_3c<2, 0b110101, 0b001010001,
+ (outs), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fcmps $rs1, $rs2",
+ [(SPcmpfcc f32:$rs1, f32:$rs2)],
+ IIC_fpu_fast_instr>;
+ def FCMPD : F3_3c<2, 0b110101, 0b001010010,
+ (outs), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fcmpd $rs1, $rs2",
+ [(SPcmpfcc f64:$rs1, f64:$rs2)],
+ IIC_fpu_fast_instr>;
+ def FCMPQ : F3_3c<2, 0b110101, 0b001010011,
+ (outs), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fcmpq $rs1, $rs2",
+ [(SPcmpfcc f128:$rs1, f128:$rs2)]>,
+ Requires<[HasHardQuad]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions for Thread Local Storage (TLS).
+//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1, isAsmParserOnly = 1 in {
+def TLS_ADDrr : F3_1<2, 0b000000,
+ (outs IntRegs:$rd),
+ (ins IntRegs:$rs1, IntRegs:$rs2, TLSSym:$sym),
+ "add $rs1, $rs2, $rd, $sym",
+ [(set i32:$rd,
+ (tlsadd i32:$rs1, i32:$rs2, tglobaltlsaddr:$sym))]>;
+
+let mayLoad = 1 in
+ def TLS_LDrr : F3_1<3, 0b000000,
+ (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+ "ld [$addr], $dst, $sym",
+ [(set i32:$dst,
+ (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
+
+let Uses = [O6], isCall = 1, hasDelaySlot = 1 in
+ def TLS_CALL : InstSP<(outs),
+ (ins calltarget:$disp, TLSSym:$sym, variable_ops),
+ "call $disp, $sym",
+ [(tlscall texternalsym:$disp, tglobaltlsaddr:$sym)],
+ IIC_jmp_or_call> {
+ bits<30> disp;
+ let op = 1;
+ let Inst{29-0} = disp;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// V9 Instructions
+//===----------------------------------------------------------------------===//
+
+// V9 Conditional Moves.
+let Predicates = [HasV9], Constraints = "$f = $rd" in {
+ // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
+ let Uses = [ICC], intcc = 1, cc = 0b00 in {
+ def MOVICCrr
+ : F4_1<0b101100, (outs IntRegs:$rd),
+ (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+ "mov$cond %icc, $rs2, $rd",
+ [(set i32:$rd, (SPselecticc i32:$rs2, i32:$f, imm:$cond))]>;
+
+ def MOVICCri
+ : F4_2<0b101100, (outs IntRegs:$rd),
+ (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+ "mov$cond %icc, $simm11, $rd",
+ [(set i32:$rd,
+ (SPselecticc simm11:$simm11, i32:$f, imm:$cond))]>;
+ }
+
+ let Uses = [FCC0], intcc = 0, cc = 0b00 in {
+ def MOVFCCrr
+ : F4_1<0b101100, (outs IntRegs:$rd),
+ (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+ "mov$cond %fcc0, $rs2, $rd",
+ [(set i32:$rd, (SPselectfcc i32:$rs2, i32:$f, imm:$cond))]>;
+ def MOVFCCri
+ : F4_2<0b101100, (outs IntRegs:$rd),
+ (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+ "mov$cond %fcc0, $simm11, $rd",
+ [(set i32:$rd,
+ (SPselectfcc simm11:$simm11, i32:$f, imm:$cond))]>;
+ }
+
+ let Uses = [ICC], intcc = 1, opf_cc = 0b00 in {
+ def FMOVS_ICC
+ : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+ (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+ "fmovs$cond %icc, $rs2, $rd",
+ [(set f32:$rd, (SPselecticc f32:$rs2, f32:$f, imm:$cond))]>;
+ def FMOVD_ICC
+ : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+ (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+ "fmovd$cond %icc, $rs2, $rd",
+ [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cond))]>;
+ def FMOVQ_ICC
+ : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+ (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+ "fmovq$cond %icc, $rs2, $rd",
+ [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>,
+ Requires<[HasHardQuad]>;
+ }
+
+ let Uses = [FCC0], intcc = 0, opf_cc = 0b00 in {
+ def FMOVS_FCC
+ : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+ (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+ "fmovs$cond %fcc0, $rs2, $rd",
+ [(set f32:$rd, (SPselectfcc f32:$rs2, f32:$f, imm:$cond))]>;
+ def FMOVD_FCC
+ : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+ (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+ "fmovd$cond %fcc0, $rs2, $rd",
+ [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cond))]>;
+ def FMOVQ_FCC
+ : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+ (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+ "fmovq$cond %fcc0, $rs2, $rd",
+ [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>,
+ Requires<[HasHardQuad]>;
+ }
+
+}
+
+// Floating-Point Move Instructions, p. 164 of the V9 manual.
+let Predicates = [HasV9] in {
+ def FMOVD : F3_3u<2, 0b110100, 0b000000010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fmovd $rs2, $rd", []>;
+ def FMOVQ : F3_3u<2, 0b110100, 0b000000011,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fmovq $rs2, $rd", []>,
+ Requires<[HasHardQuad]>;
+ def FNEGD : F3_3u<2, 0b110100, 0b000000110,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fnegd $rs2, $rd",
+ [(set f64:$rd, (fneg f64:$rs2))]>;
+ def FNEGQ : F3_3u<2, 0b110100, 0b000000111,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fnegq $rs2, $rd",
+ [(set f128:$rd, (fneg f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+ def FABSD : F3_3u<2, 0b110100, 0b000001010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fabsd $rs2, $rd",
+ [(set f64:$rd, (fabs f64:$rs2))]>;
+ def FABSQ : F3_3u<2, 0b110100, 0b000001011,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fabsq $rs2, $rd",
+ [(set f128:$rd, (fabs f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+}
+
+// Floating-point compare instruction with %fcc0-%fcc3.
+def V9FCMPS : F3_3c<2, 0b110101, 0b001010001,
+ (outs FCCRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fcmps $rd, $rs1, $rs2", []>;
+def V9FCMPD : F3_3c<2, 0b110101, 0b001010010,
+ (outs FCCRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fcmpd $rd, $rs1, $rs2", []>;
+def V9FCMPQ : F3_3c<2, 0b110101, 0b001010011,
+ (outs FCCRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fcmpq $rd, $rs1, $rs2", []>,
+ Requires<[HasHardQuad]>;
+
+let hasSideEffects = 1 in {
+ def V9FCMPES : F3_3c<2, 0b110101, 0b001010101,
+ (outs FCCRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fcmpes $rd, $rs1, $rs2", []>;
+ def V9FCMPED : F3_3c<2, 0b110101, 0b001010110,
+ (outs FCCRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fcmped $rd, $rs1, $rs2", []>;
+ def V9FCMPEQ : F3_3c<2, 0b110101, 0b001010111,
+ (outs FCCRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fcmpeq $rd, $rs1, $rs2", []>,
+ Requires<[HasHardQuad]>;
+}
+
+// Floating point conditional move instructions with %fcc0-%fcc3.
+let Predicates = [HasV9] in {
+ let Constraints = "$f = $rd", intcc = 0 in {
+ def V9MOVFCCrr
+ : F4_1<0b101100, (outs IntRegs:$rd),
+ (ins FCCRegs:$cc, IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+ "mov$cond $cc, $rs2, $rd", []>;
+ def V9MOVFCCri
+ : F4_2<0b101100, (outs IntRegs:$rd),
+ (ins FCCRegs:$cc, i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+ "mov$cond $cc, $simm11, $rd", []>;
+ def V9FMOVS_FCC
+ : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+ (ins FCCRegs:$opf_cc, FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+ "fmovs$cond $opf_cc, $rs2, $rd", []>;
+ def V9FMOVD_FCC
+ : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+ (ins FCCRegs:$opf_cc, DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+ "fmovd$cond $opf_cc, $rs2, $rd", []>;
+ def V9FMOVQ_FCC
+ : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+ (ins FCCRegs:$opf_cc, QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+ "fmovq$cond $opf_cc, $rs2, $rd", []>,
+ Requires<[HasHardQuad]>;
+ } // Constraints = "$f = $rd", ...
+} // let Predicates = [HasV9]
+
+
+// POPCrr - This does a ctpop of a 64-bit register. As such, we have to clear
+// the top 32-bits before using it. To do this clearing, we use a SRLri X,0.
+let rs1 = 0 in
+ def POPCrr : F3_1<2, 0b101110,
+ (outs IntRegs:$rd), (ins IntRegs:$rs2),
+ "popc $rs2, $rd", []>, Requires<[HasV9]>;
+def : Pat<(ctpop i32:$src),
+ (POPCrr (SRLri $src, 0))>;
+
+let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in
+ def MEMBARi : F3_2<2, 0b101000, (outs), (ins simm13Op:$simm13),
+ "membar $simm13", []>;
+
+// The CAS instruction, unlike other instructions, only comes in a
+// form which requires an ASI be provided. The ASI value hardcoded
+// here is ASI_PRIMARY, the default unprivileged ASI for SparcV9.
+let Predicates = [HasV9], Constraints = "$swap = $rd", asi = 0b10000000 in
+ def CASrr: F3_1_asi<3, 0b111100,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+ IntRegs:$swap),
+ "cas [$rs1], $rs2, $rd",
+ [(set i32:$rd,
+ (atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap))]>;
+
+
+// CASA is supported as an instruction on some LEON3 and all LEON4 processors.
+// This version can be automatically lowered from C code, selecting ASI 10
+let Predicates = [HasLeonCASA], Constraints = "$swap = $rd", asi = 0b00001010 in
+ def CASAasi10: F3_1_asi<3, 0b111100,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+ IntRegs:$swap),
+ "casa [$rs1] 10, $rs2, $rd",
+ [(set i32:$rd,
+ (atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap))]>;
+
+// CASA supported on some LEON3 and all LEON4 processors. Same pattern as
+// CASrr, above, but with a different ASI. This version is supported for
+// inline assembly lowering only.
+let Predicates = [HasLeonCASA], Constraints = "$swap = $rd" in
+ def CASArr: F3_1_asi<3, 0b111100,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+ IntRegs:$swap, i8imm:$asi),
+ "casa [$rs1] $asi, $rs2, $rd", []>;
+
+// TODO: Add DAG sequence to lower these instructions. Currently, only provided
+// as inline assembler-supported instructions.
+let Predicates = [HasUMAC_SMAC], Defs = [Y, ASR18], Uses = [Y, ASR18] in {
+ def SMACrr : F3_1<2, 0b111111,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+ "smac $rs1, $rs2, $rd",
+ [], IIC_smac_umac>;
+
+ def SMACri : F3_2<2, 0b111111,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+ "smac $rs1, $simm13, $rd",
+ [], IIC_smac_umac>;
+
+ def UMACrr : F3_1<2, 0b111110,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+ "umac $rs1, $rs2, $rd",
+ [], IIC_smac_umac>;
+
+ def UMACri : F3_2<2, 0b111110,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+ "umac $rs1, $simm13, $rd",
+ [], IIC_smac_umac>;
+}
+
+let Defs = [ICC] in {
+defm TADDCC : F3_12np<"taddcc", 0b100000>;
+defm TSUBCC : F3_12np<"tsubcc", 0b100001>;
+
+let hasSideEffects = 1 in {
+ defm TADDCCTV : F3_12np<"taddcctv", 0b100010>;
+ defm TSUBCCTV : F3_12np<"tsubcctv", 0b100011>;
+}
+}
+
+
+// Section A.43 - Read Privileged Register Instructions
+let Predicates = [HasV9] in {
+let rs2 = 0 in
+ def RDPR : F3_1<2, 0b101010,
+ (outs IntRegs:$rd), (ins PRRegs:$rs1),
+ "rdpr $rs1, $rd", []>;
+}
+
+// Section A.62 - Write Privileged Register Instructions
+let Predicates = [HasV9] in {
+ def WRPRrr : F3_1<2, 0b110010,
+ (outs PRRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "wrpr $rs1, $rs2, $rd", []>;
+ def WRPRri : F3_2<2, 0b110010,
+ (outs PRRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "wrpr $rs1, $simm13, $rd", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Small immediates.
+def : Pat<(i32 simm13:$val),
+ (ORri (i32 G0), imm:$val)>;
+// Arbitrary immediates.
+def : Pat<(i32 imm:$val),
+ (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
+
+
+// Global addresses, constant pool entries
+let Predicates = [Is32Bit] in {
+
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORri (i32 G0), tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORri (i32 G0), tconstpool:$in)>;
+
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORri (i32 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+ (ADDri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+ (XORri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
+// Blockaddress
+def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
+def : Pat<(SPlo tblockaddress:$in), (ORri (i32 G0), tblockaddress:$in)>;
+
+// Add reg, lo. This is used when taking the addr of a global/constpool entry.
+def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDri $r, tglobaladdr:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)), (ADDri $r, tconstpool:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
+ (ADDri $r, tblockaddress:$in)>;
+}
+
+// Calls:
+def : Pat<(call tglobaladdr:$dst),
+ (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+ (CALL texternalsym:$dst)>;
+
+// Map integer extload's to zextloads.
+def : Pat<(i32 (extloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+
+// zextload bool -> zextload byte
+def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>;
+def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
+
+// store bar for all atomic_fence in V8.
+let Predicates = [HasNoV9] in
+ def : Pat<(atomic_fence imm, imm), (STBAR)>;
+
+// atomic_load addr -> load addr
+def : Pat<(i32 (atomic_load_8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load_8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (atomic_load_16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load_16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+def : Pat<(i32 (atomic_load_32 ADDRrr:$src)), (LDrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load_32 ADDRri:$src)), (LDri ADDRri:$src)>;
+
+// atomic_store val, addr -> store val, addr
+def : Pat<(atomic_store_8 ADDRrr:$dst, i32:$val), (STBrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_8 ADDRri:$dst, i32:$val), (STBri ADDRri:$dst, $val)>;
+def : Pat<(atomic_store_16 ADDRrr:$dst, i32:$val), (STHrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_16 ADDRri:$dst, i32:$val), (STHri ADDRri:$dst, $val)>;
+def : Pat<(atomic_store_32 ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_32 ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>;
+
+// extract_vector
+def : Pat<(extractelt (v2i32 IntPair:$Rn), 0),
+ (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_even))>;
+def : Pat<(extractelt (v2i32 IntPair:$Rn), 1),
+ (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_odd))>;
+
+// build_vector
+def : Pat<(build_vector (i32 IntRegs:$a1), (i32 IntRegs:$a2)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 IntRegs:$a1), sub_even),
+ (i32 IntRegs:$a2), sub_odd)>;
+
+
+include "SparcInstr64Bit.td"
+include "SparcInstrVIS.td"
+include "SparcInstrAliases.td"
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td b/contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td
new file mode 100644
index 000000000000..d9adf3e8b0f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td
@@ -0,0 +1,263 @@
+//===---- SparcInstrVIS.td - Visual Instruction Set extensions (VIS) -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction formats, definitions and patterns needed for
+// VIS, VIS II, VIS III instructions on SPARC.
+//===----------------------------------------------------------------------===//
+
+// VIS Instruction Format: F3_3 with its first two arguments fixed to 0b10 and 0b110110.
+class VISInstFormat<bits<9> opfval, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : F3_3<0b10, 0b110110, opfval, outs, ins, asmstr, pattern>;
+// Three-operand form: $rs1 and $rs2 in, $rd out, all in RC (DFPRegs by default).
+class VISInst<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+ : VISInstFormat<opfval,
+ (outs RC:$rd), (ins RC:$rs1, RC:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+// VIS Instruction with integer destination register (DFPRegs sources, I64Regs $rd).
+class VISInstID<bits<9> opfval, string OpcStr>
+ : VISInstFormat<opfval,
+ (outs I64Regs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+// For VIS Instructions with no operand (rd, rs1 and rs2 are all set to 0).
+let rd = 0, rs1 = 0, rs2 = 0 in
+class VISInst0<bits<9> opfval, string asmstr>
+ : VISInstFormat<opfval, (outs), (ins), asmstr, []>;
+
+// For VIS Instructions with only rs1, rd operands (rs2 is set to 0).
+let rs2 = 0 in
+class VISInst1<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+ : VISInstFormat<opfval,
+ (outs RC:$rd), (ins RC:$rs1),
+ !strconcat(OpcStr, " $rs1, $rd"), []>;
+
+// For VIS Instructions with only rs2, rd operands (rs1 is set to 0).
+let rs1 = 0 in
+class VISInst2<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+ : VISInstFormat<opfval,
+ (outs RC:$rd), (ins RC:$rs2),
+ !strconcat(OpcStr, " $rs2, $rd"), []>;
+
+// For VIS Instructions with only rd operand: input $f is tied to $rd; rs1 = rs2 = 0.
+let Constraints = "$rd = $f", rs1 = 0, rs2 = 0 in
+class VISInstD<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+ : VISInstFormat<opfval,
+ (outs RC:$rd), (ins RC:$f),
+ !strconcat(OpcStr, " $rd"), []>;
+
+// VIS 1 Instructions
+let Predicates = [HasVIS] in {
+
+def FPADD16 : VISInst<0b001010000, "fpadd16">; // Three-operand VISInst on DFPRegs.
+def FPADD16S : VISInst<0b001010001, "fpadd16s">;
+def FPADD32 : VISInst<0b001010010, "fpadd32">;
+def FPADD32S : VISInst<0b001010011, "fpadd32s">;
+def FPSUB16 : VISInst<0b001010100, "fpsub16">;
+def FPSUB16S : VISInst<0b001010101, "fpsub16s">; // Lower-case suffix, like fpadd16s.
+def FPSUB32 : VISInst<0b001010110, "fpsub32">;
+def FPSUB32S : VISInst<0b001010111, "fpsub32s">; // Lower-case suffix, like fpadd32s.
+
+def FPACK16 : VISInst2<0b000111011, "fpack16">;
+def FPACK32 : VISInst <0b000111010, "fpack32">;
+def FPACKFIX : VISInst2<0b000111101, "fpackfix">;
+def FEXPAND : VISInst2<0b001001101, "fexpand">;
+def FPMERGE : VISInst <0b001001011, "fpmerge">;
+
+def FMUL8X16 : VISInst<0b000110001, "fmul8x16">;
+def FMUL8X16AU : VISInst<0b000110011, "fmul8x16au">;
+def FMUL8X16AL : VISInst<0b000110101, "fmul8x16al">;
+def FMUL8SUX16 : VISInst<0b000110110, "fmul8sux16">;
+def FMUL8ULX16 : VISInst<0b000110111, "fmul8ulx16">;
+def FMULD8SUX16 : VISInst<0b000111000, "fmuld8sux16">;
+def FMULD8ULX16 : VISInst<0b000111001, "fmuld8ulx16">;
+
+def ALIGNADDR : VISInst<0b000011000, "alignaddr", I64Regs>;
+def ALIGNADDRL : VISInst<0b000011010, "alignaddrl", I64Regs>;
+def FALIGNADATA : VISInst<0b001001000, "faligndata">; // NOTE(review): def name has an extra 'A' vs. the mnemonic.
+// In the groups below, the *S variants use FPRegs; the others use the default DFPRegs.
+def FZERO : VISInstD<0b001100000, "fzero">;
+def FZEROS : VISInstD<0b001100001, "fzeros", FPRegs>;
+def FONE : VISInstD<0b001111110, "fone">;
+def FONES : VISInstD<0b001111111, "fones", FPRegs>;
+def FSRC1 : VISInst1<0b001110100, "fsrc1">;
+def FSRC1S : VISInst1<0b001110101, "fsrc1s", FPRegs>;
+def FSRC2 : VISInst2<0b001111000, "fsrc2">;
+def FSRC2S : VISInst2<0b001111001, "fsrc2s", FPRegs>;
+def FNOT1 : VISInst1<0b001101010, "fnot1">;
+def FNOT1S : VISInst1<0b001101011, "fnot1s", FPRegs>;
+def FNOT2 : VISInst2<0b001100110, "fnot2">;
+def FNOT2S : VISInst2<0b001100111, "fnot2s", FPRegs>;
+def FOR : VISInst<0b001111100, "for">;
+def FORS : VISInst<0b001111101, "fors", FPRegs>;
+def FNOR : VISInst<0b001100010, "fnor">;
+def FNORS : VISInst<0b001100011, "fnors", FPRegs>;
+def FAND : VISInst<0b001110000, "fand">;
+def FANDS : VISInst<0b001110001, "fands", FPRegs>;
+def FNAND : VISInst<0b001101110, "fnand">;
+def FNANDS : VISInst<0b001101111, "fnands", FPRegs>;
+def FXOR : VISInst<0b001101100, "fxor">;
+def FXORS : VISInst<0b001101101, "fxors", FPRegs>;
+def FXNOR : VISInst<0b001110010, "fxnor">;
+def FXNORS : VISInst<0b001110011, "fxnors", FPRegs>;
+
+def FORNOT1 : VISInst<0b001111010, "fornot1">;
+def FORNOT1S : VISInst<0b001111011, "fornot1s", FPRegs>;
+def FORNOT2 : VISInst<0b001110110, "fornot2">;
+def FORNOT2S : VISInst<0b001110111, "fornot2s", FPRegs>;
+def FANDNOT1 : VISInst<0b001101000, "fandnot1">;
+def FANDNOT1S : VISInst<0b001101001, "fandnot1s", FPRegs>;
+def FANDNOT2 : VISInst<0b001100100, "fandnot2">;
+def FANDNOT2S : VISInst<0b001100101, "fandnot2s", FPRegs>;
+// FCMP*: VISInstID gives DFPRegs sources and an I64Regs destination.
+def FCMPGT16 : VISInstID<0b000101000, "fcmpgt16">;
+def FCMPGT32 : VISInstID<0b000101100, "fcmpgt32">;
+def FCMPLE16 : VISInstID<0b000100000, "fcmple16">;
+def FCMPLE32 : VISInstID<0b000100100, "fcmple32">;
+def FCMPNE16 : VISInstID<0b000100010, "fcmpne16">;
+def FCMPNE32 : VISInstID<0b000100110, "fcmpne32">;
+def FCMPEQ16 : VISInstID<0b000101010, "fcmpeq16">;
+def FCMPEQ32 : VISInstID<0b000101110, "fcmpeq32">;
+
+// EDGE* and ARRAY* use I64Regs for all three operands.
+def EDGE8 : VISInst<0b000000000, "edge8", I64Regs>;
+def EDGE8L : VISInst<0b000000010, "edge8l", I64Regs>;
+def EDGE16 : VISInst<0b000000100, "edge16", I64Regs>;
+def EDGE16L : VISInst<0b000000110, "edge16l", I64Regs>;
+def EDGE32 : VISInst<0b000001000, "edge32", I64Regs>;
+def EDGE32L : VISInst<0b000001010, "edge32l", I64Regs>;
+
+def PDIST : VISInst<0b000111110, "pdist">;
+
+def ARRAY8 : VISInst<0b000010000, "array8", I64Regs>;
+def ARRAY16 : VISInst<0b000010010, "array16", I64Regs>;
+def ARRAY32 : VISInst<0b000010100, "array32", I64Regs>;
+
+def SHUTDOWN : VISInst0<0b010000000, "shutdown">; // VISInst0: no operands.
+
+} // Predicates = [HasVIS]
+
+
+// VIS 2 Instructions.
+let Predicates = [HasVIS2] in {
+// Every definition in this block is guarded by the HasVIS2 predicate.
+def BMASK : VISInst<0b000011001, "bmask", I64Regs>;
+def BSHUFFLE : VISInst<0b000011100, "bshuffle">;
+
+def SIAM : VISInst0<0b010000001, "siam">; // NOTE(review): no operands here; siam presumably takes a mode operand - confirm.
+
+def EDGE8N : VISInst<0b000000001, "edge8n", I64Regs>;
+def EDGE8LN : VISInst<0b000000011, "edge8ln", I64Regs>;
+def EDGE16N : VISInst<0b000000101, "edge16n", I64Regs>;
+def EDGE16LN : VISInst<0b000000111, "edge16ln", I64Regs>;
+def EDGE32N : VISInst<0b000001001, "edge32n", I64Regs>;
+def EDGE32LN : VISInst<0b000001011, "edge32ln", I64Regs>;
+} // Predicates = [HasVIS2]
+
+
+// VIS 3 Instructions.
+let Predicates = [HasVIS3] in {
+
+let Uses = [ICC] in // ADDXC is declared as reading ICC.
+def ADDXC : VISInst<0b000010001, "addxc", I64Regs>;
+
+let Defs = [ICC], Uses = [ICC] in // ADDXCCC is declared as reading and writing ICC.
+def ADDXCCC : VISInst<0b000010011, "addxccc", I64Regs>;
+
+let rd = 0, rs1 = 0 in { // CMASK* take only $rs2 (I64Regs); rd and rs1 are set to 0.
+def CMASK8 : VISInstFormat<0b000011011, (outs), (ins I64Regs:$rs2),
+ "cmask8 $rs2", []>;
+def CMASK16 : VISInstFormat<0b000011101, (outs), (ins I64Regs:$rs2),
+ "cmask16 $rs2", []>;
+def CMASK32 : VISInstFormat<0b000011111, (outs), (ins I64Regs:$rs2),
+ "cmask32 $rs2", []>;
+
+}
+
+def FCHKSM16 : VISInst<0b001000100, "fchksm16">;
+// FH*/FN* below instantiate F3_3 directly with 0b110100 (VISInstFormat passes 0b110110).
+def FHADDS : F3_3<0b10, 0b110100, 0b001100001,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fhadds $rs1, $rs2, $rd", []>;
+def FHADDD : F3_3<0b10, 0b110100, 0b001100010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fhaddd $rs1, $rs2, $rd", []>;
+def FHSUBS : F3_3<0b10, 0b110100, 0b001100101,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fhsubs $rs1, $rs2, $rd", []>;
+def FHSUBD : F3_3<0b10, 0b110100, 0b001100110,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fhsubd $rs1, $rs2, $rd", []>;
+def FLCMPS : VISInstFormat<0b101010001, (outs FCCRegs:$rd), // Result goes to an FCCRegs register.
+ (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "flcmps $rd, $rs1, $rs2", []>;
+def FLCMPD : VISInstFormat<0b101010010, (outs FCCRegs:$rd),
+ (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "flcmpd $rd, $rs1, $rs2", []>;
+
+def FMEAN16 : VISInst<0b001000000, "fmean16">;
+
+def FNADDS : F3_3<0b10, 0b110100, 0b001010001,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnadds $rs1, $rs2, $rd", []>;
+def FNADDD : F3_3<0b10, 0b110100, 0b001010010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnaddd $rs1, $rs2, $rd", []>;
+def FNHADDS : F3_3<0b10, 0b110100, 0b001110001,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnhadds $rs1, $rs2, $rd", []>;
+def FNHADDD : F3_3<0b10, 0b110100, 0b001110010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnhaddd $rs1, $rs2, $rd", []>;
+
+def FNMULS : F3_3<0b10, 0b110100, 0b001011001, // Asm string names this def's own mnemonic.
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnmuls $rs1, $rs2, $rd", []>;
+def FNMULD : F3_3<0b10, 0b110100, 0b001011010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnmuld $rs1, $rs2, $rd", []>;
+def FNSMULD : F3_3<0b10, 0b110100, 0b001111001,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnsmuld $rs1, $rs2, $rd", []>;
+
+def FPADD64 : VISInst<0b001000010, "fpadd64">;
+// The fs* definitions below all use the default DFPRegs class of VISInst.
+def FSLL16 : VISInst<0b000100001, "fsll16">;
+def FSRL16 : VISInst<0b000100011, "fsrl16">;
+def FSLL32 : VISInst<0b000100101, "fsll32">;
+def FSRL32 : VISInst<0b000100111, "fsrl32">;
+def FSLAS16 : VISInst<0b000101001, "fslas16">;
+def FSRA16 : VISInst<0b000101011, "fsra16">;
+def FSLAS32 : VISInst<0b000101101, "fslas32">;
+def FSRA32 : VISInst<0b000101111, "fsra32">;
+
+let rs1 = 0 in // LZCNT has only $rs2 and $rd (both I64Regs); rs1 is set to 0.
+def LZCNT : VISInstFormat<0b000010111, (outs I64Regs:$rd),
+ (ins I64Regs:$rs2), "lzcnt $rs2, $rd", []>;
+
+let rs1 = 0 in { // Moves between DFPRegs and I64Regs: only $rs2 and $rd; rs1 is 0.
+def MOVSTOSW : VISInstFormat<0b100010011, (outs I64Regs:$rd),
+ (ins DFPRegs:$rs2), "movstosw $rs2, $rd", []>;
+def MOVSTOUW : VISInstFormat<0b100010001, (outs I64Regs:$rd),
+ (ins DFPRegs:$rs2), "movstouw $rs2, $rd", []>;
+def MOVDTOX : VISInstFormat<0b100010000, (outs I64Regs:$rd),
+ (ins DFPRegs:$rs2), "movdtox $rs2, $rd", []>;
+def MOVWTOS : VISInstFormat<0b100011001, (outs DFPRegs:$rd), // Prints its own mnemonic.
+ (ins I64Regs:$rs2), "movwtos $rs2, $rd", []>;
+def MOVXTOD : VISInstFormat<0b100011000, (outs DFPRegs:$rd), // Prints its own mnemonic.
+ (ins I64Regs:$rs2), "movxtod $rs2, $rd", []>;
+}
+
+def PDISTN : VISInst<0b000111111, "pdistn">;
+// The *MULX* definitions below use I64Regs for all three operands.
+def UMULXHI : VISInst<0b000010110, "umulxhi", I64Regs>;
+def XMULX : VISInst<0b100010101, "xmulx", I64Regs>;
+def XMULXHI : VISInst<0b100010111, "xmulxhi", I64Regs>;
+} // Predicates = [HasVIS3]
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp b/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp
new file mode 100644
index 000000000000..a3cedcbf9dd1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp
@@ -0,0 +1,108 @@
+//===-- SparcMCInstLower.cpp - Convert Sparc MachineInstr to MCInst -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Sparc MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+
+/// Lower a symbol-like operand (basic block, global address, block address,
+/// external symbol or constant-pool index) to an expression MCOperand. The
+/// SparcMCExpr variant kind comes from the operand's target flags.
+static MCOperand LowerSymbolOperand(const MachineInstr *MI,
+                                    const MachineOperand &MO,
+                                    AsmPrinter &AP) {
+  const MCSymbol *Sym = nullptr;
+
+  // Pick the MCSymbol that corresponds to the operand kind.
+  switch (MO.getType()) {
+  case MachineOperand::MO_MachineBasicBlock:
+    Sym = MO.getMBB()->getSymbol();
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    Sym = AP.getSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_BlockAddress:
+    Sym = AP.GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    Sym = AP.GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    Sym = AP.GetCPISymbol(MO.getIndex());
+    break;
+  default:
+    llvm_unreachable("Unknown type in LowerSymbolOperand");
+  }
+
+  // Wrap the plain symbol reference in the Sparc-specific expression.
+  const SparcMCExpr::VariantKind VK =
+      static_cast<SparcMCExpr::VariantKind>(MO.getTargetFlags());
+  const MCSymbolRefExpr *SymRef =
+      MCSymbolRefExpr::create(Sym, AP.OutContext);
+  const SparcMCExpr *Wrapped = SparcMCExpr::create(VK, SymRef, AP.OutContext);
+  return MCOperand::createExpr(Wrapped);
+}
+
+/// Lower one MachineOperand to an MCOperand. Registers and immediates map
+/// directly; symbol-like operands go through LowerSymbolOperand. Implicit
+/// registers and register masks yield a default-constructed MCOperand.
+static MCOperand LowerOperand(const MachineInstr *MI,
+                              const MachineOperand &MO,
+                              AsmPrinter &AP) {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    if (!MO.isImplicit())
+      return MCOperand::createReg(MO.getReg());
+    return MCOperand();
+  case MachineOperand::MO_Immediate:
+    return MCOperand::createImm(MO.getImm());
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_BlockAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_ConstantPoolIndex:
+    return LowerSymbolOperand(MI, MO, AP);
+  case MachineOperand::MO_RegisterMask:
+    return MCOperand();
+  default:
+    llvm_unreachable("unknown operand type");
+  }
+}
+
+/// Translate MI into OutMI: copy the opcode, then append every operand that
+/// LowerOperand turns into a valid MCOperand, in the original operand order.
+void llvm::LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
+                                          MCInst &OutMI,
+                                          AsmPrinter &AP)
+{
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (const MachineOperand &Opnd : MI->operands()) {
+    const MCOperand Lowered = LowerOperand(MI, Opnd, AP);
+    if (!Lowered.isValid())
+      continue; // Operands that lower to an invalid MCOperand are dropped.
+    OutMI.addOperand(Lowered);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..e7442826e78b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- SparcMachineFunctionInfo.cpp - Sparc Machine Function Info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void SparcMachineFunctionInfo::anchor() { } // Empty out-of-line definition of the virtual anchor().
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
new file mode 100644
index 000000000000..104744279d9d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -0,0 +1,56 @@
+//===- SparcMachineFunctionInfo.h - Sparc Machine Function Info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares Sparc specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+  /// Sparc-specific per-function state: global base register, varargs frame
+  /// offset, sret return register and the leaf-procedure flag.
+  class SparcMachineFunctionInfo : public MachineFunctionInfo {
+    virtual void anchor();
+
+    unsigned GlobalBaseReg;
+    /// Frame offset to the start of the varargs area.
+    int VarArgsFrameOffset;
+    /// Virtual register into which the sret argument is passed.
+    unsigned SRetReturnReg;
+    /// True if the function is a leaf procedure.
+    bool IsLeafProc;
+
+  public:
+    /// All fields start out as zero / false.
+    SparcMachineFunctionInfo()
+      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
+        IsLeafProc(false) {}
+    /// Same initial state; MF is not consulted.
+    explicit SparcMachineFunctionInfo(MachineFunction &MF)
+      : SparcMachineFunctionInfo() {}
+
+    void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+    unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+
+    void setVarArgsFrameOffset(int Offset) { VarArgsFrameOffset = Offset; }
+    int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+
+    void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+    unsigned getSRetReturnReg() const { return SRetReturnReg; }
+
+    void setLeafProc(bool IsLeaf) { IsLeafProc = IsLeaf; }
+    bool isLeafProc() const { return IsLeafProc; }
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
new file mode 100644
index 000000000000..37a1fdf4d770
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -0,0 +1,237 @@
+//===-- SparcRegisterInfo.cpp - SPARC Register Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPARC implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcRegisterInfo.h"
+#include "Sparc.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "SparcGenRegisterInfo.inc"
+
+static cl::opt<bool> // Hidden command-line flag, false by default.
+ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false),
+ cl::desc("Reserve application registers (%g2-%g4)"));
+
+SparcRegisterInfo::SparcRegisterInfo() : SparcGenRegisterInfo(SP::O7) {} // NOTE(review): O7 is presumably the return-address register - confirm.
+
+const MCPhysReg*
+SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ return CSR_SaveList; // Same list for every function; MF is not consulted.
+}
+
+const uint32_t *
+SparcRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ return CSR_RegMask; // Independent of both MF and CC.
+}
+
+const uint32_t*
+SparcRegisterInfo::getRTCallPreservedMask(CallingConv::ID CC) const {
+ return RTCSR_RegMask; // Independent of CC.
+}
+
+/// Build the set of registers that are never available for allocation in MF.
+BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  const SparcSubtarget &ST = MF.getSubtarget<SparcSubtarget>();
+  const bool Is64Bit = ST.is64Bit();
+  BitVector Reserved(getNumRegs());
+
+  // Unconditionally reserved registers, followed by the register pairs
+  // that alias them.
+  // FIXME: G1 reserved for now for large imm generation by frame code.
+  const unsigned AlwaysReserved[] = {
+    SP::G1, SP::O6, SP::I6, SP::I7, SP::G0, SP::G6, SP::G7,
+    SP::G0_G1, SP::O6_O7, SP::I6_I7, SP::G6_G7
+  };
+  for (unsigned Reg : AlwaysReserved)
+    Reserved.set(Reg);
+
+  // G2-G4 can be used in applications; they (and the G2_G3 pair) are
+  // reserved only when ReserveAppRegisters is set.
+  const bool KeepAppRegs = ReserveAppRegisters;
+  if (KeepAppRegs) {
+    Reserved.set(SP::G2);
+    Reserved.set(SP::G3);
+    Reserved.set(SP::G4);
+    Reserved.set(SP::G2_G3);
+  }
+
+  // G5 is not reserved in 64 bit mode.
+  if (!Is64Bit)
+    Reserved.set(SP::G5);
+
+  // The G4_G5 pair is reserved whenever either of its halves is.
+  if (KeepAppRegs || !Is64Bit)
+    Reserved.set(SP::G4_G5);
+
+  // Unaliased double registers are not available in non-V9 targets.
+  if (!ST.isV9()) {
+    for (unsigned Idx = 0; Idx != 16; ++Idx) {
+      MCRegAliasIterator Alias(SP::D16 + Idx, this, true);
+      for (; Alias.isValid(); ++Alias)
+        Reserved.set(*Alias);
+    }
+  }
+
+  return Reserved;
+}
+
+const TargetRegisterClass *SparcRegisterInfo::getPointerRegClass(
+    const MachineFunction &MF, unsigned Kind) const {
+  if (MF.getSubtarget<SparcSubtarget>().is64Bit())
+    return &SP::I64RegsRegClass; // 64-bit subtargets.
+  return &SP::IntRegsRegClass;   // Everything else; Kind is not consulted.
+}
+
+/// Rewrite the frame-index operand of MI (register operand FIOperandNum plus
+/// the immediate operand after it) as FramePtr + Offset. Offsets outside
+/// [-4096, 4095] are first materialized into G1 by instructions inserted
+/// before II.
+static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II,
+                      MachineInstr &MI, const DebugLoc &dl,
+                      unsigned FIOperandNum, int Offset, unsigned FramePtr) {
+  // Point the user instruction at BaseReg + Imm.
+  auto RewriteUser = [&MI, FIOperandNum](unsigned BaseReg, int64_t Imm) {
+    MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Imm);
+  };
+
+  // Small offsets fit in the immediate field and are encoded directly.
+  if (Offset >= -4096 && Offset <= 4095) {
+    RewriteUser(FramePtr, Offset);
+    return;
+  }
+
+  // FIXME: it would be better to scavenge a register here instead of
+  // reserving G1 all of the time.
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  if (Offset < 0) {
+    // Negative offsets are built with sethi + xor:
+    //   sethi %hix(Offset), %g1
+    //   xor   %g1, %lox(Offset), %g1
+    //   add   %g1, FramePtr, %g1
+    // and the user then addresses G1 + 0.
+    BuildMI(MBB, II, dl, TII.get(SP::SETHIi), SP::G1).addImm(HIX22(Offset));
+    BuildMI(MBB, II, dl, TII.get(SP::XORri), SP::G1)
+        .addReg(SP::G1)
+        .addImm(LOX10(Offset));
+    BuildMI(MBB, II, dl, TII.get(SP::ADDrr), SP::G1)
+        .addReg(SP::G1)
+        .addReg(FramePtr);
+    RewriteUser(SP::G1, 0);
+    return;
+  }
+
+  // Non-negative offsets only need the high bits in G1:
+  //   sethi %hi(Offset), %g1
+  //   add   %g1, FramePtr, %g1
+  // and the user then addresses G1 + %lo(Offset).
+  BuildMI(MBB, II, dl, TII.get(SP::SETHIi), SP::G1).addImm(HI22(Offset));
+  BuildMI(MBB, II, dl, TII.get(SP::ADDrr), SP::G1)
+      .addReg(SP::G1).addReg(FramePtr);
+  RewriteUser(SP::G1, LO10(Offset));
+}
+
+
+/// Replace the frame-index operand of *II with a frame register and offset.
+/// Unless the subtarget is V9 with hard quad support, STQFri/LDQFri are split
+/// into two 64-bit accesses (STDFri/LDDFri) at Offset and Offset + 8.
+void SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                            int SPAdj, unsigned FIOperandNum,
+                                            RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  DebugLoc dl = MI.getDebugLoc();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
+  const SparcFrameLowering *TFI = getFrameLowering(MF);
+
+  unsigned FrameReg;
+  int Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg);
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) {
+    const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+    if (MI.getOpcode() == SP::STQFri) {
+      unsigned SrcReg = MI.getOperand(2).getReg();
+      unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64);
+      unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64);
+      MachineInstr *StMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
+        .addReg(FrameReg).addImm(0).addReg(SrcEvenReg);
+      // StMI sits before II, so any G1 setup for a large offset must go in
+      // front of StMI itself; inserting at II would define G1 after its use.
+      replaceFI(MF, *StMI, *StMI, dl, 0, Offset, FrameReg);
+      MI.setDesc(TII.get(SP::STDFri));
+      MI.getOperand(2).setReg(SrcOddReg);
+      Offset += 8;
+    } else if (MI.getOpcode() == SP::LDQFri) {
+      unsigned DestReg = MI.getOperand(0).getReg();
+      unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64);
+      unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64);
+      MachineInstr *LdMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
+        .addReg(FrameReg).addImm(0);
+      // Same insertion-point rule as for the store above.
+      replaceFI(MF, *LdMI, *LdMI, dl, 1, Offset, FrameReg);
+      MI.setDesc(TII.get(SP::LDDFri));
+      MI.getOperand(0).setReg(DestOddReg);
+      Offset += 8;
+    }
+  }
+
+  replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg);
+}
+
+unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return SP::I6; // Always I6; MF is not consulted.
+}
+
+// Stack realignment is not something Sparc needs architecturally. It is
+// supported only because LLVM currently implements overaligned stack
+// objects on top of it; if that changes, this override can probably go.
+//
+// Realignment is allowed when both of the following hold:
+//  - the target-independent check permits it, and
+//  - the call frame is reserved, so SP can be used to access locals.
+// Without a reserved call frame a base pointer would be required, and
+// those aren't implemented for SPARC at the moment.
+bool SparcRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  // Sparc always has a fixed frame pointer register, so there is no need
+  // to worry about reserving it here. [Even a function without a frame
+  // pointer cannot use that register for other things, or register window
+  // traps will be SADNESS.]
+  const bool GenericCheckOk = TargetRegisterInfo::canRealignStack(MF);
+  if (!GenericCheckOk)
+    return false;
+
+  const bool HasReservedCallFrame =
+      getFrameLowering(MF)->hasReservedCallFrame(MF);
+  return HasReservedCallFrame;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
new file mode 100644
index 000000000000..2ac51263957e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
@@ -0,0 +1,50 @@
+//===-- SparcRegisterInfo.h - Sparc Register Information Impl ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCREGISTERINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "SparcGenRegisterInfo.inc"
+
+namespace llvm {
+struct SparcRegisterInfo : public SparcGenRegisterInfo {
+ SparcRegisterInfo();
+
+ /// Code Generation virtual methods (overrides of the generated base class).
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const override;
+ /// Sparc-specific preserved-mask query; note that it is not marked override.
+ const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const;
+ /// Registers that must not be allocated in MF.
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+ /// Register class used to hold pointers in MF.
+ const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const override;
+ /// Rewrites the frame-index operand FIOperandNum of the instruction at II.
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+ /// Register used as the frame register for MF.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+ /// Whether the stack of MF may be realigned.
+ bool canRealignStack(const MachineFunction &MF) const override;
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
new file mode 100644
index 000000000000..6ecfddfc7d66
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
@@ -0,0 +1,377 @@
+//===-- SparcRegisterInfo.td - Sparc Register defs ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the Sparc register file
+//===----------------------------------------------------------------------===//
+
+// Base class for Sparc registers: Enc becomes the HWEncoding, n is the name
+// passed on to Register, and the def lives in the "SP" namespace.
+class SparcReg<bits<16> Enc, string n> : Register<n> {
+ let HWEncoding = Enc;
+ let Namespace = "SP";
+}
+
+// Base class for control/state registers. It sets the same fields as SparcReg
+// but is a distinct class deriving directly from Register.
+class SparcCtrlReg<bits<16> Enc, string n>: Register<n> {
+ let HWEncoding = Enc;
+ let Namespace = "SP";
+}
+
+// Sub-register indices, written as SubRegIndex<size-in-bits, bit-offset>
+// (the offset defaults to 0). sub_even/sub_odd are the two 32-bit halves of a
+// pair; sub_even64/sub_odd64 are the two 64-bit halves of a 128-bit register.
+let Namespace = "SP" in {
+def sub_even : SubRegIndex<32>;
+def sub_odd : SubRegIndex<32, 32>;
+def sub_even64 : SubRegIndex<64>;
+def sub_odd64 : SubRegIndex<64, 64>;
+}
+
+// Registers are identified with 5-bit ID numbers (Enc is declared 16 bits wide,
+// but every encoding in this file is in the range 0-31).
+// Ri - 32-bit integer registers
+class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
+// Rdi - pairs of 32-bit integer registers
+// subregs must list the even register first and the odd register second so
+// that it lines up with SubRegIndices below.
+class Rdi<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+ let SubRegs = subregs;
+ let SubRegIndices = [sub_even, sub_odd];
+ // The pair's value is fully determined by its two sub-registers.
+ let CoveredBySubRegs = 1;
+}
+// Rf - 32-bit floating-point registers (plain SparcReg, no sub-registers)
+class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
+// Rd - Slots in the FP register file for 64-bit floating-point values.
+// subregs holds the two 32-bit halves in [even, odd] order, matching
+// SubRegIndices below.
+class Rd<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+ let SubRegs = subregs;
+ let SubRegIndices = [sub_even, sub_odd];
+ // The register's value is fully determined by its two sub-registers.
+ let CoveredBySubRegs = 1;
+}
+
+// Rq - Slots in the FP register file for 128-bit floating-point values.
+// subregs holds the two 64-bit halves in [even, odd] order, addressed through
+// the 64-bit sub-register indices.
+class Rq<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+ let SubRegs = subregs;
+ let SubRegIndices = [sub_even64, sub_odd64];
+ // The register's value is fully determined by its two sub-registers.
+ let CoveredBySubRegs = 1;
+}
+
+// Control Registers
+def ICC : SparcCtrlReg<0, "ICC">; // This represents icc and xcc in 64-bit code.
+// Expands to FCC0..FCC3 with encodings 0..3.
+foreach I = 0-3 in
+ def FCC#I : SparcCtrlReg<I, "FCC"#I>;
+
+def FSR : SparcCtrlReg<0, "FSR">; // Floating-point state register.
+
+def FQ : SparcCtrlReg<0, "FQ">; // Floating-point deferred-trap queue.
+
+def CPSR : SparcCtrlReg<0, "CPSR">; // Co-processor state register.
+
+def CPQ : SparcCtrlReg<0, "CPQ">; // Co-processor queue.
+
+// Y register
+def Y : SparcCtrlReg<0, "Y">, DwarfRegNum<[64]>;
+// Ancillary state registers ASR1..ASR31 (implementation defined).
+// ASR<n> gets encoding n and the name "ASR<n>"; encoding 0 is used by Y above.
+foreach I = 1-31 in
+  def ASR#I : SparcCtrlReg<I, "ASR"#I>;
+
+// Note that PSR, WIM, and TBR don't exist on the SparcV9, only the V8.
+// All three are given encoding 0.
+def PSR : SparcCtrlReg<0, "PSR">;
+def WIM : SparcCtrlReg<0, "WIM">;
+def TBR : SparcCtrlReg<0, "TBR">;
+
+// Registers with consecutive encodings 0-14; exactly these registers make up
+// the PRRegs ("Privileged Registers") register class defined later in this
+// file.
+def TPC : SparcCtrlReg<0, "TPC">;
+def TNPC : SparcCtrlReg<1, "TNPC">;
+def TSTATE : SparcCtrlReg<2, "TSTATE">;
+def TT : SparcCtrlReg<3, "TT">;
+def TICK : SparcCtrlReg<4, "TICK">;
+def TBA : SparcCtrlReg<5, "TBA">;
+def PSTATE : SparcCtrlReg<6, "PSTATE">;
+def TL : SparcCtrlReg<7, "TL">;
+def PIL : SparcCtrlReg<8, "PIL">;
+def CWP : SparcCtrlReg<9, "CWP">;
+def CANSAVE : SparcCtrlReg<10, "CANSAVE">;
+def CANRESTORE : SparcCtrlReg<11, "CANRESTORE">;
+def CLEANWIN : SparcCtrlReg<12, "CLEANWIN">;
+def OTHERWIN : SparcCtrlReg<13, "OTHERWIN">;
+def WSTATE : SparcCtrlReg<14, "WSTATE">;
+
+// Integer registers
+// Encodings 0-31 run through the G, O, L and I groups in that order, and each
+// DWARF number equals the hardware encoding. O6 and I6 are given the register
+// names "SP" and "FP" rather than "O6"/"I6".
+def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
+def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
+def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>;
+def G3 : Ri< 3, "G3">, DwarfRegNum<[3]>;
+def G4 : Ri< 4, "G4">, DwarfRegNum<[4]>;
+def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>;
+def G6 : Ri< 6, "G6">, DwarfRegNum<[6]>;
+def G7 : Ri< 7, "G7">, DwarfRegNum<[7]>;
+def O0 : Ri< 8, "O0">, DwarfRegNum<[8]>;
+def O1 : Ri< 9, "O1">, DwarfRegNum<[9]>;
+def O2 : Ri<10, "O2">, DwarfRegNum<[10]>;
+def O3 : Ri<11, "O3">, DwarfRegNum<[11]>;
+def O4 : Ri<12, "O4">, DwarfRegNum<[12]>;
+def O5 : Ri<13, "O5">, DwarfRegNum<[13]>;
+def O6 : Ri<14, "SP">, DwarfRegNum<[14]>;
+def O7 : Ri<15, "O7">, DwarfRegNum<[15]>;
+def L0 : Ri<16, "L0">, DwarfRegNum<[16]>;
+def L1 : Ri<17, "L1">, DwarfRegNum<[17]>;
+def L2 : Ri<18, "L2">, DwarfRegNum<[18]>;
+def L3 : Ri<19, "L3">, DwarfRegNum<[19]>;
+def L4 : Ri<20, "L4">, DwarfRegNum<[20]>;
+def L5 : Ri<21, "L5">, DwarfRegNum<[21]>;
+def L6 : Ri<22, "L6">, DwarfRegNum<[22]>;
+def L7 : Ri<23, "L7">, DwarfRegNum<[23]>;
+def I0 : Ri<24, "I0">, DwarfRegNum<[24]>;
+def I1 : Ri<25, "I1">, DwarfRegNum<[25]>;
+def I2 : Ri<26, "I2">, DwarfRegNum<[26]>;
+def I3 : Ri<27, "I3">, DwarfRegNum<[27]>;
+def I4 : Ri<28, "I4">, DwarfRegNum<[28]>;
+def I5 : Ri<29, "I5">, DwarfRegNum<[29]>;
+def I6 : Ri<30, "FP">, DwarfRegNum<[30]>;
+def I7 : Ri<31, "I7">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+// F<n> has hardware encoding n and DWARF number 32 + n.
+def F0 : Rf< 0, "F0">, DwarfRegNum<[32]>;
+def F1 : Rf< 1, "F1">, DwarfRegNum<[33]>;
+def F2 : Rf< 2, "F2">, DwarfRegNum<[34]>;
+def F3 : Rf< 3, "F3">, DwarfRegNum<[35]>;
+def F4 : Rf< 4, "F4">, DwarfRegNum<[36]>;
+def F5 : Rf< 5, "F5">, DwarfRegNum<[37]>;
+def F6 : Rf< 6, "F6">, DwarfRegNum<[38]>;
+def F7 : Rf< 7, "F7">, DwarfRegNum<[39]>;
+def F8 : Rf< 8, "F8">, DwarfRegNum<[40]>;
+def F9 : Rf< 9, "F9">, DwarfRegNum<[41]>;
+def F10 : Rf<10, "F10">, DwarfRegNum<[42]>;
+def F11 : Rf<11, "F11">, DwarfRegNum<[43]>;
+def F12 : Rf<12, "F12">, DwarfRegNum<[44]>;
+def F13 : Rf<13, "F13">, DwarfRegNum<[45]>;
+def F14 : Rf<14, "F14">, DwarfRegNum<[46]>;
+def F15 : Rf<15, "F15">, DwarfRegNum<[47]>;
+def F16 : Rf<16, "F16">, DwarfRegNum<[48]>;
+def F17 : Rf<17, "F17">, DwarfRegNum<[49]>;
+def F18 : Rf<18, "F18">, DwarfRegNum<[50]>;
+def F19 : Rf<19, "F19">, DwarfRegNum<[51]>;
+def F20 : Rf<20, "F20">, DwarfRegNum<[52]>;
+def F21 : Rf<21, "F21">, DwarfRegNum<[53]>;
+def F22 : Rf<22, "F22">, DwarfRegNum<[54]>;
+def F23 : Rf<23, "F23">, DwarfRegNum<[55]>;
+def F24 : Rf<24, "F24">, DwarfRegNum<[56]>;
+def F25 : Rf<25, "F25">, DwarfRegNum<[57]>;
+def F26 : Rf<26, "F26">, DwarfRegNum<[58]>;
+def F27 : Rf<27, "F27">, DwarfRegNum<[59]>;
+def F28 : Rf<28, "F28">, DwarfRegNum<[60]>;
+def F29 : Rf<29, "F29">, DwarfRegNum<[61]>;
+def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
+def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+// D<n> is made of F<2n> (even half) and F<2n+1> (odd half); it takes the
+// encoding and the register name of its even half, and DWARF number 72 + n.
+def D0 : Rd< 0, "F0", [F0, F1]>, DwarfRegNum<[72]>;
+def D1 : Rd< 2, "F2", [F2, F3]>, DwarfRegNum<[73]>;
+def D2 : Rd< 4, "F4", [F4, F5]>, DwarfRegNum<[74]>;
+def D3 : Rd< 6, "F6", [F6, F7]>, DwarfRegNum<[75]>;
+def D4 : Rd< 8, "F8", [F8, F9]>, DwarfRegNum<[76]>;
+def D5 : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[77]>;
+def D6 : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[78]>;
+def D7 : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[79]>;
+def D8 : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[80]>;
+def D9 : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[81]>;
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[82]>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[83]>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[84]>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
+
+// Co-processor registers C0..C31.
+// C<n> is an Ri with encoding n and the name "C<n>".
+foreach I = 0-31 in
+  def C#I : Ri<I, "C"#I>;
+
+// Unaliased double precision floating point registers.
+// These are plain SparcReg defs with no sub-registers. They are named
+// F32, F34, ..., F62 and use the odd encodings 1, 3, ..., 31.
+// FIXME: Define DwarfRegNum for these registers.
+def D16 : SparcReg< 1, "F32">;
+def D17 : SparcReg< 3, "F34">;
+def D18 : SparcReg< 5, "F36">;
+def D19 : SparcReg< 7, "F38">;
+def D20 : SparcReg< 9, "F40">;
+def D21 : SparcReg<11, "F42">;
+def D22 : SparcReg<13, "F44">;
+def D23 : SparcReg<15, "F46">;
+def D24 : SparcReg<17, "F48">;
+def D25 : SparcReg<19, "F50">;
+def D26 : SparcReg<21, "F52">;
+def D27 : SparcReg<23, "F54">;
+def D28 : SparcReg<25, "F56">;
+def D29 : SparcReg<27, "F58">;
+def D30 : SparcReg<29, "F60">;
+def D31 : SparcReg<31, "F62">;
+
+// Aliases of the F* registers used to hold 128-bit fp values (long doubles).
+// Q<n> is made of D<2n> (even half) and D<2n+1> (odd half) and reuses the
+// encoding and register name of its even half.
+def Q0 : Rq< 0, "F0", [D0, D1]>;
+def Q1 : Rq< 4, "F4", [D2, D3]>;
+def Q2 : Rq< 8, "F8", [D4, D5]>;
+def Q3 : Rq<12, "F12", [D6, D7]>;
+def Q4 : Rq<16, "F16", [D8, D9]>;
+def Q5 : Rq<20, "F20", [D10, D11]>;
+def Q6 : Rq<24, "F24", [D12, D13]>;
+def Q7 : Rq<28, "F28", [D14, D15]>;
+def Q8 : Rq< 1, "F32", [D16, D17]>;
+def Q9 : Rq< 5, "F36", [D18, D19]>;
+def Q10 : Rq< 9, "F40", [D20, D21]>;
+def Q11 : Rq<13, "F44", [D22, D23]>;
+def Q12 : Rq<17, "F48", [D24, D25]>;
+def Q13 : Rq<21, "F52", [D26, D27]>;
+def Q14 : Rq<25, "F56", [D28, D29]>;
+def Q15 : Rq<29, "F60", [D30, D31]>;
+
+// Aliases of the integer registers used for LDD/STD double-word operations
+// Each pair takes the encoding of its even member. The register name is the
+// def name of the even member, so O6_O7 and I6_I7 are named "O6" and "I6"
+// even though O6 and I6 themselves are named "SP" and "FP".
+def G0_G1 : Rdi<0, "G0", [G0, G1]>;
+def G2_G3 : Rdi<2, "G2", [G2, G3]>;
+def G4_G5 : Rdi<4, "G4", [G4, G5]>;
+def G6_G7 : Rdi<6, "G6", [G6, G7]>;
+def O0_O1 : Rdi<8, "O0", [O0, O1]>;
+def O2_O3 : Rdi<10, "O2", [O2, O3]>;
+def O4_O5 : Rdi<12, "O4", [O4, O5]>;
+def O6_O7 : Rdi<14, "O6", [O6, O7]>;
+def L0_L1 : Rdi<16, "L0", [L0, L1]>;
+def L2_L3 : Rdi<18, "L2", [L2, L3]>;
+def L4_L5 : Rdi<20, "L4", [L4, L5]>;
+def L6_L7 : Rdi<22, "L6", [L6, L7]>;
+def I0_I1 : Rdi<24, "I0", [I0, I1]>;
+def I2_I3 : Rdi<26, "I2", [I2, I3]>;
+def I4_I5 : Rdi<28, "I4", [I4, I5]>;
+def I6_I7 : Rdi<30, "I6", [I6, I7]>;
+
+// Aliases of the co-processor registers used for LDD/STD double-word operations
+// C<2n>_C<2n+1> takes the encoding and the register name of its even member.
+def C0_C1 : Rdi<0, "C0", [C0, C1]>;
+def C2_C3 : Rdi<2, "C2", [C2, C3]>;
+def C4_C5 : Rdi<4, "C4", [C4, C5]>;
+def C6_C7 : Rdi<6, "C6", [C6, C7]>;
+def C8_C9 : Rdi<8, "C8", [C8, C9]>;
+def C10_C11 : Rdi<10, "C10", [C10, C11]>;
+def C12_C13 : Rdi<12, "C12", [C12, C13]>;
+def C14_C15 : Rdi<14, "C14", [C14, C15]>;
+def C16_C17 : Rdi<16, "C16", [C16, C17]>;
+def C18_C19 : Rdi<18, "C18", [C18, C19]>;
+def C20_C21 : Rdi<20, "C20", [C20, C21]>;
+def C22_C23 : Rdi<22, "C22", [C22, C23]>;
+def C24_C25 : Rdi<24, "C24", [C24, C25]>;
+def C26_C27 : Rdi<26, "C26", [C26, C27]>;
+def C28_C29 : Rdi<28, "C28", [C28, C29]>;
+def C30_C31 : Rdi<30, "C30", [C30, C31]>;
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+// This register class should not be used to hold i64 values, use the I64Regs
+// register class for that. The i64 type is included here to allow i64 patterns
+// using the integer instructions.
+//
+// Members are listed in I, G, L, O order, which differs from the encoding
+// order (G, O, L, I) used by the register defs.
+def IntRegs : RegisterClass<"SP", [i32, i64], 32,
+ (add (sequence "I%u", 0, 7),
+ (sequence "G%u", 0, 7),
+ (sequence "L%u", 0, 7),
+ (sequence "O%u", 0, 7))>;
+
+// Should be in the same order as IntRegs.
+// (That is I, G, L, O: each row below is one group of four even/odd pairs.)
+def IntPair : RegisterClass<"SP", [v2i32], 64,
+ (add I0_I1, I2_I3, I4_I5, I6_I7,
+ G0_G1, G2_G3, G4_G5, G6_G7,
+ L0_L1, L2_L3, L4_L5, L6_L7,
+ O0_O1, O2_O3, O4_O5, O6_O7)>;
+
+// Register class for 64-bit mode, with a 64-bit spill slot size.
+// These are the same as the 32-bit registers, so TableGen will consider this
+// to be a sub-class of IntRegs. That works out because requiring a 64-bit
+// spill slot is a stricter constraint than only requiring a 32-bit spill slot.
+// (add IntRegs) reuses IntRegs' member list and order verbatim.
+def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
+
+// Floating point register classes.
+// FPRegs covers F0-F31, DFPRegs covers D0-D31 (including the unaliased
+// D16-D31), and QFPRegs covers Q0-Q15.
+def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
+
+def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
+
+def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
+
+// Floating point control register classes.
+// Holds FCC0-FCC3 as i1 values.
+def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
+
+// The classes below set isAllocatable = 0, i.e. they are marked as not
+// available to the register allocator.
+let isAllocatable = 0 in {
+ // Ancillary state registers
+ // Y is listed first, followed by ASR1..ASR31.
+ def ASRRegs : RegisterClass<"SP", [i32], 32,
+ (add Y, (sequence "ASR%u", 1, 31))>;
+
+ // This register class should not be used to hold i64 values.
+ def CoprocRegs : RegisterClass<"SP", [i32], 32,
+ (add (sequence "C%u", 0, 31))>;
+
+ // Should be in the same order as CoprocRegs.
+ def CoprocPair : RegisterClass<"SP", [v2i32], 64,
+ (add C0_C1, C2_C3, C4_C5, C6_C7,
+ C8_C9, C10_C11, C12_C13, C14_C15,
+ C16_C17, C18_C19, C20_C21, C22_C23,
+ C24_C25, C26_C27, C28_C29, C30_C31)>;
+}
+
+// Privileged Registers
+// Members appear in encoding order (TPC = 0 ... WSTATE = 14).
+def PRRegs : RegisterClass<"SP", [i64], 64,
+ (add TPC, TNPC, TSTATE, TT, TICK, TBA, PSTATE, TL, PIL, CWP,
+ CANSAVE, CANRESTORE, CLEANWIN, OTHERWIN, WSTATE)>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSchedule.td b/contrib/llvm/lib/Target/Sparc/SparcSchedule.td
new file mode 100755
index 000000000000..f243546b029b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSchedule.td
@@ -0,0 +1,124 @@
+//===-- SparcSchedule.td - Describe the Sparc Itineraries --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Instruction itinerary classes referenced by the LEON itineraries in this
+// file. IIC_smac_umac has entries in the LEON3 and LEON4 tables only.
+def IIC_iu_or_fpu_instr : InstrItinClass;
+def IIC_iu_instr : InstrItinClass;
+def IIC_fpu_normal_instr : InstrItinClass;
+def IIC_fpu_fast_instr : InstrItinClass;
+def IIC_jmp_or_call : InstrItinClass;
+def IIC_ldd : InstrItinClass;
+def IIC_st : InstrItinClass;
+def IIC_std : InstrItinClass;
+def IIC_iu_smul : InstrItinClass;
+def IIC_iu_umul : InstrItinClass;
+def IIC_iu_div : InstrItinClass;
+def IIC_ticc : InstrItinClass;
+def IIC_ldstub : InstrItinClass;
+def IIC_fpu_muls : InstrItinClass;
+def IIC_fpu_muld : InstrItinClass;
+def IIC_fpu_divs : InstrItinClass;
+def IIC_fpu_divd : InstrItinClass;
+def IIC_fpu_sqrts : InstrItinClass;
+def IIC_fpu_sqrtd : InstrItinClass;
+def IIC_fpu_abs : InstrItinClass;
+def IIC_fpu_movs : InstrItinClass;
+def IIC_fpu_negs : InstrItinClass;
+def IIC_smac_umac : InstrItinClass;
+def IIC_fpu_stod : InstrItinClass;
+
+// The two functional units shared by the LEON2, LEON3 and LEON4 itineraries.
+def LEONIU : FuncUnit; // integer unit
+def LEONFPU : FuncUnit; // floating-point unit
+
+// Ref: http://www.atmel.com/Images/doc4226.pdf
+
+// LEON2 itinerary. Every entry has a single one-cycle InstrStage on the listed
+// functional unit(s), followed by a two-element list that is InstrItinData's
+// operand-cycle argument (see TargetItinerary.td).
+// There is no IIC_smac_umac entry in this table.
+def LEON2Itineraries : ProcessorItineraries<
+[LEONIU, LEONFPU], [], [
+ InstrItinData<IIC_iu_or_fpu_instr, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_iu_instr, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_fpu_normal_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+ InstrItinData<IIC_fpu_fast_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+ InstrItinData<IIC_jmp_or_call, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_ldd, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_st, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_std, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_iu_smul, [InstrStage<1, [LEONIU]>], [5, 1]>,
+ InstrItinData<IIC_iu_umul, [InstrStage<1, [LEONIU]>], [5, 1]>,
+ InstrItinData<IIC_iu_div, [InstrStage<1, [LEONIU]>], [35, 1]>,
+ InstrItinData<IIC_ticc, [InstrStage<1, [LEONIU, LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_ldstub, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_fpu_muls, [InstrStage<1, [LEONFPU]>], [16, 1]>,
+ InstrItinData<IIC_fpu_muld, [InstrStage<1, [LEONFPU]>], [21, 1]>,
+ InstrItinData<IIC_fpu_divs, [InstrStage<1, [LEONFPU]>], [20, 1]>,
+ InstrItinData<IIC_fpu_divd, [InstrStage<1, [LEONFPU]>], [36, 1]>,
+ InstrItinData<IIC_fpu_sqrts, [InstrStage<1, [LEONFPU]>], [37, 1]>,
+ InstrItinData<IIC_fpu_sqrtd, [InstrStage<1, [LEONFPU]>], [65, 1]>,
+ InstrItinData<IIC_fpu_abs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_movs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_negs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_stod, [InstrStage<1, [LEONFPU]>], [2, 1]>
+]>;
+
+// LEON3 itinerary. It has the same layout as the LEON2 table (one single-cycle
+// stage plus a two-element operand-cycle list per class) and additionally
+// provides an IIC_smac_umac entry.
+def LEON3Itineraries : ProcessorItineraries<
+[LEONIU, LEONFPU], [], [
+ InstrItinData<IIC_iu_or_fpu_instr, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_iu_instr, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_fpu_normal_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+ InstrItinData<IIC_fpu_fast_instr, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_jmp_or_call, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_ldd, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_st, [InstrStage<1, [LEONIU, LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_std, [InstrStage<1, [LEONIU, LEONFPU]>], [5, 1]>,
+ InstrItinData<IIC_iu_smul, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_iu_umul, [InstrStage<1, [LEONIU]>], [4, 1]>,
+ InstrItinData<IIC_iu_div, [InstrStage<1, [LEONIU]>], [35, 1]>,
+ InstrItinData<IIC_smac_umac, [InstrStage<1, [LEONIU]>], [2, 1]>,
+ InstrItinData<IIC_ticc, [InstrStage<1, [LEONIU, LEONFPU]>], [5, 1]>,
+ InstrItinData<IIC_ldstub, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_fpu_muls, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_fpu_muld, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_fpu_divs, [InstrStage<1, [LEONFPU]>], [16, 1]>,
+ InstrItinData<IIC_fpu_divd, [InstrStage<1, [LEONFPU]>], [17, 1]>,
+ InstrItinData<IIC_fpu_sqrts, [InstrStage<1, [LEONFPU]>], [24, 1]>,
+ InstrItinData<IIC_fpu_sqrtd, [InstrStage<1, [LEONFPU]>], [25, 1]>,
+ InstrItinData<IIC_fpu_abs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_movs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_negs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_stod, [InstrStage<1, [LEONFPU]>], [4, 1]>
+]>;
+
+// LEON4 itinerary. The entries match LEON3Itineraries except for IIC_ldd,
+// IIC_st and IIC_std, whose operand-cycle lists are all [1, 1] here.
+def LEON4Itineraries : ProcessorItineraries<
+[LEONIU, LEONFPU], [], [
+ InstrItinData<IIC_iu_or_fpu_instr, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_iu_instr, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_fpu_normal_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+ InstrItinData<IIC_fpu_fast_instr, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_jmp_or_call, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_ldd, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_st, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_std, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_iu_smul, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_iu_umul, [InstrStage<1, [LEONIU]>], [4, 1]>,
+ InstrItinData<IIC_iu_div, [InstrStage<1, [LEONIU]>], [35, 1]>,
+ InstrItinData<IIC_smac_umac, [InstrStage<1, [LEONIU]>], [2, 1]>,
+ InstrItinData<IIC_ticc, [InstrStage<1, [LEONIU, LEONFPU]>], [5, 1]>,
+ InstrItinData<IIC_ldstub, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_fpu_muls, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_fpu_muld, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_fpu_divs, [InstrStage<1, [LEONFPU]>], [16, 1]>,
+ InstrItinData<IIC_fpu_divd, [InstrStage<1, [LEONFPU]>], [17, 1]>,
+ InstrItinData<IIC_fpu_sqrts, [InstrStage<1, [LEONFPU]>], [24, 1]>,
+ InstrItinData<IIC_fpu_sqrtd, [InstrStage<1, [LEONFPU]>], [25, 1]>,
+ InstrItinData<IIC_fpu_abs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_movs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_negs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_stod, [InstrStage<1, [LEONFPU]>], [4, 1]>
+]>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
new file mode 100644
index 000000000000..43ddef3cc96e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -0,0 +1,99 @@
+//===-- SparcSubtarget.cpp - SPARC Subtarget Information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPARC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcSubtarget.h"
+#include "Sparc.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sparc-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SparcGenSubtargetInfo.inc"
+
+// Out-of-line (empty) definition of the virtual anchor() method.
+void SparcSubtarget::anchor() {}
+
+/// Reset every subtarget feature flag to its default, choose the CPU name, and
+/// let the TableGen-generated ParseSubtargetFeatures() apply the CPU and
+/// feature string on top of those defaults.
+///
+/// \param CPU  CPU name; when empty, "v9" is used for 64-bit subtargets and
+///             "v8" otherwise.
+/// \param FS   Feature string handed to ParseSubtargetFeatures().
+/// \returns *this, so the call can be used inside a member-initializer list.
+SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                                StringRef FS) {
+  IsV9 = false;
+  IsLeon = false;
+  V8DeprecatedInsts = false;
+  IsVIS = false;
+  // IsVIS2/IsVIS3 need an explicit default like every other flag; without it
+  // isVIS2()/isVIS3() would read indeterminate values whenever the feature
+  // string does not turn the corresponding feature on.
+  IsVIS2 = false;
+  IsVIS3 = false;
+  HasHardQuad = false;
+  UsePopc = false;
+  UseSoftFloat = false;
+
+  // Leon features
+  HasLeonCasa = false;
+  HasUmacSmac = false;
+  PerformSDIVReplace = false;
+  InsertNOPLoad = false;
+  FixFSMULD = false;
+  ReplaceFMULS = false;
+  FixAllFDIVSQRT = false;
+  DetectRoundChange = false;
+
+  // Determine default and user specified characteristics
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = (Is64Bit) ? "v9" : "v8";
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+
+  // Popc is a v9-only instruction.
+  if (!IsV9)
+    UsePopc = false;
+
+  return *this;
+}
+
+// Members are initialized in the order they are declared in SparcSubtarget.h.
+// TargetTriple and Is64Bit come first, so Is64Bit is already set when the
+// InstrInfo initializer calls initializeSubtargetDependencies(), which reads
+// it to pick the default CPU. That call also fills in the feature flags before
+// TLInfo and FrameLowering are constructed from *this.
+SparcSubtarget::SparcSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM,
+ bool is64Bit)
+ : SparcGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT), Is64Bit(is64Bit),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+ FrameLowering(*this) {}
+
+/// Grow \p frameSize by the ABI-mandated reserved area and round the result up
+/// to the required stack alignment.
+int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
+  if (!is64Bit()) {
+    // The save instruction is emitted based on the number of bytes in the
+    // frame. The minimum stack frame size according to the V8 ABI is:
+    //     16 words for register window spill
+    //      1 word for address of returned aggregate-value
+    //   +  6 words for passing parameters on the stack
+    //   ----------
+    //     23 words * 4 bytes per word = 92 bytes
+    // The ABI also requires a double-word boundary, hence the round-up to 8.
+    return alignTo(frameSize + 92, 8);
+  }
+
+  // All 64-bit stack frames must be 16-byte aligned and must reserve 128 bytes
+  // for spilling the 16 window registers at %sp+BIAS..%sp+BIAS+128.
+  // Frames with calls must also reserve space for 6 outgoing arguments whether
+  // they are used or not; LowerCall_64 takes care of that.
+  return alignTo(frameSize + 128, 16);
+}
+
+// The machine scheduler is unconditionally enabled for Sparc subtargets.
+bool SparcSubtarget::enableMachineScheduler() const { return true; }
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
new file mode 100644
index 000000000000..fa42da425ff2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -0,0 +1,122 @@
+//===-- SparcSubtarget.h - Define Subtarget for the SPARC -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SPARC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H
+#define LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H
+
+#include "SparcFrameLowering.h"
+#include "SparcISelLowering.h"
+#include "SparcInstrInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SparcGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+/// Sparc-specific subtarget: holds the feature flags selected by the CPU and
+/// feature string and owns the per-subtarget instruction, lowering and frame
+/// information objects.
+class SparcSubtarget : public SparcGenSubtargetInfo {
+ Triple TargetTriple;
+ virtual void anchor();
+ // Feature flags. initializeSubtargetDependencies() assigns defaults and then
+ // calls ParseSubtargetFeatures() to apply the CPU/feature string.
+ bool IsV9;
+ bool IsLeon;
+ bool V8DeprecatedInsts;
+ bool IsVIS, IsVIS2, IsVIS3;
+ // Set from the constructor argument, not from the feature string.
+ bool Is64Bit;
+ bool HasHardQuad;
+ bool UsePopc;
+ bool UseSoftFloat;
+
+ // LEON features
+ bool HasUmacSmac;
+ bool HasLeonCasa;
+ bool InsertNOPLoad;
+ bool FixFSMULD;
+ bool ReplaceFMULS;
+ bool FixAllFDIVSQRT;
+ bool DetectRoundChange;
+ bool PerformSDIVReplace;
+
+ // Declared after the flags above: the constructor relies on this order so
+ // that the flags are set up before these objects are built.
+ SparcInstrInfo InstrInfo;
+ SparcTargetLowering TLInfo;
+ SelectionDAGTargetInfo TSInfo;
+ SparcFrameLowering FrameLowering;
+
+public:
+ SparcSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM, bool is64bit);
+
+ const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const TargetFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ // The register info object is owned by InstrInfo.
+ const SparcRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const SparcTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ bool enableMachineScheduler() const override;
+
+ // Read-only accessors for the feature flags.
+ bool isV9() const { return IsV9; }
+ bool isLeon() const { return IsLeon; }
+ bool isVIS() const { return IsVIS; }
+ bool isVIS2() const { return IsVIS2; }
+ bool isVIS3() const { return IsVIS3; }
+ bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
+ bool hasHardQuad() const { return HasHardQuad; }
+ bool usePopc() const { return UsePopc; }
+ bool useSoftFloat() const { return UseSoftFloat; }
+
+ // Leon options
+ bool hasUmacSmac() const { return HasUmacSmac; }
+ bool performSDIVReplace() const { return PerformSDIVReplace; }
+ bool hasLeonCasa() const { return HasLeonCasa; }
+ bool insertNOPLoad() const { return InsertNOPLoad; }
+ bool fixFSMULD() const { return FixFSMULD; }
+ bool replaceFMULS() const { return ReplaceFMULS; }
+ bool fixAllFDIVSQRT() const { return FixAllFDIVSQRT; }
+ bool detectRoundChange() const { return DetectRoundChange; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ /// Assigns the feature-flag defaults, runs ParseSubtargetFeatures() and
+ /// returns *this.
+ SparcSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+ bool is64Bit() const { return Is64Bit; }
+
+ /// The 64-bit ABI uses biased stack and frame pointers, so the stack frame
+ /// of the current function is the area from [%sp+BIAS] to [%fp+BIAS].
+ /// The bias is 2047 in 64-bit mode and 0 otherwise.
+ int64_t getStackPointerBias() const {
+ return is64Bit() ? 2047 : 0;
+ }
+
+ /// Given an actual stack size as determined by FrameInfo, this function
+ /// returns the adjusted frame size, which includes space for register window
+ /// spills and arguments.
+ int getAdjustedFrameSize(int stackSize) const;
+
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
new file mode 100644
index 000000000000..4ae64062d9e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -0,0 +1,197 @@
+//===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetMachine.h"
+#include "SparcTargetObjectFile.h"
+#include "Sparc.h"
+#include "LeonPasses.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+// Registers one TargetMachine factory per Sparc target flavour. Each
+// RegisterTargetMachine object performs the registration in its constructor.
+extern "C" void LLVMInitializeSparcTarget() {
+  RegisterTargetMachine<SparcV8TargetMachine> RegV8(getTheSparcTarget());
+  RegisterTargetMachine<SparcV9TargetMachine> RegV9(getTheSparcV9Target());
+  RegisterTargetMachine<SparcelTargetMachine> RegEL(getTheSparcelTarget());
+}
+
+/// Build the data layout string for triple \p T in 32- or 64-bit mode.
+static std::string computeDataLayout(const Triple &T, bool is64Bit) {
+  // Sparc is typically big endian; only the sparcel architecture is little.
+  std::string Layout = T.getArch() == Triple::sparcel ? "e" : "E";
+  Layout += "-m:e";
+
+  if (is64Bit) {
+    // Alignments for 64 bit integers.
+    Layout += "-i64:64";
+    // SparcV9 registers can hold 32 or 64 bits.
+    Layout += "-n32:64";
+    Layout += "-S128";
+  } else {
+    // The 32-bit ABIs have 32bit pointers.
+    Layout += "-p:32:32";
+    // Alignments for 64 bit integers.
+    Layout += "-i64:64";
+    // Outside SparcV9, 128-bit floats are only aligned to 64 bits and the
+    // registers hold 32 bits.
+    Layout += "-f128:64-n32";
+    Layout += "-S64";
+  }
+
+  return Layout;
+}
+
+/// Returns the requested relocation model, or Reloc::Static when none was
+/// requested.
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  return RM.hasValue() ? *RM : Reloc::Static;
+}
+
+/// Common constructor for all Sparc target machines. \p is64bit selects the
+/// data layout and is forwarded to the default subtarget; the V8 and el
+/// subclasses pass false and the V9 subclass passes true.
+SparcTargetMachine::SparcTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool is64bit)
+ : LLVMTargetMachine(T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options,
+ getEffectiveRelocModel(RM), CM, OL),
+ TLOF(make_unique<SparcELFTargetObjectFile>()),
+ Subtarget(TT, CPU, FS, *this, is64bit), is64Bit(is64bit) {
+ initAsmInfo();
+}
+
+// Defined out of line; nothing beyond member destruction is needed.
+SparcTargetMachine::~SparcTargetMachine() = default;
+
+/// Returns the subtarget matching the CPU/feature attributes of \p F, creating
+/// and caching it in SubtargetMap on first use.
+const SparcSubtarget *
+SparcTargetMachine::getSubtargetImpl(const Function &F) const {
+  const Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  const Attribute FSAttr = F.getFnAttribute("target-features");
+
+  // Function-level attributes take precedence over the TargetMachine-wide
+  // CPU and feature strings.
+  std::string CPU = TargetCPU;
+  if (!CPUAttr.hasAttribute(Attribute::None))
+    CPU = CPUAttr.getValueAsString().str();
+
+  std::string FS = TargetFS;
+  if (!FSAttr.hasAttribute(Attribute::None))
+    FS = FSAttr.getValueAsString().str();
+
+  // FIXME: This is related to the resetTargetOptions() call below: we need to
+  // know whether the soft float flag is set on the function so that it can be
+  // enabled as a subtarget feature.
+  const bool WantSoftFloat =
+      F.hasFnAttribute("use-soft-float") &&
+      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+  if (WantSoftFloat)
+    FS += FS.empty() ? "+soft-float" : ",+soft-float";
+
+  std::unique_ptr<SparcSubtarget> &Slot = SubtargetMap[CPU + FS];
+  if (Slot)
+    return Slot.get();
+
+  // Reset the options before creating the new subtarget: its creation depends
+  // on the TM and on the function's code generation flags held in
+  // TargetOptions.
+  resetTargetOptions(F);
+  Slot = llvm::make_unique<SparcSubtarget>(TargetTriple, CPU, FS, *this,
+                                           this->is64Bit);
+  return Slot.get();
+}
+
+namespace {
+/// Pass-pipeline configuration for the Sparc code generator.
+class SparcPassConfig : public TargetPassConfig {
+public:
+  SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  // Pipeline hooks customised for Sparc; defined out of line.
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addPreEmitPass() override;
+
+  /// The target machine this configuration was created for, as its Sparc type.
+  SparcTargetMachine &getSparcTargetMachine() const {
+    return getTM<SparcTargetMachine>();
+  }
+};
+} // namespace
+
+// Creates the Sparc pass configuration; the returned object is heap-allocated
+// and handed to the caller as a raw pointer.
+TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) {
+  SparcPassConfig *Config = new SparcPassConfig(this, PM);
+  return Config;
+}
+
+// Schedules the atomic-expansion pass ahead of the generic IR passes.
+void SparcPassConfig::addIRPasses() {
+  SparcTargetMachine &TM = getSparcTargetMachine();
+  addPass(createAtomicExpandPass(&TM));
+
+  TargetPassConfig::addIRPasses();
+}
+
+// Installs the Sparc DAG instruction selector. Always returns false.
+bool SparcPassConfig::addInstSelector() {
+  SparcTargetMachine &TM = getSparcTargetMachine();
+  addPass(createSparcISelDag(TM));
+  return false;
+}
+
+// Adds the delay-slot filler, then each LEON fix-up pass whose option is
+// enabled on the target machine's default subtarget.
+void SparcPassConfig::addPreEmitPass() {
+  SparcTargetMachine &TM = getSparcTargetMachine();
+  addPass(createSparcDelaySlotFillerPass(TM));
+
+  // The flags are read from the default subtarget (the no-argument
+  // getSubtargetImpl()), not from a per-function one.
+  const SparcSubtarget *ST = TM.getSubtargetImpl();
+
+  if (ST->insertNOPLoad())
+    addPass(new InsertNOPLoad(TM));
+  if (ST->fixFSMULD())
+    addPass(new FixFSMULD(TM));
+  if (ST->replaceFMULS())
+    addPass(new ReplaceFMULS(TM));
+  if (ST->detectRoundChange())
+    addPass(new DetectRoundChange(TM));
+  if (ST->fixAllFDIVSQRT())
+    addPass(new FixAllFDIVSQRT(TM));
+}
+
+// Out-of-line (empty) definition of the virtual anchor() method.
+void SparcV8TargetMachine::anchor() {}
+
+// 32-bit machine: forwards everything to the base class with is64bit = false.
+SparcV8TargetMachine::SparcV8TargetMachine(
+    const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL)
+    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL,
+                         /*is64bit=*/false) {}
+
+// Out-of-line (empty) definition of the virtual anchor() method.
+void SparcV9TargetMachine::anchor() {}
+
+// 64-bit machine: forwards everything to the base class with is64bit = true.
+SparcV9TargetMachine::SparcV9TargetMachine(
+    const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL)
+    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL,
+                         /*is64bit=*/true) {}
+
+// Out-of-line (empty) definition of the virtual anchor() method.
+void SparcelTargetMachine::anchor() {}
+
+// sparcel machine: forwards everything to the base class with
+// is64bit = false.
+SparcelTargetMachine::SparcelTargetMachine(
+    const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL)
+    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL,
+                         /*is64bit=*/false) {}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
new file mode 100644
index 000000000000..48193fe095be
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
@@ -0,0 +1,79 @@
+//===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Sparc specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCTARGETMACHINE_H
+#define LLVM_LIB_TARGET_SPARC_SPARCTARGETMACHINE_H
+
+#include "SparcInstrInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class SparcTargetMachine : public LLVMTargetMachine {
+ std::unique_ptr<TargetLoweringObjectFile> TLOF; // See getObjFileLowering().
+ SparcSubtarget Subtarget; // Returned by the no-argument getSubtargetImpl().
+ bool is64Bit; // NOTE(review): presumably the ctor's is64bit flag -- confirm.
+ mutable StringMap<std::unique_ptr<SparcSubtarget>> SubtargetMap;
+public:
+ SparcTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool is64bit);
+ ~SparcTargetMachine() override;
+
+ const SparcSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ const SparcSubtarget *getSubtargetImpl(const Function &) const override;
+
+ // Pass pipeline configuration and object-file lowering accessor.
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+
+/// Sparc 32-bit target machine: a SparcTargetMachine constructed with
+/// is64bit set to false.
+class SparcV8TargetMachine : public SparcTargetMachine {
+ virtual void anchor();
+public:
+ SparcV8TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+/// Sparc 64-bit target machine: a SparcTargetMachine constructed with
+/// is64bit set to true.
+class SparcV9TargetMachine : public SparcTargetMachine {
+ virtual void anchor();
+public:
+ SparcV9TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+/// Sparcel ("Sparc LE") target machine; constructed with is64bit set to false.
+class SparcelTargetMachine : public SparcTargetMachine {
+ virtual void anchor();
+public:
+ SparcelTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
new file mode 100644
index 000000000000..8fdde15d8d27
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -0,0 +1,42 @@
+//===------- SparcTargetObjectFile.cpp - Sparc Object Info Impl -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetObjectFile.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  // Anything that is not pc-relative is left to the generic ELF lowering.
+  const bool IsPCRel = (Encoding & dwarf::DW_EH_PE_pcrel) != 0;
+  if (!IsPCRel)
+    return TargetLoweringObjectFileELF::getTTypeGlobalReference(
+        GV, Encoding, TM, MMI, Streamer);
+
+  MachineModuleInfoELF &StubTable = MMI->getObjFileInfo<MachineModuleInfoELF>();
+  MCSymbol *StubName = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM);
+
+  // Record the stub in the ELF module info, unless an earlier reference
+  // already did, so that the asmprinter emits it.
+  MachineModuleInfoImpl::StubValueTy &Entry =
+      StubTable.getGVStubEntry(StubName);
+  if (Entry.getPointer() == nullptr)
+    Entry = MachineModuleInfoImpl::StubValueTy(TM.getSymbol(GV),
+                                               !GV->hasLocalLinkage());
+
+  // The result refers to the stub symbol through an R_DISP32 expression.
+  MCContext &Context = getContext();
+  const MCExpr *StubRef = MCSymbolRefExpr::create(StubName, Context);
+  return SparcMCExpr::create(SparcMCExpr::VK_Sparc_R_DISP32, StubRef, Context);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
new file mode 100644
index 000000000000..fe8800625a56
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
@@ -0,0 +1,35 @@
+//===-- SparcTargetObjectFile.h - Sparc Object Info -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_SPARC_SPARCTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+class MCContext;
+class TargetMachine;
+
+class SparcELFTargetObjectFile : public TargetLoweringObjectFileELF {
+public:
+ SparcELFTargetObjectFile() :
+ TargetLoweringObjectFileELF()
+ {}
+ /// Sparc override: pc-relative encodings go through a ".DW.stub" symbol.
+ const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+ unsigned Encoding,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h b/contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h
new file mode 100644
index 000000000000..3b503503abce
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h
@@ -0,0 +1,49 @@
+//===-- SparcTargetStreamer.h - Sparc Target Streamer ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_SPARC_SPARCTARGETSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class SparcTargetStreamer : public MCTargetStreamer {
+ virtual void anchor();
+ // The two emit hooks are pure virtual; the asm and ELF streamers override them.
+public:
+ SparcTargetStreamer(MCStreamer &S);
+ /// Emit ".register <reg>, #ignore".
+ virtual void emitSparcRegisterIgnore(unsigned reg) = 0;
+ /// Emit ".register <reg>, #scratch".
+ virtual void emitSparcRegisterScratch(unsigned reg) = 0;
+};
+
+// Assembly-text streamer: keeps a reference to the output stream in OS.
+class SparcTargetAsmStreamer : public SparcTargetStreamer {
+ formatted_raw_ostream &OS;
+
+public:
+ SparcTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+ void emitSparcRegisterIgnore(unsigned reg) override;
+ void emitSparcRegisterScratch(unsigned reg) override;
+
+};
+
+// ELF object streamer: both .register hooks have empty bodies (emit nothing).
+class SparcTargetELFStreamer : public SparcTargetStreamer {
+public:
+ SparcTargetELFStreamer(MCStreamer &S);
+ MCELFStreamer &getStreamer();
+ void emitSparcRegisterIgnore(unsigned reg) override {}
+ void emitSparcRegisterScratch(unsigned reg) override {}
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
new file mode 100644
index 000000000000..66178acd52ba
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -0,0 +1,35 @@
+//===-- SparcTargetInfo.cpp - Sparc Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheSparcTarget() {
+ static Target TheSparcTarget; // Registered under the name "sparc" below.
+ return TheSparcTarget;
+}
+Target &llvm::getTheSparcV9Target() {
+ static Target TheSparcV9Target; // Registered under the name "sparcv9" below.
+ return TheSparcV9Target;
+}
+Target &llvm::getTheSparcelTarget() {
+ static Target TheSparcelTarget; // Registered under the name "sparcel" below.
+ return TheSparcelTarget;
+}
+
+extern "C" void LLVMInitializeSparcTargetInfo() { // One registration per target.
+  RegisterTarget<Triple::sparc, /*HasJIT=*/true> RegSparc(
+      getTheSparcTarget(), "sparc", "Sparc");
+  RegisterTarget<Triple::sparcv9, /*HasJIT=*/true> RegSparcV9(
+      getTheSparcV9Target(), "sparcv9", "Sparc V9");
+  RegisterTarget<Triple::sparcel, /*HasJIT=*/true> RegSparcel(
+      getTheSparcelTarget(), "sparcel", "Sparc LE");
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
new file mode 100644
index 000000000000..a94717c93456
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -0,0 +1,1259 @@
+//===-- SystemZAsmParser.cpp - Parse SystemZ assembly instructions --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// Return true if Expr is a constant whose value lies in [MinValue, MaxValue];
+// an expression that is not a constant is never considered in range.
+static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
+  const auto *Constant = dyn_cast<MCConstantExpr>(Expr);
+  if (!Constant)
+    return false;
+  return MinValue <= Constant->getValue() && Constant->getValue() <= MaxValue;
+}
+
+namespace {
+enum RegisterKind { // Register classes an operand can be parsed as.
+ GR32Reg,
+ GRH32Reg,
+ GR64Reg,
+ GR128Reg,
+ ADDR32Reg, // ADDR kinds: register number 0 is rejected when parsing.
+ ADDR64Reg,
+ FP32Reg,
+ FP64Reg,
+ FP128Reg,
+ VR32Reg,
+ VR64Reg,
+ VR128Reg,
+ AR32Reg,
+};
+
+enum MemoryKind {
+ BDMem, // Base + displacement.
+ BDXMem, // Base + displacement + index register.
+ BDLMem, // Base + displacement + immediate length.
+ BDRMem, // Base + displacement + length register.
+ BDVMem // Base + displacement + index; operands are added like BDXMem.
+};
+
+class SystemZOperand : public MCParsedAsmOperand {
+public:
+private:
+ enum OperandKind { // Tags the active member of the anonymous union below.
+ KindInvalid,
+ KindToken,
+ KindReg,
+ KindImm,
+ KindImmTLS,
+ KindMem
+ };
+
+ OperandKind Kind;
+ SMLoc StartLoc, EndLoc;
+
+ // A string of length Length, starting at Data (not copied; see createToken).
+ struct TokenOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ // LLVM register Num, which has kind Kind. In some ways it might be
+ // easier for this class to have a register bank (general, floating-point
+ // or access) and a raw register number (0-15). This would postpone the
+ // interpretation of the operand to the add*() methods and avoid the need
+ // for context-dependent parsing. However, we do things the current way
+ // because of the virtual getReg() method, which needs to distinguish
+ // between (say) %r0 used as a single register and %r0 used as a pair.
+ // Context-dependent parsing can also give us slightly better error
+ // messages when invalid pairs like %r1 are used.
+ struct RegOp {
+ RegisterKind Kind;
+ unsigned Num;
+ };
+
+ // Base + Disp + Index, where Base and Index are LLVM registers or 0.
+ // MemKind says what type of memory this is and RegKind says what type
+ // the base register has (ADDR32Reg or ADDR64Reg). Length is only set by
+ // createMem() for BDLMem (Length.Imm) and BDRMem (Length.Reg) operands.
+ struct MemOp {
+ unsigned Base : 12;
+ unsigned Index : 12;
+ unsigned MemKind : 4;
+ unsigned RegKind : 4;
+ const MCExpr *Disp;
+ union {
+ const MCExpr *Imm;
+ unsigned Reg;
+ } Length;
+ };
+
+ // Imm is an immediate operand, and Sym is an optional TLS symbol
+ // for use with a __tls_get_offset marker relocation.
+ struct ImmTLSOp {
+ const MCExpr *Imm;
+ const MCExpr *Sym;
+ };
+
+ union {
+ TokenOp Token;
+ RegOp Reg;
+ const MCExpr *Imm;
+ ImmTLSOp ImmTLS;
+ MemOp Mem;
+ };
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates when possible. Null MCExpr = 0.
+ if (!Expr)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+public:
+ SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc)
+ : Kind(kind), StartLoc(startLoc), EndLoc(endLoc) {}
+
+ // Create particular kinds of operand; each fills only its own union member.
+ static std::unique_ptr<SystemZOperand> createInvalid(SMLoc StartLoc,
+ SMLoc EndLoc) {
+ return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc);
+ }
+ static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) {
+ auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc);
+ Op->Token.Data = Str.data();
+ Op->Token.Length = Str.size();
+ return Op;
+ }
+ static std::unique_ptr<SystemZOperand>
+ createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
+ auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc);
+ Op->Reg.Kind = Kind;
+ Op->Reg.Num = Num;
+ return Op;
+ }
+ static std::unique_ptr<SystemZOperand>
+ createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) {
+ auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc);
+ Op->Imm = Expr;
+ return Op;
+ }
+ static std::unique_ptr<SystemZOperand>
+ createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base,
+ const MCExpr *Disp, unsigned Index, const MCExpr *LengthImm,
+ unsigned LengthReg, SMLoc StartLoc, SMLoc EndLoc) {
+ auto Op = make_unique<SystemZOperand>(KindMem, StartLoc, EndLoc);
+ Op->Mem.MemKind = MemKind;
+ Op->Mem.RegKind = RegKind;
+ Op->Mem.Base = Base;
+ Op->Mem.Index = Index;
+ Op->Mem.Disp = Disp;
+ if (MemKind == BDLMem)
+ Op->Mem.Length.Imm = LengthImm;
+ if (MemKind == BDRMem)
+ Op->Mem.Length.Reg = LengthReg;
+ return Op;
+ }
+ static std::unique_ptr<SystemZOperand>
+ createImmTLS(const MCExpr *Imm, const MCExpr *Sym,
+ SMLoc StartLoc, SMLoc EndLoc) {
+ auto Op = make_unique<SystemZOperand>(KindImmTLS, StartLoc, EndLoc);
+ Op->ImmTLS.Imm = Imm;
+ Op->ImmTLS.Sym = Sym;
+ return Op;
+ }
+
+ // Token operands.
+ bool isToken() const override {
+ return Kind == KindToken;
+ }
+ StringRef getToken() const {
+ assert(Kind == KindToken && "Not a token");
+ return StringRef(Token.Data, Token.Length);
+ }
+
+ // Register operands.
+ bool isReg() const override {
+ return Kind == KindReg;
+ }
+ bool isReg(RegisterKind RegKind) const {
+ return Kind == KindReg && Reg.Kind == RegKind;
+ }
+ unsigned getReg() const override {
+ assert(Kind == KindReg && "Not a register");
+ return Reg.Num;
+ }
+
+ // Immediate operands.
+ bool isImm() const override {
+ return Kind == KindImm;
+ }
+ bool isImm(int64_t MinValue, int64_t MaxValue) const {
+ return Kind == KindImm && inRange(Imm, MinValue, MaxValue);
+ }
+ const MCExpr *getImm() const {
+ assert(Kind == KindImm && "Not an immediate");
+ return Imm;
+ }
+
+ // Immediate operands with optional TLS symbol.
+ bool isImmTLS() const {
+ return Kind == KindImmTLS;
+ }
+
+ // Memory operands.
+ bool isMem() const override {
+ return Kind == KindMem;
+ }
+ bool isMem(MemoryKind MemKind) const {
+ return (Kind == KindMem &&
+ (Mem.MemKind == MemKind ||
+ // A BDMem can be treated as a BDXMem in which the index
+ // register field is 0.
+ (Mem.MemKind == BDMem && MemKind == BDXMem)));
+ }
+ bool isMem(MemoryKind MemKind, RegisterKind RegKind) const {
+ return isMem(MemKind) && Mem.RegKind == RegKind;
+ }
+ bool isMemDisp12(MemoryKind MemKind, RegisterKind RegKind) const {
+ return isMem(MemKind, RegKind) && inRange(Mem.Disp, 0, 0xfff);
+ }
+ bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const {
+ return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287);
+ }
+ bool isMemDisp12Len8(RegisterKind RegKind) const { // Length in [1, 256].
+ return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100);
+ }
+
+ // Override MCParsedAsmOperand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+ SMLoc getEndLoc() const override { return EndLoc; }
+ void print(raw_ostream &OS) const override;
+
+ // Used by the TableGen code to add particular types of operand
+ // to an instruction.
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands");
+ addExpr(Inst, getImm());
+ }
+ void addBDAddrOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands");
+ assert(isMem(BDMem) && "Invalid operand type");
+ Inst.addOperand(MCOperand::createReg(Mem.Base));
+ addExpr(Inst, Mem.Disp);
+ }
+ void addBDXAddrOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands");
+ assert(isMem(BDXMem) && "Invalid operand type");
+ Inst.addOperand(MCOperand::createReg(Mem.Base));
+ addExpr(Inst, Mem.Disp);
+ Inst.addOperand(MCOperand::createReg(Mem.Index));
+ }
+ void addBDLAddrOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands");
+ assert(isMem(BDLMem) && "Invalid operand type");
+ Inst.addOperand(MCOperand::createReg(Mem.Base));
+ addExpr(Inst, Mem.Disp);
+ addExpr(Inst, Mem.Length.Imm);
+ }
+ void addBDRAddrOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands");
+ assert(isMem(BDRMem) && "Invalid operand type");
+ Inst.addOperand(MCOperand::createReg(Mem.Base));
+ addExpr(Inst, Mem.Disp);
+ Inst.addOperand(MCOperand::createReg(Mem.Length.Reg));
+ }
+ void addBDVAddrOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands");
+ assert(isMem(BDVMem) && "Invalid operand type");
+ Inst.addOperand(MCOperand::createReg(Mem.Base));
+ addExpr(Inst, Mem.Disp);
+ Inst.addOperand(MCOperand::createReg(Mem.Index));
+ }
+ void addImmTLSOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands");
+ assert(Kind == KindImmTLS && "Invalid operand type");
+ addExpr(Inst, ImmTLS.Imm);
+ if (ImmTLS.Sym)
+ addExpr(Inst, ImmTLS.Sym);
+ }
+
+ // Used by the TableGen code to check for particular operand types.
+ bool isGR32() const { return isReg(GR32Reg); }
+ bool isGRH32() const { return isReg(GRH32Reg); }
+ bool isGRX32() const { return false; } // Pseudo-only class; never matched.
+ bool isGR64() const { return isReg(GR64Reg); }
+ bool isGR128() const { return isReg(GR128Reg); }
+ bool isADDR32() const { return isReg(ADDR32Reg); }
+ bool isADDR64() const { return isReg(ADDR64Reg); }
+ bool isADDR128() const { return false; } // Never used as an operand.
+ bool isFP32() const { return isReg(FP32Reg); }
+ bool isFP64() const { return isReg(FP64Reg); }
+ bool isFP128() const { return isReg(FP128Reg); }
+ bool isVR32() const { return isReg(VR32Reg); }
+ bool isVR64() const { return isReg(VR64Reg); }
+ bool isVF128() const { return false; } // Never used as an operand.
+ bool isVR128() const { return isReg(VR128Reg); }
+ bool isAR32() const { return isReg(AR32Reg); }
+ bool isAnyReg() const { return (isReg() || isImm(0, 15)); } // Or a raw 0-15.
+ bool isBDAddr32Disp12() const { return isMemDisp12(BDMem, ADDR32Reg); }
+ bool isBDAddr32Disp20() const { return isMemDisp20(BDMem, ADDR32Reg); }
+ bool isBDAddr64Disp12() const { return isMemDisp12(BDMem, ADDR64Reg); }
+ bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); }
+ bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); }
+ bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); }
+ bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); }
+ bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); }
+ bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); }
+ bool isU1Imm() const { return isImm(0, 1); }
+ bool isU2Imm() const { return isImm(0, 3); }
+ bool isU3Imm() const { return isImm(0, 7); }
+ bool isU4Imm() const { return isImm(0, 15); }
+ bool isU6Imm() const { return isImm(0, 63); }
+ bool isU8Imm() const { return isImm(0, 255); }
+ bool isS8Imm() const { return isImm(-128, 127); }
+ bool isU12Imm() const { return isImm(0, 4095); }
+ bool isU16Imm() const { return isImm(0, 65535); }
+ bool isS16Imm() const { return isImm(-32768, 32767); }
+ bool isU32Imm() const { return isImm(0, (1LL << 32) - 1); }
+ bool isS32Imm() const { return isImm(-(1LL << 31), (1LL << 31) - 1); }
+ bool isU48Imm() const { return isImm(0, (1LL << 48) - 1); }
+};
+
+class SystemZAsmParser : public MCTargetAsmParser {
+#define GET_ASSEMBLER_HEADER
+#include "SystemZGenAsmMatcher.inc"
+
+private:
+ MCAsmParser &Parser;
+ enum RegisterGroup {
+ RegGR,
+ RegFP,
+ RegV,
+ RegAR
+ };
+ struct Register { // Filled in by parseRegister(Register &).
+ RegisterGroup Group;
+ unsigned Num;
+ SMLoc StartLoc, EndLoc;
+ };
+
+ bool parseRegister(Register &Reg); // Returns true on error.
+ // As above, but also checks the group and maps Num through Regs.
+ bool parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs,
+ bool IsAddress = false);
+ // Operand-level wrapper: appends a register operand of kind Kind.
+ OperandMatchResultTy parseRegister(OperandVector &Operands,
+ RegisterGroup Group, const unsigned *Regs,
+ RegisterKind Kind);
+
+ OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
+
+ bool parseAddress(bool &HaveReg1, Register &Reg1,
+ bool &HaveReg2, Register &Reg2,
+ const MCExpr *&Disp, const MCExpr *&Length);
+ bool parseAddressRegister(Register &Reg);
+
+ bool ParseDirectiveInsn(SMLoc L);
+
+ OperandMatchResultTy parseAddress(OperandVector &Operands,
+ MemoryKind MemKind, const unsigned *Regs,
+ RegisterKind RegKind);
+
+ OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal,
+ int64_t MaxVal, bool AllowTLS);
+
+ bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
+
+public:
+ SystemZAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
+ const MCInstrInfo &MII,
+ const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, sti), Parser(parser) {
+ MCAsmParserExtension::Initialize(Parser);
+
+ // Alias the .word directive to .short.
+ parser.addAliasForDirective(".word", ".short");
+
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ }
+
+ // Override MCTargetAsmParser.
+ bool ParseDirective(AsmToken DirectiveID) override;
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+
+ // Used by the TableGen code to parse particular operand types.
+ OperandMatchResultTy parseGR32(OperandVector &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg);
+ }
+ OperandMatchResultTy parseGRH32(OperandVector &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg);
+ }
+ OperandMatchResultTy parseGRX32(OperandVector &Operands) {
+ llvm_unreachable("GRX32 should only be used for pseudo instructions");
+ }
+ OperandMatchResultTy parseGR64(OperandVector &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg);
+ }
+ OperandMatchResultTy parseGR128(OperandVector &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR128Regs, GR128Reg);
+ }
+ OperandMatchResultTy parseADDR32(OperandVector &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, ADDR32Reg);
+ }
+ OperandMatchResultTy parseADDR64(OperandVector &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, ADDR64Reg);
+ }
+ OperandMatchResultTy parseADDR128(OperandVector &Operands) {
+ llvm_unreachable("Shouldn't be used as an operand");
+ }
+ OperandMatchResultTy parseFP32(OperandVector &Operands) {
+ return parseRegister(Operands, RegFP, SystemZMC::FP32Regs, FP32Reg);
+ }
+ OperandMatchResultTy parseFP64(OperandVector &Operands) {
+ return parseRegister(Operands, RegFP, SystemZMC::FP64Regs, FP64Reg);
+ }
+ OperandMatchResultTy parseFP128(OperandVector &Operands) {
+ return parseRegister(Operands, RegFP, SystemZMC::FP128Regs, FP128Reg);
+ }
+ OperandMatchResultTy parseVR32(OperandVector &Operands) {
+ return parseRegister(Operands, RegV, SystemZMC::VR32Regs, VR32Reg);
+ }
+ OperandMatchResultTy parseVR64(OperandVector &Operands) {
+ return parseRegister(Operands, RegV, SystemZMC::VR64Regs, VR64Reg);
+ }
+ OperandMatchResultTy parseVF128(OperandVector &Operands) {
+ llvm_unreachable("Shouldn't be used as an operand");
+ }
+ OperandMatchResultTy parseVR128(OperandVector &Operands) {
+ return parseRegister(Operands, RegV, SystemZMC::VR128Regs, VR128Reg);
+ }
+ OperandMatchResultTy parseAR32(OperandVector &Operands) {
+ return parseRegister(Operands, RegAR, SystemZMC::AR32Regs, AR32Reg);
+ }
+ OperandMatchResultTy parseAnyReg(OperandVector &Operands) {
+ return parseAnyRegister(Operands);
+ }
+ OperandMatchResultTy parseBDAddr32(OperandVector &Operands) {
+ return parseAddress(Operands, BDMem, SystemZMC::GR32Regs, ADDR32Reg);
+ }
+ OperandMatchResultTy parseBDAddr64(OperandVector &Operands) {
+ return parseAddress(Operands, BDMem, SystemZMC::GR64Regs, ADDR64Reg);
+ }
+ OperandMatchResultTy parseBDXAddr64(OperandVector &Operands) {
+ return parseAddress(Operands, BDXMem, SystemZMC::GR64Regs, ADDR64Reg);
+ }
+ OperandMatchResultTy parseBDLAddr64(OperandVector &Operands) {
+ return parseAddress(Operands, BDLMem, SystemZMC::GR64Regs, ADDR64Reg);
+ }
+ OperandMatchResultTy parseBDRAddr64(OperandVector &Operands) {
+ return parseAddress(Operands, BDRMem, SystemZMC::GR64Regs, ADDR64Reg);
+ }
+ OperandMatchResultTy parseBDVAddr64(OperandVector &Operands) {
+ return parseAddress(Operands, BDVMem, SystemZMC::GR64Regs, ADDR64Reg);
+ }
+ OperandMatchResultTy parsePCRel12(OperandVector &Operands) {
+ return parsePCRel(Operands, -(1LL << 12), (1LL << 12) - 1, false);
+ }
+ OperandMatchResultTy parsePCRel16(OperandVector &Operands) {
+ return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1, false);
+ }
+ OperandMatchResultTy parsePCRel24(OperandVector &Operands) {
+ return parsePCRel(Operands, -(1LL << 24), (1LL << 24) - 1, false);
+ }
+ OperandMatchResultTy parsePCRel32(OperandVector &Operands) {
+ return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, false);
+ }
+ OperandMatchResultTy parsePCRelTLS16(OperandVector &Operands) {
+ return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1, true);
+ }
+ OperandMatchResultTy parsePCRelTLS32(OperandVector &Operands) {
+ return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, true);
+ }
+};
+} // end anonymous namespace
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "SystemZGenAsmMatcher.inc"
+
+// Used for the .insn directives; contains information needed to parse the
+// operands in the directive.
+struct InsnMatchEntry {
+ StringRef Format; // Key that CompareInsn orders and compares on.
+ uint64_t Opcode; // A SystemZ::Insn* opcode.
+ int32_t NumOperands; // Number of OperandKinds entries in use.
+ MatchClassKind OperandKinds[5]; // Expected class of each operand, in order.
+};
+
+// For equal_range comparison; const so it also works via a const comparator.
+struct CompareInsn {
+  bool operator()(const InsnMatchEntry &LHS, StringRef RHS) const {
+    return LHS.Format < RHS;
+  }
+  bool operator()(StringRef LHS, const InsnMatchEntry &RHS) const {
+    return LHS < RHS.Format;
+  }
+  bool operator()(const InsnMatchEntry &LHS, const InsnMatchEntry &RHS) const {
+    return LHS.Format < RHS.Format;
+  }
+};
+
+// .insn operand layouts; kept sorted by Format, as equal_range lookup requires.
+static struct InsnMatchEntry InsnMatchTable[] = {
+ /* Format, Opcode, NumOperands, OperandKinds (at most five per entry) */
+ { "e", SystemZ::InsnE, 1,
+ { MCK_U16Imm } },
+ { "ri", SystemZ::InsnRI, 3,
+ { MCK_U32Imm, MCK_AnyReg, MCK_S16Imm } },
+ { "rie", SystemZ::InsnRIE, 4,
+ { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_PCRel16 } },
+ { "ril", SystemZ::InsnRIL, 3,
+ { MCK_U48Imm, MCK_AnyReg, MCK_PCRel32 } },
+ { "rilu", SystemZ::InsnRILU, 3,
+ { MCK_U48Imm, MCK_AnyReg, MCK_U32Imm } },
+ { "ris", SystemZ::InsnRIS, 5,
+ { MCK_U48Imm, MCK_AnyReg, MCK_S8Imm, MCK_U4Imm, MCK_BDAddr64Disp12 } },
+ { "rr", SystemZ::InsnRR, 3,
+ { MCK_U16Imm, MCK_AnyReg, MCK_AnyReg } },
+ { "rre", SystemZ::InsnRRE, 3,
+ { MCK_U32Imm, MCK_AnyReg, MCK_AnyReg } },
+ { "rrf", SystemZ::InsnRRF, 5,
+ { MCK_U32Imm, MCK_AnyReg, MCK_AnyReg, MCK_AnyReg, MCK_U4Imm } },
+ { "rrs", SystemZ::InsnRRS, 5,
+ { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_U4Imm, MCK_BDAddr64Disp12 } },
+ { "rs", SystemZ::InsnRS, 4,
+ { MCK_U32Imm, MCK_AnyReg, MCK_AnyReg, MCK_BDAddr64Disp12 } },
+ { "rse", SystemZ::InsnRSE, 4,
+ { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_BDAddr64Disp12 } },
+ { "rsi", SystemZ::InsnRSI, 4,
+ { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_PCRel16 } },
+ { "rsy", SystemZ::InsnRSY, 4,
+ { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_BDAddr64Disp20 } },
+ { "rx", SystemZ::InsnRX, 3,
+ { MCK_U32Imm, MCK_AnyReg, MCK_BDXAddr64Disp12 } },
+ { "rxe", SystemZ::InsnRXE, 3,
+ { MCK_U48Imm, MCK_AnyReg, MCK_BDXAddr64Disp12 } },
+ { "rxf", SystemZ::InsnRXF, 4,
+ { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_BDXAddr64Disp12 } },
+ { "rxy", SystemZ::InsnRXY, 3,
+ { MCK_U48Imm, MCK_AnyReg, MCK_BDXAddr64Disp20 } },
+ { "s", SystemZ::InsnS, 2,
+ { MCK_U32Imm, MCK_BDAddr64Disp12 } },
+ { "si", SystemZ::InsnSI, 3,
+ { MCK_U32Imm, MCK_BDAddr64Disp12, MCK_S8Imm } },
+ { "sil", SystemZ::InsnSIL, 3,
+ { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_U16Imm } },
+ { "siy", SystemZ::InsnSIY, 3,
+ { MCK_U48Imm, MCK_BDAddr64Disp20, MCK_U8Imm } },
+ { "ss", SystemZ::InsnSS, 4,
+ { MCK_U48Imm, MCK_BDXAddr64Disp12, MCK_BDAddr64Disp12, MCK_AnyReg } },
+ { "sse", SystemZ::InsnSSE, 3,
+ { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_BDAddr64Disp12 } },
+ { "ssf", SystemZ::InsnSSF, 4,
+ { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_BDAddr64Disp12, MCK_AnyReg } }
+};
+
+void SystemZOperand::print(raw_ostream &OS) const {
+ llvm_unreachable("Not implemented"); // Operand dumping is unsupported here.
+}
+
+// Parse one register of the form %<prefix><number>. On success the group,
+// number and source range are stored in Reg and false is returned.
+bool SystemZAsmParser::parseRegister(Register &Reg) {
+  Reg.StartLoc = Parser.getTok().getLoc();
+
+  // A register always opens with '%'.
+  if (!Parser.getTok().is(AsmToken::Percent))
+    return Error(Parser.getTok().getLoc(), "register expected");
+  Parser.Lex();
+
+  // It continues with an identifier: a one-character prefix followed by a
+  // decimal register number.
+  if (!Parser.getTok().is(AsmToken::Identifier))
+    return Error(Reg.StartLoc, "invalid register");
+  StringRef Name = Parser.getTok().getString();
+  if (Name.size() < 2 || Name.substr(1).getAsInteger(10, Reg.Num))
+    return Error(Reg.StartLoc, "invalid register");
+
+  // The prefix selects the register group and with it the register count.
+  RegisterGroup Group;
+  unsigned NumRegs;
+  switch (Name[0]) {
+  case 'r': Group = RegGR; NumRegs = 16; break;
+  case 'f': Group = RegFP; NumRegs = 16; break;
+  case 'v': Group = RegV; NumRegs = 32; break;
+  case 'a': Group = RegAR; NumRegs = 16; break;
+  default:
+    return Error(Reg.StartLoc, "invalid register");
+  }
+
+  if (Reg.Num >= NumRegs)
+    return Error(Reg.StartLoc, "invalid register");
+  Reg.Group = Group;
+
+  // Consume the identifier; its location doubles as the end of the operand.
+  Reg.EndLoc = Parser.getTok().getLoc();
+  Parser.Lex();
+  return false;
+}
+
+// Parse a register and require it to belong to group Group. An FP register
+// is also accepted where RegV is expected, because the f-prefix yields the FP
+// group even when used with vector instructions. A nonnull Regs maps the raw
+// number to LLVM numbering, with zero entries marking invalid registers.
+// IsAddress says whether the register appears in an address context.
+bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group,
+                                     const unsigned *Regs, bool IsAddress) {
+  if (parseRegister(Reg))
+    return true;
+  const bool FPForVector = Reg.Group == RegFP && Group == RegV;
+  if (Reg.Group != Group && !FPForVector)
+    return Error(Reg.StartLoc, "invalid operand for instruction");
+  const unsigned Mapped = Regs ? Regs[Reg.Num] : Reg.Num;
+  if (Regs && Mapped == 0)
+    return Error(Reg.StartLoc, "invalid register pair");
+  if (IsAddress && Reg.Num == 0)
+    return Error(Reg.StartLoc, "%r0 used in an address");
+  Reg.Num = Mapped;
+  return false;
+}
+
+// Parse a register and add it to Operands as a Kind operand; NoMatch if the next token is not '%'.
+OperandMatchResultTy
+SystemZAsmParser::parseRegister(OperandVector &Operands, RegisterGroup Group,
+ const unsigned *Regs, RegisterKind Kind) { // Group and Regs are forwarded to the four-argument overload
+ if (Parser.getTok().isNot(AsmToken::Percent))
+ return MatchOperand_NoMatch;
+
+ Register Reg;
+ bool IsAddress = (Kind == ADDR32Reg || Kind == ADDR64Reg); // address kinds enable the register-0 check
+ if (parseRegister(Reg, Group, Regs, IsAddress))
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(SystemZOperand::createReg(Kind, Reg.Num,
+ Reg.StartLoc, Reg.EndLoc));
+ return MatchOperand_Success;
+}
+
+// Parse any type of register (or an integer expression standing in for one) and add it to Operands.
+OperandMatchResultTy
+SystemZAsmParser::parseAnyRegister(OperandVector &Operands) {
+ // Handle integer values: they are stored as an immediate operand, not a register.
+ if (Parser.getTok().is(AsmToken::Integer)) {
+ const MCExpr *Register;
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ if (Parser.parseExpression(Register))
+ return MatchOperand_ParseFail;
+
+ if (auto *CE = dyn_cast<MCConstantExpr>(Register)) { // non-constant expressions skip the range check
+ int64_t Value = CE->getValue();
+ if (Value < 0 || Value > 15) {
+ Error(StartLoc, "invalid register");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ SMLoc EndLoc =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); // one byte before the current token
+
+ Operands.push_back(SystemZOperand::createImm(Register, StartLoc, EndLoc));
+ }
+ else {
+ Register Reg;
+ if (parseRegister(Reg))
+ return MatchOperand_ParseFail;
+
+ // Map the group to a register kind (GR64, FP64, VR128 or AR32) and LLVM number.
+ RegisterKind Kind;
+ unsigned RegNo;
+ if (Reg.Group == RegGR) {
+ Kind = GR64Reg;
+ RegNo = SystemZMC::GR64Regs[Reg.Num];
+ }
+ else if (Reg.Group == RegFP) {
+ Kind = FP64Reg;
+ RegNo = SystemZMC::FP64Regs[Reg.Num];
+ }
+ else if (Reg.Group == RegV) {
+ Kind = VR128Reg;
+ RegNo = SystemZMC::VR128Regs[Reg.Num];
+ }
+ else if (Reg.Group == RegAR) {
+ Kind = AR32Reg;
+ RegNo = SystemZMC::AR32Regs[Reg.Num];
+ }
+ else {
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(SystemZOperand::createReg(Kind, RegNo,
+ Reg.StartLoc, Reg.EndLoc));
+ }
+ return MatchOperand_Success;
+}
+
+// Parse Disp, Disp(X) or Disp(X,Reg2), where X is a register (Reg1) or a Length expression; false on success.
+bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1,
+ bool &HaveReg2, Register &Reg2,
+ const MCExpr *&Disp,
+ const MCExpr *&Length) {
+ // Parse the displacement, which must always be present.
+ if (getParser().parseExpression(Disp))
+ return true;
+
+ // Parse the optional parenthesized part; the flags and Length are reset first.
+ HaveReg1 = false;
+ HaveReg2 = false;
+ Length = nullptr;
+ if (getLexer().is(AsmToken::LParen)) {
+ Parser.Lex();
+
+ if (getLexer().is(AsmToken::Percent)) {
+ // Parse the first register.
+ HaveReg1 = true;
+ if (parseRegister(Reg1))
+ return true;
+ } else {
+ // No '%', so the first item is a length expression.
+ if (getParser().parseExpression(Length))
+ return true;
+ }
+
+ // Check whether there's a second register.
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex();
+ HaveReg2 = true;
+ if (parseRegister(Reg2))
+ return true;
+ }
+
+ // Consume the closing parenthesis.
+ if (getLexer().isNot(AsmToken::RParen))
+ return Error(Parser.getTok().getLoc(), "unexpected token in address");
+ Parser.Lex();
+ }
+ return false;
+}
+
+// Verify that Reg is a valid address register (base or index): reports an error and returns true unless it is a GR other than number 0.
+bool
+SystemZAsmParser::parseAddressRegister(Register &Reg) {
+ if (Reg.Group == RegV) {
+ Error(Reg.StartLoc, "invalid use of vector addressing");
+ return true;
+ } else if (Reg.Group != RegGR) {
+ Error(Reg.StartLoc, "invalid address register");
+ return true;
+ } else if (Reg.Num == 0) {
+ Error(Reg.StartLoc, "%r0 used in an address");
+ return true;
+ }
+ return false;
+}
+
+// Parse a memory operand, check its pieces against MemKind and add it to Operands.
+// Regs maps raw base/index register numbers to LLVM registers; RegKind is stored in the operand.
+OperandMatchResultTy
+SystemZAsmParser::parseAddress(OperandVector &Operands, MemoryKind MemKind,
+ const unsigned *Regs, RegisterKind RegKind) {
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ unsigned Base = 0, Index = 0, LengthReg = 0; // 0 is what createMem receives for a missing register
+ Register Reg1, Reg2;
+ bool HaveReg1, HaveReg2;
+ const MCExpr *Disp;
+ const MCExpr *Length;
+ if (parseAddress(HaveReg1, Reg1, HaveReg2, Reg2, Disp, Length))
+ return MatchOperand_ParseFail;
+
+ switch (MemKind) {
+ case BDMem:
+ // If we have Reg1, it must be an address register.
+ if (HaveReg1) {
+ if (parseAddressRegister(Reg1))
+ return MatchOperand_ParseFail;
+ Base = Regs[Reg1.Num];
+ }
+ // There must be no Reg2 or length.
+ if (Length) {
+ Error(StartLoc, "invalid use of length addressing");
+ return MatchOperand_ParseFail;
+ }
+ if (HaveReg2) {
+ Error(StartLoc, "invalid use of indexed addressing");
+ return MatchOperand_ParseFail;
+ }
+ break;
+ case BDXMem:
+ // If we have Reg1, it must be an address register.
+ if (HaveReg1) {
+ if (parseAddressRegister(Reg1))
+ return MatchOperand_ParseFail;
+ // If there are two registers, the first one is the index and the
+ // second is the base.
+ if (HaveReg2)
+ Index = Regs[Reg1.Num];
+ else
+ Base = Regs[Reg1.Num];
+ }
+ // If we have Reg2, it must be an address register.
+ if (HaveReg2) {
+ if (parseAddressRegister(Reg2))
+ return MatchOperand_ParseFail;
+ Base = Regs[Reg2.Num];
+ }
+ // There must be no length.
+ if (Length) {
+ Error(StartLoc, "invalid use of length addressing");
+ return MatchOperand_ParseFail;
+ }
+ break;
+ case BDLMem:
+ // If we have Reg2, it must be an address register.
+ if (HaveReg2) {
+ if (parseAddressRegister(Reg2))
+ return MatchOperand_ParseFail;
+ Base = Regs[Reg2.Num];
+ }
+ // We cannot support base+index addressing.
+ if (HaveReg1 && HaveReg2) {
+ Error(StartLoc, "invalid use of indexed addressing");
+ return MatchOperand_ParseFail;
+ }
+ // We must have a length (a lone first register leaves Length null and ends up here too).
+ if (!Length) {
+ Error(StartLoc, "missing length in address");
+ return MatchOperand_ParseFail;
+ }
+ break;
+ case BDRMem:
+ // We must have Reg1, and it must be a GPR.
+ if (!HaveReg1 || Reg1.Group != RegGR) {
+ Error(StartLoc, "invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+ LengthReg = SystemZMC::GR64Regs[Reg1.Num]; // Reg1 is the length register here, not the base
+ // If we have Reg2, it must be an address register.
+ if (HaveReg2) {
+ if (parseAddressRegister(Reg2))
+ return MatchOperand_ParseFail;
+ Base = Regs[Reg2.Num];
+ }
+ // There must be no length.
+ if (Length) {
+ Error(StartLoc, "invalid use of length addressing");
+ return MatchOperand_ParseFail;
+ }
+ break;
+ case BDVMem:
+ // We must have Reg1, and it must be a vector register.
+ if (!HaveReg1 || Reg1.Group != RegV) {
+ Error(StartLoc, "vector index required in address");
+ return MatchOperand_ParseFail;
+ }
+ Index = SystemZMC::VR128Regs[Reg1.Num];
+ // If we have Reg2, it must be an address register.
+ if (HaveReg2) {
+ if (parseAddressRegister(Reg2))
+ return MatchOperand_ParseFail;
+ Base = Regs[Reg2.Num];
+ }
+ // There must be no length.
+ if (Length) {
+ Error(StartLoc, "invalid use of length addressing");
+ return MatchOperand_ParseFail;
+ }
+ break;
+ }
+
+ SMLoc EndLoc =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); // one byte before the current token
+ Operands.push_back(SystemZOperand::createMem(MemKind, RegKind, Base, Disp,
+ Index, Length, LengthReg,
+ StartLoc, EndLoc));
+ return MatchOperand_Success;
+}
+
+bool SystemZAsmParser::ParseDirective(AsmToken DirectiveID) { // Dispatches target directives; only .insn is handled here.
+ StringRef IDVal = DirectiveID.getIdentifier();
+
+ if (IDVal == ".insn")
+ return ParseDirectiveInsn(DirectiveID.getLoc());
+
+ return true; // any directive other than .insn
+}
+
+/// ParseDirectiveInsn
+/// ::= .insn [ format, encoding, (operands (, operands)*) ]
+bool SystemZAsmParser::ParseDirectiveInsn(SMLoc L) { // L is not used in the body; returns false on success.
+ MCAsmParser &Parser = getParser();
+
+ // Expect instruction format as identifier.
+ StringRef Format;
+ SMLoc ErrorLoc = Parser.getTok().getLoc();
+ if (Parser.parseIdentifier(Format))
+ return Error(ErrorLoc, "expected instruction format");
+
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> Operands;
+
+ // Find entry for this format in InsnMatchTable.
+ auto EntryRange =
+ std::equal_range(std::begin(InsnMatchTable), std::end(InsnMatchTable),
+ Format, CompareInsn());
+
+ // If first == second, couldn't find a match in the table.
+ if (EntryRange.first == EntryRange.second)
+ return Error(ErrorLoc, "unrecognized format");
+
+ struct InsnMatchEntry *Entry = EntryRange.first;
+
+ // Format should match from equal_range.
+ assert(Entry->Format == Format);
+
+ // Parse the following operands using the table's information.
+ for (int i = 0; i < Entry->NumOperands; i++) {
+ MatchClassKind Kind = Entry->OperandKinds[i];
+
+ SMLoc StartLoc = Parser.getTok().getLoc();
+
+ // Always expect a comma before each operand, including the first one.
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(StartLoc, "unexpected token in directive");
+ Lex();
+
+ // Parse operands.
+ OperandMatchResultTy ResTy;
+ if (Kind == MCK_AnyReg)
+ ResTy = parseAnyReg(Operands);
+ else if (Kind == MCK_BDXAddr64Disp12 || Kind == MCK_BDXAddr64Disp20)
+ ResTy = parseBDXAddr64(Operands);
+ else if (Kind == MCK_BDAddr64Disp12 || Kind == MCK_BDAddr64Disp20)
+ ResTy = parseBDAddr64(Operands);
+ else if (Kind == MCK_PCRel32)
+ ResTy = parsePCRel32(Operands);
+ else if (Kind == MCK_PCRel16)
+ ResTy = parsePCRel16(Operands);
+ else {
+ // Only remaining operand kind is an immediate.
+ const MCExpr *Expr;
+ SMLoc StartLoc = Parser.getTok().getLoc(); // shadows the outer StartLoc; this one is taken after the comma
+
+ // Expect immediate expression.
+ if (Parser.parseExpression(Expr))
+ return Error(StartLoc, "unexpected token in directive");
+
+ SMLoc EndLoc =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+ ResTy = MatchOperand_Success;
+ }
+
+ if (ResTy != MatchOperand_Success) // NoMatch and ParseFail both abort the directive
+ return true;
+ }
+
+ // Build the instruction with the parsed operands.
+ MCInst Inst = MCInstBuilder(Entry->Opcode);
+
+ for (size_t i = 0; i < Operands.size(); i++) {
+ MCParsedAsmOperand &Operand = *Operands[i];
+ MatchClassKind Kind = Entry->OperandKinds[i];
+
+ // Verify operand.
+ unsigned Res = validateOperandClass(Operand, Kind);
+ if (Res != Match_Success)
+ return Error(Operand.getStartLoc(), "unexpected operand type");
+
+ // Add operands to instruction; the count passed is 1 for reg/imm, 2 for BD, 3 for BDX.
+ SystemZOperand &ZOperand = static_cast<SystemZOperand &>(Operand);
+ if (ZOperand.isReg())
+ ZOperand.addRegOperands(Inst, 1);
+ else if (ZOperand.isMem(BDMem))
+ ZOperand.addBDAddrOperands(Inst, 2);
+ else if (ZOperand.isMem(BDXMem))
+ ZOperand.addBDXAddrOperands(Inst, 3);
+ else if (ZOperand.isImm())
+ ZOperand.addImmOperands(Inst, 1);
+ else
+ llvm_unreachable("unexpected operand type");
+ }
+
+ // Emit as a regular instruction.
+ Parser.getStreamer().EmitInstruction(Inst, getSTI());
+
+ return false;
+}
+
+bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) { // Parses a register of any group; outputs its LLVM number and locations, false on success.
+ Register Reg;
+ if (parseRegister(Reg))
+ return true;
+ if (Reg.Group == RegGR) // same group-to-table mapping as parseAnyRegister
+ RegNo = SystemZMC::GR64Regs[Reg.Num];
+ else if (Reg.Group == RegFP)
+ RegNo = SystemZMC::FP64Regs[Reg.Num];
+ else if (Reg.Group == RegV)
+ RegNo = SystemZMC::VR128Regs[Reg.Num];
+ else if (Reg.Group == RegAR)
+ RegNo = SystemZMC::AR32Regs[Reg.Num];
+ StartLoc = Reg.StartLoc;
+ EndLoc = Reg.EndLoc;
+ return false;
+}
+
+bool SystemZAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) { // Info is not used in the body; returns false on success.
+ Operands.push_back(SystemZOperand::createToken(Name, NameLoc)); // the mnemonic becomes the first operand
+
+ // Read the comma-separated operands, if any, up to the end of the statement.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ if (parseOperand(Operands, Name)) {
+ return true;
+ }
+
+ // Read any subsequent operands.
+ while (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex();
+ if (parseOperand(Operands, Name)) {
+ return true;
+ }
+ }
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token in argument list");
+ }
+ }
+
+ // Consume the EndOfStatement.
+ Parser.Lex();
+ return false;
+}
+
+bool SystemZAsmParser::parseOperand(OperandVector &Operands,
+ StringRef Mnemonic) {
+ // Check if the current operand has a custom associated parser, if so, try to
+ // custom parse the operand, or fallback to the general approach. Force all
+ // features to be available during the operand check, or else we will fail to
+ // find the custom parser, and then we will later get an InvalidOperand error
+ // instead of a MissingFeature error.
+ uint64_t AvailableFeatures = getAvailableFeatures();
+ setAvailableFeatures(~(uint64_t)0);
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+ setAvailableFeatures(AvailableFeatures); // restore the real feature set
+ if (ResTy == MatchOperand_Success)
+ return false;
+
+ // If there wasn't a custom match, try the generic matcher below. Otherwise,
+ // there was a match, but an error occurred, in which case, just return that
+ // the operand parsing failed.
+ if (ResTy == MatchOperand_ParseFail)
+ return true;
+
+ // Check for a register. All real register operands should have used
+ // a context-dependent parse routine, which gives the required register
+ // class. The code is here to mop up other cases, like those where
+ // the instruction isn't recognized.
+ if (Parser.getTok().is(AsmToken::Percent)) {
+ Register Reg;
+ if (parseRegister(Reg))
+ return true;
+ Operands.push_back(SystemZOperand::createInvalid(Reg.StartLoc, Reg.EndLoc)); // recorded as an invalid operand
+ return false;
+ }
+
+ // The only other type of operand is an immediate or address. As above,
+ // real address operands should have used a context-dependent parse routine,
+ // so we treat any plain expression as an immediate.
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ Register Reg1, Reg2;
+ bool HaveReg1, HaveReg2;
+ const MCExpr *Expr;
+ const MCExpr *Length;
+ if (parseAddress(HaveReg1, Reg1, HaveReg2, Reg2, Expr, Length))
+ return true;
+ // If the register combination is not valid for any instruction, reject it.
+ // Otherwise, fall back to reporting an unrecognized instruction.
+ if (HaveReg1 && Reg1.Group != RegGR && Reg1.Group != RegV
+ && parseAddressRegister(Reg1)) // only reached for a Reg1 that is neither GR nor vector
+ return true;
+ if (HaveReg2 && parseAddressRegister(Reg2))
+ return true;
+
+ SMLoc EndLoc =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ if (HaveReg1 || HaveReg2 || Length) // anything with registers or a length is recorded as invalid
+ Operands.push_back(SystemZOperand::createInvalid(StartLoc, EndLoc));
+ else
+ Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+ return false;
+}
+
+bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) { // Opcode is not written in this body; returns false once the instruction is emitted.
+ MCInst Inst;
+ unsigned MatchResult;
+
+ MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm);
+ switch (MatchResult) {
+ case Match_Success:
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst, getSTI());
+ return false;
+
+ case Match_MissingFeature: {
+ assert(ErrorInfo && "Unknown missing feature!");
+ // Build one message that names every missing subtarget feature whose
+ // bit is set in ErrorInfo.
+ std::string Msg = "instruction requires:";
+ uint64_t Mask = 1;
+ for (unsigned I = 0; I < sizeof(ErrorInfo) * 8 - 1; ++I) { // NOTE(review): bound visits bits 0..62 only - confirm bit 63 is meant to be skipped
+ if (ErrorInfo & Mask) {
+ Msg += " ";
+ Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+ }
+ Mask <<= 1;
+ }
+ return Error(IDLoc, Msg);
+ }
+
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) { // otherwise ErrorInfo is used as an index into Operands
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((SystemZOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc()) // fall back to the mnemonic location
+ ErrorLoc = IDLoc;
+ }
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+
+ case Match_MnemonicFail:
+ return Error(IDLoc, "invalid instruction");
+ }
+
+ llvm_unreachable("Unexpected match type");
+}
+
+OperandMatchResultTy
+SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
+ int64_t MaxVal, bool AllowTLS) { // Parses a PC-relative operand, optionally followed by a TLS call marker.
+ MCContext &Ctx = getContext();
+ MCStreamer &Out = getStreamer();
+ const MCExpr *Expr;
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_NoMatch; // an unparsable expression is reported as NoMatch, not ParseFail
+
+ // For consistency with the GNU assembler, treat immediates as offsets
+ // from ".".
+ if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
+ int64_t Value = CE->getValue();
+ if ((Value & 1) || Value < MinVal || Value > MaxVal) { // must be even and within [MinVal, MaxVal]
+ Error(StartLoc, "offset out of range");
+ return MatchOperand_ParseFail;
+ }
+ MCSymbol *Sym = Ctx.createTempSymbol();
+ Out.EmitLabel(Sym); // the temporary label stands for "."
+ const MCExpr *Base = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
+ Ctx);
+ Expr = Value == 0 ? Base : MCBinaryExpr::createAdd(Base, Expr, Ctx); // zero offset: use the label itself
+ }
+
+ // Optionally match :tls_gdcall: or :tls_ldcall: followed by a TLS symbol.
+ const MCExpr *Sym = nullptr;
+ if (AllowTLS && getLexer().is(AsmToken::Colon)) {
+ Parser.Lex();
+
+ if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ Error(Parser.getTok().getLoc(), "unexpected token");
+ return MatchOperand_ParseFail;
+ }
+
+ MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+ StringRef Name = Parser.getTok().getString();
+ if (Name == "tls_gdcall")
+ Kind = MCSymbolRefExpr::VK_TLSGD;
+ else if (Name == "tls_ldcall")
+ Kind = MCSymbolRefExpr::VK_TLSLDM;
+ else {
+ Error(Parser.getTok().getLoc(), "unknown TLS tag");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+
+ if (Parser.getTok().isNot(AsmToken::Colon)) {
+ Error(Parser.getTok().getLoc(), "unexpected token");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+
+ if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ Error(Parser.getTok().getLoc(), "unexpected token");
+ return MatchOperand_ParseFail;
+ }
+
+ StringRef Identifier = Parser.getTok().getString();
+ Sym = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(Identifier),
+ Kind, Ctx);
+ Parser.Lex();
+ }
+
+ SMLoc EndLoc =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ if (AllowTLS) // Sym stays null when no TLS marker was given
+ Operands.push_back(SystemZOperand::createImmTLS(Expr, Sym,
+ StartLoc, EndLoc));
+ else
+ Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+
+ return MatchOperand_Success;
+}
+
+// Force static initialization: registers SystemZAsmParser for the SystemZ target.
+extern "C" void LLVMInitializeSystemZAsmParser() {
+ RegisterMCAsmParser<SystemZAsmParser> X(getTheSystemZTarget());
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
new file mode 100644
index 000000000000..1806e015f61e
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -0,0 +1,451 @@
+//===-- SystemZDisassembler.cpp - Disassembler for SystemZ ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class SystemZDisassembler : public MCDisassembler { // MCDisassembler subclass that only overrides getInstruction()
+public:
+ SystemZDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+ : MCDisassembler(STI, Ctx) {}
+ ~SystemZDisassembler() override {}
+
+ DecodeStatus getInstruction(MCInst &instr, uint64_t &Size, // Size is an output parameter
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream,
+ raw_ostream &CStream) const override;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createSystemZDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) { // Factory for the target registry; T is unused.
+ return new SystemZDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeSystemZDisassembler() {
+ // Register the disassembler factory for the SystemZ target.
+ TargetRegistry::RegisterMCDisassembler(getTheSystemZTarget(),
+ createSystemZDisassembler);
+}
+
+/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+/// immediate Value in the MCInst.
+///
+/// @param Value - The immediate Value, has had any PC adjustment made by
+/// the caller.
+/// @param isBranch - If the instruction is a branch instruction
+/// @param Address - The starting address of the instruction
+/// @param Offset - The byte offset to this immediate in the instruction
+/// @param Width - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width. If that
+/// returns non-zero then the symbolic information it returns is used to create
+/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created. This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); // Decoder must point at an MCDisassembler
+ return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+ Offset, Width);
+}
+
+static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
+ const unsigned *Regs, unsigned Size) { // Adds Regs[RegNo] as a register operand; RegNo must be below Size.
+ assert(RegNo < Size && "Invalid register");
+ RegNo = Regs[RegNo];
+ if (RegNo == 0) // a zero table entry makes the decode fail
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createReg(RegNo));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) { // Each wrapper in this family forwards to decodeRegisterClass; Address and Decoder are unused.
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs, 16);
+}
+
+static DecodeStatus DecodeGRH32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GRH32Regs, 16);
+}
+
+static DecodeStatus DecodeGR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, 16);
+}
+
+static DecodeStatus DecodeGR128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR128Regs, 16);
+}
+
+static DecodeStatus DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, 16); // same table as the GR64 decoder
+}
+
+static DecodeStatus DecodeFP32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::FP32Regs, 16);
+}
+
+static DecodeStatus DecodeFP64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::FP64Regs, 16);
+}
+
+static DecodeStatus DecodeFP128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::FP128Regs, 16);
+}
+
+static DecodeStatus DecodeVR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::VR32Regs, 32); // the vector decoders pass a bound of 32
+}
+
+static DecodeStatus DecodeVR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::VR64Regs, 32);
+}
+
+static DecodeStatus DecodeVR128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::VR128Regs, 32);
+}
+
+static DecodeStatus DecodeAR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::AR32Regs, 16);
+}
+
+template<unsigned N>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) { // Adds Imm as an immediate operand if isUInt<N>(Imm) holds.
+ if (!isUInt<N>(Imm))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(Imm));
+ return MCDisassembler::Success;
+}
+
+template<unsigned N>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm) { // Adds SignExtend64<N>(Imm) as an immediate operand.
+ if (!isUInt<N>(Imm)) // range-checked as unsigned before the sign extension below
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(SignExtend64<N>(Imm)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeU1ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) { // Each wrapper in this family fixes the bit width N; Address and Decoder are unused.
+ return decodeUImmOperand<1>(Inst, Imm);
+}
+
+static DecodeStatus decodeU2ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<2>(Inst, Imm);
+}
+
+static DecodeStatus decodeU3ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<3>(Inst, Imm);
+}
+
+static DecodeStatus decodeU4ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<4>(Inst, Imm);
+}
+
+static DecodeStatus decodeU6ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<6>(Inst, Imm);
+}
+
+static DecodeStatus decodeU8ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeU12ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<12>(Inst, Imm);
+}
+
+static DecodeStatus decodeU16ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeU32ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<32>(Inst, Imm);
+}
+
+static DecodeStatus decodeS8ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) { // The S* wrappers use the sign-extending template instead.
+ return decodeSImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeS16ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeSImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeS32ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeSImmOperand<32>(Inst, Imm);
+}
+
+template<unsigned N>
+static DecodeStatus decodePCDBLOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ bool isBranch,
+ const void *Decoder) { // Decodes an N-bit PC-relative field into a target value; always returns Success.
+ assert(isUInt<N>(Imm) && "Invalid PC-relative offset");
+ uint64_t Value = SignExtend64<N>(Imm) * 2 + Address; // signed N-bit offset, doubled, added to Address
+
+ if (!tryAddingSymbolicOperand(Value, isBranch, Address, 2, N / 8, // 2 is passed as Offset, N / 8 as Width
+ Inst, Decoder))
+ Inst.addOperand(MCOperand::createImm(Value)); // no symbolic operand was added: use the plain value
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodePC12DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) { // Each wrapper in this family fixes the field width N and the isBranch flag.
+ return decodePCDBLOperand<12>(Inst, Imm, Address, true, Decoder);
+}
+
+static DecodeStatus decodePC16DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodePCDBLOperand<16>(Inst, Imm, Address, true, Decoder);
+}
+
+static DecodeStatus decodePC24DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodePCDBLOperand<24>(Inst, Imm, Address, true, Decoder);
+}
+
+static DecodeStatus decodePC32DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodePCDBLOperand<32>(Inst, Imm, Address, true, Decoder);
+}
+
+static DecodeStatus decodePC32DBLOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodePCDBLOperand<32>(Inst, Imm, Address, false, Decoder); // the only non-branch variant
+}
+
+static DecodeStatus decodeBDAddr12Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) { // Field holds the base above bit 11 and a 12-bit displacement in bits 0-11.
+ uint64_t Base = Field >> 12;
+ uint64_t Disp = Field & 0xfff;
+ assert(Base < 16 && "Invalid BDAddr12");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base])); // base number 0 becomes register operand 0
+ Inst.addOperand(MCOperand::createImm(Disp));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr20Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) { // Field holds the base above bit 19 and a split 20-bit displacement below it.
+ uint64_t Base = Field >> 20;
+ uint64_t Disp = ((Field << 12) & 0xff000) | ((Field >> 8) & 0xfff); // Field bits 0-7 are Disp bits 12-19; Field bits 8-19 are Disp bits 0-11
+ assert(Base < 16 && "Invalid BDAddr20");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base])); // base number 0 becomes register operand 0
+ Inst.addOperand(MCOperand::createImm(SignExtend64<20>(Disp)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr12Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) { // Field: index above bit 15, base in bits 12-15, displacement in bits 0-11.
+ uint64_t Index = Field >> 16;
+ uint64_t Base = (Field >> 12) & 0xf;
+ uint64_t Disp = Field & 0xfff;
+ assert(Index < 16 && "Invalid BDXAddr12");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base])); // operands are added in base, disp, index order
+ Inst.addOperand(MCOperand::createImm(Disp));
+ Inst.addOperand(MCOperand::createReg(Index == 0 ? 0 : Regs[Index]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) { // Field: index above bit 23, base in bits 20-23, split 20-bit displacement below.
+ uint64_t Index = Field >> 24;
+ uint64_t Base = (Field >> 20) & 0xf;
+ uint64_t Disp = ((Field & 0xfff00) >> 8) | ((Field & 0xff) << 12); // Field bits 8-19 are Disp bits 0-11; Field bits 0-7 are Disp bits 12-19
+ assert(Index < 16 && "Invalid BDXAddr20");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base])); // operands are added in base, disp, index order
+ Inst.addOperand(MCOperand::createImm(SignExtend64<20>(Disp)));
+ Inst.addOperand(MCOperand::createReg(Index == 0 ? 0 : Regs[Index]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) { // Field: length above bit 15, base in bits 12-15, displacement in bits 0-11.
+ uint64_t Length = Field >> 16;
+ uint64_t Base = (Field >> 12) & 0xf;
+ uint64_t Disp = Field & 0xfff;
+ assert(Length < 256 && "Invalid BDLAddr12Len8");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::createImm(Disp));
+ Inst.addOperand(MCOperand::createImm(Length + 1)); // the operand is the encoded length plus one
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDRAddr12Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) { // Field: length register above bit 15, base in bits 12-15, displacement in bits 0-11.
+ uint64_t Length = Field >> 16; // a register number here, looked up in Regs below
+ uint64_t Base = (Field >> 12) & 0xf;
+ uint64_t Disp = Field & 0xfff;
+ assert(Length < 16 && "Invalid BDRAddr12");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::createImm(Disp));
+ Inst.addOperand(MCOperand::createReg(Regs[Length])); // unlike the base, number 0 is not special-cased
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDVAddr12Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) { // Field: vector index above bit 15, base in bits 12-15, displacement in bits 0-11.
+ uint64_t Index = Field >> 16;
+ uint64_t Base = (Field >> 12) & 0xf;
+ uint64_t Disp = Field & 0xfff;
+ assert(Index < 32 && "Invalid BDVAddr12");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base])); // Regs is used for the base only
+ Inst.addOperand(MCOperand::createImm(Disp));
+ Inst.addOperand(MCOperand::createReg(SystemZMC::VR128Regs[Index])); // the index always comes from VR128Regs
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr32Disp12Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) { // Each wrapper in this family picks the register table; Address and Decoder are unused.
+ return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr32Disp20Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDXAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst,
+ uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDLAddr12Len8Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDRAddr64Disp12Operand(MCInst &Inst,
+ uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDRAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDVAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDVAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+#include "SystemZGenDisassemblerTables.inc"
+
+DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &OS,
+ raw_ostream &CS) const { // OS and CS are not used in the body.
+ // At least two bytes are required; Size stays 0 when they are missing.
+ Size = 0;
+ if (Bytes.size() < 2)
+ return MCDisassembler::Fail;
+
+ // The top 2 bits of the first byte specify the size: 00 is 2 bytes, 01 and 10 are 4, 11 is 6.
+ const uint8_t *Table;
+ if (Bytes[0] < 0x40) {
+ Size = 2;
+ Table = DecoderTable16;
+ } else if (Bytes[0] < 0xc0) {
+ Size = 4;
+ Table = DecoderTable32;
+ } else {
+ Size = 6;
+ Table = DecoderTable48;
+ }
+
+ // Fail if the buffer is shorter than the instruction; Size keeps the expected length.
+ if (Bytes.size() < Size)
+ return MCDisassembler::Fail;
+
+ // Construct the instruction value, first byte most significant.
+ uint64_t Inst = 0;
+ for (uint64_t I = 0; I < Size; ++I)
+ Inst = (Inst << 8) | Bytes[I];
+
+ return decodeInstruction(Table, MI, Inst, Address, this, STI);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
new file mode 100644
index 000000000000..1207c7b327e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -0,0 +1,231 @@
+//===-- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstPrinter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "SystemZGenAsmWriter.inc"
+
+// Write an address as "Disp", "Disp(%Base)", "Disp(%Index)" or
+// "Disp(%Index,%Base)".  A register number of 0 means "no register"; the
+// parenthesised part is omitted entirely when both Base and Index are 0.
+void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp,
+                                      unsigned Index, raw_ostream &O) {
+  O << Disp;
+  if (!Base && !Index)
+    return;
+
+  O << '(';
+  if (Index)
+    O << '%' << getRegisterName(Index);
+  // The separator is only needed when both registers are printed.
+  if (Index && Base)
+    O << ',';
+  if (Base)
+    O << '%' << getRegisterName(Base);
+  O << ')';
+}
+
+// Write a single operand to O: registers as '%' plus the register name,
+// immediates as their integer value, expressions via MCExpr::print with MAI.
+// Any other operand kind is a programming error (llvm_unreachable).
+void SystemZInstPrinter::printOperand(const MCOperand &MO, const MCAsmInfo *MAI,
+                                      raw_ostream &O) {
+  if (MO.isReg()) {
+    O << '%' << getRegisterName(MO.getReg());
+    return;
+  }
+  if (MO.isImm()) {
+    O << MO.getImm();
+    return;
+  }
+  if (MO.isExpr()) {
+    MO.getExpr()->print(O, MAI);
+    return;
+  }
+  llvm_unreachable("Invalid operand");
+}
+
+// MCInstPrinter override: print MI through printInstruction, then hand
+// Annot to printAnnotation.  STI is not used here.
+void SystemZInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot,
+ const MCSubtargetInfo &STI) {
+ printInstruction(MI, O);
+ printAnnotation(O, Annot);
+}
+
+// MCInstPrinter override: write '%' followed by the name of register RegNo.
+void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+ O << '%' << getRegisterName(RegNo);
+}
+
+// Write the immediate held in operand OpNum of MI to O.  In builds with
+// assertions enabled, checks that the value fits in N unsigned bits.
+template <unsigned N>
+static void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isUInt<N>(Value) && "Invalid uimm argument");
+ O << Value;
+}
+
+// Write the immediate held in operand OpNum of MI to O.  In builds with
+// assertions enabled, checks that the value fits in N signed bits.
+template <unsigned N>
+static void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isInt<N>(Value) && "Invalid simm argument");
+ O << Value;
+}
+
+// Print operand OpNum of MI as a 1-bit unsigned immediate.
+void SystemZInstPrinter::printU1ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<1>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 2-bit unsigned immediate.
+void SystemZInstPrinter::printU2ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<2>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 3-bit unsigned immediate.
+void SystemZInstPrinter::printU3ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<3>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 4-bit unsigned immediate.
+void SystemZInstPrinter::printU4ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<4>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 6-bit unsigned immediate.
+void SystemZInstPrinter::printU6ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<6>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as an 8-bit signed immediate.
+void SystemZInstPrinter::printS8ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printSImmOperand<8>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as an 8-bit unsigned immediate.
+void SystemZInstPrinter::printU8ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<8>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 12-bit unsigned immediate.
+void SystemZInstPrinter::printU12ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<12>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 16-bit signed immediate.
+void SystemZInstPrinter::printS16ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printSImmOperand<16>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 16-bit unsigned immediate.
+void SystemZInstPrinter::printU16ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<16>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 32-bit signed immediate.
+void SystemZInstPrinter::printS32ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printSImmOperand<32>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 32-bit unsigned immediate.
+void SystemZInstPrinter::printU32ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<32>(MI, OpNum, O);
+}
+
+// Print operand OpNum of MI as a 48-bit unsigned immediate.
+void SystemZInstPrinter::printU48ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printUImmOperand<48>(MI, OpNum, O);
+}
+
+// Print a PC-relative operand.  An immediate is written as "0x" followed by
+// its hexadecimal digits; anything else is treated as an expression and
+// printed through MCExpr::print.
+void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &Target = MI->getOperand(OpNum);
+  if (!Target.isImm()) {
+    Target.getExpr()->print(O, &MAI);
+    return;
+  }
+  O << "0x";
+  O.write_hex(Target.getImm());
+}
+
+// Print a PC-relative operand followed by an optional TLS marker.
+// When MI has an operand at OpNum + 1, that operand must be an
+// MCSymbolRefExpr of kind VK_TLSGD or VK_TLSLDM; the output is then
+// ":tls_gdcall:" or ":tls_ldcall:" respectively, followed by the symbol
+// name.  Any other symbol kind hits llvm_unreachable.
+void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ // Output the PC-relative operand.
+ printPCRelOperand(MI, OpNum, O);
+
+ // Output the TLS marker if present.
+ if ((unsigned)OpNum + 1 < MI->getNumOperands()) {
+ const MCOperand &MO = MI->getOperand(OpNum + 1);
+ const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr());
+ switch (refExp.getKind()) {
+ case MCSymbolRefExpr::VK_TLSGD:
+ O << ":tls_gdcall:";
+ break;
+ case MCSymbolRefExpr::VK_TLSLDM:
+ O << ":tls_ldcall:";
+ break;
+ default:
+ llvm_unreachable("Unexpected symbol kind");
+ }
+ O << refExp.getSymbol().getName();
+ }
+}
+
+// Print operand OpNum of MI using the static printOperand overload and this
+// printer's MCAsmInfo.
+void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printOperand(MI->getOperand(OpNum), &MAI, O);
+}
+
+// Print a base + displacement address.  Operand OpNum is the base register
+// and operand OpNum + 1 the displacement immediate; no index is printed.
+void SystemZInstPrinter::printBDAddrOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printAddress(MI->getOperand(OpNum).getReg(),
+ MI->getOperand(OpNum + 1).getImm(), 0, O);
+}
+
+// Print a base + displacement + index address.  Operands OpNum, OpNum + 1
+// and OpNum + 2 are the base register, the displacement immediate and the
+// index register respectively.
+void SystemZInstPrinter::printBDXAddrOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printAddress(MI->getOperand(OpNum).getReg(),
+ MI->getOperand(OpNum + 1).getImm(),
+ MI->getOperand(OpNum + 2).getReg(), O);
+}
+
+// Print an address with an immediate length as "Disp(Length)" or
+// "Disp(Length,%Base)".  Operands OpNum, OpNum + 1 and OpNum + 2 are the
+// base register, the displacement and the length; the base is omitted when
+// its register number is 0.
+void SystemZInstPrinter::printBDLAddrOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ unsigned Base = MI->getOperand(OpNum).getReg();
+ uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
+ uint64_t Length = MI->getOperand(OpNum + 2).getImm();
+ O << Disp << '(' << Length;
+ if (Base)
+ O << ",%" << getRegisterName(Base);
+ O << ')';
+}
+
+// Print an address whose length is held in a register, as "Disp(%Length)"
+// or "Disp(%Length,%Base)".  Operands OpNum, OpNum + 1 and OpNum + 2 are the
+// base register, the displacement and the length register; the base is
+// omitted when its register number is 0.
+void SystemZInstPrinter::printBDRAddrOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ unsigned Base = MI->getOperand(OpNum).getReg();
+ uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
+ unsigned Length = MI->getOperand(OpNum + 2).getReg();
+ O << Disp << "(%" << getRegisterName(Length);
+ if (Base)
+ O << ",%" << getRegisterName(Base);
+ O << ')';
+}
+
+// Print a BDV address.  The operand layout and output format are the same
+// as printBDXAddrOperand: base register, displacement immediate, then the
+// register at OpNum + 2 printed in the index position.
+void SystemZInstPrinter::printBDVAddrOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printAddress(MI->getOperand(OpNum).getReg(),
+ MI->getOperand(OpNum + 1).getImm(),
+ MI->getOperand(OpNum + 2).getReg(), O);
+}
+
+// Print the mnemonic suffix for the condition mask in operand OpNum.
+// Valid masks are 1..14; mask M selects CondNames[M - 1].  The range is only
+// checked by the assert, so it is unchecked in builds without assertions.
+void SystemZInstPrinter::printCond4Operand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ // Entry I holds the name for mask value I + 1.
+ static const char *const CondNames[] = {
+ "o", "h", "nle", "l", "nhe", "lh", "ne",
+ "e", "nlh", "he", "nl", "le", "nh", "no"
+ };
+ uint64_t Imm = MI->getOperand(OpNum).getImm();
+ assert(Imm > 0 && Imm < 15 && "Invalid condition");
+ O << CondNames[Imm - 1];
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
new file mode 100644
index 000000000000..6336f5ee0efa
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -0,0 +1,75 @@
+//==- SystemZInstPrinter.h - Convert SystemZ MCInst to assembly --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a SystemZ MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+class MCOperand;
+
+// Instruction printer for SystemZ: turns an MCInst into assembly text.
+// The per-operand print* methods below are private and take the instruction
+// plus an operand index (presumably they are invoked from the
+// tblgen-generated printInstruction).
+class SystemZInstPrinter : public MCInstPrinter {
+public:
+ // Forwards all three arguments to the MCInstPrinter base class.
+ SystemZInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ // Automatically generated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ // Print an address with the given base, displacement and index.
+ static void printAddress(unsigned Base, int64_t Disp, unsigned Index,
+ raw_ostream &O);
+
+ // Print the given operand.
+ static void printOperand(const MCOperand &MO, const MCAsmInfo *MAI,
+ raw_ostream &O);
+
+ // Override MCInstPrinter.
+ void printRegName(raw_ostream &O, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+
+private:
+ // Print various types of operand.
+ void printOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printBDAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printBDXAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printBDRAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printBDVAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU1ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU2ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU3ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU4ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU6ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printS8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU12ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printS16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU48ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printPCRelTLSOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+
+ // Print the mnemonic for a condition-code mask ("ne", "lh", etc.)
+ // This forms part of the instruction name rather than the operand list.
+ void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
new file mode 100644
index 000000000000..9192448afd04
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -0,0 +1,126 @@
+//===-- SystemZMCAsmBackend.cpp - SystemZ assembler backend ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
+
+using namespace llvm;
+
+// Map a fully-resolved relocation value (Symbol + Addend [- Pivot]) to the
+// bits that belong in the relocation field for fixup kind Kind:
+//  - generic (non-target) kinds: Value unchanged;
+//  - the PC*DBL kinds: Value halved as a signed quantity;
+//  - FK_390_TLS_CALL: always 0.
+// Any other target kind is a programming error.
+static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
+  if (Kind < FirstTargetFixupKind)
+    return Value;
+
+  switch (unsigned(Kind)) {
+  case SystemZ::FK_390_TLS_CALL:
+    return 0;
+
+  case SystemZ::FK_390_PC12DBL:
+  case SystemZ::FK_390_PC16DBL:
+  case SystemZ::FK_390_PC24DBL:
+  case SystemZ::FK_390_PC32DBL: {
+    // The division must be signed so that negative offsets halve correctly.
+    const int64_t SignedValue = (int64_t)Value;
+    return SignedValue / 2;
+  }
+
+  default:
+    break;
+  }
+
+  llvm_unreachable("Unknown fixup kind!");
+}
+
+namespace {
+// Assembler backend for SystemZ.  It exposes the SystemZ-specific fixup
+// kinds, applies resolved fixups to the encoded bytes, never relaxes
+// instructions, and creates the SystemZ object writer.
+class SystemZMCAsmBackend : public MCAsmBackend {
+  // OS/ABI value handed to the object writer.
+  uint8_t OSABI;
+
+public:
+  SystemZMCAsmBackend(uint8_t osABI)
+      : OSABI(osABI) {}
+
+  // Override MCAsmBackend
+  unsigned getNumFixupKinds() const override {
+    return SystemZ::NumTargetFixupKinds;
+  }
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  // Both relaxation queries always answer "no", so relaxInstruction is not
+  // expected to be called and reports an internal error if it ever is.
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
+    return false;
+  }
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *Fragment,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {
+    llvm_unreachable("SystemZ does not have assembler relaxation");
+  }
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createSystemZObjectWriter(OS, OSABI);
+  }
+};
+} // end anonymous namespace
+
+// Return the description of fixup kind Kind.  Generic kinds (those below
+// FirstTargetFixupKind) are delegated to MCAsmBackend; target kinds are
+// looked up in the local table, indexed by Kind - FirstTargetFixupKind.
+const MCFixupKindInfo &
+SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ // One entry per target fixup kind; the order must match the
+ // SystemZ::FixupKind enum because the table is indexed by kind.
+ // Fields are presumably {name, bit offset, bit size, flags} -- confirm
+ // against MCFixupKindInfo.
+ const static MCFixupKindInfo Infos[SystemZ::NumTargetFixupKinds] = {
+ { "FK_390_PC12DBL", 4, 12, MCFixupKindInfo::FKF_IsPCRel },
+ { "FK_390_PC16DBL", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "FK_390_PC24DBL", 0, 24, MCFixupKindInfo::FKF_IsPCRel },
+ { "FK_390_PC32DBL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "FK_390_TLS_CALL", 0, 0, 0 }
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
+// Patch a resolved fixup into the encoded bytes in Data.
+// The field starts at the fixup's offset and spans ceil(TargetSize / 8)
+// bytes.  Value is first reduced by extractBitsForFixup, masked to the
+// field width, and then OR-ed into Data most significant byte first.
+// IsPCRel is not used here.
+void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                     unsigned DataSize, uint64_t Value,
+                                     bool IsPCRel) const {
+  const MCFixupKind Kind = Fixup.getKind();
+  const unsigned Offset = Fixup.getOffset();
+  const unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
+  const unsigned Size = (BitSize + 7) / 8;
+
+  assert(Offset + Size <= DataSize && "Invalid fixup offset!");
+
+  Value = extractBitsForFixup(Kind, Value);
+  // Shifting a 64-bit value by 64 is undefined, hence the guard.
+  if (BitSize < 64)
+    Value &= ((uint64_t)1 << BitSize) - 1;
+
+  // Big-endian insertion: byte I of the field receives bits
+  // [8 * (Size - 1 - I), 8 * (Size - I)) of Value.
+  for (unsigned I = 0; I != Size; ++I) {
+    const unsigned Shift = 8 * (Size - 1 - I);
+    Data[Offset + I] |= uint8_t(Value >> Shift);
+  }
+}
+
+// Emit Count bytes of padding through OW; every padding byte has the value 7.
+// Always reports success.
+bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
+                                       MCObjectWriter *OW) const {
+  for (uint64_t Remaining = Count; Remaining != 0; --Remaining)
+    OW->write8(7);
+  return true;
+}
+
+// Factory for the SystemZ assembler backend.  The OS/ABI value is derived
+// from the OS component of TT; T, MRI, CPU and Options are not used here.
+// Returns a newly allocated object (presumably owned by the caller).
+MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options) {
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+ return new SystemZMCAsmBackend(OSABI);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
new file mode 100644
index 000000000000..b17977d41be1
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -0,0 +1,29 @@
+//===-- SystemZMCAsmInfo.cpp - SystemZ asm properties ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+// Set up the assembly-dialect properties for SystemZ.  TT is not consulted;
+// the same settings are used for every triple.
+SystemZMCAsmInfo::SystemZMCAsmInfo(const Triple &TT) {
+ // 8-byte pointers and callee-save slots, big-endian byte order.
+ PointerSize = 8;
+ CalleeSaveStackSlotSize = 8;
+ IsLittleEndian = false;
+
+ // Assembly syntax: '#' starts a comment, .space and .quad are the
+ // zero-fill and 64-bit data directives.
+ CommentString = "#";
+ ZeroDirective = "\t.space\t";
+ Data64bitsDirective = "\t.quad\t";
+ UsesELFSectionDirectiveForBSS = true;
+ SupportsDebugInformation = true;
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ UseIntegratedAssembler = true;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
new file mode 100644
index 000000000000..800f89232063
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -0,0 +1,26 @@
+//====-- SystemZMCAsmInfo.h - SystemZ asm properties -----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCASMINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class Triple;
+
+// ELF assembly properties for SystemZ; all configuration happens in the
+// constructor.
+class SystemZMCAsmInfo : public MCAsmInfoELF {
+public:
+ explicit SystemZMCAsmInfo(const Triple &TT);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
new file mode 100644
index 000000000000..7082abad716d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -0,0 +1,284 @@
+//===-- SystemZMCCodeEmitter.cpp - Convert SystemZ code to machine code ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+// Machine-code emitter for SystemZ.  encodeInstruction is the public entry
+// point; the private get*Encoding methods compute individual operand fields
+// and are called by the TableGen-generated getBinaryCodeForInstr.
+class SystemZMCCodeEmitter : public MCCodeEmitter {
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+
+public:
+ SystemZMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {
+ }
+
+ ~SystemZMCCodeEmitter() override {}
+
+ // Override MCCodeEmitter.
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+private:
+ // Automatically generated by TableGen.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // Called by the TableGen code to get the binary encoding of operand
+ // MO in MI. Fixups is the list of fixups against MI.
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // Called by the TableGen code to get the binary encoding of an address.
+ // The index or length, if any, is encoded first, followed by the base,
+ // followed by the displacement. In a 20-bit displacement,
+ // the low 12 bits are encoded before the high 8 bits.
+ uint64_t getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getBDRAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getBDVAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // Operand OpNum of MI needs a PC-relative fixup of kind Kind at
+ // Offset bytes from the start of MI. Add the fixup to Fixups
+ // and return the in-place addend, which since we're a RELA target
+ // is always 0. If AllowTLS is true and optional operand OpNum + 1
+ // is present, also emit a TLS call fixup for it.
+ uint64_t getPCRelEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ unsigned Kind, int64_t Offset,
+ bool AllowTLS) const;
+
+ // The wrappers below only choose the fixup kind, the byte offset of the
+ // field within the instruction, and whether a TLS marker is allowed.
+ uint64_t getPC16DBLEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getPCRelEncoding(MI, OpNum, Fixups,
+ SystemZ::FK_390_PC16DBL, 2, false);
+ }
+ uint64_t getPC32DBLEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getPCRelEncoding(MI, OpNum, Fixups,
+ SystemZ::FK_390_PC32DBL, 2, false);
+ }
+ uint64_t getPC16DBLTLSEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getPCRelEncoding(MI, OpNum, Fixups,
+ SystemZ::FK_390_PC16DBL, 2, true);
+ }
+ uint64_t getPC32DBLTLSEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getPCRelEncoding(MI, OpNum, Fixups,
+ SystemZ::FK_390_PC32DBL, 2, true);
+ }
+ uint64_t getPC12DBLBPPEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getPCRelEncoding(MI, OpNum, Fixups,
+ SystemZ::FK_390_PC12DBL, 1, false);
+ }
+ uint64_t getPC16DBLBPPEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getPCRelEncoding(MI, OpNum, Fixups,
+ SystemZ::FK_390_PC16DBL, 4, false);
+ }
+ uint64_t getPC24DBLBPPEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getPCRelEncoding(MI, OpNum, Fixups,
+ SystemZ::FK_390_PC24DBL, 3, false);
+ }
+
+private:
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+};
+} // end anonymous namespace
+
+// Factory for the SystemZ code emitter.  MRI is not used here; the emitter
+// only keeps references to MCII and Ctx.  Returns a newly allocated object
+// (presumably owned by the caller).
+MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new SystemZMCCodeEmitter(MCII, Ctx);
+}
+
+// Write the encoding of MI to OS, most significant byte first.
+// Runs verifyInstructionPredicates against the features computed from STI,
+// obtains the instruction bits from getBinaryCodeForInstr (which may append
+// to Fixups), and emits as many bytes as MCII reports for the opcode.
+void SystemZMCCodeEmitter::
+encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  const uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
+  const unsigned NumBytes = MCII.get(MI.getOpcode()).getSize();
+
+  // Big-endian output: byte Idx carries bits starting at
+  // 8 * (NumBytes - 1 - Idx) of Encoding.
+  for (unsigned Idx = 0; Idx != NumBytes; ++Idx) {
+    const unsigned Shift = 8 * (NumBytes - 1 - Idx);
+    OS << uint8_t(Encoding >> Shift);
+  }
+}
+
+// Return the raw encoding of a simple operand: the hardware encoding value
+// for a register, or the immediate value reinterpreted as unsigned.
+// Expression operands are not supported here and hit llvm_unreachable.
+// MI, Fixups and STI are not used.
+uint64_t SystemZMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg())
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+ if (MO.isImm())
+ return static_cast<uint64_t>(MO.getImm());
+ llvm_unreachable("Unexpected operand type!");
+}
+
+// Encode a base + 12-bit unsigned displacement address taken from operands
+// OpNum (base) and OpNum + 1 (displacement).
+// Result layout: base in bits 12-15, displacement in bits 0-11.
+uint64_t SystemZMCCodeEmitter::
+getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ assert(isUInt<4>(Base) && isUInt<12>(Disp));
+ return (Base << 12) | Disp;
+}
+
+// Encode a base + 20-bit signed displacement address taken from operands
+// OpNum (base) and OpNum + 1 (displacement).
+// Result layout: base in bits 20-23, the low 12 displacement bits in
+// bits 8-19, and displacement bits 12-19 in bits 0-7.
+uint64_t SystemZMCCodeEmitter::
+getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ assert(isUInt<4>(Base) && isInt<20>(Disp));
+ return (Base << 20) | ((Disp & 0xfff) << 8) | ((Disp & 0xff000) >> 12);
+}
+
+// Encode a base + 12-bit unsigned displacement + index address taken from
+// operands OpNum (base), OpNum + 1 (displacement) and OpNum + 2 (index).
+// Result layout: index in bits 16-19, base in bits 12-15, displacement in
+// bits 0-11.
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Index));
+ return (Index << 16) | (Base << 12) | Disp;
+}
+
+// Encode a base + 20-bit signed displacement + index address taken from
+// operands OpNum (base), OpNum + 1 (displacement) and OpNum + 2 (index).
+// Result layout: index in bits 24-27, base in bits 20-23, the low 12
+// displacement bits in bits 8-19, and displacement bits 12-19 in bits 0-7.
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+ assert(isUInt<4>(Base) && isInt<20>(Disp) && isUInt<4>(Index));
+ return (Index << 24) | (Base << 20) | ((Disp & 0xfff) << 8)
+ | ((Disp & 0xff000) >> 12);
+}
+
+// Encode a base + 12-bit unsigned displacement address with an immediate
+// length, taken from operands OpNum (base), OpNum + 1 (displacement) and
+// OpNum + 2 (length).  The length is stored as "length - 1", so an operand
+// value of 1..256 maps onto the 8-bit field.
+// Result layout: length - 1 in bits 16-23, base in bits 12-15, displacement
+// in bits 0-11.
+uint64_t SystemZMCCodeEmitter::
+getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
+ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<8>(Len));
+ return (Len << 16) | (Base << 12) | Disp;
+}
+
+// Encode a base + 12-bit unsigned displacement address whose length is held
+// in a register, taken from operands OpNum (base), OpNum + 1 (displacement)
+// and OpNum + 2 (length register).
+// Result layout: length register in bits 16-19, base in bits 12-15,
+// displacement in bits 0-11.
+uint64_t SystemZMCCodeEmitter::
+getBDRAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len));
+ return (Len << 16) | (Base << 12) | Disp;
+}
+
+// Encode a BDV address taken from operands OpNum (base), OpNum + 1
+// (12-bit unsigned displacement) and OpNum + 2 (index).  Unlike the BDX
+// form, the index may be 5 bits wide (presumably a vector register number
+// -- confirm against the instruction formats).
+// Result layout: index in bits 16-20, base in bits 12-15, displacement in
+// bits 0-11.
+uint64_t SystemZMCCodeEmitter::
+getBDVAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<5>(Index));
+ return (Index << 16) | (Base << 12) | Disp;
+}
+
+// Record a PC-relative fixup of kind Kind for operand OpNum of MI and return
+// 0 as the in-place field value.  The fixup is placed Offset bytes into the
+// instruction and its expression is the operand (immediate or expression)
+// plus Offset.  With AllowTLS set and an operand present at OpNum + 1, an
+// additional FK_390_TLS_CALL fixup at offset 0 is recorded for that operand.
+uint64_t
+SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ unsigned Kind, int64_t Offset,
+ bool AllowTLS) const {
+ const MCOperand &MO = MI.getOperand(OpNum);
+ const MCExpr *Expr;
+ if (MO.isImm())
+ Expr = MCConstantExpr::create(MO.getImm() + Offset, Ctx);
+ else {
+ Expr = MO.getExpr();
+ if (Offset) {
+ // The operand value is relative to the start of MI, but the fixup
+ // is relative to the operand field itself, which is Offset bytes
+ // into MI. Add Offset to the relocation value to cancel out
+ // this difference.
+ const MCExpr *OffsetExpr = MCConstantExpr::create(Offset, Ctx);
+ Expr = MCBinaryExpr::createAdd(Expr, OffsetExpr, Ctx);
+ }
+ }
+ Fixups.push_back(MCFixup::create(Offset, Expr, (MCFixupKind)Kind));
+
+ // Output the fixup for the TLS marker if present.
+ if (AllowTLS && OpNum + 1 < MI.getNumOperands()) {
+ const MCOperand &MOTLS = MI.getOperand(OpNum + 1);
+ Fixups.push_back(MCFixup::create(0, MOTLS.getExpr(),
+ (MCFixupKind)SystemZ::FK_390_TLS_CALL));
+ }
+ return 0;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "SystemZGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
new file mode 100644
index 000000000000..c012accc14dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
@@ -0,0 +1,32 @@
+//===-- SystemZMCFixups.h - SystemZ-specific fixup entries ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCFIXUPS_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCFIXUPS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace SystemZ {
+// SystemZ-specific fixup kinds, numbered consecutively from
+// FirstTargetFixupKind.  Tables indexed by "kind - FirstTargetFixupKind"
+// rely on this order.
+enum FixupKind {
+ // These correspond directly to R_390_* relocations.
+ FK_390_PC12DBL = FirstTargetFixupKind,
+ FK_390_PC16DBL,
+ FK_390_PC24DBL,
+ FK_390_PC32DBL,
+ FK_390_TLS_CALL,
+
+ // Marker
+ // LastTargetFixupKind is one past the final real kind, so the difference
+ // below is the number of target kinds.
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace SystemZ
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
new file mode 100644
index 000000000000..43a96e84289c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -0,0 +1,164 @@
+//===-- SystemZMCObjectWriter.cpp - SystemZ ELF writer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+using namespace llvm;
+
+namespace {
+// ELF target writer for SystemZ.  Its job in this file is to pick the
+// relocation type that corresponds to each fixup (see getRelocType).
+class SystemZObjectWriter : public MCELFObjectTargetWriter {
+public:
+ // OSABI is forwarded unchanged to the MCELFObjectTargetWriter base.
+ SystemZObjectWriter(uint8_t OSABI);
+
+ ~SystemZObjectWriter() override;
+
+protected:
+ // Override MCELFObjectTargetWriter.
+ // Returns the ELF relocation type for Fixup, chosen from the symbol
+ // modifier carried by Target, the fixup kind and IsPCRel.
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+} // end anonymous namespace
+
+// Configure the base writer as 64-bit, machine type EM_S390, with relocations
+// that carry an explicit addend.  OSABI is passed through as given.
+SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI)
+ : MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390,
+ /*HasRelocationAddend=*/ true) {}
+
+// Defined out of line; the writer holds no state of its own to release.
+SystemZObjectWriter::~SystemZObjectWriter() = default;
+
+// Map a generic data fixup of 1, 2, 4 or 8 bytes to the absolute R_390_*
+// relocation of the same width.  Any other kind is a programming error.
+static unsigned getAbsoluteReloc(unsigned Kind) {
+  if (Kind == FK_Data_1)
+    return ELF::R_390_8;
+  if (Kind == FK_Data_2)
+    return ELF::R_390_16;
+  if (Kind == FK_Data_4)
+    return ELF::R_390_32;
+  if (Kind == FK_Data_8)
+    return ELF::R_390_64;
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Map a fixup kind to its PC-relative R_390_* relocation.  Handles the 2-, 4-
+// and 8-byte data fixups plus the four SystemZ PC*DBL fixups; any other kind
+// is a programming error.
+static unsigned getPCRelReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_2:
+    return ELF::R_390_PC16;
+  case FK_Data_4:
+    return ELF::R_390_PC32;
+  case FK_Data_8:
+    return ELF::R_390_PC64;
+  case SystemZ::FK_390_PC12DBL:
+    return ELF::R_390_PC12DBL;
+  case SystemZ::FK_390_PC16DBL:
+    return ELF::R_390_PC16DBL;
+  case SystemZ::FK_390_PC24DBL:
+    return ELF::R_390_PC24DBL;
+  case SystemZ::FK_390_PC32DBL:
+    return ELF::R_390_PC32DBL;
+  default:
+    break;
+  }
+  llvm_unreachable("Unsupported PC-relative address");
+}
+
+// Pick R_390_TLS_LE32 or R_390_TLS_LE64 for a 4- or 8-byte data fixup.
+// No other fixup kind is accepted.
+static unsigned getTLSLEReloc(unsigned Kind) {
+  if (Kind == FK_Data_4)
+    return ELF::R_390_TLS_LE32;
+  if (Kind == FK_Data_8)
+    return ELF::R_390_TLS_LE64;
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Pick R_390_TLS_LDO32 or R_390_TLS_LDO64 for a 4- or 8-byte data fixup.
+// No other fixup kind is accepted.
+static unsigned getTLSLDOReloc(unsigned Kind) {
+  if (Kind == FK_Data_4)
+    return ELF::R_390_TLS_LDO32;
+  if (Kind == FK_Data_8)
+    return ELF::R_390_TLS_LDO64;
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Pick the local-dynamic (LDM) TLS relocation for Kind: the 32- or 64-bit data
+// form, or R_390_TLS_LDCALL for the TLS call marker fixup.
+static unsigned getTLSLDMReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4:
+    return ELF::R_390_TLS_LDM32;
+  case FK_Data_8:
+    return ELF::R_390_TLS_LDM64;
+  case SystemZ::FK_390_TLS_CALL:
+    return ELF::R_390_TLS_LDCALL;
+  default:
+    break;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Pick the general-dynamic (GD) TLS relocation for Kind: the 32- or 64-bit
+// data form, or R_390_TLS_GDCALL for the TLS call marker fixup.
+static unsigned getTLSGDReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4:
+    return ELF::R_390_TLS_GD32;
+  case FK_Data_8:
+    return ELF::R_390_TLS_GD64;
+  case SystemZ::FK_390_TLS_CALL:
+    return ELF::R_390_TLS_GDCALL;
+  default:
+    break;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the PLT relocation counterpart of MCFixupKind Kind.
+//
+// Only the four SystemZ PC-relative "DBL" fixups have a PLT form; each maps to
+// the R_390_PLT*DBL relocation of the same width.  Any other kind is a
+// programming error.
+static unsigned getPLTReloc(unsigned Kind) {
+  switch (Kind) {
+  case SystemZ::FK_390_PC12DBL: return ELF::R_390_PLT12DBL;
+  case SystemZ::FK_390_PC16DBL: return ELF::R_390_PLT16DBL;
+  case SystemZ::FK_390_PC24DBL: return ELF::R_390_PLT24DBL;
+  case SystemZ::FK_390_PC32DBL: return ELF::R_390_PLT32DBL;
+  }
+  // Every fixup accepted above is PC-relative, so say so in the diagnostic
+  // rather than reusing the "absolute address" wording of the data helpers.
+  llvm_unreachable("Unsupported PC-relative PLT address");
+}
+
+// Choose the ELF relocation type for Fixup.
+//
+// The decision is driven first by the symbol modifier attached to Target
+// (none, @NTPOFF, @INDNTPOFF, @DTPOFF, @TLSLDM, @TLSGD, @GOT, @PLT) and then
+// by the fixup kind.  IsPCRel says whether the fixup is PC-relative; several
+// modifiers are only meaningful for one of the two cases, which is asserted.
+// Combinations that are not handled end in llvm_unreachable.  Ctx is unused.
+unsigned SystemZObjectWriter::getRelocType(MCContext &Ctx,
+                                           const MCValue &Target,
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const {
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+  unsigned Kind = Fixup.getKind();
+  switch (Modifier) {
+  case MCSymbolRefExpr::VK_None:
+    // Plain symbol reference: pick by width and PC-relativity alone.
+    if (IsPCRel)
+      return getPCRelReloc(Kind);
+    return getAbsoluteReloc(Kind);
+
+  case MCSymbolRefExpr::VK_NTPOFF:
+    assert(!IsPCRel && "NTPOFF shouldn't be PC-relative");
+    return getTLSLEReloc(Kind);
+
+  case MCSymbolRefExpr::VK_INDNTPOFF:
+    // Only the 32-bit PC-relative form is handled.
+    if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
+      return ELF::R_390_TLS_IEENT;
+    llvm_unreachable("Only PC-relative INDNTPOFF accesses are supported for now");
+
+  case MCSymbolRefExpr::VK_DTPOFF:
+    assert(!IsPCRel && "DTPOFF shouldn't be PC-relative");
+    return getTLSLDOReloc(Kind);
+
+  case MCSymbolRefExpr::VK_TLSLDM:
+    assert(!IsPCRel && "TLSLDM shouldn't be PC-relative");
+    return getTLSLDMReloc(Kind);
+
+  case MCSymbolRefExpr::VK_TLSGD:
+    assert(!IsPCRel && "TLSGD shouldn't be PC-relative");
+    return getTLSGDReloc(Kind);
+
+  case MCSymbolRefExpr::VK_GOT:
+    // Only the 32-bit PC-relative form is handled.
+    if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
+      return ELF::R_390_GOTENT;
+    llvm_unreachable("Only PC-relative GOT accesses are supported for now");
+
+  case MCSymbolRefExpr::VK_PLT:
+    // Unlike the TLS cases above, a @PLT reference must be PC-relative.
+    assert(IsPCRel && "@PLT should be PC-relative");
+    return getPLTReloc(Kind);
+
+  default:
+    llvm_unreachable("Modifier not supported");
+  }
+}
+
+// Build an ELF object writer for SystemZ: a freshly allocated SystemZ target
+// writer for OSABI is handed to the generic ELF writer, which is asked for
+// big-endian output on OS.
+MCObjectWriter *llvm::createSystemZObjectWriter(raw_pwrite_stream &OS,
+                                                uint8_t OSABI) {
+  return createELFObjectWriter(new SystemZObjectWriter(OSABI), OS,
+                               /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
new file mode 100644
index 000000000000..dfea7e33fa15
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -0,0 +1,246 @@
+//===-- SystemZMCTargetDesc.cpp - SystemZ target descriptions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCTargetDesc.h"
+#include "InstPrinter/SystemZInstPrinter.h"
+#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "SystemZGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "SystemZGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "SystemZGenRegisterInfo.inc"
+
+// Tables indexed by assembly register number that give the matching LLVM
+// register for each register class.  A 0 entry means the class has no
+// register with that number.
+
+// Low 32-bit halves of the 16 general-purpose registers.
+const unsigned SystemZMC::GR32Regs[16] = {
+ SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L,
+ SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L,
+ SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L,
+ SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L
+};
+
+// High 32-bit halves of the 16 general-purpose registers.
+const unsigned SystemZMC::GRH32Regs[16] = {
+ SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H,
+ SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H,
+ SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H,
+ SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H
+};
+
+// Full 64-bit general-purpose registers.
+const unsigned SystemZMC::GR64Regs[16] = {
+ SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D,
+ SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D,
+ SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D,
+ SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D
+};
+
+// 128-bit general-purpose registers.  Only even numbers have an entry; every
+// odd slot is 0.
+const unsigned SystemZMC::GR128Regs[16] = {
+ SystemZ::R0Q, 0, SystemZ::R2Q, 0,
+ SystemZ::R4Q, 0, SystemZ::R6Q, 0,
+ SystemZ::R8Q, 0, SystemZ::R10Q, 0,
+ SystemZ::R12Q, 0, SystemZ::R14Q, 0
+};
+
+// 32-bit floating-point registers.
+const unsigned SystemZMC::FP32Regs[16] = {
+ SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
+ SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
+ SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+ SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S
+};
+
+// 64-bit floating-point registers.
+const unsigned SystemZMC::FP64Regs[16] = {
+ SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
+ SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
+ SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
+ SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D
+};
+
+// 128-bit floating-point registers.  Entries exist only at numbers 0, 1, 4,
+// 5, 8, 9, 12 and 13; the remaining slots are 0.
+const unsigned SystemZMC::FP128Regs[16] = {
+ SystemZ::F0Q, SystemZ::F1Q, 0, 0,
+ SystemZ::F4Q, SystemZ::F5Q, 0, 0,
+ SystemZ::F8Q, SystemZ::F9Q, 0, 0,
+ SystemZ::F12Q, SystemZ::F13Q, 0, 0
+};
+
+// 32-bit views of the 32 vector registers.  The first 16 entries are the same
+// registers as FP32Regs.
+const unsigned SystemZMC::VR32Regs[32] = {
+ SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
+ SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
+ SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+ SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S,
+ SystemZ::F16S, SystemZ::F17S, SystemZ::F18S, SystemZ::F19S,
+ SystemZ::F20S, SystemZ::F21S, SystemZ::F22S, SystemZ::F23S,
+ SystemZ::F24S, SystemZ::F25S, SystemZ::F26S, SystemZ::F27S,
+ SystemZ::F28S, SystemZ::F29S, SystemZ::F30S, SystemZ::F31S
+};
+
+// 64-bit views of the 32 vector registers.  The first 16 entries are the same
+// registers as FP64Regs.
+const unsigned SystemZMC::VR64Regs[32] = {
+ SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
+ SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
+ SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
+ SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D,
+ SystemZ::F16D, SystemZ::F17D, SystemZ::F18D, SystemZ::F19D,
+ SystemZ::F20D, SystemZ::F21D, SystemZ::F22D, SystemZ::F23D,
+ SystemZ::F24D, SystemZ::F25D, SystemZ::F26D, SystemZ::F27D,
+ SystemZ::F28D, SystemZ::F29D, SystemZ::F30D, SystemZ::F31D
+};
+
+// Full 128-bit vector registers.
+const unsigned SystemZMC::VR128Regs[32] = {
+ SystemZ::V0, SystemZ::V1, SystemZ::V2, SystemZ::V3,
+ SystemZ::V4, SystemZ::V5, SystemZ::V6, SystemZ::V7,
+ SystemZ::V8, SystemZ::V9, SystemZ::V10, SystemZ::V11,
+ SystemZ::V12, SystemZ::V13, SystemZ::V14, SystemZ::V15,
+ SystemZ::V16, SystemZ::V17, SystemZ::V18, SystemZ::V19,
+ SystemZ::V20, SystemZ::V21, SystemZ::V22, SystemZ::V23,
+ SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27,
+ SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31
+};
+
+// Access registers.
+const unsigned SystemZMC::AR32Regs[16] = {
+ SystemZ::A0, SystemZ::A1, SystemZ::A2, SystemZ::A3,
+ SystemZ::A4, SystemZ::A5, SystemZ::A6, SystemZ::A7,
+ SystemZ::A8, SystemZ::A9, SystemZ::A10, SystemZ::A11,
+ SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15
+};
+
+// Return the 0-based assembly register number of LLVM register Reg, i.e. the
+// index at which Reg appears in one of the SystemZMC::*Regs tables.
+//
+// Reg must be below SystemZ::NUM_TARGET_REGS (asserted).  Registers that are
+// in none of the tables map to 0.
+unsigned SystemZMC::getFirstReg(unsigned Reg) {
+  // Reverse lookup table, filled in by its constructor.  Keeping it in a
+  // function-local static means it is built exactly once, on the first call,
+  // instead of being refilled on every call, and C++11 makes that one-time
+  // initialisation safe against concurrent first callers.
+  struct FirstRegMap {
+    unsigned Map[SystemZ::NUM_TARGET_REGS];
+
+    FirstRegMap() : Map() {
+      // FP32Regs and FP64Regs need no loop of their own: their registers are
+      // the first 16 entries of VR32Regs and VR64Regs.
+      for (unsigned I = 0; I < 16; ++I) {
+        Map[GR32Regs[I]] = I;
+        Map[GRH32Regs[I]] = I;
+        Map[GR64Regs[I]] = I;
+        Map[GR128Regs[I]] = I;
+        Map[FP128Regs[I]] = I;
+        Map[AR32Regs[I]] = I;
+      }
+      for (unsigned I = 0; I < 32; ++I) {
+        Map[VR32Regs[I]] = I;
+        Map[VR64Regs[I]] = I;
+        Map[VR128Regs[I]] = I;
+      }
+    }
+  };
+  static const FirstRegMap Table;
+
+  assert(Reg < SystemZ::NUM_TARGET_REGS);
+  return Table.Map[Reg];
+}
+
+// Create the SystemZ MCAsmInfo for triple TT and record the initial frame
+// state: the CFA is defined as DWARF register R15D plus
+// SystemZMC::CFAOffsetFromInitialSP.
+static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
+                                         const Triple &TT) {
+  MCAsmInfo *AsmInfo = new SystemZMCAsmInfo(TT);
+  AsmInfo->addInitialFrameState(MCCFIInstruction::createDefCfa(
+      nullptr, MRI.getDwarfRegNum(SystemZ::R15D, true),
+      SystemZMC::CFAOffsetFromInitialSP));
+  return AsmInfo;
+}
+
+// Allocate an MCInstrInfo and let the generated InitSystemZMCInstrInfo fill
+// it in.  The caller receives the new object.
+static MCInstrInfo *createSystemZMCInstrInfo() {
+  MCInstrInfo *InstrInfo = new MCInstrInfo();
+  InitSystemZMCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+// Allocate an MCRegisterInfo and let the generated InitSystemZMCRegisterInfo
+// fill it in.  TT is not consulted.
+static MCRegisterInfo *createSystemZMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  // NOTE(review): R14D is presumably the return-address register expected by
+  // the generated initialiser -- confirm against SystemZGenRegisterInfo.inc.
+  InitSystemZMCRegisterInfo(RegInfo, SystemZ::R14D);
+  return RegInfo;
+}
+
+// Registry hook for MCSubtargetInfo creation: forwards the triple, CPU name
+// and feature string unchanged to the generated implementation.
+static MCSubtargetInfo *
+createSystemZMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+ return createSystemZMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+// Resolve the placeholder code models (Default and JITDefault) to a concrete
+// one; any explicitly chosen model is left untouched.
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  // The SystemZ code models are defined as follows:
+  //
+  //   Small:  BRASL can call any function and will use a stub if necessary.
+  //           Locally-binding symbols will always be in range of LARL.
+  //
+  //   Medium: BRASL can call any function and will use a stub if necessary.
+  //           GOT slots and locally-defined text will always be in range
+  //           of LARL, but other symbols might not be.
+  //
+  //   Large:  Equivalent to Medium for now.
+  //
+  //   Kernel: Equivalent to Medium for now.
+  //
+  // Any PIC module smaller than 4GB therefore meets the requirements of
+  // Small, which makes Small the best default for PIC.
+  //
+  // In a non-PIC module all symbols bind locally, so the choice is less
+  // obvious.  There are two cases:
+  //
+  // - For an executable, PLTs and copy relocations let us treat external
+  //   symbols as part of the executable.  Any executable smaller than 4GB
+  //   meets the requirements of Small, so that is the best default.
+  //
+  // - For JIT code, stubs will be in range of BRASL if the image is smaller
+  //   than 4GB, and GOT entries will likewise be in range of LARL.  The JIT
+  //   environment has no equivalent of copy relocs, though, so
+  //   locally-binding data symbols might not be in range of LARL.  That case
+  //   needs the Medium model.
+  switch (CM) {
+  case CodeModel::Default:
+    CM = CodeModel::Small;
+    break;
+  case CodeModel::JITDefault:
+    if (RM == Reloc::PIC_)
+      CM = CodeModel::Small;
+    else
+      CM = CodeModel::Medium;
+    break;
+  default:
+    break;
+  }
+}
+
+// Registry hook for instruction-printer creation.  T and SyntaxVariant are
+// ignored: the same SystemZInstPrinter is returned for every syntax variant.
+static MCInstPrinter *createSystemZMCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ return new SystemZInstPrinter(MAI, MII, MRI);
+}
+
+// Register every SystemZ MC-layer factory in this file with the target
+// registry.
+extern "C" void LLVMInitializeSystemZTargetMC() {
+  Target &TheTarget = getTheSystemZTarget();
+
+  // Assembly info and code-generation option adjustment.
+  TargetRegistry::RegisterMCAsmInfo(TheTarget, createSystemZMCAsmInfo);
+  TargetRegistry::registerMCAdjustCodeGenOpts(TheTarget, adjustCodeGenOpts);
+
+  // Code emitter.
+  TargetRegistry::RegisterMCCodeEmitter(TheTarget, createSystemZMCCodeEmitter);
+
+  // Instruction, register and subtarget descriptions.
+  TargetRegistry::RegisterMCInstrInfo(TheTarget, createSystemZMCInstrInfo);
+  TargetRegistry::RegisterMCRegInfo(TheTarget, createSystemZMCRegisterInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheTarget,
+                                          createSystemZMCSubtargetInfo);
+
+  // Assembler backend and instruction printer.
+  TargetRegistry::RegisterMCAsmBackend(TheTarget, createSystemZMCAsmBackend);
+  TargetRegistry::RegisterMCInstPrinter(TheTarget, createSystemZMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
new file mode 100644
index 000000000000..d9926c7e4986
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -0,0 +1,108 @@
+//===-- SystemZMCTargetDesc.h - SystemZ target descriptions -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCTARGETDESC_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+class raw_ostream;
+
+Target &getTheSystemZTarget();
+
+namespace SystemZMC {
+// How many bytes are in the ABI-defined, caller-allocated part of
+// a stack frame.
+const int64_t CallFrameSize = 160;
+
+// The offset of the DWARF CFA from the incoming stack pointer.
+// Defined to be the same as CallFrameSize.
+const int64_t CFAOffsetFromInitialSP = CallFrameSize;
+
+// Maps of asm register numbers to LLVM register numbers, with 0 indicating
+// an invalid register. In principle we could use 32-bit and 64-bit register
+// classes directly, provided that we relegated the GPR allocation order
+// in SystemZRegisterInfo.td to an AltOrder and left the default order
+// as %r0-%r15. It seems better to provide the same interface for
+// all classes though.
+extern const unsigned GR32Regs[16];
+extern const unsigned GRH32Regs[16];
+extern const unsigned GR64Regs[16];
+extern const unsigned GR128Regs[16];
+extern const unsigned FP32Regs[16];
+extern const unsigned FP64Regs[16];
+extern const unsigned FP128Regs[16];
+extern const unsigned VR32Regs[32];
+extern const unsigned VR64Regs[32];
+extern const unsigned VR128Regs[32];
+extern const unsigned AR32Regs[16];
+
+// Return the 0-based number of the first architectural register that
+// contains the given LLVM register. E.g. R1D -> 1.
+unsigned getFirstReg(unsigned Reg);
+
+// Map Reg to the GR64Regs entry that has the same register number.
+inline unsigned getRegAsGR64(unsigned Reg) {
+  const unsigned Index = getFirstReg(Reg);
+  return GR64Regs[Index];
+}
+
+// Map Reg to the GR32Regs (low 32-bit half) entry that has the same register
+// number.
+inline unsigned getRegAsGR32(unsigned Reg) {
+  const unsigned Index = getFirstReg(Reg);
+  return GR32Regs[Index];
+}
+
+// Map Reg to the GRH32Regs (high 32-bit half) entry that has the same
+// register number.
+inline unsigned getRegAsGRH32(unsigned Reg) {
+  const unsigned Index = getFirstReg(Reg);
+  return GRH32Regs[Index];
+}
+
+// Map Reg to the VR128Regs entry that has the same register number.
+inline unsigned getRegAsVR128(unsigned Reg) {
+  const unsigned Index = getFirstReg(Reg);
+  return VR128Regs[Index];
+}
+} // end namespace SystemZMC
+
+MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCAsmBackend *createSystemZMCAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+MCObjectWriter *createSystemZObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+} // end namespace llvm
+
+// Defines symbolic names for SystemZ registers.
+// This defines a mapping from register name to register number.
+#define GET_REGINFO_ENUM
+#include "SystemZGenRegisterInfo.inc"
+
+// Defines symbolic names for the SystemZ instructions.
+#define GET_INSTRINFO_ENUM
+#include "SystemZGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "SystemZGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/README.txt b/contrib/llvm/lib/Target/SystemZ/README.txt
new file mode 100644
index 000000000000..86a1322c9e23
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/README.txt
@@ -0,0 +1,154 @@
+//===---------------------------------------------------------------------===//
+// Random notes about and ideas for the SystemZ backend.
+//===---------------------------------------------------------------------===//
+
+The initial backend is deliberately restricted to z10. We should add support
+for later architectures at some point.
+
+--
+
+If an inline asm ties an i32 "r" result to an i64 input, the input
+will be treated as an i32, leaving the upper bits uninitialised.
+For example:
+
+define void @f4(i32 *%dst) {
+ %val = call i32 asm "blah $0", "=r,0" (i64 103)
+ store i32 %val, i32 *%dst
+ ret void
+}
+
+from CodeGen/SystemZ/asm-09.ll will use LHI rather than LGHI
+to load 103. This seems to be a general target-independent problem.
+
+--
+
+The tuning of the choice between LOAD ADDRESS (LA) and addition in
+SystemZISelDAGToDAG.cpp is suspect. It should be tweaked based on
+performance measurements.
+
+--
+
+There is no scheduling support.
+
+--
+
+We don't use the BRANCH ON INDEX instructions.
+
+--
+
+We only use MVC, XC and CLC for constant-length block operations.
+We could extend them to variable-length operations too,
+using EXECUTE RELATIVE LONG.
+
+MVCIN, MVCLE and CLCLE may be worthwhile too.
+
+--
+
+We don't use CUSE or the TRANSLATE family of instructions for string
+operations. The TRANSLATE ones are probably more difficult to exploit.
+
+--
+
+We don't take full advantage of builtins like fabsl because the calling
+conventions require f128s to be returned by invisible reference.
+
+--
+
+ADD LOGICAL WITH SIGNED IMMEDIATE could be useful when we need to
+produce a carry. SUBTRACT LOGICAL IMMEDIATE could be useful when we
+need to produce a borrow. (Note that there are no memory forms of
+ADD LOGICAL WITH CARRY and SUBTRACT LOGICAL WITH BORROW, so the high
+part of 128-bit memory operations would probably need to be done
+via a register.)
+
+--
+
+We don't use ICM or STCM.
+
+--
+
+DAGCombiner doesn't yet fold truncations of extended loads. Functions like:
+
+ unsigned long f (unsigned long x, unsigned short *y)
+ {
+ return (x << 32) | *y;
+ }
+
+therefore end up as:
+
+ sllg %r2, %r2, 32
+ llgh %r0, 0(%r3)
+ lr %r2, %r0
+ br %r14
+
+but truncating the load would give:
+
+ sllg %r2, %r2, 32
+ lh %r2, 0(%r3)
+ br %r14
+
+--
+
+Functions like:
+
+define i64 @f1(i64 %a) {
+ %and = and i64 %a, 1
+ ret i64 %and
+}
+
+ought to be implemented as:
+
+ lhi %r0, 1
+ ngr %r2, %r0
+ br %r14
+
+but two-address optimizations reverse the order of the AND and force:
+
+ lhi %r0, 1
+ ngr %r0, %r2
+ lgr %r2, %r0
+ br %r14
+
+CodeGen/SystemZ/and-04.ll has several examples of this.
+
+--
+
+Out-of-range displacements are usually handled by loading the full
+address into a register. In many cases it would be better to create
+an anchor point instead. E.g. for:
+
+define void @f4a(i128 *%aptr, i64 %base) {
+ %addr = add i64 %base, 524288
+ %bptr = inttoptr i64 %addr to i128 *
+ %a = load volatile i128 *%aptr
+ %b = load i128 *%bptr
+ %add = add i128 %a, %b
+ store i128 %add, i128 *%aptr
+ ret void
+}
+
+(from CodeGen/SystemZ/int-add-08.ll) we load %base+524288 and %base+524296
+into separate registers, rather than using %base+524288 as a base for both.
+
+--
+
+Dynamic stack allocations round the size to 8 bytes and then allocate
+that rounded amount. It would be simpler to subtract the unrounded
+size from the copy of the stack pointer and then align the result.
+See CodeGen/SystemZ/alloca-01.ll for an example.
+
+--
+
+If needed, we can support 16-byte atomics using LPQ, STPQ and CSDG.
+
+--
+
+We might want to model all access registers and use them to spill
+32-bit values.
+
+--
+
+We might want to use the 'overflow' condition of e.g. AR to support
+llvm.sadd.with.overflow.i32 and related instructions - the generated code
+for signed overflow check is currently quite bad. This would improve
+the results of using -ftrapv.
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZ.h b/contrib/llvm/lib/Target/SystemZ/SystemZ.h
new file mode 100644
index 000000000000..9a8e508e4119
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZ.h
@@ -0,0 +1,185 @@
+//==- SystemZ.h - Top-Level Interface for SystemZ representation -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM SystemZ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZ_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZ_H
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+class SystemZTargetMachine;
+class FunctionPass;
+
+namespace SystemZ {
+// Condition-code mask values.
+// A mask has four bits, one per condition-code value: CC value N is bit
+// (3 - N), so CCMASK_0 is the most significant bit and CCMASK_3 the least.
+const unsigned CCMASK_0 = 1 << 3;
+const unsigned CCMASK_1 = 1 << 2;
+const unsigned CCMASK_2 = 1 << 1;
+const unsigned CCMASK_3 = 1 << 0;
+const unsigned CCMASK_ANY = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+// Condition-code mask assignments for integer and floating-point
+// comparisons.
+const unsigned CCMASK_CMP_EQ = CCMASK_0;
+const unsigned CCMASK_CMP_LT = CCMASK_1;
+const unsigned CCMASK_CMP_GT = CCMASK_2;
+const unsigned CCMASK_CMP_NE = CCMASK_CMP_LT | CCMASK_CMP_GT;
+const unsigned CCMASK_CMP_LE = CCMASK_CMP_EQ | CCMASK_CMP_LT;
+const unsigned CCMASK_CMP_GE = CCMASK_CMP_EQ | CCMASK_CMP_GT;
+
+// Condition-code mask assignments for floating-point comparisons only.
+// "Ordered" is every CC value except the unordered one.
+const unsigned CCMASK_CMP_UO = CCMASK_3;
+const unsigned CCMASK_CMP_O = CCMASK_ANY ^ CCMASK_CMP_UO;
+
+// All condition-code values produced by comparisons.
+const unsigned CCMASK_ICMP = CCMASK_0 | CCMASK_1 | CCMASK_2;
+const unsigned CCMASK_FCMP = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+// Condition-code mask assignments for CS.
+const unsigned CCMASK_CS_EQ = CCMASK_0;
+const unsigned CCMASK_CS_NE = CCMASK_1;
+const unsigned CCMASK_CS = CCMASK_0 | CCMASK_1;
+
+// Condition-code mask assignments for a completed SRST loop.
+const unsigned CCMASK_SRST_FOUND = CCMASK_1;
+const unsigned CCMASK_SRST_NOTFOUND = CCMASK_2;
+const unsigned CCMASK_SRST = CCMASK_1 | CCMASK_2;
+
+// Condition-code mask assignments for TEST UNDER MASK.
+// SOME_0 is "anything but all ones"; SOME_1 is "anything but all zeros".
+const unsigned CCMASK_TM_ALL_0 = CCMASK_0;
+const unsigned CCMASK_TM_MIXED_MSB_0 = CCMASK_1;
+const unsigned CCMASK_TM_MIXED_MSB_1 = CCMASK_2;
+const unsigned CCMASK_TM_ALL_1 = CCMASK_3;
+const unsigned CCMASK_TM_SOME_0 = CCMASK_TM_ALL_1 ^ CCMASK_ANY;
+const unsigned CCMASK_TM_SOME_1 = CCMASK_TM_ALL_0 ^ CCMASK_ANY;
+const unsigned CCMASK_TM_MSB_0 = CCMASK_0 | CCMASK_1;
+const unsigned CCMASK_TM_MSB_1 = CCMASK_2 | CCMASK_3;
+const unsigned CCMASK_TM = CCMASK_ANY;
+
+// Condition-code mask assignments for TRANSACTION_BEGIN.
+const unsigned CCMASK_TBEGIN_STARTED = CCMASK_0;
+const unsigned CCMASK_TBEGIN_INDETERMINATE = CCMASK_1;
+const unsigned CCMASK_TBEGIN_TRANSIENT = CCMASK_2;
+const unsigned CCMASK_TBEGIN_PERSISTENT = CCMASK_3;
+const unsigned CCMASK_TBEGIN = CCMASK_ANY;
+
+// Condition-code mask assignments for TRANSACTION_END.
+const unsigned CCMASK_TEND_TX = CCMASK_0;
+const unsigned CCMASK_TEND_NOTX = CCMASK_2;
+const unsigned CCMASK_TEND = CCMASK_TEND_TX | CCMASK_TEND_NOTX;
+
+// Condition-code mask assignments for vector comparisons (and similar
+// operations).
+const unsigned CCMASK_VCMP_ALL = CCMASK_0;
+const unsigned CCMASK_VCMP_MIXED = CCMASK_1;
+const unsigned CCMASK_VCMP_NONE = CCMASK_3;
+const unsigned CCMASK_VCMP = CCMASK_0 | CCMASK_1 | CCMASK_3;
+
+// Condition-code mask assignments for Test Data Class.
+const unsigned CCMASK_TDC_NOMATCH = CCMASK_0;
+const unsigned CCMASK_TDC_MATCH = CCMASK_1;
+const unsigned CCMASK_TDC = CCMASK_TDC_NOMATCH | CCMASK_TDC_MATCH;
+
+// The position of the low CC bit in an IPM result.
+const unsigned IPM_CC = 28;
+
+// Mask assignments for PFD.
+const unsigned PFD_READ = 1;
+const unsigned PFD_WRITE = 2;
+
+// Mask assignments for TDC
+// Twelve single-bit masks, one per data class and sign, running from 0x800
+// down to 0x001.
+const unsigned TDCMASK_ZERO_PLUS = 0x800;
+const unsigned TDCMASK_ZERO_MINUS = 0x400;
+const unsigned TDCMASK_NORMAL_PLUS = 0x200;
+const unsigned TDCMASK_NORMAL_MINUS = 0x100;
+const unsigned TDCMASK_SUBNORMAL_PLUS = 0x080;
+const unsigned TDCMASK_SUBNORMAL_MINUS = 0x040;
+const unsigned TDCMASK_INFINITY_PLUS = 0x020;
+const unsigned TDCMASK_INFINITY_MINUS = 0x010;
+const unsigned TDCMASK_QNAN_PLUS = 0x008;
+const unsigned TDCMASK_QNAN_MINUS = 0x004;
+const unsigned TDCMASK_SNAN_PLUS = 0x002;
+const unsigned TDCMASK_SNAN_MINUS = 0x001;
+
+// Combined TDC masks.  POSITIVE and NEGATIVE cover normal, subnormal and
+// infinite values only; PLUS and MINUS add the zero and NaN classes with
+// that sign.
+const unsigned TDCMASK_ZERO = TDCMASK_ZERO_PLUS | TDCMASK_ZERO_MINUS;
+const unsigned TDCMASK_POSITIVE = TDCMASK_NORMAL_PLUS |
+ TDCMASK_SUBNORMAL_PLUS |
+ TDCMASK_INFINITY_PLUS;
+const unsigned TDCMASK_NEGATIVE = TDCMASK_NORMAL_MINUS |
+ TDCMASK_SUBNORMAL_MINUS |
+ TDCMASK_INFINITY_MINUS;
+const unsigned TDCMASK_NAN = TDCMASK_QNAN_PLUS |
+ TDCMASK_QNAN_MINUS |
+ TDCMASK_SNAN_PLUS |
+ TDCMASK_SNAN_MINUS;
+const unsigned TDCMASK_PLUS = TDCMASK_POSITIVE |
+ TDCMASK_ZERO_PLUS |
+ TDCMASK_QNAN_PLUS |
+ TDCMASK_SNAN_PLUS;
+const unsigned TDCMASK_MINUS = TDCMASK_NEGATIVE |
+ TDCMASK_ZERO_MINUS |
+ TDCMASK_QNAN_MINUS |
+ TDCMASK_SNAN_MINUS;
+const unsigned TDCMASK_ALL = TDCMASK_PLUS | TDCMASK_MINUS;
+
+// Number of bits in a vector register.
+const unsigned VectorBits = 128;
+
+// Number of bytes in a vector register (and consequently the number of
+// bytes in a general permute vector).
+const unsigned VectorBytes = VectorBits / 8;
+
+// True when Val has no bits set outside bits 0-15, i.e. it can be encoded as
+// an LLILL operand.
+static inline bool isImmLL(uint64_t Val) {
+  const uint64_t Field = 0x000000000000ffffULL;
+  return (Val & Field) == Val;
+}
+
+// True when Val has no bits set outside bits 16-31, i.e. it can be encoded as
+// an LLILH operand.
+static inline bool isImmLH(uint64_t Val) {
+  const uint64_t Field = 0x00000000ffff0000ULL;
+  return (Val & Field) == Val;
+}
+
+// True when Val has no bits set outside bits 32-47, i.e. it can be encoded as
+// an LLIHL operand.
+static inline bool isImmHL(uint64_t Val) {
+  const uint64_t Field = 0x0000ffff00000000ULL;
+  return (Val & Field) == Val;
+}
+
+// True when Val has no bits set outside bits 48-63, i.e. it can be encoded as
+// an LLIHH operand.
+static inline bool isImmHH(uint64_t Val) {
+  const uint64_t Field = 0xffff000000000000ULL;
+  return (Val & Field) == Val;
+}
+
+// True when Val has no bits set outside the low 32 bits, i.e. it can be
+// encoded as an LLILF operand.
+static inline bool isImmLF(uint64_t Val) {
+  const uint64_t Field = 0x00000000ffffffffULL;
+  return (Val & Field) == Val;
+}
+
+// True when Val has no bits set outside the high 32 bits, i.e. it can be
+// encoded as an LLIHF operand.
+static inline bool isImmHF(uint64_t Val) {
+  const uint64_t Field = 0xffffffff00000000ULL;
+  return (Val & Field) == Val;
+}
+} // end namespace SystemZ
+
+FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZExpandPseudoPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZTDCPass();
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZ.td b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
new file mode 100644
index 000000000000..6bdfd4d07edc
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
@@ -0,0 +1,75 @@
+//===-- SystemZ.td - Describe the SystemZ target machine -----*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SystemZ subtarget features
+//===----------------------------------------------------------------------===//
+
+include "SystemZFeatures.td"
+
+//===----------------------------------------------------------------------===//
+// SystemZ subtarget scheduling models
+//===----------------------------------------------------------------------===//
+
+include "SystemZSchedule.td"
+
+//===----------------------------------------------------------------------===//
+// SystemZ supported processors
+//===----------------------------------------------------------------------===//
+
+include "SystemZProcessors.td"
+
+//===----------------------------------------------------------------------===//
+// Register file description
+//===----------------------------------------------------------------------===//
+
+include "SystemZRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling convention description
+//===----------------------------------------------------------------------===//
+
+include "SystemZCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction descriptions
+//===----------------------------------------------------------------------===//
+
+include "SystemZOperators.td"
+include "SystemZOperands.td"
+include "SystemZPatterns.td"
+include "SystemZInstrFormats.td"
+include "SystemZInstrInfo.td"
+include "SystemZInstrVector.td"
+include "SystemZInstrFP.td"
+
+def SystemZInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// Assembly parser
+//===----------------------------------------------------------------------===//
+
+// Assembly-parser description referenced by the SystemZ target definition.
+// NOTE(review): ShouldEmitMatchRegisterName = 0 presumably stops TableGen
+// from generating a MatchRegisterName function -- confirm against Target.td.
+def SystemZAsmParser : AsmParser {
+ let ShouldEmitMatchRegisterName = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-level target declaration
+//===----------------------------------------------------------------------===//
+
+def SystemZ : Target {
+ let InstructionSet = SystemZInstrInfo;
+ let AssemblyParsers = [SystemZAsmParser];
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
new file mode 100644
index 000000000000..b39245b20b3c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -0,0 +1,527 @@
+//===-- SystemZAsmPrinter.cpp - SystemZ LLVM assembly printer -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Streams SystemZ assembly language and associated data, in the form of
+// MCInsts and MCExprs respectively.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZAsmPrinter.h"
+#include "InstPrinter/SystemZInstPrinter.h"
+#include "SystemZConstantPoolValue.h"
+#include "SystemZMCInstLower.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GR32s.
+static MCInst lowerRILow(const MachineInstr *MI, unsigned Opcode) {
+ if (MI->isCompare())
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+ .addImm(MI->getOperand(1).getImm());
+ else
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsGR32(MI->getOperand(1).getReg()))
+ .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GRH32s.
+static MCInst lowerRIHigh(const MachineInstr *MI, unsigned Opcode) {
+ if (MI->isCompare())
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+ .addImm(MI->getOperand(1).getImm());
+ else
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(1).getReg()))
+ .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RIEf instruction like MI with opcode Opcode, but with the
+// R2 register (operand 2) turned into a GR64.
+static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
+ return MCInstBuilder(Opcode)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()))
+ .addImm(MI->getOperand(3).getImm())
+ .addImm(MI->getOperand(4).getImm())
+ .addImm(MI->getOperand(5).getImm());
+}
+
+static const MCSymbolRefExpr *getTLSGetOffset(MCContext &Context) {
+ StringRef Name = "__tls_get_offset";
+ return MCSymbolRefExpr::create(Context.getOrCreateSymbol(Name),
+ MCSymbolRefExpr::VK_PLT,
+ Context);
+}
+
+static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
+ StringRef Name = "_GLOBAL_OFFSET_TABLE_";
+ return MCSymbolRefExpr::create(Context.getOrCreateSymbol(Name),
+ MCSymbolRefExpr::VK_None,
+ Context);
+}
+
+// MI loads the high part of a vector from memory. Return an instruction
+// that uses replicating vector load Opcode to do the same thing.
+static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) {
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(MI->getOperand(3).getReg());
+}
+
+// MI stores the high part of a vector to memory. Return an instruction
+// that uses elemental vector store Opcode to do the same thing.
+static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) {
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(MI->getOperand(3).getReg())
+ .addImm(0);
+}
+
+void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ SystemZMCInstLower Lower(MF->getContext(), *this);
+ MCInst LoweredMI;
+ switch (MI->getOpcode()) {
+ case SystemZ::Return:
+ LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D);
+ break;
+
+ case SystemZ::CondReturn:
+ LoweredMI = MCInstBuilder(SystemZ::BCR)
+ .addImm(MI->getOperand(0).getImm())
+ .addImm(MI->getOperand(1).getImm())
+ .addReg(SystemZ::R14D);
+ break;
+
+ case SystemZ::CRBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CGRBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CGRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CIBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CGIBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CGIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLRBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CLRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLGRBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CLGRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLIBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CLIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLGIBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CLGIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CallBRASL:
+ LoweredMI = MCInstBuilder(SystemZ::BRASL)
+ .addReg(SystemZ::R14D)
+ .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+ break;
+
+ case SystemZ::CallBASR:
+ LoweredMI = MCInstBuilder(SystemZ::BASR)
+ .addReg(SystemZ::R14D)
+ .addReg(MI->getOperand(0).getReg());
+ break;
+
+ case SystemZ::CallJG:
+ LoweredMI = MCInstBuilder(SystemZ::JG)
+ .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+ break;
+
+ case SystemZ::CallBRCL:
+ LoweredMI = MCInstBuilder(SystemZ::BRCL)
+ .addImm(MI->getOperand(0).getImm())
+ .addImm(MI->getOperand(1).getImm())
+ .addExpr(Lower.getExpr(MI->getOperand(2), MCSymbolRefExpr::VK_PLT));
+ break;
+
+ case SystemZ::CallBR:
+ LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
+ break;
+
+ case SystemZ::CallBCR:
+ LoweredMI = MCInstBuilder(SystemZ::BCR)
+ .addImm(MI->getOperand(0).getImm())
+ .addImm(MI->getOperand(1).getImm())
+ .addReg(SystemZ::R1D);
+ break;
+
+ case SystemZ::CRBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CGRBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CGRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CIBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CGIBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CGIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLRBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CLRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLGRBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CLGRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLIBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CLIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLGIBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CLGIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::TLS_GDCALL:
+ LoweredMI = MCInstBuilder(SystemZ::BRASL)
+ .addReg(SystemZ::R14D)
+ .addExpr(getTLSGetOffset(MF->getContext()))
+ .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSGD));
+ break;
+
+ case SystemZ::TLS_LDCALL:
+ LoweredMI = MCInstBuilder(SystemZ::BRASL)
+ .addReg(SystemZ::R14D)
+ .addExpr(getTLSGetOffset(MF->getContext()))
+ .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSLDM));
+ break;
+
+ case SystemZ::GOT:
+ LoweredMI = MCInstBuilder(SystemZ::LARL)
+ .addReg(MI->getOperand(0).getReg())
+ .addExpr(getGlobalOffsetTable(MF->getContext()));
+ break;
+
+ case SystemZ::IILF64:
+ LoweredMI = MCInstBuilder(SystemZ::IILF)
+ .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+ .addImm(MI->getOperand(2).getImm());
+ break;
+
+ case SystemZ::IIHF64:
+ LoweredMI = MCInstBuilder(SystemZ::IIHF)
+ .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+ .addImm(MI->getOperand(2).getImm());
+ break;
+
+ case SystemZ::RISBHH:
+ case SystemZ::RISBHL:
+ LoweredMI = lowerRIEfLow(MI, SystemZ::RISBHG);
+ break;
+
+ case SystemZ::RISBLH:
+ case SystemZ::RISBLL:
+ LoweredMI = lowerRIEfLow(MI, SystemZ::RISBLG);
+ break;
+
+ case SystemZ::VLVGP32:
+ LoweredMI = MCInstBuilder(SystemZ::VLVGP)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(SystemZMC::getRegAsGR64(MI->getOperand(1).getReg()))
+ .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()));
+ break;
+
+ case SystemZ::VLR32:
+ case SystemZ::VLR64:
+ LoweredMI = MCInstBuilder(SystemZ::VLR)
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()));
+ break;
+
+ case SystemZ::VL32:
+ LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF);
+ break;
+
+ case SystemZ::VL64:
+ LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG);
+ break;
+
+ case SystemZ::VST32:
+ LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF);
+ break;
+
+ case SystemZ::VST64:
+ LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEG);
+ break;
+
+ case SystemZ::LFER:
+ LoweredMI = MCInstBuilder(SystemZ::VLGVF)
+ .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()))
+ .addReg(0).addImm(0);
+ break;
+
+ case SystemZ::LEFR:
+ LoweredMI = MCInstBuilder(SystemZ::VLVGF)
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(MI->getOperand(1).getReg())
+ .addReg(0).addImm(0);
+ break;
+
+#define LOWER_LOW(NAME) \
+ case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break
+
+ LOWER_LOW(IILL);
+ LOWER_LOW(IILH);
+ LOWER_LOW(TMLL);
+ LOWER_LOW(TMLH);
+ LOWER_LOW(NILL);
+ LOWER_LOW(NILH);
+ LOWER_LOW(NILF);
+ LOWER_LOW(OILL);
+ LOWER_LOW(OILH);
+ LOWER_LOW(OILF);
+ LOWER_LOW(XILF);
+
+#undef LOWER_LOW
+
+#define LOWER_HIGH(NAME) \
+ case SystemZ::NAME##64: LoweredMI = lowerRIHigh(MI, SystemZ::NAME); break
+
+ LOWER_HIGH(IIHL);
+ LOWER_HIGH(IIHH);
+ LOWER_HIGH(TMHL);
+ LOWER_HIGH(TMHH);
+ LOWER_HIGH(NIHL);
+ LOWER_HIGH(NIHH);
+ LOWER_HIGH(NIHF);
+ LOWER_HIGH(OIHL);
+ LOWER_HIGH(OIHH);
+ LOWER_HIGH(OIHF);
+ LOWER_HIGH(XIHF);
+
+#undef LOWER_HIGH
+
+ case SystemZ::Serialize:
+ if (MF->getSubtarget<SystemZSubtarget>().hasFastSerialization())
+ LoweredMI = MCInstBuilder(SystemZ::BCRAsm)
+ .addImm(14).addReg(SystemZ::R0D);
+ else
+ LoweredMI = MCInstBuilder(SystemZ::BCRAsm)
+ .addImm(15).addReg(SystemZ::R0D);
+ break;
+
+ // Emit nothing here but a comment if we can.
+ case SystemZ::MemBarrier:
+ OutStreamer->emitRawComment("MEMBARRIER");
+ return;
+
+ // We want to emit "j .+2" for traps, jumping to the relative immediate field
+ // of the jump instruction, which is an illegal instruction. We cannot emit a
+ // "." symbol, so create and emit a temp label before the instruction and use
+ // that instead.
+ case SystemZ::Trap: {
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(DotSym);
+
+ const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(DotSym, OutContext);
+ const MCConstantExpr *ConstExpr = MCConstantExpr::create(2, OutContext);
+ LoweredMI = MCInstBuilder(SystemZ::J)
+ .addExpr(MCBinaryExpr::createAdd(Expr, ConstExpr, OutContext));
+ }
+ break;
+
+ // Conditional traps will create a branch on condition instruction that jumps
+ // to the relative immediate field of the jump instruction. (eg. "jo .+2")
+ case SystemZ::CondTrap: {
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(DotSym);
+
+ const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(DotSym, OutContext);
+ const MCConstantExpr *ConstExpr = MCConstantExpr::create(2, OutContext);
+ LoweredMI = MCInstBuilder(SystemZ::BRC)
+ .addImm(MI->getOperand(0).getImm())
+ .addImm(MI->getOperand(1).getImm())
+ .addExpr(MCBinaryExpr::createAdd(Expr, ConstExpr, OutContext));
+ }
+ break;
+
+ default:
+ Lower.lower(MI, LoweredMI);
+ break;
+ }
+ EmitToStreamer(*OutStreamer, LoweredMI);
+}
+
+// Convert a SystemZ-specific constant pool modifier into the associated
+// MCSymbolRefExpr variant kind. Each case maps to the same-named VK_ value.
+static MCSymbolRefExpr::VariantKind
+getModifierVariantKind(SystemZCP::SystemZCPModifier Modifier) {
+ switch (Modifier) {
+ case SystemZCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD;
+ case SystemZCP::TLSLDM: return MCSymbolRefExpr::VK_TLSLDM;
+ case SystemZCP::DTPOFF: return MCSymbolRefExpr::VK_DTPOFF;
+ case SystemZCP::NTPOFF: return MCSymbolRefExpr::VK_NTPOFF;
+ }
+ llvm_unreachable("Invalid SystemZCPModifier!");
+}
+
+void SystemZAsmPrinter::
+EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+ auto *ZCPV = static_cast<SystemZConstantPoolValue*>(MCPV);
+
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(getSymbol(ZCPV->getGlobalValue()),
+ getModifierVariantKind(ZCPV->getModifier()),
+ OutContext);
+ uint64_t Size = getDataLayout().getTypeAllocSize(ZCPV->getType());
+
+ OutStreamer->EmitValue(Expr, Size);
+}
+
+bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ if (ExtraCode && *ExtraCode == 'n') {
+ if (!MI->getOperand(OpNo).isImm())
+ return true;
+ OS << -int64_t(MI->getOperand(OpNo).getImm());
+ } else {
+ SystemZMCInstLower Lower(MF->getContext(), *this);
+ MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
+ SystemZInstPrinter::printOperand(MO, MAI, OS);
+ }
+ return false;
+}
+
+bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ SystemZInstPrinter::printAddress(MI->getOperand(OpNo).getReg(),
+ MI->getOperand(OpNo + 1).getImm(),
+ MI->getOperand(OpNo + 2).getReg(), OS);
+ return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSystemZAsmPrinter() {
+ RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget());
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
new file mode 100644
index 000000000000..fe8c88fe23e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -0,0 +1,42 @@
+//===-- SystemZAsmPrinter.h - SystemZ LLVM assembly printer ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class MCStreamer;
+class MachineBasicBlock;
+class MachineInstr;
+class Module;
+class raw_ostream;
+
+class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
+public:
+ SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+ // Override AsmPrinter.
+ StringRef getPassName() const override { return "SystemZ Assembly Printer"; }
+ void EmitInstruction(const MachineInstr *MI) override;
+ void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
new file mode 100644
index 000000000000..72da51f74b10
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -0,0 +1,21 @@
+//===-- SystemZCallingConv.cpp - Calling conventions for SystemZ ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZCallingConv.h"
+#include "SystemZRegisterInfo.h"
+
+using namespace llvm;
+
+const MCPhysReg SystemZ::ArgGPRs[SystemZ::NumArgGPRs] = {
+ SystemZ::R2D, SystemZ::R3D, SystemZ::R4D, SystemZ::R5D, SystemZ::R6D
+};
+
+const MCPhysReg SystemZ::ArgFPRs[SystemZ::NumArgFPRs] = {
+ SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
+};
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h
new file mode 100644
index 000000000000..b5523e586f4c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -0,0 +1,130 @@
+//===-- SystemZCallingConv.h - Calling conventions for SystemZ --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+namespace SystemZ {
+ const unsigned NumArgGPRs = 5;
+ extern const MCPhysReg ArgGPRs[NumArgGPRs];
+
+ const unsigned NumArgFPRs = 4;
+ extern const MCPhysReg ArgFPRs[NumArgFPRs];
+} // end namespace SystemZ
+
+class SystemZCCState : public CCState {
+private:
+ /// Records whether the value was a fixed argument.
+ /// See ISD::OutputArg::IsFixed.
+ SmallVector<bool, 4> ArgIsFixed;
+
+ /// Records whether the value was widened from a short vector type.
+ SmallVector<bool, 4> ArgIsShortVector;
+
+ // Check whether ArgVT is a short vector type.
+ bool IsShortVectorType(EVT ArgVT) {
+ return ArgVT.isVector() && ArgVT.getStoreSize() <= 8;
+ }
+
+public:
+ SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+ SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
+ : CCState(CC, isVarArg, MF, locs, C) {}
+
+ void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn) {
+ // Formal arguments are always fixed.
+ ArgIsFixed.clear();
+ for (unsigned i = 0; i < Ins.size(); ++i)
+ ArgIsFixed.push_back(true);
+ // Record whether the formal argument was a short vector.
+ ArgIsShortVector.clear();
+ for (unsigned i = 0; i < Ins.size(); ++i)
+ ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT));
+
+ CCState::AnalyzeFormalArguments(Ins, Fn);
+ }
+
+ void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
+ // Record whether the call operand was a fixed argument.
+ ArgIsFixed.clear();
+ for (unsigned i = 0; i < Outs.size(); ++i)
+ ArgIsFixed.push_back(Outs[i].IsFixed);
+ // Record whether the call operand was a short vector.
+ ArgIsShortVector.clear();
+ for (unsigned i = 0; i < Outs.size(); ++i)
+ ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT));
+
+ CCState::AnalyzeCallOperands(Outs, Fn);
+ }
+
+ // This version of AnalyzeCallOperands in the base class is not usable
+ // since we must provide a means of accessing ISD::OutputArg::IsFixed.
+ void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+ CCAssignFn Fn) = delete;
+
+ bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
+ bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
+};
+
+// Handle i128 argument types. These need to be passed by implicit
+// reference. This could be as simple as the following .td line:
+// CCIfType<[i128], CCPassIndirect<i64>>,
+// except that i128 is not a legal type, and therefore gets split by
+// common code into a pair of i64 arguments.
+inline bool CC_SystemZ_I128Indirect(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // ArgFlags.isSplit() is true on the first part of an i128 argument;
+ // PendingMembers.empty() is false on all subsequent parts.
+ if (!ArgFlags.isSplit() && PendingMembers.empty())
+ return false;
+
+ // Push a pending Indirect value location for each part.
+ LocVT = MVT::i64;
+ LocInfo = CCValAssign::Indirect;
+ PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT,
+ LocVT, LocInfo));
+ if (!ArgFlags.isSplitEnd())
+ return true;
+
+ // OK, we've collected all parts in the pending list. Allocate
+ // the location (register or stack slot) for the indirect pointer.
+ // (This duplicates the usual i64 calling convention rules.)
+ unsigned Reg = State.AllocateReg(SystemZ::ArgGPRs);
+ unsigned Offset = Reg ? 0 : State.AllocateStack(8, 8);
+
+ // Use that same location for all the pending parts.
+ for (auto &It : PendingMembers) {
+ if (Reg)
+ It.convertToReg(Reg);
+ else
+ It.convertToMem(Offset);
+ State.addLoc(It);
+ }
+
+ PendingMembers.clear();
+
+ return true;
+}
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td
new file mode 100644
index 000000000000..2bf5ac29865f
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -0,0 +1,122 @@
+//=- SystemZCallingConv.td - Calling conventions for SystemZ -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for the SystemZ ABI.
+//===----------------------------------------------------------------------===//
+
+class CCIfExtend<CCAction A>
+ : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
+
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("static_cast<const SystemZSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).", F),
+ A>;
+
+// Match if this specific argument is a fixed (i.e. named) argument.
+class CCIfFixed<CCAction A>
+ : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
+
+// Match if this specific argument was widened from a short vector type.
+class CCIfShortVector<CCAction A>
+ : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
+
+
+//===----------------------------------------------------------------------===//
+// z/Linux return value calling convention
+//===----------------------------------------------------------------------===//
+def RetCC_SystemZ : CallingConv<[
+ // Promote i32 to i64 if it has an explicit extension type.
+ CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+
+ // A SwiftError is returned in R9.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R9D]>>>,
+
+ // ABI-compliant code returns 64-bit integers in R2. Make the other
+ // call-clobbered argument registers available for code that doesn't
+ // care about the ABI. (R6 is an argument register too, but is
+ // call-saved and therefore not suitable for return values.)
+ CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L]>>,
+ CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D]>>,
+
+ // ABI-compliant code returns float and double in F0. Make the
+ // other floating-point argument registers available for code that
+ // doesn't care about the ABI. All floating-point argument registers
+ // are call-clobbered, so we can use all of them here.
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
+
+ // Similarly for vectors, with V24 being the ABI-compliant choice.
+ // Sub-128 vectors are returned in the same way, but they're widened
+ // to one of these types during type legalization.
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// z/Linux argument calling conventions
+//===----------------------------------------------------------------------===//
+def CC_SystemZ : CallingConv<[
+ // Promote i32 to i64 if it has an explicit extension type.
+ // The convention is that true integer arguments that are smaller
+ // than 64 bits should be marked as extended, but structures that
+ // are smaller than 64 bits shouldn't.
+ CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+
+ // A SwiftSelf is passed in callee-saved R10.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>,
+
+ // A SwiftError is passed in callee-saved R9.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R9D]>>>,
+
+ // Force long double values to the stack and pass i64 pointers to them.
+ CCIfType<[f128], CCPassIndirect<i64>>,
+ // Same for i128 values. These are already split into two i64 here,
+ // so we have to use a custom handler.
+ CCIfType<[i64], CCCustom<"CC_SystemZ_I128Indirect">>,
+
+ // The first 5 integer arguments are passed in R2-R6. Note that R6
+ // is call-saved.
+ CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L, R6L]>>,
+ CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
+
+ // The first 4 float and double arguments are passed in even registers F0-F6.
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
+
+ // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
+ // are passed in the same way, but they're widened to one of these types
+ // during type legalization.
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
+ V25, V27, V29, V31]>>>>,
+
+ // However, sub-128 vectors which need to go on the stack occupy just a
+ // single 8-byte-aligned 8-byte stack slot. Pass as i64.
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfShortVector<CCBitConvertToType<i64>>>>,
+
+ // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToStack<16, 8>>>,
+
+ // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// z/Linux callee-saved registers
+//===----------------------------------------------------------------------===//
+def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15),
+ (sequence "F%dD", 8, 15))>;
+
+// R9 is used to return SwiftError; remove it from CSR.
+def CSR_SystemZ_SwiftError : CalleeSavedRegs<(sub CSR_SystemZ, R9D)>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
new file mode 100644
index 000000000000..4a6beb67f182
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -0,0 +1,52 @@
+//===-- SystemZConstantPoolValue.cpp - SystemZ constant-pool value --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+SystemZConstantPoolValue::
+SystemZConstantPoolValue(const GlobalValue *gv,
+ SystemZCP::SystemZCPModifier modifier)
+ : MachineConstantPoolValue(gv->getType()), GV(gv), Modifier(modifier) {}
+
+SystemZConstantPoolValue *
+SystemZConstantPoolValue::Create(const GlobalValue *GV,
+ SystemZCP::SystemZCPModifier Modifier) {
+ return new SystemZConstantPoolValue(GV, Modifier);
+}
+
+int SystemZConstantPoolValue::
+getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
+ unsigned AlignMask = Alignment - 1;
+ const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+ for (unsigned I = 0, E = Constants.size(); I != E; ++I) {
+ if (Constants[I].isMachineConstantPoolEntry() &&
+ (Constants[I].getAlignment() & AlignMask) == 0) {
+ auto *ZCPV =
+ static_cast<SystemZConstantPoolValue *>(Constants[I].Val.MachineCPVal);
+ if (ZCPV->GV == GV && ZCPV->Modifier == Modifier)
+ return I;
+ }
+ }
+ return -1;
+}
+
+void SystemZConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddPointer(GV);
+ ID.AddInteger(Modifier);
+}
+
+void SystemZConstantPoolValue::print(raw_ostream &O) const {
+ O << GV << "@" << int(Modifier);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h
new file mode 100644
index 000000000000..a71b595560d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -0,0 +1,58 @@
+//===- SystemZConstantPoolValue.h - SystemZ constant-pool value -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCONSTANTPOOLVALUE_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class GlobalValue;
+
+namespace SystemZCP {
+enum SystemZCPModifier { // Modifier paired with a global in a SystemZConstantPoolValue.
+ TLSGD, // Written x@TLSGD.
+ TLSLDM, // Written x@TLSLDM.
+ DTPOFF, // Written x@DTPOFF.
+ NTPOFF // Written x@NTPOFF.
+};
+} // end namespace SystemZCP
+
+/// A SystemZ-specific constant pool value. At present, the only
+/// defined constant pool values are module IDs or offsets of
+/// thread-local variables (written x@TLSGD, x@TLSLDM, x@DTPOFF,
+/// or x@NTPOFF).
+class SystemZConstantPoolValue : public MachineConstantPoolValue {
+ const GlobalValue *GV; // The global this value refers to.
+ SystemZCP::SystemZCPModifier Modifier; // Which form of the value is requested for GV.
+
+protected:
+ SystemZConstantPoolValue(const GlobalValue *GV, // Protected: instances are obtained through Create().
+ SystemZCP::SystemZCPModifier Modifier);
+
+public:
+ static SystemZConstantPoolValue * // Factory for a (GV, Modifier) value.
+ Create(const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier);
+
+ // Override MachineConstantPoolValue.
+ int getExistingMachineCPValue(MachineConstantPool *CP, // Index of an equivalent entry in CP, or -1.
+ unsigned Alignment) override;
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; // Adds GV and Modifier to ID.
+ void print(raw_ostream &O) const override; // Prints GV, '@', then the modifier as an integer.
+
+ // Access SystemZ-specific fields.
+ const GlobalValue *getGlobalValue() const { return GV; }
+ SystemZCP::SystemZCPModifier getModifier() const { return Modifier; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
new file mode 100644
index 000000000000..b4c843f658aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -0,0 +1,575 @@
+//===-- SystemZElimCompare.cpp - Eliminate comparison instructions --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass:
+// (1) tries to remove compares if CC already contains the required information
+// (2) fuses compares and branches into COMPARE AND BRANCH instructions
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-elim-compare"
+
+STATISTIC(BranchOnCounts, "Number of branch-on-count instructions");
+STATISTIC(LoadAndTraps, "Number of load-and-trap instructions");
+STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
+STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
+
+namespace {
+// Represents the references to a particular register in one or more
+// instructions. Results for several instructions are merged with |=.
+struct Reference {
+ Reference()
+ : Def(false), Use(false) {} // Starts out as "no reference seen".
+
+ Reference &operator|=(const Reference &Other) { // Accumulate: a flag stays set once any operand set it.
+ Def |= Other.Def;
+ Use |= Other.Use;
+ return *this;
+ }
+
+ explicit operator bool() const { return Def || Use; } // True if the register is referenced at all.
+
+ // True if the register is defined or used in some form, either directly or
+ // via a sub- or super-register.
+ bool Def;
+ bool Use;
+};
+
+class SystemZElimCompare : public MachineFunctionPass { // Removes or fuses compares; see the file header.
+public:
+ static char ID;
+ SystemZElimCompare(const SystemZTargetMachine &tm) // tm is unused here; TII/TRI are set in runOnMachineFunction.
+ : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {}
+
+ StringRef getPassName() const override {
+ return "SystemZ Comparison Elimination";
+ }
+
+ bool processBlock(MachineBasicBlock &MBB); // Handles one block; returns true if it changed.
+ bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override { // The pass requires the NoVRegs property.
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ Reference getRegReferences(MachineInstr &MI, unsigned Reg); // Defs/uses of Reg or overlapping registers in MI.
+ bool convertToBRCT(MachineInstr &MI, MachineInstr &Compare, // Add of -1 + compare + branch -> branch-on-count.
+ SmallVectorImpl<MachineInstr *> &CCUsers);
+ bool convertToLoadAndTrap(MachineInstr &MI, MachineInstr &Compare, // Load + compare + CondTrap -> load-and-trap.
+ SmallVectorImpl<MachineInstr *> &CCUsers);
+ bool convertToLoadAndTest(MachineInstr &MI); // Load -> LOAD AND TEST, which also defines CC.
+ bool adjustCCMasksForInstr(MachineInstr &MI, MachineInstr &Compare, // Retarget CC users at MI's own CC result.
+ SmallVectorImpl<MachineInstr *> &CCUsers);
+ bool optimizeCompareZero(MachineInstr &Compare, // Tries the conversions above for a compare against zero.
+ SmallVectorImpl<MachineInstr *> &CCUsers);
+ bool fuseCompareOperations(MachineInstr &Compare, // Compare + single CC user -> fused instruction.
+ SmallVectorImpl<MachineInstr *> &CCUsers);
+
+ const SystemZInstrInfo *TII; // Set per function in runOnMachineFunction.
+ const TargetRegisterInfo *TRI; // Set per function in runOnMachineFunction.
+};
+
+char SystemZElimCompare::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) { // Factory for the pass.
+ return new SystemZElimCompare(TM); // Raw new'ed pass object handed to the caller.
+}
+
+// Return true if some successor of MBB lists CC among its live-in registers.
+static bool isCCLiveOut(MachineBasicBlock &MBB) {
+  bool LiveOut = false;
+  for (auto Succ = MBB.succ_begin(); !LiveOut && Succ != MBB.succ_end(); ++Succ)
+    LiveOut = (*Succ)->isLiveIn(SystemZ::CC);
+  return LiveOut;
+}
+
+// Return true if any CC result of MI would reflect the value of Reg.
+static bool resultTests(MachineInstr &MI, unsigned Reg) {
+ if (MI.getNumOperands() > 0 && MI.getOperand(0).isReg() && // Case 1: MI's first operand is a def of Reg.
+ MI.getOperand(0).isDef() && MI.getOperand(0).getReg() == Reg)
+ return true;
+
+ switch (MI.getOpcode()) { // Case 2: one of the listed opcodes whose operand 1 is Reg.
+ case SystemZ::LR:
+ case SystemZ::LGR:
+ case SystemZ::LGFR:
+ case SystemZ::LTR:
+ case SystemZ::LTGR:
+ case SystemZ::LTGFR:
+ case SystemZ::LER:
+ case SystemZ::LDR:
+ case SystemZ::LXR:
+ case SystemZ::LTEBR:
+ case SystemZ::LTDBR:
+ case SystemZ::LTXBR:
+ if (MI.getOperand(1).getReg() == Reg) // For these opcodes the source operand also counts.
+ return true;
+ }
+
+ return false;
+}
+
+// Summarize how MI touches Reg or any register that overlaps it.
+Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) {
+  Reference Result;
+  for (unsigned OpNo = 0, NumOps = MI.getNumOperands(); OpNo != NumOps; ++OpNo) {
+    const MachineOperand &Op = MI.getOperand(OpNo);
+    // Only non-zero register operands that overlap Reg are of interest.
+    if (!Op.isReg())
+      continue;
+    unsigned OpReg = Op.getReg();
+    if (!OpReg || !TRI->regsOverlap(OpReg, Reg))
+      continue;
+    if (Op.isUse())
+      Result.Use = true;
+    else if (Op.isDef())
+      Result.Def = true;
+  }
+  return Result;
+}
+
+// Return true if MI is a load-and-test that can be optimized like a compare.
+static bool isLoadAndTestAsCmp(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case SystemZ::LTEBR: case SystemZ::LTDBR: case SystemZ::LTXBR:
+    // When isel used a load-and-test as a compare with 0, its def is dead.
+    return MI.getOperand(0).isDead();
+  default:
+    return false;
+  }
+}
+
+// Return the register holding the unknown value that Compare tests:
+// operand 0 of a real compare, operand 1 of a load-and-test used as one.
+static unsigned getCompareSourceReg(MachineInstr &Compare) {
+  const unsigned SrcOpNo = Compare.isCompare() ? 0 : 1;
+  unsigned reg = 0;
+  if (SrcOpNo == 0 || isLoadAndTestAsCmp(Compare))
+    reg = Compare.getOperand(SrcOpNo).getReg();
+
+  // Any other kind of instruction is a caller error.
+  assert(reg);
+  return reg;
+}
+
+// Compare compares the result of MI against zero. If MI is an addition
+// of -1 and if CCUsers is a single branch on nonzero, eliminate the addition
+// and convert the branch to a BRCT(G) or BRCTH. Return true on success (MI is erased here; Compare is not).
+bool SystemZElimCompare::convertToBRCT(
+ MachineInstr &MI, MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers) {
+ // Check whether we have an addition of -1.
+ unsigned Opcode = MI.getOpcode();
+ unsigned BRCT; // Branch-on-count opcode chosen to match the add's opcode.
+ if (Opcode == SystemZ::AHI)
+ BRCT = SystemZ::BRCT;
+ else if (Opcode == SystemZ::AGHI)
+ BRCT = SystemZ::BRCTG;
+ else if (Opcode == SystemZ::AIH)
+ BRCT = SystemZ::BRCTH;
+ else
+ return false;
+ if (MI.getOperand(2).getImm() != -1) // Operand 2 is the immediate addend.
+ return false;
+
+ // Check whether we have a single JLH.
+ if (CCUsers.size() != 1)
+ return false;
+ MachineInstr *Branch = CCUsers[0];
+ if (Branch->getOpcode() != SystemZ::BRC ||
+ Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP || // Operand 0: valid-CC mask.
+ Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_NE) // Operand 1: CC mask tested.
+ return false;
+
+ // We already know that there are no references to the register between
+ // MI and Compare. Make sure that there are also no references between
+ // Compare and Branch.
+ unsigned SrcReg = getCompareSourceReg(Compare);
+ MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+ for (++MBBI; MBBI != MBBE; ++MBBI)
+ if (getRegReferences(*MBBI, SrcReg))
+ return false;
+
+ // The transformation is OK. Rebuild Branch as a BRCT(G) or BRCTH.
+ MachineOperand Target(Branch->getOperand(2)); // Copy the target before the operands are stripped.
+ while (Branch->getNumOperands())
+ Branch->RemoveOperand(0);
+ Branch->setDesc(TII->get(BRCT));
+ MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
+ MIB.addOperand(MI.getOperand(0)) // New operands: the add's first two operands, then the target.
+ .addOperand(MI.getOperand(1))
+ .addOperand(Target);
+ // Add a CC def to BRCT(G), since we may have to split them again if the
+ // branch displacement overflows. BRCTH has a 32-bit displacement, so
+ // this is not necessary there.
+ if (BRCT != SystemZ::BRCTH)
+ MIB.addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+ MI.eraseFromParent(); // The add is now folded into the branch.
+ return true;
+}
+
+// Compare compares the result of MI against zero. If MI is a suitable load
+// instruction and if CCUsers is a single conditional trap on zero, eliminate
+// the load and convert the trap to a load-and-trap. Return true on success (MI is erased here; Compare is not).
+bool SystemZElimCompare::convertToLoadAndTrap(
+ MachineInstr &MI, MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers) {
+ unsigned LATOpcode = TII->getLoadAndTrap(MI.getOpcode()); // A zero result is treated as "no load-and-trap form".
+ if (!LATOpcode)
+ return false;
+
+ // Check whether we have a single CondTrap that traps on zero.
+ if (CCUsers.size() != 1)
+ return false;
+ MachineInstr *Branch = CCUsers[0]; // Despite the name, this must be a CondTrap (checked next).
+ if (Branch->getOpcode() != SystemZ::CondTrap ||
+ Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP ||
+ Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_EQ)
+ return false;
+
+ // We already know that there are no references to the register between
+ // MI and Compare. Make sure that there are also no references between
+ // Compare and Branch.
+ unsigned SrcReg = getCompareSourceReg(Compare);
+ MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+ for (++MBBI; MBBI != MBBE; ++MBBI)
+ if (getRegReferences(*MBBI, SrcReg))
+ return false;
+
+ // The transformation is OK. Rebuild Branch as a load-and-trap.
+ while (Branch->getNumOperands())
+ Branch->RemoveOperand(0);
+ Branch->setDesc(TII->get(LATOpcode));
+ MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
+ .addOperand(MI.getOperand(0)) // Operands 0-3 of the load are carried over unchanged.
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addOperand(MI.getOperand(3));
+ MI.eraseFromParent(); // The original load is now redundant.
+ return true;
+}
+
+// If MI is a load instruction, try to convert it into a LOAD AND TEST.
+// Return true on success. MI is rewritten in place; nothing is erased.
+bool SystemZElimCompare::convertToLoadAndTest(MachineInstr &MI) {
+ unsigned Opcode = TII->getLoadAndTest(MI.getOpcode()); // A zero result is treated as "no load-and-test form".
+ if (!Opcode)
+ return false;
+
+ MI.setDesc(TII->get(Opcode));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(SystemZ::CC, RegState::ImplicitDefine); // The new form also defines CC.
+ return true;
+}
+
+// The CC users in CCUsers are testing the result of a comparison of some
+// value X against zero and we know that any CC value produced by MI
+// would also reflect the value of X. Try to adjust CCUsers so that
+// they test the result of MI directly, returning true on success.
+// Leave everything unchanged on failure.
+bool SystemZElimCompare::adjustCCMasksForInstr(
+ MachineInstr &MI, MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers) {
+ int Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = TII->get(Opcode);
+ unsigned MIFlags = Desc.TSFlags;
+
+ // See which compare-style condition codes are available.
+ unsigned ReusableCCMask = SystemZII::getCompareZeroCCMask(MIFlags);
+
+ // For unsigned comparisons with zero, only equality makes sense.
+ unsigned CompareFlags = Compare.getDesc().TSFlags;
+ if (CompareFlags & SystemZII::IsLogical)
+ ReusableCCMask &= SystemZ::CCMASK_CMP_EQ;
+
+ if (ReusableCCMask == 0)
+ return false;
+
+ unsigned CCValues = SystemZII::getCCValues(MIFlags);
+ assert((ReusableCCMask & ~CCValues) == 0 && "Invalid CCValues");
+
+ // Now check whether these flags are enough for all users.
+ SmallVector<MachineOperand *, 4> AlterMasks; // Pairs: (CCValid operand, CCMask operand) per user.
+ for (unsigned int I = 0, E = CCUsers.size(); I != E; ++I) {
+ MachineInstr *CCUserMI = CCUsers[I]; // Named so it does not shadow the MI parameter used below.
+
+ // Fail if this isn't a use of CC that we understand.
+ unsigned Flags = CCUserMI->getDesc().TSFlags;
+ unsigned FirstOpNum;
+ if (Flags & SystemZII::CCMaskFirst)
+ FirstOpNum = 0;
+ else if (Flags & SystemZII::CCMaskLast)
+ FirstOpNum = CCUserMI->getNumExplicitOperands() - 2;
+ else
+ return false;
+
+ // Check whether the instruction predicate treats all CC values
+ // outside of ReusableCCMask in the same way. In that case it
+ // doesn't matter what those CC values mean.
+ unsigned CCValid = CCUserMI->getOperand(FirstOpNum).getImm();
+ unsigned CCMask = CCUserMI->getOperand(FirstOpNum + 1).getImm();
+ unsigned OutValid = ~ReusableCCMask & CCValid;
+ unsigned OutMask = ~ReusableCCMask & CCMask;
+ if (OutMask != 0 && OutMask != OutValid)
+ return false;
+
+ AlterMasks.push_back(&CCUserMI->getOperand(FirstOpNum));
+ AlterMasks.push_back(&CCUserMI->getOperand(FirstOpNum + 1));
+ }
+
+ // All users are OK. Adjust the masks for MI.
+ for (unsigned I = 0, E = AlterMasks.size(); I != E; I += 2) {
+ AlterMasks[I]->setImm(CCValues);
+ unsigned CCMask = AlterMasks[I + 1]->getImm();
+ if (CCMask & ~ReusableCCMask) // User accepted the non-reusable values: map them onto MI's.
+ AlterMasks[I + 1]->setImm((CCMask & ReusableCCMask) |
+ (CCValues & ~ReusableCCMask));
+ }
+
+ // CC is now live after MI.
+ int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
+ assert(CCDef >= 0 && "Couldn't find CC set");
+ MI.getOperand(CCDef).setIsDead(false);
+
+ // Clear any intervening kills of CC.
+ MachineBasicBlock::iterator MBBI = MI, MBBE = Compare;
+ for (++MBBI; MBBI != MBBE; ++MBBI)
+ MBBI->clearRegisterKills(SystemZ::CC, TRI);
+
+ return true;
+}
+
+// Return true if Compare tests its source value against zero.
+static bool isCompareZero(MachineInstr &Compare) {
+  const unsigned Opc = Compare.getOpcode();
+  // These opcodes are accepted unconditionally.
+  if (Opc == SystemZ::LTEBRCompare || Opc == SystemZ::LTDBRCompare ||
+      Opc == SystemZ::LTXBRCompare)
+    return true;
+
+  // So is a load-and-test that is only being used as a compare.
+  if (isLoadAndTestAsCmp(Compare))
+    return true;
+
+  // Otherwise require exactly two explicit operands, the second being imm 0.
+  if (Compare.getNumExplicitOperands() != 2)
+    return false;
+  const MachineOperand &RHS = Compare.getOperand(1);
+  return RHS.isImm() && RHS.getImm() == 0;
+}
+
+// Try to optimize cases where comparison instruction Compare is testing
+// a value against zero. Return true on success and if Compare should be
+// deleted as dead. CCUsers is the list of instructions that use the CC
+// value produced by Compare.
+bool SystemZElimCompare::optimizeCompareZero(
+ MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) {
+ if (!isCompareZero(Compare))
+ return false;
+
+ // Search back for CC results that are based on the first operand.
+ unsigned SrcReg = getCompareSourceReg(Compare);
+ MachineBasicBlock &MBB = *Compare.getParent();
+ MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin();
+ Reference CCRefs; // CC defs/uses seen between the current MI and Compare.
+ Reference SrcRefs; // SrcReg defs/uses seen between the current MI and Compare.
+ while (MBBI != MBBE) {
+ --MBBI;
+ MachineInstr &MI = *MBBI;
+ if (resultTests(MI, SrcReg)) {
+ // Try to remove both MI and Compare by converting a branch to BRCT(G)
+ // or a load-and-trap instruction. We don't care in this case whether
+ // CC is modified between MI and Compare.
+ if (!CCRefs.Use && !SrcRefs) {
+ if (convertToBRCT(MI, Compare, CCUsers)) {
+ BranchOnCounts += 1;
+ return true;
+ }
+ if (convertToLoadAndTrap(MI, Compare, CCUsers)) {
+ LoadAndTraps += 1;
+ return true;
+ }
+ }
+ // Try to eliminate Compare by reusing a CC result from MI.
+ if ((!CCRefs && convertToLoadAndTest(MI)) || // Needs no CC def or use in between.
+ (!CCRefs.Def && adjustCCMasksForInstr(MI, Compare, CCUsers))) { // Needs no CC def in between.
+ EliminatedComparisons += 1;
+ return true;
+ }
+ }
+ SrcRefs |= getRegReferences(MI, SrcReg);
+ if (SrcRefs.Def) // SrcReg is redefined here: earlier instructions see a different value.
+ return false;
+ CCRefs |= getRegReferences(MI, SystemZ::CC);
+ if (CCRefs.Use && CCRefs.Def) // With both set, none of the conversions above can apply any more.
+ return false;
+ }
+ return false;
+}
+
+// Try to fuse comparison instruction Compare into a later branch.
+// Return true on success and if Compare is therefore redundant.
+bool SystemZElimCompare::fuseCompareOperations(
+ MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) {
+ // See whether we have a single branch with which to fuse.
+ if (CCUsers.size() != 1)
+ return false;
+ MachineInstr *Branch = CCUsers[0];
+ SystemZII::FusedCompareType Type; // Kind of fused instruction, derived from the CC user's opcode.
+ switch (Branch->getOpcode()) {
+ case SystemZ::BRC:
+ Type = SystemZII::CompareAndBranch;
+ break;
+ case SystemZ::CondReturn:
+ Type = SystemZII::CompareAndReturn;
+ break;
+ case SystemZ::CallBCR:
+ Type = SystemZII::CompareAndSibcall;
+ break;
+ case SystemZ::CondTrap:
+ Type = SystemZII::CompareAndTrap;
+ break;
+ default:
+ return false;
+ }
+
+ // See whether we have a comparison that can be fused.
+ unsigned FusedOpcode =
+ TII->getFusedCompare(Compare.getOpcode(), Type, &Compare);
+ if (!FusedOpcode)
+ return false;
+
+ // Make sure that the operands are available at the branch.
+ // SrcReg2 is the register if the source operand is a register,
+ // 0 if the source operand is immediate, and the base register
+ // if the source operand is memory (index is not supported).
+ unsigned SrcReg = Compare.getOperand(0).getReg();
+ unsigned SrcReg2 =
+ Compare.getOperand(1).isReg() ? Compare.getOperand(1).getReg() : 0;
+ MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+ for (++MBBI; MBBI != MBBE; ++MBBI)
+ if (MBBI->modifiesRegister(SrcReg, TRI) ||
+ (SrcReg2 && MBBI->modifiesRegister(SrcReg2, TRI)))
+ return false;
+
+ // Read the branch mask, target (if applicable), regmask (if applicable).
+ MachineOperand CCMask(MBBI->getOperand(1)); // The loop above left MBBI pointing at Branch.
+ assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 &&
+ "Invalid condition-code mask for integer comparison");
+ // This is only valid for CompareAndBranch.
+ MachineOperand Target(MBBI->getOperand(
+ Type == SystemZII::CompareAndBranch ? 2 : 0));
+ const uint32_t *RegMask = nullptr; // Only set and read for sibcalls; never left indeterminate.
+ if (Type == SystemZII::CompareAndSibcall)
+ RegMask = MBBI->getOperand(2).getRegMask();
+
+ // Clear out all current operands.
+ int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI);
+ assert(CCUse >= 0 && "BRC/BCR must use CC");
+ Branch->RemoveOperand(CCUse);
+ // Remove target (branch) or regmask (sibcall).
+ if (Type == SystemZII::CompareAndBranch ||
+ Type == SystemZII::CompareAndSibcall)
+ Branch->RemoveOperand(2);
+ Branch->RemoveOperand(1);
+ Branch->RemoveOperand(0);
+
+ // Rebuild Branch as a fused compare and branch.
+ // SrcNOps is the number of MI operands of the compare instruction
+ // that we need to copy over.
+ unsigned SrcNOps = 2;
+ if (FusedOpcode == SystemZ::CLT || FusedOpcode == SystemZ::CLGT)
+ SrcNOps = 3;
+ Branch->setDesc(TII->get(FusedOpcode));
+ MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
+ for (unsigned I = 0; I < SrcNOps; I++)
+ MIB.addOperand(Compare.getOperand(I));
+ MIB.addOperand(CCMask);
+
+ if (Type == SystemZII::CompareAndBranch) {
+ // Only conditional branches define CC, as they may be converted back
+ // to a non-fused branch because of a long displacement. Conditional
+ // returns don't have that problem.
+ MIB.addOperand(Target)
+ .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+ }
+
+ if (Type == SystemZII::CompareAndSibcall)
+ MIB.addRegMask(RegMask);
+
+ // Clear any intervening kills of SrcReg and SrcReg2.
+ MBBI = Compare;
+ for (++MBBI; MBBI != MBBE; ++MBBI) {
+ MBBI->clearRegisterKills(SrcReg, TRI);
+ if (SrcReg2)
+ MBBI->clearRegisterKills(SrcReg2, TRI);
+ }
+ FusedComparisons += 1;
+ return true;
+}
+
+// Process all comparison instructions in MBB. Return true if something
+// changed.
+bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+
+ // Walk backwards through the block looking for comparisons, recording
+ // all CC users as we go. The subroutines can delete Compare and
+ // instructions before it.
+ bool CompleteCCUsers = !isCCLiveOut(MBB); // False while CC may also be read by a successor block.
+ SmallVector<MachineInstr *, 4> CCUsers; // Readers of the CC value currently live at MBBI.
+ MachineBasicBlock::iterator MBBI = MBB.end();
+ while (MBBI != MBB.begin()) {
+ MachineInstr &MI = *--MBBI;
+ if (CompleteCCUsers && (MI.isCompare() || isLoadAndTestAsCmp(MI)) &&
+ (optimizeCompareZero(MI, CCUsers) ||
+ fuseCompareOperations(MI, CCUsers))) {
+ ++MBBI; // Step off MI before erasing it so the iterator stays valid.
+ MI.eraseFromParent();
+ Changed = true;
+ CCUsers.clear();
+ continue;
+ }
+
+ if (MI.definesRegister(SystemZ::CC)) { // Instructions above MI feed this def, so the list restarts.
+ CCUsers.clear();
+ CompleteCCUsers = true;
+ }
+ if (MI.readsRegister(SystemZ::CC) && CompleteCCUsers)
+ CCUsers.push_back(&MI);
+ }
+ return Changed;
+}
+
+// Pass entry point: cache the SystemZ instruction and register info for F,
+// then run processBlock over every block. Returns true if anything changed.
+bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
+  if (skipFunction(*F.getFunction()))
+    return false;
+
+  const auto *InstrInfo =
+      static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+  TII = InstrInfo;
+  TRI = &InstrInfo->getRegisterInfo();
+
+  bool AnyChange = false;
+  for (MachineBasicBlock &Block : F)
+    if (processBlock(Block))
+      AnyChange = true;
+  return AnyChange;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
new file mode 100644
index 000000000000..92ce8089c24f
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
@@ -0,0 +1,153 @@
+//==-- SystemZExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling and other late optimizations. This
+// pass should be run after register allocation but before the post-regalloc
+// scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+using namespace llvm;
+
+#define SYSTEMZ_EXPAND_PSEUDO_NAME "SystemZ pseudo instruction expansion pass"
+
+namespace llvm {
+ void initializeSystemZExpandPseudoPass(PassRegistry&); // Declared here; called from the pass constructor in this file.
+}
+
+namespace {
+class SystemZExpandPseudo : public MachineFunctionPass { // Expands pseudo instructions; see the file header.
+public:
+ static char ID;
+ SystemZExpandPseudo() : MachineFunctionPass(ID) {
+ initializeSystemZExpandPseudoPass(*PassRegistry::getPassRegistry()); // Register with the global pass registry.
+ }
+
+ const SystemZInstrInfo *TII; // Assigned at the start of runOnMachineFunction.
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override { return SYSTEMZ_EXPAND_PSEUDO_NAME; }
+
+private:
+ bool expandMBB(MachineBasicBlock &MBB); // Expands every pseudo in MBB; true if anything changed.
+ bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, // Expands the one instruction at MBBI.
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandLOCRMux(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, // LOCRMux -> branch around a move.
+ MachineBasicBlock::iterator &NextMBBI);
+};
+char SystemZExpandPseudo::ID = 0;
+}
+
+INITIALIZE_PASS(SystemZExpandPseudo, "systemz-expand-pseudo",
+ SYSTEMZ_EXPAND_PSEUDO_NAME, false, false)
+
+/// \brief Returns a newly allocated instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createSystemZExpandPseudoPass(SystemZTargetMachine &TM) { // TM is not used.
+ return new SystemZExpandPseudo();
+}
+
+// MI is a load-register-on-condition pseudo instruction that could not be
+// handled as a single hardware instruction. Replace it by a branch sequence. Always returns true.
+bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineFunction &MF = *MBB.getParent();
+ const BasicBlock *BB = MBB.getBasicBlock();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned DestReg = MI.getOperand(0).getReg(); // Operand layout used here: 0 = dest, 2 = source,
+ unsigned SrcReg = MI.getOperand(2).getReg();
+ unsigned CCValid = MI.getOperand(3).getImm(); // 3 = valid-CC mask, 4 = CC mask.
+ unsigned CCMask = MI.getOperand(4).getImm();
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo()); // Registers live just after MI, found by stepping back from the block end.
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ // Splice MBB at MI, moving the rest of the block into RestMBB.
+ MachineBasicBlock *RestMBB = MF.CreateMachineBasicBlock(BB);
+ MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB);
+ RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end()); // MI itself moves too; it is erased at the end.
+ RestMBB->transferSuccessors(&MBB);
+ for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+ RestMBB->addLiveIn(*I);
+
+ // Create a new block MoveMBB to hold the move instruction.
+ MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB);
+ MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB); // Layout becomes MBB, MoveMBB, RestMBB.
+ MoveMBB->addLiveIn(SrcReg);
+ for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+ MoveMBB->addLiveIn(*I);
+
+ // At the end of MBB, create a conditional branch to RestMBB if the
+ // condition is false, otherwise fall through to MoveMBB.
+ BuildMI(&MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(CCValid).addImm(CCMask ^ CCValid).addMBB(RestMBB); // CCMask ^ CCValid inverts the condition.
+ MBB.addSuccessor(RestMBB);
+ MBB.addSuccessor(MoveMBB);
+
+ // In MoveMBB, emit an instruction to move SrcReg into DestReg,
+ // then fall through to RestMBB.
+ TII->copyPhysReg(*MoveMBB, MoveMBB->end(), DL, DestReg, SrcReg,
+ MI.getOperand(2).isKill());
+ MoveMBB->addSuccessor(RestMBB);
+
+ NextMBBI = MBB.end(); // Nothing is left in MBB after the new branch.
+ MI.eraseFromParent();
+ return true;
+}
+
+/// \brief Expand the instruction at MBBI if it is a pseudo handled by this
+/// pass. Returns true when an expansion was performed, false otherwise.
+bool SystemZExpandPseudo::expandMI(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI) {
+  // LOCRMux is the only opcode dispatched from here.
+  if (MBBI->getOpcode() == SystemZ::LOCRMux)
+    return expandLOCRMux(MBB, MBBI, NextMBBI);
+  return false;
+}
+
+/// \brief Visit each instruction of MBB in order, expanding the pseudo
+/// instructions among them. Returns true if the block was modified.
+bool SystemZExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator Cur = MBB.begin(), End = MBB.end();
+  while (Cur != End) {
+    // Take the successor first: expandMI may erase the current instruction
+    // and may overwrite Next (expandLOCRMux sets it to MBB.end()).
+    MachineBasicBlock::iterator Next = std::next(Cur);
+    if (expandMI(MBB, Cur, Next))
+      Modified = true;
+    Cur = Next;
+  }
+
+  return Modified;
+}
+
+// Pass entry point: fetch the SystemZ instruction info for MF and expand the
+// pseudos in each of its blocks. Returns true if any block was modified.
+bool SystemZExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  bool AnyExpanded = false;
+  for (MachineBasicBlock &Block : MF)
+    if (expandMBB(Block))
+      AnyExpanded = true;
+  return AnyExpanded;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td b/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
new file mode 100644
index 000000000000..716e5add8051
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -0,0 +1,171 @@
+//===-- SystemZFeatures.td - SystemZ processors and features -*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Feature definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class SystemZFeature<string extname, string intname, string desc> // One facility: a predicate, an assembler predicate and a subtarget feature.
+ : Predicate<"Subtarget->has"##intname##"()">,
+ AssemblerPredicate<"Feature"##intname, extname>,
+ SubtargetFeature<extname, "Has"##intname, "true", desc>;
+
+class SystemZMissingFeature<string intname> // Predicate that holds when Subtarget->has<intname>() is false.
+ : Predicate<"!Subtarget->has"##intname##"()">;
+
+class SystemZFeatureList<list<SystemZFeature> x> { // Named wrapper around a list of features.
+ list<SystemZFeature> List = x;
+}
+
+class SystemZFeatureAdd<list<SystemZFeature> x, list<SystemZFeature> y> // Feature list holding x followed by y.
+ : SystemZFeatureList<!listconcat(x, y)>;
+
+//===----------------------------------------------------------------------===//
+//
+// New features added in the Ninth Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureDistinctOps : SystemZFeature<
+ "distinct-ops", "DistinctOps",
+ "Assume that the distinct-operands facility is installed"
+>;
+
+def FeatureFastSerialization : SystemZFeature<
+ "fast-serialization", "FastSerialization",
+ "Assume that the fast-serialization facility is installed"
+>;
+
+def FeatureFPExtension : SystemZFeature<
+ "fp-extension", "FPExtension",
+ "Assume that the floating-point extension facility is installed"
+>;
+
+def FeatureHighWord : SystemZFeature<
+ "high-word", "HighWord",
+ "Assume that the high-word facility is installed"
+>;
+
+def FeatureInterlockedAccess1 : SystemZFeature<
+ "interlocked-access1", "InterlockedAccess1",
+ "Assume that interlocked-access facility 1 is installed"
+>;
+def FeatureNoInterlockedAccess1 : SystemZMissingFeature<"InterlockedAccess1">;
+
+def FeatureLoadStoreOnCond : SystemZFeature<
+ "load-store-on-cond", "LoadStoreOnCond",
+ "Assume that the load/store-on-condition facility is installed"
+>;
+
+def FeaturePopulationCount : SystemZFeature<
+ "population-count", "PopulationCount",
+ "Assume that the population-count facility is installed"
+>;
+
+def Arch9NewFeatures : SystemZFeatureList<[
+ FeatureDistinctOps,
+ FeatureFastSerialization,
+ FeatureFPExtension,
+ FeatureHighWord,
+ FeatureInterlockedAccess1,
+ FeatureLoadStoreOnCond,
+ FeaturePopulationCount
+]>;
+
+//===----------------------------------------------------------------------===//
+//
+// New features added in the Tenth Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureExecutionHint : SystemZFeature<
+ "execution-hint", "ExecutionHint",
+ "Assume that the execution-hint facility is installed"
+>;
+
+def FeatureLoadAndTrap : SystemZFeature<
+ "load-and-trap", "LoadAndTrap",
+ "Assume that the load-and-trap facility is installed"
+>;
+
+def FeatureMiscellaneousExtensions : SystemZFeature<
+ "miscellaneous-extensions", "MiscellaneousExtensions",
+ "Assume that the miscellaneous-extensions facility is installed"
+>;
+
+def FeatureProcessorAssist : SystemZFeature<
+ "processor-assist", "ProcessorAssist",
+ "Assume that the processor-assist facility is installed"
+>;
+
+def FeatureTransactionalExecution : SystemZFeature<
+ "transactional-execution", "TransactionalExecution",
+ "Assume that the transactional-execution facility is installed"
+>;
+
+def Arch10NewFeatures : SystemZFeatureList<[
+ FeatureExecutionHint,
+ FeatureLoadAndTrap,
+ FeatureMiscellaneousExtensions,
+ FeatureProcessorAssist,
+ FeatureTransactionalExecution
+]>;
+
+//===----------------------------------------------------------------------===//
+//
+// New features added in the Eleventh Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureLoadAndZeroRightmostByte : SystemZFeature<
+ "load-and-zero-rightmost-byte", "LoadAndZeroRightmostByte",
+ "Assume that the load-and-zero-rightmost-byte facility is installed"
+>;
+
+def FeatureLoadStoreOnCond2 : SystemZFeature<
+ "load-store-on-cond-2", "LoadStoreOnCond2",
+ "Assume that the load/store-on-condition facility 2 is installed"
+>;
+
+def FeatureVector : SystemZFeature< // Arguments are extname, intname, desc.
+ "vector", "Vector",
+ "Assume that the vector facility is installed"
+>;
+def FeatureNoVector : SystemZMissingFeature<"Vector">;
+
+def Arch11NewFeatures : SystemZFeatureList<[
+ FeatureLoadAndZeroRightmostByte,
+ FeatureLoadStoreOnCond2,
+ FeatureVector
+]>;
+
+//===----------------------------------------------------------------------===//
+//
+// Cumulative supported and unsupported feature sets
+//
+//===----------------------------------------------------------------------===//
+
+def Arch8SupportedFeatures // Starts empty: none of the features in this file belong to Arch8.
+ : SystemZFeatureList<[]>;
+def Arch9SupportedFeatures // Each supported list is the previous one plus that edition's new features.
+ : SystemZFeatureAdd<Arch8SupportedFeatures.List, Arch9NewFeatures.List>;
+def Arch10SupportedFeatures
+ : SystemZFeatureAdd<Arch9SupportedFeatures.List, Arch10NewFeatures.List>;
+def Arch11SupportedFeatures
+ : SystemZFeatureAdd<Arch10SupportedFeatures.List, Arch11NewFeatures.List>;
+
+def Arch11UnsupportedFeatures // Starts empty: Arch11 is the newest level defined here.
+ : SystemZFeatureList<[]>;
+def Arch10UnsupportedFeatures // Built in the opposite direction: each older level also lacks the next edition's features.
+ : SystemZFeatureAdd<Arch11UnsupportedFeatures.List, Arch11NewFeatures.List>;
+def Arch9UnsupportedFeatures
+ : SystemZFeatureAdd<Arch10UnsupportedFeatures.List, Arch10NewFeatures.List>;
+def Arch8UnsupportedFeatures
+ : SystemZFeatureAdd<Arch9UnsupportedFeatures.List, Arch9NewFeatures.List>;
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
new file mode 100644
index 000000000000..a28a91e834f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -0,0 +1,549 @@
+//===-- SystemZFrameLowering.cpp - Frame lowering for SystemZ -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZFrameLowering.h"
+#include "SystemZCallingConv.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+namespace {
+// The ABI-defined register save slots, relative to the incoming stack
+// pointer.
+static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
+ { SystemZ::R2D, 0x10 },
+ { SystemZ::R3D, 0x18 },
+ { SystemZ::R4D, 0x20 },
+ { SystemZ::R5D, 0x28 },
+ { SystemZ::R6D, 0x30 },
+ { SystemZ::R7D, 0x38 },
+ { SystemZ::R8D, 0x40 },
+ { SystemZ::R9D, 0x48 },
+ { SystemZ::R10D, 0x50 },
+ { SystemZ::R11D, 0x58 },
+ { SystemZ::R12D, 0x60 },
+ { SystemZ::R13D, 0x68 },
+ { SystemZ::R14D, 0x70 },
+ { SystemZ::R15D, 0x78 },
+ { SystemZ::F0D, 0x80 },
+ { SystemZ::F2D, 0x88 },
+ { SystemZ::F4D, 0x90 },
+ { SystemZ::F6D, 0x98 }
+};
+} // end anonymous namespace
+
+SystemZFrameLowering::SystemZFrameLowering()
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
+ -SystemZMC::CallFrameSize, 8,
+ false /* StackRealignable */) {
+ // Create a mapping from register number to save slot offset.
+ RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+ for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
+ RegSpillOffsets[SpillOffsetTable[I].Reg] = SpillOffsetTable[I].Offset;
+}
+
+const TargetFrameLowering::SpillSlot *
+SystemZFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+ NumEntries = array_lengthof(SpillOffsetTable);
+ return SpillOffsetTable;
+}
+
+void SystemZFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ bool HasFP = hasFP(MF);
+ SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ bool IsVarArg = MF.getFunction()->isVarArg();
+
+ // va_start stores incoming FPR varargs in the normal way, but delegates
+ // the saving of incoming GPR varargs to spillCalleeSavedRegisters().
+ // Record these pending uses, which typically include the call-saved
+ // argument register R6D.
+ if (IsVarArg)
+ for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
+ SavedRegs.set(SystemZ::ArgGPRs[I]);
+
+ // If there are any landing pads, entering them will modify r6/r7.
+ if (!MF.getLandingPads().empty()) {
+ SavedRegs.set(SystemZ::R6D);
+ SavedRegs.set(SystemZ::R7D);
+ }
+
+ // If the function requires a frame pointer, record that the hard
+ // frame pointer will be clobbered.
+ if (HasFP)
+ SavedRegs.set(SystemZ::R11D);
+
+ // If the function calls other functions, record that the return
+ // address register will be clobbered.
+ if (MFFrame.hasCalls())
+ SavedRegs.set(SystemZ::R14D);
+
+ // If we are saving GPRs other than the stack pointer, we might as well
+ // save and restore the stack pointer at the same time, via STMG and LMG.
+ // This allows the deallocation to be done by the LMG, rather than needing
+ // a separate %r15 addition.
+ const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+ for (unsigned I = 0; CSRegs[I]; ++I) {
+ unsigned Reg = CSRegs[I];
+ if (SystemZ::GR64BitRegClass.contains(Reg) && SavedRegs.test(Reg)) {
+ SavedRegs.set(SystemZ::R15D);
+ break;
+ }
+ }
+}
+
+// Add GPR64 to the save instruction being built by MIB, which is in basic
+// block MBB. IsImplicit says whether this is an explicit operand to the
+// instruction, or an implicit one that comes between the explicit start
+// and end registers.
+static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
+ unsigned GPR64, bool IsImplicit) {
+ const TargetRegisterInfo *RI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
+ bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
+ if (!IsLive || !IsImplicit) {
+ MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive));
+ if (!IsLive)
+ MBB.addLiveIn(GPR64);
+ }
+}
+
+bool SystemZFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ bool IsVarArg = MF.getFunction()->isVarArg();
+ DebugLoc DL;
+
+ // Scan the call-saved GPRs and find the bounds of the register spill area.
+ unsigned LowGPR = 0;
+ unsigned HighGPR = SystemZ::R15D;
+ unsigned StartOffset = -1U;
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (SystemZ::GR64BitRegClass.contains(Reg)) {
+ unsigned Offset = RegSpillOffsets[Reg];
+ assert(Offset && "Unexpected GPR save");
+ if (StartOffset > Offset) {
+ LowGPR = Reg;
+ StartOffset = Offset;
+ }
+ }
+ }
+
+ // Save the range of call-saved registers, for use by the epilogue inserter.
+ ZFI->setLowSavedGPR(LowGPR);
+ ZFI->setHighSavedGPR(HighGPR);
+
+ // Include the GPR varargs, if any. R6D is call-saved, so would
+ // be included by the loop above, but we also need to handle the
+ // call-clobbered argument registers.
+ if (IsVarArg) {
+ unsigned FirstGPR = ZFI->getVarArgsFirstGPR();
+ if (FirstGPR < SystemZ::NumArgGPRs) {
+ unsigned Reg = SystemZ::ArgGPRs[FirstGPR];
+ unsigned Offset = RegSpillOffsets[Reg];
+ if (StartOffset > Offset) {
+ LowGPR = Reg; StartOffset = Offset;
+ }
+ }
+ }
+
+ // Save GPRs
+ if (LowGPR) {
+ assert(LowGPR != HighGPR && "Should be saving %r15 and something else");
+
+ // Build an STMG instruction.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG));
+
+ // Add the explicit register operands.
+ addSavedGPR(MBB, MIB, LowGPR, false);
+ addSavedGPR(MBB, MIB, HighGPR, false);
+
+ // Add the address.
+ MIB.addReg(SystemZ::R15D).addImm(StartOffset);
+
+ // Make sure all call-saved GPRs are included as operands and are
+ // marked as live on entry.
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (SystemZ::GR64BitRegClass.contains(Reg))
+ addSavedGPR(MBB, MIB, Reg, true);
+ }
+
+ // ...likewise GPR varargs.
+ if (IsVarArg)
+ for (unsigned I = ZFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
+ addSavedGPR(MBB, MIB, SystemZ::ArgGPRs[I], true);
+ }
+
+ // Save FPRs in the normal TargetInstrInfo way.
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (SystemZ::FP64BitRegClass.contains(Reg)) {
+ MBB.addLiveIn(Reg);
+ TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+ &SystemZ::FP64BitRegClass, TRI);
+ }
+ }
+
+ return true;
+}
+
+bool SystemZFrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ bool HasFP = hasFP(MF);
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Restore FPRs in the normal TargetInstrInfo way.
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (SystemZ::FP64BitRegClass.contains(Reg))
+ TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
+ &SystemZ::FP64BitRegClass, TRI);
+ }
+
+ // Restore call-saved GPRs (but not call-clobbered varargs, which at
+ // this point might hold return values).
+ unsigned LowGPR = ZFI->getLowSavedGPR();
+ unsigned HighGPR = ZFI->getHighSavedGPR();
+ unsigned StartOffset = RegSpillOffsets[LowGPR];
+ if (LowGPR) {
+ // If we saved any of %r2-%r5 as varargs, we should also be saving
+ // and restoring %r6. If we're saving %r6 or above, we should be
+ // restoring it too.
+ assert(LowGPR != HighGPR && "Should be loading %r15 and something else");
+
+ // Build an LMG instruction.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LMG));
+
+ // Add the explicit register operands.
+ MIB.addReg(LowGPR, RegState::Define);
+ MIB.addReg(HighGPR, RegState::Define);
+
+ // Add the address.
+ MIB.addReg(HasFP ? SystemZ::R11D : SystemZ::R15D);
+ MIB.addImm(StartOffset);
+
+ // Do a second scan adding regs as being defined by instruction
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (Reg != LowGPR && Reg != HighGPR &&
+ SystemZ::GR64BitRegClass.contains(Reg))
+ MIB.addReg(Reg, RegState::ImplicitDefine);
+ }
+ }
+
+ return true;
+}
+
+void SystemZFrameLowering::
+processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ uint64_t MaxReach = (MFFrame.estimateStackSize(MF) +
+ SystemZMC::CallFrameSize * 2);
+ if (!isUInt<12>(MaxReach)) {
+ // We may need register scavenging slots if some parts of the frame
+ // are outside the reach of an unsigned 12-bit displacement.
+ // Create 2 for the case where both addresses in an MVC are
+ // out of range.
+ RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, 8, false));
+ RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, 8, false));
+ }
+}
+
+// Emit instructions before MBBI (in MBB) to add NumBytes to Reg.
+static void emitIncrement(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &DL,
+ unsigned Reg, int64_t NumBytes,
+ const TargetInstrInfo *TII) {
+ while (NumBytes) {
+ unsigned Opcode;
+ int64_t ThisVal = NumBytes;
+ if (isInt<16>(NumBytes))
+ Opcode = SystemZ::AGHI;
+ else {
+ Opcode = SystemZ::AGFI;
+ // Make sure we maintain 8-byte stack alignment.
+ int64_t MinVal = -uint64_t(1) << 31;
+ int64_t MaxVal = (int64_t(1) << 31) - 8;
+ if (ThisVal < MinVal)
+ ThisVal = MinVal;
+ else if (ThisVal > MaxVal)
+ ThisVal = MaxVal;
+ }
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(Opcode), Reg)
+ .addReg(Reg).addImm(ThisVal);
+ // The CC implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ NumBytes -= ThisVal;
+ }
+}
+
+void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ auto *ZII =
+ static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFFrame.getCalleeSavedInfo();
+ bool HasFP = hasFP(MF);
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // The current offset of the stack pointer from the CFA.
+ int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
+
+ if (ZFI->getLowSavedGPR()) {
+ // Skip over the GPR saves.
+ if (MBBI != MBB.end() && MBBI->getOpcode() == SystemZ::STMG)
+ ++MBBI;
+ else
+ llvm_unreachable("Couldn't skip over GPR saves");
+
+ // Add CFI for the GPR saves.
+ for (auto &Save : CSI) {
+ unsigned Reg = Save.getReg();
+ if (SystemZ::GR64BitRegClass.contains(Reg)) {
+ int64_t Offset = SPOffsetFromCFA + RegSpillOffsets[Reg];
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+ }
+
+ uint64_t StackSize = getAllocatedStackSize(MF);
+ if (StackSize) {
+ // Determine if we want to store a backchain.
+ bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+
+ // If we need backchain, save current stack pointer. R1 is free at this
+ // point.
+ if (StoreBackchain)
+ BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR))
+ .addReg(SystemZ::R1D, RegState::Define).addReg(SystemZ::R15D);
+
+ // Allocate StackSize bytes.
+ int64_t Delta = -int64_t(StackSize);
+ emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
+
+ // Add CFI for the allocation.
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, SPOffsetFromCFA + Delta));
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ SPOffsetFromCFA += Delta;
+
+ if (StoreBackchain)
+ BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG))
+ .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D).addImm(0).addReg(0);
+ }
+
+ if (HasFP) {
+ // Copy the base of the frame to R11.
+ BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R11D)
+ .addReg(SystemZ::R15D);
+
+ // Add CFI for the new frame location.
+ unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(nullptr, HardFP));
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Mark the FramePtr as live at the beginning of every block except
+ // the entry block. (We'll have marked R11 as live on entry when
+ // saving the GPRs.)
+ for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
+ I->addLiveIn(SystemZ::R11D);
+ }
+
+ // Skip over the FPR saves.
+ SmallVector<unsigned, 8> CFIIndexes;
+ for (auto &Save : CSI) {
+ unsigned Reg = Save.getReg();
+ if (SystemZ::FP64BitRegClass.contains(Reg)) {
+ if (MBBI != MBB.end() &&
+ (MBBI->getOpcode() == SystemZ::STD ||
+ MBBI->getOpcode() == SystemZ::STDY))
+ ++MBBI;
+ else
+ llvm_unreachable("Couldn't skip over FPR save");
+
+ // Add CFI for this save.
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned IgnoredFrameReg;
+ int64_t Offset =
+ getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg);
+
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, DwarfReg, SPOffsetFromCFA + Offset));
+ CFIIndexes.push_back(CFIIndex);
+ }
+ }
+ // Complete the CFI for the FPR saves, modelling them as taking effect
+ // after the last save.
+ for (auto CFIIndex : CFIIndexes) {
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+}
+
+void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ auto *ZII =
+ static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+
+ // Skip the return instruction.
+ assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");
+
+ uint64_t StackSize = getAllocatedStackSize(MF);
+ if (ZFI->getLowSavedGPR()) {
+ --MBBI;
+ unsigned Opcode = MBBI->getOpcode();
+ if (Opcode != SystemZ::LMG)
+ llvm_unreachable("Expected to see callee-save register restore code");
+
+ unsigned AddrOpNo = 2;
+ DebugLoc DL = MBBI->getDebugLoc();
+ uint64_t Offset = StackSize + MBBI->getOperand(AddrOpNo + 1).getImm();
+ unsigned NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);
+
+ // If the offset is too large, use the largest stack-aligned offset
+ // and add the rest to the base register (the stack or frame pointer).
+ if (!NewOpcode) {
+ uint64_t NumBytes = Offset - 0x7fff8;
+ emitIncrement(MBB, MBBI, DL, MBBI->getOperand(AddrOpNo).getReg(),
+ NumBytes, ZII);
+ Offset -= NumBytes;
+ NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);
+ assert(NewOpcode && "No restore instruction available");
+ }
+
+ MBBI->setDesc(ZII->get(NewOpcode));
+ MBBI->getOperand(AddrOpNo + 1).ChangeToImmediate(Offset);
+ } else if (StackSize) {
+ DebugLoc DL = MBBI->getDebugLoc();
+ emitIncrement(MBB, MBBI, DL, SystemZ::R15D, StackSize, ZII);
+ }
+}
+
+bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
+ return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ MF.getFrameInfo().hasVarSizedObjects() ||
+ MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP());
+}
+
+int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
+ const MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+ // Fill in FrameReg output argument.
+ FrameReg = RI->getFrameRegister(MF);
+
+ // Start with the offset of FI from the top of the caller-allocated frame
+ // (i.e. the top of the 160 bytes allocated by the caller). This initial
+ // offset is therefore negative.
+ int64_t Offset = (MFFrame.getObjectOffset(FI) +
+ MFFrame.getOffsetAdjustment());
+
+ // Make the offset relative to the incoming stack pointer.
+ Offset -= getOffsetOfLocalArea();
+
+ // Make the offset relative to the bottom of the frame.
+ Offset += getAllocatedStackSize(MF);
+
+ return Offset;
+}
+
+uint64_t SystemZFrameLowering::
+getAllocatedStackSize(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFFrame = MF.getFrameInfo();
+
+ // Start with the size of the local variables and spill slots.
+ uint64_t StackSize = MFFrame.getStackSize();
+
+ // We need to allocate the ABI-defined 160-byte base area whenever
+ // we allocate stack space for our own use and whenever we call another
+ // function.
+ if (StackSize || MFFrame.hasVarSizedObjects() || MFFrame.hasCalls())
+ StackSize += SystemZMC::CallFrameSize;
+
+ return StackSize;
+}
+
+bool
+SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ // The ABI requires us to allocate 160 bytes of stack space for the callee,
+ // with any outgoing stack arguments being placed above that. It seems
+ // better to make that area a permanent feature of the frame even if
+ // we're using a frame pointer.
+ return true;
+}
+
+// Erase the call frame pseudo instruction MI from MBB and return the
+// iterator produced by MBB.erase().  hasReservedCallFrame() always returns
+// true for this target, so ADJCALLSTACKDOWN/ADJCALLSTACKUP need no
+// replacement code.  Any other opcode is a caller error.
+MachineBasicBlock::iterator SystemZFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI) const {
+  switch (MI->getOpcode()) {
+  case SystemZ::ADJCALLSTACKDOWN:
+  case SystemZ::ADJCALLSTACKUP:
+    assert(hasReservedCallFrame(MF) &&
+           "ADJSTACKDOWN and ADJSTACKUP should be no-ops");
+    return MBB.erase(MI);
+
+  default:
+    llvm_unreachable("Unexpected call frame instruction");
+  }
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
new file mode 100644
index 000000000000..d43a176ad874
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -0,0 +1,64 @@
+//===-- SystemZFrameLowering.h - Frame lowering for SystemZ -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
+
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class SystemZTargetMachine;
+class SystemZSubtarget;
+
+class SystemZFrameLowering : public TargetFrameLowering {
+ IndexedMap<unsigned> RegSpillOffsets;
+
+public:
+ SystemZFrameLowering();
+
+ // Override TargetFrameLowering.
+ bool isFPCloseToIncomingSP() const override { return false; }
+ const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const
+ override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBII,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const
+ override;
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const override;
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ bool hasFP(const MachineFunction &MF) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+ // Return the number of bytes in the callee-allocated part of the frame.
+ uint64_t getAllocatedStackSize(const MachineFunction &MF) const;
+
+ // Return the byte offset from the incoming stack pointer of Reg's
+ // ABI-defined save slot. Return 0 if no slot is defined for Reg.
+ unsigned getRegSpillOffset(unsigned Reg) const {
+ return RegSpillOffsets[Reg];
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
new file mode 100644
index 000000000000..fe4b52b515e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -0,0 +1,337 @@
+//=-- SystemZHazardRecognizer.cpp - SystemZ Hazard Recognizer -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a hazard recognizer for the SystemZ scheduler.
+//
+// This class is used by the SystemZ scheduling strategy to maintain
+// the state during scheduling, and provide cost functions for
+// scheduling candidates. This includes:
+//
+// * Decoder grouping. A decoder group can maximally hold 3 uops, and
+// instructions that always begin a new group should be scheduled when
+// the current decoder group is empty.
+// * Processor resources usage. It is beneficial to balance the use of
+// resources.
+//
+// ===---------------------------------------------------------------------===//
+
+#include "SystemZHazardRecognizer.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// This is the limit of processor resource usage at which the
+// scheduler should try to look for other instructions (not using the
+// critical resource).
+static cl::opt<int> ProcResCostLim("procres-cost-lim", cl::Hidden,
+ cl::desc("The OOO window for processor "
+ "resources during scheduling."),
+ cl::init(8));
+
+SystemZHazardRecognizer::
+SystemZHazardRecognizer(const MachineSchedContext *C) : DAG(nullptr),
+ SchedModel(nullptr) {}
+
+unsigned SystemZHazardRecognizer::
+getNumDecoderSlots(SUnit *SU) const {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ if (!SC->isValid())
+ return 0; // IMPLICIT_DEF / KILL -- will not make impact in output.
+
+ if (SC->BeginGroup) {
+ if (!SC->EndGroup)
+ return 2; // Cracked instruction
+ else
+ return 3; // Expanded/group-alone instruction
+ }
+
+ return 1; // Normal instruction
+}
+
+unsigned SystemZHazardRecognizer::getCurrCycleIdx() {
+ unsigned Idx = CurrGroupSize;
+ if (GrpCount % 2)
+ Idx += 3;
+ return Idx;
+}
+
+ScheduleHazardRecognizer::HazardType SystemZHazardRecognizer::
+getHazardType(SUnit *m, int Stalls) {
+ return (fitsIntoCurrentGroup(m) ? NoHazard : Hazard);
+}
+
+void SystemZHazardRecognizer::Reset() {
+ CurrGroupSize = 0;
+ clearProcResCounters();
+ GrpCount = 0;
+ LastFPdOpCycleIdx = UINT_MAX;
+ DEBUG(CurGroupDbg = "";);
+}
+
+bool
+SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ if (!SC->isValid())
+ return true;
+
+ // A cracked instruction only fits into schedule if the current
+ // group is empty.
+ if (SC->BeginGroup)
+ return (CurrGroupSize == 0);
+
+ // Since a full group is handled immediately in EmitInstruction(),
+ // SU should fit into current group. NumSlots should be 1 or 0,
+ // since it is not a cracked or expanded instruction.
+ assert ((getNumDecoderSlots(SU) <= 1) && (CurrGroupSize < 3) &&
+ "Expected normal instruction to fit in non-full group!");
+
+ return true;
+}
+
+void SystemZHazardRecognizer::nextGroup(bool DbgOutput) {
+ if (CurrGroupSize > 0) {
+ DEBUG(dumpCurrGroup("Completed decode group"));
+ DEBUG(CurGroupDbg = "";);
+
+ GrpCount++;
+
+ // Reset counter for next group.
+ CurrGroupSize = 0;
+
+ // Decrease counters for execution units by one.
+ for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
+ if (ProcResourceCounters[i] > 0)
+ ProcResourceCounters[i]--;
+
+ // Clear CriticalResourceIdx if it is now below the threshold.
+ if (CriticalResourceIdx != UINT_MAX &&
+ (ProcResourceCounters[CriticalResourceIdx] <=
+ ProcResCostLim))
+ CriticalResourceIdx = UINT_MAX;
+ }
+
+ DEBUG(if (DbgOutput)
+ dumpProcResourceCounters(););
+}
+
+#ifndef NDEBUG // Debug output
+// Debug helper: write a one-line description of SU to OS.  The output is
+// the node number and opcode name, followed (only when the scheduling
+// class is valid) by the processor resources the instruction writes, its
+// micro-op count if above one, its decoder-grouping property, and whether
+// it is unbuffered.
+void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
+  OS << "SU(" << SU->NodeNum << "):";
+  OS << SchedModel->getInstrInfo()->getName(SU->getInstr()->getOpcode());
+
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  if (!SC->isValid())
+    return;
+
+  for (TargetSchedModel::ProcResIter
+         PI = SchedModel->getWriteProcResBegin(SC),
+         PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+    const MCProcResourceDesc &PRD =
+      *SchedModel->getProcResource(PI->ProcResourceIdx);
+    std::string FU(PRD.Name);
+    // Trim e.g. Z13_FXaUnit -> FXa.  A missing "_" is harmless (npos + 1
+    // wraps to 0, keeping the whole name), but the "Unit" suffix must only
+    // be cut when it is present: resize(npos) would throw std::length_error.
+    FU = FU.substr(FU.find("_") + 1);
+    size_t UnitPos = FU.find("Unit");
+    if (UnitPos != std::string::npos)
+      FU.resize(UnitPos);
+    OS << "/" << FU;
+
+    if (PI->Cycles > 1)
+      OS << "(" << PI->Cycles << "cyc)";
+  }
+
+  if (SC->NumMicroOps > 1)
+    OS << "/" << SC->NumMicroOps << "uops";
+  if (SC->BeginGroup && SC->EndGroup)
+    OS << "/GroupsAlone";
+  else if (SC->BeginGroup)
+    OS << "/BeginsGroup";
+  else if (SC->EndGroup)
+    OS << "/EndsGroup";
+  if (SU->isUnbuffered)
+    OS << "/Unbuffered";
+}
+
+void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const {
+ dbgs() << "+++ " << Msg;
+ dbgs() << ": ";
+
+ if (CurGroupDbg.empty())
+ dbgs() << " <empty>\n";
+ else {
+ dbgs() << "{ " << CurGroupDbg << " }";
+ dbgs() << " (" << CurrGroupSize << " decoder slot"
+ << (CurrGroupSize > 1 ? "s":"")
+ << ")\n";
+ }
+}
+
+// Debug helper: print every processor resource counter that is above zero
+// to dbgs().  Prints nothing, not even the heading, when all counters are
+// zero.
+void SystemZHazardRecognizer::dumpProcResourceCounters() const {
+  bool any = false;
+
+  // First pass: is there anything to report at all?
+  for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
+    if (ProcResourceCounters[i] > 0) {
+      any = true;
+      break;
+    }
+
+  if (!any)
+    return;
+
+  // Second pass: emit the heading followed by one line per busy resource.
+  dbgs() << "+++ Resource counters:\n";
+  for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
+    if (ProcResourceCounters[i] > 0)
+      dbgs() << "+++ Extra schedule for execution unit "
+             << SchedModel->getProcResource(i)->Name
+             << ": " << ProcResourceCounters[i] << "\n";
+}
+#endif //NDEBUG
+
+void SystemZHazardRecognizer::clearProcResCounters() {
+ ProcResourceCounters.assign(SchedModel->getNumProcResourceKinds(), 0);
+ CriticalResourceIdx = UINT_MAX;
+}
+
+// Update state with SU as the next scheduled unit.
+void SystemZHazardRecognizer::
+EmitInstruction(SUnit *SU) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ DEBUG( dumpCurrGroup("Decode group before emission"););
+
+ // If scheduling an SU that must begin a new decoder group, move on
+ // to next group.
+ if (!fitsIntoCurrentGroup(SU))
+ nextGroup();
+
+ DEBUG( dbgs() << "+++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
+ dbgs() << "\n";
+ raw_string_ostream cgd(CurGroupDbg);
+ if (CurGroupDbg.length())
+ cgd << ", ";
+ dumpSU(SU, cgd););
+
+ // After returning from a call, we don't know much about the state.
+ if (SU->getInstr()->isCall()) {
+ DEBUG (dbgs() << "+++ Clearing state after call.\n";);
+ clearProcResCounters();
+ LastFPdOpCycleIdx = UINT_MAX;
+ CurrGroupSize += getNumDecoderSlots(SU);
+ assert (CurrGroupSize <= 3);
+ nextGroup();
+ return;
+ }
+
+ // Increase counter for execution unit(s).
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ // Don't handle FPd together with the other resources.
+ if (SchedModel->getProcResource(PI->ProcResourceIdx)->BufferSize == 1)
+ continue;
+ int &CurrCounter =
+ ProcResourceCounters[PI->ProcResourceIdx];
+ CurrCounter += PI->Cycles;
+ // Check if this is now the new critical resource.
+ if ((CurrCounter > ProcResCostLim) &&
+ (CriticalResourceIdx == UINT_MAX ||
+ (PI->ProcResourceIdx != CriticalResourceIdx &&
+ CurrCounter >
+ ProcResourceCounters[CriticalResourceIdx]))) {
+ DEBUG( dbgs() << "+++ New critical resource: "
+ << SchedModel->getProcResource(PI->ProcResourceIdx)->Name
+ << "\n";);
+ CriticalResourceIdx = PI->ProcResourceIdx;
+ }
+ }
+
+ // Make note of an instruction that uses a blocking resource (FPd).
+ if (SU->isUnbuffered) {
+ LastFPdOpCycleIdx = getCurrCycleIdx();
+ DEBUG (dbgs() << "+++ Last FPd cycle index: "
+ << LastFPdOpCycleIdx << "\n";);
+ }
+
+ // Insert SU into current group by increasing number of slots used
+ // in current group.
+ CurrGroupSize += getNumDecoderSlots(SU);
+ assert (CurrGroupSize <= 3);
+
+ // Check if current group is now full/ended. If so, move on to next
+ // group to be ready to evaluate more candidates.
+ if (CurrGroupSize == 3 || SC->EndGroup)
+ nextGroup();
+}
+
+int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ if (!SC->isValid())
+ return 0;
+
+ // If SU begins new group, it can either break a current group early
+ // or fit naturally if current group is empty (negative cost).
+ if (SC->BeginGroup) {
+ if (CurrGroupSize)
+ return 3 - CurrGroupSize;
+ return -1;
+ }
+
+ // Similarly, a group-ending SU may either fit well (last in group), or
+ // end the group prematurely.
+ if (SC->EndGroup) {
+ unsigned resultingGroupSize =
+ (CurrGroupSize + getNumDecoderSlots(SU));
+ if (resultingGroupSize < 3)
+ return (3 - resultingGroupSize);
+ return -1;
+ }
+
+ // Most instructions can be placed in any decoder slot.
+ return 0;
+}
+
+bool SystemZHazardRecognizer::isFPdOpPreferred_distance(const SUnit *SU) {
+ assert (SU->isUnbuffered);
+ // If this is the first FPd op, it should be scheduled high.
+ if (LastFPdOpCycleIdx == UINT_MAX)
+ return true;
+ // If this is not the first FPd op, it should go into the other side
+ // of the processor to use the other FPd unit there. This should
+ // generally happen if two FPd ops are placed with 2 other
+ // instructions between them (modulo 6).
+ if (LastFPdOpCycleIdx > getCurrCycleIdx())
+ return ((LastFPdOpCycleIdx - getCurrCycleIdx()) == 3);
+ return ((getCurrCycleIdx() - LastFPdOpCycleIdx) == 3);
+}
+
+int SystemZHazardRecognizer::
+resourcesCost(SUnit *SU) {
+ int Cost = 0;
+
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ if (!SC->isValid())
+ return 0;
+
+ // For a FPd op, either return min or max value as indicated by the
+ // distance to any prior FPd op.
+ if (SU->isUnbuffered)
+ Cost = (isFPdOpPreferred_distance(SU) ? INT_MIN : INT_MAX);
+ // For other instructions, give a cost to the use of the critical resource.
+ else if (CriticalResourceIdx != UINT_MAX) {
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI)
+ if (PI->ProcResourceIdx == CriticalResourceIdx)
+ Cost = PI->Cycles;
+ }
+
+ return Cost;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
new file mode 100644
index 000000000000..8fa54ee434cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
@@ -0,0 +1,128 @@
+//=-- SystemZHazardRecognizer.h - SystemZ Hazard Recognizer -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a hazard recognizer for the SystemZ scheduler.
+//
+// This class is used by the SystemZ scheduling strategy to maintain
+// the state during scheduling, and provide cost functions for
+// scheduling candidates. This includes:
+//
+// * Decoder grouping. A decoder group can maximally hold 3 uops, and
+// instructions that always begin a new group should be scheduled when
+// the current decoder group is empty.
+// * Processor resources usage. It is beneficial to balance the use of
+// resources.
+//
+// ===---------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H
+
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+namespace llvm {
+
+/// SystemZHazardRecognizer maintains the state during scheduling.
+class SystemZHazardRecognizer : public ScheduleHazardRecognizer {
+
+ ScheduleDAGMI *DAG;
+ const TargetSchedModel *SchedModel;
+
+ /// Keep track of the number of decoder slots used in the current
+ /// decoder group.
+ unsigned CurrGroupSize;
+
+ /// The tracking of resources here is quite similar to the common
+ /// code use of a critical resource. However, z13 differs in the way
+ /// that it has two processor sides which may be interesting to
+ /// model in the future (a work in progress).
+
+ /// Counters for the number of uops scheduled per processor
+ /// resource.
+ SmallVector<int, 0> ProcResourceCounters;
+
+ /// This is the resource with the greatest queue, which the
+ /// scheduler tries to avoid.
+ unsigned CriticalResourceIdx;
+
+ /// Return the number of decoder slots SU requires.
+ inline unsigned getNumDecoderSlots(SUnit *SU) const;
+
+ /// Return true if SU fits into the current decoder group.
+ bool fitsIntoCurrentGroup(SUnit *SU) const;
+
+ /// Two decoder groups per cycle are formed (for z13), meaning 2x3
+ /// instructions. This function returns a number between 0 and 5,
+ /// representing the current decoder slot of the current cycle.
+ unsigned getCurrCycleIdx();
+
+ /// LastFPdOpCycleIdx stores the number returned by getCurrCycleIdx()
+ /// when a stalling operation is scheduled (which uses the FPd resource).
+ unsigned LastFPdOpCycleIdx;
+
+ /// A counter of decoder groups scheduled.
+ unsigned GrpCount;
+
+ unsigned getCurrGroupSize() {return CurrGroupSize;};
+
+ /// Start next decoder group.
+ void nextGroup(bool DbgOutput = true);
+
+ /// Clear all counters for processor resources.
+ void clearProcResCounters();
+
+ /// With the goal of alternating processor sides for stalling (FPd)
+ /// ops, return true if it seems good to schedule an FPd op next.
+ bool isFPdOpPreferred_distance(const SUnit *SU);
+
+public:
+ SystemZHazardRecognizer(const MachineSchedContext *C);
+
+ void setDAG(ScheduleDAGMI *dag) {
+ DAG = dag;
+ SchedModel = dag->getSchedModel();
+ }
+
+ HazardType getHazardType(SUnit *m, int Stalls = 0) override;
+ void Reset() override;
+ void EmitInstruction(SUnit *SU) override;
+
+ // Cost functions used by SystemZPostRASchedStrategy while
+ // evaluating candidates.
+
+ /// Return the cost of decoder grouping for SU. If SU must start or
+ /// end a decoder group, this is negative if it fits the schedule or
+ /// positive if it would mean ending a group prematurely. For normal
+ /// instructions this returns 0.
+ int groupingCost(SUnit *SU) const;
+
+ /// Return the cost of SU with regard to processor resources usage.
+ /// A positive value means it would be better to wait with SU, while
+ /// a negative value means it would be good to schedule SU next.
+ int resourcesCost(SUnit *SU);
+
+#ifndef NDEBUG
+ // Debug dumping.
+ std::string CurGroupDbg; // current group as text
+ void dumpSU(SUnit *SU, raw_ostream &OS) const;
+ void dumpCurrGroup(std::string Msg = "") const;
+ void dumpProcResourceCounters() const;
+#endif
+};
+
+} // namespace llvm
+
+#endif /* LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H */
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
new file mode 100644
index 000000000000..920b6e430e8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -0,0 +1,1419 @@
+//===-- SystemZISelDAGToDAG.cpp - A dag to dag inst selector for SystemZ --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SystemZ target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-isel"
+
+namespace {
+// Used to build addressing modes.
+struct SystemZAddressingMode {
+ // The shape of the address.
+ enum AddrForm {
+ // base+displacement
+ FormBD,
+
+ // base+displacement+index for load and store operands
+ FormBDXNormal,
+
+ // base+displacement+index for load address operands
+ FormBDXLA,
+
+ // base+displacement+index+ADJDYNALLOC
+ FormBDXDynAlloc
+ };
+ AddrForm Form;
+
+ // The type of displacement. The enum names here correspond directly
+ // to the definitions in SystemZOperand.td. We could split them into
+ // flags -- single/pair, 128-bit, etc. -- but it hardly seems worth it.
+ enum DispRange {
+ Disp12Only,
+ Disp12Pair,
+ Disp20Only,
+ Disp20Only128,
+ Disp20Pair
+ };
+ DispRange DR;
+
+ // The parts of the address. The address is equivalent to:
+ //
+ // Base + Disp + Index + (IncludesDynAlloc ? ADJDYNALLOC : 0)
+ SDValue Base;
+ int64_t Disp;
+ SDValue Index;
+ bool IncludesDynAlloc;
+
+ SystemZAddressingMode(AddrForm form, DispRange dr)
+ : Form(form), DR(dr), Base(), Disp(0), Index(),
+ IncludesDynAlloc(false) {}
+
+ // True if the address can have an index register.
+ bool hasIndexField() { return Form != FormBD; }
+
+ // True if the address can (and must) include ADJDYNALLOC.
+ bool isDynAlloc() { return Form == FormBDXDynAlloc; }
+
+ void dump() {
+ errs() << "SystemZAddressingMode " << this << '\n';
+
+ errs() << " Base ";
+ if (Base.getNode())
+ Base.getNode()->dump();
+ else
+ errs() << "null\n";
+
+ if (hasIndexField()) {
+ errs() << " Index ";
+ if (Index.getNode())
+ Index.getNode()->dump();
+ else
+ errs() << "null\n";
+ }
+
+ errs() << " Disp " << Disp;
+ if (IncludesDynAlloc)
+ errs() << " + ADJDYNALLOC";
+ errs() << '\n';
+ }
+};
+
+// Return a mask with Count low bits set.
+static uint64_t allOnes(unsigned int Count) {
+ assert(Count <= 64);
+ if (Count > 63)
+ return UINT64_MAX;
+ return (uint64_t(1) << Count) - 1;
+}
+
+// Represents operands 2 to 5 of the ROTATE AND ... SELECTED BITS operation
+// given by Opcode. The operands are: Input (R2), Start (I3), End (I4) and
+// Rotate (I5). The combined operand value is effectively:
+//
+// (or (rotl Input, Rotate), ~Mask)
+//
+// for RNSBG and:
+//
+// (and (rotl Input, Rotate), Mask)
+//
+// otherwise. The output value has BitSize bits, although Input may be
+// narrower (in which case the upper bits are don't care), or wider (in which
+// case the result will be truncated as part of the operation).
+struct RxSBGOperands {
+ RxSBGOperands(unsigned Op, SDValue N)
+ : Opcode(Op), BitSize(N.getValueSizeInBits()),
+ Mask(allOnes(BitSize)), Input(N), Start(64 - BitSize), End(63),
+ Rotate(0) {}
+
+ unsigned Opcode;
+ unsigned BitSize;
+ uint64_t Mask;
+ SDValue Input;
+ unsigned Start;
+ unsigned End;
+ unsigned Rotate;
+};
+
+class SystemZDAGToDAGISel : public SelectionDAGISel {
+ const SystemZSubtarget *Subtarget;
+
+ // Used by SystemZOperands.td to create integer constants.
+ inline SDValue getImm(const SDNode *Node, uint64_t Imm) const {
+ return CurDAG->getTargetConstant(Imm, SDLoc(Node), Node->getValueType(0));
+ }
+
+ const SystemZTargetMachine &getTargetMachine() const {
+ return static_cast<const SystemZTargetMachine &>(TM);
+ }
+
+ const SystemZInstrInfo *getInstrInfo() const {
+ return Subtarget->getInstrInfo();
+ }
+
+ // Try to fold more of the base or index of AM into AM, where IsBase
+ // selects between the base and index.
+ bool expandAddress(SystemZAddressingMode &AM, bool IsBase) const;
+
+ // Try to describe N in AM, returning true on success.
+ bool selectAddress(SDValue N, SystemZAddressingMode &AM) const;
+
+ // Extract individual target operands from matched address AM.
+ void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
+ SDValue &Base, SDValue &Disp) const;
+ void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
+ SDValue &Base, SDValue &Disp, SDValue &Index) const;
+
+ // Try to match Addr as a FormBD address with displacement type DR.
+ // Return true on success, storing the base and displacement in
+ // Base and Disp respectively.
+ bool selectBDAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+ SDValue &Base, SDValue &Disp) const;
+
+ // Try to match Addr as a FormBDX address with displacement type DR.
+ // Return true on success and if the result had no index. Store the
+ // base and displacement in Base and Disp respectively.
+ bool selectMVIAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+ SDValue &Base, SDValue &Disp) const;
+
+ // Try to match Addr as a FormBDX* address of form Form with
+ // displacement type DR. Return true on success, storing the base,
+ // displacement and index in Base, Disp and Index respectively.
+ bool selectBDXAddr(SystemZAddressingMode::AddrForm Form,
+ SystemZAddressingMode::DispRange DR, SDValue Addr,
+ SDValue &Base, SDValue &Disp, SDValue &Index) const;
+
+ // PC-relative address matching routines used by SystemZOperands.td.
+ bool selectPCRelAddress(SDValue Addr, SDValue &Target) const {
+ if (SystemZISD::isPCREL(Addr.getOpcode())) {
+ Target = Addr.getOperand(0);
+ return true;
+ }
+ return false;
+ }
+
+ // BD matching routines used by SystemZOperands.td.
+ bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+ return selectBDAddr(SystemZAddressingMode::Disp12Only, Addr, Base, Disp);
+ }
+ bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+ return selectBDAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+ }
+ bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+ return selectBDAddr(SystemZAddressingMode::Disp20Only, Addr, Base, Disp);
+ }
+ bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+ return selectBDAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+ }
+
+ // MVI matching routines used by SystemZOperands.td.
+ bool selectMVIAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+ return selectMVIAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+ }
+ bool selectMVIAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+ return selectMVIAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+ }
+
+ // BDX matching routines used by SystemZOperands.td.
+ bool selectBDXAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) const {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp12Only,
+ Addr, Base, Disp, Index);
+ }
+ bool selectBDXAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) const {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp12Pair,
+ Addr, Base, Disp, Index);
+ }
+ bool selectDynAlloc12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) const {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXDynAlloc,
+ SystemZAddressingMode::Disp12Only,
+ Addr, Base, Disp, Index);
+ }
+ bool selectBDXAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) const {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp20Only,
+ Addr, Base, Disp, Index);
+ }
+ bool selectBDXAddr20Only128(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) const {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp20Only128,
+ Addr, Base, Disp, Index);
+ }
+ bool selectBDXAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) const {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp20Pair,
+ Addr, Base, Disp, Index);
+ }
+ bool selectLAAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) const {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
+ SystemZAddressingMode::Disp12Pair,
+ Addr, Base, Disp, Index);
+ }
+ bool selectLAAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) const {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
+ SystemZAddressingMode::Disp20Pair,
+ Addr, Base, Disp, Index);
+ }
+
+ // Try to match Addr as an address with a base, 12-bit displacement
+ // and index, where the index is element Elem of a vector.
+ // Return true on success, storing the base, displacement and vector
+ // in Base, Disp and Index respectively.
+ bool selectBDVAddr12Only(SDValue Addr, SDValue Elem, SDValue &Base,
+ SDValue &Disp, SDValue &Index) const;
+
+ // Check whether (or Op (and X InsertMask)) is effectively an insertion
+ // of X into bits InsertMask of some Y != Op. Return true if so and
+ // set Op to that Y.
+ bool detectOrAndInsertion(SDValue &Op, uint64_t InsertMask) const;
+
+ // Try to update RxSBG so that only the bits of RxSBG.Input in Mask are used.
+ // Return true on success.
+ bool refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) const;
+
+ // Try to fold some of RxSBG.Input into other fields of RxSBG.
+ // Return true on success.
+ bool expandRxSBG(RxSBGOperands &RxSBG) const;
+
+ // Return an undefined value of type VT.
+ SDValue getUNDEF(const SDLoc &DL, EVT VT) const;
+
+ // Convert N to VT, if it isn't already.
+ SDValue convertTo(const SDLoc &DL, EVT VT, SDValue N) const;
+
+ // Try to implement AND or shift node N using RISBG with the zero flag set.
+ // Return true on success, otherwise return false.
+ bool tryRISBGZero(SDNode *N);
+
+ // Try to use RISBG or Opcode to implement OR or XOR node N.
+ // Return true on success, otherwise return false.
+ bool tryRxSBG(SDNode *N, unsigned Opcode);
+
+ // If Op0 is null, then Node is a constant that can be loaded using:
+ //
+ // (Opcode UpperVal LowerVal)
+ //
+ // If Op0 is nonnull, then Node can be implemented using:
+ //
+ // (Opcode (Opcode Op0 UpperVal) LowerVal)
+ void splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
+ uint64_t UpperVal, uint64_t LowerVal);
+
+ // Try to use gather instruction Opcode to implement vector insertion N.
+ bool tryGather(SDNode *N, unsigned Opcode);
+
+ // Try to use scatter instruction Opcode to implement store Store.
+ bool tryScatter(StoreSDNode *Store, unsigned Opcode);
+
+ // Return true if Load and Store are loads and stores of the same size
+ // and are guaranteed not to overlap. Such operations can be implemented
+ // using block (SS-format) instructions.
+ //
+ // Partial overlap would lead to incorrect code, since the block operations
+ // are logically bytewise, even though they have a fast path for the
+ // non-overlapping case. We also need to avoid full overlap (i.e. two
+ // addresses that might be equal at run time) because although that case
+ // would be handled correctly, it might be implemented by millicode.
+ bool canUseBlockOperation(StoreSDNode *Store, LoadSDNode *Load) const;
+
+ // N is a (store (load Y), X) pattern. Return true if it can use an MVC
+ // from Y to X.
+ bool storeLoadCanUseMVC(SDNode *N) const;
+
+ // N is a (store (op (load A[0]), (load A[1])), X) pattern. Return true
+ // if A[1 - I] == X and if N can use a block operation like NC from A[I]
+ // to X.
+ bool storeLoadCanUseBlockBinary(SDNode *N, unsigned I) const;
+
+public:
+ SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(TM, OptLevel) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ Subtarget = &MF.getSubtarget<SystemZSubtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
+ // Override MachineFunctionPass.
+ StringRef getPassName() const override {
+ return "SystemZ DAG->DAG Pattern Instruction Selection";
+ }
+
+ // Override SelectionDAGISel.
+ void Select(SDNode *Node) override;
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+ // Include the pieces autogenerated from the target description.
+ #include "SystemZGenDAGISel.inc"
+};
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZISelDag(SystemZTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new SystemZDAGToDAGISel(TM, OptLevel);
+}
+
+// Return true if Val should be selected as a displacement for an address
+// with range DR. Here we're interested in the range of both the instruction
+// described by DR and of any pairing instruction.
+static bool selectDisp(SystemZAddressingMode::DispRange DR, int64_t Val) {
+ switch (DR) {
+ case SystemZAddressingMode::Disp12Only:
+ return isUInt<12>(Val);
+
+ case SystemZAddressingMode::Disp12Pair:
+ case SystemZAddressingMode::Disp20Only:
+ case SystemZAddressingMode::Disp20Pair:
+ return isInt<20>(Val);
+
+ case SystemZAddressingMode::Disp20Only128:
+ return isInt<20>(Val) && isInt<20>(Val + 8);
+ }
+ llvm_unreachable("Unhandled displacement range");
+}
+
+// Change the base or index in AM to Value, where IsBase selects
+// between the base and index.
+static void changeComponent(SystemZAddressingMode &AM, bool IsBase,
+ SDValue Value) {
+ if (IsBase)
+ AM.Base = Value;
+ else
+ AM.Index = Value;
+}
+
+// The base or index of AM is equivalent to Value + ADJDYNALLOC,
+// where IsBase selects between the base and index. Try to fold the
+// ADJDYNALLOC into AM.
+static bool expandAdjDynAlloc(SystemZAddressingMode &AM, bool IsBase,
+ SDValue Value) {
+ if (AM.isDynAlloc() && !AM.IncludesDynAlloc) {
+ changeComponent(AM, IsBase, Value);
+ AM.IncludesDynAlloc = true;
+ return true;
+ }
+ return false;
+}
+
+// The base of AM is equivalent to Base + Index. Try to use Index as
+// the index register.
+static bool expandIndex(SystemZAddressingMode &AM, SDValue Base,
+ SDValue Index) {
+ if (AM.hasIndexField() && !AM.Index.getNode()) {
+ AM.Base = Base;
+ AM.Index = Index;
+ return true;
+ }
+ return false;
+}
+
+// The base or index of AM is equivalent to Op0 + Op1, where IsBase selects
+// between the base and index. Try to fold Op1 into AM's displacement.
+static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
+ SDValue Op0, uint64_t Op1) {
+ // First try adjusting the displacement.
+ int64_t TestDisp = AM.Disp + Op1;
+ if (selectDisp(AM.DR, TestDisp)) {
+ changeComponent(AM, IsBase, Op0);
+ AM.Disp = TestDisp;
+ return true;
+ }
+
+ // We could consider forcing the displacement into a register and
+ // using it as an index, but it would need to be carefully tuned.
+ return false;
+}
+
+bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
+ bool IsBase) const {
+ SDValue N = IsBase ? AM.Base : AM.Index;
+ unsigned Opcode = N.getOpcode();
+ if (Opcode == ISD::TRUNCATE) {
+ N = N.getOperand(0);
+ Opcode = N.getOpcode();
+ }
+ if (Opcode == ISD::ADD || CurDAG->isBaseWithConstantOffset(N)) {
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
+
+ unsigned Op0Code = Op0->getOpcode();
+ unsigned Op1Code = Op1->getOpcode();
+
+ if (Op0Code == SystemZISD::ADJDYNALLOC)
+ return expandAdjDynAlloc(AM, IsBase, Op1);
+ if (Op1Code == SystemZISD::ADJDYNALLOC)
+ return expandAdjDynAlloc(AM, IsBase, Op0);
+
+ if (Op0Code == ISD::Constant)
+ return expandDisp(AM, IsBase, Op1,
+ cast<ConstantSDNode>(Op0)->getSExtValue());
+ if (Op1Code == ISD::Constant)
+ return expandDisp(AM, IsBase, Op0,
+ cast<ConstantSDNode>(Op1)->getSExtValue());
+
+ if (IsBase && expandIndex(AM, Op0, Op1))
+ return true;
+ }
+ if (Opcode == SystemZISD::PCREL_OFFSET) {
+ SDValue Full = N.getOperand(0);
+ SDValue Base = N.getOperand(1);
+ SDValue Anchor = Base.getOperand(0);
+ uint64_t Offset = (cast<GlobalAddressSDNode>(Full)->getOffset() -
+ cast<GlobalAddressSDNode>(Anchor)->getOffset());
+ return expandDisp(AM, IsBase, Base, Offset);
+ }
+ return false;
+}
+
+// Return true if an instruction with displacement range DR should be
+// used for displacement value Val. selectDisp(DR, Val) must already hold.
+static bool isValidDisp(SystemZAddressingMode::DispRange DR, int64_t Val) {
+ assert(selectDisp(DR, Val) && "Invalid displacement");
+ switch (DR) {
+ case SystemZAddressingMode::Disp12Only:
+ case SystemZAddressingMode::Disp20Only:
+ case SystemZAddressingMode::Disp20Only128:
+ return true;
+
+ case SystemZAddressingMode::Disp12Pair:
+ // Use the other instruction if the displacement is too large.
+ return isUInt<12>(Val);
+
+ case SystemZAddressingMode::Disp20Pair:
+ // Use the other instruction if the displacement is small enough.
+ return !isUInt<12>(Val);
+ }
+ llvm_unreachable("Unhandled displacement range");
+}
+
+// Return true if Base + Disp + Index should be performed by LA(Y).
+static bool shouldUseLA(SDNode *Base, int64_t Disp, SDNode *Index) {
+ // Don't use LA(Y) for constants.
+ if (!Base)
+ return false;
+
+ // Always use LA(Y) for frame addresses, since we know that the destination
+ // register is almost always (perhaps always) going to be different from
+ // the frame register.
+ if (Base->getOpcode() == ISD::FrameIndex)
+ return true;
+
+ if (Disp) {
+ // Always use LA(Y) if there is a base, displacement and index.
+ if (Index)
+ return true;
+
+ // Always use LA if the displacement is small enough. It should always
+ // be no worse than AGHI (and better if it avoids a move).
+ if (isUInt<12>(Disp))
+ return true;
+
+ // For similar reasons, always use LAY if the constant is too big for AGHI.
+ // LAY should be no worse than AGFI.
+ if (!isInt<16>(Disp))
+ return true;
+ } else {
+ // Don't use LA for plain registers.
+ if (!Index)
+ return false;
+
+ // Don't use LA for plain addition if the index operand is only used
+ // once. It should be a natural two-operand addition in that case.
+ if (Index->hasOneUse())
+ return false;
+
+ // Prefer addition if the second operation is sign-extended, in the
+ // hope of using AGF.
+ unsigned IndexOpcode = Index->getOpcode();
+ if (IndexOpcode == ISD::SIGN_EXTEND ||
+ IndexOpcode == ISD::SIGN_EXTEND_INREG)
+ return false;
+ }
+
+ // Don't use LA for two-operand addition if either operand is only
+ // used once. The addition instructions are better in that case.
+ if (Base->hasOneUse())
+ return false;
+
+ return true;
+}
+
+// Return true if Addr is suitable for AM, updating AM if so.
+bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
+ SystemZAddressingMode &AM) const {
+ // Start out assuming that the address will need to be loaded separately,
+ // then try to extend it as much as we can.
+ AM.Base = Addr;
+
+ // First try treating the address as a constant.
+ if (Addr.getOpcode() == ISD::Constant &&
+ expandDisp(AM, true, SDValue(),
+ cast<ConstantSDNode>(Addr)->getSExtValue()))
+ ;
+ // Also see if it's a bare ADJDYNALLOC.
+ else if (Addr.getOpcode() == SystemZISD::ADJDYNALLOC &&
+ expandAdjDynAlloc(AM, true, SDValue()))
+ ;
+ else
+ // Otherwise try expanding each component.
+ while (expandAddress(AM, true) ||
+ (AM.Index.getNode() && expandAddress(AM, false)))
+ continue;
+
+ // Reject cases where it isn't profitable to use LA(Y).
+ if (AM.Form == SystemZAddressingMode::FormBDXLA &&
+ !shouldUseLA(AM.Base.getNode(), AM.Disp, AM.Index.getNode()))
+ return false;
+
+ // Reject cases where the other instruction in a pair should be used.
+ if (!isValidDisp(AM.DR, AM.Disp))
+ return false;
+
+ // Make sure that ADJDYNALLOC is included where necessary.
+ if (AM.isDynAlloc() && !AM.IncludesDynAlloc)
+ return false;
+
+ DEBUG(AM.dump());
+ return true;
+}
+
+// Insert a node into the DAG at least before Pos. This will reposition
+// the node as needed, and will assign it a node ID that is <= Pos's ID.
+// Note that this does *not* preserve the uniqueness of node IDs!
+// The selection DAG must no longer depend on their uniqueness when this
+// function is used.
+static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
+ if (N.getNode()->getNodeId() == -1 ||
+ N.getNode()->getNodeId() > Pos->getNodeId()) {
+ DAG->RepositionNode(Pos->getIterator(), N.getNode());
+ N.getNode()->setNodeId(Pos->getNodeId());
+ }
+}
+
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
+ EVT VT, SDValue &Base,
+ SDValue &Disp) const {
+ Base = AM.Base;
+ if (!Base.getNode())
+ // Register 0 means "no base". This is mostly useful for shifts.
+ Base = CurDAG->getRegister(0, VT);
+ else if (Base.getOpcode() == ISD::FrameIndex) {
+ // Lower a FrameIndex to a TargetFrameIndex.
+ int64_t FrameIndex = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FrameIndex, VT);
+ } else if (Base.getValueType() != VT) {
+ // Truncate values from i64 to i32, for shifts.
+ assert(VT == MVT::i32 && Base.getValueType() == MVT::i64 &&
+ "Unexpected truncation");
+ SDLoc DL(Base);
+ SDValue Trunc = CurDAG->getNode(ISD::TRUNCATE, DL, VT, Base);
+ insertDAGNode(CurDAG, Base.getNode(), Trunc);
+ Base = Trunc;
+ }
+
+ // Lower the displacement to a TargetConstant.
+ Disp = CurDAG->getTargetConstant(AM.Disp, SDLoc(Base), VT);
+}
+
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
+ EVT VT, SDValue &Base,
+ SDValue &Disp,
+ SDValue &Index) const {
+ getAddressOperands(AM, VT, Base, Disp);
+
+ Index = AM.Index;
+ if (!Index.getNode())
+ // Register 0 means "no index".
+ Index = CurDAG->getRegister(0, VT);
+}
+
+bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
+ SDValue Addr, SDValue &Base,
+ SDValue &Disp) const {
+ SystemZAddressingMode AM(SystemZAddressingMode::FormBD, DR);
+ if (!selectAddress(Addr, AM))
+ return false;
+
+ getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+ return true;
+}
+
+bool SystemZDAGToDAGISel::selectMVIAddr(SystemZAddressingMode::DispRange DR,
+ SDValue Addr, SDValue &Base,
+ SDValue &Disp) const {
+ SystemZAddressingMode AM(SystemZAddressingMode::FormBDXNormal, DR);
+ if (!selectAddress(Addr, AM) || AM.Index.getNode())
+ return false;
+
+ getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+ return true;
+}
+
+bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
+ SystemZAddressingMode::DispRange DR,
+ SDValue Addr, SDValue &Base,
+ SDValue &Disp, SDValue &Index) const {
+ SystemZAddressingMode AM(Form, DR);
+ if (!selectAddress(Addr, AM))
+ return false;
+
+ getAddressOperands(AM, Addr.getValueType(), Base, Disp, Index);
+ return true;
+}
+
+bool SystemZDAGToDAGISel::selectBDVAddr12Only(SDValue Addr, SDValue Elem,
+ SDValue &Base,
+ SDValue &Disp,
+ SDValue &Index) const {
+ SDValue Regs[2];
+ if (selectBDXAddr12Only(Addr, Regs[0], Disp, Regs[1]) &&
+ Regs[0].getNode() && Regs[1].getNode()) {
+ for (unsigned int I = 0; I < 2; ++I) {
+ Base = Regs[I];
+ Index = Regs[1 - I];
+ // We can't tell here whether the index vector has the right type
+ // for the access; the caller needs to do that instead.
+ if (Index.getOpcode() == ISD::ZERO_EXTEND)
+ Index = Index.getOperand(0);
+ if (Index.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Index.getOperand(1) == Elem) {
+ Index = Index.getOperand(0);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
+ uint64_t InsertMask) const {
+ // We're only interested in cases where the insertion is into some operand
+ // of Op, rather than into Op itself. The only useful case is an AND.
+ if (Op.getOpcode() != ISD::AND)
+ return false;
+
+ // We need a constant mask.
+ auto *MaskNode = dyn_cast<ConstantSDNode>(Op.getOperand(1).getNode());
+ if (!MaskNode)
+ return false;
+
+ // It's not an insertion of Op.getOperand(0) if the two masks overlap.
+ uint64_t AndMask = MaskNode->getZExtValue();
+ if (InsertMask & AndMask)
+ return false;
+
+ // It's only an insertion if all bits are covered or are known to be zero.
+ // The inner check covers all cases but is more expensive.
+ uint64_t Used = allOnes(Op.getValueSizeInBits());
+ if (Used != (AndMask | InsertMask)) {
+ APInt KnownZero, KnownOne;
+ CurDAG->computeKnownBits(Op.getOperand(0), KnownZero, KnownOne);
+ if (Used != (AndMask | InsertMask | KnownZero.getZExtValue()))
+ return false;
+ }
+
+ Op = Op.getOperand(0);
+ return true;
+}
+
+bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG,
+ uint64_t Mask) const {
+ const SystemZInstrInfo *TII = getInstrInfo();
+ if (RxSBG.Rotate != 0)
+ Mask = (Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate));
+ Mask &= RxSBG.Mask;
+ if (TII->isRxSBGMask(Mask, RxSBG.BitSize, RxSBG.Start, RxSBG.End)) {
+ RxSBG.Mask = Mask;
+ return true;
+ }
+ return false;
+}
+
+// Return true if any bits of (RxSBG.Input & Mask) are significant.
+static bool maskMatters(RxSBGOperands &RxSBG, uint64_t Mask) {
+ // Rotate the mask in the same way as RxSBG.Input is rotated.
+ if (RxSBG.Rotate != 0)
+ Mask = ((Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate)));
+ return (Mask & RxSBG.Mask) != 0;
+}
+// Try to fold the node defining RxSBG.Input into RxSBG; returns true if one node was absorbed.
+bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
+ SDValue N = RxSBG.Input; // Node we are trying to look through.
+ unsigned Opcode = N.getOpcode();
+ switch (Opcode) {
+ case ISD::TRUNCATE: {
+ if (RxSBG.Opcode == SystemZ::RNSBG)
+ return false;
+ uint64_t BitSize = N.getValueSizeInBits();
+ uint64_t Mask = allOnes(BitSize); // Restrict the mask to the truncated width.
+ if (!refineRxSBGMask(RxSBG, Mask))
+ return false;
+ RxSBG.Input = N.getOperand(0);
+ return true;
+ }
+ case ISD::AND: {
+ if (RxSBG.Opcode == SystemZ::RNSBG)
+ return false;
+
+ auto *MaskNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ if (!MaskNode) // Only an AND with a constant mask can be folded.
+ return false;
+
+ SDValue Input = N.getOperand(0);
+ uint64_t Mask = MaskNode->getZExtValue();
+ if (!refineRxSBGMask(RxSBG, Mask)) {
+ // If some bits of Input are already known zeros, those bits will have
+ // been removed from the mask. See if adding them back in makes the
+ // mask suitable.
+ APInt KnownZero, KnownOne;
+ CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
+ Mask |= KnownZero.getZExtValue();
+ if (!refineRxSBGMask(RxSBG, Mask))
+ return false;
+ }
+ RxSBG.Input = Input;
+ return true;
+ }
+
+ case ISD::OR: {
+ if (RxSBG.Opcode != SystemZ::RNSBG) // OR is only looked through for RNSBG.
+ return false;
+
+ auto *MaskNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ if (!MaskNode) // Only an OR with a constant can be folded.
+ return false;
+
+ SDValue Input = N.getOperand(0);
+ uint64_t Mask = ~MaskNode->getZExtValue(); // Complement of the OR constant.
+ if (!refineRxSBGMask(RxSBG, Mask)) {
+ // If some bits of Input are already known ones, those bits will have
+ // been removed from the mask. See if adding them back in makes the
+ // mask suitable.
+ APInt KnownZero, KnownOne;
+ CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
+ Mask &= ~KnownOne.getZExtValue();
+ if (!refineRxSBGMask(RxSBG, Mask))
+ return false;
+ }
+ RxSBG.Input = Input;
+ return true;
+ }
+
+ case ISD::ROTL: {
+ // Any 64-bit rotate left can be merged into the RxSBG.
+ if (RxSBG.BitSize != 64 || N.getValueType() != MVT::i64)
+ return false;
+ auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ if (!CountNode) // The rotate amount must be a constant.
+ return false;
+
+ RxSBG.Rotate = (RxSBG.Rotate + CountNode->getZExtValue()) & 63;
+ RxSBG.Input = N.getOperand(0);
+ return true;
+ }
+
+ case ISD::ANY_EXTEND:
+ // Bits above the extended operand are don't-care.
+ RxSBG.Input = N.getOperand(0);
+ return true;
+
+ case ISD::ZERO_EXTEND:
+ if (RxSBG.Opcode != SystemZ::RNSBG) {
+ // Restrict the mask to the extended operand.
+ unsigned InnerBitSize = N.getOperand(0).getValueSizeInBits();
+ if (!refineRxSBGMask(RxSBG, allOnes(InnerBitSize)))
+ return false;
+
+ RxSBG.Input = N.getOperand(0);
+ return true;
+ }
+ LLVM_FALLTHROUGH; // For RNSBG, a zero extension gets the SIGN_EXTEND check.
+
+ case ISD::SIGN_EXTEND: {
+ // Check that the extension bits are don't-care (i.e. are masked out
+ // by the final mask).
+ unsigned InnerBitSize = N.getOperand(0).getValueSizeInBits();
+ if (maskMatters(RxSBG, allOnes(RxSBG.BitSize) - allOnes(InnerBitSize)))
+ return false;
+
+ RxSBG.Input = N.getOperand(0);
+ return true;
+ }
+
+ case ISD::SHL: {
+ auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ if (!CountNode) // The shift amount must be a constant.
+ return false;
+
+ uint64_t Count = CountNode->getZExtValue();
+ unsigned BitSize = N.getValueSizeInBits();
+ if (Count < 1 || Count >= BitSize) // Reject zero and out-of-range shifts.
+ return false;
+
+ if (RxSBG.Opcode == SystemZ::RNSBG) {
+ // Treat (shl X, count) as (rotl X, size-count) as long as the bottom
+ // count bits from RxSBG.Input are ignored.
+ if (maskMatters(RxSBG, allOnes(Count)))
+ return false;
+ } else {
+ // Treat (shl X, count) as (and (rotl X, count), ~0<<count).
+ if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count) << Count))
+ return false;
+ }
+
+ RxSBG.Rotate = (RxSBG.Rotate + Count) & 63;
+ RxSBG.Input = N.getOperand(0);
+ return true;
+ }
+
+ case ISD::SRL:
+ case ISD::SRA: {
+ auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ if (!CountNode) // The shift amount must be a constant.
+ return false;
+
+ uint64_t Count = CountNode->getZExtValue();
+ unsigned BitSize = N.getValueSizeInBits();
+ if (Count < 1 || Count >= BitSize) // Reject zero and out-of-range shifts.
+ return false;
+
+ if (RxSBG.Opcode == SystemZ::RNSBG || Opcode == ISD::SRA) {
+ // Treat (srl|sra X, count) as (rotl X, size-count) as long as the top
+ // count bits from RxSBG.Input are ignored.
+ if (maskMatters(RxSBG, allOnes(Count) << (BitSize - Count)))
+ return false;
+ } else {
+ // Treat (srl X, count) as (and (rotl X, size-count), ~0>>count),
+ // which is similar to SHL above.
+ if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count)))
+ return false;
+ }
+
+ RxSBG.Rotate = (RxSBG.Rotate - Count) & 63; // Subtract Count from the rotate, modulo 64.
+ RxSBG.Input = N.getOperand(0);
+ return true;
+ }
+ default: // Any other opcode cannot be folded.
+ return false;
+ }
+}
+// Return result 0 of a newly created IMPLICIT_DEF machine node of type VT.
+SDValue SystemZDAGToDAGISel::getUNDEF(const SDLoc &DL, EVT VT) const {
+ SDNode *N = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+ return SDValue(N, 0);
+}
+// Convert N to type VT using subreg_l32; only i32<->i64 changes and the identity case are handled.
+SDValue SystemZDAGToDAGISel::convertTo(const SDLoc &DL, EVT VT,
+ SDValue N) const {
+ if (N.getValueType() == MVT::i32 && VT == MVT::i64) // Widen: insert N into an undefined i64.
+ return CurDAG->getTargetInsertSubreg(SystemZ::subreg_l32,
+ DL, VT, getUNDEF(DL, MVT::i64), N);
+ if (N.getValueType() == MVT::i64 && VT == MVT::i32) // Narrow: extract subreg_l32 of N.
+ return CurDAG->getTargetExtractSubreg(SystemZ::subreg_l32, DL, VT, N);
+ assert(N.getValueType() == VT && "Unexpected value types"); // Otherwise N must already be a VT.
+ return N;
+}
+// Try to select N as a RISBG-family instruction (or as a plain AND when preferred); true if N was handled.
+bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ if (!VT.isInteger() || VT.getSizeInBits() > 64) // Only integers of at most 64 bits are handled.
+ return false;
+ RxSBGOperands RISBG(SystemZ::RISBG, SDValue(N, 0));
+ unsigned Count = 0; // Number of non-free operations folded into RISBG.
+ while (expandRxSBG(RISBG))
+ // The widening or narrowing is expected to be free.
+ // Counting widening or narrowing as a saved operation will result in
+ // preferring an R*SBG over a simple shift/logical instruction.
+ if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND &&
+ RISBG.Input.getOpcode() != ISD::TRUNCATE)
+ Count += 1;
+ if (Count == 0)
+ return false;
+
+ // Prefer to use normal shift instructions over RISBG, since they can handle
+ // all cases and are sometimes shorter.
+ if (Count == 1 && N->getOpcode() != ISD::AND)
+ return false;
+
+ // Prefer register extensions like LLC over RISBG. Also prefer to start
+ // out with normal ANDs if one instruction would be enough. We can convert
+ // these ANDs into an RISBG later if a three-address instruction is useful.
+ if (RISBG.Rotate == 0) {
+ bool PreferAnd = false;
+ // Prefer AND for any 32-bit and-immediate operation.
+ if (VT == MVT::i32)
+ PreferAnd = true;
+ // As well as for any 64-bit operation that can be implemented via LLC(R),
+ // LLH(R), LLGT(R), or one of the and-immediate instructions.
+ else if (RISBG.Mask == 0xff ||
+ RISBG.Mask == 0xffff ||
+ RISBG.Mask == 0x7fffffff ||
+ SystemZ::isImmLF(~RISBG.Mask) ||
+ SystemZ::isImmHF(~RISBG.Mask))
+ PreferAnd = true;
+ // And likewise for the LLZRGF instruction, which doesn't have a register
+ // to register version.
+ else if (auto *Load = dyn_cast<LoadSDNode>(RISBG.Input)) {
+ if (Load->getMemoryVT() == MVT::i32 &&
+ (Load->getExtensionType() == ISD::EXTLOAD ||
+ Load->getExtensionType() == ISD::ZEXTLOAD) &&
+ RISBG.Mask == 0xffffff00 &&
+ Subtarget->hasLoadAndZeroRightmostByte())
+ PreferAnd = true;
+ }
+ if (PreferAnd) {
+ // Replace the current node with an AND. Note that the current node
+ // might already be that same AND, in which case it is already CSE'd
+ // with it, and we must not call ReplaceNode.
+ SDValue In = convertTo(DL, VT, RISBG.Input);
+ SDValue Mask = CurDAG->getConstant(RISBG.Mask, DL, VT);
+ SDValue New = CurDAG->getNode(ISD::AND, DL, VT, In, Mask);
+ if (N != New.getNode()) {
+ insertDAGNode(CurDAG, N, Mask);
+ insertDAGNode(CurDAG, N, New);
+ ReplaceNode(N, New.getNode());
+ N = New.getNode();
+ }
+ // Now, select the machine opcode to implement this operation.
+ SelectCode(N);
+ return true;
+ }
+ }
+
+ unsigned Opcode = SystemZ::RISBG;
+ // Prefer RISBGN if available, since it does not clobber CC.
+ if (Subtarget->hasMiscellaneousExtensions())
+ Opcode = SystemZ::RISBGN;
+ EVT OpcodeVT = MVT::i64;
+ if (VT == MVT::i32 && Subtarget->hasHighWord()) { // Use the 32-bit RISBMux form instead.
+ Opcode = SystemZ::RISBMux;
+ OpcodeVT = MVT::i32;
+ RISBG.Start &= 31; // Keep the bit positions within 0-31 for the 32-bit form.
+ RISBG.End &= 31;
+ }
+ SDValue Ops[5] = {
+ getUNDEF(DL, OpcodeVT),
+ convertTo(DL, OpcodeVT, RISBG.Input),
+ CurDAG->getTargetConstant(RISBG.Start, DL, MVT::i32),
+ CurDAG->getTargetConstant(RISBG.End | 128, DL, MVT::i32), // NOTE(review): 128 is presumably the "zero remaining bits" flag - confirm.
+ CurDAG->getTargetConstant(RISBG.Rotate, DL, MVT::i32)
+ };
+ SDValue New = convertTo(
+ DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0));
+ ReplaceUses(N, New.getNode());
+ CurDAG->RemoveDeadNode(N);
+ return true;
+}
+// Try to select the two-operand node N as the R*SBG instruction Opcode; returns true if N was replaced.
+bool SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ if (!VT.isInteger() || VT.getSizeInBits() > 64) // Only integers of at most 64 bits are handled.
+ return false;
+ // Try treating each operand of N as the second operand of the RxSBG
+ // and see which goes deepest.
+ RxSBGOperands RxSBG[] = {
+ RxSBGOperands(Opcode, N->getOperand(0)),
+ RxSBGOperands(Opcode, N->getOperand(1))
+ };
+ unsigned Count[] = { 0, 0 }; // Non-free operations folded for each candidate.
+ for (unsigned I = 0; I < 2; ++I)
+ while (expandRxSBG(RxSBG[I]))
+ // The widening or narrowing is expected to be free.
+ // Counting widening or narrowing as a saved operation will result in
+ // preferring an R*SBG over a simple shift/logical instruction.
+ if (RxSBG[I].Input.getOpcode() != ISD::ANY_EXTEND &&
+ RxSBG[I].Input.getOpcode() != ISD::TRUNCATE)
+ Count[I] += 1;
+
+ // Do nothing if neither operand is suitable.
+ if (Count[0] == 0 && Count[1] == 0)
+ return false;
+
+ // Pick the deepest second operand.
+ unsigned I = Count[0] > Count[1] ? 0 : 1; // A tie selects operand 1.
+ SDValue Op0 = N->getOperand(I ^ 1); // The other operand becomes the instruction's first operand.
+
+ // Prefer IC for character insertions from memory.
+ if (Opcode == SystemZ::ROSBG && (RxSBG[I].Mask & 0xff) == 0)
+ if (auto *Load = dyn_cast<LoadSDNode>(Op0.getNode()))
+ if (Load->getMemoryVT() == MVT::i8)
+ return false;
+
+ // See whether we can avoid an AND in the first operand by converting
+ // ROSBG to RISBG.
+ if (Opcode == SystemZ::ROSBG && detectOrAndInsertion(Op0, RxSBG[I].Mask)) {
+ Opcode = SystemZ::RISBG;
+ // Prefer RISBGN if available, since it does not clobber CC.
+ if (Subtarget->hasMiscellaneousExtensions())
+ Opcode = SystemZ::RISBGN;
+ }
+
+ SDValue Ops[5] = { // The machine node is always built on i64 operands.
+ convertTo(DL, MVT::i64, Op0),
+ convertTo(DL, MVT::i64, RxSBG[I].Input),
+ CurDAG->getTargetConstant(RxSBG[I].Start, DL, MVT::i32),
+ CurDAG->getTargetConstant(RxSBG[I].End, DL, MVT::i32),
+ CurDAG->getTargetConstant(RxSBG[I].Rotate, DL, MVT::i32)
+ };
+ SDValue New = convertTo(
+ DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, MVT::i64, Ops), 0));
+ ReplaceNode(N, New.getNode());
+ return true;
+}
+// Select Node as (Opcode (Opcode Op0, UpperVal), LowerVal), or as (Opcode UpperVal, LowerVal) if Op0 is null.
+void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
+ SDValue Op0, uint64_t UpperVal,
+ uint64_t LowerVal) {
+ EVT VT = Node->getValueType(0);
+ SDLoc DL(Node);
+ SDValue Upper = CurDAG->getConstant(UpperVal, DL, VT);
+ if (Op0.getNode()) // Op0 was supplied: apply Opcode to it and the upper part first.
+ Upper = CurDAG->getNode(Opcode, DL, VT, Op0, Upper);
+
+ {
+ // When we haven't passed in Op0, Upper will be a constant. In order to
+ // prevent folding back to the large immediate in `Or = getNode(...)` we run
+ // SelectCode first and end up with an opaque machine node. This means that
+ // we need to use a handle to keep track of Upper in case it gets CSE'd by
+ // SelectCode.
+ //
+ // Note that in the case where Op0 is passed in we could just call
+ // SelectCode(Upper) later, along with the SelectCode(Or), and avoid needing
+ // the handle at all, but it's fine to do it here.
+ //
+ // TODO: This is a pretty hacky way to do this. Can we do something that
+ // doesn't require a two paragraph explanation?
+ HandleSDNode Handle(Upper);
+ SelectCode(Upper.getNode());
+ Upper = Handle.getValue(); // Re-read Upper through the handle after selection.
+ }
+
+ SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT);
+ SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower); // Named Or, but built with Opcode.
+
+ ReplaceUses(Node, Or.getNode());
+ CurDAG->RemoveDeadNode(Node);
+
+ SelectCode(Or.getNode());
+}
+// Try to select N, which inserts a single-use loaded value at a constant element index, as instruction Opcode.
+bool SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
+ SDValue ElemV = N->getOperand(2); // Element index operand.
+ auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
+ if (!ElemN) // The element index must be a constant.
+ return false;
+
+ unsigned Elem = ElemN->getZExtValue();
+ EVT VT = N->getValueType(0);
+ if (Elem >= VT.getVectorNumElements()) // Reject out-of-range indices.
+ return false;
+
+ auto *Load = dyn_cast<LoadSDNode>(N->getOperand(1));
+ if (!Load || !Load->hasOneUse())
+ return false;
+ if (Load->getMemoryVT().getSizeInBits() !=
+ Load->getValueType(0).getSizeInBits())
+ return false; // The memory size and the loaded value size must match.
+
+ SDValue Base, Disp, Index;
+ if (!selectBDVAddr12Only(Load->getBasePtr(), ElemV, Base, Disp, Index) ||
+ Index.getValueType() != VT.changeVectorElementTypeToInteger())
+ return false;
+
+ SDLoc DL(Load);
+ SDValue Ops[] = {
+ N->getOperand(0), Base, Disp, Index,
+ CurDAG->getTargetConstant(Elem, DL, MVT::i32), Load->getChain()
+ };
+ SDNode *Res = CurDAG->getMachineNode(Opcode, DL, VT, MVT::Other, Ops);
+ ReplaceUses(SDValue(Load, 1), SDValue(Res, 1)); // Users of the load's result 1 now use Res's MVT::Other result.
+ ReplaceNode(N, Res);
+ return true;
+}
+// Try to select Store, which stores a vector element extracted at a constant index, as instruction Opcode.
+bool SystemZDAGToDAGISel::tryScatter(StoreSDNode *Store, unsigned Opcode) {
+ SDValue Value = Store->getValue();
+ if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return false;
+ if (Store->getMemoryVT().getSizeInBits() != Value.getValueSizeInBits())
+ return false; // The memory size and the stored value size must match.
+
+ SDValue ElemV = Value.getOperand(1); // Element index operand.
+ auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
+ if (!ElemN) // The element index must be a constant.
+ return false;
+
+ SDValue Vec = Value.getOperand(0);
+ EVT VT = Vec.getValueType();
+ unsigned Elem = ElemN->getZExtValue();
+ if (Elem >= VT.getVectorNumElements()) // Reject out-of-range indices.
+ return false;
+
+ SDValue Base, Disp, Index;
+ if (!selectBDVAddr12Only(Store->getBasePtr(), ElemV, Base, Disp, Index) ||
+ Index.getValueType() != VT.changeVectorElementTypeToInteger())
+ return false;
+
+ SDLoc DL(Store);
+ SDValue Ops[] = {
+ Vec, Base, Disp, Index, CurDAG->getTargetConstant(Elem, DL, MVT::i32),
+ Store->getChain()
+ };
+ ReplaceNode(Store, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+ return true;
+}
+// Return true if Load and Store qualify for a block operation: same memory type, non-volatile, no alias found.
+bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
+ LoadSDNode *Load) const {
+ // Check that the two memory operands have the same size.
+ if (Load->getMemoryVT() != Store->getMemoryVT())
+ return false;
+
+ // Volatility stops an access from being decomposed.
+ if (Load->isVolatile() || Store->isVolatile())
+ return false;
+
+ // There's no chance of overlap if the load is invariant.
+ if (Load->isInvariant() && Load->isDereferenceable())
+ return true;
+
+ // Otherwise we need to check whether there's an alias.
+ const Value *V1 = Load->getMemOperand()->getValue();
+ const Value *V2 = Store->getMemOperand()->getValue();
+ if (!V1 || !V2) // Without both underlying values the alias query cannot be made.
+ return false;
+
+ // Reject the same underlying value accessed at the same end offset.
+ uint64_t Size = Load->getMemoryVT().getStoreSize();
+ int64_t End1 = Load->getSrcValueOffset() + Size;
+ int64_t End2 = Store->getSrcValueOffset() + Size;
+ if (V1 == V2 && End1 == End2)
+ return false;
+
+ return !AA->alias(MemoryLocation(V1, End1, Load->getAAInfo()), // NOTE(review): End1/End2 are offset+size - confirm intended.
+ MemoryLocation(V2, End2, Store->getAAInfo()));
+}
+// Return true if N, a store whose stored value is a load, passes the checks for using MVC.
+bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
+ auto *Store = cast<StoreSDNode>(N);
+ auto *Load = cast<LoadSDNode>(Store->getValue());
+
+ // Prefer not to use MVC if either address can use ... RELATIVE LONG
+ // instructions.
+ uint64_t Size = Load->getMemoryVT().getStoreSize();
+ if (Size > 1 && Size <= 8) { // The PC-relative preference applies to 2..8 byte accesses only.
+ // Prefer LHRL, LRL and LGRL.
+ if (SystemZISD::isPCREL(Load->getBasePtr().getOpcode()))
+ return false;
+ // Prefer STHRL, STRL and STGRL.
+ if (SystemZISD::isPCREL(Store->getBasePtr().getOpcode()))
+ return false;
+ }
+
+ return canUseBlockOperation(Store, Load);
+}
+// For store N of a binary op on two loads: operand I must pass canUseBlockOperation, the other must be non-volatile.
+bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
+ unsigned I) const {
+ auto *StoreA = cast<StoreSDNode>(N);
+ auto *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I)); // The other operand.
+ auto *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I)); // The operand checked against the store.
+ return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
+}
+// Select machine code for Node: try the SystemZ-specific cases below, otherwise fall back to SelectCode.
+void SystemZDAGToDAGISel::Select(SDNode *Node) {
+ // Dump information about the Node being selected
+ DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+ // If Node is already a machine node, it has been selected; nothing to do.
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ Node->setNodeId(-1);
+ return;
+ }
+
+ unsigned Opcode = Node->getOpcode();
+ switch (Opcode) {
+ case ISD::OR:
+ if (Node->getOperand(1).getOpcode() != ISD::Constant)
+ if (tryRxSBG(Node, SystemZ::ROSBG))
+ return;
+ goto or_xor; // Share the large-immediate handling with XOR.
+
+ case ISD::XOR:
+ if (Node->getOperand(1).getOpcode() != ISD::Constant)
+ if (tryRxSBG(Node, SystemZ::RXSBG))
+ return;
+ // Fall through.
+ or_xor:
+ // If this is a 64-bit operation in which both 32-bit halves are nonzero,
+ // split the operation into two.
+ if (Node->getValueType(0) == MVT::i64)
+ if (auto *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
+ uint64_t Val = Op1->getZExtValue();
+ if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val)) {
+ splitLargeImmediate(Opcode, Node, Node->getOperand(0),
+ Val - uint32_t(Val), uint32_t(Val)); // Upper 32 bits, then lower 32 bits.
+ return;
+ }
+ }
+ break;
+
+ case ISD::AND:
+ if (Node->getOperand(1).getOpcode() != ISD::Constant)
+ if (tryRxSBG(Node, SystemZ::RNSBG))
+ return;
+ LLVM_FALLTHROUGH; // An AND not handled above is also tried as a RISBG.
+ case ISD::ROTL:
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::ZERO_EXTEND:
+ if (tryRISBGZero(Node))
+ return;
+ break;
+
+ case ISD::Constant:
+ // If this is a 64-bit constant that is out of the range of LLILF,
+ // LLIHF and LGFI, split it into two 32-bit pieces.
+ if (Node->getValueType(0) == MVT::i64) {
+ uint64_t Val = cast<ConstantSDNode>(Node)->getZExtValue();
+ if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val) && !isInt<32>(Val)) {
+ splitLargeImmediate(ISD::OR, Node, SDValue(), Val - uint32_t(Val),
+ uint32_t(Val)); // No Op0: the two halves are ORed together.
+ return;
+ }
+ }
+ break;
+
+ case SystemZISD::SELECT_CCMASK: {
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ // Prefer to put any load first, so that it can be matched as a
+ // conditional load. Likewise for constants in range for LOCHI.
+ if ((Op1.getOpcode() == ISD::LOAD && Op0.getOpcode() != ISD::LOAD) ||
+ (Subtarget->hasLoadStoreOnCond2() &&
+ Node->getValueType(0).isInteger() &&
+ Op1.getOpcode() == ISD::Constant &&
+ isInt<16>(cast<ConstantSDNode>(Op1)->getSExtValue()) &&
+ !(Op0.getOpcode() == ISD::Constant &&
+ isInt<16>(cast<ConstantSDNode>(Op0)->getSExtValue())))) {
+ SDValue CCValid = Node->getOperand(2);
+ SDValue CCMask = Node->getOperand(3);
+ uint64_t ConstCCValid =
+ cast<ConstantSDNode>(CCValid.getNode())->getZExtValue();
+ uint64_t ConstCCMask =
+ cast<ConstantSDNode>(CCMask.getNode())->getZExtValue();
+ // Invert the condition.
+ CCMask = CurDAG->getConstant(ConstCCValid ^ ConstCCMask, SDLoc(Node),
+ CCMask.getValueType());
+ SDValue Op4 = Node->getOperand(4);
+ Node = CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4); // Swap Op0 and Op1.
+ }
+ break;
+ }
+
+ case ISD::INSERT_VECTOR_ELT: {
+ EVT VT = Node->getValueType(0);
+ unsigned ElemBitSize = VT.getScalarSizeInBits();
+ if (ElemBitSize == 32) {
+ if (tryGather(Node, SystemZ::VGEF))
+ return;
+ } else if (ElemBitSize == 64) {
+ if (tryGather(Node, SystemZ::VGEG))
+ return;
+ }
+ break;
+ }
+
+ case ISD::STORE: {
+ auto *Store = cast<StoreSDNode>(Node);
+ unsigned ElemBitSize = Store->getValue().getValueSizeInBits(); // Size of the stored value.
+ if (ElemBitSize == 32) {
+ if (tryScatter(Store, SystemZ::VSCEF))
+ return;
+ } else if (ElemBitSize == 64) {
+ if (tryScatter(Store, SystemZ::VSCEG))
+ return;
+ }
+ break;
+ }
+ }
+
+ SelectCode(Node); // Nothing above handled Node completely.
+}
+// Match Op for an inline asm memory constraint; returns false after appending Base, Disp, Index to OutOps, else true.
+bool SystemZDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) {
+ SystemZAddressingMode::AddrForm Form;
+ SystemZAddressingMode::DispRange DispRange;
+ SDValue Base, Disp, Index;
+
+ switch(ConstraintID) {
+ default:
+ llvm_unreachable("Unexpected asm memory constraint");
+ case InlineAsm::Constraint_i:
+ case InlineAsm::Constraint_Q:
+ // Accept an address with a short displacement, but no index.
+ Form = SystemZAddressingMode::FormBD;
+ DispRange = SystemZAddressingMode::Disp12Only;
+ break;
+ case InlineAsm::Constraint_R:
+ // Accept an address with a short displacement and an index.
+ Form = SystemZAddressingMode::FormBDXNormal;
+ DispRange = SystemZAddressingMode::Disp12Only;
+ break;
+ case InlineAsm::Constraint_S:
+ // Accept an address with a long displacement, but no index.
+ Form = SystemZAddressingMode::FormBD;
+ DispRange = SystemZAddressingMode::Disp20Only;
+ break;
+ case InlineAsm::Constraint_T:
+ case InlineAsm::Constraint_m:
+ // Accept an address with a long displacement and an index.
+ // m works the same as T, as this is the most general case.
+ Form = SystemZAddressingMode::FormBDXNormal;
+ DispRange = SystemZAddressingMode::Disp20Only;
+ break;
+ }
+
+ if (selectBDXAddr(Form, DispRange, Op, Base, Disp, Index)) {
+ const TargetRegisterClass *TRC =
+ Subtarget->getRegisterInfo()->getPointerRegClass(*MF);
+ SDLoc DL(Base);
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), DL, MVT::i32);
+
+ // Make sure that the base address doesn't go into %r0.
+ // If it's a TargetFrameIndex or a fixed register, we shouldn't do anything.
+ if (Base.getOpcode() != ISD::TargetFrameIndex &&
+ Base.getOpcode() != ISD::Register) {
+ Base =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ DL, Base.getValueType(),
+ Base, RC), 0);
+ }
+
+ // Make sure that the index register isn't assigned to %r0 either.
+ if (Index.getOpcode() != ISD::Register) {
+ Index =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ DL, Index.getValueType(),
+ Index, RC), 0);
+ }
+
+ OutOps.push_back(Base);
+ OutOps.push_back(Disp);
+ OutOps.push_back(Index);
+ return false; // The three address operands were produced.
+ }
+
+ return true; // selectBDXAddr did not match the address.
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
new file mode 100644
index 000000000000..2081809def70
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -0,0 +1,6294 @@
+//===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZISelLowering.h"
+#include "SystemZCallingConv.h"
+#include "SystemZConstantPoolValue.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/IR/Intrinsics.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-lower"
+
+namespace {
+// Represents a sequence for extracting a 0/1 value from an IPM result:
+// (((X ^ XORValue) + AddValue) >> Bit)
+struct IPMConversion {
+ IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
+ : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
+
+ int64_t XORValue; // XORed with X first; the constructor takes it as unsigned.
+ int64_t AddValue; // Added to the result of the XOR.
+ unsigned Bit; // Final right-shift amount.
+};
+
+// Represents information about a comparison.
+struct Comparison {
+ Comparison(SDValue Op0In, SDValue Op1In) // All fields other than the operands start at 0.
+ : Op0(Op0In), Op1(Op1In), Opcode(0), ICmpType(0), CCValid(0), CCMask(0) {}
+
+ // The operands to the comparison.
+ SDValue Op0, Op1;
+
+ // The opcode that should be used to compare Op0 and Op1.
+ unsigned Opcode;
+
+ // A SystemZICMP value. Only used for integer comparisons.
+ unsigned ICmpType;
+
+ // The mask of CC values that Opcode can produce.
+ unsigned CCValid;
+
+ // The mask of CC values for which the original condition is true.
+ unsigned CCMask;
+};
+} // end anonymous namespace
+
+// Return true if VT is i32 and false if it is i64; any other type is unreachable.
+static bool is32Bit(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i32:
+ return true;
+ case MVT::i64:
+ return false;
+ default:
+ llvm_unreachable("Unsupported type");
+ }
+}
+
+// Return a copy of Op that can be safely used before the final use: if Op
+// is a register operand, its kill flag is cleared on the returned copy.
+static MachineOperand earlyUseOperand(MachineOperand Op) {
+ if (Op.isReg())
+ Op.setIsKill(false);
+ return Op;
+}
+
+SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
+ const SystemZSubtarget &STI)
+ : TargetLowering(TM), Subtarget(STI) {
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+
+ // Set up the register classes.
+ if (Subtarget.hasHighWord())
+ addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
+ else
+ addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
+ addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
+ if (Subtarget.hasVector()) {
+ addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
+ addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
+ } else {
+ addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
+ addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+ }
+ addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
+
+ if (Subtarget.hasVector()) {
+ addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
+ }
+
+ // Compute derived properties from the register classes
+ computeRegisterProperties(Subtarget.getRegisterInfo());
+
+ // Set up special registers.
+ setStackPointerRegisterToSaveRestore(SystemZ::R15D);
+
+ // TODO: It may be better to default to latency-oriented scheduling, however
+ // LLVM's current latency-oriented scheduler can't handle physreg definitions
+ // such as SystemZ has with CC, so set this to the register-pressure
+ // scheduler, because it can.
+ setSchedulingPreference(Sched::RegPressure);
+
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // Instructions are strings of 2-byte aligned 2-byte values.
+ setMinFunctionAlignment(2);
+
+ // Handle operations that are handled in a similar way for all types.
+ for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
+ I <= MVT::LAST_FP_VALUETYPE;
+ ++I) {
+ MVT VT = MVT::SimpleValueType(I);
+ if (isTypeLegal(VT)) {
+ // Lower SET_CC into an IPM-based sequence.
+ setOperationAction(ISD::SETCC, VT, Custom);
+
+ // Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
+ setOperationAction(ISD::SELECT, VT, Expand);
+
+ // Lower SELECT_CC and BR_CC into separate comparisons and branches.
+ setOperationAction(ISD::SELECT_CC, VT, Custom);
+ setOperationAction(ISD::BR_CC, VT, Custom);
+ }
+ }
+
+ // Expand jump table branches as address arithmetic followed by an
+ // indirect jump.
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+ // Expand BRCOND into a BR_CC (see above).
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+ // Handle integer types.
+ for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
+ I <= MVT::LAST_INTEGER_VALUETYPE;
+ ++I) {
+ MVT VT = MVT::SimpleValueType(I);
+ if (isTypeLegal(VT)) {
+ // Expand individual DIV and REMs into DIVREMs.
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Custom);
+
+ // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
+ // stores, putting a serialization instruction after the stores.
+ setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+
+ // Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
+ // available, or if the operand is constant.
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+
+ // Use POPCNT on z196 and above.
+ if (Subtarget.hasPopulationCount())
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ else
+ setOperationAction(ISD::CTPOP, VT, Expand);
+
+ // No special instructions for these.
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+
+ // Use *MUL_LOHI where possible instead of MULH*.
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Custom);
+ setOperationAction(ISD::UMUL_LOHI, VT, Custom);
+
+ // Only z196 and above have native support for conversions to unsigned.
+ if (!Subtarget.hasFPExtension())
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ }
+ }
+
+ // Type legalization will convert 8- and 16-bit atomic operations into
+ // forms that operate on i32s (but still keeping the original memory VT).
+ // Lower them into full i32 operations.
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+ // Traps are legal, as we will convert them to "j .+2".
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // z10 has instructions for signed but not unsigned FP conversion.
+ // Handle unsigned 32-bit types as signed 64-bit types.
+ if (!Subtarget.hasFPExtension()) {
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+ }
+
+ // We have native support for a 64-bit CTLZ, via FLOGR.
+ setOperationAction(ISD::CTLZ, MVT::i32, Promote);
+ setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+
+ // Give LowerOperation the chance to replace 64-bit ORs with subregs.
+ setOperationAction(ISD::OR, MVT::i64, Custom);
+
+ // FIXME: Can we support these natively?
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+
+ // We have native instructions for i8, i16 and i32 extensions, but not i1.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ }
+
+ // Handle the various types of symbolic address.
+ setOperationAction(ISD::ConstantPool, PtrVT, Custom);
+ setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
+ setOperationAction(ISD::BlockAddress, PtrVT, Custom);
+ setOperationAction(ISD::JumpTable, PtrVT, Custom);
+
+ // We need to handle dynamic allocations specially because of the
+ // 160-byte area at the bottom of the stack.
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+ setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, PtrVT, Custom);
+
+ // Use custom expanders so that we can force the function to use
+ // a frame pointer.
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
+
+ // Handle prefetches with PFD or PFDRL.
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+ for (MVT VT : MVT::vector_valuetypes()) {
+ // Assume by default that all vector operations need to be expanded.
+ for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
+ if (getOperationAction(Opcode, VT) == Legal)
+ setOperationAction(Opcode, VT, Expand);
+
+ // Likewise all truncating stores and extending loads.
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(VT, InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ }
+
+ if (isTypeLegal(VT)) {
+ // These operations are legal for anything that can be stored in a
+ // vector register, even if there is no native support for the format
+ // as such. In particular, we can do these for v4f32 even though there
+ // are no specific instructions for that format.
+ setOperationAction(ISD::LOAD, VT, Legal);
+ setOperationAction(ISD::STORE, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::BITCAST, VT, Legal);
+ setOperationAction(ISD::UNDEF, VT, Legal);
+
+ // Likewise, except that we need to replace the nodes with something
+ // more specific.
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ }
+ }
+
+ // Handle integer vector types.
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ if (isTypeLegal(VT)) {
+ // These operations have direct equivalents.
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal);
+ setOperationAction(ISD::ADD, VT, Legal);
+ setOperationAction(ISD::SUB, VT, Legal);
+ if (VT != MVT::v2i64)
+ setOperationAction(ISD::MUL, VT, Legal);
+ setOperationAction(ISD::AND, VT, Legal);
+ setOperationAction(ISD::OR, VT, Legal);
+ setOperationAction(ISD::XOR, VT, Legal);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Legal);
+ setOperationAction(ISD::CTLZ, VT, Legal);
+
+ // Convert a GPR scalar to a vector by inserting it into element 0.
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+
+ // Use a series of unpacks for extensions.
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+
+ // Detect shifts by a scalar amount and convert them into
+ // V*_BY_SCALAR.
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+
+ // At present ROTL isn't matched by DAGCombiner. ROTR should be
+ // converted into ROTL.
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+
+ // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
+ // and inverting the result as necessary.
+ setOperationAction(ISD::SETCC, VT, Custom);
+ }
+ }
+
+ if (Subtarget.hasVector()) {
+ // There should be no need to check for float types other than v2f64
+ // since <2 x f32> isn't a legal type.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+ }
+
+ // Handle floating-point types.
+ for (unsigned I = MVT::FIRST_FP_VALUETYPE;
+ I <= MVT::LAST_FP_VALUETYPE;
+ ++I) {
+ MVT VT = MVT::SimpleValueType(I);
+ if (isTypeLegal(VT)) {
+ // We can use FI for FRINT.
+ setOperationAction(ISD::FRINT, VT, Legal);
+
+ // We can use the extended form of FI for other rounding operations.
+ if (Subtarget.hasFPExtension()) {
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FROUND, VT, Legal);
+ }
+
+ // No special instructions for these.
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ }
+ }
+
+ // Handle floating-point vector types.
+ if (Subtarget.hasVector()) {
+ // Scalar-to-vector conversion is just a subreg.
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
+
+ // Some insertions and extractions can be done directly but others
+ // need to go via integers.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+ // These operations have direct equivalents.
+ setOperationAction(ISD::FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::FABS, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+ setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
+ }
+
+ // We have fused multiply-addition for f32 and f64 but not f128.
+ setOperationAction(ISD::FMA, MVT::f32, Legal);
+ setOperationAction(ISD::FMA, MVT::f64, Legal);
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+
+ // Needed so that we don't try to implement f128 constant loads using
+ // a load-and-extend of a f80 constant (in cases where the constant
+ // would fit in an f80).
+ for (MVT VT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+
+ // Floating-point truncation and stores need to be done separately.
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+
+ // We have 64-bit FPR<->GPR moves, but need special handling for
+ // 32-bit forms.
+ if (!Subtarget.hasVector()) {
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+ }
+
+ // VASTART and VACOPY need to deal with the SystemZ-specific varargs
+ // structure, but VAEND is a no-op.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+ // Codes for which we want to perform some z-specific combinations.
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::FP_ROUND);
+ setTargetDAGCombine(ISD::BSWAP);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::ROTL);
+
+ // Handle intrinsics.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ // We want to use MVC in preference to even a single load/store pair.
+ MaxStoresPerMemcpy = 0;
+ MaxStoresPerMemcpyOptSize = 0;
+
+ // The main memset sequence is a byte store followed by an MVC.
+ // Two STC or MV..I stores win over that, but the kind of fused stores
+ // generated by target-independent code don't when the byte value is
+ // variable. E.g. "STC <reg>;MHI <reg>,257;STH <reg>" is not better
+ // than "STC;MVC". Handle the choice in target-specific code instead.
+ MaxStoresPerMemset = 0;
+ MaxStoresPerMemsetOptSize = 0;
+}
+
+// Pick the result type of a SETCC on operands of type VT: i32 for any
+// non-vector VT, otherwise VT with its element type changed to an integer.
+EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
+                                              LLVMContext &, EVT VT) const {
+  if (VT.isVector())
+    return VT.changeVectorElementTypeToInteger();
+  return MVT::i32;
+}
+
+// Decide, from the scalar element type of VT, whether a fused multiply-add
+// should be preferred over a separate multiply and add.  Only f32 and f64
+// answer true; f128, non-simple types and everything else answer false.
+bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  const EVT ScalarVT = VT.getScalarType();
+  if (ScalarVT.isSimple()) {
+    const MVT::SimpleValueType Ty = ScalarVT.getSimpleVT().SimpleTy;
+    return Ty == MVT::f32 || Ty == MVT::f64;
+  }
+  return false;
+}
+
+// Only the two floating-point zeros are legal immediates, whatever VT is.
+bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  // Zero is loaded with LZ?R; negative zero needs LZ?R followed by LC?BR.
+  if (Imm.isZero())
+    return true;
+  return Imm.isNegZero();
+}
+
+// An integer-compare immediate is legal when it fits in 32 bits, either as a
+// signed value (CGFI) or as an unsigned value (CLGFI).
+bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  const bool FitsSigned32 = isInt<32>(Imm);
+  const bool FitsUnsigned32 = isUInt<32>(Imm);
+  return FitsSigned32 || FitsUnsigned32;
+}
+
+// Return true if Imm, or its negation, fits in an unsigned 32-bit field.
+bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // We can use ALGFI (for Imm) or SLGFI (for -Imm).  The negation is done in
+  // uint64_t so that INT64_MIN wraps around instead of overflowing a signed
+  // type, which would be undefined behaviour.  Every other input gives the
+  // same answer as a signed negation would.
+  const uint64_t Negated = 0 - static_cast<uint64_t>(Imm);
+  return isUInt<32>(Imm) || isUInt<32>(Negated);
+}
+
+// Misaligned accesses are always allowed, for every type, address space and
+// alignment.  When the caller supplies Fast, it is set to true as well.
+bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                           unsigned,
+                                                           unsigned,
+                                                           bool *Fast) const {
+  // An unaligned access should never be slower than its expanded form.  The
+  // few cases that really need alignment are checked for specifically.
+  if (Fast != nullptr)
+    *Fast = true;
+  return true;
+}
+
+// Accept an addressing mode only if it has no global base, a displacement
+// that fits in a signed 20-bit field, and an index that is either absent or
+// unscaled.  Ty and AS do not influence the answer.
+bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                  const AddrMode &AM, Type *Ty,
+                                                  unsigned AS) const {
+  // Globals are punted on for now, although they could be used in limited
+  // RELATIVE LONG cases.
+  const bool HasGlobalBase = AM.BaseGV != nullptr;
+  // The displacement must be a 20-bit signed value.
+  const bool OffsetFits = isInt<20>(AM.BaseOffs);
+  // Indexing is fine, but no scale factor can be applied to the index.
+  const bool ScaleIsTrivial = AM.Scale == 0 || AM.Scale == 1;
+
+  return !HasGlobalBase && OffsetFits && ScaleIsTrivial;
+}
+
+// Return true if Offset may be folded into the address of the load or store
+// I.  Without the vector facility every offset is accepted.  With it,
+// floating-point and vector accesses are limited to unsigned 12-bit offsets.
+bool SystemZTargetLowering::isFoldableMemAccessOffset(Instruction *I,
+                                                      int64_t Offset) const {
+  // The restriction below only applies to z13.
+  if (!Subtarget.hasVector())
+    return true;
+
+  // Small offsets are wanted for two reasons:
+  // * LDE can replace LE/LEY to avoid partial register dependencies, and
+  //   LDE only supports small offsets.
+  // * Floating-point values can live in vector registers, whose load and
+  //   store instructions only support small offsets.
+
+  assert (isa<LoadInst>(I) || isa<StoreInst>(I));
+
+  // A load accesses its result type; a store accesses the type of operand 0.
+  Type *AccessTy;
+  if (isa<LoadInst>(I))
+    AccessTy = I->getType();
+  else
+    AccessTy = I->getOperand(0)->getType();
+
+  const bool IsFPOrVector =
+      AccessTy->isFloatingPointTy() || AccessTy->isVectorTy();
+  return isUInt<12>(Offset) || !IsFPOrVector;
+}
+
+// IR-type overload: a truncation is free when both types are integers and
+// the source is strictly wider than the destination.
+bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
+  if (FromType->isIntegerTy() && ToType->isIntegerTy()) {
+    const unsigned SrcWidth = FromType->getPrimitiveSizeInBits();
+    const unsigned DstWidth = ToType->getPrimitiveSizeInBits();
+    return DstWidth < SrcWidth;
+  }
+  return false;
+}
+
+// Value-type overload: a truncation is free when both types are integers and
+// the source is strictly wider than the destination.
+bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
+  if (FromVT.isInteger() && ToVT.isInteger()) {
+    const unsigned SrcWidth = FromVT.getSizeInBits();
+    const unsigned DstWidth = ToVT.getSizeInBits();
+    return DstWidth < SrcWidth;
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Inline asm support
+//===----------------------------------------------------------------------===//
+
+// Classify an inline-asm constraint string.  The single-letter constraints
+// listed below are handled here; anything else is passed on to the generic
+// TargetLowering implementation.
+TargetLowering::ConstraintType
+SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
+  // Only one-character constraints get target-specific treatment.
+  if (Constraint.size() != 1)
+    return TargetLowering::getConstraintType(Constraint);
+
+  switch (Constraint[0]) {
+  // Register constraints.
+  case 'a': // Address register
+  case 'd': // Data register (equivalent to 'r')
+  case 'f': // Floating-point register
+  case 'h': // High-part register
+  case 'r': // General-purpose register
+    return C_RegisterClass;
+
+  // Memory constraints.
+  case 'Q': // Memory with base and unsigned 12-bit displacement
+  case 'R': // Likewise, plus an index
+  case 'S': // Memory with base and signed 20-bit displacement
+  case 'T': // Likewise, plus an index
+  case 'm': // Equivalent to 'T'.
+    return C_Memory;
+
+  // Immediate constraints.
+  case 'I': // Unsigned 8-bit constant
+  case 'J': // Unsigned 12-bit constant
+  case 'K': // Signed 16-bit constant
+  case 'L': // Signed 20-bit displacement (on all targets we support)
+  case 'M': // 0x7fffffff
+    return C_Other;
+
+  default:
+    return TargetLowering::getConstraintType(Constraint);
+  }
+}
+
+// Rate how well the operand in info matches the single constraint letter
+// pointed to by constraint.  Returns CW_Default when there is no operand
+// value, CW_Register or CW_Constant for a match, and CW_Invalid for a
+// mismatch.  Letters not listed here are rated by the generic code.
+TargetLowering::ConstraintWeight SystemZTargetLowering::
+getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                               const char *constraint) const {
+  Value *Operand = info.CallOperandVal;
+  // Without a value no real match is possible, but the constraint is still
+  // allowed at the lowest weight.
+  if (!Operand)
+    return CW_Default;
+
+  Type *OperandTy = Operand->getType();
+  // Null unless the operand is a constant integer; used by the immediate
+  // constraints below.
+  auto *CI = dyn_cast<ConstantInt>(Operand);
+
+  switch (*constraint) {
+  default:
+    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+
+  case 'a': // Address register
+  case 'd': // Data register (equivalent to 'r')
+  case 'h': // High-part register
+  case 'r': // General-purpose register
+    return OperandTy->isIntegerTy() ? CW_Register : CW_Invalid;
+
+  case 'f': // Floating-point register
+    return OperandTy->isFloatingPointTy() ? CW_Register : CW_Invalid;
+
+  case 'I': // Unsigned 8-bit constant
+    if (CI && isUInt<8>(CI->getZExtValue()))
+      return CW_Constant;
+    return CW_Invalid;
+
+  case 'J': // Unsigned 12-bit constant
+    if (CI && isUInt<12>(CI->getZExtValue()))
+      return CW_Constant;
+    return CW_Invalid;
+
+  case 'K': // Signed 16-bit constant
+    if (CI && isInt<16>(CI->getSExtValue()))
+      return CW_Constant;
+    return CW_Invalid;
+
+  case 'L': // Signed 20-bit displacement (on all targets we support)
+    if (CI && isInt<20>(CI->getSExtValue()))
+      return CW_Constant;
+    return CW_Invalid;
+
+  case 'M': // 0x7fffffff
+    if (CI && CI->getZExtValue() == 0x7fffffff)
+      return CW_Constant;
+    return CW_Invalid;
+  }
+}
+
+// Parse a "{tNNN}" register constraint for which the register type "t"
+// has already been verified.  RC is the class associated with "t" and
+// Map maps 0-based register numbers to LLVM register numbers.
+//
+// Returns (Map[NNN], RC) when NNN is a decimal number below 16 whose Map
+// entry is nonzero, and (0, nullptr) otherwise.  Map is indexed up to 15,
+// so it is presumably a 16-entry table; confirm against the callers.
+static std::pair<unsigned, const TargetRegisterClass *>
+parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
+                    const unsigned *Map) {
+  assert(*(Constraint.end()-1) == '}' && "Missing '}'");
+  // isdigit() is undefined for negative arguments, which a plain char can
+  // hold when the constraint contains a non-ASCII byte, so convert to
+  // unsigned char first.
+  if (isdigit(static_cast<unsigned char>(Constraint[2]))) {
+    unsigned Index;
+    // getAsInteger() returns true on failure.
+    bool Failed =
+      Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
+    if (!Failed && Index < 16 && Map[Index])
+      return std::make_pair(Map[Index], RC);
+  }
+  return std::make_pair(0U, nullptr);
+}
+
+// Resolve an inline-asm register constraint to a (register, register class)
+// pair.  Single-letter constraints yield register 0 plus a class chosen from
+// VT.  "{rN}" and "{fN}" name a specific GPR or FPR whose class also depends
+// on VT.  Anything else is handled by the generic TargetLowering code.
+std::pair<unsigned, const TargetRegisterClass *>
+SystemZTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    const TargetRegisterClass *LetterRC = nullptr;
+    switch (Constraint[0]) {
+    case 'd': // Data register (equivalent to 'r')
+    case 'r': // General-purpose register
+      if (VT == MVT::i64)
+        LetterRC = &SystemZ::GR64BitRegClass;
+      else if (VT == MVT::i128)
+        LetterRC = &SystemZ::GR128BitRegClass;
+      else
+        LetterRC = &SystemZ::GR32BitRegClass;
+      break;
+
+    case 'a': // Address register
+      if (VT == MVT::i64)
+        LetterRC = &SystemZ::ADDR64BitRegClass;
+      else if (VT == MVT::i128)
+        LetterRC = &SystemZ::ADDR128BitRegClass;
+      else
+        LetterRC = &SystemZ::ADDR32BitRegClass;
+      break;
+
+    case 'h': // High-part register (an LLVM extension)
+      LetterRC = &SystemZ::GRH32BitRegClass;
+      break;
+
+    case 'f': // Floating-point register
+      if (VT == MVT::f64)
+        LetterRC = &SystemZ::FP64BitRegClass;
+      else if (VT == MVT::f128)
+        LetterRC = &SystemZ::FP128BitRegClass;
+      else
+        LetterRC = &SystemZ::FP32BitRegClass;
+      break;
+
+    default:
+      break;
+    }
+    if (LetterRC)
+      return std::make_pair(0U, LetterRC);
+  }
+
+  if (Constraint.size() > 0 && Constraint[0] == '{') {
+    // The default register parsing is overridden for GPRs and FPRs because
+    // the interpretation depends on VT.  The internal register names also
+    // differ from the external ones (F0D and F0S instead of F0, etc.).
+    const char RegKind = Constraint[1];
+    if (RegKind == 'r') {
+      // 64-bit is the fallback for any VT other than i32 and i128.
+      const TargetRegisterClass *GPRClass = &SystemZ::GR64BitRegClass;
+      const unsigned *GPRMap = SystemZMC::GR64Regs;
+      if (VT == MVT::i32) {
+        GPRClass = &SystemZ::GR32BitRegClass;
+        GPRMap = SystemZMC::GR32Regs;
+      } else if (VT == MVT::i128) {
+        GPRClass = &SystemZ::GR128BitRegClass;
+        GPRMap = SystemZMC::GR128Regs;
+      }
+      return parseRegisterNumber(Constraint, GPRClass, GPRMap);
+    }
+    if (RegKind == 'f') {
+      // 64-bit is the fallback for any VT other than f32 and f128.
+      const TargetRegisterClass *FPRClass = &SystemZ::FP64BitRegClass;
+      const unsigned *FPRMap = SystemZMC::FP64Regs;
+      if (VT == MVT::f32) {
+        FPRClass = &SystemZ::FP32BitRegClass;
+        FPRMap = SystemZMC::FP32Regs;
+      } else if (VT == MVT::f128) {
+        FPRClass = &SystemZ::FP128BitRegClass;
+        FPRMap = SystemZMC::FP128Regs;
+      }
+      return parseRegisterNumber(Constraint, FPRClass, FPRMap);
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+// Lower Op for an inline-asm immediate constraint.  For the letters I, J, K,
+// L and M, a target constant is appended to Ops when Op is a constant in the
+// permitted range; otherwise nothing is appended.  In both cases the generic
+// code is not consulted.  Every other constraint is passed on to
+// TargetLowering.
+void SystemZTargetLowering::
+LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                             std::vector<SDValue> &Ops,
+                             SelectionDAG &DAG) const {
+  // Only support length 1 constraints for now.
+  if (Constraint.length() == 1) {
+    // Null unless Op is a constant node.
+    auto *C = dyn_cast<ConstantSDNode>(Op);
+    // Append Val to Ops as a target constant of Op's own type.
+    auto AppendImm = [&](uint64_t Val) {
+      Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), Op.getValueType()));
+    };
+
+    switch (Constraint[0]) {
+    case 'I': // Unsigned 8-bit constant
+      if (C && isUInt<8>(C->getZExtValue()))
+        AppendImm(C->getZExtValue());
+      return;
+
+    case 'J': // Unsigned 12-bit constant
+      if (C && isUInt<12>(C->getZExtValue()))
+        AppendImm(C->getZExtValue());
+      return;
+
+    case 'K': // Signed 16-bit constant
+      if (C && isInt<16>(C->getSExtValue()))
+        AppendImm(C->getSExtValue());
+      return;
+
+    case 'L': // Signed 20-bit displacement (on all targets we support)
+      if (C && isInt<20>(C->getSExtValue()))
+        AppendImm(C->getSExtValue());
+      return;
+
+    case 'M': // 0x7fffffff
+      if (C && C->getZExtValue() == 0x7fffffff)
+        AppendImm(C->getZExtValue());
+      return;
+
+    default:
+      break;
+    }
+  }
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling conventions
+//===----------------------------------------------------------------------===//
+
+#include "SystemZGenCallingConv.inc"
+
+// A truncation is allowed for a tail call exactly when isTruncateFree()
+// accepts the same pair of types.
+bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
+                                                     Type *ToType) const {
+  const bool TruncIsFree = isTruncateFree(FromType, ToType);
+  return TruncIsFree;
+}
+
+// A call may be emitted as a tail call only if it is marked as one.
+bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  const bool MarkedTail = CI->isTailCall();
+  return MarkedTail;
+}
+
+// 128-bit single-element vector types are not supported yet.  If one is used
+// as a function argument or return type, stop with a fatal error rather than
+// emit code that violates the ABI.  The check is that the original type
+// ArgVT is a vector while the type VT that it was lowered to is not.
+static void VerifyVectorType(MVT VT, EVT ArgVT) {
+  if (!ArgVT.isVector() || VT.isVector())
+    return;
+  report_fatal_error("Unsupported vector argument or return type");
+}
+
+// Run VerifyVectorType on every incoming argument, in order.
+static void VerifyVectorTypes(const SmallVectorImpl<ISD::InputArg> &Ins) {
+  for (const ISD::InputArg &In : Ins)
+    VerifyVectorType(In.VT, In.ArgVT);
+}
+
+// Run VerifyVectorType on every outgoing argument, in order.
+static void VerifyVectorTypes(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  for (const ISD::OutputArg &Out : Outs)
+    VerifyVectorType(Out.VT, Out.ArgVT);
+}
+
+// Value has been passed to us in the location described by VA, so it has
+// type VA.getLocVT().  Return it converted to VA.getValVT().  Chain is
+// accepted for chaining loads but is not used by the conversions below.
+static SDValue convertLocVTToValVT(SelectionDAG &DAG, const SDLoc &DL,
+                                   CCValAssign &VA, SDValue Chain,
+                                   SDValue Value) {
+  const CCValAssign::LocInfo Info = VA.getLocInfo();
+
+  // If the argument was promoted from a smaller type, record how the upper
+  // bits were filled with an assertion node.
+  switch (Info) {
+  case CCValAssign::SExt:
+    Value = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Value,
+                        DAG.getValueType(VA.getValVT()));
+    break;
+  case CCValAssign::ZExt:
+    Value = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Value,
+                        DAG.getValueType(VA.getValVT()));
+    break;
+  default:
+    break;
+  }
+
+  if (VA.isExtInLoc()) {
+    // Drop the extension again.
+    Value = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);
+  } else if (Info == CCValAssign::BCvt) {
+    // A short vector argument loaded from the stack: widen the i64 to a full
+    // v2i64 (second element undefined) and then bitcast to the value type.
+    assert(VA.getLocVT() == MVT::i64);
+    assert(VA.getValVT().isVector());
+    Value = DAG.getBuildVector(MVT::v2i64, DL, {Value, DAG.getUNDEF(MVT::i64)});
+    Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
+  } else {
+    assert(Info == CCValAssign::Full && "Unsupported getLocInfo");
+  }
+  return Value;
+}
+
+// Value has type VA.getValVT() and must be copied into the location
+// described by VA.  Return a copy of Value converted to VA.getLocVT().
+// The caller is responsible for handling indirect values.
+static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
+                                   CCValAssign &VA, SDValue Value) {
+  unsigned ExtendOpcode;
+  switch (VA.getLocInfo()) {
+  case CCValAssign::Full:
+    // Nothing to convert.
+    return Value;
+
+  case CCValAssign::BCvt:
+    // A short vector argument to be stored to the stack: bitcast to v2i64
+    // and then extract the first element.
+    assert(VA.getLocVT() == MVT::i64);
+    assert(VA.getValVT().isVector());
+    Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
+                       DAG.getConstant(0, DL, MVT::i32));
+
+  case CCValAssign::SExt:
+    ExtendOpcode = ISD::SIGN_EXTEND;
+    break;
+  case CCValAssign::ZExt:
+    ExtendOpcode = ISD::ZERO_EXTEND;
+    break;
+  case CCValAssign::AExt:
+    ExtendOpcode = ISD::ANY_EXTEND;
+    break;
+
+  default:
+    llvm_unreachable("Unhandled getLocInfo()");
+  }
+
+  // The three extension kinds differ only in the opcode.
+  return DAG.getNode(ExtendOpcode, DL, VA.getLocVT(), Value);
+}
+
+// Lower the incoming formal arguments described by Ins, pushing the
+// resulting values onto InVals. Register arguments are read from new
+// virtual registers that are marked live-in, stack arguments are loaded
+// from fixed frame objects, and indirect arguments are loaded through the
+// pointer that was passed. For variadic functions this also records the
+// information needed by va_start in the SystemZMachineFunctionInfo and
+// stores the remaining argument FPRs to the register save area. Returns
+// the chain, which is only changed in the variadic FPR-store case.
+SDValue SystemZTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SystemZMachineFunctionInfo *FuncInfo =
+ MF.getInfo<SystemZMachineFunctionInfo>();
+ auto *TFL =
+ static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // Detect unsupported vector argument types.
+ if (Subtarget.hasVector())
+ VerifyVectorTypes(Ins);
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
+
+ // Count the GPR and FPR register arguments seen; the totals are only
+ // used by the IsVarArg code at the end of the function.
+ unsigned NumFixedGPRs = 0;
+ unsigned NumFixedFPRs = 0;
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ SDValue ArgValue;
+ CCValAssign &VA = ArgLocs[I];
+ EVT LocVT = VA.getLocVT();
+ if (VA.isRegLoc()) {
+ // Arguments passed in registers
+ const TargetRegisterClass *RC;
+ switch (LocVT.getSimpleVT().SimpleTy) {
+ default:
+ // Integers smaller than i64 should be promoted to i64.
+ llvm_unreachable("Unexpected argument type");
+ case MVT::i32:
+ NumFixedGPRs += 1;
+ RC = &SystemZ::GR32BitRegClass;
+ break;
+ case MVT::i64:
+ NumFixedGPRs += 1;
+ RC = &SystemZ::GR64BitRegClass;
+ break;
+ case MVT::f32:
+ NumFixedFPRs += 1;
+ RC = &SystemZ::FP32BitRegClass;
+ break;
+ case MVT::f64:
+ NumFixedFPRs += 1;
+ RC = &SystemZ::FP64BitRegClass;
+ break;
+ // Vector arguments do not contribute to either counter.
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ RC = &SystemZ::VR128BitRegClass;
+ break;
+ }
+
+ // Mark the physical register as live-in and read it through a new
+ // virtual register of the chosen class.
+ unsigned VReg = MRI.createVirtualRegister(RC);
+ MRI.addLiveIn(VA.getLocReg(), VReg);
+ ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
+ } else {
+ assert(VA.isMemLoc() && "Argument not register or memory");
+
+ // Create the frame index object for this incoming parameter.
+ int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
+ VA.getLocMemOffset(), true);
+
+ // Create the SelectionDAG nodes corresponding to a load
+ // from this parameter. Unpromoted ints and floats are
+ // passed as right-justified 8-byte values.
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ // Skip the first 4 bytes of the slot for 32-bit values.
+ if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
+ DAG.getIntPtrConstant(4, DL));
+ ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ }
+
+ // Convert the value of the argument register into the value that's
+ // being passed.
+ if (VA.getLocInfo() == CCValAssign::Indirect) {
+ // ArgValue is the address of the argument; load the first part from it.
+ InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
+ MachinePointerInfo()));
+ // If the original argument was split (e.g. i128), we need
+ // to load all parts of it here (using the same address).
+ unsigned ArgIndex = Ins[I].OrigArgIndex;
+ assert (Ins[I].PartOffset == 0);
+ while (I + 1 != E && Ins[I + 1].OrigArgIndex == ArgIndex) {
+ CCValAssign &PartVA = ArgLocs[I + 1];
+ unsigned PartOffset = Ins[I + 1].PartOffset;
+ SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
+ DAG.getIntPtrConstant(PartOffset, DL));
+ InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
+ MachinePointerInfo()));
+ // Advance the outer loop's index past the part just handled.
+ ++I;
+ }
+ } else
+ InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
+ }
+
+ if (IsVarArg) {
+ // Save the number of non-varargs registers for later use by va_start, etc.
+ FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
+ FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
+
+ // Likewise the address (in the form of a frame index) of where the
+ // first stack vararg would be. The 1-byte size here is arbitrary.
+ int64_t StackSize = CCInfo.getNextStackOffset();
+ FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
+
+ // ...and a similar frame index for the caller-allocated save area
+ // that will be used to store the incoming registers.
+ int64_t RegSaveOffset = TFL->getOffsetOfLocalArea();
+ unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true);
+ FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
+
+ // Store the FPR varargs in the reserved frame slots. (We store the
+ // GPRs as part of the prologue.)
+ if (NumFixedFPRs < SystemZ::NumArgFPRs) {
+ // Only the entries from NumFixedFPRs onwards are filled in and used.
+ SDValue MemOps[SystemZ::NumArgFPRs];
+ for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
+ unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
+ int FI = MFI.CreateFixedObject(8, RegSaveOffset + Offset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
+ &SystemZ::FP64BitRegClass);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
+ // Each store is chained to its own register copy.
+ MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ }
+ // Join the stores, which are independent of one another.
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ makeArrayRef(&MemOps[NumFixedFPRs],
+ SystemZ::NumArgFPRs-NumFixedFPRs));
+ }
+ }
+
+ return Chain;
+}
+
+// Return true if the call whose argument locations are ArgLocs can be
+// emitted as a sibling call.  The answer is false as soon as any argument
+// is indirect, lives on the stack, is assigned to the callee-saved argument
+// register R6 (in any of its forms), or carries the SwiftSelf or SwiftError
+// flag, since those also use callee-saved registers.  ArgCCInfo is not
+// consulted.
+static bool canUseSiblingCall(const CCState &ArgCCInfo,
+                              SmallVectorImpl<CCValAssign> &ArgLocs,
+                              SmallVectorImpl<ISD::OutputArg> &Outs) {
+  for (unsigned Idx = 0, NumLocs = ArgLocs.size(); Idx != NumLocs; ++Idx) {
+    CCValAssign &Loc = ArgLocs[Idx];
+
+    // Indirect and stack arguments rule a sibling call out.
+    if (Loc.getLocInfo() == CCValAssign::Indirect || !Loc.isRegLoc())
+      return false;
+
+    // So does any use of R6.
+    const unsigned ArgReg = Loc.getLocReg();
+    const bool UsesR6 = ArgReg == SystemZ::R6H || ArgReg == SystemZ::R6L ||
+                        ArgReg == SystemZ::R6D;
+    if (UsesR6)
+      return false;
+
+    // And so do the Swift callee-saved register arguments.
+    if (Outs[Idx].Flags.isSwiftSelf() || Outs[Idx].Flags.isSwiftError())
+      return false;
+  }
+  return true;
+}
+
+// Lower the call described by CLI. Outgoing arguments are converted to
+// their assigned locations and either queued for a register copy or stored
+// to the outgoing stack area; indirect arguments are first spilled to a
+// stack temporary whose address is passed instead. The call is emitted as
+// SystemZISD::SIBCALL when it can be a sibling call, and otherwise as
+// SystemZISD::CALL inside a CALLSEQ_START/CALLSEQ_END pair. For a normal
+// call the returned values are copied out of their registers and pushed
+// onto InVals, and the resulting chain is returned. For a sibling call the
+// SIBCALL node itself is returned and InVals is left untouched.
+// CLI.IsTailCall is cleared if a requested tail call cannot be honoured.
+SDValue
+SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ // A reference, so that clearing it below is visible through CLI.
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ MachineFunction &MF = DAG.getMachineFunction();
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+ // Detect unsupported vector argument and return types.
+ if (Subtarget.hasVector()) {
+ VerifyVectorTypes(Outs);
+ VerifyVectorTypes(Ins);
+ }
+
+ // Analyze the operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+ ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
+
+ // We don't support GuaranteedTailCallOpt, only automatically-detected
+ // sibling calls.
+ if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs, Outs))
+ IsTailCall = false;
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = ArgCCInfo.getNextStackOffset();
+
+ // Mark the start of the call.
+ if (!IsTailCall)
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getConstant(NumBytes, DL, PtrVT, true),
+ DL);
+
+ // Copy argument values to their designated locations.
+ SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ // Created lazily, the first time a stack argument is seen.
+ SDValue StackPtr;
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ CCValAssign &VA = ArgLocs[I];
+ SDValue ArgValue = OutVals[I];
+
+ if (VA.getLocInfo() == CCValAssign::Indirect) {
+ // Store the argument in a stack slot and pass its address.
+ SDValue SpillSlot = DAG.CreateStackTemporary(Outs[I].ArgVT);
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, ArgValue, SpillSlot,
+ MachinePointerInfo::getFixedStack(MF, FI)));
+ // If the original argument was split (e.g. i128), we need
+ // to store all parts of it here (and pass just one address).
+ unsigned ArgIndex = Outs[I].OrigArgIndex;
+ assert (Outs[I].PartOffset == 0);
+ while (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
+ SDValue PartValue = OutVals[I + 1];
+ unsigned PartOffset = Outs[I + 1].PartOffset;
+ SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
+ DAG.getIntPtrConstant(PartOffset, DL));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, PartValue, Address,
+ MachinePointerInfo::getFixedStack(MF, FI)));
+ // Advance the outer loop's index past the part just stored.
+ ++I;
+ }
+ // From here on the argument is the address of the spill slot.
+ ArgValue = SpillSlot;
+ } else
+ ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
+
+ if (VA.isRegLoc())
+ // Queue up the argument copies and emit them at the end.
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
+ else {
+ assert(VA.isMemLoc() && "Argument not register or memory");
+
+ // Work out the address of the stack slot. Unpromoted ints and
+ // floats are passed as right-justified 8-byte values.
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
+ unsigned Offset = SystemZMC::CallFrameSize + VA.getLocMemOffset();
+ if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+ Offset += 4;
+ SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
+ DAG.getIntPtrConstant(Offset, DL));
+
+ // Emit the store.
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
+ }
+ }
+
+ // Join the stores, which are independent of one another.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Accept direct calls by converting symbolic call addresses to the
+ // associated Target* opcodes. Force %r1 to be used for indirect
+ // tail calls.
+ SDValue Glue;
+ if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
+ Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+ } else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
+ Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+ } else if (IsTailCall) {
+ Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
+ Glue = Chain.getValue(1);
+ Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
+ }
+
+ // Build a sequence of copy-to-reg nodes, chained and glued together.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+ RegsToPass[I].second, Glue);
+ Glue = Chain.getValue(1);
+ }
+
+ // The first call operand is the chain and the second is the target address.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
+ Ops.push_back(DAG.getRegister(RegsToPass[I].first,
+ RegsToPass[I].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ // Glue the call to the argument copies, if any.
+ if (Glue.getNode())
+ Ops.push_back(Glue);
+
+ // Emit the call.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ // A sibling call returns here: no CALLSEQ_END is emitted and no results
+ // are copied out.
+ if (IsTailCall)
+ return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
+ Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
+ Glue = Chain.getValue(1);
+
+ // Mark the end of the call, which is glued to the call itself.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getConstant(NumBytes, DL, PtrVT, true),
+ DAG.getConstant(0, DL, PtrVT, true),
+ Glue, DL);
+ Glue = Chain.getValue(1);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RetLocs;
+ CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
+ RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
+ CCValAssign &VA = RetLocs[I];
+
+ // Copy the value out, gluing the copy to the end of the call sequence.
+ SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
+ VA.getLocVT(), Glue);
+ Chain = RetValue.getValue(1);
+ Glue = RetValue.getValue(2);
+
+ // Convert the value of the return register into the value that's
+ // being returned.
+ InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, RetValue));
+ }
+
+ return Chain;
+}
+
+// Report whether the values described by Outs can be returned using
+// RetCC_SystemZ.  Any i128 value is rejected up front.
+bool SystemZTargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  // Detect unsupported vector return types.
+  if (Subtarget.hasVector())
+    VerifyVectorTypes(Outs);
+
+  // i128 is not a legal type, so this case cannot easily be detected in
+  // RetCC_SystemZ; filter it out by hand.
+  for (const ISD::OutputArg &Arg : Outs) {
+    if (Arg.ArgVT == MVT::i128)
+      return false;
+  }
+
+  // Let the calling-convention table decide everything else.
+  SmallVector<CCValAssign, 16> Locs;
+  CCState CCInfo(CallConv, isVarArg, MF, Locs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_SystemZ);
+}
+
+// Lower a return: assign every outgoing value to a return register, copy the
+// (promoted) value into it, and emit a RET_FLAG node that lists those
+// registers as operands.
+SDValue SystemZTargetLowering::LowerReturn(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+    SelectionDAG &DAG) const {
+  // Detect unsupported vector return types.
+  if (Subtarget.hasVector())
+    VerifyVectorTypes(Outs);
+
+  // Work out where each returned value has to live.
+  MachineFunction &MF = DAG.getMachineFunction();
+  SmallVector<CCValAssign, 16> Locs;
+  CCState CCInfo(CallConv, IsVarArg, MF, Locs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
+
+  // A void return needs no copies at all.
+  const unsigned NumLocs = Locs.size();
+  if (NumLocs == 0)
+    return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Chain);
+
+  // Slot 0 of the operand list is the chain; it is overwritten below once
+  // all register copies have been threaded onto it.
+  SmallVector<SDValue, 4> Operands;
+  Operands.push_back(Chain);
+  SDValue Glue;
+  for (unsigned Idx = 0; Idx != NumLocs; ++Idx) {
+    CCValAssign &Loc = Locs[Idx];
+    SDValue Val = OutVals[Idx];
+    assert(Loc.isRegLoc() && "Can only return in registers!");
+
+    // Promote the value as required.
+    Val = convertValVTToLocVT(DAG, DL, Loc, Val);
+
+    // Copy it into the return register, chaining and gluing the copies
+    // together, and record the register as an operand of the return.
+    unsigned RetReg = Loc.getLocReg();
+    Chain = DAG.getCopyToReg(Chain, DL, RetReg, Val, Glue);
+    Glue = Chain.getValue(1);
+    Operands.push_back(DAG.getRegister(RetReg, Loc.getLocVT()));
+  }
+
+  // Install the final chain and append the glue, if there is any.
+  Operands[0] = Chain;
+  if (Glue.getNode())
+    Operands.push_back(Glue);
+
+  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Operands);
+}
+
+// Emit a SystemZISD::SERIALIZE node on Chain and return it as the new chain.
+SDValue SystemZTargetLowering::prepareVolatileOrAtomicLoad(
+    SDValue Chain, const SDLoc &DL, SelectionDAG &DAG) const {
+  SDValue Serialized =
+      DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain);
+  return Serialized;
+}
+
+// Return true if Op is an intrinsic node with chain whose only other result
+// is the CC value.  On success, Opcode receives the associated SystemZISD
+// opcode and CCValid the mask of valid CC values; on failure neither
+// output is modified.
+static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
+                                      unsigned &CCValid) {
+  // The intrinsic ID is read from operand 1.
+  const unsigned IntrinsicId =
+      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+  switch (IntrinsicId) {
+  case Intrinsic::s390_tbegin:
+    Opcode = SystemZISD::TBEGIN;
+    CCValid = SystemZ::CCMASK_TBEGIN;
+    break;
+
+  case Intrinsic::s390_tbegin_nofloat:
+    Opcode = SystemZISD::TBEGIN_NOFLOAT;
+    CCValid = SystemZ::CCMASK_TBEGIN;
+    break;
+
+  case Intrinsic::s390_tend:
+    Opcode = SystemZISD::TEND;
+    CCValid = SystemZ::CCMASK_TEND;
+    break;
+
+  default:
+    // Not one of the CC-producing chained intrinsics handled here.
+    return false;
+  }
+
+  return true;
+}
+
+// Return true if Op is a chainless intrinsic node whose final result is the
+// CC value. On success set Opcode to the matching SystemZISD opcode and
+// CCValid to the mask of possible CC values; on failure leave both untouched.
+static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); // the intrinsic ID is operand 0
+ switch (Id) {
+ case Intrinsic::s390_vpkshs:
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ Opcode = SystemZISD::PACKS_CC;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_vpklshs:
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ Opcode = SystemZISD::PACKLS_CC;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_vceqbs:
+ case Intrinsic::s390_vceqhs:
+ case Intrinsic::s390_vceqfs:
+ case Intrinsic::s390_vceqgs:
+ Opcode = SystemZISD::VICMPES;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_vchbs:
+ case Intrinsic::s390_vchhs:
+ case Intrinsic::s390_vchfs:
+ case Intrinsic::s390_vchgs:
+ Opcode = SystemZISD::VICMPHS;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_vchlbs:
+ case Intrinsic::s390_vchlhs:
+ case Intrinsic::s390_vchlfs:
+ case Intrinsic::s390_vchlgs:
+ Opcode = SystemZISD::VICMPHLS;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_vtm:
+ Opcode = SystemZISD::VTM;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_vfaebs:
+ case Intrinsic::s390_vfaehs:
+ case Intrinsic::s390_vfaefs:
+ Opcode = SystemZISD::VFAE_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
+ case Intrinsic::s390_vfaezbs:
+ case Intrinsic::s390_vfaezhs:
+ case Intrinsic::s390_vfaezfs:
+ Opcode = SystemZISD::VFAEZ_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
+ case Intrinsic::s390_vfeebs:
+ case Intrinsic::s390_vfeehs:
+ case Intrinsic::s390_vfeefs:
+ Opcode = SystemZISD::VFEE_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
+ case Intrinsic::s390_vfeezbs:
+ case Intrinsic::s390_vfeezhs:
+ case Intrinsic::s390_vfeezfs:
+ Opcode = SystemZISD::VFEEZ_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
+ case Intrinsic::s390_vfenebs:
+ case Intrinsic::s390_vfenehs:
+ case Intrinsic::s390_vfenefs:
+ Opcode = SystemZISD::VFENE_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
+ case Intrinsic::s390_vfenezbs:
+ case Intrinsic::s390_vfenezhs:
+ case Intrinsic::s390_vfenezfs:
+ Opcode = SystemZISD::VFENEZ_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
+ case Intrinsic::s390_vistrbs:
+ case Intrinsic::s390_vistrhs:
+ case Intrinsic::s390_vistrfs:
+ Opcode = SystemZISD::VISTR_CC;
+ CCValid = SystemZ::CCMASK_0 | SystemZ::CCMASK_3; // only CC 0 and CC 3 are treated as possible here
+ return true;
+
+ case Intrinsic::s390_vstrcbs:
+ case Intrinsic::s390_vstrchs:
+ case Intrinsic::s390_vstrcfs:
+ Opcode = SystemZISD::VSTRC_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
+ case Intrinsic::s390_vstrczbs:
+ case Intrinsic::s390_vstrczhs:
+ case Intrinsic::s390_vstrczfs:
+ Opcode = SystemZISD::VSTRCZ_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
+ case Intrinsic::s390_vfcedbs:
+ Opcode = SystemZISD::VFCMPES;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_vfchdbs:
+ Opcode = SystemZISD::VFCMPHS;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_vfchedbs:
+ Opcode = SystemZISD::VFCMPHES;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_vftcidb:
+ Opcode = SystemZISD::VFTCI;
+ CCValid = SystemZ::CCMASK_VCMP;
+ return true;
+
+ case Intrinsic::s390_tdc:
+ Opcode = SystemZISD::TDC;
+ CCValid = SystemZ::CCMASK_TDC;
+ return true;
+
+ default: // not a CC-producing intrinsic handled by this table
+ return false;
+ }
+}
+
+// Re-emit the chained intrinsic Op as a node with opcode Opcode whose results
+// are (chain, glue) rather than (CC, chain).  Users of the old chain result
+// are redirected to the new node's chain.  Returns the new node.
+static SDValue emitIntrinsicWithChainAndGlue(SelectionDAG &DAG, SDValue Op,
+                                             unsigned Opcode) {
+  // Copy all operands except the intrinsic ID: keep operand 0, drop
+  // operand 1, and keep everything after it.
+  const unsigned OperandCount = Op.getNumOperands();
+  SmallVector<SDValue, 6> NewOperands;
+  NewOperands.reserve(OperandCount - 1);
+  NewOperands.push_back(Op.getOperand(0));
+  for (unsigned Idx = 2; Idx < OperandCount; ++Idx)
+    NewOperands.push_back(Op.getOperand(Idx));
+
+  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
+  SDVTList ResultVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue NewNode = DAG.getNode(Opcode, SDLoc(Op), ResultVTs, NewOperands);
+
+  // The chain was result 1 of the old node and is result 0 of the new one.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 1),
+                                SDValue(NewNode.getNode(), 0));
+  return NewNode;
+}
+
+// Re-emit the chainless intrinsic Op as a node with opcode Opcode in which
+// the CC result is replaced by a glue result.  If Op has a second (non-CC)
+// result, its type is kept as result 0 of the new node.
+static SDValue emitIntrinsicWithGlue(SelectionDAG &DAG, SDValue Op,
+                                     unsigned Opcode) {
+  // Copy all operands except the intrinsic ID (operand 0).
+  const unsigned OperandCount = Op.getNumOperands();
+  SmallVector<SDValue, 6> NewOperands;
+  NewOperands.reserve(OperandCount - 1);
+  for (unsigned Idx = 1; Idx < OperandCount; ++Idx)
+    NewOperands.push_back(Op.getOperand(Idx));
+
+  const unsigned ResultCount = Op->getNumValues();
+  if (ResultCount != 1) {
+    assert(ResultCount == 2 && "Expected exactly one non-CC result");
+    SDVTList ResultVTs = DAG.getVTList(Op->getValueType(0), MVT::Glue);
+    return DAG.getNode(Opcode, SDLoc(Op), ResultVTs, NewOperands);
+  }
+
+  // CC was the only result, so the new node produces glue alone.
+  return DAG.getNode(Opcode, SDLoc(Op), MVT::Glue, NewOperands);
+}
+
+// CC is a comparison that will be implemented using an integer or
+// floating-point comparison. Return the condition code mask for a branch on
+// true: SETx/SETOx give CCMASK_CMP_x, SETUx adds CCMASK_CMP_UO to that.
+// In the integer case, CCMASK_CMP_UO is set for unsigned comparisons and clear
+// for signed ones; in the floating-point case it means "unordered" as usual.
+static unsigned CCMaskForCondCode(ISD::CondCode CC) {
+#define CONV(X) \
+ case ISD::SET##X: return SystemZ::CCMASK_CMP_##X; \
+ case ISD::SETO##X: return SystemZ::CCMASK_CMP_##X; \
+ case ISD::SETU##X: return SystemZ::CCMASK_CMP_UO | SystemZ::CCMASK_CMP_##X
+
+ switch (CC) {
+ default:
+ llvm_unreachable("Invalid integer condition!"); // any code not listed below
+
+ CONV(EQ);
+ CONV(NE);
+ CONV(GT);
+ CONV(GE);
+ CONV(LT);
+ CONV(LE);
+
+ case ISD::SETO: return SystemZ::CCMASK_CMP_O; // pure "ordered" test
+ case ISD::SETUO: return SystemZ::CCMASK_CMP_UO; // pure "unordered" test
+ }
+#undef CONV
+}
+
+// Return a sequence for getting a 1 from an IPM result when CC has a
+// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
+// The handling of CC values outside CCValid doesn't matter.
+static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
+ // Deal with cases where the result can be taken directly from a bit
+ // of the IPM result.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, 0, SystemZ::IPM_CC);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
+
+ // Deal with cases where we can add a value to force the sign bit
+ // to contain the right value. Putting the bit in 31 means we can
+ // use SRL rather than RISBG(L), and also makes it easier to get a
+ // 0/-1 value, so it has priority over the other tests below.
+ //
+ // These sequences rely on the fact that the upper two bits of the
+ // IPM result are zero.
+ uint64_t TopBit = uint64_t(1) << 31; // bit 31, the "sign bit" referred to above
+ if (CCMask == (CCValid & SystemZ::CCMASK_0))
+ return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
+ return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_1
+ | SystemZ::CCMASK_2)))
+ return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & SystemZ::CCMASK_3))
+ return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1
+ | SystemZ::CCMASK_2
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
+
+ // Next try inverting the value and testing a bit. 0/1 could be
+ // handled this way too, but we dealt with that case above.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
+ return IPMConversion(-1, 0, SystemZ::IPM_CC);
+
+ // Handle cases where adding a value forces a non-sign bit to contain
+ // the right value.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
+ return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
+
+ // The remaining cases are 1, 2, 0/1/3 and 0/2/3. All of these
+ // can be done by inverting the low CC bit and applying one of the
+ // sign-based extractions above.
+ if (CCMask == (CCValid & SystemZ::CCMASK_1))
+ return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & SystemZ::CCMASK_2))
+ return IPMConversion(1 << SystemZ::IPM_CC,
+ TopBit - (3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_1
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_2
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(1 << SystemZ::IPM_CC,
+ TopBit - (1 << SystemZ::IPM_CC), 31);
+
+ llvm_unreachable("Unexpected CC combination"); // none of the patterns above matched
+}
+
+// If C is a (possibly) signed comparison against the constant -1 or 1 that
+// can be re-expressed as a comparison against zero, rewrite its mask and
+// second operand accordingly.  Unsigned-only comparisons are left alone.
+static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
+  if (C.ICmpType == SystemZICMP::UnsignedOnly)
+    return;
+
+  auto *RHS = dyn_cast<ConstantSDNode>(C.Op1.getNode());
+  if (!RHS)
+    return;
+
+  // Only "> -1", "<= -1", "< 1" and ">= 1" qualify.
+  const int64_t Imm = RHS->getSExtValue();
+  bool Convertible = false;
+  if (Imm == -1)
+    Convertible = C.CCMask == SystemZ::CCMASK_CMP_GT ||
+                  C.CCMask == SystemZ::CCMASK_CMP_LE;
+  else if (Imm == 1)
+    Convertible = C.CCMask == SystemZ::CCMASK_CMP_LT ||
+                  C.CCMask == SystemZ::CCMASK_CMP_GE;
+  if (!Convertible)
+    return;
+
+  // Toggle the EQ bit of the mask and compare against 0 instead, e.g.
+  // "X > -1" becomes "X >= 0" (assuming the usual GE == GT|EQ and
+  // LE == LT|EQ layout of the CCMASK_CMP_* values).
+  C.CCMask ^= SystemZ::CCMASK_CMP_EQ;
+  C.Op1 = DAG.getConstant(0, DL, C.Op1.getValueType());
+}
+
+// If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
+// adjust the operands as necessary.
+static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
+ Comparison &C) {
+ // For us to make any changes, it must be a comparison between a single-use
+ // load and a constant.
+ if (!C.Op0.hasOneUse() ||
+ C.Op0.getOpcode() != ISD::LOAD ||
+ C.Op1.getOpcode() != ISD::Constant)
+ return;
+
+ // We must have an 8- or 16-bit load.
+ auto *Load = cast<LoadSDNode>(C.Op0);
+ unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits();
+ if (NumBits != 8 && NumBits != 16)
+ return;
+
+ // The load must be an extending one and the constant must be within the
+ // range of the unextended value.
+ auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
+ uint64_t Value = ConstOp1->getZExtValue();
+ uint64_t Mask = (1 << NumBits) - 1; // NumBits is 8 or 16 here, so the int shift is safe
+ if (Load->getExtensionType() == ISD::SEXTLOAD) {
+ // Make sure that ConstOp1 is in range of C.Op0.
+ int64_t SignedValue = ConstOp1->getSExtValue();
+ if (uint64_t(SignedValue) + (uint64_t(1) << (NumBits - 1)) > Mask) // outside [-2^(NumBits-1), 2^(NumBits-1))
+ return;
+ if (C.ICmpType != SystemZICMP::SignedOnly) {
+ // Unsigned comparison between two sign-extended values is equivalent
+ // to unsigned comparison between two zero-extended values.
+ Value &= Mask;
+ } else if (NumBits == 8) {
+ // Try to treat the comparison as unsigned, so that we can use CLI.
+ // Adjust CCMask and Value as necessary.
+ if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_LT)
+ // Test whether the high bit of the byte is set.
+ Value = 127, C.CCMask = SystemZ::CCMASK_CMP_GT;
+ else if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_GE)
+ // Test whether the high bit of the byte is clear.
+ Value = 128, C.CCMask = SystemZ::CCMASK_CMP_LT;
+ else
+ // No instruction exists for this combination.
+ return;
+ C.ICmpType = SystemZICMP::UnsignedOnly;
+ }
+ } else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
+ if (Value > Mask)
+ return;
+ // If the constant is in range, we can use any comparison.
+ C.ICmpType = SystemZICMP::Any;
+ } else
+ return;
+
+ // Make sure that the first operand is an i32 of the right extension type.
+ ISD::LoadExtType ExtType = (C.ICmpType == SystemZICMP::SignedOnly ?
+ ISD::SEXTLOAD :
+ ISD::ZEXTLOAD);
+ if (C.Op0.getValueType() != MVT::i32 ||
+ Load->getExtensionType() != ExtType)
+ C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(),
+ Load->getBasePtr(), Load->getPointerInfo(),
+ Load->getMemoryVT(), Load->getAlignment(),
+ Load->getMemOperand()->getFlags());
+
+ // Make sure that the second operand is an i32 with the right value.
+ if (C.Op1.getValueType() != MVT::i32 ||
+ Value != ConstOp1->getZExtValue())
+ C.Op1 = DAG.getConstant(Value, DL, MVT::i32);
+}
+
+// Return true if Op is a load that is either unextended or extended in a
+// way that suits integer register-memory comparisons of type ICmpType.
+// Byte-sized loads and non-loads never qualify.
+static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
+  auto *Load = dyn_cast<LoadSDNode>(Op.getNode());
+  if (!Load)
+    return false;
+
+  // There are no instructions to compare a register with a memory byte.
+  if (Load->getMemoryVT() == MVT::i8)
+    return false;
+
+  // Otherwise the answer depends on how the loaded value is extended.
+  const auto ExtKind = Load->getExtensionType();
+  if (ExtKind == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtKind == ISD::SEXTLOAD)
+    return ICmpType != SystemZICMP::UnsignedOnly;
+  if (ExtKind == ISD::ZEXTLOAD)
+    return ICmpType != SystemZICMP::SignedOnly;
+  return false;
+}
+
+// Return true if it is better to swap the operands of C. Checks run in priority order.
+static bool shouldSwapCmpOperands(const Comparison &C) {
+ // Leave f128 comparisons alone, since they have no memory forms.
+ if (C.Op0.getValueType() == MVT::f128)
+ return false;
+
+ // Always keep a floating-point constant second, since comparisons with
+ // zero can use LOAD TEST and comparisons with other constants make a
+ // natural memory operand.
+ if (isa<ConstantFPSDNode>(C.Op1))
+ return false;
+
+ // Never swap comparisons with zero since there are many ways to optimize
+ // those later.
+ auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
+ if (ConstOp1 && ConstOp1->getZExtValue() == 0)
+ return false;
+
+ // Also keep natural memory operands second if the loaded value is
+ // only used here. Several comparisons have memory forms.
+ if (isNaturalMemoryOperand(C.Op1, C.ICmpType) && C.Op1.hasOneUse())
+ return false;
+
+ // Look for cases where C.Op0 is a single-use load and C.Op1 isn't.
+ // In that case we generally prefer the memory to be second.
+ if (isNaturalMemoryOperand(C.Op0, C.ICmpType) && C.Op0.hasOneUse()) {
+ // The only exceptions are when the second operand is a constant and
+ // we can use things like CHHSI.
+ if (!ConstOp1)
+ return true;
+ // The unsigned memory-immediate instructions can handle 16-bit
+ // unsigned integers.
+ if (C.ICmpType != SystemZICMP::SignedOnly &&
+ isUInt<16>(ConstOp1->getZExtValue()))
+ return false;
+ // The signed memory-immediate instructions can handle 16-bit
+ // signed integers.
+ if (C.ICmpType != SystemZICMP::UnsignedOnly &&
+ isInt<16>(ConstOp1->getSExtValue()))
+ return false;
+ return true;
+ }
+
+ // Try to promote the use of CGFR and CLGFR.
+ unsigned Opcode0 = C.Op0.getOpcode();
+ if (C.ICmpType != SystemZICMP::UnsignedOnly && Opcode0 == ISD::SIGN_EXTEND)
+ return true;
+ if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::ZERO_EXTEND)
+ return true;
+ if (C.ICmpType != SystemZICMP::SignedOnly &&
+ Opcode0 == ISD::AND &&
+ C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() == 0xffffffff) // an AND with 0xffffffff keeps only the low 32 bits
+ return true;
+
+ return false;
+}
+
+// Return the comparison CC mask obtained from CCMask by exchanging the LT
+// and GT bits.  The EQ and UO bits are carried over unchanged.
+static unsigned reverseCCMask(unsigned CCMask) {
+  unsigned Reversed =
+      CCMask & (SystemZ::CCMASK_CMP_EQ | SystemZ::CCMASK_CMP_UO);
+  if (CCMask & SystemZ::CCMASK_CMP_GT)
+    Reversed |= SystemZ::CCMASK_CMP_LT;
+  if (CCMask & SystemZ::CCMASK_CMP_LT)
+    Reversed |= SystemZ::CCMASK_CMP_GT;
+  return Reversed;
+}
+
+// If C is an EQ/NE test between X and Y and some user of X already computes
+// X - Y or Y - X, compare the result of that subtraction against zero
+// instead.  The first matching SUB found among the users is taken.
+static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL,
+                                 Comparison &C) {
+  if (C.CCMask != SystemZ::CCMASK_CMP_EQ &&
+      C.CCMask != SystemZ::CCMASK_CMP_NE)
+    return;
+
+  for (auto UI = C.Op0->use_begin(), UE = C.Op0->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::SUB)
+      continue;
+
+    // Accept the operands in either order.
+    const bool SameOrder =
+        User->getOperand(0) == C.Op0 && User->getOperand(1) == C.Op1;
+    const bool Swapped =
+        User->getOperand(0) == C.Op1 && User->getOperand(1) == C.Op0;
+    if (!SameOrder && !Swapped)
+      continue;
+
+    C.Op0 = SDValue(User, 0);
+    C.Op1 = DAG.getConstant(0, DL, User->getValueType(0));
+    return;
+  }
+}
+
+// If C compares a floating-point value with zero and that value also has an
+// FNEG user, compare the negated value instead and swap the LT/GT sense of
+// the mask.  This lets the negation set CC, avoiding separate LOAD AND TEST
+// and LOAD (NEGATIVE/COMPLEMENT) instructions.
+static void adjustForFNeg(Comparison &C) {
+  auto *FPConst = dyn_cast<ConstantFPSDNode>(C.Op1);
+  if (!FPConst || !FPConst->isZero())
+    return;
+
+  for (auto UI = C.Op0->use_begin(), UE = C.Op0->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::FNEG)
+      continue;
+
+    // Use the first negation found among the users.
+    C.Op0 = SDValue(User, 0);
+    C.CCMask = reverseCCMask(C.CCMask);
+    return;
+  }
+}
+
+// If C compares an i64 (shl X, 32) with 0 and X also has a
+// SIGN_EXTEND_INREG-from-i32 user, test the result of that sign extension
+// instead, so that LTGFR can be used.
+//
+// This case is important because InstCombine transforms a comparison
+// with (sext (trunc X)) into a comparison with (shl X, 32).
+static void adjustForLTGFR(Comparison &C) {
+  // The comparison must be between an i64 shift-left and constant 0 ...
+  if (C.Op0.getOpcode() != ISD::SHL ||
+      C.Op0.getValueType() != MVT::i64 ||
+      C.Op1.getOpcode() != ISD::Constant ||
+      cast<ConstantSDNode>(C.Op1)->getZExtValue() != 0)
+    return;
+
+  // ... and the shift amount must be the constant 32.
+  auto *ShiftAmount = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
+  if (!ShiftAmount || ShiftAmount->getZExtValue() != 32)
+    return;
+
+  // See whether X has any SIGN_EXTEND_INREG uses; take the first one.
+  SDValue Shifted = C.Op0.getOperand(0);
+  for (auto UI = Shifted->use_begin(), UE = Shifted->use_end(); UI != UE;
+       ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() == ISD::SIGN_EXTEND_INREG &&
+        cast<VTSDNode>(User->getOperand(1))->getVT() == MVT::i32) {
+      C.Op0 = SDValue(User, 0);
+      return;
+    }
+  }
+}
+
+// If C compares the truncation of an extending load with zero, compare the
+// untruncated loaded value with zero instead.  This exposes more
+// opportunities to reuse CC.
+static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
+                               Comparison &C) {
+  // Match (truncate (load ...)) compared against constant 0.
+  if (C.Op0.getOpcode() != ISD::TRUNCATE ||
+      C.Op0.getOperand(0).getOpcode() != ISD::LOAD ||
+      C.Op1.getOpcode() != ISD::Constant ||
+      cast<ConstantSDNode>(C.Op1)->getZExtValue() != 0)
+    return;
+
+  // The memory value must fit within the truncated width.
+  auto *Load = cast<LoadSDNode>(C.Op0.getOperand(0));
+  if (Load->getMemoryVT().getStoreSizeInBits() > C.Op0.getValueSizeInBits())
+    return;
+
+  // The kind of extension has to be compatible with the comparison type.
+  const unsigned ExtKind = Load->getExtensionType();
+  const bool ZExtOK =
+      ExtKind == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly;
+  const bool SExtOK =
+      ExtKind == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly;
+  if (!ZExtOK && !SExtOK)
+    return;
+
+  // Compare the load itself against a zero of the load's own type.
+  C.Op0 = C.Op0.getOperand(0);
+  C.Op1 = DAG.getConstant(0, DL, C.Op0.getValueType());
+}
+
+// Return true if operand 1 of shift operation N is a constant smaller than
+// the bit width of N's value.  Store that amount in ShiftVal if so; ShiftVal
+// is left untouched otherwise.
+static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
+  if (auto *AmountNode = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    const uint64_t Amount = AmountNode->getZExtValue();
+    if (Amount < N.getValueSizeInBits()) {
+      ShiftVal = Amount;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Check whether an AND with Mask is suitable for a TEST UNDER MASK
+// instruction and whether the CC value is descriptive enough to handle
+// a comparison of type ICmpType between the AND result and CmpVal.
+// CCMask says which comparison result is being tested and BitSize is the
+// number of bits in the operands (not consulted by the body). If TEST UNDER
+// MASK can be used, return the corresponding CC mask, otherwise return 0.
+static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
+ uint64_t Mask, uint64_t CmpVal,
+ unsigned ICmpType) {
+ assert(Mask != 0 && "ANDs with zero should have been removed by now");
+
+ // Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
+ if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
+ !SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
+ return 0;
+
+ // Work out the masks for the lowest and highest bits.
+ unsigned HighShift = 63 - countLeadingZeros(Mask); // index of the highest set bit of Mask
+ uint64_t High = uint64_t(1) << HighShift;
+ uint64_t Low = uint64_t(1) << countTrailingZeros(Mask); // lowest set bit of Mask
+
+ // Signed ordered comparisons are effectively unsigned if the sign
+ // bit is dropped.
+ bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);
+
+ // Check for equality comparisons with 0, or the equivalent.
+ if (CmpVal == 0) {
+ if (CCMask == SystemZ::CCMASK_CMP_EQ)
+ return SystemZ::CCMASK_TM_ALL_0;
+ if (CCMask == SystemZ::CCMASK_CMP_NE)
+ return SystemZ::CCMASK_TM_SOME_1;
+ }
+ if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
+ if (CCMask == SystemZ::CCMASK_CMP_LT)
+ return SystemZ::CCMASK_TM_ALL_0;
+ if (CCMask == SystemZ::CCMASK_CMP_GE)
+ return SystemZ::CCMASK_TM_SOME_1;
+ }
+ if (EffectivelyUnsigned && CmpVal < Low) {
+ if (CCMask == SystemZ::CCMASK_CMP_LE)
+ return SystemZ::CCMASK_TM_ALL_0;
+ if (CCMask == SystemZ::CCMASK_CMP_GT)
+ return SystemZ::CCMASK_TM_SOME_1;
+ }
+
+ // Check for equality comparisons with the mask, or the equivalent.
+ if (CmpVal == Mask) {
+ if (CCMask == SystemZ::CCMASK_CMP_EQ)
+ return SystemZ::CCMASK_TM_ALL_1;
+ if (CCMask == SystemZ::CCMASK_CMP_NE)
+ return SystemZ::CCMASK_TM_SOME_0;
+ }
+ if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
+ if (CCMask == SystemZ::CCMASK_CMP_GT)
+ return SystemZ::CCMASK_TM_ALL_1;
+ if (CCMask == SystemZ::CCMASK_CMP_LE)
+ return SystemZ::CCMASK_TM_SOME_0;
+ }
+ if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
+ if (CCMask == SystemZ::CCMASK_CMP_GE)
+ return SystemZ::CCMASK_TM_ALL_1;
+ if (CCMask == SystemZ::CCMASK_CMP_LT)
+ return SystemZ::CCMASK_TM_SOME_0;
+ }
+
+ // Check for ordered comparisons with the top bit.
+ if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
+ if (CCMask == SystemZ::CCMASK_CMP_LE)
+ return SystemZ::CCMASK_TM_MSB_0;
+ if (CCMask == SystemZ::CCMASK_CMP_GT)
+ return SystemZ::CCMASK_TM_MSB_1;
+ }
+ if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
+ if (CCMask == SystemZ::CCMASK_CMP_LT)
+ return SystemZ::CCMASK_TM_MSB_0;
+ if (CCMask == SystemZ::CCMASK_CMP_GE)
+ return SystemZ::CCMASK_TM_MSB_1;
+ }
+
+ // If there are just two bits, we can do equality checks for Low and High
+ // as well.
+ if (Mask == Low + High) {
+ if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
+ return SystemZ::CCMASK_TM_MIXED_MSB_0;
+ if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
+ return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
+ if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
+ return SystemZ::CCMASK_TM_MIXED_MSB_1;
+ if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
+ return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
+ }
+
+ // Looks like we've exhausted our options.
+ return 0;
+}
+
+// See whether C can be implemented as a TEST UNDER MASK instruction.
+// Update the arguments with the TM version if so; otherwise leave C as is.
+//
+// Two input shapes are handled: (and X, Mask) compared with a constant, and
+// an unsigned ordered i64 comparison with a constant whose low bits are
+// zero.  In both cases a shift feeding the tested value may be folded into
+// the mask, provided that no mask or comparison bits are lost by doing so.
+static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
+                                   Comparison &C) {
+  // Check that we have a comparison with a constant.
+  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
+  if (!ConstOp1)
+    return;
+  uint64_t CmpVal = ConstOp1->getZExtValue();
+
+  // Check whether the nonconstant input is an AND with a constant mask.
+  Comparison NewC(C);
+  uint64_t MaskVal;
+  ConstantSDNode *Mask = nullptr;
+  if (C.Op0.getOpcode() == ISD::AND) {
+    NewC.Op0 = C.Op0.getOperand(0);
+    NewC.Op1 = C.Op0.getOperand(1);
+    Mask = dyn_cast<ConstantSDNode>(NewC.Op1);
+    if (!Mask)
+      return;
+    MaskVal = Mask->getZExtValue();
+  } else {
+    // There is no instruction to compare with a 64-bit immediate
+    // so use TMHH instead if possible.  We need an unsigned ordered
+    // comparison with an i64 immediate.
+    if (NewC.Op0.getValueType() != MVT::i64 ||
+        NewC.CCMask == SystemZ::CCMASK_CMP_EQ ||
+        NewC.CCMask == SystemZ::CCMASK_CMP_NE ||
+        NewC.ICmpType == SystemZICMP::SignedOnly)
+      return;
+    // Convert LE and GT comparisons into LT and GE.
+    if (NewC.CCMask == SystemZ::CCMASK_CMP_LE ||
+        NewC.CCMask == SystemZ::CCMASK_CMP_GT) {
+      // CmpVal + 1 would wrap around to 0.
+      if (CmpVal == uint64_t(-1))
+        return;
+      CmpVal += 1;
+      NewC.CCMask ^= SystemZ::CCMASK_CMP_EQ;
+    }
+    // If the low N bits of Op1 are zero then the low N bits of Op0 can
+    // be masked off without changing the result.  The mask covers the
+    // lowest set bit of CmpVal and everything above it (0 if CmpVal is 0).
+    MaskVal = -(CmpVal & -CmpVal);
+    NewC.ICmpType = SystemZICMP::UnsignedOnly;
+  }
+  if (!MaskVal)
+    return;
+
+  // Check whether the combination of mask, comparison value and comparison
+  // type are suitable.
+  //
+  // When looking through a shift, the shifted mask must stay nonzero
+  // (getTestUnderMaskCond asserts that) and shifting CmpVal must not drop
+  // any set bits, since the comparison against the shifted value would
+  // otherwise not be equivalent to the original one.  If either condition
+  // fails we fall back to testing the unshifted operand.
+  unsigned BitSize = NewC.Op0.getValueSizeInBits();
+  unsigned NewCCMask, ShiftVal;
+  if (NewC.ICmpType != SystemZICMP::SignedOnly &&
+      NewC.Op0.getOpcode() == ISD::SHL &&
+      isSimpleShift(NewC.Op0, ShiftVal) &&
+      (MaskVal >> ShiftVal) != 0 &&
+      ((CmpVal >> ShiftVal) << ShiftVal) == CmpVal &&
+      (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
+                                        MaskVal >> ShiftVal,
+                                        CmpVal >> ShiftVal,
+                                        SystemZICMP::Any))) {
+    NewC.Op0 = NewC.Op0.getOperand(0);
+    MaskVal >>= ShiftVal;
+  } else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
+             NewC.Op0.getOpcode() == ISD::SRL &&
+             isSimpleShift(NewC.Op0, ShiftVal) &&
+             (MaskVal << ShiftVal) != 0 &&
+             ((CmpVal << ShiftVal) >> ShiftVal) == CmpVal &&
+             (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
+                                               MaskVal << ShiftVal,
+                                               CmpVal << ShiftVal,
+                                               SystemZICMP::UnsignedOnly))) {
+    NewC.Op0 = NewC.Op0.getOperand(0);
+    MaskVal <<= ShiftVal;
+  } else {
+    NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal, CmpVal,
+                                     NewC.ICmpType);
+    if (!NewCCMask)
+      return;
+  }
+
+  // Go ahead and make the change.  Reuse the original mask node when its
+  // value is still the one we want.
+  C.Opcode = SystemZISD::TM;
+  C.Op0 = NewC.Op0;
+  if (Mask && Mask->getZExtValue() == MaskVal)
+    C.Op1 = SDValue(Mask, 0);
+  else
+    C.Op1 = DAG.getConstant(MaskVal, DL, C.Op0.getValueType());
+  C.CCValid = SystemZ::CCMASK_TM;
+  C.CCMask = NewCCMask;
+}
+
+// Build a Comparison that tests the condition-code result of intrinsic node
+// Call against the constant integer CC using comparison code Cond.  Opcode
+// is the SystemZISD opcode for the intrinsic and CCValid is the set of
+// condition codes it can produce.  The resulting mask is restricted to
+// CCValid.
+static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode,
+                                  SDValue Call, unsigned CCValid, uint64_t CC,
+                                  ISD::CondCode Cond) {
+  Comparison C(Call, SDValue());
+  C.Opcode = Opcode;
+  C.CCValid = CCValid;
+
+  // In a CC mask, bit 3 stands for CC==0 and bit 0 for CC==3.
+  switch (Cond) {
+  case ISD::SETEQ:
+    // bit 3 for CC==0, bit 0 for CC==3, always false for CC>3.
+    C.CCMask = CC < 4 ? 1 << (3 - CC) : 0;
+    break;
+  case ISD::SETNE:
+    // ...and the inverse of that.
+    C.CCMask = CC < 4 ? ~(1 << (3 - CC)) : -1;
+    break;
+  case ISD::SETLT:
+  case ISD::SETULT:
+    // bits above bit 3 for CC==0 (always false), bits above bit 0 for CC==3,
+    // always true for CC>3.
+    C.CCMask = CC < 4 ? ~0U << (4 - CC) : -1;
+    break;
+  case ISD::SETGE:
+  case ISD::SETUGE:
+    // ...and the inverse of that.
+    C.CCMask = CC < 4 ? ~(~0U << (4 - CC)) : 0;
+    break;
+  case ISD::SETLE:
+  case ISD::SETULE:
+    // bit 3 and above for CC==0, bit 0 and above for CC==3 (always true),
+    // always true for CC>3.
+    C.CCMask = CC < 4 ? ~0U << (3 - CC) : -1;
+    break;
+  case ISD::SETGT:
+  case ISD::SETUGT:
+    // ...and the inverse of that.
+    C.CCMask = CC < 4 ? ~(~0U << (3 - CC)) : 0;
+    break;
+  default:
+    llvm_unreachable("Unexpected integer comparison type");
+  }
+
+  // Drop any bits for condition codes the intrinsic cannot produce.
+  C.CCMask &= CCValid;
+  return C;
+}
+
+// Decide how to implement a comparison of type Cond between CmpOp0 and CmpOp1.
+static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
+ ISD::CondCode Cond, const SDLoc &DL) {
+ if (CmpOp1.getOpcode() == ISD::Constant) { // CC-producing intrinsic compared with a constant?
+ uint64_t Constant = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
+ unsigned Opcode, CCValid;
+ if (CmpOp0.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+ CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
+ isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
+ return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
+ if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
+ isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
+ return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
+ }
+ Comparison C(CmpOp0, CmpOp1);
+ C.CCMask = CCMaskForCondCode(Cond);
+ if (C.Op0.getValueType().isFloatingPoint()) {
+ C.CCValid = SystemZ::CCMASK_FCMP;
+ C.Opcode = SystemZISD::FCMP;
+ adjustForFNeg(C);
+ } else {
+ C.CCValid = SystemZ::CCMASK_ICMP;
+ C.Opcode = SystemZISD::ICMP;
+ // Choose the type of comparison. Equality and inequality tests can
+ // use either signed or unsigned comparisons. The choice also doesn't
+ // matter if both sign bits are known to be clear. In those cases we
+ // want to give the main isel code the freedom to choose whichever
+ // form fits best.
+ if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
+ C.CCMask == SystemZ::CCMASK_CMP_NE ||
+ (DAG.SignBitIsZero(C.Op0) && DAG.SignBitIsZero(C.Op1)))
+ C.ICmpType = SystemZICMP::Any;
+ else if (C.CCMask & SystemZ::CCMASK_CMP_UO)
+ C.ICmpType = SystemZICMP::UnsignedOnly;
+ else
+ C.ICmpType = SystemZICMP::SignedOnly;
+ C.CCMask &= ~SystemZ::CCMASK_CMP_UO; // UO only carried signedness here; ICmpType now holds it
+ adjustZeroCmp(DAG, DL, C);
+ adjustSubwordCmp(DAG, DL, C);
+ adjustForSubtraction(DAG, DL, C);
+ adjustForLTGFR(C);
+ adjustICmpTruncate(DAG, DL, C);
+ }
+
+ if (shouldSwapCmpOperands(C)) {
+ std::swap(C.Op0, C.Op1);
+ C.CCMask = reverseCCMask(C.CCMask); // keep the mask consistent with the swapped operands
+ }
+
+ adjustForTestUnderMask(DAG, DL, C); // runs last, after any operand swap
+ return C;
+}
+
+// Emit the comparison instruction described by C.
+static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
+ if (!C.Op1.getNode()) { // No second operand: Op0 must be an intrinsic node.
+ SDValue Op;
+ switch (C.Op0.getOpcode()) {
+ case ISD::INTRINSIC_W_CHAIN:
+ Op = emitIntrinsicWithChainAndGlue(DAG, C.Op0, C.Opcode);
+ break;
+ case ISD::INTRINSIC_WO_CHAIN:
+ Op = emitIntrinsicWithGlue(DAG, C.Op0, C.Opcode);
+ break;
+ default:
+ llvm_unreachable("Invalid comparison operands");
+ }
+ return SDValue(Op.getNode(), Op->getNumValues() - 1); // Last result of the new node.
+ }
+ if (C.Opcode == SystemZISD::ICMP) // ICMP takes ICmpType as a third operand.
+ return DAG.getNode(SystemZISD::ICMP, DL, MVT::Glue, C.Op0, C.Op1,
+ DAG.getConstant(C.ICmpType, DL, MVT::i32));
+ if (C.Opcode == SystemZISD::TM) {
+ bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) != // True when exactly
+ bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1)); // one MIXED_MSB bit is tested.
+ return DAG.getNode(SystemZISD::TM, DL, MVT::Glue, C.Op0, C.Op1,
+ DAG.getConstant(RegisterOnly, DL, MVT::i32));
+ }
+ return DAG.getNode(C.Opcode, DL, MVT::Glue, C.Op0, C.Op1); // Any other opcode: two operands.
+}
+
+// Compute both halves of a 32-bit *MUL_LOHI by widening the operands to
+// 64 bits with the Extend opcode and doing a single 64-bit multiplication.
+// Hi receives bits 63..32 of the product and Lo receives bits 31..0.
+static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend,
+                            SDValue Op0, SDValue Op1, SDValue &Hi,
+                            SDValue &Lo) {
+  SDValue Wide0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
+  SDValue Wide1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
+  SDValue Product = DAG.getNode(ISD::MUL, DL, MVT::i64, Wide0, Wide1);
+
+  // The high half is the product shifted right by 32 and then truncated.
+  SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i64);
+  SDValue HighBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Product, ShiftAmt);
+  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, HighBits);
+
+  // The low half is a plain truncation of the product.
+  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Product);
+}
+
+// Lower a binary operation that produces two VT results, one in each
+// half of a GR128 pair. Op0 and Op1 are the VT operands to the operation,
+// Extend extends Op0 to a GR128, and Opcode performs the GR128 operation
+// on the extended Op0 and (unextended) Op1. Store the even register result
+// in Even and the odd register result in Odd.
+static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+ unsigned Extend, unsigned Opcode, SDValue Op0,
+ SDValue Op1, SDValue &Even, SDValue &Odd) {
+ SDNode *In128 = DAG.getMachineNode(Extend, DL, MVT::Untyped, Op0); // Extend is a machine opcode.
+ SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped, // Opcode is a DAG node opcode.
+ SDValue(In128, 0), Op1);
+ bool Is32Bit = is32Bit(VT); // Selects which subregister indices are used below.
+ Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
+ Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
+}
+
+// Return an i32 value that is 1 if the CC value produced by Glue is
+// in the mask CCMask and 0 otherwise. CC is known to have a value
+// in CCValid, so other values can be ignored.
+static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue Glue,
+ unsigned CCValid, unsigned CCMask) {
+ IPMConversion Conversion = getIPMConversion(CCValid, CCMask); // XOR/ADD/shift recipe.
+ SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+
+ if (Conversion.XORValue) // Skipped when the value is zero (XOR would be a no-op).
+ Result = DAG.getNode(ISD::XOR, DL, MVT::i32, Result,
+ DAG.getConstant(Conversion.XORValue, DL, MVT::i32));
+
+ if (Conversion.AddValue) // Skipped when the value is zero (ADD would be a no-op).
+ Result = DAG.getNode(ISD::ADD, DL, MVT::i32, Result,
+ DAG.getConstant(Conversion.AddValue, DL, MVT::i32));
+
+ // The SRL/AND sequence should get optimized to an RISBG.
+ Result = DAG.getNode(ISD::SRL, DL, MVT::i32, Result,
+ DAG.getConstant(Conversion.Bit, DL, MVT::i32));
+ if (Conversion.Bit != 31) // A logical shift by 31 already leaves a single bit.
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+ DAG.getConstant(1, DL, MVT::i32));
+ return Result;
+}
+
+// Map condition code CC to the SystemZISD vector comparison opcode that
+// implements it directly, or return 0 when there is no such opcode.  IsFP
+// says whether the comparison is floating-point rather than integer.
+static unsigned getVectorComparison(ISD::CondCode CC, bool IsFP) {
+  // Equality exists for both element kinds.
+  if (CC == ISD::SETOEQ || CC == ISD::SETEQ)
+    return IsFP ? SystemZISD::VFCMPE : SystemZISD::VICMPE;
+
+  // Greater-or-equal is only available for floating point.
+  if (CC == ISD::SETOGE || CC == ISD::SETGE) {
+    if (IsFP)
+      return SystemZISD::VFCMPHE;
+    return 0;
+  }
+
+  // Greater-than exists for both element kinds.
+  if (CC == ISD::SETOGT || CC == ISD::SETGT)
+    return IsFP ? SystemZISD::VFCMPH : SystemZISD::VICMPH;
+
+  // Unsigned greater-than is only available for integers.
+  if (CC == ISD::SETUGT) {
+    if (IsFP)
+      return 0;
+    return SystemZISD::VICMPHL;
+  }
+
+  return 0;
+}
+
+// Find a SystemZISD vector comparison opcode for CC, trying CC itself first
+// and then its logical inverse.  On success Invert is set to say whether the
+// returned opcode computes the inverse of CC.  Returns 0, leaving Invert
+// untouched, if neither form can be done directly.  IsFP is true for
+// floating-point comparisons and false for integer ones.
+static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
+                                            bool &Invert) {
+  unsigned Direct = getVectorComparison(CC, IsFP);
+  if (Direct != 0) {
+    Invert = false;
+    return Direct;
+  }
+
+  const ISD::CondCode InverseCC = ISD::getSetCCInverse(CC, !IsFP);
+  unsigned Inverted = getVectorComparison(InverseCC, IsFP);
+  if (Inverted == 0)
+    return 0;
+
+  Invert = true;
+  return Inverted;
+}
+
+// Return a v2f64 that contains the extended form of elements Start and Start+1
+// of v4f32 value Op.
+static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL,
+ SDValue Op) {
+ int Mask[] = { Start, -1, Start + 1, -1 }; // Wanted elements go to lanes 0 and 2.
+ Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask);
+ return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op); // v4f32 -> v2f64.
+}
+
+// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
+// producing a result of type VT.
+static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL,
+ EVT VT, SDValue CmpOp0, SDValue CmpOp1) {
+ // There is no hardware support for v4f32, so extend the vector into
+ // two v2f64s and compare those.
+ if (CmpOp0.getValueType() == MVT::v4f32) {
+ SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0); // Elements 0 and 1.
+ SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0); // Elements 2 and 3.
+ SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1);
+ SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1);
+ SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1); // Each half yields v2i64.
+ SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1);
+ return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes); // Combine into one VT result.
+ }
+ return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1); // All other types: compare directly.
+}
+
+// Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
+// an integer mask of type VT.  Condition codes without a direct vector
+// opcode are built from the available ones by ORing two comparisons
+// (ordered/unordered and <> tests), by inverting the result, or by swapping
+// the operands.
+static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                                ISD::CondCode CC, SDValue CmpOp0,
+                                SDValue CmpOp1) {
+  bool IsFP = CmpOp0.getValueType().isFloatingPoint();
+  bool Invert = false;
+  SDValue Cmp;
+  switch (CC) {
+    // Handle tests for order using (or (ogt y x) (oge x y)).
+  case ISD::SETUO:
+    // SETUO is the complement of SETO, so share the code and invert.
+    Invert = true;
+    LLVM_FALLTHROUGH;
+  case ISD::SETO: {
+    assert(IsFP && "Unexpected integer comparison");
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
+    Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
+    break;
+  }
+
+    // Handle <> tests using (or (ogt y x) (ogt x y)).
+  case ISD::SETUEQ:
+    // SETUEQ is the complement of SETONE, so share the code and invert.
+    Invert = true;
+    LLVM_FALLTHROUGH;
+  case ISD::SETONE: {
+    assert(IsFP && "Unexpected integer comparison");
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
+    Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
+    break;
+  }
+
+    // Otherwise a single comparison is enough.  It doesn't really
+    // matter whether we try the inversion or the swap first, since
+    // there are no cases where both work.
+  default:
+    if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
+      Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1);
+    else {
+      // Retry with the operands exchanged; note the swapped argument order
+      // in the getVectorCmp call.
+      CC = ISD::getSetCCSwappedOperands(CC);
+      if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
+        Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0);
+      else
+        llvm_unreachable("Unhandled comparison");
+    }
+    break;
+  }
+  if (Invert) {
+    // Complement the result by XORing it with a BYTE_MASK of 65535 bitcast
+    // to VT (presumably an all-ones vector, one mask bit per byte).
+    SDValue Mask = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+                               DAG.getConstant(65535, DL, MVT::i32));
+    Mask = DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+    Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask);
+  }
+  return Cmp;
+}
+
+// Lower a SETCC node.  Operands 0 and 1 are the values to compare and
+// operand 2 is the condition code.  Vector results are handed to
+// lowerVectorSETCC; scalar results are materialised from the CC produced by
+// the comparison.
+SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const EVT ResultVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  if (!ResultVT.isVector()) {
+    // Scalar case: emit the comparison, then turn its CC into 0 or 1.
+    Comparison C(getCmp(DAG, LHS, RHS, Cond, DL));
+    SDValue Glue = emitCmp(DAG, DL, C);
+    return emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+  }
+
+  return lowerVectorSETCC(DAG, DL, ResultVT, Cond, LHS, RHS);
+}
+
+// Lower a BR_CC node into a comparison followed by a BR_CCMASK.  Operand 0
+// is the chain, operand 1 the condition code, operands 2 and 3 the values
+// being compared and operand 4 the branch destination.
+SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Target = Op.getOperand(4);
+
+  // Emit the comparison; its result is glued to the branch.
+  Comparison C(getCmp(DAG, LHS, RHS, Cond, DL));
+  SDValue Glue = emitCmp(DAG, DL, C);
+
+  // The branch carries both the valid-CC set and the mask to test.
+  SDValue ValidCC = DAG.getConstant(C.CCValid, DL, MVT::i32);
+  SDValue TestMask = DAG.getConstant(C.CCMask, DL, MVT::i32);
+  return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(), Chain,
+                     ValidCC, TestMask, Target, Glue);
+}
+
+// Return true if Neg has the form (sub 0, Pos) and Pos is either CmpOp
+// itself or a sign extension of CmpOp, so Pos and Neg may be wider than
+// CmpOp.
+static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
+  // Neg must be a subtraction from the constant zero...
+  if (Neg.getOpcode() != ISD::SUB)
+    return false;
+  SDValue Minuend = Neg.getOperand(0);
+  if (Minuend.getOpcode() != ISD::Constant)
+    return false;
+  if (cast<ConstantSDNode>(Minuend)->getZExtValue() != 0)
+    return false;
+
+  // ...of exactly Pos.
+  if (Neg.getOperand(1) != Pos)
+    return false;
+
+  // Pos must be the compared value, possibly sign-extended.
+  if (Pos == CmpOp)
+    return true;
+  return Pos.getOpcode() == ISD::SIGN_EXTEND && Pos.getOperand(0) == CmpOp;
+}
+
+// Build the absolute value of Op, or its negation (0 - |Op|) when IsNegative
+// is true.
+static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
+                           bool IsNegative) {
+  const EVT VT = Op.getValueType();
+  SDValue Abs = DAG.getNode(SystemZISD::IABS, DL, VT, Op);
+  if (!IsNegative)
+    return Abs;
+
+  // Negate by subtracting from zero.
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  return DAG.getNode(ISD::SUB, DL, VT, Zero, Abs);
+}
+
+SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
+ SelectionDAG &DAG) const { // Lowers SELECT_CC (lhs, rhs, true-val, false-val, cc).
+ SDValue CmpOp0 = Op.getOperand(0);
+ SDValue CmpOp1 = Op.getOperand(1);
+ SDValue TrueOp = Op.getOperand(2);
+ SDValue FalseOp = Op.getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDLoc DL(Op);
+
+ Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
+
+ // Check for absolute and negative-absolute selections, including those
+ // where the comparison value is sign-extended (for LPGFR and LNGFR).
+ // This check supplements the one in DAGCombiner.
+ if (C.Opcode == SystemZISD::ICMP &&
+ C.CCMask != SystemZ::CCMASK_CMP_EQ &&
+ C.CCMask != SystemZ::CCMASK_CMP_NE &&
+ C.Op1.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) { // Non-equality test against zero.
+ if (isAbsolute(C.Op0, TrueOp, FalseOp)) // True value is x, false value is -x.
+ return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
+ if (isAbsolute(C.Op0, FalseOp, TrueOp)) // True value is -x, false value is x.
+ return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
+ }
+
+ SDValue Glue = emitCmp(DAG, DL, C);
+
+ // Special case for handling -1/0 results. The shifts we use here
+ // should get optimized with the IPM conversion sequence.
+ auto *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
+ auto *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
+ if (TrueC && FalseC) {
+ int64_t TrueVal = TrueC->getSExtValue();
+ int64_t FalseVal = FalseC->getSExtValue();
+ if ((TrueVal == -1 && FalseVal == 0) || (TrueVal == 0 && FalseVal == -1)) {
+ // Invert the condition if we want -1 on false.
+ if (TrueVal == 0)
+ C.CCMask ^= C.CCValid; // Flips every bit that is set in CCValid.
+ SDValue Result = emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+ EVT VT = Op.getValueType();
+ // Extend the result to VT. Upper bits are ignored.
+ if (!is32Bit(VT))
+ Result = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Result);
+ // Sign-extend from the low bit.
+ SDValue ShAmt = DAG.getConstant(VT.getSizeInBits() - 1, DL, MVT::i32);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Result, ShAmt); // Move bit 0 to the top bit...
+ return DAG.getNode(ISD::SRA, DL, VT, Shl, ShAmt); // ...then replicate it downwards.
+ }
+ }
+
+ SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, DL, MVT::i32), // General case.
+ DAG.getConstant(C.CCMask, DL, MVT::i32), Glue};
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
+}
+
+SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG) const { // Builds the address of Node's global plus its offset.
+ SDLoc DL(Node);
+ const GlobalValue *GV = Node->getGlobal();
+ int64_t Offset = Node->getOffset();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ CodeModel::Model CM = DAG.getTarget().getCodeModel();
+
+ SDValue Result;
+ if (Subtarget.isPC32DBLSymbol(GV, CM)) { // Direct PC-relative form; else goes via the GOT.
+ // Assign anchors at 1<<12 byte boundaries.
+ uint64_t Anchor = Offset & ~uint64_t(0xfff); // Offset with its low 12 bits cleared.
+ Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
+ Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+
+ // The offset can be folded into the address if it is aligned to a halfword.
+ Offset -= Anchor; // Remainder, now in the range [0, 0xfff].
+ if (Offset != 0 && (Offset & 1) == 0) {
+ SDValue Full = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
+ Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
+ Offset = 0; // Fully folded; nothing left to add below.
+ }
+ } else {
+ Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
+ Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+ Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, // Load the address from the GOT slot.
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0)
+ Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result,
+ DAG.getConstant(Offset, DL, PtrVT));
+
+ return Result;
+}
+
+SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG,
+ unsigned Opcode,
+ SDValue GOTOffset) const { // Emits the Opcode call node and returns its %r2 result.
+ SDLoc DL(Node);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Chain = DAG.getEntryNode();
+ SDValue Glue; // Still empty for the first register copy.
+
+ // __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12.
+ SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+ Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R12D, GOT, Glue);
+ Glue = Chain.getValue(1); // Result 1 of the copy is its glue output.
+ Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R2D, GOTOffset, Glue);
+ Glue = Chain.getValue(1);
+
+ // The first call operand is the chain and the second is the TLS symbol.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
+ Node->getValueType(0),
+ 0, 0));
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
+ Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *Mask =
+ TRI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ // Glue the call to the argument copies.
+ Ops.push_back(Glue);
+
+ // Emit the call.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); // The call yields a chain and glue.
+ Chain = DAG.getNode(Opcode, DL, NodeTys, Ops);
+ Glue = Chain.getValue(1);
+
+ // Copy the return value from %r2.
+ return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue);
+}
+
+SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
+ SelectionDAG &DAG) const { // Builds the thread pointer from access registers A0 and A1.
+ SDValue Chain = DAG.getEntryNode();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // The high part of the thread pointer is in access register 0.
+ SDValue TPHi = DAG.getCopyFromReg(Chain, DL, SystemZ::A0, MVT::i32);
+ TPHi = DAG.getNode(ISD::ANY_EXTEND, DL, PtrVT, TPHi); // Upper bits are shifted out below.
+
+ // The low part of the thread pointer is in access register 1.
+ SDValue TPLo = DAG.getCopyFromReg(Chain, DL, SystemZ::A1, MVT::i32);
+ TPLo = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TPLo); // Upper bits must be zero for the OR.
+
+ // Merge them into a single 64-bit address.
+ SDValue TPHiShifted = DAG.getNode(ISD::SHL, DL, PtrVT, TPHi,
+ DAG.getConstant(32, DL, PtrVT));
+ return DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
+}
+
+SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG) const { // Returns thread pointer + per-model offset of the variable.
+ if (DAG.getTarget().Options.EmulatedTLS) // Emulated TLS is handled by the generic helper.
+ return LowerToTLSEmulatedModel(Node, DAG);
+ SDLoc DL(Node);
+ const GlobalValue *GV = Node->getGlobal();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+
+ SDValue TP = lowerThreadPointer(DL, DAG);
+
+ // Get the offset of GA from the thread pointer, based on the TLS model.
+ SDValue Offset;
+ switch (model) {
+ case TLSModel::GeneralDynamic: {
+ // Load the GOT offset of the tls_index (module ID / per-symbol offset).
+ SystemZConstantPoolValue *CPV =
+ SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
+
+ Offset = DAG.getConstantPool(CPV, PtrVT, 8); // Constant pool entry, alignment 8.
+ Offset = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+ // Call __tls_get_offset to retrieve the offset.
+ Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
+ break;
+ }
+
+ case TLSModel::LocalDynamic: {
+ // Load the GOT offset of the module ID.
+ SystemZConstantPoolValue *CPV =
+ SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
+
+ Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+ Offset = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+ // Call __tls_get_offset to retrieve the module base offset.
+ Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
+
+ // Note: The SystemZLDCleanupPass will remove redundant computations
+ // of the module base offset. Count total number of local-dynamic
+ // accesses to trigger execution of that pass.
+ SystemZMachineFunctionInfo* MFI =
+ DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
+ MFI->incNumLocalDynamicTLSAccesses();
+
+ // Add the per-symbol offset.
+ CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
+
+ SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
+ DTPOffset = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), DTPOffset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+ Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset); // Module base + symbol offset.
+ break;
+ }
+
+ case TLSModel::InitialExec: {
+ // Load the offset from the GOT.
+ Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+ SystemZII::MO_INDNTPOFF);
+ Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
+ Offset =
+ DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ break;
+ }
+
+ case TLSModel::LocalExec: {
+ // Force the offset into the constant pool and load it from there.
+ SystemZConstantPoolValue *CPV =
+ SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
+
+ Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+ Offset = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ break;
+ }
+ }
+
+ // Add the base and offset together.
+ return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
+}
+
+// Lower a BlockAddress node: wrap the target block address (including the
+// node's offset) in a PCREL_WRAPPER.
+SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue Target = DAG.getTargetBlockAddress(Node->getBlockAddress(), PtrVT,
+                                             Node->getOffset());
+  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Target);
+}
+
+// Lower a JumpTable node to the PC-relative address of the table.
+SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
+                                              SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(JT);
+
+  // Wrapping the target node in PCREL_WRAPPER makes LARL load the address
+  // of the table.
+  SDValue Table = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Table);
+}
+
+SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
+ SelectionDAG &DAG) const { // Returns the PC-relative address of the pool entry.
+ SDLoc DL(CP);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ SDValue Result;
+ if (CP->isMachineConstantPoolEntry()) // Target-specific entry: no offset is passed on.
+ Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+ CP->getAlignment());
+ else // Ordinary constant: keep both alignment and offset.
+ Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+ CP->getAlignment(), CP->getOffset());
+
+ // Use LARL to load the address of the constant pool entry.
+ return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const { // Returns the back chain slot's frame index as the frame address.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // If the back chain frame index has not been allocated yet, do so.
+ SystemZMachineFunctionInfo *FI = MF.getInfo<SystemZMachineFunctionInfo>();
+ int BackChainIdx = FI->getFramePointerSaveIndex();
+ if (!BackChainIdx) { // An index of zero is treated as "not allocated yet".
+ // By definition, the frame address is the address of the back chain.
+ BackChainIdx = MFI.CreateFixedObject(8, -SystemZMC::CallFrameSize, false);
+ FI->setFramePointerSaveIndex(BackChainIdx); // Remember it for later calls.
+ }
+ SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT);
+
+ // FIXME The frontend should detect this case.
+ if (Depth > 0) { // Only the current frame (depth 0) is supported.
+ report_fatal_error("Unsupported stack frame traversal count");
+ }
+
+ return BackChain;
+}
+
+SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const { // Returns the current function's return address (from R14D).
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setReturnAddressIsTaken(true);
+
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG)) // NOTE(review): presumably true on error - confirm.
+ return SDValue();
+
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // FIXME The frontend should detect this case.
+ if (Depth > 0) { // Only the current frame (depth 0) is supported.
+ report_fatal_error("Unsupported stack frame traversal count");
+ }
+
+ // Return R14D, which has the return address. Mark it an implicit live-in.
+ unsigned LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, PtrVT);
+}
+
+SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
+ SelectionDAG &DAG) const { // Handles loads and i32 <-> f32; anything else is unreachable.
+ SDLoc DL(Op);
+ SDValue In = Op.getOperand(0);
+ EVT InVT = In.getValueType();
+ EVT ResVT = Op.getValueType();
+
+ // Convert loads directly. This is normally done by DAGCombiner,
+ // but we need this case for bitcasts that are created during lowering
+ // and which are then lowered themselves.
+ if (auto *LoadN = dyn_cast<LoadSDNode>(In))
+ return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
+ LoadN->getMemOperand());
+
+ if (InVT == MVT::i32 && ResVT == MVT::f32) { // Route through a 64-bit i64 -> f64 bitcast.
+ SDValue In64;
+ if (Subtarget.hasHighWord()) {
+ SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
+ MVT::i64);
+ In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL, // Insert into the h32 subregister.
+ MVT::i64, SDValue(U64, 0), In);
+ } else {
+ In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
+ In64 = DAG.getNode(ISD::SHL, DL, MVT::i64, In64, // Move the value into the upper 32 bits.
+ DAG.getConstant(32, DL, MVT::i64));
+ }
+ SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64);
+ return DAG.getTargetExtractSubreg(SystemZ::subreg_r32,
+ DL, MVT::f32, Out64);
+ }
+ if (InVT == MVT::f32 && ResVT == MVT::i32) { // The reverse: f64 -> i64 bitcast, then extract.
+ SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
+ SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_r32, DL,
+ MVT::f64, SDValue(U64, 0), In);
+ SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64);
+ if (Subtarget.hasHighWord())
+ return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL,
+ MVT::i32, Out64);
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64, // Bring the upper 32 bits down.
+ DAG.getConstant(32, DL, MVT::i64));
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
+ }
+ llvm_unreachable("Unexpected bitcast combination");
+}
+
+SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
+ SelectionDAG &DAG) const { // Stores the four va_list fields at Addr, 8 bytes apart.
+ MachineFunction &MF = DAG.getMachineFunction();
+ SystemZMachineFunctionInfo *FuncInfo =
+ MF.getInfo<SystemZMachineFunctionInfo>();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue Addr = Op.getOperand(1);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ SDLoc DL(Op);
+
+ // The initial values of each field.
+ const unsigned NumFields = 4;
+ SDValue Fields[NumFields] = { // In storage order, lowest address first.
+ DAG.getConstant(FuncInfo->getVarArgsFirstGPR(), DL, PtrVT),
+ DAG.getConstant(FuncInfo->getVarArgsFirstFPR(), DL, PtrVT),
+ DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT),
+ DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT)
+ };
+
+ // Store each field into its respective slot.
+ SDValue MemOps[NumFields];
+ unsigned Offset = 0;
+ for (unsigned I = 0; I < NumFields; ++I) {
+ SDValue FieldAddr = Addr;
+ if (Offset != 0) // The first field is stored at Addr itself.
+ FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
+ DAG.getIntPtrConstant(Offset, DL));
+ MemOps[I] = DAG.getStore(Chain, DL, Fields[I], FieldAddr, // Every store uses the incoming chain.
+ MachinePointerInfo(SV, Offset));
+ Offset += 8;
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); // Join the four store chains.
+}
+
+SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
+ SelectionDAG &DAG) const { // Lowers VACOPY to a fixed-size memcpy.
+ SDValue Chain = Op.getOperand(0);
+ SDValue DstPtr = Op.getOperand(1);
+ SDValue SrcPtr = Op.getOperand(2);
+ const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ SDLoc DL(Op);
+
+ return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32, DL), // 32 bytes = 4 x 8 (see lowerVASTART).
+ /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
+ /*isTailCall*/false,
+ MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+// Lower DYNAMIC_STACKALLOC.  Operand 0 is the chain, operand 1 the number of
+// bytes to allocate and operand 2 the requested alignment.  The result is the
+// address of the allocated area merged with the updated chain.  The
+// "no-realign-stack" function attribute disables over-alignment and the
+// "backchain" attribute makes the code copy the back chain word to the new
+// stack pointer location.
+SDValue SystemZTargetLowering::
+lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool RealignOpt = !MF.getFunction()->hasFnAttribute("no-realign-stack");
+  bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  SDValue Align = Op.getOperand(2);
+  SDLoc DL(Op);
+
+  // If user has set the no alignment function attribute, ignore
+  // alloca alignments.  This code relies on the alignment operand being a
+  // ConstantSDNode, so use cast<>, which asserts on a mismatch, instead of
+  // dereferencing an unchecked dyn_cast<> result.
+  uint64_t AlignVal =
+      (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0);
+
+  uint64_t StackAlign = TFI->getStackAlignment();
+  uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
+  // Worst-case padding needed to reach RequiredAlign from a StackAlign-aligned
+  // address; zero when no extra alignment is requested.
+  uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;
+
+  unsigned SPReg = getStackPointerRegisterToSaveRestore();
+  SDValue NeededSpace = Size;
+
+  // Get a reference to the stack pointer.
+  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);
+
+  // If we need a backchain, save it now.
+  SDValue Backchain;
+  if (StoreBackchain)
+    Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
+
+  // Add extra space for alignment if needed.
+  if (ExtraAlignSpace)
+    NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace,
+                              DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
+
+  // Get the new stack pointer value.
+  SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
+
+  // Copy the new stack pointer back.
+  Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+
+  // The allocated data lives above the 160 bytes allocated for the standard
+  // frame, plus any outgoing stack arguments.  We don't know how much that
+  // amounts to yet, so emit a special ADJDYNALLOC placeholder.
+  SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
+  SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);
+
+  // Dynamically realign if needed: round up by adding the padding and then
+  // clearing the low bits.
+  if (RequiredAlign > StackAlign) {
+    Result =
+      DAG.getNode(ISD::ADD, DL, MVT::i64, Result,
+                  DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
+    Result =
+      DAG.getNode(ISD::AND, DL, MVT::i64, Result,
+                  DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64));
+  }
+
+  // Re-store the saved back chain word at the new stack pointer.
+  if (StoreBackchain)
+    Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
+
+  SDValue Ops[2] = { Result, Chain };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+// Lower GET_DYNAMIC_AREA_OFFSET to an i64 ADJDYNALLOC node.
+SDValue SystemZTargetLowering::lowerGET_DYNAMIC_AREA_OFFSET(
+    SDValue Op, SelectionDAG &DAG) const {
+  return DAG.getNode(SystemZISD::ADJDYNALLOC, SDLoc(Op), MVT::i64);
+}
+
+SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
+ SelectionDAG &DAG) const { // Returns the merged pair {low half, high half}.
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue Ops[2]; // Ops[0] is the low half, Ops[1] the high half.
+ if (is32Bit(VT))
+ // Just do a normal 64-bit multiplication and extract the results.
+ // We define this so that it can be used for constant division.
+ lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
+ Op.getOperand(1), Ops[1], Ops[0]);
+ else {
+ // Do a full 128-bit multiplication based on UMUL_LOHI64:
+ //
+ // (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
+ //
+ // but using the fact that the upper halves are either all zeros
+ // or all ones:
+ //
+ // (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
+ //
+ // and grouping the right terms together since they are quicker than the
+ // multiplication:
+ //
+ // (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
+ SDValue C63 = DAG.getConstant(63, DL, MVT::i64);
+ SDValue LL = Op.getOperand(0);
+ SDValue RL = Op.getOperand(1);
+ SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63); // Sign of LL replicated to all bits.
+ SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63); // Sign of RL replicated to all bits.
+ // UMUL_LOHI64 returns the low result in the odd register and the high
+ // result in the even register. SMUL_LOHI is defined to return the
+ // low half first, so the results are in reverse order.
+ lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+ LL, RL, Ops[1], Ops[0]);
+ SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
+ SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
+ SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
+ Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum); // Only the high half is corrected.
+ }
+ return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+ SelectionDAG &DAG) const { // Returns the merged pair {low half, high half}.
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue Ops[2]; // Ops[0] is the low half, Ops[1] the high half.
+ if (is32Bit(VT))
+ // Just do a normal 64-bit multiplication and extract the results.
+ // We define this so that it can be used for constant division.
+ lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
+ Op.getOperand(1), Ops[1], Ops[0]);
+ else
+ // UMUL_LOHI64 returns the low result in the odd register and the high
+ // result in the even register. UMUL_LOHI is defined to return the
+ // low half first, so the results are in reverse order.
+ lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+ Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+ return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
+ SelectionDAG &DAG) const { // Lower a signed SDIVREM node to SDIVREM32 or SDIVREM64; returns the two results merged.
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned Opcode;
+
+ // We use DSGF for 32-bit division, and also for a 64-bit division whose divisor has more than 32 sign bits (so truncating it to i32 loses nothing).
+ if (is32Bit(VT)) {
+ Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0); // The dividend is always passed as a 64-bit value.
+ Opcode = SystemZISD::SDIVREM32;
+ } else if (DAG.ComputeNumSignBits(Op1) > 32) {
+ Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
+ Opcode = SystemZISD::SDIVREM32;
+ } else
+ Opcode = SystemZISD::SDIVREM64;
+
+ // DSG(F) takes a 64-bit dividend, so the even register in the GR128
+ // input is "don't care". The instruction returns the remainder in
+ // the even register and the quotient in the odd register.
+ SDValue Ops[2]; // NOTE(review): presumably Ops[0] = quotient (odd) and Ops[1] = remainder (even); confirm against lowerGR128Binary.
+ lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, Opcode,
+ Op0, Op1, Ops[1], Ops[0]);
+ return DAG.getMergeValues(Ops, DL);
+}
+
+// Lower an unsigned UDIVREM node to UDIVREM32 or UDIVREM64, chosen by the
+// width of the result type, and return the two results as merged values.
+SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  // DL(G) uses a double-width dividend, so the even register of the GR128
+  // input has to be cleared; that is what the ZEXT128_* extension does.
+  const bool Is32 = is32Bit(VT);
+  unsigned Extend = Is32 ? SystemZ::ZEXT128_32 : SystemZ::ZEXT128_64;
+  unsigned DivOpcode = Is32 ? SystemZISD::UDIVREM32 : SystemZISD::UDIVREM64;
+
+  // The instruction returns the remainder in the even register and the
+  // quotient in the odd register.
+  SDValue Results[2];
+  lowerGR128Binary(DAG, DL, VT, Extend, DivOpcode, Op.getOperand(0),
+                   Op.getOperand(1), Results[1], Results[0]);
+  return DAG.getMergeValues(Results, DL);
+}
+
+SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const { // Turn a 64-bit OR of a high-half value and a low-half value into a subreg insertion; return Op unchanged when that does not apply.
+ assert(Op.getValueType() == MVT::i64 && "Should be 64-bit operation");
+
+ // Get the known-zero masks for each operand.
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+ APInt KnownZero[2], KnownOne[2];
+ DAG.computeKnownBits(Ops[0], KnownZero[0], KnownOne[0]);
+ DAG.computeKnownBits(Ops[1], KnownZero[1], KnownOne[1]);
+
+ // See if the upper 32 bits of one operand and the lower 32 bits of the
+ // other are known zero. They are the low and high operands respectively.
+ uint64_t Masks[] = { KnownZero[0].getZExtValue(),
+ KnownZero[1].getZExtValue() };
+ unsigned High, Low; // Indices into Ops[] of the high-part and low-part operands.
+ if ((Masks[0] >> 32) == 0xffffffff && uint32_t(Masks[1]) == 0xffffffff)
+ High = 1, Low = 0;
+ else if ((Masks[1] >> 32) == 0xffffffff && uint32_t(Masks[0]) == 0xffffffff)
+ High = 0, Low = 1;
+ else
+ return Op; // The operands do not occupy disjoint halves; keep the OR.
+
+ SDValue LowOp = Ops[Low];
+ SDValue HighOp = Ops[High];
+
+ // If the high part is a constant, we're better off using IILH.
+ if (HighOp.getOpcode() == ISD::Constant)
+ return Op;
+
+ // If the low part is a constant that is outside the range of LHI,
+ // then we're better off using IILF.
+ if (LowOp.getOpcode() == ISD::Constant) {
+ int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue()); // Low 32 bits of the constant, reinterpreted as signed.
+ if (!isInt<16>(Value))
+ return Op;
+ }
+
+ // Check whether the high part is an AND that doesn't change the
+ // high 32 bits and just masks out low bits. We can skip it if so.
+ if (HighOp.getOpcode() == ISD::AND &&
+ HighOp.getOperand(1).getOpcode() == ISD::Constant) {
+ SDValue HighOp0 = HighOp.getOperand(0);
+ uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
+ if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff)))) // True when every high-half bit cleared by Mask is already known zero in HighOp0.
+ HighOp = HighOp0;
+ }
+
+ // Take advantage of the fact that all GR32 operations only change the
+ // low 32 bits by truncating Low to an i32 and inserting it directly
+ // using a subreg. The interesting cases are those where the truncation
+ // can be folded.
+ SDLoc DL(Op);
+ SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
+ return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
+ MVT::i64, HighOp, Low32);
+}
+
+SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
+ SelectionDAG &DAG) const { // Lower CTPOP for vector and scalar types using the per-byte POPCNT node.
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ Op = Op.getOperand(0); // From here on Op is the value whose bits are counted.
+
+ // Handle vector types via VPOPCT.
+ if (VT.isVector()) {
+ Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op);
+ Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op);
+ switch (VT.getScalarSizeInBits()) {
+ case 8:
+ break; // The per-byte counts are already the answer.
+ case 16: {
+ Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ SDValue Shift = DAG.getConstant(8, DL, MVT::i32);
+ SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift);
+ Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp); // High byte now holds the sum of both byte counts.
+ Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift); // Move that sum down to the low byte.
+ break;
+ }
+ case 32: {
+ SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+ DAG.getConstant(0, DL, MVT::i32));
+ Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp); // NOTE(review): presumably sums the byte counts into each 32-bit element; confirm VSUM semantics.
+ break;
+ }
+ case 64: {
+ SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+ DAG.getConstant(0, DL, MVT::i32));
+ Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp); // Two VSUM steps: first to v4i32, then to the 64-bit result type.
+ Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
+ break;
+ }
+ default:
+ llvm_unreachable("Unexpected type");
+ }
+ return Op;
+ }
+
+ // Get the known-zero mask for the operand.
+ APInt KnownZero, KnownOne;
+ DAG.computeKnownBits(Op, KnownZero, KnownOne);
+ unsigned NumSignificantBits = (~KnownZero).getActiveBits();
+ if (NumSignificantBits == 0)
+ return DAG.getConstant(0, DL, VT); // Every bit is known zero, so the count is zero.
+
+ // Skip known-zero high parts of the operand.
+ int64_t OrigBitSize = VT.getSizeInBits();
+ int64_t BitSize = (int64_t)1 << Log2_32_Ceil(NumSignificantBits); // Round the significant width up to a power of two.
+ BitSize = std::min(BitSize, OrigBitSize);
+
+ // The POPCNT instruction counts the number of bits in each byte.
+ Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
+ Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
+ Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+
+ // Add up per-byte counts in a binary tree. All bits of Op at
+ // position larger than BitSize remain zero throughout.
+ for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, DL, VT));
+ if (BitSize != OrigBitSize)
+ Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
+ DAG.getConstant(((uint64_t)1 << BitSize) - 1, DL, VT)); // Drop bits shifted beyond BitSize (BitSize < OrigBitSize <= 64 here).
+ Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
+ }
+
+ // Extract overall result from high byte.
+ if (BitSize > 8)
+ Op = DAG.getNode(ISD::SRL, DL, VT, Op,
+ DAG.getConstant(BitSize - 8, DL, VT));
+
+ return Op;
+}
+
+// Lower an ATOMIC_FENCE node.  Operand 0 is the chain, operand 1 the
+// ordering constant and operand 2 the synchronization-scope constant.
+SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  auto Ordering = static_cast<AtomicOrdering>(
+      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+  auto Scope = static_cast<SynchronizationScope>(
+      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // Only a sequentially-consistent cross-thread fence needs a real
+  // instruction.  Everything else becomes MEMBARRIER, which is a compiler
+  // barrier that codegens to a no-op.
+  const bool NeedsSerialize =
+      Ordering == AtomicOrdering::SequentiallyConsistent &&
+      Scope == CrossThread;
+  if (!NeedsSerialize)
+    return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other,
+                       Op.getOperand(0));
+
+  SDNode *Serialize =
+      DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other, Op.getOperand(0));
+  return SDValue(Serialize, 0);
+}
+
+// Op is an atomic load.  Replace it with an ordinary extending load that
+// reuses the atomic node's chain, base pointer, memory type and memory
+// operand.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());
+  SDLoc DL(Op);
+  EVT ResultVT = Op.getValueType();
+  return DAG.getExtLoad(ISD::EXTLOAD, DL, ResultVT, Atomic->getChain(),
+                        Atomic->getBasePtr(), Atomic->getMemoryVT(),
+                        Atomic->getMemOperand());
+}
+
+// Op is an atomic store.  Emit a truncating store that reuses the atomic
+// node's chain, value, base pointer, memory type and memory operand, and
+// follow it with a Serialize machine node chained after the store.
+SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());
+  SDLoc DL(Op);
+  SDValue StoreChain =
+      DAG.getTruncStore(Atomic->getChain(), DL, Atomic->getVal(),
+                        Atomic->getBasePtr(), Atomic->getMemoryVT(),
+                        Atomic->getMemOperand());
+  SDNode *Serialize =
+      DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other, StoreChain);
+  return SDValue(Serialize, 0);
+}
+
+// Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation. 32-bit operations are returned
+// unchanged; 8- and 16-bit ones are lowered into the fullword ATOMIC_LOADW_* operation given by Opcode.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
+ SelectionDAG &DAG,
+ unsigned Opcode) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+
+ // 32-bit operations need no code outside the main loop.
+ EVT NarrowVT = Node->getMemoryVT();
+ EVT WideVT = MVT::i32;
+ if (NarrowVT == WideVT)
+ return Op;
+
+ int64_t BitSize = NarrowVT.getSizeInBits(); // Width of the narrow field in bits.
+ SDValue ChainIn = Node->getChain();
+ SDValue Addr = Node->getBasePtr();
+ SDValue Src2 = Node->getVal();
+ MachineMemOperand *MMO = Node->getMemOperand();
+ SDLoc DL(Node);
+ EVT PtrVT = Addr.getValueType();
+
+ // Convert atomic subtracts of constants into additions.
+ if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
+ if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) {
+ Opcode = SystemZISD::ATOMIC_LOADW_ADD;
+ Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType());
+ }
+
+ // Get the address of the containing word.
+ SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
+ DAG.getConstant(-4, DL, PtrVT)); // Clear the low two address bits.
+
+ // Get the number of bits that the word must be rotated left in order
+ // to bring the field to the top bits of a GR32.
+ SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
+ DAG.getConstant(3, DL, PtrVT)); // Byte address times 8.
+ BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
+
+ // Get the complementing shift amount, for rotating a field in the top
+ // bits back to its proper position.
+ SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
+ DAG.getConstant(0, DL, WideVT), BitShift);
+
+ // Extend the source operand to 32 bits and prepare it for the inner loop.
+ // ATOMIC_SWAPW uses RISBG to rotate the field left, but all other
+ // operations require the source to be shifted in advance. (This shift
+ // can be folded if the source is constant.) For AND and NAND, the lower
+ // bits must be set, while for other opcodes they should be left clear.
+ if (Opcode != SystemZISD::ATOMIC_SWAPW)
+ Src2 = DAG.getNode(ISD::SHL, DL, WideVT, Src2,
+ DAG.getConstant(32 - BitSize, DL, WideVT));
+ if (Opcode == SystemZISD::ATOMIC_LOADW_AND ||
+ Opcode == SystemZISD::ATOMIC_LOADW_NAND)
+ Src2 = DAG.getNode(ISD::OR, DL, WideVT, Src2,
+ DAG.getConstant(uint32_t(-1) >> BitSize, DL, WideVT)); // Ones in the low (32 - BitSize) bits.
+
+ // Construct the ATOMIC_LOADW_* node.
+ SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
+ SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
+ DAG.getConstant(BitSize, DL, WideVT) };
+ SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
+ NarrowVT, MMO);
+
+ // Rotate the result of the final CS so that the field is in the lower
+ // bits of a GR32. The result stays a WideVT value; no truncation node is emitted here.
+ SDValue ResultShift = DAG.getNode(ISD::ADD, DL, WideVT, BitShift,
+ DAG.getConstant(BitSize, DL, WideVT));
+ SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);
+
+ SDValue RetOps[2] = { Result, AtomicOp.getValue(1) }; // Rotated value plus the output chain of the atomic node.
+ return DAG.getMergeValues(RetOps, DL);
+}
+
+// Op is an ATOMIC_LOAD_SUB operation. Lower 8- and 16-bit operations
+// into ATOMIC_LOADW_SUBs and decide whether to convert 32- and 64-bit
+// operations into additions of the negated operand (ATOMIC_LOAD_ADD).
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+ EVT MemVT = Node->getMemoryVT();
+ if (MemVT == MVT::i32 || MemVT == MVT::i64) {
+ // A full-width operation.
+ assert(Op.getValueType() == MemVT && "Mismatched VTs");
+ SDValue Src2 = Node->getVal();
+ SDValue NegSrc2; // Stays null unless the subtraction is to be rewritten as an addition.
+ SDLoc DL(Src2);
+
+ if (auto *Op2 = dyn_cast<ConstantSDNode>(Src2)) {
+ // Use an addition if the operand is constant and either LAA(G) is
+ // available or the negative value is in the range of A(G)FHI.
+ int64_t Value = (-Op2->getAPIntValue()).getSExtValue(); // Negate at the APInt's own width, then sign-extend.
+ if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1())
+ NegSrc2 = DAG.getConstant(Value, DL, MemVT);
+ } else if (Subtarget.hasInterlockedAccess1())
+ // Use LAA(G) if available.
+ NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, DL, MemVT),
+ Src2);
+
+ if (NegSrc2.getNode())
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT,
+ Node->getChain(), Node->getBasePtr(), NegSrc2,
+ Node->getMemOperand());
+
+ // Use the node as-is.
+ return Op;
+ }
+
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB); // 8- and 16-bit cases.
+}
+
+// Op is an 8-, 16-bit or 32-bit ATOMIC_CMP_SWAP operation. 32-bit operations are returned
+// unchanged; 8- and 16-bit ones are lowered into a fullword ATOMIC_CMP_SWAPW operation.
+SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+
+ // We have native support for 32-bit compare and swap.
+ EVT NarrowVT = Node->getMemoryVT();
+ EVT WideVT = MVT::i32;
+ if (NarrowVT == WideVT)
+ return Op;
+
+ int64_t BitSize = NarrowVT.getSizeInBits(); // Width of the narrow field in bits.
+ SDValue ChainIn = Node->getOperand(0);
+ SDValue Addr = Node->getOperand(1);
+ SDValue CmpVal = Node->getOperand(2);
+ SDValue SwapVal = Node->getOperand(3);
+ MachineMemOperand *MMO = Node->getMemOperand();
+ SDLoc DL(Node);
+ EVT PtrVT = Addr.getValueType();
+
+ // Get the address of the containing word.
+ SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
+ DAG.getConstant(-4, DL, PtrVT)); // Clear the low two address bits.
+
+ // Get the number of bits that the word must be rotated left in order
+ // to bring the field to the top bits of a GR32.
+ SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
+ DAG.getConstant(3, DL, PtrVT)); // Byte address times 8.
+ BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
+
+ // Get the complementing shift amount, for rotating a field in the top
+ // bits back to its proper position.
+ SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
+ DAG.getConstant(0, DL, WideVT), BitShift);
+
+ // Construct the ATOMIC_CMP_SWAPW node.
+ SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
+ SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
+ NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
+ SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
+ VTList, Ops, NarrowVT, MMO);
+ return AtomicOp; // Unlike lowerATOMIC_LOAD_OP, the result is returned without a rotate here.
+}
+
+// Lower STACKSAVE: record that the function manipulates the stack pointer
+// and read R15D on the incoming chain (operand 0).
+SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  auto *FuncInfo =
+      DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
+  FuncInfo->setManipulatesSP(true);
+
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  return DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, Op.getValueType());
+}
+
+SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
+ SelectionDAG &DAG) const { // Lower STACKRESTORE: copy operand 1 into R15D, carrying the backchain word across when "backchain" is set.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
+ bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue NewSP = Op.getOperand(1);
+ SDValue Backchain; // Only set when StoreBackchain is true.
+ SDLoc DL(Op);
+
+ if (StoreBackchain) {
+ SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, MVT::i64);
+ Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo()); // Read the word at the current stack pointer before SP changes.
+ }
+
+ Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R15D, NewSP);
+
+ if (StoreBackchain)
+ Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo()); // Write that word at the new stack-pointer address.
+
+ return Chain;
+}
+
+// Lower a PREFETCH node.  Prefetches whose operand 4 is zero (not a data
+// prefetch) are dropped, keeping only the chain; the rest become a
+// SystemZISD::PREFETCH carrying a PFD_READ or PFD_WRITE code.
+SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  const bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (!IsData) {
+    // Just preserve the chain.
+    return Op.getOperand(0);
+  }
+
+  SDLoc DL(Op);
+  // Operand 2 selects between a write and a read prefetch.
+  const bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Code = SystemZ::PFD_READ;
+  if (IsWrite)
+    Code = SystemZ::PFD_WRITE;
+
+  auto *MemNode = cast<MemIntrinsicSDNode>(Op.getNode());
+  // New operand list: chain, prefetch code, then original operand 1.
+  SDValue PrefetchOps[] = {
+    Op.getOperand(0),
+    DAG.getConstant(Code, DL, MVT::i32),
+    Op.getOperand(1)
+  };
+  return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL,
+                                 MemNode->getVTList(), PrefetchOps,
+                                 MemNode->getMemoryVT(),
+                                 MemNode->getMemOperand());
+}
+
+// Build an i32 holding the value of CC as it stands immediately after the
+// node After.  After's final result must be MVT::Glue; that glue feeds an
+// IPM node whose result is shifted right by SystemZ::IPM_CC.
+static SDValue getCCResult(SelectionDAG &DAG, SDNode *After) {
+  SDLoc DL(After);
+  // The glue is the last value produced by After.
+  unsigned GlueIdx = After->getNumValues() - 1;
+  SDValue Glue(After, GlueIdx);
+  SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+  SDValue ShiftAmt = DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32);
+  return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM, ShiftAmt);
+}
+
+// Lower a chained intrinsic.  Intrinsics that produce a CC result are
+// re-emitted with chain and glue, and all uses of the original value 0
+// are redirected to the extracted CC.  An empty SDValue is returned in
+// every case.
+SDValue
+SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  unsigned Opcode, CCValid;
+  if (!isIntrinsicWithCCAndChain(Op, Opcode, CCValid))
+    return SDValue();
+
+  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
+  SDValue Glued = emitIntrinsicWithChainAndGlue(DAG, Op, Opcode);
+  SDValue CC = getCCResult(DAG, Glued.getNode());
+  SDValue OldCC(Op.getNode(), 0);
+  DAG.ReplaceAllUsesOfValueWith(OldCC, CC);
+  return SDValue();
+}
+
+SDValue
+SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const { // Lower a chainless intrinsic; returns an empty SDValue for intrinsics not handled here.
+ unsigned Opcode, CCValid;
+ if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
+ SDValue Glued = emitIntrinsicWithGlue(DAG, Op, Opcode);
+ SDValue CC = getCCResult(DAG, Glued.getNode());
+ if (Op->getNumValues() == 1)
+ return CC; // The intrinsic produces only the CC value.
+ assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
+ return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), Glued,
+ CC);
+ }
+
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); // Operand 0 holds the intrinsic ID.
+ switch (Id) {
+ case Intrinsic::thread_pointer:
+ return lowerThreadPointer(SDLoc(Op), DAG);
+
+ case Intrinsic::s390_vpdi:
+ return DAG.getNode(SystemZISD::PERMUTE_DWORDS, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::s390_vperm:
+ return DAG.getNode(SystemZISD::PERMUTE, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::s390_vuphb:
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ return DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1));
+
+ case Intrinsic::s390_vuplhb:
+ case Intrinsic::s390_vuplhh:
+ case Intrinsic::s390_vuplhf:
+ return DAG.getNode(SystemZISD::UNPACKL_HIGH, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1));
+
+ case Intrinsic::s390_vuplb:
+ case Intrinsic::s390_vuplhw: // The halfword form is spelled "vuplhw" here, unlike the b/f siblings.
+ case Intrinsic::s390_vuplf:
+ return DAG.getNode(SystemZISD::UNPACK_LOW, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1));
+
+ case Intrinsic::s390_vupllb:
+ case Intrinsic::s390_vupllh:
+ case Intrinsic::s390_vupllf:
+ return DAG.getNode(SystemZISD::UNPACKL_LOW, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1));
+
+ case Intrinsic::s390_vsumb:
+ case Intrinsic::s390_vsumh:
+ case Intrinsic::s390_vsumgh:
+ case Intrinsic::s390_vsumgf:
+ case Intrinsic::s390_vsumqf:
+ case Intrinsic::s390_vsumqg:
+ return DAG.getNode(SystemZISD::VSUM, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
+ return SDValue(); // No custom lowering for this intrinsic.
+}
+
+namespace {
+// Says that SystemZISD operation Opcode can be used to perform the equivalent
+// of a VPERM with permute vector Bytes. If Opcode takes three operands
+// (PERMUTE_DWORDS), Operand is the constant third operand, otherwise it is the
+// number of bytes in each element of the result.
+struct Permute {
+ unsigned Opcode; // SystemZISD opcode implementing the permute.
+ unsigned Operand; // Third-operand constant or result element size in bytes; see above.
+ unsigned char Bytes[SystemZ::VectorBytes]; // Byte selectors; selector / VectorBytes is the operand, selector % VectorBytes the byte.
+};
+} // end anonymous namespace
+
+static const Permute PermuteForms[] = { // Permutes with a dedicated opcode; the matchers below try entries in this order and take the first match.
+ // VMRHG
+ { SystemZISD::MERGE_HIGH, 8,
+ { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } },
+ // VMRHF
+ { SystemZISD::MERGE_HIGH, 4,
+ { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } },
+ // VMRHH
+ { SystemZISD::MERGE_HIGH, 2,
+ { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } },
+ // VMRHB
+ { SystemZISD::MERGE_HIGH, 1,
+ { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } },
+ // VMRLG
+ { SystemZISD::MERGE_LOW, 8,
+ { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } },
+ // VMRLF
+ { SystemZISD::MERGE_LOW, 4,
+ { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
+ // VMRLH
+ { SystemZISD::MERGE_LOW, 2,
+ { 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
+ // VMRLB
+ { SystemZISD::MERGE_LOW, 1,
+ { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } },
+ // VPKG (Operand is the output element size; the inputs are twice as wide)
+ { SystemZISD::PACK, 4,
+ { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } },
+ // VPKF
+ { SystemZISD::PACK, 2,
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
+ // VPKH
+ { SystemZISD::PACK, 1,
+ { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
+ // VPDI V1, V2, 4 (low half of V1, high half of V2)
+ { SystemZISD::PERMUTE_DWORDS, 4,
+ { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } },
+ // VPDI V1, V2, 1 (high half of V1, low half of V2)
+ { SystemZISD::PERMUTE_DWORDS, 1,
+ { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } }
+};
+
+// Final step after a two-operand shuffle has been matched against a
+// two-operand pattern.  OpNos[N] names the shuffle operand that pattern
+// operand N must use, or is -1 when pattern operand N is unconstrained.
+// An unconstrained slot reuses the operand chosen for the other slot.
+// Returns false, leaving OpNo0 and OpNo1 untouched, when both slots are
+// unconstrained; otherwise stores the resolved operands and returns true.
+static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) {
+  const bool Has0 = OpNos[0] >= 0;
+  const bool Has1 = OpNos[1] >= 0;
+  if (!Has0 && !Has1)
+    return false;
+  OpNo0 = Has0 ? OpNos[0] : OpNos[1];
+  OpNo1 = Has1 ? OpNos[1] : OpNos[0];
+  return true;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes. Return true if the VPERM can be implemented using P.
+// When returning true set OpNo0 to the VPERM operand that should be
+// used for operand 0 of P and likewise OpNo1 for operand 1 of P.
+//
+// For example, if swapping the VPERM operands allows P to match, OpNo0
+// will be 1 and OpNo1 will be 0. If instead Bytes only refers to one
+// operand, but rewriting it to use two duplicated operands allows it to
+// match P, then OpNo0 and OpNo1 will be the same.
+static bool matchPermute(const SmallVectorImpl<int> &Bytes, const Permute &P,
+ unsigned &OpNo0, unsigned &OpNo1) {
+ int OpNos[] = { -1, -1 };
+ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+ int Elt = Bytes[I];
+ if (Elt >= 0) {
+ // Make sure that the two permute vectors use the same suboperand
+ // byte number. Only the operand numbers (the high bits) are
+ // allowed to differ.
+ if ((Elt ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1))
+ return false;
+ int ModelOpNo = P.Bytes[I] / SystemZ::VectorBytes;
+ int RealOpNo = unsigned(Elt) / SystemZ::VectorBytes;
+ // Make sure that the operand mappings are consistent with previous
+ // elements.
+ if (OpNos[ModelOpNo] == 1 - RealOpNo)
+ return false;
+ OpNos[ModelOpNo] = RealOpNo;
+ }
+ }
+ return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
+}
+
+// Search PermuteForms, in table order, for the first entry that can
+// implement Bytes.  Returns that entry with OpNo0 / OpNo1 set by the
+// single-pattern matcher, or nullptr if no entry matches.
+static const Permute *matchPermute(const SmallVectorImpl<int> &Bytes,
+                                   unsigned &OpNo0, unsigned &OpNo1) {
+  const Permute *Found = nullptr;
+  for (const Permute &Form : PermuteForms) {
+    if (matchPermute(Bytes, Form, OpNo0, OpNo1)) {
+      Found = &Form;
+      break;
+    }
+  }
+  return Found;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes. This permute is an operand of an outer permute.
+// See whether redistributing the -1 bytes gives a shuffle that can be
+// implemented using P. If so, set Transform to a VPERM-like permute vector
+// that, when applied to the result of P, gives the original permute in Bytes.
+static bool matchDoublePermute(const SmallVectorImpl<int> &Bytes,
+ const Permute &P,
+ SmallVectorImpl<int> &Transform) {
+ unsigned To = 0;
+ for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) {
+ int Elt = Bytes[From];
+ if (Elt < 0)
+ // Byte number From of the result is undefined.
+ Transform[From] = -1;
+ else {
+ while (P.Bytes[To] != Elt) {
+ To += 1;
+ if (To == SystemZ::VectorBytes)
+ return false;
+ }
+ Transform[From] = To;
+ }
+ }
+ return true;
+}
+
+// Search PermuteForms, in table order, for the first entry accepted by the
+// single-pattern matchDoublePermute.  Returns that entry with Transform
+// filled in, or nullptr if no entry matches.
+static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
+                                         SmallVectorImpl<int> &Transform) {
+  const Permute *Found = nullptr;
+  for (const Permute &Form : PermuteForms) {
+    if (matchDoublePermute(Bytes, Form, Transform)) {
+      Found = &Form;
+      break;
+    }
+  }
+  return Found;
+}
+
+// Expand the element-level mask of the VECTOR_SHUFFLE VSN into a
+// byte-level mask in Bytes, as if the shuffle had type vNi8.  Bytes that
+// belong to an undefined mask element are left as -1.
+static void getVPermMask(ShuffleVectorSDNode *VSN,
+                         SmallVectorImpl<int> &Bytes) {
+  EVT VT = VSN->getValueType(0);
+  const unsigned EltCount = VT.getVectorNumElements();
+  const unsigned EltBytes = VT.getVectorElementType().getStoreSize();
+  Bytes.resize(EltCount * EltBytes, -1);
+
+  // Out tracks the first output byte of the current element.
+  for (unsigned Elt = 0, Out = 0; Elt != EltCount; ++Elt, Out += EltBytes) {
+    int Src = VSN->getMaskElt(Elt);
+    if (Src < 0)
+      continue;
+    const unsigned SrcByte = Src * EltBytes;
+    for (unsigned Off = 0; Off != EltBytes; ++Off)
+      Bytes[Out + Off] = SrcByte + Off;
+  }
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes. See whether bytes [Start, Start + BytesPerElement) of
+// the result come from a contiguous sequence of bytes from one input.
+// Set Base to the selector for the first byte if so.
+static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start,
+ unsigned BytesPerElement, int &Base) {
+ Base = -1;
+ for (unsigned I = 0; I < BytesPerElement; ++I) {
+ if (Bytes[Start + I] >= 0) {
+ unsigned Elem = Bytes[Start + I];
+ if (Base < 0) {
+ Base = Elem - I;
+ // Make sure the bytes would come from one input operand.
+ if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size())
+ return false;
+ } else if (unsigned(Base) != Elem - I)
+ return false;
+ }
+ }
+ return true;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  Return true if it can be performed using VSLDI.
+// When returning true, set StartIndex to the shift amount and OpNo0
+// and OpNo1 to the VPERM operands that should be used as the first
+// and second shift operand respectively.
+//
+// Returns false when the defined bytes disagree on the shift amount, when
+// the operand assignment is inconsistent, or when no byte is defined (no
+// operand can then be chosen).  StartIndex is written even on that last
+// failure path, so callers must only read it after a true return.
+static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes,
+                               unsigned &StartIndex, unsigned &OpNo0,
+                               unsigned &OpNo1) {
+  int OpNos[] = { -1, -1 };
+  int Shift = -1;
+  // Walk every byte of the vector.  Use the shared VectorBytes constant
+  // rather than a literal so the bound agrees with the arithmetic below.
+  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+    int Index = Bytes[I];
+    if (Index >= 0) {
+      // Shift implied by this byte.  The subtraction is done in unsigned
+      // arithmetic, so the result is reduced modulo VectorBytes.
+      int ExpectedShift = (Index - I) % SystemZ::VectorBytes;
+      // Operand of the shifted pair that supplies this byte.
+      int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes;
+      int RealOpNo = unsigned(Index) / SystemZ::VectorBytes;
+      if (Shift < 0)
+        Shift = ExpectedShift;
+      else if (Shift != ExpectedShift)
+        return false;
+      // Make sure that the operand mappings are consistent with previous
+      // elements.
+      if (OpNos[ModelOpNo] == 1 - RealOpNo)
+        return false;
+      OpNos[ModelOpNo] = RealOpNo;
+    }
+  }
+  StartIndex = Shift;
+  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
+}
+
+// Create a node that performs P on operands Op0 and Op1, casting the
+// operands to the appropriate type. The type of the result is determined by P (callers must bitcast it if they need another type).
+static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
+ const Permute &P, SDValue Op0, SDValue Op1) {
+ // VPDI (PERMUTE_DWORDS) always operates on v2i64s. The input
+ // elements of a PACK are twice as wide as the outputs.
+ unsigned InBytes = (P.Opcode == SystemZISD::PERMUTE_DWORDS ? 8 :
+ P.Opcode == SystemZISD::PACK ? P.Operand * 2 :
+ P.Operand); // Input element size in bytes.
+ // Cast both operands to the appropriate type.
+ MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8),
+ SystemZ::VectorBytes / InBytes);
+ Op0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);
+ SDValue Op;
+ if (P.Opcode == SystemZISD::PERMUTE_DWORDS) {
+ SDValue Op2 = DAG.getConstant(P.Operand, DL, MVT::i32); // For PERMUTE_DWORDS, Operand is the constant third operand.
+ Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2);
+ } else if (P.Opcode == SystemZISD::PACK) {
+ MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8),
+ SystemZ::VectorBytes / P.Operand); // The result has narrower elements than the inputs.
+ Op = DAG.getNode(SystemZISD::PACK, DL, OutVT, Op0, Op1);
+ } else {
+ Op = DAG.getNode(P.Opcode, DL, InVT, Op0, Op1); // Remaining opcodes: result type equals input type.
+ }
+ return Op;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes. Implement it on operands Ops[0] and Ops[1] using
+// VSLDI or VPERM. Ops[0] and Ops[1] are overwritten with v16i8 bitcasts; the result is v16i8.
+static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue *Ops,
+ const SmallVectorImpl<int> &Bytes) {
+ for (unsigned I = 0; I < 2; ++I)
+ Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]); // Modifies the caller's array in place.
+
+ // First see whether VSLDI can be used.
+ unsigned StartIndex, OpNo0, OpNo1;
+ if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
+ return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0],
+ Ops[OpNo1], DAG.getConstant(StartIndex, DL, MVT::i32));
+
+ // Fall back on VPERM. Construct an SDNode for the permute vector.
+ SDValue IndexNodes[SystemZ::VectorBytes];
+ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+ if (Bytes[I] >= 0)
+ IndexNodes[I] = DAG.getConstant(Bytes[I], DL, MVT::i32);
+ else
+ IndexNodes[I] = DAG.getUNDEF(MVT::i32); // Undefined result bytes get an undefined selector.
+ SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+ return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2);
+}
+
+namespace {
+// Describes a general N-operand vector shuffle, built up one result element at a time via add() and addUndef().
+struct GeneralShuffle {
+ GeneralShuffle(EVT vt) : VT(vt) {}
+ void addUndef(); // Append one undefined result element.
+ void add(SDValue, unsigned); // Append one result element taken from the given element of the given operand.
+ SDValue getNode(SelectionDAG &, const SDLoc &); // Build the nodes for the completed shuffle.
+
+ // The operands of the shuffle.
+ SmallVector<SDValue, SystemZ::VectorBytes> Ops;
+
+ // Index I is -1 if byte I of the result is undefined. Otherwise the
+ // result comes from byte Bytes[I] % SystemZ::VectorBytes of operand
+ // Bytes[I] / SystemZ::VectorBytes.
+ SmallVector<int, SystemZ::VectorBytes> Bytes;
+
+ // The type of the shuffle result.
+ EVT VT;
+};
+} // end anonymous namespace
+
+// Add an extra undefined element to the shuffle.
+void GeneralShuffle::addUndef() {
+ unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
+ for (unsigned I = 0; I < BytesPerElement; ++I)
+ Bytes.push_back(-1);
+}
+
+// Add an extra element to the shuffle, taking it from element Elem of Op.
+// A null Op indicates a vector input whose value will be calculated later;
+// there is at most one such input per shuffle and it always has the same
+// type as the result. Op is added to Ops if it is not already there.
+void GeneralShuffle::add(SDValue Op, unsigned Elem) {
+ unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
+
+ // The source vector can have wider elements than the result,
+ // either through an explicit TRUNCATE or because of type legalization.
+ // We want the least significant part.
+ EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
+ unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();
+ assert(FromBytesPerElement >= BytesPerElement &&
+ "Invalid EXTRACT_VECTOR_ELT");
+ unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
+ (FromBytesPerElement - BytesPerElement)); // Offset of the wanted bytes in Op; the least significant part is taken to sit at the end of the element.
+
+ // Look through things like shuffles and bitcasts.
+ while (Op.getNode()) {
+ if (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0); // The byte offset is unchanged by a bitcast.
+ else if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) {
+ // See whether the bytes we need come from a contiguous part of one
+ // operand.
+ SmallVector<int, SystemZ::VectorBytes> OpBytes;
+ getVPermMask(cast<ShuffleVectorSDNode>(Op), OpBytes);
+ int NewByte;
+ if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
+ break; // Not contiguous: use the shuffle itself as the source.
+ if (NewByte < 0) {
+ addUndef(); // Every byte we need is undefined in the inner shuffle.
+ return;
+ }
+ Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
+ Byte = unsigned(NewByte) % SystemZ::VectorBytes;
+ } else if (Op.isUndef()) {
+ addUndef();
+ return;
+ } else
+ break;
+ }
+
+ // Make sure that the source of the extraction is in Ops.
+ unsigned OpNo = 0;
+ for (; OpNo < Ops.size(); ++OpNo)
+ if (Ops[OpNo] == Op)
+ break;
+ if (OpNo == Ops.size())
+ Ops.push_back(Op);
+
+ // Add the element to Bytes.
+ unsigned Base = OpNo * SystemZ::VectorBytes + Byte; // Selector of the first byte: operand number times VectorBytes plus byte offset.
+ for (unsigned I = 0; I < BytesPerElement; ++I)
+ Bytes.push_back(Base + I);
+}
+
+// Return SDNodes for the completed shuffle, bitcast to VT. Bytes must describe exactly one full vector.
+SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
+ assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector");
+
+ if (Ops.size() == 0) // no input operand was recorded, so the result is undefined
+ return DAG.getUNDEF(VT);
+
+ // Make sure that there are at least two shuffle operands.
+ if (Ops.size() == 1)
+ Ops.push_back(DAG.getUNDEF(MVT::v16i8));
+
+ // Create a tree of shuffles, deferring root node until after the loop.
+ // Try to redistribute the undefined elements of non-root nodes so that
+ // the non-root shuffles match something like a pack or merge, then adjust
+ // the parent node's permute vector to compensate for the new order.
+ // Among other things, this copes with vectors like <2 x i16> that were
+ // padded with undefined elements during type legalization.
+ //
+ // In the best case this redistribution will lead to the whole tree
+ // using packs and merges. It should rarely be a loss in other cases.
+ unsigned Stride = 1;
+ for (; Stride * 2 < Ops.size(); Stride *= 2) {
+ for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) { // combine Ops[I] with Ops[I + Stride] into Ops[I]
+ SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
+
+ // Create a mask for just these two operands.
+ SmallVector<int, SystemZ::VectorBytes> NewBytes(SystemZ::VectorBytes);
+ for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
+ unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes; // an undefined byte (-1) becomes a huge OpNo matching neither input
+ unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes;
+ if (OpNo == I)
+ NewBytes[J] = Byte;
+ else if (OpNo == I + Stride)
+ NewBytes[J] = SystemZ::VectorBytes + Byte;
+ else
+ NewBytes[J] = -1;
+ }
+ // See if it would be better to reorganize NewBytes to avoid using VPERM.
+ SmallVector<int, SystemZ::VectorBytes> NewBytesMap(SystemZ::VectorBytes);
+ if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) {
+ Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]);
+ // Applying NewBytesMap to Ops[I] gets back to NewBytes.
+ for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
+ if (NewBytes[J] >= 0) {
+ assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes &&
+ "Invalid double permute");
+ Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J];
+ } else
+ assert(NewBytesMap[J] < 0 && "Invalid double permute");
+ }
+ } else {
+ // Just use NewBytes on the operands.
+ Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes);
+ for (unsigned J = 0; J < SystemZ::VectorBytes; ++J)
+ if (NewBytes[J] >= 0)
+ Bytes[J] = I * SystemZ::VectorBytes + J; // result byte J now comes straight from byte J of the new Ops[I]
+ }
+ }
+ }
+
+ // Now we just have 2 inputs. Put the second operand in Ops[1].
+ if (Stride > 1) {
+ Ops[1] = Ops[Stride];
+ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+ if (Bytes[I] >= int(SystemZ::VectorBytes))
+ Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes; // renumber operand Stride as operand 1
+ }
+
+ // Look for an instruction that can do the permute without resorting
+ // to VPERM.
+ unsigned OpNo0, OpNo1;
+ SDValue Op;
+ if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
+ Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
+ else
+ Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+}
+
+// Return true if BUILD_VECTOR Op defines at most its first element, i.e.
+// every operand after operand 0 is undefined.
+static bool isScalarToVector(SDValue Op) {
+  const unsigned NumOps = Op.getNumOperands();
+  unsigned Idx = 1;
+  while (Idx != NumOps) {
+    if (!Op.getOperand(Idx).isUndef())
+      return false;
+    ++Idx;
+  }
+  return true;
+}
+
+// Return a vector of type VT whose first element is Value; the remaining
+// elements are "don't care".
+//  - An undefined Value yields an undefined vector.
+//  - A constant Value is replicated into every element so that the
+//    BUILD_VECTOR lowering can pick the best constant sequence.
+//  - Anything else becomes a SCALAR_TO_VECTOR node.
+static SDValue buildScalarToVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                                   SDValue Value) {
+  if (Value.isUndef())
+    return DAG.getUNDEF(VT);
+
+  const unsigned Opcode = Value.getOpcode();
+  const bool IsConstant =
+      Opcode == ISD::Constant || Opcode == ISD::ConstantFP;
+  if (!IsConstant)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
+
+  SmallVector<SDValue, 16> Splat(VT.getVectorNumElements(), Value);
+  return DAG.getBuildVector(VT, DL, Splat);
+}
+
+// Return a vector of type VT with Op0 in element 0 and Op1 in element 1.
+// Intended for cases in which replication is cheap: if exactly one input is
+// undefined the other is simply replicated, and if both are undefined the
+// result is an undefined vector.
+static SDValue buildMergeScalars(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                                 SDValue Op0, SDValue Op1) {
+  const bool Undef0 = Op0.isUndef();
+  const bool Undef1 = Op1.isUndef();
+
+  if (Undef0 && Undef1)
+    return DAG.getUNDEF(VT);
+  if (Undef0 || Undef1)
+    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Undef0 ? Op1 : Op0);
+
+  // Both inputs are defined: widen each to a vector and merge them.
+  SDValue Vec0 = buildScalarToVector(DAG, DL, VT, Op0);
+  SDValue Vec1 = buildScalarToVector(DAG, DL, VT, Op1);
+  return DAG.getNode(SystemZISD::MERGE_HIGH, DL, VT, Vec0, Vec1);
+}
+
+// Any-extend GPR scalars Op0 and Op1 to i64 and combine them into a v2i64
+// with JOIN_DWORDS.  When exactly one input is undefined the other one is
+// used for both halves, in order to avoid using another register
+// unnecessarily; when both are undefined the result is an undefined vector.
+static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
+                          SDValue Op1) {
+  const bool Undef0 = Op0.isUndef();
+  const bool Undef1 = Op1.isUndef();
+  if (Undef0 && Undef1)
+    return DAG.getUNDEF(MVT::v2i64);
+
+  // Substitute the defined input for an undefined one.
+  SDValue Src0 = Undef0 ? Op1 : Op0;
+  SDValue Src1 = Undef1 ? Op0 : Op1;
+
+  SDValue Dword0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Src0);
+  // Share a single extension node when both halves hold the same input.
+  SDValue Dword1 = (Undef0 || Undef1)
+                       ? Dword0
+                       : DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Src1);
+  return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Dword0, Dword1);
+}
+
+// Try to represent constant BUILD_VECTOR node BVN using a
+// SystemZISD::BYTE_MASK-style mask: one mask bit per vector byte, set when
+// that byte is 0xff and clear when it is 0x00.  Operand 0 maps to the most
+// significant mask bits.  Undefined operands contribute no bits.
+//
+// Bits are ORed into Mask, so the caller must initialize it before the call.
+// Returns true on success.  Returns false if any operand is neither a
+// constant nor undefined, or if any byte is something other than 0x00 or
+// 0xff; in that case Mask may already contain bits from earlier operands
+// and must not be used.
+static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
+  EVT ElemVT = BVN->getValueType(0).getVectorElementType();
+  unsigned BytesPerElement = ElemVT.getStoreSize();
+  for (unsigned I = 0, E = BVN->getNumOperands(); I != E; ++I) {
+    SDValue Op = BVN->getOperand(I);
+    if (Op.isUndef())
+      continue;
+
+    // The opcode test establishes the node class, so use cast<> (which
+    // asserts on a mismatch) instead of dereferencing an unchecked
+    // dyn_cast<> result.
+    uint64_t Value;
+    if (Op.getOpcode() == ISD::Constant)
+      Value = cast<ConstantSDNode>(Op)->getZExtValue();
+    else if (Op.getOpcode() == ISD::ConstantFP)
+      Value = cast<ConstantFPSDNode>(Op)
+                  ->getValueAPF()
+                  .bitcastToAPInt()
+                  .getZExtValue();
+    else
+      return false;
+
+    // J counts bytes from the least significant end of the element.
+    for (unsigned J = 0; J < BytesPerElement; ++J) {
+      uint64_t Byte = (Value >> (J * 8)) & 0xff;
+      if (Byte == 0xff)
+        Mask |= 1ULL << ((E - I - 1) * BytesPerElement + J);
+      else if (Byte != 0)
+        return false;
+    }
+  }
+  return true;
+}
+
+// Try to materialize a vector constant in which the BitsPerElement-bit
+// value Value is replicated to fill the whole vector.  VT is the type of
+// the resulting constant; its element size may differ from BitsPerElement,
+// in which case the result is bitcast to VT.  Returns the constant on
+// success and an empty SDValue otherwise.
+static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
+                                       const SystemZInstrInfo *TII,
+                                       const SDLoc &DL, EVT VT, uint64_t Value,
+                                       unsigned BitsPerElement) {
+  // Integer vector type made of BitsPerElement-bit lanes.
+  auto getLaneVT = [BitsPerElement]() {
+    return MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
+                            SystemZ::VectorBits / BitsPerElement);
+  };
+
+  // Signed 16-bit values can be replicated using VREPI.
+  const int64_t Signed = SignExtend64(Value, BitsPerElement);
+  if (isInt<16>(Signed)) {
+    SDValue Imm = DAG.getConstant(Signed, DL, MVT::i32);
+    SDValue Rep = DAG.getNode(SystemZISD::REPLICATE, DL, getLaneVT(), Imm);
+    return DAG.getNode(ISD::BITCAST, DL, VT, Rep);
+  }
+
+  // If rotating the constant left by some amount gives a value that is one
+  // less than a power of 2 (all zeros followed by all ones), VGM can be used.
+  unsigned First, Last;
+  if (!TII->isRxSBGMask(Value, BitsPerElement, First, Last))
+    return SDValue();
+
+  // isRxSBGMask numbers bits for a full 64-bit value, with 0 denoting
+  // 1 << 63 and 63 denoting 1.  Rebase them so that 0 denotes
+  // 1 << (BitsPerElement - 1).
+  const unsigned Rebase = 64 - BitsPerElement;
+  First -= Rebase;
+  Last -= Rebase;
+
+  SDValue Mask = DAG.getNode(SystemZISD::ROTATE_MASK, DL, getLaneVT(),
+                             DAG.getConstant(First, DL, MVT::i32),
+                             DAG.getConstant(Last, DL, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+}
+
+// If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
+// better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
+// the non-EXTRACT_VECTOR_ELT elements. Return that representation of BVN, or an
+// empty SDValue if no operand is a constant-index EXTRACT_VECTOR_ELT (possibly under a TRUNCATE).
+static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
+ BuildVectorSDNode *BVN) {
+ EVT VT = BVN->getValueType(0);
+ unsigned NumElements = VT.getVectorNumElements();
+
+ // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation
+ // on byte vectors. If there are non-EXTRACT_VECTOR_ELT elements that still
+ // need a BUILD_VECTOR, add an additional placeholder operand for that
+ // BUILD_VECTOR and store its operands in ResidueOps.
+ GeneralShuffle GS(VT);
+ SmallVector<SDValue, SystemZ::VectorBytes> ResidueOps;
+ bool FoundOne = false;
+ for (unsigned I = 0; I < NumElements; ++I) {
+ SDValue Op = BVN->getOperand(I);
+ if (Op.getOpcode() == ISD::TRUNCATE) // look through one level of truncation
+ Op = Op.getOperand(0);
+ if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op.getOperand(1).getOpcode() == ISD::Constant) {
+ unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ GS.add(Op.getOperand(0), Elem);
+ FoundOne = true;
+ } else if (Op.isUndef()) {
+ GS.addUndef();
+ } else {
+ GS.add(SDValue(), ResidueOps.size()); // null Op = placeholder; the element index is its position in ResidueOps
+ ResidueOps.push_back(BVN->getOperand(I)); // keep the original operand, including any TRUNCATE
+ }
+ }
+
+ // Nothing to do if there are no EXTRACT_VECTOR_ELTs.
+ if (!FoundOne)
+ return SDValue();
+
+ // Create the BUILD_VECTOR for the remaining elements, if any.
+ if (!ResidueOps.empty()) {
+ while (ResidueOps.size() < NumElements) // pad with undefs up to a full vector
+ ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType()));
+ for (auto &Op : GS.Ops) {
+ if (!Op.getNode()) { // the placeholder is the null entry in GS.Ops
+ Op = DAG.getBuildVector(VT, SDLoc(BVN), ResidueOps);
+ break;
+ }
+ }
+ }
+ return GS.getNode(DAG, SDLoc(BVN));
+}
+
+// Combine GPR scalar values Elems into a vector of type VT, trying replication, merges and a constant base before element inserts.
+static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+ SmallVectorImpl<SDValue> &Elems) {
+ // See whether there is a single replicated value.
+ SDValue Single;
+ unsigned int NumElements = Elems.size();
+ unsigned int Count = 0; // number of defined elements seen so far
+ for (auto Elem : Elems) {
+ if (!Elem.isUndef()) {
+ if (!Single.getNode())
+ Single = Elem;
+ else if (Elem != Single) {
+ Single = SDValue(); // two different defined values: not a replication
+ break;
+ }
+ Count += 1;
+ }
+ }
+ // There are three cases here:
+ //
+ // - if the only defined element is a loaded one, the best sequence
+ // is a replicating load.
+ //
+ // - otherwise, if the only defined element is an i64 value, we will
+ // end up with the same VLVGP sequence regardless of whether we short-cut
+ // for replication or fall through to the later code.
+ //
+ // - otherwise, if the only defined element is an i32 or smaller value,
+ // we would need 2 instructions to replicate it: VLVGP followed by VREPx.
+ // This is only a win if the single defined element is used more than once.
+ // In other cases we're better off using a single VLVGx.
+ if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD))
+ return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
+
+ // The best way of building a v2i64 from two i64s is to use VLVGP.
+ if (VT == MVT::v2i64)
+ return joinDwords(DAG, DL, Elems[0], Elems[1]);
+
+ // Use a 64-bit merge high to combine two doubles.
+ if (VT == MVT::v2f64)
+ return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
+
+ // Build v4f32 values directly from the FPRs:
+ //
+ //   <Axxx>  <Bxxx>  <Cxxx>  <Dxxx>
+ //        V  V            V  V        VMRHF
+ //      <ABxx>          <CDxx>
+ //             V      V              VMRHG
+ //              <ABCD>
+ if (VT == MVT::v4f32) {
+ SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
+ SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
+ // Avoid unnecessary undefs by reusing the other operand.
+ if (Op01.isUndef())
+ Op01 = Op23;
+ else if (Op23.isUndef())
+ Op23 = Op01;
+ // Merging identical replications is a no-op.
+ if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
+ return Op01;
+ Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
+ Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
+ SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
+ DL, MVT::v2i64, Op01, Op23);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ }
+
+ // Collect the constant terms.
+ SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
+ SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false); // Done[I]: element I is already part of Result
+
+ unsigned NumConstants = 0;
+ for (unsigned I = 0; I < NumElements; ++I) {
+ SDValue Elem = Elems[I];
+ if (Elem.getOpcode() == ISD::Constant ||
+ Elem.getOpcode() == ISD::ConstantFP) {
+ NumConstants += 1;
+ Constants[I] = Elem;
+ Done[I] = true;
+ }
+ }
+ // If there was at least one constant, fill in the other elements of
+ // Constants with undefs to get a full vector constant and use that
+ // as the starting point.
+ SDValue Result;
+ if (NumConstants > 0) {
+ for (unsigned I = 0; I < NumElements; ++I)
+ if (!Constants[I].getNode())
+ Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
+ Result = DAG.getBuildVector(VT, DL, Constants);
+ } else {
+ // Otherwise try to use VLVGP to start the sequence in order to
+ // avoid a false dependency on any previous contents of the vector
+ // register. This only makes sense if one of the associated elements
+ // is defined.
+ unsigned I1 = NumElements / 2 - 1; // last element of the first half of the vector
+ unsigned I2 = NumElements - 1; // last element of the second half of the vector
+ bool Def1 = !Elems[I1].isUndef();
+ bool Def2 = !Elems[I2].isUndef();
+ if (Def1 || Def2) {
+ SDValue Elem1 = Elems[Def1 ? I1 : I2]; // if only one of the two is defined, it is used for both halves
+ SDValue Elem2 = Elems[Def2 ? I2 : I1];
+ Result = DAG.getNode(ISD::BITCAST, DL, VT,
+ joinDwords(DAG, DL, Elem1, Elem2));
+ Done[I1] = true;
+ Done[I2] = true;
+ } else
+ Result = DAG.getUNDEF(VT);
+ }
+
+ // Use VLVGx to insert the other elements.
+ for (unsigned I = 0; I < NumElements; ++I)
+ if (!Done[I] && !Elems[I].isUndef())
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I],
+ DAG.getConstant(I, DL, MVT::i32));
+ return Result;
+}
+
+// Custom-lower a BUILD_VECTOR.
+//
+// Constant vectors are tried, in order, as a byte mask and as a replicated
+// element; if neither works an empty SDValue is returned so that the
+// constant is loaded from memory.  Non-constant vectors are built from
+// shuffles of other vectors where possible, then as a scalar-to-vector
+// conversion, and finally element by element via buildVector().
+SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  auto *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  if (!BVN->isConstant()) {
+    // See if we should use shuffles to construct the vector from other
+    // vectors.
+    if (SDValue Shuffled = tryBuildVectorShuffle(DAG, BVN))
+      return Shuffled;
+
+    // Detect SCALAR_TO_VECTOR conversions.
+    if (isOperationLegal(ISD::SCALAR_TO_VECTOR, VT) && isScalarToVector(Op))
+      return buildScalarToVector(DAG, DL, VT, Op.getOperand(0));
+
+    // Otherwise use buildVector to build the vector up from GPRs.
+    const unsigned NumElements = Op.getNumOperands();
+    SmallVector<SDValue, SystemZ::VectorBytes> Elems(NumElements);
+    for (unsigned I = 0; I != NumElements; ++I)
+      Elems[I] = Op.getOperand(I);
+    return buildVector(DAG, DL, VT, Elems);
+  }
+
+  // Try using VECTOR GENERATE BYTE MASK.  This is the architecturally-
+  // preferred way of creating all-zero and all-one vectors so give it
+  // priority over other methods below.
+  uint64_t ByteMask = 0;
+  if (tryBuildVectorByteMask(BVN, ByteMask)) {
+    SDValue MaskImm = DAG.getConstant(ByteMask, DL, MVT::i32);
+    SDValue MaskVec =
+        DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, MaskImm);
+    return DAG.getNode(ISD::BITCAST, DL, VT, MaskVec);
+  }
+
+  // Try using some form of replication; otherwise fall back to loading the
+  // constant from memory.
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  const bool IsSplat = BVN->isConstantSplat(SplatBits, SplatUndef,
+                                            SplatBitSize, HasAnyUndefs, 8,
+                                            true);
+  if (!IsSplat || SplatBitSize > 64)
+    return SDValue();
+
+  const uint64_t Bits = SplatBits.getZExtValue();
+  const uint64_t Undef = SplatUndef.getZExtValue();
+
+  // First try assuming that any undefined bits above the highest set bit
+  // and below the lowest set bit are 1s.  This increases the likelihood of
+  // being able to use a sign-extended element value in VECTOR REPLICATE
+  // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
+  const uint64_t BelowLowestSet = (uint64_t(1) << findFirstSet(Bits)) - 1;
+  const uint64_t BelowHighestSet = (uint64_t(1) << findLastSet(Bits)) - 1;
+  const uint64_t Lower = Undef & BelowLowestSet;
+  const uint64_t Upper = Undef & ~BelowHighestSet;
+  if (SDValue Rep = tryBuildVectorReplicate(DAG, TII, DL, VT,
+                                            Bits | Upper | Lower,
+                                            SplatBitSize))
+    return Rep;
+
+  // Now try assuming that any undefined bits between the first and last
+  // defined set bits are set.  This increases the chances of using a
+  // non-wraparound mask.
+  const uint64_t Middle = Undef & ~Upper & ~Lower;
+  if (SDValue Rep = tryBuildVectorReplicate(DAG, TII, DL, VT, Bits | Middle,
+                                            SplatBitSize))
+    return Rep;
+
+  // Fall back to loading it from memory.
+  return SDValue();
+}
+
+// Custom-lower a VECTOR_SHUFFLE.  Splats become REPLICATE (when the scalar
+// is directly available) or SPLAT; every other shuffle is handed to
+// GeneralShuffle one result element at a time.
+SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  auto *VSN = cast<ShuffleVectorSDNode>(Op.getNode());
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  const unsigned NumElements = VT.getVectorNumElements();
+
+  if (!VSN->isSplat()) {
+    GeneralShuffle GS(VT);
+    for (unsigned I = 0; I != NumElements; ++I) {
+      const int Elt = VSN->getMaskElt(I);
+      if (Elt >= 0) {
+        // The mask index selects both the operand and the element in it.
+        const unsigned Src = unsigned(Elt);
+        GS.add(Op.getOperand(Src / NumElements), Src % NumElements);
+      } else {
+        GS.addUndef();
+      }
+    }
+    return GS.getNode(DAG, SDLoc(VSN));
+  }
+
+  SDValue Source = Op.getOperand(0);
+  unsigned Index = VSN->getSplatIndex();
+  assert(Index < VT.getVectorNumElements() &&
+         "Splat index should be defined and in first operand");
+
+  // See whether the value we're splatting is directly available as a scalar.
+  const unsigned SourceOpcode = Source.getOpcode();
+  const bool ScalarAvailable =
+      SourceOpcode == ISD::BUILD_VECTOR ||
+      (Index == 0 && SourceOpcode == ISD::SCALAR_TO_VECTOR);
+  if (ScalarAvailable)
+    return DAG.getNode(SystemZISD::REPLICATE, DL, VT,
+                       Source.getOperand(Index));
+
+  // Otherwise keep it as a vector-to-vector operation.
+  SDValue IndexImm = DAG.getConstant(Index, DL, MVT::i32);
+  return DAG.getNode(SystemZISD::SPLAT, DL, VT, Source, IndexImm);
+}
+
+// Custom-lower SCALAR_TO_VECTOR as an insertion of the scalar into element 0
+// of an undefined vector of the result type.
+SDValue SystemZTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue Base = DAG.getUNDEF(VT);
+  SDValue Scalar = Op.getOperand(0);
+  SDValue ElementZero = DAG.getConstant(0, DL, MVT::i32);
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Base, Scalar,
+                     ElementZero);
+}
+
+// Custom-lower INSERT_VECTOR_ELT; this handles insertions of floating-point
+// values.
+//
+// An insertion into a v2f64 at an in-range constant index is returned
+// unchanged, unless the inserted value is a bitcast or an FP constant.
+// Every other insertion is rewritten as the equivalent integer insertion,
+// with bitcasts on the vector, the scalar and the result, so that the value
+// goes via a GPR.
+SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Op0 = Op.getOperand(0); // vector being modified
+  SDValue Op1 = Op.getOperand(1); // scalar to insert
+  SDValue Op2 = Op.getOperand(2); // element index
+  EVT VT = Op.getValueType();
+
+  // Insertions into constant indices of a v2f64 can be done using VPDI.
+  // However, if the inserted value is a bitcast or a constant then it's
+  // better to use GPRs, as below.
+  if (VT == MVT::v2f64 &&
+      Op1.getOpcode() != ISD::BITCAST &&
+      Op1.getOpcode() != ISD::ConstantFP &&
+      Op2.getOpcode() == ISD::Constant) {
+    // The opcode test above establishes the node class, so use cast<>
+    // (which asserts on a mismatch) rather than dereferencing an unchecked
+    // dyn_cast<> result.
+    uint64_t Index = cast<ConstantSDNode>(Op2)->getZExtValue();
+    unsigned Mask = VT.getVectorNumElements() - 1;
+    if (Index <= Mask)
+      return Op;
+  }
+
+  // Otherwise bitcast to the equivalent integer form and insert via a GPR.
+  MVT IntVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+  MVT IntVecVT = MVT::getVectorVT(IntVT, VT.getVectorNumElements());
+  SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT,
+                            DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0),
+                            DAG.getNode(ISD::BITCAST, DL, IntVT, Op1), Op2);
+  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
+}
+
+// Custom-lower EXTRACT_VECTOR_ELT; this handles extractions of
+// floating-point values.  An extraction at an in-range constant index is
+// returned unchanged.  Anything else is rewritten as the equivalent integer
+// extraction, with the vector bitcast to integer lanes and the extracted
+// scalar bitcast back to the original result type.
+SDValue
+SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  EVT VecVT = Vec.getValueType();
+
+  // Extractions of constant indices can be done directly.
+  auto *CIndexN = dyn_cast<ConstantSDNode>(Idx);
+  if (CIndexN) {
+    const uint64_t Index = CIndexN->getZExtValue();
+    const unsigned LastIndex = VecVT.getVectorNumElements() - 1;
+    if (Index <= LastIndex)
+      return Op;
+  }
+
+  // Otherwise bitcast to the equivalent integer form and extract via a GPR.
+  MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
+  MVT IntVecVT = MVT::getVectorVT(IntVT, VecVT.getVectorNumElements());
+  SDValue IntVec = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Vec);
+  SDValue IntElt =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntVT, IntVec, Idx);
+  return DAG.getNode(ISD::BITCAST, DL, VT, IntElt);
+}
+
+// Custom-lower an in-register vector extension as a chain of UnpackHigh
+// nodes.  Each step doubles the element width, and steps are emitted until
+// the element width equals that of the result type.
+SDValue
+SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
+                                              unsigned UnpackHigh) const {
+  SDValue Widened = Op.getOperand(0);
+  const unsigned ToBits = Op.getValueType().getScalarSizeInBits();
+  unsigned CurBits = Widened.getValueType().getScalarSizeInBits();
+  do {
+    CurBits *= 2;
+    // Full-width integer vector type whose elements are CurBits wide.
+    EVT StepVT = MVT::getVectorVT(MVT::getIntegerVT(CurBits),
+                                  SystemZ::VectorBits / CurBits);
+    Widened = DAG.getNode(UnpackHigh, SDLoc(Widened), StepVT, Widened);
+  } while (CurBits != ToBits);
+  return Widened;
+}
+
+// Custom-lower a vector shift.  If the shift-amount vector is a splat whose
+// scalar value is directly available, emit the ByScalar (*_BY_SCALAR) form
+// of the shift; otherwise return Op unchanged, i.e. treat it as legal.
+SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
+                                          unsigned ByScalar) const {
+  SDValue Value = Op.getOperand(0);
+  SDValue Amount = Op.getOperand(1);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  const unsigned ElemBitSize = VT.getScalarSizeInBits();
+
+  // Emit the by-scalar shift for an amount held in a scalar.  Since i32 is
+  // the smallest legal type, the truncation is either a no-op or a real
+  // truncation.
+  auto shiftByGPR = [&](SDValue Scalar) {
+    SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Scalar);
+    return DAG.getNode(ByScalar, DL, VT, Value, Shift);
+  };
+
+  // See whether the shift vector is a splat represented as BUILD_VECTOR.
+  if (auto *BVN = dyn_cast<BuildVectorSDNode>(Amount)) {
+    APInt SplatBits, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    // Check for constant splats.  Use ElemBitSize as the minimum element
+    // width and reject splats that need wider elements.
+    const bool IsConstSplat =
+        BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                             HasAnyUndefs, ElemBitSize, true);
+    if (IsConstSplat && SplatBitSize == ElemBitSize) {
+      SDValue Shift =
+          DAG.getConstant(SplatBits.getZExtValue() & 0xfff, DL, MVT::i32);
+      return DAG.getNode(ByScalar, DL, VT, Value, Shift);
+    }
+    // Check for variable splats.
+    BitVector UndefElements;
+    if (SDValue Splat = BVN->getSplatValue(&UndefElements))
+      return shiftByGPR(Splat);
+  }
+
+  // See whether the shift vector is a splat represented as SHUFFLE_VECTOR,
+  // and the shift amount is directly available in a GPR.
+  auto *VSN = dyn_cast<ShuffleVectorSDNode>(Amount);
+  if (VSN && VSN->isSplat()) {
+    SDValue Source = VSN->getOperand(0);
+    unsigned Index = VSN->getSplatIndex();
+    assert(Index < VT.getVectorNumElements() &&
+           "Splat index should be defined and in first operand");
+    const unsigned SourceOpcode = Source.getOpcode();
+    if (SourceOpcode == ISD::BUILD_VECTOR ||
+        (Index == 0 && SourceOpcode == ISD::SCALAR_TO_VECTOR))
+      return shiftByGPR(Source.getOperand(Index));
+  }
+
+  // Otherwise just treat the current form as legal.
+  return Op;
+}
+
+SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) { // dispatch on the node's opcode to the matching lower* helper
+ case ISD::FRAMEADDR:
+ return lowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR:
+ return lowerRETURNADDR(Op, DAG);
+ case ISD::BR_CC:
+ return lowerBR_CC(Op, DAG);
+ case ISD::SELECT_CC:
+ return lowerSELECT_CC(Op, DAG);
+ case ISD::SETCC:
+ return lowerSETCC(Op, DAG);
+ case ISD::GlobalAddress:
+ return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
+ case ISD::GlobalTLSAddress:
+ return lowerGlobalTLSAddress(cast<GlobalAddressSDNode>(Op), DAG);
+ case ISD::BlockAddress:
+ return lowerBlockAddress(cast<BlockAddressSDNode>(Op), DAG);
+ case ISD::JumpTable:
+ return lowerJumpTable(cast<JumpTableSDNode>(Op), DAG);
+ case ISD::ConstantPool:
+ return lowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG);
+ case ISD::BITCAST:
+ return lowerBITCAST(Op, DAG);
+ case ISD::VASTART:
+ return lowerVASTART(Op, DAG);
+ case ISD::VACOPY:
+ return lowerVACOPY(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return lowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::GET_DYNAMIC_AREA_OFFSET:
+ return lowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
+ case ISD::SMUL_LOHI:
+ return lowerSMUL_LOHI(Op, DAG);
+ case ISD::UMUL_LOHI:
+ return lowerUMUL_LOHI(Op, DAG);
+ case ISD::SDIVREM:
+ return lowerSDIVREM(Op, DAG);
+ case ISD::UDIVREM:
+ return lowerUDIVREM(Op, DAG);
+ case ISD::OR:
+ return lowerOR(Op, DAG);
+ case ISD::CTPOP:
+ return lowerCTPOP(Op, DAG);
+ case ISD::ATOMIC_FENCE:
+ return lowerATOMIC_FENCE(Op, DAG);
+ case ISD::ATOMIC_SWAP:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW); // shared helper, parameterized by the SystemZISD opcode
+ case ISD::ATOMIC_STORE:
+ return lowerATOMIC_STORE(Op, DAG);
+ case ISD::ATOMIC_LOAD:
+ return lowerATOMIC_LOAD(Op, DAG);
+ case ISD::ATOMIC_LOAD_ADD:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
+ case ISD::ATOMIC_LOAD_SUB:
+ return lowerATOMIC_LOAD_SUB(Op, DAG); // has its own helper, unlike the other ATOMIC_LOAD_* cases
+ case ISD::ATOMIC_LOAD_AND:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
+ case ISD::ATOMIC_LOAD_OR:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
+ case ISD::ATOMIC_LOAD_XOR:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
+ case ISD::ATOMIC_LOAD_NAND:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
+ case ISD::ATOMIC_LOAD_MIN:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
+ case ISD::ATOMIC_LOAD_MAX:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
+ case ISD::ATOMIC_LOAD_UMIN:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
+ case ISD::ATOMIC_LOAD_UMAX:
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
+ case ISD::ATOMIC_CMP_SWAP:
+ return lowerATOMIC_CMP_SWAP(Op, DAG);
+ case ISD::STACKSAVE:
+ return lowerSTACKSAVE(Op, DAG);
+ case ISD::STACKRESTORE:
+ return lowerSTACKRESTORE(Op, DAG);
+ case ISD::PREFETCH:
+ return lowerPREFETCH(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN:
+ return lowerINTRINSIC_W_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return lowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return lowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE:
+ return lowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR:
+ return lowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT:
+ return lowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH); // sign and zero extension share one helper; only the unpack opcode differs
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
+ case ISD::SHL:
+ return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR); // the three shifts share one helper; only the *_BY_SCALAR opcode differs
+ case ISD::SRL:
+ return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR);
+ case ISD::SRA:
+ return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR);
+ default:
+ llvm_unreachable("Unexpected node to lower"); // reached only for an opcode that has no case above
+ }
+}
+
+const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
+ switch ((SystemZISD::NodeType)Opcode) { // each OPCODE(X) below expands to a case returning the string "SystemZISD::X"
+ case SystemZISD::FIRST_NUMBER: break; // has no name of its own; falls out to the nullptr return
+ OPCODE(RET_FLAG);
+ OPCODE(CALL);
+ OPCODE(SIBCALL);
+ OPCODE(TLS_GDCALL);
+ OPCODE(TLS_LDCALL);
+ OPCODE(PCREL_WRAPPER);
+ OPCODE(PCREL_OFFSET);
+ OPCODE(IABS);
+ OPCODE(ICMP);
+ OPCODE(FCMP);
+ OPCODE(TM);
+ OPCODE(BR_CCMASK);
+ OPCODE(SELECT_CCMASK);
+ OPCODE(ADJDYNALLOC);
+ OPCODE(POPCNT);
+ OPCODE(UMUL_LOHI64);
+ OPCODE(SDIVREM32);
+ OPCODE(SDIVREM64);
+ OPCODE(UDIVREM32);
+ OPCODE(UDIVREM64);
+ OPCODE(MVC);
+ OPCODE(MVC_LOOP);
+ OPCODE(NC);
+ OPCODE(NC_LOOP);
+ OPCODE(OC);
+ OPCODE(OC_LOOP);
+ OPCODE(XC);
+ OPCODE(XC_LOOP);
+ OPCODE(CLC);
+ OPCODE(CLC_LOOP);
+ OPCODE(STPCPY);
+ OPCODE(STRCMP);
+ OPCODE(SEARCH_STRING);
+ OPCODE(IPM);
+ OPCODE(SERIALIZE);
+ OPCODE(MEMBARRIER);
+ OPCODE(TBEGIN);
+ OPCODE(TBEGIN_NOFLOAT);
+ OPCODE(TEND);
+ OPCODE(BYTE_MASK);
+ OPCODE(ROTATE_MASK);
+ OPCODE(REPLICATE);
+ OPCODE(JOIN_DWORDS);
+ OPCODE(SPLAT);
+ OPCODE(MERGE_HIGH);
+ OPCODE(MERGE_LOW);
+ OPCODE(SHL_DOUBLE);
+ OPCODE(PERMUTE_DWORDS);
+ OPCODE(PERMUTE);
+ OPCODE(PACK);
+ OPCODE(PACKS_CC);
+ OPCODE(PACKLS_CC);
+ OPCODE(UNPACK_HIGH);
+ OPCODE(UNPACKL_HIGH);
+ OPCODE(UNPACK_LOW);
+ OPCODE(UNPACKL_LOW);
+ OPCODE(VSHL_BY_SCALAR);
+ OPCODE(VSRL_BY_SCALAR);
+ OPCODE(VSRA_BY_SCALAR);
+ OPCODE(VSUM);
+ OPCODE(VICMPE);
+ OPCODE(VICMPH);
+ OPCODE(VICMPHL);
+ OPCODE(VICMPES);
+ OPCODE(VICMPHS);
+ OPCODE(VICMPHLS);
+ OPCODE(VFCMPE);
+ OPCODE(VFCMPH);
+ OPCODE(VFCMPHE);
+ OPCODE(VFCMPES);
+ OPCODE(VFCMPHS);
+ OPCODE(VFCMPHES);
+ OPCODE(VFTCI);
+ OPCODE(VEXTEND);
+ OPCODE(VROUND);
+ OPCODE(VTM);
+ OPCODE(VFAE_CC);
+ OPCODE(VFAEZ_CC);
+ OPCODE(VFEE_CC);
+ OPCODE(VFEEZ_CC);
+ OPCODE(VFENE_CC);
+ OPCODE(VFENEZ_CC);
+ OPCODE(VISTR_CC);
+ OPCODE(VSTRC_CC);
+ OPCODE(VSTRCZ_CC);
+ OPCODE(TDC);
+ OPCODE(ATOMIC_SWAPW);
+ OPCODE(ATOMIC_LOADW_ADD);
+ OPCODE(ATOMIC_LOADW_SUB);
+ OPCODE(ATOMIC_LOADW_AND);
+ OPCODE(ATOMIC_LOADW_OR);
+ OPCODE(ATOMIC_LOADW_XOR);
+ OPCODE(ATOMIC_LOADW_NAND);
+ OPCODE(ATOMIC_LOADW_MIN);
+ OPCODE(ATOMIC_LOADW_MAX);
+ OPCODE(ATOMIC_LOADW_UMIN);
+ OPCODE(ATOMIC_LOADW_UMAX);
+ OPCODE(ATOMIC_CMP_SWAPW);
+ OPCODE(LRV);
+ OPCODE(STRV);
+ OPCODE(PREFETCH);
+ }
+ return nullptr; // FIRST_NUMBER, or a value with no OPCODE entry in the switch
+#undef OPCODE
+}
+
+// Return true if VT is a vector type whose element width is a multiple of
+// 8 bits, i.e. each element occupies a whole number of bytes.
+static bool canTreatAsByteVector(EVT VT) {
+  if (!VT.isVector())
+    return false;
+  const unsigned ElementBits = VT.getScalarSizeInBits();
+  return ElementBits % 8 == 0;
+}
+
+// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
+// producing a result of type ResVT. Op is a possibly bitcast version
+// of the input vector and Index is the element index (counted in VecVT
+// elements) that should be extracted. Returns the new extraction if a
+// simplification was possible or if Force is true, else an empty SDValue.
+SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
+ EVT VecVT, SDValue Op,
+ unsigned Index,
+ DAGCombinerInfo &DCI,
+ bool Force) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // The number of bytes being extracted.
+ unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
+
+ for (;;) { // Each iteration looks through one producer of Op; stops when none matches.
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::BITCAST)
+ // Look through bitcasts.
+ Op = Op.getOperand(0);
+ else if (Opcode == ISD::VECTOR_SHUFFLE &&
+ canTreatAsByteVector(Op.getValueType())) {
+ // Get a VPERM-like permute mask and see whether the bytes covered
+ // by the extracted element are a contiguous sequence from one
+ // source operand.
+ SmallVector<int, SystemZ::VectorBytes> Bytes;
+ getVPermMask(cast<ShuffleVectorSDNode>(Op), Bytes);
+ int First;
+ if (!getShuffleInput(Bytes, Index * BytesPerElement,
+ BytesPerElement, First))
+ break;
+ if (First < 0) // Negative First is folded to undef (presumably every selected byte is undefined).
+ return DAG.getUNDEF(ResVT);
+ // Make sure the contiguous sequence starts at a multiple of the
+ // original element size.
+ unsigned Byte = unsigned(First) % Bytes.size(); // Byte offset within the chosen shuffle operand.
+ if (Byte % BytesPerElement != 0)
+ break;
+ // We can get the extracted value directly from an input.
+ Index = Byte / BytesPerElement;
+ Op = Op.getOperand(unsigned(First) / Bytes.size()); // First / Bytes.size() picks the shuffle operand.
+ Force = true; // Op and Index changed, so a new extraction must be emitted.
+ } else if (Opcode == ISD::BUILD_VECTOR &&
+ canTreatAsByteVector(Op.getValueType())) {
+ // We can only optimize this case if the BUILD_VECTOR elements are
+ // at least as wide as the extracted value.
+ EVT OpVT = Op.getValueType();
+ unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
+ if (OpBytesPerElement < BytesPerElement)
+ break;
+ // Make sure that the least-significant bit of the extracted value
+ // is the least significant bit of an input.
+ unsigned End = (Index + 1) * BytesPerElement; // One past the last byte of the extracted element.
+ if (End % OpBytesPerElement != 0)
+ break;
+ // We're extracting the low part of one operand of the BUILD_VECTOR.
+ Op = Op.getOperand(End / OpBytesPerElement - 1);
+ if (!Op.getValueType().isInteger()) { // TRUNCATE below needs an integer input.
+ EVT VT = MVT::getIntegerVT(Op.getValueSizeInBits());
+ Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ DCI.AddToWorklist(Op.getNode());
+ }
+ EVT VT = MVT::getIntegerVT(ResVT.getSizeInBits());
+ Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+ if (VT != ResVT) { // Convert the truncated integer to a non-integer ResVT.
+ DCI.AddToWorklist(Op.getNode());
+ Op = DAG.getNode(ISD::BITCAST, DL, ResVT, Op);
+ }
+ return Op;
+ } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
+ Opcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
+ canTreatAsByteVector(Op.getValueType()) &&
+ canTreatAsByteVector(Op.getOperand(0).getValueType())) {
+ // Make sure that only the unextended bits are significant.
+ EVT ExtVT = Op.getValueType();
+ EVT OpVT = Op.getOperand(0).getValueType();
+ unsigned ExtBytesPerElement = ExtVT.getVectorElementType().getStoreSize();
+ unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
+ unsigned Byte = Index * BytesPerElement;
+ unsigned SubByte = Byte % ExtBytesPerElement; // Offset inside the extended element.
+ unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement; // First byte that holds unextended data.
+ if (SubByte < MinSubByte ||
+ SubByte + BytesPerElement > ExtBytesPerElement)
+ break;
+ // Get the byte offset of the unextended element
+ Byte = Byte / ExtBytesPerElement * OpBytesPerElement;
+ // ...then add the byte offset relative to that element.
+ Byte += SubByte - MinSubByte;
+ if (Byte % BytesPerElement != 0)
+ break;
+ Op = Op.getOperand(0);
+ Index = Byte / BytesPerElement;
+ Force = true; // Op and Index changed, so a new extraction must be emitted.
+ } else
+ break;
+ }
+ if (Force) { // Emit an extract of element Index from Op, bitcast to VecVT if needed.
+ if (Op.getValueType() != VecVT) {
+ Op = DAG.getNode(ISD::BITCAST, DL, VecVT, Op);
+ DCI.AddToWorklist(Op.getNode());
+ }
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op,
+ DAG.getConstant(Index, DL, MVT::i32));
+ }
+ return SDValue(); // Nothing was simplified and no extraction was forced.
+}
+
+// Simplify the vector operations that compute scalar value Op, given that
+// Op is about to be truncated to TruncVT.  Returns the replacement value,
+// or an empty SDValue when nothing could be done.
+SDValue SystemZTargetLowering::combineTruncateExtract(
+    const SDLoc &DL, EVT TruncVT, SDValue Op, DAGCombinerInfo &DCI) const {
+  // The handled pattern is (trunc (extract_vector_elt X, Y)), which becomes
+  // (extract_vector_elt (bitcast X), Y') where the elements of (bitcast X)
+  // have type TruncVT.
+  if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      TruncVT.getSizeInBits() % 8 != 0)
+    return SDValue();
+
+  SDValue Vec = Op.getOperand(0);
+  EVT WideVecVT = Vec.getValueType();
+  if (!canTreatAsByteVector(WideVecVT))
+    return SDValue();
+
+  // Only constant element indices can be rescaled.
+  auto *IndexN = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!IndexN)
+    return SDValue();
+
+  unsigned BytesPerElement = WideVecVT.getVectorElementType().getStoreSize();
+  unsigned TruncBytes = TruncVT.getStoreSize();
+  if (BytesPerElement % TruncBytes != 0)
+    return SDValue();
+
+  // Compute Y'.  Every original element is cut into Scale equal pieces and
+  // the truncation wants the last (least-significant) piece of element
+  // IndexN: take the first piece of the following element and step back one.
+  unsigned Scale = BytesPerElement / TruncBytes;
+  unsigned NewIndex = (IndexN->getZExtValue() + 1) * Scale - 1;
+
+  // Leave the creation of the bitcast of X to combineExtract, which might
+  // be able to optimize the extraction further.
+  EVT NarrowVecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
+                                     WideVecVT.getStoreSize() / TruncBytes);
+  EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT);
+  return combineExtract(DL, ResVT, NarrowVecVT, Vec, NewIndex, DCI, true);
+}
+
+SDValue SystemZTargetLowering::combineSIGN_EXTEND(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  // Rewrite (sext (ashr (shl X, C1), C2)) as
+  // (ashr (shl (anyext X), C1'), C2'), since wider shifts are as cheap as
+  // narrower ones.  Any other input yields an empty SDValue.
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Sra = N->getOperand(0);
+  EVT WideVT = N->getValueType(0);
+
+  // The extended value must be a single-use SRA by a constant...
+  if (!Sra.hasOneUse() || Sra.getOpcode() != ISD::SRA)
+    return SDValue();
+  auto *SraAmt = dyn_cast<ConstantSDNode>(Sra.getOperand(1));
+
+  // ...whose input is a single-use SHL by a constant.
+  SDValue Shl = Sra.getOperand(0);
+  if (!SraAmt || !Shl.hasOneUse() || Shl.getOpcode() != ISD::SHL)
+    return SDValue();
+  auto *ShlAmt = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
+  if (!ShlAmt)
+    return SDValue();
+
+  // Both shift amounts grow by the number of bits the extension adds.
+  unsigned Extra = (WideVT.getSizeInBits() - Sra.getValueSizeInBits());
+  unsigned WideShlAmt = ShlAmt->getZExtValue() + Extra;
+  unsigned WideSraAmt = SraAmt->getZExtValue() + Extra;
+  EVT ShiftVT = Sra.getOperand(1).getValueType();
+
+  SDLoc ShlDL(Shl);
+  SDLoc SraDL(Sra);
+  SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, ShlDL, WideVT, Shl.getOperand(0));
+  SDValue WideShlAmtC = DAG.getConstant(WideShlAmt, ShlDL, ShiftVT);
+  SDValue WideShl = DAG.getNode(ISD::SHL, ShlDL, WideVT, Ext, WideShlAmtC);
+  SDValue WideSraAmtC = DAG.getConstant(WideSraAmt, SraDL, ShiftVT);
+  return DAG.getNode(ISD::SRA, SraDL, WideVT, WideShl, WideSraAmtC);
+}
+
+// Combine a MERGE_HIGH/MERGE_LOW node whose first operand is an all-zero
+// BYTE_MASK (possibly seen through one bitcast).  Returns an empty SDValue
+// if no rewrite applies.
+SDValue SystemZTargetLowering::combineMERGE(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op1 = N->getOperand(1);
+
+  // Look through a single bitcast on the first operand.
+  SDValue Op0 = N->getOperand(0);
+  if (Op0.getOpcode() == ISD::BITCAST)
+    Op0 = Op0.getOperand(0);
+
+  bool FirstIsZero =
+      Op0.getOpcode() == SystemZISD::BYTE_MASK &&
+      cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0;
+  if (!FirstIsZero)
+    return SDValue();
+
+  // (z_merge_* 0, 0) -> 0.  This is mostly useful for using VLLEZF
+  // for v4f32.
+  if (Op1 == N->getOperand(0))
+    return Op1;
+
+  // (z_merge_? 0, X) -> (z_unpackl_? 0, X).
+  EVT VT = Op1.getValueType();
+  unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
+  if (ElemBytes > 4)
+    return SDValue();
+
+  SDLoc DL(N);
+  unsigned UnpackOpcode = (N->getOpcode() == SystemZISD::MERGE_HIGH ?
+                           SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
+  EVT InVT = VT.changeVectorElementTypeToInteger();
+  EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
+                               SystemZ::VectorBytes / ElemBytes / 2);
+
+  // The unpack works on integer elements, so convert the input if needed.
+  if (VT != InVT) {
+    Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);
+    DCI.AddToWorklist(Op1.getNode());
+  }
+  SDValue Unpacked = DAG.getNode(UnpackOpcode, DL, OutVT, Op1);
+  DCI.AddToWorklist(Unpacked.getNode());
+  return DAG.getNode(ISD::BITCAST, DL, VT, Unpacked);
+}
+
+// Combine a STORE node.  Two rewrites are attempted in order: narrowing a
+// truncating store of an extracted vector element, and folding a stored
+// BSWAP into a byte-reversing store.
+SDValue SystemZTargetLowering::combineSTORE(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  auto *SN = cast<StoreSDNode>(N);
+  const SDValue &StoredVal = N->getOperand(1);
+  EVT MemVT = SN->getMemoryVT();
+
+  // For (truncstoreiN (extract_vector_elt X, Y), Z) it is better to do the
+  // extraction on a vMiN value, so that VSTE can be used.  If X has wider
+  // elements, convert it to:
+  // (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
+  if (MemVT.isInteger()) {
+    SDValue Value =
+        combineTruncateExtract(SDLoc(N), MemVT, SN->getValue(), DCI);
+    if (Value) {
+      DCI.AddToWorklist(Value.getNode());
+
+      // Re-emit the store with the new form of the stored value.
+      return DAG.getTruncStore(SN->getChain(), SDLoc(SN), Value,
+                               SN->getBasePtr(), SN->getMemoryVT(),
+                               SN->getMemOperand());
+    }
+  }
+
+  // Combine STORE (BSWAP) into STRVH/STRV/STRVG.
+  // See comment in combineBSWAP about volatile accesses.
+  if (SN->isVolatile() || StoredVal.getOpcode() != ISD::BSWAP ||
+      !StoredVal.getNode()->hasOneUse())
+    return SDValue();
+  bool IsSwappableVT = StoredVal.getValueType() == MVT::i16 ||
+                       StoredVal.getValueType() == MVT::i32 ||
+                       StoredVal.getValueType() == MVT::i64;
+  if (!IsSwappableVT)
+    return SDValue();
+
+  // An i16 source is any-extended to i32 before being handed to STRV.
+  SDValue BSwapOp = StoredVal.getOperand(0);
+  if (BSwapOp.getValueType() == MVT::i16)
+    BSwapOp = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), MVT::i32, BSwapOp);
+
+  SDValue Ops[] = {
+    N->getOperand(0), BSwapOp, N->getOperand(2),
+    DAG.getValueType(StoredVal.getValueType())
+  };
+  return DAG.getMemIntrinsicNode(SystemZISD::STRV, SDLoc(N),
+                                 DAG.getVTList(MVT::Other), Ops, MemVT,
+                                 SN->getMemOperand());
+}
+
+// Combine an EXTRACT_VECTOR_ELT node.  Only extractions with a constant
+// index are handled; the work is delegated to combineExtract without
+// forcing a new extraction.
+SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!IndexN)
+    return SDValue();
+
+  SDValue Vec = N->getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  return combineExtract(SDLoc(N), N->getValueType(0), VecVT, Vec,
+                        IndexN->getZExtValue(), DCI, false);
+}
+
+// Combine a JOIN_DWORDS node: (join_dwords X, X) == (replicate X).
+SDValue SystemZTargetLowering::combineJOIN_DWORDS(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  // Joining two different values cannot be expressed as a replicate.
+  if (N->getOperand(0) != N->getOperand(1))
+    return SDValue();
+  return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
+                     N->getOperand(0));
+}
+
+SDValue SystemZTargetLowering::combineFP_ROUND(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ // (fpround (extract_vector_elt X 0))
+ // (fpround (extract_vector_elt X 1)) ->
+ // (extract_vector_elt (VROUND X) 0)
+ // (extract_vector_elt (VROUND X) 2), matching the lane indices used below.
+ //
+ // This is a special case since the target doesn't really support v2f32s.
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Op0 = N->getOperand(0);
+ if (N->getValueType(0) == MVT::f32 &&
+ Op0.hasOneUse() &&
+ Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0).getValueType() == MVT::v2f64 &&
+ Op0.getOperand(1).getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) { // N rounds element 0 of a v2f64 to f32.
+ SDValue Vec = Op0.getOperand(0);
+ for (auto *U : Vec->uses()) { // Look for a sibling single-use extract of element 1.
+ if (U != Op0.getNode() &&
+ U->hasOneUse() &&
+ U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ U->getOperand(0) == Vec &&
+ U->getOperand(1).getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
+ SDValue OtherRound = SDValue(*U->use_begin(), 0); // The only user of the element-1 extract.
+ if (OtherRound.getOpcode() == ISD::FP_ROUND &&
+ OtherRound.getOperand(0) == SDValue(U, 0) &&
+ OtherRound.getValueType() == MVT::f32) {
+ SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
+ MVT::v4f32, Vec);
+ DCI.AddToWorklist(VRound.getNode());
+ SDValue Extract1 =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
+ VRound, DAG.getConstant(2, SDLoc(U), MVT::i32)); // Rounded element 1 is read from lane 2 of the v4f32.
+ DCI.AddToWorklist(Extract1.getNode());
+ DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1); // The other FP_ROUND is rewritten here; N is replaced via the return value.
+ SDValue Extract0 =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
+ VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
+ return Extract0;
+ }
+ }
+ }
+ }
+ return SDValue(); // Pattern not matched; leave N alone.
+}
+
+// Combine BSWAP (LOAD) into LRVH/LRV/LRVG.
+SDValue SystemZTargetLowering::combineBSWAP(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Load = N->getOperand(0);
+
+  // The swapped value must be a single-use, non-extending load of a
+  // supported integer width.
+  if (!ISD::isNON_EXTLoad(Load.getNode()) || !Load.hasOneUse())
+    return SDValue();
+  bool IsSwappableVT = N->getValueType(0) == MVT::i16 ||
+                       N->getValueType(0) == MVT::i32 ||
+                       N->getValueType(0) == MVT::i64;
+  if (!IsSwappableVT)
+    return SDValue();
+
+  // These loads are allowed to access memory multiple times, and so the
+  // combine must not be performed on volatile loads.
+  LoadSDNode *LD = cast<LoadSDNode>(Load);
+  if (LD->isVolatile())
+    return SDValue();
+
+  // Create the byte-swapping load.
+  SDValue Ops[] = {
+    LD->getChain(),                        // Chain
+    LD->getBasePtr(),                      // Ptr
+    DAG.getValueType(N->getValueType(0))   // VT
+  };
+  SDValue BSLoad =
+      DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
+                              DAG.getVTList(N->getValueType(0) == MVT::i64 ?
+                                            MVT::i64 : MVT::i32, MVT::Other),
+                              Ops, LD->getMemoryVT(), LD->getMemOperand());
+
+  // An i16 result is produced as i32 by the load, so truncate it.
+  SDValue ResVal = BSLoad;
+  if (N->getValueType(0) == MVT::i16)
+    ResVal = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16, BSLoad);
+
+  // Replace the bswap first.  This makes the value produced by the original
+  // load dead.
+  DCI.CombineTo(N, ResVal);
+
+  // Then replace the load: its value result gets a bogus replacement (it is
+  // dead because the bswap is dead) but its chain result gets the real chain.
+  DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
+
+  // Return N so it doesn't get rechecked!
+  return SDValue(N, 0);
+}
+
+// Combine a shift or rotate whose amount operand is an AND with a constant.
+//
+// Shift/rotate instructions only use the last 6 bits of the second operand
+// register.  If the amount is ANDed with an immediate that has its last 6
+// bits set, the AND can safely be removed.
+//
+// If the mask doesn't have the last 6 bits set, the AND can't be removed
+// entirely, but the mask can still be truncated to a 16-bit value.  This
+// prevents ending up with a NILL with a signed operand, which will cause
+// the instruction printer to abort.
+SDValue SystemZTargetLowering::combineSHIFTROT(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() != ISD::AND)
+    return SDValue();
+
+  // Only constant AND masks are handled.
+  SDValue AndMaskOp = N1->getOperand(1);
+  auto *AndMask = dyn_cast<ConstantSDNode>(AndMaskOp);
+  if (!AndMask)
+    return SDValue();
+
+  auto AmtVal = AndMask->getZExtValue();
+  SDValue AndOp = N1->getOperand(0);
+
+  // Bottom 6 bits are set: the AND is redundant for this shift.
+  if ((AmtVal & 0x3f) == 0x3f) {
+    if (N1.hasOneUse()) {
+      // This is the only use, so combine the AND away.
+      DCI.CombineTo(N1.getNode(), AndOp);
+
+      // Return N so it isn't rechecked.
+      return SDValue(N, 0);
+    }
+
+    // The AND has other users, so create a new node for this one use.
+    SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N),
+                                  N->getValueType(0), N->getOperand(0),
+                                  AndOp);
+    DCI.AddToWorklist(Replace.getNode());
+    return Replace;
+  }
+
+  // Masks that already fit in 16 bits need no change.
+  if (AmtVal >> 16 == 0)
+    return SDValue();
+
+  // The AND can't be removed, but NILL can be used here (normally NILF would
+  // be used).  Only keep the last 16 bits of the mask.  The actual
+  // transformation will be handled by .td definitions.
+  auto NewMask = DAG.getConstant(AndMask->getZExtValue() & 0x0000ffff,
+                                 SDLoc(AndMaskOp),
+                                 AndMaskOp.getValueType());
+  auto NewAnd = DAG.getNode(N1.getOpcode(), SDLoc(N1), N1.getValueType(),
+                            AndOp, NewMask);
+  SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N),
+                                N->getValueType(0), N->getOperand(0),
+                                NewAnd);
+  DCI.AddToWorklist(Replace.getNode());
+  return Replace;
+}
+
+// Dispatch node N to the combine routine for its opcode.  Opcodes without
+// a dedicated routine yield an empty SDValue.
+SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  case ISD::SIGN_EXTEND:
+    return combineSIGN_EXTEND(N, DCI);
+  case SystemZISD::MERGE_HIGH:
+  case SystemZISD::MERGE_LOW:
+    return combineMERGE(N, DCI);
+  case ISD::STORE:
+    return combineSTORE(N, DCI);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return combineEXTRACT_VECTOR_ELT(N, DCI);
+  case SystemZISD::JOIN_DWORDS:
+    return combineJOIN_DWORDS(N, DCI);
+  case ISD::FP_ROUND:
+    return combineFP_ROUND(N, DCI);
+  case ISD::BSWAP:
+    return combineBSWAP(N, DCI);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+    return combineSHIFTROT(N, DCI);
+  default:
+    return SDValue();
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Custom insertion
+//===----------------------------------------------------------------------===//
+
+// Create a new, empty machine basic block for the same IR block as MBB and
+// insert it into the function immediately after MBB.  Returns the new block.
+static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+  MachineFunction::iterator InsertPos =
+      std::next(MachineFunction::iterator(MBB));
+  MF.insert(InsertPos, NewMBB);
+  return NewMBB;
+}
+
+// Split MBB after MI.  The instructions following MI, together with MBB's
+// successors, move to a new block placed right after MBB; that new block is
+// returned.
+static MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI,
+                                          MachineBasicBlock *MBB) {
+  MachineBasicBlock *TailMBB = emitBlockAfter(MBB);
+  MachineBasicBlock::iterator FirstMoved =
+      std::next(MachineBasicBlock::iterator(MI));
+  TailMBB->splice(TailMBB->begin(), MBB, FirstMoved, MBB->end());
+  TailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return TailMBB;
+}
+
+// Split MBB before MI.  MI and everything after it, together with MBB's
+// successors, move to a new block placed right after MBB; that new block
+// (the one that contains MI) is returned.
+static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
+                                           MachineBasicBlock *MBB) {
+  MachineBasicBlock *TailMBB = emitBlockAfter(MBB);
+  MachineBasicBlock::iterator InsertPos = TailMBB->begin();
+  TailMBB->splice(InsertPos, MBB, MI, MBB->end());
+  TailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return TailMBB;
+}
+
+// Make sure base value Base is available in a register before MI and
+// return that register.  A Base that is already a register is returned
+// as-is; otherwise an LA of Base into a fresh ADDR64 virtual register is
+// inserted before MI.
+static unsigned forceReg(MachineInstr &MI, MachineOperand &Base,
+                         const SystemZInstrInfo *TII) {
+  if (!Base.isReg()) {
+    MachineBasicBlock *MBB = MI.getParent();
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+    unsigned AddrReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+    BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), AddrReg)
+        .addOperand(Base)
+        .addImm(0)
+        .addReg(0);
+    return AddrReg;
+  }
+  return Base.getReg();
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
+// If LOCROpcode is nonzero and the subtarget has load/store-on-condition,
+// MI is replaced in place by that instruction.  Otherwise MBB is split and
+// a branch-plus-PHI diamond is emitted.  Returns the block that contains
+// the code following MI.
+MachineBasicBlock *
+SystemZTargetLowering::emitSelect(MachineInstr &MI,
+                                  MachineBasicBlock *MBB,
+                                  unsigned LOCROpcode) const {
+  const auto *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+
+  // Operands: dest, true value, false value, CC valid mask, CC mask.
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned TrueReg = MI.getOperand(1).getReg();
+  unsigned FalseReg = MI.getOperand(2).getReg();
+  unsigned CCValid = MI.getOperand(3).getImm();
+  unsigned CCMask = MI.getOperand(4).getImm();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Use LOCROpcode if possible.
+  bool UseLOCR = LOCROpcode && Subtarget.hasLoadStoreOnCond();
+  if (UseLOCR) {
+    BuildMI(*MBB, MI, DL, TII->get(LOCROpcode), DestReg)
+        .addReg(FalseReg)
+        .addReg(TrueReg)
+        .addImm(CCValid)
+        .addImm(CCMask);
+    MI.eraseFromParent();
+    return MBB;
+  }
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   BRC CCMask, JoinMBB
+  //   # fallthrough to FalseMBB
+  BuildMI(StartMBB, DL, TII->get(SystemZ::BRC))
+      .addImm(CCValid)
+      .addImm(CCMask)
+      .addMBB(JoinMBB);
+  StartMBB->addSuccessor(JoinMBB);
+  StartMBB->addSuccessor(FalseMBB);
+
+  //  FalseMBB:
+  //   # fallthrough to JoinMBB
+  FalseMBB->addSuccessor(JoinMBB);
+
+  //  JoinMBB:
+  //   %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
+  //  ...
+  BuildMI(*JoinMBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
+      .addReg(TrueReg)
+      .addMBB(StartMBB)
+      .addReg(FalseReg)
+      .addMBB(FalseMBB);
+
+  MI.eraseFromParent();
+  return JoinMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo CondStore* instruction MI.
+// StoreOpcode is the plain store to use and Invert says whether the store
+// should happen when the condition is false rather than true.  STOCOpcode
+// is the opcode of the matching STORE ON CONDITION, or 0 if there is none.
+// Returns the block that contains the code following MI.
+MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
+                                                        MachineBasicBlock *MBB,
+                                                        unsigned StoreOpcode,
+                                                        unsigned STOCOpcode,
+                                                        bool Invert) const {
+  const auto *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+
+  // Operands: source, base, displacement, index, CC valid mask, CC mask.
+  unsigned SrcReg = MI.getOperand(0).getReg();
+  MachineOperand Base = MI.getOperand(1);
+  int64_t Disp = MI.getOperand(2).getImm();
+  unsigned IndexReg = MI.getOperand(3).getReg();
+  unsigned CCValid = MI.getOperand(4).getImm();
+  unsigned CCMask = MI.getOperand(5).getImm();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Pick the form of the store that can encode this displacement.
+  StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
+
+  // Use STOCOpcode if possible.  Different store patterns could be used to
+  // avoid matching the index register, but the performance trade-offs might
+  // be more complicated in that case.
+  bool UseSTOC = STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond();
+  if (UseSTOC) {
+    if (Invert)
+      CCMask ^= CCValid;
+    BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
+        .addReg(SrcReg)
+        .addOperand(Base)
+        .addImm(Disp)
+        .addImm(CCValid)
+        .addImm(CCMask);
+    MI.eraseFromParent();
+    return MBB;
+  }
+
+  // Get the condition needed to branch around the store.
+  if (!Invert)
+    CCMask ^= CCValid;
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   BRC CCMask, JoinMBB
+  //   # fallthrough to FalseMBB
+  BuildMI(StartMBB, DL, TII->get(SystemZ::BRC))
+      .addImm(CCValid)
+      .addImm(CCMask)
+      .addMBB(JoinMBB);
+  StartMBB->addSuccessor(JoinMBB);
+  StartMBB->addSuccessor(FalseMBB);
+
+  //  FalseMBB:
+  //   store %SrcReg, %Disp(%Index,%Base)
+  //   # fallthrough to JoinMBB
+  BuildMI(FalseMBB, DL, TII->get(StoreOpcode))
+      .addReg(SrcReg)
+      .addOperand(Base)
+      .addImm(Disp)
+      .addReg(IndexReg);
+  FalseMBB->addSuccessor(JoinMBB);
+
+  MI.eraseFromParent();
+  return JoinMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
+// or ATOMIC_SWAP{,W} instruction MI. BinOpcode is the instruction that
+// performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
+// BitSize is the width of the field in bits, or 0 if this is a partword
+// ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize
+// is one of the operands. Invert says whether the field should be
+// inverted after performing BinOpcode (e.g. for NAND). Returns DoneMBB.
+MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned BinOpcode,
+ unsigned BitSize, bool Invert) const {
+ MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool IsSubWord = (BitSize < 32); // Includes the BitSize == 0 partword case.
+
+ // Extract the operands. Base can be a register or a frame index.
+ // Src2 can be a register or immediate.
+ unsigned Dest = MI.getOperand(0).getReg();
+ MachineOperand Base = earlyUseOperand(MI.getOperand(1));
+ int64_t Disp = MI.getOperand(2).getImm();
+ MachineOperand Src2 = earlyUseOperand(MI.getOperand(3));
+ unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0);
+ unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0);
+ DebugLoc DL = MI.getDebugLoc();
+ if (IsSubWord)
+ BitSize = MI.getOperand(6).getImm(); // The real field width comes from operand 6.
+
+ // Subword operations use 32-bit registers.
+ const TargetRegisterClass *RC = (BitSize <= 32 ?
+ &SystemZ::GR32BitRegClass :
+ &SystemZ::GR64BitRegClass);
+ unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG;
+ unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
+
+ // Get the right opcodes for the displacement.
+ LOpcode = TII->getOpcodeForOffset(LOpcode, Disp);
+ CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
+ assert(LOpcode && CSOpcode && "Displacement out of range");
+
+ // Create virtual registers for temporary results.
+ unsigned OrigVal = MRI.createVirtualRegister(RC);
+ unsigned OldVal = MRI.createVirtualRegister(RC);
+ unsigned NewVal = (BinOpcode || IsSubWord ?
+ MRI.createVirtualRegister(RC) : Src2.getReg()); // A full-width swap stores Src2 directly.
+ unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+ unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+
+ // Insert a basic block for the main loop.
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+ // StartMBB:
+ // ...
+ // %OrigVal = L Disp(%Base)
+ // # fall through to LoopMBB
+ MBB = StartMBB;
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
+ .addOperand(Base).addImm(Disp).addReg(0);
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %OldVal = phi [ %OrigVal, StartMBB ], [ %Dest, LoopMBB ]
+ // %RotatedOldVal = RLL %OldVal, 0(%BitShift)
+ // %RotatedNewVal = OP %RotatedOldVal, %Src2
+ // %NewVal = RLL %RotatedNewVal, 0(%NegBitShift)
+ // %Dest = CS %OldVal, %NewVal, Disp(%Base)
+ // JNE LoopMBB
+ // # fall through to DoneMBB
+ MBB = LoopMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+ .addReg(OrigVal).addMBB(StartMBB)
+ .addReg(Dest).addMBB(LoopMBB);
+ if (IsSubWord) // The rotate is only emitted for subword fields.
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
+ .addReg(OldVal).addReg(BitShift).addImm(0);
+ if (Invert) {
+ // Perform the operation normally and then invert every bit of the field.
+ unsigned Tmp = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, DL, TII->get(BinOpcode), Tmp)
+ .addReg(RotatedOldVal).addOperand(Src2);
+ if (BitSize <= 32)
+ // XILF with the upper BitSize bits set.
+ BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
+ .addReg(Tmp).addImm(-1U << (32 - BitSize));
+ else {
+ // Use LCGR and add -1 to the result, which is more compact than
+ // an XILF, XILH pair.
+ unsigned Tmp2 = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp);
+ BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal)
+ .addReg(Tmp2).addImm(-1);
+ }
+ } else if (BinOpcode)
+ // A simple binary operation.
+ BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
+ .addReg(RotatedOldVal).addOperand(Src2);
+ else if (IsSubWord)
+ // Use RISBG to rotate Src2 into position and use it to replace the
+ // field in RotatedOldVal.
+ BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedNewVal)
+ .addReg(RotatedOldVal).addReg(Src2.getReg())
+ .addImm(32).addImm(31 + BitSize).addImm(32 - BitSize);
+ if (IsSubWord) // Rotate the updated word back before the compare-and-swap.
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
+ .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
+ BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
+ .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB); // Retry while the CS fails.
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ MI.eraseFromParent();
+ return DoneMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo
+// ATOMIC_LOAD{,W}_{,U}{MIN,MAX} instruction MI. CompareOpcode is the
+// instruction that should be used to compare the current field with the
+// minimum or maximum value. KeepOldMask is the BRC condition-code mask
+// for when the current field should be kept. BitSize is the width of
+// the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction. Returns DoneMBB.
+MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode,
+ unsigned KeepOldMask, unsigned BitSize) const {
+ MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool IsSubWord = (BitSize < 32); // Includes the BitSize == 0 partword case.
+
+ // Extract the operands. Base can be a register or a frame index.
+ unsigned Dest = MI.getOperand(0).getReg();
+ MachineOperand Base = earlyUseOperand(MI.getOperand(1));
+ int64_t Disp = MI.getOperand(2).getImm();
+ unsigned Src2 = MI.getOperand(3).getReg();
+ unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0);
+ unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0);
+ DebugLoc DL = MI.getDebugLoc();
+ if (IsSubWord)
+ BitSize = MI.getOperand(6).getImm(); // The real field width comes from operand 6.
+
+ // Subword operations use 32-bit registers.
+ const TargetRegisterClass *RC = (BitSize <= 32 ?
+ &SystemZ::GR32BitRegClass :
+ &SystemZ::GR64BitRegClass);
+ unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG;
+ unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
+
+ // Get the right opcodes for the displacement.
+ LOpcode = TII->getOpcodeForOffset(LOpcode, Disp);
+ CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
+ assert(LOpcode && CSOpcode && "Displacement out of range");
+
+ // Create virtual registers for temporary results.
+ unsigned OrigVal = MRI.createVirtualRegister(RC);
+ unsigned OldVal = MRI.createVirtualRegister(RC);
+ unsigned NewVal = MRI.createVirtualRegister(RC);
+ unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+ unsigned RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2); // Full-width case uses Src2 itself as the alternative.
+ unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+
+ // Insert 3 basic blocks for the loop.
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
+ MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
+
+ // StartMBB:
+ // ...
+ // %OrigVal = L Disp(%Base)
+ // # fall through to LoopMBB
+ MBB = StartMBB;
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
+ .addOperand(Base).addImm(Disp).addReg(0);
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %OldVal = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
+ // %RotatedOldVal = RLL %OldVal, 0(%BitShift)
+ // CompareOpcode %RotatedOldVal, %Src2
+ // BRC KeepOldMask, UpdateMBB
+ MBB = LoopMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+ .addReg(OrigVal).addMBB(StartMBB)
+ .addReg(Dest).addMBB(UpdateMBB);
+ if (IsSubWord) // The rotate is only emitted for subword fields.
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
+ .addReg(OldVal).addReg(BitShift).addImm(0);
+ BuildMI(MBB, DL, TII->get(CompareOpcode))
+ .addReg(RotatedOldVal).addReg(Src2);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(KeepOldMask).addMBB(UpdateMBB); // Skip UseAltMBB when the old field is kept.
+ MBB->addSuccessor(UpdateMBB);
+ MBB->addSuccessor(UseAltMBB);
+
+ // UseAltMBB:
+ // %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0
+ // # fall through to UpdateMBB
+ MBB = UseAltMBB;
+ if (IsSubWord) // In the full-width case this block stays empty.
+ BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal)
+ .addReg(RotatedOldVal).addReg(Src2)
+ .addImm(32).addImm(31 + BitSize).addImm(0);
+ MBB->addSuccessor(UpdateMBB);
+
+ // UpdateMBB:
+ // %RotatedNewVal = PHI [ %RotatedOldVal, LoopMBB ],
+ // [ %RotatedAltVal, UseAltMBB ]
+ // %NewVal = RLL %RotatedNewVal, 0(%NegBitShift)
+ // %Dest = CS %OldVal, %NewVal, Disp(%Base)
+ // JNE LoopMBB
+ // # fall through to DoneMBB
+ MBB = UpdateMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal)
+ .addReg(RotatedOldVal).addMBB(LoopMBB)
+ .addReg(RotatedAltVal).addMBB(UseAltMBB);
+ if (IsSubWord) // Rotate the updated word back before the compare-and-swap.
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
+ .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
+ BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
+ .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB); // Retry while the CS fails.
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ MI.eraseFromParent();
+ return DoneMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW
+// instruction MI by expanding it into a CS retry loop; returns DoneMBB.
+MachineBasicBlock *
+SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+
+ MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Extract the operands. Base can be a register or a frame index.
+ unsigned Dest = MI.getOperand(0).getReg();
+ MachineOperand Base = earlyUseOperand(MI.getOperand(1));
+ int64_t Disp = MI.getOperand(2).getImm();
+ unsigned OrigCmpVal = MI.getOperand(3).getReg();
+ unsigned OrigSwapVal = MI.getOperand(4).getReg();
+ unsigned BitShift = MI.getOperand(5).getReg();
+ unsigned NegBitShift = MI.getOperand(6).getReg();
+ int64_t BitSize = MI.getOperand(7).getImm();
+ DebugLoc DL = MI.getDebugLoc();
+
+ const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;
+
+ // Get the right opcodes for the displacement.
+ unsigned LOpcode = TII->getOpcodeForOffset(SystemZ::L, Disp);
+ unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp);
+ assert(LOpcode && CSOpcode && "Displacement out of range");
+
+ // Create virtual registers for temporary results.
+ unsigned OrigOldVal = MRI.createVirtualRegister(RC);
+ unsigned OldVal = MRI.createVirtualRegister(RC);
+ unsigned CmpVal = MRI.createVirtualRegister(RC);
+ unsigned SwapVal = MRI.createVirtualRegister(RC);
+ unsigned StoreVal = MRI.createVirtualRegister(RC);
+ unsigned RetryOldVal = MRI.createVirtualRegister(RC);
+ unsigned RetryCmpVal = MRI.createVirtualRegister(RC);
+ unsigned RetrySwapVal = MRI.createVirtualRegister(RC);
+
+ // Insert 2 basic blocks for the loop (LoopMBB and SetMBB).
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB);
+
+ // StartMBB:
+ // ...
+ // %OrigOldVal = L Disp(%Base)
+ // # fall through to LoopMBB
+ MBB = StartMBB;
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
+ .addOperand(Base).addImm(Disp).addReg(0);
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %OldVal = phi [ %OrigOldVal, StartMBB ], [ %RetryOldVal, SetMBB ]
+ // %CmpVal = phi [ %OrigCmpVal, StartMBB ], [ %RetryCmpVal, SetMBB ]
+ // %SwapVal = phi [ %OrigSwapVal, StartMBB ], [ %RetrySwapVal, SetMBB ]
+ // %Dest = RLL %OldVal, BitSize(%BitShift)
+ // ^^ The low BitSize bits contain the field
+ // of interest.
+ // %RetryCmpVal = RISBG32 %CmpVal, %Dest, 32, 63-BitSize, 0
+ // ^^ Replace the upper 32-BitSize bits of the
+ // comparison value with those that we loaded,
+ // so that we can use a full word comparison.
+ // CR %Dest, %RetryCmpVal
+ // JNE DoneMBB
+ // # Fall through to SetMBB
+ MBB = LoopMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+ .addReg(OrigOldVal).addMBB(StartMBB)
+ .addReg(RetryOldVal).addMBB(SetMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), CmpVal)
+ .addReg(OrigCmpVal).addMBB(StartMBB)
+ .addReg(RetryCmpVal).addMBB(SetMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), SwapVal)
+ .addReg(OrigSwapVal).addMBB(StartMBB)
+ .addReg(RetrySwapVal).addMBB(SetMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), Dest)
+ .addReg(OldVal).addReg(BitShift).addImm(BitSize);
+ BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetryCmpVal)
+ .addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::CR))
+ .addReg(Dest).addReg(RetryCmpVal);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP)
+ .addImm(SystemZ::CCMASK_CMP_NE).addMBB(DoneMBB);
+ MBB->addSuccessor(DoneMBB);
+ MBB->addSuccessor(SetMBB);
+
+ // SetMBB:
+ // %RetrySwapVal = RISBG32 %SwapVal, %Dest, 32, 63-BitSize, 0
+ // ^^ Replace the upper 32-BitSize bits of the new
+ // value with those that we loaded.
+ // %StoreVal = RLL %RetrySwapVal, -BitSize(%NegBitShift)
+ // ^^ Rotate the new field to its proper position.
+ // %RetryOldVal = CS %OldVal, %StoreVal, Disp(%Base)
+ // JNE LoopMBB
+ // # fall through to DoneMBB
+ MBB = SetMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
+ .addReg(SwapVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
+ .addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
+ BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
+ .addReg(OldVal).addReg(StoreVal).addOperand(Base).addImm(Disp);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ MI.eraseFromParent(); // The pseudo is fully replaced by the blocks above.
+ return DoneMBB;
+}
+
+// Emit an extension from a GR32 or GR64 to a GR128. ClearEven is true
+// if the high register of the GR128 value must be cleared or false if
+// it's "don't care". SubReg is subreg_l32 when extending a GR32
+// and subreg_l64 when extending a GR64. MI is erased; MBB is returned.
+MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ bool ClearEven,
+ unsigned SubReg) const {
+ MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128); // In128 starts out undefined.
+ if (ClearEven) {
+ unsigned NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+ unsigned Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
+ .addImm(0);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewIn128)
+ .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_h64); // Zero goes into the subreg_h64 part.
+ In128 = NewIn128; // Later inserts build on the cleared value.
+ }
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
+ .addReg(In128).addReg(Src).addImm(SubReg); // Src lands in the part named by SubReg.
+
+ MI.eraseFromParent();
+ return MBB;
+}
+
+MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { // Expand a mem-to-mem pseudo into Opcode operations of at most 256 bytes.
+ MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
+ uint64_t DestDisp = MI.getOperand(1).getImm();
+ MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
+ uint64_t SrcDisp = MI.getOperand(3).getImm();
+ uint64_t Length = MI.getOperand(4).getImm();
+
+ // When generating more than one CLC, all but the last will need to
+ // branch to the end when a difference is found.
+ MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
+ splitBlockAfter(MI, MBB) : nullptr);
+
+ // Check for the loop form, in which operand 5 is the trip count.
+ if (MI.getNumExplicitOperands() > 5) {
+ bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+
+ uint64_t StartCountReg = MI.getOperand(5).getReg();
+ uint64_t StartSrcReg = forceReg(MI, SrcBase, TII);
+ uint64_t StartDestReg = (HaveSingleBase ? StartSrcReg :
+ forceReg(MI, DestBase, TII));
+
+ const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
+ uint64_t ThisSrcReg = MRI.createVirtualRegister(RC);
+ uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg :
+ MRI.createVirtualRegister(RC));
+ uint64_t NextSrcReg = MRI.createVirtualRegister(RC);
+ uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg :
+ MRI.createVirtualRegister(RC));
+
+ RC = &SystemZ::GR64BitRegClass;
+ uint64_t ThisCountReg = MRI.createVirtualRegister(RC);
+ uint64_t NextCountReg = MRI.createVirtualRegister(RC);
+
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB); // A separate NextMBB is only needed when the loop can exit early to EndMBB.
+
+ // StartMBB:
+ // # fall through to LoopMBB
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %ThisDestReg = phi [ %StartDestReg, StartMBB ],
+ // [ %NextDestReg, NextMBB ]
+ // %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
+ // [ %NextSrcReg, NextMBB ]
+ // %ThisCountReg = phi [ %StartCountReg, StartMBB ],
+ // [ %NextCountReg, NextMBB ]
+ // ( PFD 2, 768+DestDisp(%ThisDestReg) )
+ // Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
+ // ( JLH EndMBB )
+ //
+ // The prefetch is used only for MVC. The JLH is used only for CLC.
+ MBB = LoopMBB;
+
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
+ .addReg(StartDestReg).addMBB(StartMBB)
+ .addReg(NextDestReg).addMBB(NextMBB);
+ if (!HaveSingleBase) // With a single base, ThisSrcReg is ThisDestReg, so one PHI suffices.
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
+ .addReg(StartSrcReg).addMBB(StartMBB)
+ .addReg(NextSrcReg).addMBB(NextMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
+ .addReg(StartCountReg).addMBB(StartMBB)
+ .addReg(NextCountReg).addMBB(NextMBB);
+ if (Opcode == SystemZ::MVC)
+ BuildMI(MBB, DL, TII->get(SystemZ::PFD))
+ .addImm(SystemZ::PFD_WRITE)
+ .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
+ BuildMI(MBB, DL, TII->get(Opcode))
+ .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
+ .addReg(ThisSrcReg).addImm(SrcDisp);
+ if (EndMBB) {
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+ .addMBB(EndMBB);
+ MBB->addSuccessor(EndMBB);
+ MBB->addSuccessor(NextMBB);
+ }
+
+ // NextMBB:
+ // %NextDestReg = LA 256(%ThisDestReg)
+ // %NextSrcReg = LA 256(%ThisSrcReg)
+ // %NextCountReg = AGHI %ThisCountReg, -1
+ // CGHI %NextCountReg, 0
+ // JLH LoopMBB
+ // # fall through to DoneMBB
+ //
+ // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
+ MBB = NextMBB;
+
+ BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
+ .addReg(ThisDestReg).addImm(256).addReg(0);
+ if (!HaveSingleBase)
+ BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
+ .addReg(ThisSrcReg).addImm(256).addReg(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
+ .addReg(ThisCountReg).addImm(-1);
+ BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+ .addReg(NextCountReg).addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+ .addMBB(LoopMBB);
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ DestBase = MachineOperand::CreateReg(NextDestReg, false); // The straight-line tail continues from the post-loop addresses.
+ SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+ Length &= 255; // Only the sub-256-byte remainder is left for the code below.
+ MBB = DoneMBB;
+ }
+ // Handle any remaining bytes with straight-line code.
+ while (Length > 0) {
+ uint64_t ThisLength = std::min(Length, uint64_t(256));
+ // The previous iteration might have created out-of-range displacements.
+ // Apply them using LAY if so.
+ if (!isUInt<12>(DestDisp)) {
+ unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+ .addOperand(DestBase)
+ .addImm(DestDisp)
+ .addReg(0);
+ DestBase = MachineOperand::CreateReg(Reg, false);
+ DestDisp = 0;
+ }
+ if (!isUInt<12>(SrcDisp)) {
+ unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+ .addOperand(SrcBase)
+ .addImm(SrcDisp)
+ .addReg(0);
+ SrcBase = MachineOperand::CreateReg(Reg, false);
+ SrcDisp = 0;
+ }
+ BuildMI(*MBB, MI, DL, TII->get(Opcode))
+ .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
+ .addOperand(SrcBase).addImm(SrcDisp);
+ DestDisp += ThisLength;
+ SrcDisp += ThisLength;
+ Length -= ThisLength;
+ // If there's another CLC to go, branch to the end if a difference
+ // was found.
+ if (EndMBB && Length > 0) {
+ MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+ .addMBB(EndMBB);
+ MBB->addSuccessor(EndMBB);
+ MBB->addSuccessor(NextMBB);
+ MBB = NextMBB;
+ }
+ }
+ if (EndMBB) {
+ MBB->addSuccessor(EndMBB);
+ MBB = EndMBB;
+ MBB->addLiveIn(SystemZ::CC); // CC is marked live-in to EndMBB.
+ }
+
+ MI.eraseFromParent();
+ return MBB;
+}
+
+// Decompose string pseudo-instruction MI into a loop that continually performs
+// Opcode until CC != 3. MI is erased and DoneMBB is returned.
+MachineBasicBlock *SystemZTargetLowering::emitStringWrapper(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
+ MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ uint64_t End1Reg = MI.getOperand(0).getReg();
+ uint64_t Start1Reg = MI.getOperand(1).getReg();
+ uint64_t Start2Reg = MI.getOperand(2).getReg();
+ uint64_t CharReg = MI.getOperand(3).getReg();
+
+ const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
+ uint64_t This1Reg = MRI.createVirtualRegister(RC);
+ uint64_t This2Reg = MRI.createVirtualRegister(RC);
+ uint64_t End2Reg = MRI.createVirtualRegister(RC);
+
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+ // StartMBB:
+ // # fall through to LoopMBB
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
+ // %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
+ // R0L = %CharReg
+ // %End1Reg, %End2Reg = Opcode %This1Reg, %This2Reg -- uses R0L
+ // JO LoopMBB
+ // # fall through to DoneMBB
+ //
+ // The load of R0L can be hoisted by post-RA LICM.
+ MBB = LoopMBB;
+
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
+ .addReg(Start1Reg).addMBB(StartMBB)
+ .addReg(End1Reg).addMBB(LoopMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
+ .addReg(Start2Reg).addMBB(StartMBB)
+ .addReg(End2Reg).addMBB(LoopMBB);
+ BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
+ BuildMI(MBB, DL, TII->get(Opcode))
+ .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
+ .addReg(This1Reg).addReg(This2Reg);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ DoneMBB->addLiveIn(SystemZ::CC); // CC is marked live-in to DoneMBB.
+
+ MI.eraseFromParent();
+ return DoneMBB;
+}
+
+// Update TBEGIN instruction with final opcode and register clobbers.
+MachineBasicBlock *SystemZTargetLowering::emitTransactionBegin(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode,
+ bool NoFloat) const { // MI is modified in place (not erased) and MBB is returned unchanged.
+ MachineFunction &MF = *MBB->getParent();
+ const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+ const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
+
+ // Update opcode.
+ MI.setDesc(TII->get(Opcode));
+
+ // We cannot handle a TBEGIN that clobbers the stack or frame pointer.
+ // Make sure to add the corresponding GRSM bits if they are missing.
+ uint64_t Control = MI.getOperand(2).getImm();
+ static const unsigned GPRControlBit[16] = { // Indexed by GPR number; adjacent even/odd registers share one bit.
+ 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000,
+ 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100
+ };
+ Control |= GPRControlBit[15]; // Always set the bit covering GPR 15.
+ if (TFI->hasFP(MF))
+ Control |= GPRControlBit[11]; // With a frame pointer, also set the bit covering GPR 11.
+ MI.getOperand(2).setImm(Control);
+
+ // Add GPR clobbers for every register whose control bit is not set.
+ for (int I = 0; I < 16; I++) {
+ if ((Control & GPRControlBit[I]) == 0) {
+ unsigned Reg = SystemZMC::GR64Regs[I];
+ MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
+ }
+ }
+
+ // Add FPR/VR clobbers.
+ if (!NoFloat && (Control & 4) != 0) { // NOTE(review): bit value 4 presumably permits FP use in the transaction - confirm.
+ if (Subtarget.hasVector()) {
+ for (int I = 0; I < 32; I++) {
+ unsigned Reg = SystemZMC::VR128Regs[I];
+ MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
+ }
+ } else {
+ for (int I = 0; I < 16; I++) {
+ unsigned Reg = SystemZMC::FP64Regs[I];
+ MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
+ }
+ }
+ }
+
+ return MBB;
+}
+
+MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { // Swap pseudo MI for Opcode with an explicit result register; returns MBB.
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ DebugLoc DL = MI.getDebugLoc();
+
+ unsigned SrcReg = MI.getOperand(0).getReg();
+
+ // Create new virtual register of the same class as source.
+ const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+ unsigned DstReg = MRI->createVirtualRegister(RC);
+
+ // Replace pseudo with a normal load-and-test that models the def as
+ // well. Nothing in this function reads DstReg afterwards.
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
+ .addReg(SrcReg);
+ MI.eraseFromParent();
+
+ return MBB;
+}
+
+MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *MBB) const {
+ switch (MI.getOpcode()) { // Route each pseudo opcode to its expansion helper.
+ case SystemZ::Select32Mux:
+ return emitSelect(MI, MBB,
+ Subtarget.hasLoadStoreOnCond2()? SystemZ::LOCRMux : 0);
+ case SystemZ::Select32:
+ return emitSelect(MI, MBB, SystemZ::LOCR);
+ case SystemZ::Select64:
+ return emitSelect(MI, MBB, SystemZ::LOCGR);
+ case SystemZ::SelectF32:
+ case SystemZ::SelectF64:
+ case SystemZ::SelectF128:
+ return emitSelect(MI, MBB, 0);
+
+ case SystemZ::CondStore8Mux:
+ return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
+ case SystemZ::CondStore8MuxInv:
+ return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
+ case SystemZ::CondStore16Mux:
+ return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
+ case SystemZ::CondStore16MuxInv:
+ return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
+ case SystemZ::CondStore32Mux:
+ return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, false);
+ case SystemZ::CondStore32MuxInv:
+ return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, true);
+ case SystemZ::CondStore8:
+ return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
+ case SystemZ::CondStore8Inv:
+ return emitCondStore(MI, MBB, SystemZ::STC, 0, true);
+ case SystemZ::CondStore16:
+ return emitCondStore(MI, MBB, SystemZ::STH, 0, false);
+ case SystemZ::CondStore16Inv:
+ return emitCondStore(MI, MBB, SystemZ::STH, 0, true);
+ case SystemZ::CondStore32:
+ return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, false);
+ case SystemZ::CondStore32Inv:
+ return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, true);
+ case SystemZ::CondStore64:
+ return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, false);
+ case SystemZ::CondStore64Inv:
+ return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, true);
+ case SystemZ::CondStoreF32:
+ return emitCondStore(MI, MBB, SystemZ::STE, 0, false);
+ case SystemZ::CondStoreF32Inv:
+ return emitCondStore(MI, MBB, SystemZ::STE, 0, true);
+ case SystemZ::CondStoreF64:
+ return emitCondStore(MI, MBB, SystemZ::STD, 0, false);
+ case SystemZ::CondStoreF64Inv:
+ return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
+
+ case SystemZ::AEXT128_64:
+ return emitExt128(MI, MBB, false, SystemZ::subreg_l64);
+ case SystemZ::ZEXT128_32:
+ return emitExt128(MI, MBB, true, SystemZ::subreg_l32);
+ case SystemZ::ZEXT128_64:
+ return emitExt128(MI, MBB, true, SystemZ::subreg_l64);
+
+ case SystemZ::ATOMIC_SWAPW:
+ return emitAtomicLoadBinary(MI, MBB, 0, 0);
+ case SystemZ::ATOMIC_SWAP_32:
+ return emitAtomicLoadBinary(MI, MBB, 0, 32);
+ case SystemZ::ATOMIC_SWAP_64:
+ return emitAtomicLoadBinary(MI, MBB, 0, 64);
+
+ case SystemZ::ATOMIC_LOADW_AR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 0);
+ case SystemZ::ATOMIC_LOADW_AFI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 0);
+ case SystemZ::ATOMIC_LOAD_AR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 32);
+ case SystemZ::ATOMIC_LOAD_AHI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AHI, 32);
+ case SystemZ::ATOMIC_LOAD_AFI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 32);
+ case SystemZ::ATOMIC_LOAD_AGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AGR, 64);
+ case SystemZ::ATOMIC_LOAD_AGHI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AGHI, 64);
+ case SystemZ::ATOMIC_LOAD_AGFI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AGFI, 64);
+
+ case SystemZ::ATOMIC_LOADW_SR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 0);
+ case SystemZ::ATOMIC_LOAD_SR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 32);
+ case SystemZ::ATOMIC_LOAD_SGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::SGR, 64);
+
+ case SystemZ::ATOMIC_LOADW_NR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
+ case SystemZ::ATOMIC_LOADW_NILH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
+ case SystemZ::ATOMIC_LOAD_NR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
+ case SystemZ::ATOMIC_LOAD_NILL:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
+ case SystemZ::ATOMIC_LOAD_NILH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
+ case SystemZ::ATOMIC_LOAD_NILF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
+ case SystemZ::ATOMIC_LOAD_NGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
+ case SystemZ::ATOMIC_LOAD_NILL64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
+ case SystemZ::ATOMIC_LOAD_NILH64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
+ case SystemZ::ATOMIC_LOAD_NIHL64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
+ case SystemZ::ATOMIC_LOAD_NIHH64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
+ case SystemZ::ATOMIC_LOAD_NILF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
+ case SystemZ::ATOMIC_LOAD_NIHF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);
+
+ case SystemZ::ATOMIC_LOADW_OR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
+ case SystemZ::ATOMIC_LOADW_OILH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
+ case SystemZ::ATOMIC_LOAD_OR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
+ case SystemZ::ATOMIC_LOAD_OILL:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
+ case SystemZ::ATOMIC_LOAD_OILH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
+ case SystemZ::ATOMIC_LOAD_OILF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
+ case SystemZ::ATOMIC_LOAD_OGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
+ case SystemZ::ATOMIC_LOAD_OILL64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
+ case SystemZ::ATOMIC_LOAD_OILH64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
+ case SystemZ::ATOMIC_LOAD_OIHL64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
+ case SystemZ::ATOMIC_LOAD_OIHH64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
+ case SystemZ::ATOMIC_LOAD_OILF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
+ case SystemZ::ATOMIC_LOAD_OIHF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);
+
+ case SystemZ::ATOMIC_LOADW_XR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
+ case SystemZ::ATOMIC_LOADW_XILF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
+ case SystemZ::ATOMIC_LOAD_XR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
+ case SystemZ::ATOMIC_LOAD_XILF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
+ case SystemZ::ATOMIC_LOAD_XGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
+ case SystemZ::ATOMIC_LOAD_XILF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
+ case SystemZ::ATOMIC_LOAD_XIHF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);
+
+ case SystemZ::ATOMIC_LOADW_NRi: // The "...i" pseudos reuse the N* opcodes with an extra true argument.
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
+ case SystemZ::ATOMIC_LOADW_NILHi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
+ case SystemZ::ATOMIC_LOAD_NRi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
+ case SystemZ::ATOMIC_LOAD_NILLi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
+ case SystemZ::ATOMIC_LOAD_NILHi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
+ case SystemZ::ATOMIC_LOAD_NILFi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
+ case SystemZ::ATOMIC_LOAD_NGRi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
+ case SystemZ::ATOMIC_LOAD_NILL64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NILH64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NIHL64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NIHH64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NILF64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NIHF64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);
+
+ case SystemZ::ATOMIC_LOADW_MIN:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+ SystemZ::CCMASK_CMP_LE, 0);
+ case SystemZ::ATOMIC_LOAD_MIN_32:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+ SystemZ::CCMASK_CMP_LE, 32);
+ case SystemZ::ATOMIC_LOAD_MIN_64:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
+ SystemZ::CCMASK_CMP_LE, 64);
+
+ case SystemZ::ATOMIC_LOADW_MAX:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+ SystemZ::CCMASK_CMP_GE, 0);
+ case SystemZ::ATOMIC_LOAD_MAX_32:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+ SystemZ::CCMASK_CMP_GE, 32);
+ case SystemZ::ATOMIC_LOAD_MAX_64:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
+ SystemZ::CCMASK_CMP_GE, 64);
+
+ case SystemZ::ATOMIC_LOADW_UMIN:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+ SystemZ::CCMASK_CMP_LE, 0);
+ case SystemZ::ATOMIC_LOAD_UMIN_32:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+ SystemZ::CCMASK_CMP_LE, 32);
+ case SystemZ::ATOMIC_LOAD_UMIN_64:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
+ SystemZ::CCMASK_CMP_LE, 64);
+
+ case SystemZ::ATOMIC_LOADW_UMAX:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+ SystemZ::CCMASK_CMP_GE, 0);
+ case SystemZ::ATOMIC_LOAD_UMAX_32:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+ SystemZ::CCMASK_CMP_GE, 32);
+ case SystemZ::ATOMIC_LOAD_UMAX_64:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
+ SystemZ::CCMASK_CMP_GE, 64);
+
+ case SystemZ::ATOMIC_CMP_SWAPW:
+ return emitAtomicCmpSwapW(MI, MBB);
+ case SystemZ::MVCSequence:
+ case SystemZ::MVCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
+ case SystemZ::NCSequence:
+ case SystemZ::NCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::NC);
+ case SystemZ::OCSequence:
+ case SystemZ::OCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::OC);
+ case SystemZ::XCSequence:
+ case SystemZ::XCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::XC);
+ case SystemZ::CLCSequence:
+ case SystemZ::CLCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
+ case SystemZ::CLSTLoop:
+ return emitStringWrapper(MI, MBB, SystemZ::CLST);
+ case SystemZ::MVSTLoop:
+ return emitStringWrapper(MI, MBB, SystemZ::MVST);
+ case SystemZ::SRSTLoop:
+ return emitStringWrapper(MI, MBB, SystemZ::SRST);
+ case SystemZ::TBEGIN:
+ return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, false);
+ case SystemZ::TBEGIN_nofloat:
+ return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true);
+ case SystemZ::TBEGINC:
+ return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true);
+ case SystemZ::LTEBRCompare_VecPseudo:
+ return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR);
+ case SystemZ::LTDBRCompare_VecPseudo:
+ return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR);
+ case SystemZ::LTXBRCompare_VecPseudo:
+ return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);
+
+ default: // Any opcode not listed above is treated as a bug.
+ llvm_unreachable("Unexpected instr type to insert");
+ }
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
new file mode 100644
index 000000000000..7a21a474c119
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -0,0 +1,595 @@
+//===-- SystemZISelLowering.h - SystemZ DAG lowering interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that SystemZ uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZISELLOWERING_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZISELLOWERING_H
+
+#include "SystemZ.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace SystemZISD {
+enum NodeType : unsigned {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ // Return with a flag operand. Operand 0 is the chain operand.
+ RET_FLAG,
+
+ // Calls a function. Operand 0 is the chain operand and operand 1
+ // is the target address. The arguments start at operand 2.
+ // There is an optional glue operand at the end.
+ CALL,
+ SIBCALL,
+
+ // TLS calls. Like regular calls, except operand 1 is the TLS symbol.
+ // (The call target is implicitly __tls_get_offset.)
+ TLS_GDCALL,
+ TLS_LDCALL,
+
+ // Wraps a TargetGlobalAddress that should be loaded using PC-relative
+ // accesses (LARL). Operand 0 is the address.
+ PCREL_WRAPPER,
+
+ // Used in cases where an offset is applied to a TargetGlobalAddress.
+ // Operand 0 is the full TargetGlobalAddress and operand 1 is a
+ // PCREL_WRAPPER for an anchor point. This is used so that we can
+ // cheaply refer to either the full address or the anchor point
+ // as a register base.
+ PCREL_OFFSET,
+
+ // Integer absolute.
+ IABS,
+
+ // Integer comparisons. There are three operands: the two values
+ // to compare, and an integer of type SystemZICMP.
+ ICMP,
+
+ // Floating-point comparisons. The two operands are the values to compare.
+ FCMP,
+
+ // Test under mask. The first operand is ANDed with the second operand
+ // and the condition codes are set on the result. The third operand is
+ // a boolean that is true if the condition codes need to distinguish
+ // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the
+ // register forms do but the memory forms don't).
+ TM,
+
+ // Branches if a condition is true. Operand 0 is the chain operand;
+ // operand 1 is the 4-bit condition-code mask, with bit N in
+ // big-endian order meaning "branch if CC=N"; operand 2 is the
+ // target block and operand 3 is the flag operand.
+ BR_CCMASK,
+
+ // Selects between operand 0 and operand 1. Operand 2 is the
+ // mask of condition-code values for which operand 0 should be
+ // chosen over operand 1; it has the same form as BR_CCMASK.
+ // Operand 3 is the flag operand.
+ SELECT_CCMASK,
+
+ // Evaluates to the gap between the stack pointer and the
+ // base of the dynamically-allocatable area.
+ ADJDYNALLOC,
+
+ // Count number of bits set in operand 0 per byte.
+ POPCNT,
+
+ // Wrappers around the ISD opcodes of the same name. The output and
+ // first input operands are GR128s. The trailing numbers are the
+ // widths of the second operand in bits.
+ UMUL_LOHI64,
+ SDIVREM32,
+ SDIVREM64,
+ UDIVREM32,
+ UDIVREM64,
+
+ // Use a series of MVCs to copy bytes from one memory location to another.
+ // The operands are:
+ // - the target address
+ // - the source address
+ // - the constant length
+ //
+ // This isn't a memory opcode because we'd need to attach two
+ // MachineMemOperands rather than one.
+ MVC,
+
+ // Like MVC, but implemented as a loop that handles X*256 bytes
+ // followed by straight-line code to handle the rest (if any).
+ // The value of X is passed as an additional operand.
+ MVC_LOOP,
+
+ // Similar to MVC and MVC_LOOP, but for logic operations (AND, OR, XOR).
+ NC,
+ NC_LOOP,
+ OC,
+ OC_LOOP,
+ XC,
+ XC_LOOP,
+
+ // Use CLC to compare two blocks of memory, with the same comments
+ // as for MVC and MVC_LOOP.
+ CLC,
+ CLC_LOOP,
+
+ // Use an MVST-based sequence to implement stpcpy().
+ STPCPY,
+
+ // Use a CLST-based sequence to implement strcmp(). The two input operands
+ // are the addresses of the strings to compare.
+ STRCMP,
+
+ // Use an SRST-based sequence to search a block of memory. The first
+ // operand is the end address, the second is the start, and the third
+ // is the character to search for. CC is set to 1 on success and 2
+ // on failure.
+ SEARCH_STRING,
+
+ // Store the CC value in bits 29 and 28 of an integer.
+ IPM,
+
+ // Perform a serialization operation. (BCR 15,0 or BCR 14,0.)
+ SERIALIZE,
+
+ // Compiler barrier only; generate a no-op.
+ MEMBARRIER,
+
+ // Transaction begin. The first operand is the chain, the second
+ // the TDB pointer, and the third the immediate control field.
+ // Returns chain and glue.
+ TBEGIN,
+ TBEGIN_NOFLOAT,
+
+ // Transaction end. Just the chain operand. Returns chain and glue.
+ TEND,
+
+ // Create a vector constant by filling byte N of the result with bit
+ // 15-N of the single operand.
+ BYTE_MASK,
+
+ // Create a vector constant by replicating an element-sized RISBG-style mask.
+ // The first operand specifies the starting set bit and the second operand
+ // specifies the ending set bit. Both operands count from the MSB of the
+ // element.
+ ROTATE_MASK,
+
+ // Replicate a GPR scalar value into all elements of a vector.
+ REPLICATE,
+
+ // Create a vector from two i64 GPRs.
+ JOIN_DWORDS,
+
+ // Replicate one element of a vector into all elements. The first operand
+ // is the vector and the second is the index of the element to replicate.
+ SPLAT,
+
+ // Interleave elements from the high half of operand 0 and the high half
+ // of operand 1.
+ MERGE_HIGH,
+
+ // Likewise for the low halves.
+ MERGE_LOW,
+
+ // Concatenate the vectors in the first two operands, shift them left
+ // by the third operand, and take the first half of the result.
+ SHL_DOUBLE,
+
+ // Take one element of the first v2i64 operand and one element of
+ // the second v2i64 operand and concatenate them to form a v2i64 result.
+ // The third operand is a 4-bit value of the form 0A0B, where A and B
+ // are the element selectors for the first and second operands
+ // respectively.
+ PERMUTE_DWORDS,
+
+ // Perform a general vector permute on vector operands 0 and 1.
+ // Each byte of operand 2 controls the corresponding byte of the result,
+ // in the same way as a byte-level VECTOR_SHUFFLE mask.
+ PERMUTE,
+
+ // Pack vector operands 0 and 1 into a single vector with half-sized elements.
+ PACK,
+
+ // Likewise, but saturate the result and set CC. PACKS_CC does signed
+ // saturation and PACKLS_CC does unsigned saturation.
+ PACKS_CC,
+ PACKLS_CC,
+
+ // Unpack the first half of vector operand 0 into double-sized elements.
+ // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends.
+ UNPACK_HIGH,
+ UNPACKL_HIGH,
+
+ // Likewise for the second half.
+ UNPACK_LOW,
+ UNPACKL_LOW,
+
+ // Shift each element of vector operand 0 by the number of bits specified
+ // by scalar operand 1.
+ VSHL_BY_SCALAR,
+ VSRL_BY_SCALAR,
+ VSRA_BY_SCALAR,
+
+ // For each element of the output type, sum across all sub-elements of
+ // operand 0 belonging to the corresponding element, and add in the
+ // rightmost sub-element of the corresponding element of operand 1.
+ VSUM,
+
+ // Compare integer vector operands 0 and 1 to produce the usual 0/-1
+ // vector result. VICMPE is for equality, VICMPH for "signed greater than"
+ // and VICMPHL for "unsigned greater than".
+ VICMPE,
+ VICMPH,
+ VICMPHL,
+
+ // Likewise, but also set the condition codes on the result.
+ VICMPES,
+ VICMPHS,
+ VICMPHLS,
+
+ // Compare floating-point vector operands 0 and 1 to produce the usual 0/-1
+ // vector result. VFCMPE is for "ordered and equal", VFCMPH for "ordered and
+ // greater than" and VFCMPHE for "ordered and greater than or equal to".
+ VFCMPE,
+ VFCMPH,
+ VFCMPHE,
+
+ // Likewise, but also set the condition codes on the result.
+ VFCMPES,
+ VFCMPHS,
+ VFCMPHES,
+
+ // Test floating-point data class for vectors.
+ VFTCI,
+
+ // Extend the even f32 elements of vector operand 0 to produce a vector
+ // of f64 elements.
+ VEXTEND,
+
+ // Round the f64 elements of vector operand 0 to f32s and store them in the
+ // even elements of the result.
+ VROUND,
+
+ // AND the two vector operands together and set CC based on the result.
+ VTM,
+
+ // String operations that set CC as a side-effect.
+ VFAE_CC,
+ VFAEZ_CC,
+ VFEE_CC,
+ VFEEZ_CC,
+ VFENE_CC,
+ VFENEZ_CC,
+ VISTR_CC,
+ VSTRC_CC,
+ VSTRCZ_CC,
+
+ // Test Data Class.
+ //
+ // Operand 0: the value to test
+ // Operand 1: the bit mask
+ TDC,
+
+ // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
+ // ATOMIC_LOAD_<op>.
+ //
+ // Operand 0: the address of the containing 32-bit-aligned field
+ // Operand 1: the second operand of <op>, in the high bits of an i32
+ // for everything except ATOMIC_SWAPW
+ // Operand 2: how many bits to rotate the i32 left to bring the first
+ // operand into the high bits
+ // Operand 3: the negative of operand 2, for rotating the other way
+ // Operand 4: the width of the field in bits (8 or 16)
+ ATOMIC_SWAPW = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ ATOMIC_LOADW_ADD,
+ ATOMIC_LOADW_SUB,
+ ATOMIC_LOADW_AND,
+ ATOMIC_LOADW_OR,
+ ATOMIC_LOADW_XOR,
+ ATOMIC_LOADW_NAND,
+ ATOMIC_LOADW_MIN,
+ ATOMIC_LOADW_MAX,
+ ATOMIC_LOADW_UMIN,
+ ATOMIC_LOADW_UMAX,
+
+ // A wrapper around the inner loop of an ATOMIC_CMP_SWAP.
+ //
+ // Operand 0: the address of the containing 32-bit-aligned field
+ // Operand 1: the compare value, in the low bits of an i32
+ // Operand 2: the swap value, in the low bits of an i32
+ // Operand 3: how many bits to rotate the i32 left to bring the first
+ // operand into the high bits
+ // Operand 4: the negative of operand 3, for rotating the other way
+ // Operand 5: the width of the field in bits (8 or 16)
+ ATOMIC_CMP_SWAPW,
+
+ // Byte swapping load.
+ //
+ // Operand 0: the address to load from
+ // Operand 1: the type of load (i16, i32, i64)
+ LRV,
+
+ // Byte swapping store.
+ //
+ // Operand 0: the value to store
+ // Operand 1: the address to store to
+ // Operand 2: the type of store (i16, i32, i64)
+ STRV,
+
+ // Prefetch from the second operand using the 4-bit control code in
+ // the first operand. The code is 1 for a load prefetch and 2 for
+ // a store prefetch.
+ PREFETCH
+};
+
+// Return true if Opcode is a PC-relative node: PCREL_WRAPPER or PCREL_OFFSET.
+inline bool isPCREL(unsigned Opcode) {
+ return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET;
+}
+} // end namespace SystemZISD
+
+namespace SystemZICMP {
+// Describes whether an integer comparison needs to be signed or unsigned,
+// or whether either type is OK.
+enum {
+ Any,
+ UnsignedOnly,
+ SignedOnly
+};
+} // end namespace SystemZICMP
+
+class SystemZSubtarget;
+class SystemZTargetMachine;
+
+class SystemZTargetLowering : public TargetLowering {
+public:
+ explicit SystemZTargetLowering(const TargetMachine &TM,
+ const SystemZSubtarget &STI);
+
+ // Override TargetLowering.
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
+ MVT getVectorIdxTy(const DataLayout &DL) const override {
+ // Only the lower 12 bits of an element index are used, so we don't
+ // want to clobber the upper 32 bits of a GPR unnecessarily.
+ return MVT::i32;
+ }
+ TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+ const override {
+ // Widen subvectors to the full width rather than promoting integer
+ // elements. This is better because:
+ //
+ // (a) it means that we can handle the ABI for passing and returning
+ // sub-128 vectors without having to handle them as legal types.
+ //
+ // (b) we don't have instructions to extend on load and truncate on store,
+ // so promoting the integers is less efficient.
+ //
+ // (c) there are no multiplication instructions for the widest integer
+ // type (v2i64).
+ if (VT.getScalarSizeInBits() % 8 == 0)
+ return TypeWidenVector;
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+ }
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT) const override;
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+ bool isLegalAddImmediate(int64_t Imm) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+ bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) const override;
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+ unsigned Align,
+ bool *Fast) const override;
+ bool isTruncateFree(Type *, Type *) const override;
+ bool isTruncateFree(EVT, EVT) const override;
+ const char *getTargetNodeName(unsigned Opcode) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+ TargetLowering::ConstraintType
+ getConstraintType(StringRef Constraint) const override;
+ TargetLowering::ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode.size() == 1) {
+ switch(ConstraintCode[0]) {
+ default:
+ break;
+ case 'Q':
+ return InlineAsm::Constraint_Q;
+ case 'R':
+ return InlineAsm::Constraint_R;
+ case 'S':
+ return InlineAsm::Constraint_S;
+ case 'T':
+ return InlineAsm::Constraint_T;
+ }
+ }
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return SystemZ::R6D;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return SystemZ::R7D;
+ }
+
+ /// Override to support customized stack guard loading.
+ bool useLoadStackGuardNode() const override {
+ return true;
+ }
+ void insertSSPDeclarations(Module &M) const override {
+ }
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ bool allowTruncateForTailCall(Type *, Type *) const override;
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+ SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ ISD::NodeType getExtendForAtomicOps() const override {
+ return ISD::ANY_EXTEND;
+ }
+
+ bool supportSwiftError() const override {
+ return true;
+ }
+
+private:
+ const SystemZSubtarget &Subtarget;
+
+ // Implement LowerOperation for individual opcodes.
+ SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGlobalAddress(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG) const;
+ SDValue lowerTLSGetOffset(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG, unsigned Opcode,
+ SDValue GOTOffset) const;
+ SDValue lowerThreadPointer(const SDLoc &DL, SelectionDAG &DAG) const;
+ SDValue lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG) const;
+ SDValue lowerBlockAddress(BlockAddressSDNode *Node,
+ SelectionDAG &DAG) const;
+ SDValue lowerJumpTable(JumpTableSDNode *JT, SelectionDAG &DAG) const;
+ SDValue lowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const;
+ SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
+ unsigned Opcode) const;
+ SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
+ unsigned UnpackHigh) const;
+ SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
+
+ SDValue combineExtract(const SDLoc &DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
+ unsigned Index, DAGCombinerInfo &DCI,
+ bool Force) const;
+ SDValue combineTruncateExtract(const SDLoc &DL, EVT TruncVT, SDValue Op,
+ DAGCombinerInfo &DCI) const;
+ SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSHIFTROT(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ // If the last instruction before MBBI in MBB was some form of COMPARE,
+ // try to replace it with a COMPARE AND BRANCH just before MBBI.
+ // CCMask and Target are the BRC-like operands for the branch.
+ // Return true if the change was made.
+ bool convertPrevCompareToBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned CCMask,
+ MachineBasicBlock *Target) const;
+
+ // Implement EmitInstrWithCustomInserter for individual operation types.
+ MachineBasicBlock *emitSelect(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned LOCROpcode) const;
+ MachineBasicBlock *emitCondStore(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned StoreOpcode, unsigned STOCOpcode,
+ bool Invert) const;
+ MachineBasicBlock *emitExt128(MachineInstr &MI, MachineBasicBlock *MBB,
+ bool ClearEven, unsigned SubReg) const;
+ MachineBasicBlock *emitAtomicLoadBinary(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned BinOpcode, unsigned BitSize,
+ bool Invert = false) const;
+ MachineBasicBlock *emitAtomicLoadMinMax(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ unsigned CompareOpcode,
+ unsigned KeepOldMask,
+ unsigned BitSize) const;
+ MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned Opcode) const;
+ MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned Opcode) const;
+ MachineBasicBlock *emitTransactionBegin(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode, bool NoFloat) const;
+ MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
new file mode 100644
index 000000000000..896b665d25eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -0,0 +1,46 @@
+//===-- SystemZInstrBuilder.h - Functions to aid building insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle SystemZ'isms in a clean way.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRBUILDER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// Add a BDX memory reference and memory operand for frame object FI to MIB.
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ const MCInstrDesc &MCID = MI->getDesc();
+ auto Flags = MachineMemOperand::MONone;
+ if (MCID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (MCID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ int64_t Offset = 0;
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+ MFFrame.getObjectSize(FI), MFFrame.getObjectAlignment(FI));
+ return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO);
+}
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
new file mode 100644
index 000000000000..bb6d27e24828
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -0,0 +1,507 @@
+//==- SystemZInstrFP.td - Floating-point SystemZ instructions --*- tblgen-*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// C's ?: operator for floating-point operands.
+def SelectF32 : SelectWrapper<FP32>;
+def SelectF64 : SelectWrapper<FP64>;
+def SelectF128 : SelectWrapper<FP128>;
+
+defm CondStoreF32 : CondStores<FP32, nonvolatile_store,
+ nonvolatile_load, bdxaddr20only>;
+defm CondStoreF64 : CondStores<FP64, nonvolatile_store,
+ nonvolatile_load, bdxaddr20only>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero.
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+ def LZER : InherentRRE<"lzer", 0xB374, FP32, fpimm0>;
+ def LZDR : InherentRRE<"lzdr", 0xB375, FP64, fpimm0>;
+ def LZXR : InherentRRE<"lzxr", 0xB376, FP128, fpimm0>;
+}
+
+// Moves between two floating-point registers.
+let hasSideEffects = 0 in {
+ def LER : UnaryRR <"ler", 0x38, null_frag, FP32, FP32>;
+ def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>;
+ def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
+
+ // For z13 we prefer LDR over LER to avoid partial register dependencies.
+ let isCodeGenOnly = 1 in
+ def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
+}
+
+// Moves between two floating-point registers that also set the condition
+// codes.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+ defm LTEBR : LoadAndTestRRE<"ltebr", 0xB302, FP32>;
+ defm LTDBR : LoadAndTestRRE<"ltdbr", 0xB312, FP64>;
+ defm LTXBR : LoadAndTestRRE<"ltxbr", 0xB342, FP128>;
+}
+// Note that LTxBRCompare is not available if we have vector support,
+// since load-and-test instructions will partially clobber the target
+// (vector) register.
+let Predicates = [FeatureNoVector] in {
+ defm : CompareZeroFP<LTEBRCompare, FP32>;
+ defm : CompareZeroFP<LTDBRCompare, FP64>;
+ defm : CompareZeroFP<LTXBRCompare, FP128>;
+}
+
+// Use a normal load-and-test for compare against zero in case of
+// vector support (via a pseudo to simplify instruction selection).
+let Defs = [CC], usesCustomInserter = 1 in {
+ def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>;
+ def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>;
+ def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>;
+}
+let Predicates = [FeatureVector] in {
+ defm : CompareZeroFP<LTEBRCompare_VecPseudo, FP32>;
+ defm : CompareZeroFP<LTDBRCompare_VecPseudo, FP64>;
+ defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>;
+}
+
+// Moves between 64-bit integer and floating-point registers.
+def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>;
+def LDGR : UnaryRRE<"ldgr", 0xB3C1, bitconvert, FP64, GR64>;
+
+// fcopysign with an FP32 result.
+let isCodeGenOnly = 1 in {
+ def CPSDRss : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP32, FP32, FP32>;
+ def CPSDRsd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP32, FP32, FP64>;
+}
+
+// The sign of an FP128 is in the high register.
+def : Pat<(fcopysign FP32:$src1, FP128:$src2),
+ (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+// fcopysign with an FP64 result.
+let isCodeGenOnly = 1 in
+ def CPSDRds : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP32>;
+def CPSDRdd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP64>;
+
+// The sign of an FP128 is in the high register.
+def : Pat<(fcopysign FP64:$src1, FP128:$src2),
+ (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+// fcopysign with an FP128 result. Use "upper" as the high half and leave
+// the low half as-is.
+class CopySign128<RegisterOperand cls, dag upper>
+ : Pat<(fcopysign FP128:$src1, cls:$src2),
+ (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>;
+
+def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+ FP32:$src2)>;
+def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+ FP64:$src2)>;
+def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+ (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+defm LoadStoreF32 : MVCLoadStore<load, f32, MVCSequence, 4>;
+defm LoadStoreF64 : MVCLoadStore<load, f64, MVCSequence, 8>;
+defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
+
+//===----------------------------------------------------------------------===//
+// Load instructions
+//===----------------------------------------------------------------------===//
+
+let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+ defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
+ defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
+
+ // For z13 we prefer LDE over LE to avoid partial register dependencies.
+ def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
+
+ // These instructions are split after register allocation, so we don't
+ // want a custom inserter.
+ let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+ def LX : Pseudo<(outs FP128:$dst), (ins bdxaddr20only128:$src),
+ [(set FP128:$dst, (load bdxaddr20only128:$src))]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Store instructions
+//===----------------------------------------------------------------------===//
+
+let SimpleBDXStore = 1 in {
+ defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32, 4>;
+ defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64, 8>;
+
+ // These instructions are split after register allocation, so we don't
+ // want a custom inserter.
+ let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+ def STX : Pseudo<(outs), (ins FP128:$src, bdxaddr20only128:$dst),
+ [(store FP128:$src, bdxaddr20only128:$dst)]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Convert floating-point values to narrower representations, rounding
+// according to the current mode. The destination of LEXBR and LDXBR
+// is a 128-bit value, but only the first register of the pair is used.
+def LEDBR : UnaryRRE<"ledbr", 0xB344, fpround, FP32, FP64>;
+def LEXBR : UnaryRRE<"lexbr", 0xB346, null_frag, FP128, FP128>;
+def LDXBR : UnaryRRE<"ldxbr", 0xB345, null_frag, FP128, FP128>;
+
+def LEDBRA : TernaryRRFe<"ledbra", 0xB344, FP32, FP64>,
+ Requires<[FeatureFPExtension]>;
+def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>,
+ Requires<[FeatureFPExtension]>;
+def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>,
+ Requires<[FeatureFPExtension]>;
+
+def : Pat<(f32 (fpround FP128:$src)),
+ (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>;
+def : Pat<(f64 (fpround FP128:$src)),
+ (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
+
+// Extend register floating-point values to wider representations.
+def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>;
+def LXEBR : UnaryRRE<"lxebr", 0xB306, fpextend, FP128, FP32>;
+def LXDBR : UnaryRRE<"lxdbr", 0xB305, fpextend, FP128, FP64>;
+
+// Extend memory floating-point values to wider representations.
+def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>;
+def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>;
+def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>;
+
+// Convert a signed integer register value to a floating-point one.
+def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>;
+def CDFBR : UnaryRRE<"cdfbr", 0xB395, sint_to_fp, FP64, GR32>;
+def CXFBR : UnaryRRE<"cxfbr", 0xB396, sint_to_fp, FP128, GR32>;
+
+def CEGBR : UnaryRRE<"cegbr", 0xB3A4, sint_to_fp, FP32, GR64>;
+def CDGBR : UnaryRRE<"cdgbr", 0xB3A5, sint_to_fp, FP64, GR64>;
+def CXGBR : UnaryRRE<"cxgbr", 0xB3A6, sint_to_fp, FP128, GR64>;
+
+// The FP extension feature provides versions of the above that allow
+// specifying rounding mode and inexact-exception suppression flags.
+let Predicates = [FeatureFPExtension] in {
+ def CEFBRA : TernaryRRFe<"cefbra", 0xB394, FP32, GR32>;
+ def CDFBRA : TernaryRRFe<"cdfbra", 0xB395, FP64, GR32>;
+ def CXFBRA : TernaryRRFe<"cxfbra", 0xB396, FP128, GR32>;
+
+ def CEGBRA : TernaryRRFe<"cegbra", 0xB3A4, FP32, GR64>;
+ def CDGBRA : TernaryRRFe<"cdgbra", 0xB3A5, FP64, GR64>;
+ def CXGBRA : TernaryRRFe<"cxgbra", 0xB3A6, FP128, GR64>;
+}
+
+// Convert an unsigned integer register value to a floating-point one.
+let Predicates = [FeatureFPExtension] in {
+ def CELFBR : TernaryRRFe<"celfbr", 0xB390, FP32, GR32>;
+ def CDLFBR : TernaryRRFe<"cdlfbr", 0xB391, FP64, GR32>;
+ def CXLFBR : TernaryRRFe<"cxlfbr", 0xB392, FP128, GR32>;
+
+ def CELGBR : TernaryRRFe<"celgbr", 0xB3A0, FP32, GR64>;
+ def CDLGBR : TernaryRRFe<"cdlgbr", 0xB3A1, FP64, GR64>;
+ def CXLGBR : TernaryRRFe<"cxlgbr", 0xB3A2, FP128, GR64>;
+
+ def : Pat<(f32 (uint_to_fp GR32:$src)), (CELFBR 0, GR32:$src, 0)>;
+ def : Pat<(f64 (uint_to_fp GR32:$src)), (CDLFBR 0, GR32:$src, 0)>;
+ def : Pat<(f128 (uint_to_fp GR32:$src)), (CXLFBR 0, GR32:$src, 0)>;
+
+ def : Pat<(f32 (uint_to_fp GR64:$src)), (CELGBR 0, GR64:$src, 0)>;
+ def : Pat<(f64 (uint_to_fp GR64:$src)), (CDLGBR 0, GR64:$src, 0)>;
+ def : Pat<(f128 (uint_to_fp GR64:$src)), (CXLGBR 0, GR64:$src, 0)>;
+}
+
+// Convert a floating-point register value to a signed integer value,
+// with the second operand (modifier M3) specifying the rounding mode.
+let Defs = [CC] in {
+ def CFEBR : BinaryRRFe<"cfebr", 0xB398, GR32, FP32>;
+ def CFDBR : BinaryRRFe<"cfdbr", 0xB399, GR32, FP64>;
+ def CFXBR : BinaryRRFe<"cfxbr", 0xB39A, GR32, FP128>;
+
+ def CGEBR : BinaryRRFe<"cgebr", 0xB3A8, GR64, FP32>;
+ def CGDBR : BinaryRRFe<"cgdbr", 0xB3A9, GR64, FP64>;
+ def CGXBR : BinaryRRFe<"cgxbr", 0xB3AA, GR64, FP128>;
+}
+
+// fp_to_sint always rounds towards zero, which is modifier value 5.
+def : Pat<(i32 (fp_to_sint FP32:$src)), (CFEBR 5, FP32:$src)>;
+def : Pat<(i32 (fp_to_sint FP64:$src)), (CFDBR 5, FP64:$src)>;
+def : Pat<(i32 (fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>;
+
+def : Pat<(i64 (fp_to_sint FP32:$src)), (CGEBR 5, FP32:$src)>;
+def : Pat<(i64 (fp_to_sint FP64:$src)), (CGDBR 5, FP64:$src)>;
+def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>;
+
+// The FP extension feature provides versions of the above that allow
+// also specifying the inexact-exception suppression flag.
+let Predicates = [FeatureFPExtension], Defs = [CC] in {
+ def CFEBRA : TernaryRRFe<"cfebra", 0xB398, GR32, FP32>;
+ def CFDBRA : TernaryRRFe<"cfdbra", 0xB399, GR32, FP64>;
+ def CFXBRA : TernaryRRFe<"cfxbra", 0xB39A, GR32, FP128>;
+
+ def CGEBRA : TernaryRRFe<"cgebra", 0xB3A8, GR64, FP32>;
+ def CGDBRA : TernaryRRFe<"cgdbra", 0xB3A9, GR64, FP64>;
+ def CGXBRA : TernaryRRFe<"cgxbra", 0xB3AA, GR64, FP128>;
+}
+
+// Convert a floating-point register value to an unsigned integer value.
+let Predicates = [FeatureFPExtension] in {
+ let Defs = [CC] in {
+ def CLFEBR : TernaryRRFe<"clfebr", 0xB39C, GR32, FP32>;
+ def CLFDBR : TernaryRRFe<"clfdbr", 0xB39D, GR32, FP64>;
+ def CLFXBR : TernaryRRFe<"clfxbr", 0xB39E, GR32, FP128>;
+
+ def CLGEBR : TernaryRRFe<"clgebr", 0xB3AC, GR64, FP32>;
+ def CLGDBR : TernaryRRFe<"clgdbr", 0xB3AD, GR64, FP64>;
+ def CLGXBR : TernaryRRFe<"clgxbr", 0xB3AE, GR64, FP128>;
+ }
+
+ def : Pat<(i32 (fp_to_uint FP32:$src)), (CLFEBR 5, FP32:$src, 0)>;
+ def : Pat<(i32 (fp_to_uint FP64:$src)), (CLFDBR 5, FP64:$src, 0)>;
+ def : Pat<(i32 (fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>;
+
+ def : Pat<(i64 (fp_to_uint FP32:$src)), (CLGEBR 5, FP32:$src, 0)>;
+ def : Pat<(i64 (fp_to_uint FP64:$src)), (CLGDBR 5, FP64:$src, 0)>;
+ def : Pat<(i64 (fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// We prefer generic instructions during isel, because they do not
+// clobber CC and therefore give the scheduler more freedom. In cases
+// where CC is actually useful, the SystemZElimCompare pass will try to
+// convert generic instructions into opcodes that also set CC. Note
+// that lcdfr / lpdfr / lndfr only affect the sign bit, and can therefore
+// be used with fp32 as well. This could be done for fp128, in which
+// case the operands would have to be tied.
+
+// Negation (Load Complement).
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+ def LCEBR : UnaryRRE<"lcebr", 0xB303, null_frag, FP32, FP32>;
+ def LCDBR : UnaryRRE<"lcdbr", 0xB313, null_frag, FP64, FP64>;
+ def LCXBR : UnaryRRE<"lcxbr", 0xB343, fneg, FP128, FP128>;
+}
+// Generic form, which does not set CC.
+def LCDFR : UnaryRRE<"lcdfr", 0xB373, fneg, FP64, FP64>;
+let isCodeGenOnly = 1 in
+ def LCDFR_32 : UnaryRRE<"lcdfr", 0xB373, fneg, FP32, FP32>;
+
+// Absolute value (Load Positive).
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+ def LPEBR : UnaryRRE<"lpebr", 0xB300, null_frag, FP32, FP32>;
+ def LPDBR : UnaryRRE<"lpdbr", 0xB310, null_frag, FP64, FP64>;
+ def LPXBR : UnaryRRE<"lpxbr", 0xB340, fabs, FP128, FP128>;
+}
+// Generic form, which does not set CC.
+def LPDFR : UnaryRRE<"lpdfr", 0xB370, fabs, FP64, FP64>;
+let isCodeGenOnly = 1 in
+ def LPDFR_32 : UnaryRRE<"lpdfr", 0xB370, fabs, FP32, FP32>;
+
+// Negative absolute value (Load Negative).
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+ def LNEBR : UnaryRRE<"lnebr", 0xB301, null_frag, FP32, FP32>;
+ def LNDBR : UnaryRRE<"lndbr", 0xB311, null_frag, FP64, FP64>;
+ def LNXBR : UnaryRRE<"lnxbr", 0xB341, fnabs, FP128, FP128>;
+}
+// Generic form, which does not set CC.
+def LNDFR : UnaryRRE<"lndfr", 0xB371, fnabs, FP64, FP64>;
+let isCodeGenOnly = 1 in
+ def LNDFR_32 : UnaryRRE<"lndfr", 0xB371, fnabs, FP32, FP32>;
+
+// Square root.
+def SQEBR : UnaryRRE<"sqebr", 0xB314, fsqrt, FP32, FP32>;
+def SQDBR : UnaryRRE<"sqdbr", 0xB315, fsqrt, FP64, FP64>;
+def SQXBR : UnaryRRE<"sqxbr", 0xB316, fsqrt, FP128, FP128>;
+
+def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<fsqrt>, FP32, 4>;
+def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<fsqrt>, FP64, 8>;
+
+// Round to an integer, with the second operand (modifier M3) specifying
+// the rounding mode. These forms always check for inexact conditions.
+def FIEBR : BinaryRRFe<"fiebr", 0xB357, FP32, FP32>;
+def FIDBR : BinaryRRFe<"fidbr", 0xB35F, FP64, FP64>;
+def FIXBR : BinaryRRFe<"fixbr", 0xB347, FP128, FP128>;
+
+// frint rounds according to the current mode (modifier 0) and detects
+// inexact conditions.
+def : Pat<(frint FP32:$src), (FIEBR 0, FP32:$src)>;
+def : Pat<(frint FP64:$src), (FIDBR 0, FP64:$src)>;
+def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>;
+
+let Predicates = [FeatureFPExtension] in {
+ // Extended forms of the FIxBR instructions. M4 can be set to 4
+ // to suppress detection of inexact conditions.
+ def FIEBRA : TernaryRRFe<"fiebra", 0xB357, FP32, FP32>;
+ def FIDBRA : TernaryRRFe<"fidbra", 0xB35F, FP64, FP64>;
+ def FIXBRA : TernaryRRFe<"fixbra", 0xB347, FP128, FP128>;
+
+ // fnearbyint is like frint but does not detect inexact conditions.
+ def : Pat<(fnearbyint FP32:$src), (FIEBRA 0, FP32:$src, 4)>;
+ def : Pat<(fnearbyint FP64:$src), (FIDBRA 0, FP64:$src, 4)>;
+ def : Pat<(fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>;
+
+ // floor is no longer allowed to raise an inexact condition,
+ // so restrict it to the cases where the condition can be suppressed.
+ // Mode 7 is round towards -inf.
+ def : Pat<(ffloor FP32:$src), (FIEBRA 7, FP32:$src, 4)>;
+ def : Pat<(ffloor FP64:$src), (FIDBRA 7, FP64:$src, 4)>;
+ def : Pat<(ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>;
+
+ // Same idea for ceil, where mode 6 is round towards +inf.
+ def : Pat<(fceil FP32:$src), (FIEBRA 6, FP32:$src, 4)>;
+ def : Pat<(fceil FP64:$src), (FIDBRA 6, FP64:$src, 4)>;
+ def : Pat<(fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>;
+
+ // Same idea for trunc, where mode 5 is round towards zero.
+ def : Pat<(ftrunc FP32:$src), (FIEBRA 5, FP32:$src, 4)>;
+ def : Pat<(ftrunc FP64:$src), (FIDBRA 5, FP64:$src, 4)>;
+ def : Pat<(ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>;
+
+ // Same idea for round, where mode 1 is round towards nearest with
+ // ties away from zero.
+ def : Pat<(fround FP32:$src), (FIEBRA 1, FP32:$src, 4)>;
+ def : Pat<(fround FP64:$src), (FIDBRA 1, FP64:$src, 4)>;
+ def : Pat<(fround FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+ let isCommutable = 1 in {
+ def AEBR : BinaryRRE<"aebr", 0xB30A, fadd, FP32, FP32>;
+ def ADBR : BinaryRRE<"adbr", 0xB31A, fadd, FP64, FP64>;
+ def AXBR : BinaryRRE<"axbr", 0xB34A, fadd, FP128, FP128>;
+ }
+ def AEB : BinaryRXE<"aeb", 0xED0A, fadd, FP32, load, 4>;
+ def ADB : BinaryRXE<"adb", 0xED1A, fadd, FP64, load, 8>;
+}
+
+// Subtraction.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+ def SEBR : BinaryRRE<"sebr", 0xB30B, fsub, FP32, FP32>;
+ def SDBR : BinaryRRE<"sdbr", 0xB31B, fsub, FP64, FP64>;
+ def SXBR : BinaryRRE<"sxbr", 0xB34B, fsub, FP128, FP128>;
+
+ def SEB : BinaryRXE<"seb", 0xED0B, fsub, FP32, load, 4>;
+ def SDB : BinaryRXE<"sdb", 0xED1B, fsub, FP64, load, 8>;
+}
+
+// Multiplication.
+let isCommutable = 1 in {
+ def MEEBR : BinaryRRE<"meebr", 0xB317, fmul, FP32, FP32>;
+ def MDBR : BinaryRRE<"mdbr", 0xB31C, fmul, FP64, FP64>;
+ def MXBR : BinaryRRE<"mxbr", 0xB34C, fmul, FP128, FP128>;
+}
+def MEEB : BinaryRXE<"meeb", 0xED17, fmul, FP32, load, 4>;
+def MDB : BinaryRXE<"mdb", 0xED1C, fmul, FP64, load, 8>;
+
+// f64 multiplication of two FP32 registers.
+def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>;
+def : Pat<(fmul (f64 (fpextend FP32:$src1)), (f64 (fpextend FP32:$src2))),
+ (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ FP32:$src1, subreg_r32), FP32:$src2)>;
+
+// f64 multiplication of an FP32 register and an f32 memory operand.
+def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
+def : Pat<(fmul (f64 (fpextend FP32:$src1)),
+ (f64 (extloadf32 bdxaddr12only:$addr))),
+ (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_r32),
+ bdxaddr12only:$addr)>;
+
+// f128 multiplication of two FP64 registers.
+def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
+def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))),
+ (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
+ FP64:$src1, subreg_h64), FP64:$src2)>;
+
+// f128 multiplication of an FP64 register and an f64 memory operand.
+def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
+def : Pat<(fmul (f128 (fpextend FP64:$src1)),
+ (f128 (extloadf64 bdxaddr12only:$addr))),
+ (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
+ bdxaddr12only:$addr)>;
+
+// Fused multiply-add.
+def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32>;
+def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64>;
+
+def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, load, 4>;
+def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, load, 8>;
+
+// Fused multiply-subtract.
+def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32>;
+def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64>;
+
+def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, load, 4>;
+def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, load, 8>;
+
+// Division.
+def DEBR : BinaryRRE<"debr", 0xB30D, fdiv, FP32, FP32>;
+def DDBR : BinaryRRE<"ddbr", 0xB31D, fdiv, FP64, FP64>;
+def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>;
+
+def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>;
+def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
+
+//===----------------------------------------------------------------------===//
+// Comparisons
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC], CCValues = 0xF in {
+ def CEBR : CompareRRE<"cebr", 0xB309, z_fcmp, FP32, FP32>;
+ def CDBR : CompareRRE<"cdbr", 0xB319, z_fcmp, FP64, FP64>;
+ def CXBR : CompareRRE<"cxbr", 0xB349, z_fcmp, FP128, FP128>;
+
+ def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
+ def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
+}
+
+// Test Data Class.
+let Defs = [CC], CCValues = 0xC in {
+ def TCEB : TestRXE<"tceb", 0xED10, z_tdc, FP32>;
+ def TCDB : TestRXE<"tcdb", 0xED11, z_tdc, FP64>;
+ def TCXB : TestRXE<"tcxb", 0xED12, z_tdc, FP128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 1 in {
+ def EFPC : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>;
+ def STFPC : StoreInherentS<"stfpc", 0xB29C, storei<int_s390_efpc>, 4>;
+
+ def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>;
+ def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu<int_s390_sfpc>, 4>;
+
+ def SFASR : SideEffectUnaryRRE<"sfasr", 0xB385, GR32, null_frag>;
+ def LFAS : SideEffectUnaryS<"lfas", 0xB2BD, null_frag, 4>;
+
+ def SRNMB : SideEffectAddressS<"srnmb", 0xB2B8, null_frag, shift12only>,
+ Requires<[FeatureFPExtension]>;
+ def SRNM : SideEffectAddressS<"srnm", 0xB299, null_frag, shift12only>;
+ def SRNMT : SideEffectAddressS<"srnmt", 0xB2B9, null_frag, shift12only>;
+}
+
+//===----------------------------------------------------------------------===//
+// Peepholes
+//===----------------------------------------------------------------------===//
+
+def : Pat<(f32 fpimmneg0), (LCDFR_32 (LZER))>;
+def : Pat<(f64 fpimmneg0), (LCDFR (LZDR))>;
+def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
new file mode 100644
index 000000000000..c727f486087e
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -0,0 +1,4083 @@
+//==- SystemZInstrFormats.td - SystemZ Instruction Formats --*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Basic SystemZ instruction definition
+//===----------------------------------------------------------------------===//
+
+class InstSystemZ<int size, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : Instruction {
+ let Namespace = "SystemZ";
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let Size = size; // Instruction length in bytes.
+ let Pattern = pattern;
+ let AsmString = asmstr;
+
+ // Some instructions come in pairs, one having a 12-bit displacement
+ // and the other having a 20-bit displacement. Both instructions in
+ // the pair have the same DispKey and their DispSizes are "12" and "20"
+ // respectively.
+ string DispKey = "";
+ string DispSize = "none";
+
+ // Many register-based <INSN>R instructions have a memory-based <INSN>
+ // counterpart. OpKey uniquely identifies such a pair, while OpType is
+ // "reg" for <INSN>R and "mem" for <INSN>.
+ string OpKey = "";
+ string OpType = "none";
+
+ // Many distinct-operands instructions have older 2-operand equivalents.
+ // NumOpsKey uniquely identifies one of these 2-operand and 3-operand pairs,
+ // with NumOpsValue being "2" or "3" as appropriate.
+ string NumOpsKey = "";
+ string NumOpsValue = "none";
+
+ // True if this instruction is a simple D(X,B) load of a register
+ // (with no sign or zero extension).
+ bit SimpleBDXLoad = 0;
+
+ // True if this instruction is a simple D(X,B) store of a register
+ // (with no truncation).
+ bit SimpleBDXStore = 0;
+
+ // True if this instruction has a 20-bit displacement field.
+ bit Has20BitOffset = 0;
+
+ // True if addresses in this instruction have an index register.
+ bit HasIndex = 0;
+
+ // True if this is a 128-bit pseudo instruction that combines two 64-bit
+ // operations.
+ bit Is128Bit = 0;
+
+ // The access size of all memory operands in bytes, or 0 if not known.
+ bits<5> AccessBytes = 0;
+
+ // If the instruction sets CC to a useful value, this gives the mask
+ // of all possible CC results. The mask has the same form as
+ // SystemZ::CCMASK_*.
+ bits<4> CCValues = 0;
+
+ // The subset of CCValues that have the same meaning as they would after
+ // a comparison of the first operand against zero.
+ bits<4> CompareZeroCCMask = 0;
+
+ // True if the instruction is conditional and if the CC mask operand
+ // comes first (as for BRC, etc.).
+ bit CCMaskFirst = 0;
+
+ // Similar, but true if the CC mask operand comes last (as for LOC, etc.).
+ bit CCMaskLast = 0;
+
+ // True if the instruction is the "logical" rather than "arithmetic" form,
+ // in cases where a distinction exists.
+ bit IsLogical = 0;
+
+ let TSFlags{0} = SimpleBDXLoad;
+ let TSFlags{1} = SimpleBDXStore;
+ let TSFlags{2} = Has20BitOffset;
+ let TSFlags{3} = HasIndex;
+ let TSFlags{4} = Is128Bit;
+ let TSFlags{9-5} = AccessBytes;
+ let TSFlags{13-10} = CCValues;
+ let TSFlags{17-14} = CompareZeroCCMask;
+ let TSFlags{18} = CCMaskFirst;
+ let TSFlags{19} = CCMaskLast;
+ let TSFlags{20} = IsLogical;
+}
+
+//===----------------------------------------------------------------------===//
+// Mappings between instructions
+//===----------------------------------------------------------------------===//
+
+// Return the version of an instruction that has an unsigned 12-bit
+// displacement.
+def getDisp12Opcode : InstrMapping {
+ let FilterClass = "InstSystemZ";
+ let RowFields = ["DispKey"];
+ let ColFields = ["DispSize"];
+ let KeyCol = ["20"];
+ let ValueCols = [["12"]];
+}
+
+// Return the version of an instruction that has a signed 20-bit displacement.
+def getDisp20Opcode : InstrMapping {
+ let FilterClass = "InstSystemZ";
+ let RowFields = ["DispKey"];
+ let ColFields = ["DispSize"];
+ let KeyCol = ["12"];
+ let ValueCols = [["20"]];
+}
+
+// Return the memory form of a register instruction.
+def getMemOpcode : InstrMapping {
+ let FilterClass = "InstSystemZ";
+ let RowFields = ["OpKey"];
+ let ColFields = ["OpType"];
+ let KeyCol = ["reg"];
+ let ValueCols = [["mem"]];
+}
+
+// Return the 3-operand form of a 2-operand instruction.
+def getThreeOperandOpcode : InstrMapping {
+ let FilterClass = "InstSystemZ";
+ let RowFields = ["NumOpsKey"];
+ let ColFields = ["NumOpsValue"];
+ let KeyCol = ["2"];
+ let ValueCols = [["3"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+//
+// Formats are specified using operand field declarations of the form:
+//
+// bits<4> Rn : register input or output for operand n
+// bits<5> Vn : vector register input or output for operand n
+// bits<m> In : immediate value of width m for operand n
+// bits<m> BDn : address operand n, which has a base and a displacement
+// bits<m> XBDn : address operand n, which has an index, a base and a
+// displacement
+// bits<m> VBDn : address operand n, which has a vector index, a base and a
+// displacement
+// bits<4> Xn : index register for address operand n
+// bits<4> Mn : mode value for operand n
+//
+// The operand numbers ("n" in the list above) follow the architecture manual.
+// Assembly operands sometimes have a different order; in particular, R3
+// is often written between operands 1 and 2.
+//
+//===----------------------------------------------------------------------===//
+
+class InstE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<2, outs, ins, asmstr, pattern> {
+ field bits<16> Inst;
+ field bits<16> SoftFail = 0;
+
+ let Inst = op;
+}
+
+class InstI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<2, outs, ins, asmstr, pattern> {
+ field bits<16> Inst;
+ field bits<16> SoftFail = 0;
+
+ bits<8> I1;
+
+ let Inst{15-8} = op;
+ let Inst{7-0} = I1;
+}
+
+class InstIE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> I1;
+ bits<4> I2;
+
+ let Inst{31-16} = op;
+ let Inst{15-8} = 0;
+ let Inst{7-4} = I1;
+ let Inst{3-0} = I2;
+}
+
+class InstMII<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> M1;
+ bits<12> RI2;
+ bits<24> RI3;
+
+ let Inst{47-40} = op;
+ let Inst{39-36} = M1;
+ let Inst{35-24} = RI2;
+ let Inst{23-0} = RI3;
+}
+
+class InstRIa<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<16> I2;
+
+ let Inst{31-24} = op{11-4};
+ let Inst{23-20} = R1;
+ let Inst{19-16} = op{3-0};
+ let Inst{15-0} = I2;
+}
+
+class InstRIb<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<16> RI2;
+
+ let Inst{31-24} = op{11-4};
+ let Inst{23-20} = R1;
+ let Inst{19-16} = op{3-0};
+ let Inst{15-0} = RI2;
+}
+
+class InstRIc<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> M1;
+ bits<16> RI2;
+
+ let Inst{31-24} = op{11-4};
+ let Inst{23-20} = M1;
+ let Inst{19-16} = op{3-0};
+ let Inst{15-0} = RI2;
+}
+
+class InstRIEa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<16> I2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = 0;
+ let Inst{31-16} = I2;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRIEb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> M3;
+ bits<16> RI4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R2;
+ let Inst{31-16} = RI4;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRIEc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<8> I2;
+ bits<4> M3;
+ bits<16> RI4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = M3;
+ let Inst{31-16} = RI4;
+ let Inst{15-8} = I2;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRIEd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<16> I2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R3;
+ let Inst{31-16} = I2;
+ let Inst{15-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRIEe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<16> RI2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R3;
+ let Inst{31-16} = RI2;
+ let Inst{15-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRIEf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<8> I3;
+ bits<8> I4;
+ bits<8> I5;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R2;
+ let Inst{31-24} = I3;
+ let Inst{23-16} = I4;
+ let Inst{15-8} = I5;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRIEg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> M3;
+ bits<16> I2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = M3;
+ let Inst{31-16} = I2;
+ let Inst{15-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRILa<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<32> I2;
+
+ let Inst{47-40} = op{11-4};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = op{3-0};
+ let Inst{31-0} = I2;
+}
+
+class InstRILb<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<32> RI2;
+
+ let Inst{47-40} = op{11-4};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = op{3-0};
+ let Inst{31-0} = RI2;
+}
+
+class InstRILc<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> M1;
+ bits<32> RI2;
+
+ let Inst{47-40} = op{11-4};
+ let Inst{39-36} = M1;
+ let Inst{35-32} = op{3-0};
+ let Inst{31-0} = RI2;
+}
+
+class InstRIS<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<8> I2;
+ bits<4> M3;
+ bits<16> BD4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = M3;
+ let Inst{31-16} = BD4;
+ let Inst{15-8} = I2;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRR<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<2, outs, ins, asmstr, pattern> {
+ field bits<16> Inst;
+ field bits<16> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+
+ let Inst{15-8} = op;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRRD<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<4> R2;
+
+ let Inst{31-16} = op;
+ let Inst{15-12} = R1;
+ let Inst{11-8} = 0;
+ let Inst{7-4} = R3;
+ let Inst{3-0} = R2;
+}
+
+class InstRRE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+
+ let Inst{31-16} = op;
+ let Inst{15-8} = 0;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRRFa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> R3;
+ bits<4> M4;
+
+ let Inst{31-16} = op;
+ let Inst{15-12} = R3;
+ let Inst{11-8} = M4;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRRFb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> R3;
+ bits<4> M4;
+
+ let Inst{31-16} = op;
+ let Inst{15-12} = R3;
+ let Inst{11-8} = M4;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRRFc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> M3;
+
+ let Inst{31-16} = op;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = 0;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRRFe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> M3;
+ bits<4> M4;
+
+ let Inst{31-16} = op;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = M4;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRRS<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> M3;
+ bits<16> BD4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R2;
+ let Inst{31-16} = BD4;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRXa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<20> XBD2;
+
+ let Inst{31-24} = op;
+ let Inst{23-20} = R1;
+ let Inst{19-0} = XBD2;
+
+ let HasIndex = 1;
+}
+
+class InstRXb<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> M1;
+ bits<20> XBD2;
+
+ let Inst{31-24} = op;
+ let Inst{23-20} = M1;
+ let Inst{19-0} = XBD2;
+
+ let HasIndex = 1;
+}
+
+class InstRXE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<20> XBD2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-16} = XBD2;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = 0;
+ let Inst{7-0} = op{7-0};
+
+ let HasIndex = 1;
+}
+
+class InstRXF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<20> XBD2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R3;
+ let Inst{35-16} = XBD2;
+ let Inst{15-12} = R1;
+ let Inst{11-8} = 0;
+ let Inst{7-0} = op{7-0};
+
+ let HasIndex = 1;
+}
+
+class InstRXYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<28> XBD2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-8} = XBD2;
+ let Inst{7-0} = op{7-0};
+
+ let Has20BitOffset = 1;
+ let HasIndex = 1;
+}
+
+class InstRXYb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> M1;
+ bits<28> XBD2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = M1;
+ let Inst{35-8} = XBD2;
+ let Inst{7-0} = op{7-0};
+
+ let Has20BitOffset = 1;
+ let HasIndex = 1;
+}
+
+class InstRSa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<16> BD2;
+
+ let Inst{31-24} = op;
+ let Inst{23-20} = R1;
+ let Inst{19-16} = R3;
+ let Inst{15-0} = BD2;
+}
+
+class InstRSb<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> M3;
+ bits<16> BD2;
+
+ let Inst{31-24} = op;
+ let Inst{23-20} = R1;
+ let Inst{19-16} = M3;
+ let Inst{15-0} = BD2;
+}
+
+class InstRSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> { // RSI layout: 32-bit encoding, 8-bit opcode, R1, R3 and a 16-bit RI2 field.
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<16> RI2;
+
+ let Inst{31-24} = op; // whole opcode lives in the top byte
+ let Inst{23-20} = R1;
+ let Inst{19-16} = R3;
+ let Inst{15-0} = RI2; // low halfword carries RI2 (same position BD2 has in InstRSa)
+}
+
+class InstRSYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // RSYa layout: 48-bit encoding, split 16-bit opcode, R1, R3 and a 24-bit BD2 field.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<24> BD2;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R3;
+ let Inst{31-8} = BD2;
+ let Inst{7-0} = op{7-0}; // low opcode byte
+
+ let Has20BitOffset = 1;
+}
+
+class InstRSYb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // RSYb layout: same bit positions as InstRSYa, but bits 35-32 hold M3 instead of R3.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> M3;
+ bits<24> BD2;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = R1;
+ let Inst{35-32} = M3;
+ let Inst{31-8} = BD2;
+ let Inst{7-0} = op{7-0}; // low opcode byte
+
+ let Has20BitOffset = 1;
+}
+
+class InstSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> { // SI layout: 32-bit encoding, 8-bit opcode, 8-bit I2 and a 16-bit BD1 field.
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<8> I2;
+
+ let Inst{31-24} = op; // whole opcode lives in the top byte
+ let Inst{23-16} = I2; // I2 is encoded ahead of BD1
+ let Inst{15-0} = BD1;
+}
+
+class InstSIL<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // SIL layout: 48-bit encoding, contiguous 16-bit opcode, then BD1 and a 16-bit I2.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<16> I2;
+
+ let Inst{47-32} = op; // opcode is not split in this layout
+ let Inst{31-16} = BD1;
+ let Inst{15-0} = I2;
+}
+
+class InstSIY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // SIY layout: 48-bit encoding, split 16-bit opcode, 8-bit I2 and a 24-bit BD1 field.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<24> BD1;
+ bits<8> I2;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-32} = I2;
+ let Inst{31-8} = BD1;
+ let Inst{7-0} = op{7-0}; // low opcode byte
+
+ let Has20BitOffset = 1;
+}
+
+class InstSMI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // SMI layout: 48-bit encoding with only an 8-bit opcode, then M1, BD3 and RI2.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> M1;
+ bits<16> RI2;
+ bits<16> BD3;
+
+ let Inst{47-40} = op; // whole opcode lives in the top byte
+ let Inst{39-36} = M1;
+ let Inst{35-32} = 0; // fixed at zero in this layout
+ let Inst{31-16} = BD3; // BD3 is encoded ahead of RI2
+ let Inst{15-0} = RI2;
+}
+
+class InstSSa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // SSa layout: 48-bit encoding, 8-bit opcode, a 24-bit BDL1 field and a 16-bit BD2 field.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<24> BDL1;
+ bits<16> BD2;
+
+ let Inst{47-40} = op; // whole opcode lives in the top byte
+ let Inst{39-16} = BDL1;
+ let Inst{15-0} = BD2;
+}
+
+class InstSSd<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // SSd layout: 48-bit encoding where the 20-bit RBD1 operand is split around R3.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> RBD1;
+ bits<16> BD2;
+ bits<4> R3;
+
+ let Inst{47-40} = op; // whole opcode lives in the top byte
+ let Inst{39-36} = RBD1{19-16}; // top nibble of RBD1
+ let Inst{35-32} = R3; // R3 sits between the two RBD1 pieces
+ let Inst{31-16} = RBD1{15-0}; // remaining 16 bits of RBD1
+ let Inst{15-0} = BD2;
+}
+
+class InstSSe<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // SSe layout: 48-bit encoding, 8-bit opcode, R1, R3 and two 16-bit address fields.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<16> BD2;
+ bits<4> R3;
+ bits<16> BD4;
+
+ let Inst{47-40} = op; // whole opcode lives in the top byte
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R3;
+ let Inst{31-16} = BD2;
+ let Inst{15-0} = BD4;
+}
+
+class InstSSE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // SSE layout: 48-bit encoding, contiguous 16-bit opcode, then BD1 and BD2.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<16> BD2;
+
+ let Inst{47-32} = op; // opcode is not split in this layout
+ let Inst{31-16} = BD1;
+ let Inst{15-0} = BD2;
+}
+
+class InstSSF<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // SSF layout: 48-bit encoding with a 12-bit opcode split around R3.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<16> BD2;
+ bits<4> R3;
+
+ let Inst{47-40} = op{11-4}; // upper 8 opcode bits
+ let Inst{39-36} = R3; // R3 sits between the two opcode pieces
+ let Inst{35-32} = op{3-0}; // lower 4 opcode bits
+ let Inst{31-16} = BD1;
+ let Inst{15-0} = BD2;
+}
+
+class InstS<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> { // S layout: 32-bit encoding, 16-bit opcode followed by a 16-bit BD2 field.
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<16> BD2;
+
+ let Inst{31-16} = op; // opcode fills the upper halfword
+ let Inst{15-0} = BD2;
+}
+
+class InstVRIa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRIa layout: 48-bit encoding with one 5-bit V1, a 16-bit I2 and M3.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<16> I2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of V1
+ let Inst{35-32} = 0; // fixed at zero in this layout
+ let Inst{31-16} = I2;
+ let Inst{15-12} = M3;
+ let Inst{11} = V1{4}; // fifth (top) bit of V1 is stored separately
+ let Inst{10-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRIb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRIb layout: 48-bit encoding with V1, two 8-bit immediates (I2, I3) and M4.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<8> I2;
+ bits<8> I3;
+ bits<4> M4;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of V1
+ let Inst{35-32} = 0; // fixed at zero in this layout
+ let Inst{31-24} = I2;
+ let Inst{23-16} = I3;
+ let Inst{15-12} = M4;
+ let Inst{11} = V1{4}; // fifth (top) bit of V1 is stored separately
+ let Inst{10-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRIc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRIc layout: 48-bit encoding with V1, V3, a 16-bit I2 and M4.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V3;
+ bits<16> I2;
+ bits<4> M4;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of each vector operand go here...
+ let Inst{35-32} = V3{3-0};
+ let Inst{31-16} = I2;
+ let Inst{15-12} = M4;
+ let Inst{11} = V1{4}; // ...and their fifth bits are collected in bits 11-10
+ let Inst{10} = V3{4};
+ let Inst{9-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRId<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRId layout: 48-bit encoding with V1, V2, V3, an 8-bit I4 and M5.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<5> V3;
+ bits<8> I4;
+ bits<4> M5;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of each vector operand go here...
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-28} = V3{3-0};
+ let Inst{27-24} = 0; // fixed at zero in this layout
+ let Inst{23-16} = I4;
+ let Inst{15-12} = M5;
+ let Inst{11} = V1{4}; // ...and their fifth bits are collected in bits 11-9
+ let Inst{10} = V2{4};
+ let Inst{9} = V3{4};
+ let Inst{8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRIe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRIe layout: 48-bit encoding with V1, V2, a 12-bit I3, M4 and M5.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<12> I3;
+ bits<4> M4;
+ bits<4> M5;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of each vector operand go here...
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-20} = I3;
+ let Inst{19-16} = M5; // note: M5 is encoded ahead of M4
+ let Inst{15-12} = M4;
+ let Inst{11} = V1{4}; // ...and their fifth bits are collected in bits 11-10
+ let Inst{10} = V2{4};
+ let Inst{9-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+// Depending on the instruction mnemonic, certain bits may be or-ed into
+// the M4 value provided as explicit operand. These are passed as m4or.
+class InstVRRa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+ bits<4> m4or = 0>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRRa layout: 48-bit encoding with V1, V2 and three mask fields M3, M4, M5.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<4> M3;
+ bits<4> M4;
+ bits<4> M5;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of each vector operand go here...
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-24} = 0; // fixed at zero in this layout
+ let Inst{23-20} = M5;
+ let Inst{19} = !if (!eq (m4or{3}, 1), 1, M4{3}); // per bit: forced to 1 when set in m4or, else taken from M4
+ let Inst{18} = !if (!eq (m4or{2}, 1), 1, M4{2});
+ let Inst{17} = !if (!eq (m4or{1}, 1), 1, M4{1});
+ let Inst{16} = !if (!eq (m4or{0}, 1), 1, M4{0});
+ let Inst{15-12} = M3;
+ let Inst{11} = V1{4}; // ...and their fifth bits are collected in bits 11-10
+ let Inst{10} = V2{4};
+ let Inst{9-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+// Depending on the instruction mnemonic, certain bits may be or-ed into
+// the M5 value provided as explicit operand. These are passed as m5or.
+class InstVRRb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+ bits<4> m5or = 0>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRRb layout: 48-bit encoding with V1, V2, V3 and mask fields M4, M5.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<5> V3;
+ bits<4> M4;
+ bits<4> M5;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of each vector operand go here...
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-28} = V3{3-0};
+ let Inst{27-24} = 0; // fixed at zero in this layout
+ let Inst{23} = !if (!eq (m5or{3}, 1), 1, M5{3}); // per bit: forced to 1 when set in m5or, else taken from M5
+ let Inst{22} = !if (!eq (m5or{2}, 1), 1, M5{2});
+ let Inst{21} = !if (!eq (m5or{1}, 1), 1, M5{1});
+ let Inst{20} = !if (!eq (m5or{0}, 1), 1, M5{0});
+ let Inst{19-16} = 0; // fixed at zero in this layout
+ let Inst{15-12} = M4;
+ let Inst{11} = V1{4}; // ...and their fifth bits are collected in bits 11-9
+ let Inst{10} = V2{4};
+ let Inst{9} = V3{4};
+ let Inst{8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRRc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRRc layout: 48-bit encoding with V1, V2, V3 and mask fields M4, M5, M6.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<5> V3;
+ bits<4> M4;
+ bits<4> M5;
+ bits<4> M6;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of each vector operand go here...
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-28} = V3{3-0};
+ let Inst{27-24} = 0; // fixed at zero in this layout
+ let Inst{23-20} = M6; // masks are laid out in descending order: M6, M5, M4
+ let Inst{19-16} = M5;
+ let Inst{15-12} = M4;
+ let Inst{11} = V1{4}; // ...and their fifth bits are collected in bits 11-9
+ let Inst{10} = V2{4};
+ let Inst{9} = V3{4};
+ let Inst{8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+// Depending on the instruction mnemonic, certain bits may be or-ed into
+// the M6 value provided as explicit operand. These are passed as m6or.
+class InstVRRd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+ bits<4> m6or = 0>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRRd layout: 48-bit encoding with four vector operands V1-V4 plus M5 and M6.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<5> V3;
+ bits<5> V4;
+ bits<4> M5;
+ bits<4> M6;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of V1-V3 go here...
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-28} = V3{3-0};
+ let Inst{27-24} = M5;
+ let Inst{23} = !if (!eq (m6or{3}, 1), 1, M6{3}); // per bit: forced to 1 when set in m6or, else taken from M6
+ let Inst{22} = !if (!eq (m6or{2}, 1), 1, M6{2});
+ let Inst{21} = !if (!eq (m6or{1}, 1), 1, M6{1});
+ let Inst{20} = !if (!eq (m6or{0}, 1), 1, M6{0});
+ let Inst{19-16} = 0; // fixed at zero in this layout
+ let Inst{15-12} = V4{3-0}; // V4's low four bits sit after the mask fields
+ let Inst{11} = V1{4}; // ...and all four fifth bits are collected in bits 11-8
+ let Inst{10} = V2{4};
+ let Inst{9} = V3{4};
+ let Inst{8} = V4{4};
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRRe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRRe layout: same operands as InstVRRd, but M6 is at bits 27-24 and M5 at bits 19-16.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<5> V3;
+ bits<5> V4;
+ bits<4> M5;
+ bits<4> M6;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of V1-V3 go here...
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-28} = V3{3-0};
+ let Inst{27-24} = M6;
+ let Inst{23-20} = 0; // fixed at zero in this layout
+ let Inst{19-16} = M5;
+ let Inst{15-12} = V4{3-0}; // V4's low four bits sit after the mask fields
+ let Inst{11} = V1{4}; // ...and all four fifth bits are collected in bits 11-8
+ let Inst{10} = V2{4};
+ let Inst{9} = V3{4};
+ let Inst{8} = V4{4};
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRRf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRRf layout: 48-bit encoding with one 5-bit V1 and two 4-bit fields R2, R3.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<4> R2;
+ bits<4> R3;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of V1
+ let Inst{35-32} = R2;
+ let Inst{31-28} = R3;
+ let Inst{27-12} = 0; // fixed at zero in this layout
+ let Inst{11} = V1{4}; // fifth (top) bit of V1 is stored separately
+ let Inst{10-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRSa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRSa layout: 48-bit encoding with V1, V3, a 16-bit BD2 field and M4.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<16> BD2;
+ bits<5> V3;
+ bits<4> M4;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of each vector operand go here...
+ let Inst{35-32} = V3{3-0};
+ let Inst{31-16} = BD2;
+ let Inst{15-12} = M4;
+ let Inst{11} = V1{4}; // ...and their fifth bits are collected in bits 11-10
+ let Inst{10} = V3{4};
+ let Inst{9-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRSb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRSb layout: like InstVRSa, but bits 35-32 hold a 4-bit R3 instead of a vector operand.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<16> BD2;
+ bits<4> R3;
+ bits<4> M4;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of V1
+ let Inst{35-32} = R3;
+ let Inst{31-16} = BD2;
+ let Inst{15-12} = M4;
+ let Inst{11} = V1{4}; // fifth (top) bit of V1 is stored separately
+ let Inst{10-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRSc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRSc layout: like InstVRSa, but bits 39-36 hold a 4-bit R1 instead of a vector operand.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<16> BD2;
+ bits<5> V3;
+ bits<4> M4;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = R1;
+ let Inst{35-32} = V3{3-0}; // low four bits of V3
+ let Inst{31-16} = BD2;
+ let Inst{15-12} = M4;
+ let Inst{11} = 0; // R1 is only 4 bits wide, so this extension bit is fixed at zero
+ let Inst{10} = V3{4}; // fifth (top) bit of V3 is stored separately
+ let Inst{9-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRV<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRV layout: 48-bit encoding with V1, a 21-bit VBD2 field and M3.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<21> VBD2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of V1
+ let Inst{35-16} = VBD2{19-0}; // low 20 bits of VBD2
+ let Inst{15-12} = M3;
+ let Inst{11} = V1{4}; // fifth (top) bit of V1 is stored separately
+ let Inst{10} = VBD2{20}; // top bit of VBD2 is stored separately as well
+ let Inst{9-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+class InstVRX<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> { // VRX layout: 48-bit encoding with V1, a 20-bit XBD2 field and M3.
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<20> XBD2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8}; // high opcode byte
+ let Inst{39-36} = V1{3-0}; // low four bits of V1
+ let Inst{35-16} = XBD2;
+ let Inst{15-12} = M3;
+ let Inst{11} = V1{4}; // fifth (top) bit of V1 is stored separately
+ let Inst{10-8} = 0; // fixed at zero in this layout
+ let Inst{7-0} = op{7-0}; // low opcode byte
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction classes for .insn directives
+//===----------------------------------------------------------------------===//
+
+class DirectiveInsnE<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstE<0, outs, ins, asmstr, pattern> { // .insn variant of InstE; the 0 opcode is a placeholder, replaced below.
+ bits<16> enc;
+
+ let Inst = enc; // the whole encoding is taken from enc
+}
+
+class DirectiveInsnRI<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRIa<0, outs, ins, asmstr, pattern> { // .insn variant of InstRIa (defined outside this view); opcode passed as 0.
+ bits<32> enc;
+
+ let Inst{31-24} = enc{31-24}; // these two ranges are copied from enc; presumably InstRIa's opcode bits - confirm against InstRIa
+ let Inst{19-16} = enc{19-16};
+}
+
+class DirectiveInsnRIE<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRIEd<0, outs, ins, asmstr, pattern> { // .insn variant of InstRIEd (defined outside this view); opcode passed as 0.
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40}; // first and last byte are copied from enc; presumably the split opcode - confirm against InstRIEd
+ let Inst{7-0} = enc{7-0};
+}
+
+class DirectiveInsnRIL<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRILa<0, outs, ins, asmstr, pattern> { // .insn variant of InstRILa (defined outside this view); opcode passed as 0.
+ bits<48> enc;
+ string type; // not referenced inside this class; NOTE(review): check whether any user sets or reads it
+
+ let Inst{47-40} = enc{47-40}; // these two ranges are copied from enc; presumably InstRILa's opcode bits - confirm against InstRILa
+ let Inst{35-32} = enc{35-32};
+}
+
+class DirectiveInsnRIS<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRIS<0, outs, ins, asmstr, pattern> { // .insn variant of InstRIS (defined outside this view); opcode passed as 0.
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40}; // first and last byte are copied from enc; presumably the split opcode - confirm against InstRIS
+ let Inst{7-0} = enc{7-0};
+}
+
+class DirectiveInsnRR<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRR<0, outs, ins, asmstr, pattern> { // .insn variant of InstRR (defined outside this view); opcode passed as 0.
+ bits<16> enc;
+
+ let Inst{15-8} = enc{15-8}; // top byte is copied from enc; presumably InstRR's opcode byte - confirm against InstRR
+}
+
+class DirectiveInsnRRE<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRRE<0, outs, ins, asmstr, pattern> { // .insn variant of InstRRE (defined outside this view); opcode passed as 0.
+ bits<32> enc;
+
+ let Inst{31-16} = enc{31-16}; // upper halfword is copied from enc; presumably InstRRE's opcode - confirm against InstRRE
+}
+
+class DirectiveInsnRRF<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRRFa<0, outs, ins, asmstr, pattern> { // .insn variant of InstRRFa (defined outside this view); opcode passed as 0.
+ bits<32> enc;
+
+ let Inst{31-16} = enc{31-16}; // upper halfword is copied from enc; presumably InstRRFa's opcode - confirm against InstRRFa
+}
+
+class DirectiveInsnRRS<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRRS<0, outs, ins, asmstr, pattern> { // .insn variant of InstRRS (defined outside this view); opcode passed as 0.
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40}; // first and last byte are copied from enc; presumably the split opcode - confirm against InstRRS
+ let Inst{7-0} = enc{7-0};
+}
+
+class DirectiveInsnRS<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRSa<0, outs, ins, asmstr, pattern> { // .insn variant of InstRSa; the 0 opcode is a placeholder, replaced below.
+ bits<32> enc;
+
+ let Inst{31-24} = enc{31-24}; // overrides the byte where InstRSa places op
+}
+
+// RSE is like RSY except with a 12 bit displacement (instead of 20).
+class DirectiveInsnRSE<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRSYa<6, outs, ins, asmstr, pattern> { // the opcode argument (6 here, 0 in sibling classes) has no effect: both op fields are overridden below
+ bits <48> enc;
+
+ let Inst{47-40} = enc{47-40}; // overrides the high opcode byte set by InstRSYa
+ let Inst{31-16} = BD2{15-0}; // only the low 16 bits of the 24-bit BD2 are encoded...
+ let Inst{15-8} = 0; // ...and the byte that held the rest of BD2 in InstRSYa is zeroed
+ let Inst{7-0} = enc{7-0}; // overrides the low opcode byte set by InstRSYa
+}
+
+class DirectiveInsnRSI<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRSI<0, outs, ins, asmstr, pattern> { // .insn variant of InstRSI; the 0 opcode is a placeholder, replaced below.
+ bits<32> enc;
+
+ let Inst{31-24} = enc{31-24}; // overrides the byte where InstRSI places op
+}
+
+class DirectiveInsnRSY<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRSYa<0, outs, ins, asmstr, pattern> { // .insn variant of InstRSYa; the 0 opcode is a placeholder, replaced below.
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40}; // overrides the high opcode byte set by InstRSYa
+ let Inst{7-0} = enc{7-0}; // overrides the low opcode byte set by InstRSYa
+}
+
+class DirectiveInsnRX<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRXa<0, outs, ins, asmstr, pattern> { // .insn variant of InstRXa (defined outside this view); opcode passed as 0.
+ bits<32> enc;
+
+ let Inst{31-24} = enc{31-24}; // top byte is copied from enc; presumably InstRXa's opcode byte - confirm against InstRXa
+}
+
+class DirectiveInsnRXE<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRXE<0, outs, ins, asmstr, pattern> { // .insn variant of InstRXE (defined outside this view); opcode passed as 0.
+ bits<48> enc;
+
+ let M3 = 0; // M3 (inherited from InstRXE) is pinned to 0 rather than taken from an operand
+
+ let Inst{47-40} = enc{47-40}; // first and last byte are copied from enc; presumably the split opcode - confirm against InstRXE
+ let Inst{7-0} = enc{7-0};
+}
+
+class DirectiveInsnRXF<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRXF<0, outs, ins, asmstr, pattern> { // .insn variant of InstRXF; the 0 opcode is a placeholder, replaced below.
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40}; // overrides the high opcode byte set by InstRXF
+ let Inst{7-0} = enc{7-0}; // overrides the low opcode byte set by InstRXF
+}
+
+class DirectiveInsnRXY<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstRXYa<0, outs, ins, asmstr, pattern> { // .insn variant of InstRXYa; the 0 opcode is a placeholder, replaced below.
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40}; // overrides the high opcode byte set by InstRXYa
+ let Inst{7-0} = enc{7-0}; // overrides the low opcode byte set by InstRXYa
+}
+
+class DirectiveInsnS<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstS<0, outs, ins, asmstr, pattern> { // .insn variant of InstS; the 0 opcode is a placeholder, replaced below.
+ bits<32> enc;
+
+ let Inst{31-16} = enc{31-16}; // overrides the halfword where InstS places op
+}
+
+class DirectiveInsnSI<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSI<0, outs, ins, asmstr, pattern> { // .insn variant of InstSI; the 0 opcode is a placeholder, replaced below.
+ bits<32> enc;
+
+ let Inst{31-24} = enc{31-24}; // overrides the byte where InstSI places op
+}
+
+class DirectiveInsnSIY<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSIY<0, outs, ins, asmstr, pattern> { // .insn variant of InstSIY; the 0 opcode is a placeholder, replaced below.
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40}; // overrides the high opcode byte set by InstSIY
+ let Inst{7-0} = enc{7-0}; // overrides the low opcode byte set by InstSIY
+}
+
+class DirectiveInsnSIL<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSIL<0, outs, ins, asmstr, pattern> { // .insn variant of InstSIL; the 0 opcode is a placeholder, replaced below.
+ bits<48> enc;
+
+ let Inst{47-32} = enc{47-32}; // overrides the halfword where InstSIL places op
+}
+
+class DirectiveInsnSS<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSSd<0, outs, ins, asmstr, pattern> { // .insn variant built on the InstSSd layout; the 0 opcode is a placeholder, replaced below.
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40}; // overrides the byte where InstSSd places op
+}
+
+class DirectiveInsnSSE<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSSE<0, outs, ins, asmstr, pattern> { // .insn variant of InstSSE; the 0 opcode is a placeholder, replaced below.
+ bits<48> enc;
+
+ let Inst{47-32} = enc{47-32}; // overrides the halfword where InstSSE places op
+}
+
+class DirectiveInsnSSF<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSSF<0, outs, ins, asmstr, pattern> { // .insn variant of InstSSF; the 0 opcode is a placeholder, replaced below.
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40}; // overrides the upper 8 opcode bits set by InstSSF
+ let Inst{35-32} = enc{35-32}; // overrides the lower 4 opcode bits set by InstSSF
+}
+
+//===----------------------------------------------------------------------===//
+// Variants of instructions with condition mask
+//===----------------------------------------------------------------------===//
+//
+// For instructions using a condition mask (e.g. conditional branches,
+// compare-and-branch instructions, or conditional move instructions),
+// we generally need to create multiple instruction patterns:
+//
+// - One used for code generation, which encodes the condition mask as an
+// MI operand, but writes out an extended mnemonic for better readability.
+// - One pattern for the base form of the instruction with an explicit
+// condition mask (encoded as a plain integer MI operand).
+// - Specific patterns for each extended mnemonic, where the condition mask
+// is implied by the pattern name and not otherwise encoded at all.
+//
+// We need the latter primarily for the assembler and disassembler, since the
+// assembler parser is not able to decode part of an instruction mnemonic
+// into an operand. Thus we provide separate patterns for each mnemonic.
+//
+// Note that in some cases there are two different mnemonics for the same
+// condition mask. In this case we cannot have both instructions available
+// to the disassembler at the same time since the encodings are not distinct.
+// Therefore the alternate forms are marked isAsmParserOnly.
+//
+// We don't make one of the two names an alias of the other because
+// we need the custom parsing routines to select the correct register class.
+//
+// This section provides helpers for generating the specific forms.
+//
+//===----------------------------------------------------------------------===//
+
+// A class to describe a variant of an instruction with condition mask.
+class CondVariant<bits<4> ccmaskin, string suffixin, bit alternatein> { // each template argument is copied unchanged into the field of the matching name
+ // The fixed condition mask to use.
+ bits<4> ccmask = ccmaskin;
+
+ // The suffix to use for the extended assembler mnemonic.
+ string suffix = suffixin;
+
+ // Whether this is an alternate that needs to be marked isAsmParserOnly.
+ bit alternate = alternatein;
+}
+
+// Condition mask 15 means "always true", which is used to define
+// unconditional branches as a variant of conditional branches.
+def CondAlways : CondVariant<15, "", 0>; // empty suffix, not an alternate
+
+// Condition masks for general instructions that can set all 4 bits.
+def CondVariantO : CondVariant<1, "o", 0>;
+def CondVariantH : CondVariant<2, "h", 0>;
+def CondVariantP : CondVariant<2, "p", 1>; // alternate spelling of mask 2 (primary: "h")
+def CondVariantNLE : CondVariant<3, "nle", 0>;
+def CondVariantL : CondVariant<4, "l", 0>;
+def CondVariantM : CondVariant<4, "m", 1>; // alternate spelling of mask 4 (primary: "l")
+def CondVariantNHE : CondVariant<5, "nhe", 0>;
+def CondVariantLH : CondVariant<6, "lh", 0>;
+def CondVariantNE : CondVariant<7, "ne", 0>;
+def CondVariantNZ : CondVariant<7, "nz", 1>; // alternate spelling of mask 7 (primary: "ne")
+def CondVariantE : CondVariant<8, "e", 0>;
+def CondVariantZ : CondVariant<8, "z", 1>; // alternate spelling of mask 8 (primary: "e")
+def CondVariantNLH : CondVariant<9, "nlh", 0>;
+def CondVariantHE : CondVariant<10, "he", 0>;
+def CondVariantNL : CondVariant<11, "nl", 0>;
+def CondVariantNM : CondVariant<11, "nm", 1>; // alternate spelling of mask 11 (primary: "nl")
+def CondVariantLE : CondVariant<12, "le", 0>;
+def CondVariantNH : CondVariant<13, "nh", 0>;
+def CondVariantNP : CondVariant<13, "np", 1>; // alternate spelling of mask 13 (primary: "nh")
+def CondVariantNO : CondVariant<14, "no", 0>;
+
+// A helper class to look up one of the above by name.
+class CV<string name>
+ : CondVariant<!cast<CondVariant>("CondVariant"#name).ccmask, // resolves the def named "CondVariant"#name and copies its three fields
+ !cast<CondVariant>("CondVariant"#name).suffix,
+ !cast<CondVariant>("CondVariant"#name).alternate>;
+
+// Condition masks for integer instructions (e.g. compare-and-branch).
+// This is like the list above, except that condition 3 is not possible
+// and that the low bit of the mask is therefore always 0. This means
+// that each condition has two names. Conditions "o" and "no" are not used.
+def IntCondVariantH : CondVariant<2, "h", 0>;
+def IntCondVariantNLE : CondVariant<2, "nle", 1>; // alternate spelling of mask 2 (primary: "h")
+def IntCondVariantL : CondVariant<4, "l", 0>;
+def IntCondVariantNHE : CondVariant<4, "nhe", 1>; // alternate spelling of mask 4 (primary: "l")
+def IntCondVariantLH : CondVariant<6, "lh", 0>;
+def IntCondVariantNE : CondVariant<6, "ne", 1>; // alternate spelling of mask 6 (primary: "lh")
+def IntCondVariantE : CondVariant<8, "e", 0>;
+def IntCondVariantNLH : CondVariant<8, "nlh", 1>; // alternate spelling of mask 8 (primary: "e")
+def IntCondVariantHE : CondVariant<10, "he", 0>;
+def IntCondVariantNL : CondVariant<10, "nl", 1>; // alternate spelling of mask 10 (primary: "he")
+def IntCondVariantLE : CondVariant<12, "le", 0>;
+def IntCondVariantNH : CondVariant<12, "nh", 1>; // alternate spelling of mask 12 (primary: "le")
+
+// A helper class to look up one of the above by name.
+class ICV<string name>
+ : CondVariant<!cast<CondVariant>("IntCondVariant"#name).ccmask, // resolves the def named "IntCondVariant"#name and copies its three fields
+ !cast<CondVariant>("IntCondVariant"#name).suffix,
+ !cast<CondVariant>("IntCondVariant"#name).alternate>;
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions with semantics
+//===----------------------------------------------------------------------===//
+//
+// These classes have the form [Cond]<Category><Format>, where <Format> is one
+// of the formats defined above and where <Category> describes the inputs
+// and outputs. "Cond" is used if the instruction is conditional,
+// in which case the 4-bit condition-code mask is added as a final operand.
+// <Category> can be one of:
+//
+// Inherent:
+// One register output operand and no input operands.
+//
+// StoreInherent:
+// One address operand. The instruction stores to the address.
+//
+// SideEffectInherent:
+// No input or output operands, but causes some side effect.
+//
+// Branch:
+// One branch target. The instruction branches to the target.
+//
+// Call:
+// One output operand and one branch target. The instruction stores
+// the return address to the output operand and branches to the target.
+//
+// CmpBranch:
+// Two input operands and one optional branch target. The instruction
+// compares the two input operands and branches or traps on the result.
+//
+// BranchUnary:
+// One register output operand, one register input operand and one branch
+// target. The instruction stores a modified form of the source register
+// in the destination register and branches on the result.
+//
+// BranchBinary:
+// One register output operand, two register input operands and one branch
+// target. The instruction stores a modified form of one of the source
+// registers in the destination register and branches on the result.
+//
+// LoadMultiple:
+// One address input operand and two explicit output operands.
+// The instruction loads a range of registers from the address,
+// with the explicit operands giving the first and last register
+// to load. Other loaded registers are added as implicit definitions.
+//
+// StoreMultiple:
+// Two explicit input register operands and an address operand.
+// The instruction stores a range of registers to the address,
+// with the explicit operands giving the first and last register
+// to store. Other stored registers are added as implicit uses.
+//
+// StoreLength:
+// One value operand, one length operand and one address operand.
+// The instruction stores the value operand to the address but
+// doesn't write more than the number of bytes specified by the
+// length operand.
+//
+// LoadAddress:
+// One register output operand and one address operand.
+//
+// SideEffectAddress:
+// One address operand. No output operands, but causes some side effect.
+//
+// Unary:
+// One register output operand and one input operand.
+//
+// Store:
+// One address operand and one other input operand. The instruction
+// stores to the address.
+//
+// SideEffectUnary:
+// One input operand. No output operands, but causes some side effect.
+//
+// Binary:
+// One register output operand and two input operands.
+//
+// StoreBinary:
+// One address operand and two other input operands. The instruction
+// stores to the address.
+//
+// SideEffectBinary:
+// Two input operands. No output operands, but causes some side effect.
+//
+// Compare:
+// Two input operands and an implicit CC output operand.
+//
+// Test:
+// Two input operands and an implicit CC output operand. The second
+// input operand is an "address" operand used as a test class mask.
+//
+// Ternary:
+// One register output operand and three input operands.
+//
+// SideEffectTernary:
+// Three input operands. No output operands, but causes some side effect.
+//
+// Quaternary:
+// One register output operand and four input operands.
+//
+// LoadAndOp:
+// One output operand and two input operands, one of which is an address.
+// The instruction both reads from and writes to the address.
+//
+// CmpSwap:
+// One output operand and three input operands, one of which is an address.
+// The instruction both reads from and writes to the address.
+//
+// RotateSelect:
+// One output operand and five input operands. The first two operands
+// are registers and the other three are immediates.
+//
+// Prefetch:
+// One 4-bit immediate operand and one address operand. The immediate
+// operand is 1 for a load prefetch and 2 for a store prefetch.
+//
+// BranchPreload:
+// One 4-bit immediate operand and two address operands.
+//
+// The format determines which input operands are tied to output operands,
+// and also determines the shape of any address operand.
+//
+// Multiclasses of the form <Category><Format>Pair define two instructions,
+// one with <Category><Format> and one with <Category><Format>Y. The name
+// of the first instruction has no suffix, the name of the second has
+// an extra "y".
+//
+//===----------------------------------------------------------------------===//
+
+class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ SDPatternOperator operator>
+ : InstRRE<opcode, (outs cls:$R1), (ins), // one register output, no inputs
+ mnemonic#"\t$R1",
+ [(set cls:$R1, (operator))]> { // selected for `operator` applied to no arguments
+ let R2 = 0; // no second operand, so the R2 field of the base format is pinned to 0
+}
+
+class InherentVRIa<string mnemonic, bits<16> opcode, bits<16> value>
+ : InstVRIa<opcode, (outs VR128:$V1), (ins), mnemonic#"\t$V1", []> { // one VR128 output, no inputs, no selection pattern
+ let I2 = value; // the immediate field is fixed by the template argument rather than an operand
+ let M3 = 0; // mask field pinned to 0
+}
+
+class StoreInherentS<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, bits<5> bytes>
+ : InstS<opcode, (outs), (ins bdaddr12only:$BD2), // sole operand is the address
+ mnemonic#"\t$BD2", [(operator bdaddr12only:$BD2)]> {
+ let mayStore = 1;
+ let AccessBytes = bytes; // AccessBytes is declared outside this view; set from the `bytes` argument
+}
+
+class SideEffectInherentE<string mnemonic, bits<16>opcode>
+ : InstE<opcode, (outs), (ins), mnemonic, []>; // no operands and no selection pattern; asm string is the bare mnemonic
+
+class SideEffectInherentS<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator>
+ : InstS<opcode, (outs), (ins), mnemonic, [(operator)]> { // no operands; selected for `operator` applied to no arguments
+ let BD2 = 0; // InstS always encodes BD2, so it is pinned to 0 here
+}
+
+// Allow an optional TLS marker symbol to generate TLS call relocations.
+class CallRI<string mnemonic, bits<12> opcode>
+ : InstRIb<opcode, (outs), (ins GR64:$R1, brtarget16tls:$RI2), // note: $R1 is declared in `ins`, and `outs` is empty
+ mnemonic#"\t$R1, $RI2", []>;
+
+// Allow an optional TLS marker symbol to generate TLS call relocations.
+class CallRIL<string mnemonic, bits<12> opcode>
+ : InstRILb<opcode, (outs), (ins GR64:$R1, brtarget32tls:$RI2), // same shape as CallRI, with the 32-bit target operand class
+ mnemonic#"\t$R1, $RI2", []>;
+
+class CallRR<string mnemonic, bits<8> opcode>
+ : InstRR<opcode, (outs), (ins GR64:$R1, ADDR64:$R2), // both registers are declared as inputs; no selection pattern
+ mnemonic#"\t$R1, $R2", []>;
+
+class CallRX<string mnemonic, bits<8> opcode>
+ : InstRXa<opcode, (outs), (ins GR64:$R1, bdxaddr12only:$XBD2), // register plus address operand; no selection pattern
+ mnemonic#"\t$R1, $XBD2", []>;
+
+class CondBranchRI<string mnemonic, bits<12> opcode,
+ SDPatternOperator operator = null_frag>
+ : InstRIc<opcode, (outs), (ins cond4:$valid, cond4:$M1, brtarget16:$RI2), // code-generation form: mask carried as operands
+ !subst("#", "${M1}", mnemonic)#"\t$RI2", // "#" in the mnemonic is replaced by the printed $M1 operand
+ [(operator cond4:$valid, cond4:$M1, bb:$RI2)]> {
+ let CCMaskFirst = 1; // flag declared outside this view; presumably marks that the mask operands come first - confirm
+}
+
+class AsmCondBranchRI<string mnemonic, bits<12> opcode>
+ : InstRIc<opcode, (outs), (ins imm32zx4:$M1, brtarget16:$RI2), // base form: mask written as an explicit integer operand
+ mnemonic#"\t$M1, $RI2", []>;
+
+class FixedCondBranchRI<CondVariant V, string mnemonic, bits<12> opcode,
+ SDPatternOperator operator = null_frag>
+ : InstRIc<opcode, (outs), (ins brtarget16:$RI2), // extended-mnemonic form: no mask operand at all
+ !subst("#", V.suffix, mnemonic)#"\t$RI2", [(operator bb:$RI2)]> { // "#" in the mnemonic is replaced by the variant's suffix
+ let isAsmParserOnly = V.alternate; // alternate spellings are parse-only
+ let M1 = V.ccmask; // mask is fixed by the variant
+}
+
+class CondBranchRIL<string mnemonic, bits<12> opcode>
+ : InstRILc<opcode, (outs), (ins cond4:$valid, cond4:$M1, brtarget32:$RI2), // code-generation form: mask carried as operands
+ !subst("#", "${M1}", mnemonic)#"\t$RI2", []> { // "#" in the mnemonic is replaced by the printed $M1 operand
+ let CCMaskFirst = 1; // flag declared outside this view; presumably marks that the mask operands come first - confirm
+}
+
+class AsmCondBranchRIL<string mnemonic, bits<12> opcode>
+ : InstRILc<opcode, (outs), (ins imm32zx4:$M1, brtarget32:$RI2), // base form: mask written as an explicit integer operand
+ mnemonic#"\t$M1, $RI2", []>;
+
+class FixedCondBranchRIL<CondVariant V, string mnemonic, bits<12> opcode>
+ : InstRILc<opcode, (outs), (ins brtarget32:$RI2), // extended-mnemonic form: no mask operand at all
+ !subst("#", V.suffix, mnemonic)#"\t$RI2", []> { // "#" in the mnemonic is replaced by the variant's suffix
+ let isAsmParserOnly = V.alternate; // alternate spellings are parse-only
+ let M1 = V.ccmask; // mask is fixed by the variant
+}
+
+class CondBranchRR<string mnemonic, bits<8> opcode>
+ : InstRR<opcode, (outs), (ins cond4:$valid, cond4:$R1, GR64:$R2), // here the mask operand is bound to the format's R1 field
+ !subst("#", "${R1}", mnemonic)#"\t$R2", []> { // "#" in the mnemonic is replaced by the printed $R1 operand
+ let CCMaskFirst = 1; // flag declared outside this view; presumably marks that the mask operands come first - confirm
+}
+
+class AsmCondBranchRR<string mnemonic, bits<8> opcode>
+ : InstRR<opcode, (outs), (ins imm32zx4:$R1, GR64:$R2), // base form: mask written as an explicit integer in the R1 field
+ mnemonic#"\t$R1, $R2", []>;
+
+class FixedCondBranchRR<CondVariant V, string mnemonic, bits<8> opcode,
+ SDPatternOperator operator = null_frag>
+ : InstRR<opcode, (outs), (ins ADDR64:$R2), // extended-mnemonic form: only the target register is an operand
+ !subst("#", V.suffix, mnemonic)#"\t$R2", [(operator ADDR64:$R2)]> { // "#" in the mnemonic is replaced by the variant's suffix
+ let isAsmParserOnly = V.alternate; // alternate spellings are parse-only
+ let R1 = V.ccmask; // mask is fixed by the variant and stored in the R1 field
+}
+
+class CondBranchRX<string mnemonic, bits<8> opcode>
+ : InstRXb<opcode, (outs), (ins cond4:$valid, cond4:$M1, bdxaddr12only:$XBD2), // code-generation form: mask carried as operands
+ !subst("#", "${M1}", mnemonic)#"\t$XBD2", []> { // "#" in the mnemonic is replaced by the printed $M1 operand
+ let CCMaskFirst = 1; // flag declared outside this view; presumably marks that the mask operands come first - confirm
+}
+
+class AsmCondBranchRX<string mnemonic, bits<8> opcode>
+ : InstRXb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr12only:$XBD2), // base form: mask written as an explicit integer operand
+ mnemonic#"\t$M1, $XBD2", []>;
+
+class FixedCondBranchRX<CondVariant V, string mnemonic, bits<8> opcode>
+ : InstRXb<opcode, (outs), (ins bdxaddr12only:$XBD2), // extended-mnemonic form: no mask operand at all
+ !subst("#", V.suffix, mnemonic)#"\t$XBD2", []> { // "#" in the mnemonic is replaced by the variant's suffix
+ let isAsmParserOnly = V.alternate; // alternate spellings are parse-only
+ let M1 = V.ccmask; // mask is fixed by the variant
+}
+
+// InstRIEa-form instruction taking register $R1, immediate $I2 and a cond4
+// mask $M3. The mask is printed directly after the mnemonic (no separator),
+// followed by "$R1, $I2". No outputs and no selection pattern.
+class CmpBranchRIEa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm>
+ : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, cond4:$M3),
+ mnemonic#"$M3\t$R1, $I2", []>;
+
+// Raw-assembly counterpart of CmpBranchRIEa: the mask is an explicit
+// imm32zx4 operand $M3 printed as the last operand instead of being appended
+// to the mnemonic. No selection pattern.
+class AsmCmpBranchRIEa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm>
+ : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $I2, $M3", []>;
+
+// InstRIEa-form variant with the mask fixed by CondVariant V: V.suffix is
+// appended to the mnemonic and M3 is set to V.ccmask, leaving only $R1 and
+// $I2 as operands. No selection pattern; isAsmParserOnly is taken from
+// V.alternate.
+class FixedCmpBranchRIEa<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm>
+ : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2),
+ mnemonic#V.suffix#"\t$R1, $I2", []> {
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CmpBranchRIEa (mask printed in the mnemonic) and is marked isCodeGenOnly;
+// the "Asm"-suffixed one uses AsmCmpBranchRIEa (mask as explicit operand).
+multiclass CmpBranchRIEaPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm> {
+ let isCodeGenOnly = 1 in
+ def "" : CmpBranchRIEa<mnemonic, opcode, cls, imm>;
+ def Asm : AsmCmpBranchRIEa<mnemonic, opcode, cls, imm>;
+}
+
+// InstRIEb-form instruction taking two registers ($R1, $R2) of class cls,
+// a cond4 mask $M3 and a brtarget16 target $RI4. The mask is printed
+// directly after the mnemonic. No outputs and no selection pattern.
+class CmpBranchRIEb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRIEb<opcode, (outs),
+ (ins cls:$R1, cls:$R2, cond4:$M3, brtarget16:$RI4),
+ mnemonic#"$M3\t$R1, $R2, $RI4", []>;
+
+// Raw-assembly counterpart of CmpBranchRIEb: the mask is an explicit
+// imm32zx4 operand $M3 printed between the registers and the target.
+// No selection pattern.
+class AsmCmpBranchRIEb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRIEb<opcode, (outs),
+ (ins cls:$R1, cls:$R2, imm32zx4:$M3, brtarget16:$RI4),
+ mnemonic#"\t$R1, $R2, $M3, $RI4", []>;
+
+// InstRIEb-form variant with the mask fixed by CondVariant V: V.suffix is
+// appended to the mnemonic and M3 is set to V.ccmask, leaving $R1, $R2 and
+// the brtarget16 target $RI4 as operands. No selection pattern;
+// isAsmParserOnly is taken from V.alternate.
+class FixedCmpBranchRIEb<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRIEb<opcode, (outs), (ins cls:$R1, cls:$R2, brtarget16:$RI4),
+ mnemonic#V.suffix#"\t$R1, $R2, $RI4", []> {
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CmpBranchRIEb and is marked isCodeGenOnly; the "Asm"-suffixed one uses
+// AsmCmpBranchRIEb (mask as explicit operand).
+multiclass CmpBranchRIEbPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls> {
+ let isCodeGenOnly = 1 in
+ def "" : CmpBranchRIEb<mnemonic, opcode, cls>;
+ def Asm : AsmCmpBranchRIEb<mnemonic, opcode, cls>;
+}
+
+// InstRIEc-form instruction taking register $R1, immediate $I2, a cond4
+// mask $M3 and a brtarget16 target $RI4. The mask is printed directly after
+// the mnemonic. No outputs and no selection pattern.
+class CmpBranchRIEc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm>
+ : InstRIEc<opcode, (outs),
+ (ins cls:$R1, imm:$I2, cond4:$M3, brtarget16:$RI4),
+ mnemonic#"$M3\t$R1, $I2, $RI4", []>;
+
+// Raw-assembly counterpart of CmpBranchRIEc: the mask is an explicit
+// imm32zx4 operand $M3 printed between the immediate and the target.
+// No selection pattern.
+class AsmCmpBranchRIEc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm>
+ : InstRIEc<opcode, (outs),
+ (ins cls:$R1, imm:$I2, imm32zx4:$M3, brtarget16:$RI4),
+ mnemonic#"\t$R1, $I2, $M3, $RI4", []>;
+
+// InstRIEc-form variant with the mask fixed by CondVariant V: V.suffix is
+// appended to the mnemonic and M3 is set to V.ccmask, leaving $R1, $I2 and
+// the brtarget16 target $RI4 as operands. No selection pattern;
+// isAsmParserOnly is taken from V.alternate.
+class FixedCmpBranchRIEc<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm>
+ : InstRIEc<opcode, (outs), (ins cls:$R1, imm:$I2, brtarget16:$RI4),
+ mnemonic#V.suffix#"\t$R1, $I2, $RI4", []> {
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CmpBranchRIEc and is marked isCodeGenOnly; the "Asm"-suffixed one uses
+// AsmCmpBranchRIEc (mask as explicit operand).
+multiclass CmpBranchRIEcPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm> {
+ let isCodeGenOnly = 1 in
+ def "" : CmpBranchRIEc<mnemonic, opcode, cls, imm>;
+ def Asm : AsmCmpBranchRIEc<mnemonic, opcode, cls, imm>;
+}
+
+// InstRRFc-form instruction taking two registers ($R1, $R2) of class cls and
+// a cond4 mask $M3 that is printed directly after the mnemonic. No outputs
+// and no selection pattern.
+class CmpBranchRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRRFc<opcode, (outs), (ins cls:$R1, cls:$R2, cond4:$M3),
+ mnemonic#"$M3\t$R1, $R2", []>;
+
+// Raw-assembly counterpart of CmpBranchRRFc: the mask is an explicit
+// imm32zx4 operand $M3 printed as the last operand. No selection pattern.
+class AsmCmpBranchRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRRFc<opcode, (outs), (ins cls:$R1, cls:$R2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []>;
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CmpBranchRRFc and is marked isCodeGenOnly; the "Asm"-suffixed one uses
+// AsmCmpBranchRRFc (mask as explicit operand).
+multiclass CmpBranchRRFcPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls> {
+ let isCodeGenOnly = 1 in
+ def "" : CmpBranchRRFc<mnemonic, opcode, cls>;
+ def Asm : AsmCmpBranchRRFc<mnemonic, opcode, cls>;
+}
+
+// InstRRFc-form variant with the mask fixed by CondVariant V: V.suffix is
+// appended to the mnemonic and M3 is set to V.ccmask, leaving $R1 and $R2 as
+// operands. No selection pattern; isAsmParserOnly is taken from V.alternate.
+class FixedCmpBranchRRFc<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRRFc<opcode, (outs), (ins cls:$R1, cls:$R2),
+ mnemonic#V.suffix#"\t$R1, $R2", []> {
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// InstRRS-form instruction taking two registers ($R1, $R2) of class cls, a
+// cond4 mask $M3 and a bdaddr12only address $BD4. The mask is printed
+// directly after the mnemonic. No outputs and no selection pattern.
+class CmpBranchRRS<string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRRS<opcode, (outs),
+ (ins cls:$R1, cls:$R2, cond4:$M3, bdaddr12only:$BD4),
+ mnemonic#"$M3\t$R1, $R2, $BD4", []>;
+
+// Raw-assembly counterpart of CmpBranchRRS: the mask is an explicit
+// imm32zx4 operand $M3 printed between the registers and the address.
+// No selection pattern.
+class AsmCmpBranchRRS<string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRRS<opcode, (outs),
+ (ins cls:$R1, cls:$R2, imm32zx4:$M3, bdaddr12only:$BD4),
+ mnemonic#"\t$R1, $R2, $M3, $BD4", []>;
+
+// InstRRS-form variant with the mask fixed by CondVariant V: V.suffix is
+// appended to the mnemonic and M3 is set to V.ccmask, leaving $R1, $R2 and
+// the bdaddr12only address $BD4 as operands. No selection pattern;
+// isAsmParserOnly is taken from V.alternate.
+class FixedCmpBranchRRS<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRRS<opcode, (outs), (ins cls:$R1, cls:$R2, bdaddr12only:$BD4),
+ mnemonic#V.suffix#"\t$R1, $R2, $BD4", []> {
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CmpBranchRRS and is marked isCodeGenOnly; the "Asm"-suffixed one uses
+// AsmCmpBranchRRS (mask as explicit operand).
+multiclass CmpBranchRRSPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls> {
+ let isCodeGenOnly = 1 in
+ def "" : CmpBranchRRS<mnemonic, opcode, cls>;
+ def Asm : AsmCmpBranchRRS<mnemonic, opcode, cls>;
+}
+
+// InstRIS-form instruction taking register $R1, immediate $I2, a cond4 mask
+// $M3 and a bdaddr12only address $BD4. The mask is printed directly after
+// the mnemonic. No outputs and no selection pattern.
+class CmpBranchRIS<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm>
+ : InstRIS<opcode, (outs),
+ (ins cls:$R1, imm:$I2, cond4:$M3, bdaddr12only:$BD4),
+ mnemonic#"$M3\t$R1, $I2, $BD4", []>;
+
+// Raw-assembly counterpart of CmpBranchRIS: the mask is an explicit
+// imm32zx4 operand $M3 printed between the immediate and the address.
+// No selection pattern.
+class AsmCmpBranchRIS<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm>
+ : InstRIS<opcode, (outs),
+ (ins cls:$R1, imm:$I2, imm32zx4:$M3, bdaddr12only:$BD4),
+ mnemonic#"\t$R1, $I2, $M3, $BD4", []>;
+
+// InstRIS-form variant with the mask fixed by CondVariant V: V.suffix is
+// appended to the mnemonic and M3 is set to V.ccmask, leaving $R1, $I2 and
+// the bdaddr12only address $BD4 as operands. No selection pattern;
+// isAsmParserOnly is taken from V.alternate.
+class FixedCmpBranchRIS<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm>
+ : InstRIS<opcode, (outs), (ins cls:$R1, imm:$I2, bdaddr12only:$BD4),
+ mnemonic#V.suffix#"\t$R1, $I2, $BD4", []> {
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CmpBranchRIS and is marked isCodeGenOnly; the "Asm"-suffixed one uses
+// AsmCmpBranchRIS (mask as explicit operand).
+multiclass CmpBranchRISPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, Immediate imm> {
+ let isCodeGenOnly = 1 in
+ def "" : CmpBranchRIS<mnemonic, opcode, cls, imm>;
+ def Asm : AsmCmpBranchRIS<mnemonic, opcode, cls, imm>;
+}
+
+// InstRSYb-form instruction taking register $R1, a bdaddr20only address
+// $BD2 and a cond4 mask $M3 that is printed directly after the mnemonic.
+// No outputs and no selection pattern.
+class CmpBranchRSYb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, bdaddr20only:$BD2, cond4:$M3),
+ mnemonic#"$M3\t$R1, $BD2", []>;
+
+// Raw-assembly counterpart of CmpBranchRSYb with the mask as an explicit
+// imm32zx4 operand $M3. Note that the assembly string prints $M3 *before*
+// $BD2 even though the ins list declares $BD2 first. No selection pattern.
+class AsmCmpBranchRSYb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, bdaddr20only:$BD2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $M3, $BD2", []>;
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CmpBranchRSYb and is marked isCodeGenOnly; the "Asm"-suffixed one uses
+// AsmCmpBranchRSYb (mask as explicit operand).
+multiclass CmpBranchRSYbPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls> {
+ let isCodeGenOnly = 1 in
+ def "" : CmpBranchRSYb<mnemonic, opcode, cls>;
+ def Asm : AsmCmpBranchRSYb<mnemonic, opcode, cls>;
+}
+
+// InstRSYb-form variant with the mask fixed by CondVariant V: V.suffix is
+// appended to the mnemonic and M3 is set to V.ccmask, leaving $R1 and the
+// bdaddr20only address $BD2 as operands. No selection pattern;
+// isAsmParserOnly is taken from V.alternate.
+class FixedCmpBranchRSYb<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, bdaddr20only:$BD2),
+ mnemonic#V.suffix#"\t$R1, $BD2", []> {
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// InstRIb-form branch taking register $R1 and a brtarget16 target $RI2.
+// $R1 is both read and written: the input copy $R1src is tied to the output
+// $R1 through Constraints and listed in DisableEncoding so only $R1 is
+// encoded. No selection pattern.
+class BranchUnaryRI<string mnemonic, bits<12> opcode, RegisterOperand cls>
+ : InstRIb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget16:$RI2),
+ mnemonic##"\t$R1, $RI2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRILb-form branch taking register $R1 and a brtarget32 target $RI2.
+// $R1 is both read and written: $R1src is tied to $R1 through Constraints
+// and listed in DisableEncoding. No selection pattern.
+class BranchUnaryRIL<string mnemonic, bits<12> opcode, RegisterOperand cls>
+ : InstRILb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget32:$RI2),
+ mnemonic##"\t$R1, $RI2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRR-form branch taking register $R1 and a GR64 register $R2.
+// $R1 is both read and written: $R1src is tied to $R1 through Constraints
+// and listed in DisableEncoding. No selection pattern.
+class BranchUnaryRR<string mnemonic, bits<8> opcode, RegisterOperand cls>
+ : InstRR<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
+ mnemonic##"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRRE-form branch taking register $R1 and a GR64 register $R2.
+// $R1 is both read and written: $R1src is tied to $R1 through Constraints
+// and listed in DisableEncoding. No selection pattern.
+class BranchUnaryRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
+ mnemonic##"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRXa-form branch taking register $R1 and a bdxaddr12only address
+// $XBD2. $R1 is both read and written: $R1src is tied to $R1 through
+// Constraints and listed in DisableEncoding. No selection pattern.
+class BranchUnaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls>
+ : InstRXa<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr12only:$XBD2),
+ mnemonic##"\t$R1, $XBD2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRXYa-form branch taking register $R1 and a bdxaddr20only address
+// $XBD2. $R1 is both read and written: $R1src is tied to $R1 through
+// Constraints and listed in DisableEncoding. No selection pattern.
+class BranchUnaryRXY<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRXYa<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr20only:$XBD2),
+ mnemonic##"\t$R1, $XBD2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRSI-form branch taking registers $R1 and $R3 plus a brtarget16 target
+// $RI2. $R1 is both read and written: $R1src is tied to $R1 through
+// Constraints and listed in DisableEncoding. No selection pattern.
+class BranchBinaryRSI<string mnemonic, bits<8> opcode, RegisterOperand cls>
+ : InstRSI<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, brtarget16:$RI2),
+ mnemonic##"\t$R1, $R3, $RI2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRIEe-form branch taking registers $R1 and $R3 plus a brtarget16
+// target $RI2. $R1 is both read and written: $R1src is tied to $R1 through
+// Constraints and listed in DisableEncoding. No selection pattern.
+class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRIEe<opcode, (outs cls:$R1),
+ (ins cls:$R1src, cls:$R3, brtarget16:$RI2),
+ mnemonic##"\t$R1, $R3, $RI2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRSa-form branch taking registers $R1 and $R3 plus a bdaddr12only
+// address $BD2. $R1 is both read and written: $R1src is tied to $R1 through
+// Constraints and listed in DisableEncoding. No selection pattern.
+class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls>
+ : InstRSa<opcode, (outs cls:$R1),
+ (ins cls:$R1src, cls:$R3, bdaddr12only:$BD2),
+ mnemonic##"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRSYa-form branch taking registers $R1 and $R3 plus a bdaddr20only
+// address $BD2. $R1 is both read and written: $R1src is tied to $R1 through
+// Constraints and listed in DisableEncoding. No selection pattern.
+class BranchBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRSYa<opcode,
+ (outs cls:$R1), (ins cls:$R1src, cls:$R3, bdaddr20only:$BD2),
+ mnemonic##"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRSa-form load with two register outputs ($R1, $R3) of class cls and a
+// single address input $BD2 whose addressing mode is a template parameter
+// (default bdaddr12only). Marked mayLoad; no selection pattern.
+class LoadMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ AddressingMode mode = bdaddr12only>
+ : InstRSa<opcode, (outs cls:$R1, cls:$R3), (ins mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let mayLoad = 1;
+}
+
+// InstRSYa-form load with two register outputs ($R1, $R3) of class cls and
+// a single address input $BD2 whose addressing mode is a template parameter
+// (default bdaddr20only). Marked mayLoad; no selection pattern.
+class LoadMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ AddressingMode mode = bdaddr20only>
+ : InstRSYa<opcode, (outs cls:$R1, cls:$R3), (ins mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let mayLoad = 1;
+}
+
+// Defines the two displacement variants of a load-multiple instruction.
+// Both share one DispKey (built from mnemonic and cls) and differ in
+// DispSize: the unsuffixed def is LoadMultipleRS with bdaddr12pair
+// (DispSize "12"), the "Y" def is LoadMultipleRSY with bdaddr20pair and the
+// mnemonic extended by "y" (DispSize "20").
+multiclass LoadMultipleRSPair<string mnemonic, bits<8> rsOpcode,
+ bits<16> rsyOpcode, RegisterOperand cls> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : LoadMultipleRS<mnemonic, rsOpcode, cls, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : LoadMultipleRSY<mnemonic#"y", rsyOpcode, cls, bdaddr20pair>;
+ }
+}
+
+// InstVRSa-form load with two VR128 outputs ($V1, $V3) and a bdaddr12only
+// address input $BD2. The M4 field is fixed to 0. Marked mayLoad; no
+// selection pattern.
+class LoadMultipleVRSa<string mnemonic, bits<16> opcode>
+ : InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2),
+ mnemonic#"\t$V1, $V3, $BD2", []> {
+ let M4 = 0;
+ let mayLoad = 1;
+}
+
+// InstRILb-form store of register $R1 to the pcrel32 address $RI2, selected
+// by the pattern (operator cls:$R1, pcrel32:$RI2). Marked mayStore.
+class StoreRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls>
+ : InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2),
+ mnemonic#"\t$R1, $RI2",
+ [(operator cls:$R1, pcrel32:$RI2)]> {
+ let mayStore = 1;
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
+// InstRXa-form store of register $R1 to address $XBD2 (addressing mode is a
+// template parameter, default bdxaddr12only), selected by the pattern
+// (operator cls:$R1, mode:$XBD2). `bytes` is recorded as AccessBytes.
+class StoreRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdxaddr12only>
+ : InstRXa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(operator cls:$R1, mode:$XBD2)]> {
+ // Key is mnemonic + "r" + register class; this record is the "mem" form.
+ let OpKey = mnemonic#"r"#cls;
+ let OpType = "mem";
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+// InstRXYa-form store of register $R1 to address $XBD2 (addressing mode is
+// a template parameter, default bdxaddr20only), selected by the pattern
+// (operator cls:$R1, mode:$XBD2). `bytes` is recorded as AccessBytes.
+class StoreRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdxaddr20only>
+ : InstRXYa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(operator cls:$R1, mode:$XBD2)]> {
+ // Key is mnemonic + "r" + register class; this record is the "mem" form.
+ let OpKey = mnemonic#"r"#cls;
+ let OpType = "mem";
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+// Defines the two displacement variants of a store. Both share one DispKey
+// (built from mnemonic and cls) and differ in DispSize: the unsuffixed def
+// is StoreRX with bdxaddr12pair (DispSize "12"), the "Y" def is StoreRXY
+// with bdxaddr20pair and the mnemonic extended by "y" (DispSize "20").
+multiclass StoreRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+ SDPatternOperator operator, RegisterOperand cls,
+ bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : StoreRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
+ let DispSize = "20" in
+ def Y : StoreRXY<mnemonic#"y", rxyOpcode, operator, cls, bytes,
+ bdxaddr20pair>;
+ }
+}
+
+// InstVRX-form vector store: writes the typed vector register $V1 (tr.op,
+// value type tr.vt) to the bdxaddr12only address $XBD2. `type` is placed in
+// the M3 field and `bytes` is recorded as AccessBytes. Marked mayStore.
+//
+// The selection pattern has the store shape (operator value, address), the
+// same shape StoreRX and StoreLengthVRSb use. The previous pattern was the
+// load shape "(set $V1, (operator address))", which tried to define $V1 even
+// though this instruction has no outputs and $V1 is an input. Instantiations
+// that pass null_frag are unaffected because their pattern is discarded.
+class StoreVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ TypedReg tr, bits<5> bytes, bits<4> type = 0>
+ : InstVRX<opcode, (outs), (ins tr.op:$V1, bdxaddr12only:$XBD2),
+ mnemonic#"\t$V1, $XBD2",
+ [(operator (tr.vt tr.op:$V1), bdxaddr12only:$XBD2)]> {
+ let M3 = type;
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+// InstVRSb-form store taking a VR128 register $V1, a GR32 register $R3 and
+// a bdaddr12only address $BD2, selected by the pattern
+// (operator VR128:$V1, GR32:$R3, bdaddr12only:$BD2). The M4 field is fixed
+// to 0 and `bytes` is recorded as AccessBytes. Marked mayStore.
+class StoreLengthVRSb<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, bits<5> bytes>
+ : InstVRSb<opcode, (outs), (ins VR128:$V1, GR32:$R3, bdaddr12only:$BD2),
+ mnemonic#"\t$V1, $R3, $BD2",
+ [(operator VR128:$V1, GR32:$R3, bdaddr12only:$BD2)]> {
+ let M4 = 0;
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+// InstRSa-form store taking two registers ($R1, $R3) of class cls and an
+// address $BD2 whose addressing mode is a template parameter (default
+// bdaddr12only). Marked mayStore; no outputs and no selection pattern.
+class StoreMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ AddressingMode mode = bdaddr12only>
+ : InstRSa<opcode, (outs), (ins cls:$R1, cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let mayStore = 1;
+}
+
+// InstRSYa-form store taking two registers ($R1, $R3) of class cls and an
+// address $BD2 whose addressing mode is a template parameter (default
+// bdaddr20only). Marked mayStore; no outputs and no selection pattern.
+class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ AddressingMode mode = bdaddr20only>
+ : InstRSYa<opcode, (outs), (ins cls:$R1, cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let mayStore = 1;
+}
+
+// Defines the two displacement variants of a store-multiple instruction.
+// Both share one DispKey (built from mnemonic and cls) and differ in
+// DispSize: the unsuffixed def is StoreMultipleRS with bdaddr12pair
+// (DispSize "12"), the "Y" def is StoreMultipleRSY with bdaddr20pair and
+// the mnemonic extended by "y" (DispSize "20").
+multiclass StoreMultipleRSPair<string mnemonic, bits<8> rsOpcode,
+ bits<16> rsyOpcode, RegisterOperand cls> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : StoreMultipleRS<mnemonic, rsOpcode, cls, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : StoreMultipleRSY<mnemonic#"y", rsyOpcode, cls, bdaddr20pair>;
+ }
+}
+
+// InstVRSa-form store taking two VR128 registers ($V1, $V3) and a
+// bdaddr12only address $BD2. The M4 field is fixed to 0. Marked mayStore;
+// no outputs and no selection pattern.
+class StoreMultipleVRSa<string mnemonic, bits<16> opcode>
+ : InstVRSa<opcode, (outs), (ins VR128:$V1, VR128:$V3, bdaddr12only:$BD2),
+ mnemonic#"\t$V1, $V3, $BD2", []> {
+ let M4 = 0;
+ let mayStore = 1;
+}
+
+// StoreSI* instructions are used to store an integer to memory, but the
+// addresses are more restricted than for normal stores. If we are in the
+// situation of having to force either the address into a register or the
+// constant into a register, it's usually better to do the latter.
+// We therefore match the address in the same way as a normal store and
+// only use the StoreSI* instruction if the matched address is suitable.
+//
+// StoreSI is the InstSI form: address operand $BD1 (mviaddr12pair) followed
+// by the immediate $I2. The selection pattern lists the operands in the
+// opposite order, (operator imm:$I2, mviaddr12pair:$BD1). Marked mayStore.
+class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ Immediate imm>
+ : InstSI<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(operator imm:$I2, mviaddr12pair:$BD1)]> {
+ let mayStore = 1;
+}
+
+// InstSIY-form immediate store: address operand $BD1 (mviaddr20pair)
+// followed by the immediate $I2, selected by the pattern
+// (operator imm:$I2, mviaddr20pair:$BD1). Marked mayStore.
+class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ Immediate imm>
+ : InstSIY<opcode, (outs), (ins mviaddr20pair:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(operator imm:$I2, mviaddr20pair:$BD1)]> {
+ let mayStore = 1;
+}
+
+// InstSIL-form immediate store: address operand $BD1 (mviaddr12pair)
+// followed by the immediate $I2, selected by the pattern
+// (operator imm:$I2, mviaddr12pair:$BD1). Marked mayStore.
+class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ Immediate imm>
+ : InstSIL<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(operator imm:$I2, mviaddr12pair:$BD1)]> {
+ let mayStore = 1;
+}
+
+// Defines the two displacement variants of an immediate store. Both share
+// DispKey = mnemonic and differ in DispSize: the unsuffixed def is StoreSI
+// (DispSize "12"), the "Y" def is StoreSIY with the mnemonic extended by
+// "y" (DispSize "20").
+multiclass StoreSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
+ SDPatternOperator operator, Immediate imm> {
+ let DispKey = mnemonic in {
+ let DispSize = "12" in
+ def "" : StoreSI<mnemonic, siOpcode, operator, imm>;
+ let DispSize = "20" in
+ def Y : StoreSIY<mnemonic#"y", siyOpcode, operator, imm>;
+ }
+}
+
+// InstSSE-form instruction taking two bdaddr12only address operands ($BD1,
+// $BD2). Marked mayStore; no outputs and no selection pattern.
+class StoreSSE<string mnemonic, bits<16> opcode>
+ : InstSSE<opcode, (outs), (ins bdaddr12only:$BD1, bdaddr12only:$BD2),
+ mnemonic#"\t$BD1, $BD2", []> {
+ let mayStore = 1;
+}
+
+// InstRSYb-form conditional store of register $R1 to address $BD2
+// (addressing mode is a template parameter, default bdaddr20only). The
+// condition is carried by two trailing cond4 operands ($valid, $M3); $M3 is
+// printed directly after the mnemonic. `bytes` is recorded as AccessBytes.
+// Marked mayStore; no selection pattern.
+class CondStoreRSY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, mode:$BD2, cond4:$valid, cond4:$M3),
+ mnemonic#"$M3\t$R1, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+ // The two cond4 operands close the operand list.
+ let CCMaskLast = 1;
+}
+
+// Like CondStoreRSY, but used for the raw assembly form. The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+// Here the mask is a single imm32zx4 operand $M3 (there is no $valid
+// operand) and CCMaskLast is not set.
+class AsmCondStoreRSY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, mode:$BD2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $BD2, $M3", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+// Like CondStoreRSY, but with a fixed CC mask.
+// The mask comes from CondVariant V: V.suffix is appended to the mnemonic
+// and M3 is set to V.ccmask, so only $R1 and $BD2 remain as operands.
+// isAsmParserOnly is taken from V.alternate.
+class FixedCondStoreRSY<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, mode:$BD2),
+ mnemonic#V.suffix#"\t$R1, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CondStoreRSY (mask printed in the mnemonic) and is marked isCodeGenOnly;
+// the "Asm"-suffixed one uses AsmCondStoreRSY (mask as explicit operand).
+multiclass CondStoreRSYPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only> {
+ let isCodeGenOnly = 1 in
+ def "" : CondStoreRSY<mnemonic, opcode, cls, bytes, mode>;
+ def Asm : AsmCondStoreRSY<mnemonic, opcode, cls, bytes, mode>;
+}
+
+// InstI-form instruction with a single immediate operand $I1. No outputs
+// and no selection pattern.
+class SideEffectUnaryI<string mnemonic, bits<8> opcode, Immediate imm>
+ : InstI<opcode, (outs), (ins imm:$I1),
+ mnemonic#"\t$I1", []>;
+
+// InstRR-form instruction with a single register operand $R1; the unused
+// R2 field is fixed to 0. No outputs and no selection pattern.
+class SideEffectUnaryRR<string mnemonic, bits<8>opcode, RegisterOperand cls>
+ : InstRR<opcode, (outs), (ins cls:$R1),
+ mnemonic#"\t$R1", []> {
+ let R2 = 0;
+}
+
+// InstRRE-form instruction with a single register operand $R1, selected by
+// the pattern (operator cls:$R1); the unused R2 field is fixed to 0.
+// No outputs.
+class SideEffectUnaryRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ SDPatternOperator operator>
+ : InstRRE<opcode, (outs), (ins cls:$R1),
+ mnemonic#"\t$R1", [(operator cls:$R1)]> {
+ let R2 = 0;
+}
+
+// InstS-form instruction with a single address operand $BD2 (addressing
+// mode is a template parameter, default bdaddr12only), selected by the
+// pattern (operator mode:$BD2). Marked mayLoad, with `bytes` recorded as
+// AccessBytes. No outputs.
+class SideEffectUnaryS<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, bits<5> bytes,
+ AddressingMode mode = bdaddr12only>
+ : InstS<opcode, (outs), (ins mode:$BD2),
+ mnemonic#"\t$BD2", [(operator mode:$BD2)]> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+// InstS-form instruction with a single address operand $BD2 (addressing
+// mode is a template parameter, default bdaddr12only), selected by the
+// pattern (operator mode:$BD2). Unlike SideEffectUnaryS it sets neither
+// mayLoad nor AccessBytes. No outputs.
+class SideEffectAddressS<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator,
+ AddressingMode mode = bdaddr12only>
+ : InstS<opcode, (outs), (ins mode:$BD2),
+ mnemonic#"\t$BD2", [(operator mode:$BD2)]>;
+
+// InstRXa-form instruction producing a GR64 result $R1 from the address
+// operand $XBD2 (addressing mode supplied by the caller), selected by
+// (set GR64:$R1, (operator mode:$XBD2)). mayLoad is not set.
+class LoadAddressRX<string mnemonic, bits<8> opcode,
+ SDPatternOperator operator, AddressingMode mode>
+ : InstRXa<opcode, (outs GR64:$R1), (ins mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set GR64:$R1, (operator mode:$XBD2))]>;
+
+// InstRXYa-form instruction producing a GR64 result $R1 from the address
+// operand $XBD2 (addressing mode supplied by the caller), selected by
+// (set GR64:$R1, (operator mode:$XBD2)). mayLoad is not set.
+class LoadAddressRXY<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, AddressingMode mode>
+ : InstRXYa<opcode, (outs GR64:$R1), (ins mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set GR64:$R1, (operator mode:$XBD2))]>;
+
+// Defines the two displacement variants of a load-address instruction.
+// Both share DispKey = mnemonic and differ in DispSize: the unsuffixed def
+// is LoadAddressRX with laaddr12pair (DispSize "12"), the "Y" def is
+// LoadAddressRXY with laaddr20pair and the mnemonic extended by "y"
+// (DispSize "20").
+multiclass LoadAddressRXPair<string mnemonic, bits<8> rxOpcode,
+ bits<16> rxyOpcode, SDPatternOperator operator> {
+ let DispKey = mnemonic in {
+ let DispSize = "12" in
+ def "" : LoadAddressRX<mnemonic, rxOpcode, operator, laaddr12pair>;
+ let DispSize = "20" in
+ def Y : LoadAddressRXY<mnemonic#"y", rxyOpcode, operator, laaddr20pair>;
+ }
+}
+
+// InstRILb-form instruction producing a GR64 result $R1 from the pcrel32
+// operand $RI2, selected by (set GR64:$R1, (operator pcrel32:$RI2)).
+// mayLoad is not set.
+class LoadAddressRIL<string mnemonic, bits<12> opcode,
+ SDPatternOperator operator>
+ : InstRILb<opcode, (outs GR64:$R1), (ins pcrel32:$RI2),
+ mnemonic#"\t$R1, $RI2",
+ [(set GR64:$R1, (operator pcrel32:$RI2))]>;
+
+// InstRR-form register-to-register unary operation: result $R1 (cls1) is
+// computed from $R2 (cls2) by the pattern
+// (set cls1:$R1, (operator cls2:$R2)).
+class UnaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRR<opcode, (outs cls1:$R1), (ins cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(set cls1:$R1, (operator cls2:$R2))]> {
+ // Key is mnemonic + result register class; this record is the "reg" form.
+ let OpKey = mnemonic#cls1;
+ let OpType = "reg";
+}
+
+// InstRRE-form register-to-register unary operation: result $R1 (cls1) is
+// computed from $R2 (cls2) by the pattern
+// (set cls1:$R1, (operator cls2:$R2)).
+class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1), (ins cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(set cls1:$R1, (operator cls2:$R2))]> {
+ // Key is mnemonic + result register class; this record is the "reg" form.
+ let OpKey = mnemonic#cls1;
+ let OpType = "reg";
+}
+
+// InstRIa-form unary operation on an immediate: result $R1 is computed from
+// $I2 by the pattern (set cls:$R1, (operator imm:$I2)).
+class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm>
+ : InstRIa<opcode, (outs cls:$R1), (ins imm:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(set cls:$R1, (operator imm:$I2))]>;
+
+// InstRILa-form unary operation on an immediate: result $R1 is computed
+// from $I2 by the pattern (set cls:$R1, (operator imm:$I2)).
+class UnaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm>
+ : InstRILa<opcode, (outs cls:$R1), (ins imm:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(set cls:$R1, (operator imm:$I2))]>;
+
+// InstRILb-form unary operation on a PC-relative operand: result $R1 is
+// computed from the pcrel32 operand $RI2 by the pattern
+// (set cls:$R1, (operator pcrel32:$RI2)). Marked mayLoad.
+class UnaryRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls>
+ : InstRILb<opcode, (outs cls:$R1), (ins pcrel32:$RI2),
+ mnemonic#"\t$R1, $RI2",
+ [(set cls:$R1, (operator pcrel32:$RI2))]> {
+ let mayLoad = 1;
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
+// InstRSYb-form conditional load into $R1. The pattern selects between the
+// value produced by `operator` on the address $BD2 and the previous register
+// value $R1src via z_select_ccmask, controlled by the trailing cond4
+// operands ($valid, $M3); $M3 is printed directly after the mnemonic.
+// $R1src is tied to $R1 and excluded from the encoding. Marked mayLoad, with
+// `bytes` recorded as AccessBytes.
+// NOTE(review): the pattern names bdaddr20only for $BD2 while the ins list
+// uses the `mode` parameter; the two agree only for the default mode —
+// confirm this is intended.
+class CondUnaryRSY<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs cls:$R1),
+ (ins cls:$R1src, mode:$BD2, cond4:$valid, cond4:$M3),
+ mnemonic#"$M3\t$R1, $BD2",
+ [(set cls:$R1,
+ (z_select_ccmask (operator bdaddr20only:$BD2), cls:$R1src,
+ cond4:$valid, cond4:$M3))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+ // The two cond4 operands close the operand list.
+ let CCMaskLast = 1;
+}
+
+// Like CondUnaryRSY, but used for the raw assembly form. The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+// Here the mask is a single imm32zx4 operand $M3 (there is no $valid
+// operand), there is no selection pattern, and CCMaskLast is not set.
+class AsmCondUnaryRSY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $BD2, $M3", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// Like CondUnaryRSY, but with a fixed CC mask.
+// The mask comes from CondVariant V: V.suffix is appended to the mnemonic
+// and M3 is set to V.ccmask, so only $R1src and $BD2 remain as inputs.
+// No selection pattern; isAsmParserOnly is taken from V.alternate.
+class FixedCondUnaryRSY<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2),
+ mnemonic#V.suffix#"\t$R1, $BD2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CondUnaryRSY (with the selection pattern) and is marked isCodeGenOnly;
+// the "Asm"-suffixed one uses AsmCondUnaryRSY (mask as explicit operand,
+// no pattern, so `operator` is not forwarded to it).
+multiclass CondUnaryRSYPair<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only> {
+ let isCodeGenOnly = 1 in
+ def "" : CondUnaryRSY<mnemonic, opcode, operator, cls, bytes, mode>;
+ def Asm : AsmCondUnaryRSY<mnemonic, opcode, cls, bytes, mode>;
+}
+
+
+// InstRXa-form unary operation on memory: result $R1 is computed from the
+// address operand $XBD2 (addressing mode is a template parameter, default
+// bdxaddr12only) by the pattern (set cls:$R1, (operator mode:$XBD2)).
+// Marked mayLoad, with `bytes` recorded as AccessBytes.
+class UnaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdxaddr12only>
+ : InstRXa<opcode, (outs cls:$R1), (ins mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set cls:$R1, (operator mode:$XBD2))]> {
+ // Key is mnemonic + "r" + register class; this record is the "mem" form.
+ let OpKey = mnemonic#"r"#cls;
+ let OpType = "mem";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+// InstRXE-form unary operation on memory: result $R1 is computed from the
+// bdxaddr12only address $XBD2 by the pattern
+// (set cls:$R1, (operator bdxaddr12only:$XBD2)). The M3 field is fixed to
+// 0. Marked mayLoad, with `bytes` recorded as AccessBytes.
+class UnaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes>
+ : InstRXE<opcode, (outs cls:$R1), (ins bdxaddr12only:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set cls:$R1, (operator bdxaddr12only:$XBD2))]> {
+ // Key is mnemonic + "r" + register class; this record is the "mem" form.
+ let OpKey = mnemonic#"r"#cls;
+ let OpType = "mem";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+ let M3 = 0;
+}
+
+// InstRXYa-form unary operation on memory: result $R1 is computed from the
+// address operand $XBD2 (addressing mode is a template parameter, default
+// bdxaddr20only) by the pattern (set cls:$R1, (operator mode:$XBD2)).
+// Marked mayLoad, with `bytes` recorded as AccessBytes.
+class UnaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdxaddr20only>
+ : InstRXYa<opcode, (outs cls:$R1), (ins mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set cls:$R1, (operator mode:$XBD2))]> {
+ // Key is mnemonic + "r" + register class; this record is the "mem" form.
+ let OpKey = mnemonic#"r"#cls;
+ let OpType = "mem";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+// Defines the two displacement variants of a unary memory operation. Both
+// share one DispKey (built from mnemonic and cls) and differ in DispSize:
+// the unsuffixed def is UnaryRX with bdxaddr12pair (DispSize "12"), the "Y"
+// def is UnaryRXY with bdxaddr20pair and the mnemonic extended by "y"
+// (DispSize "20").
+multiclass UnaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+ SDPatternOperator operator, RegisterOperand cls,
+ bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : UnaryRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
+ let DispSize = "20" in
+ def Y : UnaryRXY<mnemonic#"y", rxyOpcode, operator, cls, bytes,
+ bdxaddr20pair>;
+ }
+}
+
+// InstVRIa-form vector operation on an immediate: the typed result $V1
+// (tr.op, value type tr.vt) is computed from $I2 by the pattern
+// (set tr.op:$V1, (tr.vt (operator imm:$I2))). `type` is placed in the M3
+// field.
+class UnaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ TypedReg tr, Immediate imm, bits<4> type = 0>
+ : InstVRIa<opcode, (outs tr.op:$V1), (ins imm:$I2),
+ mnemonic#"\t$V1, $I2",
+ [(set tr.op:$V1, (tr.vt (operator imm:$I2)))]> {
+ let M3 = type;
+}
+
+// Generic form of UnaryVRIa: the M3 field is an explicit imm32zx4 operand
+// printed last instead of being fixed by a template argument. Result is an
+// untyped VR128 register; no selection pattern.
+class UnaryVRIaGeneric<string mnemonic, bits<16> opcode, Immediate imm>
+ : InstVRIa<opcode, (outs VR128:$V1), (ins imm:$I2, imm32zx4:$M3),
+ mnemonic#"\t$V1, $I2, $M3", []>;
+
+// InstVRRa-form vector register-to-register unary operation: the typed
+// result $V1 (tr1) is computed from the typed input $V2 (tr2) by the
+// pattern (set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2)))).
+// The M3, M4 and M5 fields are fixed from the template arguments `type`,
+// `m4` and `m5` (all default 0).
+class UnaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0,
+ bits<4> m5 = 0>
+ : InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2),
+ mnemonic#"\t$V1, $V2",
+ [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]> {
+ let M3 = type;
+ let M4 = m4;
+ let M5 = m5;
+}
+
+// Generic form of UnaryVRRa: M3 is an explicit imm32zx4 operand printed
+// last, while M4 and M5 remain fixed from template arguments (default 0).
+// Operates on untyped VR128 registers; no selection pattern.
+class UnaryVRRaGeneric<string mnemonic, bits<16> opcode, bits<4> m4 = 0,
+ bits<4> m5 = 0>
+ : InstVRRa<opcode, (outs VR128:$V1), (ins VR128:$V2, imm32zx4:$M3),
+ mnemonic#"\t$V1, $V2, $M3", []> {
+ let M4 = m4;
+ let M5 = m5;
+}
+
+// Generic InstVRRa form in which both M3 and M4 are explicit imm32zx4
+// operands (printed after $V2); only M5 is fixed from a template argument
+// (default 0). Operates on untyped VR128 registers; no selection pattern.
+class UnaryVRRaFloatGeneric<string mnemonic, bits<16> opcode, bits<4> m5 = 0>
+ : InstVRRa<opcode, (outs VR128:$V1),
+ (ins VR128:$V2, imm32zx4:$M3, imm32zx4:$M4),
+ mnemonic#"\t$V1, $V2, $M3, $M4", []> {
+ let M5 = m5;
+}
+
+// Declare a pair of instructions, one which sets CC and one which doesn't.
+// The CC-setting form ends with "S" and sets the low bit of M5.
+// The form that does not set CC has an extra operand to optionally allow
+// specifying arbitrary M5 values in assembler.
+multiclass UnaryExtraVRRaSPair<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator,
+ SDPatternOperator operator_cc,
+ TypedReg tr1, TypedReg tr2, bits<4> type> {
+ // Non-CC form: M3 = type, M4 = 0, and M5 is an explicit operand. It has no
+ // inline pattern; selection is provided by the Pat below.
+ let M3 = type, M4 = 0 in
+ def "" : InstVRRa<opcode, (outs tr1.op:$V1),
+ (ins tr2.op:$V2, imm32zx4:$M5),
+ mnemonic#"\t$V1, $V2, $M5", []>;
+ // Select `operator` to the non-CC form with M5 = 0.
+ def : Pat<(tr1.vt (operator (tr2.vt tr2.op:$V2))),
+ (!cast<Instruction>(NAME) tr2.op:$V2, 0)>;
+ // Two-operand assembly spelling: omitting M5 means M5 = 0.
+ def : InstAlias<mnemonic#"\t$V1, $V2",
+ (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2, 0)>;
+ // CC-setting form: mnemonic + "s", selected by operator_cc, with m4 = 0
+ // and m5 = 1; declares CC as defined.
+ let Defs = [CC] in
+ def S : UnaryVRRa<mnemonic##"s", opcode, operator_cc, tr1, tr2,
+ type, 0, 1>;
+}
+
+// Generic counterpart of UnaryExtraVRRaSPair. Defines a single instruction
+// on untyped VR128 registers with M4 = 0 and both M3 and M5 as explicit
+// imm32zx4 operands, plus an assembly alias that omits M5 and supplies 0
+// for it. No "S" variant and no selection pattern are defined here.
+multiclass UnaryExtraVRRaSPairGeneric<string mnemonic, bits<16> opcode> {
+ let M4 = 0 in
+ def "" : InstVRRa<opcode, (outs VR128:$V1),
+ (ins VR128:$V2, imm32zx4:$M3, imm32zx4:$M5),
+ mnemonic#"\t$V1, $V2, $M3, $M5", []>;
+ def : InstAlias<mnemonic#"\t$V1, $V2, $M3",
+ (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2,
+ imm32zx4:$M3, 0)>;
+}
+
+// InstVRX-form vector load: the typed result $V1 (tr.op, value type tr.vt)
+// is computed from the bdxaddr12only address $XBD2 by the pattern
+// (set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2))). `type` is placed
+// in the M3 field. Marked mayLoad, with `bytes` recorded as AccessBytes.
+class UnaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ TypedReg tr, bits<5> bytes, bits<4> type = 0>
+ : InstVRX<opcode, (outs tr.op:$V1), (ins bdxaddr12only:$XBD2),
+ mnemonic#"\t$V1, $XBD2",
+ [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2)))]> {
+ let M3 = type;
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+// Generic form of UnaryVRX: M3 is an explicit imm32zx4 operand printed last
+// instead of being fixed by a template argument. Result is an untyped VR128
+// register. Marked mayLoad (AccessBytes is not set); no selection pattern.
+class UnaryVRXGeneric<string mnemonic, bits<16> opcode>
+ : InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
+ mnemonic#"\t$V1, $XBD2, $M3", []> {
+ let mayLoad = 1;
+}
+
+// InstRXa-form instruction taking register $R1 and a bdxaddr12only address
+// $XBD2. No outputs, no selection pattern, and neither mayLoad nor mayStore
+// is set here.
+class SideEffectBinaryRX<string mnemonic, bits<8> opcode,
+ RegisterOperand cls>
+ : InstRXa<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
+ mnemonic##"\t$R1, $XBD2", []>;
+
+// InstRILb-form instruction taking register $R1 and a pcrel32 operand $RI2.
+// No outputs and no selection pattern.
+class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode,
+ RegisterOperand cls>
+ : InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2),
+ mnemonic##"\t$R1, $RI2", []> {
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
+// InstIE-form instruction taking two immediate operands ($I1, $I2) whose
+// operand types are template parameters. No outputs and no selection
+// pattern.
+class SideEffectBinaryIE<string mnemonic, bits<16> opcode,
+ Immediate imm1, Immediate imm2>
+ : InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2),
+ mnemonic#"\t$I1, $I2", []>;
+
+// InstSIL-form instruction taking a bdaddr12only address $BD1 and an
+// immediate $I2, selected by the pattern
+// (operator bdaddr12only:$BD1, imm:$I2). No outputs; neither mayLoad nor
+// mayStore is set here.
+class SideEffectBinarySIL<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, Immediate imm>
+ : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2", [(operator bdaddr12only:$BD1, imm:$I2)]>;
+
+// InstRR-form two-address binary operation: $R1 = operator($R1src, $R2),
+// per the pattern (set cls1:$R1, (operator cls1:$R1src, cls2:$R2)).
+// $R1src is tied to $R1 through Constraints and listed in DisableEncoding,
+// so only $R1 and $R2 appear in the assembly string.
+class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+ // Key is mnemonic + result register class; this record is the "reg" form.
+ let OpKey = mnemonic#cls1;
+ let OpType = "reg";
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRRE-form two-address binary operation: $R1 = operator($R1src, $R2),
+// per the pattern (set cls1:$R1, (operator cls1:$R1src, cls2:$R2)).
+// $R1src is tied to $R1 through Constraints and listed in DisableEncoding,
+// so only $R1 and $R2 appear in the assembly string.
+class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+ // Key is mnemonic + result register class; this record is the "reg" form.
+ let OpKey = mnemonic#cls1;
+ let OpType = "reg";
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// InstRRFa-form three-operand binary operation: $R1 = operator($R2, $R3),
+// per the pattern (set cls1:$R1, (operator cls2:$R2, cls3:$R3)). Operands
+// are printed as "$R1, $R2, $R3". The M4 field is fixed to 0.
+class BinaryRRFa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFa<opcode, (outs cls1:$R1), (ins cls2:$R2, cls3:$R3),
+ mnemonic#"\t$R1, $R2, $R3",
+ [(set cls1:$R1, (operator cls2:$R2, cls3:$R3))]> {
+ let M4 = 0;
+}
+
+// Defines the two- and three-operand forms of a register binary operation.
+// Both share NumOpsKey = mnemonic and differ in NumOpsValue:
+// - "K" (NumOpsValue "3"): BinaryRRFa with mnemonic + "k", no selection
+// pattern (null_frag), requires FeatureDistinctOps.
+// - unsuffixed (NumOpsValue "2"): BinaryRR carrying the selection pattern,
+// flagged isConvertibleToThreeAddress.
+multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
+ SDPatternOperator operator, RegisterOperand cls1,
+ RegisterOperand cls2> {
+ let NumOpsKey = mnemonic in {
+ let NumOpsValue = "3" in
+ def K : BinaryRRFa<mnemonic#"k", opcode2, null_frag, cls1, cls1, cls2>,
+ Requires<[FeatureDistinctOps]>;
+ let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ def "" : BinaryRR<mnemonic, opcode1, operator, cls1, cls2>;
+ }
+}
+
+// Same scheme as BinaryRRAndK, but the two-operand form is BinaryRRE (both
+// opcodes are 16 bits wide). Both defs share NumOpsKey = mnemonic:
+// - "K" (NumOpsValue "3"): BinaryRRFa with mnemonic + "k", no selection
+// pattern (null_frag), requires FeatureDistinctOps.
+// - unsuffixed (NumOpsValue "2"): BinaryRRE carrying the selection pattern,
+// flagged isConvertibleToThreeAddress.
+multiclass BinaryRREAndK<string mnemonic, bits<16> opcode1, bits<16> opcode2,
+ SDPatternOperator operator, RegisterOperand cls1,
+ RegisterOperand cls2> {
+ let NumOpsKey = mnemonic in {
+ let NumOpsValue = "3" in
+ def K : BinaryRRFa<mnemonic#"k", opcode2, null_frag, cls1, cls1, cls2>,
+ Requires<[FeatureDistinctOps]>;
+ let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ def "" : BinaryRRE<mnemonic, opcode1, operator, cls1, cls2>;
+ }
+}
+
+// InstRRFb-form three-operand binary operation: $R1 = operator($R2, $R3),
+// per the pattern (set cls1:$R1, (operator cls2:$R2, cls3:$R3)). Note that
+// the assembly string prints $R3 before $R2 ("$R1, $R3, $R2"). The M4 field
+// is fixed to 0.
+class BinaryRRFb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs cls1:$R1), (ins cls2:$R2, cls3:$R3),
+ mnemonic#"\t$R1, $R3, $R2",
+ [(set cls1:$R1, (operator cls2:$R2, cls3:$R3))]> {
+ let M4 = 0;
+}
+
+// InstRRFe-form instruction producing $R1 (cls1) from an imm32zx4 operand
+// $M3 and a register $R2 (cls2), printed as "$R1, $M3, $R2". The M4 field
+// is fixed to 0. No selection pattern.
+class BinaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2>
+ : InstRRFe<opcode, (outs cls1:$R1), (ins imm32zx4:$M3, cls2:$R2),
+ mnemonic#"\t$R1, $M3, $R2", []> {
+ let M4 = 0;
+}
+
+// InstRRFc-form conditional register operation writing $R1. Inputs are the
+// previous value $R1src (tied to $R1 and excluded from the encoding), the
+// register $R2, and two trailing cond4 operands ($valid, $M3); $M3 is
+// printed directly after the mnemonic. No selection pattern.
+class CondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls1:$R1),
+ (ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3),
+ mnemonic#"$M3\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ // The two cond4 operands close the operand list.
+ let CCMaskLast = 1;
+}
+
+// Like CondBinaryRRF, but used for the raw assembly form. The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+// Here the mask is a single imm32zx4 operand $M3 (there is no $valid
+// operand) and CCMaskLast is not set.
+class AsmCondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls1:$R1),
+ (ins cls1:$R1src, cls2:$R2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// Like CondBinaryRRF, but with a fixed CC mask.
+// The mask comes from CondVariant V: V.suffix is appended to the mnemonic
+// and M3 is set to V.ccmask, so only $R1src and $R2 remain as inputs.
+// No selection pattern; isAsmParserOnly is taken from V.alternate.
+class FixedCondBinaryRRF<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+ mnemonic#V.suffix#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let isAsmParserOnly = V.alternate;
+ let M3 = V.ccmask;
+}
+
+// Defines two records for one mnemonic/opcode: the unsuffixed one uses
+// CondBinaryRRF (mask printed in the mnemonic) and is marked isCodeGenOnly;
+// the "Asm"-suffixed one uses AsmCondBinaryRRF (mask as explicit operand).
+multiclass CondBinaryRRFPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2> {
+ let isCodeGenOnly = 1 in
+ def "" : CondBinaryRRF<mnemonic, opcode, cls1, cls2>;
+ def Asm : AsmCondBinaryRRF<mnemonic, opcode, cls1, cls2>;
+}
+
+// InstRIa-form two-address binary operation with an immediate:
+// $R1 = operator($R1src, $I2), per the pattern
+// (set cls:$R1, (operator cls:$R1src, imm:$I2)). $R1src is tied to $R1
+// through Constraints and listed in DisableEncoding.
+class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm>
+ : InstRIa<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// Three-operand register/immediate operation in the RIEd format: $R1 is
+// produced from the separate source register $R3 and the immediate $I2.
+class BinaryRIE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    RegisterOperand cls, Immediate imm>
+  : InstRIEd<opcode,
+      (outs cls:$R1),
+      (ins cls:$R3, imm:$I2),
+      mnemonic#"\t$R1, $R3, $I2",
+      [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+// Defines a two-operand BinaryRI instruction and its three-operand "K"
+// BinaryRIE sibling under a shared NumOpsKey.  Only the two-operand form
+// carries `operator`; the K form uses null_frag, requires
+// FeatureDistinctOps, and appends "k" to the mnemonic.
+multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2,
+    SDPatternOperator operator, RegisterOperand cls, Immediate imm> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in {
+      def K : BinaryRIE<mnemonic##"k", opcode2, null_frag, cls, imm>,
+              Requires<[FeatureDistinctOps]>;
+    }
+    let isConvertibleToThreeAddress = 1, NumOpsValue = "2" in {
+      def "" : BinaryRI<mnemonic, opcode1, operator, cls, imm>;
+    }
+  }
+}
+
+// Conditional register/immediate operation in the RIEg format.  $R1src is
+// tied to $R1 and left out of the encoding.  The pattern is a
+// z_select_ccmask over the immediate $I2, $R1src, and the two cond4 operands.
+class CondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+    Immediate imm>
+  : InstRIEg<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, imm:$I2, cond4:$valid, cond4:$M3),
+      mnemonic#"$M3\t$R1, $I2",
+      [(set cls:$R1,
+            (z_select_ccmask imm:$I2, cls:$R1src, cond4:$valid, cond4:$M3))]> {
+  let CCMaskLast = 1;
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+}
+
+// Raw-assembly counterpart of CondBinaryRIE: the condition-code mask is
+// written as an explicit third operand ($M3) instead of being part of the
+// mnemonic.
+class AsmCondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+    Immediate imm>
+  : InstRIEg<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, imm:$I2, imm32zx4:$M3),
+      mnemonic#"\t$R1, $I2, $M3",
+      []> {
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+}
+
+// Variant of CondBinaryRIE whose CC mask is not an operand: M3 is taken from
+// V.ccmask and the mnemonic carries V.suffix.  isAsmParserOnly follows
+// V.alternate.
+class FixedCondBinaryRIE<CondVariant V, string mnemonic, bits<16> opcode,
+    RegisterOperand cls, Immediate imm>
+  : InstRIEg<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, imm:$I2),
+      mnemonic#V.suffix#"\t$R1, $I2",
+      []> {
+  let M3 = V.ccmask;
+  let isAsmParserOnly = V.alternate;
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+}
+
+// Defines the CondBinaryRIE instruction (flagged isCodeGenOnly) together with
+// its raw-assembly twin, which gets the "Asm" name suffix.
+multiclass CondBinaryRIEPair<string mnemonic, bits<16> opcode,
+    RegisterOperand cls, Immediate imm> {
+  let isCodeGenOnly = 1 in {
+    def "" : CondBinaryRIE<mnemonic, opcode, cls, imm>;
+  }
+  def Asm : AsmCondBinaryRIE<mnemonic, opcode, cls, imm>;
+}
+
+// Two-address register/immediate operation in the RILa format.  $R1src is
+// tied to $R1 and left out of the encoding; the pattern applies `operator`
+// to $R1src and the immediate $I2.
+class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+    RegisterOperand cls, Immediate imm>
+  : InstRILa<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, imm:$I2),
+      mnemonic#"\t$R1, $I2",
+      [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+}
+
+// Two-address operation in the RSa format taking a shift12only operand $BD2.
+// $R1src is tied to $R1 and left out of the encoding; R3 is fixed at 0.
+class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+    RegisterOperand cls>
+  : InstRSa<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, shift12only:$BD2),
+      mnemonic#"\t$R1, $BD2",
+      [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> {
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+  let R3 = 0;
+}
+
+// Three-operand operation in the RSYa format: $R1 is produced from the
+// separate source register $R3 and the shift20only operand $BD2.
+class BinaryRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    RegisterOperand cls>
+  : InstRSYa<opcode,
+      (outs cls:$R1),
+      (ins cls:$R3, shift20only:$BD2),
+      mnemonic#"\t$R1, $R3, $BD2",
+      [(set cls:$R1, (operator cls:$R3, shift20only:$BD2))]>;
+
+// Defines a two-operand BinaryRS instruction and its three-operand "K"
+// BinaryRSY sibling under a shared NumOpsKey.  Only the two-operand form
+// carries `operator`; the K form uses null_frag, requires
+// FeatureDistinctOps, and appends "k" to the mnemonic.
+multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
+    SDPatternOperator operator, RegisterOperand cls> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in {
+      def K : BinaryRSY<mnemonic##"k", opcode2, null_frag, cls>,
+              Requires<[FeatureDistinctOps]>;
+    }
+    let isConvertibleToThreeAddress = 1, NumOpsValue = "2" in {
+      def "" : BinaryRS<mnemonic, opcode1, operator, cls>;
+    }
+  }
+}
+
+// Two-address register/memory operation in the RXa format.  The second
+// operand is loaded via `load` from $XBD2 (default mode bdxaddr12only);
+// $R1src is tied to $R1.  `bytes` is recorded in AccessBytes.
+class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+    RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+    AddressingMode mode = bdxaddr12only>
+  : InstRXa<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, mode:$XBD2),
+      mnemonic#"\t$R1, $XBD2",
+      [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+  let OpType = "mem";
+  let OpKey = mnemonic#"r"#cls;
+}
+
+// Two-address register/memory operation in the RXE format with a
+// bdxaddr12only address.  $R1src is tied to $R1; M3 is fixed at 0 and
+// `bytes` is recorded in AccessBytes.
+class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXE<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, bdxaddr12only:$XBD2),
+      mnemonic#"\t$R1, $XBD2",
+      [(set cls:$R1,
+            (operator cls:$R1src, (load bdxaddr12only:$XBD2)))]> {
+  let M3 = 0;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+  let OpType = "mem";
+  let OpKey = mnemonic#"r"#cls;
+}
+
+// Two-address register/memory operation in the RXYa format.  The second
+// operand is loaded via `load` from $XBD2 (default mode bdxaddr20only);
+// $R1src is tied to $R1.  `bytes` is recorded in AccessBytes.
+class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+    AddressingMode mode = bdxaddr20only>
+  : InstRXYa<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, mode:$XBD2),
+      mnemonic#"\t$R1, $XBD2",
+      [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+  let OpType = "mem";
+  let OpKey = mnemonic#"r"#cls;
+}
+
+// Defines a BinaryRX instruction (DispSize "12", bdxaddr12pair) and a
+// BinaryRXY instruction (DispSize "20", bdxaddr20pair, "y" mnemonic suffix,
+// "Y" name suffix) that share one DispKey.
+multiclass BinaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+    SDPatternOperator operator, RegisterOperand cls,
+    SDPatternOperator load, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in {
+      def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bytes,
+                        bdxaddr12pair>;
+    }
+    let DispSize = "20" in {
+      def Y : BinaryRXY<mnemonic#"y", rxyOpcode, operator, cls, load, bytes,
+                        bdxaddr20pair>;
+    }
+  }
+}
+
+// Memory/immediate read-modify-write operation in the SI format: the pattern
+// stores `operator` of the value loaded from $BD1 and the immediate $I2 back
+// to $BD1.  Default addressing mode is bdaddr12only.
+class BinarySI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+    Operand imm, AddressingMode mode = bdaddr12only>
+  : InstSI<opcode,
+      (outs),
+      (ins mode:$BD1, imm:$I2),
+      mnemonic#"\t$BD1, $I2",
+      [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+  let mayStore = 1;
+  let mayLoad = 1;
+}
+
+// Memory/immediate read-modify-write operation in the SIY format: the pattern
+// stores `operator` of the value loaded from $BD1 and the immediate $I2 back
+// to $BD1.  Default addressing mode is bdaddr20only.
+class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    Operand imm, AddressingMode mode = bdaddr20only>
+  : InstSIY<opcode,
+      (outs),
+      (ins mode:$BD1, imm:$I2),
+      mnemonic#"\t$BD1, $I2",
+      [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+  let mayStore = 1;
+  let mayLoad = 1;
+}
+
+// Defines a BinarySI instruction (DispSize "12", bdaddr12pair) and a
+// BinarySIY instruction (DispSize "20", bdaddr20pair, "y" mnemonic suffix,
+// "Y" name suffix) that share one DispKey.
+//
+// The key is the mnemonic alone, as in CompareSIPair: this multiclass has no
+// register-class parameter, so there is no `cls` to append to it.
+multiclass BinarySIPair<string mnemonic, bits<8> siOpcode,
+ bits<16> siyOpcode, SDPatternOperator operator,
+ Operand imm> {
+ let DispKey = mnemonic in {
+ let DispSize = "12" in
+ def "" : BinarySI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : BinarySIY<mnemonic#"y", siyOpcode, operator, imm, bdaddr20pair>;
+ }
+}
+
+// Operation in the SSF format producing register $R3 from two bdaddr12pair
+// memory operands, $BD1 and $BD2.  No selection pattern; marked mayLoad.
+class BinarySSF<string mnemonic, bits<12> opcode, RegisterOperand cls>
+  : InstSSF<opcode,
+      (outs cls:$R3),
+      (ins bdaddr12pair:$BD1, bdaddr12pair:$BD2),
+      mnemonic#"\t$R3, $BD1, $BD2",
+      []> {
+  let mayLoad = 1;
+}
+
+// Vector operation in the VRIb format taking two imm32zx8 immediates, $I2 and
+// $I3.  The result has type tr.vt; M4 is fixed to `type`.
+class BinaryVRIb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr, bits<4> type>
+  : InstVRIb<opcode,
+      (outs tr.op:$V1),
+      (ins imm32zx8:$I2, imm32zx8:$I3),
+      mnemonic#"\t$V1, $I2, $I3",
+      [(set tr.op:$V1,
+            (tr.vt (operator imm32zx8:$I2, imm32zx8:$I3)))]> {
+  let M4 = type;
+}
+
+// Pattern-less VRIb form on VR128 in which M4 is an explicit imm32zx4 operand
+// instead of a fixed value.
+class BinaryVRIbGeneric<string mnemonic, bits<16> opcode>
+  : InstVRIb<opcode,
+      (outs VR128:$V1),
+      (ins imm32zx8:$I2, imm32zx8:$I3, imm32zx4:$M4),
+      mnemonic#"\t$V1, $I2, $I3, $M4",
+      []>;
+
+// Vector operation in the VRIc format: $V1 (type tr1.vt) is produced from the
+// vector $V3 (type tr2.vt) and the imm32zx16 immediate $I2.  M4 is fixed to
+// `type`.
+class BinaryVRIc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type>
+  : InstVRIc<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V3, imm32zx16:$I2),
+      mnemonic#"\t$V1, $V3, $I2",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V3), imm32zx16:$I2)))]> {
+  let M4 = type;
+}
+
+// Pattern-less VRIc form on VR128 in which M4 is an explicit imm32zx4 operand
+// instead of a fixed value.
+class BinaryVRIcGeneric<string mnemonic, bits<16> opcode>
+  : InstVRIc<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V3, imm32zx16:$I2, imm32zx4:$M4),
+      mnemonic#"\t$V1, $V3, $I2, $M4",
+      []>;
+
+// Vector operation in the VRIe format: $V1 (type tr1.vt) is produced from the
+// vector $V2 (type tr2.vt) and the imm32zx12 immediate $I3.  M4 is fixed to
+// `type` and M5 to `m5`.
+class BinaryVRIe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m5>
+  : InstVRIe<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, imm32zx12:$I3),
+      mnemonic#"\t$V1, $V2, $I3",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2), imm32zx12:$I3)))]> {
+  let M5 = m5;
+  let M4 = type;
+}
+
+// Pattern-less VRIe form on VR128 in which both M4 and M5 are explicit
+// imm32zx4 operands.
+class BinaryVRIeFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRIe<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V2, imm32zx12:$I3, imm32zx4:$M4, imm32zx4:$M5),
+      mnemonic#"\t$V1, $V2, $I3, $M4, $M5",
+      []>;
+
+// Vector operation in the VRRa format: $V1 (type tr1.vt) is produced from the
+// vector $V2 (type tr2.vt) and the imm32zx4 operand $M5.  M3 is fixed to
+// `type` and M4 to `m4`.
+//
+// The pattern matches $M5 with the same imm32zx4 class that the operand list
+// declares, so selection cannot accept an immediate wider than the operand.
+class BinaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0>
+ : InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx4:$M5),
+ mnemonic#"\t$V1, $V2, $M5",
+ [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+ imm32zx4:$M5)))]> {
+ let M3 = type;
+ let M4 = m4;
+}
+
+// Pattern-less VRRa form on VR128 in which M3, M4 and M5 are all explicit
+// imm32zx4 operands.
+class BinaryVRRaFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRa<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V2, imm32zx4:$M3, imm32zx4:$M4, imm32zx4:$M5),
+      mnemonic#"\t$V1, $V2, $M3, $M4, $M5",
+      []>;
+
+// Vector operation in the VRRb format: $V1 (type tr1.vt) is produced from the
+// vectors $V2 and $V3 (both type tr2.vt).  M4 is fixed to `type` and M5 to
+// `modifier`.
+class BinaryVRRb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> modifier = 0>
+  : InstVRRb<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, tr2.op:$V3),
+      mnemonic#"\t$V1, $V2, $V3",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                              (tr2.vt tr2.op:$V3))))]> {
+  let M5 = modifier;
+  let M4 = type;
+}
+
+// Defines two BinaryVRRb instructions that differ in whether they set CC.
+// The plain form gets `modifier` with its low bit cleared as M5.  The "S"
+// form ("s" mnemonic suffix, Defs = [CC]) gets the same value with the low
+// bit set, and uses operator_cc.
+multiclass BinaryVRRbSPair<string mnemonic, bits<16> opcode,
+    SDPatternOperator operator, SDPatternOperator operator_cc,
+    TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> modifier = 0> {
+  def "" : BinaryVRRb<mnemonic, opcode, operator, tr1, tr2, type,
+                      !and (modifier, 14)>;
+  let Defs = [CC] in {
+    def S : BinaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+                       !add (!and (modifier, 14), 1)>;
+  }
+}
+
+// Pattern-less VRRb form on VR128 in which both M4 and M5 are explicit
+// imm32zx4 operands.
+class BinaryVRRbSPairGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRb<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5),
+      mnemonic#"\t$V1, $V2, $V3, $M4, $M5",
+      []>;
+
+// Defines a CC-setting / non-CC-setting pair of VRRb instructions.
+// The non-CC form exposes M5 as an explicit imm32zx4 operand so arbitrary M5
+// values can be written in assembly; a Pat selects it with M5 = 0 and an
+// InstAlias accepts the three-operand spelling (also with M5 = 0).
+// The "S" form is a BinaryVRRb with M5 fixed at 1 and Defs = [CC].
+multiclass BinaryExtraVRRbSPair<string mnemonic, bits<16> opcode,
+    SDPatternOperator operator, SDPatternOperator operator_cc,
+    TypedReg tr1, TypedReg tr2, bits<4> type> {
+  let M4 = type in {
+    def "" : InstVRRb<opcode,
+               (outs tr1.op:$V1),
+               (ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M5),
+               mnemonic#"\t$V1, $V2, $V3, $M5",
+               []>;
+  }
+  def : Pat<(tr1.vt (operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3))),
+            (!cast<Instruction>(NAME) tr2.op:$V2, tr2.op:$V3, 0)>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3",
+                  (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
+                                            tr2.op:$V3, 0)>;
+  let Defs = [CC] in {
+    def S : BinaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type, 1>;
+  }
+}
+
+// Pattern-less VRRb form on VR128 with explicit M4 and M5 operands, plus an
+// InstAlias that accepts the spelling without $M5 and supplies 0 for it.
+multiclass BinaryExtraVRRbSPairGeneric<string mnemonic, bits<16> opcode> {
+  def "" : InstVRRb<opcode,
+             (outs VR128:$V1),
+             (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $V3, $M4, $M5",
+             []>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $M4",
+                  (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3,
+                                            imm32zx4:$M4, 0)>;
+}
+
+// Vector operation in the VRRc format: $V1 (type tr1.vt) is produced from the
+// vectors $V2 and $V3 (both type tr2.vt).  M4, M5 and M6 are fixed to `type`,
+// `m5` and `m6` respectively.
+class BinaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m5 = 0,
+    bits<4> m6 = 0>
+  : InstVRRc<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, tr2.op:$V3),
+      mnemonic#"\t$V1, $V2, $V3",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                              (tr2.vt tr2.op:$V3))))]> {
+  let M6 = m6;
+  let M5 = m5;
+  let M4 = type;
+}
+
+// Pattern-less VRRc form on VR128 in which M4 is an explicit imm32zx4
+// operand; M5 and M6 remain fixed to the `m5` and `m6` arguments.
+class BinaryVRRcGeneric<string mnemonic, bits<16> opcode, bits<4> m5 = 0,
+    bits<4> m6 = 0>
+  : InstVRRc<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V2, VR128:$V3, imm32zx4:$M4),
+      mnemonic#"\t$V1, $V2, $V3, $M4",
+      []> {
+  let M6 = m6;
+  let M5 = m5;
+}
+
+// Pattern-less VRRc form on VR128 in which M4 and M5 are explicit imm32zx4
+// operands; M6 remains fixed to the `m6` argument.
+class BinaryVRRcFloatGeneric<string mnemonic, bits<16> opcode, bits<4> m6 = 0>
+  : InstVRRc<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5),
+      mnemonic#"\t$V1, $V2, $V3, $M4, $M5",
+      []> {
+  let M6 = m6;
+}
+
+// Declare a pair of instructions, one which sets CC and one which doesn't.
+// The CC-setting form ends with "S" and sets the low bit of M6.
+multiclass BinaryVRRcSPair<string mnemonic, bits<16> opcode,
+    SDPatternOperator operator, SDPatternOperator operator_cc,
+    TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m5,
+    bits<4> modifier = 0> {
+  // `modifier` is passed as BinaryVRRc's m6 argument, low bit cleared here.
+  def "" : BinaryVRRc<mnemonic, opcode, operator, tr1, tr2, type,
+                      m5, !and (modifier, 14)>;
+  // Same m6 value with the low bit set; this form is marked as defining CC.
+  let Defs = [CC] in {
+    def S : BinaryVRRc<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+                       m5, !add (!and (modifier, 14), 1)>;
+  }
+}
+
+// Pattern-less VRRc form on VR128 in which M4, M5 and M6 are all explicit
+// imm32zx4 operands.
+class BinaryVRRcSPairFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRc<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V2, VR128:$V3,
+           imm32zx4:$M4, imm32zx4:$M5, imm32zx4:$M6),
+      mnemonic#"\t$V1, $V2, $V3, $M4, $M5, $M6",
+      []>;
+
+// Vector operation in the VRRf format: $V1 (type tr.vt) is produced from the
+// two GR64 registers $R2 and $R3.
+class BinaryVRRf<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr>
+  : InstVRRf<opcode,
+      (outs tr.op:$V1),
+      (ins GR64:$R2, GR64:$R3),
+      mnemonic#"\t$V1, $R2, $R3",
+      [(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>;
+
+// Vector operation in the VRSa format: $V1 (type tr1.vt) is produced from the
+// vector $V3 (type tr2.vt) and the shift12only operand $BD2.  M4 is fixed to
+// `type`.
+class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type>
+  : InstVRSa<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V3, shift12only:$BD2),
+      mnemonic#"\t$V1, $V3, $BD2",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V3), shift12only:$BD2)))]> {
+  let M4 = type;
+}
+
+// Pattern-less VRSa form on VR128 in which M4 is an explicit imm32zx4 operand
+// instead of a fixed value.
+class BinaryVRSaGeneric<string mnemonic, bits<16> opcode>
+  : InstVRSa<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V3, shift12only:$BD2, imm32zx4:$M4),
+      mnemonic#"\t$V1, $V3, $BD2, $M4",
+      []>;
+
+// Vector load-style operation in the VRSb format: VR128 $V1 is produced from
+// the GR32 register $R3 and the bdaddr12only address $BD2.  M4 is fixed at 0;
+// marked mayLoad with `bytes` recorded in AccessBytes.
+class BinaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    bits<5> bytes>
+  : InstVRSb<opcode,
+      (outs VR128:$V1),
+      (ins GR32:$R3, bdaddr12only:$BD2),
+      mnemonic#"\t$V1, $R3, $BD2",
+      [(set VR128:$V1, (operator GR32:$R3, bdaddr12only:$BD2))]> {
+  let AccessBytes = bytes;
+  let mayLoad = 1;
+  let M4 = 0;
+}
+
+// Operation in the VRSc format producing the GR64 register $R1 from the
+// vector $V3 (type tr.vt) and the shift12only operand $BD2.  M4 is fixed to
+// `type`.
+class BinaryVRSc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr, bits<4> type>
+  : InstVRSc<opcode,
+      (outs GR64:$R1),
+      (ins tr.op:$V3, shift12only:$BD2),
+      mnemonic#"\t$R1, $V3, $BD2",
+      [(set GR64:$R1,
+            (operator (tr.vt tr.op:$V3), shift12only:$BD2))]> {
+  let M4 = type;
+}
+
+// Pattern-less VRSc form (GR64 result, VR128 source) in which M4 is an
+// explicit imm32zx4 operand instead of a fixed value.
+class BinaryVRScGeneric<string mnemonic, bits<16> opcode>
+  : InstVRSc<opcode,
+      (outs GR64:$R1),
+      (ins VR128:$V3, shift12only:$BD2, imm32zx4:$M4),
+      mnemonic#"\t$R1, $V3, $BD2, $M4",
+      []>;
+
+// Vector load-style operation in the VRX format: $V1 is produced from the
+// bdxaddr12only address $XBD2 and the imm32zx4 operand $M3.  Marked mayLoad
+// with `bytes` recorded in AccessBytes.
+// NOTE(review): the output is declared as VR128 while the pattern sets
+// tr.op:$V1 -- presumably only used with a TypedReg whose op is VR128;
+// confirm against the instantiations.
+class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr, bits<5> bytes>
+  : InstVRX<opcode,
+      (outs VR128:$V1),
+      (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
+      mnemonic#"\t$V1, $XBD2, $M3",
+      [(set tr.op:$V1,
+            (tr.vt (operator bdxaddr12only:$XBD2, imm32zx4:$M3)))]> {
+  let AccessBytes = bytes;
+  let mayLoad = 1;
+}
+
+// Pattern-less store in the VRV format: takes VR128 $V1, the bdvaddr12only
+// address $VBD2 and an `index` immediate $M3.  Marked mayStore with `bytes`
+// recorded in AccessBytes.
+class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
+    Immediate index>
+  : InstVRV<opcode,
+      (outs),
+      (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3),
+      mnemonic#"\t$V1, $VBD2, $M3",
+      []> {
+  let AccessBytes = bytes;
+  let mayStore = 1;
+}
+
+// Store in the VRX format: `operator` is applied to the vector $V1 (type
+// tr.vt), the bdxaddr12only address $XBD2 and the `index` immediate $M3.
+// Marked mayStore with `bytes` recorded in AccessBytes.
+class StoreBinaryVRX<string mnemonic, bits<16> opcode,
+    SDPatternOperator operator, TypedReg tr, bits<5> bytes,
+    Immediate index>
+  : InstVRX<opcode,
+      (outs),
+      (ins tr.op:$V1, bdxaddr12only:$XBD2, index:$M3),
+      mnemonic#"\t$V1, $XBD2, $M3",
+      [(operator (tr.vt tr.op:$V1), bdxaddr12only:$XBD2, index:$M3)]> {
+  let AccessBytes = bytes;
+  let mayStore = 1;
+}
+
+// Pattern-less instruction in the SSd format with no outputs: takes the
+// bdraddr12only operand $RBD1, the bdaddr12only operand $BD2 and the
+// register $R3.
+class MemoryBinarySSd<string mnemonic, bits<8> opcode, RegisterOperand cls>
+  : InstSSd<opcode,
+      (outs),
+      (ins bdraddr12only:$RBD1, bdaddr12only:$BD2, cls:$R3),
+      mnemonic#"\t$RBD1, $BD2, $R3",
+      []>;
+
+// Register/register comparison in the RR format.  Has no outputs; the pattern
+// applies `operator` to $R1 and $R2.  Flagged isCompare, with OpType "reg".
+class CompareRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+    RegisterOperand cls1, RegisterOperand cls2>
+  : InstRR<opcode,
+      (outs),
+      (ins cls1:$R1, cls2:$R2),
+      mnemonic#"\t$R1, $R2",
+      [(operator cls1:$R1, cls2:$R2)]> {
+  let isCompare = 1;
+  let OpType = "reg";
+  let OpKey = mnemonic#cls1;
+}
+
+// Register/register comparison in the RRE format.  Has no outputs; the
+// pattern applies `operator` to $R1 and $R2.  Flagged isCompare, with OpType
+// "reg".
+class CompareRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRE<opcode,
+      (outs),
+      (ins cls1:$R1, cls2:$R2),
+      mnemonic#"\t$R1, $R2",
+      [(operator cls1:$R1, cls2:$R2)]> {
+  let isCompare = 1;
+  let OpType = "reg";
+  let OpKey = mnemonic#cls1;
+}
+
+// Register/immediate comparison in the RIa format.  Has no outputs; the
+// pattern applies `operator` to $R1 and the immediate $I2.
+class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+    RegisterOperand cls, Immediate imm>
+  : InstRIa<opcode,
+      (outs),
+      (ins cls:$R1, imm:$I2),
+      mnemonic#"\t$R1, $I2",
+      [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+// Register/immediate comparison in the RILa format.  Has no outputs; the
+// pattern applies `operator` to $R1 and the immediate $I2.
+class CompareRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+    RegisterOperand cls, Immediate imm>
+  : InstRILa<opcode,
+      (outs),
+      (ins cls:$R1, imm:$I2),
+      mnemonic#"\t$R1, $I2",
+      [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+// Comparison in the RILb format between register $R1 and a value loaded via
+// `load` from the pcrel32 address $RI2.  Flagged isCompare and mayLoad.
+class CompareRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+    RegisterOperand cls, SDPatternOperator load>
+  : InstRILb<opcode,
+      (outs),
+      (ins cls:$R1, pcrel32:$RI2),
+      mnemonic#"\t$R1, $RI2",
+      [(operator cls:$R1, (load pcrel32:$RI2))]> {
+  // PC-relative addresses should be tried before BD and BDX addresses.  BDX
+  // forms have two extra operands and are therefore 6 units more complex,
+  // hence the value 7.
+  let AddedComplexity = 7;
+  let mayLoad = 1;
+  let isCompare = 1;
+}
+
+// Register/memory comparison in the RXa format: `operator` is applied to $R1
+// and the value loaded via `load` from $XBD2 (default mode bdxaddr12only).
+// Flagged isCompare and mayLoad; `bytes` is recorded in AccessBytes.
+class CompareRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+    RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+    AddressingMode mode = bdxaddr12only>
+  : InstRXa<opcode,
+      (outs),
+      (ins cls:$R1, mode:$XBD2),
+      mnemonic#"\t$R1, $XBD2",
+      [(operator cls:$R1, (load mode:$XBD2))]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let OpType = "mem";
+  let OpKey = mnemonic#"r"#cls;
+}
+
+// Register/memory comparison in the RXE format with a bdxaddr12only address.
+// Flagged isCompare and mayLoad; M3 is fixed at 0 and `bytes` is recorded in
+// AccessBytes.
+class CompareRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXE<opcode,
+      (outs),
+      (ins cls:$R1, bdxaddr12only:$XBD2),
+      mnemonic#"\t$R1, $XBD2",
+      [(operator cls:$R1, (load bdxaddr12only:$XBD2))]> {
+  let M3 = 0;
+  let isCompare = 1;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let OpType = "mem";
+  let OpKey = mnemonic#"r"#cls;
+}
+
+// Register/memory comparison in the RXYa format: `operator` is applied to $R1
+// and the value loaded via `load` from $XBD2 (default mode bdxaddr20only).
+// Flagged isCompare and mayLoad; `bytes` is recorded in AccessBytes.
+class CompareRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+    AddressingMode mode = bdxaddr20only>
+  : InstRXYa<opcode,
+      (outs),
+      (ins cls:$R1, mode:$XBD2),
+      mnemonic#"\t$R1, $XBD2",
+      [(operator cls:$R1, (load mode:$XBD2))]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let OpType = "mem";
+  let OpKey = mnemonic#"r"#cls;
+}
+
+// Defines a CompareRX instruction (DispSize "12", bdxaddr12pair) and a
+// CompareRXY instruction (DispSize "20", bdxaddr20pair, "y" mnemonic suffix,
+// "Y" name suffix) that share one DispKey.
+multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+    SDPatternOperator operator, RegisterOperand cls,
+    SDPatternOperator load, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in {
+      def "" : CompareRX<mnemonic, rxOpcode, operator, cls,
+                         load, bytes, bdxaddr12pair>;
+    }
+    let DispSize = "20" in {
+      def Y : CompareRXY<mnemonic#"y", rxyOpcode, operator, cls,
+                         load, bytes, bdxaddr20pair>;
+    }
+  }
+}
+
+// Memory/immediate comparison in the SI format: `operator` is applied to the
+// value loaded via `load` from $BD1 and the immediate $I2.  Default
+// addressing mode is bdaddr12only.
+class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+    SDPatternOperator load, Immediate imm,
+    AddressingMode mode = bdaddr12only>
+  : InstSI<opcode,
+      (outs),
+      (ins mode:$BD1, imm:$I2),
+      mnemonic#"\t$BD1, $I2",
+      [(operator (load mode:$BD1), imm:$I2)]> {
+  let mayLoad = 1;
+  let isCompare = 1;
+}
+
+// Memory/immediate comparison in the SIL format: `operator` is applied to the
+// value loaded via `load` from the bdaddr12only address $BD1 and the
+// immediate $I2.
+class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    SDPatternOperator load, Immediate imm>
+  : InstSIL<opcode,
+      (outs),
+      (ins bdaddr12only:$BD1, imm:$I2),
+      mnemonic#"\t$BD1, $I2",
+      [(operator (load bdaddr12only:$BD1), imm:$I2)]> {
+  let mayLoad = 1;
+  let isCompare = 1;
+}
+
+// Memory/immediate comparison in the SIY format: `operator` is applied to the
+// value loaded via `load` from $BD1 and the immediate $I2.  Default
+// addressing mode is bdaddr20only.
+class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    SDPatternOperator load, Immediate imm,
+    AddressingMode mode = bdaddr20only>
+  : InstSIY<opcode,
+      (outs),
+      (ins mode:$BD1, imm:$I2),
+      mnemonic#"\t$BD1, $I2",
+      [(operator (load mode:$BD1), imm:$I2)]> {
+  let mayLoad = 1;
+  let isCompare = 1;
+}
+
+// Defines a CompareSI instruction (DispSize "12", bdaddr12pair) and a
+// CompareSIY instruction (DispSize "20", bdaddr20pair, "y" mnemonic suffix,
+// "Y" name suffix) whose shared DispKey is the mnemonic.
+multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
+    SDPatternOperator operator, SDPatternOperator load,
+    Immediate imm> {
+  let DispKey = mnemonic in {
+    let DispSize = "12" in {
+      def "" : CompareSI<mnemonic, siOpcode, operator, load, imm,
+                         bdaddr12pair>;
+    }
+    let DispSize = "20" in {
+      def Y : CompareSIY<mnemonic#"y", siyOpcode, operator, load, imm,
+                         bdaddr20pair>;
+    }
+  }
+}
+
+// Vector comparison in the VRRa format.  Has no outputs; the pattern applies
+// `operator` to $V1 and $V2 (both type tr.vt).  M3 is fixed to `type`, M4 and
+// M5 to 0.
+class CompareVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr, bits<4> type>
+  : InstVRRa<opcode,
+      (outs),
+      (ins tr.op:$V1, tr.op:$V2),
+      mnemonic#"\t$V1, $V2",
+      [(operator (tr.vt tr.op:$V1), (tr.vt tr.op:$V2))]> {
+  let M3 = type;
+  let M4 = 0;
+  let M5 = 0;
+  let isCompare = 1;
+}
+
+// Pattern-less VRRa comparison on VR128 in which M3 is an explicit imm32zx4
+// operand; M4 and M5 are fixed at 0.
+class CompareVRRaGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRa<opcode,
+      (outs),
+      (ins VR128:$V1, VR128:$V2, imm32zx4:$M3),
+      mnemonic#"\t$V1, $V2, $M3",
+      []> {
+  let M4 = 0;
+  let M5 = 0;
+  let isCompare = 1;
+}
+
+// Pattern-less VRRa comparison on VR64 in which M3 and M4 are explicit
+// imm32zx4 operands; M5 is fixed at 0.
+class CompareVRRaFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRa<opcode,
+      (outs),
+      (ins VR64:$V1, VR64:$V2, imm32zx4:$M3, imm32zx4:$M4),
+      mnemonic#"\t$V1, $V2, $M3, $M4",
+      []> {
+  let M5 = 0;
+  let isCompare = 1;
+}
+
+// Test operation in the RXE format.  Has no outputs; the pattern applies
+// `operator` to $R1 and the bdxaddr12only operand $XBD2 directly (no load
+// fragment is involved).  M3 is fixed at 0.
+class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    RegisterOperand cls>
+  : InstRXE<opcode,
+      (outs),
+      (ins cls:$R1, bdxaddr12only:$XBD2),
+      mnemonic#"\t$R1, $XBD2",
+      [(operator cls:$R1, bdxaddr12only:$XBD2)]> {
+  let M3 = 0;
+}
+
+// Pattern-less instruction in the RRFc format with no outputs: takes
+// registers $R1 and $R2 and the immediate $M3.
+class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode,
+    RegisterOperand cls1, RegisterOperand cls2, Immediate imm>
+  : InstRRFc<opcode,
+      (outs),
+      (ins cls1:$R1, cls2:$R2, imm:$M3),
+      mnemonic#"\t$R1, $R2, $M3",
+      []>;
+
+// Pattern-less instruction in the SSF format with no outputs: takes two
+// bdaddr12only operands, $BD1 and $BD2, and the register $R3.
+class SideEffectTernarySSF<string mnemonic, bits<12> opcode,
+    RegisterOperand cls>
+  : InstSSF<opcode,
+      (outs),
+      (ins bdaddr12only:$BD1, bdaddr12only:$BD2, cls:$R3),
+      mnemonic#"\t$BD1, $BD2, $R3",
+      []>;
+
+// Pattern-less operation in the RRFe format: $R1 is produced from the
+// imm32zx4 operand $M3, the register $R2 and the imm32zx4 operand $M4.
+class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+    RegisterOperand cls2>
+  : InstRRFe<opcode,
+      (outs cls1:$R1),
+      (ins imm32zx4:$M3, cls2:$R2, imm32zx4:$M4),
+      mnemonic#"\t$R1, $M3, $R2, $M4",
+      []>;
+
+// Three-input register operation in the RRD format.  $R1src is tied to $R1
+// and left out of the encoding; the pattern applies `operator` to $R1src,
+// $R3 and $R2.  OpType is "reg".
+class TernaryRRD<string mnemonic, bits<16> opcode,
+    SDPatternOperator operator, RegisterOperand cls>
+  : InstRRD<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, cls:$R3, cls:$R2),
+      mnemonic#"\t$R1, $R3, $R2",
+      [(set cls:$R1, (operator cls:$R1src, cls:$R3, cls:$R2))]> {
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+  let OpType = "reg";
+  let OpKey = mnemonic#cls;
+}
+
+// Pattern-less operation in the RSb format taking $R1src (tied to $R1), the
+// imm32zx4 operand $M3 and the address $BD2 (default mode bdaddr12only).
+// Marked mayLoad with `bytes` recorded in AccessBytes.
+class TernaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+    bits<5> bytes, AddressingMode mode = bdaddr12only>
+  : InstRSb<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, imm32zx4:$M3, mode:$BD2),
+      mnemonic#"\t$R1, $M3, $BD2",
+      []> {
+  let AccessBytes = bytes;
+  let mayLoad = 1;
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+}
+
+// Pattern-less operation in the RSYb format taking $R1src (tied to $R1), the
+// imm32zx4 operand $M3 and the address $BD2 (default mode bdaddr20only).
+// Marked mayLoad with `bytes` recorded in AccessBytes.
+class TernaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+    bits<5> bytes, AddressingMode mode = bdaddr20only>
+  : InstRSYb<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, imm32zx4:$M3, mode:$BD2),
+      mnemonic#"\t$R1, $M3, $BD2",
+      []> {
+  let AccessBytes = bytes;
+  let mayLoad = 1;
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+}
+
+// Defines a TernaryRS instruction (DispSize "12", bdaddr12pair) and a
+// TernaryRSY instruction (DispSize "20", bdaddr20pair, "y" mnemonic suffix,
+// "Y" name suffix) that share one DispKey.
+multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+    RegisterOperand cls, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in {
+      def "" : TernaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+    }
+    let DispSize = "20" in {
+      def Y : TernaryRSY<mnemonic#"y", rsyOpcode, cls, bytes, bdaddr20pair>;
+    }
+  }
+}
+
+// Three-input register/memory operation in the RXF format.  $R1src is tied to
+// $R1; the pattern applies `operator` to $R1src, $R3 and the value loaded via
+// `load` from the bdxaddr12only address $XBD2.  `bytes` is recorded in
+// AccessBytes.
+class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXF<opcode,
+      (outs cls:$R1),
+      (ins cls:$R1src, cls:$R3, bdxaddr12only:$XBD2),
+      mnemonic#"\t$R1, $R3, $XBD2",
+      [(set cls:$R1,
+            (operator cls:$R1src, cls:$R3,
+                      (load bdxaddr12only:$XBD2)))]> {
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let DisableEncoding = "$R1src";
+  let Constraints = "$R1 = $R1src";
+  let OpType = "mem";
+  let OpKey = mnemonic#"r"#cls;
+}
+
+// Vector operation in the VRIa format.  $V1src (type tr2.vt) is tied to $V1
+// and left out of the encoding; the pattern applies `operator` to $V1src,
+// the immediate $I2 and the `index` immediate $M3.
+class TernaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, Immediate imm, Immediate index>
+  : InstVRIa<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V1src, imm:$I2, index:$M3),
+      mnemonic#"\t$V1, $I2, $M3",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V1src),
+                              imm:$I2, index:$M3)))]> {
+  let DisableEncoding = "$V1src";
+  let Constraints = "$V1 = $V1src";
+}
+
+// Vector operation in the VRId format: $V1 (type tr1.vt) is produced from the
+// vectors $V2 and $V3 (both type tr2.vt) and the imm32zx8 immediate $I4.
+// M5 is fixed to `type`.
+class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type>
+  : InstVRId<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, tr2.op:$V3, imm32zx8:$I4),
+      mnemonic#"\t$V1, $V2, $V3, $I4",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                              (tr2.vt tr2.op:$V3),
+                              imm32zx8:$I4)))]> {
+  let M5 = type;
+}
+
+// Vector operation in the VRRa format: $V1 (type tr1.vt) is produced from the
+// vector $V2 (type tr2.vt) and the imm32zx4 operands $M4 and $M5.  M3 is
+// fixed to `type`.  `m4or` is forwarded as an extra argument to InstVRRa.
+// NOTE(review): presumably m4or is OR-ed into the encoded M4 field; confirm
+// against the InstVRRa definition.
+class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or>
+  : InstVRRa<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, imm32zx4:$M4, imm32zx4:$M5),
+      mnemonic#"\t$V1, $V2, $M4, $M5",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                              imm32zx4:$M4, imm32zx4:$M5)))],
+      m4or> {
+  let M3 = type;
+}
+
+// Pattern-less VRRa form on VR128 in which M3, M4 and M5 are all explicit
+// imm32zx4 operands.
+class TernaryVRRaFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRa<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V2, imm32zx4:$M3, imm32zx4:$M4, imm32zx4:$M5),
+      mnemonic#"\t$V1, $V2, $M3, $M4, $M5",
+      []>;
+
+// Vector operation in the VRRb format: $V1 (type tr1.vt) is produced from the
+// vectors $V2 and $V3 (both type tr2.vt) and the operand $M5, whose class is
+// given by `m5mask`.  M4 is fixed to `type`.  `m5or` is forwarded as an extra
+// argument to InstVRRb.
+// NOTE(review): presumably m5or is OR-ed into the encoded M5 field; confirm
+// against the InstVRRb definition.
+class TernaryVRRb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type,
+    SDPatternOperator m5mask, bits<4> m5or>
+  : InstVRRb<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, tr2.op:$V3, m5mask:$M5),
+      mnemonic#"\t$V1, $V2, $V3, $M5",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                              (tr2.vt tr2.op:$V3),
+                              m5mask:$M5)))],
+      m5or> {
+  let M4 = type;
+}
+
+// Defines two TernaryVRRb instructions that differ in whether they set CC.
+// Both take $M5 as an imm32zx4even operand.  The plain form passes `modifier`
+// with its low bit cleared as m5or; the "S" form ("s" mnemonic suffix,
+// Defs = [CC]) passes the same value with the low bit set.
+// Each form also gets an InstAlias so that $M5 may be omitted in assembly,
+// in which case 0 is supplied.
+multiclass TernaryOptVRRbSPair<string mnemonic, bits<16> opcode,
+    SDPatternOperator operator, SDPatternOperator operator_cc,
+    TypedReg tr1, TypedReg tr2, bits<4> type,
+    bits<4> modifier = 0> {
+  def "" : TernaryVRRb<mnemonic, opcode, operator, tr1, tr2, type,
+                       imm32zx4even, !and (modifier, 14)>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3",
+                  (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
+                                            tr2.op:$V3, 0)>;
+  let Defs = [CC] in {
+    def S : TernaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+                        imm32zx4even, !add(!and (modifier, 14), 1)>;
+  }
+  def : InstAlias<mnemonic#"s\t$V1, $V2, $V3",
+                  (!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2,
+                                                tr2.op:$V3, 0)>;
+}
+
+// Pattern-less VRRb form on VR128 with explicit M4 and M5 operands, plus an
+// InstAlias that accepts the spelling without $M5 and supplies 0 for it.
+multiclass TernaryOptVRRbSPairGeneric<string mnemonic, bits<16> opcode> {
+  def "" : InstVRRb<opcode,
+             (outs VR128:$V1),
+             (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $V3, $M4, $M5",
+             []>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $M4",
+                  (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3,
+                                            imm32zx4:$M4, 0)>;
+}
+
+// Vector operation in the VRRc format: $V1 (type tr1.vt) is produced from the
+// vectors $V2 and $V3 (both type tr2.vt) and the imm32zx4 operand $M4.
+// M5 and M6 are fixed at 0.
+class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2>
+  : InstVRRc<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M4),
+      mnemonic#"\t$V1, $V2, $V3, $M4",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                              (tr2.vt tr2.op:$V3),
+                              imm32zx4:$M4)))]> {
+  let M6 = 0;
+  let M5 = 0;
+}
+
+// Vector operation in the VRRd format: $V1 (type tr1.vt) is produced from
+// $V2 and $V3 (type tr2.vt) and $V4 (type tr1.vt).  M5 is fixed to `type`
+// and M6 to 0.
+class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type = 0>
+  : InstVRRd<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
+      mnemonic#"\t$V1, $V2, $V3, $V4",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                              (tr2.vt tr2.op:$V3),
+                              (tr1.vt tr1.op:$V4))))]> {
+  let M6 = 0;
+  let M5 = type;
+}
+
+// Pattern-less VRRd form on VR128 in which M5 is an explicit imm32zx4
+// operand; M6 is fixed at 0.
+class TernaryVRRdGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRd<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5),
+      mnemonic#"\t$V1, $V2, $V3, $V4, $M5",
+      []> {
+  let M6 = 0;
+}
+
+// Vector operation in the VRRe format: $V1 (type tr1.vt) is produced from
+// $V2 and $V3 (type tr2.vt) and $V4 (type tr1.vt).  M5 is fixed to `m5` and
+// M6 to `type`.
+class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> m5 = 0, bits<4> type = 0>
+  : InstVRRe<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
+      mnemonic#"\t$V1, $V2, $V3, $V4",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                              (tr2.vt tr2.op:$V3),
+                              (tr1.vt tr1.op:$V4))))]> {
+  let M6 = type;
+  let M5 = m5;
+}
+
+// Pattern-less VRRe form on VR128 in which both M5 and M6 are explicit
+// imm32zx4 operands.
+class TernaryVRReFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRe<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V2, VR128:$V3, VR128:$V4,
+           imm32zx4:$M5, imm32zx4:$M6),
+      mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6",
+      []>;
+
+// Vector operation in the VRSb format.  $V1src (type tr2.vt) is tied to $V1
+// and left out of the encoding; the pattern applies `operator` to $V1src,
+// the register $R3 and the shift12only operand $BD2.  M4 is fixed to `type`.
+class TernaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, RegisterOperand cls, bits<4> type>
+  : InstVRSb<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V1src, cls:$R3, shift12only:$BD2),
+      mnemonic#"\t$V1, $R3, $BD2",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V1src),
+                              cls:$R3,
+                              shift12only:$BD2)))]> {
+  let M4 = type;
+  let DisableEncoding = "$V1src";
+  let Constraints = "$V1 = $V1src";
+}
+
+// Pattern-less VRSb form on VR128 with a GR64 $R3, in which M4 is an explicit
+// imm32zx4 operand.  $V1src is tied to $V1 and left out of the encoding.
+class TernaryVRSbGeneric<string mnemonic, bits<16> opcode>
+  : InstVRSb<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V1src, GR64:$R3, shift12only:$BD2, imm32zx4:$M4),
+      mnemonic#"\t$V1, $R3, $BD2, $M4",
+      []> {
+  let DisableEncoding = "$V1src";
+  let Constraints = "$V1 = $V1src";
+}
+
+// Pattern-less load-style operation in the VRV format on VR128.  $V1src is
+// tied to $V1 and left out of the encoding; the other inputs are the
+// bdvaddr12only address $VBD2 and the `index` immediate $M3.  Marked mayLoad
+// with `bytes` recorded in AccessBytes.
+class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
+    Immediate index>
+  : InstVRV<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V1src, bdvaddr12only:$VBD2, index:$M3),
+      mnemonic#"\t$V1, $VBD2, $M3",
+      []> {
+  let AccessBytes = bytes;
+  let mayLoad = 1;
+  let DisableEncoding = "$V1src";
+  let Constraints = "$V1 = $V1src";
+}
+
+// Load-style operation in the VRX format.  $V1src (type tr2.vt) is tied to
+// $V1 and left out of the encoding; the pattern applies `operator` to
+// $V1src, the bdxaddr12only address $XBD2 and the `index` immediate $M3.
+// Marked mayLoad with `bytes` recorded in AccessBytes.
+class TernaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<5> bytes, Immediate index>
+  : InstVRX<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V1src, bdxaddr12only:$XBD2, index:$M3),
+      mnemonic#"\t$V1, $XBD2, $M3",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V1src),
+                              bdxaddr12only:$XBD2,
+                              index:$M3)))]> {
+  let AccessBytes = bytes;
+  let mayLoad = 1;
+  let DisableEncoding = "$V1src";
+  let Constraints = "$V1 = $V1src";
+}
+
+// Vector operation in the VRId format with four inputs.  $V1src is tied to
+// $V1 and left out of the encoding; the pattern applies `operator` to
+// $V1src, $V2, $V3 (all type tr2.vt) and the imm32zx8 immediate $I4.
+// M5 is fixed to `type`.
+class QuaternaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+    TypedReg tr1, TypedReg tr2, bits<4> type>
+  : InstVRId<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V1src, tr2.op:$V2, tr2.op:$V3, imm32zx8:$I4),
+      mnemonic#"\t$V1, $V2, $V3, $I4",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V1src),
+                              (tr2.vt tr2.op:$V2),
+                              (tr2.vt tr2.op:$V3),
+                              imm32zx8:$I4)))]> {
+  let M5 = type;
+  let DisableEncoding = "$V1src";
+  let Constraints = "$V1 = $V1src";
+}
+
+// Pattern-less VRId form on VR128 in which M5 is an explicit imm32zx4
+// operand.  $V1src is tied to $V1 and left out of the encoding.
+class QuaternaryVRIdGeneric<string mnemonic, bits<16> opcode>
+  : InstVRId<opcode,
+      (outs VR128:$V1),
+      (ins VR128:$V1src, VR128:$V2, VR128:$V3,
+           imm32zx8:$I4, imm32zx4:$M5),
+      mnemonic#"\t$V1, $V2, $V3, $I4, $M5",
+      []> {
+  let DisableEncoding = "$V1src";
+  let Constraints = "$V1 = $V1src";
+}
+
+// Vector operation in the VRRd format: $V1 (type tr1.vt) is produced from
+// $V2, $V3 and $V4 (all type tr2.vt) and the operand $M6, whose class is
+// given by `m6mask`.  M5 is fixed to `type`.  `m6or` is forwarded as an
+// extra argument to InstVRRd.
+// NOTE(review): presumably m6or is OR-ed into the encoded M6 field; confirm
+// against the InstVRRd definition.
+class QuaternaryVRRd<string mnemonic, bits<16> opcode,
+    SDPatternOperator operator, TypedReg tr1, TypedReg tr2,
+    bits<4> type, SDPatternOperator m6mask, bits<4> m6or>
+  : InstVRRd<opcode,
+      (outs tr1.op:$V1),
+      (ins tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, m6mask:$M6),
+      mnemonic#"\t$V1, $V2, $V3, $V4, $M6",
+      [(set tr1.op:$V1,
+            (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                              (tr2.vt tr2.op:$V3),
+                              (tr2.vt tr2.op:$V4),
+                              m6mask:$M6)))],
+      m6or> {
+  let M5 = type;
+}
+
+// Defines two QuaternaryVRRd instructions that differ in whether they set
+// CC.  Both take $M6 as an imm32zx4even operand.  The plain form passes
+// `modifier` with its low bit cleared as m6or; the "S" form ("s" mnemonic
+// suffix, Defs = [CC]) passes the same value with the low bit set.
+// Each form also gets an InstAlias so that $M6 may be omitted in assembly,
+// in which case 0 is supplied.
+multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode,
+    SDPatternOperator operator, SDPatternOperator operator_cc,
+    TypedReg tr1, TypedReg tr2, bits<4> type,
+    bits<4> modifier = 0> {
+  def "" : QuaternaryVRRd<mnemonic, opcode, operator, tr1, tr2, type,
+                          imm32zx4even, !and (modifier, 14)>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4",
+                  (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
+                                            tr2.op:$V3, tr2.op:$V4, 0)>;
+  let Defs = [CC] in {
+    def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+                           imm32zx4even, !add (!and (modifier, 14), 1)>;
+  }
+  def : InstAlias<mnemonic#"s\t$V1, $V2, $V3, $V4",
+                  (!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2,
+                                                tr2.op:$V3, tr2.op:$V4, 0)>;
+}
+
+multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> {
+ def "" : InstVRRd<opcode, (outs VR128:$V1),
+ (ins VR128:$V2, VR128:$V3, VR128:$V4,
+ imm32zx4:$M5, imm32zx4:$M6),
+ mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>;
+ def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4, $M5",
+ (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3,
+ VR128:$V4, imm32zx4:$M5, 0)>;
+}
+
+class SideEffectQuaternarySSe<string mnemonic, bits<8> opcode,
+ RegisterOperand cls>
+ : InstSSe<opcode, (outs),
+ (ins cls:$R1, bdaddr12only:$BD2, cls:$R3, bdaddr12only:$BD4),
+ mnemonic#"\t$R1, $BD2, $R3, $BD4", []>;
+
+class LoadAndOpRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdaddr20only>
+ : InstRSYa<opcode, (outs cls:$R1), (ins cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2",
+ [(set cls:$R1, (operator mode:$BD2, cls:$R3))]> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdaddr12only>
+ : InstRSa<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2",
+ [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+class CmpSwapRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdaddr20only>
+ : InstRSYa<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2",
+ [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+ SDPatternOperator operator, RegisterOperand cls> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : CmpSwapRS<mnemonic, rsOpcode, operator, cls, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : CmpSwapRSY<mnemonic#"y", rsyOpcode, operator, cls, bdaddr20pair>;
+ }
+}
+
+class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2>
+ : InstRIEf<opcode, (outs cls1:$R1),
+ (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+ imm32zx6:$I5),
+ mnemonic#"\t$R1, $R2, $I3, $I4, $I5", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator>
+ : InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2),
+ mnemonic##"\t$M1, $XBD2",
+ [(operator imm32zx4:$M1, bdxaddr20only:$XBD2)]>;
+
+class PrefetchRILPC<string mnemonic, bits<12> opcode,
+ SDPatternOperator operator>
+ : InstRILc<opcode, (outs), (ins imm32zx4:$M1, pcrel32:$RI2),
+ mnemonic##"\t$M1, $RI2",
+ [(operator imm32zx4:$M1, pcrel32:$RI2)]> {
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
+class BranchPreloadSMI<string mnemonic, bits<8> opcode>
+ : InstSMI<opcode, (outs),
+ (ins imm32zx4:$M1, brtarget16bpp:$RI2, bdxaddr12only:$BD3),
+ mnemonic#"\t$M1, $RI2, $BD3", []>;
+
+class BranchPreloadMII<string mnemonic, bits<8> opcode>
+ : InstMII<opcode, (outs),
+ (ins imm32zx4:$M1, brtarget12bpp:$RI2, brtarget24bpp:$RI3),
+ mnemonic#"\t$M1, $RI2, $RI3", []>;
+
+// A floating-point load-and-test operation. Create both a normal unary
+// operation and one that acts as a comparison against zero.
+// Note that the comparison against zero operation is not available if we
+// have vector support, since load-and-test instructions will partially
+// clobber the target (vector) register.
+multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls> {
+ def "" : UnaryRRE<mnemonic, opcode, null_frag, cls, cls>;
+ let isCodeGenOnly = 1, Predicates = [FeatureNoVector] in
+ def Compare : CompareRRE<mnemonic, opcode, null_frag, cls, cls>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+//
+// Convenience instructions that get lowered to real instructions
+// by either SystemZTargetLowering::EmitInstrWithCustomInserter()
+// or SystemZInstrInfo::expandPostRAPseudo().
+//
+//===----------------------------------------------------------------------===//
+
+class Pseudo<dag outs, dag ins, list<dag> pattern>
+ : InstSystemZ<0, outs, ins, "", pattern> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+// Like SideEffectBinarySIL, but expanded later.
+class SideEffectBinarySILPseudo<SDPatternOperator operator, Immediate imm>
+ : Pseudo<(outs), (ins bdaddr12only:$BD1, imm:$I2),
+ [(operator bdaddr12only:$BD1, imm:$I2)]>;
+
+// Like UnaryRI, but expanded after RA depending on the choice of register.
+class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Pseudo<(outs cls:$R1), (ins imm:$I2),
+ [(set cls:$R1, (operator imm:$I2))]>;
+
+// Like UnaryRXY, but expanded after RA depending on the choice of register.
+class UnaryRXYPseudo<string key, SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdxaddr20only>
+ : Pseudo<(outs cls:$R1), (ins mode:$XBD2),
+ [(set cls:$R1, (operator mode:$XBD2))]> {
+ let OpKey = key#"r"#cls;
+ let OpType = "mem";
+ let mayLoad = 1;
+ let Has20BitOffset = 1;
+ let HasIndex = 1;
+ let AccessBytes = bytes;
+}
+
+// Like UnaryRR, but expanded after RA depending on the choice of registers.
+class UnaryRRPseudo<string key, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : Pseudo<(outs cls1:$R1), (ins cls2:$R2),
+ [(set cls1:$R1, (operator cls2:$R2))]> {
+ let OpKey = key#cls1;
+ let OpType = "reg";
+}
+
+// Like BinaryRI, but expanded after RA depending on the choice of register.
+class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2),
+ [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+ let Constraints = "$R1 = $R1src";
+}
+
+// Like BinaryRIE, but expanded after RA depending on the choice of register.
+class BinaryRIEPseudo<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Pseudo<(outs cls:$R1), (ins cls:$R3, imm:$I2),
+ [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+// Like BinaryRIAndK, but expanded after RA depending on the choice of register.
+multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm> {
+ let NumOpsKey = key in {
+ let NumOpsValue = "3" in
+ def K : BinaryRIEPseudo<null_frag, cls, imm>,
+ Requires<[FeatureHighWord, FeatureDistinctOps]>;
+ let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ def "" : BinaryRIPseudo<operator, cls, imm>,
+ Requires<[FeatureHighWord]>;
+ }
+}
+
+// Like CompareRI, but expanded after RA depending on the choice of register.
+class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+ let isCompare = 1;
+}
+
+// Like CompareRXY, but expanded after RA depending on the choice of register.
+class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+ SDPatternOperator load, bits<5> bytes,
+ AddressingMode mode = bdxaddr20only>
+ : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+ [(operator cls:$R1, (load mode:$XBD2))]> {
+ let mayLoad = 1;
+ let Has20BitOffset = 1;
+ let HasIndex = 1;
+ let AccessBytes = bytes;
+}
+
+// Like CondBinaryRRF, but expanded after RA depending on the choice of
+// register.
+class CondBinaryRRFPseudo<RegisterOperand cls1, RegisterOperand cls2>
+ : Pseudo<(outs cls1:$R1),
+ (ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3), []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let CCMaskLast = 1;
+}
+
+// Like CondBinaryRIE, but expanded after RA depending on the choice of
+// register.
+class CondBinaryRIEPseudo<RegisterOperand cls, Immediate imm>
+ : Pseudo<(outs cls:$R1),
+ (ins cls:$R1src, imm:$I2, cond4:$valid, cond4:$M3),
+ [(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src,
+ cond4:$valid, cond4:$M3))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let CCMaskLast = 1;
+}
+
+// Like CondUnaryRSY, but expanded after RA depending on the choice of
+// register.
+class CondUnaryRSYPseudo<SDPatternOperator operator, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : Pseudo<(outs cls:$R1),
+ (ins cls:$R1src, mode:$BD2, cond4:$valid, cond4:$R3),
+ [(set cls:$R1,
+ (z_select_ccmask (operator mode:$BD2), cls:$R1src,
+ cond4:$valid, cond4:$R3))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+ let CCMaskLast = 1;
+}
+
+// Like CondStoreRSY, but expanded after RA depending on the choice of
+// register.
+class CondStoreRSYPseudo<RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only>
+ : Pseudo<(outs), (ins cls:$R1, mode:$BD2, cond4:$valid, cond4:$R3), []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+ let CCMaskLast = 1;
+}
+
+// Like StoreRXY, but expanded after RA depending on the choice of register.
+class StoreRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdxaddr20only>
+ : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+ [(operator cls:$R1, mode:$XBD2)]> {
+ let mayStore = 1;
+ let Has20BitOffset = 1;
+ let HasIndex = 1;
+ let AccessBytes = bytes;
+}
+
+// Like RotateSelectRIEf, but expanded after RA depending on the choice
+// of registers.
+class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
+ : Pseudo<(outs cls1:$R1),
+ (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+ imm32zx6:$I5),
+ []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+// Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
+// the value of the PSW's 2-bit condition code field.
+class SelectWrapper<RegisterOperand cls>
+ : Pseudo<(outs cls:$dst),
+ (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc),
+ [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2,
+ imm32zx4:$valid, imm32zx4:$cc))]> {
+ let usesCustomInserter = 1;
+ // Although the instructions used by these nodes do not in themselves
+ // change CC, the insertion requires new blocks, and CC cannot be live
+ // across them.
+ let Defs = [CC];
+ let Uses = [CC];
+}
+
+// Stores $new to $addr if $cc is true ("" case) or false (Inv case).
+multiclass CondStores<RegisterOperand cls, SDPatternOperator store,
+ SDPatternOperator load, AddressingMode mode> {
+ let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in {
+ def "" : Pseudo<(outs),
+ (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
+ [(store (z_select_ccmask cls:$new, (load mode:$addr),
+ imm32zx4:$valid, imm32zx4:$cc),
+ mode:$addr)]>;
+ def Inv : Pseudo<(outs),
+ (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
+ [(store (z_select_ccmask (load mode:$addr), cls:$new,
+ imm32zx4:$valid, imm32zx4:$cc),
+ mode:$addr)]>;
+ }
+}
+
+// OPERATOR is ATOMIC_SWAP or an ATOMIC_LOAD_* operation. PAT and OPERAND
+// describe the second (non-memory) operand.
+class AtomicLoadBinary<SDPatternOperator operator, RegisterOperand cls,
+ dag pat, DAGOperand operand>
+ : Pseudo<(outs cls:$dst), (ins bdaddr20only:$ptr, operand:$src2),
+ [(set cls:$dst, (operator bdaddr20only:$ptr, pat))]> {
+ let Defs = [CC];
+ let Has20BitOffset = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
+// Specializations of AtomicLoadBinary.
+class AtomicLoadBinaryReg32<SDPatternOperator operator>
+ : AtomicLoadBinary<operator, GR32, (i32 GR32:$src2), GR32>;
+class AtomicLoadBinaryImm32<SDPatternOperator operator, Immediate imm>
+ : AtomicLoadBinary<operator, GR32, (i32 imm:$src2), imm>;
+class AtomicLoadBinaryReg64<SDPatternOperator operator>
+ : AtomicLoadBinary<operator, GR64, (i64 GR64:$src2), GR64>;
+class AtomicLoadBinaryImm64<SDPatternOperator operator, Immediate imm>
+ : AtomicLoadBinary<operator, GR64, (i64 imm:$src2), imm>;
+
+// OPERATOR is ATOMIC_SWAPW or an ATOMIC_LOADW_* operation. PAT and OPERAND
+// describe the second (non-memory) operand.
+class AtomicLoadWBinary<SDPatternOperator operator, dag pat,
+ DAGOperand operand>
+ : Pseudo<(outs GR32:$dst),
+ (ins bdaddr20only:$ptr, operand:$src2, ADDR32:$bitshift,
+ ADDR32:$negbitshift, uimm32:$bitsize),
+ [(set GR32:$dst, (operator bdaddr20only:$ptr, pat, ADDR32:$bitshift,
+ ADDR32:$negbitshift, uimm32:$bitsize))]> {
+ let Defs = [CC];
+ let Has20BitOffset = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
+// Specializations of AtomicLoadWBinary.
+class AtomicLoadWBinaryReg<SDPatternOperator operator>
+ : AtomicLoadWBinary<operator, (i32 GR32:$src2), GR32>;
+class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
+ : AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
+
+// Define an instruction that operates on two fixed-length blocks of memory,
+// and associated pseudo instructions for operating on blocks of any size.
+// The Sequence form uses a straight-line sequence of instructions and
+// the Loop form uses a loop of length-256 instructions followed by
+// another instruction to handle the excess.
+multiclass MemorySS<string mnemonic, bits<8> opcode,
+ SDPatternOperator sequence, SDPatternOperator loop> {
+ def "" : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
+ bdaddr12only:$BD2),
+ mnemonic##"\t$BDL1, $BD2", []>;
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+ def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length),
+ [(sequence bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length)]>;
+ def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256),
+ [(loop bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256)]>;
+ }
+}
+
+// Define an instruction that operates on two strings, both terminated
+// by the character in R0. The instruction processes a CPU-determined
+// number of bytes at a time and sets CC to 3 if the instruction needs
+// to be repeated. Also define a pseudo instruction that represents
+// the full loop (the main instruction plus the branch on CC==3).
+multiclass StringRRE<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator> {
+ def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
+ (ins GR64:$R1src, GR64:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Uses = [R0L];
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+ }
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
+ def Loop : Pseudo<(outs GR64:$end),
+ (ins GR64:$start1, GR64:$start2, GR32:$char),
+ [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
+ GR32:$char))]>;
+}
+
+// A pseudo instruction that is a direct alias of a real instruction.
+// These aliases are used in cases where a particular register operand is
+// fixed or where the same instruction is used with different register sizes.
+// The size parameter is the size in bytes of the associated real instruction.
+class Alias<int size, dag outs, dag ins, list<dag> pattern>
+ : InstSystemZ<size, outs, ins, "", pattern> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
+ : Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>;
+
+// An alias of a UnaryVRR*, but with different register sizes.
+class UnaryAliasVRR<SDPatternOperator operator, TypedReg tr1, TypedReg tr2>
+ : Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2),
+ [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>;
+
+// An alias of a UnaryVRX, but with different register sizes.
+class UnaryAliasVRX<SDPatternOperator operator, TypedReg tr,
+ AddressingMode mode = bdxaddr12only>
+ : Alias<6, (outs tr.op:$V1), (ins mode:$XBD2),
+ [(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>;
+
+// An alias of a StoreVRX, but with different register sizes.
+class StoreAliasVRX<SDPatternOperator operator, TypedReg tr,
+ AddressingMode mode = bdxaddr12only>
+ : Alias<6, (outs), (ins tr.op:$V1, mode:$XBD2),
+ [(operator (tr.vt tr.op:$V1), mode:$XBD2)]>;
+
+// An alias of a BinaryRI, but with different register sizes.
+class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Alias<4, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+ [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+ let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a BinaryRIL, but with different register sizes.
+class BinaryAliasRIL<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Alias<6, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+ [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+ let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a BinaryVRRf, but with different register sizes.
+class BinaryAliasVRRf<RegisterOperand cls>
+ : Alias<6, (outs VR128:$V1), (ins cls:$R2, cls:$R3), []>;
+
+// An alias of a CompareRI, but with different register sizes.
+class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+ let isCompare = 1;
+}
+
+// An alias of a RotateSelectRIEf, but with different register sizes.
+class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
+ : Alias<6, (outs cls1:$R1),
+ (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+ imm32zx6:$I5), []> {
+ let Constraints = "$R1 = $R1src";
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
new file mode 100644
index 000000000000..3565d5f2c49c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -0,0 +1,1752 @@
+//===-- SystemZInstrInfo.cpp - SystemZ instruction information ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstrInfo.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#include "SystemZGenInstrInfo.inc"
+
+// Return a mask with the low Count bits set; valid for Count in [0, 64].
+static uint64_t allOnes(unsigned int Count) {
+ return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
+}
+
+// Classify the 32-bit GPR Reg: true for a high register (GRH32), false for a
+// low one (GR32). Any other register trips the assertion.
+static bool isHighReg(unsigned int Reg) {
+ const bool InHighClass = SystemZ::GRH32BitRegClass.contains(Reg);
+ assert((InHighClass || SystemZ::GR32BitRegClass.contains(Reg)) &&
+ "Invalid GRX32");
+ return InHighClass;
+}
+
+// Pin the vtable to this file.
+void SystemZInstrInfo::anchor() {}
+
+SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti)
+ : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
+ RI(), STI(sti) {
+}
+
+// MI is a 128-bit load or store. Split it into two 64-bit loads or stores,
+// each having the opcode given by NewOpcode.
+void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
+ unsigned NewOpcode) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+
+ // Get two load or store instructions. Use the original instruction for one
+ // of them (arbitrarily the second here) and create a clone for the other.
+ MachineInstr *EarlierMI = MF.CloneMachineInstr(&*MI);
+ MBB->insert(MI, EarlierMI);
+
+ // Set up the two 64-bit registers.
+ MachineOperand &HighRegOp = EarlierMI->getOperand(0);
+ MachineOperand &LowRegOp = MI->getOperand(0);
+ HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
+ LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
+
+ // The address in the first (high) instruction is already correct.
+ // Adjust the offset in the second (low) instruction.
+ MachineOperand &HighOffsetOp = EarlierMI->getOperand(2);
+ MachineOperand &LowOffsetOp = MI->getOperand(2);
+ LowOffsetOp.setImm(LowOffsetOp.getImm() + 8);
+
+ // Clear the kill flags for the base and index registers in the first
+ // instruction.
+ EarlierMI->getOperand(1).setIsKill(false);
+ EarlierMI->getOperand(3).setIsKill(false);
+
+ // Set the opcodes.
+ unsigned HighOpcode = getOpcodeForOffset(NewOpcode, HighOffsetOp.getImm());
+ unsigned LowOpcode = getOpcodeForOffset(NewOpcode, LowOffsetOp.getImm());
+ assert(HighOpcode && LowOpcode && "Both offsets should be in range");
+
+ EarlierMI->setDesc(get(HighOpcode));
+ MI->setDesc(get(LowOpcode));
+}
+
+// Rewrite ADJDYNALLOC MI in place using getOpcodeForOffset(LA, final offset).
+void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ MachineOperand &OffsetMO = MI->getOperand(2);
+
+ uint64_t Offset = (MFFrame.getMaxCallFrameSize() +
+ SystemZMC::CallFrameSize +
+ OffsetMO.getImm());
+ unsigned NewOpcode = getOpcodeForOffset(SystemZ::LA, Offset);
+ assert(NewOpcode && "No support for huge argument lists yet");
+ MI->setDesc(get(NewOpcode));
+ OffsetMO.setImm(Offset);
+}
+
+// MI is an RI-style pseudo instruction. Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32. ConvertHigh is true if LowOpcode takes a signed operand
+// and HighOpcode takes an unsigned 32-bit operand. In those cases,
+// MI has the same kind of operand as LowOpcode, so needs to be converted
+// if HighOpcode is used.
+void SystemZInstrInfo::expandRIPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned HighOpcode,
+ bool ConvertHigh) const {
+ unsigned Reg = MI.getOperand(0).getReg();
+ bool IsHigh = isHighReg(Reg);
+ MI.setDesc(get(IsHigh ? HighOpcode : LowOpcode));
+ if (IsHigh && ConvertHigh)
+ MI.getOperand(1).setImm(uint32_t(MI.getOperand(1).getImm()));
+}
+
+// MI is a three-operand RIE-style pseudo instruction. Replace it with
+// LowOpcodeK if the registers are both low GR32s, otherwise use a move
+// followed by HighOpcode or LowOpcode, depending on whether the target
+// is a high or low GR32.
+void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned LowOpcodeK,
+ unsigned HighOpcode) const {
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ bool DestIsHigh = isHighReg(DestReg);
+ bool SrcIsHigh = isHighReg(SrcReg);
+ if (!DestIsHigh && !SrcIsHigh)
+ MI.setDesc(get(LowOpcodeK));
+ else {
+ emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg,
+ SystemZ::LR, 32, MI.getOperand(1).isKill());
+ MI.setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
+ MI.getOperand(1).setReg(DestReg);
+ MI.tieOperands(0, 1);
+ }
+}
+
+// MI is an RXY-style pseudo instruction. Pick LowOpcode if the first operand
+// is a low GR32 and HighOpcode if it is a high GR32, then replace MI with the
+// opcode getOpcodeForOffset returns for that choice and operand 2's immediate.
+void SystemZInstrInfo::expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned HighOpcode) const {
+ unsigned Reg = MI.getOperand(0).getReg();
+ unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode,
+ MI.getOperand(2).getImm());
+ MI.setDesc(get(Opcode));
+}
+
+// MI is a load-on-condition pseudo instruction with a single register
+// (source or destination) operand. Replace it with LowOpcode if the
+// register is a low GR32 and HighOpcode if the register is a high GR32.
+void SystemZInstrInfo::expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned HighOpcode) const {
+ unsigned Reg = MI.getOperand(0).getReg();
+ unsigned Opcode = isHighReg(Reg) ? HighOpcode : LowOpcode;
+ MI.setDesc(get(Opcode));
+}
+
+// MI is a load-register-on-condition pseudo instruction. Replace it with
+// LowOpcode if source and destination are both low GR32s and HighOpcode if
+// both are high GR32s. A mixed high/low pair leaves MI unchanged here.
+void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned HighOpcode) const {
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(2).getReg();
+ bool DestIsHigh = isHighReg(DestReg);
+ bool SrcIsHigh = isHighReg(SrcReg);
+
+ if (!DestIsHigh && !SrcIsHigh)
+ MI.setDesc(get(LowOpcode));
+ else if (DestIsHigh && SrcIsHigh)
+ MI.setDesc(get(HighOpcode));
+
+ // If we were unable to implement the pseudo with a single instruction, we
+ // need to convert it back into a branch sequence. This cannot be done here
+ // since the caller of expandPostRAPseudo does not handle changes to the CFG
+ // correctly. This change is deferred to the SystemZExpandPseudo pass.
+}
+
+// MI is an RR-style pseudo instruction that zero-extends the low Size bits
+// of one GRX32 into another. Replace it with LowOpcode if both operands
+// are low registers, otherwise use RISB[LH]G.
+void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned Size) const {
+ emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(),
+ MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode,
+ Size, MI.getOperand(1).isKill());
+ MI.eraseFromParent();
+}
+
+void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+ const unsigned Reg = MI->getOperand(0).getReg();
+
+ // The first three instructions are clones of LOAD_STACK_GUARD and the last
+ // reuses MI itself, so all four already have operand 0 set to Reg.
+
+ // ear <reg>, %a0
+ MachineInstr *Ear1MI = MF.CloneMachineInstr(MI);
+ MBB->insert(MI, Ear1MI);
+ Ear1MI->setDesc(get(SystemZ::EAR));
+ MachineInstrBuilder(MF, Ear1MI).addReg(SystemZ::A0);
+
+ // sllg <reg>, <reg>, 32
+ MachineInstr *SllgMI = MF.CloneMachineInstr(MI);
+ MBB->insert(MI, SllgMI);
+ SllgMI->setDesc(get(SystemZ::SLLG));
+ MachineInstrBuilder(MF, SllgMI).addReg(Reg).addReg(0).addImm(32);
+
+ // ear <reg>, %a1
+ MachineInstr *Ear2MI = MF.CloneMachineInstr(MI);
+ MBB->insert(MI, Ear2MI);
+ Ear2MI->setDesc(get(SystemZ::EAR));
+ MachineInstrBuilder(MF, Ear2MI).addReg(SystemZ::A1);
+
+ // lg <reg>, 40(<reg>)
+ MI->setDesc(get(SystemZ::LG));
+ MachineInstrBuilder(MF, MI).addReg(Reg).addImm(40).addReg(0);
+}
+
+// Insert, before MBBI in MBB, a move from 32-bit GPR SrcReg to 32-bit GPR
+// DestReg that zero-extends the low Size bits of SrcReg (8 for LLCR, 16 for
+// LLHR and 32 for LR). When both registers are low, LowLowOpcode is emitted
+// directly; every other high/low combination uses the matching RISB[LH]G
+// form. KillSrc is true if this move is the last use of SrcReg.
+void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, unsigned LowLowOpcode,
+ unsigned Size, bool KillSrc) const {
+ const bool HighDest = isHighReg(DestReg);
+ const bool HighSrc = isHighReg(SrcReg);
+
+ // Low-to-low moves have a dedicated opcode and take only the source register.
+ if (!HighDest && !HighSrc) {
+ BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ // At least one register is high: choose the RISB variant for the pair.
+ unsigned RISBOpcode;
+ if (HighDest)
+ RISBOpcode = HighSrc ? SystemZ::RISBHH : SystemZ::RISBHL;
+ else
+ RISBOpcode = SystemZ::RISBLH;
+
+ // Rotate by 32 only when the value crosses between the high and low halves.
+ const unsigned RotateAmt = (HighDest == HighSrc) ? 0 : 32;
+ BuildMI(MBB, MBBI, DL, get(RISBOpcode), DestReg)
+ .addReg(DestReg, RegState::Undef)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(32 - Size).addImm(128 + 31).addImm(RotateAmt);
+}
+
+
+MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
+ if (NewMI)
+ return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
+ return MI;
+ };
+
+ switch (MI.getOpcode()) {
+ case SystemZ::LOCRMux:
+ case SystemZ::LOCFHR:
+ case SystemZ::LOCR:
+ case SystemZ::LOCGR: {
+ auto &WorkingMI = cloneIfNew(MI);
+ // Invert the condition (CCMask ^ CCValid), then defer to TargetInstrInfo.
+ unsigned CCValid = WorkingMI.getOperand(3).getImm();
+ unsigned CCMask = WorkingMI.getOperand(4).getImm();
+ WorkingMI.getOperand(4).setImm(CCMask ^ CCValid);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ default:
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ }
+}
+
+
+// If MI is a simple load or store for a frame object, return the register
+// it loads or stores and set FrameIndex to the index of the frame object.
+// Return 0, leaving FrameIndex untouched, otherwise.
+//
+// Flag is the TSFlags bit tested: SimpleBDXLoad (loads), SimpleBDXStore (stores).
+static int isSimpleMove(const MachineInstr &MI, int &FrameIndex,
+ unsigned Flag) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ if ((MCID.TSFlags & Flag) && MI.getOperand(1).isFI() &&
+ MI.getOperand(2).getImm() == 0 && MI.getOperand(3).getReg() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
+ }
+ return 0;
+}
+
+unsigned SystemZInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXLoad);
+}
+
+unsigned SystemZInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXStore);
+}
+
+// Return true if MI is an MVC of the form "MVC 0(Length,FI1),0(FI2)" whose
+// Length equals the size of both frame objects, i.e. a copy of one whole
+// stack slot to another. On success DestFrameIndex and SrcFrameIndex are set
+// to FI1 and FI2 respectively; on failure they are left untouched.
+bool SystemZInstrInfo::isStackSlotCopy(const MachineInstr &MI,
+ int &DestFrameIndex,
+ int &SrcFrameIndex) const {
+ // Check for MVC 0(Length,FI1),0(FI2)
+ const MachineFrameInfo &MFI = MI.getParent()->getParent()->getFrameInfo();
+ if (MI.getOpcode() != SystemZ::MVC || !MI.getOperand(0).isFI() ||
+ MI.getOperand(1).getImm() != 0 || !MI.getOperand(3).isFI() ||
+ MI.getOperand(4).getImm() != 0)
+ return false;
+
+ // Check that Length covers the full slots. Frame indices are signed, so
+ // keep them as int instead of round-tripping them through unsigned.
+ int64_t Length = MI.getOperand(2).getImm();
+ int FI1 = MI.getOperand(0).getIndex();
+ int FI2 = MI.getOperand(3).getIndex();
+ if (MFI.getObjectSize(FI1) != Length ||
+ MFI.getObjectSize(FI2) != Length)
+ return false;
+
+ DestFrameIndex = FI1;
+ SrcFrameIndex = FI2;
+ return true;
+}
+
+bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // Most of the code and comments here are boilerplate.
+
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator instruction, we're
+ // done.
+ if (!isUnpredicatedTerminator(*I))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled by this
+ // analysis.
+ if (!I->isBranch())
+ return true;
+
+ // Can't handle indirect branches.
+ SystemZII::Branch Branch(getBranchInfo(*I));
+ if (!Branch.Target->isMBB())
+ return true;
+
+ // Punt on compound branches.
+ if (Branch.Type != SystemZII::BranchNormal)
+ return true;
+
+ if (Branch.CCMask == SystemZ::CCMASK_ANY) {
+ // Handle unconditional branches.
+ if (!AllowModify) {
+ TBB = Branch.Target->getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (std::next(I) != MBB.end())
+ std::next(I)->eraseFromParent();
+
+ Cond.clear();
+ FBB = nullptr;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) {
+ TBB = nullptr;
+ I->eraseFromParent();
+ I = MBB.end();
+ continue;
+ }
+
+ // TBB is used to indicate the unconditional destination.
+ TBB = Branch.Target->getMBB();
+ continue;
+ }
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ // FIXME: add X86-style branch swap
+ FBB = TBB;
+ TBB = Branch.Target->getMBB();
+ Cond.push_back(MachineOperand::CreateImm(Branch.CCValid));
+ Cond.push_back(MachineOperand::CreateImm(Branch.CCMask));
+ continue;
+ }
+
+ // Handle subsequent conditional branches.
+ assert(Cond.size() == 2 && TBB && "Should have seen a conditional branch");
+
+ // Only handle the case where all conditional branches branch to the same
+ // destination.
+ if (TBB != Branch.Target->getMBB())
+ return true;
+
+ // If the conditions are the same, we can leave them alone.
+ unsigned OldCCValid = Cond[0].getImm();
+ unsigned OldCCMask = Cond[1].getImm();
+ if (OldCCValid == Branch.CCValid && OldCCMask == Branch.CCMask)
+ continue;
+
+ // FIXME: Try combining conditions like X86 does. Should be easy on Z!
+ return false;
+ }
+
+ return false;
+}
+
+unsigned SystemZInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                        int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  // Strip branches off the end of the block, skipping debug values, and
+  // report how many were erased.
+  unsigned NumRemoved = 0;
+  for (MachineBasicBlock::iterator It = MBB.end(); It != MBB.begin();) {
+    --It;
+    if (It->isDebugValue())
+      continue;
+    // Stop at the first instruction that is not a branch to a basic block.
+    if (!It->isBranch() || !getBranchInfo(*It).Target->isMBB())
+      break;
+    It->eraseFromParent();
+    // The erased iterator is no longer usable; rescan from the new end.
+    It = MBB.end();
+    ++NumRemoved;
+  }
+
+  return NumRemoved;
+}
+
+bool SystemZInstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid condition");
+  // Cond[0] and Cond[1] are two immediates; flip in Cond[1] every bit that
+  // is set in Cond[0].  Always returns false.
+  const int64_t ValidBits = Cond[0].getImm();
+  const int64_t MaskBits = Cond[1].getImm();
+  Cond[1].setImm(MaskBits ^ ValidBits);
+  return false;
+}
+
+// Append branch instructions to the end of MBB and return how many were
+// added.  With an empty Cond a single unconditional J to TBB is emitted
+// (FBB must be null).  Otherwise Cond must hold two immediates, CCValid and
+// CCMask, and a BRC to TBB is emitted, followed by a J to FBB when FBB is
+// non-null.  BytesAdded must be null.
+unsigned SystemZInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                        MachineBasicBlock *TBB,
+                                        MachineBasicBlock *FBB,
+                                        ArrayRef<MachineOperand> Cond,
+                                        const DebugLoc &DL,
+                                        int *BytesAdded) const {
+  // In this function we output 32-bit branches, which should always
+  // have enough range.  They can be shortened and relaxed by later code
+  // in the pipeline, if desired.
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  // The message now matches the check: a condition is two operands.
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "SystemZ branch conditions have two components!");
+  assert(!BytesAdded && "code size not handled");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  unsigned CCValid = Cond[0].getImm();
+  unsigned CCMask = Cond[1].getImm();
+  BuildMI(&MBB, DL, get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(TBB);
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+bool SystemZInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                      unsigned &SrcReg2, int &Mask,
+                                      int &Value) const {
+  assert(MI.isCompare() && "Caller should have checked for a comparison");
+
+  // Only a compare with exactly two explicit operands, a register followed
+  // by an immediate, is described; everything else is rejected with the
+  // outputs left untouched.
+  if (MI.getNumExplicitOperands() != 2)
+    return false;
+  const MachineOperand &RegOp = MI.getOperand(0);
+  const MachineOperand &ImmOp = MI.getOperand(1);
+  if (!RegOp.isReg() || !ImmOp.isImm())
+    return false;
+
+  SrcReg = RegOp.getReg();
+  SrcReg2 = 0;
+  Value = ImmOp.getImm();
+  Mask = ~0;
+  return true;
+}
+
+// Return the unique defining instruction of Reg as reported by MRI, or
+// null when Reg is a physical register.
+static MachineInstr *getDef(unsigned Reg,
+                            const MachineRegisterInfo *MRI) {
+  return TargetRegisterInfo::isPhysicalRegister(Reg)
+             ? nullptr
+             : MRI->getUniqueVRegDef(Reg);
+}
+
+// Return true if MI has opcode Opcode, a zero register in operand 2 and an
+// immediate equal to Imm in operand 3, i.e. it is a shift of type Opcode by
+// a constant Imm bits.
+static bool isShift(MachineInstr *MI, unsigned Opcode, int64_t Imm) {
+  if (MI->getOpcode() != Opcode)
+    return false;
+  if (MI->getOperand(2).getReg())
+    return false;
+  return MI->getOperand(3).getImm() == Imm;
+}
+
+// Erase MI from its parent block when the register in its operand 0 has no
+// non-debug uses according to MRI; otherwise leave it alone.
+static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
+  const unsigned DefinedReg = MI->getOperand(0).getReg();
+  if (!MRI->use_nodbg_empty(DefinedReg))
+    return;
+  MI->eraseFromParent();
+}
+
+// Compare tests SrcReg against zero.  If SrcReg is produced by the chain
+// IPM -> SRL by IPM_CC -> RLL by 31 (optionally followed by an LGFR), and
+// nothing between the IPM and Compare in the same block modifies CC, then
+// Compare is redundant: erase it, erase whichever parts of the chain have
+// become dead, and return true.  Return false without changing anything
+// otherwise.
+static bool removeIPMBasedCompare(MachineInstr &Compare, unsigned SrcReg,
+                                  const MachineRegisterInfo *MRI,
+                                  const TargetRegisterInfo *TRI) {
+  // Look through an optional LGFR to find the rotate.
+  MachineInstr *Rotate = getDef(SrcReg, MRI);
+  MachineInstr *Extend = nullptr;
+  if (Rotate && Rotate->getOpcode() == SystemZ::LGFR) {
+    Extend = Rotate;
+    Rotate = getDef(Extend->getOperand(1).getReg(), MRI);
+  }
+  if (!Rotate)
+    return false;
+  if (!isShift(Rotate, SystemZ::RLL, 31))
+    return false;
+
+  MachineInstr *Shift = getDef(Rotate->getOperand(1).getReg(), MRI);
+  if (!Shift)
+    return false;
+  if (!isShift(Shift, SystemZ::SRL, SystemZ::IPM_CC))
+    return false;
+
+  MachineInstr *Insert = getDef(Shift->getOperand(1).getReg(), MRI);
+  if (!Insert)
+    return false;
+  if (Insert->getOpcode() != SystemZ::IPM)
+    return false;
+
+  // Check that there are no assignments to CC between the IPM and Compare.
+  if (Insert->getParent() != Compare.getParent())
+    return false;
+  MachineBasicBlock::iterator Scan = Insert, Stop = Compare.getIterator();
+  ++Scan;
+  while (Scan != Stop) {
+    if (Scan->modifiesRegister(SystemZ::CC, TRI))
+      return false;
+    ++Scan;
+  }
+
+  // Remove the compare first, then clean up the chain from the user end
+  // back towards the IPM so that each link is tested after its user is gone.
+  Compare.eraseFromParent();
+  if (Extend)
+    eraseIfDead(Extend, MRI);
+  eraseIfDead(Rotate, MRI);
+  eraseIfDead(Shift, MRI);
+  eraseIfDead(Insert, MRI);
+
+  return true;
+}
+
+bool SystemZInstrInfo::optimizeCompareInstr(
+    MachineInstr &Compare, unsigned SrcReg, unsigned SrcReg2, int Mask,
+    int Value, const MachineRegisterInfo *MRI) const {
+  assert(!SrcReg2 && "Only optimizing constant comparisons so far");
+
+  // Only a comparison against zero that is not flagged IsLogical is a
+  // candidate; the actual removal is attempted by removeIPMBasedCompare.
+  if (Value != 0)
+    return false;
+  if (Compare.getDesc().TSFlags & SystemZII::IsLogical)
+    return false;
+  return removeIPMBasedCompare(Compare, SrcReg, MRI, &RI);
+}
+
+
+bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+                                       ArrayRef<MachineOperand> Pred,
+                                       unsigned TrueReg, unsigned FalseReg,
+                                       int &CondCycles, int &TrueCycles,
+                                       int &FalseCycles) const {
+  // Not all subtargets have LOCR instructions, and the predicate must have
+  // exactly two operands.
+  if (!STI.hasLoadStoreOnCond() || Pred.size() != 2)
+    return false;
+
+  // The two inputs need a common register class.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *CommonRC =
+      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!CommonRC)
+    return false;
+
+  // We have LOCR instructions for 32 and 64 bit general purpose registers;
+  // the GRX32 class additionally requires load/store-on-condition 2.
+  const bool FitsGRX32 = STI.hasLoadStoreOnCond2() &&
+                         SystemZ::GRX32BitRegClass.hasSubClassEq(CommonRC);
+  const bool FitsGR32 = SystemZ::GR32BitRegClass.hasSubClassEq(CommonRC);
+  const bool FitsGR64 = SystemZ::GR64BitRegClass.hasSubClassEq(CommonRC);
+  if (!FitsGRX32 && !FitsGR32 && !FitsGR64)
+    return false; // Can't do anything else.
+
+  // The cycle outputs are written only on success.
+  CondCycles = 2;
+  TrueCycles = 2;
+  FalseCycles = 2;
+  return true;
+}
+
+void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    const DebugLoc &DL, unsigned DstReg,
+                                    ArrayRef<MachineOperand> Pred,
+                                    unsigned TrueReg,
+                                    unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+
+  assert(Pred.size() == 2 && "Invalid condition");
+  const unsigned CCValid = Pred[0].getImm();
+  const unsigned CCMask = Pred[1].getImm();
+
+  // Choose the select opcode from the destination's register class.
+  const bool IsGRX32 = SystemZ::GRX32BitRegClass.hasSubClassEq(DstRC);
+  unsigned SelectOpc;
+  if (IsGRX32 && STI.hasLoadStoreOnCond2()) {
+    SelectOpc = SystemZ::LOCRMux;
+  } else if (IsGRX32) {
+    // Without load/store-on-condition 2 use LOCR and constrain the
+    // destination to GR32.
+    SelectOpc = SystemZ::LOCR;
+    MRI.constrainRegClass(DstReg, &SystemZ::GR32BitRegClass);
+  } else if (SystemZ::GR64BitRegClass.hasSubClassEq(DstRC)) {
+    SelectOpc = SystemZ::LOCGR;
+  } else {
+    llvm_unreachable("Invalid register class");
+  }
+
+  // Operand order is: destination, FalseReg, TrueReg, CCValid, CCMask.
+  BuildMI(MBB, I, DL, get(SelectOpc), DstReg)
+      .addReg(FalseReg)
+      .addReg(TrueReg)
+      .addImm(CCValid)
+      .addImm(CCMask);
+}
+
+bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+                                     unsigned Reg,
+                                     MachineRegisterInfo *MRI) const {
+  // DefMI must be one of LHIMux, LHI or LGHI and must define Reg.
+  switch (DefMI.getOpcode()) {
+  case SystemZ::LHIMux:
+  case SystemZ::LHI:
+  case SystemZ::LGHI:
+    break;
+  default:
+    return false;
+  }
+  if (DefMI.getOperand(0).getReg() != Reg)
+    return false;
+  const int32_t ImmVal = static_cast<int32_t>(DefMI.getOperand(1).getImm());
+
+  // Map the register form of the use onto its immediate form.
+  unsigned ImmFormOpc;
+  switch (UseMI.getOpcode()) {
+  case SystemZ::LOCRMux:
+    ImmFormOpc = SystemZ::LOCHIMux;
+    break;
+  case SystemZ::LOCGR:
+    ImmFormOpc = SystemZ::LOCGHI;
+    break;
+  default:
+    return false;
+  }
+  if (!STI.hasLoadStoreOnCond2())
+    return false;
+
+  // The immediate always ends up in operand 2.  If Reg is operand 1 instead,
+  // the instruction has to be commuted first.
+  const unsigned ImmIdx = 2;
+  bool NeedsCommute = false;
+  if (UseMI.getOperand(2).getReg() != Reg) {
+    if (UseMI.getOperand(1).getReg() != Reg)
+      return false;
+    NeedsCommute = true;
+  }
+  if (NeedsCommute && !commuteInstruction(UseMI, false, 1, ImmIdx))
+    return false;
+
+  // Decide whether DefMI dies before the use of Reg is rewritten away.
+  const bool DefBecomesDead = MRI->hasOneNonDBGUse(Reg);
+  UseMI.setDesc(get(ImmFormOpc));
+  UseMI.getOperand(ImmIdx).ChangeToImmediate(ImmVal);
+  if (DefBecomesDead)
+    DefMI.eraseFromParent();
+
+  return true;
+}
+
+bool SystemZInstrInfo::isPredicable(MachineInstr &MI) const {
+  // Exactly four opcodes are reported as predicable.
+  switch (MI.getOpcode()) {
+  case SystemZ::Return:
+  case SystemZ::Trap:
+  case SystemZ::CallJG:
+  case SystemZ::CallBR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Single-block profitability query.  Returns true only when the block costs
+// exactly one cycle (NumCycles == 1), and additionally refuses blocks that
+// have no successors, a branch probability below 1/8 and do not end in a
+// Trap.  ExtraPredCycles is not consulted.
+bool SystemZInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles, unsigned ExtraPredCycles,
+                    BranchProbability Probability) const {
+  // Avoid using conditional returns at the end of a loop (since then
+  // we'd need to emit an unconditional branch to the beginning anyway,
+  // making the loop body longer).  This doesn't apply for low-probability
+  // loops (eg. compare-and-swap retry), so just decide based on branch
+  // probability instead of looping structure.
+  // However, since Compare and Trap instructions cost the same as a regular
+  // Compare instruction, we should allow the if conversion to convert this
+  // into a Conditional Compare regardless of the branch probability.
+  //
+  // getLastNonDebugInstr() yields end() for a block without any non-debug
+  // instruction, so check before dereferencing; such a block is treated as
+  // not ending in a Trap.
+  MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr();
+  const bool EndsInTrap =
+      Last != MBB.end() && Last->getOpcode() == SystemZ::Trap;
+  if (!EndsInTrap &&
+      MBB.succ_empty() && Probability < BranchProbability(1, 8))
+    return false;
+  // For now only convert single instructions.
+  return NumCycles == 1;
+}
+
+bool SystemZInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumCyclesT, unsigned ExtraPredCyclesT,
+ MachineBasicBlock &FMBB,
+ unsigned NumCyclesF, unsigned ExtraPredCyclesF,
+ BranchProbability Probability) const {
+ /* For now avoid converting mutually-exclusive cases: this overload, which
+  * takes both a true block (TMBB) and a false block (FMBB), ignores all of
+  * its arguments and always reports the conversion as unprofitable. */
+ return false;
+}
+
+bool SystemZInstrInfo::
+isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                          BranchProbability Probability) const {
+  // Duplication is considered worthwhile only for a block that costs a
+  // single cycle; MBB and Probability are not consulted.
+  const bool IsSingleCycle = (NumCycles == 1);
+  return IsSingleCycle;
+}
+
+bool SystemZInstrInfo::PredicateInstruction(
+    MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
+  assert(Pred.size() == 2 && "Invalid condition");
+  const unsigned CCValid = Pred[0].getImm();
+  const unsigned CCMask = Pred[1].getImm();
+  assert(CCMask > 0 && CCMask < 15 && "Invalid predicate");
+
+  // Creates a builder that appends operands to MI.  It is invoked only in
+  // the cases that actually rewrite the instruction.
+  auto AppendTo = [&MI]() {
+    return MachineInstrBuilder(*MI.getParent()->getParent(), MI);
+  };
+
+  const unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case SystemZ::Trap:
+  case SystemZ::Return:
+    // Trap -> CondTrap, Return -> CondReturn; both simply gain the condition
+    // operands and an implicit use of CC.
+    MI.setDesc(get(Opcode == SystemZ::Trap ? SystemZ::CondTrap
+                                           : SystemZ::CondReturn));
+    AppendTo()
+        .addImm(CCValid)
+        .addImm(CCMask)
+        .addReg(SystemZ::CC, RegState::Implicit);
+    return true;
+
+  case SystemZ::CallJG: {
+    // CallJG -> CallBRCL.  The condition has to precede the existing
+    // operands, so save operand 0 and the register mask, strip both, and
+    // re-add them after the condition.
+    MachineOperand SavedOp0 = MI.getOperand(0);
+    const uint32_t *SavedMask = MI.getOperand(1).getRegMask();
+    MI.RemoveOperand(1);
+    MI.RemoveOperand(0);
+    MI.setDesc(get(SystemZ::CallBRCL));
+    AppendTo()
+        .addImm(CCValid)
+        .addImm(CCMask)
+        .addOperand(SavedOp0)
+        .addRegMask(SavedMask)
+        .addReg(SystemZ::CC, RegState::Implicit);
+    return true;
+  }
+
+  case SystemZ::CallBR: {
+    // CallBR -> CallBCR.  Same idea, but only the register mask is moved.
+    const uint32_t *SavedMask = MI.getOperand(0).getRegMask();
+    MI.RemoveOperand(0);
+    MI.setDesc(get(SystemZ::CallBCR));
+    AppendTo()
+        .addImm(CCValid)
+        .addImm(CCMask)
+        .addRegMask(SavedMask)
+        .addReg(SystemZ::CC, RegState::Implicit);
+    return true;
+  }
+
+  default:
+    // Any other opcode is left unchanged.
+    return false;
+  }
+}
+
+void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   const DebugLoc &DL, unsigned DestReg,
+                                   unsigned SrcReg, bool KillSrc) const {
+  // Split 128-bit GPR moves into two 64-bit moves, high half first.  This
+  // handles ADDR128 too.
+  if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
+    for (unsigned SubIdx : {SystemZ::subreg_h64, SystemZ::subreg_l64})
+      copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SubIdx),
+                  RI.getSubReg(SrcReg, SubIdx), KillSrc);
+    return;
+  }
+
+  // GRX32 moves are emitted by emitGRX32Move.
+  if (SystemZ::GRX32BitRegClass.contains(DestReg, SrcReg)) {
+    emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc);
+    return;
+  }
+
+  // Everything else needs only one instruction.  The register classes are
+  // tested in a fixed order and the first match wins.
+  auto PickOpcode = [&]() -> unsigned {
+    if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
+      return SystemZ::LGR;
+    if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
+      // For z13 we prefer LDR over LER to avoid partial register dependencies.
+      return STI.hasVector() ? SystemZ::LDR32 : SystemZ::LER;
+    if (SystemZ::FP64BitRegClass.contains(DestReg, SrcReg))
+      return SystemZ::LDR;
+    if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
+      return SystemZ::LXR;
+    if (SystemZ::VR32BitRegClass.contains(DestReg, SrcReg))
+      return SystemZ::VLR32;
+    if (SystemZ::VR64BitRegClass.contains(DestReg, SrcReg))
+      return SystemZ::VLR64;
+    if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg))
+      return SystemZ::VLR;
+    if (SystemZ::AR32BitRegClass.contains(DestReg, SrcReg))
+      return SystemZ::CPYA;
+    if (SystemZ::AR32BitRegClass.contains(DestReg) &&
+        SystemZ::GR32BitRegClass.contains(SrcReg))
+      return SystemZ::SAR;
+    if (SystemZ::GR32BitRegClass.contains(DestReg) &&
+        SystemZ::AR32BitRegClass.contains(SrcReg))
+      return SystemZ::EAR;
+    llvm_unreachable("Impossible reg-to-reg copy");
+  };
+  const unsigned CopyOpc = PickOpcode();
+
+  BuildMI(MBB, MBBI, DL, get(CopyOpc), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void SystemZInstrInfo::storeRegToStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+    bool isKill, int FrameIdx, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  // Borrow the debug location of the insertion point, if there is one.
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+
+  // Callers may expect a single instruction, so keep 128-bit moves
+  // together for now and lower them after register allocation.
+  unsigned LoadOpcode, StoreOpcode;
+  getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
+
+  // Emit "StoreOpcode SrcReg, <FrameIdx>".
+  MachineInstrBuilder Store = BuildMI(MBB, MBBI, DL, get(StoreOpcode))
+                                  .addReg(SrcReg, getKillRegState(isKill));
+  addFrameReference(Store, FrameIdx);
+}
+
+void SystemZInstrInfo::loadRegFromStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+    int FrameIdx, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  // Borrow the debug location of the insertion point, if there is one.
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+
+  // Callers may expect a single instruction, so keep 128-bit moves
+  // together for now and lower them after register allocation.
+  unsigned LoadOpcode, StoreOpcode;
+  getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
+
+  // Emit "DestReg = LoadOpcode <FrameIdx>".
+  MachineInstrBuilder Load = BuildMI(MBB, MBBI, DL, get(LoadOpcode), DestReg);
+  addFrameReference(Load, FrameIdx);
+}
+
+// Return true if MI carries the TSFlags bit(s) in Flag, its displacement
+// (operand 2) fits in 12 unsigned bits and it has no index register
+// (operand 3 is register 0).  Flag is SimpleBDXLoad for loads and
+// SimpleBDXStore for stores.
+static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) {
+  if (!(MI->getDesc().TSFlags & Flag))
+    return false;
+  if (!isUInt<12>(MI->getOperand(2).getImm()))
+    return false;
+  return MI->getOperand(3).getReg() == 0;
+}
+
+namespace {
+// Describes a logic-immediate operation by three sizes: the width of the
+// register operated on, the position of the immediate's least significant
+// bit within it, and the width of the immediate.  A default-constructed
+// LogicOp has all fields zero and converts to false.
+struct LogicOp {
+  unsigned RegSize = 0;
+  unsigned ImmLSB = 0;
+  unsigned ImmSize = 0;
+
+  LogicOp() = default;
+  LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
+      : RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
+
+  // True when RegSize is nonzero.
+  explicit operator bool() const { return RegSize != 0; }
+};
+} // end anonymous namespace
+
+// Map an AND-immediate opcode to its LogicOp description, written below as
+// {RegSize, ImmLSB, ImmSize}.  Any other opcode yields a default LogicOp,
+// which tests false.
+static LogicOp interpretAndImmediate(unsigned Opcode) {
+  switch (Opcode) {
+  // 16-bit immediates.
+  case SystemZ::NILMux:
+    return {32, 0, 16};
+  case SystemZ::NIHMux:
+    return {32, 16, 16};
+  case SystemZ::NILL64:
+    return {64, 0, 16};
+  case SystemZ::NILH64:
+    return {64, 16, 16};
+  case SystemZ::NIHL64:
+    return {64, 32, 16};
+  case SystemZ::NIHH64:
+    return {64, 48, 16};
+  // 32-bit immediates.
+  case SystemZ::NIFMux:
+    return {32, 0, 32};
+  case SystemZ::NILF64:
+    return {64, 0, 32};
+  case SystemZ::NIHF64:
+    return {64, 32, 32};
+  default:
+    return {};
+  }
+}
+
+// If OldMI's definition of CC is dead, mark NewMI's definition of CC (when
+// it has one) as dead too.
+static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
+  if (!OldMI->registerDefIsDead(SystemZ::CC))
+    return;
+  if (MachineOperand *NewCCDef = NewMI->findRegisterDefOperand(SystemZ::CC))
+    NewCCDef->setIsDead(true);
+}
+
+// Common tail of convertToThreeAddress, run once two-address instruction
+// OldMI has been replaced with three-address instruction NewMI.  When LV is
+// available, every register killed by an operand of OldMI other than
+// operand 0 is re-recorded as killed by NewMI.  A dead CC definition is
+// carried over as well.  Returns NewMI.
+static MachineInstr *finishConvertToThreeAddress(MachineInstr *OldMI,
+                                                 MachineInstr *NewMI,
+                                                 LiveVariables *LV) {
+  if (LV)
+    for (unsigned Idx = 1, Count = OldMI->getNumOperands(); Idx < Count;
+         ++Idx) {
+      MachineOperand &Operand = OldMI->getOperand(Idx);
+      if (!Operand.isReg() || !Operand.isKill())
+        continue;
+      LV->replaceKillInstruction(Operand.getReg(), *OldMI, *NewMI);
+    }
+
+  transferDeadCC(OldMI, NewMI);
+  return NewMI;
+}
+
+MachineInstr *SystemZInstrInfo::convertToThreeAddress(
+ MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned Opcode = MI.getOpcode();
+ unsigned NumOps = MI.getNumOperands();
+
+ // Try to convert something like SLL into SLLK, if supported.
+ // We prefer to keep the two-operand form where possible both
+ // because it tends to be shorter and because some instructions
+ // have memory forms that can be used during spilling.
+ if (STI.hasDistinctOps()) {
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
+ unsigned DestReg = Dest.getReg();
+ unsigned SrcReg = Src.getReg();
+ // AHIMux is only really a three-operand instruction when both operands
+ // are low registers. Try to constrain both operands to be low if
+ // possible.
+ if (Opcode == SystemZ::AHIMux &&
+ TargetRegisterInfo::isVirtualRegister(DestReg) &&
+ TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+ MRI.getRegClass(DestReg)->contains(SystemZ::R1L) &&
+ MRI.getRegClass(SrcReg)->contains(SystemZ::R1L)) {
+ MRI.constrainRegClass(DestReg, &SystemZ::GR32BitRegClass);
+ MRI.constrainRegClass(SrcReg, &SystemZ::GR32BitRegClass);
+ }
+ int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode);
+ if (ThreeOperandOpcode >= 0) { // A negative value means there is no such opcode.
+ // Create three address instruction without adding the implicit
+ // operands. Those will instead be copied over from the original
+ // instruction by the loop below.
+ MachineInstrBuilder MIB(
+ *MF, MF->CreateMachineInstr(get(ThreeOperandOpcode), MI.getDebugLoc(),
+ /*NoImplicit=*/true));
+ MIB.addOperand(Dest);
+ // Keep the kill state, but drop the tied flag.
+ MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
+ // Keep the remaining operands as-is.
+ for (unsigned I = 2; I < NumOps; ++I)
+ MIB.addOperand(MI.getOperand(I));
+ MBB->insert(MI, MIB); // The new instruction goes immediately before MI.
+ return finishConvertToThreeAddress(&MI, MIB, LV);
+ }
+ }
+
+ // Try to convert an AND into an RISBG-type instruction.
+ if (LogicOp And = interpretAndImmediate(Opcode)) {
+ uint64_t Imm = MI.getOperand(2).getImm() << And.ImmLSB; // Move the immediate to its position in the register.
+ // AND IMMEDIATE leaves the other bits of the register unchanged.
+ Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
+ unsigned Start, End;
+ if (isRxSBGMask(Imm, And.RegSize, Start, End)) {
+ unsigned NewOpcode;
+ if (And.RegSize == 64) {
+ NewOpcode = SystemZ::RISBG;
+ // Prefer RISBGN if available, since it does not clobber CC.
+ if (STI.hasMiscellaneousExtensions())
+ NewOpcode = SystemZ::RISBGN;
+ } else {
+ NewOpcode = SystemZ::RISBMux;
+ Start &= 31; // Reduce both bit positions modulo 32 for the 32-bit form.
+ End &= 31;
+ }
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpcode))
+ .addOperand(Dest)
+ .addReg(0)
+ .addReg(Src.getReg(), getKillRegState(Src.isKill()),
+ Src.getSubReg())
+ .addImm(Start)
+ .addImm(End + 128) // NOTE(review): +128 presumably sets a flag bit in the end field -- confirm.
+ .addImm(0);
+ return finishConvertToThreeAddress(&MI, MIB, LV);
+ }
+ }
+ return nullptr; // No conversion was performed.
+}
+
+MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS) const { // Returns the new instruction built at InsertPt, or nullptr if nothing was folded.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned Size = MFI.getObjectSize(FrameIndex);
+ unsigned Opcode = MI.getOpcode();
+
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { // Operands 0 and 1 are both being folded.
+ if (LIS != nullptr && (Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
+ isInt<8>(MI.getOperand(2).getImm()) && !MI.getOperand(3).getReg()) {
+
+ // Check CC liveness, since new instruction introduces a dead
+ // def of CC.
+ MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
+ LiveRange &CCLiveRange = LIS->getRegUnit(*CCUnit);
+ ++CCUnit;
+ assert (!CCUnit.isValid() && "CC only has one reg unit.");
+ SlotIndex MISlot =
+ LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
+ if (!CCLiveRange.liveAt(MISlot)) {
+ // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
+ MachineInstr *BuiltMI = BuildMI(*InsertPt->getParent(), InsertPt,
+ MI.getDebugLoc(), get(SystemZ::AGSI))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm(MI.getOperand(2).getImm());
+ BuiltMI->findRegisterDefOperand(SystemZ::CC)->setIsDead(true);
+ CCLiveRange.createDeadDef(MISlot, LIS->getVNInfoAllocator()); // Record the new dead def of CC in the live range.
+ return BuiltMI;
+ }
+ }
+ return nullptr; // No other two-operand fold is attempted.
+ }
+
+ // All other cases require a single operand.
+ if (Ops.size() != 1)
+ return nullptr;
+
+ unsigned OpNum = Ops[0];
+ assert(Size ==
+ MF.getRegInfo()
+ .getRegClass(MI.getOperand(OpNum).getReg())
+ ->getSize() &&
+ "Invalid size combination");
+
+ if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 &&
+ isInt<8>(MI.getOperand(2).getImm())) {
+ // A(G)HI %reg, CONST -> A(G)SI %mem, CONST
+ Opcode = (Opcode == SystemZ::AHI ? SystemZ::ASI : SystemZ::AGSI);
+ MachineInstr *BuiltMI =
+ BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(Opcode))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm(MI.getOperand(2).getImm());
+ transferDeadCC(&MI, BuiltMI);
+ return BuiltMI;
+ }
+
+ if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
+ bool Op0IsGPR = (Opcode == SystemZ::LGDR);
+ bool Op1IsGPR = (Opcode == SystemZ::LDGR);
+ // If we're spilling the destination of an LDGR or LGDR, store the
+ // source register instead.
+ if (OpNum == 0) {
+ unsigned StoreOpcode = Op1IsGPR ? SystemZ::STG : SystemZ::STD;
+ return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
+ get(StoreOpcode))
+ .addOperand(MI.getOperand(1))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addReg(0);
+ }
+ // If we're spilling the source of an LDGR or LGDR, load the
+ // destination register instead.
+ if (OpNum == 1) {
+ unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD;
+ unsigned Dest = MI.getOperand(0).getReg();
+ return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
+ get(LoadOpcode), Dest)
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addReg(0);
+ }
+ }
+
+ // Look for cases where the source of a simple store or the destination
+ // of a simple load is being spilled. Try to use MVC instead.
+ //
+ // Although MVC is in practice a fast choice in these cases, it is still
+ // logically a bytewise copy. This means that we cannot use it if the
+ // load or store is volatile. We also wouldn't be able to use MVC if
+ // the two memories partially overlap, but that case cannot occur here,
+ // because we know that one of the memories is a full frame index.
+ //
+ // For performance reasons, we also want to avoid using MVC if the addresses
+ // might be equal. We don't worry about that case here, because spill slot
+ // coloring happens later, and because we have special code to remove
+ // MVCs that turn out to be redundant.
+ if (OpNum == 0 && MI.hasOneMemOperand()) {
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ if (MMO->getSize() == Size && !MMO->isVolatile()) {
+ // Handle conversion of loads.
+ if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXLoad)) {
+ return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
+ get(SystemZ::MVC))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm(Size)
+ .addOperand(MI.getOperand(1))
+ .addImm(MI.getOperand(2).getImm())
+ .addMemOperand(MMO);
+ }
+ // Handle conversion of stores.
+ if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXStore)) {
+ return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
+ get(SystemZ::MVC))
+ .addOperand(MI.getOperand(1))
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(Size)
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addMemOperand(MMO);
+ }
+ }
+ }
+
+ // If the spilled operand is the final one, try to change <INSN>R
+ // into <INSN>.
+ int MemOpcode = SystemZ::getMemOpcode(Opcode);
+ if (MemOpcode >= 0) { // A negative value means there is no memory form.
+ unsigned NumOps = MI.getNumExplicitOperands();
+ if (OpNum == NumOps - 1) {
+ const MCInstrDesc &MemDesc = get(MemOpcode);
+ uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
+ assert(AccessBytes != 0 && "Size of access should be known");
+ assert(AccessBytes <= Size && "Access outside the frame index");
+ uint64_t Offset = Size - AccessBytes; // Address the last AccessBytes bytes of the slot.
+ MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+ MI.getDebugLoc(), get(MemOpcode));
+ for (unsigned I = 0; I < OpNum; ++I)
+ MIB.addOperand(MI.getOperand(I));
+ MIB.addFrameIndex(FrameIndex).addImm(Offset);
+ if (MemDesc.TSFlags & SystemZII::HasIndex)
+ MIB.addReg(0); // The memory form takes an index register; none is used.
+ transferDeadCC(&MI, MIB);
+ return MIB;
+ }
+ }
+
+ return nullptr;
+}
+
+MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS) const {
+ return nullptr; // This overload (folding LoadMI into MI) never folds anything.
+}
+
+bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ switch (MI.getOpcode()) { // Each handled opcode is passed to a helper and reported as expanded (true).
+ case SystemZ::L128:
+ splitMove(MI, SystemZ::LG);
+ return true;
+
+ case SystemZ::ST128:
+ splitMove(MI, SystemZ::STG);
+ return true;
+
+ case SystemZ::LX:
+ splitMove(MI, SystemZ::LD);
+ return true;
+
+ case SystemZ::STX:
+ splitMove(MI, SystemZ::STD);
+ return true;
+
+ case SystemZ::LBMux:
+ expandRXYPseudo(MI, SystemZ::LB, SystemZ::LBH);
+ return true;
+
+ case SystemZ::LHMux:
+ expandRXYPseudo(MI, SystemZ::LH, SystemZ::LHH);
+ return true;
+
+ case SystemZ::LLCRMux:
+ expandZExtPseudo(MI, SystemZ::LLCR, 8);
+ return true;
+
+ case SystemZ::LLHRMux:
+ expandZExtPseudo(MI, SystemZ::LLHR, 16);
+ return true;
+
+ case SystemZ::LLCMux:
+ expandRXYPseudo(MI, SystemZ::LLC, SystemZ::LLCH);
+ return true;
+
+ case SystemZ::LLHMux:
+ expandRXYPseudo(MI, SystemZ::LLH, SystemZ::LLHH);
+ return true;
+
+ case SystemZ::LMux:
+ expandRXYPseudo(MI, SystemZ::L, SystemZ::LFH);
+ return true;
+
+ case SystemZ::LOCMux:
+ expandLOCPseudo(MI, SystemZ::LOC, SystemZ::LOCFH);
+ return true;
+
+ case SystemZ::LOCHIMux:
+ expandLOCPseudo(MI, SystemZ::LOCHI, SystemZ::LOCHHI);
+ return true;
+
+ case SystemZ::LOCRMux:
+ expandLOCRPseudo(MI, SystemZ::LOCR, SystemZ::LOCFHR);
+ return true;
+
+ case SystemZ::STCMux:
+ expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH);
+ return true;
+
+ case SystemZ::STHMux:
+ expandRXYPseudo(MI, SystemZ::STH, SystemZ::STHH);
+ return true;
+
+ case SystemZ::STMux:
+ expandRXYPseudo(MI, SystemZ::ST, SystemZ::STFH);
+ return true;
+
+ case SystemZ::STOCMux:
+ expandLOCPseudo(MI, SystemZ::STOC, SystemZ::STOCFH);
+ return true;
+
+ case SystemZ::LHIMux:
+ expandRIPseudo(MI, SystemZ::LHI, SystemZ::IIHF, true); // The only expandRIPseudo call here that passes true.
+ return true;
+
+ case SystemZ::IIFMux:
+ expandRIPseudo(MI, SystemZ::IILF, SystemZ::IIHF, false);
+ return true;
+
+ case SystemZ::IILMux:
+ expandRIPseudo(MI, SystemZ::IILL, SystemZ::IIHL, false);
+ return true;
+
+ case SystemZ::IIHMux:
+ expandRIPseudo(MI, SystemZ::IILH, SystemZ::IIHH, false);
+ return true;
+
+ case SystemZ::NIFMux:
+ expandRIPseudo(MI, SystemZ::NILF, SystemZ::NIHF, false);
+ return true;
+
+ case SystemZ::NILMux:
+ expandRIPseudo(MI, SystemZ::NILL, SystemZ::NIHL, false);
+ return true;
+
+ case SystemZ::NIHMux:
+ expandRIPseudo(MI, SystemZ::NILH, SystemZ::NIHH, false);
+ return true;
+
+ case SystemZ::OIFMux:
+ expandRIPseudo(MI, SystemZ::OILF, SystemZ::OIHF, false);
+ return true;
+
+ case SystemZ::OILMux:
+ expandRIPseudo(MI, SystemZ::OILL, SystemZ::OIHL, false);
+ return true;
+
+ case SystemZ::OIHMux:
+ expandRIPseudo(MI, SystemZ::OILH, SystemZ::OIHH, false);
+ return true;
+
+ case SystemZ::XIFMux:
+ expandRIPseudo(MI, SystemZ::XILF, SystemZ::XIHF, false);
+ return true;
+
+ case SystemZ::TMLMux:
+ expandRIPseudo(MI, SystemZ::TMLL, SystemZ::TMHL, false);
+ return true;
+
+ case SystemZ::TMHMux:
+ expandRIPseudo(MI, SystemZ::TMLH, SystemZ::TMHH, false);
+ return true;
+
+ case SystemZ::AHIMux:
+ expandRIPseudo(MI, SystemZ::AHI, SystemZ::AIH, false);
+ return true;
+
+ case SystemZ::AHIMuxK:
+ expandRIEPseudo(MI, SystemZ::AHI, SystemZ::AHIK, SystemZ::AIH);
+ return true;
+
+ case SystemZ::AFIMux:
+ expandRIPseudo(MI, SystemZ::AFI, SystemZ::AIH, false);
+ return true;
+
+ case SystemZ::CHIMux:
+ expandRIPseudo(MI, SystemZ::CHI, SystemZ::CIH, false);
+ return true;
+
+ case SystemZ::CFIMux:
+ expandRIPseudo(MI, SystemZ::CFI, SystemZ::CIH, false);
+ return true;
+
+ case SystemZ::CLFIMux:
+ expandRIPseudo(MI, SystemZ::CLFI, SystemZ::CLIH, false);
+ return true;
+
+ case SystemZ::CMux:
+ expandRXYPseudo(MI, SystemZ::C, SystemZ::CHF);
+ return true;
+
+ case SystemZ::CLMux:
+ expandRXYPseudo(MI, SystemZ::CL, SystemZ::CLHF);
+ return true;
+
+ case SystemZ::RISBMux: { // Rewritten in place: the opcode depends on whether operands 0 and 2 are high registers.
+ bool DestIsHigh = isHighReg(MI.getOperand(0).getReg());
+ bool SrcIsHigh = isHighReg(MI.getOperand(2).getReg());
+ if (SrcIsHigh == DestIsHigh)
+ MI.setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL));
+ else {
+ MI.setDesc(get(DestIsHigh ? SystemZ::RISBHL : SystemZ::RISBLH));
+ MI.getOperand(5).setImm(MI.getOperand(5).getImm() ^ 32); // NOTE(review): operand 5 is presumably the rotate amount -- confirm.
+ }
+ return true;
+ }
+
+ case SystemZ::ADJDYNALLOC:
+ splitAdjDynAlloc(MI);
+ return true;
+
+ case TargetOpcode::LOAD_STACK_GUARD:
+ expandLoadStackGuard(&MI);
+ return true;
+
+ default: // Any other opcode is left untouched.
+ return false;
+ }
+}
+
+ // Return the size of MI in bytes. Inline asm is measured from its asm
+ // string; every other instruction reports the size recorded in its
+ // instruction descriptor.
+ unsigned SystemZInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+   // Common case first: the descriptor already knows the size.
+   if (MI.getOpcode() != TargetOpcode::INLINEASM)
+     return MI.getDesc().getSize();
+
+   // Operand 0 of an INLINEASM instruction holds the asm text.
+   const MachineFunction &Fn = *MI.getParent()->getParent();
+   const char *AsmText = MI.getOperand(0).getSymbolName();
+   return getInlineAsmLength(AsmText, *Fn.getTarget().getMCAsmInfo());
+ }
+
+ // Describe branch instruction MI as a SystemZII::Branch: the kind of
+ // branch, the CC values that may be live, the CC mask on which it is
+ // taken, and the operand holding the target. Opcodes not listed here
+ // are a programming error.
+ SystemZII::Branch
+ SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const {
+   // Branch-on-count layout: taken on "not equal", target in operand 2.
+   auto OnCount = [&MI](SystemZII::BranchType Kind) {
+     return SystemZII::Branch(Kind, SystemZ::CCMASK_ICMP,
+                              SystemZ::CCMASK_CMP_NE, &MI.getOperand(2));
+   };
+
+   // Fused compare-and-branch layout: CC mask is the immediate in
+   // operand 2 and the target is operand 3.
+   auto OnCompare = [&MI](SystemZII::BranchType Kind) {
+     return SystemZII::Branch(Kind, SystemZ::CCMASK_ICMP,
+                              MI.getOperand(2).getImm(), &MI.getOperand(3));
+   };
+
+   switch (MI.getOpcode()) {
+   // Unconditional branches: any CC value, always taken.
+   case SystemZ::BR:
+   case SystemZ::J:
+   case SystemZ::JG:
+     return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY,
+                              SystemZ::CCMASK_ANY, &MI.getOperand(0));
+
+   // Conditional branches: operands are (CCValid, CCMask, target).
+   case SystemZ::BRC:
+   case SystemZ::BRCL:
+     return SystemZII::Branch(SystemZII::BranchNormal,
+                              MI.getOperand(0).getImm(),
+                              MI.getOperand(1).getImm(), &MI.getOperand(2));
+
+   case SystemZ::BRCT:
+   case SystemZ::BRCTH:
+     return OnCount(SystemZII::BranchCT);
+
+   case SystemZ::BRCTG:
+     return OnCount(SystemZII::BranchCTG);
+
+   case SystemZ::CIJ:
+   case SystemZ::CRJ:
+     return OnCompare(SystemZII::BranchC);
+
+   case SystemZ::CLIJ:
+   case SystemZ::CLRJ:
+     return OnCompare(SystemZII::BranchCL);
+
+   case SystemZ::CGIJ:
+   case SystemZ::CGRJ:
+     return OnCompare(SystemZII::BranchCG);
+
+   case SystemZ::CLGIJ:
+   case SystemZ::CLGRJ:
+     return OnCompare(SystemZII::BranchCLG);
+
+   default:
+     llvm_unreachable("Unrecognized branch opcode");
+   }
+ }
+
+ // Set LoadOpcode and StoreOpcode to the instructions used to load and
+ // store a register of class RC. Register classes not listed below are a
+ // programming error.
+ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
+                                            unsigned &LoadOpcode,
+                                            unsigned &StoreOpcode) const {
+   // Write both results at once so each case below is a single line.
+   auto Use = [&LoadOpcode, &StoreOpcode](unsigned Load, unsigned Store) {
+     LoadOpcode = Load;
+     StoreOpcode = Store;
+   };
+
+   // 32-bit general registers: low halves, high halves, either half.
+   if (RC == &SystemZ::GR32BitRegClass || RC == &SystemZ::ADDR32BitRegClass)
+     return Use(SystemZ::L, SystemZ::ST);
+   if (RC == &SystemZ::GRH32BitRegClass)
+     return Use(SystemZ::LFH, SystemZ::STFH);
+   if (RC == &SystemZ::GRX32BitRegClass)
+     return Use(SystemZ::LMux, SystemZ::STMux);
+
+   // 64-bit and 128-bit general registers.
+   if (RC == &SystemZ::GR64BitRegClass || RC == &SystemZ::ADDR64BitRegClass)
+     return Use(SystemZ::LG, SystemZ::STG);
+   if (RC == &SystemZ::GR128BitRegClass || RC == &SystemZ::ADDR128BitRegClass)
+     return Use(SystemZ::L128, SystemZ::ST128);
+
+   // Floating-point registers.
+   if (RC == &SystemZ::FP32BitRegClass)
+     return Use(SystemZ::LE, SystemZ::STE);
+   if (RC == &SystemZ::FP64BitRegClass)
+     return Use(SystemZ::LD, SystemZ::STD);
+   if (RC == &SystemZ::FP128BitRegClass)
+     return Use(SystemZ::LX, SystemZ::STX);
+
+   // Vector registers.
+   if (RC == &SystemZ::VR32BitRegClass)
+     return Use(SystemZ::VL32, SystemZ::VST32);
+   if (RC == &SystemZ::VR64BitRegClass)
+     return Use(SystemZ::VL64, SystemZ::VST64);
+   if (RC == &SystemZ::VF128BitRegClass || RC == &SystemZ::VR128BitRegClass)
+     return Use(SystemZ::VL, SystemZ::VST);
+
+   llvm_unreachable("Unsupported regclass to load or store");
+ }
+
+ // Return an opcode that performs the same operation as Opcode but can
+ // encode displacement Offset (possibly Opcode itself), or 0 if there is
+ // no such instruction.
+ unsigned SystemZInstrInfo::getOpcodeForOffset(unsigned Opcode,
+                                               int64_t Offset) const {
+   const MCInstrDesc &Desc = get(Opcode);
+
+   // Instructions flagged Is128Bit must also be able to encode Offset + 8
+   // (presumably the displacement of the second half of the access).
+   int64_t LastOffset = Offset;
+   if (Desc.TSFlags & SystemZII::Is128Bit)
+     LastOffset = Offset + 8;
+
+   // Unsigned 12-bit displacement: prefer the dedicated 12-bit form if one
+   // exists; otherwise every address-related instruction accepts it as is.
+   if (isUInt<12>(Offset) && isUInt<12>(LastOffset)) {
+     int Short = SystemZ::getDisp12Opcode(Opcode);
+     if (Short < 0)
+       return Opcode;
+     return Short;
+   }
+
+   // Signed 20-bit displacement: use the dedicated 20-bit form if one
+   // exists, else Opcode itself when it is flagged as taking such offsets.
+   if (isInt<20>(Offset) && isInt<20>(LastOffset)) {
+     int Long = SystemZ::getDisp20Opcode(Opcode);
+     if (Long >= 0)
+       return Long;
+     if (Desc.TSFlags & SystemZII::Has20BitOffset)
+       return Opcode;
+   }
+
+   // Nothing can encode this displacement.
+   return 0;
+ }
+
+ // Map Opcode to the corresponding opcode that also sets the condition
+ // code, or return 0 if the table below has no entry for Opcode.
+ unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const {
+ switch (Opcode) {
+ case SystemZ::L: return SystemZ::LT;
+ case SystemZ::LY: return SystemZ::LT;
+ case SystemZ::LG: return SystemZ::LTG;
+ case SystemZ::LGF: return SystemZ::LTGF;
+ case SystemZ::LR: return SystemZ::LTR;
+ case SystemZ::LGFR: return SystemZ::LTGFR;
+ case SystemZ::LGR: return SystemZ::LTGR;
+ case SystemZ::LER: return SystemZ::LTEBR;
+ case SystemZ::LDR: return SystemZ::LTDBR;
+ case SystemZ::LXR: return SystemZ::LTXBR;
+ case SystemZ::LCDFR: return SystemZ::LCDBR;
+ case SystemZ::LPDFR: return SystemZ::LPDBR;
+ case SystemZ::LNDFR: return SystemZ::LNDBR;
+ case SystemZ::LCDFR_32: return SystemZ::LCEBR;
+ case SystemZ::LPDFR_32: return SystemZ::LPEBR;
+ case SystemZ::LNDFR_32: return SystemZ::LNEBR;
+ // On zEC12 we prefer to use RISBGN. But if there is a chance to
+ // actually use the condition code, we may turn it back into RISBG.
+ // Note that RISBG is not really a "load-and-test" instruction,
+ // but sets the same condition code values, so is OK to use here.
+ case SystemZ::RISBGN: return SystemZ::RISBG;
+ default: return 0;
+ }
+ }
+
+ // Return true if Mask matches the regexp 0*1+0*, given that zero masks
+ // have already been filtered out. Store the first set bit in LSB and
+ // the number of set bits in Length if so.
+ //
+ // Mask must be nonzero. LSB and Length are written only on success.
+ static bool isStringOfOnes(uint64_t Mask, unsigned &LSB, unsigned &Length) {
+   unsigned First = findFirstSet(Mask);
+   // Shifting out the trailing zeros and adding one turns a contiguous run
+   // of ones into a single set bit just above the run.
+   uint64_t Top = (Mask >> First) + 1;
+   if ((Top & -Top) == Top) {
+     LSB = First;
+     // Top wraps to zero when the run extends through bit 63 (an all-ones
+     // mask). findFirstSet(0) would return its "no bit set" sentinel rather
+     // than a bit index, so compute the run length directly in that case.
+     Length = Top ? findFirstSet(Top) : 64 - First;
+     return true;
+   }
+   return false;
+ }
+
+ // Decide whether the low BitSize bits of Mask form a bit range that can be
+ // described by a (Start, End) pair, where bit positions are numbered from
+ // the most significant bit of a 64-bit value. On success set Start and End
+ // and return true; otherwise return false and leave them untouched.
+ bool SystemZInstrInfo::isRxSBGMask(uint64_t Mask, unsigned BitSize,
+                                    unsigned &Start, unsigned &End) const {
+   const uint64_t Significant = allOnes(BitSize);
+
+   // Only the low BitSize bits matter; an empty selection is rejected.
+   Mask &= Significant;
+   if (Mask == 0)
+     return false;
+
+   unsigned FirstBit, RunLength;
+
+   // Contiguous run (1+0+ or 0+1+0*): Start is the index of the run's msb
+   // and End the index of its lsb.
+   if (isStringOfOnes(Mask, FirstBit, RunLength)) {
+     Start = 63 - (FirstBit + RunLength - 1);
+     End = 63 - FirstBit;
+     return true;
+   }
+
+   // Wrap-around selection (1+0+1+): the *unselected* bits form one
+   // contiguous run. Anything else cannot be represented.
+   if (!isStringOfOnes(Mask ^ Significant, FirstBit, RunLength))
+     return false;
+
+   assert(FirstBit > 0 && "Bottom bit must be set");
+   assert(FirstBit + RunLength < BitSize && "Top bit must be set");
+   // Start is the msb of the low ones, End the lsb of the high ones.
+   Start = 63 - (FirstBit - 1);
+   End = 63 - (FirstBit + RunLength);
+   return true;
+ }
+
+ // Return the fused compare-and-<Type> opcode that corresponds to compare
+ // opcode Opcode, or 0 if there is none. MI, if nonnull, is the compare
+ // instruction itself and is used to check its operands; the immediate and
+ // memory compares below are never converted when MI is null.
+ unsigned SystemZInstrInfo::getFusedCompare(unsigned Opcode,
+ SystemZII::FusedCompareType Type,
+ const MachineInstr *MI) const {
+ // Operand pre-checks. Opcodes not listed here (the register-register
+ // compares) need no check and fall straight through to the tables.
+ switch (Opcode) {
+ case SystemZ::CHI:
+ case SystemZ::CGHI:
+ // Signed immediate compares: operand 1 must fit in 8 signed bits.
+ if (!(MI && isInt<8>(MI->getOperand(1).getImm())))
+ return 0;
+ break;
+ case SystemZ::CLFI:
+ case SystemZ::CLGFI:
+ // Unsigned immediate compares: operand 1 must fit in 8 unsigned bits.
+ if (!(MI && isUInt<8>(MI->getOperand(1).getImm())))
+ return 0;
+ break;
+ case SystemZ::CL:
+ case SystemZ::CLG:
+ // Memory compares need the miscellaneous-extensions feature, and
+ // operand 3 must be register 0 (presumably "no index register" --
+ // TODO confirm against the operand layout of CL/CLG).
+ if (!STI.hasMiscellaneousExtensions())
+ return 0;
+ if (!(MI && MI->getOperand(3).getReg() == 0))
+ return 0;
+ break;
+ }
+ // One mapping table per kind of fused instruction.
+ switch (Type) {
+ case SystemZII::CompareAndBranch:
+ switch (Opcode) {
+ case SystemZ::CR:
+ return SystemZ::CRJ;
+ case SystemZ::CGR:
+ return SystemZ::CGRJ;
+ case SystemZ::CHI:
+ return SystemZ::CIJ;
+ case SystemZ::CGHI:
+ return SystemZ::CGIJ;
+ case SystemZ::CLR:
+ return SystemZ::CLRJ;
+ case SystemZ::CLGR:
+ return SystemZ::CLGRJ;
+ case SystemZ::CLFI:
+ return SystemZ::CLIJ;
+ case SystemZ::CLGFI:
+ return SystemZ::CLGIJ;
+ default:
+ return 0;
+ }
+ case SystemZII::CompareAndReturn:
+ switch (Opcode) {
+ case SystemZ::CR:
+ return SystemZ::CRBReturn;
+ case SystemZ::CGR:
+ return SystemZ::CGRBReturn;
+ case SystemZ::CHI:
+ return SystemZ::CIBReturn;
+ case SystemZ::CGHI:
+ return SystemZ::CGIBReturn;
+ case SystemZ::CLR:
+ return SystemZ::CLRBReturn;
+ case SystemZ::CLGR:
+ return SystemZ::CLGRBReturn;
+ case SystemZ::CLFI:
+ return SystemZ::CLIBReturn;
+ case SystemZ::CLGFI:
+ return SystemZ::CLGIBReturn;
+ default:
+ return 0;
+ }
+ case SystemZII::CompareAndSibcall:
+ switch (Opcode) {
+ case SystemZ::CR:
+ return SystemZ::CRBCall;
+ case SystemZ::CGR:
+ return SystemZ::CGRBCall;
+ case SystemZ::CHI:
+ return SystemZ::CIBCall;
+ case SystemZ::CGHI:
+ return SystemZ::CGIBCall;
+ case SystemZ::CLR:
+ return SystemZ::CLRBCall;
+ case SystemZ::CLGR:
+ return SystemZ::CLGRBCall;
+ case SystemZ::CLFI:
+ return SystemZ::CLIBCall;
+ case SystemZ::CLGFI:
+ return SystemZ::CLGIBCall;
+ default:
+ return 0;
+ }
+ case SystemZII::CompareAndTrap:
+ // The trap table is the only one with entries for CL and CLG.
+ switch (Opcode) {
+ case SystemZ::CR:
+ return SystemZ::CRT;
+ case SystemZ::CGR:
+ return SystemZ::CGRT;
+ case SystemZ::CHI:
+ return SystemZ::CIT;
+ case SystemZ::CGHI:
+ return SystemZ::CGIT;
+ case SystemZ::CLR:
+ return SystemZ::CLRT;
+ case SystemZ::CLGR:
+ return SystemZ::CLGRT;
+ case SystemZ::CLFI:
+ return SystemZ::CLFIT;
+ case SystemZ::CLGFI:
+ return SystemZ::CLGIT;
+ case SystemZ::CL:
+ return SystemZ::CLT;
+ case SystemZ::CLG:
+ return SystemZ::CLGT;
+ default:
+ return 0;
+ }
+ }
+ // Reached only if Type is not one of the enumerators handled above.
+ return 0;
+ }
+
+ // Return the load-and-trap counterpart of load opcode Opcode, or 0 when
+ // the subtarget lacks load-and-trap support or Opcode has no counterpart.
+ unsigned SystemZInstrInfo::getLoadAndTrap(unsigned Opcode) const {
+   unsigned Trapping = 0;
+   if (STI.hasLoadAndTrap()) {
+     switch (Opcode) {
+     case SystemZ::L:
+     case SystemZ::LY:
+       Trapping = SystemZ::LAT;
+       break;
+     case SystemZ::LG:
+       Trapping = SystemZ::LGAT;
+       break;
+     case SystemZ::LFH:
+       Trapping = SystemZ::LFHAT;
+       break;
+     case SystemZ::LLGF:
+       Trapping = SystemZ::LLGFAT;
+       break;
+     case SystemZ::LLGT:
+       Trapping = SystemZ::LLGTAT;
+       break;
+     default:
+       break;
+     }
+   }
+   return Trapping;
+ }
+
+ // Insert, before MBBI in MBB, a single instruction that loads immediate
+ // Value into register Reg. Values that none of the four forms below can
+ // encode are not supported (asserted).
+ void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI,
+                                      unsigned Reg, uint64_t Value) const {
+   // Borrow the debug location of the insertion point when there is one.
+   DebugLoc DL = MBBI == MBB.end() ? DebugLoc() : MBBI->getDebugLoc();
+
+   // Choose the opcode and the immediate actually encoded in it.
+   unsigned Opcode = SystemZ::LGFI;
+   uint64_t Imm = Value;
+   if (isInt<16>(Value)) {
+     Opcode = SystemZ::LGHI;
+   } else if (SystemZ::isImmLL(Value)) {
+     Opcode = SystemZ::LLILL;
+   } else if (SystemZ::isImmLH(Value)) {
+     // LLILH takes the halfword itself, not its position in the register.
+     Opcode = SystemZ::LLILH;
+     Imm = Value >> 16;
+   } else {
+     assert(isInt<32>(Value) && "Huge values not handled yet");
+   }
+
+   BuildMI(MBB, MBBI, DL, get(Opcode), Reg).addImm(Imm);
+ }
+
+ // Return true if the memory operands of MIa and MIb prove that the two
+ // instructions access non-overlapping memory. Returning false means
+ // "unknown", not "overlapping". AA is not consulted.
+ bool SystemZInstrInfo::
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+                                 AliasAnalysis *AA) const {
+
+   // The reasoning below needs exactly one memory operand per instruction.
+   if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand())
+     return false;
+
+   // If mem-operands show that the same address Value is used by both
+   // instructions, check for non-overlapping offsets and widths. Not
+   // sure if a register based analysis would be an improvement...
+
+   MachineMemOperand *MMOa = *MIa.memoperands_begin();
+   MachineMemOperand *MMOb = *MIb.memoperands_begin();
+   const Value *VALa = MMOa->getValue();
+   const Value *VALb = MMOb->getValue();
+   bool SameVal = (VALa && VALb && (VALa == VALb));
+   if (!SameVal) {
+     // Fall back to comparing pseudo source values.
+     const PseudoSourceValue *PSVa = MMOa->getPseudoValue();
+     const PseudoSourceValue *PSVb = MMOb->getPseudoValue();
+     if (PSVa && PSVb && (PSVa == PSVb))
+       SameVal = true;
+   }
+   if (SameVal) {
+     // Keep the full 64-bit offsets and sizes: narrowing them to int could
+     // truncate large values or overflow the sum below and wrongly report
+     // overlapping accesses as disjoint.
+     int64_t OffsetA = MMOa->getOffset(), OffsetB = MMOb->getOffset();
+     int64_t WidthA = MMOa->getSize(), WidthB = MMOb->getSize();
+     int64_t LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+     int64_t HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+     int64_t LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+     // Disjoint iff the lower access ends at or before the higher one starts.
+     if (LowOffset + LowWidth <= HighOffset)
+       return true;
+   }
+
+   return false;
+ }
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
new file mode 100644
index 000000000000..794b193a501e
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -0,0 +1,309 @@
+//===-- SystemZInstrInfo.h - SystemZ instruction information ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRINFO_H
+
+#include "SystemZ.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "SystemZGenInstrInfo.inc"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+namespace SystemZII {
+ // Target-specific instruction flags, tested against MCInstrDesc::TSFlags.
+ // Single-bit flags occupy bits 0-4 and 18-20; the *Mask / *Shift pairs
+ // describe multi-bit fields.
+ enum {
+ // See comments in SystemZInstrFormats.td.
+ SimpleBDXLoad = (1 << 0),
+ SimpleBDXStore = (1 << 1),
+ Has20BitOffset = (1 << 2),
+ HasIndex = (1 << 3),
+ Is128Bit = (1 << 4),
+ // 5-bit field in bits 5-9.
+ AccessSizeMask = (31 << 5),
+ AccessSizeShift = 5,
+ // 4-bit field in bits 10-13.
+ CCValuesMask = (15 << 10),
+ CCValuesShift = 10,
+ // 4-bit field in bits 14-17.
+ CompareZeroCCMaskMask = (15 << 14),
+ CompareZeroCCMaskShift = 14,
+ CCMaskFirst = (1 << 18),
+ CCMaskLast = (1 << 19),
+ IsLogical = (1 << 20)
+ };
+ // Extract the AccessSize field from instruction flags Flags.
+ static inline unsigned getAccessSize(unsigned int Flags) {
+   const unsigned Field = Flags & AccessSizeMask;
+   return Field >> AccessSizeShift;
+ }
+ // Extract the CCValues field from instruction flags Flags.
+ static inline unsigned getCCValues(unsigned int Flags) {
+   const unsigned Field = Flags & CCValuesMask;
+   return Field >> CCValuesShift;
+ }
+ // Extract the CompareZeroCCMask field from instruction flags Flags.
+ static inline unsigned getCompareZeroCCMask(unsigned int Flags) {
+   const unsigned Field = Flags & CompareZeroCCMaskMask;
+   return Field >> CompareZeroCCMaskShift;
+ }
+
+ // SystemZ MachineOperand target flags.
+ // The symbol modifier is a 2-bit field in bits 0-1; MO_GOT and
+ // MO_INDNTPOFF are values of that field, not independent bits.
+ enum {
+ // Masks out the bits for the access model.
+ MO_SYMBOL_MODIFIER = (3 << 0),
+
+ // @GOT (aka @GOTENT)
+ MO_GOT = (1 << 0),
+
+ // @INDNTPOFF
+ MO_INDNTPOFF = (2 << 0)
+ };
+ // Classifies a branch.
+ enum BranchType {
+ // An instruction that branches on the current value of CC.
+ BranchNormal,
+
+ // An instruction that performs a 32-bit signed comparison and branches
+ // on the result.
+ BranchC,
+
+ // An instruction that performs a 32-bit unsigned comparison and branches
+ // on the result.
+ BranchCL,
+
+ // An instruction that performs a 64-bit signed comparison and branches
+ // on the result.
+ BranchCG,
+
+ // An instruction that performs a 64-bit unsigned comparison and branches
+ // on the result.
+ BranchCLG,
+
+ // An instruction that decrements a 32-bit register and branches if
+ // the result is nonzero.
+ BranchCT,
+
+ // An instruction that decrements a 64-bit register and branches if
+ // the result is nonzero.
+ BranchCTG
+ };
+ // Information about a branch instruction.
+ // Plain value type; Target points at an operand owned by the instruction
+ // being described, so it is only valid while that instruction is.
+ struct Branch {
+ // The type of the branch.
+ BranchType Type;
+
+ // CCMASK_<N> is set if CC might be equal to N.
+ unsigned CCValid;
+
+ // CCMASK_<N> is set if the branch should be taken when CC == N.
+ unsigned CCMask;
+
+ // The target of the branch.
+ const MachineOperand *Target;
+
+ // Initialize every field from the corresponding argument.
+ Branch(BranchType type, unsigned ccValid, unsigned ccMask,
+ const MachineOperand *target)
+ : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
+ };
+ // Kinds of fused compares in compare-and-* instructions. Together with type
+ // of the converted compare, this identifies the compare-and-*
+ // instruction.
+ // Passed as the Type argument of SystemZInstrInfo::getFusedCompare.
+ enum FusedCompareType {
+ // Relative branch - CRJ etc.
+ CompareAndBranch,
+
+ // Indirect branch, used for return - CRBReturn etc.
+ CompareAndReturn,
+
+ // Indirect branch, used for sibcall - CRBCall etc.
+ CompareAndSibcall,
+
+ // Trap
+ CompareAndTrap
+ };
+} // end namespace SystemZII
+
+class SystemZSubtarget;
+ // SystemZ implementation of the TargetInstrInfo interface, layered on the
+ // TableGen-generated SystemZGenInstrInfo. Owns the target's register info.
+ class SystemZInstrInfo : public SystemZGenInstrInfo {
+ const SystemZRegisterInfo RI;
+ SystemZSubtarget &STI;
+
+ // Private helpers used when expanding pseudo instructions.
+ void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
+ void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
+ void expandRIPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode,
+ bool ConvertHigh) const;
+ void expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned LowOpcodeK, unsigned HighOpcode) const;
+ void expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned HighOpcode) const;
+ void expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned HighOpcode) const;
+ void expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned HighOpcode) const;
+ void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned Size) const;
+ void expandLoadStackGuard(MachineInstr *MI) const;
+ void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ unsigned LowLowOpcode, unsigned Size, bool KillSrc) const;
+ virtual void anchor();
+
+ protected:
+ /// Commutes the operands in the given instruction by changing the operands
+ /// order and/or changing the instruction's opcode and/or the immediate value
+ /// operand.
+ ///
+ /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+ /// to be commuted.
+ ///
+ /// Do not call this method for a non-commutable instruction or
+ /// non-commutable operands.
+ /// Even though the instruction is commutable, the method may still
+ /// fail to commute the operands, null pointer is returned in such cases.
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned CommuteOpIdx1,
+ unsigned CommuteOpIdx2) const override;
+
+ public:
+ explicit SystemZInstrInfo(SystemZSubtarget &STI);
+
+ // Override TargetInstrInfo.
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ bool isStackSlotCopy(const MachineInstr &MI, int &DestFrameIndex,
+ int &SrcFrameIndex) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask, int &Value) const override;
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int Mask, int Value,
+ const MachineRegisterInfo *MRI) const override;
+ bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
+ unsigned, unsigned, int&, int&, int&) const override;
+ void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const override;
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ MachineRegisterInfo *MRI) const override;
+ bool isPredicable(MachineInstr &MI) const override;
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ unsigned ExtraPredCycles,
+ BranchProbability Probability) const override;
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumCyclesT, unsigned ExtraPredCyclesT,
+ MachineBasicBlock &FMBB,
+ unsigned NumCyclesF, unsigned ExtraPredCyclesF,
+ BranchProbability Probability) const override;
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ BranchProbability Probability) const override;
+ bool PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const override;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineInstr &MI,
+ LiveVariables *LV) const override;
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS = nullptr) const override;
+ MachineInstr *foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS = nullptr) const override;
+ bool expandPostRAPseudo(MachineInstr &MBBI) const override;
+ bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
+ override;
+
+ // Return the SystemZRegisterInfo, which this class owns.
+ const SystemZRegisterInfo &getRegisterInfo() const { return RI; }
+
+ // Return the size in bytes of MI.
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ // MI must be a conditional or unconditional branch. Return a
+ // SystemZII::Branch describing it: the type of branch, the mask of
+ // condition-code values on which the instruction will branch, and
+ // the operand that contains the branch target. This target
+ // can be a register or a basic block.
+ SystemZII::Branch getBranchInfo(const MachineInstr &MI) const;
+
+ // Get the load and store opcodes for a given register class.
+ void getLoadStoreOpcodes(const TargetRegisterClass *RC,
+ unsigned &LoadOpcode, unsigned &StoreOpcode) const;
+
+ // Opcode is the opcode of an instruction that has an address operand,
+ // and the caller wants to perform that instruction's operation on an
+ // address that has displacement Offset. Return the opcode of a suitable
+ // instruction (which might be Opcode itself) or 0 if no such instruction
+ // exists.
+ unsigned getOpcodeForOffset(unsigned Opcode, int64_t Offset) const;
+
+ // If Opcode is a load instruction that has a LOAD AND TEST form,
+ // return the opcode for the testing form, otherwise return 0.
+ unsigned getLoadAndTest(unsigned Opcode) const;
+
+ // Return true if ROTATE AND ... SELECTED BITS can be used to select bits
+ // Mask of the R2 operand, given that only the low BitSize bits of Mask are
+ // significant. Set Start and End to the I3 and I4 operands if so.
+ bool isRxSBGMask(uint64_t Mask, unsigned BitSize,
+ unsigned &Start, unsigned &End) const;
+
+ // If Opcode is a COMPARE opcode for which an associated fused COMPARE AND *
+ // operation exists, return the opcode for the latter, otherwise return 0.
+ // MI, if nonnull, is the compare instruction.
+ unsigned getFusedCompare(unsigned Opcode,
+ SystemZII::FusedCompareType Type,
+ const MachineInstr *MI = nullptr) const;
+
+ // If Opcode is a LOAD opcode for which an associated LOAD AND TRAP
+ // operation exists, return the opcode for the latter, otherwise return 0.
+ unsigned getLoadAndTrap(unsigned Opcode) const;
+
+ // Emit code before MBBI in MBB to move immediate value Value into
+ // physical register Reg.
+ void loadImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned Reg, uint64_t Value) const;
+
+ // Sometimes, it is possible for the target to tell, even without
+ // aliasing information, that two MIs access different memory
+ // addresses. This function returns true if two MIs access different
+ // memory addresses and false otherwise.
+ bool
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ AliasAnalysis *AA = nullptr) const override;
+ };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
new file mode 100644
index 000000000000..d63525f29412
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -0,0 +1,1929 @@
+//===-- SystemZInstrInfo.td - General SystemZ instructions ----*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+ // Call-sequence pseudos, selected from callseq_start and callseq_end
+ // respectively. Neither has scheduling information.
+ let hasNoSchedulingInfo = 1 in {
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
+ [(callseq_start timm:$amt)]>;
+ def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+ }
+
+ let hasSideEffects = 0 in {
+ // Takes as input the value of the stack pointer after a dynamic allocation
+ // has been made. Sets the output to the address of the dynamically-
+ // allocated area itself, skipping the outgoing arguments.
+ //
+ // This expands to an LA or LAY instruction. We restrict the offset
+ // to the range of LA and keep the LAY range in reserve for when
+ // the size of the outgoing arguments is added.
+ //
+ // The expansion itself is done by SystemZInstrInfo::splitAdjDynAlloc.
+ def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src),
+ [(set GR64:$dst, dynalloc12only:$src)]>;
+ }
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+ // Conditional branches.
+ // Everything in this group is a terminator branch that reads CC.
+ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
+ // It's easier for LLVM to handle these branches in their raw BRC/BRCL form
+ // with the condition-code mask being the first operand. It seems friendlier
+ // to use mnemonic forms like JE and JLH when writing out the assembly though.
+ let isCodeGenOnly = 1 in {
+ // An assembler extended mnemonic for BRC.
+ def BRC : CondBranchRI <"j#", 0xA74, z_br_ccmask>;
+ // An assembler extended mnemonic for BRCL. (The extension is "G"
+ // rather than "L" because "JL" is "Jump if Less".)
+ def BRCL : CondBranchRIL<"jg#", 0xC04>;
+ // BC and BCR are additionally flagged as indirect branches.
+ let isIndirectBranch = 1 in {
+ def BC : CondBranchRX<"b#", 0x47>;
+ def BCR : CondBranchRR<"b#r", 0x07>;
+ }
+ }
+
+ // Allow using the raw forms directly from the assembler (and occasional
+ // special code generation needs) as well.
+ def BRCAsm : AsmCondBranchRI <"brc", 0xA74>;
+ def BRCLAsm : AsmCondBranchRIL<"brcl", 0xC04>;
+ let isIndirectBranch = 1 in {
+ def BCAsm : AsmCondBranchRX<"bc", 0x47>;
+ def BCRAsm : AsmCondBranchRR<"bcr", 0x07>;
+ }
+
+ // Define AsmParser extended mnemonics for each general condition-code mask
+ // (integer or floating-point)
+ // Each V below yields four records: JAsm<V>, JGAsm<V>, BAsm<V>, BRAsm<V>.
+ foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
+ "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
+ def JAsm#V : FixedCondBranchRI <CV<V>, "j#", 0xA74>;
+ def JGAsm#V : FixedCondBranchRIL<CV<V>, "jg#", 0xC04>;
+ let isIndirectBranch = 1 in {
+ def BAsm#V : FixedCondBranchRX <CV<V>, "b#", 0x47>;
+ def BRAsm#V : FixedCondBranchRR <CV<V>, "b#r", 0x07>;
+ }
+ }
+ }
+
+ // Unconditional branches. These are in fact simply variants of the
+ // conditional branches with the condition mask set to "always".
+ // Unlike the conditional group, these are barriers and do not list CC as
+ // a use. J carries the selection pattern for br and BR the one for brind;
+ // JG and B are defined without a pattern.
+ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+ def J : FixedCondBranchRI <CondAlways, "j", 0xA74, br>;
+ def JG : FixedCondBranchRIL<CondAlways, "jg", 0xC04>;
+ let isIndirectBranch = 1 in {
+ def B : FixedCondBranchRX<CondAlways, "b", 0x47>;
+ def BR : FixedCondBranchRR<CondAlways, "br", 0x07, brind>;
+ }
+ }
+
+ // NOPs. These are again variants of the conditional branches,
+ // with the condition mask set to "never".
+ // Both are assembler aliases that pass a mask of 0 to the raw BCAsm and
+ // BCRAsm forms; no separate instruction records are created.
+ def NOP : InstAlias<"nop\t$XBD", (BCAsm 0, bdxaddr12only:$XBD), 0>;
+ def NOPR : InstAlias<"nopr\t$R", (BCRAsm 0, GR64:$R), 0>;
+
+ // Fused compare-and-branch instructions.
+ //
+ // These instructions do not use or clobber the condition codes.
+ // We nevertheless pretend that the relative compare-and-branch
+ // instructions clobber CC, so that we can lower them to separate
+ // comparisons and BRCLs if the branch ends up being out of range.
+ let isBranch = 1, isTerminator = 1 in {
+ // As for normal branches, we handle these instructions internally in
+ // their raw CRJ-like form, but use assembly macros like CRJE when writing
+ // them out. Using the *Pair multiclasses, we also create the raw forms.
+ // Relative forms (names ending in J): these are the ones marked as
+ // defining CC.
+ let Defs = [CC] in {
+ defm CRJ : CmpBranchRIEbPair<"crj", 0xEC76, GR32>;
+ defm CGRJ : CmpBranchRIEbPair<"cgrj", 0xEC64, GR64>;
+ defm CIJ : CmpBranchRIEcPair<"cij", 0xEC7E, GR32, imm32sx8>;
+ defm CGIJ : CmpBranchRIEcPair<"cgij", 0xEC7C, GR64, imm64sx8>;
+ defm CLRJ : CmpBranchRIEbPair<"clrj", 0xEC77, GR32>;
+ defm CLGRJ : CmpBranchRIEbPair<"clgrj", 0xEC65, GR64>;
+ defm CLIJ : CmpBranchRIEcPair<"clij", 0xEC7F, GR32, imm32zx8>;
+ defm CLGIJ : CmpBranchRIEcPair<"clgij", 0xEC7D, GR64, imm64zx8>;
+ }
+ // Indirect forms (names ending in B): not marked as defining CC.
+ let isIndirectBranch = 1 in {
+ defm CRB : CmpBranchRRSPair<"crb", 0xECF6, GR32>;
+ defm CGRB : CmpBranchRRSPair<"cgrb", 0xECE4, GR64>;
+ defm CIB : CmpBranchRISPair<"cib", 0xECFE, GR32, imm32sx8>;
+ defm CGIB : CmpBranchRISPair<"cgib", 0xECFC, GR64, imm64sx8>;
+ defm CLRB : CmpBranchRRSPair<"clrb", 0xECF7, GR32>;
+ defm CLGRB : CmpBranchRRSPair<"clgrb", 0xECE5, GR64>;
+ defm CLIB : CmpBranchRISPair<"clib", 0xECFF, GR32, imm32zx8>;
+ defm CLGIB : CmpBranchRISPair<"clgib", 0xECFD, GR64, imm64zx8>;
+ }
+
+ // Define AsmParser mnemonics for each integer condition-code mask.
+ // The opcodes and operand classes mirror the defm lines above.
+ foreach V = [ "E", "H", "L", "HE", "LE", "LH",
+ "NE", "NH", "NL", "NHE", "NLE", "NLH" ] in {
+ let Defs = [CC] in {
+ def CRJAsm#V : FixedCmpBranchRIEb<ICV<V>, "crj", 0xEC76, GR32>;
+ def CGRJAsm#V : FixedCmpBranchRIEb<ICV<V>, "cgrj", 0xEC64, GR64>;
+ def CIJAsm#V : FixedCmpBranchRIEc<ICV<V>, "cij", 0xEC7E, GR32,
+ imm32sx8>;
+ def CGIJAsm#V : FixedCmpBranchRIEc<ICV<V>, "cgij", 0xEC7C, GR64,
+ imm64sx8>;
+ def CLRJAsm#V : FixedCmpBranchRIEb<ICV<V>, "clrj", 0xEC77, GR32>;
+ def CLGRJAsm#V : FixedCmpBranchRIEb<ICV<V>, "clgrj", 0xEC65, GR64>;
+ def CLIJAsm#V : FixedCmpBranchRIEc<ICV<V>, "clij", 0xEC7F, GR32,
+ imm32zx8>;
+ def CLGIJAsm#V : FixedCmpBranchRIEc<ICV<V>, "clgij", 0xEC7D, GR64,
+ imm64zx8>;
+ }
+ let isIndirectBranch = 1 in {
+ def CRBAsm#V : FixedCmpBranchRRS<ICV<V>, "crb", 0xECF6, GR32>;
+ def CGRBAsm#V : FixedCmpBranchRRS<ICV<V>, "cgrb", 0xECE4, GR64>;
+ def CIBAsm#V : FixedCmpBranchRIS<ICV<V>, "cib", 0xECFE, GR32,
+ imm32sx8>;
+ def CGIBAsm#V : FixedCmpBranchRIS<ICV<V>, "cgib", 0xECFC, GR64,
+ imm64sx8>;
+ def CLRBAsm#V : FixedCmpBranchRRS<ICV<V>, "clrb", 0xECF7, GR32>;
+ def CLGRBAsm#V : FixedCmpBranchRRS<ICV<V>, "clgrb", 0xECE5, GR64>;
+ def CLIBAsm#V : FixedCmpBranchRIS<ICV<V>, "clib", 0xECFF, GR32,
+ imm32zx8>;
+ def CLGIBAsm#V : FixedCmpBranchRIS<ICV<V>, "clgib", 0xECFD, GR64,
+ imm64zx8>;
+ }
+ }
+ }
+
+ // Decrement a register and branch if it is nonzero. These don't clobber CC,
+ // but we might need to split long relative branches into sequences that do.
+ let isBranch = 1, isTerminator = 1 in {
+ let Defs = [CC] in {
+ def BRCT : BranchUnaryRI<"brct", 0xA76, GR32>;
+ def BRCTG : BranchUnaryRI<"brctg", 0xA77, GR64>;
+ }
+ // This doesn't need to clobber CC since we never need to split it.
+ def BRCTH : BranchUnaryRIL<"brcth", 0xCC6, GRH32>,
+ Requires<[FeatureHighWord]>;
+
+ // The remaining forms are likewise not marked as defining CC.
+ def BCT : BranchUnaryRX<"bct", 0x46,GR32>;
+ def BCTR : BranchUnaryRR<"bctr", 0x06, GR32>;
+ def BCTG : BranchUnaryRXY<"bctg", 0xE346, GR64>;
+ def BCTGR : BranchUnaryRRE<"bctgr", 0xB946, GR64>;
+ }
+
+ // Two-register branch instructions (presumably the z/Architecture
+ // BRANCH ON INDEX family, going by the mnemonics -- confirm against the
+ // Principles of Operation). As with the branch-on-count group, only the
+ // relative forms are marked as defining CC.
+ let isBranch = 1, isTerminator = 1 in {
+ let Defs = [CC] in {
+ def BRXH : BranchBinaryRSI<"brxh", 0x84, GR32>;
+ def BRXLE : BranchBinaryRSI<"brxle", 0x85, GR32>;
+ def BRXHG : BranchBinaryRIEe<"brxhg", 0xEC44, GR64>;
+ def BRXLG : BranchBinaryRIEe<"brxlg", 0xEC45, GR64>;
+ }
+ def BXH : BranchBinaryRS<"bxh", 0x86, GR32>;
+ def BXLE : BranchBinaryRS<"bxle", 0x87, GR32>;
+ def BXHG : BranchBinaryRSY<"bxhg", 0xEB44, GR64>;
+ def BXLEG : BranchBinaryRSY<"bxleg", 0xEB45, GR64>;
+ }
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Unconditional trap.
+// FIXME: This trap instruction should be marked as isTerminator, but there is
+// currently a general bug that allows non-terminators to be placed between
+// terminators. Temporarily leave this unmarked until the bug is fixed.
+let isBarrier = 1, hasCtrlDep = 1 in
+ def Trap : Alias<4, (outs), (ins), [(trap)]>;
+
+// Conditional trap.
+let isTerminator = 1, hasCtrlDep = 1, Uses = [CC] in
+ def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>;
+
+// Fused compare-and-trap instructions.
+let isTerminator = 1, hasCtrlDep = 1 in {
+ // These patterns work the same way as for compare-and-branch.
+ defm CRT : CmpBranchRRFcPair<"crt", 0xB972, GR32>;
+ defm CGRT : CmpBranchRRFcPair<"cgrt", 0xB960, GR64>;
+ defm CLRT : CmpBranchRRFcPair<"clrt", 0xB973, GR32>;
+ defm CLGRT : CmpBranchRRFcPair<"clgrt", 0xB961, GR64>;
+ defm CIT : CmpBranchRIEaPair<"cit", 0xEC72, GR32, imm32sx16>;
+ defm CGIT : CmpBranchRIEaPair<"cgit", 0xEC70, GR64, imm64sx16>;
+ defm CLFIT : CmpBranchRIEaPair<"clfit", 0xEC73, GR32, imm32zx16>;
+ defm CLGIT : CmpBranchRIEaPair<"clgit", 0xEC71, GR64, imm64zx16>;
+ let Predicates = [FeatureMiscellaneousExtensions] in {
+ defm CLT : CmpBranchRSYbPair<"clt", 0xEB23, GR32>;
+ defm CLGT : CmpBranchRSYbPair<"clgt", 0xEB2B, GR64>;
+ }
+
+ foreach V = [ "E", "H", "L", "HE", "LE", "LH",
+ "NE", "NH", "NL", "NHE", "NLE", "NLH" ] in {
+ def CRTAsm#V : FixedCmpBranchRRFc<ICV<V>, "crt", 0xB972, GR32>;
+ def CGRTAsm#V : FixedCmpBranchRRFc<ICV<V>, "cgrt", 0xB960, GR64>;
+ def CLRTAsm#V : FixedCmpBranchRRFc<ICV<V>, "clrt", 0xB973, GR32>;
+ def CLGRTAsm#V : FixedCmpBranchRRFc<ICV<V>, "clgrt", 0xB961, GR64>;
+ def CITAsm#V : FixedCmpBranchRIEa<ICV<V>, "cit", 0xEC72, GR32,
+ imm32sx16>;
+ def CGITAsm#V : FixedCmpBranchRIEa<ICV<V>, "cgit", 0xEC70, GR64,
+ imm64sx16>;
+ def CLFITAsm#V : FixedCmpBranchRIEa<ICV<V>, "clfit", 0xEC73, GR32,
+ imm32zx16>;
+ def CLGITAsm#V : FixedCmpBranchRIEa<ICV<V>, "clgit", 0xEC71, GR64,
+ imm64zx16>;
+ let Predicates = [FeatureMiscellaneousExtensions] in {
+ def CLTAsm#V : FixedCmpBranchRSYb<ICV<V>, "clt", 0xEB23, GR32>;
+ def CLGTAsm#V : FixedCmpBranchRSYb<ICV<V>, "clgt", 0xEB2B, GR64>;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Define the general form of the call instructions for the asm parser.
+// These instructions don't hard-code %r14 as the return address register.
+let isCall = 1, Defs = [CC] in {
+ def BRAS : CallRI <"bras", 0xA75>;
+ def BRASL : CallRIL<"brasl", 0xC05>;
+ def BAS : CallRX <"bas", 0x4D>;
+ def BASR : CallRR <"basr", 0x0D>;
+}
+
+// Regular calls.
+let isCall = 1, Defs = [R14D, CC] in {
+ def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
+ [(z_call pcrel32:$I2)]>;
+ def CallBASR : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
+ [(z_call ADDR64:$R2)]>;
+}
+
+// TLS calls. These will be lowered into a call to __tls_get_offset,
+// with an extra relocation specifying the TLS symbol.
+let isCall = 1, Defs = [R14D, CC] in {
+ def TLS_GDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+ [(z_tls_gdcall tglobaltlsaddr:$I2)]>;
+ def TLS_LDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+ [(z_tls_ldcall tglobaltlsaddr:$I2)]>;
+}
+
+// Sibling calls. Indirect sibling calls must be via R1, since R2 upwards
+// are argument registers and since branching to R0 is a no-op.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+ def CallJG : Alias<6, (outs), (ins pcrel32:$I2),
+ [(z_sibcall pcrel32:$I2)]>;
+ let Uses = [R1D] in
+ def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
+}
+
+// Conditional sibling calls. Both definitions take the condition as a
+// ($valid, $R1) pair of cond4 operands and carry no selection pattern.
+// NOTE(review): unlike CondTrap and CondReturn, neither definition lists CC
+// in Uses -- confirm that whatever creates these instructions adds the
+// implicit CC use itself.
+let CCMaskFirst = 1, isCall = 1, isTerminator = 1, isReturn = 1 in {
+ // Direct form: takes an additional pcrel32 target operand ($I2).
+ def CallBRCL : Alias<6, (outs), (ins cond4:$valid, cond4:$R1,
+ pcrel32:$I2), []>;
+ // Indirect form: no explicit target operand; R1D is declared as read.
+ let Uses = [R1D] in
+ def CallBCR : Alias<2, (outs), (ins cond4:$valid, cond4:$R1), []>;
+}
+
+// Fused compare and conditional sibling calls.
+let isCall = 1, isTerminator = 1, isReturn = 1, Uses = [R1D] in {
+ def CRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+ def CGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+ def CIBCall : Alias<6, (outs), (ins GR32:$R1, imm32sx8:$I2, cond4:$M3), []>;
+ def CGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64sx8:$I2, cond4:$M3), []>;
+ def CLRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+ def CLGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+ def CLIBCall : Alias<6, (outs), (ins GR32:$R1, imm32zx8:$I2, cond4:$M3), []>;
+ def CLGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3), []>;
+}
+
+// A return instruction (br %r14).
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+ def Return : Alias<2, (outs), (ins), [(z_retflag)]>;
+
+// A conditional return instruction (bcr <cond>, %r14).
+let isReturn = 1, isTerminator = 1, hasCtrlDep = 1, CCMaskFirst = 1, Uses = [CC] in
+ def CondReturn : Alias<2, (outs), (ins cond4:$valid, cond4:$R1), []>;
+
+// Fused compare and conditional returns.
+let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in {
+ def CRBReturn : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+ def CGRBReturn : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+ def CIBReturn : Alias<6, (outs), (ins GR32:$R1, imm32sx8:$I2, cond4:$M3), []>;
+ def CGIBReturn : Alias<6, (outs), (ins GR64:$R1, imm64sx8:$I2, cond4:$M3), []>;
+ def CLRBReturn : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+ def CLGRBReturn : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+ def CLIBReturn : Alias<6, (outs), (ins GR32:$R1, imm32zx8:$I2, cond4:$M3), []>;
+ def CLGIBReturn : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>;
+def Select32 : SelectWrapper<GR32>;
+def Select64 : SelectWrapper<GR64>;
+
+// We don't define 32-bit Mux stores if we don't have STOCFH, because the
+// low-only STOC should then always be used if possible.
+defm CondStore8Mux : CondStores<GRX32, nonvolatile_truncstorei8,
+ nonvolatile_anyextloadi8, bdxaddr20only>,
+ Requires<[FeatureHighWord]>;
+defm CondStore16Mux : CondStores<GRX32, nonvolatile_truncstorei16,
+ nonvolatile_anyextloadi16, bdxaddr20only>,
+ Requires<[FeatureHighWord]>;
+defm CondStore32Mux : CondStores<GRX32, nonvolatile_store,
+ nonvolatile_load, bdxaddr20only>,
+ Requires<[FeatureLoadStoreOnCond2]>;
+defm CondStore8 : CondStores<GR32, nonvolatile_truncstorei8,
+ nonvolatile_anyextloadi8, bdxaddr20only>;
+defm CondStore16 : CondStores<GR32, nonvolatile_truncstorei16,
+ nonvolatile_anyextloadi16, bdxaddr20only>;
+defm CondStore32 : CondStores<GR32, nonvolatile_store,
+ nonvolatile_load, bdxaddr20only>;
+
+defm : CondStores64<CondStore8, CondStore8Inv, nonvolatile_truncstorei8,
+ nonvolatile_anyextloadi8, bdxaddr20only>;
+defm : CondStores64<CondStore16, CondStore16Inv, nonvolatile_truncstorei16,
+ nonvolatile_anyextloadi16, bdxaddr20only>;
+defm : CondStores64<CondStore32, CondStore32Inv, nonvolatile_truncstorei32,
+ nonvolatile_anyextloadi32, bdxaddr20only>;
+defm CondStore64 : CondStores<GR64, nonvolatile_store,
+ nonvolatile_load, bdxaddr20only>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Register moves.
+let hasSideEffects = 0 in {
+ // Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
+ def LRMux : UnaryRRPseudo<"lr", null_frag, GRX32, GRX32>,
+ Requires<[FeatureHighWord]>;
+ def LR : UnaryRR <"lr", 0x18, null_frag, GR32, GR32>;
+ def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>;
+}
+// Register-to-register forms that also define CC. No selection pattern is
+// attached (null_frag); CCValues and CompareZeroCCMask are both 0xE.
+// NOTE(review): presumably these exist so that a compare against zero can be
+// folded into the move -- confirm against the code that consumes
+// CompareZeroCCMask.
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+ def LTR : UnaryRR <"ltr", 0x12, null_frag, GR32, GR32>;
+ def LTGR : UnaryRRE<"ltgr", 0xB902, null_frag, GR64, GR64>;
+}
+
+// Immediate moves.
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
+ isReMaterializable = 1 in {
+ // 16-bit sign-extended immediates. LHIMux expands to LHI or IIHF,
+ // depending on the choice of register.
+ def LHIMux : UnaryRIPseudo<bitconvert, GRX32, imm32sx16>,
+ Requires<[FeatureHighWord]>;
+ def LHI : UnaryRI<"lhi", 0xA78, bitconvert, GR32, imm32sx16>;
+ def LGHI : UnaryRI<"lghi", 0xA79, bitconvert, GR64, imm64sx16>;
+
+ // Other 16-bit immediates.
+ def LLILL : UnaryRI<"llill", 0xA5F, bitconvert, GR64, imm64ll16>;
+ def LLILH : UnaryRI<"llilh", 0xA5E, bitconvert, GR64, imm64lh16>;
+ def LLIHL : UnaryRI<"llihl", 0xA5D, bitconvert, GR64, imm64hl16>;
+ def LLIHH : UnaryRI<"llihh", 0xA5C, bitconvert, GR64, imm64hh16>;
+
+ // 32-bit immediates.
+ def LGFI : UnaryRIL<"lgfi", 0xC01, bitconvert, GR64, imm64sx32>;
+ def LLILF : UnaryRIL<"llilf", 0xC0F, bitconvert, GR64, imm64lf32>;
+ def LLIHF : UnaryRIL<"llihf", 0xC0E, bitconvert, GR64, imm64hf32>;
+}
+
+// Register loads.
+let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+ // Expands to L, LY or LFH, depending on the choice of register.
+ def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
+ Requires<[FeatureHighWord]>;
+ defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32, 4>;
+ def LFH : UnaryRXY<"lfh", 0xE3CA, load, GRH32, 4>,
+ Requires<[FeatureHighWord]>;
+ def LG : UnaryRXY<"lg", 0xE304, load, GR64, 8>;
+
+ // These instructions are split after register allocation, so we don't
+ // want a custom inserter.
+ let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+ def L128 : Pseudo<(outs GR128:$dst), (ins bdxaddr20only128:$src),
+ [(set GR128:$dst, (load bdxaddr20only128:$src))]>;
+ }
+}
+// Loads that also define CC (CCValues and CompareZeroCCMask are both 0xE).
+// They match the plain `load` operator with 4- and 8-byte accesses, but are
+// deliberately kept outside the canFoldAsLoad/SimpleBDXLoad group.
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+ def LT : UnaryRXY<"lt", 0xE312, load, GR32, 4>;
+ def LTG : UnaryRXY<"ltg", 0xE302, load, GR64, 8>;
+}
+
+let canFoldAsLoad = 1 in {
+ def LRL : UnaryRILPC<"lrl", 0xC4D, aligned_load, GR32>;
+ def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+}
+
+// Load and zero rightmost byte.
+let Predicates = [FeatureLoadAndZeroRightmostByte] in {
+ def LZRF : UnaryRXY<"lzrf", 0xE33B, null_frag, GR32, 4>;
+ def LZRG : UnaryRXY<"lzrg", 0xE32A, null_frag, GR64, 8>;
+ def : Pat<(and (i32 (load bdxaddr20only:$src)), 0xffffff00),
+ (LZRF bdxaddr20only:$src)>;
+ def : Pat<(and (i64 (load bdxaddr20only:$src)), 0xffffffffffffff00),
+ (LZRG bdxaddr20only:$src)>;
+}
+
+// Load and trap.
+let Predicates = [FeatureLoadAndTrap] in {
+ def LAT : UnaryRXY<"lat", 0xE39F, null_frag, GR32, 4>;
+ def LFHAT : UnaryRXY<"lfhat", 0xE3C8, null_frag, GRH32, 4>;
+ def LGAT : UnaryRXY<"lgat", 0xE385, null_frag, GR64, 8>;
+}
+
+// Register stores.
+let SimpleBDXStore = 1 in {
+ // Expands to ST, STY or STFH, depending on the choice of register.
+ def STMux : StoreRXYPseudo<store, GRX32, 4>,
+ Requires<[FeatureHighWord]>;
+ defm ST : StoreRXPair<"st", 0x50, 0xE350, store, GR32, 4>;
+ def STFH : StoreRXY<"stfh", 0xE3CB, store, GRH32, 4>,
+ Requires<[FeatureHighWord]>;
+ def STG : StoreRXY<"stg", 0xE324, store, GR64, 8>;
+
+ // These instructions are split after register allocation, so we don't
+ // want a custom inserter.
+ let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+ def ST128 : Pseudo<(outs), (ins GR128:$src, bdxaddr20only128:$dst),
+ [(store GR128:$src, bdxaddr20only128:$dst)]>;
+ }
+}
+def STRL : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
+def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
+
+// 8-bit immediate stores to 8-bit fields.
+defm MVI : StoreSIPair<"mvi", 0x92, 0xEB52, truncstorei8, imm32zx8trunc>;
+
+// 16-bit immediate stores to 16-, 32- or 64-bit fields.
+def MVHHI : StoreSIL<"mvhhi", 0xE544, truncstorei16, imm32sx16trunc>;
+def MVHI : StoreSIL<"mvhi", 0xE54C, store, imm32sx16>;
+def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>;
+
+// Memory-to-memory moves.
+let mayLoad = 1, mayStore = 1 in
+ defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+
+// String moves.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in
+ defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
+ // Load immediate on condition. Matched via DAG pattern and created
+ // by the PeepholeOptimizer via FoldImmediate.
+ let hasSideEffects = 0 in {
+ // Expands to LOCHI or LOCHHI, depending on the choice of register.
+ def LOCHIMux : CondBinaryRIEPseudo<GRX32, imm32sx16>;
+ defm LOCHHI : CondBinaryRIEPair<"lochhi", 0xEC4E, GRH32, imm32sx16>;
+ defm LOCHI : CondBinaryRIEPair<"lochi", 0xEC42, GR32, imm32sx16>;
+ defm LOCGHI : CondBinaryRIEPair<"locghi", 0xEC46, GR64, imm64sx16>;
+ }
+
+ // Move register on condition. Expanded from Select* pseudos and
+ // created by early if-conversion.
+ let hasSideEffects = 0, isCommutable = 1 in {
+ // Expands to LOCR or LOCFHR or a branch-and-move sequence,
+ // depending on the choice of registers.
+ def LOCRMux : CondBinaryRRFPseudo<GRX32, GRX32>;
+ defm LOCFHR : CondBinaryRRFPair<"locfhr", 0xB9E0, GRH32, GRH32>;
+ }
+
+ // Load on condition. Matched via DAG pattern.
+ // Expands to LOC or LOCFH, depending on the choice of register.
+ def LOCMux : CondUnaryRSYPseudo<nonvolatile_load, GRX32, 4>;
+ defm LOCFH : CondUnaryRSYPair<"locfh", 0xEBE0, nonvolatile_load, GRH32, 4>;
+
+ // Store on condition. Expanded from CondStore* pseudos.
+ // Expands to STOC or STOCFH, depending on the choice of register.
+ def STOCMux : CondStoreRSYPseudo<GRX32, 4>;
+ defm STOCFH : CondStoreRSYPair<"stocfh", 0xEBE1, GRH32, 4>;
+
+ // Define AsmParser extended mnemonics for each general condition-code mask.
+ foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
+ "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
+ def LOCHIAsm#V : FixedCondBinaryRIE<CV<V>, "lochi", 0xEC42, GR32,
+ imm32sx16>;
+ def LOCGHIAsm#V : FixedCondBinaryRIE<CV<V>, "locghi", 0xEC46, GR64,
+ imm64sx16>;
+ def LOCHHIAsm#V : FixedCondBinaryRIE<CV<V>, "lochhi", 0xEC4E, GRH32,
+ imm32sx16>;
+ def LOCFHRAsm#V : FixedCondBinaryRRF<CV<V>, "locfhr", 0xB9E0, GRH32, GRH32>;
+ def LOCFHAsm#V : FixedCondUnaryRSY<CV<V>, "locfh", 0xEBE0, GRH32, 4>;
+ def STOCFHAsm#V : FixedCondStoreRSY<CV<V>, "stocfh", 0xEBE1, GRH32, 4>;
+ }
+}
+
+let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in {
+ // Move register on condition. Expanded from Select* pseudos and
+ // created by early if-conversion.
+ let hasSideEffects = 0, isCommutable = 1 in {
+ defm LOCR : CondBinaryRRFPair<"locr", 0xB9F2, GR32, GR32>;
+ defm LOCGR : CondBinaryRRFPair<"locgr", 0xB9E2, GR64, GR64>;
+ }
+
+ // Load on condition. Matched via DAG pattern.
+ defm LOC : CondUnaryRSYPair<"loc", 0xEBF2, nonvolatile_load, GR32, 4>;
+ defm LOCG : CondUnaryRSYPair<"locg", 0xEBE2, nonvolatile_load, GR64, 8>;
+
+ // Store on condition. Expanded from CondStore* pseudos.
+ defm STOC : CondStoreRSYPair<"stoc", 0xEBF3, GR32, 4>;
+ defm STOCG : CondStoreRSYPair<"stocg", 0xEBE3, GR64, 8>;
+
+ // Define AsmParser extended mnemonics for each general condition-code mask.
+ foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
+ "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
+ def LOCRAsm#V : FixedCondBinaryRRF<CV<V>, "locr", 0xB9F2, GR32, GR32>;
+ def LOCGRAsm#V : FixedCondBinaryRRF<CV<V>, "locgr", 0xB9E2, GR64, GR64>;
+ def LOCAsm#V : FixedCondUnaryRSY<CV<V>, "loc", 0xEBF2, GR32, 4>;
+ def LOCGAsm#V : FixedCondUnaryRSY<CV<V>, "locg", 0xEBE2, GR64, 8>;
+ def STOCAsm#V : FixedCondStoreRSY<CV<V>, "stoc", 0xEBF3, GR32, 4>;
+ def STOCGAsm#V : FixedCondStoreRSY<CV<V>, "stocg", 0xEBE3, GR64, 8>;
+ }
+}
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+//
+// Note that putting these before zero extensions means that we will prefer
+// them for anyextload*. There's not really much to choose between the two
+// either way, but sign-extending loads have a short LH and a long LHY,
+// while zero-extending loads have only the long LLH.
+//
+//===----------------------------------------------------------------------===//
+
+// 32-bit extensions from registers.
+let hasSideEffects = 0 in {
+ def LBR : UnaryRRE<"lbr", 0xB926, sext8, GR32, GR32>;
+ def LHR : UnaryRRE<"lhr", 0xB927, sext16, GR32, GR32>;
+}
+
+// 64-bit extensions from registers.
+let hasSideEffects = 0 in {
+ def LGBR : UnaryRRE<"lgbr", 0xB906, sext8, GR64, GR64>;
+ def LGHR : UnaryRRE<"lghr", 0xB907, sext16, GR64, GR64>;
+ def LGFR : UnaryRRE<"lgfr", 0xB914, sext32, GR64, GR32>;
+}
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
+ def LTGFR : UnaryRRE<"ltgfr", 0xB912, null_frag, GR64, GR32>;
+
+// Match 32-to-64-bit sign extensions in which the source is already
+// in a 64-bit register.
+def : Pat<(sext_inreg GR64:$src, i32),
+ (LGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory. LBMux expands to LB or LBH,
+// depending on the choice of register.
+def LBMux : UnaryRXYPseudo<"lb", asextloadi8, GRX32, 1>,
+ Requires<[FeatureHighWord]>;
+def LB : UnaryRXY<"lb", 0xE376, asextloadi8, GR32, 1>;
+def LBH : UnaryRXY<"lbh", 0xE3C0, asextloadi8, GRH32, 1>,
+ Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory. LHMux expands to LH or LHH,
+// depending on the choice of register.
+def LHMux : UnaryRXYPseudo<"lh", asextloadi16, GRX32, 2>,
+ Requires<[FeatureHighWord]>;
+defm LH : UnaryRXPair<"lh", 0x48, 0xE378, asextloadi16, GR32, 2>;
+def LHH : UnaryRXY<"lhh", 0xE3C4, asextloadi16, GRH32, 2>,
+ Requires<[FeatureHighWord]>;
+def LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_asextloadi16, GR32>;
+
+// 64-bit extensions from memory.
+def LGB : UnaryRXY<"lgb", 0xE377, asextloadi8, GR64, 1>;
+def LGH : UnaryRXY<"lgh", 0xE315, asextloadi16, GR64, 2>;
+def LGF : UnaryRXY<"lgf", 0xE314, asextloadi32, GR64, 4>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_asextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_asextloadi32, GR64>;
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
+ def LTGF : UnaryRXY<"ltgf", 0xE332, asextloadi32, GR64, 4>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+// 32-bit extensions from registers.
+let hasSideEffects = 0 in {
+ // Expands to LLCR or RISB[LH]G, depending on the choice of registers.
+ def LLCRMux : UnaryRRPseudo<"llcr", zext8, GRX32, GRX32>,
+ Requires<[FeatureHighWord]>;
+ def LLCR : UnaryRRE<"llcr", 0xB994, zext8, GR32, GR32>;
+ // Expands to LLHR or RISB[LH]G, depending on the choice of registers.
+ def LLHRMux : UnaryRRPseudo<"llhr", zext16, GRX32, GRX32>,
+ Requires<[FeatureHighWord]>;
+ def LLHR : UnaryRRE<"llhr", 0xB995, zext16, GR32, GR32>;
+}
+
+// 64-bit extensions from registers.
+let hasSideEffects = 0 in {
+ def LLGCR : UnaryRRE<"llgcr", 0xB984, zext8, GR64, GR64>;
+ def LLGHR : UnaryRRE<"llghr", 0xB985, zext16, GR64, GR64>;
+ def LLGFR : UnaryRRE<"llgfr", 0xB916, zext32, GR64, GR32>;
+}
+
+// Match 32-to-64-bit zero extensions in which the source is already
+// in a 64-bit register.
+def : Pat<(and GR64:$src, 0xffffffff),
+ (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory. LLCMux expands to LLC or LLCH,
+// depending on the choice of register.
+def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
+ Requires<[FeatureHighWord]>;
+def LLC : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
+def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GRH32, 1>,
+ Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory. LLHMux expands to LLH or LLHH,
+// depending on the choice of register.
+def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
+ Requires<[FeatureHighWord]>;
+def LLH : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
+def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GRH32, 2>,
+ Requires<[FeatureHighWord]>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
+
+// 64-bit extensions from memory.
+def LLGC : UnaryRXY<"llgc", 0xE390, azextloadi8, GR64, 1>;
+def LLGH : UnaryRXY<"llgh", 0xE391, azextloadi16, GR64, 2>;
+def LLGF : UnaryRXY<"llgf", 0xE316, azextloadi32, GR64, 4>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_azextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_azextloadi32, GR64>;
+
+// 31-to-64-bit zero extensions.
+def LLGTR : UnaryRRE<"llgtr", 0xB917, null_frag, GR64, GR64>;
+def LLGT : UnaryRXY<"llgt", 0xE317, null_frag, GR64, 4>;
+def : Pat<(and GR64:$src, 0x7fffffff),
+ (LLGTR GR64:$src)>;
+def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0x7fffffff),
+ (LLGT bdxaddr20only:$src)>;
+
+// Load and zero rightmost byte.
+let Predicates = [FeatureLoadAndZeroRightmostByte] in {
+ def LLZRGF : UnaryRXY<"llzrgf", 0xE33A, null_frag, GR64, 4>;
+ def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0xffffff00),
+ (LLZRGF bdxaddr20only:$src)>;
+}
+
+// Load and trap.
+let Predicates = [FeatureLoadAndTrap] in {
+ def LLGFAT : UnaryRXY<"llgfat", 0xE39D, null_frag, GR64, 4>;
+ def LLGTAT : UnaryRXY<"llgtat", 0xE39C, null_frag, GR64, 4>;
+}
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+// Truncations of 64-bit registers to 32-bit registers.
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, subreg_l32)>;
+
+// Truncations of 32-bit registers to 8-bit memory. STCMux expands to
+// STC, STCY or STCH, depending on the choice of register.
+def STCMux : StoreRXYPseudo<truncstorei8, GRX32, 1>,
+ Requires<[FeatureHighWord]>;
+defm STC : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR32, 1>;
+def STCH : StoreRXY<"stch", 0xE3C3, truncstorei8, GRH32, 1>,
+ Requires<[FeatureHighWord]>;
+
+// Truncations of 32-bit registers to 16-bit memory. STHMux expands to
+// STH, STHY or STHH, depending on the choice of register.
+//
+// The trailing template argument tracks the width of the memory access in
+// bytes throughout this file (1 for i8, 2 for i16, 4 for i32, 8 for i64),
+// so the 16-bit pseudo must pass 2, matching STH and STHH below; 1 is the
+// value used by the 8-bit STC family.
+def STHMux : StoreRXYPseudo<truncstorei16, GRX32, 2>,
+ Requires<[FeatureHighWord]>;
+defm STH : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32, 2>;
+def STHH : StoreRXY<"sthh", 0xE3C7, truncstorei16, GRH32, 2>,
+ Requires<[FeatureHighWord]>;
+def STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
+
+// Truncations of 64-bit registers to memory.
+defm : StoreGR64Pair<STC, STCY, truncstorei8>;
+defm : StoreGR64Pair<STH, STHY, truncstorei16>;
+def : StoreGR64PC<STHRL, aligned_truncstorei16>;
+defm : StoreGR64Pair<ST, STY, truncstorei32>;
+def : StoreGR64PC<STRL, aligned_truncstorei32>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Multi-register loads.
+defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>;
+def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>;
+def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>;
+
+// Multi-register stores.
+defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>;
+def STMG : StoreMultipleRSY<"stmg", 0xEB24, GR64>;
+def STMH : StoreMultipleRSY<"stmh", 0xEB26, GRH32>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+// Byte-swapping register moves.
+let hasSideEffects = 0 in {
+ def LRVR : UnaryRRE<"lrvr", 0xB91F, bswap, GR32, GR32>;
+ def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>;
+}
+
+// Byte-swapping loads. Unlike normal loads, these instructions are
+// allowed to access storage more than once.
+def LRVH : UnaryRXY<"lrvh", 0xE31F, z_lrvh, GR32, 2>;
+def LRV : UnaryRXY<"lrv", 0xE31E, z_lrv, GR32, 4>;
+def LRVG : UnaryRXY<"lrvg", 0xE30F, z_lrvg, GR64, 8>;
+
+// Likewise byte-swapping stores.
+def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
+def STRV : StoreRXY<"strv", 0xE33E, z_strv, GR32, 4>;
+def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+// Load BDX-style addresses.
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isReMaterializable = 1 in
+ defm LA : LoadAddressRXPair<"la", 0x41, 0xE371, bitconvert>;
+
+// Load a PC-relative address. There's no version of this instruction
+// with a 16-bit offset, so there's no relaxation.
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
+ isReMaterializable = 1 in
+ def LARL : LoadAddressRIL<"larl", 0xC00, bitconvert>;
+
+// Load the Global Offset Table address. This will be lowered into a
+// larl $R1, _GLOBAL_OFFSET_TABLE_
+// instruction.
+def GOT : Alias<6, (outs GR64:$R1), (ins),
+ [(set GR64:$R1, (global_offset_table))]>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+ let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+ def LPR : UnaryRR <"lpr", 0x10, z_iabs, GR32, GR32>;
+ def LPGR : UnaryRRE<"lpgr", 0xB900, z_iabs, GR64, GR64>;
+ }
+ let CCValues = 0xE, CompareZeroCCMask = 0xE in
+ def LPGFR : UnaryRRE<"lpgfr", 0xB910, null_frag, GR64, GR32>;
+}
+def : Pat<(z_iabs32 GR32:$src), (LPR GR32:$src)>;
+def : Pat<(z_iabs64 GR64:$src), (LPGR GR64:$src)>;
+defm : SXU<z_iabs, LPGFR>;
+defm : SXU<z_iabs64, LPGFR>;
+
+let Defs = [CC] in {
+ let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+ def LNR : UnaryRR <"lnr", 0x11, z_inegabs, GR32, GR32>;
+ def LNGR : UnaryRRE<"lngr", 0xB901, z_inegabs, GR64, GR64>;
+ }
+ let CCValues = 0xE, CompareZeroCCMask = 0xE in
+ def LNGFR : UnaryRRE<"lngfr", 0xB911, null_frag, GR64, GR32>;
+}
+def : Pat<(z_inegabs32 GR32:$src), (LNR GR32:$src)>;
+def : Pat<(z_inegabs64 GR64:$src), (LNGR GR64:$src)>;
+defm : SXU<z_inegabs, LNGFR>;
+defm : SXU<z_inegabs64, LNGFR>;
+
+let Defs = [CC] in {
+ let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+ def LCR : UnaryRR <"lcr", 0x13, ineg, GR32, GR32>;
+ def LCGR : UnaryRRE<"lcgr", 0xB903, ineg, GR64, GR64>;
+ }
+ let CCValues = 0xE, CompareZeroCCMask = 0xE in
+ def LCGFR : UnaryRRE<"lcgfr", 0xB913, null_frag, GR64, GR32>;
+}
+defm : SXU<ineg, LCGFR>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+let isCodeGenOnly = 1 in
+ defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, azextloadi8, 1>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, azextloadi8, 1>;
+
+defm : InsertMem<"inserti8", IC32, GR32, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
+
+defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+
+let Defs = [CC] in {
+ defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>;
+ def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>;
+}
+
+// Insertions of a 16-bit immediate, leaving other bits unaffected.
+// We don't have or_as_insert equivalents of these operations because
+// OI is available instead.
+//
+// IIxMux expands to II[LH]x, depending on the choice of register.
+def IILMux : BinaryRIPseudo<insertll, GRX32, imm32ll16>,
+ Requires<[FeatureHighWord]>;
+def IIHMux : BinaryRIPseudo<insertlh, GRX32, imm32lh16>,
+ Requires<[FeatureHighWord]>;
+def IILL : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
+def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
+def IIHL : BinaryRI<"iihl", 0xA51, insertll, GRH32, imm32ll16>;
+def IIHH : BinaryRI<"iihh", 0xA50, insertlh, GRH32, imm32lh16>;
+def IILL64 : BinaryAliasRI<insertll, GR64, imm64ll16>;
+def IILH64 : BinaryAliasRI<insertlh, GR64, imm64lh16>;
+def IIHL64 : BinaryAliasRI<inserthl, GR64, imm64hl16>;
+def IIHH64 : BinaryAliasRI<inserthh, GR64, imm64hh16>;
+
+// ...likewise for 32-bit immediates. For GR32s this is a general
+// full-width move. (We use IILF rather than something like LLILF
+// for 32-bit moves because IILF leaves the upper 32 bits of the
+// GR64 unchanged.)
+let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
+ def IIFMux : UnaryRIPseudo<bitconvert, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def IILF : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+ def IIHF : UnaryRIL<"iihf", 0xC08, bitconvert, GRH32, uimm32>;
+}
+def IILF64 : BinaryAliasRIL<insertlf, GR64, imm64lf32>;
+def IIHF64 : BinaryAliasRIL<inserthf, GR64, imm64hf32>;
+
+// An alternative model of inserthf, with the first operand being
+// a zero-extended value.
+def : Pat<(or (zext32 GR32:$src), imm64hf32:$imm),
+ (IIHF64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32),
+ imm64hf32:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+// Plain addition.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+ // Addition of a register.
+ let isCommutable = 1 in {
+ defm AR : BinaryRRAndK<"ar", 0x1A, 0xB9F8, add, GR32, GR32>;
+ defm AGR : BinaryRREAndK<"agr", 0xB908, 0xB9E8, add, GR64, GR64>;
+ }
+ def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>;
+
+ // Addition of signed 16-bit immediates.
+ defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
+ defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, add, GR32, imm32sx16>;
+ defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, add, GR64, imm64sx16>;
+
+ // Addition of signed 32-bit immediates.
+ def AFIMux : BinaryRIPseudo<add, GRX32, simm32>,
+ Requires<[FeatureHighWord]>;
+ def AFI : BinaryRIL<"afi", 0xC29, add, GR32, simm32>;
+ def AIH : BinaryRIL<"aih", 0xCC8, add, GRH32, simm32>,
+ Requires<[FeatureHighWord]>;
+ def AGFI : BinaryRIL<"agfi", 0xC28, add, GR64, imm64sx32>;
+
+ // Addition of memory.
+ defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
+ defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>;
+ def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
+ def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>;
+
+ // Addition to memory.
+ def ASI : BinarySIY<"asi", 0xEB6A, add, imm32sx8>;
+ def AGSI : BinarySIY<"agsi", 0xEB7A, add, imm64sx8>;
+}
+defm : SXB<add, GR64, AGFR>;
+
+// Addition producing a carry.
+// Every definition in this block is declared as defining CC.
+let Defs = [CC] in {
+ // Addition of a register.
+ let isCommutable = 1 in {
+ defm ALR : BinaryRRAndK<"alr", 0x1E, 0xB9FA, addc, GR32, GR32>;
+ defm ALGR : BinaryRREAndK<"algr", 0xB90A, 0xB9EA, addc, GR64, GR64>;
+ }
+ // ALGFR (GR64 destination, GR32 source) carries no pattern of its own
+ // (null_frag); it is referenced by the ZXB instantiation after this block.
+ def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>;
+
+ // Addition of signed 16-bit immediates.
+ // Both forms are only enabled with FeatureDistinctOps.
+ def ALHSIK : BinaryRIE<"alhsik", 0xECDA, addc, GR32, imm32sx16>,
+ Requires<[FeatureDistinctOps]>;
+ def ALGHSIK : BinaryRIE<"alghsik", 0xECDB, addc, GR64, imm64sx16>,
+ Requires<[FeatureDistinctOps]>;
+
+ // Addition of unsigned 32-bit immediates.
+ def ALFI : BinaryRIL<"alfi", 0xC2B, addc, GR32, uimm32>;
+ def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>;
+
+ // Addition of memory.
+ defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
+ def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
+ def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>;
+}
+// Attach the addc patterns for ALGFR (presumably the zero-extended GR32
+// operand case -- see the ZXB definition to confirm).
+defm : ZXB<addc, GR64, ALGFR>;
+
+// Addition producing and using a carry.
+// CC is listed both in Defs and in Uses, so each instruction here is
+// modelled as reading the incoming CC and writing a new one.
+let Defs = [CC], Uses = [CC] in {
+ // Addition of a register.
+ def ALCR : BinaryRRE<"alcr", 0xB998, adde, GR32, GR32>;
+ def ALCGR : BinaryRRE<"alcgr", 0xB988, adde, GR64, GR64>;
+
+ // Addition of memory.
+ def ALC : BinaryRXY<"alc", 0xE398, adde, GR32, load, 4>;
+ def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+// Plain subtraction. Although immediate forms exist, we use the
+// add-immediate instruction instead.
+// All definitions in the block share Defs = [CC], CCValues = 0xF and
+// CompareZeroCCMask = 0x8.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+ // Subtraction of a register.
+ defm SR : BinaryRRAndK<"sr", 0x1B, 0xB9F9, sub, GR32, GR32>;
+ // SGFR has no pattern of its own (null_frag); it is referenced by the
+ // SXB instantiation after this block.
+ def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>;
+ defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, sub, GR64, GR64>;
+
+ // Subtraction of memory.
+ defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
+ defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
+ def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
+ def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>;
+}
+// Attach the sub patterns for SGFR (presumably the sign-extended GR32
+// operand case -- see the SXB definition to confirm).
+defm : SXB<sub, GR64, SGFR>;
+
+// Subtraction producing a carry.
+// Every definition in this block is declared as defining CC.
+let Defs = [CC] in {
+ // Subtraction of a register.
+ defm SLR : BinaryRRAndK<"slr", 0x1F, 0xB9FB, subc, GR32, GR32>;
+ // SLGFR has no pattern of its own (null_frag); it is referenced by the
+ // ZXB instantiation after this block.
+ def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>;
+ defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, subc, GR64, GR64>;
+
+ // Subtraction of unsigned 32-bit immediates. These don't match
+ // subc because we prefer addc for constants.
+ def SLFI : BinaryRIL<"slfi", 0xC25, null_frag, GR32, uimm32>;
+ def SLGFI : BinaryRIL<"slgfi", 0xC24, null_frag, GR64, imm64zx32>;
+
+ // Subtraction of memory.
+ defm SL : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load, 4>;
+ def SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, azextloadi32, 4>;
+ def SLG : BinaryRXY<"slg", 0xE30B, subc, GR64, load, 8>;
+}
+// Attach the subc patterns for SLGFR (presumably the zero-extended GR32
+// operand case -- see the ZXB definition to confirm).
+defm : ZXB<subc, GR64, SLGFR>;
+
+// Subtraction producing and using a carry.
+// CC is listed both in Defs and in Uses, so each instruction here is
+// modelled as reading the incoming CC and writing a new one.
+let Defs = [CC], Uses = [CC] in {
+ // Subtraction of a register.
+ def SLBR : BinaryRRE<"slbr", 0xB999, sube, GR32, GR32>;
+ def SLBGR : BinaryRRE<"slbgr", 0xB989, sube, GR64, GR64>;
+
+ // Subtraction of memory.
+ def SLB : BinaryRXY<"slb", 0xE399, sube, GR32, load, 4>;
+ def SLBG : BinaryRXY<"slbg", 0xE389, sube, GR64, load, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+// All AND instructions define CC. CCValues/CompareZeroCCMask are only set
+// on the sub-groups whose CC result describes the whole operated-on value.
+let Defs = [CC] in {
+ // ANDs of a register.
+ let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ defm NR : BinaryRRAndK<"nr", 0x14, 0xB9F4, and, GR32, GR32>;
+ defm NGR : BinaryRREAndK<"ngr", 0xB980, 0xB9E4, and, GR64, GR64>;
+ }
+
+ let isConvertibleToThreeAddress = 1 in {
+ // ANDs of a 16-bit immediate, leaving other bits unaffected.
+ // The CC result only reflects the 16-bit field, not the full register.
+ // For that reason no CCValues/CompareZeroCCMask are given for these.
+ //
+ // NIxMux expands to NI[LH]x, depending on the choice of register.
+ def NILMux : BinaryRIPseudo<and, GRX32, imm32ll16c>,
+ Requires<[FeatureHighWord]>;
+ def NIHMux : BinaryRIPseudo<and, GRX32, imm32lh16c>,
+ Requires<[FeatureHighWord]>;
+ def NILL : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
+ def NILH : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
+ def NIHL : BinaryRI<"nihl", 0xA55, and, GRH32, imm32ll16c>;
+ def NIHH : BinaryRI<"nihh", 0xA54, and, GRH32, imm32lh16c>;
+ // GR64 alias forms of the four instructions above.
+ def NILL64 : BinaryAliasRI<and, GR64, imm64ll16c>;
+ def NILH64 : BinaryAliasRI<and, GR64, imm64lh16c>;
+ def NIHL64 : BinaryAliasRI<and, GR64, imm64hl16c>;
+ def NIHH64 : BinaryAliasRI<and, GR64, imm64hh16c>;
+
+ // ANDs of a 32-bit immediate, leaving other bits unaffected.
+ // The CC result only reflects the 32-bit field, which means we can
+ // use it as a zero indicator for i32 operations but not otherwise.
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ // Expands to NILF or NIHF, depending on the choice of register.
+ def NIFMux : BinaryRIPseudo<and, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def NILF : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
+ def NIHF : BinaryRIL<"nihf", 0xC0A, and, GRH32, uimm32>;
+ }
+ // The GR64 aliases sit outside the CCValues scope above, in line with
+ // the note that CC is only a zero indicator for i32 operations.
+ def NILF64 : BinaryAliasRIL<and, GR64, imm64lf32c>;
+ def NIHF64 : BinaryAliasRIL<and, GR64, imm64hf32c>;
+ }
+
+ // ANDs of memory.
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ defm N : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load, 4>;
+ def NG : BinaryRXY<"ng", 0xE380, and, GR64, load, 8>;
+ }
+
+ // AND to memory
+ // No pattern here (null_frag); the RMWIByte instantiations after this
+ // block reference NI and NIY.
+ defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, imm32zx8>;
+
+ // Block AND.
+ let mayLoad = 1, mayStore = 1 in
+ defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>;
+}
+defm : RMWIByte<and, bdaddr12pair, NI>;
+defm : RMWIByte<and, bdaddr20pair, NIY>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+// All OR instructions define CC. CCValues/CompareZeroCCMask are only set
+// on the sub-groups whose CC result describes the whole operated-on value.
+let Defs = [CC] in {
+ // ORs of a register.
+ let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ defm OR : BinaryRRAndK<"or", 0x16, 0xB9F6, or, GR32, GR32>;
+ defm OGR : BinaryRREAndK<"ogr", 0xB981, 0xB9E6, or, GR64, GR64>;
+ }
+
+ // ORs of a 16-bit immediate, leaving other bits unaffected.
+ // The CC result only reflects the 16-bit field, not the full register.
+ // For that reason no CCValues/CompareZeroCCMask are given for these.
+ //
+ // OIxMux expands to OI[LH]x, depending on the choice of register.
+ def OILMux : BinaryRIPseudo<or, GRX32, imm32ll16>,
+ Requires<[FeatureHighWord]>;
+ def OIHMux : BinaryRIPseudo<or, GRX32, imm32lh16>,
+ Requires<[FeatureHighWord]>;
+ def OILL : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
+ def OILH : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
+ def OIHL : BinaryRI<"oihl", 0xA59, or, GRH32, imm32ll16>;
+ def OIHH : BinaryRI<"oihh", 0xA58, or, GRH32, imm32lh16>;
+ // GR64 alias forms of the four instructions above.
+ def OILL64 : BinaryAliasRI<or, GR64, imm64ll16>;
+ def OILH64 : BinaryAliasRI<or, GR64, imm64lh16>;
+ def OIHL64 : BinaryAliasRI<or, GR64, imm64hl16>;
+ def OIHH64 : BinaryAliasRI<or, GR64, imm64hh16>;
+
+ // ORs of a 32-bit immediate, leaving other bits unaffected.
+ // The CC result only reflects the 32-bit field, which means we can
+ // use it as a zero indicator for i32 operations but not otherwise.
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ // Expands to OILF or OIHF, depending on the choice of register.
+ def OIFMux : BinaryRIPseudo<or, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def OILF : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
+ def OIHF : BinaryRIL<"oihf", 0xC0C, or, GRH32, uimm32>;
+ }
+ // The GR64 aliases sit outside the CCValues scope above, in line with
+ // the note that CC is only a zero indicator for i32 operations.
+ def OILF64 : BinaryAliasRIL<or, GR64, imm64lf32>;
+ def OIHF64 : BinaryAliasRIL<or, GR64, imm64hf32>;
+
+ // ORs of memory.
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ defm O : BinaryRXPair<"o", 0x56, 0xE356, or, GR32, load, 4>;
+ def OG : BinaryRXY<"og", 0xE381, or, GR64, load, 8>;
+ }
+
+ // OR to memory
+ // No pattern here (null_frag); the RMWIByte instantiations after this
+ // block reference OI and OIY.
+ defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, imm32zx8>;
+
+ // Block OR.
+ let mayLoad = 1, mayStore = 1 in
+ defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>;
+}
+defm : RMWIByte<or, bdaddr12pair, OI>;
+defm : RMWIByte<or, bdaddr20pair, OIY>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+// All XOR instructions define CC. CCValues/CompareZeroCCMask are only set
+// on the sub-groups whose CC result describes the whole operated-on value.
+let Defs = [CC] in {
+ // XORs of a register.
+ let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ defm XR : BinaryRRAndK<"xr", 0x17, 0xB9F7, xor, GR32, GR32>;
+ defm XGR : BinaryRREAndK<"xgr", 0xB982, 0xB9E7, xor, GR64, GR64>;
+ }
+
+ // XORs of a 32-bit immediate, leaving other bits unaffected.
+ // The CC result only reflects the 32-bit field, which means we can
+ // use it as a zero indicator for i32 operations but not otherwise.
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ // Expands to XILF or XIHF, depending on the choice of register.
+ def XIFMux : BinaryRIPseudo<xor, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def XILF : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
+ def XIHF : BinaryRIL<"xihf", 0xC06, xor, GRH32, uimm32>;
+ }
+ // The GR64 aliases sit outside the CCValues scope above, in line with
+ // the note that CC is only a zero indicator for i32 operations.
+ def XILF64 : BinaryAliasRIL<xor, GR64, imm64lf32>;
+ def XIHF64 : BinaryAliasRIL<xor, GR64, imm64hf32>;
+
+ // XORs of memory.
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ defm X : BinaryRXPair<"x",0x57, 0xE357, xor, GR32, load, 4>;
+ def XG : BinaryRXY<"xg", 0xE382, xor, GR64, load, 8>;
+ }
+
+ // XOR to memory
+ // No pattern here (null_frag); the RMWIByte instantiations after this
+ // block reference XI and XIY.
+ defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, imm32zx8>;
+
+ // Block XOR.
+ let mayLoad = 1, mayStore = 1 in
+ defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>;
+}
+defm : RMWIByte<xor, bdaddr12pair, XI>;
+defm : RMWIByte<xor, bdaddr20pair, XIY>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+// Multiplication of a register.
+// Unlike the addition/subtraction/logical sections, none of the
+// multiplication definitions below are placed under Defs = [CC].
+let isCommutable = 1 in {
+ def MSR : BinaryRRE<"msr", 0xB252, mul, GR32, GR32>;
+ def MSGR : BinaryRRE<"msgr", 0xB90C, mul, GR64, GR64>;
+}
+// MSGFR has no pattern of its own (null_frag); the SXB instantiation on the
+// next line references it for mul.
+def MSGFR : BinaryRRE<"msgfr", 0xB91C, null_frag, GR64, GR32>;
+defm : SXB<mul, GR64, MSGFR>;
+
+// Multiplication of a signed 16-bit immediate.
+def MHI : BinaryRI<"mhi", 0xA7C, mul, GR32, imm32sx16>;
+def MGHI : BinaryRI<"mghi", 0xA7D, mul, GR64, imm64sx16>;
+
+// Multiplication of a signed 32-bit immediate.
+def MSFI : BinaryRIL<"msfi", 0xC21, mul, GR32, simm32>;
+def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
+
+// Multiplication of memory.
+defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
+defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
+def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
+def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>;
+
+// Multiplication of a register, producing two results.
+// The first register class is GR128; the second operand is a GR64.
+def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>;
+
+// Multiplication of memory, producing two results.
+def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+// Every division form uses GR128 as its first register class and is
+// flagged hasSideEffects so that it is never executed speculatively.
+let hasSideEffects = 1 in { // Do not speculatively execute.
+ // Division and remainder, from registers.
+ def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>;
+ def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>;
+ def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>;
+ def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>;
+
+ // Division and remainder, from memory.
+ def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>;
+ def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load, 8>;
+ def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load, 4>;
+ def DLG : BinaryRXY<"dlg", 0xE387, z_udivrem64, GR128, load, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+// Shift left.
+// SLL and SLLG are the logical forms and are selected for shl. SLA is the
+// arithmetic form; it has no selection pattern (null_frag) but, like the
+// arithmetic right shifts, it sets the condition code, so it must be
+// declared as defining CC to keep CC liveness correct around it.
+let hasSideEffects = 0 in {
+ defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
+ let Defs = [CC] in
+ defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
+ def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
+}
+
+// Logical shift right.
+// Not placed under Defs = [CC], unlike the arithmetic shifts below.
+let hasSideEffects = 0 in {
+ defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
+ def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
+}
+
+// Arithmetic shift right.
+// These define CC, with CCValues = 0xE and CompareZeroCCMask = 0xE.
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+ defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
+ def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>;
+}
+
+// Rotate left.
+// Only the RSY encoding is defined for rotates; there is no AndK pair.
+let hasSideEffects = 0 in {
+ def RLL : BinaryRSY<"rll", 0xEB1D, rotl, GR32>;
+ def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>;
+}
+
+// Rotate second operand left and insert selected bits into first operand.
+// These can act like 32-bit operands provided that the constant start and
+// end bits (operands 2 and 3) are in the range [32, 64).
+// RISBG32 and RISBG share the same mnemonic and opcode; they differ only in
+// register class. RISBG32 is marked isCodeGenOnly and, unlike RISBG, is not
+// given CCValues/CompareZeroCCMask.
+let Defs = [CC] in {
+ let isCodeGenOnly = 1 in
+ def RISBG32 : RotateSelectRIEf<"risbg", 0xEC55, GR32, GR32>;
+ let CCValues = 0xE, CompareZeroCCMask = 0xE in
+ def RISBG : RotateSelectRIEf<"risbg", 0xEC55, GR64, GR64>;
+}
+
+// On zEC12 we have a variant of RISBG that does not set CC.
+// It is gated on FeatureMiscellaneousExtensions and is not under Defs = [CC].
+let Predicates = [FeatureMiscellaneousExtensions] in
+ def RISBGN : RotateSelectRIEf<"risbgn", 0xEC59, GR64, GR64>;
+
+// Forms of RISBG that only affect one word of the destination register.
+// They do not set CC.
+// RISBMux is a pseudo over GRX32; the RISB{L,H}{L,H} aliases cover each
+// combination of GR32/GRH32 operands; RISBLG and RISBHG are the real
+// encodings, with a GR64 second register class.
+let Predicates = [FeatureHighWord] in {
+ def RISBMux : RotateSelectRIEfPseudo<GRX32, GRX32>;
+ def RISBLL : RotateSelectAliasRIEf<GR32, GR32>;
+ def RISBLH : RotateSelectAliasRIEf<GR32, GRH32>;
+ def RISBHL : RotateSelectAliasRIEf<GRH32, GR32>;
+ def RISBHH : RotateSelectAliasRIEf<GRH32, GRH32>;
+ def RISBLG : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR64>;
+ def RISBHG : RotateSelectRIEf<"risbhg", 0xEC5D, GRH32, GR64>;
+}
+
+// Rotate second operand left and perform a logical operation with selected
+// bits of the first operand. The CC result only describes the selected bits,
+// so isn't useful for a full comparison against zero.
+// Hence Defs = [CC] is set but CCValues/CompareZeroCCMask are not.
+let Defs = [CC] in {
+ def RNSBG : RotateSelectRIEf<"rnsbg", 0xEC54, GR64, GR64>;
+ def ROSBG : RotateSelectRIEf<"rosbg", 0xEC56, GR64, GR64>;
+ def RXSBG : RotateSelectRIEf<"rxsbg", 0xEC57, GR64, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+// Signed comparisons. We put these before the unsigned comparisons because
+// some of the signed forms have COMPARE AND BRANCH equivalents whereas none
+// of the unsigned forms do.
+// All of them define CC with CCValues = 0xE and use the z_scmp operator,
+// except CGFR, which has no pattern of its own.
+let Defs = [CC], CCValues = 0xE in {
+ // Comparison with a register.
+ def CR : CompareRR <"cr", 0x19, z_scmp, GR32, GR32>;
+ // CGFR is referenced by the SXB instantiation after this block.
+ def CGFR : CompareRRE<"cgfr", 0xB930, null_frag, GR64, GR32>;
+ def CGR : CompareRRE<"cgr", 0xB920, z_scmp, GR64, GR64>;
+
+ // Comparison with a signed 16-bit immediate. CHIMux expands to CHI or CIH,
+ // depending on the choice of register.
+ def CHIMux : CompareRIPseudo<z_scmp, GRX32, imm32sx16>,
+ Requires<[FeatureHighWord]>;
+ def CHI : CompareRI<"chi", 0xA7E, z_scmp, GR32, imm32sx16>;
+ def CGHI : CompareRI<"cghi", 0xA7F, z_scmp, GR64, imm64sx16>;
+
+ // Comparison with a signed 32-bit immediate. CFIMux expands to CFI or CIH,
+ // depending on the choice of register.
+ def CFIMux : CompareRIPseudo<z_scmp, GRX32, simm32>,
+ Requires<[FeatureHighWord]>;
+ def CFI : CompareRIL<"cfi", 0xC2D, z_scmp, GR32, simm32>;
+ def CIH : CompareRIL<"cih", 0xCCD, z_scmp, GRH32, simm32>,
+ Requires<[FeatureHighWord]>;
+ def CGFI : CompareRIL<"cgfi", 0xC2C, z_scmp, GR64, imm64sx32>;
+
+ // Comparison with memory.
+ // CMux is the GRX32 pseudo; C and CHF are the GR32 and GRH32 forms.
+ defm CH : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, asextloadi16, 2>;
+ def CMux : CompareRXYPseudo<z_scmp, GRX32, load, 4>,
+ Requires<[FeatureHighWord]>;
+ defm C : CompareRXPair<"c", 0x59, 0xE359, z_scmp, GR32, load, 4>;
+ def CHF : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, load, 4>,
+ Requires<[FeatureHighWord]>;
+ def CGH : CompareRXY<"cgh", 0xE334, z_scmp, GR64, asextloadi16, 2>;
+ def CGF : CompareRXY<"cgf", 0xE330, z_scmp, GR64, asextloadi32, 4>;
+ def CG : CompareRXY<"cg", 0xE320, z_scmp, GR64, load, 8>;
+ // The CompareRILPC forms use the aligned_* load fragments.
+ def CHRL : CompareRILPC<"chrl", 0xC65, z_scmp, GR32, aligned_asextloadi16>;
+ def CRL : CompareRILPC<"crl", 0xC6D, z_scmp, GR32, aligned_load>;
+ def CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_asextloadi16>;
+ def CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_asextloadi32>;
+ def CGRL : CompareRILPC<"cgrl", 0xC68, z_scmp, GR64, aligned_load>;
+
+ // Comparison between memory and a signed 16-bit immediate.
+ def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, asextloadi16, imm32sx16>;
+ def CHSI : CompareSIL<"chsi", 0xE55C, z_scmp, load, imm32sx16>;
+ def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, load, imm64sx16>;
+}
+// Attach the z_scmp patterns for CGFR (presumably the sign-extended GR32
+// operand case -- see the SXB definition to confirm).
+defm : SXB<z_scmp, GR64, CGFR>;
+
+// Unsigned comparisons.
+// Same CC settings as the signed comparisons, plus IsLogical = 1; the
+// operator is z_ucmp, except for CLGFR, which has no pattern of its own.
+let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
+ // Comparison with a register.
+ def CLR : CompareRR <"clr", 0x15, z_ucmp, GR32, GR32>;
+ // CLGFR is referenced by the ZXB instantiation after this block.
+ def CLGFR : CompareRRE<"clgfr", 0xB931, null_frag, GR64, GR32>;
+ def CLGR : CompareRRE<"clgr", 0xB921, z_ucmp, GR64, GR64>;
+
+ // Comparison with an unsigned 32-bit immediate. CLFIMux expands to CLFI
+ // or CLIH, depending on the choice of register.
+ def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def CLFI : CompareRIL<"clfi", 0xC2F, z_ucmp, GR32, uimm32>;
+ def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GRH32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
+
+ // Comparison with memory.
+ // CLMux is the GRX32 pseudo; CL and CLHF are the GR32 and GRH32 forms.
+ def CLMux : CompareRXYPseudo<z_ucmp, GRX32, load, 4>,
+ Requires<[FeatureHighWord]>;
+ defm CL : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load, 4>;
+ def CLHF : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, load, 4>,
+ Requires<[FeatureHighWord]>;
+ def CLGF : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, azextloadi32, 4>;
+ def CLG : CompareRXY<"clg", 0xE321, z_ucmp, GR64, load, 8>;
+ // The CompareRILPC forms use the aligned_* load fragments.
+ def CLHRL : CompareRILPC<"clhrl", 0xC67, z_ucmp, GR32,
+ aligned_azextloadi16>;
+ def CLRL : CompareRILPC<"clrl", 0xC6F, z_ucmp, GR32,
+ aligned_load>;
+ def CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
+ aligned_azextloadi16>;
+ def CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
+ aligned_azextloadi32>;
+ def CLGRL : CompareRILPC<"clgrl", 0xC6A, z_ucmp, GR64,
+ aligned_load>;
+
+ // Comparison between memory and an unsigned 8-bit immediate.
+ defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, azextloadi8, imm32zx8>;
+
+ // Comparison between memory and an unsigned 16-bit immediate.
+ def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, azextloadi16, imm32zx16>;
+ def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
+ def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
+}
+// Attach the z_ucmp patterns for CLGFR (presumably the zero-extended GR32
+// operand case -- see the ZXB definition to confirm).
+defm : ZXB<z_ucmp, GR64, CLGFR>;
+
+// Memory-to-memory comparison.
+// Reads memory and defines CC; no mayStore is set.
+let mayLoad = 1, Defs = [CC] in
+ defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+
+// String comparison.
+let mayLoad = 1, Defs = [CC] in
+ defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>;
+
+// Test under mask.
+// All forms define CC; no CCValues/CompareZeroCCMask are given.
+let Defs = [CC] in {
+ // TMxMux expands to TM[LH]x, depending on the choice of register.
+ def TMLMux : CompareRIPseudo<z_tm_reg, GRX32, imm32ll16>,
+ Requires<[FeatureHighWord]>;
+ def TMHMux : CompareRIPseudo<z_tm_reg, GRX32, imm32lh16>,
+ Requires<[FeatureHighWord]>;
+ def TMLL : CompareRI<"tmll", 0xA71, z_tm_reg, GR32, imm32ll16>;
+ def TMLH : CompareRI<"tmlh", 0xA70, z_tm_reg, GR32, imm32lh16>;
+ def TMHL : CompareRI<"tmhl", 0xA73, z_tm_reg, GRH32, imm32ll16>;
+ def TMHH : CompareRI<"tmhh", 0xA72, z_tm_reg, GRH32, imm32lh16>;
+
+ // GR64 alias forms of the four instructions above.
+ def TMLL64 : CompareAliasRI<z_tm_reg, GR64, imm64ll16>;
+ def TMLH64 : CompareAliasRI<z_tm_reg, GR64, imm64lh16>;
+ def TMHL64 : CompareAliasRI<z_tm_reg, GR64, imm64hl16>;
+ def TMHH64 : CompareAliasRI<z_tm_reg, GR64, imm64hh16>;
+
+ // Test under mask of a byte in memory, using the z_tm_mem operator.
+ defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
+}
+
+// Assembler aliases: "tml" maps to TMLL and "tmh" maps to TMLH.
+def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
+def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch and execution hint
+//===----------------------------------------------------------------------===//
+
+// Prefetch data, in RXY and PC-relative (RILPC) forms; both are matched via
+// z_prefetch.
+def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>;
+def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
+
+// The execution-hint instructions are only enabled with
+// FeatureExecutionHint.
+let Predicates = [FeatureExecutionHint] in {
+ // Branch Prediction Preload
+ def BPP : BranchPreloadSMI<"bpp", 0xC7>;
+ def BPRP : BranchPreloadMII<"bprp", 0xC5>;
+
+ // Next Instruction Access Intent
+ // Takes two 4-bit unsigned immediates.
+ def NIAI : SideEffectBinaryIE<"niai", 0xB2FA, imm32zx4, imm32zx4>;
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+// A serialization instruction that acts as a barrier for all memory
+// accesses, which expands to "bcr 14, 0".
+let hasSideEffects = 1 in
+def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>;
+
+// A pseudo instruction that serves as a compiler barrier.
+let hasSideEffects = 1, hasNoSchedulingInfo = 1 in
+def MemBarrier : Pseudo<(outs), (ins), [(z_membarrier)]>;
+
+// Load-and-operate instructions. They are only enabled with
+// FeatureInterlockedAccess1 and all define CC. LAAL and LAALG have no
+// selection pattern (null_frag); the rest are matched from the
+// atomic_load_{add,and,or,xor}_{32,64} operators.
+let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
+ def LAA : LoadAndOpRSY<"laa", 0xEBF8, atomic_load_add_32, GR32>;
+ def LAAG : LoadAndOpRSY<"laag", 0xEBE8, atomic_load_add_64, GR64>;
+ def LAAL : LoadAndOpRSY<"laal", 0xEBFA, null_frag, GR32>;
+ def LAALG : LoadAndOpRSY<"laalg", 0xEBEA, null_frag, GR64>;
+ def LAN : LoadAndOpRSY<"lan", 0xEBF4, atomic_load_and_32, GR32>;
+ def LANG : LoadAndOpRSY<"lang", 0xEBE4, atomic_load_and_64, GR64>;
+ def LAO : LoadAndOpRSY<"lao", 0xEBF6, atomic_load_or_32, GR32>;
+ def LAOG : LoadAndOpRSY<"laog", 0xEBE6, atomic_load_or_64, GR64>;
+ def LAX : LoadAndOpRSY<"lax", 0xEBF7, atomic_load_xor_32, GR32>;
+ def LAXG : LoadAndOpRSY<"laxg", 0xEBE7, atomic_load_xor_64, GR64>;
+}
+
+// Atomic read-modify-write pseudo instructions.
+//
+// Naming used below: ATOMIC_LOADW_* / ATOMIC_SWAPW use the z_atomic_*w
+// operators; the *_32 and *_64 (or GR-suffixed) forms use the generic
+// atomic_* operators of that width. The 32- and 64-bit add/and/or/xor
+// pseudos are restricted to FeatureNoInterlockedAccess1, complementing the
+// load-and-operate instructions that require FeatureInterlockedAccess1.
+
+// Swap.
+def ATOMIC_SWAPW : AtomicLoadWBinaryReg<z_atomic_swapw>;
+def ATOMIC_SWAP_32 : AtomicLoadBinaryReg32<atomic_swap_32>;
+def ATOMIC_SWAP_64 : AtomicLoadBinaryReg64<atomic_swap_64>;
+
+// Add.
+def ATOMIC_LOADW_AR : AtomicLoadWBinaryReg<z_atomic_loadw_add>;
+def ATOMIC_LOADW_AFI : AtomicLoadWBinaryImm<z_atomic_loadw_add, simm32>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+ def ATOMIC_LOAD_AR : AtomicLoadBinaryReg32<atomic_load_add_32>;
+ def ATOMIC_LOAD_AHI : AtomicLoadBinaryImm32<atomic_load_add_32, imm32sx16>;
+ def ATOMIC_LOAD_AFI : AtomicLoadBinaryImm32<atomic_load_add_32, simm32>;
+ def ATOMIC_LOAD_AGR : AtomicLoadBinaryReg64<atomic_load_add_64>;
+ def ATOMIC_LOAD_AGHI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx16>;
+ def ATOMIC_LOAD_AGFI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx32>;
+}
+
+// Subtract. Only register forms are defined, and they are not predicated.
+def ATOMIC_LOADW_SR : AtomicLoadWBinaryReg<z_atomic_loadw_sub>;
+def ATOMIC_LOAD_SR : AtomicLoadBinaryReg32<atomic_load_sub_32>;
+def ATOMIC_LOAD_SGR : AtomicLoadBinaryReg64<atomic_load_sub_64>;
+
+// AND.
+def ATOMIC_LOADW_NR : AtomicLoadWBinaryReg<z_atomic_loadw_and>;
+def ATOMIC_LOADW_NILH : AtomicLoadWBinaryImm<z_atomic_loadw_and, imm32lh16c>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+ def ATOMIC_LOAD_NR : AtomicLoadBinaryReg32<atomic_load_and_32>;
+ def ATOMIC_LOAD_NILL : AtomicLoadBinaryImm32<atomic_load_and_32,
+ imm32ll16c>;
+ def ATOMIC_LOAD_NILH : AtomicLoadBinaryImm32<atomic_load_and_32,
+ imm32lh16c>;
+ def ATOMIC_LOAD_NILF : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
+ def ATOMIC_LOAD_NGR : AtomicLoadBinaryReg64<atomic_load_and_64>;
+ def ATOMIC_LOAD_NILL64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64ll16c>;
+ def ATOMIC_LOAD_NILH64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64lh16c>;
+ def ATOMIC_LOAD_NIHL64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64hl16c>;
+ def ATOMIC_LOAD_NIHH64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64hh16c>;
+ def ATOMIC_LOAD_NILF64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64lf32c>;
+ def ATOMIC_LOAD_NIHF64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64hf32c>;
+}
+
+// OR.
+def ATOMIC_LOADW_OR : AtomicLoadWBinaryReg<z_atomic_loadw_or>;
+def ATOMIC_LOADW_OILH : AtomicLoadWBinaryImm<z_atomic_loadw_or, imm32lh16>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+ def ATOMIC_LOAD_OR : AtomicLoadBinaryReg32<atomic_load_or_32>;
+ def ATOMIC_LOAD_OILL : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
+ def ATOMIC_LOAD_OILH : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
+ def ATOMIC_LOAD_OILF : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
+ def ATOMIC_LOAD_OGR : AtomicLoadBinaryReg64<atomic_load_or_64>;
+ def ATOMIC_LOAD_OILL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
+ def ATOMIC_LOAD_OILH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
+ def ATOMIC_LOAD_OIHL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
+ def ATOMIC_LOAD_OIHH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
+ def ATOMIC_LOAD_OILF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
+ def ATOMIC_LOAD_OIHF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
+}
+
+// XOR.
+def ATOMIC_LOADW_XR : AtomicLoadWBinaryReg<z_atomic_loadw_xor>;
+def ATOMIC_LOADW_XILF : AtomicLoadWBinaryImm<z_atomic_loadw_xor, uimm32>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+ def ATOMIC_LOAD_XR : AtomicLoadBinaryReg32<atomic_load_xor_32>;
+ def ATOMIC_LOAD_XILF : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
+ def ATOMIC_LOAD_XGR : AtomicLoadBinaryReg64<atomic_load_xor_64>;
+ def ATOMIC_LOAD_XILF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
+ def ATOMIC_LOAD_XIHF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
+}
+
+// NAND. These carry an "i" suffix on the corresponding AND pseudo names and
+// are not predicated on FeatureNoInterlockedAccess1.
+def ATOMIC_LOADW_NRi : AtomicLoadWBinaryReg<z_atomic_loadw_nand>;
+def ATOMIC_LOADW_NILHi : AtomicLoadWBinaryImm<z_atomic_loadw_nand,
+ imm32lh16c>;
+def ATOMIC_LOAD_NRi : AtomicLoadBinaryReg32<atomic_load_nand_32>;
+def ATOMIC_LOAD_NILLi : AtomicLoadBinaryImm32<atomic_load_nand_32,
+ imm32ll16c>;
+def ATOMIC_LOAD_NILHi : AtomicLoadBinaryImm32<atomic_load_nand_32,
+ imm32lh16c>;
+def ATOMIC_LOAD_NILFi : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
+def ATOMIC_LOAD_NGRi : AtomicLoadBinaryReg64<atomic_load_nand_64>;
+def ATOMIC_LOAD_NILL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64ll16c>;
+def ATOMIC_LOAD_NILH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64lh16c>;
+def ATOMIC_LOAD_NIHL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64hl16c>;
+def ATOMIC_LOAD_NIHH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64hh16c>;
+def ATOMIC_LOAD_NILF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64lf32c>;
+def ATOMIC_LOAD_NIHF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64hf32c>;
+
+// Signed minimum.
+def ATOMIC_LOADW_MIN : AtomicLoadWBinaryReg<z_atomic_loadw_min>;
+def ATOMIC_LOAD_MIN_32 : AtomicLoadBinaryReg32<atomic_load_min_32>;
+def ATOMIC_LOAD_MIN_64 : AtomicLoadBinaryReg64<atomic_load_min_64>;
+
+// Signed maximum.
+def ATOMIC_LOADW_MAX : AtomicLoadWBinaryReg<z_atomic_loadw_max>;
+def ATOMIC_LOAD_MAX_32 : AtomicLoadBinaryReg32<atomic_load_max_32>;
+def ATOMIC_LOAD_MAX_64 : AtomicLoadBinaryReg64<atomic_load_max_64>;
+
+// Unsigned minimum.
+def ATOMIC_LOADW_UMIN : AtomicLoadWBinaryReg<z_atomic_loadw_umin>;
+def ATOMIC_LOAD_UMIN_32 : AtomicLoadBinaryReg32<atomic_load_umin_32>;
+def ATOMIC_LOAD_UMIN_64 : AtomicLoadBinaryReg64<atomic_load_umin_64>;
+
+// Unsigned maximum.
+def ATOMIC_LOADW_UMAX : AtomicLoadWBinaryReg<z_atomic_loadw_umax>;
+def ATOMIC_LOAD_UMAX_32 : AtomicLoadBinaryReg32<atomic_load_umax_32>;
+def ATOMIC_LOAD_UMAX_64 : AtomicLoadBinaryReg64<atomic_load_umax_64>;
+
+// Compare-and-swap pseudo matched from z_atomic_cmp_swapw.
+// Inputs: a bdaddr20only address, the GR32 compare and swap values, two
+// ADDR32 shift amounts ($bitshift, $negbitshift) and a uimm32 $bitsize.
+// Output: a GR32 $dst.
+// It defines CC, may load and store, is expanded by a custom inserter, and
+// has no scheduling info.
+def ATOMIC_CMP_SWAPW
+ : Pseudo<(outs GR32:$dst), (ins bdaddr20only:$addr, GR32:$cmp, GR32:$swap,
+ ADDR32:$bitshift, ADDR32:$negbitshift,
+ uimm32:$bitsize),
+ [(set GR32:$dst,
+ (z_atomic_cmp_swapw bdaddr20only:$addr, GR32:$cmp, GR32:$swap,
+ ADDR32:$bitshift, ADDR32:$negbitshift,
+ uimm32:$bitsize))]> {
+ let Defs = [CC];
+ let mayLoad = 1;
+ let mayStore = 1;
+ let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
+// Test and set.
+// No selection pattern (null_frag); marked mayLoad and defines CC.
+let mayLoad = 1, Defs = [CC] in
+ def TS : StoreInherentS<"ts", 0x9300, null_frag, 1>;
+
+// Compare and swap.
+// Matched from atomic_cmp_swap_32 / atomic_cmp_swap_64; both define CC.
+let Defs = [CC] in {
+ defm CS : CmpSwapRSPair<"cs", 0xBA, 0xEB14, atomic_cmp_swap_32, GR32>;
+ def CSG : CmpSwapRSY<"csg", 0xEB30, atomic_cmp_swap_64, GR64>;
+}
+
+// Compare double and swap.
+// GR128 operands, no selection pattern (null_frag).
+let Defs = [CC] in {
+ defm CDS : CmpSwapRSPair<"cds", 0xBB, 0xEB31, null_frag, GR128>;
+ def CDSG : CmpSwapRSY<"cdsg", 0xEB3E, null_frag, GR128>;
+}
+
+// Compare and swap and store.
+// Implicitly uses R0L and R1D.
+let Uses = [R0L, R1D], Defs = [CC], mayStore = 1, mayLoad = 1 in
+ def CSST : SideEffectTernarySSF<"csst", 0xC82, GR64>;
+
+// Perform locked operation.
+// Implicitly uses R0L and R1D.
+let Uses = [R0L, R1D], Defs = [CC], mayStore = 1, mayLoad =1 in
+ def PLO : SideEffectQuaternarySSe<"plo", 0xEE, GR64>;
+
+// Load/store pair from/to quadword.
+// GR128 operand, no selection pattern (null_frag).
+def LPQ : UnaryRXY<"lpq", 0xE38F, null_frag, GR128, 16>;
+def STPQ : StoreRXY<"stpq", 0xE38E, null_frag, GR128, 16>;
+
+// Load pair disjoint.
+// Only enabled with FeatureInterlockedAccess1; both forms define CC.
+let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
+ def LPD : BinarySSF<"lpd", 0xC84, GR128>;
+ def LPDG : BinarySSF<"lpdg", 0xC85, GR128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Read a 32-bit access register into a GR32. As with all GR32 operations,
+// the upper 32 bits of the enclosing GR64 remain unchanged, which is useful
+// when a 64-bit address is stored in a pair of access registers.
+// None of the access-register instructions in this section has a selection
+// pattern (null_frag or none given).
+def EAR : UnaryRRE<"ear", 0xB24F, null_frag, GR32, AR32>;
+
+// Set access register.
+// Destination is an AR32, source a GR32 (the reverse of EAR).
+def SAR : UnaryRRE<"sar", 0xB24E, null_frag, AR32, GR32>;
+
+// Copy access register.
+def CPYA : UnaryRRE<"cpya", 0xB24D, null_frag, AR32, AR32>;
+
+// Load address extended.
+defm LAE : LoadAddressRXPair<"lae", 0x51, 0xE375, null_frag>;
+
+// Load access multiple.
+defm LAM : LoadMultipleRSPair<"lam", 0x9A, 0xEB9A, AR32>;
+
+// Store access multiple.
+defm STAM : StoreMultipleRSPair<"stam", 0x9B, 0xEB9B, AR32>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Extract CC and program mask into a register. CC ends up in bits 29 and 28.
+// Reads CC (Uses) and is matched from z_ipm.
+let Uses = [CC] in
+ def IPM : InherentRRE<"ipm", 0xB222, GR32, z_ipm>;
+
+// Set CC and program mask from a register.
+let hasSideEffects = 1, Defs = [CC] in
+ def SPM : SideEffectUnaryRR<"spm", 0x04, GR32>;
+
+// Branch and link - like BAS, but also extracts CC and program mask.
+// Modelled as calls that both read and write CC.
+let isCall = 1, Uses = [CC], Defs = [CC] in {
+ def BAL : CallRX<"bal", 0x45>;
+ def BALR : CallRR<"balr", 0x05>;
+}
+
+// Test addressing mode.
+// The result is reported in CC.
+let Defs = [CC] in
+ def TAM : SideEffectInherentE<"tam", 0x010B>;
+
+// Set addressing mode.
+let hasSideEffects = 1 in {
+ def SAM24 : SideEffectInherentE<"sam24", 0x010C>;
+ def SAM31 : SideEffectInherentE<"sam31", 0x010D>;
+ def SAM64 : SideEffectInherentE<"sam64", 0x010E>;
+}
+
+// Branch and set mode. Not really a call, but also sets an output register.
+// It reuses the CallRR format but is flagged as a terminating barrier
+// branch rather than with isCall.
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in
+ def BSM : CallRR<"bsm", 0x0B>;
+
+// Branch and save and set mode.
+let isCall = 1, Defs = [CC] in
+ def BASSM : CallRR<"bassm", 0x0C>;
+
+//===----------------------------------------------------------------------===//
+// Transactional execution
+//===----------------------------------------------------------------------===//
+
+// Everything in this section requires FeatureTransactionalExecution and is
+// flagged hasSideEffects.
+let hasSideEffects = 1, Predicates = [FeatureTransactionalExecution] in {
+ // Transaction Begin
+ // These may store, define CC, and are expanded by a custom inserter.
+ // TBEGIN_nofloat is a pseudo matched from z_tbegin_nofloat.
+ let mayStore = 1, usesCustomInserter = 1, Defs = [CC] in {
+ def TBEGIN : SideEffectBinarySIL<"tbegin", 0xE560, z_tbegin, imm32zx16>;
+ def TBEGIN_nofloat : SideEffectBinarySILPseudo<z_tbegin_nofloat, imm32zx16>;
+
+ def TBEGINC : SideEffectBinarySIL<"tbeginc", 0xE561,
+ int_s390_tbeginc, imm32zx16>;
+ }
+
+ // Transaction End
+ let Defs = [CC] in
+ def TEND : SideEffectInherentS<"tend", 0xB2F8, z_tend>;
+
+ // Transaction Abort
+ // Modelled as a terminator and barrier.
+ let isTerminator = 1, isBarrier = 1 in
+ def TABORT : SideEffectAddressS<"tabort", 0xB2FC, int_s390_tabort>;
+
+ // Nontransactional Store
+ def NTSTG : StoreRXY<"ntstg", 0xE325, int_s390_ntstg, GR64, 8>;
+
+ // Extract Transaction Nesting Depth
+ def ETND : InherentRRE<"etnd", 0xB2EC, GR32, int_s390_etnd>;
+}
+
+//===----------------------------------------------------------------------===//
+// Processor assist
+//===----------------------------------------------------------------------===//
+
+// PPA takes two GR64 operands and a 4-bit unsigned immediate. The pattern
+// maps int_s390_ppa_txassist onto it: the GR32 source is placed in the low
+// 32 bits (subreg_l32) of an otherwise undefined i64, and the remaining two
+// operands are the constants 0 and 1.
+let Predicates = [FeatureProcessorAssist] in {
+ let hasSideEffects = 1 in
+ def PPA : SideEffectTernaryRRFc<"ppa", 0xB2E8, GR64, GR64, imm32zx4>;
+ def : Pat<(int_s390_ppa_txassist GR32:$src),
+ (PPA (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32),
+ 0, 1)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one, AKA count leading zeros. The instruction actually
+// returns a pair of GR64s, the first giving the number of leading zeros
+// and the second giving a copy of the source with the leftmost one bit
+// cleared. We only use the first result here.
+let Defs = [CC] in
+ def FLOGR : UnaryRRE<"flogr", 0xB983, null_frag, GR128, GR64>;
+def : Pat<(ctlz GR64:$src),
+ (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
+
+// Population count. Counts bits set per byte.
+let Predicates = [FeaturePopulationCount], Defs = [CC] in
+ def POPCNT : UnaryRRE<"popcnt", 0xB9E1, z_popcnt, GR64, GR64>;
+
+// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
+def : Pat<(i64 (anyext GR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
+
+// Extend GR32s and GR64s to GR128s.
+let usesCustomInserter = 1 in {
+ def AEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+ def ZEXT128_32 : Pseudo<(outs GR128:$dst), (ins GR32:$src), []>;
+ def ZEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+}
+
+// Search a block of memory for a character.
+// Flagged as loading from memory and as defining CC; selected for
+// z_search_string.
+let mayLoad = 1, Defs = [CC] in
+  defm SRST : StringRRE<"srst", 0xB25E, z_search_string>;
+
+// Supervisor call.
+let hasSideEffects = 1, isCall = 1, Defs = [CC] in
+ def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>;
+
+// Store clock.
+let hasSideEffects = 1, Defs = [CC] in {
+ def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>;
+ def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>;
+ def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>;
+}
+
+// Store facility list.
+let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
+ def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>;
+
+// Extract CPU time.
+let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in
+ def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>;
+
+// Execute.
+let hasSideEffects = 1 in {
+ def EX : SideEffectBinaryRX<"ex", 0x44, GR64>;
+ def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>;
+}
+
+// Program return.
+let hasSideEffects = 1, Defs = [CC] in
+ def PR : SideEffectInherentE<"pr", 0x0101>;
+
+// Move with key.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in
+ def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>;
+
+// Store real address.
+def STRAG : StoreSSE<"strag", 0xE502>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+let isCodeGenOnly = 1 in {
+ def InsnE : DirectiveInsnE<(outs), (ins imm64zx16:$enc), ".insn e,$enc", []>;
+ def InsnRI : DirectiveInsnRI<(outs), (ins imm64zx32:$enc, AnyReg:$R1,
+ imm32sx16:$I2),
+ ".insn ri,$enc,$R1,$I2", []>;
+ def InsnRIE : DirectiveInsnRIE<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+ AnyReg:$R3, brtarget16:$I2),
+ ".insn rie,$enc,$R1,$R3,$I2", []>;
+ def InsnRIL : DirectiveInsnRIL<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+ brtarget32:$I2),
+ ".insn ril,$enc,$R1,$I2", []>;
+ def InsnRILU : DirectiveInsnRIL<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+ uimm32:$I2),
+ ".insn rilu,$enc,$R1,$I2", []>;
+ def InsnRIS : DirectiveInsnRIS<(outs),
+ (ins imm64zx48:$enc, AnyReg:$R1,
+ imm32sx8:$I2, imm32zx4:$M3,
+ bdaddr12only:$BD4),
+ ".insn ris,$enc,$R1,$I2,$M3,$BD4", []>;
+ def InsnRR : DirectiveInsnRR<(outs),
+ (ins imm64zx16:$enc, AnyReg:$R1, AnyReg:$R2),
+ ".insn rr,$enc,$R1,$R2", []>;
+ def InsnRRE : DirectiveInsnRRE<(outs), (ins imm64zx32:$enc,
+ AnyReg:$R1, AnyReg:$R2),
+ ".insn rre,$enc,$R1,$R2", []>;
+ def InsnRRF : DirectiveInsnRRF<(outs),
+ (ins imm64zx32:$enc, AnyReg:$R1, AnyReg:$R2,
+ AnyReg:$R3, imm32zx4:$M4),
+ ".insn rrf,$enc,$R1,$R2,$R3,$M4", []>;
+ def InsnRRS : DirectiveInsnRRS<(outs),
+ (ins imm64zx48:$enc, AnyReg:$R1,
+ AnyReg:$R2, imm32zx4:$M3,
+ bdaddr12only:$BD4),
+ ".insn rrs,$enc,$R1,$R2,$M3,$BD4", []>;
+ def InsnRS : DirectiveInsnRS<(outs),
+ (ins imm64zx32:$enc, AnyReg:$R1,
+ AnyReg:$R3, bdaddr12only:$BD2),
+ ".insn rs,$enc,$R1,$R3,$BD2", []>;
+ def InsnRSE : DirectiveInsnRSE<(outs),
+ (ins imm64zx48:$enc, AnyReg:$R1,
+ AnyReg:$R3, bdaddr12only:$BD2),
+ ".insn rse,$enc,$R1,$R3,$BD2", []>;
+ def InsnRSI : DirectiveInsnRSI<(outs),
+ (ins imm64zx48:$enc, AnyReg:$R1,
+ AnyReg:$R3, brtarget16:$RI2),
+ ".insn rsi,$enc,$R1,$R3,$RI2", []>;
+ def InsnRSY : DirectiveInsnRSY<(outs),
+ (ins imm64zx48:$enc, AnyReg:$R1,
+ AnyReg:$R3, bdaddr20only:$BD2),
+ ".insn rsy,$enc,$R1,$R3,$BD2", []>;
+ def InsnRX : DirectiveInsnRX<(outs), (ins imm64zx32:$enc, AnyReg:$R1,
+ bdxaddr12only:$XBD2),
+ ".insn rx,$enc,$R1,$XBD2", []>;
+ def InsnRXE : DirectiveInsnRXE<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+ bdxaddr12only:$XBD2),
+ ".insn rxe,$enc,$R1,$XBD2", []>;
+ def InsnRXF : DirectiveInsnRXF<(outs),
+ (ins imm64zx48:$enc, AnyReg:$R1,
+ AnyReg:$R3, bdxaddr12only:$XBD2),
+ ".insn rxf,$enc,$R1,$R3,$XBD2", []>;
+ def InsnRXY : DirectiveInsnRXY<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+ bdxaddr20only:$XBD2),
+ ".insn rxy,$enc,$R1,$XBD2", []>;
+ def InsnS : DirectiveInsnS<(outs),
+ (ins imm64zx32:$enc, bdaddr12only:$BD2),
+ ".insn s,$enc,$BD2", []>;
+ def InsnSI : DirectiveInsnSI<(outs),
+ (ins imm64zx32:$enc, bdaddr12only:$BD1,
+ imm32sx8:$I2),
+ ".insn si,$enc,$BD1,$I2", []>;
+ def InsnSIY : DirectiveInsnSIY<(outs),
+ (ins imm64zx48:$enc,
+ bdaddr20only:$BD1, imm32zx8:$I2),
+ ".insn siy,$enc,$BD1,$I2", []>;
+ def InsnSIL : DirectiveInsnSIL<(outs),
+ (ins imm64zx48:$enc, bdaddr12only:$BD1,
+ imm32zx16:$I2),
+ ".insn sil,$enc,$BD1,$I2", []>;
+ def InsnSS : DirectiveInsnSS<(outs),
+ (ins imm64zx48:$enc, bdraddr12only:$RBD1,
+ bdaddr12only:$BD2, AnyReg:$R3),
+ ".insn ss,$enc,$RBD1,$BD2,$R3", []>;
+ def InsnSSE : DirectiveInsnSSE<(outs),
+ (ins imm64zx48:$enc,
+ bdaddr12only:$BD1,bdaddr12only:$BD2),
+ ".insn sse,$enc,$BD1,$BD2", []>;
+ def InsnSSF : DirectiveInsnSSF<(outs),
+ (ins imm64zx48:$enc, bdaddr12only:$BD1,
+ bdaddr12only:$BD2, AnyReg:$R3),
+ ".insn ssf,$enc,$BD1,$BD2,$R3", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Peepholes.
+//===----------------------------------------------------------------------===//
+
+// Use AL* for GR64 additions of unsigned 32-bit values.
+defm : ZXB<add, GR64, ALGFR>;
+def : Pat<(add GR64:$src1, imm64zx32:$src2),
+ (ALGFI GR64:$src1, imm64zx32:$src2)>;
+def : Pat<(add GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
+ (ALGF GR64:$src1, bdxaddr20only:$addr)>;
+
+// Use SL* for GR64 subtractions of unsigned 32-bit values.
+// The immediate form deliberately matches an "add" of imm64zx32n and selects
+// the subtract instruction SLGFI (presumably imm64zx32n accepts immediates
+// whose negation is an unsigned 32-bit value -- confirm in SystemZOperands.td).
+defm : ZXB<sub, GR64, SLGFR>;
+def : Pat<(add GR64:$src1, imm64zx32n:$src2),
+ (SLGFI GR64:$src1, imm64zx32n:$src2)>;
+def : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
+ (SLGF GR64:$src1, bdxaddr20only:$addr)>;
+
+// Optimize sign-extended 1/0 selects to -1/0 selects. This is important
+// for vector legalization.
+def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, imm32zx4:$cc)),
+ (i32 31)),
+ (i32 31)),
+ (Select32 (LHI -1), (LHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
+def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, imm32zx4:$valid,
+ imm32zx4:$cc)))),
+ (i32 63)),
+ (i32 63)),
+ (Select64 (LGHI -1), (LGHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
+
+// Avoid generating 2 XOR instructions. (and (xor x, -1), y) is matched here
+// and selected as the equivalent (xor (and x, y), y): an NGR followed by an XGR.
+def : Pat<(and (xor GR64:$x, (i64 -1)), GR64:$y),
+ (XGR GR64:$y, (NGR GR64:$y, GR64:$x))>;
+
+// Shift/rotate instructions only use the last 6 bits of the second operand
+// register, so we can safely use NILL (16 fewer bits than NILF) to only AND the
+// last 16 bits.
+// Complexity is added so that we match this before we match NILF on the AND
+// operation alone.
+//
+// MaskedShiftPat matches "(shiftop val, (and shift, imm))" for a value held in
+// register operand cls and selects insn, with the masked shift amount computed
+// by NILL and a zero displacement.
+// NOTE(review): the mask is matched as uimm32 but NILL only encodes 16 bits;
+// confirm that masks wider than 16 bits are truncated or rejected before
+// they reach NILL.
+class MaskedShiftPat<SDPatternOperator shiftop, RegisterOperand cls,
+                     Instruction insn>
+  : Pat<(shiftop cls:$val, (and GR32:$shift, uimm32:$imm)),
+        (insn cls:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+let AddedComplexity = 4 in {
+  // 32-bit shifts.
+  def : MaskedShiftPat<shl, GR32, SLL>;
+  def : MaskedShiftPat<sra, GR32, SRA>;
+  def : MaskedShiftPat<srl, GR32, SRL>;
+
+  // 64-bit shifts.
+  def : MaskedShiftPat<shl, GR64, SLLG>;
+  def : MaskedShiftPat<sra, GR64, SRAG>;
+  def : MaskedShiftPat<srl, GR64, SRLG>;
+
+  // 32-bit and 64-bit rotates.
+  def : MaskedShiftPat<rotl, GR32, RLL>;
+  def : MaskedShiftPat<rotl, GR64, RLLG>;
+}
+
+// Peepholes for turning scalar operations into block operations.
+defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i32, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 2>;
+defm : BlockLoadStore<load, i32, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 4>;
+defm : BlockLoadStore<anyextloadi8, i64, MVCSequence, NCSequence,
+ OCSequence, XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i64, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 2>;
+defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 4>;
+defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 8>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td
new file mode 100644
index 000000000000..738ea7a33729
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -0,0 +1,1200 @@
+//==- SystemZInstrVector.td - SystemZ Vector instructions ------*- tblgen-*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Vector register moves and GR <-> VR element transfers; all definitions
+// require FeatureVector.
+let Predicates = [FeatureVector] in {
+ // Register move.
+ // VLR32 and VLR64 are alias forms on the v32eb and v64db operands.
+ def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>;
+ def VLR32 : UnaryAliasVRR<null_frag, v32eb, v32eb>;
+ def VLR64 : UnaryAliasVRR<null_frag, v64db, v64db>;
+
+ // Load GR from VR element.
+ // Only VLGVG carries a selection pattern here; the B/H/F forms use
+ // null_frag and are matched by the VectorExtractSubreg patterns instead.
+ def VLGV : BinaryVRScGeneric<"vlgv", 0xE721>;
+ def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>;
+ def VLGVH : BinaryVRSc<"vlgvh", 0xE721, null_frag, v128h, 1>;
+ def VLGVF : BinaryVRSc<"vlgvf", 0xE721, null_frag, v128f, 2>;
+ def VLGVG : BinaryVRSc<"vlgvg", 0xE721, z_vector_extract, v128g, 3>;
+
+ // Load VR element from GR.
+ // The B/H/F forms take the element from a GR32, the G form from a GR64.
+ def VLVG : TernaryVRSbGeneric<"vlvg", 0xE722>;
+ def VLVGB : TernaryVRSb<"vlvgb", 0xE722, z_vector_insert,
+ v128b, v128b, GR32, 0>;
+ def VLVGH : TernaryVRSb<"vlvgh", 0xE722, z_vector_insert,
+ v128h, v128h, GR32, 1>;
+ def VLVGF : TernaryVRSb<"vlvgf", 0xE722, z_vector_insert,
+ v128f, v128f, GR32, 2>;
+ def VLVGG : TernaryVRSb<"vlvgg", 0xE722, z_vector_insert,
+ v128g, v128g, GR64, 3>;
+
+ // Load VR from GRs disjoint.
+ // VLVGP is selected for z_join_dwords; VLVGP32 is an alias form on GR32.
+ def VLVGP : BinaryVRRf<"vlvgp", 0xE762, z_join_dwords, v128g>;
+ def VLVGP32 : BinaryAliasVRRf<GR32>;
+}
+
+// An element extraction writes a whole GR64 regardless of how narrow the
+// element is.  For results narrower than i64, select the extracting
+// instruction and then take the low 32-bit subregister of what it produced.
+class VectorExtractSubreg<ValueType vecty, Instruction extractinsn>
+  : Pat<(i32 (z_vector_extract (vecty VR128:$src), shift12only:$elt)),
+        (EXTRACT_SUBREG (extractinsn VR128:$src, shift12only:$elt),
+                        subreg_l32)>;
+
+// One instance per sub-i64 integer element type.
+def : VectorExtractSubreg<v16i8, VLGVB>;
+def : VectorExtractSubreg<v8i16, VLGVH>;
+def : VectorExtractSubreg<v4i32, VLGVF>;
+
+//===----------------------------------------------------------------------===//
+// Immediate instructions
+//===----------------------------------------------------------------------===//
+
+// Instructions that build a vector from immediates; all require FeatureVector.
+let Predicates = [FeatureVector] in {
+ // Generate byte mask.
+ // VZERO and VONE use VGBM's opcode with the fixed values 0 and 0xffff.
+ def VZERO : InherentVRIa<"vzero", 0xE744, 0>;
+ def VONE : InherentVRIa<"vone", 0xE744, 0xffff>;
+ def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>;
+
+ // Generate mask.
+ def VGM : BinaryVRIbGeneric<"vgm", 0xE746>;
+ def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>;
+ def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>;
+ def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>;
+ def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>;
+
+ // Load element immediate.
+ //
+ // We want these instructions to be used ahead of VLVG* where possible.
+ // However, VLVG* takes a variable BD-format index whereas VLEI takes
+ // a plain immediate index. This means that VLVG* has an extra "base"
+ // register operand and is 3 units more complex. Bumping the complexity
+ // of the VLEI* instructions by 4 means that they are strictly better
+ // than VLVG* in cases where both forms match.
+ let AddedComplexity = 4 in {
+ def VLEIB : TernaryVRIa<"vleib", 0xE740, z_vector_insert,
+ v128b, v128b, imm32sx16trunc, imm32zx4>;
+ def VLEIH : TernaryVRIa<"vleih", 0xE741, z_vector_insert,
+ v128h, v128h, imm32sx16trunc, imm32zx3>;
+ def VLEIF : TernaryVRIa<"vleif", 0xE743, z_vector_insert,
+ v128f, v128f, imm32sx16, imm32zx2>;
+ def VLEIG : TernaryVRIa<"vleig", 0xE742, z_vector_insert,
+ v128g, v128g, imm64sx16, imm32zx1>;
+ }
+
+ // Replicate immediate.
+ // The typed forms pass 0-3 as the last argument for B/H/F/G (presumably
+ // the element-size code -- confirm in SystemZInstrFormats.td).
+ def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>;
+ def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>;
+ def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>;
+ def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>;
+ def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>;
+}
+
+//===----------------------------------------------------------------------===//
+// Loads
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+ // Load.
+ def VL : UnaryVRX<"vl", 0xE706, null_frag, v128any, 16>;
+
+ // Load to block boundary. The number of loaded bytes is only known
+ // at run time. The instruction is really polymorphic, but v128b matches
+ // the return type of the associated intrinsic.
+ def VLBB : BinaryVRX<"vlbb", 0xE707, int_s390_vlbb, v128b, 0>;
+
+ // Load count to block boundary.
+ let Defs = [CC] in
+ def LCBB : InstRXE<0xE727, (outs GR32:$R1),
+ (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
+ "lcbb\t$R1, $XBD2, $M3",
+ [(set GR32:$R1, (int_s390_lcbb bdxaddr12only:$XBD2,
+ imm32zx4:$M3))]>;
+
+ // Load with length. The number of loaded bytes is only known at run time.
+ def VLL : BinaryVRSb<"vll", 0xE737, int_s390_vll, 0>;
+
+ // Load multiple.
+ def VLM : LoadMultipleVRSa<"vlm", 0xE736>;
+
+ // Load and replicate.
+ def VLREP : UnaryVRXGeneric<"vlrep", 0xE705>;
+ def VLREPB : UnaryVRX<"vlrepb", 0xE705, z_replicate_loadi8, v128b, 1, 0>;
+ def VLREPH : UnaryVRX<"vlreph", 0xE705, z_replicate_loadi16, v128h, 2, 1>;
+ def VLREPF : UnaryVRX<"vlrepf", 0xE705, z_replicate_loadi32, v128f, 4, 2>;
+ def VLREPG : UnaryVRX<"vlrepg", 0xE705, z_replicate_loadi64, v128g, 8, 3>;
+ def : Pat<(v4f32 (z_replicate_loadf32 bdxaddr12only:$addr)),
+ (VLREPF bdxaddr12only:$addr)>;
+ def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)),
+ (VLREPG bdxaddr12only:$addr)>;
+
+ // Use VLREP to load subvectors. These patterns use "12pair" because
+ // LEY and LDY offer full 20-bit displacement fields. It's often better
+ // to use those instructions rather than force a 20-bit displacement
+ // into a GPR temporary.
+ def VL32 : UnaryAliasVRX<load, v32eb, bdxaddr12pair>;
+ def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+
+ // Load logical element and zero.
+ def VLLEZ : UnaryVRXGeneric<"vllez", 0xE704>;
+ def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8, v128b, 1, 0>;
+ def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>;
+ def VLLEZF : UnaryVRX<"vllezf", 0xE704, z_vllezi32, v128f, 4, 2>;
+ def VLLEZG : UnaryVRX<"vllezg", 0xE704, z_vllezi64, v128g, 8, 3>;
+ def : Pat<(v4f32 (z_vllezf32 bdxaddr12only:$addr)),
+ (VLLEZF bdxaddr12only:$addr)>;
+ def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)),
+ (VLLEZG bdxaddr12only:$addr)>;
+
+ // Load element.
+ def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>;
+ def VLEH : TernaryVRX<"vleh", 0xE701, z_vlei16, v128h, v128h, 2, imm32zx3>;
+ def VLEF : TernaryVRX<"vlef", 0xE703, z_vlei32, v128f, v128f, 4, imm32zx2>;
+ def VLEG : TernaryVRX<"vleg", 0xE702, z_vlei64, v128g, v128g, 8, imm32zx1>;
+ def : Pat<(z_vlef32 (v4f32 VR128:$val), bdxaddr12only:$addr, imm32zx2:$index),
+ (VLEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>;
+ def : Pat<(z_vlef64 (v2f64 VR128:$val), bdxaddr12only:$addr, imm32zx1:$index),
+ (VLEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
+
+ // Gather element.
+ def VGEF : TernaryVRV<"vgef", 0xE713, 4, imm32zx2>;
+ def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>;
+}
+
+// When a single loaded element is placed into an otherwise undefined vector,
+// select a replicating load instead.  Doing so avoids a false dependency on
+// whatever the destination register previously held.
+multiclass ReplicatePeephole<Instruction replload, ValueType vecty,
+                             SDPatternOperator loadop, ValueType eltty> {
+  // Loaded scalar inserted at index 0 of an undef vector.
+  def : Pat<(vecty (z_vector_insert (undef),
+                                    (eltty (loadop bdxaddr12only:$mem)), 0)),
+            (replload bdxaddr12only:$mem)>;
+
+  // Loaded scalar converted with scalar_to_vector.
+  def : Pat<(vecty (scalar_to_vector (eltty (loadop bdxaddr12only:$mem)))),
+            (replload bdxaddr12only:$mem)>;
+}
+
+// Integer element types.
+defm : ReplicatePeephole<VLREPB, v16i8, anyextloadi8, i32>;
+defm : ReplicatePeephole<VLREPH, v8i16, anyextloadi16, i32>;
+defm : ReplicatePeephole<VLREPF, v4i32, load, i32>;
+defm : ReplicatePeephole<VLREPG, v2i64, load, i64>;
+
+// Floating-point element types.
+defm : ReplicatePeephole<VLREPF, v4f32, load, f32>;
+defm : ReplicatePeephole<VLREPG, v2f64, load, f64>;
+
+//===----------------------------------------------------------------------===//
+// Stores
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+ // Store.
+ def VST : StoreVRX<"vst", 0xE70E, null_frag, v128any, 16>;
+
+ // Store with length. The number of stored bytes is only known at run time.
+ def VSTL : StoreLengthVRSb<"vstl", 0xE73F, int_s390_vstl, 0>;
+
+ // Store multiple.
+ def VSTM : StoreMultipleVRSa<"vstm", 0xE73E>;
+
+ // Store element.
+ def VSTEB : StoreBinaryVRX<"vsteb", 0xE708, z_vstei8, v128b, 1, imm32zx4>;
+ def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, z_vstei16, v128h, 2, imm32zx3>;
+ def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, z_vstei32, v128f, 4, imm32zx2>;
+ def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, z_vstei64, v128g, 8, imm32zx1>;
+ def : Pat<(z_vstef32 (v4f32 VR128:$val), bdxaddr12only:$addr,
+ imm32zx2:$index),
+ (VSTEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>;
+ def : Pat<(z_vstef64 (v2f64 VR128:$val), bdxaddr12only:$addr,
+ imm32zx1:$index),
+ (VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
+
+ // Use VSTE to store subvectors. These patterns use "12pair" because
+ // STEY and STDY offer full 20-bit displacement fields. It's often better
+ // to use those instructions rather than force a 20-bit displacement
+ // into a GPR temporary.
+ def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>;
+ def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
+
+ // Scatter element.
+ def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>;
+ def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>;
+}
+
+//===----------------------------------------------------------------------===//
+// Selects and permutes
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+ // Merge high.
+ def VMRH: BinaryVRRcGeneric<"vmrh", 0xE761>;
+ def VMRHB : BinaryVRRc<"vmrhb", 0xE761, z_merge_high, v128b, v128b, 0>;
+ def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>;
+ def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>;
+ def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>;
+ def : BinaryRRWithType<VMRHF, VR128, z_merge_high, v4f32>;
+ def : BinaryRRWithType<VMRHG, VR128, z_merge_high, v2f64>;
+
+ // Merge low.
+ def VMRL: BinaryVRRcGeneric<"vmrl", 0xE760>;
+ def VMRLB : BinaryVRRc<"vmrlb", 0xE760, z_merge_low, v128b, v128b, 0>;
+ def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>;
+ def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>;
+ def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>;
+ def : BinaryRRWithType<VMRLF, VR128, z_merge_low, v4f32>;
+ def : BinaryRRWithType<VMRLG, VR128, z_merge_low, v2f64>;
+
+ // Permute.
+ def VPERM : TernaryVRRe<"vperm", 0xE78C, z_permute, v128b, v128b>;
+
+ // Permute doubleword immediate.
+ def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>;
+
+ // Replicate.
+ def VREP: BinaryVRIcGeneric<"vrep", 0xE74D>;
+ def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>;
+ def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>;
+ def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>;
+ def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>;
+ def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16:$index)),
+ (VREPF VR128:$vec, imm32zx16:$index)>;
+ def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16:$index)),
+ (VREPG VR128:$vec, imm32zx16:$index)>;
+
+ // Select.
+ def VSEL : TernaryVRRe<"vsel", 0xE78D, null_frag, v128any, v128any>;
+}
+
+//===----------------------------------------------------------------------===//
+// Widening and narrowing
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+ // Pack.
+ def VPK : BinaryVRRcGeneric<"vpk", 0xE794>;
+ def VPKH : BinaryVRRc<"vpkh", 0xE794, z_pack, v128b, v128h, 1>;
+ def VPKF : BinaryVRRc<"vpkf", 0xE794, z_pack, v128h, v128f, 2>;
+ def VPKG : BinaryVRRc<"vpkg", 0xE794, z_pack, v128f, v128g, 3>;
+
+ // Pack saturate.
+ def VPKS : BinaryVRRbSPairGeneric<"vpks", 0xE797>;
+ defm VPKSH : BinaryVRRbSPair<"vpksh", 0xE797, int_s390_vpksh, z_packs_cc,
+ v128b, v128h, 1>;
+ defm VPKSF : BinaryVRRbSPair<"vpksf", 0xE797, int_s390_vpksf, z_packs_cc,
+ v128h, v128f, 2>;
+ defm VPKSG : BinaryVRRbSPair<"vpksg", 0xE797, int_s390_vpksg, z_packs_cc,
+ v128f, v128g, 3>;
+
+ // Pack saturate logical.
+ def VPKLS : BinaryVRRbSPairGeneric<"vpkls", 0xE795>;
+ defm VPKLSH : BinaryVRRbSPair<"vpklsh", 0xE795, int_s390_vpklsh, z_packls_cc,
+ v128b, v128h, 1>;
+ defm VPKLSF : BinaryVRRbSPair<"vpklsf", 0xE795, int_s390_vpklsf, z_packls_cc,
+ v128h, v128f, 2>;
+ defm VPKLSG : BinaryVRRbSPair<"vpklsg", 0xE795, int_s390_vpklsg, z_packls_cc,
+ v128f, v128g, 3>;
+
+ // Sign-extend to doubleword.
+ def VSEG : UnaryVRRaGeneric<"vseg", 0xE75F>;
+ def VSEGB : UnaryVRRa<"vsegb", 0xE75F, z_vsei8, v128g, v128g, 0>;
+ def VSEGH : UnaryVRRa<"vsegh", 0xE75F, z_vsei16, v128g, v128g, 1>;
+ def VSEGF : UnaryVRRa<"vsegf", 0xE75F, z_vsei32, v128g, v128g, 2>;
+ def : Pat<(z_vsei8_by_parts (v16i8 VR128:$src)), (VSEGB VR128:$src)>;
+ def : Pat<(z_vsei16_by_parts (v8i16 VR128:$src)), (VSEGH VR128:$src)>;
+ def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>;
+
+ // Unpack high.
+ def VUPH : UnaryVRRaGeneric<"vuph", 0xE7D7>;
+ def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>;
+ def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>;
+ def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>;
+
+ // Unpack logical high.
+ def VUPLH : UnaryVRRaGeneric<"vuplh", 0xE7D5>;
+ def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>;
+ def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>;
+ def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>;
+
+ // Unpack low.
+ def VUPL : UnaryVRRaGeneric<"vupl", 0xE7D6>;
+ def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, z_unpack_low, v128h, v128b, 0>;
+ def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>;
+ def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, z_unpack_low, v128g, v128f, 2>;
+
+ // Unpack logical low.
+ def VUPLL : UnaryVRRaGeneric<"vupll", 0xE7D4>;
+ def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>;
+ def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>;
+ def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instantiating generic operations for specific types.
+//===----------------------------------------------------------------------===//
+
+// Load, store and select patterns shared by the vector types instantiated
+// below.  "vecty" is the vector type being handled; "maskty" is the integer
+// vector type used for the vselect condition.
+multiclass GenericVectorOps<ValueType vecty, ValueType maskty> {
+  let Predicates = [FeatureVector] in {
+    // Whole-vector load and store.
+    def : Pat<(vecty (load bdxaddr12only:$mem)),
+              (VL bdxaddr12only:$mem)>;
+    def : Pat<(store (vecty VR128:$val), bdxaddr12only:$mem),
+              (VST VR128:$val, bdxaddr12only:$mem)>;
+
+    // vselect becomes VSEL with the condition as the final operand.  When
+    // the condition is wrapped in z_vnot, the two data operands are swapped
+    // and the un-negated condition is used.
+    def : Pat<(vecty (vselect (maskty VR128:$mask), VR128:$a, VR128:$b)),
+              (VSEL VR128:$a, VR128:$b, VR128:$mask)>;
+    def : Pat<(vecty (vselect (maskty (z_vnot VR128:$mask)),
+                              VR128:$a, VR128:$b)),
+              (VSEL VR128:$b, VR128:$a, VR128:$mask)>;
+  }
+}
+
+// Integer vectors use their own type for the condition.
+defm : GenericVectorOps<v16i8, v16i8>;
+defm : GenericVectorOps<v8i16, v8i16>;
+defm : GenericVectorOps<v4i32, v4i32>;
+defm : GenericVectorOps<v2i64, v2i64>;
+
+// Floating-point vectors use the integer vector of matching element width.
+defm : GenericVectorOps<v4f32, v4i32>;
+defm : GenericVectorOps<v2f64, v2i64>;
+
+//===----------------------------------------------------------------------===//
+// Integer arithmetic
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+ // Add.
+ def VA : BinaryVRRcGeneric<"va", 0xE7F3>;
+ def VAB : BinaryVRRc<"vab", 0xE7F3, add, v128b, v128b, 0>;
+ def VAH : BinaryVRRc<"vah", 0xE7F3, add, v128h, v128h, 1>;
+ def VAF : BinaryVRRc<"vaf", 0xE7F3, add, v128f, v128f, 2>;
+ def VAG : BinaryVRRc<"vag", 0xE7F3, add, v128g, v128g, 3>;
+ def VAQ : BinaryVRRc<"vaq", 0xE7F3, int_s390_vaq, v128q, v128q, 4>;
+
+ // Add compute carry.
+ def VACC : BinaryVRRcGeneric<"vacc", 0xE7F1>;
+ def VACCB : BinaryVRRc<"vaccb", 0xE7F1, int_s390_vaccb, v128b, v128b, 0>;
+ def VACCH : BinaryVRRc<"vacch", 0xE7F1, int_s390_vacch, v128h, v128h, 1>;
+ def VACCF : BinaryVRRc<"vaccf", 0xE7F1, int_s390_vaccf, v128f, v128f, 2>;
+ def VACCG : BinaryVRRc<"vaccg", 0xE7F1, int_s390_vaccg, v128g, v128g, 3>;
+ def VACCQ : BinaryVRRc<"vaccq", 0xE7F1, int_s390_vaccq, v128q, v128q, 4>;
+
+ // Add with carry.
+ def VAC : TernaryVRRdGeneric<"vac", 0xE7BB>;
+ def VACQ : TernaryVRRd<"vacq", 0xE7BB, int_s390_vacq, v128q, v128q, 4>;
+
+ // Add with carry compute carry.
+ def VACCC : TernaryVRRdGeneric<"vaccc", 0xE7B9>;
+ def VACCCQ : TernaryVRRd<"vacccq", 0xE7B9, int_s390_vacccq, v128q, v128q, 4>;
+
+ // And.
+ def VN : BinaryVRRc<"vn", 0xE768, null_frag, v128any, v128any>;
+
+ // And with complement.
+ def VNC : BinaryVRRc<"vnc", 0xE769, null_frag, v128any, v128any>;
+
+ // Average.
+ def VAVG : BinaryVRRcGeneric<"vavg", 0xE7F2>;
+ def VAVGB : BinaryVRRc<"vavgb", 0xE7F2, int_s390_vavgb, v128b, v128b, 0>;
+ def VAVGH : BinaryVRRc<"vavgh", 0xE7F2, int_s390_vavgh, v128h, v128h, 1>;
+ def VAVGF : BinaryVRRc<"vavgf", 0xE7F2, int_s390_vavgf, v128f, v128f, 2>;
+ def VAVGG : BinaryVRRc<"vavgg", 0xE7F2, int_s390_vavgg, v128g, v128g, 3>;
+
+ // Average logical.
+ def VAVGL : BinaryVRRcGeneric<"vavgl", 0xE7F0>;
+ def VAVGLB : BinaryVRRc<"vavglb", 0xE7F0, int_s390_vavglb, v128b, v128b, 0>;
+ def VAVGLH : BinaryVRRc<"vavglh", 0xE7F0, int_s390_vavglh, v128h, v128h, 1>;
+ def VAVGLF : BinaryVRRc<"vavglf", 0xE7F0, int_s390_vavglf, v128f, v128f, 2>;
+ def VAVGLG : BinaryVRRc<"vavglg", 0xE7F0, int_s390_vavglg, v128g, v128g, 3>;
+
+ // Checksum.
+ def VCKSM : BinaryVRRc<"vcksm", 0xE766, int_s390_vcksm, v128f, v128f>;
+
+ // Count leading zeros.
+ def VCLZ : UnaryVRRaGeneric<"vclz", 0xE753>;
+ def VCLZB : UnaryVRRa<"vclzb", 0xE753, ctlz, v128b, v128b, 0>;
+ def VCLZH : UnaryVRRa<"vclzh", 0xE753, ctlz, v128h, v128h, 1>;
+ def VCLZF : UnaryVRRa<"vclzf", 0xE753, ctlz, v128f, v128f, 2>;
+ def VCLZG : UnaryVRRa<"vclzg", 0xE753, ctlz, v128g, v128g, 3>;
+
+ // Count trailing zeros.
+ def VCTZ : UnaryVRRaGeneric<"vctz", 0xE752>;
+ def VCTZB : UnaryVRRa<"vctzb", 0xE752, cttz, v128b, v128b, 0>;
+ def VCTZH : UnaryVRRa<"vctzh", 0xE752, cttz, v128h, v128h, 1>;
+ def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>;
+ def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>;
+
+ // Exclusive or.
+ def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>;
+
+ // Galois field multiply sum.
+ def VGFM : BinaryVRRcGeneric<"vgfm", 0xE7B4>;
+ def VGFMB : BinaryVRRc<"vgfmb", 0xE7B4, int_s390_vgfmb, v128h, v128b, 0>;
+ def VGFMH : BinaryVRRc<"vgfmh", 0xE7B4, int_s390_vgfmh, v128f, v128h, 1>;
+ def VGFMF : BinaryVRRc<"vgfmf", 0xE7B4, int_s390_vgfmf, v128g, v128f, 2>;
+ def VGFMG : BinaryVRRc<"vgfmg", 0xE7B4, int_s390_vgfmg, v128q, v128g, 3>;
+
+ // Galois field multiply sum and accumulate.
+ def VGFMA : TernaryVRRdGeneric<"vgfma", 0xE7BC>;
+ def VGFMAB : TernaryVRRd<"vgfmab", 0xE7BC, int_s390_vgfmab, v128h, v128b, 0>;
+ def VGFMAH : TernaryVRRd<"vgfmah", 0xE7BC, int_s390_vgfmah, v128f, v128h, 1>;
+ def VGFMAF : TernaryVRRd<"vgfmaf", 0xE7BC, int_s390_vgfmaf, v128g, v128f, 2>;
+ def VGFMAG : TernaryVRRd<"vgfmag", 0xE7BC, int_s390_vgfmag, v128q, v128g, 3>;
+
+ // Load complement.
+ def VLC : UnaryVRRaGeneric<"vlc", 0xE7DE>;
+ def VLCB : UnaryVRRa<"vlcb", 0xE7DE, z_vneg, v128b, v128b, 0>;
+ def VLCH : UnaryVRRa<"vlch", 0xE7DE, z_vneg, v128h, v128h, 1>;
+ def VLCF : UnaryVRRa<"vlcf", 0xE7DE, z_vneg, v128f, v128f, 2>;
+ def VLCG : UnaryVRRa<"vlcg", 0xE7DE, z_vneg, v128g, v128g, 3>;
+
+ // Load positive.
+ def VLP : UnaryVRRaGeneric<"vlp", 0xE7DF>;
+ def VLPB : UnaryVRRa<"vlpb", 0xE7DF, z_viabs8, v128b, v128b, 0>;
+ def VLPH : UnaryVRRa<"vlph", 0xE7DF, z_viabs16, v128h, v128h, 1>;
+ def VLPF : UnaryVRRa<"vlpf", 0xE7DF, z_viabs32, v128f, v128f, 2>;
+ def VLPG : UnaryVRRa<"vlpg", 0xE7DF, z_viabs64, v128g, v128g, 3>;
+
+ // Maximum.
+ def VMX : BinaryVRRcGeneric<"vmx", 0xE7FF>;
+ def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>;
+ def VMXH : BinaryVRRc<"vmxh", 0xE7FF, null_frag, v128h, v128h, 1>;
+ def VMXF : BinaryVRRc<"vmxf", 0xE7FF, null_frag, v128f, v128f, 2>;
+ def VMXG : BinaryVRRc<"vmxg", 0xE7FF, null_frag, v128g, v128g, 3>;
+
+ // Maximum logical.
+ def VMXL : BinaryVRRcGeneric<"vmxl", 0xE7FD>;
+ def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, null_frag, v128b, v128b, 0>;
+ def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, null_frag, v128h, v128h, 1>;
+ def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, null_frag, v128f, v128f, 2>;
+ def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, null_frag, v128g, v128g, 3>;
+
+ // Minimum.
+ def VMN : BinaryVRRcGeneric<"vmn", 0xE7FE>;
+ def VMNB : BinaryVRRc<"vmnb", 0xE7FE, null_frag, v128b, v128b, 0>;
+ def VMNH : BinaryVRRc<"vmnh", 0xE7FE, null_frag, v128h, v128h, 1>;
+ def VMNF : BinaryVRRc<"vmnf", 0xE7FE, null_frag, v128f, v128f, 2>;
+ def VMNG : BinaryVRRc<"vmng", 0xE7FE, null_frag, v128g, v128g, 3>;
+
+ // Minimum logical.
+ def VMNL : BinaryVRRcGeneric<"vmnl", 0xE7FC>;
+ def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, null_frag, v128b, v128b, 0>;
+ def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, null_frag, v128h, v128h, 1>;
+ def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, null_frag, v128f, v128f, 2>;
+ def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>;
+
+ // Multiply and add low.
+ def VMAL : TernaryVRRdGeneric<"vmal", 0xE7AA>;
+ def VMALB : TernaryVRRd<"vmalb", 0xE7AA, z_muladd, v128b, v128b, 0>;
+ def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd, v128h, v128h, 1>;
+ def VMALF : TernaryVRRd<"vmalf", 0xE7AA, z_muladd, v128f, v128f, 2>;
+
+ // Multiply and add high.
+ def VMAH : TernaryVRRdGeneric<"vmah", 0xE7AB>;
+ def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, int_s390_vmahb, v128b, v128b, 0>;
+ def VMAHH : TernaryVRRd<"vmahh", 0xE7AB, int_s390_vmahh, v128h, v128h, 1>;
+ def VMAHF : TernaryVRRd<"vmahf", 0xE7AB, int_s390_vmahf, v128f, v128f, 2>;
+
+ // Multiply and add logical high.
+ def VMALH : TernaryVRRdGeneric<"vmalh", 0xE7A9>;
+ def VMALHB : TernaryVRRd<"vmalhb", 0xE7A9, int_s390_vmalhb, v128b, v128b, 0>;
+ def VMALHH : TernaryVRRd<"vmalhh", 0xE7A9, int_s390_vmalhh, v128h, v128h, 1>;
+ def VMALHF : TernaryVRRd<"vmalhf", 0xE7A9, int_s390_vmalhf, v128f, v128f, 2>;
+
+ // Multiply and add even.
+ def VMAE : TernaryVRRdGeneric<"vmae", 0xE7AE>;
+ def VMAEB : TernaryVRRd<"vmaeb", 0xE7AE, int_s390_vmaeb, v128h, v128b, 0>;
+ def VMAEH : TernaryVRRd<"vmaeh", 0xE7AE, int_s390_vmaeh, v128f, v128h, 1>;
+ def VMAEF : TernaryVRRd<"vmaef", 0xE7AE, int_s390_vmaef, v128g, v128f, 2>;
+
+ // Multiply and add logical even.
+ def VMALE : TernaryVRRdGeneric<"vmale", 0xE7AC>;
+ def VMALEB : TernaryVRRd<"vmaleb", 0xE7AC, int_s390_vmaleb, v128h, v128b, 0>;
+ def VMALEH : TernaryVRRd<"vmaleh", 0xE7AC, int_s390_vmaleh, v128f, v128h, 1>;
+ def VMALEF : TernaryVRRd<"vmalef", 0xE7AC, int_s390_vmalef, v128g, v128f, 2>;
+
+ // Multiply and add odd.
+ def VMAO : TernaryVRRdGeneric<"vmao", 0xE7AF>;
+ def VMAOB : TernaryVRRd<"vmaob", 0xE7AF, int_s390_vmaob, v128h, v128b, 0>;
+ def VMAOH : TernaryVRRd<"vmaoh", 0xE7AF, int_s390_vmaoh, v128f, v128h, 1>;
+ def VMAOF : TernaryVRRd<"vmaof", 0xE7AF, int_s390_vmaof, v128g, v128f, 2>;
+
+ // Multiply and add logical odd.
+ def VMALO : TernaryVRRdGeneric<"vmalo", 0xE7AD>;
+ def VMALOB : TernaryVRRd<"vmalob", 0xE7AD, int_s390_vmalob, v128h, v128b, 0>;
+ def VMALOH : TernaryVRRd<"vmaloh", 0xE7AD, int_s390_vmaloh, v128f, v128h, 1>;
+ def VMALOF : TernaryVRRd<"vmalof", 0xE7AD, int_s390_vmalof, v128g, v128f, 2>;
+
+ // Multiply high.
+ def VMH : BinaryVRRcGeneric<"vmh", 0xE7A3>;
+ def VMHB : BinaryVRRc<"vmhb", 0xE7A3, int_s390_vmhb, v128b, v128b, 0>;
+ def VMHH : BinaryVRRc<"vmhh", 0xE7A3, int_s390_vmhh, v128h, v128h, 1>;
+ def VMHF : BinaryVRRc<"vmhf", 0xE7A3, int_s390_vmhf, v128f, v128f, 2>;
+
+ // Multiply logical high.
+ def VMLH : BinaryVRRcGeneric<"vmlh", 0xE7A1>;
+ def VMLHB : BinaryVRRc<"vmlhb", 0xE7A1, int_s390_vmlhb, v128b, v128b, 0>;
+ def VMLHH : BinaryVRRc<"vmlhh", 0xE7A1, int_s390_vmlhh, v128h, v128h, 1>;
+ def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, int_s390_vmlhf, v128f, v128f, 2>;
+
+ // Multiply low.
+ def VML : BinaryVRRcGeneric<"vml", 0xE7A2>;
+ def VMLB : BinaryVRRc<"vmlb", 0xE7A2, mul, v128b, v128b, 0>;
+ def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, mul, v128h, v128h, 1>;
+ def VMLF : BinaryVRRc<"vmlf", 0xE7A2, mul, v128f, v128f, 2>;
+
+ // Multiply even.
+ def VME : BinaryVRRcGeneric<"vme", 0xE7A6>;
+ def VMEB : BinaryVRRc<"vmeb", 0xE7A6, int_s390_vmeb, v128h, v128b, 0>;
+ def VMEH : BinaryVRRc<"vmeh", 0xE7A6, int_s390_vmeh, v128f, v128h, 1>;
+ def VMEF : BinaryVRRc<"vmef", 0xE7A6, int_s390_vmef, v128g, v128f, 2>;
+
+ // Multiply logical even.
+ def VMLE : BinaryVRRcGeneric<"vmle", 0xE7A4>;
+ def VMLEB : BinaryVRRc<"vmleb", 0xE7A4, int_s390_vmleb, v128h, v128b, 0>;
+ def VMLEH : BinaryVRRc<"vmleh", 0xE7A4, int_s390_vmleh, v128f, v128h, 1>;
+ def VMLEF : BinaryVRRc<"vmlef", 0xE7A4, int_s390_vmlef, v128g, v128f, 2>;
+
+ // Multiply odd.
+ def VMO : BinaryVRRcGeneric<"vmo", 0xE7A7>;
+ def VMOB : BinaryVRRc<"vmob", 0xE7A7, int_s390_vmob, v128h, v128b, 0>;
+ def VMOH : BinaryVRRc<"vmoh", 0xE7A7, int_s390_vmoh, v128f, v128h, 1>;
+ def VMOF : BinaryVRRc<"vmof", 0xE7A7, int_s390_vmof, v128g, v128f, 2>;
+
+ // Multiply logical odd.
+ def VMLO : BinaryVRRcGeneric<"vmlo", 0xE7A5>;
+ def VMLOB : BinaryVRRc<"vmlob", 0xE7A5, int_s390_vmlob, v128h, v128b, 0>;
+ def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>;
+ def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>;
+
+ // Nor.
+ def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>;
+ def : InstAlias<"vnot\t$V1, $V2", (VNO VR128:$V1, VR128:$V2, VR128:$V2), 0>;
+
+ // Or.
+ def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>;
+
+ // Population count.
+ def VPOPCT : UnaryVRRaGeneric<"vpopct", 0xE750>;
+ def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>;
+
+ // Element rotate left logical (with vector shift amount).
+ def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>;
+ def VERLLVB : BinaryVRRc<"verllvb", 0xE773, int_s390_verllvb,
+ v128b, v128b, 0>;
+ def VERLLVH : BinaryVRRc<"verllvh", 0xE773, int_s390_verllvh,
+ v128h, v128h, 1>;
+ def VERLLVF : BinaryVRRc<"verllvf", 0xE773, int_s390_verllvf,
+ v128f, v128f, 2>;
+ def VERLLVG : BinaryVRRc<"verllvg", 0xE773, int_s390_verllvg,
+ v128g, v128g, 3>;
+
+ // Element rotate left logical (with scalar shift amount).
+ def VERLL : BinaryVRSaGeneric<"verll", 0xE733>;
+ def VERLLB : BinaryVRSa<"verllb", 0xE733, int_s390_verllb, v128b, v128b, 0>;
+ def VERLLH : BinaryVRSa<"verllh", 0xE733, int_s390_verllh, v128h, v128h, 1>;
+ def VERLLF : BinaryVRSa<"verllf", 0xE733, int_s390_verllf, v128f, v128f, 2>;
+ def VERLLG : BinaryVRSa<"verllg", 0xE733, int_s390_verllg, v128g, v128g, 3>;
+
+ // Element rotate and insert under mask.
+ def VERIM : QuaternaryVRIdGeneric<"verim", 0xE772>;
+ def VERIMB : QuaternaryVRId<"verimb", 0xE772, int_s390_verimb, v128b, v128b, 0>;
+ def VERIMH : QuaternaryVRId<"verimh", 0xE772, int_s390_verimh, v128h, v128h, 1>;
+ def VERIMF : QuaternaryVRId<"verimf", 0xE772, int_s390_verimf, v128f, v128f, 2>;
+ def VERIMG : QuaternaryVRId<"verimg", 0xE772, int_s390_verimg, v128g, v128g, 3>;
+
+ // Element shift left (with vector shift amount).
+ def VESLV : BinaryVRRcGeneric<"veslv", 0xE770>;
+ def VESLVB : BinaryVRRc<"veslvb", 0xE770, z_vshl, v128b, v128b, 0>;
+ def VESLVH : BinaryVRRc<"veslvh", 0xE770, z_vshl, v128h, v128h, 1>;
+ def VESLVF : BinaryVRRc<"veslvf", 0xE770, z_vshl, v128f, v128f, 2>;
+ def VESLVG : BinaryVRRc<"veslvg", 0xE770, z_vshl, v128g, v128g, 3>;
+
+ // Element shift left (with scalar shift amount).
+ def VESL : BinaryVRSaGeneric<"vesl", 0xE730>;
+ def VESLB : BinaryVRSa<"veslb", 0xE730, z_vshl_by_scalar, v128b, v128b, 0>;
+ def VESLH : BinaryVRSa<"veslh", 0xE730, z_vshl_by_scalar, v128h, v128h, 1>;
+ def VESLF : BinaryVRSa<"veslf", 0xE730, z_vshl_by_scalar, v128f, v128f, 2>;
+ def VESLG : BinaryVRSa<"veslg", 0xE730, z_vshl_by_scalar, v128g, v128g, 3>;
+
+ // Element shift right arithmetic (with vector shift amount).
+ def VESRAV : BinaryVRRcGeneric<"vesrav", 0xE77A>;
+ def VESRAVB : BinaryVRRc<"vesravb", 0xE77A, z_vsra, v128b, v128b, 0>;
+ def VESRAVH : BinaryVRRc<"vesravh", 0xE77A, z_vsra, v128h, v128h, 1>;
+ def VESRAVF : BinaryVRRc<"vesravf", 0xE77A, z_vsra, v128f, v128f, 2>;
+ def VESRAVG : BinaryVRRc<"vesravg", 0xE77A, z_vsra, v128g, v128g, 3>;
+
+ // Element shift right arithmetic (with scalar shift amount).
+ def VESRA : BinaryVRSaGeneric<"vesra", 0xE73A>;
+ def VESRAB : BinaryVRSa<"vesrab", 0xE73A, z_vsra_by_scalar, v128b, v128b, 0>;
+ def VESRAH : BinaryVRSa<"vesrah", 0xE73A, z_vsra_by_scalar, v128h, v128h, 1>;
+ def VESRAF : BinaryVRSa<"vesraf", 0xE73A, z_vsra_by_scalar, v128f, v128f, 2>;
+ def VESRAG : BinaryVRSa<"vesrag", 0xE73A, z_vsra_by_scalar, v128g, v128g, 3>;
+
+ // Element shift right logical (with vector shift amount).
+ def VESRLV : BinaryVRRcGeneric<"vesrlv", 0xE778>;
+ def VESRLVB : BinaryVRRc<"vesrlvb", 0xE778, z_vsrl, v128b, v128b, 0>;
+ def VESRLVH : BinaryVRRc<"vesrlvh", 0xE778, z_vsrl, v128h, v128h, 1>;
+ def VESRLVF : BinaryVRRc<"vesrlvf", 0xE778, z_vsrl, v128f, v128f, 2>;
+ def VESRLVG : BinaryVRRc<"vesrlvg", 0xE778, z_vsrl, v128g, v128g, 3>;
+
+ // Element shift right logical (with scalar shift amount).
+ def VESRL : BinaryVRSaGeneric<"vesrl", 0xE738>;
+ def VESRLB : BinaryVRSa<"vesrlb", 0xE738, z_vsrl_by_scalar, v128b, v128b, 0>;
+ def VESRLH : BinaryVRSa<"vesrlh", 0xE738, z_vsrl_by_scalar, v128h, v128h, 1>;
+ def VESRLF : BinaryVRSa<"vesrlf", 0xE738, z_vsrl_by_scalar, v128f, v128f, 2>;
+ def VESRLG : BinaryVRSa<"vesrlg", 0xE738, z_vsrl_by_scalar, v128g, v128g, 3>;
+
+ // Shift left.
+ def VSL : BinaryVRRc<"vsl", 0xE774, int_s390_vsl, v128b, v128b>;
+
+ // Shift left by byte.
+ def VSLB : BinaryVRRc<"vslb", 0xE775, int_s390_vslb, v128b, v128b>;
+
+ // Shift left double by byte.
+ def VSLDB : TernaryVRId<"vsldb", 0xE777, z_shl_double, v128b, v128b, 0>;
+ def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8:$z),
+ (VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>;
+
+ // Shift right arithmetic.
+ def VSRA : BinaryVRRc<"vsra", 0xE77E, int_s390_vsra, v128b, v128b>;
+
+ // Shift right arithmetic by byte.
+ def VSRAB : BinaryVRRc<"vsrab", 0xE77F, int_s390_vsrab, v128b, v128b>;
+
+ // Shift right logical.
+ def VSRL : BinaryVRRc<"vsrl", 0xE77C, int_s390_vsrl, v128b, v128b>;
+
+ // Shift right logical by byte.
+ def VSRLB : BinaryVRRc<"vsrlb", 0xE77D, int_s390_vsrlb, v128b, v128b>;
+
+ // Subtract.
+ def VS : BinaryVRRcGeneric<"vs", 0xE7F7>;
+ def VSB : BinaryVRRc<"vsb", 0xE7F7, sub, v128b, v128b, 0>;
+ def VSH : BinaryVRRc<"vsh", 0xE7F7, sub, v128h, v128h, 1>;
+ def VSF : BinaryVRRc<"vsf", 0xE7F7, sub, v128f, v128f, 2>;
+ def VSG : BinaryVRRc<"vsg", 0xE7F7, sub, v128g, v128g, 3>;
+ def VSQ : BinaryVRRc<"vsq", 0xE7F7, int_s390_vsq, v128q, v128q, 4>;
+
+ // Subtract compute borrow indication.
+ def VSCBI : BinaryVRRcGeneric<"vscbi", 0xE7F5>;
+ def VSCBIB : BinaryVRRc<"vscbib", 0xE7F5, int_s390_vscbib, v128b, v128b, 0>;
+ def VSCBIH : BinaryVRRc<"vscbih", 0xE7F5, int_s390_vscbih, v128h, v128h, 1>;
+ def VSCBIF : BinaryVRRc<"vscbif", 0xE7F5, int_s390_vscbif, v128f, v128f, 2>;
+ def VSCBIG : BinaryVRRc<"vscbig", 0xE7F5, int_s390_vscbig, v128g, v128g, 3>;
+ def VSCBIQ : BinaryVRRc<"vscbiq", 0xE7F5, int_s390_vscbiq, v128q, v128q, 4>;
+
+ // Subtract with borrow indication.
+ def VSBI : TernaryVRRdGeneric<"vsbi", 0xE7BF>;
+ def VSBIQ : TernaryVRRd<"vsbiq", 0xE7BF, int_s390_vsbiq, v128q, v128q, 4>;
+
+ // Subtract with borrow compute borrow indication.
+ def VSBCBI : TernaryVRRdGeneric<"vsbcbi", 0xE7BD>;
+ def VSBCBIQ : TernaryVRRd<"vsbcbiq", 0xE7BD, int_s390_vsbcbiq,
+ v128q, v128q, 4>;
+
+ // Sum across doubleword.
+ def VSUMG : BinaryVRRcGeneric<"vsumg", 0xE765>;
+ def VSUMGH : BinaryVRRc<"vsumgh", 0xE765, z_vsum, v128g, v128h, 1>;
+ def VSUMGF : BinaryVRRc<"vsumgf", 0xE765, z_vsum, v128g, v128f, 2>;
+
+ // Sum across quadword.
+ def VSUMQ : BinaryVRRcGeneric<"vsumq", 0xE767>;
+ def VSUMQF : BinaryVRRc<"vsumqf", 0xE767, z_vsum, v128q, v128f, 2>;
+ def VSUMQG : BinaryVRRc<"vsumqg", 0xE767, z_vsum, v128q, v128g, 3>;
+
+ // Sum across word.
+ def VSUM : BinaryVRRcGeneric<"vsum", 0xE764>;
+ def VSUMB : BinaryVRRc<"vsumb", 0xE764, z_vsum, v128f, v128b, 0>;
+ def VSUMH : BinaryVRRc<"vsumh", 0xE764, z_vsum, v128f, v128h, 1>;
+}
+
+// Instantiate the bitwise op patterns for vector type TYPE; each maps a DAG expression onto VN, VNC, VO, VX, VSEL or VNO.
+multiclass BitwiseVectorOps<ValueType type> {
+ let Predicates = [FeatureVector] in { // patterns are guarded by the FeatureVector predicate
+ def : Pat<(type (and VR128:$x, VR128:$y)), (VN VR128:$x, VR128:$y)>;
+ def : Pat<(type (and VR128:$x, (z_vnot VR128:$y))),
+ (VNC VR128:$x, VR128:$y)>; // and with z_vnot of the second operand
+ def : Pat<(type (or VR128:$x, VR128:$y)), (VO VR128:$x, VR128:$y)>;
+ def : Pat<(type (xor VR128:$x, VR128:$y)), (VX VR128:$x, VR128:$y)>;
+ def : Pat<(type (or (and VR128:$x, VR128:$z),
+ (and VR128:$y, (z_vnot VR128:$z)))),
+ (VSEL VR128:$x, VR128:$y, VR128:$z)>; // $z picks $x, z_vnot of $z picks $y
+ def : Pat<(type (z_vnot (or VR128:$x, VR128:$y))),
+ (VNO VR128:$x, VR128:$y)>;
+ def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>; // plain z_vnot is VNO with both inputs equal
+ }
+}
+
+defm : BitwiseVectorOps<v16i8>; // instantiated once per value type listed here
+defm : BitwiseVectorOps<v8i16>;
+defm : BitwiseVectorOps<v4i32>;
+defm : BitwiseVectorOps<v2i64>;
+
+// Instantiate additional patterns for absolute-related expressions on
+// type TYPE. LC is the negate instruction for TYPE, LP is the absolute
+// instruction and SHIFT is the amount matched in the z_vsra_by_scalar forms.
+multiclass IntegerAbsoluteVectorOps<ValueType type, Instruction lc,
+ Instruction lp, int shift> {
+ let Predicates = [FeatureVector] in {
+ def : Pat<(type (vselect (type (z_vicmph_zero VR128:$x)),
+ (z_vneg VR128:$x), VR128:$x)),
+ (lc (lp VR128:$x))>; // LC applied to LP: negated absolute value
+ def : Pat<(type (vselect (type (z_vnot (z_vicmph_zero VR128:$x))),
+ VR128:$x, (z_vneg VR128:$x))),
+ (lc (lp VR128:$x))>; // same result, inverted condition with swapped arms
+ def : Pat<(type (vselect (type (z_vicmpl_zero VR128:$x)),
+ VR128:$x, (z_vneg VR128:$x))),
+ (lc (lp VR128:$x))>; // same result, expressed with z_vicmpl_zero
+ def : Pat<(type (vselect (type (z_vnot (z_vicmpl_zero VR128:$x))),
+ (z_vneg VR128:$x), VR128:$x)),
+ (lc (lp VR128:$x))>; // same result, inverted z_vicmpl_zero form
+ def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)),
+ (z_vneg VR128:$x)),
+ (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))),
+ VR128:$x))),
+ (lp VR128:$x)>; // shifted value used as a mask choosing the negated input: LP alone
+ def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)),
+ VR128:$x),
+ (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))),
+ (z_vneg VR128:$x)))),
+ (lc (lp VR128:$x))>; // mask arms swapped: LC applied to LP
+ }
+}
+
+defm : IntegerAbsoluteVectorOps<v16i8, VLCB, VLPB, 7>; // SHIFT is the element width in bits minus one
+defm : IntegerAbsoluteVectorOps<v8i16, VLCH, VLPH, 15>;
+defm : IntegerAbsoluteVectorOps<v4i32, VLCF, VLPF, 31>;
+defm : IntegerAbsoluteVectorOps<v2i64, VLCG, VLPG, 63>;
+
+// Instantiate minimum- and maximum-related patterns for TYPE. CMPH is the
+// signed or unsigned "set if greater than" comparison operator and
+// MIN and MAX are the associated minimum and maximum instructions.
+multiclass IntegerMinMaxVectorOps<ValueType type, SDPatternOperator cmph,
+ Instruction min, Instruction max> {
+ let Predicates = [FeatureVector] in {
+ def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$x, VR128:$y)),
+ (max VR128:$x, VR128:$y)>; // (x > y) ? x : y
+ def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$y, VR128:$x)),
+ (min VR128:$x, VR128:$y)>; // (x > y) ? y : x
+ def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
+ VR128:$x, VR128:$y)),
+ (min VR128:$x, VR128:$y)>; // !(x > y) ? x : y
+ def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
+ VR128:$y, VR128:$x)),
+ (max VR128:$x, VR128:$y)>; // !(x > y) ? y : x
+ }
+}
+
+// Signed min/max (comparison operator z_vicmph).
+defm : IntegerMinMaxVectorOps<v16i8, z_vicmph, VMNB, VMXB>;
+defm : IntegerMinMaxVectorOps<v8i16, z_vicmph, VMNH, VMXH>;
+defm : IntegerMinMaxVectorOps<v4i32, z_vicmph, VMNF, VMXF>;
+defm : IntegerMinMaxVectorOps<v2i64, z_vicmph, VMNG, VMXG>;
+
+// Unsigned min/max (comparison operator z_vicmphl, "logical" instructions).
+defm : IntegerMinMaxVectorOps<v16i8, z_vicmphl, VMNLB, VMXLB>;
+defm : IntegerMinMaxVectorOps<v8i16, z_vicmphl, VMNLH, VMXLH>;
+defm : IntegerMinMaxVectorOps<v4i32, z_vicmphl, VMNLF, VMXLF>;
+defm : IntegerMinMaxVectorOps<v2i64, z_vicmphl, VMNLG, VMXLG>;
+
+//===----------------------------------------------------------------------===//
+// Integer comparison
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+ // Element compare.
+ let Defs = [CC] in { // CC is listed as defined; null_frag below attaches no selection operator
+ def VEC : CompareVRRaGeneric<"vec", 0xE7DB>;
+ def VECB : CompareVRRa<"vecb", 0xE7DB, null_frag, v128b, 0>;
+ def VECH : CompareVRRa<"vech", 0xE7DB, null_frag, v128h, 1>;
+ def VECF : CompareVRRa<"vecf", 0xE7DB, null_frag, v128f, 2>;
+ def VECG : CompareVRRa<"vecg", 0xE7DB, null_frag, v128g, 3>;
+ }
+
+ // Element compare logical.
+ let Defs = [CC] in {
+ def VECL : CompareVRRaGeneric<"vecl", 0xE7D9>;
+ def VECLB : CompareVRRa<"veclb", 0xE7D9, null_frag, v128b, 0>;
+ def VECLH : CompareVRRa<"veclh", 0xE7D9, null_frag, v128h, 1>;
+ def VECLF : CompareVRRa<"veclf", 0xE7D9, null_frag, v128f, 2>;
+ def VECLG : CompareVRRa<"veclg", 0xE7D9, null_frag, v128g, 3>;
+ }
+
+ // Compare equal. NOTE(review): each *SPair defm takes two operators, presumably for a plain and a CC-setting form; confirm in the format definitions.
+ def VCEQ : BinaryVRRbSPairGeneric<"vceq", 0xE7F8>;
+ defm VCEQB : BinaryVRRbSPair<"vceqb", 0xE7F8, z_vicmpe, z_vicmpes,
+ v128b, v128b, 0>;
+ defm VCEQH : BinaryVRRbSPair<"vceqh", 0xE7F8, z_vicmpe, z_vicmpes,
+ v128h, v128h, 1>;
+ defm VCEQF : BinaryVRRbSPair<"vceqf", 0xE7F8, z_vicmpe, z_vicmpes,
+ v128f, v128f, 2>;
+ defm VCEQG : BinaryVRRbSPair<"vceqg", 0xE7F8, z_vicmpe, z_vicmpes,
+ v128g, v128g, 3>;
+
+ // Compare high (operators z_vicmph / z_vicmphs).
+ def VCH : BinaryVRRbSPairGeneric<"vch", 0xE7FB>;
+ defm VCHB : BinaryVRRbSPair<"vchb", 0xE7FB, z_vicmph, z_vicmphs,
+ v128b, v128b, 0>;
+ defm VCHH : BinaryVRRbSPair<"vchh", 0xE7FB, z_vicmph, z_vicmphs,
+ v128h, v128h, 1>;
+ defm VCHF : BinaryVRRbSPair<"vchf", 0xE7FB, z_vicmph, z_vicmphs,
+ v128f, v128f, 2>;
+ defm VCHG : BinaryVRRbSPair<"vchg", 0xE7FB, z_vicmph, z_vicmphs,
+ v128g, v128g, 3>;
+
+ // Compare high logical (operators z_vicmphl / z_vicmphls).
+ def VCHL : BinaryVRRbSPairGeneric<"vchl", 0xE7F9>;
+ defm VCHLB : BinaryVRRbSPair<"vchlb", 0xE7F9, z_vicmphl, z_vicmphls,
+ v128b, v128b, 0>;
+ defm VCHLH : BinaryVRRbSPair<"vchlh", 0xE7F9, z_vicmphl, z_vicmphls,
+ v128h, v128h, 1>;
+ defm VCHLF : BinaryVRRbSPair<"vchlf", 0xE7F9, z_vicmphl, z_vicmphls,
+ v128f, v128f, 2>;
+ defm VCHLG : BinaryVRRbSPair<"vchlg", 0xE7F9, z_vicmphl, z_vicmphls,
+ v128g, v128g, 3>;
+
+ // Test under mask (selected through z_vtm; lists CC as defined).
+ let Defs = [CC] in
+ def VTM : CompareVRRa<"vtm", 0xE7D8, z_vtm, v128b, 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point arithmetic
+//===----------------------------------------------------------------------===//
+
+// See comments in SystemZInstrFP.td for the suppression flags and
+// rounding modes. NOTE(review): the two trailing FPConversion arguments are presumably (suppression, mode); confirm.
+multiclass VectorRounding<Instruction insn, TypedReg tr> {
+ def : FPConversion<insn, frint, tr, tr, 0, 0>; // the only entry whose first trailing argument is 0
+ def : FPConversion<insn, fnearbyint, tr, tr, 4, 0>;
+ def : FPConversion<insn, ffloor, tr, tr, 4, 7>;
+ def : FPConversion<insn, fceil, tr, tr, 4, 6>;
+ def : FPConversion<insn, ftrunc, tr, tr, 4, 5>;
+ def : FPConversion<insn, fround, tr, tr, 4, 1>;
+}
+
+let Predicates = [FeatureVector] in { // W* records below use the v64*/v32* operand types and pass 8 where the V* records pass 0
+ // Add.
+ def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
+ def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>;
+ def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>;
+
+ // Convert from fixed 64-bit.
+ def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>;
+ def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>;
+ def WCDGB : TernaryVRRa<"wcdgb", 0xE7C3, null_frag, v64db, v64g, 3, 8>;
+ def : FPConversion<VCDGB, sint_to_fp, v128db, v128g, 0, 0>; // selection is attached here, not on the null_frag record
+
+ // Convert from logical 64-bit.
+ def VCDLG : TernaryVRRaFloatGeneric<"vcdlg", 0xE7C1>;
+ def VCDLGB : TernaryVRRa<"vcdlgb", 0xE7C1, null_frag, v128db, v128g, 3, 0>;
+ def WCDLGB : TernaryVRRa<"wcdlgb", 0xE7C1, null_frag, v64db, v64g, 3, 8>;
+ def : FPConversion<VCDLGB, uint_to_fp, v128db, v128g, 0, 0>;
+
+ // Convert to fixed 64-bit.
+ def VCGD : TernaryVRRaFloatGeneric<"vcgd", 0xE7C2>;
+ def VCGDB : TernaryVRRa<"vcgdb", 0xE7C2, null_frag, v128g, v128db, 3, 0>;
+ def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>;
+ // Rounding mode should agree with SystemZInstrFP.td.
+ def : FPConversion<VCGDB, fp_to_sint, v128g, v128db, 0, 5>;
+
+ // Convert to logical 64-bit.
+ def VCLGD : TernaryVRRaFloatGeneric<"vclgd", 0xE7C0>;
+ def VCLGDB : TernaryVRRa<"vclgdb", 0xE7C0, null_frag, v128g, v128db, 3, 0>;
+ def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>;
+ // Rounding mode should agree with SystemZInstrFP.td.
+ def : FPConversion<VCLGDB, fp_to_uint, v128g, v128db, 0, 5>;
+
+ // Divide.
+ def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>;
+ def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>;
+ def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>;
+
+ // Load FP integer.
+ def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>;
+ def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, int_s390_vfidb, v128db, v128db, 3, 0>;
+ def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
+ defm : VectorRounding<VFIDB, v128db>; // rounding operators for the vector form
+ defm : VectorRounding<WFIDB, v64db>; // and for the v64db form
+
+ // Load lengthened.
+ def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>;
+ def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
+ def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>;
+
+ // Load rounded.
+ def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>;
+ def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
+ def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
+ def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
+ def : FPConversion<WLEDB, fpround, v32eb, v64db, 0, 0>;
+
+ // Multiply.
+ def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
+ def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
+ def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>;
+
+ // Multiply and add. (TernaryVRRe takes the 0/8 argument before the 3, unlike the records above.)
+ def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
+ def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>;
+ def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>;
+
+ // Multiply and subtract.
+ def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>;
+ def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>;
+ def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>;
+
+ // Perform sign operation.
+ def VFPSO : BinaryVRRaFloatGeneric<"vfpso", 0xE7CC>;
+ def VFPSODB : BinaryVRRa<"vfpsodb", 0xE7CC, null_frag, v128db, v128db, 3, 0>;
+ def WFPSODB : BinaryVRRa<"wfpsodb", 0xE7CC, null_frag, v64db, v64db, 3, 8>;
+
+ // Load complement. (The next three groups reuse the VFPSO opcode 0xE7CC; only the last argument differs: 0, 1, 2.)
+ def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>;
+ def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>;
+
+ // Load negative.
+ def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>;
+ def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>;
+
+ // Load positive.
+ def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>;
+ def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>;
+
+ // Square root.
+ def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>;
+ def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>;
+ def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>;
+
+ // Subtract.
+ def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
+ def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>;
+ def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>;
+
+ // Test data class immediate.
+ let Defs = [CC] in { // these records list CC as defined
+ def VFTCI : BinaryVRIeFloatGeneric<"vftci", 0xE74A>;
+ def VFTCIDB : BinaryVRIe<"vftcidb", 0xE74A, z_vftci, v128g, v128db, 3, 0>;
+ def WFTCIDB : BinaryVRIe<"wftcidb", 0xE74A, null_frag, v64g, v64db, 3, 8>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point comparison
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+ // Compare scalar (selected through z_fcmp).
+ let Defs = [CC] in {
+ def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>;
+ def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>;
+ }
+
+ // Compare and signal scalar (no selection operator attached: null_frag).
+ let Defs = [CC] in {
+ def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>;
+ def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>;
+ }
+
+ // Compare equal. (Here and below only the V* forms carry operators; the W* forms use null_frag for both.)
+ def VFCE : BinaryVRRcSPairFloatGeneric<"vfce", 0xE7E8>;
+ defm VFCEDB : BinaryVRRcSPair<"vfcedb", 0xE7E8, z_vfcmpe, z_vfcmpes,
+ v128g, v128db, 3, 0>;
+ defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag,
+ v64g, v64db, 3, 8>;
+
+ // Compare high.
+ def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>;
+ defm VFCHDB : BinaryVRRcSPair<"vfchdb", 0xE7EB, z_vfcmph, z_vfcmphs,
+ v128g, v128db, 3, 0>;
+ defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag,
+ v64g, v64db, 3, 8>;
+
+ // Compare high or equal.
+ def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>;
+ defm VFCHEDB : BinaryVRRcSPair<"vfchedb", 0xE7EA, z_vfcmphe, z_vfcmphes,
+ v128g, v128db, 3, 0>;
+ defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag,
+ v64g, v64db, 3, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Conversions
+//===----------------------------------------------------------------------===//
+
+def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; // every bitconvert below yields $src itself, retyped; no instruction is named
+def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; // to v8i16
+def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; // to v4i32
+def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; // to v2i64
+def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; // to v4f32
+def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; // to v2f64
+def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Replicating scalars
+//===----------------------------------------------------------------------===//
+
+// Define patterns for replicating a scalar GR32 into a vector of type TYPE.
+// INDEX is the element number passed to INSN: (8 / element size in bytes) - 1.
+class VectorReplicateScalar<ValueType type, Instruction insn, bits<16> index>
+ : Pat<(type (z_replicate GR32:$scalar)),
+ (insn (VLVGP32 GR32:$scalar, GR32:$scalar), index)>; // INSN reads element INDEX of the VLVGP32 result
+
+def : VectorReplicateScalar<v16i8, VREPB, 7>; // 1-byte elements: 8/1 - 1
+def : VectorReplicateScalar<v8i16, VREPH, 3>; // 2-byte elements: 8/2 - 1
+def : VectorReplicateScalar<v4i32, VREPF, 1>; // 4-byte elements: 8/4 - 1
+
+// i64 replications are just a single instruction.
+def : Pat<(v2i64 (z_replicate GR64:$scalar)),
+ (VLVGP GR64:$scalar, GR64:$scalar)>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point insertion and extraction
+//===----------------------------------------------------------------------===//
+
+// Moving 32-bit values between GPRs and FPRs can be done using VLVGF
+// and VLGVF. LEFR and LFER are the alias records used by the two bitconvert patterns.
+let Predicates = [FeatureVector] in {
+ def LEFR : UnaryAliasVRS<VR32, GR32>; // GR32 source, VR32 result
+ def LFER : UnaryAliasVRS<GR64, VR32>; // VR32 source; the result class is GR64, not GR32
+ def : Pat<(f32 (bitconvert (i32 GR32:$src))), (LEFR GR32:$src)>;
+ def : Pat<(i32 (bitconvert (f32 VR32:$src))),
+ (EXTRACT_SUBREG (LFER VR32:$src), subreg_l32)>; // take subreg_l32 of the GR64 result
+}
+
+// Floating-point values are stored in element 0 of the corresponding
+// vector register. Scalar to vector conversion is just a subreg insertion and
+// scalar replication can just replicate element 0 of the vector register (via VREP).
+multiclass ScalarToVectorFP<Instruction vrep, ValueType vt, RegisterOperand cls,
+ SubRegIndex subreg> {
+ def : Pat<(vt (scalar_to_vector cls:$scalar)),
+ (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar, subreg)>; // remaining lanes come from IMPLICIT_DEF
+ def : Pat<(vt (z_replicate cls:$scalar)),
+ (vrep (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar,
+ subreg), 0)>; // replicate element 0 of the widened register
+}
+defm : ScalarToVectorFP<VREPF, v4f32, FP32, subreg_r32>;
+defm : ScalarToVectorFP<VREPG, v2f64, FP64, subreg_r64>;
+
+// Match v2f64 insertions. The AddedComplexity counters the 3 added by
+// TableGen for the base register operand in VLVG-based integer insertions
+// and ensures that this version is strictly better. Both patterns emit VPDI.
+let AddedComplexity = 4 in {
+ def : Pat<(z_vector_insert (v2f64 VR128:$vec), FP64:$elt, 0), // insertion at element 0
+ (VPDI (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FP64:$elt,
+ subreg_r64), VR128:$vec, 1)>; // widened $elt first, $vec second, immediate 1
+ def : Pat<(z_vector_insert (v2f64 VR128:$vec), FP64:$elt, 1), // insertion at element 1
+ (VPDI VR128:$vec, (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FP64:$elt,
+ subreg_r64), 0)>; // $vec first, widened $elt second, immediate 0
+}
+
+// We extract floating-point element X by replicating (for elements other
+// than 0) and then taking a high subreg. The AddedComplexity counters the
+// 3 added by TableGen for the base register operand in VLGV-based integer
+// extractions and ensures that this version is strictly better.
+let AddedComplexity = 4 in {
+ def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)),
+ (EXTRACT_SUBREG VR128:$vec, subreg_r32)>; // element 0: subreg only, no VREPF
+ def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)),
+ (EXTRACT_SUBREG (VREPF VR128:$vec, imm32zx2:$index), subreg_r32)>; // replicate element $index first
+
+ def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), 0)),
+ (EXTRACT_SUBREG VR128:$vec, subreg_r64)>; // element 0: subreg only, no VREPG
+ def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), imm32zx1:$index)),
+ (EXTRACT_SUBREG (VREPG VR128:$vec, imm32zx1:$index), subreg_r64)>; // replicate element $index first
+}
+
+//===----------------------------------------------------------------------===//
+// String instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in { // B/H/F records pass 0/1/2; the *Z* records additionally pass a trailing 2
+ defm VFAE : TernaryOptVRRbSPairGeneric<"vfae", 0xE782>; // vfae family, opcode 0xE782
+ defm VFAEB : TernaryOptVRRbSPair<"vfaeb", 0xE782, int_s390_vfaeb,
+ z_vfae_cc, v128b, v128b, 0>;
+ defm VFAEH : TernaryOptVRRbSPair<"vfaeh", 0xE782, int_s390_vfaeh,
+ z_vfae_cc, v128h, v128h, 1>;
+ defm VFAEF : TernaryOptVRRbSPair<"vfaef", 0xE782, int_s390_vfaef,
+ z_vfae_cc, v128f, v128f, 2>;
+ defm VFAEZB : TernaryOptVRRbSPair<"vfaezb", 0xE782, int_s390_vfaezb,
+ z_vfaez_cc, v128b, v128b, 0, 2>;
+ defm VFAEZH : TernaryOptVRRbSPair<"vfaezh", 0xE782, int_s390_vfaezh,
+ z_vfaez_cc, v128h, v128h, 1, 2>;
+ defm VFAEZF : TernaryOptVRRbSPair<"vfaezf", 0xE782, int_s390_vfaezf,
+ z_vfaez_cc, v128f, v128f, 2, 2>;
+
+ defm VFEE : BinaryExtraVRRbSPairGeneric<"vfee", 0xE780>; // vfee family, opcode 0xE780
+ defm VFEEB : BinaryExtraVRRbSPair<"vfeeb", 0xE780, int_s390_vfeeb,
+ z_vfee_cc, v128b, v128b, 0>;
+ defm VFEEH : BinaryExtraVRRbSPair<"vfeeh", 0xE780, int_s390_vfeeh,
+ z_vfee_cc, v128h, v128h, 1>;
+ defm VFEEF : BinaryExtraVRRbSPair<"vfeef", 0xE780, int_s390_vfeef,
+ z_vfee_cc, v128f, v128f, 2>;
+ defm VFEEZB : BinaryVRRbSPair<"vfeezb", 0xE780, int_s390_vfeezb,
+ z_vfeez_cc, v128b, v128b, 0, 2>;
+ defm VFEEZH : BinaryVRRbSPair<"vfeezh", 0xE780, int_s390_vfeezh,
+ z_vfeez_cc, v128h, v128h, 1, 2>;
+ defm VFEEZF : BinaryVRRbSPair<"vfeezf", 0xE780, int_s390_vfeezf,
+ z_vfeez_cc, v128f, v128f, 2, 2>;
+
+ defm VFENE : BinaryExtraVRRbSPairGeneric<"vfene", 0xE781>; // vfene family, opcode 0xE781
+ defm VFENEB : BinaryExtraVRRbSPair<"vfeneb", 0xE781, int_s390_vfeneb,
+ z_vfene_cc, v128b, v128b, 0>;
+ defm VFENEH : BinaryExtraVRRbSPair<"vfeneh", 0xE781, int_s390_vfeneh,
+ z_vfene_cc, v128h, v128h, 1>;
+ defm VFENEF : BinaryExtraVRRbSPair<"vfenef", 0xE781, int_s390_vfenef,
+ z_vfene_cc, v128f, v128f, 2>;
+ defm VFENEZB : BinaryVRRbSPair<"vfenezb", 0xE781, int_s390_vfenezb,
+ z_vfenez_cc, v128b, v128b, 0, 2>;
+ defm VFENEZH : BinaryVRRbSPair<"vfenezh", 0xE781, int_s390_vfenezh,
+ z_vfenez_cc, v128h, v128h, 1, 2>;
+ defm VFENEZF : BinaryVRRbSPair<"vfenezf", 0xE781, int_s390_vfenezf,
+ z_vfenez_cc, v128f, v128f, 2, 2>;
+
+ defm VISTR : UnaryExtraVRRaSPairGeneric<"vistr", 0xE75C>; // vistr family, opcode 0xE75C (no *Z* records)
+ defm VISTRB : UnaryExtraVRRaSPair<"vistrb", 0xE75C, int_s390_vistrb,
+ z_vistr_cc, v128b, v128b, 0>;
+ defm VISTRH : UnaryExtraVRRaSPair<"vistrh", 0xE75C, int_s390_vistrh,
+ z_vistr_cc, v128h, v128h, 1>;
+ defm VISTRF : UnaryExtraVRRaSPair<"vistrf", 0xE75C, int_s390_vistrf,
+ z_vistr_cc, v128f, v128f, 2>;
+
+ defm VSTRC : QuaternaryOptVRRdSPairGeneric<"vstrc", 0xE78A>; // vstrc family, opcode 0xE78A
+ defm VSTRCB : QuaternaryOptVRRdSPair<"vstrcb", 0xE78A, int_s390_vstrcb,
+ z_vstrc_cc, v128b, v128b, 0>;
+ defm VSTRCH : QuaternaryOptVRRdSPair<"vstrch", 0xE78A, int_s390_vstrch,
+ z_vstrc_cc, v128h, v128h, 1>;
+ defm VSTRCF : QuaternaryOptVRRdSPair<"vstrcf", 0xE78A, int_s390_vstrcf,
+ z_vstrc_cc, v128f, v128f, 2>;
+ defm VSTRCZB : QuaternaryOptVRRdSPair<"vstrczb", 0xE78A, int_s390_vstrczb,
+ z_vstrcz_cc, v128b, v128b, 0, 2>;
+ defm VSTRCZH : QuaternaryOptVRRdSPair<"vstrczh", 0xE78A, int_s390_vstrczh,
+ z_vstrcz_cc, v128h, v128h, 1, 2>;
+ defm VSTRCZF : QuaternaryOptVRRdSPair<"vstrczf", 0xE78A, int_s390_vstrczf,
+ z_vstrcz_cc, v128f, v128f, 2, 2>;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
new file mode 100644
index 000000000000..ec8ce6e911fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -0,0 +1,146 @@
+//===-- SystemZLDCleanup.cpp - Clean up local-dynamic TLS accesses --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass combines multiple accesses to local-dynamic TLS variables so that
+// the TLS base address for the module is only fetched once per execution path
+// through the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class SystemZLDCleanup : public MachineFunctionPass { // Pass that rewrites repeated TLS_LDCALL instructions.
+public:
+ static char ID; // Pass identifier; defined as 0 after the class.
+ SystemZLDCleanup(const SystemZTargetMachine &tm) // tm is not used by this constructor.
+ : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) {}
+
+ StringRef getPassName() const override {
+ return "SystemZ Local Dynamic TLS Access Clean-up";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override; // Returns true if the function was changed.
+ void getAnalysisUsage(AnalysisUsage &AU) const override; // Requires MachineDominatorTree.
+
+private:
+ bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg); // Walks the dominator subtree rooted at Node.
+ MachineInstr *ReplaceTLSCall(MachineInstr *I, unsigned TLSBaseAddrReg); // Replaces I with a COPY into R2D.
+ MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg); // Creates the register and copies R2D into it.
+
+ const SystemZInstrInfo *TII; // Assigned at the start of runOnMachineFunction().
+ MachineFunction *MF; // The function being processed; assigned in runOnMachineFunction().
+};
+
+char SystemZLDCleanup::ID = 0;
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) { // Factory for the LD clean-up pass.
+ return new SystemZLDCleanup(TM); // TM is forwarded to the constructor, which does not use it.
+}
+
+void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const { // Declares what the pass needs and preserves.
+ AU.setPreservesCFG(); // The pass only inserts and erases instructions inside existing blocks.
+ AU.addRequired<MachineDominatorTree>(); // runOnMachineFunction() walks this tree from its root.
+ MachineFunctionPass::getAnalysisUsage(AU); // Chain to the base-class implementation.
+}
+
+bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) { // Returns true if F was modified.
+ if (skipFunction(*F.getFunction()))
+ return false;
+
+ TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+ MF = &F; // Remembered for SetRegister(), which needs the function's register info.
+
+ SystemZMachineFunctionInfo* MFI = F.getInfo<SystemZMachineFunctionInfo>();
+ if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+ // There is no point folding accesses unless there are at least two.
+ return false;
+ }
+
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ return VisitNode(DT->getRootNode(), 0); // 0 means no base-address register has been created yet.
+}
+
+// Visit the dominator subtree rooted at Node in pre-order.
+// If TLSBaseAddrReg is nonzero, then use that to replace any
+// TLS_LDCALL instructions. Otherwise, create the register
+// when the first such instruction is seen, and then use it
+// as we encounter more instructions. Returns true if anything was changed.
+bool SystemZLDCleanup::VisitNode(MachineDomTreeNode *Node,
+ unsigned TLSBaseAddrReg) {
+ MachineBasicBlock *BB = Node->getBlock();
+ bool Changed = false;
+
+ // Traverse the current block.
+ for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ switch (I->getOpcode()) {
+ case SystemZ::TLS_LDCALL:
+ if (TLSBaseAddrReg)
+ I = ReplaceTLSCall(&*I, TLSBaseAddrReg); // I now refers to the COPY that replaced the call.
+ else
+ I = SetRegister(&*I, &TLSBaseAddrReg); // I now refers to the COPY inserted after the call.
+ Changed = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Visit the children of this block in the dominator tree; each child receives TLSBaseAddrReg by value.
+ for (auto I = Node->begin(), E = Node->end(); I != E; ++I)
+ Changed |= VisitNode(*I, TLSBaseAddrReg);
+
+ return Changed;
+}
+
+// Replace the TLS_LDCALL instruction I with a copy from TLSBaseAddrReg to R2D,
+// returning the new instruction. I is erased and must not be used afterwards.
+MachineInstr *SystemZLDCleanup::ReplaceTLSCall(MachineInstr *I,
+ unsigned TLSBaseAddrReg) {
+ // Insert a COPY from TLSBaseAddrReg to R2D before I, reusing I's debug location.
+ MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), SystemZ::R2D)
+ .addReg(TLSBaseAddrReg);
+
+ // Erase the TLS_LDCALL instruction.
+ I->eraseFromParent();
+
+ return Copy;
+}
+
+// Create a virtual register in *TLSBaseAddrReg, and populate it by
+// inserting a copy instruction after I. Returns the new instruction. I itself is kept.
+MachineInstr *SystemZLDCleanup::SetRegister(MachineInstr *I,
+ unsigned *TLSBaseAddrReg) {
+ // Create a virtual register (GR64Bit class) for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ *TLSBaseAddrReg = RegInfo.createVirtualRegister(&SystemZ::GR64BitRegClass);
+
+ // Insert a copy from R2D to TLSBaseAddrReg, placed before the instruction that follows I.
+ MachineInstr *Next = I->getNextNode();
+ MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+ .addReg(SystemZ::R2D);
+
+ return Copy;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
new file mode 100644
index 000000000000..14ff6afbd4ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -0,0 +1,465 @@
+//===-- SystemZLongBranch.cpp - Branch lengthening for SystemZ ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass makes sure that all branches are in range. There are several ways
+// in which this could be done. One aggressive approach is to assume that all
+// branches are in range and successively replace those that turn out not
+// to be in range with a longer form (branch relaxation). A simple
+// implementation is to continually walk through the function relaxing
+// branches until no more changes are needed and a fixed point is reached.
+// However, in the pathological worst case, this implementation is
+// quadratic in the number of blocks; relaxing branch N can make branch N-1
+// go out of range, which in turn can make branch N-2 go out of range,
+// and so on.
+//
+// An alternative approach is to assume that all branches must be
+// converted to their long forms, then reinstate the short forms of
+// branches that, even under this pessimistic assumption, turn out to be
+// in range (branch shortening). This too can be implemented as a function
+// walk that is repeated until a fixed point is reached. In general,
+// the result of shortening is not as good as that of relaxation, and
+// shortening is also quadratic in the worst case; shortening branch N
+// can bring branch N-1 in range of the short form, which in turn can do
+// the same for branch N-2, and so on. The main advantage of shortening
+// is that each walk through the function produces valid code, so it is
+// possible to stop at any point after the first walk. The quadraticness
+// could therefore be handled with a maximum pass count, although the
+// question then becomes: what maximum count should be used?
+//
+// On SystemZ, long branches are only needed for functions bigger than 64k,
+// which are relatively rare to begin with, and the long branch sequences
+// are actually relatively cheap. It therefore doesn't seem worth spending
+// much compilation time on the problem. Instead, the approach we take is:
+//
+// (1) Work out the address that each block would have if no branches
+// need relaxing. Exit the pass early if all branches are in range
+// according to this assumption.
+//
+// (2) Work out the address that each block would have if all branches
+// need relaxing.
+//
+// (3) Walk through the block calculating the final address of each instruction
+// and relaxing those that need to be relaxed. For backward branches,
+// this check uses the final address of the target block, as calculated
+// earlier in the walk. For forward branches, this check uses the
+// address of the target block that was calculated in (2). Both checks
+// give a conservatively-correct range.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-long-branch"
+
+STATISTIC(LongBranches, "Number of long branches.");
+
+namespace {
+// Represents positional information about a basic block. The MBBs vector holds one entry per block, indexed by block number.
+struct MBBInfo {
+ // The address that we currently assume the block has.
+ uint64_t Address; // Written by skipNonTerminators().
+
+ // The size of the block in bytes, excluding terminators.
+ // This value never changes.
+ uint64_t Size; // Accumulated in initMBBInfo().
+
+ // The minimum alignment of the block, as a log2 value.
+ // This value never changes.
+ unsigned Alignment; // Copied from MachineBasicBlock::getAlignment() in initMBBInfo().
+
+ // The number of terminators in this block. This value never changes.
+ unsigned NumTerminators; // Counted in initMBBInfo(); debug values are not counted.
+
+ MBBInfo()
+ : Address(0), Size(0), Alignment(0), NumTerminators(0) {}
+};
+
+// Represents the state of a block terminator. Instances are created by describeTerminator().
+struct TerminatorInfo {
+ // If this terminator is a relaxable branch, this points to the branch
+ // instruction, otherwise it is null.
+ MachineInstr *Branch;
+
+ // The address that we currently assume the terminator has; zero until skipTerminator() assigns it.
+ uint64_t Address;
+
+ // The current size of the terminator in bytes.
+ uint64_t Size;
+
+ // If Branch is nonnull, this is the number of the target block,
+ // otherwise it is unused.
+ unsigned TargetBlock;
+
+ // If Branch is nonnull, this is the length of the longest relaxed form,
+ // otherwise it is zero.
+ unsigned ExtraRelaxSize;
+
+ TerminatorInfo() : Branch(nullptr), Address(0), Size(0), TargetBlock(0), // Address was previously left uninitialized.
+ ExtraRelaxSize(0) {}
+};
+
+// Used to keep track of the current position while iterating over the blocks.
+struct BlockPosition {
+ // The address that we assume this position has.
+ uint64_t Address; // Starts at 0 and is advanced by skipNonTerminators() and skipTerminator().
+
+ // The number of low bits in Address that are known to be the same
+ // as the runtime address.
+ unsigned KnownBits; // Compared against MBBInfo::Alignment (a log2 value) in skipNonTerminators().
+
+ BlockPosition(unsigned InitialAlignment) // Callers pass MF->getAlignment() -- presumably a log2 value; TODO confirm.
+ : Address(0), KnownBits(InitialAlignment) {}
+};
+
+class SystemZLongBranch : public MachineFunctionPass { // Pass that relaxes branches whose targets are out of range.
+public:
+ static char ID; // Pass identifier; defined as 0 after the class.
+ SystemZLongBranch(const SystemZTargetMachine &tm) // tm is not used by this constructor.
+ : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) {} // MF was previously left uninitialized; both are reassigned in runOnMachineFunction().
+
+ StringRef getPassName() const override { return "SystemZ Long Branch"; }
+
+ bool runOnMachineFunction(MachineFunction &F) override; // Returns true if any branch was relaxed.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs); // Declares NoVRegs as a required property.
+ }
+
+private:
+ void skipNonTerminators(BlockPosition &Position, MBBInfo &Block); // Records Block's address and advances Position.
+ void skipTerminator(BlockPosition &Position, TerminatorInfo &Terminator,
+ bool AssumeRelaxed); // Records Terminator's address and advances Position.
+ TerminatorInfo describeTerminator(MachineInstr &MI); // Builds the TerminatorInfo for MI.
+ uint64_t initMBBInfo(); // Fills MBBs and Terminators; returns the function size without relaxation.
+ bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address); // Range check for one terminator.
+ bool mustRelaxABranch(); // True if any terminator fails the range check.
+ void setWorstCaseAddresses(); // Recomputes addresses assuming every branch is relaxed.
+ void splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode); // Rewrites MI as an add followed by BRCL.
+ void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode); // Rewrites MI as a compare followed by BRCL.
+ void relaxBranch(TerminatorInfo &Terminator); // Relaxes one branch and updates its TerminatorInfo.
+ void relaxBranches(); // Final walk that relaxes every branch that needs it.
+
+ const SystemZInstrInfo *TII; // Assigned at the start of runOnMachineFunction().
+ MachineFunction *MF; // The function being processed; assigned in runOnMachineFunction().
+ SmallVector<MBBInfo, 16> MBBs; // One entry per basic block, indexed by block number.
+ SmallVector<TerminatorInfo, 16> Terminators; // All terminators, appended in block order by initMBBInfo().
+};
+
+char SystemZLongBranch::ID = 0;
+
+const uint64_t MaxBackwardRange = 0x10000; // Largest (branch address - target address), in bytes, that mustRelaxBranch() treats as in range.
+const uint64_t MaxForwardRange = 0xfffe; // Largest (target address - branch address), in bytes, that mustRelaxBranch() treats as in range.
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) { // Factory for the long-branch pass.
+ return new SystemZLongBranch(TM); // TM is forwarded to the constructor, which does not use it.
+}
+
+// Position describes the state immediately before Block. Update Block
+// accordingly and move Position to the end of the block's non-terminator
+// instructions. Only Block.Address is written; Block.Size and Block.Alignment are read.
+void SystemZLongBranch::skipNonTerminators(BlockPosition &Position,
+ MBBInfo &Block) {
+ if (Block.Alignment > Position.KnownBits) {
+ // When calculating the address of Block, we need to conservatively
+ // assume that Block had the worst possible misalignment.
+ Position.Address += ((uint64_t(1) << Block.Alignment) -
+ (uint64_t(1) << Position.KnownBits));
+ Position.KnownBits = Block.Alignment; // From here on, this many low address bits are treated as known.
+ }
+
+ // Align the addresses.
+ uint64_t AlignMask = (uint64_t(1) << Block.Alignment) - 1;
+ Position.Address = (Position.Address + AlignMask) & ~AlignMask; // Round up to a multiple of 1 << Block.Alignment.
+
+ // Record the block's position.
+ Block.Address = Position.Address;
+
+ // Move past the non-terminators in the block.
+ Position.Address += Block.Size;
+}
+
+// Position describes the state immediately before Terminator.
+// Update Terminator accordingly and move Position past it.
+// Assume that Terminator will be relaxed if AssumeRelaxed. Only Terminator.Address is written.
+void SystemZLongBranch::skipTerminator(BlockPosition &Position,
+ TerminatorInfo &Terminator,
+ bool AssumeRelaxed) {
+ Terminator.Address = Position.Address;
+ Position.Address += Terminator.Size; // Current (possibly already relaxed) size.
+ if (AssumeRelaxed)
+ Position.Address += Terminator.ExtraRelaxSize; // Zero for terminators that are not relaxable branches.
+}
+
+// Return a description of terminator instruction MI. Branch and TargetBlock are only filled in for conditional or unconditional branches.
+TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr &MI) {
+ TerminatorInfo Terminator;
+ Terminator.Size = TII->getInstSizeInBytes(MI);
+ if (MI.isConditionalBranch() || MI.isUnconditionalBranch()) {
+ switch (MI.getOpcode()) {
+ case SystemZ::J:
+ // Relaxes to JG, which is 2 bytes longer.
+ Terminator.ExtraRelaxSize = 2;
+ break;
+ case SystemZ::BRC:
+ // Relaxes to BRCL, which is 2 bytes longer.
+ Terminator.ExtraRelaxSize = 2;
+ break;
+ case SystemZ::BRCT:
+ case SystemZ::BRCTG:
+ // Relaxes to A(G)HI and BRCL, which is 6 bytes longer.
+ Terminator.ExtraRelaxSize = 6;
+ break;
+ case SystemZ::BRCTH:
+ // Never needs to be relaxed.
+ Terminator.ExtraRelaxSize = 0;
+ break;
+ case SystemZ::CRJ:
+ case SystemZ::CLRJ:
+ // Relaxes to a C(L)R/BRCL sequence, which is 2 bytes longer.
+ Terminator.ExtraRelaxSize = 2;
+ break;
+ case SystemZ::CGRJ:
+ case SystemZ::CLGRJ:
+ // Relaxes to a C(L)GR/BRCL sequence, which is 4 bytes longer.
+ Terminator.ExtraRelaxSize = 4;
+ break;
+ case SystemZ::CIJ:
+ case SystemZ::CGIJ:
+ // Relaxes to a C(G)HI/BRCL sequence, which is 4 bytes longer.
+ Terminator.ExtraRelaxSize = 4;
+ break;
+ case SystemZ::CLIJ:
+ case SystemZ::CLGIJ:
+ // Relaxes to a CL(G)FI/BRCL sequence, which is 6 bytes longer.
+ Terminator.ExtraRelaxSize = 6;
+ break;
+ default:
+ llvm_unreachable("Unrecognized branch instruction");
+ }
+ Terminator.Branch = &MI; // Marks this terminator as a branch for mustRelaxBranch().
+ Terminator.TargetBlock =
+ TII->getBranchInfo(MI).Target->getMBB()->getNumber(); // Block number, later used to index MBBs.
+ }
+ return Terminator;
+}
+
+// Fill MBBs and Terminators, setting the addresses on the assumption
+// that no branches need relaxation. Return the size of the function under
+// this assumption (the address reached after the last terminator).
+uint64_t SystemZLongBranch::initMBBInfo() {
+ MF->RenumberBlocks(); // Done before MF->getBlockNumbered(I) is used for I in [0, NumBlocks).
+ unsigned NumBlocks = MF->size();
+
+ MBBs.clear();
+ MBBs.resize(NumBlocks);
+
+ Terminators.clear();
+ Terminators.reserve(NumBlocks);
+
+ BlockPosition Position(MF->getAlignment());
+ for (unsigned I = 0; I < NumBlocks; ++I) {
+ MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+ MBBInfo &Block = MBBs[I];
+
+ // Record the alignment, for quick access.
+ Block.Alignment = MBB->getAlignment();
+
+ // Calculate the size of the fixed part of the block.
+ MachineBasicBlock::iterator MI = MBB->begin();
+ MachineBasicBlock::iterator End = MBB->end();
+ while (MI != End && !MI->isTerminator()) {
+ Block.Size += TII->getInstSizeInBytes(*MI);
+ ++MI;
+ }
+ skipNonTerminators(Position, Block);
+
+ // Add the terminators. Debug values found here are skipped and not counted.
+ while (MI != End) {
+ if (!MI->isDebugValue()) {
+ assert(MI->isTerminator() && "Terminator followed by non-terminator");
+ Terminators.push_back(describeTerminator(*MI));
+ skipTerminator(Position, Terminators.back(), false); // false: assume the short form.
+ ++Block.NumTerminators;
+ }
+ ++MI;
+ }
+ }
+
+ return Position.Address;
+}
+
+// Return true if, under current assumptions, Terminator would need to be
+// relaxed if it were placed at address Address. Always false when Terminator.Branch is null.
+bool SystemZLongBranch::mustRelaxBranch(const TerminatorInfo &Terminator,
+ uint64_t Address) {
+ if (!Terminator.Branch)
+ return false;
+
+ const MBBInfo &Target = MBBs[Terminator.TargetBlock];
+ if (Address >= Target.Address) { // Target is at or before Address: backward branch.
+ if (Address - Target.Address <= MaxBackwardRange)
+ return false;
+ } else { // Target is after Address: forward branch.
+ if (Target.Address - Address <= MaxForwardRange)
+ return false;
+ }
+
+ return true;
+}
+
+// Return true if, under current assumptions, any terminator needs
+// to be relaxed. Each terminator is tested at its own recorded Address.
+bool SystemZLongBranch::mustRelaxABranch() {
+ for (auto &Terminator : Terminators)
+ if (mustRelaxBranch(Terminator, Terminator.Address))
+ return true;
+ return false;
+}
+
+// Set the address of each block on the assumption that all branches
+// must be long. Terminator addresses are updated in the same walk.
+void SystemZLongBranch::setWorstCaseAddresses() {
+ SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin(); // Terminators are stored in block order.
+ BlockPosition Position(MF->getAlignment());
+ for (auto &Block : MBBs) {
+ skipNonTerminators(Position, Block);
+ for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
+ skipTerminator(Position, *TI, true); // true: add ExtraRelaxSize for every terminator.
+ ++TI;
+ }
+ }
+}
+
+// Split BRANCH ON COUNT MI into the addition given by AddOpcode followed
+// by a BRCL on the result. Both new instructions are inserted before MI, and MI is then erased.
+void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI,
+ unsigned AddOpcode) {
+ MachineBasicBlock *MBB = MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(AddOpcode))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1))
+ .addImm(-1); // Add -1, i.e. decrement the count.
+ MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+ .addImm(SystemZ::CCMASK_ICMP)
+ .addImm(SystemZ::CCMASK_CMP_NE)
+ .addOperand(MI->getOperand(2)); // Operand 2 of MI -- presumably the branch target; TODO confirm.
+ // The implicit use of CC is a killing use.
+ BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
+ MI->eraseFromParent();
+}
+
+// Split MI into the comparison given by CompareOpcode followed by
+// a BRCL on the result. Both new instructions are inserted before MI, and MI is then erased.
+void SystemZLongBranch::splitCompareBranch(MachineInstr *MI,
+ unsigned CompareOpcode) {
+ MachineBasicBlock *MBB = MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(CompareOpcode))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1)); // Operands 0 and 1 of MI become the compare operands.
+ MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+ .addImm(SystemZ::CCMASK_ICMP)
+ .addOperand(MI->getOperand(2))
+ .addOperand(MI->getOperand(3)); // Operands 2 and 3 of MI are passed through to BRCL unchanged.
+ // The implicit use of CC is a killing use.
+ BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
+ MI->eraseFromParent();
+}
+
+// Relax the branch described by Terminator. Terminator.Branch must be nonnull on entry; it is null on return.
+void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
+ MachineInstr *Branch = Terminator.Branch;
+ switch (Branch->getOpcode()) {
+ case SystemZ::J:
+ Branch->setDesc(TII->get(SystemZ::JG)); // Opcode is swapped in place.
+ break;
+ case SystemZ::BRC:
+ Branch->setDesc(TII->get(SystemZ::BRCL)); // Opcode is swapped in place.
+ break;
+ case SystemZ::BRCT:
+ splitBranchOnCount(Branch, SystemZ::AHI);
+ break;
+ case SystemZ::BRCTG:
+ splitBranchOnCount(Branch, SystemZ::AGHI);
+ break;
+ case SystemZ::CRJ:
+ splitCompareBranch(Branch, SystemZ::CR);
+ break;
+ case SystemZ::CGRJ:
+ splitCompareBranch(Branch, SystemZ::CGR);
+ break;
+ case SystemZ::CIJ:
+ splitCompareBranch(Branch, SystemZ::CHI);
+ break;
+ case SystemZ::CGIJ:
+ splitCompareBranch(Branch, SystemZ::CGHI);
+ break;
+ case SystemZ::CLRJ:
+ splitCompareBranch(Branch, SystemZ::CLR);
+ break;
+ case SystemZ::CLGRJ:
+ splitCompareBranch(Branch, SystemZ::CLGR);
+ break;
+ case SystemZ::CLIJ:
+ splitCompareBranch(Branch, SystemZ::CLFI);
+ break;
+ case SystemZ::CLGIJ:
+ splitCompareBranch(Branch, SystemZ::CLGFI);
+ break;
+ default:
+ llvm_unreachable("Unrecognized branch");
+ }
+
+ Terminator.Size += Terminator.ExtraRelaxSize; // The terminator now has its long-form size.
+ Terminator.ExtraRelaxSize = 0;
+ Terminator.Branch = nullptr; // mustRelaxBranch() returns false for it from now on; the split cases erased the original instruction.
+
+ ++LongBranches;
+}
+
+// Run a shortening pass and relax any branches that need to be relaxed. Position tracks the addresses being finalized in this walk.
+void SystemZLongBranch::relaxBranches() {
+ SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
+ BlockPosition Position(MF->getAlignment());
+ for (auto &Block : MBBs) {
+ skipNonTerminators(Position, Block); // Overwrites Block.Address with the address computed in this walk.
+ for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
+ assert(Position.Address <= TI->Address &&
+ "Addresses shouldn't go forwards");
+ if (mustRelaxBranch(*TI, Position.Address)) // Blocks not yet visited still hold their previously recorded addresses.
+ relaxBranch(*TI);
+ skipTerminator(Position, *TI, false); // false: Size already includes the extra bytes if the branch was just relaxed.
+ ++TI;
+ }
+ }
+}
+
+bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) { // Returns true if the relaxation walk was run.
+ TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+ MF = &F;
+ uint64_t Size = initMBBInfo(); // Function size assuming no branch is relaxed.
+ if (Size <= MaxForwardRange || !mustRelaxABranch()) // Small enough, or every branch is already in range.
+ return false;
+
+ setWorstCaseAddresses(); // Pessimistic layout used for forward-branch checks.
+ relaxBranches();
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
new file mode 100644
index 000000000000..2655e4866b20
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -0,0 +1,103 @@
+//===-- SystemZMCInstLower.cpp - Lower MachineInstr to MCInst -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCInstLower.h"
+#include "SystemZAsmPrinter.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+// Return the VK_* enumeration for MachineOperand target flags Flags. Only the MO_SYMBOL_MODIFIER bits are examined.
+static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
+ switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
+ case 0:
+ return MCSymbolRefExpr::VK_None;
+ case SystemZII::MO_GOT:
+ return MCSymbolRefExpr::VK_GOT;
+ case SystemZII::MO_INDNTPOFF:
+ return MCSymbolRefExpr::VK_INDNTPOFF;
+ }
+ llvm_unreachable("Unrecognised MO_ACCESS_MODEL"); // NOTE(review): message says MO_ACCESS_MODEL but the switch masks with MO_SYMBOL_MODIFIER.
+}
+
+SystemZMCInstLower::SystemZMCInstLower(MCContext &ctx, // Both arguments are stored as references.
+ SystemZAsmPrinter &asmprinter)
+ : Ctx(ctx), AsmPrinter(asmprinter) {}
+
+const MCExpr * // Symbol reference for MO with variant Kind, plus MO's offset when one applies.
+SystemZMCInstLower::getExpr(const MachineOperand &MO,
+ MCSymbolRefExpr::VariantKind Kind) const {
+ const MCSymbol *Symbol;
+ bool HasOffset = true; // Cleared for basic-block and jump-table operands below.
+ switch (MO.getType()) {
+ case MachineOperand::MO_MachineBasicBlock:
+ Symbol = MO.getMBB()->getSymbol();
+ HasOffset = false;
+ break;
+
+ case MachineOperand::MO_GlobalAddress:
+ Symbol = AsmPrinter.getSymbol(MO.getGlobal());
+ break;
+
+ case MachineOperand::MO_ExternalSymbol:
+ Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+ break;
+
+ case MachineOperand::MO_JumpTableIndex:
+ Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+ HasOffset = false;
+ break;
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+ break;
+
+ case MachineOperand::MO_BlockAddress:
+ Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+ break;
+
+ default:
+ llvm_unreachable("unknown operand type");
+ }
+ const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Kind, Ctx);
+ if (HasOffset)
+ if (int64_t Offset = MO.getOffset()) { // A zero offset leaves Expr as the bare symbol reference.
+ const MCExpr *OffsetExpr = MCConstantExpr::create(Offset, Ctx);
+ Expr = MCBinaryExpr::createAdd(Expr, OffsetExpr, Ctx);
+ }
+ return Expr;
+}
+
+MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const { // Converts one MachineOperand to an MCOperand.
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ return MCOperand::createReg(MO.getReg());
+
+ case MachineOperand::MO_Immediate:
+ return MCOperand::createImm(MO.getImm());
+
+ default: { // Every other operand type is lowered as a symbolic expression.
+ MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
+ return MCOperand::createExpr(getExpr(MO, Kind));
+ }
+ }
+}
+
+void SystemZMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { // Copies MI's opcode and lowered operands into OutMI.
+ OutMI.setOpcode(MI->getOpcode());
+ for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ // Ignore all implicit register operands; every other operand is lowered in order.
+ if (!MO.isReg() || !MO.isImplicit())
+ OutMI.addOperand(lowerOperand(MO));
+ }
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
new file mode 100644
index 000000000000..7173cfa42959
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -0,0 +1,44 @@
+//===-- SystemZMCInstLower.h - Lower MachineInstr to MCInst ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCInst;
+class MCOperand;
+class MachineInstr;
+class MachineOperand;
+class Mangler;
+class SystemZAsmPrinter;
+
+class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower { // Lowers MachineInstrs and their operands to MC-layer objects.
+ MCContext &Ctx; // Passed to the MCExpr create functions in getExpr().
+ SystemZAsmPrinter &AsmPrinter; // Used by getExpr() to look up symbols.
+
+public:
+ SystemZMCInstLower(MCContext &ctx, SystemZAsmPrinter &asmPrinter);
+
+ // Lower MachineInstr MI to MCInst OutMI. Implicit register operands are skipped.
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ // Return an MCOperand for MO.
+ MCOperand lowerOperand(const MachineOperand& MO) const;
+
+ // Return an MCExpr for symbolic operand MO with variant kind Kind.
+ const MCExpr *getExpr(const MachineOperand &MO,
+ MCSymbolRefExpr::VariantKind Kind) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..1a7c0d7f687a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
@@ -0,0 +1,17 @@
+//=== SystemZMachineFunctionInfo.cpp - SystemZ machine function info ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+
+using namespace llvm;
+
+
+// Out-of-line definition of the virtual anchor() method; pins the vtable to this file.
+void SystemZMachineFunctionInfo::anchor() {}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
new file mode 100644
index 000000000000..4f64f4c65f1d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -0,0 +1,79 @@
+//=== SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class SystemZMachineFunctionInfo : public MachineFunctionInfo { // Per-function SystemZ state; all fields start at 0/false.
+ virtual void anchor(); // Defined out of line in SystemZMachineFunctionInfo.cpp.
+ unsigned LowSavedGPR;
+ unsigned HighSavedGPR;
+ unsigned VarArgsFirstGPR;
+ unsigned VarArgsFirstFPR;
+ unsigned VarArgsFrameIndex;
+ unsigned RegSaveFrameIndex;
+ int FramePointerSaveIndex;
+ bool ManipulatesSP;
+ unsigned NumLocalDynamics; // Only changed through incNumLocalDynamicTLSAccesses().
+
+public:
+ explicit SystemZMachineFunctionInfo(MachineFunction &MF) // MF is not used by this constructor.
+ : LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0), VarArgsFirstFPR(0),
+ VarArgsFrameIndex(0), RegSaveFrameIndex(0), FramePointerSaveIndex(0),
+ ManipulatesSP(false), NumLocalDynamics(0) {}
+
+ // Get and set the first call-saved GPR that should be saved and restored
+ // by this function. This is 0 if no GPRs need to be saved or restored.
+ unsigned getLowSavedGPR() const { return LowSavedGPR; }
+ void setLowSavedGPR(unsigned Reg) { LowSavedGPR = Reg; }
+
+ // Get and set the last call-saved GPR that should be saved and restored
+ // by this function.
+ unsigned getHighSavedGPR() const { return HighSavedGPR; }
+ void setHighSavedGPR(unsigned Reg) { HighSavedGPR = Reg; }
+
+ // Get and set the number of fixed (as opposed to variable) arguments
+ // that are passed in GPRs to this function.
+ unsigned getVarArgsFirstGPR() const { return VarArgsFirstGPR; }
+ void setVarArgsFirstGPR(unsigned GPR) { VarArgsFirstGPR = GPR; }
+
+ // Likewise FPRs.
+ unsigned getVarArgsFirstFPR() const { return VarArgsFirstFPR; }
+ void setVarArgsFirstFPR(unsigned FPR) { VarArgsFirstFPR = FPR; }
+
+ // Get and set the frame index of the first stack vararg.
+ unsigned getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(unsigned FI) { VarArgsFrameIndex = FI; }
+
+ // Get and set the frame index of the register save area
+ // (i.e. the incoming stack pointer).
+ unsigned getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+ void setRegSaveFrameIndex(unsigned FI) { RegSaveFrameIndex = FI; }
+
+ // Get and set the frame index of where the old frame pointer is stored.
+ int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+ void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+
+ // Get and set whether the function directly manipulates the stack pointer,
+ // e.g. through STACKSAVE or STACKRESTORE.
+ bool getManipulatesSP() const { return ManipulatesSP; }
+ void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; }
+
+ // Get and increment the count of local-dynamic TLS accesses; there is no way to reset it.
+ unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
new file mode 100644
index 000000000000..ab6020f3f189
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -0,0 +1,153 @@
+//-- SystemZMachineScheduler.cpp - SystemZ Scheduler Interface -*- C++ -*---==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// -------------------------- Post RA scheduling ---------------------------- //
+// SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
+// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
+// implementation that looks to optimize decoder grouping and balance the
+// usage of processor resources.
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineScheduler.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+#ifndef NDEBUG
+// Print the set of SUs to dbgs() as "{SU, SU, ...}" (debug builds only).
+void SystemZPostRASchedStrategy::SUSet::
+dump(SystemZHazardRecognizer &HazardRec) {
+ dbgs() << "{";
+ for (auto &SU : *this) {
+ HazardRec.dumpSU(SU, dbgs());
+ if (SU != *rbegin()) // No separator after the last element.
+ dbgs() << ", ";
+ }
+ dbgs() << "}\n";
+}
+#endif
+
+SystemZPostRASchedStrategy::
+SystemZPostRASchedStrategy(const MachineSchedContext *C)
+ : DAG(nullptr), HazardRec(C) {}
+
+void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) {
+ DAG = dag;
+ HazardRec.setDAG(dag);
+ HazardRec.Reset();
+}
+
+// Pick the next node to schedule, or return nullptr if Available is empty.
+SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) {
+ // Only scheduling top-down.
+ IsTopNode = true;
+
+ if (Available.empty())
+ return nullptr;
+
+ // If only one choice, return it.
+ if (Available.size() == 1) {
+ DEBUG (dbgs() << "+++ Only one: ";
+ HazardRec.dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
+ return *Available.begin();
+ }
+
+ // All nodes that are possible to schedule are stored in the
+ // Available set.
+ DEBUG(dbgs() << "+++ Available: "; Available.dump(HazardRec););
+
+ Candidate Best;
+ for (auto *SU : Available) {
+
+ // SU is the next candidate to be compared against current Best.
+ Candidate c(SU, HazardRec);
+
+ // Remember which SU is the best candidate.
+ if (Best.SU == nullptr || c < Best) {
+ Best = c;
+ DEBUG(dbgs() << "+++ Best sofar: ";
+ HazardRec.dumpSU(Best.SU, dbgs());
+ if (Best.GroupingCost != 0)
+ dbgs() << "\tGrouping cost:" << Best.GroupingCost;
+ if (Best.ResourcesCost != 0)
+ dbgs() << " Resource cost:" << Best.ResourcesCost;
+ dbgs() << " Height:" << Best.SU->getHeight();
+ dbgs() << "\n";);
+ }
+
+ // Once we know we have seen all SUs that affect grouping or use unbuffered
+ // resources, we can stop iterating if Best looks good.
+ if (!SU->isScheduleHigh && Best.noCost())
+ break;
+ }
+
+ assert (Best.SU != nullptr);
+ return Best.SU;
+}
+
+SystemZPostRASchedStrategy::Candidate::
+Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec) : Candidate() {
+ SU = SU_;
+
+ // Check the grouping cost. For a node that must begin / end a
+ // group, it is positive if it would do so prematurely, or negative
+ // if it would fit naturally into the schedule.
+ GroupingCost = HazardRec.groupingCost(SU);
+
+ // Check the resources cost for this SU.
+ ResourcesCost = HazardRec.resourcesCost(SU);
+}
+
+bool SystemZPostRASchedStrategy::Candidate::
+operator<(const Candidate &other) {
+
+ // Check decoder grouping: the lower grouping cost wins.
+ if (GroupingCost < other.GroupingCost)
+ return true;
+ if (GroupingCost > other.GroupingCost)
+ return false;
+
+ // Compare the use of resources: the lower resources cost wins.
+ if (ResourcesCost < other.ResourcesCost)
+ return true;
+ if (ResourcesCost > other.ResourcesCost)
+ return false;
+
+ // Otherwise the SU with the greater height is generally better.
+ if (SU->getHeight() > other.SU->getHeight())
+ return true;
+ if (SU->getHeight() < other.SU->getHeight())
+ return false;
+
+ // If all else is equal, fall back to original order (lower NodeNum first).
+ if (SU->NodeNum < other.SU->NodeNum)
+ return true;
+
+ return false;
+}
+
+void SystemZPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+ DEBUG(dbgs() << "+++ Scheduling SU(" << SU->NodeNum << ")\n";);
+
+ // Remove SU from the Available set and update HazardRec (IsTopNode unused).
+ Available.erase(SU);
+ HazardRec.EmitInstruction(SU);
+}
+
+void SystemZPostRASchedStrategy::releaseTopNode(SUnit *SU) {
+ // Set isScheduleHigh flag on all SUs that we want to consider first in
+ // pickNode().
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ bool AffectsGrouping = (SC->isValid() && (SC->BeginGroup || SC->EndGroup));
+ SU->isScheduleHigh = (AffectsGrouping || SU->isUnbuffered);
+
+ // Put all released SUs in the Available set.
+ Available.insert(SU);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
new file mode 100644
index 000000000000..b919758b70e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -0,0 +1,112 @@
+//==-- SystemZMachineScheduler.h - SystemZ Scheduler Interface -*- C++ -*---==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// -------------------------- Post RA scheduling ---------------------------- //
+// SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
+// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
+// implementation that looks to optimize decoder grouping and balance the
+// usage of processor resources.
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstrInfo.h"
+#include "SystemZHazardRecognizer.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Support/Debug.h"
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
+
+using namespace llvm;
+
+namespace llvm {
+
+/// A MachineSchedStrategy implementation for SystemZ post RA scheduling.
+class SystemZPostRASchedStrategy : public MachineSchedStrategy {
+ ScheduleDAGMI *DAG;
+
+ /// A candidate during instruction evaluation.
+ struct Candidate {
+ SUnit *SU;
+
+ /// The decoding cost.
+ int GroupingCost;
+
+ /// The processor resources cost.
+ int ResourcesCost;
+
+ Candidate() : SU(nullptr), GroupingCost(0), ResourcesCost(0) {}
+ Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec);
+
+ // Compare two candidates; true means this one is the better choice.
+ bool operator<(const Candidate &other);
+
+ // Check if this node is free of cost ("as good as any"); read-only query.
+ bool inline noCost() const {
+ return (GroupingCost <= 0 && !ResourcesCost);
+ }
+ };
+
+ // A sorter for the Available set: SUs with isScheduleHigh set come first,
+ // then those with greater height, then those with lower NodeNum.
+ struct SUSorter {
+ bool operator() (SUnit *lhs, SUnit *rhs) const {
+ if (lhs->isScheduleHigh && !rhs->isScheduleHigh)
+ return true;
+ if (!lhs->isScheduleHigh && rhs->isScheduleHigh)
+ return false;
+
+ if (lhs->getHeight() > rhs->getHeight())
+ return true;
+ else if (lhs->getHeight() < rhs->getHeight())
+ return false;
+
+ return (lhs->NodeNum < rhs->NodeNum);
+ }
+ };
+ // A set of SUs ordered by SUSorter, with a debug-only dump method.
+ struct SUSet : std::set<SUnit*, SUSorter> {
+ #ifndef NDEBUG
+ void dump(SystemZHazardRecognizer &HazardRec);
+ #endif
+ };
+
+ /// The set of available SUs to schedule next.
+ SUSet Available;
+
+ // HazardRecognizer that tracks the scheduler state for the current
+ // region.
+ SystemZHazardRecognizer HazardRec;
+
+ public:
+ SystemZPostRASchedStrategy(const MachineSchedContext *C);
+
+ /// PostRA scheduling does not track pressure.
+ bool shouldTrackPressure() const override { return false; }
+
+ /// Initialize the strategy after building the DAG for a new region.
+ void initialize(ScheduleDAGMI *dag) override;
+
+ /// Pick the next node to schedule, or return NULL.
+ SUnit *pickNode(bool &IsTopNode) override;
+
+ /// ScheduleDAGMI has scheduled an instruction - tell HazardRec
+ /// about it.
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+
+ /// SU has had all predecessor dependencies resolved. Put it into
+ /// Available.
+ void releaseTopNode(SUnit *SU) override;
+
+ /// Currently only scheduling top-down, so this method is empty.
+ void releaseBottomNode(SUnit *SU) override {}
+};
+
+} // namespace llvm
+
+#endif /* LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H */
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
new file mode 100644
index 000000000000..7bb4fe5afb3f
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
@@ -0,0 +1,593 @@
+//===-- SystemZOperands.td - SystemZ instruction operands ----*- tblgen-*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Class definitions
+//===----------------------------------------------------------------------===//
+
+class ImmediateAsmOperand<string name>
+ : AsmOperandClass {
+ let Name = name;
+ let RenderMethod = "addImmOperands";
+}
+class ImmediateTLSAsmOperand<string name>
+ : AsmOperandClass {
+ let Name = name;
+ let RenderMethod = "addImmTLSOperands";
+}
+
+// Constructs both a DAG pattern and instruction operand for an immediate
+// of type VT. PRED returns true if a node is acceptable and XFORM returns
+// the operand value associated with the node. ASMOP is the name of the
+// associated asm operand, and also forms the basis of the asm print method.
+class Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop>
+ : PatLeaf<(vt imm), pred, xform>, Operand<vt> {
+ let PrintMethod = "print"##asmop##"Operand";
+ let DecoderMethod = "decode"##asmop##"Operand";
+ let ParserMatchClass = !cast<AsmOperandClass>(asmop);
+}
+
+// Constructs an asm operand for a PC-relative address. SIZE says how
+// many bits there are.
+class PCRelAsmOperand<string size> : ImmediateAsmOperand<"PCRel"##size> {
+ let PredicateMethod = "isImm";
+ let ParserMethod = "parsePCRel"##size;
+}
+class PCRelTLSAsmOperand<string size>
+ : ImmediateTLSAsmOperand<"PCRelTLS"##size> {
+ let PredicateMethod = "isImmTLS";
+ let ParserMethod = "parsePCRelTLS"##size;
+}
+
+// Constructs an operand for a PC-relative address with address type VT.
+// ASMOP is the associated asm operand.
+class PCRelOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
+ let PrintMethod = "printPCRelOperand";
+ let ParserMatchClass = asmop;
+}
+class PCRelTLSOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
+ let PrintMethod = "printPCRelTLSOperand";
+ let ParserMatchClass = asmop;
+}
+
+// Constructs both a DAG pattern and instruction operand for a PC-relative
+// address with address size VT. SELF is the name of the operand and
+// ASMOP is the associated asm operand.
+class PCRelAddress<ValueType vt, string self, AsmOperandClass asmop>
+ : ComplexPattern<vt, 1, "selectPCRelAddress",
+ [z_pcrel_wrapper, z_pcrel_offset]>,
+ PCRelOperand<vt, asmop> {
+ let MIOperandInfo = (ops !cast<Operand>(self));
+}
+
+// Constructs an AsmOperandClass for addressing mode FORMAT, treating the
+// registers as having BITSIZE bits and displacements as having DISPSIZE bits.
+// LENGTH is "LenN" for addresses with an N-bit length field, otherwise it
+// is "".
+class AddressAsmOperand<string format, string bitsize, string dispsize,
+ string length = "">
+ : AsmOperandClass {
+ let Name = format##bitsize##"Disp"##dispsize##length;
+ let ParserMethod = "parse"##format##bitsize;
+ let RenderMethod = "add"##format##"Operands";
+}
+
+// Constructs an instruction operand for an addressing mode. FORMAT,
+// BITSIZE, DISPSIZE and LENGTH are the parameters to an associated
+// AddressAsmOperand. OPERANDS is a list of individual operands
+// (base register, displacement, etc.).
+class AddressOperand<string bitsize, string dispsize, string length,
+ string format, dag operands>
+ : Operand<!cast<ValueType>("i"##bitsize)> {
+ let PrintMethod = "print"##format##"Operand";
+ let EncoderMethod = "get"##format##dispsize##length##"Encoding";
+ let DecoderMethod =
+ "decode"##format##bitsize##"Disp"##dispsize##length##"Operand";
+ let MIOperandInfo = operands;
+ let ParserMatchClass =
+ !cast<AddressAsmOperand>(format##bitsize##"Disp"##dispsize##length);
+}
+
+// Constructs both a DAG pattern and instruction operand for an addressing mode.
+// FORMAT, BITSIZE, DISPSIZE and LENGTH are the parameters to an associated
+// AddressAsmOperand. OPERANDS is a list of NUMOPS individual operands
+// (base register, displacement, etc.). SELTYPE is the type of the memory
+// operand for selection purposes; sometimes we want different selection
+// choices for the same underlying addressing mode. SUFFIX is similarly
+// a suffix appended to the displacement for selection purposes;
+// e.g. we want to reject small 20-bit displacements if a 12-bit form
+// also exists, but we want to accept them otherwise.
+class AddressingMode<string seltype, string bitsize, string dispsize,
+ string suffix, string length, int numops, string format,
+ dag operands>
+ : ComplexPattern<!cast<ValueType>("i"##bitsize), numops,
+ "select"##seltype##dispsize##suffix##length,
+ [add, sub, or, frameindex, z_adjdynalloc]>,
+ AddressOperand<bitsize, dispsize, length, format, operands>;
+
+// An addressing mode with a base and displacement but no index.
+class BDMode<string type, string bitsize, string dispsize, string suffix>
+ : AddressingMode<type, bitsize, dispsize, suffix, "", 2, "BDAddr",
+ (ops !cast<RegisterOperand>("ADDR"##bitsize),
+ !cast<Immediate>("disp"##dispsize##"imm"##bitsize))>;
+
+// An addressing mode with a base, displacement and index.
+class BDXMode<string type, string bitsize, string dispsize, string suffix>
+ : AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDXAddr",
+ (ops !cast<RegisterOperand>("ADDR"##bitsize),
+ !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<RegisterOperand>("ADDR"##bitsize))>;
+
+// A BDMode paired with an immediate length operand of LENSIZE bits.
+class BDLMode<string type, string bitsize, string dispsize, string suffix,
+ string lensize>
+ : AddressingMode<type, bitsize, dispsize, suffix, "Len"##lensize, 3,
+ "BDLAddr",
+ (ops !cast<RegisterOperand>("ADDR"##bitsize),
+ !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<Immediate>("imm"##bitsize))>;
+
+// A BDMode paired with a register length operand.
+class BDRMode<string type, string bitsize, string dispsize, string suffix>
+ : AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDRAddr",
+ (ops !cast<RegisterOperand>("ADDR"##bitsize),
+ !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<RegisterOperand>("GR"##bitsize))>;
+
+// An addressing mode with a base, displacement and a vector index.
+class BDVMode<string bitsize, string dispsize>
+ : AddressOperand<bitsize, dispsize, "", "BDVAddr",
+ (ops !cast<RegisterOperand>("ADDR"##bitsize),
+ !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<RegisterOperand>("VR128"))>;
+
+//===----------------------------------------------------------------------===//
+// Extracting immediate operands from nodes
+// These all create MVT::i64 nodes to ensure the value is not sign-extended
+// when converted from an SDNode to a MachineOperand later on.
+//===----------------------------------------------------------------------===//
+
+// Bits 0-15 (counting from the lsb).
+def LL16 : SDNodeXForm<imm, [{
+ uint64_t Value = N->getZExtValue() & 0x000000000000FFFFULL;
+ return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Bits 16-31 (counting from the lsb).
+def LH16 : SDNodeXForm<imm, [{
+ uint64_t Value = (N->getZExtValue() & 0x00000000FFFF0000ULL) >> 16;
+ return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Bits 32-47 (counting from the lsb).
+def HL16 : SDNodeXForm<imm, [{
+ uint64_t Value = (N->getZExtValue() & 0x0000FFFF00000000ULL) >> 32;
+ return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Bits 48-63 (counting from the lsb).
+def HH16 : SDNodeXForm<imm, [{
+ uint64_t Value = (N->getZExtValue() & 0xFFFF000000000000ULL) >> 48;
+ return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Low 32 bits.
+def LF32 : SDNodeXForm<imm, [{
+ uint64_t Value = N->getZExtValue() & 0x00000000FFFFFFFFULL;
+ return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// High 32 bits.
+def HF32 : SDNodeXForm<imm, [{
+ uint64_t Value = N->getZExtValue() >> 32;
+ return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 8-bit signed quantity.
+def SIMM8 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int8_t(N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
+// Truncate an immediate to a 8-bit unsigned quantity.
+def UIMM8 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
+// Truncate an immediate to a 8-bit unsigned quantity and mask off low bit.
+def UIMM8EVEN : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 0xfe, SDLoc(N),
+ MVT::i64);
+}]>;
+
+// Truncate an immediate to a 12-bit unsigned quantity.
+def UIMM12 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 0xfff, SDLoc(N),
+ MVT::i64);
+}]>;
+
+// Truncate an immediate to a 16-bit signed quantity.
+def SIMM16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int16_t(N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
+// Truncate an immediate to a 16-bit unsigned quantity.
+def UIMM16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint16_t(N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
+// Truncate an immediate to a 32-bit signed quantity.
+def SIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int32_t(N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
+// Truncate an immediate to a 32-bit unsigned quantity.
+def UIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint32_t(N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
+// Truncate an immediate to a 48-bit unsigned quantity.
+def UIMM48 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint64_t(N->getZExtValue()) & 0xffffffffffff,
+ SDLoc(N), MVT::i64);
+}]>;
+
+// Negate and then truncate an immediate to a 32-bit unsigned quantity.
+def NEGIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint32_t(-N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Immediate asm operands.
+//===----------------------------------------------------------------------===//
+
+def U1Imm : ImmediateAsmOperand<"U1Imm">;
+def U2Imm : ImmediateAsmOperand<"U2Imm">;
+def U3Imm : ImmediateAsmOperand<"U3Imm">;
+def U4Imm : ImmediateAsmOperand<"U4Imm">;
+def U6Imm : ImmediateAsmOperand<"U6Imm">;
+def S8Imm : ImmediateAsmOperand<"S8Imm">;
+def U8Imm : ImmediateAsmOperand<"U8Imm">;
+def U12Imm : ImmediateAsmOperand<"U12Imm">;
+def S16Imm : ImmediateAsmOperand<"S16Imm">;
+def U16Imm : ImmediateAsmOperand<"U16Imm">;
+def S32Imm : ImmediateAsmOperand<"S32Imm">;
+def U32Imm : ImmediateAsmOperand<"U32Imm">;
+def U48Imm : ImmediateAsmOperand<"U48Imm">;
+
+//===----------------------------------------------------------------------===//
+// i32 immediates
+//===----------------------------------------------------------------------===//
+
+// Immediates for the lower and upper 16 bits of an i32, with the other
+// bits of the i32 being zero.
+def imm32ll16 : Immediate<i32, [{
+ return SystemZ::isImmLL(N->getZExtValue());
+}], LL16, "U16Imm">;
+
+def imm32lh16 : Immediate<i32, [{
+ return SystemZ::isImmLH(N->getZExtValue());
+}], LH16, "U16Imm">;
+
+// Immediates for the lower and upper 16 bits of an i32, with the other
+// bits of the i32 being one.
+def imm32ll16c : Immediate<i32, [{
+ return SystemZ::isImmLL(uint32_t(~N->getZExtValue()));
+}], LL16, "U16Imm">;
+
+def imm32lh16c : Immediate<i32, [{
+ return SystemZ::isImmLH(uint32_t(~N->getZExtValue()));
+}], LH16, "U16Imm">;
+
+// Short immediates
+def imm32zx1 : Immediate<i32, [{
+ return isUInt<1>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U1Imm">;
+
+def imm32zx2 : Immediate<i32, [{
+ return isUInt<2>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U2Imm">;
+
+def imm32zx3 : Immediate<i32, [{
+ return isUInt<3>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U3Imm">;
+
+def imm32zx4 : Immediate<i32, [{
+ return isUInt<4>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U4Imm">;
+
+// Note: this enforces an even value during code generation only.
+// When used from the assembler, any 4-bit value is allowed.
+def imm32zx4even : Immediate<i32, [{
+ return isUInt<4>(N->getZExtValue());
+}], UIMM8EVEN, "U4Imm">;
+
+def imm32zx6 : Immediate<i32, [{
+ return isUInt<6>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U6Imm">;
+
+def imm32sx8 : Immediate<i32, [{
+ return isInt<8>(N->getSExtValue());
+}], SIMM8, "S8Imm">;
+
+def imm32zx8 : Immediate<i32, [{
+ return isUInt<8>(N->getZExtValue());
+}], UIMM8, "U8Imm">;
+
+def imm32zx8trunc : Immediate<i32, [{}], UIMM8, "U8Imm">;
+
+def imm32zx12 : Immediate<i32, [{
+ return isUInt<12>(N->getZExtValue());
+}], UIMM12, "U12Imm">;
+
+def imm32sx16 : Immediate<i32, [{
+ return isInt<16>(N->getSExtValue());
+}], SIMM16, "S16Imm">;
+
+def imm32zx16 : Immediate<i32, [{
+ return isUInt<16>(N->getZExtValue());
+}], UIMM16, "U16Imm">;
+
+def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">;
+
+// Full 32-bit immediates. We need both signed and unsigned versions
+// because the assembler is picky. E.g. AFI requires signed operands
+// while NILF requires unsigned ones.
+def simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">;
+def uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">;
+
+def imm32 : ImmLeaf<i32, [{}]>;
+
+//===----------------------------------------------------------------------===//
+// 64-bit immediates
+//===----------------------------------------------------------------------===//
+
+// Immediates for 16-bit chunks of an i64, with the other bits of the
+// i64 being zero.
+def imm64ll16 : Immediate<i64, [{
+ return SystemZ::isImmLL(N->getZExtValue());
+}], LL16, "U16Imm">;
+
+def imm64lh16 : Immediate<i64, [{
+ return SystemZ::isImmLH(N->getZExtValue());
+}], LH16, "U16Imm">;
+
+def imm64hl16 : Immediate<i64, [{
+ return SystemZ::isImmHL(N->getZExtValue());
+}], HL16, "U16Imm">;
+
+def imm64hh16 : Immediate<i64, [{
+ return SystemZ::isImmHH(N->getZExtValue());
+}], HH16, "U16Imm">;
+
+// Immediates for 16-bit chunks of an i64, with the other bits of the
+// i64 being one.
+def imm64ll16c : Immediate<i64, [{
+ return SystemZ::isImmLL(uint64_t(~N->getZExtValue()));
+}], LL16, "U16Imm">;
+
+def imm64lh16c : Immediate<i64, [{
+ return SystemZ::isImmLH(uint64_t(~N->getZExtValue()));
+}], LH16, "U16Imm">;
+
+def imm64hl16c : Immediate<i64, [{
+ return SystemZ::isImmHL(uint64_t(~N->getZExtValue()));
+}], HL16, "U16Imm">;
+
+def imm64hh16c : Immediate<i64, [{
+ return SystemZ::isImmHH(uint64_t(~N->getZExtValue()));
+}], HH16, "U16Imm">;
+
+// Immediates for the lower and upper 32 bits of an i64, with the other
+// bits of the i64 being zero.
+def imm64lf32 : Immediate<i64, [{
+ return SystemZ::isImmLF(N->getZExtValue());
+}], LF32, "U32Imm">;
+
+def imm64hf32 : Immediate<i64, [{
+ return SystemZ::isImmHF(N->getZExtValue());
+}], HF32, "U32Imm">;
+
+// Immediates for the lower and upper 32 bits of an i64, with the other
+// bits of the i64 being one.
+def imm64lf32c : Immediate<i64, [{
+ return SystemZ::isImmLF(uint64_t(~N->getZExtValue()));
+}], LF32, "U32Imm">;
+
+def imm64hf32c : Immediate<i64, [{
+ return SystemZ::isImmHF(uint64_t(~N->getZExtValue()));
+}], HF32, "U32Imm">;
+
+// Short immediates.
+def imm64sx8 : Immediate<i64, [{
+ return isInt<8>(N->getSExtValue());
+}], SIMM8, "S8Imm">;
+
+def imm64zx8 : Immediate<i64, [{
+ return isUInt<8>(N->getZExtValue()); // zero-extended, like the other zx leaves
+}], UIMM8, "U8Imm">;
+
+def imm64sx16 : Immediate<i64, [{
+ return isInt<16>(N->getSExtValue());
+}], SIMM16, "S16Imm">;
+
+def imm64zx16 : Immediate<i64, [{
+ return isUInt<16>(N->getZExtValue());
+}], UIMM16, "U16Imm">;
+
+def imm64sx32 : Immediate<i64, [{
+ return isInt<32>(N->getSExtValue());
+}], SIMM32, "S32Imm">;
+
+def imm64zx32 : Immediate<i64, [{
+ return isUInt<32>(N->getZExtValue());
+}], UIMM32, "U32Imm">;
+
+def imm64zx32n : Immediate<i64, [{
+ return isUInt<32>(-N->getZExtValue()); // unsigned negate: no overflow UB
+}], NEGIMM32, "U32Imm">;
+
+def imm64zx48 : Immediate<i64, [{
+ return isUInt<48>(N->getZExtValue()); // must fit in 48 unsigned bits
+}], UIMM48, "U48Imm">;
+
+def imm64 : ImmLeaf<i64, [{}]>, Operand<i64>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point immediates
+//===----------------------------------------------------------------------===//
+
+// Floating-point zero.
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+
+// Floating point negative zero.
+def fpimmneg0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(-0.0); }]>;
+
+//===----------------------------------------------------------------------===//
+// Symbolic address operands
+//===----------------------------------------------------------------------===//
+
+// PC-relative asm operands.
+def PCRel12 : PCRelAsmOperand<"12">;
+def PCRel16 : PCRelAsmOperand<"16">;
+def PCRel24 : PCRelAsmOperand<"24">;
+def PCRel32 : PCRelAsmOperand<"32">;
+def PCRelTLS16 : PCRelTLSAsmOperand<"16">;
+def PCRelTLS32 : PCRelTLSAsmOperand<"32">;
+
+// PC-relative offsets of a basic block. The offset is sign-extended
+// and multiplied by 2.
+def brtarget16 : PCRelOperand<OtherVT, PCRel16> {
+ let EncoderMethod = "getPC16DBLEncoding";
+ let DecoderMethod = "decodePC16DBLBranchOperand";
+}
+def brtarget32 : PCRelOperand<OtherVT, PCRel32> {
+ let EncoderMethod = "getPC32DBLEncoding";
+ let DecoderMethod = "decodePC32DBLBranchOperand";
+}
+
+// Variants of brtarget for use with branch prediction preload.
+def brtarget12bpp : PCRelOperand<OtherVT, PCRel12> {
+ let EncoderMethod = "getPC12DBLBPPEncoding";
+ let DecoderMethod = "decodePC12DBLBranchOperand";
+}
+def brtarget16bpp : PCRelOperand<OtherVT, PCRel16> {
+ let EncoderMethod = "getPC16DBLBPPEncoding";
+ let DecoderMethod = "decodePC16DBLBranchOperand";
+}
+def brtarget24bpp : PCRelOperand<OtherVT, PCRel24> {
+ let EncoderMethod = "getPC24DBLBPPEncoding";
+ let DecoderMethod = "decodePC24DBLBranchOperand";
+}
+
+// Variants of brtarget16/32 with an optional additional TLS symbol.
+// These are used to annotate calls to __tls_get_offset.
+def tlssym : Operand<i64> { }
+def brtarget16tls : PCRelTLSOperand<OtherVT, PCRelTLS16> {
+ let MIOperandInfo = (ops brtarget16:$func, tlssym:$sym);
+ let EncoderMethod = "getPC16DBLTLSEncoding";
+ let DecoderMethod = "decodePC16DBLBranchOperand";
+}
+def brtarget32tls : PCRelTLSOperand<OtherVT, PCRelTLS32> {
+ let MIOperandInfo = (ops brtarget32:$func, tlssym:$sym);
+ let EncoderMethod = "getPC32DBLTLSEncoding";
+ let DecoderMethod = "decodePC32DBLBranchOperand";
+}
+
+// A PC-relative offset of a global value. The offset is sign-extended
+// and multiplied by 2.
+def pcrel32 : PCRelAddress<i64, "pcrel32", PCRel32> {
+ let EncoderMethod = "getPC32DBLEncoding";
+ let DecoderMethod = "decodePC32DBLOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// Addressing modes
+//===----------------------------------------------------------------------===//
+
+// 12-bit displacement operands.
+def disp12imm32 : Operand<i32>;
+def disp12imm64 : Operand<i64>;
+
+// 20-bit displacement operands.
+def disp20imm32 : Operand<i32>;
+def disp20imm64 : Operand<i64>;
+
+def BDAddr32Disp12 : AddressAsmOperand<"BDAddr", "32", "12">;
+def BDAddr32Disp20 : AddressAsmOperand<"BDAddr", "32", "20">;
+def BDAddr64Disp12 : AddressAsmOperand<"BDAddr", "64", "12">;
+def BDAddr64Disp20 : AddressAsmOperand<"BDAddr", "64", "20">;
+def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">;
+def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">;
+def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">;
+def BDRAddr64Disp12 : AddressAsmOperand<"BDRAddr", "64", "12">;
+def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">;
+
+// DAG patterns and operands for addressing modes. Each mode has
+// the form <type><range><group>[<len>] where:
+//
+// <type> is one of:
+// shift : base + displacement (32-bit)
+// bdaddr : base + displacement
+// mviaddr : like bdaddr, but reject cases with a natural index
+// bdxaddr : base + displacement + index
+// laaddr : like bdxaddr, but used for Load Address operations
+// dynalloc : base + displacement + index + ADJDYNALLOC
+// bdladdr : base + displacement with a length field
+// bdvaddr : base + displacement with a vector index
+//
+// <range> is one of:
+// 12 : the displacement is an unsigned 12-bit value
+// 20 : the displacement is a signed 20-bit value
+//
+// <group> is one of:
+// pair : used when there is an equivalent instruction with the opposite
+// range value (12 or 20)
+// only : used when there is no equivalent instruction with the opposite
+// range value
+//
+// <len> is one of:
+//
+// <empty> : there is no length field
+// len8 : the length field is 8 bits, with a range of [1, 0x100].
+def shift12only : BDMode <"BDAddr", "32", "12", "Only">;
+def shift20only : BDMode <"BDAddr", "32", "20", "Only">;
+def bdaddr12only : BDMode <"BDAddr", "64", "12", "Only">;
+def bdaddr12pair : BDMode <"BDAddr", "64", "12", "Pair">;
+def bdaddr20only : BDMode <"BDAddr", "64", "20", "Only">;
+def bdaddr20pair : BDMode <"BDAddr", "64", "20", "Pair">;
+def mviaddr12pair : BDMode <"MVIAddr", "64", "12", "Pair">;
+def mviaddr20pair : BDMode <"MVIAddr", "64", "20", "Pair">;
+def bdxaddr12only : BDXMode<"BDXAddr", "64", "12", "Only">;
+def bdxaddr12pair : BDXMode<"BDXAddr", "64", "12", "Pair">;
+def bdxaddr20only : BDXMode<"BDXAddr", "64", "20", "Only">;
+def bdxaddr20only128 : BDXMode<"BDXAddr", "64", "20", "Only128">;
+def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">;
+def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">;
+def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">;
+def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">;
+def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">;
+def bdraddr12only : BDRMode<"BDRAddr", "64", "12", "Only">;
+def bdvaddr12only : BDVMode< "64", "12">;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous
+//===----------------------------------------------------------------------===//
+
+// A 4-bit condition-code mask.
+def cond4 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 16); }]>,
+ Operand<i32> {
+ let PrintMethod = "printCond4Operand";
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
new file mode 100644
index 000000000000..fde26ed4e1c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -0,0 +1,684 @@
+//===-- SystemZOperators.td - SystemZ-specific operators ------*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Type profiles
+//===----------------------------------------------------------------------===//
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>]>; // index 0 must be i64
+def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>, // indices 0 and 1 must both be i64
+ SDTCisVT<1, i64>]>;
+def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; // SDTypeProfile<#results, #operands, constraints>; -1 = variadic
+def SDT_ZCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; // no results, two operands of the same type
+def SDT_ZICmp : SDTypeProfile<0, 3, // as SDT_ZCmp, plus a third i32 operand
+ [SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
+def SDT_ZBRCCMask : SDTypeProfile<0, 3, // no results; operands are i32, i32, OtherVT
+ [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<2, OtherVT>]>;
+def SDT_ZSelectCCMask : SDTypeProfile<1, 4, // result and first two operands share a type; last two operands are i32
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>]>;
+def SDT_ZWrapPtr : SDTypeProfile<1, 1, // pointer-typed result and operand
+ [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<0>]>;
+def SDT_ZWrapOffset : SDTypeProfile<1, 2, // pointer-typed result and two operands of that same type
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisPtrTy<0>]>;
+def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>; // one i64 result, no operands
+def SDT_ZGR128Binary32 : SDTypeProfile<1, 2, // untyped result and first operand; i32 second operand
+ [SDTCisVT<0, untyped>,
+ SDTCisVT<1, untyped>,
+ SDTCisVT<2, i32>]>;
+def SDT_ZGR128Binary64 : SDTypeProfile<1, 2, // same shape as SDT_ZGR128Binary32, with an i64 second operand
+ [SDTCisVT<0, untyped>,
+ SDTCisVT<1, untyped>,
+ SDTCisVT<2, i64>]>;
+def SDT_ZAtomicLoadBinaryW : SDTypeProfile<1, 5, // i32 result; a pointer operand followed by four i32 operands
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>,
+ SDTCisVT<5, i32>]>;
+def SDT_ZAtomicCmpSwapW : SDTypeProfile<1, 6, // as SDT_ZAtomicLoadBinaryW with one more i32 operand
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>,
+ SDTCisVT<5, i32>,
+ SDTCisVT<6, i32>]>;
+def SDT_ZMemMemLength : SDTypeProfile<0, 3, // no results; two pointers and an i64
+ [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLoop : SDTypeProfile<0, 4, // as SDT_ZMemMemLength, plus a second i64 operand
+ [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i64>,
+ SDTCisVT<3, i64>]>;
+def SDT_ZString : SDTypeProfile<1, 3, // pointer result; two pointer operands and an i32 operand
+ [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisPtrTy<2>,
+ SDTCisVT<3, i32>]>;
+def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>; // one i32 result, no operands
+def SDT_ZPrefetch : SDTypeProfile<0, 2, // no results; an i32 followed by a pointer
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>]>;
+def SDT_ZLoadBSwap : SDTypeProfile<1, 2, // integer result; pointer operand and an OtherVT operand
+ [SDTCisInt<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDT_ZStoreBSwap : SDTypeProfile<0, 3, // no results; integer value, pointer, and an OtherVT operand
+ [SDTCisInt<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDT_ZTBegin : SDTypeProfile<0, 2, // no results; a pointer followed by an i32
+ [SDTCisPtrTy<0>,
+ SDTCisVT<1, i32>]>;
+def SDT_ZInsertVectorElt : SDTypeProfile<1, 3, // vector result and first operand, i32 last operand; index 2 is unconstrained
+ [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<3, i32>]>;
+def SDT_ZExtractVectorElt : SDTypeProfile<1, 2, // vector operand and i32 operand; the result (index 0) is unconstrained
+ [SDTCisVec<1>,
+ SDTCisVT<2, i32>]>;
+def SDT_ZReplicate : SDTypeProfile<1, 1, // vector result; the single operand is unconstrained
+ [SDTCisVec<0>]>;
+def SDT_ZVecUnaryConv : SDTypeProfile<1, 1, // vector in, vector out; the two types are not tied together
+ [SDTCisVec<0>,
+ SDTCisVec<1>]>;
+def SDT_ZVecUnary : SDTypeProfile<1, 1, // vector in, vector out, same type
+ [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>]>;
+def SDT_ZVecBinary : SDTypeProfile<1, 2, // result and both operands share one vector type
+ [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>]>;
+def SDT_ZVecBinaryInt : SDTypeProfile<1, 2, // result and first operand share a vector type; second operand is i32
+ [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
+def SDT_ZVecBinaryConv : SDTypeProfile<1, 2, // operands share a vector type; the result vector type is not tied to it
+ [SDTCisVec<0>,
+ SDTCisVec<1>,
+ SDTCisSameAs<1, 2>]>;
+def SDT_ZVecBinaryConvInt : SDTypeProfile<1, 2, // vector result, vector first operand (types not tied), i32 second operand
+ [SDTCisVec<0>,
+ SDTCisVec<1>,
+ SDTCisVT<2, i32>]>;
+def SDT_ZRotateMask : SDTypeProfile<1, 2, // vector result from two i32 operands
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+def SDT_ZJoinDwords : SDTypeProfile<1, 2, // v2i64 result from two i64 operands
+ [SDTCisVT<0, v2i64>,
+ SDTCisVT<1, i64>,
+ SDTCisVT<2, i64>]>;
+def SDT_ZVecTernary : SDTypeProfile<1, 3, // result and all three operands share one vector type
+ [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>;
+def SDT_ZVecTernaryInt : SDTypeProfile<1, 3, // as SDT_ZVecTernary, but the third operand is i32
+ [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisVT<3, i32>]>;
+def SDT_ZVecQuaternaryInt : SDTypeProfile<1, 4, // three operands typed like the vector result, plus an i32 fourth operand
+ [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisVT<4, i32>]>;
+def SDT_ZTest : SDTypeProfile<0, 2, [SDTCisVT<1, i64>]>; // no results, two operands; only index 1 (i64) is constrained
+
+//===----------------------------------------------------------------------===//
+// Node definitions
+//===----------------------------------------------------------------------===//
+
+// These are target-independent (ISD::) nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, // unlike callseq_start, also lists SDNPOptInGlue
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue,
+ SDNPOutGlue]>;
+def global_offset_table : SDNode<"ISD::GLOBAL_OFFSET_TABLE", SDTPtrLeaf>; // uses the SDTPtrLeaf profile, which is not defined in this file
+
+// Nodes for SystemZISD::*. See SystemZISelLowering.h for more details.
+def z_retflag : SDNode<"SystemZISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def z_call : SDNode<"SystemZISD::CALL", SDT_ZCall,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+def z_sibcall : SDNode<"SystemZISD::SIBCALL", SDT_ZCall, // same profile and properties as z_call
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+def z_tls_gdcall : SDNode<"SystemZISD::TLS_GDCALL", SDT_ZCall, // differs from z_call in using SDNPInGlue instead of SDNPOptInGlue
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def z_tls_ldcall : SDNode<"SystemZISD::TLS_LDCALL", SDT_ZCall, // same profile and properties as z_tls_gdcall
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
+def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET",
+ SDT_ZWrapOffset, []>;
+def z_iabs : SDNode<"SystemZISD::IABS", SDTIntUnaryOp, []>;
+def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp, [SDNPOutGlue]>; // SDT_ZICmp declares no value result; the node only has output glue
+def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp, [SDNPOutGlue]>; // shares SDT_ZICmp with z_icmp
+def z_br_ccmask : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
+ [SDNPHasChain, SDNPInGlue]>;
+def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
+ [SDNPInGlue]>;
+def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
+def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
+def z_umul_lohi64 : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>;
+def z_sdivrem32 : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>; // the 32/64 suffix matches the SDT_ZGR128Binary32/64 profile used
+def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
+def z_udivrem32 : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>;
+def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>;
+
+def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone, // SDTNone profile; chained and marked SDNPMayStore
+ [SDNPHasChain, SDNPMayStore]>;
+def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone, // SDTNone profile; chained and marked SDNPSideEffect
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def z_loadbswap : SDNode<"SystemZISD::LRV", SDT_ZLoadBSwap, // note: the ISD opcode is named LRV
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def z_storebswap : SDNode<"SystemZISD::STRV", SDT_ZStoreBSwap, // note: the ISD opcode is named STRV
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest, [SDNPOutGlue]>; // SDT_ZTest declares no value result; the node only has output glue
+
+// Defined because the index is an i32 rather than a pointer (see the SDT_Z*VectorElt profiles).
+def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
+ SDT_ZInsertVectorElt>;
+def z_vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
+ SDT_ZExtractVectorElt>;
+def z_byte_mask : SDNode<"SystemZISD::BYTE_MASK", SDT_ZReplicate>; // shares SDT_ZReplicate with z_replicate
+def z_rotate_mask : SDNode<"SystemZISD::ROTATE_MASK", SDT_ZRotateMask>;
+def z_replicate : SDNode<"SystemZISD::REPLICATE", SDT_ZReplicate>;
+def z_join_dwords : SDNode<"SystemZISD::JOIN_DWORDS", SDT_ZJoinDwords>;
+def z_splat : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>;
+def z_merge_high : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>;
+def z_merge_low : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>;
+def z_shl_double : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>;
+def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS",
+ SDT_ZVecTernaryInt>;
+def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
+def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
+def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConv, // every *_cc node in this list carries SDNPOutGlue
+ [SDNPOutGlue]>;
+def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConv,
+ [SDNPOutGlue]>;
+def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>;
+def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>;
+def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>;
+def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>;
+def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR",
+ SDT_ZVecBinaryInt>;
+def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR",
+ SDT_ZVecBinaryInt>;
+def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR",
+ SDT_ZVecBinaryInt>;
+def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>;
+def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>;
+def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>;
+def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>;
+def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecBinary, // "S" forms of the three compares above: same profile plus SDNPOutGlue
+ [SDNPOutGlue]>;
+def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecBinary,
+ [SDNPOutGlue]>;
+def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecBinary,
+ [SDNPOutGlue]>;
+def z_vfcmpe : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>;
+def z_vfcmph : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>;
+def z_vfcmphe : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>;
+def z_vfcmpes : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConv, // "S" forms of the three compares above: same profile plus SDNPOutGlue
+ [SDNPOutGlue]>;
+def z_vfcmphs : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConv,
+ [SDNPOutGlue]>;
+def z_vfcmphes : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConv,
+ [SDNPOutGlue]>;
+def z_vextend : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>;
+def z_vround : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>;
+def z_vtm : SDNode<"SystemZISD::VTM", SDT_ZCmp, [SDNPOutGlue]>; // uses the scalar SDT_ZCmp profile (no value result)
+def z_vfae_cc : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryInt,
+ [SDNPOutGlue]>;
+def z_vfaez_cc : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryInt,
+ [SDNPOutGlue]>;
+def z_vfee_cc : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinary,
+ [SDNPOutGlue]>;
+def z_vfeez_cc : SDNode<"SystemZISD::VFEEZ_CC", SDT_ZVecBinary,
+ [SDNPOutGlue]>;
+def z_vfene_cc : SDNode<"SystemZISD::VFENE_CC", SDT_ZVecBinary,
+ [SDNPOutGlue]>;
+def z_vfenez_cc : SDNode<"SystemZISD::VFENEZ_CC", SDT_ZVecBinary,
+ [SDNPOutGlue]>;
+def z_vistr_cc : SDNode<"SystemZISD::VISTR_CC", SDT_ZVecUnary,
+ [SDNPOutGlue]>;
+def z_vstrc_cc : SDNode<"SystemZISD::VSTRC_CC", SDT_ZVecQuaternaryInt,
+ [SDNPOutGlue]>;
+def z_vstrcz_cc : SDNode<"SystemZISD::VSTRCZ_CC",
+ SDT_ZVecQuaternaryInt, [SDNPOutGlue]>;
+def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvInt,
+ [SDNPOutGlue]>;
+
+class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW> // helper: node "SystemZISD::<name>" with a fixed property list
+ : SDNode<"SystemZISD::"##name, profile,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+
+def z_atomic_swapw : AtomicWOp<"ATOMIC_SWAPW">;
+def z_atomic_loadw_add : AtomicWOp<"ATOMIC_LOADW_ADD">;
+def z_atomic_loadw_sub : AtomicWOp<"ATOMIC_LOADW_SUB">;
+def z_atomic_loadw_and : AtomicWOp<"ATOMIC_LOADW_AND">;
+def z_atomic_loadw_or : AtomicWOp<"ATOMIC_LOADW_OR">;
+def z_atomic_loadw_xor : AtomicWOp<"ATOMIC_LOADW_XOR">;
+def z_atomic_loadw_nand : AtomicWOp<"ATOMIC_LOADW_NAND">;
+def z_atomic_loadw_min : AtomicWOp<"ATOMIC_LOADW_MIN">;
+def z_atomic_loadw_max : AtomicWOp<"ATOMIC_LOADW_MAX">;
+def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">;
+def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
+def z_atomic_cmp_swapw : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>; // the only def that overrides the default profile
+
+def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength, // each X / X_LOOP pair below uses SDT_ZMemMemLength / SDT_ZMemMemLoop
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_mvc_loop : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc_loop : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc_loop : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc_loop : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength, // unlike MVC/NC/OC/XC: no SDNPMayStore, and has SDNPOutGlue
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZString,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString, // unlike z_strcmp: SDNPMayStore and no SDNPOutGlue
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_search_string : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZString,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_ipm : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic, // i32 result with no value operands; takes input glue
+ [SDNPInGlue]>;
+def z_prefetch : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch, // marked as both SDNPMayLoad and SDNPMayStore
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+ SDNPMemOperand]>;
+
+def z_tbegin : SDNode<"SystemZISD::TBEGIN", SDT_ZTBegin,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayStore,
+ SDNPSideEffect]>;
+def z_tbegin_nofloat : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin, // same profile and properties as z_tbegin
+ [SDNPHasChain, SDNPOutGlue, SDNPMayStore,
+ SDNPSideEffect]>;
+def z_tend : SDNode<"SystemZISD::TEND", SDTNone, // SDTNone profile; no SDNPMayStore, unlike the TBEGIN nodes
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+
+def z_vshl : SDNode<"ISD::SHL", SDT_ZVecBinary>; // generic ISD shifts re-declared with the SDT_ZVecBinary profile
+def z_vsra : SDNode<"ISD::SRA", SDT_ZVecBinary>;
+def z_vsrl : SDNode<"ISD::SRL", SDT_ZVecBinary>;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments
+//===----------------------------------------------------------------------===//
+
+def z_lrvh : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i16)>; // z_loadbswap with its type operand fixed to i16/i32/i64
+def z_lrv : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i32)>;
+def z_lrvg : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i64)>;
+
+def z_strvh : PatFrag<(ops node:$src, node:$addr), // z_storebswap with its type operand fixed to i16/i32/i64
+ (z_storebswap node:$src, node:$addr, i16)>;
+def z_strv : PatFrag<(ops node:$src, node:$addr),
+ (z_storebswap node:$src, node:$addr, i32)>;
+def z_strvg : PatFrag<(ops node:$src, node:$addr),
+ (z_storebswap node:$src, node:$addr, i64)>;
+
+// Signed and unsigned comparisons: z_icmp filtered on its constant third operand.
+def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+ unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ return Type != SystemZICMP::UnsignedOnly; // accept every type except unsigned-only
+}]>;
+def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+ unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ return Type != SystemZICMP::SignedOnly; // accept every type except signed-only
+}]>;
+
+// Register- and memory-based TEST UNDER MASK; they differ only in z_tm's third operand.
+def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, imm)>; // third operand: any immediate
+def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>; // third operand: must be 0
+
+// Register sign-extend operations. Sub-32-bit values are represented as i32s.
+def sext8 : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>;
+def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>;
+def sext32 : PatFrag<(ops node:$src), (sext (i32 node:$src))>; // a sext of an i32 value, not a sext_inreg
+
+// Match extensions of an i32 to an i64, followed by an in-register sign
+// extension from a sub-i32 value.
+def sext8dbl : PatFrag<(ops node:$src), (sext8 (anyext node:$src))>; // built on sext8/sext16 above
+def sext16dbl : PatFrag<(ops node:$src), (sext16 (anyext node:$src))>;
+
+// Register zero-extend operations. Sub-32-bit values are represented as i32s.
+def zext8 : PatFrag<(ops node:$src), (and node:$src, 0xff)>; // expressed as an AND with the low-bits mask
+def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>;
+def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>; // a zext of an i32 value, not an AND mask
+
+// Extending loads in which the extension type can be signed.
+def asextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+ unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+ return Type == ISD::EXTLOAD || Type == ISD::SEXTLOAD; // EXTLOAD or SEXTLOAD only
+}]>;
+def asextloadi8 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; // narrows asextload by memory type
+}]>;
+def asextloadi16 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def asextloadi32 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type can be unsigned.
+def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+ unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+ return Type == ISD::EXTLOAD || Type == ISD::ZEXTLOAD; // EXTLOAD or ZEXTLOAD only
+}]>;
+def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; // narrows azextload by memory type
+}]>;
+def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type doesn't matter.
+def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getExtensionType() != ISD::NON_EXTLOAD; // any extension kind; only non-extending loads are rejected
+}]>;
+def anyextloadi8 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; // narrows anyextload by memory type
+}]>;
+def anyextloadi16 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Aligned loads: alignment is at least the store size of the memory type.
+class AlignedLoad<SDPatternOperator load>
+ : PatFrag<(ops node:$addr), (load node:$addr), [{
+ auto *Load = cast<LoadSDNode>(N);
+ return Load->getAlignment() >= Load->getMemoryVT().getStoreSize(); // compares against the memory VT, not the result VT
+}]>;
+def aligned_load : AlignedLoad<load>;
+def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
+def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
+def aligned_azextloadi16 : AlignedLoad<azextloadi16>;
+def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
+
+// Aligned stores: alignment is at least the store size of the memory type.
+class AlignedStore<SDPatternOperator store>
+ : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
+ auto *Store = cast<StoreSDNode>(N);
+ return Store->getAlignment() >= Store->getMemoryVT().getStoreSize(); // compares against the memory VT
+}]>;
+def aligned_store : AlignedStore<store>;
+def aligned_truncstorei16 : AlignedStore<truncstorei16>;
+def aligned_truncstorei32 : AlignedStore<truncstorei32>;
+
+// Non-volatile loads. Used for instructions that might access the storage
+// location multiple times.
+class NonvolatileLoad<SDPatternOperator load>
+ : PatFrag<(ops node:$addr), (load node:$addr), [{
+ auto *Load = cast<LoadSDNode>(N);
+ return !Load->isVolatile(); // reject volatile loads
+}]>;
+def nonvolatile_load : NonvolatileLoad<load>;
+def nonvolatile_anyextloadi8 : NonvolatileLoad<anyextloadi8>;
+def nonvolatile_anyextloadi16 : NonvolatileLoad<anyextloadi16>;
+def nonvolatile_anyextloadi32 : NonvolatileLoad<anyextloadi32>;
+
+// Non-volatile stores: the store-side counterpart of NonvolatileLoad.
+class NonvolatileStore<SDPatternOperator store>
+ : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
+ auto *Store = cast<StoreSDNode>(N);
+ return !Store->isVolatile(); // reject volatile stores
+}]>;
+def nonvolatile_store : NonvolatileStore<store>;
+def nonvolatile_truncstorei8 : NonvolatileStore<truncstorei8>;
+def nonvolatile_truncstorei16 : NonvolatileStore<truncstorei16>;
+def nonvolatile_truncstorei32 : NonvolatileStore<truncstorei32>;
+
+// A store of a load that can be implemented using MVC.
+def mvc_store : PatFrag<(ops node:$value, node:$addr),
+ (unindexedstore node:$value, node:$addr),
+ [{ return storeLoadCanUseMVC(N); }]>; // the predicate helper is not defined in this file
+
+// Binary read-modify-write operations on memory in which the other
+// operand is also memory and for which block operations like NC can
+// be used. There are two patterns for each operator, depending on
+// which operand contains the "other" load.
+multiclass block_op<SDPatternOperator operator> {
+ def "1" : PatFrag<(ops node:$value, node:$addr),
+ (unindexedstore (operator node:$value,
+ (unindexedload node:$addr)),
+ node:$addr),
+ [{ return storeLoadCanUseBlockBinary(N, 0); }]>; // $value is operand 0 of the operator here
+ def "2" : PatFrag<(ops node:$value, node:$addr),
+ (unindexedstore (operator (unindexedload node:$addr),
+ node:$value),
+ node:$addr),
+ [{ return storeLoadCanUseBlockBinary(N, 1); }]>; // $value is operand 1 of the operator here
+}
+defm block_and : block_op<and>; // each defm yields a "1" and a "2" fragment
+defm block_or : block_op<or>;
+defm block_xor : block_op<xor>;
+
+// Insertions: clear one field of $src1 with an AND mask, then OR in $src2.
+def inserti8 : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, -256), node:$src2)>; // -256 clears the low 8 bits
+def insertll : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0xffffffffffff0000), node:$src2)>; // clears bits 0-15
+def insertlh : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0xffffffff0000ffff), node:$src2)>; // clears bits 16-31
+def inserthl : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0xffff0000ffffffff), node:$src2)>; // clears bits 32-47
+def inserthh : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0x0000ffffffffffff), node:$src2)>; // clears bits 48-63
+def insertlf : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0xffffffff00000000), node:$src2)>; // clears the low 32 bits
+def inserthf : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0x00000000ffffffff), node:$src2)>; // clears the high 32 bits
+
+// ORs that can be treated as insertions.
+def or_as_inserti8 : PatFrag<(ops node:$src1, node:$src2),
+ (or node:$src1, node:$src2), [{
+ unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+ return CurDAG->MaskedValueIsZero(N->getOperand(0), // low 8 bits of operand 0 must be known zero
+ APInt::getLowBitsSet(BitWidth, 8));
+}]>;
+
+// ORs that can be treated as reversed insertions.
+def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
+ (or node:$src1, node:$src2), [{
+ unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+ return CurDAG->MaskedValueIsZero(N->getOperand(1), // same test, applied to operand 1 instead
+ APInt::getLowBitsSet(BitWidth, 8));
+}]>;
+
+// Negative integer absolute.
+def z_inegabs : PatFrag<(ops node:$src), (ineg (z_iabs node:$src))>;
+
+// Integer absolute, matching the canonical form generated by DAGCombiner.
+def z_iabs32 : PatFrag<(ops node:$src), // (x + (x sra 31)) xor (x sra 31)
+ (xor (add node:$src, (sra node:$src, (i32 31))),
+ (sra node:$src, (i32 31)))>;
+def z_iabs64 : PatFrag<(ops node:$src), // same shape with a shift of 63
+ (xor (add node:$src, (sra node:$src, (i32 63))),
+ (sra node:$src, (i32 63)))>;
+def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>; // negations of the two fragments above
+def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
+
+// Integer multiply-and-add: ($src1 * $src2) + $src3.
+def z_muladd : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (add (mul node:$src1, node:$src2), node:$src3)>;
+
+// Fused multiply-subtract, using the natural operand order.
+def fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fma node:$src1, node:$src2, (fneg node:$src3))>; // fma with a negated addend
+
+// Fused multiply-add and multiply-subtract, but with the order of the
+// operands matching SystemZ's MA and MS instructions.
+def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fma node:$src2, node:$src3, node:$src1)>; // $src1 is the addend
+def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fma node:$src2, node:$src3, (fneg node:$src1))>; // $src1 is the negated addend
+
+// Floating-point negative absolute.
+def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>; // despite its name, $ptr is the value passed to fabs
+
+// Create a unary operator that loads from memory and then performs
+// the given operation on it.
+class loadu<SDPatternOperator operator, SDPatternOperator load = load> // load operator defaults to plain `load`
+ : PatFrag<(ops node:$addr), (operator (load node:$addr))>;
+
+// Create a store operator that performs the given unary operation
+// on the value before storing it.
+class storeu<SDPatternOperator operator, SDPatternOperator store = store> // store operator defaults to plain `store`
+ : PatFrag<(ops node:$value, node:$addr),
+ (store (operator node:$value), node:$addr)>;
+
+// Create a store operator that performs the given inherent operation
+// and stores the resulting value.
+class storei<SDPatternOperator operator, SDPatternOperator store = store> // `operator` is applied with no operands
+ : PatFrag<(ops node:$addr),
+ (store (operator), node:$addr)>;
+
+// Vector representation of all-zeros and all-ones, as bitcasts of a v16i8 z_byte_mask.
+def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>; // mask operand 0
+def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>; // mask operand 65535 (0xffff)
+
+// Load a scalar and replicate it in all elements of a vector.
+class z_replicate_load<ValueType scalartype, SDPatternOperator load>
+ : PatFrag<(ops node:$addr),
+ (z_replicate (scalartype (load node:$addr)))>;
+def z_replicate_loadi8 : z_replicate_load<i32, anyextloadi8>; // i8 and i16 use an extending load typed as i32
+def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>;
+def z_replicate_loadi32 : z_replicate_load<i32, load>;
+def z_replicate_loadi64 : z_replicate_load<i64, load>;
+def z_replicate_loadf32 : z_replicate_load<f32, load>;
+def z_replicate_loadf64 : z_replicate_load<f64, load>;
+
+// Load a scalar and insert it into a single element of a vector.
+class z_vle<ValueType scalartype, SDPatternOperator load>
+ : PatFrag<(ops node:$vec, node:$addr, node:$index),
+ (z_vector_insert node:$vec, (scalartype (load node:$addr)),
+ node:$index)>;
+def z_vlei8 : z_vle<i32, anyextloadi8>; // i8 and i16 use an extending load typed as i32
+def z_vlei16 : z_vle<i32, anyextloadi16>;
+def z_vlei32 : z_vle<i32, load>;
+def z_vlei64 : z_vle<i64, load>;
+def z_vlef32 : z_vle<f32, load>;
+def z_vlef64 : z_vle<f64, load>;
+
+// Load a scalar and insert it into the low element of the high i64 of a
+// zeroed vector.
+class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
+ : PatFrag<(ops node:$addr),
+ (z_vector_insert (z_vzero),
+ (scalartype (load node:$addr)), (i32 index))>;
+def z_vllezi8 : z_vllez<i32, anyextloadi8, 7>; // 7, 3, 1: last i8/i16/i32 element within the first 8 bytes
+def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
+def z_vllezi32 : z_vllez<i32, load, 1>;
+def z_vllezi64 : PatFrag<(ops node:$addr), // i64 case: loaded value joined with a constant 0 instead
+ (z_join_dwords (i64 (load node:$addr)), (i64 0))>;
+// We use high merges to form a v4f32 from four f32s. Propagating zero
+// into all elements but index 1 gives this expression.
+def z_vllezf32 : PatFrag<(ops node:$addr),
+ (bitconvert
+ (z_merge_high
+ (v2i64
+ (z_unpackl_high
+ (v4i32
+ (bitconvert
+ (v4f32 (scalar_to_vector
+ (f32 (load node:$addr)))))))),
+ (v2i64 (z_vzero))))>;
+def z_vllezf64 : PatFrag<(ops node:$addr), // f64 case: high merge of the loaded scalar with zero
+ (z_merge_high
+ (scalar_to_vector (f64 (load node:$addr))),
+ (z_vzero))>;
+
+// Store one element of a vector.
+class z_vste<ValueType scalartype, SDPatternOperator store>
+ : PatFrag<(ops node:$vec, node:$addr, node:$index),
+ (store (scalartype (z_vector_extract node:$vec, node:$index)),
+ node:$addr)>;
+def z_vstei8 : z_vste<i32, truncstorei8>; // i8 and i16 extract an i32 and use a truncating store
+def z_vstei16 : z_vste<i32, truncstorei16>;
+def z_vstei32 : z_vste<i32, store>;
+def z_vstei64 : z_vste<i64, store>;
+def z_vstef32 : z_vste<f32, store>;
+def z_vstef64 : z_vste<f64, store>;
+
+// Arithmetic negation on vectors.
+def z_vneg : PatFrag<(ops node:$x), (sub (z_vzero), node:$x)>; // 0 - x
+
+// Bitwise negation on vectors.
+def z_vnot : PatFrag<(ops node:$x), (xor node:$x, (z_vones))>; // x xor all-ones
+
+// Signed "integer greater than zero" on vectors.
+def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, (z_vzero))>;
+
+// Signed "integer less than zero" on vectors.
+def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph (z_vzero), node:$x)>; // z_vicmph with its operands swapped
+
+// Integer absolute on vectors: (x + s) xor s, where s = x shifted right by `shift`.
+class z_viabs<int shift>
+ : PatFrag<(ops node:$src),
+ (xor (add node:$src, (z_vsra_by_scalar node:$src, (i32 shift))),
+ (z_vsra_by_scalar node:$src, (i32 shift)))>;
+def z_viabs8 : z_viabs<7>; // shift is the element width in bits minus one
+def z_viabs16 : z_viabs<15>;
+def z_viabs32 : z_viabs<31>;
+def z_viabs64 : z_viabs<63>;
+
+// Sign-extend the i64 elements of a vector.
+class z_vse<int shift>
+ : PatFrag<(ops node:$src),
+ (z_vsra_by_scalar (z_vshl_by_scalar node:$src, shift), shift)>; // shift left, then z_vsra right, by the same amount
+def z_vsei8 : z_vse<56>; // shift is 64 minus the source width (8, 16, 32)
+def z_vsei16 : z_vse<48>;
+def z_vsei32 : z_vse<32>;
+
+// ...and again with the extensions being done on individual i64 scalars.
+class z_vse_by_parts<SDPatternOperator operator, int index1, int index2>
+ : PatFrag<(ops node:$src),
+ (z_join_dwords
+ (operator (z_vector_extract node:$src, index1)),
+ (operator (z_vector_extract node:$src, index2)))>;
+def z_vsei8_by_parts : z_vse_by_parts<sext8dbl, 7, 15>; // extracts elements index1 and index2, extends each, then joins them
+def z_vsei16_by_parts : z_vse_by_parts<sext16dbl, 3, 7>;
+def z_vsei32_by_parts : z_vse_by_parts<sext32, 1, 3>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td b/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td
new file mode 100644
index 000000000000..16a7ed784d70
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td
@@ -0,0 +1,169 @@
+//===-- SystemZPatterns.td - SystemZ-specific pattern rules ---*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Record that INSN performs a 64-bit version of unary operator OPERATOR
+// in which the operand is sign-extended from 32 to 64 bits.
+multiclass SXU<SDPatternOperator operator, Instruction insn> {
+ def : Pat<(operator (sext (i32 GR32:$src))),
+ (insn GR32:$src)>;
+ def : Pat<(operator (sext_inreg GR64:$src, i32)),
+ (insn (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+}
+
+// Record that INSN performs a 64-bit version of binary operator OPERATOR
+// in which the first operand has class CLS and in which the second operand
+// is sign-extended from a 32-bit register.
+multiclass SXB<SDPatternOperator operator, RegisterOperand cls,
+ Instruction insn> {
+ def : Pat<(operator cls:$src1, (sext GR32:$src2)),
+ (insn cls:$src1, GR32:$src2)>;
+ def : Pat<(operator cls:$src1, (sext_inreg GR64:$src2, i32)),
+ (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
+}
+
+// Like SXB, but for zero extension.
+multiclass ZXB<SDPatternOperator operator, RegisterOperand cls,
+ Instruction insn> {
+ def : Pat<(operator cls:$src1, (zext GR32:$src2)),
+ (insn cls:$src1, GR32:$src2)>;
+ def : Pat<(operator cls:$src1, (and GR64:$src2, 0xffffffff)),
+ (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
+}
+
+// Record that INSN performs a binary read-modify-write operation,
+// with LOAD, OPERATOR and STORE being the read, modify and write
+// respectively. MODE is the addressing mode and IMM is the type
+// of the second operand.
+class RMWI<SDPatternOperator load, SDPatternOperator operator,
+ SDPatternOperator store, AddressingMode mode,
+ PatFrag imm, Instruction insn>
+ : Pat<(store (operator (load mode:$addr), imm:$src), mode:$addr),
+ (insn mode:$addr, (UIMM8 imm:$src))>;
+
+// Record that INSN performs binary operation OPERATOR on a byte memory
+// location with addressing mode MODE, for imm32 and imm64 second operands.
+multiclass RMWIByte<SDPatternOperator operator, AddressingMode mode,
+ Instruction insn> {
+ def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
+ def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
+}
+
+// Record that INSN performs insertion TYPE into a register of class CLS.
+// The inserted operand is loaded using LOAD from an address of mode MODE.
+multiclass InsertMem<string type, Instruction insn, RegisterOperand cls,
+ SDPatternOperator load, AddressingMode mode> {
+ def : Pat<(!cast<SDPatternOperator>("or_as_"##type)
+ cls:$src1, (load mode:$src2)),
+ (insn cls:$src1, mode:$src2)>;
+ def : Pat<(!cast<SDPatternOperator>("or_as_rev"##type)
+ (load mode:$src2), cls:$src1),
+ (insn cls:$src1, mode:$src2)>;
+}
+
+// INSN stores the low 32 bits of a GPR to memory with addressing mode MODE.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64<Instruction insn, SDPatternOperator operator,
+ AddressingMode mode>
+ : Pat<(operator GR64:$R1, mode:$XBD2),
+ (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), mode:$XBD2)>;
+
+// INSN and INSNY are an RX/RXY pair of instructions that store the low
+// 32 bits of a GPR to memory. Record that they are equivalent to using
+// OPERATOR to store a GR64.
+multiclass StoreGR64Pair<Instruction insn, Instruction insny,
+ SDPatternOperator operator> {
+ def : StoreGR64<insn, operator, bdxaddr12pair>;
+ def : StoreGR64<insny, operator, bdxaddr20pair>;
+}
+
+// INSN stores the low 32 bits of a GPR using PC-relative addressing.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64PC<Instruction insn, SDPatternOperator operator>
+ : Pat<(operator GR64:$R1, pcrel32:$XBD2),
+ (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), pcrel32:$XBD2)> {
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
+// INSN and INSNINV conditionally store the low 32 bits of a GPR to memory,
+// with INSN storing when the condition is true and INSNINV storing when the
+// condition is false. Record that they are equivalent to a LOAD/select/STORE
+// sequence for GR64s.
+multiclass CondStores64<Instruction insn, Instruction insninv,
+ SDPatternOperator store, SDPatternOperator load,
+ AddressingMode mode> {
+ def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr),
+ imm32zx4:$valid, imm32zx4:$cc),
+ mode:$addr),
+ (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+ imm32zx4:$valid, imm32zx4:$cc)>;
+ def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new,
+ imm32zx4:$valid, imm32zx4:$cc),
+ mode:$addr),
+ (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+ imm32zx4:$valid, imm32zx4:$cc)>;
+}
+
+// Try to use MVC instruction INSN for a load of type LOAD followed by a store
+// of the same size. VT is the type of the intermediate (legalized) value and
+// LENGTH is the number of bytes loaded by LOAD.
+multiclass MVCLoadStore<SDPatternOperator load, ValueType vt, Instruction insn,
+ bits<5> length> {
+ def : Pat<(mvc_store (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+ (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// Use NC-like instruction INSN for block_op operation OPERATOR.
+// The other operand is a load of type LOAD, which accesses LENGTH bytes.
+// VT is the intermediate legalized type in which the binary operation
+// is actually done.
+multiclass BinaryLoadStore<SDPatternOperator operator, SDPatternOperator load,
+ ValueType vt, Instruction insn, bits<5> length> {
+ def : Pat<(operator (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+ (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// A convenient way of generating all block peepholes for a particular
+// LOAD/VT/LENGTH combination.
+multiclass BlockLoadStore<SDPatternOperator load, ValueType vt,
+ Instruction mvc, Instruction nc, Instruction oc,
+ Instruction xc, bits<5> length> {
+ defm : MVCLoadStore<load, vt, mvc, length>;
+ defm : BinaryLoadStore<block_and1, load, vt, nc, length>;
+ defm : BinaryLoadStore<block_and2, load, vt, nc, length>;
+ defm : BinaryLoadStore<block_or1, load, vt, oc, length>;
+ defm : BinaryLoadStore<block_or2, load, vt, oc, length>;
+ defm : BinaryLoadStore<block_xor1, load, vt, xc, length>;
+ defm : BinaryLoadStore<block_xor2, load, vt, xc, length>;
+}
+
+// Record that INSN is a LOAD AND TEST that can be used to compare
+// registers in CLS against zero. The instruction has separate R1 and R2
+// operands, but they must be the same when the instruction is used like this.
+multiclass CompareZeroFP<Instruction insn, RegisterOperand cls> {
+ def : Pat<(z_fcmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
+ // The sign of the zero makes no difference.
+ def : Pat<(z_fcmp cls:$reg, (fpimmneg0)), (insn cls:$reg, cls:$reg)>;
+}
+
+// Use INSN for performing binary operation OPERATOR of type VT
+// on registers of class CLS.
+class BinaryRRWithType<Instruction insn, RegisterOperand cls,
+ SDPatternOperator operator, ValueType vt>
+ : Pat<(vt (operator cls:$x, cls:$y)), (insn cls:$x, cls:$y)>;
+
+// Use INSN to perform conversion operation OPERATOR, with the input being
+// TR2 and the output being TR1. SUPPRESS is 4 to suppress inexact conditions
+// and 0 to allow them. MODE is the rounding mode to use.
+class FPConversion<Instruction insn, SDPatternOperator operator, TypedReg tr1,
+ TypedReg tr2, bits<3> suppress, bits<4> mode>
+ : Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))),
+ (insn tr2.op:$vec, suppress, mode)>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td b/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td
new file mode 100644
index 000000000000..1cdc0949ff4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td
@@ -0,0 +1,35 @@
+//===-- SystemZProcessors.td - SystemZ processors ------------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Processor definitions.
+//
+// For compatibility with other compilers on the platform, each model can
+// be identified either by the system name (e.g. z10) or the level of the
+// architecture the model supports, as identified by the edition level
+// of the z/Architecture Principles of Operation document (e.g. arch8).
+//
+// The minimum architecture level supported by LLVM is as defined in
+// the Eighth Edition of the PoP (i.e. as implemented on z10).
+//
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"generic", NoSchedModel, []>;
+
+def : ProcessorModel<"arch8", NoSchedModel, Arch8SupportedFeatures.List>;
+def : ProcessorModel<"z10", NoSchedModel, Arch8SupportedFeatures.List>;
+
+def : ProcessorModel<"arch9", Z196Model, Arch9SupportedFeatures.List>;
+def : ProcessorModel<"z196", Z196Model, Arch9SupportedFeatures.List>;
+
+def : ProcessorModel<"arch10", ZEC12Model, Arch10SupportedFeatures.List>;
+def : ProcessorModel<"zEC12", ZEC12Model, Arch10SupportedFeatures.List>;
+
+def : ProcessorModel<"arch11", Z13Model, Arch11SupportedFeatures.List>;
+def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>;
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
new file mode 100644
index 000000000000..6ef8000d6f43
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -0,0 +1,159 @@
+//===-- SystemZRegisterInfo.cpp - SystemZ register information ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstrInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "SystemZGenRegisterInfo.inc"
+
+SystemZRegisterInfo::SystemZRegisterInfo()
+ : SystemZGenRegisterInfo(SystemZ::R14D) {}
+
+const MCPhysReg *
+SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ if (MF->getSubtarget().getTargetLowering()->supportSwiftError() &&
+ MF->getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_SystemZ_SwiftError_SaveList;
+ return CSR_SystemZ_SaveList;
+}
+
+const uint32_t *
+SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ if (MF.getSubtarget().getTargetLowering()->supportSwiftError() &&
+ MF.getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_SystemZ_SwiftError_RegMask;
+ return CSR_SystemZ_RegMask;
+}
+
+BitVector
+SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const SystemZFrameLowering *TFI = getFrameLowering(MF);
+
+ if (TFI->hasFP(MF)) {
+ // R11D is the frame pointer. Reserve all aliases.
+ Reserved.set(SystemZ::R11D);
+ Reserved.set(SystemZ::R11L);
+ Reserved.set(SystemZ::R11H);
+ Reserved.set(SystemZ::R10Q);
+ }
+
+ // R15D is the stack pointer. Reserve all aliases.
+ Reserved.set(SystemZ::R15D);
+ Reserved.set(SystemZ::R15L);
+ Reserved.set(SystemZ::R15H);
+ Reserved.set(SystemZ::R14Q);
+
+ // A0 and A1 hold the thread pointer.
+ Reserved.set(SystemZ::A0);
+ Reserved.set(SystemZ::A1);
+
+ return Reserved;
+}
+
+void
+SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Outgoing arguments should be part of the frame");
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ MachineFunction &MF = *MBB.getParent();
+ auto *TII =
+ static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SystemZFrameLowering *TFI = getFrameLowering(MF);
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Decompose the frame index into a base and offset.
+ int FrameIndex = MI->getOperand(FIOperandNum).getIndex();
+ unsigned BasePtr;
+ int64_t Offset = (TFI->getFrameIndexReference(MF, FrameIndex, BasePtr) +
+ MI->getOperand(FIOperandNum + 1).getImm());
+
+ // Special handling of dbg_value instructions.
+ if (MI->isDebugValue()) {
+ MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, /*isDef*/ false);
+ MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ // See if the offset is in range, or if an equivalent instruction that
+ // accepts the offset exists.
+ unsigned Opcode = MI->getOpcode();
+ unsigned OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
+ if (OpcodeForOffset) {
+ if (OpcodeForOffset == SystemZ::LE &&
+ MF.getSubtarget<SystemZSubtarget>().hasVector()) {
+ // If LE is ok for offset, use LDE instead on z13.
+ OpcodeForOffset = SystemZ::LDE32;
+ }
+ MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+ }
+ else {
+ // Create an anchor point that is in range. Start at 0xffff so that we
+ // can use LLILH to load the immediate.
+ int64_t OldOffset = Offset;
+ int64_t Mask = 0xffff;
+ do {
+ Offset = OldOffset & Mask;
+ OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
+ Mask >>= 1;
+ assert(Mask && "One offset must be OK");
+ } while (!OpcodeForOffset);
+
+ unsigned ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ int64_t HighOffset = OldOffset - Offset;
+
+ if (MI->getDesc().TSFlags & SystemZII::HasIndex
+ && MI->getOperand(FIOperandNum + 2).getReg() == 0) {
+ // Load the offset into the scratch register and use it as an index.
+ // The scratch register then dies here.
+ TII->loadImmediate(MBB, MI, ScratchReg, HighOffset);
+ MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+ MI->getOperand(FIOperandNum + 2).ChangeToRegister(ScratchReg,
+ false, false, true);
+ } else {
+ // Load the anchor address into a scratch register.
+ unsigned LAOpcode = TII->getOpcodeForOffset(SystemZ::LA, HighOffset);
+ if (LAOpcode)
+ BuildMI(MBB, MI, DL, TII->get(LAOpcode),ScratchReg)
+ .addReg(BasePtr).addImm(HighOffset).addReg(0);
+ else {
+ // Load the high offset into the scratch register and add the base
+ // pointer to it, forming the anchor address.
+ TII->loadImmediate(MBB, MI, ScratchReg, HighOffset);
+ BuildMI(MBB, MI, DL, TII->get(SystemZ::AGR),ScratchReg)
+ .addReg(ScratchReg, RegState::Kill).addReg(BasePtr);
+ }
+
+ // Use the scratch register as the base. It then dies here.
+ MI->getOperand(FIOperandNum).ChangeToRegister(ScratchReg,
+ false, false, true);
+ }
+ }
+ MI->setDesc(TII->get(OpcodeForOffset));
+ MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned
+SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const SystemZFrameLowering *TFI = getFrameLowering(MF);
+ return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
new file mode 100644
index 000000000000..e41c06c98af2
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -0,0 +1,67 @@
+//===-- SystemZRegisterInfo.h - SystemZ register information ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
+
+#include "SystemZ.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "SystemZGenRegisterInfo.inc"
+
+namespace llvm {
+
+namespace SystemZ {
+// Return the subreg to use for referring to the even and odd registers
+// in a GR128 pair. Is32bit says whether we want a GR32 or GR64.
+inline unsigned even128(bool Is32bit) {
+ return Is32bit ? subreg_hl32 : subreg_h64;
+}
+inline unsigned odd128(bool Is32bit) {
+ return Is32bit ? subreg_l32 : subreg_l64;
+}
+} // end namespace SystemZ
+
+struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
+public:
+ SystemZRegisterInfo();
+
+ /// getPointerRegClass - Return the register class to use to hold pointers.
+ /// This is currently only used by LOAD_STACK_GUARD, which requires a non-%r0
+ /// register, hence ADDR64.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind=0) const override {
+ return &SystemZ::ADDR64BitRegClass;
+ }
+
+ // Override TargetRegisterInfo.h.
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+ return true;
+ }
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const override;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const override;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
new file mode 100644
index 000000000000..47d2f75cc11a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -0,0 +1,306 @@
+//==- SystemZRegisterInfo.td - SystemZ register definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Class definitions.
+//===----------------------------------------------------------------------===//
+
+class SystemZReg<string n> : Register<n> {
+ let Namespace = "SystemZ";
+}
+
+class SystemZRegWithSubregs<string n, list<Register> subregs>
+ : RegisterWithSubRegs<n, subregs> {
+ let Namespace = "SystemZ";
+}
+
+let Namespace = "SystemZ" in {
+def subreg_l32 : SubRegIndex<32, 0>; // Also acts as subreg_ll32.
+def subreg_h32 : SubRegIndex<32, 32>; // Also acts as subreg_lh32.
+def subreg_l64 : SubRegIndex<64, 0>;
+def subreg_h64 : SubRegIndex<64, 64>;
+def subreg_r32 : SubRegIndex<32, 32>; // Reinterpret a wider reg as 32 bits.
+def subreg_r64 : SubRegIndex<64, 64>; // Reinterpret a wider reg as 64 bits.
+def subreg_hh32 : ComposedSubRegIndex<subreg_h64, subreg_h32>;
+def subreg_hl32 : ComposedSubRegIndex<subreg_h64, subreg_l32>;
+def subreg_hr32 : ComposedSubRegIndex<subreg_h64, subreg_r32>;
+}
+
+// Define a register class that contains values of types TYPES and an
+// associated operand called NAME. SIZE is the size and alignment
+// of the registers and REGLIST is the list of individual registers.
+multiclass SystemZRegClass<string name, list<ValueType> types, int size,
+ dag regList, bit allocatable = 1> {
+ def AsmOperand : AsmOperandClass {
+ let Name = name;
+ let ParserMethod = "parse"##name;
+ let RenderMethod = "addRegOperands";
+ }
+ let isAllocatable = allocatable in
+ def Bit : RegisterClass<"SystemZ", types, size, regList> {
+ let Size = size;
+ }
+ def "" : RegisterOperand<!cast<RegisterClass>(name##"Bit")> {
+ let ParserMatchClass = !cast<AsmOperandClass>(name##"AsmOperand");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// General-purpose registers
+//===----------------------------------------------------------------------===//
+
+// Lower or upper 32 bits of one of the 16 64-bit general-purpose registers
+class GPR32<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+
+// One of the 16 64-bit general-purpose registers.
+class GPR64<bits<16> num, string n, GPR32 low, GPR32 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
+ let HWEncoding = num;
+ let SubRegIndices = [subreg_l32, subreg_h32];
+}
+
+// 8 even-odd pairs of GPR64s.
+class GPR128<bits<16> num, string n, GPR64 low, GPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
+ let HWEncoding = num;
+ let SubRegIndices = [subreg_l64, subreg_h64];
+}
+
+// General-purpose registers
+foreach I = 0-15 in {
+ def R#I#L : GPR32<I, "r"#I>;
+ def R#I#H : GPR32<I, "r"#I>;
+ def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"L"), !cast<GPR32>("R"#I#"H")>,
+ DwarfRegNum<[I]>;
+}
+
+foreach I = [0, 2, 4, 6, 8, 10, 12, 14] in {
+ def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#!add(I, 1)#"D"),
+ !cast<GPR64>("R"#I#"D")>;
+}
+
+/// Allocate the callee-saved R6-R13 backwards. That way they can be saved
+/// together with R14 and R15 in one prolog instruction.
+defm GR32 : SystemZRegClass<"GR32", [i32], 32,
+ (add (sequence "R%uL", 0, 5),
+ (sequence "R%uL", 15, 6))>;
+defm GRH32 : SystemZRegClass<"GRH32", [i32], 32,
+ (add (sequence "R%uH", 0, 5),
+ (sequence "R%uH", 15, 6))>;
+defm GR64 : SystemZRegClass<"GR64", [i64], 64,
+ (add (sequence "R%uD", 0, 5),
+ (sequence "R%uD", 15, 6))>;
+
+// Combine the low and high GR32s into a single class. This can only be
+// used for virtual registers if the high-word facility is available.
+defm GRX32 : SystemZRegClass<"GRX32", [i32], 32,
+ (add (sequence "R%uL", 0, 5),
+ (sequence "R%uH", 0, 5),
+ R15L, R15H, R14L, R14H, R13L, R13H,
+ R12L, R12H, R11L, R11H, R10L, R10H,
+ R9L, R9H, R8L, R8H, R7L, R7H, R6L, R6H)>;
+
+// The architecture doesn't really have any i128 support, so model the
+// register pairs as untyped instead.
+defm GR128 : SystemZRegClass<"GR128", [untyped], 128,
+ (add R0Q, R2Q, R4Q, R12Q, R10Q, R8Q, R6Q, R14Q)>;
+
+// Base and index registers. Everything except R0, which in an address
+// context evaluates as 0.
+defm ADDR32 : SystemZRegClass<"ADDR32", [i32], 32, (sub GR32Bit, R0L)>;
+defm ADDR64 : SystemZRegClass<"ADDR64", [i64], 64, (sub GR64Bit, R0D)>;
+
+// Not used directly, but needs to exist for ADDR32 and ADDR64 subregs
+// of a GR128.
+defm ADDR128 : SystemZRegClass<"ADDR128", [untyped], 128, (sub GR128Bit, R0Q)>;
+
+// Any type register. Used for .insn directives when we don't know what the
+// register types could be.
+defm AnyReg : SystemZRegClass<"AnyReg",
+ [i64, f64, v8i8, v4i16, v2i32, v2f32], 64,
+ (add (sequence "R%uD", 0, 15),
+ (sequence "F%uD", 0, 15),
+ (sequence "V%u", 0, 15))>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point registers
+//===----------------------------------------------------------------------===//
+
+// Maps FPR register numbers to their DWARF encoding.
+class DwarfMapping<int id> { int Id = id; }
+
+def F0Dwarf : DwarfMapping<16>;
+def F2Dwarf : DwarfMapping<17>;
+def F4Dwarf : DwarfMapping<18>;
+def F6Dwarf : DwarfMapping<19>;
+
+def F1Dwarf : DwarfMapping<20>;
+def F3Dwarf : DwarfMapping<21>;
+def F5Dwarf : DwarfMapping<22>;
+def F7Dwarf : DwarfMapping<23>;
+
+def F8Dwarf : DwarfMapping<24>;
+def F10Dwarf : DwarfMapping<25>;
+def F12Dwarf : DwarfMapping<26>;
+def F14Dwarf : DwarfMapping<27>;
+
+def F9Dwarf : DwarfMapping<28>;
+def F11Dwarf : DwarfMapping<29>;
+def F13Dwarf : DwarfMapping<30>;
+def F15Dwarf : DwarfMapping<31>;
+
+def F16Dwarf : DwarfMapping<68>;
+def F18Dwarf : DwarfMapping<69>;
+def F20Dwarf : DwarfMapping<70>;
+def F22Dwarf : DwarfMapping<71>;
+
+def F17Dwarf : DwarfMapping<72>;
+def F19Dwarf : DwarfMapping<73>;
+def F21Dwarf : DwarfMapping<74>;
+def F23Dwarf : DwarfMapping<75>;
+
+def F24Dwarf : DwarfMapping<76>;
+def F26Dwarf : DwarfMapping<77>;
+def F28Dwarf : DwarfMapping<78>;
+def F30Dwarf : DwarfMapping<79>;
+
+def F25Dwarf : DwarfMapping<80>;
+def F27Dwarf : DwarfMapping<81>;
+def F29Dwarf : DwarfMapping<82>;
+def F31Dwarf : DwarfMapping<83>;
+
+// Upper 32 bits of one of the floating-point registers
+class FPR32<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+
+// One of the floating-point registers.
+class FPR64<bits<16> num, string n, FPR32 high>
+ : SystemZRegWithSubregs<n, [high]> {
+ let HWEncoding = num;
+ let SubRegIndices = [subreg_r32];
+}
+
+// 8 pairs of FPR64s, with a one-register gap in between.
+class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
+ let HWEncoding = num;
+ let SubRegIndices = [subreg_l64, subreg_h64];
+}
+
+// Floating-point registers. Registers 16-31 require the vector facility.
+foreach I = 0-15 in {
+ def F#I#S : FPR32<I, "f"#I>;
+ def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>,
+ DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
+}
+foreach I = 16-31 in {
+ def F#I#S : FPR32<I, "v"#I>;
+ def F#I#D : FPR64<I, "v"#I, !cast<FPR32>("F"#I#"S")>,
+ DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
+}
+
+foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
+ def F#I#Q : FPR128<I, "f"#I, !cast<FPR64>("F"#!add(I, 2)#"D"),
+ !cast<FPR64>("F"#I#"D")>;
+}
+
+// There's no store-multiple instruction for FPRs, so we're not fussy
+// about the order in which call-saved registers are allocated.
+defm FP32 : SystemZRegClass<"FP32", [f32], 32, (sequence "F%uS", 0, 15)>;
+defm FP64 : SystemZRegClass<"FP64", [f64], 64, (sequence "F%uD", 0, 15)>;
+defm FP128 : SystemZRegClass<"FP128", [f128], 128,
+ (add F0Q, F1Q, F4Q, F5Q, F8Q, F9Q, F12Q, F13Q)>;
+
+//===----------------------------------------------------------------------===//
+// Vector registers
+//===----------------------------------------------------------------------===//
+
+// A full 128-bit vector register, with an FPR64 as its high part.
+class VR128<bits<16> num, string n, FPR64 high>
+ : SystemZRegWithSubregs<n, [high]> {
+ let HWEncoding = num;
+ let SubRegIndices = [subreg_r64];
+}
+
+// Full vector registers.
+foreach I = 0-31 in {
+ def V#I : VR128<I, "v"#I, !cast<FPR64>("F"#I#"D")>,
+ DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
+}
+
+// Class used to store 32-bit values in the first element of a vector
+// register. f32 scalars are used for the WLEDB and WLDEB instructions.
+defm VR32 : SystemZRegClass<"VR32", [f32, v4i8, v2i16], 32,
+ (add (sequence "F%uS", 0, 7),
+ (sequence "F%uS", 16, 31),
+ (sequence "F%uS", 8, 15))>;
+
+// Class used to store 64-bit values in the upper half of a vector register.
+// The vector facility also includes scalar f64 instructions that operate
+// on the full vector register set.
+defm VR64 : SystemZRegClass<"VR64", [f64, v8i8, v4i16, v2i32, v2f32], 64,
+ (add (sequence "F%uD", 0, 7),
+ (sequence "F%uD", 16, 31),
+ (sequence "F%uD", 8, 15))>;
+
+// The subset of vector registers that can be used for floating-point
+// operations too.
+defm VF128 : SystemZRegClass<"VF128",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+ (sequence "V%u", 0, 15)>;
+
+// All vector registers.
+defm VR128 : SystemZRegClass<"VR128",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+ (add (sequence "V%u", 0, 7),
+ (sequence "V%u", 16, 31),
+ (sequence "V%u", 8, 15))>;
+
+// Attaches a ValueType to a register operand, to make the instruction
+// definitions easier.
+class TypedReg<ValueType vtin, RegisterOperand opin> {
+ ValueType vt = vtin;
+ RegisterOperand op = opin;
+}
+
+def v32eb : TypedReg<f32, VR32>;
+def v64g : TypedReg<i64, VR64>;
+def v64db : TypedReg<f64, VR64>;
+def v128b : TypedReg<v16i8, VR128>;
+def v128h : TypedReg<v8i16, VR128>;
+def v128f : TypedReg<v4i32, VR128>;
+def v128g : TypedReg<v2i64, VR128>;
+def v128q : TypedReg<v16i8, VR128>;
+def v128eb : TypedReg<v4f32, VR128>;
+def v128db : TypedReg<v2f64, VR128>;
+def v128any : TypedReg<untyped, VR128>;
+
+//===----------------------------------------------------------------------===//
+// Other registers
+//===----------------------------------------------------------------------===//
+
+// The 2-bit condition code field of the PSW. Every register named in an
+// inline asm needs a class associated with it.
+def CC : SystemZReg<"cc">;
+let isAllocatable = 0 in
+ def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
+
+// Access registers.
+class ACR32<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+foreach I = 0-15 in {
+ def A#I : ACR32<I, "a"#I>, DwarfRegNum<[!add(I, 48)]>;
+}
+defm AR32 : SystemZRegClass<"AR32", [i32], 32,
+ (add (sequence "A%u", 0, 15)), 0>;
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td b/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
new file mode 100644
index 000000000000..dbba8ab42b5a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
@@ -0,0 +1,77 @@
+//==-- SystemZSchedule.td - SystemZ Scheduling Definitions ----*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Scheduler resources
+// Resources ending with a '2' use that resource for 2 cycles. An instruction
+// using two such resources uses the mapped unit for 4 cycles, and 2 is added
+// to the total number of uops of the sched class.
+
+// These three resources are used to express decoder grouping rules.
+// The number of decoder slots needed by an instruction is normally
+// one. For a cracked instruction (BeginGroup && !EndGroup) it is
+// two. Expanded instructions (BeginGroup && EndGroup) group alone.
+def GroupAlone : SchedWrite;
+def BeginGroup : SchedWrite;
+def EndGroup : SchedWrite;
+
+// Latencies, to make code a bit neater. If more than one resource is
+// used for an instruction, the greatest latency (not the sum) will be
+// output by Tablegen. Therefore, in such cases one of these resources
+// is needed.
+def Lat2 : SchedWrite;
+def Lat3 : SchedWrite;
+def Lat4 : SchedWrite;
+def Lat5 : SchedWrite;
+def Lat6 : SchedWrite;
+def Lat7 : SchedWrite;
+def Lat8 : SchedWrite;
+def Lat9 : SchedWrite;
+def Lat10 : SchedWrite;
+def Lat11 : SchedWrite;
+def Lat12 : SchedWrite;
+def Lat15 : SchedWrite;
+def Lat20 : SchedWrite;
+def Lat30 : SchedWrite;
+
+// Fixed-point
+def FXa : SchedWrite;
+def FXa2 : SchedWrite;
+def FXb : SchedWrite;
+def FXU : SchedWrite;
+
+// Load/store unit
+def LSU : SchedWrite;
+
+// Model a return without latency, otherwise if-converter will model
+// extra cost and abort (currently there is an assert that checks that
+// all instructions have at least one uop).
+def LSU_lat1 : SchedWrite;
+
+// Floating point unit (zEC12 and earlier)
+def FPU : SchedWrite;
+def FPU2 : SchedWrite;
+
+// Vector sub units (z13)
+def VecBF : SchedWrite;
+def VecBF2 : SchedWrite;
+def VecDF : SchedWrite;
+def VecDF2 : SchedWrite;
+def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
+def VecMul : SchedWrite;
+def VecStr : SchedWrite;
+def VecXsPm : SchedWrite;
+
+// Virtual branching unit
+def VBU : SchedWrite;
+
+
+include "SystemZScheduleZ13.td"
+include "SystemZScheduleZEC12.td"
+include "SystemZScheduleZ196.td"
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
new file mode 100644
index 000000000000..e97d61d8355d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -0,0 +1,1064 @@
+//-- SystemZScheduleZ13.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Z13 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Z13Model : SchedMachineModel { // Machine model for Z13 scheduling and cost heuristics.
+
+ let UnsupportedFeatures = Arch11UnsupportedFeatures.List; // Taken from the Arch11 unsupported-feature list.
+
+ let IssueWidth = 8; // NOTE(review): presumably micro-ops issued per cycle - confirm against TargetSchedule.td.
+ let MicroOpBufferSize = 60; // Issue queues
+ let LoadLatency = 1; // Optimistic load latency.
+
+ let PostRAScheduler = 1; // NOTE(review): presumably enables post-RA scheduling - confirm against TargetSchedule.td.
+
+ // Extra cycles for a mispredicted branch.
+ let MispredictPenalty = 20;
+}
+
+let SchedModel = Z13Model in {
+
+// These definitions could be put in a subtarget common include file,
+// but it seems the include system in Tablegen currently rejects
+// multiple includes of the same file.
+def : WriteRes<GroupAlone, []> { // No resources, zero micro-ops; sets both group flags.
+ let NumMicroOps = 0;
+ let BeginGroup = 1;
+ let EndGroup = 1;
+}
+def : WriteRes<BeginGroup, []> { // No resources, zero micro-ops; sets only the begin-group flag.
+ let NumMicroOps = 0;
+ let BeginGroup = 1;
+}
+def : WriteRes<EndGroup, []> { // No resources, zero micro-ops; sets only the end-group flag.
+ let NumMicroOps = 0;
+ let EndGroup = 1;
+}
+def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
+def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
+def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
+def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
+def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
+def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
+def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
+def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
+def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
+def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
+def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
+def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
+def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
+def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
+
+// Execution units.
+def Z13_FXaUnit : ProcResource<2>; // Used by the FXa and FXa2 writes.
+def Z13_FXbUnit : ProcResource<2>; // Used by the FXb write.
+def Z13_LSUnit : ProcResource<2>; // Used by the LSU write.
+def Z13_VecUnit : ProcResource<2>; // Used by VecBF(2), VecDF(2), VecMul, VecStr and VecXsPm.
+def Z13_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ } // Used by the VecFPd write.
+def Z13_VBUnit : ProcResource<2>; // Used by the VBU write.
+
+// Subtarget specific definitions of scheduling resources.
+def : WriteRes<FXa, [Z13_FXaUnit]> { let Latency = 1; }
+def : WriteRes<FXa2, [Z13_FXaUnit, Z13_FXaUnit]> { let Latency = 2; }
+def : WriteRes<FXb, [Z13_FXbUnit]> { let Latency = 1; }
+def : WriteRes<LSU, [Z13_LSUnit]> { let Latency = 4; }
+def : WriteRes<VecBF, [Z13_VecUnit]> { let Latency = 8; }
+def : WriteRes<VecBF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDF, [Z13_VecUnit]> { let Latency = 8; }
+def : WriteRes<VecDF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecFPd, [Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+ Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+ Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+ Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+ Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+ Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+ Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+ Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+ Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+ Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit]>
+ { let Latency = 30; } // Unit is listed 30 times above (10 rows of 3), matching this latency; presumably models the blocking unit as busy throughout - confirm.
+def : WriteRes<VecMul, [Z13_VecUnit]> { let Latency = 5; }
+def : WriteRes<VecStr, [Z13_VecUnit]> { let Latency = 4; }
+def : WriteRes<VecXsPm, [Z13_VecUnit]> { let Latency = 3; }
+def : WriteRes<VBU, [Z13_VBUnit]>; // Virtual Branching Unit
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone],
+ (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[FXb, FXb, Lat2, GroupAlone],
+ (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[FXb, EndGroup], (instregex "Return$")>;
+def : InstRW<[FXb], (instregex "CondReturn$")>;
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// Select pseudo
+def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>;
+
+// CondStore pseudos
+def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Moves
+def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
+
+// Move character
+def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
+
+// Pseudo -> reg move
+def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>;
+def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>;
+
+// Loads
+def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>;
+def : InstRW<[LSU], (instregex "LG(RL)?$")>;
+def : InstRW<[LSU], (instregex "L128$")>;
+
+def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>;
+
+def : InstRW<[FXa], (instregex "LG(F|H)I$")>;
+def : InstRW<[FXa], (instregex "LHI(Mux)?$")>;
+def : InstRW<[FXa], (instregex "LR(Mux)?$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSU], (instregex "LZR(F|G)$")>;
+
+// Load and trap
+def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+
+// Load and test
+def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>;
+def : InstRW<[FXa], (instregex "LT(G)?R$")>;
+
+// Stores
+def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+
+// String moves.
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>;
+def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
+def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Asm.*)?$")>;
+def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "L(B|H|G)R$")>;
+def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>;
+def : InstRW<[FXa], (instregex "LTGFR$")>;
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSU], (instregex "LLHRL$")>;
+def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSU], (instregex "LLZRGF$")>;
+
+// Load and trap
+def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Load multiple (estimated average of 5 ops)
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+ (instregex "LM(H|Y|G)?$")>;
+
+// Store multiple (estimated average of ceil(5/2) FXb ops)
+def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
+ GroupAlone], (instregex "STM(G|H|Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>;
+
+// Load the Global Offset Table address ( -> larl )
+def : InstRW<[FXa], (instregex "GOT$")>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat2], (instregex "LP(G)?R$")>;
+def : InstRW<[FXa, FXa, Lat3, BeginGroup], (instregex "L(N|P)GFR$")>;
+def : InstRW<[FXa, Lat2], (instregex "LN(R|GR)$")>;
+def : InstRW<[FXa], (instregex "LC(R|GR)$")>;
+def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
+def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[FXa], (instregex "IIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "IIHH(64)?$")>;
+def : InstRW<[FXa], (instregex "IIHL(64)?$")>;
+def : InstRW<[FXa], (instregex "IILF(64)?$")>;
+def : InstRW<[FXa], (instregex "IILH(64)?$")>;
+def : InstRW<[FXa], (instregex "IILL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>;
+def : InstRW<[FXa], (instregex "AIH$")>;
+def : InstRW<[FXa], (instregex "AFI(Mux)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;
+def : InstRW<[FXa], (instregex "AGFI$")>;
+def : InstRW<[FXa], (instregex "AGHI(K)?$")>;
+def : InstRW<[FXa], (instregex "AGR(K)?$")>;
+def : InstRW<[FXa], (instregex "AHI(K)?$")>;
+def : InstRW<[FXa], (instregex "AHIMux(K)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>;
+def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXa], (instregex "ALGHSIK$")>;
+def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
+def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
+def : InstRW<[FXa], (instregex "ALR(K)?$")>;
+def : InstRW<[FXa], (instregex "AR(K)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "A(G)?SI$")>;
+
+// Logical addition with carry
+def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
+def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>;
+
+// Add with sign extension (32 -> 64)
+def : InstRW<[FXa, LSU, Lat6], (instregex "AGF$")>;
+def : InstRW<[FXa, Lat2], (instregex "AGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>;
+def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>;
+def : InstRW<[FXa], (instregex "SGR(K)?$")>;
+def : InstRW<[FXa], (instregex "SLFI$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
+def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
+def : InstRW<[FXa], (instregex "SLR(K)?$")>;
+def : InstRW<[FXa], (instregex "SR(K)?$")>;
+
+// Subtraction with borrow
+def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
+def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>;
+
+// Subtraction with sign extension (32 -> 64)
+def : InstRW<[FXa, LSU, Lat6], (instregex "SGF$")>;
+def : InstRW<[FXa, Lat2], (instregex "SGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>;
+def : InstRW<[FXa], (instregex "NGR(K)?$")>;
+def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>;
+def : InstRW<[FXa], (instregex "NIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "NIHH(64)?$")>;
+def : InstRW<[FXa], (instregex "NIHL(64)?$")>;
+def : InstRW<[FXa], (instregex "NILF(64)?$")>;
+def : InstRW<[FXa], (instregex "NILH(64)?$")>;
+def : InstRW<[FXa], (instregex "NILL(64)?$")>;
+def : InstRW<[FXa], (instregex "NR(K)?$")>;
+def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>;
+def : InstRW<[FXa], (instregex "OGR(K)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>;
+def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXa], (instregex "OIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "OIHH(64)?$")>;
+def : InstRW<[FXa], (instregex "OIHL(64)?$")>;
+def : InstRW<[FXa], (instregex "OILF(64)?$")>;
+def : InstRW<[FXa], (instregex "OILH(64)?$")>;
+def : InstRW<[FXa], (instregex "OILL(64)?$")>;
+def : InstRW<[FXa], (instregex "OR(K)?$")>;
+def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>;
+def : InstRW<[FXa], (instregex "XIFMux$")>;
+def : InstRW<[FXa], (instregex "XGR(K)?$")>;
+def : InstRW<[FXa], (instregex "XIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "XILF(64)?$")>;
+def : InstRW<[FXa], (instregex "XR(K)?$")>;
+def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
+def : InstRW<[FXa, Lat6], (instregex "MS(R|FI)$")>;
+def : InstRW<[FXa, LSU, Lat12], (instregex "MSG$")>;
+def : InstRW<[FXa, Lat8], (instregex "MSGR$")>;
+def : InstRW<[FXa, Lat6], (instregex "MSGF(I|R)$")>;
+def : InstRW<[FXa, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXa, Lat5], (instregex "MGHI$")>;
+def : InstRW<[FXa, Lat5], (instregex "MHI$")>;
+def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[LSU, FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
+def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
+def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
+def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
+def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
+def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
+def : InstRW<[FXa], (instregex "SLA(K)?$")>;
+
+// Rotate
+def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
+
+// Rotate and insert
+def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[FXa], (instregex "RISBMux$")>;
+
+// Rotate and Select
+def : InstRW<[FXa, FXa, Lat3, BeginGroup], (instregex "R(N|O|X)SBG$")>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[FXb], (instregex "CG(F|H)I$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[FXb], (instregex "C(G)?R$")>;
+def : InstRW<[FXb], (instregex "CIH$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
+def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>;
+def : InstRW<[FXb], (instregex "CLGF(I|R)$")>;
+def : InstRW<[FXb], (instregex "CLGR$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
+def : InstRW<[FXb], (instregex "CLIH$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
+def : InstRW<[FXb], (instregex "CLR$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
+
+// Compare halfword
+def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
+def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>;
+def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>;
+
+// Compare with sign extension (32 -> 64)
+def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>;
+def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
+
+// Compare logical character
+def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
+
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+
+// Test under mask
+def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>;
+def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[FXb], (instregex "TMHH(64)?$")>;
+def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
+def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
+def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch and execution hint
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
+def : InstRW<[FXb, Lat2], (instregex "BPP$")>;
+def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
+def : InstRW<[FXb], (instregex "NIAI$")>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>;
+
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>;
+
+// Test and set
+def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>;
+
+// Compare and swap
+def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+
+// Compare double and swap
+def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone],
+ (instregex "CDS(Y)?$")>;
+def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
+ (instregex "CDSG$")>;
+
+// Compare and swap and store
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CSST$")>;
+
+// Perform locked operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+
+// Load/store pair from/to quadword
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+
+// Load pair disjoint
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Extract/set/copy access register
+def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+
+// Load address extended
+def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>;
+
+// Load/store access multiple (not modeled precisely)
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Insert Program Mask
+def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>;
+
+// Set Program Mask
+def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+
+// Branch and link
+def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>;
+
+// Test addressing mode
+def : InstRW<[FXb], (instregex "TAM$")>;
+
+// Set addressing mode
+def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>;
+
+// Branch (and save) and set mode.
+def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>;
+
+//===----------------------------------------------------------------------===//
+// Transactional execution
+//===----------------------------------------------------------------------===//
+
+// Transaction begin
+def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone],
+ (instregex "TBEGIN(C|_nofloat)?$")>;
+
+// Transaction end
+def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>;
+
+// Transaction abort
+def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+
+// Extract Transaction Nesting Depth
+def : InstRW<[FXa], (instregex "ETND$")>;
+
+// Nontransactional store
+def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>;
+
+//===----------------------------------------------------------------------===//
+// Processor assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "PPA$")>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one
+def : InstRW<[FXa, Lat6, GroupAlone], (instregex "FLOGR$")>;
+
+// Population count
+def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;
+
+// Extend
+def : InstRW<[FXa], (instregex "AEXT128_64$")>;
+def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>;
+
+// String instructions
+def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
+
+// Move with key
+def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+
+// Extract CPU Time
+def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>;
+
+// Execute
+def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
+
+// Program return
+def : InstRW<[FXb, Lat30], (instregex "PR$")>;
+
+// Inline assembly
+def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
+ (instregex "STCK(F)?$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
+ (instregex "STCKE$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "STFLE$")>;
+def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
+
+// Store real address
+def : InstRW<[FXb, LSU, Lat5], (instregex "STRAG$")>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// An "empty" sched-class will be assigned instead of the "invalid sched-class".
+// getNumDecoderSlots() will then return 1 instead of 0.
+def : InstRW<[], (instregex "Insn.*")>;
+
+
+// ----------------------------- Floating point ----------------------------- //
+
+//===----------------------------------------------------------------------===//
+// FP: Select instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "SelectF(32|64|128)$")>;
+def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero
+def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>;
+
+// Load
+def : InstRW<[VecXsPm], (instregex "LER$")>;
+def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[FXb, Lat3], (instregex "LGDR$")>;
+def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>;
+
+// Load and Test
+def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone],
+ (instregex "LTXBRCompare(_VecPseudo)?$")>;
+
+// Copy sign
+def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>;
+def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Load instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>;
+def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSU], (instregex "LX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Store instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>;
+def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>;
+def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>;
+def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>;
+
+// Load lengthened
+def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>;
+def : InstRW<[VecBF], (instregex "LDEBR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)BR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>;
+def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>;
+def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>;
+def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+
+// Square root
+def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>;
+def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>;
+
+// Load FP integer
+def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>;
+def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
+def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "AXBR$")>;
+
+// Subtraction
+def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
+def : InstRW<[VecBF], (instregex "S(E|D)BR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>;
+
+// Multiply
+def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;
+
+// Multiply and add / subtract
+def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;
+
+// Division
+def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
+def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare ("BR$" must not also match CEB/CDB, which belong to the LSU entry)
+def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)B$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)BR$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXBR$")>;
+
+// Test Data Class
+def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
+def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "SFASR$")>;
+def : InstRW<[FXa, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
+def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+// --------------------------------- Vector --------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// Vector: Move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "VLR(32|64)?$")>;
+def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>;
+def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>;
+def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Immediate instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VZERO$")>;
+def : InstRW<[VecXsPm], (instregex "VONE$")>;
+def : InstRW<[VecXsPm], (instregex "VGBM$")>;
+def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
+def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Loads
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "VL(L|BB)?$")>;
+def : InstRW<[LSU], (instregex "VL(32|64)$")>;
+def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H)?$")>;
+def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>;
+def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+ (instregex "VLM$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Stores
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>;
+def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>;
+def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>;
+def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone],
+ (instregex "VSTM$")>;
+def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Selects and permutes
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VPERM$")>;
+def : InstRW<[VecXsPm], (instregex "VPDI$")>;
+def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VSEL$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Widening and narrowing
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VN(C|O)?$")>;
+def : InstRW<[VecXsPm], (instregex "VO$")>;
+def : InstRW<[VecMul], (instregex "VCKSM$")>;
+def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VX$")>;
+def : InstRW<[VecMul], (instregex "VGFM?$")>;
+def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>;
+def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>;
+def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>;
+def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>;
+def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMH(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VML(B|F)?$")>;
+def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>;
+def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>;
+
+def : InstRW<[VecXsPm], (instregex "VPOPCT$")>;
+
+def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>;
+
+def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>;
+def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSLB$")>;
+def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>;
+def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSR(A|L)B$")>;
+
+def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
+def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>;
+def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>;
+
+def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>;
+def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VTM$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecBF2], (instregex "VCD(G|GB|LG|LGB)$")>;
+def : InstRW<[VecBF], (instregex "WCD(GB|LGB)$")>;
+def : InstRW<[VecBF2], (instregex "VC(L)?GD$")>;
+def : InstRW<[VecBF2], (instregex "VFADB$")>;
+def : InstRW<[VecBF], (instregex "WFADB$")>;
+def : InstRW<[VecBF2], (instregex "VCGDB$")>;
+def : InstRW<[VecBF], (instregex "WCGDB$")>;
+def : InstRW<[VecBF2], (instregex "VF(I|M|A|S)$")>;
+def : InstRW<[VecBF2], (instregex "VF(I|M|S)DB$")>;
+def : InstRW<[VecBF], (instregex "WF(I|M|S)DB$")>;
+def : InstRW<[VecBF2], (instregex "VCLGDB$")>;
+def : InstRW<[VecBF], (instregex "WCLGDB$")>;
+def : InstRW<[VecXsPm], (instregex "VFL(C|N|P)DB$")>;
+def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)DB$")>;
+def : InstRW<[VecBF2], (instregex "VFM(A|S)$")>;
+def : InstRW<[VecBF2], (instregex "VFM(A|S)DB$")>;
+def : InstRW<[VecBF], (instregex "WFM(A|S)DB$")>;
+def : InstRW<[VecXsPm], (instregex "VFPSO$")>;
+def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI(DB)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WFTCIDB$")>;
+def : InstRW<[VecBF2], (instregex "VL(DE|ED)$")>;
+def : InstRW<[VecBF2], (instregex "VL(DE|ED)B$")>;
+def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>;
+
+// divide / square root
+def : InstRW<[VecFPd], (instregex "VFD$")>;
+def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>;
+def : InstRW<[VecFPd], (instregex "VFSQ$")>;
+def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)$")>;
+def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)DB$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>;
+def : InstRW<[VecXsPm], (instregex "WFC(E|H|HE)DB$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VFC(E|H|HE)DBS$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WFC(E|H|HE)DBS$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point insertion and extraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "LEFR$")>;
+def : InstRW<[FXb, Lat4], (instregex "LFER$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: String instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecStr], (instregex "VFAE(B)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>;
+def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
+
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
new file mode 100644
index 000000000000..a950e54e7601
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -0,0 +1,769 @@
+//=- SystemZScheduleZ196.td - SystemZ Scheduling Definitions ---*- tblgen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Z196 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Z196Model : SchedMachineModel {
+
+ let UnsupportedFeatures = Arch9UnsupportedFeatures.List;
+
+ let PostRAScheduler = 1;
+
+ // Cycles lost on a branch misprediction.
+ let MispredictPenalty = 16;
+
+ let LoadLatency = 1; // Deliberately optimistic.
+ let MicroOpBufferSize = 40; // Models the issue queues.
+ let IssueWidth = 5;
+}
+
+let SchedModel = Z196Model in {
+
+// These definitions could be put in a subtarget-common include file,
+// but it seems the include system in TableGen currently rejects
+// multiple includes of the same file.
+def : WriteRes<GroupAlone, []> {
+ let BeginGroup = 1;
+ let EndGroup = 1;
+ let NumMicroOps = 0; // Grouping marker only: no resources, no micro-ops.
+}
+def : WriteRes<EndGroup, []> {
+ let EndGroup = 1;
+ let NumMicroOps = 0; // Grouping marker only: no resources, no micro-ops.
+}
+def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0; }
+def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0; }
+def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0; }
+def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0; }
+def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0; }
+def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0; }
+def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0; }
+def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0; }
+def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0; }
+def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0; }
+def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0; }
+def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0; }
+def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0; }
+def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0; }
+
+// Execution units (the ProcResource argument is the number of such units).
+def Z196_FXUnit : ProcResource<2>;
+def Z196_LSUnit : ProcResource<2>;
+def Z196_FPUnit : ProcResource<1>;
+
+// Subtarget-specific scheduling resources: the unit(s) each write uses and its latency.
+def : WriteRes<FXU, [Z196_FXUnit]> { let Latency = 1; }
+def : WriteRes<LSU, [Z196_LSUnit]> { let Latency = 4; }
+def : WriteRes<LSU_lat1, [Z196_LSUnit]> { let Latency = 1; }
+def : WriteRes<FPU, [Z196_FPUnit]> { let Latency = 8; }
+def : WriteRes<FPU2, [Z196_FPUnit, Z196_FPUnit]> { let Latency = 9; }
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[LSU, EndGroup], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[LSU, EndGroup], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[LSU, EndGroup], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[LSU, EndGroup], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BRCT(G|H)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[FXU, FXU, FXU, LSU, Lat7, GroupAlone],
+ (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+ (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+ (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[LSU, EndGroup], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[FXU], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[FXU], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[FXU], (instregex "CL(F|G)IT(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[LSU_lat1, EndGroup], (instregex "Return$")>;
+def : InstRW<[LSU_lat1, EndGroup], (instregex "CondReturn$")>;
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// Select pseudo
+def : InstRW<[FXU], (instregex "Select(32|64|32Mux)$")>;
+
+// CondStore pseudos
+def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Moves
+def : InstRW<[FXU, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
+
+// Move character
+def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+
+// Pseudo -> reg move
+def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>;
+def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>;
+
+// Loads
+def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSU], (instregex "LG(RL)?$")>;
+def : InstRW<[LSU], (instregex "L128$")>;
+
+def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>;
+
+def : InstRW<[FXU], (instregex "LG(F|H)I$")>;
+def : InstRW<[FXU], (instregex "LHI(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LR(Mux)?$")>;
+
+// Load and test
+def : InstRW<[FXU, LSU, Lat5], (instregex "LT(G)?$")>;
+def : InstRW<[FXU], (instregex "LT(G)?R$")>;
+
+// Stores
+def : InstRW<[FXU, LSU, Lat5], (instregex "STG(RL)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ST128$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+
+// String moves.
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat2, EndGroup], (instregex "LOC(G)?R(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat6, EndGroup], (instregex "LOC(G)?(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "STOC(G)?(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+def : InstRW<[FXU], (instregex "L(B|H|G)R$")>;
+def : InstRW<[FXU], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LTGF$")>;
+def : InstRW<[FXU], (instregex "LTGFR$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LH(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LG(B|H|F)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LLG(C|F|H|T)R$")>;
+def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSU], (instregex "LLHRL$")>;
+def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Load multiple (estimated average of 5 ops)
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+ (instregex "LM(H|Y|G)?$")>;
+
+// Store multiple (estimated average of 3 FXU ops, plus 2 LSU ops)
+def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
+ (instregex "STM(H|Y|G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>;
+
+// Load the Global Offset Table address
+def : InstRW<[FXU], (instregex "GOT$")>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat2], (instregex "LP(G)?R$")>;
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "L(N|P)GFR$")>;
+def : InstRW<[FXU, Lat2], (instregex "LN(R|GR)$")>;
+def : InstRW<[FXU], (instregex "LC(R|GR)$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LCGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "IC(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "IC32(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
+def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[FXU], (instregex "IIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "IIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "IIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "IILF(64)?$")>;
+def : InstRW<[FXU], (instregex "IILH(64)?$")>;
+def : InstRW<[FXU], (instregex "IILL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>;
+def : InstRW<[FXU], (instregex "AIH$")>;
+def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
+def : InstRW<[FXU], (instregex "AGFI$")>;
+def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
+def : InstRW<[FXU], (instregex "AGR(K)?$")>;
+def : InstRW<[FXU], (instregex "AHI(K)?$")>;
+def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
+def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU], (instregex "ALGHSIK$")>;
+def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
+def : InstRW<[FXU], (instregex "ALR(K)?$")>;
+def : InstRW<[FXU], (instregex "AR(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+
+// Logical addition with carry
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "ALC(G)?R$")>;
+
+// Add with sign extension (32 -> 64)
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AGF$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "AGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "S(G|Y)?$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "SH(Y)?$")>;
+def : InstRW<[FXU], (instregex "SGR(K)?$")>;
+def : InstRW<[FXU], (instregex "SLFI$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
+def : InstRW<[FXU], (instregex "SLR(K)?$")>;
+def : InstRW<[FXU], (instregex "SR(K)?$")>;
+
+// Subtraction with borrow
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "SLB(G)?R$")>;
+
+// Subtraction with sign extension (32 -> 64)
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "SGF$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "SGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "N(G|Y)?$")>;
+def : InstRW<[FXU], (instregex "NGR(K)?$")>;
+def : InstRW<[FXU], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "NI(Y)?$")>;
+def : InstRW<[FXU], (instregex "NIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "NIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "NIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "NILF(64)?$")>;
+def : InstRW<[FXU], (instregex "NILH(64)?$")>;
+def : InstRW<[FXU], (instregex "NILL(64)?$")>;
+def : InstRW<[FXU], (instregex "NR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "NC$")>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "O(G|Y)?$")>;
+def : InstRW<[FXU], (instregex "OGR(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "OI(Y)?$")>;
+def : InstRW<[FXU], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXU], (instregex "OIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "OIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "OIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "OILF(64)?$")>;
+def : InstRW<[FXU], (instregex "OILH(64)?$")>;
+def : InstRW<[FXU], (instregex "OILL(64)?$")>;
+def : InstRW<[FXU], (instregex "OR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "OC$")>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "X(G|Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "XI(Y)?$")>;
+def : InstRW<[FXU], (instregex "XIFMux$")>;
+def : InstRW<[FXU], (instregex "XGR(K)?$")>;
+def : InstRW<[FXU], (instregex "XIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "XILF(64)?$")>;
+def : InstRW<[FXU], (instregex "XR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "XC$")>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
+def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
+def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
+def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
+def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
+def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
+def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
+def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DSG(F)?R$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DL(G)?R$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DL(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
+def : InstRW<[FXU, Lat2], (instregex "SLA(K)?$")>;
+
+// Rotate
+def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
+
+// Rotate and insert
+def : InstRW<[FXU], (instregex "RISBG(32)?$")>;
+def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[FXU], (instregex "RISBMux$")>;
+
+// Rotate and Select
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "R(N|O|X)SBG$")>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[FXU], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[FXU], (instregex "CG(F|H)I$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[FXU], (instregex "C(G)?R$")>;
+def : InstRW<[FXU], (instregex "CIH$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CH(F|SI)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
+def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLGF(RL)?$")>;
+def : InstRW<[FXU], (instregex "CLGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "CLGR$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLGRL$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
+def : InstRW<[FXU], (instregex "CLIH$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
+def : InstRW<[FXU], (instregex "CLR$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
+
+// Compare halfword
+def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>;
+def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CGH(RL)?$")>;
+def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CHHSI$")>;
+
+// Compare with sign extension (32 -> 64)
+def : InstRW<[FXU, FXU, LSU, Lat6, Lat2, GroupAlone], (instregex "CGF(RL)?$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>;
+
+// Compare logical character
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>;
+
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+
+// Test under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "TM(Y)?$")>;
+def : InstRW<[FXU], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[FXU], (instregex "TMHH(64)?$")>;
+def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
+def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
+def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, GroupAlone], (instregex "PFD(RL)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAA(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAAL(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAN(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAO(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAX(G)?$")>;
+
+// Test and set
+def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "TS$")>;
+
+// Compare and swap
+def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+
+// Compare double and swap
+def : InstRW<[FXU, FXU, FXU, FXU, FXU, LSU, Lat10, GroupAlone],
+ (instregex "CDS(Y)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
+ (instregex "CDSG$")>;
+
+// Compare and swap and store
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>;
+
+// Perform locked operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+
+// Load/store pair from/to quadword
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+
+// Load pair disjoint
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Extract/set/copy access register
+def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+
+// Load address extended
+def : InstRW<[LSU, FXU, Lat5, GroupAlone], (instregex "LAE(Y)?$")>;
+
+// Load/store access multiple (not modeled precisely)
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Insert Program Mask
+def : InstRW<[FXU, Lat3, EndGroup], (instregex "IPM$")>;
+
+// Set Program Mask
+def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+
+// Branch and link
+def : InstRW<[FXU, FXU, LSU, Lat8, GroupAlone], (instregex "BAL(R)?$")>;
+
+// Test addressing mode
+def : InstRW<[FXU], (instregex "TAM$")>;
+
+// Set addressing mode
+def : InstRW<[LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
+
+// Branch (and save) and set mode.
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+
+// Population count
+def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
+
+// Extend
+def : InstRW<[FXU], (instregex "AEXT128_64$")>;
+def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
+
+// String instructions
+def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+
+// Move with key
+def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+
+// Extract CPU Time
+def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+
+// Execute
+def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
+
+// Program return
+def : InstRW<[FXU, Lat30], (instregex "PR$")>;
+
+// Inline assembly
+def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>;
+def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>;
+def : InstRW<[LSU, FXU, Lat5], (instregex "STCKE$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>;
+def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
+
+// Store real address
+def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// An "empty" sched-class will be assigned instead of the "invalid sched-class".
+// getNumDecoderSlots() will then return 1 instead of 0.
+def : InstRW<[], (instregex "Insn.*")>;
+
+
+// ----------------------------- Floating point ----------------------------- //
+
+//===----------------------------------------------------------------------===//
+// FP: Select instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "SelectF(32|64|128)$")>;
+def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero
+def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LZXR$")>;
+
+// Load
+def : InstRW<[FXU], (instregex "LER$")>;
+def : InstRW<[FXU], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[FXU, Lat3], (instregex "LGDR$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LXR$")>;
+
+// Load and Test
+def : InstRW<[FPU], (instregex "LT(D|E)BR$")>;
+def : InstRW<[FPU], (instregex "LTEBRCompare(_VecPseudo)?$")>;
+def : InstRW<[FPU], (instregex "LTDBRCompare(_VecPseudo)?$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXBR$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone],
+ (instregex "LTXBRCompare(_VecPseudo)?$")>;
+
+// Copy sign
+def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRd(d|s)$")>;
+def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRs(d|s)$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Load instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "LE(Y)?$")>;
+def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSU], (instregex "LX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Store instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat7], (instregex "STD(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat7], (instregex "STE(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[FPU], (instregex "LEDBR(A)?$")>;
+def : InstRW<[FPU, FPU, Lat20], (instregex "LEXBR(A)?$")>;
+def : InstRW<[FPU, FPU, Lat20], (instregex "LDXBR(A)?$")>;
+
+// Load lengthened
+def : InstRW<[FPU, LSU, Lat12], (instregex "LDEB$")>;
+def : InstRW<[FPU], (instregex "LDEBR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)B$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)BR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)BR(A)?$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)BR(A)?$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CXL(F|G)BR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)BR(A)?$")>;
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)BR(A)?$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLF(E|D)BR$")>;
+def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "CL(F|G)XBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[FPU], (instregex "L(C|N|P)DBR$")>;
+def : InstRW<[FPU], (instregex "L(C|N|P)EBR$")>;
+def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>;
+def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>;
+def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+
+// Square root
+def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)B$")>;
+def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXBR$")>;
+
+// Load FP integer
+def : InstRW<[FPU], (instregex "FIEBR(A)?$")>;
+def : InstRW<[FPU], (instregex "FIDBR(A)?$")>;
+def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXBR(A)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D)B$")>;
+def : InstRW<[FPU], (instregex "A(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXBR$")>;
+
+// Subtraction
+def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D)B$")>;
+def : InstRW<[FPU], (instregex "S(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXBR$")>;
+
+// Multiply
+def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[FPU], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXDB$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
+
+// Multiply and add / subtract
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
+
+// Division
+def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
+def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
+def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+
+// Test Data Class
+def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>;
+def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
+def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
new file mode 100644
index 000000000000..8ab6c826f1ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -0,0 +1,807 @@
+//=- SystemZScheduleZEC12.td - SystemZ Scheduling Definitions --*- tblgen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ZEC12 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def ZEC12Model : SchedMachineModel { // Machine model for ZEC12, used for scheduling and cost heuristics.
+
+ let UnsupportedFeatures = Arch10UnsupportedFeatures.List; // NOTE(review): list is declared outside the visible part of this file.
+
+ let IssueWidth = 5; // NOTE(review): presumably the max micro-ops issued per cycle -- confirm.
+ let MicroOpBufferSize = 40; // Issue queues
+ let LoadLatency = 1; // Optimistic load latency.
+
+ let PostRAScheduler = 1; // NOTE(review): presumably turns on post-RA scheduling for this model -- confirm.
+
+ // Extra cycles for a mispredicted branch.
+ let MispredictPenalty = 16;
+}
+
+let SchedModel = ZEC12Model in {
+
+// These definitions could be put in a subtarget common include file,
+// but it seems the include system in Tablegen currently rejects
+// multiple includes of the same file.
+def : WriteRes<GroupAlone, []> { // Uses no resources and no micro-ops; only sets both group flags below.
+ let NumMicroOps = 0;
+ let BeginGroup = 1;
+ let EndGroup = 1;
+}
+def : WriteRes<EndGroup, []> { // Uses no resources and no micro-ops; only sets EndGroup.
+ let NumMicroOps = 0;
+ let EndGroup = 1;
+}
+def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;} // LatN writes: latency N, no resources, no micro-ops.
+def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
+def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
+def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
+def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
+def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
+def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
+def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
+def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
+def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
+def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
+def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;} // Note: no Lat13/Lat14 or Lat16..Lat19 are defined here.
+def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
+def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;} // Largest latency write defined in this file section.
+
+// Execution units.
+def ZEC12_FXUnit : ProcResource<2>; // NOTE(review): template argument is presumably the number of units -- confirm.
+def ZEC12_LSUnit : ProcResource<2>;
+def ZEC12_FPUnit : ProcResource<1>;
+def ZEC12_VBUnit : ProcResource<1>;
+
+// Subtarget specific definitions of scheduling resources.
+def : WriteRes<FXU, [ZEC12_FXUnit]> { let Latency = 1; } // FXU write: FX unit, latency 1.
+def : WriteRes<LSU, [ZEC12_LSUnit]> { let Latency = 4; } // LSU write: LS unit, latency 4.
+def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; } // Same LS unit as LSU, but latency 1 instead of 4.
+def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; } // FPU write: FP unit, latency 8.
+def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; } // Lists the FP unit twice; latency 9.
+def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[LSU, Lat4], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[LSU, Lat4], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[FXU, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[FXU, FXU, FXU, LSU, Lat7, GroupAlone],
+ (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[FXU], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+ (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[FXU], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[FXU], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[FXU], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[VBU, FXU, FXU, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[LSU_lat1, EndGroup], (instregex "Return$")>;
+def : InstRW<[LSU_lat1], (instregex "CondReturn$")>;
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// Select pseudo
+def : InstRW<[FXU], (instregex "Select(32|64|32Mux)$")>;
+
+// CondStore pseudos
+def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Moves
+def : InstRW<[FXU, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
+
+// Move character
+def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+
+// Pseudo -> reg move
+def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>;
+def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>;
+
+// Loads
+def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSU], (instregex "LG(RL)?$")>;
+def : InstRW<[LSU], (instregex "L128$")>;
+
+def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>;
+
+def : InstRW<[FXU], (instregex "LG(F|H)I$")>;
+def : InstRW<[FXU], (instregex "LHI(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LR(Mux)?$")>;
+
+// Load and trap
+def : InstRW<[FXU, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+
+// Load and test
+def : InstRW<[FXU, LSU, Lat5], (instregex "LT(G)?$")>;
+def : InstRW<[FXU], (instregex "LT(G)?R$")>;
+
+// Stores
+def : InstRW<[FXU, LSU, Lat5], (instregex "STG(RL)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ST128$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+
+// String moves.
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat2], (instregex "LOC(G)?R(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat6], (instregex "LOC(G)?(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STOC(G)?(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "L(B|H|G)R$")>;
+def : InstRW<[FXU], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LTGF$")>;
+def : InstRW<[FXU], (instregex "LTGFR$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LH(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LG(B|H|F)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSU], (instregex "LLHRL$")>;
+def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+
+// Load and trap
+def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Load multiple (estimated average of 5 ops)
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+ (instregex "LM(H|Y|G)?$")>;
+
+// Store multiple (estimated average of 3 FXU ops)
+def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
+ (instregex "STM(H|Y|G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>;
+
+// Load the Global Offset Table address
+def : InstRW<[FXU], (instregex "GOT$")>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat2], (instregex "LP(G)?R$")>;
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "L(N|P)GFR$")>;
+def : InstRW<[FXU, Lat2], (instregex "LN(R|GR)$")>;
+def : InstRW<[FXU], (instregex "LC(R|GR)$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LCGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "IC(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "IC32(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
+def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[FXU], (instregex "IIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "IIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "IIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "IILF(64)?$")>;
+def : InstRW<[FXU], (instregex "IILH(64)?$")>;
+def : InstRW<[FXU], (instregex "IILL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>;
+def : InstRW<[FXU], (instregex "AIH$")>;
+def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
+def : InstRW<[FXU], (instregex "AGFI$")>;
+def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
+def : InstRW<[FXU], (instregex "AGR(K)?$")>;
+def : InstRW<[FXU], (instregex "AHI(K)?$")>;
+def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
+def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU], (instregex "ALGHSIK$")>;
+def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
+def : InstRW<[FXU], (instregex "ALR(K)?$")>;
+def : InstRW<[FXU], (instregex "AR(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+
+// Logical addition with carry
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "ALC(G)?R$")>;
+
+// Add with sign extension (32 -> 64)
+def : InstRW<[FXU, LSU, Lat6], (instregex "AGF$")>;
+def : InstRW<[FXU, Lat2], (instregex "AGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "S(G|Y)?$")>;
+def : InstRW<[FXU, LSU, Lat6], (instregex "SH(Y)?$")>;
+def : InstRW<[FXU], (instregex "SGR(K)?$")>;
+def : InstRW<[FXU], (instregex "SLFI$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
+def : InstRW<[FXU], (instregex "SLR(K)?$")>;
+def : InstRW<[FXU], (instregex "SR(K)?$")>;
+
+// Subtraction with borrow
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "SLB(G)?R$")>;
+
+// Subtraction with sign extension (32 -> 64)
+def : InstRW<[FXU, LSU, Lat6], (instregex "SGF$")>;
+def : InstRW<[FXU, Lat2], (instregex "SGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+// Bitwise AND / OR / XOR scheduling.  In each of the three groups below the
+// patterns that list LSU also list Lat5, the other patterns use a single FXU
+// write, and the closing NC / OC / XC entry is GroupAlone with two LSU writes,
+// one FXU write and Lat9.
+def : InstRW<[FXU, LSU, Lat5], (instregex "N(G|Y)?$")>;
+def : InstRW<[FXU], (instregex "NGR(K)?$")>;
+def : InstRW<[FXU], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "NI(Y)?$")>;
+def : InstRW<[FXU], (instregex "NIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "NIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "NIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "NILF(64)?$")>;
+def : InstRW<[FXU], (instregex "NILH(64)?$")>;
+def : InstRW<[FXU], (instregex "NILL(64)?$")>;
+def : InstRW<[FXU], (instregex "NR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "NC$")>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "O(G|Y)?$")>;
+def : InstRW<[FXU], (instregex "OGR(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "OI(Y)?$")>;
+def : InstRW<[FXU], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXU], (instregex "OIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "OIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "OIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "OILF(64)?$")>;
+def : InstRW<[FXU], (instregex "OILH(64)?$")>;
+def : InstRW<[FXU], (instregex "OILL(64)?$")>;
+def : InstRW<[FXU], (instregex "OR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "OC$")>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+// Fewer immediate patterns are listed here than for AND / OR: only XIFMux,
+// XIHF and XILF.
+def : InstRW<[FXU, LSU, Lat5], (instregex "X(G|Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "XI(Y)?$")>;
+def : InstRW<[FXU], (instregex "XIFMux$")>;
+def : InstRW<[FXU], (instregex "XGR(K)?$")>;
+def : InstRW<[FXU], (instregex "XIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "XILF(64)?$")>;
+def : InstRW<[FXU], (instregex "XR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "XC$")>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+// Every multiply entry carries an explicit latency (Lat5 up to Lat15); only
+// the MLG / MLGR entries are additionally marked GroupAlone.
+def : InstRW<[FXU, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
+def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
+def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
+def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
+def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
+def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
+def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
+def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+// All four entries use two FPU2 writes in addition to their FXU writes, have
+// Lat30 and are GroupAlone.  Each pattern without the trailing "R" lists one
+// LSU write in place of one of the FXU writes of its "R" counterpart.
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DSG(F)?R$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DL(G)?R$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DL(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+// Plain shifts: a single FXU write each, with no explicit latency or grouping
+// marker.
+def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SLA(K)?$")>;
+
+// Rotate
+def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
+
+// Rotate and insert
+def : InstRW<[FXU], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[FXU], (instregex "RISBMux$")>;
+
+// Rotate and Select
+// (the only entry in this section that is GroupAlone and uses two FXU writes)
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "R(N|O|X)SBG$")>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+// General compares.  In this first run every pattern that lists LSU also
+// lists Lat5 and every other pattern is a single FXU write.  Many opcode
+// names differ only by suffix (e.g. CLG / CLGF / CLGR / CLGRL), hence the
+// number of narrowly scoped patterns.
+def : InstRW<[FXU, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[FXU], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[FXU], (instregex "CG(F|H)I$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[FXU], (instregex "C(G)?R$")>;
+def : InstRW<[FXU], (instregex "CIH$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CH(F|SI)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
+def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLGF(RL)?$")>;
+def : InstRW<[FXU], (instregex "CLGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "CLGR$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLGRL$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
+def : InstRW<[FXU], (instregex "CLIH$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
+def : InstRW<[FXU], (instregex "CLR$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
+
+// Compare halfword
+// (Lat6 rather than the Lat5 used by the entries above)
+def : InstRW<[FXU, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
+def : InstRW<[FXU, LSU, Lat6], (instregex "CGH(RL)?$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "CHHSI$")>;
+
+// Compare with sign extension (32 -> 64)
+def : InstRW<[FXU, LSU, Lat6], (instregex "CGF(RL)?$")>;
+def : InstRW<[FXU, Lat2], (instregex "CGFR$")>;
+
+// Compare logical character
+def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>;
+
+// CLST is given the Lat30 / GroupAlone treatment that this file also uses
+// for other entries it does not model precisely (presumably a string
+// compare - TODO confirm against the instruction definitions).
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+
+// Test under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "TM(Y)?$")>;
+def : InstRW<[FXU], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[FXU], (instregex "TMHH(64)?$")>;
+def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
+def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
+def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch and execution hint
+//===----------------------------------------------------------------------===//
+
+// PFD / PFDRL and BPP / BPRP are modeled as a single LSU write, NIAI as a
+// single FXU write.
+def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
+def : InstRW<[LSU], (instregex "BP(R)?P$")>;
+def : InstRW<[FXU], (instregex "NIAI$")>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+// Serialize ends the current group.
+def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>;
+
+// The LAA / LAAL / LAN / LAO / LAX patterns (each with an optional G suffix)
+// all share FXU + LSU with Lat5.
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAA(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAAL(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAN(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAO(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAX(G)?$")>;
+
+// Test and set
+def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "TS$")>;
+
+// Compare and swap
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+
+// Compare double and swap
+// (CDSG lists one more FXU and one more LSU write than CDS / CDSY)
+def : InstRW<[FXU, FXU, FXU, FXU, FXU, LSU, Lat10, GroupAlone],
+ (instregex "CDS(Y)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
+ (instregex "CDSG$")>;
+
+// Compare and swap and store
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>;
+
+// Perform locked operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+
+// Load/store pair from/to quadword
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+
+// Load pair disjoint
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Note: in the two sections below every entry except EAR/SAR/CPYA and TAM
+// carries an EndGroup or GroupAlone marker.
+
+// Extract/set/copy access register
+def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+
+// Load address extended
+def : InstRW<[LSU, FXU, Lat5, GroupAlone], (instregex "LAE(Y)?$")>;
+
+// Load/store access multiple (not modeled precisely)
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Insert Program Mask
+def : InstRW<[FXU, Lat3, EndGroup], (instregex "IPM$")>;
+
+// Set Program Mask
+def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+
+// Branch and link
+def : InstRW<[FXU, FXU, LSU, Lat8, GroupAlone], (instregex "BAL(R)?$")>;
+
+// Test addressing mode
+def : InstRW<[FXU], (instregex "TAM$")>;
+
+// Set addressing mode
+def : InstRW<[LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
+
+// Branch (and save) and set mode.
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>;
+
+//===----------------------------------------------------------------------===//
+// Transactional execution
+//===----------------------------------------------------------------------===//
+
+// Transaction begin
+// (the pattern also covers the TBEGINC and TBEGIN_nofloat opcode names)
+def : InstRW<[LSU, LSU, FXU, FXU, FXU, FXU, FXU, Lat15, GroupAlone],
+ (instregex "TBEGIN(C|_nofloat)?$")>;
+
+// Transaction end
+def : InstRW<[LSU, GroupAlone], (instregex "TEND$")>;
+
+// Transaction abort
+def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+
+// Extract Transaction Nesting Depth
+def : InstRW<[FXU], (instregex "ETND$")>;
+
+// Nontransactional store
+def : InstRW<[FXU, LSU, Lat5], (instregex "NTSTG$")>;
+
+//===----------------------------------------------------------------------===//
+// Processor assist
+//===----------------------------------------------------------------------===//
+
+// PPA is modeled as a single FXU write.
+def : InstRW<[FXU], (instregex "PPA$")>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+
+// Population count
+def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
+
+// Extend
+def : InstRW<[FXU], (instregex "AEXT128_64$")>;
+def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
+
+// String instructions
+def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+
+// Move with key
+def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+
+// Extract CPU Time
+// NOTE(review): the write list is ordered FXU, Lat5, LSU here, whereas other
+// entries in this file put the latency after the unit writes - confirm
+// whether the order is intentional.
+def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+
+// Execute
+def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
+
+// Program return
+def : InstRW<[FXU, Lat30], (instregex "PR$")>;
+
+// Inline assembly
+// (covers the STCK / STCKF, STCKE, STFLE and SVC opcode names; presumably
+// these are only reachable through inline assembly - TODO confirm)
+def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone],
+ (instregex "STCKE$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>;
+def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
+
+// Store real address
+def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// An "empty" sched-class will be assigned instead of the "invalid sched-class".
+// getNumDecoderSlots() will then return 1 instead of 0.
+// The write list is deliberately empty and the pattern has no trailing "$",
+// so it is not restricted to an exact opcode name.
+def : InstRW<[], (instregex "Insn.*")>;
+
+
+// ----------------------------- Floating point ----------------------------- //
+
+//===----------------------------------------------------------------------===//
+// FP: Select instructions
+//===----------------------------------------------------------------------===//
+
+// The SelectF* and CondStoreF* opcode names (presumably pseudo instructions
+// - TODO confirm) are modeled as a single FXU write.
+def : InstRW<[FXU], (instregex "SelectF(32|64|128)$")>;
+def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero
+def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LZXR$")>;
+
+// Load
+def : InstRW<[FXU], (instregex "LER$")>;
+def : InstRW<[FXU], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[FXU, Lat3], (instregex "LGDR$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LXR$")>;
+
+// Load and Test
+// (the *Compare and *Compare_VecPseudo names get the same resources as the
+// base opcode they are listed next to)
+def : InstRW<[FPU], (instregex "LT(D|E)BR$")>;
+def : InstRW<[FPU], (instregex "LTEBRCompare(_VecPseudo)?$")>;
+def : InstRW<[FPU], (instregex "LTDBRCompare(_VecPseudo)?$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXBR$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone],
+ (instregex "LTXBRCompare(_VecPseudo)?$")>;
+
+// Copy sign
+def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRd(d|s)$")>;
+def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRs(d|s)$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Load instructions
+//===----------------------------------------------------------------------===//
+
+// FP loads are modeled as a single LSU write.
+def : InstRW<[LSU], (instregex "LE(Y)?$")>;
+def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSU], (instregex "LX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Store instructions
+//===----------------------------------------------------------------------===//
+
+// FP stores use FXU + LSU; STD / STE list Lat7 while STX lists Lat5.
+def : InstRW<[FXU, LSU, Lat7], (instregex "STD(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat7], (instregex "STE(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[FPU], (instregex "LEDBR(A)?$")>;
+def : InstRW<[FPU, FPU, Lat20], (instregex "LEXBR(A)?$")>;
+def : InstRW<[FPU, FPU, Lat20], (instregex "LDXBR(A)?$")>;
+
+// Load lengthened
+def : InstRW<[FPU, LSU, Lat12], (instregex "LDEB$")>;
+def : InstRW<[FPU], (instregex "LDEBR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)B$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)BR$")>;
+
+// Convert from fixed / logical
+// (the "(A?)" spelling used below matches the same names as the "(A)?"
+// spelling used elsewhere in this file)
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)BR(A?)$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)BR(A?)$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)BR(A?)$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CXL(F|G)BR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)BR(A?)$")>;
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)BR(A?)$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XBR(A?)$")>;
+def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLF(E|D)BR$")>;
+def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "CL(F|G)XBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Pattern shared by most groups in the two FP arithmetic sections below:
+// the memory form (no trailing "R") lists FPU + LSU with an explicit
+// latency, the register form of the E/D variants lists FPU only, and the
+// X variants are GroupAlone with two FPU2 writes.
+
+// Load Complement / Negative / Positive
+def : InstRW<[FPU], (instregex "L(C|N|P)DBR$")>;
+def : InstRW<[FPU], (instregex "L(C|N|P)EBR$")>;
+def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>;
+def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>;
+def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+
+// Square root
+def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)B$")>;
+def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXBR$")>;
+
+// Load FP integer
+def : InstRW<[FPU], (instregex "FIEBR(A)?$")>;
+def : InstRW<[FPU], (instregex "FIDBR(A)?$")>;
+def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXBR(A)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D)B$")>;
+def : InstRW<[FPU], (instregex "A(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXBR$")>;
+
+// Subtraction
+def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D)B$")>;
+def : InstRW<[FPU], (instregex "S(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXBR$")>;
+
+// Multiply
+def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[FPU], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXDB$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
+
+// Multiply and add / subtract
+// (unlike plain multiply, these are GroupAlone even in the E/D forms)
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
+
+// Division
+def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
+def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+// (CXBR uses two FPU writes and Lat30 but, unlike most other X-variant
+// entries in this file, is not marked GroupAlone)
+def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
+def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+
+// Test Data Class
+def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+// Every entry in this section is GroupAlone.
+def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>;
+def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
+def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..657482504045
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -0,0 +1,275 @@
+//===-- SystemZSelectionDAGInfo.cpp - SystemZ SelectionDAG Info -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-selectiondag-info"
+
+// Emit a block operation covering Size bytes with destination address Dst
+// and source address Src, and return the chain of the node that performs it.
+// Sequence is the opcode for the straight-line form (such as MVC); Loop is
+// the opcode for the looping form (such as MVC_LOOP), which additionally
+// receives Size / 256 as an operand.
+static SDValue emitMemMem(SelectionDAG &DAG, const SDLoc &DL, unsigned Sequence,
+                          unsigned Loop, SDValue Chain, SDValue Dst,
+                          SDValue Src, uint64_t Size) {
+  // Loops are preferred once 7 or more MVCs would be needed.  At such sizes
+  // the MVCs themselves dominate the run time, so there is little to choose
+  // between the two forms, but:
+  //  - the loop costs 4 or 5 instructions (depending on whether the base
+  //    addresses can be proved equal), so it is not worthwhile for
+  //    5 * 256 bytes or fewer;
+  //  - sizes in (5 * 256, 6 * 256) would need one more instruction after
+  //    the loop, so straight-line code is kept there as well;
+  //  - 6 * 256 itself takes the same number of straight-line MVCs as
+  //    6 * 256 - 1.
+  const uint64_t StraightLineLimit = 6 * 256;
+  const EVT PtrVT = Src.getValueType();
+  SDValue ByteCount = DAG.getConstant(Size, DL, PtrVT);
+
+  if (Size <= StraightLineLimit)
+    return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src, ByteCount);
+
+  SDValue BlockCount = DAG.getConstant(Size / 256, DL, PtrVT);
+  return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src, ByteCount,
+                     BlockCount);
+}
+
+// Lower a memcpy whose length is a compile-time constant through emitMemMem
+// (MVC / MVC_LOOP).  An empty SDValue is returned for volatile copies and
+// for lengths that are not constant.
+SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool IsVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  if (IsVolatile)
+    return SDValue();
+
+  auto *ConstLen = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstLen)
+    return SDValue();
+
+  const uint64_t Bytes = ConstLen->getZExtValue();
+  return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP, Chain, Dst,
+                    Src, Bytes);
+}
+
+// Emit one integer store of Size * 8 bits to Dst in which every byte equals
+// ByteVal, and return the store.  Meant for Size values of 1, 2, 4 or 8,
+// which are expected to select MVI, MVHHI, MVHI and MVGHI respectively.
+static SDValue memsetStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                           SDValue Dst, uint64_t ByteVal, uint64_t Size,
+                           unsigned Align, MachinePointerInfo DstPtrInfo) {
+  // Replicate the byte into each remaining byte position of the constant.
+  uint64_t Pattern = ByteVal;
+  for (unsigned Shift = 8; Shift < Size * 8; Shift += 8)
+    Pattern |= ByteVal << Shift;
+
+  const MVT StoreVT = MVT::getIntegerVT(Size * 8);
+  SDValue Value = DAG.getConstant(Pattern, DL, StoreVT);
+  return DAG.getStore(Chain, DL, Value, Dst, DstPtrInfo, Align);
+}
+
+// Lower a memset whose length is a compile-time constant.  Depending on the
+// length and on whether the byte value is constant this emits one or two
+// plain stores, an XC (for zero fill), or a byte store followed by an MVC.
+// An empty SDValue is returned for volatile memsets, zero-length memsets and
+// non-constant lengths.
+SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst,
+ SDValue Byte, SDValue Size, unsigned Align, bool IsVolatile,
+ MachinePointerInfo DstPtrInfo) const {
+ EVT PtrVT = Dst.getValueType();
+
+ if (IsVolatile)
+ return SDValue();
+
+ if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
+ uint64_t Bytes = CSize->getZExtValue();
+ if (Bytes == 0)
+ return SDValue();
+ if (auto *CByte = dyn_cast<ConstantSDNode>(Byte)) {
+ // Handle cases that can be done using at most two of
+ // MVI, MVHI, MVHHI and MVGHI. The latter two can only be
+ // used if ByteVal is all zeros or all ones; in other cases,
+ // we can move at most 2 halfwords.
+ uint64_t ByteVal = CByte->getZExtValue();
+ if (ByteVal == 0 || ByteVal == 255 ?
+ Bytes <= 16 && countPopulation(Bytes) <= 2 :
+ Bytes <= 4) {
+ // Size1 is the largest power of two not exceeding Bytes (8 when Bytes
+ // is 16); Size2 is whatever is left over, possibly nothing.
+ unsigned Size1 = Bytes == 16 ? 8 : 1 << findLastSet(Bytes);
+ unsigned Size2 = Bytes - Size1;
+ SDValue Chain1 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size1,
+ Align, DstPtrInfo);
+ if (Size2 == 0)
+ return Chain1;
+ // The second store starts Size1 bytes further on.  Both stores take
+ // the incoming Chain and are joined by a TokenFactor.
+ Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+ DAG.getConstant(Size1, DL, PtrVT));
+ DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
+ SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
+ std::min(Align, Size1), DstPtrInfo);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
+ }
+ } else {
+ // Handle one and two bytes using STC.
+ if (Bytes <= 2) {
+ SDValue Chain1 = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Align);
+ if (Bytes == 1)
+ return Chain1;
+ // Second byte: same value stored at Dst + 1 with alignment 1, again
+ // taking the incoming Chain.
+ SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+ DAG.getConstant(1, DL, PtrVT));
+ SDValue Chain2 =
+ DAG.getStore(Chain, DL, Byte, Dst2, DstPtrInfo.getWithOffset(1),
+ /* Alignment = */ 1);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
+ }
+ }
+ // Zero was rejected above, and a 1-byte memset is always handled by one
+ // of the two branches above.
+ assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
+
+ // Handle the special case of a memset of 0, which can use XC.
+ // Dst is passed as both operands of the XC.
+ auto *CByte = dyn_cast<ConstantSDNode>(Byte);
+ if (CByte && CByte->getZExtValue() == 0)
+ return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
+ Chain, Dst, Dst, Bytes);
+
+ // Copy the byte to the first location and then use MVC to copy
+ // it to the rest.
+ // The MVC copies Bytes - 1 bytes from Dst to Dst + 1, i.e. the two ranges
+ // overlap (presumably relying on MVC copying in ascending byte order so
+ // that the first byte is propagated - TODO confirm).
+ Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Align);
+ SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+ DAG.getConstant(1, DL, PtrVT));
+ return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+ Chain, DstPlus1, Dst, Bytes - 1);
+ }
+ return SDValue();
+}
+
+// Compare [Src1, Src1 + Size) with [Src2, Src2 + Size) using CLC, choosing
+// between the straight-line CLC node and the CLC_LOOP node.  The node that
+// is returned produces a chain (result 0) and glue (result 1).
+static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                       SDValue Src1, SDValue Src2, uint64_t Size) {
+  // Two CLCs clearly beat a loop, not least because they need only one
+  // branch.  Three CLCs need as many branches as a loop (i.e. 2) but are
+  // shorter.  Beyond 768 bytes it seems relatively likely that a difference
+  // is found within the first 768 bytes, so we simply minimise the number
+  // of branch instructions to avoid polluting the prediction buffer: a loop
+  // only ever needs 2 branches, whereas straight-line code would need 3 or
+  // more.
+  const uint64_t StraightLineLimit = 3 * 256;
+  const EVT PtrVT = Src1.getValueType();
+  SDVTList ResultVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue ByteCount = DAG.getConstant(Size, DL, PtrVT);
+
+  if (Size <= StraightLineLimit)
+    return DAG.getNode(SystemZISD::CLC, DL, ResultVTs, Chain, Src1, Src2,
+                       ByteCount);
+
+  SDValue BlockCount = DAG.getConstant(Size / 256, DL, PtrVT);
+  return DAG.getNode(SystemZISD::CLC_LOOP, DL, ResultVTs, Chain, Src1, Src2,
+                     ByteCount, BlockCount);
+}
+
+// Turn the current CC value (supplied through Glue) into an i32 that is 0 if
+// CC == 0, less than zero if CC == 1 and greater than zero if CC == 2.
+// The sequence starts with IPM, which puts CC into bits 29 and 28 of an
+// integer and clears bits 30 and 31.  A logical shift right by IPM_CC and a
+// rotate left by 31 (i.e. right by one) then follow.
+// NOTE(review): assuming the shift leaves CC in the two low bits, CC == 3
+// would also rotate a one into the sign bit and so come out negative;
+// presumably the users of this helper never see CC == 3 - confirm.
+static SDValue addIPMSequence(const SDLoc &DL, SDValue Glue,
+                              SelectionDAG &DAG) {
+  SDValue CCWord = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+
+  SDValue ShiftAmount = DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32);
+  SDValue CCLowBits =
+      DAG.getNode(ISD::SRL, DL, MVT::i32, CCWord, ShiftAmount);
+
+  SDValue RotateAmount = DAG.getConstant(31, DL, MVT::i32);
+  return DAG.getNode(ISD::ROTL, DL, MVT::i32, CCLowBits, RotateAmount);
+}
+
+// Lower a memcmp with a constant, non-zero length to CLC followed by the IPM
+// sequence.  Returns the pair (integer result, out chain), or a pair of
+// empty SDValues when the length is not a constant.
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
+    SDValue Src2, SDValue Size, MachinePointerInfo Op1PtrInfo,
+    MachinePointerInfo Op2PtrInfo) const {
+  auto *ConstLen = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstLen)
+    return std::make_pair(SDValue(), SDValue());
+
+  const uint64_t Bytes = ConstLen->getZExtValue();
+  assert(Bytes > 0 && "Caller should have handled 0-size case");
+
+  // Result 0 of the compare node is the chain, result 1 the glue that
+  // carries CC into the IPM sequence.
+  SDValue Compare = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+  SDValue CCGlue = Compare.getValue(1);
+  return std::make_pair(addIPMSequence(DL, CCGlue, DAG), Compare);
+}
+
+// Lower memchr to a SEARCH_STRING node (SRST) followed by a SELECT_CCMASK
+// that yields either the address produced by the search or null.  Returns
+// the pair (selected pointer, out chain).
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src,
+    SDValue Char, SDValue Length, MachinePointerInfo SrcPtrInfo) const {
+  const EVT PtrVT = Src.getValueType();
+
+  // Bring the operands into the shapes the search node takes: a
+  // pointer-width length and an i32 character masked down to its low byte.
+  SDValue Len = DAG.getZExtOrTrunc(Length, DL, PtrVT);
+  SDValue Wide = DAG.getZExtOrTrunc(Char, DL, MVT::i32);
+  SDValue Needle = DAG.getNode(ISD::AND, DL, MVT::i32, Wide,
+                               DAG.getConstant(255, DL, MVT::i32));
+
+  // Use SRST to find the character.  On success its first result is the
+  // address of the match.
+  SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, Len);
+  SDVTList SearchVTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  SDValue Search = DAG.getNode(SystemZISD::SEARCH_STRING, DL, SearchVTs, Chain,
+                               Limit, Src, Needle);
+  SDValue OutChain = Search.getValue(1);
+  SDValue CCGlue = Search.getValue(2);
+
+  // Pick the found address or null according to the CC left by the search.
+  SDValue SelectOps[] = {
+      Search, DAG.getConstant(0, DL, PtrVT),
+      DAG.getConstant(SystemZ::CCMASK_SRST, DL, MVT::i32),
+      DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32), CCGlue};
+  SDVTList SelectVTs = DAG.getVTList(PtrVT, MVT::Glue);
+  SDValue Result =
+      DAG.getNode(SystemZISD::SELECT_CCMASK, DL, SelectVTs, SelectOps);
+  return std::make_pair(Result, OutChain);
+}
+
+// Lower strcpy / stpcpy to a single STPCPY node.  For stpcpy the node's
+// first result is returned as the value; for strcpy the original Dest is
+// returned.  The second element of the pair is the node's chain result.
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcpy(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dest,
+    SDValue Src, MachinePointerInfo DestPtrInfo, MachinePointerInfo SrcPtrInfo,
+    bool isStpcpy) const {
+  SDVTList ResultVTs = DAG.getVTList(Dest.getValueType(), MVT::Other);
+  SDValue Terminator = DAG.getConstant(0, DL, MVT::i32);
+  SDValue Copy = DAG.getNode(SystemZISD::STPCPY, DL, ResultVTs, Chain, Dest,
+                             Src, Terminator);
+  SDValue OutChain = Copy.getValue(1);
+
+  if (isStpcpy)
+    return std::make_pair(Copy, OutChain);
+  return std::make_pair(Dest, OutChain);
+}
+
+// Lower strcmp to a STRCMP node followed by the IPM sequence.  Returns the
+// pair (integer result, out chain).
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcmp(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
+    SDValue Src2, MachinePointerInfo Op1PtrInfo,
+    MachinePointerInfo Op2PtrInfo) const {
+  // The node has three results: a pointer-typed value that is not used here
+  // (result 0), the chain (result 1) and the glue carrying CC (result 2).
+  SDVTList ResultVTs =
+      DAG.getVTList(Src1.getValueType(), MVT::Other, MVT::Glue);
+  SDValue Terminator = DAG.getConstant(0, DL, MVT::i32);
+  SDValue Compare = DAG.getNode(SystemZISD::STRCMP, DL, ResultVTs, Chain, Src1,
+                                Src2, Terminator);
+  SDValue OutChain = Compare.getValue(1);
+  SDValue CCGlue = Compare.getValue(2);
+  return std::make_pair(addIPMSequence(DL, CCGlue, DAG), OutChain);
+}
+
+// Look for a null character starting at Src and stopping once the search
+// reaches Limit.  The returned pair holds the number of nonnull characters
+// (the search result minus Src) and the out chain.
+//
+// Passing a Limit of 0 turns this into an unbounded strlen.
+static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG,
+                                                    const SDLoc &DL,
+                                                    SDValue Chain, SDValue Src,
+                                                    SDValue Limit) {
+  const EVT PtrVT = Src.getValueType();
+  SDVTList ResultVTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  SDValue Terminator = DAG.getConstant(0, DL, MVT::i32);
+  SDValue Search = DAG.getNode(SystemZISD::SEARCH_STRING, DL, ResultVTs, Chain,
+                               Limit, Src, Terminator);
+
+  SDValue OutChain = Search.getValue(1);
+  SDValue Count = DAG.getNode(ISD::SUB, DL, PtrVT, Search, Src);
+  return std::make_pair(Count, OutChain);
+}
+
+// Lower strlen as a bounded string-length search whose limit is the constant
+// 0.  Returns the pair (length, out chain).
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrlen(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src,
+    MachinePointerInfo SrcPtrInfo) const {
+  SDValue NoLimit = DAG.getConstant(0, DL, Src.getValueType());
+  return getBoundedStrlen(DAG, DL, Chain, Src, NoLimit);
+}
+
+// Lower strnlen as a bounded string-length search that stops at
+// Src + MaxLength, with MaxLength first zero-extended or truncated to
+// pointer width.  Returns the pair (length, out chain).
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrnlen(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src,
+    SDValue MaxLength, MachinePointerInfo SrcPtrInfo) const {
+  const EVT PtrVT = Src.getValueType();
+  SDValue Bound = DAG.getZExtOrTrunc(MaxLength, DL, PtrVT);
+  SDValue SearchEnd = DAG.getNode(ISD::ADD, DL, PtrVT, Src, Bound);
+  return getBoundedStrlen(DAG, DL, Chain, Src, SearchEnd);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
new file mode 100644
index 000000000000..93cd970c30c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -0,0 +1,74 @@
+//===-- SystemZSelectionDAGInfo.h - SystemZ SelectionDAG Info ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SystemZ subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+// SystemZ overrides of the SelectionDAGTargetInfo hooks for lowering memory
+// and string library calls.  The SDValue-returning hooks produce the chain
+// of the emitted code; the pair-returning hooks produce (result value, out
+// chain).
+class SystemZSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+ explicit SystemZSelectionDAGInfo() = default;
+
+ // memcpy hook: the implementation returns an empty SDValue for volatile
+ // copies and for non-constant sizes.
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool IsVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+
+ // memset hook: the implementation returns an empty SDValue for volatile
+ // memsets, zero sizes and non-constant sizes.
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain, SDValue Dst, SDValue Byte,
+ SDValue Size, unsigned Align, bool IsVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
+
+ // memcmp hook: the implementation returns a pair of empty SDValues when
+ // Size is not a constant.
+ std::pair<SDValue, SDValue>
+ EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ SDValue Src1, SDValue Src2, SDValue Size,
+ MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const override;
+
+ // memchr hook.
+ std::pair<SDValue, SDValue>
+ EmitTargetCodeForMemchr(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ SDValue Src, SDValue Char, SDValue Length,
+ MachinePointerInfo SrcPtrInfo) const override;
+
+ // strcpy / stpcpy hook; isStpcpy selects which value is returned as the
+ // first element of the pair.
+ std::pair<SDValue, SDValue> EmitTargetCodeForStrcpy(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dest,
+ SDValue Src, MachinePointerInfo DestPtrInfo,
+ MachinePointerInfo SrcPtrInfo, bool isStpcpy) const override;
+
+ // strcmp hook.
+ std::pair<SDValue, SDValue>
+ EmitTargetCodeForStrcmp(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ SDValue Src1, SDValue Src2,
+ MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const override;
+
+ // strlen hook.
+ std::pair<SDValue, SDValue>
+ EmitTargetCodeForStrlen(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ SDValue Src,
+ MachinePointerInfo SrcPtrInfo) const override;
+
+ // strnlen hook; MaxLength bounds the search.
+ std::pair<SDValue, SDValue>
+ EmitTargetCodeForStrnlen(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ SDValue Src, SDValue MaxLength,
+ MachinePointerInfo SrcPtrInfo) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
new file mode 100644
index 000000000000..83882fc0310a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -0,0 +1,285 @@
+//===-- SystemZShortenInst.cpp - Instruction-shortening pass --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to replace instructions with shorter forms. For example,
+// IILF can be replaced with LLILL or LLILH if the constant fits and if the
+// other 32 bits of the GR64 destination are not live.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-shorten-inst"
+
+namespace {
+// Machine-function pass that tries to replace instructions with shorter
+// forms.  It declares that it requires the NoVRegs function property.
+class SystemZShortenInst : public MachineFunctionPass {
+public:
+  // Pass identifier handed to the MachineFunctionPass base class.
+  static char ID;
+  SystemZShortenInst(const SystemZTargetMachine &tm);
+
+  // Human-readable name of the pass.
+  StringRef getPassName() const override {
+    return StringRef("SystemZ Instruction Shortening");
+  }
+
+  // Shorten what can be shortened in one block; true if anything changed.
+  bool processBlock(MachineBasicBlock &MBB);
+
+  // Pass entry point: runs processBlock over every block of F.
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  // The only required function property is NoVRegs.
+  MachineFunctionProperties getRequiredProperties() const override {
+    MachineFunctionProperties Required;
+    Required.set(MachineFunctionProperties::Property::NoVRegs);
+    return Required;
+  }
+
+private:
+  // Per-pattern rewriters; each returns true if it changed MI.
+  bool shortenIIF(MachineInstr &MI, unsigned LLIxL, unsigned LLIxH);
+  bool shortenOn0(MachineInstr &MI, unsigned Opcode);
+  bool shortenOn01(MachineInstr &MI, unsigned Opcode);
+  bool shortenOn001(MachineInstr &MI, unsigned Opcode);
+  bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode);
+  bool shortenFPConv(MachineInstr &MI, unsigned Opcode);
+
+  // Target hooks, (re)loaded from the subtarget in runOnMachineFunction.
+  const SystemZInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  // Physical registers live at the point currently being examined.
+  LivePhysRegs LiveRegs;
+};
+
+char SystemZShortenInst::ID = 0;
+} // end anonymous namespace
+
+// Allocate a new instance of the instruction-shortening pass for TM and
+// return it as a raw FunctionPass pointer.
+FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
+  FunctionPass *ShortenPass = new SystemZShortenInst(TM);
+  return ShortenPass;
+}
+
+// Construct the pass.  The target machine argument is not consulted here;
+// both target hooks start out null (previously TRI was left uninitialized)
+// and are filled in from the subtarget by runOnMachineFunction before use.
+SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
+  : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {}
+
+// Tie operand 1 to operand 0 if MI's current descriptor is a two-address
+// form (use operand 1 is declared TIED_TO def operand 0) and the operands
+// have not been tied yet.
+//
+// MCInstrDesc::getOperandConstraint yields the index of the operand that the
+// queried operand is tied to, or -1 when there is no such constraint.  The
+// constraint is recorded on the use operand, so operand 1 is queried and the
+// result compared against 0.  Testing the return value for truthiness (as
+// was done before) also accepts -1 and therefore never filters anything.
+static void tieOpsIfNeeded(MachineInstr &MI) {
+  if (MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) == 0 &&
+      !MI.getOperand(0).isTied())
+    MI.tieOperands(0, 1);
+}
+
+// MI is an IIxF instruction that loads one 32-bit word of a GPR; LLIxL and
+// LLIxH are the halfword immediate loads for that same word.  Switch MI to
+// one of them when the immediate fits.  Returns true if MI was changed.
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned LLIxL,
+                                    unsigned LLIxH) {
+  const unsigned DstReg = MI.getOperand(0).getReg();
+
+  // Work out which half of the GR64 is written and which is its sibling.
+  unsigned OwnIdx;
+  unsigned SiblingIdx;
+  if (SystemZ::GRH32BitRegClass.contains(DstReg)) {
+    OwnIdx = SystemZ::subreg_h32;
+    SiblingIdx = SystemZ::subreg_l32;
+  } else {
+    OwnIdx = SystemZ::subreg_l32;
+    SiblingIdx = SystemZ::subreg_h32;
+  }
+
+  // The replacement opcode clears the sibling half of the GR64 register,
+  // so give up if that half is live.
+  const unsigned Wide =
+      TRI->getMatchingSuperReg(DstReg, OwnIdx, &SystemZ::GR64BitRegClass);
+  const unsigned SiblingReg = TRI->getSubReg(Wide, SiblingIdx);
+  if (LiveRegs.contains(SiblingReg))
+    return false;
+
+  // Pick the halfword form that can represent the immediate, if any.
+  const uint64_t Imm = MI.getOperand(1).getImm();
+  unsigned NewOpcode;
+  bool UsesHighHalfword;
+  if (SystemZ::isImmLL(Imm)) {
+    NewOpcode = LLIxL;
+    UsesHighHalfword = false;
+  } else if (SystemZ::isImmLH(Imm)) {
+    NewOpcode = LLIxH;
+    UsesHighHalfword = true;
+  } else {
+    return false;
+  }
+
+  MI.setDesc(TII->get(NewOpcode));
+  MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(DstReg));
+  // The high-halfword form takes the immediate shifted down by 16 bits.
+  if (UsesHighHalfword)
+    MI.getOperand(1).setImm(Imm >> 16);
+  return true;
+}
+
+// Switch MI to Opcode when register operand 0 has a 4-bit encoding, i.e.
+// its first-register number is below 16.  Returns true if MI was changed.
+bool SystemZShortenInst::shortenOn0(MachineInstr &MI, unsigned Opcode) {
+  const unsigned RegNum = SystemZMC::getFirstReg(MI.getOperand(0).getReg());
+  if (RegNum >= 16)
+    return false;
+
+  MI.setDesc(TII->get(Opcode));
+  return true;
+}
+
+// Switch MI to Opcode when register operands 0 and 1 both have a 4-bit
+// encoding.  Returns true if MI was changed.
+bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) {
+  // Check operand 0 first, then operand 1; stop at the first misfit.
+  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx)
+    if (SystemZMC::getFirstReg(MI.getOperand(OpIdx).getReg()) >= 16)
+      return false;
+
+  MI.setDesc(TII->get(Opcode));
+  return true;
+}
+
+// Switch MI to Opcode when operand 0 and operand 2 have a 4-bit encoding
+// and operand 1 names the same register as operand 0.  After the switch,
+// tieOpsIfNeeded is given the chance to tie operands 0 and 1.  Returns true
+// if MI was changed.
+bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) {
+  const unsigned DstReg = MI.getOperand(0).getReg();
+  if (SystemZMC::getFirstReg(DstReg) >= 16)
+    return false;
+  if (MI.getOperand(1).getReg() != DstReg)
+    return false;
+  if (SystemZMC::getFirstReg(MI.getOperand(2).getReg()) >= 16)
+    return false;
+
+  MI.setDesc(TII->get(Opcode));
+  tieOpsIfNeeded(MI);
+  return true;
+}
+
+// Like shortenOn001, but only attempted while CC is not in the live set.
+// On success an implicit, dead definition of CC is appended to MI.
+bool SystemZShortenInst::shortenOn001AddCC(MachineInstr &MI, unsigned Opcode) {
+  if (LiveRegs.contains(SystemZ::CC))
+    return false;
+  if (!shortenOn001(MI, Opcode))
+    return false;
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineInstrBuilder MIB(MF, &MI);
+  MIB.addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+  return true;
+}
+
+// MI is a vector-style conversion instruction whose operands are, in order:
+// destination, source, exact-suppress, rounding-mode.  If both registers
+// have a 4-bit encoding, switch MI to Opcode, whose operand order is:
+// destination, rounding-mode, source, exact-suppress.  Returns true if MI
+// was changed.
+bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
+  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) >= 16)
+    return false;
+  if (SystemZMC::getFirstReg(MI.getOperand(1).getReg()) >= 16)
+    return false;
+
+  // Take copies of the four operands before they are stripped from MI.
+  const MachineOperand DestOp = MI.getOperand(0);
+  const MachineOperand SrcOp = MI.getOperand(1);
+  const MachineOperand SuppressOp = MI.getOperand(2);
+  const MachineOperand ModeOp = MI.getOperand(3);
+
+  // Remove operands 3, 2, 1, 0 (highest index first).
+  for (unsigned Idx = 4; Idx-- > 0;)
+    MI.RemoveOperand(Idx);
+
+  MI.setDesc(TII->get(Opcode));
+
+  // Re-attach the operands in the order the new opcode expects.
+  MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+  MIB.addOperand(DestOp);
+  MIB.addOperand(ModeOp);
+  MIB.addOperand(SrcOp);
+  MIB.addOperand(SuppressOp);
+  return true;
+}
+
+// Visit every instruction of MBB, last to first, and shorten the ones that
+// match a known pattern.  Returns true if at least one instruction changed.
+bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
+  bool AnyShortened = false;
+
+  // Start from the registers that are live out of the block.
+  LiveRegs.clear();
+  LiveRegs.addLiveOuts(MBB);
+
+  auto Cursor = MBB.rbegin();
+  const auto Stop = MBB.rend();
+  while (Cursor != Stop) {
+    MachineInstr &Inst = *Cursor;
+    bool Shortened = false;
+
+    switch (Inst.getOpcode()) {
+    // 32-bit immediate inserts -> halfword immediate loads.
+    case SystemZ::IILF:
+      Shortened = shortenIIF(Inst, SystemZ::LLILL, SystemZ::LLILH);
+      break;
+    case SystemZ::IIHF:
+      Shortened = shortenIIF(Inst, SystemZ::LLIHL, SystemZ::LLIHH);
+      break;
+
+    // Three-operand forms whose replacement also gets a CC definition.
+    case SystemZ::WFADB:
+      Shortened = shortenOn001AddCC(Inst, SystemZ::ADBR);
+      break;
+    case SystemZ::WFSDB:
+      Shortened = shortenOn001AddCC(Inst, SystemZ::SDBR);
+      break;
+
+    // Three-operand forms without a CC definition.
+    case SystemZ::WFDDB:
+      Shortened = shortenOn001(Inst, SystemZ::DDBR);
+      break;
+    case SystemZ::WFMDB:
+      Shortened = shortenOn001(Inst, SystemZ::MDBR);
+      break;
+
+    // Conversions that need their operands reordered.
+    case SystemZ::WFIDB:
+      Shortened = shortenFPConv(Inst, SystemZ::FIDBRA);
+      break;
+    case SystemZ::WLEDB:
+      Shortened = shortenFPConv(Inst, SystemZ::LEDBRA);
+      break;
+
+    // Two-register forms.
+    case SystemZ::WLDEB:
+      Shortened = shortenOn01(Inst, SystemZ::LDEBR);
+      break;
+    case SystemZ::WFLCDB:
+      Shortened = shortenOn01(Inst, SystemZ::LCDFR);
+      break;
+    case SystemZ::WFLNDB:
+      Shortened = shortenOn01(Inst, SystemZ::LNDFR);
+      break;
+    case SystemZ::WFLPDB:
+      Shortened = shortenOn01(Inst, SystemZ::LPDFR);
+      break;
+    case SystemZ::WFSQDB:
+      Shortened = shortenOn01(Inst, SystemZ::SQDBR);
+      break;
+    case SystemZ::WFCDB:
+      Shortened = shortenOn01(Inst, SystemZ::CDBR);
+      break;
+
+    // Loads and stores keyed on register operand 0 only.
+    case SystemZ::VL32:
+      // For z13 we prefer LDE over LE to avoid partial register dependencies.
+      Shortened = shortenOn0(Inst, SystemZ::LDE32);
+      break;
+    case SystemZ::VST32:
+      Shortened = shortenOn0(Inst, SystemZ::STE);
+      break;
+    case SystemZ::VL64:
+      Shortened = shortenOn0(Inst, SystemZ::LD);
+      break;
+    case SystemZ::VST64:
+      Shortened = shortenOn0(Inst, SystemZ::STD);
+      break;
+
+    default:
+      break;
+    }
+
+    AnyShortened |= Shortened;
+
+    // Update liveness only after the instruction has been examined.
+    LiveRegs.stepBackward(Inst);
+    ++Cursor;
+  }
+
+  return AnyShortened;
+}
+
+// Pass entry point.  Unless the function is to be skipped, refresh the
+// target hooks from F's subtarget and run processBlock over every block.
+// Returns true if any block was modified.
+bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
+  if (skipFunction(*F.getFunction()))
+    return false;
+
+  const SystemZSubtarget &Subtarget = F.getSubtarget<SystemZSubtarget>();
+  TII = Subtarget.getInstrInfo();
+  TRI = Subtarget.getRegisterInfo();
+  LiveRegs.init(*TRI);
+
+  bool Modified = false;
+  for (auto &Block : F) {
+    if (processBlock(Block))
+      Modified = true;
+  }
+  return Modified;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
new file mode 100644
index 000000000000..ce07ea3318a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -0,0 +1,64 @@
+//===-- SystemZSubtarget.cpp - SystemZ subtarget information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZSubtarget.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/IR/GlobalValue.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SystemZGenSubtargetInfo.inc"
+
+// Out-of-line definition of the virtual anchor(); the empty body exists only to pin the vtable to this file.
+void SystemZSubtarget::anchor() {}
+
+// Parse the feature string FS for the given CPU, substituting "generic"
+// when no CPU name was supplied, and return this subtarget.
+SystemZSubtarget &
+SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+  const std::string EffectiveCPU =
+      CPU.empty() ? std::string("generic") : CPU.str();
+  // Parse features string.
+  ParseSubtargetFeatures(EffectiveCPU, FS);
+  return *this;
+}
+
+SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, // builds the subtarget for triple TT, CPU name and feature string FS
+ const std::string &FS,
+ const TargetMachine &TM)
+ : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), // every facility flag starts out false
+ HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
+ HasPopulationCount(false), HasFastSerialization(false),
+ HasInterlockedAccess1(false), HasMiscellaneousExtensions(false),
+ HasExecutionHint(false), HasLoadAndTrap(false),
+ HasTransactionalExecution(false), HasProcessorAssist(false),
+ HasVector(false), HasLoadStoreOnCond2(false),
+ HasLoadAndZeroRightmostByte(false),
+ TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), // the call runs ParseSubtargetFeatures on CPU/FS and hands *this to InstrInfo
+ TLInfo(TM, *this), TSInfo(), FrameLowering() {} // TLInfo is built from TM and this subtarget; the body is empty
+
+// Decide whether GV may be referenced with a PC32DBL access under code
+// model CM.
+bool SystemZSubtarget::isPC32DBLSymbol(const GlobalValue *GV,
+                                       CodeModel::Model CM) const {
+  // PC32DBL accesses require the low bit to be clear, which an alignment
+  // of 1 cannot promise.  An alignment of zero selects the default
+  // alignment and is therefore acceptable.
+  const bool ByteAligned = GV->getAlignment() == 1;
+
+  // For Medium and above, assume that the symbol is not within the 4GB
+  // range.  Taking the address of locally-defined text would be OK, but
+  // that case isn't easy to detect.
+  if (ByteAligned || CM != CodeModel::Small)
+    return false;
+
+  // For the small model, all locally-binding symbols are in range.
+  const TargetMachine &TM = TLInfo.getTargetMachine();
+  return TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h
new file mode 100644
index 000000000000..cdb61327a16a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -0,0 +1,146 @@
+//===-- SystemZSubtarget.h - SystemZ subtarget information -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSUBTARGET_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSUBTARGET_H
+
+#include "SystemZFrameLowering.h"
+#include "SystemZISelLowering.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SystemZGenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+
+class SystemZSubtarget : public SystemZGenSubtargetInfo { // SystemZ subtarget: facility flags plus the per-subtarget codegen objects
+ virtual void anchor(); // declared only; the definition lives outside this class body
+protected:
+ bool HasDistinctOps; // one flag per facility; read through the has*() accessors below
+ bool HasLoadStoreOnCond;
+ bool HasHighWord;
+ bool HasFPExtension;
+ bool HasPopulationCount;
+ bool HasFastSerialization;
+ bool HasInterlockedAccess1;
+ bool HasMiscellaneousExtensions;
+ bool HasExecutionHint;
+ bool HasLoadAndTrap;
+ bool HasTransactionalExecution;
+ bool HasProcessorAssist;
+ bool HasVector;
+ bool HasLoadStoreOnCond2;
+ bool HasLoadAndZeroRightmostByte;
+
+private:
+ Triple TargetTriple; // queried by isTargetELF()
+ SystemZInstrInfo InstrInfo; // also the source of the register info, see getRegisterInfo()
+ SystemZTargetLowering TLInfo;
+ SystemZSelectionDAGInfo TSInfo;
+ SystemZFrameLowering FrameLowering;
+
+ SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU, // takes a CPU name and a feature string, returns this subtarget by reference
+ StringRef FS);
+public:
+ SystemZSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM);
+
+ const TargetFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const SystemZInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const SystemZRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const SystemZTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ // This is important for reducing register pressure in vector code.
+ bool useAA() const override { return true; }
+
+ // Always enable the early if-conversion pass.
+ bool enableEarlyIfConversion() const override { return true; }
+
+ // Automatically generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ // Return true if the target has the distinct-operands facility.
+ bool hasDistinctOps() const { return HasDistinctOps; }
+
+ // Return true if the target has the load/store-on-condition facility.
+ bool hasLoadStoreOnCond() const { return HasLoadStoreOnCond; }
+
+ // Return true if the target has the load/store-on-condition facility 2.
+ bool hasLoadStoreOnCond2() const { return HasLoadStoreOnCond2; }
+
+ // Return true if the target has the high-word facility.
+ bool hasHighWord() const { return HasHighWord; }
+
+ // Return true if the target has the floating-point extension facility.
+ bool hasFPExtension() const { return HasFPExtension; }
+
+ // Return true if the target has the population-count facility.
+ bool hasPopulationCount() const { return HasPopulationCount; }
+
+ // Return true if the target has the fast-serialization facility.
+ bool hasFastSerialization() const { return HasFastSerialization; }
+
+ // Return true if the target has interlocked-access facility 1.
+ bool hasInterlockedAccess1() const { return HasInterlockedAccess1; }
+
+ // Return true if the target has the miscellaneous-extensions facility.
+ bool hasMiscellaneousExtensions() const {
+ return HasMiscellaneousExtensions;
+ }
+
+ // Return true if the target has the execution-hint facility.
+ bool hasExecutionHint() const { return HasExecutionHint; }
+
+ // Return true if the target has the load-and-trap facility.
+ bool hasLoadAndTrap() const { return HasLoadAndTrap; }
+
+ // Return true if the target has the transactional-execution facility.
+ bool hasTransactionalExecution() const { return HasTransactionalExecution; }
+
+ // Return true if the target has the processor-assist facility.
+ bool hasProcessorAssist() const { return HasProcessorAssist; }
+
+ // Return true if the target has the load-and-zero-rightmost-byte facility.
+ bool hasLoadAndZeroRightmostByte() const {
+ return HasLoadAndZeroRightmostByte;
+ }
+
+ // Return true if the target has the vector facility.
+ bool hasVector() const { return HasVector; }
+
+ // Return true if GV can be accessed using LARL (a PC32DBL reference)
+ // under code model CM.
+ bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const;
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } // true when the target triple's object format is ELF
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp
new file mode 100644
index 000000000000..96a9ef82c125
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp
@@ -0,0 +1,382 @@
+//===-- SystemZTDC.cpp - Utilize Test Data Class instruction --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for instructions that can be replaced by a Test Data Class
+// instruction, and replaces them when profitable.
+//
+// Roughly, the following rules are recognized:
+//
+// 1: fcmp pred X, 0 -> tdc X, mask
+// 2: fcmp pred X, +-inf -> tdc X, mask
+// 3: fcmp pred X, +-minnorm -> tdc X, mask
+// 4: tdc (fabs X), mask -> tdc X, newmask
+// 5: icmp slt (bitcast float X to int), 0 -> tdc X, mask [ie. signbit]
+// 6: icmp sgt (bitcast float X to int), -1 -> tdc X, mask
+// 7: icmp ne/eq (call @llvm.s390.tdc.*(X, mask)) -> tdc X, mask/~mask
+// 8: and i1 (tdc X, M1), (tdc X, M2) -> tdc X, (M1 & M2)
+// 9: or i1 (tdc X, M1), (tdc X, M2) -> tdc X, (M1 | M2)
+// 10: xor i1 (tdc X, M1), (tdc X, M2) -> tdc X, (M1 ^ M2)
+//
+// The pass works in 4 steps:
+//
+// 1. All fcmp and icmp instructions in a function are checked for a match
+// with rules 1-3 and 5-7. Their TDC equivalents are stored in
+// the ConvertedInsts mapping. If the operand of a fcmp instruction is
+// a fabs, it's also folded according to rule 4.
+// 2. All and/or/xor i1 instructions both of whose operands have already
+// been mapped are themselves mapped according to rules 8-10.
+// LogicOpsWorklist is used as a queue of instructions to check.
+// 3. All mapped instructions that are considered worthy of conversion (ie.
+// replacing them will actually simplify the final code) are replaced
+// with a call to the s390.tdc intrinsic.
+// 4. All intermediate results of replaced instructions are removed if unused.
+//
+// Instructions that match rules 1-3 are considered unworthy of conversion
+// on their own (since a comparison instruction is superior), but are mapped
+// in the hopes of folding the result using rules 4 and 8-10 (likely removing
+// the original comparison in the process).
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include <deque>
+#include <set>
+
+using namespace llvm;
+
+namespace llvm {
+ void initializeSystemZTDCPassPass(PassRegistry&);
+}
+
+namespace {
+
+// Function pass that records which compare and i1 logic instructions can be
+// expressed as a Test Data Class (TDC) check and rewrites the worthy ones.
+class SystemZTDCPass : public FunctionPass {
+public:
+  static char ID;
+  SystemZTDCPass() : FunctionPass(ID) {
+    PassRegistry &Registry = *PassRegistry::getPassRegistry();
+    initializeSystemZTDCPassPass(Registry);
+  }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  // Instructions seen so far that can be mapped to a TDC.  Each value is a
+  // (TDC operand, TDC mask, worthy flag) triple.
+  MapVector<Instruction *, std::tuple<Value *, int, bool>> ConvertedInsts;
+  // Pending i1 and/or/xor instructions that may be foldable.
+  std::vector<BinaryOperator *> LogicOpsWorklist;
+  // Instructions matched during folding; erased at the end if unused.
+  std::set<Instruction *> PossibleJunk;
+
+  // Try to map an fcmp instruction.
+  void convertFCmp(CmpInst &I);
+
+  // Try to map an icmp instruction.
+  void convertICmp(CmpInst &I);
+
+  // Try to map an i1 and/or/xor instruction, both of whose operands have
+  // already been mapped.
+  void convertLogicOp(BinaryOperator &I);
+
+  // Record I as mapped to (V, Mask, Worthy) and queue every i1 and/or/xor
+  // user of I for a later folding attempt.
+  void converted(Instruction *I, Value *V, int Mask, bool Worthy) {
+    ConvertedInsts[I] = std::make_tuple(V, Mask, Worthy);
+
+    auto &ParentModule = *I->getFunction()->getParent();
+    auto *BoolTy = Type::getInt1Ty(ParentModule.getContext());
+    for (auto *Usr : I->users()) {
+      auto *LogicOp = dyn_cast<BinaryOperator>(Usr);
+      if (!LogicOp || LogicOp->getType() != BoolTy)
+        continue;
+      switch (LogicOp->getOpcode()) {
+      case Instruction::And:
+      case Instruction::Or:
+      case Instruction::Xor:
+        LogicOpsWorklist.push_back(LogicOp);
+        break;
+      default:
+        break;
+      }
+    }
+  }
+};
+
+} // end anonymous namespace
+
+char SystemZTDCPass::ID = 0;
+INITIALIZE_PASS(SystemZTDCPass, "systemz-tdc",
+ "SystemZ Test Data Class optimization", false, false)
+
+// Allocate a new instance of the TDC pass and return it as a raw
+// FunctionPass pointer.
+FunctionPass *llvm::createSystemZTDCPass() {
+  FunctionPass *TDCPass = new SystemZTDCPass();
+  return TDCPass;
+}
+
+// Try to map an fcmp against one of the recognized constants (zero, +-inf,
+// +-smallest normal) to a TDC mask.  If the compared value is a call to
+// fabs, the call is folded into the mask as well.  On success the result is
+// recorded through converted(); otherwise nothing happens.
+void SystemZTDCPass::convertFCmp(CmpInst &I) {
+  // Only comparisons with a floating-point constant are interesting.
+  auto *Const = dyn_cast<ConstantFP>(I.getOperand(1));
+  if (!Const)
+    return;
+  Value *Subject = I.getOperand(0);
+  const auto Pred = I.getPredicate();
+
+  // The smallest normal number of the operand's format, and its negation.
+  auto &Sem = Subject->getType()->getFltSemantics();
+  APFloat MinNorm = APFloat::getSmallestNormalized(Sem);
+  APFloat NegMinNorm = MinNorm;
+  NegMinNorm.changeSign();
+
+  // Select the row of the mask table that belongs to the constant.
+  enum { RowZero, RowPosInf, RowNegInf, RowMinNorm, RowNegMinNorm };
+  int Row;
+  if (Const->isZero()) {
+    // Every comparison with 0 can be converted.
+    Row = RowZero;
+  } else if (Const->isInfinity()) {
+    // The same holds for the infinities.
+    Row = Const->isNegative() ? RowNegInf : RowPosInf;
+  } else if (Const->isExactlyValue(MinNorm)) {
+    // EQ cannot be told apart from GT here, so the predicate must contain
+    // either both of these bits or neither.
+    const int GEBits = Pred & CmpInst::FCMP_OGE;
+    if (GEBits != 0 && GEBits != CmpInst::FCMP_OGE)
+      return;
+    Row = RowMinNorm;
+  } else if (Const->isExactlyValue(NegMinNorm)) {
+    // Likewise EQ cannot be told apart from LT for the negated value.
+    const int LEBits = Pred & CmpInst::FCMP_OLE;
+    if (LEBits != 0 && LEBits != CmpInst::FCMP_OLE)
+      return;
+    Row = RowNegMinNorm;
+  } else {
+    // Not one of the special constants.
+    return;
+  }
+
+  // Partial masks to use for EQ, GT, LT, UN comparisons, respectively.
+  static const int Masks[][4] = {
+    { // 0
+      SystemZ::TDCMASK_ZERO,              // eq
+      SystemZ::TDCMASK_POSITIVE,          // gt
+      SystemZ::TDCMASK_NEGATIVE,          // lt
+      SystemZ::TDCMASK_NAN,               // un
+    },
+    { // inf
+      SystemZ::TDCMASK_INFINITY_PLUS,     // eq
+      0,                                  // gt
+      (SystemZ::TDCMASK_ZERO |
+       SystemZ::TDCMASK_NEGATIVE |
+       SystemZ::TDCMASK_NORMAL_PLUS |
+       SystemZ::TDCMASK_SUBNORMAL_PLUS),  // lt
+      SystemZ::TDCMASK_NAN,               // un
+    },
+    { // -inf
+      SystemZ::TDCMASK_INFINITY_MINUS,    // eq
+      (SystemZ::TDCMASK_ZERO |
+       SystemZ::TDCMASK_POSITIVE |
+       SystemZ::TDCMASK_NORMAL_MINUS |
+       SystemZ::TDCMASK_SUBNORMAL_MINUS), // gt
+      0,                                  // lt
+      SystemZ::TDCMASK_NAN,               // un
+    },
+    { // minnorm
+      0,                                  // eq (unsupported)
+      (SystemZ::TDCMASK_NORMAL_PLUS |
+       SystemZ::TDCMASK_INFINITY_PLUS),   // gt (actually ge)
+      (SystemZ::TDCMASK_ZERO |
+       SystemZ::TDCMASK_NEGATIVE |
+       SystemZ::TDCMASK_SUBNORMAL_PLUS),  // lt
+      SystemZ::TDCMASK_NAN,               // un
+    },
+    { // -minnorm
+      0,                                  // eq (unsupported)
+      (SystemZ::TDCMASK_ZERO |
+       SystemZ::TDCMASK_POSITIVE |
+       SystemZ::TDCMASK_SUBNORMAL_MINUS), // gt
+      (SystemZ::TDCMASK_NORMAL_MINUS |
+       SystemZ::TDCMASK_INFINITY_MINUS),  // lt (actually le)
+      SystemZ::TDCMASK_NAN,               // un
+    }
+  };
+
+  // OR together the partial masks whose predicate bit is set; the order of
+  // PartBits matches the column order of the table above.
+  static const CmpInst::Predicate PartBits[4] = {
+    CmpInst::FCMP_OEQ, CmpInst::FCMP_OGT, CmpInst::FCMP_OLT, CmpInst::FCMP_UNO
+  };
+  int Mask = 0;
+  for (unsigned Part = 0; Part != 4; ++Part)
+    if (Pred & PartBits[Part])
+      Mask |= Masks[Row][Part];
+
+  // A lone fcmp is unworthy of tdc conversion on its own, but may become
+  // worthy if combined with fabs.
+  bool Worthy = false;
+  auto *Call = dyn_cast<CallInst>(Subject);
+  Function *Callee = Call ? Call->getCalledFunction() : nullptr;
+  if (Callee && Callee->getIntrinsicID() == Intrinsic::fabs) {
+    // Fold with fabs - adjust the mask appropriately.
+    Mask &= SystemZ::TDCMASK_PLUS;
+    Mask |= Mask >> 1;
+    Subject = Call->getArgOperand(0);
+    // A combination of fcmp with fabs is a win, unless the constant
+    // involved is 0 (which is handled by later passes).
+    Worthy = Row != RowZero;
+    PossibleJunk.insert(Call);
+  }
+  converted(&I, Subject, Mask, Worthy);
+}
+
+// Try to map an icmp against an integer constant to a TDC mask.  Two shapes
+// are recognized: a sign-bit test on a bitcast float/double/fp128 value, and
+// an eq/ne-with-zero test on an existing s390.tdc call with a constant mask.
+void SystemZTDCPass::convertICmp(CmpInst &I) {
+  // All icmp rules involve comparisons with constants.
+  auto *Const = dyn_cast<ConstantInt>(I.getOperand(1));
+  if (!Const)
+    return;
+  Value *Op0 = I.getOperand(0);
+  const auto Pred = I.getPredicate();
+
+  if (auto *Cast = dyn_cast<BitCastInst>(Op0)) {
+    // icmp+bitcast used as a sign-bit check.
+    auto *SrcTy = Cast->getSrcTy();
+    const bool IsHandledFP =
+        SrcTy->isFloatTy() || SrcTy->isDoubleTy() || SrcTy->isFP128Ty();
+    if (!IsHandledFP)
+      return;
+
+    int Mask;
+    if (Pred == CmpInst::ICMP_SLT && Const->isZero()) {
+      // icmp slt (bitcast X), 0 - set if sign bit true
+      Mask = SystemZ::TDCMASK_MINUS;
+    } else if (Pred == CmpInst::ICMP_SGT && Const->isMinusOne()) {
+      // icmp sgt (bitcast X), -1 - set if sign bit false
+      Mask = SystemZ::TDCMASK_PLUS;
+    } else {
+      // Not a sign bit check.
+      return;
+    }
+    PossibleJunk.insert(Cast);
+    converted(&I, Cast->getOperand(0), Mask, true);
+    return;
+  }
+
+  // Otherwise only a pre-existing call of the tdc intrinsic qualifies.
+  auto *Call = dyn_cast<CallInst>(Op0);
+  if (!Call)
+    return;
+  Function *Callee = Call->getCalledFunction();
+  if (!Callee || Callee->getIntrinsicID() != Intrinsic::s390_tdc)
+    return;
+  if (!Const->isZero())
+    return;
+
+  // Bail if the mask is not a constant.
+  auto *MaskC = dyn_cast<ConstantInt>(Call->getArgOperand(1));
+  if (!MaskC)
+    return;
+  int Mask = MaskC->getZExtValue();
+  Mask &= SystemZ::TDCMASK_ALL;
+
+  switch (Pred) {
+  case CmpInst::ICMP_NE:
+    // icmp ne (call llvm.s390.tdc(...)), 0 -> simple TDC
+    break;
+  case CmpInst::ICMP_EQ:
+    // icmp eq (call llvm.s390.tdc(...)), 0 -> TDC with inverted mask
+    Mask ^= SystemZ::TDCMASK_ALL;
+    break;
+  default:
+    // An unknown comparison - ignore.
+    return;
+  }
+  PossibleJunk.insert(Call);
+  converted(&I, Call->getArgOperand(0), Mask, false);
+}
+
+// Fold an i1 and/or/xor whose two operands are already mapped.  When both
+// operands test the same value, I is mapped to a TDC on that value with the
+// two masks combined by the matching bitwise operation, and marked worthy.
+void SystemZTDCPass::convertLogicOp(BinaryOperator &I) {
+  // Copy the entries rather than holding references into the map.
+  const auto Lhs = ConvertedInsts[cast<Instruction>(I.getOperand(0))];
+  const auto Rhs = ConvertedInsts[cast<Instruction>(I.getOperand(1))];
+
+  Value *Subject = std::get<0>(Lhs);
+  if (Subject != std::get<0>(Rhs))
+    return;
+
+  const int LhsMask = std::get<1>(Lhs);
+  const int RhsMask = std::get<1>(Rhs);
+  int Combined;
+  switch (I.getOpcode()) {
+  case Instruction::And:
+    Combined = LhsMask & RhsMask;
+    break;
+  case Instruction::Or:
+    Combined = LhsMask | RhsMask;
+    break;
+  case Instruction::Xor:
+    Combined = LhsMask ^ RhsMask;
+    break;
+  default:
+    llvm_unreachable("Unknown op in convertLogicOp");
+  }
+  converted(&I, Subject, Combined, true);
+}
+
+// Pass entry point.  Maps compares, folds i1 logic operations over them,
+// replaces the worthy mapped instructions with s390.tdc calls and finally
+// removes helper instructions that became unused.  Returns true if the
+// function was changed.
+bool SystemZTDCPass::runOnFunction(Function &F) {
+  ConvertedInsts.clear();
+  LogicOpsWorklist.clear();
+  PossibleJunk.clear();
+
+  // Step 1: look at every fcmp and icmp.
+  for (auto &Inst : instructions(F)) {
+    switch (Inst.getOpcode()) {
+    case Instruction::FCmp:
+      convertFCmp(cast<CmpInst>(Inst));
+      break;
+    case Instruction::ICmp:
+      convertICmp(cast<CmpInst>(Inst));
+      break;
+    default:
+      break;
+    }
+  }
+
+  // Nothing mapped means nothing to do.
+  if (ConvertedInsts.empty())
+    return false;
+
+  // Step 2: drain the queue of logic instructions.
+  while (!LogicOpsWorklist.empty()) {
+    BinaryOperator *Logic = LogicOpsWorklist.back();
+    LogicOpsWorklist.pop_back();
+    // Convert only if both operands are mapped and the instruction itself
+    // is not mapped yet.
+    if (!ConvertedInsts.count(dyn_cast<Instruction>(Logic->getOperand(0))))
+      continue;
+    if (!ConvertedInsts.count(dyn_cast<Instruction>(Logic->getOperand(1))))
+      continue;
+    if (ConvertedInsts.count(Logic))
+      continue;
+    convertLogicOp(*Logic);
+  }
+
+  // Step 3: replace the instructions.  Walk them in the reverse order of
+  // discovery, since there's a good chance the earlier ones will be unused
+  // (due to being folded into later ones).
+  Module &M = *F.getParent();
+  auto &Ctx = M.getContext();
+  Value *Zero32 = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+  bool MadeChange = false;
+  for (auto &Entry : reverse(ConvertedInsts)) {
+    Instruction *Old = Entry.first;
+    Value *Subject = std::get<0>(Entry.second);
+    const int Mask = std::get<1>(Entry.second);
+    const bool Worthy = std::get<2>(Entry.second);
+
+    if (!Old->user_empty()) {
+      // Still used but not worth converting: leave it alone.
+      if (!Worthy)
+        continue;
+      // Call the intrinsic and compare its result with 0.
+      Value *TDCFunc = Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc,
+                                                 Subject->getType());
+      IRBuilder<> Builder(Old);
+      Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask);
+      Instruction *TDC = Builder.CreateCall(TDCFunc, {Subject, MaskVal});
+      Value *Replacement = Builder.CreateICmp(CmpInst::ICMP_NE, TDC, Zero32);
+      Old->replaceAllUsesWith(Replacement);
+    }
+    // Unused, or used and now replaced: remove it.
+    Old->eraseFromParent();
+    MadeChange = true;
+  }
+
+  if (!MadeChange)
+    return false;
+
+  // Step 4: something changed, so drop the accumulated helpers (fabs,
+  // bitcast) that no longer have users.
+  for (auto *Junk : PossibleJunk) {
+    if (!Junk->user_empty())
+      continue;
+    Junk->eraseFromParent();
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
new file mode 100644
index 000000000000..33fdb8f90825
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -0,0 +1,209 @@
+//===-- SystemZTargetMachine.cpp - Define TargetMachine for SystemZ -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "SystemZTargetTransformInfo.h"
+#include "SystemZMachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeSystemZTarget() {
+  // Constructing the registration object is what registers
+  // SystemZTargetMachine for the SystemZ target; it is not used afterwards.
+  RegisterTargetMachine<SystemZTargetMachine> Registration(
+      getTheSystemZTarget());
+}
+
+// Decide whether the vector ABI is in effect for the given CPU name and
+// feature string.
+//
+// The vector ABI is used whenever the vector facility is available. That is
+// the default for z13 and later CPUs, and an explicit "[+-]vector" element in
+// the feature string overrides the CPU default (the last such element wins).
+static bool UsesVectorABI(StringRef CPU, StringRef FS) {
+  // CPUs that do not enable the vector facility by default.
+  const bool NoVectorByDefault = CPU.empty() || CPU == "generic" ||
+                                 CPU == "z10" || CPU == "z196" ||
+                                 CPU == "zEC12";
+  bool VectorABI = !NoVectorByDefault;
+
+  SmallVector<StringRef, 3> Features;
+  FS.split(Features, ',', -1, false /* KeepEmpty */);
+  for (StringRef Feature : Features) {
+    if (Feature == "-vector")
+      VectorABI = false;
+    else if (Feature == "vector" || Feature == "+vector")
+      VectorABI = true;
+  }
+
+  return VectorABI;
+}
+
+// Build the data layout string for the given triple, CPU and feature string.
+// Only the "-v128:64" component depends on CPU/FS (via UsesVectorABI).
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+                                     StringRef FS) {
+  const bool VectorABI = UsesVectorABI(CPU, FS);
+
+  // Big endian, followed by the triple-dependent mangling component.
+  std::string Layout("E");
+  Layout += DataLayout::getManglingComponent(TT);
+
+  Layout +=
+      // Global data gets at least 16 bits of alignment by default so that it
+      // can be referenced using LARL; stack variables have no such
+      // requirement.
+      "-i1:8:16-i8:8:16"
+      // 64-bit integers are naturally aligned.
+      "-i64:64"
+      // 128-bit floats are aligned only to 64 bits.
+      "-f128:64";
+
+  // Under the vector ABI, 128-bit vectors are also aligned to 64 bits.
+  if (VectorABI)
+    Layout += "-v128:64";
+
+  Layout +=
+      // Prefer 16 bits of alignment for all globals (same LARL reason).
+      "-a:8:16"
+      // Integer registers are 32 or 64 bits.
+      "-n32:64";
+
+  return Layout;
+}
+
+// Normalize the requested relocation model: an unspecified model and
+// DynamicNoPIC both map to Static, because static code is suitable for use in
+// a dynamic executable and there is no separate DynamicNoPIC model here.
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (RM.hasValue() && *RM != Reloc::DynamicNoPIC)
+    return *RM;
+  return Reloc::Static;
+}
+
+// Construct the SystemZ target machine.
+//
+// The data layout string is computed from the triple, CPU and feature string,
+// and the requested relocation model is normalized by
+// getEffectiveRelocModel() before both are handed to LLVMTargetMachine. An ELF
+// object-file lowering and the subtarget are created, then initAsmInfo() is
+// called.
+SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options,
+ getEffectiveRelocModel(RM), CM, OL),
+ TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ Subtarget(TT, CPU, FS, *this) {
+ initAsmInfo();
+}
+
+// Nothing to release by hand: the members clean up after themselves.
+SystemZTargetMachine::~SystemZTargetMachine() = default;
+
+namespace {
+/// Pass-pipeline configuration for the SystemZ code generator.
+class SystemZPassConfig : public TargetPassConfig {
+public:
+  SystemZPassConfig(SystemZTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  /// The target machine of this configuration, as its SystemZ type.
+  SystemZTargetMachine &getSystemZTargetMachine() const {
+    return getTM<SystemZTargetMachine>();
+  }
+
+  /// Create the post-RA machine scheduler: a ScheduleDAGMI driven by the
+  /// SystemZ post-RA strategy, with kill flags removed.
+  ScheduleDAGInstrs *
+  createPostMachineScheduler(MachineSchedContext *C) const override {
+    auto Strategy = make_unique<SystemZPostRASchedStrategy>(C);
+    return new ScheduleDAGMI(C, std::move(Strategy),
+                             /*RemoveKillFlags=*/true);
+  }
+
+  // Pipeline hooks.
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  bool addILPOpts() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+} // end anonymous namespace
+
+// Add IR-level passes: the SystemZ TDC pass (only when optimizing), followed
+// by the target-independent IR passes.
+void SystemZPassConfig::addIRPasses() {
+  const bool Optimizing = getOptLevel() != CodeGenOpt::None;
+  if (Optimizing)
+    addPass(createSystemZTDCPass());
+
+  TargetPassConfig::addIRPasses();
+}
+
+// Add the SelectionDAG instruction selector and, when optimizing, the LD
+// cleanup pass right after it. Always returns false.
+bool SystemZPassConfig::addInstSelector() {
+  SystemZTargetMachine &Machine = getSystemZTargetMachine();
+  const CodeGenOpt::Level Level = getOptLevel();
+
+  addPass(createSystemZISelDag(Machine, Level));
+  if (Level != CodeGenOpt::None)
+    addPass(createSystemZLDCleanupPass(Machine));
+
+  return false;
+}
+
+// Add the early if-converter as the ILP optimization; returns true to report
+// that a pass was added.
+bool SystemZPassConfig::addILPOpts() {
+  AnalysisID EarlyIfConv = &EarlyIfConverterID;
+  addPass(EarlyIfConv);
+  return true;
+}
+
+// Before the second scheduling phase: always expand SystemZ pseudo
+// instructions, then run the if-converter when optimizing.
+void SystemZPassConfig::addPreSched2() {
+  addPass(createSystemZExpandPseudoPass(getSystemZTargetMachine()));
+
+  const bool Optimizing = getOptLevel() != CodeGenOpt::None;
+  if (Optimizing)
+    addPass(&IfConverterID);
+}
+
+// Add the passes that run just before emission. Long-branch relaxation is
+// always added; instruction shortening, compare elimination and the post-RA
+// machine scheduler are added only when optimizing.
+void SystemZPassConfig::addPreEmitPass() {
+  const bool Optimizing = getOptLevel() != CodeGenOpt::None;
+  SystemZTargetMachine &Machine = getSystemZTargetMachine();
+
+  // Instruction shortening comes before compare elimination because some
+  // vector instructions are shortened into opcodes that compare elimination
+  // recognizes.
+  if (Optimizing)
+    addPass(createSystemZShortenInstPass(Machine), false);
+
+  // Comparisons are eliminated this late, rather than earlier, because some
+  // transformations change the set of available CC values and we generally
+  // want those transformations to take priority. This matters most in the
+  // commonest case, where the comparison result feeds a single in-range
+  // branch: the compare and the branch can then be fused instead.
+  //
+  // Example: two-address NILF can sometimes become three-address RISBLG.
+  // NILF sets CC according to whether the low word is zero, whereas RISBLG
+  // leaves CC untouched. Conversely, 64-bit ANDs like NILL can sometimes
+  // become RISBG; the CC from NILL isn't useful for our purposes, but the CC
+  // from RISBG works for any comparison with zero (not just equality). So
+  // some transformations lose CC values (while still being worthwhile) and
+  // others happen to make the CC result more useful than it was.
+  //
+  // A second reason: BRANCH ON COUNT should only be used when we know the
+  // count register is not going to be spilled.
+  //
+  // Running this late makes it more likely that a register is reused between
+  // the comparison and the branch; it isn't clear whether preventing that
+  // would be a win.
+  if (Optimizing)
+    addPass(createSystemZElimComparePass(Machine), false);
+  addPass(createSystemZLongBranchPass(Machine));
+
+  // Final scheduling runs after all other optimizations, to produce an
+  // optimal input for the decoder (branch relaxation must happen after block
+  // placement).
+  if (Optimizing)
+    addPass(&PostMachineSchedulerID);
+}
+
+// Create the SystemZ pass configuration bound to this target machine and the
+// given pass manager. The object is heap-allocated and returned raw.
+TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
+  TargetPassConfig *Config = new SystemZPassConfig(this, PM);
+  return Config;
+}
+
+// Return a TargetIRAnalysis whose callback builds a SystemZ-specific
+// TargetTransformInfo for each function it is asked about.
+TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() {
+  auto MakeTTI = [this](const Function &F) {
+    return TargetTransformInfo(SystemZTTIImpl(this, F));
+  };
+  return TargetIRAnalysis(MakeTTI);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
new file mode 100644
index 000000000000..69cf9bc6e525
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -0,0 +1,53 @@
+//==- SystemZTargetMachine.h - Define TargetMachine for SystemZ ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
+
+#include "SystemZSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class TargetFrameLowering;
+
+/// SystemZ-specific subclass of LLVMTargetMachine. It owns the object-file
+/// lowering and a single subtarget that is returned for every function.
+class SystemZTargetMachine : public LLVMTargetMachine {
+  // Object-file lowering returned by getObjFileLowering().
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  // The one subtarget returned by both getSubtargetImpl() overloads.
+  SystemZSubtarget Subtarget;
+
+public:
+  SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+  ~SystemZTargetMachine() override;
+
+  // Subtarget access; the per-function overload ignores its argument.
+  const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  const SystemZSubtarget *getSubtargetImpl(const Function &) const override {
+    return &Subtarget;
+  }
+
+  // Overrides of LLVMTargetMachine hooks.
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  // The target adds the post-RA scheduler itself.
+  bool targetSchedulesPostRAScheduling() const override { return true; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
new file mode 100644
index 000000000000..b10c0e09a0d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -0,0 +1,315 @@
+//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a TargetTransformInfo analysis pass specific to the
+// SystemZ target machine. It uses the target's detailed information to provide
+// more precise answers to certain TTI queries, while letting the target
+// independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "systemztti"
+
+//===----------------------------------------------------------------------===//
+//
+// SystemZ cost model.
+//
+//===----------------------------------------------------------------------===//
+
+// Cost of materializing the integer immediate Imm of type Ty in a register.
+// Returns TCC_Free for zero and for types with no cost model (bit size 0 or
+// above 64), TCC_Basic when a single load-immediate suffices, and a multiple
+// of TCC_Basic otherwise.
+int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  const unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for a bit size of 0 (returning TCC_Free makes
+  // constant hoisting ignore the constant), and none is implemented yet for
+  // integers wider than 64 bits.
+  if (BitSize == 0 || BitSize > 64)
+    return TTI::TCC_Free;
+
+  if (Imm == 0)
+    return TTI::TCC_Free;
+
+  if (Imm.getBitWidth() > 64)
+    return 4 * TTI::TCC_Basic;
+
+  // One instruction is enough for constants loaded via lgfi (signed 32-bit),
+  // llilf (unsigned 32-bit) or llihf (low 32 bits all zero).
+  const bool SingleInstruction = isInt<32>(Imm.getSExtValue()) ||
+                                 isUInt<32>(Imm.getZExtValue()) ||
+                                 (Imm.getZExtValue() & 0xffffffff) == 0;
+  if (SingleInstruction)
+    return TTI::TCC_Basic;
+
+  return 2 * TTI::TCC_Basic;
+}
+
+// Cost of using the integer immediate Imm (of type Ty) as operand number Idx
+// of an instruction with the given Opcode.
+//
+// Returns TCC_Free when the immediate can be encoded directly in a SystemZ
+// instruction for that opcode/operand, when the opcode is not listed below,
+// or when there is no cost model for the type. Otherwise the cost is that of
+// materializing the constant, as given by getIntImmCost(Imm, Ty).
+int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TTI::TCC_Free;
+  // No cost model for operations on integers larger than 64 bit implemented
+  // yet.
+  if (BitSize > 64)
+    return TTI::TCC_Free;
+
+  switch (Opcode) {
+  default:
+    return TTI::TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr. This prevents the
+    // creation of new constants for every base constant that gets constant
+    // folded with the offset.
+    if (Idx == 0)
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
+  case Instruction::Store:
+    if (Idx == 0 && Imm.getBitWidth() <= 64) {
+      // Any 8-bit immediate store can be implemented via mvi.
+      if (BitSize == 8)
+        return TTI::TCC_Free;
+      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
+      if (isInt<16>(Imm.getSExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::ICmp:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // Comparisons against signed 32-bit immediates implemented via cgfi.
+      if (isInt<32>(Imm.getSExtValue()))
+        return TTI::TCC_Free;
+      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
+      if (isUInt<32>(Imm.getZExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
+      if (isUInt<32>(Imm.getZExtValue()))
+        return TTI::TCC_Free;
+      // Or their negation, by swapping addition vs. subtraction. The negation
+      // is done on the unsigned representation: negating the signed value
+      // directly would overflow (undefined behavior) for INT64_MIN.
+      if (isUInt<32>(0 - static_cast<uint64_t>(Imm.getSExtValue())))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::Mul:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // We use msgfi to multiply by 32-bit signed immediates.
+      if (isInt<32>(Imm.getSExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::Or:
+  case Instruction::Xor:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // Masks supported by oilf/xilf.
+      if (isUInt<32>(Imm.getZExtValue()))
+        return TTI::TCC_Free;
+      // Masks supported by oihf/xihf.
+      if ((Imm.getZExtValue() & 0xffffffff) == 0)
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::And:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // Any 32-bit AND operation can be implemented via nilf.
+      if (BitSize <= 32)
+        return TTI::TCC_Free;
+      // 64-bit masks supported by nilf.
+      if (isUInt<32>(~Imm.getZExtValue()))
+        return TTI::TCC_Free;
+      // 64-bit masks supported by nilh.
+      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
+        return TTI::TCC_Free;
+      // Some 64-bit AND operations can be implemented via risbg.
+      const SystemZInstrInfo *TII = ST->getInstrInfo();
+      unsigned Start, End;
+      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    // Always return TCC_Free for the shift value of a shift instruction.
+    if (Idx == 1)
+      return TTI::TCC_Free;
+    break;
+  // For the remaining listed opcodes the immediate is never folded, so the
+  // generic materialization cost below applies.
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  return SystemZTTIImpl::getIntImmCost(Imm, Ty);
+}
+
+// Cost of using the integer immediate Imm (of type Ty) as argument number Idx
+// of a call to the intrinsic IID.
+//
+// Returns TCC_Free when the immediate can be folded for that intrinsic, when
+// the intrinsic is not listed below, or when there is no cost model for the
+// type. Otherwise the cost is that of materializing the constant, as given by
+// getIntImmCost(Imm, Ty).
+int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TTI::TCC_Free;
+  // No cost model for operations on integers larger than 64 bit implemented
+  // yet.
+  if (BitSize > 64)
+    return TTI::TCC_Free;
+
+  switch (IID) {
+  default:
+    return TTI::TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+    // These get expanded to include a normal addition/subtraction.
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      if (isUInt<32>(Imm.getZExtValue()))
+        return TTI::TCC_Free;
+      // Check the negated immediate too. The negation is done on the unsigned
+      // representation: negating the signed value directly would overflow
+      // (undefined behavior) for INT64_MIN.
+      if (isUInt<32>(0 - static_cast<uint64_t>(Imm.getSExtValue())))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    // These get expanded to include a normal multiplication.
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      if (isInt<32>(Imm.getSExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  }
+  return SystemZTTIImpl::getIntImmCost(Imm, Ty);
+}
+
+// Report how population count is supported for integers of TyWidth bits
+// (which must be a power of two): fast hardware when the subtarget has the
+// population-count facility and the width is at most 64, software otherwise.
+TargetTransformInfo::PopcntSupportKind
+SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
+  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
+  const bool HasFastPopcnt = ST->hasPopulationCount() && TyWidth <= 64;
+  return HasFastPopcnt ? TTI::PSK_FastHardware : TTI::PSK_Software;
+}
+
+// Fill in the unrolling preferences UP for loop L.
+//
+// The loop is scanned for calls and stores. The unroll count is capped at
+// 12 / (number of stores); loops containing a call may only be fully
+// unrolled; all other loops get partial and runtime unrolling enabled.
+void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
+                                             TTI::UnrollingPreferences &UP) {
+  // Scan the loop body: note whether it contains a real call and count its
+  // stores.
+  bool HasCall = false;
+  unsigned NumStores = 0;
+  for (auto &BB : L->blocks()) {
+    for (auto &I : *BB) {
+      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
+        ImmutableCallSite CS(&I);
+        const Function *Callee = CS.getCalledFunction();
+        if (!Callee) {
+          // Indirect call.
+          HasCall = true;
+        } else {
+          if (isLoweredToCall(Callee))
+            HasCall = true;
+          // memcpy/memset intrinsics are counted as one store each.
+          const auto IID = Callee->getIntrinsicID();
+          if (IID == Intrinsic::memcpy || IID == Intrinsic::memset)
+            ++NumStores;
+        }
+      }
+
+      if (!isa<StoreInst>(&I))
+        continue;
+
+      ++NumStores;
+      // 128 bit fp/int stores get split, so count them twice.
+      Type *MemAccessTy = I.getOperand(0)->getType();
+      const bool IsIntOrFP =
+          MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy();
+      if (IsIntOrFP && getDataLayout().getTypeSizeInBits(MemAccessTy) == 128)
+        ++NumStores;
+    }
+  }
+
+  // The z13 processor runs out of store tags if too many stores are fed into
+  // it too quickly, so bound the number of stores in the unrolled loop.
+  unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);
+
+  // A loop with calls may only be fully unrolled.
+  if (HasCall) {
+    UP.FullUnrollMaxCount = Max;
+    UP.MaxCount = 1;
+    return;
+  }
+
+  UP.MaxCount = Max;
+  if (UP.MaxCount <= 1)
+    return;
+
+  // Enable partial and runtime trip count unrolling.
+  UP.Partial = UP.Runtime = true;
+
+  UP.PartialThreshold = 75;
+  UP.DefaultUnrollRuntimeCount = 4;
+
+  // Expensive instructions are acceptable in the loop pre-header.
+  UP.AllowExpensiveTripCount = true;
+
+  UP.Force = true;
+}
+
+// Number of registers reported to the cost model: 14 scalar registers, and 32
+// vector registers when the subtarget has the vector facility (0 otherwise).
+unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
+  if (Vector)
+    return ST->hasVector() ? 32 : 0;
+
+  // The stack pointer is discounted, and so is %r0, since it can't be used
+  // in an address.
+  return 14;
+}
+
+// Register width in bits: 64 for scalar registers, and 128 for vector
+// registers when the subtarget has the vector facility (0 otherwise).
+unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
+  if (Vector)
+    return ST->hasVector() ? 128 : 0;
+  return 64;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
new file mode 100644
index 000000000000..f7d2d827f11b
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -0,0 +1,61 @@
+//===-- SystemZTargetTransformInfo.h - SystemZ-specific TTI ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETTRANSFORMINFO_H
+
+#include "SystemZTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+
+/// SystemZ implementation of the TargetTransformInfo queries, layered on
+/// BasicTTIImplBase. It keeps the subtarget and target lowering of the
+/// function it was created for.
+class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
+  using BaseT = BasicTTIImplBase<SystemZTTIImpl>;
+  using TTI = TargetTransformInfo;
+  friend BaseT;
+
+  const SystemZSubtarget *ST;
+  const SystemZTargetLowering *TLI;
+
+  // Accessors for the cached subtarget and target lowering.
+  const SystemZSubtarget *getST() const { return ST; }
+  const SystemZTargetLowering *getTLI() const { return TLI; }
+
+public:
+  /// Bind to the subtarget TM selects for F and to that subtarget's lowering;
+  /// the data layout comes from F's module.
+  explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  // Immediate costs: plain materialization, as an instruction operand, and
+  // as an intrinsic argument.
+  int getIntImmCost(const APInt &Imm, Type *Ty);
+  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                    Type *Ty);
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
new file mode 100644
index 000000000000..d3c53a43b391
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- SystemZTargetInfo.cpp - SystemZ target implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// Return the single Target object for SystemZ. It is a function-local static,
+// so every call yields the same instance.
+Target &llvm::getTheSystemZTarget() {
+  static Target SystemZTargetInstance;
+  return SystemZTargetInstance;
+}
+
+extern "C" void LLVMInitializeSystemZTargetInfo() {
+  // Constructing the registration object registers the SystemZ target under
+  // the name "systemz", with JIT support flagged as available.
+  RegisterTarget<Triple::systemz, /*HasJIT=*/true> Registration(
+      getTheSystemZTarget(), "systemz", "SystemZ");
+}
diff --git a/contrib/llvm/lib/Target/Target.cpp b/contrib/llvm/lib/Target/Target.cpp
new file mode 100644
index 000000000000..5d1616d03779
--- /dev/null
+++ b/contrib/llvm/lib/Target/Target.cpp
@@ -0,0 +1,141 @@
+//===-- Target.cpp --------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common infrastructure (including C bindings) for
+// libLLVMTarget.a, which implements target information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Target.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include <cstring>
+
+using namespace llvm;
+
+// Avoid including "llvm-c/Core.h" for compile time, fwd-declare this instead.
+extern "C" LLVMContextRef LLVMGetGlobalContext(void);
+
+// Convert the opaque C handle back into the C++ TargetLibraryInfoImpl pointer.
+inline TargetLibraryInfoImpl *unwrap(LLVMTargetLibraryInfoRef P) {
+  TargetLibraryInfoImpl *Impl = reinterpret_cast<TargetLibraryInfoImpl *>(P);
+  return Impl;
+}
+
+// Convert a C++ TargetLibraryInfoImpl pointer into the opaque C handle. The
+// const qualifier is cast away because the handle type carries none.
+inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfoImpl *P) {
+  return reinterpret_cast<LLVMTargetLibraryInfoRef>(
+      const_cast<TargetLibraryInfoImpl *>(P));
+}
+
+// Register the passes provided by the Target library with Registry: the
+// TargetLibraryInfo and TargetTransformInfo wrapper passes.
+void llvm::initializeTarget(PassRegistry &Registry) {
+ initializeTargetLibraryInfoWrapperPassPass(Registry);
+ initializeTargetTransformInfoWrapperPassPass(Registry);
+}
+
+// C binding for llvm::initializeTarget().
+void LLVMInitializeTarget(LLVMPassRegistryRef R) {
+  PassRegistry &Registry = *unwrap(R);
+  initializeTarget(Registry);
+}
+
+// Return a handle to the data layout stored in module M. The handle refers to
+// the module's own object; no copy is made.
+LLVMTargetDataRef LLVMGetModuleDataLayout(LLVMModuleRef M) {
+  const DataLayout &Layout = unwrap(M)->getDataLayout();
+  return wrap(&Layout);
+}
+
+// Set the data layout of module M from the layout DL refers to.
+void LLVMSetModuleDataLayout(LLVMModuleRef M, LLVMTargetDataRef DL) {
+  const DataLayout &Layout = *unwrap(DL);
+  unwrap(M)->setDataLayout(Layout);
+}
+
+// Allocate a new DataLayout from its string representation. The result is
+// heap-allocated; LLVMDisposeTargetData() deletes it.
+LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
+  DataLayout *Layout = new DataLayout(StringRep);
+  return wrap(Layout);
+}
+
+// Delete the DataLayout object behind TD.
+void LLVMDisposeTargetData(LLVMTargetDataRef TD) {
+  DataLayout *Layout = unwrap(TD);
+  delete Layout;
+}
+
+// Add a TargetLibraryInfoWrapperPass, built from the library info behind TLI,
+// to pass manager PM.
+void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
+                              LLVMPassManagerRef PM) {
+  auto *WrapperPass = new TargetLibraryInfoWrapperPass(*unwrap(TLI));
+  unwrap(PM)->add(WrapperPass);
+}
+
+// Return a strdup()-allocated copy of the string representation of the data
+// layout behind TD.
+char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD) {
+  const std::string &Rep = unwrap(TD)->getStringRepresentation();
+  return strdup(Rep.c_str());
+}
+
+// Report the byte order described by the data layout behind TD.
+LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef TD) {
+  if (unwrap(TD)->isLittleEndian())
+    return LLVMLittleEndian;
+  return LLVMBigEndian;
+}
+
+// Pointer size for address space 0, as reported by the data layout.
+unsigned LLVMPointerSize(LLVMTargetDataRef TD) {
+  const DataLayout *Layout = unwrap(TD);
+  return Layout->getPointerSize(0);
+}
+
+// Pointer size for address space AS, as reported by the data layout.
+unsigned LLVMPointerSizeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  const DataLayout *Layout = unwrap(TD);
+  return Layout->getPointerSize(AS);
+}
+
+// Pointer-sized integer type, created in the global context.
+LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
+  LLVMContext &GlobalCtx = *unwrap(LLVMGetGlobalContext());
+  return wrap(unwrap(TD)->getIntPtrType(GlobalCtx));
+}
+
+// Pointer-sized integer type for address space AS, created in the global
+// context.
+LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  LLVMContext &GlobalCtx = *unwrap(LLVMGetGlobalContext());
+  return wrap(unwrap(TD)->getIntPtrType(GlobalCtx, AS));
+}
+
+// Pointer-sized integer type, created in context C.
+LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD) {
+  LLVMContext &Ctx = *unwrap(C);
+  return wrap(unwrap(TD)->getIntPtrType(Ctx));
+}
+
+// Pointer-sized integer type for address space AS, created in context C.
+LLVMTypeRef LLVMIntPtrTypeForASInContext(LLVMContextRef C,
+                                         LLVMTargetDataRef TD, unsigned AS) {
+  LLVMContext &Ctx = *unwrap(C);
+  return wrap(unwrap(TD)->getIntPtrType(Ctx, AS));
+}
+
+// Size of Ty in bits according to the data layout (DataLayout::getTypeSizeInBits).
+unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  Type *T = unwrap(Ty);
+  return unwrap(TD)->getTypeSizeInBits(T);
+}
+
+// Store size of Ty according to the data layout (DataLayout::getTypeStoreSize).
+unsigned long long LLVMStoreSizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  Type *T = unwrap(Ty);
+  return unwrap(TD)->getTypeStoreSize(T);
+}
+
+// ABI size of Ty; this maps to DataLayout::getTypeAllocSize.
+unsigned long long LLVMABISizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  Type *T = unwrap(Ty);
+  return unwrap(TD)->getTypeAllocSize(T);
+}
+
+// ABI alignment of Ty according to the data layout.
+unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  Type *T = unwrap(Ty);
+  return unwrap(TD)->getABITypeAlignment(T);
+}
+
+// Call-frame alignment of Ty. This reports the ABI type alignment, i.e. the
+// same value as LLVMABIAlignmentOfType().
+unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  Type *T = unwrap(Ty);
+  return unwrap(TD)->getABITypeAlignment(T);
+}
+
+// Preferred alignment of Ty according to the data layout.
+unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  Type *T = unwrap(Ty);
+  return unwrap(TD)->getPrefTypeAlignment(T);
+}
+
+// Preferred alignment of a global variable. GlobalVar is unwrapped as a
+// GlobalVariable.
+unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
+                                        LLVMValueRef GlobalVar) {
+  auto *GV = unwrap<GlobalVariable>(GlobalVar);
+  return unwrap(TD)->getPreferredAlignment(GV);
+}
+
+// Index of the element of struct type StructTy that contains byte offset
+// Offset, using the struct layout computed by the data layout.
+unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+                             unsigned long long Offset) {
+  const auto *Layout =
+      unwrap(TD)->getStructLayout(unwrap<StructType>(StructTy));
+  return Layout->getElementContainingOffset(Offset);
+}
+
+// Offset of element number Element within struct type StructTy, using the
+// struct layout computed by the data layout.
+unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD,
+                                       LLVMTypeRef StructTy,
+                                       unsigned Element) {
+  const auto *Layout =
+      unwrap(TD)->getStructLayout(unwrap<StructType>(StructTy));
+  return Layout->getElementOffset(Element);
+}
diff --git a/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp b/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp
new file mode 100644
index 000000000000..e8b71924e0d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp
@@ -0,0 +1,30 @@
+//===-- TargetIntrinsicInfo.cpp - Target Instruction Information ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetIntrinsicInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Function.h"
+using namespace llvm;
+
+// Nothing to initialize.
+TargetIntrinsicInfo::TargetIntrinsicInfo() = default;
+
+// Nothing to release.
+TargetIntrinsicInfo::~TargetIntrinsicInfo() = default;
+
+// Look up the intrinsic ID of F by its name. Returns 0 when F has no name
+// entry; otherwise returns whatever lookupName() yields for the name.
+unsigned TargetIntrinsicInfo::getIntrinsicID(const Function *F) const {
+  if (const ValueName *Name = F->getValueName())
+    return lookupName(Name->getKeyData(), Name->getKeyLength());
+  return 0;
+}
diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
new file mode 100644
index 000000000000..375f8511f7ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -0,0 +1,326 @@
+//===-- llvm/Target/TargetLoweringObjectFile.cpp - Object File Info -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes used to handle lowerings specific to common
+// object file formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Generic Code
+//===----------------------------------------------------------------------===//
+
+/// Initialize - must be called before any actual lowering is done. Records
+/// the codegen context, (re)creates the mangler, and initializes the
+/// MCObjectFileInfo state from the target machine's triple, position
+/// independence and code model, which gives the lowering implementations a
+/// chance to set up their default sections.
+void TargetLoweringObjectFile::Initialize(MCContext &ctx,
+                                          const TargetMachine &TM) {
+  Ctx = &ctx;
+  // `Initialize` can be called more than once, so drop any mangler left over
+  // from a previous call. Deleting a null pointer is a no-op.
+  delete Mang;
+  Mang = new Mangler();
+  InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(),
+                       TM.getCodeModel(), *Ctx);
+}
+
+TargetLoweringObjectFile::~TargetLoweringObjectFile() {
+ delete Mang; // the mangler is allocated with `new` in Initialize()
+}
+
+/// Return true if \p GV can be placed in BSS. All of the following must hold,
+/// checked in this order:
+///  - the initializer is a null value;
+///  - the global is not constant (constant zeros stay in readonly constant
+///    sections so they can be shared);
+///  - the global has no explicit section;
+///  - \p NoZerosInBSS (-nozero-initialized-in-bss) is not set.
+static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) {
+  return GV->getInitializer()->isNullValue() && !GV->isConstant() &&
+         !GV->hasSection() && !NoZerosInBSS;
+}
+
+/// IsNullTerminatedString - Return true if the specified constant (which is
+/// known to have a type that is an array of 1/2/4 byte elements) ends with a
+/// nul value and contains no other nuls in it. Note that this is more general
+/// than ConstantDataSequential::isString because we allow 2 & 4 byte strings.
+static bool IsNullTerminatedString(const Constant *C) {
+  // First possibility: a constant data array whose only nul is its last
+  // element.
+  if (const auto *Seq = dyn_cast<ConstantDataSequential>(C)) {
+    const unsigned Count = Seq->getNumElements();
+    assert(Count != 0 && "Can't have an empty CDS");
+
+    // Find the first nul; the data is a C string exactly when that nul is the
+    // final element. No nul at all means it is not terminated.
+    for (unsigned Idx = 0; Idx != Count; ++Idx)
+      if (Seq->getElementAsInteger(Idx) == 0)
+        return Idx == Count - 1;
+    return false;
+  }
+
+  // Another possibility: [1 x i8] zeroinitializer
+  if (!isa<ConstantAggregateZero>(C))
+    return false;
+  return cast<ArrayType>(C->getType())->getNumElements() == 1;
+}
+
+/// Build (or look up) the symbol named
+/// <private-global-prefix><prefixed name of GV><Suffix>. \p Suffix must not
+/// be empty.
+MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase(
+    const GlobalValue *GV, StringRef Suffix, const TargetMachine &TM) const {
+  assert(!Suffix.empty());
+
+  const DataLayout &Layout = GV->getParent()->getDataLayout();
+
+  SmallString<60> Buffer;
+  Buffer += Layout.getPrivateGlobalPrefix();
+  TM.getNameWithPrefix(Buffer, GV, *Mang);
+  Buffer += Suffix;
+  return Ctx->getOrCreateSymbol(Buffer);
+}
+
+MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol(
+ const GlobalValue *GV, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ return TM.getSymbol(GV); // MMI is not used by this implementation
+}
+
+void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
+ const DataLayout &,
+ const MCSymbol *Sym) const {
+} // empty body: this implementation emits nothing for Sym
+
+
+/// getKindForGlobal - This is a top-level target-independent classifier for
+/// a global object. Given a global definition and information from TM, it
+/// classifies the global in a variety of ways that make various target
+/// implementations simpler. The target implementation is free to ignore this
+/// extra info of course.
+SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
+ const TargetMachine &TM){
+ assert(!GO->isDeclaration() && !GO->hasAvailableExternallyLinkage() &&
+ "Can only be used for global definitions");
+
+ Reloc::Model ReloModel = TM.getRelocationModel();
+
+ // Early exit - anything that is not a global variable is classified as text.
+ const auto *GVar = dyn_cast<GlobalVariable>(GO);
+ if (!GVar)
+ return SectionKind::getText();
+
+ // Handle thread-local data first.
+ if (GVar->isThreadLocal()) {
+ if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS))
+ return SectionKind::getThreadBSS();
+ return SectionKind::getThreadData();
+ }
+
+ // Variables with common linkage always get classified as common.
+ if (GVar->hasCommonLinkage())
+ return SectionKind::getCommon();
+
+ // BSS-suitable variables get a BSS kind chosen by their linkage.
+ if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS)) {
+ if (GVar->hasLocalLinkage())
+ return SectionKind::getBSSLocal();
+ else if (GVar->hasExternalLinkage())
+ return SectionKind::getBSSExtern();
+ return SectionKind::getBSS();
+ }
+
+ const Constant *C = GVar->getInitializer();
+
+ // If the global is marked constant, we can put it into a mergeable section,
+ // a mergeable string section, or a read-only kind (with or without relocs).
+ if (GVar->isConstant()) {
+ // If the initializer for the global contains something that requires a
+ // relocation, it cannot be merged; that case is handled in the else
+ // branch below.
+ if (!C->needsRelocation()) {
+ // If the global is required to have a unique address, it can't be put
+ // into a mergeable section: just drop it into the general read-only
+ // section instead.
+ if (!GVar->hasGlobalUnnamedAddr())
+ return SectionKind::getReadOnly();
+
+ // If initializer is a null-terminated string, put it in a "cstring"
+ // section of the right width.
+ if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
+ if (IntegerType *ITy =
+ dyn_cast<IntegerType>(ATy->getElementType())) {
+ if ((ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16 ||
+ ITy->getBitWidth() == 32) &&
+ IsNullTerminatedString(C)) {
+ if (ITy->getBitWidth() == 8)
+ return SectionKind::getMergeable1ByteCString();
+ if (ITy->getBitWidth() == 16)
+ return SectionKind::getMergeable2ByteCString();
+
+ assert(ITy->getBitWidth() == 32 && "Unknown width");
+ return SectionKind::getMergeable4ByteCString();
+ }
+ }
+ }
+
+ // Otherwise, just drop it into a mergeable constant section. If we have
+ // a section for this size, use it, otherwise fall back to the general
+ // read-only section.
+ switch (
+ GVar->getParent()->getDataLayout().getTypeAllocSize(C->getType())) {
+ case 4: return SectionKind::getMergeableConst4();
+ case 8: return SectionKind::getMergeableConst8();
+ case 16: return SectionKind::getMergeableConst16();
+ case 32: return SectionKind::getMergeableConst32();
+ default:
+ return SectionKind::getReadOnly();
+ }
+
+ } else {
+ // In static, ROPI and RWPI relocation models, the linker will resolve
+ // all addresses, so the relocation entries will actually be constants by
+ // the time the app starts up. However, we can't put this into a
+ // mergeable section, because the linker doesn't take relocations into
+ // consideration when it tries to merge entries in the section.
+ if (ReloModel == Reloc::Static || ReloModel == Reloc::ROPI ||
+ ReloModel == Reloc::RWPI || ReloModel == Reloc::ROPI_RWPI)
+ return SectionKind::getReadOnly();
+
+ // Otherwise, the dynamic linker needs to fix it up, so classify it as
+ // read-only data that carries relocations (ReadOnlyWithRel).
+ return SectionKind::getReadOnlyWithRel();
+ }
+ }
+
+ // Okay, this isn't a constant.
+ return SectionKind::getData();
+}
+
+/// This method computes the appropriate section to emit the specified global
+/// variable or function definition. This should not be passed external (or
+/// available externally) globals.
+///
+/// A global with an explicit section is routed to getExplicitSectionGlobal();
+/// everything else goes through SelectSectionForGlobal(), which picks a
+/// default based on \p Kind.
+MCSection *TargetLoweringObjectFile::SectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  return GO->hasSection() ? getExplicitSectionGlobal(GO, Kind, TM)
+                          : SelectSectionForGlobal(GO, Kind, TM);
+}
+
+/// Return the section for the jump tables of \p F: whatever
+/// getSectionForConstant() chooses for a read-only constant with no
+/// associated Constant.
+MCSection *TargetLoweringObjectFile::getSectionForJumpTable(
+    const Function &F, const TargetMachine &TM) const {
+  const DataLayout &Layout = F.getParent()->getDataLayout();
+  // getSectionForConstant() takes the alignment by reference; the value is
+  // not used afterwards.
+  unsigned Alignment = 0;
+  return getSectionForConstant(Layout, SectionKind::getReadOnly(),
+                               /*C=*/nullptr, Alignment);
+}
+
+/// Return true if the jump table of \p F must be emitted into the same
+/// section as the function body.
+bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
+    bool UsesLabelDifference, const Function &F) const {
+  // Two cases require the function's own section:
+  //
+  //  * In PIC mode the table is made of label differences, which only make
+  //    sense within the section of the function body.
+  //    FIXME: Need a better predicate for this: what about custom entries?
+  //
+  //  * The function is weak for the linker.
+  //    FIXME: this isn't the right predicate, should be based on the
+  //    MCSection for the function.
+  return UsesLabelDifference || F.isWeakForLinker();
+}
+
+/// Given a mergeable constant with the specified size and relocation
+/// information, return a section that it should be placed in. This
+/// implementation returns ReadOnlySection for read-only kinds when that
+/// section exists, and DataSection otherwise; \p DL, \p C and \p Align are
+/// not consulted.
+MCSection *TargetLoweringObjectFile::getSectionForConstant(
+    const DataLayout &DL, SectionKind Kind, const Constant *C,
+    unsigned &Align) const {
+  const bool UseReadOnly = Kind.isReadOnly() && ReadOnlySection != nullptr;
+  return UseReadOnly ? ReadOnlySection : DataSection;
+}
+
+/// getTTypeGlobalReference - Return an MCExpr to use for a reference to the
+/// specified global variable from exception handling information. The
+/// symbol of \p GV is wrapped in a symbol reference and encoded by
+/// getTTypeReference().
+const MCExpr *TargetLoweringObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  MCSymbol *Sym = TM.getSymbol(GV);
+  return getTTypeReference(MCSymbolRefExpr::create(Sym, getContext()),
+                           Encoding, Streamer);
+}
+
+/// Encode the symbol reference \p Sym according to bits 0x70 of \p Encoding.
+/// Only DW_EH_PE_absptr and DW_EH_PE_pcrel are handled; any other value is a
+/// fatal error.
+const MCExpr *TargetLoweringObjectFile::
+getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding,
+                  MCStreamer &Streamer) const {
+  const unsigned Application = Encoding & 0x70;
+
+  // Absolute pointers need nothing special.
+  if (Application == dwarf::DW_EH_PE_absptr)
+    return Sym;
+
+  if (Application == dwarf::DW_EH_PE_pcrel) {
+    // Emit a label to the streamer for the current position. This gives us
+    // .-foo addressing.
+    MCSymbol *Here = getContext().createTempSymbol();
+    Streamer.EmitLabel(Here);
+    const MCExpr *HereRef = MCSymbolRefExpr::create(Here, getContext());
+    return MCBinaryExpr::createSub(Sym, HereRef, getContext());
+  }
+
+  report_fatal_error("We do not support this DWARF encoding yet!");
+}
+
+/// Return the expression debug info should use for the thread-local symbol
+/// \p Sym; this implementation is a plain reference to the symbol.
+const MCExpr *
+TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+  // FIXME: It's not clear what, if any, default this should have - perhaps a
+  // null return could mean 'no location' & we should just do that here.
+  MCContext &Context = *Ctx;
+  return MCSymbolRefExpr::create(Sym, Context);
+}
+
+void TargetLoweringObjectFile::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const {
+ Mang->getNameWithPrefix(OutName, GV, /*CannotUsePrivateLabel=*/false); // TM is not consulted here
+}
diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp
new file mode 100644
index 000000000000..e16ced1661a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetMachine.cpp
@@ -0,0 +1,221 @@
+//===-- TargetMachine.cpp - General Target Information ---------------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+cl::opt<bool> EnableIPRA("enable-ipra", cl::init(false), cl::Hidden, // hidden flag, off by default
+ cl::desc("Enable interprocedural register allocation "
+ "to reduce load/store at procedure calls."));
+
+//---------------------------------------------------------------------------
+// TargetMachine Class
+//
+
+/// Store the target description, data layout, triple, CPU, feature string and
+/// options. The MC info pointers start out null.
+TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
+                             const Triple &TT, StringRef CPU, StringRef FS,
+                             const TargetOptions &Options)
+    : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU),
+      TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr),
+      RequireStructuredCFG(false), Options(Options) {
+  // -enable-ipra overrides the supplied option only when the flag actually
+  // appeared on the command line.
+  const bool FlagGiven = EnableIPRA.getNumOccurrences() != 0;
+  if (FlagGiven)
+    this->Options.EnableIPRA = EnableIPRA;
+}
+
+TargetMachine::~TargetMachine() {
+ delete AsmInfo; // all four pointers are nullptr-initialized by the constructor,
+ delete MRI; // so deleting one that was never set is a no-op
+ delete MII;
+ delete STI;
+}
+
+bool TargetMachine::isPositionIndependent() const {
+ return getRelocationModel() == Reloc::PIC_; // every other relocation model reports false
+}
+
+/// \brief Reset the target options based on the function's attributes.
+///
+/// Each boolean option listed below is overwritten only when \p F carries the
+/// corresponding attribute; it then becomes true exactly when the attribute's
+/// string value is "true". The denormal mode is changed only when
+/// "denormal-fp-math" is one of "ieee", "preserve-sign" or "positive-zero".
+// FIXME: This function needs to go away for a number of reasons:
+// a) global state on the TargetMachine is terrible in general,
+// b) there's no default state here to keep,
+// c) these target options should be passed only on the function
+//    and not on the TargetMachine (via TargetOptions) at all.
+void TargetMachine::resetTargetOptions(const Function &F) const {
+#define RESET_OPTION(X, Y)                                                     \
+  do {                                                                         \
+    if (F.hasFnAttribute(Y))                                                   \
+      Options.X = (F.getFnAttribute(Y).getValueAsString() == "true");          \
+  } while (0)
+
+  RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
+  RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
+  RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
+  RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
+  RESET_OPTION(NoTrappingFPMath, "no-trapping-math");
+
+// The helper macro is local to this function; do not let it leak into the
+// rest of the translation unit.
+#undef RESET_OPTION
+
+  StringRef Denormal =
+      F.getFnAttribute("denormal-fp-math").getValueAsString();
+  if (Denormal == "ieee")
+    Options.FPDenormalMode = FPDenormal::IEEE;
+  else if (Denormal == "preserve-sign")
+    Options.FPDenormalMode = FPDenormal::PreserveSign;
+  else if (Denormal == "positive-zero")
+    Options.FPDenormalMode = FPDenormal::PositiveZero;
+}
+
+/// Returns the code generation relocation model. The choices are static, PIC,
+/// dynamic-no-pic, ROPI, RWPI, and ROPI+RWPI.
+Reloc::Model TargetMachine::getRelocationModel() const { return RM; }
+
+/// Returns the code model. The choices are small, kernel, medium, large, and
+/// target default.
+CodeModel::Model TargetMachine::getCodeModel() const { return CMModel; }
+
+/// Get the IR-specified TLS model for Var.
+static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
+ switch (GV->getThreadLocalMode()) {
+ case GlobalVariable::NotThreadLocal:
+ llvm_unreachable("getSelectedTLSModel for non-TLS variable");
+ break;
+ case GlobalVariable::GeneralDynamicTLSModel:
+ return TLSModel::GeneralDynamic;
+ case GlobalVariable::LocalDynamicTLSModel:
+ return TLSModel::LocalDynamic;
+ case GlobalVariable::InitialExecTLSModel:
+ return TLSModel::InitialExec;
+ case GlobalVariable::LocalExecTLSModel:
+ return TLSModel::LocalExec;
+ }
+ llvm_unreachable("invalid TLS model");
+}
+
+bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
+ const GlobalValue *GV) const {
+ Reloc::Model RM = getRelocationModel();
+ const Triple &TT = getTargetTriple();
+
+ // DLLImport explicitly marks the GV as external. (GV may be null; every use
+ // below is guarded.)
+ if (GV && GV->hasDLLImportStorageClass())
+ return false;
+
+ // Every other GV is local on COFF.
+ // Make an exception for Windows OS in the triple: some firmware builds use
+ // *-win32-macho triples. This (accidentally?) produced Windows relocations
+ // without GOT tables in older clang versions; keep this behaviour.
+ if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO()))
+ return true;
+
+ if (GV && (GV->hasLocalLinkage() || !GV->hasDefaultVisibility()))
+ return true;
+
+ if (TT.isOSBinFormatMachO()) {
+ if (RM == Reloc::Static)
+ return true;
+ return GV && GV->isStrongDefinitionForLinker();
+ }
+
+ assert(TT.isOSBinFormatELF()); // COFF and Mach-O returned above
+ assert(RM != Reloc::DynamicNoPIC);
+
+ bool IsExecutable =
+ RM == Reloc::Static || M.getPIELevel() != PIELevel::Default;
+ if (IsExecutable) {
+ // If the symbol is defined, it cannot be preempted.
+ if (GV && !GV->isDeclarationForLinker())
+ return true;
+
+ bool IsTLS = GV && GV->isThreadLocal();
+ bool IsAccessViaCopyRelocs =
+ Options.MCOptions.MCPIECopyRelocations && GV && isa<GlobalVariable>(GV);
+ // Check if we can use copy relocations.
+ if (!IsTLS && (RM == Reloc::Static || IsAccessViaCopyRelocs))
+ return true;
+ }
+
+ // ELF supports preemption of other symbols.
+ return false;
+}
+
+/// Choose the TLS model for \p GV. A default is derived from whether we are
+/// building a shared library (PIC without PIE) and whether the symbol can be
+/// assumed DSO-local; a model requested in the IR wins when it compares
+/// greater than that default.
+TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
+  const Module &M = *GV->getParent();
+  const bool IsPIE = M.getPIELevel() != PIELevel::Default;
+  const bool IsSharedLibrary = getRelocationModel() == Reloc::PIC_ && !IsPIE;
+  const bool IsLocal = shouldAssumeDSOLocal(M, GV);
+
+  const TLSModel::Model Default =
+      IsSharedLibrary
+          ? (IsLocal ? TLSModel::LocalDynamic : TLSModel::GeneralDynamic)
+          : (IsLocal ? TLSModel::LocalExec : TLSModel::InitialExec);
+
+  // If the user specified a more specific model, use that.
+  const TLSModel::Model Requested = getSelectedTLSModel(GV);
+  return Requested > Default ? Requested : Default;
+}
+
+/// Returns the optimization level: None, Less, Default, or Aggressive.
+CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; }
+
+void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; }
+
+/// Return a TargetIRAnalysis whose callback builds a TargetTransformInfo from
+/// nothing but the data layout of the function's module.
+TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
+  // The callback uses no state of this TargetMachine, so it captures nothing.
+  return TargetIRAnalysis([](const Function &F) {
+    return TargetTransformInfo(F.getParent()->getDataLayout());
+  });
+}
+
+/// Append the mangled name of \p GV to \p Name. Private globals are routed
+/// through the object file lowering unless \p MayAlwaysUsePrivate is set;
+/// everything else is mangled directly by \p Mang.
+void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
+                                      const GlobalValue *GV, Mangler &Mang,
+                                      bool MayAlwaysUsePrivate) const {
+  const bool AskObjFileLowering =
+      !MayAlwaysUsePrivate && GV->hasPrivateLinkage();
+  if (AskObjFileLowering) {
+    getObjFileLowering()->getNameWithPrefix(Name, GV, *this);
+    return;
+  }
+
+  // Simple case: If GV is not private, it is not important to find out if
+  // private labels are legal in this case or not.
+  Mang.getNameWithPrefix(Name, GV, false);
+}
+
+/// Return the MCSymbol for \p GV: mangle its name with the object file
+/// lowering's mangler, then get or create the symbol in that lowering's
+/// context.
+MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const {
+  const TargetLoweringObjectFile &Lowering = *getObjFileLowering();
+  SmallString<128> Mangled;
+  getNameWithPrefix(Mangled, GV, Lowering.getMangler());
+  return Lowering.getContext().getOrCreateSymbol(Mangled);
+}
diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp
new file mode 100644
index 000000000000..5fb5b0227800
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetMachineC.cpp
@@ -0,0 +1,243 @@
+//===-- TargetMachineC.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM-C part of TargetMachine.h
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/TargetMachine.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/Target.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/CodeGenCWrappers.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+
+using namespace llvm;
+
+static TargetMachine *unwrap(LLVMTargetMachineRef P) {
+ return reinterpret_cast<TargetMachine *>(P); // opaque C handle -> C++ pointer
+}
+// Convert an opaque C target handle to the C++ Target pointer.
+static Target *unwrap(LLVMTargetRef P) {
+  return reinterpret_cast<Target *>(P);
+}
+static LLVMTargetMachineRef wrap(const TargetMachine *P) {
+ return reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine *>(P)); // constness is dropped: the C handle type is not const
+}
+// Convert a C++ Target pointer to the opaque C handle. Constness is dropped
+// because the handle type is not const.
+static LLVMTargetRef wrap(const Target *P) {
+  return reinterpret_cast<LLVMTargetRef>(const_cast<Target *>(P));
+}
+
+/// Return the first registered target, or null when no target is registered.
+LLVMTargetRef LLVMGetFirstTarget() {
+  auto Targets = TargetRegistry::targets();
+  auto First = Targets.begin();
+  if (First == Targets.end())
+    return nullptr;
+
+  return wrap(&*First);
+}
+LLVMTargetRef LLVMGetNextTarget(LLVMTargetRef T) {
+ return wrap(unwrap(T)->getNext()); // T is dereferenced, so it must not be null
+}
+
+/// Return the registered target whose name equals \p Name, or null if there
+/// is none.
+LLVMTargetRef LLVMGetTargetFromName(const char *Name) {
+  const StringRef Wanted = Name;
+  for (const Target &Candidate : TargetRegistry::targets())
+    if (Candidate.getName() == Wanted)
+      return wrap(&Candidate);
+  return nullptr;
+}
+
+/// Look up the target for \p TripleStr and store it in \p T.
+///
+/// \returns 0 on success. On failure returns 1, leaves \p *T null and, when
+/// \p ErrorMessage is non-null, stores a strdup()-allocated message in it.
+LLVMBool LLVMGetTargetFromTriple(const char *TripleStr, LLVMTargetRef *T,
+                                 char **ErrorMessage) {
+  std::string Error;
+  *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error));
+  if (*T)
+    return 0;
+
+  if (ErrorMessage)
+    *ErrorMessage = strdup(Error.c_str());
+  return 1;
+}
+
+/// Return the name of target \p T.
+const char *LLVMGetTargetName(LLVMTargetRef T) {
+  const Target *Tgt = unwrap(T);
+  return Tgt->getName();
+}
+
+/// Return the short description of target \p T.
+const char *LLVMGetTargetDescription(LLVMTargetRef T) {
+  const Target *Tgt = unwrap(T);
+  return Tgt->getShortDescription();
+}
+
+/// Report whether target \p T has a JIT.
+LLVMBool LLVMTargetHasJIT(LLVMTargetRef T) {
+  const Target *Tgt = unwrap(T);
+  return Tgt->hasJIT();
+}
+
+/// Report whether target \p T has a TargetMachine.
+LLVMBool LLVMTargetHasTargetMachine(LLVMTargetRef T) {
+  const Target *Tgt = unwrap(T);
+  return Tgt->hasTargetMachine();
+}
+
+/// Report whether target \p T has an MC assembler backend.
+LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) {
+  const Target *Tgt = unwrap(T);
+  return Tgt->hasMCAsmBackend();
+}
+
+/// Create a target machine for \p T with default TargetOptions, translating
+/// the C enums to their C++ counterparts.
+///
+/// A relocation mode other than static, PIC or dynamic-no-pic leaves the
+/// relocation model unset. An optimization level other than none, less or
+/// aggressive maps to CodeGenOpt::Default.
+LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
+        const char *Triple, const char *CPU, const char *Features,
+        LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
+        LLVMCodeModel CodeModel) {
+  Optional<Reloc::Model> RM;
+  if (Reloc == LLVMRelocStatic)
+    RM = Reloc::Static;
+  else if (Reloc == LLVMRelocPIC)
+    RM = Reloc::PIC_;
+  else if (Reloc == LLVMRelocDynamicNoPic)
+    RM = Reloc::DynamicNoPIC;
+
+  CodeModel::Model CM = unwrap(CodeModel);
+
+  CodeGenOpt::Level OL = CodeGenOpt::Default;
+  if (Level == LLVMCodeGenLevelNone)
+    OL = CodeGenOpt::None;
+  else if (Level == LLVMCodeGenLevelLess)
+    OL = CodeGenOpt::Less;
+  else if (Level == LLVMCodeGenLevelAggressive)
+    OL = CodeGenOpt::Aggressive;
+
+  TargetOptions opt;
+  return wrap(unwrap(T)->createTargetMachine(Triple, CPU, Features, opt, RM,
+                                             CM, OL));
+}
+
+void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); } // destroys the wrapped TargetMachine
+
+/// Return the Target that target machine \p T was created for.
+LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) {
+  return wrap(&unwrap(T)->getTarget());
+}
+
+/// Return a strdup()-allocated copy of the target triple string of \p T.
+char *LLVMGetTargetMachineTriple(LLVMTargetMachineRef T) {
+  return strdup(unwrap(T)->getTargetTriple().str().c_str());
+}
+
+/// Return a strdup()-allocated copy of the CPU name of \p T.
+char *LLVMGetTargetMachineCPU(LLVMTargetMachineRef T) {
+  const std::string CPU = unwrap(T)->getTargetCPU();
+  return strdup(CPU.c_str());
+}
+
+/// Return a strdup()-allocated copy of the feature string of \p T.
+char *LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) {
+  const std::string Features = unwrap(T)->getTargetFeatureString();
+  return strdup(Features.c_str());
+}
+
+/// Set the MC option AsmVerbose of target machine \p T to \p VerboseAsm.
+void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
+                                      LLVMBool VerboseAsm) {
+  TargetMachine *TM = unwrap(T);
+  TM->Options.MCOptions.AsmVerbose = VerboseAsm;
+}
+
+/// Return a newly heap-allocated copy of the data layout created by target
+/// machine \p T.
+// NOTE(review): presumably the caller releases it through the C API - confirm.
+LLVMTargetDataRef LLVMCreateTargetDataLayout(LLVMTargetMachineRef T) {
+  DataLayout *Copy = new DataLayout(unwrap(T)->createDataLayout());
+  return wrap(Copy);
+}
+
+/// Shared implementation of the LLVMTargetMachineEmitTo* entry points: set
+/// the module's data layout from the target machine, add the passes that emit
+/// \p codegen output to \p OS, and run them over \p M.
+///
+/// \returns false on success. If the target machine cannot emit this file
+/// type, returns true and, when \p ErrorMessage is non-null, stores a
+/// strdup()-allocated message in it.
+static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
+                                      raw_pwrite_stream &OS,
+                                      LLVMCodeGenFileType codegen,
+                                      char **ErrorMessage) {
+  TargetMachine *TM = unwrap(T);
+  Module *Mod = unwrap(M);
+
+  legacy::PassManager pass;
+
+  Mod->setDataLayout(TM->createDataLayout());
+
+  // Anything other than LLVMAssemblyFile is emitted as an object file.
+  const TargetMachine::CodeGenFileType ft =
+      codegen == LLVMAssemblyFile ? TargetMachine::CGFT_AssemblyFile
+                                  : TargetMachine::CGFT_ObjectFile;
+
+  if (TM->addPassesToEmitFile(pass, OS, ft)) {
+    // Callers may pass a null ErrorMessage when they do not want the text,
+    // as LLVMGetTargetFromTriple already allows.
+    if (ErrorMessage)
+      *ErrorMessage = strdup("TargetMachine can't emit a file of this type");
+    return true;
+  }
+
+  pass.run(*Mod);
+
+  OS.flush();
+  return false;
+}
+
+/// Emit \p M as \p codegen output into the file \p Filename.
+///
+/// \returns false on success. Returns true if the file cannot be opened or
+/// the target machine cannot emit this file type; in that case, when
+/// \p ErrorMessage is non-null, it receives a strdup()-allocated message.
+LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
+                                     char *Filename,
+                                     LLVMCodeGenFileType codegen,
+                                     char **ErrorMessage) {
+  std::error_code EC;
+  raw_fd_ostream dest(Filename, EC, sys::fs::F_None);
+  if (EC) {
+    // Callers may pass a null ErrorMessage when they do not want the text.
+    if (ErrorMessage)
+      *ErrorMessage = strdup(EC.message().c_str());
+    return true;
+  }
+  bool Result = LLVMTargetMachineEmit(T, M, dest, codegen, ErrorMessage);
+  dest.flush();
+  return Result;
+}
+
+/// Emit \p M as \p codegen output into memory and hand the bytes back in a
+/// new memory buffer stored in \p OutMemBuf. The buffer is created whether or
+/// not emission succeeded.
+///
+/// \returns the result of LLVMTargetMachineEmit (false on success).
+LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T,
+        LLVMModuleRef M, LLVMCodeGenFileType codegen, char **ErrorMessage,
+        LLVMMemoryBufferRef *OutMemBuf) {
+  SmallString<0> Storage;
+  raw_svector_ostream Sink(Storage);
+  const bool Failed = LLVMTargetMachineEmit(T, M, Sink, codegen, ErrorMessage);
+
+  // Copy the emitted bytes so the buffer outlives the local storage.
+  const StringRef Emitted = Sink.str();
+  *OutMemBuf = LLVMCreateMemoryBufferWithMemoryRangeCopy(Emitted.data(),
+                                                         Emitted.size(), "");
+  return Failed;
+}
+
+/// Return a strdup()-allocated copy of the default target triple.
+char *LLVMGetDefaultTargetTriple(void) {
+  const std::string Triple = sys::getDefaultTargetTriple();
+  return strdup(Triple.c_str());
+}
+
+/// Add to \p PM a TargetTransformInfo wrapper pass built from the
+/// TargetIRAnalysis of target machine \p T.
+void LLVMAddAnalysisPasses(LLVMTargetMachineRef T, LLVMPassManagerRef PM) {
+  auto *TTIWrapper =
+      createTargetTransformInfoWrapperPass(unwrap(T)->getTargetIRAnalysis());
+  unwrap(PM)->add(TTIWrapper);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
new file mode 100644
index 000000000000..b4763ca60ab6
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -0,0 +1,151 @@
+//==- WebAssemblyDisassembler.cpp - Disassembler for WebAssembly -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file is part of the WebAssembly Disassembler.
+///
+/// It contains code to translate the data produced by the decoder into
+/// MCInsts.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-disassembler"
+
+namespace {
+/// MCDisassembler for WebAssembly. Owns the MCInstrInfo it consults for
+/// instruction descriptions while decoding.
+class WebAssemblyDisassembler final : public MCDisassembler {
+  std::unique_ptr<const MCInstrInfo> MCII;
+
+  /// Decode one instruction from \p Bytes into \p Instr and report the number
+  /// of bytes consumed in \p Size.
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+
+public:
+  /// Takes ownership of \p InstrInfo.
+  WebAssemblyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                          std::unique_ptr<const MCInstrInfo> InstrInfo)
+      : MCDisassembler(STI, Ctx), MCII(std::move(InstrInfo)) {}
+};
+} // end anonymous namespace
+
+/// Factory registered with the TargetRegistry: build a disassembler that owns
+/// a fresh MCInstrInfo created by target \p T.
+static MCDisassembler *createWebAssemblyDisassembler(const Target &T,
+                                                     const MCSubtargetInfo &STI,
+                                                     MCContext &Ctx) {
+  return new WebAssemblyDisassembler(
+      STI, Ctx, std::unique_ptr<const MCInstrInfo>(T.createMCInstrInfo()));
+}
+
+extern "C" void LLVMInitializeWebAssemblyDisassembler() {
+ // Register the same disassembler factory for both the 32-bit and the 64-bit
+ // WebAssembly targets.
+ TargetRegistry::RegisterMCDisassembler(getTheWebAssemblyTarget32(),
+ createWebAssemblyDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheWebAssemblyTarget64(),
+ createWebAssemblyDisassembler);
+}
+
+/// Decode one instruction from \p Bytes into \p MI.
+///
+/// The encoding read here is a sequence of little-endian 64-bit words: the
+/// opcode, then (for variadic instructions only) the number of extra
+/// operands, then one word per fixed operand described by the MCInstrDesc,
+/// then one word per extra operand.
+///
+/// \returns Success with \p Size set to the number of bytes consumed, or Fail
+/// with \p Size left at 0 if the buffer is too short or the opcode is out of
+/// range.
+MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
+    MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
+    raw_ostream &OS, raw_ostream &CS) const {
+  Size = 0;
+  uint64_t Pos = 0;
+
+  // Fetch the next 64-bit word into Out and advance past it. Returns false,
+  // leaving Pos untouched, when fewer than eight bytes remain.
+  auto ReadWord = [&](uint64_t &Out) {
+    if (Pos + sizeof(uint64_t) > Bytes.size())
+      return false;
+    Out = support::endian::read64le(Bytes.data() + Pos);
+    Pos += sizeof(uint64_t);
+    return true;
+  };
+
+  // Read the opcode.
+  uint64_t Opcode = 0;
+  if (!ReadWord(Opcode))
+    return MCDisassembler::Fail;
+
+  if (Opcode >= WebAssembly::INSTRUCTION_LIST_END)
+    return MCDisassembler::Fail;
+
+  MI.setOpcode(Opcode);
+  const MCInstrDesc &Desc = MCII->get(Opcode);
+  unsigned NumFixedOperands = Desc.NumOperands;
+
+  // If it's variadic, read the number of extra operands. The count is kept at
+  // its full 64-bit width: narrowing it could turn an oversized count into a
+  // small one and let a malformed encoding decode successfully.
+  uint64_t NumExtraOperands = 0;
+  if (Desc.isVariadic() && !ReadWord(NumExtraOperands))
+    return MCDisassembler::Fail;
+
+  // Read the fixed operands. These are described by the MCInstrDesc.
+  for (unsigned i = 0; i < NumFixedOperands; ++i) {
+    const MCOperandInfo &Info = Desc.OpInfo[i];
+    uint64_t Word = 0;
+    switch (Info.OperandType) {
+    case MCOI::OPERAND_IMMEDIATE:
+    case WebAssembly::OPERAND_LOCAL:
+    case WebAssembly::OPERAND_P2ALIGN:
+    case WebAssembly::OPERAND_BASIC_BLOCK:
+      if (!ReadWord(Word))
+        return MCDisassembler::Fail;
+      MI.addOperand(MCOperand::createImm(Word));
+      break;
+    case MCOI::OPERAND_REGISTER:
+      if (!ReadWord(Word))
+        return MCDisassembler::Fail;
+      MI.addOperand(MCOperand::createReg(Word));
+      break;
+    case WebAssembly::OPERAND_F32IMM:
+    case WebAssembly::OPERAND_F64IMM: {
+      // TODO: MC converts all floating point immediate operands to double.
+      // This is fine for numeric values, but may cause NaNs to change bits.
+      if (!ReadWord(Word))
+        return MCDisassembler::Fail;
+      // Reinterpret the raw bits as a double.
+      double Imm;
+      memcpy(&Imm, &Word, sizeof(Imm));
+      MI.addOperand(MCOperand::createFPImm(Imm));
+      break;
+    }
+    default:
+      llvm_unreachable("unimplemented operand kind");
+    }
+  }
+
+  // Read the extra operands. They are all immediates or all registers,
+  // depending on the instruction's VariableOpIsImmediate flag.
+  assert(NumExtraOperands == 0 || Desc.isVariadic());
+  const bool ExtraAreImmediates =
+      (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate) != 0;
+  for (uint64_t i = 0; i < NumExtraOperands; ++i) {
+    uint64_t Word = 0;
+    if (!ReadWord(Word))
+      return MCDisassembler::Fail;
+    if (ExtraAreImmediates)
+      MI.addOperand(MCOperand::createImm(Word));
+    else
+      MI.addOperand(MCOperand::createReg(Word));
+  }
+
+  Size = Pos;
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
new file mode 100644
index 000000000000..0af13cffdb04
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -0,0 +1,244 @@
+//=- WebAssemblyInstPrinter.cpp - WebAssembly assembly instruction printing -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Print MCInst instructions to wasm format.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "WebAssemblyGenAsmWriter.inc"
+
+/// Construct the printer, forwarding \p MAI, \p MII and \p MRI to
+/// MCInstPrinter and starting control-flow label numbering at zero.
+WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI), ControlFlowCounter(0) {}
+
+/// Write register \p RegNo to \p OS as a '$' followed by its number.
+void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
+                                          unsigned RegNo) const {
+  // The "unused" sentinel is never a printable register.
+  assert(RegNo != WebAssemblyFunctionInfo::UnusedReg);
+  // Note that there's an implicit get_local/set_local here!
+  OS << "$";
+  OS << RegNo;
+}
+
+/// Print \p MI to \p OS, then any variadic operands, then \p Annot.
+///
+/// When a comment stream is attached, block/loop nesting is tracked in
+/// ControlFlowStack so that label operands can be annotated with the label
+/// they resolve to. Unbalanced end markers and out-of-range depths are
+/// reported as annotations instead of popping or indexing past the stack.
+void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                       StringRef Annot,
+                                       const MCSubtargetInfo & /*STI*/) {
+  // Print the instruction (this uses the AsmStrings from the .td files).
+  printInstruction(MI, OS);
+
+  // Print any additional variadic operands.
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  if (Desc.isVariadic())
+    for (auto i = Desc.getNumOperands(), e = MI->getNumOperands(); i < e; ++i) {
+      // FIXME: For CALL_INDIRECT_VOID, don't print a leading comma, because
+      // we have an extra flags operand which is not currently printed, for
+      // compatibility reasons.
+      if (i != 0 &&
+          (MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID ||
+           i != Desc.getNumOperands()))
+        OS << ", ";
+      printOperand(MI, i, OS);
+    }
+
+  // Print any added annotation.
+  printAnnotation(OS, Annot);
+
+  if (CommentStream) {
+    // Observe any effects on the control flow stack, for use in annotating
+    // control flow label references.
+    switch (MI->getOpcode()) {
+    default:
+      break;
+    case WebAssembly::LOOP: {
+      printAnnotation(OS, "label" + utostr(ControlFlowCounter) + ':');
+      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, true));
+      break;
+    }
+    case WebAssembly::BLOCK:
+      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
+      break;
+    case WebAssembly::END_LOOP:
+      // A stray end marker must not pop an empty stack.
+      if (ControlFlowStack.empty())
+        printAnnotation(OS, "End marker mismatch!");
+      else
+        ControlFlowStack.pop_back();
+      break;
+    case WebAssembly::END_BLOCK:
+      if (ControlFlowStack.empty())
+        printAnnotation(OS, "End marker mismatch!");
+      else
+        printAnnotation(
+            OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
+      break;
+    }
+
+    // Annotate any control flow label references.
+    unsigned NumFixedOperands = Desc.NumOperands;
+    SmallSet<uint64_t, 8> Printed;
+    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
+      if (!(i < NumFixedOperands
+                ? (Desc.OpInfo[i].OperandType ==
+                   WebAssembly::OPERAND_BASIC_BLOCK)
+                : (Desc.TSFlags & WebAssemblyII::VariableOpImmediateIsLabel)))
+        continue;
+      uint64_t Depth = MI->getOperand(i).getImm();
+      // Annotate each distinct depth only once per instruction.
+      if (!Printed.insert(Depth).second)
+        continue;
+      // The depth counts enclosing scopes from the innermost one; reject
+      // values that would index past the bottom of the stack.
+      if (Depth >= ControlFlowStack.size()) {
+        printAnnotation(OS, "Invalid depth argument!");
+        continue;
+      }
+      const auto &Pair = ControlFlowStack.rbegin()[Depth];
+      printAnnotation(OS, utostr(Depth) + ": " + (Pair.second ? "up" : "down") +
+                              " to label" + utostr(Pair.first));
+    }
+  }
+}
+
+/// Convert \p FP to its assembly spelling.
+///
+/// NaNs whose bit pattern differs from both the positive and the negative
+/// default quiet NaN are spelled "[-]nan:0x<fraction bits>"; every other value
+/// uses C99 hexadecimal floating-point notation.
+static std::string toString(const APFloat &FP) {
+  if (FP.isNaN()) {
+    const auto &Sem = FP.getSemantics();
+    const bool IsDefaultQNaN =
+        FP.bitwiseIsEqual(APFloat::getQNaN(Sem)) ||
+        FP.bitwiseIsEqual(APFloat::getQNaN(Sem, /*Negative=*/true));
+    if (!IsDefaultQNaN) {
+      // Keep the low 23 bits for 32-bit values and the low 52 bits otherwise.
+      const APInt Bits = FP.bitcastToAPInt();
+      const uint64_t FractionMask = Bits.getBitWidth() == 32
+                                        ? INT64_C(0x007fffff)
+                                        : INT64_C(0x000fffffffffffff);
+      std::string Text(Bits.isNegative() ? "-" : "");
+      Text += "nan:0x";
+      Text += utohexstr(Bits.getZExtValue() & FractionMask,
+                        /*LowerCase=*/true);
+      return Text;
+    }
+  }
+
+  // Everything else is printed as a C99 hexadecimal float.
+  static const size_t Capacity = 128;
+  char Storage[Capacity];
+  auto Length =
+      FP.convertToHexString(Storage, /*hexDigits=*/0, /*upperCase=*/false,
+                            APFloat::rmNearestTiesToEven);
+  (void)Length;
+  assert(Length != 0);
+  assert(Length < Capacity);
+  return Storage;
+}
+
+/// Print operand \p OpNo of \p MI to \p O.
+///
+/// Registers print as "$N", "$popN", "$pushN" or "$drop" (with a trailing '='
+/// on defs), integer immediates as decimal, floating-point immediates through
+/// toString(), and anything else as an MCExpr.
+void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  const bool IsFixedOperand = OpNo < Desc.getNumOperands();
+  (void)IsFixedOperand; // Only consulted by assertions.
+
+  if (Op.isReg()) {
+    assert((IsFixedOperand || Desc.TSFlags == 0) &&
+           "WebAssembly variable_ops register ops don't use TSFlags");
+    const unsigned WAReg = Op.getReg();
+    const bool IsDef = OpNo < Desc.getNumDefs();
+    if (int(WAReg) >= 0) {
+      printRegName(O, WAReg);
+    } else if (!IsDef) {
+      O << "$pop" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
+    } else if (WAReg != WebAssemblyFunctionInfo::UnusedReg) {
+      O << "$push" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
+    } else {
+      O << "$drop";
+    }
+    // Add a '=' suffix if this is a def.
+    if (IsDef)
+      O << '=';
+    return;
+  }
+
+  if (Op.isImm()) {
+    assert((IsFixedOperand ||
+            (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate)) &&
+           "WebAssemblyII::VariableOpIsImmediate should be set for "
+           "variable_ops immediate ops");
+    // TODO: (MII.get(MI->getOpcode()).TSFlags &
+    //        WebAssemblyII::VariableOpImmediateIsLabel)
+    // can tell us whether this is an immediate referencing a label in the
+    // control flow stack, and it may be nice to pretty-print.
+    O << Op.getImm();
+    return;
+  }
+
+  if (Op.isFPImm()) {
+    assert(IsFixedOperand &&
+           "Unexpected floating-point immediate as a non-fixed operand");
+    assert(Desc.TSFlags == 0 &&
+           "WebAssembly variable_ops floating point ops don't use TSFlags");
+    const MCOperandInfo &Info = Desc.OpInfo[OpNo];
+    if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
+      // TODO: MC converts all floating point immediate operands to double.
+      // This is fine for numeric values, but may cause NaNs to change bits.
+      O << toString(APFloat(float(Op.getFPImm())));
+    } else {
+      assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
+      O << toString(APFloat(Op.getFPImm()));
+    }
+    return;
+  }
+
+  // The only remaining operand kind is an expression.
+  assert((IsFixedOperand ||
+          (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate)) &&
+         "WebAssemblyII::VariableOpIsImmediate should be set for "
+         "variable_ops expr ops");
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  Op.getExpr()->print(O, &MAI);
+}
+
+/// Print the p2align operand \p OpNo of \p MI as ":p2align=N", or nothing at
+/// all when it equals the default alignment for the instruction's opcode.
+void
+WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
+                                                       unsigned OpNo,
+                                                       raw_ostream &O) {
+  const int64_t P2Align = MI->getOperand(OpNo).getImm();
+  if (P2Align != WebAssembly::GetDefaultP2Align(MI->getOpcode()))
+    O << ":p2align=" << P2Align;
+}
+
+/// Print the block/loop signature held in operand \p OpNo of \p MI as its
+/// type name. A Void signature prints nothing.
+void
+WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
+                                                         unsigned OpNo,
+                                                         raw_ostream &O) {
+  // Stays null for signatures that have no textual form.
+  const char *Name = nullptr;
+  switch (WebAssembly::ExprType(MI->getOperand(OpNo).getImm())) {
+  case WebAssembly::ExprType::Void:
+    break;
+  case WebAssembly::ExprType::I32:
+    Name = "i32";
+    break;
+  case WebAssembly::ExprType::I64:
+    Name = "i64";
+    break;
+  case WebAssembly::ExprType::F32:
+    Name = "f32";
+    break;
+  case WebAssembly::ExprType::F64:
+    Name = "f64";
+    break;
+  case WebAssembly::ExprType::I8x16:
+    Name = "i8x16";
+    break;
+  case WebAssembly::ExprType::I16x8:
+    Name = "i16x8";
+    break;
+  case WebAssembly::ExprType::I32x4:
+    Name = "i32x4";
+    break;
+  case WebAssembly::ExprType::F32x4:
+    Name = "f32x4";
+    break;
+  case WebAssembly::ExprType::B8x16:
+    Name = "b8x16";
+    break;
+  case WebAssembly::ExprType::B16x8:
+    Name = "b16x8";
+    break;
+  case WebAssembly::ExprType::B32x4:
+    Name = "b32x4";
+    break;
+  }
+  if (Name)
+    O << Name;
+}
+
+/// Return the assembly type name for \p Ty. The four 128-bit vector types all
+/// map to "v128"; any other unlisted type is a programming error.
+const char *llvm::WebAssembly::TypeToString(MVT Ty) {
+  const auto Simple = Ty.SimpleTy;
+  if (Simple == MVT::i32)
+    return "i32";
+  if (Simple == MVT::i64)
+    return "i64";
+  if (Simple == MVT::f32)
+    return "f32";
+  if (Simple == MVT::f64)
+    return "f64";
+  if (Simple == MVT::v16i8 || Simple == MVT::v8i16 || Simple == MVT::v4i32 ||
+      Simple == MVT::v4f32)
+    return "v128";
+  llvm_unreachable("unsupported type");
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
new file mode 100644
index 000000000000..d11f99c1ff39
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -0,0 +1,58 @@
+// WebAssemblyInstPrinter.h - Print wasm MCInst to assembly syntax -*- C++ -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This class prints a WebAssembly MCInst to wasm file syntax.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCSubtargetInfo;
+
+/// Prints WebAssembly MCInsts as assembly text and, while doing so, keeps
+/// track of block/loop nesting so label references can be annotated.
+class WebAssemblyInstPrinter final : public MCInstPrinter {
+ // Number handed to the next LOOP or BLOCK seen by printInst.
+ uint64_t ControlFlowCounter;
+ // Open scopes, innermost last: (label number, true if the scope is a loop).
+ SmallVector<std::pair<uint64_t, bool>, 0> ControlFlowStack;
+
+public:
+ WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI);
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+
+ // Used by tblegen code.
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O);
+ void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+};
+
+namespace WebAssembly {
+
+/// Return the assembly-syntax name of the value type \p Ty.
+const char *TypeToString(MVT Ty);
+
+} // end namespace WebAssembly
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
new file mode 100644
index 000000000000..97454a824a34
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -0,0 +1,105 @@
+//===-- WebAssemblyAsmBackend.cpp - WebAssembly Assembler Backend ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblyAsmBackend class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+/// Assembler backend for WebAssembly: applies generic fixups, never relaxes
+/// instructions, and pads with WebAssembly::Nop bytes.
+class WebAssemblyAsmBackend final : public MCAsmBackend {
+ // Forwarded to the ELF object writer created by createObjectWriter().
+ bool Is64Bit;
+
+public:
+ explicit WebAssemblyAsmBackend(bool Is64Bit)
+ : MCAsmBackend(), Is64Bit(Is64Bit) {}
+ ~WebAssemblyAsmBackend() override {}
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ // No instruction requires relaxation
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ return false;
+ }
+
+ unsigned getNumFixupKinds() const override {
+ // We currently just use the generic fixups in MCFixup.h and don't have any
+ // target-specific fixups.
+ return 0;
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+ // Intentionally empty: mayNeedRelaxation() always returns false.
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+/// Emit \p Count padding bytes to \p OW, each one a WebAssembly::Nop.
+/// Always reports success; a zero \p Count writes nothing.
+bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
+                                         MCObjectWriter *OW) const {
+  for (uint64_t Left = Count; Left != 0; --Left)
+    OW->write8(WebAssembly::Nop);
+  return true;
+}
+
+/// OR the resolved fixup \p Value into the bytes of \p Data that \p Fixup
+/// covers, least significant byte first. A zero value leaves the data as is.
+void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                       unsigned DataSize, uint64_t Value,
+                                       bool IsPCRel) const {
+  const MCFixupKindInfo &KindInfo = getFixupKindInfo(Fixup.getKind());
+  assert(KindInfo.Flags == 0 &&
+         "WebAssembly does not use MCFixupKindInfo flags");
+
+  // OR-ing in zero would not change the encoding.
+  if (Value == 0)
+    return;
+
+  // Round the fixup's bit width up to whole bytes.
+  const unsigned ByteCount = (KindInfo.TargetSize + 7) / 8;
+  const unsigned Start = Fixup.getOffset();
+  assert(Start + ByteCount <= DataSize && "Invalid fixup offset!");
+
+  // Shift the value into position, then merge it in one byte at a time.
+  const uint64_t Positioned = Value << KindInfo.TargetOffset;
+  char *Dest = Data + Start;
+  for (unsigned Idx = 0; Idx != ByteCount; ++Idx)
+    Dest[Idx] |= uint8_t((Positioned >> (Idx * 8)) & 0xff);
+}
+
+/// Create the WebAssembly ELF object writer for \p OS, passing this backend's
+/// 64-bit flag and an OSABI value of 0.
+MCObjectWriter *
+WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+ return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
+}
+} // end anonymous namespace
+
+/// Allocate a WebAssemblyAsmBackend whose 64-bit flag is taken from \p TT.
+/// The caller receives a raw pointer to the new object.
+MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) {
+ return new WebAssemblyAsmBackend(TT.isArch64Bit());
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
new file mode 100644
index 000000000000..2146f67959b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
@@ -0,0 +1,67 @@
+//===-- WebAssemblyELFObjectWriter.cpp - WebAssembly ELF Writer -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file handles ELF-specific object emission, converting LLVM's
+/// internal fixups into the appropriate relocations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+namespace {
+/// ELF target writer for WebAssembly; its only customization is the mapping
+/// from fixups to relocation types in getRelocType().
+class WebAssemblyELFObjectWriter final : public MCELFObjectTargetWriter {
+public:
+ WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI);
+
+protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+} // end anonymous namespace
+
+/// Configure the base writer for the EM_WEBASSEMBLY machine type, with
+/// relocations that carry no addend.
+WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit,
+ uint8_t OSABI)
+ : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY,
+ /*HasRelocationAddend=*/false) {}
+
+/// Choose the ELF relocation type for \p Fixup.
+///
+/// A symbol reference of kind VK_WebAssembly_FUNCTION yields
+/// R_WEBASSEMBLY_FUNCTION. Otherwise 4- and 8-byte data fixups yield
+/// R_WEBASSEMBLY_DATA, and any other fixup kind is unimplemented.
+unsigned WebAssemblyELFObjectWriter::getRelocType(MCContext &Ctx,
+                                                  const MCValue &Target,
+                                                  const MCFixup &Fixup,
+                                                  bool IsPCRel) const {
+  // WebAssembly functions are not allocated in the address space. To resolve a
+  // pointer to a function, we must use a special relocation type.
+  const auto *SymRef = dyn_cast<MCSymbolRefExpr>(Fixup.getValue());
+  if (SymRef && SymRef->getKind() == MCSymbolRefExpr::VK_WebAssembly_FUNCTION)
+    return ELF::R_WEBASSEMBLY_FUNCTION;
+
+  const auto Kind = Fixup.getKind();
+  if (Kind == FK_Data_4) {
+    assert(!is64Bit() && "4-byte relocations only supported on wasm32");
+    return ELF::R_WEBASSEMBLY_DATA;
+  }
+  if (Kind == FK_Data_8) {
+    assert(is64Bit() && "8-byte relocations only supported on wasm64");
+    return ELF::R_WEBASSEMBLY_DATA;
+  }
+  llvm_unreachable("unimplemented fixup kind");
+}
+
+/// Build a little-endian ELF object writer on \p OS, driven by a freshly
+/// allocated WebAssemblyELFObjectWriter.
+MCObjectWriter *llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
+                                                       bool Is64Bit,
+                                                       uint8_t OSABI) {
+  MCELFObjectTargetWriter *TargetWriter =
+      new WebAssemblyELFObjectWriter(Is64Bit, OSABI);
+  const bool IsLittleEndian = true;
+  return createELFObjectWriter(TargetWriter, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
new file mode 100644
index 000000000000..d8c39216c53b
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -0,0 +1,53 @@
+//===-- WebAssemblyMCAsmInfo.cpp - WebAssembly asm properties -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declarations of the WebAssemblyMCAsmInfo
+/// properties.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-mc-asm-info"
+
+// Out-of-line destructor with an empty body.
+WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
+
+/// Set the assembly-syntax properties for WebAssembly. Pointer and
+/// callee-save slot sizes are 8 bytes for a 64-bit triple \p T, otherwise 4.
+WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
+ PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
+
+ // TODO: What should MaxInstLength be?
+
+ UseDataRegionDirectives = true;
+
+ // Use .skip instead of .zero because .zero is confusing when used with two
+ // arguments (it doesn't actually zero things out).
+ ZeroDirective = "\t.skip\t";
+
+ Data8bitsDirective = "\t.int8\t";
+ Data16bitsDirective = "\t.int16\t";
+ Data32bitsDirective = "\t.int32\t";
+ Data64bitsDirective = "\t.int64\t";
+
+ AlignmentIsInBytes = false;
+ COMMDirectiveAlignmentIsInBytes = false;
+ LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
+
+ SupportsDebugInformation = true;
+
+ // For now, WebAssembly does not support exceptions.
+ ExceptionsType = ExceptionHandling::None;
+
+ // TODO: UseIntegratedAssembler?
+
+ // WebAssembly's stack is never executable.
+ // NOTE(review): the flag is cleared rather than set; presumably this
+ // suppresses a non-executable-stack marker section - confirm against
+ // MCAsmInfo.
+ UsesNonexecutableStackSection = false;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
new file mode 100644
index 000000000000..2dcf2cd3c892
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
@@ -0,0 +1,32 @@
+//===-- WebAssemblyMCAsmInfo.h - WebAssembly asm properties -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declaration of the WebAssemblyMCAsmInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+
+class Triple;
+
+/// Assembly-syntax properties for WebAssembly, layered on MCAsmInfoELF.
+class WebAssemblyMCAsmInfo final : public MCAsmInfoELF {
+public:
+ /// Configure the properties for the target triple \p T.
+ explicit WebAssemblyMCAsmInfo(const Triple &T);
+ ~WebAssemblyMCAsmInfo() override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
new file mode 100644
index 000000000000..d0e0eecd3002
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -0,0 +1,121 @@
+//=- WebAssemblyMCCodeEmitter.cpp - Convert WebAssembly code to machine code -//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblyMCCodeEmitter class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumFixups, "Number of MC fixups created.");
+
+namespace {
+/// Encodes WebAssembly MCInsts as an opcode byte followed by their operands.
+class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
+ // Used to look up the instruction description of each encoded opcode.
+ const MCInstrInfo &MCII;
+
+ // Implementation generated by tablegen.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+public:
+ explicit WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+};
+} // end anonymous namespace
+
+/// Allocate a WebAssemblyMCCodeEmitter bound to \p MCII. The caller receives
+/// a raw pointer to the new object.
+MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) {
+ return new WebAssemblyMCCodeEmitter(MCII);
+}
+
+/// Encode \p MI into \p OS as a single opcode byte followed by its operands.
+///
+/// Register operands contribute no bytes. Integer immediates are written as
+/// LEB128 (signed for I32/I64 immediate operands, unsigned otherwise).
+/// Floating-point immediates are written as little-endian float or double.
+/// Expression operands record a fixup in \p Fixups at the operand's offset
+/// and emit an all-ones ULEB128 placeholder.
+void WebAssemblyMCCodeEmitter::encodeInstruction(
+    const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  uint64_t Start = OS.tell();
+
+  uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+  // UINT8_MAX itself still fits in the single opcode byte written below.
+  assert(Binary <= UINT8_MAX && "Multi-byte opcodes not supported yet");
+  OS << uint8_t(Binary);
+
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
+    const MCOperand &MO = MI.getOperand(i);
+    if (MO.isReg()) {
+      /* nothing to encode */
+    } else if (MO.isImm()) {
+      if (i < Desc.getNumOperands()) {
+        assert(Desc.TSFlags == 0 &&
+               "WebAssembly non-variable_ops don't use TSFlags");
+        const MCOperandInfo &Info = Desc.OpInfo[i];
+        if (Info.OperandType == WebAssembly::OPERAND_I32IMM) {
+          encodeSLEB128(int32_t(MO.getImm()), OS);
+        } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
+          encodeSLEB128(int64_t(MO.getImm()), OS);
+        } else {
+          encodeULEB128(uint64_t(MO.getImm()), OS);
+        }
+      } else {
+        assert(Desc.TSFlags == (WebAssemblyII::VariableOpIsImmediate |
+                                WebAssemblyII::VariableOpImmediateIsLabel));
+        encodeULEB128(uint64_t(MO.getImm()), OS);
+      }
+    } else if (MO.isFPImm()) {
+      assert(i < Desc.getNumOperands() &&
+             "Unexpected floating-point immediate as a non-fixed operand");
+      assert(Desc.TSFlags == 0 &&
+             "WebAssembly variable_ops floating point ops don't use TSFlags");
+      const MCOperandInfo &Info = Desc.OpInfo[i];
+      if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
+        // TODO: MC converts all floating point immediate operands to double.
+        // This is fine for numeric values, but may cause NaNs to change bits.
+        float f = float(MO.getFPImm());
+        support::endian::Writer<support::little>(OS).write<float>(f);
+      } else {
+        assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
+        double d = MO.getFPImm();
+        support::endian::Writer<support::little>(OS).write<double>(d);
+      }
+    } else if (MO.isExpr()) {
+      // The fixup offset is relative to the start of this instruction.
+      const bool Is64Bit = STI.getTargetTriple().isArch64Bit();
+      Fixups.push_back(MCFixup::create(OS.tell() - Start, MO.getExpr(),
+                                       Is64Bit ? FK_Data_8 : FK_Data_4,
+                                       MI.getLoc()));
+      ++MCNumFixups;
+      // Emit an all-ones placeholder of the pointer width.
+      encodeULEB128(Is64Bit ? UINT64_MAX : uint64_t(UINT32_MAX), OS);
+    } else {
+      llvm_unreachable("unexpected operand kind");
+    }
+  }
+
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+#include "WebAssemblyGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
new file mode 100644
index 000000000000..3dc1ded17116
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -0,0 +1,146 @@
+//===-- WebAssemblyMCTargetDesc.cpp - WebAssembly Target Descriptions -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file provides WebAssembly-specific target descriptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyMCTargetDesc.h"
+#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "WebAssemblyMCAsmInfo.h"
+#include "WebAssemblyTargetStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-mc-target-desc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "WebAssemblyGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "WebAssemblyGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "WebAssemblyGenRegisterInfo.inc"
+
+// Factory callback: allocate the WebAssembly asm info for triple TT. The
+// register info argument is ignored.
+static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
+ const Triple &TT) {
+ return new WebAssemblyMCAsmInfo(TT);
+}
+
+// Validate the requested code model. Default and JITDefault are treated as
+// Large; any other non-Large model is a fatal error. CM is only read here and
+// is never written back.
+static void adjustCodeGenOpts(const Triple & /*TT*/, Reloc::Model /*RM*/,
+                              CodeModel::Model &CM) {
+  const bool IsDefaulted =
+      CM == CodeModel::Default || CM == CodeModel::JITDefault;
+  if (!IsDefaulted && CM != CodeModel::Large)
+    report_fatal_error("Non-large code models are not supported yet");
+}
+
+// Factory callback: allocate an MCInstrInfo and fill it in from the
+// tablegen-generated WebAssembly instruction tables.
+static MCInstrInfo *createMCInstrInfo() {
+  MCInstrInfo *InstrInfo = new MCInstrInfo();
+  InitWebAssemblyMCInstrInfo(InstrInfo);
+  return InstrInfo;
+}
+
+// Factory callback: allocate an MCRegisterInfo and fill it in from the
+// tablegen-generated WebAssembly register tables. The triple is ignored.
+static MCRegisterInfo *createMCRegisterInfo(const Triple & /*T*/) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  InitWebAssemblyMCRegisterInfo(RegInfo, 0);
+  return RegInfo;
+}
+
+// Factory callback: allocate the WebAssembly instruction printer. Only
+// syntax variant 0 exists; the triple is ignored.
+static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ assert(SyntaxVariant == 0 && "WebAssembly only has one syntax variant");
+ return new WebAssemblyInstPrinter(MAI, MII, MRI);
+}
+
+// Factory callback: create the WebAssembly code emitter from MCII. The
+// register info and context arguments are ignored.
+static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo & /*MRI*/,
+ MCContext & /*Ctx*/) {
+ return createWebAssemblyMCCodeEmitter(MCII);
+}
+
+// Factory callback: create the WebAssembly asm backend for triple TT. All
+// other arguments are ignored.
+static MCAsmBackend *createAsmBackend(const Target & /*T*/,
+ const MCRegisterInfo & /*MRI*/,
+ const Triple &TT, StringRef /*CPU*/,
+ const MCTargetOptions & /*Options*/) {
+ return createWebAssemblyAsmBackend(TT);
+}
+
+// Factory callback: forward to the tablegen-generated subtarget info
+// constructor with the given triple, CPU and feature string.
+static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU,
+ StringRef FS) {
+ return createWebAssemblyMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+// Factory callback: allocate the ELF target streamer attached to S. The
+// subtarget info is ignored.
+static MCTargetStreamer *
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo & /*STI*/) {
+ return new WebAssemblyTargetELFStreamer(S);
+}
+
+// Factory callback: allocate the textual-assembly target streamer attached to
+// S and writing to OS. The printer and verbosity arguments are ignored.
+static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter * /*InstPrint*/,
+ bool /*isVerboseAsm*/) {
+ return new WebAssemblyTargetAsmStreamer(S, OS);
+}
+
+// Force static initialization.
+// Register every MC-layer factory defined in this file with the target
+// registry, once for each of the two WebAssembly targets (32 and 64).
+extern "C" void LLVMInitializeWebAssemblyTargetMC() {
+ for (Target *T :
+ {&getTheWebAssemblyTarget32(), &getTheWebAssemblyTarget64()}) {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(*T, createMCAsmInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(*T, createMCInstrInfo);
+
+ // Register the MC codegen info.
+ TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(*T, createMCRegisterInfo);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(*T, createMCInstPrinter);
+
+ // Register the MC code emitter.
+ TargetRegistry::RegisterMCCodeEmitter(*T, createCodeEmitter);
+
+ // Register the ASM Backend.
+ TargetRegistry::RegisterMCAsmBackend(*T, createAsmBackend);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(*T, createMCSubtargetInfo);
+
+ // Register the object target streamer.
+ TargetRegistry::RegisterObjectTargetStreamer(*T,
+ createObjectTargetStreamer);
+ // Register the asm target streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer);
+ }
+}
+
+/// Map the scalar machine value type \p Ty to the matching
+/// WebAssembly::ValType. Only i32, i64, f32 and f64 are accepted.
+WebAssembly::ValType WebAssembly::toValType(const MVT &Ty) {
+  const auto Simple = Ty.SimpleTy;
+  if (Simple == MVT::i32)
+    return WebAssembly::ValType::I32;
+  if (Simple == MVT::i64)
+    return WebAssembly::ValType::I64;
+  if (Simple == MVT::f32)
+    return WebAssembly::ValType::F32;
+  if (Simple == MVT::f64)
+    return WebAssembly::ValType::F64;
+  llvm_unreachable("unexpected type");
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
new file mode 100644
index 000000000000..8583b772deab
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -0,0 +1,182 @@
+//==- WebAssemblyMCTargetDesc.h - WebAssembly Target Descriptions -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file provides WebAssembly-specific target descriptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H
+
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCSubtargetInfo;
+class MVT;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+
+Target &getTheWebAssemblyTarget32();
+Target &getTheWebAssemblyTarget64();
+
+MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
+
+MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
+
+MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit, uint8_t OSABI);
+
+namespace WebAssembly {
+enum OperandType {
+ /// Basic block label in a branch construct.
+ OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET,
+ /// Local index.
+ OPERAND_LOCAL,
+ /// 32-bit integer immediates.
+ OPERAND_I32IMM,
+ /// 64-bit integer immediates.
+ OPERAND_I64IMM,
+ /// 32-bit floating-point immediates.
+ OPERAND_F32IMM,
+ /// 64-bit floating-point immediates.
+ OPERAND_F64IMM,
+ /// 32-bit unsigned function indices.
+ OPERAND_FUNCTION32,
+ /// 32-bit unsigned memory offsets.
+ OPERAND_OFFSET32,
+ /// p2align immediate for load and store address alignment.
+ OPERAND_P2ALIGN,
+ /// signature immediate for block/loop.
+ OPERAND_SIGNATURE
+};
+} // end namespace WebAssembly
+
+namespace WebAssemblyII {
+enum {
+ // For variadic instructions, this flag indicates whether an operand
+ // in the variable_ops range is an immediate value.
+ VariableOpIsImmediate = (1 << 0),
+ // For immediate values in the variable_ops range, this flag indicates
+ // whether the value represents a control-flow label.
+ VariableOpImmediateIsLabel = (1 << 1)
+};
+} // end namespace WebAssemblyII
+
+} // end namespace llvm
+
+// Defines symbolic names for WebAssembly registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "WebAssemblyGenRegisterInfo.inc"
+
+// Defines symbolic names for the WebAssembly instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "WebAssemblyGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "WebAssemblyGenSubtargetInfo.inc"
+
+namespace llvm {
+namespace WebAssembly {
+
+/// Return the default p2align value for a load or store with the given opcode.
+inline unsigned GetDefaultP2Align(unsigned Opcode) {
+ switch (Opcode) {
+ case WebAssembly::LOAD8_S_I32:
+ case WebAssembly::LOAD8_U_I32:
+ case WebAssembly::LOAD8_S_I64:
+ case WebAssembly::LOAD8_U_I64:
+ case WebAssembly::STORE8_I32:
+ case WebAssembly::STORE8_I64:
+ return 0;
+ case WebAssembly::LOAD16_S_I32:
+ case WebAssembly::LOAD16_U_I32:
+ case WebAssembly::LOAD16_S_I64:
+ case WebAssembly::LOAD16_U_I64:
+ case WebAssembly::STORE16_I32:
+ case WebAssembly::STORE16_I64:
+ return 1;
+ case WebAssembly::LOAD_I32:
+ case WebAssembly::LOAD_F32:
+ case WebAssembly::STORE_I32:
+ case WebAssembly::STORE_F32:
+ case WebAssembly::LOAD32_S_I64:
+ case WebAssembly::LOAD32_U_I64:
+ case WebAssembly::STORE32_I64:
+ return 2;
+ case WebAssembly::LOAD_I64:
+ case WebAssembly::LOAD_F64:
+ case WebAssembly::STORE_I64:
+ case WebAssembly::STORE_F64:
+ return 3;
+ default:
+ llvm_unreachable("Only loads and stores have p2align values");
+ }
+}
+
+/// The operand number of the load or store address in load/store instructions.
+static const unsigned LoadAddressOperandNo = 3;
+static const unsigned StoreAddressOperandNo = 2;
+
+/// The operand number of the load or store p2align in load/store instructions.
+static const unsigned LoadP2AlignOperandNo = 1;
+static const unsigned StoreP2AlignOperandNo = 0;
+
+/// This is used to indicate block signatures.
+enum class ExprType {
+ Void = 0x40,
+ I32 = 0x7f,
+ I64 = 0x7e,
+ F32 = 0x7d,
+ F64 = 0x7c,
+ I8x16 = 0x7b,
+ I16x8 = 0x7a,
+ I32x4 = 0x79,
+ F32x4 = 0x78,
+ B8x16 = 0x77,
+ B16x8 = 0x76,
+ B32x4 = 0x75
+};
+
+/// This is used to indicate local types.
+enum class ValType {
+ I32 = 0x7f,
+ I64 = 0x7e,
+ F32 = 0x7d,
+ F64 = 0x7c,
+ I8x16 = 0x7b,
+ I16x8 = 0x7a,
+ I32x4 = 0x79,
+ F32x4 = 0x78,
+ B8x16 = 0x77,
+ B16x8 = 0x76,
+ B32x4 = 0x75
+};
+
+/// Instruction opcodes emitted via means other than CodeGen.
+static const unsigned Nop = 0x01;
+static const unsigned End = 0x0b;
+
+ValType toValType(const MVT &Ty);
+
+} // end namespace WebAssembly
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
new file mode 100644
index 000000000000..3cee8b2a1844
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -0,0 +1,120 @@
+//==-- WebAssemblyTargetStreamer.cpp - WebAssembly Target Streamer Methods --=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines WebAssembly-specific target streamer classes.
+/// These are for implementing support for target-specific assembly directives.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTargetStreamer.h"
+#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S)
+ : MCTargetStreamer(S) {}
+
+WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
+ MCStreamer &S, formatted_raw_ostream &OS)
+ : WebAssemblyTargetStreamer(S), OS(OS) {}
+
+WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S)
+ : WebAssemblyTargetStreamer(S) {}
+
+static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) {
+ bool First = true;
+ for (MVT Type : Types) {
+ if (First)
+ First = false;
+ else
+ OS << ", ";
+ OS << WebAssembly::TypeToString(Type);
+ }
+ OS << '\n';
+}
+
+void WebAssemblyTargetAsmStreamer::emitParam(ArrayRef<MVT> Types) {
+ OS << "\t.param \t";
+ PrintTypes(OS, Types);
+}
+
+void WebAssemblyTargetAsmStreamer::emitResult(ArrayRef<MVT> Types) {
+ OS << "\t.result \t";
+ PrintTypes(OS, Types);
+}
+
+void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
+ if (!Types.empty()) {
+ OS << "\t.local \t";
+ PrintTypes(OS, Types);
+ }
+}
+
+void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
+
+void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
+ StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
+ OS << "\t.functype\t" << name;
+ if (Results.empty())
+ OS << ", void";
+ else {
+ assert(Results.size() == 1);
+ OS << ", " << WebAssembly::TypeToString(Results.front());
+ }
+ for (auto Ty : Params)
+ OS << ", " << WebAssembly::TypeToString(Ty);
+ OS << '\n';
+}
+
+void WebAssemblyTargetAsmStreamer::emitGlobalImport(StringRef name) {
+ OS << "\t.import_global\t" << name << '\n';
+}
+
+void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
+ OS << "\t.indidx \t" << *Value << '\n';
+}
+
+void WebAssemblyTargetELFStreamer::emitParam(ArrayRef<MVT> Types) {
+ // Nothing to emit; params are declared as part of the function signature.
+}
+
+void WebAssemblyTargetELFStreamer::emitResult(ArrayRef<MVT> Types) {
+ // Nothing to emit; results are declared as part of the function signature.
+}
+
+void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) {
+ Streamer.EmitULEB128IntValue(Types.size());
+ for (MVT Type : Types)
+ Streamer.EmitIntValue(int64_t(WebAssembly::toValType(Type)), 1);
+}
+
+void WebAssemblyTargetELFStreamer::emitEndFunc() {
+ Streamer.EmitIntValue(WebAssembly::End, 1);
+}
+
+void WebAssemblyTargetELFStreamer::emitIndIdx(const MCExpr *Value) {
+ llvm_unreachable(".indidx encoding not yet implemented");
+}
+
+void WebAssemblyTargetELFStreamer::emitIndirectFunctionType(
+ StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
+ // Nothing to emit here. TODO: Re-design how linking works and re-evaluate
+ // whether it's necessary for .o files to declare indirect function types.
+}
+
+void WebAssemblyTargetELFStreamer::emitGlobalImport(StringRef name) {
+} \ No newline at end of file
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
new file mode 100644
index 000000000000..23ac3190243a
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -0,0 +1,88 @@
+//==-- WebAssemblyTargetStreamer.h - WebAssembly Target Streamer -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares WebAssembly-specific target streamer classes.
+/// These are for implementing support for target-specific assembly directives.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H
+
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class MCELFStreamer;
+
+/// WebAssembly-specific streamer interface, to implement support for
+/// WebAssembly-specific assembly directives.
+class WebAssemblyTargetStreamer : public MCTargetStreamer {
+public:
+ explicit WebAssemblyTargetStreamer(MCStreamer &S);
+
+ /// .param
+ virtual void emitParam(ArrayRef<MVT> Types) = 0;
+ /// .result
+ virtual void emitResult(ArrayRef<MVT> Types) = 0;
+ /// .local
+ virtual void emitLocal(ArrayRef<MVT> Types) = 0;
+ /// .endfunc
+ virtual void emitEndFunc() = 0;
+ /// .functype
+ virtual void emitIndirectFunctionType(StringRef name,
+ SmallVectorImpl<MVT> &Params,
+ SmallVectorImpl<MVT> &Results) {
+ llvm_unreachable("emitIndirectFunctionType not implemented");
+ }
+ /// .indidx
+ virtual void emitIndIdx(const MCExpr *Value) = 0;
+ /// .import_global
+ virtual void emitGlobalImport(StringRef name) = 0;
+};
+
+/// This part is for ASCII assembly output
+class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer {
+ formatted_raw_ostream &OS;
+
+public:
+ WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
+ void emitParam(ArrayRef<MVT> Types) override;
+ void emitResult(ArrayRef<MVT> Types) override;
+ void emitLocal(ArrayRef<MVT> Types) override;
+ void emitEndFunc() override;
+ void emitIndirectFunctionType(StringRef name,
+ SmallVectorImpl<MVT> &Params,
+ SmallVectorImpl<MVT> &Results) override;
+ void emitIndIdx(const MCExpr *Value) override;
+ void emitGlobalImport(StringRef name) override;
+};
+
+/// This part is for ELF object output
+class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer {
+public:
+ explicit WebAssemblyTargetELFStreamer(MCStreamer &S);
+
+ void emitParam(ArrayRef<MVT> Types) override;
+ void emitResult(ArrayRef<MVT> Types) override;
+ void emitLocal(ArrayRef<MVT> Types) override;
+ void emitEndFunc() override;
+ void emitIndirectFunctionType(StringRef name,
+ SmallVectorImpl<MVT> &Params,
+ SmallVectorImpl<MVT> &Results) override;
+ void emitIndIdx(const MCExpr *Value) override;
+ void emitGlobalImport(StringRef name) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/README.txt b/contrib/llvm/lib/Target/WebAssembly/README.txt
new file mode 100644
index 000000000000..64991ad14071
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/README.txt
@@ -0,0 +1,147 @@
+//===-- README.txt - Notes for WebAssembly code gen -----------------------===//
+
+This WebAssembly backend is presently under development.
+
+Currently the easiest way to use it is through Emscripten, which provides a
+compilation environment that includes standard libraries, tools, and packaging
+for producing WebAssembly applications that can run in browsers and other
+environments. For more information, see the Emscripten documentation in
+general, and this page in particular:
+ * https://github.com/kripken/emscripten/wiki/New-WebAssembly-Backend
+
+Other ways of using this backend, such as via a standalone "clang", are also
+under development, though they are not generally usable yet.
+
+For more information on WebAssembly itself, see the home page:
+ * https://webassembly.github.io/
+
+The following documents contain some information on the semantics and binary
+encoding of WebAssembly itself:
+ * https://github.com/WebAssembly/design/blob/master/Semantics.md
+ * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
+
+The backend is built, tested and archived on the following waterfall:
+ https://wasm-stat.us
+
+The backend's bringup is done in part by using the GCC torture test suite, since
+it doesn't require C library support. Current known failures are in
+known_gcc_test_failures.txt; all other tests should pass. The waterfall will
+turn red if not. Once most of these pass, further testing will use LLVM's own
+test suite. The tests can be run locally using:
+ https://github.com/WebAssembly/waterfall/blob/master/src/compile_torture_tests.py
+
+//===---------------------------------------------------------------------===//
+
+Br, br_if, and br_table instructions can support having a value on the value
+stack across the jump (sometimes). We should (a) model this, and (b) extend
+the stackifier to utilize it.
+
+//===---------------------------------------------------------------------===//
+
+The min/max instructions aren't exactly a<b?a:b because of NaN and negative zero
+behavior. The ARM target has the same kind of min/max instructions and has
+implemented optimizations for them; we should do similar optimizations for
+WebAssembly.
+
+//===---------------------------------------------------------------------===//
+
+AArch64 runs SeparateConstOffsetFromGEPPass, followed by EarlyCSE and LICM.
+Would these be useful to run for WebAssembly too? Also, it has an option to
+run SimplifyCFG after running the AtomicExpand pass. Would this be useful for
+us too?
+
+//===---------------------------------------------------------------------===//
+
+Register stackification uses the VALUE_STACK physical register to impose
+ordering dependencies on instructions with stack operands. This is pessimistic;
+we should consider alternate ways to model stack dependencies.
+
+//===---------------------------------------------------------------------===//
+
+Lots of things could be done in WebAssemblyTargetTransformInfo.cpp. Similarly,
+there are numerous optimization-related hooks that can be overridden in
+WebAssemblyTargetLowering.
+
+//===---------------------------------------------------------------------===//
+
+Instead of the OptimizeReturned pass, we should consider preserving the
+"returned" attribute through to MachineInstrs and extending the StoreResults
+pass to do this optimization on calls too. That would also let the
+WebAssemblyPeephole pass clean up dead defs for such calls, as it does for
+stores.
+
+//===---------------------------------------------------------------------===//
+
+Consider implementing optimizeSelect, optimizeCompareInstr, optimizeCondBranch,
+optimizeLoadInstr, and/or getMachineCombinerPatterns.
+
+//===---------------------------------------------------------------------===//
+
+Find a clean way to fix the problem which leads to the Shrink Wrapping pass
+being run after the WebAssembly PEI pass.
+
+//===---------------------------------------------------------------------===//
+
+When setting multiple local variables to the same constant, we currently get
+code like this:
+
+ i32.const $4=, 0
+ i32.const $3=, 0
+
+It could be done with a smaller encoding like this:
+
+ i32.const $push5=, 0
+ tee_local $push6=, $4=, $pop5
+ copy_local $3=, $pop6
+
+//===---------------------------------------------------------------------===//
+
+WebAssembly registers are implicitly initialized to zero. Explicit zeroing is
+therefore often redundant and could be optimized away.
+
+//===---------------------------------------------------------------------===//
+
+Small indices may use smaller encodings than large indices.
+WebAssemblyRegColoring and/or WebAssemblyRegRenumbering should sort registers
+according to their usage frequency to maximize the usage of smaller encodings.
+
+//===---------------------------------------------------------------------===//
+
+Many cases of irreducible control flow could be transformed more optimally
+than via the transform in WebAssemblyFixIrreducibleControlFlow.cpp.
+
+It may also be worthwhile to do transforms before register coloring,
+particularly when duplicating code, to allow register coloring to be aware of
+the duplication.
+
+//===---------------------------------------------------------------------===//
+
+WebAssemblyRegStackify could use AliasAnalysis to reorder loads and stores more
+aggressively.
+
+//===---------------------------------------------------------------------===//
+
+WebAssemblyRegStackify is currently a greedy algorithm. This means that, for
+example, a binary operator will stackify with its user before its operands.
+However, if moving the binary operator to its user moves it to a place where
+its operands can't be moved to, it would be better to leave it in place, or
+perhaps move it up, so that it can stackify its operands. A binary operator
+has two operands and one result, so in such cases there could be a net win by
+preferring the operands.
+
+//===---------------------------------------------------------------------===//
+
+Instruction ordering has a significant influence on register stackification and
+coloring. Consider experimenting with the MachineScheduler (enable via
+enableMachineScheduler) and determine if it can be configured to schedule
+instructions advantageously for this purpose.
+
+//===---------------------------------------------------------------------===//
+
+WebAssembly is now officially a stack machine, rather than an AST, and this
+comes with additional opportunities for WebAssemblyRegStackify. Specifically,
+the stack doesn't need to be empty after an instruction with no return values.
+WebAssemblyRegStackify could be extended, or possibly rewritten, to take
+advantage of the new opportunities.
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
new file mode 100644
index 000000000000..f310f0a44461
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -0,0 +1,36 @@
+//===-- WebAssemblyTargetInfo.cpp - WebAssembly Target Implementation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file registers the WebAssembly target.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-target-info"
+
+Target &llvm::getTheWebAssemblyTarget32() {
+ static Target TheWebAssemblyTarget32;
+ return TheWebAssemblyTarget32;
+}
+Target &llvm::getTheWebAssemblyTarget64() {
+ static Target TheWebAssemblyTarget64;
+ return TheWebAssemblyTarget64;
+}
+
+extern "C" void LLVMInitializeWebAssemblyTargetInfo() {
+ RegisterTarget<Triple::wasm32> X(getTheWebAssemblyTarget32(), "wasm32",
+ "WebAssembly 32-bit");
+ RegisterTarget<Triple::wasm64> Y(getTheWebAssemblyTarget64(), "wasm64",
+ "WebAssembly 64-bit");
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
new file mode 100644
index 000000000000..09c35b4825fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -0,0 +1,56 @@
+//===-- WebAssembly.h - Top-level interface for WebAssembly ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the entry points for global functions defined in
+/// the LLVM WebAssembly back-end.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLY_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLY_H
+
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class WebAssemblyTargetMachine;
+class ModulePass;
+class FunctionPass;
+
+// LLVM IR passes.
+ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool DoEH, bool DoSjLj);
+void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &);
+FunctionPass *createWebAssemblyOptimizeReturned();
+
+// ISel and immediate followup passes.
+FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+FunctionPass *createWebAssemblyArgumentMove();
+FunctionPass *createWebAssemblySetP2AlignOperands();
+
+// Late passes.
+FunctionPass *createWebAssemblyReplacePhysRegs();
+FunctionPass *createWebAssemblyPrepareForLiveIntervals();
+FunctionPass *createWebAssemblyOptimizeLiveIntervals();
+FunctionPass *createWebAssemblyStoreResults();
+FunctionPass *createWebAssemblyRegStackify();
+FunctionPass *createWebAssemblyRegColoring();
+FunctionPass *createWebAssemblyExplicitLocals();
+FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
+FunctionPass *createWebAssemblyCFGStackify();
+FunctionPass *createWebAssemblyLowerBrUnless();
+FunctionPass *createWebAssemblyRegNumbering();
+FunctionPass *createWebAssemblyPeephole();
+FunctionPass *createWebAssemblyCallIndirectFixup();
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td
new file mode 100644
index 000000000000..f647349d759b
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -0,0 +1,66 @@
+//- WebAssembly.td - Describe the WebAssembly Target Machine --*- tablegen -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This is a target description file for the WebAssembly architecture,
+/// which is also known as "wasm".
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureSIMD128 : SubtargetFeature<"simd128", "HasSIMD128", "true",
+ "Enable 128-bit SIMD">;
+
+//===----------------------------------------------------------------------===//
+// Architectures.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "WebAssemblyRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "WebAssemblyInstrInfo.td"
+
+def WebAssemblyInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Processors supported.
+//===----------------------------------------------------------------------===//
+
+// Minimal Viable Product.
+def : ProcessorModel<"mvp", NoSchedModel, []>;
+
+// Generic processor: latest stable version.
+def : ProcessorModel<"generic", NoSchedModel, []>;
+
+// Latest and greatest experimental version of WebAssembly. Bugs included!
+def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureSIMD128]>;
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def WebAssembly : Target {
+ let InstructionSet = WebAssemblyInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
new file mode 100644
index 000000000000..5fadca38b820
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -0,0 +1,95 @@
+//===-- WebAssemblyArgumentMove.cpp - Argument instruction moving ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file moves ARGUMENT instructions after ScheduleDAG scheduling.
+///
+/// Arguments are really live-in registers; however, since we use virtual
+/// registers and LLVM doesn't support live-in virtual registers, we're
+/// currently making do with ARGUMENT instructions which are placed at the top
+/// of the entry block. The trick is to get them to *stay* at the top of the
+/// entry block.
+///
+/// The ARGUMENTS physical register keeps these instructions pinned in place
+/// during liveness-aware CodeGen passes; however, one thing which does not
+/// respect this is the ScheduleDAG scheduler. This pass is therefore run
+/// immediately after that.
+///
+/// This is all hopefully a temporary solution until we find a better solution
+/// for describing the live-in nature of arguments.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-argument-move"
+
+namespace {
+class WebAssemblyArgumentMove final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyArgumentMove() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "WebAssembly Argument Move"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyArgumentMove::ID = 0;
+FunctionPass *llvm::createWebAssemblyArgumentMove() {
+ return new WebAssemblyArgumentMove();
+}
+
+bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Argument Move **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ bool Changed = false;
+ MachineBasicBlock &EntryMBB = MF.front();
+ MachineBasicBlock::iterator InsertPt = EntryMBB.end();
+
+ // Look for the first NonArg instruction.
+ for (MachineInstr &MI : EntryMBB) {
+ if (!WebAssembly::isArgument(MI)) {
+ InsertPt = MI;
+ break;
+ }
+ }
+
+ // Now move any argument instructions later in the block
+ // to before our first NonArg instruction.
+ for (MachineInstr &MI : llvm::make_range(InsertPt, EntryMBB.end())) {
+ if (WebAssembly::isArgument(MI)) {
+ EntryMBB.insert(InsertPt, MI.removeFromParent());
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
new file mode 100644
index 000000000000..5b4b82eb5603
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -0,0 +1,322 @@
+//===-- WebAssemblyAsmPrinter.cpp - WebAssembly LLVM assembly writer ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains a printer that converts from our internal
+/// representation of machine-dependent LLVM code to the WebAssembly assembly
+/// language.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMCInstLower.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblyRegisterInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+
+/// AsmPrinter specialization for WebAssembly. It caches per-function state in
+/// runOnMachineFunction and overrides the AsmPrinter emission hooks declared
+/// below.
+class WebAssemblyAsmPrinter final : public AsmPrinter {
+ // Both pointers are null after construction and are refreshed for each
+ // function at the top of runOnMachineFunction.
+ const MachineRegisterInfo *MRI;
+ WebAssemblyFunctionInfo *MFI;
+
+public:
+ WebAssemblyAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), MRI(nullptr), MFI(nullptr) {}
+
+private:
+ StringRef getPassName() const override {
+ return "WebAssembly Assembly Printer";
+ }
+
+ //===------------------------------------------------------------------===//
+ // MachineFunctionPass Implementation.
+ //===------------------------------------------------------------------===//
+
+ /// Record the register info and WebAssembly function info of \p MF, then
+ /// defer to the generic AsmPrinter implementation.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ MRI = &MF.getRegInfo();
+ MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+ return AsmPrinter::runOnMachineFunction(MF);
+ }
+
+ //===------------------------------------------------------------------===//
+ // AsmPrinter Implementation.
+ //===------------------------------------------------------------------===//
+
+ void EmitEndOfAsmFile(Module &M) override;
+ void EmitJumpTableInfo() override;
+ void EmitConstantPool() override;
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+ void EmitInstruction(const MachineInstr *MI) override;
+ const MCExpr *lowerConstant(const Constant *CV) override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+
+ // Helpers, defined out of line after the class.
+ MVT getRegType(unsigned RegNo) const;
+ std::string regToString(const MachineOperand &MO);
+ WebAssemblyTargetStreamer *getTargetStreamer();
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Helpers.
+//===----------------------------------------------------------------------===//
+
+/// Return the value type of register \p RegNo: the first type, in the order
+/// listed below, that the register's class reports via hasType().
+///
+/// Reaching the end of the list is treated as a programming error: a debug
+/// message is printed and llvm_unreachable is invoked.
+MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
+ const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
+ for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16,
+ MVT::v4i32, MVT::v4f32})
+ if (TRC->hasType(T))
+ return T;
+ // Use the debug stream (like the other DEBUG output in this file) and end
+ // the line so the message does not run into whatever is printed next.
+ DEBUG(dbgs() << "Unknown type for register number: " << RegNo << '\n');
+ llvm_unreachable("Unknown register type");
+ // Kept after the unreachable marker so the function still has a return on
+ // every path.
+ return MVT::Other;
+}
+
+/// Render the register operand \p MO as '$' followed by the WebAssembly
+/// register number that the function info records for its virtual register.
+///
+/// Asserts that the register is virtual, is not stackified, and has a
+/// WebAssembly register number other than UnusedReg.
+std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) {
+ const unsigned VReg = MO.getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(VReg) &&
+ "Unlowered physical register encountered during assembly printing");
+ assert(!MFI->isVRegStackified(VReg));
+
+ const unsigned WAReg = MFI->getWAReg(VReg);
+ assert(WAReg != WebAssemblyFunctionInfo::UnusedReg);
+
+ std::string Text(1, '$');
+ Text += utostr(WAReg);
+ return Text;
+}
+
+/// Return the output streamer's target streamer, statically cast to the
+/// WebAssembly-specific streamer type (no runtime type check is performed).
+WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
+ return static_cast<WebAssemblyTargetStreamer *>(
+ OutStreamer->getTargetStreamer());
+}
+
+//===----------------------------------------------------------------------===//
+// WebAssemblyAsmPrinter Implementation.
+//===----------------------------------------------------------------------===//
+
+/// Emit the end-of-file declarations for module \p M: an indirect function
+/// type for every non-intrinsic function that is only a declaration for the
+/// linker, and a global import for every externally linked global without an
+/// initializer.
+void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ // Emit function type info for all undefined functions
+ for (const auto &Fn : M) {
+ if (!F_isUndefinedNonIntrinsic(Fn))
+ continue;
+ SmallVector<MVT, 4> ResultVTs;
+ SmallVector<MVT, 4> ParamVTs;
+ ComputeSignatureVTs(Fn, TM, ParamVTs, ResultVTs);
+ getTargetStreamer()->emitIndirectFunctionType(Fn.getName(), ParamVTs,
+ ResultVTs);
+ }
+
+ // Import every external global that this module does not define.
+ for (const auto &Global : M.globals()) {
+ if (Global.hasInitializer() || !Global.hasExternalLinkage())
+ continue;
+ getTargetStreamer()->emitGlobalImport(Global.getGlobalIdentifier());
+ }
+}
+
+/// Emits nothing. In asserts-enabled builds this checks that the current
+/// function's constant pool is empty.
+void WebAssemblyAsmPrinter::EmitConstantPool() {
+ assert(MF->getConstantPool()->getConstants().empty() &&
+ "WebAssembly disables constant pools");
+}
+
+/// Intentionally empty override: no separate jump table data is emitted.
+void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
+ // Nothing to do; jump tables are incorporated into the instruction stream.
+}
+
+/// Emit the per-function header through the target streamer, in this order:
+/// the parameter list (if any), the function index taken from "wasm.index"
+/// metadata (if present), the result type (only when there is exactly one
+/// legal result value), and the locals. The generic AsmPrinter hook runs last.
+///
+/// As a side effect, one local is appended to the function info for every
+/// virtual register that passes the filters in the loop below.
+void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
+ if (!MFI->getParams().empty())
+ getTargetStreamer()->emitParam(MFI->getParams());
+
+ SmallVector<MVT, 4> ResultVTs;
+ const Function &F(*MF->getFunction());
+
+ // Emit the function index.
+ if (MDNode *Idx = F.getMetadata("wasm.index")) {
+ assert(Idx->getNumOperands() == 1);
+
+ // Call the base-class lowering explicitly, bypassing this class's
+ // lowerConstant override.
+ getTargetStreamer()->emitIndIdx(AsmPrinter::lowerConstant(
+ cast<ConstantAsMetadata>(Idx->getOperand(0))->getValue()));
+ }
+
+ ComputeLegalValueVTs(F, TM, F.getReturnType(), ResultVTs);
+
+ // If the return type needs to be legalized it will get converted into
+ // passing a pointer.
+ if (ResultVTs.size() == 1)
+ getTargetStreamer()->emitResult(ResultVTs);
+
+ // FIXME: When ExplicitLocals is enabled by default, we won't need
+ // to define the locals here (and MFI can go back to being pointer-to-const).
+ for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+ unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
+ unsigned WAReg = MFI->getWAReg(VReg);
+ // Don't declare unused registers.
+ if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
+ continue;
+ // Don't redeclare parameters.
+ if (WAReg < MFI->getParams().size())
+ continue;
+ // Don't declare stackified registers.
+ // (They are recognized here by a WAReg value that is negative when
+ // reinterpreted as int.)
+ if (int(WAReg) < 0)
+ continue;
+ MFI->addLocal(getRegType(VReg));
+ }
+
+ getTargetStreamer()->emitLocal(MFI->getLocals());
+
+ AsmPrinter::EmitFunctionBodyStart();
+}
+
+/// Tell the target streamer that the current function body has ended.
+void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() {
+ getTargetStreamer()->emitEndFunc();
+}
+
+/// Emit \p MI to the output streamer.
+///
+/// ARGUMENT_* and FALLTHROUGH_RETURN_* pseudo instructions produce no
+/// instruction; the fallthrough returns only add a comment in verbose mode.
+/// Every other opcode is lowered to an MCInst and streamed.
+void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
+
+ switch (MI->getOpcode()) {
+ case WebAssembly::ARGUMENT_I32:
+ case WebAssembly::ARGUMENT_I64:
+ case WebAssembly::ARGUMENT_F32:
+ case WebAssembly::ARGUMENT_F64:
+ case WebAssembly::ARGUMENT_v16i8:
+ case WebAssembly::ARGUMENT_v8i16:
+ case WebAssembly::ARGUMENT_v4i32:
+ case WebAssembly::ARGUMENT_v4f32:
+ // These represent values which are live into the function entry, so there's
+ // no instruction to emit.
+ break;
+ case WebAssembly::FALLTHROUGH_RETURN_I32:
+ case WebAssembly::FALLTHROUGH_RETURN_I64:
+ case WebAssembly::FALLTHROUGH_RETURN_F32:
+ case WebAssembly::FALLTHROUGH_RETURN_F64:
+ case WebAssembly::FALLTHROUGH_RETURN_v16i8:
+ case WebAssembly::FALLTHROUGH_RETURN_v8i16:
+ case WebAssembly::FALLTHROUGH_RETURN_v4i32:
+ case WebAssembly::FALLTHROUGH_RETURN_v4f32: {
+ // These instructions represent the implicit return at the end of a
+ // function body. The operand is always a pop.
+ assert(MFI->isVRegStackified(MI->getOperand(0).getReg()));
+
+ if (isVerbose()) {
+ OutStreamer->AddComment("fallthrough-return: $pop" +
+ utostr(MFI->getWARegStackId(
+ MFI->getWAReg(MI->getOperand(0).getReg()))));
+ OutStreamer->AddBlankLine();
+ }
+ break;
+ }
+ case WebAssembly::FALLTHROUGH_RETURN_VOID:
+ // This instruction represents the implicit return at the end of a
+ // function body with no return value.
+ if (isVerbose()) {
+ OutStreamer->AddComment("fallthrough-return");
+ OutStreamer->AddBlankLine();
+ }
+ break;
+ default: {
+ // Ordinary instruction: lower it to an MCInst and hand it to the streamer.
+ WebAssemblyMCInstLower MCInstLowering(OutContext, *this);
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ break;
+ }
+ }
+}
+
+/// Lower \p CV to an MCExpr. A global value whose value type is a function
+/// type becomes a symbol reference with the VK_WebAssembly_FUNCTION variant
+/// kind; every other constant is handed to AsmPrinter::lowerConstant.
+const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) {
+ const GlobalValue *Global = dyn_cast<GlobalValue>(CV);
+ const bool IsFunction = Global && Global->getValueType()->isFunctionTy();
+ if (!IsFunction)
+ return AsmPrinter::lowerConstant(CV);
+
+ return MCSymbolRefExpr::create(
+ getSymbol(Global), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext);
+}
+
+/// Print operand \p OpNo of the inline-asm instruction \p MI to \p OS.
+///
+/// The generic AsmPrinter implementation is tried first. If it reports
+/// failure and no modifier (\p ExtraCode) was given, immediates, registers,
+/// global addresses, external symbols and basic blocks are printed here.
+///
+/// A non-zero \p AsmVariant is a fatal error.
+/// \returns false when the operand was printed, true otherwise.
+bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ if (AsmVariant != 0)
+ report_fatal_error("There are no defined alternate asm variants");
+
+ // First try the generic code, which knows about modifiers like 'c' and 'n'.
+ const bool GenericFailed =
+ AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+ if (!GenericFailed)
+ return false;
+
+ // A modifier the generic code could not handle is not handled here either.
+ if (ExtraCode)
+ return true;
+
+ const MachineOperand &Operand = MI->getOperand(OpNo);
+ switch (Operand.getType()) {
+ case MachineOperand::MO_Immediate:
+ OS << Operand.getImm();
+ return false;
+ case MachineOperand::MO_Register:
+ OS << regToString(Operand);
+ return false;
+ case MachineOperand::MO_GlobalAddress:
+ getSymbol(Operand.getGlobal())->print(OS, MAI);
+ printOffset(Operand.getOffset(), OS);
+ return false;
+ case MachineOperand::MO_ExternalSymbol:
+ GetExternalSymbolSymbol(Operand.getSymbolName())->print(OS, MAI);
+ printOffset(Operand.getOffset(), OS);
+ return false;
+ case MachineOperand::MO_MachineBasicBlock:
+ Operand.getMBB()->getSymbol()->print(OS, MAI);
+ return false;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+/// Print memory operand \p OpNo of the inline-asm instruction \p MI to \p OS.
+///
+/// Without a modifier the operand is printed as "0(<register>)"; with one,
+/// the request is forwarded to the generic AsmPrinter implementation.
+///
+/// A non-zero \p AsmVariant is a fatal error.
+/// \returns false when the operand was printed here, otherwise whatever the
+/// generic implementation returns.
+bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ if (AsmVariant != 0)
+ report_fatal_error("There are no defined alternate asm variants");
+
+ if (ExtraCode)
+ return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode,
+ OS);
+
+ // TODO: For now, we just hard-code 0 as the constant offset; teach
+ // SelectInlineAsmMemoryOperand how to do address mode matching.
+ const std::string Base = regToString(MI->getOperand(OpNo));
+ OS << "0(" << Base << ')';
+ return false;
+}
+
+// Force static initialization.
+// Registers WebAssemblyAsmPrinter as the asm printer for both the 32-bit and
+// the 64-bit WebAssembly targets; X and Y exist only for their constructors.
+extern "C" void LLVMInitializeWebAssemblyAsmPrinter() {
+ RegisterAsmPrinter<WebAssemblyAsmPrinter> X(getTheWebAssemblyTarget32());
+ RegisterAsmPrinter<WebAssemblyAsmPrinter> Y(getTheWebAssemblyTarget64());
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
new file mode 100644
index 000000000000..49b9754e6b62
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -0,0 +1,579 @@
+//===-- WebAssemblyCFGStackify.cpp - CFG Stackification -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a CFG stacking pass.
+///
+/// This pass reorders the blocks in a function to put them into topological
+/// order, ignoring loop backedges, and without any loop being interrupted
+/// by a block not dominated by the loop header, with special care to keep the
+/// order as similar as possible to the original order.
+///
+/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since
+/// scope boundaries serve as the labels for WebAssembly's control transfers.
+///
+/// This is sufficient to convert arbitrary CFGs into a form that works on
+/// WebAssembly, provided that all loops are single-entry.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-cfg-stackify"
+
+namespace {
+/// Machine function pass that sorts the blocks of a function and then places
+/// BLOCK/LOOP scope markers (see runOnMachineFunction).
+class WebAssemblyCFGStackify final : public MachineFunctionPass {
+ StringRef getPassName() const override { return "WebAssembly CFG Stackify"; }
+
+ /// The pass requires, and declares as preserved, the machine dominator tree
+ /// and the machine loop info; it also declares the CFG preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyCFGStackify() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+// Definition of the pass identification member declared in the class.
+char WebAssemblyCFGStackify::ID = 0;
+/// Create a new heap-allocated instance of the CFG stackify pass.
+FunctionPass *llvm::createWebAssemblyCFGStackify() {
+ return new WebAssemblyCFGStackify();
+}
+
+/// Return the "bottom" block of \p Loop, i.e. the block of the loop with the
+/// highest block number (the header if no other block is numbered higher).
+/// Unlike MachineLoop::getBottomBlock, this works even if the loop is
+/// discontiguous.
+static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) {
+ MachineBasicBlock *Lowest = Loop->getHeader();
+ for (MachineBasicBlock *Candidate : Loop->blocks()) {
+ if (Candidate->getNumber() <= Lowest->getNumber())
+ continue;
+ Lowest = Candidate;
+ }
+ return Lowest;
+}
+
+/// Call MBB->updateTerminator() if every terminator of \p MBB is a branch
+/// that is not an indirect branch; otherwise leave the block alone.
+///
+/// In asserts-enabled builds this also checks that a block which is not fully
+/// analyzable contains at least one barrier terminator.
+static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
+#ifndef NDEBUG
+ // Only needed by the assert below, hence compiled out with NDEBUG.
+ bool AnyBarrier = false;
+#endif
+ bool AllAnalyzable = true;
+ for (const MachineInstr &Term : MBB->terminators()) {
+#ifndef NDEBUG
+ AnyBarrier |= Term.isBarrier();
+#endif
+ AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
+ }
+ assert((AnyBarrier || AllAnalyzable) &&
+ "AnalyzeBranch needs to analyze any block with a fallthrough");
+ if (AllAnalyzable)
+ MBB->updateTerminator();
+}
+
+namespace {
+/// Sort blocks by their number.
+/// The functor returns true when A has the higher number; SortBlocks uses it
+/// as the comparator of its Preferred queue. (Presumably this makes the queue
+/// yield the lowest-numbered block first, as with std::priority_queue --
+/// confirm against PriorityQueue.)
+struct CompareBlockNumbers {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() > B->getNumber();
+ }
+};
+/// Sort blocks by their number in the opposite order.
+/// The functor returns true when A has the lower number; SortBlocks uses it
+/// as the comparator of its Ready queue.
+struct CompareBlockNumbersBackwards {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() < B->getNumber();
+ }
+};
+/// Bookkeeping for a loop to help ensure that we don't mix blocks not dominated
+/// by the loop header among the loop's blocks.
+struct Entry {
+ const MachineLoop *Loop;
+ /// Number of blocks of Loop that have not been placed yet; starts at the
+ /// loop's block count and is decremented by SortBlocks.
+ unsigned NumBlocksLeft;
+
+ /// List of blocks not dominated by Loop's header that are deferred until
+ /// after all of Loop's blocks have been seen.
+ std::vector<MachineBasicBlock *> Deferred;
+
+ explicit Entry(const MachineLoop *L)
+ : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
+};
+}
+
+/// Sort the blocks, taking special care to make sure that loops are not
+/// interrupted by blocks not dominated by their header.
+///
+/// The blocks of \p MF are physically reordered (via moveAfter) and renumbered
+/// both before and after sorting. \p MLI supplies loop membership and \p MDT
+/// the dominance queries used to defer blocks. In asserts-enabled builds the
+/// resulting order is verified at the end.
+/// TODO: There are many opportunities for improving the heuristics here.
+/// Explore them.
+static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT) {
+ // Prepare for a topological sort: Record the number of predecessors each
+ // block has, ignoring loop backedges.
+ MF.RenumberBlocks();
+ SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
+ for (MachineBasicBlock &MBB : MF) {
+ unsigned N = MBB.pred_size();
+ if (MachineLoop *L = MLI.getLoopFor(&MBB))
+ if (L->getHeader() == &MBB)
+ for (const MachineBasicBlock *Pred : MBB.predecessors())
+ if (L->contains(Pred))
+ --N;
+ NumPredsLeft[MBB.getNumber()] = N;
+ }
+
+ // Topological sort the CFG, with additional constraints:
+ // - Between a loop header and the last block in the loop, there can be
+ // no blocks not dominated by the loop header.
+ // - It's desirable to preserve the original block order when possible.
+ // We use two ready lists; Preferred and Ready. Preferred has recently
+ // processed successors, to help preserve block sequences from the original
+ // order. Ready has the remaining ready blocks.
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbers>
+ Preferred;
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbersBackwards>
+ Ready;
+ SmallVector<Entry, 4> Loops;
+ for (MachineBasicBlock *MBB = &MF.front();;) {
+ const MachineLoop *L = MLI.getLoopFor(MBB);
+ if (L) {
+ // If MBB is a loop header, add it to the active loop list. We can't put
+ // any blocks that it doesn't dominate until we see the end of the loop.
+ if (L->getHeader() == MBB)
+ Loops.push_back(Entry(L));
+ // For each active loop the block is in, decrement the count. If MBB is
+ // the last block in an active loop, take it off the list and pick up any
+ // blocks deferred because the header didn't dominate them.
+ for (Entry &E : Loops)
+ if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
+ for (auto DeferredBlock : E.Deferred)
+ Ready.push(DeferredBlock);
+ while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
+ Loops.pop_back();
+ }
+ // The main topological sort logic.
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ // Ignore backedges.
+ if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
+ if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
+ continue;
+ // Decrement the predecessor count. If it's now zero, it's ready.
+ if (--NumPredsLeft[Succ->getNumber()] == 0)
+ Preferred.push(Succ);
+ }
+ // Determine the block to follow MBB. First try to find a preferred block,
+ // to preserve the original block order when possible.
+ MachineBasicBlock *Next = nullptr;
+ while (!Preferred.empty()) {
+ Next = Preferred.top();
+ Preferred.pop();
+ // If Next isn't dominated by the top active loop header, defer it until
+ // that loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ Next = nullptr;
+ continue;
+ }
+ // If Next was originally ordered before MBB, and it isn't because it was
+ // loop-rotated above the header, it's not preferred.
+ if (Next->getNumber() < MBB->getNumber() &&
+ (!L || !L->contains(Next) ||
+ L->getHeader()->getNumber() < Next->getNumber())) {
+ Ready.push(Next);
+ Next = nullptr;
+ continue;
+ }
+ break;
+ }
+ // If we didn't find a suitable block in the Preferred list, check the
+ // general Ready list.
+ if (!Next) {
+ // If there are no more blocks to process, we're done.
+ if (Ready.empty()) {
+ MaybeUpdateTerminator(MBB);
+ break;
+ }
+ // NOTE(review): Ready is only checked for emptiness before this loop.
+ // If every remaining entry gets deferred, the next Ready.top() would run
+ // on an empty queue; presumably an active loop always has a ready block
+ // its header dominates -- confirm.
+ for (;;) {
+ Next = Ready.top();
+ Ready.pop();
+ // If Next isn't dominated by the top active loop header, defer it until
+ // that loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ continue;
+ }
+ break;
+ }
+ }
+ // Move the next block into place and iterate.
+ Next->moveAfter(MBB);
+ MaybeUpdateTerminator(MBB);
+ MBB = Next;
+ }
+ assert(Loops.empty() && "Active loop list not finished");
+ MF.RenumberBlocks();
+
+#ifndef NDEBUG
+ // Debug-only verification of the order produced above.
+ SmallSetVector<MachineLoop *, 8> OnStack;
+
+ // Insert a sentinel representing the degenerate loop that starts at the
+ // function entry block and includes the entire function as a "loop" that
+ // executes once.
+ OnStack.insert(nullptr);
+
+ for (auto &MBB : MF) {
+ assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
+
+ MachineLoop *Loop = MLI.getLoopFor(&MBB);
+ if (Loop && &MBB == Loop->getHeader()) {
+ // Loop header. The loop predecessor should be sorted above, and the other
+ // predecessors should be backedges below.
+ for (auto Pred : MBB.predecessors())
+ assert(
+ (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
+ "Loop header predecessors must be loop predecessors or backedges");
+ // The insert happens inside the assert; that is fine only because this
+ // whole section is compiled out together with asserts.
+ assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
+ } else {
+ // Not a loop header. All predecessors should be sorted above.
+ for (auto Pred : MBB.predecessors())
+ assert(Pred->getNumber() < MBB.getNumber() &&
+ "Non-loop-header predecessors should be topologically sorted");
+ assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
+ "Blocks must be nested in their loops");
+ }
+ while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
+ OnStack.pop_back();
+ }
+ assert(OnStack.pop_back_val() == nullptr &&
+ "The function entry block shouldn't actually be a loop header");
+ assert(OnStack.empty() &&
+ "Control flow stack pushes and pops should be balanced.");
+#endif
+}
+
+/// Report whether some terminator of \p Pred names \p MBB among its explicit
+/// operands, i.e. branches to it explicitly rather than merely falling
+/// through. A branch instruction may both branch to a block and fall through
+/// to it (e.g. in unoptimized code), which is why the operands themselves are
+/// inspected.
+static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred,
+ MachineBasicBlock *MBB) {
+ for (MachineInstr &Term : Pred->terminators()) {
+ for (MachineOperand &Op : Term.explicit_operands()) {
+ if (!Op.isMBB())
+ continue;
+ if (Op.getMBB() == MBB)
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Insert a BLOCK marker for branches to MBB (if needed).
+///
+/// A BLOCK/END_BLOCK pair is placed only when some predecessor that is
+/// numbered before \p MBB explicitly branches to it. The END_BLOCK goes at
+/// the top of \p MBB and the BLOCK into the chosen header block.
+///
+/// \param ScopeTops indexed by block number; updated so the entry for MBB
+/// names the farthest-spanning scope that ends at MBB.
+/// \param BlockTops receives the END_BLOCK -> BLOCK mapping for the new pair.
+/// \param LoopTops END_LOOP -> LOOP mapping, consulted when skipping END_LOOP
+/// markers at the top of MBB.
+static void PlaceBlockMarker(
+ MachineBasicBlock &MBB, MachineFunction &MF,
+ SmallVectorImpl<MachineBasicBlock *> &ScopeTops,
+ DenseMap<const MachineInstr *, MachineInstr *> &BlockTops,
+ DenseMap<const MachineInstr *, MachineInstr *> &LoopTops,
+ const WebAssemblyInstrInfo &TII,
+ const MachineLoopInfo &MLI,
+ MachineDominatorTree &MDT,
+ WebAssemblyFunctionInfo &MFI) {
+ // First compute the nearest common dominator of all forward non-fallthrough
+ // predecessors so that we minimize the time that the BLOCK is on the stack,
+ // which reduces overall stack height.
+ MachineBasicBlock *Header = nullptr;
+ bool IsBranchedTo = false;
+ int MBBNumber = MBB.getNumber();
+ for (MachineBasicBlock *Pred : MBB.predecessors())
+ if (Pred->getNumber() < MBBNumber) {
+ Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
+ if (ExplicitlyBranchesTo(Pred, &MBB))
+ IsBranchedTo = true;
+ }
+ if (!Header)
+ return;
+ if (!IsBranchedTo)
+ return;
+
+ assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors");
+ MachineBasicBlock *LayoutPred = &*std::prev(MachineFunction::iterator(&MBB));
+
+ // If the nearest common dominator is inside a more deeply nested context,
+ // walk out to the nearest scope which isn't more deeply nested.
+ for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) {
+ if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) {
+ if (ScopeTop->getNumber() > Header->getNumber()) {
+ // Skip over an intervening scope.
+ I = std::next(MachineFunction::iterator(ScopeTop));
+ } else {
+ // We found a scope level at an appropriate depth.
+ Header = ScopeTop;
+ break;
+ }
+ }
+ }
+
+ // Decide where in Header to put the BLOCK.
+ MachineBasicBlock::iterator InsertPos;
+ MachineLoop *HeaderLoop = MLI.getLoopFor(Header);
+ if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) {
+ // Header is the header of a loop that does not lexically contain MBB, so
+ // the BLOCK needs to be above the LOOP, after any END constructs.
+ // Stop at the end of the block instead of dereferencing end(), matching
+ // the guarded END_LOOP skip loop further down.
+ InsertPos = Header->begin();
+ while (InsertPos != Header->end() &&
+ (InsertPos->getOpcode() == WebAssembly::END_BLOCK ||
+ InsertPos->getOpcode() == WebAssembly::END_LOOP))
+ ++InsertPos;
+ } else {
+ // Otherwise, insert the BLOCK as late in Header as we can, but before the
+ // beginning of the local expression tree and any nested BLOCKs.
+ InsertPos = Header->getFirstTerminator();
+ while (InsertPos != Header->begin() &&
+ WebAssembly::isChild(*std::prev(InsertPos), MFI) &&
+ std::prev(InsertPos)->getOpcode() != WebAssembly::LOOP &&
+ std::prev(InsertPos)->getOpcode() != WebAssembly::END_BLOCK &&
+ std::prev(InsertPos)->getOpcode() != WebAssembly::END_LOOP)
+ --InsertPos;
+ }
+
+ // Add the BLOCK.
+ MachineInstr *Begin = BuildMI(*Header, InsertPos, DebugLoc(),
+ TII.get(WebAssembly::BLOCK))
+ .addImm(int64_t(WebAssembly::ExprType::Void));
+
+ // Mark the end of the block.
+ // END_LOOP markers whose LOOP lives in a block numbered at or after Header
+ // stay ahead of the new END_BLOCK.
+ InsertPos = MBB.begin();
+ while (InsertPos != MBB.end() &&
+ InsertPos->getOpcode() == WebAssembly::END_LOOP &&
+ LoopTops[&*InsertPos]->getParent()->getNumber() >= Header->getNumber())
+ ++InsertPos;
+ MachineInstr *End = BuildMI(MBB, InsertPos, DebugLoc(),
+ TII.get(WebAssembly::END_BLOCK));
+ BlockTops[End] = Begin;
+
+ // Track the farthest-spanning scope that ends at this point.
+ int Number = MBB.getNumber();
+ if (!ScopeTops[Number] ||
+ ScopeTops[Number]->getNumber() > Header->getNumber())
+ ScopeTops[Number] = Header;
+}
+
+/// Insert a LOOP marker for a loop starting at MBB (if it's a loop header).
+///
+/// The LOOP goes at the top of \p MBB and the matching END_LOOP at the top of
+/// the block that follows the loop's bottom block in layout order. If the
+/// loop's bottom block is the last block of \p MF, a new block is appended to
+/// hold the END_LOOP.
+///
+/// \param ScopeTops indexed by block number; the entry for the block after
+/// the loop is set to MBB unless it is already set.
+/// \param LoopTops receives the END_LOOP -> LOOP mapping for the new pair.
+static void PlaceLoopMarker(
+ MachineBasicBlock &MBB, MachineFunction &MF,
+ SmallVectorImpl<MachineBasicBlock *> &ScopeTops,
+ DenseMap<const MachineInstr *, MachineInstr *> &LoopTops,
+ const WebAssemblyInstrInfo &TII, const MachineLoopInfo &MLI) {
+ MachineLoop *Loop = MLI.getLoopFor(&MBB);
+ if (!Loop || Loop->getHeader() != &MBB)
+ return;
+
+ // The operand of a LOOP is the first block after the loop. If the loop is the
+ // bottom of the function, insert a dummy block at the end.
+ MachineBasicBlock *Bottom = LoopBottom(Loop);
+ auto Iter = std::next(MachineFunction::iterator(Bottom));
+ if (Iter == MF.end()) {
+ MachineBasicBlock *Label = MF.CreateMachineBasicBlock();
+ // Give it a fake predecessor so that AsmPrinter prints its label.
+ Label->addSuccessor(Label);
+ MF.push_back(Label);
+ // Recompute the iterator now that a block follows Bottom.
+ Iter = std::next(MachineFunction::iterator(Bottom));
+ }
+ MachineBasicBlock *AfterLoop = &*Iter;
+
+ // Mark the beginning of the loop (after the end of any existing loop that
+ // ends here).
+ auto InsertPos = MBB.begin();
+ while (InsertPos != MBB.end() &&
+ InsertPos->getOpcode() == WebAssembly::END_LOOP)
+ ++InsertPos;
+ MachineInstr *Begin = BuildMI(MBB, InsertPos, DebugLoc(),
+ TII.get(WebAssembly::LOOP))
+ .addImm(int64_t(WebAssembly::ExprType::Void));
+
+ // Mark the end of the loop.
+ MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), DebugLoc(),
+ TII.get(WebAssembly::END_LOOP));
+ LoopTops[End] = Begin;
+
+ assert((!ScopeTops[AfterLoop->getNumber()] ||
+ ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) &&
+ "With block sorting the outermost loop for a block should be first.");
+ if (!ScopeTops[AfterLoop->getNumber()])
+ ScopeTops[AfterLoop->getNumber()] = &MBB;
+}
+
+/// Return how many entries of \p Stack are visited, scanning from the back
+/// towards the front, before the first entry equal to \p MBB is reached.
+/// Asserts that \p MBB is actually present in \p Stack.
+static unsigned
+GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
+ const MachineBasicBlock *MBB) {
+ unsigned Depth = 0;
+ for (auto It = Stack.rbegin(), End = Stack.rend(); It != End && *It != MBB;
+ ++It)
+ ++Depth;
+ assert(Depth < Stack.size() && "Branch destination should be in scope");
+ return Depth;
+}
+
+/// In normal assembly languages, when the end of a function is unreachable,
+/// because the function ends in an infinite loop or a noreturn call or similar,
+/// it isn't necessary to worry about the function return type at the end of
+/// the function, because it's never reached. However, in WebAssembly, blocks
+/// that end at the function end need to have a return type signature that
+/// matches the function signature, even though it's unreachable. This function
+/// checks for such cases and fixes up the signatures.
+///
+/// \param MFI supplies the function's result types; nothing is done when
+/// there are none.
+/// \param BlockTops maps each END_BLOCK to its BLOCK.
+/// \param LoopTops maps each END_LOOP to its LOOP.
+static void FixEndsAtEndOfFunction(
+ MachineFunction &MF,
+ const WebAssemblyFunctionInfo &MFI,
+ DenseMap<const MachineInstr *, MachineInstr *> &BlockTops,
+ DenseMap<const MachineInstr *, MachineInstr *> &LoopTops) {
+ assert(MFI.getResults().size() <= 1);
+
+ if (MFI.getResults().empty())
+ return;
+
+ // Translate the single result value type into the matching block signature.
+ WebAssembly::ExprType retType;
+ switch (MFI.getResults().front().SimpleTy) {
+ case MVT::i32: retType = WebAssembly::ExprType::I32; break;
+ case MVT::i64: retType = WebAssembly::ExprType::I64; break;
+ case MVT::f32: retType = WebAssembly::ExprType::F32; break;
+ case MVT::f64: retType = WebAssembly::ExprType::F64; break;
+ case MVT::v16i8: retType = WebAssembly::ExprType::I8x16; break;
+ case MVT::v8i16: retType = WebAssembly::ExprType::I16x8; break;
+ case MVT::v4i32: retType = WebAssembly::ExprType::I32x4; break;
+ case MVT::v4f32: retType = WebAssembly::ExprType::F32x4; break;
+ default: llvm_unreachable("unexpected return type");
+ }
+
+ // Walk backwards from the very end of the function, retyping the scope that
+ // each trailing END marker closes, until a real instruction is found.
+ for (MachineBasicBlock &MBB : reverse(MF)) {
+ for (MachineInstr &MI : reverse(MBB)) {
+ if (MI.isPosition() || MI.isDebugValue())
+ continue;
+ if (MI.getOpcode() == WebAssembly::END_BLOCK) {
+ BlockTops[&MI]->getOperand(0).setImm(int32_t(retType));
+ continue;
+ }
+ if (MI.getOpcode() == WebAssembly::END_LOOP) {
+ LoopTops[&MI]->getOperand(0).setImm(int32_t(retType));
+ continue;
+ }
+ // Something other than an `end`. We're done.
+ return;
+ }
+ }
+}
+
+/// Insert LOOP and BLOCK markers at appropriate places.
+///
+/// Runs in three steps: place the markers block by block, rewrite the basic
+/// block operands of terminators into relative scope depths, and finally fix
+/// up the signatures of scopes that end at the end of the function.
+static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const WebAssemblyInstrInfo &TII,
+ MachineDominatorTree &MDT,
+ WebAssemblyFunctionInfo &MFI) {
+ // For each block whose label represents the end of a scope, record the block
+ // which holds the beginning of the scope. This will allow us to quickly skip
+ // over scoped regions when walking blocks. We allocate one more than the
+ // number of blocks in the function to accommodate for the possible fake block
+ // we may insert at the end.
+ SmallVector<MachineBasicBlock *, 8> ScopeTops(MF.getNumBlockIDs() + 1);
+
+ // For each END_LOOP, the corresponding LOOP.
+ DenseMap<const MachineInstr *, MachineInstr *> LoopTops;
+
+ // For each END_BLOCK, the corresponding BLOCK.
+ DenseMap<const MachineInstr *, MachineInstr *> BlockTops;
+
+ for (auto &MBB : MF) {
+ // Place the LOOP for MBB if MBB is the header of a loop.
+ PlaceLoopMarker(MBB, MF, ScopeTops, LoopTops, TII, MLI);
+
+ // Place the BLOCK for MBB if MBB is branched to from above.
+ PlaceBlockMarker(MBB, MF, ScopeTops, BlockTops, LoopTops, TII, MLI, MDT, MFI);
+ }
+
+ // Now rewrite references to basic blocks to be depth immediates.
+ // The function is walked backwards, so END markers open a scope on Stack
+ // and BLOCK/LOOP markers close it again.
+ SmallVector<const MachineBasicBlock *, 8> Stack;
+ for (auto &MBB : reverse(MF)) {
+ for (auto &MI : reverse(MBB)) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::BLOCK:
+ assert(ScopeTops[Stack.back()->getNumber()]->getNumber() <= MBB.getNumber() &&
+ "Block should be balanced");
+ Stack.pop_back();
+ break;
+ case WebAssembly::LOOP:
+ assert(Stack.back() == &MBB && "Loop top should be balanced");
+ Stack.pop_back();
+ break;
+ case WebAssembly::END_BLOCK:
+ Stack.push_back(&MBB);
+ break;
+ case WebAssembly::END_LOOP:
+ // A loop scope is identified by the block that holds its LOOP marker.
+ Stack.push_back(LoopTops[&MI]->getParent());
+ break;
+ default:
+ if (MI.isTerminator()) {
+ // Rewrite MBB operands to be depth immediates.
+ // All operands are copied, removed and re-added in their original
+ // order, with block operands replaced on the way.
+ SmallVector<MachineOperand, 4> Ops(MI.operands());
+ while (MI.getNumOperands() > 0)
+ MI.RemoveOperand(MI.getNumOperands() - 1);
+ for (auto MO : Ops) {
+ if (MO.isMBB())
+ MO = MachineOperand::CreateImm(GetDepth(Stack, MO.getMBB()));
+ MI.addOperand(MF, MO);
+ }
+ }
+ break;
+ }
+ }
+ }
+ assert(Stack.empty() && "Control flow should be balanced");
+
+ // Fix up block/loop signatures at the end of the function to conform to
+ // WebAssembly's rules.
+ FixEndsAtEndOfFunction(MF, MFI, BlockTops, LoopTops);
+}
+
+/// Run CFG stackification on \p MF: invalidate register liveness, sort the
+/// blocks, then place the BLOCK and LOOP markers.
+/// \returns true unconditionally.
+bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** CFG Stackifying **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ const auto &MLI = getAnalysis<MachineLoopInfo>();
+ auto &MDT = getAnalysis<MachineDominatorTree>();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ // Liveness is not tracked for VALUE_STACK physreg.
+ MF.getRegInfo().invalidateLiveness();
+
+ // Sort the blocks, with contiguous loops.
+ SortBlocks(MF, MLI, MDT);
+
+ // Place the BLOCK and LOOP markers to indicate the beginnings of scopes.
+ PlaceMarkers(MF, MLI, TII, MDT, MFI);
+
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
new file mode 100644
index 000000000000..fc0a01ca30e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
@@ -0,0 +1,120 @@
+//===-- WebAssemblyCallIndirectFixup.cpp - Fix call_indirects -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file converts pseudo call_indirect instructions into real
+/// call_indirects.
+///
+/// The order of arguments for a call_indirect is the arguments to the function
+/// call, followed by the function pointer. There's no natural way to express
+/// a machineinstr with varargs followed by one more arg, so we express it as
+/// the function pointer followed by varargs, then rewrite it here.
+///
+/// We need to rewrite the order of the arguments on the machineinstrs
+/// themselves so that register stackification knows the order they'll be
+/// executed in.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-call-indirect-fixup"
+
+namespace {
+/// Machine-function pass that turns pseudo call_indirect instructions into
+/// real call_indirects with the operand order the real instruction expects.
+class WebAssemblyCallIndirectFixup final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyCallIndirectFixup() : MachineFunctionPass(ID) {}
+
+private:
+  /// Human-readable pass name.
+  StringRef getPassName() const override {
+    return "WebAssembly CallIndirect Fixup";
+  }
+
+  /// Rewrites every pseudo call_indirect in \p MF; defined out of line.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+/// Storage for the pass identification member declared in the class.
+char WebAssemblyCallIndirectFixup::ID = 0;
+
+/// Factory: allocate a new call_indirect fixup pass instance.
+FunctionPass *llvm::createWebAssemblyCallIndirectFixup() {
+  FunctionPass *Pass = new WebAssemblyCallIndirectFixup();
+  return Pass;
+}
+
+/// Translate the opcode of a pseudo call_indirect into the matching real
+/// call_indirect opcode. Returns WebAssembly::INSTRUCTION_LIST_END when \p MI
+/// is not one of the pseudo call_indirect instructions.
+static unsigned GetNonPseudoCallIndirectOpcode(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case WebAssembly::PCALL_INDIRECT_VOID:
+    return WebAssembly::CALL_INDIRECT_VOID;
+  case WebAssembly::PCALL_INDIRECT_I32:
+    return WebAssembly::CALL_INDIRECT_I32;
+  case WebAssembly::PCALL_INDIRECT_I64:
+    return WebAssembly::CALL_INDIRECT_I64;
+  case WebAssembly::PCALL_INDIRECT_F32:
+    return WebAssembly::CALL_INDIRECT_F32;
+  case WebAssembly::PCALL_INDIRECT_F64:
+    return WebAssembly::CALL_INDIRECT_F64;
+  case WebAssembly::PCALL_INDIRECT_v16i8:
+    return WebAssembly::CALL_INDIRECT_v16i8;
+  case WebAssembly::PCALL_INDIRECT_v8i16:
+    return WebAssembly::CALL_INDIRECT_v8i16;
+  case WebAssembly::PCALL_INDIRECT_v4i32:
+    return WebAssembly::CALL_INDIRECT_v4i32;
+  case WebAssembly::PCALL_INDIRECT_v4f32:
+    return WebAssembly::CALL_INDIRECT_v4f32;
+  default:
+    return WebAssembly::INSTRUCTION_LIST_END;
+  }
+}
+
+/// Return true if \p MI is a pseudo call_indirect, i.e. it has a real
+/// call_indirect counterpart.
+static bool IsPseudoCallIndirect(const MachineInstr &MI) {
+  const unsigned RealOpcode = GetNonPseudoCallIndirectOpcode(MI);
+  return RealOpcode != WebAssembly::INSTRUCTION_LIST_END;
+}
+
+/// Walk every instruction of \p MF and convert each pseudo call_indirect into
+/// the real instruction: the descriptor is swapped, the first explicit use
+/// becomes a zero "flags" immediate, and the operand that used to be there is
+/// appended at the end. Returns true if any instruction was rewritten.
+bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Fixing up CALL_INDIRECTs **********\n"
+               << MF.getName() << '\n');
+
+  const WebAssemblyInstrInfo *TII =
+      MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  bool Changed = false;
+
+  for (MachineBasicBlock &Block : MF) {
+    for (MachineInstr &Call : Block) {
+      if (!IsPseudoCallIndirect(Call))
+        continue;
+
+      DEBUG(dbgs() << "Found call_indirect: " << Call << '\n');
+
+      // Rewrite pseudo to non-pseudo
+      Call.setDesc(TII->get(GetNonPseudoCallIndirectOpcode(Call)));
+
+      // Rewrite argument order: take a copy of the first explicit use before
+      // its slot is overwritten below.
+      MachineOperand &FirstUse = *Call.explicit_uses().begin();
+      const MachineOperand SavedOperand = FirstUse;
+
+      // Set up the flags immediate, which currently has no defined flags
+      // so it's always zero.
+      FirstUse.ChangeToImmediate(0);
+
+      // The saved operand moves to the end of the operand list.
+      Call.addOperand(MF, SavedOperand);
+
+      DEBUG(dbgs() << " After transform: " << Call);
+      Changed = true;
+    }
+  }
+
+  DEBUG(dbgs() << "\nDone fixing up CALL_INDIRECTs\n\n");
+
+  return Changed;
+}
+
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
new file mode 100644
index 000000000000..04ede7ff110c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -0,0 +1,308 @@
+//===-- WebAssemblyExplicitLocals.cpp - Make Locals Explicit --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file converts any remaining registers into WebAssembly locals.
+///
+/// After register stackification and register coloring, convert non-stackified
+/// registers into locals, inserting explicit get_local and set_local
+/// instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-explicit-locals"
+
+namespace {
+/// Machine-function pass that converts remaining registers into WebAssembly
+/// locals by inserting explicit get_local / set_local / tee_local
+/// instructions.
+class WebAssemblyExplicitLocals final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyExplicitLocals() : MachineFunctionPass(ID) {}
+
+private:
+  /// Human-readable pass name.
+  StringRef getPassName() const override {
+    return "WebAssembly Explicit Locals";
+  }
+
+  /// Declares that the CFG and MachineBlockFrequencyInfo are preserved.
+  void getAnalysisUsage(AnalysisUsage &Usage) const override {
+    Usage.setPreservesCFG();
+    Usage.addPreserved<MachineBlockFrequencyInfo>();
+    MachineFunctionPass::getAnalysisUsage(Usage);
+  }
+
+  /// Performs the rewrite on \p MF; defined out of line.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+/// Storage for the pass identification member declared in the class.
+char WebAssemblyExplicitLocals::ID = 0;
+
+/// Factory: allocate a new explicit-locals pass instance.
+FunctionPass *llvm::createWebAssemblyExplicitLocals() {
+  FunctionPass *Pass = new WebAssemblyExplicitLocals();
+  return Pass;
+}
+
+/// Return a local id number for the given register, assigning it a new one
+/// if it doesn't yet have one.
+///
+/// \param Reg2Local  Map from register to its assigned local id; updated when
+///                   \p Reg is seen for the first time.
+/// \param CurLocal   Next unassigned local id. It is advanced only when a new
+///                   id is actually handed out, so ids stay dense and the
+///                   number of mapped registers matches the ids in use.
+/// \param Reg        Register to look up.
+/// \returns The local id associated with \p Reg.
+static unsigned getLocalId(DenseMap<unsigned, unsigned> &Reg2Local,
+                           unsigned &CurLocal, unsigned Reg) {
+  auto Inserted = Reg2Local.insert(std::make_pair(Reg, CurLocal));
+  // Only consume an id if the register was not already mapped; otherwise a
+  // repeated lookup would leave a hole in the local numbering.
+  if (Inserted.second)
+    ++CurLocal;
+  return Inserted.first->second;
+}
+
+/// Look up the get_local opcode that reads a local of register class \p RC.
+/// Reaching the end of the table is a programming error.
+static unsigned getGetLocalOpcode(const TargetRegisterClass *RC) {
+  const struct {
+    const TargetRegisterClass *Class;
+    unsigned Opcode;
+  } Table[] = {
+      {&WebAssembly::I32RegClass, WebAssembly::GET_LOCAL_I32},
+      {&WebAssembly::I64RegClass, WebAssembly::GET_LOCAL_I64},
+      {&WebAssembly::F32RegClass, WebAssembly::GET_LOCAL_F32},
+      {&WebAssembly::F64RegClass, WebAssembly::GET_LOCAL_F64},
+      {&WebAssembly::V128RegClass, WebAssembly::GET_LOCAL_V128},
+  };
+  for (const auto &Entry : Table)
+    if (Entry.Class == RC)
+      return Entry.Opcode;
+  llvm_unreachable("Unexpected register class");
+}
+
+/// Look up the set_local opcode that writes a local of register class \p RC.
+/// Reaching the end of the table is a programming error.
+static unsigned getSetLocalOpcode(const TargetRegisterClass *RC) {
+  const struct {
+    const TargetRegisterClass *Class;
+    unsigned Opcode;
+  } Table[] = {
+      {&WebAssembly::I32RegClass, WebAssembly::SET_LOCAL_I32},
+      {&WebAssembly::I64RegClass, WebAssembly::SET_LOCAL_I64},
+      {&WebAssembly::F32RegClass, WebAssembly::SET_LOCAL_F32},
+      {&WebAssembly::F64RegClass, WebAssembly::SET_LOCAL_F64},
+      {&WebAssembly::V128RegClass, WebAssembly::SET_LOCAL_V128},
+  };
+  for (const auto &Entry : Table)
+    if (Entry.Class == RC)
+      return Entry.Opcode;
+  llvm_unreachable("Unexpected register class");
+}
+
+/// Look up the tee_local opcode for a local of register class \p RC.
+/// Reaching the end of the table is a programming error.
+static unsigned getTeeLocalOpcode(const TargetRegisterClass *RC) {
+  const struct {
+    const TargetRegisterClass *Class;
+    unsigned Opcode;
+  } Table[] = {
+      {&WebAssembly::I32RegClass, WebAssembly::TEE_LOCAL_I32},
+      {&WebAssembly::I64RegClass, WebAssembly::TEE_LOCAL_I64},
+      {&WebAssembly::F32RegClass, WebAssembly::TEE_LOCAL_F32},
+      {&WebAssembly::F64RegClass, WebAssembly::TEE_LOCAL_F64},
+      {&WebAssembly::V128RegClass, WebAssembly::TEE_LOCAL_V128},
+  };
+  for (const auto &Entry : Table)
+    if (Entry.Class == RC)
+      return Entry.Opcode;
+  llvm_unreachable("Unexpected register class");
+}
+
+/// Map a scalar register class to the machine value type used when declaring
+/// a local of that class.
+/// NOTE(review): V128RegClass is accepted by the get/set/tee opcode helpers
+/// in this file but hits the unreachable here -- confirm whether that is
+/// intentional.
+static MVT typeForRegClass(const TargetRegisterClass *RC) {
+  const bool IsI32 = RC == &WebAssembly::I32RegClass;
+  const bool IsI64 = RC == &WebAssembly::I64RegClass;
+  const bool IsF32 = RC == &WebAssembly::F32RegClass;
+  const bool IsF64 = RC == &WebAssembly::F64RegClass;
+
+  if (IsI32)
+    return MVT::i32;
+  if (IsI64)
+    return MVT::i64;
+  if (IsF32)
+    return MVT::f32;
+  if (IsF64)
+    return MVT::f64;
+  llvm_unreachable("unrecognized register class");
+}
+
+/// Given a MachineOperand of a stackified vreg, return the instruction at the
+/// start of the expression tree.
+///
+/// Starting from the definition of \p MO's register, repeatedly descend into
+/// the first register operand among the defining instruction's explicit uses
+/// until an instruction with no register uses is reached. Every register
+/// visited on the way is asserted to be stackified.
+static MachineInstr *FindStartOfTree(MachineOperand &MO,
+                                     MachineRegisterInfo &MRI,
+                                     WebAssemblyFunctionInfo &MFI) {
+  MachineOperand *Current = &MO;
+  for (;;) {
+    unsigned Reg = Current->getReg();
+    assert(MFI.isVRegStackified(Reg));
+    MachineInstr *Def = MRI.getVRegDef(Reg);
+
+    // Pick the first register use of the defining instruction, if any.
+    MachineOperand *FirstRegUse = nullptr;
+    for (MachineOperand &DefMO : Def->explicit_uses()) {
+      if (DefMO.isReg()) {
+        FirstRegUse = &DefMO;
+        break;
+      }
+    }
+
+    // No register uses left: this definition starts the tree.
+    if (!FirstRegUse)
+      return Def;
+
+    Current = FirstRegUse;
+  }
+}
+
+/// Rewrite \p MF so that non-stackified virtual registers are accessed through
+/// explicit locals: leading ARGUMENT instructions are erased and their
+/// registers mapped to the argument's index, tee instructions become
+/// tee_local, non-stackified defs get a trailing set_local, non-stackified
+/// uses get a preceding get_local, and COPYs are coalesced away. Finally a
+/// local is declared for each mapped register whose id lies beyond the
+/// parameters. Returns false immediately for ELF-format targets; otherwise
+/// returns whether anything was changed.
+bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Make Locals Explicit **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ // Disable this pass if we aren't doing direct wasm object emission.
+ if (MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF())
+ return false;
+
+ bool Changed = false;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ // Map non-stackified virtual registers to their local ids.
+ DenseMap<unsigned, unsigned> Reg2Local;
+
+ // Handle ARGUMENTS first to ensure that they get the designated numbers.
+ // Only the leading run of ARGUMENT instructions in the entry block is
+ // consumed; the scan stops at the first non-ARGUMENT instruction.
+ for (MachineBasicBlock::iterator I = MF.begin()->begin(),
+ E = MF.begin()->end();
+ I != E;) {
+ // Advance before a possible erase so the iterator stays valid.
+ MachineInstr &MI = *I++;
+ if (!WebAssembly::isArgument(MI))
+ break;
+ unsigned Reg = MI.getOperand(0).getReg();
+ assert(!MFI.isVRegStackified(Reg));
+ // Operand 1 holds the argument index, which becomes the local id.
+ Reg2Local[Reg] = MI.getOperand(1).getImm();
+ MI.eraseFromParent();
+ Changed = true;
+ }
+
+ // Start assigning local numbers after the last parameter.
+ unsigned CurLocal = MFI.getParams().size();
+
+ // Visit each instruction in the function.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
+ // Advance before any erase/insert below so the iterator stays valid.
+ MachineInstr &MI = *I++;
+ assert(!WebAssembly::isArgument(MI));
+
+ if (MI.isDebugValue() || MI.isLabel())
+ continue;
+
+ // Replace tee instructions with tee_local. The difference is that tee
+ // instructions have two defs, while tee_local instructions have one def
+ // and an index of a local to write to.
+ if (WebAssembly::isTee(MI)) {
+ // Operand 0 is the stackified def, operand 1 the non-stackified def,
+ // operand 2 the input value.
+ assert(MFI.isVRegStackified(MI.getOperand(0).getReg()));
+ assert(!MFI.isVRegStackified(MI.getOperand(1).getReg()));
+ unsigned OldReg = MI.getOperand(2).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
+
+ // Stackify the input if it isn't stackified yet.
+ if (!MFI.isVRegStackified(OldReg)) {
+ unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ unsigned NewReg = MRI.createVirtualRegister(RC);
+ unsigned Opc = getGetLocalOpcode(RC);
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc), NewReg)
+ .addImm(LocalId);
+ MI.getOperand(2).setReg(NewReg);
+ MFI.stackifyVReg(NewReg);
+ }
+
+ // Replace the TEE with a TEE_LOCAL.
+ unsigned LocalId =
+ getLocalId(Reg2Local, CurLocal, MI.getOperand(1).getReg());
+ unsigned Opc = getTeeLocalOpcode(RC);
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc),
+ MI.getOperand(0).getReg())
+ .addImm(LocalId)
+ .addReg(MI.getOperand(2).getReg());
+
+ MI.eraseFromParent();
+ Changed = true;
+ continue;
+ }
+
+ // Insert set_locals for any defs that aren't stackified yet. Currently
+ // we handle at most one def.
+ assert(MI.getDesc().getNumDefs() <= 1);
+ if (MI.getDesc().getNumDefs() == 1) {
+ unsigned OldReg = MI.getOperand(0).getReg();
+ // Defs with no uses are left alone; no set_local is emitted for them.
+ if (!MFI.isVRegStackified(OldReg) && !MRI.use_empty(OldReg)) {
+ unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
+ unsigned NewReg = MRI.createVirtualRegister(RC);
+ // The set_local goes immediately after MI and consumes its new def.
+ auto InsertPt = std::next(MachineBasicBlock::iterator(&MI));
+ unsigned Opc = getSetLocalOpcode(RC);
+ BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+ .addImm(LocalId)
+ .addReg(NewReg);
+ MI.getOperand(0).setReg(NewReg);
+ MFI.stackifyVReg(NewReg);
+ Changed = true;
+ }
+ }
+
+ // Insert get_locals for any uses that aren't stackified yet.
+ // Operands are walked last-to-first; InsertPt tracks where the next
+ // get_local must be placed.
+ MachineInstr *InsertPt = &MI;
+ for (MachineOperand &MO : reverse(MI.explicit_uses())) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned OldReg = MO.getReg();
+
+ // If we see a stackified register, prepare to insert subsequent
+ // get_locals before the start of its tree.
+ if (MFI.isVRegStackified(OldReg)) {
+ InsertPt = FindStartOfTree(MO, MRI, MFI);
+ continue;
+ }
+
+ // Insert a get_local.
+ unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
+ unsigned NewReg = MRI.createVirtualRegister(RC);
+ unsigned Opc = getGetLocalOpcode(RC);
+ // The new get_local becomes the insertion point for earlier operands.
+ InsertPt =
+ BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc), NewReg)
+ .addImm(LocalId);
+ MO.setReg(NewReg);
+ MFI.stackifyVReg(NewReg);
+ Changed = true;
+ }
+
+ // Coalesce and eliminate COPY instructions.
+ if (WebAssembly::isCopy(MI)) {
+ MRI.replaceRegWith(MI.getOperand(1).getReg(),
+ MI.getOperand(0).getReg());
+ MI.eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+
+ // Define the locals.
+ // One local is added per mapped register whose id is not a parameter index.
+ for (size_t i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ auto I = Reg2Local.find(Reg);
+ if (I == Reg2Local.end() || I->second < MFI.getParams().size())
+ continue;
+
+ MFI.addLocal(typeForRegClass(MRI.getRegClass(Reg)));
+ Changed = true;
+ }
+
+#ifndef NDEBUG
+ // Assert that all registers have been stackified at this point.
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.isDebugValue() || MI.isLabel())
+ continue;
+ for (const MachineOperand &MO : MI.explicit_operands()) {
+ assert(
+ (!MO.isReg() || MRI.use_empty(MO.getReg()) ||
+ MFI.isVRegStackified(MO.getReg())) &&
+ "WebAssemblyExplicitLocals failed to stackify a register operand");
+ }
+ }
+ }
+#endif
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
new file mode 100644
index 000000000000..529540ea4ed2
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -0,0 +1,1274 @@
+//===-- WebAssemblyFastISel.cpp - WebAssembly FastISel implementation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the WebAssembly-specific support for the FastISel
+/// class. Some of the target-specific code is generated by tablegen in the file
+/// WebAssemblyGenFastISel.inc, which is #included here.
+///
+/// TODO: kill flags
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fastisel"
+
+namespace {
+
+/// WebAssembly-specific FastISel implementation. Holds the address-mode
+/// helper type, the extension/copy utilities and the per-instruction
+/// selection routines declared below.
+class WebAssemblyFastISel final : public FastISel {
+  // All possible address modes.
+  /// Describes a memory address as a base (either a register or a frame
+  /// index), a non-negative constant offset and an optional global value.
+  class Address {
+  public:
+    typedef enum { RegBase, FrameIndexBase } BaseKind;
+
+  private:
+    BaseKind Kind;
+    // Which member is active is determined by Kind.
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+
+    int64_t Offset;
+
+    const GlobalValue *GV;
+
+  public:
+    // Innocuous defaults for our address.
+    Address() : Kind(RegBase), Offset(0), GV(nullptr) { Base.Reg = 0; }
+    void setKind(BaseKind K) { Kind = K; }
+    BaseKind getKind() const { return Kind; }
+    bool isRegBase() const { return Kind == RegBase; }
+    bool isFIBase() const { return Kind == FrameIndexBase; }
+    void setReg(unsigned Reg) {
+      assert(isRegBase() && "Invalid base register access!");
+      Base.Reg = Reg;
+    }
+    unsigned getReg() const {
+      assert(isRegBase() && "Invalid base register access!");
+      return Base.Reg;
+    }
+    // Frame indices are stored as int, so the accessors use int as well to
+    // avoid a signed/unsigned round trip.
+    void setFI(int FI) {
+      assert(isFIBase() && "Invalid base frame index access!");
+      Base.FI = FI;
+    }
+    int getFI() const {
+      assert(isFIBase() && "Invalid base frame index access!");
+      return Base.FI;
+    }
+
+    void setOffset(int64_t Offset_) {
+      assert(Offset_ >= 0 && "Offsets must be non-negative");
+      Offset = Offset_;
+    }
+    int64_t getOffset() const { return Offset; }
+    void setGlobalValue(const GlobalValue *G) { GV = G; }
+    const GlobalValue *getGlobalValue() const { return GV; }
+  };
+
+  /// Keep a pointer to the WebAssemblySubtarget around so that we can make the
+  /// right decision when generating code for different targets.
+  const WebAssemblySubtarget *Subtarget;
+  LLVMContext *Context;
+
+private:
+  // Utility helper routines
+
+  /// Return the simple value type of \p Ty, or INVALID_SIMPLE_VALUE_TYPE if
+  /// the type has no simple representation.
+  MVT::SimpleValueType getSimpleType(Type *Ty) {
+    EVT VT = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+    return VT.isSimple() ? VT.getSimpleVT().SimpleTy :
+                           MVT::INVALID_SIMPLE_VALUE_TYPE;
+  }
+
+  /// Return the type \p VT is held in: i1/i8/i16 map to i32, i32/i64/f32/f64
+  /// map to themselves, the listed 128-bit vector types map to themselves
+  /// only when the subtarget has SIMD128, and everything else is invalid.
+  MVT::SimpleValueType getLegalType(MVT::SimpleValueType VT) {
+    switch (VT) {
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      return MVT::i32;
+    case MVT::i32:
+    case MVT::i64:
+    case MVT::f32:
+    case MVT::f64:
+      return VT;
+    case MVT::v16i8:
+    case MVT::v8i16:
+    case MVT::v4i32:
+    case MVT::v4f32:
+      if (Subtarget->hasSIMD128())
+        return VT;
+      break;
+    default:
+      break;
+    }
+    return MVT::INVALID_SIMPLE_VALUE_TYPE;
+  }
+  bool computeAddress(const Value *Obj, Address &Addr);
+  void materializeLoadStoreOperands(Address &Addr);
+  void addLoadStoreOperands(const Address &Addr, const MachineInstrBuilder &MIB,
+                            MachineMemOperand *MMO);
+  unsigned maskI1Value(unsigned Reg, const Value *V);
+  unsigned getRegForI1Value(const Value *V, bool &Not);
+  unsigned zeroExtendToI32(unsigned Reg, const Value *V,
+                           MVT::SimpleValueType From);
+  unsigned signExtendToI32(unsigned Reg, const Value *V,
+                           MVT::SimpleValueType From);
+  unsigned zeroExtend(unsigned Reg, const Value *V,
+                      MVT::SimpleValueType From,
+                      MVT::SimpleValueType To);
+  unsigned signExtend(unsigned Reg, const Value *V,
+                      MVT::SimpleValueType From,
+                      MVT::SimpleValueType To);
+  unsigned getRegForUnsignedValue(const Value *V);
+  unsigned getRegForSignedValue(const Value *V);
+  unsigned getRegForPromotedValue(const Value *V, bool IsSigned);
+  unsigned notValue(unsigned Reg);
+  unsigned copyValue(unsigned Reg);
+
+  // Backend specific FastISel code.
+  unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+  unsigned fastMaterializeConstant(const Constant *C) override;
+  bool fastLowerArguments() override;
+
+  // Selection routines.
+  bool selectCall(const Instruction *I);
+  bool selectSelect(const Instruction *I);
+  bool selectTrunc(const Instruction *I);
+  bool selectZExt(const Instruction *I);
+  bool selectSExt(const Instruction *I);
+  bool selectICmp(const Instruction *I);
+  bool selectFCmp(const Instruction *I);
+  bool selectBitCast(const Instruction *I);
+  bool selectLoad(const Instruction *I);
+  bool selectStore(const Instruction *I);
+  bool selectBr(const Instruction *I);
+  bool selectRet(const Instruction *I);
+  bool selectUnreachable(const Instruction *I);
+
+public:
+  // Backend specific FastISel code.
+  WebAssemblyFastISel(FunctionLoweringInfo &FuncInfo,
+                      const TargetLibraryInfo *LibInfo)
+      : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
+    Subtarget = &FuncInfo.MF->getSubtarget<WebAssemblySubtarget>();
+    Context = &FuncInfo.Fn->getContext();
+  }
+
+  bool fastSelectInstruction(const Instruction *I) override;
+
+#include "WebAssemblyGenFastISel.inc"
+};
+
+} // end anonymous namespace
+
+/// Try to describe the address \p Obj in \p Addr as a base (register or frame
+/// index), a non-negative constant offset and an optional global value,
+/// looking through bitcasts, no-op int/pointer casts, inbounds GEPs, static
+/// allocas and add/sub of constants. When no pattern applies, \p Obj itself
+/// is materialized into the base register. Returns true on success.
+bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
+
+ const User *U = nullptr;
+ // UserOp1 acts as a "no recognized opcode" placeholder for the switch below.
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ // Don't walk into other basic blocks unless the object is an alloca from
+ // another block, otherwise it may not have a virtual register assigned.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (auto *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ // At most one global value can be recorded in an address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(Obj)) {
+ if (Addr.getGlobalValue())
+ return false;
+ Addr.setGlobalValue(GV);
+ return true;
+ }
+
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::BitCast: {
+ // Look through bitcasts.
+ return computeAddress(U->getOperand(0), Addr);
+ }
+ case Instruction::IntToPtr: {
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return computeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::PtrToInt: {
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return computeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::GetElementPtr: {
+ // Keep a copy so a failed fold can be undone.
+ Address SavedAddr = Addr;
+ uint64_t TmpOffset = Addr.getOffset();
+ // Non-inbounds geps can wrap; wasm's offsets can't.
+ if (!cast<GEPOperator>(U)->isInBounds())
+ goto unsupported_gep;
+ // Iterate through the GEP folding the constants into offsets where
+ // we can.
+ for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U);
+ GTI != E; ++GTI) {
+ const Value *Op = GTI.getOperand();
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ // Struct field: add the field's offset from the struct layout.
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ TmpOffset += SL->getElementOffset(Idx);
+ } else {
+ // Sequential index: scale by the allocation size of the element type.
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ TmpOffset += CI->getSExtValue() * S;
+ break;
+ }
+ if (S == 1 && Addr.isRegBase() && Addr.getReg() == 0) {
+ // An unscaled add of a register. Set it as the new base.
+ Addr.setReg(getRegForValue(Op));
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ TmpOffset += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ // Unsupported
+ goto unsupported_gep;
+ }
+ }
+ }
+ // Don't fold in negative offsets.
+ if (int64_t(TmpOffset) >= 0) {
+ // Try to grab the base operand now.
+ Addr.setOffset(TmpOffset);
+ if (computeAddress(U->getOperand(0), Addr))
+ return true;
+ }
+ // We failed, restore everything and try the other options.
+ Addr = SavedAddr;
+ unsupported_gep:
+ break;
+ }
+ case Instruction::Alloca: {
+ // Only static allocas (those with a frame index in StaticAllocaMap) can be
+ // used as a frame-index base.
+ const AllocaInst *AI = cast<AllocaInst>(Obj);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ Addr.setKind(Address::FrameIndexBase);
+ Addr.setFI(SI->second);
+ return true;
+ }
+ break;
+ }
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ // Canonicalize so that a constant operand, if any, is on the right.
+ if (isa<ConstantInt>(LHS))
+ std::swap(LHS, RHS);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ uint64_t TmpOffset = Addr.getOffset() + CI->getSExtValue();
+ // Only fold when the resulting offset stays non-negative.
+ if (int64_t(TmpOffset) >= 0) {
+ Addr.setOffset(TmpOffset);
+ return computeAddress(LHS, Addr);
+ }
+ }
+
+ // Otherwise try to fold both operands, undoing any partial update on
+ // failure.
+ Address Backup = Addr;
+ if (computeAddress(LHS, Addr) && computeAddress(RHS, Addr))
+ return true;
+ Addr = Backup;
+
+ break;
+ }
+ case Instruction::Sub: {
+ // Subs of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ int64_t TmpOffset = Addr.getOffset() - CI->getSExtValue();
+ // Only fold when the resulting offset stays non-negative.
+ if (TmpOffset >= 0) {
+ Addr.setOffset(TmpOffset);
+ return computeAddress(LHS, Addr);
+ }
+ }
+ break;
+ }
+ }
+ // Fallback: use the value itself as the base register; a zero register
+ // means it could not be materialized.
+ Addr.setReg(getRegForValue(Obj));
+ return Addr.getReg() != 0;
+}
+
+/// Make sure a register-based address actually has a base register: if the
+/// base register is still unset (0), emit a pointer-sized constant zero and
+/// use that as the base. Frame-index addresses are left untouched.
+void WebAssemblyFastISel::materializeLoadStoreOperands(Address &Addr) {
+  // Nothing to do for frame-index bases or an already assigned register.
+  if (!Addr.isRegBase())
+    return;
+  if (Addr.getReg() != 0)
+    return;
+
+  const bool Is64Bit = Subtarget->hasAddr64();
+  unsigned ZeroReg = createResultReg(Is64Bit ? &WebAssembly::I64RegClass
+                                             : &WebAssembly::I32RegClass);
+  unsigned ConstOpc =
+      Is64Bit ? WebAssembly::CONST_I64 : WebAssembly::CONST_I32;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ConstOpc),
+          ZeroReg)
+      .addImm(0);
+  Addr.setReg(ZeroReg);
+}
+
+/// Append the memory operands of a load/store to \p MIB, in this order: a
+/// placeholder alignment immediate, the offset (as a global address plus
+/// offset when \p Addr names a global, otherwise as an immediate), the base
+/// (register or frame index) and finally the memory operand \p MMO.
+void WebAssemblyFastISel::addLoadStoreOperands(const Address &Addr,
+                                               const MachineInstrBuilder &MIB,
+                                               MachineMemOperand *MMO) {
+  // Set the alignment operand (this is rewritten in SetP2AlignOperands).
+  // TODO: Disable SetP2AlignOperands for FastISel and just do it here.
+  MIB.addImm(0);
+
+  // Offset operand.
+  const GlobalValue *Global = Addr.getGlobalValue();
+  if (Global) {
+    MIB.addGlobalAddress(Global, Addr.getOffset());
+  } else {
+    MIB.addImm(Addr.getOffset());
+  }
+
+  // Base operand.
+  if (!Addr.isRegBase()) {
+    MIB.addFrameIndex(Addr.getFI());
+  } else {
+    MIB.addReg(Addr.getReg());
+  }
+
+  MIB.addMemOperand(MMO);
+}
+
+/// Zero-extend the i1 value held in \p Reg to a full i32.
+unsigned WebAssemblyFastISel::maskI1Value(unsigned Reg, const Value *V) {
+  const MVT::SimpleValueType BoolTy = MVT::i1;
+  return zeroExtendToI32(Reg, V, BoolTy);
+}
+
+/// Return a register usable as a branch/select condition for \p V.
+/// \p Not is set to true when the returned register holds the logical
+/// inverse of \p V: for an equality compare of an i32 against zero it follows
+/// the compare's polarity, and for a `not` it is always true. Otherwise the
+/// value itself is masked to i1 and \p Not is false.
+unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
+  // icmp eq/ne X, i32 0: test X directly.
+  const auto *Cmp = dyn_cast<ICmpInst>(V);
+  const auto *RHSConst =
+      Cmp ? dyn_cast<ConstantInt>(Cmp->getOperand(1)) : nullptr;
+  if (RHSConst && Cmp->isEquality() && RHSConst->isZero() &&
+      RHSConst->getType()->isIntegerTy(32)) {
+    Not = Cmp->isTrueWhenEqual();
+    return getRegForValue(Cmp->getOperand(0));
+  }
+
+  // not X: test X and report the inversion.
+  if (BinaryOperator::isNot(V)) {
+    Not = true;
+    return getRegForValue(BinaryOperator::getNotArgument(V));
+  }
+
+  // General case.
+  Not = false;
+  unsigned ValueReg = getRegForValue(V);
+  return maskI1Value(ValueReg, V);
+}
+
+/// Zero-extend the value in \p Reg from type \p From to i32.
+/// Returns 0 if \p Reg is 0 or \p From is not one of i1/i8/i16/i32. An i32
+/// input is simply copied; an i1 produced by a compare or by a zeroext
+/// argument is copied as well; everything else is masked with an AND.
+unsigned WebAssemblyFastISel::zeroExtendToI32(unsigned Reg, const Value *V,
+                                              MVT::SimpleValueType From) {
+  if (Reg == 0)
+    return 0;
+
+  if (From == MVT::i32)
+    return copyValue(Reg);
+
+  const bool IsNarrowInt =
+      From == MVT::i1 || From == MVT::i8 || From == MVT::i16;
+  if (!IsNarrowInt)
+    return 0;
+
+  // If the value is naturally an i1, we don't need to mask it.
+  // TODO: Recursively examine selects, phis, and, or, xor, constants.
+  if (From == MVT::i1 && V != nullptr) {
+    const bool AlreadyClean =
+        isa<CmpInst>(V) ||
+        (isa<Argument>(V) && cast<Argument>(V)->hasZExtAttr());
+    if (AlreadyClean)
+      return copyValue(Reg);
+  }
+
+  // Materialize the low-bits mask for the source width.
+  unsigned MaskReg = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::CONST_I32), MaskReg)
+      .addImm(~(~uint64_t(0) << MVT(From).getSizeInBits()));
+
+  // Clear everything above the source width.
+  unsigned Masked = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::AND_I32), Masked)
+      .addReg(Reg)
+      .addReg(MaskReg);
+
+  return Masked;
+}
+
+/// Sign-extend the value in \p Reg from type \p From to i32.
+/// Returns 0 if \p Reg is 0 or \p From is not one of i1/i8/i16/i32. An i32
+/// input is simply copied; narrower inputs are shifted left and then
+/// arithmetically shifted right by (32 - source width).
+unsigned WebAssemblyFastISel::signExtendToI32(unsigned Reg, const Value *V,
+                                              MVT::SimpleValueType From) {
+  if (Reg == 0)
+    return 0;
+
+  if (From == MVT::i32)
+    return copyValue(Reg);
+
+  const bool IsNarrowInt =
+      From == MVT::i1 || From == MVT::i8 || From == MVT::i16;
+  if (!IsNarrowInt)
+    return 0;
+
+  // Shift amount shared by both shifts.
+  unsigned ShiftAmtReg = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::CONST_I32), ShiftAmtReg)
+      .addImm(32 - MVT(From).getSizeInBits());
+
+  // Move the source sign bit into bit 31.
+  unsigned ShiftedLeft = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::SHL_I32), ShiftedLeft)
+      .addReg(Reg)
+      .addReg(ShiftAmtReg);
+
+  // Shift back, replicating the sign bit.
+  unsigned Extended = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::SHR_S_I32), Extended)
+      .addReg(ShiftedLeft)
+      .addReg(ShiftAmtReg);
+
+  return Extended;
+}
+
+/// Zero-extend the value in \p Reg from type \p From to type \p To.
+///
+/// For an i64 destination the value is first zero-extended to i32 and then
+/// widened with I64_EXTEND_U_I32 (an i64 source is just copied). Any other
+/// destination is handled as an extension to i32.
+///
+/// \returns The result register, or 0 if \p Reg is 0 or the source type is
+/// not supported. The zero check prevents emitting a copy or an extend that
+/// reads the invalid register 0.
+unsigned WebAssemblyFastISel::zeroExtend(unsigned Reg, const Value *V,
+                                         MVT::SimpleValueType From,
+                                         MVT::SimpleValueType To) {
+  // Propagate an upstream failure instead of building on register 0.
+  if (Reg == 0)
+    return 0;
+
+  if (To == MVT::i64) {
+    if (From == MVT::i64)
+      return copyValue(Reg);
+
+    Reg = zeroExtendToI32(Reg, V, From);
+    // The narrowing step fails for unsupported source types.
+    if (Reg == 0)
+      return 0;
+
+    unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(WebAssembly::I64_EXTEND_U_I32), Result)
+        .addReg(Reg);
+    return Result;
+  }
+
+  return zeroExtendToI32(Reg, V, From);
+}
+
+/// Sign-extend the value in \p Reg from type \p From to type \p To.
+///
+/// For an i64 destination the value is first sign-extended to i32 and then
+/// widened with I64_EXTEND_S_I32 (an i64 source is just copied). Any other
+/// destination is handled as an extension to i32.
+///
+/// \returns The result register, or 0 if \p Reg is 0 or the source type is
+/// not supported. The zero check prevents emitting a copy or an extend that
+/// reads the invalid register 0.
+unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V,
+                                         MVT::SimpleValueType From,
+                                         MVT::SimpleValueType To) {
+  // Propagate an upstream failure instead of building on register 0.
+  if (Reg == 0)
+    return 0;
+
+  if (To == MVT::i64) {
+    if (From == MVT::i64)
+      return copyValue(Reg);
+
+    Reg = signExtendToI32(Reg, V, From);
+    // The narrowing step fails for unsupported source types.
+    if (Reg == 0)
+      return 0;
+
+    unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(WebAssembly::I64_EXTEND_S_I32), Result)
+        .addReg(Reg);
+    return Result;
+  }
+
+  return signExtendToI32(Reg, V, From);
+}
+
+/// Return a register holding \p V zero-extended from its own type to the
+/// legal type that type is held in.
+/// \returns The register, or 0 if \p V could not be materialized or the
+/// extension is not supported.
+unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) {
+  MVT::SimpleValueType From = getSimpleType(V->getType());
+  MVT::SimpleValueType To = getLegalType(From);
+  unsigned VReg = getRegForValue(V);
+  // Report failure rather than extending the invalid register 0.
+  if (VReg == 0)
+    return 0;
+  return zeroExtend(VReg, V, From, To);
+}
+
+/// Return a register holding \p V sign-extended from its own type to the
+/// legal type that type is held in.
+/// \returns The register, or 0 if \p V could not be materialized or the
+/// extension is not supported.
+unsigned WebAssemblyFastISel::getRegForSignedValue(const Value *V) {
+  MVT::SimpleValueType From = getSimpleType(V->getType());
+  MVT::SimpleValueType To = getLegalType(From);
+  unsigned VReg = getRegForValue(V);
+  // Report failure rather than extending the invalid register 0.
+  if (VReg == 0)
+    return 0;
+  // Signed values must be sign-extended; zero-extending here would corrupt
+  // negative narrow integers.
+  return signExtend(VReg, V, From, To);
+}
+
+/// Return a register holding \p V promoted to its legal type, using the
+/// signed helper when \p IsSigned is true and the unsigned helper otherwise.
+unsigned WebAssemblyFastISel::getRegForPromotedValue(const Value *V,
+                                                     bool IsSigned) {
+  if (IsSigned)
+    return getRegForSignedValue(V);
+  return getRegForUnsignedValue(V);
+}
+
+/// Emit an EQZ_I32 of the i32 register \p Reg and return the register that
+/// holds the result.
+unsigned WebAssemblyFastISel::notValue(unsigned Reg) {
+  assert(MRI.getRegClass(Reg) == &WebAssembly::I32RegClass);
+
+  unsigned Inverted = createResultReg(&WebAssembly::I32RegClass);
+  const MCInstrDesc &EqzDesc = TII.get(WebAssembly::EQZ_I32);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, EqzDesc, Inverted)
+      .addReg(Reg);
+  return Inverted;
+}
+
+/// Emit a COPY of \p Reg into a fresh register of the same register class
+/// and return the new register.
+unsigned WebAssemblyFastISel::copyValue(unsigned Reg) {
+  const TargetRegisterClass *SrcClass = MRI.getRegClass(Reg);
+  unsigned Duplicate = createResultReg(SrcClass);
+  const MCInstrDesc &CopyDesc = TII.get(WebAssembly::COPY);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, CopyDesc, Duplicate)
+      .addReg(Reg);
+  return Duplicate;
+}
+
+/// Materialize the address of a static alloca into a pointer-sized register
+/// by copying its frame index. Returns 0 when \p AI has no entry in the
+/// static alloca map.
+unsigned WebAssemblyFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  auto Found = FuncInfo.StaticAllocaMap.find(AI);
+  if (Found == FuncInfo.StaticAllocaMap.end())
+    return 0;
+
+  const bool Is64Bit = Subtarget->hasAddr64();
+  unsigned AddrReg = createResultReg(Is64Bit ? &WebAssembly::I64RegClass
+                                             : &WebAssembly::I32RegClass);
+  unsigned CopyOpc = Is64Bit ? WebAssembly::COPY_I64 : WebAssembly::COPY_I32;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CopyOpc), AddrReg)
+      .addFrameIndex(Found->second);
+  return AddrReg;
+}
+
+/// Materialize a constant. Only global values are handled here: their address
+/// is produced with CONST_I32/CONST_I64 (the 64-bit form is used when the
+/// subtarget reports hasAddr64()).
+///
+/// \returns the result register, or 0 to let target-independent code handle
+/// every other kind of constant.
+unsigned WebAssemblyFastISel::fastMaterializeConstant(const Constant *C) {
+  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
+  if (!GV)
+    return 0; // Let target-independent code handle it.
+
+  const TargetRegisterClass *PtrRC = &WebAssembly::I32RegClass;
+  unsigned ConstOpc = WebAssembly::CONST_I32;
+  if (Subtarget->hasAddr64()) {
+    PtrRC = &WebAssembly::I64RegClass;
+    ConstOpc = WebAssembly::CONST_I64;
+  }
+
+  unsigned AddrReg = createResultReg(PtrRC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ConstOpc), AddrReg)
+      .addGlobalAddress(GV);
+  return AddrReg;
+}
+
+/// Lower the incoming formal arguments of the current function by emitting
+/// one ARGUMENT_* instruction per parameter and recording the parameter
+/// types in WebAssemblyFunctionInfo.
+///
+/// \returns false (leaving the work to the generic path) when the return
+/// cannot be lowered, the function is variadic, a parameter carries a
+/// ByVal/SwiftSelf/SwiftError/InAlloca/Nest attribute, or a parameter has a
+/// type this routine has no ARGUMENT_* opcode for.
+bool WebAssemblyFastISel::fastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  const AttributeSet &Attrs = F->getAttributes();
+  unsigned ArgNo = 0;
+  for (auto const &Arg : F->args()) {
+    // Attributes of parameter N are queried at index N + 1.
+    const unsigned AttrIdx = ArgNo + 1;
+    const bool HasUnsupportedAttr =
+        Attrs.hasAttribute(AttrIdx, Attribute::ByVal) ||
+        Attrs.hasAttribute(AttrIdx, Attribute::SwiftSelf) ||
+        Attrs.hasAttribute(AttrIdx, Attribute::SwiftError) ||
+        Attrs.hasAttribute(AttrIdx, Attribute::InAlloca) ||
+        Attrs.hasAttribute(AttrIdx, Attribute::Nest);
+    if (HasUnsupportedAttr)
+      return false;
+
+    Type *ArgTy = Arg.getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy() ||
+        (!Subtarget->hasSIMD128() && ArgTy->isVectorTy()))
+      return false;
+
+    unsigned ArgOpc;
+    const TargetRegisterClass *ArgRC;
+    switch (getSimpleType(ArgTy)) {
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      ArgOpc = WebAssembly::ARGUMENT_I32;
+      ArgRC = &WebAssembly::I32RegClass;
+      break;
+    case MVT::i64:
+      ArgOpc = WebAssembly::ARGUMENT_I64;
+      ArgRC = &WebAssembly::I64RegClass;
+      break;
+    case MVT::f32:
+      ArgOpc = WebAssembly::ARGUMENT_F32;
+      ArgRC = &WebAssembly::F32RegClass;
+      break;
+    case MVT::f64:
+      ArgOpc = WebAssembly::ARGUMENT_F64;
+      ArgRC = &WebAssembly::F64RegClass;
+      break;
+    case MVT::v16i8:
+      ArgOpc = WebAssembly::ARGUMENT_v16i8;
+      ArgRC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v8i16:
+      ArgOpc = WebAssembly::ARGUMENT_v8i16;
+      ArgRC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v4i32:
+      ArgOpc = WebAssembly::ARGUMENT_v4i32;
+      ArgRC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v4f32:
+      ArgOpc = WebAssembly::ARGUMENT_v4f32;
+      ArgRC = &WebAssembly::V128RegClass;
+      break;
+    default:
+      return false;
+    }
+
+    // The immediate operand is the zero-based position of the parameter.
+    unsigned ArgReg = createResultReg(ArgRC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ArgOpc), ArgReg)
+        .addImm(ArgNo);
+    updateValueMap(&Arg, ArgReg);
+
+    ++ArgNo;
+  }
+
+  MRI.addLiveIn(WebAssembly::ARGUMENTS);
+
+  // Record the legalized type of every parameter in the function info.
+  auto *MFI = MF->getInfo<WebAssemblyFunctionInfo>();
+  for (auto const &Arg : F->args()) {
+    MVT::SimpleValueType ParamVT = getLegalType(getSimpleType(Arg.getType()));
+    MFI->addParam(ParamVT);
+  }
+
+  return true;
+}
+
+/// Select a call instruction, direct or indirect.
+///
+/// Bails out (returns false) for musttail calls, inline asm, variadic
+/// callees, intrinsics, vector results without SIMD128, result or argument
+/// types without a simple value type / matching opcode, arguments carrying
+/// ByVal/SwiftSelf/SwiftError/InAlloca/Nest, and any operand (including the
+/// callee of an indirect call) that cannot be materialized in a register.
+///
+/// \returns true if the call was emitted.
+bool WebAssemblyFastISel::selectCall(const Instruction *I) {
+  const CallInst *Call = cast<CallInst>(I);
+
+  if (Call->isMustTailCall() || Call->isInlineAsm() ||
+      Call->getFunctionType()->isVarArg())
+    return false;
+
+  Function *Func = Call->getCalledFunction();
+  if (Func && Func->isIntrinsic())
+    return false;
+
+  FunctionType *FuncTy = Call->getFunctionType();
+  unsigned Opc;
+  bool IsDirect = Func != nullptr;
+  bool IsVoid = FuncTy->getReturnType()->isVoidTy();
+  // Only meaningful for non-void calls; initialized so it is never read
+  // indeterminate.
+  unsigned ResultReg = 0;
+  if (IsVoid) {
+    Opc = IsDirect ? WebAssembly::CALL_VOID : WebAssembly::PCALL_INDIRECT_VOID;
+  } else {
+    if (!Subtarget->hasSIMD128() && Call->getType()->isVectorTy())
+      return false;
+
+    MVT::SimpleValueType RetTy = getSimpleType(Call->getType());
+    switch (RetTy) {
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      Opc = IsDirect ? WebAssembly::CALL_I32 : WebAssembly::PCALL_INDIRECT_I32;
+      ResultReg = createResultReg(&WebAssembly::I32RegClass);
+      break;
+    case MVT::i64:
+      Opc = IsDirect ? WebAssembly::CALL_I64 : WebAssembly::PCALL_INDIRECT_I64;
+      ResultReg = createResultReg(&WebAssembly::I64RegClass);
+      break;
+    case MVT::f32:
+      Opc = IsDirect ? WebAssembly::CALL_F32 : WebAssembly::PCALL_INDIRECT_F32;
+      ResultReg = createResultReg(&WebAssembly::F32RegClass);
+      break;
+    case MVT::f64:
+      Opc = IsDirect ? WebAssembly::CALL_F64 : WebAssembly::PCALL_INDIRECT_F64;
+      ResultReg = createResultReg(&WebAssembly::F64RegClass);
+      break;
+    case MVT::v16i8:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v16i8 : WebAssembly::PCALL_INDIRECT_v16i8;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v8i16:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v8i16 : WebAssembly::PCALL_INDIRECT_v8i16;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v4i32:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v4i32 : WebAssembly::PCALL_INDIRECT_v4i32;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v4f32:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v4f32 : WebAssembly::PCALL_INDIRECT_v4f32;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    default:
+      return false;
+    }
+  }
+
+  // Materialize every argument, applying the extension its attribute asks
+  // for. Attributes of argument N are queried at index N + 1.
+  SmallVector<unsigned, 8> Args;
+  const AttributeSet &Attrs = Call->getAttributes();
+  for (unsigned i = 0, e = Call->getNumArgOperands(); i < e; ++i) {
+    Value *V = Call->getArgOperand(i);
+    MVT::SimpleValueType ArgTy = getSimpleType(V->getType());
+    if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
+      return false;
+
+    if (Attrs.hasAttribute(i + 1, Attribute::ByVal) ||
+        Attrs.hasAttribute(i + 1, Attribute::SwiftSelf) ||
+        Attrs.hasAttribute(i + 1, Attribute::SwiftError) ||
+        Attrs.hasAttribute(i + 1, Attribute::InAlloca) ||
+        Attrs.hasAttribute(i + 1, Attribute::Nest))
+      return false;
+
+    unsigned Reg;
+
+    if (Attrs.hasAttribute(i + 1, Attribute::SExt))
+      Reg = getRegForSignedValue(V);
+    else if (Attrs.hasAttribute(i + 1, Attribute::ZExt))
+      Reg = getRegForUnsignedValue(V);
+    else
+      Reg = getRegForValue(V);
+
+    if (Reg == 0)
+      return false;
+
+    Args.push_back(Reg);
+  }
+
+  // Materialize the callee of an indirect call *before* the call instruction
+  // is created: doing it afterwards could place the code computing the callee
+  // after the call that uses it, and a failure (0) could no longer be
+  // reported once the call had been inserted.
+  unsigned CalleeReg = 0;
+  if (!IsDirect) {
+    CalleeReg = getRegForValue(Call->getCalledValue());
+    if (CalleeReg == 0)
+      return false;
+  }
+
+  auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+
+  if (!IsVoid)
+    MIB.addReg(ResultReg, RegState::Define);
+
+  if (IsDirect)
+    MIB.addGlobalAddress(Func);
+  else
+    MIB.addReg(CalleeReg);
+
+  for (unsigned ArgReg : Args)
+    MIB.addReg(ArgReg);
+
+  if (!IsVoid)
+    updateValueMap(Call, ResultReg);
+  return true;
+}
+
+/// Select a `select` instruction for i1/i8/i16/i32, i64, f32 and f64 results
+/// using the matching SELECT_* opcode. When getRegForI1Value() reports the
+/// condition as inverted, the two value operands are swapped instead of
+/// negating the condition.
+///
+/// \returns false if an operand cannot be materialized or the result type is
+/// not one of the types listed above.
+bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
+  const SelectInst *Sel = cast<SelectInst>(I);
+
+  bool Inverted;
+  unsigned CondReg = getRegForI1Value(Sel->getCondition(), Inverted);
+  if (!CondReg)
+    return false;
+
+  unsigned IfTrue = getRegForValue(Sel->getTrueValue());
+  if (!IfTrue)
+    return false;
+
+  unsigned IfFalse = getRegForValue(Sel->getFalseValue());
+  if (!IfFalse)
+    return false;
+
+  if (Inverted)
+    std::swap(IfTrue, IfFalse);
+
+  unsigned SelOpc;
+  const TargetRegisterClass *SelRC;
+  switch (getSimpleType(Sel->getType())) {
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    SelOpc = WebAssembly::SELECT_I32;
+    SelRC = &WebAssembly::I32RegClass;
+    break;
+  case MVT::i64:
+    SelOpc = WebAssembly::SELECT_I64;
+    SelRC = &WebAssembly::I64RegClass;
+    break;
+  case MVT::f32:
+    SelOpc = WebAssembly::SELECT_F32;
+    SelRC = &WebAssembly::F32RegClass;
+    break;
+  case MVT::f64:
+    SelOpc = WebAssembly::SELECT_F64;
+    SelRC = &WebAssembly::F64RegClass;
+    break;
+  default:
+    return false;
+  }
+
+  // Operand order: true value, false value, condition.
+  unsigned Chosen = createResultReg(SelRC);
+  auto MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelOpc), Chosen);
+  MIB.addReg(IfTrue);
+  MIB.addReg(IfFalse);
+  MIB.addReg(CondReg);
+
+  updateValueMap(Sel, Chosen);
+  return true;
+}
+
+/// Select a `trunc` instruction. A 64-bit source is narrowed with
+/// I32_WRAP_I64; for any other source the operand's register is reused
+/// unchanged as the result.
+///
+/// \returns false if the operand cannot be materialized.
+bool WebAssemblyFastISel::selectTrunc(const Instruction *I) {
+  const TruncInst *Trunc = cast<TruncInst>(I);
+  const Value *Src = Trunc->getOperand(0);
+
+  const unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  unsigned OutReg = SrcReg;
+  if (Src->getType()->isIntegerTy(64)) {
+    OutReg = createResultReg(&WebAssembly::I32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(WebAssembly::I32_WRAP_I64), OutReg)
+        .addReg(SrcReg);
+  }
+
+  updateValueMap(Trunc, OutReg);
+  return true;
+}
+
+/// Select a `zext` instruction by zero-extending the operand from its simple
+/// type to the legalized result type.
+///
+/// \returns false if the operand cannot be materialized or the extension
+/// helper fails.
+bool WebAssemblyFastISel::selectZExt(const Instruction *I) {
+  const ZExtInst *ZExt = cast<ZExtInst>(I);
+
+  const Value *Op = ZExt->getOperand(0);
+  MVT::SimpleValueType From = getSimpleType(Op->getType());
+  MVT::SimpleValueType To = getLegalType(getSimpleType(ZExt->getType()));
+  // getRegForValue() signals failure with 0; check before extending so an
+  // invalid register never reaches the extension helpers.
+  unsigned In = getRegForValue(Op);
+  if (In == 0)
+    return false;
+  unsigned Reg = zeroExtend(In, Op, From, To);
+  if (Reg == 0)
+    return false;
+
+  updateValueMap(ZExt, Reg);
+  return true;
+}
+
+/// Select a `sext` instruction by sign-extending the operand from its simple
+/// type to the legalized result type.
+///
+/// \returns false if the operand cannot be materialized or the extension
+/// helper fails.
+bool WebAssemblyFastISel::selectSExt(const Instruction *I) {
+  const SExtInst *SExt = cast<SExtInst>(I);
+
+  const Value *Op = SExt->getOperand(0);
+  MVT::SimpleValueType From = getSimpleType(Op->getType());
+  MVT::SimpleValueType To = getLegalType(getSimpleType(SExt->getType()));
+  // getRegForValue() signals failure with 0; check before extending so an
+  // invalid register never reaches the extension helpers.
+  unsigned In = getRegForValue(Op);
+  if (In == 0)
+    return false;
+  unsigned Reg = signExtend(In, Op, From, To);
+  if (Reg == 0)
+    return false;
+
+  updateValueMap(SExt, Reg);
+  return true;
+}
+
+/// Select an integer comparison. The 64-bit opcode is used when the first
+/// operand's simple type is i64, the 32-bit opcode otherwise. Operands are
+/// fetched through getRegForPromotedValue(), signed for the signed
+/// predicates and unsigned for all others.
+///
+/// \returns false for an unhandled predicate or an operand that cannot be
+/// materialized.
+bool WebAssemblyFastISel::selectICmp(const Instruction *I) {
+  const ICmpInst *ICmp = cast<ICmpInst>(I);
+
+  const bool Is64 = getSimpleType(ICmp->getOperand(0)->getType()) == MVT::i64;
+
+  // Look up both width variants of the opcode, then pick one below.
+  unsigned Opc32, Opc64;
+  bool SignedCompare = false;
+  switch (ICmp->getPredicate()) {
+  case ICmpInst::ICMP_EQ:
+    Opc32 = WebAssembly::EQ_I32;
+    Opc64 = WebAssembly::EQ_I64;
+    break;
+  case ICmpInst::ICMP_NE:
+    Opc32 = WebAssembly::NE_I32;
+    Opc64 = WebAssembly::NE_I64;
+    break;
+  case ICmpInst::ICMP_UGT:
+    Opc32 = WebAssembly::GT_U_I32;
+    Opc64 = WebAssembly::GT_U_I64;
+    break;
+  case ICmpInst::ICMP_UGE:
+    Opc32 = WebAssembly::GE_U_I32;
+    Opc64 = WebAssembly::GE_U_I64;
+    break;
+  case ICmpInst::ICMP_ULT:
+    Opc32 = WebAssembly::LT_U_I32;
+    Opc64 = WebAssembly::LT_U_I64;
+    break;
+  case ICmpInst::ICMP_ULE:
+    Opc32 = WebAssembly::LE_U_I32;
+    Opc64 = WebAssembly::LE_U_I64;
+    break;
+  case ICmpInst::ICMP_SGT:
+    Opc32 = WebAssembly::GT_S_I32;
+    Opc64 = WebAssembly::GT_S_I64;
+    SignedCompare = true;
+    break;
+  case ICmpInst::ICMP_SGE:
+    Opc32 = WebAssembly::GE_S_I32;
+    Opc64 = WebAssembly::GE_S_I64;
+    SignedCompare = true;
+    break;
+  case ICmpInst::ICMP_SLT:
+    Opc32 = WebAssembly::LT_S_I32;
+    Opc64 = WebAssembly::LT_S_I64;
+    SignedCompare = true;
+    break;
+  case ICmpInst::ICMP_SLE:
+    Opc32 = WebAssembly::LE_S_I32;
+    Opc64 = WebAssembly::LE_S_I64;
+    SignedCompare = true;
+    break;
+  default:
+    return false;
+  }
+  const unsigned CmpOpc = Is64 ? Opc64 : Opc32;
+
+  const unsigned Left =
+      getRegForPromotedValue(ICmp->getOperand(0), SignedCompare);
+  if (!Left)
+    return false;
+
+  const unsigned Right =
+      getRegForPromotedValue(ICmp->getOperand(1), SignedCompare);
+  if (!Right)
+    return false;
+
+  // The comparison result always lives in an I32 register.
+  const unsigned Flag = createResultReg(&WebAssembly::I32RegClass);
+  auto MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), Flag);
+  MIB.addReg(Left);
+  MIB.addReg(Right);
+  updateValueMap(ICmp, Flag);
+  return true;
+}
+
+/// Select a floating-point comparison. The f64 opcode is used when the first
+/// operand's simple type is f64, the f32 opcode otherwise. OEQ, UNE and the
+/// ordered relational predicates map directly onto an opcode; UGT/UGE/ULT/ULE
+/// are emitted as the opposite ordered comparison followed by notValue().
+///
+/// \returns false for any other predicate or an operand that cannot be
+/// materialized.
+bool WebAssemblyFastISel::selectFCmp(const Instruction *I) {
+  const FCmpInst *FCmp = cast<FCmpInst>(I);
+
+  const unsigned Left = getRegForValue(FCmp->getOperand(0));
+  if (!Left)
+    return false;
+
+  const unsigned Right = getRegForValue(FCmp->getOperand(1));
+  if (!Right)
+    return false;
+
+  const bool Is64 = getSimpleType(FCmp->getOperand(0)->getType()) == MVT::f64;
+
+  // Look up both width variants of the opcode, then pick one below.
+  unsigned Opc32, Opc64;
+  bool Negate = false;
+  switch (FCmp->getPredicate()) {
+  case FCmpInst::FCMP_OEQ:
+    Opc32 = WebAssembly::EQ_F32;
+    Opc64 = WebAssembly::EQ_F64;
+    break;
+  case FCmpInst::FCMP_UNE:
+    Opc32 = WebAssembly::NE_F32;
+    Opc64 = WebAssembly::NE_F64;
+    break;
+  case FCmpInst::FCMP_OGT:
+    Opc32 = WebAssembly::GT_F32;
+    Opc64 = WebAssembly::GT_F64;
+    break;
+  case FCmpInst::FCMP_OGE:
+    Opc32 = WebAssembly::GE_F32;
+    Opc64 = WebAssembly::GE_F64;
+    break;
+  case FCmpInst::FCMP_OLT:
+    Opc32 = WebAssembly::LT_F32;
+    Opc64 = WebAssembly::LT_F64;
+    break;
+  case FCmpInst::FCMP_OLE:
+    Opc32 = WebAssembly::LE_F32;
+    Opc64 = WebAssembly::LE_F64;
+    break;
+  case FCmpInst::FCMP_UGT:
+    Opc32 = WebAssembly::LE_F32;
+    Opc64 = WebAssembly::LE_F64;
+    Negate = true;
+    break;
+  case FCmpInst::FCMP_UGE:
+    Opc32 = WebAssembly::LT_F32;
+    Opc64 = WebAssembly::LT_F64;
+    Negate = true;
+    break;
+  case FCmpInst::FCMP_ULT:
+    Opc32 = WebAssembly::GE_F32;
+    Opc64 = WebAssembly::GE_F64;
+    Negate = true;
+    break;
+  case FCmpInst::FCMP_ULE:
+    Opc32 = WebAssembly::GT_F32;
+    Opc64 = WebAssembly::GT_F64;
+    Negate = true;
+    break;
+  default:
+    return false;
+  }
+  const unsigned CmpOpc = Is64 ? Opc64 : Opc32;
+
+  unsigned Flag = createResultReg(&WebAssembly::I32RegClass);
+  auto MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), Flag);
+  MIB.addReg(Left);
+  MIB.addReg(Right);
+
+  if (Negate)
+    Flag = notValue(Flag);
+
+  updateValueMap(FCmp, Flag);
+  return true;
+}
+
+/// Select a `bitcast` instruction.
+///
+/// \returns false if either type is not a simple value type, the operand
+/// cannot be materialized, or no bitcast instruction could be emitted.
+bool WebAssemblyFastISel::selectBitCast(const Instruction *I) {
+  // Target-independent code can handle this, except it doesn't set the dead
+  // flag on the ARGUMENTS clobber, so we have to do that manually in order
+  // to satisfy code that expects this of isBitcast() instructions.
+  EVT VT = TLI.getValueType(DL, I->getOperand(0)->getType());
+  EVT RetVT = TLI.getValueType(DL, I->getType());
+  if (!VT.isSimple() || !RetVT.isSimple())
+    return false;
+
+  // Materialize the operand once and reject failure (0) up front, so an
+  // invalid register is neither recorded in the value map nor passed on to
+  // the emitter.
+  unsigned In = getRegForValue(I->getOperand(0));
+  if (In == 0)
+    return false;
+
+  if (VT == RetVT) {
+    // No-op bitcast.
+    updateValueMap(I, In);
+    return true;
+  }
+
+  unsigned Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(),
+                                        In, I->getOperand(0)->hasOneUse());
+  if (!Reg)
+    return false;
+  // The instruction just emitted sits immediately before the insertion point.
+  MachineBasicBlock::iterator Iter = FuncInfo.InsertPt;
+  --Iter;
+  assert(Iter->isBitcast());
+  Iter->setPhysRegsDeadExcept(ArrayRef<unsigned>(), TRI);
+  updateValueMap(I, Reg);
+  return true;
+}
+
+/// Select a non-atomic scalar load. i1/i8 and i16 use the unsigned 8-/16-bit
+/// loads into an I32 register; i32, i64, f32 and f64 use the plain load of
+/// their type.
+///
+/// \returns false for atomic loads, vector loads without SIMD128, addresses
+/// computeAddress() cannot handle, and any other loaded type.
+bool WebAssemblyFastISel::selectLoad(const Instruction *I) {
+  const LoadInst *Load = cast<LoadInst>(I);
+  Type *LoadTy = Load->getType();
+  if (Load->isAtomic() ||
+      (!Subtarget->hasSIMD128() && LoadTy->isVectorTy()))
+    return false;
+
+  Address Addr;
+  if (!computeAddress(Load->getPointerOperand(), Addr))
+    return false;
+
+  // TODO: Fold a following sign-/zero-extend into the load instruction.
+
+  unsigned LoadOpc;
+  const TargetRegisterClass *LoadRC;
+  switch (getSimpleType(LoadTy)) {
+  case MVT::i1:
+  case MVT::i8:
+    LoadOpc = WebAssembly::LOAD8_U_I32;
+    LoadRC = &WebAssembly::I32RegClass;
+    break;
+  case MVT::i16:
+    LoadOpc = WebAssembly::LOAD16_U_I32;
+    LoadRC = &WebAssembly::I32RegClass;
+    break;
+  case MVT::i32:
+    LoadOpc = WebAssembly::LOAD_I32;
+    LoadRC = &WebAssembly::I32RegClass;
+    break;
+  case MVT::i64:
+    LoadOpc = WebAssembly::LOAD_I64;
+    LoadRC = &WebAssembly::I64RegClass;
+    break;
+  case MVT::f32:
+    LoadOpc = WebAssembly::LOAD_F32;
+    LoadRC = &WebAssembly::F32RegClass;
+    break;
+  case MVT::f64:
+    LoadOpc = WebAssembly::LOAD_F64;
+    LoadRC = &WebAssembly::F64RegClass;
+    break;
+  default:
+    return false;
+  }
+
+  materializeLoadStoreOperands(Addr);
+
+  const unsigned Loaded = createResultReg(LoadRC);
+  auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                     TII.get(LoadOpc), Loaded);
+
+  addLoadStoreOperands(Addr, MIB, createMachineMemOperandFor(Load));
+
+  updateValueMap(Load, Loaded);
+  return true;
+}
+
+/// Select a non-atomic scalar store. i1 and i8 values use the 8-bit store
+/// (an i1 value is first passed through maskI1Value()), i16 uses the 16-bit
+/// store, and i32, i64, f32 and f64 use the plain store of their type.
+///
+/// \returns false for atomic stores, vector stores without SIMD128,
+/// addresses computeAddress() cannot handle, any other stored type, and a
+/// value operand that cannot be materialized.
+bool WebAssemblyFastISel::selectStore(const Instruction *I) {
+  const StoreInst *Store = cast<StoreInst>(I);
+  if (Store->isAtomic())
+    return false;
+  if (!Subtarget->hasSIMD128() &&
+      Store->getValueOperand()->getType()->isVectorTy())
+    return false;
+
+  Address Addr;
+  if (!computeAddress(Store->getPointerOperand(), Addr))
+    return false;
+
+  const MVT::SimpleValueType StoredVT =
+      getSimpleType(Store->getValueOperand()->getType());
+  // An i1 shares the byte store with i8 but needs masking before it is
+  // written.
+  const bool NeedsI1Mask = StoredVT == MVT::i1;
+
+  unsigned StoreOpc;
+  switch (StoredVT) {
+  case MVT::i1:
+  case MVT::i8:
+    StoreOpc = WebAssembly::STORE8_I32;
+    break;
+  case MVT::i16:
+    StoreOpc = WebAssembly::STORE16_I32;
+    break;
+  case MVT::i32:
+    StoreOpc = WebAssembly::STORE_I32;
+    break;
+  case MVT::i64:
+    StoreOpc = WebAssembly::STORE_I64;
+    break;
+  case MVT::f32:
+    StoreOpc = WebAssembly::STORE_F32;
+    break;
+  case MVT::f64:
+    StoreOpc = WebAssembly::STORE_F64;
+    break;
+  default:
+    return false;
+  }
+
+  materializeLoadStoreOperands(Addr);
+
+  unsigned Stored = getRegForValue(Store->getValueOperand());
+  if (!Stored)
+    return false;
+  if (NeedsI1Mask)
+    Stored = maskI1Value(Stored, Store->getValueOperand());
+
+  auto MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(StoreOpc));
+
+  // Address operands come first, the stored value last.
+  addLoadStoreOperands(Addr, MIB, createMachineMemOperandFor(Store));
+
+  MIB.addReg(Stored);
+  return true;
+}
+
+/// Select a branch. Unconditional branches go through fastEmitBranch();
+/// conditional ones become BR_IF to the true successor, or BR_UNLESS when
+/// getRegForI1Value() reports the condition as inverted, followed by
+/// finishCondBranch().
+///
+/// \returns false if the condition cannot be materialized.
+bool WebAssemblyFastISel::selectBr(const Instruction *I) {
+  const BranchInst *Br = cast<BranchInst>(I);
+  MachineBasicBlock *TakenMBB = FuncInfo.MBBMap[Br->getSuccessor(0)];
+
+  if (Br->isUnconditional()) {
+    fastEmitBranch(TakenMBB, Br->getDebugLoc());
+    return true;
+  }
+
+  MachineBasicBlock *OtherMBB = FuncInfo.MBBMap[Br->getSuccessor(1)];
+
+  bool Inverted;
+  const unsigned CondReg = getRegForI1Value(Br->getCondition(), Inverted);
+  if (!CondReg)
+    return false;
+
+  const unsigned BrOpc = Inverted ? WebAssembly::BR_UNLESS : WebAssembly::BR_IF;
+
+  auto MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc));
+  MIB.addMBB(TakenMBB);
+  MIB.addReg(CondReg);
+
+  finishCondBranch(Br->getParent(), TakenMBB, OtherMBB);
+  return true;
+}
+
+/// Select a `ret` instruction. A return without operands becomes RETURN_VOID;
+/// otherwise the RETURN_* opcode matching the value's simple type is used and
+/// the value is sign- or zero-extended when the function's return attributes
+/// (attribute index 0) carry SExt or ZExt.
+///
+/// \returns false if the return cannot be lowered, the value is a vector
+/// without SIMD128, its type has no matching opcode, or it cannot be
+/// materialized.
+bool WebAssemblyFastISel::selectRet(const Instruction *I) {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+
+  if (Ret->getNumOperands() == 0) {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(WebAssembly::RETURN_VOID));
+    return true;
+  }
+
+  Value *RetVal = Ret->getOperand(0);
+  if (!Subtarget->hasSIMD128() && RetVal->getType()->isVectorTy())
+    return false;
+
+  unsigned RetOpc;
+  switch (getSimpleType(RetVal->getType())) {
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    RetOpc = WebAssembly::RETURN_I32;
+    break;
+  case MVT::i64:
+    RetOpc = WebAssembly::RETURN_I64;
+    break;
+  case MVT::f32:
+    RetOpc = WebAssembly::RETURN_F32;
+    break;
+  case MVT::f64:
+    RetOpc = WebAssembly::RETURN_F64;
+    break;
+  case MVT::v16i8:
+    RetOpc = WebAssembly::RETURN_v16i8;
+    break;
+  case MVT::v8i16:
+    RetOpc = WebAssembly::RETURN_v8i16;
+    break;
+  case MVT::v4i32:
+    RetOpc = WebAssembly::RETURN_v4i32;
+    break;
+  case MVT::v4f32:
+    RetOpc = WebAssembly::RETURN_v4f32;
+    break;
+  default:
+    return false;
+  }
+
+  // SExt takes precedence over ZExt, matching the order of the checks.
+  const AttributeSet &FnAttrs = FuncInfo.Fn->getAttributes();
+  unsigned RetReg;
+  if (FnAttrs.hasAttribute(0, Attribute::SExt))
+    RetReg = getRegForSignedValue(RetVal);
+  else if (FnAttrs.hasAttribute(0, Attribute::ZExt))
+    RetReg = getRegForUnsignedValue(RetVal);
+  else
+    RetReg = getRegForValue(RetVal);
+
+  if (!RetReg)
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc))
+      .addReg(RetReg);
+  return true;
+}
+
+/// Select an `unreachable` instruction by emitting UNREACHABLE. Always
+/// succeeds; \p I itself is not inspected.
+bool WebAssemblyFastISel::selectUnreachable(const Instruction *I) {
+  const auto &TrapDesc = TII.get(WebAssembly::UNREACHABLE);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TrapDesc);
+  return true;
+}
+
+/// Entry point for target-specific selection: dispatch on the IR opcode to
+/// the matching select* routine. A call that selectCall() declines, and every
+/// opcode without a dedicated routine, is forwarded to selectOperator().
+bool WebAssemblyFastISel::fastSelectInstruction(const Instruction *I) {
+  const unsigned IROpcode = I->getOpcode();
+  switch (IROpcode) {
+  case Instruction::Call:
+    // Unlike the other cases, a failed call selection still gets a chance in
+    // the target-independent selector below.
+    if (selectCall(I))
+      return true;
+    break;
+  case Instruction::Select:
+    return selectSelect(I);
+  case Instruction::Trunc:
+    return selectTrunc(I);
+  case Instruction::ZExt:
+    return selectZExt(I);
+  case Instruction::SExt:
+    return selectSExt(I);
+  case Instruction::ICmp:
+    return selectICmp(I);
+  case Instruction::FCmp:
+    return selectFCmp(I);
+  case Instruction::BitCast:
+    return selectBitCast(I);
+  case Instruction::Load:
+    return selectLoad(I);
+  case Instruction::Store:
+    return selectStore(I);
+  case Instruction::Br:
+    return selectBr(I);
+  case Instruction::Ret:
+    return selectRet(I);
+  case Instruction::Unreachable:
+    return selectUnreachable(I);
+  default:
+    break;
+  }
+
+  // Fall back to target-independent instruction selection.
+  return selectOperator(I, I->getOpcode());
+}
+
+/// Factory for the WebAssembly fast instruction selector. The returned object
+/// is allocated with `new`; the caller receives the raw pointer.
+FastISel *WebAssembly::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                      const TargetLibraryInfo *LibInfo) {
+  FastISel *Selector = new WebAssemblyFastISel(FuncInfo, LibInfo);
+  return Selector;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
new file mode 100644
index 000000000000..2bbf7a2b42f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -0,0 +1,296 @@
+//=- WebAssemblyFixIrreducibleControlFlow.cpp - Fix irreducible control flow -//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass that transforms irreducible control flow
+/// into reducible control flow. Irreducible control flow means multiple-entry
+/// loops; they appear as CFG cycles that are not recorded in MachineLoopInfo
+/// due to being unnatural.
+///
+/// Note that LLVM has a generic pass that lowers irreducible control flow, but
+/// it linearizes control flow, turning diamonds into two triangles, which is
+/// both unnecessary and undesirable for WebAssembly.
+///
+/// TODO: The transformation implemented here handles all irreducible control
+/// flow, without exponential code-size expansion, though it does so by creating
+/// inefficient code in many cases. Ideally, we should add other
+/// transformations, including code-duplicating cases, which can be more
+/// efficient in common cases, and they can fall back to this conservative
+/// implementation as needed.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fix-irreducible-control-flow"
+
+namespace {
+/// Machine-function pass that rewrites irreducible control flow into
+/// reducible control flow, one loop (or the function body) at a time.
+class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {}
+
+private:
+  /// Human-readable pass name.
+  StringRef getPassName() const override {
+    return "WebAssembly Fix Irreducible Control Flow";
+  }
+
+  /// Declares the analyses this pass needs and reports as preserved:
+  /// the machine dominator tree and machine loop info.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// Pass entry point; defined out of line.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  /// Process a single loop, or the whole function body when \p Loop is null.
+  bool VisitLoop(MachineFunction &MF, MachineLoopInfo &MLI, MachineLoop *Loop);
+};
+} // end anonymous namespace
+
+// Storage for the pass identifier declared in the class.
+char WebAssemblyFixIrreducibleControlFlow::ID = 0;
+
+/// Factory for the irreducible-control-flow fixing pass. The returned object
+/// is allocated with `new`; the caller receives the raw pointer.
+FunctionPass *llvm::createWebAssemblyFixIrreducibleControlFlow() {
+  FunctionPass *Pass = new WebAssemblyFixIrreducibleControlFlow();
+  return Pass;
+}
+
+namespace {
+
+/// A utility for walking the blocks of a loop, handling a nested inner
+/// loop as a monolithic conceptual block.
+///
+/// A MetaBlock is identified by a single representative block (the block
+/// itself, or the header of the wrapped loop) and caches the predecessor and
+/// successor lists computed at construction time.
+class MetaBlock {
+  MachineBasicBlock *Block;
+  SmallVector<MachineBasicBlock *, 2> Preds;
+  SmallVector<MachineBasicBlock *, 2> Succs;
+
+public:
+  /// Wrap a plain block: predecessors and successors are copied from \p MBB.
+  explicit MetaBlock(MachineBasicBlock *MBB)
+      : Block(MBB), Preds(MBB->pred_begin(), MBB->pred_end()),
+        Succs(MBB->succ_begin(), MBB->succ_end()) {}
+
+  /// Wrap a whole loop: the representative block is the loop header, the
+  /// successors are the loop's exit blocks, and the predecessors are the
+  /// header's predecessors that lie outside the loop.
+  explicit MetaBlock(MachineLoop *Loop) : Block(Loop->getHeader()) {
+    Loop->getExitBlocks(Succs);
+    for (MachineBasicBlock *Pred : Block->predecessors())
+      if (!Loop->contains(Pred))
+        Preds.push_back(Pred);
+  }
+
+  /// The representative block.
+  MachineBasicBlock *getBlock() const { return Block; }
+
+  const SmallVectorImpl<MachineBasicBlock *> &predecessors() const {
+    return Preds;
+  }
+  const SmallVectorImpl<MachineBasicBlock *> &successors() const {
+    return Succs;
+  }
+
+  // Identity is decided by the representative block only. The operators are
+  // const-qualified so they can also be applied to const MetaBlocks.
+  bool operator==(const MetaBlock &MBB) const { return Block == MBB.Block; }
+  bool operator!=(const MetaBlock &MBB) const { return Block != MBB.Block; }
+};
+
+/// A MetaBlock together with a cursor over its successor list, used as a DFS
+/// stack frame: Next() hands out the successors one at a time.
+class SuccessorList final : public MetaBlock {
+  size_t Cursor; // Position of the next successor to hand out.
+  size_t Count;  // Number of successors captured at construction.
+
+public:
+  explicit SuccessorList(MachineBasicBlock *MBB)
+      : MetaBlock(MBB), Cursor(0), Count(successors().size()) {}
+
+  explicit SuccessorList(MachineLoop *Loop)
+      : MetaBlock(Loop), Cursor(0), Count(successors().size()) {}
+
+  /// True while at least one successor has not been returned yet.
+  bool HasNext() const { return Cursor != Count; }
+
+  /// Return the next successor and advance the cursor. Must only be called
+  /// while HasNext() is true.
+  MachineBasicBlock *Next() {
+    assert(HasNext());
+    MachineBasicBlock *Succ = successors()[Cursor];
+    ++Cursor;
+    return Succ;
+  }
+};
+
+} // end anonymous namespace
+
+/// Look for irreducible control flow directly inside \p Loop (or inside the
+/// function body when \p Loop is null) and, if any is found, reroute the
+/// offending edges through a new dispatch block that ends in a br_table.
+///
+/// \param MF the function being transformed; new blocks are appended to it.
+/// \param MLI loop info, used to recognize nested loops and to assign the
+/// newly created blocks to \p Loop.
+/// \param Loop the loop to process, or null for the function body.
+/// \returns true if the function was changed.
+bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
+ MachineLoopInfo &MLI,
+ MachineLoop *Loop) {
+ // For the function body, the entry block plays the role of the header.
+ MachineBasicBlock *Header = Loop ? Loop->getHeader() : &*MF.begin();
+ SetVector<MachineBasicBlock *> RewriteSuccs;
+
+ // DFS through Loop's body, looking for irreducible control flow. Loop is
+ // natural, and we stay in its body, and we treat any nested loops
+ // monolithically, so any cycles we encounter indicate irreducibility.
+ SmallPtrSet<MachineBasicBlock *, 8> OnStack;
+ SmallPtrSet<MachineBasicBlock *, 8> Visited;
+ SmallVector<SuccessorList, 4> LoopWorklist;
+ LoopWorklist.push_back(SuccessorList(Header));
+ OnStack.insert(Header);
+ Visited.insert(Header);
+ while (!LoopWorklist.empty()) {
+ SuccessorList &Top = LoopWorklist.back();
+ if (Top.HasNext()) {
+ MachineBasicBlock *Next = Top.Next();
+ // Edges back to the header and edges leaving the loop are not followed.
+ if (Next == Header || (Loop && !Loop->contains(Next)))
+ continue;
+ if (LLVM_LIKELY(OnStack.insert(Next).second)) {
+ // Next was not on the DFS stack. If it was already fully explored
+ // earlier, undo the stack insertion and move on.
+ if (!Visited.insert(Next).second) {
+ OnStack.erase(Next);
+ continue;
+ }
+ // A block belonging to a different (nested) loop is walked as one unit.
+ MachineLoop *InnerLoop = MLI.getLoopFor(Next);
+ if (InnerLoop != Loop)
+ LoopWorklist.push_back(SuccessorList(InnerLoop));
+ else
+ LoopWorklist.push_back(SuccessorList(Next));
+ } else {
+ // Next is already on the DFS stack: this edge closes a cycle, so the
+ // current block's successors will need rewriting.
+ RewriteSuccs.insert(Top.getBlock());
+ }
+ continue;
+ }
+ // All successors handled: pop this frame.
+ OnStack.erase(Top.getBlock());
+ LoopWorklist.pop_back();
+ }
+
+ // Most likely, we didn't find any irreducible control flow.
+ if (LLVM_LIKELY(RewriteSuccs.empty()))
+ return false;
+
+ DEBUG(dbgs() << "Irreducible control flow detected!\n");
+
+ // Ok. We have irreducible control flow! Create a dispatch block which will
+ // contain a jump table to any block in the problematic set of blocks.
+ MachineBasicBlock *Dispatch = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), Dispatch);
+ MLI.changeLoopFor(Dispatch, Loop);
+
+ // Add the jump table.
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*Dispatch, Dispatch->end(), DebugLoc(),
+ TII.get(WebAssembly::BR_TABLE_I32));
+
+ // Add the register which will be used to tell the jump table which block to
+ // jump to.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ MIB.addReg(Reg);
+
+ // Collect all the blocks which need to have their successors rewritten,
+ // add the successors to the jump table, and remember their index.
+ DenseMap<MachineBasicBlock *, unsigned> Indices;
+ SmallVector<MachineBasicBlock *, 4> SuccWorklist(RewriteSuccs.begin(),
+ RewriteSuccs.end());
+ while (!SuccWorklist.empty()) {
+ MachineBasicBlock *MBB = SuccWorklist.pop_back_val();
+ // Insert a placeholder index; skip blocks that already have an entry.
+ auto Pair = Indices.insert(std::make_pair(MBB, 0));
+ if (!Pair.second)
+ continue;
+
+ // The first operand of the br_table is the index register added above, so
+ // the next free table slot is the explicit operand count minus one.
+ unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
+ DEBUG(dbgs() << "MBB#" << MBB->getNumber() << " has index " << Index
+ << "\n");
+
+ Pair.first->second = Index;
+ // Every predecessor of a dispatch target must have its edge rerouted.
+ for (auto Pred : MBB->predecessors())
+ RewriteSuccs.insert(Pred);
+
+ MIB.addMBB(MBB);
+ Dispatch->addSuccessor(MBB);
+
+ // Continue with MBB's successors that stay inside the region being
+ // processed (not the header, not outside the loop).
+ MetaBlock Meta(MBB);
+ for (auto *Succ : Meta.successors())
+ if (Succ != Header && (!Loop || Loop->contains(Succ)))
+ SuccWorklist.push_back(Succ);
+ }
+
+ // Rewrite the problematic successors for every block in RewriteSuccs.
+ // For simplicity, we just introduce a new block for every edge we need to
+ // rewrite. Fancier things are possible.
+ for (MachineBasicBlock *MBB : RewriteSuccs) {
+ // Maps an original successor to the split block that replaces it.
+ DenseMap<MachineBasicBlock *, MachineBasicBlock *> Map;
+ for (auto *Succ : MBB->successors()) {
+ if (!Indices.count(Succ))
+ continue;
+
+ // Place the split block right before Succ when Succ is MBB's layout
+ // successor, otherwise at the end of the function.
+ MachineBasicBlock *Split = MF.CreateMachineBasicBlock();
+ MF.insert(MBB->isLayoutSuccessor(Succ) ? MachineFunction::iterator(Succ)
+ : MF.end(),
+ Split);
+ MLI.changeLoopFor(Split, Loop);
+
+ // Set the jump table's register of the index of the block we wish to
+ // jump to, and jump to the jump table.
+ BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::CONST_I32),
+ Reg)
+ .addImm(Indices[Succ]);
+ BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::BR))
+ .addMBB(Dispatch);
+ Split->addSuccessor(Dispatch);
+ Map[Succ] = Split;
+ }
+ // Remap the terminator operands and the successor list.
+ for (MachineInstr &Term : MBB->terminators())
+ for (auto &Op : Term.explicit_uses())
+ if (Op.isMBB() && Indices.count(Op.getMBB()))
+ Op.setMBB(Map[Op.getMBB()]);
+ for (auto Rewrite : Map)
+ MBB->replaceSuccessor(Rewrite.first, Rewrite.second);
+ }
+
+ // Create a fake default label, because br_table requires one.
+ // It simply repeats the last block operand already in the table.
+ MIB.addMBB(MIB.getInstr()
+ ->getOperand(MIB.getInstr()->getNumExplicitOperands() - 1)
+ .getMBB());
+
+ return true;
+}
+
+/// Pass entry point: run VisitLoop() on the function body and then on every
+/// loop in the loop nest. If anything changed, liveness is invalidated, the
+/// blocks are renumbered, and the dominator tree and loop info are recomputed
+/// from scratch.
+///
+/// \returns true if the function was changed.
+bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
+    MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  auto &MLI = getAnalysis<MachineLoopInfo>();
+
+  // Visit the function body, which is identified as a null loop.
+  bool Changed = VisitLoop(MF, MLI, nullptr);
+
+  // Visit all the loops. Sub-loops are queued before their parent is
+  // processed.
+  SmallVector<MachineLoop *, 8> Pending(MLI.begin(), MLI.end());
+  while (!Pending.empty()) {
+    MachineLoop *L = Pending.pop_back_val();
+    Pending.append(L->begin(), L->end());
+    if (VisitLoop(MF, MLI, L))
+      Changed = true;
+  }
+
+  if (LLVM_LIKELY(!Changed))
+    return false;
+
+  // We made changes, so completely recompute everything.
+  DEBUG(dbgs() << "Recomputing dominators and loops.\n");
+  MF.getRegInfo().invalidateLiveness();
+  MF.RenumberBlocks();
+  getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
+  MLI.runOnMachineFunction(MF);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
new file mode 100644
index 000000000000..a6a2c0bf06ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -0,0 +1,257 @@
+//===-- WebAssemblyFrameLowering.cpp - WebAssembly Frame Lowering ----------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the WebAssembly implementation of
+/// TargetFrameLowering class.
+///
+/// On WebAssembly, there aren't a lot of things to do here. There are no
+/// callee-saved registers to save, and no spill slots.
+///
+/// The stack grows downward.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyFrameLowering.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyInstrInfo.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-frame-info"
+
+// TODO: wasm64
+// TODO: Emit TargetOpcode::CFI_INSTRUCTION instructions
+
+/// Report whether this function needs a base pointer.
+///
+/// A base pointer is required when items on the stack need stricter alignment
+/// than the stack pointer itself. Forcing that alignment shifts the stack
+/// pointer by an unknown amount, so its value on entry to the function has to
+/// be recorded separately.
+bool WebAssemblyFrameLowering::hasBP(const MachineFunction &MF) const {
+  const auto &ST = MF.getSubtarget<WebAssemblySubtarget>();
+  return ST.getRegisterInfo()->needsStackRealignment(MF);
+}
+
+/// Decide whether the given function gets a dedicated frame pointer register.
+///
+/// A frame pointer is used when the frame address is taken, when the function
+/// has a stack map or patch point, or when it has variable-sized objects and
+/// still needs a fixed reference into its frame.
+bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // These properties call for a frame pointer regardless of frame layout.
+  if (MFI.isFrameAddressTaken() || MFI.hasStackMap() || MFI.hasPatchPoint())
+    return true;
+
+  // Without var-sized objects the stack pointer is not moved by an unknown
+  // amount, so nothing else requires a frame pointer.
+  if (!MFI.hasVarSizedObjects())
+    return false;
+
+  // With var-sized objects we need something to restore the stack to where it
+  // was on function entry. If there is no base pointer, that has to be the
+  // frame pointer.
+  if (!hasBP(MF))
+    return true;
+
+  // A base pointer already fixes up the stack pointer; a frame pointer is then
+  // only useful when there are fixed-size objects to address.
+  return MFI.getStackSize() > 0;
+}
+
+/// Returns true if the call frame is included as part of the stack frame.
+///
+/// Normally, when no frame pointer is required, argument space for the call
+/// sites in a function is reserved immediately on entry, which removes the
+/// need for add/sub sp brackets around call sites. That is only done when the
+/// function has no variable-sized objects.
+bool WebAssemblyFrameLowering::hasReservedCallFrame(
+    const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const bool HasVarSized = MFI.hasVarSizedObjects();
+  return !HasVarSized;
+}
+
+
+/// Returns true if this function needs a local user-space stack pointer.
+///
+/// The wasm user stack pointer is a global variable rather than a machine
+/// register, so a function that uses it loads it into a register in its
+/// prolog. That is needed when the function has a non-empty stack frame,
+/// adjusts the stack, or has a frame pointer.
+bool WebAssemblyFrameLowering::needsSP(const MachineFunction &MF,
+                                       const MachineFrameInfo &MFI) const {
+  if (MFI.getStackSize() != 0 || MFI.adjustsStack())
+    return true;
+  return hasFP(MF);
+}
+
+/// Returns true if this function must write its local user-space stack pointer
+/// back to memory. Only meaningful when needsSP is true (this is asserted).
+///
+/// When this returns false the function can live in the stack red zone and
+/// only a local SP is needed.
+bool WebAssemblyFrameLowering::needsSPWriteback(
+    const MachineFunction &MF, const MachineFrameInfo &MFI) const {
+  assert(needsSP(MF, MFI));
+
+  // The red zone was explicitly disabled for this function.
+  if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone))
+    return true;
+
+  // The function makes calls.
+  if (MFI.hasCalls())
+    return true;
+
+  // Otherwise a writeback is needed only if the frame outgrows the red zone.
+  return MFI.getStackSize() > RedZoneSize;
+}
+
+/// Emit code that stores the value of \p SrcReg to the "__stack_pointer"
+/// external symbol.
+///
+/// Two instructions are emitted into \p MBB: a CONST_I32 that materializes a
+/// zero address register at \p InsertAddr, and a STORE_I32 at \p InsertStore
+/// that stores \p SrcReg using that register plus the symbol as the address.
+/// Both instructions use \p DL as their debug location.
+static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &InsertAddr,
+ MachineBasicBlock::iterator &InsertStore,
+ const DebugLoc &DL) {
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ // Fresh virtual register that will hold the constant 0 used as the address
+ // register operand of the store.
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ // Memory operand describing the store to the stack pointer symbol.
+ // NOTE(review): the trailing "4, 4" are presumably size and alignment in
+ // bytes -- confirm against MachineFunction::getMachineMemOperand.
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+ MachineMemOperand::MOStore, 4, 4);
+ BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
+ .addImm(2) // p2align
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero)
+ .addReg(SrcReg)
+ .addMemOperand(MMO);
+}
+
+/// Remove the call-frame pseudo instruction at \p I from \p MBB and return
+/// the iterator produced by erasing it.
+///
+/// If the pseudo is the call-frame-destroy opcode and this function has to
+/// write its stack pointer back, a store of SP32 to memory is emitted right
+/// before the pseudo is erased.
+MachineBasicBlock::iterator
+WebAssemblyFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  assert(!I->getOperand(0).getImm() && (hasFP(MF) || hasBP(MF)) &&
+         "Call frame pseudos should only be used for dynamic stack adjustment");
+
+  const auto &ST = MF.getSubtarget<WebAssemblySubtarget>();
+  const auto *TII = ST.getInstrInfo();
+
+  const bool IsFrameDestroy =
+      I->getOpcode() == TII->getCallFrameDestroyOpcode();
+  if (IsFrameDestroy && needsSPWriteback(MF, MF.getFrameInfo())) {
+    // Take a copy of the debug location before new instructions are inserted
+    // around the pseudo.
+    DebugLoc Loc = I->getDebugLoc();
+    writeSPToMemory(WebAssembly::SP32, MF, MBB, I, I, Loc);
+  }
+
+  return MBB.erase(I);
+}
+
+/// Insert prolog code at the beginning of \p MBB.
+///
+/// Does nothing when the function does not need a stack pointer. Otherwise it
+/// loads the "__stack_pointer" symbol, and then, as applicable: records the
+/// loaded value as the base pointer, subtracts the frame size, masks the
+/// result down to the maximum alignment, copies it to FP32, and writes the
+/// updated stack pointer back to memory.
+void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ // TODO: Do ".setMIFlag(MachineInstr::FrameSetup)" on emitted instructions
+ auto &MFI = MF.getFrameInfo();
+ assert(MFI.getCalleeSavedInfo().empty() &&
+ "WebAssembly should not have callee-saved registers");
+
+ if (!needsSP(MF, MFI)) return;
+ uint64_t StackSize = MFI.getStackSize();
+
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &MRI = MF.getRegInfo();
+
+ auto InsertPt = MBB.begin();
+ DebugLoc DL;
+
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
+ // With an empty frame the loaded value goes straight into SP32. Otherwise it
+ // is loaded into a temporary and SP32 is defined by the subtraction below.
+ unsigned SPReg = WebAssembly::SP32;
+ if (StackSize)
+ SPReg = MRI.createVirtualRegister(PtrRC);
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
+ MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+ MachineMemOperand::MOLoad, 4, 4);
+ // Load the SP value.
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
+ .addImm(2) // p2align
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero) // addr
+ .addMemOperand(LoadMMO);
+
+ bool HasBP = hasBP(MF);
+ if (HasBP) {
+ // Keep the stack pointer value from function entry in a dedicated virtual
+ // register and record it in the function info; the epilog reads it back.
+ auto FI = MF.getInfo<WebAssemblyFunctionInfo>();
+ unsigned BasePtr = MRI.createVirtualRegister(PtrRC);
+ FI->setBasePointerVreg(BasePtr);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY), BasePtr)
+ .addReg(SPReg);
+ }
+ if (StackSize) {
+ // Subtract the frame size
+ unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addImm(StackSize);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::SUB_I32),
+ WebAssembly::SP32)
+ .addReg(SPReg)
+ .addReg(OffsetReg);
+ }
+ if (HasBP) {
+ // Realign: clear the low bits of SP32 with the mask ~(Alignment - 1).
+ unsigned BitmaskReg = MRI.createVirtualRegister(PtrRC);
+ unsigned Alignment = MFI.getMaxAlignment();
+ assert((1u << countTrailingZeros(Alignment)) == Alignment &&
+ "Alignment must be a power of 2");
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), BitmaskReg)
+ .addImm((int)~(Alignment - 1));
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::AND_I32),
+ WebAssembly::SP32)
+ .addReg(WebAssembly::SP32)
+ .addReg(BitmaskReg);
+ }
+ if (hasFP(MF)) {
+ // Unlike most conventional targets (where FP points to the saved FP),
+ // FP points to the bottom of the fixed-size locals, so we can use positive
+ // offsets in load/store instructions.
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY),
+ WebAssembly::FP32)
+ .addReg(WebAssembly::SP32);
+ }
+ // Publish the new stack pointer to memory, but only if the frame is
+ // non-empty and this function is required to write SP back.
+ if (StackSize && needsSPWriteback(MF, MFI)) {
+ writeSPToMemory(WebAssembly::SP32, MF, MBB, InsertPt, InsertPt, DL);
+ }
+}
+
+/// Insert epilog code before the first terminator of \p MBB.
+///
+/// Does nothing unless the function both needs a stack pointer and has to
+/// write it back. Otherwise it computes the stack pointer value to restore
+/// and stores it to memory via writeSPToMemory.
+void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ auto &MFI = MF.getFrameInfo();
+ uint64_t StackSize = MFI.getStackSize();
+ if (!needsSP(MF, MFI) || !needsSPWriteback(MF, MFI)) return;
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &MRI = MF.getRegInfo();
+ auto InsertPt = MBB.getFirstTerminator();
+ DebugLoc DL;
+
+ // Reuse the terminator's debug location when the block has a terminator.
+ if (InsertPt != MBB.end())
+ DL = InsertPt->getDebugLoc();
+
+ // Restore the stack pointer. If we had fixed-size locals, add the offset
+ // subtracted in the prolog.
+ unsigned SPReg = 0;
+ MachineBasicBlock::iterator InsertAddr = InsertPt;
+ if (hasBP(MF)) {
+ // The base pointer vreg was recorded by the prolog; store it back as is.
+ auto FI = MF.getInfo<WebAssemblyFunctionInfo>();
+ SPReg = FI->getBasePointerVreg();
+ } else if (StackSize) {
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+ // InsertAddr is pointed at this CONST_I32, so writeSPToMemory emits its
+ // address constant at that position rather than at InsertPt.
+ InsertAddr =
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addImm(StackSize);
+ // In the epilog we don't need to write the result back to the SP32 physreg
+ // because it won't be used again. We can use a stackified register instead.
+ SPReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), SPReg)
+ .addReg(hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32)
+ .addReg(OffsetReg);
+ } else {
+ // Empty frame: the frame pointer (if any) or SP32 is stored back directly.
+ SPReg = hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32;
+ }
+
+ writeSPToMemory(SPReg, MF, MBB, InsertAddr, InsertPt, DL);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
new file mode 100644
index 000000000000..bf326fce88fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -0,0 +1,57 @@
+// WebAssemblyFrameLowering.h - TargetFrameLowering for WebAssembly -*- C++ -*-/
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This class implements WebAssembly-specific bits of
+/// TargetFrameLowering class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class MachineFrameInfo;
+
+/// WebAssembly implementation of TargetFrameLowering. The stack is configured
+/// to grow down, with 16-byte stack and transient-stack alignment, a local
+/// area offset of 0, and stack realignment enabled.
+class WebAssemblyFrameLowering final : public TargetFrameLowering {
+ public:
+ /// Size of the red zone for the user stack (leaf functions can use this much
+ /// space below the stack pointer without writing it back to memory).
+ // TODO: (ABI) Revisit and decide how large it should be.
+ static const size_t RedZoneSize = 128;
+
+ WebAssemblyFrameLowering()
+ : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/16,
+ /*LocalAreaOffset=*/0,
+ /*TransientStackAlignment=*/16,
+ /*StackRealignable=*/true) {}
+
+ /// Eliminates the call-frame pseudo instruction at \p I and returns an
+ /// iterator into \p MBB.
+ MachineBasicBlock::iterator eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+ /// These methods insert prolog and epilog code into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ /// Whether the function should have a dedicated frame pointer register.
+ bool hasFP(const MachineFunction &MF) const override;
+ /// Whether the call frame is included as part of the stack frame.
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+ private:
+ /// Whether the function needs a base pointer.
+ bool hasBP(const MachineFunction &MF) const;
+ /// Whether the function needs a local user-space stack pointer.
+ bool needsSP(const MachineFunction &MF, const MachineFrameInfo &MFI) const;
+ /// Whether the local stack pointer must be written back to memory.
+ bool needsSPWriteback(const MachineFunction &MF,
+ const MachineFrameInfo &MFI) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
new file mode 100644
index 000000000000..2f0f106ef5b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -0,0 +1,25 @@
+//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file describes the various WebAssembly ISD node types.
+///
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+// Each entry below is expanded by whatever definition of HANDLE_NODETYPE the
+// including file has in effect at the point of inclusion.
+HANDLE_NODETYPE(CALL1)
+HANDLE_NODETYPE(CALL0)
+HANDLE_NODETYPE(RETURN)
+HANDLE_NODETYPE(ARGUMENT)
+HANDLE_NODETYPE(Wrapper)
+HANDLE_NODETYPE(BR_IF)
+HANDLE_NODETYPE(BR_TABLE)
+
+// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
new file mode 100644
index 000000000000..a67137f867e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -0,0 +1,118 @@
+//- WebAssemblyISelDAGToDAG.cpp - A dag to dag inst selector for WebAssembly -//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines an instruction selector for the WebAssembly target.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h" // To access function attributes.
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-isel"
+
+//===--------------------------------------------------------------------===//
+/// WebAssembly-specific code to select WebAssembly machine instructions for
+/// SelectionDAG operations.
+///
+namespace {
+/// SelectionDAG instruction selector for WebAssembly. Custom handling lives in
+/// Select(); everything else comes from the autogenerated matcher included
+/// below.
+class WebAssemblyDAGToDAGISel final : public SelectionDAGISel {
+ /// Keep a pointer to the WebAssemblySubtarget around so that we can make the
+ /// right decision when generating code for different targets.
+ const WebAssemblySubtarget *Subtarget;
+
+ /// True when the current function has the OptimizeForSize or MinSize
+ /// attribute; refreshed for every function in runOnMachineFunction.
+ bool ForCodeSize;
+
+public:
+ WebAssemblyDAGToDAGISel(WebAssemblyTargetMachine &tm,
+ CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), ForCodeSize(false) {
+ }
+
+ StringRef getPassName() const override {
+ return "WebAssembly Instruction Selection";
+ }
+
+ /// Caches the per-function state (ForCodeSize, Subtarget) and then defers to
+ /// the generic SelectionDAGISel implementation.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ ForCodeSize =
+ MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) ||
+ MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+ Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
+ void Select(SDNode *Node) override;
+
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+// Include the pieces autogenerated from the target description.
+#include "WebAssemblyGenDAGISel.inc"
+
+private:
+ // add select functions here...
+};
+} // end anonymous namespace
+
+/// Select a machine instruction for \p Node. Nodes that already carry a
+/// machine opcode are only marked as selected; all other nodes are handed to
+/// the autogenerated SelectCode matcher.
+void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
+ // Dump information about the Node being selected.
+ DEBUG(errs() << "Selecting: ");
+ DEBUG(Node->dump(CurDAG));
+ DEBUG(errs() << "\n");
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ Node->setNodeId(-1);
+ return;
+ }
+
+ // Placeholder for custom selection; currently no opcode is special-cased.
+ EVT VT = Node->getValueType(0);
+
+ switch (Node->getOpcode()) {
+ default:
+ break;
+ // If we need WebAssembly-specific selection, it would go here.
+ // The cast below is the only reference to VT until custom cases are added.
+ (void)VT;
+ }
+
+ // Select the default instruction.
+ SelectCode(Node);
+}
+
+/// Select the operands for an inline-asm memory constraint.
+///
+/// For the 'i' and 'm' constraints \p Op is appended to \p OutOps unchanged
+/// and false is returned. Every other constraint returns true and leaves
+/// \p OutOps untouched.
+bool WebAssemblyDAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+  const bool IsSimpleMemory = ConstraintID == InlineAsm::Constraint_i ||
+                              ConstraintID == InlineAsm::Constraint_m;
+  if (!IsSimpleMemory)
+    return true;
+
+  // We just support simple memory operands that just have a single address
+  // operand and need no special handling.
+  OutOps.push_back(Op);
+  return false;
+}
+
+/// This pass converts a legalized DAG into a WebAssembly-specific DAG, ready
+/// for instruction scheduling.
+///
+/// Returns a newly allocated WebAssemblyDAGToDAGISel constructed from \p TM
+/// and \p OptLevel.
+FunctionPass *llvm::createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new WebAssemblyDAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
new file mode 100644
index 000000000000..6a7f75a6b3a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -0,0 +1,707 @@
+//=- WebAssemblyISelLowering.cpp - WebAssembly DAG Lowering Implementation -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblyTargetLowering class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyISelLowering.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-lower"
+
+/// Configure target lowering for WebAssembly: register classes, boolean and
+/// scheduling preferences, and the legalization action (Legal / Custom /
+/// Expand / Promote) for each operation and type the backend cares about.
+WebAssemblyTargetLowering::WebAssemblyTargetLowering(
+ const TargetMachine &TM, const WebAssemblySubtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
+ // Pointer-sized integer type: i64 with 64-bit addressing, i32 otherwise.
+ auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32;
+
+ // Booleans always contain 0 or 1.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ // WebAssembly does not produce floating-point exceptions on normal floating
+ // point operations.
+ setHasFloatingPointExceptions(false);
+ // We don't know the microarchitecture here, so just reduce register pressure.
+ setSchedulingPreference(Sched::RegPressure);
+ // Tell ISel that we have a stack pointer.
+ setStackPointerRegisterToSaveRestore(
+ Subtarget->hasAddr64() ? WebAssembly::SP64 : WebAssembly::SP32);
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &WebAssembly::I32RegClass);
+ addRegisterClass(MVT::i64, &WebAssembly::I64RegClass);
+ addRegisterClass(MVT::f32, &WebAssembly::F32RegClass);
+ addRegisterClass(MVT::f64, &WebAssembly::F64RegClass);
+ // The 128-bit vector types are only registered when SIMD128 is available.
+ if (Subtarget->hasSIMD128()) {
+ addRegisterClass(MVT::v16i8, &WebAssembly::V128RegClass);
+ addRegisterClass(MVT::v8i16, &WebAssembly::V128RegClass);
+ addRegisterClass(MVT::v4i32, &WebAssembly::V128RegClass);
+ addRegisterClass(MVT::v4f32, &WebAssembly::V128RegClass);
+ }
+ // Compute derived properties from the register classes.
+ computeRegisterProperties(Subtarget->getRegisterInfo());
+
+ // Address-like nodes and indirect branches get custom lowering.
+ setOperationAction(ISD::GlobalAddress, MVTPtr, Custom);
+ setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom);
+ setOperationAction(ISD::JumpTable, MVTPtr, Custom);
+ setOperationAction(ISD::BlockAddress, MVTPtr, Custom);
+ setOperationAction(ISD::BRIND, MVT::Other, Custom);
+
+ // Take the default expansion for va_arg, va_copy, and va_end. There is no
+ // default action for va_start, so we do that custom.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+ for (auto T : {MVT::f32, MVT::f64}) {
+ // Don't expand the floating-point types to constant pools.
+ setOperationAction(ISD::ConstantFP, T, Legal);
+ // Expand floating-point comparisons.
+ for (auto CC : {ISD::SETO, ISD::SETUO, ISD::SETUEQ, ISD::SETONE,
+ ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE})
+ setCondCodeAction(CC, T, Expand);
+ // Expand floating-point library function operators.
+ for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW,
+ ISD::FREM, ISD::FMA})
+ setOperationAction(Op, T, Expand);
+ // Note supported floating-point library function operators that otherwise
+ // default to expand.
+ for (auto Op :
+ {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT})
+ setOperationAction(Op, T, Legal);
+ // Support minnan and maxnan, which otherwise default to expand.
+ setOperationAction(ISD::FMINNAN, T, Legal);
+ setOperationAction(ISD::FMAXNAN, T, Legal);
+ }
+
+ for (auto T : {MVT::i32, MVT::i64}) {
+ // Expand unavailable integer operations.
+ for (auto Op :
+ {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ ISD::MULHS, ISD::MULHU, ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS,
+ ISD::SRA_PARTS, ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC,
+ ISD::SUBE}) {
+ setOperationAction(Op, T, Expand);
+ }
+ }
+
+ // As a special case, these operators use the type to mean the type to
+ // sign-extend from.
+ for (auto T : {MVT::i1, MVT::i8, MVT::i16, MVT::i32})
+ setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
+
+ // Dynamic stack allocation: use the default expansion.
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVTPtr, Expand);
+
+ // NOTE(review): FrameIndex is registered for MVT::i32 rather than MVTPtr,
+ // unlike the other address-like nodes above -- confirm this is intended for
+ // 64-bit addressing.
+ setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ setOperationAction(ISD::CopyToReg, MVT::Other, Custom);
+
+ // Expand these forms; we pattern-match the forms that we can handle in isel.
+ for (auto T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
+ for (auto Op : {ISD::BR_CC, ISD::SELECT_CC})
+ setOperationAction(Op, T, Expand);
+
+ // We have custom switch handling.
+ setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+ // WebAssembly doesn't have:
+ // - Floating-point extending loads.
+ // - Floating-point truncating stores.
+ // - i1 extending loads.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ for (auto T : MVT::integer_valuetypes())
+ for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD})
+ setLoadExtAction(Ext, T, MVT::i1, Promote);
+
+ // Trap lowers to wasm unreachable
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+}
+
+/// Create the FastISel instance for this target by forwarding \p FuncInfo and
+/// \p LibInfo to WebAssembly::createFastISel.
+FastISel *WebAssemblyTargetLowering::createFastISel(
+ FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const {
+ return WebAssembly::createFastISel(FuncInfo, LibInfo);
+}
+
+/// Report whether an offset may be folded into a global address node. The
+/// argument is ignored: folding is always allowed.
+bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
+ const GlobalAddressSDNode * /*GA*/) const {
+ // All offsets can be folded.
+ return true;
+}
+
+/// Pick the integer type used for the shift amount when shifting a value of
+/// type \p VT. The data layout argument is unused.
+MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/,
+                                                      EVT VT) const {
+  // Candidate width derived from the size of the shifted value.
+  unsigned Bits = NextPowerOf2(VT.getSizeInBits() - 1);
+
+  if (Bits > 64) {
+    // The shift will be lowered to a libcall, and compiler-rt libcalls expect
+    // the count to be an i32.
+    Bits = 32;
+    assert(Bits >= Log2_32_Ceil(VT.getSizeInBits()) &&
+           "32-bit shift counts ought to be enough for anyone");
+  } else if (Bits > 1 && Bits < 8) {
+    // Widths strictly between 1 and 8 bits are widened to 8 bits.
+    Bits = 8;
+  }
+
+  MVT ShiftTy = MVT::getIntegerVT(Bits);
+  assert(ShiftTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&
+         "Unable to represent scalar shift amount type");
+  return ShiftTy;
+}
+
+/// Return the printable name ("WebAssemblyISD::<NODE>") of the target-specific
+/// node \p Opcode, or nullptr when \p Opcode is FIRST_NUMBER or matches no
+/// entry of WebAssemblyISD.def.
+const char *WebAssemblyTargetLowering::getTargetNodeName(
+ unsigned Opcode) const {
+ switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) {
+ case WebAssemblyISD::FIRST_NUMBER:
+ break;
+// One "case ...: return <name>;" is generated per entry of the .def file.
+#define HANDLE_NODETYPE(NODE) \
+ case WebAssemblyISD::NODE: \
+ return "WebAssemblyISD::" #NODE;
+#include "WebAssemblyISD.def"
+#undef HANDLE_NODETYPE
+ }
+ return nullptr;
+}
+
+/// Map an inline-asm register constraint to a register class.
+///
+/// Only the single-letter 'r' constraint is handled here: 128-bit vectors map
+/// to V128 when SIMD128 is available, scalar integers of at most 32 bits map
+/// to I32, and scalar integers of at most 64 bits map to I64. Everything else
+/// is delegated to the generic TargetLowering implementation.
+std::pair<unsigned, const TargetRegisterClass *>
+WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  // First, see if this is a constraint that directly corresponds to a
+  // WebAssembly register class.
+  if (Constraint.size() == 1 && Constraint[0] == 'r') {
+    assert(VT != MVT::iPTR && "Pointer MVT not expected here");
+    const bool IsVector = VT.isVector();
+
+    if (IsVector && Subtarget->hasSIMD128() && VT.getSizeInBits() == 128)
+      return std::make_pair(0U, &WebAssembly::V128RegClass);
+
+    if (!IsVector && VT.isInteger()) {
+      const unsigned Bits = VT.getSizeInBits();
+      if (Bits <= 32)
+        return std::make_pair(0U, &WebAssembly::I32RegClass);
+      if (Bits <= 64)
+        return std::make_pair(0U, &WebAssembly::I64RegClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// Always reports that count-trailing-zeros is cheap to speculate.
+bool WebAssemblyTargetLowering::isCheapToSpeculateCttz() const {
+ // Assume ctz is a relatively cheap operation.
+ return true;
+}
+
+/// Always reports that count-leading-zeros is cheap to speculate.
+bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz() const {
+ // Assume clz is a relatively cheap operation.
+ return true;
+}
+
+/// Report whether \p AM is a legal addressing mode. Only the base offset and
+/// the scale of \p AM are inspected; the other parameters are unused.
+bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                      const AddrMode &AM,
+                                                      Type *Ty,
+                                                      unsigned AS) const {
+  // WebAssembly offsets are added as unsigned without wrapping. The
+  // isLegalAddressingMode gives us no way to determine if wrapping could be
+  // happening, so we approximate this by accepting only non-negative offsets.
+  const bool OffsetIsNonNegative = AM.BaseOffs >= 0;
+
+  // WebAssembly has no scale register operands.
+  const bool HasNoScale = AM.Scale == 0;
+
+  // Everything else is legal.
+  return OffsetIsNonNegative && HasNoScale;
+}
+
+/// Report that misaligned memory accesses are always allowed. The type,
+/// address space and alignment arguments are ignored. When \p Fast is
+/// non-null, it is set to true.
+bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses(
+ EVT /*VT*/, unsigned /*AddrSpace*/, unsigned /*Align*/, bool *Fast) const {
+ // WebAssembly supports unaligned accesses, though it should be declared
+ // with the p2align attribute on loads and stores which do so, and there
+ // may be a performance impact. We tell LLVM they're "fast" because
+ // for the kinds of things that LLVM uses this for (merging adjacent stores
+ // of constants, etc.), WebAssembly implementations will either want the
+ // unaligned access or they'll split anyway.
+ if (Fast) *Fast = true;
+ return true;
+}
+
+/// Always reports integer division as cheap; both arguments are unused.
+bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+ // The current thinking is that wasm engines will perform this optimization,
+ // so we can save on code size.
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Lowering private implementation.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// Report an "unsupported" diagnostic carrying \p msg for the function being
+/// compiled, located at \p DL, through the DAG's LLVMContext. This function
+/// returns normally after reporting; callers in this file carry on lowering.
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *msg) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ // The diagnostic object is built as a temporary inside the diagnose() call.
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(*MF.getFunction(), msg, DL.getDebugLoc()));
+}
+
+// Returns true if the given calling convention is one this backend accepts.
+static bool CallingConvSupported(CallingConv::ID CallConv) {
+  // We currently support the language-independent target-independent
+  // conventions. We don't yet have a way to annotate calls with properties like
+  // "cold", and we don't have any call-clobbered registers, so these are mostly
+  // all handled the same.
+  switch (CallConv) {
+  case CallingConv::C:
+  case CallingConv::Fast:
+  case CallingConv::Cold:
+  case CallingConv::PreserveMost:
+  case CallingConv::PreserveAll:
+  case CallingConv::CXX_FAST_TLS:
+    return true;
+  default:
+    return false;
+  }
+}
+
+SDValue WebAssemblyTargetLowering::LowerCall(
+ CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc DL = CLI.DL;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto Layout = MF.getDataLayout();
+
+ CallingConv::ID CallConv = CLI.CallConv;
+ if (!CallingConvSupported(CallConv))
+ fail(DL, DAG,
+ "WebAssembly doesn't support language-specific or target-specific "
+ "calling conventions yet");
+ if (CLI.IsPatchPoint)
+ fail(DL, DAG, "WebAssembly doesn't support patch point yet");
+
+ // WebAssembly doesn't currently support explicit tail calls. If they are
+ // required, fail. Otherwise, just disable them.
+ if ((CallConv == CallingConv::Fast && CLI.IsTailCall &&
+ MF.getTarget().Options.GuaranteedTailCallOpt) ||
+ (CLI.CS && CLI.CS->isMustTailCall()))
+ fail(DL, DAG, "WebAssembly doesn't support tail call yet");
+ CLI.IsTailCall = false;
+
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ if (Ins.size() > 1)
+ fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet");
+
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ for (unsigned i = 0; i < Outs.size(); ++i) {
+ const ISD::OutputArg &Out = Outs[i];
+ SDValue &OutVal = OutVals[i];
+ if (Out.Flags.isNest())
+ fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
+ if (Out.Flags.isInAlloca())
+ fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments");
+ if (Out.Flags.isInConsecutiveRegs())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments");
+ if (Out.Flags.isInConsecutiveRegsLast())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments");
+ if (Out.Flags.isByVal() && Out.Flags.getByValSize() != 0) {
+ auto &MFI = MF.getFrameInfo();
+ int FI = MFI.CreateStackObject(Out.Flags.getByValSize(),
+ Out.Flags.getByValAlign(),
+ /*isSS=*/false);
+ SDValue SizeNode =
+ DAG.getConstant(Out.Flags.getByValSize(), DL, MVT::i32);
+ SDValue FINode = DAG.getFrameIndex(FI, getPointerTy(Layout));
+ Chain = DAG.getMemcpy(
+ Chain, DL, FINode, OutVal, SizeNode, Out.Flags.getByValAlign(),
+ /*isVolatile*/ false, /*AlwaysInline=*/false,
+ /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
+ OutVal = FINode;
+ }
+ }
+
+ bool IsVarArg = CLI.IsVarArg;
+ unsigned NumFixedArgs = CLI.NumFixedArgs;
+
+ auto PtrVT = getPointerTy(Layout);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+ if (IsVarArg) {
+ // Outgoing non-fixed arguments are placed in a buffer. First
+ // compute their offsets and the total amount of buffer space needed.
+ for (SDValue Arg :
+ make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+ EVT VT = Arg.getValueType();
+ assert(VT != MVT::iPTR && "Legalized args should be concrete");
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+ unsigned Offset = CCInfo.AllocateStack(Layout.getTypeAllocSize(Ty),
+ Layout.getABITypeAlignment(Ty));
+ CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(),
+ Offset, VT.getSimpleVT(),
+ CCValAssign::Full));
+ }
+ }
+
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+
+ SDValue FINode;
+ if (IsVarArg && NumBytes) {
+ // For non-fixed arguments, next emit stores to store the argument values
+ // to the stack buffer at the offsets computed above.
+ int FI = MF.getFrameInfo().CreateStackObject(NumBytes,
+ Layout.getStackAlignment(),
+ /*isSS=*/false);
+ unsigned ValNo = 0;
+ SmallVector<SDValue, 8> Chains;
+ for (SDValue Arg :
+ make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+ assert(ArgLocs[ValNo].getValNo() == ValNo &&
+ "ArgLocs should remain in order and only hold varargs args");
+ unsigned Offset = ArgLocs[ValNo++].getLocMemOffset();
+ FINode = DAG.getFrameIndex(FI, getPointerTy(Layout));
+ SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, FINode,
+ DAG.getConstant(Offset, DL, PtrVT));
+ Chains.push_back(DAG.getStore(
+ Chain, DL, Arg, Add,
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), 0));
+ }
+ if (!Chains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ } else if (IsVarArg) {
+ FINode = DAG.getIntPtrConstant(0, DL);
+ }
+
+ // Compute the operands for the CALLn node.
+ SmallVector<SDValue, 16> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add all fixed arguments. Note that for non-varargs calls, NumFixedArgs
+ // isn't reliable.
+ Ops.append(OutVals.begin(),
+ IsVarArg ? OutVals.begin() + NumFixedArgs : OutVals.end());
+ // Add a pointer to the vararg buffer.
+ if (IsVarArg) Ops.push_back(FINode);
+
+ SmallVector<EVT, 8> InTys;
+ for (const auto &In : Ins) {
+ assert(!In.Flags.isByVal() && "byval is not valid for return values");
+ assert(!In.Flags.isNest() && "nest is not valid for return values");
+ if (In.Flags.isInAlloca())
+ fail(DL, DAG, "WebAssembly hasn't implemented inalloca return values");
+ if (In.Flags.isInConsecutiveRegs())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs return values");
+ if (In.Flags.isInConsecutiveRegsLast())
+ fail(DL, DAG,
+ "WebAssembly hasn't implemented cons regs last return values");
+ // Ignore In.getOrigAlign() because all our arguments are passed in
+ // registers.
+ InTys.push_back(In.VT);
+ }
+ InTys.push_back(MVT::Other);
+ SDVTList InTyList = DAG.getVTList(InTys);
+ SDValue Res =
+ DAG.getNode(Ins.empty() ? WebAssemblyISD::CALL0 : WebAssemblyISD::CALL1,
+ DL, InTyList, Ops);
+ if (Ins.empty()) {
+ Chain = Res;
+ } else {
+ InVals.push_back(Res);
+ Chain = Res.getValue(1);
+ }
+
+ return Chain;
+}
+
+bool WebAssemblyTargetLowering::CanLowerReturn(
+ CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext & /*Context*/) const { // Only Outs is inspected.
+ // WebAssembly can't currently handle returning tuples.
+ return Outs.size() <= 1; // i.e. accept only zero or one returned value.
+}
+
+SDValue WebAssemblyTargetLowering::LowerReturn(
+ SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const {
+ assert(Outs.size() <= 1 && "WebAssembly can only return up to one value");
+ if (!CallingConvSupported(CallConv))
+ fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
+
+ SmallVector<SDValue, 4> RetOps(1, Chain); // Operand 0 is the incoming chain.
+ RetOps.append(OutVals.begin(), OutVals.end()); // Then the returned values.
+ Chain = DAG.getNode(WebAssemblyISD::RETURN, DL, MVT::Other, RetOps);
+
+ // Check the flags on each return value and diagnose the unimplemented ones.
+ for (const ISD::OutputArg &Out : Outs) {
+ assert(!Out.Flags.isByVal() && "byval is not valid for return values");
+ assert(!Out.Flags.isNest() && "nest is not valid for return values");
+ assert(Out.IsFixed && "non-fixed return value is not valid");
+ if (Out.Flags.isInAlloca())
+ fail(DL, DAG, "WebAssembly hasn't implemented inalloca results");
+ if (Out.Flags.isInConsecutiveRegs())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs results");
+ if (Out.Flags.isInConsecutiveRegsLast())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs last results");
+ }
+
+ return Chain; // The chain produced by the RETURN node.
+}
+
+SDValue WebAssemblyTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ if (!CallingConvSupported(CallConv))
+ fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto *MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+
+ // Set up the incoming ARGUMENTS value, which serves to represent the liveness
+ // of the incoming values before they're represented by virtual registers.
+ MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS);
+
+ for (const ISD::InputArg &In : Ins) {
+ if (In.Flags.isInAlloca())
+ fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments");
+ if (In.Flags.isNest())
+ fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
+ if (In.Flags.isInConsecutiveRegs())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments");
+ if (In.Flags.isInConsecutiveRegsLast())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments");
+ // Ignore In.getOrigAlign() because all our arguments are passed in
+ // registers.
+ InVals.push_back(
+ In.Used // Unused arguments get UNDEF instead of an ARGUMENT node.
+ ? DAG.getNode(WebAssemblyISD::ARGUMENT, DL, In.VT,
+ DAG.getTargetConstant(InVals.size(), DL, MVT::i32))
+ : DAG.getUNDEF(In.VT));
+
+ // Record the number and types of arguments.
+ MFI->addParam(In.VT);
+ }
+
+ // Varargs are copied into a buffer allocated by the caller, and a pointer to
+ // the buffer is passed as an argument.
+ if (IsVarArg) {
+ MVT PtrVT = getPointerTy(MF.getDataLayout());
+ unsigned VarargVreg =
+ MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrVT));
+ MFI->setVarargBufferVreg(VarargVreg); // Read back by LowerVASTART.
+ Chain = DAG.getCopyToReg(
+ Chain, DL, VarargVreg,
+ DAG.getNode(WebAssemblyISD::ARGUMENT, DL, PtrVT,
+ DAG.getTargetConstant(Ins.size(), DL, MVT::i32))); // Index after Ins.
+ MFI->addParam(PtrVT); // The buffer pointer is recorded as a parameter too.
+ }
+
+ // Record the number and types of results.
+ SmallVector<MVT, 4> Params; // Filled by ComputeSignatureVTs; not read here.
+ SmallVector<MVT, 4> Results;
+ ComputeSignatureVTs(*MF.getFunction(), DAG.getTarget(), Params, Results);
+ for (MVT VT : Results)
+ MFI->addResult(VT);
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom lowering hooks.
+//===----------------------------------------------------------------------===//
+
+SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ switch (Op.getOpcode()) { // Dispatch each opcode to its Lower* helper.
+ default:
+ llvm_unreachable("unimplemented operation lowering");
+ return SDValue();
+ case ISD::FrameIndex:
+ return LowerFrameIndex(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::ExternalSymbol:
+ return LowerExternalSymbol(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::BR_JT:
+ return LowerBR_JT(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::BlockAddress: // Shares the computed-goto diagnostic with BRIND.
+ case ISD::BRIND:
+ fail(DL, DAG, "WebAssembly hasn't implemented computed gotos");
+ return SDValue();
+ case ISD::RETURNADDR: // Probably nothing meaningful can be returned here.
+ fail(DL, DAG, "WebAssembly hasn't implemented __builtin_return_address");
+ return SDValue();
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ case ISD::CopyToReg:
+ return LowerCopyToReg(Op, DAG);
+ }
+}
+
+SDValue WebAssemblyTargetLowering::LowerCopyToReg(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(2); // Operands: 0 = chain, 1 = register, 2 = source.
+ if (isa<FrameIndexSDNode>(Src.getNode())) {
+ // CopyToReg nodes don't support FrameIndex operands. Other targets select
+ // the FI to some LEA-like instruction, but since we don't have that, we
+ // need to insert some kind of instruction that can take an FI operand and
+ // produces a value usable by CopyToReg (i.e. in a vreg). So insert a dummy
+ // copy_local between Op and its FI operand.
+ SDValue Chain = Op.getOperand(0);
+ SDLoc DL(Op);
+ unsigned Reg = cast<RegisterSDNode>(Op.getOperand(1))->getReg();
+ EVT VT = Src.getValueType();
+ SDValue Copy(
+ DAG.getMachineNode(VT == MVT::i32 ? WebAssembly::COPY_I32
+ : WebAssembly::COPY_I64, // Any non-i32 type selects COPY_I64.
+ DL, VT, Src),
+ 0);
+ return Op.getNode()->getNumValues() == 1
+ ? DAG.getCopyToReg(Chain, DL, Reg, Copy)
+ : DAG.getCopyToReg(Chain, DL, Reg, Copy, Op.getNumOperands() == 4
+ ? Op.getOperand(3) // Forward the optional 4th operand (presumably glue).
+ : SDValue());
+ }
+ return SDValue(); // Source is not a FrameIndex: return an empty SDValue.
+}
+
+SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op,
+ SelectionDAG &DAG) const {
+ int FI = cast<FrameIndexSDNode>(Op)->getIndex();
+ return DAG.getTargetFrameIndex(FI, Op.getValueType()); // Same index and type, as a TargetFrameIndex.
+}
+
+SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Non-zero depths are not supported by WebAssembly currently. Use the
+ // legalizer's default expansion, which is to return 0 (what this function is
+ // documented to do).
+ if (Op.getConstantOperandVal(0) > 0) // Operand 0 is the requested depth.
+ return SDValue();
+
+ DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
+ EVT VT = Op.getValueType();
+ unsigned FP =
+ Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction());
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FP, VT); // Depth 0: read the frame register.
+}
+
+SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const auto *GA = cast<GlobalAddressSDNode>(Op);
+ EVT VT = Op.getValueType();
+ assert(GA->getTargetFlags() == 0 &&
+ "Unexpected target flags on generic GlobalAddressSDNode");
+ if (GA->getAddressSpace() != 0) // Diagnosed, but lowering still continues below.
+ fail(DL, DAG, "WebAssembly only expects the 0 address space");
+ return DAG.getNode( // Wrap the target global address (global + offset) in a Wrapper node.
+ WebAssemblyISD::Wrapper, DL, VT,
+ DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset()));
+}
+
+SDValue WebAssemblyTargetLowering::LowerExternalSymbol(
+ SDValue Op, SelectionDAG &DAG) const { // Wraps the symbol in a Wrapper node.
+ SDLoc DL(Op);
+ const auto *ES = cast<ExternalSymbolSDNode>(Op);
+ EVT VT = Op.getValueType();
+ assert(ES->getTargetFlags() == 0 &&
+ "Unexpected target flags on generic ExternalSymbolSDNode");
+ // Set the TargetFlags to 0x1 which indicates that this is a "function"
+ // symbol rather than a data symbol. We do this unconditionally even though
+ // we don't know anything about the symbol other than its name, because all
+ // external symbols used in target-independent SelectionDAG code are for
+ // functions.
+ return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
+ DAG.getTargetExternalSymbol(ES->getSymbol(), VT,
+ /*TargetFlags=*/0x1));
+}
+
+SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) const {
+ // There's no need for a Wrapper node because we always incorporate a jump
+ // table operand into a BR_TABLE instruction, rather than ever
+ // materializing it in a register.
+ const JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ return DAG.getTargetJumpTable(JT->getIndex(), Op.getValueType(), // Same index, type and flags.
+ JT->getTargetFlags());
+}
+
+SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Chain = Op.getOperand(0);
+ const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
+ SDValue Index = Op.getOperand(2);
+ assert(JT->getTargetFlags() == 0 && "WebAssembly doesn't set target flags");
+
+ SmallVector<SDValue, 8> Ops; // BR_TABLE operands: chain, index, targets, default.
+ Ops.push_back(Chain);
+ Ops.push_back(Index);
+
+ MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
+ const auto &MBBs = MJTI->getJumpTables()[JT->getIndex()].MBBs;
+
+ // Add an operand for each case.
+ for (auto MBB : MBBs) Ops.push_back(DAG.getBasicBlock(MBB));
+
+ // TODO: For now, we just pick something arbitrary for a default case.
+ // We really want to sniff out the guard and put in the real default case (and
+ // delete the guard).
+ Ops.push_back(DAG.getBasicBlock(MBBs[0])); // First table entry doubles as the default.
+
+ return DAG.getNode(WebAssemblyISD::BR_TABLE, DL, MVT::Other, Ops);
+}
+
+SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout());
+
+ auto *MFI = DAG.getMachineFunction().getInfo<WebAssemblyFunctionInfo>();
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); // Used for the store's pointer info.
+
+ SDValue ArgN = DAG.getCopyFromReg(DAG.getEntryNode(), DL, // Read the vararg buffer pointer vreg.
+ MFI->getVarargBufferVreg(), PtrVT);
+ return DAG.getStore(Op.getOperand(0), DL, ArgN, Op.getOperand(1), // Store it to the address in operand 1.
+ MachinePointerInfo(SV), 0);
+}
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Optimization Hooks
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
new file mode 100644
index 000000000000..5bc723028e63
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -0,0 +1,98 @@
+//- WebAssemblyISelLowering.h - WebAssembly DAG Lowering Interface -*- C++ -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the interfaces that WebAssembly uses to lower LLVM
+/// code into a selection DAG.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYISELLOWERING_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+namespace WebAssemblyISD {
+
+enum NodeType : unsigned { // One enumerator per HANDLE_NODETYPE entry in WebAssemblyISD.def.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END, // Target opcodes start where the generic ISD ones end.
+#define HANDLE_NODETYPE(NODE) NODE,
+#include "WebAssemblyISD.def"
+#undef HANDLE_NODETYPE
+};
+
+} // end namespace WebAssemblyISD
+
+class WebAssemblySubtarget;
+class WebAssemblyTargetMachine;
+
+class WebAssemblyTargetLowering final : public TargetLowering { // WebAssembly's TargetLowering implementation.
+ public:
+ WebAssemblyTargetLowering(const TargetMachine &TM,
+ const WebAssemblySubtarget &STI);
+
+ private:
+ /// Keep a pointer to the WebAssemblySubtarget around so that we can make the
+ /// right decision when generating code for different targets.
+ const WebAssemblySubtarget *Subtarget;
+
+ FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, // TargetLowering overrides (declared private here).
+ const TargetLibraryInfo *LibInfo) const override;
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+ MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
+ const char *getTargetNodeName(unsigned Opcode) const override;
+ std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(
+ const TargetRegisterInfo *TRI, StringRef Constraint,
+ MVT VT) const override;
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+ bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
+ bool *Fast) const override;
+ bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
+ SDValue LowerCall(CallLoweringInfo &CLI, // Call, return and formal-argument lowering.
+ SmallVectorImpl<SDValue> &InVals) const override;
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+ SelectionDAG &DAG) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ // Custom lowering hooks: LowerOperation is the override; the rest are helpers.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
+};
+
+namespace WebAssembly { // Free-function FastISel factory (declaration only in this header).
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+} // end namespace WebAssembly
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
new file mode 100644
index 000000000000..64415658ed81
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -0,0 +1,47 @@
+// WebAssemblyInstrAtomics.td-WebAssembly Atomic codegen support-*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Atomic operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO: Implement atomic instructions.
+
+//===----------------------------------------------------------------------===//
+// Atomic fences
+//===----------------------------------------------------------------------===//
+
+// TODO: add atomic fences here...
+
+//===----------------------------------------------------------------------===//
+// Atomic loads
+//===----------------------------------------------------------------------===//
+
+// TODO: add atomic loads here...
+
+//===----------------------------------------------------------------------===//
+// Atomic stores
+//===----------------------------------------------------------------------===//
+
+// TODO: add atomic stores here...
+
+//===----------------------------------------------------------------------===//
+// Low-level exclusive operations
+//===----------------------------------------------------------------------===//
+
+// TODO: add exclusive operations here...
+
+// Load-exclusives.
+
+// Store-exclusives.
+
+// Store-release-exclusives.
+
+// And clear exclusive.
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
new file mode 100644
index 000000000000..047f4be066c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -0,0 +1,130 @@
+//===- WebAssemblyInstrCall.td-WebAssembly Call codegen support -*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Call operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO: addr64: These currently assume the callee address is 32-bit.
+
+let Defs = [ARGUMENTS] in {
+
+// Call sequence markers. These have an immediate which represents the amount of
+// stack space to allocate or free, which is used for varargs lowering.
+let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
+def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt),
+ [(WebAssemblycallseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
+} // Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1
+
+multiclass CALL<WebAssemblyRegClass vt, string prefix> { // Calls whose result lands in register class vt.
+ def CALL_#vt : I<(outs vt:$dst), (ins function32_op:$callee, variable_ops),
+ [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
+ !strconcat(prefix, "call\t$dst, $callee"),
+ 0x10>; // Direct call: the callee is a function32_op operand.
+ let isCodeGenOnly = 1 in {
+ def PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
+ [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
+ "PSEUDO CALL INDIRECT\t$callee">; // Codegen-only form with the callee in an I32 register.
+ } // isCodeGenOnly = 1
+
+ def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins i32imm:$flags, variable_ops),
+ [],
+ !strconcat(prefix, "call_indirect\t$dst"),
+ 0x11>; // Defined with an empty selection pattern.
+}
+
+multiclass SIMD_CALL<ValueType vt, string prefix> {
+ def CALL_#vt : SIMD_I<(outs V128:$dst), (ins function32_op:$callee, variable_ops),
+ [(set (vt V128:$dst),
+ (WebAssemblycall1 (i32 imm:$callee)))],
+ !strconcat(prefix, "call\t$dst, $callee"),
+ 0x10>;
+ let isCodeGenOnly = 1 in {
+ def PCALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+ (ins I32:$callee, variable_ops),
+ [(set (vt V128:$dst),
+ (WebAssemblycall1 I32:$callee))],
+ "PSEUDO CALL INDIRECT\t$callee">;
+ } // isCodeGenOnly = 1
+
+ def CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+ (ins i32imm:$flags, variable_ops),
+ [],
+ !strconcat(prefix, "call_indirect\t$dst"),
+ 0x11>;
+}
+
+let Uses = [SP32, SP64], isCall = 1 in {
+ defm : CALL<I32, "i32.">;
+ defm : CALL<I64, "i64.">;
+ defm : CALL<F32, "f32.">;
+ defm : CALL<F64, "f64.">;
+ defm : SIMD_CALL<v16i8, "i8x16.">;
+ defm : SIMD_CALL<v8i16, "i16x8.">;
+ defm : SIMD_CALL<v4i32, "i32x4.">;
+ defm : SIMD_CALL<v4f32, "f32x4.">;
+
+ def CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
+ [(WebAssemblycall0 (i32 imm:$callee))],
+ "call \t$callee", 0x10>;
+ let isCodeGenOnly = 1 in {
+ def PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
+ [(WebAssemblycall0 I32:$callee)],
+ "PSEUDO CALL INDIRECT\t$callee">;
+ } // isCodeGenOnly = 1
+
+ def CALL_INDIRECT_VOID : I<(outs), (ins i32imm:$flags, variable_ops),
+ [],
+ "call_indirect\t", 0x11>;
+} // Uses = [SP32,SP64], isCall = 1
+
+} // Defs = [ARGUMENTS]
+
+// Patterns for matching a direct call to a global address.
+def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_I32 tglobaladdr:$callee)>;
+def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_I64 tglobaladdr:$callee)>;
+def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_F32 tglobaladdr:$callee)>;
+def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_F64 tglobaladdr:$callee)>;
+def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_v16i8 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_v8i16 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_v4i32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_v4f32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)),
+ (CALL_VOID tglobaladdr:$callee)>;
+
+// Patterns for matching a direct call to an external symbol.
+def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_I32 texternalsym:$callee)>;
+def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_I64 texternalsym:$callee)>;
+def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_F32 texternalsym:$callee)>;
+def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_F64 texternalsym:$callee)>;
+def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_v16i8 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_v8i16 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_v4i32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_v4f32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)),
+ (CALL_VOID texternalsym:$callee)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
new file mode 100644
index 000000000000..1146431e6b77
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -0,0 +1,114 @@
+//===- WebAssemblyInstrControl.td-WebAssembly control-flow ------*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly control-flow code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+// The condition operand is a boolean value which WebAssembly represents as i32.
+def BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
+ [(brcond I32:$cond, bb:$dst)],
+ "br_if \t$dst, $cond", 0x0d>;
+let isCodeGenOnly = 1 in
+def BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond), []>;
+let isBarrier = 1 in {
+def BR : I<(outs), (ins bb_op:$dst),
+ [(br bb:$dst)],
+ "br \t$dst", 0x0c>;
+} // isBarrier = 1
+} // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
+ (BR_IF bb_op:$dst, I32:$cond)>;
+def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
+ (BR_UNLESS bb_op:$dst, I32:$cond)>;
+
+let Defs = [ARGUMENTS] in {
+
+// TODO: SelectionDAG's lowering insists on using a pointer as the index for
+// jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
+// currently.
+// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates.
+// Set TSFlags{1} to 1 to indicate that the immediates represent labels.
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+def BR_TABLE_I32 : I<(outs), (ins I32:$index, variable_ops),
+ [(WebAssemblybr_table I32:$index)],
+ "br_table \t$index", 0x0e> {
+ let TSFlags{0} = 1; // variable_ops are immediates.
+ let TSFlags{1} = 1; // Those immediates represent labels.
+}
+def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
+ [(WebAssemblybr_table I64:$index)],
+ "br_table \t$index"> { // Unlike the I32 form, no trailing 0x0e argument is given.
+ let TSFlags{0} = 1;
+ let TSFlags{1} = 1;
+}
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+
+// Placemarkers to indicate the start or end of a block or loop scope. These
+// use/clobber VALUE_STACK to prevent them from being moved into the middle of
+// an expression tree.
+let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
+def BLOCK : I<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>; // Scope openers take a Signature operand.
+def LOOP : I<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>;
+
+// END_BLOCK and END_LOOP are represented with the same opcode in wasm.
+def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>; // Scope closers take no operands.
+def END_LOOP : I<(outs), (ins), [], "end_loop", 0x0b>;
+} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
+
+multiclass RETURN<WebAssemblyRegClass vt> { // Return instructions for a value in register class vt.
+ def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)],
+ "return \t$val", 0x0f>;
+ // Equivalent to RETURN_#vt, for use at the end of a function when wasm
+ // semantics return by falling off the end of the block.
+ let isCodeGenOnly = 1 in
+ def FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), []>; // No pattern and no asm string.
+}
+
+multiclass SIMD_RETURN<ValueType vt> {
+ def RETURN_#vt : SIMD_I<(outs), (ins V128:$val),
+ [(WebAssemblyreturn (vt V128:$val))],
+ "return \t$val", 0x0f>;
+ // Equivalent to RETURN_#vt, for use at the end of a function when wasm
+ // semantics return by falling off the end of the block.
+ let isCodeGenOnly = 1 in
+ def FALLTHROUGH_RETURN_#vt : SIMD_I<(outs), (ins V128:$val), []>;
+}
+
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+
+let isReturn = 1 in {
+ defm : RETURN<I32>;
+ defm : RETURN<I64>;
+ defm : RETURN<F32>;
+ defm : RETURN<F64>;
+ defm : SIMD_RETURN<v16i8>;
+ defm : SIMD_RETURN<v8i16>;
+ defm : SIMD_RETURN<v4i32>;
+ defm : SIMD_RETURN<v4f32>;
+
+ def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return", 0x0f>;
+
+ // This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt.
+ let isCodeGenOnly = 1 in
+ def FALLTHROUGH_RETURN_VOID : I<(outs), (ins), []>;
+} // isReturn = 1
+
+def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable", 0x00>;
+
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+
+} // Defs = [ARGUMENTS]
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
new file mode 100644
index 000000000000..29483ba663d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -0,0 +1,111 @@
+//===-- WebAssemblyInstrConv.td-WebAssembly Conversion support -*- tablegen -*-=
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly datatype conversions, truncations, reinterpretations,
+/// promotions, and demotions operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+def I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src),
+ [(set I32:$dst, (trunc I64:$src))], // Selected for i64 -> i32 trunc.
+ "i32.wrap/i64\t$dst, $src", 0xa7>;
+
+def I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src),
+ [(set I64:$dst, (sext I32:$src))], // Selected for i32 -> i64 sext.
+ "i64.extend_s/i32\t$dst, $src", 0xac>;
+def I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src),
+ [(set I64:$dst, (zext I32:$src))], // Selected for i32 -> i64 zext.
+ "i64.extend_u/i32\t$dst, $src", 0xad>;
+
+} // Defs = [ARGUMENTS]
+
+// Expand a "don't care" extend into zero-extend (chosen over sign-extend
+// somewhat arbitrarily, although it favors popular hardware architectures
+// and is conceptually a simpler operation).
+def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Conversion from floating point to integer traps on overflow and invalid.
+let hasSideEffects = 1 in {
+def I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src),
+ [(set I32:$dst, (fp_to_sint F32:$src))],
+ "i32.trunc_s/f32\t$dst, $src", 0xa8>;
+def I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src),
+ [(set I32:$dst, (fp_to_uint F32:$src))],
+ "i32.trunc_u/f32\t$dst, $src", 0xa9>;
+def I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src),
+ [(set I64:$dst, (fp_to_sint F32:$src))],
+ "i64.trunc_s/f32\t$dst, $src", 0xae>;
+def I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src),
+ [(set I64:$dst, (fp_to_uint F32:$src))],
+ "i64.trunc_u/f32\t$dst, $src", 0xaf>;
+def I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src),
+ [(set I32:$dst, (fp_to_sint F64:$src))],
+ "i32.trunc_s/f64\t$dst, $src", 0xaa>;
+def I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src),
+ [(set I32:$dst, (fp_to_uint F64:$src))],
+ "i32.trunc_u/f64\t$dst, $src", 0xab>;
+def I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src),
+ [(set I64:$dst, (fp_to_sint F64:$src))],
+ "i64.trunc_s/f64\t$dst, $src", 0xb0>;
+def I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src),
+ [(set I64:$dst, (fp_to_uint F64:$src))],
+ "i64.trunc_u/f64\t$dst, $src", 0xb1>;
+} // hasSideEffects = 1
+
+def F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src),
+ [(set F32:$dst, (sint_to_fp I32:$src))],
+ "f32.convert_s/i32\t$dst, $src", 0xb2>;
+def F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src),
+ [(set F32:$dst, (uint_to_fp I32:$src))],
+ "f32.convert_u/i32\t$dst, $src", 0xb3>;
+def F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src),
+ [(set F64:$dst, (sint_to_fp I32:$src))],
+ "f64.convert_s/i32\t$dst, $src", 0xb7>;
+def F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src),
+ [(set F64:$dst, (uint_to_fp I32:$src))],
+ "f64.convert_u/i32\t$dst, $src", 0xb8>;
+def F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src),
+ [(set F32:$dst, (sint_to_fp I64:$src))],
+ "f32.convert_s/i64\t$dst, $src", 0xb4>;
+def F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src),
+ [(set F32:$dst, (uint_to_fp I64:$src))],
+ "f32.convert_u/i64\t$dst, $src", 0xb5>;
+def F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src),
+ [(set F64:$dst, (sint_to_fp I64:$src))],
+ "f64.convert_s/i64\t$dst, $src", 0xb9>;
+def F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src),
+ [(set F64:$dst, (uint_to_fp I64:$src))],
+ "f64.convert_u/i64\t$dst, $src", 0xba>;
+
+def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src),
+ [(set F64:$dst, (fpextend F32:$src))],
+ "f64.promote/f32\t$dst, $src", 0xbb>;
+def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src),
+ [(set F32:$dst, (fpround F64:$src))],
+ "f32.demote/f64\t$dst, $src", 0xb6>;
+
+def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src),
+ [(set I32:$dst, (bitconvert F32:$src))],
+ "i32.reinterpret/f32\t$dst, $src", 0xbc>;
+def F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src),
+ [(set F32:$dst, (bitconvert I32:$src))],
+ "f32.reinterpret/i32\t$dst, $src", 0xbe>;
+def I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src),
+ [(set I64:$dst, (bitconvert F64:$src))],
+ "i64.reinterpret/f64\t$dst, $src", 0xbd>;
+def F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src),
+ [(set F64:$dst, (bitconvert I64:$src))],
+ "f64.reinterpret/i64\t$dst, $src", 0xbf>;
+
+} // Defs = [ARGUMENTS]
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
new file mode 100644
index 000000000000..030be0862a56
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -0,0 +1,101 @@
+// WebAssemblyInstrFloat.td-WebAssembly Float codegen support ---*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Floating-point operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+let isCommutable = 1 in
+defm ADD : BinaryFP<fadd, "add ", 0x92, 0xa0>;
+defm SUB : BinaryFP<fsub, "sub ", 0x93, 0xa1>;
+let isCommutable = 1 in
+defm MUL : BinaryFP<fmul, "mul ", 0x94, 0xa2>;
+defm DIV : BinaryFP<fdiv, "div ", 0x95, 0xa3>;
+defm SQRT : UnaryFP<fsqrt, "sqrt", 0x91, 0x9f>;
+
+defm ABS : UnaryFP<fabs, "abs ", 0x8b, 0x99>;
+defm NEG : UnaryFP<fneg, "neg ", 0x8c, 0x9a>;
+defm COPYSIGN : BinaryFP<fcopysign, "copysign", 0x98, 0xa6>;
+
+let isCommutable = 1 in {
+defm MIN : BinaryFP<fminnan, "min ", 0x96, 0xa4>;
+defm MAX : BinaryFP<fmaxnan, "max ", 0x97, 0xa5>;
+} // isCommutable = 1
+
+defm CEIL : UnaryFP<fceil, "ceil", 0x8d, 0x9b>;
+defm FLOOR : UnaryFP<ffloor, "floor", 0x8e, 0x9c>;
+defm TRUNC : UnaryFP<ftrunc, "trunc", 0x8f, 0x9d>;
+defm NEAREST : UnaryFP<fnearbyint, "nearest", 0x90, 0x9e>;
+
+} // Defs = [ARGUMENTS]
+
+// DAGCombine oddly folds casts into the rhs of copysign. Unfold them.
+def : Pat<(fcopysign F64:$lhs, F32:$rhs),
+ (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>;
+def : Pat<(fcopysign F32:$lhs, F64:$rhs),
+ (COPYSIGN_F32 F32:$lhs, (F32_DEMOTE_F64 F64:$rhs))>;
+
+// WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint.
+def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>;
+def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>;
+
+let Defs = [ARGUMENTS] in {
+
+let isCommutable = 1 in {
+defm EQ : ComparisonFP<SETOEQ, "eq ", 0x5b, 0x61>;
+defm NE : ComparisonFP<SETUNE, "ne ", 0x5c, 0x62>;
+} // isCommutable = 1
+defm LT : ComparisonFP<SETOLT, "lt ", 0x5d, 0x63>;
+defm LE : ComparisonFP<SETOLE, "le ", 0x5e, 0x64>;
+defm GT : ComparisonFP<SETOGT, "gt ", 0x5f, 0x65>;
+defm GE : ComparisonFP<SETOGE, "ge ", 0x60, 0x66>;
+
+} // Defs = [ARGUMENTS]
+
+// Don't care floating-point comparisons, supported via other comparisons.
+def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setlt f32:$lhs, f32:$rhs), (LT_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setle f32:$lhs, f32:$rhs), (LE_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setgt f32:$lhs, f32:$rhs), (GT_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setge f32:$lhs, f32:$rhs), (GE_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(seteq f64:$lhs, f64:$rhs), (EQ_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setne f64:$lhs, f64:$rhs), (NE_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setlt f64:$lhs, f64:$rhs), (LT_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
+
+let Defs = [ARGUMENTS] in {
+
+def SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
+ [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
+ "f32.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+def SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
+ [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
+ "f64.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+
+} // Defs = [ARGUMENTS]
+
+// ISD::SELECT requires its operand to conform to getBooleanContents, but
+// WebAssembly's select interprets any non-zero value as true, so we can fold
+// a setne with 0 into a select.
+def : Pat<(select (i32 (setne I32:$cond, 0)), F32:$lhs, F32:$rhs),
+ (SELECT_F32 F32:$lhs, F32:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (setne I32:$cond, 0)), F64:$lhs, F64:$rhs),
+ (SELECT_F64 F64:$lhs, F64:$rhs, I32:$cond)>;
+
+// And again, this time with seteq instead of setne and the arms reversed.
+def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs),
+ (SELECT_F32 F32:$rhs, F32:$lhs, I32:$cond)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs),
+ (SELECT_F64 F64:$rhs, F64:$lhs, I32:$cond)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
new file mode 100644
index 000000000000..5b2498402571
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -0,0 +1,102 @@
+//=- WebAssemblyInstrFormats.td - WebAssembly Instr. Formats -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly instruction format definitions.
+///
+//===----------------------------------------------------------------------===//
+
+// WebAssembly Instruction Format.
+class WebAssemblyInst<bits<32> inst, string asmstr> : Instruction {
+ field bits<32> Inst = inst; // Instruction encoding.
+ let Namespace = "WebAssembly";
+ let Pattern = [];
+ let AsmString = asmstr;
+}
+
+// Normal instructions.
+class I<dag oops, dag iops, list<dag> pattern, string asmstr = "", bits<32> inst = -1>
+ : WebAssemblyInst<inst, asmstr> {
+ dag OutOperandList = oops;
+ dag InOperandList = iops;
+ let Pattern = pattern;
+}
+
+class SIMD_I<dag oops, dag iops, list<dag> pattern,
+ string asmstr = "", bits<32> inst = -1>
+ : I<oops, iops, pattern, asmstr, inst>, Requires<[HasSIMD128]>;
+
+// Unary and binary instructions, for the local types that WebAssembly supports.
+multiclass UnaryInt<SDNode node, string name, bits<32> i32Inst, bits<32> i64Inst> {
+ def _I32 : I<(outs I32:$dst), (ins I32:$src),
+ [(set I32:$dst, (node I32:$src))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $src")), i32Inst>;
+ def _I64 : I<(outs I64:$dst), (ins I64:$src),
+ [(set I64:$dst, (node I64:$src))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $src")), i64Inst>;
+}
+multiclass BinaryInt<SDNode node, string name, bits<32> i32Inst, bits<32> i64Inst> {
+ def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
+ [(set I32:$dst, (node I32:$lhs, I32:$rhs))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")), i32Inst>;
+ def _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs),
+ [(set I64:$dst, (node I64:$lhs, I64:$rhs))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")), i64Inst>;
+}
+multiclass UnaryFP<SDNode node, string name, bits<32> f32Inst, bits<32> f64Inst> {
+ def _F32 : I<(outs F32:$dst), (ins F32:$src),
+ [(set F32:$dst, (node F32:$src))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $src")), f32Inst>;
+ def _F64 : I<(outs F64:$dst), (ins F64:$src),
+ [(set F64:$dst, (node F64:$src))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $src")), f64Inst>;
+}
+multiclass BinaryFP<SDNode node, string name, bits<32> f32Inst, bits<32> f64Inst> {
+ def _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs),
+ [(set F32:$dst, (node F32:$lhs, F32:$rhs))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")), f32Inst>;
+ def _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs),
+ [(set F64:$dst, (node F64:$lhs, F64:$rhs))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")), f64Inst>;
+}
+multiclass SIMDBinary<SDNode node, SDNode fnode, string name> {
+ def _I8x16 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ [(set (v16i8 V128:$dst), (node V128:$lhs, V128:$rhs))],
+ !strconcat("i8x16.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+ def _I16x8 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ [(set (v8i16 V128:$dst), (node V128:$lhs, V128:$rhs))],
+ !strconcat("i16x8.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+ def _I32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ [(set (v4i32 V128:$dst), (node V128:$lhs, V128:$rhs))],
+ !strconcat("i32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+ def _F32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ [(set (v4f32 V128:$dst), (fnode V128:$lhs, V128:$rhs))],
+ !strconcat("f32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+
+}
+multiclass ComparisonInt<CondCode cond, string name, bits<32> i32Inst, bits<32> i64Inst> {
+ def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
+ [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ i32Inst>;
+ def _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs),
+ [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ i64Inst>;
+}
+multiclass ComparisonFP<CondCode cond, string name, bits<32> f32Inst, bits<32> f64Inst> {
+ def _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs),
+ [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ f32Inst>;
+ def _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs),
+ [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ f64Inst>;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
new file mode 100644
index 000000000000..0e2d8bbaf64c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -0,0 +1,204 @@
+//===-- WebAssemblyInstrInfo.cpp - WebAssembly Instruction Information ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the WebAssembly implementation of the
+/// TargetInstrInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyInstrInfo.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-instr-info"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "WebAssemblyGenInstrInfo.inc"
+
+WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
+ : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
+ WebAssembly::ADJCALLSTACKUP),
+ RI(STI.getTargetTriple()) {}
+
+bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI, AliasAnalysis *AA) const {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CONST_I32:
+ case WebAssembly::CONST_I64:
+ case WebAssembly::CONST_F32:
+ case WebAssembly::CONST_F64:
+ // isReallyTriviallyReMaterializableGeneric misses these because of the
+ // ARGUMENTS implicit def, so we manually override it here.
+ return true;
+ default:
+ return false;
+ }
+}
+
+void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ // This method is called by post-RA expansion, which expects only physical
+ // registers to exist. However, we need to handle virtual registers here too.
+ auto &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isVirtualRegister(DestReg)
+ ? MRI.getRegClass(DestReg)
+ : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg);
+
+ unsigned CopyOpcode;
+ if (RC == &WebAssembly::I32RegClass)
+ CopyOpcode = WebAssembly::COPY_I32;
+ else if (RC == &WebAssembly::I64RegClass)
+ CopyOpcode = WebAssembly::COPY_I64;
+ else if (RC == &WebAssembly::F32RegClass)
+ CopyOpcode = WebAssembly::COPY_F32;
+ else if (RC == &WebAssembly::F64RegClass)
+ CopyOpcode = WebAssembly::COPY_F64;
+ else
+ llvm_unreachable("Unexpected register class");
+
+ BuildMI(MBB, I, DL, get(CopyOpcode), DestReg)
+ .addReg(SrcReg, KillSrc ? RegState::Kill : 0);
+}
+
+MachineInstr *
+WebAssemblyInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ // If the operands are stackified, we can't reorder them.
+ WebAssemblyFunctionInfo &MFI =
+ *MI.getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>();
+ if (MFI.isVRegStackified(MI.getOperand(OpIdx1).getReg()) ||
+ MFI.isVRegStackified(MI.getOperand(OpIdx2).getReg()))
+ return nullptr;
+
+ // Otherwise use the default implementation.
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
+// Branch analysis.
+bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool /*AllowModify*/) const {
+ bool HaveCond = false;
+ for (MachineInstr &MI : MBB.terminators()) {
+ switch (MI.getOpcode()) {
+ default:
+ // Unhandled instruction; bail out.
+ return true;
+ case WebAssembly::BR_IF:
+ if (HaveCond)
+ return true;
+ // If we're running after CFGStackify, we can't optimize further.
+ if (!MI.getOperand(0).isMBB())
+ return true;
+ Cond.push_back(MachineOperand::CreateImm(true));
+ Cond.push_back(MI.getOperand(1));
+ TBB = MI.getOperand(0).getMBB();
+ HaveCond = true;
+ break;
+ case WebAssembly::BR_UNLESS:
+ if (HaveCond)
+ return true;
+ // If we're running after CFGStackify, we can't optimize further.
+ if (!MI.getOperand(0).isMBB())
+ return true;
+ Cond.push_back(MachineOperand::CreateImm(false));
+ Cond.push_back(MI.getOperand(1));
+ TBB = MI.getOperand(0).getMBB();
+ HaveCond = true;
+ break;
+ case WebAssembly::BR:
+ // If we're running after CFGStackify, we can't optimize further.
+ if (!MI.getOperand(0).isMBB())
+ return true;
+ if (!HaveCond)
+ TBB = MI.getOperand(0).getMBB();
+ else
+ FBB = MI.getOperand(0).getMBB();
+ break;
+ }
+ if (MI.isBarrier())
+ break;
+ }
+
+ return false;
+}
+
+unsigned WebAssemblyInstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ assert(!BytesRemoved && "code size not handled");
+
+ MachineBasicBlock::instr_iterator I = MBB.instr_end();
+ unsigned Count = 0;
+
+ while (I != MBB.instr_begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+ if (!I->isTerminator())
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.instr_end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned WebAssemblyInstrInfo::insertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded) const {
+ assert(!BytesAdded && "code size not handled");
+
+ if (Cond.empty()) {
+ if (!TBB)
+ return 0;
+
+ BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(TBB);
+ return 1;
+ }
+
+ assert(Cond.size() == 2 && "Expected a flag and a successor block");
+
+ if (Cond[0].getImm()) {
+ BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).addOperand(Cond[1]);
+ } else {
+ BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS))
+ .addMBB(TBB)
+ .addOperand(Cond[1]);
+ }
+ if (!FBB)
+ return 1;
+
+ BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(FBB);
+ return 2;
+}
+
+bool WebAssemblyInstrInfo::reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 2 && "Expected a flag and a successor block");
+ Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm());
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
new file mode 100644
index 000000000000..df6c937a364b
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -0,0 +1,63 @@
+//=- WebAssemblyInstrInfo.h - WebAssembly Instruction Information -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the WebAssembly implementation of the
+/// TargetInstrInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYINSTRINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYINSTRINFO_H
+
+#include "WebAssemblyRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "WebAssemblyGenInstrInfo.inc"
+
+namespace llvm {
+
+class WebAssemblySubtarget;
+
+class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo {
+ const WebAssemblyRegisterInfo RI;
+
+public:
+ explicit WebAssemblyInstrInfo(const WebAssemblySubtarget &STI);
+
+ const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; }
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
new file mode 100644
index 000000000000..dcfd1a42c6aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -0,0 +1,222 @@
+// WebAssemblyInstrInfo.td-Describe the WebAssembly Instructions-*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Instruction definitions.
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+
+def HasAddr32 : Predicate<"!Subtarget->hasAddr64()">;
+def HasAddr64 : Predicate<"Subtarget->hasAddr64()">;
+def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
+ AssemblerPredicate<"FeatureSIMD128", "simd128">;
+
+//===----------------------------------------------------------------------===//
+// WebAssembly-specific DAG Node Types.
+//===----------------------------------------------------------------------===//
+
+def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_WebAssemblyCallSeqEnd :
+ SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>;
+def SDT_WebAssemblyBrTable : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
+def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>;
+def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<0>]>;
+
+//===----------------------------------------------------------------------===//
+// WebAssembly-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def WebAssemblycallseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_WebAssemblyCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def WebAssemblycallseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_WebAssemblyCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0",
+ SDT_WebAssemblyCall0,
+ [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1",
+ SDT_WebAssemblyCall1,
+ [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE",
+ SDT_WebAssemblyBrTable,
+ [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT",
+ SDT_WebAssemblyArgument>;
+def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN",
+ SDT_WebAssemblyReturn, [SDNPHasChain]>;
+def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper",
+ SDT_WebAssemblyWrapper>;
+
+//===----------------------------------------------------------------------===//
+// WebAssembly-specific Operands.
+//===----------------------------------------------------------------------===//
+
+let OperandNamespace = "WebAssembly" in {
+
+let OperandType = "OPERAND_BASIC_BLOCK" in
+def bb_op : Operand<OtherVT>;
+
+let OperandType = "OPERAND_LOCAL" in
+def local_op : Operand<i32>;
+
+let OperandType = "OPERAND_I32IMM" in
+def i32imm_op : Operand<i32>;
+
+let OperandType = "OPERAND_I64IMM" in
+def i64imm_op : Operand<i64>;
+
+let OperandType = "OPERAND_F32IMM" in
+def f32imm_op : Operand<f32>;
+
+let OperandType = "OPERAND_F64IMM" in
+def f64imm_op : Operand<f64>;
+
+let OperandType = "OPERAND_FUNCTION32" in
+def function32_op : Operand<i32>;
+
+let OperandType = "OPERAND_OFFSET32" in
+def offset32_op : Operand<i32>;
+
+let OperandType = "OPERAND_P2ALIGN" in {
+def P2Align : Operand<i32> {
+ let PrintMethod = "printWebAssemblyP2AlignOperand";
+}
+} // OperandType = "OPERAND_P2ALIGN"
+
+let OperandType = "OPERAND_SIGNATURE" in {
+def Signature : Operand<i32> {
+ let PrintMethod = "printWebAssemblySignatureOperand";
+}
+} // OperandType = "OPERAND_SIGNATURE"
+
+} // OperandNamespace = "WebAssembly"
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Instruction Format Definitions.
+//===----------------------------------------------------------------------===//
+
+include "WebAssemblyInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Additional instructions.
+//===----------------------------------------------------------------------===//
+
+multiclass ARGUMENT<WebAssemblyRegClass vt> {
+ let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+ def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
+ [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
+}
+multiclass SIMD_ARGUMENT<ValueType vt> {
+ let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+ def ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
+ [(set (vt V128:$res),
+ (WebAssemblyargument timm:$argno))]>;
+}
+defm : ARGUMENT<I32>;
+defm : ARGUMENT<I64>;
+defm : ARGUMENT<F32>;
+defm : ARGUMENT<F64>;
+defm : SIMD_ARGUMENT<v16i8>;
+defm : SIMD_ARGUMENT<v8i16>;
+defm : SIMD_ARGUMENT<v4i32>;
+defm : SIMD_ARGUMENT<v4f32>;
+
+let Defs = [ARGUMENTS] in {
+
+// get_local and set_local are not generated by instruction selection; they
+// are implied by virtual register uses and defs.
+multiclass LOCAL<WebAssemblyRegClass vt> {
+let hasSideEffects = 0 in {
+ // COPY is not an actual instruction in wasm, but since we allow get_local and
+ // set_local to be implicit during most of codegen, we can have a COPY which
+ // is actually a no-op because all the work is done in the implied get_local
+ // and set_local. COPYs are eliminated (and replaced with
+ // get_local/set_local) in the ExplicitLocals pass.
+ let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
+ def COPY_#vt : I<(outs vt:$res), (ins vt:$src), [], "copy_local\t$res, $src">;
+
+ // TEE is similar to COPY, but writes two copies of its result. Typically
+ // this would be used to stackify one result and write the other result to a
+ // local.
+ let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
+ def TEE_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), [],
+ "tee_local\t$res, $also, $src">;
+
+ // This is the actual get_local instruction in wasm. These are made explicit
+ // by the ExplicitLocals pass. It has mayLoad because it reads from a wasm
+ // local, which is a side effect not otherwise modeled in LLVM.
+ let mayLoad = 1, isAsCheapAsAMove = 1 in
+ def GET_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local), [],
+ "get_local\t$res, $local", 0x20>;
+
+ // This is the actual set_local instruction in wasm. These are made explicit
+ // by the ExplicitLocals pass. It has mayStore because it writes to a wasm
+ // local, which is a side effect not otherwise modeled in LLVM.
+ let mayStore = 1, isAsCheapAsAMove = 1 in
+ def SET_LOCAL_#vt : I<(outs), (ins local_op:$local, vt:$src), [],
+ "set_local\t$local, $src", 0x21>;
+
+ // This is the actual tee_local instruction in wasm. TEEs are turned into
+ // TEE_LOCALs by the ExplicitLocals pass. It has mayStore for the same reason
+ // as SET_LOCAL.
+ let mayStore = 1, isAsCheapAsAMove = 1 in
+ def TEE_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src), [],
+ "tee_local\t$res, $local, $src", 0x22>;
+
+} // hasSideEffects = 0
+}
+defm : LOCAL<I32>;
+defm : LOCAL<I64>;
+defm : LOCAL<F32>;
+defm : LOCAL<F64>;
+defm : LOCAL<V128>, Requires<[HasSIMD128]>;
+
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
+def CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
+ [(set I32:$res, imm:$imm)],
+ "i32.const\t$res, $imm", 0x41>;
+def CONST_I64 : I<(outs I64:$res), (ins i64imm_op:$imm),
+ [(set I64:$res, imm:$imm)],
+ "i64.const\t$res, $imm", 0x42>;
+def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm),
+ [(set F32:$res, fpimm:$imm)],
+ "f32.const\t$res, $imm", 0x43>;
+def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
+ [(set F64:$res, fpimm:$imm)],
+ "f64.const\t$res, $imm", 0x44>;
+} // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
+ (CONST_I32 tglobaladdr:$addr)>;
+def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
+ (CONST_I32 texternalsym:$addr)>;
+
+//===----------------------------------------------------------------------===//
+// Additional sets of instructions.
+//===----------------------------------------------------------------------===//
+
+include "WebAssemblyInstrMemory.td"
+include "WebAssemblyInstrCall.td"
+include "WebAssemblyInstrControl.td"
+include "WebAssemblyInstrInteger.td"
+include "WebAssemblyInstrConv.td"
+include "WebAssemblyInstrFloat.td"
+include "WebAssemblyInstrAtomics.td"
+include "WebAssemblyInstrSIMD.td"
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
new file mode 100644
index 000000000000..8a3248ee669e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -0,0 +1,97 @@
+// WebAssemblyInstrInteger.td-WebAssembly Integer codegen -------*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Integer operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+// The spaces after the names are for aesthetic purposes only, to make
+// operands line up vertically after tab expansion.
+let isCommutable = 1 in
+defm ADD : BinaryInt<add, "add ", 0x6a, 0x7c>;
+defm SUB : BinaryInt<sub, "sub ", 0x6b, 0x7d>;
+let isCommutable = 1 in
+defm MUL : BinaryInt<mul, "mul ", 0x6c, 0x7e>;
+// Divide and remainder trap on a zero denominator.
+let hasSideEffects = 1 in {
+defm DIV_S : BinaryInt<sdiv, "div_s", 0x6d, 0x7f>;
+defm DIV_U : BinaryInt<udiv, "div_u", 0x6e, 0x80>;
+defm REM_S : BinaryInt<srem, "rem_s", 0x6f, 0x81>;
+defm REM_U : BinaryInt<urem, "rem_u", 0x70, 0x82>;
+} // hasSideEffects = 1
+let isCommutable = 1 in {
+defm AND : BinaryInt<and, "and ", 0x71, 0x83>;
+defm OR : BinaryInt<or, "or ", 0x72, 0x84>;
+defm XOR : BinaryInt<xor, "xor ", 0x73, 0x85>;
+} // isCommutable = 1
+defm SHL : BinaryInt<shl, "shl ", 0x74, 0x86>;
+defm SHR_S : BinaryInt<sra, "shr_s", 0x75, 0x87>;
+defm SHR_U : BinaryInt<srl, "shr_u", 0x76, 0x88>;
+defm ROTL : BinaryInt<rotl, "rotl", 0x77, 0x89>;
+defm ROTR : BinaryInt<rotr, "rotr", 0x78, 0x8a>;
+
+let isCommutable = 1 in {
+defm EQ : ComparisonInt<SETEQ, "eq ", 0x46, 0x68>;
+defm NE : ComparisonInt<SETNE, "ne ", 0x47, 0x69>;
+} // isCommutable = 1
+defm LT_S : ComparisonInt<SETLT, "lt_s", 0x48, 0x53>;
+defm LT_U : ComparisonInt<SETULT, "lt_u", 0x49, 0x54>;
+defm GT_S : ComparisonInt<SETGT, "gt_s", 0x4a, 0x55>;
+defm GT_U : ComparisonInt<SETUGT, "gt_u", 0x4b, 0x56>;
+defm LE_S : ComparisonInt<SETLE, "le_s", 0x4c, 0x57>;
+defm LE_U : ComparisonInt<SETULE, "le_u", 0x4d, 0x58>;
+defm GE_S : ComparisonInt<SETGE, "ge_s", 0x4e, 0x59>;
+defm GE_U : ComparisonInt<SETUGE, "ge_u", 0x4f, 0x5a>;
+
+defm CLZ : UnaryInt<ctlz, "clz ", 0x67, 0x79>;
+defm CTZ : UnaryInt<cttz, "ctz ", 0x68, 0x7a>;
+defm POPCNT : UnaryInt<ctpop, "popcnt", 0x69, 0x7b>;
+
+def EQZ_I32 : I<(outs I32:$dst), (ins I32:$src),
+ [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
+ "i32.eqz \t$dst, $src", 0x45>;
+def EQZ_I64 : I<(outs I32:$dst), (ins I64:$src),
+ [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
+ "i64.eqz \t$dst, $src", 0x50>;
+
+} // Defs = [ARGUMENTS]
+
+// Optimize away an explicit mask on a rotate count.
+def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
+def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
+
+let Defs = [ARGUMENTS] in {
+
+def SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
+ [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
+ "i32.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+def SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
+ [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
+ "i64.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+
+} // Defs = [ARGUMENTS]
+
+// ISD::SELECT requires its operand to conform to getBooleanContents, but
+// WebAssembly's select interprets any non-zero value as true, so we can fold
+// a setne with 0 into a select.
+def : Pat<(select (i32 (setne I32:$cond, 0)), I32:$lhs, I32:$rhs),
+ (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (setne I32:$cond, 0)), I64:$lhs, I64:$rhs),
+ (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>;
+
+// And again, this time with seteq instead of setne and the arms reversed.
+def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs),
+ (SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs),
+ (SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
new file mode 100644
index 000000000000..b606ebb0a68d
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -0,0 +1,686 @@
+// WebAssemblyInstrMemory.td-WebAssembly Memory codegen support -*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Memory operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO:
+// - HasAddr64
+// - WebAssemblyTargetLowering having to do with atomics
+// - Each has optional alignment.
+
+// WebAssembly has i8/i16/i32/i64/f32/f64 memory types, but doesn't have i8/i16
+// local types. These memory-only types instead zero- or sign-extend into local
+// types when loading, and truncate when storing.
+
+// WebAssembly constant offsets are performed as unsigned with infinite
+// precision, so we need to check for NoUnsignedWrap so that we don't fold an
+// offset for an add that needs wrapping.
+def regPlusImm : PatFrag<(ops node:$addr, node:$off),
+ (add node:$addr, node:$off),
+ [{ return N->getFlags()->hasNoUnsignedWrap(); }]>;
+
+// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+ APInt KnownZero0, KnownOne0;
+ CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+ APInt KnownZero1, KnownOne1;
+ CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+ return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
+// GlobalAddresses are conceptually unsigned values, so we can also fold them
+// into immediate values as long as the add is 'nuw'.
+// TODO: We'd like to also match GA offsets but there are cases where the
+// register can have a negative value. Find out what more we can do.
+def regPlusGA : PatFrag<(ops node:$addr, node:$off),
+ (add node:$addr, node:$off),
+ [{
+ return N->getFlags()->hasNoUnsignedWrap();
+}]>;
+
+// We don't need a regPlusES because external symbols never have constant
+// offsets folded into them, so we can just use add.
+
+let Defs = [ARGUMENTS] in {
+
+// Basic load.
+// FIXME: When we can break syntax compatibility, reorder the fields in the
+// asmstrings to match the binary encoding.
+def LOAD_I32 : I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i32.load\t$dst, ${off}(${addr})${p2align}", 0x28>;
+def LOAD_I64 : I<(outs I64:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i64.load\t$dst, ${off}(${addr})${p2align}", 0x29>;
+def LOAD_F32 : I<(outs F32:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "f32.load\t$dst, ${off}(${addr})${p2align}", 0x2a>;
+def LOAD_F64 : I<(outs F64:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "f64.load\t$dst, ${off}(${addr})${p2align}", 0x2b>;
+
+} // Defs = [ARGUMENTS]
+
+// Select loads with no constant offset.
+def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, 0, $addr)>;
+def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, 0, $addr)>;
+def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, 0, $addr)>;
+def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, 0, $addr)>;
+
+// Select loads with a constant offset.
+def : Pat<(i32 (load (regPlusImm I32:$addr, imm:$off))),
+ (LOAD_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (load (regPlusImm I32:$addr, imm:$off))),
+ (LOAD_I64 0, imm:$off, $addr)>;
+def : Pat<(f32 (load (regPlusImm I32:$addr, imm:$off))),
+ (LOAD_F32 0, imm:$off, $addr)>;
+def : Pat<(f64 (load (regPlusImm I32:$addr, imm:$off))),
+ (LOAD_F64 0, imm:$off, $addr)>;
+def : Pat<(i32 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_I64 0, imm:$off, $addr)>;
+def : Pat<(f32 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_F32 0, imm:$off, $addr)>;
+def : Pat<(f64 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_F64 0, imm:$off, $addr)>;
+def : Pat<(i32 (load (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (load (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(f32 (load (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD_F32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(f64 (load (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD_F64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(f32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD_F32 0, texternalsym:$off, $addr)>;
+def : Pat<(f64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD_F64 0, texternalsym:$off, $addr)>;
+
+// Select loads with just a constant offset.
+def : Pat<(i32 (load imm:$off)), (LOAD_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (load imm:$off)), (LOAD_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(f32 (load imm:$off)), (LOAD_F32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(f64 (load imm:$off)), (LOAD_F64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (load (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (load (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(f32 (load (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD_F32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(f64 (load (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD_F64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (load (WebAssemblywrapper texternalsym:$off))),
+ (LOAD_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (load (WebAssemblywrapper texternalsym:$off))),
+ (LOAD_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(f32 (load (WebAssemblywrapper texternalsym:$off))),
+ (LOAD_F32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(f64 (load (WebAssemblywrapper texternalsym:$off))),
+ (LOAD_F64 0, texternalsym:$off, (CONST_I32 0))>;
+
+let Defs = [ARGUMENTS] in {
+
+// Extending load.
+def LOAD8_S_I32 : I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i32.load8_s\t$dst, ${off}(${addr})${p2align}", 0x2c>;
+def LOAD8_U_I32 : I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i32.load8_u\t$dst, ${off}(${addr})${p2align}", 0x2d>;
+def LOAD16_S_I32 : I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i32.load16_s\t$dst, ${off}(${addr})${p2align}", 0x2e>;
+def LOAD16_U_I32 : I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i32.load16_u\t$dst, ${off}(${addr})${p2align}", 0x2f>;
+def LOAD8_S_I64 : I<(outs I64:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i64.load8_s\t$dst, ${off}(${addr})${p2align}", 0x30>;
+def LOAD8_U_I64 : I<(outs I64:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i64.load8_u\t$dst, ${off}(${addr})${p2align}", 0x31>;
+def LOAD16_S_I64 : I<(outs I64:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i64.load16_s\t$dst, ${off}(${addr})${p2align}", 0x32>;
+def LOAD16_U_I64 : I<(outs I64:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i64.load16_u\t$dst, ${off}(${addr})${p2align}", 0x33>;
+def LOAD32_S_I64 : I<(outs I64:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i64.load32_s\t$dst, ${off}(${addr})${p2align}", 0x34>;
+def LOAD32_U_I64 : I<(outs I64:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ [], "i64.load32_u\t$dst, ${off}(${addr})${p2align}", 0x35>;
+
+} // Defs = [ARGUMENTS]
+
+// Select extending loads with no constant offset.
+def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, 0, $addr)>;
+def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, 0, $addr)>;
+def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, 0, $addr)>;
+def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, 0, $addr)>;
+def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, 0, $addr)>;
+def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, 0, $addr)>;
+def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, 0, $addr)>;
+def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, 0, $addr)>;
+def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, 0, $addr)>;
+def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, 0, $addr)>;
+
+// Select extending loads with a constant offset.
+def : Pat<(i32 (sextloadi8 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD8_S_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD8_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD16_S_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD16_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD8_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD8_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD16_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD16_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD32_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD32_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_S_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_S_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (or_is_add I32:$addr, imm:$off))),
+ (LOAD32_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (or_is_add I32:$addr, imm:$off))),
+ (LOAD32_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi8 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD8_S_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD8_U_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD16_S_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD16_U_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD8_S_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD8_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD16_S_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD16_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD32_S_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD32_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (sextloadi8 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD8_S_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD8_U_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD16_S_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD16_U_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD8_S_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD8_U_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD16_S_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD16_U_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD32_S_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD32_U_I64 0, texternalsym:$off, $addr)>;
+
+// Select extending loads with just a constant offset.
+def : Pat<(i32 (sextloadi8 imm:$off)),
+ (LOAD8_S_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi8 imm:$off)),
+ (LOAD8_U_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi16 imm:$off)),
+ (LOAD16_S_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi16 imm:$off)),
+ (LOAD16_U_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi8 imm:$off)),
+ (LOAD8_S_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi8 imm:$off)),
+ (LOAD8_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi16 imm:$off)),
+ (LOAD16_S_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi16 imm:$off)),
+ (LOAD16_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi32 imm:$off)),
+ (LOAD32_S_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi32 imm:$off)),
+ (LOAD32_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_S_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_U_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_S_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_U_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_S_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_S_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD32_S_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD32_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_S_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_U_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_S_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_U_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_S_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_S_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi32 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD32_S_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi32 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD32_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+
+// Resolve "don't care" extending loads to zero-extending loads. This is
+// somewhat arbitrary, but zero-extending is conceptually simpler.
+
+// Select "don't care" extending loads with no constant offset.
+def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, 0, $addr)>;
+def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, 0, $addr)>;
+def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, 0, $addr)>;
+def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, 0, $addr)>;
+def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, 0, $addr)>;
+
+// Select "don't care" extending loads with a constant offset.
+def : Pat<(i32 (extloadi8 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD8_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD16_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD8_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD16_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (regPlusImm I32:$addr, imm:$off))),
+ (LOAD32_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i32 (extloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (or_is_add I32:$addr, imm:$off))),
+ (LOAD32_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i32 (extloadi8 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD8_U_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD16_U_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD8_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD16_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)))),
+ (LOAD32_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (extloadi8 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD8_U_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD16_U_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD8_U_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD16_U_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off)))),
+ (LOAD32_U_I64 0, texternalsym:$off, $addr)>;
+
+// Select "don't care" extending loads with just a constant offset.
+def : Pat<(i32 (extloadi8 imm:$off)),
+ (LOAD8_U_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 imm:$off)),
+ (LOAD16_U_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 imm:$off)),
+ (LOAD8_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 imm:$off)),
+ (LOAD16_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 imm:$off)),
+ (LOAD32_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_U_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_U_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD32_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_U_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_U_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD32_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+
+let Defs = [ARGUMENTS] in {
+
+// Basic store.
+// Note: WebAssembly inverts SelectionDAG's usual operand order.
+def STORE_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+ I32:$val), [],
+ "i32.store\t${off}(${addr})${p2align}, $val", 0x36>;
+def STORE_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+ I64:$val), [],
+ "i64.store\t${off}(${addr})${p2align}, $val", 0x37>;
+def STORE_F32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+ F32:$val), [],
+ "f32.store\t${off}(${addr})${p2align}, $val", 0x38>;
+def STORE_F64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+ F64:$val), [],
+ "f64.store\t${off}(${addr})${p2align}, $val", 0x39>;
+
+} // Defs = [ARGUMENTS]
+
+// Select stores with no constant offset.
+def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, 0, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, 0, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, 0, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, 0, I32:$addr, F64:$val)>;
+
+// Select stores with a constant offset.
+def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)),
+ (STORE_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)),
+ (STORE_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)),
+ (STORE_F32 0, imm:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)),
+ (STORE_F64 0, imm:$off, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_F32 0, imm:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_F64 0, imm:$off, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off))),
+ (STORE_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off))),
+ (STORE_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off))),
+ (STORE_F32 0, tglobaladdr:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off))),
+ (STORE_F64 0, tglobaladdr:$off, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off))),
+ (STORE_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off))),
+ (STORE_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off))),
+ (STORE_F32 0, texternalsym:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off))),
+ (STORE_F64 0, texternalsym:$off, I32:$addr, F64:$val)>;
+
+// Select stores with just a constant offset.
+def : Pat<(store I32:$val, imm:$off),
+ (STORE_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(store I64:$val, imm:$off),
+ (STORE_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(store F32:$val, imm:$off),
+ (STORE_F32 0, imm:$off, (CONST_I32 0), F32:$val)>;
+def : Pat<(store F64:$val, imm:$off),
+ (STORE_F64 0, imm:$off, (CONST_I32 0), F64:$val)>;
+def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_F32 0, tglobaladdr:$off, (CONST_I32 0), F32:$val)>;
+def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_F64 0, tglobaladdr:$off, (CONST_I32 0), F64:$val)>;
+def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE_F32 0, texternalsym:$off, (CONST_I32 0), F32:$val)>;
+def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE_F64 0, texternalsym:$off, (CONST_I32 0), F64:$val)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Truncating store.
+def STORE8_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+ I32:$val), [],
+ "i32.store8\t${off}(${addr})${p2align}, $val", 0x3a>;
+def STORE16_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+ I32:$val), [],
+ "i32.store16\t${off}(${addr})${p2align}, $val", 0x3b>;
+def STORE8_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+ I64:$val), [],
+ "i64.store8\t${off}(${addr})${p2align}, $val", 0x3c>;
+def STORE16_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+ I64:$val), [],
+ "i64.store16\t${off}(${addr})${p2align}, $val", 0x3d>;
+def STORE32_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+ I64:$val), [],
+ "i64.store32\t${off}(${addr})${p2align}, $val", 0x3e>;
+
+} // Defs = [ARGUMENTS]
+
+// Select truncating stores with no constant offset.
+def : Pat<(truncstorei8 I32:$val, I32:$addr),
+ (STORE8_I32 0, 0, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, I32:$addr),
+ (STORE16_I32 0, 0, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, I32:$addr),
+ (STORE8_I64 0, 0, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, I32:$addr),
+ (STORE16_I64 0, 0, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, I32:$addr),
+ (STORE32_I64 0, 0, I32:$addr, I64:$val)>;
+
+// Select truncating stores with a constant offset.
+def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)),
+ (STORE8_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)),
+ (STORE16_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)),
+ (STORE8_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)),
+ (STORE16_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)),
+ (STORE32_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE8_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE16_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE8_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE16_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE32_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val,
+ (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off))),
+ (STORE8_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val,
+ (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off))),
+ (STORE16_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val,
+ (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off))),
+ (STORE8_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val,
+ (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off))),
+ (STORE16_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val,
+ (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off))),
+ (STORE32_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off))),
+ (STORE8_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val,
+ (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off))),
+ (STORE16_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val,
+ (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off))),
+ (STORE8_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val,
+ (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off))),
+ (STORE16_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val,
+ (add I32:$addr,
+ (WebAssemblywrapper texternalsym:$off))),
+ (STORE32_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+
+// Select truncating stores with just a constant offset.
+def : Pat<(truncstorei8 I32:$val, imm:$off),
+ (STORE8_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, imm:$off),
+ (STORE16_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, imm:$off),
+ (STORE8_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, imm:$off),
+ (STORE16_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, imm:$off),
+ (STORE32_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE8_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE16_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE8_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE16_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE32_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE8_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE16_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE8_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE16_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE32_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Current memory size.
+def CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
+ [],
+ "current_memory\t$dst", 0x3f>,
+ Requires<[HasAddr32]>;
+
+// Grow memory.
+def GROW_MEMORY_I32 : I<(outs), (ins i32imm:$flags, I32:$delta),
+ [],
+ "grow_memory\t$delta", 0x40>,
+ Requires<[HasAddr32]>;
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(int_wasm_current_memory),
+ (CURRENT_MEMORY_I32 0)>;
+def : Pat<(int_wasm_grow_memory I32:$delta),
+ (GROW_MEMORY_I32 0, $delta)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
new file mode 100644
index 000000000000..e403534d580a
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -0,0 +1,19 @@
+// WebAssemblyInstrSIMD.td - WebAssembly SIMD codegen support -*- tablegen -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly SIMD operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1 in {
+defm ADD : SIMDBinary<add, fadd, "add ">;
+defm MUL: SIMDBinary<mul, fmul, "mul ">;
+} // isCommutable = 1
+defm SUB: SIMDBinary<sub, fsub, "sub ">;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
new file mode 100644
index 000000000000..7ea5d05a1b21
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -0,0 +1,128 @@
+//===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file lowers br_unless into br_if with an inverted condition.
+///
+/// br_unless is not currently in the spec, but it's very convenient for LLVM
+/// to use. This pass allows LLVM to use it, for now.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-lower-br_unless"
+
+namespace {
+class WebAssemblyLowerBrUnless final : public MachineFunctionPass {
+ StringRef getPassName() const override {
+ return "WebAssembly Lower br_unless";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyLowerBrUnless() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyLowerBrUnless::ID = 0;
+FunctionPass *llvm::createWebAssemblyLowerBrUnless() {
+ return new WebAssemblyLowerBrUnless();
+}
+
+bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Lowering br_unless **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &MRI = MF.getRegInfo();
+
+ for (auto &MBB : MF) {
+ for (auto MII = MBB.begin(); MII != MBB.end();) {
+ MachineInstr *MI = &*MII++;
+ if (MI->getOpcode() != WebAssembly::BR_UNLESS)
+ continue;
+
+ unsigned Cond = MI->getOperand(1).getReg();
+ bool Inverted = false;
+
+ // If the condition is stackified, try flipping its defining comparison.
+ if (MFI.isVRegStackified(Cond)) {
+ assert(MRI.hasOneDef(Cond));
+ MachineInstr *Def = MRI.getVRegDef(Cond);
+ switch (Def->getOpcode()) {
+ using namespace WebAssembly;
+ case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break;
+ case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break;
+ case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break;
+ case GE_S_I32: Def->setDesc(TII.get(LT_S_I32)); Inverted = true; break;
+ case LT_S_I32: Def->setDesc(TII.get(GE_S_I32)); Inverted = true; break;
+ case LE_S_I32: Def->setDesc(TII.get(GT_S_I32)); Inverted = true; break;
+ case GT_U_I32: Def->setDesc(TII.get(LE_U_I32)); Inverted = true; break;
+ case GE_U_I32: Def->setDesc(TII.get(LT_U_I32)); Inverted = true; break;
+ case LT_U_I32: Def->setDesc(TII.get(GE_U_I32)); Inverted = true; break;
+ case LE_U_I32: Def->setDesc(TII.get(GT_U_I32)); Inverted = true; break;
+ case EQ_I64: Def->setDesc(TII.get(NE_I64)); Inverted = true; break;
+ case NE_I64: Def->setDesc(TII.get(EQ_I64)); Inverted = true; break;
+ case GT_S_I64: Def->setDesc(TII.get(LE_S_I64)); Inverted = true; break;
+ case GE_S_I64: Def->setDesc(TII.get(LT_S_I64)); Inverted = true; break;
+ case LT_S_I64: Def->setDesc(TII.get(GE_S_I64)); Inverted = true; break;
+ case LE_S_I64: Def->setDesc(TII.get(GT_S_I64)); Inverted = true; break;
+ case GT_U_I64: Def->setDesc(TII.get(LE_U_I64)); Inverted = true; break;
+ case GE_U_I64: Def->setDesc(TII.get(LT_U_I64)); Inverted = true; break;
+ case LT_U_I64: Def->setDesc(TII.get(GE_U_I64)); Inverted = true; break;
+ case LE_U_I64: Def->setDesc(TII.get(GT_U_I64)); Inverted = true; break;
+ case EQ_F32: Def->setDesc(TII.get(NE_F32)); Inverted = true; break;
+ case NE_F32: Def->setDesc(TII.get(EQ_F32)); Inverted = true; break;
+ case EQ_F64: Def->setDesc(TII.get(NE_F64)); Inverted = true; break;
+ case NE_F64: Def->setDesc(TII.get(EQ_F64)); Inverted = true; break;
+ default: break;
+ }
+ }
+
+ // If we weren't able to invert the condition in place, insert an
+ // EQZ_I32 instruction to invert it.
+ if (!Inverted) {
+ unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp)
+ .addReg(Cond);
+ MFI.stackifyVReg(Tmp);
+ Cond = Tmp;
+ Inverted = true;
+ }
+
+ // The br_unless condition has now been inverted. Insert a br_if that
+ // reuses operand 0 of the br_unless, then delete the br_unless.
+ assert(Inverted);
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
+ .addOperand(MI->getOperand(0))
+ .addReg(Cond);
+ MBB.erase(MI);
+ }
+ }
+
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
new file mode 100644
index 000000000000..72cb1ccbe668
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -0,0 +1,1184 @@
+//=== WebAssemblyLowerEmscriptenEHSjLj.cpp - Lower exceptions for Emscripten =//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file lowers exception-related instructions and setjmp/longjmp
+/// function calls in order to use Emscripten's JavaScript try and catch
+/// mechanism.
+///
+/// To handle exceptions and setjmp/longjmps, this scheme relies on JavaScript's
+/// try and catch syntax and relevant exception-related libraries implemented
+/// in JavaScript glue code that will be produced by Emscripten. This is similar
+/// to the current Emscripten asm.js exception handling in fastcomp. For
+/// fastcomp's EH / SjLj scheme, see these files in fastcomp LLVM branch:
+/// (Location: https://github.com/kripken/emscripten-fastcomp)
+/// lib/Target/JSBackend/NaCl/LowerEmExceptionsPass.cpp
+/// lib/Target/JSBackend/NaCl/LowerEmSetjmp.cpp
+/// lib/Target/JSBackend/JSBackend.cpp
+/// lib/Target/JSBackend/CallHandlers.h
+///
+/// * Exception handling
+/// This pass lowers invokes and landingpads into library functions in JS glue
+/// code. Invokes are lowered into function wrappers called invoke wrappers that
+/// exist in JS side, which wraps the original function call with JS try-catch.
+/// If an exception occurred, cxa_throw() function in JS side sets some
+/// variables (see below) so we can check whether an exception occurred from
+/// wasm code and handle it appropriately.
+///
+/// * Setjmp-longjmp handling
+/// This pass lowers setjmp to a reasonably-performant approach for emscripten.
+/// The idea is that each block with a setjmp is broken up into two parts: the
+/// part containing setjmp and the part right after the setjmp. The latter part
+/// is either reached from the setjmp, or later from a longjmp. To handle the
+/// longjmp, all calls that might longjmp are also called using invoke wrappers
+/// and thus JS / try-catch. JS longjmp() function also sets some variables so
+/// we can check whether a longjmp occurred from wasm code. Each block with a
+/// function call that might longjmp is also split up after the longjmp call.
+/// After the longjmp call, we check whether a longjmp occurred, and if it did,
+/// which setjmp it corresponds to, and jump to the right post-setjmp block.
+/// We assume setjmp-longjmp handling always runs after EH handling, which means
+/// we don't expect any exception-related instructions when SjLj runs.
+/// FIXME Currently this scheme does not support indirect call of setjmp,
+/// because of the limitation of the scheme itself. fastcomp does not support it
+/// either.
+///
+/// In detail, this pass does the following things:
+///
+/// 1) Create three global variables: __THREW__, __threwValue, and __tempRet0.
+/// __tempRet0 will be set within __cxa_find_matching_catch() function in
+/// JS library, and __THREW__ and __threwValue will be set in invoke wrappers
+/// in JS glue code. For what invoke wrappers are, refer to 3). These
+/// variables are used for both exceptions and setjmp/longjmps.
+/// __THREW__ indicates whether an exception or a longjmp occurred or not. 0
+/// means nothing occurred, 1 means an exception occurred, and other numbers
+/// mean a longjmp occurred. In the case of longjmp, __threwValue variable
+/// indicates the corresponding setjmp buffer the longjmp corresponds to.
+/// In exception handling, __tempRet0 indicates the type of an exception
+/// caught, and in setjmp/longjmp, it means the second argument to longjmp
+/// function.
+///
+/// * Exception handling
+///
+/// 2) Create setThrew and setTempRet0 functions.
+/// The global variables created in 1) will exist in wasm address space,
+/// but their values should be set in JS code, so we provide these functions
+/// as interfaces to JS glue code. These functions are equivalent to the
+/// following JS functions, which actually exist in asm.js version of JS
+/// library.
+///
+/// function setThrew(threw, value) {
+/// if (__THREW__ == 0) {
+/// __THREW__ = threw;
+/// __threwValue = value;
+/// }
+/// }
+///
+/// function setTempRet0(value) {
+/// __tempRet0 = value;
+/// }
+///
+/// 3) Lower
+/// invoke @func(arg1, arg2) to label %invoke.cont unwind label %lpad
+/// into
+/// __THREW__ = 0;
+/// call @__invoke_SIG(func, arg1, arg2)
+/// %__THREW__.val = __THREW__;
+/// __THREW__ = 0;
+/// if (%__THREW__.val == 1)
+/// goto %lpad
+/// else
+/// goto %invoke.cont
+/// SIG is a mangled string generated based on the LLVM IR-level function
+/// signature. After LLVM IR types are lowered to the target wasm types,
+/// the names for these wrappers will change based on wasm types as well,
+/// as in invoke_vi (function takes an int and returns void). The bodies of
+/// these wrappers will be generated in JS glue code, and inside those
+/// wrappers we use JS try-catch to generate actual exception effects. It
+/// also calls the original callee function. An example wrapper in JS code
+/// would look like this:
+/// function invoke_vi(index,a1) {
+/// try {
+/// Module["dynCall_vi"](index,a1); // This calls original callee
+/// } catch(e) {
+/// if (typeof e !== 'number' && e !== 'longjmp') throw e;
+/// asm["setThrew"](1, 0); // setThrew is called here
+/// }
+/// }
+/// If an exception is thrown, __THREW__ will be set to true in a wrapper,
+/// so we can jump to the right BB based on this value.
+///
+/// 4) Lower
+/// %val = landingpad catch c1 catch c2 catch c3 ...
+/// ... use %val ...
+/// into
+/// %fmc = call @__cxa_find_matching_catch_N(c1, c2, c3, ...)
+/// %val = {%fmc, __tempRet0}
+/// ... use %val ...
+/// Here N is a number calculated based on the number of clauses.
+/// Global variable __tempRet0 is set within __cxa_find_matching_catch() in
+/// JS glue code.
+///
+/// 5) Lower
+/// resume {%a, %b}
+/// into
+/// call @__resumeException(%a)
+/// where __resumeException() is a function in JS glue code.
+///
+/// 6) Lower
+/// call @llvm.eh.typeid.for(type) (intrinsic)
+/// into
+/// call @llvm_eh_typeid_for(type)
+/// llvm_eh_typeid_for function will be generated in JS glue code.
+///
+/// * Setjmp / Longjmp handling
+///
+/// 7) In the function entry that calls setjmp, initialize setjmpTable and
+/// setjmpTableSize as follows:
+/// setjmpTableSize = 4;
+/// setjmpTable = (int *) malloc(40);
+/// setjmpTable[0] = 0;
+/// setjmpTable and setjmpTableSize are used in saveSetjmp() function in JS
+/// code.
+///
+/// 8) Lower
+/// setjmp(buf)
+/// into
+/// setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize);
+/// setjmpTableSize = __tempRet0;
+/// For each dynamic setjmp call, setjmpTable stores its ID (a number which
+/// is incrementally assigned from 0) and its label (a unique number that
+/// represents each callsite of setjmp). When we need more entries in
+/// setjmpTable, it is reallocated in saveSetjmp() in JS code and it will
+/// return the new table address, and assign the new table size in
+/// __tempRet0. saveSetjmp also stores the setjmp's ID into the buffer buf.
+/// A BB with setjmp is split into two after setjmp call in order to make the
+/// post-setjmp BB the possible destination of longjmp BB.
+///
+/// 9) Lower
+/// longjmp(buf, value)
+/// into
+/// emscripten_longjmp_jmpbuf(buf, value)
+/// emscripten_longjmp_jmpbuf will be lowered to emscripten_longjmp later.
+///
+/// 10) Lower every call that might longjmp into
+/// __THREW__ = 0;
+/// call @__invoke_SIG(func, arg1, arg2)
+/// %__THREW__.val = __THREW__;
+/// __THREW__ = 0;
+/// if (%__THREW__.val != 0 & __threwValue != 0) {
+/// %label = testSetjmp(mem[%__THREW__.val], setjmpTable,
+/// setjmpTableSize);
+/// if (%label == 0)
+/// emscripten_longjmp(%__THREW__.val, __threwValue);
+/// __tempRet0 = __threwValue;
+/// } else {
+/// %label = -1;
+/// }
+/// longjmp_result = __tempRet0;
+/// switch label {
+/// label 1: goto post-setjmp BB 1
+/// label 2: goto post-setjmp BB 2
+/// ...
+/// default: goto splitted next BB
+/// }
+/// testSetjmp examines setjmpTable to see if there is a matching setjmp
+/// call. After calling an invoke wrapper, if a longjmp occurred, __THREW__
+/// will be the address of matching jmp_buf buffer and __threwValue be the
+/// second argument to longjmp. mem[__THREW__.val] is a setjmp ID that is
+/// stored in saveSetjmp. testSetjmp returns a setjmp label, a unique ID to
+/// each setjmp callsite. Label 0 means this longjmp buffer does not
+/// correspond to one of the setjmp callsites in this function, so in this
+/// case we just chain the longjmp to the caller. (Here we call
+/// emscripten_longjmp, which is different from emscripten_longjmp_jmpbuf.
+/// emscripten_longjmp_jmpbuf takes jmp_buf as its first argument, while
+/// emscripten_longjmp takes an int. Both of them will eventually be lowered
+/// to emscripten_longjmp in s2wasm, but here we need two signatures - we
+/// can't translate an int value to a jmp_buf.)
+/// Label -1 means no longjmp occurred. Otherwise we jump to the right
+/// post-setjmp BB based on the label.
+///
+///===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-lower-em-ehsjlj"
+
+static cl::list<std::string>
+ EHWhitelist("emscripten-cxx-exceptions-whitelist",
+ cl::desc("The list of function names in which Emscripten-style "
+ "exception handling is enabled (see emscripten "
+ "EMSCRIPTEN_CATCHING_WHITELIST options)"),
+ cl::CommaSeparated);
+
+namespace {
+class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
+ static const char *ThrewGVName;
+ static const char *ThrewValueGVName;
+ static const char *TempRet0GVName;
+ static const char *ResumeFName;
+ static const char *EHTypeIDFName;
+ static const char *SetThrewFName;
+ static const char *SetTempRet0FName;
+ static const char *EmLongjmpFName;
+ static const char *EmLongjmpJmpbufFName;
+ static const char *SaveSetjmpFName;
+ static const char *TestSetjmpFName;
+ static const char *FindMatchingCatchPrefix;
+ static const char *InvokePrefix;
+
+ bool EnableEH; // Enable exception handling
+ bool EnableSjLj; // Enable setjmp/longjmp handling
+
+ GlobalVariable *ThrewGV;
+ GlobalVariable *ThrewValueGV;
+ GlobalVariable *TempRet0GV;
+ Function *ResumeF;
+ Function *EHTypeIDF;
+ Function *EmLongjmpF;
+ Function *EmLongjmpJmpbufF;
+ Function *SaveSetjmpF;
+ Function *TestSetjmpF;
+
+ // __cxa_find_matching_catch_N functions.
+ // Indexed by the number of clauses in an original landingpad instruction.
+ DenseMap<int, Function *> FindMatchingCatches;
+ // Map of <function signature string, invoke_ wrappers>
+ StringMap<Function *> InvokeWrappers;
+ // Set of whitelisted function names for exception handling
+ std::set<std::string> EHWhitelistSet;
+
+ StringRef getPassName() const override {
+ return "WebAssembly Lower Emscripten Exceptions";
+ }
+
+ bool runEHOnFunction(Function &F);
+ bool runSjLjOnFunction(Function &F);
+ Function *getFindMatchingCatch(Module &M, unsigned NumClauses);
+
+ template <typename CallOrInvoke> Value *wrapInvoke(CallOrInvoke *CI);
+ void wrapTestSetjmp(BasicBlock *BB, Instruction *InsertPt, Value *Threw,
+ Value *SetjmpTable, Value *SetjmpTableSize, Value *&Label,
+ Value *&LongjmpResult, BasicBlock *&EndBB);
+ template <typename CallOrInvoke> Function *getInvokeWrapper(CallOrInvoke *CI);
+
+ bool areAllExceptionsAllowed() const { return EHWhitelistSet.empty(); }
+ bool canLongjmp(Module &M, const Value *Callee) const;
+
+ void createSetThrewFunction(Module &M);
+ void createSetTempRet0Function(Module &M);
+
+ void rebuildSSA(Function &F);
+
+public:
+ static char ID;
+
+ WebAssemblyLowerEmscriptenEHSjLj(bool EnableEH = true, bool EnableSjLj = true)
+ : ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj),
+ ThrewGV(nullptr), ThrewValueGV(nullptr), TempRet0GV(nullptr),
+ ResumeF(nullptr), EHTypeIDF(nullptr), EmLongjmpF(nullptr),
+ EmLongjmpJmpbufF(nullptr), SaveSetjmpF(nullptr), TestSetjmpF(nullptr) {
+ EHWhitelistSet.insert(EHWhitelist.begin(), EHWhitelist.end());
+ }
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+};
+} // End anonymous namespace
+
+const char *WebAssemblyLowerEmscriptenEHSjLj::ThrewGVName = "__THREW__";
+const char *WebAssemblyLowerEmscriptenEHSjLj::ThrewValueGVName = "__threwValue";
+const char *WebAssemblyLowerEmscriptenEHSjLj::TempRet0GVName = "__tempRet0";
+const char *WebAssemblyLowerEmscriptenEHSjLj::ResumeFName = "__resumeException";
+const char *WebAssemblyLowerEmscriptenEHSjLj::EHTypeIDFName =
+ "llvm_eh_typeid_for";
+const char *WebAssemblyLowerEmscriptenEHSjLj::SetThrewFName = "setThrew";
+const char *WebAssemblyLowerEmscriptenEHSjLj::SetTempRet0FName = "setTempRet0";
+const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpFName =
+ "emscripten_longjmp";
+const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpJmpbufFName =
+ "emscripten_longjmp_jmpbuf";
+const char *WebAssemblyLowerEmscriptenEHSjLj::SaveSetjmpFName = "saveSetjmp";
+const char *WebAssemblyLowerEmscriptenEHSjLj::TestSetjmpFName = "testSetjmp";
+const char *WebAssemblyLowerEmscriptenEHSjLj::FindMatchingCatchPrefix =
+ "__cxa_find_matching_catch_";
+const char *WebAssemblyLowerEmscriptenEHSjLj::InvokePrefix = "__invoke_";
+
+char WebAssemblyLowerEmscriptenEHSjLj::ID = 0;
+INITIALIZE_PASS(WebAssemblyLowerEmscriptenEHSjLj, DEBUG_TYPE,
+ "WebAssembly Lower Emscripten Exceptions / Setjmp / Longjmp",
+ false, false)
+
+ModulePass *llvm::createWebAssemblyLowerEmscriptenEHSjLj(bool EnableEH,
+ bool EnableSjLj) {
+ return new WebAssemblyLowerEmscriptenEHSjLj(EnableEH, EnableSjLj);
+}
+
+static bool canThrow(const Value *V) {
+ if (const auto *F = dyn_cast<const Function>(V)) {
+ // Intrinsics cannot throw.
+ if (F->isIntrinsic())
+ return false;
+ StringRef Name = F->getName();
+ // Leave setjmp and longjmp (mostly) alone; we process them properly later.
+ if (Name == "setjmp" || Name == "longjmp")
+ return false;
+ return !F->doesNotThrow();
+ }
+ // Not a Function, so an indirect call: we can't tell, so assume it can throw.
+ return true;
+}
+
+// Returns an available name for a global value.
+// If the proposed name already exists in the module, adds '_' at the end of
+// the name until the name is available.
+static inline std::string createGlobalValueName(const Module &M,
+ const std::string &Propose) {
+ std::string Name = Propose;
+ while (M.getNamedGlobal(Name))
+ Name += "_";
+ return Name;
+}
+
+// Simple function name mangler.
+// This function takes LLVM's string representation of the return type and of
+// each parameter type and concatenates them with '_' (plus "_..." for varargs).
+// There are non-alphanumeric characters but llc is ok with it, and we need to
+// postprocess these names after the lowering phase anyway.
+static std::string getSignature(FunctionType *FTy) {
+ std::string Sig;
+ raw_string_ostream OS(Sig);
+ OS << *FTy->getReturnType();
+ for (Type *ParamTy : FTy->params())
+ OS << "_" << *ParamTy;
+ if (FTy->isVarArg())
+ OS << "_...";
+ Sig = OS.str();
+ Sig.erase(remove_if(Sig, isspace), Sig.end());
+ // When s2wasm parses .s file, a comma means the end of an argument. So a
+ // mangled function name can contain any character but a comma.
+ std::replace(Sig.begin(), Sig.end(), ',', '.');
+ return Sig;
+}
+
+// Returns __cxa_find_matching_catch_N function, where N = NumClauses + 2.
+// This is because a landingpad instruction contains two more arguments, a
+// personality function and a cleanup bit, and __cxa_find_matching_catch_N
+// functions are named after the number of arguments in the original landingpad
+// instruction.
+Function *
+WebAssemblyLowerEmscriptenEHSjLj::getFindMatchingCatch(Module &M,
+ unsigned NumClauses) {
+ if (FindMatchingCatches.count(NumClauses))
+ return FindMatchingCatches[NumClauses];
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+ SmallVector<Type *, 16> Args(NumClauses, Int8PtrTy);
+ FunctionType *FTy = FunctionType::get(Int8PtrTy, Args, false);
+ Function *F =
+ Function::Create(FTy, GlobalValue::ExternalLinkage,
+ FindMatchingCatchPrefix + Twine(NumClauses + 2), &M);
+ FindMatchingCatches[NumClauses] = F;
+ return F;
+}
+
+// Generate invoke wrapper sequence with preamble and postamble.
+// Preamble:
+// __THREW__ = 0;
+// Postamble:
+// %__THREW__.val = __THREW__; __THREW__ = 0;
+// Returns %__THREW__.val, which indicates whether an exception is thrown (or
+// whether longjmp occurred), for future use.
+template <typename CallOrInvoke>
+Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
+ LLVMContext &C = CI->getModule()->getContext();
+
+ // If we are calling a function that is noreturn, we must remove that
+ // attribute. The code we insert here does expect it to return, after we
+ // catch the exception.
+ if (CI->doesNotReturn()) {
+ if (auto *F = dyn_cast<Function>(CI->getCalledValue()))
+ F->removeFnAttr(Attribute::NoReturn);
+ CI->removeAttribute(AttributeSet::FunctionIndex, Attribute::NoReturn);
+ }
+
+ IRBuilder<> IRB(C);
+ IRB.SetInsertPoint(CI);
+
+ // Pre-invoke
+ // __THREW__ = 0;
+ IRB.CreateStore(IRB.getInt32(0), ThrewGV);
+
+ // Invoke function wrapper in JavaScript
+ SmallVector<Value *, 16> Args;
+ // Put the pointer to the callee as first argument, so it can be called
+ // within the invoke wrapper later
+ Args.push_back(CI->getCalledValue());
+ Args.append(CI->arg_begin(), CI->arg_end());
+ CallInst *NewCall = IRB.CreateCall(getInvokeWrapper(CI), Args);
+ NewCall->takeName(CI);
+ NewCall->setCallingConv(CI->getCallingConv());
+ NewCall->setDebugLoc(CI->getDebugLoc());
+
+ // Because we added the pointer to the callee as first argument, all
+ // argument attribute indices have to be incremented by one.
+ SmallVector<AttributeSet, 8> AttributesVec;
+ const AttributeSet &InvokePAL = CI->getAttributes();
+ CallSite::arg_iterator AI = CI->arg_begin();
+ unsigned i = 1; // Argument attribute index starts from 1
+ for (unsigned e = CI->getNumArgOperands(); i <= e; ++AI, ++i) {
+ if (InvokePAL.hasAttributes(i)) {
+ AttrBuilder B(InvokePAL, i);
+ AttributesVec.push_back(AttributeSet::get(C, i + 1, B));
+ }
+ }
+ // Add any return attributes.
+ if (InvokePAL.hasAttributes(AttributeSet::ReturnIndex))
+ AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getRetAttributes()));
+ // Add any function attributes.
+ if (InvokePAL.hasAttributes(AttributeSet::FunctionIndex))
+ AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getFnAttributes()));
+ // Reconstruct the AttributesList based on the vector we constructed.
+ AttributeSet NewCallPAL = AttributeSet::get(C, AttributesVec);
+ NewCall->setAttributes(NewCallPAL);
+
+ CI->replaceAllUsesWith(NewCall);
+
+ // Post-invoke
+ // %__THREW__.val = __THREW__; __THREW__ = 0;
+ Value *Threw = IRB.CreateLoad(ThrewGV, ThrewGV->getName() + ".val");
+ IRB.CreateStore(IRB.getInt32(0), ThrewGV);
+ return Threw;
+}
+
+// Get (or lazily declare and cache) the invoke wrapper matching CI's callee signature.
+template <typename CallOrInvoke>
+Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallOrInvoke *CI) {
+ Module *M = CI->getModule();
+ SmallVector<Type *, 16> ArgTys;
+ Value *Callee = CI->getCalledValue();
+ FunctionType *CalleeFTy;
+ if (auto *F = dyn_cast<Function>(Callee))
+ CalleeFTy = F->getFunctionType();
+ else {
+ auto *CalleeTy = cast<PointerType>(Callee->getType())->getElementType();
+ CalleeFTy = cast<FunctionType>(CalleeTy); // used unconditionally below
+ }
+
+ std::string Sig = getSignature(CalleeFTy);
+ if (InvokeWrappers.find(Sig) != InvokeWrappers.end())
+ return InvokeWrappers[Sig];
+
+ // Put the pointer to the callee as first argument
+ ArgTys.push_back(PointerType::getUnqual(CalleeFTy));
+ // Add argument types
+ ArgTys.append(CalleeFTy->param_begin(), CalleeFTy->param_end());
+
+ FunctionType *FTy = FunctionType::get(CalleeFTy->getReturnType(), ArgTys,
+ CalleeFTy->isVarArg());
+ Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage,
+ InvokePrefix + Sig, M);
+ InvokeWrappers[Sig] = F;
+ return F;
+}
+
+bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M,
+ const Value *Callee) const {
+ if (auto *CalleeF = dyn_cast<Function>(Callee))
+ if (CalleeF->isIntrinsic())
+ return false;
+
+ // The reason we include malloc/free here is to exclude the malloc/free
+ // calls generated in setjmp prep / cleanup routines.
+ Function *SetjmpF = M.getFunction("setjmp");
+ Function *MallocF = M.getFunction("malloc");
+ Function *FreeF = M.getFunction("free");
+ if (Callee == SetjmpF || Callee == MallocF || Callee == FreeF)
+ return false;
+
+ // These are functions in JS glue code
+ if (Callee == ResumeF || Callee == EHTypeIDF || Callee == SaveSetjmpF ||
+ Callee == TestSetjmpF)
+ return false;
+
+ // __cxa_find_matching_catch_N functions cannot longjmp
+ if (Callee->getName().startswith(FindMatchingCatchPrefix))
+ return false;
+
+ // Exception-catching related functions
+ Function *BeginCatchF = M.getFunction("__cxa_begin_catch");
+ Function *EndCatchF = M.getFunction("__cxa_end_catch");
+ Function *AllocExceptionF = M.getFunction("__cxa_allocate_exception");
+ Function *ThrowF = M.getFunction("__cxa_throw");
+ Function *TerminateF = M.getFunction("__clang_call_terminate");
+ if (Callee == BeginCatchF || Callee == EndCatchF ||
+ Callee == AllocExceptionF || Callee == ThrowF || Callee == TerminateF)
+ return false;
+
+ // Otherwise we don't know
+ return true;
+}
+
+// Generate testSetjmp function call sequence with preamble and postamble.
+// The code this generates is equivalent to the following JavaScript code:
+// if (%__THREW__.val != 0 & threwValue != 0) {
+// %label = _testSetjmp(mem[%__THREW__.val], setjmpTable, setjmpTableSize);
+// if (%label == 0)
+// emscripten_longjmp(%__THREW__.val, threwValue);
+// __tempRet0 = threwValue;
+// } else {
+// %label = -1;
+// }
+// %longjmp_result = __tempRet0;
+//
+// As output parameters, returns %label, %longjmp_result, and the BB the last
+// instruction (%longjmp_result = ...) is in.
+void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
+ BasicBlock *BB, Instruction *InsertPt, Value *Threw, Value *SetjmpTable,
+ Value *SetjmpTableSize, Value *&Label, Value *&LongjmpResult,
+ BasicBlock *&EndBB) {
+ Function *F = BB->getParent();
+ LLVMContext &C = BB->getModule()->getContext();
+ IRBuilder<> IRB(C);
+ IRB.SetInsertPoint(InsertPt);
+
+ // if (%__THREW__.val != 0 & threwValue != 0)
+ IRB.SetInsertPoint(BB);
+ BasicBlock *ThenBB1 = BasicBlock::Create(C, "if.then1", F);
+ BasicBlock *ElseBB1 = BasicBlock::Create(C, "if.else1", F);
+ BasicBlock *EndBB1 = BasicBlock::Create(C, "if.end", F);
+ Value *ThrewCmp = IRB.CreateICmpNE(Threw, IRB.getInt32(0));
+ Value *ThrewValue =
+ IRB.CreateLoad(ThrewValueGV, ThrewValueGV->getName() + ".val");
+ Value *ThrewValueCmp = IRB.CreateICmpNE(ThrewValue, IRB.getInt32(0));
+ Value *Cmp1 = IRB.CreateAnd(ThrewCmp, ThrewValueCmp, "cmp1");
+ IRB.CreateCondBr(Cmp1, ThenBB1, ElseBB1);
+
+ // %label = _testSetjmp(mem[%__THREW__.val], _setjmpTable, _setjmpTableSize);
+ // if (%label == 0)
+ IRB.SetInsertPoint(ThenBB1);
+ BasicBlock *ThenBB2 = BasicBlock::Create(C, "if.then2", F);
+ BasicBlock *EndBB2 = BasicBlock::Create(C, "if.end2", F);
+ Value *ThrewInt = IRB.CreateIntToPtr(Threw, Type::getInt32PtrTy(C),
+ Threw->getName() + ".i32p");
+ Value *LoadedThrew =
+ IRB.CreateLoad(ThrewInt, ThrewInt->getName() + ".loaded");
+ Value *ThenLabel = IRB.CreateCall(
+ TestSetjmpF, {LoadedThrew, SetjmpTable, SetjmpTableSize}, "label");
+ Value *Cmp2 = IRB.CreateICmpEQ(ThenLabel, IRB.getInt32(0));
+ IRB.CreateCondBr(Cmp2, ThenBB2, EndBB2);
+
+ // emscripten_longjmp(%__THREW__.val, threwValue);
+ IRB.SetInsertPoint(ThenBB2);
+ IRB.CreateCall(EmLongjmpF, {Threw, ThrewValue});
+ IRB.CreateUnreachable();
+
+ // __tempRet0 = threwValue;
+ IRB.SetInsertPoint(EndBB2);
+ IRB.CreateStore(ThrewValue, TempRet0GV);
+ IRB.CreateBr(EndBB1);
+
+ IRB.SetInsertPoint(ElseBB1);
+ IRB.CreateBr(EndBB1);
+
+ // longjmp_result = __tempRet0;
+ IRB.SetInsertPoint(EndBB1);
+ PHINode *LabelPHI = IRB.CreatePHI(IRB.getInt32Ty(), 2, "label");
+ LabelPHI->addIncoming(ThenLabel, EndBB2);
+
+ LabelPHI->addIncoming(IRB.getInt32(-1), ElseBB1);
+
+ // Output parameter assignment
+ Label = LabelPHI;
+ EndBB = EndBB1;
+ LongjmpResult = IRB.CreateLoad(TempRet0GV, "longjmp_result");
+}
+
+// Create the setThrew function. It stores its arguments into __THREW__ and
+// __threwValue, but only while __THREW__ is still 0:
+//   function setThrew(threw, value) {
+//     if (__THREW__ == 0) {
+//       __THREW__ = threw;
+//       __threwValue = value;
+//     }
+//   }
+void WebAssemblyLowerEmscriptenEHSjLj::createSetThrewFunction(Module &M) {
+  LLVMContext &C = M.getContext();
+  IRBuilder<> IRB(C);
+
+  // Look the name up among all global values, not only global variables: a
+  // clashing function would otherwise go unnoticed and the function created
+  // below would end up with an auto-renamed symbol instead of SetThrewFName.
+  assert(!M.getNamedValue(SetThrewFName) && "setThrew already exists");
+  Type *Params[] = {IRB.getInt32Ty(), IRB.getInt32Ty()};
+  FunctionType *FTy = FunctionType::get(IRB.getVoidTy(), Params, false);
+  Function *F =
+      Function::Create(FTy, GlobalValue::ExternalLinkage, SetThrewFName, &M);
+  Argument *Arg1 = &*(F->arg_begin());
+  Argument *Arg2 = &*(++F->arg_begin());
+  Arg1->setName("threw");
+  Arg2->setName("value");
+  BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+  BasicBlock *ThenBB = BasicBlock::Create(C, "if.then", F);
+  BasicBlock *EndBB = BasicBlock::Create(C, "if.end", F);
+
+  // entry: if (__THREW__ == 0) goto if.then; else goto if.end;
+  IRB.SetInsertPoint(EntryBB);
+  Value *Threw = IRB.CreateLoad(ThrewGV, ThrewGV->getName() + ".val");
+  Value *Cmp = IRB.CreateICmpEQ(Threw, IRB.getInt32(0), "cmp");
+  IRB.CreateCondBr(Cmp, ThenBB, EndBB);
+
+  // if.then: __THREW__ = threw; __threwValue = value;
+  IRB.SetInsertPoint(ThenBB);
+  IRB.CreateStore(Arg1, ThrewGV);
+  IRB.CreateStore(Arg2, ThrewValueGV);
+  IRB.CreateBr(EndBB);
+
+  // if.end: return;
+  IRB.SetInsertPoint(EndBB);
+  IRB.CreateRetVoid();
+}
+
+// Create the setTempRet0 function, which stores its single i32 argument into
+// __tempRet0:
+//   function setTempRet0(value) {
+//     __tempRet0 = value;
+//   }
+void WebAssemblyLowerEmscriptenEHSjLj::createSetTempRet0Function(Module &M) {
+  LLVMContext &C = M.getContext();
+  IRBuilder<> IRB(C);
+
+  // Look the name up among all global values, not only global variables: a
+  // clashing function would otherwise go unnoticed and the function created
+  // below would end up with an auto-renamed symbol instead of
+  // SetTempRet0FName.
+  assert(!M.getNamedValue(SetTempRet0FName) && "setTempRet0 already exists");
+  Type *Params[] = {IRB.getInt32Ty()};
+  FunctionType *FTy = FunctionType::get(IRB.getVoidTy(), Params, false);
+  Function *F =
+      Function::Create(FTy, GlobalValue::ExternalLinkage, SetTempRet0FName, &M);
+  Argument *ValueArg = &*F->arg_begin();
+  ValueArg->setName("value");
+  BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+  IRB.SetInsertPoint(EntryBB);
+  IRB.CreateStore(ValueArg, TempRet0GV);
+  IRB.CreateRetVoid();
+}
+
+// Restore SSA form in F after the CFG has been modified. For every
+// instruction, each use that lives in another block and is no longer
+// dominated by the definition is rewritten through an SSAUpdater seeded with
+// the definition as the only available value.
+void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+  DT.recalculate(F); // CFG has been changed
+  SSAUpdater SSA;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      for (auto UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
+        // Grab the use and advance first: rewriting the use below takes it
+        // off I's use list.
+        Use &U = *UI;
+        ++UI;
+
+        Instruction *User = cast<Instruction>(U.getUser());
+        // Uses inside the defining block are left alone.
+        if (User->getParent() == &BB)
+          continue;
+
+        // So are PHI operands flowing in from the defining block.
+        if (PHINode *UserPN = dyn_cast<PHINode>(User))
+          if (UserPN->getIncomingBlock(U) == &BB)
+            continue;
+
+        // The definition still dominates this use; nothing to repair.
+        if (DT.dominates(&I, User))
+          continue;
+
+        // Only set the updater up once we know a rewrite is needed; doing it
+        // for every skipped use as well was wasted work.
+        SSA.Initialize(I.getType(), I.getName());
+        SSA.AddAvailableValue(&BB, &I);
+        SSA.RewriteUseAfterInsertions(U);
+      }
+    }
+  }
+}
+
+// Pass entry point. Creates the __THREW__, threwValue and __tempRet0 globals,
+// then lowers exception handling (when EnableEH is set) and setjmp/longjmp
+// (when EnableSjLj is set and setjmp or longjmp has uses). If nothing was
+// changed, everything created here is erased again and false is returned;
+// otherwise the setThrew / setTempRet0 helpers are emitted and true is
+// returned.
+bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
+  LLVMContext &C = M.getContext();
+  IRBuilder<> IRB(C);
+
+  Function *SetjmpF = M.getFunction("setjmp");
+  Function *LongjmpF = M.getFunction("longjmp");
+  bool SetjmpUsed = SetjmpF && !SetjmpF->use_empty();
+  bool LongjmpUsed = LongjmpF && !LongjmpF->use_empty();
+  bool DoSjLj = EnableSjLj && (SetjmpUsed || LongjmpUsed);
+
+  // Create global variables __THREW__, threwValue, and __tempRet0, which are
+  // used in common for both exception handling and setjmp/longjmp handling
+  ThrewGV = new GlobalVariable(M, IRB.getInt32Ty(), false,
+                               GlobalValue::ExternalLinkage, IRB.getInt32(0),
+                               createGlobalValueName(M, ThrewGVName));
+  ThrewValueGV = new GlobalVariable(
+      M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, IRB.getInt32(0),
+      createGlobalValueName(M, ThrewValueGVName));
+  TempRet0GV = new GlobalVariable(M, IRB.getInt32Ty(), false,
+                                  GlobalValue::ExternalLinkage, IRB.getInt32(0),
+                                  createGlobalValueName(M, TempRet0GVName));
+
+  bool Changed = false;
+
+  // Exception handling
+  if (EnableEH) {
+    // Register __resumeException function
+    FunctionType *ResumeFTy =
+        FunctionType::get(IRB.getVoidTy(), IRB.getInt8PtrTy(), false);
+    ResumeF = Function::Create(ResumeFTy, GlobalValue::ExternalLinkage,
+                               ResumeFName, &M);
+
+    // Register llvm_eh_typeid_for function
+    FunctionType *EHTypeIDTy =
+        FunctionType::get(IRB.getInt32Ty(), IRB.getInt8PtrTy(), false);
+    EHTypeIDF = Function::Create(EHTypeIDTy, GlobalValue::ExternalLinkage,
+                                 EHTypeIDFName, &M);
+
+    for (Function &F : M) {
+      if (F.isDeclaration())
+        continue;
+      Changed |= runEHOnFunction(F);
+    }
+  }
+
+  // Setjmp/longjmp handling
+  if (DoSjLj) {
+    Changed = true; // We have setjmp or longjmp somewhere
+
+    Function *MallocF = M.getFunction("malloc");
+    Function *FreeF = M.getFunction("free");
+    if (!MallocF || !FreeF)
+      report_fatal_error(
+          "malloc and free must be linked into the module if setjmp is used");
+
+    // DoSjLj can be true because only longjmp is used, in which case the
+    // module may not declare setjmp at all. Everything that needs SetjmpF is
+    // therefore guarded.
+    if (SetjmpF) {
+      // Register saveSetjmp function
+      FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
+      SmallVector<Type *, 4> SaveParams = {SetjmpFTy->getParamType(0),
+                                           IRB.getInt32Ty(),
+                                           Type::getInt32PtrTy(C),
+                                           IRB.getInt32Ty()};
+      FunctionType *SaveFTy =
+          FunctionType::get(Type::getInt32PtrTy(C), SaveParams, false);
+      SaveSetjmpF = Function::Create(SaveFTy, GlobalValue::ExternalLinkage,
+                                     SaveSetjmpFName, &M);
+    }
+
+    // Register testSetjmp function
+    SmallVector<Type *, 4> Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C),
+                                     IRB.getInt32Ty()};
+    FunctionType *FTy = FunctionType::get(IRB.getInt32Ty(), Params, false);
+    TestSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                   TestSetjmpFName, &M);
+
+    if (LongjmpF) {
+      // Replace all uses of longjmp with emscripten_longjmp_jmpbuf, which is
+      // defined in JS code
+      EmLongjmpJmpbufF = Function::Create(LongjmpF->getFunctionType(),
+                                          GlobalValue::ExternalLinkage,
+                                          EmLongjmpJmpbufFName, &M);
+
+      LongjmpF->replaceAllUsesWith(EmLongjmpJmpbufF);
+    }
+    FTy = FunctionType::get(IRB.getVoidTy(),
+                            {IRB.getInt32Ty(), IRB.getInt32Ty()}, false);
+    EmLongjmpF =
+        Function::Create(FTy, GlobalValue::ExternalLinkage, EmLongjmpFName, &M);
+
+    if (SetjmpF) {
+      // Only traverse functions that use setjmp in order not to insert
+      // unnecessary prep / cleanup code in every function
+      SmallPtrSet<Function *, 8> SetjmpUsers;
+      for (User *U : SetjmpF->users()) {
+        auto *UI = cast<Instruction>(U);
+        SetjmpUsers.insert(UI->getFunction());
+      }
+      for (Function *F : SetjmpUsers)
+        runSjLjOnFunction(*F);
+    }
+  }
+
+  if (!Changed) {
+    // Delete unused global variables and functions
+    ThrewGV->eraseFromParent();
+    ThrewValueGV->eraseFromParent();
+    TempRet0GV->eraseFromParent();
+    if (ResumeF)
+      ResumeF->eraseFromParent();
+    if (EHTypeIDF)
+      EHTypeIDF->eraseFromParent();
+    if (EmLongjmpF)
+      EmLongjmpF->eraseFromParent();
+    if (SaveSetjmpF)
+      SaveSetjmpF->eraseFromParent();
+    if (TestSetjmpF)
+      TestSetjmpF->eraseFromParent();
+    return false;
+  }
+
+  // If we have made any changes while doing exception handling or
+  // setjmp/longjmp handling, we have to create these functions for JavaScript
+  // to call.
+  createSetThrewFunction(M);
+  createSetTempRet0Function(M);
+
+  return true;
+}
+
+// Lower the exception handling constructs of F: invokes become either a
+// wrapped invoke plus a branch on __THREW__ or a plain call + branch, resume
+// instructions become calls to ResumeF, llvm.eh.typeid.for calls are
+// redirected to EHTypeIDF, and landingpads are replaced by a call to a
+// __cxa_find_matching_catch_N function paired with the value of __tempRet0.
+// Returns true if any of these rewrites happened.
+bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
+  Module &M = *F.getParent();
+  LLVMContext &C = F.getContext();
+  IRBuilder<> IRB(C);
+  bool Changed = false;
+  SmallVector<Instruction *, 64> ToErase;
+  SmallPtrSet<LandingPadInst *, 32> LandingPads;
+  bool AllowExceptions =
+      areAllExceptionsAllowed() || EHWhitelistSet.count(F.getName());
+
+  for (BasicBlock &BB : F) {
+    auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
+    if (!II)
+      continue;
+    Changed = true;
+    LandingPads.insert(II->getLandingPadInst());
+    IRB.SetInsertPoint(II);
+
+    bool NeedInvoke = AllowExceptions && canThrow(II->getCalledValue());
+    if (NeedInvoke) {
+      // Wrap invoke with invoke wrapper and generate preamble/postamble
+      Value *Threw = wrapInvoke(II);
+      ToErase.push_back(II);
+
+      // Insert a branch based on __THREW__ variable
+      Value *Cmp = IRB.CreateICmpEQ(Threw, IRB.getInt32(1), "cmp");
+      IRB.CreateCondBr(Cmp, II->getUnwindDest(), II->getNormalDest());
+
+    } else {
+      // This can't throw, and we don't need this invoke, just replace it with a
+      // call+branch
+      SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
+      CallInst *NewCall = IRB.CreateCall(II->getCalledValue(), Args);
+      NewCall->takeName(II);
+      NewCall->setCallingConv(II->getCallingConv());
+      NewCall->setDebugLoc(II->getDebugLoc());
+      NewCall->setAttributes(II->getAttributes());
+      II->replaceAllUsesWith(NewCall);
+      ToErase.push_back(II);
+
+      IRB.CreateBr(II->getNormalDest());
+
+      // Remove any PHI node entries from the exception destination
+      II->getUnwindDest()->removePredecessor(&BB);
+    }
+  }
+
+  // Process resume instructions
+  for (BasicBlock &BB : F) {
+    // Scan the body of the basic block for resumes
+    for (Instruction &I : BB) {
+      auto *RI = dyn_cast<ResumeInst>(&I);
+      if (!RI)
+        continue;
+      // The function now calls ResumeF, so the caller must not treat it as
+      // untouched (it would erase ResumeF while it still has uses).
+      Changed = true;
+
+      // Split the input into legal values
+      Value *Input = RI->getValue();
+      IRB.SetInsertPoint(RI);
+      Value *Low = IRB.CreateExtractValue(Input, 0, "low");
+      // Create a call to __resumeException function
+      IRB.CreateCall(ResumeF, {Low});
+      // Add a terminator to the block
+      IRB.CreateUnreachable();
+      ToErase.push_back(RI);
+    }
+  }
+
+  // Process llvm.eh.typeid.for intrinsics
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      auto *CI = dyn_cast<CallInst>(&I);
+      if (!CI)
+        continue;
+      const Function *Callee = CI->getCalledFunction();
+      if (!Callee)
+        continue;
+      if (Callee->getIntrinsicID() != Intrinsic::eh_typeid_for)
+        continue;
+      // Same reasoning as for resume: EHTypeIDF gains a use here.
+      Changed = true;
+
+      IRB.SetInsertPoint(CI);
+      CallInst *NewCI =
+          IRB.CreateCall(EHTypeIDF, CI->getArgOperand(0), "typeid");
+      CI->replaceAllUsesWith(NewCI);
+      ToErase.push_back(CI);
+    }
+  }
+
+  // Look for orphan landingpads, can occur in blocks with no predecessors
+  for (BasicBlock &BB : F) {
+    Instruction *I = BB.getFirstNonPHI();
+    if (auto *LPI = dyn_cast<LandingPadInst>(I))
+      LandingPads.insert(LPI);
+  }
+
+  // Handle all the landingpad for this function together, as multiple invokes
+  // may share a single lp
+  for (LandingPadInst *LPI : LandingPads) {
+    // Orphan landingpads reach this loop without any invoke having set the
+    // flag, yet they are rewritten all the same.
+    Changed = true;
+
+    IRB.SetInsertPoint(LPI);
+    SmallVector<Value *, 16> FMCArgs;
+    for (unsigned i = 0, e = LPI->getNumClauses(); i < e; ++i) {
+      Constant *Clause = LPI->getClause(i);
+      // As a temporary workaround for the lack of aggregate varargs support
+      // in the interface between JS and wasm, break out filter operands into
+      // their component elements.
+      if (LPI->isFilter(i)) {
+        auto *ATy = cast<ArrayType>(Clause->getType());
+        for (unsigned j = 0, je = ATy->getNumElements(); j < je; ++j) {
+          Value *EV = IRB.CreateExtractValue(Clause, makeArrayRef(j), "filter");
+          FMCArgs.push_back(EV);
+        }
+      } else
+        FMCArgs.push_back(Clause);
+    }
+
+    // Create a call to __cxa_find_matching_catch_N function
+    Function *FMCF = getFindMatchingCatch(M, FMCArgs.size());
+    CallInst *FMCI = IRB.CreateCall(FMCF, FMCArgs, "fmc");
+    Value *Undef = UndefValue::get(LPI->getType());
+    Value *Pair0 = IRB.CreateInsertValue(Undef, FMCI, 0, "pair0");
+    Value *TempRet0 =
+        IRB.CreateLoad(TempRet0GV, TempRet0GV->getName() + ".val");
+    Value *Pair1 = IRB.CreateInsertValue(Pair0, TempRet0, 1, "pair1");
+
+    LPI->replaceAllUsesWith(Pair1);
+    ToErase.push_back(LPI);
+  }
+
+  // Erase everything we no longer need in this function
+  for (Instruction *I : ToErase)
+    I->eraseFromParent();
+
+  return Changed;
+}
+
+// Lower setjmp/longjmp in F, a function that calls setjmp. The function gets
+// a malloc'ed setjmpTable in its entry block, every setjmp call is replaced
+// by a saveSetjmp call followed by a PHI producing the setjmp result, every
+// call that can longjmp is followed by a testSetjmp-based dispatch to the
+// matching setjmp tail, and the table is freed before each return. SSA form
+// is rebuilt at the end. Always returns true.
+bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
+  Module &M = *F.getParent();
+  LLVMContext &C = F.getContext();
+  IRBuilder<> IRB(C);
+  SmallVector<Instruction *, 64> ToErase;
+  // Vector of %setjmpTable values
+  std::vector<Instruction *> SetjmpTableInsts;
+  // Vector of %setjmpTableSize values
+  std::vector<Instruction *> SetjmpTableSizeInsts;
+
+  // Setjmp preparation
+
+  // This instruction effectively means %setjmpTableSize = 4.
+  // We create this as an instruction intentionally, and we don't want to fold
+  // this instruction to a constant 4, because this value will be used in
+  // SSAUpdater.AddAvailableValue(...) later.
+  BasicBlock &EntryBB = F.getEntryBlock();
+  BinaryOperator *SetjmpTableSize = BinaryOperator::Create(
+      Instruction::Add, IRB.getInt32(4), IRB.getInt32(0), "setjmpTableSize",
+      &*EntryBB.getFirstInsertionPt());
+  // setjmpTable = (int *) malloc(40);
+  Instruction *SetjmpTable = CallInst::CreateMalloc(
+      SetjmpTableSize, IRB.getInt32Ty(), IRB.getInt32Ty(), IRB.getInt32(40),
+      nullptr, nullptr, "setjmpTable");
+  // setjmpTable[0] = 0;
+  IRB.SetInsertPoint(SetjmpTableSize);
+  IRB.CreateStore(IRB.getInt32(0), SetjmpTable);
+  SetjmpTableInsts.push_back(SetjmpTable);
+  SetjmpTableSizeInsts.push_back(SetjmpTableSize);
+
+  // Setjmp transformation
+  std::vector<PHINode *> SetjmpRetPHIs;
+  Function *SetjmpF = M.getFunction("setjmp");
+  for (User *U : SetjmpF->users()) {
+    auto *CI = dyn_cast<CallInst>(U);
+    if (!CI)
+      report_fatal_error("Does not support indirect calls to setjmp");
+
+    BasicBlock *BB = CI->getParent();
+    if (BB->getParent() != &F) // in other function
+      continue;
+
+    // The tail is everything right after the call, and will be reached once
+    // when setjmp is called, and later when longjmp returns to the setjmp
+    BasicBlock *Tail = SplitBlock(BB, CI->getNextNode());
+    // Add a phi to the tail, which will be the output of setjmp, which
+    // indicates if this is the first call or a longjmp back. The phi directly
+    // uses the right value based on where we arrive from
+    IRB.SetInsertPoint(Tail->getFirstNonPHI());
+    PHINode *SetjmpRet = IRB.CreatePHI(IRB.getInt32Ty(), 2, "setjmp.ret");
+
+    // setjmp initial call returns 0
+    SetjmpRet->addIncoming(IRB.getInt32(0), BB);
+    // The proper output is now this, not the setjmp call itself
+    CI->replaceAllUsesWith(SetjmpRet);
+    // longjmp returns to the setjmp will add themselves to this phi
+    SetjmpRetPHIs.push_back(SetjmpRet);
+
+    // Fix call target
+    // Our index in the function is our place in the array + 1 to avoid index
+    // 0, because index 0 means the longjmp is not ours to handle.
+    IRB.SetInsertPoint(CI);
+    Value *Args[] = {CI->getArgOperand(0), IRB.getInt32(SetjmpRetPHIs.size()),
+                     SetjmpTable, SetjmpTableSize};
+    Instruction *NewSetjmpTable =
+        IRB.CreateCall(SaveSetjmpF, Args, "setjmpTable");
+    Instruction *NewSetjmpTableSize =
+        IRB.CreateLoad(TempRet0GV, "setjmpTableSize");
+    SetjmpTableInsts.push_back(NewSetjmpTable);
+    SetjmpTableSizeInsts.push_back(NewSetjmpTableSize);
+    ToErase.push_back(CI);
+  }
+
+  // Update each call that can longjmp so it can return to a setjmp where
+  // relevant.
+
+  // Because we are creating new BBs while processing and don't want to make
+  // all these newly created BBs candidates again for longjmp processing, we
+  // first make the vector of candidate BBs.
+  std::vector<BasicBlock *> BBs;
+  for (BasicBlock &BB : F)
+    BBs.push_back(&BB);
+
+  // BBs.size() will change within the loop, so we query it every time
+  for (unsigned i = 0; i < BBs.size(); i++) {
+    BasicBlock *BB = BBs[i];
+    for (Instruction &I : *BB) {
+      assert(!isa<InvokeInst>(&I));
+      auto *CI = dyn_cast<CallInst>(&I);
+      if (!CI)
+        continue;
+
+      const Value *Callee = CI->getCalledValue();
+      if (!canLongjmp(M, Callee))
+        continue;
+
+      Value *Threw = nullptr;
+      BasicBlock *Tail;
+      if (Callee->getName().startswith(InvokePrefix)) {
+        // If invoke wrapper has already been generated for this call in
+        // previous EH phase, search for the load instruction
+        // %__THREW__.val = __THREW__;
+        // in postamble after the invoke wrapper call
+        LoadInst *ThrewLI = nullptr;
+        StoreInst *ThrewResetSI = nullptr;
+        for (auto I = std::next(BasicBlock::iterator(CI)), IE = BB->end();
+             I != IE; ++I) {
+          if (auto *LI = dyn_cast<LoadInst>(I))
+            if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
+              if (GV == ThrewGV) {
+                Threw = ThrewLI = LI;
+                break;
+              }
+        }
+        // Check before ThrewLI is used as the starting point of the next
+        // search; a null ThrewLI must not reach std::next below.
+        assert(Threw && ThrewLI && "Cannot find __THREW__ load after invoke");
+        // Search for the store instruction after the load above
+        // __THREW__ = 0;
+        for (auto I = std::next(BasicBlock::iterator(ThrewLI)), IE = BB->end();
+             I != IE; ++I) {
+          if (auto *SI = dyn_cast<StoreInst>(I))
+            if (auto *GV = dyn_cast<GlobalVariable>(SI->getPointerOperand()))
+              if (GV == ThrewGV && SI->getValueOperand() == IRB.getInt32(0)) {
+                ThrewResetSI = SI;
+                break;
+              }
+        }
+        assert(ThrewResetSI && "Cannot find __THREW__ store after invoke");
+        Tail = SplitBlock(BB, ThrewResetSI->getNextNode());
+
+      } else {
+        // Wrap call with invoke wrapper and generate preamble/postamble
+        Threw = wrapInvoke(CI);
+        ToErase.push_back(CI);
+        Tail = SplitBlock(BB, CI->getNextNode());
+      }
+
+      // We need to replace the terminator in Tail - SplitBlock makes BB go
+      // straight to Tail, we need to check if a longjmp occurred, and go to the
+      // right setjmp-tail if so
+      ToErase.push_back(BB->getTerminator());
+
+      // Generate a function call to testSetjmp function and preamble/postamble
+      // code to figure out (1) whether longjmp occurred (2) if longjmp
+      // occurred, which setjmp it corresponds to
+      Value *Label = nullptr;
+      Value *LongjmpResult = nullptr;
+      BasicBlock *EndBB = nullptr;
+      wrapTestSetjmp(BB, CI, Threw, SetjmpTable, SetjmpTableSize, Label,
+                     LongjmpResult, EndBB);
+      assert(Label && LongjmpResult && EndBB);
+
+      // Create switch instruction
+      IRB.SetInsertPoint(EndBB);
+      SwitchInst *SI = IRB.CreateSwitch(Label, Tail, SetjmpRetPHIs.size());
+      // -1 means no longjmp happened, continue normally (will hit the default
+      // switch case). 0 means a longjmp that is not ours to handle, needs a
+      // rethrow. Otherwise the index is the same as the index in P+1 (to avoid
+      // 0).
+      for (unsigned PhiIdx = 0; PhiIdx < SetjmpRetPHIs.size(); PhiIdx++) {
+        SI->addCase(IRB.getInt32(PhiIdx + 1),
+                    SetjmpRetPHIs[PhiIdx]->getParent());
+        SetjmpRetPHIs[PhiIdx]->addIncoming(LongjmpResult, EndBB);
+      }
+
+      // We are splitting the block here, and must continue to find other calls
+      // in the block - which is now split. so continue to traverse in the Tail
+      BBs.push_back(Tail);
+    }
+  }
+
+  // Erase everything we no longer need in this function
+  for (Instruction *I : ToErase)
+    I->eraseFromParent();
+
+  // Free setjmpTable buffer before each return instruction
+  for (BasicBlock &BB : F) {
+    TerminatorInst *TI = BB.getTerminator();
+    if (isa<ReturnInst>(TI))
+      CallInst::CreateFree(SetjmpTable, TI);
+  }
+
+  // Every call to saveSetjmp can change setjmpTable and setjmpTableSize
+  // (when buffer reallocation occurs)
+  // entry:
+  //   setjmpTableSize = 4;
+  //   setjmpTable = (int *) malloc(40);
+  //   setjmpTable[0] = 0;
+  // ...
+  // somebb:
+  //   setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize);
+  //   setjmpTableSize = __tempRet0;
+  // So we need to make sure the SSA for these variables is valid so that every
+  // saveSetjmp and testSetjmp calls have the correct arguments.
+  SSAUpdater SetjmpTableSSA;
+  SSAUpdater SetjmpTableSizeSSA;
+  SetjmpTableSSA.Initialize(Type::getInt32PtrTy(C), "setjmpTable");
+  SetjmpTableSizeSSA.Initialize(Type::getInt32Ty(C), "setjmpTableSize");
+  for (Instruction *I : SetjmpTableInsts)
+    SetjmpTableSSA.AddAvailableValue(I->getParent(), I);
+  for (Instruction *I : SetjmpTableSizeInsts)
+    SetjmpTableSizeSSA.AddAvailableValue(I->getParent(), I);
+
+  for (auto UI = SetjmpTable->use_begin(), UE = SetjmpTable->use_end();
+       UI != UE;) {
+    // Grab the use before incrementing the iterator.
+    Use &U = *UI;
+    // Increment the iterator before removing the use from the list.
+    ++UI;
+    if (Instruction *I = dyn_cast<Instruction>(U.getUser()))
+      if (I->getParent() != &EntryBB)
+        SetjmpTableSSA.RewriteUse(U);
+  }
+  for (auto UI = SetjmpTableSize->use_begin(), UE = SetjmpTableSize->use_end();
+       UI != UE;) {
+    Use &U = *UI;
+    ++UI;
+    if (Instruction *I = dyn_cast<Instruction>(U.getUser()))
+      if (I->getParent() != &EntryBB)
+        SetjmpTableSizeSSA.RewriteUse(U);
+  }
+
+  // Finally, our modifications to the cfg can break dominance of SSA variables.
+  // For example, in this code,
+  //   if (x()) { .. setjmp() .. }
+  //   if (y()) { .. longjmp() .. }
+  // We must split the longjmp block, and it can jump into the block split off
+  // from the setjmp one. But that means that when we split the setjmp block,
+  // its first part no longer dominates its second part - there is a
+  // theoretically possible control flow path where x() is false, then y() is
+  // true and we reach the second part of the setjmp block, without ever
+  // reaching the first part. So, we rebuild SSA form here.
+  rebuildSSA(F);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
new file mode 100644
index 000000000000..022a448590ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -0,0 +1,115 @@
+// WebAssemblyMCInstLower.cpp - Convert WebAssembly MachineInstr to an MCInst //
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains code to lower WebAssembly MachineInstrs to their
+/// corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyMCInstLower.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// Ask the AsmPrinter for the symbol of the global value that \p MO refers to.
+MCSymbol *
+WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  const GlobalValue *Global = MO.getGlobal();
+  return Printer.getSymbol(Global);
+}
+
+/// Ask the AsmPrinter for the symbol matching the external symbol name
+/// carried by \p MO.
+MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
+    const MachineOperand &MO) const {
+  const char *SymbolName = MO.getSymbolName();
+  return Printer.GetExternalSymbolSymbol(SymbolName);
+}
+
+/// Build an expression operand referring to \p Sym. Function symbols get the
+/// VK_WebAssembly_FUNCTION variant kind, everything else VK_None. A non-zero
+/// \p Offset is added to the symbol reference; combining an offset with a
+/// function symbol is reported as a fatal error.
+MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
+                                                     int64_t Offset,
+                                                     bool IsFunc) const {
+  const MCExpr *SymRef = MCSymbolRefExpr::create(
+      Sym,
+      IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION
+             : MCSymbolRefExpr::VK_None,
+      Ctx);
+
+  // Plain symbol reference: nothing to add.
+  if (Offset == 0)
+    return MCOperand::createExpr(SymRef);
+
+  if (IsFunc)
+    report_fatal_error("Function addresses with offsets not supported");
+
+  const MCExpr *OffsetExpr = MCConstantExpr::create(Offset, Ctx);
+  return MCOperand::createExpr(
+      MCBinaryExpr::createAdd(SymRef, OffsetExpr, Ctx));
+}
+
+/// Translate \p MI into \p OutMI: the opcode is copied and every operand
+/// except implicit register operands is converted to the matching MCOperand
+/// and appended in order.
+void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
+                                   MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  const unsigned NumOperands = MI->getNumOperands();
+  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+    const MachineOperand &MO = MI->getOperand(OpIdx);
+
+    // Ignore all implicit register operands.
+    if (MO.getType() == MachineOperand::MO_Register && MO.isImplicit())
+      continue;
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    case MachineOperand::MO_Register: {
+      // Map the CodeGen register to its WebAssembly register number.
+      const MachineFunction *MF = MI->getParent()->getParent();
+      const WebAssemblyFunctionInfo &MFI =
+          *MF->getInfo<WebAssemblyFunctionInfo>();
+      MCOp = MCOperand::createReg(MFI.getWAReg(MO.getReg()));
+      break;
+    }
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_FPImmediate: {
+      // TODO: MC converts all floating point immediate operands to double.
+      // This is fine for numeric values, but may cause NaNs to change bits.
+      const ConstantFP *FPImm = MO.getFPImm();
+      Type *FPTy = FPImm->getType();
+      if (FPTy->isFloatTy())
+        MCOp = MCOperand::createFPImm(FPImm->getValueAPF().convertToFloat());
+      else if (FPTy->isDoubleTy())
+        MCOp = MCOperand::createFPImm(FPImm->getValueAPF().convertToDouble());
+      else
+        llvm_unreachable("unknown floating point immediate type");
+      break;
+    }
+    case MachineOperand::MO_GlobalAddress: {
+      assert(MO.getTargetFlags() == 0 &&
+             "WebAssembly does not use target flags on GlobalAddresses");
+      const bool IsFunction = MO.getGlobal()->getValueType()->isFunctionTy();
+      MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(),
+                                IsFunction);
+      break;
+    }
+    case MachineOperand::MO_ExternalSymbol:
+      // The target flag indicates whether this is a symbol for a
+      // variable or a function.
+      assert((MO.getTargetFlags() & -2) == 0 &&
+             "WebAssembly uses only one target flag bit on ExternalSymbols");
+      MCOp = LowerSymbolOperand(GetExternalSymbolSymbol(MO), /*Offset=*/0,
+                                MO.getTargetFlags() & 1);
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MI->dump();
+      llvm_unreachable("MachineBasicBlock operand should have been rewritten");
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
new file mode 100644
index 000000000000..ab4ba1c28d53
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -0,0 +1,46 @@
+//===-- WebAssemblyMCInstLower.h - Lower MachineInstr to MCInst -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the class to lower WebAssembly MachineInstrs to
+/// their corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCSymbol;
+class MachineInstr;
+class MachineOperand;
+
+/// This class is used to lower a MachineInstr into an MCInst.
+///
+/// It holds references to the MCContext used to create expressions and to
+/// the AsmPrinter used to look up symbols; both must outlive this object.
+class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
+ MCContext &Ctx;
+ AsmPrinter &Printer;
+
+ /// Symbol for the global referenced by a global-address operand.
+ MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ /// Symbol for the name carried by an external-symbol operand.
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+ /// Expression operand for \p Sym plus \p Offset; \p IsFunc selects the
+ /// function variant kind for the symbol reference.
+ MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset,
+ bool IsFunc) const;
+
+public:
+ WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer)
+ : Ctx(ctx), Printer(printer) {}
+ /// Fill \p OutMI with the opcode and lowered operands of \p MI.
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..ccf6a18b32ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -0,0 +1,62 @@
+//=- WebAssemblyMachineFunctionInfo.cpp - WebAssembly Machine Function Info -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements WebAssembly-specific per-machine-function
+/// information.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblyISelLowering.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/Analysis.h"
+using namespace llvm;
+
+// Out-of-line so the destructor is emitted in this translation unit.
+WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default;
+
+// Size the vreg -> WebAssembly register table to the current number of
+// virtual registers, marking every entry as unused. Must only be called
+// while the table is still empty.
+void WebAssemblyFunctionInfo::initWARegs() {
+  assert(WARegs.empty());
+  const unsigned NumVRegs = MF.getRegInfo().getNumVirtRegs();
+  // Go through a local copy rather than passing UnusedReg itself; presumably
+  // this keeps the in-class-initialized static member from being bound to
+  // resize()'s reference parameter -- confirm before simplifying.
+  unsigned Unused = UnusedReg;
+  WARegs.resize(NumVRegs, Unused);
+}
+
+// Break Ty into its value types and, for each of them, append one register
+// type entry to ValueVTs per register the target lowering needs for it.
+void llvm::ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
+                                Type *Ty, SmallVectorImpl<MVT> &ValueVTs) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  const WebAssemblyTargetLowering &TLI =
+      *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering();
+
+  SmallVector<EVT, 4> SplitVTs;
+  ComputeValueVTs(TLI, DL, Ty, SplitVTs);
+
+  for (EVT VT : SplitVTs) {
+    const unsigned RegCount = TLI.getNumRegisters(F.getContext(), VT);
+    const MVT RegVT = TLI.getRegisterType(F.getContext(), VT);
+    ValueVTs.append(RegCount, RegVT);
+  }
+}
+
+// Compute the legalized result and parameter types of F. Results receives
+// the types of the return value and Params those of the arguments, in order.
+void llvm::ComputeSignatureVTs(const Function &F, const TargetMachine &TM,
+                               SmallVectorImpl<MVT> &Params,
+                               SmallVectorImpl<MVT> &Results) {
+  ComputeLegalValueVTs(F, TM, F.getReturnType(), Results);
+
+  // WebAssembly currently can't lower returns of multiple values without
+  // demoting to sret (see WebAssemblyTargetLowering::CanLowerReturn). So
+  // replace multiple return values with a pointer parameter.
+  const bool MultipleResults = Results.size() > 1;
+  if (MultipleResults) {
+    Results.clear();
+    const unsigned PtrBits = TM.createDataLayout().getPointerSizeInBits();
+    Params.push_back(MVT::getIntegerVT(PtrBits));
+  }
+
+  for (const Argument &Arg : F.args())
+    ComputeLegalValueVTs(F, TM, Arg.getType(), Params);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
new file mode 100644
index 000000000000..756619bebbed
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -0,0 +1,119 @@
+// WebAssemblyMachineFunctionInfo.h-WebAssembly machine function info-*- C++ -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares WebAssembly-specific per-machine-function
+/// information.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+/// Per-MachineFunction state private to the WebAssembly backend, derived
+/// from MachineFunctionInfo: the function's parameter, result and local
+/// types, the vreg -> WebAssembly register mapping, the set of stackified
+/// vregs, and the vararg-buffer / base-pointer vregs.
+class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
+  MachineFunction &MF;
+
+  std::vector<MVT> Params;
+  std::vector<MVT> Results;
+  std::vector<MVT> Locals;
+
+  /// A mapping from CodeGen vreg index to WebAssembly register number.
+  std::vector<unsigned> WARegs;
+
+  /// A mapping from CodeGen vreg index to a boolean value indicating whether
+  /// the given register is considered to be "stackified", meaning it has been
+  /// determined or made to meet the stack requirements:
+  ///   - single use (per path)
+  ///   - single def (per path)
+  ///   - defined and used in LIFO order with other stack registers
+  BitVector VRegStackified;
+
+  // A virtual register holding the pointer to the vararg buffer for vararg
+  // functions. It is created and set in TLI::LowerFormalArguments and read by
+  // TLI::LowerVASTART
+  unsigned VarargVreg = -1U;
+
+  // A virtual register holding the base pointer for functions that have
+  // overaligned values on the user stack.
+  unsigned BasePtrVreg = -1U;
+
+public:
+  explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
+  ~WebAssemblyFunctionInfo() override;
+
+  // Parameter, result and local type lists: append-only, read by reference.
+  void addParam(MVT VT) { Params.push_back(VT); }
+  const std::vector<MVT> &getParams() const { return Params; }
+
+  void addResult(MVT VT) { Results.push_back(VT); }
+  const std::vector<MVT> &getResults() const { return Results; }
+
+  void addLocal(MVT VT) { Locals.push_back(VT); }
+  const std::vector<MVT> &getLocals() const { return Locals; }
+
+  // Vararg buffer vreg; reading it before it has been set asserts.
+  unsigned getVarargBufferVreg() const {
+    assert(VarargVreg != -1U && "Vararg vreg hasn't been set");
+    return VarargVreg;
+  }
+  void setVarargBufferVreg(unsigned Reg) { VarargVreg = Reg; }
+
+  // Base pointer vreg; reading it before it has been set asserts.
+  unsigned getBasePointerVreg() const {
+    assert(BasePtrVreg != -1U && "Base ptr vreg hasn't been set");
+    return BasePtrVreg;
+  }
+  void setBasePointerVreg(unsigned Reg) { BasePtrVreg = Reg; }
+
+  // Marker stored in WARegs for vregs without a WebAssembly register.
+  static const unsigned UnusedReg = -1u;
+
+  // Mark VReg as stackified, growing the bit vector on demand.
+  void stackifyVReg(unsigned VReg) {
+    assert(MF.getRegInfo().getUniqueVRegDef(VReg));
+    const unsigned Idx = TargetRegisterInfo::virtReg2Index(VReg);
+    if (Idx >= VRegStackified.size())
+      VRegStackified.resize(Idx + 1);
+    VRegStackified.set(Idx);
+  }
+  // Vregs beyond the end of the bit vector were never stackified.
+  bool isVRegStackified(unsigned VReg) const {
+    const unsigned Idx = TargetRegisterInfo::virtReg2Index(VReg);
+    return Idx < VRegStackified.size() && VRegStackified.test(Idx);
+  }
+
+  void initWARegs();
+  // Record WAReg as the WebAssembly register for VReg; initWARegs() must
+  // have sized the table to cover VReg.
+  void setWAReg(unsigned VReg, unsigned WAReg) {
+    assert(WAReg != UnusedReg);
+    const unsigned Idx = TargetRegisterInfo::virtReg2Index(VReg);
+    assert(Idx < WARegs.size());
+    WARegs[Idx] = WAReg;
+  }
+  // Look up the WebAssembly register recorded for Reg (UnusedReg if none).
+  unsigned getWAReg(unsigned Reg) const {
+    const unsigned Idx = TargetRegisterInfo::virtReg2Index(Reg);
+    assert(Idx < WARegs.size());
+    return WARegs[Idx];
+  }
+
+  // For a given stackified WAReg, return the id number to print with push/pop.
+  // The register must have its top bit set; the id is the remaining bits.
+  static unsigned getWARegStackId(unsigned Reg) {
+    assert(Reg & INT32_MIN);
+    return Reg & INT32_MAX;
+  }
+};
+
+/// Split \p Ty into its value types and append to \p ValueVTs, for each of
+/// them, the target's register type once per register needed to hold it.
+/// \p F and \p TM select the subtarget whose lowering information is used.
+void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
+ Type *Ty, SmallVectorImpl<MVT> &ValueVTs);
+
+/// Compute the legalized signature of \p F: \p Results receives the types of
+/// the return value and \p Params the types of the arguments. When the return
+/// value needs more than one result type, \p Results is cleared and a
+/// pointer-sized integer parameter is added ahead of the arguments instead.
+void ComputeSignatureVTs(const Function &F, const TargetMachine &TM,
+ SmallVectorImpl<MVT> &Params,
+ SmallVectorImpl<MVT> &Results);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
new file mode 100644
index 000000000000..5a3a7411ed46
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -0,0 +1,105 @@
+//===--- WebAssemblyOptimizeLiveIntervals.cpp - LiveInterval processing ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize LiveIntervals for use in a post-RA context.
+///
+/// LiveIntervals normally runs before register allocation when the code is
+/// only recently lowered out of SSA form, so it's uncommon for registers to
+/// have multiple defs, and when they do, the defs are usually closely related.
+/// Later, after coalescing, tail duplication, and other optimizations, it's
+/// more common to see registers with multiple unrelated defs. This pass
+/// updates LiveIntervalAnalysis to distribute the value numbers across separate
+/// LiveIntervals.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-live-intervals"
+
+namespace {
+/// Machine-function pass that post-processes LiveIntervals; the work itself is
+/// done in runOnMachineFunction.
+class WebAssemblyOptimizeLiveIntervals final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyOptimizeLiveIntervals() : MachineFunctionPass(ID) {}
+
+private:
+  StringRef getPassName() const override {
+    return "WebAssembly Optimize Live Intervals";
+  }
+
+  // Requires LiveIntervals; declares the CFG and the analyses listed below as
+  // preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(LiveVariablesID);
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyOptimizeLiveIntervals::ID = 0; // Definition of the static pass identifier declared in the class.
+FunctionPass *llvm::createWebAssemblyOptimizeLiveIntervals() { // Factory: returns a newly allocated pass instance.
+ return new WebAssemblyOptimizeLiveIntervals();
+}
+
+/// Split the LiveInterval of every virtual register that has non-debug
+/// references into its separate components, then erase dead IMPLICIT_DEF
+/// instructions from the entry block.
+///
+/// \returns true if any interval was split or any instruction was erased.
+bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Optimize LiveIntervals **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+  bool Changed = false;
+
+  // We don't preserve SSA form.
+  MRI.leaveSSA();
+
+  assert(MRI.tracksLiveness() &&
+         "OptimizeLiveIntervals expects liveness");
+
+  // Split multiple-VN LiveIntervals into multiple LiveIntervals.
+  SmallVector<LiveInterval*, 4> SplitLIs;
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI.reg_nodbg_empty(Reg))
+      continue;
+
+    LIS.splitSeparateComponents(LIS.getInterval(Reg), SplitLIs);
+    // SplitLIs receives the intervals that were split off; a non-empty list
+    // means the function was modified.
+    Changed |= !SplitLIs.empty();
+    SplitLIs.clear();
+  }
+
+  // In PrepareForLiveIntervals, we conservatively inserted IMPLICIT_DEF
+  // instructions to satisfy LiveIntervals' requirement that all uses be
+  // dominated by defs. Now that LiveIntervals has computed which of these
+  // defs are actually needed and which are dead, remove the dead ones.
+  for (auto MII = MF.begin()->begin(), MIE = MF.begin()->end(); MII != MIE; ) {
+    // Advance before a possible erase so the iterator stays valid.
+    MachineInstr *MI = &*MII++;
+    if (MI->isImplicitDef() && MI->getOperand(0).isDead()) {
+      LiveInterval &LI = LIS.getInterval(MI->getOperand(0).getReg());
+      LIS.removeVRegDefAt(LI, LIS.getInstructionIndex(*MI).getRegSlot());
+      LIS.RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
new file mode 100644
index 000000000000..96520aa5d28c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -0,0 +1,76 @@
+//===-- WebAssemblyOptimizeReturned.cpp - Optimize "returned" attributes --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize calls with "returned" attributes for WebAssembly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-returned"
+
+namespace {
+/// IR function pass that is also an InstVisitor; it requires and preserves the
+/// dominator tree and handles call sites in visitCallSite.
+class OptimizeReturned final : public FunctionPass,
+                               public InstVisitor<OptimizeReturned> {
+public:
+  static char ID;
+  OptimizeReturned() : FunctionPass(ID), DT(nullptr) {}
+
+  void visitCallSite(CallSite CS);
+
+private:
+  StringRef getPassName() const override {
+    return "WebAssembly Optimize Returned";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  // Dominator tree of the function being processed; assigned in
+  // runOnFunction before any instruction is visited.
+  DominatorTree *DT;
+};
+} // End anonymous namespace
+
+char OptimizeReturned::ID = 0; // Definition of the static pass identifier declared in the class.
+FunctionPass *llvm::createWebAssemblyOptimizeReturned() { // Factory: returns a newly allocated pass instance.
+ return new OptimizeReturned();
+}
+
+/// For every argument of \p CS whose parameter carries the "returned"
+/// attribute, redirect those uses of the argument that the call dominates to
+/// the call instruction itself. Constant arguments are left alone.
+void OptimizeReturned::visitCallSite(CallSite CS) {
+  Instruction *Call = CS.getInstruction();
+  const unsigned NumArgs = CS.getNumArgOperands();
+  for (unsigned ArgNo = 0; ArgNo != NumArgs; ++ArgNo) {
+    // paramHasAttr is queried with a 1-based index here.
+    if (!CS.paramHasAttr(ArgNo + 1, Attribute::Returned))
+      continue;
+
+    Value *Arg = CS.getArgOperand(ArgNo);
+    // Ignore constants, globals, undef, etc.
+    if (isa<Constant>(Arg))
+      continue;
+
+    // Like replaceDominatedUsesWith but using Instruction/Use dominance.
+    // The iterator is advanced before U.set() so that rewriting U does not
+    // disturb the walk.
+    auto UI = Arg->use_begin();
+    const auto UE = Arg->use_end();
+    while (UI != UE) {
+      Use &U = *UI++;
+      if (DT->dominates(Call, U))
+        U.set(Call);
+    }
+  }
+}
+
+bool OptimizeReturned::runOnFunction(Function &F) { // Caches the dominator tree in DT, then runs the InstVisitor over F.
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ visit(F);
+ return true; // Always reports a change, whether or not any use was rewritten.
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
new file mode 100644
index 000000000000..32dde88c2234
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -0,0 +1,198 @@
+//===-- WebAssemblyPeephole.cpp - WebAssembly Peephole Optimizations ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Late peephole optimizations for WebAssembly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-peephole"
+
+static cl::opt<bool> DisableWebAssemblyFallthroughReturnOpt( // When set, returns are not rewritten to FALLTHROUGH_RETURN_* opcodes.
+ "disable-wasm-fallthrough-return-opt", cl::Hidden,
+ cl::desc("WebAssembly: Disable fallthrough-return optimizations."),
+ cl::init(false)); // Off by default.
+
+namespace {
+/// Late machine-function peephole pass; requires TargetLibraryInfo and
+/// declares the CFG as preserved.
+class WebAssemblyPeephole final : public MachineFunctionPass {
+public:
+  static char ID;
+  WebAssemblyPeephole() : MachineFunctionPass(ID) {}
+
+private:
+  StringRef getPassName() const override {
+    return "WebAssembly late peephole optimizer";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyPeephole::ID = 0; // Definition of the static pass identifier declared in the class.
+FunctionPass *llvm::createWebAssemblyPeephole() { // Factory: returns a newly allocated pass instance.
+ return new WebAssemblyPeephole();
+}
+
+/// If desirable, rewrite NewReg to a drop register.
+///
+/// When \p OldReg and \p NewReg are the same register, \p MO is redirected to
+/// a freshly created virtual register of OldReg's class, marked dead, and that
+/// register is stackified.
+///
+/// \returns true if \p MO was rewritten.
+static bool MaybeRewriteToDrop(unsigned OldReg, unsigned NewReg,
+                               MachineOperand &MO, WebAssemblyFunctionInfo &MFI,
+                               MachineRegisterInfo &MRI) {
+  if (OldReg != NewReg)
+    return false;
+
+  const unsigned DropReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+  MO.setReg(DropReg);
+  MO.setIsDead();
+  MFI.stackifyVReg(DropReg);
+  return true;
+}
+
+/// Rewrite the return \p MI to \p FallthroughOpc, provided the optimization is
+/// not disabled and \p MI is the last instruction of the last block of \p MF.
+/// If the returned operand is not stackified, a \p CopyLocalOpc instruction
+/// copying it into a new, stackified register is built at \p MI first.
+///
+/// \returns true if \p MI was rewritten.
+static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
+                                      const MachineFunction &MF,
+                                      WebAssemblyFunctionInfo &MFI,
+                                      MachineRegisterInfo &MRI,
+                                      const WebAssemblyInstrInfo &TII,
+                                      unsigned FallthroughOpc,
+                                      unsigned CopyLocalOpc) {
+  const bool IsFinalInstr = &MBB == &MF.back() && &MI == &MBB.back();
+  if (DisableWebAssemblyFallthroughReturnOpt || !IsFinalInstr)
+    return false;
+
+  // If the operand isn't stackified, insert a COPY to read the operand and
+  // stackify it.
+  MachineOperand &RetOp = MI.getOperand(0);
+  const unsigned SrcReg = RetOp.getReg();
+  if (!MFI.isVRegStackified(SrcReg)) {
+    const unsigned CopyReg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), CopyReg)
+        .addReg(SrcReg);
+    RetOp.setReg(CopyReg);
+    MFI.stackifyVReg(CopyReg);
+  }
+
+  // Rewrite the return.
+  MI.setDesc(TII.get(FallthroughOpc));
+  return true;
+}
+
+/// Run the late peephole rewrites over \p MF:
+///  - for a CALL_I32/CALL_I64 whose callee symbol is the memcpy, memmove or
+///    memset libcall and is known to TargetLibraryInfo, the result operand is
+///    redirected to a dead, stackified register when it names the same
+///    register as operand 2 (see MaybeRewriteToDrop);
+///  - a return that is the last instruction of the function is rewritten to
+///    the matching FALLTHROUGH_RETURN_* opcode.
+///
+/// \returns true if any instruction was modified.
+bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Peephole **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  const WebAssemblyTargetLowering &TLI =
+      *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
+  auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  bool Changed = false;
+
+  for (auto &MBB : MF)
+    for (auto &MI : MBB)
+      switch (MI.getOpcode()) {
+      default:
+        break;
+      case WebAssembly::CALL_I32:
+      case WebAssembly::CALL_I64: {
+        MachineOperand &Op1 = MI.getOperand(1);
+        if (Op1.isSymbol()) {
+          StringRef Name(Op1.getSymbolName());
+          if (Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
+              Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
+              Name == TLI.getLibcallName(RTLIB::MEMSET)) {
+            LibFunc::Func Func;
+            if (LibInfo.getLibFunc(Name, Func)) {
+              const auto &Op2 = MI.getOperand(2);
+              if (!Op2.isReg())
+                report_fatal_error("Peephole: call to builtin function with "
+                                   "wrong signature, not consuming reg");
+              MachineOperand &MO = MI.getOperand(0);
+              unsigned OldReg = MO.getReg();
+              unsigned NewReg = Op2.getReg();
+
+              if (MRI.getRegClass(NewReg) != MRI.getRegClass(OldReg))
+                report_fatal_error("Peephole: call to builtin function with "
+                                   "wrong signature, from/to mismatch");
+              Changed |= MaybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI);
+            }
+          }
+        }
+        break;
+      }
+      // Optimize away an explicit return at the end of the function.
+      case WebAssembly::RETURN_I32:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I32,
+            WebAssembly::COPY_I32);
+        break;
+      case WebAssembly::RETURN_I64:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I64,
+            WebAssembly::COPY_I64);
+        break;
+      case WebAssembly::RETURN_F32:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F32,
+            WebAssembly::COPY_F32);
+        break;
+      case WebAssembly::RETURN_F64:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64,
+            WebAssembly::COPY_F64);
+        break;
+      case WebAssembly::RETURN_v16i8:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v16i8,
+            WebAssembly::COPY_V128);
+        break;
+      case WebAssembly::RETURN_v8i16:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v8i16,
+            WebAssembly::COPY_V128);
+        break;
+      case WebAssembly::RETURN_v4i32:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4i32,
+            WebAssembly::COPY_V128);
+        break;
+      case WebAssembly::RETURN_v4f32:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4f32,
+            WebAssembly::COPY_V128);
+        break;
+      case WebAssembly::RETURN_VOID:
+        // A void return has no operand to stackify; only the opcode changes.
+        if (!DisableWebAssemblyFallthroughReturnOpt &&
+            &MBB == &MF.back() && &MI == &MBB.back()) {
+          MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN_VOID));
+          // The instruction was rewritten, so report the modification.
+          Changed = true;
+        }
+        break;
+      }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
new file mode 100644
index 000000000000..473dcb7a33fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -0,0 +1,124 @@
+//===- WebAssemblyPrepareForLiveIntervals.cpp - Prepare for LiveIntervals -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Fix up code to meet LiveInterval's requirements.
+///
+/// Some CodeGen passes don't preserve LiveInterval's requirements, because
+/// they run after register allocation and it isn't important. However,
+/// WebAssembly runs LiveIntervals in a late pass. This pass transforms code
+/// to meet LiveIntervals' requirements; primarily, it ensures that all
+/// virtual register uses have definitions (IMPLICIT_DEF definitions if
+/// nothing else).
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-prepare-for-live-intervals"
+
+namespace {
+class WebAssemblyPrepareForLiveIntervals final : public MachineFunctionPass { // Pass that fixes up code before LiveIntervals runs (see the file header).
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyPrepareForLiveIntervals() : MachineFunctionPass(ID) {}
+
+private:
+ StringRef getPassName() const override {
+ return "WebAssembly Prepare For LiveIntervals";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override { // Marks the CFG as preserved, then defers to the base class.
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyPrepareForLiveIntervals::ID = 0; // Definition of the static pass identifier declared in the class.
+FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() { // Factory: returns a newly allocated pass instance.
+ return new WebAssemblyPrepareForLiveIntervals();
+}
+
+// Test whether the given register has an ARGUMENT def, i.e. whether any
+// instruction defining Reg satisfies WebAssembly::isArgument.
+static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+  bool FoundArgument = false;
+  for (const auto &DefMI : MRI.def_instructions(Reg)) {
+    if (WebAssembly::isArgument(DefMI)) {
+      FoundArgument = true;
+      break;
+    }
+  }
+  return FoundArgument;
+}
+
+/// Prepare \p MF for LiveIntervals: give every virtual register that has
+/// non-debug uses and no ARGUMENT def an IMPLICIT_DEF at the top of the entry
+/// block, move the ARGUMENT instructions of the entry block to its top, and set
+/// the TracksLiveness property.
+///
+/// \returns true if any instruction was inserted or moved.
+bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Prepare For LiveIntervals **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  bool Changed = false;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  MachineBasicBlock &Entry = *MF.begin();
+
+  assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+         "LiveIntervals shouldn't be active yet!");
+
+  // We don't preserve SSA form.
+  MRI.leaveSSA();
+
+  // BranchFolding and perhaps other passes don't preserve IMPLICIT_DEF
+  // instructions. LiveIntervals requires that all paths to virtual register
+  // uses provide a definition. Insert IMPLICIT_DEFs in the entry block to
+  // conservatively satisfy this.
+  //
+  // TODO: This is fairly heavy-handed; find a better approach.
+  //
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+    // Skip unused registers.
+    if (MRI.use_nodbg_empty(Reg))
+      continue;
+
+    // Skip registers that have an ARGUMENT definition.
+    if (HasArgumentDef(Reg, MRI))
+      continue;
+
+    BuildMI(Entry, Entry.begin(), DebugLoc(),
+            TII.get(WebAssembly::IMPLICIT_DEF), Reg);
+    Changed = true;
+  }
+
+  // Move ARGUMENT_* instructions to the top of the entry block, so that their
+  // liveness reflects the fact that these really are live-in values.
+  for (auto MII = Entry.begin(), MIE = Entry.end(); MII != MIE; ) {
+    // Advance before MI is unlinked so the iterator stays valid.
+    MachineInstr &MI = *MII++;
+    if (WebAssembly::isArgument(MI)) {
+      MI.removeFromParent();
+      Entry.insert(Entry.begin(), &MI);
+      // Conservatively report the move, even if MI was already first.
+      Changed = true;
+    }
+  }
+
+  // Ok, we're now ready to run LiveIntervalAnalysis again.
+  MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
new file mode 100644
index 000000000000..5fd4a8d1949e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -0,0 +1,175 @@
+//===-- WebAssemblyRegColoring.cpp - Register coloring --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a virtual register coloring pass.
+///
+/// WebAssembly doesn't have a fixed number of registers, but it is still
+/// desirable to minimize the total number of registers used in each function.
+///
+/// This code is modeled after lib/CodeGen/StackSlotColoring.cpp.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-coloring"
+
+namespace {
+class WebAssemblyRegColoring final : public MachineFunctionPass { // Virtual register coloring pass (see the file header).
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyRegColoring() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "WebAssembly Register Coloring";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override { // Requires LiveIntervals and MachineBlockFrequencyInfo.
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+};
+} // end anonymous namespace
+
+char WebAssemblyRegColoring::ID = 0; // Definition of the static pass identifier declared in the class.
+FunctionPass *llvm::createWebAssemblyRegColoring() { // Factory: returns a newly allocated pass instance.
+ return new WebAssemblyRegColoring();
+}
+
+// Compute the total spill weight for VReg: the sum of
+// LiveIntervals::getSpillWeight over each of its non-debug operands.
+static float computeWeight(const MachineRegisterInfo *MRI,
+                           const MachineBlockFrequencyInfo *MBFI,
+                           unsigned VReg) {
+  float Total = 0.0f;
+  for (MachineOperand &Op : MRI->reg_nodbg_operands(VReg)) {
+    const bool IsDef = Op.isDef();
+    const bool IsUse = Op.isUse();
+    Total += LiveIntervals::getSpillWeight(IsDef, IsUse, MBFI, *Op.getParent());
+  }
+  return Total;
+}
+
+bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { // Merges same-class virtual registers whose live intervals do not overlap; returns true if any register was replaced.
+ DEBUG({
+ dbgs() << "********** Register Coloring **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ // If there are calls to setjmp or sigsetjmp, don't perform coloring. Virtual
+ // registers could be modified before the longjmp is executed, resulting in
+ // the wrong value being used afterwards. (See <rdar://problem/8007500>.)
+ // TODO: Does WebAssembly need to care about setjmp for register coloring?
+ if (MF.exposesReturnsTwice())
+ return false;
+
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ LiveIntervals *Liveness = &getAnalysis<LiveIntervals>();
+ const MachineBlockFrequencyInfo *MBFI =
+ &getAnalysis<MachineBlockFrequencyInfo>();
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+
+ // Gather all register intervals into a list and sort them.
+ unsigned NumVRegs = MRI->getNumVirtRegs();
+ SmallVector<LiveInterval *, 0> SortedIntervals;
+ SortedIntervals.reserve(NumVRegs);
+
+ DEBUG(dbgs() << "Interesting register intervals:\n");
+ for (unsigned i = 0; i < NumVRegs; ++i) {
+ unsigned VReg = TargetRegisterInfo::index2VirtReg(i);
+ if (MFI.isVRegStackified(VReg)) // Stackified registers are not colored.
+ continue;
+ // Skip unused registers, which can use $drop.
+ if (MRI->use_empty(VReg))
+ continue;
+
+ LiveInterval *LI = &Liveness->getInterval(VReg);
+ assert(LI->weight == 0.0f); // The weight is expected to be unset before it is computed here.
+ LI->weight = computeWeight(MRI, MBFI, VReg);
+ DEBUG(LI->dump());
+ SortedIntervals.push_back(LI);
+ }
+ DEBUG(dbgs() << '\n');
+
+ // Sort them to put arguments first (since we don't want to rename live-in
+ // registers), by weight next, and then by position.
+ // TODO: Investigate more intelligent sorting heuristics. For starters, we
+ // should try to coalesce adjacent live intervals before non-adjacent ones.
+ std::sort(SortedIntervals.begin(), SortedIntervals.end(),
+ [MRI](LiveInterval *LHS, LiveInterval *RHS) {
+ if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg))
+ return MRI->isLiveIn(LHS->reg);
+ if (LHS->weight != RHS->weight)
+ return LHS->weight > RHS->weight; // Heavier intervals sort first.
+ if (LHS->empty() || RHS->empty())
+ return !LHS->empty() && RHS->empty(); // Non-empty intervals sort before empty ones.
+ return *LHS < *RHS;
+ });
+
+ DEBUG(dbgs() << "Coloring register intervals:\n");
+ SmallVector<unsigned, 16> SlotMapping(SortedIntervals.size(), -1u); // SlotMapping[i]: register chosen for SortedIntervals[i].
+ SmallVector<SmallVector<LiveInterval *, 4>, 16> Assignments( // Assignments[C]: intervals already given color C.
+ SortedIntervals.size());
+ BitVector UsedColors(SortedIntervals.size());
+ bool Changed = false;
+ for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) {
+ LiveInterval *LI = SortedIntervals[i];
+ unsigned Old = LI->reg;
+ size_t Color = i; // Default: the interval keeps its own register.
+ const TargetRegisterClass *RC = MRI->getRegClass(Old);
+
+ // Check if it's possible to reuse any of the used colors.
+ if (!MRI->isLiveIn(Old)) // Live-in registers always keep their own color.
+ for (int C(UsedColors.find_first()); C != -1;
+ C = UsedColors.find_next(C)) {
+ if (MRI->getRegClass(SortedIntervals[C]->reg) != RC)
+ continue;
+ for (LiveInterval *OtherLI : Assignments[C])
+ if (!OtherLI->empty() && OtherLI->overlaps(*LI))
+ goto continue_outer; // Color C conflicts with LI; try the next used color.
+ Color = C;
+ break;
+ continue_outer:;
+ }
+
+ unsigned New = SortedIntervals[Color]->reg;
+ SlotMapping[i] = New;
+ Changed |= Old != New;
+ UsedColors.set(Color);
+ Assignments[Color].push_back(LI);
+ DEBUG(dbgs() << "Assigning vreg"
+ << TargetRegisterInfo::virtReg2Index(LI->reg) << " to vreg"
+ << TargetRegisterInfo::virtReg2Index(New) << "\n");
+ }
+ if (!Changed) // Every interval kept its own register; nothing to rewrite.
+ return false;
+
+ // Rewrite register operands.
+ for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) {
+ unsigned Old = SortedIntervals[i]->reg;
+ unsigned New = SlotMapping[i];
+ if (Old != New)
+ MRI->replaceRegWith(Old, New);
+ }
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
new file mode 100644
index 000000000000..e3470825940c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -0,0 +1,107 @@
+//===-- WebAssemblyRegNumbering.cpp - Register Numbering ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass which assigns WebAssembly register
+/// numbers for CodeGen virtual registers.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-numbering"
+
+namespace {
+/// Machine-function pass that assigns WebAssembly register numbers; it
+/// declares the CFG as preserved.
+class WebAssemblyRegNumbering final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyRegNumbering() : MachineFunctionPass(ID) {}
+
+private:
+  StringRef getPassName() const override {
+    return "WebAssembly Register Numbering";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyRegNumbering::ID = 0; // Definition of the static pass identifier declared in the class.
+FunctionPass *llvm::createWebAssemblyRegNumbering() { // Factory: returns a newly allocated pass instance.
+ return new WebAssemblyRegNumbering();
+}
+
+/// Assign a WebAssembly register number to the virtual registers of \p MF:
+///  - the leading ARGUMENT instructions of the entry block get the number held
+///    in their immediate operand;
+///  - used, stackified registers get INT32_MIN or'ed with a running counter;
+///  - the remaining used registers that still map to UnusedReg get
+///    consecutive numbers starting at the number of parameters.
+///
+/// Always returns true.
+bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Register Numbering **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  MFI.initWARegs();
+
+  // WebAssembly argument registers are in the same index space as local
+  // variables. Assign the numbers for them first.
+  for (MachineInstr &MI : MF.front()) {
+    // Only the run of ARGUMENT instructions at the top of the block is read.
+    if (!WebAssembly::isArgument(MI))
+      break;
+
+    const unsigned ArgVReg = MI.getOperand(0).getReg();
+    int64_t Imm = MI.getOperand(1).getImm();
+    DEBUG(dbgs() << "Arg VReg " << ArgVReg << " -> WAReg " << Imm << "\n");
+    MFI.setWAReg(ArgVReg, Imm);
+  }
+
+  // Then assign regular WebAssembly registers for all remaining used
+  // virtual registers. TODO: Consider sorting the registers by frequency of
+  // use, to maximize usage of small immediate fields.
+  const unsigned NumVRegs = MRI.getNumVirtRegs();
+  unsigned NumStackRegs = 0;
+  // Start the numbering for locals after the arg regs
+  unsigned CurReg = MFI.getParams().size();
+  for (unsigned VRegIdx = 0; VRegIdx != NumVRegs; ++VRegIdx) {
+    const unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx);
+    // Skip unused registers.
+    if (MRI.use_empty(VReg))
+      continue;
+
+    // Handle stackified registers.
+    if (MFI.isVRegStackified(VReg)) {
+      const unsigned StackReg = INT32_MIN | NumStackRegs;
+      DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << StackReg << "\n");
+      MFI.setWAReg(VReg, StackReg);
+      ++NumStackRegs;
+      continue;
+    }
+
+    // Registers numbered in the ARGUMENT loop above are left untouched.
+    if (MFI.getWAReg(VReg) != WebAssemblyFunctionInfo::UnusedReg)
+      continue;
+    DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << CurReg << "\n");
+    MFI.setWAReg(VReg, CurReg);
+    ++CurReg;
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
new file mode 100644
index 000000000000..32ee09e45796
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -0,0 +1,884 @@
+//===-- WebAssemblyRegStackify.cpp - Register Stackification --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a register stacking pass.
+///
+/// This pass reorders instructions to put register uses and defs in an order
+/// such that they form single-use expression trees. Registers fitting this form
+/// are then marked as "stackified", meaning references to them are replaced by
+/// "push" and "pop" from the value stack.
+///
+/// This is primarily a code size optimization, since temporary values on the
+/// value stack don't need to be named.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-stackify"
+
+namespace {
+/// Machine-function pass that performs register stackification. The actual
+/// transformation lives in runOnMachineFunction.
+class WebAssemblyRegStackify final : public MachineFunctionPass {
+ // Human-readable name of the pass.
+ StringRef getPassName() const override {
+ return "WebAssembly Register Stackify";
+ }
+
+ // Declare analysis dependencies: alias analysis, the machine dominator tree
+ // and live intervals are required; the CFG and the analyses listed below
+ // are reported as preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(LiveVariablesID);
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyRegStackify() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+// Storage for the pass identification member declared in the class.
+char WebAssemblyRegStackify::ID = 0;
+// Factory: returns a newly allocated register stackification pass.
+// NOTE(review): the raw pointer is presumably owned by whoever adds it to a
+// pass manager -- confirm at the call site.
+FunctionPass *llvm::createWebAssemblyRegStackify() {
+ return new WebAssemblyRegStackify();
+}
+
+// Give MI an implicit def and an implicit use of the opaque VALUE_STACK
+// register (each only if not already present), so that the ordering
+// constraints of an instruction living on the expression stack are enforced.
+static void ImposeStackOrdering(MachineInstr *MI) {
+  const unsigned StackReg = WebAssembly::VALUE_STACK;
+  auto AddImplicitStackOperand = [&](bool IsDef) {
+    MI->addOperand(
+        MachineOperand::CreateReg(StackReg, IsDef, /*isImp=*/true));
+  };
+
+  // Write side first.
+  const bool AlreadyWrites = MI->definesRegister(StackReg);
+  if (!AlreadyWrites)
+    AddImplicitStackOperand(/*IsDef=*/true);
+
+  // Then the read side.
+  const bool AlreadyReads = MI->readsRegister(StackReg);
+  if (!AlreadyReads)
+    AddImplicitStackOperand(/*IsDef=*/false);
+}
+
+// Rewrite an IMPLICIT_DEF in place as a constant-zero definition. The CONST_*
+// opcode and the kind of zero operand are chosen from the register class of
+// the defined register (I32, I64, F32 or F64); any other class is
+// unreachable.
+static void ConvertImplicitDefToConstZero(MachineInstr *MI,
+                                          MachineRegisterInfo &MRI,
+                                          const TargetInstrInfo *TII,
+                                          MachineFunction &MF) {
+  assert(MI->getOpcode() == TargetOpcode::IMPLICIT_DEF);
+
+  // Integer flavours carry a plain immediate 0.
+  auto BecomeIntZero = [&](unsigned ConstOpcode) {
+    MI->setDesc(TII->get(ConstOpcode));
+    MI->addOperand(MachineOperand::CreateImm(0));
+  };
+  // Floating-point flavours carry the null constant of the matching IR type.
+  auto BecomeFPZero = [&](unsigned ConstOpcode, Type *FPTy) {
+    MI->setDesc(TII->get(ConstOpcode));
+    ConstantFP *Zero = cast<ConstantFP>(Constant::getNullValue(FPTy));
+    MI->addOperand(MachineOperand::CreateFPImm(Zero));
+  };
+
+  const TargetRegisterClass *RC = MRI.getRegClass(MI->getOperand(0).getReg());
+  if (RC == &WebAssembly::I32RegClass)
+    BecomeIntZero(WebAssembly::CONST_I32);
+  else if (RC == &WebAssembly::I64RegClass)
+    BecomeIntZero(WebAssembly::CONST_I64);
+  else if (RC == &WebAssembly::F32RegClass)
+    BecomeFPZero(WebAssembly::CONST_F32,
+                 Type::getFloatTy(MF.getFunction()->getContext()));
+  else if (RC == &WebAssembly::F64RegClass)
+    BecomeFPZero(WebAssembly::CONST_F64,
+                 Type::getDoubleTy(MF.getFunction()->getContext()));
+  else
+    llvm_unreachable("Unexpected reg class");
+}
+
+// Classify a call to the callee referenced by MI->getOperand(CalleeOpNo):
+// sets Read, Write and/or Effects when the callee may read memory, write
+// memory or have side effects, and always sets StackPointer. Flags are only
+// ever raised, never cleared.
+static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
+                        bool &Write, bool &Effects, bool &StackPointer) {
+  // Any call may use the stack pointer.
+  StackPointer = true;
+
+  // Try to resolve the callee to a Function, looking through aliases that
+  // cannot be interposed.
+  const Function *Callee = nullptr;
+  const MachineOperand &CalleeOp = MI.getOperand(CalleeOpNo);
+  if (CalleeOp.isGlobal()) {
+    const Constant *Target = CalleeOp.getGlobal();
+    const GlobalAlias *Alias = dyn_cast<GlobalAlias>(Target);
+    if (Alias && !Alias->isInterposable())
+      Target = Alias->getAliasee();
+    Callee = dyn_cast<Function>(Target);
+  }
+
+  if (Callee) {
+    if (!Callee->doesNotThrow())
+      Effects = true;
+    if (Callee->doesNotAccessMemory())
+      return;
+    if (Callee->onlyReadsMemory()) {
+      Read = true;
+      return;
+    }
+  }
+
+  // Nothing better is known: assume the worst.
+  Read = true;
+  Write = true;
+  Effects = true;
+}
+
+// Test whether Opcode is one of the integer division/remainder or
+// float-to-integer truncation instructions. These instructions have
+// hasUnmodeledSideEffects() returning true because they trap on overflow and
+// invalid so they can't be arbitrarily moved; hasOrderedMemoryRef() in turn
+// interprets this plus their lack of memoperands as a potential unknown
+// memory reference.
+static bool IsTrappingArithmeticOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case WebAssembly::DIV_S_I32: case WebAssembly::DIV_S_I64:
+  case WebAssembly::REM_S_I32: case WebAssembly::REM_S_I64:
+  case WebAssembly::DIV_U_I32: case WebAssembly::DIV_U_I64:
+  case WebAssembly::REM_U_I32: case WebAssembly::REM_U_I64:
+  case WebAssembly::I32_TRUNC_S_F32: case WebAssembly::I64_TRUNC_S_F32:
+  case WebAssembly::I32_TRUNC_S_F64: case WebAssembly::I64_TRUNC_S_F64:
+  case WebAssembly::I32_TRUNC_U_F32: case WebAssembly::I64_TRUNC_U_F32:
+  case WebAssembly::I32_TRUNC_U_F64: case WebAssembly::I64_TRUNC_U_F64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Determine whether MI reads memory, writes memory, has side effects,
+// and/or uses the __stack_pointer value. The output flags are only ever
+// raised, so callers must initialize them to false.
+static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
+                  bool &Write, bool &Effects, bool &StackPointer) {
+  assert(!MI.isPosition());
+  assert(!MI.isTerminator());
+
+  if (MI.isDebugValue())
+    return;
+
+  const bool IsTrappingArith = IsTrappingArithmeticOpcode(MI.getOpcode());
+
+  // Loads, except dereferenceable invariant ones.
+  if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad(&AA))
+    Read = true;
+
+  if (MI.mayStore()) {
+    Write = true;
+
+    // A store through the "__stack_pointer" external symbol also counts as a
+    // stack pointer use.
+    for (const MachineMemOperand *MemOp : MI.memoperands()) {
+      const MachinePointerInfo &PtrInfo = MemOp->getPointerInfo();
+      if (!PtrInfo.V.is<const PseudoSourceValue *>())
+        continue;
+      const PseudoSourceValue *Pseudo =
+          PtrInfo.V.get<const PseudoSourceValue *>();
+      const ExternalSymbolPseudoSourceValue *ExtSym =
+          dyn_cast<ExternalSymbolPseudoSourceValue>(Pseudo);
+      if (ExtSym && StringRef(ExtSym->getSymbol()) == "__stack_pointer")
+        StackPointer = true;
+    }
+  } else if (MI.hasOrderedMemoryRef() && !IsTrappingArith && !MI.isCall()) {
+    // Record volatile accesses. Calls are handled specially below, and the
+    // trapping arithmetic opcodes only look like ordered memory references.
+    Write = true;
+    Effects = true;
+  }
+
+  // Side effects. For the trapping arithmetic opcodes it is safe to move them
+  // in the specific case of register stackifying, because overflow and
+  // invalid are Undefined Behavior.
+  if (MI.hasUnmodeledSideEffects() && !IsTrappingArith)
+    Effects = true;
+
+  // Calls: pick the operand index that holds the callee, then analyze it.
+  if (!MI.isCall())
+    return;
+
+  unsigned CalleeOpNo;
+  switch (MI.getOpcode()) {
+  case WebAssembly::CALL_VOID:
+  case WebAssembly::CALL_INDIRECT_VOID:
+    CalleeOpNo = 0;
+    break;
+  case WebAssembly::CALL_I32: case WebAssembly::CALL_I64:
+  case WebAssembly::CALL_F32: case WebAssembly::CALL_F64:
+  case WebAssembly::CALL_INDIRECT_I32: case WebAssembly::CALL_INDIRECT_I64:
+  case WebAssembly::CALL_INDIRECT_F32: case WebAssembly::CALL_INDIRECT_F64:
+    CalleeOpNo = 1;
+    break;
+  default:
+    llvm_unreachable("unexpected call opcode");
+  }
+  QueryCallee(MI, CalleeOpNo, Read, Write, Effects, StackPointer);
+}
+
+// Decide whether Def should be cloned rather than moved: it must be as cheap
+// as a move (profitable) and trivially rematerializable (safe).
+static bool ShouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA,
+                                const WebAssemblyInstrInfo *TII) {
+  if (!Def.isAsCheapAsAMove())
+    return false;
+  return TII->isTriviallyReMaterializable(Def, &AA);
+}
+
+// Find the instruction defining Reg as seen from Insert. This generalizes
+// MachineRegisterInfo::getUniqueVRegDef by falling back to LiveIntervals when
+// the register has no unique def. Returns nullptr if no value of Reg is live
+// into Insert.
+static MachineInstr *GetVRegDef(unsigned Reg, const MachineInstr *Insert,
+                                const MachineRegisterInfo &MRI,
+                                const LiveIntervals &LIS)
+{
+  // Fast path: most registers are still in SSA form here.
+  if (MachineInstr *UniqueDef = MRI.getUniqueVRegDef(Reg))
+    return UniqueDef;
+
+  // Slow path: ask LiveIntervals which value reaches Insert.
+  const VNInfo *Incoming =
+      LIS.getInterval(Reg).getVNInfoBefore(LIS.getInstructionIndex(*Insert));
+  if (!Incoming)
+    return nullptr;
+  return LIS.getInstructionFromIndex(Incoming->def);
+}
+
+// Test whether the value of Reg defined at Def has exactly one use. This
+// generalizes MachineRegisterInfo::hasOneUse: when the register as a whole
+// has several uses, LiveIntervals is consulted so that only uses reading
+// Def's value are counted, and that single use must also kill the value.
+static bool HasOneUse(unsigned Reg, MachineInstr *Def,
+                      MachineRegisterInfo &MRI, MachineDominatorTree &MDT,
+                      LiveIntervals &LIS) {
+  // Fast path: most registers are still in SSA form here.
+  if (MRI.hasOneUse(Reg))
+    return true;
+
+  const LiveInterval &Interval = LIS.getInterval(Reg);
+  const VNInfo *DefVNI =
+      Interval.getVNInfoAt(LIS.getInstructionIndex(*Def).getRegSlot());
+  assert(DefVNI);
+
+  bool SeenUse = false;
+  for (auto &UseOp : MRI.use_nodbg_operands(Reg)) {
+    const auto &AtUse =
+        Interval.Query(LIS.getInstructionIndex(*UseOp.getParent()));
+    // Uses of other values of Reg are irrelevant.
+    if (AtUse.valueIn() != DefVNI)
+      continue;
+    // A non-killing use, or a second use of this value, disqualifies it.
+    if (!AtUse.isKill() || SeenUse)
+      return false;
+    SeenUse = true;
+  }
+  return SeenUse;
+}
+
+// Test whether it's safe to move Def to just before Insert.
+// Def and Insert must be in the same basic block (asserted). The check has
+// two parts: register dependencies of Def's operands, then a backwards scan
+// over the instructions between Def and Insert for conflicting memory
+// accesses, side effects, stack pointer uses and register redefinitions.
+// TODO: Compute memory dependencies in a way that doesn't require always
+// walking the block.
+// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
+// more precise.
+static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
+ AliasAnalysis &AA, const MachineRegisterInfo &MRI) {
+ assert(Def->getParent() == Insert->getParent());
+
+ // Check for register dependencies.
+ // MutableRegisters collects used registers that have more than one def.
+ SmallVector<unsigned, 4> MutableRegisters;
+ for (const MachineOperand &MO : Def->operands()) {
+ if (!MO.isReg() || MO.isUndef())
+ continue;
+ unsigned Reg = MO.getReg();
+
+ // If the register is dead here and at Insert, ignore it.
+ if (MO.isDead() && Insert->definesRegister(Reg) &&
+ !Insert->readsRegister(Reg))
+ continue;
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // Ignore ARGUMENTS; it's just used to keep the ARGUMENT_* instructions
+ // from moving down, and we've already checked for that.
+ if (Reg == WebAssembly::ARGUMENTS)
+ continue;
+ // If the physical register is never modified, ignore it.
+ if (!MRI.isPhysRegModified(Reg))
+ continue;
+ // Otherwise, it's a physical register with unknown liveness.
+ return false;
+ }
+
+ // If one of the operands isn't in SSA form, it has different values at
+ // different times, and we need to make sure we don't move our use across
+ // a different def.
+ if (!MO.isDef() && !MRI.hasOneDef(Reg))
+ MutableRegisters.push_back(Reg);
+ }
+
+ // Classify what Def itself does.
+ bool Read = false, Write = false, Effects = false, StackPointer = false;
+ Query(*Def, AA, Read, Write, Effects, StackPointer);
+
+ // If the instruction does not access memory and has no side effects, it has
+ // no additional dependencies.
+ bool HasMutableRegisters = !MutableRegisters.empty();
+ if (!Read && !Write && !Effects && !StackPointer && !HasMutableRegisters)
+ return true;
+
+ // Scan through the intervening instructions between Def and Insert.
+ // The walk goes backwards from the instruction before Insert and stops
+ // when it reaches Def.
+ MachineBasicBlock::const_iterator D(Def), I(Insert);
+ for (--I; I != D; --I) {
+ bool InterveningRead = false;
+ bool InterveningWrite = false;
+ bool InterveningEffects = false;
+ bool InterveningStackPointer = false;
+ Query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects,
+ InterveningStackPointer);
+ // Two side-effecting instructions may not be reordered.
+ if (Effects && InterveningEffects)
+ return false;
+ // A read may not move past a write.
+ if (Read && InterveningWrite)
+ return false;
+ // A write may not move past a read or another write.
+ if (Write && (InterveningRead || InterveningWrite))
+ return false;
+ if (StackPointer && InterveningStackPointer)
+ return false;
+
+ // Def may not move past a redefinition of a multi-def register it uses.
+ for (unsigned Reg : MutableRegisters)
+ for (const MachineOperand &MO : I->operands())
+ if (MO.isReg() && MO.isDef() && MO.getReg() == Reg)
+ return false;
+ }
+
+ return true;
+}
+
+/// Test whether OneUse, a use of Reg, dominates all of Reg's other uses.
+///
+/// Only non-debug uses that read the same value of Reg as OneUse (per
+/// LiveIntervals) are considered. A use in the same instruction as OneUse
+/// must come after OneUse in the operand list. A use in another instruction
+/// must either be dominated by OneUse's instruction, or be reachable from it
+/// by following a chain of stackified single-use defs, which means it is
+/// evaluated after OneUse in stack evaluation order.
+static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
+                                     const MachineBasicBlock &MBB,
+                                     const MachineRegisterInfo &MRI,
+                                     const MachineDominatorTree &MDT,
+                                     LiveIntervals &LIS,
+                                     WebAssemblyFunctionInfo &MFI) {
+  const LiveInterval &LI = LIS.getInterval(Reg);
+
+  // OneUseInst is loop-invariant, so it is computed once here and not
+  // redeclared inside the loop.
+  const MachineInstr *OneUseInst = OneUse.getParent();
+  VNInfo *OneUseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*OneUseInst));
+
+  for (const MachineOperand &Use : MRI.use_nodbg_operands(Reg)) {
+    if (&Use == &OneUse)
+      continue;
+
+    const MachineInstr *UseInst = Use.getParent();
+    VNInfo *UseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*UseInst));
+
+    // Uses of a different value of Reg are unrelated to OneUse.
+    if (UseVNI != OneUseVNI)
+      continue;
+
+    if (UseInst == OneUseInst) {
+      // Another use in the same instruction. We need to ensure that the one
+      // selected use happens "before" it.
+      if (&OneUse > &Use)
+        return false;
+    } else {
+      // Test that the use is dominated by the one selected use.
+      while (!MDT.dominates(OneUseInst, UseInst)) {
+        // Actually, dominating is over-conservative. Test that the use would
+        // happen after the one selected use in the stack evaluation order.
+        //
+        // This is needed as a consequence of using implicit get_locals for
+        // uses and implicit set_locals for defs.
+        if (UseInst->getDesc().getNumDefs() == 0)
+          return false;
+        const MachineOperand &MO = UseInst->getOperand(0);
+        if (!MO.isReg())
+          return false;
+        unsigned DefReg = MO.getReg();
+        if (!TargetRegisterInfo::isVirtualRegister(DefReg) ||
+            !MFI.isVRegStackified(DefReg))
+          return false;
+        // Follow the stackified def to its single user and retry from there.
+        assert(MRI.hasOneUse(DefReg));
+        const MachineOperand &NewUse = *MRI.use_begin(DefReg);
+        const MachineInstr *NewUseInst = NewUse.getParent();
+        if (NewUseInst == OneUseInst) {
+          if (&OneUse > &NewUse)
+            return false;
+          break;
+        }
+        UseInst = NewUseInst;
+      }
+    }
+  }
+  return true;
+}
+
+/// Map a register class (I32, I64, F32, F64 or V128) to its TEE_* opcode.
+/// Any other register class is unreachable.
+static unsigned GetTeeOpcode(const TargetRegisterClass *RC) {
+  const struct {
+    const TargetRegisterClass *Class;
+    unsigned TeeOpcode;
+  } Table[] = {
+      {&WebAssembly::I32RegClass, WebAssembly::TEE_I32},
+      {&WebAssembly::I64RegClass, WebAssembly::TEE_I64},
+      {&WebAssembly::F32RegClass, WebAssembly::TEE_F32},
+      {&WebAssembly::F64RegClass, WebAssembly::TEE_F64},
+      {&WebAssembly::V128RegClass, WebAssembly::TEE_V128},
+  };
+
+  for (const auto &Entry : Table)
+    if (Entry.Class == RC)
+      return Entry.TeeOpcode;
+  llvm_unreachable("Unexpected register class");
+}
+
+// Shrink LI to its uses, cleaning up LI.
+static void ShrinkToUses(LiveInterval &LI, LiveIntervals &LIS) {
+  // Nothing more to do unless shrinking reports that LI may now consist of
+  // separate components.
+  if (!LIS.shrinkToUses(&LI))
+    return;
+
+  SmallVector<LiveInterval *, 4> Components;
+  LIS.splitSeparateComponents(LI, Components);
+}
+
+/// A single-use def in the same block with no intervening memory or register
+/// dependencies; move the def down and nest it with the current instruction.
+/// Op is the use operand in the consuming instruction. Returns Def, which the
+/// caller uses as the new insertion point.
+static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand& Op,
+ MachineInstr *Def,
+ MachineBasicBlock &MBB,
+ MachineInstr *Insert, LiveIntervals &LIS,
+ WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI) {
+ DEBUG(dbgs() << "Move for single use: "; Def->dump());
+
+ // Move Def to just before Insert and let LiveIntervals know.
+ MBB.splice(Insert, &MBB, Def);
+ LIS.handleMove(*Def);
+
+ if (MRI.hasOneDef(Reg) && MRI.hasOneUse(Reg)) {
+ // No one else is using this register for anything so we can just stackify
+ // it in place.
+ MFI.stackifyVReg(Reg);
+ } else {
+ // The register may have unrelated uses or defs; create a new register for
+ // just our one def and use so that we can stackify it.
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ Def->getOperand(0).setReg(NewReg);
+ Op.setReg(NewReg);
+
+ // Tell LiveIntervals about the new register.
+ LIS.createAndComputeVirtRegInterval(NewReg);
+
+ // Tell LiveIntervals about the changes to the old register.
+ // The span from Def to the use now belongs to NewReg instead.
+ LiveInterval &LI = LIS.getInterval(Reg);
+ LI.removeSegment(LIS.getInstructionIndex(*Def).getRegSlot(),
+ LIS.getInstructionIndex(*Op.getParent()).getRegSlot(),
+ /*RemoveDeadValNo=*/true);
+
+ MFI.stackifyVReg(NewReg);
+
+ DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ }
+
+ ImposeStackOrdering(Def);
+ return Def;
+}
+
+/// A trivially cloneable instruction; clone it and nest the new copy with the
+/// current instruction.
+/// The clone defines a fresh stackified register that replaces Reg in Op.
+/// If the original Def has no remaining uses afterwards it is erased.
+/// Returns the clone.
+static MachineInstr *RematerializeCheapDef(
+ unsigned Reg, MachineOperand &Op, MachineInstr &Def, MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS,
+ WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI,
+ const WebAssemblyInstrInfo *TII, const WebAssemblyRegisterInfo *TRI) {
+ DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
+ DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());
+
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
+ Op.setReg(NewReg);
+ // The clone is the instruction now sitting immediately before Insert.
+ MachineInstr *Clone = &*std::prev(Insert);
+ LIS.InsertMachineInstrInMaps(*Clone);
+ LIS.createAndComputeVirtRegInterval(NewReg);
+ MFI.stackifyVReg(NewReg);
+ ImposeStackOrdering(Clone);
+
+ DEBUG(dbgs() << " - Cloned to "; Clone->dump());
+
+ // Shrink the interval.
+ bool IsDead = MRI.use_empty(Reg);
+ if (!IsDead) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ ShrinkToUses(LI, LIS);
+ // Reg still has uses somewhere; the original is dead only if its own
+ // value is no longer live right after Def.
+ IsDead = !LI.liveAt(LIS.getInstructionIndex(Def).getDeadSlot());
+ }
+
+ // If that was the last use of the original, delete the original.
+ if (IsDead) {
+ DEBUG(dbgs() << " - Deleting original\n");
+ SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
+ LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
+ LIS.removeInterval(Reg);
+ LIS.RemoveMachineInstrFromMaps(Def);
+ Def.eraseFromParent();
+ }
+
+ return Clone;
+}
+
+/// A multiple-use def in the same block with no intervening memory or register
+/// dependencies; move the def down, nest it with the current instruction, and
+/// insert a tee to satisfy the rest of the uses. As an illustration, rewrite
+/// this:
+///
+/// Reg = INST ... // Def
+/// INST ..., Reg, ... // Insert
+/// INST ..., Reg, ...
+/// INST ..., Reg, ...
+///
+/// to this:
+///
+/// DefReg = INST ... // Def (to become the new Insert)
+/// TeeReg, Reg = TEE_... DefReg
+/// INST ..., TeeReg, ... // Insert
+/// INST ..., Reg, ...
+/// INST ..., Reg, ...
+///
+/// with DefReg and TeeReg stackified. This eliminates a get_local from the
+/// resulting code.
+///
+/// Returns Def, which the caller uses as the new insertion point.
+static MachineInstr *MoveAndTeeForMultiUse(
+ unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB,
+ MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) {
+ DEBUG(dbgs() << "Move and tee for multi-use:"; Def->dump());
+
+ // Move Def into place.
+ MBB.splice(Insert, &MBB, Def);
+ LIS.handleMove(*Def);
+
+ // Create the Tee and attach the registers.
+ const auto *RegClass = MRI.getRegClass(Reg);
+ unsigned TeeReg = MRI.createVirtualRegister(RegClass);
+ unsigned DefReg = MRI.createVirtualRegister(RegClass);
+ MachineOperand &DefMO = Def->getOperand(0);
+ MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(),
+ TII->get(GetTeeOpcode(RegClass)), TeeReg)
+ .addReg(Reg, RegState::Define)
+ .addReg(DefReg, getUndefRegState(DefMO.isDead()));
+ // The selected use now reads TeeReg and Def now writes DefReg.
+ Op.setReg(TeeReg);
+ DefMO.setReg(DefReg);
+ SlotIndex TeeIdx = LIS.InsertMachineInstrInMaps(*Tee).getRegSlot();
+ SlotIndex DefIdx = LIS.getInstructionIndex(*Def).getRegSlot();
+
+ // Tell LiveIntervals we moved the original vreg def from Def to Tee.
+ LiveInterval &LI = LIS.getInterval(Reg);
+ LiveInterval::iterator I = LI.FindSegmentContaining(DefIdx);
+ VNInfo *ValNo = LI.getVNInfoAt(DefIdx);
+ I->start = TeeIdx;
+ ValNo->def = TeeIdx;
+ ShrinkToUses(LI, LIS);
+
+ // Finish stackifying the new regs.
+ LIS.createAndComputeVirtRegInterval(TeeReg);
+ LIS.createAndComputeVirtRegInterval(DefReg);
+ MFI.stackifyVReg(DefReg);
+ MFI.stackifyVReg(TeeReg);
+ ImposeStackOrdering(Def);
+ ImposeStackOrdering(Tee);
+
+ DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ DEBUG(dbgs() << " - Tee instruction: "; Tee->dump());
+ return Def;
+}
+
+namespace {
+/// A stack for walking the tree of instructions being built, visiting the
+/// MachineOperands in DFS order.
+///
+/// Each worklist entry is the reversed range of an instruction's explicit use
+/// operands that have not been visited yet. Empty ranges are never kept on
+/// the worklist.
+class TreeWalkerState {
+ typedef MachineInstr::mop_iterator mop_iterator;
+ typedef std::reverse_iterator<mop_iterator> mop_reverse_iterator;
+ typedef iterator_range<mop_reverse_iterator> RangeTy;
+ SmallVector<RangeTy, 4> Worklist;
+
+public:
+ /// Start the walk at Insert's explicit use operands, if it has any.
+ explicit TreeWalkerState(MachineInstr *Insert) {
+ const iterator_range<mop_iterator> &Range = Insert->explicit_uses();
+ if (Range.begin() != Range.end())
+ Worklist.push_back(reverse(Range));
+ }
+
+ /// Test whether every operand has been visited.
+ bool Done() const { return Worklist.empty(); }
+
+ /// Remove and return the next operand to visit. Must not be called once
+ /// Done() returns true.
+ MachineOperand &Pop() {
+ RangeTy &Range = Worklist.back();
+ MachineOperand &Op = *Range.begin();
+ Range = drop_begin(Range, 1);
+ // Drop the entry once its last operand has been handed out.
+ if (Range.begin() == Range.end())
+ Worklist.pop_back();
+ assert((Worklist.empty() ||
+ Worklist.back().begin() != Worklist.back().end()) &&
+ "Empty ranges shouldn't remain in the worklist");
+ return Op;
+ }
+
+ /// Push Instr's operands onto the stack to be visited.
+ void PushOperands(MachineInstr *Instr) {
+ const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
+ if (Range.begin() != Range.end())
+ Worklist.push_back(reverse(Range));
+ }
+
+ /// Some of Instr's operands are on the top of the stack; remove them and
+ /// re-insert them starting from the beginning (because we've commuted them).
+ void ResetTopOperands(MachineInstr *Instr) {
+ assert(HasRemainingOperands(Instr) &&
+ "Reseting operands should only be done when the instruction has "
+ "an operand still on the stack");
+ Worklist.back() = reverse(Instr->explicit_uses());
+ }
+
+ /// Test whether Instr has operands remaining to be visited at the top of
+ /// the stack.
+ bool HasRemainingOperands(const MachineInstr *Instr) const {
+ if (Worklist.empty())
+ return false;
+ const RangeTy &Range = Worklist.back();
+ return Range.begin() != Range.end() && Range.begin()->getParent() == Instr;
+ }
+
+ /// Test whether the given register is present on the stack, indicating an
+ /// operand in the tree that we haven't visited yet. Moving a definition of
+ /// Reg to a point in the tree after that would change its value.
+ ///
+ /// This is needed as a consequence of using implicit get_locals for
+ /// uses and implicit set_locals for defs.
+ bool IsOnStack(unsigned Reg) const {
+ for (const RangeTy &Range : Worklist)
+ for (const MachineOperand &MO : Range)
+ if (MO.isReg() && MO.getReg() == Reg)
+ return true;
+ return false;
+ }
+};
+
+/// State to keep track of whether commuting is in flight or whether it's been
+/// tried for the current instruction and didn't work.
+class CommutingState {
+ /// There are effectively three states: the initial state where we haven't
+ /// started commuting anything and we don't know anything yet, the tentative
+ /// state where we've commuted the operands of the current instruction and are
+ /// revisiting it, and the declined state where we've reverted the operands
+ /// back to their original order and will no longer commute it further.
+ bool TentativelyCommuting;
+ bool Declined;
+
+ /// During the tentative state, these hold the operand indices of the commuted
+ /// operands.
+ unsigned Operand0, Operand1;
+
+public:
+ /// Start in the initial state. Operand0 and Operand1 are left uninitialized
+ /// here; they are assigned in MaybeCommute before being used.
+ CommutingState() : TentativelyCommuting(false), Declined(false) {}
+
+ /// Stackification for an operand was not successful due to ordering
+ /// constraints. If possible, and if we haven't already tried it and declined
+ /// it, commute Insert's operands and prepare to revisit it.
+ void MaybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
+ const WebAssemblyInstrInfo *TII) {
+ if (TentativelyCommuting) {
+ assert(!Declined &&
+ "Don't decline commuting until you've finished trying it");
+ // Commuting didn't help. Revert it.
+ TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+ TentativelyCommuting = false;
+ Declined = true;
+ } else if (!Declined && TreeWalker.HasRemainingOperands(Insert)) {
+ // Let the target pick which pair of operands can be commuted.
+ Operand0 = TargetInstrInfo::CommuteAnyOperandIndex;
+ Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
+ if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) {
+ // Tentatively commute the operands and try again.
+ TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+ TreeWalker.ResetTopOperands(Insert);
+ TentativelyCommuting = true;
+ Declined = false;
+ }
+ }
+ }
+
+ /// Stackification for some operand was successful. Reset to the default
+ /// state.
+ void Reset() {
+ TentativelyCommuting = false;
+ Declined = false;
+ }
+};
+} // end anonymous namespace
+
+// Pass entry point. For every instruction, visited bottom-up within each
+// block, try to nest the definitions of its register operands directly in
+// front of it (by moving, rematerializing, or moving plus a tee) and mark the
+// connecting registers as stackified. Returns true if anything was
+// stackified.
+bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Register Stackifying **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ bool Changed = false;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+ // Walk the instructions from the bottom up. Currently we don't look past
+ // block boundaries, and the blocks aren't ordered so the block visitation
+ // order isn't significant, but we may want to change this in the future.
+ for (MachineBasicBlock &MBB : MF) {
+ // Don't use a range-based for loop, because we modify the list as we're
+ // iterating over it and the end iterator may change.
+ for (auto MII = MBB.rbegin(); MII != MBB.rend(); ++MII) {
+ MachineInstr *Insert = &*MII;
+ // Don't nest anything inside an inline asm, because we don't have
+ // constraints for $push inputs.
+ if (Insert->getOpcode() == TargetOpcode::INLINEASM)
+ continue;
+
+ // Ignore debugging intrinsics.
+ if (Insert->getOpcode() == TargetOpcode::DBG_VALUE)
+ continue;
+
+ // Iterate through the inputs in reverse order, since we'll be pulling
+ // operands off the stack in LIFO order.
+ CommutingState Commuting;
+ TreeWalkerState TreeWalker(Insert);
+ while (!TreeWalker.Done()) {
+ MachineOperand &Op = TreeWalker.Pop();
+
+ // We're only interested in explicit virtual register operands.
+ if (!Op.isReg())
+ continue;
+
+ unsigned Reg = Op.getReg();
+ assert(Op.isUse() && "explicit_uses() should only iterate over uses");
+ assert(!Op.isImplicit() &&
+ "explicit_uses() should only iterate over explicit operands");
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+
+ // Identify the definition for this register at this point.
+ MachineInstr *Def = GetVRegDef(Reg, Insert, MRI, LIS);
+ if (!Def)
+ continue;
+
+ // Don't nest an INLINE_ASM def into anything, because we don't have
+ // constraints for $pop outputs.
+ if (Def->getOpcode() == TargetOpcode::INLINEASM)
+ continue;
+
+ // Argument instructions represent live-in registers and not real
+ // instructions.
+ if (WebAssembly::isArgument(*Def))
+ continue;
+
+ // Decide which strategy to take. Prefer to move a single-use value
+ // over cloning it, and prefer cloning over introducing a tee.
+ // For moving, we require the def to be in the same block as the use;
+ // this makes things simpler (LiveIntervals' handleMove function only
+ // supports intra-block moves) and it's MachineSink's job to catch all
+ // the sinking opportunities anyway.
+ bool SameBlock = Def->getParent() == &MBB;
+ bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, MRI) &&
+ !TreeWalker.IsOnStack(Reg);
+ // Each strategy returns the instruction that becomes the new Insert.
+ if (CanMove && HasOneUse(Reg, Def, MRI, MDT, LIS)) {
+ Insert = MoveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI);
+ } else if (ShouldRematerialize(*Def, AA, TII)) {
+ Insert =
+ RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
+ LIS, MFI, MRI, TII, TRI);
+ } else if (CanMove &&
+ OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
+ Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
+ MRI, TII);
+ } else {
+ // We failed to stackify the operand. If the problem was ordering
+ // constraints, Commuting may be able to help.
+ if (!CanMove && SameBlock)
+ Commuting.MaybeCommute(Insert, TreeWalker, TII);
+ // Proceed to the next operand.
+ continue;
+ }
+
+ // If the instruction we just stackified is an IMPLICIT_DEF, convert it
+ // to a constant 0 so that the def is explicit, and the push/pop
+ // correspondence is maintained.
+ if (Insert->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+ ConvertImplicitDefToConstZero(Insert, MRI, TII, MF);
+
+ // We stackified an operand. Add the defining instruction's operands to
+ // the worklist stack now to continue to build an ever deeper tree.
+ Commuting.Reset();
+ TreeWalker.PushOperands(Insert);
+ }
+
+ // If we stackified any operands, skip over the tree to start looking for
+ // the next instruction we can build a tree on.
+ if (Insert != &*MII) {
+ ImposeStackOrdering(&*MII);
+ MII = MachineBasicBlock::iterator(Insert).getReverse();
+ Changed = true;
+ }
+ }
+ }
+
+ // If we used VALUE_STACK anywhere, add it to the live-in sets everywhere so
+ // that it never looks like a use-before-def.
+ if (Changed) {
+ MF.getRegInfo().addLiveIn(WebAssembly::VALUE_STACK);
+ for (MachineBasicBlock &MBB : MF)
+ MBB.addLiveIn(WebAssembly::VALUE_STACK);
+ }
+
+#ifndef NDEBUG
+ // Verify that pushes and pops are performed in LIFO order.
+ // The pop_back_val() call below lives inside an assert; that is fine here
+ // because this whole section is compiled only when asserts are enabled.
+ SmallVector<unsigned, 0> Stack;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue())
+ continue;
+ for (MachineOperand &MO : reverse(MI.explicit_operands())) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+
+ if (MFI.isVRegStackified(Reg)) {
+ if (MO.isDef())
+ Stack.push_back(Reg);
+ else
+ assert(Stack.pop_back_val() == Reg &&
+ "Register stack pop should be paired with a push");
+ }
+ }
+ }
+ // TODO: Generalize this code to support keeping values on the stack across
+ // basic block boundaries.
+ assert(Stack.empty() &&
+ "Register stack pushes and pops should be balanced");
+ }
+#endif
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
new file mode 100644
index 000000000000..9367464c806e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -0,0 +1,148 @@
+//===-- WebAssemblyRegisterInfo.cpp - WebAssembly Register Information ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the WebAssembly implementation of the
+/// TargetRegisterInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyRegisterInfo.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyFrameLowering.h"
+#include "WebAssemblyInstrInfo.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "WebAssemblyGenRegisterInfo.inc"
+
+/// Construct the register info for the target described by \p TT. Only a
+/// reference to \p TT is kept, so the Triple must outlive this object.
+WebAssemblyRegisterInfo::WebAssemblyRegisterInfo(const Triple &TT)
+ : WebAssemblyGenRegisterInfo(0), TT(TT) {}
+
+/// Return the callee-saved register list. The returned array holds the single
+/// entry 0 (presumably the list terminator), so no actual register is listed.
+const MCPhysReg *
+WebAssemblyRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const {
+  static const MCPhysReg EmptyList[] = {0};
+  return EmptyList;
+}
+
+/// Build the reserved-register set: the 32- and 64-bit stack pointer and frame
+/// pointer registers are marked; every other register bit is left clear.
+BitVector
+WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction & /*MF*/) const {
+  BitVector ReservedSet(getNumRegs());
+  ReservedSet.set(WebAssembly::SP32);
+  ReservedSet.set(WebAssembly::SP64);
+  ReservedSet.set(WebAssembly::FP32);
+  ReservedSet.set(WebAssembly::FP64);
+  return ReservedSet;
+}
+
+/// Replace the frame-index operand \p FIOperandNum of the instruction at
+/// \p II with a register operand that addresses the same stack object.
+///
+/// The object's offset is computed as the frame's stack size plus the
+/// object's offset. Three strategies are tried in order:
+///  1. for the address operand of a load/store, fold the offset into the
+///     immediate operand that precedes the address operand;
+///  2. for an ADD_I32 whose other operand is a single-use CONST_I32, fold the
+///     offset into that constant;
+///  3. otherwise emit "frame register + offset" in front of the instruction
+///     (or use the frame register directly when the offset is 0).
+///
+/// \p SPAdj must be 0; the RegScavenger argument is unused.
+void WebAssemblyRegisterInfo::eliminateFrameIndex(
+ MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum,
+ RegScavenger * /*RS*/) const {
+ assert(SPAdj == 0);
+ MachineInstr &MI = *II;
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ int64_t FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex);
+
+ assert(MFI.getObjectSize(FrameIndex) != 0 &&
+ "We assume that variable-sized objects have already been lowered, "
+ "and don't use FrameIndex operands.");
+ unsigned FrameRegister = getFrameRegister(MF);
+
+ // If this is the address operand of a load or store, make it relative to SP
+ // and fold the frame offset directly in.
+ if ((MI.mayLoad() && FIOperandNum == WebAssembly::LoadAddressOperandNo) ||
+ (MI.mayStore() && FIOperandNum == WebAssembly::StoreAddressOperandNo)) {
+ assert(FrameOffset >= 0 && MI.getOperand(FIOperandNum - 1).getImm() >= 0);
+ int64_t Offset = MI.getOperand(FIOperandNum - 1).getImm() + FrameOffset;
+
+ // Only fold when the combined offset fits in an unsigned 32-bit value;
+ // otherwise fall through to the more general strategies below.
+ if (static_cast<uint64_t>(Offset) <= std::numeric_limits<uint32_t>::max()) {
+ MI.getOperand(FIOperandNum - 1).setImm(Offset);
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(FrameRegister, /*IsDef=*/false);
+ return;
+ }
+ }
+
+ // If this is an address being added to a constant, fold the frame offset
+ // into the constant.
+ if (MI.getOpcode() == WebAssembly::ADD_I32) {
+ // 3 - FIOperandNum selects the other source operand; this assumes the frame
+ // index is operand 1 or 2 of the add -- TODO confirm.
+ MachineOperand &OtherMO = MI.getOperand(3 - FIOperandNum);
+ if (OtherMO.isReg()) {
+ unsigned OtherMOReg = OtherMO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(OtherMOReg)) {
+ MachineInstr *Def = MF.getRegInfo().getUniqueVRegDef(OtherMOReg);
+ // TODO: For now we just opportunistically do this in the case where
+ // the CONST_I32 happens to have exactly one def and one use. We
+ // should generalize this to optimize in more cases.
+ if (Def && Def->getOpcode() == WebAssembly::CONST_I32 &&
+ MRI.hasOneNonDBGUse(Def->getOperand(0).getReg())) {
+ MachineOperand &ImmMO = Def->getOperand(1);
+ // The frame offset is truncated to 32 bits before being added.
+ ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset));
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(FrameRegister, /*IsDef=*/false);
+ return;
+ }
+ }
+ }
+ }
+
+ // Otherwise create an i32.add SP, offset and make it the operand.
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ // With a zero offset the frame register itself is used and no new
+ // instructions are emitted.
+ unsigned FIRegOperand = FrameRegister;
+ if (FrameOffset) {
+ // Create i32.add SP, offset and make it the operand.
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned OffsetOp = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32),
+ OffsetOp)
+ .addImm(FrameOffset);
+ FIRegOperand = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::ADD_I32),
+ FIRegOperand)
+ .addReg(FrameRegister)
+ .addReg(OffsetOp);
+ }
+ MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*IsDef=*/false);
+}
+
+/// Select the register used to address the frame: the frame pointer when the
+/// function has one, the stack pointer otherwise, in the 64-bit or 32-bit
+/// flavour according to the target triple.
+unsigned
+WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const WebAssemblyFrameLowering *FrameLowering = getFrameLowering(MF);
+  const bool UseFP = FrameLowering->hasFP(MF);
+  const bool Is64 = TT.isArch64Bit();
+  if (UseFP)
+    return Is64 ? WebAssembly::FP64 : WebAssembly::FP32;
+  return Is64 ? WebAssembly::SP64 : WebAssembly::SP32;
+}
+
+/// Return the register class used for pointers: I64 when the subtarget uses
+/// 64-bit addresses, I32 otherwise. \p Kind must be 0.
+const TargetRegisterClass *
+WebAssemblyRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                            unsigned Kind) const {
+  assert(Kind == 0 && "Only one kind of pointer on WebAssembly");
+  const bool Is64BitAddr = MF.getSubtarget<WebAssemblySubtarget>().hasAddr64();
+  return Is64BitAddr ? &WebAssembly::I64RegClass : &WebAssembly::I32RegClass;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
new file mode 100644
index 000000000000..ad1d71eebf22
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
@@ -0,0 +1,52 @@
+// WebAssemblyRegisterInfo.h - WebAssembly Register Information Impl -*- C++ -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declaration of the WebAssemblyRegisterInfo
+/// class, the WebAssembly implementation of TargetRegisterInfo.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H
+
+#define GET_REGINFO_HEADER
+#include "WebAssemblyGenRegisterInfo.inc"
+
+namespace llvm {
+
+class MachineFunction;
+class RegScavenger;
+class TargetRegisterClass;
+class Triple;
+
+/// WebAssembly's register information, layered on top of the generated
+/// WebAssemblyGenRegisterInfo base class.
+class WebAssemblyRegisterInfo final : public WebAssemblyGenRegisterInfo {
+ // Target triple, held by reference: the referenced Triple must outlive this
+ // object.
+ const Triple &TT;
+
+public:
+ explicit WebAssemblyRegisterInfo(const Triple &TT);
+
+ // Code Generation virtual methods.
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+ // Rewrites the frame-index operand FIOperandNum of *MI into a register
+ // operand.
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ // Register class used to hold pointers for the given function.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
new file mode 100644
index 000000000000..90888100be17
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -0,0 +1,62 @@
+//WebAssemblyRegisterInfo.td-Describe the WebAssembly Registers -*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file describes the WebAssembly register classes and some nominal
+/// physical registers.
+///
+//===----------------------------------------------------------------------===//
+
+// Base class for every WebAssembly register definition; places the register
+// in the "WebAssembly" namespace.
+class WebAssemblyReg<string n> : Register<n> {
+ let Namespace = "WebAssembly";
+}
+
+// Convenience wrapper around RegisterClass that fixes the namespace argument
+// to "WebAssembly" and forwards the value types, alignment and register list.
+class WebAssemblyRegClass<list<ValueType> regTypes, int alignment, dag regList>
+ : RegisterClass<"WebAssembly", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+// Special registers used as the frame and stack pointer.
+//
+// WebAssembly may someday support mixed 32-bit and 64-bit heaps in the same
+// application, which requires separate width FP and SP.
+def FP32 : WebAssemblyReg<"%FP32">;
+def FP64 : WebAssemblyReg<"%FP64">;
+def SP32 : WebAssemblyReg<"%SP32">;
+def SP64 : WebAssemblyReg<"%SP64">;
+
+// The register allocation framework requires register classes have at least
+// one register, so we define a few for the floating point register classes
+// since we otherwise don't need a physical register in those classes.
+def F32_0 : WebAssemblyReg<"%f32.0">;
+def F64_0 : WebAssemblyReg<"%f64.0">;
+
+// Nominal register that serves as the sole member of the V128 register class.
+def V128_0: WebAssemblyReg<"%v128">;
+
+// The value stack "register". This is an opaque entity which serves to order
+// uses and defs that must remain in LIFO order.
+def VALUE_STACK : WebAssemblyReg<"STACK">;
+
+// The incoming arguments "register". This is an opaque entity which serves to
+// order the ARGUMENT instructions that are emulating live-in registers and
+// must not be scheduled below other instructions.
+def ARGUMENTS : WebAssemblyReg<"ARGUMENTS">;
+
+//===----------------------------------------------------------------------===//
+// Register classes
+//===----------------------------------------------------------------------===//
+
+// Each definition gives, in order: the value types the class can hold, the
+// alignment argument forwarded to RegisterClass, and the member registers.
+// The integer classes contain the frame and stack pointer registers of the
+// matching width; the other classes contain only their nominal register.
+def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32)>;
+def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>;
+def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
+def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
+def V128 : WebAssemblyRegClass<[v4f32, v4i32, v16i8, v8i16], 128, (add V128_0)>;
+
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
new file mode 100644
index 000000000000..9e944df637d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -0,0 +1,99 @@
+//===-- WebAssemblyReplacePhysRegs.cpp - Replace phys regs with virt regs -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass that replaces physical registers with
+/// virtual registers.
+///
+/// LLVM expects certain physical registers, such as a stack pointer. However,
+/// WebAssembly doesn't actually have such physical registers. This pass is run
+/// once LLVM no longer needs these registers, and replaces them with virtual
+/// registers, so they can participate in register stackifying and coloring in
+/// the normal way.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-replace-phys-regs"
+
+namespace {
+/// Machine-function pass that replaces explicit physical-register operands
+/// with virtual registers (see runOnMachineFunction).
+class WebAssemblyReplacePhysRegs final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyReplacePhysRegs() : MachineFunctionPass(ID) {}
+
+private:
+ StringRef getPassName() const override {
+ return "WebAssembly Replace Physical Registers";
+ }
+
+ // The pass declares that it preserves the CFG, then defers to the base
+ // class for the remaining analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyReplacePhysRegs::ID = 0;
+/// Factory: returns a newly allocated WebAssemblyReplacePhysRegs pass.
+/// NOTE(review): ownership presumably passes to the caller -- confirm.
+FunctionPass *llvm::createWebAssemblyReplacePhysRegs() {
+ return new WebAssemblyReplacePhysRegs();
+}
+
+/// Rewrite every explicit operand that names a physical register so that it
+/// names a virtual register instead. At most one virtual register is created
+/// per physical register, lazily, on the first explicit operand encountered.
+/// Implicit operands and the VALUE_STACK / ARGUMENTS registers are left
+/// untouched. Returns true if any operand was rewritten.
+bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Replace Physical Registers **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+
+  assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+         "LiveIntervals shouldn't be active yet!");
+  // We don't preserve SSA or liveness.
+  RegInfo.leaveSSA();
+  RegInfo.invalidateLiveness();
+
+  bool Rewrote = false;
+  for (unsigned Phys = WebAssembly::NoRegister + 1;
+       Phys < WebAssembly::NUM_TARGET_REGS; ++Phys) {
+    // Skip fake registers that are never used explicitly.
+    const bool IsFake =
+        Phys == WebAssembly::VALUE_STACK || Phys == WebAssembly::ARGUMENTS;
+    if (IsFake)
+      continue;
+
+    // Replace explicit uses of the physical register with a virtual register.
+    const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Phys);
+    unsigned Virt = WebAssembly::NoRegister;
+    for (auto It = RegInfo.reg_begin(Phys), End = RegInfo.reg_end();
+         It != End;) {
+      // The iterator is advanced before the operand is modified -- presumably
+      // because setReg() would otherwise invalidate it.
+      MachineOperand &Op = *It++;
+      if (Op.isImplicit())
+        continue;
+
+      if (Virt == WebAssembly::NoRegister)
+        Virt = RegInfo.createVirtualRegister(RC);
+      Op.setReg(Virt);
+      if (Op.getParent()->isDebugValue())
+        Op.setIsDebug();
+      Rewrote = true;
+    }
+  }
+
+  return Rewrote;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
new file mode 100644
index 000000000000..fae9c6100510
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -0,0 +1,20 @@
+//===-- WebAssemblySelectionDAGInfo.cpp - WebAssembly SelectionDAG Info ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblySelectionDAGInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-selectiondag-info"
+
+// Out-of-line destructor definition; it has nothing to do beyond the
+// implicitly generated behaviour.
+WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
new file mode 100644
index 000000000000..533c66b7a22f
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -0,0 +1,30 @@
+//=- WebAssemblySelectionDAGInfo.h - WebAssembly SelectionDAG Info -*- C++ -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the WebAssembly subclass for
+/// SelectionDAGTargetInfo.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+/// WebAssembly's SelectionDAGTargetInfo subclass. It adds no members or
+/// overrides beyond the destructor, which is defined out of line.
+class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo {
+public:
+ ~WebAssemblySelectionDAGInfo() override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
new file mode 100644
index 000000000000..2441ead7cb27
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -0,0 +1,119 @@
+//=- WebAssemblySetP2AlignOperands.cpp - Set alignments on loads and stores -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file sets the p2align operands on load and store instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-set-p2align-operands"
+
+namespace {
+/// Machine-function pass that fills in the p2align operand of load and store
+/// instructions (see runOnMachineFunction).
+class WebAssemblySetP2AlignOperands final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblySetP2AlignOperands() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "WebAssembly Set p2align Operands";
+ }
+
+ // The pass declares that it preserves the CFG, block frequency info and the
+ // machine dominator tree.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblySetP2AlignOperands::ID = 0;
+/// Factory: returns a newly allocated WebAssemblySetP2AlignOperands pass.
+/// NOTE(review): ownership presumably passes to the caller -- confirm.
+FunctionPass *llvm::createWebAssemblySetP2AlignOperands() {
+ return new WebAssemblySetP2AlignOperands();
+}
+
+/// Set the p2align immediate (operand \p OperandNo) of the load/store \p MI to
+/// log2 of the alignment recorded in its memory operand, capped at the
+/// default p2align value for the opcode.
+static void RewriteP2Align(MachineInstr &MI, unsigned OperandNo) {
+  MachineOperand &P2AlignOp = MI.getOperand(OperandNo);
+  assert(P2AlignOp.getImm() == 0 &&
+         "ISel should set p2align operands to 0");
+  assert(MI.hasOneMemOperand() &&
+         "Load and store instructions have exactly one mem operand");
+  assert((*MI.memoperands_begin())->getSize() ==
+             (UINT64_C(1)
+              << WebAssembly::GetDefaultP2Align(MI.getOpcode())) &&
+         "Default p2align value should be natural");
+  assert(MI.getDesc().OpInfo[OperandNo].OperandType ==
+             WebAssembly::OPERAND_P2ALIGN &&
+         "Load and store instructions should have a p2align operand");
+
+  const uint64_t AlignLog2 = Log2_64((*MI.memoperands_begin())->getAlignment());
+
+  // WebAssembly does not currently support supernatural alignment.
+  const uint64_t NaturalLog2 =
+      uint64_t(WebAssembly::GetDefaultP2Align(MI.getOpcode()));
+
+  P2AlignOp.setImm(std::min(AlignLog2, NaturalLog2));
+}
+
+/// Walk every instruction of \p MF and rewrite the p2align operand of each
+/// load and store opcode listed below via RewriteP2Align.
+///
+/// \returns true if at least one instruction was rewritten, so that the
+/// caller learns the function was modified. (Previously the result was always
+/// false, even when operands had been rewritten.)
+bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Set p2align Operands **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  bool Changed = false;
+
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      switch (MI.getOpcode()) {
+      case WebAssembly::LOAD_I32:
+      case WebAssembly::LOAD_I64:
+      case WebAssembly::LOAD_F32:
+      case WebAssembly::LOAD_F64:
+      case WebAssembly::LOAD8_S_I32:
+      case WebAssembly::LOAD8_U_I32:
+      case WebAssembly::LOAD16_S_I32:
+      case WebAssembly::LOAD16_U_I32:
+      case WebAssembly::LOAD8_S_I64:
+      case WebAssembly::LOAD8_U_I64:
+      case WebAssembly::LOAD16_S_I64:
+      case WebAssembly::LOAD16_U_I64:
+      case WebAssembly::LOAD32_S_I64:
+      case WebAssembly::LOAD32_U_I64:
+        RewriteP2Align(MI, WebAssembly::LoadP2AlignOperandNo);
+        // RewriteP2Align writes the operand, so the function was modified.
+        Changed = true;
+        break;
+      case WebAssembly::STORE_I32:
+      case WebAssembly::STORE_I64:
+      case WebAssembly::STORE_F32:
+      case WebAssembly::STORE_F64:
+      case WebAssembly::STORE8_I32:
+      case WebAssembly::STORE16_I32:
+      case WebAssembly::STORE8_I64:
+      case WebAssembly::STORE16_I64:
+      case WebAssembly::STORE32_I64:
+        RewriteP2Align(MI, WebAssembly::StoreP2AlignOperandNo);
+        // RewriteP2Align writes the operand, so the function was modified.
+        Changed = true;
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
new file mode 100644
index 000000000000..34ec6f2d34a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -0,0 +1,202 @@
+//===-- WebAssemblyStoreResults.cpp - Optimize using store result values --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements an optimization pass using store result values.
+///
+/// WebAssembly's store instructions return the stored value. This is to enable
+/// an optimization wherein uses of the stored value can be replaced by uses of
+/// the store's result value, making the stored value register more likely to
+/// be single-use, thus more likely to be useful to register stackifying, and
+/// potentially also exposing the store to register stackifying. These both can
+/// reduce get_local/set_local traffic.
+///
+/// This pass also performs this optimization for memcpy, memmove, and memset
+/// calls, since the LLVM intrinsics for these return void so they can't use the
+/// returned attribute and consequently aren't handled by the OptimizeReturned
+/// pass.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-store-results"
+
+namespace {
+/// Machine-function pass that replaces uses of a call's input register with
+/// the call's result register for selected libcalls (see optimizeCall).
+class WebAssemblyStoreResults final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyStoreResults() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "WebAssembly Store Results"; }
+
+ // Requires block frequency info, the machine dominator tree, live intervals
+ // and target library info; declares the CFG, block frequency info, dominator
+ // tree, slot indexes and live intervals as preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+};
+} // end anonymous namespace
+
+char WebAssemblyStoreResults::ID = 0;
+/// Factory: returns a newly allocated WebAssemblyStoreResults pass.
+/// NOTE(review): ownership presumably passes to the caller -- confirm.
+FunctionPass *llvm::createWebAssemblyStoreResults() {
+ return new WebAssemblyStoreResults();
+}
+
+// Replace uses of FromReg with ToReg if they are dominated by MI.
+//
+// A non-debug use of FromReg is rewritten only if MI dominates the using
+// instruction (and is not that instruction), the use does not read a
+// different value of FromReg than the one live at MI, and ToReg has no value
+// live at the use. After rewriting, ToReg's live interval is extended to the
+// rewritten uses, FromReg's interval is shrunk, and FromReg is marked killed
+// at MI if it is no longer live past it. Returns true if any use was
+// rewritten.
+static bool ReplaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
+ unsigned FromReg, unsigned ToReg,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT,
+ LiveIntervals &LIS) {
+ bool Changed = false;
+
+ LiveInterval *FromLI = &LIS.getInterval(FromReg);
+ LiveInterval *ToLI = &LIS.getInterval(ToReg);
+
+ // Value number of FromReg at MI; candidate uses must read this same value.
+ SlotIndex FromIdx = LIS.getInstructionIndex(MI).getRegSlot();
+ VNInfo *FromVNI = FromLI->getVNInfoAt(FromIdx);
+
+ // Slots of the rewritten uses, to which ToReg's live interval is extended.
+ SmallVector<SlotIndex, 4> Indices;
+
+ // The iterator is advanced before the operand is modified, since setReg()
+ // below changes which register the operand belongs to.
+ for (auto I = MRI.use_nodbg_begin(FromReg), E = MRI.use_nodbg_end(); I != E;) {
+ MachineOperand &O = *I++;
+ MachineInstr *Where = O.getParent();
+
+ // Check that MI dominates the instruction in the normal way.
+ if (&MI == Where || !MDT.dominates(&MI, Where))
+ continue;
+
+ // If this use gets a different value, skip it.
+ SlotIndex WhereIdx = LIS.getInstructionIndex(*Where);
+ VNInfo *WhereVNI = FromLI->getVNInfoAt(WhereIdx);
+ if (WhereVNI && WhereVNI != FromVNI)
+ continue;
+
+ // Make sure ToReg isn't clobbered before it gets there.
+ // NOTE(review): ToVNI comes from ToLI while FromVNI comes from FromLI;
+ // confirm that comparing value numbers of two different intervals is
+ // intended.
+ VNInfo *ToVNI = ToLI->getVNInfoAt(WhereIdx);
+ if (ToVNI && ToVNI != FromVNI)
+ continue;
+
+ Changed = true;
+ DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << " from "
+ << MI << "\n");
+ O.setReg(ToReg);
+
+ // If the store's def was previously dead, it is no longer.
+ if (!O.isUndef()) {
+ MI.getOperand(0).setIsDead(false);
+
+ Indices.push_back(WhereIdx.getRegSlot());
+ }
+ }
+
+ if (Changed) {
+ // Extend ToReg's liveness.
+ LIS.extendToIndices(*ToLI, Indices);
+
+ // Shrink FromReg's liveness.
+ LIS.shrinkToUses(FromLI);
+
+ // If we replaced all dominated uses, FromReg is now killed at MI.
+ if (!FromLI->liveAt(FromIdx.getDeadSlot()))
+ MI.addRegisterKilled(FromReg,
+ MBB.getParent()->getSubtarget<WebAssemblySubtarget>()
+ .getRegisterInfo());
+ }
+
+ return Changed;
+}
+
+/// Try to reuse the result of a memcpy/memmove/memset libcall. When operand 1
+/// of \p MI is a symbol naming one of these libcalls and \p LibInfo also
+/// recognizes the name, uses of the call's operand-2 register that \p MI
+/// dominates are replaced with the call's operand-0 register. A register
+/// class mismatch between the two is a fatal error. Returns true if any use
+/// was replaced.
+static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
+                         const MachineRegisterInfo &MRI,
+                         MachineDominatorTree &MDT,
+                         LiveIntervals &LIS,
+                         const WebAssemblyTargetLowering &TLI,
+                         const TargetLibraryInfo &LibInfo) {
+  MachineOperand &Callee = MI.getOperand(1);
+  if (!Callee.isSymbol())
+    return false;
+
+  // Only these three libcalls are handled.
+  StringRef CalleeName(Callee.getSymbolName());
+  if (CalleeName != TLI.getLibcallName(RTLIB::MEMCPY) &&
+      CalleeName != TLI.getLibcallName(RTLIB::MEMMOVE) &&
+      CalleeName != TLI.getLibcallName(RTLIB::MEMSET))
+    return false;
+
+  // The name must also be known to TargetLibraryInfo; the looked-up function
+  // id itself is not used.
+  LibFunc::Func Recognized;
+  if (!LibInfo.getLibFunc(CalleeName, Recognized))
+    return false;
+
+  const unsigned InputReg = MI.getOperand(2).getReg();
+  const unsigned ResultReg = MI.getOperand(0).getReg();
+  if (MRI.getRegClass(InputReg) != MRI.getRegClass(ResultReg))
+    report_fatal_error("Store results: call to builtin function with wrong "
+                       "signature, from/to mismatch");
+  return ReplaceDominatedUses(MBB, MI, InputReg, ResultReg, MRI, MDT, LIS);
+}
+
+/// Run the pass over \p MF: every CALL_I32 / CALL_I64 instruction is handed to
+/// optimizeCall. The function leaves SSA form as a side effect. Returns true
+/// if any call was optimized.
+bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Store Results **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+  const WebAssemblyTargetLowering &TLI =
+      *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
+  const auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+  // We don't preserve SSA form.
+  MRI.leaveSSA();
+
+  assert(MRI.tracksLiveness() && "StoreResults expects liveness tracking");
+
+  bool AnyChange = false;
+  for (MachineBasicBlock &Block : MF) {
+    DEBUG(dbgs() << "Basic Block: " << Block.getName() << '\n');
+    for (MachineInstr &Inst : Block) {
+      const unsigned Opc = Inst.getOpcode();
+      if (Opc == WebAssembly::CALL_I32 || Opc == WebAssembly::CALL_I64)
+        AnyChange |= optimizeCall(Block, Inst, MRI, MDT, LIS, TLI, LibInfo);
+    }
+  }
+
+  return AnyChange;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
new file mode 100644
index 000000000000..ce39051b0555
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -0,0 +1,55 @@
+//===-- WebAssemblySubtarget.cpp - WebAssembly Subtarget Information ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssembly-specific subclass of
+/// TargetSubtarget.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblySubtarget.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyInstrInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-subtarget"
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "WebAssemblyGenSubtargetInfo.inc"
+
+/// Default the CPU name to "generic" when none was given, then parse the
+/// feature string \p FS for that CPU. Returns *this so that the call can be
+/// used as an argument in the constructor's member-initializer list.
+WebAssemblySubtarget &
+WebAssemblySubtarget::initializeSubtargetDependencies(StringRef FS) {
+ // Determine default and user-specified characteristics
+
+ if (CPUString.empty())
+ CPUString = "generic";
+
+ ParseSubtargetFeatures(CPUString, FS);
+ return *this;
+}
+
+/// Construct the subtarget. HasSIMD128 starts out false; the feature string is
+/// parsed as a side effect of initializing InstrInfo, whose initializer calls
+/// initializeSubtargetDependencies(FS). That call reads CPUString, so
+/// CPUString must be initialized before InstrInfo.
+WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
+ const std::string &CPU,
+ const std::string &FS,
+ const TargetMachine &TM)
+ : WebAssemblyGenSubtargetInfo(TT, CPU, FS), HasSIMD128(false),
+ CPUString(CPU), TargetTriple(TT), FrameLowering(),
+ InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
+ TLInfo(TM, *this) {}
+
+/// Always returns false, i.e. the MachineScheduler is not enabled for this
+/// subtarget.
+bool WebAssemblySubtarget::enableMachineScheduler() const {
+ // Disable the MachineScheduler for now. Even with ShouldTrackPressure set and
+ // enableMachineSchedDefaultSched overridden, it appears to have an overall
+ // negative effect for the kinds of register optimizations we're doing.
+ return false;
+}
+
+/// Always returns true.
+/// NOTE(review): presumably this opts code generation into using alias
+/// analysis -- confirm against the TargetSubtargetInfo documentation.
+bool WebAssemblySubtarget::useAA() const { return true; }
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
new file mode 100644
index 000000000000..f530a290fa0e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -0,0 +1,85 @@
+//=- WebAssemblySubtarget.h - Define Subtarget for the WebAssembly -*- C++ -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the WebAssembly-specific subclass of
+/// TargetSubtarget.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSUBTARGET_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSUBTARGET_H
+
+#include "WebAssemblyFrameLowering.h"
+#include "WebAssemblyISelLowering.h"
+#include "WebAssemblyInstrInfo.h"
+#include "WebAssemblySelectionDAGInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "WebAssemblyGenSubtargetInfo.inc"
+
+namespace llvm {
+
+/// WebAssembly subtarget: owns the frame lowering, instruction info,
+/// selection-DAG info and target lowering objects and exposes the feature
+/// predicates used by the instruction definitions.
+class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
+ /// Result of the hasSIMD128() predicate; initialized to false by the
+ /// constructor.
+ bool HasSIMD128;
+
+ /// String name of used CPU.
+ std::string CPUString;
+
+ /// What processor and OS we're targeting.
+ Triple TargetTriple;
+
+ // The members below are initialized in declaration order by the constructor.
+ // InstrInfo's initializer calls initializeSubtargetDependencies(), which
+ // reads CPUString, so CPUString must stay declared before InstrInfo.
+ WebAssemblyFrameLowering FrameLowering;
+ WebAssemblyInstrInfo InstrInfo;
+ WebAssemblySelectionDAGInfo TSInfo;
+ WebAssemblyTargetLowering TLInfo;
+
+ /// Initializes using CPUString and the passed in feature string so that we
+ /// can use initializer lists for subtarget initialization.
+ WebAssemblySubtarget &initializeSubtargetDependencies(StringRef FS);
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ WebAssemblySubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM);
+
+ // Accessors returning pointers to the component objects owned by this
+ // subtarget.
+ const WebAssemblySelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const WebAssemblyFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const WebAssemblyTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const WebAssemblyInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
+ }
+ // The register info is obtained from the instruction info object.
+ const WebAssemblyRegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
+ }
+ const Triple &getTargetTriple() const { return TargetTriple; }
+ bool enableMachineScheduler() const override;
+ bool useAA() const override;
+
+ // Predicates used by WebAssemblyInstrInfo.td.
+ // hasAddr64 is derived from the target triple rather than from a feature
+ // flag.
+ bool hasAddr64() const { return TargetTriple.isArch64Bit(); }
+ bool hasSIMD128() const { return HasSIMD128; }
+
+ /// Parses features string setting specified subtarget options. Definition of
+ /// function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
new file mode 100644
index 000000000000..b61bc0a08143
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -0,0 +1,277 @@
+//===- WebAssemblyTargetMachine.cpp - Define TargetMachine for WebAssembly -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the WebAssembly-specific subclass of TargetMachine.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyTargetMachine.h"
+#include "WebAssemblyTargetObjectFile.h"
+#include "WebAssemblyTargetTransformInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm"
+
+// Emscripten's asm.js-style exception handling.
+// Command-line flag, off by default. It is read in
+// WebAssemblyPassConfig::addIRPasses() to decide whether invokes are lowered
+// to calls and whether the Emscripten EH/SjLj lowering pass is added.
+static cl::opt<bool> EnableEmException(
+ "enable-emscripten-cxx-exceptions",
+ cl::desc("WebAssembly Emscripten-style exception handling"),
+ cl::init(false));
+
+// Emscripten's asm.js-style setjmp/longjmp handling.
+// Command-line flag, off by default. It is read in
+// WebAssemblyPassConfig::addIRPasses() to decide whether the Emscripten
+// EH/SjLj lowering pass is added.
+static cl::opt<bool> EnableEmSjLj(
+ "enable-emscripten-sjlj",
+ cl::desc("WebAssembly Emscripten-style setjmp/longjmp handling"),
+ cl::init(false));
+
+/// C-linkage entry point that registers the WebAssembly backend.
+///
+/// Registers WebAssemblyTargetMachine for the targets returned by
+/// getTheWebAssemblyTarget32() and getTheWebAssemblyTarget64(), then
+/// initializes the Emscripten EH/SjLj lowering pass in the global
+/// PassRegistry.
+extern "C" void LLVMInitializeWebAssemblyTarget() {
+ // Register the target.
+ RegisterTargetMachine<WebAssemblyTargetMachine> X(
+ getTheWebAssemblyTarget32());
+ RegisterTargetMachine<WebAssemblyTargetMachine> Y(
+ getTheWebAssemblyTarget64());
+
+ // Register exception handling pass to opt
+ initializeWebAssemblyLowerEmscriptenEHSjLjPass(
+ *PassRegistry::getPassRegistry());
+}
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Lowering public interface.
+//===----------------------------------------------------------------------===//
+
+/// Pick the relocation model to use: the caller-supplied model when one is
+/// present, otherwise Reloc::PIC_.
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  return RM.hasValue() ? *RM : Reloc::PIC_;
+}
+
+/// Create a WebAssembly architecture model.
+///
+/// The data layout string handed to LLVMTargetMachine is selected by
+/// TT.isArch64Bit() (the two strings differ in the pointer spec, p:64:64 vs.
+/// p:32:32). The relocation model is resolved through
+/// getEffectiveRelocModel(), so an unset RM gets that function's default.
+WebAssemblyTargetMachine::WebAssemblyTargetMachine(
+ const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T,
+ TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128"
+ : "e-m:e-p:32:32-i64:64-n32:64-S128",
+ TT, CPU, FS, Options, getEffectiveRelocModel(RM),
+ CM, OL),
+ TLOF(make_unique<WebAssemblyTargetObjectFile>()) {
+ // WebAssembly type-checks instructions, but a noreturn function with a return
+ // type that doesn't match the context will cause a check failure. So we lower
+ // LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's
+ // 'unreachable' instruction, which is meant for that case.
+ this->Options.TrapUnreachable = true;
+
+ initAsmInfo();
+
+ // Note that we don't use setRequiresStructuredCFG(true). It disables
+ // optimizations that we're ok with, and want, such as critical edge
+ // splitting and tail merging.
+}
+
+// Out-of-line, empty destructor definition; members are released by their own
+// destructors.
+WebAssemblyTargetMachine::~WebAssemblyTargetMachine() {}
+
+/// Return the subtarget for \p F, creating and caching it on first use.
+///
+/// The CPU and feature strings come from F's "target-cpu" and
+/// "target-features" function attributes; when hasAttribute(Attribute::None)
+/// reports true for one of them, the TargetMachine-wide TargetCPU / TargetFS
+/// value is used instead.
+const WebAssemblySubtarget *
+WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
+  const Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  const Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU = TargetCPU;
+  if (!CPUAttr.hasAttribute(Attribute::None))
+    CPU = CPUAttr.getValueAsString().str();
+
+  std::string FS = TargetFS;
+  if (!FSAttr.hasAttribute(Attribute::None))
+    FS = FSAttr.getValueAsString().str();
+
+  // Subtargets are cached under the concatenation of the CPU and feature
+  // strings.
+  std::unique_ptr<WebAssemblySubtarget> &Slot = SubtargetMap[CPU + FS];
+  if (Slot)
+    return Slot.get();
+
+  // This needs to be done before we create a new subtarget since any
+  // creation will depend on the TM and the code generation flags on the
+  // function that reside in TargetOptions.
+  resetTargetOptions(F);
+  Slot = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
+  return Slot.get();
+}
+
+namespace {
+/// WebAssembly Code Generator Pass Configuration Options.
+///
+/// File-local TargetPassConfig subclass; the overridden hooks below are
+/// defined later in this file and customize the CodeGen pass sequence.
+class WebAssemblyPassConfig final : public TargetPassConfig {
+public:
+ WebAssemblyPassConfig(WebAssemblyTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ // Convenience accessor returning the target machine as its concrete type.
+ WebAssemblyTargetMachine &getWebAssemblyTargetMachine() const {
+ return getTM<WebAssemblyTargetMachine>();
+ }
+
+ // Defined below to return nullptr ("No reg alloc").
+ FunctionPass *createTargetRegisterAllocator(bool) override;
+
+ void addIRPasses() override;
+ bool addInstSelector() override;
+ void addPostRegAlloc() override;
+ // Adds no GC passes and reports false.
+ bool addGCPasses() override { return false; }
+ void addPreEmitPass() override;
+};
+} // end anonymous namespace
+
+/// Build a TargetIRAnalysis whose callback wraps a WebAssemblyTTIImpl,
+/// constructed from this target machine and the queried function, in a
+/// TargetTransformInfo.
+TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() {
+  auto MakeTTI = [this](const Function &F) {
+    return TargetTransformInfo(WebAssemblyTTIImpl(this, F));
+  };
+  return TargetIRAnalysis(MakeTTI);
+}
+
+/// Allocate the WebAssembly-specific pass configuration for \p PM. The object
+/// is created with `new` and returned as a raw pointer.
+TargetPassConfig *
+WebAssemblyTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new WebAssemblyPassConfig(this, PM);
+}
+
+/// Always returns nullptr: this target supplies no register allocator pass.
+/// The bool parameter is unused.
+FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
+ return nullptr; // No reg alloc
+}
+
+//===----------------------------------------------------------------------===//
+// The following functions are called from lib/CodeGen/Passes.cpp to modify
+// the CodeGen pass sequence.
+//===----------------------------------------------------------------------===//
+
+/// Add the WebAssembly-specific IR-level passes, then the common ones.
+///
+/// In order: atomic lowering or expansion (chosen by the thread model),
+/// "returned"-attribute optimization when optimizing, invoke lowering plus
+/// unreachable-block elimination when Emscripten exception handling is off,
+/// the Emscripten EH/SjLj lowering pass when either flag is on, and finally
+/// TargetPassConfig::addIRPasses().
+void WebAssemblyPassConfig::addIRPasses() {
+ if (TM->Options.ThreadModel == ThreadModel::Single)
+ // In "single" mode, atomics get lowered to non-atomics.
+ addPass(createLowerAtomicPass());
+ else
+ // Expand some atomic operations. WebAssemblyTargetLowering has hooks which
+ // control specifically what gets lowered.
+ addPass(createAtomicExpandPass(TM));
+
+ // Optimize "returned" function attributes.
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createWebAssemblyOptimizeReturned());
+
+ // If exception handling is not enabled and setjmp/longjmp handling is
+ // enabled, we lower invokes into calls and delete unreachable landingpad
+ // blocks. Lowering invokes when there is no EH support is done in
+ // TargetPassConfig::addPassesToHandleExceptions, but this runs after this
+ // function and SjLj handling expects all invokes to be lowered before.
+ // NOTE(review): the condition below tests only !EnableEmException; it does
+ // not check EnableEmSjLj, so invokes are lowered here whenever Emscripten
+ // exception handling is off.
+ if (!EnableEmException) {
+ addPass(createLowerInvokePass());
+ // The lower invoke pass may create unreachable code. Remove it in order not
+ // to process dead blocks in setjmp/longjmp handling.
+ addPass(createUnreachableBlockEliminationPass());
+ }
+
+ // Handle exceptions and setjmp/longjmp if enabled.
+ if (EnableEmException || EnableEmSjLj)
+ addPass(createWebAssemblyLowerEmscriptenEHSjLj(EnableEmException,
+ EnableEmSjLj));
+
+ TargetPassConfig::addIRPasses();
+}
+
+/// Add instruction selection and the passes that must run directly after it:
+/// the WebAssembly ISel DAG pass, the argument-move pass, and the pass that
+/// sets p2align operands. Always returns false; the result of the base-class
+/// addInstSelector() is deliberately discarded.
+bool WebAssemblyPassConfig::addInstSelector() {
+ (void)TargetPassConfig::addInstSelector();
+ addPass(
+ createWebAssemblyISelDag(getWebAssemblyTargetMachine(), getOptLevel()));
+ // Run the argument-move pass immediately after the ScheduleDAG scheduler
+ // so that we can fix up the ARGUMENT instructions before anything else
+ // sees them in the wrong place.
+ addPass(createWebAssemblyArgumentMove());
+ // Set the p2align operands. This information is present during ISel, however
+ // it's inconvenient to collect. Collect it now, and update the immediate
+ // operands.
+ addPass(createWebAssemblySetP2AlignOperands());
+ return false;
+}
+
+/// Disable the generic post-register-allocation passes listed below, then
+/// defer to TargetPassConfig::addPostRegAlloc(). Per the comments here, the
+/// disabled passes do not support code that still contains virtual registers.
+void WebAssemblyPassConfig::addPostRegAlloc() {
+ // TODO: The following CodeGen passes don't currently support code containing
+ // virtual registers. Consider removing their restrictions and re-enabling
+ // them.
+
+ // Has no asserts of its own, but was not written to handle virtual regs.
+ disablePass(&ShrinkWrapID);
+
+ // These functions all require the NoVRegs property.
+ disablePass(&MachineCopyPropagationID);
+ disablePass(&PostRASchedulerID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&StackMapLivenessID);
+ disablePass(&LiveDebugValuesID);
+ disablePass(&PatchableFunctionID);
+
+ TargetPassConfig::addPostRegAlloc();
+}
+
+/// Add the late WebAssembly machine passes that run just before emission.
+///
+/// The base-class passes are added first. The WebAssembly passes then follow
+/// in a fixed order; several of the comments below state ordering constraints
+/// between them, so the sequence should not be rearranged casually. The
+/// LiveIntervals-based passes, register stackification, register coloring,
+/// and the final peephole pass are only added when optimizing.
+void WebAssemblyPassConfig::addPreEmitPass() {
+ TargetPassConfig::addPreEmitPass();
+
+ // Now that we have a prologue and epilogue and all frame indices are
+ // rewritten, eliminate SP and FP. This allows them to be stackified,
+ // colored, and numbered with the rest of the registers.
+ addPass(createWebAssemblyReplacePhysRegs());
+
+ // Rewrite pseudo call_indirect instructions as real instructions.
+ // This needs to run before register stackification, because we change the
+ // order of the arguments.
+ addPass(createWebAssemblyCallIndirectFixup());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ // LiveIntervals isn't commonly run this late. Re-establish preconditions.
+ addPass(createWebAssemblyPrepareForLiveIntervals());
+
+ // Depend on LiveIntervals and perform some optimizations on it.
+ addPass(createWebAssemblyOptimizeLiveIntervals());
+
+ // Prepare store instructions for register stackifying.
+ addPass(createWebAssemblyStoreResults());
+
+ // Mark registers as representing wasm's value stack. This is a key
+ // code-compression technique in WebAssembly. We run this pass (and
+ // StoreResults above) very late, so that it sees as much code as possible,
+ // including code emitted by PEI and expanded by late tail duplication.
+ addPass(createWebAssemblyRegStackify());
+
+ // Run the register coloring pass to reduce the total number of registers.
+ // This runs after stackification so that it doesn't consider registers
+ // that become stackified.
+ addPass(createWebAssemblyRegColoring());
+ }
+
+ // Insert explicit get_local and set_local operators.
+ addPass(createWebAssemblyExplicitLocals());
+
+ // Eliminate multiple-entry loops.
+ addPass(createWebAssemblyFixIrreducibleControlFlow());
+
+ // Put the CFG in structured form; insert BLOCK and LOOP markers.
+ addPass(createWebAssemblyCFGStackify());
+
+ // Lower br_unless into br_if.
+ addPass(createWebAssemblyLowerBrUnless());
+
+ // Perform the very last peephole optimizations on the code.
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createWebAssemblyPeephole());
+
+ // Create a mapping from LLVM CodeGen virtual registers to wasm registers.
+ addPass(createWebAssemblyRegNumbering());
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
new file mode 100644
index 000000000000..52a2ef78736a
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -0,0 +1,53 @@
+// WebAssemblyTargetMachine.h - Define TargetMachine for WebAssembly -*- C++ -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the WebAssembly-specific subclass of
+/// TargetMachine.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETMACHINE_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETMACHINE_H
+
+#include "WebAssemblySubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// LLVMTargetMachine subclass for WebAssembly. It owns the object-file
+/// lowering object (TLOF) and a string-keyed cache of subtargets
+/// (SubtargetMap).
+class WebAssemblyTargetMachine final : public LLVMTargetMachine {
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ // Declared mutable because the const getSubtargetImpl() inserts into it.
+ mutable StringMap<std::unique_ptr<WebAssemblySubtarget>> SubtargetMap;
+
+public:
+ WebAssemblyTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+
+ ~WebAssemblyTargetMachine() override;
+ /// Returns the (cached) subtarget for the given function.
+ const WebAssemblySubtarget *
+ getSubtargetImpl(const Function &F) const override;
+
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ /// Returns the object-file lowering owned by this target machine; ownership
+ /// is not transferred.
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+
+ /// \brief Get the TargetIRAnalysis for this target.
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+ /// Always reports false for this target.
+ bool usesPhysRegsForPEI() const override { return false; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
new file mode 100644
index 000000000000..74e33b93e00d
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
@@ -0,0 +1,24 @@
+//===-- WebAssemblyTargetObjectFile.cpp - WebAssembly Object Info ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the functions of the WebAssembly-specific subclass
+/// of TargetLoweringObjectFile.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTargetObjectFile.h"
+#include "WebAssemblyTargetMachine.h"
+using namespace llvm;
+
+/// Initialize the object-file lowering: run the ELF base-class
+/// initialization, then InitializeELF() with the UseInitArray setting taken
+/// from the target machine's options.
+void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
new file mode 100644
index 000000000000..39e50c9c575d
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
@@ -0,0 +1,30 @@
+//===-- WebAssemblyTargetObjectFile.h - WebAssembly Object Info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the WebAssembly-specific subclass of
+/// TargetLoweringObjectFile.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+/// WebAssembly object-file lowering. It derives from the ELF implementation
+/// and overrides only Initialize().
+class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileELF {
+public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
new file mode 100644
index 000000000000..bf546dab5fbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -0,0 +1,83 @@
+//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the WebAssembly-specific TargetTransformInfo
+/// implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasmtti"
+
+/// Report popcount support for a type of the given bit width. The width must
+/// be a power of two (asserted); the result is PSK_FastHardware for every
+/// width.
+TargetTransformInfo::PopcntSupportKind
+WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ return TargetTransformInfo::PSK_FastHardware;
+}
+
+/// Return the number of registers to assume for scalar or vector code. The
+/// base implementation's answer is used as-is for scalars; for vectors it is
+/// raised to at least 16, as a rough guess for SIMD.
+unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
+  const unsigned BaseCount = BaseT::getNumberOfRegisters(Vector);
+  if (!Vector)
+    return BaseCount;
+
+  // For SIMD, use at least 16 registers.
+  return std::max(BaseCount, 16u);
+}
+
+/// Return the register width in bits: 128 for vector queries when the
+/// subtarget has SIMD128, and 64 in every other case.
+unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) {
+  const bool UseSIMD128 = Vector && getST()->hasSIMD128();
+  return UseSIMD128 ? 128 : 64;
+}
+
+/// Estimate the cost of an arithmetic instruction.
+///
+/// The base implementation's cost is returned unchanged except for vector
+/// shifts (LShr, AShr, Shl) whose second operand is not a uniform value or
+/// uniform constant; those are costed as a per-element extract, scalar op,
+/// and insert.
+unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+
+  unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
+      Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+
+  VectorType *VTy = dyn_cast<VectorType>(Ty);
+  if (!VTy)
+    return Cost;
+
+  const bool IsShift = Opcode == Instruction::LShr ||
+                       Opcode == Instruction::AShr ||
+                       Opcode == Instruction::Shl;
+  if (!IsShift)
+    return Cost;
+
+  const bool UniformShiftCount = Opd2Info == TTI::OK_UniformValue ||
+                                 Opd2Info == TTI::OK_UniformConstantValue;
+  if (UniformShiftCount)
+    return Cost;
+
+  // SIMD128's shifts currently only accept a scalar shift count. For each
+  // element, we'll need to extract, op, insert. The following is a rough
+  // approximation.
+  Cost = VTy->getNumElements() *
+         (TargetTransformInfo::TCC_Basic +
+          getArithmeticInstrCost(Opcode, VTy->getElementType()) +
+          TargetTransformInfo::TCC_Basic);
+  return Cost;
+}
+
+/// Estimate the cost of a vector insert/extract. The base implementation's
+/// cost is used, with a penalty of 25 * TCC_Expensive added when Index is the
+/// -1u sentinel, because SIMD128's insert/extract currently only take
+/// constant indices.
+unsigned WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                unsigned Index) {
+  unsigned Cost = BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index);
+
+  const bool IndexIsSentinel = (Index == -1u);
+  if (IndexIsSentinel)
+    Cost += 25 * TargetTransformInfo::TCC_Expensive;
+
+  return Cost;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
new file mode 100644
index 000000000000..2a2e3941f82d
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -0,0 +1,72 @@
+//==- WebAssemblyTargetTransformInfo.h - WebAssembly-specific TTI -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines a TargetTransformInfo::Concept conforming object
+/// specific to the WebAssembly target machine.
+///
+/// It uses the target's detailed information to provide more precise answers to
+/// certain TTI queries, while letting the target independent and default TTI
+/// implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETTRANSFORMINFO_H
+
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include <algorithm>
+
+namespace llvm {
+
+/// WebAssembly implementation of the TTI hooks, layered on BasicTTIImplBase.
+/// It caches the subtarget and target lowering for the function it was
+/// constructed for.
+class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
+ typedef BasicTTIImplBase<WebAssemblyTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ const WebAssemblySubtarget *ST;
+ const WebAssemblyTargetLowering *TLI;
+
+ // Accessors for the cached subtarget and lowering; BaseT is a friend, so it
+ // can reach these private members.
+ const WebAssemblySubtarget *getST() const { return ST; }
+ const WebAssemblyTargetLowering *getTLI() const { return TLI; }
+
+public:
+ /// Binds this object to the subtarget TM selects for F and to that
+ /// subtarget's lowering; the data layout comes from F's parent module.
+ WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ /// \name Scalar TTI Implementations
+ /// @{
+
+ // TODO: Implement more Scalar TTI for WebAssembly
+
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ unsigned getNumberOfRegisters(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector);
+ unsigned getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+ /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
new file mode 100644
index 000000000000..a0049c147d2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -0,0 +1,71 @@
+//===-- WebAssemblyUtilities.cpp - WebAssembly Utility Functions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements several utility functions for WebAssembly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyUtilities.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+using namespace llvm;
+
+/// Test whether MI's opcode is one of the ARGUMENT_* opcodes listed below.
+bool WebAssembly::isArgument(const MachineInstr &MI) {
+  bool IsArg = false;
+  switch (MI.getOpcode()) {
+  case WebAssembly::ARGUMENT_I32:
+  case WebAssembly::ARGUMENT_I64:
+  case WebAssembly::ARGUMENT_F32:
+  case WebAssembly::ARGUMENT_F64:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4f32:
+    IsArg = true;
+    break;
+  default:
+    break;
+  }
+  return IsArg;
+}
+
+/// Test whether MI's opcode is one of COPY_I32, COPY_I64, COPY_F32, COPY_F64.
+bool WebAssembly::isCopy(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  return Opc == WebAssembly::COPY_I32 || Opc == WebAssembly::COPY_I64 ||
+         Opc == WebAssembly::COPY_F32 || Opc == WebAssembly::COPY_F64;
+}
+
+/// Test whether MI's opcode is one of TEE_I32, TEE_I64, TEE_F32, TEE_F64.
+bool WebAssembly::isTee(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  return Opc == WebAssembly::TEE_I32 || Opc == WebAssembly::TEE_I64 ||
+         Opc == WebAssembly::TEE_F32 || Opc == WebAssembly::TEE_F64;
+}
+
+/// Test whether MI is a child of some other node in an expression tree.
+///
+/// This holds when MI's first operand is an explicit register def of a
+/// virtual register that MFI reports as stackified.
+bool WebAssembly::isChild(const MachineInstr &MI,
+                          const WebAssemblyFunctionInfo &MFI) {
+  if (MI.getNumOperands() == 0)
+    return false;
+
+  const MachineOperand &FirstOp = MI.getOperand(0);
+  const bool IsExplicitRegDef =
+      FirstOp.isReg() && !FirstOp.isImplicit() && FirstOp.isDef();
+  if (!IsExplicitRegDef)
+    return false;
+
+  const unsigned DefReg = FirstOp.getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(DefReg))
+    return false;
+  return MFI.isVRegStackified(DefReg);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
new file mode 100644
index 000000000000..eb114403d14e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -0,0 +1,34 @@
+//===-- WebAssemblyUtilities - WebAssembly Utility Functions ---*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declaration of the WebAssembly-specific
+/// utility functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
+
+namespace llvm {
+
+class MachineInstr;
+class WebAssemblyFunctionInfo;
+
+namespace WebAssembly {
+
+/// Returns true if MI's opcode is one of the ARGUMENT_* opcodes.
+bool isArgument(const MachineInstr &MI);
+/// Returns true if MI's opcode is one of the COPY_* opcodes.
+bool isCopy(const MachineInstr &MI);
+/// Returns true if MI's opcode is one of the TEE_* opcodes.
+bool isTee(const MachineInstr &MI);
+/// Test whether MI is a child of some other node in an expression tree.
+bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
+
+} // end namespace WebAssembly
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
new file mode 100644
index 000000000000..8dd5e8a03e2e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -0,0 +1,68 @@
+# Tests which are known to fail from the GCC torture test suite.
+
+# Computed gotos are not supported (Cannot select BlockAddress/BRIND)
+20040302-1.c
+20071210-1.c
+920501-4.c
+920501-5.c
+comp-goto-1.c
+980526-1.c
+990208-1.c
+
+# WebAssembly hasn't implemented (will never?) __builtin_return_address
+20010122-1.c
+20030323-1.c
+20030811-1.c
+pr17377.c
+
+# Error: invalid output constraint '=t' in asm.
+990413-2.c
+
+# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target.
+built-in-setjmp.c
+pr60003.c
+
+# Error in the program / unsupported by Clang.
+scal-to-vec1.c
+scal-to-vec2.c
+scal-to-vec3.c
+20000822-1.c
+20010209-1.c
+20010605-1.c
+20030501-1.c
+20040520-1.c
+20061220-1.c
+20090219-1.c
+920415-1.c
+920428-2.c
+920501-7.c
+920612-2.c
+920721-4.c
+921017-1.c
+921215-1.c
+931002-1.c
+comp-goto-2.c
+nest-align-1.c
+nest-stdar-1.c
+nestfunc-1.c
+nestfunc-2.c
+nestfunc-3.c
+nestfunc-5.c
+nestfunc-6.c
+nestfunc-7.c
+pr22061-3.c
+pr22061-4.c
+pr24135.c
+pr51447.c
+20020412-1.c
+20040308-1.c
+20040423-1.c
+20041218-2.c
+20070919-1.c
+align-nest.c
+pr41935.c
+920302-1.c
+920501-3.c
+920728-1.c
+pr28865.c
+widechar-2.c
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
new file mode 100644
index 000000000000..c38a7d1dd44d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -0,0 +1,1077 @@
+//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmInstrumentation.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86Operand.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include <algorithm>
+#include <cassert>
+#include <vector>
+
+// Following comment describes how assembly instrumentation works.
+// Currently we have only AddressSanitizer instrumentation, but we're
+// planning to implement MemorySanitizer for inline assembly too. If
+// you're not familiar with AddressSanitizer algorithm, please, read
+// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm.
+//
+// When inline assembly is parsed by an instance of X86AsmParser, all
+// instructions are emitted via EmitInstruction method. That's the
+// place where X86AsmInstrumentation analyzes an instruction and
+// decides, whether the instruction should be emitted as is or
+// instrumentation is required. The latter case happens when an
+// instruction reads from or writes to memory. Now instruction opcode
+// is explicitly checked, and if an instruction has a memory operand
+// (for instance, movq (%rsi, %rcx, 8), %rax) - it should be
+// instrumented. There also exist instructions that modify
+// memory but don't have an explicit memory operand, for instance,
+// movs.
+//
+// Let's consider at first 8-byte memory accesses when an instruction
+// has an explicit memory operand. In this case we need two registers -
+// AddressReg to compute the address of the memory cells being accessed
+// and ShadowReg to compute corresponding shadow address. So, we need
+// to spill both registers before instrumentation code and restore them
+// after instrumentation. Thus, in general, instrumentation code will
+// look like this:
+// PUSHF # Store flags, otherwise they will be overwritten
+// PUSH AddressReg # spill AddressReg
+// PUSH ShadowReg # spill ShadowReg
+// LEA MemOp, AddressReg # compute address of the memory operand
+// MOV AddressReg, ShadowReg
+// SHR ShadowReg, 3
+// # ShadowOffset(AddressReg >> 3) contains address of a shadow
+// # corresponding to MemOp.
+// CMP ShadowOffset(ShadowReg), 0 # test shadow value
+// JZ .Done # when shadow equals to zero, everything is fine
+// MOV AddressReg, RDI
+// # Call __asan_report function with AddressReg as an argument
+// CALL __asan_report
+// .Done:
+// POP ShadowReg # Restore ShadowReg
+// POP AddressReg # Restore AddressReg
+// POPF # Restore flags
+//
+// Memory accesses of other sizes (1-, 2-, 4- and 16-byte) are
+// handled in a similar manner, but small memory accesses (less than 8
+// bytes) require an additional ScratchReg, which is used for shadow value.
+//
+// Suppose we're instrumenting an instruction like movs: only the
+// contents of RDI, RDI + AccessSize * RCX, RSI and RSI + AccessSize *
+// RCX are checked. In this case there's no need to spill and restore
+// AddressReg, ShadowReg or flags four times; they're saved on the stack
+// just once, before instrumentation of these four addresses, and restored
+// at the end of the instrumentation.
+//
+// There exist several things which complicate this simple algorithm.
+// * Instrumented memory operand can have RSP as a base or an index
+// register. So we need to add a constant offset before computation
+// of memory address, since flags, AddressReg, ShadowReg, etc. were
+// already stored on stack and RSP was modified.
+// * Debug info (usually, DWARF) should be adjusted, because sometimes
+// RSP is used as a frame register. So, we need to select some
+// register as a frame register and temporarily override the current CFA
+// register.
+
+namespace llvm {
+namespace {
+
+// Hidden command-line switch "asan-instrument-assembly". It defaults to false
+// and is consulted by CreateX86AsmInstrumentation when deciding whether to
+// build an AddressSanitizer-instrumenting object.
+static cl::opt<bool> ClAsanInstrumentAssembly(
+ "asan-instrument-assembly",
+ cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
+ cl::init(false));
+
+// Inclusive range of displacement values representable as a signed 32-bit
+// integer.
+const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min();
+const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max();
+
+// Clamps Displacement into
+// [MinAllowedDisplacement, MaxAllowedDisplacement] and returns the result.
+int64_t ApplyDisplacementBounds(int64_t Displacement) {
+  if (Displacement > MaxAllowedDisplacement)
+    return MaxAllowedDisplacement;
+  if (Displacement < MinAllowedDisplacement)
+    return MinAllowedDisplacement;
+  return Displacement;
+}
+
+// Asserts (when assertions are compiled in) that Displacement already lies
+// inside the allowed range.
+void CheckDisplacementBounds(int64_t Displacement) {
+  assert(MinAllowedDisplacement <= Displacement);
+  assert(Displacement <= MaxAllowedDisplacement);
+}
+
+// Returns true when Reg is the 64-bit (RSP) or the 32-bit (ESP) stack
+// pointer register.
+bool IsStackReg(unsigned Reg) {
+  switch (Reg) {
+  case X86::RSP:
+  case X86::ESP:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true for accesses narrower than 8 bytes; those are handled by the
+// InstrumentMemOperandSmall path instead of InstrumentMemOperandLarge.
+bool IsSmallMemAccess(unsigned AccessSize) {
+  const unsigned LargeAccessThreshold = 8;
+  return AccessSize < LargeAccessThreshold;
+}
+
+// Shared base of the 32- and 64-bit AddressSanitizer instrumentation. It
+// recognises MOV and MOVS instructions and drives the mode-specific hooks
+// (prologue, epilogue, small/large checks) that subclasses implement.
+class X86AddressSanitizer : public X86AsmInstrumentation {
+public:
+  // Set of registers that instrumentation code must treat as occupied. The
+  // first three entries are always the address, shadow and scratch
+  // registers (in that order); further entries are added via AddBusyReg().
+  // All entries are stored in their 64-bit form.
+  struct RegisterContext {
+  private:
+    // Fixed positions of the three working registers inside BusyRegs.
+    enum RegOffset {
+      REG_OFFSET_ADDRESS = 0,
+      REG_OFFSET_SHADOW,
+      REG_OFFSET_SCRATCH
+    };
+
+  public:
+    RegisterContext(unsigned AddressReg, unsigned ShadowReg,
+                    unsigned ScratchReg) {
+      const unsigned WorkingRegs[] = {AddressReg, ShadowReg, ScratchReg};
+      for (unsigned Reg : WorkingRegs)
+        BusyRegs.push_back(convReg(Reg, 64));
+    }
+
+    // Address register, converted to the requested bit width.
+    unsigned AddressReg(unsigned Size) const {
+      return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size);
+    }
+
+    // Shadow register, converted to the requested bit width.
+    unsigned ShadowReg(unsigned Size) const {
+      return convReg(BusyRegs[REG_OFFSET_SHADOW], Size);
+    }
+
+    // Scratch register, converted to the requested bit width; stays
+    // X86::NoRegister when no scratch register was supplied.
+    unsigned ScratchReg(unsigned Size) const {
+      return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size);
+    }
+
+    // Records Reg as busy. X86::NoRegister is ignored.
+    void AddBusyReg(unsigned Reg) {
+      if (Reg == X86::NoRegister)
+        return;
+      BusyRegs.push_back(convReg(Reg, 64));
+    }
+
+    // Records the base and index registers of memory operand Op as busy.
+    void AddBusyRegs(const X86Operand &Op) {
+      AddBusyReg(Op.getMemBaseReg());
+      AddBusyReg(Op.getMemIndexReg());
+    }
+
+    // Returns the first candidate register that is not busy, converted to
+    // Size bits, or X86::NoRegister when every candidate is busy.
+    unsigned ChooseFrameReg(unsigned Size) const {
+      static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
+                                              X86::RCX, X86::RDX, X86::RDI,
+                                              X86::RSI };
+      for (unsigned Reg : Candidates) {
+        const bool IsBusy =
+            std::find(BusyRegs.begin(), BusyRegs.end(), Reg) != BusyRegs.end();
+        if (!IsBusy)
+          return convReg(Reg, Size);
+      }
+      return X86::NoRegister;
+    }
+
+  private:
+    // Maps Reg to its Size-bit sub/super register; X86::NoRegister is passed
+    // through untouched.
+    unsigned convReg(unsigned Reg, unsigned Size) const {
+      if (Reg == X86::NoRegister)
+        return Reg;
+      return getX86SubSuperRegister(Reg, Size);
+    }
+
+    std::vector<unsigned> BusyRegs;
+  };
+
+  X86AddressSanitizer(const MCSubtargetInfo *&STI)
+      : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
+
+  ~X86AddressSanitizer() override = default;
+
+  // X86AsmInstrumentation implementation:
+  // Emits MOVS/MOV instrumentation for Inst and then Inst itself. A REP
+  // prefix is not emitted when it is seen; it is remembered in RepPrefix and
+  // emitted on the next call, after that instruction's MOVS instrumentation.
+  void InstrumentAndEmitInstruction(const MCInst &Inst,
+                                    OperandVector &Operands,
+                                    MCContext &Ctx,
+                                    const MCInstrInfo &MII,
+                                    MCStreamer &Out) override {
+    InstrumentMOVS(Inst, Operands, Ctx, MII, Out);
+    if (RepPrefix)
+      EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX));
+
+    InstrumentMOV(Inst, Operands, Ctx, MII, Out);
+
+    const bool SawRepPrefix = Inst.getOpcode() == X86::REP_PREFIX;
+    RepPrefix = SawRepPrefix;
+    if (SawRepPrefix)
+      return;
+    EmitInstruction(Out, Inst);
+  }
+
+  // Adjusts up stack and saves all registers used in instrumentation.
+  virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) = 0;
+
+  // Restores all registers used in instrumentation and adjusts stack.
+  virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) = 0;
+
+  // Mode-specific check for accesses for which IsSmallMemAccess() is true.
+  virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx, MCStreamer &Out) = 0;
+  // Mode-specific check for all remaining (larger) accesses.
+  virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx, MCStreamer &Out) = 0;
+
+  // Mode-specific instrumentation of a MOVS instruction of the given size.
+  virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+                                  MCStreamer &Out) = 0;
+
+  // Selects the small or large check for Op depending on AccessSize.
+  void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite,
+                            const RegisterContext &RegCtx, MCContext &Ctx,
+                            MCStreamer &Out);
+  // Checks the first and last elements of the MOVS source and destination.
+  void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg,
+                          unsigned AccessSize, MCContext &Ctx, MCStreamer &Out);
+
+  void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands,
+                      MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+  void InstrumentMOV(const MCInst &Inst, OperandVector &Operands,
+                     MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+
+protected:
+  void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
+
+  // Emits "LEA Op, Reg" using the 32- or 64-bit form selected by Size.
+  void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) {
+    assert(Size == 32 || Size == 64);
+    unsigned Opcode = X86::LEA64r;
+    if (Size == 32)
+      Opcode = X86::LEA32r;
+
+    MCInst Inst;
+    Inst.setOpcode(Opcode);
+    const unsigned DestReg = getX86SubSuperRegister(Reg, Size);
+    Inst.addOperand(MCOperand::createReg(DestReg));
+    Op.addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  void ComputeMemOperandAddress(X86Operand &Op, unsigned Size,
+                                unsigned Reg, MCContext &Ctx, MCStreamer &Out);
+
+  // Creates new memory operand with Displacement added to an original
+  // displacement. Residue will contain a residue which could happen when the
+  // total displacement exceeds 32-bit limitation.
+  std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op,
+                                              int64_t Displacement,
+                                              MCContext &Ctx, int64_t *Residue);
+
+  bool is64BitMode() const {
+    const auto &Features = STI->getFeatureBits();
+    return Features[X86::Mode64Bit];
+  }
+  bool is32BitMode() const {
+    const auto &Features = STI->getFeatureBits();
+    return Features[X86::Mode32Bit];
+  }
+  bool is16BitMode() const {
+    const auto &Features = STI->getFeatureBits();
+    return Features[X86::Mode16Bit];
+  }
+
+  // Pointer width in bits for the current subtarget mode.
+  unsigned getPointerWidth() {
+    if (is16BitMode())
+      return 16;
+    if (is32BitMode())
+      return 32;
+    if (is64BitMode())
+      return 64;
+    llvm_unreachable("invalid mode");
+  }
+
+  // True when previous instruction was actually REP prefix.
+  bool RepPrefix;
+
+  // Offset from the original SP register.
+  int64_t OrigSPOffset;
+};
+
+// Emits the shadow-memory check for memory operand Op, forwarding to the
+// small-access hook when IsSmallMemAccess(AccessSize) holds and to the
+// large-access hook otherwise.
+void X86AddressSanitizer::InstrumentMemOperand(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  assert(Op.isMem() && "Op should be a memory operand.");
+  assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
+         "AccessSize should be a power of two, less or equal than 16.");
+  // FIXME: take into account load/store alignment.
+  if (!IsSmallMemAccess(AccessSize)) {
+    InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
+    return;
+  }
+  InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
+}
+
+// Instruments a MOVS-style copy: inside a single prologue/epilogue pair it
+// checks (%SrcReg) and -1(%SrcReg, %CntReg, AccessSize) as reads, then
+// (%DstReg) and -1(%DstReg, %CntReg, AccessSize) as writes.
+void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
+                                             unsigned CntReg,
+                                             unsigned AccessSize,
+                                             MCContext &Ctx, MCStreamer &Out) {
+  // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)]
+  // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)].
+  // NOTE(review): the "last element" checks use a displacement of -1 rather
+  // than -AccessSize; confirm this is intended for AccessSize > 1.
+  const unsigned ScratchReg =
+      IsSmallMemAccess(AccessSize) ? X86::RBX : X86::NoRegister;
+  RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */,
+                         ScratchReg);
+  RegCtx.AddBusyReg(DstReg);
+  RegCtx.AddBusyReg(SrcReg);
+  RegCtx.AddBusyReg(CntReg);
+
+  // Builds the operand DispValue(%BaseReg, %IndexReg, AccessSize) and emits
+  // the check for it. An IndexReg of 0 means "no index register".
+  auto CheckElement = [&](unsigned BaseReg, unsigned IndexReg,
+                          int64_t DispValue, bool IsWrite) {
+    const MCExpr *Disp = MCConstantExpr::create(DispValue, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, BaseReg, IndexReg,
+                              AccessSize, SMLoc(), SMLoc()));
+    InstrumentMemOperand(*Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
+  };
+
+  InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
+
+  // Test (%SrcReg)
+  CheckElement(SrcReg, 0, 0, false /* IsWrite */);
+  // Test -1(%SrcReg, %CntReg, AccessSize)
+  CheckElement(SrcReg, CntReg, -1, false /* IsWrite */);
+  // Test (%DstReg)
+  CheckElement(DstReg, 0, 0, true /* IsWrite */);
+  // Test -1(%DstReg, %CntReg, AccessSize)
+  CheckElement(DstReg, CntReg, -1, true /* IsWrite */);
+
+  InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
+}
+
+// If Inst is one of MOVSB/MOVSW/MOVSL/MOVSQ, emits the MOVS instrumentation
+// for the matching element size (1, 2, 4 or 8 bytes). Any other opcode is
+// left alone. Operands and MII are not consulted.
+void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst,
+                                         OperandVector &Operands,
+                                         MCContext &Ctx, const MCInstrInfo &MII,
+                                         MCStreamer &Out) {
+  struct MOVSVariant {
+    unsigned Opcode;
+    unsigned AccessSize; // Access size in bytes.
+  };
+  const MOVSVariant Variants[] = {
+      {X86::MOVSB, 1}, {X86::MOVSW, 2}, {X86::MOVSL, 4}, {X86::MOVSQ, 8}};
+
+  const unsigned Opcode = Inst.getOpcode();
+  for (const MOVSVariant &Variant : Variants) {
+    if (Variant.Opcode != Opcode)
+      continue;
+    InstrumentMOVSImpl(Variant.AccessSize, Ctx, Out);
+    return;
+  }
+}
+
+// If Inst is one of the recognised MOV/MOVAPS/MOVAPD memory forms, wraps a
+// shadow check (prologue, check, epilogue) around every memory operand found
+// in Operands. Other opcodes are left alone.
+void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst,
+                                        OperandVector &Operands, MCContext &Ctx,
+                                        const MCInstrInfo &MII,
+                                        MCStreamer &Out) {
+  const unsigned Opcode = Inst.getOpcode();
+
+  // Access size in bytes.
+  unsigned AccessSize;
+  switch (Opcode) {
+  case X86::MOV8mi:
+  case X86::MOV8mr:
+  case X86::MOV8rm:
+    AccessSize = 1;
+    break;
+  case X86::MOV16mi:
+  case X86::MOV16mr:
+  case X86::MOV16rm:
+    AccessSize = 2;
+    break;
+  case X86::MOV32mi:
+  case X86::MOV32mr:
+  case X86::MOV32rm:
+    AccessSize = 4;
+    break;
+  case X86::MOV64mi32:
+  case X86::MOV64mr:
+  case X86::MOV64rm:
+    AccessSize = 8;
+    break;
+  case X86::MOVAPDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDrm:
+  case X86::MOVAPSrm:
+    AccessSize = 16;
+    break;
+  default:
+    return;
+  }
+
+  const bool IsWrite = MII.get(Opcode).mayStore();
+  // Only small accesses need a scratch register.
+  const unsigned ScratchReg =
+      IsSmallMemAccess(AccessSize) ? X86::RCX : X86::NoRegister;
+
+  for (const std::unique_ptr<MCParsedAsmOperand> &Parsed : Operands) {
+    assert(Parsed);
+    if (!Parsed->isMem())
+      continue;
+
+    X86Operand &MemOp = static_cast<X86Operand &>(*Parsed);
+    RegisterContext RegCtx(X86::RDI /* AddressReg */,
+                           X86::RAX /* ShadowReg */, ScratchReg);
+    RegCtx.AddBusyRegs(MemOp);
+    InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
+    InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out);
+    InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
+  }
+}
+
+// Emits LEA instruction(s) that leave the effective address of Op in Reg.
+// When the stack pointer is Op's base and/or index register, the address is
+// corrected by OrigSPOffset (scaled for the index) so that it refers to the
+// stack pointer value from before the instrumentation moved it. A correction
+// that does not fit into one displacement is applied by extra LEAs.
+void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
+                                                   unsigned Size,
+                                                   unsigned Reg, MCContext &Ctx,
+                                                   MCStreamer &Out) {
+  const int64_t BaseShift = IsStackReg(Op.getMemBaseReg()) ? OrigSPOffset : 0;
+  const int64_t IndexShift = IsStackReg(Op.getMemIndexReg())
+                                 ? OrigSPOffset * Op.getMemScale()
+                                 : 0;
+  const int64_t Displacement = -(BaseShift + IndexShift);
+
+  assert(Displacement >= 0);
+
+  // Emit Op as is.
+  if (Displacement == 0) {
+    EmitLEA(Op, Size, Reg, Out);
+    return;
+  }
+
+  int64_t Residue;
+  std::unique_ptr<X86Operand> NewOp =
+      AddDisplacement(Op, Displacement, Ctx, &Residue);
+  EmitLEA(*NewOp, Size, Reg, Out);
+
+  // Fold whatever did not fit into the operand's displacement into Reg, one
+  // in-range chunk at a time.
+  while (Residue != 0) {
+    const int64_t Chunk = ApplyDisplacementBounds(Residue);
+    const MCConstantExpr *Disp = MCConstantExpr::create(Chunk, Ctx);
+    std::unique_ptr<X86Operand> DispOp =
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(),
+                              SMLoc());
+    EmitLEA(*DispOp, Size, Reg, Out);
+    Residue -= Chunk;
+  }
+}
+
+// Returns a copy of memory operand Op whose displacement is increased by
+// Displacement, clamped to the allowed displacement range; the part that did
+// not fit is stored in *Residue. If Displacement is zero or Op's displacement
+// is not a constant expression, the operand is copied unchanged and *Residue
+// receives the whole Displacement.
+std::unique_ptr<X86Operand>
+X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement,
+                                     MCContext &Ctx, int64_t *Residue) {
+  assert(Displacement >= 0);
+
+  const MCExpr *OrigDisp = Op.getMemDisp();
+  const bool HasNonConstantDisp =
+      OrigDisp && OrigDisp->getKind() != MCExpr::Constant;
+
+  if (Displacement == 0 || HasNonConstantDisp) {
+    *Residue = Displacement;
+    return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(),
+                                 OrigDisp, Op.getMemBaseReg(),
+                                 Op.getMemIndexReg(), Op.getMemScale(),
+                                 SMLoc(), SMLoc());
+  }
+
+  const int64_t OrigDisplacement =
+      static_cast<const MCConstantExpr *>(OrigDisp)->getValue();
+  CheckDisplacementBounds(OrigDisplacement);
+
+  const int64_t TotalDisplacement = Displacement + OrigDisplacement;
+  const int64_t NewDisplacement = ApplyDisplacementBounds(TotalDisplacement);
+  CheckDisplacementBounds(NewDisplacement);
+
+  *Residue = TotalDisplacement - NewDisplacement;
+  const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx);
+  return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp,
+                               Op.getMemBaseReg(), Op.getMemIndexReg(),
+                               Op.getMemScale(), SMLoc(), SMLoc());
+}
+
+// AddressSanitizer instrumentation for 32-bit mode. Registers are spilled
+// and restored with 4-byte pushes/pops, and every such operation is mirrored
+// in OrigSPOffset.
+class X86AddressSanitizer32 : public X86AddressSanitizer {
+public:
+  // Displacement added to (address >> 3) when addressing shadow memory.
+  static const long kShadowOffset = 0x20000000;
+
+  X86AddressSanitizer32(const MCSubtargetInfo *&STI)
+      : X86AddressSanitizer(STI) {}
+
+  ~X86AddressSanitizer32() override {}
+
+  // Returns the 32-bit form of the current frame register, or
+  // X86::NoRegister when GetFrameRegGeneric() reports none.
+  unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
+    unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
+    if (FrameReg == X86::NoRegister)
+      return FrameReg;
+    return getX86SubSuperRegister(FrameReg, 32);
+  }
+
+  // Pushes Reg and records the 4-byte stack pointer decrease.
+  void SpillReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg));
+    OrigSPOffset -= 4;
+  }
+
+  // Pops into Reg and records the 4-byte stack pointer increase.
+  void RestoreReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg));
+    OrigSPOffset += 4;
+  }
+
+  // Pushes the flags register and records the stack pointer decrease.
+  void StoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+    OrigSPOffset -= 4;
+  }
+
+  // Pops the flags register and records the stack pointer increase.
+  void RestoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+    OrigSPOffset += 4;
+  }
+
+  // When a frame register is known, spills a free register, copies the
+  // frame register into it and makes it the CFA register. Then spills the
+  // address, shadow and (if present) scratch registers and the flags.
+  void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+                                    MCContext &Ctx,
+                                    MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (MRI && FrameReg != X86::NoRegister) {
+      SpillReg(Out, LocalFrameReg);
+      if (FrameReg == X86::ESP) {
+        Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */);
+        Out.EmitCFIRelOffset(
+            MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
+      }
+      EmitInstruction(
+          Out,
+          MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg));
+      Out.EmitCFIRememberState();
+      Out.EmitCFIDefCfaRegister(
+          MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
+    }
+
+    SpillReg(Out, RegCtx.AddressReg(32));
+    SpillReg(Out, RegCtx.ShadowReg(32));
+    if (RegCtx.ScratchReg(32) != X86::NoRegister)
+      SpillReg(Out, RegCtx.ScratchReg(32));
+    StoreFlags(Out);
+  }
+
+  // Undoes InstrumentMemOperandPrologue in reverse order.
+  void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+                                    MCContext &Ctx,
+                                    MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    RestoreFlags(Out);
+    if (RegCtx.ScratchReg(32) != X86::NoRegister)
+      RestoreReg(Out, RegCtx.ScratchReg(32));
+    RestoreReg(Out, RegCtx.ShadowReg(32));
+    RestoreReg(Out, RegCtx.AddressReg(32));
+
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
+      RestoreReg(Out, LocalFrameReg);
+      Out.EmitCFIRestoreState();
+      if (FrameReg == X86::ESP)
+        Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */);
+    }
+  }
+
+  void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+                                 bool IsWrite,
+                                 const RegisterContext &RegCtx,
+                                 MCContext &Ctx,
+                                 MCStreamer &Out) override;
+  void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+                                 bool IsWrite,
+                                 const RegisterContext &RegCtx,
+                                 MCContext &Ctx,
+                                 MCStreamer &Out) override;
+  void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+                          MCStreamer &Out) override;
+
+private:
+  // Emits the call to __asan_report_{load,store}<AccessSize>: clears the
+  // direction flag, emits EMMS, aligns ESP down to 16 bytes and pushes the
+  // faulting address as the argument.
+  void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+                          MCStreamer &Out, const RegisterContext &RegCtx) {
+    EmitInstruction(Out, MCInstBuilder(X86::CLD));
+    EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+    // ESP is a 32-bit register, so the 32-bit AND opcode is required here;
+    // the 64-bit opcode is not valid for 32-bit mode.
+    EmitInstruction(Out, MCInstBuilder(X86::AND32ri8)
+                             .addReg(X86::ESP)
+                             .addReg(X86::ESP)
+                             .addImm(-16));
+    EmitInstruction(
+        Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32)));
+
+    MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+                                            (IsWrite ? "store" : "load") +
+                                            llvm::Twine(AccessSize));
+    const MCSymbolRefExpr *FnExpr =
+        MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+    EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
+  }
+};
+
+// Emits the 32-bit check for an access of 1, 2 or 4 bytes to Op. The emitted
+// code loads the shadow byte at kShadowOffset + (address >> 3). If that byte
+// is zero it jumps to the done label. Otherwise it compares
+// (address & 7) + AccessSize - 1 with the sign-extended shadow byte and
+// calls the report function unless the former is lower.
+// RegCtx must provide a scratch register.
+void X86AddressSanitizer32::InstrumentMemOperandSmall(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+ unsigned AddressRegI32 = RegCtx.AddressReg(32);
+ unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+ unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
+
+ assert(RegCtx.ScratchReg(32) != X86::NoRegister);
+ unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
+
+ // AddressReg = effective address of Op.
+ ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
+
+ // ShadowReg = AddressReg >> 3.
+ EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
+ AddressRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
+ .addReg(ShadowRegI32)
+ .addReg(ShadowRegI32)
+ .addImm(3));
+
+ // Load the shadow byte from kShadowOffset(ShadowReg) into ShadowReg's low
+ // byte. The local Op below shadows the parameter of the same name.
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::MOV8rm);
+ Inst.addOperand(MCOperand::createReg(ShadowRegI8));
+ const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
+ SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ // A zero shadow byte skips the rest of the check.
+ EmitInstruction(
+ Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ // ScratchReg = AddressReg & 7.
+ EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
+ AddressRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
+ .addReg(ScratchRegI32)
+ .addReg(ScratchRegI32)
+ .addImm(7));
+
+ // ScratchReg += AccessSize - 1 (nothing to add for 1-byte accesses).
+ switch (AccessSize) {
+ default: llvm_unreachable("Incorrect access size");
+ case 1:
+ break;
+ case 2: {
+ const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
+ SMLoc(), SMLoc()));
+ EmitLEA(*Op, 32, ScratchRegI32, Out);
+ break;
+ }
+ case 4:
+ EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
+ .addReg(ScratchRegI32)
+ .addReg(ScratchRegI32)
+ .addImm(3));
+ break;
+ }
+
+ // Sign-extend the shadow byte and skip the report when ScratchReg is lower.
+ EmitInstruction(
+ Out,
+ MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
+ EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
+ ShadowRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
+
+ EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+ EmitLabel(Out, DoneSym);
+}
+
+// Emits the 32-bit check for an 8- or 16-byte access to Op. The emitted code
+// compares the shadow memory at kShadowOffset + (address >> 3) with zero,
+// using CMP8mi for 8-byte accesses and CMP16mi for 16-byte accesses. It calls
+// the report function when the comparison is not equal.
+void X86AddressSanitizer32::InstrumentMemOperandLarge(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+ unsigned AddressRegI32 = RegCtx.AddressReg(32);
+ unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+
+ // AddressReg = effective address of Op.
+ ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
+
+ // ShadowReg = AddressReg >> 3.
+ EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
+ AddressRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
+ .addReg(ShadowRegI32)
+ .addReg(ShadowRegI32)
+ .addImm(3));
+ // Compare the shadow memory at kShadowOffset(ShadowReg) against 0. The
+ // local Op below shadows the parameter of the same name.
+ {
+ MCInst Inst;
+ switch (AccessSize) {
+ default: llvm_unreachable("Incorrect access size");
+ case 8:
+ Inst.setOpcode(X86::CMP8mi);
+ break;
+ case 16:
+ Inst.setOpcode(X86::CMP16mi);
+ break;
+ }
+ const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
+ SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ Inst.addOperand(MCOperand::createImm(0));
+ EmitInstruction(Out, Inst);
+ }
+ // Equal (zero shadow) skips the report call.
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+ EmitLabel(Out, DoneSym);
+}
+
+// Emits the 32-bit MOVS instrumentation. Flags are saved around the whole
+// sequence, and the element checks on EDI/ESI/ECX are jumped over when ECX
+// is zero.
+void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize,
+                                               MCContext &Ctx,
+                                               MCStreamer &Out) {
+  MCSymbol *SkipSym = Ctx.createTempSymbol();
+  const MCExpr *SkipTarget = MCSymbolRefExpr::create(SkipSym, Ctx);
+
+  StoreFlags(Out);
+
+  // No need to test when ECX is equals to zero.
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(SkipTarget));
+
+  // Instrument first and last elements in src and dst range.
+  InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */,
+                     X86::ECX /* CntReg */, AccessSize, Ctx, Out);
+
+  EmitLabel(Out, SkipSym);
+  RestoreFlags(Out);
+}
+
+// AddressSanitizer instrumentation for 64-bit mode. Registers are spilled
+// and restored with 8-byte pushes/pops, and every stack pointer change is
+// mirrored in OrigSPOffset.
+class X86AddressSanitizer64 : public X86AddressSanitizer {
+public:
+  // Displacement added to (address >> 3) when addressing shadow memory.
+  static const long kShadowOffset = 0x7fff8000;
+
+  X86AddressSanitizer64(const MCSubtargetInfo *&STI)
+      : X86AddressSanitizer(STI) {}
+
+  ~X86AddressSanitizer64() override {}
+
+  // Returns the 64-bit form of the current frame register, or
+  // X86::NoRegister when GetFrameRegGeneric() reports none.
+  unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
+    unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
+    if (FrameReg == X86::NoRegister)
+      return FrameReg;
+    return getX86SubSuperRegister(FrameReg, 64);
+  }
+
+  // Pushes Reg and records the 8-byte stack pointer decrease.
+  void SpillReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg));
+    OrigSPOffset -= 8;
+  }
+
+  // Pops into Reg and records the 8-byte stack pointer increase.
+  void RestoreReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg));
+    OrigSPOffset += 8;
+  }
+
+  // Pushes the flags register and records the stack pointer decrease.
+  void StoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
+    OrigSPOffset -= 8;
+  }
+
+  // Pops the flags register and records the stack pointer increase.
+  void RestoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::POPF64));
+    OrigSPOffset += 8;
+  }
+
+  // When a frame register is known, spills a free register, copies the
+  // frame register into it and makes it the CFA register. Then moves RSP
+  // down by 128 bytes and spills the shadow, address and (if present)
+  // scratch registers and the flags.
+  void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+                                    MCContext &Ctx,
+                                    MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (MRI && FrameReg != X86::NoRegister) {
+      // Spill the register that is overwritten below and popped again in
+      // the epilogue. ChooseFrameReg() does not return RBP when RBP is
+      // busy, so spilling RBP unconditionally would lose the original
+      // value of LocalFrameReg in that case.
+      SpillReg(Out, LocalFrameReg);
+      if (FrameReg == X86::RSP) {
+        Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */);
+        Out.EmitCFIRelOffset(
+            MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
+      }
+      EmitInstruction(
+          Out,
+          MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg));
+      Out.EmitCFIRememberState();
+      Out.EmitCFIDefCfaRegister(
+          MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
+    }
+
+    EmitAdjustRSP(Ctx, Out, -128);
+    SpillReg(Out, RegCtx.ShadowReg(64));
+    SpillReg(Out, RegCtx.AddressReg(64));
+    if (RegCtx.ScratchReg(64) != X86::NoRegister)
+      SpillReg(Out, RegCtx.ScratchReg(64));
+    StoreFlags(Out);
+  }
+
+  // Undoes InstrumentMemOperandPrologue in reverse order.
+  void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+                                    MCContext &Ctx,
+                                    MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    RestoreFlags(Out);
+    if (RegCtx.ScratchReg(64) != X86::NoRegister)
+      RestoreReg(Out, RegCtx.ScratchReg(64));
+    RestoreReg(Out, RegCtx.AddressReg(64));
+    RestoreReg(Out, RegCtx.ShadowReg(64));
+    EmitAdjustRSP(Ctx, Out, 128);
+
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
+      RestoreReg(Out, LocalFrameReg);
+      Out.EmitCFIRestoreState();
+      if (FrameReg == X86::RSP)
+        Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */);
+    }
+  }
+
+  void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+                                 bool IsWrite,
+                                 const RegisterContext &RegCtx,
+                                 MCContext &Ctx,
+                                 MCStreamer &Out) override;
+  void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+                                 bool IsWrite,
+                                 const RegisterContext &RegCtx,
+                                 MCContext &Ctx,
+                                 MCStreamer &Out) override;
+  void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+                          MCStreamer &Out) override;
+
+private:
+  // Emits "LEA Offset(%rsp), %rsp" and records the change in OrigSPOffset.
+  // LEA is used so that the flags are left untouched.
+  void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
+    const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1,
+                              SMLoc(), SMLoc()));
+    EmitLEA(*Op, 64, X86::RSP, Out);
+    OrigSPOffset += Offset;
+  }
+
+  // Emits the call to __asan_report_{load,store}<AccessSize>: clears the
+  // direction flag, emits EMMS, aligns RSP down to 16 bytes and moves the
+  // faulting address into RDI when it is not already there.
+  void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+                          MCStreamer &Out, const RegisterContext &RegCtx) {
+    EmitInstruction(Out, MCInstBuilder(X86::CLD));
+    EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+    EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
+                             .addReg(X86::RSP)
+                             .addReg(X86::RSP)
+                             .addImm(-16));
+
+    if (RegCtx.AddressReg(64) != X86::RDI) {
+      EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
+                               RegCtx.AddressReg(64)));
+    }
+    MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+                                            (IsWrite ? "store" : "load") +
+                                            llvm::Twine(AccessSize));
+    const MCSymbolRefExpr *FnExpr =
+        MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+    EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
+  }
+};
+
+// Emits the 64-bit check for an access of 1, 2 or 4 bytes to Op. The emitted
+// code loads the shadow byte at kShadowOffset + (address >> 3). If that byte
+// is zero it jumps to the done label. Otherwise it compares
+// (address & 7) + AccessSize - 1 with the sign-extended shadow byte and
+// calls the report function unless the former is lower.
+// RegCtx must provide a scratch register.
+void X86AddressSanitizer64::InstrumentMemOperandSmall(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+ unsigned AddressRegI64 = RegCtx.AddressReg(64);
+ unsigned AddressRegI32 = RegCtx.AddressReg(32);
+ unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
+ unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+ unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
+
+ assert(RegCtx.ScratchReg(32) != X86::NoRegister);
+ unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
+
+ // AddressReg = effective address of Op.
+ ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
+
+ // ShadowReg = AddressReg >> 3.
+ EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
+ AddressRegI64));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
+ .addReg(ShadowRegI64)
+ .addReg(ShadowRegI64)
+ .addImm(3));
+ // Load the shadow byte from kShadowOffset(ShadowReg) into ShadowReg's low
+ // byte. The local Op below shadows the parameter of the same name.
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::MOV8rm);
+ Inst.addOperand(MCOperand::createReg(ShadowRegI8));
+ const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
+ SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ // A zero shadow byte skips the rest of the check.
+ EmitInstruction(
+ Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ // ScratchReg = AddressReg & 7 (only the low 32 bits are needed).
+ EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
+ AddressRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
+ .addReg(ScratchRegI32)
+ .addReg(ScratchRegI32)
+ .addImm(7));
+
+ // ScratchReg += AccessSize - 1 (nothing to add for 1-byte accesses).
+ switch (AccessSize) {
+ default: llvm_unreachable("Incorrect access size");
+ case 1:
+ break;
+ case 2: {
+ const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
+ SMLoc(), SMLoc()));
+ EmitLEA(*Op, 32, ScratchRegI32, Out);
+ break;
+ }
+ case 4:
+ EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
+ .addReg(ScratchRegI32)
+ .addReg(ScratchRegI32)
+ .addImm(3));
+ break;
+ }
+
+ // Sign-extend the shadow byte and skip the report when ScratchReg is lower.
+ EmitInstruction(
+ Out,
+ MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
+ EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
+ ShadowRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
+
+ EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+ EmitLabel(Out, DoneSym);
+}
+
+// Emits the 64-bit check for an 8- or 16-byte access to Op. The emitted code
+// compares the shadow memory at kShadowOffset + (address >> 3) with zero,
+// using CMP8mi for 8-byte accesses and CMP16mi for 16-byte accesses. It calls
+// the report function when the comparison is not equal.
+void X86AddressSanitizer64::InstrumentMemOperandLarge(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+ unsigned AddressRegI64 = RegCtx.AddressReg(64);
+ unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
+
+ // AddressReg = effective address of Op.
+ ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
+
+ // ShadowReg = AddressReg >> 3.
+ EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
+ AddressRegI64));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
+ .addReg(ShadowRegI64)
+ .addReg(ShadowRegI64)
+ .addImm(3));
+ // Compare the shadow memory at kShadowOffset(ShadowReg) against 0. The
+ // local Op below shadows the parameter of the same name.
+ {
+ MCInst Inst;
+ switch (AccessSize) {
+ default: llvm_unreachable("Incorrect access size");
+ case 8:
+ Inst.setOpcode(X86::CMP8mi);
+ break;
+ case 16:
+ Inst.setOpcode(X86::CMP16mi);
+ break;
+ }
+ const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
+ SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ Inst.addOperand(MCOperand::createImm(0));
+ EmitInstruction(Out, Inst);
+ }
+
+ // Equal (zero shadow) skips the report call.
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+ EmitLabel(Out, DoneSym);
+}
+
+// Emits the 64-bit MOVS instrumentation. Flags are saved around the whole
+// sequence, and the element checks on RDI/RSI/RCX are jumped over when RCX
+// is zero.
+void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
+                                               MCContext &Ctx,
+                                               MCStreamer &Out) {
+  MCSymbol *SkipSym = Ctx.createTempSymbol();
+  const MCExpr *SkipTarget = MCSymbolRefExpr::create(SkipSym, Ctx);
+
+  StoreFlags(Out);
+
+  // No need to test when RCX is equals to zero.
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(SkipTarget));
+
+  // Instrument first and last elements in src and dst range.
+  InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */,
+                     X86::RCX /* CntReg */, AccessSize, Ctx, Out);
+
+  EmitLabel(Out, SkipSym);
+  RestoreFlags(Out);
+}
+
+} // End anonymous namespace
+
+// Binds the instrumentation to the caller's subtarget-info pointer (held by
+// reference) and starts with no explicitly set initial frame register.
+X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI)
+    : STI(STI),
+      InitialFrameReg(0) {
+}
+
+X86AsmInstrumentation::~X86AsmInstrumentation() = default;
+
+// Default behaviour: no instrumentation, Inst is streamed out unchanged for
+// the current subtarget.
+void X86AsmInstrumentation::InstrumentAndEmitInstruction(
+    const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+    const MCInstrInfo &MII, MCStreamer &Out) {
+  Out.EmitInstruction(Inst, *STI);
+}
+
+// Streams Inst to Out for the subtarget currently referenced by STI.
+void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out,
+                                            const MCInst &Inst) {
+  const MCSubtargetInfo &Subtarget = *STI;
+  Out.EmitInstruction(Inst, Subtarget);
+}
+
+// Returns the register that currently serves as frame register, or
+// X86::NoRegister when there is no open dwarf frame or no register info.
+// An explicitly set InitialFrameReg takes precedence over the CFA register
+// of the active dwarf frame.
+unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
+                                                   MCStreamer &Out) {
+  // No active dwarf frame
+  if (Out.getNumFrameInfos() == 0)
+    return X86::NoRegister;
+
+  const MCDwarfFrameInfo &ActiveFrame = Out.getDwarfFrameInfos().back();
+  // Active dwarf frame is closed
+  if (ActiveFrame.End)
+    return X86::NoRegister;
+
+  const MCRegisterInfo *RegInfo = Ctx.getRegisterInfo();
+  // No register info
+  if (RegInfo == nullptr)
+    return X86::NoRegister;
+
+  // FrameReg is set explicitly, we're instrumenting a MachineFunction.
+  if (InitialFrameReg != 0)
+    return InitialFrameReg;
+
+  return RegInfo->getLLVMRegNum(ActiveFrame.CurrentCfaRegister,
+                                true /* IsEH */);
+}
+
+// Creates the instrumentation object for the given options and subtarget.
+// An AddressSanitizer variant is returned only when the
+// asan-instrument-assembly option is set, the target OS is Linux,
+// MCOptions.SanitizeAddress is set and the subtarget is in 32- or 64-bit
+// mode; otherwise a pass-through X86AsmInstrumentation is returned. The
+// result is allocated with new.
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                            const MCContext &Ctx, const MCSubtargetInfo *&STI) {
+  const Triple TargetTriple(STI->getTargetTriple());
+  const bool hasCompilerRTSupport = TargetTriple.isOSLinux();
+  const bool UseAsan = ClAsanInstrumentAssembly && hasCompilerRTSupport &&
+                       MCOptions.SanitizeAddress;
+  if (!UseAsan)
+    return new X86AsmInstrumentation(STI);
+
+  const auto &Features = STI->getFeatureBits();
+  if (Features[X86::Mode32Bit] != 0)
+    return new X86AddressSanitizer32(STI);
+  if (Features[X86::Mode64Bit] != 0)
+    return new X86AddressSanitizer64(STI);
+  return new X86AsmInstrumentation(STI);
+}
+
+} // end llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
new file mode 100644
index 000000000000..470ceadb0aa6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -0,0 +1,68 @@
+//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
+
+#include "llvm/ADT/SmallVector.h"
+
+#include <memory>
+
+namespace llvm {
+
+class MCContext;
+class MCInst;
+class MCInstrInfo;
+class MCParsedAsmOperand;
+class MCStreamer;
+class MCSubtargetInfo;
+class MCTargetOptions;
+
+class X86AsmInstrumentation;
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx,
+ const MCSubtargetInfo *&STI);
+
+// Base class for X86 assembly instrumentation. Instances are obtained through
+// CreateX86AsmInstrumentation(); the constructor is protected and the factory
+// is a friend.
+class X86AsmInstrumentation {
+public:
+ virtual ~X86AsmInstrumentation();
+
+ // Records the frame register to report for the current frame. A non-zero
+ // value overrides the register derived from the dwarf frame info in
+ // GetFrameRegGeneric().
+ void SetInitialFrameRegister(unsigned RegNo) {
+ InitialFrameReg = RegNo;
+ }
+
+ // Tries to instrument and emit instruction.
+ // Virtual so that subclasses can add instrumentation around Inst.
+ virtual void InstrumentAndEmitInstruction(
+ const MCInst &Inst,
+ SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands,
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+
+protected:
+ friend X86AsmInstrumentation *
+ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx,
+ const MCSubtargetInfo *&STI);
+
+ // STI is taken by reference-to-pointer and stored as such (see the STI
+ // member below).
+ X86AsmInstrumentation(const MCSubtargetInfo *&STI);
+
+ // Returns the frame register for the currently open dwarf frame, if any.
+ unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out);
+
+ // Emits Inst on Out without adding instrumentation.
+ void EmitInstruction(MCStreamer &Out, const MCInst &Inst);
+
+ // Reference to the owner's subtarget-info pointer, dereferenced at emission
+ // time. NOTE(review): presumably a reference so that later changes to the
+ // owner's pointer are observed here -- confirm against the caller.
+ const MCSubtargetInfo *&STI;
+
+ // Explicitly configured frame register; 0 means "not set".
+ unsigned InitialFrameReg;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 000000000000..e692118f47fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,3184 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86AsmInstrumentation.h"
+#include "X86AsmParserCommon.h"
+#include "X86Operand.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <memory>
+
+using namespace llvm;
+
+namespace {
+
+// Operator precedence table for InfixCalculator, indexed by the numeric value
+// of an InfixCalculatorTok enumerator. A larger number means the operator is
+// pushed on top of (rather than unwinding) a lower-numbered one. The entries
+// must stay in the same order as the InfixCalculatorTok enumerators.
+static const char OpPrecedence[] = {
+ 0, // IC_OR
+ 1, // IC_XOR
+ 2, // IC_AND
+ 3, // IC_LSHIFT
+ 3, // IC_RSHIFT
+ 4, // IC_PLUS
+ 4, // IC_MINUS
+ 5, // IC_MULTIPLY
+ 5, // IC_DIVIDE
+ 6, // IC_RPAREN
+ 7, // IC_LPAREN
+ 0, // IC_IMM
+ 0 // IC_REGISTER
+};
+
+class X86AsmParser : public MCTargetAsmParser {
+ const MCInstrInfo &MII;
+ ParseInstructionInfo *InstInfo;
+ std::unique_ptr<X86AsmInstrumentation> Instrumentation;
+ bool Code16GCC;
+
+private:
+  /// Lexes past the current token and returns the location where it started.
+  SMLoc consumeToken() {
+    MCAsmParser &P = getParser();
+    const SMLoc TokLoc = P.getTok().getLoc();
+    P.Lex();
+    return TokLoc;
+  }
+
+  /// Runs the generated matcher, temporarily switching to 32-bit mode when
+  /// Code16GCC is active, and returns the matcher's result code.
+  unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst,
+                            uint64_t &ErrorInfo, bool matchingInlineAsm,
+                            unsigned VariantID = 0) {
+    if (!Code16GCC)
+      return MatchInstructionImpl(Operands, Inst, ErrorInfo, matchingInlineAsm,
+                                  VariantID);
+
+    // In Code16GCC mode, match as 32-bit and then restore 16-bit mode.
+    SwitchMode(X86::Mode32Bit);
+    const unsigned MatchResult = MatchInstructionImpl(
+        Operands, Inst, ErrorInfo, matchingInlineAsm, VariantID);
+    SwitchMode(X86::Mode16Bit);
+    return MatchResult;
+  }
+
+ // Token kinds understood by InfixCalculator: operators, parentheses and the
+ // two operand kinds (IC_IMM, IC_REGISTER). The numeric values are used as
+ // indices into OpPrecedence, so the order here must match that table.
+ enum InfixCalculatorTok {
+ IC_OR = 0,
+ IC_XOR,
+ IC_AND,
+ IC_LSHIFT,
+ IC_RSHIFT,
+ IC_PLUS,
+ IC_MINUS,
+ IC_MULTIPLY,
+ IC_DIVIDE,
+ IC_RPAREN,
+ IC_LPAREN,
+ IC_IMM,
+ IC_REGISTER
+ };
+
+ class InfixCalculator {
+ typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
+ SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
+ SmallVector<ICToken, 4> PostfixStack;
+
+ public:
+    /// Removes the most recently pushed postfix entry, which must be an
+    /// immediate or register operand, and returns its value.
+    int64_t popOperand() {
+      assert(!PostfixStack.empty() && "Popped an empty stack!");
+      ICToken Op = PostfixStack.pop_back_val();
+      assert((Op.first == IC_IMM || Op.first == IC_REGISTER) &&
+             "Expected an immediate or register!");
+      return Op.second;
+    }
+    /// Appends an operand (IC_IMM or IC_REGISTER) with value Val to the
+    /// postfix sequence.
+    void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) {
+      assert((Op == IC_IMM || Op == IC_REGISTER) && "Unexpected operand!");
+      PostfixStack.push_back(ICToken(Op, Val));
+    }
+
+    /// Discards the operator on top of the infix operator stack. The stack
+    /// must not be empty; this is checked like in popOperand().
+    void popOperator() {
+      assert(!InfixOperatorStack.empty() && "Popped an empty operator stack!");
+      InfixOperatorStack.pop_back();
+    }
+ // Pushes operator Op onto the infix operator stack. Before pushing, any
+ // operators on the stack that must be applied first (same or higher
+ // OpPrecedence) are moved to the postfix sequence; matched parentheses
+ // encountered while unwinding are dropped.
+ void pushOperator(InfixCalculatorTok Op) {
+ // Push the new operator if the stack is empty.
+ if (InfixOperatorStack.empty()) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // Push the new operator if it has a higher precedence than the operator
+ // on the top of the stack or the operator on the top of the stack is a
+ // left parentheses.
+ unsigned Idx = InfixOperatorStack.size() - 1;
+ InfixCalculatorTok StackOp = InfixOperatorStack[Idx];
+ if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // The operator on the top of the stack has higher precedence than the
+ // new operator.
+ // ParenCount is the number of IC_RPAREN entries popped below that have not
+ // yet been matched by a popped IC_LPAREN.
+ unsigned ParenCount = 0;
+ while (1) {
+ // Nothing to process.
+ if (InfixOperatorStack.empty())
+ break;
+
+ Idx = InfixOperatorStack.size() - 1;
+ StackOp = InfixOperatorStack[Idx];
+ // Stop at a lower-precedence operator, unless we are still inside a
+ // parenthesized group that has to be unwound completely.
+ if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount))
+ break;
+
+ // If we have an even parentheses count and we see a left parentheses,
+ // then stop processing.
+ if (!ParenCount && StackOp == IC_LPAREN)
+ break;
+
+ if (StackOp == IC_RPAREN) {
+ ++ParenCount;
+ InfixOperatorStack.pop_back();
+ } else if (StackOp == IC_LPAREN) {
+ --ParenCount;
+ InfixOperatorStack.pop_back();
+ } else {
+ // A real operator: move it to the postfix sequence.
+ InfixOperatorStack.pop_back();
+ PostfixStack.push_back(std::make_pair(StackOp, 0));
+ }
+ }
+ // Push the new operator.
+ InfixOperatorStack.push_back(Op);
+ }
+
+    /// Evaluates the accumulated expression and returns its value.
+    ///
+    /// Operators still pending on the infix stack are first flushed to the
+    /// postfix sequence (parentheses are dropped); the postfix sequence is
+    /// then evaluated with a local operand stack. Returns 0 when nothing was
+    /// pushed. Every operator except plus and minus requires two immediate
+    /// operands (asserted). A zero divisor is a fatal error.
+    int64_t execute() {
+      // Push any remaining operators onto the postfix stack.
+      while (!InfixOperatorStack.empty()) {
+        InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val();
+        if (StackOp != IC_LPAREN && StackOp != IC_RPAREN)
+          PostfixStack.push_back(std::make_pair(StackOp, 0));
+      }
+
+      if (PostfixStack.empty())
+        return 0;
+
+      SmallVector<ICToken, 16> OperandStack;
+      for (const ICToken &Op : PostfixStack) {
+        if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
+          OperandStack.push_back(Op);
+          continue;
+        }
+
+        // Binary operator: the right-hand operand is on top of the stack.
+        assert(OperandStack.size() > 1 && "Too few operands.");
+        int64_t Val = 0;
+        ICToken Op2 = OperandStack.pop_back_val();
+        ICToken Op1 = OperandStack.pop_back_val();
+        switch (Op.first) {
+        default:
+          report_fatal_error("Unexpected operator!");
+          break;
+        case IC_PLUS:
+          Val = Op1.second + Op2.second;
+          break;
+        case IC_MINUS:
+          Val = Op1.second - Op2.second;
+          break;
+        case IC_MULTIPLY:
+          assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                 "Multiply operation with an immediate and a register!");
+          Val = Op1.second * Op2.second;
+          break;
+        case IC_DIVIDE:
+          assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                 "Divide operation with an immediate and a register!");
+          // The divisor comes from the parsed expression, so it has to be
+          // rejected in builds without assertions as well; an assert alone
+          // would leave the division by zero to execute there.
+          if (Op2.second == 0)
+            report_fatal_error("Division by zero!");
+          Val = Op1.second / Op2.second;
+          break;
+        case IC_OR:
+          assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                 "Or operation with an immediate and a register!");
+          Val = Op1.second | Op2.second;
+          break;
+        case IC_XOR:
+          assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                 "Xor operation with an immediate and a register!");
+          Val = Op1.second ^ Op2.second;
+          break;
+        case IC_AND:
+          assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                 "And operation with an immediate and a register!");
+          Val = Op1.second & Op2.second;
+          break;
+        case IC_LSHIFT:
+          assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                 "Left shift operation with an immediate and a register!");
+          Val = Op1.second << Op2.second;
+          break;
+        case IC_RSHIFT:
+          assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                 "Right shift operation with an immediate and a register!");
+          Val = Op1.second >> Op2.second;
+          break;
+        }
+        // Every operator yields an immediate result.
+        OperandStack.push_back(std::make_pair(IC_IMM, Val));
+      }
+      assert(OperandStack.size() == 1 && "Expected a single result.");
+      return OperandStack.pop_back_val().second;
+    }
+ };
+
+ // States of IntelExprStateMachine. The on*() handlers move the machine into
+ // the state named after the token they accepted; IES_ERROR is entered when a
+ // token is not legal in the current state.
+ enum IntelExprState {
+ IES_OR,
+ IES_XOR,
+ IES_AND,
+ IES_LSHIFT,
+ IES_RSHIFT,
+ IES_PLUS,
+ IES_MINUS,
+ IES_NOT,
+ IES_MULTIPLY,
+ IES_DIVIDE,
+ IES_LBRAC,
+ IES_RBRAC,
+ IES_LPAREN,
+ IES_RPAREN,
+ IES_REGISTER,
+ IES_INTEGER,
+ IES_IDENTIFIER,
+ IES_ERROR
+ };
+
+ class IntelExprStateMachine {
+ IntelExprState State, PrevState;
+ unsigned BaseReg, IndexReg, TmpReg, Scale;
+ int64_t Imm;
+ const MCExpr *Sym;
+ StringRef SymName;
+ bool StopOnLBrac, AddImmPrefix;
+ InfixCalculator IC;
+ InlineAsmIdentifierInfo Info;
+
+ public:
+ // Starts the machine in IES_PLUS (as if a '+' had just been seen) with no
+ // base/index register, a scale of 1 and an initial displacement of imm.
+ // stoponlbrac and addimmprefix are stored and only reported back through
+ // the getters below.
+ IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
+ State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
+ Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac),
+ AddImmPrefix(addimmprefix) { Info.clear(); }
+
+ // Plain accessors for the components collected so far.
+ unsigned getBaseReg() { return BaseReg; }
+ unsigned getIndexReg() { return IndexReg; }
+ unsigned getScale() { return Scale; }
+ const MCExpr *getSym() { return Sym; }
+ StringRef getSymName() { return SymName; }
+ // Initial displacement plus the value of the expression evaluated by the
+ // infix calculator.
+ int64_t getImm() { return Imm + IC.execute(); }
+ // An expression may only end after a ']' or after an integer/identifier
+ // (identifiers also leave the machine in IES_INTEGER).
+ bool isValidEndState() {
+ return State == IES_RBRAC || State == IES_INTEGER;
+ }
+ bool getStopOnLBrac() { return StopOnLBrac; }
+ bool getAddImmPrefix() { return AddImmPrefix; }
+ // True once any handler has rejected a token.
+ bool hadError() { return State == IES_ERROR; }
+
+ // Mutable access to the inline-asm identifier info owned by this machine.
+ InlineAsmIdentifierInfo &getIdentifierInfo() {
+ return Info;
+ }
+
+    /// Accepts a bitwise-or operator; legal only after an integer, a closing
+    /// parenthesis or a register.
+    void onOr() {
+      const IntelExprState Entry = State;
+      if (Entry == IES_INTEGER || Entry == IES_RPAREN ||
+          Entry == IES_REGISTER) {
+        State = IES_OR;
+        IC.pushOperator(IC_OR);
+      } else {
+        State = IES_ERROR;
+      }
+      PrevState = Entry;
+    }
+    /// Accepts a bitwise-xor operator; legal only after an integer, a closing
+    /// parenthesis or a register.
+    void onXor() {
+      const IntelExprState Entry = State;
+      if (Entry == IES_INTEGER || Entry == IES_RPAREN ||
+          Entry == IES_REGISTER) {
+        State = IES_XOR;
+        IC.pushOperator(IC_XOR);
+      } else {
+        State = IES_ERROR;
+      }
+      PrevState = Entry;
+    }
+    /// Accepts a bitwise-and operator; legal only after an integer, a closing
+    /// parenthesis or a register.
+    void onAnd() {
+      const IntelExprState Entry = State;
+      if (Entry == IES_INTEGER || Entry == IES_RPAREN ||
+          Entry == IES_REGISTER) {
+        State = IES_AND;
+        IC.pushOperator(IC_AND);
+      } else {
+        State = IES_ERROR;
+      }
+      PrevState = Entry;
+    }
+    /// Accepts a left-shift operator; legal only after an integer, a closing
+    /// parenthesis or a register.
+    void onLShift() {
+      const IntelExprState Entry = State;
+      if (Entry == IES_INTEGER || Entry == IES_RPAREN ||
+          Entry == IES_REGISTER) {
+        State = IES_LSHIFT;
+        IC.pushOperator(IC_LSHIFT);
+      } else {
+        State = IES_ERROR;
+      }
+      PrevState = Entry;
+    }
+    /// Accepts a right-shift operator; legal only after an integer, a closing
+    /// parenthesis or a register.
+    void onRShift() {
+      const IntelExprState Entry = State;
+      if (Entry == IES_INTEGER || Entry == IES_RPAREN ||
+          Entry == IES_REGISTER) {
+        State = IES_RSHIFT;
+        IC.pushOperator(IC_RSHIFT);
+      } else {
+        State = IES_ERROR;
+      }
+      PrevState = Entry;
+    }
+    /// Accepts a plus operator; legal only after an integer, a closing
+    /// parenthesis or a register. A pending register that was not part of a
+    /// multiplication is committed as base (or, failing that, index) register.
+    void onPlus() {
+      const IntelExprState Entry = State;
+      const bool Accepted = Entry == IES_INTEGER || Entry == IES_RPAREN ||
+                            Entry == IES_REGISTER;
+      if (!Accepted) {
+        State = IES_ERROR;
+        PrevState = Entry;
+        return;
+      }
+
+      State = IES_PLUS;
+      IC.pushOperator(IC_PLUS);
+      if (Entry == IES_REGISTER && PrevState != IES_MULTIPLY) {
+        // If we already have a BaseReg, then assume this is the IndexReg with
+        // a scale of 1.
+        if (BaseReg == 0) {
+          BaseReg = TmpReg;
+        } else {
+          assert(!IndexReg && "BaseReg/IndexReg already set!");
+          IndexReg = TmpReg;
+          Scale = 1;
+        }
+      }
+      PrevState = Entry;
+    }
+    /// Accepts a minus sign, either as binary subtraction or as unary
+    /// negation.
+    ///
+    /// After IES_INTEGER, IES_REGISTER, IES_RPAREN or IES_RBRAC the minus is
+    /// binary and IC_MINUS is pushed. After IES_PLUS, IES_MULTIPLY,
+    /// IES_DIVIDE, IES_LPAREN or IES_LBRAC it is unary: no operator is pushed
+    /// and onInteger() negates the literal that follows. Every other
+    /// predecessor, including IES_NOT, moves the machine to IES_ERROR.
+    void onMinus() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        // IES_NOT lands here on purpose. It used to be accepted and handled
+        // as a binary minus, which pushed IC_MINUS with only one operand and
+        // made InfixCalculator::execute() run out of operands.
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_MULTIPLY:
+      case IES_DIVIDE:
+      case IES_LPAREN:
+      case IES_LBRAC:
+        // Unary minus: the operator is deliberately not pushed.
+        State = IES_MINUS;
+        break;
+      case IES_RPAREN:
+      case IES_RBRAC:
+      case IES_INTEGER:
+      case IES_REGISTER:
+        State = IES_MINUS;
+        IC.pushOperator(IC_MINUS);
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // a scale of 1.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            assert(!IndexReg && "BaseReg/IndexReg already set!");
+            IndexReg = TmpReg;
+            Scale = 1;
+          }
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    /// Accepts a unary not; legal only in IES_PLUS or directly after another
+    /// not. No operator is pushed: onInteger() applies the complement.
+    void onNot() {
+      const IntelExprState Entry = State;
+      State = (Entry == IES_PLUS || Entry == IES_NOT) ? IES_NOT : IES_ERROR;
+      PrevState = Entry;
+    }
+    /// Accepts a register.
+    ///
+    /// After IES_PLUS or IES_LPAREN the register is remembered in TmpReg
+    /// until the following token decides whether it is a base or an index
+    /// register. After 'Integer *' it is the index register and the integer
+    /// is its scale, which must be 1, 2, 4 or 8 -- the same rule onInteger()
+    /// applies to the 'Register * Scale' spelling. This handler has no error
+    /// message channel, so an invalid scale is reported through IES_ERROR.
+    void onRegister(unsigned Reg) {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_LPAREN:
+        State = IES_REGISTER;
+        TmpReg = Reg;
+        IC.pushOperand(IC_REGISTER);
+        break;
+      case IES_MULTIPLY:
+        // Index Register - Scale * Register
+        if (PrevState == IES_INTEGER) {
+          assert(!IndexReg && "IndexReg already set!");
+          // Get the scale and replace the 'Scale * Register' with '0'.
+          const int64_t ScaleVal = IC.popOperand();
+          IC.pushOperand(IC_IMM);
+          IC.popOperator();
+          if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 &&
+              ScaleVal != 8) {
+            State = IES_ERROR;
+            break;
+          }
+          State = IES_REGISTER;
+          IndexReg = Reg;
+          Scale = static_cast<unsigned>(ScaleVal);
+        } else {
+          State = IES_ERROR;
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    /// Accepts a symbol reference; legal only in IES_PLUS, IES_MINUS or
+    /// IES_NOT. The symbol and its name are recorded and a zero immediate
+    /// takes its place in the calculator.
+    void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) {
+      PrevState = State;
+      const bool Accepted =
+          State == IES_PLUS || State == IES_MINUS || State == IES_NOT;
+      if (!Accepted) {
+        State = IES_ERROR;
+        return;
+      }
+      State = IES_INTEGER;
+      Sym = SymRef;
+      SymName = SymRefName;
+      IC.pushOperand(IC_IMM);
+    }
+    /// Accepts an integer literal. Returns true (with ErrMsg set) only when
+    /// the literal is used as a scale factor and is not 1, 2, 4 or 8; an
+    /// illegal position is reported through IES_ERROR with a false return.
+    bool onInteger(int64_t TmpInt, StringRef &ErrMsg) {
+      const IntelExprState Entry = State;
+      switch (Entry) {
+      case IES_PLUS:
+      case IES_MINUS:
+      case IES_NOT:
+      case IES_OR:
+      case IES_XOR:
+      case IES_AND:
+      case IES_LSHIFT:
+      case IES_RSHIFT:
+      case IES_DIVIDE:
+      case IES_MULTIPLY:
+      case IES_LPAREN:
+        break;
+      default:
+        State = IES_ERROR;
+        PrevState = Entry;
+        return false;
+      }
+
+      State = IES_INTEGER;
+      if (PrevState == IES_REGISTER && Entry == IES_MULTIPLY) {
+        // Index Register - Register * Scale
+        assert(!IndexReg && "IndexReg already set!");
+        IndexReg = TmpReg;
+        Scale = TmpInt;
+        if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
+          ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
+          return true;
+        }
+        // Drop the pending multiply; the register operand already on the
+        // postfix stack stands in for 'Register * Scale' with value 0.
+        IC.popOperator();
+      } else {
+        // A minus or not that followed one of these states is unary and was
+        // never pushed as an operator, so it is folded into the literal.
+        const bool UnaryContext =
+            PrevState == IES_PLUS || PrevState == IES_MINUS ||
+            PrevState == IES_OR || PrevState == IES_AND ||
+            PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+            PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+            PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+            PrevState == IES_NOT || PrevState == IES_XOR;
+        if (UnaryContext && Entry == IES_MINUS)
+          IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
+        else if (UnaryContext && Entry == IES_NOT)
+          IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm.
+        else
+          IC.pushOperand(IC_IMM, TmpInt);
+      }
+      PrevState = Entry;
+      return false;
+    }
+    /// Accepts a multiply operator; legal only after an integer, a register
+    /// or a closing parenthesis.
+    void onStar() {
+      PrevState = State;
+      if (State == IES_INTEGER || State == IES_REGISTER ||
+          State == IES_RPAREN) {
+        State = IES_MULTIPLY;
+        IC.pushOperator(IC_MULTIPLY);
+        return;
+      }
+      State = IES_ERROR;
+    }
+    /// Accepts a divide operator; legal only after an integer or a closing
+    /// parenthesis.
+    void onDivide() {
+      PrevState = State;
+      if (State == IES_INTEGER || State == IES_RPAREN) {
+        State = IES_DIVIDE;
+        IC.pushOperator(IC_DIVIDE);
+        return;
+      }
+      State = IES_ERROR;
+    }
+    /// Accepts an opening bracket, which is only legal directly after a
+    /// closing one. The new bracket group is added to the previous one, so
+    /// the machine behaves as if a plus had been seen.
+    void onLBrac() {
+      PrevState = State;
+      if (State != IES_RBRAC) {
+        State = IES_ERROR;
+        return;
+      }
+      State = IES_PLUS;
+      IC.pushOperator(IC_PLUS);
+    }
+    /// Accepts a closing bracket; legal only after an integer, a register or
+    /// a closing parenthesis. A pending register that was not part of a
+    /// multiplication is committed as base (or, failing that, index) register.
+    void onRBrac() {
+      const IntelExprState Entry = State;
+      if (Entry != IES_INTEGER && Entry != IES_REGISTER &&
+          Entry != IES_RPAREN) {
+        State = IES_ERROR;
+      } else {
+        State = IES_RBRAC;
+        if (Entry == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // a scale of 1.
+          if (BaseReg == 0) {
+            BaseReg = TmpReg;
+          } else {
+            assert(!IndexReg && "BaseReg/IndexReg already set!");
+            IndexReg = TmpReg;
+            Scale = 1;
+          }
+        }
+      }
+      PrevState = Entry;
+    }
+    /// Accepts an opening parenthesis; legal after any operator state or
+    /// another opening parenthesis, except directly after a unary minus/not.
+    void onLParen() {
+      const IntelExprState Entry = State;
+      bool Accepted = false;
+      switch (Entry) {
+      case IES_PLUS:
+      case IES_MINUS:
+      case IES_NOT:
+      case IES_OR:
+      case IES_XOR:
+      case IES_AND:
+      case IES_LSHIFT:
+      case IES_RSHIFT:
+      case IES_MULTIPLY:
+      case IES_DIVIDE:
+      case IES_LPAREN:
+        Accepted = true;
+        break;
+      default:
+        break;
+      }
+
+      // FIXME: We don't handle this type of unary minus or not, yet.
+      if (Accepted && (Entry == IES_MINUS || Entry == IES_NOT)) {
+        const bool UnaryContext =
+            PrevState == IES_PLUS || PrevState == IES_MINUS ||
+            PrevState == IES_OR || PrevState == IES_AND ||
+            PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+            PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+            PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+            PrevState == IES_NOT || PrevState == IES_XOR;
+        if (UnaryContext)
+          Accepted = false;
+      }
+
+      if (Accepted) {
+        State = IES_LPAREN;
+        IC.pushOperator(IC_LPAREN);
+      } else {
+        State = IES_ERROR;
+      }
+      PrevState = Entry;
+    }
+    /// Accepts a closing parenthesis; legal only after an integer, a register
+    /// or another closing parenthesis.
+    void onRParen() {
+      PrevState = State;
+      if (State == IES_INTEGER || State == IES_REGISTER ||
+          State == IES_RPAREN) {
+        State = IES_RPAREN;
+        IC.pushOperator(IC_RPAREN);
+        return;
+      }
+      State = IES_ERROR;
+    }
+ };
+
+  /// Reports Msg at L through the generic parser and returns its result.
+  /// While matching inline asm nothing is reported: the rest of the statement
+  /// is skipped (unless the lexer is already at a statement start) and false
+  /// is returned.
+  bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
+             bool MatchingInlineAsm = false) {
+    MCAsmParser &Parser = getParser();
+    if (!MatchingInlineAsm)
+      return Parser.Error(L, Msg, Range);
+
+    if (!getLexer().isAtStartOfStatement())
+      Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  /// Reports Msg at Loc and yields nullptr, so operand parsers can write
+  /// 'return ErrorOperand(...)'.
+  std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) {
+    (void)Error(Loc, Msg);
+    return nullptr;
+  }
+
+ std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
+ std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
+ bool IsSIReg(unsigned Reg);
+ unsigned GetSIDIForRegClass(unsigned RegClassID, unsigned Reg, bool IsSIReg);
+ void
+ AddDefaultSrcDestOperands(OperandVector &Operands,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
+ bool VerifyAndAdjustOperands(OperandVector &OrigOperands,
+ OperandVector &FinalOperands);
+ std::unique_ptr<X86Operand> ParseOperand();
+ std::unique_ptr<X86Operand> ParseATTOperand();
+ std::unique_ptr<X86Operand> ParseIntelOperand();
+ std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
+ bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
+ std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
+ std::unique_ptr<X86Operand>
+ ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+ std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
+ bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+ std::unique_ptr<X86Operand>
+ ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp,
+ bool isSymbol, unsigned Size);
+ bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End);
+
+ std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+
+ std::unique_ptr<X86Operand>
+ CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+ unsigned IndexReg, unsigned Scale, SMLoc Start,
+ SMLoc End, unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool AllowBetterSizeMatch = false);
+
+ bool parseDirectiveEven(SMLoc L);
+ bool ParseDirectiveWord(unsigned Size, SMLoc L);
+ bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+
+ bool processInstruction(MCInst &Inst, const OperandVector &Ops);
+
+ /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
+ /// instrumentation around Inst.
+ void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+
+ void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands,
+ MCStreamer &Out, bool MatchingInlineAsm);
+
+ bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+ bool MatchingInlineAsm);
+
+ bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm);
+
+ bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm);
+
+ bool OmitRegisterFromClobberLists(unsigned RegNo) override;
+
+ /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
+ /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
+ /// return false if no parsing errors occurred, true otherwise.
+ bool HandleAVX512Operand(OperandVector &Operands,
+ const MCParsedAsmOperand &Op);
+
+ bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc);
+
+ /// MS-compatibility:
+ /// Obtain an appropriate size qualifier, when facing its absence,
+ /// upon AVX512 vector/broadcast memory operand
+ unsigned AdjustAVX512Mem(unsigned Size, X86Operand* UnsizedMemOpNext);
+
+  /// True when the current subtarget has the Mode64Bit feature set.
+  bool is64BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    const FeatureBitset &Bits = getSTI().getFeatureBits();
+    return Bits[X86::Mode64Bit];
+  }
+  /// True when the current subtarget has the Mode32Bit feature set.
+  bool is32BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    const FeatureBitset &Bits = getSTI().getFeatureBits();
+    return Bits[X86::Mode32Bit];
+  }
+  /// True when the current subtarget has the Mode16Bit feature set.
+  bool is16BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    const FeatureBitset &Bits = getSTI().getFeatureBits();
+    return Bits[X86::Mode16Bit];
+  }
+  /// Makes `mode` (one of X86::Mode64Bit, Mode32Bit, Mode16Bit) the only
+  /// active mode feature on a private copy of the subtarget info and
+  /// recomputes the matcher's available-feature mask.
+  void SwitchMode(unsigned mode) {
+    MCSubtargetInfo &STI = copySTI();
+    FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
+    // Mode bit(s) that are currently set. Flipping the requested bit in this
+    // set gives the bits to toggle; the assert below checks that exactly
+    // `mode` remains set afterwards.
+    FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
+    // Hand the mask straight to setAvailableFeatures(), as the constructor
+    // does. Parking it in an 'unsigned' first would truncate it if it is
+    // wider than that (the error/feature masks used elsewhere in this class
+    // are uint64_t).
+    setAvailableFeatures(
+        ComputeAvailableFeatures(STI.ToggleFeature(OldMode.flip(mode))));
+
+    assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes));
+  }
+
+  /// Pointer width in bits for the current mode: 16, 32 or 64.
+  unsigned getPointerWidth() {
+    if (is16BitMode())
+      return 16;
+    if (is32BitMode())
+      return 32;
+    if (!is64BitMode())
+      llvm_unreachable("invalid mode");
+    return 64;
+  }
+
+  /// True when the parser's assembler dialect is non-zero, which this class
+  /// treats as Intel syntax.
+  bool isParsingIntelSyntax() {
+    return getParser().getAssemblerDialect() != 0;
+  }
+
+ /// @name Auto-generated Matcher Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "X86GenAsmMatcher.inc"
+
+ /// }
+
+public:
+  /// Sets up the parser for subtarget sti: computes the matcher's available
+  /// features and creates the instrumentation helper.
+  X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
+               const MCInstrInfo &mii, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr),
+        Code16GCC(false) {
+    // Initialize the set of available features.
+    const auto Available = ComputeAvailableFeatures(getSTI().getFeatureBits());
+    setAvailableFeatures(Available);
+
+    // The instrumentation object is handed the STI pointer by reference.
+    X86AsmInstrumentation *Instr =
+        CreateX86AsmInstrumentation(Options, Parser.getContext(), STI);
+    Instrumentation.reset(Instr);
+  }
+
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+ void SetFrameRegister(unsigned RegNo) override;
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+
+ bool ParseDirective(AsmToken DirectiveID) override;
+};
+} // end anonymous namespace
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+/// Validates a base/index register pair for a memory operand.
+///
+/// Returns true and sets ErrMsg when the combination is invalid, false
+/// otherwise. A register number of 0 means "no register".
+static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg,
+                                    StringRef &ErrMsg) {
+  // If we have both a base register and an index register make sure they are
+  // both 64-bit or 32-bit registers.
+  // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
+  auto InClass = [](unsigned ClassID, unsigned Reg) {
+    return X86MCRegisterClasses[ClassID].contains(Reg);
+  };
+
+  // RIP can never be an index and cannot be combined with one.
+  if (IndexReg == X86::RIP || (BaseReg == X86::RIP && IndexReg != 0)) {
+    ErrMsg = "invalid base+index expression";
+    return true;
+  }
+
+  // The remaining checks only apply when both registers are present.
+  if (BaseReg == 0 || IndexReg == 0)
+    return false;
+
+  const bool Index16 = InClass(X86::GR16RegClassID, IndexReg);
+  const bool Index32 = InClass(X86::GR32RegClassID, IndexReg);
+  const bool Index64 = InClass(X86::GR64RegClassID, IndexReg);
+
+  if (InClass(X86::GR64RegClassID, BaseReg) && (Index16 || Index32) &&
+      IndexReg != X86::RIZ) {
+    ErrMsg = "base register is 64-bit, but index register is not";
+    return true;
+  }
+  if (InClass(X86::GR32RegClassID, BaseReg) && (Index16 || Index64) &&
+      IndexReg != X86::EIZ) {
+    ErrMsg = "base register is 32-bit, but index register is not";
+    return true;
+  }
+  if (!InClass(X86::GR16RegClassID, BaseReg))
+    return false;
+
+  if (Index32 || Index64) {
+    ErrMsg = "base register is 16-bit, but index register is not";
+    return true;
+  }
+
+  // 16-bit addressing only pairs BX/BP with SI/DI (in either role).
+  const bool BaseIsBXorBP = BaseReg == X86::BX || BaseReg == X86::BP;
+  const bool BaseIsSIorDI = BaseReg == X86::SI || BaseReg == X86::DI;
+  const bool IndexIsSIorDI = IndexReg == X86::SI || IndexReg == X86::DI;
+  const bool IndexIsBXorBP = IndexReg == X86::BX || IndexReg == X86::BP;
+  if ((BaseIsBXorBP && !IndexIsSIorDI) || (BaseIsSIorDI && !IndexIsBXorBP)) {
+    ErrMsg = "invalid 16-bit base/index register combination";
+    return true;
+  }
+  return false;
+}
+
+// Parses a register name at the current token.
+//
+// On success RegNo holds the register, StartLoc/EndLoc delimit it, the
+// register's tokens have been consumed and false is returned. On failure true
+// is returned; in Intel syntax an unrecognised name fails without emitting a
+// diagnostic, in AT&T syntax an "invalid register name" error is reported.
+bool X86AsmParser::ParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc, SMLoc &EndLoc) {
+ MCAsmParser &Parser = getParser();
+ RegNo = 0;
+ const AsmToken &PercentTok = Parser.getTok();
+ StartLoc = PercentTok.getLoc();
+
+ // If we encounter a %, ignore it. This code handles registers with and
+ // without the prefix, unprefixed registers can occur in cfi directives.
+ if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
+ Parser.Lex(); // Eat percent token.
+
+ const AsmToken &Tok = Parser.getTok();
+ EndLoc = Tok.getEndLoc();
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ RegNo = MatchRegisterName(Tok.getString());
+
+ // If the match failed, try the register name as lowercase.
+ if (RegNo == 0)
+ RegNo = MatchRegisterName(Tok.getString().lower());
+
+ // The "flags" register cannot be referenced directly.
+ // Treat it as an identifier instead.
+ if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS)
+ RegNo = 0;
+
+ // Reject registers that do not exist in the current mode / feature set.
+ if (!is64BitMode()) {
+ // FIXME: This should be done using Requires<Not64BitMode> and
+ // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
+ // checked.
+ // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
+ // REX prefix.
+ if (RegNo == X86::RIZ ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+ X86II::isX86_64NonExtLowByteReg(RegNo) ||
+ X86II::isX86_64ExtendedReg(RegNo))
+ return Error(StartLoc, "register %"
+ + Tok.getString() + " is only available in 64-bit mode",
+ SMRange(StartLoc, EndLoc));
+ } else if (!getSTI().getFeatureBits()[X86::FeatureAVX512]) {
+ if (X86II::is32ExtendedReg(RegNo))
+ return Error(StartLoc, "register %"
+ + Tok.getString() + " is only available with AVX512",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
+ if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
+ // A bare "st" means ST0.
+ RegNo = X86::ST0;
+ Parser.Lex(); // Eat 'st'
+
+ // Check to see if we have '(4)' after %st.
+ if (getLexer().isNot(AsmToken::LParen))
+ return false;
+ // Lex the paren.
+ getParser().Lex();
+
+ const AsmToken &IntTok = Parser.getTok();
+ if (IntTok.isNot(AsmToken::Integer))
+ return Error(IntTok.getLoc(), "expected stack index");
+ switch (IntTok.getIntVal()) {
+ case 0: RegNo = X86::ST0; break;
+ case 1: RegNo = X86::ST1; break;
+ case 2: RegNo = X86::ST2; break;
+ case 3: RegNo = X86::ST3; break;
+ case 4: RegNo = X86::ST4; break;
+ case 5: RegNo = X86::ST5; break;
+ case 6: RegNo = X86::ST6; break;
+ case 7: RegNo = X86::ST7; break;
+ default: return Error(IntTok.getLoc(), "invalid stack index");
+ }
+
+ // Lex past the index; the next token must be the closing paren.
+ if (getParser().Lex().isNot(AsmToken::RParen))
+ return Error(Parser.getTok().getLoc(), "expected ')'");
+
+ EndLoc = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat ')'
+ return false;
+ }
+
+ EndLoc = Parser.getTok().getEndLoc();
+
+ // If this is "db[0-7]", match it as an alias
+ // for dr[0-7].
+ if (RegNo == 0 && Tok.getString().size() == 3 &&
+ Tok.getString().startswith("db")) {
+ // Any third character other than '0'..'7' leaves RegNo at 0.
+ switch (Tok.getString()[2]) {
+ case '0': RegNo = X86::DR0; break;
+ case '1': RegNo = X86::DR1; break;
+ case '2': RegNo = X86::DR2; break;
+ case '3': RegNo = X86::DR3; break;
+ case '4': RegNo = X86::DR4; break;
+ case '5': RegNo = X86::DR5; break;
+ case '6': RegNo = X86::DR6; break;
+ case '7': RegNo = X86::DR7; break;
+ }
+
+ if (RegNo != 0) {
+ EndLoc = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat it.
+ return false;
+ }
+ }
+
+ // Nothing matched: fail, with a diagnostic only in AT&T syntax.
+ if (RegNo == 0) {
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ Parser.Lex(); // Eat identifier token.
+ return false;
+}
+
+// Forwards the frame register to the instrumentation helper.
+void X86AsmParser::SetFrameRegister(unsigned RegNo) {
+  X86AsmInstrumentation &Instr = *Instrumentation;
+  Instr.SetInitialFrameRegister(RegNo);
+}
+
+// Builds the implicit memory operand based on the SI-family register of the
+// current mode (RSI, ESI or SI) with a zero displacement. Code16GCC is
+// treated like 32-bit mode when choosing the register.
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
+  unsigned Basereg = X86::SI;
+  if (is64BitMode())
+    Basereg = X86::RSI;
+  else if (is32BitMode() || Code16GCC)
+    Basereg = X86::ESI;
+
+  const MCExpr *ZeroDisp = MCConstantExpr::create(0, getContext());
+  return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, ZeroDisp,
+                               /*BaseReg=*/Basereg, /*IndexReg=*/0,
+                               /*Scale=*/1, Loc, Loc, 0);
+}
+
+// Builds the implicit memory operand based on the DI-family register of the
+// current mode (RDI, EDI or DI) with a zero displacement. Code16GCC is
+// treated like 32-bit mode when choosing the register.
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
+  unsigned Basereg = X86::DI;
+  if (is64BitMode())
+    Basereg = X86::RDI;
+  else if (is32BitMode() || Code16GCC)
+    Basereg = X86::EDI;
+
+  const MCExpr *ZeroDisp = MCConstantExpr::create(0, getContext());
+  return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, ZeroDisp,
+                               /*BaseReg=*/Basereg, /*IndexReg=*/0,
+                               /*Scale=*/1, Loc, Loc, 0);
+}
+
+// Returns true for RSI/ESI/SI and false for RDI/EDI/DI. Any other register is
+// a programming error.
+bool X86AsmParser::IsSIReg(unsigned Reg) {
+  if (Reg == X86::RSI || Reg == X86::ESI || Reg == X86::SI)
+    return true;
+  if (Reg == X86::RDI || Reg == X86::EDI || Reg == X86::DI)
+    return false;
+  llvm_unreachable("Only (R|E)SI and (R|E)DI are expected!");
+}
+
+// Returns the SI or DI register (selected by IsSIReg) of the given general
+// purpose register class. Reg is not consulted.
+unsigned X86AsmParser::GetSIDIForRegClass(unsigned RegClassID, unsigned Reg,
+                                          bool IsSIReg) {
+  if (RegClassID == X86::GR64RegClassID)
+    return IsSIReg ? X86::RSI : X86::RDI;
+  if (RegClassID == X86::GR32RegClassID)
+    return IsSIReg ? X86::ESI : X86::EDI;
+  if (RegClassID == X86::GR16RegClassID)
+    return IsSIReg ? X86::SI : X86::DI;
+  llvm_unreachable("Unexpected register class");
+}
+
+/// Append the default source and destination operands to \p Operands in the
+/// order the active syntax expects: destination first for Intel syntax,
+/// source first otherwise.
+void X86AsmParser::AddDefaultSrcDestOperands(
+    OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+    std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) {
+  const bool DstComesFirst = isParsingIntelSyntax();
+  Operands.push_back(std::move(DstComesFirst ? Dst : Src));
+  Operands.push_back(std::move(DstComesFirst ? Src : Dst));
+}
+
+bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, // Reconciles the parsed operands (OrigOperands, which also holds the instruction name) with the default operands in FinalOperands.
+ OperandVector &FinalOperands) { // Returns false on success or when a mismatch is left for the normal matcher diagnostics; otherwise returns the result of Error().
+
+ if (OrigOperands.size() > 1) {
+ // Check if sizes match, OrigOperands also contains the instruction name
+ assert(OrigOperands.size() == FinalOperands.size() + 1 &&
+ "Operand size mismatch");
+
+ SmallVector<std::pair<SMLoc, std::string>, 2> Warnings; // Collected first and only emitted once every operand has been accepted.
+ // Verify types match
+ int RegClassID = -1; // -1 until the first memory operand's base register class has been recorded.
+ for (unsigned int i = 0; i < FinalOperands.size(); ++i) {
+ X86Operand &OrigOp = static_cast<X86Operand &>(*OrigOperands[i + 1]); // i + 1 skips the instruction name at index 0.
+ X86Operand &FinalOp = static_cast<X86Operand &>(*FinalOperands[i]);
+
+ if (FinalOp.isReg() &&
+ (!OrigOp.isReg() || FinalOp.getReg() != OrigOp.getReg()))
+ // Return false and let a normal complaint about bogus operands happen
+ return false;
+
+ if (FinalOp.isMem()) {
+
+ if (!OrigOp.isMem())
+ // Return false and let a normal complaint about bogus operands happen
+ return false;
+
+ unsigned OrigReg = OrigOp.Mem.BaseReg;
+ unsigned FinalReg = FinalOp.Mem.BaseReg;
+
+ // If we've already encountered a register class, make sure all register
+ // bases are of the same register class
+ if (RegClassID != -1 &&
+ !X86MCRegisterClasses[RegClassID].contains(OrigReg)) {
+ return Error(OrigOp.getStartLoc(),
+ "mismatching source and destination index registers");
+ }
+
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(OrigReg))
+ RegClassID = X86::GR64RegClassID;
+ else if (X86MCRegisterClasses[X86::GR32RegClassID].contains(OrigReg))
+ RegClassID = X86::GR32RegClassID;
+ else if (X86MCRegisterClasses[X86::GR16RegClassID].contains(OrigReg))
+ RegClassID = X86::GR16RegClassID;
+ else
+ // Unexpected register class type
+ // Return false and let a normal complaint about bogus operands happen
+ return false;
+
+ bool IsSI = IsSIReg(FinalReg);
+ FinalReg = GetSIDIForRegClass(RegClassID, FinalReg, IsSI); // Re-size the default SI/DI register to the width the user wrote.
+
+ if (FinalReg != OrigReg) {
+ std::string RegName = IsSI ? "ES:(R|E)SI" : "ES:(R|E)DI";
+ Warnings.push_back(std::make_pair(
+ OrigOp.getStartLoc(),
+ "memory operand is only for determining the size, " + RegName +
+ " will be used for the location"));
+ }
+
+ FinalOp.Mem.Size = OrigOp.Mem.Size; // Keep the user's size and segment, but use the adjusted default base register.
+ FinalOp.Mem.SegReg = OrigOp.Mem.SegReg;
+ FinalOp.Mem.BaseReg = FinalReg;
+ }
+ }
+
+ // Produce warnings only if all the operands passed the adjustment - prevent
+ // legal cases like "movsd (%rax), %xmm0" mistakenly produce warnings
+ for (auto &WarningMsg : Warnings) {
+ Warning(WarningMsg.first, WarningMsg.second);
+ }
+
+ // Remove old operands
+ for (unsigned int i = 0; i < FinalOperands.size(); ++i)
+ OrigOperands.pop_back();
+ }
+ // OrigOperands.append(FinalOperands.begin(), FinalOperands.end());
+ for (unsigned int i = 0; i < FinalOperands.size(); ++i)
+ OrigOperands.push_back(std::move(FinalOperands[i])); // Ownership moves into OrigOperands; FinalOperands is left holding moved-from pointers.
+
+ return false;
+}
+
+/// Parse one operand, dispatching to the Intel-syntax or AT&T-syntax operand
+/// parser depending on the syntax currently being parsed.
+std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
+  return isParsingIntelSyntax() ? ParseIntelOperand() : ParseATTOperand();
+}
+
+/// getIntelMemOperandSize - Map an Intel-syntax size keyword (all upper case
+/// or all lower case) to its operand size in bits.
+///
+/// Returns 0 when \p OpStr is not a recognized size keyword. "OPAQUE" maps to
+/// -1U: it only has to be non-zero, the exact value is irrelevant.
+static unsigned getIntelMemOperandSize(StringRef OpStr) {
+  return StringSwitch<unsigned>(OpStr)
+      .Cases("BYTE", "byte", 8)
+      .Cases("WORD", "word", 16)
+      .Cases("DWORD", "dword", 32)
+      .Cases("FWORD", "fword", 48)
+      .Cases("QWORD", "qword", 64)
+      .Cases("MMWORD","mmword", 64)
+      .Cases("XWORD", "xword", 80)
+      .Cases("TBYTE", "tbyte", 80)
+      .Cases("XMMWORD", "xmmword", 128)
+      .Cases("YMMWORD", "ymmword", 256)
+      .Cases("ZMMWORD", "zmmword", 512)
+      .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
+      .Default(0);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( // Builds the memory operand for an inline-asm reference and records a size-directive rewrite when the size has to be inferred.
+ unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info, bool AllowBetterSizeMatch) {
+ // If we found a decl other than a VarDecl, then assume it is a FuncDecl or
+ // some other label reference.
+ if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) {
+ // Insert an explicit size if the user didn't have one.
+ if (!Size) {
+ Size = getPointerWidth();
+ InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
+ /*Len=*/0, Size);
+ }
+
+ // Create an absolute memory reference in order to match against
+ // instructions taking a PC relative operand.
+ return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size, // SegReg, BaseReg, IndexReg and Scale are not used on this path.
+ Identifier, Info.OpDecl);
+ }
+
+ // We either have a direct symbol reference, or an offset from a symbol. The
+ // parser always puts the symbol on the LHS, so look there for size
+ // calculation purposes.
+ const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp);
+ bool IsSymRef =
+ isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp);
+ if (IsSymRef) {
+ if (!Size) {
+ Size = Info.Type * 8; // Size is in terms of bits in this context.
+ if (Size)
+ InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
+ /*Len=*/0, Size);
+ if (AllowBetterSizeMatch)
+ // Handle cases where size qualifier is absent, upon an indirect symbol
+ // reference - e.g. "vaddps zmm1, zmm2, [var]"
+ // set Size to zero to allow matching mechanism to try and find a better
+ // size qualifier than our initial guess, based on available variants of
+ // the given instruction
+ Size = 0; // The size-directive rewrite recorded above is kept even though Size is reset here.
+ }
+ }
+
+ // When parsing inline assembly we set the base register to a non-zero value
+ // if we don't know the actual value at this time. This is necessary to
+ // get the matching correct in some cases.
+ BaseReg = BaseReg ? BaseReg : 1;
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+ IndexReg, Scale, Start, End, Size, Identifier,
+ Info.OpDecl);
+}
+
+static void // Records the AsmRewrites that reduce a bracketed inline-asm expression to "<imm><symbol>": brackets and surrounding text are skipped, the immediate is folded.
+RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
+ StringRef SymName, int64_t ImmDisp,
+ int64_t FinalImmDisp, SMLoc &BracLoc,
+ SMLoc &StartInBrac, SMLoc &End) {
+ // Remove the '[' and ']' from the IR string.
+ AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1);
+ AsmRewrites.emplace_back(AOK_Skip, End, 1);
+
+ // If ImmDisp is non-zero, then we parsed a displacement before the
+ // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp])
+ // If ImmDisp doesn't match the displacement computed by the state machine
+ // then we have an additional displacement in the bracketed expression.
+ if (ImmDisp != FinalImmDisp) {
+ if (ImmDisp) {
+ // We have an immediate displacement before the bracketed expression.
+ // Adjust this to match the final immediate displacement.
+ bool Found = false;
+ for (AsmRewrite &AR : AsmRewrites) {
+ if (AR.Loc.getPointer() > BracLoc.getPointer()) // Only rewrites located at or before the '[' are candidates.
+ continue;
+ if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) {
+ assert (!Found && "ImmDisp already rewritten.");
+ AR.Kind = AOK_Imm;
+ AR.Len = BracLoc.getPointer() - AR.Loc.getPointer(); // Cover everything from the old immediate up to the '['.
+ AR.Val = FinalImmDisp;
+ Found = true;
+ break;
+ }
+ }
+ assert (Found && "Unable to rewrite ImmDisp.");
+ (void)Found; // Silences the unused-variable warning when asserts are compiled out.
+ } else {
+ // We have a symbolic and an immediate displacement, but no displacement
+ // before the bracketed expression. Put the immediate displacement
+ // before the bracketed expression.
+ AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp);
+ }
+ }
+ // Remove all the ImmPrefix rewrites within the brackets.
+ for (AsmRewrite &AR : AsmRewrites) {
+ if (AR.Loc.getPointer() < StartInBrac.getPointer())
+ continue;
+ if (AR.Kind == AOK_ImmPrefix)
+ AR.Kind = AOK_Delete;
+ }
+ const char *SymLocPtr = SymName.data(); // Pointer arithmetic below assumes SymName points into the same buffer as StartInBrac/End — TODO confirm with callers.
+ // Skip everything before the symbol.
+ if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) {
+ assert(Len > 0 && "Expected a non-negative length."); // Always true here: Len is unsigned and the enclosing if already requires it to be non-zero.
+ AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len);
+ }
+ // Skip everything after the symbol.
+ if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) {
+ SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size());
+ assert(Len > 0 && "Expected a non-negative length."); // Always true here for the same reason as above.
+ AsmRewrites.emplace_back(AOK_Skip, Loc, Len);
+ }
+}
+
+bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { // Feeds tokens to SM until the expression ends; returns false on success and the result of Error() (or true) on failure. End tracks the last consumed token.
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+
+ AsmToken::TokenKind PrevTK = AsmToken::Error; // AsmToken::Error doubles as the "no previous token yet" marker.
+ bool Done = false;
+ while (!Done) {
+ bool UpdateLocLex = true; // Cleared by cases that have already consumed their own token(s).
+
+ // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
+ // identifier. Don't try to parse it as a register.
+ if (PrevTK != AsmToken::Error && Tok.getString().startswith("."))
+ break;
+
+ // If we're parsing an immediate expression, we don't expect a '['.
+ if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
+ break;
+
+ AsmToken::TokenKind TK = getLexer().getKind();
+ switch (TK) {
+ default: {
+ if (SM.isValidEndState()) {
+ Done = true;
+ break;
+ }
+ return Error(Tok.getLoc(), "unknown token in expression");
+ }
+ case AsmToken::EndOfStatement: {
+ Done = true;
+ break;
+ }
+ case AsmToken::String:
+ case AsmToken::Identifier: {
+ // This could be a register or a symbolic displacement.
+ unsigned TmpReg;
+ const MCExpr *Val;
+ SMLoc IdentLoc = Tok.getLoc();
+ StringRef Identifier = Tok.getString();
+ if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
+ SM.onRegister(TmpReg);
+ UpdateLocLex = false;
+ break;
+ } else {
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return Error(Tok.getLoc(), "Unexpected identifier!");
+ } else {
+ // This is a dot operator, not an adjacent identifier.
+ if (Identifier.find('.') != StringRef::npos &&
+ PrevTK == AsmToken::RBrac) {
+ return false;
+ } else {
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return true;
+ }
+ }
+ SM.onIdentifierExpr(Val, Identifier);
+ UpdateLocLex = false;
+ break;
+ }
+ return Error(Tok.getLoc(), "Unexpected identifier!"); // NOTE(review): unreachable — both branches of the if/else above end in break or return.
+ }
+ case AsmToken::Integer: {
+ StringRef ErrMsg;
+ if (isParsingInlineAsm() && SM.getAddImmPrefix())
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc());
+ // Look for 'b' or 'f' following an Integer as a directional label
+ SMLoc Loc = getTok().getLoc();
+ int64_t IntVal = getTok().getIntVal();
+ End = consumeToken();
+ UpdateLocLex = false;
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ StringRef IDVal = getTok().getString();
+ if (IDVal == "f" || IDVal == "b") {
+ MCSymbol *Sym =
+ getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b");
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ const MCExpr *Val =
+ MCSymbolRefExpr::create(Sym, Variant, getContext());
+ if (IDVal == "b" && Sym->isUndefined())
+ return Error(Loc, "invalid reference to undefined symbol");
+ StringRef Identifier = Sym->getName();
+ SM.onIdentifierExpr(Val, Identifier);
+ End = consumeToken(); // Consumes the 'f' / 'b' suffix.
+ } else {
+ if (SM.onInteger(IntVal, ErrMsg))
+ return Error(Loc, ErrMsg);
+ }
+ } else {
+ if (SM.onInteger(IntVal, ErrMsg))
+ return Error(Loc, ErrMsg);
+ }
+ break;
+ }
+ case AsmToken::Plus: SM.onPlus(); break;
+ case AsmToken::Minus: SM.onMinus(); break;
+ case AsmToken::Tilde: SM.onNot(); break;
+ case AsmToken::Star: SM.onStar(); break;
+ case AsmToken::Slash: SM.onDivide(); break;
+ case AsmToken::Pipe: SM.onOr(); break;
+ case AsmToken::Caret: SM.onXor(); break;
+ case AsmToken::Amp: SM.onAnd(); break;
+ case AsmToken::LessLess:
+ SM.onLShift(); break;
+ case AsmToken::GreaterGreater:
+ SM.onRShift(); break;
+ case AsmToken::LBrac: SM.onLBrac(); break;
+ case AsmToken::RBrac: SM.onRBrac(); break;
+ case AsmToken::LParen: SM.onLParen(); break;
+ case AsmToken::RParen: SM.onRParen(); break;
+ }
+ if (SM.hadError())
+ return Error(Tok.getLoc(), "unknown token in expression");
+
+ if (!Done && UpdateLocLex)
+ End = consumeToken();
+
+ PrevTK = TK;
+ }
+ return false;
+}
+
+std::unique_ptr<X86Operand> // Parses an Intel-syntax "[ ... ]" memory expression starting at the '[' token; returns nullptr or an error operand on failure.
+X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+ int64_t ImmDisp, bool isSymbol,
+ unsigned Size) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return ErrorOperand(BracLoc, "Expected '[' token!");
+ Parser.Lex(); // Eat '['
+
+ SMLoc StartInBrac = Parser.getTok().getLoc();
+ // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We
+ // may have already parsed an immediate displacement before the bracketed
+ // expression.
+ IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
+ if (ParseIntelExpression(SM, End))
+ return nullptr;
+
+ const MCExpr *Disp = nullptr;
+ if (const MCExpr *Sym = SM.getSym()) {
+ // A symbolic displacement.
+ Disp = Sym;
+ if (isParsingInlineAsm())
+ RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(),
+ ImmDisp, SM.getImm(), BracLoc, StartInBrac,
+ End);
+ }
+
+ if (SM.getImm() || !Disp) { // Ensures Disp is never null: symbol + imm, or the immediate alone (possibly 0).
+ const MCExpr *Imm = MCConstantExpr::create(SM.getImm(), getContext());
+ if (Disp)
+ Disp = MCBinaryExpr::createAdd(Disp, Imm, getContext());
+ else
+ Disp = Imm; // An immediate displacement only.
+ }
+
+ // Parse struct field access. Intel requires a dot, but MSVC doesn't. MSVC
+ // will in fact do global lookup the field name inside all global typedefs,
+ // but we don't emulate that.
+ if ((Parser.getTok().getKind() == AsmToken::Identifier ||
+ Parser.getTok().getKind() == AsmToken::Dot ||
+ Parser.getTok().getKind() == AsmToken::Real) &&
+ Parser.getTok().getString().find('.') != StringRef::npos) {
+ const MCExpr *NewDisp;
+ if (ParseIntelDotOperator(Disp, NewDisp))
+ return nullptr;
+
+ End = Tok.getEndLoc();
+ Parser.Lex(); // Eat the field.
+ Disp = NewDisp;
+ }
+
+ if (isSymbol) { // A symbol was already parsed before the '[' by the caller, so the brackets may only add an immediate.
+ if (SM.getSym()) {
+ Error(Start, "cannot use more than one symbol in memory operand");
+ return nullptr;
+ }
+ if (SM.getBaseReg()) {
+ Error(Start, "cannot use base register with variable reference");
+ return nullptr;
+ }
+ if (SM.getIndexReg()) {
+ Error(Start, "cannot use index register with variable reference");
+ return nullptr;
+ }
+ }
+
+ int BaseReg = SM.getBaseReg();
+ int IndexReg = SM.getIndexReg();
+ int Scale = SM.getScale();
+ if (!isParsingInlineAsm()) {
+ // handle [-42]
+ if (!BaseReg && !IndexReg) {
+ if (!SegReg)
+ return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+ Start, End, Size);
+ }
+ StringRef ErrMsg;
+ if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+ Error(StartInBrac, ErrMsg);
+ return nullptr;
+ }
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+ IndexReg, Scale, Start, End, Size);
+ }
+
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size, SM.getSymName(), Info,
+ isParsingInlineAsm()); // Always true on this path: the non-inline-asm case returned above.
+}
+
+// Inline assembly may use variable names with namespace alias qualifiers. Resolves the identifier through SemaCallback, consumes its tokens, and sets Val to a symbol reference; always returns false.
+bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
+ StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End) {
+ MCAsmParser &Parser = getParser();
+ assert(isParsingInlineAsm() && "Expected to be parsing inline assembly.");
+ Val = nullptr;
+
+ StringRef LineBuf(Identifier.data()); // Built from the raw pointer, so it runs to the next NUL; the callback takes it by name and its size is used below as the extent claimed.
+ void *Result =
+ SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
+
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Loc = Tok.getLoc();
+
+ // Advance the token stream until the end of the current token is
+ // after the end of what the frontend claimed.
+ const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
+ do {
+ End = Tok.getEndLoc();
+ getLexer().Lex();
+ } while (End.getPointer() < EndPtr);
+ Identifier = LineBuf; // Reports the (possibly multi-token) identifier back to the caller.
+
+ // The frontend should end parsing on an assembler token boundary, unless it
+ // failed parsing.
+ assert((End.getPointer() == EndPtr || !Result) &&
+ "frontend claimed part of a token?");
+
+ // If the identifier lookup was unsuccessful, assume that we are dealing with
+ // a label.
+ if (!Result) {
+ StringRef InternalName =
+ SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(),
+ Loc, false);
+ assert(InternalName.size() && "We should have an internal name here.");
+ // Push a rewrite for replacing the identifier name with the internal name.
+ InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(),
+ InternalName);
+ }
+
+ // Create the symbol reference.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ Val = MCSymbolRefExpr::create(Sym, Variant, getParser().getContext());
+ return false;
+}
+
+/// \brief Parse intel style segment override: the part after "SegReg", starting at the ':' token. Returns nullptr or an error operand on failure.
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
+ unsigned Size) {
+ MCAsmParser &Parser = getParser();
+ assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
+ const AsmToken &Tok = Parser.getTok(); // Reference to the current token; it must be ':' here.
+ if (Tok.isNot(AsmToken::Colon))
+ return ErrorOperand(Tok.getLoc(), "Expected ':' token!");
+ Parser.Lex(); // Eat ':'
+
+ int64_t ImmDisp = 0;
+ if (getLexer().is(AsmToken::Integer)) {
+ ImmDisp = Tok.getIntVal();
+ AsmToken ImmDispToken = Parser.Lex(); // Eat the integer. NOTE(review): confirm whether Lex() returns the consumed token or the following one.
+
+ if (isParsingInlineAsm())
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc());
+
+ if (getLexer().isNot(AsmToken::LBrac)) {
+ // An immediate following a 'segment register', 'colon' token sequence can
+ // be followed by a bracketed expression. If it isn't we know we have our
+ // final segment override.
+ const MCExpr *Disp = MCConstantExpr::create(ImmDisp, getContext());
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+ /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1,
+ Start, ImmDispToken.getEndLoc(), Size);
+ }
+ }
+
+ if (getLexer().is(AsmToken::LBrac))
+ return ParseIntelBracExpression(SegReg, Start, ImmDisp, false, Size);
+
+ const MCExpr *Val;
+ SMLoc End;
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+
+ return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); // NOTE(review): SegReg is not passed on this path.
+ }
+
+ InlineAsmIdentifierInfo Info;
+ StringRef Identifier = Tok.getString();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return nullptr;
+ return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0, // NOTE(review): SegReg is passed as 0 although a non-zero SegReg was supplied — confirm intended.
+ /*Scale=*/1, Start, End, Size, Identifier, Info);
+}
+
+/// ParseRoundingModeOp - Parse an AVX-512 rounding mode operand.
+///
+/// Must be called with '{' as the current token. Two forms are recognized:
+///  - "{r<n|d|u|z>-<id>}": yields an immediate operand holding the matching
+///    X86::STATIC_ROUNDING value, located at [\p Start, \p End].
+///  - "{sae}": yields a "{sae}" token operand.
+/// Anything else produces an error operand via ErrorOperand().
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  // Eat "{" and mark the current place.
+  const SMLoc consumedToken = consumeToken();
+  // Only an identifier can start a rounding-mode specifier. Reject everything
+  // else up front instead of asking a non-identifier token (e.g. the integer
+  // in "{1}") for its identifier text.
+  if (Tok.isNot(AsmToken::Identifier))
+    return ErrorOperand(Tok.getLoc(), "Expected an identifier after {");
+  if (Tok.getIdentifier().startswith("r")){
+    int rndMode = StringSwitch<int>(Tok.getIdentifier())
+      .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
+      .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF)
+      .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF)
+      .Case("rz", X86::STATIC_ROUNDING::TO_ZERO)
+      .Default(-1);
+    if (-1 == rndMode)
+      return ErrorOperand(Tok.getLoc(), "Invalid rounding mode.");
+    Parser.Lex();  // Eat "r*" of r*-sae
+    if (!getLexer().is(AsmToken::Minus))
+      return ErrorOperand(Tok.getLoc(), "Expected - at this point");
+    Parser.Lex();  // Eat "-"
+    // NOTE(review): the token after '-' is consumed without checking that it
+    // actually spells "sae"; behavior kept as in the original.
+    Parser.Lex();  // Eat the sae
+    if (!getLexer().is(AsmToken::RCurly))
+      return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+    Parser.Lex();  // Eat "}"
+    const MCExpr *RndModeOp =
+      MCConstantExpr::create(rndMode, Parser.getContext());
+    return X86Operand::CreateImm(RndModeOp, Start, End);
+  }
+  if(Tok.getIdentifier().equals("sae")){
+    Parser.Lex();  // Eat the sae
+    if (!getLexer().is(AsmToken::RCurly))
+      return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+    Parser.Lex();  // Eat "}"
+    return X86Operand::CreateToken("{sae}", consumedToken);
+  }
+  return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+}
+
+/// Parse the '.' operator. Adds the field offset named by the current token to the constant \p Disp and stores the sum in \p NewDisp; returns false on success. The token itself is not consumed here.
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+ const MCExpr *&NewDisp) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ int64_t OrigDispVal, DotDispVal;
+
+ // FIXME: Handle non-constant expressions.
+ if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
+ OrigDispVal = OrigDisp->getValue();
+ else
+ return Error(Tok.getLoc(), "Non-constant offsets are not supported!");
+
+ // Drop the optional '.'.
+ StringRef DotDispStr = Tok.getString();
+ if (DotDispStr.startswith("."))
+ DotDispStr = DotDispStr.drop_front(1);
+
+ // .Imm gets lexed as a real.
+ if (Tok.is(AsmToken::Real)) {
+ APInt DotDisp;
+ DotDispStr.getAsInteger(10, DotDisp); // NOTE(review): the return value is ignored, so a parse failure is not reported — confirm intended.
+ DotDispVal = DotDisp.getZExtValue();
+ } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+ unsigned DotDisp;
+ std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+ if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
+ DotDisp))
+ return Error(Tok.getLoc(), "Unable to lookup field reference!");
+ DotDispVal = DotDisp;
+ } else
+ return Error(Tok.getLoc(), "Unexpected token type!");
+
+ if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+ SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
+ unsigned Len = DotDispStr.size();
+ unsigned Val = OrigDispVal + DotDispVal; // The 64-bit sum is narrowed to unsigned for the rewrite entry.
+ InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val);
+ }
+
+ NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext());
+ return false;
+}
+
+/// Parse the 'offset' operator. This operator is used to specify the
+/// location rather than the content of a variable.
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc OffsetOfLoc = Tok.getLoc();
+ Parser.Lex(); // Eat offset.
+
+ const MCExpr *Val;
+ InlineAsmIdentifierInfo Info;
+ SMLoc Start = Tok.getLoc(), End;
+ StringRef Identifier = Tok.getString();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return nullptr;
+
+ // Don't emit the offset operator.
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7); // 7 presumably covers "offset" plus one following character — TODO confirm.
+
+ // The offset operator will have an 'r' constraint, thus we need to create
+ // register operand to ensure proper matching. Just pick a GPR based on
+ // the size of a pointer.
+ bool Parse32 = is32BitMode() || Code16GCC;
+ unsigned RegNo = is64BitMode() ? X86::RBX : (Parse32 ? X86::EBX : X86::BX);
+
+ return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
+ OffsetOfLoc, Identifier, Info.OpDecl);
+}
+
+enum IntelOperatorKind { // Selects which inline-asm operator ParseIntelOperator evaluates.
+ IOK_LENGTH, // LENGTH operator: reads Info.Length.
+ IOK_SIZE, // SIZE operator: reads Info.Size.
+ IOK_TYPE // TYPE operator: reads Info.Type.
+};
+
+/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator
+/// returns the number of elements in an array. It returns the value 1 for
+/// non-array variables. The SIZE operator returns the size of a C or C++
+/// variable. A variable's size is the product of its LENGTH and TYPE. The
+/// TYPE operator returns the size of a C or C++ type or variable. If the
+/// variable is an array, TYPE returns the size of a single element.
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { // OpKind is one of the IntelOperatorKind values; the result is an immediate operand, or nullptr / an error operand on failure.
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc TypeLoc = Tok.getLoc();
+ Parser.Lex(); // Eat operator.
+
+ const MCExpr *Val = nullptr;
+ InlineAsmIdentifierInfo Info;
+ SMLoc Start = Tok.getLoc(), End;
+ StringRef Identifier = Tok.getString();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/true, End))
+ return nullptr;
+
+ if (!Info.OpDecl)
+ return ErrorOperand(Start, "unable to lookup expression");
+
+ unsigned CVal = 0;
+ switch(OpKind) {
+ default: llvm_unreachable("Unexpected operand kind!");
+ case IOK_LENGTH: CVal = Info.Length; break;
+ case IOK_SIZE: CVal = Info.Size; break;
+ case IOK_TYPE: CVal = Info.Type; break;
+ }
+
+ // Rewrite the type operator and the C or C++ type or variable in terms of an
+ // immediate. E.g. TYPE foo -> $$4
+ unsigned Len = End.getPointer() - TypeLoc.getPointer(); // Spans from the operator keyword through the end of the identifier.
+ InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal);
+
+ const MCExpr *Imm = MCConstantExpr::create(CVal, getContext());
+ return X86Operand::CreateImm(Imm, Start, End);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { // Parses one Intel-syntax operand: inline-asm operator, rounding mode, register, segment override, immediate or memory reference. Returns nullptr or an error operand on failure.
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Start, End;
+
+ // Offset, length, type and size operators.
+ if (isParsingInlineAsm()) {
+ StringRef AsmTokStr = Tok.getString();
+ if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+ return ParseIntelOffsetOfOperator();
+ if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
+ return ParseIntelOperator(IOK_LENGTH);
+ if (AsmTokStr == "size" || AsmTokStr == "SIZE")
+ return ParseIntelOperator(IOK_SIZE);
+ if (AsmTokStr == "type" || AsmTokStr == "TYPE")
+ return ParseIntelOperator(IOK_TYPE);
+ }
+
+ bool PtrInOperand = false;
+ unsigned Size = getIntelMemOperandSize(Tok.getString()); // Non-zero only when the token is a size keyword such as "dword".
+ if (Size) {
+ Parser.Lex(); // Eat operand size (e.g., byte, word).
+ if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
+ return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
+ Parser.Lex(); // Eat ptr.
+ PtrInOperand = true;
+ }
+
+ Start = Tok.getLoc();
+
+ // rounding mode token
+ if (getSTI().getFeatureBits()[X86::FeatureAVX512] &&
+ getLexer().is(AsmToken::LCurly))
+ return ParseRoundingModeOp(Start, End); // NOTE(review): End is still default-constructed at this point.
+
+ // Register.
+ unsigned RegNo = 0;
+ if (getLexer().is(AsmToken::Identifier) &&
+ !ParseRegister(RegNo, Start, End)) {
+ // If this is a segment register followed by a ':', then this is the start
+ // of a segment override, otherwise this is a normal register reference.
+ // In case it is a normal register and there is ptr in the operand this
+ // is an error
+ if (RegNo == X86::RIP)
+ return ErrorOperand(Start, "rip can only be used as a base register");
+ if (getLexer().isNot(AsmToken::Colon)) {
+ if (PtrInOperand) {
+ return ErrorOperand(Start, "expected memory operand after "
+ "'ptr', found register operand instead");
+ }
+ return X86Operand::CreateReg(RegNo, Start, End);
+ }
+ return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
+ }
+
+ // Immediates and Memory
+
+ // Parse [ BaseReg + Scale*IndexReg + Disp ].
+ if (getLexer().is(AsmToken::LBrac))
+ return ParseIntelBracExpression(/*SegReg=*/0, Start, /*ImmDisp=*/0, false,
+ Size);
+
+ AsmToken StartTok = Tok; // Copy of the first token, since Tok is a reference that follows the parser.
+ IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
+ /*AddImmPrefix=*/false);
+ if (ParseIntelExpression(SM, End))
+ return nullptr;
+
+ bool isSymbol = SM.getSym() && SM.getSym()->getKind() != MCExpr::Constant;
+ int64_t Imm = SM.getImm();
+ if (SM.getSym() && SM.getSym()->getKind() == MCExpr::Constant)
+ SM.getSym()->evaluateAsAbsolute(Imm); // A constant "symbol" expression replaces the immediate value.
+
+ if (StartTok.isNot(AsmToken::Identifier) &&
+ StartTok.isNot(AsmToken::String) && isParsingInlineAsm()) {
+ unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
+ if (StartTok.getString().size() == Len)
+ // Just add a prefix if this wasn't a complex immediate expression.
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start);
+ else
+ // Otherwise, rewrite the complex expression as a single immediate.
+ InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm);
+ }
+
+ if (getLexer().isNot(AsmToken::LBrac)) {
+ // If a directional label (ie. 1f or 2b) was parsed above from
+ // ParseIntelExpression() then SM.getSym() was set to a pointer to
+ // the MCExpr with the directional local symbol and this is a
+ // memory operand not an immediate operand.
+ if (isSymbol) {
+ if (isParsingInlineAsm())
+ return CreateMemForInlineAsm(/*SegReg=*/0, SM.getSym(), /*BaseReg=*/0,
+ /*IndexReg=*/0,
+ /*Scale=*/1, Start, End, Size,
+ SM.getSymName(), SM.getIdentifierInfo());
+ return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End,
+ Size);
+ }
+
+ const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext());
+ return X86Operand::CreateImm(ImmExpr, Start, End);
+ }
+
+ // Only non-negative immediates are accepted (Imm < 0 is rejected below).
+ if (Imm < 0)
+ return ErrorOperand(Start, "expected a positive immediate displacement "
+ "before bracketed expr.");
+
+ return ParseIntelBracExpression(/*SegReg=*/0, Start, Imm, isSymbol, Size);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { // Parses one AT&T-syntax operand, dispatching on the current token kind; returns nullptr or an error operand on failure.
+ MCAsmParser &Parser = getParser();
+ switch (getLexer().getKind()) {
+ default:
+ // Parse a memory operand with no segment register.
+ return ParseMemOperand(0, Parser.getTok().getLoc());
+ case AsmToken::Percent: {
+ // Read the register.
+ unsigned RegNo;
+ SMLoc Start, End;
+ if (ParseRegister(RegNo, Start, End)) return nullptr;
+ if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
+ Error(Start, "%eiz and %riz can only be used as index registers",
+ SMRange(Start, End));
+ return nullptr;
+ }
+ if (RegNo == X86::RIP) {
+ Error(Start, "%rip can only be used as a base register",
+ SMRange(Start, End));
+ return nullptr;
+ }
+
+ // If this is a segment register followed by a ':', then this is the start
+ // of a memory reference, otherwise this is a normal register reference.
+ if (getLexer().isNot(AsmToken::Colon))
+ return X86Operand::CreateReg(RegNo, Start, End);
+
+ if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
+ return ErrorOperand(Start, "invalid segment register");
+
+ getParser().Lex(); // Eat the colon.
+ return ParseMemOperand(RegNo, Start); // RegNo is passed as the segment register of the memory operand.
+ }
+ case AsmToken::Dollar: {
+ // $42 -> immediate.
+ SMLoc Start = Parser.getTok().getLoc(), End;
+ Parser.Lex(); // Eat the '$'.
+ const MCExpr *Val;
+ if (getParser().parseExpression(Val, End))
+ return nullptr;
+ return X86Operand::CreateImm(Val, Start, End);
+ }
+ case AsmToken::LCurly:{
+ SMLoc Start = Parser.getTok().getLoc(), End;
+ if (getSTI().getFeatureBits()[X86::FeatureAVX512])
+ return ParseRoundingModeOp(Start, End); // End is default-constructed here.
+ return ErrorOperand(Start, "Unexpected '{' in expression");
+ }
+ }
+}
+
+// true on failure, false otherwise
+// If no {z} mark was found - Parser doesn't advance and Z is left untouched
+bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
+ const SMLoc &StartLoc) {
+ MCAsmParser &Parser = getParser();
+ // Assuming we are just past the '{' mark, querying the next token
+ // Searched for {z}, but none was found. Return false, as no parsing error was
+ // encountered
+ if (!(getLexer().is(AsmToken::Identifier) &&
+ (getLexer().getTok().getIdentifier() == "z")))
+ return false;
+ Parser.Lex(); // Eat z
+ // Query and eat the '}' mark
+ if (!getLexer().is(AsmToken::RCurly))
+ return Error(getLexer().getLoc(), "Expected } at this point");
+ Parser.Lex(); // Eat '}'
+ // Assign Z with the {z} mark operand
+ Z = X86Operand::CreateToken("{z}", StartLoc);
+ return false;
+}
+
+ /// Parse the optional AVX-512 decorations that may follow an operand: a memory
+ /// broadcast mark ({1to<NUM>}), an op-mask mark ({%k<NUM>}) and/or a zeroing
+ /// mark ({z}). The resulting tokens/operands are appended to \p Operands.
+ ///
+ /// Nothing is parsed unless the subtarget has AVX-512 and the next token is
+ /// '{'.
+ ///
+ /// \param Operands Operand list to append to.
+ /// \param Op       The operand parsed just before this call (not inspected).
+ /// \returns true on failure (a diagnostic has been emitted), false otherwise.
+ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
+                                        const MCParsedAsmOperand &Op) {
+   MCAsmParser &Parser = getParser();
+   if (!getSTI().getFeatureBits()[X86::FeatureAVX512] ||
+       !getLexer().is(AsmToken::LCurly))
+     return false;
+
+   // Eat "{" and mark the current place.
+   const SMLoc consumedToken = consumeToken();
+
+   // Distinguish {1to<NUM>} from {%k<NUM>}.
+   if (getLexer().is(AsmToken::Integer)) {
+     // Parse memory broadcasting ({1to<NUM>}).
+     if (getLexer().getTok().getIntVal() != 1)
+       return TokError("Expected 1to<NUM> at this point");
+     Parser.Lex(); // Eat "1" of 1to8
+     if (!getLexer().is(AsmToken::Identifier) ||
+         !getLexer().getTok().getIdentifier().startswith("to"))
+       return TokError("Expected 1to<NUM> at this point");
+     // Recognize only reasonable suffixes.
+     const char *BroadcastPrimitive =
+         StringSwitch<const char *>(getLexer().getTok().getIdentifier())
+             .Case("to2", "{1to2}")
+             .Case("to4", "{1to4}")
+             .Case("to8", "{1to8}")
+             .Case("to16", "{1to16}")
+             .Default(nullptr);
+     if (!BroadcastPrimitive)
+       return TokError("Invalid memory broadcast primitive.");
+     Parser.Lex(); // Eat "toN" of 1toN
+     if (!getLexer().is(AsmToken::RCurly))
+       return TokError("Expected } at this point");
+     Parser.Lex(); // Eat "}"
+     Operands.push_back(
+         X86Operand::CreateToken(BroadcastPrimitive, consumedToken));
+     // No AVX512 specific primitives can pass after memory broadcasting, so
+     // return.
+     return false;
+   }
+
+   // Parse either {k}{z}, {z}{k}, {k} or {z}. The last one has no meaning, but
+   // GCC accepts it. At this point we are just past a '{' mark.
+   std::unique_ptr<X86Operand> Z;
+   if (ParseZ(Z, consumedToken))
+     return true;
+
+   // '{z}' on its own is meaningless, hence it is silently ignored.
+   if (Z && !getLexer().is(AsmToken::LCurly))
+     return false;
+
+   // An op-mask register mark ({%k<NUM>}) is now expected. If a {z} mark was
+   // already parsed, the '{' of the op-mask mark still has to be eaten.
+   const SMLoc StartLoc = Z ? consumeToken() : consumedToken;
+   std::unique_ptr<X86Operand> MaskOp = ParseOperand();
+   if (!MaskOp)
+     return Error(getLexer().getLoc(),
+                  "Expected an op-mask register at this point");
+   if (!getLexer().is(AsmToken::RCurly))
+     return Error(getLexer().getLoc(), "Expected } at this point");
+   Operands.push_back(X86Operand::CreateToken("{", StartLoc));
+   Operands.push_back(std::move(MaskOp));
+   Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
+
+   // {%k<NUM>} mark is found, inquire for a trailing {z}.
+   if (getLexer().is(AsmToken::LCurly) && !Z) {
+     // A genuine parsing error has already been reported by ParseZ.
+     if (ParseZ(Z, consumeToken()))
+       return true;
+     // A '{' that does not introduce {z} must be diagnosed here; returning
+     // failure without a message would make the assembler fail silently.
+     if (!Z)
+       return Error(getLexer().getLoc(), "Expected a {z} mark at this point");
+   }
+
+   // A {z} accompanied by a K register is meaningful, so keep it.
+   if (Z)
+     Operands.push_back(std::move(Z));
+   return false;
+ }
+
+ /// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
+ /// has already been parsed if present.
+ ///
+ /// \param SegReg   Segment register parsed by the caller; 0 is treated below
+ ///                 as "no segment register".
+ /// \param MemStart Location used as the start of the resulting operand.
+ /// \returns the parsed memory operand, or nullptr when an expression or
+ ///          register fails to parse or one of the validation checks below
+ ///          rejects the operand.
+ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
+ SMLoc MemStart) {
+
+ MCAsmParser &Parser = getParser();
+ // We have to disambiguate a parenthesized expression "(4+5)" from the start
+ // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The
+ // only way to do this without lookahead is to eat the '(' and see what is
+ // after it.
+ // The displacement defaults to the constant 0 when none is written.
+ const MCExpr *Disp = MCConstantExpr::create(0, getParser().getContext());
+ if (getLexer().isNot(AsmToken::LParen)) {
+ SMLoc ExprEnd;
+ if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ if (SegReg == 0)
+ return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, ExprEnd);
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+ MemStart, ExprEnd);
+ }
+
+ // Eat the '('.
+ Parser.Lex();
+ } else {
+ // Okay, we have a '('. We don't know if this is an expression or not, so
+ // we have to eat the ( to see beyond it.
+ SMLoc LParenLoc = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat the '('.
+
+ if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) {
+ // Nothing to do here, fall into the code below with the '(' part of the
+ // memory operand consumed.
+ } else {
+ SMLoc ExprEnd;
+
+ // It must be a parenthesized expression, parse it now.
+ if (getParser().parseParenExpression(Disp, ExprEnd))
+ return nullptr;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ // NOTE(review): this form starts the operand at LParenLoc while the
+ // segment form below starts it at MemStart - confirm this is intended.
+ if (SegReg == 0)
+ return X86Operand::CreateMem(getPointerWidth(), Disp, LParenLoc,
+ ExprEnd);
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+ MemStart, ExprEnd);
+ }
+
+ // Eat the '('.
+ Parser.Lex();
+ }
+ }
+
+ // If we reached here, then we just ate the ( of the memory operand. Process
+ // the rest of the memory operand.
+ unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+ SMLoc IndexLoc, BaseLoc;
+
+ // Optional base register.
+ if (getLexer().is(AsmToken::Percent)) {
+ SMLoc StartLoc, EndLoc;
+ BaseLoc = Parser.getTok().getLoc();
+ if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr;
+ if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
+ Error(StartLoc, "eiz and riz can only be used as index registers",
+ SMRange(StartLoc, EndLoc));
+ return nullptr;
+ }
+ }
+
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+ IndexLoc = Parser.getTok().getLoc();
+
+ // Following the comma we should have either an index register, or a scale
+ // value. We don't support the latter form, but we want to parse it
+ // correctly.
+ //
+ // Note that even though it would be completely consistent to support syntax
+ // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
+ if (getLexer().is(AsmToken::Percent)) {
+ SMLoc L;
+ if (ParseRegister(IndexReg, L, L))
+ return nullptr;
+ if (BaseReg == X86::RIP) {
+ Error(IndexLoc, "%rip as base register can not have an index register");
+ return nullptr;
+ }
+ if (IndexReg == X86::RIP) {
+ Error(IndexLoc, "%rip is not allowed as an index register");
+ return nullptr;
+ }
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ // Parse the scale amount:
+ // ::= ',' [scale-expression]
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(Parser.getTok().getLoc(),
+ "expected comma in scale expression");
+ return nullptr;
+ }
+ Parser.Lex(); // Eat the comma.
+
+ // The scale expression itself is optional; Scale stays 1 when omitted.
+ if (getLexer().isNot(AsmToken::RParen)) {
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ int64_t ScaleVal;
+ if (getParser().parseAbsoluteExpression(ScaleVal)){
+ Error(Loc, "expected scale expression");
+ return nullptr;
+ }
+
+ // Validate the scale amount.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ ScaleVal != 1) {
+ Error(Loc, "scale factor in 16-bit address must be 1");
+ return nullptr;
+ }
+ if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 &&
+ ScaleVal != 8) {
+ Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
+ return nullptr;
+ }
+ Scale = (unsigned)ScaleVal;
+ }
+ }
+ } else if (getLexer().isNot(AsmToken::RParen)) {
+ // A scale amount without an index register is parsed but ignored; a
+ // warning is issued unless the value is 1.
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ int64_t Value;
+ if (getParser().parseAbsoluteExpression(Value))
+ return nullptr;
+
+ if (Value != 1)
+ Warning(Loc, "scale factor without index register is ignored");
+ Scale = 1;
+ }
+ }
+
+ // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
+ return nullptr;
+ }
+ SMLoc MemEnd = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat the ')'.
+
+ // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
+ // and then only in non-64-bit modes. Except for DX, which is a special case
+ // because an unofficial form of in/out instructions uses it.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ (is64BitMode() || (BaseReg != X86::BX && BaseReg != X86::BP &&
+ BaseReg != X86::SI && BaseReg != X86::DI)) &&
+ BaseReg != X86::DX) {
+ Error(BaseLoc, "invalid 16-bit base register");
+ return nullptr;
+ }
+ // A 16-bit index register requires a base register.
+ if (BaseReg == 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
+ Error(IndexLoc, "16-bit memory operand may not include only index register");
+ return nullptr;
+ }
+
+ // Let the shared helper reject an invalid base/index combination; it
+ // supplies the message to report.
+ StringRef ErrMsg;
+ if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+ Error(BaseLoc, ErrMsg);
+ return nullptr;
+ }
+
+ // Use the full form when any register is involved, otherwise the
+ // displacement-only form.
+ if (SegReg || BaseReg || IndexReg)
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+ IndexReg, Scale, MemStart, MemEnd);
+ return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd);
+ }
+
+ /// Parse one instruction statement: the mnemonic \p Name (located at
+ /// \p NameLoc) followed by its operands, appending the result to \p Operands.
+ ///
+ /// Besides plain operand parsing, this applies a number of mnemonic and
+ /// operand rewrites (condition-code folding for cmp/vpcmp/vpcom, default
+ /// operands for string instructions, shift-by-one canonicalization, ...),
+ /// each described where it is applied.
+ ///
+ /// \param Info     Per-statement info; its AsmRewrites list receives rewrites
+ ///                 recorded while parsing MS inline asm.
+ /// \param Name     The instruction mnemonic as written.
+ /// \param NameLoc  Source location of the mnemonic.
+ /// \param Operands Output operand list; element 0 is the mnemonic token
+ ///                 unless a condition-code hack pushed a split mnemonic first.
+ /// \returns true on error, false on success.
+ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ InstInfo = &Info;
+ StringRef PatchedName = Name;
+
+ if (Name == "jmp" && isParsingIntelSyntax() && isParsingInlineAsm()) {
+ StringRef NextTok = Parser.getTok().getString();
+ if (NextTok == "short") {
+ SMLoc NameEndLoc =
+ NameLoc.getFromPointer(NameLoc.getPointer() + Name.size());
+ // Eat the short keyword
+ Parser.Lex();
+ // MS ignores the short keyword, it determines the jmp type based
+ // on the distance of the label
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc,
+ NextTok.size() + 1);
+ }
+ }
+
+ // FIXME: Hack to recognize setneb as setne.
+ if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
+ PatchedName != "setb" && PatchedName != "setnb")
+ PatchedName = PatchedName.substr(0, Name.size()-1);
+
+ // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
+ if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
+ (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
+ PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
+ bool IsVCMP = PatchedName[0] == 'v';
+ // Index of the first character of the comparison code.
+ unsigned CCIdx = IsVCMP ? 4 : 3;
+ unsigned ComparisonCode = StringSwitch<unsigned>(
+ PatchedName.slice(CCIdx, PatchedName.size() - 2))
+ .Case("eq", 0x00)
+ .Case("eq_oq", 0x00)
+ .Case("lt", 0x01)
+ .Case("lt_os", 0x01)
+ .Case("le", 0x02)
+ .Case("le_os", 0x02)
+ .Case("unord", 0x03)
+ .Case("unord_q", 0x03)
+ .Case("neq", 0x04)
+ .Case("neq_uq", 0x04)
+ .Case("nlt", 0x05)
+ .Case("nlt_us", 0x05)
+ .Case("nle", 0x06)
+ .Case("nle_us", 0x06)
+ .Case("ord", 0x07)
+ .Case("ord_q", 0x07)
+ /* AVX only from here */
+ .Case("eq_uq", 0x08)
+ .Case("nge", 0x09)
+ .Case("nge_us", 0x09)
+ .Case("ngt", 0x0A)
+ .Case("ngt_us", 0x0A)
+ .Case("false", 0x0B)
+ .Case("false_oq", 0x0B)
+ .Case("neq_oq", 0x0C)
+ .Case("ge", 0x0D)
+ .Case("ge_os", 0x0D)
+ .Case("gt", 0x0E)
+ .Case("gt_os", 0x0E)
+ .Case("true", 0x0F)
+ .Case("true_uq", 0x0F)
+ .Case("eq_os", 0x10)
+ .Case("lt_oq", 0x11)
+ .Case("le_oq", 0x12)
+ .Case("unord_s", 0x13)
+ .Case("neq_us", 0x14)
+ .Case("nlt_uq", 0x15)
+ .Case("nle_uq", 0x16)
+ .Case("ord_s", 0x17)
+ .Case("eq_us", 0x18)
+ .Case("nge_uq", 0x19)
+ .Case("ngt_uq", 0x1A)
+ .Case("false_os", 0x1B)
+ .Case("neq_os", 0x1C)
+ .Case("ge_oq", 0x1D)
+ .Case("gt_oq", 0x1E)
+ .Case("true_us", 0x1F)
+ .Default(~0U);
+ // Codes 8 and above are only accepted on the 'v'-prefixed mnemonics.
+ if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) {
+
+ // Push the bare "cmp"/"vcmp" prefix and the code as an immediate; the
+ // two-letter type suffix becomes the mnemonic token pushed further below.
+ Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx),
+ NameLoc));
+
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+ PatchedName = PatchedName.substr(PatchedName.size() - 2);
+ }
+ }
+
+ // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+ if (PatchedName.startswith("vpcmp") &&
+ (PatchedName.endswith("b") || PatchedName.endswith("w") ||
+ PatchedName.endswith("d") || PatchedName.endswith("q"))) {
+ // Despite its name, CCIdx is the length of the type suffix here: 2 for the
+ // unsigned forms ("ub", ...), 1 otherwise.
+ unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned ComparisonCode = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - CCIdx))
+ .Case("eq", 0x0) // Only allowed on unsigned. Checked below.
+ .Case("lt", 0x1)
+ .Case("le", 0x2)
+ //.Case("false", 0x3) // Not a documented alias.
+ .Case("neq", 0x4)
+ .Case("nlt", 0x5)
+ .Case("nle", 0x6)
+ //.Case("true", 0x7) // Not a documented alias.
+ .Default(~0U);
+ if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) {
+ Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc));
+
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+ PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+ }
+ }
+
+ // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+ if (PatchedName.startswith("vpcom") &&
+ (PatchedName.endswith("b") || PatchedName.endswith("w") ||
+ PatchedName.endswith("d") || PatchedName.endswith("q"))) {
+ // As above, CCIdx is the length of the type suffix.
+ unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned ComparisonCode = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - CCIdx))
+ .Case("lt", 0x0)
+ .Case("le", 0x1)
+ .Case("gt", 0x2)
+ .Case("ge", 0x3)
+ .Case("eq", 0x4)
+ .Case("neq", 0x5)
+ .Case("false", 0x6)
+ .Case("true", 0x7)
+ .Default(~0U);
+ if (ComparisonCode != ~0U) {
+ Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc));
+
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+ PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+ }
+ }
+
+ // The (possibly patched) mnemonic token.
+ Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+
+ // Determine whether this is an instruction prefix.
+ bool isPrefix =
+ Name == "lock" || Name == "rep" ||
+ Name == "repe" || Name == "repz" ||
+ Name == "repne" || Name == "repnz" ||
+ Name == "rex64" || Name == "data16";
+
+ bool CurlyAsEndOfStatement = false;
+ // This does the actual operand parsing. Don't parse any more if we have a
+ // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
+ // just want to parse the "lock" as the first instruction and the "incl" as
+ // the next one.
+ if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
+
+ // Parse '*' modifier.
+ if (getLexer().is(AsmToken::Star))
+ Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
+
+ // Read the operands.
+ while(1) {
+ if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+ Operands.push_back(std::move(Op));
+ // Pick up any AVX-512 decoration following the operand.
+ if (HandleAVX512Operand(Operands, *Operands.back()))
+ return true;
+ } else {
+ return true;
+ }
+ // check for comma and eat it
+ if (getLexer().is(AsmToken::Comma))
+ Parser.Lex();
+ else
+ break;
+ }
+
+ // In MS inline asm curly braces mark the beginning/end of a block, therefore
+ // they should be interpreted as end of statement
+ CurlyAsEndOfStatement =
+ isParsingIntelSyntax() && isParsingInlineAsm() &&
+ (getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
+ if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
+ return TokError("unexpected token in argument list");
+ }
+
+ // Consume the EndOfStatement or the prefix separator Slash
+ if (getLexer().is(AsmToken::EndOfStatement) ||
+ (isPrefix && getLexer().is(AsmToken::Slash)))
+ Parser.Lex();
+ else if (CurlyAsEndOfStatement)
+ // Add an actual EndOfStatement before the curly brace
+ Info.AsmRewrites->emplace_back(AOK_EndOfStatement,
+ getLexer().getTok().getLoc(), 0);
+
+ // This is for gas compatibility and cannot be done in td.
+ // Adding "p" for some floating point with no argument.
+ // For example: fsub --> fsubp
+ bool IsFp =
+ Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr";
+ if (IsFp && Operands.size() == 1) {
+ // No default case: IsFp guarantees that one of the cases matches.
+ const char *Repl = StringSwitch<const char *>(Name)
+ .Case("fsub", "fsubp")
+ .Case("fdiv", "fdivp")
+ .Case("fsubr", "fsubrp")
+ .Case("fdivr", "fdivrp");
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
+ }
+
+ // Moving a 32 or 16 bit value into a segment register has the same
+ // behavior. Modify such instructions to always take shorter form.
+ if ((Name == "mov" || Name == "movw" || Name == "movl") &&
+ (Operands.size() == 3)) {
+ X86Operand &Op1 = (X86Operand &)*Operands[1];
+ X86Operand &Op2 = (X86Operand &)*Operands[2];
+ SMLoc Loc = Op1.getEndLoc();
+ if (Op1.isReg() && Op2.isReg() &&
+ X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(
+ Op2.getReg()) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(Op1.getReg()) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(Op1.getReg()))) {
+ // Change instruction name to match new instruction.
+ // Name[3] is the size suffix; it exists because Name is not plain "mov".
+ if (Name != "mov" && Name[3] == (is16BitMode() ? 'l' : 'w')) {
+ Name = is16BitMode() ? "movw" : "movl";
+ Operands[0] = X86Operand::CreateToken(Name, NameLoc);
+ }
+ // Select the correct equivalent 16-/32-bit source register.
+ unsigned Reg =
+ getX86SubSuperRegisterOrZero(Op1.getReg(), is16BitMode() ? 16 : 32);
+ Operands[1] = X86Operand::CreateReg(Reg, Loc, Loc);
+ }
+ }
+
+ // This is a terrible hack to handle "out[s]?[bwl]? %al, (%dx)" ->
+ // "outb %al, %dx". Out doesn't take a memory form, but this is a widely
+ // documented form in various unofficial manuals, so a lot of code uses it.
+ if ((Name == "outb" || Name == "outsb" || Name == "outw" || Name == "outsw" ||
+ Name == "outl" || Name == "outsl" || Name == "out" || Name == "outs") &&
+ Operands.size() == 3) {
+ X86Operand &Op = (X86Operand &)*Operands.back();
+ if (Op.isMem() && Op.Mem.SegReg == 0 &&
+ isa<MCConstantExpr>(Op.Mem.Disp) &&
+ cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+ Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+ SMLoc Loc = Op.getEndLoc();
+ Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+ }
+ }
+ // Same hack for "in[s]?[bwl]? (%dx), %al" -> "inb %dx, %al".
+ if ((Name == "inb" || Name == "insb" || Name == "inw" || Name == "insw" ||
+ Name == "inl" || Name == "insl" || Name == "in" || Name == "ins") &&
+ Operands.size() == 3) {
+ X86Operand &Op = (X86Operand &)*Operands[1];
+ if (Op.isMem() && Op.Mem.SegReg == 0 &&
+ isa<MCConstantExpr>(Op.Mem.Disp) &&
+ cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+ Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+ SMLoc Loc = Op.getEndLoc();
+ Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+ }
+ }
+
+ // Default operands for the string instructions handled below are collected
+ // in TmpOperands and handed to VerifyAndAdjustOperands together with the
+ // operands actually parsed; its result is recorded in HadVerifyError.
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 2> TmpOperands;
+ bool HadVerifyError = false;
+
+ // Append default arguments to "ins[bwld]"
+ if (Name.startswith("ins") &&
+ (Operands.size() == 1 || Operands.size() == 3) &&
+ (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd" ||
+ Name == "ins")) {
+
+ AddDefaultSrcDestOperands(TmpOperands,
+ X86Operand::CreateReg(X86::DX, NameLoc, NameLoc),
+ DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Append default arguments to "outs[bwld]"
+ if (Name.startswith("outs") &&
+ (Operands.size() == 1 || Operands.size() == 3) &&
+ (Name == "outsb" || Name == "outsw" || Name == "outsl" ||
+ Name == "outsd" || Name == "outs")) {
+ AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
+ X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
+ // values of $SIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("lods") &&
+ (Operands.size() == 1 || Operands.size() == 2) &&
+ (Name == "lods" || Name == "lodsb" || Name == "lodsw" ||
+ Name == "lodsl" || Name == "lodsd" || Name == "lodsq")) {
+ TmpOperands.push_back(DefaultMemSIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate
+ // values of $DIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("stos") &&
+ (Operands.size() == 1 || Operands.size() == 2) &&
+ (Name == "stos" || Name == "stosb" || Name == "stosw" ||
+ Name == "stosl" || Name == "stosd" || Name == "stosq")) {
+ TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate
+ // values of $DIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("scas") &&
+ (Operands.size() == 1 || Operands.size() == 2) &&
+ (Name == "scas" || Name == "scasb" || Name == "scasw" ||
+ Name == "scasl" || Name == "scasd" || Name == "scasq")) {
+ TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Add default SI and DI operands to "cmps[bwlq]".
+ if (Name.startswith("cmps") &&
+ (Operands.size() == 1 || Operands.size() == 3) &&
+ (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" ||
+ Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) {
+ AddDefaultSrcDestOperands(TmpOperands, DefaultMemDIOperand(NameLoc),
+ DefaultMemSIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Add default SI and DI operands to "movs[bwlq]".
+ if (((Name.startswith("movs") &&
+ (Name == "movs" || Name == "movsb" || Name == "movsw" ||
+ Name == "movsl" || Name == "movsd" || Name == "movsq")) ||
+ (Name.startswith("smov") &&
+ (Name == "smov" || Name == "smovb" || Name == "smovw" ||
+ Name == "smovl" || Name == "smovd" || Name == "smovq"))) &&
+ (Operands.size() == 1 || Operands.size() == 3)) {
+ // An operand-less "movsd" outside Intel syntax is retokenized as "movsl".
+ if (Name == "movsd" && Operands.size() == 1 && !isParsingIntelSyntax())
+ Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+ AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
+ DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Check if we encountered an error for one of the string instructions
+ if (HadVerifyError) {
+ return HadVerifyError;
+ }
+
+ // FIXME: Hack to recognize s{hr,ar,hl} $1, <op>. Canonicalize to
+ // "shift <op>".
+ if ((Name.startswith("shr") || Name.startswith("sar") ||
+ Name.startswith("shl") || Name.startswith("sal") ||
+ Name.startswith("rcl") || Name.startswith("rcr") ||
+ Name.startswith("rol") || Name.startswith("ror")) &&
+ Operands.size() == 3) {
+ if (isParsingIntelSyntax()) {
+ // Intel syntax: the shift count is the last operand.
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
+ Operands.pop_back();
+ } else {
+ // AT&T syntax: the shift count is the first operand after the mnemonic.
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
+ Operands.erase(Operands.begin() + 1);
+ }
+ }
+
+ // Transforms "int $3" into "int3" as a size optimization. We can't write an
+ // instalias with an immediate operand yet.
+ if (Name == "int" && Operands.size() == 2) {
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isImm())
+ if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm()))
+ if (CE->getValue() == 3) {
+ Operands.erase(Operands.begin() + 1);
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
+ }
+ }
+
+ // Transforms "xlat mem8" into "xlatb"
+ if ((Name == "xlat" || Name == "xlatb") && Operands.size() == 2) {
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isMem8()) {
+ Warning(Op1.getStartLoc(), "memory operand is only for determining the "
+ "size, (R|E)BX will be used for the location");
+ Operands.pop_back();
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue("xlatb");
+ }
+ }
+
+ return false;
+ }
+
+ /// Post-processing hook applied to a successfully matched instruction.
+ /// This implementation rewrites nothing: it ignores \p Inst and \p Ops and
+ /// always returns false.
+ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
+ return false;
+ }
+
+static const char *getSubtargetFeatureName(uint64_t Val);
+
+ /// Emit \p Inst to \p Out by forwarding it, together with \p Operands, the
+ /// parser's context and MII, to Instrumentation->InstrumentAndEmitInstruction.
+ void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
+ MCStreamer &Out) {
+ Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(),
+ MII, Out);
+ }
+
+ /// Match the parsed \p Operands and emit the resulting instruction, delegating
+ /// to the Intel-syntax or the AT&T-syntax matcher depending on the syntax
+ /// currently being parsed. All arguments are passed through unchanged and the
+ /// delegate's result is returned.
+ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                            OperandVector &Operands,
+                                            MCStreamer &Out, uint64_t &ErrorInfo,
+                                            bool MatchingInlineAsm) {
+   const bool UseIntelMatcher = isParsingIntelSyntax();
+   return UseIntelMatcher
+              ? MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out,
+                                             ErrorInfo, MatchingInlineAsm)
+              : MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out,
+                                           ErrorInfo, MatchingInlineAsm);
+ }
+
+ /// Expand the FPU mnemonics listed below into a WAIT instruction followed by
+ /// their "fn" counterpart: the WAIT is emitted right away (unless matching
+ /// inline asm) and the mnemonic token in Operands[0] is replaced so that the
+ /// caller goes on to match the "fn" form. Other mnemonics are left untouched.
+ ///
+ /// \param IDLoc             Location given to the WAIT and the new token.
+ /// \param Op                The mnemonic token operand to inspect.
+ /// \param Operands          Operand list whose first element may be replaced.
+ /// \param Out               Streamer receiving the WAIT instruction.
+ /// \param MatchingInlineAsm When true, nothing is emitted.
+ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
+                                      OperandVector &Operands, MCStreamer &Out,
+                                      bool MatchingInlineAsm) {
+   // FIXME: This should be replaced with a real .td file alias mechanism.
+   // Also, MatchInstructionImpl should actually *do* the EmitInstruction
+   // call.
+   const char *NoWaitName = StringSwitch<const char *>(Op.getToken())
+                                .Case("finit", "fninit")
+                                .Case("fsave", "fnsave")
+                                .Case("fstcw", "fnstcw")
+                                .Case("fstcww", "fnstcw")
+                                .Case("fstenv", "fnstenv")
+                                .Case("fstsw", "fnstsw")
+                                .Case("fstsww", "fnstsw")
+                                .Case("fclex", "fnclex")
+                                .Default(nullptr);
+   // Not one of the aliases: nothing to expand.
+   if (!NoWaitName)
+     return;
+
+   MCInst WaitInst;
+   WaitInst.setOpcode(X86::WAIT);
+   WaitInst.setLoc(IDLoc);
+   if (!MatchingInlineAsm)
+     EmitInstruction(WaitInst, Operands, Out);
+   Operands[0] = X86Operand::CreateToken(NoWaitName, IDLoc);
+ }
+
+ /// Report an "instruction requires: ..." diagnostic at \p IDLoc naming every
+ /// subtarget feature whose bit is set in \p ErrorInfo.
+ ///
+ /// \param IDLoc             Location the diagnostic is attached to.
+ /// \param ErrorInfo         Non-zero bit mask of missing features; each set bit
+ ///                          is passed on its own to getSubtargetFeatureName.
+ /// \param MatchingInlineAsm Forwarded to Error().
+ /// \returns the result of Error().
+ bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+                                        bool MatchingInlineAsm) {
+   assert(ErrorInfo && "Unknown missing feature!");
+   SmallString<126> Msg;
+   raw_svector_ostream OS(Msg);
+   OS << "instruction requires:";
+   // Walk every bit of the mask, including the most significant one, which
+   // the previous "size*8-1" bound skipped. The unsigned shift eventually
+   // moves the single set bit out, turning Mask into 0 and ending the loop.
+   for (uint64_t Mask = 1; Mask != 0; Mask <<= 1)
+     if (ErrorInfo & Mask)
+       OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask);
+   return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm);
+ }
+
+ /// Match \p Operands as an AT&T-syntax instruction and emit it.
+ ///
+ /// A direct match is tried first. If that fails with an invalid operand or an
+ /// unknown mnemonic, the match is retried once per size suffix appended to
+ /// the mnemonic; exactly one successful suffix match counts as success.
+ ///
+ /// \param IDLoc             Location of the instruction, used for the emitted
+ ///                          instruction and for diagnostics.
+ /// \param Opcode            Set to the matched opcode on success.
+ /// \param Operands          Parsed operands; element 0 must be the mnemonic.
+ /// \param Out               Streamer receiving the instruction.
+ /// \param ErrorInfo         Matcher error detail (operand index or missing
+ ///                          feature mask, depending on the failure).
+ /// \param MatchingInlineAsm When true, nothing is emitted.
+ /// \returns false on success, true after reporting an error.
+ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ assert(!Operands.empty() && "Unexpect empty operand list!");
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+ assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+ SMRange EmptyRange = None;
+
+ // First, handle aliases that expand to multiple instructions.
+ MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+
+ bool WasOriginallyInvalidOperand = false;
+ MCInst Inst;
+
+ // First, try a direct match.
+ switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
+ isParsingIntelSyntax())) {
+ default: llvm_unreachable("Unexpected match result!");
+ case Match_Success:
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ EmitInstruction(Inst, Operands, Out);
+ Opcode = Inst.getOpcode();
+ return false;
+ case Match_MissingFeature:
+ return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm);
+ case Match_InvalidOperand:
+ // Remembered so that the final diagnostic can blame the operand rather
+ // than the mnemonic.
+ WasOriginallyInvalidOperand = true;
+ break;
+ case Match_MnemonicFail:
+ break;
+ }
+
+ // FIXME: Ideally, we would only attempt suffix matches for things which are
+ // valid prefixes, and we could just infer the right unambiguous
+ // type. However, that requires substantially more matcher support than the
+ // following hack.
+
+ // Change the operand to point to a temporary token.
+ // The trailing blank is a placeholder that is overwritten with each
+ // candidate suffix in the loop below.
+ StringRef Base = Op.getToken();
+ SmallString<16> Tmp;
+ Tmp += Base;
+ Tmp += ' ';
+ Op.setTokenValue(Tmp);
+
+ // If this instruction starts with an 'f', then it is a floating point stack
+ // instruction. These come in up to three forms for 32-bit, 64-bit, and
+ // 80-bit floating point, which use the suffixes s,l,t respectively.
+ //
+ // Otherwise, we assume that this may be an integer instruction, which comes
+ // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
+ // The explicit "\0" pads the floating point list to four entries so both
+ // lists can be indexed by the same loop.
+ const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+
+ // Check for the various suffix matches.
+ uint64_t ErrorInfoIgnore;
+ uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
+ unsigned Match[4];
+
+ for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
+ Tmp.back() = Suffixes[I];
+ Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
+ MatchingInlineAsm, isParsingIntelSyntax());
+ // If this returned as a missing feature failure, remember that.
+ if (Match[I] == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfoIgnore;
+ }
+
+ // Restore the old token.
+ Op.setTokenValue(Base);
+
+ // If exactly one matched, then we treat that as a successful match (and the
+ // instruction will already have been filled in correctly, since the failing
+ // matches won't have modified it).
+ unsigned NumSuccessfulMatches =
+ std::count(std::begin(Match), std::end(Match), Match_Success);
+ if (NumSuccessfulMatches == 1) {
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ EmitInstruction(Inst, Operands, Out);
+ Opcode = Inst.getOpcode();
+ return false;
+ }
+
+ // Otherwise, the match failed, try to produce a decent error message.
+
+ // If we had multiple suffix matches, then identify this as an ambiguous
+ // match.
+ if (NumSuccessfulMatches > 1) {
+ char MatchChars[4];
+ unsigned NumMatches = 0;
+ for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I)
+ if (Match[I] == Match_Success)
+ MatchChars[NumMatches++] = Suffixes[I];
+
+ SmallString<126> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "ambiguous instructions require an explicit suffix (could be ";
+ for (unsigned i = 0; i != NumMatches; ++i) {
+ if (i != 0)
+ OS << ", ";
+ if (i + 1 == NumMatches)
+ OS << "or ";
+ OS << "'" << Base << MatchChars[i] << "'";
+ }
+ OS << ")";
+ Error(IDLoc, OS.str(), EmptyRange, MatchingInlineAsm);
+ return true;
+ }
+
+ // Okay, we know that none of the variants matched successfully.
+
+ // If all of the instructions reported an invalid mnemonic, then the original
+ // mnemonic was invalid.
+ // (The literal 4 is the number of entries in Match.)
+ if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) {
+ if (!WasOriginallyInvalidOperand) {
+ return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
+ Op.getLocRange(), MatchingInlineAsm);
+ }
+
+ // Recover location info for the operand if we know which was the problem.
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction", EmptyRange,
+ MatchingInlineAsm);
+
+ X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo];
+ if (Operand.getStartLoc().isValid()) {
+ SMRange OperandRange = Operand.getLocRange();
+ return Error(Operand.getStartLoc(), "invalid operand for instruction",
+ OperandRange, MatchingInlineAsm);
+ }
+ }
+
+ return Error(IDLoc, "invalid operand for instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with a missing feature, report this as a
+ // missing feature.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_MissingFeature) == 1) {
+ ErrorInfo = ErrorInfoMissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with an invalid operand, report this as an
+ // operand failure.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_InvalidOperand) == 1) {
+ return Error(IDLoc, "invalid operand for instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
+ // If all of these were an outright failure, report it in a useless way.
+ Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
+ EmptyRange, MatchingInlineAsm);
+ return true;
+ }
+
+ /// Decide whether an unsized memory operand may be adjusted to \p Size bits.
+ ///
+ /// \param Size             Candidate operand size in bits.
+ /// \param UnsizedMemOpNext Operand following the unsized memory operand, or
+ ///                         nullptr if there is none.
+ /// \returns \p Size when the adjustment is allowed, 0 otherwise. Without
+ ///          AVX-512 the result is always 0.
+ unsigned X86AsmParser::AdjustAVX512Mem(unsigned Size,
+                                        X86Operand* UnsizedMemOpNext) {
+   // Only meaningful on an AVX512 platform.
+   if (!getSTI().getFeatureBits()[X86::FeatureAVX512])
+     return 0;
+
+   switch (Size) {
+   case 128:
+   case 256:
+   case 512:
+     // Allow adjusting upon a (x|y|z)mm.
+     return Size;
+   case 32:
+   case 64: {
+     // Allegedly a broadcasting memory operand: accept it only when the next
+     // operand is a token starting with "{1to".
+     const bool FollowedByBroadcast =
+         UnsizedMemOpNext && UnsizedMemOpNext->isToken() &&
+         UnsizedMemOpNext->getToken().startswith("{1to");
+     return FollowedByBroadcast ? Size : 0;
+   }
+   default:
+     // Do not allow any other type of adjustment.
+     return 0;
+   }
+ }
+
+/// Match an Intel-syntax instruction and, on a unique successful match, emit
+/// it.
+///
+/// In Intel syntax the operand size is not part of the mnemonic, so an unsized
+/// memory operand is tried at each candidate size in turn and the results are
+/// collected in Match. Exactly one success fills in \p Opcode and (unless
+/// \p MatchingInlineAsm is set) post-processes and emits the instruction;
+/// more than one success is diagnosed as an ambiguous operand size.
+///
+/// \returns false on success; otherwise the value returned by Error() or
+/// ErrorMissingFeature() after the diagnostic has been reported.
+bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ assert(!Operands.empty() && "Unexpect empty operand list!");
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+ assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+ StringRef Mnemonic = Op.getToken();
+ SMRange EmptyRange = None;
+ // NOTE(review): Base is initialized from the same token as Mnemonic; it is
+ // used below to build and later restore the suffixed "push" mnemonic.
+ StringRef Base = Op.getToken();
+
+ // First, handle aliases that expand to multiple instructions.
+ MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+
+ MCInst Inst;
+
+ // Find one unsized memory operand, if present.
+ X86Operand *UnsizedMemOp = nullptr;
+ // If an unsized memory operand was found, also remember the operand that
+ // follows it, for use in AdjustAVX512Mem.
+ X86Operand *UnsizedMemOpNext = nullptr;
+ for (const auto &Op : Operands) { // NOTE: this Op shadows the mnemonic Op above.
+ X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+ if (UnsizedMemOp) {
+ UnsizedMemOpNext = X86Op;
+ // We have already found an unsized memory operand, so stop here:
+ // Intel syntax allows only one memory operand.
+ break;
+ }
+ if (X86Op->isMemUnsized())
+ UnsizedMemOp = X86Op;
+ }
+
+ // Allow some instructions to have implicitly pointer-sized operands. This is
+ // compatible with gas.
+ if (UnsizedMemOp) {
+ static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"};
+ for (const char *Instr : PtrSizedInstrs) {
+ if (Mnemonic == Instr) {
+ UnsizedMemOp->Mem.Size = getPointerWidth();
+ break;
+ }
+ }
+ }
+
+ // Results of the individual match attempts made below.
+ SmallVector<unsigned, 8> Match;
+ uint64_t ErrorInfoMissingFeature = 0;
+
+ // If an unsized push has a constant immediate operand that fits in the
+ // pointer width, default the operand size to the pointer size.
+ if (Mnemonic == "push" && Operands.size() == 2) {
+ auto *X86Op = static_cast<X86Operand *>(Operands[1].get());
+ if (X86Op->isImm()) {
+ // If it's not a constant fall through and let remainder take care of it.
+ const auto *CE = dyn_cast<MCConstantExpr>(X86Op->getImm());
+ unsigned Size = getPointerWidth();
+ if (CE &&
+ (isIntN(Size, CE->getValue()) || isUIntN(Size, CE->getValue()))) {
+ // Build the mnemonic with an explicit size suffix for the current mode.
+ SmallString<16> Tmp;
+ Tmp += Base;
+ Tmp += (is64BitMode())
+ ? "q"
+ : (is32BitMode()) ? "l" : (is16BitMode()) ? "w" : " ";
+ // The token only stores a pointer into Tmp, so Tmp must stay alive until
+ // the original token value is restored below.
+ Op.setTokenValue(Tmp);
+ // Do match in ATT mode to allow explicit suffix usage.
+ Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm,
+ false /*isParsingIntelSyntax()*/));
+ Op.setTokenValue(Base);
+ }
+ }
+ }
+
+ // If an unsized memory operand is present, try to match with each memory
+ // operand size. In Intel assembly, the size is not part of the instruction
+ // mnemonic.
+ unsigned MatchedSize = 0;
+ if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
+ static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512};
+ for (unsigned Size : MopSizes) {
+ UnsizedMemOp->Mem.Size = Size;
+ uint64_t ErrorInfoIgnore;
+ unsigned LastOpcode = Inst.getOpcode();
+ unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
+ MatchingInlineAsm, isParsingIntelSyntax());
+ // Record the result only for the first attempt or when this attempt
+ // changed the opcode held in Inst.
+ if (Match.empty() || LastOpcode != Inst.getOpcode())
+ Match.push_back(M);
+
+ // If this returned as a missing feature failure, remember that.
+ if (Match.back() == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfoIgnore;
+ if (M == Match_Success)
+ // MS-compatibility:
+ // Adjust AVX512 vector/broadcast memory operand,
+ // when facing the absence of a size qualifier.
+ // Match GCC behavior on respective cases.
+ MatchedSize = AdjustAVX512Mem(Size, UnsizedMemOpNext);
+ }
+
+ // Restore the size of the unsized memory operand if we modified it.
+ // NOTE(review): UnsizedMemOp is already known to be non-null in this scope,
+ // and the size is reset again after the fallback match below.
+ if (UnsizedMemOp)
+ UnsizedMemOp->Mem.Size = 0;
+ }
+
+ // If we haven't matched anything yet, this is not a basic integer or FPU
+ // operation. There shouldn't be any ambiguity in our mnemonic table, so try
+ // matching with the unsized operand.
+ if (Match.empty()) {
+ Match.push_back(MatchInstruction(
+ Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax()));
+ // If this returned as a missing feature failure, remember that.
+ if (Match.back() == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfo;
+ }
+
+ // Restore the size of the unsized memory operand if we modified it.
+ if (UnsizedMemOp)
+ UnsizedMemOp->Mem.Size = 0;
+
+ // If it's a bad mnemonic, all results will be the same.
+ if (Match.back() == Match_MnemonicFail) {
+ return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'",
+ Op.getLocRange(), MatchingInlineAsm);
+ }
+
+ // If exactly one matched, then we treat that as a successful match (and the
+ // instruction will already have been filled in correctly, since the failing
+ // matches won't have modified it).
+ unsigned NumSuccessfulMatches =
+ std::count(std::begin(Match), std::end(Match), Match_Success);
+ if (NumSuccessfulMatches == 1) {
+ if (MatchedSize && isParsingInlineAsm() && isParsingIntelSyntax())
+ // MS compatibility -
+ // Fix the rewrite according to the matched memory size
+ // MS inline assembly only
+ for (AsmRewrite &AR : *InstInfo->AsmRewrites)
+ if ((AR.Loc.getPointer() == UnsizedMemOp->StartLoc.getPointer()) &&
+ (AR.Kind == AOK_SizeDirective))
+ AR.Val = MatchedSize;
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the individual
+ // transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ EmitInstruction(Inst, Operands, Out);
+ Opcode = Inst.getOpcode();
+ return false;
+ } else if (NumSuccessfulMatches > 1) {
+ assert(UnsizedMemOp &&
+ "multiple matches only possible with unsized memory operands");
+ return Error(UnsizedMemOp->getStartLoc(),
+ "ambiguous operand size for instruction '" + Mnemonic + "\'",
+ UnsizedMemOp->getLocRange(), MatchingInlineAsm);
+ }
+
+ // If one instruction matched with a missing feature, report this as a
+ // missing feature.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_MissingFeature) == 1) {
+ ErrorInfo = ErrorInfoMissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with an invalid operand, report this as an
+ // operand failure.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_InvalidOperand) == 1) {
+ return Error(IDLoc, "invalid operand for instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
+ // If all of these were an outright failure, report it in a useless way.
+ return Error(IDLoc, "unknown instruction mnemonic", EmptyRange,
+ MatchingInlineAsm);
+}
+
+/// Returns true when \p RegNo is a member of the X86 SEGMENT_REG register
+/// class.
+bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
+  const auto &SegmentRegs = X86MCRegisterClasses[X86::SEGMENT_REGRegClassID];
+  return SegmentRegs.contains(RegNo);
+}
+
+/// Dispatch the X86-specific assembler directives handled by this parser.
+///
+/// Recognizes ".word", any directive starting with ".code", ".att_syntax",
+/// ".intel_syntax" and ".even". For a recognized directive the result of its
+/// handler (or of Error()) is returned; every other directive yields true.
+bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
+  MCAsmParser &Parser = getParser();
+  const StringRef Name = DirectiveID.getIdentifier();
+  const SMLoc DirLoc = DirectiveID.getLoc();
+
+  if (Name == ".word")
+    return ParseDirectiveWord(2, DirLoc);
+
+  if (Name.startswith(".code"))
+    return ParseDirectiveCode(Name, DirLoc);
+
+  if (Name.startswith(".att_syntax")) {
+    // An optional "prefix" argument is consumed; "noprefix" is rejected
+    // before the dialect is changed.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      const StringRef Arg = Parser.getTok().getString();
+      if (Arg == "prefix")
+        Parser.Lex();
+      else if (Arg == "noprefix")
+        return Error(DirLoc, "'.att_syntax noprefix' is not "
+                             "supported: registers must have a "
+                             "'%' prefix in .att_syntax");
+    }
+    getParser().setAssemblerDialect(0);
+    return false;
+  }
+
+  if (Name.startswith(".intel_syntax")) {
+    // The dialect is switched first; an optional "noprefix" argument is then
+    // consumed and "prefix" is rejected.
+    getParser().setAssemblerDialect(1);
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      const StringRef Arg = Parser.getTok().getString();
+      if (Arg == "noprefix")
+        Parser.Lex();
+      else if (Arg == "prefix")
+        return Error(DirLoc, "'.intel_syntax prefix' is not "
+                             "supported: registers must not have "
+                             "a '%' prefix in .intel_syntax");
+    }
+    return false;
+  }
+
+  if (Name == ".even")
+    return parseDirectiveEven(DirLoc);
+
+  // Not one of ours.
+  return true;
+}
+
+/// parseDirectiveEven
+///  ::= .even
+///
+/// Emits an alignment of 2 in the current section, using code alignment when
+/// the section reports UseCodeAlign() and value alignment otherwise. Always
+/// returns false, including after diagnosing trailing tokens.
+bool X86AsmParser::parseDirectiveEven(SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    TokError("unexpected token in directive");
+    return false;
+  }
+
+  MCStreamer &Streamer = getStreamer();
+  const MCSection *Section = Streamer.getCurrentSectionOnly();
+  if (!Section) {
+    // No current section yet: initialize the sections and query again.
+    Streamer.InitSections(false);
+    Section = Streamer.getCurrentSectionOnly();
+  }
+
+  if (!Section->UseCodeAlign())
+    Streamer.EmitValueToAlignment(2, 0, 1, 0);
+  else
+    Streamer.EmitCodeAlignment(2, 0);
+  return false;
+}
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+///
+/// Emits each comma-separated expression as a value of \p Size bytes.
+/// Constant expressions are range-checked against 8 * \p Size bits first.
+/// Only the out-of-range path returns the result of Error(); every other
+/// path, including the remaining diagnostics, returns false.
+bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
+  // An empty operand list only needs its end-of-statement token consumed.
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.Lex();
+    return false;
+  }
+
+  while (true) {
+    const MCExpr *Value;
+    SMLoc ExprLoc = getLexer().getLoc();
+    if (getParser().parseExpression(Value))
+      return false;
+
+    const auto *Constant = dyn_cast<MCConstantExpr>(Value);
+    if (!Constant) {
+      // Non-constant expressions are handed to the streamer as-is.
+      getStreamer().EmitValue(Value, Size, ExprLoc);
+    } else {
+      assert(Size <= 8 && "Invalid size");
+      uint64_t IntValue = Constant->getValue();
+      const unsigned Bits = 8 * Size;
+      // Accept the literal if it fits as either an unsigned or a signed value.
+      const bool InRange = isUIntN(Bits, IntValue) || isIntN(Bits, IntValue);
+      if (!InRange)
+        return Error(ExprLoc, "literal value out of range for directive");
+      getStreamer().EmitIntValue(IntValue, Size);
+    }
+
+    if (getLexer().is(AsmToken::EndOfStatement))
+      break;
+
+    // FIXME: Improve diagnostic.
+    if (getLexer().isNot(AsmToken::Comma)) {
+      Error(L, "unexpected token in directive");
+      return false;
+    }
+    Parser.Lex();
+  }
+
+  // Consume the end-of-statement token.
+  Parser.Lex();
+  return false;
+}
+
+/// ParseDirectiveCode
+///  ::= .code16 | .code16gcc | .code32 | .code64
+///
+/// Switches the parser to the requested mode and emits the matching assembler
+/// flag when that mode is not already active. Code16GCC is cleared on entry
+/// and set again only for ".code16gcc". Always returns false; an unrecognized
+/// directive is diagnosed through Error().
+bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  Code16GCC = false;
+
+  const bool IsCode16GCC = IDVal == ".code16gcc";
+  if (IDVal == ".code16" || IsCode16GCC) {
+    Parser.Lex();
+    // .code16gcc parses as if in 32-bit mode, but emits code in 16-bit mode.
+    if (IsCode16GCC)
+      Code16GCC = true;
+    if (!is16BitMode()) {
+      SwitchMode(X86::Mode16Bit);
+      Parser.getStreamer().EmitAssemblerFlag(MCAF_Code16);
+    }
+    return false;
+  }
+
+  if (IDVal == ".code32") {
+    Parser.Lex();
+    if (!is32BitMode()) {
+      SwitchMode(X86::Mode32Bit);
+      Parser.getStreamer().EmitAssemblerFlag(MCAF_Code32);
+    }
+    return false;
+  }
+
+  if (IDVal == ".code64") {
+    Parser.Lex();
+    if (!is64BitMode()) {
+      SwitchMode(X86::Mode64Bit);
+      Parser.getStreamer().EmitAssemblerFlag(MCAF_Code64);
+    }
+    return false;
+  }
+
+  Error(L, "unknown directive " + IDVal);
+  return false;
+}
+
+// Force static initialization: construct one RegisterMCAsmParser object for
+// the 32-bit X86 target and one for the 64-bit X86 target.
+extern "C" void LLVMInitializeX86AsmParser() {
+  RegisterMCAsmParser<X86AsmParser> RegisterX86_32(getTheX86_32Target());
+  RegisterMCAsmParser<X86AsmParser> RegisterX86_64(getTheX86_64Target());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_SUBTARGET_FEATURE_NAME
+#include "X86GenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
new file mode 100644
index 000000000000..c45a3f14ef11
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -0,0 +1,41 @@
+//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+
+#include "llvm/Support/MathExtras.h"
+
+namespace llvm {
+
+/// True when \p Value passes isInt<8>, or when it passes isUInt<16> and its
+/// low 16 bits, reinterpreted as int16_t, pass isInt<8>.
+inline bool isImmSExti16i8Value(uint64_t Value) {
+  if (isInt<8>(Value))
+    return true;
+  const auto Low16 = static_cast<int16_t>(Value);
+  return isUInt<16>(Value) && isInt<8>(Low16);
+}
+
+/// True when \p Value passes isInt<8>, or when it passes isUInt<32> and its
+/// low 32 bits, reinterpreted as int32_t, pass isInt<8>.
+inline bool isImmSExti32i8Value(uint64_t Value) {
+  if (isInt<8>(Value))
+    return true;
+  const auto Low32 = static_cast<int32_t>(Value);
+  return isUInt<32>(Value) && isInt<8>(Low32);
+}
+
+/// True when \p Value passes isInt<8>.
+inline bool isImmSExti64i8Value(uint64_t Value) {
+  const bool FitsSigned8 = isInt<8>(Value);
+  return FitsSigned8;
+}
+
+/// True when \p Value passes isInt<32>.
+inline bool isImmSExti64i32Value(uint64_t Value) {
+  const bool FitsSigned32 = isInt<32>(Value);
+  return FitsSigned32;
+}
+
+/// True when \p Value passes either isUInt<8> or isInt<8>.
+inline bool isImmUnsignedi8Value(uint64_t Value) {
+  if (isUInt<8>(Value))
+    return true;
+  return isInt<8>(Value);
+}
+
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
new file mode 100644
index 000000000000..9db1a8483bee
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -0,0 +1,546 @@
+//===-- X86Operand.h - Parsed X86 machine instruction --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+
+#include "X86AsmParserCommon.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/STLExtras.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+
+namespace llvm {
+
+/// X86Operand - Instances of this class represent a single parsed operand of
+/// an X86 machine instruction: a token, a register, an immediate or a memory
+/// reference, as selected by Kind.
+struct X86Operand : public MCParsedAsmOperand {
+  /// Discriminator selecting the active member of the anonymous union below.
+  enum KindTy {
+    Token,
+    Register,
+    Immediate,
+    Memory
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  SMLoc OffsetOfLoc;
+  StringRef SymName;
+  void *OpDecl;
+  bool AddressOf;
+
+  /// Payload of a Token operand: a non-owning pointer/length pair.
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  /// Payload of a Register operand.
+  struct RegOp {
+    unsigned RegNo;
+  };
+
+  /// Payload of an Immediate operand.
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  /// Payload of a Memory operand. A Size of 0 means the operand is unsized
+  /// (see isMemUnsized()).
+  struct MemOp {
+    unsigned SegReg;
+    const MCExpr *Disp;
+    unsigned BaseReg;
+    unsigned IndexReg;
+    unsigned Scale;
+    unsigned Size;
+    unsigned ModeSize;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+
+  /// Construct an operand of kind \p K spanning [\p Start, \p End].
+  ///
+  /// OpDecl and AddressOf are given defined defaults here because CreateToken
+  /// and CreateImm never assign them, while getOpDecl() and needAddressOf()
+  /// read them regardless of the operand kind. The union payload is filled in
+  /// by the Create* factory functions.
+  X86Operand(KindTy K, SMLoc Start, SMLoc End)
+      : Kind(K), StartLoc(Start), EndLoc(End), OpDecl(nullptr),
+        AddressOf(false) {}
+
+  StringRef getSymName() override { return SymName; }
+  void *getOpDecl() override { return OpDecl; }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+  /// getLocRange - Get the range between the first and last token of this
+  /// operand.
+  SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+  /// getOffsetOfLoc - Get the location of the offset operator.
+  SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
+
+  /// Intentionally prints nothing.
+  void print(raw_ostream &OS) const override {}
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+  /// Point this token at \p Value. Only the pointer and length are stored, so
+  /// the caller must keep the underlying characters alive.
+  void setTokenValue(StringRef Value) {
+    assert(Kind == Token && "Invalid access!");
+    Tok.Data = Value.data();
+    Tok.Length = Value.size();
+  }
+
+  unsigned getReg() const override {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNo;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getMemDisp() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Disp;
+  }
+  unsigned getMemSegReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.SegReg;
+  }
+  unsigned getMemBaseReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.BaseReg;
+  }
+  unsigned getMemIndexReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.IndexReg;
+  }
+  unsigned getMemScale() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Scale;
+  }
+  unsigned getMemModeSize() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.ModeSize;
+  }
+
+  bool isToken() const override { return Kind == Token; }
+
+  bool isImm() const override { return Kind == Immediate; }
+
+  bool isImmSExti16i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti16i8Value(CE->getValue());
+  }
+  bool isImmSExti32i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti32i8Value(CE->getValue());
+  }
+  bool isImmSExti64i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti64i8Value(CE->getValue());
+  }
+  bool isImmSExti64i32() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti64i32Value(CE->getValue());
+  }
+
+  bool isImmUnsignedi8() const {
+    if (!isImm())
+      return false;
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+    return isImmUnsignedi8Value(CE->getValue());
+  }
+
+  /// True when an offset-operator location has been recorded.
+  bool isOffsetOf() const override {
+    return OffsetOfLoc.getPointer();
+  }
+
+  bool needAddressOf() const override {
+    return AddressOf;
+  }
+
+  bool isMem() const override { return Kind == Memory; }
+  bool isMemUnsized() const {
+    return Kind == Memory && Mem.Size == 0;
+  }
+  // The sized predicates below also accept an unsized (Size == 0) operand.
+  bool isMem8() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMem16() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMem32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMem64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64);
+  }
+  bool isMem80() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 80);
+  }
+  bool isMem128() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 128);
+  }
+  bool isMem256() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 256);
+  }
+  bool isMem512() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+  }
+  /// True when the index register number lies in [\p LowR, \p HighR].
+  bool isMemIndexReg(unsigned LowR, unsigned HighR) const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR;
+  }
+
+  bool isMem64_RC128() const {
+    return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM15);
+  }
+  bool isMem128_RC128() const {
+    return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM15);
+  }
+  bool isMem128_RC256() const {
+    return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM15);
+  }
+  bool isMem256_RC128() const {
+    return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM15);
+  }
+  bool isMem256_RC256() const {
+    return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM15);
+  }
+
+  bool isMem64_RC128X() const {
+    return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM31);
+  }
+  bool isMem128_RC128X() const {
+    return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM31);
+  }
+  bool isMem128_RC256X() const {
+    return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM31);
+  }
+  bool isMem256_RC128X() const {
+    return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM31);
+  }
+  bool isMem256_RC256X() const {
+    return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM31);
+  }
+  bool isMem512_RC256X() const {
+    return isMem512() && isMemIndexReg(X86::YMM0, X86::YMM31);
+  }
+  bool isMem512_RC512() const {
+    return isMem512() && isMemIndexReg(X86::ZMM0, X86::ZMM31);
+  }
+
+  bool isAbsMem() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+           !getMemIndexReg() && getMemScale() == 1;
+  }
+  bool isAVX512RC() const {
+    return isImm();
+  }
+
+  bool isAbsMem16() const {
+    return isAbsMem() && Mem.ModeSize == 16;
+  }
+
+  /// Unscaled, index-free reference through RSI/ESI/SI with a constant zero
+  /// displacement. Must only be called on Memory operands (the getters
+  /// assert); the sized wrappers below check the kind first.
+  bool isSrcIdx() const {
+    return !getMemIndexReg() && getMemScale() == 1 &&
+           (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI ||
+            getMemBaseReg() == X86::SI) && isa<MCConstantExpr>(getMemDisp()) &&
+           cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+  }
+  bool isSrcIdx8() const {
+    return isMem8() && isSrcIdx();
+  }
+  bool isSrcIdx16() const {
+    return isMem16() && isSrcIdx();
+  }
+  bool isSrcIdx32() const {
+    return isMem32() && isSrcIdx();
+  }
+  bool isSrcIdx64() const {
+    return isMem64() && isSrcIdx();
+  }
+
+  /// Unscaled, index-free reference through RDI/EDI/DI with a constant zero
+  /// displacement and either no segment register or ES. Must only be called
+  /// on Memory operands (the getters assert).
+  bool isDstIdx() const {
+    return !getMemIndexReg() && getMemScale() == 1 &&
+           (getMemSegReg() == 0 || getMemSegReg() == X86::ES) &&
+           (getMemBaseReg() == X86::RDI || getMemBaseReg() == X86::EDI ||
+            getMemBaseReg() == X86::DI) && isa<MCConstantExpr>(getMemDisp()) &&
+           cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+  }
+  bool isDstIdx8() const {
+    return isMem8() && isDstIdx();
+  }
+  bool isDstIdx16() const {
+    return isMem16() && isDstIdx();
+  }
+  bool isDstIdx32() const {
+    return isMem32() && isDstIdx();
+  }
+  bool isDstIdx64() const {
+    return isMem64() && isDstIdx();
+  }
+
+  /// Memory operand with no base or index register and a scale of 1; unlike
+  /// isAbsMem() a segment register is allowed.
+  bool isMemOffs() const {
+    return Kind == Memory && !getMemBaseReg() && !getMemIndexReg() &&
+           getMemScale() == 1;
+  }
+
+  // isMemOffs<ModeSize>_<Size>: isMemOffs() with the given ModeSize and a
+  // Size that is either unsized (0) or equal to the given operand size.
+  bool isMemOffs16_8() const {
+    return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs16_16() const {
+    return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs16_32() const {
+    return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs32_8() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs32_16() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs32_32() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs32_64() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 64);
+  }
+  bool isMemOffs64_8() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs64_16() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs64_32() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs64_64() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64);
+  }
+
+  bool isReg() const override { return Kind == Register; }
+
+  bool isGR32orGR64() const {
+    return Kind == Register &&
+           (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+            X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+  }
+
+  /// Append \p Expr to \p Inst, as an immediate operand when it is a constant
+  /// expression and as an expression operand otherwise.
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  /// Map a 64-bit general purpose register (or RIP) to its 32-bit
+  /// counterpart. Any other register is unreachable.
+  static unsigned getGR32FromGR64(unsigned RegNo) {
+    switch (RegNo) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::RAX: return X86::EAX;
+    case X86::RCX: return X86::ECX;
+    case X86::RDX: return X86::EDX;
+    case X86::RBX: return X86::EBX;
+    case X86::RBP: return X86::EBP;
+    case X86::RSP: return X86::ESP;
+    case X86::RSI: return X86::ESI;
+    case X86::RDI: return X86::EDI;
+    case X86::R8: return X86::R8D;
+    case X86::R9: return X86::R9D;
+    case X86::R10: return X86::R10D;
+    case X86::R11: return X86::R11D;
+    case X86::R12: return X86::R12D;
+    case X86::R13: return X86::R13D;
+    case X86::R14: return X86::R14D;
+    case X86::R15: return X86::R15D;
+    case X86::RIP: return X86::EIP;
+    }
+  }
+
+  /// Add the register operand, first narrowing a GR64 register to its 32-bit
+  /// counterpart.
+  void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    unsigned RegNo = getReg();
+    if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+      RegNo = getGR32FromGR64(RegNo);
+    Inst.addOperand(MCOperand::createReg(RegNo));
+  }
+  void addAVX512RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  /// Add the five memory sub-operands in the order base, scale, index,
+  /// displacement, segment.
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 5) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+    Inst.addOperand(MCOperand::createImm(getMemScale()));
+    Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
+    addExpr(Inst, getMemDisp());
+    Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+  }
+
+  void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    // Add as immediates when possible.
+    addExpr(Inst, getMemDisp());
+  }
+
+  void addSrcIdxOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 2) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+    Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+  }
+  void addDstIdxOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+  }
+
+  void addMemOffsOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 2) && "Invalid number of operands!");
+    // Add as immediates when possible.
+    addExpr(Inst, getMemDisp());
+    Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+  }
+
+  /// Create a token operand referring to \p Str (not copied) that starts at
+  /// \p Loc and ends Str.size() characters later.
+  static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) {
+    SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
+    auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc);
+    Res->Tok.Data = Str.data();
+    Res->Tok.Length = Str.size();
+    return Res;
+  }
+
+  /// Create a register operand.
+  static std::unique_ptr<X86Operand>
+  CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+            bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(),
+            StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+    auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc);
+    Res->Reg.RegNo = RegNo;
+    Res->AddressOf = AddressOf;
+    Res->OffsetOfLoc = OffsetOfLoc;
+    Res->SymName = SymName;
+    Res->OpDecl = OpDecl;
+    return Res;
+  }
+
+  /// Create an immediate operand holding \p Val.
+  static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
+                                               SMLoc StartLoc, SMLoc EndLoc) {
+    auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
+    Res->Imm.Val = Val;
+    return Res;
+  }
+
+  /// Create an absolute memory operand.
+  static std::unique_ptr<X86Operand>
+  CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+            unsigned Size = 0, StringRef SymName = StringRef(),
+            void *OpDecl = nullptr) {
+    auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg = 0;
+    Res->Mem.Disp = Disp;
+    Res->Mem.BaseReg = 0;
+    Res->Mem.IndexReg = 0;
+    Res->Mem.Scale = 1;
+    Res->Mem.Size = Size;
+    Res->Mem.ModeSize = ModeSize;
+    Res->SymName = SymName;
+    Res->OpDecl = OpDecl;
+    Res->AddressOf = false;
+    return Res;
+  }
+
+  /// Create a generalized memory operand.
+  static std::unique_ptr<X86Operand>
+  CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp,
+            unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc,
+            SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(),
+            void *OpDecl = nullptr) {
+    // We should never just have a displacement, that should be parsed as an
+    // absolute memory operand.
+    assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+    // The scale should always be one of {1,2,4,8}.
+    assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+           "Invalid scale!");
+    auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg = SegReg;
+    Res->Mem.Disp = Disp;
+    Res->Mem.BaseReg = BaseReg;
+    Res->Mem.IndexReg = IndexReg;
+    Res->Mem.Scale = Scale;
+    Res->Mem.Size = Size;
+    Res->Mem.ModeSize = ModeSize;
+    Res->SymName = SymName;
+    Res->OpDecl = OpDecl;
+    Res->AddressOf = false;
+    return Res;
+  }
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
new file mode 100644
index 000000000000..0871888bbfcd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -0,0 +1,1083 @@
+//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains code to translate the data produced by the decoder into
+// MCInsts.
+//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets. The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+// These attributes, recorded in enum attributeBits
+// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM
+// provides a mapping from bitmasks to contexts, which are represented by
+// enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is. The
+// disassembler distinguishes four kinds of opcodes, which are enumerated in
+// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which
+ // OpcodeDecision (ibid.) to look the opcode up in. Look up the opcode to get
+// a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+// ModR/M byte to complete decode. The ModRMDecision's type is an entry from
+// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+// ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in
+// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+// meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+// (X86DisassemblerDecoderCommon.h) and its type is an entry from
+// OperandType (ibid.). The encoding indicates how to read it from the
+// instruction; the type indicates how to interpret the value once it has
+// been read. For example, a register operand could be stored in the R/M
+// field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+ // the main opcode. This is orthogonal to its meaning (a GPR or an XMM
+// register, for instance). Given this information, the operands can be
+// extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+// and operands into a format understandable by the client - in this case, an
+// MCInst for use by the MC infrastructure.
+//
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself. The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+// invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+// table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+// factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+// responsible for steps 1-6.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86DisassemblerDecoder.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+#define DEBUG_TYPE "x86-disassembler"
+
+ /// Write a "<file>:<line>: <message>" record to the LLVM debug stream.  No
+ /// newline is appended; the message is emitted exactly as passed in.
+ void llvm::X86Disassembler::Debug(const char *file, unsigned line,
+                                   const char *s) {
+   raw_ostream &Out = dbgs();
+   Out << file;
+   Out << ":";
+   Out << line;
+   Out << ": ";
+   Out << s;
+ }
+
+ /// Return the name of instruction \p Opcode.
+ ///
+ /// \p mii is an opaque handle that must really point to an MCInstrInfo; it is
+ /// cast back and queried with getName().
+ StringRef llvm::X86Disassembler::GetInstrName(unsigned Opcode,
+                                               const void *mii) {
+   return static_cast<const MCInstrInfo *>(mii)->getName(Opcode);
+ }
+
+ #define debug(s) DEBUG(Debug(__FILE__, __LINE__, s)); // forwards s to Debug() tagged with the call site's file and line, wrapped in LLVM's DEBUG() macro; note the expansion already ends in ';'
+
+ namespace llvm {
+
+ // Placeholder enumerators.  Nothing ever assigns these values; they exist
+ // solely so that an automatically-generated switch statement compiles.
+ namespace X86 {
+   enum {
+     BX_SI = 500, // the enumerators below count up from here: 501 .. 505
+     BX_DI,
+     BP_SI,
+     BP_DI,
+     sib,
+     sib64
+   };
+ } // end namespace X86
+
+ } // end namespace llvm
+
+ static bool translateInstruction(MCInst &target,
+ InternalInstruction &source,
+ const MCDisassembler *Dis); // forward declaration: getInstruction() calls this, treating a false result as success
+
+ namespace {
+
+ /// Disassembler shared by all X86 variants.  The decode mode (16-, 32- or
+ /// 64-bit) is chosen once, in the constructor, and kept in fMode.
+ class X86GenericDisassembler : public MCDisassembler {
+   /// Instruction info owned by this disassembler.
+   std::unique_ptr<const MCInstrInfo> MII;
+   /// Decode mode selected at construction time.
+   DisassemblerMode fMode;
+
+ public:
+   X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                          std::unique_ptr<const MCInstrInfo> MII);
+
+   DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+                               ArrayRef<uint8_t> Bytes, uint64_t Address,
+                               raw_ostream &vStream,
+                               raw_ostream &cStream) const override;
+ };
+
+ } // end anonymous namespace
+
+ /// Construct the disassembler and derive the decode mode from the subtarget's
+ /// feature bits.  Mode16Bit is tested first, then Mode32Bit, then Mode64Bit;
+ /// having none of them set is treated as unreachable.
+ X86GenericDisassembler::X86GenericDisassembler(
+     const MCSubtargetInfo &STI, MCContext &Ctx,
+     std::unique_ptr<const MCInstrInfo> MII)
+     : MCDisassembler(STI, Ctx), MII(std::move(MII)) {
+   const FeatureBitset &Features = STI.getFeatureBits();
+
+   if (Features[X86::Mode16Bit]) {
+     fMode = MODE_16BIT;
+     return;
+   }
+   if (Features[X86::Mode32Bit]) {
+     fMode = MODE_32BIT;
+     return;
+   }
+   if (Features[X86::Mode64Bit]) {
+     fMode = MODE_64BIT;
+     return;
+   }
+
+   // No mode bit was set.
+   llvm_unreachable("Invalid CPU mode");
+ }
+
+ namespace {
+ /// A run of bytes paired with the address that corresponds to its first byte.
+ struct Region {
+   ArrayRef<uint8_t> Bytes;
+   uint64_t Base;
+
+   Region(ArrayRef<uint8_t> Data, uint64_t BaseAddr)
+       : Bytes(Data), Base(BaseAddr) {}
+ };
+ } // end anonymous namespace
+
+ /// Reader callback handed to decodeInstruction(): fetches one byte from a
+ /// Region.
+ ///
+ /// @param Arg     - The generic callback parameter. In this case, this must
+ ///                  be a pointer to a Region.
+ /// @param Byte    - Receives the byte that was read.
+ /// @param Address - The address to be read; it is turned into an offset into
+ ///                  the region by subtracting the region's base address.
+ /// @return        - 0 on success; -1 if Address lies outside the region.
+ static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) {
+   auto *R = static_cast<const Region *>(Arg);
+   ArrayRef<uint8_t> Bytes = R->Bytes;
+   // Keep the offset 64 bits wide.  Narrowing it to 'unsigned' would let an
+   // offset of 2^32 or more (or the wrapped result when Address < Base) alias
+   // a valid index and hand back an unrelated byte instead of failing.
+   uint64_t Index = Address - R->Base;
+   if (Bytes.size() <= Index)
+     return -1;
+   // Index < Bytes.size() here, so it fits in size_t.
+   *Byte = Bytes[static_cast<size_t>(Index)];
+   return 0;
+ }
+
+ /// logger - Logging callback that writes one line to a raw_ostream.
+ ///
+ /// @param arg - The generic callback parameter.  This should be a pointer
+ ///              to a raw_ostream; a null pointer makes the call a no-op.
+ /// @param log - A string to be logged.  logger() adds a newline.
+ static void logger(void* arg, const char* log) {
+   if (arg) {
+     auto *Stream = static_cast<raw_ostream *>(arg);
+     *Stream << log << "\n";
+   }
+ }
+
+//
+// Public interface for the disassembler
+//
+
+ /// Decode a single instruction from Bytes and translate it into Instr.
+ ///
+ /// Size receives the decoded instruction length when decoding succeeds, or
+ /// the distance between the reader cursor and Address when it fails.  The
+ /// result is Success only if both the decode and the translation to an MCInst
+ /// succeed; otherwise it is Fail.  Decoder log output goes to VStream, unless
+ /// that stream is nulls(), in which case no logging callback is installed.
+ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
+     MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+     raw_ostream &VStream, raw_ostream &CStream) const {
+   CommentStream = &CStream;
+
+   InternalInstruction InternalInstr;
+
+   // Install the logging callback only when the output is not being discarded.
+   dlog_t LoggerFn = nullptr;
+   if (&VStream != &nulls())
+     LoggerFn = logger;
+
+   Region R(Bytes, Address);
+
+   // Opaque arguments passed through the decoder to its callbacks.
+   const void *ReaderArg = &R;
+   void *LoggerArg = &VStream;
+   const void *InstrInfoArg = MII.get();
+
+   const int DecodeError =
+       decodeInstruction(&InternalInstr, regionReader, ReaderArg, LoggerFn,
+                         LoggerArg, InstrInfoArg, Address, fMode);
+   if (DecodeError) {
+     Size = InternalInstr.readerCursor - Address;
+     return Fail;
+   }
+
+   Size = InternalInstr.length;
+   // translateInstruction() returns true when it could not translate.
+   const bool TranslateFailed = translateInstruction(Instr, InternalInstr, this);
+   return TranslateFailed ? Fail : Success;
+ }
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+ /// translateRegister - Translates an internal register to the appropriate LLVM
+ ///   register, and appends it as an operand to an MCInst.
+ ///
+ /// @param mcInst - The MCInst to append to.
+ /// @param reg    - The Reg to append; used as an index into the table
+ ///                 generated from ALL_REGS.
+ static void translateRegister(MCInst &mcInst, Reg reg) {
+   // The table is static so it is built once instead of on every call, and
+   // its elements are 16 bits wide so a register number above 255 can never
+   // be silently truncated.
+ #define ENTRY(x) X86::x,
+   static const uint16_t llvmRegnums[] = {
+     ALL_REGS
+     0
+   };
+ #undef ENTRY
+
+   mcInst.addOperand(MCOperand::createReg(llvmRegnums[reg]));
+ }
+
+ /// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+ ///   immediate Value in the MCInst.
+ ///
+ /// This is a thin wrapper that forwards to
+ /// MCDisassembler::tryAddingSymbolicOperand() on Dis.
+ ///
+ /// @param Value    - The immediate Value, has had any PC adjustment made by
+ ///                   the caller.
+ /// @param isBranch - If the instruction is a branch instruction
+ /// @param Address  - The starting address of the instruction
+ /// @param Offset   - The byte offset to this immediate in the instruction
+ /// @param Width    - The byte width of this immediate in the instruction
+ /// @param MI       - The MCInst that receives the operand
+ /// @param Dis      - The disassembler that performs the symbolic lookup
+ ///
+ /// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+ /// called then that function is called to get any symbolic information for the
+ /// immediate in the instruction using the Address, Offset and Width.  If that
+ /// returns non-zero then the symbolic information it returns is used to create
+ /// an MCExpr and that is added as an operand to the MCInst.  If getOpInfo()
+ /// returns zero and isBranch is true then a symbol look up for immediate Value
+ /// is done and if a symbol is found an MCExpr is created with that, else
+ /// an MCExpr with the immediate Value is created.  This function returns true
+ /// if it adds an operand to the MCInst and false otherwise.
+ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+                                      uint64_t Address, uint64_t Offset,
+                                      uint64_t Width, MCInst &MI,
+                                      const MCDisassembler *Dis) {
+   const bool AddedOperand =
+       Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset, Width);
+   return AddedOperand;
+ }
+
+ /// tryAddingPcLoadReferenceComment - tries to add a comment as to what is
+ /// being referenced by a load instruction whose base register is the rip.
+ /// These can often be addresses in a literal pool.  The Address of the
+ /// instruction and its immediate Value are used to determine the address
+ /// being referenced in the literal pool entry.  The SymbolLookUp call back will
+ /// return a pointer to a literal 'C' string if the referenced address is an
+ /// address into a section with 'C' string literals.
+ ///
+ /// Decoder must point to an MCDisassembler; the work is delegated to its
+ /// method of the same name, which takes Value first and Address second.
+ static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
+                                             const void *Decoder) {
+   static_cast<const MCDisassembler *>(Decoder)
+       ->tryAddingPcLoadReferenceComment(Value, Address);
+ }
+
+ static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = { // LLVM segment register per segment-override value; indexed with insn.segmentOverride
+ 0, // SEG_OVERRIDE_NONE: register number 0
+ X86::CS,
+ X86::SS,
+ X86::DS,
+ X86::ES,
+ X86::FS,
+ X86::GS
+ };
+
+ /// translateSrcIndex - Appends a source index operand to an MCInst: the
+ ///   SI-family register matching the current mode, followed by the segment
+ ///   register selected by the instruction's segment override.
+ ///
+ /// @param mcInst - The MCInst to append to.
+ /// @param insn   - The internal instruction.
+ /// @return       - Always false.
+ static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) {
+   // A 0x67 prefix switches to the alternate register width for the mode.
+   const bool HasPrefix67 = insn.prefixPresent[0x67];
+   unsigned IndexRegNo;
+
+   switch (insn.mode) {
+   case MODE_64BIT:
+     IndexRegNo = HasPrefix67 ? X86::ESI : X86::RSI;
+     break;
+   case MODE_32BIT:
+     IndexRegNo = HasPrefix67 ? X86::SI : X86::ESI;
+     break;
+   default:
+     assert(insn.mode == MODE_16BIT);
+     IndexRegNo = HasPrefix67 ? X86::ESI : X86::SI;
+     break;
+   }
+
+   mcInst.addOperand(MCOperand::createReg(IndexRegNo));
+   mcInst.addOperand(
+       MCOperand::createReg(segmentRegnums[insn.segmentOverride]));
+   return false;
+ }
+
+ /// translateDstIndex - Appends a destination index operand to an MCInst: the
+ ///   DI-family register matching the current mode.
+ ///
+ /// @param mcInst - The MCInst to append to.
+ /// @param insn   - The internal instruction.
+ /// @return       - Always false.
+ static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) {
+   // A 0x67 prefix switches to the alternate register width for the mode.
+   const bool HasPrefix67 = insn.prefixPresent[0x67];
+   unsigned IndexRegNo;
+
+   switch (insn.mode) {
+   case MODE_64BIT:
+     IndexRegNo = HasPrefix67 ? X86::EDI : X86::RDI;
+     break;
+   case MODE_32BIT:
+     IndexRegNo = HasPrefix67 ? X86::DI : X86::EDI;
+     break;
+   default:
+     assert(insn.mode == MODE_16BIT);
+     IndexRegNo = HasPrefix67 ? X86::EDI : X86::DI;
+     break;
+   }
+
+   mcInst.addOperand(MCOperand::createReg(IndexRegNo));
+   return false;
+ }
+
+ /// translateImmediate - Appends an immediate operand to an MCInst.
+ ///
+ /// The value is first sign-extended according to the operand's type and
+ /// encoding.  For comparison-predicate immediates (TYPE_IMM3, TYPE_IMM5 and
+ /// TYPE_AVX512ICC) whose value falls outside the range that gets special
+ /// printing, the opcode is swapped for its "_alt" counterpart.  Operand types
+ /// that carry a register number in the immediate (XMM/YMM/ZMM/BND) are
+ /// appended as a register operand, computed from the immediate shifted right
+ /// by four, and nothing else is added.  Every other operand is appended
+ /// symbolically when possible and as a plain immediate otherwise; TYPE_MOFFS*
+ /// operands additionally get the segment register appended.
+ ///
+ /// @param mcInst    - The MCInst to append to.
+ /// @param immediate - The immediate value to append.
+ /// @param operand   - The operand, as stored in the descriptor table.
+ /// @param insn      - The internal instruction.
+ /// @param Dis       - The disassembler used for symbolic operand lookup.
+ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
+                                const OperandSpecifier &operand,
+                                InternalInstruction &insn,
+                                const MCDisassembler *Dis) {
+   // Sign-extend the immediate if necessary.
+
+   OperandType type = (OperandType)operand.type;
+
+   bool isBranch = false;
+   uint64_t pcrel = 0;
+   if (type == TYPE_RELv) {
+     isBranch = true;
+     pcrel = insn.startLocation +
+             insn.immediateOffset + insn.immediateSize;
+     // NOTE(review): this switches on displacementSize although the value
+     // being extended is an immediate - confirm that this is intended.
+     switch (insn.displacementSize) {
+     default:
+       break;
+     case 1:
+       if(immediate & 0x80)
+         immediate |= ~(0xffull);
+       break;
+     case 2:
+       if(immediate & 0x8000)
+         immediate |= ~(0xffffull);
+       break;
+     case 4:
+       if(immediate & 0x80000000)
+         immediate |= ~(0xffffffffull);
+       break;
+     case 8:
+       break;
+     }
+   }
+   // By default sign-extend all X86 immediates based on their encoding.
+   else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
+            type == TYPE_IMM64 || type == TYPE_IMMv) {
+     switch (operand.encoding) {
+     default:
+       break;
+     case ENCODING_IB:
+       if(immediate & 0x80)
+         immediate |= ~(0xffull);
+       break;
+     case ENCODING_IW:
+       if(immediate & 0x8000)
+         immediate |= ~(0xffffull);
+       break;
+     case ENCODING_ID:
+       if(immediate & 0x80000000)
+         immediate |= ~(0xffffffffull);
+       break;
+     case ENCODING_IO:
+       break;
+     }
+   } else if (type == TYPE_IMM3) {
+     // Check for immediates that printSSECC can't handle.
+     if (immediate >= 8) {
+       unsigned NewOpc;
+       switch (mcInst.getOpcode()) {
+       default: llvm_unreachable("unexpected opcode");
+       case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break;
+       case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break;
+       case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break;
+       case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break;
+       case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break;
+       case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break;
+       case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break;
+       case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break;
+       case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break;
+       case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break;
+       case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break;
+       case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break;
+       case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break;
+       case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break;
+       case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break;
+       case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break;
+       case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break;
+       case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break;
+       case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break;
+       case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break;
+       case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break;
+       case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break;
+       case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break;
+       case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break;
+       }
+       // Switch opcode to the one that doesn't get special printing.
+       mcInst.setOpcode(NewOpc);
+     }
+   } else if (type == TYPE_IMM5) {
+     // Check for immediates that printAVXCC can't handle.
+     if (immediate >= 32) {
+       unsigned NewOpc;
+       switch (mcInst.getOpcode()) {
+       default: llvm_unreachable("unexpected opcode");
+       case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break;
+       case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break;
+       case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break;
+       case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break;
+       case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break;
+       case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break;
+       case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break;
+       case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break;
+       case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break;
+       case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break;
+       case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break;
+       case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break;
+       case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break;
+       case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break;
+       case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break;
+       case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break;
+       case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break;
+       case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break;
+       case X86::VCMPPDZ128rmi: NewOpc = X86::VCMPPDZ128rmi_alt; break;
+       case X86::VCMPPDZ128rri: NewOpc = X86::VCMPPDZ128rri_alt; break;
+       case X86::VCMPPSZ128rmi: NewOpc = X86::VCMPPSZ128rmi_alt; break;
+       case X86::VCMPPSZ128rri: NewOpc = X86::VCMPPSZ128rri_alt; break;
+       case X86::VCMPPDZ256rmi: NewOpc = X86::VCMPPDZ256rmi_alt; break;
+       case X86::VCMPPDZ256rri: NewOpc = X86::VCMPPDZ256rri_alt; break;
+       case X86::VCMPPSZ256rmi: NewOpc = X86::VCMPPSZ256rmi_alt; break;
+       case X86::VCMPPSZ256rri: NewOpc = X86::VCMPPSZ256rri_alt; break;
+       case X86::VCMPSDZrm_Int: NewOpc = X86::VCMPSDZrmi_alt; break;
+       case X86::VCMPSDZrr_Int: NewOpc = X86::VCMPSDZrri_alt; break;
+       case X86::VCMPSDZrrb_Int: NewOpc = X86::VCMPSDZrrb_alt; break;
+       case X86::VCMPSSZrm_Int: NewOpc = X86::VCMPSSZrmi_alt; break;
+       case X86::VCMPSSZrr_Int: NewOpc = X86::VCMPSSZrri_alt; break;
+       case X86::VCMPSSZrrb_Int: NewOpc = X86::VCMPSSZrrb_alt; break;
+       }
+       // Switch opcode to the one that doesn't get special printing.
+       mcInst.setOpcode(NewOpc);
+     }
+   } else if (type == TYPE_AVX512ICC) {
+     // Values of 8 and above, and values whose low two bits are both set, are
+     // routed to the "_alt" opcode as well.
+     if (immediate >= 8 || ((immediate & 0x3) == 3)) {
+       unsigned NewOpc;
+       switch (mcInst.getOpcode()) {
+       default: llvm_unreachable("unexpected opcode");
+       case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break;
+       case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break;
+       case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break;
+       case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break;
+       case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break;
+       case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break;
+       case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break;
+       case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break;
+       case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break;
+       case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break;
+       case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break;
+       case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break;
+       case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break;
+       case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break;
+       case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break;
+       case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break;
+       case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break;
+       case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break;
+       case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break;
+       case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break;
+       case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break;
+       case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break;
+       case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break;
+       case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break;
+       case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break;
+       case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break;
+       case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break;
+       case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break;
+       case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break;
+       case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break;
+       case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break;
+       case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break;
+       case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break;
+       case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break;
+       case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break;
+       case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break;
+       case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break;
+       case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break;
+       case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break;
+       case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break;
+       case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break;
+       case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break;
+       case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break;
+       case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break;
+       case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPQZrmibk_alt; break;
+       case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break;
+       case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break;
+       case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break;
+       case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break;
+       case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; break;
+       case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break;
+       case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break;
+       case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break;
+       case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break;
+       case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break;
+       case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break;
+       case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break;
+       case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break;
+       case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break;
+       case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break;
+       case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break;
+       case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break;
+       case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break;
+       case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break;
+       case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break;
+       case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break;
+       case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break;
+       case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break;
+       case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break;
+       case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break;
+       case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break;
+       case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break;
+       case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break;
+       case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break;
+       case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break;
+       case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break;
+       case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break;
+       case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break;
+       case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break;
+       case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break;
+       case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break;
+       case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break;
+       case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break;
+       case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break;
+       case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break;
+       case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break;
+       case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break;
+       case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break;
+       case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break;
+       case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break;
+       case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break;
+       case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break;
+       case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break;
+       case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break;
+       case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break;
+       case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break;
+       case X86::VPCMPUWZ128rmi: NewOpc = X86::VPCMPUWZ128rmi_alt; break;
+       case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break;
+       case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break;
+       case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break;
+       case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break;
+       case X86::VPCMPUWZ256rmik: NewOpc = X86::VPCMPUWZ256rmik_alt; break;
+       case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break;
+       case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break;
+       case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break;
+       case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break;
+       case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break;
+       case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break;
+       case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break;
+       case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break;
+       case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break;
+       case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break;
+       case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break;
+       case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break;
+       case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break;
+       case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break;
+       case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break;
+       case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break;
+       case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break;
+       case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break;
+       }
+       // Switch opcode to the one that doesn't get special printing.
+       mcInst.setOpcode(NewOpc);
+     }
+   }
+
+   switch (type) {
+   // Register-in-immediate operands: append the register and stop; no
+   // immediate operand is added for them.
+   case TYPE_XMM32:
+   case TYPE_XMM64:
+   case TYPE_XMM128:
+     mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4)));
+     return;
+   case TYPE_XMM256:
+     mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4)));
+     return;
+   case TYPE_XMM512:
+     mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
+     return;
+   case TYPE_BNDR:
+     mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4)));
+     // Return like the other register cases.  Without this the case fell
+     // through into TYPE_REL8, which marked the operand as a branch target
+     // and appended a second, bogus operand.
+     return;
+   case TYPE_REL8:
+     isBranch = true;
+     pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+     if (immediate & 0x80)
+       immediate |= ~(0xffull);
+     break;
+   case TYPE_REL16:
+     isBranch = true;
+     pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+     if (immediate & 0x8000)
+       immediate |= ~(0xffffull);
+     break;
+   case TYPE_REL32:
+   case TYPE_REL64:
+     isBranch = true;
+     pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+     if(immediate & 0x80000000)
+       immediate |= ~(0xffffffffull);
+     break;
+   default:
+     // operand is 64 bits wide.  Do nothing.
+     break;
+   }
+
+   // Prefer a symbolic operand; fall back to the raw immediate value.
+   if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
+                                insn.immediateOffset, insn.immediateSize,
+                                mcInst, Dis))
+     mcInst.addOperand(MCOperand::createImm(immediate));
+
+   // Memory-offset operands are followed by their segment register.
+   if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 ||
+       type == TYPE_MOFFS32 || type == TYPE_MOFFS64) {
+     MCOperand segmentReg;
+     segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+     mcInst.addOperand(segmentReg);
+   }
+ }
+
+ /// translateRMRegister - Translates a register stored in the R/M field of the
+ /// ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+ /// @param mcInst - The MCInst to append to.
+ /// @param insn - The internal instruction to extract the R/M field
+ /// from (only insn.eaBase is consulted).
+ /// @return - false on success; true otherwise
+ static bool translateRMRegister(MCInst &mcInst,
+ InternalInstruction &insn) {
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ debug("A R/M register operand may not have a SIB byte");
+ return true;
+ }
+
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected EA base register");
+ return true;
+ case EA_BASE_NONE:
+ debug("EA_BASE_NONE for ModR/M base");
+ return true;
+ #define ENTRY(x) case EA_BASE_##x:
+ ALL_EA_BASES // one case label per EA_BASE_* value; all of them share the error path below
+ #undef ENTRY
+ debug("A R/M register operand may not have a base; "
+ "the operand must be a register.");
+ return true;
+ #define ENTRY(x) \
+ case EA_REG_##x: \
+ mcInst.addOperand(MCOperand::createReg(X86::x)); break;
+ ALL_REGS // one case per EA_REG_* value: append the X86 register of the same name
+ #undef ENTRY
+ }
+
+ return false; // only reached from an EA_REG_* case, i.e. a register operand was appended
+ }
+
+/// translateRMMemory - Translates a memory operand stored in the Mod and R/M
+/// fields of an internal instruction (and possibly its SIB byte) to a memory
+/// operand in LLVM's format, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @return - 0 on success; nonzero otherwise
+static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ // Addresses in an MCInst are represented as five operands:
+ // 1. basereg (register) The R/M base, or (if there is a SIB) the
+ // SIB base
+ // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified
+ // scale amount
+ // 3. indexreg (register) x86_registerNONE, or (if there is a SIB)
+ // the index (which is multiplied by the
+ // scale amount)
+ // 4. displacement (immediate) 0, or the displacement if there is one
+ // 5. segmentreg (register) x86_registerNONE for now, but could be set
+ // if we have segment overrides
+
+ MCOperand baseReg;
+ MCOperand scaleAmount;
+ MCOperand indexReg;
+ MCOperand displacement;
+ MCOperand segmentReg;
+ uint64_t pcrel = 0;
+
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ if (insn.sibBase != SIB_BASE_NONE) {
+ switch (insn.sibBase) {
+ default:
+ debug("Unexpected sibBase");
+ return true;
+#define ENTRY(x) \
+ case SIB_BASE_##x: \
+ baseReg = MCOperand::createReg(X86::x); break;
+ ALL_SIB_BASES
+#undef ENTRY
+ }
+ } else {
+ baseReg = MCOperand::createReg(0);
+ }
+
+ // Check whether we are handling VSIB addressing mode for GATHER.
+ // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
+ // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
+ // I don't see a way to get the correct IndexReg in readSIB:
+ // We can tell whether it is VSIB or SIB after instruction ID is decoded,
+ // but instruction ID may not be decoded yet when calling readSIB.
+ uint32_t Opcode = mcInst.getOpcode();
+ bool IndexIs128 = (Opcode == X86::VGATHERDPDrm ||
+ Opcode == X86::VGATHERDPDYrm ||
+ Opcode == X86::VGATHERQPDrm ||
+ Opcode == X86::VGATHERDPSrm ||
+ Opcode == X86::VGATHERQPSrm ||
+ Opcode == X86::VPGATHERDQrm ||
+ Opcode == X86::VPGATHERDQYrm ||
+ Opcode == X86::VPGATHERQQrm ||
+ Opcode == X86::VPGATHERDDrm ||
+ Opcode == X86::VPGATHERQDrm);
+ bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
+ Opcode == X86::VGATHERDPSYrm ||
+ Opcode == X86::VGATHERQPSYrm ||
+ Opcode == X86::VGATHERDPDZrm ||
+ Opcode == X86::VPGATHERDQZrm ||
+ Opcode == X86::VPGATHERQQYrm ||
+ Opcode == X86::VPGATHERDDYrm ||
+ Opcode == X86::VPGATHERQDYrm);
+ bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm ||
+ Opcode == X86::VGATHERDPSZrm ||
+ Opcode == X86::VGATHERQPSZrm ||
+ Opcode == X86::VPGATHERQQZrm ||
+ Opcode == X86::VPGATHERDDZrm ||
+ Opcode == X86::VPGATHERQDZrm);
+ if (IndexIs128 || IndexIs256 || IndexIs512) {
+ unsigned IndexOffset = insn.sibIndex -
+ (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
+ SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 :
+ IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
+ insn.sibIndex = (SIBIndex)(IndexBase +
+ (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
+ }
+
+ if (insn.sibIndex != SIB_INDEX_NONE) {
+ switch (insn.sibIndex) {
+ default:
+ debug("Unexpected sibIndex");
+ return true;
+#define ENTRY(x) \
+ case SIB_INDEX_##x: \
+ indexReg = MCOperand::createReg(X86::x); break;
+ EA_BASES_32BIT
+ EA_BASES_64BIT
+ REGS_XMM
+ REGS_YMM
+ REGS_ZMM
+#undef ENTRY
+ }
+ } else {
+ indexReg = MCOperand::createReg(0);
+ }
+
+ scaleAmount = MCOperand::createImm(insn.sibScale);
+ } else {
+ switch (insn.eaBase) {
+ case EA_BASE_NONE:
+ if (insn.eaDisplacement == EA_DISP_NONE) {
+ debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
+ return true;
+ }
+ if (insn.mode == MODE_64BIT){
+ pcrel = insn.startLocation +
+ insn.displacementOffset + insn.displacementSize;
+ tryAddingPcLoadReferenceComment(insn.startLocation +
+ insn.displacementOffset,
+ insn.displacement + pcrel, Dis);
+ baseReg = MCOperand::createReg(X86::RIP); // Section 2.2.1.6
+ }
+ else
+ baseReg = MCOperand::createReg(0);
+
+ indexReg = MCOperand::createReg(0);
+ break;
+ case EA_BASE_BX_SI:
+ baseReg = MCOperand::createReg(X86::BX);
+ indexReg = MCOperand::createReg(X86::SI);
+ break;
+ case EA_BASE_BX_DI:
+ baseReg = MCOperand::createReg(X86::BX);
+ indexReg = MCOperand::createReg(X86::DI);
+ break;
+ case EA_BASE_BP_SI:
+ baseReg = MCOperand::createReg(X86::BP);
+ indexReg = MCOperand::createReg(X86::SI);
+ break;
+ case EA_BASE_BP_DI:
+ baseReg = MCOperand::createReg(X86::BP);
+ indexReg = MCOperand::createReg(X86::DI);
+ break;
+ default:
+ indexReg = MCOperand::createReg(0);
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected eaBase");
+ return true;
+ // Here, we will use the fill-ins defined above. However,
+ // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
+ // sib and sib64 were handled in the top-level if, so they're only
+ // placeholders to keep the compiler happy.
+#define ENTRY(x) \
+ case EA_BASE_##x: \
+ baseReg = MCOperand::createReg(X86::x); break;
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) case EA_REG_##x:
+ ALL_REGS
+#undef ENTRY
+ debug("A R/M memory operand may not be a register; "
+ "the base field must be a base.");
+ return true;
+ }
+ }
+
+ scaleAmount = MCOperand::createImm(1);
+ }
+
+ displacement = MCOperand::createImm(insn.displacement);
+
+ segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+
+ mcInst.addOperand(baseReg);
+ mcInst.addOperand(scaleAmount);
+ mcInst.addOperand(indexReg);
+ if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false,
+ insn.startLocation, insn.displacementOffset,
+ insn.displacementSize, mcInst, Dis))
+ mcInst.addOperand(displacement);
+ mcInst.addOperand(segmentReg);
+ return false;
+}
+
+/// translateRM - Translates an operand stored in the R/M (and possibly SIB)
+/// byte of an instruction to LLVM form, and appends it to an MCInst.
+/// Register-like operand types are handed to translateRMRegister(); memory
+/// operand types (including LEA) are handed to translateRMMemory().
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @param Dis - The disassembler; only forwarded to translateRMMemory().
+/// @return - true if the operand type is not valid for an R/M operand;
+/// otherwise the result of the selected helper (false on
+/// success, true on failure).
+static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
+ InternalInstruction &insn, const MCDisassembler *Dis) {
+ switch (operand.type) {
+ default:
+ debug("Unexpected type for a R/M operand");
+ return true;
+ // Operand types that name a register.
+ case TYPE_R8:
+ case TYPE_R16:
+ case TYPE_R32:
+ case TYPE_R64:
+ case TYPE_Rv:
+ case TYPE_MM64:
+ case TYPE_XMM32:
+ case TYPE_XMM64:
+ case TYPE_XMM128:
+ case TYPE_XMM256:
+ case TYPE_XMM512:
+ case TYPE_VK1:
+ case TYPE_VK2:
+ case TYPE_VK4:
+ case TYPE_VK8:
+ case TYPE_VK16:
+ case TYPE_VK32:
+ case TYPE_VK64:
+ case TYPE_DEBUGREG:
+ case TYPE_CONTROLREG:
+ case TYPE_BNDR:
+ return translateRMRegister(mcInst, insn);
+ // Operand types that name a memory location (or, for LEA, an address).
+ case TYPE_M:
+ case TYPE_M8:
+ case TYPE_M16:
+ case TYPE_M32:
+ case TYPE_M64:
+ case TYPE_M128:
+ case TYPE_M256:
+ case TYPE_M512:
+ case TYPE_Mv:
+ case TYPE_M32FP:
+ case TYPE_M64FP:
+ case TYPE_M80FP:
+ case TYPE_M1616:
+ case TYPE_M1632:
+ case TYPE_M1664:
+ case TYPE_LEA:
+ return translateRMMemory(mcInst, insn, Dis);
+ }
+}
+
+/// translateFPRegister - Appends the FPU stack register selected by a stack
+/// position to an MCInst as a register operand.
+///
+/// @param mcInst - The MCInst that receives the operand.
+/// @param stackPos - The stack position, used as an offset from X86::ST0.
+static void translateFPRegister(MCInst &mcInst,
+                                uint8_t stackPos) {
+  const MCOperand stackReg = MCOperand::createReg(X86::ST0 + stackPos);
+  mcInst.addOperand(stackReg);
+}
+
+/// translateMaskRegister - Appends the mask register with the given number
+/// to an MCInst as a register operand.
+///
+/// @param mcInst - The MCInst that receives the operand.
+/// @param maskRegNum - Mask register number; only 0 through 7 are accepted.
+/// @return - false if the operand was appended; true if the number
+/// was out of range (nothing is appended in that case).
+static bool translateMaskRegister(MCInst &mcInst,
+                                  uint8_t maskRegNum) {
+  const bool inRange = maskRegNum < 8;
+
+  if (inRange) {
+    // The register is addressed as an offset from X86::K0.
+    mcInst.addOperand(MCOperand::createReg(X86::K0 + maskRegNum));
+    return false;
+  }
+
+  debug("Invalid mask register number");
+  return true;
+}
+
+/// translateOperand - Translates an operand stored in an internal instruction
+/// to LLVM's format and appends it to an MCInst. The operand's encoding
+/// selects which translate*() helper does the work.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The internal instruction.
+/// @param Dis - The disassembler; forwarded to the helpers that take it.
+/// @return - false on success; true otherwise.
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ switch (operand.encoding) {
+ default:
+ debug("Unhandled operand encoding during translation");
+ return true;
+ case ENCODING_REG:
+ translateRegister(mcInst, insn.reg);
+ return false;
+ case ENCODING_WRITEMASK:
+ return translateMaskRegister(mcInst, insn.writemask);
+ // CASE_ENCODING_RM is a macro that supplies the case label(s) here.
+ CASE_ENCODING_RM:
+ return translateRM(mcInst, operand, insn, Dis);
+ case ENCODING_IB:
+ case ENCODING_IW:
+ case ENCODING_ID:
+ case ENCODING_IO:
+ case ENCODING_Iv:
+ case ENCODING_Ia:
+ // Immediates are taken in order: the counter picks the next untranslated
+ // entry of insn.immediates and is advanced for the following operand.
+ translateImmediate(mcInst,
+ insn.immediates[insn.numImmediatesTranslated++],
+ operand,
+ insn,
+ Dis);
+ return false;
+ case ENCODING_SI:
+ return translateSrcIndex(mcInst, insn);
+ case ENCODING_DI:
+ return translateDstIndex(mcInst, insn);
+ case ENCODING_RB:
+ case ENCODING_RW:
+ case ENCODING_RD:
+ case ENCODING_RO:
+ case ENCODING_Rv:
+ translateRegister(mcInst, insn.opcodeRegister);
+ return false;
+ case ENCODING_FP:
+ // The stack position is the low three bits of the ModR/M byte.
+ translateFPRegister(mcInst, insn.modRM & 7);
+ return false;
+ case ENCODING_VVVV:
+ translateRegister(mcInst, insn.vvvv);
+ return false;
+ case ENCODING_DUP:
+ // A DUP operand repeats another operand of the same instruction:
+ // operand.type - TYPE_DUP0 is the index of that operand in insn.operands,
+ // which is translated again by recursion.
+ return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0],
+ insn, Dis);
+ }
+}
+
+/// translateInstruction - Rebuilds an MCInst from a decoded internal
+/// instruction: resets the MCInst, sets its opcode, then appends every
+/// operand whose encoding is not ENCODING_NONE.
+///
+/// @param mcInst - The MCInst to populate with the instruction's data.
+/// @param insn - The internal instruction.
+/// @param Dis - The disassembler; forwarded to translateOperand().
+/// @return - false on success; true if the instruction has no
+/// specification or any operand fails to translate.
+static bool translateInstruction(MCInst &mcInst,
+                                 InternalInstruction &insn,
+                                 const MCDisassembler *Dis) {
+  if (!insn.spec) {
+    debug("Instruction has no specification");
+    return true;
+  }
+
+  mcInst.clear();
+  mcInst.setOpcode(insn.instructionID);
+
+  // The prefix reader may have decided that an overlapping 0xf2/0xf3 byte
+  // means xacquire/xrelease; in that case swap the rep/repne opcodes for
+  // their xrelease/xacquire counterparts.
+  if (insn.xAcquireRelease) {
+    switch (mcInst.getOpcode()) {
+    case X86::REP_PREFIX:
+      mcInst.setOpcode(X86::XRELEASE_PREFIX);
+      break;
+    case X86::REPNE_PREFIX:
+      mcInst.setOpcode(X86::XACQUIRE_PREFIX);
+      break;
+    default:
+      break;
+    }
+  }
+
+  // translateOperand() consumes immediates in order, starting from the first.
+  insn.numImmediatesTranslated = 0;
+
+  for (const auto &Op : insn.operands) {
+    if (Op.encoding == ENCODING_NONE)
+      continue;
+    if (translateOperand(mcInst, Op, insn, Dis))
+      return true;
+  }
+
+  return false;
+}
+
+/// createX86Disassembler - Factory registered with the TargetRegistry.
+/// Creates the target's MCInstrInfo and moves ownership of it into a newly
+/// allocated X86GenericDisassembler, which is returned as a raw pointer.
+static MCDisassembler *createX86Disassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
+ return new X86GenericDisassembler(STI, Ctx, std::move(MII));
+}
+
+/// Registers createX86Disassembler as the MCDisassembler factory for the
+/// 32-bit and then the 64-bit X86 target.
+extern "C" void LLVMInitializeX86Disassembler() {
+  Target &X86_32 = getTheX86_32Target();
+  TargetRegistry::RegisterMCDisassembler(X86_32, createX86Disassembler);
+
+  Target &X86_64 = getTheX86_64Target();
+  TargetRegistry::RegisterMCDisassembler(X86_64, createX86Disassembler);
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
new file mode 100644
index 000000000000..ab64d6fcf70b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -0,0 +1,1901 @@
+//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the implementation of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdarg> /* for va_*() */
+#include <cstdio> /* for vsnprintf() */
+#include <cstdlib> /* for exit() */
+#include <cstring> /* for memset() */
+
+#include "X86DisassemblerDecoder.h"
+
+using namespace llvm::X86Disassembler;
+
+/// Specifies whether a ModR/M byte is needed and (if so) which
+/// instruction each possible value of the ModR/M byte corresponds to. Once
+/// this information is known, we have narrowed down to a single instruction.
+struct ModRMDecision {
+ /// One of the MODRM_* kinds; decode() switches on it to choose how the
+ /// ModR/M byte selects an entry.
+ uint8_t modrm_type;
+ /// Index into modRMTable of the first instruction ID for this opcode;
+ /// decode() adds a ModR/M-derived offset to it.
+ uint16_t instructionIDs;
+};
+
+/// Specifies which set of ModR/M->instruction tables to look at
+/// given a particular opcode.
+struct OpcodeDecision {
+ /// One decision per opcode byte value; indexed by the opcode.
+ ModRMDecision modRMDecisions[256];
+};
+
+/// Specifies which opcode->instruction tables to look at given
+/// a particular context (set of attributes). Since there are many possible
+/// contexts, the decoder first uses CONTEXTS_SYM to determine which context
+/// applies given a specific set of attributes. Hence there are only IC_max
+/// entries in this table, rather than 2^(ATTR_max).
+struct ContextDecision {
+ /// One opcode table per instruction context; indexed by InstructionContext.
+ OpcodeDecision opcodeDecisions[IC_max];
+};
+
+#include "X86GenDisassemblerTables.inc"
+
+/*
+ * debug - Reports the message s through Debug(), tagged with the current file
+ * and line, when NDEBUG is not defined. With NDEBUG defined it expands to an
+ * empty statement and s is not evaluated.
+ */
+#ifndef NDEBUG
+#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0)
+#else
+#define debug(s) do { } while (0)
+#endif
+
+/*
+ * contextForAttrs - Looks up the decode context for a set of attributes in
+ *   the instruction context table (CONTEXTS_SYM).
+ *
+ * @param attrMask  - Attributes, from the enumeration attributeBits; used
+ *                    directly as the table index.
+ * @return          - The InstructionContext to use when looking up an
+ *                    instruction with these attributes.
+ */
+static InstructionContext contextForAttrs(uint16_t attrMask) {
+  const auto rawContext = CONTEXTS_SYM[attrMask];
+  return static_cast<InstructionContext>(rawContext);
+}
+
+/*
+ * modRMRequired - Consults the opcode table for the given opcode type to
+ *   find out whether decoding this opcode needs the ModR/M byte.
+ *
+ * @param type        - The opcode type (i.e., how many bytes it has); selects
+ *                      which context table is consulted.
+ * @param insnContext - The context for the instruction, as returned by
+ *                      contextForAttrs.
+ * @param opcode      - The last byte of the instruction's opcode, not counting
+ *                      ModR/M extensions and escapes.
+ * @return            - Nonzero (true) if the table entry is anything other
+ *                      than MODRM_ONEENTRY, i.e. the ModR/M byte is required;
+ *                      zero (false) otherwise.
+ */
+static int modRMRequired(OpcodeType type,
+                         InstructionContext insnContext,
+                         uint16_t opcode) {
+  const struct ContextDecision* table = nullptr;
+
+  switch (type) {
+  case ONEBYTE:      table = &ONEBYTE_SYM;     break;
+  case TWOBYTE:      table = &TWOBYTE_SYM;     break;
+  case THREEBYTE_38: table = &THREEBYTE38_SYM; break;
+  case THREEBYTE_3A: table = &THREEBYTE3A_SYM; break;
+  case XOP8_MAP:     table = &XOP8_MAP_SYM;    break;
+  case XOP9_MAP:     table = &XOP9_MAP_SYM;    break;
+  case XOPA_MAP:     table = &XOPA_MAP_SYM;    break;
+  }
+
+  const struct ModRMDecision& entry =
+    table->opcodeDecisions[insnContext].modRMDecisions[opcode];
+
+  /* A single-entry decision means one instruction regardless of ModR/M. */
+  return entry.modrm_type != MODRM_ONEENTRY;
+}
+
+/*
+ * decode - Looks up the unique ID of an instruction in the table selected by
+ *   the opcode type, using the ModR/M byte where the table entry calls for it.
+ *
+ * @param type        - See modRMRequired().
+ * @param insnContext - See modRMRequired().
+ * @param opcode      - See modRMRequired().
+ * @param modRM       - The ModR/M byte if required, or any value if not.
+ * @return            - The UID of the instruction, or 0 on failure.
+ */
+static InstrUID decode(OpcodeType type,
+                       InstructionContext insnContext,
+                       uint8_t opcode,
+                       uint8_t modRM) {
+  const struct ContextDecision* table = nullptr;
+
+  switch (type) {
+  case ONEBYTE:      table = &ONEBYTE_SYM;     break;
+  case TWOBYTE:      table = &TWOBYTE_SYM;     break;
+  case THREEBYTE_38: table = &THREEBYTE38_SYM; break;
+  case THREEBYTE_3A: table = &THREEBYTE3A_SYM; break;
+  case XOP8_MAP:     table = &XOP8_MAP_SYM;    break;
+  case XOP9_MAP:     table = &XOP9_MAP_SYM;    break;
+  case XOPA_MAP:     table = &XOPA_MAP_SYM;    break;
+  }
+
+  const struct ModRMDecision* dec =
+    &table->opcodeDecisions[insnContext].modRMDecisions[opcode];
+
+  /* Bits 5:3 of the ModR/M byte. */
+  const int regField = (modRM & 0x38) >> 3;
+
+  switch (dec->modrm_type) {
+  default:
+    debug("Corrupt table! Unknown modrm_type");
+    return 0;
+  case MODRM_ONEENTRY:
+    /* One instruction, whatever the ModR/M byte is. */
+    return modRMTable[dec->instructionIDs];
+  case MODRM_SPLITRM: {
+    /* Two entries: mod != 3 first, then mod == 3. */
+    const int slot = (modFromModRM(modRM) == 0x3) ? 1 : 0;
+    return modRMTable[dec->instructionIDs + slot];
+  }
+  case MODRM_SPLITREG: {
+    /* Eight entries by reg field for mod != 3, then eight more for mod == 3. */
+    const int slot = (modFromModRM(modRM) == 0x3) ? regField + 8 : regField;
+    return modRMTable[dec->instructionIDs + slot];
+  }
+  case MODRM_SPLITMISC: {
+    /*
+     * Eight entries by reg field for mod != 3; for mod == 3 the low six bits
+     * of the ModR/M byte select among the entries that follow.
+     */
+    const int slot =
+      (modFromModRM(modRM) == 0x3) ? (modRM & 0x3f) + 8 : regField;
+    return modRMTable[dec->instructionIDs + slot];
+  }
+  case MODRM_FULL:
+    /* One entry per possible ModR/M byte value. */
+    return modRMTable[dec->instructionIDs + modRM];
+  }
+}
+
+/*
+ * specifierForUID - Returns the entry of INSTRUCTIONS_SYM that describes the
+ *   instruction with the given UID.
+ *
+ * @param uid - The unique ID for the instruction.  This should be returned by
+ *              decode(); no bounds check is performed here.
+ * @return    - A pointer to the specification for that instruction.
+ */
+static const struct InstructionSpecifier *specifierForUID(InstrUID uid) {
+  const struct InstructionSpecifier &spec = INSTRUCTIONS_SYM[uid];
+  return &spec;
+}
+
+/*
+ * consumeByte - Reads one byte at the cursor through the user-supplied reader
+ *   function and, if the read succeeds, moves the cursor past it.
+ *
+ * @param insn  - The instruction with the reader function to use.  Its cursor
+ *                is advanced only on success.
+ * @param byte  - A pointer to a pre-allocated memory buffer to be populated
+ *                with the data read.
+ * @return      - 0 if the read was successful; otherwise the reader's nonzero
+ *                result.
+ */
+static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
+  const int status = insn->reader(insn->readerArg, byte, insn->readerCursor);
+
+  if (status != 0)
+    return status;
+
+  insn->readerCursor++;
+  return 0;
+}
+
+/*
+ * lookAtByte - Reads the byte at the cursor like consumeByte(), but leaves
+ *   the cursor where it is.
+ *
+ * @param insn  - See consumeByte().
+ * @param byte  - See consumeByte().
+ * @return      - See consumeByte().
+ */
+static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
+  const int status = insn->reader(insn->readerArg, byte, insn->readerCursor);
+  return status;
+}
+
+/*
+ * unconsumeByte - Moves the instruction's reader cursor back by one byte.
+ *
+ * @param insn  - The instruction whose cursor is moved.
+ */
+static void unconsumeByte(struct InternalInstruction* insn) {
+  insn->readerCursor -= 1;
+}
+
+/*
+ * CONSUME_FUNC - Defines a static function `name` that reads sizeof(type)
+ * bytes starting at the instruction's cursor and combines them into a value
+ * of `type`, treating the first byte read as the least significant one.
+ * The cursor is advanced and *ptr is written only after every byte has been
+ * read; if any read fails, the reader's result is returned immediately and
+ * neither is modified.
+ */
+#define CONSUME_FUNC(name, type) \
+ static int name(struct InternalInstruction* insn, type* ptr) { \
+ type combined = 0; \
+ unsigned offset; \
+ for (offset = 0; offset < sizeof(type); ++offset) { \
+ uint8_t byte; \
+ int ret = insn->reader(insn->readerArg, \
+ &byte, \
+ insn->readerCursor + offset); \
+ if (ret) \
+ return ret; \
+ combined = combined | ((uint64_t)byte << (offset * 8)); \
+ } \
+ *ptr = combined; \
+ insn->readerCursor += sizeof(type); \
+ return 0; \
+ }
+
+/*
+ * consume* - Use the reader function provided by the user to consume data
+ * values of various sizes from the instruction's memory and advance the
+ * cursor appropriately. The bytes are interpreted in little-endian order
+ * (lowest address is least significant), independent of the host.
+ *
+ * @param insn - See consumeByte().
+ * @param ptr - A pointer to a pre-allocated memory of appropriate size to
+ * be populated with the data read.
+ * @return - See consumeByte().
+ */
+CONSUME_FUNC(consumeInt8, int8_t)
+CONSUME_FUNC(consumeInt16, int16_t)
+CONSUME_FUNC(consumeInt32, int32_t)
+CONSUME_FUNC(consumeUInt16, uint16_t)
+CONSUME_FUNC(consumeUInt32, uint32_t)
+CONSUME_FUNC(consumeUInt64, uint64_t)
+
+/*
+ * dbgprintf - Formats a message printf-style and passes it to the logging
+ *   function provided by the user.  Does nothing when the instruction has no
+ *   logging function.  The message is formatted into a 256-byte buffer, so
+ *   longer output is truncated.
+ *
+ * @param insn    - The instruction containing the logging function.
+ * @param format  - See printf().
+ * @param ...     - See printf().
+ */
+static void dbgprintf(struct InternalInstruction* insn,
+                      const char* format,
+                      ...) {
+  if (!insn->dlog)
+    return;
+
+  char message[256];
+  va_list args;
+
+  va_start(args, format);
+  (void)vsnprintf(message, sizeof(message), format, args);
+  va_end(args);
+
+  insn->dlog(insn->dlogArg, message);
+}
+
+/*
+ * setPrefixPresent - Records that a prefix byte was seen, and where.
+ *
+ * @param insn      - The instruction to be marked as having the prefix.
+ * @param prefix    - The prefix byte value; used as the index into the
+ *                    instruction's per-prefix arrays.
+ * @param location  - The location where the prefix is located (in the address
+ *                    space of the instruction's reader).
+ */
+static void setPrefixPresent(struct InternalInstruction* insn,
+                             uint8_t prefix,
+                             uint64_t location)
+{
+  insn->prefixLocations[prefix] = location;
+  insn->prefixPresent[prefix] = 1;
+}
+
+/*
+ * isPrefixAtLocation - Tells whether a prefix has been recorded for an
+ *   instruction at exactly the given location.
+ *
+ * @param insn      - The instruction to be queried.
+ * @param prefix    - The prefix.
+ * @param location  - The location to query.
+ * @return          - true only if the prefix is marked present and its
+ *                    recorded location equals the queried one.
+ */
+static bool isPrefixAtLocation(struct InternalInstruction* insn,
+                               uint8_t prefix,
+                               uint64_t location)
+{
+  if (insn->prefixPresent[prefix] != 1)
+    return false;
+
+  return insn->prefixLocations[prefix] == location;
+}
+
+/*
+ * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
+ *   instruction as having them.  Also sets the instruction's default operand,
+ *   address, and other relevant data sizes to report operands correctly.
+ *
+ *   The function works in three stages:
+ *     1. legacy prefixes (LOCK/REP/REPNE, segment overrides, operand- and
+ *        address-size overrides) are consumed one byte at a time;
+ *     2. the first non-prefix byte is examined for an EVEX (0x62), VEX
+ *        (0xc4/0xc5), XOP (0x8f) or, in 64-bit mode, REX (0x4x) prefix;
+ *     3. register/address/displacement/immediate sizes are derived from the
+ *        mode and the prefixes that were seen.
+ *
+ * @param insn  - The instruction whose prefixes are to be read.
+ * @return      - 0 if the instruction could be read until the end of the prefix
+ *                bytes, and no prefixes conflicted; nonzero otherwise.
+ */
+static int readPrefixes(struct InternalInstruction* insn) {
+  bool isPrefix = true;
+  bool prefixGroups[4] = { false };
+  uint64_t prefixLocation;
+  uint8_t byte = 0;
+  uint8_t nextByte;
+
+  bool hasAdSize = false;
+  bool hasOpSize = false;
+
+  dbgprintf(insn, "readPrefixes()");
+
+  while (isPrefix) {
+    prefixLocation = insn->readerCursor;
+
+    /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */
+    if (consumeByte(insn, &byte))
+      break;
+
+    /*
+     * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
+     * break and let it be disassembled as a normal "instruction".
+     */
+    if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0)
+      break;
+
+    if (insn->readerCursor - 1 == insn->startLocation
+        && (byte == 0xf2 || byte == 0xf3)
+        && !lookAtByte(insn, &nextByte))
+    {
+      /*
+       * If the byte is 0xf2 or 0xf3, and any of the following conditions are
+       * met:
+       * - it is followed by a LOCK (0xf0) prefix
+       * - it is followed by an xchg instruction
+       * then it should be disassembled as a xacquire/xrelease not repne/rep.
+       */
+      if ((byte == 0xf2 || byte == 0xf3) &&
+          ((nextByte == 0xf0) ||
+          ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
+        insn->xAcquireRelease = true;
+      /*
+       * Also if the byte is 0xf3, and the following condition is met:
+       * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
+       *                       "mov mem, imm" (opcode 0xc6/0xc7) instructions.
+       * then it should be disassembled as an xrelease not rep.
+       */
+      if (byte == 0xf3 &&
+          (nextByte == 0x88 || nextByte == 0x89 ||
+           nextByte == 0xc6 || nextByte == 0xc7))
+        insn->xAcquireRelease = true;
+      /*
+       * In 64-bit mode a 0x4x byte may sit between the prefix and the opcode:
+       * step over it just long enough to peek at the byte that follows, then
+       * restore the cursor.
+       */
+      if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
+        if (consumeByte(insn, &nextByte))
+          return -1;
+        if (lookAtByte(insn, &nextByte))
+          return -1;
+        unconsumeByte(insn);
+      }
+      /* Only 0x0f and 0x90 keep the byte as a prefix of the next opcode. */
+      if (nextByte != 0x0f && nextByte != 0x90)
+        break;
+    }
+
+    switch (byte) {
+    case 0xf0:  /* LOCK */
+    case 0xf2:  /* REPNE/REPNZ */
+    case 0xf3:  /* REP or REPE/REPZ */
+      if (prefixGroups[0])
+        dbgprintf(insn, "Redundant Group 1 prefix");
+      prefixGroups[0] = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x2e:  /* CS segment override -OR- Branch not taken */
+    case 0x36:  /* SS segment override -OR- Branch taken */
+    case 0x3e:  /* DS segment override */
+    case 0x26:  /* ES segment override */
+    case 0x64:  /* FS segment override */
+    case 0x65:  /* GS segment override */
+      switch (byte) {
+      case 0x2e:
+        insn->segmentOverride = SEG_OVERRIDE_CS;
+        break;
+      case 0x36:
+        insn->segmentOverride = SEG_OVERRIDE_SS;
+        break;
+      case 0x3e:
+        insn->segmentOverride = SEG_OVERRIDE_DS;
+        break;
+      case 0x26:
+        insn->segmentOverride = SEG_OVERRIDE_ES;
+        break;
+      case 0x64:
+        insn->segmentOverride = SEG_OVERRIDE_FS;
+        break;
+      case 0x65:
+        insn->segmentOverride = SEG_OVERRIDE_GS;
+        break;
+      default:
+        debug("Unhandled override");
+        return -1;
+      }
+      if (prefixGroups[1])
+        dbgprintf(insn, "Redundant Group 2 prefix");
+      prefixGroups[1] = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x66:  /* Operand-size override */
+      if (prefixGroups[2])
+        dbgprintf(insn, "Redundant Group 3 prefix");
+      prefixGroups[2] = true;
+      hasOpSize = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x67:  /* Address-size override */
+      if (prefixGroups[3])
+        dbgprintf(insn, "Redundant Group 4 prefix");
+      prefixGroups[3] = true;
+      hasAdSize = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    default:    /* Not a prefix byte */
+      isPrefix = false;
+      break;
+    }
+
+    if (isPrefix)
+      dbgprintf(insn, "Found prefix 0x%hhx", byte);
+  }
+
+  insn->vectorExtensionType = TYPE_NO_VEX_XOP;
+
+  if (byte == 0x62) {
+    uint8_t byte1, byte2;
+
+    if (consumeByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of EVEX prefix");
+      return -1;
+    }
+
+    if (lookAtByte(insn, &byte2)) {
+      dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+      return -1;
+    }
+
+    /*
+     * Accept the bytes as EVEX only if bits 3:2 of the second byte are clear
+     * and bit 2 of the third byte is set; outside 64-bit mode the top two
+     * bits of the second byte must also be set.
+     */
+    if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
+       ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
+      insn->vectorExtensionType = TYPE_EVEX;
+    } else {
+      unconsumeByte(insn); /* unconsume byte1 */
+      unconsumeByte(insn); /* unconsume byte  */
+      insn->necessaryPrefixLocation = insn->readerCursor - 2;
+    }
+
+    if (insn->vectorExtensionType == TYPE_EVEX) {
+      insn->vectorExtensionPrefix[0] = byte;
+      insn->vectorExtensionPrefix[1] = byte1;
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) {
+        dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+        return -1;
+      }
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) {
+        dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix");
+        return -1;
+      }
+
+      /* We simulate the REX prefix for simplicity's sake */
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3)
+                        | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2)
+                        | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1)
+                        | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0);
+      }
+
+      dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx",
+              insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+              insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]);
+    }
+  } else if (byte == 0xc4) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of VEX");
+      return -1;
+    }
+
+    /* Outside 64-bit mode 0xc4 is VEX only if the next byte's top bits are 11. */
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+      insn->vectorExtensionType = TYPE_VEX_3B;
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    } else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+
+    if (insn->vectorExtensionType == TYPE_VEX_3B) {
+      insn->vectorExtensionPrefix[0] = byte;
+      /*
+       * Check both reads, as the EVEX path does, so that a truncated prefix
+       * fails here instead of building a REX byte from unread data.
+       */
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[1])) {
+        dbgprintf(insn, "Couldn't read second byte of VEX");
+        return -1;
+      }
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) {
+        dbgprintf(insn, "Couldn't read third byte of VEX");
+        return -1;
+      }
+
+      /* We simulate the REX prefix for simplicity's sake */
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3)
+                        | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2)
+                        | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1)
+                        | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
+      }
+
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+                insn->vectorExtensionPrefix[2]);
+    }
+  } else if (byte == 0xc5) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of VEX");
+      return -1;
+    }
+
+    /* Same acceptance rule as for the three-byte VEX prefix. */
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+      insn->vectorExtensionType = TYPE_VEX_2B;
+    } else {
+      unconsumeByte(insn);
+    }
+
+    if (insn->vectorExtensionType == TYPE_VEX_2B) {
+      insn->vectorExtensionPrefix[0] = byte;
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[1])) {
+        dbgprintf(insn, "Couldn't read second byte of VEX");
+        return -1;
+      }
+
+      /* Only the R bit is simulated from the two-byte form. */
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
+      }
+
+      /* The pp field can stand in for a 0x66 operand-size prefix. */
+      switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = true;
+        break;
+      }
+
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx",
+                insn->vectorExtensionPrefix[0],
+                insn->vectorExtensionPrefix[1]);
+    }
+  } else if (byte == 0x8f) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of XOP");
+      return -1;
+    }
+
+    if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
+      insn->vectorExtensionType = TYPE_XOP;
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    } else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+
+    if (insn->vectorExtensionType == TYPE_XOP) {
+      insn->vectorExtensionPrefix[0] = byte;
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[1])) {
+        dbgprintf(insn, "Couldn't read second byte of XOP");
+        return -1;
+      }
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) {
+        dbgprintf(insn, "Couldn't read third byte of XOP");
+        return -1;
+      }
+
+      /* We simulate the REX prefix for simplicity's sake */
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3)
+                        | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2)
+                        | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1)
+                        | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
+      }
+
+      switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = true;
+        break;
+      }
+
+      dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+                insn->vectorExtensionPrefix[2]);
+    }
+  } else {
+    if (insn->mode == MODE_64BIT) {
+      if ((byte & 0xf0) == 0x40) {
+        uint8_t opcodeByte;
+
+        /* A REX byte must be followed by a readable, non-REX byte. */
+        if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
+          dbgprintf(insn, "Redundant REX prefix");
+          return -1;
+        }
+
+        insn->rexPrefix = byte;
+        insn->necessaryPrefixLocation = insn->readerCursor - 2;
+
+        dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
+      } else {
+        /* Not a prefix at all: hand the byte back to the opcode reader. */
+        unconsumeByte(insn);
+        insn->necessaryPrefixLocation = insn->readerCursor - 1;
+      }
+    } else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+  }
+
+  /* Derive the default sizes (in bytes) from the mode and size overrides. */
+  if (insn->mode == MODE_16BIT) {
+    insn->registerSize       = (hasOpSize ? 4 : 2);
+    insn->addressSize        = (hasAdSize ? 4 : 2);
+    insn->displacementSize   = (hasAdSize ? 4 : 2);
+    insn->immediateSize      = (hasOpSize ? 4 : 2);
+  } else if (insn->mode == MODE_32BIT) {
+    insn->registerSize       = (hasOpSize ? 2 : 4);
+    insn->addressSize        = (hasAdSize ? 2 : 4);
+    insn->displacementSize   = (hasAdSize ? 2 : 4);
+    insn->immediateSize      = (hasOpSize ? 2 : 4);
+  } else if (insn->mode == MODE_64BIT) {
+    if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+      /* REX.W selects 64-bit registers and ignores the operand-size prefix. */
+      insn->registerSize       = 8;
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = 4;
+      insn->immediateSize      = 4;
+    } else {
+      /* The sizes are the same with a REX prefix lacking W as with none. */
+      insn->registerSize       = (hasOpSize ? 2 : 4);
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = (hasOpSize ? 2 : 4);
+      insn->immediateSize      = (hasOpSize ? 2 : 4);
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
+ *   extended or escape opcodes).
+ *
+ *   When a vector extension prefix (EVEX, VEX or XOP) was found by
+ *   readPrefixes(), the opcode map is taken from that prefix and exactly one
+ *   opcode byte is consumed.  Otherwise the 0x0f, 0x0f 0x38 and 0x0f 0x3a
+ *   escape sequences are recognized byte by byte.
+ *
+ * @param insn  - The instruction whose opcode is to be read.  Its opcodeType
+ *                and opcode fields are set.
+ * @return      - 0 if the opcode could be read successfully; nonzero otherwise.
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+  /* Determine the length of the primary opcode */
+
+  uint8_t current;
+
+  dbgprintf(insn, "readOpcode()");
+
+  insn->opcodeType = ONEBYTE;
+
+  if (insn->vectorExtensionType == TYPE_EVEX) {
+    switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
+                mmFromEVEX2of4(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+    switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+    /* The two-byte VEX form carries no map field; the map is always 0x0f. */
+    insn->opcodeType = TWOBYTE;
+    return consumeByte(insn, &insn->opcode);
+  } else if (insn->vectorExtensionType == TYPE_XOP) {
+    switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      /* Report the field with the same XOP accessor the switch uses. */
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case XOP_MAP_SELECT_8:
+      insn->opcodeType = XOP8_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_9:
+      insn->opcodeType = XOP9_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_A:
+      insn->opcodeType = XOPA_MAP;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
+
+  /* No vector extension prefix: parse legacy escape bytes. */
+  if (consumeByte(insn, &current))
+    return -1;
+
+  if (current == 0x0f) {
+    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+    if (consumeByte(insn, &current))
+      return -1;
+
+    if (current == 0x38) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_38;
+    } else if (current == 0x3a) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_3A;
+    } else {
+      dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+      insn->opcodeType = TWOBYTE;
+    }
+  }
+
+  /*
+   * At this point we have consumed the full opcode.
+   * Anything we consume from here on must be unconsumed.
+   */
+
+  insn->opcode = current;
+
+  return 0;
+}
+
+ static int readModRM(struct InternalInstruction* insn);
+
+ /*
+  * getIDWithAttrMask - Looks up the instruction ID for the current opcode
+  *   under a caller-supplied attribute mask, reading the ModR/M byte first
+  *   when the decoder tables say the lookup depends on it.
+  *
+  * @param instructionID - Output; receives the decoded instruction ID.
+  * @param insn          - The instruction being decoded.
+  * @param attrMask      - The attribute mask used to select the context.
+  * @return              - 0 on success (ModR/M read, or not needed); -1 if
+  *                        the ModR/M byte was needed but could not be read.
+  */
+ static int getIDWithAttrMask(uint16_t* instructionID,
+                              struct InternalInstruction* insn,
+                              uint16_t attrMask) {
+   const InstructionContext context = contextForAttrs(attrMask);
+
+   /* Stays zero when the lookup does not depend on ModR/M. */
+   uint8_t modRMByte = 0;
+
+   if (modRMRequired(insn->opcodeType, context, insn->opcode)) {
+     if (readModRM(insn) != 0)
+       return -1;
+     modRMByte = insn->modRM;
+   }
+
+   *instructionID = decode(insn->opcodeType, context, insn->opcode, modRMByte);
+   return 0;
+ }
+
+/*
+ * is16BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 16-bit whereas the other is not.
+ *
+ * @param orig - The instruction that is not 16-bit
+ * @param equiv - The instruction that is 16-bit
+ */
+static bool is16BitEquivalent(const char *orig, const char *equiv) {
+ off_t i;
+
+ for (i = 0;; i++) {
+ if (orig[i] == '\0' && equiv[i] == '\0')
+ return true;
+ if (orig[i] == '\0' || equiv[i] == '\0')
+ return false;
+ if (orig[i] != equiv[i]) {
+ if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+ continue;
+ if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+ continue;
+ if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+ continue;
+ return false;
+ }
+ }
+}
+
+/*
+ * is64Bit - Determines whether this instruction is a 64-bit instruction.
+ *
+ * @param name - The instruction that is not 16-bit
+ */
+static bool is64Bit(const char *name) {
+ off_t i;
+
+ for (i = 0;; ++i) {
+ if (name[i] == '\0')
+ return false;
+ if (name[i] == '6' && name[i+1] == '4')
+ return true;
+ }
+}
+
+/*
+ * getID - Determines the ID of an instruction, consuming the ModR/M byte as
+ * appropriate for extended and escape opcodes. Determines the attributes and
+ * context for the instruction before doing so.
+ *
+ * @param insn - The instruction whose ID is to be determined.
+ * @return - 0 if the ModR/M could be read when needed or was not needed;
+ * nonzero otherwise.
+ */
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
+ uint16_t attrMask;
+ uint16_t instructionID;
+
+ dbgprintf(insn, "getID()");
+
+ attrMask = ATTR_NONE;
+
+ if (insn->mode == MODE_64BIT)
+ attrMask |= ATTR_64BIT;
+
+ if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+ attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? ATTR_EVEX : ATTR_VEX;
+
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (zFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXKZ;
+ if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXB;
+ if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXK;
+ if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXL;
+ if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXL2;
+ } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
+ attrMask |= ATTR_VEXL;
+ } else if (insn->vectorExtensionType == TYPE_XOP) {
+ switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ } else {
+ return -1;
+ }
+ } else {
+ if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_OPSIZE;
+ else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_ADSIZE;
+ else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_XS;
+ else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_XD;
+ }
+
+ if (insn->rexPrefix & 0x08)
+ attrMask |= ATTR_REXW;
+
+ /*
+ * JCXZ/JECXZ need special handling for 16-bit mode because the meaning
+ * of the AdSize prefix is inverted w.r.t. 32-bit mode.
+ */
+ if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE &&
+ insn->opcode == 0xE3)
+ attrMask ^= ATTR_ADSIZE;
+
+ /*
+ * In 64-bit mode all f64 superscripted opcodes ignore opcode size prefix
+ * CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes
+ */
+
+ if (insn->mode == MODE_64BIT &&
+ isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) {
+ switch (insn->opcode) {
+ case 0xE8:
+ case 0xE9:
+ // Take care of psubsb and other mmx instructions.
+ if (insn->opcodeType == ONEBYTE) {
+ attrMask ^= ATTR_OPSIZE;
+ insn->immediateSize = 4;
+ insn->displacementSize = 4;
+ }
+ break;
+ case 0x82:
+ case 0x83:
+ case 0x84:
+ case 0x85:
+ case 0x86:
+ case 0x87:
+ case 0x88:
+ case 0x89:
+ case 0x8A:
+ case 0x8B:
+ case 0x8C:
+ case 0x8D:
+ case 0x8E:
+ case 0x8F:
+ // Take care of lea and three byte ops.
+ if (insn->opcodeType == TWOBYTE) {
+ attrMask ^= ATTR_OPSIZE;
+ insn->immediateSize = 4;
+ insn->displacementSize = 4;
+ }
+ break;
+ }
+ }
+
+ if (getIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ /* The following clauses compensate for limitations of the tables. */
+
+ if (insn->mode != MODE_64BIT &&
+ insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+ /*
+ * The tables can't distinquish between cases where the W-bit is used to
+ * select register size and cases where its a required part of the opcode.
+ */
+ if ((insn->vectorExtensionType == TYPE_EVEX &&
+ wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
+ (insn->vectorExtensionType == TYPE_VEX_3B &&
+ wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
+ (insn->vectorExtensionType == TYPE_XOP &&
+ wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {
+
+ uint16_t instructionIDWithREXW;
+ if (getIDWithAttrMask(&instructionIDWithREXW,
+ insn, attrMask | ATTR_REXW)) {
+ insn->instructionID = instructionID;
+ insn->spec = specifierForUID(instructionID);
+ return 0;
+ }
+
+ auto SpecName = GetInstrName(instructionIDWithREXW, miiArg);
+ // If not a 64-bit instruction. Switch the opcode.
+ if (!is64Bit(SpecName.data())) {
+ insn->instructionID = instructionIDWithREXW;
+ insn->spec = specifierForUID(instructionIDWithREXW);
+ return 0;
+ }
+ }
+ }
+
+ /*
+ * Absolute moves need special handling.
+ * -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are
+ * inverted w.r.t.
+ * -For 32-bit mode we need to ensure the ADSIZE prefix is observed in
+ * any position.
+ */
+ if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) {
+ /* Make sure we observed the prefixes in any position. */
+ if (insn->prefixPresent[0x67])
+ attrMask |= ATTR_ADSIZE;
+ if (insn->prefixPresent[0x66])
+ attrMask |= ATTR_OPSIZE;
+
+ /* In 16-bit, invert the attributes. */
+ if (insn->mode == MODE_16BIT)
+ attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE;
+
+ if (getIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ insn->instructionID = instructionID;
+ insn->spec = specifierForUID(instructionID);
+ return 0;
+ }
+
+ if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
+ !(attrMask & ATTR_OPSIZE)) {
+ /*
+ * The instruction tables make no distinction between instructions that
+ * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
+ * particular spot (i.e., many MMX operations). In general we're
+ * conservative, but in the specific case where OpSize is present but not
+ * in the right place we check if there's a 16-bit operation.
+ */
+
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithOpsize;
+ llvm::StringRef specName, specWithOpSizeName;
+
+ spec = specifierForUID(instructionID);
+
+ if (getIDWithAttrMask(&instructionIDWithOpsize,
+ insn,
+ attrMask | ATTR_OPSIZE)) {
+ /*
+ * ModRM required with OpSize but not present; give up and return version
+ * without OpSize set
+ */
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specName = GetInstrName(instructionID, miiArg);
+ specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
+
+ if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) &&
+ (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
+ insn->instructionID = instructionIDWithOpsize;
+ insn->spec = specifierForUID(instructionIDWithOpsize);
+ } else {
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ }
+ return 0;
+ }
+
+ if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+ insn->rexPrefix & 0x01) {
+ /*
+ * NOOP shouldn't decode as NOOP if REX.b is set. Instead
+ * it should decode as XCHG %r8, %eax.
+ */
+
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithNewOpcode;
+ const struct InstructionSpecifier *specWithNewOpcode;
+
+ spec = specifierForUID(instructionID);
+
+ /* Borrow opcode from one of the other XCHGar opcodes */
+ insn->opcode = 0x91;
+
+ if (getIDWithAttrMask(&instructionIDWithNewOpcode,
+ insn,
+ attrMask)) {
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode);
+
+ /* Change back */
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionIDWithNewOpcode;
+ insn->spec = specWithNewOpcode;
+
+ return 0;
+ }
+
+ insn->instructionID = instructionID;
+ insn->spec = specifierForUID(insn->instructionID);
+
+ return 0;
+}
+
+/*
+ * readSIB - Consumes the SIB byte to determine addressing information for an
+ * instruction.
+ *
+ * @param insn - The instruction whose SIB byte is to be read.
+ * @return - 0 if the SIB byte was successfully read; nonzero otherwise.
+ */
+static int readSIB(struct InternalInstruction* insn) {
+ SIBIndex sibIndexBase = SIB_INDEX_NONE;
+ SIBBase sibBaseBase = SIB_BASE_NONE;
+ uint8_t index, base;
+
+ dbgprintf(insn, "readSIB()");
+
+ if (insn->consumedSIB)
+ return 0;
+
+ insn->consumedSIB = true;
+
+ switch (insn->addressSize) {
+ case 2:
+ dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
+ return -1;
+ case 4:
+ sibIndexBase = SIB_INDEX_EAX;
+ sibBaseBase = SIB_BASE_EAX;
+ break;
+ case 8:
+ sibIndexBase = SIB_INDEX_RAX;
+ sibBaseBase = SIB_BASE_RAX;
+ break;
+ }
+
+ if (consumeByte(insn, &insn->sib))
+ return -1;
+
+ index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+
+ // FIXME: The fifth bit (bit index 4) is only to be used for instructions
+ // that understand VSIB indexing. ORing the bit in here is mildy dangerous
+ // because performing math on an 'enum SIBIndex' can produce garbage.
+ // Excluding the "none" value, it should cover 6 spaces of register names:
+ // - 16 possibilities for 16-bit GPR starting at SIB_INDEX_BX_SI
+ // - 16 possibilities for 32-bit GPR starting at SIB_INDEX_EAX
+ // - 16 possibilities for 64-bit GPR starting at SIB_INDEX_RAX
+ // - 32 possibilities for each of XMM, YMM, ZMM registers
+ // When sibIndexBase gets assigned SIB_INDEX_RAX as it does in 64-bit mode,
+ // summing in a fully decoded index between 0 and 31 can end up with a value
+ // that looks like something in the low half of the XMM range.
+ // translateRMMemory() tries to reverse the damage, with only partial success,
+ // as evidenced by known bugs in "test/MC/Disassembler/X86/x86-64.txt"
+ if (insn->vectorExtensionType == TYPE_EVEX)
+ index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4;
+
+ if (index == 0x4) {
+ insn->sibIndex = SIB_INDEX_NONE;
+ } else {
+ insn->sibIndex = (SIBIndex)(sibIndexBase + index);
+ }
+
+ insn->sibScale = 1 << scaleFromSIB(insn->sib);
+
+ base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
+
+ switch (base) {
+ case 0x5:
+ case 0xd:
+ switch (modFromModRM(insn->modRM)) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = SIB_BASE_NONE;
+ break;
+ case 0x1:
+ insn->eaDisplacement = EA_DISP_8;
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ case 0x2:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ case 0x3:
+ debug("Cannot have Mod = 0b11 and a SIB byte");
+ return -1;
+ }
+ break;
+ default:
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readDisplacement - Consumes the displacement of an instruction.
+ *
+ * @param insn - The instruction whose displacement is to be read.
+ * @return - 0 if the displacement byte was successfully read; nonzero
+ * otherwise.
+ */
+static int readDisplacement(struct InternalInstruction* insn) {
+ int8_t d8;
+ int16_t d16;
+ int32_t d32;
+
+ dbgprintf(insn, "readDisplacement()");
+
+ if (insn->consumedDisplacement)
+ return 0;
+
+ insn->consumedDisplacement = true;
+ insn->displacementOffset = insn->readerCursor - insn->startLocation;
+
+ switch (insn->eaDisplacement) {
+ case EA_DISP_NONE:
+ insn->consumedDisplacement = false;
+ break;
+ case EA_DISP_8:
+ if (consumeInt8(insn, &d8))
+ return -1;
+ insn->displacement = d8;
+ break;
+ case EA_DISP_16:
+ if (consumeInt16(insn, &d16))
+ return -1;
+ insn->displacement = d16;
+ break;
+ case EA_DISP_32:
+ if (consumeInt32(insn, &d32))
+ return -1;
+ insn->displacement = d32;
+ break;
+ }
+
+ insn->consumedDisplacement = true;
+ return 0;
+}
+
+/*
+ * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
+ * displacement) for an instruction and interprets it.
+ *
+ * @param insn - The instruction whose addressing information is to be read.
+ * @return - 0 if the information was successfully read; nonzero otherwise.
+ */
+static int readModRM(struct InternalInstruction* insn) {
+ uint8_t mod, rm, reg;
+
+ dbgprintf(insn, "readModRM()");
+
+ if (insn->consumedModRM)
+ return 0;
+
+ if (consumeByte(insn, &insn->modRM))
+ return -1;
+ insn->consumedModRM = true;
+
+ mod = modFromModRM(insn->modRM);
+ rm = rmFromModRM(insn->modRM);
+ reg = regFromModRM(insn->modRM);
+
+ /*
+ * This goes by insn->registerSize to pick the correct register, which messes
+ * up if we're using (say) XMM or 8-bit register operands. That gets fixed in
+ * fixupReg().
+ */
+ switch (insn->registerSize) {
+ case 2:
+ insn->regBase = MODRM_REG_AX;
+ insn->eaRegBase = EA_REG_AX;
+ break;
+ case 4:
+ insn->regBase = MODRM_REG_EAX;
+ insn->eaRegBase = EA_REG_EAX;
+ break;
+ case 8:
+ insn->regBase = MODRM_REG_RAX;
+ insn->eaRegBase = EA_REG_RAX;
+ break;
+ }
+
+ reg |= rFromREX(insn->rexPrefix) << 3;
+ rm |= bFromREX(insn->rexPrefix) << 3;
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ rm |= xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ }
+
+ insn->reg = (Reg)(insn->regBase + reg);
+
+ switch (insn->addressSize) {
+ case 2:
+ insn->eaBaseBase = EA_BASE_BX_SI;
+
+ switch (mod) {
+ case 0x0:
+ if (rm == 0x6) {
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ } else {
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_NONE;
+ }
+ break;
+ case 0x1:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_8;
+ insn->displacementSize = 1;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x2:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x3:
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ case 4:
+ case 8:
+ insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+
+ switch (mod) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
+ // In determining whether RIP-relative mode is used (rm=5),
+ // or whether a SIB byte is present (rm=4),
+ // the extension bits (REX.b and EVEX.x) are ignored.
+ switch (rm & 7) {
+ case 0x4: // SIB byte is present
+ insn->eaBase = (insn->addressSize == 4 ?
+ EA_BASE_sib : EA_BASE_sib64);
+ if (readSIB(insn) || readDisplacement(insn))
+ return -1;
+ break;
+ case 0x5: // RIP-relative
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_32;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ break;
+ }
+ break;
+ case 0x1:
+ insn->displacementSize = 1;
+ /* FALLTHROUGH */
+ case 0x2:
+ insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+ switch (rm & 7) {
+ case 0x4: // SIB byte is present
+ insn->eaBase = EA_BASE_sib;
+ if (readSIB(insn) || readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ case 0x3:
+ insn->eaDisplacement = EA_DISP_NONE;
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ break;
+ }
+ break;
+ } /* switch (insn->addressSize) */
+
+ return 0;
+}
+
+ #define GENERIC_FIXUP_FUNC(name, base, prefix) \
+ static uint16_t name(struct InternalInstruction *insn, \
+ OperandType type, \
+ uint8_t index, \
+ uint8_t *valid) { \
+ *valid = 1; \
+ switch (type) { \
+ default: \
+ debug("Unhandled register type"); \
+ *valid = 0; \
+ return 0; \
+ case TYPE_Rv: \
+ return base + index; \
+ case TYPE_R8: \
+ if (insn->rexPrefix && \
+ index >= 4 && index <= 7) { \
+ return prefix##_SPL + (index - 4); \
+ } else { \
+ return prefix##_AL + index; \
+ } \
+ case TYPE_R16: \
+ return prefix##_AX + index; \
+ case TYPE_R32: \
+ return prefix##_EAX + index; \
+ case TYPE_R64: \
+ return prefix##_RAX + index; \
+ case TYPE_XMM512: \
+ return prefix##_ZMM0 + index; \
+ case TYPE_XMM256: \
+ return prefix##_YMM0 + index; \
+ case TYPE_XMM128: \
+ case TYPE_XMM64: \
+ case TYPE_XMM32: \
+ return prefix##_XMM0 + index; \
+ case TYPE_VK1: \
+ case TYPE_VK2: \
+ case TYPE_VK4: \
+ case TYPE_VK8: \
+ case TYPE_VK16: \
+ case TYPE_VK32: \
+ case TYPE_VK64: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_K0 + index; \
+ case TYPE_MM64: \
+ return prefix##_MM0 + (index & 0x7); \
+ case TYPE_SEGMENTREG: \
+ if (index > 5) \
+ *valid = 0; \
+ return prefix##_ES + index; \
+ case TYPE_DEBUGREG: \
+ return prefix##_DR0 + index; \
+ case TYPE_CONTROLREG: \
+ return prefix##_CR0 + index; \
+ case TYPE_BNDR: \
+ if (index > 3) \
+ *valid = 0; \
+ return prefix##_BND0 + index; \
+ } \
+ }
+
+ /*
+ * fixup*Value - Consults an operand type to determine the meaning of the
+ * reg or R/M field. If the operand is an XMM operand, for example, an
+ * operand would be XMM0 instead of AX, which readModRM() would otherwise
+ * misinterpret it as.
+ *
+ * Both functions are generated by GENERIC_FIXUP_FUNC(name, base, prefix):
+ * `base` is the register base added to the index for TYPE_Rv operands and
+ * `prefix` selects the enumerator family (MODRM_REG or EA_REG) used for every
+ * other operand type.
+ *
+ * @param insn - The instruction containing the operand.
+ * @param type - The operand type.
+ * @param index - The existing value of the field as reported by readModRM().
+ * @param valid - The address of a uint8_t. The target is set to 1 if the
+ * field is valid for the register class; 0 if not (unhandled
+ * operand type, mask register index > 7, segment register
+ * index > 5, or bound register index > 3).
+ * @return - The proper value; 0 for an unhandled operand type.
+ */
+ GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG)
+ GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG)
+
+/*
+ * fixupReg - Consults an operand specifier to determine which of the
+ * fixup*Value functions to use in correcting readModRM()'ss interpretation.
+ *
+ * @param insn - See fixup*Value().
+ * @param op - The operand specifier.
+ * @return - 0 if fixup was successful; -1 if the register returned was
+ * invalid for its class.
+ */
+static int fixupReg(struct InternalInstruction *insn,
+ const struct OperandSpecifier *op) {
+ uint8_t valid;
+
+ dbgprintf(insn, "fixupReg()");
+
+ switch ((OperandEncoding)op->encoding) {
+ default:
+ debug("Expected a REG or R/M encoding in fixupReg");
+ return -1;
+ case ENCODING_VVVV:
+ insn->vvvv = (Reg)fixupRegValue(insn,
+ (OperandType)op->type,
+ insn->vvvv,
+ &valid);
+ if (!valid)
+ return -1;
+ break;
+ case ENCODING_REG:
+ insn->reg = (Reg)fixupRegValue(insn,
+ (OperandType)op->type,
+ insn->reg - insn->regBase,
+ &valid);
+ if (!valid)
+ return -1;
+ break;
+ CASE_ENCODING_RM:
+ if (insn->eaBase >= insn->eaRegBase) {
+ insn->eaBase = (EABase)fixupRMValue(insn,
+ (OperandType)op->type,
+ insn->eaBase - insn->eaRegBase,
+ &valid);
+ if (!valid)
+ return -1;
+ }
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readOpcodeRegister - Reads an operand from the opcode field of an
+ * instruction and interprets it appropriately given the operand width.
+ * Handles AddRegFrm instructions.
+ *
+ * @param insn - the instruction whose opcode field is to be read.
+ * @param size - The width (in bytes) of the register being specified.
+ * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+ * RAX.
+ * @return - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
+ dbgprintf(insn, "readOpcodeRegister()");
+
+ if (size == 0)
+ size = insn->registerSize;
+
+ switch (size) {
+ case 1:
+ insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
+ | (insn->opcode & 7)));
+ if (insn->rexPrefix &&
+ insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+ insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+ insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+ + (insn->opcodeRegister - MODRM_REG_AL - 4));
+ }
+
+ break;
+ case 2:
+ insn->opcodeRegister = (Reg)(MODRM_REG_AX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | (insn->opcode & 7)));
+ break;
+ case 4:
+ insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | (insn->opcode & 7)));
+ break;
+ case 8:
+ insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | (insn->opcode & 7)));
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readImmediate - Consumes an immediate operand from an instruction, given the
+ * desired operand size.
+ *
+ * @param insn - The instruction whose operand is to be read.
+ * @param size - The width (in bytes) of the operand.
+ * @return - 0 if the immediate was successfully consumed; nonzero
+ * otherwise.
+ */
+static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
+ uint8_t imm8;
+ uint16_t imm16;
+ uint32_t imm32;
+ uint64_t imm64;
+
+ dbgprintf(insn, "readImmediate()");
+
+ if (insn->numImmediatesConsumed == 2) {
+ debug("Already consumed two immediates");
+ return -1;
+ }
+
+ if (size == 0)
+ size = insn->immediateSize;
+ else
+ insn->immediateSize = size;
+ insn->immediateOffset = insn->readerCursor - insn->startLocation;
+
+ switch (size) {
+ case 1:
+ if (consumeByte(insn, &imm8))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm8;
+ break;
+ case 2:
+ if (consumeUInt16(insn, &imm16))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm16;
+ break;
+ case 4:
+ if (consumeUInt32(insn, &imm32))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm32;
+ break;
+ case 8:
+ if (consumeUInt64(insn, &imm64))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm64;
+ break;
+ }
+
+ insn->numImmediatesConsumed++;
+
+ return 0;
+}
+
+/*
+ * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix.
+ *
+ * @param insn - The instruction whose operand is to be read.
+ * @return - 0 if the vvvv was successfully consumed; nonzero
+ * otherwise.
+ */
+static int readVVVV(struct InternalInstruction* insn) {
+ dbgprintf(insn, "readVVVV()");
+
+ int vvvv;
+ if (insn->vectorExtensionType == TYPE_EVEX)
+ vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
+ vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
+ else if (insn->vectorExtensionType == TYPE_VEX_3B)
+ vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+ else if (insn->vectorExtensionType == TYPE_VEX_2B)
+ vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+ else if (insn->vectorExtensionType == TYPE_XOP)
+ vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
+ else
+ return -1;
+
+ if (insn->mode != MODE_64BIT)
+ vvvv &= 0x7;
+
+ insn->vvvv = static_cast<Reg>(vvvv);
+ return 0;
+}
+
+/*
+ * readMaskRegister - Reads an mask register from the opcode field of an
+ * instruction.
+ *
+ * @param insn - The instruction whose opcode field is to be read.
+ * @return - 0 on success; nonzero otherwise.
+ */
+static int readMaskRegister(struct InternalInstruction* insn) {
+ dbgprintf(insn, "readMaskRegister()");
+
+ if (insn->vectorExtensionType != TYPE_EVEX)
+ return -1;
+
+ insn->writemask =
+ static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
+ return 0;
+}
+
+/*
+ * readOperands - Consults the specifier for an instruction and consumes all
+ * operands for that instruction, interpreting them as it goes.
+ *
+ * @param insn - The instruction whose operands are to be read and interpreted.
+ * @return - 0 if all operands could be read; nonzero otherwise.
+ */
+static int readOperands(struct InternalInstruction* insn) {
+ int hasVVVV, needVVVV;
+ int sawRegImm = 0;
+
+ dbgprintf(insn, "readOperands()");
+
+ /* If non-zero vvvv specified, need to make sure one of the operands
+ uses it. */
+ hasVVVV = !readVVVV(insn);
+ needVVVV = hasVVVV && (insn->vvvv != 0);
+
+ for (const auto &Op : x86OperandSets[insn->spec->operands]) {
+ switch (Op.encoding) {
+ case ENCODING_NONE:
+ case ENCODING_SI:
+ case ENCODING_DI:
+ break;
+ case ENCODING_REG:
+ CASE_ENCODING_RM:
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ // Apply the AVX512 compressed displacement scaling factor.
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+ insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
+ break;
+ case ENCODING_IB:
+ if (sawRegImm) {
+ /* Saw a register immediate so don't read again and instead split the
+ previous immediate. FIXME: This is a hack. */
+ insn->immediates[insn->numImmediatesConsumed] =
+ insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
+ ++insn->numImmediatesConsumed;
+ break;
+ }
+ if (readImmediate(insn, 1))
+ return -1;
+ if (Op.type == TYPE_XMM128 ||
+ Op.type == TYPE_XMM256)
+ sawRegImm = 1;
+ break;
+ case ENCODING_IW:
+ if (readImmediate(insn, 2))
+ return -1;
+ break;
+ case ENCODING_ID:
+ if (readImmediate(insn, 4))
+ return -1;
+ break;
+ case ENCODING_IO:
+ if (readImmediate(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Iv:
+ if (readImmediate(insn, insn->immediateSize))
+ return -1;
+ break;
+ case ENCODING_Ia:
+ if (readImmediate(insn, insn->addressSize))
+ return -1;
+ break;
+ case ENCODING_RB:
+ if (readOpcodeRegister(insn, 1))
+ return -1;
+ break;
+ case ENCODING_RW:
+ if (readOpcodeRegister(insn, 2))
+ return -1;
+ break;
+ case ENCODING_RD:
+ if (readOpcodeRegister(insn, 4))
+ return -1;
+ break;
+ case ENCODING_RO:
+ if (readOpcodeRegister(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Rv:
+ if (readOpcodeRegister(insn, 0))
+ return -1;
+ break;
+ case ENCODING_FP:
+ break;
+ case ENCODING_VVVV:
+ needVVVV = 0; /* Mark that we have found a VVVV operand. */
+ if (!hasVVVV)
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ break;
+ case ENCODING_WRITEMASK:
+ if (readMaskRegister(insn))
+ return -1;
+ break;
+ case ENCODING_DUP:
+ break;
+ default:
+ dbgprintf(insn, "Encountered an operand with an unknown encoding.");
+ return -1;
+ }
+ }
+
+ /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */
+ if (needVVVV) return -1;
+
+ return 0;
+}
+
+/*
+ * decodeInstruction - Reads and interprets a full instruction provided by the
+ * user.
+ *
+ * @param insn - A pointer to the instruction to be populated. Must be
+ * pre-allocated.
+ * @param reader - The function to be used to read the instruction's bytes.
+ * @param readerArg - A generic argument to be passed to the reader to store
+ * any internal state.
+ * @param logger - If non-NULL, the function to be used to write log messages
+ * and warnings.
+ * @param loggerArg - A generic argument to be passed to the logger to store
+ * any internal state.
+ * @param startLoc - The address (in the reader's address space) of the first
+ * byte in the instruction.
+ * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
+ * decode the instruction in.
+ * @return - 0 if the instruction's memory could be read; nonzero if
+ * not.
+ */
+int llvm::X86Disassembler::decodeInstruction(
+ struct InternalInstruction *insn, byteReader_t reader,
+ const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg,
+ uint64_t startLoc, DisassemblerMode mode) {
+ memset(insn, 0, sizeof(struct InternalInstruction));
+
+ insn->reader = reader;
+ insn->readerArg = readerArg;
+ insn->dlog = logger;
+ insn->dlogArg = loggerArg;
+ insn->startLocation = startLoc;
+ insn->readerCursor = startLoc;
+ insn->mode = mode;
+ insn->numImmediatesConsumed = 0;
+
+ if (readPrefixes(insn) ||
+ readOpcode(insn) ||
+ getID(insn, miiArg) ||
+ insn->instructionID == 0 ||
+ readOperands(insn))
+ return -1;
+
+ insn->operands = x86OperandSets[insn->spec->operands];
+
+ insn->length = insn->readerCursor - insn->startLocation;
+
+ dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
+ startLoc, insn->readerCursor, insn->length);
+
+ if (insn->length > 15)
+ dbgprintf(insn, "Instruction exceeds 15-byte limit");
+
+ return 0;
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
new file mode 100644
index 000000000000..b07fd0b17d35
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -0,0 +1,682 @@
+//===-- X86DisassemblerDecoder.h - Disassembler decoder ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the public interface of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
+
+#include "X86DisassemblerDecoderCommon.h"
+#include "llvm/ADT/ArrayRef.h"
+
+namespace llvm {
+namespace X86Disassembler {
+
+// Bit-field accessor macros for ModR/M, SIB, REX, EVEX, VEX and XOP bytes.
+#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6)
+#define regFromModRM(modRM) (((modRM) & 0x38) >> 3)
+#define rmFromModRM(modRM) ((modRM) & 0x7)
+#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6)
+#define indexFromSIB(sib) (((sib) & 0x38) >> 3)
+#define baseFromSIB(sib) ((sib) & 0x7)
+#define wFromREX(rex) (((rex) & 0x8) >> 3)
+#define rFromREX(rex) (((rex) & 0x4) >> 2)
+#define xFromREX(rex) (((rex) & 0x2) >> 1)
+#define bFromREX(rex) ((rex) & 0x1)
+
+#define rFromEVEX2of4(evex) (((~(evex)) & 0x80) >> 7)
+#define xFromEVEX2of4(evex) (((~(evex)) & 0x40) >> 6)
+#define bFromEVEX2of4(evex) (((~(evex)) & 0x20) >> 5)
+#define r2FromEVEX2of4(evex) (((~(evex)) & 0x10) >> 4)
+#define mmFromEVEX2of4(evex) ((evex) & 0x3)
+#define wFromEVEX3of4(evex) (((evex) & 0x80) >> 7)
+#define vvvvFromEVEX3of4(evex) (((~(evex)) & 0x78) >> 3)
+#define ppFromEVEX3of4(evex) ((evex) & 0x3)
+#define zFromEVEX4of4(evex) (((evex) & 0x80) >> 7)
+#define l2FromEVEX4of4(evex) (((evex) & 0x40) >> 6)
+#define lFromEVEX4of4(evex) (((evex) & 0x20) >> 5)
+#define bFromEVEX4of4(evex) (((evex) & 0x10) >> 4)
+#define v2FromEVEX4of4(evex) (((~(evex)) & 0x8) >> 3)
+#define aaaFromEVEX4of4(evex) ((evex) & 0x7)
+
+#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7)
+#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6)
+#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5)
+#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f)
+#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7)
+#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2)
+#define ppFromVEX3of3(vex) ((vex) & 0x3)
+
+#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7)
+#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2)
+#define ppFromVEX2of2(vex) ((vex) & 0x3)
+
+#define rFromXOP2of3(xop) (((~(xop)) & 0x80) >> 7)
+#define xFromXOP2of3(xop) (((~(xop)) & 0x40) >> 6)
+#define bFromXOP2of3(xop) (((~(xop)) & 0x20) >> 5)
+#define mmmmmFromXOP2of3(xop) ((xop) & 0x1f)
+#define wFromXOP3of3(xop) (((xop) & 0x80) >> 7)
+#define vvvvFromXOP3of3(xop) (((~(xop)) & 0x78) >> 3)
+#define lFromXOP3of3(xop) (((xop) & 0x4) >> 2)
+#define ppFromXOP3of3(xop) ((xop) & 0x3)
+
+// These enums represent Intel registers for use by the decoder.
+#define REGS_8BIT \
+ ENTRY(AL) \
+ ENTRY(CL) \
+ ENTRY(DL) \
+ ENTRY(BL) \
+ ENTRY(AH) \
+ ENTRY(CH) \
+ ENTRY(DH) \
+ ENTRY(BH) \
+ ENTRY(R8B) \
+ ENTRY(R9B) \
+ ENTRY(R10B) \
+ ENTRY(R11B) \
+ ENTRY(R12B) \
+ ENTRY(R13B) \
+ ENTRY(R14B) \
+ ENTRY(R15B) \
+ ENTRY(SPL) \
+ ENTRY(BPL) \
+ ENTRY(SIL) \
+ ENTRY(DIL)
+
+#define EA_BASES_16BIT \
+ ENTRY(BX_SI) \
+ ENTRY(BX_DI) \
+ ENTRY(BP_SI) \
+ ENTRY(BP_DI) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(BP) \
+ ENTRY(BX) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define REGS_16BIT \
+ ENTRY(AX) \
+ ENTRY(CX) \
+ ENTRY(DX) \
+ ENTRY(BX) \
+ ENTRY(SP) \
+ ENTRY(BP) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define EA_BASES_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(sib) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define REGS_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(ESP) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define EA_BASES_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(sib64) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(RSP) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_MMX \
+ ENTRY(MM0) \
+ ENTRY(MM1) \
+ ENTRY(MM2) \
+ ENTRY(MM3) \
+ ENTRY(MM4) \
+ ENTRY(MM5) \
+ ENTRY(MM6) \
+ ENTRY(MM7)
+
+#define REGS_XMM \
+ ENTRY(XMM0) \
+ ENTRY(XMM1) \
+ ENTRY(XMM2) \
+ ENTRY(XMM3) \
+ ENTRY(XMM4) \
+ ENTRY(XMM5) \
+ ENTRY(XMM6) \
+ ENTRY(XMM7) \
+ ENTRY(XMM8) \
+ ENTRY(XMM9) \
+ ENTRY(XMM10) \
+ ENTRY(XMM11) \
+ ENTRY(XMM12) \
+ ENTRY(XMM13) \
+ ENTRY(XMM14) \
+ ENTRY(XMM15) \
+ ENTRY(XMM16) \
+ ENTRY(XMM17) \
+ ENTRY(XMM18) \
+ ENTRY(XMM19) \
+ ENTRY(XMM20) \
+ ENTRY(XMM21) \
+ ENTRY(XMM22) \
+ ENTRY(XMM23) \
+ ENTRY(XMM24) \
+ ENTRY(XMM25) \
+ ENTRY(XMM26) \
+ ENTRY(XMM27) \
+ ENTRY(XMM28) \
+ ENTRY(XMM29) \
+ ENTRY(XMM30) \
+ ENTRY(XMM31)
+
+#define REGS_YMM \
+ ENTRY(YMM0) \
+ ENTRY(YMM1) \
+ ENTRY(YMM2) \
+ ENTRY(YMM3) \
+ ENTRY(YMM4) \
+ ENTRY(YMM5) \
+ ENTRY(YMM6) \
+ ENTRY(YMM7) \
+ ENTRY(YMM8) \
+ ENTRY(YMM9) \
+ ENTRY(YMM10) \
+ ENTRY(YMM11) \
+ ENTRY(YMM12) \
+ ENTRY(YMM13) \
+ ENTRY(YMM14) \
+ ENTRY(YMM15) \
+ ENTRY(YMM16) \
+ ENTRY(YMM17) \
+ ENTRY(YMM18) \
+ ENTRY(YMM19) \
+ ENTRY(YMM20) \
+ ENTRY(YMM21) \
+ ENTRY(YMM22) \
+ ENTRY(YMM23) \
+ ENTRY(YMM24) \
+ ENTRY(YMM25) \
+ ENTRY(YMM26) \
+ ENTRY(YMM27) \
+ ENTRY(YMM28) \
+ ENTRY(YMM29) \
+ ENTRY(YMM30) \
+ ENTRY(YMM31)
+
+#define REGS_ZMM \
+ ENTRY(ZMM0) \
+ ENTRY(ZMM1) \
+ ENTRY(ZMM2) \
+ ENTRY(ZMM3) \
+ ENTRY(ZMM4) \
+ ENTRY(ZMM5) \
+ ENTRY(ZMM6) \
+ ENTRY(ZMM7) \
+ ENTRY(ZMM8) \
+ ENTRY(ZMM9) \
+ ENTRY(ZMM10) \
+ ENTRY(ZMM11) \
+ ENTRY(ZMM12) \
+ ENTRY(ZMM13) \
+ ENTRY(ZMM14) \
+ ENTRY(ZMM15) \
+ ENTRY(ZMM16) \
+ ENTRY(ZMM17) \
+ ENTRY(ZMM18) \
+ ENTRY(ZMM19) \
+ ENTRY(ZMM20) \
+ ENTRY(ZMM21) \
+ ENTRY(ZMM22) \
+ ENTRY(ZMM23) \
+ ENTRY(ZMM24) \
+ ENTRY(ZMM25) \
+ ENTRY(ZMM26) \
+ ENTRY(ZMM27) \
+ ENTRY(ZMM28) \
+ ENTRY(ZMM29) \
+ ENTRY(ZMM30) \
+ ENTRY(ZMM31)
+
+#define REGS_MASKS \
+ ENTRY(K0) \
+ ENTRY(K1) \
+ ENTRY(K2) \
+ ENTRY(K3) \
+ ENTRY(K4) \
+ ENTRY(K5) \
+ ENTRY(K6) \
+ ENTRY(K7)
+
+#define REGS_SEGMENT \
+ ENTRY(ES) \
+ ENTRY(CS) \
+ ENTRY(SS) \
+ ENTRY(DS) \
+ ENTRY(FS) \
+ ENTRY(GS)
+
+#define REGS_DEBUG \
+ ENTRY(DR0) \
+ ENTRY(DR1) \
+ ENTRY(DR2) \
+ ENTRY(DR3) \
+ ENTRY(DR4) \
+ ENTRY(DR5) \
+ ENTRY(DR6) \
+ ENTRY(DR7) \
+ ENTRY(DR8) \
+ ENTRY(DR9) \
+ ENTRY(DR10) \
+ ENTRY(DR11) \
+ ENTRY(DR12) \
+ ENTRY(DR13) \
+ ENTRY(DR14) \
+ ENTRY(DR15)
+
+#define REGS_CONTROL \
+ ENTRY(CR0) \
+ ENTRY(CR1) \
+ ENTRY(CR2) \
+ ENTRY(CR3) \
+ ENTRY(CR4) \
+ ENTRY(CR5) \
+ ENTRY(CR6) \
+ ENTRY(CR7) \
+ ENTRY(CR8) \
+ ENTRY(CR9) \
+ ENTRY(CR10) \
+ ENTRY(CR11) \
+ ENTRY(CR12) \
+ ENTRY(CR13) \
+ ENTRY(CR14) \
+ ENTRY(CR15)
+
+#define REGS_BOUND \
+ ENTRY(BND0) \
+ ENTRY(BND1) \
+ ENTRY(BND2) \
+ ENTRY(BND3)
+
+#define ALL_EA_BASES \
+ EA_BASES_16BIT \
+ EA_BASES_32BIT \
+ EA_BASES_64BIT
+
+#define ALL_SIB_BASES \
+ REGS_32BIT \
+ REGS_64BIT
+
+#define ALL_REGS \
+ REGS_8BIT \
+ REGS_16BIT \
+ REGS_32BIT \
+ REGS_64BIT \
+ REGS_MMX \
+ REGS_XMM \
+ REGS_YMM \
+ REGS_ZMM \
+ REGS_MASKS \
+ REGS_SEGMENT \
+ REGS_DEBUG \
+ REGS_CONTROL \
+ REGS_BOUND \
+ ENTRY(RIP)
+
+/// \brief All possible values of the base field for effective-address
+/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
+/// We distinguish between bases (EA_BASE_*) and registers that just happen
+/// to be referred to when Mod == 0b11 (EA_REG_*).
+enum EABase {
+ EA_BASE_NONE,
+#define ENTRY(x) EA_BASE_##x,
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) EA_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ EA_max
+};
+
+/// \brief All possible values of the SIB index field.
+/// Borrows entries from ALL_EA_BASES with the special case that
+/// sib is synonymous with NONE.
+/// Vector SIB: index can be XMM, YMM or ZMM.
+enum SIBIndex {
+ SIB_INDEX_NONE,
+#define ENTRY(x) SIB_INDEX_##x,
+ ALL_EA_BASES
+ REGS_XMM
+ REGS_YMM
+ REGS_ZMM
+#undef ENTRY
+ SIB_INDEX_max
+};
+
+/// \brief All possible values of the SIB base field.
+enum SIBBase {
+ SIB_BASE_NONE,
+#define ENTRY(x) SIB_BASE_##x,
+ ALL_SIB_BASES
+#undef ENTRY
+ SIB_BASE_max
+};
+
+/// \brief Possible displacement types for effective-address computations.
+enum EADisplacement {
+ EA_DISP_NONE,
+ EA_DISP_8,
+ EA_DISP_16,
+ EA_DISP_32
+};
+
+/// \brief All possible values of the reg field in the ModR/M byte.
+enum Reg {
+#define ENTRY(x) MODRM_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ MODRM_REG_max
+};
+
+/// \brief All possible segment overrides.
+enum SegmentOverride {
+ SEG_OVERRIDE_NONE,
+ SEG_OVERRIDE_CS,
+ SEG_OVERRIDE_SS,
+ SEG_OVERRIDE_DS,
+ SEG_OVERRIDE_ES,
+ SEG_OVERRIDE_FS,
+ SEG_OVERRIDE_GS,
+ SEG_OVERRIDE_max
+};
+
+/// \brief Possible values for the VEX.m-mmmm field
+enum VEXLeadingOpcodeByte {
+ VEX_LOB_0F = 0x1,
+ VEX_LOB_0F38 = 0x2,
+ VEX_LOB_0F3A = 0x3
+};
+
+enum XOPMapSelect {
+ XOP_MAP_SELECT_8 = 0x8,
+ XOP_MAP_SELECT_9 = 0x9,
+ XOP_MAP_SELECT_A = 0xA
+};
+
+/// \brief Possible values for the VEX.pp/EVEX.pp field
+enum VEXPrefixCode {
+ VEX_PREFIX_NONE = 0x0,
+ VEX_PREFIX_66 = 0x1,
+ VEX_PREFIX_F3 = 0x2,
+ VEX_PREFIX_F2 = 0x3
+};
+
+enum VectorExtensionType {
+ TYPE_NO_VEX_XOP = 0x0,
+ TYPE_VEX_2B = 0x1,
+ TYPE_VEX_3B = 0x2,
+ TYPE_EVEX = 0x3,
+ TYPE_XOP = 0x4
+};
+
+/// \brief Type for the byte reader that the consumer must provide to
+/// the decoder. Reads a single byte from the instruction's address space.
+/// \param arg A baton that the consumer can associate with any internal
+/// state that it needs.
+/// \param byte A pointer to a single byte in memory that should be set to
+/// contain the value at address.
+/// \param address The address in the instruction's address space that should
+/// be read from.
+/// \return -1 if the byte cannot be read for any reason; 0 otherwise.
+typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
+
+/// \brief Type for the logging function that the consumer can provide to
+/// get debugging output from the decoder.
+/// \param arg A baton that the consumer can associate with any internal
+/// state that it needs.
+/// \param log A string that contains the message. Will be reused after
+/// the logger returns.
+typedef void (*dlog_t)(void *arg, const char *log);
+
+/// The specification for how to extract and interpret a full instruction and
+/// its operands.
+struct InstructionSpecifier {
+ uint16_t operands;
+};
+
+/// The x86 internal instruction, which is produced by the decoder.
+struct InternalInstruction {
+ // Reader interface (C)
+ byteReader_t reader;
+ // Opaque value passed to the reader
+ const void* readerArg;
+ // The address of the next byte to read via the reader
+ uint64_t readerCursor;
+
+ // Logger interface (C)
+ dlog_t dlog;
+ // Opaque value passed to the logger
+ void* dlogArg;
+
+ // General instruction information
+
+ // The mode to disassemble for (64-bit, protected, real)
+ DisassemblerMode mode;
+ // The start of the instruction, usable with the reader
+ uint64_t startLocation;
+ // The length of the instruction, in bytes
+ size_t length;
+
+ // Prefix state
+
+ // 1 if the prefix byte corresponding to the entry is present; 0 if not
+ uint8_t prefixPresent[0x100];
+ // contains the location (for use with the reader) of the prefix byte
+ uint64_t prefixLocations[0x100];
+ // The value of the vector extension prefix(EVEX/VEX/XOP), if present
+ uint8_t vectorExtensionPrefix[4];
+ // The type of the vector extension prefix
+ VectorExtensionType vectorExtensionType;
+ // The value of the REX prefix, if present
+ uint8_t rexPrefix;
+ // The location where a mandatory prefix would have to be (i.e., right before
+ // the opcode, or right before the REX prefix if one is present).
+ uint64_t necessaryPrefixLocation;
+ // The segment override type
+ SegmentOverride segmentOverride;
+ // true if the prefix byte (0xf2 or 0xf3) is xacquire or xrelease
+ bool xAcquireRelease;
+
+ // Sizes of various critical pieces of data, in bytes
+ uint8_t registerSize;
+ uint8_t addressSize;
+ uint8_t displacementSize;
+ uint8_t immediateSize;
+
+ // Offsets from the start of the instruction to the pieces of data, which is
+ // needed to find relocation entries for adding symbolic operands.
+ uint8_t displacementOffset;
+ uint8_t immediateOffset;
+
+ // opcode state
+
+ // The last byte of the opcode, not counting any ModR/M extension
+ uint8_t opcode;
+
+ // decode state
+
+ // The type of opcode, used for indexing into the array of decode tables
+ OpcodeType opcodeType;
+ // The instruction ID, extracted from the decode table
+ uint16_t instructionID;
+ // The specifier for the instruction, from the instruction info table
+ const InstructionSpecifier *spec;
+
+ // state for additional bytes, consumed during operand decode. Pattern:
+ // consumed___ indicates that the byte was already consumed and does not
+ // need to be consumed again.
+
+ // The VEX.vvvv field, which contains a third register operand for some AVX
+ // instructions.
+ Reg vvvv;
+
+ // The writemask for AVX-512 instructions which is contained in EVEX.aaa
+ Reg writemask;
+
+ // The ModR/M byte, which contains most register operands and some portion of
+ // all memory operands.
+ bool consumedModRM;
+ uint8_t modRM;
+
+ // The SIB byte, used for more complex 32- or 64-bit memory operands
+ bool consumedSIB;
+ uint8_t sib;
+
+ // The displacement, used for memory operands
+ bool consumedDisplacement;
+ int32_t displacement;
+
+ // Immediates. There can be two in some cases
+ uint8_t numImmediatesConsumed;
+ uint8_t numImmediatesTranslated;
+ uint64_t immediates[2];
+
+ // A register or immediate operand encoded into the opcode
+ Reg opcodeRegister;
+
+ // Portions of the ModR/M byte
+
+ // These fields determine the allowable values for the ModR/M fields, which
+ // depend on operand and address widths.
+ EABase eaBaseBase;
+ EABase eaRegBase;
+ Reg regBase;
+
+ // The Mod and R/M fields can encode a base for an effective address, or a
+ // register. These are separated into two fields here.
+ EABase eaBase;
+ EADisplacement eaDisplacement;
+ // The reg field always encodes a register
+ Reg reg;
+
+ // SIB state
+ SIBIndex sibIndex;
+ uint8_t sibScale;
+ SIBBase sibBase;
+
+ ArrayRef<OperandSpecifier> operands;
+};
+
+/// \brief Decode one instruction and store the decoding results in
+/// a buffer provided by the consumer.
+/// \param insn The consumer-allocated buffer to store the instruction in.
+/// \param reader The byteReader_t for the bytes to be read.
+/// \param readerArg An argument to pass to the reader for storing context
+/// specific to the consumer. May be NULL.
+/// \param logger The dlog_t to be used in printing status messages from the
+/// disassembler. May be NULL.
+/// \param loggerArg An argument to pass to the logger for storing context
+/// specific to the logger. May be NULL.
+/// \param miiArg An opaque pointer forwarded to the instruction-ID lookup.
+/// \param startLoc The address (in the reader's address space) of the first
+/// byte in the instruction.
+/// \param mode The mode (16-bit, 32-bit, 64-bit) to decode in.
+/// \return Nonzero if there was an error during decode, 0 otherwise.
+int decodeInstruction(InternalInstruction *insn,
+ byteReader_t reader,
+ const void *readerArg,
+ dlog_t logger,
+ void *loggerArg,
+ const void *miiArg,
+ uint64_t startLoc,
+ DisassemblerMode mode);
+
+/// \brief Print a message to debugs()
+/// \param file The name of the file printing the debug message.
+/// \param line The line number that printed the debug message.
+/// \param s The message to print.
+void Debug(const char *file, unsigned line, const char *s);
+
+StringRef GetInstrName(unsigned Opcode, const void *mii);
+
+} // namespace X86Disassembler
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
new file mode 100644
index 000000000000..0a835b876d90
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -0,0 +1,493 @@
+//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains common definitions used by both the disassembler and the table
+// generator.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+namespace X86Disassembler {
+
+#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers
+#define CONTEXTS_SYM x86DisassemblerContexts
+#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes
+#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes
+#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes
+#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes
+#define XOP8_MAP_SYM x86DisassemblerXOP8Opcodes
+#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes
+#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes
+
+#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers"
+#define CONTEXTS_STR "x86DisassemblerContexts"
+#define ONEBYTE_STR "x86DisassemblerOneByteOpcodes"
+#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes"
+#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes"
+#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes"
+#define XOP8_MAP_STR "x86DisassemblerXOP8Opcodes"
+#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes"
+#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes"
+
+// Attributes of an instruction that must be known before the opcode can be
+// processed correctly. Most of these indicate the presence of particular
+// prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
+#define ATTRIBUTE_BITS \
+ ENUM_ENTRY(ATTR_NONE, 0x00) \
+ ENUM_ENTRY(ATTR_64BIT, (0x1 << 0)) \
+ ENUM_ENTRY(ATTR_XS, (0x1 << 1)) \
+ ENUM_ENTRY(ATTR_XD, (0x1 << 2)) \
+ ENUM_ENTRY(ATTR_REXW, (0x1 << 3)) \
+ ENUM_ENTRY(ATTR_OPSIZE, (0x1 << 4)) \
+ ENUM_ENTRY(ATTR_ADSIZE, (0x1 << 5)) \
+ ENUM_ENTRY(ATTR_VEX, (0x1 << 6)) \
+ ENUM_ENTRY(ATTR_VEXL, (0x1 << 7)) \
+ ENUM_ENTRY(ATTR_EVEX, (0x1 << 8)) \
+ ENUM_ENTRY(ATTR_EVEXL, (0x1 << 9)) \
+ ENUM_ENTRY(ATTR_EVEXL2, (0x1 << 10)) \
+ ENUM_ENTRY(ATTR_EVEXK, (0x1 << 11)) \
+ ENUM_ENTRY(ATTR_EVEXKZ, (0x1 << 12)) \
+ ENUM_ENTRY(ATTR_EVEXB, (0x1 << 13))
+
+#define ENUM_ENTRY(n, v) n = v,
+enum attributeBits {
+ ATTRIBUTE_BITS
+ ATTR_max
+};
+#undef ENUM_ENTRY
+
+// Combinations of the above attributes that are relevant to instruction
+// decode. Although other combinations are possible, they can be reduced to
+// these without affecting the ultimately decoded instruction.
+
+// Class name Rank Rationale for rank assignment
+#define INSTRUCTION_CONTEXTS \
+ ENUM_ENTRY(IC, 0, "says nothing about the instruction") \
+ ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \
+ "64-bit mode but no more") \
+ ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_OPSIZE_ADSIZE, 4, "requires ADSIZE and OPSIZE prefixes") \
+ ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \
+ "but not the operands") \
+ ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \
+ "but not the operands") \
+ ENUM_ENTRY(IC_XD_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_64BIT_REXW, 5, "requires a REX.W prefix, so operands "\
+ "change width; overrides IC_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_REXW_ADSIZE, 6, "requires a REX.W prefix and 0x67 " \
+ "prefix") \
+ ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \
+ ENUM_ENTRY(IC_64BIT_OPSIZE_ADSIZE, 4, "Just as meaningful as IC_OPSIZE/" \
+ "IC_ADSIZE") \
+ ENUM_ENTRY(IC_64BIT_XD, 6, "XD instructions are SSE; REX.W is " \
+ "secondary") \
+ ENUM_ENTRY(IC_64BIT_XS, 6, "Just as meaningful as IC_64BIT_XD") \
+ ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_REXW_XS, 7, "OPSIZE could mean a different " \
+ "opcode") \
+ ENUM_ENTRY(IC_64BIT_REXW_XD, 7, "Just as meaningful as " \
+ "IC_64BIT_REXW_XS") \
+ ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 8, "The Dynamic Duo! Prefer over all " \
+ "else because this changes most " \
+ "operands' meaning") \
+ ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \
+ ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \
+ ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \
+ ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \
+ ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \
+ ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \
+ ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \
+ ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \
+ ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \
+ ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\
+ ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\
+ ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \
+ ENUM_ENTRY(IC_VEX_L_W, 4, "requires VEX, L and W") \
+ ENUM_ENTRY(IC_VEX_L_W_XS, 5, "requires VEX, L, W and XS prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_XD, 5, "requires VEX, L, W and XD prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \
+ ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 4, "requires EVEX_K, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires EVEX_B, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_KZ_B, 1, "requires EVEX_B and EVEX_KZ prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_KZ, 1, "requires an EVEX_KZ prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_KZ, 2, "requires EVEX_KZ and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_KZ, 2, "requires EVEX_KZ and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_KZ, 2, "requires EVEX_KZ and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_KZ, 3, "requires EVEX_KZ and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_KZ, 4, "requires EVEX_KZ, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_KZ, 4, "requires EVEX_KZ, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ, 4, "requires EVEX_KZ, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_KZ, 3, "requires EVEX_KZ and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_KZ, 4, "requires EVEX_KZ and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_KZ, 4, "requires EVEX_KZ and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ, 4, "requires EVEX_KZ, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_KZ, 3, "requires EVEX_KZ, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_KZ, 4, "requires EVEX_KZ, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_KZ, 4, "requires EVEX_KZ, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_KZ, 3, "requires EVEX_KZ and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_KZ, 4, "requires EVEX_KZ and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_KZ, 4, "requires EVEX_KZ and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize")
+
+// Instantiate INSTRUCTION_CONTEXTS as an enum. ENUM_ENTRY keeps only the name
+// (n) of each entry and drops the other two arguments (r, d). IC_max follows
+// the last listed context.
+#define ENUM_ENTRY(n, r, d) n,
+enum InstructionContext {
+ INSTRUCTION_CONTEXTS
+ IC_max
+};
+#undef ENUM_ENTRY
+
+// Opcode types, which determine which decode table to use, both in the Intel
+// manual and also for the decoder.
+// The enumerators carry explicit, contiguous values 0 through 6.
+enum OpcodeType {
+ ONEBYTE = 0,
+ TWOBYTE = 1,
+ THREEBYTE_38 = 2,
+ THREEBYTE_3A = 3,
+ XOP8_MAP = 4,
+ XOP9_MAP = 5,
+ XOPA_MAP = 6
+};
+
+// The following structs are used for the hierarchical decode table. After
+// determining the instruction's class (i.e., which IC_* constant applies to
+// it), the decoder reads the opcode. Some instructions require specific
+// values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+//
+// If a ModR/M byte is not required, "required" is left unset, and the values
+// for each instructionID are identical.
+//
+// InstrUID is a 16-bit unsigned integer; presumably it is the instruction
+// identifier stored in those tables -- TODO confirm against the table structs.
+typedef uint16_t InstrUID;
+
+// ModRMDecisionType - describes the type of ModR/M decision, allowing the
+// consumer to determine the number of entries in it.
+//
+// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+// instruction is the same.
+// MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+// corresponds to one instruction; otherwise, it corresponds to
+// a different instruction.
+// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
+// divided by 8 is used to select instruction; otherwise, each
+// value of the ModR/M byte could correspond to a different
+// instruction.
+// MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
+// corresponds to instructions that use reg field as opcode
+// MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
+// to a different instruction.
+// X-macro list of the ModR/M decision kinds described above. Each user defines
+// ENUM_ENTRY(n) before expanding MODRMTYPES and undefines it afterwards.
+#define MODRMTYPES \
+ ENUM_ENTRY(MODRM_ONEENTRY) \
+ ENUM_ENTRY(MODRM_SPLITRM) \
+ ENUM_ENTRY(MODRM_SPLITMISC) \
+ ENUM_ENTRY(MODRM_SPLITREG) \
+ ENUM_ENTRY(MODRM_FULL)
+
+// Expand the list into enumerators; MODRM_max follows the last decision kind
+// and therefore equals the number of kinds listed.
+#define ENUM_ENTRY(n) n,
+enum ModRMDecisionType {
+ MODRMTYPES
+ MODRM_max
+};
+#undef ENUM_ENTRY
+
+// Expands to the `case` labels for ENCODING_RM and every ENCODING_RM_CD*
+// variant. The last label is emitted without its trailing colon, so a use
+// site has to supply it, i.e. `CASE_ENCODING_RM:`.
+#define CASE_ENCODING_RM \
+ case ENCODING_RM: \
+ case ENCODING_RM_CD2: \
+ case ENCODING_RM_CD4: \
+ case ENCODING_RM_CD8: \
+ case ENCODING_RM_CD16: \
+ case ENCODING_RM_CD32: \
+ case ENCODING_RM_CD64
+
+// Physical encodings of instruction operands.
+// X-macro list: each entry is ENUM_ENTRY(name, description). The entries after
+// the blank continuation line (ENCODING_Iv onwards) are kept in a separate
+// group by the original authors.
+#define ENCODINGS \
+ ENUM_ENTRY(ENCODING_NONE, "") \
+ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2") \
+ ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4") \
+ ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8") \
+ ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \
+ ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \
+ ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \
+ ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \
+ ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \
+ ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \
+ ENUM_ENTRY(ENCODING_IW, "2-byte") \
+ ENUM_ENTRY(ENCODING_ID, "4-byte") \
+ ENUM_ENTRY(ENCODING_IO, "8-byte") \
+ ENUM_ENTRY(ENCODING_RB, "(AL..DIL, R8L..R15L) Register code added to " \
+ "the opcode byte") \
+ ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \
+ ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \
+ ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \
+ ENUM_ENTRY(ENCODING_FP, "Position on floating-point stack in ModR/M " \
+ "byte.") \
+ \
+ ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \
+ ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \
+ ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \
+ "opcode byte") \
+ ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \
+ "in type") \
+ ENUM_ENTRY(ENCODING_SI, "Source index; encoded in OpSize/Adsize prefix") \
+ ENUM_ENTRY(ENCODING_DI, "Destination index; encoded in prefixes")
+
+// Expand the list into enumerators, keeping only the name and discarding the
+// description string. ENCODING_max follows the last encoding.
+#define ENUM_ENTRY(n, d) n,
+enum OperandEncoding {
+ ENCODINGS
+ ENCODING_max
+};
+#undef ENUM_ENTRY
+
+// Semantic interpretations of instruction operands.
+// X-macro list: each entry is ENUM_ENTRY(name, description). A description
+// that only gives a size ("2-byte", "4-byte", ...) continues the wording of
+// the nearest preceding full description. The entries after the blank
+// continuation line (TYPE_Mv onwards) are kept in a separate group.
+#define TYPES \
+ ENUM_ENTRY(TYPE_NONE, "") \
+ ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \
+ ENUM_ENTRY(TYPE_REL16, "2-byte") \
+ ENUM_ENTRY(TYPE_REL32, "4-byte") \
+ ENUM_ENTRY(TYPE_REL64, "8-byte") \
+ ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \
+ ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \
+ ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_R8, "1-byte register operand") \
+ ENUM_ENTRY(TYPE_R16, "2-byte") \
+ ENUM_ENTRY(TYPE_R32, "4-byte") \
+ ENUM_ENTRY(TYPE_R64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \
+ ENUM_ENTRY(TYPE_IMM16, "2-byte") \
+ ENUM_ENTRY(TYPE_IMM32, "4-byte") \
+ ENUM_ENTRY(TYPE_IMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \
+ ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \
+ ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \
+ ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \
+ ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \
+ ENUM_ENTRY(TYPE_RM16, "2-byte") \
+ ENUM_ENTRY(TYPE_RM32, "4-byte") \
+ ENUM_ENTRY(TYPE_RM64, "8-byte") \
+ ENUM_ENTRY(TYPE_M, "Memory operand") \
+ ENUM_ENTRY(TYPE_M8, "1-byte") \
+ ENUM_ENTRY(TYPE_M16, "2-byte") \
+ ENUM_ENTRY(TYPE_M32, "4-byte") \
+ ENUM_ENTRY(TYPE_M64, "8-byte") \
+ ENUM_ENTRY(TYPE_LEA, "Effective address") \
+ ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \
+ ENUM_ENTRY(TYPE_M256, "32-byte (AVX)") \
+ ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \
+ ENUM_ENTRY(TYPE_M1632, "2+4-byte") \
+ ENUM_ENTRY(TYPE_M1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \
+ ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \
+ ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \
+ ENUM_ENTRY(TYPE_SRCIDX64, "8-byte memory at source index") \
+ ENUM_ENTRY(TYPE_DSTIDX8, "1-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_DSTIDX16, "2-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_DSTIDX32, "4-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_DSTIDX64, "8-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \
+ "base)") \
+ ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \
+ ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \
+ ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \
+ ENUM_ENTRY(TYPE_M32FP, "32-bit IEEE754 memory floating-point operand") \
+ ENUM_ENTRY(TYPE_M64FP, "64-bit") \
+ ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \
+ ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \
+ ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \
+ ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \
+ ENUM_ENTRY(TYPE_XMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_XMM128, "16-byte") \
+ ENUM_ENTRY(TYPE_XMM256, "32-byte") \
+ ENUM_ENTRY(TYPE_XMM512, "64-byte") \
+ ENUM_ENTRY(TYPE_VK1, "1-bit") \
+ ENUM_ENTRY(TYPE_VK2, "2-bit") \
+ ENUM_ENTRY(TYPE_VK4, "4-bit") \
+ ENUM_ENTRY(TYPE_VK8, "8-bit") \
+ ENUM_ENTRY(TYPE_VK16, "16-bit") \
+ ENUM_ENTRY(TYPE_VK32, "32-bit") \
+ ENUM_ENTRY(TYPE_VK64, "64-bit") \
+ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
+ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
+ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
+ ENUM_ENTRY(TYPE_BNDR, "MPX bounds register") \
+ \
+ ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \
+ ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \
+ ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \
+ ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \
+ ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \
+ ENUM_ENTRY(TYPE_DUP1, "operand 1") \
+ ENUM_ENTRY(TYPE_DUP2, "operand 2") \
+ ENUM_ENTRY(TYPE_DUP3, "operand 3") \
+ ENUM_ENTRY(TYPE_DUP4, "operand 4") \
+ ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state")
+
+// Expand the list into enumerators, keeping only the name and discarding the
+// description string. TYPE_max follows the last operand type.
+#define ENUM_ENTRY(n, d) n,
+enum OperandType {
+ TYPES
+ TYPE_max
+};
+#undef ENUM_ENTRY
+
+/// \brief The specification for how to extract and interpret one operand.
+///
+/// Both fields are stored as single bytes. Presumably \c encoding holds an
+/// OperandEncoding value and \c type holds an OperandType value -- TODO
+/// confirm against the table emitter.
+struct OperandSpecifier {
+ uint8_t encoding;
+ uint8_t type;
+};
+
+// Presumably the upper bound on the number of OperandSpecifier entries per
+// instruction -- TODO confirm against the users of this constant.
+static const unsigned X86_MAX_OPERANDS = 6;
+
+/// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode
+/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
+/// respectively.
+///
+/// NOTE(review): the 32-bit entry is described as "IA-32e" above; confirm
+/// whether legacy 32-bit protected mode was meant.
+enum DisassemblerMode {
+ MODE_16BIT,
+ MODE_32BIT,
+ MODE_64BIT
+};
+
+} // namespace X86Disassembler
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
new file mode 100644
index 000000000000..10b7e6ff5ee2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -0,0 +1,299 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+/// Print register \p RegNo in AT&T form: a '%' followed by the register name,
+/// wrapped in the "<reg:" ... ">" markup.
+void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << markup("<reg:");
+  OS << '%' << getRegisterName(RegNo);
+  OS << markup(">");
+}
+
+/// Print \p MI to \p OS in AT&T syntax, followed by the annotation \p Annot.
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot, const MCSubtargetInfo &STI) {
+  const unsigned Opcode = MI->getOpcode();
+  const uint64_t Flags = MII.get(Opcode).TSFlags;
+
+  // With a comment stream attached (verbose assembly), emit any
+  // instruction-specific comment and remember whether one was produced.
+  if (CommentStream)
+    HasCustomInstComment =
+        EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+
+  if (Flags & X86II::LOCK)
+    OS << "\tlock\t";
+
+  // CALLpcrel32 is spelled "callq" in 64-bit mode (Intel syntax always uses
+  // "call").
+  //
+  // TODO: Probably this hack should be redesigned via InstAlias in
+  // InstrInfo.td as soon as Requires clause is supported properly
+  // for InstAlias.
+  const bool IsCallIn64BitMode =
+      Opcode == X86::CALLpcrel32 && STI.getFeatureBits()[X86::Mode64Bit];
+  if (IsCallIn64BitMode) {
+    OS << "\tcallq\t";
+    printPCRelImm(MI, 0, OS);
+  } else if (!printAliasInstr(MI, OS)) {
+    // No alias matched, so fall back to the generic printer.
+    printInstruction(MI, OS);
+  }
+
+  // The annotation is always printed last.
+  printAnnotation(OS, Annot);
+}
+
+/// Print the mnemonic suffix for the SSE/AVX comparison predicate held in
+/// immediate operand \p Op. Only the values 0 through 0x1f are valid.
+void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
+                                      raw_ostream &O) {
+  // Indexed by the predicate immediate.
+  static const char *const CondNames[] = {
+      "eq",      "lt",     "le",     "unord",    // 0x00 - 0x03
+      "neq",     "nlt",    "nle",    "ord",      // 0x04 - 0x07
+      "eq_uq",   "nge",    "ngt",    "false",    // 0x08 - 0x0b
+      "neq_oq",  "ge",     "gt",     "true",     // 0x0c - 0x0f
+      "eq_os",   "lt_oq",  "le_oq",  "unord_s",  // 0x10 - 0x13
+      "neq_us",  "nlt_uq", "nle_uq", "ord_s",    // 0x14 - 0x17
+      "eq_us",   "nge_uq", "ngt_uq", "false_os", // 0x18 - 0x1b
+      "neq_os",  "ge_oq",  "gt_oq",  "true_us"   // 0x1c - 0x1f
+  };
+
+  const int64_t Imm = MI->getOperand(Op).getImm();
+  if (Imm < 0 || Imm > 0x1f)
+    llvm_unreachable("Invalid ssecc/avxcc argument!");
+  O << CondNames[Imm];
+}
+
+/// Print the mnemonic suffix for the XOP comparison predicate held in
+/// immediate operand \p Op. Only the values 0 through 7 are valid.
+void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  // Indexed by the predicate immediate.
+  static const char *const CondNames[] = {"lt", "le",  "gt",    "ge",
+                                          "eq", "neq", "false", "true"};
+
+  const int64_t Imm = MI->getOperand(Op).getImm();
+  if (Imm < 0 || Imm > 7)
+    llvm_unreachable("Invalid xopcc argument!");
+  O << CondNames[Imm];
+}
+
+/// Print the rounding-control annotation selected by the low two bits of
+/// immediate operand \p Op.
+void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+                                             raw_ostream &O) {
+  // Indexed by the two-bit rounding-control value.
+  static const char *const ModeNames[] = {"{rn-sae}", "{rd-sae}", "{ru-sae}",
+                                          "{rz-sae}"};
+
+  const int64_t Mode = MI->getOperand(Op).getImm() & 0x3;
+  O << ModeNames[Mode];
+}
+/// printPCRelImm - Print operand \p OpNo as a pc-relative target, as used by
+/// jumps and calls. Unlike an ordinary immediate, no '$' prefix is emitted.
+void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+
+  // Plain immediate: print the value itself.
+  if (Op.isImm()) {
+    O << formatImm(Op.getImm());
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown pcrel immediate operand");
+  const MCExpr *Target = Op.getExpr();
+
+  // A symbolic branch target that was added as a constant expression is
+  // printed as a hex address.
+  int64_t Address;
+  if (const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Target)) {
+    if (BranchTarget->evaluateAsAbsolute(Address)) {
+      O << formatHex((uint64_t)Address);
+      return;
+    }
+  }
+
+  // Anything else is printed as the expression itself.
+  Target->print(O, &MAI);
+}
+
+/// Print operand \p OpNo of \p MI: a register by name, an immediate or an
+/// expression with a '$' prefix inside the "<imm:" ... ">" markup.
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+    return;
+  }
+
+  if (!Op.isImm()) {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << markup("<imm:") << '$';
+    Op.getExpr()->print(O, &MAI);
+    O << markup(">");
+    return;
+  }
+
+  // Immediates are printed as signed values.
+  const int64_t Imm = Op.getImm();
+  O << markup("<imm:") << '$' << formatImm(Imm) << markup(">");
+
+  // TODO: This should be in a helper function in the base class, so it can
+  // be used by other printers.
+
+  // When there is a comment stream and no instruction-specific comment was
+  // emitted, add a comment giving the hex value of any immediate outside
+  // [-256,255].
+  const bool IsSmall = Imm >= -256 && Imm <= 255;
+  if (!CommentStream || HasCustomInstComment || IsSmall)
+    return;
+
+  // Use the narrowest width that holds the value so that no unnecessary hex
+  // sign bits are printed.
+  if (Imm == (int16_t)(Imm))
+    *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
+  else if (Imm == (int32_t)(Imm))
+    *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
+  else
+    *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
+}
+
+/// Print the memory reference whose operands start at index \p Op, in the
+/// AT&T form "seg:disp(base,index,scale)", wrapped in "<mem:" ... ">" markup.
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+                                          raw_ostream &O) {
+  const MCOperand &Base = MI->getOperand(Op + X86::AddrBaseReg);
+  const MCOperand &Index = MI->getOperand(Op + X86::AddrIndexReg);
+  const MCOperand &Disp = MI->getOperand(Op + X86::AddrDisp);
+  const MCOperand &Segment = MI->getOperand(Op + X86::AddrSegmentReg);
+
+  O << markup("<mem:");
+
+  // Optional segment override.
+  if (Segment.getReg()) {
+    printOperand(MI, Op + X86::AddrSegmentReg, O);
+    O << ':';
+  }
+
+  if (!Disp.isImm()) {
+    assert(Disp.isExpr() && "non-immediate displacement for LEA?");
+    Disp.getExpr()->print(O, &MAI);
+  } else {
+    // A zero displacement is omitted unless there is neither an index nor a
+    // base register to print.
+    const int64_t DispVal = Disp.getImm();
+    if (DispVal || !(Index.getReg() || Base.getReg()))
+      O << formatImm(DispVal);
+  }
+
+  if (Index.getReg() || Base.getReg()) {
+    O << '(';
+    if (Base.getReg())
+      printOperand(MI, Op + X86::AddrBaseReg, O);
+
+    if (Index.getReg()) {
+      O << ',';
+      printOperand(MI, Op + X86::AddrIndexReg, O);
+      // A scale of 1 is implicit; other scales are never printed in hex.
+      const unsigned Scale = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
+      if (Scale != 1)
+        O << ',' << markup("<imm:") << Scale << markup(">");
+    }
+    O << ')';
+  }
+
+  O << markup(">");
+}
+
+/// Print a source-index memory operand: operand \p Op in parentheses,
+/// preceded by the segment register in operand \p Op + 1 when one is set.
+void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+                                    raw_ostream &O) {
+  const unsigned SegIdx = Op + 1;
+  const MCOperand &Segment = MI->getOperand(SegIdx);
+
+  O << markup("<mem:");
+
+  // Optional segment override.
+  if (Segment.getReg()) {
+    printOperand(MI, SegIdx, O);
+    O << ':';
+  }
+
+  O << "(";
+  printOperand(MI, Op, O);
+  O << ")" << markup(">");
+}
+
+/// Print a destination-index memory operand: operand \p Op in parentheses,
+/// always prefixed with the literal "%es:" segment.
+void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+                                    raw_ostream &O) {
+  O << markup("<mem:") << "%es:(";
+  printOperand(MI, Op, O);
+  O << ")" << markup(">");
+}
+
+/// Print a memory-offset operand: the displacement in operand \p Op, preceded
+/// by the segment register in operand \p Op + 1 when one is set.
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                       raw_ostream &O) {
+  const unsigned SegIdx = Op + 1;
+  const MCOperand &Disp = MI->getOperand(Op);
+  const MCOperand &Segment = MI->getOperand(SegIdx);
+
+  O << markup("<mem:");
+
+  // Optional segment override.
+  if (Segment.getReg()) {
+    printOperand(MI, SegIdx, O);
+    O << ':';
+  }
+
+  if (!Disp.isImm()) {
+    assert(Disp.isExpr() && "non-immediate displacement?");
+    Disp.getExpr()->print(O, &MAI);
+  } else {
+    O << formatImm(Disp.getImm());
+  }
+
+  O << markup(">");
+}
+
+/// Print operand \p Op as an unsigned 8-bit immediate: only the low byte of
+/// the value is shown. Expression operands are handed to printOperand.
+void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  const MCOperand &Operand = MI->getOperand(Op);
+  if (Operand.isExpr()) {
+    printOperand(MI, Op, O);
+    return;
+  }
+
+  const int64_t LowByte = Operand.getImm() & 0xff;
+  O << markup("<imm:") << '$' << formatImm(LowByte) << markup(">");
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
new file mode 100644
index 000000000000..bbb309076610
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -0,0 +1,142 @@
+//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+/// Prints X86 MCInst objects in AT&T assembly syntax.
+class X86ATTInstPrinter final : public MCInstPrinter {
+public:
+  X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                    const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Autogenerated by tblgen, returns true if we successfully printed an
+  // alias.
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &OS);
+  static const char *getRegisterName(unsigned RegNo);
+
+  // Operand printers implemented in X86ATTInstPrinter.cpp.
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
+
+  // The size-specific memory printers below all forward to
+  // printMemReference; AT&T syntax prints them identically.
+  void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  // Size-specific source-index printers; all forward to printSrcIdx.
+  void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+
+  // Size-specific destination-index printers; all forward to printDstIdx.
+  void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+
+  // Size-specific memory-offset printers; all forward to printMemOffset.
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+
+private:
+  // Whether an instruction-specific comment was emitted for the instruction
+  // being printed. printInst assigns it when a comment stream is attached and
+  // printOperand reads it to decide whether to add a hex-immediate comment.
+  // It starts out false so that a read before the first assignment is
+  // well-defined.
+  bool HasCustomInstComment = false;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
new file mode 100644
index 000000000000..8594addb5dd4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -0,0 +1,1197 @@
+//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Helpers that expand to `case X86::<opcode>:` labels built by token pasting:
+//   CASE_SSE_INS_COMMON   -> X86::<Inst><src>
+//   CASE_AVX_INS_COMMON   -> X86::V<Inst><Suffix><src>
+//   CASE_MASK_INS_COMMON  -> X86::V<Inst><Suffix><src>k
+//   CASE_MASKZ_INS_COMMON -> X86::V<Inst><Suffix><src>kz
+// CASE_AVX512_INS_COMMON emits the plain, k and kz labels together.
+#define CASE_SSE_INS_COMMON(Inst, src) \
+ case X86::Inst##src:
+
+#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src:
+
+#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src##k:
+
+#define CASE_MASKZ_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src##kz:
+
+#define CASE_AVX512_INS_COMMON(Inst, Suffix, src) \
+ CASE_AVX_INS_COMMON(Inst, Suffix, src) \
+ CASE_MASK_INS_COMMON(Inst, Suffix, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Suffix, src)
+
+// Case labels for a MOVDUP-style instruction. `src` is pasted after an "r",
+// giving the opcode suffix r<src>. CASE_MOVDUP covers the Z, Z256 and Z128
+// forms (plain, k and kz), the unsuffixed and Y AVX forms, and the SSE form.
+#define CASE_MOVDUP(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src)
+
+// Only the k (merge-masked) Z, Z256 and Z128 forms.
+#define CASE_MASK_MOVDUP(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+// Only the kz (zero-masked) Z, Z256 and Z128 forms.
+#define CASE_MASKZ_MOVDUP(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+// Case labels for a PMOVZX-style instruction. `src` is pasted after an "r",
+// giving the opcode suffix r<src>. CASE_PMOVZX covers the Z, Z256 and Z128
+// forms (plain, k and kz), the unsuffixed and Y AVX forms, and the SSE form.
+#define CASE_PMOVZX(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src)
+
+// Only the k (merge-masked) Z, Z256 and Z128 forms.
+#define CASE_MASK_PMOVZX(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+// Only the kz (zero-masked) Z, Z256 and Z128 forms.
+#define CASE_MASKZ_PMOVZX(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+// Case labels for an unpack-style instruction. `src` is pasted after an "r",
+// giving the opcode suffix r<src>. CASE_UNPCK covers the Z, Z256 and Z128
+// forms (plain, k and kz), the unsuffixed and Y AVX forms, and the SSE form.
+#define CASE_UNPCK(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src)
+
+// Only the k (merge-masked) Z, Z256 and Z128 forms.
+#define CASE_MASK_UNPCK(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+// Only the kz (zero-masked) Z, Z256 and Z128 forms.
+#define CASE_MASKZ_UNPCK(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+// Case labels for a shuffle-style instruction. Note the asymmetry with the
+// masked variants below: CASE_SHUF takes the complete opcode suffix (`suf`)
+// and pastes it unchanged, whereas CASE_MASK_SHUF / CASE_MASKZ_SHUF take only
+// `src` and build the suffix r<src>i themselves.
+#define CASE_SHUF(Inst, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, suf) \
+ CASE_AVX_INS_COMMON(Inst, , suf) \
+ CASE_AVX_INS_COMMON(Inst, Y, suf) \
+ CASE_SSE_INS_COMMON(Inst, suf)
+
+// Only the k (merge-masked) Z, Z256 and Z128 forms, with suffix r<src>i.
+#define CASE_MASK_SHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src##i)
+
+// Only the kz (zero-masked) Z, Z256 and Z128 forms, with suffix r<src>i.
+#define CASE_MASKZ_SHUF(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src##i)
+
+// Case labels for an immediate VPERMILP-style instruction. The opcode suffix
+// is <src>i (no leading "r" is added). CASE_VPERMILPI covers the Z, Z256 and
+// Z128 forms (plain, k and kz) and the unsuffixed and Y AVX forms; no SSE
+// label is emitted.
+#define CASE_VPERMILPI(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, src##i) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, src##i) \
+ CASE_AVX_INS_COMMON(Inst, , src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, src##i)
+
+// Only the k (merge-masked) Z, Z256 and Z128 forms.
+#define CASE_MASK_VPERMILPI(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, src##i)
+
+// Only the kz (zero-masked) Z, Z256 and Z128 forms.
+#define CASE_MASKZ_VPERMILPI(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, src##i)
+
+// Case labels for an immediate VPERM-style instruction. The opcode suffix is
+// <src>i. CASE_VPERM covers the Z and Z256 forms (plain, k and kz) and the Y
+// AVX form; no Z128, unsuffixed AVX or SSE label is emitted.
+#define CASE_VPERM(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, src##i) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, src##i)
+
+// Only the k (merge-masked) Z and Z256 forms.
+#define CASE_MASK_VPERM(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, src##i)
+
+// Only the kz (zero-masked) Z and Z256 forms.
+#define CASE_MASKZ_VPERM(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, src##i)
+
+// Case labels for the VSHUFF<Inst> / VSHUFI<Inst> instruction pair. `Inst` is
+// pasted after SHUFF and SHUFI, and the opcode suffix is r<src>i. CASE_VSHUF
+// covers the Z and Z256 forms (plain, k and kz) of both instructions.
+#define CASE_VSHUF(Inst, src) \
+ CASE_AVX512_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_AVX512_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_AVX512_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_AVX512_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+// Only the k (merge-masked) Z and Z256 forms of both instructions.
+#define CASE_MASK_VSHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+// Only the kz (zero-masked) Z and Z256 forms of both instructions.
+#define CASE_MASKZ_VSHUF(Inst, src) \
+ CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+/// Return the width in bits of vector register \p RegNo: 512 for ZMM0-ZMM31,
+/// 256 for YMM0-YMM31, 128 for XMM0-XMM31 and 64 for MM0-MM7. Any other
+/// register is a programming error.
+static unsigned getVectorRegSize(unsigned RegNo) {
+  // True when RegNo lies in the inclusive range [First, Last].
+  const auto IsBetween = [RegNo](unsigned First, unsigned Last) {
+    return First <= RegNo && RegNo <= Last;
+  };
+
+  if (IsBetween(X86::ZMM0, X86::ZMM31))
+    return 512;
+  if (IsBetween(X86::YMM0, X86::YMM31))
+    return 256;
+  if (IsBetween(X86::XMM0, X86::XMM31))
+    return 128;
+  if (IsBetween(X86::MM0, X86::MM7))
+    return 64;
+
+  llvm_unreachable("Unknown vector reg!");
+}
+
+/// Return the vector type with element type \p ScalarVT that fills the vector
+/// register held in operand \p OperandIndex of \p MI.
+static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT,
+                                 unsigned OperandIndex) {
+  const unsigned VecReg = MI->getOperand(OperandIndex).getReg();
+  // Element count = register width / element width.
+  const unsigned NumElts = getVectorRegSize(VecReg) / ScalarVT.getSizeInBits();
+  return MVT::getVectorVT(ScalarVT, NumElts);
+}
+
+/// \brief Extracts the dst type for a given zero extension instruction.
+///
+/// The element type follows from the opcode; the element count follows from
+/// the width of the destination register (operand 0).
+static MVT getZeroExtensionResultType(const MCInst *MI) {
+  MVT::SimpleValueType EltTy;
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown zero extension instruction");
+  // zero extension to i16
+  CASE_PMOVZX(PMOVZXBW, m)
+  CASE_PMOVZX(PMOVZXBW, r)
+    EltTy = MVT::i16;
+    break;
+  // zero extension to i32
+  CASE_PMOVZX(PMOVZXBD, m)
+  CASE_PMOVZX(PMOVZXBD, r)
+  CASE_PMOVZX(PMOVZXWD, m)
+  CASE_PMOVZX(PMOVZXWD, r)
+    EltTy = MVT::i32;
+    break;
+  // zero extension to i64
+  CASE_PMOVZX(PMOVZXBQ, m)
+  CASE_PMOVZX(PMOVZXBQ, r)
+  CASE_PMOVZX(PMOVZXWQ, m)
+  CASE_PMOVZX(PMOVZXWQ, r)
+  CASE_PMOVZX(PMOVZXDQ, m)
+  CASE_PMOVZX(PMOVZXDQ, r)
+    EltTy = MVT::i64;
+    break;
+  }
+  return getRegOperandVectorVT(MI, EltTy, 0);
+}
+
+/// Decorates \p DestName with the AVX512 write-mask annotation of \p MI.
+///
+/// For opcodes listed as merge-masked the result is "<dest> {%<k>}"; for
+/// opcodes listed as zero-masked it is "<dest> {%<k>} {z}". Any other opcode
+/// yields \p DestName unchanged. The mask register name is obtained through
+/// \p getRegName from operand 1 (zero-masked) or operand 2 (merge-masked).
+static std::string getMaskName(const MCInst *MI, const char *DestName,
+                               const char *(*getRegName)(unsigned)) {
+  // Index of the operand holding the mask register, and whether the
+  // instruction zeroes (rather than merges) the masked-off elements.
+  unsigned MaskOpIdx = 0;
+  bool ZeroMasked = false;
+
+  switch (MI->getOpcode()) {
+  default:
+    // Not a masked instruction we know about: no decoration.
+    return DestName;
+  CASE_MASKZ_MOVDUP(MOVDDUP, m)
+  CASE_MASKZ_MOVDUP(MOVDDUP, r)
+  CASE_MASKZ_MOVDUP(MOVSHDUP, m)
+  CASE_MASKZ_MOVDUP(MOVSHDUP, r)
+  CASE_MASKZ_MOVDUP(MOVSLDUP, m)
+  CASE_MASKZ_MOVDUP(MOVSLDUP, r)
+  CASE_MASKZ_PMOVZX(PMOVZXBD, m)
+  CASE_MASKZ_PMOVZX(PMOVZXBD, r)
+  CASE_MASKZ_PMOVZX(PMOVZXBQ, m)
+  CASE_MASKZ_PMOVZX(PMOVZXBQ, r)
+  CASE_MASKZ_PMOVZX(PMOVZXBW, m)
+  CASE_MASKZ_PMOVZX(PMOVZXBW, r)
+  CASE_MASKZ_PMOVZX(PMOVZXDQ, m)
+  CASE_MASKZ_PMOVZX(PMOVZXDQ, r)
+  CASE_MASKZ_PMOVZX(PMOVZXWD, m)
+  CASE_MASKZ_PMOVZX(PMOVZXWD, r)
+  CASE_MASKZ_PMOVZX(PMOVZXWQ, m)
+  CASE_MASKZ_PMOVZX(PMOVZXWQ, r)
+  CASE_MASKZ_UNPCK(PUNPCKHBW, m)
+  CASE_MASKZ_UNPCK(PUNPCKHBW, r)
+  CASE_MASKZ_UNPCK(PUNPCKHWD, m)
+  CASE_MASKZ_UNPCK(PUNPCKHWD, r)
+  CASE_MASKZ_UNPCK(PUNPCKHDQ, m)
+  CASE_MASKZ_UNPCK(PUNPCKHDQ, r)
+  CASE_MASKZ_UNPCK(PUNPCKLBW, m)
+  CASE_MASKZ_UNPCK(PUNPCKLBW, r)
+  CASE_MASKZ_UNPCK(PUNPCKLWD, m)
+  CASE_MASKZ_UNPCK(PUNPCKLWD, r)
+  CASE_MASKZ_UNPCK(PUNPCKLDQ, m)
+  CASE_MASKZ_UNPCK(PUNPCKLDQ, r)
+  CASE_MASKZ_UNPCK(UNPCKHPD, m)
+  CASE_MASKZ_UNPCK(UNPCKHPD, r)
+  CASE_MASKZ_UNPCK(UNPCKHPS, m)
+  CASE_MASKZ_UNPCK(UNPCKHPS, r)
+  CASE_MASKZ_UNPCK(UNPCKLPD, m)
+  CASE_MASKZ_UNPCK(UNPCKLPD, r)
+  CASE_MASKZ_UNPCK(UNPCKLPS, m)
+  CASE_MASKZ_UNPCK(UNPCKLPS, r)
+  CASE_MASKZ_SHUF(PALIGNR, r)
+  CASE_MASKZ_SHUF(PALIGNR, m)
+  CASE_MASKZ_SHUF(ALIGNQ, r)
+  CASE_MASKZ_SHUF(ALIGNQ, m)
+  CASE_MASKZ_SHUF(ALIGND, r)
+  CASE_MASKZ_SHUF(ALIGND, m)
+  CASE_MASKZ_SHUF(SHUFPD, m)
+  CASE_MASKZ_SHUF(SHUFPD, r)
+  CASE_MASKZ_SHUF(SHUFPS, m)
+  CASE_MASKZ_SHUF(SHUFPS, r)
+  CASE_MASKZ_VPERMILPI(PERMILPD, m)
+  CASE_MASKZ_VPERMILPI(PERMILPD, r)
+  CASE_MASKZ_VPERMILPI(PERMILPS, m)
+  CASE_MASKZ_VPERMILPI(PERMILPS, r)
+  CASE_MASKZ_VPERMILPI(PSHUFD, m)
+  CASE_MASKZ_VPERMILPI(PSHUFD, r)
+  CASE_MASKZ_VPERMILPI(PSHUFHW, m)
+  CASE_MASKZ_VPERMILPI(PSHUFHW, r)
+  CASE_MASKZ_VPERMILPI(PSHUFLW, m)
+  CASE_MASKZ_VPERMILPI(PSHUFLW, r)
+  CASE_MASKZ_VPERM(PERMPD, m)
+  CASE_MASKZ_VPERM(PERMPD, r)
+  CASE_MASKZ_VPERM(PERMQ, m)
+  CASE_MASKZ_VPERM(PERMQ, r)
+  CASE_MASKZ_VSHUF(64X2, m)
+  CASE_MASKZ_VSHUF(64X2, r)
+  CASE_MASKZ_VSHUF(32X4, m)
+  CASE_MASKZ_VSHUF(32X4, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTF64X2, Z128, rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI64X2, Z128, rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF64X2, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI64X2, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF64X4, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI64X4, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X4, Z256, rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X4, Z256, rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X4, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X4, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X8, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X8, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, m)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, m)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, m)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, m)
+    // Zero-masking forms: the mask register is operand 1.
+    ZeroMasked = true;
+    MaskOpIdx = 1;
+    break;
+  CASE_MASK_MOVDUP(MOVDDUP, m)
+  CASE_MASK_MOVDUP(MOVDDUP, r)
+  CASE_MASK_MOVDUP(MOVSHDUP, m)
+  CASE_MASK_MOVDUP(MOVSHDUP, r)
+  CASE_MASK_MOVDUP(MOVSLDUP, m)
+  CASE_MASK_MOVDUP(MOVSLDUP, r)
+  CASE_MASK_PMOVZX(PMOVZXBD, m)
+  CASE_MASK_PMOVZX(PMOVZXBD, r)
+  CASE_MASK_PMOVZX(PMOVZXBQ, m)
+  CASE_MASK_PMOVZX(PMOVZXBQ, r)
+  CASE_MASK_PMOVZX(PMOVZXBW, m)
+  CASE_MASK_PMOVZX(PMOVZXBW, r)
+  CASE_MASK_PMOVZX(PMOVZXDQ, m)
+  CASE_MASK_PMOVZX(PMOVZXDQ, r)
+  CASE_MASK_PMOVZX(PMOVZXWD, m)
+  CASE_MASK_PMOVZX(PMOVZXWD, r)
+  CASE_MASK_PMOVZX(PMOVZXWQ, m)
+  CASE_MASK_PMOVZX(PMOVZXWQ, r)
+  CASE_MASK_UNPCK(PUNPCKHBW, m)
+  CASE_MASK_UNPCK(PUNPCKHBW, r)
+  CASE_MASK_UNPCK(PUNPCKHWD, m)
+  CASE_MASK_UNPCK(PUNPCKHWD, r)
+  CASE_MASK_UNPCK(PUNPCKHDQ, m)
+  CASE_MASK_UNPCK(PUNPCKHDQ, r)
+  CASE_MASK_UNPCK(PUNPCKLBW, m)
+  CASE_MASK_UNPCK(PUNPCKLBW, r)
+  CASE_MASK_UNPCK(PUNPCKLWD, m)
+  CASE_MASK_UNPCK(PUNPCKLWD, r)
+  CASE_MASK_UNPCK(PUNPCKLDQ, m)
+  CASE_MASK_UNPCK(PUNPCKLDQ, r)
+  CASE_MASK_UNPCK(UNPCKHPD, m)
+  CASE_MASK_UNPCK(UNPCKHPD, r)
+  CASE_MASK_UNPCK(UNPCKHPS, m)
+  CASE_MASK_UNPCK(UNPCKHPS, r)
+  CASE_MASK_UNPCK(UNPCKLPD, m)
+  CASE_MASK_UNPCK(UNPCKLPD, r)
+  CASE_MASK_UNPCK(UNPCKLPS, m)
+  CASE_MASK_UNPCK(UNPCKLPS, r)
+  CASE_MASK_SHUF(PALIGNR, r)
+  CASE_MASK_SHUF(PALIGNR, m)
+  CASE_MASK_SHUF(ALIGNQ, r)
+  CASE_MASK_SHUF(ALIGNQ, m)
+  CASE_MASK_SHUF(ALIGND, r)
+  CASE_MASK_SHUF(ALIGND, m)
+  CASE_MASK_SHUF(SHUFPD, m)
+  CASE_MASK_SHUF(SHUFPD, r)
+  CASE_MASK_SHUF(SHUFPS, m)
+  CASE_MASK_SHUF(SHUFPS, r)
+  CASE_MASK_VPERMILPI(PERMILPD, m)
+  CASE_MASK_VPERMILPI(PERMILPD, r)
+  CASE_MASK_VPERMILPI(PERMILPS, m)
+  CASE_MASK_VPERMILPI(PERMILPS, r)
+  CASE_MASK_VPERMILPI(PSHUFD, m)
+  CASE_MASK_VPERMILPI(PSHUFD, r)
+  CASE_MASK_VPERMILPI(PSHUFHW, m)
+  CASE_MASK_VPERMILPI(PSHUFHW, r)
+  CASE_MASK_VPERMILPI(PSHUFLW, m)
+  CASE_MASK_VPERMILPI(PSHUFLW, r)
+  CASE_MASK_VPERM(PERMPD, m)
+  CASE_MASK_VPERM(PERMPD, r)
+  CASE_MASK_VPERM(PERMQ, m)
+  CASE_MASK_VPERM(PERMQ, r)
+  CASE_MASK_VSHUF(64X2, m)
+  CASE_MASK_VSHUF(64X2, r)
+  CASE_MASK_VSHUF(32X4, m)
+  CASE_MASK_VSHUF(32X4, r)
+  CASE_MASK_INS_COMMON(BROADCASTF64X2, Z128, rm)
+  CASE_MASK_INS_COMMON(BROADCASTI64X2, Z128, rm)
+  CASE_MASK_INS_COMMON(BROADCASTF64X2, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTI64X2, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTF64X4, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTI64X4, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTF32X4, Z256, rm)
+  CASE_MASK_INS_COMMON(BROADCASTI32X4, Z256, rm)
+  CASE_MASK_INS_COMMON(BROADCASTF32X4, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTI32X4, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTF32X8, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTI32X8, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, r)
+  CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, r)
+  CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, m)
+  CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, m)
+  CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, r)
+  CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, r)
+  CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, m)
+  CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, m)
+    // Merge-masking forms: the mask register is operand 2.
+    MaskOpIdx = 2;
+    break;
+  }
+
+  // MASK:  zmmX {%kY}
+  // MASKZ: zmmX {%kY} {z}
+  std::string Decorated(DestName);
+  Decorated += " {%";
+  Decorated += getRegName(MI->getOperand(MaskOpIdx).getReg());
+  Decorated += "}";
+  if (ZeroMasked)
+    Decorated += " {z}";
+
+  return Decorated;
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - Decodes the shuffle performed by an x86 vector
+/// instruction and prints it to \p OS as a newline-terminated comment of the
+/// form "dst = src1[i,j,...],src2[k,...],zero,...". This information is shown
+/// in disassembly dumps when verbose assembly is enabled.
+///
+/// \param MI         the instruction to describe.
+/// \param OS         stream that receives the comment text.
+/// \param getRegName callback mapping a register number to its printable name.
+/// \returns true if a comment was written; false if the opcode is not handled
+///          here or no shuffle mask could be decoded (for example because the
+///          trailing operand was not an immediate).
+bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                                  const char *(*getRegName)(unsigned)) {
+  // If this is a shuffle operation, the switch should fill in this state.
+  // A source name left as nullptr is printed as "mem".
+  SmallVector<int, 8> ShuffleMask;
+  const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
+  unsigned NumOperands = MI->getNumOperands();
+  // Set by register-form cases before they fall through to the shared
+  // memory-form code, which uses it to pick the operand index (counted back
+  // from the end of the operand list) of the remaining register source.
+  // NOTE(review): the memory-form offsets are 4 larger than the register-form
+  // ones, presumably because an X86 memory reference occupies five MCOperands
+  // where the register form has one -- confirm against the operand layout.
+  bool RegForm = false;
+
+  switch (MI->getOpcode()) {
+  default:
+    // Not an instruction for which we can decode comments.
+    return false;
+
+  case X86::BLENDPDrri:
+  case X86::VBLENDPDrri:
+  case X86::VBLENDPDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::BLENDPDrmi:
+  case X86::VBLENDPDrmi:
+  case X86::VBLENDPDYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::BLENDPSrri:
+  case X86::VBLENDPSrri:
+  case X86::VBLENDPSYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::BLENDPSrmi:
+  case X86::VBLENDPSrmi:
+  case X86::VBLENDPSYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::PBLENDWrri:
+  case X86::VPBLENDWrri:
+  case X86::VPBLENDWYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::PBLENDWrmi:
+  case X86::VPBLENDWrmi:
+  case X86::VPBLENDWYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::VPBLENDDrri:
+  case X86::VPBLENDDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::VPBLENDDrmi:
+  case X86::VPBLENDDYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+  case X86::VINSERTPSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::INSERTPSrm:
+  case X86::VINSERTPSrm:
+  case X86::VINSERTPSZrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeINSERTPSMask(MI->getOperand(NumOperands - 1).getImm(),
+                         ShuffleMask);
+    break;
+
+  case X86::MOVLHPSrr:
+  case X86::VMOVLHPSrr:
+  case X86::VMOVLHPSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVLHPSMask(2, ShuffleMask);
+    break;
+
+  case X86::MOVHLPSrr:
+  case X86::VMOVHLPSrr:
+  case X86::VMOVHLPSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVHLPSMask(2, ShuffleMask);
+    break;
+
+  case X86::MOVHPDrm:
+  case X86::VMOVHPDrm:
+  case X86::VMOVHPDZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v2f64, 1, 1, ShuffleMask);
+    break;
+
+  case X86::MOVHPSrm:
+  case X86::VMOVHPSrm:
+  case X86::VMOVHPSZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v4f32, 2, 2, ShuffleMask);
+    break;
+
+  case X86::MOVLPDrm:
+  case X86::VMOVLPDrm:
+  case X86::VMOVLPDZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v2f64, 0, 1, ShuffleMask);
+    break;
+
+  case X86::MOVLPSrm:
+  case X86::VMOVLPSrm:
+  case X86::VMOVLPSZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v4f32, 0, 2, ShuffleMask);
+    break;
+
+  CASE_MOVDUP(MOVSLDUP, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_MOVDUP(MOVSLDUP, m)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+    break;
+
+  CASE_MOVDUP(MOVSHDUP, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_MOVDUP(MOVSHDUP, m)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+    break;
+
+  CASE_MOVDUP(MOVDDUP, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_MOVDUP(MOVDDUP, m)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+    break;
+
+  case X86::PSLLDQri:
+  case X86::VPSLLDQri:
+  case X86::VPSLLDQYri:
+  case X86::VPSLLDQZ128rr:
+  case X86::VPSLLDQZ256rr:
+  case X86::VPSLLDQZ512rr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::VPSLLDQZ128rm:
+  case X86::VPSLLDQZ256rm:
+  case X86::VPSLLDQZ512rm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+                       MI->getOperand(NumOperands - 1).getImm(),
+                       ShuffleMask);
+    break;
+
+  case X86::PSRLDQri:
+  case X86::VPSRLDQri:
+  case X86::VPSRLDQYri:
+  case X86::VPSRLDQZ128rr:
+  case X86::VPSRLDQZ256rr:
+  case X86::VPSRLDQZ512rr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::VPSRLDQZ128rm:
+  case X86::VPSRLDQZ256rm:
+  case X86::VPSRLDQZ512rm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+                       MI->getOperand(NumOperands - 1).getImm(),
+                       ShuffleMask);
+    break;
+
+  CASE_SHUF(PALIGNR, rri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(PALIGNR, rmi)
+    Src2Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 3 : 7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+                        MI->getOperand(NumOperands - 1).getImm(),
+                        ShuffleMask);
+    break;
+
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z, rri)
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rri)
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z, rmi)
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rmi)
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rmi)
+    Src2Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 3 : 7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+                       MI->getOperand(NumOperands - 1).getImm(),
+                       ShuffleMask);
+    break;
+
+  CASE_AVX512_INS_COMMON(ALIGND, Z, rri)
+  CASE_AVX512_INS_COMMON(ALIGND, Z256, rri)
+  CASE_AVX512_INS_COMMON(ALIGND, Z128, rri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_AVX512_INS_COMMON(ALIGND, Z, rmi)
+  CASE_AVX512_INS_COMMON(ALIGND, Z256, rmi)
+  CASE_AVX512_INS_COMMON(ALIGND, Z128, rmi)
+    Src2Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 3 : 7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+                       MI->getOperand(NumOperands - 1).getImm(),
+                       ShuffleMask);
+    break;
+
+  CASE_SHUF(PSHUFD, ri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(PSHUFD, mi)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    break;
+
+  CASE_SHUF(PSHUFHW, ri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(PSHUFHW, mi)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+                        MI->getOperand(NumOperands - 1).getImm(),
+                        ShuffleMask);
+    break;
+
+  CASE_SHUF(PSHUFLW, ri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(PSHUFLW, mi)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+                        MI->getOperand(NumOperands - 1).getImm(),
+                        ShuffleMask);
+    break;
+
+  case X86::MMX_PSHUFWri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::MMX_PSHUFWmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFMask(MVT::v4i16,
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    break;
+
+  case X86::PSWAPDrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::PSWAPDrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePSWAPMask(MVT::v2i32, ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKHBW, r)
+  case X86::MMX_PUNPCKHBWirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKHBW, m)
+  case X86::MMX_PUNPCKHBWirm:
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKHWD, r)
+  case X86::MMX_PUNPCKHWDirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKHWD, m)
+  case X86::MMX_PUNPCKHWDirm:
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKHDQ, r)
+  case X86::MMX_PUNPCKHDQirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKHDQ, m)
+  case X86::MMX_PUNPCKHDQirm:
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKHQDQ, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKHQDQ, m)
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKLBW, r)
+  case X86::MMX_PUNPCKLBWirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKLBW, m)
+  case X86::MMX_PUNPCKLBWirm:
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKLWD, r)
+  case X86::MMX_PUNPCKLWDirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKLWD, m)
+  case X86::MMX_PUNPCKLWDirm:
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKLDQ, r)
+  case X86::MMX_PUNPCKLDQirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKLDQ, m)
+  case X86::MMX_PUNPCKLDQirm:
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKLQDQ, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKLQDQ, m)
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+    break;
+
+  CASE_SHUF(SHUFPD, rri)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(SHUFPD, rmi)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 3 : 7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_SHUF(SHUFPS, rri)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(SHUFPS, rmi)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 3 : 7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VSHUF(64X2, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_VSHUF(64X2, m)
+    decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+                              MI->getOperand(NumOperands - 1).getImm(),
+                              ShuffleMask);
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 3 : 7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VSHUF(32X4, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_VSHUF(32X4, m)
+    decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+                              MI->getOperand(NumOperands - 1).getImm(),
+                              ShuffleMask);
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 3 : 7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_UNPCK(UNPCKLPD, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(UNPCKLPD, m)
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_UNPCK(UNPCKLPS, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(UNPCKLPS, m)
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_UNPCK(UNPCKHPD, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(UNPCKHPD, m)
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_UNPCK(UNPCKHPS, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(UNPCKHPS, m)
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+    Src1Name =
+        getRegName(MI->getOperand(NumOperands - (RegForm ? 2 : 6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VPERMILPI(PERMILPS, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_VPERMILPI(PERMILPS, m)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VPERMILPI(PERMILPD, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_VPERMILPI(PERMILPD, m)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::VPERM2F128rr:
+  case X86::VPERM2I128rr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::VPERM2F128rm:
+  case X86::VPERM2I128rm:
+    // For instruction comments purpose, assume the 256-bit vector is v4i64.
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVPERM2X128Mask(MVT::v4i64,
+                           MI->getOperand(NumOperands - 1).getImm(),
+                           ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VPERM(PERMPD, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_VPERM(PERMPD, m)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VPERM(PERMQ, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_VPERM(PERMQ, m)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVSDrr:
+  case X86::VMOVSDrr:
+  case X86::VMOVSDZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::MOVSDrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVSDZrm:
+    // Src2Name is still null here exactly when we entered at the load form.
+    DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVSSrr:
+  case X86::VMOVSSrr:
+  case X86::VMOVSSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::MOVSSrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSSZrm:
+    // Src2Name is still null here exactly when we entered at the load form.
+    DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVPQI2QIrr:
+  case X86::MOVZPQILo2PQIrr:
+  case X86::VMOVPQI2QIrr:
+  case X86::VMOVZPQILo2PQIrr:
+  case X86::VMOVZPQILo2PQIZrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::MOVQI2PQIrm:
+  case X86::VMOVQI2PQIrm:
+  case X86::VMOVQI2PQIZrm:
+    DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVDI2PDIrm:
+  case X86::VMOVDI2PDIrm:
+  case X86::VMOVDI2PDIZrm:
+    DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::EXTRQI:
+    if (MI->getOperand(2).isImm() &&
+        MI->getOperand(3).isImm())
+      DecodeEXTRQIMask(MI->getOperand(2).getImm(),
+                       MI->getOperand(3).getImm(),
+                       ShuffleMask);
+
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    break;
+
+  case X86::INSERTQI:
+    if (MI->getOperand(3).isImm() &&
+        MI->getOperand(4).isImm())
+      DecodeINSERTQIMask(MI->getOperand(3).getImm(),
+                         MI->getOperand(4).getImm(),
+                         ShuffleMask);
+
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    break;
+
+  case X86::VBROADCASTF128:
+  case X86::VBROADCASTI128:
+  CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm)
+    DecodeSubVectorBroadcast(MVT::v4f64, MVT::v2f64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm)
+    DecodeSubVectorBroadcast(MVT::v8f64, MVT::v2f64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm)
+    DecodeSubVectorBroadcast(MVT::v8f64, MVT::v4f64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z256, rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z256, rm)
+    DecodeSubVectorBroadcast(MVT::v8f32, MVT::v4f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm)
+    DecodeSubVectorBroadcast(MVT::v16f32, MVT::v4f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm)
+    DecodeSubVectorBroadcast(MVT::v16f32, MVT::v8f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m)
+    DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m)
+    DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_PMOVZX(PMOVZXBW, r)
+  CASE_PMOVZX(PMOVZXBD, r)
+  CASE_PMOVZX(PMOVZXBQ, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_PMOVZX(PMOVZXBW, m)
+  CASE_PMOVZX(PMOVZXBD, m)
+  CASE_PMOVZX(PMOVZXBQ, m)
+    DecodeZeroExtendMask(MVT::i8, getZeroExtensionResultType(MI), ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_PMOVZX(PMOVZXWD, r)
+  CASE_PMOVZX(PMOVZXWQ, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_PMOVZX(PMOVZXWD, m)
+  CASE_PMOVZX(PMOVZXWQ, m)
+    DecodeZeroExtendMask(MVT::i16, getZeroExtensionResultType(MI), ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_PMOVZX(PMOVZXDQ, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_PMOVZX(PMOVZXDQ, m)
+    DecodeZeroExtendMask(MVT::i32, getZeroExtensionResultType(MI), ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  }
+
+  // The only comments we decode are shuffles, so give up if we were unable to
+  // decode a shuffle mask.
+  if (ShuffleMask.empty())
+    return false;
+
+  if (!DestName) DestName = Src1Name;
+  OS << (DestName ? getMaskName(MI, DestName, getRegName) : "mem") << " = ";
+
+  // If the two sources are the same, canonicalize the input elements to be
+  // from the first src so that we get larger element spans. The names are
+  // compared by pointer, not by content.
+  if (Src1Name == Src2Name) {
+    for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+      if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+          ShuffleMask[i] >= (int)e)   // From second mask.
+        ShuffleMask[i] -= e;
+    }
+  }
+
+  // The shuffle mask specifies which elements of the src1/src2 fill in the
+  // destination, with a few sentinel values. Loop through and print them
+  // out.
+  for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+    if (i != 0)
+      OS << ',';
+    if (ShuffleMask[i] == SM_SentinelZero) {
+      OS << "zero";
+      continue;
+    }
+
+    // Otherwise, it must come from src1 or src2. Print the span of elements
+    // that comes from this src. Indices below the mask size select src1,
+    // the rest select src2.
+    bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+    const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+    OS << (SrcName ? SrcName : "mem") << '[';
+    bool IsFirst = true;
+    while (i != e && (int)ShuffleMask[i] != SM_SentinelZero &&
+           (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+      if (!IsFirst)
+        OS << ',';
+      else
+        IsFirst = false;
+      if (ShuffleMask[i] == SM_SentinelUndef)
+        OS << "u";
+      else
+        OS << ShuffleMask[i] % ShuffleMask.size();
+      ++i;
+    }
+    OS << ']';
+    --i; // For loop increments element #.
+  }
+  OS << "\n";
+
+  // We successfully added a comment to this instruction.
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
new file mode 100644
index 000000000000..c6d0d85a7d3d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -0,0 +1,30 @@
+//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+
+namespace llvm {
+
+  class MCInst;
+  class raw_ostream;
+
+  /// Flag values attached to instructions for asm comment purposes.
+  enum AsmComments {
+    // Marks an instruction that was compressed from EVEX to VEX.
+    AC_EVEX_2_VEX = 0x2
+  };
+
+  /// Emit a verbose-asm comment for \p MI to \p OS, using \p getRegName to
+  /// turn register numbers into names.
+  /// \return true if a comment was added for the instruction.
+  bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                              const char *(*getRegName)(unsigned));
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
new file mode 100644
index 000000000000..4443edb8e342
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -0,0 +1,260 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "X86GenAsmWriter1.inc"
+
+/// Write the name of register \p RegNo to \p OS.
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  const char *Name = getRegisterName(RegNo);
+  OS << Name;
+}
+
+/// Print \p MI to \p OS, followed by the annotation \p Annot and, when a
+/// comment stream is attached, any comments decoded for the instruction.
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                    StringRef Annot,
+                                    const MCSubtargetInfo &STI) {
+  // Instructions flagged with X86II::LOCK get the prefix on its own line.
+  const uint64_t TSFlags = MII.get(MI->getOpcode()).TSFlags;
+  const bool HasLockPrefix = (TSFlags & X86II::LOCK) != 0;
+  if (HasLockPrefix)
+    OS << "\tlock\n";
+
+  printInstruction(MI, OS);
+
+  // The annotation is printed unconditionally.
+  printAnnotation(OS, Annot);
+
+  // Informative comments are only produced when verbose assembly is enabled,
+  // i.e. when a comment stream is present.
+  if (!CommentStream)
+    return;
+  EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+}
+
+/// Print the condition-code name selected by immediate operand \p Op of
+/// \p MI. Values outside 0x00-0x1f are treated as unreachable.
+void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
+                                        raw_ostream &O) {
+  // Condition names indexed by the immediate value.
+  static const char *const CondNames[] = {
+      /*0x00*/ "eq",      "lt",     "le",     "unord",
+      /*0x04*/ "neq",     "nlt",    "nle",    "ord",
+      /*0x08*/ "eq_uq",   "nge",    "ngt",    "false",
+      /*0x0c*/ "neq_oq",  "ge",     "gt",     "true",
+      /*0x10*/ "eq_os",   "lt_oq",  "le_oq",  "unord_s",
+      /*0x14*/ "neq_us",  "nlt_uq", "nle_uq", "ord_s",
+      /*0x18*/ "eq_us",   "nge_uq", "ngt_uq", "false_os",
+      /*0x1c*/ "neq_os",  "ge_oq",  "gt_oq",  "true_us"};
+
+  const int64_t Imm = MI->getOperand(Op).getImm();
+  if (Imm < 0 || Imm > 0x1f)
+    llvm_unreachable("Invalid avxcc argument!");
+  O << CondNames[Imm];
+}
+
+/// Print the XOP condition-code name selected by immediate operand \p Op of
+/// \p MI. Values outside 0-7 are treated as unreachable.
+void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  // Condition names indexed by the immediate value.
+  static const char *const CondNames[] = {"lt", "le",  "gt",    "ge",
+                                          "eq", "neq", "false", "true"};
+
+  const int64_t Imm = MI->getOperand(Op).getImm();
+  if (Imm < 0 || Imm > 7)
+    llvm_unreachable("Invalid xopcc argument!");
+  O << CondNames[Imm];
+}
+
+/// Print the rounding-control annotation selected by the low two bits of
+/// immediate operand \p Op of \p MI.
+void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+                                               raw_ostream &O) {
+  // Annotations indexed by the masked immediate (always in the range 0-3).
+  static const char *const Modes[] = {"{rn-sae}", "{rd-sae}", "{ru-sae}",
+                                      "{rz-sae}"};
+
+  const int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+  O << Modes[Imm];
+}
+
+/// printPCRelImm - Print operand \p OpNo of \p MI, an immediate value that
+/// ends up being encoded as a pc-relative value.
+void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm()) {
+    O << formatImm(Op.getImm());
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown pcrel immediate operand");
+  const MCExpr *Target = Op.getExpr();
+
+  // A symbolic branch target that was added as a constant expression is
+  // printed as an address in hex.
+  int64_t Address;
+  if (const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(Target)) {
+    if (Const->evaluateAsAbsolute(Address)) {
+      O << formatHex((uint64_t)Address);
+      return;
+    }
+  }
+
+  // Anything else is printed as the expression itself.
+  Target->print(O, &MAI);
+}
+
+/// Print operand \p OpNo of \p MI: a register by name, an immediate through
+/// formatImm, and anything else as an expression.
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Operand = MI->getOperand(OpNo);
+
+  if (Operand.isReg()) {
+    printRegName(O, Operand.getReg());
+    return;
+  }
+  if (Operand.isImm()) {
+    O << formatImm((int64_t)Operand.getImm());
+    return;
+  }
+
+  assert(Operand.isExpr() && "unknown operand kind in printOperand");
+  Operand.getExpr()->print(O, &MAI);
+}
+
+/// Print the memory reference whose operands start at index \p Op of \p MI
+/// in the form "seg:[base + scale*index + disp]", omitting absent parts.
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+                                            raw_ostream &O) {
+  const MCOperand &Base = MI->getOperand(Op+X86::AddrBaseReg);
+  const unsigned Scale = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+  const MCOperand &Index = MI->getOperand(Op+X86::AddrIndexReg);
+  const MCOperand &Disp = MI->getOperand(Op+X86::AddrDisp);
+  const MCOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
+
+  // Segment register, when one is set.
+  if (Segment.getReg()) {
+    printOperand(MI, Op+X86::AddrSegmentReg, O);
+    O << ':';
+  }
+
+  O << '[';
+
+  // Becomes true once a term has been printed inside the brackets, so that
+  // later terms know to emit a separator first.
+  bool PrintedTerm = false;
+
+  const bool HasBase = Base.getReg() != 0;
+  if (HasBase) {
+    printOperand(MI, Op+X86::AddrBaseReg, O);
+    PrintedTerm = true;
+  }
+
+  const bool HasIndex = Index.getReg() != 0;
+  if (HasIndex) {
+    if (PrintedTerm) O << " + ";
+    if (Scale != 1)
+      O << Scale << '*';
+    printOperand(MI, Op+X86::AddrIndexReg, O);
+    PrintedTerm = true;
+  }
+
+  if (Disp.isImm()) {
+    int64_t DispVal = Disp.getImm();
+    // A zero displacement is printed only when there is no base and no index.
+    if (DispVal != 0 || (!HasIndex && !HasBase)) {
+      if (PrintedTerm) {
+        // The sign doubles as the separator; the magnitude follows it.
+        const bool IsPositive = DispVal > 0;
+        O << (IsPositive ? " + " : " - ");
+        if (!IsPositive)
+          DispVal = -DispVal;
+      }
+      O << formatImm(DispVal);
+    }
+  } else {
+    if (PrintedTerm) O << " + ";
+    assert(Disp.isExpr() && "non-immediate displacement for LEA?");
+    Disp.getExpr()->print(O, &MAI);
+  }
+
+  O << ']';
+}
+
+/// Print operand \p Op of \p MI in brackets, preceded by the segment register
+/// held in operand \p Op + 1 when that register is set.
+void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+                                      raw_ostream &O) {
+  const unsigned SegOpNo = Op+1;
+
+  if (MI->getOperand(SegOpNo).getReg()) {
+    printOperand(MI, SegOpNo, O);
+    O << ':';
+  }
+
+  O << '[';
+  printOperand(MI, Op, O);
+  O << ']';
+}
+
+/// Print operand \p Op of \p MI in brackets with a fixed "es:" segment.
+void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+                                      raw_ostream &O) {
+  // No segment operand is consulted: DI accesses are always ES-based.
+  O << "es:[";
+  printOperand(MI, Op, O);
+  O << ']';
+}
+
+/// Print the memory offset in operand \p Op of \p MI in brackets, preceded by
+/// the segment register held in operand \p Op + 1 when that register is set.
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                         raw_ostream &O) {
+  const MCOperand &Offset = MI->getOperand(Op);
+  const unsigned SegOpNo = Op+1;
+
+  if (MI->getOperand(SegOpNo).getReg()) {
+    printOperand(MI, SegOpNo, O);
+    O << ':';
+  }
+
+  O << '[';
+
+  if (!Offset.isImm()) {
+    assert(Offset.isExpr() && "non-immediate displacement?");
+    Offset.getExpr()->print(O, &MAI);
+  } else {
+    O << formatImm(Offset.getImm());
+  }
+
+  O << ']';
+}
+
+/// Print operand \p Op of \p MI: an expression as-is, otherwise the low
+/// eight bits of the immediate.
+void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  const MCOperand &Operand = MI->getOperand(Op);
+  if (Operand.isExpr()) {
+    Operand.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  O << formatImm(Operand.getImm() & 0xff);
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
new file mode 100644
index 000000000000..20cd7ffb2e63
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -0,0 +1,162 @@
+//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to Intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// Prints an X86 MCInst using Intel style .s file syntax.
+class X86IntelInstPrinter final : public MCInstPrinter {
+public:
+  X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                      const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  // Operand printers that are only declared here.
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
+
+  // Memory reference with no size prefix.
+  void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  // Memory references: each printer writes its size prefix to the stream and
+  // then prints the reference itself.
+  void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "opaque ptr ");
+  }
+  void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "byte ptr ");
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "word ptr ");
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "dword ptr ");
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "qword ptr ");
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "xmmword ptr ");
+  }
+  void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "ymmword ptr ");
+  }
+  void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "zmmword ptr ");
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "dword ptr ");
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "qword ptr ");
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "xword ptr ");
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "xmmword ptr ");
+  }
+  void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "ymmword ptr ");
+  }
+  void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O << "zmmword ptr ");
+  }
+
+  // Source-index operands with a size prefix.
+  void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O << "byte ptr ");
+  }
+  void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O << "word ptr ");
+  }
+  void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O << "dword ptr ");
+  }
+  void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O << "qword ptr ");
+  }
+
+  // Destination-index operands with a size prefix.
+  void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O << "byte ptr ");
+  }
+  void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O << "word ptr ");
+  }
+  void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O << "dword ptr ");
+  }
+  void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O << "qword ptr ");
+  }
+
+  // Memory-offset operands with a size prefix.
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O << "byte ptr ");
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O << "word ptr ");
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O << "dword ptr ");
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O << "qword ptr ");
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
new file mode 100644
index 000000000000..e83ec9f4045a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -0,0 +1,881 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// Return log2 of the size in bytes associated with fixup kind \p Kind.
+/// Kinds not listed here are treated as unreachable.
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  // One byte.
+  case FK_Data_1:
+  case FK_PCRel_1:
+  case FK_SecRel_1:
+    return 0;
+
+  // Two bytes.
+  case FK_Data_2:
+  case FK_PCRel_2:
+  case FK_SecRel_2:
+    return 1;
+
+  // Four bytes.
+  case FK_Data_4:
+  case FK_PCRel_4:
+  case FK_SecRel_4:
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_riprel_4byte_relax:
+  case X86::reloc_riprel_4byte_relax_rex:
+  case X86::reloc_signed_4byte:
+  case X86::reloc_signed_4byte_relax:
+  case X86::reloc_global_offset_table:
+    return 2;
+
+  // Eight bytes.
+  case FK_Data_8:
+  case FK_PCRel_8:
+  case FK_SecRel_8:
+  case X86::reloc_global_offset_table8:
+    return 3;
+
+  default:
+    break;
+  }
+  llvm_unreachable("invalid fixup kind!");
+}
+
+namespace {
+
+/// ELF object target writer that forwards its configuration to
+/// MCELFObjectTargetWriter unchanged.
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  /// The trailing \p foobar argument is accepted but not used.
+  X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
+                     bool HasRelocationAddend, bool foobar)
+      : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine,
+                                HasRelocationAddend) {}
+};
+
+/// Assembler backend shared by the X86 object formats: describes the target
+/// fixup kinds, applies fixups, relaxes instructions and writes nop padding.
+class X86AsmBackend : public MCAsmBackend {
+  const StringRef CPU;
+  bool HasNopl;                ///< False for the CPUs listed in the ctor.
+  const uint64_t MaxNopLength; ///< 7 for "slm", 15 for every other CPU.
+public:
+  X86AsmBackend(const Target &T, StringRef CPU)
+      : MCAsmBackend(), CPU(CPU),
+        MaxNopLength((CPU == "slm") ? 7 : 15) {
+    // CPU names for which HasNopl is cleared.
+    static const char *const CPUsWithoutNopl[] = {
+        "generic", "i386", "i486",  "i586",       "pentium",  "pentium-mmx",
+        "i686",    "k6",   "k6-2",  "k6-3",       "geode",    "winchip-c6",
+        "winchip2", "c3",  "c3-2",  "lakemont"};
+
+    HasNopl = true;
+    for (const char *Name : CPUsWithoutNopl) {
+      if (CPU == Name) {
+        HasNopl = false;
+        break;
+      }
+    }
+  }
+
+  unsigned getNumFixupKinds() const override {
+    return X86::NumTargetFixupKinds;
+  }
+
+  /// Return the description of \p Kind; kinds below FirstTargetFixupKind are
+  /// answered by the base class.
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+        {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"reloc_signed_4byte", 0, 32, 0},
+        {"reloc_signed_4byte_relax", 0, 32, 0},
+        {"reloc_global_offset_table", 0, 32, 0},
+        {"reloc_global_offset_table8", 0, 64, 0},
+    };
+
+    if (Kind >= FirstTargetFixupKind) {
+      assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+             "Invalid kind!");
+      return Infos[Kind - FirstTargetFixupKind];
+    }
+    return MCAsmBackend::getFixupKindInfo(Kind);
+  }
+
+  /// Store the low bytes of \p Value into \p Data at the fixup's offset,
+  /// least significant byte first.
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override {
+    const unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+    assert(Fixup.getOffset() + Size <= DataSize &&
+           "Invalid fixup offset!");
+
+    // Check that upper bits are either all zeros or all ones.
+    // Specifically ignore overflow/underflow as long as the leakage is
+    // limited to the lower bits. This is to remain compatible with
+    // other assemblers.
+    assert(isIntN(Size * 8 + 1, Value) &&
+           "Value does not fit in the Fixup field");
+
+    uint64_t Remaining = Value;
+    for (unsigned i = 0; i != Size; ++i) {
+      Data[Fixup.getOffset() + i] = uint8_t(Remaining);
+      Remaining >>= 8;
+    }
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override;
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override;
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+} // end anonymous namespace
+
+/// Return the relaxed form of the branch opcode of \p Inst: the `_2` variant
+/// when \p is16BitMode is set, the `_4` variant otherwise. Opcodes that are
+/// not listed are returned unchanged.
+static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) {
+  const unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  case X86::JAE_1: return is16BitMode ? X86::JAE_2 : X86::JAE_4;
+  case X86::JA_1:  return is16BitMode ? X86::JA_2  : X86::JA_4;
+  case X86::JBE_1: return is16BitMode ? X86::JBE_2 : X86::JBE_4;
+  case X86::JB_1:  return is16BitMode ? X86::JB_2  : X86::JB_4;
+  case X86::JE_1:  return is16BitMode ? X86::JE_2  : X86::JE_4;
+  case X86::JGE_1: return is16BitMode ? X86::JGE_2 : X86::JGE_4;
+  case X86::JG_1:  return is16BitMode ? X86::JG_2  : X86::JG_4;
+  case X86::JLE_1: return is16BitMode ? X86::JLE_2 : X86::JLE_4;
+  case X86::JL_1:  return is16BitMode ? X86::JL_2  : X86::JL_4;
+  case X86::JMP_1: return is16BitMode ? X86::JMP_2 : X86::JMP_4;
+  case X86::JNE_1: return is16BitMode ? X86::JNE_2 : X86::JNE_4;
+  case X86::JNO_1: return is16BitMode ? X86::JNO_2 : X86::JNO_4;
+  case X86::JNP_1: return is16BitMode ? X86::JNP_2 : X86::JNP_4;
+  case X86::JNS_1: return is16BitMode ? X86::JNS_2 : X86::JNS_4;
+  case X86::JO_1:  return is16BitMode ? X86::JO_2  : X86::JO_4;
+  case X86::JP_1:  return is16BitMode ? X86::JP_2  : X86::JP_4;
+  case X86::JS_1:  return is16BitMode ? X86::JS_2  : X86::JS_4;
+  default:
+    break;
+  }
+  return Opcode;
+}
+
+/// Return the relaxed form of the arithmetic or push opcode of \p Inst: each
+/// opcode named with an `8` immediate suffix maps to the opcode listed beside
+/// it. Opcodes that are not listed are returned unchanged.
+static unsigned getRelaxedOpcodeArith(const MCInst &Inst) {
+  switch (Inst.getOpcode()) {
+  // IMUL
+  case X86::IMUL16rri8: return X86::IMUL16rri;
+  case X86::IMUL16rmi8: return X86::IMUL16rmi;
+  case X86::IMUL32rri8: return X86::IMUL32rri;
+  case X86::IMUL32rmi8: return X86::IMUL32rmi;
+  case X86::IMUL64rri8: return X86::IMUL64rri32;
+  case X86::IMUL64rmi8: return X86::IMUL64rmi32;
+
+  // AND
+  case X86::AND16ri8: return X86::AND16ri;
+  case X86::AND16mi8: return X86::AND16mi;
+  case X86::AND32ri8: return X86::AND32ri;
+  case X86::AND32mi8: return X86::AND32mi;
+  case X86::AND64ri8: return X86::AND64ri32;
+  case X86::AND64mi8: return X86::AND64mi32;
+
+  // OR
+  case X86::OR16ri8: return X86::OR16ri;
+  case X86::OR16mi8: return X86::OR16mi;
+  case X86::OR32ri8: return X86::OR32ri;
+  case X86::OR32mi8: return X86::OR32mi;
+  case X86::OR64ri8: return X86::OR64ri32;
+  case X86::OR64mi8: return X86::OR64mi32;
+
+  // XOR
+  case X86::XOR16ri8: return X86::XOR16ri;
+  case X86::XOR16mi8: return X86::XOR16mi;
+  case X86::XOR32ri8: return X86::XOR32ri;
+  case X86::XOR32mi8: return X86::XOR32mi;
+  case X86::XOR64ri8: return X86::XOR64ri32;
+  case X86::XOR64mi8: return X86::XOR64mi32;
+
+  // ADD
+  case X86::ADD16ri8: return X86::ADD16ri;
+  case X86::ADD16mi8: return X86::ADD16mi;
+  case X86::ADD32ri8: return X86::ADD32ri;
+  case X86::ADD32mi8: return X86::ADD32mi;
+  case X86::ADD64ri8: return X86::ADD64ri32;
+  case X86::ADD64mi8: return X86::ADD64mi32;
+
+  // ADC
+  case X86::ADC16ri8: return X86::ADC16ri;
+  case X86::ADC16mi8: return X86::ADC16mi;
+  case X86::ADC32ri8: return X86::ADC32ri;
+  case X86::ADC32mi8: return X86::ADC32mi;
+  case X86::ADC64ri8: return X86::ADC64ri32;
+  case X86::ADC64mi8: return X86::ADC64mi32;
+
+  // SUB
+  case X86::SUB16ri8: return X86::SUB16ri;
+  case X86::SUB16mi8: return X86::SUB16mi;
+  case X86::SUB32ri8: return X86::SUB32ri;
+  case X86::SUB32mi8: return X86::SUB32mi;
+  case X86::SUB64ri8: return X86::SUB64ri32;
+  case X86::SUB64mi8: return X86::SUB64mi32;
+
+  // SBB
+  case X86::SBB16ri8: return X86::SBB16ri;
+  case X86::SBB16mi8: return X86::SBB16mi;
+  case X86::SBB32ri8: return X86::SBB32ri;
+  case X86::SBB32mi8: return X86::SBB32mi;
+  case X86::SBB64ri8: return X86::SBB64ri32;
+  case X86::SBB64mi8: return X86::SBB64mi32;
+
+  // CMP
+  case X86::CMP16ri8: return X86::CMP16ri;
+  case X86::CMP16mi8: return X86::CMP16mi;
+  case X86::CMP32ri8: return X86::CMP32ri;
+  case X86::CMP32mi8: return X86::CMP32mi;
+  case X86::CMP64ri8: return X86::CMP64ri32;
+  case X86::CMP64mi8: return X86::CMP64mi32;
+
+  // PUSH
+  case X86::PUSH32i8: return X86::PUSHi32;
+  case X86::PUSH16i8: return X86::PUSHi16;
+  case X86::PUSH64i8: return X86::PUSH64i32;
+
+  default:
+    break;
+  }
+  return Inst.getOpcode();
+}
+
+/// Return the relaxed opcode for \p Inst, consulting the arithmetic mapping
+/// first and the branch mapping second. The original opcode is returned when
+/// neither mapping applies.
+static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
+  const unsigned ArithOp = getRelaxedOpcodeArith(Inst);
+  return ArithOp != Inst.getOpcode()
+             ? ArithOp
+             : getRelaxedOpcodeBranch(Inst, is16BitMode);
+}
+
+/// Return true if \p Inst is a relaxable branch, or a relaxable arithmetic
+/// instruction whose last operand is an expression.
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  const unsigned Opcode = Inst.getOpcode();
+
+  // Branches can always be relaxed in either mode.
+  if (getRelaxedOpcodeBranch(Inst, false) != Opcode)
+    return true;
+
+  // An opcode with no relaxed arithmetic form is never relaxable.
+  if (getRelaxedOpcodeArith(Inst) == Opcode)
+    return false;
+
+  // For the current set of relaxable instructions, the relaxable operand is
+  // always the last operand; relaxation may be needed only when it holds an
+  // expression.
+  const unsigned LastOp = Inst.getNumOperands() - 1;
+  return Inst.getOperand(LastOp).isExpr();
+}
+
+/// Return true if \p Value does not fit in a signed 8-bit field.
+bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                         uint64_t Value,
+                                         const MCRelaxableFragment *DF,
+                                         const MCAsmLayout &Layout) const {
+  // Truncate to a signed i8 and sign-extend back; any difference means the
+  // value is too big for the short form.
+  const int64_t Full = static_cast<int64_t>(Value);
+  const int64_t AsByte = static_cast<int8_t>(Value);
+  return Full != AsByte;
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+/// Copy \p Inst into \p Res with its opcode replaced by the relaxed opcode.
+/// Reports a fatal error when \p Inst has no relaxed form.
+void X86AsmBackend::relaxInstruction(const MCInst &Inst,
+                                     const MCSubtargetInfo &STI,
+                                     MCInst &Res) const {
+  // 16-bit mode selects a different relaxed branch opcode.
+  const bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+  const unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode);
+
+  if (RelaxedOp != Inst.getOpcode()) {
+    Res = Inst;
+    Res.setOpcode(RelaxedOp);
+    return;
+  }
+
+  // Nothing to relax to: dump the instruction into the error message.
+  SmallString<256> Tmp;
+  raw_svector_ostream OS(Tmp);
+  Inst.dump_pretty(OS);
+  OS << "\n";
+  report_fatal_error("unexpected instruction to relax: " + OS.str());
+}
+
+/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// bytes.
+/// \return - true on success, false on failure
+bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  // Row N-1 holds the encoding of the N-byte nop.
+  static const uint8_t Nops[10][10] = {
+    // nop
+    {0x90},
+    // xchg %ax,%ax
+    {0x66, 0x90},
+    // nopl (%[re]ax)
+    {0x0f, 0x1f, 0x00},
+    // nopl 0(%[re]ax)
+    {0x0f, 0x1f, 0x40, 0x00},
+    // nopl 0(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopw 0(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopl 0L(%[re]ax)
+    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+    // nopl 0L(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw 0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw %cs:0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+  };
+
+  // This CPU doesn't support long nops. If needed add more.
+  // FIXME: Can we get this from the subtarget somehow?
+  // FIXME: We could generated something better than plain 0x90.
+  if (!HasNopl) {
+    for (uint64_t Remaining = Count; Remaining != 0; --Remaining)
+      OW->write8(0x90);
+    return true;
+  }
+
+  // Emit nops of at most MaxNopLength bytes until Count bytes are covered.
+  // A nop longer than 10 bytes is a 10-byte nop preceded by 0x66 prefixes.
+  while (Count != 0) {
+    const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
+    const uint8_t NumPrefixes = ThisNopLength > 10 ? ThisNopLength - 10 : 0;
+    for (uint8_t i = 0; i < NumPrefixes; i++)
+      OW->write8(0x66);
+
+    const uint8_t BodyLength = ThisNopLength - NumPrefixes;
+    const uint8_t *Body = Nops[BodyLength - 1];
+    for (uint8_t i = 0; i < BodyLength; i++)
+      OW->write8(Body[i]);
+
+    Count -= ThisNopLength;
+  }
+
+  return true;
+}
+
+/* *** */
+
+namespace {
+
+/// X86 backend variant that additionally records an OS ABI value.
+class ELFX86AsmBackend : public X86AsmBackend {
+public:
+  uint8_t OSABI;
+
+  ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+      : X86AsmBackend(T, CPU), OSABI(OSABI) {}
+};
+
+/// ELF backend whose object writer is non-ELF64 with machine ELF::EM_386.
+class ELFX86_32AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+      : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    const bool IsELF64 = false;
+    return createX86ELFObjectWriter(OS, IsELF64, OSABI, ELF::EM_386);
+  }
+};
+
+/// ELF backend whose object writer is non-ELF64 with machine ELF::EM_X86_64.
+class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+      : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    const bool IsELF64 = false;
+    return createX86ELFObjectWriter(OS, IsELF64, OSABI, ELF::EM_X86_64);
+  }
+};
+
+/// ELF backend whose object writer is non-ELF64 with machine ELF::EM_IAMCU.
+class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+      : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    const bool IsELF64 = false;
+    return createX86ELFObjectWriter(OS, IsELF64, OSABI, ELF::EM_IAMCU);
+  }
+};
+
+/// ELF backend whose object writer is ELF64 with machine ELF::EM_X86_64.
+class ELFX86_64AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+      : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    const bool IsELF64 = true;
+    return createX86ELFObjectWriter(OS, IsELF64, OSABI, ELF::EM_X86_64);
+  }
+};
+
+/// X86 backend that emits WinCOFF objects and recognises the fixup names
+/// "dir32", "secrel32" and "secidx".
+class WindowsX86AsmBackend : public X86AsmBackend {
+  bool Is64Bit;
+
+public:
+  WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
+      : X86AsmBackend(T, CPU), Is64Bit(is64Bit) {}
+
+  /// Map \p Name to a fixup kind, deferring to the base class for names that
+  /// are not handled here.
+  Optional<MCFixupKind> getFixupKind(StringRef Name) const override {
+    const Optional<MCFixupKind> Fallback = MCAsmBackend::getFixupKind(Name);
+    return StringSwitch<Optional<MCFixupKind>>(Name)
+        .Case("dir32", FK_Data_4)
+        .Case("secrel32", FK_SecRel_4)
+        .Case("secidx", FK_SecRel_2)
+        .Default(Fallback);
+  }
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createX86WinCOFFObjectWriter(OS, Is64Bit);
+  }
+};
+
+namespace CU {
+
+ /// Compact unwind encoding values: unwind modes and register-field masks.
+ enum CompactUnwindEncodings {
+ /// [RE]BP based frame where [RE]BP is pushed on the stack immediately after
+ /// the return address, then [RE]SP is moved to [RE]BP.
+ UNWIND_MODE_BP_FRAME = 0x01000000,
+
+ /// A frameless function with a small constant stack size.
+ UNWIND_MODE_STACK_IMMD = 0x02000000,
+
+ /// A frameless function with a large constant stack size.
+ UNWIND_MODE_STACK_IND = 0x03000000,
+
+ /// No compact unwind encoding is available.
+ UNWIND_MODE_DWARF = 0x04000000,
+
+ /// Mask for encoding the frame registers.
+ UNWIND_BP_FRAME_REGISTERS = 0x00007FFF,
+
+ /// Mask for encoding the frameless registers.
+ UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+ };
+
+} // end CU namespace
+
+class DarwinX86AsmBackend : public X86AsmBackend {
+ const MCRegisterInfo &MRI;
+
+ /// \brief Number of registers that can be saved in a compact unwind encoding.
+ enum { CU_NUM_SAVED_REGS = 6 };
+
+ mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+ bool Is64Bit;
+
+ unsigned OffsetSize; ///< Offset of a "push" instruction.
+ unsigned MoveInstrSize; ///< Size of a "move" instruction.
+ unsigned StackDivide; ///< Amount to adjust stack size by.
+protected:
+  /// \brief Size of a "push" instruction for the given register.
+  ///
+  /// Returns 2 for R12, R13, R14 and R15, and 1 for every other register.
+  unsigned PushInstrSize(unsigned Reg) const {
+    const bool IsTwoBytePush = Reg == X86::R12 || Reg == X86::R13 ||
+                               Reg == X86::R14 || Reg == X86::R15;
+    return IsTwoBytePush ? 2 : 1;
+  }
+
+ /// \brief Implementation of algorithm to generate the compact unwind encoding
+ /// for the CFI instructions.
+ uint32_t
+ generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
+ if (Instrs.empty()) return 0;
+
+ // Reset the saved registers.
+ unsigned SavedRegIdx = 0;
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+
+ bool HasFP = false;
+
+ // Encode that we are using EBP/RBP as the frame pointer.
+ uint32_t CompactUnwindEncoding = 0;
+
+ unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
+ unsigned InstrOffset = 0;
+ unsigned StackAdjust = 0;
+ unsigned StackSize = 0;
+ unsigned PrevStackSize = 0;
+ unsigned NumDefCFAOffsets = 0;
+
+ for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+ const MCCFIInstruction &Inst = Instrs[i];
+
+ switch (Inst.getOperation()) {
+ default:
+ // Any other CFI directives indicate a frame that we aren't prepared
+ // to represent via compact unwind, so just bail out.
+ return 0;
+ case MCCFIInstruction::OpDefCfaRegister: {
+ // Defines a frame pointer. E.g.
+ //
+ // movq %rsp, %rbp
+ // L0:
+ // .cfi_def_cfa_register %rbp
+ //
+ HasFP = true;
+
+ // If the frame pointer is other than esp/rsp, we do not have a way to
+ // generate a compact unwinding representation, so bail out.
+ if (MRI.getLLVMRegNum(Inst.getRegister(), true) !=
+ (Is64Bit ? X86::RBP : X86::EBP))
+ return 0;
+
+ // Reset the counts.
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ StackAdjust = 0;
+ SavedRegIdx = 0;
+ InstrOffset += MoveInstrSize;
+ break;
+ }
+ case MCCFIInstruction::OpDefCfaOffset: {
+ // Defines a new offset for the CFA. E.g.
+ //
+ // With frame:
+ //
+ // pushq %rbp
+ // L0:
+ // .cfi_def_cfa_offset 16
+ //
+ // Without frame:
+ //
+ // subq $72, %rsp
+ // L0:
+ // .cfi_def_cfa_offset 80
+ //
+ PrevStackSize = StackSize;
+ StackSize = std::abs(Inst.getOffset()) / StackDivide;
+ ++NumDefCFAOffsets;
+ break;
+ }
+ case MCCFIInstruction::OpOffset: {
+ // Defines a "push" of a callee-saved register. E.g.
+ //
+ // pushq %r15
+ // pushq %r14
+ // pushq %rbx
+ // L0:
+ // subq $120, %rsp
+ // L1:
+ // .cfi_offset %rbx, -40
+ // .cfi_offset %r14, -32
+ // .cfi_offset %r15, -24
+ //
+ if (SavedRegIdx == CU_NUM_SAVED_REGS)
+ // If there are too many saved registers, we cannot use a compact
+ // unwind encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ SavedRegs[SavedRegIdx++] = Reg;
+ StackAdjust += OffsetSize;
+ InstrOffset += PushInstrSize(Reg);
+ break;
+ }
+ }
+ }
+
+ StackAdjust /= StackDivide;
+
+ if (HasFP) {
+ if ((StackAdjust & 0xFF) != StackAdjust)
+ // Offset was too big for a compact unwind encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Get the encoding of the saved registers when we have a frame pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame();
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+ CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+ CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+ } else {
+ // If the amount of the stack allocation is the size of a register, then
+ // we "push" the RAX/EAX register onto the stack instead of adjusting the
+ // stack pointer with a SUB instruction. We don't support the push of the
+ // RAX/EAX register with compact unwind. So we check for that situation
+ // here.
+ if ((NumDefCFAOffsets == SavedRegIdx + 1 &&
+ StackSize - PrevStackSize == 1) ||
+ (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2))
+ return CU::UNWIND_MODE_DWARF;
+
+ SubtractInstrIdx += InstrOffset;
+ ++StackAdjust;
+
+ if ((StackSize & 0xFF) == StackSize) {
+ // Frameless stack with a small stack size.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+ // Encode the stack size.
+ CompactUnwindEncoding |= (StackSize & 0xFF) << 16;
+ } else {
+ if ((StackAdjust & 0x7) != StackAdjust)
+ // The extra stack adjustments are too big for us to handle.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Frameless stack with an offset too large for us to encode compactly.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+ // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+ // instruction.
+ CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+ // Encode any extra stack adjustments (done via push
+ // instructions).
+ CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+ }
+
+ // Encode the number of registers saved. (Reverse the list first.)
+ std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]);
+ CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+ // Get the encoding of the saved registers when we don't have a frame
+ // pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx);
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ // Encode the register encoding.
+ CompactUnwindEncoding |=
+ RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+ }
+
+ return CompactUnwindEncoding;
+ }
+
+private:
+ /// \brief Get the compact unwind number (1-6) for a given register, or -1 if
+ /// it has none. Numbers follow the enum lists in compact_unwind_encoding.h.
+ int getCompactUnwindRegNum(unsigned Reg) const {
+ static const MCPhysReg CU32BitRegs[7] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const MCPhysReg CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+ for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+ if (*CURegs == Reg)
+ return Idx;
+
+ return -1;
+ }
+
+ /// \brief Return the registers encoded for a compact encoding with a frame
+ /// pointer, or ~0U if a saved register has no compact unwind number.
+ uint32_t encodeCompactUnwindRegistersWithFrame() const {
+ // Encode the registers in the order they were saved --- 3-bits per
+ // register. The list of saved registers is assumed to be in reverse
+ // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+ uint32_t RegEnc = 0;
+ for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+ unsigned Reg = SavedRegs[i];
+ if (Reg == 0) break;
+
+ int CURegNum = getCompactUnwindRegNum(Reg);
+ if (CURegNum == -1) return ~0U;
+
+ // Encode the 3-bit register number in order, skipping over 3-bits for
+ // each register.
+ RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+ }
+
+ assert((RegEnc & 0x3FFFF) == RegEnc &&
+ "Invalid compact register encoding!");
+ return RegEnc;
+ }
+
+ /// \brief Create the permutation encoding used with frameless stacks. It is
+ /// passed the number of saved registers and reads them from SavedRegs (which
+ /// it overwrites). Returns ~0U if a register has no compact unwind number.
+ uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+ // The saved registers are numbered from 1 to 6. In order to encode the
+ // order in which they were saved, we re-number them according to their
+ // place in the register order. The re-numbering is relative to the last
+ // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+ // that order:
+ //
+ // Orig Re-Num
+ // ---- ------
+ // 6 6
+ // 2 2
+ // 4 3
+ // 5 3
+ //
+ for (unsigned i = 0; i < RegCount; ++i) {
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+ if (CUReg == -1) return ~0U;
+ SavedRegs[i] = CUReg;
+ }
+
+ // Reverse the list.
+ std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+ uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+ for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+ unsigned Countless = 0;
+ for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+ if (SavedRegs[j] < SavedRegs[i])
+ ++Countless;
+
+ RenumRegs[i] = SavedRegs[i] - Countless - 1;
+ }
+
+ // Take the renumbered values and encode them into a 10-bit number.
+ uint32_t permutationEncoding = 0;
+ switch (RegCount) {
+ case 6:
+ permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+ + 6 * RenumRegs[2] + 2 * RenumRegs[3]
+ + RenumRegs[4];
+ break;
+ case 5:
+ permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+ + 6 * RenumRegs[3] + 2 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 4:
+ permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
+ + 3 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 3:
+ permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 2:
+ permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 1:
+ permutationEncoding |= RenumRegs[5];
+ break;
+ }
+
+ assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+ "Invalid compact register encoding!");
+ return permutationEncoding;
+ }
+
+public:
+ DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
+ bool Is64Bit)
+ : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ OffsetSize = Is64Bit ? 8 : 4;
+ MoveInstrSize = Is64Bit ? 3 : 2;
+ StackDivide = Is64Bit ? 8 : 4;
+ }
+};
+
+class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+public:
+ DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef CPU)
+ : DarwinX86AsmBackend(T, MRI, CPU, false) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
+ MachO::CPU_TYPE_I386,
+ MachO::CPU_SUBTYPE_I386_ALL);
+ }
+
+ /// \brief Generate the compact unwind encoding for the CFI instructions.
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override {
+ return generateCompactUnwindEncodingImpl(Instrs);
+ }
+};
+
+class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+ const MachO::CPUSubTypeX86 Subtype;
+public:
+ DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef CPU, MachO::CPUSubTypeX86 st)
+ : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
+ MachO::CPU_TYPE_X86_64, Subtype);
+ }
+
+ /// \brief Generate the compact unwind encoding for the CFI instructions.
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override {
+ return generateCompactUnwindEncodingImpl(Instrs);
+ }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TheTriple,
+ StringRef CPU,
+ const MCTargetOptions &Options) {
+ if (TheTriple.isOSBinFormatMachO())
+ return new DarwinX86_32AsmBackend(T, MRI, CPU);
+
+ if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
+ return new WindowsX86AsmBackend(T, false, CPU);
+
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+ if (TheTriple.isOSIAMCU())
+ return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU);
+
+ return new ELFX86_32AsmBackend(T, OSABI, CPU);
+}
+
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TheTriple,
+ StringRef CPU,
+ const MCTargetOptions &Options) {
+ if (TheTriple.isOSBinFormatMachO()) {
+ MachO::CPUSubTypeX86 CS =
+ StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
+ .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
+ .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
+ return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
+ }
+
+ if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
+ return new WindowsX86AsmBackend(T, true, CPU);
+
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+ if (TheTriple.getEnvironment() == Triple::GNUX32)
+ return new ELFX86_X32AsmBackend(T, OSABI, CPU);
+ return new ELFX86_64AsmBackend(T, OSABI, CPU);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
new file mode 100644
index 000000000000..aab552547fac
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -0,0 +1,784 @@
+//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the X86 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+namespace X86 {
+ // Enums for memory operand decoding. Each memory operand is represented with
+ // a 5 operand sequence in the form:
+ // [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+ // These enums help decode this.
+ enum {
+ AddrBaseReg = 0,
+ AddrScaleAmt = 1,
+ AddrIndexReg = 2,
+ AddrDisp = 3,
+
+ /// AddrSegmentReg - The operand # of the segment in the memory operand.
+ AddrSegmentReg = 4,
+
+ /// AddrNumOperands - Total number of operands in a memory reference.
+ AddrNumOperands = 5
+ };
+
+ /// AVX512 static rounding constants. These need to match the values in
+ /// avx512fintrin.h.
+ enum STATIC_ROUNDING {
+ TO_NEAREST_INT = 0,
+ TO_NEG_INF = 1,
+ TO_POS_INF = 2,
+ TO_ZERO = 3,
+ CUR_DIRECTION = 4
+ };
+} // end namespace X86;
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // X86 Specific MachineOperand flags.
+
+ MO_NO_FLAG,
+
+ /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+ /// relocation of:
+ /// SYMBOL_LABEL + [. - PICBASELABEL]
+ MO_GOT_ABSOLUTE_ADDRESS,
+
+ /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+ /// immediate should get the value of the symbol minus the PIC base label:
+ /// SYMBOL_LABEL - PICBASELABEL
+ MO_PIC_BASE_OFFSET,
+
+ /// MO_GOT - On a symbol operand this indicates that the immediate is the
+ /// offset to the GOT entry for the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOT
+ MO_GOT,
+
+ /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+ /// the offset to the location of the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTOFF
+ MO_GOTOFF,
+
+ /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+ /// offset to the GOT entry for the symbol name from the current code
+ /// location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTPCREL
+ MO_GOTPCREL,
+
+ /// MO_PLT - On a symbol operand this indicates that the immediate is
+ /// offset to the PLT entry of symbol name from the current code location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @PLT
+ MO_PLT,
+
+ /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index structure that contains
+ /// the module number and variable offset for the symbol. Used in the
+ /// general dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSGD
+ MO_TLSGD,
+
+ /// MO_TLSLD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// __tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the x86-64 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLD
+ MO_TLSLD,
+
+ /// MO_TLSLDM - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// ___tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the IA32 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLDM
+ MO_TLSLDM,
+
+ /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the thread-pointer offset for the
+ /// symbol. Used in the x86-64 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTTPOFF
+ MO_GOTTPOFF,
+
+ /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the absolute address of the GOT entry with the negative thread-pointer
+ /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access
+ /// model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @INDNTPOFF
+ MO_INDNTPOFF,
+
+ /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+ /// the thread-pointer offset for the symbol. Used in the x86-64 local
+ /// exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TPOFF
+ MO_TPOFF,
+
+ /// MO_DTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS offset of the symbol. Used
+ /// in the local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @DTPOFF
+ MO_DTPOFF,
+
+ /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+ /// the negative thread-pointer offset for the symbol. Used in the IA32
+ /// local exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @NTPOFF
+ MO_NTPOFF,
+
+ /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the negative thread-pointer offset for
+ /// the symbol. Used in the PIC IA32 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTNTPOFF
+ MO_GOTNTPOFF,
+
+ /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "__imp_FOO" symbol. This is used for
+ /// dllimport linkage on windows.
+ MO_DLLIMPORT,
+
+ /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+ /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY,
+
+ /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+ /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+ /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY_PIC_BASE,
+
+ /// MO_TLVP - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// This is the TLS offset for the Darwin TLS mechanism.
+ MO_TLVP,
+
+ /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+ /// is some TLS offset from the picbase.
+ ///
+ /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+ MO_TLVP_PIC_BASE,
+
+ /// MO_SECREL - On a symbol operand this indicates that the immediate is
+ /// the offset from beginning of section.
+ ///
+ /// This is the TLS offset for the COFF/Windows TLS mechanism.
+ MO_SECREL
+ };
+
+ enum : uint64_t {
+ //===------------------------------------------------------------------===//
+ // Instruction encodings. These are the standard/most common forms for X86
+ // instructions.
+ //
+
+ // Pseudo - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// RawFrm - This form is for instructions that don't have any operands, so
+ /// they are just a fixed opcode value, like 'leave'.
+ RawFrm = 1,
+
+ /// AddRegFrm - This form is used for instructions like 'push r32' that have
+ /// their one register operand added to their opcode.
+ AddRegFrm = 2,
+
+ /// RawFrmMemOffs - This form is for instructions that store an absolute
+ /// memory offset as an immediate with a possible segment override.
+ RawFrmMemOffs = 3,
+
+ /// RawFrmSrc - This form is for instructions that use the source index
+ /// register SI/ESI/RSI with a possible segment override.
+ RawFrmSrc = 4,
+
+ /// RawFrmDst - This form is for instructions that use the destination index
+ /// register DI/EDI/RDI.
+ RawFrmDst = 5,
+
+ /// RawFrmDstSrc - This form is for instructions that use the source index
+ /// register SI/ESI/RSI with a possible segment override, and also the
+ /// destination index register DI/EDI/RDI.
+ RawFrmDstSrc = 6,
+
+ /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+ /// immediates, the first of which is a 16-bit immediate (specified by
+ /// the imm encoding) and the second is a 8-bit fixed value.
+ RawFrmImm8 = 7,
+
+ /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+ /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+ /// the imm encoding) and the second is a 16-bit fixed value. In the AMD
+ /// manual, this operand is described as pntr16:32 and pntr16:16
+ RawFrmImm16 = 8,
+
+ /// MRM[0-7][rm] - These forms are used to represent instructions that use
+ /// a Mod/RM byte, and use the middle field to hold extended opcode
+ /// information. In the intel manual these are represented as /0, /1, ...
+ ///
+
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 32,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 33,
+
+ /// MRMSrcMem4VOp3 - This form is used for instructions that encode
+ /// operand 3 with VEX.VVVV and load from memory.
+ ///
+ MRMSrcMem4VOp3 = 34,
+
+ /// MRMSrcMemOp4 - This form is used for instructions that use the Mod/RM
+ /// byte to specify the fourth source, which in this case is memory.
+ ///
+ MRMSrcMemOp4 = 35,
+
+ /// MRMXm - This form is used for instructions that use the Mod/RM byte
+ /// to specify a memory source, but doesn't use the middle field.
+ ///
+ MRMXm = 39, // Instruction that uses Mod/RM but not the middle field.
+
+ // Next, instructions that operate on a memory r/m operand...
+ MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3
+ MRM4m = 44, MRM5m = 45, MRM6m = 46, MRM7m = 47, // Format /4 /5 /6 /7
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 48,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 49,
+
+ /// MRMSrcReg4VOp3 - This form is used for instructions that encode
+ /// operand 3 with VEX.VVVV and do not load from memory.
+ ///
+ MRMSrcReg4VOp3 = 50,
+
+ /// MRMSrcRegOp4 - This form is used for instructions that use the Mod/RM
+ /// byte to specify the fourth source, which in this case is a register.
+ ///
+ MRMSrcRegOp4 = 51,
+
+ /// MRMXr - This form is used for instructions that use the Mod/RM byte
+ /// to specify a register source, but doesn't use the middle field.
+ ///
+ MRMXr = 55, // Instruction that uses Mod/RM but not the middle field.
+
+ // Instructions that operate on a register r/m operand...
+ MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3
+ MRM4r = 60, MRM5r = 61, MRM6r = 62, MRM7r = 63, // Format /4 /5 /6 /7
+
+ /// MRM_XX - A mod/rm byte of exactly 0xXX.
+ MRM_C0 = 64, MRM_C1 = 65, MRM_C2 = 66, MRM_C3 = 67,
+ MRM_C4 = 68, MRM_C5 = 69, MRM_C6 = 70, MRM_C7 = 71,
+ MRM_C8 = 72, MRM_C9 = 73, MRM_CA = 74, MRM_CB = 75,
+ MRM_CC = 76, MRM_CD = 77, MRM_CE = 78, MRM_CF = 79,
+ MRM_D0 = 80, MRM_D1 = 81, MRM_D2 = 82, MRM_D3 = 83,
+ MRM_D4 = 84, MRM_D5 = 85, MRM_D6 = 86, MRM_D7 = 87,
+ MRM_D8 = 88, MRM_D9 = 89, MRM_DA = 90, MRM_DB = 91,
+ MRM_DC = 92, MRM_DD = 93, MRM_DE = 94, MRM_DF = 95,
+ MRM_E0 = 96, MRM_E1 = 97, MRM_E2 = 98, MRM_E3 = 99,
+ MRM_E4 = 100, MRM_E5 = 101, MRM_E6 = 102, MRM_E7 = 103,
+ MRM_E8 = 104, MRM_E9 = 105, MRM_EA = 106, MRM_EB = 107,
+ MRM_EC = 108, MRM_ED = 109, MRM_EE = 110, MRM_EF = 111,
+ MRM_F0 = 112, MRM_F1 = 113, MRM_F2 = 114, MRM_F3 = 115,
+ MRM_F4 = 116, MRM_F5 = 117, MRM_F6 = 118, MRM_F7 = 119,
+ MRM_F8 = 120, MRM_F9 = 121, MRM_FA = 122, MRM_FB = 123,
+ MRM_FC = 124, MRM_FD = 125, MRM_FE = 126, MRM_FF = 127,
+
+ FormMask = 127,
+
+ //===------------------------------------------------------------------===//
+ // Actual flags...
+
+ // OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix.
+ // OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in
+ // 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66
+ // prefix in 16-bit mode.
+ OpSizeShift = 7,
+ OpSizeMask = 0x3 << OpSizeShift,
+
+ OpSizeFixed = 0 << OpSizeShift,
+ OpSize16 = 1 << OpSizeShift,
+ OpSize32 = 2 << OpSizeShift,
+
+ // AdSize - AdSizeX implies this instruction determines its need of 0x67
+ // prefix from a normal ModRM memory operand. The other types indicate that
+ // an operand is encoded with a specific width and a prefix is needed if
+ // it differs from the current mode.
+ AdSizeShift = OpSizeShift + 2,
+ AdSizeMask = 0x3 << AdSizeShift,
+
+ AdSizeX = 0 << AdSizeShift,
+ AdSize16 = 1 << AdSizeShift,
+ AdSize32 = 2 << AdSizeShift,
+ AdSize64 = 3 << AdSizeShift,
+
+ //===------------------------------------------------------------------===//
+ // OpPrefix - There are several prefix bytes that are used as opcode
+ // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is
+ // no prefix.
+ //
+ OpPrefixShift = AdSizeShift + 2,
+ OpPrefixMask = 0x7 << OpPrefixShift,
+
+ // PS, PD - Prefix code for packed single and double precision vector
+ // floating point operations performed in the SSE registers.
+ PS = 1 << OpPrefixShift, PD = 2 << OpPrefixShift,
+
+ // XS, XD - These prefix codes are for single and double precision scalar
+ // floating point operations performed in the SSE registers.
+ XS = 3 << OpPrefixShift, XD = 4 << OpPrefixShift,
+
+ //===------------------------------------------------------------------===//
+ // OpMap - This field determines which opcode map this instruction
+ // belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc.
+ //
+ OpMapShift = OpPrefixShift + 3,
+ OpMapMask = 0x7 << OpMapShift,
+
+ // OB - OneByte - Set if this instruction has a one byte opcode.
+ OB = 0 << OpMapShift,
+
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << OpMapShift,
+
+ // T8, TA - Prefix after the 0x0F prefix.
+ T8 = 2 << OpMapShift, TA = 3 << OpMapShift,
+
+ // XOP8 - Prefix to include use of imm byte.
+ XOP8 = 4 << OpMapShift,
+
+ // XOP9 - Prefix to exclude use of imm byte.
+ XOP9 = 5 << OpMapShift,
+
+ // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
+ XOPA = 6 << OpMapShift,
+
+ //===------------------------------------------------------------------===//
+ // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+ // They are used to specify GPRs and SSE registers, 64-bit operand size,
+ // etc. We only care about the REX.W and REX.R bits, and only the former is
+ // statically determined.
+ //
+ REXShift = OpMapShift + 3,
+ REX_W = 1 << REXShift,
+
+ //===------------------------------------------------------------------===//
+ // This four-bit field describes the size of an immediate operand. Zero is
+ // unused so that we can tell if we forgot to set a value.
+ ImmShift = REXShift + 1,
+ ImmMask = 15 << ImmShift,
+ Imm8 = 1 << ImmShift,
+ Imm8PCRel = 2 << ImmShift,
+ Imm8Reg = 3 << ImmShift,
+ Imm16 = 4 << ImmShift,
+ Imm16PCRel = 5 << ImmShift,
+ Imm32 = 6 << ImmShift,
+ Imm32PCRel = 7 << ImmShift,
+ Imm32S = 8 << ImmShift,
+ Imm64 = 9 << ImmShift,
+
+ //===------------------------------------------------------------------===//
+ // FP Instruction Classification... Zero is non-fp instruction.
+
+ // FPTypeMask - Mask for all of the FP types...
+ FPTypeShift = ImmShift + 4,
+ FPTypeMask = 7 << FPTypeShift,
+
+ // NotFP - The default, set for instructions that do not use FP registers.
+ NotFP = 0 << FPTypeShift,
+
+ // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+ ZeroArgFP = 1 << FPTypeShift,
+
+ // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+ OneArgFP = 2 << FPTypeShift,
+
+ // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+ // result back to ST(0). For example, fcos, fsqrt, etc.
+ //
+ OneArgFPRW = 3 << FPTypeShift,
+
+ // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+ // explicit argument, storing the result to either ST(0) or the implicit
+ // argument. For example: fadd, fsub, fmul, etc...
+ TwoArgFP = 4 << FPTypeShift,
+
+ // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+ // explicit argument, but have no destination. Example: fucom, fucomi, ...
+ CompareFP = 5 << FPTypeShift,
+
+ // CondMovFP - "2 operand" floating point conditional move instructions.
+ CondMovFP = 6 << FPTypeShift,
+
+ // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
+ SpecialFP = 7 << FPTypeShift,
+
+ // Lock prefix
+ LOCKShift = FPTypeShift + 3,
+ LOCK = 1 << LOCKShift,
+
+ // REP prefix
+ REPShift = LOCKShift + 1,
+ REP = 1 << REPShift,
+
+ // Execution domain for SSE instructions.
+ // 0 means normal, non-SSE instruction.
+ SSEDomainShift = REPShift + 1,
+
+ // Encoding
+ EncodingShift = SSEDomainShift + 2,
+ EncodingMask = 0x3 << EncodingShift,
+
+ // VEX - encoding using 0xC4/0xC5
+ VEX = 1 << EncodingShift,
+
+ /// XOP - Opcode prefix used by XOP instructions.
+ XOP = 2 << EncodingShift,
+
+ // EVEX - Specifies that this instruction uses the EVEX form, which provides
+ // syntax support up to 32 512-bit register operands and up to 7 16-bit
+ // mask operands as well as source operand data swizzling/memory operand
+ // conversion, eviction hint, and rounding mode.
+ EVEX = 3 << EncodingShift,
+
+ // Opcode
+ OpcodeShift = EncodingShift + 2,
+
+ /// VEX_W - Has a opcode specific functionality, but is used in the same
+ /// way as REX_W is for regular SSE instructions.
+ VEX_WShift = OpcodeShift + 8,
+ VEX_W = 1ULL << VEX_WShift,
+
+ /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
+ /// address instructions in SSE are represented as 3 address ones in AVX
+ /// and the additional register is encoded in VEX_VVVV prefix.
+ VEX_4VShift = VEX_WShift + 1,
+ VEX_4V = 1ULL << VEX_4VShift,
+
+ /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+ /// instruction uses 256-bit wide registers. This is usually auto detected
+ /// if a VR256 register is used, but some AVX instructions also have this
+ /// field marked when using a f256 memory references.
+ VEX_LShift = VEX_4VShift + 1,
+ VEX_L = 1ULL << VEX_LShift,
+
+ // EVEX_K - Set if this instruction requires masking
+ EVEX_KShift = VEX_LShift + 1,
+ EVEX_K = 1ULL << EVEX_KShift,
+
+ // EVEX_Z - Set if this instruction has EVEX.Z field set.
+ EVEX_ZShift = EVEX_KShift + 1,
+ EVEX_Z = 1ULL << EVEX_ZShift,
+
+ // EVEX_L2 - Set if this instruction has EVEX.L' field set.
+ EVEX_L2Shift = EVEX_ZShift + 1,
+ EVEX_L2 = 1ULL << EVEX_L2Shift,
+
+ // EVEX_B - Set if this instruction has EVEX.B field set.
+ EVEX_BShift = EVEX_L2Shift + 1,
+ EVEX_B = 1ULL << EVEX_BShift,
+
+ // The scaling factor for the AVX512's 8-bit compressed displacement.
+ CD8_Scale_Shift = EVEX_BShift + 1,
+ CD8_Scale_Mask = 127ULL << CD8_Scale_Shift,
+
+ /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
+ /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
+ /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+ /// storing a classifier in the imm8 field. To simplify our implementation,
+ /// we handle this by storing the classifier in the opcode field and using
+ /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+ Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7,
+ Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift,
+
+ /// Explicitly specified rounding control
+ EVEX_RCShift = Has3DNow0F0FOpcodeShift + 1,
+ EVEX_RC = 1ULL << EVEX_RCShift
+ };
+
+ // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+ // specified machine instruction.
+ //
+ inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+ return TSFlags >> X86II::OpcodeShift;
+ }
+
+ inline bool hasImm(uint64_t TSFlags) {
+ return (TSFlags & X86II::ImmMask) != 0;
+ }
+
+ /// getSizeOfImm - Return the size, in bytes, of the immediate described by
+ /// the X86II::ImmMask field of TSFlags. Any value of that field not listed
+ /// below is treated as unreachable.
+ inline unsigned getSizeOfImm(uint64_t TSFlags) {
+   const uint64_t ImmKind = TSFlags & X86II::ImmMask;
+   switch (ImmKind) {
+   case X86II::Imm64:
+     return 8;
+   case X86II::Imm32PCRel:
+   case X86II::Imm32S:
+   case X86II::Imm32:
+     return 4;
+   case X86II::Imm16PCRel:
+   case X86II::Imm16:
+     return 2;
+   case X86II::Imm8Reg:
+   case X86II::Imm8PCRel:
+   case X86II::Imm8:
+     return 1;
+   default:
+     llvm_unreachable("Unknown immediate size");
+   }
+ }
+
+ /// isImmPCRel - Return true if the immediate of the specified instruction's
+ /// TSFlags indicates that it is pc relative.
+ ///
+ /// The X86II::ImmMask field of \p TSFlags selects the immediate kind; the
+ /// three *PCRel kinds yield true, every other known kind yields false, and
+ /// an unknown kind is unreachable. The result is a pure predicate, so it is
+ /// returned as bool (it still converts implicitly for callers that stored it
+ /// in an unsigned).
+ inline bool isImmPCRel(uint64_t TSFlags) {
+   switch (TSFlags & X86II::ImmMask) {
+   default: llvm_unreachable("Unknown immediate size");
+   case X86II::Imm8PCRel:
+   case X86II::Imm16PCRel:
+   case X86II::Imm32PCRel:
+     return true;
+   case X86II::Imm8:
+   case X86II::Imm8Reg:
+   case X86II::Imm16:
+   case X86II::Imm32:
+   case X86II::Imm32S:
+   case X86II::Imm64:
+     return false;
+   }
+ }
+
+ /// isImmSigned - Return true if the immediate of the specified instruction's
+ /// TSFlags indicates that it is signed.
+ ///
+ /// Only X86II::Imm32S yields true; every other known immediate kind yields
+ /// false, and an unknown kind is unreachable. The result is a pure
+ /// predicate, so it is returned as bool (it still converts implicitly for
+ /// callers that stored it in an unsigned).
+ inline bool isImmSigned(uint64_t TSFlags) {
+   switch (TSFlags & X86II::ImmMask) {
+   default: llvm_unreachable("Unknown immediate signedness");
+   case X86II::Imm32S:
+     return true;
+   case X86II::Imm8:
+   case X86II::Imm8PCRel:
+   case X86II::Imm8Reg:
+   case X86II::Imm16:
+   case X86II::Imm16PCRel:
+   case X86II::Imm32:
+   case X86II::Imm32PCRel:
+   case X86II::Imm64:
+     return false;
+   }
+ }
+
+ /// getOperandBias - compute any additional adjustment needed to
+ /// the offset to the start of the memory operand
+ /// in this instruction.
+ /// If this is a two-address instruction, skip one of the register operands.
+ ///
+ /// The checks below run in order and the first one that matches decides the
+ /// result; each inspects the TIED_TO constraint value that \p Desc reports
+ /// for a particular operand index. The returned bias is 0, 1 or 2.
+ /// FIXME: This should be handled during MCInst lowering.
+ inline unsigned getOperandBias(const MCInstrDesc& Desc)
+ {
+ unsigned NumOps = Desc.getNumOperands();
+ // Operand 1 reports TIED_TO == 0: skip a single operand.
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ return 1;
+ // Operand 2 reports TIED_TO == 0 and operand 3 reports TIED_TO == 1.
+ if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ // Special case for AVX-512 GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ return 2;
+ // Same as above, except the second tie is looked up on the last operand.
+ if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ return 2;
+ // The second-to-last operand reports TIED_TO == 0.
+ if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+ // SCATTER
+ return 1;
+ // No tie pattern matched: no adjustment.
+ return 0;
+ }
+
+ /// getMemoryOperandNo - The function returns the MCInst operand # for the
+ /// first field of the memory operand. If the instruction doesn't have a
+ /// memory operand, this returns -1.
+ ///
+ /// Note that this ignores tied operands. If there is a tied register which
+ /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+ /// counted as one operand.
+ ///
+ /// The result depends only on the form field (X86II::FormMask) of \p TSFlags
+ /// and, for some memory forms, on the VEX_4V and EVEX_K flag bits, each of
+ /// which adds one to the index when set. An unknown form is unreachable.
+ ///
+ inline int getMemoryOperandNo(uint64_t TSFlags) {
+ // Each flag is converted to bool so it contributes 0 or 1 to the sums below.
+ bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+ bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+
+ switch (TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
+ // These forms report no memory operand.
+ case X86II::Pseudo:
+ case X86II::RawFrm:
+ case X86II::AddRegFrm:
+ case X86II::RawFrmImm8:
+ case X86II::RawFrmImm16:
+ case X86II::RawFrmMemOffs:
+ case X86II::RawFrmSrc:
+ case X86II::RawFrmDst:
+ case X86II::RawFrmDstSrc:
+ return -1;
+ case X86II::MRMDestMem:
+ return 0;
+ case X86II::MRMSrcMem:
+ // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
+ // mask register.
+ return 1 + HasVEX_4V + HasEVEX_K;
+ case X86II::MRMSrcMem4VOp3:
+ // Skip registers encoded in reg.
+ return 1 + HasEVEX_K;
+ case X86II::MRMSrcMemOp4:
+ // Skip registers encoded in reg, VEX_VVVV, and I8IMM.
+ return 3;
+ // Register-only ModRM forms: no memory operand.
+ case X86II::MRMDestReg:
+ case X86II::MRMSrcReg:
+ case X86II::MRMSrcReg4VOp3:
+ case X86II::MRMSrcRegOp4:
+ case X86II::MRMXr:
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ return -1;
+ case X86II::MRMXm:
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ // Start from 0, skip registers encoded in VEX_VVVV or a mask register.
+ return 0 + HasVEX_4V + HasEVEX_K;
+ // MRM_C0 through MRM_FF all report no memory operand.
+ case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+ case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
+ case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
+ case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+ case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
+ case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
+ case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
+ case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
+ case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+ case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+ case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
+ case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
+ case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
+ case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
+ case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+ case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
+ case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
+ case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
+ case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
+ case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
+ case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+ case X86II::MRM_FF:
+ return -1;
+ }
+ }
+
+ /// isX86_64ExtendedReg - Return true if RegNo names an x86-64 extended
+ /// register (r8 or higher), e.g. r8, xmm8, xmm13, etc. This covers the
+ /// R8-R15 general-purpose registers in all of their widths, CR8-CR15,
+ /// DR8-DR15, and the XMM/YMM/ZMM registers numbered 8 through 31.
+ inline bool isX86_64ExtendedReg(unsigned RegNo) {
+   // Registers that have to be listed one by one.
+   switch (RegNo) {
+   case X86::R8:   case X86::R9:   case X86::R10:  case X86::R11:
+   case X86::R12:  case X86::R13:  case X86::R14:  case X86::R15:
+   case X86::R8D:  case X86::R9D:  case X86::R10D: case X86::R11D:
+   case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D:
+   case X86::R8W:  case X86::R9W:  case X86::R10W: case X86::R11W:
+   case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
+   case X86::R8B:  case X86::R9B:  case X86::R10B: case X86::R11B:
+   case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
+   case X86::CR8:  case X86::CR9:  case X86::CR10: case X86::CR11:
+   case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
+   case X86::DR8:  case X86::DR9:  case X86::DR10: case X86::DR11:
+   case X86::DR12: case X86::DR13: case X86::DR14: case X86::DR15:
+     return true;
+   default:
+     break;
+   }
+
+   // Vector registers are tested as numeric ranges of the register enum.
+   const bool InXMMRange = RegNo >= X86::XMM8 && RegNo <= X86::XMM31;
+   const bool InYMMRange = RegNo >= X86::YMM8 && RegNo <= X86::YMM31;
+   const bool InZMMRange = RegNo >= X86::ZMM8 && RegNo <= X86::ZMM31;
+   return InXMMRange || InYMMRange || InZMMRange;
+ }
+
+ /// is32ExtendedReg - Return true if RegNo lies in one of the register ranges
+ /// XMM16-XMM31, YMM16-YMM31 or ZMM16-ZMM31 (e.g. zmm21).
+ ///
+ /// Declared plain `inline`, like the other helpers in this header, rather
+ /// than `static inline`, so that translation units including the header do
+ /// not each get a private internal-linkage copy.
+ inline bool is32ExtendedReg(unsigned RegNo) {
+   return ((RegNo >= X86::XMM16 && RegNo <= X86::XMM31) ||
+           (RegNo >= X86::YMM16 && RegNo <= X86::YMM31) ||
+           (RegNo >= X86::ZMM16 && RegNo <= X86::ZMM31));
+ }
+
+
+ /// isX86_64NonExtLowByteReg - Return true if reg is one of SPL, BPL, SIL or
+ /// DIL.
+ inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+   switch (reg) {
+   case X86::SPL:
+   case X86::BPL:
+   case X86::SIL:
+   case X86::DIL:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// isKMasked - Return true if the EVEX_K bit is set in TSFlags, i.e. the
+ /// instruction is a masked one.
+ inline bool isKMasked(uint64_t TSFlags) {
+   const uint64_t MaskBit = TSFlags & X86II::EVEX_K;
+   return MaskBit != 0;
+ }
+
+ /// isKMergeMasked - Return true for a merge-masked instruction: one that is
+ /// masked (see isKMasked) and does not have the EVEX_Z bit set.
+ inline bool isKMergeMasked(uint64_t TSFlags) {
+   if (!isKMasked(TSFlags))
+     return false;
+   const uint64_t ZeroingBit = TSFlags & X86II::EVEX_Z;
+   return ZeroingBit == 0;
+ }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
new file mode 100644
index 000000000000..da69da51df10
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -0,0 +1,303 @@
+//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+ namespace {
+ /// ELF object target writer for X86. Its only specialised behaviour is
+ /// getRelocType, which picks the ELF relocation type for a fixup.
+ class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+ public:
+ /// \p IsELF64, \p OSABI and \p EMachine are forwarded to the base class;
+ /// \p EMachine also decides whether relocations carry an addend.
+ X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+
+ ~X86ELFObjectWriter() override;
+
+ protected:
+ /// Return the ELF relocation type to emit for \p Fixup applied to
+ /// \p Target.
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+ };
+ }
+
+ // Forward everything to the generic ELF target writer. The last base-class
+ // argument says whether relocations carry an addend.
+ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
+                                        uint16_t EMachine)
+     : MCELFObjectTargetWriter(
+           IsELF64, OSABI, EMachine,
+           // i386 and IAMCU are the only machines using Rel; everything else
+           // uses RelA.
+           /*HasRelocationAddend*/
+           !(EMachine == ELF::EM_386 || EMachine == ELF::EM_IAMCU)) {}
+
+ // Nothing to release; kept as an out-of-line definition.
+ X86ELFObjectWriter::~X86ELFObjectWriter() = default;
+
+ // Width class of a relocated field, as computed by getType64.
+ enum X86_64RelType {
+   RT64_64,  // 64-bit field
+   RT64_32,  // 32-bit field
+   RT64_32S, // 32-bit field that maps to R_X86_64_32S
+   RT64_16,  // 16-bit field
+   RT64_8    // 8-bit field
+ };
+
+ // Classify the fixup kind \p Kind by the width of the field it relocates.
+ //
+ // \p Modifier and \p IsPCRel are in/out parameters: for the two
+ // reloc_global_offset_table* kinds they are overwritten with VK_GOT and
+ // true respectively; for every other kind they are left untouched. A kind
+ // that is not listed is unreachable.
+ static X86_64RelType getType64(unsigned Kind,
+ MCSymbolRefExpr::VariantKind &Modifier,
+ bool &IsPCRel) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case X86::reloc_global_offset_table8:
+ Modifier = MCSymbolRefExpr::VK_GOT;
+ IsPCRel = true;
+ return RT64_64;
+ case FK_Data_8:
+ return RT64_64;
+ case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
+ // Only a plain (no modifier), non-pc-relative signed fixup is reported as
+ // the sign-extended 32-bit class.
+ if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel)
+ return RT64_32S;
+ return RT64_32;
+ case X86::reloc_global_offset_table:
+ Modifier = MCSymbolRefExpr::VK_GOT;
+ IsPCRel = true;
+ return RT64_32;
+ case FK_Data_4:
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_riprel_4byte_movq_load:
+ return RT64_32;
+ case FK_PCRel_2:
+ case FK_Data_2:
+ return RT64_16;
+ case FK_PCRel_1:
+ case FK_Data_1:
+ return RT64_8;
+ }
+ }
+
+static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
+ if (Type != RT64_32)
+ Ctx.reportError(Loc,
+ "32 bit reloc applied to a field with a different size");
+}
+
+ // Select the R_X86_64_* relocation for a fixup.
+ //
+ // The outer switch dispatches on the symbol modifier; where the relocation
+ // also depends on field width, an inner switch dispatches on \p Type, and
+ // \p IsPCRel picks between the pc-relative and absolute flavours. Modifiers
+ // (and width classes within a modifier) that have no relocation here are
+ // unreachable. \p Ctx and \p Loc are used for error reporting via checkIs32,
+ // and \p Ctx is also queried to decide whether relaxable GOTPCREL
+ // relocations may be emitted. \p Kind is only consulted for VK_GOTPCREL.
+ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
+ MCSymbolRefExpr::VariantKind Modifier,
+ X86_64RelType Type, bool IsPCRel,
+ unsigned Kind) {
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ switch (Type) {
+ case RT64_64:
+ return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
+ case RT64_32:
+ return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32;
+ case RT64_32S:
+ return ELF::R_X86_64_32S;
+ case RT64_16:
+ return IsPCRel ? ELF::R_X86_64_PC16 : ELF::R_X86_64_16;
+ case RT64_8:
+ return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8;
+ }
+ case MCSymbolRefExpr::VK_GOT:
+ switch (Type) {
+ case RT64_64:
+ return IsPCRel ? ELF::R_X86_64_GOTPC64 : ELF::R_X86_64_GOT64;
+ case RT64_32:
+ return IsPCRel ? ELF::R_X86_64_GOTPC32 : ELF::R_X86_64_GOT32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ llvm_unreachable("Unimplemented");
+ }
+ case MCSymbolRefExpr::VK_GOTOFF:
+ assert(Type == RT64_64);
+ assert(!IsPCRel);
+ return ELF::R_X86_64_GOTOFF64;
+ case MCSymbolRefExpr::VK_TPOFF:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT64_64:
+ return ELF::R_X86_64_TPOFF64;
+ case RT64_32:
+ return ELF::R_X86_64_TPOFF32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ llvm_unreachable("Unimplemented");
+ }
+ case MCSymbolRefExpr::VK_DTPOFF:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT64_64:
+ return ELF::R_X86_64_DTPOFF64;
+ case RT64_32:
+ return ELF::R_X86_64_DTPOFF32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ llvm_unreachable("Unimplemented");
+ }
+ case MCSymbolRefExpr::VK_SIZE:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT64_64:
+ return ELF::R_X86_64_SIZE64;
+ case RT64_32:
+ return ELF::R_X86_64_SIZE32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ llvm_unreachable("Unimplemented");
+ }
+ case MCSymbolRefExpr::VK_TLSCALL:
+ return ELF::R_X86_64_TLSDESC_CALL;
+ case MCSymbolRefExpr::VK_TLSDESC:
+ return ELF::R_X86_64_GOTPC32_TLSDESC;
+ // The remaining modifiers expect a 32-bit field; checkIs32 reports an error
+ // otherwise, but the relocation is still returned.
+ case MCSymbolRefExpr::VK_TLSGD:
+ checkIs32(Ctx, Loc, Type);
+ return ELF::R_X86_64_TLSGD;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ checkIs32(Ctx, Loc, Type);
+ return ELF::R_X86_64_GOTTPOFF;
+ case MCSymbolRefExpr::VK_TLSLD:
+ checkIs32(Ctx, Loc, Type);
+ return ELF::R_X86_64_TLSLD;
+ case MCSymbolRefExpr::VK_PLT:
+ checkIs32(Ctx, Loc, Type);
+ return ELF::R_X86_64_PLT32;
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ checkIs32(Ctx, Loc, Type);
+ // Older versions of ld.bfd/ld.gold/lld
+ // do not support GOTPCRELX/REX_GOTPCRELX,
+ // and we want to keep back-compatibility.
+ if (!Ctx.getAsmInfo()->canRelaxRelocations())
+ return ELF::R_X86_64_GOTPCREL;
+ // Relaxation allowed: the fixup kind selects the relaxable variant.
+ switch (Kind) {
+ default:
+ return ELF::R_X86_64_GOTPCREL;
+ case X86::reloc_riprel_4byte_relax:
+ return ELF::R_X86_64_GOTPCRELX;
+ case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_riprel_4byte_movq_load:
+ return ELF::R_X86_64_REX_GOTPCRELX;
+ }
+ }
+ }
+
+ // Width class of a relocated field for the 32-bit relocation tables.
+ enum X86_32RelType {
+   RT32_32, // 32-bit field
+   RT32_16, // 16-bit field
+   RT32_8   // 8-bit field
+ };
+
+ // Translate a 64-bit width class into its 32-bit counterpart. Both 32-bit
+ // classes collapse to RT32_32; a 64-bit field has no counterpart and is
+ // unreachable.
+ static X86_32RelType getType32(X86_64RelType T) {
+   switch (T) {
+   case RT64_8:
+     return RT32_8;
+   case RT64_16:
+     return RT32_16;
+   case RT64_32S:
+   case RT64_32:
+     return RT32_32;
+   case RT64_64:
+     llvm_unreachable("Unimplemented");
+   }
+   llvm_unreachable("unexpected relocation type!");
+ }
+
+ // Select the R_386_* relocation for a fixup.
+ //
+ // The switch dispatches on the symbol modifier. Only the unmodified case
+ // supports 16- and 8-bit fields; every other modifier asserts a 32-bit
+ // field, and most also assert that the fixup is not pc-relative. Modifiers
+ // not listed are unreachable. \p Ctx is queried to decide whether the
+ // relaxable GOT32X relocation may be emitted, and \p Kind is only consulted
+ // in that case.
+ static unsigned getRelocType32(MCContext &Ctx,
+ MCSymbolRefExpr::VariantKind Modifier,
+ X86_32RelType Type, bool IsPCRel,
+ unsigned Kind) {
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ switch (Type) {
+ case RT32_32:
+ return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32;
+ case RT32_16:
+ return IsPCRel ? ELF::R_386_PC16 : ELF::R_386_16;
+ case RT32_8:
+ return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8;
+ }
+ case MCSymbolRefExpr::VK_GOT:
+ assert(Type == RT32_32);
+ if (IsPCRel)
+ return ELF::R_386_GOTPC;
+ // Older versions of ld.bfd/ld.gold/lld do not support R_386_GOT32X and we
+ // want to maintain compatibility.
+ if (!Ctx.getAsmInfo()->canRelaxRelocations())
+ return ELF::R_386_GOT32;
+
+ // Relaxation allowed: only the relaxable signed 4-byte fixup gets GOT32X.
+ return Kind == X86::reloc_signed_4byte_relax ? ELF::R_386_GOT32X
+ : ELF::R_386_GOT32;
+ case MCSymbolRefExpr::VK_GOTOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_GOTOFF;
+ case MCSymbolRefExpr::VK_TPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LE_32;
+ case MCSymbolRefExpr::VK_DTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LDO_32;
+ case MCSymbolRefExpr::VK_TLSGD:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_GD;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_IE_32;
+ case MCSymbolRefExpr::VK_PLT:
+ assert(Type == RT32_32);
+ return ELF::R_386_PLT32;
+ case MCSymbolRefExpr::VK_INDNTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_IE;
+ case MCSymbolRefExpr::VK_NTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LE;
+ case MCSymbolRefExpr::VK_GOTNTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_GOTIE;
+ case MCSymbolRefExpr::VK_TLSLDM:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LDM;
+ }
+ }
+
+ // Pick the ELF relocation type for \p Fixup applied to \p Target. The fixup
+ // is first classified by getType64 (which may rewrite the modifier and the
+ // pc-relative flag), then handed to the table for the writer's machine.
+ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const {
+   MCSymbolRefExpr::VariantKind Variant = Target.getAccessVariant();
+   const unsigned FixupKind = Fixup.getKind();
+   const X86_64RelType RelTy = getType64(FixupKind, Variant, IsPCRel);
+
+   if (getEMachine() != ELF::EM_X86_64) {
+     assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) &&
+            "Unsupported ELF machine type.");
+     return getRelocType32(Ctx, Variant, getType32(RelTy), IsPCRel, FixupKind);
+   }
+
+   return getRelocType64(Ctx, Fixup.getLoc(), Variant, RelTy, IsPCRel,
+                         FixupKind);
+ }
+
+ // Build an X86 ELF target writer and wrap it in a little-endian ELF object
+ // writer that emits to \p OS.
+ MCObjectWriter *llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS,
+                                                bool IsELF64, uint8_t OSABI,
+                                                uint16_t EMachine) {
+   MCELFObjectTargetWriter *TargetWriter =
+       new X86ELFObjectWriter(IsELF64, OSABI, EMachine);
+   const bool IsLittleEndian = true;
+   return createELFObjectWriter(TargetWriter, OS, IsLittleEndian);
+ }
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
new file mode 100644
index 000000000000..dfdc9ec29aec
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -0,0 +1,40 @@
+//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace X86 {
+ /// X86-specific fixup kinds. Numbering starts at FirstTargetFixupKind and
+ /// continues sequentially in declaration order.
+ enum Fixups {
+ reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
+ reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq
+ reloc_riprel_4byte_relax, // 32-bit rip-relative in relaxable
+ // instruction
+ reloc_riprel_4byte_relax_rex, // 32-bit rip-relative in relaxable
+ // instruction with rex prefix
+ reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4
+ // this will be sign extended at
+ // runtime.
+ reloc_signed_4byte_relax, // like reloc_signed_4byte, but
+ // in a relaxable instruction.
+ reloc_global_offset_table, // 32-bit, relative to the start
+ // of the instruction. Used only
+ // for _GLOBAL_OFFSET_TABLE_.
+ reloc_global_offset_table8, // 64-bit variant.
+ // Marker
+ // One past the last real fixup kind above; not itself a fixup.
+ LastTargetFixupKind,
+ // Number of X86-specific fixup kinds declared in this enum.
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+ };
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
new file mode 100644
index 000000000000..48a1d8f1330c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -0,0 +1,171 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+using namespace llvm;
+
+ // Assembly syntax flavours selectable for the X86 backend.
+ enum AsmWriterFlavorTy {
+   // Note: This numbering has to match the GCC assembler dialects for inline
+   // asm alternatives to work right.
+   ATT = 0,
+   Intel = 1
+ };
+
+ // Command-line option "x86-asm-syntax": chooses between the "att" and
+ // "intel" assembly styles, defaulting to ATT. The MCAsmInfo constructors in
+ // this file copy its value into AssemblerDialect.
+ static cl::opt<AsmWriterFlavorTy>
+ AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"),
+ clEnumValN(Intel, "intel", "Emit Intel-style assembly")));
+
+ // Hidden command-line option "mark-data-regions" (default: true). The Darwin
+ // MCAsmInfo constructor copies its value into UseDataRegionDirectives.
+ static cl::opt<bool>
+ MarkedJTDataRegions("mark-data-regions", cl::init(true),
+ cl::desc("Mark code section jump table data regions."),
+ cl::Hidden);
+
+ // Intentionally empty out-of-line definition of anchor().
+ // NOTE(review): presumably the usual LLVM idiom for pinning the class's
+ // vtable to this translation unit — confirm.
+ void X86MCAsmInfoDarwin::anchor() { }
+
+ // Configure the assembler properties for X86 Darwin targets from triple T.
+ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
+   const bool Is64 = T.getArch() == Triple::x86_64;
+   if (Is64) {
+     // x86-64: callee-save stack slots and pointers are both 8 bytes.
+     CalleeSaveStackSlotSize = 8;
+     PointerSize = 8;
+   } else {
+     // we can't emit a 64-bit unit
+     Data64bitsDirective = nullptr;
+   }
+
+   AssemblerDialect = AsmWriterFlavor;
+   TextAlignFillValue = 0x90;
+
+   // "##" is used as the comment string so that .s files produced by llvm
+   // survive a trip through the GCC preprocessor. That matters because
+   // "clang foo.s" runs the C preprocessor, which other systems usually
+   // reserve for .S files. Perhaps this is because the file system wasn't
+   // always case preserving or something.
+   CommentString = "##";
+
+   SupportsDebugInformation = true;
+   UseDataRegionDirectives = MarkedJTDataRegions;
+
+   // Exception handling model.
+   ExceptionsType = ExceptionHandling::DwarfCFI;
+
+   // The old assembler lacks some directives.
+   // FIXME: this should really be a check on the assembler characteristics
+   // rather than OS version
+   const bool IsPreSnowLeopard = T.isMacOSX() && T.isMacOSXVersionLT(10, 6);
+   if (IsPreSnowLeopard)
+     HasWeakDefCanBeHiddenDirective = false;
+
+   // Assume ld64 is new enough that the abs-ified FDE relocs may be used
+   // (actually, must, since otherwise the non-extern relocations we produce
+   // overwhelm ld64's tiny little mind and it fails).
+   DwarfFDESymbolsUseAbsDiff = true;
+
+   UseIntegratedAssembler = true;
+ }
+
+ // All configuration is done by the X86MCAsmInfoDarwin base constructor.
+ X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
+     : X86MCAsmInfoDarwin(Triple) {}
+
+ // Intentionally empty out-of-line definition of anchor().
+ // NOTE(review): presumably the usual LLVM idiom for pinning the class's
+ // vtable to this translation unit — confirm.
+ void X86ELFMCAsmInfo::anchor() { }
+
+ // Configure the assembler properties for X86 ELF targets from triple T.
+ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
+   const bool Is64 = T.getArch() == Triple::x86_64;
+   const bool IsX32ABI = T.getEnvironment() == Triple::GNUX32;
+
+   // On ELF the x86-64 pointer size depends on the ABI: 8 bytes for regular
+   // x86-64, 4 bytes for 32-bit x86 and for x86-64 with the x32 ABI.
+   if (Is64 && !IsX32ABI)
+     PointerSize = 8;
+   else
+     PointerSize = 4;
+
+   // The stack slot size, on the other hand, is always 8 for x86-64, even
+   // with the x32 ABI.
+   if (Is64)
+     CalleeSaveStackSlotSize = 8;
+   else
+     CalleeSaveStackSlotSize = 4;
+
+   AssemblerDialect = AsmWriterFlavor;
+   TextAlignFillValue = 0x90;
+
+   // Debug information.
+   SupportsDebugInformation = true;
+
+   // Exception handling model.
+   ExceptionsType = ExceptionHandling::DwarfCFI;
+
+   // Always enable the integrated assembler by default.
+   // Clang also enabled it when the OS is Solaris but that is redundant here.
+   UseIntegratedAssembler = true;
+ }
+
+ // Build the expression used to reference the personality symbol: a GOTPCREL
+ // reference to \p Sym with the constant 4 added to it. \p Encoding is not
+ // used.
+ const MCExpr *
+ X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                                    unsigned Encoding,
+                                                    MCStreamer &Streamer) const {
+   MCContext &MCCtx = Streamer.getContext();
+   const MCExpr *GotPCRelRef =
+       MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, MCCtx);
+   const MCExpr *Addend = MCConstantExpr::create(4, MCCtx);
+   return MCBinaryExpr::createAdd(GotPCRelRef, Addend, MCCtx);
+ }
+
+ // Intentionally empty out-of-line definition of anchor().
+ // NOTE(review): presumably the usual LLVM idiom for pinning the class's
+ // vtable to this translation unit — confirm.
+ void X86MCAsmInfoMicrosoft::anchor() { }
+
+ // Configure the assembler properties for the Microsoft-flavoured X86 target
+ // described by the triple.
+ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
+   const bool Is64Bit = Triple.getArch() == Triple::x86_64;
+   if (!Is64Bit) {
+     // 32-bit X86 doesn't use CFI, so this isn't a real encoding type. It's
+     // just a place holder that the Windows EHStreamer looks for to suppress
+     // CFI output. In particular, usesWindowsCFI() returns false.
+     WinEHEncodingType = WinEH::EncodingType::X86;
+   } else {
+     PrivateGlobalPrefix = ".L";
+     PrivateLabelPrefix = ".L";
+     PointerSize = 8;
+     WinEHEncodingType = WinEH::EncodingType::Itanium;
+   }
+
+   ExceptionsType = ExceptionHandling::WinEH;
+   AssemblerDialect = AsmWriterFlavor;
+   TextAlignFillValue = 0x90;
+   AllowAtInName = true;
+   UseIntegratedAssembler = true;
+ }
+
+ // Intentionally empty out-of-line definition of anchor().
+ // NOTE(review): presumably the usual LLVM idiom for pinning the class's
+ // vtable to this translation unit — confirm.
+ void X86MCAsmInfoGNUCOFF::anchor() { }
+
+ // Configure the assembler properties for the GNU-flavoured COFF X86 target
+ // described by the triple. Only Windows triples are accepted.
+ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
+   assert(Triple.isOSWindows() && "Windows is the only supported COFF target");
+
+   if (Triple.getArch() != Triple::x86_64) {
+     // Anything other than x86-64 uses DWARF CFI exceptions.
+     ExceptionsType = ExceptionHandling::DwarfCFI;
+   } else {
+     PrivateGlobalPrefix = ".L";
+     PrivateLabelPrefix = ".L";
+     PointerSize = 8;
+     WinEHEncodingType = WinEH::EncodingType::Itanium;
+     ExceptionsType = ExceptionHandling::WinEH;
+   }
+
+   AssemblerDialect = AsmWriterFlavor;
+   TextAlignFillValue = 0x90;
+   UseIntegratedAssembler = true;
+ }
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
new file mode 100644
index 000000000000..30d5c802d1ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -0,0 +1,61 @@
+//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+ /// Assembler properties for X86 on Darwin, layered on MCAsmInfoDarwin. The
+ /// constructor fills in the properties from the target triple.
+ class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ virtual void anchor();
+
+ public:
+ explicit X86MCAsmInfoDarwin(const Triple &Triple);
+ };
+
+ /// X86MCAsmInfoDarwin variant that additionally overrides how the expression
+ /// referencing a personality symbol is built.
+ struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+ explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+ /// Return the expression to use when referring to personality symbol
+ /// \p Sym.
+ const MCExpr *
+ getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+ MCStreamer &Streamer) const override;
+ };
+
+ /// Assembler properties for X86 on ELF, layered on MCAsmInfoELF. The
+ /// constructor fills in the properties from the target triple.
+ class X86ELFMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+ public:
+ explicit X86ELFMCAsmInfo(const Triple &Triple);
+ };
+
+ /// Assembler properties for X86 layered on MCAsmInfoMicrosoft. The
+ /// constructor fills in the properties from the target triple.
+ class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+ void anchor() override;
+
+ public:
+ explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
+ };
+
+ /// Assembler properties for X86 layered on MCAsmInfoGNUCOFF. The constructor
+ /// fills in the properties from the target triple.
+ class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+ void anchor() override;
+
+ public:
+ explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
+ };
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
new file mode 100644
index 000000000000..8045e7c6d872
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -0,0 +1,1529 @@
+//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class X86MCCodeEmitter : public MCCodeEmitter { // Encodes X86 MCInsts into bytes plus fixups.
+ X86MCCodeEmitter(const X86MCCodeEmitter &) = delete; // Not copy-constructible.
+ void operator=(const X86MCCodeEmitter &) = delete; // Not copy-assignable.
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+public:
+ X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {
+ }
+
+ ~X86MCCodeEmitter() override {}
+
+ bool is64BitMode(const MCSubtargetInfo &STI) const { // True when the Mode64Bit feature bit is set.
+ return STI.getFeatureBits()[X86::Mode64Bit];
+ }
+
+ bool is32BitMode(const MCSubtargetInfo &STI) const { // True when the Mode32Bit feature bit is set.
+ return STI.getFeatureBits()[X86::Mode32Bit];
+ }
+
+ bool is16BitMode(const MCSubtargetInfo &STI) const { // True when the Mode16Bit feature bit is set.
+ return STI.getFeatureBits()[X86::Mode16Bit];
+ }
+
+ /// Is16BitMemOperand - Return true if the memory operand starting at operand
+ /// index Op uses 16-bit addressing: either 16-bit mode with no base register and an immediate displacement below 0x10000, or a GR16 base or index register.
+ bool Is16BitMemOperand(const MCInst &MI, unsigned Op,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
+
+ if (is16BitMode(STI) && BaseReg.getReg() == 0 &&
+ Disp.isImm() && Disp.getImm() < 0x10000)
+ return true;
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+ }
+
+ unsigned GetX86RegNum(const MCOperand &MO) const { // Low three bits of the register's encoding value.
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
+ }
+
+ unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const { // Full, unmasked encoding value of operand OpNum.
+ return Ctx.getRegisterInfo()->getEncodingValue(
+ MI.getOperand(OpNum).getReg());
+ }
+
+ // Does this register require a bit to be set in the REX prefix? (Tests bit 3 of the encoding.)
+ bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const {
+ return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
+ }
+
+ void EmitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const { // Write one byte to OS and advance CurByte.
+ OS << (char)C;
+ ++CurByte;
+ }
+
+ void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+ raw_ostream &OS) const {
+ // Output the low Size bytes of Val in little-endian byte order.
+ for (unsigned i = 0; i != Size; ++i) {
+ EmitByte(Val & 255, CurByte, OS);
+ Val >>= 8;
+ }
+ }
+
+ void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
+ unsigned ImmSize, MCFixupKind FixupKind,
+ unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ int ImmOffset = 0) const;
+
+ inline static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6); // Layout: Mod[7:6] RegOpcode[5:3] RM[2:0].
+ }
+
+ void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+ unsigned &CurByte, raw_ostream &OS) const {
+ EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS); // Mod is fixed at 3 for the register form.
+ }
+
+ void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+ unsigned &CurByte, raw_ostream &OS) const {
+ // The SIB byte has the same 2/3/3-bit layout as the ModR/M byte, so reuse ModRMByte.
+ EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+ }
+
+ void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField,
+ uint64_t TSFlags, bool Rex, unsigned &CurByte,
+ raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+ const MCInst &MI, const MCInstrDesc &Desc,
+ raw_ostream &OS) const;
+
+ void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
+ const MCInst &MI, raw_ostream &OS) const;
+
+ bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+ const MCInst &MI, const MCInstrDesc &Desc,
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
+
+ uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+ int MemOperand, const MCInstrDesc &Desc) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new X86MCCodeEmitter(MCII, Ctx); // Factory: heap-allocates the emitter; MRI is not forwarded to it.
+}
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+ return Value == (int8_t)Value; // Unchanged by a round trip through int8_t, i.e. within [-128, 127].
+}
+
+/// isCDisp8 - Return true if this signed displacement fits in an 8-bit
+/// compressed displacement field; on success CValue receives the value to encode.
+static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
+ assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
+ "Compressed 8-bit displacement is only valid for EVEX inst.");
+
+ unsigned CD8_Scale =
+ (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;
+ if (CD8_Scale == 0) { // No scaling recorded in TSFlags: behave like a plain disp8.
+ CValue = Value;
+ return isDisp8(Value);
+ }
+
+ unsigned Mask = CD8_Scale - 1;
+ assert((CD8_Scale & Mask) == 0 && "Invalid memory object size."); // The scale must be a power of two.
+ if (Value & Mask) // Unaligned offset: not a multiple of the scale, so it cannot be compressed.
+ return false;
+ Value /= (int)CD8_Scale;
+ bool Ret = (Value == (int8_t)Value);
+
+ if (Ret)
+ CValue = Value; // CValue is left untouched when the scaled value does not fit.
+ return Ret;
+}
+
+/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
+/// in an instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
+ unsigned Size = X86II::getSizeOfImm(TSFlags);
+ bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+ if (X86II::isImmSigned(TSFlags)) {
+ switch (Size) {
+ default: llvm_unreachable("Unsupported signed fixup size!");
+ case 4: return MCFixupKind(X86::reloc_signed_4byte); // Only 4-byte signed immediates are handled.
+ }
+ }
+ return MCFixup::getKindForSize(Size, isPCRel); // Unsigned immediates use the generic kind for this size and PC-relativity.
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand (GR32 base or index, or an EIP base). Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+ return true;
+ if (BaseReg.getReg() == X86::EIP) { // EIP is checked separately from the GR32 class test above.
+ assert(IndexReg.getReg() == 0 && "Invalid eip-based address.");
+ return true;
+ }
+ return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand (GR64 base or index). Op specifies the operand # of the memoperand. Only compiled when NDEBUG is not defined.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+#endif
+
+/// StartsWithGlobalOffsetTable - Check if this expression starts with
+/// _GLOBAL_OFFSET_TABLE_ and if it is of the form
+/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
+/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only the simple cases that
+/// are known to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
+/// of a binary expression.
+enum GlobalOffsetTableExprKind {
+ GOT_None, // The expression does not start with _GLOBAL_OFFSET_TABLE_.
+ GOT_Normal, // _GLOBAL_OFFSET_TABLE_ alone, or with a right-hand side that is not a symbol reference.
+ GOT_SymDiff // _GLOBAL_OFFSET_TABLE_ on the left of a binary expression whose right-hand side is a symbol reference.
+};
+static GlobalOffsetTableExprKind
+StartsWithGlobalOffsetTable(const MCExpr *Expr) {
+ const MCExpr *RHS = nullptr;
+ if (Expr->getKind() == MCExpr::Binary) { // Only the top-level binary expression is unwrapped.
+ const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
+ Expr = BE->getLHS();
+ RHS = BE->getRHS();
+ }
+
+ if (Expr->getKind() != MCExpr::SymbolRef)
+ return GOT_None;
+
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ const MCSymbol &S = Ref->getSymbol();
+ if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
+ return GOT_None;
+ if (RHS && RHS->getKind() == MCExpr::SymbolRef) // Note: the binary opcode itself is not inspected.
+ return GOT_SymDiff;
+ return GOT_Normal;
+}
+
+static bool HasSecRelSymbolRef(const MCExpr *Expr) { // True only if Expr itself is a symbol reference of kind VK_SECREL.
+ if (Expr->getKind() == MCExpr::SymbolRef) {
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
+ }
+ return false; // Sub-expressions are not searched; callers check both sides of a binary expression.
+}
+
+void X86MCCodeEmitter::
+EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
+ MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { // Emit a Size-byte immediate/displacement: raw bytes, or a fixup plus zero bytes.
+ const MCExpr *Expr = nullptr;
+ if (DispOp.isImm()) {
+ // If this is a simple integer displacement that doesn't require a
+ // relocation (i.e. the fixup kind is not one of FK_PCRel_1/2/4), emit it now.
+ if (FixupKind != FK_PCRel_1 &&
+ FixupKind != FK_PCRel_2 &&
+ FixupKind != FK_PCRel_4) {
+ EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
+ return;
+ }
+ Expr = MCConstantExpr::create(DispOp.getImm(), Ctx); // PC-relative constant: wrap it so it goes through the fixup path.
+ } else {
+ Expr = DispOp.getExpr();
+ }
+
+ // For 4- and 8-byte data fixups, switch to a GOT-specific or section-relative fixup kind where the expression calls for one.
+ if ((FixupKind == FK_Data_4 ||
+ FixupKind == FK_Data_8 ||
+ FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
+ GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr);
+ if (Kind != GOT_None) {
+ assert(ImmOffset == 0);
+
+ if (Size == 8) {
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table8);
+ } else {
+ assert(Size == 4);
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+ }
+
+ if (Kind == GOT_Normal)
+ ImmOffset = CurByte; // Bias by the field's offset within the instruction.
+ } else if (Expr->getKind() == MCExpr::SymbolRef) {
+ if (HasSecRelSymbolRef(Expr)) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
+ } else if (Expr->getKind() == MCExpr::Binary) {
+ const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr);
+ if (HasSecRelSymbolRef(Bin->getLHS())
+ || HasSecRelSymbolRef(Bin->getRHS())) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
+ }
+ }
+
+ // If the fixup is pc-relative, we need to bias the value to be relative to
+ // the start of the field, not the end of the field.
+ if (FixupKind == FK_PCRel_4 ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex))
+ ImmOffset -= 4;
+ if (FixupKind == FK_PCRel_2)
+ ImmOffset -= 2;
+ if (FixupKind == FK_PCRel_1)
+ ImmOffset -= 1;
+
+ if (ImmOffset) // If we have an immoffset, add it to the expression.
+ Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(ImmOffset, Ctx),
+ Ctx);
+
+ // Emit a symbolic constant as a fixup and Size zero bytes as its placeholder.
+ Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc));
+ EmitConstant(0, Size, CurByte, OS);
+}
+
+void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
+ unsigned RegOpcodeField,
+ uint64_t TSFlags, bool Rex,
+ unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const { // Emit ModR/M, optional SIB, and displacement for the memory operand at Op.
+ const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
+ const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+ unsigned BaseReg = Base.getReg();
+ bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
+
+ // Handle %rip-relative (and %eip-relative) addressing.
+ if (BaseReg == X86::RIP ||
+ BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode
+ assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode");
+ assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
+ EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+
+ unsigned Opcode = MI.getOpcode();
+ // movq loads are handled with a special relocation form which allows the
+ // linker to eliminate some loads for GOT references which end up in the
+ // same linkage unit. The other opcodes listed below get a "relax" fixup kind.
+ unsigned FixupKind = [=]() {
+ switch (Opcode) {
+ default:
+ return X86::reloc_riprel_4byte;
+ case X86::MOV64rm:
+ assert(Rex);
+ return X86::reloc_riprel_4byte_movq_load;
+ case X86::CALL64m:
+ case X86::JMP64m:
+ case X86::TEST64rm:
+ case X86::ADC64rm:
+ case X86::ADD64rm:
+ case X86::AND64rm:
+ case X86::CMP64rm:
+ case X86::OR64rm:
+ case X86::SBB64rm:
+ case X86::SUB64rm:
+ case X86::XOR64rm:
+ return Rex ? X86::reloc_riprel_4byte_relax_rex
+ : X86::reloc_riprel_4byte_relax;
+ }
+ }();
+
+ // rip-relative addressing is actually relative to the *next* instruction.
+ // Since an immediate can follow the mod/rm byte for an instruction, this
+ // means that we need to bias the immediate field of the instruction with
+ // the size of the immediate field. If we have this case, add it into the
+ // expression to emit.
+ int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
+ CurByte, OS, Fixups, -ImmSize);
+ return;
+ }
+
+ unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U; // -1U marks "no base register".
+
+ // 16-bit addressing forms of the ModR/M byte have a different encoding for
+ // the R/M field and are far more limited in which registers can be used.
+ if (Is16BitMemOperand(MI, Op, STI)) {
+ if (BaseReg) {
+ // For 32-bit addressing, the row and column values in Table 2-2 are
+ // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
+ // some special cases. And GetX86RegNum reflects that numbering.
+ // For 16-bit addressing it's more fun, as shown in the SDM Vol 2A,
+ // Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only
+ // use SI/DI/BP/BX, which have "row" values 4-7 in no particular order,
+ // while values 0-3 indicate the allowed combinations (base+index) of
+ // those: 0 for BX+SI, 1 for BX+DI, 2 for BP+SI, 3 for BP+DI.
+ //
+ // R16Table[] is a lookup from the normal RegNo to the row values from
+ // Table 2-1 for 16-bit addressing modes, where zero means disallowed.
+ static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 };
+ unsigned RMfield = R16Table[BaseRegNo];
+
+ assert(RMfield && "invalid 16-bit base register");
+
+ if (IndexReg.getReg()) {
+ unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)];
+
+ assert(IndexReg16 && "invalid 16-bit index register");
+ // We must have one of SI/DI (4,5), and one of BP/BX (6,7).
+ assert(((IndexReg16 ^ RMfield) & 2) &&
+ "invalid 16-bit base/index register combination");
+ assert(Scale.getImm() == 1 &&
+ "invalid scale for 16-bit memory reference");
+
+ // Allow base/index to appear in either order (although GAS doesn't).
+ if (IndexReg16 & 2)
+ RMfield = (RMfield & 1) | ((7 - IndexReg16) << 1);
+ else
+ RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1);
+ }
+
+ if (Disp.isImm() && isDisp8(Disp.getImm())) {
+ if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+ // There is no displacement; just the register.
+ EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
+ return;
+ }
+ // Use the [REG]+disp8 form, including for [BP], which cannot be encoded without a displacement.
+ EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ return;
+ }
+ // This is the [REG]+disp16 case.
+ EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
+ } else {
+ // There is no BaseReg; this is the plain [disp16] case.
+ EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS);
+ }
+
+ // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
+ EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
+ return;
+ }
+
+ // Determine whether a SIB byte is needed.
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+ // 2-7) and absolute references.
+
+ if (// The SIB byte must be used if there is an index register.
+ IndexReg.getReg() == 0 &&
+ // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+ // encode to an R/M value of 4, which indicates that a SIB byte is
+ // present.
+ BaseRegNo != N86::ESP &&
+ // If there is no base register and we're in 64-bit mode, we need a SIB
+ // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+ (!is64BitMode(STI) || BaseReg != 0)) {
+
+ if (BaseReg == 0) { // [disp32] in X86-32 mode
+ EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
+ return;
+ }
+
+ // If the base is not EBP/ESP and there is no displacement, use simple
+ // indirect register encoding; this handles addresses like [EAX]. The
+ // encoding for [EBP] with no displacement means [disp32] so we handle it
+ // by emitting a displacement of 0 below.
+ if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+ EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ return;
+ }
+
+ // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+ if (Disp.isImm()) {
+ if (!HasEVEX && isDisp8(Disp.getImm())) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ return;
+ }
+ // Try EVEX compressed 8-bit displacement first; if that fails, fall back to
+ // 32-bit displacement.
+ int CDisp8 = 0;
+ if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ CDisp8 - Disp.getImm()); // Offset chosen so the emitted byte equals CDisp8.
+ return;
+ }
+ }
+
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+ EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+ unsigned Opcode = MI.getOpcode();
+ unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
+ : X86::reloc_signed_4byte;
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
+ Fixups);
+ return;
+ }
+
+ // We need a SIB byte, so start by outputting the ModR/M byte first.
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ int CDisp8 = 0;
+ int ImmOffset = 0;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+ EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp32 = true;
+ } else if (!Disp.isImm()) {
+ // Emit the normal disp32 encoding (the displacement is symbolic).
+ EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp32 = true;
+ } else if (Disp.getImm() == 0 &&
+ // Base reg can't be anything that ends up with '5' as the base
+ // reg, it is the magic [*] nomenclature that indicates no base.
+ BaseRegNo != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ } else if (!HasEVEX && isDisp8(Disp.getImm())) {
+ // Emit the disp8 encoding.
+ EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+ // Emit the EVEX compressed disp8 encoding.
+ EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ ImmOffset = CDisp8 - Disp.getImm();
+ } else {
+ // Emit the normal disp32 encoding.
+ EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ }
+
+ // Calculate what the SS field value should be: scales 1, 2, 4, 8 map to 0, 1, 2, 3.
+ static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+ unsigned SS = SSTable[Scale.getImm()];
+
+ if (BaseReg == 0) {
+ // Handle the SIB byte for the case where there is no base, see Intel
+ // Manual 2A, table 2-7. The ModR/M byte has already been output.
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+ IndexRegNo = 4;
+ EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+ } else {
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+ }
+
+ // Do we need to output a displacement? It follows the SIB byte.
+ if (ForceDisp8)
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset);
+ else if (ForceDisp32 || Disp.getImm() != 0)
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+ CurByte, OS, Fixups);
+}
+
+/// EmitVEXOpcodePrefix - AVX instructions are encoded using an opcode prefix
+/// called VEX. This routine also emits the XOP and EVEX forms of the prefix.
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+ int MemOperand, const MCInst &MI,
+ const MCInstrDesc &Desc,
+ raw_ostream &OS) const {
+ assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX.");
+
+ uint64_t Encoding = TSFlags & X86II::EncodingMask;
+ bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+ bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+ bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+
+ // VEX_R: opcode extension equivalent to REX.R in
+ // 1's complement (inverted) form
+ //
+ // 1: Same as REX.R=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.R=1 (64 bit mode only)
+ //
+ uint8_t VEX_R = 0x1;
+ uint8_t EVEX_R2 = 0x1;
+
+ // VEX_X: equivalent to REX.X, only used when a
+ // register is used for index in SIB Byte.
+ //
+ // 1: Same as REX.X=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.X=1 (64-bit mode only)
+ uint8_t VEX_X = 0x1;
+
+ // VEX_B:
+ //
+ // 1: Same as REX.B=0 (ignored in 32-bit mode)
+ // 0: Same as REX.B=1 (64 bit mode only)
+ //
+ uint8_t VEX_B = 0x1;
+
+ // VEX_W: opcode specific (use like REX.W, or used for
+ // opcode extension, or ignored, depending on the opcode byte)
+ uint8_t VEX_W = (TSFlags & X86II::VEX_W) ? 1 : 0;
+
+ // VEX_5M (VEX m-mmmmm field):
+ //
+ // 0b00000: Reserved for future use
+ // 0b00001: implied 0F leading opcode
+ // 0b00010: implied 0F 38 leading opcode bytes
+ // 0b00011: implied 0F 3A leading opcode bytes
+ // 0b00100-0b11111: Reserved for future use
+ // 0b01000: XOP map select - 08h instructions with imm byte
+ // 0b01001: XOP map select - 09h instructions with no imm byte
+ // 0b01010: XOP map select - 0Ah instructions with imm dword
+ uint8_t VEX_5M;
+ switch (TSFlags & X86II::OpMapMask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::TB: VEX_5M = 0x1; break; // 0F
+ case X86II::T8: VEX_5M = 0x2; break; // 0F 38
+ case X86II::TA: VEX_5M = 0x3; break; // 0F 3A
+ case X86II::XOP8: VEX_5M = 0x8; break;
+ case X86II::XOP9: VEX_5M = 0x9; break;
+ case X86II::XOPA: VEX_5M = 0xA; break;
+ }
+
+ // VEX_4V (VEX vvvv field): a register specifier
+ // (in 1's complement form) or 1111 if unused.
+ uint8_t VEX_4V = 0xf;
+ uint8_t EVEX_V2 = 0x1;
+
+ // EVEX_L2/VEX_L (Vector Length):
+ //
+ // L2 L
+ // 0 0: scalar or 128-bit vector
+ // 0 1: 256-bit vector
+ // 1 0: 512-bit vector
+ //
+ uint8_t VEX_L = (TSFlags & X86II::VEX_L) ? 1 : 0;
+ uint8_t EVEX_L2 = (TSFlags & X86II::EVEX_L2) ? 1 : 0;
+
+ // VEX_PP: opcode extension providing equivalent
+ // functionality of a SIMD prefix
+ //
+ // 0b00: None
+ // 0b01: 66
+ // 0b10: F3
+ // 0b11: F2
+ //
+ uint8_t VEX_PP;
+ switch (TSFlags & X86II::OpPrefixMask) {
+ default: llvm_unreachable("Invalid op prefix!");
+ case X86II::PS: VEX_PP = 0x0; break; // none
+ case X86II::PD: VEX_PP = 0x1; break; // 66
+ case X86II::XS: VEX_PP = 0x2; break; // F3
+ case X86II::XD: VEX_PP = 0x3; break; // F2
+ }
+
+ // EVEX_U
+ uint8_t EVEX_U = 1; // Always '1' so far
+
+ // EVEX_z: only set when the instruction also has EVEX_K.
+ uint8_t EVEX_z = (HasEVEX_K && (TSFlags & X86II::EVEX_Z)) ? 1 : 0;
+
+ // EVEX_b
+ uint8_t EVEX_b = (TSFlags & X86II::EVEX_B) ? 1 : 0;
+
+ // EVEX_rc: filled in below from the last operand when EVEX_b and EVEX_RC are both set.
+ uint8_t EVEX_rc = 0;
+
+ // EVEX_aaa: filled in below from the operand following the first when EVEX_K is set.
+ uint8_t EVEX_aaa = 0;
+
+ bool EncodeRC = false; // When true, the last EVEX byte carries EVEX_rc in place of L'L.
+
+ // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ switch (TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!");
+ case X86II::RawFrm:
+ break;
+ case X86II::MRMDestMem: {
+ // MRMDestMem instructions forms:
+ // MemAddr, src1(ModR/M)
+ // MemAddr, src1(VEX_4V), src2(ModR/M)
+ // MemAddr, src1(ModR/M), imm8
+ //
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
+ EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
+
+ CurOp += X86::AddrNumOperands;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+ break;
+ }
+ case X86II::MRMSrcMem: {
+ // MRMSrcMem instructions forms:
+ // src1(ModR/M), MemAddr
+ // src1(ModR/M), src2(VEX_4V), MemAddr
+ // src1(ModR/M), MemAddr, imm8
+ // src1(ModR/M), MemAddr, src2(Imm[7:4])
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
+ EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
+
+ break;
+ }
+ case X86II::MRMSrcMem4VOp3: {
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), MemAddr, src3(VEX_4V)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+
+ VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf; // src3 follows the memory operands.
+ break;
+ }
+ case X86II::MRMSrcMemOp4: {
+ // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ break;
+ }
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ // MRM[0-7]m instructions forms:
+ // MemAddr
+ // src1(VEX_4V), MemAddr
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ break;
+ }
+ case X86II::MRMSrcReg: {
+ // MRMSrcReg instructions forms:
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
+ // dst(ModR/M), src1(ModR/M)
+ // dst(ModR/M), src1(ModR/M), imm8
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1; // With a register r/m operand, X carries bit 4 of its encoding.
+
+ if (EVEX_b) {
+ if (HasEVEX_RC) {
+ unsigned RcOperand = NumOps-1; // The rounding-control immediate is the last operand.
+ assert(RcOperand >= CurOp);
+ EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
+ }
+ EncodeRC = true;
+ }
+ break;
+ }
+ case X86II::MRMSrcReg4VOp3: {
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), src2(ModR/M), src3(VEX_4V)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+
+ VEX_4V = ~getX86RegEncoding(MI, CurOp++) & 0xf;
+ break;
+ }
+ case X86II::MRMSrcRegOp4: {
+ // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+
+ // Skip second register source (encoded in Imm[7:4])
+ ++CurOp;
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
+ break;
+ }
+ case X86II::MRMDestReg: {
+ // MRMDestReg instructions forms:
+ // dst(ModR/M), src(ModR/M)
+ // dst(ModR/M), src(ModR/M), imm8
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+ if (EVEX_b)
+ EncodeRC = true;
+ break;
+ }
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r: {
+ // MRM0r-MRM7r instructions forms:
+ // dst(VEX_4V), src(ModR/M), imm8
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
+ break;
+ }
+ }
+
+ if (Encoding == X86II::VEX || Encoding == X86II::XOP) {
+ // VEX opcode prefix can have 2 or 3 bytes
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ // XOP uses a similar prefix:
+ // +-----+ +--------------+ +-------------------+
+ // | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+ // Can we use the 2 byte VEX prefix? Only for VEX with X, B at their defaults, W clear and the 0F map.
+ if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+ EmitByte(0xC5, CurByte, OS);
+ EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+ return;
+ }
+
+ // 3 byte VEX prefix
+ EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
+ EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+ EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+ } else {
+ assert(Encoding == X86II::EVEX && "unknown encoding!");
+ // EVEX opcode prefix is 4 bytes
+ //
+ // +-----+ +--------------+ +-------------------+ +------------------------+
+ // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
+ // +-----+ +--------------+ +-------------------+ +------------------------+
+ assert((VEX_5M & 0x3) == VEX_5M
+ && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+
+ EmitByte(0x62, CurByte, OS);
+ EmitByte((VEX_R << 7) |
+ (VEX_X << 6) |
+ (VEX_B << 5) |
+ (EVEX_R2 << 4) |
+ VEX_5M, CurByte, OS);
+ EmitByte((VEX_W << 7) |
+ (VEX_4V << 3) |
+ (EVEX_U << 2) |
+ VEX_PP, CurByte, OS);
+ if (EncodeRC) // The rounding-control bits occupy the L'L position.
+ EmitByte((EVEX_z << 7) |
+ (EVEX_rc << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
+ else
+ EmitByte((EVEX_z << 7) |
+ (EVEX_L2 << 6) |
+ (VEX_L << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
+ }
+}
+
+ /// Compute the REX prefix bits needed to encode \p MI.
+ ///
+ /// REX.W (bit 3) is set when \p TSFlags carries X86II::REX_W. REX.R, REX.X and
+ /// REX.B (bits 2, 1, 0) are taken from isREXExtendedReg() on the operands that
+ /// the instruction form selects. 0x40 is OR'ed in when any operand is one of
+ /// the registers accepted by X86II::isX86_64NonExtLowByteReg (SPL, BPL, SIL,
+ /// DIL). A result of 0 means none of these conditions applied.
+ ///
+ /// Reports a fatal error when the result is non-zero and AH, BH, CH or DH is
+ /// used as an operand.
+ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+                                              int MemOperand,
+                                              const MCInstrDesc &Desc) const {
+   // Bit positions of the individual REX flags within the prefix byte.
+   enum { ShiftB = 0, ShiftX = 1, ShiftR = 2, ShiftW = 3 };
+
+   uint8_t Prefix = (TSFlags & X86II::REX_W) ? (1 << ShiftW) : 0;
+
+   const unsigned OpCount = MI.getNumOperands();
+   if (OpCount == 0)
+     return Prefix;
+
+   const unsigned FirstOp = X86II::getOperandBias(Desc);
+
+   // Look at every register operand from the bias onwards for byte registers
+   // that interact with the REX prefix.
+   bool SawHighByteReg = false;
+   for (unsigned Idx = FirstOp; Idx != OpCount; ++Idx) {
+     const MCOperand &Op = MI.getOperand(Idx);
+     if (Op.isReg()) {
+       const unsigned R = Op.getReg();
+       if (R == X86::AH || R == X86::BH || R == X86::CH || R == X86::DH)
+         SawHighByteReg = true;
+       // FIXME: The caller of DetermineREXPrefix slaps this prefix onto
+       // anything that returns non-zero.
+       if (X86II::isX86_64NonExtLowByteReg(R))
+         Prefix |= 0x40; // REX fixed encoding prefix
+     }
+   }
+
+   // REX.B / REX.X for a memory operand come from its base and index registers.
+   auto AddMemOperandBits = [&]() {
+     Prefix |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << ShiftB;
+     Prefix |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << ShiftX;
+   };
+
+   switch (TSFlags & X86II::FormMask) {
+   case X86II::AddRegFrm:
+     Prefix |= isREXExtendedReg(MI, FirstOp) << ShiftB;
+     break;
+   case X86II::MRMSrcReg:
+     Prefix |= isREXExtendedReg(MI, FirstOp) << ShiftR;
+     Prefix |= isREXExtendedReg(MI, FirstOp + 1) << ShiftB;
+     break;
+   case X86II::MRMSrcMem:
+     Prefix |= isREXExtendedReg(MI, FirstOp) << ShiftR;
+     AddMemOperandBits();
+     break;
+   case X86II::MRMDestReg:
+     Prefix |= isREXExtendedReg(MI, FirstOp) << ShiftB;
+     Prefix |= isREXExtendedReg(MI, FirstOp + 1) << ShiftR;
+     break;
+   case X86II::MRMDestMem:
+     AddMemOperandBits();
+     // The register operand follows the memory operand's address components.
+     Prefix |= isREXExtendedReg(MI, FirstOp + X86::AddrNumOperands) << ShiftR;
+     break;
+   case X86II::MRMXm:
+   case X86II::MRM0m: case X86II::MRM1m:
+   case X86II::MRM2m: case X86II::MRM3m:
+   case X86II::MRM4m: case X86II::MRM5m:
+   case X86II::MRM6m: case X86II::MRM7m:
+     AddMemOperandBits();
+     break;
+   case X86II::MRMXr:
+   case X86II::MRM0r: case X86II::MRM1r:
+   case X86II::MRM2r: case X86II::MRM3r:
+   case X86II::MRM4r: case X86II::MRM5r:
+   case X86II::MRM6r: case X86II::MRM7r:
+     Prefix |= isREXExtendedReg(MI, FirstOp) << ShiftB;
+     break;
+   }
+
+   if (Prefix != 0 && SawHighByteReg)
+     report_fatal_error("Cannot encode high byte register in REX-prefixed instruction");
+
+   return Prefix;
+ }
+
+ /// Emit the segment override prefix byte selected by operand \p SegOperand of
+ /// \p MI. A register value of 0 means "no override" and emits nothing; any
+ /// register other than CS, SS, DS, ES, FS or GS is treated as unreachable.
+ void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte,
+                                                  unsigned SegOperand,
+                                                  const MCInst &MI,
+                                                  raw_ostream &OS) const {
+   const unsigned SegReg = MI.getOperand(SegOperand).getReg();
+   if (SegReg == 0)
+     return; // No explicit segment override on this operand.
+
+   uint8_t PrefixByte;
+   switch (SegReg) {
+   case X86::CS: PrefixByte = 0x2E; break;
+   case X86::SS: PrefixByte = 0x36; break;
+   case X86::DS: PrefixByte = 0x3E; break;
+   case X86::ES: PrefixByte = 0x26; break;
+   case X86::FS: PrefixByte = 0x64; break;
+   case X86::GS: PrefixByte = 0x65; break;
+   default: llvm_unreachable("Unknown segment register!");
+   }
+   EmitByte(PrefixByte, CurByte, OS);
+ }
+
+ /// Emit every non-VEX prefix byte that precedes the opcode: operand-size
+ /// override, LOCK, the PD/XS/XD prefix, REX (64-bit mode only) and the
+ /// 0x0F / 0x0F 0x38 / 0x0F 0x3A opcode-map escape bytes.
+ ///
+ /// \p MemOperand is the operand number of the start of a memory operand if
+ /// present, or -1 otherwise.
+ ///
+ /// \returns true if a REX prefix was emitted.
+ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                         int MemOperand, const MCInst &MI,
+                                         const MCInstrDesc &Desc,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &OS) const {
+   // The operand size that needs a 0x66 override depends on the current mode:
+   // 32-bit operands in 16-bit mode, 16-bit operands otherwise.
+   const auto OverriddenOpSize =
+       is16BitMode(STI) ? X86II::OpSize32 : X86II::OpSize16;
+   if ((TSFlags & X86II::OpSizeMask) == OverriddenOpSize)
+     EmitByte(0x66, CurByte, OS);
+
+   // LOCK prefix.
+   if (TSFlags & X86II::LOCK)
+     EmitByte(0xF0, CurByte, OS);
+
+   // Prefix byte selected by the instruction's OpPrefix field.
+   switch (TSFlags & X86II::OpPrefixMask) {
+   case X86II::PD: EmitByte(0x66, CurByte, OS); break;
+   case X86II::XS: EmitByte(0xF3, CurByte, OS); break;
+   case X86II::XD: EmitByte(0xF2, CurByte, OS); break;
+   }
+
+   // Handle REX prefix.
+   // FIXME: Can this come before F2 etc to simplify emission?
+   bool EmittedREX = false;
+   if (is64BitMode(STI)) {
+     const uint8_t REX = DetermineREXPrefix(MI, TSFlags, MemOperand, Desc);
+     if (REX != 0) {
+       EmitByte(0x40 | REX, CurByte, OS);
+       EmittedREX = true;
+     }
+   }
+
+   // Opcode-map escape bytes; 0x0F must be emitted just before the opcode
+   // (or before the second escape byte for the three-byte maps).
+   switch (TSFlags & X86II::OpMapMask) {
+   case X86II::TB: // Two-byte opcode map
+     EmitByte(0x0F, CurByte, OS);
+     break;
+   case X86II::T8: // 0F 38
+     EmitByte(0x0F, CurByte, OS);
+     EmitByte(0x38, CurByte, OS);
+     break;
+   case X86II::TA: // 0F 3A
+     EmitByte(0x0F, CurByte, OS);
+     EmitByte(0x3A, CurByte, OS);
+     break;
+   }
+
+   return EmittedREX;
+ }
+
+ /// Encode \p MI as machine-code bytes written to \p OS.
+ ///
+ /// Pseudo instructions produce no output. For every other instruction the
+ /// bytes are produced in this order: segment override (when the instruction
+ /// has a memory operand), REP (0xF3), address-size override (0x67), the legacy
+ /// or VEX/XOP/EVEX opcode prefix, the base opcode together with its
+ /// form-specific ModRM/operand bytes, and finally either the imm8 byte that
+ /// carries a register in bits [7:4] or the trailing immediate(s). 3DNow!
+ /// instructions emit their real opcode byte last.
+ ///
+ /// \p Fixups is passed on to the helpers that emit memory operands and
+ /// immediates. In asserts builds the function aborts if not every operand was
+ /// consumed.
+ void X86MCCodeEmitter::
+ encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+   unsigned Opcode = MI.getOpcode();
+   const MCInstrDesc &Desc = MCII.get(Opcode);
+   uint64_t TSFlags = Desc.TSFlags;
+
+   // Pseudo instructions don't get encoded.
+   if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+     return;
+
+   unsigned NumOps = Desc.getNumOperands();
+   unsigned CurOp = X86II::getOperandBias(Desc);
+
+   // Keep track of the current byte being emitted.
+   unsigned CurByte = 0;
+
+   // Encoding type for this instruction.
+   uint64_t Encoding = TSFlags & X86II::EncodingMask;
+
+   // It uses the VEX.VVVV field?
+   bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+   bool HasVEX_I8Reg = (TSFlags & X86II::ImmMask) == X86II::Imm8Reg;
+
+   // It uses the EVEX.aaa field?
+   bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+   bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+
+   // Used if a register is encoded in 7:4 of immediate.
+   unsigned I8RegNum = 0;
+
+   // Determine where the memory operand starts, if present.
+   int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+   if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+   // Emit segment override opcode prefix as needed.
+   if (MemoryOperand >= 0)
+     EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg,
+                               MI, OS);
+
+   // Emit the repeat opcode prefix as needed.
+   if (TSFlags & X86II::REP)
+     EmitByte(0xF3, CurByte, OS);
+
+   // Emit the address size opcode prefix as needed. An explicit AdSize flag
+   // that differs from the current mode always forces the override; otherwise
+   // the decision is made from the registers used by the memory operand.
+   bool need_address_override;
+   uint64_t AdSize = TSFlags & X86II::AdSizeMask;
+   if ((is16BitMode(STI) && AdSize == X86II::AdSize32) ||
+       (is32BitMode(STI) && AdSize == X86II::AdSize16) ||
+       (is64BitMode(STI) && AdSize == X86II::AdSize32)) {
+     need_address_override = true;
+   } else if (MemoryOperand < 0) {
+     need_address_override = false;
+   } else if (is64BitMode(STI)) {
+     assert(!Is16BitMemOperand(MI, MemoryOperand, STI));
+     need_address_override = Is32BitMemOperand(MI, MemoryOperand);
+   } else if (is32BitMode(STI)) {
+     assert(!Is64BitMemOperand(MI, MemoryOperand));
+     need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI);
+   } else {
+     assert(is16BitMode(STI));
+     assert(!Is64BitMemOperand(MI, MemoryOperand));
+     need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI);
+   }
+
+   if (need_address_override)
+     EmitByte(0x67, CurByte, OS);
+
+   // Encoding == 0 selects the legacy prefix path; anything else goes through
+   // the VEX/XOP/EVEX prefix emitter. Rex records whether a REX byte was
+   // emitted and is forwarded to emitMemModRMByte below.
+   bool Rex = false;
+   if (Encoding == 0)
+     Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
+   else
+     EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+
+   uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+
+   if (TSFlags & X86II::Has3DNow0F0FOpcode)
+     BaseOpcode = 0x0F;   // Weird 3DNow! encoding.
+
+   uint64_t Form = TSFlags & X86II::FormMask;
+   switch (Form) {
+   default: errs() << "FORM: " << Form << "\n";
+     llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
+   case X86II::Pseudo:
+     llvm_unreachable("Pseudo instruction shouldn't be emitted");
+   case X86II::RawFrmDstSrc: {
+     unsigned siReg = MI.getOperand(1).getReg();
+     assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
+             (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
+             (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
+            "SI and DI register sizes do not match");
+     // Emit segment override opcode prefix as needed (not for %ds).
+     if (MI.getOperand(2).getReg() != X86::DS)
+       EmitSegmentOverridePrefix(CurByte, 2, MI, OS);
+     // Emit AdSize prefix as needed.
+     if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+         (is32BitMode(STI) && siReg == X86::SI))
+       EmitByte(0x67, CurByte, OS);
+     CurOp += 3; // Consume operands.
+     EmitByte(BaseOpcode, CurByte, OS);
+     break;
+   }
+   case X86II::RawFrmSrc: {
+     unsigned siReg = MI.getOperand(0).getReg();
+     // Emit segment override opcode prefix as needed (not for %ds).
+     if (MI.getOperand(1).getReg() != X86::DS)
+       EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+     // Emit AdSize prefix as needed.
+     if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+         (is32BitMode(STI) && siReg == X86::SI))
+       EmitByte(0x67, CurByte, OS);
+     CurOp += 2; // Consume operands.
+     EmitByte(BaseOpcode, CurByte, OS);
+     break;
+   }
+   case X86II::RawFrmDst: {
+     // Operand 0 is the destination index register (DI/EDI/RDI).
+     unsigned diReg = MI.getOperand(0).getReg();
+     // Emit AdSize prefix as needed.
+     if ((!is32BitMode(STI) && diReg == X86::EDI) ||
+         (is32BitMode(STI) && diReg == X86::DI))
+       EmitByte(0x67, CurByte, OS);
+     ++CurOp; // Consume operand.
+     EmitByte(BaseOpcode, CurByte, OS);
+     break;
+   }
+   case X86II::RawFrm:
+     EmitByte(BaseOpcode, CurByte, OS);
+     break;
+   case X86II::RawFrmMemOffs:
+     // Emit segment override opcode prefix as needed.
+     EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+     EmitByte(BaseOpcode, CurByte, OS);
+     EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                   X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                   CurByte, OS, Fixups);
+     ++CurOp; // skip segment operand
+     break;
+   case X86II::RawFrmImm8:
+     EmitByte(BaseOpcode, CurByte, OS);
+     EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                   X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                   CurByte, OS, Fixups);
+     EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+                   OS, Fixups);
+     break;
+   case X86II::RawFrmImm16:
+     EmitByte(BaseOpcode, CurByte, OS);
+     EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                   X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                   CurByte, OS, Fixups);
+     EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+                   OS, Fixups);
+     break;
+
+   case X86II::AddRegFrm:
+     EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+     break;
+
+   case X86II::MRMDestReg: {
+     EmitByte(BaseOpcode, CurByte, OS);
+     unsigned SrcRegNum = CurOp + 1;
+
+     if (HasEVEX_K) // Skip writemask
+       ++SrcRegNum;
+
+     if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+       ++SrcRegNum;
+
+     EmitRegModRMByte(MI.getOperand(CurOp),
+                      GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+     CurOp = SrcRegNum + 1;
+     break;
+   }
+   case X86II::MRMDestMem: {
+     EmitByte(BaseOpcode, CurByte, OS);
+     unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
+
+     if (HasEVEX_K) // Skip writemask
+       ++SrcRegNum;
+
+     if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+       ++SrcRegNum;
+
+     emitMemModRMByte(MI, CurOp, GetX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
+                      Rex, CurByte, OS, Fixups, STI);
+     CurOp = SrcRegNum + 1;
+     break;
+   }
+   case X86II::MRMSrcReg: {
+     EmitByte(BaseOpcode, CurByte, OS);
+     unsigned SrcRegNum = CurOp + 1;
+
+     if (HasEVEX_K) // Skip writemask
+       ++SrcRegNum;
+
+     if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+       ++SrcRegNum;
+
+     EmitRegModRMByte(MI.getOperand(SrcRegNum),
+                      GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+     CurOp = SrcRegNum + 1;
+     if (HasVEX_I8Reg)
+       I8RegNum = getX86RegEncoding(MI, CurOp++);
+     // do not count the rounding control operand
+     if (HasEVEX_RC)
+       --NumOps;
+     break;
+   }
+   case X86II::MRMSrcReg4VOp3: {
+     EmitByte(BaseOpcode, CurByte, OS);
+     unsigned SrcRegNum = CurOp + 1;
+
+     EmitRegModRMByte(MI.getOperand(SrcRegNum),
+                      GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+     CurOp = SrcRegNum + 1;
+     ++CurOp; // Encoded in VEX.VVVV
+     break;
+   }
+   case X86II::MRMSrcRegOp4: {
+     EmitByte(BaseOpcode, CurByte, OS);
+     unsigned SrcRegNum = CurOp + 1;
+
+     // Skip 1st src (which is encoded in VEX_VVVV)
+     ++SrcRegNum;
+
+     // Capture 2nd src (which is encoded in Imm[7:4])
+     assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
+     I8RegNum = getX86RegEncoding(MI, SrcRegNum++);
+
+     EmitRegModRMByte(MI.getOperand(SrcRegNum),
+                      GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+     CurOp = SrcRegNum + 1;
+     break;
+   }
+   case X86II::MRMSrcMem: {
+     unsigned FirstMemOp = CurOp+1;
+
+     if (HasEVEX_K) // Skip writemask
+       ++FirstMemOp;
+
+     if (HasVEX_4V)
+       ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
+
+     EmitByte(BaseOpcode, CurByte, OS);
+
+     emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+                      TSFlags, Rex, CurByte, OS, Fixups, STI);
+     CurOp = FirstMemOp + X86::AddrNumOperands;
+     if (HasVEX_I8Reg)
+       I8RegNum = getX86RegEncoding(MI, CurOp++);
+     break;
+   }
+   case X86II::MRMSrcMem4VOp3: {
+     unsigned FirstMemOp = CurOp+1;
+
+     EmitByte(BaseOpcode, CurByte, OS);
+
+     emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+                      TSFlags, Rex, CurByte, OS, Fixups, STI);
+     CurOp = FirstMemOp + X86::AddrNumOperands;
+     ++CurOp; // Encoded in VEX.VVVV.
+     break;
+   }
+   case X86II::MRMSrcMemOp4: {
+     unsigned FirstMemOp = CurOp+1;
+
+     ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
+
+     // Capture second register source (encoded in Imm[7:4])
+     assert(HasVEX_I8Reg && "MRMSrcMemOp4 should imply VEX_I8Reg");
+     I8RegNum = getX86RegEncoding(MI, FirstMemOp++);
+
+     EmitByte(BaseOpcode, CurByte, OS);
+
+     emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+                      TSFlags, Rex, CurByte, OS, Fixups, STI);
+     CurOp = FirstMemOp + X86::AddrNumOperands;
+     break;
+   }
+
+   case X86II::MRMXr:
+   case X86II::MRM0r: case X86II::MRM1r:
+   case X86II::MRM2r: case X86II::MRM3r:
+   case X86II::MRM4r: case X86II::MRM5r:
+   case X86II::MRM6r: case X86II::MRM7r: {
+     if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+       ++CurOp;
+     if (HasEVEX_K) // Skip writemask
+       ++CurOp;
+     EmitByte(BaseOpcode, CurByte, OS);
+     // The ModRM reg field holds the /digit taken from the form (0 for MRMXr).
+     EmitRegModRMByte(MI.getOperand(CurOp++),
+                      (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
+                      CurByte, OS);
+     break;
+   }
+
+   case X86II::MRMXm:
+   case X86II::MRM0m: case X86II::MRM1m:
+   case X86II::MRM2m: case X86II::MRM3m:
+   case X86II::MRM4m: case X86II::MRM5m:
+   case X86II::MRM6m: case X86II::MRM7m: {
+     if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+       ++CurOp;
+     if (HasEVEX_K) // Skip writemask
+       ++CurOp;
+     EmitByte(BaseOpcode, CurByte, OS);
+     // The ModRM reg field holds the /digit taken from the form (0 for MRMXm).
+     emitMemModRMByte(MI, CurOp,
+                      (Form == X86II::MRMXm) ? 0 : Form - X86II::MRM0m, TSFlags,
+                      Rex, CurByte, OS, Fixups, STI);
+     CurOp += X86::AddrNumOperands;
+     break;
+   }
+   case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+   case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
+   case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
+   case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+   case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
+   case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
+   case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
+   case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
+   case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+   case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+   case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
+   case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
+   case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
+   case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
+   case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+   case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
+   case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
+   case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
+   case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
+   case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
+   case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+   case X86II::MRM_FF:
+     // Fixed second byte: 0xC0 plus the form's offset from MRM_C0.
+     EmitByte(BaseOpcode, CurByte, OS);
+     EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
+     break;
+   }
+
+   if (HasVEX_I8Reg) {
+     // The last source register of a 4 operand instruction in AVX is encoded
+     // in bits[7:4] of a immediate byte.
+     assert(I8RegNum < 16 && "Register encoding out of range");
+     I8RegNum <<= 4;
+     if (CurOp != NumOps) {
+       unsigned Val = MI.getOperand(CurOp++).getImm();
+       assert(Val < 16 && "Immediate operand value out of range");
+       I8RegNum |= Val;
+     }
+     EmitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1,
+                   CurByte, OS, Fixups);
+   } else {
+     // If there is a remaining operand, it must be a trailing immediate. Emit it
+     // according to the right size for the instruction. Some instructions
+     // (SSE4a extrq and insertq) have two trailing immediates.
+     while (CurOp != NumOps && NumOps - CurOp <= 2) {
+       EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                     X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                     CurByte, OS, Fixups);
+     }
+   }
+
+   // 3DNow! instructions carry their real opcode after the operands.
+   if (TSFlags & X86II::Has3DNow0F0FOpcode)
+     EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+
+ #ifndef NDEBUG
+   // FIXME: Verify.
+   if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+     errs() << "Cannot encode all operands of: ";
+     MI.dump();
+     errs() << '\n';
+     abort();
+   }
+ #endif
+ }
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
new file mode 100644
index 000000000000..22cb0fac33cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -0,0 +1,456 @@
+//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86IntelInstPrinter.h"
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define GET_REGINFO_MC_DESC
+#include "X86GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "X86GenSubtargetInfo.inc"
+
+ /// Build the mode feature string for \p TT: exactly one of 64bit-mode,
+ /// 32bit-mode and 16bit-mode is enabled. x86_64 selects 64-bit mode, the
+ /// CODE16 environment selects 16-bit mode, and everything else is 32-bit.
+ std::string X86_MC::ParseX86Triple(const Triple &TT) {
+   if (TT.getArch() == Triple::x86_64)
+     return "+64bit-mode,-32bit-mode,-16bit-mode";
+   if (TT.getEnvironment() == Triple::CODE16)
+     return "-64bit-mode,-32bit-mode,+16bit-mode";
+   return "-64bit-mode,+32bit-mode,-16bit-mode";
+ }
+
+ /// Pick the DWARFFlavour value for \p TT. x86_64 always uses X86_64. Among
+ /// 32-bit targets only Darwin with \p isEH set uses X86_32_DarwinEH; every
+ /// other case (including Cygwin/MinGW, which has no dedicated flavour yet)
+ /// uses X86_32_Generic.
+ unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) {
+   if (TT.getArch() == Triple::x86_64)
+     return DWARFFlavour::X86_64;
+
+   const bool UseDarwinEH = TT.isOSDarwin() && isEH;
+   return UseDarwinEH ? DWARFFlavour::X86_32_DarwinEH
+                      : DWARFFlavour::X86_32_Generic;
+ }
+
+ /// Populate \p MRI with the LLVM-register -> SEH-register mapping (the SEH
+ /// number is the register's encoding value) and the LLVM-register ->
+ /// CodeView-register mapping (hard-coded number ranges below).
+ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
+   // FIXME: TableGen these.
+   for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg)
+     MRI->mapLLVMRegToSEHReg(Reg, MRI->getEncodingValue(Reg));
+
+   // Maps Count consecutive LLVM registers starting at FirstReg onto
+   // consecutive CodeView numbers starting at FirstCVNum.
+   auto MapConsecutive = [MRI](unsigned FirstReg, unsigned FirstCVNum,
+                               unsigned Count) {
+     for (unsigned Off = 0; Off < Count; ++Off)
+       MRI->mapLLVMRegToCVReg(FirstReg + Off, FirstCVNum + Off);
+   };
+
+   // These CodeView registers are numbered sequentially starting at value 1.
+   static const MCPhysReg LowCVRegs[] = {
+       X86::AL,  X86::CL,  X86::DL,  X86::BL,  X86::AH,  X86::CH,
+       X86::DH,  X86::BH,  X86::AX,  X86::CX,  X86::DX,  X86::BX,
+       X86::SP,  X86::BP,  X86::SI,  X86::DI,  X86::EAX, X86::ECX,
+       X86::EDX, X86::EBX, X86::ESP, X86::EBP, X86::ESI, X86::EDI,
+   };
+   unsigned NextLowCVNum = 1;
+   for (MCPhysReg Reg : LowCVRegs)
+     MRI->mapLLVMRegToCVReg(Reg, NextLowCVNum++);
+
+   MRI->mapLLVMRegToCVReg(X86::EFLAGS, 34);
+
+   // The x87 registers start at 128 and are numbered sequentially.
+   MapConsecutive(X86::FP0, 128, 8);
+
+   // The low 8 XMM registers start at 154 and are numbered sequentially.
+   MapConsecutive(X86::XMM0, 154, 8);
+
+   // The high 8 XMM registers start at 252 and are numbered sequentially.
+   MapConsecutive(X86::XMM8, 252, 8);
+
+   // FIXME: XMM16 and above from AVX512 not yet documented.
+
+   // AMD64 registers start at 324 and count up.
+   static const MCPhysReg CVX64Regs[] = {
+       X86::SIL,   X86::DIL,   X86::BPL,   X86::SPL,   X86::RAX,   X86::RBX,
+       X86::RCX,   X86::RDX,   X86::RSI,   X86::RDI,   X86::RBP,   X86::RSP,
+       X86::R8,    X86::R9,    X86::R10,   X86::R11,   X86::R12,   X86::R13,
+       X86::R14,   X86::R15,   X86::R8B,   X86::R9B,   X86::R10B,  X86::R11B,
+       X86::R12B,  X86::R13B,  X86::R14B,  X86::R15B,  X86::R8W,   X86::R9W,
+       X86::R10W,  X86::R11W,  X86::R12W,  X86::R13W,  X86::R14W,  X86::R15W,
+       X86::R8D,   X86::R9D,   X86::R10D,  X86::R11D,  X86::R12D,  X86::R13D,
+       X86::R14D,  X86::R15D,  X86::YMM0,  X86::YMM1,  X86::YMM2,  X86::YMM3,
+       X86::YMM4,  X86::YMM5,  X86::YMM6,  X86::YMM7,  X86::YMM8,  X86::YMM9,
+       X86::YMM10, X86::YMM11, X86::YMM12, X86::YMM13, X86::YMM14, X86::YMM15,
+   };
+   unsigned NextX64CVNum = 324;
+   for (MCPhysReg Reg : CVX64Regs)
+     MRI->mapLLVMRegToCVReg(Reg, NextX64CVNum++);
+ }
+
+ /// Create the X86 MCSubtargetInfo for \p TT. The feature string passed on is
+ /// the mode features from ParseX86Triple() followed by \p FS (comma
+ /// separated) when \p FS is non-empty; an empty \p CPU is replaced by
+ /// "generic".
+ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
+                                                   StringRef CPU, StringRef FS) {
+   std::string Features = X86_MC::ParseX86Triple(TT);
+   if (!FS.empty()) {
+     if (Features.empty())
+       Features = FS;
+     else
+       Features = (Twine(Features) + "," + FS).str();
+   }
+
+   std::string EffectiveCPU = CPU.empty() ? std::string("generic") : CPU.str();
+
+   return createX86MCSubtargetInfoImpl(TT, EffectiveCPU, Features);
+ }
+
+ /// Allocate an MCInstrInfo and fill it in via InitX86MCInstrInfo(). The
+ /// returned object is heap-allocated with `new`.
+ static MCInstrInfo *createX86MCInstrInfo() {
+   auto *InstrInfo = new MCInstrInfo();
+   InitX86MCInstrInfo(InstrInfo);
+   return InstrInfo;
+ }
+
+ /// Allocate and initialise the X86 MCRegisterInfo for \p TT: RIP (x86_64) or
+ /// EIP (otherwise) is passed as both the return-address register and the
+ /// final InitX86MCRegisterInfo() argument, the DWARF flavours come from
+ /// getDwarfRegFlavour(), and the SEH/CodeView mappings are installed.
+ static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) {
+   const bool Is64Bit = TT.getArch() == Triple::x86_64;
+   // RIP should have dwarf #16, EIP should have dwarf #8.
+   const unsigned ReturnAddrReg = Is64Bit ? X86::RIP : X86::EIP;
+
+   auto *RegInfo = new MCRegisterInfo();
+   const unsigned DwarfFlavour = X86_MC::getDwarfRegFlavour(TT, false);
+   const unsigned EHFlavour = X86_MC::getDwarfRegFlavour(TT, true);
+   InitX86MCRegisterInfo(RegInfo, ReturnAddrReg, DwarfFlavour, EHFlavour,
+                         ReturnAddrReg);
+   X86_MC::initLLVMToSEHAndCVRegMapping(RegInfo);
+   return RegInfo;
+ }
+
+ /// Create the MCAsmInfo matching the object format / environment of
+ /// \p TheTriple and seed it with the initial CFI frame state: the CFA is the
+ /// stack pointer plus one return-address slot, and the instruction pointer is
+ /// saved one slot below the CFA.
+ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
+                                      const Triple &TheTriple) {
+   const bool Is64 = TheTriple.getArch() == Triple::x86_64;
+
+   // Select the concrete MCAsmInfo subclass. The checks are ordered: Mach-O,
+   // then ELF, then the Windows flavours, with ELF as the fallback.
+   auto MakeAsmInfo = [&]() -> MCAsmInfo * {
+     if (TheTriple.isOSBinFormatMachO()) {
+       if (Is64)
+         return new X86_64MCAsmInfoDarwin(TheTriple);
+       return new X86MCAsmInfoDarwin(TheTriple);
+     }
+     // Force the use of an ELF container.
+     if (TheTriple.isOSBinFormatELF())
+       return new X86ELFMCAsmInfo(TheTriple);
+     if (TheTriple.isWindowsMSVCEnvironment() ||
+         TheTriple.isWindowsCoreCLREnvironment())
+       return new X86MCAsmInfoMicrosoft(TheTriple);
+     if (TheTriple.isOSCygMing() || TheTriple.isWindowsItaniumEnvironment())
+       return new X86MCAsmInfoGNUCOFF(TheTriple);
+     // The default is ELF.
+     return new X86ELFMCAsmInfo(TheTriple);
+   };
+   MCAsmInfo *AsmInfo = MakeAsmInfo();
+
+   // Number of bytes the return address occupies on the stack.
+   const int ReturnSlotSize = Is64 ? 8 : 4;
+
+   // Initial CFA is the stack pointer plus the return-address slot.
+   const unsigned StackPtrReg = Is64 ? X86::RSP : X86::ESP;
+   AsmInfo->addInitialFrameState(MCCFIInstruction::createDefCfa(
+       nullptr, MRI.getDwarfRegNum(StackPtrReg, true), ReturnSlotSize));
+
+   // The return address (instruction pointer) lives one slot below the CFA.
+   const unsigned InstPtrReg = Is64 ? X86::RIP : X86::EIP;
+   AsmInfo->addInitialFrameState(MCCFIInstruction::createOffset(
+       nullptr, MRI.getDwarfRegNum(InstPtrReg, true), -ReturnSlotSize));
+
+   return AsmInfo;
+ }
+
+ /// Resolve the placeholder code models in \p CM: Default becomes Small, and
+ /// JITDefault becomes Large on x86_64 or Small otherwise. Any other value is
+ /// left untouched. \p RM is not consulted.
+ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                               CodeModel::Model &CM) {
+   const bool Is64 = TT.getArch() == Triple::x86_64;
+
+   switch (CM) {
+   case CodeModel::Default:
+     // For static codegen, if we're not already set, use Small codegen.
+     CM = CodeModel::Small;
+     break;
+   case CodeModel::JITDefault:
+     // 64-bit JIT places everything in the same buffer except external funcs.
+     CM = Is64 ? CodeModel::Large : CodeModel::Small;
+     break;
+   default:
+     break;
+   }
+ }
+
+ /// Create the instruction printer for \p SyntaxVariant: 0 selects the AT&T
+ /// printer, 1 the Intel printer. Any other variant yields nullptr.
+ static MCInstPrinter *createX86MCInstPrinter(const Triple &T,
+                                              unsigned SyntaxVariant,
+                                              const MCAsmInfo &MAI,
+                                              const MCInstrInfo &MII,
+                                              const MCRegisterInfo &MRI) {
+   switch (SyntaxVariant) {
+   case 0:
+     return new X86ATTInstPrinter(MAI, MII, MRI);
+   case 1:
+     return new X86IntelInstPrinter(MAI, MII, MRI);
+   default:
+     return nullptr;
+   }
+ }
+
+ /// X86 has no custom relocation info; hand back the stock implementation
+ /// produced by llvm::createMCRelocationInfo().
+ static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple,
+                                                    MCContext &Ctx) {
+   MCRelocationInfo *StockInfo = llvm::createMCRelocationInfo(TheTriple, Ctx);
+   return StockInfo;
+ }
+
+ /// Create a plain MCInstrAnalysis over \p Info (heap-allocated with `new`).
+ static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
+   auto *Analysis = new MCInstrAnalysis(Info);
+   return Analysis;
+ }
+
+ // Force static initialization.
+ //
+ // Registers every X86 MC-layer factory with the TargetRegistry. Everything
+ // except the asm backend is shared between the 32-bit and 64-bit targets.
+ extern "C" void LLVMInitializeX86TargetMC() {
+   // Registrations common to both X86 targets.
+   auto RegisterShared = [](Target &T) {
+     // MC asm info.
+     RegisterMCAsmInfoFn AsmInfoReg(T, createX86MCAsmInfo);
+
+     // MC codegen info.
+     RegisterMCAdjustCodeGenOptsFn CodeGenOptsReg(T, adjustCodeGenOpts);
+
+     // MC instruction info, register info and subtarget info.
+     TargetRegistry::RegisterMCInstrInfo(T, createX86MCInstrInfo);
+     TargetRegistry::RegisterMCRegInfo(T, createX86MCRegisterInfo);
+     TargetRegistry::RegisterMCSubtargetInfo(T,
+                                             X86_MC::createX86MCSubtargetInfo);
+
+     // MC instruction analyzer.
+     TargetRegistry::RegisterMCInstrAnalysis(T, createX86MCInstrAnalysis);
+
+     // Code emitter.
+     TargetRegistry::RegisterMCCodeEmitter(T, createX86MCCodeEmitter);
+
+     // Object streamer.
+     TargetRegistry::RegisterCOFFStreamer(T, createX86WinCOFFStreamer);
+
+     // MCInstPrinter.
+     TargetRegistry::RegisterMCInstPrinter(T, createX86MCInstPrinter);
+
+     // MC relocation info.
+     TargetRegistry::RegisterMCRelocationInfo(T, createX86MCRelocationInfo);
+   };
+
+   RegisterShared(getTheX86_32Target());
+   RegisterShared(getTheX86_64Target());
+
+   // The asm backend differs per target.
+   TargetRegistry::RegisterMCAsmBackend(getTheX86_32Target(),
+                                        createX86_32AsmBackend);
+   TargetRegistry::RegisterMCAsmBackend(getTheX86_64Target(),
+                                        createX86_64AsmBackend);
+ }
+
+ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, // Map Reg to the aliasing register that is Size bits wide; returns 0 when the table has no entry.
+ bool High) { // High is only consulted for Size == 8, where it selects the AH/BH/CH/DH table.
+ switch (Size) {
+ default: return 0; // Only sizes 8, 16, 32 and 64 are handled.
+ case 8:
+ if (High) {
+ switch (Reg) {
+ default: return getX86SubSuperRegisterOrZero(Reg, 64); // NOTE(review): unlisted registers fall back to the 64-bit lookup rather than 0 - confirm this is intended.
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: // SI/DI/BP/SP families map to their 16-bit register here, not to an 8-bit one.
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AH;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DH;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CH;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BH;
+ }
+ } else { // Low-byte table: AL..BL, SIL..SPL and R8B..R15B.
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AL;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DL;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CL;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BL;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SIL;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DIL;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BPL;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SPL;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8B;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9B;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10B;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11B;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12B;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13B;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14B;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15B;
+ }
+ }
+ case 16: // Every inner switch above returns on all paths, so case 8 never falls through to here.
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8W;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9W;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10W;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11W;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12W;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13W;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14W;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15W;
+ }
+ case 32:
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::EAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::EDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::ECX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::EBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::ESI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::EDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::EBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::ESP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8D;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9D;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10D;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11D;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12D;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13D;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14D;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15D;
+ }
+ case 64:
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::RAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::RDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::RCX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::RBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::RSI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::RDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::RBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::RSP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15;
+ }
+ }
+ }
+
+ /// Same lookup as getX86SubSuperRegisterOrZero(), but asserts that a register
+ /// was actually found instead of letting 0 reach the caller unchecked.
+ unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) {
+   const unsigned Mapped = getX86SubSuperRegisterOrZero(Reg, Size, High);
+   assert(Mapped != 0 && "Unexpected register or VT");
+   return Mapped;
+ }
+
+
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
new file mode 100644
index 000000000000..f73e734b9b0e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -0,0 +1,127 @@
+//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCRelocationInfo;
+class MCTargetOptions;
+class Target;
+class Triple;
+class StringRef;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheX86_32Target();
+Target &getTheX86_64Target();
+
+/// Flavour of DWARF register numbering. One enumerator each for x86-64,
+/// 32-bit Darwin EH, and 32-bit generic (per the enumerator names below).
+namespace DWARFFlavour {
+ enum {
+ X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+ };
+}
+
+/// Native X86 register numbers: the eight 32-bit general-purpose registers,
+/// numbered 0-7 in the order EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI.
+namespace N86 {
+ enum {
+ EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+ };
+}
+
+namespace X86_MC { // X86 MC-layer helpers; only declared in this header.
+std::string ParseX86Triple(const Triple &TT);
+// NOTE(review): presumably returns one of the DWARFFlavour values -- confirm.
+unsigned getDwarfRegFlavour(const Triple &TT, bool isEH);
+
+void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI);
+
+/// Create an X86 MCSubtargetInfo instance. This is exposed so the Asm parser,
+/// etc. do not need to go through TargetRegistry.
+MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU,
+ StringRef FS);
+} // end namespace X86_MC
+/// Factory for the X86 machine code emitter (only declared in this header).
+MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+/// Assembler backend factories, one declared per target (X86_32 and X86_64).
+MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
+
+/// Construct an X86 Windows COFF machine code streamer which will generate
+/// PE/COFF format object files.
+///
+/// Takes ownership of \p AB and \p CE.
+MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+ raw_pwrite_stream &OS, MCCodeEmitter *CE,
+ bool RelaxAll, bool IncrementalLinkerCompatible);
+
+/// Construct an X86 Mach-O object writer.
+MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
+
+/// Construct an X86 ELF object writer.
+MCObjectWriter *createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64,
+ uint8_t OSABI, uint16_t EMachine);
+/// Construct an X86 Win COFF object writer.
+MCObjectWriter *createX86WinCOFFObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit);
+
+/// Returns the sub or super register of a specific X86 register.
+/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. Asserts that the
+/// lookup succeeded, so failure only aborts when assertions are enabled.
+unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false);
+
+/// Returns the sub or super register of a specific X86 register.
+/// Like getX86SubSuperRegister() but returns 0 on error instead of asserting.
+unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned,
+ bool High = false);
+
+} // End llvm namespace
+
+
+// Defines symbolic names for X86 registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
new file mode 100644
index 000000000000..297926ddcfda
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -0,0 +1,610 @@
+//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+/// Mach-O target writer for X86. Converts fixups into Mach-O relocation
+/// entries, choosing the x86-64 or the 32-bit scheme from the object writer.
+class X86MachObjectWriter : public MCMachObjectTargetWriter {
+  // Helpers called from the 32-bit path (RecordX86Relocation).
+  bool recordScatteredRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup, MCValue Target,
+                                 unsigned Log2Size, uint64_t &FixedValue);
+  void recordTLVPRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                            const MCAsmLayout &Layout,
+                            const MCFragment *Fragment, const MCFixup &Fixup,
+                            MCValue Target, uint64_t &FixedValue);
+
+  // One entry point per pointer width.
+  void RecordX86Relocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                           const MCAsmLayout &Layout,
+                           const MCFragment *Fragment, const MCFixup &Fixup,
+                           MCValue Target, uint64_t &FixedValue);
+  void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCFragment *Fragment, const MCFixup &Fixup,
+                              MCValue Target, uint64_t &FixedValue);
+
+public:
+  X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
+
+  /// Dispatch on the writer's bitness; all arguments are forwarded unchanged.
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override {
+    if (!Writer->is64Bit()) {
+      RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                          FixedValue);
+      return;
+    }
+    RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                           FixedValue);
+  }
+};
+} // end anonymous namespace
+
+/// Returns true when \p Kind is one of the four X86 RIP-relative fixup kinds.
+static bool isFixupKindRIPRel(unsigned Kind) {
+  switch (Kind) {
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_riprel_4byte_relax:
+  case X86::reloc_riprel_4byte_relax_rex:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Returns log2 of the byte width of a fixup kind (0, 1, 2 or 3). Any kind not
+/// listed here is llvm_unreachable.
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  // 1-byte fixups.
+  case FK_Data_1:
+  case FK_PCRel_1:
+    return 0;
+  // 2-byte fixups.
+  case FK_Data_2:
+  case FK_PCRel_2:
+    return 1;
+  // 4-byte fixups, generic and X86-specific.
+  case FK_Data_4:
+  case FK_PCRel_4:
+  // FIXME: Remove these!!!
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_relax:
+  case X86::reloc_riprel_4byte_relax_rex:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case X86::reloc_signed_4byte_relax:
+    return 2;
+  // 8-byte fixups.
+  case FK_Data_8:
+    return 3;
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  }
+}
+/// Record the Mach-O x86-64 relocation(s) for one fixup and set FixedValue.
+void X86MachObjectWriter::RecordX86_64Relocation(
+ MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); // unsigned: shifted into r_word1 below
+ unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+ // See <reloc.h>.
+ uint32_t FixupOffset =
+ Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ uint32_t FixupAddress =
+ Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+ int64_t Value = 0;
+ unsigned Index = 0;
+ unsigned IsExtern = 0;
+ unsigned Type = 0;
+ const MCSymbol *RelSymbol = nullptr;
+
+ Value = Target.getConstant();
+
+ if (IsPCRel) {
+ // Compensate for the relocation offset, Darwin x86_64 relocations only have
+ // the addend and appear to have attempted to define it to be the actual
+ // expression addend without the PCrel bias. However, instructions with data
+ // following the relocation are not accommodated for (see comment below
+ // regarding SIGNED{1,2,4}), so it isn't exactly that either.
+ Value += 1LL << Log2Size; // i.e. add the fixup size in bytes (2^Log2Size)
+ }
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+
+ // FIXME: I believe this is broken, I don't think the linker can understand
+ // it. I think it would require a local relocation, but I'm not sure if that
+ // would work either. The official way to get an absolute PCrel relocation
+ // is to use an absolute symbol (which we don't support yet).
+ if (IsPCRel) {
+ IsExtern = 1;
+ Type = MachO::X86_64_RELOC_BRANCH;
+ }
+ } else if (Target.getSymB()) { // A - B + constant
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ if (A->isTemporary())
+ A = &Writer->findAliasedSymbol(*A);
+ const MCSymbol *A_Base = Asm.getAtom(*A);
+
+ const MCSymbol *B = &Target.getSymB()->getSymbol();
+ if (B->isTemporary())
+ B = &Writer->findAliasedSymbol(*B);
+ const MCSymbol *B_Base = Asm.getAtom(*B);
+
+ // Neither symbol can be modified.
+ if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of modified symbol");
+ return;
+ }
+
+ // We don't support PCrel relocations of differences. Darwin 'as' doesn't
+ // implement most of these correctly.
+ if (IsPCRel) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported pc-relative relocation of difference");
+ return;
+ }
+
+ // The support for the situation where one or both of the symbols would
+ // require a local relocation is handled just like if the symbols were
+ // external. This is certainly used in the case of debug sections where the
+ // section has only temporary symbols and thus the symbols don't have base
+ // symbols. This is encoded using the section ordinal and non-extern
+ // relocation entries.
+
+ // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
+ // single SIGNED relocation); reject it for now. Except the case where both
+ // symbols don't have a base, equal but both NULL.
+ if (A_Base == B_Base && A_Base) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation with identical base");
+ return;
+ }
+
+ // A subtraction expression where either symbol is undefined is a
+ // non-relocatable expression.
+ if (A->isUndefined() || B->isUndefined()) {
+ StringRef Name = A->isUndefined() ? A->getName() : B->getName();
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation with subtraction expression, symbol '" +
+ Name + "' can not be undefined in a subtraction expression");
+ return;
+ }
+
+ Value += Writer->getSymbolAddress(*A, Layout) -
+ (!A_Base ? 0 : Writer->getSymbolAddress(*A_Base, Layout));
+ Value -= Writer->getSymbolAddress(*B, Layout) -
+ (!B_Base ? 0 : Writer->getSymbolAddress(*B_Base, Layout));
+
+ if (!A_Base)
+ Index = A->getFragment()->getParent()->getOrdinal() + 1;
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+ // Emit A's UNSIGNED entry here; B's SUBTRACTOR entry is emitted at the end.
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+
+ if (B_Base)
+ RelSymbol = B_Base;
+ else
+ Index = B->getFragment()->getParent()->getOrdinal() + 1;
+ Type = MachO::X86_64_RELOC_SUBTRACTOR; // consumed by the final addRelocation
+ } else {
+ const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+ if (Symbol->isTemporary() && Value) {
+ const MCSection &Sec = Symbol->getSection();
+ if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+ Symbol->setUsedInReloc();
+ }
+ RelSymbol = Asm.getAtom(*Symbol);
+
+ // Relocations inside debug sections always use local relocations when
+ // possible. This seems to be done because the debugger doesn't fully
+ // understand x86_64 relocation entries, and expects to find values that
+ // have already been fixed up.
+ if (Symbol->isInSection()) {
+ const MCSectionMachO &Section =
+ static_cast<const MCSectionMachO &>(*Fragment->getParent());
+ if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+ RelSymbol = nullptr;
+ }
+
+ // x86_64 almost always uses external relocations, except when there is no
+ // symbol to use as a base address (a local symbol with no preceding
+ // non-local symbol).
+ if (RelSymbol) {
+ // Add the local offset, if needed.
+ if (RelSymbol != Symbol)
+ Value += Layout.getSymbolOffset(*Symbol) -
+ Layout.getSymbolOffset(*RelSymbol);
+ } else if (Symbol->isInSection() && !Symbol->isVariable()) {
+ // The index is the section ordinal (1-based).
+ Index = Symbol->getFragment()->getParent()->getOrdinal() + 1;
+ Value += Writer->getSymbolAddress(*Symbol, Layout);
+
+ if (IsPCRel)
+ Value -= FixupAddress + (1 << Log2Size);
+ } else if (Symbol->isVariable()) {
+ const MCExpr *Value = Symbol->getVariableValue(); // NOTE: shadows the outer int64_t Value
+ int64_t Res;
+ bool isAbs = Value->evaluateAsAbsolute(Res, Layout,
+ Writer->getSectionAddressMap());
+ if (isAbs) {
+ FixedValue = Res; // no relocation entry is recorded on this path
+ return;
+ } else {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of variable '" +
+ Symbol->getName() + "'");
+ return;
+ }
+ } else {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation of undefined symbol '" +
+ Symbol->getName() + "'");
+ return;
+ }
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
+ if (IsPCRel) {
+ if (IsRIPRel) {
+ if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+ // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
+ // rewrite the movq to an leaq at link time if the symbol ends up in
+ // the same linkage unit.
+ if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
+ Type = MachO::X86_64_RELOC_GOT_LOAD;
+ else
+ Type = MachO::X86_64_RELOC_GOT;
+ } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+ Type = MachO::X86_64_RELOC_TLV;
+ } else if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported symbol modifier in relocation");
+ return;
+ } else {
+ Type = MachO::X86_64_RELOC_SIGNED;
+
+ // The Darwin x86_64 relocation format has a problem where it cannot
+ // encode an address (L<foo> + <constant>) which is outside the atom
+ // containing L<foo>. Generally, this shouldn't occur but it does
+ // happen when we have a RIPrel instruction with data following the
+ // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel
+ // adjustment Darwin x86_64 uses, the offset is still negative and the
+ // linker has no way to recognize this.
+ //
+ // To work around this, Darwin uses several special relocation types
+ // to indicate the offsets. However, the specification or
+ // implementation of these seems to also be incomplete; they should
+ // adjust the addend as well based on the actual encoded instruction
+ // (the additional bias), but instead appear to just look at the final
+ // offset.
+ switch (-(Target.getConstant() + (1LL << Log2Size))) {
+ case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break;
+ case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break;
+ case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break;
+ } // any other value keeps the plain SIGNED type chosen above
+ }
+ } else {
+ if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "unsupported symbol modifier in branch relocation");
+ return;
+ }
+
+ Type = MachO::X86_64_RELOC_BRANCH;
+ }
+ } else {
+ if (Modifier == MCSymbolRefExpr::VK_GOT) {
+ Type = MachO::X86_64_RELOC_GOT;
+ } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+ // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
+ // case all we do is set the PCrel bit in the relocation entry; this is
+ // used with exception handling, for example. The source is required to
+ // include any necessary offset directly.
+ Type = MachO::X86_64_RELOC_GOT;
+ IsPCRel = 1;
+ } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel");
+ return;
+ } else if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported symbol modifier in relocation");
+ return;
+ } else {
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+ unsigned Kind = Fixup.getKind();
+ if (Kind == X86::reloc_signed_4byte) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "32-bit absolute addressing is not supported in 64-bit mode");
+ return;
+ }
+ }
+ }
+ }
+
+ // x86_64 always writes custom values into the fixups.
+ FixedValue = Value;
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+/// Record a scattered relocation for one fixup; returns false on failure.
+bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Log2Size,
+ uint64_t &FixedValue) {
+ uint64_t OriginalFixedValue = FixedValue; // restored if the offset does not fit (see below)
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+
+ // See <reloc.h>.
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+
+ if (!A->getFragment()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+ return false;
+ }
+
+ uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+ uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+ FixedValue += SecAddr;
+ uint32_t Value2 = 0; // address of B; stays 0 when there is no second symbol
+
+ if (const MCSymbolRefExpr *B = Target.getSymB()) {
+ const MCSymbol *SB = &B->getSymbol();
+
+ if (!SB->getFragment()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "symbol '" + B->getSymbol().getName() +
+ "' can not be undefined in a subtraction expression");
+ return false;
+ }
+
+ // Select the appropriate difference relocation type.
+ //
+ // Note that there is no longer any semantic difference between these two
+ // relocation types from the linker's point of view, this is done solely for
+ // pedantic compatibility with 'as'.
+ Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF
+ : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
+ Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+ FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
+ }
+
+ // Relocations are written out in reverse order, so the PAIR comes first.
+ if (Type == MachO::GENERIC_RELOC_SECTDIFF ||
+ Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) {
+ // If the offset is too large to fit in a scattered relocation,
+ // we're hosed. It's an unfortunate limitation of the MachO format.
+ if (FixupOffset > 0xffffff) {
+ char Buffer[32];
+ format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+ Asm.getContext().reportError(Fixup.getLoc(),
+ Twine("Section too large, can't encode "
+ "r_address (") + Buffer +
+ ") into 24 bits of scattered "
+ "relocation entry.");
+ return false;
+ }
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((0 << 0) | // r_address
+ (MachO::GENERIC_RELOC_PAIR << 24) | // r_type
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value2;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ } else {
+ // If the offset is more than 24-bits, it won't fit in a scattered
+ // relocation offset field, so we fall back to using a non-scattered
+ // relocation. This is a bit risky, as if the offset reaches out of
+ // the block and the linker is doing scattered loading on this
+ // symbol, things can go badly.
+ //
+ // Required for 'as' compatibility.
+ if (FixupOffset > 0xffffff) {
+ FixedValue = OriginalFixedValue; // undo the SecAddr adjustment made above
+ return false;
+ }
+ }
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((FixupOffset << 0) |
+ (Type << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ return true;
+}
+/// Record a GENERIC_RELOC_TLV entry for a TLVP fixup and set FixedValue.
+void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
+ !is64Bit() &&
+ "Should only be called with a 32-bit TLVP relocation!");
+
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+ uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); // fixup offset; becomes r_word0
+ unsigned IsPCRel = 0; // set to 1 below when a second symbol is present
+
+ // We're only going to have a second symbol in pic mode and it'll be a
+ // subtraction from the picbase. For 32-bit pic the addend is the difference
+ // between the picbase and the next address. For 32-bit static the addend is
+ // zero.
+ if (Target.getSymB()) {
+ // If this is a subtraction then we're pcrel.
+ uint32_t FixupAddress =
+ Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+ IsPCRel = 1;
+ FixedValue =
+ FixupAddress -
+ Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) +
+ Target.getConstant();
+ FixedValue += 1ULL << Log2Size; // plus the fixup size in bytes
+ } else {
+ FixedValue = 0;
+ }
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = Value;
+ MRE.r_word1 =
+ (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28);
+ Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(),
+ MRE);
+}
+/// Record the relocation for one fixup on the non-64-bit (32-bit x86) path.
+void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+ // If this is a 32-bit TLVP reloc it's handled a bit differently.
+ if (Target.getSymA() &&
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
+ recordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ return;
+ }
+
+ // If this is a difference or a defined symbol plus an offset, then we need a
+ // scattered relocation entry. Differences always require scattered
+ // relocations.
+ if (Target.getSymB()) {
+ recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+ Target, Log2Size, FixedValue);
+ return; // result ignored: no non-scattered fallback for differences
+ }
+
+ // Get the symbol data, if any.
+ const MCSymbol *A = nullptr;
+ if (Target.getSymA())
+ A = &Target.getSymA()->getSymbol();
+
+ // If this is an internal relocation with an offset, it also needs a scattered
+ // relocation entry.
+ uint32_t Offset = Target.getConstant();
+ if (IsPCRel)
+ Offset += 1 << Log2Size;
+ // Try to record the scattered relocation if needed. Fall back to non
+ // scattered if necessary (see comments in recordScatteredRelocation()
+ // for details).
+ if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) &&
+ recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ Log2Size, FixedValue))
+ return;
+
+ // See <reloc.h>.
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned Index = 0;
+ unsigned Type = 0;
+ const MCSymbol *RelSymbol = nullptr;
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ //
+ // FIXME: Currently, these are never generated (see code below). I cannot
+ // find a case where they are actually emitted.
+ Type = MachO::GENERIC_RELOC_VANILLA;
+ } else {
+ // Resolve constant variables.
+ if (A->isVariable()) {
+ int64_t Res;
+ if (A->getVariableValue()->evaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res; // no relocation entry is recorded on this path
+ return;
+ }
+ }
+
+ // Check whether we need an external or internal relocation.
+ if (Writer->doesSymbolRequireExternRelocation(*A)) {
+ RelSymbol = A;
+ // For external relocations, make sure to offset the fixup value to
+ // compensate for the addend of the symbol address when the symbol is
+ // defined (!isUndefined()). This occurs with weak definitions, for example.
+ if (!A->isUndefined())
+ FixedValue -= Layout.getSymbolOffset(*A);
+ } else {
+ // The index is the section ordinal (1-based).
+ const MCSection &Sec = A->getSection();
+ Index = Sec.getOrdinal() + 1;
+ FixedValue += Writer->getSectionAddress(&Sec);
+ }
+ if (IsPCRel)
+ FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+ Type = MachO::GENERIC_RELOC_VANILLA;
+ }
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+/// Build the X86 Mach-O target writer and hand it to createMachObjectWriter()
+/// together with the output stream, requesting little-endian output.
+MCObjectWriter *llvm::createX86MachObjectWriter(raw_pwrite_stream &OS,
+                                                bool Is64Bit, uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  MCMachObjectTargetWriter *TargetWriter =
+      new X86MachObjectWriter(Is64Bit, CPUType, CPUSubtype);
+  return createMachObjectWriter(TargetWriter, OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
new file mode 100644
index 000000000000..33376b6d1b90
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -0,0 +1,105 @@
+//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace llvm {
+ class MCObjectWriter;
+}
+
+namespace {
+/// COFF target writer for X86. The machine type (AMD64 or I386) is picked by
+/// the constructor from \c Is64Bit.
+class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+  X86WinCOFFObjectWriter(bool Is64Bit);
+  ~X86WinCOFFObjectWriter() override;
+
+  /// Select the COFF relocation type for a fixup.
+  unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+                        bool IsCrossSection,
+                        const MCAsmBackend &MAB) const override;
+};
+} // end anonymous namespace
+
+/// Pass the COFF machine type matching \p Is64Bit to the base target writer.
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
+    : MCWinCOFFObjectTargetWriter(!Is64Bit ? COFF::IMAGE_FILE_MACHINE_I386
+                                           : COFF::IMAGE_FILE_MACHINE_AMD64) {}
+
+// Out-of-line destructor; it has no work of its own to do.
+X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() = default;
+
+/// Map a fixup onto the COFF relocation type for the configured machine.
+///
+/// A cross-section fixup is always classified as FK_PCRel_4, whatever kind the
+/// fixup itself carries. For 32-bit data fixups the symbol modifier picks
+/// between the image-relative, section-relative and plain absolute forms; an
+/// absolute target is treated as having no modifier. \p MAB is unused.
+///
+/// Fixup kinds with no case below, and machine types other than AMD64 and
+/// I386, are llvm_unreachable.
+unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
+                                              const MCFixup &Fixup,
+                                              bool IsCrossSection,
+                                              const MCAsmBackend &MAB) const {
+  unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
+
+  // An absolute target has no symbol to read a modifier from.
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+  if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) {
+    switch (FixupKind) {
+    case FK_PCRel_4:
+    case X86::reloc_riprel_4byte:
+    case X86::reloc_riprel_4byte_movq_load:
+    case X86::reloc_riprel_4byte_relax:
+    case X86::reloc_riprel_4byte_relax_rex:
+      return COFF::IMAGE_REL_AMD64_REL32;
+    case FK_Data_4:
+    case X86::reloc_signed_4byte:
+    case X86::reloc_signed_4byte_relax:
+      if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+        return COFF::IMAGE_REL_AMD64_ADDR32NB;
+      if (Modifier == MCSymbolRefExpr::VK_SECREL)
+        return COFF::IMAGE_REL_AMD64_SECREL;
+      return COFF::IMAGE_REL_AMD64_ADDR32;
+    case FK_Data_8:
+      return COFF::IMAGE_REL_AMD64_ADDR64;
+    case FK_SecRel_2:
+      return COFF::IMAGE_REL_AMD64_SECTION;
+    case FK_SecRel_4:
+      return COFF::IMAGE_REL_AMD64_SECREL;
+    default:
+      llvm_unreachable("unsupported relocation type");
+    }
+  } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) {
+    switch (FixupKind) {
+    case FK_PCRel_4:
+    case X86::reloc_riprel_4byte:
+    case X86::reloc_riprel_4byte_movq_load:
+      return COFF::IMAGE_REL_I386_REL32;
+    case FK_Data_4:
+    case X86::reloc_signed_4byte:
+    case X86::reloc_signed_4byte_relax:
+      if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+        return COFF::IMAGE_REL_I386_DIR32NB;
+      // Use the I386 enumerator in the I386 switch, matching the FK_SecRel_4
+      // case below, rather than borrowing the AMD64 one.
+      if (Modifier == MCSymbolRefExpr::VK_SECREL)
+        return COFF::IMAGE_REL_I386_SECREL;
+      return COFF::IMAGE_REL_I386_DIR32;
+    case FK_SecRel_2:
+      return COFF::IMAGE_REL_I386_SECTION;
+    case FK_SecRel_4:
+      return COFF::IMAGE_REL_I386_SECREL;
+    default:
+      llvm_unreachable("unsupported relocation type");
+    }
+  } else
+    llvm_unreachable("Unsupported COFF machine type.");
+}
+
+/// Build the X86 COFF target writer for the requested bitness and pass it,
+/// with the output stream, to createWinCOFFObjectWriter().
+MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS,
+                                                   bool Is64Bit) {
+  MCWinCOFFObjectTargetWriter *TargetWriter =
+      new X86WinCOFFObjectWriter(Is64Bit);
+  return createWinCOFFObjectWriter(TargetWriter, OS);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
new file mode 100644
index 000000000000..d04511873b46
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -0,0 +1,60 @@
+//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+/// WinCOFF streamer for X86 that emits Windows unwind information through a
+/// Win64EH::UnwindEmitter member.
+class X86WinCOFFStreamer : public MCWinCOFFStreamer {
+  Win64EH::UnwindEmitter EHStreamer;
+
+public:
+  X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE,
+                     raw_pwrite_stream &OS)
+      : MCWinCOFFStreamer(C, AB, *CE, OS) {}
+
+  void EmitWinEHHandlerData() override;
+  void EmitWindowsUnwindTables() override;
+  void FinishImpl() override;
+};
+
+void X86WinCOFFStreamer::EmitWinEHHandlerData() {
+  MCStreamer::EmitWinEHHandlerData();
+
+  // This directive actually switches to the .xdata section, so the unwind
+  // info for the current frame has to be emitted right now.
+  EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+}
+
+void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
+  // Nothing to emit when no frame info was collected.
+  if (getNumWinFrameInfos())
+    EHStreamer.Emit(*this);
+}
+
+void X86WinCOFFStreamer::FinishImpl() {
+  // Frames first, then the unwind tables, then the base-class finish.
+  EmitFrames(nullptr);
+  EmitWindowsUnwindTables();
+  MCWinCOFFStreamer::FinishImpl();
+}
+} // end anonymous namespace
+
+/// Create an X86WinCOFFStreamer and apply the two assembler options
+/// (\p RelaxAll and \p IncrementalLinkerCompatible) before returning it.
+MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+                                           raw_pwrite_stream &OS,
+                                           MCCodeEmitter *CE, bool RelaxAll,
+                                           bool IncrementalLinkerCompatible) {
+  X86WinCOFFStreamer *Streamer = new X86WinCOFFStreamer(C, AB, CE, OS);
+  Streamer->getAssembler().setRelaxAll(RelaxAll);
+  Streamer->getAssembler().setIncrementalLinkerCompatible(
+      IncrementalLinkerCompatible);
+  return Streamer;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 000000000000..d2654fc67ed5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,29 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+/// Accessor for the X86-32 Target object, a function-local static.
+Target &llvm::getTheX86_32Target() {
+  static Target X86_32Instance;
+  return X86_32Instance;
+}
+/// Accessor for the X86-64 Target object, a function-local static.
+Target &llvm::getTheX86_64Target() {
+  static Target X86_64Instance;
+  return X86_64Instance;
+}
+
+/// Register the 32-bit and the 64-bit X86 targets, both with HasJIT = true.
+extern "C" void LLVMInitializeX86TargetInfo() {
+  RegisterTarget<Triple::x86, /*HasJIT=*/true> RegisterX86_32(
+      getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above");
+
+  RegisterTarget<Triple::x86_64, /*HasJIT=*/true> RegisterX86_64(
+      getTheX86_64Target(), "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
new file mode 100644
index 000000000000..1be5aec849fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -0,0 +1,606 @@
+//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/MachineValueType.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ // Defaults to copying the dest value.
+ ShuffleMask.push_back(0);
+ ShuffleMask.push_back(1);
+ ShuffleMask.push_back(2);
+ ShuffleMask.push_back(3);
+
+ // Decode the immediate: ZMask = bits [3:0], CountD = [5:4], CountS = [7:6].
+ unsigned ZMask = Imm & 15;
+ unsigned CountD = (Imm >> 4) & 3;
+ unsigned CountS = (Imm >> 6) & 3;
+
+ // CountS selects which input element (mask indices 4-7) to use.
+ unsigned InVal = 4 + CountS;
+ // CountD specifies which element of destination to update.
+ ShuffleMask[CountD] = InVal;
+ // ZMask zaps values, potentially overriding the CountD elt.
+ if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
+ if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
+ if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
+ if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
+}
+
+void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ assert((Idx + Len) <= NumElts && "Insertion out of range");
+
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i);
+ for (unsigned i = 0; i != Len; ++i)
+ ShuffleMask[Idx + i] = NumElts + i;
+}
+
+// Upper half of source 2, then upper half of source 1: <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = NElts / 2; i != NElts; ++i)
+ ShuffleMask.push_back(NElts + i);
+
+ for (unsigned i = NElts / 2; i != NElts; ++i)
+ ShuffleMask.push_back(i);
+}
+
+// Lower half of source 1, then lower half of source 2: <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i != NElts / 2; ++i)
+ ShuffleMask.push_back(i);
+
+ for (unsigned i = 0; i != NElts / 2; ++i)
+ ShuffleMask.push_back(NElts + i);
+}
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ for (int i = 0, e = NumElts / 2; i < e; ++i) {
+ ShuffleMask.push_back(2 * i);
+ ShuffleMask.push_back(2 * i);
+ }
+}
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ for (int i = 0, e = NumElts / 2; i < e; ++i) {
+ ShuffleMask.push_back(2 * i + 1);
+ ShuffleMask.push_back(2 * i + 1);
+ }
+}
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VectorSizeInBits = VT.getSizeInBits();
+ unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VectorSizeInBits / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+ unsigned NumLaneSubElts = 64 / ScalarSizeInBits;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts)
+ for (unsigned s = 0; s != NumLaneSubElts; s++)
+ ShuffleMask.push_back(l + s);
+}
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VectorSizeInBits = VT.getSizeInBits();
+ unsigned NumElts = VectorSizeInBits / 8;
+ unsigned NumLanes = VectorSizeInBits / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; ++i) {
+ int M = SM_SentinelZero;
+ if (i >= Imm) M = i - Imm + l;
+ ShuffleMask.push_back(M);
+ }
+}
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VectorSizeInBits = VT.getSizeInBits();
+ unsigned NumElts = VectorSizeInBits / 8;
+ unsigned NumLanes = VectorSizeInBits / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; ++i) {
+ unsigned Base = i + Imm;
+ int M = Base + l;
+ if (Base >= NumLaneElts) M = SM_SentinelZero;
+ ShuffleMask.push_back(M);
+ }
+}
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ unsigned Base = i + Offset;
+ // if i+offset is out of this lane then we actually need the other source
+ if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+ ShuffleMask.push_back(Base + l);
+ }
+ }
+}
+
+void DecodeVALIGNMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ int NumElts = VT.getVectorNumElements();
+ // Not all bits of the immediate are used so mask it.
+ assert(isPowerOf2_32(NumElts) && "NumElts should be power of 2");
+ Imm = Imm & (NumElts - 1);
+ for (int i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i + Imm);
+}
+
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ unsigned NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + l);
+ NewImm /= NumLaneElts;
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
+ }
+}
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + 4 + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ }
+}
+
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ }
+}
+
+void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumHalfElts = NumElts / 2;
+
+ for (unsigned l = 0; l != NumHalfElts; ++l)
+ ShuffleMask.push_back(l + NumHalfElts);
+ for (unsigned h = 0; h != NumHalfElts; ++h)
+ ShuffleMask.push_back(h);
+}
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ unsigned NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ // each half of a lane comes from different source
+ for (unsigned s = 0; s != NumElts * 2; s += NumElts) {
+ for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+ NewImm /= NumLaneElts;
+ }
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
+ }
+}
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l + NumLaneElts / 2, e = l + NumLaneElts; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i + NumElts); // Reads from src/src2
+ }
+ }
+}
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l, e = l + NumLaneElts / 2; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i + NumElts); // Reads from src/src2
+ }
+ }
+}
+
+/// Decodes a broadcast of the first element of a vector.
+void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = DstVT.getVectorNumElements();
+ ShuffleMask.append(NumElts, 0);
+}
+
+/// Decodes a broadcast of a subvector to a larger vector type.
+void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+ SmallVectorImpl<int> &ShuffleMask) {
+ assert(SrcVT.getScalarType() == DstVT.getScalarType() &&
+ "Non matching vector element types");
+ unsigned NumElts = SrcVT.getVectorNumElements();
+ unsigned Scale = DstVT.getSizeInBits() / SrcVT.getSizeInBits();
+
+ for (unsigned i = 0; i != Scale; ++i)
+ for (unsigned j = 0; j != NumElts; ++j)
+ ShuffleMask.push_back(j);
+}
+
+/// \brief Decode the immediate mask of a 128-bit granularity packed shuffle
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
+/// into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits();
+ unsigned ControlBitsMask = NumLanes - 1;
+ unsigned NumControlBits = NumLanes / 2;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+ // We actually need the other source.
+ if (l >= NumLanes / 2)
+ LaneMask += NumLanes;
+ for (unsigned i = 0; i != NumElementsInLane; ++i)
+ ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+ }
+}
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned HalfSize = VT.getVectorNumElements() / 2;
+
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned HalfMask = Imm >> (l * 4);
+ unsigned HalfBegin = (HalfMask & 0x3) * HalfSize;
+ for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i)
+ ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i);
+ }
+}
+
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ if (M == (uint64_t)SM_SentinelUndef) {
+ ShuffleMask.push_back(M);
+ continue;
+ }
+ // For 256/512-bit vectors the base of the shuffle is the 128-bit
+ // subvector we're inside.
+ int Base = (i / 16) * 16;
+ // If the high bit (7) of the byte is set, the element is zeroed.
+ if (M & (1 << 7))
+ ShuffleMask.push_back(SM_SentinelZero);
+ else {
+ // Only the least significant 4 bits of the byte are used.
+ int Index = Base + (M & 0xf);
+ ShuffleMask.push_back(Index);
+ }
+ }
+}
+
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ int ElementBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
+ for (int i = 0; i < NumElements; ++i) {
+ // If there are more than 8 elements in the vector, then any immediate blend
+ // mask applies to each 128-bit lane. There can never be more than
+ // 8 elements in a 128-bit lane with an immediate blend.
+ int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
+ assert(Bit < 8 &&
+ "Immediate blends only operate over 8 elements at a time!");
+ ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
+ }
+}
+
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ assert(RawMask.size() == 16 && "Illegal VPPERM shuffle mask size");
+
+ // VPPERM Operation
+ // Bits[4:0] - Byte Index (0 - 31)
+ // Bits[7:5] - Permute Operation
+ //
+ // Permute Operation:
+ // 0 - Source byte (no logical operation).
+ // 1 - Invert source byte.
+ // 2 - Bit reverse of source byte.
+ // 3 - Bit reverse of inverted source byte.
+ // 4 - 00h (zero - fill).
+ // 5 - FFh (ones - fill).
+ // 6 - Most significant bit of source byte replicated in all bit positions.
+ // 7 - Invert most significant bit of source byte and replicate in all bit positions.
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ if (M == (uint64_t)SM_SentinelUndef) {
+ ShuffleMask.push_back(M);
+ continue;
+ }
+
+ uint64_t PermuteOp = (M >> 5) & 0x7;
+ if (PermuteOp == 4) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+ if (PermuteOp != 0) {
+ ShuffleMask.clear();
+ return;
+ }
+
+ uint64_t Index = M & 0x1F;
+ ShuffleMask.push_back((int)Index);
+ }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ assert((VT.is256BitVector() || VT.is512BitVector()) &&
+ (VT.getScalarSizeInBits() == 64) && "Unexpected vector value type");
+ unsigned NumElts = VT.getVectorNumElements();
+ for (unsigned l = 0; l != NumElts; l += 4)
+ for (unsigned i = 0; i != 4; ++i)
+ ShuffleMask.push_back(l + ((Imm >> (2 * i)) & 3));
+}
+
+void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
+ unsigned NumDstElts = DstVT.getVectorNumElements();
+ unsigned SrcScalarBits = SrcScalarVT.getSizeInBits();
+ unsigned DstScalarBits = DstVT.getScalarSizeInBits();
+ unsigned Scale = DstScalarBits / SrcScalarBits;
+ assert(SrcScalarBits < DstScalarBits &&
+ "Expected zero extension mask to increase scalar size");
+
+ for (unsigned i = 0; i != NumDstElts; i++) {
+ Mask.push_back(i);
+ for (unsigned j = 1; j != Scale; j++)
+ Mask.push_back(SM_SentinelZero);
+ }
+}
+
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ ShuffleMask.push_back(0);
+ for (unsigned i = 1; i < NumElts; i++)
+ ShuffleMask.push_back(SM_SentinelZero);
+}
+
+void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
+ // First element comes from the first element of second source.
+ // Remaining elements: Load zero extends / Move copies from first source.
+ unsigned NumElts = VT.getVectorNumElements();
+ Mask.push_back(NumElts);
+ for (unsigned i = 1; i < NumElts; i++)
+ Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
+}
+
+void DecodeEXTRQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // Only the bottom 6 bits are valid for each immediate.
+ Len &= 0x3F;
+ Idx &= 0x3F;
+
+ // We can only decode this bit extraction instruction as a shuffle if both the
+ // length and index work with whole bytes.
+ if (0 != (Len % 8) || 0 != (Idx % 8))
+ return;
+
+ // A length of zero is equivalent to a bit length of 64.
+ if (Len == 0)
+ Len = 64;
+
+ // If the length + index exceeds the bottom 64 bits the result is undefined.
+ if ((Len + Idx) > 64) {
+ ShuffleMask.append(16, SM_SentinelUndef);
+ return;
+ }
+
+ // Convert length and index to work with bytes.
+ Len /= 8;
+ Idx /= 8;
+
+ // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes
+ // of the lower 64-bits. The upper 64-bits are undefined.
+ for (int i = 0; i != Len; ++i)
+ ShuffleMask.push_back(i + Idx);
+ for (int i = Len; i != 8; ++i)
+ ShuffleMask.push_back(SM_SentinelZero);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(SM_SentinelUndef);
+}
+
+void DecodeINSERTQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // Only the bottom 6 bits are valid for each immediate.
+ Len &= 0x3F;
+ Idx &= 0x3F;
+
+ // We can only decode this bit insertion instruction as a shuffle if both the
+ // length and index work with whole bytes.
+ if (0 != (Len % 8) || 0 != (Idx % 8))
+ return;
+
+ // A length of zero is equivalent to a bit length of 64.
+ if (Len == 0)
+ Len = 64;
+
+ // If the length + index exceeds the bottom 64 bits the result is undefined.
+ if ((Len + Idx) > 64) {
+ ShuffleMask.append(16, SM_SentinelUndef);
+ return;
+ }
+
+ // Convert length and index to work with bytes.
+ Len /= 8;
+ Idx /= 8;
+
+ // INSERTQ: Extract lowest Len bytes from lower half of second source and
+ // insert over first source starting at Idx byte. The upper 64-bits are
+ // undefined.
+ for (int i = 0; i != Idx; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 0; i != Len; ++i)
+ ShuffleMask.push_back(i + 16);
+ for (int i = Idx + Len; i != 8; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(SM_SentinelUndef);
+}
+
+void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VecSize = VT.getSizeInBits();
+ unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned NumLanes = VecSize / 128;
+ unsigned NumEltsPerLane = VT.getVectorNumElements() / NumLanes;
+ assert((VecSize == 128 || VecSize == 256 || VecSize == 512) &&
+ "Unexpected vector size");
+ assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+
+ for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ M = (EltSize == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
+ unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
+ ShuffleMask.push_back((int)(LaneOffset + M));
+ }
+}
+
+void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VecSize = VT.getSizeInBits();
+ unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned NumLanes = VecSize / 128;
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumEltsPerLane = NumElts / NumLanes;
+ assert((VecSize == 128 || VecSize == 256) && "Unexpected vector size");
+ assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+ assert((NumElts == RawMask.size()) && "Unexpected mask size");
+
+ for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+ // VPERMIL2 Operation.
+ // Bits[3] - Match Bit.
+ // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+ // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+ uint64_t Selector = RawMask[i];
+ unsigned MatchBit = (Selector >> 3) & 0x1;
+
+ // M2Z[1:0] MatchBit
+ // 0Xb X Source selected by Selector index.
+ // 10b 0 Source selected by Selector index.
+ // 10b 1 Zero.
+ // 11b 0 Zero.
+ // 11b 1 Source selected by Selector index.
+ if ((M2Z & 0x2) != 0 && MatchBit != (M2Z & 0x1)) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+
+ int Index = i & ~(NumEltsPerLane - 1);
+ if (EltSize == 64)
+ Index += (Selector >> 1) & 0x1;
+ else
+ Index += Selector & 0x3;
+
+ int Src = (Selector >> 2) & 0x1;
+ Index += Src * NumElts;
+ ShuffleMask.push_back(Index);
+ }
+}
+
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ uint64_t EltMaskSize = RawMask.size() - 1;
+ for (auto M : RawMask) {
+ M &= EltMaskSize;
+ ShuffleMask.push_back((int)M);
+ }
+}
+
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ uint64_t EltMaskSize = (RawMask.size() * 2) - 1;
+ for (auto M : RawMask) {
+ M &= EltMaskSize;
+ ShuffleMask.push_back((int)M);
+ }
+}
+
+} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
new file mode 100644
index 000000000000..17619d09d059
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -0,0 +1,162 @@
+//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+template <typename T> class ArrayRef;
+class MVT;
+
+enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
+
+/// Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+// Insert the bottom Len elements from a second source into a vector starting at
+// element Idx.
+void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
+/// i.e. <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
+/// i.e. <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeVALIGNMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for pshufhw.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for pshuflw.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFLWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a PSWAPD 3DNow! instruction.
+void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for shufp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a broadcast of the first element of a vector.
+void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a broadcast of a subvector to a larger vector type.
+void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a PSHUFB mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a BLEND immediate mask into a shuffle mask.
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode the immediate mask of a 128-bit granularity packed shuffle
+/// into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for VPERMQ/VPERMPD.
+void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPPERM mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+/// This can only decode basic masks (permutes + zeros), not any of the other
+/// operations that VPPERM can perform.
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a zero extension instruction as a shuffle mask.
+void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a move lower and zero upper instruction as a shuffle mask.
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a scalar float move instruction as a shuffle mask.
+void DecodeScalarMoveMask(MVT VT, bool IsLoad,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask.
+void DecodeEXTRQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
+void DecodeINSERTQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
+void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
+void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
new file mode 100644
index 000000000000..2cb80a482d06
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -0,0 +1,99 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86_H
+#define LLVM_LIB_TARGET_X86_X86_H
+
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class FunctionPass;
+class ImmutablePass;
+class PassRegistry;
+class X86TargetMachine;
+
+/// This pass converts a legalized DAG into a X86-specific DAG, ready for
+/// instruction scheduling.
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+/// This pass initializes a global base register for PIC on x86-32.
+FunctionPass *createX86GlobalBaseRegPass();
+
+/// This pass combines multiple accesses to local-dynamic TLS variables so that
+/// the TLS base address for the module is only fetched once per execution path
+/// through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
+/// This function returns a pass which converts floating-point register
+/// references and pseudo instructions into floating-point stack references and
+/// physical instructions.
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// This pass inserts AVX vzeroupper instructions before each call to avoid
+/// transition penalty between functions encoded with AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
+/// Return a pass that pads short functions with NOOPs.
+/// This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+
+/// Return a pass that selectively replaces certain instructions (like add,
+/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA
+/// instructions, in order to eliminate execution delays in some processors.
+FunctionPass *createX86FixupLEAs();
+
+/// Return a pass that removes redundant LEA instructions and redundant address
+/// recalculations.
+FunctionPass *createX86OptimizeLEAs();
+
+/// Return a pass that transforms setcc + movzx pairs into xor + setcc.
+FunctionPass *createX86FixupSetCC();
+
+/// Return a pass that expands WinAlloca pseudo-instructions.
+FunctionPass *createX86WinAllocaExpander();
+
+/// Return a pass that optimizes the code-size of x86 call sequences. This is
+/// done by replacing esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
+/// Return an IR pass that inserts EH registration stack objects and explicit
+/// EH state updates. This pass must run after EH preparation, which does
+/// Windows-specific but architecture-neutral preparation.
+FunctionPass *createX86WinEHStatePass();
+
+/// Return a Machine IR pass that expands X86-specific pseudo
+/// instructions into a sequence of actual instructions. This pass
+/// must run after prologue/epilogue insertion and before lowering
+/// the MachineInstr to MC.
+FunctionPass *createX86ExpandPseudoPass();
+
+/// Return a Machine IR pass that selectively replaces
+/// certain byte and word instructions by equivalent 32 bit instructions,
+/// in order to eliminate partial register usage, false dependences on
+/// the upper portions of registers, and to save code size.
+FunctionPass *createX86FixupBWInsts();
+
+void initializeFixupBWInstPassPass(PassRegistry &);
+
+/// This pass replaces EVEX encoded AVX-512 instructions by VEX
+/// encoding when possible in order to reduce code size.
+FunctionPass *createX86EvexToVexInsts();
+
+void initializeEvexToVexInstPassPass(PassRegistry &);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
new file mode 100644
index 000000000000..dc18a59a30ba
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -0,0 +1,856 @@
+//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget state
+//
+
+def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
+ "64-bit mode (x86_64)">;
+def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
+ "32-bit mode (80386)">;
+def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
+ "16-bit mode (i8086)">;
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features
+//===----------------------------------------------------------------------===//
+
+def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
+ "Enable X87 float instructions">;
+
+def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
+ "Enable conditional move instructions">;
+
+def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
+ "Support POPCNT instruction">;
+
+def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true",
+ "Support fxsave/fxrestore instructions">;
+
+def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
+ "Support xsave instructions">;
+
+def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
+ "Support xsaveopt instructions">;
+
+def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
+ "Support xsavec instructions">;
+
+def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
+ "Support xsaves instructions">;
+
+def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+ "Enable SSE instructions",
+ // SSE codegen depends on cmovs, and all
+ // SSE1+ processors support them.
+ [FeatureCMOV]>;
+def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+ "Enable SSE2 instructions",
+ [FeatureSSE1]>;
+def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+ "Enable SSE3 instructions",
+ [FeatureSSE2]>;
+def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+ "Enable SSSE3 instructions",
+ [FeatureSSE3]>;
+def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
+ "Enable SSE 4.1 instructions",
+ [FeatureSSSE3]>;
+def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
+ "Enable SSE 4.2 instructions",
+ [FeatureSSE41]>;
+// The MMX subtarget feature is separate from the rest of the SSE features
+// because it's important (for odd compatibility reasons) to be able to
+// turn it off explicitly while allowing SSE+ to be on.
+def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
+ "Enable MMX instructions">;
+def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+ "Enable 3DNow! instructions",
+ [FeatureMMX]>;
+def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+ "Enable 3DNow! Athlon instructions",
+ [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+ "Support 64-bit instructions",
+ [FeatureCMOV]>;
+def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
+ "64-bit with cmpxchg16b",
+ [Feature64Bit]>;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+ "Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+ "SHLD instruction is slow">;
+def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
+ "PMULLD instruction is slow">;
+// FIXME: This should not apply to CPUs that do not have SSE.
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
+ "IsUAMem16Slow", "true",
+ "Slow unaligned 16-byte memory access">;
+def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+ "IsUAMem32Slow", "true",
+ "Slow unaligned 32-byte memory access">;
+def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+ "Support SSE 4a instructions",
+ [FeatureSSE3]>;
+
+def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
+ "Enable AVX instructions",
+ [FeatureSSE42]>;
+def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
+ "Enable AVX2 instructions",
+ [FeatureAVX]>;
+def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
+ "Enable AVX-512 instructions",
+ [FeatureAVX2]>;
+def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
+ "Enable AVX-512 Exponential and Reciprocal Instructions",
+ [FeatureAVX512]>;
+def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
+ "Enable AVX-512 Conflict Detection Instructions",
+ [FeatureAVX512]>;
+def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
+ "Enable AVX-512 PreFetch Instructions",
+ [FeatureAVX512]>;
+def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPFPREFETCHWT1",
+ "true",
+ "Prefetch with Intent to Write and T1 Hint">;
+def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
+ "Enable AVX-512 Doubleword and Quadword Instructions",
+ [FeatureAVX512]>;
+def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true",
+ "Enable AVX-512 Byte and Word Instructions",
+ [FeatureAVX512]>;
+def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
+ "Enable AVX-512 Vector Length eXtensions",
+ [FeatureAVX512]>;
+def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
+ "Enable AVX-512 Vector Byte Manipulation Instructions",
+ [FeatureBWI]>;
+def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
+ "Enable AVX-512 Integer Fused Multiply-Add",
+ [FeatureAVX512]>;
+def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
+ "Enable protection keys">;
+def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+ "Enable packed carry-less multiplication instructions",
+ [FeatureSSE2]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
+ "Enable three-operand fused multiply-add",
+ [FeatureAVX]>;
+def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
+ "Enable four-operand fused multiply-add",
+ [FeatureAVX, FeatureSSE4A]>;
+def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
+ "Enable XOP instructions",
+ [FeatureFMA4]>;
+def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
+ "HasSSEUnalignedMem", "true",
+ "Allow unaligned memory operands with SSE instructions">;
+def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
+ "Enable AES instructions",
+ [FeatureSSE2]>;
+def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
+ "Enable TBM instructions">;
+def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
+ "Support MOVBE instruction">;
+def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
+ "Support RDRAND instruction">;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+ "Support 16-bit floating point conversion instructions",
+ [FeatureAVX]>;
+def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
+ "Support FS/GS Base instructions">;
+def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+ "Support LZCNT instruction">;
+def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
+ "Support BMI instructions">;
+def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
+ "Support BMI2 instructions">;
+def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
+ "Support RTM instructions">;
+def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true",
+ "Support HLE">;
+def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
+ "Support ADX instructions">;
+def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
+ "Enable SHA instructions",
+ [FeatureSSE2]>;
+def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
+ "Support PRFCHW instructions">;
+def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
+ "Support RDSEED instruction">;
+def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
+ "Support LAHF and SAHF instructions">;
+def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
+ "Enable MONITORX/MWAITX timer functionality">;
+def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
+ "Support MPX instructions">;
+def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+ "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
+ "HasSlowDivide32", "true",
+ "Use 8-bit divide for positive values less than 256">;
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
+ "HasSlowDivide64", "true",
+ "Use 16-bit divide for positive values less than 65536">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+ "PadShortFunctions", "true",
+ "Pad short functions">;
+def FeatureINVPCID : SubtargetFeature<"invpcid", "HasInvPCId", "true",
+ "Invalidate Process-Context Identifier">;
+def FeatureVMFUNC : SubtargetFeature<"vmfunc", "HasVMFUNC", "true",
+ "VM Functions">;
+def FeatureSMAP : SubtargetFeature<"smap", "HasSMAP", "true",
+ "Supervisor Mode Access Protection">;
+def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
+ "Enable Software Guard Extensions">;
+def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
+ "Flush A Cache Line Optimized">;
+def FeaturePCOMMIT : SubtargetFeature<"pcommit", "HasPCOMMIT", "true",
+ "Enable Persistent Commit">;
+def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
+ "Cache Line Write Back">;
+// TODO: This feature ought to be renamed.
+// What it really refers to are CPUs for which certain instructions
+// (which ones besides the example below?) are microcoded.
+// The best examples of this are the memory forms of CALL and PUSH
+// instructions, which should be avoided in favor of a MOV + register CALL/PUSH.
+def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
+ "CallRegIndirect", "true",
+ "Call register indirect">;
+def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+ "LEA instruction needs inputs at AG stage">;
+def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
+ "LEA instruction with certain arguments is slow">;
+def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
+ "INC and DEC instructions are slower than ADD and SUB">;
+def FeatureSoftFloat
+ : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+ "Use software floating point features.">;
+// On at least some AMD processors, there is no performance hazard to writing
+// only the lower parts of a YMM register without clearing the upper part.
+def FeatureFastPartialYMMWrite
+ : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
+ "true", "Partial writes to YMM registers are fast">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar that probably means that the code has some kind of
+// dependency and we should care more about reducing the latency.
+def FeatureFastScalarFSQRT
+ : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+ "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+ : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+ "true", "Vector SQRT is fast (disable Newton-Raphson)">;
+// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
+// be used to replace test/set sequences.
+def FeatureFastLZCNT
+ : SubtargetFeature<
+ "fast-lzcnt", "HasFastLZCNT", "true",
+ "LZCNT instructions are as fast as most simple integer ops">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+include "X86Schedule.td"
+
+def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
+ "Intel Atom processors">;
+def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
+ "Intel Silvermont processors">;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : ProcessorModel<Name, GenericModel, Features>;
+
+def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
+def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureCMOV, FeatureFXSR]>;
+def : Proc<"pentium3", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE1, FeatureFXSR]>;
+def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>;
+
+// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
+// The intent is to enable it for pentium4 which is the current default
+// processor in a vanilla 32-bit clang compilation when no specific
+// architecture is specified. This generally gives a nice performance
+// increase on silvermont, with largely neutral behavior on other
+// contemporary large core processors.
+// pentium-m, pentium4m, prescott and nocona are included as a preventative
+// measure to avoid performance surprises, in case clang's default cpu
+// changes slightly.
+
+def : ProcessorModel<"pentium-m", GenericPostRAModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+def : ProcessorModel<"pentium4", GenericPostRAModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE2, FeatureFXSR]>;
+
+def : ProcessorModel<"pentium4m", GenericPostRAModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+// Intel Quark.
+def : Proc<"lakemont", []>;
+
+// Intel Core Duo.
+def : ProcessorModel<"yonah", SandyBridgeModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureSlowBTMem]>;
+
+// NetBurst.
+def : ProcessorModel<"prescott", GenericPostRAModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureSlowBTMem]>;
+def : ProcessorModel<"nocona", GenericPostRAModel, [
+ FeatureX87,
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSE3,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem
+]>;
+
+// Intel Core 2 Solo/Duo.
+def : ProcessorModel<"core2", SandyBridgeModel, [
+ FeatureX87,
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF
+]>;
+def : ProcessorModel<"penryn", SandyBridgeModel, [
+ FeatureX87,
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSE41,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF
+]>;
+
+// Atom CPUs.
+class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
+ ProcIntelAtom,
+ FeatureX87,
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeatureSlowBTMem,
+ FeatureLEAForSP,
+ FeatureSlowDivide32,
+ FeatureSlowDivide64,
+ FeatureCallRegIndirect,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions,
+ FeatureLAHFSAHF
+]>;
+def : BonnellProc<"bonnell">;
+def : BonnellProc<"atom">; // Pin the generic name to the baseline.
+
+class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
+ ProcIntelSLM,
+ FeatureX87,
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeaturePOPCNT,
+ FeaturePCLMUL,
+ FeatureAES,
+ FeatureSlowDivide64,
+ FeatureCallRegIndirect,
+ FeaturePRFCHW,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureSlowBTMem,
+ FeatureSlowPMULLD,
+ FeatureLAHFSAHF
+]>;
+def : SilvermontProc<"silvermont">;
+def : SilvermontProc<"slm">; // Legacy alias.
+
+// "Arrandale" along with corei3 and corei5
+class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+ FeatureX87,
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF
+]>;
+def : NehalemProc<"nehalem">;
+def : NehalemProc<"corei7">;
+
+// Westmere is a similar machine to nehalem with some additional features.
+// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
+class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+ FeatureX87,
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureLAHFSAHF
+]>;
+def : WestmereProc<"westmere">;
+
+class ProcessorFeatures<list<SubtargetFeature> Inherited,
+ list<SubtargetFeature> NewFeatures> {
+ list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
+}
+
+class ProcModel<string Name, SchedMachineModel Model,
+ list<SubtargetFeature> ProcFeatures,
+ list<SubtargetFeature> OtherFeatures> :
+ ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;
+
+// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
+// rather than a superset.
+def SNBFeatures : ProcessorFeatures<[], [
+ FeatureX87,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureLAHFSAHF,
+ FeatureFastScalarFSQRT
+]>;
+
+class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
+ SNBFeatures.Value, [
+ FeatureSlowBTMem,
+ FeatureSlowUAMem32
+]>;
+def : SandyBridgeProc<"sandybridge">;
+def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
+
+def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase
+]>;
+
+class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
+ IVBFeatures.Value, [
+ FeatureSlowBTMem,
+ FeatureSlowUAMem32
+]>;
+def : IvyBridgeProc<"ivybridge">;
+def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
+
+def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
+ FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeatureLZCNT,
+ FeatureMOVBE,
+ FeatureINVPCID,
+ FeatureVMFUNC,
+ FeatureRTM,
+ FeatureHLE,
+ FeatureSlowIncDec
+]>;
+
+class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
+ HSWFeatures.Value, []>;
+def : HaswellProc<"haswell">;
+def : HaswellProc<"core-avx2">; // Legacy alias.
+
+def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
+ FeatureADX,
+ FeatureRDSEED,
+ FeatureSMAP
+]>;
+class BroadwellProc<string Name> : ProcModel<Name, HaswellModel,
+ BDWFeatures.Value, []>;
+def : BroadwellProc<"broadwell">;
+
+def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
+ FeatureMPX,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureSGX,
+ FeatureCLFLUSHOPT,
+ FeatureFastVectorFSQRT
+]>;
+
+// FIXME: define SKL model
+class SkylakeClientProc<string Name> : ProcModel<Name, HaswellModel,
+ SKLFeatures.Value, []>;
+def : SkylakeClientProc<"skylake">;
+
+// FIXME: define KNL model
+class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
+ IVBFeatures.Value, [
+ FeatureAVX512,
+ FeatureERI,
+ FeatureCDI,
+ FeaturePFI,
+ FeaturePREFETCHWT1,
+ FeatureADX,
+ FeatureRDSEED,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA
+]>;
+def : KnightsLandingProc<"knl">;
+
+def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
+ FeatureAVX512,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
+ FeaturePCOMMIT,
+ FeatureCLWB
+]>;
+
+// FIXME: define SKX model
+class SkylakeServerProc<string Name> : ProcModel<Name, HaswellModel,
+ SKXFeatures.Value, []>;
+def : SkylakeServerProc<"skylake-avx512">;
+def : SkylakeServerProc<"skx">; // Legacy alias.
+
+def CNLFeatures : ProcessorFeatures<SKXFeatures.Value, [
+ FeatureVBMI,
+ FeatureIFMA,
+ FeatureSHA
+]>;
+
+class CannonlakeProc<string Name> : ProcModel<Name, HaswellModel,
+ CNLFeatures.Value, []>;
+def : CannonlakeProc<"cannonlake">;
+
+// AMD CPUs.
+
+def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"athlon", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-4", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+ Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-xp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+ Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-mp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+ Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"k8", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+ Feature3DNowA, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"opteron", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+ Feature3DNowA, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon64", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+ Feature3DNowA, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-fx", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+ Feature3DNowA, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"k8-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
+ Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"opteron-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
+ Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon64-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
+ Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"amdfam10", [FeatureX87, FeatureSSE4A, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT,
+ FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD,
+ FeatureLAHFSAHF]>;
+def : Proc<"barcelona", [FeatureX87, FeatureSSE4A, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT,
+ FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD,
+ FeatureLAHFSAHF]>;
+
+// Bobcat
+def : Proc<"btver1", [
+ FeatureX87,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureSSE4A,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
+
+// Jaguar
+def : ProcessorModel<"btver2", BtVer2Model, [
+ FeatureX87,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureBMI,
+ FeatureF16C,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureFastLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF,
+ FeatureFastPartialYMMWrite
+]>;
+
+// Bulldozer
+def : Proc<"bdver1", [
+ FeatureX87,
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
+// Piledriver
+def : Proc<"bdver2", [
+ FeatureX87,
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureF16C,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureBMI,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
+
+// Steamroller
+def : Proc<"bdver3", [
+ FeatureX87,
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureF16C,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureBMI,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureXSAVEOPT,
+ FeatureSlowSHLD,
+ FeatureFSGSBase,
+ FeatureLAHFSAHF
+]>;
+
+// Excavator
+def : Proc<"bdver4", [
+ FeatureX87,
+ FeatureMMX,
+ FeatureAVX2,
+ FeatureFXSR,
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureF16C,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureXSAVEOPT,
+ FeatureSlowSHLD,
+ FeatureFSGSBase,
+ FeatureLAHFSAHF,
+ FeatureMWAITX
+]>;
+
+def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>;
+
+def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE1, FeatureFXSR]>;
+
+// We also provide a generic 64-bit specific x86 processor model which tries to
+// be good for modern chips without enabling instruction set encodings past the
+// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
+// modern 64-bit x86 chip, and enables features that are generally beneficial.
+//
+// We currently use the Sandy Bridge model as the default scheduling model as
+// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
+// covers a huge swath of x86 processors. If there are specific scheduling
+// knobs which need to be tuned differently for AMD chips, we might consider
+// forming a common base for them.
+def : ProcessorModel<"x86-64", SandyBridgeModel,
+ [FeatureX87, FeatureMMX, FeatureSSE2, FeatureFXSR,
+ Feature64Bit, FeatureSlowBTMem ]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Parser
+//===----------------------------------------------------------------------===//
+
+def ATTAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+
+ // Variant name.
+ string Name = "att";
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = "#";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "%";
+}
+
+def IntelAsmParserVariant : AsmParserVariant {
+ int Variant = 1;
+
+ // Variant name.
+ string Name = "intel";
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = ";";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel} option.
+def ATTAsmWriter : AsmWriter {
+ string AsmWriterClassName = "ATTInstPrinter";
+ int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+ string AsmWriterClassName = "IntelInstPrinter";
+ int Variant = 1;
+}
+
+def X86 : Target {
+ // Information about the instructions...
+ let InstructionSet = X86InstrInfo;
+ let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
+ let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 000000000000..d42e1187ce64
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,661 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to X86 machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+/// runOnMachineFunction - Print one machine function: refresh the
+/// per-function state, emit the COFF symbol definition when targeting COFF,
+/// then emit the function body followed by its XRay table.
+///
+/// Always returns false because printing does not change the function.
+bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<X86Subtarget>();
+  SMShadowTracker.startFunction(MF);
+
+  // The code emitter is rebuilt for every function from that function's own
+  // instruction and register information.
+  const auto &GenericSTI = MF.getSubtarget();
+  CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
+      *GenericSTI.getInstrInfo(), *GenericSTI.getRegisterInfo(),
+      MF.getContext()));
+
+  SetupMachineFunction(MF);
+
+  if (Subtarget->isTargetCOFF()) {
+    // Functions with local linkage get the static storage class, all others
+    // the external one.
+    const int StorageClass = MF.getFunction()->hasLocalLinkage()
+                                 ? COFF::IMAGE_SYM_CLASS_STATIC
+                                 : COFF::IMAGE_SYM_CLASS_EXTERNAL;
+    OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
+    OutStreamer->EmitCOFFSymbolStorageClass(StorageClass);
+    OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+                                    << COFF::SCT_COMPLEX_TYPE_SHIFT);
+    OutStreamer->EndCOFFSymbolDef();
+  }
+
+  // Function body first, then the XRay table that belongs to it.
+  EmitFunctionBody();
+  EmitXRayTable();
+
+  // Nothing in the machine function was modified.
+  return false;
+}
+
+/// printSymbolOperand - Print a raw symbol reference operand: the symbol
+/// name, its offset, and a suffix selected by the operand's target flags
+/// (relocation specifiers, PIC-base differences, etc.).
+///
+/// Only constant-pool-index and global-address operands are accepted here;
+/// any other operand type, and any target flag not listed in the second
+/// switch, reaches llvm_unreachable.
+static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
+ raw_ostream &O) {
+ // Step 1: print the symbol itself followed by its offset.
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown symbol type!");
+ case MachineOperand::MO_ConstantPoolIndex:
+ P.GetCPISymbol(MO.getIndex())->print(O, P.MAI);
+ P.printOffset(MO.getOffset(), O);
+ break;
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+
+ // Darwin non-lazy references print the "$non_lazy_ptr" symbol instead of
+ // the global's own symbol.
+ MCSymbol *GVSym;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE)
+ GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ else
+ GVSym = P.getSymbol(GV);
+
+ // Handle dllimport linkage.
+ if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+ GVSym =
+ P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+
+ // For the Darwin non-lazy flags, fill in the GV stub entry for the
+ // "$non_lazy_ptr" symbol if it is still empty. The entry pairs the
+ // global's real symbol with a flag that is true unless the global has
+ // internal linkage. (Presumably these are the entries EmitEndOfAsmFile
+ // later obtains through GetGVStubList() -- confirm.)
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+ MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (GVSym->getName()[0] != '$')
+ GVSym->print(O, P.MAI);
+ else {
+ O << '(';
+ GVSym->print(O, P.MAI);
+ O << ')';
+ }
+ P.printOffset(MO.getOffset(), O);
+ break;
+ }
+ }
+
+ // Step 2: append the suffix that corresponds to the target flag.
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ // These affect the name of the symbol, not any suffix.
+ break;
+ case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+ // Prints " + [.-<pic base symbol>]".
+ O << " + [.-";
+ P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ O << ']';
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ // Prints "-<pic base symbol>".
+ O << '-';
+ P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ break;
+ case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_TLSLD: O << "@TLSLD"; break;
+ case X86II::MO_TLSLDM: O << "@TLSLDM"; break;
+ case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_DTPOFF: O << "@DTPOFF"; break;
+ case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
+ case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
+ case X86II::MO_GOT: O << "@GOT"; break;
+ case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
+ case X86II::MO_PLT: O << "@PLT"; break;
+ case X86II::MO_TLVP: O << "@TLVP"; break;
+ case X86II::MO_TLVP_PIC_BASE:
+ // Prints "@TLVP-<pic base symbol>".
+ O << "@TLVP" << '-';
+ P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ break;
+ case X86II::MO_SECREL: O << "@SECREL32"; break;
+ }
+}
+
+// Forward declaration: printPCRelImm below calls printOperand before its
+// definition. The default arguments (no modifier, AsmVariant 0) are given
+// here and therefore not repeated on the definition.
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O,
+ const char *Modifier = nullptr, unsigned AsmVariant = 0);
+
+/// printPCRelImm - Print operand OpNo of MI as a value that will be encoded
+/// pc-relatively. Unlike printOperand, immediates and global addresses are
+/// printed without a leading '$'. Register operands are passed on to
+/// printOperand unchanged; any other operand type is unreachable.
+static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI,
+                          unsigned OpNo, raw_ostream &O) {
+  const MachineOperand &Target = MI->getOperand(OpNo);
+  const auto Kind = Target.getType();
+
+  if (Kind == MachineOperand::MO_Immediate) {
+    O << Target.getImm();
+    return;
+  }
+
+  if (Kind == MachineOperand::MO_GlobalAddress) {
+    printSymbolOperand(P, Target, O);
+    return;
+  }
+
+  if (Kind == MachineOperand::MO_Register) {
+    // The pc-relative part was already accounted for when the register's
+    // value was computed, so it prints like any other register.
+    printOperand(P, MI, OpNo, O);
+    return;
+  }
+
+  llvm_unreachable("Unknown pcrel immediate operand");
+}
+
+/// Print operand OpNo of MI. Registers, immediates and global addresses are
+/// supported; any other operand type is unreachable.
+///
+/// With AsmVariant 0 registers are prefixed with '%' and immediates/globals
+/// with '$'; any other variant prints no prefix. A Modifier that starts with
+/// "subreg" selects a sub/super register of the width named after the prefix
+/// ("64", "32" or "16"; anything else means 8).
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+                         unsigned OpNo, raw_ostream &O, const char *Modifier,
+                         unsigned AsmVariant) {
+  const MachineOperand &Opnd = MI->getOperand(OpNo);
+  // FIXME: Enumerating AsmVariant, so we can remove magic number.
+  const bool UsePrefixes = AsmVariant == 0;
+
+  switch (Opnd.getType()) {
+  case MachineOperand::MO_Immediate:
+    if (UsePrefixes)
+      O << '$';
+    O << Opnd.getImm();
+    return;
+
+  case MachineOperand::MO_GlobalAddress:
+    if (UsePrefixes)
+      O << '$';
+    printSymbolOperand(P, Opnd, O);
+    return;
+
+  case MachineOperand::MO_Register: {
+    if (UsePrefixes)
+      O << '%';
+
+    unsigned Reg = Opnd.getReg();
+    static const char SubregPrefix[] = "subreg";
+    const size_t PrefixLen = sizeof(SubregPrefix) - 1;
+    if (Modifier && strncmp(Modifier, SubregPrefix, PrefixLen) == 0) {
+      // The requested width follows the "subreg" prefix.
+      const char *Width = Modifier + PrefixLen;
+      unsigned Size = 8;
+      if (strcmp(Width, "64") == 0)
+        Size = 64;
+      else if (strcmp(Width, "32") == 0)
+        Size = 32;
+      else if (strcmp(Width, "16") == 0)
+        Size = 16;
+      Reg = getX86SubSuperRegister(Reg, Size);
+    }
+    O << X86ATTInstPrinter::getRegisterName(Reg);
+    return;
+  }
+
+  default:
+    llvm_unreachable("unknown operand type!");
+  }
+}
+
+/// Print the displacement/base/index/scale part of the memory reference whose
+/// operands start at index Op, in the form disp(base,index,scale). The
+/// segment operand is not printed here.
+///
+/// Modifier "no-rip" drops a RIP base register and "H" appends "+8" after the
+/// displacement. Modifier is also forwarded to printOperand for the base and
+/// index registers.
+static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+                                 unsigned Op, raw_ostream &O,
+                                 const char *Modifier = nullptr) {
+  const MachineOperand &Base = MI->getOperand(Op + X86::AddrBaseReg);
+  const MachineOperand &Index = MI->getOperand(Op + X86::AddrIndexReg);
+  const MachineOperand &Disp = MI->getOperand(Op + X86::AddrDisp);
+
+  // If we really don't want to print out (rip), don't.
+  const bool DropRIP =
+      Modifier && strcmp(Modifier, "no-rip") == 0 && Base.getReg() == X86::RIP;
+  const bool ShowBase = Base.getReg() != 0 && !DropRIP;
+  const bool ShowIndex = Index.getReg() != 0;
+
+  // The "(...)" part is printed only when it would not be empty.
+  const bool ShowParens = ShowIndex || ShowBase;
+
+  switch (Disp.getType()) {
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ConstantPoolIndex:
+    printSymbolOperand(P, Disp, O);
+    break;
+  case MachineOperand::MO_Immediate: {
+    // A zero displacement is left out unless nothing else gets printed.
+    const int DispVal = Disp.getImm();
+    if (DispVal != 0 || !ShowParens)
+      O << DispVal;
+    break;
+  }
+  default:
+    llvm_unreachable("unknown operand type!");
+  }
+
+  if (Modifier && !strcmp(Modifier, "H"))
+    O << "+8";
+
+  if (!ShowParens)
+    return;
+
+  assert(Index.getReg() != X86::ESP &&
+         "X86 doesn't allow scaling by ESP");
+
+  O << '(';
+  if (ShowBase)
+    printOperand(P, MI, Op + X86::AddrBaseReg, O, Modifier);
+
+  if (ShowIndex) {
+    O << ',';
+    printOperand(P, MI, Op + X86::AddrIndexReg, O, Modifier);
+    // A scale of 1 is implicit and not printed.
+    const unsigned Scale = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
+    if (Scale != 1)
+      O << ',' << Scale;
+  }
+  O << ')';
+}
+
+/// Print the full memory reference starting at operand Op: an optional
+/// "segment:" prefix (only when a segment register is set), followed by
+/// whatever printLeaMemReference prints. Modifier is forwarded to both.
+static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+                              unsigned Op, raw_ostream &O,
+                              const char *Modifier = nullptr) {
+  assert(isMem(*MI, Op) && "Invalid memory reference!");
+
+  const unsigned SegIdx = Op + X86::AddrSegmentReg;
+  if (MI->getOperand(SegIdx).getReg() != 0) {
+    printOperand(P, MI, SegIdx, O, Modifier);
+    O << ':';
+  }
+
+  printLeaMemReference(P, MI, Op, O, Modifier);
+}
+
+/// Print the memory reference starting at operand Op in bracketed form:
+/// an optional "segment:" prefix, then "[base + scale*index + disp]".
+/// Absent parts are skipped. An immediate displacement of zero is printed
+/// only when there is neither a base nor an index register. A negative
+/// immediate that follows another term is printed as " - <magnitude>".
+static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+                                   unsigned Op, raw_ostream &O,
+                                   const char *Modifier = nullptr,
+                                   unsigned AsmVariant = 1) {
+  const unsigned BaseIdx = Op + X86::AddrBaseReg;
+  const unsigned IndexIdx = Op + X86::AddrIndexReg;
+  const unsigned DispIdx = Op + X86::AddrDisp;
+  const unsigned SegIdx = Op + X86::AddrSegmentReg;
+
+  const bool HasBase = MI->getOperand(BaseIdx).getReg() != 0;
+  const unsigned Scale = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
+  const bool HasIndex = MI->getOperand(IndexIdx).getReg() != 0;
+  const MachineOperand &Disp = MI->getOperand(DispIdx);
+
+  // If this has a segment register, print it.
+  if (MI->getOperand(SegIdx).getReg() != 0) {
+    printOperand(P, MI, SegIdx, O, Modifier, AsmVariant);
+    O << ':';
+  }
+
+  O << '[';
+
+  // Set once a term has been printed, so later terms know to emit a
+  // separator first.
+  bool TermPrinted = false;
+
+  if (HasBase) {
+    printOperand(P, MI, BaseIdx, O, Modifier, AsmVariant);
+    TermPrinted = true;
+  }
+
+  if (HasIndex) {
+    if (TermPrinted)
+      O << " + ";
+    if (Scale != 1)
+      O << Scale << '*';
+    printOperand(P, MI, IndexIdx, O, Modifier, AsmVariant);
+    TermPrinted = true;
+  }
+
+  if (Disp.isImm()) {
+    int64_t DispVal = Disp.getImm();
+    const bool DispIsOnlyTerm = !HasIndex && !HasBase;
+    if (DispVal != 0 || DispIsOnlyTerm) {
+      if (TermPrinted) {
+        // Fold the sign into the separator and print the magnitude.
+        const bool Subtract = !(DispVal > 0);
+        O << (Subtract ? " - " : " + ");
+        if (Subtract)
+          DispVal = -DispVal;
+      }
+      O << DispVal;
+    }
+  } else {
+    // Non-immediate displacements go through printOperand.
+    if (TermPrinted)
+      O << " + ";
+    printOperand(P, MI, DispIdx, O, Modifier, AsmVariant);
+  }
+
+  O << ']';
+}
+
+/// Print register operand MO as "%<name>" after resizing it according to the
+/// inline-asm modifier letter Mode:
+///   'b' 8-bit, 'h' 8-bit high, 'w' 16-bit, 'k' 32-bit,
+///   'q' 64-bit when the subtarget is 64-bit, otherwise 32-bit.
+/// Returns true (and prints nothing) for an unknown Mode, false on success.
+static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
+                              char Mode, raw_ostream &O) {
+  const unsigned OrigReg = MO.getReg();
+
+  unsigned Width = 0;
+  bool HighByte = false;
+  switch (Mode) {
+  case 'b': // QImode register
+    Width = 8;
+    break;
+  case 'h': // QImode high register
+    Width = 8;
+    HighByte = true;
+    break;
+  case 'w': // HImode register
+    Width = 16;
+    break;
+  case 'k': // SImode register
+    Width = 32;
+    break;
+  case 'q':
+    // 64-bit names only when 64-bit integer registers are available.
+    Width = P.getSubtarget().is64Bit() ? 64 : 32;
+    break;
+  default:
+    return true; // Unknown mode.
+  }
+
+  const unsigned Reg = getX86SubSuperRegister(OrigReg, Width, HighByte);
+  O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+  return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+/// ExtraCode, when non-empty, must be a single modifier letter. Returns true
+/// when the modifier is unknown or does not fit the operand, false once the
+/// operand has been printed. Letters not handled here are passed to the
+/// generic AsmPrinter::PrintAsmOperand.
+///
+/// Note that AsmVariant is only passed to the final, modifier-less
+/// printOperand call; the modifier cases call printOperand with its default
+/// variant.
+bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNo);
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'a': // This is an address. Currently only 'i' and 'r' are expected.
+ // Every path of this inner switch returns, so control never falls
+ // through into case 'c'.
+ switch (MO.getType()) {
+ default:
+ return true;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return false;
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ llvm_unreachable("unexpected operand type!");
+ case MachineOperand::MO_GlobalAddress:
+ printSymbolOperand(*this, MO, O);
+ // RIP-relative PIC style appends the "(%rip)" base.
+ if (Subtarget->isPICStyleRIPRel())
+ O << "(%rip)";
+ return false;
+ case MachineOperand::MO_Register:
+ O << '(';
+ printOperand(*this, MI, OpNo, O);
+ O << ')';
+ return false;
+ }
+
+ case 'c': // Don't print "$" before a global var name or constant.
+ switch (MO.getType()) {
+ default:
+ printOperand(*this, MI, OpNo, O);
+ break;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ llvm_unreachable("unexpected operand type!");
+ case MachineOperand::MO_GlobalAddress:
+ printSymbolOperand(*this, MO, O);
+ break;
+ }
+ return false;
+
+ case 'A': // Print '*' before a register (it must be a register)
+ if (MO.isReg()) {
+ O << '*';
+ printOperand(*this, MI, OpNo, O);
+ return false;
+ }
+ return true;
+
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ // Non-register operands ignore the size modifier and print normally.
+ if (MO.isReg())
+ return printAsmMRegister(*this, MO, ExtraCode[0], O);
+ printOperand(*this, MI, OpNo, O);
+ return false;
+
+ case 'P': // This is the operand of a call, treat specially.
+ printPCRelImm(*this, MI, OpNo, O);
+ return false;
+
+ case 'n': // Negate the immediate or print a '-' before the operand.
+ // Note: this is a temporary solution. It should be handled target
+ // independently as part of the 'MC' work.
+ if (MO.isImm()) {
+ O << -MO.getImm();
+ return false;
+ }
+ // Not an immediate: emit the sign, then leave the switch so the operand
+ // itself is printed by the common printOperand call below.
+ O << '-';
+ }
+ }
+
+ printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant);
+ return false;
+}
+
+/// PrintAsmMemoryOperand - Print a memory operand of an inline asm
+/// expression. Returns true for an unknown modifier, false once the operand
+/// has been printed.
+///
+/// A non-zero AsmVariant prints the bracketed form via
+/// printIntelMemReference and does not look at ExtraCode at all.
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (AsmVariant) {
+ printIntelMemReference(*this, MI, OpNo, O);
+ return false;
+ }
+
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ // These only apply to registers, ignore on mem.
+ break;
+ case 'H':
+ // printLeaMemReference appends "+8" after the displacement for "H".
+ printMemReference(*this, MI, OpNo, O, "H");
+ return false;
+ case 'P': // Don't print @PLT, but do print as memory.
+ // "no-rip" makes printLeaMemReference drop a RIP base register.
+ printMemReference(*this, MI, OpNo, O, "no-rip");
+ return false;
+ }
+ }
+ printMemReference(*this, MI, OpNo, O);
+ return false;
+}
+
+/// Emit the per-file prologue: switch to the text section on Mach-O, define
+/// the absolute "@feat.00" symbol for 32-bit x86 COFF objects, emit the
+/// syntax directive, and emit the Code16 assembler flag when applicable.
+void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+  const Triple &TT = TM.getTargetTriple();
+
+  if (TT.isOSBinFormatMachO())
+    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+
+  if (TT.isOSBinFormatCOFF() && TT.getArch() == Triple::x86) {
+    // Emit an absolute @feat.00 symbol. This appears to be some kind of
+    // compiler features bitfield read by link.exe.
+    MCContext &Ctx = MMI->getContext();
+    MCSymbol *FeatSym = Ctx.getOrCreateSymbol(StringRef("@feat.00"));
+
+    OutStreamer->BeginCOFFSymbolDef(FeatSym);
+    OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+    OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+    OutStreamer->EndCOFFSymbolDef();
+
+    // According to the PE-COFF spec, the LSB of this value marks the object
+    // for "registered SEH". This means that all SEH handler entry points
+    // must be registered in .sxdata. Use of any unregistered handlers will
+    // cause the process to terminate immediately. LLVM does not know how to
+    // register any SEH handlers, so its object files should be safe.
+    OutStreamer->EmitSymbolAttribute(FeatSym, MCSA_Global);
+    OutStreamer->EmitAssignment(FeatSym,
+                                MCConstantExpr::create(int64_t(1), Ctx));
+  }
+
+  OutStreamer->EmitSyntaxDirective();
+
+  // In the CODE16 environment, and only when the module carries no
+  // module-level inline asm, prefix the assembly with .code16.
+  const bool InCode16Env = TT.getEnvironment() == Triple::CODE16;
+  if (InCode16Env && M.getModuleInlineAsm().empty())
+    OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+}
+
+/// Emit one non-lazy symbol pointer: the label StubLabel, an
+/// .indirect_symbol attribute on the symbol held by MCSym, and a 4-byte
+/// value. The value is 0 when MCSym's flag is set (symbol external to the
+/// translation unit) and a reference to the symbol itself otherwise.
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+                         MachineModuleInfoImpl::StubValueTy &MCSym) {
+  MCSymbol *Pointee = MCSym.getPointer();
+  const bool IsExternal = MCSym.getInt();
+
+  // The pointer's own label, then ".indirect_symbol <pointee>".
+  OutStreamer.EmitLabel(StubLabel);
+  OutStreamer.EmitSymbolAttribute(Pointee, MCSA_IndirectSymbol);
+
+  if (IsExternal) {
+    // External to current translation unit.
+    OutStreamer.EmitIntValue(0, 4 /*size*/);
+    return;
+  }
+
+  // Internal to current translation unit.
+  //
+  // When we place the LSDA into the TEXT section, the type info
+  // pointers need to be indirect and pc-rel. We accomplish this by
+  // using NLPs; however, sometimes the types are local to the file.
+  // We need to fill in the value for the NLP in those cases.
+  const MCExpr *PointeeRef =
+      MCSymbolRefExpr::create(Pointee, OutStreamer.getContext());
+  OutStreamer.EmitValue(PointeeRef, 4 /*size*/);
+}
+
+/// Return the symbol for constant pool entry CPID.
+///
+/// On known-Windows-MSVC targets, an entry that is not a machine constant
+/// pool entry and whose constant is placed in a COFF section with a COMDAT
+/// symbol uses that COMDAT symbol (marking it global while it is still
+/// undefined). Every other case defers to AsmPrinter::GetCPISymbol.
+MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
+  if (!Subtarget->isTargetKnownWindowsMSVC())
+    return AsmPrinter::GetCPISymbol(CPID);
+
+  const MachineConstantPoolEntry &Entry =
+      MF->getConstantPool()->getConstants()[CPID];
+  if (Entry.isMachineConstantPoolEntry())
+    return AsmPrinter::GetCPISymbol(CPID);
+
+  // Work out which section the constant is placed in.
+  const DataLayout &DL = MF->getDataLayout();
+  SectionKind Kind = Entry.getSectionKind(&DL);
+  const Constant *C = Entry.Val.ConstVal;
+  unsigned Align = Entry.Alignment;
+  const auto *COFFSection = dyn_cast<MCSectionCOFF>(
+      getObjFileLowering().getSectionForConstant(DL, Kind, C, Align));
+  if (!COFFSection)
+    return AsmPrinter::GetCPISymbol(CPID);
+
+  MCSymbol *Comdat = COFFSection->getCOMDATSymbol();
+  if (!Comdat)
+    return AsmPrinter::GetCPISymbol(CPID);
+
+  if (Comdat->isUndefined())
+    OutStreamer->EmitSymbolAttribute(Comdat, MCSA_Global);
+  return Comdat;
+}
+
+/// Emit the per-file epilogue. What is emitted depends on the object format
+/// of the target triple:
+///  - Mach-O: the GV stub list as non-lazy symbol pointers, the stack map and
+///    fault map sections, and the SubsectionsViaSymbols assembler flag.
+///  - Known Windows MSVC environment with a vararg float argument in use: a
+///    global "_fltused" (x86_64) or "__fltused" symbol.
+///  - COFF: the collected linker flags in the .drectve section, then the
+///    stack map section (the fault map is not serialized in this branch).
+///  - ELF: the stack map and fault map sections.
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+ const Triple &TT = TM.getTargetTriple();
+
+ if (TT.isOSBinFormatMachO()) {
+ // All darwin targets use mach-o.
+ MachineModuleInfoMachO &MMIMacho =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ // Holds the stub entries emitted below.
+ MachineModuleInfoMachO::SymbolListTy Stubs;
+
+ // Output stubs for external and common global variables.
+ Stubs = MMIMacho.GetGVStubList();
+ if (!Stubs.empty()) {
+ MCSection *TheSection = OutContext.getMachOSection(
+ "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+ OutStreamer->SwitchSection(TheSection);
+
+ // Each entry is a (stub label, stub value) pair.
+ for (auto &Stub : Stubs)
+ emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
+
+ Stubs.clear();
+ OutStreamer->AddBlankLine();
+ }
+
+ SM.serializeToStackMapSection();
+ FM.serializeToFaultMapSection();
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ }
+
+ // The symbol name carries one leading underscore on x86_64 and two on every
+ // other architecture reaching this point.
+ if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) {
+ StringRef SymbolName =
+ (TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused";
+ MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
+ OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+ }
+
+ if (TT.isOSBinFormatCOFF()) {
+ const TargetLoweringObjectFileCOFF &TLOFCOFF =
+ static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+ // Linker flags for all functions, globals and aliases are accumulated in
+ // one string before anything is emitted.
+ std::string Flags;
+ raw_string_ostream FlagsOS(Flags);
+
+ for (const auto &Function : M)
+ TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function);
+ for (const auto &Global : M.globals())
+ TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global);
+ for (const auto &Alias : M.aliases())
+ TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias);
+
+ // Flush so that Flags holds everything written through FlagsOS.
+ FlagsOS.flush();
+
+ // Output collected flags.
+ if (!Flags.empty()) {
+ OutStreamer->SwitchSection(TLOFCOFF.getDrectveSection());
+ OutStreamer->EmitBytes(Flags);
+ }
+
+ SM.serializeToStackMapSection();
+ }
+
+ if (TT.isOSBinFormatELF()) {
+ SM.serializeToStackMapSection();
+ FM.serializeToFaultMapSection();
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization: register X86AsmPrinter as the assembly printer
+// of both the 32-bit and the 64-bit X86 target.
+extern "C" void LLVMInitializeX86AsmPrinter() {
+  RegisterAsmPrinter<X86AsmPrinter> RegisterFor32(getTheX86_32Target());
+  RegisterAsmPrinter<X86AsmPrinter> RegisterFor64(getTheX86_64Target());
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
new file mode 100644
index 000000000000..6798253d0f6a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,141 @@
+//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
+#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
+
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/FaultMaps.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/Target/TargetMachine.h"
+
+// Implemented in X86MCInstLower.cpp
+// NOTE(review): a declaration in an unnamed namespace has internal linkage,
+// so each translation unit including this header gets its own distinct
+// X86MCInstLower type -- confirm that only the file defining the class needs
+// the complete type.
+namespace {
+ class X86MCInstLower;
+}
+
+namespace llvm {
+class MCStreamer;
+class MCSymbol;
+
+/// X86 implementation of AsmPrinter. In addition to the AsmPrinter hooks it
+/// overrides, it owns the stack map and fault map writers and the tracker
+/// used to pad stackmap shadows with NOPs.
+class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
+  // Subtarget of the function currently being printed. It is assigned at the
+  // start of runOnMachineFunction and is null before the first function.
+  const X86Subtarget *Subtarget = nullptr;
+  StackMaps SM;
+  FaultMaps FM;
+  // Re-created for every function in runOnMachineFunction.
+  std::unique_ptr<MCCodeEmitter> CodeEmitter;
+
+  // This utility class tracks the length of a stackmap instruction's 'shadow'.
+  // It is used by the X86AsmPrinter to ensure that the stackmap shadow
+  // invariants (i.e. no other stackmaps, patchpoints, or control flow within
+  // the shadow) are met, while outputting a minimal number of NOPs for padding.
+  //
+  // To minimise the number of NOPs used, the shadow tracker counts the number
+  // of instruction bytes output since the last stackmap. Only if there are too
+  // few instruction bytes to cover the shadow are NOPs used for padding.
+  class StackMapShadowTracker {
+  public:
+    // Remember the function about to be printed.
+    void startFunction(MachineFunction &MF) {
+      this->MF = &MF;
+    }
+    void count(MCInst &Inst, const MCSubtargetInfo &STI,
+               MCCodeEmitter *CodeEmitter);
+
+    // Called to signal the start of a shadow of RequiredSize bytes.
+    void reset(unsigned RequiredSize) {
+      RequiredShadowSize = RequiredSize;
+      CurrentShadowSize = 0;
+      InShadow = true;
+    }
+
+    // Called before every stackmap/patchpoint, and at the end of basic blocks,
+    // to emit any necessary padding-NOPs.
+    void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI);
+  private:
+    // Null until startFunction has been called.
+    const MachineFunction *MF = nullptr;
+    bool InShadow = false;
+
+    // RequiredShadowSize holds the length of the shadow specified in the most
+    // recently encountered STACKMAP instruction.
+    // CurrentShadowSize counts the number of bytes encoded since the most
+    // recently encountered STACKMAP, stopping when that number is greater than
+    // or equal to RequiredShadowSize.
+    unsigned RequiredShadowSize = 0, CurrentShadowSize = 0;
+  };
+
+  StackMapShadowTracker SMShadowTracker;
+
+  // All instructions emitted by the X86AsmPrinter should use this helper
+  // method.
+  //
+  // This helper function invokes the SMShadowTracker on each instruction before
+  // outputting it to the OutStream. This allows the shadow tracker to minimise
+  // the number of NOPs used for stackmap padding.
+  void EmitAndCountInstruction(MCInst &Inst);
+  void LowerSTACKMAP(const MachineInstr &MI);
+  void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+
+  void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
+
+  // XRay-specific lowering for X86.
+  void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
+                                     X86MCInstLower &MCIL);
+  void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+
+  // Helper function that emits the XRay sleds we've collected for a particular
+  // function.
+  void EmitXRayTable();
+
+public:
+  explicit X86AsmPrinter(TargetMachine &TM,
+                         std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {}
+
+  StringRef getPassName() const override {
+    return "X86 Assembly Printer";
+  }
+
+  // Dereferences Subtarget, so it must not be called before
+  // runOnMachineFunction has assigned it.
+  const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+  void EmitStartOfAsmFile(Module &M) override;
+
+  void EmitEndOfAsmFile(Module &M) override;
+
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  // Pad any still-open stackmap shadow at the end of every basic block.
+  void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override {
+    SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+  }
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &OS) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &OS) override;
+
+  /// \brief Return the symbol for the specified constant pool entry.
+  MCSymbol *GetCPISymbol(unsigned CPID) const override;
+
+  // Reset the shadow tracker and the stack map writer, then run the base
+  // class initialization.
+  bool doInitialization(Module &M) override {
+    SMShadowTracker.reset(0);
+    SM.reset();
+    return AsmPrinter::doInitialization(M);
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
new file mode 100644
index 000000000000..844c66d5a462
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -0,0 +1,591 @@
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that optimizes call sequences on x86.
+// Currently, it converts movs of function parameters onto the stack into
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than a stack-ptr-based mov.
+// 2) It is possible to push memory arguments directly. So, if the
+// transformation is performed pre-reg-alloc, it can help relieve
+// register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+static cl::opt<bool>
+ NoX86CFOpt("no-x86-call-frame-opt",
+ cl::desc("Avoid optimizing x86 call frames for size"),
+ cl::init(false), cl::Hidden);
+
+namespace {
+// Machine function pass that rewrites the stores of outgoing stack arguments
+// of a call into push instructions when that is legal and profitable.
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  // Information we know about a particular call site. Every field has a
+  // default, so a freshly constructed context describes a call site about
+  // which nothing has been discovered yet.
+  struct CallContext {
+    CallContext() : FrameSetup(nullptr), MovVector(4, nullptr) {}
+
+    // Iterator referring to the frame setup instruction
+    MachineBasicBlock::iterator FrameSetup;
+
+    // Actual call instruction
+    MachineInstr *Call = nullptr;
+
+    // A copy of the stack pointer
+    MachineInstr *SPCopy = nullptr;
+
+    // The total displacement of all passed parameters
+    int64_t ExpectedDist = 0;
+
+    // The sequence of movs used to pass the parameters, indexed by stack slot
+    SmallVector<MachineInstr *, 4> MovVector;
+
+    // True if this call site has no stack parameters
+    bool NoStackParams = false;
+
+    // True if this call site can use push instructions
+    bool UsePush = false;
+  };
+
+  using ContextVector = SmallVector<CallContext, 8>;
+
+  // Returns false if the transformation must not (or should not) be tried.
+  bool isLegal(MachineFunction &MF);
+
+  // Code-size heuristic over all collected call sites.
+  bool isProfitable(MachineFunction &MF, ContextVector &CallSeqVector);
+
+  // Fills Context with what is known about the call sequence starting at I.
+  void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I, CallContext &Context);
+
+  // Performs the mov -> push rewrite for one call site.
+  void adjustCallSequence(MachineFunction &MF, const CallContext &Context);
+
+  // Returns the defining load of Reg if it may be folded into a push,
+  // nullptr otherwise.
+  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+                                   unsigned Reg);
+
+  // Convert: a stack mov that should become a push.
+  // Skip:    an unrelated instruction that may be stepped over.
+  // Exit:    stop scanning the call sequence.
+  enum InstClassification { Convert, Skip, Exit };
+
+  InstClassification classifyInstruction(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         const X86RegisterInfo &RegInfo,
+                                         DenseSet<unsigned int> &UsedRegs);
+
+  StringRef getPassName() const override { return "X86 Optimize Call Frame"; }
+
+  // Per-function state; assigned at the top of runOnMachineFunction. The
+  // defaults only guarantee that the members never hold indeterminate values.
+  const TargetInstrInfo *TII = nullptr;
+  const X86FrameLowering *TFL = nullptr;
+  const X86Subtarget *STI = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  unsigned SlotSize = 0;
+  unsigned Log2SlotSize = 0;
+  static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+} // end anonymous namespace
+
+// Creates a new instance of the X86 call frame optimization pass and returns
+// it as a FunctionPass pointer.
+FunctionPass *llvm::createX86CallFrameOptimization() {
+ return new X86CallFrameOptimization();
+}
+
+// Decides whether the mov -> push conversion may be applied to MF.
+// A 'false' answer also covers situations that could be legal in principle
+// but that this pass deliberately does not attempt.
+bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
+  // The pass can be disabled from the command line.
+  if (NoX86CFOpt.getValue())
+    return false;
+
+  // Work around LLVM PR30879 (bad interaction between CFO and libunwind)
+  const bool IsAffectedFreeBSD =
+      STI->isTargetFreeBSD() && STI->is32Bit() &&
+      STI->getTargetTriple().getOSMajorVersion() >= 12;
+  if (IsAffectedFreeBSD)
+    return false;
+
+  // Darwin's compact unwind encoding cannot express multiple
+  // DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset directives, so give up
+  // whenever there is a risk of producing them.
+  if (STI->isTargetDarwin()) {
+    if (!MF.getLandingPads().empty())
+      return false;
+    if (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))
+      return false;
+  }
+
+  // On 64-bit Windows the stack pointer may only change in the
+  // prolog/epilog.
+  if (STI->isTargetWin64())
+    return false;
+
+  // Call-frame setup and destroy are not guaranteed to sit in the same basic
+  // block (e.g. CMOV_GR8 expansion of a select feeding a call can split
+  // them), and that breaks SP adjustment. Require every frame to be opened
+  // and closed within a single block, and reject nested frames as well.
+  const unsigned SetupOpc = TII->getCallFrameSetupOpcode();
+  const unsigned DestroyOpc = TII->getCallFrameDestroyOpcode();
+  for (MachineBasicBlock &Block : MF) {
+    bool FrameOpen = false;
+    for (MachineInstr &Inst : Block) {
+      const unsigned Opc = Inst.getOpcode();
+      if (Opc == SetupOpc) {
+        // A second setup before the matching destroy means nesting.
+        if (FrameOpen)
+          return false;
+        FrameOpen = true;
+        continue;
+      }
+      if (Opc == DestroyOpc) {
+        // A destroy without a setup in this block.
+        if (!FrameOpen)
+          return false;
+        FrameOpen = false;
+      }
+    }
+
+    // The block ended while a frame was still open.
+    if (FrameOpen)
+      return false;
+  }
+
+  return true;
+}
+
+// Check whether this transformation is profitable for a particular
+// function - in terms of code size.
+//
+// \param MF            the function being considered.
+// \param CallSeqVector per-call-site information for every call sequence
+//                      of MF.
+// \return true if the function has variable-sized stack objects, or if the
+//         estimated byte saving summed over all call sites is non-negative.
+bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
+                                            ContextVector &CallSeqVector) {
+  // This transformation is always a win when we do not expect to have
+  // a reserved call frame. Under other circumstances, it may be either
+  // a win or a loss, and requires a heuristic.
+  bool CannotReserveFrame = MF.getFrameInfo().hasVarSizedObjects();
+  if (CannotReserveFrame)
+    return true;
+
+  unsigned StackAlign = TFL->getStackAlignment();
+
+  int64_t Advantage = 0;
+  // Iterate by const reference: a CallContext owns a SmallVector, so copying
+  // one per iteration is pure overhead.
+  for (const auto &CC : CallSeqVector) {
+    // Call sites where no parameters are passed on the stack
+    // do not affect the cost, since there needs to be no
+    // stack adjustment.
+    if (CC.NoStackParams)
+      continue;
+
+    if (!CC.UsePush) {
+      // If we don't use pushes for a particular call site,
+      // we pay for not having a reserved call frame with an
+      // additional sub/add esp pair. The cost is ~3 bytes per instruction,
+      // depending on the size of the constant.
+      // TODO: Callee-pop functions should have a smaller penalty, because
+      // an add is needed even with a reserved call frame.
+      Advantage -= 6;
+    } else {
+      // We can use pushes. First, account for the fixed costs.
+      // We'll need an add after the call.
+      Advantage -= 3;
+      // If we have to realign the stack, we'll also need a sub before
+      if (CC.ExpectedDist % StackAlign)
+        Advantage -= 3;
+      // Now, for each push, we save ~3 bytes. For small constants, we
+      // actually save more (up to 5 bytes), but 3 should be a good
+      // approximation.
+      Advantage += (CC.ExpectedDist >> Log2SlotSize) * 3;
+    }
+  }
+
+  return Advantage >= 0;
+}
+
+// Pass entry point. Caches the subtarget-derived helpers, gathers a
+// CallContext for every call-frame setup instruction in MF, and - if the
+// function is legal and profitable to transform - rewrites each call site
+// whose context has UsePush set.
+//
+// \return true if at least one call sequence was rewritten.
+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
+  STI = &MF.getSubtarget<X86Subtarget>();
+  TII = STI->getInstrInfo();
+  TFL = STI->getFrameLowering();
+  MRI = &MF.getRegInfo();
+
+  const X86RegisterInfo &RegInfo =
+      *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+  SlotSize = RegInfo.getSlotSize();
+  assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size");
+  Log2SlotSize = Log2_32(SlotSize);
+
+  // isLegal() reads STI/TII/TFL, so it has to run after the setup above.
+  if (skipFunction(*MF.getFunction()) || !isLegal(MF))
+    return false;
+
+  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+
+  bool Changed = false;
+
+  ContextVector CallSeqVector;
+
+  // First pass: only collect information, do not modify anything yet. The
+  // profitability decision needs to see every call site of the function.
+  for (auto &MBB : MF)
+    for (auto &MI : MBB)
+      if (MI.getOpcode() == FrameSetupOpcode) {
+        CallContext Context;
+        collectCallInfo(MF, MBB, MI, Context);
+        CallSeqVector.push_back(Context);
+      }
+
+  if (!isProfitable(MF, CallSeqVector))
+    return false;
+
+  // Second pass: rewrite the eligible call sites. Iterate by const reference
+  // to avoid copying each CallContext (and its SmallVector).
+  for (const auto &CC : CallSeqVector) {
+    if (CC.UsePush) {
+      adjustCallSequence(MF, CC);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+// Classifies the instruction at MI while scanning a call sequence:
+//   Convert - a stack-store mov that is a candidate for becoming a push;
+//   Skip    - an instruction that may safely be stepped over;
+//   Exit    - the scan has to stop here (this includes the end of the block).
+// UsedRegs holds the physical registers read by movs seen so far.
+X86CallFrameOptimization::InstClassification
+X86CallFrameOptimization::classifyInstruction(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
+  // Running off the end of the block terminates the scan.
+  if (MI == MBB.end())
+    return Exit;
+
+  // Stores onto the stack are the instructions we are really after.
+  switch (MI->getOpcode()) {
+  case X86::MOV32mi:
+  case X86::MOV32mr:
+  case X86::MOV64mi32:
+  case X86::MOV64mr:
+    return Convert;
+  default:
+    break;
+  }
+
+  // Some calling conventions place more than plain stack MOVs between the
+  // stack adjustment and the call, and we want to tolerate that:
+  //  a) PCrel calls come with an extra COPY of the base register.
+  //  b) Frame-index addresses may be passed.
+  //  c) inreg parameters produce copies and movs into registers.
+  // Instead of enumerating special cases, accept any instruction that does
+  // not write memory, neither defs nor uses the stack pointer, and does not
+  // def a register that an earlier push-to-be reads.
+  // (Memory reads are fine, even through a frame index, because PEI adjusts
+  // those properly.)
+  //
+  // The register restriction exists because the pushes are emitted in
+  // reverse order rather than in place of the movs. Given a MOV32mr that
+  // reads EDX, followed by a def of EDX and then the call, the resulting push
+  // would read the redefined EDX. The code is still in SSA form here, so only
+  // *physical* registers used by an earlier mov need to be protected.
+
+  if (MI->isCall())
+    return Exit;
+  if (MI->mayStore())
+    return Exit;
+
+  for (const MachineOperand &Op : MI->operands()) {
+    // Only physical register operands are of interest.
+    if (!Op.isReg() || !RegInfo.isPhysicalRegister(Op.getReg()))
+      continue;
+    const unsigned int PhysReg = Op.getReg();
+
+    // Any contact with the stack pointer ends the scan.
+    if (RegInfo.regsOverlap(PhysReg, RegInfo.getStackRegister()))
+      return Exit;
+
+    if (!Op.isDef())
+      continue;
+
+    // A def must not clobber a register read by an earlier mov.
+    for (unsigned int Used : UsedRegs)
+      if (RegInfo.regsOverlap(PhysReg, Used))
+        return Exit;
+  }
+
+  return Skip;
+}
+
+// Analyzes the call sequence that starts at the frame-setup instruction I in
+// MBB and records the findings in Context:
+// - FrameSetup: the frame-setup instruction itself;
+// - NoStackParams: set when the frame adjustment is smaller than one slot;
+// - SPCopy: the copy of the stack pointer into another register, if present;
+// - MovVector: the parameter stores, indexed by stack slot;
+// - Call: the call instruction that ends the sequence;
+// - ExpectedDist: SlotSize times the number of leading filled slots;
+// - UsePush: set only when the whole sequence matched the expected pattern.
+// Every early return leaves Context.UsePush at its initial value.
+void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ CallContext &Context) {
+ // Check that this particular call sequence is amenable to the
+ // transformation.
+ const X86RegisterInfo &RegInfo =
+ *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+ unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+ // We expect to enter this at the beginning of a call sequence
+ assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+ MachineBasicBlock::iterator FrameSetup = I++;
+ Context.FrameSetup = FrameSetup;
+
+ // How much do we adjust the stack? This puts an upper bound on
+ // the number of parameters actually passed on it.
+ unsigned int MaxAdjust =
+ FrameSetup->getOperand(0).getImm() >> Log2SlotSize;
+
+ // A zero adjustment means no stack parameters
+ if (!MaxAdjust) {
+ Context.NoStackParams = true;
+ return;
+ }
+
+ // Skip over DEBUG_VALUE.
+ // For globals in PIC mode, we can have some LEAs here. Skip them as well.
+ // TODO: Extend this to something that covers more cases.
+ // NOTE(review): this loop does not test I against MBB.end(); presumably it
+ // relies on isLegal() having verified that the frame is closed within this
+ // block - confirm.
+ while (I->getOpcode() == X86::LEA32r || I->isDebugValue())
+ ++I;
+
+ unsigned StackPtr = RegInfo.getStackRegister();
+ // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
+ // register here. If it's there, use that virtual register as stack pointer
+ // instead.
+ if (I->isCopy() && I->getOperand(0).isReg() && I->getOperand(1).isReg() &&
+ I->getOperand(1).getReg() == StackPtr) {
+ Context.SPCopy = &*I++;
+ StackPtr = Context.SPCopy->getOperand(0).getReg();
+ }
+
+ // Scan the call setup sequence for the pattern we're looking for.
+ // We only handle a simple case - a sequence of store instructions that
+ // push a sequence of stack-slot-aligned values onto the stack, with
+ // no gaps between them.
+ // MovVector starts out with 4 null entries; grow it only if more slots
+ // can be in use.
+ if (MaxAdjust > 4)
+ Context.MovVector.resize(MaxAdjust, nullptr);
+
+ InstClassification Classification;
+ DenseSet<unsigned int> UsedRegs;
+
+ while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) !=
+ Exit) {
+ if (Classification == Skip) {
+ ++I;
+ continue;
+ }
+
+ // We know the instruction has a supported store opcode.
+ // We only want movs of the form:
+ // mov imm/reg, k(%StackPtr)
+ // If we run into something else, bail.
+ // Note that AddrBaseReg may, counter to its name, not be a register,
+ // but rather a frame index.
+ // TODO: Support the fi case. This should probably work now that we
+ // have the infrastructure to track the stack pointer within a call
+ // sequence.
+ if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+ (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+ !I->getOperand(X86::AddrScaleAmt).isImm() ||
+ (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+ (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+ (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+ !I->getOperand(X86::AddrDisp).isImm())
+ return;
+
+ int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+ assert(StackDisp >= 0 &&
+ "Negative stack displacement when passing parameters");
+
+ // We really don't want to consider the unaligned case.
+ if (StackDisp & (SlotSize - 1))
+ return;
+ // Convert the byte displacement into a slot index.
+ StackDisp >>= Log2SlotSize;
+
+ assert((size_t)StackDisp < Context.MovVector.size() &&
+ "Function call has more parameters than the stack is adjusted for.");
+
+ // If the same stack slot is being filled twice, something's fishy.
+ if (Context.MovVector[StackDisp] != nullptr)
+ return;
+ Context.MovVector[StackDisp] = &*I;
+
+ // Remember the physical registers this mov reads, so that
+ // classifyInstruction can reject later instructions that redefine them.
+ for (const MachineOperand &MO : I->uses()) {
+ if (!MO.isReg())
+ continue;
+ unsigned int Reg = MO.getReg();
+ if (RegInfo.isPhysicalRegister(Reg))
+ UsedRegs.insert(Reg);
+ }
+
+ ++I;
+ }
+
+ // We now expect the end of the sequence. If we stopped early,
+ // or reached the end of the block without finding a call, bail.
+ if (I == MBB.end() || !I->isCall())
+ return;
+
+ Context.Call = &*I;
+ // The call has to be followed immediately by the frame destroy.
+ if ((++I)->getOpcode() != FrameDestroyOpcode)
+ return;
+
+ // Now, go through the vector, and see that we don't have any gaps,
+ // but only a series of MOVs.
+ // ExpectedDist grows by one slot for every leading non-null entry.
+ auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end();
+ for (; MMI != MME; ++MMI, Context.ExpectedDist += SlotSize)
+ if (*MMI == nullptr)
+ break;
+
+ // If the call had no parameters, do nothing
+ if (MMI == Context.MovVector.begin())
+ return;
+
+ // We are either at the last parameter, or a gap.
+ // Make sure it's not a gap
+ for (; MMI != MME; ++MMI)
+ if (*MMI != nullptr)
+ return;
+
+ Context.UsePush = true;
+}
+
+// Rewrites the call site described by Context: every mov recorded in
+// Context.MovVector (for the first ExpectedDist / SlotSize slots) is replaced
+// by a push inserted right before Context.Call, walking the slots from the
+// highest index down, and the mov is erased. In addition this
+// - stores ExpectedDist into operand 1 of the frame-setup instruction,
+// - emits a CFA-offset adjustment after each push when the function has no
+//   frame pointer,
+// - erases the stack-pointer copy if its result has no remaining uses, and
+// - flags the function as containing push sequences.
+void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+ const CallContext &Context) {
+ // Ok, we can in fact do the transformation for this call.
+ // Do not remove the FrameSetup instruction, but adjust the parameters.
+ // PEI will end up finalizing the handling of this.
+ MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
+ MachineBasicBlock &MBB = *(FrameSetup->getParent());
+ FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
+
+ DebugLoc DL = FrameSetup->getDebugLoc();
+ bool Is64Bit = STI->is64Bit();
+ // Now, iterate through the vector in reverse order, and replace the movs
+ // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
+ // replace uses.
+ for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) {
+ MachineBasicBlock::iterator MOV = *Context.MovVector[Idx];
+ // The stored value is the operand that follows the memory-address operands.
+ MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+ MachineBasicBlock::iterator Push = nullptr;
+ unsigned PushOpcode;
+ switch (MOV->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected Opcode!");
+ case X86::MOV32mi:
+ case X86::MOV64mi32:
+ PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSHi32;
+ // If the operand is a small (8-bit) immediate, we can use a
+ // PUSH instruction with a shorter encoding.
+ // Note that isImm() may fail even though this is a MOVmi, because
+ // the operand can also be a symbol.
+ if (PushOp.isImm()) {
+ int64_t Val = PushOp.getImm();
+ if (isInt<8>(Val))
+ PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
+ }
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
+ .addOperand(PushOp);
+ break;
+ case X86::MOV32mr:
+ case X86::MOV64mr:
+ unsigned int Reg = PushOp.getReg();
+
+ // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
+ // in preparation for the PUSH64. The upper 32 bits can be undef.
+ if (Is64Bit && MOV->getOpcode() == X86::MOV32mr) {
+ unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
+ BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
+ .addReg(UndefReg)
+ .addOperand(PushOp)
+ .addImm(X86::sub_32bit);
+ }
+
+ // If PUSHrmm is not slow on this target, try to fold the source of the
+ // push into the instruction.
+ bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
+
+ // Check that this is legal to fold. Right now, we're extremely
+ // conservative about that.
+ MachineInstr *DefMov = nullptr;
+ if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
+ PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));
+
+ // Copy the trailing X86::AddrNumOperands operands of the load (its
+ // memory address) onto the push, then drop the now-redundant load.
+ unsigned NumOps = DefMov->getDesc().getNumOperands();
+ for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+ Push->addOperand(DefMov->getOperand(i));
+
+ DefMov->eraseFromParent();
+ } else {
+ PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
+ .addReg(Reg)
+ .getInstr();
+ }
+ break;
+ }
+
+ // For debugging, when using SP-based CFA, we need to adjust the CFA
+ // offset after each push.
+ // TODO: This is needed only if we require precise CFA.
+ if (!TFL->hasFP(MF))
+ TFL->BuildCFI(
+ MBB, std::next(Push), DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, SlotSize));
+
+ MBB.erase(MOV);
+ }
+
+ // The stack-pointer copy is no longer used in the call sequences.
+ // There should not be any other users, but we can't commit to that, so:
+ if (Context.SPCopy && MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
+ Context.SPCopy->eraseFromParent();
+
+ // Once we've done this, we need to make sure PEI doesn't assume a reserved
+ // frame.
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ FuncInfo->setHasPushSequences(true);
+}
+
+// Very restricted load folding for register pushes.
+//
+// ISel frequently produces sequences such as
+//   movl 4(%edi), %eax
+//   movl 8(%edi), %ecx
+//   movl 12(%edi), %edx
+//   movl %edx, 8(%esp)
+//   movl %ecx, 4(%esp)
+//   movl %eax, (%esp)
+//   call
+// where each load only feeds a parameter store.
+//
+// \return the instruction defining Reg if it is such a load and may be
+//         folded into the push, nullptr otherwise.
+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
+    MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
+  // Only a virtual register whose single non-debug use is the push
+  // qualifies.
+  if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+      !MRI->hasOneNonDBGUse(Reg))
+    return nullptr;
+
+  MachineInstr &LoadMI = *MRI->getVRegDef(Reg);
+
+  // The def has to be a MOV from memory ...
+  const unsigned DefOpc = LoadMI.getOpcode();
+  const bool IsMemLoad = DefOpc == X86::MOV32rm || DefOpc == X86::MOV64rm;
+  // ... located in the same block as the call sequence.
+  if (!IsMemLoad || LoadMI.getParent() != FrameSetup->getParent())
+    return nullptr;
+
+  // Nothing between the load and the frame setup may forbid folding the
+  // load.
+  MachineBasicBlock::iterator Cursor = LoadMI;
+  while (Cursor != FrameSetup) {
+    if (Cursor->isLoadFoldBarrier())
+      return nullptr;
+    ++Cursor;
+  }
+
+  return &LoadMI;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
new file mode 100644
index 000000000000..5ae4962378d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -0,0 +1,46 @@
+//===-- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86CallLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+// Passes the address of the X86 target lowering object on to the CallLowering
+// base class.
+X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
+ : CallLowering(&TLI) {}
+
+// Lowers a return. Only the void case (Val is null) is supported: a RET with
+// an immediate of 0 is built and true is returned. For any other value nothing
+// is emitted and the result is false.
+bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+                                  const Value *Val, unsigned VReg) const {
+  // TODO: handle functions returning non-void values.
+  const bool ReturnsVoid = (Val == nullptr);
+  if (ReturnsVoid)
+    MIRBuilder.buildInstr(X86::RET).addImm(0);
+
+  return ReturnsVoid;
+}
+
+// Lowers the formal arguments of F. Nothing is emitted; the call succeeds
+// (returns true) only for functions that take no arguments.
+bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+                                           const Function &F,
+                                           ArrayRef<unsigned> VRegs) const {
+  // TODO: handle functions with one or more arguments.
+  if (!F.arg_empty())
+    return false;
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm/lib/Target/X86/X86CallLowering.h
new file mode 100644
index 000000000000..f2672f09d855
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallLowering.h
@@ -0,0 +1,39 @@
+//===-- llvm/lib/Target/X86/X86CallLowering.h - Call lowering -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING
+#define LLVM_LIB_TARGET_X86_X86CALLLOWERING
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class Function;
+class MachineIRBuilder;
+class X86TargetLowering;
+class Value;
+
+/// X86 implementation of the CallLowering interface.
+class X86CallLowering : public CallLowering {
+public:
+  /// \param TLI the X86 target lowering object handed to the CallLowering
+  ///            base class.
+  X86CallLowering(const X86TargetLowering &TLI);
+
+  /// Lowers a return of \p Val.
+  /// \return true if the return could be lowered, false otherwise.
+  bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+                   unsigned VReg) const override;
+
+  /// Lowers the incoming formal arguments of \p F.
+  /// \return true if the arguments could be lowered, false otherwise.
+  bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+                            ArrayRef<unsigned> VRegs) const override;
+};
+} // End of namespace llvm;
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp
new file mode 100644
index 000000000000..59dde982f512
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -0,0 +1,208 @@
+//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of custom routines for the X86
+// Calling Convention that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+// Assigns the value ValNo to two general purpose registers taken from the
+// regcall register list.
+// Returns false without allocating anything when fewer than two of those
+// registers are free, so that the caller continues with the next rule;
+// returns true after both locations have been recorded.
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // GPRs that may hold values in the regcall calling convention.
+  static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
+                                      X86::ESI};
+  // A split value occupies exactly this many registers.
+  const size_t RequiredGprsUponSplit = 2;
+
+  // Gather every register of the list that has not been handed out yet.
+  SmallVector<unsigned, 5> FreeRegs;
+  for (const MCPhysReg *It = std::begin(RegList), *End = std::end(RegList);
+       It != End; ++It) {
+    if (State.isAllocated(*It))
+      continue;
+    FreeRegs.push_back(*It);
+  }
+
+  // Not enough free registers - continue the search.
+  if (FreeRegs.size() < RequiredGprsUponSplit)
+    return false;
+
+  // Claim the first two free registers, in list order.
+  for (unsigned Idx = 0; Idx != RequiredGprsUponSplit; ++Idx) {
+    // Mark the register as allocated.
+    unsigned Reg = State.AllocateReg(FreeRegs[Idx]);
+
+    // Availability was established above, so a real register number is
+    // expected here.
+    assert(Reg && "Expecting a register will be available");
+
+    // Record the register as a location of the value.
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  }
+
+  // Registers were allocated successfully - stop scanning further rules.
+  return true;
+}
+
+// Returns the list of six vector registers matching the width of ValVT:
+// ZMM0-5 for 512-bit vectors, YMM0-5 for 256-bit vectors, and XMM0-5 for
+// everything else.
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
+  static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
+                                         X86::ZMM3, X86::ZMM4, X86::ZMM5};
+  static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
+                                         X86::YMM3, X86::YMM4, X86::YMM5};
+  static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+                                         X86::XMM3, X86::XMM4, X86::XMM5};
+
+  if (ValVT.is512BitVector())
+    return makeArrayRef(std::begin(RegListZMM), std::end(RegListZMM));
+  if (ValVT.is256BitVector())
+    return makeArrayRef(std::begin(RegListYMM), std::end(RegListYMM));
+  return makeArrayRef(std::begin(RegListXMM), std::end(RegListXMM));
+}
+
+// Returns the general purpose register list {RCX, RDX, R8, R9}.
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
+  static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
+  const MCPhysReg *First = std::begin(RegListGPR);
+  const MCPhysReg *Last = std::end(RegListGPR);
+  return makeArrayRef(First, Last);
+}
+
+// Assigns ValNo to the first suitable register of the list matching ValVT.
+// A register is suitable if it is not allocated yet (it gets allocated here)
+// or, on 64-bit targets only, if it is shadow allocated. Reaching the end of
+// the list without a match is treated as unreachable.
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
+                                            MVT &LocVT,
+                                            CCValAssign::LocInfo &LocInfo,
+                                            ISD::ArgFlagsTy &ArgFlags,
+                                            CCState &State) {
+
+  ArrayRef<MCPhysReg> Candidates = CC_X86_VectorCallGetSSEs(ValVT);
+  const X86Subtarget &Subtarget = static_cast<const X86Subtarget &>(
+      State.getMachineFunction().getSubtarget());
+  const bool Is64bit = Subtarget.is64Bit();
+
+  for (MCPhysReg Candidate : Candidates) {
+    const bool IsFree = !State.isAllocated(Candidate);
+
+    // An allocated register is only usable when it is a shadow allocation
+    // on a 64-bit target.
+    if (!IsFree && !(Is64bit && State.IsShadowAllocatedReg(Candidate)))
+      continue;
+
+    unsigned LocReg = Candidate;
+    if (IsFree) {
+      // Not marked as allocated yet - claim it now.
+      LocReg = State.AllocateReg(Candidate);
+      assert(LocReg == Candidate && "Expecting a valid register allocation");
+    }
+
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, LocReg, LocVT, LocInfo));
+    return true;
+  }
+
+  llvm_unreachable("Clang should ensure that hva marked vectors will have "
+                   "an available register.");
+  return false;
+}
+
+// Custom vectorcall handling for 64-bit targets.
+// A true result stops the search through the remaining rules, false
+// continues it.
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // The second pass deals with HVAs only; everything else is already done.
+  if (ArgFlags.isSecArgPass())
+    return ArgFlags.isHva()
+               ? CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                                 ArgFlags, State)
+               : true;
+
+  // Only vector types in the sense of the vectorcall spec are processed:
+  // "A vector type is either a floating-point type, for example,
+  // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  const bool IsVectorCallType =
+      ValVT.isFloatingPoint() ||
+      (ValVT.isVector() && ValVT.getSizeInBits() >= 128);
+  if (!IsVectorCallType) {
+    // R9 being taken means we are past the fourth element; since this is
+    // neither an HVA nor a vector type, a shadow XMM register has to be
+    // allocated.
+    if (State.isAllocated(X86::R9))
+      (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+
+    return false;
+  }
+
+  const bool IsHva = ArgFlags.isHva();
+  if (!IsHva || ArgFlags.isHvaStart()) {
+    // Shadow GPR register.
+    (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
+
+    // XMM register - a shadow for an HVA, the real location otherwise.
+    const unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+    if (Reg != 0) {
+      // Vectorcall may add shadow stack on top of the basic 32 bytes of
+      // win64: when the fifth or sixth argument is a vector type or an HVA,
+      // 8 bytes of shadow stack are allocated for that argument.
+      if (Reg == X86::XMM4 || Reg == X86::XMM5)
+        State.AllocateStack(8, 8);
+
+      if (!IsHva) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+        return true; // Allocated a register - Stop the search.
+      }
+    }
+  }
+
+  // Stop the search for an HVA, continue it for anything else.
+  return IsHva;
+}
+
+// Custom vectorcall handling for 32-bit targets.
+// A true result stops the search through the remaining rules, false
+// continues it.
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // The second pass deals with HVAs only; everything else is already done.
+  if (ArgFlags.isSecArgPass())
+    return ArgFlags.isHva()
+               ? CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                                 ArgFlags, State)
+               : true;
+
+  // Only vector types in the sense of the vectorcall spec are processed:
+  // "A vector type is either a floating point type, for example,
+  // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  const bool IsVectorCallType =
+      ValVT.isFloatingPoint() ||
+      (ValVT.isVector() && ValVT.getSizeInBits() >= 128);
+  if (!IsVectorCallType)
+    return false;
+
+  // HVAs get their registers on the second pass - stop the search for now.
+  if (ArgFlags.isHva())
+    return true;
+
+  // Try to place the value in an XMM register.
+  const unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+  if (Reg != 0) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+
+  // No XMM register was available. A vector is then passed indirectly;
+  // this resembles CCPassIndirect with inreg added.
+  if (!ValVT.isFloatingPoint()) {
+    LocVT = MVT::i32;
+    LocInfo = CCValAssign::Indirect;
+    ArgFlags.setInReg();
+  }
+
+  return false; // No register was assigned - Continue the search.
+}
+
+} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h
new file mode 100644
index 000000000000..c49a6838fa44
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h
@@ -0,0 +1,121 @@
+//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the X86 Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+/// When regcall calling convention compiled to 32 bit arch, special treatment
+/// is required for 64 bit masks.
+/// The value should be assigned to two GPRs.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 64 bit arch.
+/// For HVAs shadow registers might be allocated on the first pass
+/// and actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 32 bit arch.
+/// For HVAs actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+ CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+ CCState &) {
+ llvm_unreachable("The AnyReg calling convention is only supported by the " \
+ "stackmap and patchpoint intrinsics.");
+ // gracefully fallback to X86 C calling convention on Release builds.
+ return false;
+}
+
+inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
+ // not to split i64 and double between a register and stack
+ static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
+ static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]);
+
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // If this is the first part of a double/i64/i128, or if we're already
+ // in the middle of a split, add to the pending list. If this is not
+ // the end of the split, return, otherwise go on to process the pending
+ // list
+ if (ArgFlags.isSplit() || !PendingMembers.empty()) {
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+ if (!ArgFlags.isSplitEnd())
+ return true;
+ }
+
+ // If there are no pending members, we are not in the middle of a split,
+ // so do the usual inreg stuff.
+ if (PendingMembers.empty()) {
+ if (unsigned Reg = State.AllocateReg(RegList)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false;
+ }
+
+ assert(ArgFlags.isSplitEnd());
+
+ // We now have the entire original argument in PendingMembers, so decide
+ // whether to use registers or the stack.
+ // Per the MCU ABI:
+ // a) To use registers, we need to have enough of them free to contain
+ // the entire argument.
+ // b) We never want to use more than 2 registers for a single argument.
+
+ unsigned FirstFree = State.getFirstUnallocated(RegList);
+ bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
+
+ for (auto &It : PendingMembers) {
+ if (UseRegs)
+ It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
+ else
+ It.convertToMem(State.AllocateStack(4, 4));
+ State.addLoc(It);
+ }
+
+ PendingMembers.clear();
+
+ return true;
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 000000000000..cf7bc981b8a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,1121 @@
+//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("static_cast<const X86Subtarget&>"
+ "(State.getMachineFunction().getSubtarget()).", F),
+ A>;
+
+// Register classes for RegCall
+class RC_X86_RegCall {
+ list<Register> GPR_8 = [];
+ list<Register> GPR_16 = [];
+ list<Register> GPR_32 = [];
+ list<Register> GPR_64 = [];
+ list<Register> FP_CALL = [FP0];
+ list<Register> FP_RET = [FP0, FP1];
+ list<Register> XMM = [];
+ list<Register> YMM = [];
+ list<Register> ZMM = [];
+}
+
+// RegCall register classes for 32 bits
+def RC_X86_32_RegCall : RC_X86_RegCall {
+ let GPR_8 = [AL, CL, DL, DIL, SIL];
+ let GPR_16 = [AX, CX, DX, DI, SI];
+ let GPR_32 = [EAX, ECX, EDX, EDI, ESI];
+ let GPR_64 = [RAX]; ///< Not actually used, but AssignToReg can't handle []
+ ///< \todo Fix AssignToReg to enable empty lists
+ let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7];
+ let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7];
+ let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7];
+}
+
+class RC_X86_64_RegCall : RC_X86_RegCall {
+ let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15];
+ let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15];
+ let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7,
+ ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM14, ZMM15];
+}
+
+def RC_X86_64_RegCall_Win : RC_X86_64_RegCall {
+ let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R10B, R11B, R12B, R14B, R15B];
+ let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R10W, R11W, R12W, R14W, R15W];
+ let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R10D, R11D, R12D, R14D, R15D];
+ let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15];
+}
+
+def RC_X86_64_RegCall_SysV : RC_X86_64_RegCall {
+ let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R12B, R13B, R14B, R15B];
+ let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R12W, R13W, R14W, R15W];
+ let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R12D, R13D, R14D, R15D];
+ let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R12, R13, R14, R15];
+}
+
+// Intel regcall calling convention (shared by the X86-32 and X86-64 variants).
+multiclass X86_RegCall_base<RC_X86_RegCall RC> {
+def CC_#NAME : CallingConv<[
+ // Handles byval parameters.
+ CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>,
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // Promote v8i1/v16i1/v32i1 arguments to i32.
+ CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>,
+
+ // bool, char, int, enum, long, pointer --> GPR
+ CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+ // long long, __int64 --> GPR
+ CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+ // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
+ CCIfType<[v64i1], CCPromoteToType<i64>>,
+ CCIfSubtarget<"is64Bit()", CCIfType<[i64],
+ CCAssignToReg<RC.GPR_64>>>,
+ CCIfSubtarget<"is32Bit()", CCIfType<[i64],
+ CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
+
+ // float, double, float128 --> XMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[f32, f64, f128],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // long double --> FP
+ CCIfType<[f80], CCAssignToReg<RC.FP_CALL>>,
+
+ // __m128, __m128i, __m128d --> XMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // __m256, __m256i, __m256d --> YMM
+ // In the case of AVX disabled --> save to stack
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>,
+
+ // __m512, __m512i, __m512d --> ZMM
+ // In the case of AVX-512 disabled --> save to stack
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",CCAssignToReg<RC.ZMM>>>,
+
+ // If no register was found -> assign to stack
+
+ // In 64 bit, assign 64/32 bit values to 8 byte stack
+ CCIfSubtarget<"is64Bit()", CCIfType<[i32, i64, f32, f64],
+ CCAssignToStack<8, 8>>>,
+
+ // In 32 bit, assign 64/32 bit values to 8/4 byte stack
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 4>>,
+
+ // MMX type gets an 8-byte stack slot, while the alignment depends on the target
+ CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>,
+ CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+ // Long double and float128 get stack slots whose size and alignment depend
+ // on the subtarget.
+ CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToStack<16, 16>>,
+
+ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>>
+]>;
+
+def RetCC_#NAME : CallingConv<[
+ // Promote i1, v8i1 arguments to i8.
+ CCIfType<[i1, v8i1], CCPromoteToType<i8>>,
+
+ // Promote v16i1 arguments to i16.
+ CCIfType<[v16i1], CCPromoteToType<i16>>,
+
+ // Promote v32i1 arguments to i32.
+ CCIfType<[v32i1], CCPromoteToType<i32>>,
+
+ // bool, char, int, enum, long, pointer --> GPR
+ CCIfType<[i8], CCAssignToReg<RC.GPR_8>>,
+ CCIfType<[i16], CCAssignToReg<RC.GPR_16>>,
+ CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+ // long long, __int64 --> GPR
+ CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+ // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
+ CCIfType<[v64i1], CCPromoteToType<i64>>,
+ CCIfSubtarget<"is64Bit()", CCIfType<[i64],
+ CCAssignToReg<RC.GPR_64>>>,
+ CCIfSubtarget<"is32Bit()", CCIfType<[i64],
+ CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
+
+ // long double --> FP
+ CCIfType<[f80], CCAssignToReg<RC.FP_RET>>,
+
+ // float, double, float128 --> XMM
+ CCIfType<[f32, f64, f128],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // __m128, __m128i, __m128d --> XMM
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // __m256, __m256i, __m256d --> YMM
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>,
+
+ // __m512, __m512i, __m512d --> ZMM
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()", CCAssignToReg<RC.ZMM>>>
+]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+ // Scalar values are returned in AX first, then DX. For i8, the ABI
+ // requires the values to be in AL and AH, however this code uses AL and DL
+ // instead. This is because using AH for the second register conflicts with
+ // the way LLVM does multiple return values -- a return of {i16,i8} would end
+ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+ // for functions that return two i8 values are currently expected to pack the
+ // values into an i16 (which uses AX, and thus AL:AH).
+ //
+ // For code that doesn't care about the ABI, we allow returning more than two
+ // integer values in registers.
+ CCIfType<[i1], CCPromoteToType<i8>>,
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
+
+ // Boolean vectors of AVX-512 are returned in SIMD registers.
+ // The call from AVX to AVX-512 function should work,
+ // since the boolean types in AVX/AVX2 are promoted by default.
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i32>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i16>>,
+ CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+ CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
+ // can only be used by ABI non-compliant code. If the target doesn't have XMM
+ // registers, it won't have vector types.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit vectors are returned in YMM0 and YMM1, when they fit. YMM2 and YMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX target feature.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX-512 target feature.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+ // MMX vector types are always returned in MM0. If the target doesn't have
+ // MM0, it doesn't support these vector types.
+ CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
+
+ // Long double types are always returned in FP0 (even with SSE).
+ CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+ // The X86-32 calling convention returns FP values in FP0, unless marked
+ // with "inreg" (used here to distinguish one kind of reg from another,
+ // weirdly; this is really the sse-regparm calling convention) in which
+ // case they use XMM0, otherwise it is the same as the common X86 calling
+ // conv.
+ CCIfInReg<CCIfSubtarget<"hasSSE2()",
+ CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+ CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+ // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+ // SSE2.
+ // This can happen when a float, 2 x float, or 3 x float vector is split by
+ // target lowering, and is returned in 1-3 sse regs.
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+ // For integers, ECX can be used as an extra return register
+ CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+ // Otherwise, it is the same as the common X86 calling convention.
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+ // Vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit FP vectors
+ // No more than 4 registers
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // 512-bit FP vectors
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+ // i32, i64 in the standard way
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 HiPE return-value convention.
+def RetCC_X86_32_HiPE : CallingConv<[
+ // Promote all types to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
+]>;
+
+// X86-32 Vectorcall return-value convention.
+def RetCC_X86_32_VectorCall : CallingConv<[
+ // Floating Point types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[f32, f64, f128],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // Return integers in the standard way.
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+ // The X86-64 calling convention always returns FP values in XMM0.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
+
+ // MMX vector types are always returned in XMM0.
+ CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+ // The X86-Win64 calling convention always returns __m64 values in RAX.
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+ // Otherwise, everything is the same as 'normal' X86-64 C CC.
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// X86-64 vectorcall return-value convention.
+def RetCC_X86_64_Vectorcall : CallingConv<[
+ // Vectorcall calling convention always returns FP values in XMMs.
+ CCIfType<[f32, f64, f128],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Otherwise, everything is the same as Windows X86-64 C CC.
+ CCDelegateTo<RetCC_X86_Win64_C>
+]>;
+
+// X86-64 HiPE return-value convention.
+def RetCC_X86_64_HiPE : CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
+]>;
+
+// X86-64 WebKit_JS return-value convention.
+def RetCC_X86_64_WebKit_JS : CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: RAX
+ CCIfType<[i64], CCAssignToReg<[RAX]>>
+]>;
+
+def RetCC_X86_64_Swift : CallingConv<[
+
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+ // For integers, ECX, R8D can be used as extra return registers.
+ CCIfType<[i1], CCPromoteToType<i8>>,
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX, R8D]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX, R8]>>,
+
+ // XMM0, XMM1, XMM2 and XMM3 can be used to return FP values.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def RetCC_X86_64_AnyReg : CallingConv<[
+ CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+// X86-64 HHVM return-value convention.
+def RetCC_X86_64_HHVM: CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: could return in any GP register save RSP and R12.
+ CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9,
+ RAX, R10, R11, R13, R14, R15]>>
+]>;
+
+
+defm X86_32_RegCall :
+ X86_RegCall_base<RC_X86_32_RegCall>;
+defm X86_Win64_RegCall :
+ X86_RegCall_base<RC_X86_64_RegCall_Win>;
+defm X86_SysV64_RegCall :
+ X86_RegCall_base<RC_X86_64_RegCall_SysV>;
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+ // If FastCC, use RetCC_X86_32_Fast.
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // If HiPE, use RetCC_X86_32_HiPE.
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_32_RegCall>>,
+
+ // Otherwise, use RetCC_X86_32_C.
+ CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+ // HiPE uses RetCC_X86_64_HiPE
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+
+ // Handle JavaScript calls.
+ CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>,
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>,
+
+ // Handle Swift calls.
+ CCIfCC<"CallingConv::Swift", CCDelegateTo<RetCC_X86_64_Swift>>,
+
+ // Handle explicit CC selection
+ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+
+ // Handle Vectorcall CC
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>,
+
+ // Handle HHVM calls.
+ CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
+
+ CCIfCC<"CallingConv::X86_RegCall",
+ CCIfSubtarget<"isTargetWin64()",
+ CCDelegateTo<RetCC_X86_Win64_RegCall>>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_SysV64_RegCall>>,
+
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+
+ // Check if this is the Intel OpenCL built-ins calling convention
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+ CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>,
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R13]>>>,
+
+ // A SwiftError is passed in R12.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+ // For Swift Calling Convention, pass sret in %RAX.
+ CCIfCC<"CallingConv::Swift",
+ CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+ // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
+ CCIfType<[x86mmx],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCIfSubtarget<"hasSSE2()",
+ CCPromoteToType<v2i64>>>>,
+
+ // Boolean vectors of AVX-512 are passed in SIMD registers.
+ // The call from AVX to AVX-512 function should work,
+ // since the boolean types in AVX/AVX2 are promoted by default.
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i32>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i16>>,
+ CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+ CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 8 256-bit vector arguments are passed in YMM registers, unless
+ // this is a vararg function.
+ // FIXME: This isn't precisely correct; the x86-64 ABI document says that
+ // fixed arguments to vararg functions are supposed to be passed in
+ // registers. Actually modeling that would be a lot of work, though.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasFp256()",
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
+ YMM4, YMM5, YMM6, YMM7]>>>>,
+
+ // The first 8 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Long doubles get stack slots whose size and alignment depends on the
+ // subtarget.
+ CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
+]>;
+
+// Calling convention for X86-64 HHVM.
+def CC_X86_64_HHVM : CallingConv<[
+ // Use all/any GP registers for args, except RSP.
+ CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15,
+ RDI, RSI, RDX, RCX, R8, R9,
+ RAX, R10, R11, R13, R14]>>
+]>;
+
+// Calling convention for helper functions in HHVM.
+def CC_X86_64_HHVM_C : CallingConv<[
+ // Pass the first argument in RBP.
+ CCIfType<[i64], CCAssignToReg<[RBP]>>,
+
+ // Otherwise it's the same as the regular C calling convention.
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+ // FIXME: Handle byval stuff.
+ // FIXME: Handle varargs.
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // 128 bit vectors are passed by pointer
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+
+ // 256 bit vectors are passed by pointer
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+
+ // 512 bit vectors are passed by pointer
+ CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+
+ // The first 4 MMX vector arguments are passed in GPRs.
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+ // The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Do not pass the sret argument in RCX, the Win64 thiscall calling
+ // convention requires "this" to be passed in RCX.
+ CCIfCC<"CallingConv::X86_ThisCall",
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ],
+ [XMM1, XMM2, XMM3]>>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The first 4 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+ [RCX , RDX , R8 , R9 ]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Long doubles get stack slots whose size and alignment depends on the
+ // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>
+]>;
+
+def CC_X86_Win64_VectorCall : CallingConv<[
+ CCCustom<"CC_X86_64_VectorCall">,
+
+ // Delegate to the Win64 C calling convention to handle integer types.
+ CCDelegateTo<CC_X86_Win64_C>
+]>;
+
+
+def CC_X86_64_GHC : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+ CCIfType<[i64],
+ CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>,
+
+ // Pass in STG registers: F1, F2, F3, F4, D1, D2
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
+]>;
+
+def CC_X86_64_HiPE : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3
+ CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_X86_64_WebKit_JS : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Only the first integer argument is passed in register.
+ CCIfType<[i32], CCAssignToReg<[EAX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX]>>,
+
+ // The remaining integer arguments are passed on the stack. 32bit integer and
+ // floating-point arguments are aligned to 4 byte and stored in 4 byte slots.
+ // 64bit integer and floating-point arguments are aligned to 8 byte and stored
+ // in 8 byte stack slots.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def CC_X86_64_AnyReg : CallingConv<[
+ CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector
+/// values are spilled on the stack.
+def CC_X86_32_Vector_Common : CallingConv<[
+ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit AVX-512 vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
+]>;
+
+// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in
+// vector registers
+def CC_X86_32_Vector_Standard : CallingConv<[
+ // SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2]>>>,
+
+ // AVX 256-bit vector arguments are passed in YMM registers.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasFp256()",
+ CCAssignToReg<[YMM0, YMM1, YMM2]>>>>,
+
+ // AVX 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
+
+ CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
+// CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in
+// vector registers.
+def CC_X86_32_Vector_Darwin : CallingConv<[
+ // SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+ // AVX 256-bit vector arguments are passed in YMM registers.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasFp256()",
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
+
+ // AVX 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
+
+ CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack.
+def CC_X86_32_Common : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // The first 3 float or double arguments, if marked 'inreg' and if the call
+ // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+ // The first 3 __m64 vector arguments are passed in mmx registers if the
+ // call is not a vararg call.
+ CCIfNotVarArg<CCIfType<[x86mmx],
+ CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Doubles get 8-byte slots that are 4-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+ // Long doubles get slots whose size depends on the subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+ // Boolean vectors of AVX-512 are passed in SIMD registers.
+ // The call from AVX to AVX-512 function should work,
+ // since the boolean types in AVX/AVX2 are promoted by default.
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i32>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i16>>,
+ CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+ CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+ // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+ // passed in the parameter area.
+ CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+ // Darwin passes vectors in a form that differs from the i386 psABI
+ CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>,
+
+ // Otherwise, drop to 'normal' X86-32 CC
+ CCDelegateTo<CC_X86_32_Vector_Standard>
+]>;
+
+def CC_X86_32_C : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_MCU : CallingConv<[
+ // Handles byval parameters. Note that, like FastCC, we can't rely on
+ // the delegation to CC_X86_32_Common because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // If the call is not a vararg call, some arguments may be passed
+ // in integer registers.
+ CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_Win32_VectorCall : CallingConv<[
+ // Pass floating point in XMMs
+ CCCustom<"CC_X86_32_VectorCall">,
+
+ // Delegate to fastcall to handle integer types.
+ CCDelegateTo<CC_X86_32_FastCall>
+]>;
+
+def CC_X86_32_ThisCall_Common : CallingConv<[
+ // The first integer argument is passed in ECX
+ CCIfType<[i32], CCAssignToReg<[ECX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall_Mingw : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall_Win : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // Pass sret arguments indirectly through stack.
+ CCIfSRet<CCAssignToStack<4, 4>>,
+
+ CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall : CallingConv<[
+ CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>,
+ CCDelegateTo<CC_X86_32_ThisCall_Win>
+]>;
+
+def CC_X86_32_FastCC : CallingConv<[
+ // Handles byval parameters. Note that we can't rely on the delegation
+ // to CC_X86_32_Common for this because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // The first 3 float or double arguments, if the call is not a vararg
+ // call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+ // Doubles get 8-byte slots that are 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_GHC : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1
+ CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>>
+]>;
+
+def CC_X86_32_HiPE : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2
+ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>
+]>;
+
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+
+ CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+ CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>,
+
+ CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>,
+ CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>,
+
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+
+ // The SSE vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The 256-bit vector arguments are passed in YMM registers.
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+
+ // The 512-bit vector arguments are passed in ZMM registers.
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+
+ // Pass masks in mask registers
+ CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
+
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
+ CCDelegateTo<CC_X86_32_C>
+]>;
+
+def CC_X86_32_Intr : CallingConv<[
+ CCAssignToStack<4, 4>
+]>;
+
+def CC_X86_64_Intr : CallingConv<[
+ CCAssignToStack<8, 8>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Root Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// This is the root argument convention for the X86-32 backend.
+def CC_X86_32 : CallingConv<[
+ // X86_INTR calling convention is valid in MCU target and should override the
+ // MCU calling convention. Thus, this should be checked before isTargetMCU().
+ CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
+ CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
+ CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
+ CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
+
+ // Otherwise, drop to normal X86-32 CC
+ CCDelegateTo<CC_X86_32_C>
+]>;
+
+// This is the root argument convention for the X86-64 backend.
+def CC_X86_64 : CallingConv<[
+ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+ CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>,
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
+ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
+ CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>,
+ CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>,
+ CCIfCC<"CallingConv::X86_RegCall",
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>,
+ CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>,
+
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
+// This is the argument convention used for the entire X86 backend.
+def CC_X86 : CallingConv<[
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
+ CCDelegateTo<CC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved Registers.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
+
+def CSR_64_SwiftError : CalleeSavedRegs<(sub CSR_64, R12)>;
+
+def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
+def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
+
+def CSR_Win64_NoSSE : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15)>;
+
+def CSR_Win64 : CalleeSavedRegs<(add CSR_Win64_NoSSE,
+ (sequence "XMM%u", 6, 15))>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// uses rdi to pass a single parameter and rax for the return value. All other
+// GPRs are preserved.
+def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI,
+ R8, R9, R10, R11)>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>;
+
+// All GPRs - except r11
+def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
+ R8, R9, R10, RSP)>;
+
+// All registers - except r11
+def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+ (sequence "XMM%u", 0, 15))>;
+def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+ (sequence "YMM%u", 0, 15))>;
+
+def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
+ R11, R12, R13, R14, R15, RBP,
+ (sequence "XMM%u", 0, 15))>;
+
+def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI,
+ EDI)>;
+def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "XMM%u", 0, 7))>;
+def CSR_32_AllRegs_AVX : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "YMM%u", 0, 7))>;
+def CSR_32_AllRegs_AVX512 : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "ZMM%u", 0, 7),
+ (sequence "K%u", 0, 7))>;
+
+def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX)>;
+def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
+ (sequence "YMM%u", 0, 15)),
+ (sequence "XMM%u", 0, 15))>;
+def CSR_64_AllRegs_AVX512 : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
+ (sequence "ZMM%u", 0, 31),
+ (sequence "K%u", 0, 7)),
+ (sequence "XMM%u", 0, 15))>;
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+ R13, R14, R15,
+ (sequence "YMM%u", 6, 15))>;
+
+def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
+ R12, R13, R14, R15,
+ (sequence "ZMM%u", 6, 21),
+ K4, K5, K6, K7)>;
+//Standard C + XMM 8-15
+def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
+ (sequence "XMM%u", 8, 15))>;
+
+//Standard C + YMM 8-15
+def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
+ (sequence "YMM%u", 8, 15))>;
+
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15,
+ (sequence "ZMM%u", 16, 31),
+ K4, K5, K6, K7)>;
+
+// Only R12 is preserved for PHP calls in HHVM.
+def CSR_64_HHVM : CalleeSavedRegs<(add R12)>;
+
+// RegCall preserves a few GPRs plus XMM4-7 (32-bit) or XMM8-15 (64-bit).
+def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP, ESP)>;
+def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE,
+ (sequence "XMM%u", 4, 7))>;
+def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
+ (sequence "R%u", 10, 15))>;
+def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE,
+ (sequence "XMM%u", 8, 15))>;
+def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
+ (sequence "R%u", 12, 15))>;
+def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
+ (sequence "XMM%u", 8, 15))>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
new file mode 100755
index 000000000000..bdd1ab537bb2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -0,0 +1,213 @@
+//===----------------------- X86EvexToVex.cpp ----------------------------===//
+// Compress EVEX instructions to VEX encoding when possible to reduce code size
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+/// \file
+/// This file defines the pass that goes over all AVX-512 instructions which
+/// are encoded using the EVEX prefix and if possible replaces them by their
+/// corresponding VEX encoding which is usually shorter by 2 bytes.
+/// EVEX instructions may be encoded via the VEX prefix when the AVX-512
+/// instruction has a corresponding AVX/AVX2 opcode and when it does not
+/// use the zmm or the mask registers or xmm/ymm registers with indexes
+/// higher than 15.
+/// The pass applies code reduction on the generated code for AVX-512 instrs.
+///
+//===---------------------------------------------------------------------===//
+
+#include "InstPrinter/X86InstComments.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86InstrTablesInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible"
+#define EVEX2VEX_NAME "x86-evex-to-vex-compress"
+
+#define DEBUG_TYPE EVEX2VEX_NAME
+
+namespace {
+
+class EvexToVexInstPass : public MachineFunctionPass {
+
+ /// X86EvexToVexCompressTable - Evex to Vex encoding opcode map.
+ typedef DenseMap<unsigned, uint16_t> EvexToVexTableType;
+ EvexToVexTableType EvexToVex128Table;
+ EvexToVexTableType EvexToVex256Table;
+
+ /// For EVEX instructions that can be encoded using VEX encoding, replace
+ /// them by the VEX encoding in order to reduce size.
+ bool CompressEvexToVexImpl(MachineInstr &MI) const;
+
+ /// Adds an entry to \p EvexToVexTable that maps the AVX-512 EVEX opcode
+ /// \p EvexOp to its corresponding AVX/AVX2 (VEX) opcode \p VexOp.
+ void AddTableEntry(EvexToVexTableType &EvexToVexTable, uint16_t EvexOp,
+ uint16_t VexOp);
+
+public:
+ static char ID;
+
+ StringRef getPassName() const override { return EVEX2VEX_DESC; }
+
+ EvexToVexInstPass() : MachineFunctionPass(ID) {
+ initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
+
+ // Initialize the EVEX to VEX 128 table map.
+ for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex128CompressTable) {
+ AddTableEntry(EvexToVex128Table, Entry.EvexOpcode, Entry.VexOpcode);
+ }
+
+ // Initialize the EVEX to VEX 256 table map.
+ for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex256CompressTable) {
+ AddTableEntry(EvexToVex256Table, Entry.EvexOpcode, Entry.VexOpcode);
+ }
+ }
+
+ /// Loop over all of the basic blocks, replacing EVEX instructions
+ /// by equivalent VEX instructions when possible for reducing code size.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // This pass runs after regalloc and doesn't support VReg operands.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ /// Machine instruction info used throughout the class.
+ const X86InstrInfo *TII;
+};
+
+char EvexToVexInstPass::ID = 0;
+}
+
+INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
+
+FunctionPass *llvm::createX86EvexToVexInsts() {
+ return new EvexToVexInstPass();
+}
+
+bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ if (!ST.hasAVX512())
+ return false;
+
+ bool Changed = false;
+
+ /// Go over all basic blocks in function and replace
+ /// EVEX encoded instrs by VEX encoding when possible.
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB)
+ Changed |= CompressEvexToVexImpl(MI);
+ }
+
+ return Changed;
+}
+
+void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable,
+ uint16_t EvexOp, uint16_t VexOp) {
+ EvexToVexTable[EvexOp] = VexOp;
+}
+
+// For EVEX instructions that can be encoded using VEX encoding
+// replace them by the VEX encoding in order to reduce size.
+bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
+
+ // VEX format.
+ // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1
+ // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM]
+ //
+ // EVEX format.
+ // # of bytes: 4 1 1 1 4 / 1 1
+ // [Prefixes] EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate]
+
+ const MCInstrDesc &Desc = MI.getDesc();
+
+ // Check for EVEX instructions only.
+ if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX)
+ return false;
+
+ // Check for EVEX instructions with mask or broadcast as in these cases
+ // the EVEX prefix is needed in order to carry this information
+ // thus preventing the transformation to VEX encoding.
+ if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B))
+ return false;
+
+ // Check for non EVEX_V512 instrs only.
+ // EVEX_V512 instr: bit EVEX_L2 = 1; bit VEX_L = 0.
+ if ((Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L))
+ return false;
+
+ // EVEX_V128 instr: bit EVEX_L2 = 0, bit VEX_L = 0.
+ bool IsEVEX_V128 =
+ (!(Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L));
+
+ // EVEX_V256 instr: bit EVEX_L2 = 0, bit VEX_L = 1.
+ bool IsEVEX_V256 =
+ (!(Desc.TSFlags & X86II::EVEX_L2) && (Desc.TSFlags & X86II::VEX_L));
+
+ unsigned NewOpc = 0;
+
+ // Check for EVEX_V256 instructions.
+ if (IsEVEX_V256) {
+ // Search for opcode in the EvexToVex256 table.
+ auto It = EvexToVex256Table.find(MI.getOpcode());
+ if (It != EvexToVex256Table.end())
+ NewOpc = It->second;
+ }
+
+ // Check for EVEX_V128 or Scalar instructions.
+ else if (IsEVEX_V128) {
+ // Search for opcode in the EvexToVex128 table.
+ auto It = EvexToVex128Table.find(MI.getOpcode());
+ if (It != EvexToVex128Table.end())
+ NewOpc = It->second;
+ }
+
+ if (!NewOpc)
+ return false;
+
+ auto isHiRegIdx = [](unsigned Reg) {
+ // Check for XMM register with indexes between 16 - 31.
+ if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
+ return true;
+
+ // Check for YMM register with indexes between 16 - 31.
+ if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
+ return true;
+
+ return false;
+ };
+
+ // Check that operands are not ZMM regs or
+ // XMM/YMM regs with hi indexes between 16 - 31.
+ for (const MachineOperand &MO : MI.explicit_operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+ assert (!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31));
+
+ if (isHiRegIdx(Reg))
+ return false;
+ }
+
+ const MCInstrDesc &MCID = TII->get(NewOpc);
+ MI.setDesc(MCID);
+ MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
new file mode 100644
index 000000000000..192b942490e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -0,0 +1,297 @@
+//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, other late
+// optimizations, or simply the encoding of the instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
+#include "llvm/IR/GlobalValue.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-pseudo"
+
+namespace {
+class X86ExpandPseudo : public MachineFunctionPass {
+public:
+ static char ID;
+ X86ExpandPseudo() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const X86Subtarget *STI;
+ const X86InstrInfo *TII;
+ const X86RegisterInfo *TRI;
+ const X86MachineFunctionInfo *X86FI;
+ const X86FrameLowering *X86FL;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "X86 pseudo instruction expansion pass";
+ }
+
+private:
+ bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool ExpandMBB(MachineBasicBlock &MBB);
+};
+char X86ExpandPseudo::ID = 0;
+} // End anonymous namespace.
+
+/// If \p MBBI is a pseudo instruction, this method expands
+/// it to the corresponding (sequence of) actual instruction(s).
+/// \returns true if \p MBBI has been expanded.
+bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ DebugLoc DL = MBBI->getDebugLoc();
+ switch (Opcode) {
+ default:
+ return false;
+ case X86::TCRETURNdi:
+ case X86::TCRETURNdicc:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNdi64cc:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64: {
+ bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+ // Incorporate the retaddr area.
+ Offset = StackAdj - MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
+
+ if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) {
+ assert(Offset == 0 && "Conditional tail call cannot adjust the stack.");
+ }
+
+ if (Offset) {
+ // Check for possible merge with preceding ADD instruction.
+ Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
+ X86FL->emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ }
+
+ // Jump to label or value in register.
+ bool IsWin64 = STI->isTargetWin64();
+ if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc ||
+ Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) {
+ unsigned Op;
+ switch (Opcode) {
+ case X86::TCRETURNdi:
+ Op = X86::TAILJMPd;
+ break;
+ case X86::TCRETURNdicc:
+ Op = X86::TAILJMPd_CC;
+ break;
+ case X86::TCRETURNdi64cc:
+ assert(!IsWin64 && "Conditional tail calls confuse the Win64 unwinder.");
+ // TODO: We could do it for Win64 "leaf" functions though; PR30337.
+ Op = X86::TAILJMPd64_CC;
+ break;
+ default:
+ // Note: Win64 uses REX prefixes for indirect jumps out of functions,
+ // but not for direct ones.
+ Op = X86::TAILJMPd64;
+ break;
+ }
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
+ if (JumpTarget.isGlobal()) {
+ MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+ JumpTarget.getTargetFlags());
+ } else {
+ assert(JumpTarget.isSymbol());
+ MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+ JumpTarget.getTargetFlags());
+ }
+ if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) {
+ MIB.addImm(MBBI->getOperand(2).getImm());
+ }
+
+ } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) {
+ unsigned Op = (Opcode == X86::TCRETURNmi)
+ ? X86::TAILJMPm
+ : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
+ for (unsigned i = 0; i != 5; ++i)
+ MIB.addOperand(MBBI->getOperand(i));
+ } else if (Opcode == X86::TCRETURNri64) {
+ BuildMI(MBB, MBBI, DL,
+ TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
+ .addReg(JumpTarget.getReg(), RegState::Kill);
+ } else {
+ BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
+ .addReg(JumpTarget.getReg(), RegState::Kill);
+ }
+
+ MachineInstr &NewMI = *std::prev(MBBI);
+ NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+
+ return true;
+ }
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ MachineOperand &DestAddr = MBBI->getOperand(0);
+ assert(DestAddr.isReg() && "Offset should be in register!");
+ const bool Uses64BitFramePtr =
+ STI->isTarget64BitLP64() || STI->isTargetNaCl64();
+ unsigned StackPtr = TRI->getStackRegister();
+ BuildMI(MBB, MBBI, DL,
+ TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr)
+ .addReg(DestAddr.getReg());
+ // The EH_RETURN pseudo is really removed during the MC Lowering.
+ return true;
+ }
+ case X86::IRET: {
+ // Adjust stack to erase error code
+ int64_t StackAdj = MBBI->getOperand(0).getImm();
+ X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true);
+ // Replace pseudo with machine iret
+ BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::RET: {
+ // Return, popping StackAdj bytes of stack as part of the return.
+ int64_t StackAdj = MBBI->getOperand(0).getImm();
+ MachineInstrBuilder MIB;
+ if (StackAdj == 0) {
+ MIB = BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::RETQ : X86::RETL));
+ } else if (isUInt<16>(StackAdj)) {
+ MIB = BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::RETIQ : X86::RETIL))
+ .addImm(StackAdj);
+ } else {
+ assert(!STI->is64Bit() &&
+ "shouldn't need to do this for x86_64 targets!");
+ // A ret can only handle immediates as big as 2**16-1. If we need to pop
+ // off bytes before the return address, we must do it manually.
+ BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define);
+ X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
+ BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX);
+ MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
+ }
+ for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I)
+ MIB.addOperand(MBBI->getOperand(I));
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::EH_RESTORE: {
+ // Restore ESP and EBP, and optionally ESI if required.
+ bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
+ MBB.getParent()->getFunction()->getPersonalityFn()));
+ X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH);
+ MBBI->eraseFromParent();
+ return true;
+ }
+ case X86::LCMPXCHG8B_SAVE_EBX:
+ case X86::LCMPXCHG16B_SAVE_RBX: {
+ // Perform the following transformation.
+ // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
+ // =>
+ // [E|R]BX = InArg
+ // actualcmpxchg Addr
+ // [E|R]BX = SaveRbx
+ const MachineOperand &InArg = MBBI->getOperand(6);
+ unsigned SaveRbx = MBBI->getOperand(7).getReg();
+
+ unsigned ActualInArg =
+ Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
+ // Copy the input argument of the pseudo into the argument of the
+ // actual instruction.
+ TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, InArg.getReg(),
+ InArg.isKill());
+ // Create the actual instruction.
+ unsigned ActualOpc =
+ Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::LCMPXCHG8B : X86::LCMPXCHG16B;
+ MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(ActualOpc));
+ // Copy the operands related to the address.
+ for (unsigned Idx = 1; Idx < 6; ++Idx)
+ NewInstr->addOperand(MBBI->getOperand(Idx));
+ // Finally, restore the value of RBX.
+ TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, SaveRbx,
+ /*SrcIsKill*/ true);
+
+ // Delete the pseudo.
+ MBBI->eraseFromParent();
+ return true;
+ }
+ }
+ llvm_unreachable("Previous switch has a fallthrough?");
+}
+
+/// Expand all pseudo instructions contained in \p MBB.
+/// \returns true if any expansion occurred for \p MBB.
+bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ // MBBI may be invalidated by the expansion.
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= ExpandMI(MBB, MBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+ STI = &static_cast<const X86Subtarget &>(MF.getSubtarget());
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ X86FL = STI->getFrameLowering();
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : MF)
+ Modified |= ExpandMBB(MBB);
+ return Modified;
+}
+
+/// Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createX86ExpandPseudoPass() {
+ return new X86ExpandPseudo();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 000000000000..c890fdd1e519
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,3933 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86CallingConv.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+ namespace {
+
+ /// FastISel implementation for X86. Hand-written selection hooks are declared
+ /// below; the tablegen-generated ones come from X86GenFastISel.inc.
+ class X86FastISel final : public FastISel {
+   /// Subtarget of the function being compiled, kept around so that code
+   /// generation decisions can depend on the target's features.
+   const X86Subtarget *Subtarget;
+
+   /// X86ScalarSSEf64, X86ScalarSSEf32 - Choose between SSE and x87 for
+   /// scalar floating point: f64 uses SSE when SSE2 is available, f32 uses
+   /// SSE when SSE1 is available.
+   bool X86ScalarSSEf64;
+   bool X86ScalarSSEf32;
+
+ public:
+   explicit X86FastISel(FunctionLoweringInfo &funcInfo,
+                        const TargetLibraryInfo *libInfo)
+       : FastISel(funcInfo, libInfo),
+         Subtarget(&funcInfo.MF->getSubtarget<X86Subtarget>()),
+         X86ScalarSSEf64(Subtarget->hasSSE2()),
+         X86ScalarSSEf32(Subtarget->hasSSE1()) {}
+
+   bool fastSelectInstruction(const Instruction *I) override;
+
+   /// \brief Operand \p OpNo of \p MI is a vreg whose value is produced by the
+   /// load \p LI. Try to fold that load into \p MI as a memory operand.
+   /// Returns true if the fold was performed.
+   bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                            const LoadInst *LI) override;
+
+   bool fastLowerArguments() override;
+   bool fastLowerCall(CallLoweringInfo &CLI) override;
+   bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
+ #include "X86GenFastISel.inc"
+
+ private:
+   // Emission helpers.
+   bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
+                           const DebugLoc &DL);
+
+   bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
+                        unsigned &ResultReg, unsigned Alignment = 1);
+
+   bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
+                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
+   bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+                         X86AddressMode &AM,
+                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
+
+   bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+                          unsigned &ResultReg);
+
+   // Address matching.
+   bool X86SelectAddress(const Value *V, X86AddressMode &AM);
+   bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
+
+   // Per-instruction selectors.
+   bool X86SelectLoad(const Instruction *I);
+
+   bool X86SelectStore(const Instruction *I);
+
+   bool X86SelectRet(const Instruction *I);
+
+   bool X86SelectCmp(const Instruction *I);
+
+   bool X86SelectZExt(const Instruction *I);
+
+   bool X86SelectBranch(const Instruction *I);
+
+   bool X86SelectShift(const Instruction *I);
+
+   bool X86SelectDivRem(const Instruction *I);
+
+   bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
+
+   bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
+
+   bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
+
+   bool X86SelectSelect(const Instruction *I);
+
+   bool X86SelectTrunc(const Instruction *I);
+
+   bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
+                                const TargetRegisterClass *RC);
+
+   bool X86SelectFPExt(const Instruction *I);
+   bool X86SelectFPTrunc(const Instruction *I);
+   bool X86SelectSIToFP(const Instruction *I);
+
+   /// Instruction info of the current subtarget.
+   const X86InstrInfo *getInstrInfo() const {
+     return Subtarget->getInstrInfo();
+   }
+   /// The target machine, viewed as an X86TargetMachine.
+   const X86TargetMachine *getTargetMachine() const {
+     return static_cast<const X86TargetMachine *>(&TM);
+   }
+
+   bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
+
+   // Constant materialization.
+   unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
+   unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
+   unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
+   unsigned fastMaterializeConstant(const Constant *C) override;
+
+   unsigned fastMaterializeAlloca(const AllocaInst *C) override;
+
+   unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
+
+   /// isScalarFPTypeInSSEReg - Return true if the scalar FP type \p VT lives
+   /// in an SSE register rather than on the X87 floating point stack.
+   bool isScalarFPTypeInSSEReg(EVT VT) const {
+     if (VT == MVT::f64)
+       return X86ScalarSSEf64; // f64 needs SSE2
+     if (VT == MVT::f32)
+       return X86ScalarSSEf32; // f32 needs SSE1
+     return false;
+   }
+
+   bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
+
+   bool IsMemcpySmall(uint64_t Len);
+
+   bool TryEmitSmallMemcpy(X86AddressMode DestAM,
+                           X86AddressMode SrcAM, uint64_t Len);
+
+   bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+                             const Value *Cond);
+
+   const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+                                             X86AddressMode &AM);
+
+   unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
+                              const TargetRegisterClass *RC, unsigned Op0,
+                              bool Op0IsKill, unsigned Op1, bool Op1IsKill,
+                              unsigned Op2, bool Op2IsKill, unsigned Op3,
+                              bool Op3IsKill);
+ };
+
+ } // end anonymous namespace.
+
+ /// Map an IR compare predicate to an X86 condition code.
+ /// \returns the condition code together with a flag telling whether the
+ /// compare operands must be swapped for that code to apply. Predicates
+ /// without a single matching condition code (including FCMP_OEQ and
+ /// FCMP_UNE) yield X86::COND_INVALID.
+ static std::pair<X86::CondCode, bool>
+ getX86ConditionCode(CmpInst::Predicate Predicate) {
+   switch (Predicate) {
+   default:
+     return std::make_pair(X86::COND_INVALID, false);
+
+   // Floating-point predicates that need no operand swap.
+   case CmpInst::FCMP_UEQ: return std::make_pair(X86::COND_E, false);
+   case CmpInst::FCMP_OGT: return std::make_pair(X86::COND_A, false);
+   case CmpInst::FCMP_OGE: return std::make_pair(X86::COND_AE, false);
+   case CmpInst::FCMP_ULT: return std::make_pair(X86::COND_B, false);
+   case CmpInst::FCMP_ULE: return std::make_pair(X86::COND_BE, false);
+   case CmpInst::FCMP_ONE: return std::make_pair(X86::COND_NE, false);
+   case CmpInst::FCMP_UNO: return std::make_pair(X86::COND_P, false);
+   case CmpInst::FCMP_ORD: return std::make_pair(X86::COND_NP, false);
+
+   // Floating-point predicates expressed through their mirrored form.
+   case CmpInst::FCMP_OLT: return std::make_pair(X86::COND_A, true);
+   case CmpInst::FCMP_OLE: return std::make_pair(X86::COND_AE, true);
+   case CmpInst::FCMP_UGT: return std::make_pair(X86::COND_B, true);
+   case CmpInst::FCMP_UGE: return std::make_pair(X86::COND_BE, true);
+
+   // No single condition code is assigned to these two.
+   case CmpInst::FCMP_OEQ:
+   case CmpInst::FCMP_UNE:
+     return std::make_pair(X86::COND_INVALID, false);
+
+   // Integer predicates.
+   case CmpInst::ICMP_EQ:  return std::make_pair(X86::COND_E, false);
+   case CmpInst::ICMP_NE:  return std::make_pair(X86::COND_NE, false);
+   case CmpInst::ICMP_UGT: return std::make_pair(X86::COND_A, false);
+   case CmpInst::ICMP_UGE: return std::make_pair(X86::COND_AE, false);
+   case CmpInst::ICMP_ULT: return std::make_pair(X86::COND_B, false);
+   case CmpInst::ICMP_ULE: return std::make_pair(X86::COND_BE, false);
+   case CmpInst::ICMP_SGT: return std::make_pair(X86::COND_G, false);
+   case CmpInst::ICMP_SGE: return std::make_pair(X86::COND_GE, false);
+   case CmpInst::ICMP_SLT: return std::make_pair(X86::COND_L, false);
+   case CmpInst::ICMP_SLE: return std::make_pair(X86::COND_LE, false);
+   }
+ }
+
+ /// Map a floating-point compare predicate to an SSE compare immediate.
+ /// \returns the immediate together with a flag telling whether the compare
+ /// operands must be swapped. Only FCMP predicates are expected here.
+ static std::pair<unsigned, bool>
+ getX86SSEConditionCode(CmpInst::Predicate Predicate) {
+   // SSE Condition code mapping:
+   //  0 - EQ
+   //  1 - LT
+   //  2 - LE
+   //  3 - UNORD
+   //  4 - NEQ
+   //  5 - NLT
+   //  6 - NLE
+   //  7 - ORD
+   switch (Predicate) {
+   default: llvm_unreachable("Unexpected predicate");
+   case CmpInst::FCMP_OEQ: return std::make_pair(0u, false);
+   case CmpInst::FCMP_OLT: return std::make_pair(1u, false);
+   case CmpInst::FCMP_OGT: return std::make_pair(1u, true);
+   case CmpInst::FCMP_OLE: return std::make_pair(2u, false);
+   case CmpInst::FCMP_OGE: return std::make_pair(2u, true);
+   case CmpInst::FCMP_UNO: return std::make_pair(3u, false);
+   case CmpInst::FCMP_UNE: return std::make_pair(4u, false);
+   case CmpInst::FCMP_UGE: return std::make_pair(5u, false);
+   case CmpInst::FCMP_ULE: return std::make_pair(5u, true);
+   case CmpInst::FCMP_UGT: return std::make_pair(6u, false);
+   case CmpInst::FCMP_ULT: return std::make_pair(6u, true);
+   case CmpInst::FCMP_ORD: return std::make_pair(7u, false);
+   // 8 is not one of the encodings listed above; presumably callers treat it
+   // as "not directly encodable" - confirm at the call sites.
+   case CmpInst::FCMP_UEQ:
+   case CmpInst::FCMP_ONE: return std::make_pair(8u, false);
+   }
+ }
+
+ /// \brief Append the operands of addressing mode \p AM to \p MIB.
+ /// The index register is constrained first. If it cannot be constrained in
+ /// place, a new register is created and AM.IndexReg is updated to refer to
+ /// it.
+ const MachineInstrBuilder &
+ X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
+                             X86AddressMode &AM) {
+   // Operand slot the index register will occupy once the address operands
+   // have been appended to the instruction. It needs to be a GR64_NOSP.
+   const unsigned IndexOpIdx = MIB->getNumOperands() + X86::AddrIndexReg;
+   AM.IndexReg =
+       constrainOperandRegClass(MIB->getDesc(), AM.IndexReg, IndexOpIdx);
+   return ::addFullAddress(MIB, AM);
+ }
+
+ /// \brief Decide whether the overflow condition produced by an XALU
+ /// (*.with.overflow) intrinsic can be folded into its user \p I.
+ /// \p Cond is the condition value used by \p I. \p CC is written only when
+ /// the function returns true.
+ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+                                        const Value *Cond) {
+   // The condition must be an extractvalue taken from an intrinsic call.
+   const auto *EV = dyn_cast<ExtractValueInst>(Cond);
+   if (!EV)
+     return false;
+
+   const auto *II = dyn_cast<IntrinsicInst>(EV->getAggregateOperand());
+   if (!II)
+     return false;
+
+   // Only i32/i64 arithmetic results are handled.
+   const Function *Callee = II->getCalledFunction();
+   Type *ArithTy = cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+   MVT ArithVT;
+   if (!isTypeLegal(ArithTy, ArithVT))
+     return false;
+   if (!(ArithVT == MVT::i32 || ArithVT == MVT::i64))
+     return false;
+
+   X86::CondCode FoldedCC;
+   switch (II->getIntrinsicID()) {
+   case Intrinsic::sadd_with_overflow:
+   case Intrinsic::ssub_with_overflow:
+   case Intrinsic::smul_with_overflow:
+   case Intrinsic::umul_with_overflow:
+     FoldedCC = X86::COND_O;
+     break;
+   case Intrinsic::uadd_with_overflow:
+   case Intrinsic::usub_with_overflow:
+     FoldedCC = X86::COND_B;
+     break;
+   default:
+     return false;
+   }
+
+   // The intrinsic and its user have to live in the same basic block.
+   if (II->getParent() != I->getParent())
+     return false;
+
+   // Walk backwards from the instruction just before I up to the intrinsic.
+   // Everything in between must be an extractvalue of the intrinsic's result.
+   const BasicBlock::const_iterator Stop(II);
+   BasicBlock::const_iterator Walker = std::prev(BasicBlock::const_iterator(I));
+   while (Walker != Stop) {
+     const auto *EVI = dyn_cast<ExtractValueInst>(&*Walker);
+     if (!EVI || EVI->getAggregateOperand() != II)
+       return false;
+     --Walker;
+   }
+
+   CC = FoldedCC;
+   return true;
+ }
+
+ /// Check whether fast instruction selection can handle values of type \p Ty.
+ /// \p VT receives the simple value type as soon as \p Ty maps to one, even if
+ /// the function then returns false. i1 is accepted only when \p AllowI1 is
+ /// set.
+ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
+   const EVT ValueTy = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+   // Unhandled type. Halt "fast" selection and bail.
+   if (ValueTy == MVT::Other || !ValueTy.isSimple())
+     return false;
+
+   VT = ValueTy.getSimpleVT();
+
+   // For now, floating point requires SSE (f32) / SSE2 (f64), since x87
+   // requires additional work. There is no f80 support yet either.
+   const bool UnsupportedFP = (VT == MVT::f64 && !X86ScalarSSEf64) ||
+                              (VT == MVT::f32 && !X86ScalarSSEf32) ||
+                              VT == MVT::f80;
+   if (UnsupportedFP)
+     return false;
+
+   if (AllowI1 && VT == MVT::i1)
+     return true;
+
+   // We only handle legal types. For example, on x86-32 the instruction
+   // selector contains all of the 64-bit instructions from x86-64, under the
+   // assumption that i64 won't be used if the target doesn't support it.
+   return TLI.isTypeLegal(VT);
+ }
+
+#include "X86GenCallingConv.inc"
+
+ /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT
+ /// from the address described by AM.
+ ///
+ /// \param VT         Type of the loaded value; getSimpleVT() is called on it.
+ /// \param AM         Addressing mode of the memory operand. addFullAddress
+ ///                   may update its IndexReg.
+ /// \param MMO        Optional memory operand to attach to the load. It also
+ ///                   supplies the non-temporal hint.
+ /// \param ResultReg  Receives the newly created result register on success.
+ /// \param Alignment  Alignment in bytes, used to choose between the aligned,
+ ///                   unaligned and non-temporal vector moves.
+ /// \returns true if a load was emitted, false if VT is not handled.
+ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
+                                   MachineMemOperand *MMO, unsigned &ResultReg,
+                                   unsigned Alignment) {
+   bool HasSSE41 = Subtarget->hasSSE41();
+   bool HasAVX = Subtarget->hasAVX();
+   bool HasAVX2 = Subtarget->hasAVX2();
+   bool HasAVX512 = Subtarget->hasAVX512();
+   bool HasVLX = Subtarget->hasVLX();
+   bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
+   // Get opcode and regclass of the output for the given load instruction.
+   unsigned Opc = 0;
+   const TargetRegisterClass *RC = nullptr;
+   switch (VT.getSimpleVT().SimpleTy) {
+   default: return false;
+   case MVT::i1:
+   case MVT::i8:
+     Opc = X86::MOV8rm;
+     RC = &X86::GR8RegClass;
+     break;
+   case MVT::i16:
+     Opc = X86::MOV16rm;
+     RC = &X86::GR16RegClass;
+     break;
+   case MVT::i32:
+     Opc = X86::MOV32rm;
+     RC = &X86::GR32RegClass;
+     break;
+   case MVT::i64:
+     // Must be in x86-64 mode.
+     Opc = X86::MOV64rm;
+     RC = &X86::GR64RegClass;
+     break;
+   case MVT::f32:
+     if (X86ScalarSSEf32) {
+       Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
+       RC = &X86::FR32RegClass;
+     } else {
+       Opc = X86::LD_Fp32m;
+       RC = &X86::RFP32RegClass;
+     }
+     break;
+   case MVT::f64:
+     if (X86ScalarSSEf64) {
+       Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
+       RC = &X86::FR64RegClass;
+     } else {
+       Opc = X86::LD_Fp64m;
+       RC = &X86::RFP64RegClass;
+     }
+     break;
+   case MVT::f80:
+     // No f80 support yet.
+     return false;
+   case MVT::v4f32:
+     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+     else if (Alignment >= 16)
+       Opc = HasVLX ? X86::VMOVAPSZ128rm :
+             HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
+     else
+       Opc = HasVLX ? X86::VMOVUPSZ128rm :
+             HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
+     RC = &X86::VR128RegClass;
+     break;
+   case MVT::v2f64:
+     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+     else if (Alignment >= 16)
+       Opc = HasVLX ? X86::VMOVAPDZ128rm :
+             HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
+     else
+       Opc = HasVLX ? X86::VMOVUPDZ128rm :
+             HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
+     RC = &X86::VR128RegClass;
+     break;
+   case MVT::v4i32:
+   case MVT::v2i64:
+   case MVT::v8i16:
+   case MVT::v16i8:
+     // MOVNTDQA is an SSE4.1 instruction. Without SSE4.1 fall back to a
+     // regular aligned load, exactly as the FP vector cases above do.
+     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+     else if (Alignment >= 16)
+       Opc = HasVLX ? X86::VMOVDQA64Z128rm :
+             HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
+     else
+       Opc = HasVLX ? X86::VMOVDQU64Z128rm :
+             HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
+     RC = &X86::VR128RegClass;
+     break;
+   case MVT::v8f32:
+     assert(HasAVX);
+     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+     else if (Alignment >= 32)
+       Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
+     else
+       Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
+     RC = &X86::VR256RegClass;
+     break;
+   case MVT::v4f64:
+     assert(HasAVX);
+     // Pick the VLX form when available, consistent with the v8f32 case.
+     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+     else if (Alignment >= 32)
+       Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
+     else
+       Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
+     RC = &X86::VR256RegClass;
+     break;
+   case MVT::v8i32:
+   case MVT::v4i64:
+   case MVT::v16i16:
+   case MVT::v32i8:
+     assert(HasAVX);
+     // Pick the VLX form when available, consistent with the v8f32 case.
+     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+     else if (Alignment >= 32)
+       Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
+     else
+       Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
+     RC = &X86::VR256RegClass;
+     break;
+   case MVT::v16f32:
+     assert(HasAVX512);
+     if (IsNonTemporal && Alignment >= 64)
+       Opc = X86::VMOVNTDQAZrm;
+     else
+       Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
+     RC = &X86::VR512RegClass;
+     break;
+   case MVT::v8f64:
+     assert(HasAVX512);
+     if (IsNonTemporal && Alignment >= 64)
+       Opc = X86::VMOVNTDQAZrm;
+     else
+       Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
+     RC = &X86::VR512RegClass;
+     break;
+   case MVT::v8i64:
+   case MVT::v16i32:
+   case MVT::v32i16:
+   case MVT::v64i8:
+     assert(HasAVX512);
+     // Note: There are a lot more choices based on type with AVX-512, but
+     // there's really no advantage when the load isn't masked.
+     if (IsNonTemporal && Alignment >= 64)
+       Opc = X86::VMOVNTDQAZrm;
+     else
+       Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
+     RC = &X86::VR512RegClass;
+     break;
+   }
+
+   // Build "ResultReg = Opc <AM>" at the current insertion point and attach
+   // the memory operand, if one was supplied.
+   ResultReg = createResultReg(RC);
+   MachineInstrBuilder MIB =
+     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+   addFullAddress(MIB, AM);
+   if (MMO)
+     MIB->addMemOperand(*FuncInfo.MF, MMO);
+   return true;
+ }
+
+ /// X86FastEmitStore - Emit a machine instruction to store the register
+ /// ValReg, holding a value of type VT, to the address described by AM.
+ ///
+ /// \param VT         Type of the stored value; getSimpleVT() is called on it.
+ /// \param ValReg     Register holding the value to store.
+ /// \param ValIsKill  Whether the store is the last use of ValReg.
+ /// \param AM         Addressing mode of the memory operand. addFullAddress
+ ///                   may update its IndexReg.
+ /// \param MMO        Optional memory operand to attach to the store. It also
+ ///                   supplies the non-temporal hint.
+ /// \param Aligned    Whether the aligned (and, if requested, non-temporal)
+ ///                   vector store forms may be used.
+ /// \returns true if a store was emitted, false if VT is not handled.
+ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+                                    X86AddressMode &AM,
+                                    MachineMemOperand *MMO, bool Aligned) {
+   bool HasSSE2 = Subtarget->hasSSE2();
+   bool HasSSE4A = Subtarget->hasSSE4A();
+   bool HasAVX = Subtarget->hasAVX();
+   bool HasAVX512 = Subtarget->hasAVX512();
+   bool HasVLX = Subtarget->hasVLX();
+   bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
+   // Get the opcode for the given store instruction.
+   unsigned Opc = 0;
+   switch (VT.getSimpleVT().SimpleTy) {
+   case MVT::f80: // No f80 support yet.
+   default: return false;
+   case MVT::i1: {
+     // Mask out all but lowest bit.
+     unsigned AndResult = createResultReg(&X86::GR8RegClass);
+     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+             TII.get(X86::AND8ri), AndResult)
+       .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
+     ValReg = AndResult;
+     LLVM_FALLTHROUGH; // handle i1 as i8.
+   }
+   case MVT::i8:  Opc = X86::MOV8mr;  break;
+   case MVT::i16: Opc = X86::MOV16mr; break;
+   case MVT::i32:
+     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
+     break;
+   case MVT::i64:
+     // Must be in x86-64 mode.
+     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
+     break;
+   case MVT::f32:
+     if (X86ScalarSSEf32) {
+       if (IsNonTemporal && HasSSE4A)
+         Opc = X86::MOVNTSS;
+       else
+         Opc = HasAVX512 ? X86::VMOVSSZmr :
+               HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+     } else
+       Opc = X86::ST_Fp32m;
+     break;
+   case MVT::f64:
+     // f64 lives in an SSE register only when SSE2 is available, so the f64
+     // flag (not the f32/SSE1 one) decides between the SSE and x87 stores.
+     if (X86ScalarSSEf64) {
+       if (IsNonTemporal && HasSSE4A)
+         Opc = X86::MOVNTSD;
+       else
+         Opc = HasAVX512 ? X86::VMOVSDZmr :
+               HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
+     } else
+       Opc = X86::ST_Fp64m;
+     break;
+   case MVT::v4f32:
+     if (Aligned) {
+       if (IsNonTemporal)
+         Opc = HasVLX ? X86::VMOVNTPSZ128mr :
+               HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
+       else
+         Opc = HasVLX ? X86::VMOVAPSZ128mr :
+               HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
+     } else
+       Opc = HasVLX ? X86::VMOVUPSZ128mr :
+             HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
+     break;
+   case MVT::v2f64:
+     if (Aligned) {
+       if (IsNonTemporal)
+         Opc = HasVLX ? X86::VMOVNTPDZ128mr :
+               HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
+       else
+         Opc = HasVLX ? X86::VMOVAPDZ128mr :
+               HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
+     } else
+       Opc = HasVLX ? X86::VMOVUPDZ128mr :
+             HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
+     break;
+   case MVT::v4i32:
+   case MVT::v2i64:
+   case MVT::v8i16:
+   case MVT::v16i8:
+     if (Aligned) {
+       if (IsNonTemporal)
+         Opc = HasVLX ? X86::VMOVNTDQZ128mr :
+               HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
+       else
+         Opc = HasVLX ? X86::VMOVDQA64Z128mr :
+               HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
+     } else
+       Opc = HasVLX ? X86::VMOVDQU64Z128mr :
+             HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
+     break;
+   case MVT::v8f32:
+     assert(HasAVX);
+     if (Aligned) {
+       if (IsNonTemporal)
+         Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
+       else
+         Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
+     } else
+       Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
+     break;
+   case MVT::v4f64:
+     assert(HasAVX);
+     if (Aligned) {
+       if (IsNonTemporal)
+         Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
+       else
+         Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
+     } else
+       Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
+     break;
+   case MVT::v8i32:
+   case MVT::v4i64:
+   case MVT::v16i16:
+   case MVT::v32i8:
+     assert(HasAVX);
+     if (Aligned) {
+       if (IsNonTemporal)
+         Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
+       else
+         Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
+     } else
+       Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
+     break;
+   case MVT::v16f32:
+     assert(HasAVX512);
+     if (Aligned)
+       Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
+     else
+       Opc = X86::VMOVUPSZmr;
+     break;
+   case MVT::v8f64:
+     assert(HasAVX512);
+     if (Aligned) {
+       Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
+     } else
+       Opc = X86::VMOVUPDZmr;
+     break;
+   case MVT::v8i64:
+   case MVT::v16i32:
+   case MVT::v32i16:
+   case MVT::v64i8:
+     assert(HasAVX512);
+     // Note: There are a lot more choices based on type with AVX-512, but
+     // there's really no advantage when the store isn't masked.
+     if (Aligned)
+       Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
+     else
+       Opc = X86::VMOVDQU64Zmr;
+     break;
+   }
+
+   const MCInstrDesc &Desc = TII.get(Opc);
+   // Some of the instructions in the previous switch use FR128 instead
+   // of FR32 for ValReg. Make sure the register we feed the instruction
+   // matches its register class constraints.
+   // Note: This is fine to do a copy from FR32 to FR128, this is the
+   // same registers behind the scene and actually why it did not trigger
+   // any bugs before.
+   ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
+   // The address operands come first, the stored register last.
+   MachineInstrBuilder MIB =
+       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
+   addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
+   if (MMO)
+     MIB->addMemOperand(*FuncInfo.MF, MMO);
+
+   return true;
+ }
+
+ /// Store the IR value \p Val of type \p VT to the address \p AM. Integer
+ /// constants that fit an immediate store are folded into the instruction;
+ /// any other value is put in a register and handed to the register-based
+ /// overload.
+ /// \returns true if a store was emitted.
+ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
+                                    X86AddressMode &AM,
+                                    MachineMemOperand *MMO, bool Aligned) {
+   // Handle 'null' like i32/i64 0.
+   if (isa<ConstantPointerNull>(Val))
+     Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
+
+   // If this is a store of a simple constant, fold the constant into the store.
+   if (const auto *ConstVal = dyn_cast<ConstantInt>(Val)) {
+     unsigned ImmOpc = 0;
+     bool SignExtendImm = true;
+     switch (VT.getSimpleVT().SimpleTy) {
+     case MVT::i1:
+       SignExtendImm = false;
+       LLVM_FALLTHROUGH; // Handle as i8.
+     case MVT::i8:
+       ImmOpc = X86::MOV8mi;
+       break;
+     case MVT::i16:
+       ImmOpc = X86::MOV16mi;
+       break;
+     case MVT::i32:
+       ImmOpc = X86::MOV32mi;
+       break;
+     case MVT::i64:
+       // Must be a 32-bit sign extended value.
+       if (isInt<32>(ConstVal->getSExtValue()))
+         ImmOpc = X86::MOV64mi32;
+       break;
+     default:
+       break;
+     }
+
+     if (ImmOpc != 0) {
+       const uint64_t Imm = SignExtendImm
+                                ? static_cast<uint64_t>(ConstVal->getSExtValue())
+                                : ConstVal->getZExtValue();
+       MachineInstrBuilder Store =
+           BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ImmOpc));
+       addFullAddress(Store, AM).addImm(Imm);
+       if (MMO)
+         Store->addMemOperand(*FuncInfo.MF, MMO);
+       return true;
+     }
+   }
+
+   // Not foldable as an immediate: store it from a register instead.
+   const unsigned SrcReg = getRegForValue(Val);
+   if (!SrcReg)
+     return false;
+
+   const bool SrcIsKill = hasTrivialKill(Val);
+   return X86FastEmitStore(VT, SrcReg, SrcIsKill, AM, MMO, Aligned);
+ }
+
+ /// X86FastEmitExtend - Extend the value in register Src from type SrcVT to
+ /// type DstVT using the extension opcode Opc (e.g. ISD::SIGN_EXTEND).
+ /// ResultReg is written only when the extension could be emitted.
+ /// \returns true on success.
+ bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+                                     unsigned Src, EVT SrcVT,
+                                     unsigned &ResultReg) {
+   if (unsigned ExtReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(),
+                                    Opc, Src, /*TODO: Kill=*/false)) {
+     ResultReg = ExtReg;
+     return true;
+   }
+   return false;
+ }
+
+ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
+ // Handle constant address: fold a GlobalValue into AM, either directly or through a load from its stub. Anything else is put in a register at the end.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // Can't handle TLS yet.
+ if (GV->isThreadLocal())
+ return false;
+
+ // RIP-relative addresses can't have additional register operands, so if
+ // we've already folded stuff into the addressing mode, just force the
+ // global value into its own register, which we can use as the basereg.
+ if (!Subtarget->isPICStyleRIPRel() ||
+ (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+ // Okay, we've committed to selecting this global. Set up the address.
+ AM.GV = GV;
+
+ // Allow the subtarget to classify the global.
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+
+ // If this reference is relative to the pic base, set it now.
+ if (isGlobalRelativeToPICBase(GVFlags)) {
+ // FIXME: How do we know Base.Reg is free??
+ AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ }
+
+ // Unless the ABI requires an extra load, return a direct reference to
+ // the global.
+ if (!isGlobalStubReference(GVFlags)) {
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ }
+ AM.GVOpFlags = GVFlags;
+ return true;
+ }
+
+ // Ok, we need to do a load from a stub. If we've already loaded from
+ // this stub, reuse the loaded pointer, otherwise emit the load now.
+ DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
+ unsigned LoadReg;
+ if (I != LocalValueMap.end() && I->second != 0) {
+ LoadReg = I->second;
+ } else {
+ // Issue load from stub.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ X86AddressMode StubAM; // Address of the stub itself: same base register as AM, plus the global and its flags.
+ StubAM.Base.Reg = AM.Base.Reg;
+ StubAM.GV = GV;
+ StubAM.GVOpFlags = GVFlags;
+
+ // Prepare for inserting code in the local-value area.
+ SavePoint SaveInsertPt = enterLocalValueArea();
+
+ if (TLI.getPointerTy(DL) == MVT::i64) { // Pick a pointer-sized load.
+ Opc = X86::MOV64rm;
+ RC = &X86::GR64RegClass;
+
+ if (Subtarget->isPICStyleRIPRel())
+ StubAM.Base.Reg = X86::RIP;
+ } else {
+ Opc = X86::MOV32rm;
+ RC = &X86::GR32RegClass;
+ }
+
+ LoadReg = createResultReg(RC);
+ MachineInstrBuilder LoadMI =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
+ addFullAddress(LoadMI, StubAM);
+
+ // Ok, back to normal mode.
+ leaveLocalValueArea(SaveInsertPt);
+
+ // Prevent loading GV stub multiple times in same MBB.
+ LocalValueMap[V] = LoadReg;
+ }
+
+ // Now construct the final address. Note that the Disp, Scale,
+ // and Index values may already be set here.
+ AM.Base.Reg = LoadReg;
+ AM.GV = nullptr; // The global is now reached through LoadReg rather than referenced directly.
+ return true;
+ }
+ }
+
+ // If all else fails, try to materialize the value in a register: use it as the base if that slot is free, otherwise as the index.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false; // No free base or index slot (or AM already holds a RIP-relative global): V cannot be folded.
+ }
+
+ /// X86SelectAddress - Attempt to fill in an address from the given value, folding bitcasts, no-op int/ptr casts, static allocas, constant adds and GEPs into AM.
+ /// Returns true if AM now describes the address of V. Whatever is not folded here is handed to handleConstantAddresses.
+ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+ SmallVector<const Value *, 32> GEPs; // GEPs whose indices were folded on this walk; retried individually if their base cannot be matched.
+ redo_gep:
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1; // No case below matches UserOp1, so by default V is not looked through.
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ // Don't walk into other basic blocks; it's possible we haven't
+ // visited them yet, so the instructions may not yet be assigned
+ // virtual registers.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) || // NOTE(review): V is cast without a type check; the pointer is only used as a map key here.
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectAddress(U->getOperand(0), AM);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::Alloca: {
+ // Do static allocas.
+ const AllocaInst *A = cast<AllocaInst>(V);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(A);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = SI->second;
+ return true;
+ }
+ break;
+ }
+
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); // The current displacement is sign-extended before the add.
+ // They have to fit in the 32-bit signed displacement field though.
+ if (isInt<32>(Disp)) {
+ AM.Disp = (uint32_t)Disp;
+ return X86SelectAddress(U->getOperand(0), AM);
+ }
+ }
+ break;
+ }
+
+ case Instruction::GetElementPtr: {
+ X86AddressMode SavedAM = AM; // Restored below if the GEP's base cannot be matched.
+
+ // Pattern-match simple GEPs.
+ uint64_t Disp = (int32_t)AM.Disp;
+ unsigned IndexReg = AM.IndexReg;
+ unsigned Scale = AM.Scale;
+ gep_type_iterator GTI = gep_type_begin(U);
+ // Iterate through the indices, folding what we can. Constants can be
+ // folded, and one dynamic index can be handled, if the scale is supported.
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+ i != e; ++i, ++GTI) {
+ const Value *Op = *i;
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); // A struct field index contributes that field's byte offset.
+ continue;
+ }
+
+ // An array/variable index is always of the form i*S where S is the
+ // constant scale size. See if we can push the scale into immediates.
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ Disp += CI->getSExtValue() * S;
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ Disp += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ if (IndexReg == 0 &&
+ (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+ (S == 1 || S == 2 || S == 4 || S == 8)) {
+ // Scaled-index addressing.
+ Scale = S;
+ IndexReg = getRegForGEPIndex(Op).first;
+ if (IndexReg == 0)
+ return false;
+ break;
+ }
+ // Unsupported.
+ goto unsupported_gep;
+ }
+ }
+
+ // Check for displacement overflow.
+ if (!isInt<32>(Disp))
+ break;
+
+ AM.IndexReg = IndexReg; // All indices were folded: commit the accumulated state to AM.
+ AM.Scale = Scale;
+ AM.Disp = (uint32_t)Disp;
+ GEPs.push_back(V);
+
+ if (const GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
+ // Ok, the GEP indices were covered by constant-offset and scaled-index
+ // addressing. Update the address state and move on to examining the base.
+ V = GEP;
+ goto redo_gep;
+ } else if (X86SelectAddress(U->getOperand(0), AM)) {
+ return true;
+ }
+
+ // If we couldn't merge the gep value into this addr mode, revert back to
+ // our address and just match the value instead of completely failing.
+ AM = SavedAM;
+
+ for (const Value *I : reverse(GEPs)) // Most recently recorded GEP first.
+ if (handleConstantAddresses(I, AM))
+ return true;
+
+ return false;
+ unsupported_gep:
+ // Ok, the GEP indices weren't all covered.
+ break;
+ }
+ }
+
+ return handleConstantAddresses(V, AM); // Nothing was folded for V itself: treat it as a global or a plain register value.
+ }
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+/// Returns true if AM was filled in for V, false if V cannot be handled here.
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1; // Stays UserOp1 (default case below) unless V is an Instruction or ConstantExpr.
+ const Instruction *I = dyn_cast<Instruction>(V);
+ // Record if the value is defined in the same basic block.
+ //
+ // This information is crucial to know whether or not folding an
+ // operand is valid.
+ // Indeed, FastISel generates or reuses a virtual register for all
+ // operands of all instructions it selects. Obviously, the definition and
+ // its uses must use the same virtual register otherwise the produced
+ // code is incorrect.
+ // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+ // registers for values that are alive across basic blocks. This ensures
+ // that the values are consistently set across basic blocks, even
+ // if different instruction selection mechanisms are used (e.g., a mix of
+ // SDISel and FastISel).
+ // For values local to a basic block, the instruction selection process
+ // generates these virtual registers with whatever method is appropriate
+ // for its needs. In particular, FastISel and SDISel do not share the way
+ // local virtual registers are set.
+ // Therefore, it is impossible (or at least unsafe) to share values
+ // between basic blocks unless they use the same instruction selection
+ // method, which is not guaranteed for X86.
+ // Moreover, things like hasOneUse could not be used accurately, if we
+ // allow to reference values across basic blocks whereas they are not
+ // alive across basic blocks initially.
+ bool InMBB = true; // Only overwritten for Instructions; constant expressions keep 'true'.
+ if (I) {
+ Opcode = I->getOpcode();
+ U = I;
+ InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts if its operand is in the same BB.
+ if (InMBB)
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs if its operand is in the same BB.
+ if (InMBB &&
+ TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints if its operand is in the same BB.
+ if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+ }
+
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // RIP-relative addresses can't have additional register operands.
+ if (Subtarget->isPICStyleRIPRel() &&
+ (AM.Base.Reg != 0 || AM.IndexReg != 0))
+ return false;
+
+ // Can't handle DLL Import.
+ if (GV->hasDLLImportStorageClass())
+ return false;
+
+ // Can't handle TLS.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // Okay, we've committed to selecting this global. Set up the basic address.
+ AM.GV = GV;
+
+ // No ABI requires an extra load for anything other than DLLImport, which
+ // we rejected above. Return a direct reference to the global.
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ } else {
+ AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
+ }
+
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { // Skipped when AM already holds a global under RIP-relative PIC (no extra registers allowed).
+ if (AM.Base.Reg == 0) { // Prefer the base register slot when it is still free.
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) { // Otherwise use the index register slot.
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false; // Both register slots are taken, or registers are not permitted.
+}
+
+
+/// X86SelectStore - Select and emit code to implement store instructions. Returns false (nothing emitted here) for atomic stores, swifterror pointers, illegal value types and unselectable addresses.
+bool X86FastISel::X86SelectStore(const Instruction *I) {
+ // Atomic stores need special handling.
+ const StoreInst *S = cast<StoreInst>(I);
+
+ if (S->isAtomic())
+ return false;
+
+ const Value *PtrV = I->getOperand(1); // Operand 1 of the store; only used for the swifterror checks below.
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
+ const Value *Val = S->getValueOperand();
+ const Value *Ptr = S->getPointerOperand();
+
+ MVT VT;
+ if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ unsigned Alignment = S->getAlignment();
+ unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = ABIAlignment;
+ bool Aligned = Alignment >= ABIAlignment; // True when the store is at least ABI-aligned for the stored type.
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(Ptr, AM))
+ return false;
+
+ return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
+}
+
+/// X86SelectRet - Select and emit code to implement ret instructions. Returns false for any return shape not handled below (see the individual bail-outs).
+bool X86FastISel::X86SelectRet(const Instruction *I) {
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+ const Function &F = *I->getParent()->getParent();
+ const X86MachineFunctionInfo *X86MFInfo =
+ FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
+
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ if (TLI.supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return false;
+
+ if (TLI.supportSplitCSR(FuncInfo.MF))
+ return false;
+
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC != CallingConv::C && // Only this fixed list of calling conventions is handled.
+ CC != CallingConv::Fast &&
+ CC != CallingConv::X86_FastCall &&
+ CC != CallingConv::X86_StdCall &&
+ CC != CallingConv::X86_ThisCall &&
+ CC != CallingConv::X86_64_SysV &&
+ CC != CallingConv::X86_64_Win64)
+ return false;
+
+ // Don't handle popping bytes if they don't fit the ret's immediate.
+ if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
+ return false;
+
+ // fastcc with -tailcallopt is intended to provide a guaranteed
+ // tail call optimization. Fastisel doesn't know how to do that.
+ if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ return false;
+
+ // Let SDISel handle vararg functions.
+ if (F.isVarArg())
+ return false;
+
+ // Build a list of return value registers.
+ SmallVector<unsigned, 4> RetRegs;
+
+ if (Ret->getNumOperands() > 0) {
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ValLocs;
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+ const Value *RV = Ret->getOperand(0);
+ unsigned Reg = getRegForValue(RV);
+ if (Reg == 0)
+ return false;
+
+ // Only handle a single return value for now.
+ if (ValLocs.size() != 1)
+ return false;
+
+ CCValAssign &VA = ValLocs[0];
+
+ // Don't bother handling odd stuff for now.
+ if (VA.getLocInfo() != CCValAssign::Full)
+ return false;
+ // Only handle register returns for now.
+ if (!VA.isRegLoc())
+ return false;
+
+ // The calling-convention tables for x87 returns don't tell
+ // the whole story.
+ if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
+ return false;
+
+ unsigned SrcReg = Reg + VA.getValNo(); // NOTE(review): presumably getValNo() is 0 here since exactly one ValLoc exists — confirm.
+ EVT SrcVT = TLI.getValueType(DL, RV->getType());
+ EVT DstVT = VA.getValVT();
+ // Special handling for extended integers.
+ if (SrcVT != DstVT) {
+ if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
+ return false;
+
+ if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) // An explicit zext/sext return attribute is required.
+ return false;
+
+ assert(DstVT == MVT::i32 && "X86 should always ext to i32");
+
+ if (SrcVT == MVT::i1) {
+ if (Outs[0].Flags.isSExt()) // Sign-extending an i1 return value is not handled.
+ return false;
+ SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+ SrcVT = MVT::i8; // The value is now an i8; the generic extension below finishes the job.
+ }
+ unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
+ ISD::SIGN_EXTEND;
+ SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
+ SrcReg, /*TODO: Kill=*/false);
+ }
+
+ // Make the copy.
+ unsigned DstReg = VA.getLocReg();
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ // Avoid a cross-class copy. This is very unlikely.
+ if (!SrcRC->contains(DstReg))
+ return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
+
+ // Add register to return instruction.
+ RetRegs.push_back(VA.getLocReg());
+ }
+
+ // Swift calling convention does not require we copy the sret argument
+ // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
+
+ // All x86 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax.
+ if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
+ unsigned Reg = X86MFInfo->getSRetReturnReg();
+ assert(Reg &&
+ "SRetReturnReg should have been set in LowerFormalArguments()!");
+ unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
+ RetRegs.push_back(RetReg);
+ }
+
+ // Now emit the RET.
+ MachineInstrBuilder MIB;
+ if (X86MFInfo->getBytesToPopOnReturn()) { // Non-zero pop count: use the RET form that takes an immediate.
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Subtarget->is64Bit() ? X86::RETIQ : X86::RETIL))
+ .addImm(X86MFInfo->getBytesToPopOnReturn());
+ } else {
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
+ }
+ for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) // Attach each returned register to the RET as an implicit operand.
+ MIB.addReg(RetRegs[i], RegState::Implicit);
+ return true;
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+/// Returns false for atomic loads, swifterror pointers, illegal types and unselectable addresses.
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+ const LoadInst *LI = cast<LoadInst>(I);
+
+ // Atomic loads need special handling.
+ if (LI->isAtomic())
+ return false;
+
+ const Value *SV = I->getOperand(0); // Operand 0 of the load; only used for the swifterror checks below.
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
+ MVT VT;
+ if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ const Value *Ptr = LI->getPointerOperand();
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(Ptr, AM))
+ return false;
+
+ unsigned Alignment = LI->getAlignment();
+ unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType());
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = ABIAlignment;
+
+ unsigned ResultReg = 0; // Passed to X86FastEmitLoad below and recorded as the value of I on success.
+ if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
+ Alignment))
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { // Returns the register-register compare opcode for VT, or 0 if there is none.
+ bool HasAVX = Subtarget->hasAVX();
+ bool X86ScalarSSEf32 = Subtarget->hasSSE1(); // f32 compares are only selected when SSE1 is available.
+ bool X86ScalarSSEf64 = Subtarget->hasSSE2(); // f64 compares are only selected when SSE2 is available.
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::i8: return X86::CMP8rr;
+ case MVT::i16: return X86::CMP16rr;
+ case MVT::i32: return X86::CMP32rr;
+ case MVT::i64: return X86::CMP64rr;
+ case MVT::f32: // With AVX the V-prefixed opcode is chosen instead.
+ return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+ case MVT::f64:
+ return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
+ }
+}
+
+/// Given a comparison of type VT whose right-hand side is the constant RHSC,
+/// return an opcode that works for the compare (e.g. CMP32ri), otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
+ int64_t Val = RHSC->getSExtValue();
+ switch (VT.getSimpleVT().SimpleTy) {
+ // Otherwise, we can't fold the immediate into this comparison.
+ default:
+ return 0;
+ case MVT::i8: // No immediate-range check is performed for i8.
+ return X86::CMP8ri;
+ case MVT::i16:
+ if (isInt<8>(Val)) // Use the 8-bit immediate form when the value fits in a signed byte.
+ return X86::CMP16ri8;
+ return X86::CMP16ri;
+ case MVT::i32:
+ if (isInt<8>(Val))
+ return X86::CMP32ri8;
+ return X86::CMP32ri;
+ case MVT::i64:
+ if (isInt<8>(Val))
+ return X86::CMP64ri8;
+ // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+ // field.
+ if (isInt<32>(Val))
+ return X86::CMP64ri32;
+ return 0;
+ }
+}
+
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT, // Emits a compare of Op0 against Op1 (immediate form when possible); returns false if nothing usable could be emitted.
+ const DebugLoc &CurDbgLoc) {
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (Op0Reg == 0) return false;
+
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Op1))
+ Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
+
+ // We have two options: compare with register or immediate. If the RHS of
+ // the compare is an immediate that we can fold into this compare, use
+ // CMPri, otherwise use CMPrr.
+ if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
+ .addReg(Op0Reg)
+ .addImm(Op1C->getSExtValue());
+ return true;
+ }
+ }
+
+ unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
+ if (CompareOpc == 0) return false; // No register-register compare exists for this type.
+
+ unsigned Op1Reg = getRegForValue(Op1);
+ if (Op1Reg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
+ .addReg(Op0Reg)
+ .addReg(Op1Reg);
+
+ return true;
+}
+
+bool X86FastISel::X86SelectCmp(const Instruction *I) { // Selects a compare instruction into an 8-bit result register; returns false when it cannot be handled here.
+ const CmpInst *CI = cast<CmpInst>(I);
+
+ MVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ return false;
+
+ if (I->getType()->isIntegerTy(1) && Subtarget->hasAVX512()) // i1 results are not handled here on AVX-512 subtargets.
+ return false;
+
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ unsigned ResultReg = 0;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: { // Constant false: zero a 32-bit register and take its low 8 bits.
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
+ ResultReg);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+ X86::sub_8bit);
+ if (!ResultReg)
+ return false;
+ break;
+ }
+ case CmpInst::FCMP_TRUE: { // Constant true: materialize the immediate 1.
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+ ResultReg).addImm(1);
+ break;
+ }
+ }
+
+ if (ResultReg) { // The predicate folded to a constant; no compare is needed.
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ const Value *LHS = CI->getOperand(0);
+ const Value *RHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+ // We don't have to materialize a zero constant for this case and can just use
+ // %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *RHSC = dyn_cast<ConstantFP>(RHS);
+ if (RHSC && RHSC->isNullValue())
+ RHS = LHS;
+ }
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+ static const uint16_t SETFOpcTable[2][3] = { // Each row: { first SETcc, second SETcc, op combining the two }.
+ { X86::SETEr, X86::SETNPr, X86::AND8rr }, // Row used for FCMP_OEQ.
+ { X86::SETNEr, X86::SETPr, X86::OR8rr } // Row used for FCMP_UNE.
+ };
+ const uint16_t *SETFOpc = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
+ case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
+ }
+
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ if (SETFOpc) {
+ if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
+ return false;
+
+ unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+ unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+ FlagReg1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+ FlagReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
+ ResultReg).addReg(FlagReg1).addReg(FlagReg2);
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ X86::CondCode CC;
+ bool SwapArgs; // Set by getX86ConditionCode when the operands must be swapped for CC.
+ std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+ unsigned Opc = X86::getSETFromCond(CC);
+
+ if (SwapArgs)
+ std::swap(LHS, RHS);
+
+ // Emit a compare of LHS/RHS.
+ if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectZExt(const Instruction *I) { // Selects a zero-extension of operand 0 to the (legal) result type of I.
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (!TLI.isTypeLegal(DstVT))
+ return false;
+
+ unsigned ResultReg = getRegForValue(I->getOperand(0));
+ if (ResultReg == 0)
+ return false;
+
+ // Handle zero-extension from i1 to i8, which is common.
+ MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+ if (SrcVT == MVT::i1) {
+ // Set the high bits to zero.
+ ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+ SrcVT = MVT::i8; // From here on the value is treated as an i8.
+
+ if (ResultReg == 0)
+ return false;
+ }
+
+ if (DstVT == MVT::i64) {
+ // Handle extension to 64-bits via sub-register shenanigans.
+ unsigned MovInst;
+
+ switch (SrcVT.SimpleTy) {
+ case MVT::i8: MovInst = X86::MOVZX32rr8; break;
+ case MVT::i16: MovInst = X86::MOVZX32rr16; break;
+ case MVT::i32: MovInst = X86::MOV32rr; break;
+ default: llvm_unreachable("Unexpected zext to i64 source type");
+ }
+
+ unsigned Result32 = createResultReg(&X86::GR32RegClass); // First produce a 32-bit value...
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
+ .addReg(ResultReg);
+
+ ResultReg = createResultReg(&X86::GR64RegClass); // ...then place it in the sub_32bit part of a 64-bit register via SUBREG_TO_REG.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
+ ResultReg)
+ .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
+ } else if (DstVT != MVT::i8) {
+ ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, // NOTE(review): source type is hard-coded to i8 rather than SrcVT — confirm only i1/i8 sources reach this path.
+ ResultReg, /*Kill=*/true);
+ if (ResultReg == 0)
+ return false;
+ }
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectBranch(const Instruction *I) {
+ // Unconditional branches are selected by tablegen-generated code.
+ // Handle a conditional branch.
+ const BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ // Fold the common case of a conditional branch with a comparison
+ // in the same block (values defined on other blocks may not have
+ // initialized registers).
+ X86::CondCode CC;
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+ EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
+
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true; // Always-false condition: branch straight to the false successor.
+ case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true; // Always-true condition: branch straight to the true successor.
+ }
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
+ // 0.0.
+ // We don't have to materialize a zero constant for this case and can just
+ // use %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+ if (CmpRHSC && CmpRHSC->isNullValue())
+ CmpRHS = CmpLHS;
+ }
+
+ // Try to take advantage of fallthrough opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
+ // code check. Instead two branch instructions are required to check all
+ // the flags. First we change the predicate to a supported condition code,
+ // which will be the first branch. Later on we will emit the second
+ // branch.
+ bool NeedExtraBranch = false;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ:
+ std::swap(TrueMBB, FalseMBB); // OEQ is handled as UNE with the two targets exchanged.
+ LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_UNE:
+ NeedExtraBranch = true;
+ Predicate = CmpInst::FCMP_ONE;
+ break;
+ }
+
+ bool SwapArgs;
+ unsigned BranchOpc;
+ std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ BranchOpc = X86::GetCondBranchFromCond(CC);
+ if (SwapArgs)
+ std::swap(CmpLHS, CmpRHS);
+
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+ .addMBB(TrueMBB);
+
+ // X86 requires a second branch to handle UNE (and OEQ, which is mapped
+ // to UNE above).
+ if (NeedExtraBranch) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
+ .addMBB(TrueMBB);
+ }
+
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+ }
+ } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+ // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
+ // typically happen for _Bool and C++ bools.
+ MVT SourceVT;
+ if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+ isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
+ unsigned TestOpc = 0; // Stays 0 for source types without a TEST opcode below; control then falls through to the generic path.
+ switch (SourceVT.SimpleTy) {
+ default: break;
+ case MVT::i8: TestOpc = X86::TEST8ri; break;
+ case MVT::i16: TestOpc = X86::TEST16ri; break;
+ case MVT::i32: TestOpc = X86::TEST32ri; break;
+ case MVT::i64: TestOpc = X86::TEST64ri32; break;
+ }
+ if (TestOpc) {
+ unsigned OpReg = getRegForValue(TI->getOperand(0));
+ if (OpReg == 0) return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
+ .addReg(OpReg).addImm(1); // Test only bit 0 of the untruncated value.
+
+ unsigned JmpOpc = X86::JNE_1;
+ if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { // Invert the jump so the layout successor becomes the fallthrough.
+ std::swap(TrueMBB, FalseMBB);
+ JmpOpc = X86::JE_1;
+ }
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
+ .addMBB(TrueMBB);
+
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+ }
+ }
+ } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
+ // Fake request the condition, otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned TmpReg = getRegForValue(BI->getCondition());
+ if (TmpReg == 0)
+ return false;
+
+ unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+ .addMBB(TrueMBB);
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+ }
+
+ // Otherwise do a clumsy setcc and re-test it.
+ // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
+ // in an explicit cast, so make sure to handle that correctly.
+ unsigned OpReg = getRegForValue(BI->getCondition());
+ if (OpReg == 0) return false;
+
+ // In case OpReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
+ unsigned KOpReg = OpReg;
+ OpReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), OpReg)
+ .addReg(KOpReg);
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(OpReg)
+ .addImm(1); // Only bit 0 is tested, since the upper bits of the i8 are unspecified (see note above).
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
+ .addMBB(TrueMBB);
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+}
+
+bool X86FastISel::X86SelectShift(const Instruction *I) { // Selects lshr/ashr/shl on i8/i16/i32/i64 using the shift-by-CL instruction forms.
+ unsigned CReg = 0, OpReg = 0; // CReg: count register (CL or a super-register of it). NOTE: despite its name, OpReg holds the shift opcode.
+ const TargetRegisterClass *RC = nullptr;
+ if (I->getType()->isIntegerTy(8)) {
+ CReg = X86::CL;
+ RC = &X86::GR8RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR8rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR8rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL8rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(16)) {
+ CReg = X86::CX;
+ RC = &X86::GR16RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR16rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR16rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL16rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(32)) {
+ CReg = X86::ECX;
+ RC = &X86::GR32RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR32rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR32rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL32rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(64)) {
+ CReg = X86::RCX;
+ RC = &X86::GR64RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR64rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR64rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL64rCL; break;
+ default: return false;
+ }
+ } else {
+ return false;
+ }
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT)) // VT itself is not used afterwards; this call only gates on type legality.
+ return false;
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), // Copy the shift amount into the count register.
+ CReg).addReg(Op1Reg);
+
+ // The shift instruction uses X86::CL. If we defined a super-register
+ // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
+ if (CReg != X86::CL)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::KILL), X86::CL)
+ .addReg(CReg, RegState::Kill);
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
+ .addReg(Op0Reg);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectDivRem(const Instruction *I) { // Selects sdiv/srem/udiv/urem on i8..i64 using a per-type, per-operation lookup table.
+ const static unsigned NumTypes = 4; // i8, i16, i32, i64
+ const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
+ const static bool S = true; // IsSigned
+ const static bool U = false; // !IsSigned
+ const static unsigned Copy = TargetOpcode::COPY;
+ // For the X86 DIV/IDIV instruction, in most cases the dividend
+ // (numerator) must be in a specific register pair highreg:lowreg,
+ // producing the quotient in lowreg and the remainder in highreg.
+ // For most data types, to set up the instruction, the dividend is
+ // copied into lowreg, and lowreg is sign-extended or zero-extended
+ // into highreg. The exception is i8, where the dividend is defined
+ // as a single register rather than a register pair, and we
+ // therefore directly sign-extend or zero-extend the dividend into
+ // lowreg, instead of copying, and ignore the highreg.
+ const static struct DivRemEntry {
+ // The following portion depends only on the data type.
+ const TargetRegisterClass *RC;
+ unsigned LowInReg; // low part of the register pair
+ unsigned HighInReg; // high part of the register pair
+ // The following portion depends on both the data type and the operation.
+ struct DivRemResult {
+ unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
+ unsigned OpSignExtend; // Opcode for sign-extending lowreg into
+ // highreg, or copying a zero into highreg.
+ unsigned OpCopy; // Opcode for copying dividend into lowreg, or
+ // zero/sign-extending into lowreg for i8.
+ unsigned DivRemResultReg; // Register containing the desired result.
+ bool IsOpSigned; // Whether to use signed or unsigned form.
+ } ResultTable[NumOps];
+ } OpTable[NumTypes] = {
+ { &X86::GR8RegClass, X86::AX, 0, {
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem
+ }
+ }, // i8
+ { &X86::GR16RegClass, X86::AX, X86::DX, {
+ { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
+ { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem
+ }
+ }, // i16
+ { &X86::GR32RegClass, X86::EAX, X86::EDX, {
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem
+ }
+ }, // i32
+ { &X86::GR64RegClass, X86::RAX, X86::RDX, {
+ { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
+ { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
+ }
+ }, // i64
+ };
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned TypeIndex, OpIndex; // Row and column into OpTable / ResultTable.
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::i8: TypeIndex = 0; break;
+ case MVT::i16: TypeIndex = 1; break;
+ case MVT::i32: TypeIndex = 2; break;
+ case MVT::i64: TypeIndex = 3;
+ if (!Subtarget->is64Bit()) // i64 div/rem is only selected on 64-bit subtargets.
+ return false;
+ break;
+ }
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected div/rem opcode");
+ case Instruction::SDiv: OpIndex = 0; break;
+ case Instruction::SRem: OpIndex = 1; break;
+ case Instruction::UDiv: OpIndex = 2; break;
+ case Instruction::URem: OpIndex = 3; break;
+ }
+
+ const DivRemEntry &TypeEntry = OpTable[TypeIndex];
+ const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0)
+ return false;
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0)
+ return false;
+
+ // Move op0 into low-order input register.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
+ // Zero-extend or sign-extend into high-order input register.
+ if (OpEntry.OpSignExtend) { // Zero for i8, where no high register is set up.
+ if (OpEntry.IsOpSigned)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(OpEntry.OpSignExtend));
+ else { // Unsigned: the table value only acts as a non-zero flag; MOV32r0 is emitted explicitly here.
+ unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::MOV32r0), Zero32);
+
+ // Copy the zero into the appropriate sub/super/identical physical
+ // register. Unfortunately the operations needed are not uniform enough
+ // to fit neatly into the table above.
+ if (VT == MVT::i16) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32, 0, X86::sub_16bit);
+ } else if (VT == MVT::i32) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32);
+ } else if (VT == MVT::i64) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+ .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
+ }
+ }
+ }
+ // Generate the DIV/IDIV instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
+ // For i8 remainder, we can't reference AH directly, as we'll end
+ // up with bogus copies like %R9B = COPY %AH. Reference AX
+ // instead to prevent AH references in a REX instruction.
+ //
+ // The current assumption of the fast register allocator is that isel
+ // won't generate explicit references to the GPR8_NOREX registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
+ unsigned ResultReg = 0; // Stays 0 unless the AH workaround below produces the result.
+ if ((I->getOpcode() == Instruction::SRem ||
+ I->getOpcode() == Instruction::URem) &&
+ OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
+ unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
+ unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Copy), SourceSuperReg).addReg(X86::AX);
+
+ // Shift AX right by 8 bits instead of using AH.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
+ ResultSuperReg).addReg(SourceSuperReg).addImm(8);
+
+ // Now reference the 8-bit subreg of the result.
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
+ /*Kill=*/true, X86::sub_8bit);
+ }
+ // Copy the result out of the physreg if we haven't already.
+ if (!ResultReg) {
+ ResultReg = createResultReg(TypeEntry.RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
+ .addReg(OpEntry.DivRemResultReg);
+ }
+ updateValueMap(I, ResultReg);
+
+ return true;
+}
+
+/// \brief Emit a conditional move instruction (if they are supported) to lower
+/// the select.
+///
+/// Only i16, i32 and i64 results are handled. Returns true once the CMOV has
+/// been emitted and the value map updated; returns false when the subtarget
+/// has no CMOV, the result type is out of range, the compare could not be
+/// emitted, or an operand could not be placed in a register.
+bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
+  // CMOV has to be available on this subtarget.
+  if (!Subtarget->hasCMov())
+    return false;
+
+  // FIXME: Add support for i8.
+  if (RetVT < MVT::i16 || RetVT > MVT::i64)
+    return false;
+
+  const Value *SelCond = I->getOperand(0);
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+  X86::CondCode CC = X86::COND_NE;
+  // Set as soon as one of the folding paths below has taken care of the
+  // condition, so that no explicit TEST of a condition register is emitted.
+  bool FlagsAreSet = false;
+
+  // Fold the compare into the select when both instructions sit in the same
+  // basic block (values defined in other basic blocks may not have
+  // initialized registers).
+  const auto *Cmp = dyn_cast<CmpInst>(SelCond);
+  if (Cmp && Cmp->getParent() == I->getParent()) {
+    CmpInst::Predicate Pred = optimizeCmpPredicate(Cmp);
+
+    // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction; each
+    // table row holds two SETcc opcodes plus the opcode that combines them.
+    static const uint16_t SETFOpcTable[2][3] = {
+      { X86::SETNPr, X86::SETEr , X86::TEST8rr },
+      { X86::SETPr,  X86::SETNEr, X86::OR8rr   }
+    };
+    const uint16_t *SETFOpc = nullptr;
+    if (Pred == CmpInst::FCMP_OEQ) {
+      SETFOpc = SETFOpcTable[0];
+      Pred = CmpInst::ICMP_NE;
+    } else if (Pred == CmpInst::FCMP_UNE) {
+      SETFOpc = SETFOpcTable[1];
+      Pred = CmpInst::ICMP_NE;
+    }
+
+    bool SwapOperands;
+    std::tie(CC, SwapOperands) = getX86ConditionCode(Pred);
+    assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+    const Value *CmpLHS = Cmp->getOperand(SwapOperands ? 1 : 0);
+    const Value *CmpRHS = Cmp->getOperand(SwapOperands ? 0 : 1);
+
+    // Emit a compare of the LHS and RHS, setting the flags.
+    EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
+    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, Cmp->getDebugLoc()))
+      return false;
+
+    if (SETFOpc) {
+      unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+      unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+              FlagReg1);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+              FlagReg2);
+      // The combining opcode may or may not define a register; only allocate
+      // a destination when its description says it has one.
+      const auto &CombineDesc = TII.get(SETFOpc[2]);
+      if (CombineDesc.getNumDefs() == 0) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, CombineDesc)
+          .addReg(FlagReg2).addReg(FlagReg1);
+      } else {
+        unsigned CombinedReg = createResultReg(&X86::GR8RegClass);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, CombineDesc,
+                CombinedReg)
+          .addReg(FlagReg2).addReg(FlagReg1);
+      }
+    }
+    FlagsAreSet = true;
+  } else if (foldX86XALUIntrinsic(CC, I, SelCond)) {
+    // Fake request the condition, otherwise the intrinsic might be completely
+    // optimized away.
+    if (getRegForValue(SelCond) == 0)
+      return false;
+
+    FlagsAreSet = true;
+  }
+
+  if (!FlagsAreSet) {
+    // The select condition is an i1, but it lives in an 8-bit register whose
+    // upper bits may hold garbage; only the least significant bit is
+    // meaningful. Reading more than that bit could yield a non-zero value
+    // even though the condition is false, so TEST against 1 to look at the
+    // low bit only.
+    unsigned CondReg = getRegForValue(SelCond);
+    if (CondReg == 0)
+      return false;
+    const unsigned CondKillState = getKillRegState(hasTrivialKill(SelCond));
+
+    // In case OpReg is a K register, COPY to a GPR
+    if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
+      unsigned MaskReg = CondReg;
+      CondReg = createResultReg(&X86::GR8RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), CondReg)
+        .addReg(MaskReg, CondKillState);
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+      .addReg(CondReg, CondKillState)
+      .addImm(1);
+  }
+
+  const Value *TrueVal = I->getOperand(1);
+  const Value *FalseVal = I->getOperand(2);
+
+  // The false value is requested first; keep that order so any instructions
+  // emitted while materializing the operands appear in the same sequence.
+  unsigned FalseReg = getRegForValue(FalseVal);
+  bool FalseIsKill = hasTrivialKill(FalseVal);
+
+  unsigned TrueReg = getRegForValue(TrueVal);
+  bool TrueIsKill = hasTrivialKill(TrueVal);
+
+  if (TrueReg == 0 || FalseReg == 0)
+    return false;
+
+  unsigned CMovOpc = X86::getCMovFromCond(CC, RC->getSize());
+  unsigned ResultReg = fastEmitInst_rr(CMovOpc, RC, FalseReg, FalseIsKill,
+                                       TrueReg, TrueIsKill);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// \brief Emit SSE or AVX instructions to lower the select.
+///
+/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
+/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
+/// SSE instructions are available. If AVX is available, try to use a VBLENDV;
+/// with AVX-512 a mask compare plus a masked scalar move is used instead.
+///
+/// Only handles f32 (SSE1) and f64 (SSE2) selects whose condition is an fcmp
+/// in the same basic block comparing values of the select's own type.
+/// Returns false when any of those requirements is not met or when one of the
+/// four operands (select values and compare operands) cannot be placed in a
+/// register.
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
+  // Optimize conditions coming from a compare if both instructions are in the
+  // same basic block (values defined in other basic blocks may not have
+  // initialized registers).
+  const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
+  if (!CI || (CI->getParent() != I->getParent()))
+    return false;
+
+  if (I->getType() != CI->getOperand(0)->getType() ||
+      !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
+        (Subtarget->hasSSE2() && RetVT == MVT::f64)))
+    return false;
+
+  const Value *CmpLHS = CI->getOperand(0);
+  const Value *CmpRHS = CI->getOperand(1);
+  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+  // We don't have to materialize a zero constant for this case and can just use
+  // %x again on the RHS.
+  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+    const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+    if (CmpRHSC && CmpRHSC->isNullValue())
+      CmpRHS = CmpLHS;
+  }
+
+  unsigned CC;
+  bool NeedSwap;
+  std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
+  // Condition codes above 7 are rejected here.
+  if (CC > 7)
+    return false;
+
+  if (NeedSwap)
+    std::swap(CmpLHS, CmpRHS);
+
+  // Choose the SSE instruction sequence based on data type (float or double).
+  static const uint16_t OpcTable[2][4] = {
+    { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr },
+    { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr }
+  };
+
+  const uint16_t *Opc = nullptr;
+  switch (RetVT.SimpleTy) {
+  default: return false;
+  case MVT::f32: Opc = &OpcTable[0][0]; break;
+  case MVT::f64: Opc = &OpcTable[1][0]; break;
+  }
+
+  const Value *LHS = I->getOperand(1);
+  const Value *RHS = I->getOperand(2);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  unsigned RHSReg = getRegForValue(RHS);
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  unsigned CmpLHSReg = getRegForValue(CmpLHS);
+  bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
+
+  unsigned CmpRHSReg = getRegForValue(CmpRHS);
+  bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
+
+  // All four operands must have been materialized. The compare operands are
+  // checked through their registers: the Value pointers themselves are never
+  // null here, so testing them would let a failed getRegForValue (register 0)
+  // slip into the emitted compare.
+  if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
+    return false;
+
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+  unsigned ResultReg;
+
+  if (Subtarget->hasAVX512()) {
+    // If we have AVX512 we can use a mask compare and masked movss/sd.
+    const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
+    const TargetRegisterClass *VK1 = &X86::VK1RegClass;
+
+    unsigned CmpOpcode =
+      (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
+    unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+
+    // Need an IMPLICIT_DEF for the input that is used to generate the upper
+    // bits of the result register since it's not based on any of the inputs.
+    unsigned ImplicitDefReg = createResultReg(VR128X);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+
+    // Place RHSReg in the passthru of the masked movss/sd operation and put
+    // LHS in the input. The mask input comes from the compare.
+    unsigned MovOpcode =
+      (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
+    unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill,
+                                        CmpReg, true, ImplicitDefReg, true,
+                                        LHSReg, LHSIsKill);
+
+    // Copy the vector-class result back into the scalar FP register class.
+    ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);
+
+  } else if (Subtarget->hasAVX()) {
+    const TargetRegisterClass *VR128 = &X86::VR128RegClass;
+
+    // If we have AVX, create 1 blendv instead of 3 logic instructions.
+    // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
+    // uses XMM0 as the selection register. That may need just as many
+    // instructions as the AND/ANDN/OR sequence due to register moves, so
+    // don't bother.
+    unsigned CmpOpcode =
+      (RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
+    unsigned BlendOpcode =
+      (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
+
+    unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+    unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
+                                          LHSReg, LHSIsKill, CmpReg, true);
+    ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
+  } else {
+    // Plain SSE: result = (mask AND LHS) OR (mask ANDN RHS), where mask is
+    // the compare result.
+    const TargetRegisterClass *VR128 = &X86::VR128RegClass;
+    unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+    // CmpReg is read again by the ANDN below, so it is not killed here.
+    unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
+                                      LHSReg, LHSIsKill);
+    unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
+                                       RHSReg, RHSIsKill);
+    unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
+                                     AndReg, /*IsKill=*/true);
+    ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
+  }
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// \brief Lower a select to a pseudo CMOV instruction.
+///
+/// The pseudo CMOV is expanded into control-flow later. Supported result
+/// types are i8, i16, i32, f32 and f64; anything else returns false, as does
+/// a failure to emit the compare or to place an operand in a register.
+bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
+  // Pick the pseudo opcode matching the result type.
+  unsigned PseudoOpc;
+  switch (RetVT.SimpleTy) {
+  case MVT::i8:  PseudoOpc = X86::CMOV_GR8;  break;
+  case MVT::i16: PseudoOpc = X86::CMOV_GR16; break;
+  case MVT::i32: PseudoOpc = X86::CMOV_GR32; break;
+  case MVT::f32: PseudoOpc = X86::CMOV_FR32; break;
+  case MVT::f64: PseudoOpc = X86::CMOV_FR64; break;
+  default: return false;
+  }
+
+  const Value *SelCond = I->getOperand(0);
+  X86::CondCode CC = X86::COND_NE;
+
+  // A compare can only be folded when it lives in the same basic block as
+  // the select (values defined in other basic blocks may not have
+  // initialized registers).
+  const auto *Cmp = dyn_cast<CmpInst>(SelCond);
+  const bool FoldCompare = Cmp && Cmp->getParent() == I->getParent();
+  if (!FoldCompare) {
+    // No foldable compare: TEST the low bit of the condition register.
+    unsigned CondReg = getRegForValue(SelCond);
+    if (CondReg == 0)
+      return false;
+    const unsigned CondKillState = getKillRegState(hasTrivialKill(SelCond));
+
+    // In case OpReg is a K register, COPY to a GPR
+    if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
+      unsigned MaskReg = CondReg;
+      CondReg = createResultReg(&X86::GR8RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), CondReg)
+        .addReg(MaskReg, CondKillState);
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+      .addReg(CondReg, CondKillState)
+      .addImm(1);
+  } else {
+    bool SwapOperands;
+    std::tie(CC, SwapOperands) = getX86ConditionCode(Cmp->getPredicate());
+    if (CC > X86::LAST_VALID_COND)
+      return false;
+
+    const Value *CmpLHS = Cmp->getOperand(SwapOperands ? 1 : 0);
+    const Value *CmpRHS = Cmp->getOperand(SwapOperands ? 0 : 1);
+
+    EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
+    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, Cmp->getDebugLoc()))
+      return false;
+  }
+
+  const Value *TrueVal = I->getOperand(1);
+  const Value *FalseVal = I->getOperand(2);
+
+  unsigned TrueReg = getRegForValue(TrueVal);
+  bool TrueIsKill = hasTrivialKill(TrueVal);
+
+  unsigned FalseReg = getRegForValue(FalseVal);
+  bool FalseIsKill = hasTrivialKill(FalseVal);
+
+  if (TrueReg == 0 || FalseReg == 0)
+    return false;
+
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+
+  // Operand order is false value, true value, condition code.
+  unsigned ResultReg = fastEmitInst_rri(PseudoOpc, RC, FalseReg, FalseIsKill,
+                                        TrueReg, TrueIsKill, CC);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// \brief Select a 'select' instruction.
+///
+/// A select whose condition is a compare that folds to always-true or
+/// always-false becomes a plain COPY of the chosen operand. Otherwise three
+/// strategies are tried in order: a real CMOV, an SSE/AVX sequence, and
+/// finally a pseudo CMOV that is later converted to control-flow.
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+  MVT RetVT;
+  if (!isTypeLegal(I->getType(), RetVT))
+    return false;
+
+  // Check if we can fold the select.
+  if (const auto *Cmp = dyn_cast<CmpInst>(I->getOperand(0))) {
+    const CmpInst::Predicate Pred = optimizeCmpPredicate(Cmp);
+    const Value *Chosen = nullptr;
+    if (Pred == CmpInst::FCMP_FALSE)
+      Chosen = I->getOperand(2);
+    else if (Pred == CmpInst::FCMP_TRUE)
+      Chosen = I->getOperand(1);
+
+    // No need for a select anymore - this is an unconditional move.
+    if (Chosen) {
+      unsigned SrcReg = getRegForValue(Chosen);
+      if (SrcReg == 0)
+        return false;
+      const unsigned SrcKillState = getKillRegState(hasTrivialKill(Chosen));
+      unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SrcReg, SrcKillState);
+      updateValueMap(I, ResultReg);
+      return true;
+    }
+  }
+
+  // Try, in order: real conditional move instructions, a sequence of SSE
+  // instructions simulating a conditional move, and as a fall-back pseudo
+  // conditional move instructions, which will be later converted to
+  // control-flow. Short-circuit evaluation stops at the first success.
+  return X86FastEmitCMoveSelect(RetVT, I) ||
+         X86FastEmitSSESelect(RetVT, I) ||
+         X86FastEmitPseudoSelect(RetVT, I);
+}
+
+/// \brief Select an i32 -> float/double sitofp on AVX subtargets.
+///
+/// Returns false without AVX, for sources other than i32, for result types
+/// other than float or double, or when the operand has no register.
+bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
+  // The target-independent selection algorithm in FastISel already knows how
+  // to select a SINT_TO_FP if the target is SSE but not AVX, so only the AVX
+  // case is handled here.
+  if (!Subtarget->hasAVX())
+    return false;
+
+  const Value *Src = I->getOperand(0);
+  if (!Src->getType()->isIntegerTy(32))
+    return false;
+
+  // Select integer to float/double conversion.
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  Type *DstTy = I->getType();
+  const bool ToDouble = DstTy->isDoubleTy();
+  if (!ToDouble && !DstTy->isFloatTy())
+    return false;
+
+  // sitofp int -> double uses VCVTSI2SD, int -> float uses VCVTSI2SS.
+  const unsigned Opcode = ToDouble ? X86::VCVTSI2SDrr : X86::VCVTSI2SSrr;
+  const TargetRegisterClass *RC =
+    ToDouble ? &X86::FR64RegClass : &X86::FR32RegClass;
+
+  // The instruction takes an extra register operand ahead of the integer
+  // source; feed it an IMPLICIT_DEF.
+  unsigned UndefReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TargetOpcode::IMPLICIT_DEF), UndefReg);
+  unsigned ResultReg =
+    fastEmitInst_rr(Opcode, RC, UndefReg, /*Op0IsKill=*/true, SrcReg,
+                    /*Op1IsKill=*/false);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Shared implementation for X86SelectFPExt and X86SelectFPTrunc: emits
+// TargetOpc with a fresh result register of class RC and the instruction's
+// single operand as source. Returns false if the operand has no register.
+bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
+                                          unsigned TargetOpc,
+                                          const TargetRegisterClass *RC) {
+  assert((I->getOpcode() == Instruction::FPExt ||
+          I->getOpcode() == Instruction::FPTrunc) &&
+         "Instruction must be an FPExt or FPTrunc!");
+
+  unsigned SrcReg = getRegForValue(I->getOperand(0));
+  if (SrcReg == 0)
+    return false;
+
+  unsigned ResultReg = createResultReg(RC);
+  MachineInstrBuilder Builder = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                        DbgLoc, TII.get(TargetOpc), ResultReg);
+  // With AVX the source register is added twice, otherwise once.
+  if (Subtarget->hasAVX())
+    Builder.addReg(SrcReg);
+  Builder.addReg(SrcReg);
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// \brief Select an fpext from float to double when X86ScalarSSEf64 is set.
+/// Any other fpext is rejected.
+bool X86FastISel::X86SelectFPExt(const Instruction *I) {
+  const bool IsFloatToDouble = X86ScalarSSEf64 &&
+                               I->getType()->isDoubleTy() &&
+                               I->getOperand(0)->getType()->isFloatTy();
+  if (!IsFloatToDouble)
+    return false;
+
+  // fpext from float to double; use the VEX-prefixed opcode with AVX.
+  unsigned ConvertOpc = X86::CVTSS2SDrr;
+  if (Subtarget->hasAVX())
+    ConvertOpc = X86::VCVTSS2SDrr;
+  return X86SelectFPExtOrFPTrunc(I, ConvertOpc, &X86::FR64RegClass);
+}
+
+/// \brief Select an fptrunc from double to float when X86ScalarSSEf64 is set.
+/// Any other fptrunc is rejected.
+bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
+  const bool IsDoubleToFloat = X86ScalarSSEf64 &&
+                               I->getType()->isFloatTy() &&
+                               I->getOperand(0)->getType()->isDoubleTy();
+  if (!IsDoubleToFloat)
+    return false;
+
+  // fptrunc from double to float; use the VEX-prefixed opcode with AVX.
+  unsigned ConvertOpc = X86::CVTSD2SSrr;
+  if (Subtarget->hasAVX())
+    ConvertOpc = X86::VCVTSD2SSrr;
+  return X86SelectFPExtOrFPTrunc(I, ConvertOpc, &X86::FR32RegClass);
+}
+
+/// \brief Select a trunc whose destination is i8 or i1.
+///
+/// An i8 source is reused as-is. Wider sources get their low 8 bits extracted
+/// as a sub-register; on 32-bit targets the source is first copied into an
+/// ABCD register class. Returns false for other destination types, illegal
+/// source types, or when a register could not be obtained.
+bool X86FastISel::X86SelectTrunc(const Instruction *I) {
+  const Value *Src = I->getOperand(0);
+  EVT SrcVT = TLI.getValueType(DL, Src->getType());
+  EVT DstVT = TLI.getValueType(DL, I->getType());
+
+  // This code only handles truncation to byte.
+  if (!(DstVT == MVT::i8 || DstVT == MVT::i1))
+    return false;
+  if (!TLI.isTypeLegal(SrcVT))
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    // Unhandled operand. Halt "fast" selection and bail.
+    return false;
+
+  if (SrcVT == MVT::i8) {
+    // Truncate from i8 to i1; no code needed.
+    updateValueMap(I, SrcReg);
+    return true;
+  }
+
+  // If we're on x86-32; we can't extract an i8 from a general register.
+  // First issue a copy to GR16_ABCD or GR32_ABCD. That copy is a temporary
+  // owned by this function, so the extract below may kill it.
+  const bool NeedsABCDCopy = !Subtarget->is64Bit();
+  if (NeedsABCDCopy) {
+    const TargetRegisterClass *ABCDClass = &X86::GR32_ABCDRegClass;
+    if (SrcVT == MVT::i16)
+      ABCDClass = &X86::GR16_ABCDRegClass;
+    unsigned ABCDReg = createResultReg(ABCDClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ABCDReg).addReg(SrcReg);
+    SrcReg = ABCDReg;
+  }
+
+  // Issue an extract_subreg.
+  unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, SrcReg,
+                                                  /*Kill=*/NeedsABCDCopy,
+                                                  X86::sub_8bit);
+  if (ResultReg == 0)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// \brief Return true if a memcpy of \p Len bytes is small enough to inline:
+/// at most 32 bytes on 64-bit subtargets and at most 16 bytes otherwise.
+bool X86FastISel::IsMemcpySmall(uint64_t Len) {
+  const uint64_t InlineLimit = Subtarget->is64Bit() ? 32 : 16;
+  return Len <= InlineLimit;
+}
+
+/// \brief Inline a small, fixed-length memcpy as a series of integer
+/// load/store pairs.
+///
+/// \param DestAM  Destination address; taken by value, its displacement is
+///                advanced locally after every chunk.
+/// \param SrcAM   Source address; taken by value and advanced the same way.
+/// \param Len     Number of bytes to copy.
+/// \returns false, emitting nothing, when \p Len exceeds the IsMemcpySmall
+///          limit; true once all chunks have been emitted.
+bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
+                                     X86AddressMode SrcAM, uint64_t Len) {
+
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!IsMemcpySmall(Len))
+    return false;
+
+  bool i64Legal = Subtarget->is64Bit();
+
+  // We don't care about alignment here since we just emit integer accesses.
+  while (Len) {
+    // Use the widest integer type that still fits in the remaining length.
+    MVT VT;
+    if (Len >= 8 && i64Legal)
+      VT = MVT::i64;
+    else if (Len >= 4)
+      VT = MVT::i32;
+    else if (Len >= 2)
+      VT = MVT::i16;
+    else
+      VT = MVT::i8;
+
+    // Initialized so the store below never reads an indeterminate value
+    // should the load fail to produce a register.
+    unsigned Reg = 0;
+    bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
+    RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
+    assert(RV && "Failed to emit load or store??");
+    (void)RV; // Only consumed by the assert; keep NDEBUG builds warning-free.
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    DestAM.Disp += Size;
+    SrcAM.Disp += Size;
+  }
+
+  return true;
+}
+
+bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+ // FIXME: Handle more intrinsics.
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::convert_from_fp16:
+ case Intrinsic::convert_to_fp16: {
+ if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())
+ return false;
+
+ const Value *Op = II->getArgOperand(0);
+ unsigned InputReg = getRegForValue(Op);
+ if (InputReg == 0)
+ return false;
+
+ // F16C only allows converting from float to half and from half to float.
+ bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
+ if (IsFloatToHalf) {
+ if (!Op->getType()->isFloatTy())
+ return false;
+ } else {
+ if (!II->getType()->isFloatTy())
+ return false;
+ }
+
+ unsigned ResultReg = 0;
+ const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
+ if (IsFloatToHalf) {
+ // 'InputReg' is implicitly promoted from register class FR32 to
+ // register class VR128 by method 'constrainOperandRegClass' which is
+ // directly called by 'fastEmitInst_ri'.
+ // Instruction VCVTPS2PHrr takes an extra immediate operand which is
+ // used to provide rounding control: use MXCSR.RC, encoded as 0b100.
+ // It's consistent with the other FP instructions, which are usually
+ // controlled by MXCSR.
+ InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4);
+
+ // Move the lower 32-bits of ResultReg to another register of class GR32.
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::VMOVPDI2DIrr), ResultReg)
+ .addReg(InputReg, RegState::Kill);
+
+ // The result value is in the lower 16-bits of ResultReg.
+ unsigned RegIdx = X86::sub_16bit;
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
+ } else {
+ assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
+ // Explicitly sign-extend the input to 32-bit.
+ InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg,
+ /*Kill=*/false);
+
+ // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
+ InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
+ InputReg, /*Kill=*/true);
+
+ InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true);
+
+ // The result value is in the lower 32-bits of ResultReg.
+ // Emit an explicit copy from register class VR128 to register class FR32.
+ ResultReg = createResultReg(&X86::FR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(InputReg, RegState::Kill);
+ }
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::frameaddress: {
+ MachineFunction *MF = FuncInfo.MF;
+ if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
+ return false;
+
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC = nullptr;
+
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Invalid result type for frameaddress.");
+ case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
+ case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
+ }
+
+ // This needs to be set before we call getPtrSizedFrameRegister, otherwise
+ // we get the wrong frame register.
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
+ assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+ (FrameReg == X86::EBP && VT == MVT::i32)) &&
+ "Invalid Frame Register!");
+
+ // Always make a copy of the frame register to a vreg first, so that we
+ // never directly reference the frame register (the TwoAddressInstruction-
+ // Pass doesn't like that).
+ unsigned SrcReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
+
+ // Now recursively load from the frame address.
+ // movq (%rbp), %rax
+ // movq (%rax), %rax
+ // movq (%rax), %rax
+ // ...
+ unsigned DestReg;
+ unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+ while (Depth--) {
+ DestReg = createResultReg(RC);
+ addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), DestReg), SrcReg);
+ SrcReg = DestReg;
+ }
+
+ updateValueMap(II, SrcReg);
+ return true;
+ }
+ case Intrinsic::memcpy: {
+ const MemCpyInst *MCI = cast<MemCpyInst>(II);
+ // Don't handle volatile or variable length memcpys.
+ if (MCI->isVolatile())
+ return false;
+
+ if (isa<ConstantInt>(MCI->getLength())) {
+ // Small memcpy's are common enough that we want to do them
+ // without a call if possible.
+ uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
+ if (IsMemcpySmall(Len)) {
+ X86AddressMode DestAM, SrcAM;
+ if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
+ !X86SelectAddress(MCI->getRawSource(), SrcAM))
+ return false;
+ TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+ return true;
+ }
+ }
+
+ unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+ if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
+ return false;
+
+ if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
+ return false;
+
+ return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
+ }
+ case Intrinsic::memset: {
+ const MemSetInst *MSI = cast<MemSetInst>(II);
+
+ if (MSI->isVolatile())
+ return false;
+
+ unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+ if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
+ return false;
+
+ if (MSI->getDestAddressSpace() > 255)
+ return false;
+
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ }
+ case Intrinsic::stackprotector: {
+ // Emit code to store the stack guard onto the stack.
+ EVT PtrTy = TLI.getPointerTy(DL);
+
+ const Value *Op1 = II->getArgOperand(0); // The guard's value.
+ const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
+
+ MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
+
+ // Grab the frame index.
+ X86AddressMode AM;
+ if (!X86SelectAddress(Slot, AM)) return false;
+ if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
+ return true;
+ }
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
+ X86AddressMode AM;
+ assert(DI->getAddress() && "Null address should be checked earlier!");
+ if (!X86SelectAddress(DI->getAddress(), AM))
+ return false;
+ const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+ // FIXME may need to add RegState::Debug to any registers produced,
+ // although ESP/EBP should be the only ones at the moment.
+ assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
+ "Expected inlined-at fields to agree");
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
+ .addImm(0)
+ .addMetadata(DI->getVariable())
+ .addMetadata(DI->getExpression());
+ return true;
+ }
+ case Intrinsic::trap: {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
+ return true;
+ }
+ case Intrinsic::sqrt: {
+ if (!Subtarget->hasSSE1())
+ return false;
+
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
+ // is not generated by FastISel yet.
+ // FIXME: Update this code once tablegen can handle it.
+ static const uint16_t SqrtOpc[2][2] = {
+ {X86::SQRTSSr, X86::VSQRTSSr},
+ {X86::SQRTSDr, X86::VSQRTSDr}
+ };
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
+ case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
+ }
+
+ const Value *SrcVal = II->getArgOperand(0);
+ unsigned SrcReg = getRegForValue(SrcVal);
+
+ if (SrcReg == 0)
+ return false;
+
+ unsigned ImplicitDefReg = 0;
+ if (HasAVX) {
+ ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+ }
+
+ unsigned ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ if (ImplicitDefReg)
+ MIB.addReg(ImplicitDefReg);
+
+ MIB.addReg(SrcReg);
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: {
+ // This implements the basic lowering of the xalu with overflow intrinsics
+ // into add/sub/mul followed by either seto or setb.
+ const Function *Callee = II->getCalledFunction();
+ auto *Ty = cast<StructType>(Callee->getReturnType());
+ Type *RetTy = Ty->getTypeAtIndex(0U);
+ assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&
+ Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&
+ "Overflow value expected to be an i1");
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ if (VT < MVT::i8 || VT > MVT::i64)
+ return false;
+
+ const Value *LHS = II->getArgOperand(0);
+ const Value *RHS = II->getArgOperand(1);
+
+ // Canonicalize immediate to the RHS.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+ isCommutativeIntrinsic(II))
+ std::swap(LHS, RHS);
+
+ bool UseIncDec = false;
+ if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
+ UseIncDec = true;
+
+ unsigned BaseOpc, CondOpc;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::sadd_with_overflow:
+ BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
+ CondOpc = X86::SETOr;
+ break;
+ case Intrinsic::uadd_with_overflow:
+ BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+ case Intrinsic::ssub_with_overflow:
+ BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
+ CondOpc = X86::SETOr;
+ break;
+ case Intrinsic::usub_with_overflow:
+ BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+ case Intrinsic::smul_with_overflow:
+ BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
+ case Intrinsic::umul_with_overflow:
+ BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+ }
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (LHSReg == 0)
+ return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned ResultReg = 0;
+ // Check if we have an immediate version.
+ if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
+ static const uint16_t Opc[2][4] = {
+ { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
+ { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
+ };
+
+ if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
+ ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ bool IsDec = BaseOpc == X86ISD::DEC;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ } else
+ ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+ CI->getZExtValue());
+ }
+
+ unsigned RHSReg;
+ bool RHSIsKill;
+ if (!ResultReg) {
+ RHSReg = getRegForValue(RHS);
+ if (RHSReg == 0)
+ return false;
+ RHSIsKill = hasTrivialKill(RHS);
+ ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill);
+ }
+
+ // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
+ // it manually.
+ if (BaseOpc == X86ISD::UMUL && !ResultReg) {
+ static const uint16_t MULOpc[] =
+ { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
+ static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+ // First copy the first operand into RAX, which is an implicit input to
+ // the X86::MUL*r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+ } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
+ static const uint16_t MULOpc[] =
+ { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
+ if (VT == MVT::i8) {
+ // Copy the first operand into AL, which is an implicit input to the
+ // X86::IMUL8r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), X86::AL)
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
+ RHSIsKill);
+ } else
+ ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
+ RHSReg, RHSIsKill);
+ }
+
+ if (!ResultReg)
+ return false;
+
+ // Assign to a GPR since the overflow return value is lowered to a SETcc.
+ unsigned ResultReg2 = createResultReg(&X86::GR8RegClass);
+ assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
+ ResultReg2);
+
+ updateValueMap(II, ResultReg, 2);
+ return true;
+ }
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64: {
+ bool IsInputDouble;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic.");
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ if (!Subtarget->hasSSE1())
+ return false;
+ IsInputDouble = false;
+ break;
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ if (!Subtarget->hasSSE2())
+ return false;
+ IsInputDouble = true;
+ break;
+ }
+
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ static const uint16_t CvtOpc[2][2][2] = {
+ { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
+ { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
+ { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
+ { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
+ };
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected result type.");
+ case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
+ case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
+ }
+
+ // Check if we can fold insertelement instructions into the convert.
+ const Value *Op = II->getArgOperand(0);
+ while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
+ const Value *Index = IE->getOperand(2);
+ if (!isa<ConstantInt>(Index))
+ break;
+ unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+
+ if (Idx == 0) {
+ Op = IE->getOperand(1);
+ break;
+ }
+ Op = IE->getOperand(0);
+ }
+
+ unsigned Reg = getRegForValue(Op);
+ if (Reg == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(Reg);
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ }
+}
+
+/// Try to lower the incoming formal arguments on the fast path.
+///
+/// Returns false unless the function can lower its return, is not variadic,
+/// uses the C calling convention, is not Win64, targets 64-bit mode, and every
+/// argument is a plain i32/i64/f32/f64 scalar carrying none of the ByVal,
+/// InReg, StructRet, SwiftSelf, SwiftError or Nest attributes. At most 6
+/// integer and 8 floating-point arguments are accepted, so every argument is
+/// taken from one of the argument register tables below. On success each
+/// argument is copied out of its live-in register and recorded in the value
+/// map.
+bool X86FastISel::fastLowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg())
+ return false;
+
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC != CallingConv::C)
+ return false;
+
+ if (Subtarget->isCallingConvWin64(CC))
+ return false;
+
+ if (!Subtarget->is64Bit())
+ return false;
+
+ // Only handle simple cases, i.e. up to 6 i32/i64 and up to 8 f32/f64 scalar
+ // arguments.
+ unsigned GPRCnt = 0;
+ unsigned FPRCnt = 0;
+ unsigned Idx = 0;
+ for (auto const &Arg : F->args()) {
+ // The first argument is at index 1.
+ ++Idx;
+ if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+ return false;
+
+ Type *ArgTy = Arg.getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+ return false;
+
+ EVT ArgVT = TLI.getValueType(DL, ArgTy);
+ if (!ArgVT.isSimple()) return false;
+ switch (ArgVT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i32:
+ case MVT::i64:
+ ++GPRCnt;
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ // Floating-point arguments are only handled when SSE1 is available.
+ if (!Subtarget->hasSSE1())
+ return false;
+ ++FPRCnt;
+ break;
+ }
+
+ if (GPRCnt > 6)
+ return false;
+
+ if (FPRCnt > 8)
+ return false;
+ }
+
+ static const MCPhysReg GPR32ArgRegs[] = {
+ X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
+ };
+ static const MCPhysReg GPR64ArgRegs[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
+ };
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+
+ // Every argument was validated above, so each one simply takes the next
+ // unused entry of the integer or XMM register table.
+ unsigned GPRIdx = 0;
+ unsigned FPRIdx = 0;
+ for (auto const &Arg : F->args()) {
+ MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+ unsigned SrcReg;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type.");
+ case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
+ case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
+ case MVT::f32: LLVM_FALLTHROUGH;
+ case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
+ }
+ unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+ // Without this, EmitLiveInCopies may eliminate the livein if its only
+ // use is a bitcast (which isn't turned into an instruction).
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(DstReg, getKillRegState(true));
+ updateValueMap(&Arg, ResultReg);
+ }
+ return true;
+}
+
+/// Compute how many bytes the callee pops from the stack on account of a
+/// struct-return argument.
+///
+/// The result is 0 on 64-bit targets, on MSVCRT-based targets, and for the
+/// Fast, GHC and HiPE calling conventions. When a call site is supplied, the
+/// result is also 0 unless the call has arguments, parameter 1 is marked
+/// StructRet without InReg, and the target is not an MCU target. In every
+/// remaining case the result is 4 (presumably the size of the hidden sret
+/// pointer on 32-bit x86 -- confirm against the ABI).
+static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
+                                                  CallingConv::ID CC,
+                                                  ImmutableCallSite *CS) {
+  if (Subtarget->is64Bit() || Subtarget->getTargetTriple().isOSMSVCRT())
+    return 0;
+
+  switch (CC) {
+  case CallingConv::Fast:
+  case CallingConv::GHC:
+  case CallingConv::HiPE:
+    return 0;
+  default:
+    break;
+  }
+
+  // Without a call site there is nothing further to inspect.
+  if (CS) {
+    const bool CalleePopsSRet = !CS->arg_empty() &&
+                                CS->paramHasAttr(1, Attribute::StructRet) &&
+                                !CS->paramHasAttr(1, Attribute::InReg) &&
+                                !Subtarget->isTargetMCU();
+    if (!CalleePopsSRet)
+      return 0;
+  }
+
+  return 4;
+}
+
+/// Try to lower the call described by \p CLI on the fast path.
+///
+/// Returns false when the call cannot be handled here: unsupported calling
+/// convention, tail call, fastcc under guaranteed tail-call optimization,
+/// Win64 varargs, inalloca or swifterror arguments, illegal argument types,
+/// or any failure while emitting an argument or the callee address.
+///
+/// Otherwise the sequence emitted is: CALLSEQ_START, argument copies/stores,
+/// the GOT pointer in EBX for GOT-style PIC, the XMM register count in AL for
+/// non-Win64 x86-64 varargs, the call itself, CALLSEQ_END, and finally copies
+/// of the return values out of their physical registers. On success
+/// CLI.ResultReg, CLI.NumResultRegs and CLI.Call are filled in, and
+/// CLI.OutRegs / CLI.InRegs receive the physical registers used.
+bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+ auto &OutVals = CLI.OutVals;
+ auto &OutFlags = CLI.OutFlags;
+ auto &OutRegs = CLI.OutRegs;
+ auto &Ins = CLI.Ins;
+ auto &InRegs = CLI.InRegs;
+ CallingConv::ID CC = CLI.CallConv;
+ bool &IsTailCall = CLI.IsTailCall;
+ bool IsVarArg = CLI.IsVarArg;
+ const Value *Callee = CLI.Callee;
+ MCSymbol *Symbol = CLI.Symbol;
+
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWin64 = Subtarget->isCallingConvWin64(CC);
+
+ // Handle only the calling conventions listed below for now.
+ switch (CC) {
+ default: return false;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::WebKit_JS:
+ case CallingConv::Swift:
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_64_Win64:
+ case CallingConv::X86_64_SysV:
+ break;
+ }
+
+ // Allow SelectionDAG isel to handle tail calls.
+ if (IsTailCall)
+ return false;
+
+ // fastcc with -tailcallopt is intended to provide a guaranteed
+ // tail call optimization. Fastisel doesn't know how to do that.
+ if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ return false;
+
+ // Don't know how to handle Win64 varargs yet. Nothing special needed for
+ // x86-32. Special handling for x86-64 is implemented.
+ if (IsVarArg && IsWin64)
+ return false;
+
+ // Don't know about inalloca yet.
+ if (CLI.CS && CLI.CS->hasInAllocaArgument())
+ return false;
+
+ // swifterror arguments are not handled here either.
+ for (auto Flag : CLI.OutFlags)
+ if (Flag.isSwiftError())
+ return false;
+
+ // Per-argument value types and source registers, indexed like OutVals.
+ SmallVector<MVT, 16> OutVTs;
+ SmallVector<unsigned, 16> ArgRegs;
+
+ // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
+ // instruction. This is safe because it is common to all FastISel supported
+ // calling conventions on x86.
+ for (int i = 0, e = OutVals.size(); i != e; ++i) {
+ Value *&Val = OutVals[i];
+ ISD::ArgFlagsTy Flags = OutFlags[i];
+ if (auto *CI = dyn_cast<ConstantInt>(Val)) {
+ if (CI->getBitWidth() < 32) {
+ if (Flags.isSExt())
+ Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
+ else
+ Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
+ }
+ }
+
+ // Passing bools around ends up doing a trunc to i1 and passing it.
+ // Codegen this as an argument + "and 1".
+ MVT VT;
+ auto *TI = dyn_cast<TruncInst>(Val);
+ unsigned ResultReg;
+ if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
+ (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
+ TI->hasOneUse()) {
+ Value *PrevVal = TI->getOperand(0);
+ ResultReg = getRegForValue(PrevVal);
+
+ if (!ResultReg)
+ return false;
+
+ if (!isTypeLegal(PrevVal->getType(), VT))
+ return false;
+
+ ResultReg =
+ fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
+ } else {
+ if (!isTypeLegal(Val->getType(), VT))
+ return false;
+ ResultReg = getRegForValue(Val);
+ }
+
+ if (!ResultReg)
+ return false;
+
+ ArgRegs.push_back(ResultReg);
+ OutVTs.push_back(VT);
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
+
+ // Allocate shadow area for Win64
+ if (IsWin64)
+ CCInfo.AllocateStack(32, 8);
+
+ CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+ .addImm(NumBytes).addImm(0);
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign const &VA = ArgLocs[i];
+ const Value *ArgVal = OutVals[VA.getValNo()];
+ MVT ArgVT = OutVTs[VA.getValNo()];
+
+ if (ArgVT == MVT::x86mmx)
+ return false;
+
+ unsigned ArgReg = ArgRegs[VA.getValNo()];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+
+ if (ArgVT == MVT::i1)
+ return false;
+
+ bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::ZExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+
+ // Handle zero-extension from i1 to i8, which is common.
+ if (ArgVT == MVT::i1) {
+ // Set the high bits to zero.
+ ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
+ ArgVT = MVT::i8;
+
+ if (ArgReg == 0)
+ return false;
+ }
+
+ bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::AExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+ // Try an any-extend first, then fall back to zero- and sign-extension.
+ bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+
+ assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::BCvt: {
+ ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
+ /*TODO: Kill=*/false);
+ assert(ArgReg && "Failed to emit a bitcast!");
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::VExt:
+ // VExt has not been implemented, so this should be impossible to reach
+ // for now. However, fallback to Selection DAG isel once implemented.
+ return false;
+ case CCValAssign::AExtUpper:
+ case CCValAssign::SExtUpper:
+ case CCValAssign::ZExtUpper:
+ case CCValAssign::FPExt:
+ llvm_unreachable("Unexpected loc info!");
+ case CCValAssign::Indirect:
+ // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
+ // support this.
+ return false;
+ }
+
+ if (VA.isRegLoc()) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+ OutRegs.push_back(VA.getLocReg());
+ } else {
+ assert(VA.isMemLoc());
+
+ // Don't emit stores for undef values.
+ if (isa<UndefValue>(ArgVal))
+ continue;
+
+ // The argument lives at [stack register + LocMemOffset].
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ X86AddressMode AM;
+ AM.Base.Reg = RegInfo->getStackRegister();
+ AM.Disp = LocMemOffset;
+ ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
+ unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
+ MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+ if (Flags.isByVal()) {
+ X86AddressMode SrcAM;
+ SrcAM.Base.Reg = ArgReg;
+ if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
+ return false;
+ } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
+ // If this is a really simple value, emit this with the Value* version
+ // of X86FastEmitStore. If it isn't simple, we don't want to do this,
+ // as it can cause us to reevaluate the argument.
+ if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
+ return false;
+ } else {
+ bool ValIsKill = hasTrivialKill(ArgVal);
+ if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
+ return false;
+ }
+ }
+ }
+
+ // ELF / PIC requires GOT in the EBX register before function calls via PLT
+ // GOT pointer.
+ if (Subtarget->isPICStyleGOT()) {
+ unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
+ }
+
+ if (Is64Bit && IsVarArg && !IsWin64) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+ // the number of registers, but must be an ubound on the number of SSE
+ // registers used and is in the range 0 - 8 inclusive.
+
+ // Count the number of XMM registers allocated.
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+ X86::AL).addImm(NumXMMRegs);
+ }
+
+ // Materialize callee address in a register. FIXME: GV address can be
+ // handled with a CALLpcrel32 instead.
+ X86AddressMode CalleeAM;
+ if (!X86SelectCallAddress(Callee, CalleeAM))
+ return false;
+
+ // Exactly one of GV (direct call) or CalleeOp (indirect call) is used below.
+ unsigned CalleeOp = 0;
+ const GlobalValue *GV = nullptr;
+ if (CalleeAM.GV != nullptr) {
+ GV = CalleeAM.GV;
+ } else if (CalleeAM.Base.Reg != 0) {
+ CalleeOp = CalleeAM.Base.Reg;
+ } else
+ return false;
+
+ // Issue the call.
+ MachineInstrBuilder MIB;
+ if (CalleeOp) {
+ // Register-indirect call.
+ unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
+ .addReg(CalleeOp);
+ } else {
+ // Direct call.
+ assert(GV && "Not a direct call");
+ unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+
+ // See if we need any target-specific flags on the GV operand.
+ unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
+ // Ignore NonLazyBind attribute in FastISel
+ if (OpFlags == X86II::MO_GOTPCREL)
+ OpFlags = 0;
+
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
+ if (Symbol)
+ MIB.addSym(Symbol, OpFlags);
+ else
+ MIB.addGlobalAddress(GV, 0, OpFlags);
+ }
+
+ // Add a register mask operand representing the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+ // Add an implicit use GOT pointer in EBX.
+ if (Subtarget->isPICStyleGOT())
+ MIB.addReg(X86::EBX, RegState::Implicit);
+
+ if (Is64Bit && IsVarArg && !IsWin64)
+ MIB.addReg(X86::AL, RegState::Implicit);
+
+ // Add implicit physical register uses to the call.
+ for (auto Reg : OutRegs)
+ MIB.addReg(Reg, RegState::Implicit);
+
+ // Issue CALLSEQ_END
+ unsigned NumBytesForCalleeToPop =
+ X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
+ TM.Options.GuaranteedTailCallOpt)
+ ? NumBytes // Callee pops everything.
+ : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CS);
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+ .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
+
+ // Now handle call return values.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
+ CLI.RetTy->getContext());
+ CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ EVT CopyVT = VA.getValVT();
+ unsigned CopyReg = ResultReg + i;
+
+ // If this is x86-64, and we disabled SSE, we can't return FP values
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ report_fatal_error("SSE register return with SSE disabled");
+ }
+
+ // If we prefer to use the value in xmm registers, copy it out as f80 and
+ // use a truncate to move it from fp stack reg to xmm reg.
+ if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
+ CopyVT = MVT::f80;
+ CopyReg = createResultReg(&X86::RFP80RegClass);
+ }
+
+ // Copy out the result.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+ InRegs.push_back(VA.getLocReg());
+
+ // Round the f80 to the right size, which also moves it to the appropriate
+ // xmm register. This is accomplished by storing the f80 value in memory
+ // and then loading it back.
+ if (CopyVT != VA.getValVT()) {
+ EVT ResVT = VA.getValVT();
+ unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+ unsigned MemSize = ResVT.getSizeInBits()/8;
+ int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc)), FI)
+ .addReg(CopyReg);
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg + i), FI);
+ }
+ }
+
+ // Publish the results for the caller of this hook.
+ CLI.ResultReg = ResultReg;
+ CLI.NumResultRegs = RVLocs.size();
+ CLI.Call = MIB;
+
+ return true;
+}
+
+/// Select a single IR instruction \p I by dispatching on its opcode to the
+/// matching X86Select* routine. Pointer/integer casts and vector bitcasts are
+/// handled inline. Returns false for every instruction not handled here.
+bool
+X86FastISel::fastSelectInstruction(const Instruction *I) {
+ switch (I->getOpcode()) {
+ default: break;
+ case Instruction::Load:
+ return X86SelectLoad(I);
+ case Instruction::Store:
+ return X86SelectStore(I);
+ case Instruction::Ret:
+ return X86SelectRet(I);
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return X86SelectCmp(I);
+ case Instruction::ZExt:
+ return X86SelectZExt(I);
+ case Instruction::Br:
+ return X86SelectBranch(I);
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ return X86SelectShift(I);
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return X86SelectDivRem(I);
+ case Instruction::Select:
+ return X86SelectSelect(I);
+ case Instruction::Trunc:
+ return X86SelectTrunc(I);
+ case Instruction::FPExt:
+ return X86SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return X86SelectFPTrunc(I);
+ case Instruction::SIToFP:
+ return X86SelectSIToFP(I);
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ // Widening and narrowing casts reuse the zext/trunc selectors; a cast
+ // between equally sized types just reuses the source register.
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return X86SelectZExt(I);
+ if (DstVT.bitsLT(SrcVT))
+ return X86SelectTrunc(I);
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0) return false;
+ updateValueMap(I, Reg);
+ return true;
+ }
+ case Instruction::BitCast: {
+ // Select SSE2/AVX/AVX-512 bitcasts between 128/256/512 bit vector types.
+ if (!Subtarget->hasSSE2())
+ return false;
+
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+
+ if (!SrcVT.isSimple() || !DstVT.isSimple())
+ return false;
+
+ MVT SVT = SrcVT.getSimpleVT();
+ MVT DVT = DstVT.getSimpleVT();
+
+ // 256-bit sources need AVX. 512-bit sources need AVX-512 and, without BWI,
+ // source and destination elements of at least 32 bits.
+ if (!SVT.is128BitVector() &&
+ !(Subtarget->hasAVX() && SVT.is256BitVector()) &&
+ !(Subtarget->hasAVX512() && SVT.is512BitVector() &&
+ (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 &&
+ DVT.getScalarSizeInBits() >= 32))))
+ return false;
+
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0)
+ return false;
+
+ // No instruction is needed for conversion. Reuse the register used by
+ // the first operand.
+ updateValueMap(I, Reg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// Materialize the integer constant \p CI of type \p VT into a register and
+/// return that register, or 0 when \p VT compares greater than MVT::i64.
+///
+/// Zero is produced with MOV32r0 and then narrowed with a sub-register
+/// extract (i1/i8/i16) or widened with SUBREG_TO_REG (i64). Other values use
+/// a MOV*ri of the matching width; for i64 the choice between MOV32ri (then
+/// widened with SUBREG_TO_REG), MOV64ri32 and MOV64ri depends on whether the
+/// immediate fits in 32 unsigned bits, 32 signed bits, or neither.
+unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
+ if (VT > MVT::i64)
+ return 0;
+
+ uint64_t Imm = CI->getZExtValue();
+ if (Imm == 0) {
+ unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type");
+ case MVT::i1:
+ case MVT::i8:
+ return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
+ X86::sub_8bit);
+ case MVT::i16:
+ return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
+ X86::sub_16bit);
+ case MVT::i32:
+ return SrcReg;
+ case MVT::i64: {
+ unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+ .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+ return ResultReg;
+ }
+ }
+ }
+
+ unsigned Opc = 0;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type");
+ // i1 constants are materialized as i8.
+ case MVT::i1: VT = MVT::i8; LLVM_FALLTHROUGH;
+ case MVT::i8: Opc = X86::MOV8ri; break;
+ case MVT::i16: Opc = X86::MOV16ri; break;
+ case MVT::i32: Opc = X86::MOV32ri; break;
+ case MVT::i64: {
+ if (isUInt<32>(Imm))
+ Opc = X86::MOV32ri;
+ else if (isInt<32>(Imm))
+ Opc = X86::MOV64ri32;
+ else
+ Opc = X86::MOV64ri;
+ break;
+ }
+ }
+ // A 64-bit value that fits in 32 unsigned bits: emit the 32-bit move and
+ // wrap the result into a 64-bit register with SUBREG_TO_REG.
+ if (VT == MVT::i64 && Opc == X86::MOV32ri) {
+ unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm);
+ unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+ .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+ return ResultReg;
+ }
+ return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
+}
+
+/// Materialize the floating-point constant \p CFP of type \p VT by loading it
+/// from the constant pool, and return the result register.
+///
+/// Null values are delegated to fastMaterializeFloatZero. Returns 0 for code
+/// models other than Small and Large and for any type other than f32/f64.
+/// The load opcode and register class follow X86ScalarSSEf32/f64: an SSE (or
+/// AVX) scalar load when set, an x87 load otherwise.
+unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
+ if (CFP->isNullValue())
+ return fastMaterializeFloatZero(CFP);
+
+ // Can't handle alternate code models yet.
+ CodeModel::Model CM = TM.getCodeModel();
+ if (CM != CodeModel::Small && CM != CodeModel::Large)
+ return 0;
+
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ switch (VT.SimpleTy) {
+ default: return 0;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+ RC = &X86::FR32RegClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = &X86::RFP32RegClass;
+ }
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+ RC = &X86::FR64RegClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = &X86::RFP64RegClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return 0;
+ }
+
+ // MachineConstantPool wants an explicit alignment.
+ unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+ if (Align == 0) {
+ // Alignment of vector types. FIXME!
+ Align = DL.getTypeAllocSize(CFP->getType());
+ }
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
+ if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
+ PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ else if (OpFlag == X86II::MO_GOTOFF)
+ PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ else if (Subtarget->is64Bit() && TM.getCodeModel() == CodeModel::Small)
+ PICBase = X86::RIP;
+
+ // Create the load from the constant pool.
+ unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
+ unsigned ResultReg = createResultReg(RC);
+
+ if (CM == CodeModel::Large) {
+ // Large code model: put the full 64-bit address of the pool entry in a
+ // register first, then load through that register.
+ unsigned AddrReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+ AddrReg)
+ .addConstantPoolIndex(CPI, 0, OpFlag);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg);
+ addDirectMem(MIB, AddrReg);
+ // NOTE(review): the memory operand size is the pointer size, not the size
+ // of the f32/f64 being loaded -- confirm this is intended.
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(*FuncInfo.MF),
+ MachineMemOperand::MOLoad, DL.getPointerSize(), Align);
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+ return ResultReg;
+ }
+
+ addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg),
+ CPI, PICBase, OpFlag);
+ return ResultReg;
+}
+
+/// Materialize the address of the global \p GV into a register of the class
+/// used for \p VT and return it. Returns 0 when the code model is not Small
+/// or when no address mode can be selected for \p GV.
+unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
+  // Only the small code model is supported so far.
+  if (TM.getCodeModel() != CodeModel::Small)
+    return 0;
+
+  // Addresses are materialized with LEA/MOV instructions.
+  X86AddressMode Addr;
+  if (!X86SelectAddress(GV, Addr))
+    return 0;
+
+  // An address that is nothing but a base register needs no instruction.
+  const bool IsPlainBaseReg = Addr.BaseType == X86AddressMode::RegBase &&
+                              Addr.IndexReg == 0 && Addr.Disp == 0 &&
+                              Addr.GV == nullptr;
+  if (IsPlainBaseReg)
+    return Addr.Base.Reg;
+
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+
+  if (TM.getRelocationModel() == Reloc::Static &&
+      TLI.getPointerTy(DL) == MVT::i64) {
+    // The target may be more than 32 bits away, so a move with a full 64-bit
+    // immediate is required.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+            DestReg)
+        .addGlobalAddress(GV);
+    return DestReg;
+  }
+
+  // Otherwise compute the address with the LEA flavour matching the pointer
+  // width (and the ILP32 ABI when it is in use).
+  unsigned LeaOpc = X86::LEA64r;
+  if (TLI.getPointerTy(DL) == MVT::i32)
+    LeaOpc = Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r;
+  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                         TII.get(LeaOpc), DestReg),
+                 Addr);
+  return DestReg;
+}
+
+/// Materialize the constant \p C into a register by dispatching on its kind:
+/// integer constants, floating-point constants and global values each have a
+/// dedicated helper. Returns 0 for non-simple value types and for every other
+/// kind of constant.
+unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
+  const EVT ValueTy = TLI.getValueType(DL, C->getType(), true);
+
+  // Extended (non-simple) value types are not handled.
+  if (!ValueTy.isSimple())
+    return 0;
+  const MVT SimpleVT = ValueTy.getSimpleVT();
+
+  if (const auto *IntC = dyn_cast<ConstantInt>(C))
+    return X86MaterializeInt(IntC, SimpleVT);
+  if (const auto *FPC = dyn_cast<ConstantFP>(C))
+    return X86MaterializeFP(FPC, SimpleVT);
+  if (const auto *Global = dyn_cast<GlobalValue>(C))
+    return X86MaterializeGV(Global, SimpleVT);
+
+  return 0;
+}
+
+/// Materialize the address of the static alloca \p C with an LEA and return
+/// the register holding it. Returns 0 for allocas that are not in the static
+/// alloca map or when no address mode can be selected.
+unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
+  // Dynamic allocas are rejected up front. getRegForValue has already
+  // consulted its CSE maps by the time we get here, so a dynamic alloca cannot
+  // succeed. X86SelectAddress performs the same check because it is called
+  // directly from several places, but repeating it here is what prevents
+  // recursion between getRegForValue, X86SelectAddress and this function.
+  if (FuncInfo.StaticAllocaMap.count(C) == 0)
+    return 0;
+  assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
+
+  X86AddressMode FrameAddr;
+  if (!X86SelectAddress(C, FrameAddr))
+    return 0;
+
+  // Pick the LEA flavour matching the pointer width (and ILP32 when in use).
+  unsigned LeaOpc;
+  if (TLI.getPointerTy(DL) == MVT::i32)
+    LeaOpc = Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r;
+  else
+    LeaOpc = X86::LEA64r;
+
+  const TargetRegisterClass *PtrRC = TLI.getRegClassFor(TLI.getPointerTy(DL));
+  unsigned AddrReg = createResultReg(PtrRC);
+  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                         TII.get(LeaOpc), AddrReg),
+                 FrameAddr);
+  return AddrReg;
+}
+
+/// Materialize a floating-point zero of \p CF's type and return the register
+/// holding it. Returns 0 when the type is not legal or is neither f32 nor f64
+/// (f80 is not supported yet).
+unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
+  MVT VT;
+  if (!isTypeLegal(CF->getType(), VT))
+    return 0;
+
+  // Choose the zeroing opcode and result register class: the SSE pseudo when
+  // the type is kept in SSE registers, the x87 load-zero otherwise.
+  unsigned ZeroOpc = 0;
+  const TargetRegisterClass *ZeroRC = nullptr;
+  if (VT == MVT::f32) {
+    ZeroOpc = X86ScalarSSEf32 ? X86::FsFLD0SS : X86::LD_Fp032;
+    ZeroRC = X86ScalarSSEf32 ? &X86::FR32RegClass : &X86::RFP32RegClass;
+  } else if (VT == MVT::f64) {
+    ZeroOpc = X86ScalarSSEf64 ? X86::FsFLD0SD : X86::LD_Fp064;
+    ZeroRC = X86ScalarSSEf64 ? &X86::FR64RegClass : &X86::RFP64RegClass;
+  } else {
+    // Includes f80, which has no support yet.
+    return 0;
+  }
+
+  unsigned ZeroReg = createResultReg(ZeroRC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ZeroOpc), ZeroReg);
+  return ZeroReg;
+}
+
+
+/// Try to fold the load \p LI into operand \p OpNo of \p MI.
+///
+/// Returns false when no address mode can be selected for the load's pointer
+/// or when foldMemoryOperandImpl declines to fold. On success the folded
+/// instruction receives LI's memory operand, \p MI is erased, and true is
+/// returned.
+bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) {
+ const Value *Ptr = LI->getPointerOperand();
+ X86AddressMode AM;
+ if (!X86SelectAddress(Ptr, AM))
+ return false;
+
+ const X86InstrInfo &XII = (const X86InstrInfo &)TII;
+
+ unsigned Size = DL.getTypeAllocSize(LI->getType());
+ unsigned Alignment = LI->getAlignment();
+
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = DL.getABITypeAlignment(LI->getType());
+
+ // Expand the address mode into the machine operands the folder expects.
+ SmallVector<MachineOperand, 8> AddrOps;
+ AM.getFullAddress(AddrOps);
+
+ MachineInstr *Result = XII.foldMemoryOperandImpl(
+ *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
+ /*AllowCommute=*/true);
+ if (!Result)
+ return false;
+
+ // The index register could be in the wrong register class. Unfortunately,
+ // foldMemoryOperandImpl could have commuted the instruction so it's not
+ // enough to just look at OpNo + the offset to the index reg. We actually
+ // need to scan the instruction to find the index reg and see if it's the
+ // correct reg class.
+ unsigned OperandNo = 0;
+ for (MachineInstr::mop_iterator I = Result->operands_begin(),
+ E = Result->operands_end(); I != E; ++I, ++OperandNo) {
+ MachineOperand &MO = *I;
+ if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
+ continue;
+ // Found the index reg, now try to rewrite it.
+ unsigned IndexReg = constrainOperandRegClass(Result->getDesc(),
+ MO.getReg(), OperandNo);
+ if (IndexReg == MO.getReg())
+ continue;
+ MO.setReg(IndexReg);
+ }
+
+ Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
+ MI->eraseFromParent();
+ return true;
+}
+
+/// Emit the instruction \p MachineInstOpcode with four register operands and
+/// return the virtual register that holds its result.
+///
+/// \param RC register class of the result register.
+/// \param Op0..Op3 source registers; each OpNIsKill flag marks the
+///        corresponding use as a kill.
+///
+/// Every source register is first constrained to the register class required
+/// by its own operand slot. If the instruction has an explicit def, the fresh
+/// result register is used as that def. Otherwise the instruction is emitted
+/// without one and the result is copied out of its first implicit def.
+unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
+                                        const TargetRegisterClass *RC,
+                                        unsigned Op0, bool Op0IsKill,
+                                        unsigned Op1, bool Op1IsKill,
+                                        unsigned Op2, bool Op2IsKill,
+                                        unsigned Op3, bool Op3IsKill) {
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  unsigned ResultReg = createResultReg(RC);
+  // Source operands follow the defs, so operand N lives in slot NumDefs + N.
+  // Each register must be constrained against its own slot, including Op3.
+  Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+  Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+  Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
+  Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addReg(Op2, getKillRegState(Op2IsKill))
+        .addReg(Op3, getKillRegState(Op3IsKill));
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addReg(Op2, getKillRegState(Op2IsKill))
+        .addReg(Op3, getKillRegState(Op3IsKill));
+    // No explicit def: fetch the result from the first implicit def.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+
+namespace llvm {
+ /// Factory for the X86 fast instruction selector: returns a newly allocated
+ /// X86FastISel constructed from \p funcInfo and \p libInfo.
+ FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) {
+ return new X86FastISel(funcInfo, libInfo);
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
new file mode 100644
index 000000000000..8bde4bf98d66
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -0,0 +1,367 @@
+//===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines the pass that looks through the machine instructions
+/// late in the compilation, and finds byte or word instructions that
+/// can be profitably replaced with 32 bit instructions that give equivalent
+/// results for the bits of the results that are used. There are two possible
+/// reasons to do this.
+///
+/// One reason is to avoid false-dependences on the upper portions
+/// of the registers. Only instructions that have a destination register
+/// which is not in any of the source registers can be affected by this.
+/// Any instruction where one of the source registers is also the destination
+/// register is unaffected, because it has a true dependence on the source
+/// register already. So, this consideration primarily affects load
+/// instructions and register-to-register moves. It would
+/// seem like cmov(s) would also be affected, but because of the way cmov is
+/// really implemented by most machines as reading both the destination and
+/// source registers, and then "merging" the two based on a condition,
+/// it really already should be considered as having a true dependence on the
+/// destination register as well.
+///
+/// The other reason to do this is for potential code size savings. Word
+/// operations need an extra override byte compared to their 32 bit
+/// versions. So this can convert many word operations to their larger
+/// size, saving a byte in encoding. This could introduce partial register
+/// dependences where none existed however. As an example take:
+/// orw ax, $0x1000
+/// addw ax, $3
+/// now if this were to get transformed into
+/// orw ax, $0x1000
+/// addl eax, $3
+/// because the addl encodes shorter than the addw, this would introduce
+/// a use of a register that was only partially written earlier. On older
+/// Intel processors this can be quite a performance penalty, so this should
+/// probably only be done when it can be proven that a new partial dependence
+/// wouldn't be created, or when you know a newer processor is being
+/// targeted, or when optimizing for minimum code size.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
+#define FIXUPBW_NAME "x86-fixup-bw-insts"
+
+#define DEBUG_TYPE FIXUPBW_NAME
+
+// Option to allow this optimization pass to have fine-grained control.
+static cl::opt<bool>
+ FixupBWInsts("fixup-byte-word-insts",
+ cl::desc("Change byte and word instructions to larger sizes"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+/// Machine function pass that looks for byte or word loads and
+/// register-to-register copies and tries to replace them with 32 bit
+/// instructions.
+class FixupBWInstPass : public MachineFunctionPass {
+  /// Loop over all of the instructions in the basic block replacing applicable
+  /// byte or word instructions with better alternatives.
+  void processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+  /// This sets the \p SuperDestReg to the 32 bit super reg of the original
+  /// destination register of the MachineInstr passed in. It returns true if
+  /// that super register is dead just prior to \p OrigMI, and false if not.
+  bool getSuperRegDestIfDead(MachineInstr *OrigMI,
+                             unsigned &SuperDestReg) const;
+
+  /// Change the MachineInstr \p MI into the equivalent extending load to 32 bit
+  /// register if it is safe to do so. Return the replacement instruction if
+  /// OK, otherwise return nullptr.
+  MachineInstr *tryReplaceLoad(unsigned New32BitOpcode, MachineInstr *MI) const;
+
+  /// Change the MachineInstr \p MI into the equivalent 32-bit copy if it is
+  /// safe to do so. Return the replacement instruction if OK, otherwise return
+  /// nullptr.
+  MachineInstr *tryReplaceCopy(MachineInstr *MI) const;
+
+  /// Change the MachineInstr \p MI into an equivalent 32 bit instruction if
+  /// possible. Return the replacement instruction if OK, return nullptr
+  /// otherwise. Set \p WasCandidate to true or false depending on whether the
+  /// MI was a candidate for this sort of transformation.
+  MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB,
+                                bool &WasCandidate) const;
+public:
+  static char ID;
+
+  StringRef getPassName() const override { return FIXUPBW_DESC; }
+
+  FixupBWInstPass() : MachineFunctionPass(ID) {
+    initializeFixupBWInstPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
+                                       // guide some heuristics.
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// Loop over all of the basic blocks, replacing byte and word instructions by
+  /// equivalent 32 bit instructions where performance or code size can be
+  /// improved.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  /// The pass requires that no virtual registers remain in the function.
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  // The members below are assigned at the start of runOnMachineFunction; the
+  // default initializers only guarantee a well-defined state before that.
+
+  /// Function currently being processed.
+  MachineFunction *MF = nullptr;
+
+  /// Machine instruction info used throughout the class.
+  const X86InstrInfo *TII = nullptr;
+
+  /// Local member for function's OptForSize attribute.
+  bool OptForSize = false;
+
+  /// Machine loop info used for guiding some heuristics.
+  MachineLoopInfo *MLI = nullptr;
+
+  /// Register Liveness information after the current instruction.
+  LivePhysRegs LiveRegs;
+};
+char FixupBWInstPass::ID = 0;
+}
+
+INITIALIZE_PASS(FixupBWInstPass, FIXUPBW_NAME, FIXUPBW_DESC, false, false)
+
+/// Factory: allocate a new instance of the byte/word instruction fixup pass.
+FunctionPass *llvm::createX86FixupBWInsts() {
+  FunctionPass *Pass = new FixupBWInstPass();
+  return Pass;
+}
+
+/// Run the fixup over every basic block of \p MF.
+///
+/// Returns false without touching anything when the pass is switched off via
+/// the fixup-byte-word-insts option or when the function is to be skipped;
+/// otherwise caches the per-function state and returns true.
+bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
+  if (!FixupBWInsts)
+    return false;
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  // Cache the per-function state consulted by the helper methods.
+  this->MF = &MF;
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+  OptForSize = MF.getFunction()->optForSize();
+  MLI = &getAnalysis<MachineLoopInfo>();
+  LiveRegs.init(TII->getRegisterInfo());
+
+  DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
+
+  // Visit each basic block in turn.
+  for (MachineBasicBlock &Block : MF)
+    processBasicBlock(MF, Block);
+
+  DEBUG(dbgs() << "End X86FixupBWInsts\n";);
+
+  return true;
+}
+
+// TODO: This analysis is conservative and can miss legal cases: the
+// super-register may be live into the address expression of the
+// instruction's memory reference and still be killed (last used) by that
+// same instruction. The existing query interfaces don't seem to make that
+// easy to check.
+//
+// Ideally we would ask whether, after OrigMI, the only live portion of
+// SuperDestReg is the portion that OrigMI wrote.
+bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
+                                            unsigned &SuperDestReg) const {
+  const unsigned DestReg = OrigMI->getOperand(0).getReg();
+  SuperDestReg = getX86SubSuperRegister(DestReg, 32);
+
+  const auto &RegInfo = TII->getRegisterInfo();
+  const auto DestSubIdx = RegInfo.getSubRegIndex(SuperDestReg, DestReg);
+
+  // The destination has to be the lowest-order sub-register of the
+  // super-register. A high-byte destination is rejected: the register isn't
+  // really dead even if the super-register is considered dead.
+  const bool IsHighByte = DestSubIdx == X86::sub_8bit_hi;
+  if (IsHighByte || LiveRegs.contains(SuperDestReg))
+    return false;
+
+  // Anything other than a low-byte destination needs no further checks.
+  if (DestSubIdx != X86::sub_8bit)
+    return true;
+
+  // For a low-byte destination the upper byte register must be dead too;
+  // that is considered independent of whether the super-register is dead.
+  const unsigned HighByteReg =
+      getX86SubSuperRegister(SuperDestReg, 8, /*High=*/true);
+  return !LiveRegs.contains(HighByteReg);
+}
+
+/// Build (without inserting) a replacement for the load \p MI that uses
+/// \p New32BitOpcode and writes the 32 bit super-register of the original
+/// destination. Returns nullptr when getSuperRegDestIfDead reports that the
+/// rewrite is not safe.
+MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
+                                              MachineInstr *MI) const {
+  // Widening is only safe when every part of the 32 bit super-register other
+  // than the original destination is dead; getSuperRegDestIfDead checks that.
+  unsigned WideDestReg;
+  if (!getSuperRegDestIfDead(MI, WideDestReg))
+    return nullptr;
+
+  MachineInstrBuilder Builder =
+      BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), WideDestReg);
+
+  // Carry over every operand after the destination unchanged.
+  for (unsigned Idx = 1, End = MI->getNumOperands(); Idx < End; ++Idx)
+    Builder.addOperand(MI->getOperand(Idx));
+
+  Builder->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  return Builder;
+}
+
+/// Build (without inserting) a MOV32rr that replaces the narrow
+/// register-to-register copy \p MI. Returns nullptr when the destination's
+/// super-register is not dead or when source and destination do not sit at
+/// the same sub-register index of their 32 bit super-registers.
+MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
+  assert(MI->getNumExplicitOperands() == 2);
+  auto &Dst = MI->getOperand(0);
+  auto &Src = MI->getOperand(1);
+
+  unsigned WideDstReg;
+  if (!getSuperRegDestIfDead(MI, WideDstReg))
+    return nullptr;
+
+  const unsigned WideSrcReg = getX86SubSuperRegister(Src.getReg(), 32);
+
+  // Both sides must use the same sub-register index; otherwise something
+  // like "movb %ah, %al" could be turned into "movl %eax, %eax".
+  const auto &RegInfo = TII->getRegisterInfo();
+  const auto SrcSubIdx = RegInfo.getSubRegIndex(WideSrcReg, Src.getReg());
+  const auto DstSubIdx = RegInfo.getSubRegIndex(WideDstReg, Dst.getReg());
+  if (SrcSubIdx != DstSubIdx)
+    return nullptr;
+
+  // No source flags are copied because we don't know whether the
+  // super-register is killed as well. The super-register might not even be
+  // defined, so it is read as Undef (we don't care about the high bits) and
+  // the original sub-register is added as an implicit use.
+  MachineInstrBuilder Builder =
+      BuildMI(*MF, MI->getDebugLoc(), TII->get(X86::MOV32rr), WideDstReg);
+  Builder.addReg(WideSrcReg, RegState::Undef);
+  Builder.addReg(Src.getReg(), RegState::Implicit);
+
+  // Keep the old implicit operands except those made redundant by the new
+  // explicit def/use.
+  for (auto &Op : MI->implicit_operands()) {
+    const unsigned RedundantReg = Op.isDef() ? WideDstReg : WideSrcReg;
+    if (Op.getReg() == RedundantReg)
+      continue;
+    Builder.addOperand(Op);
+  }
+
+  return Builder;
+}
+
+/// Dispatch on the opcode of \p MI and try to build a 32 bit replacement.
+///
+/// \p WasCandidate is set to true when \p MI was of a kind this pass
+/// considers (even if no replacement could be built) and to false otherwise.
+/// Returns the replacement instruction or nullptr.
+MachineInstr *FixupBWInstPass::tryReplaceInstr(
+    MachineInstr *MI, MachineBasicBlock &MBB,
+    bool &WasCandidate) const {
+  WasCandidate = false;
+
+  const unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case X86::MOV8rm: {
+    // 8 bit loads are only widened inside an innermost loop and when not
+    // optimizing for size: the zero extending form takes an extra byte to
+    // encode and provides limited performance upside.
+    MachineLoop *Loop = MLI->getLoopFor(&MBB);
+    if (Loop && Loop->begin() == Loop->end() && !OptForSize) {
+      WasCandidate = true;
+      return tryReplaceLoad(X86::MOVZX32rm8, MI);
+    }
+    return nullptr;
+  }
+
+  case X86::MOV16rm:
+    // 16 bit loads are always candidates: code size is the same and there is
+    // sometimes a perf advantage from eliminating a false dependence on the
+    // upper portion of the register.
+    WasCandidate = true;
+    return tryReplaceLoad(X86::MOVZX32rm16, MI);
+
+  case X86::MOV8rr:
+  case X86::MOV16rr:
+    // 8/16 bit copies are always candidates: code size is either less (16)
+    // or equal (8), and there is sometimes a perf advantage from eliminating
+    // a false dependence on the upper portion of the register.
+    WasCandidate = true;
+    return tryReplaceCopy(MI);
+
+  default:
+    // Not an instruction this pass handles.
+    return nullptr;
+  }
+}
+
+/// Analyze \p MBB bottom-up, collecting replacement instructions, and only
+/// afterwards splice the replacements into the block.
+void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) {
+
+  // Originals are deliberately left in place while the block is analyzed.
+  // That keeps the register liveness information unchanged, which makes the
+  // analysis better than if replaced instructions were removed immediately.
+  //
+  // Each candidate is recorded together with its replacement (possibly
+  // nullptr) until the whole block has been analyzed. This keeps the
+  // replacement instructions from making it seem as if the larger register
+  // might be live.
+  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 8> Pending;
+
+  // Liveness is computed from the end of the block so it can be updated for
+  // each instruction in turn.
+  LiveRegs.clear();
+  // NOTE(review): the original comment says this pass runs after PEI and so
+  // needs "AddPristinesAndCSRs"; the code itself only calls addLiveOuts.
+  LiveRegs.addLiveOuts(MBB);
+
+  for (auto It = MBB.rbegin(); It != MBB.rend(); ++It) {
+    MachineInstr *Cur = &*It;
+
+    bool IsCandidate = false;
+    MachineInstr *Replacement = tryReplaceInstr(Cur, MBB, IsCandidate);
+
+    // Candidates are recorded even when no replacement was built; null
+    // entries are skipped in the splice loop.
+    if (IsCandidate)
+      Pending.emplace_back(Cur, Replacement);
+
+    // Step liveness past this instruction before looking at the previous one.
+    LiveRegs.stepBackward(*Cur);
+  }
+
+  // Splice in the replacements, taking entries from the back of the list.
+  while (!Pending.empty()) {
+    const std::pair<MachineInstr *, MachineInstr *> Entry =
+        Pending.pop_back_val();
+    if (!Entry.second)
+      continue;
+    MBB.insert(Entry.first, Entry.second);
+    MBB.erase(Entry.first);
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
new file mode 100644
index 000000000000..12095917ca30
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -0,0 +1,418 @@
+//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that finds instructions that can be
+// re-written as LEA instructions in order to reduce pipeline delays.
+// When optimizing for size it replaces suitable LEAs with INC or DEC.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-LEAs"
+
+STATISTIC(NumLEAs, "Number of LEA instructions created");
+
+namespace {
+/// Machine function pass that rewrites instructions to or from LEA: it can
+/// turn the instruction defining an address register into an LEA, turn an LEA
+/// into ADD instructions on Silvermont, and turn simple +1/-1 LEAs into
+/// INC/DEC.
+class FixupLEAPass : public MachineFunctionPass {
+  enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+  static char ID;
+  /// \brief Loop over all of the instructions in the basic block
+  /// replacing applicable instructions with LEA instructions,
+  /// where appropriate.
+  bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+
+  StringRef getPassName() const override { return "X86 LEA Fixup"; }
+
+  /// \brief Given a machine register, look for the instruction
+  /// which writes it in the current basic block. If found,
+  /// try to replace it with an equivalent LEA instruction.
+  /// If replacement succeeds, then also process the newly created
+  /// instruction.
+  void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
+                    MachineFunction::iterator MFI);
+
+  /// \brief Given a memory access or LEA instruction
+  /// whose address mode uses a base and/or index register, look for
+  /// an opportunity to replace the instruction which sets the base or index
+  /// register with an equivalent LEA instruction.
+  void processInstruction(MachineBasicBlock::iterator &I,
+                          MachineFunction::iterator MFI);
+
+  /// \brief Given a LEA instruction which is unprofitable
+  /// on Silvermont try to replace it with an equivalent ADD instruction
+  void processInstructionForSLM(MachineBasicBlock::iterator &I,
+                                MachineFunction::iterator MFI);
+
+  /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
+  /// and convert them to INC or DEC respectively.
+  bool fixupIncDec(MachineBasicBlock::iterator &I,
+                   MachineFunction::iterator MFI) const;
+
+  /// \brief Determine if an instruction references a machine register
+  /// and, if so, whether it reads or writes the register.
+  RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
+
+  /// \brief Step backwards through a basic block, looking
+  /// for an instruction which writes a register within
+  /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
+  MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
+                                              MachineBasicBlock::iterator &I,
+                                              MachineFunction::iterator MFI);
+
+  /// \brief if an instruction can be converted to an
+  /// equivalent LEA, insert the new instruction into the basic block
+  /// and return a pointer to it. Otherwise, return zero.
+  MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
+                                   MachineBasicBlock::iterator &MBBI) const;
+
+public:
+  FixupLEAPass() : MachineFunctionPass(ID) {}
+
+  /// \brief Loop over all of the basic blocks,
+  /// replacing instructions by equivalent LEA instructions
+  /// if needed and when possible.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  // This pass runs after regalloc and doesn't support VReg operands.
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  // All members are assigned in runOnMachineFunction; the default
+  // initializers only guarantee a well-defined state before that.
+  MachineFunction *MF = nullptr;     // Function currently being processed.
+  const X86InstrInfo *TII = nullptr; // Machine instruction info.
+  bool OptIncDec = false;            // Enables the LEA -> INC/DEC rewrite.
+  bool OptLEA = false;               // Enables the LEA-related rewrites.
+};
+char FixupLEAPass::ID = 0;
+}
+
+/// Try to produce an LEA equivalent to the instruction at \p MBBI.
+///
+/// 32/64 bit register moves are rewritten directly into an LEA that is
+/// inserted before \p MBBI. A set of ADD forms is handed to
+/// convertToThreeAddress after checking the preconditions that call relies
+/// on. Every other opcode goes straight to convertToThreeAddress. Returns the
+/// new instruction, or nullptr when no conversion is possible.
+MachineInstr *
+FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
+                                 MachineBasicBlock::iterator &MBBI) const {
+  MachineInstr &MI = *MBBI;
+  const unsigned Opc = MI.getOpcode();
+
+  // Register-to-register moves: build the LEA by hand.
+  if (Opc == X86::MOV32rr || Opc == X86::MOV64rr) {
+    const unsigned LEAOpc = Opc == X86::MOV32rr ? X86::LEA32r : X86::LEA64r;
+    const MachineOperand &Dest = MI.getOperand(0);
+    const MachineOperand &Src = MI.getOperand(1);
+    MachineInstr *NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(LEAOpc))
+                              .addOperand(Dest)
+                              .addOperand(Src)
+                              .addImm(1)
+                              .addReg(0)
+                              .addImm(0)
+                              .addReg(0);
+    MFI->insert(MBBI, NewMI); // Insert the new inst
+    return NewMI;
+  }
+
+  switch (Opc) {
+  case X86::ADD64ri32:
+  case X86::ADD64ri8:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8_DB:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri_DB:
+  case X86::ADD32ri8_DB:
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    // convertToThreeAddress will call getImm(), which requires isImm() to
+    // be true.
+    if (!MI.getOperand(2).isImm())
+      return nullptr;
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+    // If src1 != src2, convertToThreeAddress would need to create a virtual
+    // register, which we cannot do after register allocation.
+    if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg())
+      return nullptr;
+    break;
+  default:
+    break;
+  }
+  return TII->convertToThreeAddress(MFI, MI, nullptr);
+}
+
+/// Factory: allocate a new instance of the LEA fixup pass.
+FunctionPass *llvm::createX86FixupLEAs() {
+  FunctionPass *Pass = new FixupLEAPass();
+  return Pass;
+}
+
+/// Run the pass on \p Func.
+///
+/// Derives from the subtarget which rewrites are enabled (INC/DEC
+/// substitution and/or LEA fixups), bails out with false when none is, and
+/// otherwise processes every basic block and returns true.
+bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
+  if (skipFunction(*Func.getFunction()))
+    return false;
+
+  MF = &Func;
+  const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
+  OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
+  OptLEA = ST.LEAusesAG() || ST.slowLEA();
+
+  // Neither rewrite applies to this subtarget/function.
+  if (!(OptLEA || OptIncDec))
+    return false;
+
+  TII = ST.getInstrInfo();
+
+  DEBUG(dbgs() << "Start X86FixupLEAs\n";);
+  // Visit each basic block in turn.
+  MachineFunction::iterator BlockIt = Func.begin();
+  const MachineFunction::iterator BlockEnd = Func.end();
+  for (; BlockIt != BlockEnd; ++BlockIt)
+    processBasicBlock(Func, BlockIt);
+  DEBUG(dbgs() << "End X86FixupLEAs\n";);
+
+  return true;
+}
+
+/// Classify how the instruction at \p I touches the register named by \p p.
+///
+/// Returns RU_Write as soon as an operand defining that register is found,
+/// RU_Read if the register appears only as non-def operands, and RU_NotUsed
+/// if it does not appear at all.
+FixupLEAPass::RegUsageState
+FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
+  MachineInstr &MI = *I;
+  RegUsageState Usage = RU_NotUsed;
+
+  for (MachineOperand &Opnd : MI.operands()) {
+    if (!Opnd.isReg() || Opnd.getReg() != p.getReg())
+      continue;
+    // A def wins over any read seen so far.
+    if (Opnd.isDef())
+      return RU_Write;
+    Usage = RU_Read;
+  }
+  return Usage;
+}
+
+/// getPreviousInstr - Move \p I to the instruction preceding it in the block
+/// \p MFI. When \p I is already at the start of the block, wrap around to the
+/// block's last instruction if the block is its own predecessor (it branches
+/// to itself). Returns true when \p I was moved and false when there is no
+/// previous instruction to move to.
+static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
+                                    MachineFunction::iterator MFI) {
+  // Common case: just step back inside the block.
+  if (I != MFI->begin()) {
+    --I;
+    return true;
+  }
+
+  // At the top of the block: only a self-loop lets us continue.
+  if (!MFI->isPredecessor(&*MFI))
+    return false;
+
+  I = --MFI->end();
+  return true;
+}
+
+/// Walk backwards from \p I looking for the instruction that writes the
+/// register named by \p p.
+///
+/// The walk stops at a call or inline asm, when it arrives back at \p I, when
+/// no previous instruction exists, or once the accumulated instruction
+/// latency exceeds INSTR_DISTANCE_THRESHOLD. Returns an iterator to the
+/// writing instruction, or a default-constructed iterator if none was found.
+MachineBasicBlock::iterator
+FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
+                              MachineFunction::iterator MFI) {
+  static const int INSTR_DISTANCE_THRESHOLD = 5;
+  int Distance = 1;
+  MachineBasicBlock::iterator Cur = I;
+
+  for (bool Found = getPreviousInstr(Cur, MFI); Found && I != Cur;
+       Found = getPreviousInstr(Cur, MFI)) {
+    if (Cur->isCall() || Cur->isInlineAsm())
+      break;
+    // Too far back to make a difference.
+    if (Distance > INSTR_DISTANCE_THRESHOLD)
+      break;
+    if (usesRegister(p, Cur) == RU_Write)
+      return Cur;
+    Distance += TII->getInstrLatency(
+        MF->getSubtarget().getInstrItineraryData(), *Cur);
+  }
+  return MachineBasicBlock::iterator();
+}
+
+/// Returns true if \p opcode is one of the LEA opcodes handled by this pass
+/// (LEA16r, LEA32r, LEA64r or LEA64_32r).
+static inline bool isLEA(const int opcode) {
+  switch (opcode) {
+  case X86::LEA16r:
+  case X86::LEA32r:
+  case X86::LEA64r:
+  case X86::LEA64_32r:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isLEASimpleIncOrDec - Returns true when \p LEA has one of these forms:
+/// lea %reg, 1(%reg)
+/// lea %reg, -1(%reg)
+/// i.e. base register equal to the destination, no index register, no
+/// segment register, and an immediate displacement of exactly 1 or -1.
+static inline bool isLEASimpleIncOrDec(MachineInstr &LEA) {
+  const unsigned BaseReg = LEA.getOperand(1 + X86::AddrBaseReg).getReg();
+  const unsigned DestReg = LEA.getOperand(0).getReg();
+  if (BaseReg != DestReg)
+    return false;
+
+  if (LEA.getOperand(1 + X86::AddrIndexReg).getReg() != 0)
+    return false;
+  if (LEA.getOperand(1 + X86::AddrSegmentReg).getReg() != 0)
+    return false;
+
+  const MachineOperand &Disp = LEA.getOperand(1 + X86::AddrDisp);
+  if (!Disp.isImm())
+    return false;
+
+  const auto Imm = Disp.getImm();
+  return Imm == 1 || Imm == -1;
+}
+
+/// Replace an LEA of the form "lea %reg, 1(%reg)" / "lea %reg, -1(%reg)" with
+/// the matching INC / DEC instruction.
+///
+/// The rewrite only happens when the instruction at \p I is such an LEA and
+/// isSafeToClobberEFLAGS reports that EFLAGS may be clobbered at that point.
+/// On success the LEA is erased from the block \p MFI, \p I is updated to
+/// refer to the new instruction and true is returned; otherwise nothing is
+/// changed and false is returned.
+bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
+                               MachineFunction::iterator MFI) const {
+  MachineInstr &MI = *I;
+  int Opcode = MI.getOpcode();
+  if (!isLEA(Opcode))
+    return false;
+
+  if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) {
+    int NewOpcode;
+    // The displacement is known to be an immediate of 1 or -1 here; use the
+    // same named operand index as isLEASimpleIncOrDec.
+    bool isINC = MI.getOperand(1 + X86::AddrDisp).getImm() == 1;
+    switch (Opcode) {
+    default:
+      // isLEA() admits only the opcodes listed below; make that explicit so
+      // NewOpcode can never be read uninitialized.
+      llvm_unreachable("Unexpected LEA instruction");
+    case X86::LEA16r:
+      NewOpcode = isINC ? X86::INC16r : X86::DEC16r;
+      break;
+    case X86::LEA32r:
+    case X86::LEA64_32r:
+      NewOpcode = isINC ? X86::INC32r : X86::DEC32r;
+      break;
+    case X86::LEA64r:
+      NewOpcode = isINC ? X86::INC64r : X86::DEC64r;
+      break;
+    }
+
+    // Operand 0 is the destination and operand 1 the base register, which
+    // isLEASimpleIncOrDec has verified to be the same register.
+    MachineInstr *NewMI =
+        BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode))
+            .addOperand(MI.getOperand(0))
+            .addOperand(MI.getOperand(1));
+    MFI->erase(I);
+    I = static_cast<MachineBasicBlock::iterator>(NewMI);
+    return true;
+  }
+  return false;
+}
+
+/// For an instruction with a memory operand, inspect the base and index
+/// registers of its address and, for each that is a register other than ESP,
+/// look for an LEA fixup of the instruction that defines it.
+void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
+                                      MachineFunction::iterator MFI) {
+  MachineInstr &MI = *I;
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  // A negative operand number means there is nothing to look at.
+  const int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags);
+  if (MemOpNo < 0)
+    return;
+  const int AddrStart = MemOpNo + X86II::getOperandBias(Desc);
+
+  // Base register first, then index register.
+  const int AddrFields[] = {X86::AddrBaseReg, X86::AddrIndexReg};
+  for (const int Field : AddrFields) {
+    MachineOperand &AddrOp = MI.getOperand(AddrStart + Field);
+    if (AddrOp.isReg() && AddrOp.getReg() != X86::ESP)
+      seekLEAFixup(AddrOp, I, MFI);
+  }
+}
+
+/// Search backwards from \p I for the instruction writing the register \p p
+/// and, if it can be converted, replace it with an equivalent LEA. The newly
+/// created LEA is then processed in turn.
+void FixupLEAPass::seekLEAFixup(MachineOperand &p,
+                                MachineBasicBlock::iterator &I,
+                                MachineFunction::iterator MFI) {
+  MachineBasicBlock::iterator DefIt = searchBackwards(p, I, MFI);
+  // A default-constructed iterator means no writer was found.
+  if (DefIt == MachineBasicBlock::iterator())
+    return;
+
+  MachineInstr *NewMI = postRAConvertToLEA(MFI, DefIt);
+  if (!NewMI)
+    return;
+
+  ++NumLEAs;
+  DEBUG(dbgs() << "FixLEA: Candidate to replace:"; DefIt->dump(););
+  // now to replace with an equivalent LEA...
+  DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
+  MFI->erase(DefIt);
+
+  // The new LEA has address operands of its own; give them the same
+  // treatment.
+  MachineBasicBlock::iterator NewIt =
+      static_cast<MachineBasicBlock::iterator>(NewMI);
+  processInstruction(NewIt, MFI);
+}
+
+/// Try to replace the LEA at \p I with one or two ADD instructions.
+///
+/// The rewrite is attempted only when the LEA has no segment register, an
+/// immediate displacement, a scale of at most 1, when EFLAGS may be clobbered
+/// here, and when the destination is also the base or the index register.
+/// If anything was emitted, the LEA is erased and \p I is updated to refer to
+/// the last instruction created.
+void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
+                                            MachineFunction::iterator MFI) {
+  MachineInstr &MI = *I;
+  const int opcode = MI.getOpcode();
+  if (!isLEA(opcode))
+    return;
+
+  // Operand layout used below: 0 = dest, 1 = base, 2 = scale, 3 = index,
+  // 4 = displacement, 5 = segment.
+  if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
+      !TII->isSafeToClobberEFLAGS(*MFI, I))
+    return;
+
+  const unsigned DstR = MI.getOperand(0).getReg();
+  const unsigned SrcR1 = MI.getOperand(1).getReg();
+  const unsigned SrcR2 = MI.getOperand(3).getReg();
+
+  // One of the two sources has to be the destination register itself.
+  const bool BaseIsDst = SrcR1 != 0 && SrcR1 == DstR;
+  const bool IndexIsDst = SrcR2 != 0 && SrcR2 == DstR;
+  if (!BaseIsDst && !IndexIsDst)
+    return;
+
+  if (MI.getOperand(2).getImm() > 1)
+    return;
+
+  int AddRROpc, AddRIOpc;
+  switch (opcode) {
+  case X86::LEA16r:
+    AddRROpc = X86::ADD16rr;
+    AddRIOpc = X86::ADD16ri;
+    break;
+  case X86::LEA32r:
+    AddRROpc = X86::ADD32rr;
+    AddRIOpc = X86::ADD32ri;
+    break;
+  case X86::LEA64_32r:
+  case X86::LEA64r:
+    AddRROpc = X86::ADD64rr;
+    AddRIOpc = X86::ADD64ri32;
+    break;
+  default:
+    llvm_unreachable("Unexpected LEA instruction");
+  }
+
+  DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
+  DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+
+  // Operand index of the source matching the destination, and of the other.
+  const unsigned SameAsDstIdx = SrcR1 == DstR ? 1 : 3;
+  const unsigned OtherSrcIdx = SrcR1 == DstR ? 3 : 1;
+
+  MachineInstr *NewMI = nullptr;
+  const MachineOperand &Dst = MI.getOperand(0);
+
+  // Register + register part: ADD of the two sources into the destination.
+  if (SrcR1 != 0 && SrcR2 != 0) {
+    const MachineOperand &Src1 = MI.getOperand(SameAsDstIdx);
+    const MachineOperand &Src2 = MI.getOperand(OtherSrcIdx);
+    NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(AddRROpc))
+                .addOperand(Dst)
+                .addOperand(Src1)
+                .addOperand(Src2);
+    MFI->insert(I, NewMI);
+    DEBUG(NewMI->dump(););
+  }
+
+  // Displacement part: ADD of the immediate when it is non-zero.
+  if (MI.getOperand(4).getImm() != 0) {
+    const MachineOperand &SrcR = MI.getOperand(SameAsDstIdx);
+    NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(AddRIOpc))
+                .addOperand(Dst)
+                .addOperand(SrcR)
+                .addImm(MI.getOperand(4).getImm());
+    MFI->insert(I, NewMI);
+    DEBUG(NewMI->dump(););
+  }
+
+  // Nothing emitted means the LEA stays as it is.
+  if (!NewMI)
+    return;
+  MFI->erase(I);
+  I = static_cast<MachineBasicBlock::iterator>(NewMI);
+}
+
+/// Apply the enabled rewrites to every instruction of the block \p MFI.
+/// Always returns false.
+bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
+                                     MachineFunction::iterator MFI) {
+
+  for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+    // An instruction turned into INC/DEC needs no further processing.
+    if (OptIncDec && fixupIncDec(I, MFI))
+      continue;
+
+    if (!OptLEA)
+      continue;
+
+    // Silvermont gets its own LEA handling.
+    if (MF.getSubtarget<X86Subtarget>().isSLM())
+      processInstructionForSLM(I, MFI);
+    else
+      processInstruction(I, MFI);
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp
new file mode 100644
index 000000000000..a86eb997635e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -0,0 +1,187 @@
+//===---- X86FixupSetCC.cpp - fix zero-extension of setcc patterns --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that fixes zero-extension of setcc patterns.
+// X86 setcc instructions are modeled to have no input arguments, and a single
+// GR8 output argument. This is consistent with other similar instructions
+// (e.g. movb), but means it is impossible to directly generate a setcc into
+// the lower GR8 of a specified GR32.
+// This means that ISel must select (zext (setcc)) into something like
+// seta %al; movzbl %al, %eax.
+// Unfortunately, this can cause a stall due to the partial register write
+// performed by the setcc. Instead, we can use:
+// xor %eax, %eax; seta %al
+// This both avoids the stall, and encodes shorter.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-setcc"
+
+STATISTIC(NumSubstZexts, "Number of setcc + zext pairs substituted");
+
+namespace {
+/// Machine function pass that rewrites a setcc whose result is zero-extended
+/// (setcc + movzx) into a zeroed 32 bit register with the setcc result
+/// inserted into its low byte.
+class X86FixupSetCCPass : public MachineFunctionPass {
+public:
+  X86FixupSetCCPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "X86 Fixup SetCC"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  // Find the preceding instruction that imp-defs eflags.
+  MachineInstr *findFlagsImpDef(MachineBasicBlock *MBB,
+                                MachineBasicBlock::reverse_iterator MI);
+
+  // Return true if MI imp-uses eflags.
+  bool impUsesFlags(MachineInstr *MI);
+
+  // Return true if this is the opcode of a SetCC instruction with a register
+  // output.
+  bool isSetCCr(unsigned Opcode);
+
+  // Both pointers are assigned at the start of runOnMachineFunction; the
+  // default initializers only guarantee a well-defined state before that.
+  MachineRegisterInfo *MRI = nullptr;
+  const X86InstrInfo *TII = nullptr;
+
+  // Maximum number of instructions findFlagsImpDef examines.
+  enum { SearchBound = 16 };
+
+  static char ID;
+};
+
+char X86FixupSetCCPass::ID = 0;
+}
+
+/// Factory: allocate a new instance of the setcc fixup pass.
+FunctionPass *llvm::createX86FixupSetCC() {
+  FunctionPass *Pass = new X86FixupSetCCPass();
+  return Pass;
+}
+
+/// Returns true if \p Opcode is one of the sixteen register-output SETcc
+/// opcodes listed below, false for anything else.
+bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::SETOr:
+  case X86::SETNOr:
+  case X86::SETBr:
+  case X86::SETAEr:
+  case X86::SETEr:
+  case X86::SETNEr:
+  case X86::SETBEr:
+  case X86::SETAr:
+  case X86::SETSr:
+  case X86::SETNSr:
+  case X86::SETPr:
+  case X86::SETNPr:
+  case X86::SETLr:
+  case X86::SETGEr:
+  case X86::SETLEr:
+  case X86::SETGr:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+
+// The instruction *immediately* before the setcc is expected to imp-def
+// EFLAGS (because of scheduling glue). To be less brittle w.r.t. scheduling,
+// keep looking backwards until the beginning of the basic block is reached
+// or SearchBound instructions have been examined (the bound avoids quadratic
+// behavior). Returns the first instruction found that implicitly defines
+// EFLAGS, or nullptr.
+MachineInstr *
+X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
+                                   MachineBasicBlock::reverse_iterator MI) {
+  // FIXME: Should this be instr_rend(), and MI be reverse_instr_iterator?
+  const auto BlockStart = MBB->rend();
+  for (int Budget = SearchBound; Budget > 0 && MI != BlockStart;
+       --Budget, ++MI) {
+    for (auto &Op : MI->implicit_operands()) {
+      const bool DefsFlags = Op.getReg() == X86::EFLAGS && Op.isDef();
+      if (DefsFlags)
+        return &*MI;
+    }
+  }
+
+  return nullptr;
+}
+
+/// Returns true if any implicit operand of \p MI is a use of EFLAGS.
+bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) {
+  for (auto &Op : MI->implicit_operands()) {
+    const bool ReadsFlags = Op.getReg() == X86::EFLAGS && Op.isUse();
+    if (ReadsFlags)
+      return true;
+  }
+  return false;
+}
+
+/// For every setcc whose result feeds a MOVZX32rr8, zero a fresh 32 bit
+/// register ahead of the EFLAGS-defining instruction, insert the setcc result
+/// into its low byte right before the zext, redirect the zext's users to that
+/// value and finally erase the zext. Returns true if anything was rewritten.
+bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  MRI = &MF.getRegInfo();
+  TII = ST.getInstrInfo();
+
+  // On 32-bit, we need to be careful to force an ABCD register.
+  const TargetRegisterClass *RC =
+      ST.is64Bit() ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass;
+
+  // Zero-extends made redundant; erased once all blocks have been visited.
+  SmallVector<MachineInstr*, 4> ToErase;
+
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      // Only setcc instructions with a register output are of interest.
+      if (!isSetCCr(MI.getOpcode()))
+        continue;
+
+      // Look for a zext among the users of the setcc result. It doesn't have
+      // to be the only use; the transformation is safe regardless.
+      MachineInstr *ZExt = nullptr;
+      for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg())) {
+        if (Use.getOpcode() == X86::MOVZX32rr8)
+          ZExt = &Use;
+      }
+      if (!ZExt)
+        continue;
+
+      // The zeroing instruction clobbers eflags, so it is placed directly
+      // before the instruction that imp-defs eflags for this setcc. That
+      // can't hurt anything after the flags def, because it clobbers eflags
+      // itself. If the flags def also *uses* eflags, the transformation is
+      // invalid.
+      MachineInstr *FlagsDefMI = findFlagsImpDef(
+          MI.getParent(), MachineBasicBlock::reverse_iterator(&MI));
+      if (!FlagsDefMI || impUsesFlags(FlagsDefMI))
+        continue;
+
+      ++NumSubstZexts;
+
+      unsigned ZeroReg = MRI->createVirtualRegister(RC);
+      unsigned InsertReg = MRI->createVirtualRegister(RC);
+
+      // Initialize a register with 0. This must go before the eflags def.
+      BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
+              ZeroReg);
+
+      // X86 setcc only takes an output GR8, so fake a GR32 input by
+      // inserting the setcc result into the low byte of the zeroed register.
+      MachineBasicBlock &ZExtBlock = *ZExt->getParent();
+      BuildMI(ZExtBlock, ZExt, ZExt->getDebugLoc(),
+              TII->get(X86::INSERT_SUBREG), InsertReg)
+          .addReg(ZeroReg)
+          .addReg(MI.getOperand(0).getReg())
+          .addImm(X86::sub_8bit);
+      MRI->replaceRegWith(ZExt->getOperand(0).getReg(), InsertReg);
+      ToErase.push_back(ZExt);
+    }
+  }
+
+  for (MachineInstr *Dead : ToErase)
+    Dead->eraseFromParent();
+
+  // Every rewrite queued exactly one zext for erasure.
+  return !ToErase.empty();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 000000000000..a5489b9aa8b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1696 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// pseudo registers into register stack instructions. This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// The x87 hardware tracks liveness of the stack registers, so it is necessary
+// to implement exact liveness tracking between basic blocks. The CFG edges are
+// partitioned into bundles where the same FP registers must be live in
+// identical stack positions. Instructions are inserted at the end of each basic
+// block to rearrange the live registers to match the outgoing bundle.
+//
+// This approach avoids splitting critical edges at the potential cost of more
+// live register shuffling instructions when critical edges are present.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <bitset>
+using namespace llvm;
+
+// Category name for this file's DEBUG() output and STATISTIC counters
+// (per LLVM convention; the consuming macros are defined elsewhere).
+#define DEBUG_TYPE "x86-codegen"
+
+// NumFXCH is bumped in moveToTop() for each XCH_F emitted; NumFP is bumped in
+// processBasicBlock() for each FP instruction it handles.
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP , "Number of floating point instructions");
+
+namespace {
+ const unsigned ScratchFPReg = 7;
+
+ struct FPS : public MachineFunctionPass {
+ static char ID;
+ FPS() : MachineFunctionPass(ID) {
+   // Initialize the EdgeBundles pass in the global pass registry; this pass
+   // lists it as required in getAnalysisUsage().
+   PassRegistry &Registry = *PassRegistry::getPassRegistry();
+   initializeEdgeBundlesPass(Registry);
+
+   // Zero both tables up front. This is really only to keep valgrind quiet:
+   // the logic in isLive() is too much for it.
+   memset(RegMap, 0, sizeof(RegMap));
+   memset(Stack, 0, sizeof(Stack));
+ }
+
+ // Declare this pass's analysis dependencies: it requires EdgeBundles, keeps
+ // the CFG intact, and preserves MachineLoopInfo and MachineDominators.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<EdgeBundles>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ // Chain to the base class implementation.
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+   // The only property this pass demands of its input is NoVRegs.
+   MachineFunctionProperties Required;
+   Required.set(MachineFunctionProperties::Property::NoVRegs);
+   return Required;
+ }
+
+ // Human-readable name of this pass.
+ StringRef getPassName() const override {
+   return "X86 FP Stackifier";
+ }
+
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+
+ // Two CFG edges are related if they leave the same block, or enter the same
+ // block. The transitive closure of an edge under this relation is a
+ // LiveBundle. It represents a set of CFG edges where the live FP stack
+ // registers must be allocated identically in the x87 stack.
+ //
+ // A LiveBundle is usually all the edges leaving a block, or all the edges
+ // entering a block, but it can contain more edges if critical edges are
+ // present.
+ //
+ // The set of live FP registers in a LiveBundle is calculated by bundleCFG,
+ // but the exact mapping of FP registers to stack slots is fixed later.
+ struct LiveBundle {
+   // One bit per live FP register: bit 0 stands for FP0, bit 1 for FP1, and
+   // so on.
+   unsigned Mask = 0;
+
+   // How many entries of FixStack have been assigned. Stays 0 until a stack
+   // order has been chosen for this bundle.
+   unsigned FixCount = 0;
+
+   // The chosen stack order of the live-in registers:
+   // FixStack[i] == getStackEntry(i) for all i < FixCount.
+   unsigned char FixStack[8];
+
+   // FixStack is deliberately left uninitialized; only the first FixCount
+   // entries are meaningful.
+   LiveBundle() {}
+
+   // A bundle counts as fixed when it has no live registers at all, or when
+   // a stack order has already been recorded for them.
+   bool isFixed() const { return Mask == 0 || FixCount != 0; }
+ };
+
+ // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges
+ // with no live FP registers.
+ SmallVector<LiveBundle, 8> LiveBundles;
+
+ // The edge bundle analysis provides indices into the LiveBundles vector.
+ EdgeBundles *Bundles;
+
+ // Return a bitmask of FP registers in block's live-in list.
+ static unsigned calcLiveInMask(MachineBasicBlock *MBB) {
+   unsigned LiveFPs = 0;
+   for (const auto &LiveIn : MBB->liveins()) {
+     // Only FP0..FP6 contribute; every other live-in register is ignored.
+     const bool IsFPReg =
+         LiveIn.PhysReg >= X86::FP0 && LiveIn.PhysReg <= X86::FP6;
+     if (IsFPReg)
+       LiveFPs |= 1 << (LiveIn.PhysReg - X86::FP0);
+   }
+   return LiveFPs;
+ }
+
+ // Partition all the CFG edges into LiveBundles.
+ void bundleCFG(MachineFunction &MF);
+
+ MachineBasicBlock *MBB; // Current basic block
+
+ // The hardware keeps track of how many FP registers are live, so we have
+ // to model that exactly. Usually, each live register corresponds to an
+ // FP<n> register, but when dealing with calls, returns, and inline
+ // assembly, it is sometimes necessary to have live scratch registers.
+ unsigned Stack[8]; // FP<n> Registers in each stack slot...
+ unsigned StackTop; // The current top of the FP stack.
+
+ enum {
+ NumFPRegs = 8 // Including scratch pseudo-registers.
+ };
+
+ // For each live FP<n> register, point to its Stack[] entry.
+ // The first entries correspond to FP0-FP6, the rest are scratch registers
+ // used when we need slightly different live registers than what the
+ // register allocator thinks.
+ unsigned RegMap[NumFPRegs];
+
+ // Set up our stack model to match the incoming registers to MBB.
+ void setupBlockStack();
+
+ // Shuffle live registers to match the expectations of successor blocks.
+ void finishBlockStack();
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dumpStack() const {
+   // Print the modelled stack from slot 0 up to the current top, checking on
+   // the way that RegMap[] is the inverse of Stack[].
+   dbgs() << "Stack contents:";
+   unsigned Slot = 0;
+   while (Slot != StackTop) {
+     dbgs() << " FP" << Stack[Slot];
+     assert(RegMap[Stack[Slot]] == Slot && "Stack[] doesn't match RegMap[]!");
+     ++Slot;
+   }
+ }
+#endif
+
+ /// getSlot - Return the stack slot number a particular register number is
+ /// in.
+ unsigned getSlot(unsigned RegNo) const {
+   // RegMap has NumFPRegs entries, indexed by FP register number.
+   assert(RegNo < NumFPRegs && "Regno out of range!");
+   const unsigned Slot = RegMap[RegNo];
+   return Slot;
+ }
+
+ /// isLive - Is RegNo currently live in the stack?
+ bool isLive(unsigned RegNo) const {
+   // RegMap entries of registers that are not on the stack may be ~0 or out
+   // of date, so the slot has to be in range *and* map back to RegNo.
+   const unsigned Slot = getSlot(RegNo);
+   if (Slot >= StackTop)
+     return false;
+   return Stack[Slot] == RegNo;
+ }
+
+ /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+ unsigned getStackEntry(unsigned STi) const {
+   if (STi >= StackTop)
+     report_fatal_error("Access past stack top!");
+   // ST(0) is the most recently pushed slot, Stack[StackTop-1]; ST(i) lies i
+   // slots below it.
+   const unsigned Slot = StackTop-1-STi;
+   return Stack[Slot];
+ }
+
+ /// getSTReg - Return the X86::ST(i) register which contains the specified
+ /// FP<RegNo> register.
+ unsigned getSTReg(unsigned RegNo) const {
+   // Distance of RegNo's slot from the top of the modelled stack, rebased
+   // onto the X86::ST0 register enum.
+   const unsigned DepthFromTop = StackTop - 1 - getSlot(RegNo);
+   return DepthFromTop + X86::ST0;
+ }
+
+ // pushReg - Push the specified FP<n> register onto the stack.
+ void pushReg(unsigned Reg) {
+   assert(Reg < NumFPRegs && "Register number out of range!");
+   if (StackTop >= 8)
+     report_fatal_error("Stack overflow!");
+   // Record Reg in the first free slot and keep RegMap in sync with it.
+   const unsigned NewSlot = StackTop;
+   Stack[NewSlot] = Reg;
+   RegMap[Reg] = NewSlot;
+   ++StackTop;
+ }
+
+ // popReg - Pop a register from the stack.
+ void popReg() {
+   if (StackTop == 0)
+     report_fatal_error("Cannot pop empty stack!");
+   --StackTop;
+   // Mark the register that was on top as no longer being in any slot.
+   const unsigned PoppedReg = Stack[StackTop];
+   RegMap[PoppedReg] = ~0;
+ }
+
+ // Is RegNo recorded in the topmost slot of the modelled stack?
+ bool isAtTop(unsigned RegNo) const {
+   const unsigned TopSlot = StackTop-1;
+   return getSlot(RegNo) == TopSlot;
+ }
+ // Bring FP register RegNo to the top of the modelled stack by swapping it
+ // with the current top entry, and emit an XCH_F before I so the hardware
+ // stack matches. Does nothing if RegNo is already on top.
+ void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+ // Reuse the debug location of the insertion point when there is one.
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ if (isAtTop(RegNo)) return;
+
+ // Capture RegNo's ST(i) register before the model is updated below.
+ unsigned STReg = getSTReg(RegNo);
+ unsigned RegOnTop = getStackEntry(0);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ if (RegMap[RegOnTop] >= StackTop)
+ report_fatal_error("Access past stack top!");
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+ // Emit an fxch to update the runtime processors version of the state.
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+ ++NumFXCH;
+ }
+
+ // Push a new stack entry named AsReg and emit, before I, an LD_Frr that
+ // reads the ST(i) register currently holding RegNo.
+ void duplicateToTop(unsigned RegNo, unsigned AsReg,
+ MachineBasicBlock::iterator I) {
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ // Must be read before pushReg(): pushing changes StackTop and therefore
+ // the ST(i) number getSTReg() would compute.
+ unsigned STReg = getSTReg(RegNo);
+ pushReg(AsReg); // New register on top of stack
+
+ BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+ }
+
+ /// popStackAfter - Pop the current value off of the top of the FP stack
+ /// after the specified instruction.
+ void popStackAfter(MachineBasicBlock::iterator &I);
+
+ /// freeStackSlotAfter - Free the specified register from the register
+ /// stack, so that it is no longer in a register. If the register is
+ /// currently at the top of the stack, we just pop the current instruction,
+ /// otherwise we store the current top-of-stack into the specified slot,
+ /// then pop the top of stack.
+ void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+ /// freeStackSlotBefore - Just the pop, no folding. Return the inserted
+ /// instruction.
+ MachineBasicBlock::iterator
+ freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo);
+
+ /// Adjust the live registers to be the set in Mask.
+ void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I);
+
+ /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is
+ /// st(0), FP reg FixStack[1] is st(1) etc.
+ void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
+ MachineBasicBlock::iterator I);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void handleCall(MachineBasicBlock::iterator &I);
+ void handleReturn(MachineBasicBlock::iterator &I);
+ void handleZeroArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+ void handleTwoArgFP(MachineBasicBlock::iterator &I);
+ void handleCompareFP(MachineBasicBlock::iterator &I);
+ void handleCondMovFP(MachineBasicBlock::iterator &I);
+ void handleSpecialFP(MachineBasicBlock::iterator &I);
+
+ // Check if a COPY instruction is using FP registers.
+ static bool isFPCopy(MachineInstr &MI) {
+   // Operand 0 is read as the destination and operand 1 as the source; the
+   // copy counts as an FP copy if either one is in the RFP80 register class.
+   const unsigned Dst = MI.getOperand(0).getReg();
+   const unsigned Src = MI.getOperand(1).getReg();
+
+   if (X86::RFP80RegClass.contains(Dst))
+     return true;
+   return X86::RFP80RegClass.contains(Src);
+ }
+
+ void setKillFlags(MachineBasicBlock &MBB) const;
+ };
+ char FPS::ID = 0;
+}
+
+/// Factory for the FP stackifier pass; returns a newly allocated FPS.
+FunctionPass *llvm::createX86FloatingPointStackifierPass() {
+  return new FPS();
+}
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+  assert(MO.isReg() && "Expected an FP register!");
+  const unsigned PhysReg = MO.getReg();
+  assert(PhysReg >= X86::FP0 && PhysReg <= X86::FP6 &&
+         "Expected FP register!");
+  // Rebase onto FP0 so the result can index the pass's RegMap/Stack model.
+  const unsigned FPIndex = PhysReg - X86::FP0;
+  return FPIndex;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+ // We only need to run this pass if there are any FP registers used in this
+ // function. If it is all integer, there is nothing for us to do!
+ bool FPIsUsed = false;
+
+ static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!");
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Look for any non-debug reference to FP0..FP6.
+ for (unsigned i = 0; i <= 6; ++i)
+ if (!MRI.reg_nodbg_empty(X86::FP0 + i)) {
+ FPIsUsed = true;
+ break;
+ }
+
+ // Early exit.
+ if (!FPIsUsed) return false;
+
+ Bundles = &getAnalysis<EdgeBundles>();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ // Prepare cross-MBB liveness.
+ bundleCFG(MF);
+
+ // Start from an empty stack model.
+ StackTop = 0;
+
+ // Process the function in depth first order so that we process at least one
+ // of the predecessors for every reachable block in the function.
+ df_iterator_default_set<MachineBasicBlock*> Processed;
+ MachineBasicBlock *Entry = &MF.front();
+
+ // Live-in bundle of the entry block.
+ LiveBundle &Bundle =
+ LiveBundles[Bundles->getBundle(Entry->getNumber(), false)];
+
+ // In regcall convention, some FP registers may not be passed through
+ // the stack, so they will need to be assigned to the stack first
+ if ((Entry->getParent()->getFunction()->getCallingConv() ==
+ CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) {
+ // In the register calling convention, up to one FP argument could be
+ // saved in the first FP register.
+ // If bundle.mask is non-zero and Bundle.FixCount is zero, it means
+ // that the FP registers contain arguments.
+ // The actual value is passed in FP0.
+ // Here we fix the stack and mark FP0 as pre-assigned register.
+ assert((Bundle.Mask & 0xFE) == 0 &&
+ "Only FP0 could be passed as an argument");
+ Bundle.FixCount = 1;
+ Bundle.FixStack[0] = 0;
+ }
+
+ bool Changed = false;
+ // Visited blocks are recorded in Processed by the iterator itself.
+ for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed))
+ Changed |= processBasicBlock(MF, *BB);
+
+ // Process any unreachable blocks in arbitrary order now.
+ if (MF.size() != Processed.size())
+ for (MachineBasicBlock &BB : MF)
+ if (Processed.insert(&BB).second)
+ Changed |= processBasicBlock(MF, BB);
+
+ // Drop the per-function bundle state; bundleCFG() asserts that LiveBundles
+ // is empty when it starts.
+ LiveBundles.clear();
+
+ return Changed;
+}
+
+/// bundleCFG - Scan all the basic blocks to determine consistent live-in and
+/// live-out sets for the FP registers. Consistent means that the set of
+/// registers live-out from a block is identical to the live-in set of all
+/// successors. This is not enforced by the normal live-in lists since
+/// registers may be implicitly defined, or not used by all successors.
+void FPS::bundleCFG(MachineFunction &MF) {
+  assert(LiveBundles.empty() && "Stale data in LiveBundles");
+  LiveBundles.resize(Bundles->getNumBundles());
+
+  // Fold every block's FP live-in mask into the mask of its incoming bundle.
+  // Blocks without FP live-ins contribute nothing.
+  for (MachineBasicBlock &Block : MF) {
+    const unsigned LiveIns = calcLiveInMask(&Block);
+    if (LiveIns) {
+      const unsigned InBundle = Bundles->getBundle(Block.getNumber(), false);
+      LiveBundles[InBundle].Mask |= LiveIns;
+    }
+  }
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+/// Makes BB the current block, calls setKillFlags() and setupBlockStack(),
+/// dispatches every FP-related instruction to the matching handle* routine,
+/// frees the stack slots of FP registers the instruction marked dead, and
+/// finally calls finishBlockStack(). Returns true if at least one instruction
+/// was handled.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+
+  setKillFlags(BB);
+  setupBlockStack();
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr &MI = *I;
+    uint64_t Flags = MI.getDesc().TSFlags;
+
+    // Start from the class encoded in TSFlags, then force SpecialFP for the
+    // instruction kinds below regardless of what the flags say.
+    unsigned FPInstClass = Flags & X86II::FPTypeMask;
+    if (MI.isInlineAsm())
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI.isCopy() && isFPCopy(MI))
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI.isImplicitDef() &&
+        X86::RFP80RegClass.contains(MI.getOperand(0).getReg()))
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI.isCall())
+      FPInstClass = X86II::SpecialFP;
+
+    if (FPInstClass == X86II::NotFP)
+      continue; // Efficiently ignore non-fp insts!
+
+    // Remember the instruction before this one so the debug dump below can
+    // find the first newly inserted instruction.
+    MachineInstr *PrevMI = nullptr;
+    if (I != BB.begin())
+      PrevMI = &*std::prev(I);
+
+    ++NumFP; // Keep track of # of pseudo instrs
+    DEBUG(dbgs() << "\nFPInst:\t" << MI);
+
+    // Get dead variables list now because the MI pointer may be deleted as part
+    // of processing!
+    SmallVector<unsigned, 8> DeadRegs;
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && MO.isDead())
+        DeadRegs.push_back(MO.getReg());
+    }
+
+    // The handlers take I by reference and may replace or erase the
+    // instruction, so MI must not be used past this point.
+    switch (FPInstClass) {
+    case X86II::ZeroArgFP:  handleZeroArgFP(I);  break;
+    case X86II::OneArgFP:   handleOneArgFP(I);   break; // fstp ST(0)
+    case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+    case X86II::TwoArgFP:   handleTwoArgFP(I);   break;
+    case X86II::CompareFP:  handleCompareFP(I);  break;
+    case X86II::CondMovFP:  handleCondMovFP(I);  break;
+    case X86II::SpecialFP:  handleSpecialFP(I);  break;
+    default: llvm_unreachable("Unknown FP Type!");
+    }
+
+    // Check to see if any of the values defined by this instruction are dead
+    // after definition. If so, pop them.
+    for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+      unsigned Reg = DeadRegs[i];
+      // Check if Reg is live on the stack. An inline-asm register operand that
+      // is in the clobber list and marked dead might not be live on the stack.
+      if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
+        DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+        freeStackSlotAfter(I, Reg-X86::FP0);
+      }
+    }
+
+    // Print out all of the instructions expanded to if -debug
+    DEBUG({
+      MachineBasicBlock::iterator PrevI = PrevMI;
+      if (I == PrevI) {
+        dbgs() << "Just deleted pseudo instruction\n";
+      } else {
+        MachineBasicBlock::iterator Start = I;
+        // Rewind to first instruction newly inserted.
+        while (Start != BB.begin() && std::prev(Start) != PrevI)
+          --Start;
+        dbgs() << "Inserted instructions:\n";
+        // Print every instruction from the first inserted one up to and
+        // including I. Previously only the first one was printed and the
+        // remainder was merely stepped over by an empty loop.
+        for (MachineBasicBlock::iterator End = std::next(I); Start != End;
+             ++Start) {
+          dbgs() << '\t';
+          Start->print(dbgs());
+        }
+      }
+      dumpStack();
+    });
+    (void)PrevMI;
+
+    Changed = true;
+  }
+
+  finishBlockStack();
+
+  return Changed;
+}
+
+/// setupBlockStack - Use the live bundles to set up our model of the stack
+/// to match predecessors' live out stack.
+void FPS::setupBlockStack() {
+ DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber()
+ << " derived from " << MBB->getName() << ".\n");
+ // Every block starts from an empty stack model.
+ StackTop = 0;
+ // Get the live-in bundle for MBB.
+ const LiveBundle &Bundle =
+ LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
+
+ if (!Bundle.Mask) {
+ DEBUG(dbgs() << "Block has no FP live-ins.\n");
+ return;
+ }
+
+ // Depth-first iteration should ensure that we always have an assigned stack.
+ assert(Bundle.isFixed() && "Reached block before any predecessors");
+
+ // Push the fixed live-in registers.
+ // Iterating from FixCount down to 1 pushes FixStack[0] last, so it ends up
+ // on top of the stack (st(0)). Each ST(i) used is also added to the block's
+ // live-in list.
+ for (unsigned i = Bundle.FixCount; i > 0; --i) {
+ MBB->addLiveIn(X86::ST0+i-1);
+ DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP"
+ << unsigned(Bundle.FixStack[i-1]) << '\n');
+ pushReg(Bundle.FixStack[i-1]);
+ }
+
+ // Kill off unwanted live-ins. This can happen with a critical edge.
+ // FIXME: We could keep these live registers around as zombies. They may need
+ // to be revived at the end of a short block. It might save a few instrs.
+ adjustLiveRegs(calcLiveInMask(MBB), MBB->begin());
+ DEBUG(MBB->dump());
+}
+
+/// finishBlockStack - Revive live-outs that are implicitly defined out of
+/// MBB. Shuffle live registers to match the expected fixed stack of any
+/// successors, and ensure that all successors are expecting the same
+/// stack.
+void FPS::finishBlockStack() {
+ // The RET handling below takes care of return blocks for us.
+ if (MBB->succ_empty())
+ return;
+
+ DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber()
+ << " derived from " << MBB->getName() << ".\n");
+
+ // Get MBB's live-out bundle.
+ unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
+ LiveBundle &Bundle = LiveBundles[BundleIdx];
+
+ // We may need to kill and define some registers to match successors.
+ // FIXME: This can probably be combined with the shuffle below.
+ // All fix-up code is inserted before the block's first terminator.
+ MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+ adjustLiveRegs(Bundle.Mask, Term);
+
+ if (!Bundle.Mask) {
+ DEBUG(dbgs() << "No live-outs.\n");
+ return;
+ }
+
+ // Has the stack order been fixed yet?
+ DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+ if (Bundle.isFixed()) {
+ DEBUG(dbgs() << "Shuffling stack to match.\n");
+ shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
+ } else {
+ // Not fixed yet, we get to choose.
+ // Record the current stack order, top of stack first, as the bundle's
+ // fixed order.
+ DEBUG(dbgs() << "Fixing stack order now.\n");
+ Bundle.FixCount = StackTop;
+ for (unsigned i = 0; i < StackTop; ++i)
+ Bundle.FixStack[i] = getStackEntry(i);
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+ // One row of an opcode translation table: maps opcode `from` to opcode `to`.
+ // Rows are ordered by `from` alone.
+ struct TableEntry {
+ uint16_t from;
+ uint16_t to;
+ // Row-vs-row ordering, as needed by the std::is_sorted check in
+ // ASSERT_SORTED.
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ // Row-vs-key ordering, as needed by the std::lower_bound call in Lookup().
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ // Key-vs-row ordering; marked unused because nothing may reference it.
+ friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+ const TableEntry &TE) {
+ return V < TE.from;
+ }
+ };
+}
+
+// Binary-search Table (which must be sorted by `from`) for Opcode. Returns
+// the mapped `to` opcode, or -1 when Opcode has no entry.
+static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
+  const TableEntry *End = Table.end();
+  const TableEntry *Pos = std::lower_bound(Table.begin(), End, Opcode);
+  // lower_bound yields the first row whose key is not less than Opcode, so an
+  // exact match still has to be confirmed.
+  const bool Found = Pos != End && Pos->from == Opcode;
+  if (!Found)
+    return -1;
+  return Pos->to;
+}
+
+// ASSERT_SORTED(TABLE) - In builds with assertions, check the first time it
+// is reached that TABLE is sorted (a function-local static flag makes later
+// executions skip the check). Expands to nothing when NDEBUG is defined.
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE) \
+ { static bool TABLE##Checked = false; \
+ if (!TABLE##Checked) { \
+ assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked = true; \
+ } \
+ }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+// Entries must stay sorted by the first element: getConcreteOpcode() checks
+// this with ASSERT_SORTED and Lookup() finds entries by binary search.
+//
+static const TableEntry OpcodeTable[] = {
+ { X86::ABS_Fp32 , X86::ABS_F },
+ { X86::ABS_Fp64 , X86::ABS_F },
+ { X86::ABS_Fp80 , X86::ABS_F },
+ { X86::ADD_Fp32m , X86::ADD_F32m },
+ { X86::ADD_Fp64m , X86::ADD_F64m },
+ { X86::ADD_Fp64m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m64 , X86::ADD_F64m },
+ { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+ { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+ { X86::CHS_Fp32 , X86::CHS_F },
+ { X86::CHS_Fp64 , X86::CHS_F },
+ { X86::CHS_Fp80 , X86::CHS_F },
+ { X86::CMOVBE_Fp32 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp64 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp80 , X86::CMOVBE_F },
+ { X86::CMOVB_Fp32 , X86::CMOVB_F },
+ { X86::CMOVB_Fp64 , X86::CMOVB_F },
+ { X86::CMOVB_Fp80 , X86::CMOVB_F },
+ { X86::CMOVE_Fp32 , X86::CMOVE_F },
+ { X86::CMOVE_Fp64 , X86::CMOVE_F },
+ { X86::CMOVE_Fp80 , X86::CMOVE_F },
+ { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+ { X86::CMOVNB_Fp32 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp64 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp80 , X86::CMOVNB_F },
+ { X86::CMOVNE_Fp32 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp64 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp80 , X86::CMOVNE_F },
+ { X86::CMOVNP_Fp32 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp64 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp80 , X86::CMOVNP_F },
+ { X86::CMOVP_Fp32 , X86::CMOVP_F },
+ { X86::CMOVP_Fp64 , X86::CMOVP_F },
+ { X86::CMOVP_Fp80 , X86::CMOVP_F },
+ { X86::COS_Fp32 , X86::COS_F },
+ { X86::COS_Fp64 , X86::COS_F },
+ { X86::COS_Fp80 , X86::COS_F },
+ { X86::DIVR_Fp32m , X86::DIVR_F32m },
+ { X86::DIVR_Fp64m , X86::DIVR_F64m },
+ { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+ { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+ { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+ { X86::DIV_Fp32m , X86::DIV_F32m },
+ { X86::DIV_Fp64m , X86::DIV_F64m },
+ { X86::DIV_Fp64m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m64 , X86::DIV_F64m },
+ { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+ { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+ { X86::ILD_Fp16m32 , X86::ILD_F16m },
+ { X86::ILD_Fp16m64 , X86::ILD_F16m },
+ { X86::ILD_Fp16m80 , X86::ILD_F16m },
+ { X86::ILD_Fp32m32 , X86::ILD_F32m },
+ { X86::ILD_Fp32m64 , X86::ILD_F32m },
+ { X86::ILD_Fp32m80 , X86::ILD_F32m },
+ { X86::ILD_Fp64m32 , X86::ILD_F64m },
+ { X86::ILD_Fp64m64 , X86::ILD_F64m },
+ { X86::ILD_Fp64m80 , X86::ILD_F64m },
+ { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+ { X86::IST_Fp16m32 , X86::IST_F16m },
+ { X86::IST_Fp16m64 , X86::IST_F16m },
+ { X86::IST_Fp16m80 , X86::IST_F16m },
+ { X86::IST_Fp32m32 , X86::IST_F32m },
+ { X86::IST_Fp32m64 , X86::IST_F32m },
+ { X86::IST_Fp32m80 , X86::IST_F32m },
+ { X86::IST_Fp64m32 , X86::IST_FP64m },
+ { X86::IST_Fp64m64 , X86::IST_FP64m },
+ { X86::IST_Fp64m80 , X86::IST_FP64m },
+ { X86::LD_Fp032 , X86::LD_F0 },
+ { X86::LD_Fp064 , X86::LD_F0 },
+ { X86::LD_Fp080 , X86::LD_F0 },
+ { X86::LD_Fp132 , X86::LD_F1 },
+ { X86::LD_Fp164 , X86::LD_F1 },
+ { X86::LD_Fp180 , X86::LD_F1 },
+ { X86::LD_Fp32m , X86::LD_F32m },
+ { X86::LD_Fp32m64 , X86::LD_F32m },
+ { X86::LD_Fp32m80 , X86::LD_F32m },
+ { X86::LD_Fp64m , X86::LD_F64m },
+ { X86::LD_Fp64m80 , X86::LD_F64m },
+ { X86::LD_Fp80m , X86::LD_F80m },
+ { X86::MUL_Fp32m , X86::MUL_F32m },
+ { X86::MUL_Fp64m , X86::MUL_F64m },
+ { X86::MUL_Fp64m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m64 , X86::MUL_F64m },
+ { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+ { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+ { X86::SIN_Fp32 , X86::SIN_F },
+ { X86::SIN_Fp64 , X86::SIN_F },
+ { X86::SIN_Fp80 , X86::SIN_F },
+ { X86::SQRT_Fp32 , X86::SQRT_F },
+ { X86::SQRT_Fp64 , X86::SQRT_F },
+ { X86::SQRT_Fp80 , X86::SQRT_F },
+ { X86::ST_Fp32m , X86::ST_F32m },
+ { X86::ST_Fp64m , X86::ST_F64m },
+ { X86::ST_Fp64m32 , X86::ST_F32m },
+ { X86::ST_Fp80m32 , X86::ST_F32m },
+ { X86::ST_Fp80m64 , X86::ST_F64m },
+ { X86::ST_FpP80m , X86::ST_FP80m },
+ { X86::SUBR_Fp32m , X86::SUBR_F32m },
+ { X86::SUBR_Fp64m , X86::SUBR_F64m },
+ { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+ { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+ { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+ { X86::SUB_Fp32m , X86::SUB_F32m },
+ { X86::SUB_Fp64m , X86::SUB_F64m },
+ { X86::SUB_Fp64m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m64 , X86::SUB_F64m },
+ { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+ { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+ { X86::TST_Fp32 , X86::TST_F },
+ { X86::TST_Fp64 , X86::TST_F },
+ { X86::TST_Fp80 , X86::TST_F },
+ { X86::UCOM_FpIr32 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr64 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr80 , X86::UCOM_FIr },
+ { X86::UCOM_Fpr32 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr64 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr80 , X86::UCOM_Fr },
+};
+
+// Translate a register-file pseudo opcode into the register-stack opcode
+// listed for it in OpcodeTable. Asserts if the opcode has no entry.
+static unsigned getConcreteOpcode(unsigned Opcode) {
+  ASSERT_SORTED(OpcodeTable);
+  const int StackOpcode = Lookup(OpcodeTable, Opcode);
+  assert(StackOpcode != -1 && "FP Stack instruction not in OpcodeTable!");
+  return StackOpcode;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version. The first
+// element is an instruction, the second is the version which pops.
+//
+// Entries must stay sorted by the first element: popStackAfter() checks this
+// with ASSERT_SORTED before searching the table through Lookup().
+static const TableEntry PopTable[] = {
+ { X86::ADD_FrST0 , X86::ADD_FPrST0 },
+
+ { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+ { X86::DIV_FrST0 , X86::DIV_FPrST0 },
+
+ { X86::IST_F16m , X86::IST_FP16m },
+ { X86::IST_F32m , X86::IST_FP32m },
+
+ { X86::MUL_FrST0 , X86::MUL_FPrST0 },
+
+ { X86::ST_F32m , X86::ST_FP32m },
+ { X86::ST_F64m , X86::ST_FP64m },
+ { X86::ST_Frr , X86::ST_FPrr },
+
+ { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+ { X86::SUB_FrST0 , X86::SUB_FPrST0 },
+
+ { X86::UCOM_FIr , X86::UCOM_FIPr },
+
+ { X86::UCOM_FPr , X86::UCOM_FPPr },
+ { X86::UCOM_Fr , X86::UCOM_FPr },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction. This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible. The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+ const DebugLoc &dl = MI.getDebugLoc();
+ ASSERT_SORTED(PopTable);
+
+ // Update the stack model first; everything below only edits instructions.
+ popReg();
+
+ // Check to see if there is a popping version of this instruction...
+ int Opcode = Lookup(PopTable, I->getOpcode());
+ if (Opcode != -1) {
+ // Rewrite the instruction in place; I keeps pointing at it.
+ I->setDesc(TII->get(Opcode));
+ // For UCOM_FPPr operand 0 is removed as well (presumably that form takes
+ // no explicit register operand).
+ if (Opcode == X86::UCOM_FPPr)
+ I->RemoveOperand(0);
+ } else { // Insert an explicit pop
+ // The pop goes right after the instruction, and I is moved onto it.
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+ }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register. If the register is currently at the top
+/// of the stack, we just pop the current instruction, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+  const bool DeadRegIsOnTop = getStackEntry(0) == FPRegNo;
+  if (!DeadRegIsOnTop) {
+    // Not on top: overwrite the dead slot with the top of stack and pop, so
+    // no explicit xchg-then-pop is needed. I ends up on the new instruction.
+    I = freeStackSlotBefore(++I, FPRegNo);
+    return;
+  }
+
+  // The dead value already sits in st(0), so a plain pop is all it takes.
+  popStackAfter(I);
+}
+
+/// freeStackSlotBefore - Free the specified register without trying any
+/// folding.
+MachineBasicBlock::iterator
+FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) {
+ unsigned STReg = getSTReg(FPRegNo);
+ unsigned OldSlot = getSlot(FPRegNo);
+ unsigned TopReg = Stack[StackTop-1];
+ Stack[OldSlot] = TopReg;
+ RegMap[TopReg] = OldSlot;
+ RegMap[FPRegNo] = ~0;
+ Stack[--StackTop] = ~0;
+ return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr))
+ .addReg(STReg)
+ .getInstr();
+}
+
+/// adjustLiveRegs - Kill and revive registers such that exactly the FP
+/// registers with a bit in Mask are live.
+void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
+ // Defs: registers wanted by Mask that are not on the stack yet.
+ // Kills: registers on the stack that Mask does not want.
+ unsigned Defs = Mask;
+ unsigned Kills = 0;
+ for (unsigned i = 0; i < StackTop; ++i) {
+ unsigned RegNo = Stack[i];
+ if (!(Defs & (1 << RegNo)))
+ // This register is live, but we don't want it.
+ Kills |= (1 << RegNo);
+ else
+ // We don't need to imp-def this live register.
+ Defs &= ~(1 << RegNo);
+ }
+ assert((Kills & Defs) == 0 && "Register needs killing and def'ing?");
+
+ // Produce implicit-defs for free by using killed registers.
+ // This only renames entries in the model; no instruction is emitted.
+ while (Kills && Defs) {
+ unsigned KReg = countTrailingZeros(Kills);
+ unsigned DReg = countTrailingZeros(Defs);
+ DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
+ std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
+ std::swap(RegMap[KReg], RegMap[DReg]);
+ Kills &= ~(1 << KReg);
+ Defs &= ~(1 << DReg);
+ }
+
+ // Kill registers by popping.
+ // Only possible when an instruction precedes I: popStackAfter() either
+ // folds the pop into that instruction or inserts one after it, advancing I2.
+ // Stops at the first top-of-stack register that is not to be killed.
+ if (Kills && I != MBB->begin()) {
+ MachineBasicBlock::iterator I2 = std::prev(I);
+ while (StackTop) {
+ unsigned KReg = getStackEntry(0);
+ if (!(Kills & (1 << KReg)))
+ break;
+ DEBUG(dbgs() << "Popping %FP" << KReg << "\n");
+ popStackAfter(I2);
+ Kills &= ~(1 << KReg);
+ }
+ }
+
+ // Manually kill the rest.
+ while (Kills) {
+ unsigned KReg = countTrailingZeros(Kills);
+ DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
+ freeStackSlotBefore(I, KReg);
+ Kills &= ~(1 << KReg);
+ }
+
+ // Load zeros for all the imp-defs.
+ while(Defs) {
+ unsigned DReg = countTrailingZeros(Defs);
+ DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
+ BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
+ pushReg(DReg);
+ Defs &= ~(1 << DReg);
+ }
+
+ // Now we should have the correct registers live.
+ DEBUG(dumpStack());
+ assert(StackTop == countPopulation(Mask) && "Live count mismatch");
+}
+
+/// shuffleStackTop - Emit fxch instructions before I so that the top FixCount
+/// stack entries end up in the order requested by FixStack, where FixStack[N]
+/// names the register wanted at stack position N.
+/// FIXME: Is there a better algorithm than insertion sort?
+void FPS::shuffleStackTop(const unsigned char *FixStack,
+                          unsigned FixCount,
+                          MachineBasicBlock::iterator I) {
+  // Fix positions one at a time, working from the deepest requested position
+  // towards the top of the stack.
+  for (unsigned Pos = FixCount; Pos != 0;) {
+    --Pos;
+
+    // Register currently sitting at this position.
+    const unsigned Current = getStackEntry(Pos);
+    // Register that is supposed to be at this position.
+    const unsigned Wanted = FixStack[Pos];
+    if (Wanted == Current)
+      continue;
+
+    // (Wanted st0) (Current st0) = (Wanted Current st0)
+    moveToTop(Wanted, I);
+    if (Pos != 0)
+      moveToTop(Current, I);
+  }
+  DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleCall - Update the tracked FP stack across a call: every entry that
+/// was tracked before the call is dropped, and the FP registers mentioned by
+/// the call (at most two, starting at FP0) are pushed as the new contents.
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+  const MachineFunction *MF = I->getParent()->getParent();
+
+  // Bit R is set when the call has a register operand FP0 + R.
+  unsigned STReturns = 0;
+  for (const auto &MO : I->operands()) {
+    if (!MO.isReg())
+      continue;
+
+    // Only offsets 0..7 from FP0 are of interest here.
+    const unsigned R = MO.getReg() - X86::FP0;
+    if (R >= 8)
+      continue;
+
+    if (MF->getFunction()->getCallingConv() != CallingConv::X86_RegCall) {
+      assert(MO.isDef() && MO.isImplicit());
+    }
+
+    STReturns |= 1 << R;
+  }
+
+  const unsigned N = countTrailingOnes(STReturns);
+
+  // FP registers used for function return must be consecutive starting at
+  // FP0
+  assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2));
+
+  // Reset the FP Stack - It is required because of possible leftovers from
+  // passed arguments. The caller should assume that the FP stack is
+  // returned empty (unless the callee returns values on FP stack).
+  while (StackTop > 0)
+    popReg();
+
+  // Push the returned registers from the highest number down, so that FP0 is
+  // the last one pushed.
+  for (unsigned Remaining = N; Remaining > 0; --Remaining)
+    pushReg(Remaining - 1);
+}
+
+/// If RET has an FP register use operand, pass the first one in ST(0) and
+/// the second one in ST(1). The FP operands are removed from the RET itself.
+void FPS::handleReturn(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+
+ // Find the FP register operands; ~0U means "not seen yet".
+ unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+ unsigned LiveMask = 0;
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ // FP Register uses must be kills unless there are two uses of the same
+ // register, in which case only one will be a kill.
+ assert(Op.isUse() &&
+ (Op.isKill() || // Marked kill.
+ getFPReg(Op) == FirstFPRegOp || // Second instance.
+ MI.killsRegister(Op.getReg())) && // Later use is marked kill.
+ "Ret only defs operands, and values aren't live beyond it");
+
+ if (FirstFPRegOp == ~0U)
+ FirstFPRegOp = getFPReg(Op);
+ else {
+ assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+ SecondFPRegOp = getFPReg(Op);
+ }
+ LiveMask |= (1 << getFPReg(Op));
+
+ // Remove the operand so that later passes don't see it. The index and the
+ // bound are both stepped back so the next operand is not skipped.
+ MI.RemoveOperand(i);
+ --i;
+ --e;
+ }
+
+ // We may have been carrying spurious live-ins, so make sure only the
+ // returned registers are left live.
+ adjustLiveRegs(LiveMask, MI);
+ if (!LiveMask) return; // Quick check to see if any are possible.
+
+ // There are only four possibilities here:
+ // 1) we are returning a single FP value. In this case, it has to be in
+ // ST(0) already, so just declare success by removing the value from the
+ // FP Stack.
+ if (SecondFPRegOp == ~0U) {
+ // Assert that the top of stack contains the right FP register.
+ assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+ "Top of stack not the right register for RET!");
+
+ // Ok, everything is good, mark the value as not being on the stack
+ // anymore so that our assertion about the stack being empty at end of
+ // block doesn't fire.
+ StackTop = 0;
+ return;
+ }
+
+ // Otherwise, we are returning two values:
+ // 2) If returning the same value for both, we only have one thing in the FP
+ // stack. Consider: RET FP1, FP1
+ if (StackTop == 1) {
+ assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+ "Stack misconfiguration for RET!");
+
+ // Duplicate the TOS so that we return it twice. ScratchFPReg is used as the
+ // other FPx register to hold the copy.
+ unsigned NewReg = ScratchFPReg;
+ duplicateToTop(FirstFPRegOp, NewReg, MI);
+ FirstFPRegOp = NewReg;
+ }
+
+ /// Okay we know we have two different FPx operands now:
+ assert(StackTop == 2 && "Must have two values live!");
+
+ /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+ /// in ST(1). In this case, emit an fxch.
+ if (getStackEntry(0) == SecondFPRegOp) {
+ assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+ moveToTop(FirstFPRegOp, MI);
+ }
+
+ /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+ /// ST(1). Just remove both from our understanding of the stack and return.
+ assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+ assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+ StackTop = 0;
+}
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+/// Lowers a pseudo instruction whose only FP operand is its result: the
+/// explicit destination operand is dropped, the opcode is switched to the
+/// concrete one, and the result register is recorded as pushed.
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  // Remember which FP register receives the value before its operand goes.
+  const unsigned Result = getFPReg(I->getOperand(0));
+
+  // Drop the explicit ST(0) operand and switch to the concrete opcode.
+  I->RemoveOperand(0);
+  const unsigned Concrete = getConcreteOpcode(I->getOpcode());
+  I->setDesc(TII->get(Concrete));
+
+  // The produced value is the new top of stack.
+  pushReg(Result);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+/// Lowers a pseudo instruction that reads a single FP register, given as its
+/// last operand: the source is brought to the top of the stack, the explicit
+/// operand is dropped, and the stack model is updated for any pop.
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+  const unsigned NumOps = MI.getDesc().getNumOperands();
+  assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  const unsigned SrcIdx = NumOps - 1;
+  const unsigned Reg = getFPReg(MI.getOperand(SrcIdx));
+  const bool KillsSrc = MI.killsRegister(X86::FP0 + Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+  bool OnlyPoppingForm;
+  switch (MI.getOpcode()) {
+  case X86::IST_Fp64m32:
+  case X86::ISTT_Fp16m32:
+  case X86::ISTT_Fp32m32:
+  case X86::ISTT_Fp64m32:
+  case X86::IST_Fp64m64:
+  case X86::ISTT_Fp16m64:
+  case X86::ISTT_Fp32m64:
+  case X86::ISTT_Fp64m64:
+  case X86::IST_Fp64m80:
+  case X86::ISTT_Fp16m80:
+  case X86::ISTT_Fp32m80:
+  case X86::ISTT_Fp64m80:
+  case X86::ST_FpP80m:
+    OnlyPoppingForm = true;
+    break;
+  default:
+    OnlyPoppingForm = false;
+    break;
+  }
+
+  // If we have such an instruction _and_ we don't want to pop the operand,
+  // duplicate the value on the stack instead of moving it.  This ensures that
+  // popping the value is always ok.
+  if (OnlyPoppingForm && !KillsSrc)
+    duplicateToTop(Reg, ScratchFPReg, I);
+  else
+    moveToTop(Reg, I); // Move to the top of the stack...
+
+  // Convert from the pseudo instruction to the concrete instruction.
+  MI.RemoveOperand(SrcIdx); // Remove explicit ST(0) operand
+  MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+  switch (MI.getOpcode()) {
+  case X86::IST_FP64m:
+  case X86::ISTT_FP16m:
+  case X86::ISTT_FP32m:
+  case X86::ISTT_FP64m:
+  case X86::ST_FP80m:
+    // For these concrete opcodes the stack model is shrunk directly; no
+    // separate pop is requested.
+    if (StackTop == 0)
+      report_fatal_error("Stack empty??");
+    --StackTop;
+    break;
+  default:
+    // Last use of operand?
+    if (KillsSrc)
+      popStackAfter(I);
+    break;
+  }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value.  These instructions may have
+/// non-fp operands after their FP operands.  Operand 0 is the destination and
+/// operand 1 is the FP source.
+///
+/// Examples:
+///     R1 = fchs R2
+///     R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+  assert(MI.getDesc().getNumOperands() >= 2 &&
+         "FPRW instructions must have 2 ops!!");
+
+  const unsigned Reg = getFPReg(MI.getOperand(1));
+
+  if (!MI.killsRegister(X86::FP0 + Reg)) {
+    // The source is still needed afterwards, so _copy_ it to the top of the
+    // stack; the copy is tracked under the destination register.
+    duplicateToTop(Reg, getFPReg(MI.getOperand(0)), I);
+  } else {
+    // Last use of the source: just make sure it is on top of the stack, then
+    // replace that entry with the destination register.
+    moveToTop(Reg, I);
+    if (StackTop == 0)
+      report_fatal_error("Stack cannot be empty!");
+    --StackTop;
+    pushReg(getFPReg(MI.getOperand(0)));
+  }
+
+  // Change from the pseudo instruction to the concrete instruction.  The
+  // source is removed first so the destination index is still valid.
+  MI.RemoveOperand(1);
+  MI.RemoveOperand(0);
+  MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = { // Used by handleTwoArgFP when B is at TOS and ST(0) receives the result.
+ { X86::ADD_Fp32 , X86::ADD_FST0r },
+ { X86::ADD_Fp64 , X86::ADD_FST0r },
+ { X86::ADD_Fp80 , X86::ADD_FST0r },
+ { X86::DIV_Fp32 , X86::DIV_FST0r },
+ { X86::DIV_Fp64 , X86::DIV_FST0r },
+ { X86::DIV_Fp80 , X86::DIV_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r },
+ { X86::MUL_Fp64 , X86::MUL_FST0r },
+ { X86::MUL_Fp80 , X86::MUL_FST0r },
+ { X86::SUB_Fp32 , X86::SUB_FST0r },
+ { X86::SUB_Fp64 , X86::SUB_FST0r },
+ { X86::SUB_Fp80 , X86::SUB_FST0r },
+}; // Checked with ASSERT_SORTED before use.
+
+// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = { // Used by handleTwoArgFP when C is at TOS and ST(0) receives the result.
+ { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FST0r },
+ { X86::DIV_Fp64 , X86::DIVR_FST0r },
+ { X86::DIV_Fp80 , X86::DIVR_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FST0r },
+ { X86::SUB_Fp64 , X86::SUBR_FST0r },
+ { X86::SUB_Fp80 , X86::SUBR_FST0r },
+}; // Checked with ASSERT_SORTED before use.
+
+// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = { // Used by handleTwoArgFP when B is at TOS and ST(i) receives the result.
+ { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp64 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp80 , X86::DIVR_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp64 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp80 , X86::SUBR_FrST0 },
+}; // Checked with ASSERT_SORTED before use.
+
+// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = { // Used by handleTwoArgFP when C is at TOS and ST(i) receives the result.
+ { X86::ADD_Fp32 , X86::ADD_FrST0 },
+ { X86::ADD_Fp64 , X86::ADD_FrST0 },
+ { X86::ADD_Fp80 , X86::ADD_FrST0 },
+ { X86::DIV_Fp32 , X86::DIV_FrST0 },
+ { X86::DIV_Fp64 , X86::DIV_FrST0 },
+ { X86::DIV_Fp80 , X86::DIV_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 },
+ { X86::MUL_Fp64 , X86::MUL_FrST0 },
+ { X86::MUL_Fp80 , X86::MUL_FrST0 },
+ { X86::SUB_Fp32 , X86::SUB_FrST0 },
+ { X86::SUB_Fp64 , X86::SUB_FrST0 },
+ { X86::SUB_Fp80 , X86::SUB_FrST0 },
+}; // Checked with ASSERT_SORTED before use.
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed. The
+/// pseudo (Dest = Op0 op Op1) is replaced by a new one-operand instruction.
+/// Result: ST(0) = fsub ST(0), ST(i)
+/// ST(i) = fsub ST(0), ST(i)
+/// ST(0) = fsubr ST(0), ST(i)
+/// ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr &MI = *I;
+
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+ assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+ unsigned Dest = getFPReg(MI.getOperand(0));
+ unsigned Op0 = getFPReg(MI.getOperand(NumOperands - 2));
+ unsigned Op1 = getFPReg(MI.getOperand(NumOperands - 1));
+ bool KillsOp0 = MI.killsRegister(X86::FP0 + Op0);
+ bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
+ DebugLoc dl = MI.getDebugLoc();
+
+ unsigned TOS = getStackEntry(0);
+
+ // One of our operands must be on the top of the stack. If neither is yet, we
+ // need to move one.
+ if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
+ // We can choose to move either operand to the top of the stack. If one of
+ // the operands is killed by this instruction, we want that one so that we
+ // can update right on top of the old version.
+ if (KillsOp0) {
+ moveToTop(Op0, I); // Move dead operand to TOS.
+ TOS = Op0;
+ } else if (KillsOp1) {
+ moveToTop(Op1, I); // Move dead operand to TOS.
+ TOS = Op1;
+ } else {
+ // All of the operands are live after this instruction executes, so we
+ // cannot update on top of any operand. Because of this, we must
+ // duplicate one of the stack elements to the top. It doesn't matter
+ // which one we pick.
+ //
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest; // The copy, tracked as Dest, now stands in for Op0.
+ KillsOp0 = true;
+ }
+ } else if (!KillsOp0 && !KillsOp1) {
+ // If we DO have one of our operands at the top of the stack, but we don't
+ // have a dead operand, we must duplicate one of the operands to a new slot
+ // on the stack.
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest; // The copy, tracked as Dest, now stands in for Op0.
+ KillsOp0 = true;
+ }
+
+ // Now we know that one of our operands is on the top of the stack, and at
+ // least one of our operands is killed by this instruction.
+ assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+ "Stack conditions not set up right!");
+
+ // We decide which form to use based on what is on the top of the stack, and
+ // which operand is killed by this instruction.
+ ArrayRef<TableEntry> InstTable;
+ bool isForward = TOS == Op0; // The first source operand is at TOS.
+ bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0); // The operand away from TOS stays live.
+ if (updateST0) {
+ if (isForward)
+ InstTable = ForwardST0Table;
+ else
+ InstTable = ReverseST0Table;
+ } else {
+ if (isForward)
+ InstTable = ForwardSTiTable;
+ else
+ InstTable = ReverseSTiTable;
+ }
+
+ int Opcode = Lookup(InstTable, MI.getOpcode());
+ assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+ // NotTOS - The register which is not on the top of stack...
+ unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+ // Replace the old instruction with a new instruction. MI is only unlinked
+ // here; it is deleted at the end of this function.
+ MBB->remove(&*I++);
+ I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+ // If both operands are killed, pop one off of the stack in addition to
+ // overwriting the other one.
+ if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+ assert(!updateST0 && "Should have updated other operand!");
+ popStackAfter(I); // Pop the top of stack
+ }
+
+ // Update stack information so that we know the destination register is now on
+ // the stack, in the slot of whichever operand was overwritten.
+ unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+ assert(UpdatedSlot < StackTop && Dest < 7);
+ Stack[UpdatedSlot] = Dest;
+ RegMap[Dest] = UpdatedSlot;
+ MBB->getParent()->DeleteMachineInstr(&MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.  The first argument is
+/// moved to the top of the stack; the second is referenced through its ST
+/// register.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table);
+  ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable);
+  ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr &MI = *I;
+
+  const unsigned NumOperands = MI.getDesc().getNumOperands();
+  assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+
+  // Read both arguments and their kill state before the instruction is
+  // rewritten below.
+  const unsigned LHS = getFPReg(MI.getOperand(NumOperands - 2));
+  const unsigned RHS = getFPReg(MI.getOperand(NumOperands - 1));
+  const bool LHSDies = MI.killsRegister(X86::FP0 + LHS);
+  const bool RHSDies = MI.killsRegister(X86::FP0 + RHS);
+
+  // Make sure the first operand is on the top of stack, the other one can be
+  // anywhere.
+  moveToTop(LHS, I);
+
+  // Change from the pseudo instruction to the concrete instruction: the
+  // remaining operand names the ST register that holds the second argument.
+  MI.getOperand(0).setReg(getSTReg(RHS));
+  MI.RemoveOperand(1);
+  MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+  // If any of the operands are killed by this instruction, free them.  When
+  // both arguments are the same register it is freed only once.
+  if (LHSDies)
+    freeStackSlotAfter(I, LHS);
+  if (RHSDies && LHS != RHS)
+    freeStackSlotAfter(I, RHS);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions.  These
+/// instructions move a st(i) register to st(0) iff a condition is true.  These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+
+  // Operand 0 supplies the first value and operand 2 the second.
+  const unsigned FirstReg = getFPReg(MI.getOperand(0));
+  const unsigned SecondReg = getFPReg(MI.getOperand(2));
+  const bool SecondDies = MI.killsRegister(X86::FP0 + SecondReg);
+
+  // The first operand *must* be on the top of the stack.
+  moveToTop(FirstReg, I);
+
+  // Change from the pseudo instruction to the concrete instruction: remove
+  // operand 0, then operand 1 of what is left, and make the operand now at
+  // index 0 name the stack register that holds the second value.
+  MI.RemoveOperand(0);
+  MI.RemoveOperand(1);
+  MI.getOperand(0).setReg(getSTReg(SecondReg));
+  MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+  // Nothing to free unless a distinct second operand is killed here.
+  if (!SecondDies || FirstReg == SecondReg)
+    return;
+
+  // Get this value off of the register stack.
+  freeStackSlotAfter(I, SecondReg);
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions. This is primarily intended for use by pseudo
+/// instructions. Calls, returns, COPY, IMPLICIT_DEF and INLINEASM are handled.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
+ MachineInstr &MI = *Inst;
+
+ if (MI.isCall()) {
+ handleCall(Inst);
+ return;
+ }
+
+ if (MI.isReturn()) {
+ handleReturn(Inst);
+ return;
+ }
+
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unknown SpecialFP instruction!");
+ case TargetOpcode::COPY: {
+ // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP. NOTE(review): only FP <- FP is visible in this case body; confirm.
+ const MachineOperand &MO1 = MI.getOperand(1);
+ const MachineOperand &MO0 = MI.getOperand(0);
+ bool KillsSrc = MI.killsRegister(MO1.getReg());
+
+ // FP <- FP copy.
+ unsigned DstFP = getFPReg(MO0);
+ unsigned SrcFP = getFPReg(MO1);
+ assert(isLive(SrcFP) && "Cannot copy dead register");
+ if (KillsSrc) {
+ // If the input operand is killed, we can just change the owner of the
+ // incoming stack slot into the result.
+ unsigned Slot = getSlot(SrcFP);
+ Stack[Slot] = DstFP;
+ RegMap[DstFP] = Slot;
+ } else {
+ // For COPY we just duplicate the specified value to a new stack slot.
+ // This could be made better, but would require substantial changes.
+ duplicateToTop(SrcFP, DstFP, Inst);
+ }
+ break;
+ }
+
+ case TargetOpcode::IMPLICIT_DEF: {
+ // All FP registers must be explicitly defined, so load a 0 instead.
+ unsigned Reg = MI.getOperand(0).getReg() - X86::FP0;
+ DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+ BuildMI(*MBB, Inst, MI.getDebugLoc(), TII->get(X86::LD_F0));
+ pushReg(Reg);
+ break;
+ }
+
+ case TargetOpcode::INLINEASM: {
+ // The inline asm MachineInstr currently only *uses* FP registers for the
+ // 'f' constraint. These should be turned into the current ST(x) register
+ // in the machine instr.
+ //
+ // There are special rules for x87 inline assembly. The compiler must know
+ // exactly how many registers are popped and pushed implicitly by the asm.
+ // Otherwise it is not possible to restore the stack state after the inline
+ // asm.
+ //
+ // There are 3 kinds of input operands:
+ //
+ // 1. Popped inputs. These must appear at the stack top in ST0-STn. A
+ // popped input operand must be in a fixed stack slot, and it is either
+ // tied to an output operand, or in the clobber list. The MI has ST use
+ // and def operands for these inputs.
+ //
+ // 2. Fixed inputs. These inputs appear in fixed stack slots, but are
+ // preserved by the inline asm. The fixed stack slots must be STn-STm
+ // following the popped inputs. A fixed input operand cannot be tied to
+ // an output or appear in the clobber list. The MI has ST use operands
+ // and no defs for these inputs.
+ //
+ // 3. Preserved inputs. These inputs use the "f" constraint which is
+ // represented as an FP register. The inline asm won't change these
+ // stack slots.
+ //
+ // Outputs must be in ST registers, FP outputs are not allowed. Clobbered
+ // registers do not count as output operands. The inline asm changes the
+ // stack as if it popped all the popped inputs and then pushed all the
+ // output operands.
+
+ // Scan the assembly for ST registers used, defined and clobbered. We can
+ // only tell clobbers from defs by looking at the asm descriptor.
+ unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0; // NOTE(review): STDeadDefs is accumulated but never read in this function.
+ unsigned NumOps = 0;
+ SmallSet<unsigned, 1> FRegIdx; // Operand indices recorded for the "f" constraint.
+ unsigned RCID;
+
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI.getNumOperands();
+ i != e && MI.getOperand(i).isImm(); i += 1 + NumOps) {
+ unsigned Flags = MI.getOperand(i).getImm();
+
+ NumOps = InlineAsm::getNumOperandRegisters(Flags);
+ if (NumOps != 1)
+ continue;
+ const MachineOperand &MO = MI.getOperand(i + 1);
+ if (!MO.isReg())
+ continue;
+ unsigned STReg = MO.getReg() - X86::FP0;
+ if (STReg >= 8)
+ continue;
+
+ // If the flag has a register class constraint, this must be an operand
+ // with constraint "f". Record its index and continue.
+ if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+ FRegIdx.insert(i + 1);
+ continue;
+ }
+
+ switch (InlineAsm::getKind(Flags)) {
+ case InlineAsm::Kind_RegUse:
+ STUses |= (1u << STReg);
+ break;
+ case InlineAsm::Kind_RegDef:
+ case InlineAsm::Kind_RegDefEarlyClobber:
+ STDefs |= (1u << STReg);
+ if (MO.isDead())
+ STDeadDefs |= (1u << STReg);
+ break;
+ case InlineAsm::Kind_Clobber:
+ STClobbers |= (1u << STReg);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (STUses && !isMask_32(STUses))
+ MI.emitError("fixed input regs must be last on the x87 stack");
+ unsigned NumSTUses = countTrailingOnes(STUses);
+
+ // Defs must be contiguous from the stack top. ST0-STn.
+ if (STDefs && !isMask_32(STDefs)) {
+ MI.emitError("output regs must be last on the x87 stack");
+ STDefs = NextPowerOf2(STDefs) - 1;
+ }
+ unsigned NumSTDefs = countTrailingOnes(STDefs);
+
+ // So must the clobbered stack slots. ST0-STm, m >= n.
+ if (STClobbers && !isMask_32(STDefs | STClobbers))
+ MI.emitError("clobbers must be last on the x87 stack");
+
+ // Popped inputs are the ones that are also clobbered or defined.
+ unsigned STPopped = STUses & (STDefs | STClobbers);
+ if (STPopped && !isMask_32(STPopped))
+ MI.emitError("implicitly popped regs must be last on the x87 stack");
+ unsigned NumSTPopped = countTrailingOnes(STPopped);
+
+ DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+ << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+
+#ifndef NDEBUG
+ // If any input operand uses constraint "f", all output register
+ // constraints must be early-clobber defs.
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I)
+ if (FRegIdx.count(I)) {
+ assert((1 << getFPReg(MI.getOperand(I)) & STDefs) == 0 &&
+ "Operands with constraint \"f\" cannot overlap with defs");
+ }
+#endif
+
+ // Collect all FP registers (register operands with constraints "t", "u",
+ // and "f") to kill after the instruction.
+ unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ unsigned FPReg = getFPReg(Op);
+
+ // If we kill this operand, make sure to pop it from the stack after the
+ // asm. We just remember it for now, and pop them all off at the end in
+ // a batch.
+ if (Op.isUse() && Op.isKill())
+ FPKills |= 1U << FPReg;
+ }
+
+ // Do not include registers that are implicitly popped by defs/clobbers.
+ FPKills &= ~(STDefs | STClobbers);
+
+ // Now we can rearrange the live registers to match what was requested:
+ // the identity order 0..NumSTUses-1 on the top NumSTUses stack entries.
+ unsigned char STUsesArray[8];
+
+ for (unsigned I = 0; I < NumSTUses; ++I)
+ STUsesArray[I] = I;
+
+ shuffleStackTop(STUsesArray, NumSTUses, Inst);
+ DEBUG({dbgs() << "Before asm: "; dumpStack();});
+
+ // With the stack layout fixed, rewrite the FP registers.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+
+ unsigned FPReg = getFPReg(Op);
+
+ if (FRegIdx.count(i))
+ // Operand with constraint "f".
+ Op.setReg(getSTReg(FPReg));
+ else
+ // Operand with a single register class constraint ("t" or "u").
+ Op.setReg(X86::ST0 + FPReg);
+ }
+
+ // Simulate the inline asm popping its inputs and pushing its outputs.
+ StackTop -= NumSTPopped;
+
+ for (unsigned i = 0; i < NumSTDefs; ++i)
+ pushReg(NumSTDefs - i - 1);
+
+ // If this asm kills any FP registers (is the last use of them) we must
+ // explicitly emit pop instructions for them. Do this now after the asm has
+ // executed so that the ST(x) numbers are not off (which would happen if we
+ // did this inline with operand rewriting).
+ //
+ // Note: this might be a non-optimal pop sequence. We might be able to do
+ // better by trying to pop in stack order or something.
+ while (FPKills) {
+ unsigned FPReg = countTrailingZeros(FPKills);
+ if (isLive(FPReg))
+ freeStackSlotAfter(Inst, FPReg);
+ FPKills &= ~(1U << FPReg);
+ }
+
+ // Don't delete the inline asm!
+ return;
+ }
+ }
+
+ Inst = MBB->erase(Inst); // Remove the pseudo instruction
+
+ // We want to leave Inst pointing to the previous instruction, but what if we
+ // just erased the first instruction? Then a dummy KILL is inserted instead.
+ if (Inst == MBB->begin()) {
+ DEBUG(dbgs() << "Inserting dummy KILL\n");
+ Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL));
+ } else
+ --Inst;
+}
+
+/// setKillFlags - Walk MBB from its last instruction to its first, tracking
+/// live physical registers starting from the block's live-outs, and mark
+/// operands in the FP0..FP0+7 register range: defs that are not live
+/// afterwards become dead, and uses that are redefined by the same
+/// instruction or not live afterwards become kills.
+void FPS::setKillFlags(MachineBasicBlock &MBB) const {
+  const TargetRegisterInfo *TRI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+  LivePhysRegs LPR(TRI);
+  LPR.addLiveOuts(MBB);
+
+  for (auto RI = MBB.rbegin(), RE = MBB.rend(); RI != RE; ++RI) {
+    MachineInstr &MI = *RI;
+    if (MI.isDebugValue())
+      continue;
+
+    // FP registers defined by MI, indexed by their offset from FP0.
+    std::bitset<8> DefinedHere;
+    // FP use operands of MI, judged once every def has been seen.
+    SmallVector<MachineOperand *, 2> FPUses;
+
+    for (MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+
+      const unsigned FPIdx = MO.getReg() - X86::FP0;
+      if (FPIdx >= 8)
+        continue;
+
+      if (!MO.isDef()) {
+        FPUses.push_back(&MO);
+        continue;
+      }
+
+      DefinedHere.set(FPIdx);
+      if (!LPR.contains(MO.getReg()))
+        MO.setIsDead();
+    }
+
+    for (MachineOperand *Use : FPUses) {
+      if (DefinedHere.test(getFPReg(*Use)) || !LPR.contains(Use->getReg()))
+        Use->setIsKill();
+    }
+
+    // Move the liveness state to the point just before MI.
+    LPR.stepBackward(MI);
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
new file mode 100644
index 000000000000..1deefe1231ca
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -0,0 +1,2998 @@
+//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
+                                   unsigned StackAlignOverride)
+    : TargetFrameLowering(StackGrowsDown, StackAlignOverride,
+                          STI.is64Bit() ? -8 : -4),
+      STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
+  // Snapshot the subtarget and register-info properties that the frame
+  // lowering code consults repeatedly.
+  Is64Bit = STI.is64Bit();
+  IsLP64 = STI.isTarget64BitLP64();
+  // Frame/stack pointers are 64 bits wide on standard x86_64 and on 64-bit
+  // NaCl; x32 uses 32-bit ones.
+  Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+  SlotSize = TRI->getSlotSize();
+  StackPtr = TRI->getStackRegister();
+}
+
+/// Returns true unless \p MF has variable-sized stack objects or its
+/// X86MachineFunctionInfo reports push sequences.
+bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  if (MF.getFrameInfo().hasVarSizedObjects())
+    return false;
+  return !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// canSimplifyCallFramePseudos - The call frame pseudos can be simplified
+/// when there is a reserved call frame. Merely having a frame pointer (the
+/// default implementation's test) is not enough here because it cannot
+/// always be used, so the check is: reserved call frame, or a frame pointer
+/// without stack realignment, or a base pointer.
+bool
+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  if (hasReservedCallFrame(MF))
+    return true;
+  if (hasFP(MF) && !TRI->needsStackRealignment(MF))
+    return true;
+  return TRI->hasBasePointer(MF);
+}
+
+// needsFrameIndexResolution - Decide whether frame index resolution must run
+// for this function. Ordinarily that is only needed when the function has
+// stack objects. FI resolution has a second, less obvious duty though: it
+// resolves the callframesetup/destroy pseudos that were not simplified
+// earlier. An x86 function with push sequences therefore needs it even when
+// it has no stack objects at all.
+bool
+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+  if (MF.getFrameInfo().hasStackObjects())
+    return true;
+  return MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register. That is the case when frame pointer elimination is
+/// disabled, when the function has variable sized allocas, or when any of the
+/// other conditions tested below holds.
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Target options and stack realignment.
+  if (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+      TRI->needsStackRealignment(MF))
+    return true;
+
+  // Properties recorded in the frame info.
+  if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+      MFI.hasOpaqueSPAdjustment())
+    return true;
+
+  // Explicit request recorded in the X86 function info.
+  if (MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer())
+    return true;
+
+  // Unwinding / exception handling.
+  if (MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn())
+    return true;
+
+  return MFI.hasStackMap() || MFI.hasPatchPoint() ||
+         MFI.hasCopyImplyingStackAdjustment();
+}
+
+/// Select the register-immediate SUB opcode: the 64-bit forms when \p IsLP64
+/// is nonzero, the 32-bit forms otherwise, and in either case the 8-bit
+/// immediate form when \p Imm fits in a signed 8-bit value.
+static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
+  const bool FitsInImm8 = isInt<8>(Imm);
+  if (IsLP64)
+    return FitsInImm8 ? X86::SUB64ri8 : X86::SUB64ri32;
+  return FitsInImm8 ? X86::SUB32ri8 : X86::SUB32ri;
+}
+
+/// Select the register-immediate ADD opcode: the 64-bit forms when \p IsLP64
+/// is nonzero, the 32-bit forms otherwise, and in either case the 8-bit
+/// immediate form when \p Imm fits in a signed 8-bit value.
+static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
+  const bool FitsInImm8 = isInt<8>(Imm);
+  if (IsLP64)
+    return FitsInImm8 ? X86::ADD64ri8 : X86::ADD64ri32;
+  return FitsInImm8 ? X86::ADD32ri8 : X86::ADD32ri;
+}
+
+/// Select the register-register SUB opcode: 64-bit when \p isLP64 is
+/// nonzero, 32-bit otherwise.
+static unsigned getSUBrrOpcode(unsigned isLP64) {
+  if (isLP64)
+    return X86::SUB64rr;
+  return X86::SUB32rr;
+}
+
+/// Select the register-register ADD opcode: 64-bit when \p isLP64 is
+/// nonzero, 32-bit otherwise.
+static unsigned getADDrrOpcode(unsigned isLP64) {
+  if (isLP64)
+    return X86::ADD64rr;
+  return X86::ADD32rr;
+}
+
+/// Select the register-immediate AND opcode: the 64-bit forms when \p IsLP64
+/// is set, the 32-bit forms otherwise, and in either case the 8-bit
+/// immediate form when \p Imm fits in a signed 8-bit value.
+static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
+  const bool FitsInImm8 = isInt<8>(Imm);
+  if (IsLP64)
+    return FitsInImm8 ? X86::AND64ri8 : X86::AND64ri32;
+  return FitsInImm8 ? X86::AND32ri8 : X86::AND32ri;
+}
+
+/// Select the LEA opcode: LEA64r when \p IsLP64 is nonzero, LEA32r otherwise.
+static unsigned getLEArOpcode(unsigned IsLP64) {
+  if (IsLP64)
+    return X86::LEA64r;
+  return X86::LEA32r;
+}
+
+/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
+/// when it reaches the "return" instruction, so that a stack object can be
+/// popped into it without clobbering anything.
+///
+/// Candidates come from the register class returned by getGPRsForTailCall.
+/// Returns 0 when the function has no IR Function or calls eh.return, when
+/// \p MBBI is the block end or is not one of the return / tail-call opcodes
+/// listed below, or when every candidate is read by that instruction.
+static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator &MBBI,
+                                       const X86RegisterInfo *TRI,
+                                       bool Is64Bit) {
+  const MachineFunction *MF = MBB.getParent();
+  if (!MF->getFunction() || MF->callsEHReturn())
+    return 0;
+
+  const TargetRegisterClass &Candidates = *TRI->getGPRsForTailCall(*MF);
+
+  if (MBBI == MBB.end())
+    return 0;
+
+  // Only return-like instructions are handled.
+  switch (MBBI->getOpcode()) {
+  case TargetOpcode::PATCHABLE_RET:
+  case X86::RET:
+  case X86::RETL:
+  case X86::RETQ:
+  case X86::RETIL:
+  case X86::RETIQ:
+  case X86::TCRETURNdi:
+  case X86::TCRETURNri:
+  case X86::TCRETURNmi:
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64:
+    break;
+  default:
+    return 0;
+  }
+
+  // Gather every register the instruction reads, together with the registers
+  // produced by the alias iterator for each of them.
+  SmallSet<uint16_t, 8> ReadRegs;
+  for (const MachineOperand &MO : MBBI->operands()) {
+    if (!MO.isReg() || MO.isDef() || !MO.getReg())
+      continue;
+    for (MCRegAliasIterator Alias(MO.getReg(), TRI, true); Alias.isValid();
+         ++Alias)
+      ReadRegs.insert(*Alias);
+  }
+
+  // The first candidate that is neither RIP nor read by the instruction wins.
+  for (auto Candidate : Candidates)
+    if (Candidate != X86::RIP && !ReadRegs.count(Candidate))
+      return Candidate;
+
+  return 0;
+}
+
+/// Returns true if one of RAX, EAX, AX, AH or AL appears among the live-in
+/// registers of \p MBB.
+static bool isEAXLiveIn(MachineBasicBlock &MBB) {
+  for (const MachineBasicBlock::RegisterMaskPair &LiveIn : MBB.liveins()) {
+    switch (LiveIn.PhysReg) {
+    case X86::RAX:
+    case X86::EAX:
+    case X86::AX:
+    case X86::AH:
+    case X86::AL:
+      return true;
+    default:
+      break;
+    }
+  }
+
+  return false;
+}
+
+/// Check if the flags need to be preserved before the terminators.
+/// That is the case when EFLAGS is live-in to the region formed by the
+/// terminators (some terminator reads it before any terminator defines it),
+/// or when no terminator touches EFLAGS and it is live-in to a successor.
+static bool
+flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
+  for (const MachineInstr &Term : MBB.terminators()) {
+    bool DefinesFlags = false;
+    for (const MachineOperand &Op : Term.operands()) {
+      if (!Op.isReg() || Op.getReg() != X86::EFLAGS)
+        continue;
+
+      // A read of EFLAGS that no earlier terminator defined: the value comes
+      // from before the terminators and must survive.
+      if (!Op.isDef())
+        return true;
+
+      // This terminator defines EFLAGS. Keep scanning its remaining operands
+      // in case it also reads the incoming value.
+      DefinesFlags = true;
+    }
+
+    // EFLAGS is redefined here without being read; nothing to preserve.
+    if (DefinesFlags)
+      return false;
+  }
+
+  // No terminator uses or defines EFLAGS. They still need preserving if a
+  // successor expects them live-in.
+  for (const MachineBasicBlock *Succ : MBB.successors())
+    if (Succ->isLiveIn(X86::EFLAGS))
+      return true;
+
+  return false;
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+///
+/// A negative \p NumBytes decrements the stack pointer; instructions emitted
+/// on the push/pop and immediate paths are then flagged FrameSetup, otherwise
+/// FrameDestroy. \p InEpilogue is forwarded to BuildStackAdjustment.
+void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    int64_t NumBytes, bool InEpilogue) const {
+  const bool IsDecrement = NumBytes < 0;
+  uint64_t Remaining = IsDecrement ? -NumBytes : NumBytes;
+
+  // Largest amount handled by a single immediate adjustment.
+  const uint64_t MaxChunk = (1LL << 31) - 1;
+  const DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  const unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
+  const auto FrameFlag =
+      IsDecrement ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
+
+  while (Remaining) {
+    if (Remaining > MaxChunk) {
+      // Rather than emit a long series of instructions for large offsets,
+      // load the offset into a register and do one sub/add.
+      const unsigned Scratch =
+          (IsDecrement && !isEAXLiveIn(MBB))
+              ? AX
+              : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+
+      if (Scratch) {
+        BuildMI(MBB, MBBI, DL,
+                TII.get(Is64Bit ? X86::MOV64ri : X86::MOV32ri), Scratch)
+            .addImm(Remaining);
+        const unsigned ArithOpc = IsDecrement ? getSUBrrOpcode(Is64Bit)
+                                              : getADDrrOpcode(Is64Bit);
+        MachineInstr *Adjust =
+            BuildMI(MBB, MBBI, DL, TII.get(ArithOpc), StackPtr)
+                .addReg(StackPtr)
+                .addReg(Scratch);
+        Adjust->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+        // The whole remaining amount has been applied.
+        return;
+      }
+    }
+
+    const uint64_t ThisVal = std::min(Remaining, MaxChunk);
+
+    if (ThisVal == (Is64Bit ? 8 : 4)) {
+      // Use push / pop instead.
+      const unsigned Reg =
+          IsDecrement ? AX : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+      if (Reg) {
+        unsigned Opc;
+        if (IsDecrement)
+          Opc = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
+        else
+          Opc = Is64Bit ? X86::POP64r : X86::POP32r;
+        // A push reads an undefined value; a pop defines the register.
+        MachineInstr *PushPop =
+            BuildMI(MBB, MBBI, DL, TII.get(Opc))
+                .addReg(Reg, getDefRegState(!IsDecrement) |
+                                 getUndefRegState(IsDecrement));
+        PushPop->setFlag(FrameFlag);
+        Remaining -= ThisVal;
+        continue;
+      }
+    }
+
+    // General case: adjust by an immediate (ADD/SUB or LEA).
+    MachineInstrBuilder Adjust = BuildStackAdjustment(
+        MBB, MBBI, DL, IsDecrement ? -ThisVal : ThisVal, InEpilogue);
+    Adjust.setMIFlag(FrameFlag);
+
+    Remaining -= ThisVal;
+  }
+}
+
+/// Emit, before \p MBBI, a single instruction that adds the non-zero
+/// \p Offset to the stack pointer, and return its builder. An LEA is used
+/// when EFLAGS must not be clobbered (or when the subtarget prefers LEA);
+/// otherwise an ADD/SUB with immediate is used and its EFLAGS def is marked
+/// dead.
+MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {
+  assert(Offset != 0 && "zero offset stack adjustment requested");
+
+  // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
+  // is tricky.
+  bool UseLEA;
+  if (InEpilogue) {
+    // If we can use LEA for SP but we shouldn't, check that none
+    // of the terminators uses the eflags. Otherwise we will insert
+    // an ADD that will redefine the eflags and break the condition.
+    // Alternatively, we could move the ADD, but this may not be possible
+    // and is an optimization anyway.
+    UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
+    if (UseLEA && !STI.useLeaForSP())
+      UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
+    // If that assert breaks, that means we do not do the right thing
+    // in canUseAsEpilogue.
+    assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
+           "We shouldn't have allowed this insertion point");
+  } else {
+    // Check if inserting the prologue at the beginning of MBB would require
+    // LEA operations. That is the case if EFLAGS is live in, because it means
+    // an instruction will read it before it gets defined.
+    UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);
+  }
+
+  if (UseLEA)
+    return addRegOffset(BuildMI(MBB, MBBI, DL,
+                                TII.get(getLEArOpcode(Uses64BitFramePtr)),
+                                StackPtr),
+                        StackPtr, false, Offset);
+
+  // ADD/SUB take the magnitude as their immediate.
+  const bool IsSub = Offset < 0;
+  const uint64_t Magnitude = IsSub ? -Offset : Offset;
+  const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, Magnitude)
+                             : getADDriOpcode(Uses64BitFramePtr, Magnitude);
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+                                .addReg(StackPtr)
+                                .addImm(Magnitude);
+  MIB->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+  return MIB;
+}
+
+/// Look at the instruction before \p MBBI (when \p doMergeWithPrevious) or at
+/// \p MBBI itself (otherwise). If it is an ADD/SUB-immediate or a matching LEA
+/// that adjusts the stack pointer, erase it and return the signed amount it
+/// added to the stack pointer; otherwise return 0 and leave the block alone.
+/// When merging forward, \p MBBI is advanced past the erased instruction.
+int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator &MBBI,
+                                     bool doMergeWithPrevious) const {
+  // Bail out if the instruction we would inspect does not exist.
+  if (doMergeWithPrevious ? MBBI == MBB.begin() : MBBI == MBB.end())
+    return 0;
+
+  MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
+  MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
+                                                       : std::next(MBBI);
+
+  // Don't merge with the next instruction if it has CFI.
+  if (!doMergeWithPrevious && NI != MBB.end() &&
+      NI->getOpcode() == TargetOpcode::CFI_INSTRUCTION)
+    return 0;
+
+  int Offset = 0;
+  bool Merged = false;
+
+  switch (PI->getOpcode()) {
+  case X86::ADD64ri32:
+  case X86::ADD64ri8:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+    if (PI->getOperand(0).getReg() == StackPtr) {
+      assert(PI->getOperand(1).getReg() == StackPtr);
+      Offset += PI->getOperand(2).getImm();
+      Merged = true;
+    }
+    break;
+
+  case X86::LEA32r:
+  case X86::LEA64_32r:
+    // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
+    if (PI->getOperand(0).getReg() == StackPtr &&
+        PI->getOperand(1).getReg() == StackPtr &&
+        PI->getOperand(2).getImm() == 1 &&
+        PI->getOperand(3).getReg() == X86::NoRegister &&
+        PI->getOperand(5).getReg() == X86::NoRegister) {
+      Offset += PI->getOperand(4).getImm();
+      Merged = true;
+    }
+    break;
+
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+    if (PI->getOperand(0).getReg() == StackPtr) {
+      assert(PI->getOperand(1).getReg() == StackPtr);
+      Offset -= PI->getOperand(2).getImm();
+      Merged = true;
+    }
+    break;
+
+  default:
+    break;
+  }
+
+  if (Merged) {
+    MBB.erase(PI);
+    // PI was MBBI itself when merging forward; step to its successor.
+    if (!doMergeWithPrevious)
+      MBBI = NI;
+  }
+
+  return Offset;
+}
+
+/// Register \p CFIInst with the function that owns \p MBB and insert a
+/// CFI_INSTRUCTION referring to it before \p MBBI.
+void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                const DebugLoc &DL,
+                                const MCCFIInstruction &CFIInst) const {
+  const unsigned Index = MBB.getParent()->addFrameInst(CFIInst);
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::CFI_INSTRUCTION);
+  BuildMI(MBB, MBBI, DL, Desc).addCFIIndex(Index);
+}
+
+/// Emit, before \p MBBI, one CFI offset directive per callee-saved register
+/// recorded in the frame info, using each register's spill-slot offset.
+void X86FrameLowering::emitCalleeSavedFrameMoves(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const MCRegisterInfo *MRI = MF.getMMI().getContext().getRegisterInfo();
+
+  // Nothing is emitted when the list of callee-saved registers is empty.
+  for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) {
+    const int64_t Offset = MFI.getObjectOffset(Info.getFrameIdx());
+    const unsigned DwarfReg = MRI->getDwarfRegNum(Info.getReg(), true);
+    BuildCFI(MBB, MBBI, DL,
+             MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+  }
+}
+
+/// Emit a stack probe before \p MBBI. CoreCLR targets get the inline
+/// expansion (a stub call placeholder when in the prolog); every other
+/// target gets a call to the probe routine.
+void X86FrameLowering::emitStackProbe(MachineFunction &MF,
+                                      MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI,
+                                      const DebugLoc &DL, bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+
+  if (!STI.isTargetWindowsCoreCLR()) {
+    emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
+    return;
+  }
+
+  if (InProlog)
+    emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+  else
+    emitStackProbeInline(MF, MBB, MBBI, DL, false);
+}
+
+/// Replace the first call to "__chkstk_stub" found in \p PrologMBB with the
+/// inline stack probe expansion. Does nothing when no such call exists.
+void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                        MachineBasicBlock &PrologMBB) const {
+  const StringRef ChkStkStubSymbol = "__chkstk_stub";
+
+  // Locate the stub call, if any.
+  MachineInstr *StubCall = nullptr;
+  for (MachineInstr &MI : PrologMBB) {
+    if (!MI.isCall() || !MI.getOperand(0).isSymbol())
+      continue;
+    if (ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
+      StubCall = &MI;
+      break;
+    }
+  }
+
+  if (StubCall == nullptr)
+    return;
+
+  assert(!StubCall->isBundled() &&
+         "Not expecting bundled instructions here");
+
+  // Expand right after the stub call, then drop the stub itself.
+  MachineBasicBlock::iterator MBBI = std::next(StubCall->getIterator());
+  assert(std::prev(MBBI) == StubCall &&
+         "MBBI expected after __chkstk_stub.");
+  DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
+  emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
+  StubCall->eraseFromParent();
+}
+
+void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const { // Expands the CoreCLR page-probing loop inline at MBBI, splitting MBB.
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ assert(STI.is64Bit() && "different expansion needed for 32 bit");
+ assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+ // RAX contains the number of bytes of desired stack adjustment.
+ // The handling here assumes this value has already been updated so as to
+ // maintain stack alignment.
+ //
+ // We need to exit with RSP modified by this amount and execute suitable
+ // page touches to notify the OS that we're growing the stack responsibly.
+ // All stack probing must be done without modifying RSP.
+ //
+ // MBB:
+ // SizeReg = RAX;
+ // ZeroReg = 0
+ // CopyReg = RSP
+ // Flags, TestReg = CopyReg - SizeReg
+ // FinalReg = TestReg, or ZeroReg if the subtraction borrowed (CMOVB)
+ // LimitReg = gs magic thread env access
+ // if FinalReg >= LimitReg goto ContinueMBB
+ // RoundMBB:
+ // RoundedReg = FinalReg & PageMask (page address of FinalReg)
+ // LoopMBB:
+ // JoinReg = PHI(LimitReg, ProbeReg)
+ // ProbeReg = JoinReg - PageSize
+ // [ProbeReg] = 0
+ // if (ProbeReg != RoundedReg) goto LoopMBB
+ // ContinueMBB:
+ // RSP = RSP - SizeReg
+ // [rest of original MBB]
+
+ // Set up the new basic blocks
+ MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
+ MF.insert(MBBIter, RoundMBB);
+ MF.insert(MBBIter, LoopMBB);
+ MF.insert(MBBIter, ContinueMBB);
+
+ // Split MBB and move the tail portion down to ContinueMBB.
+ MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI); // Remembered so the instructions appended to MBB below can be flagged at the end.
+ ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+ ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ // Some useful constants
+ const int64_t ThreadEnvironmentStackLimit = 0x10;
+ const int64_t PageSize = 0x1000;
+ const int64_t PageMask = ~(PageSize - 1);
+
+ // Registers we need. For the normal case we use virtual
+ // registers. For the prolog expansion we use RAX, RCX and RDX
+ // (several logical registers then share one physical register).
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+ const unsigned SizeReg = InProlog ? (unsigned)X86::RAX
+ : MRI.createVirtualRegister(RegClass),
+ ZeroReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ CopyReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ TestReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ FinalReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ RoundedReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ LimitReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ JoinReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ ProbeReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass);
+
+ // SP-relative offsets where we can save RCX and RDX.
+ int64_t RCXShadowSlot = 0;
+ int64_t RDXShadowSlot = 0;
+
+ // If inlining in the prolog, save RCX and RDX.
+ // Future optimization: don't save or restore if not live in.
+ if (InProlog) {
+ // Compute the offsets. We need to account for things already
+ // pushed onto the stack at this point: return address, frame
+ // pointer (if used), and callee saves.
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+ const bool HasFP = hasFP(MF);
+ RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); // Leading 8 skips the return address.
+ RDXShadowSlot = RCXShadowSlot + 8;
+ // Emit the saves.
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RCXShadowSlot)
+ .addReg(X86::RCX);
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RDXShadowSlot)
+ .addReg(X86::RDX);
+ } else {
+ // Not in the prolog. Copy RAX to a virtual reg.
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+ }
+
+ // Add code to MBB to check for overflow and set the new target stack pointer
+ // to zero if so.
+ BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+ .addReg(ZeroReg, RegState::Undef)
+ .addReg(ZeroReg, RegState::Undef);
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+ BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+ .addReg(CopyReg)
+ .addReg(SizeReg);
+ BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+ .addReg(TestReg)
+ .addReg(ZeroReg);
+
+ // FinalReg now holds final stack pointer value, or zero if
+ // allocation would overflow. Compare against the current stack
+ // limit from the thread environment block. Note this limit is the
+ // lowest touched page on the stack, not the point at which the OS
+ // will cause an overflow exception, so this is just an optimization
+ // to avoid unnecessarily touching pages that are below the current
+ // SP but already committed to the stack by the OS.
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
+ .addReg(0)
+ .addImm(1)
+ .addReg(0)
+ .addImm(ThreadEnvironmentStackLimit)
+ .addReg(X86::GS);
+ BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
+ // Jump if the desired stack pointer is at or above the stack limit.
+ BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);
+
+ // Add code to RoundMBB to round the final stack pointer to a page boundary.
+ BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
+ .addReg(FinalReg)
+ .addImm(PageMask);
+ BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
+
+ // LimitReg now holds the current stack limit, RoundedReg page-rounded
+ // final RSP value. Add code to LoopMBB to decrement LimitReg page-by-page
+ // and probe until we reach RoundedReg.
+ if (!InProlog) { // In the prolog JoinReg, LimitReg and ProbeReg are all RCX, so no PHI is emitted.
+ BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
+ .addReg(LimitReg)
+ .addMBB(RoundMBB)
+ .addReg(ProbeReg)
+ .addMBB(LoopMBB);
+ }
+
+ addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
+ false, -PageSize);
+
+ // Probe by storing a byte onto the stack.
+ BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
+ .addReg(ProbeReg)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0)
+ .addImm(0);
+ BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
+ .addReg(RoundedReg)
+ .addReg(ProbeReg);
+ BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); // Keep probing until ProbeReg equals RoundedReg.
+
+ MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+ // If in prolog, restore RDX and RCX.
+ if (InProlog) {
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+ X86::RCX),
+ X86::RSP, false, RCXShadowSlot);
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+ X86::RDX),
+ X86::RSP, false, RDXShadowSlot);
+ }
+
+ // Now that the probing is done, add code to ContinueMBB to update
+ // the stack pointer for real.
+ BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(SizeReg);
+
+ // Add the control flow edges we need.
+ MBB.addSuccessor(ContinueMBB);
+ MBB.addSuccessor(RoundMBB);
+ RoundMBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(ContinueMBB);
+ LoopMBB->addSuccessor(LoopMBB);
+
+ // Mark all the instructions added to the prolog as frame setup.
+ if (InProlog) {
+ for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+ BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *RoundMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *LoopMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+ CMBBI != ContinueMBBI; ++CMBBI) { // Only the instructions inserted above, i.e. those before ContinueMBBI.
+ CMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ // Possible TODO: physreg liveness for InProlog case.
+}
+
+/// Emit, before \p MBBI, a call to the stack probe routine for this target
+/// and, on 64-bit targets, the SUB that actually moves RSP by RAX afterwards.
+/// When \p InProlog is set, every instruction inserted here is flagged
+/// FrameSetup.
+void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          const DebugLoc &DL,
+                                          bool InProlog) const {
+  const bool IsLargeCodeModel =
+      MF.getTarget().getCodeModel() == CodeModel::Large;
+  // For the large code model, 64-bit targets have to call through a register.
+  const bool CallViaReg = Is64Bit && IsLargeCodeModel;
+
+  // Name of the probe routine.
+  const bool IsCygMing = STI.isTargetCygMing();
+  const char *Symbol;
+  if (Is64Bit)
+    Symbol = IsCygMing ? "___chkstk_ms" : "__chkstk";
+  else
+    Symbol = IsCygMing ? "_alloca" : "_chkstk";
+
+  // Remember the instruction preceding the expansion so that the newly
+  // inserted instructions can be found again below.
+  MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
+
+  // All current stack probes take AX and SP as input, clobber flags, and
+  // preserve all registers. x86_64 probes leave RSP unmodified.
+  MachineInstrBuilder CI;
+  if (CallViaReg) {
+    // Use R11, as it is scratch in all supported calling conventions.
+    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
+        .addExternalSymbol(Symbol);
+    CI = BuildMI(MBB, MBBI, DL, TII.get(X86::CALL64r)).addReg(X86::R11);
+  } else {
+    const unsigned DirectCallOp =
+        Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+    CI = BuildMI(MBB, MBBI, DL, TII.get(DirectCallOp))
+             .addExternalSymbol(Symbol);
+  }
+
+  const unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
+  const unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
+  const unsigned ImplicitDef = RegState::Define | RegState::Implicit;
+  CI.addReg(AX, RegState::Implicit)
+      .addReg(SP, RegState::Implicit)
+      .addReg(AX, ImplicitDef)
+      .addReg(SP, ImplicitDef)
+      .addReg(X86::EFLAGS, ImplicitDef);
+
+  if (Is64Bit) {
+    // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+    // themselves. They also do not clobber %rax, so it can be reused when
+    // adjusting %rsp.
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+        .addReg(X86::RSP)
+        .addReg(X86::RAX);
+  }
+
+  if (!InProlog)
+    return;
+
+  // Apply the frame setup flag to all inserted instrs.
+  for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
+    ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
+}
+
+/// Insert, before \p MBBI, a call to the placeholder symbol "__chkstk_stub".
+/// Only valid in the prolog (\p InProlog must be true).
+void X86FrameLowering::emitStackProbeInlineStub(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+    bool InProlog) const {
+  assert(InProlog && "ChkStkStub called outside prolog!");
+
+  const MCInstrDesc &CallDesc = TII.get(X86::CALLpcrel32);
+  BuildMI(MBB, MBBI, DL, CallDesc).addExternalSymbol("__chkstk_stub");
+}
+
+/// Compute the frame offset used with the Win64 UWOP_SET_FPREG unwind code:
+/// \p SPAdjust capped at 128 and rounded down to a multiple of 16.
+static unsigned calculateSetFPREG(uint64_t SPAdjust) {
+  // Win64 ABI has a less restrictive limitation of 240; 128 works equally well
+  // and might require smaller successive adjustments.
+  const uint64_t Win64MaxSEHOffset = 128;
+  const uint64_t Capped =
+      SPAdjust < Win64MaxSEHOffset ? SPAdjust : Win64MaxSEHOffset;
+  // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode, so
+  // clear the low four bits.
+  return Capped & ~static_cast<uint64_t>(15);
+}
+
+// Return the stack alignment to aim for in this function. Normally that is
+// just the frame info's maximum alignment. When a stack realignment is being
+// forced ("stackrealign"), the frame info alone is not enough: if the
+// function makes calls the ABI stack alignment is a lower bound, and
+// otherwise the minimum SlotSize is.
+uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const uint64_t FrameAlign = MFI.getMaxAlignment(); // Desired stack alignment.
+  const unsigned StackAlign = getStackAlignment();
+
+  if (!MF.getFunction()->hasFnAttribute("stackrealign"))
+    return FrameAlign;
+
+  if (MFI.hasCalls())
+    return std::max<uint64_t>(FrameAlign, StackAlign);
+  return std::max<uint64_t>(FrameAlign, SlotSize);
+}
+
+/// Emit, before \p MBBI, an AND that clears the low bits of \p Reg with the
+/// mask -\p MaxAlign. The instruction is flagged FrameSetup and its EFLAGS
+/// def is marked dead.
+void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          const DebugLoc &DL, unsigned Reg,
+                                          uint64_t MaxAlign) const {
+  const uint64_t Mask = -MaxAlign;
+  const MCInstrDesc &AndDesc = TII.get(getANDriOpcode(Uses64BitFramePtr, Mask));
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, AndDesc, Reg)
+                                .addReg(Reg)
+                                .addImm(Mask)
+                                .setMIFlag(MachineInstr::FrameSetup);
+
+  // The EFLAGS implicit def is dead.
+  MIB->getOperand(3).setIsDead();
+}
+
+/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
+/// space for local variables. Also emit labels used by the exception handler to
+/// generate the exception handling frames.
+
+/*
+ Here's a gist of what gets emitted:
+
+ ; Establish frame pointer, if needed
+ [if needs FP]
+ push %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ .seh_pushreg %rbp
+ mov %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+
+ ; Spill general-purpose registers
+ [for all callee-saved GPRs]
+ pushq %<reg>
+ [if not needs FP]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ .seh_pushreg %<reg>
+
+ ; If the required stack alignment > default stack alignment
+ ; rsp needs to be re-aligned. This creates a "re-alignment gap"
+ ; of unknown size in the stack frame.
+ [if stack needs re-alignment]
+ and $MASK, %rsp
+
+ ; Allocate space for locals
+ [if target is Windows and allocated space > 4096 bytes]
+ ; Windows needs special care for allocations larger
+ ; than one page.
+ mov $NNN, %rax
+ call ___chkstk_ms/__chkstk
+ sub %rax, %rsp
+ [else]
+ sub $NNN, %rsp
+
+ [if needs FP]
+ .seh_stackalloc (size of XMM spill slots)
+ .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
+ [else]
+ .seh_stackalloc NNN
+
+ ; Spill XMMs
+ ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
+ ; they may get spilled on any platform, if the current function
+ ; calls @llvm.eh.unwind.init
+ [if needs FP]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, -MMM(%rbp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
+ ; i.e. the offset relative to (%rbp - SEHFrameOffset)
+ [else]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, KKK(%rsp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, KKK
+
+ .seh_endprologue
+
+ [if needs base pointer]
+ mov %rsp, %rbx
+ [if needs to restore base pointer]
+ mov %rsp, -MMM(%rbp)
+
+ ; Emit CFI info
+ [if needs FP]
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rbp)
+ [else]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rsp)
+
+ Notes:
+ - .seh directives are emitted only for Windows 64 ABI
+ - .cfi directives are emitted for all other ABIs
+ - for 32-bit code, substitute %e?? registers for %r??
+*/
+
+void X86FrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
+ "MF used frame lowering for wrong subtarget");
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const Function *Fn = MF.getFunction();
+ MachineModuleInfo &MMI = MF.getMMI();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
+ uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
+ bool IsFunclet = MBB.isEHFuncletEntry();
+ EHPersonality Personality = EHPersonality::Unknown;
+ if (Fn->hasPersonalityFn())
+ Personality = classifyEHPersonality(Fn->getPersonalityFn());
+ bool FnHasClrFunclet =
+ MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
+ bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
+ bool HasFP = hasFP(MF);
+ bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv());
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool NeedsWinCFI = IsWin64Prologue && Fn->needsUnwindTableEntry();
+ bool NeedsDwarfCFI =
+ !IsWin64Prologue && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+ unsigned FramePtr = TRI->getFrameRegister(MF);
+ const unsigned MachineFramePtr =
+ STI.isTarget64BitILP32()
+ ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
+ unsigned BasePtr = TRI->getBaseRegister();
+ bool HasWinCFI = false; // Set when any SEH_* pseudo is emitted below.
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // Add RETADDR move area to callee saved frame size.
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta && IsWin64Prologue)
+ report_fatal_error("Can't handle guaranteed tail call under win64 yet");
+
+ if (TailCallReturnAddrDelta < 0)
+ X86FI->setCalleeSavedFrameSize(
+ X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
+
+ bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+
+ // The default stack probe size is 4096 if the function has no stackprobesize
+ // attribute.
+ unsigned StackProbeSize = 4096;
+ if (Fn->hasFnAttribute("stack-probe-size"))
+ Fn->getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+
+ // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
+ // function, and use up to 128 bytes of stack space, don't have a frame
+ // pointer, calls, or dynamic alloca then we do not need to adjust the
+ // stack pointer (we fit in the Red Zone). We also check that we don't
+ // push and pop from the stack.
+ if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) &&
+ !TRI->needsStackRealignment(MF) &&
+ !MFI.hasVarSizedObjects() && // No dynamic alloca.
+ !MFI.adjustsStack() && // No calls.
+ !IsWin64CC && // Win64 has no Red Zone
+ !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
+ !MF.shouldSplitStack()) { // Regular stack
+ uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+ if (HasFP) MinSize += SlotSize;
+ X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
+ StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
+ MFI.setStackSize(StackSize);
+ }
+
+ // Insert stack pointer adjustment for later moving of return addr. Only
+ // applies to tail call optimized functions where the callee argument stack
+ // size is bigger than the callers.
+ if (TailCallReturnAddrDelta < 0) {
+ BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
+ /*InEpilogue=*/false)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Mapping for machine moves:
+ //
+ // DST: VirtualFP AND
+ // SRC: VirtualFP => DW_CFA_def_cfa_offset
+ // ELSE => DW_CFA_def_cfa
+ //
+ // SRC: VirtualFP AND
+ // DST: Register => DW_CFA_def_cfa_register
+ //
+ // ELSE
+ // OFFSET < 0 => DW_CFA_offset_extended_sf
+ // REG < 64 => DW_CFA_offset + Reg
+ // ELSE => DW_CFA_offset_extended
+
+ uint64_t NumBytes = 0;
+ int stackGrowth = -SlotSize; // Signed per-slot delta used for CFI offsets.
+
+ // Find the funclet establisher parameter
+ unsigned Establisher = X86::NoRegister;
+ if (IsClrFunclet)
+ Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
+ else if (IsFunclet)
+ Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;
+
+ if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
+ // Immediately spill establisher into the home slot.
+ // The runtime cares about this.
+ // MOV64mr %rdx, 16(%rsp)
+ unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)
+ .addReg(Establisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MBB.addLiveIn(Establisher);
+ }
+
+ if (HasFP) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ // If required, include space for extra hidden slot for stashing base pointer.
+ if (X86FI->getRestoreBasePointer())
+ FrameSize += SlotSize;
+
+ NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+
+ // Callee-saved registers are pushed on stack before the stack is realigned.
+ if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+ NumBytes = alignTo(NumBytes, MaxAlign);
+
+ // Get the offset of the stack slot for the EBP register, which is
+ // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+ // Update the frame offset adjustment.
+ if (!IsFunclet)
+ MFI.setOffsetAdjustment(-NumBytes);
+ else
+ assert(MFI.getOffsetAdjustment() == -(int)NumBytes &&
+ "should calculate same local variable offset for funclets");
+
+ // Save EBP/RBP into the appropriate stack slot.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(MachineFramePtr, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsDwarfCFI) {
+ // Mark the place where EBP/RBP was saved.
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
+
+ // Change the rule for the FramePtr to be an "offset" rule.
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(
+ nullptr, DwarfFramePtr, 2 * stackGrowth));
+ }
+
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+ .addImm(FramePtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (!IsWin64Prologue && !IsFunclet) {
+ // Update EBP with the new base value.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
+ FramePtr)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsDwarfCFI) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ // Define the current CFA to use the EBP/RBP register.
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(
+ nullptr, DwarfFramePtr));
+ }
+ }
+
+ // Mark the FramePtr as live-in in every block. Don't do this again for
+ // funclet prologues.
+ if (!IsFunclet) {
+ for (MachineBasicBlock &EveryMBB : MF)
+ EveryMBB.addLiveIn(MachineFramePtr);
+ }
+ } else {
+ assert(!IsFunclet && "funclets without FPs not yet implemented");
+ NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+ }
+
+ // For EH funclets, only allocate enough space for outgoing calls. Save the
+ // NumBytes value that we would've used for the parent frame.
+ unsigned ParentFrameNumBytes = NumBytes;
+ if (IsFunclet)
+ NumBytes = getWinEHFuncletFrameSize(MF);
+
+ // Skip the callee-saved push instructions.
+ bool PushedRegs = false; // Set if the loop below steps over any push.
+ int StackOffset = 2 * stackGrowth; // CFA offset, advanced per push (no FP).
+
+ while (MBBI != MBB.end() &&
+ MBBI->getFlag(MachineInstr::FrameSetup) &&
+ (MBBI->getOpcode() == X86::PUSH32r ||
+ MBBI->getOpcode() == X86::PUSH64r)) {
+ PushedRegs = true;
+ unsigned Reg = MBBI->getOperand(0).getReg();
+ ++MBBI;
+
+ if (!HasFP && NeedsDwarfCFI) {
+ // Mark callee-saved push instruction.
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
+ StackOffset += stackGrowth;
+ }
+
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
+ MachineInstr::FrameSetup);
+ }
+ }
+
+ // Realign stack after we pushed callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+ // Don't do this for Win64, it needs to realign the stack after the prologue.
+ if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
+ }
+
+ // If there is an SUB32ri of ESP immediately before this instruction, merge
+ // the two. This can be the case when tail call elimination is enabled and
+ // the callee has more arguments than the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+
+ // Adjust stack pointer: ESP -= numbytes.
+
+ // Windows and cygwin/mingw require a prologue helper routine when allocating
+ // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw
+ // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the
+ // stack and adjust the stack pointer in one go. The 64-bit version of
+ // __chkstk is only responsible for probing the stack. The 64-bit prologue is
+ // responsible for adjusting the stack pointer. Touching the stack at 4K
+ // increments is necessary to ensure that the guard pages used by the OS
+ // virtual memory manager are allocated in correct sequence.
+ uint64_t AlignedNumBytes = NumBytes; // Only feeds the probe-size check.
+ if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
+ AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
+ if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
+ // Check whether EAX is livein for this block.
+ bool isEAXAlive = isEAXLiveIn(MBB);
+
+ if (isEAXAlive) {
+ // EAX is only expected to be live-in on 32-bit targets; the push/restore
+ // sequence below is 32-bit only, so assert we are not compiling for x86-64.
+ assert(!Is64Bit && "EAX is livein in x64 case!");
+
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (Is64Bit) {
+ // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+ // Function prologue is responsible for adjusting the stack pointer.
+ if (isUInt<32>(NumBytes)) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else if (isInt<32>(NumBytes)) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ } else {
+ // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
+ // We'll also use 4 already allocated bytes for EAX.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Call __chkstk, __chkstk_ms, or __alloca.
+ emitStackProbe(MF, MBB, MBBI, DL, true);
+
+ if (isEAXAlive) {
+ // Restore EAX
+ MachineInstr *MI =
+ addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+ StackPtr, false, NumBytes - 4);
+ MI->setFlag(MachineInstr::FrameSetup);
+ MBB.insert(MBBI, MI);
+ }
+ } else if (NumBytes) {
+ emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
+ }
+
+ if (NeedsWinCFI && NumBytes) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ int SEHFrameOffset = 0; // Stays 0 unless the Win64 FP setup below runs.
+ unsigned SPOrEstablisher; // Establisher in funclets, StackPtr otherwise.
+ if (IsFunclet) {
+ if (IsClrFunclet) {
+ // The establisher parameter passed to a CLR funclet is actually a pointer
+ // to the (mostly empty) frame of its nearest enclosing funclet; we have
+ // to find the root function establisher frame by loading the PSPSym from
+ // the intermediate frame.
+ unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+ MachinePointerInfo NoInfo;
+ MBB.addLiveIn(Establisher);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
+ Establisher, false, PSPSlotOffset)
+ .addMemOperand(MF.getMachineMemOperand(
+ NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize));
+ ; // Stray empty statement (no effect).
+ // Save the root establisher back into the current funclet's (mostly
+ // empty) frame, in case a sub-funclet or the GC needs it.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
+ false, PSPSlotOffset)
+ .addReg(Establisher)
+ .addMemOperand(
+ MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile,
+ SlotSize, SlotSize));
+ }
+ SPOrEstablisher = Establisher;
+ } else {
+ SPOrEstablisher = StackPtr;
+ }
+
+ if (IsWin64Prologue && HasFP) {
+ // Set RBP to a small fixed offset from RSP. In the funclet case, we base
+ // this calculation on the incoming establisher, which holds the value of
+ // RSP from the parent frame at the end of the prologue.
+ SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);
+ if (SEHFrameOffset)
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
+ SPOrEstablisher, false, SEHFrameOffset);
+ else
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)
+ .addReg(SPOrEstablisher);
+
+ // If this is not a funclet, emit the CFI describing our frame pointer.
+ if (NeedsWinCFI && !IsFunclet) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+ .addImm(FramePtr)
+ .addImm(SEHFrameOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ if (isAsynchronousEHPersonality(Personality))
+ MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;
+ }
+ } else if (IsFunclet && STI.is32Bit()) {
+ // Reset EBP / ESI to something good for funclets.
+ MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);
+ // If we're a catch funclet, we can be returned to via catchret. Save ESP
+ // into the registration node so that the runtime will restore it for us.
+ if (!MBB.isCleanupFuncletEntry()) {
+ assert(Personality == EHPersonality::MSVC_CXX);
+ unsigned FrameReg;
+ int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
+ int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg);
+ // ESP is the first field, so no extra displacement is needed.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
+ false, EHRegOffset)
+ .addReg(X86::ESP);
+ }
+ }
+
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
+ const MachineInstr &FrameInstr = *MBBI;
+ ++MBBI;
+
+ if (NeedsWinCFI) {
+ int FI;
+ if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
+ if (X86::FR64RegClass.contains(Reg)) {
+ unsigned IgnoredFrameReg;
+ int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
+ Offset += SEHFrameOffset;
+
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+ .addImm(Reg)
+ .addImm(Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+ }
+ }
+
+ if (NeedsWinCFI && HasWinCFI)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (FnHasClrFunclet && !IsFunclet) {
+ // Save the so-called Initial-SP (i.e. the value of the stack pointer
+ // immediately after the prolog) into the PSPSlot so that funclets
+ // and the GC can recover it.
+ unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+ auto PSPInfo = MachinePointerInfo::getFixedStack(
+ MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,
+ PSPSlotOffset)
+ .addReg(StackPtr)
+ .addMemOperand(MF.getMachineMemOperand(
+ PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
+ SlotSize, SlotSize));
+ }
+
+ // Realign stack after we spilled callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+ // Win64 requires aligning the stack after the prologue.
+ if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
+ }
+
+ // We already dealt with stack realignment and funclets above.
+ if (IsFunclet && STI.is32Bit())
+ return;
+
+ // If we need a base pointer, set it up here. It's whatever the value
+ // of the stack pointer is at this point. Any variable size objects
+ // will be allocated after this, so we can still use the base pointer
+ // to reference locals.
+ if (TRI->hasBasePointer(MF)) {
+ // Update the base pointer with the current stack pointer.
+ unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+ .addReg(SPOrEstablisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ if (X86FI->getRestoreBasePointer()) {
+ // Stash value of base pointer. Saving RSP instead of EBP shortens
+ // dependence chain. Used by SjLj EH.
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
+ FramePtr, true, X86FI->getRestoreBasePointerOffset())
+ .addReg(SPOrEstablisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
+ // Stash the value of the frame pointer relative to the base pointer for
+ // Win32 EH. This supports Win32 EH, which does the inverse of the above:
+ // it recovers the frame pointer from the base pointer rather than the
+ // other way around.
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ unsigned UsedReg;
+ int Offset =
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+ assert(UsedReg == BasePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
+ .addReg(FramePtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
+ // Mark end of stack pointer adjustment.
+ if (!HasFP && NumBytes) {
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+ nullptr, -StackSize + stackGrowth));
+ }
+
+ // Emit DWARF info specifying the offsets of the callee-saved registers.
+ if (PushedRegs)
+ emitCalleeSavedFrameMoves(MBB, MBBI, DL);
+ }
+
+ // X86 Interrupt handling function cannot assume anything about the direction
+ // flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction
+ // in each prologue of interrupt handler function.
+ //
+ // FIXME: Create "cld" instruction only in these cases:
+ // 1. The interrupt handling function uses any of the "rep" instructions.
+ // 2. Interrupt handling function calls another function.
+ //
+ if (Fn->getCallingConv() == CallingConv::X86_INTR)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // At this point we know if the function has WinCFI or not.
+ MF.setHasWinCFI(HasWinCFI);
+}
+
+bool X86FrameLowering::canUseLEAForSPInEpilogue(
+    const MachineFunction &MF) const {
+  // Targets that do not use Windows CFI place no restriction on how the
+  // epilogue adjusts the stack pointer, so LEA is always acceptable there.
+  const bool UsesWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  if (!UsesWinCFI)
+    return true;
+
+  // Under the Win64 ABI only ADD may deallocate the stack unless a frame
+  // pointer is present, in which case LEA is permitted as well.
+  return hasFP(MF);
+}
+
+/// Returns true if \p MI is one of the two funclet return pseudos
+/// (CATCHRET or CLEANUPRET).
+static bool isFuncletReturnInstr(MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  return Opc == X86::CATCHRET || Opc == X86::CLEANUPRET;
+}
+
+// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the
+// stack. It holds a pointer to the bottom of the root function frame. The
+// establisher frame pointer passed to a nested funclet may point to the
+// (mostly empty) frame of its parent funclet, but it will need to find
+// the frame of the root function to access locals. To facilitate this,
+// every funclet copies the pointer to the bottom of the root function
+// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the
+// same offset for the PSPSym in the root function frame that's used in the
+// funclets' frames allows each funclet to dynamically accept any ancestor
+// frame as its establisher argument (the runtime doesn't guarantee the
+// immediate parent for some reason lost to history), and also allows the GC,
+// which uses the PSPSym for some bookkeeping, to find it in any funclet's
+// frame with only a single offset reported for the entire method.
+unsigned
+X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
+  // Resolve the PSPSym frame index to an SP-relative offset, ignoring any
+  // SP updates that happen inside the function body.
+  unsigned SPReg;
+  const int Offset = getFrameIndexReferencePreferSP(
+      MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx, SPReg,
+      /*IgnoreSPUpdates*/ true);
+
+  // The slot must be addressed off the stack register at a non-negative
+  // offset for the unsigned return value to be meaningful.
+  assert(Offset >= 0 && SPReg == TRI->getStackRegister());
+  return static_cast<unsigned>(Offset);
+}
+
+unsigned
+X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
+  // Bytes occupied by the pushed callee-saved registers.
+  const unsigned PushedCSRBytes =
+      MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+
+  const bool IsCoreCLR =
+      classifyEHPersonality(MF.getFunction()->getPersonalityFn()) ==
+      EHPersonality::CoreCLR;
+
+  // Stack the funclet body itself needs:
+  //  - CoreCLR: enough to reach the PSPSym slot, which sits at the same
+  //    SP-relative offset as in the main function, plus the slot itself.
+  //  - Everything else: just the outgoing call argument area.
+  const unsigned BodyBytes = IsCoreCLR
+                                 ? getPSPSlotOffsetFromSP(MF) + SlotSize
+                                 : MF.getFrameInfo().getMaxCallFrameSize();
+
+  // RBP is not part of the callee-saved block. Round the CSR block plus the
+  // body allocation up to the stack alignment, then take the CSR bytes back
+  // out: what remains is the amount each funclet explicitly allocates.
+  const unsigned AlignedTotal =
+      alignTo(PushedCSRBytes + BodyBytes, getStackAlignment());
+  return AlignedTotal - PushedCSRBytes;
+}
+
+/// Returns true if \p Opc is one of the TCRETURN* tail-call return pseudos
+/// (register, direct or memory form, with or without the "64" suffix).
+static bool isTailCallOpcode(unsigned Opc) {
+  switch (Opc) {
+  case X86::TCRETURNri:
+  case X86::TCRETURNdi:
+  case X86::TCRETURNmi:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNmi64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+void X86FrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ Optional<unsigned> RetOpcode; // Left unset when MBB has no terminator.
+ if (MBBI != MBB.end())
+ RetOpcode = MBBI->getOpcode();
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
+ const bool Is64BitILP32 = STI.isTarget64BitILP32();
+ unsigned FramePtr = TRI->getFrameRegister(MF);
+ unsigned MachineFramePtr =
+ Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
+
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool NeedsWinCFI =
+ IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry();
+ bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI);
+ MachineBasicBlock *TargetMBB = nullptr; // Set only for CATCHRET.
+
+ // Get the stack size recorded in the FrameInfo.
+ uint64_t StackSize = MFI.getStackSize();
+ uint64_t MaxAlign = calculateMaxStackAlign(MF);
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = 0;
+
+ if (RetOpcode && *RetOpcode == X86::CATCHRET) {
+ // SEH shouldn't use catchret.
+ assert(!isAsynchronousEHPersonality(
+ classifyEHPersonality(MF.getFunction()->getPersonalityFn())) &&
+ "SEH should not use CATCHRET");
+
+ NumBytes = getWinEHFuncletFrameSize(MF);
+ assert(hasFP(MF) && "EH funclets without FP not yet implemented");
+ TargetMBB = MBBI->getOperand(0).getMBB();
+
+ // Pop EBP.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+ MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else if (RetOpcode && *RetOpcode == X86::CLEANUPRET) {
+ NumBytes = getWinEHFuncletFrameSize(MF);
+ assert(hasFP(MF) && "EH funclets without FP not yet implemented");
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+ MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else if (hasFP(MF)) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ NumBytes = FrameSize - CSSize;
+
+ // Callee-saved registers were pushed on stack before the stack was
+ // realigned.
+ if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+ NumBytes = alignTo(FrameSize, MaxAlign);
+
+ // Pop EBP.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else {
+ NumBytes = StackSize - CSSize;
+ }
+ uint64_t SEHStackAllocAmt = NumBytes; // Snapshot before SP-update merging.
+
+ MachineBasicBlock::iterator FirstCSPop = MBBI;
+ // Skip the callee-saved pop instructions.
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = std::prev(MBBI);
+ unsigned Opc = PI->getOpcode();
+
+ if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
+ if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)))
+ break;
+ FirstCSPop = PI;
+ }
+
+ --MBBI;
+ }
+ MBBI = FirstCSPop;
+
+ if (TargetMBB) {
+ // Fill EAX/RAX with the address of the target block.
+ unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX;
+ if (STI.is64Bit()) {
+ // LEA64r TargetMBB(%rip), %rax
+ BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(TargetMBB)
+ .addReg(0);
+ } else {
+ // MOV32ri $TargetMBB, %eax
+ BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg)
+ .addMBB(TargetMBB);
+ }
+ // Record that we've taken the address of TargetMBB and no longer just
+ // reference it in a terminator.
+ TargetMBB->setHasAddressTaken();
+ }
+
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately before this
+ // instruction, merge the two instructions.
+ if (NumBytes || MFI.hasVarSizedObjects())
+ NumBytes += mergeSPUpdates(MBB, MBBI, true);
+
+ // If dynamic alloca is used, then reset esp to point to the last callee-saved
+ // slot before popping them off! Same applies for the case, when stack was
+ // realigned. Don't do this if this was a funclet epilogue, since the funclets
+ // will not do realignment or dynamic stack allocation.
+ if ((TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) &&
+ !IsFunclet) {
+ if (TRI->needsStackRealignment(MF))
+ MBBI = FirstCSPop;
+ unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
+ uint64_t LEAAmount =
+ IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;
+
+ // There are only two legal forms of epilogue:
+ // - add SEHAllocationSize, %rsp
+ // - lea SEHAllocationSize(%FramePtr), %rsp
+ //
+ // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.
+ // However, we may use this sequence if we have a frame pointer because the
+ // effects of the prologue can safely be undone.
+ if (LEAAmount != 0) {
+ unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, LEAAmount);
+ --MBBI;
+ } else {
+ unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(FramePtr);
+ --MBBI;
+ }
+ } else if (NumBytes) {
+ // Adjust stack pointer back: ESP += numbytes.
+ emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+ --MBBI;
+ }
+
+ // Windows unwinder will not invoke function's exception handler if IP is
+ // either in prologue or in epilogue. This behavior causes a problem when a
+ // call immediately precedes an epilogue, because the return address points
+ // into the epilogue. To cope with that, we insert an epilogue marker here,
+ // then replace it with a 'nop' if it ends up immediately after a CALL in the
+ // final emitted code.
+ if (NeedsWinCFI && MF.hasWinCFI())
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
+
+ if (!RetOpcode || !isTailCallOpcode(*RetOpcode)) {
+ // Add the return addr area delta back since we are not tail calling.
+ int Offset = -1 * X86FI->getTCReturnAddrDelta();
+ assert(Offset >= 0 && "TCDelta should never be positive");
+ if (Offset) {
+ MBBI = MBB.getFirstTerminator();
+
+ // Check for possible merge with preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, true);
+ emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ }
+ }
+}
+
+// NOTE: this only has a subset of the full frame index logic. In
+// particular, the FI < 0 and AfterFPPop logic is handled in
+// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly
+// (probably?) it should be moved into here.
+int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // We can't calculate offset from frame pointer if the stack is realigned,
+ // so enforce usage of stack/base pointer. The base pointer is used when we
+ // have dynamic allocas in addition to dynamic realignment.
+ if (TRI->hasBasePointer(MF))
+ FrameReg = TRI->getBaseRegister();
+ else if (TRI->needsStackRealignment(MF))
+ FrameReg = TRI->getStackRegister();
+ else
+ FrameReg = TRI->getFrameRegister(MF);
+
+ // Offset will hold the offset from the stack pointer at function entry to the
+ // object.
+ // We need to factor in additional offsets applied during the prologue to the
+ // frame, base, and stack pointer depending on which is used.
+ int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t StackSize = MFI.getStackSize();
+ bool HasFP = hasFP(MF);
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ int64_t FPDelta = 0; // Only changed for Win64 prologues (below).
+
+ if (IsWin64Prologue) {
+ assert(!MFI.hasCalls() || (StackSize % 16) == 8);
+
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ // If required, include space for extra hidden slot for stashing base pointer.
+ if (X86FI->getRestoreBasePointer())
+ FrameSize += SlotSize;
+ uint64_t NumBytes = FrameSize - CSSize;
+
+ uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
+ if (FI && FI == X86FI->getFAIndex())
+ return -SEHFrameOffset;
+
+ // FPDelta is the offset from the "traditional" FP location of the old base
+ // pointer followed by return address and the location required by the
+ // restricted Win64 prologue.
+ // Add FPDelta to all offsets below that go through the frame pointer.
+ FPDelta = FrameSize - SEHFrameOffset;
+ assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&
+ "FPDelta isn't aligned per the Win64 ABI!");
+ }
+
+
+ if (TRI->hasBasePointer(MF)) {
+ assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + SlotSize + FPDelta;
+ } else {
+ assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ } else if (TRI->needsStackRealignment(MF)) {
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + SlotSize + FPDelta;
+ } else {
+ assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ // FIXME: Support tail calls
+ } else {
+ if (!HasFP)
+ return Offset + StackSize; // No frame pointer: FPDelta is not applied.
+
+ // Skip the saved EBP.
+ Offset += SlotSize;
+
+ // Skip the RETADDR move area
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ Offset -= TailCallReturnAddrDelta;
+ }
+
+ return Offset + FPDelta;
+}
+
+/// Resolve frame index \p FI to a stack-pointer-relative offset when that is
+/// possible, falling back to getFrameIndexReference otherwise. \p FrameReg
+/// receives the register the returned offset is relative to. When
+/// \p IgnoreSPUpdates is set, SP adjustments inside the function body are
+/// not taken into account.
+int
+X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
+                                                 int FI, unsigned &FrameReg,
+                                                 bool IgnoreSPUpdates) const {
+
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  // The static frame size; dynamic realignment is not part of it.
+  const uint64_t FrameSize = FrameInfo.getStackSize();
+
+  // Stack layout produced by LLVM:
+  //       ...
+  //       ARG2
+  //       ARG1
+  //       RETADDR
+  //       PUSH RBP   <-- RBP points here
+  //       PUSH CSRs
+  //       ~~~~~~~    <-- possible stack realignment (non-win64)
+  //       ...
+  //       STACK OBJECTS
+  //       ...        <-- RSP after prologue points here
+  //       ~~~~~~~    <-- possible stack realignment (win64)
+  //
+  // if (hasVarSizedObjects()):
+  //       ...        <-- "base pointer" (ESI/RBX) points here
+  //       DYNAMIC ALLOCAS
+  //       ...        <-- RSP points here
+  //
+  // Case 1: no stack realignment and no dynamic allocas. Both "fixed" stack
+  // objects (arguments and CSRs) and regular ones are reachable at fixed
+  // offsets from RSP.
+  //
+  // Case 2: stack realignment without dynamic allocas. Fixed stack objects
+  // are addressed through RBP, regular stack objects through RSP.
+  //
+  // Case 3: dynamic allocas together with stack realignment. RSP addresses
+  // only the stack arguments of outgoing calls; the "base pointer" addresses
+  // local variables and RBP addresses fixed objects.
+  //
+  // For cases 2 and 3 only non-fixed stack objects can be answered here, and
+  // the answer is relative to the SP right after the prologue, not to the SP
+  // somewhere in the middle of the function.
+
+  const bool IsRealignedFixedObject = FrameInfo.isFixedObjectIndex(FI) &&
+                                      TRI->needsStackRealignment(MF) &&
+                                      !STI.isTargetWin64();
+  if (IsRealignedFixedObject)
+    return getFrameIndexReference(MF, FI, FrameReg);
+
+  // Without a reserved call frame the function body may adjust SP. The
+  // offset is still statically known, but it depends on the position inside
+  // the function, so defer to the generic query unless told to ignore that.
+  if (!IgnoreSPUpdates) {
+    const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+    if (!TFI->hasReservedCallFrame(MF))
+      return getFrameIndexReference(MF, FI, FrameReg);
+  }
+
+  // Tail calls are not handled here and are not expected either.
+  assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&
+         "we don't handle this case!");
+
+  // Report the stack pointer as the register the offset is relative to.
+  FrameReg = TRI->getStackRegister();
+
+  // Derivation of the offset:
+  //
+  //  %rsp grows (i.e. gets lower) left to right. Each box below is
+  //  one word (eight bytes).  Obj0 is the stack slot we want to reach.
+  //
+  //    ----------------------------------
+  //    | BP | Obj0 | Obj1 | ... | ObjN |
+  //    ----------------------------------
+  //    ^    ^      ^                   ^
+  //    A    B      C                   E
+  //
+  // A is the incoming stack pointer.
+  // (B - A) is the local area offset (-8 for x86-64) [1]
+  // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
+  //
+  // |(E - B)| is the StackSize (absolute value, positive).  For a
+  // stack that grows down, this works out to be (B - E). [3]
+  //
+  // E is also the value of %rsp once the stack has been set up, and the
+  // wanted quantity is (C - E), the value to add to %rsp to reach Obj0:
+  // (C - E) == (C - A) - (B - A) + (B - E)
+  //            { Using [1], [2] and [3] above }
+  //         == getObjectOffset - LocalAreaOffset + StackSize
+  //
+
+  // Offset of the object relative to the start of the local area.
+  int SPRelative = FrameInfo.getObjectOffset(FI) - getOffsetOfLocalArea();
+
+  return SPRelative + FrameSize;
+}
+
+/// Assign fixed spill slots to the callee-saved registers in \p CSI.
+/// When the function has a frame pointer, a slot is reserved for it and the
+/// frame register is dropped from \p CSI. GR32/GR64 registers then receive
+/// SlotSize-sized slots, and all remaining registers receive slots sized and
+/// aligned for their minimal register class. Always returns true.
+bool X86FrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI) const {
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  // Registers in GR64/GR32 are handled by the first pass, the rest by the
+  // second one.
+  auto IsGPR = [](unsigned Reg) {
+    return X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg);
+  };
+
+  int NextSlotOffset =
+      getOffsetOfLocalArea() + FuncInfo->getTCReturnAddrDelta();
+
+  if (hasFP(MF)) {
+    // emitPrologue spills the frame register before anything else.
+    NextSlotOffset -= SlotSize;
+    FrameInfo.CreateFixedSpillStackObject(SlotSize, NextSlotOffset);
+
+    // emitPrologue/emitEpilogue save and restore the frame register on their
+    // own, so take it out of the CSI list (first overlapping entry only).
+    const unsigned FrameReg = TRI->getFrameRegister(MF);
+    for (auto It = CSI.begin(), End = CSI.end(); It != End; ++It) {
+      if (!TRI->regsOverlap(It->getReg(), FrameReg))
+        continue;
+      CSI.erase(It);
+      break;
+    }
+  }
+
+  // First pass, last entry to first: slots for GPRs. This grows the frame.
+  unsigned GPRSaveBytes = 0;
+  for (auto It = CSI.rbegin(), End = CSI.rend(); It != End; ++It) {
+    if (!IsGPR(It->getReg()))
+      continue;
+
+    NextSlotOffset -= SlotSize;
+    GPRSaveBytes += SlotSize;
+
+    It->setFrameIdx(
+        FrameInfo.CreateFixedSpillStackObject(SlotSize, NextSlotOffset));
+  }
+
+  FuncInfo->setCalleeSavedFrameSize(GPRSaveBytes);
+
+  // Second pass, last entry to first: slots for the non-GPR registers (XMMs).
+  for (auto It = CSI.rbegin(), End = CSI.rend(); It != End; ++It) {
+    const unsigned Reg = It->getReg();
+    if (IsGPR(Reg))
+      continue;
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    // Move the offset down to the register class alignment.
+    NextSlotOffset -= std::abs(NextSlotOffset) % RC->getAlignment();
+    // Carve out the slot itself.
+    NextSlotOffset -= RC->getSize();
+    It->setFrameIdx(
+        FrameInfo.CreateFixedSpillStackObject(RC->getSize(), NextSlotOffset));
+    FrameInfo.ensureMaxAlignment(RC->getAlignment());
+  }
+
+  return true;
+}
+
+/// Emit the callee-saved register spills before \p MI: a PUSH for every
+/// GR32/GR64 register in \p CSI and a store to its assigned frame index for
+/// every other register. All emitted instructions are tagged FrameSetup.
+/// Always returns true.
+bool X86FrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  // 32-bit EH funclets save nothing: the caller saves EBX, EBP, ESI, EDI for
+  // us, and Win32 has no XMM CSRs.
+  const bool IsWin32FuncletEntry =
+      MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows();
+  if (IsWin32FuncletEntry)
+    return true;
+
+  auto IsGPR = [](unsigned Reg) {
+    return X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg);
+  };
+
+  const MachineFunction &MF = *MBB.getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // True if some register aliasing Reg (Reg itself excluded) is a function
+  // live-in.
+  auto HasLiveInAlias = [&](unsigned Reg) {
+    for (MCRegAliasIterator Alias(Reg, TRI, false); Alias.isValid(); ++Alias)
+      if (MRI.isLiveIn(*Alias))
+        return true;
+    return false;
+  };
+
+  // Push the GPRs, last CSI entry first. This grows the frame.
+  const unsigned PushOpc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
+  for (auto It = CSI.rbegin(), End = CSI.rend(); It != End; ++It) {
+    const unsigned Reg = It->getReg();
+    if (!IsGPR(Reg))
+      continue;
+
+    const bool AlreadyLiveIn = MRI.isLiveIn(Reg);
+    if (!AlreadyLiveIn)
+      MBB.addLiveIn(Reg);
+
+    // A kill flag is only placed when neither the register nor any of its
+    // aliases is a live-in. Values that are also live-in show up with the
+    // @llvm-returnaddress intrinsic and with arguments passed in callee
+    // saved registers. Leaving the flag off is conservatively correct even
+    // if the live-in turns out to be unused.
+    const bool CanKill = !AlreadyLiveIn && !HasLiveInAlias(Reg);
+
+    BuildMI(MBB, MI, DL, TII.get(PushOpc))
+        .addReg(Reg, getKillRegState(CanKill))
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // X86 cannot push/pop XMM registers, so the remaining registers are stored
+  // to their stack slots instead.
+  for (auto It = CSI.rbegin(), End = CSI.rend(); It != End; ++It) {
+    const unsigned Reg = It->getReg();
+    if (IsGPR(Reg))
+      continue;
+
+    // The callee-saved register is live-in and gets killed by the spill.
+    MBB.addLiveIn(Reg);
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, It->getFrameIdx(), RC, TRI);
+    // Tag the instruction that now sits directly before MI.
+    std::prev(MI)->setFlag(MachineInstr::FrameSetup);
+  }
+
+  return true;
+}
+
+/// Emit the callee-saved register restores before \p MI: reloads from the
+/// stack for registers outside GR32/GR64, then a POP for every GR32/GR64
+/// register. Returns false when \p CSI is empty and true otherwise,
+/// including the funclet cases where nothing is emitted.
+bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  const bool AtWindowsFuncletReturn =
+      MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows();
+  if (AtWindowsFuncletReturn) {
+    // 32-bit EH funclets restore nothing, mirroring
+    // spillCalleeSavedRegisters.
+    if (STI.is32Bit())
+      return true;
+    // Nothing is restored before an SEH catchret either: SEH except blocks
+    // do not form funclets, and emitEpilogue turns these into normal jumps.
+    if (MI->getOpcode() == X86::CATCHRET) {
+      const Function *Func = MBB.getParent()->getFunction();
+      if (isAsynchronousEHPersonality(
+              classifyEHPersonality(Func->getPersonalityFn())))
+        return true;
+    }
+  }
+
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  auto IsGPR = [](unsigned Reg) {
+    return X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg);
+  };
+
+  // Reload the non-GPR registers (XMMs) from their stack slots.
+  for (const CalleeSavedInfo &Info : CSI) {
+    const unsigned Reg = Info.getReg();
+    if (IsGPR(Reg))
+      continue;
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.loadRegFromStackSlot(MBB, MI, Reg, Info.getFrameIdx(), RC, TRI);
+  }
+
+  // Pop the GPRs.
+  const unsigned PopOpc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
+  for (const CalleeSavedInfo &Info : CSI) {
+    const unsigned Reg = Info.getReg();
+    if (!IsGPR(Reg))
+      continue;
+
+    BuildMI(MBB, MI, DL, TII.get(PopOpc), Reg)
+        .setMIFlag(MachineInstr::FrameDestroy);
+  }
+  return true;
+}
+
+/// Extend the generic callee-save computation with X86 specifics: create
+/// the return-address move area when the tail-call return address delta is
+/// negative, mark the base pointer as saved when one is in use, and reserve
+/// a frame-pointer save slot when that function also has EH funclets.
+void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                            BitVector &SavedRegs,
+                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  const int64_t RetAddrDelta = FuncInfo->getTCReturnAddrDelta();
+  if (RetAddrDelta < 0) {
+    // Layout with the RETURNADDR area:
+    //   arg
+    //   arg
+    //   RETADDR
+    //   { ...
+    //     RETADDR area
+    //     ...
+    //   }
+    //   [EBP]
+    FrameInfo.CreateFixedObject(-RetAddrDelta, RetAddrDelta - SlotSize, true);
+  }
+
+  // Everything below only matters when a base pointer is in use.
+  if (!TRI->hasBasePointer(MF))
+    return;
+
+  // The base pointer has to be spilled.
+  SavedRegs.set(TRI->getBaseRegister());
+
+  // With a base pointer and EH funclets, EBP needs its own spill slot.
+  if (!MF.hasEHFunclets())
+    return;
+
+  const int FramePtrSaveSlot =
+      FrameInfo.CreateSpillStackObject(SlotSize, SlotSize);
+  FuncInfo->setHasSEHFramePtrSave(true);
+  FuncInfo->setSEHFramePtrSaveIndex(FramePtrSaveSlot);
+}
+
+/// Return true if any formal argument of the IR function behind \p MF
+/// carries the 'nest' attribute.
+static bool
+HasNestArgument(const MachineFunction *MF) {
+  const Function *F = MF->getFunction();
+  for (const auto &Arg : F->args())
+    if (Arg.hasNestAttr())
+      return true;
+  return false;
+}
+
+/// Select a temporary register for the segmented-stack and Erlang/HiPE stack
+/// prologues. Depending on the platform and on properties of the function,
+/// one or two such registers are needed: pass \p Primary = true to get the
+/// first one and false to get the second.
+static unsigned
+GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
+  const CallingConv::ID CC = MF.getFunction()->getCallingConv();
+
+  // HiPE (Erlang) has its own fixed choice.
+  if (CC == CallingConv::HiPE) {
+    if (Is64Bit)
+      return Primary ? X86::R14 : X86::R13;
+    return Primary ? X86::EBX : X86::EDI;
+  }
+
+  // 64-bit targets: R11/R12, narrowed to the 32-bit views when not LP64.
+  if (Is64Bit) {
+    if (IsLP64)
+      return Primary ? X86::R11 : X86::R12;
+    return Primary ? X86::R11D : X86::R12D;
+  }
+
+  // 32-bit targets: the choice depends on the calling convention and on
+  // whether a 'nest' argument is present.
+  const bool HasNest = HasNestArgument(&MF);
+  const bool IsFastLikeCC =
+      CC == CallingConv::X86_FastCall || CC == CallingConv::Fast;
+
+  if (IsFastLikeCC) {
+    if (HasNest)
+      report_fatal_error("Segmented stacks does not support fastcall with "
+                         "nested function.");
+    return Primary ? X86::EAX : X86::ECX;
+  }
+
+  if (!Primary)
+    return X86::EAX;
+  return HasNest ? X86::EDX : X86::ECX;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual stack
+// limit.
+// adjustForSegmentedStacks compares the stack pointer itself against the limit
+// for frames smaller than this value instead of first subtracting the frame
+// size.
+static const uint64_t kSplitStackAvailable = 256;
+
+/// Insert a segmented-stack check in front of \p PrologueMBB.
+///
+/// Two new blocks are pushed to the front of the function, in the order
+/// checkMBB, allocMBB. checkMBB compares the stack pointer (or the stack
+/// pointer minus the frame size, for frames of kSplitStackAvailable bytes or
+/// more) against the stack limit read from a per-OS TLS slot and branches to
+/// \p PrologueMBB. allocMBB hands the frame size and the argument stack size
+/// to __morestack and ends in a MORESTACK_RET pseudo, with \p PrologueMBB as
+/// its successor. Nothing is emitted for functions whose stack size is zero.
+///
+/// Reports a fatal error for vararg functions and for unsupported platforms.
+void X86FrameLowering::adjustForSegmentedStacks(
+ MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint64_t StackSize;
+ unsigned TlsReg, TlsOffset;
+ DebugLoc DL;
+
+ // To support shrink-wrapping we would need to insert the new blocks
+ // at the right place and update the branches to PrologueMBB.
+ assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
+
+ // The primary scratch register holds SP - StackSize for the comparison.
+ unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "Scratch register is live-in");
+
+ if (MF.getFunction()->isVarArg())
+ report_fatal_error("Segmented stacks do not support vararg functions.");
+ if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
+ !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
+ !STI.isTargetDragonFly())
+ report_fatal_error("Segmented stacks not supported on this platform.");
+
+ // Eventually StackSize will be calculated by a link-time pass; which will
+ // also decide whether checking code needs to be injected into this particular
+ // prologue.
+ StackSize = MFI.getStackSize();
+
+ // Do not generate a prologue for functions with a stack of size zero
+ if (StackSize == 0)
+ return;
+
+ MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ bool IsNested = false;
+
+ // We need to know if the function has a nest argument only in 64 bit mode.
+ if (Is64Bit)
+ IsNested = HasNestArgument(&MF);
+
+ // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+ // allocMBB needs to be last (terminating) instruction.
+
+ // Both new blocks run before the prologue, so they inherit its live-ins.
+ for (const auto &LI : PrologueMBB.liveins()) {
+ allocMBB->addLiveIn(LI);
+ checkMBB->addLiveIn(LI);
+ }
+
+ if (IsNested)
+ allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
+
+ // Resulting block order: checkMBB, allocMBB, then the original blocks.
+ MF.push_front(allocMBB);
+ MF.push_front(checkMBB);
+
+ // When the frame size is less than 256 we just compare the stack
+ // boundary directly to the value of the stack pointer, per gcc.
+ bool CompareStackPointer = StackSize < kSplitStackAvailable;
+
+ // Read the limit off the current stacklet off the stack_guard location.
+ if (Is64Bit) {
+ // Pick the segment register and offset of the TLS slot holding the limit.
+ if (STI.isTargetLinux()) {
+ TlsReg = X86::FS;
+ TlsOffset = IsLP64 ? 0x70 : 0x40;
+ } else if (STI.isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+ } else if (STI.isTargetWin64()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x28; // pvArbitrary, reserved for application use
+ } else if (STI.isTargetFreeBSD()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x18;
+ } else if (STI.isTargetDragonFly()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x20; // use tls_tcb.tcb_segstack
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
+
+ // Small frames compare SP itself; otherwise ScratchReg = SP - StackSize.
+ if (CompareStackPointer)
+ ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
+ else
+ BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+ // Compare ScratchReg with the limit stored at TlsReg:TlsOffset.
+ BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else {
+ // 32-bit variants of the TLS slot selection.
+ if (STI.isTargetLinux()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x30;
+ } else if (STI.isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x48 + 90*4;
+ } else if (STI.isTargetWin32()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x14; // pvArbitrary, reserved for application use
+ } else if (STI.isTargetDragonFly()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x10; // use tls_tcb.tcb_segstack
+ } else if (STI.isTargetFreeBSD()) {
+ report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
+
+ if (CompareStackPointer)
+ ScratchReg = X86::ESP;
+ else
+ BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+ if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
+ STI.isTargetDragonFly()) {
+ // NOTE(review): the immediate after the first addReg(0) is 0 here but 1
+ // in the 64-bit CMP above -- confirm the difference is intentional.
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else if (STI.isTargetDarwin()) {
+
+ // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
+ unsigned ScratchReg2;
+ bool SaveScratch2;
+ if (CompareStackPointer) {
+ // The primary scratch register is available for holding the TLS offset.
+ ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+ SaveScratch2 = false;
+ } else {
+ // Need to use a second register to hold the TLS offset
+ ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
+
+ // Unfortunately, with fastcc the second scratch register may hold an
+ // argument.
+ SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
+ }
+
+ // If Scratch2 is live-in then it needs to be saved.
+ assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
+ "Scratch register is live-in and not saved");
+
+ // Preserve ScratchReg2 around the comparison when it carries a live value.
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
+ .addReg(ScratchReg2, RegState::Kill);
+
+ // Load the TLS offset into ScratchReg2 and use it as the address register.
+ BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
+ .addImm(TlsOffset);
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+ .addReg(ScratchReg)
+ .addReg(ScratchReg2).addImm(1).addReg(0)
+ .addImm(0)
+ .addReg(TlsReg);
+
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
+ }
+ }
+
+ // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
+ // It jumps to normal execution of the function body.
+ // NOTE(review): the opcode used is JA_1; confirm whether the ">=" in the
+ // comment above should read ">".
+ BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB);
+
+ // On 32 bit we first push the arguments size and then the frame size. On 64
+ // bit, we pass the stack frame size in r10 and the argument size in r11.
+ if (Is64Bit) {
+ // Functions with nested arguments use R10, so it needs to be saved across
+ // the call to _morestack
+
+ const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
+ const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
+ const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
+ const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
+ const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
+
+ // Stash the nest argument (R10) in RAX before R10 is overwritten below.
+ if (IsNested)
+ BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
+
+ BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
+ .addImm(StackSize);
+ BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
+ .addImm(X86FI->getArgumentStackSize());
+ } else {
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(X86FI->getArgumentStackSize());
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(StackSize);
+ }
+
+ // __morestack is in libgcc
+ if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
+ // Under the large code model, we cannot assume that __morestack lives
+ // within 2^31 bytes of the call site, so we cannot use pc-relative
+ // addressing. We cannot perform the call via a temporary register,
+ // as the rax register may be used to store the static chain, and all
+ // other suitable registers may be either callee-save or used for
+ // parameter passing. We cannot use the stack at this point either
+ // because __morestack manipulates the stack directly.
+ //
+ // To avoid these issues, perform an indirect call via a read-only memory
+ // location containing the address.
+ //
+ // This solution is not perfect, as it assumes that the .rodata section
+ // is laid out within 2^31 bytes of each function body, but this seems
+ // to be sufficient for JIT.
+ BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addExternalSymbol("__morestack_addr")
+ .addReg(0);
+ MF.getMMI().setUsesMorestackAddr(true);
+ } else {
+ if (Is64Bit)
+ BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack");
+ else
+ BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack");
+ }
+
+ // Terminate allocMBB; the nested variant also restores R10.
+ if (IsNested)
+ BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
+ else
+ BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
+
+ // Wire up the CFG: checkMBB -> {allocMBB, PrologueMBB}, allocMBB -> PrologueMBB.
+ allocMBB->addSuccessor(&PrologueMBB);
+
+ checkMBB->addSuccessor(allocMBB);
+ checkMBB->addSuccessor(&PrologueMBB);
+
+#ifdef EXPENSIVE_CHECKS
+ MF.verify();
+#endif
+}
+
+/// Find the value of an ERTS parameter in the !hipe.literals named metadata.
+/// HiPE passes Erlang Runtime System-internal parameters (for example PCB
+/// offsets of fields it needs) as name-value pairs in the named metadata node
+/// "hipe.literals". The value of the first well-formed pair whose name equals
+/// \p LiteralName is returned; a fatal error is reported if none matches.
+static unsigned getHiPELiteral(
+    NamedMDNode *HiPELiteralsMD, const StringRef LiteralName) {
+  for (MDNode *Entry : HiPELiteralsMD->operands()) {
+    // Only (name, value) pairs are of interest.
+    if (Entry->getNumOperands() != 2)
+      continue;
+
+    auto *Key = dyn_cast<MDString>(Entry->getOperand(0));
+    auto *Wrapped = dyn_cast<ValueAsMetadata>(Entry->getOperand(1));
+    if (!Key || !Wrapped)
+      continue;
+
+    auto *IntVal = dyn_cast_or_null<ConstantInt>(Wrapped->getValue());
+    if (!IntVal || Key->getString() != LiteralName)
+      continue;
+
+    return IntVal->getZExtValue();
+  }
+
+  report_fatal_error("HiPE literal " + LiteralName
+                     + " required but not provided");
+}
+
+/// Erlang programs may need a special prologue to handle the stack size they
+/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom implementation of hybrid stack/heap architecture.
+/// (for more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+///
+/// CheckStack:
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// OldStart:
+/// ...
+/// IncStack:
+/// call inc_stack # doubles the stack space
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+///
+/// The two blocks are only created when the computed MaxStack exceeds the
+/// guaranteed space (the *_LEAF_WORDS literal times SlotSize). They are
+/// pushed to the front of the function, ahead of \p PrologueMBB. A fatal
+/// error is reported when the "hipe.literals" metadata or a required literal
+/// is missing.
+void X86FrameLowering::adjustForHiPEPrologue(
+ MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ DebugLoc DL;
+
+ // To support shrink-wrapping we would need to insert the new blocks
+ // at the right place and update the branches to PrologueMBB.
+ assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
+
+ // HiPE-specific values
+ NamedMDNode *HiPELiteralsMD = MF.getMMI().getModule()
+ ->getNamedMetadata("hipe.literals");
+ if (!HiPELiteralsMD)
+ report_fatal_error(
+ "Can't generate HiPE prologue without runtime parameters");
+ const unsigned HipeLeafWords
+ = getHiPELiteral(HiPELiteralsMD,
+ Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
+ // Arguments beyond this count are treated as passed on the stack.
+ const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+ const unsigned Guaranteed = HipeLeafWords * SlotSize;
+ unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
+ MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
+ // Frame size plus stack-passed arguments plus one additional slot.
+ unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize;
+
+ assert(STI.isTargetLinux() &&
+ "HiPE prologue is only supported on Linux operating systems.");
+
+ // Compute the largest caller's frame that is needed to fit the callees'
+ // frames. This 'MaxStack' is computed from:
+ //
+ // a) the fixed frame size, which is the space needed for all spilled temps,
+ // b) outgoing on-stack parameter areas, and
+ // c) the minimum stack space this function needs to make available for the
+ // functions it calls (a tunable ABI property).
+ if (MFI.hasCalls()) {
+ unsigned MoreStackForCalls = 0;
+
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (!MI.isCall())
+ continue;
+
+ // Get callee operand.
+ const MachineOperand &MO = MI.getOperand(0);
+
+ // Only take account of global function calls (no closures etc.).
+ if (!MO.isGlobal())
+ continue;
+
+ const Function *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F)
+ continue;
+
+ // Do not update 'MaxStack' for primitive and built-in functions
+ // (encoded with names either starting with "erlang."/"bif_" or not
+ // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+ // "_", such as the BIF "suspend_0") as they are executed on another
+ // stack.
+ // NOTE(review): find() matches these substrings anywhere in the name,
+ // not only at the start -- confirm that is the intent.
+ if (F->getName().find("erlang.") != StringRef::npos ||
+ F->getName().find("bif_") != StringRef::npos ||
+ F->getName().find_first_of("._") == StringRef::npos)
+ continue;
+
+ unsigned CalleeStkArity =
+ F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+ // NOTE(review): HipeLeafWords is unsigned, so HipeLeafWords - 1 wraps if
+ // the literal is 0 -- presumably never the case; verify.
+ if (HipeLeafWords - 1 > CalleeStkArity)
+ MoreStackForCalls = std::max(MoreStackForCalls,
+ (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+ }
+ }
+ MaxStack += MoreStackForCalls;
+ }
+
+ // If the stack frame needed is larger than the guaranteed then runtime checks
+ // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue.
+ if (MaxStack > Guaranteed) {
+ MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+ // Both new blocks run before the prologue, so they inherit its live-ins.
+ for (const auto &LI : PrologueMBB.liveins()) {
+ stackCheckMBB->addLiveIn(LI);
+ incStackMBB->addLiveIn(LI);
+ }
+
+ // Resulting block order: stackCheckMBB, incStackMBB, then the original blocks.
+ MF.push_front(incStackMBB);
+ MF.push_front(stackCheckMBB);
+
+ unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+ unsigned LEAop, CMPop, CALLop;
+ SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
+ // Select registers and opcodes matching the pointer width.
+ if (Is64Bit) {
+ SPReg = X86::RSP;
+ PReg = X86::RBP;
+ LEAop = X86::LEA64r;
+ CMPop = X86::CMP64rm;
+ CALLop = X86::CALL64pcrel32;
+ } else {
+ SPReg = X86::ESP;
+ PReg = X86::EBP;
+ LEAop = X86::LEA32r;
+ CMPop = X86::CMP32rm;
+ CALLop = X86::CALLpcrel32;
+ }
+
+ ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "HiPE prologue scratch register is live-in");
+
+ // Create new MBB for StackCheck:
+ // ScratchReg = SP - MaxStack
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ // SPLimitOffset is in a fixed heap location (pointed by BP).
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB);
+
+ // Create new MBB for IncStack:
+ BuildMI(incStackMBB, DL, TII.get(CALLop)).
+ addExternalSymbol("inc_stack_0");
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ // NOTE(review): this retry branch uses JLE_1 while the first check uses
+ // JAE_1 -- confirm the two conditions are meant to differ.
+ BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
+
+ // Branch weights: entering or repeating incStackMBB is marked as the
+ // unlikely edge (1/100) from both blocks.
+ stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
+ stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
+ incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
+ incStackMBB->addSuccessor(incStackMBB, {1, 100});
+ }
+#ifdef EXPENSIVE_CHECKS
+ MF.verify();
+#endif
+}
+
+/// Try to perform a small positive stack adjustment with one or two POP
+/// instructions instead of an explicit stack-pointer update.
+///
+/// The rewrite is only attempted when \p Offset is a positive multiple of
+/// SlotSize worth one or two slots and the instruction directly before
+/// \p MBBI is a call whose operand 1 is a register mask. The POPs target
+/// registers that the call clobbers but does not define.
+///
+/// \returns true if the POPs were inserted before \p MBBI, false if the
+/// adjustment was left for the caller to emit.
+bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MBBI,
+                                           const DebugLoc &DL,
+                                           int Offset) const {
+
+  if (Offset <= 0)
+    return false;
+
+  if (Offset % SlotSize)
+    return false;
+
+  int NumPops = Offset / SlotSize;
+  // This is only worth it if we have at most 2 pops.
+  if (NumPops != 1 && NumPops != 2)
+    return false;
+
+  // Handle only the trivial case where the adjustment directly follows
+  // a call. This is the most common one, anyway.
+  if (MBBI == MBB.begin())
+    return false;
+  MachineBasicBlock::iterator Prev = std::prev(MBBI);
+  if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
+    return false;
+
+  unsigned Regs[2];
+  unsigned FoundRegs = 0;
+
+  // Bind by reference: the operand is only queried, so copying it is
+  // unnecessary.
+  const MachineOperand &RegMask = Prev->getOperand(1);
+
+  auto &RegClass =
+      Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
+  // Try to find up to NumPops free registers.
+  for (auto Candidate : RegClass) {
+
+    // Poor man's liveness:
+    // Since we're immediately after a call, any register that is clobbered
+    // by the call and not defined by it can be considered dead.
+    if (!RegMask.clobbersPhysReg(Candidate))
+      continue;
+
+    bool IsDef = false;
+    for (const MachineOperand &MO : Prev->implicit_operands()) {
+      if (MO.isReg() && MO.isDef() &&
+          TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
+        IsDef = true;
+        break;
+      }
+    }
+
+    if (IsDef)
+      continue;
+
+    Regs[FoundRegs++] = Candidate;
+    if (FoundRegs == (unsigned)NumPops)
+      break;
+  }
+
+  if (FoundRegs == 0)
+    return false;
+
+  // If we found only one free register, but need two, reuse the same one twice.
+  while (FoundRegs < (unsigned)NumPops)
+    Regs[FoundRegs++] = Regs[0];
+
+  for (int i = 0; i < NumPops; ++i)
+    BuildMI(MBB, MBBI, DL,
+            TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]);
+
+  return true;
+}
+
+/// Erase the call-frame setup/destroy pseudo instruction at \p I and emit
+/// whatever real stack adjustment and CFI it stands for.
+///
+/// Without a reserved call frame, the (alignment-rounded) amount from
+/// operand 0, minus the amount from operand 1 that is handled inside the
+/// call sequence, is applied to the stack pointer; it is merged with
+/// neighbouring SP updates and, for minsize functions, emitted as POPs
+/// when possible.
+/// With a reserved call frame, only a callee-pop amount on a destroy pseudo
+/// is compensated, directly after the preceding call.
+///
+/// \returns the iterator returned by erasing the pseudo instruction.
+MachineBasicBlock::iterator X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ bool reserveCallFrame = hasReservedCallFrame(MF);
+ unsigned Opcode = I->getOpcode();
+ bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ DebugLoc DL = I->getDebugLoc();
+ // Operand 0 is only read when the call frame is not reserved; operand 1 is
+ // read for destroy pseudos and whenever Amount is non-zero.
+ uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
+ uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
+ // From here on I refers to the instruction after the erased pseudo.
+ I = MBB.erase(I);
+
+ if (!reserveCallFrame) {
+ // If the stack pointer can be changed after prologue, turn the
+ // adjcallstackup instruction into a 'sub ESP, <amt>' and the
+ // adjcallstackdown instruction into 'add ESP, <amt>'
+
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned StackAlign = getStackAlignment();
+ Amount = alignTo(Amount, StackAlign);
+
+ MachineModuleInfo &MMI = MF.getMMI();
+ const Function *Fn = MF.getFunction();
+ bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool DwarfCFI = !WindowsCFI &&
+ (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+
+ // If we have any exception handlers in this function, and we adjust
+ // the SP before calls, we may need to indicate this to the unwinder
+ // using GNU_ARGS_SIZE. Note that this may be necessary even when
+ // Amount == 0, because the preceding function may have set a non-0
+ // GNU_ARGS_SIZE.
+ // TODO: We don't need to reset this between subsequent functions,
+ // if it didn't change.
+ bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();
+
+ if (HasDwarfEHHandlers && !isDestroy &&
+ MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
+
+ // Nothing to adjust; only the GNU_ARGS_SIZE directive above may be emitted.
+ if (Amount == 0)
+ return I;
+
+ // Factor out the amount that gets handled inside the sequence
+ // (Pushes of argument for frame setup, callee pops for frame destroy)
+ Amount -= InternalAmt;
+
+ // TODO: This is needed only if we require precise CFA.
+ // If this is a callee-pop calling convention, emit a CFA adjust for
+ // the amount the callee popped.
+ if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
+
+ // Add Amount to SP to destroy a frame, or subtract to setup.
+ int64_t StackAdjustment = isDestroy ? Amount : -Amount;
+ // Captured before merging below, so it reflects this pseudo only.
+ int64_t CfaAdjustment = -StackAdjustment;
+
+ if (StackAdjustment) {
+ // Merge with any previous or following adjustment instruction. Note: the
+ // instructions merged with here do not have CFI, so their stack
+ // adjustments do not feed into CfaAdjustment.
+ StackAdjustment += mergeSPUpdates(MBB, I, true);
+ StackAdjustment += mergeSPUpdates(MBB, I, false);
+
+ if (StackAdjustment) {
+ // POPs are only tried for minsize functions; otherwise, or when that
+ // fails, emit a regular stack adjustment.
+ if (!(Fn->optForMinSize() &&
+ adjustStackWithPops(MBB, I, DL, StackAdjustment)))
+ BuildStackAdjustment(MBB, I, DL, StackAdjustment,
+ /*InEpilogue=*/false);
+ }
+ }
+
+ if (DwarfCFI && !hasFP(MF)) {
+ // If we don't have FP, but need to generate unwind information,
+ // we need to set the correct CFA offset after the stack adjustment.
+ // How much we adjust the CFA offset depends on whether we're emitting
+ // CFI only for EH purposes or for debugging. EH only requires the CFA
+ // offset to be correct at each call site, while for debugging we want
+ // it to be more precise.
+
+ // TODO: When not using precise CFA, we also need to adjust for the
+ // InternalAmt here.
+ if (CfaAdjustment) {
+ BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(
+ nullptr, CfaAdjustment));
+ }
+ }
+
+ return I;
+ }
+
+ // Reserved call frame: only a callee-pop on frame destroy needs handling.
+ if (isDestroy && InternalAmt) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+ // We are not tracking the stack pointer adjustment by the callee, so make
+ // sure we restore the stack pointer immediately after the call, there may
+ // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+ MachineBasicBlock::iterator CI = I;
+ MachineBasicBlock::iterator B = MBB.begin();
+ // Walk back until CI sits directly after a call (or at the block start).
+ while (CI != B && !std::prev(CI)->isCall())
+ --CI;
+ BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
+ }
+
+ return I;
+}
+
+/// A block may host the prologue unless the function needs stack
+/// realignment while EFLAGS is live into the block.
+bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
+  assert(MBB.getParent() && "Block is not attached to a function!");
+  const MachineFunction &ParentFn = *MBB.getParent();
+  if (!TRI->needsStackRealignment(ParentFn))
+    return true;
+  return !MBB.isLiveIn(X86::EFLAGS);
+}
+
+bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+ assert(MBB.getParent() && "Block is not attached to a function!");
+
+ // Win64 imposes strict requirements on epilogues and we do not want to
+ // disturb them. On that target, only a block that is already an exit
+ // block (no successors, or a return block) may host the epilogue.
+ const bool IsWin64 = STI.isTargetWin64();
+ if (IsWin64 && !(MBB.succ_empty() || MBB.isReturnBlock()))
+ return false;
+
+ // When LEA can be used for the SP adjustment, the block is acceptable.
+ // Otherwise an ADD may be required, and ADD clobbers EFLAGS, so the
+ // block qualifies only if the flags do not have to be preserved before
+ // the terminators; in that case we conservatively report the block as
+ // unsafe for epilogue insertion.
+ const MachineFunction &ParentFn = *MBB.getParent();
+ return canUseLEAForSPInEpilogue(ParentFn) ||
+ !flagsNeedToBePreservedBeforeTheTerminators(MBB);
+}
+
+bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ // Frameless compact unwind information is currently broken (PR25614), so
+ // only allow functions that are nounwind or that have a frame pointer.
+ return (MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) &&
+ // The lowering of segmented stacks and of HiPE only supports entry
+ // blocks as prologue blocks: PR26107.
+ // This limitation may be lifted if we fix:
+ // - adjustForSegmentedStacks
+ // - adjustForHiPEPrologue
+ MF.getFunction()->getCallingConv() != CallingConv::HiPE &&
+ !MF.shouldSplitStack();
+}
+
+MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool RestoreSP) const { // Emits before MBBI, returns it.
+ assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
+ assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
+ assert(STI.is32Bit() && !Uses64BitFramePtr &&
+ "restoring EBP/ESI on non-32-bit target");
+
+ MachineFunction &MF = *MBB.getParent();
+ unsigned FramePtr = TRI->getFrameRegister(MF);
+ unsigned BasePtr = TRI->getBaseRegister();
+ WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // FIXME: Don't set the FrameSetup flag in the catchret case.
+
+ int FI = FuncInfo.EHRegNodeFrameIndex; // Frame index of the EH registration node.
+ int EHRegSize = MFI.getObjectSize(FI);
+
+ if (RestoreSP) {
+ // Reload ESP from memory: MOV32rm -EHRegSize(%ebp), %esp
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
+ X86::EBP, true, -EHRegSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ unsigned UsedReg; // Set by getFrameIndexReference to the register FI is based on.
+ int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg);
+ int EndOffset = -EHRegOffset - EHRegSize;
+ FuncInfo.EHRegNodeEndOffset = EndOffset; // Recorded for later use by WinEH code.
+
+ if (UsedReg == FramePtr) {
+ // Registration node is addressed off the frame pointer: ADD $offset, %ebp
+ unsigned ADDri = getADDriOpcode(false, EndOffset);
+ BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
+ .addReg(FramePtr)
+ .addImm(EndOffset)
+ .setMIFlag(MachineInstr::FrameSetup)
+ ->getOperand(3) // Presumably the implicit EFLAGS def -- TODO confirm.
+ .setIsDead();
+ assert(EndOffset >= 0 &&
+ "end of registration object above normal EBP position!");
+ } else if (UsedReg == BasePtr) {
+ // Recompute the base pointer first: LEA offset(%ebp), %esi
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
+ FramePtr, false, EndOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ // Then reload the saved frame pointer: MOV32rm SavedEBPOffset(%esi), %ebp
+ assert(X86FI->getHasSEHFramePtrSave());
+ int Offset =
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+ assert(UsedReg == BasePtr); // The save slot must also be BasePtr-relative.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
+ UsedReg, true, Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
+ }
+ return MBBI;
+}
+
+namespace {
+// Per-object record used by orderFrameObjects to sort the stack objects.
+struct X86FrameSortingObject {
+ bool IsValid = false; // True if this object takes part in the ordering.
+ unsigned ObjectIndex = 0; // Index of the object in the MFI list.
+ unsigned ObjectSize = 0; // Size in bytes (orderFrameObjects stores 4 if MFI reports 0).
+ unsigned ObjectAlignment = 1; // Alignment of the object in bytes.
+ unsigned ObjectNumUses = 0; // Static number of operands referencing the object.
+};
+
+// The comparison function used when sorting our local stack symbols
+// (orderFrameObjects passes it to std::stable_sort). The current algorithm
+// is to use an estimated "density". This takes into consideration the size
+// and number of uses each object has in order to roughly minimize code size.
+// So, for example, an object of size 16B that is referenced 5 times
+// will get higher priority than 4 4B objects referenced 1 time each.
+// It's not perfect and we may be able to squeeze a few more bytes out of
+// it (for example : 0(esp) requires fewer bytes, symbols allocated at the
+// fringe end can have special consideration, given their size is less
+// important, etc.), but the algorithmic complexity grows too much to be
+// worth the extra gains we get. This gets us pretty close.
+// The final order leaves us with objects with highest priority going
+// at the end of our list. The call operator is const: it has no state.
+struct X86FrameSortingComparator {
+ inline bool operator()(const X86FrameSortingObject &A,
+ const X86FrameSortingObject &B) const {
+ uint64_t DensityAScaled, DensityBScaled;
+
+ // For consistency in our comparison, all invalid objects are placed
+ // at the end. This also allows us to stop walking when we hit the
+ // first invalid item after it's all sorted.
+ if (!A.IsValid)
+ return false;
+ if (!B.IsValid)
+ return true;
+
+ // The density is conceptually:
+ // (double)DensityA = A.ObjectNumUses / A.ObjectSize
+ // (double)DensityB = B.ObjectNumUses / B.ObjectSize
+ // Since this approach may cause inconsistencies in
+ // the floating point <, >, == comparisons, depending on the floating
+ // point model with which the compiler was built, we're going
+ // to scale both sides by multiplying with
+ // A.ObjectSize * B.ObjectSize. This ends up factoring away
+ // the division and, with it, the need for any floating point
+ // arithmetic. The operands are widened to 64 bits before multiplying.
+ DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
+ static_cast<uint64_t>(B.ObjectSize);
+ DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
+ static_cast<uint64_t>(A.ObjectSize);
+
+ // If the two densities are equal, order by alignment, so that the
+ // highest-alignment objects sort last. This allows similar alignment
+ // objects to be packed together (given the same density).
+ // There's room for improvement here, also, since we can pack
+ // similar alignment (different density) objects next to each
+ // other to save padding. This will also require further
+ // complexity/iterations, and the overall gain isn't worth it,
+ // in general. Something to keep in mind, though.
+ if (DensityAScaled == DensityBScaled)
+ return A.ObjectAlignment < B.ObjectAlignment;
+
+ return DensityAScaled < DensityBScaled;
+ }
+};
+} // namespace
+
+// Order the symbols in the local stack.
+// We want to place the local stack objects in some sort of sensible order.
+// The heuristic we use is to try and pack them according to static number
+// of uses and size of object in order to minimize code size.
+void X86FrameLowering::orderFrameObjects(
+ const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Don't waste time if there's nothing to do.
+ if (ObjectsToAllocate.empty())
+ return;
+
+ // Create an array of all MFI objects. We won't need all of these
+ // objects, but we're going to create a full array of them to make
+ // it easier to index into when we're counting "uses" down below.
+ // We want to be able to easily/cheaply access an object by simply
+ // indexing into it, instead of having to search for it every time.
+ std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());
+
+ // Walk the objects we care about and mark them as such in our working
+ // struct, recording each object's index, alignment and size.
+ for (auto &Obj : ObjectsToAllocate) {
+ SortingObjects[Obj].IsValid = true;
+ SortingObjects[Obj].ObjectIndex = Obj;
+ SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj);
+ // Set the size.
+ int ObjectSize = MFI.getObjectSize(Obj);
+ if (ObjectSize == 0)
+ // Variable size. Just use 4.
+ SortingObjects[Obj].ObjectSize = 4;
+ else
+ SortingObjects[Obj].ObjectSize = ObjectSize;
+ }
+
+ // Count the number of uses for each object.
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.isDebugValue()) // Debug values are not counted as uses.
+ continue;
+ for (const MachineOperand &MO : MI.operands()) {
+ // Check to see if it's a local stack symbol.
+ if (!MO.isFI())
+ continue;
+ int Index = MO.getIndex();
+ // Check to see if it falls within our range, and is tagged
+ // to require ordering.
+ if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
+ SortingObjects[Index].IsValid)
+ SortingObjects[Index].ObjectNumUses++;
+ }
+ }
+ }
+
+ // Sort the objects using X86FrameSortingComparator (see its comment for
+ // info).
+ std::stable_sort(SortingObjects.begin(), SortingObjects.end(),
+ X86FrameSortingComparator());
+
+ // Now modify the original list to represent the final order that
+ // we want. The order will depend on whether we're going to access them
+ // from the stack pointer or the frame pointer. For SP, the list should
+ // end up with the END containing objects that we want with smaller offsets.
+ // For FP, it should be flipped.
+ int i = 0; // Next slot of ObjectsToAllocate to overwrite.
+ for (auto &Obj : SortingObjects) {
+ // All invalid items are sorted at the end, so it's safe to stop.
+ if (!Obj.IsValid)
+ break;
+ ObjectsToAllocate[i++] = Obj.ObjectIndex;
+ }
+
+ // Flip it if we're accessing off of the FP (FP present, no realignment).
+ if (!TRI->needsStackRealignment(MF) && hasFP(MF))
+ std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
+}
+
+
+unsigned
+X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
+ // Accumulated in prologue order: RDX, the parent frame pointer, is homed
+ // into 16(%rsp); RBP is pushed immediately afterwards; all callee-saved
+ // registers are then pushed; and every funclet allocates enough stack
+ // space for the largest outgoing call.
+ const unsigned HomedRDX = 16, PushedRBP = SlotSize;
+ const unsigned CSRBytes =
+ MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+ return HomedRDX + PushedRBP + CSRBytes + getWinEHFuncletFrameSize(MF);
+}
+
+void X86FrameLowering::processFunctionBeforeFrameFinalized(
+ MachineFunction &MF, RegScavenger *RS) const {
+ // Lays out the catch objects and the UnwindHelp slot for Win64-style C++
+ // EH; functions not using it need nothing and return early.
+ const Function *Fn = MF.getFunction();
+ if (!STI.is64Bit() || !MF.hasEHFunclets() ||
+ classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX)
+ return;
+
+ // Win64 C++ EH needs the UnwindHelp object at a fixed offset relative to RSP
+ // after the prologue. Find the lowest fixed-object offset so a slot can be
+ // allocated right below it; with no fixed objects, start at -SlotSize, just
+ // below the return address. SlotSize is unsigned, so widen it before
+ // negating. Fixed objects have negative frame indices.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
+ int64_t MinFixedObjOffset = -static_cast<int64_t>(SlotSize);
+ for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
+ MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));
+
+ for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+ for (WinEHHandlerType &H : TBME.HandlerArray) {
+ int FrameIndex = H.CatchObj.FrameIndex;
+ if (FrameIndex != INT_MAX) { // INT_MAX marks "no catch object" here.
+ // Move down to the next multiple of Align (no change if aligned).
+ unsigned Align = MFI.getObjectAlignment(FrameIndex);
+ MinFixedObjOffset -= (Align - std::abs(MinFixedObjOffset) % Align) % Align;
+ MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
+ MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
+ }
+ }
+ }
+
+ // Move down to the next multiple of 8 before placing UnwindHelp.
+ MinFixedObjOffset -= (8 - std::abs(MinFixedObjOffset) % 8) % 8;
+ int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
+ int UnwindHelpFI =
+ MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
+ EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
+
+ // Store -2 into UnwindHelp on function entry. We have to scan forwards past
+ // other frame setup instructions.
+ MachineBasicBlock &MBB = MF.front();
+ auto MBBI = MBB.begin();
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+ ++MBBI;
+
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+ addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
+ UnwindHelpFI)
+ .addImm(-2);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
new file mode 100644
index 000000000000..e1b04d6dc300
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -0,0 +1,218 @@
+//===-- X86FrameLowering.h - Define frame lowering for X86 -------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements X86-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
+#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class MachineInstrBuilder;
+class MCCFIInstruction;
+class X86Subtarget;
+class X86RegisterInfo;
+
+class X86FrameLowering : public TargetFrameLowering {
+public:
+ X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride);
+
+ // Cached subtarget predicates.
+
+ const X86Subtarget &STI;
+ const TargetInstrInfo &TII;
+ const X86RegisterInfo *TRI;
+
+ unsigned SlotSize; // Size in bytes of one pushed stack slot.
+
+ /// Is64Bit implies that x86_64 instructions are available.
+ bool Is64Bit;
+
+ bool IsLP64; // Presumably true for the LP64 data model -- TODO confirm.
+
+ /// True if the 64-bit frame or stack pointer should be used. True for most
+ /// 64-bit targets with the exception of x32. If this is false, 32-bit
+ /// instruction operands should be used to manipulate StackPtr and FramePtr.
+ bool Uses64BitFramePtr;
+
+ unsigned StackPtr; // Presumably the stack pointer register -- TODO confirm.
+
+ /// Emit target stack probe code. This is required for all
+ /// large stack allocations on Windows. The caller is required to materialize
+ /// the number of bytes to probe in RAX/EAX.
+ void emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ bool InProlog) const;
+
+ /// Replace a StackProbe inline-stub with the actual probe code inline.
+ void inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const override;
+
+ void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const;
+
+ /// emitPrologue/emitEpilogue - These methods insert prolog and epilog code
+ /// into the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ void adjustForSegmentedStacks(MachineFunction &MF,
+ MachineBasicBlock &PrologueMBB) const override;
+
+ void adjustForHiPEPrologue(MachineFunction &MF,
+ MachineBasicBlock &PrologueMBB) const override;
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+ bool needsFrameIndexResolution(const MachineFunction &MF) const override;
+
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+
+ int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
+ unsigned &FrameReg,
+ bool IgnoreSPUpdates) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+ unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
+ /// Check the instruction before/after the passed instruction. If
+ /// it is an ADD/SUB/LEA instruction, it is deleted and the
+ /// stack adjustment is returned as a positive value for ADD/LEA and
+ /// a negative for SUB.
+ int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ bool doMergeWithPrevious) const;
+
+ /// Emit a series of instructions to increment / decrement the stack
+ /// pointer by a constant value.
+ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ int64_t NumBytes, bool InEpilogue) const;
+
+ /// Check that LEA can be used on SP in an epilogue sequence for \p MF.
+ bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const;
+
+ /// Check whether or not the given \p MBB can be used as a prologue
+ /// for the target.
+ /// The prologue will be inserted first in this basic block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+ /// As soon as the target enables shrink-wrapping without overriding
+ /// this method, we assume that each basic block is a valid
+ /// prologue.
+ bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+
+ /// Check whether or not the given \p MBB can be used as an epilogue
+ /// for the target.
+ /// The epilogue will be inserted before the first terminator of that block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+ /// Returns true if the target will correctly handle shrink wrapping.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
+ /// Order the symbols in the local stack.
+ /// We want to place the local stack objects in some sort of sensible order.
+ /// The heuristic we use is to try and pack them according to static number
+ /// of uses and size in order to minimize code size.
+ void orderFrameObjects(const MachineFunction &MF,
+ SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
+ /// convertArgMovsToPushes - This method tries to convert a call sequence
+ /// that uses sub and mov instructions to put the argument onto the stack
+ /// into a series of pushes.
+ /// Returns true if the transformation succeeded, false if not.
+ bool convertArgMovsToPushes(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ uint64_t Amount) const;
+
+ /// Wraps up getting a CFI index and building a MachineInstr for it.
+ void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const MCCFIInstruction &CFIInst) const;
+
+ /// Sets up EBP and optionally ESI based on the incoming EBP value. Only
+ /// needed for 32-bit. Used in funclet prologues and at catchret destinations.
+ MachineBasicBlock::iterator
+ restoreWin32EHStackPointers(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool RestoreSP = false) const;
+
+private:
+ uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
+
+ /// Emit target stack probe as a call to a helper function.
+ void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ bool InProlog) const;
+
+ /// Emit target stack probe as an inline sequence.
+ void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const;
+
+ /// Emit a stub to later inline the target stack probe.
+ void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const;
+
+ /// Aligns the stack pointer by ANDing it with -MaxAlign.
+ void BuildStackAlignAND(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ unsigned Reg, uint64_t MaxAlign) const;
+
+ /// Make small positive stack adjustments using POPs.
+ bool adjustStackWithPops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ int Offset) const;
+
+ /// Adjusts the stack pointer using LEA, SUB, or ADD.
+ MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t Offset,
+ bool InEpilogue) const;
+
+ unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const;
+
+ unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..8b66790679d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,2798 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <stdint.h>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+// Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// This corresponds to X86AddressMode, but uses SDValues instead of register
+ /// numbers for the leaves of the matched tree.
+ struct X86ISelAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ // This is really a union, discriminated by BaseType!
+ SDValue Base_Reg;
+ int Base_FrameIndex;
+
+ unsigned Scale;
+ SDValue IndexReg;
+ int32_t Disp;
+ SDValue Segment;
+ const GlobalValue *GV;
+ const Constant *CP;
+ const BlockAddress *BlockAddr;
+ const char *ES;
+ MCSymbol *MCSym;
+ int JT; // -1 when no jump table is referenced.
+ unsigned Align; // CP alignment.
+ unsigned char SymbolFlags; // X86II::MO_*
+
+ X86ISelAddressMode()
+ : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
+ Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
+ MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
+
+ bool hasSymbolicDisplacement() const { // True if any symbol field is set.
+ return GV != nullptr || CP != nullptr || ES != nullptr ||
+ MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
+ }
+
+ bool hasBaseOrIndexReg() const { // A frame-index base counts as a base.
+ return BaseType == FrameIndexBase ||
+ IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
+ }
+
+ /// Return true if this addressing mode is already RIP-relative.
+ bool isRIPRelative() const {
+ if (BaseType != RegBase) return false;
+ if (RegisterSDNode *RegNode =
+ dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
+ return RegNode->getReg() == X86::RIP;
+ return false;
+ }
+
+ void setBaseReg(SDValue Reg) { // Also switches BaseType to RegBase.
+ BaseType = RegBase;
+ Base_Reg = Reg;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() { // Debug-only: prints the fields to dbgs().
+ dbgs() << "X86ISelAddressMode " << this << '\n';
+ dbgs() << "Base_Reg ";
+ if (Base_Reg.getNode())
+ Base_Reg.getNode()->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
+ << " Scale" << Scale << '\n'
+ << "IndexReg ";
+ if (IndexReg.getNode())
+ IndexReg.getNode()->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " Disp " << Disp << '\n'
+ << "GV ";
+ if (GV)
+ GV->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " CP ";
+ if (CP)
+ CP->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << '\n'
+ << "ES ";
+ if (ES)
+ dbgs() << ES;
+ else
+ dbgs() << "nul";
+ dbgs() << " MCSym ";
+ if (MCSym)
+ dbgs() << MCSym; // NOTE(review): streams the pointer itself -- confirm intended.
+ else
+ dbgs() << "nul";
+ dbgs() << " JT" << JT << " Align" << Align << '\n';
+ }
+#endif
+ };
+}
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// ISel - X86-specific code to select X86 machine instructions for
+ /// SelectionDAG operations.
+ ///
+ class X86DAGToDAGISel final : public SelectionDAGISel {
+ /// Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// If true, selector should try to optimize for code size instead of
+ /// performance.
+ bool OptForSize;
+
+ /// If true, selector should try to optimize for minimum code size.
+ bool OptForMinSize;
+
+ public:
+ explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
+ OptForMinSize(false) {} // Subtarget is bound in runOnMachineFunction().
+
+ StringRef getPassName() const override { // Human-readable name of this pass.
+ return "X86 DAG->DAG Instruction Selection";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Re-fetch the subtarget from MF each time through, then run the base pass.
+ Subtarget = &MF.getSubtarget<X86Subtarget>();
+ SelectionDAGISel::runOnMachineFunction(MF); // Its result is not used.
+ return true; // Always reports true.
+ }
+
+ void EmitFunctionEntryCode() override;
+
+ bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
+
+ void PreprocessISelDAG() override;
+
+ inline bool immSext8(SDNode *N) const { // N is cast<> to ConstantSDNode.
+ return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); // Fits in int8?
+ }
+
+ // True when N's zero-extended value equals its low 32 bits sign-extended.
+ inline bool i64immSExt32(SDNode *N) const {
+ const uint64_t Raw = cast<ConstantSDNode>(N)->getZExtValue();
+ return static_cast<int64_t>(Raw) == static_cast<int32_t>(Raw);
+ }
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+ private:
+ void Select(SDNode *N) override;
+ bool tryGather(SDNode *N, unsigned Opc);
+
+ bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
+ bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
+ bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
+ bool matchAddress(SDValue N, X86ISelAddressMode &AM);
+ bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
+ bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth);
+ bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
+ bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectMOV64Imm32(SDValue N, SDValue &Imm);
+ bool selectLEAAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectLEA64_32Addr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectTLSADDRAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectScalarSSELoad(SDNode *Root, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment,
+ SDValue &NodeWithChain);
+ bool selectRelocImm(SDValue N, SDValue &Op);
+
+ bool tryFoldLoad(SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
+ /// Implement addressing mode selection for inline asm expressions.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+ void emitSpecialCodeForMain();
+
+ inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) { // Fills the five outputs from AM.
+ Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ ? CurDAG->getTargetFrameIndex(
+ AM.Base_FrameIndex,
+ TLI->getPointerTy(CurDAG->getDataLayout()))
+ : AM.Base_Reg;
+ Scale = getI8Imm(AM.Scale, DL);
+ Index = AM.IndexReg;
+ // Displacements are 32-bit even in 64-bit mode since a RIP-relative
+ // offset is 32-bit. At most one symbolic kind is used, in this order.
+ if (AM.GV)
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), // NOTE(review): not DL.
+ MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ else if (AM.CP)
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+ AM.Align, AM.Disp, AM.SymbolFlags);
+ else if (AM.ES) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
+ Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+ } else if (AM.MCSym) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
+ assert(AM.SymbolFlags == 0 && "oo"); // MCSym carries no symbol flags.
+ Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
+ } else if (AM.JT != -1) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
+ Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+ } else if (AM.BlockAddr)
+ Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ else
+ Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); // Plain constant.
+
+ if (AM.Segment.getNode())
+ Segment = AM.Segment;
+ else
+ Segment = CurDAG->getRegister(0, MVT::i32); // Register 0: no segment set.
+ }
+
+ // Utility function to determine whether we should avoid selecting
+ // immediate forms of instructions for better code size or not.
+ // At a high level, we'd like to avoid such instructions when
+ // we have similar constants used within the same basic block
+ // that can be kept in a register. Returns true when N has more than
+ // one counted use (counting stops as soon as two have been found).
+ bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
+ uint32_t UseCount = 0;
+
+ // Do not want to hoist if we're not optimizing for size.
+ // TODO: We'd like to remove this restriction.
+ // See the comment in X86InstrInfo.td for more info.
+ if (!OptForSize)
+ return false;
+
+ // Walk all the users of the immediate.
+ for (SDNode::use_iterator UI = N->use_begin(),
+ UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
+
+ SDNode *User = *UI;
+
+ // This user is already selected. Count it as a legitimate use and
+ // move on.
+ if (User->isMachineOpcode()) {
+ UseCount++;
+ continue;
+ }
+
+ // We want to count stores of immediates as real uses.
+ if (User->getOpcode() == ISD::STORE &&
+ User->getOperand(1).getNode() == N) {
+ UseCount++;
+ continue;
+ }
+
+ // We only consider users that have exactly 2 operands (except
+ // for stores, which are handled above).
+ // Other instructions won't match in ISEL, for now, and would
+ // be counted incorrectly.
+ // This may change in the future as we add additional instruction
+ // types.
+ if (User->getNumOperands() != 2)
+ continue;
+
+ // Immediates that are used for offsets as part of stack
+ // manipulation should be left alone. These are typically
+ // used to indicate SP offsets for argument passing and
+ // will get pulled into stores/pushes (implicitly).
+ if (User->getOpcode() == X86ISD::ADD ||
+ User->getOpcode() == ISD::ADD ||
+ User->getOpcode() == X86ISD::SUB ||
+ User->getOpcode() == ISD::SUB) {
+
+ // Find the other operand of the add/sub.
+ SDValue OtherOp = User->getOperand(0);
+ if (OtherOp.getNode() == N)
+ OtherOp = User->getOperand(1);
+
+ // Don't count if the other operand is SP (ESP or RSP).
+ RegisterSDNode *RegNode;
+ if (OtherOp->getOpcode() == ISD::CopyFromReg &&
+ (RegNode = dyn_cast_or_null<RegisterSDNode>(
+ OtherOp->getOperand(1).getNode())))
+ if ((RegNode->getReg() == X86::ESP) ||
+ (RegNode->getReg() == X86::RSP))
+ continue;
+ }
+
+ // ... otherwise, count this and move on.
+ UseCount++;
+ }
+
+ // If we have more than 1 use, then recommend for hoisting.
+ return (UseCount > 1);
+ }
+
+ /// Build a target constant node of type i8 that holds \p Imm.
+ SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
+   const MVT ImmVT = MVT::i8;
+   return CurDAG->getTargetConstant(Imm, DL, ImmVT);
+ }
+
+ /// Build a target constant node of type i32 that holds \p Imm.
+ SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
+   const MVT ImmVT = MVT::i32;
+   return CurDAG->getTargetConstant(Imm, DL, ImmVT);
+ }
+
+ /// Return an SDNode that returns the value of the global base register.
+ /// Output instructions required to initialize the global base register,
+ /// if necessary.
+ SDNode *getGlobalBaseReg();
+
+ /// Give access to the TargetMachine as its X86-specific type.
+ const X86TargetMachine &getTargetMachine() const {
+   const auto &X86TM = static_cast<const X86TargetMachine &>(TM);
+   return X86TM;
+ }
+
+ /// Return a pointer to the target-specific instruction info (X86InstrInfo),
+ /// as provided by the current subtarget.
+ const X86InstrInfo *getInstrInfo() const {
+ return Subtarget->getInstrInfo();
+ }
+
+ /// \brief Report that the complex-pattern matching functions may mutate the
+ /// DAG: address-mode matching performs shift-of-and to and-of-shift
+ /// reassociation in order to expose more scaled addressing opportunities.
+ bool ComplexPatternFuncMutatesDAG() const override {
+ return true; // Unconditional for this selector.
+ }
+ };
+}
+
+
+/// Decide whether folding the value \p N into its user \p U (while selecting
+/// \p Root) is worthwhile.
+///
+/// Nothing is folded at -O0 or when \p N has more than one use. Non-load
+/// values are otherwise always folded. A load feeding \p Root directly is not
+/// folded when the root is an add/and/or/xor style node (or X86ISD::SUB) whose
+/// operand 1 is better folded instead: an 8-bit signed immediate or a wrapped
+/// TLS address.
+bool
+X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
+  if (OptLevel == CodeGenOpt::None || !N.hasOneUse())
+    return false;
+
+  // Only loads that feed the root directly need the extra checks below.
+  if (N.getOpcode() != ISD::LOAD || U != Root)
+    return true;
+
+  // Filter for the opcodes where folding operand 1 can beat folding the load.
+  switch (U->getOpcode()) {
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::AND:
+  case X86ISD::XOR:
+  case X86ISD::OR:
+  case ISD::ADD:
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    break;
+  default:
+    return true;
+  }
+
+  SDValue RHS = U->getOperand(1);
+
+  // If the other operand is a 8-bit immediate we should fold the immediate
+  // instead. This reduces code size.
+  // e.g.
+  //   movl 4(%esp), %eax
+  //   addl $4, %eax
+  // vs.
+  //   movl $4, %eax
+  //   addl 4(%esp), %eax
+  // The former is 2 bytes shorter. In case where the increment is 1, then
+  // the saving can be 4 bytes (by using incl %eax).
+  if (auto *ImmNode = dyn_cast<ConstantSDNode>(RHS)) {
+    if (ImmNode->getAPIntValue().isSignedIntN(8))
+      return false;
+  }
+
+  // If the other operand is a TLS address, we should fold it instead.
+  // This produces
+  //   movl %gs:0, %eax
+  //   leal i@NTPOFF(%eax), %eax
+  // instead of
+  //   movl $i@NTPOFF, %eax
+  //   addl %gs:0, %eax
+  // if the block also has an access to a second TLS address this will save
+  // a load.
+  // FIXME: This is probably also true for non-TLS addresses.
+  if (RHS.getOpcode() == X86ISD::Wrapper &&
+      RHS.getOperand(0).getOpcode() == ISD::TargetGlobalTLSAddress)
+    return false;
+
+  return true;
+}
+
+/// Move \p Load below \p OrigChain: OrigChain takes over the load's input chain,
+/// the load is chained to the call's old chain, and \p Call is chained to the load.
+static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Call, SDValue OrigChain) {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Chain = OrigChain.getOperand(0);
+ if (Chain.getNode() == Load.getNode()) // OrigChain is chained directly to the load.
+ Ops.push_back(Load.getOperand(0));
+ else {
+ assert(Chain.getOpcode() == ISD::TokenFactor &&
+ "Unexpected chain operand");
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) // Rebuild the TokenFactor without the load.
+ if (Chain.getOperand(i).getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0)); // Substitute the load's own input chain.
+ else
+ Ops.push_back(Chain.getOperand(i));
+ SDValue NewChain =
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
+ Ops.clear();
+ Ops.push_back(NewChain);
+ }
+ Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); // Keep OrigChain's remaining operands.
+ CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
+ CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), // The load now hangs off the call's old chain.
+ Load.getOperand(1), Load.getOperand(2));
+
+ Ops.clear();
+ Ops.push_back(SDValue(Load.getNode(), 1)); // Result 1 of the load becomes the call's chain operand.
+ Ops.append(Call->op_begin() + 1, Call->op_end());
+ CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
+}
+
+/// Check whether the call target \p Callee is a load that may be moved below
+/// CALLSEQ_START and the chain nodes leading up to the call.
+///
+/// \p Chain is an in/out parameter: when \p HasCallSeq is set it is walked
+/// up (through operand 0) until the CALLSEQ_START node is reached, and that
+/// node is handed back to the caller. A tail call has no callseq node
+/// between the call chain and the load, so \p Chain is left untouched then.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+  // The transformation is somewhat dangerous if the call's chain was glued to
+  // the call. After moveBelowOrigChain the load is moved between the call and
+  // the chain, this can create a cycle if the load is not folded. So it is
+  // *really* important that we are sure the load will be folded.
+  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+    return false;
+
+  // Only a plain load qualifies: non-volatile, unindexed, non-extending.
+  auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+  if (!LD)
+    return false;
+  const bool IsPlainLoad = !LD->isVolatile() &&
+                           LD->getAddressingMode() == ISD::UNINDEXED &&
+                           LD->getExtensionType() == ISD::NON_EXTLOAD;
+  if (!IsPlainLoad)
+    return false;
+
+  // Now let's find the callseq_start. Every node passed on the way must have
+  // a single use.
+  if (HasCallSeq) {
+    while (Chain.getOpcode() != ISD::CALLSEQ_START) {
+      if (!Chain.hasOneUse())
+        return false;
+      Chain = Chain.getOperand(0);
+    }
+  }
+
+  if (Chain.getNumOperands() == 0)
+    return false;
+
+  // Since we are not checking for AA here, conservatively abort if the chain
+  // writes to memory. It's not safe to move the callee (a load) across a
+  // store.
+  if (auto *Mem = dyn_cast<MemSDNode>(Chain.getNode())) {
+    if (Mem->writeMem())
+      return false;
+  }
+
+  // The load must be what the chain hangs off, either directly or through a
+  // TokenFactor in which the load's chain result has no other user.
+  SDValue ChainIn = Chain.getOperand(0);
+  if (ChainIn.getNode() == Callee.getNode())
+    return true;
+  return ChainIn.getOpcode() == ISD::TokenFactor &&
+         Callee.getValue(1).isOperandOf(ChainIn.getNode()) &&
+         Callee.getValue(1).hasOneUse();
+}
+
+void X86DAGToDAGISel::PreprocessISelDAG() {
+ // Cache OptFor[Min]Size; they are used in pattern predicates that isel is matching.
+ OptForSize = MF->getFunction()->optForSize();
+ OptForMinSize = MF->getFunction()->optForMinSize();
+ assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
+
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ) {
+ SDNode *N = &*I++; // Step I past N up front to avoid invalidation issues.
+
+ if (OptLevel != CodeGenOpt::None &&
+ // Only do this when the target does not favor register-indirect
+ // calls.
+ ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
+ (N->getOpcode() == X86ISD::TC_RETURN &&
+ // Only do this if the load can be folded into TC_RETURN.
+ (Subtarget->is64Bit() ||
+ !getTargetMachine().isPositionIndependent())))) {
+ /// Also try moving call address load from outside callseq_start to just
+ /// before the call to allow it to be folded.
+ ///
+ /// [Load chain]
+ /// ^
+ /// |
+ /// [Load]
+ /// ^ ^
+ /// | |
+ /// / \--
+ /// / |
+ ///[CALLSEQ_START] |
+ /// ^ |
+ /// | |
+ /// [LOAD/C2Reg] |
+ /// | |
+ /// \ /
+ /// \ /
+ /// [CALL]
+ bool HasCallSeq = N->getOpcode() == X86ISD::CALL; // TC_RETURN has no callseq to walk.
+ SDValue Chain = N->getOperand(0);
+ SDValue Load = N->getOperand(1);
+ if (!isCalleeLoad(Load, Chain, HasCallSeq)) // May update Chain by reference.
+ continue;
+ moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
+ ++NumLoadMoved;
+ continue;
+ }
+
+ // Lower fpround and fpextend nodes that target the FP stack to be store and
+ // load to the stack. This is a gross hack. We would like to simply mark
+ // these as being illegal, but when we do that, legalize produces these when
+ // it expands calls, then expands these in the same legalize pass. We would
+ // like dag combine to be able to hack on these between the call expansion
+ // and the node legalization. As such this pass basically does "really
+ // late" legalization of these inline with the X86 isel pass.
+ // FIXME: This should only happen when not compiled with -O0.
+ if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+ continue;
+
+ MVT SrcVT = N->getOperand(0).getSimpleValueType();
+ MVT DstVT = N->getSimpleValueType(0);
+
+ // If either the source or the destination is a vector, no fp stack is involved.
+ if (SrcVT.isVector() || DstVT.isVector())
+ continue;
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(TLI);
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
+
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(1))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT;
+ if (N->getOpcode() == ISD::FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ SDLoc dl(N);
+
+ // FIXME: optimize the case where the src/dest is a load or store?
+ SDValue Store =
+ CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
+ MemTmp, MachinePointerInfo(), MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ MachinePointerInfo(), MemVT);
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+ // extload we created. This will cause general havoc on the dag because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+ // Now that we did that, the node is dead. Increment the iterator to the
+ // next node to process, then delete N.
+ ++I;
+ CurDAG->DeleteNode(N);
+ }
+}
+
+
+/// Emit the code that must run only in the program's main function.
+///
+/// On Cygwin/MinGW targets this lowers a C-convention call to the external
+/// symbol "__main" (no arguments, void result), chained after the current
+/// DAG root, and makes the call's output chain the new root. Other targets
+/// need nothing.
+void X86DAGToDAGISel::emitSpecialCodeForMain() {
+  if (!Subtarget->isTargetCygMing())
+    return;
+
+  TargetLowering::ArgListTy NoArgs;
+  auto &Layout = CurDAG->getDataLayout();
+
+  TargetLowering::CallLoweringInfo CLI(*CurDAG);
+  CLI.setChain(CurDAG->getRoot())
+      .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
+                 CurDAG->getExternalSymbol("__main",
+                                           TLI->getPointerTy(Layout)),
+                 std::move(NoArgs));
+
+  // Lower through the DAG's TargetLowering and adopt the returned chain.
+  const TargetLowering &Lowering = CurDAG->getTargetLoweringInfo();
+  std::pair<SDValue, SDValue> Lowered = Lowering.LowerCallTo(CLI);
+  CurDAG->setRoot(Lowered.second);
+}
+
+/// Emit function-entry code; only an externally visible function named
+/// "main" gets any, via emitSpecialCodeForMain().
+void X86DAGToDAGISel::EmitFunctionEntryCode() {
+  const Function *Fn = MF->getFunction();
+  if (!Fn)
+    return;
+
+  const bool IsExternalMain =
+      Fn->hasExternalLinkage() && Fn->getName() == "main";
+  if (IsExternalMain)
+    emitSpecialCodeForMain();
+}
+
+/// Return true if \p Val fits in a signed 31-bit integer.
+///
+/// On 64-bit platforms a frame index carries a displacement of its own that
+/// is later added to the explicit one, and the sum may overflow the
+/// displacement field. Assuming the frame index displacement fits into 31
+/// bits (only slightly more aggressive than the existing fundamental
+/// assumption that it fits into 32 bits), a 31-bit explicit displacement
+/// should always be safe.
+static bool isDispSafeForFrameIndex(int64_t Val) {
+  const bool FitsIn31Bits = isInt<31>(Val);
+  return FitsIn31Bits;
+}
+
+/// Try to add \p Offset to the displacement of \p AM.
+///
+/// Returns true (leaving \p AM untouched) when the fold is not possible and
+/// false once AM.Disp has been updated.
+bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
+                                            X86ISelAddressMode &AM) {
+  // Cannot combine ExternalSymbol displacements with integer offsets.
+  const bool HasNamedSymbol = AM.ES || AM.MCSym;
+  if (HasNamedSymbol && Offset != 0)
+    return true;
+
+  int64_t NewDisp = AM.Disp + Offset;
+
+  if (Subtarget->is64Bit()) {
+    CodeModel::Model Model = TM.getCodeModel();
+    const bool FitsCodeModel = X86::isOffsetSuitableForCodeModel(
+        NewDisp, Model, AM.hasSymbolicDisplacement());
+    // In addition to the checks required for a register base, check that
+    // we do not try to use an unsafe Disp with a frame index.
+    const bool UnsafeWithFrameIndex =
+        AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
+        !isDispSafeForFrameIndex(NewDisp);
+    if (!FitsCodeModel || UnsafeWithFrameIndex)
+      return true;
+  }
+
+  AM.Disp = NewDisp;
+  return false;
+}
+
+/// Fold a load of address 0 from the GS/FS address space into a segment
+/// register of \p AM. Returns false when the fold was done, true otherwise.
+///
+///   load gs:0 -> GS segment register.
+///   load fs:0 -> FS segment register.
+///
+/// This optimization is valid because the GNU TLS model defines that
+/// gs:0 (or fs:0 on X86-64) contains its own address.
+/// For more information see http://people.redhat.com/drepper/tls.pdf
+bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
+  // The loaded address must be the constant 0.
+  auto *AddrC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!AddrC || AddrC->getSExtValue() != 0)
+    return true;
+
+  // A segment must not be set yet, and the target must use glibc.
+  if (AM.Segment.getNode() != nullptr || !Subtarget->isTargetGlibc())
+    return true;
+
+  unsigned SegReg;
+  switch (N->getPointerInfo().getAddrSpace()) {
+  case 256:
+    SegReg = X86::GS;
+    break;
+  case 257:
+    SegReg = X86::FS;
+    break;
+  default:
+    // Address space 258 is not handled here, because it is not used to
+    // address TLS areas.
+    return true;
+  }
+
+  AM.Segment = CurDAG->getRegister(SegReg, MVT::i16);
+  return false;
+}
+
+/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
+/// mode. These wrap things that will resolve down into a symbol reference.
+/// If no match is possible, this returns true, otherwise it returns false.
+bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
+ // If the addressing mode already has a symbol as the displacement, we can
+ // never match another symbol.
+ if (AM.hasSymbolicDisplacement())
+ return true;
+
+ SDValue N0 = N.getOperand(0); // The wrapped symbol-reference node.
+ CodeModel::Model M = TM.getCodeModel();
+
+ // Handle X86-64 rip-relative addresses. We check this before checking direct
+ // folding because RIP is preferable to non-RIP accesses.
+ if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
+ // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+ // they cannot be folded into immediate fields.
+ // FIXME: This can be improved for kernel and other models?
+ (M == CodeModel::Small || M == CodeModel::Kernel)) {
+ // Base and index reg must be 0 in order to use %rip as base.
+ if (AM.hasBaseOrIndexReg())
+ return true;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM; // Restored if the offset cannot be folded.
+ AM.GV = G->getGlobal();
+ AM.SymbolFlags = G->getTargetFlags();
+ if (foldOffsetIntoAddress(G->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM; // Restored if the offset cannot be folded.
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.SymbolFlags = CP->getTargetFlags();
+ if (foldOffsetIntoAddress(CP->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol(); // Only the symbol is recorded; SymbolFlags is left as is.
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM; // Restored if the offset cannot be folded.
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ if (foldOffsetIntoAddress(BA->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+
+ if (N.getOpcode() == X86ISD::WrapperRIP) // Already guaranteed by the enclosing condition.
+ AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+ return false;
+ }
+
+ // Handle the case when globals fit in our immediate field: This is true for
+ // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit
+ // mode, this only applies to a non-RIP-relative computation.
+ if (!Subtarget->is64Bit() ||
+ M == CodeModel::Small || M == CodeModel::Kernel) {
+ assert(N.getOpcode() != X86ISD::WrapperRIP &&
+ "RIP-relative addressing already handled");
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.Disp += G->getOffset(); // Added directly, without the foldOffsetIntoAddress checks.
+ AM.SymbolFlags = G->getTargetFlags();
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.Disp += CP->getOffset();
+ AM.SymbolFlags = CP->getTargetFlags();
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.Disp += BA->getOffset();
+ AM.SymbolFlags = BA->getTargetFlags();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+ return false;
+ }
+
+ return true; // Neither form applies (64-bit with a code model other than small/kernel).
+}
+
+/// Fold the node \p N into the addressing mode \p AM. Returns true if that
+/// cannot be done and false on success. This only pattern matches for the
+/// addressing mode, then applies two encoding-size clean-ups to the result.
+bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
+  if (matchAddressRecursively(N, AM, 0))
+    return true;
+
+  // True while the mode is register-based and its base register is unset.
+  auto HasNoBaseReg = [&AM]() {
+    return AM.BaseType == X86ISelAddressMode::RegBase &&
+           AM.Base_Reg.getNode() == nullptr;
+  };
+
+  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+  // a smaller encoding and avoids a scaled-index.
+  if (AM.Scale == 2 && HasNoBaseReg()) {
+    AM.Base_Reg = AM.IndexReg;
+    AM.Scale = 1;
+  }
+
+  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+  // because it has a smaller encoding.
+  // TODO: Which other code models can use this?
+  const bool IsBareSymbol = AM.Scale == 1 && HasNoBaseReg() &&
+                            AM.IndexReg.getNode() == nullptr &&
+                            AM.SymbolFlags == X86II::MO_NO_FLAG &&
+                            AM.hasSymbolicDisplacement();
+  if (IsBareSymbol && Subtarget->is64Bit() &&
+      TM.getCodeModel() == CodeModel::Small)
+    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+
+  return false;
+}
+
+/// Try to fold both operands of the two-operand node \p N into \p AM.
+///
+/// The operands are tried in both orders. If neither order folds completely
+/// and the address is still empty, the operands are used as base and index
+/// register with scale 1. Returns false on success; returns true with \p AM
+/// restored to its value on entry otherwise.
+bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
+                               unsigned Depth) {
+  // Add an artificial use to this node so that we can keep track of
+  // it if it gets CSE'd with a different node.
+  HandleSDNode Handle(N);
+
+  const X86ISelAddressMode Saved = AM;
+
+  // Fold operand FirstIdx and then operand SecondIdx. On failure AM is
+  // rolled back to the saved state.
+  auto FoldBoth = [&](unsigned FirstIdx, unsigned SecondIdx) {
+    if (!matchAddressRecursively(Handle.getValue().getOperand(FirstIdx), AM,
+                                 Depth + 1) &&
+        !matchAddressRecursively(Handle.getValue().getOperand(SecondIdx), AM,
+                                 Depth + 1))
+      return true;
+    AM = Saved;
+    return false;
+  };
+
+  // Natural operand order first, then try again after commuting.
+  if (FoldBoth(0, 1) || FoldBoth(1, 0))
+    return false;
+
+  // If we couldn't fold both operands into the address at the same time,
+  // see if we can just put each operand into a register and fold at least
+  // the add.
+  const bool AddressIsEmpty = AM.BaseType == X86ISelAddressMode::RegBase &&
+                              !AM.Base_Reg.getNode() &&
+                              !AM.IndexReg.getNode();
+  if (!AddressIsEmpty)
+    return true;
+
+  SDValue Add = Handle.getValue();
+  AM.Base_Reg = Add.getOperand(0);
+  AM.IndexReg = Add.getOperand(1);
+  AM.Scale = 1;
+  return false;
+}
+
+// Place node N in the DAG no later than the position of Pos. A node whose ID
+// is -1 or greater than Pos's ID is repositioned at Pos and given Pos's node
+// ID; any other node is left where it is. Note that this does *not* preserve
+// the uniqueness of node IDs! The selection DAG must no longer depend on
+// their uniqueness when this is used.
+static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+  SDNode *Anchor = Pos.getNode();
+  SDNode *Node = N.getNode();
+
+  const int NodeId = Node->getNodeId();
+  const int AnchorId = Anchor->getNodeId();
+
+  // Already has an ID that is not after the anchor's: nothing to do.
+  if (NodeId != -1 && NodeId <= AnchorId)
+    return;
+
+  DAG.RepositionNode(Anchor->getIterator(), Node);
+  Node->setNodeId(AnchorId);
+}
+
+// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
+// safe. This allows us to convert the shift and and into an h-register
+// extract and a scaled index (AM.IndexReg / AM.Scale are set on success). Returns
+// false if the simplification is performed, true if nothing was changed.
+static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SRL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ !Shift.hasOneUse())
+ return true;
+
+ int ScaleLog = 8 - Shift.getConstantOperandVal(1); // C1 in the comment above.
+ if (ScaleLog <= 0 || ScaleLog >= 4 || // Only scales of 2, 4 and 8 are accepted.
+ Mask != (0xffu << ScaleLog))
+ return true;
+
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
+ SDValue NewMask = DAG.getConstant(0xff, DL, VT);
+ SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
+ SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ insertDAGNode(DAG, N, Eight);
+ insertDAGNode(DAG, N, Srl);
+ insertDAGNode(DAG, N, NewMask);
+ insertDAGNode(DAG, N, And);
+ insertDAGNode(DAG, N, ShlCount);
+ insertDAGNode(DAG, N, Shl);
+ DAG.ReplaceAllUsesWith(N, Shl);
+ AM.IndexReg = And; // The shift by ScaleLog is expressed through AM.Scale instead.
+ AM.Scale = (1 << ScaleLog);
+ return false;
+}
+
+// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
+// allows us to fold the shift into this addressing mode (as AM.Scale, with the
+// new AND as AM.IndexReg). Returns false if the transform succeeded, true otherwise.
+static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SHL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)))
+ return true;
+
+ // Not likely to be profitable if either the AND or SHIFT node has more
+ // than one use (unless all uses are for address computation). Besides,
+ // isel mechanism requires their node ids to be reused.
+ if (!N.hasOneUse() || !Shift.hasOneUse())
+ return true;
+
+ // Verify that the shift amount is something we can fold.
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1); // C1 in the comment above.
+ if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
+ return true;
+
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); // C2 >> C1.
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
+ SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ insertDAGNode(DAG, N, NewMask);
+ insertDAGNode(DAG, N, NewAnd);
+ insertDAGNode(DAG, N, NewShift);
+ DAG.ReplaceAllUsesWith(N, NewShift);
+
+ AM.Scale = 1 << ShiftAmt;
+ AM.IndexReg = NewAnd;
+ return false;
+}
+
+// Implement some heroics to detect shifts of masked values where the mask can
+// be replaced by extending the shift and undoing that in the addressing mode
+// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
+// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
+// the addressing mode. This results in code such as:
+//
+//   int f(short *y, int *lookup_table) {
+//     ...
+//     return *y + lookup_table[*y >> 11];
+//   }
+//
+// Turning into:
+//   movzwl (%rdi), %eax
+//   movl %eax, %ecx
+//   shrl $11, %ecx
+//   addl (%rsi,%rcx,4), %eax
+//
+// Instead of:
+//   movzwl (%rdi), %eax
+//   movl %eax, %ecx
+//   shrl $9, %ecx
+//   andl $124, %rcx
+//   addl (%rsi,%rcx), %eax
+//
+// Note that this function assumes the mask is provided as a mask *after* the
+// value is shifted. The input chain may or may not match that, but computing
+// such a mask is trivial.
+//
+// Returns false if the fold was performed (N is replaced, and AM.IndexReg and
+// AM.Scale are set); returns true, leaving AM untouched, otherwise.
+static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
+                                    uint64_t Mask,
+                                    SDValue Shift, SDValue X,
+                                    X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)))
+    return true;
+
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+  unsigned MaskLZ = countLeadingZeros(Mask);
+  unsigned MaskTZ = countTrailingZeros(Mask);
+
+  // The amount of shift we're trying to fit into the addressing mode is taken
+  // from the trailing zeros of the mask.
+  unsigned AMShiftAmt = MaskTZ;
+
+  // There is nothing we can do here unless the mask is removing some bits.
+  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
+
+  // We also need to ensure that mask is a continuous run of bits.
+  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
+
+  // Scale the leading zero count down based on the actual size of the value.
+  // Also scale it down based on the size of the shift. Both counts are
+  // unsigned, so bail out instead of wrapping around when the mask has fewer
+  // leading zeros than that; a wrapped count would be handed to
+  // APInt::getHighBitsSet below as a bit count.
+  unsigned ScaleDown =
+      (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
+  if (MaskLZ < ScaleDown) return true;
+  MaskLZ -= ScaleDown;
+
+  // The final check is to ensure that any masked out high bits of X are
+  // already known to be zero. Otherwise, the mask has a semantic impact
+  // other than masking out a couple of low bits. Unfortunately, because of
+  // the mask, zero extensions will be removed from operands in some cases.
+  // This code works extra hard to look through extensions because we can
+  // replace them with zero extensions cheaply if necessary.
+  bool ReplacingAnyExtend = false;
+  if (X.getOpcode() == ISD::ANY_EXTEND) {
+    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
+                          X.getOperand(0).getSimpleValueType().getSizeInBits();
+    // Assume that we'll replace the any-extend with a zero-extend, and
+    // narrow the search to the extended value.
+    X = X.getOperand(0);
+    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
+    ReplacingAnyExtend = true;
+  }
+  APInt MaskedHighBits =
+      APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
+  // Only the masked-out bits have to be known zero. Requiring the known-zero
+  // set to be exactly those bits would reject the fold whenever additional
+  // bits of X happen to be known zero as well.
+  if (!DAG.MaskedValueIsZero(X, MaskedHighBits)) return true;
+
+  // We've identified a pattern that can be transformed into a single shift
+  // and an addressing mode. Make it so.
+  MVT VT = N.getSimpleValueType();
+  if (ReplacingAnyExtend) {
+    assert(X.getValueType() != VT);
+    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
+    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
+    insertDAGNode(DAG, N, NewX);
+    X = NewX;
+  }
+  SDLoc DL(N);
+  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
+  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
+  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  insertDAGNode(DAG, N, NewSRLAmt);
+  insertDAGNode(DAG, N, NewSRL);
+  insertDAGNode(DAG, N, NewSHLAmt);
+  insertDAGNode(DAG, N, NewSHL);
+  DAG.ReplaceAllUsesWith(N, NewSHL);
+
+  // The left shift is now expressed by the scale; index with the new SRL.
+  AM.Scale = 1 << AMShiftAmt;
+  AM.IndexReg = NewSRL;
+  return false;
+}
+
+bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth) {
+ SDLoc dl(N);
+ DEBUG({
+ dbgs() << "MatchAddress: ";
+ AM.dump();
+ });
+ // Limit recursion.
+ if (Depth > 5)
+ return matchAddressBase(N, AM);
+
+ // If this is already a %rip relative address, we can only merge immediates
+ // into it. Instead of handling this in every case, we handle it here.
+ // RIP relative addressing: %rip + 32-bit displacement!
+ if (AM.isRIPRelative()) {
+ // FIXME: JumpTable and ExternalSymbol address currently don't like
+ // displacements. It isn't very important, but this should be fixed for
+ // consistency.
+ if (!(AM.ES || AM.MCSym) && AM.JT != -1)
+ return true;
+
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
+ if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
+ return false;
+ return true;
+ }
+
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::LOCAL_RECOVER: {
+ if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
+ if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
+ // Use the symbol and don't prefix it.
+ AM.MCSym = ESNode->getMCSymbol();
+ return false;
+ }
+ break;
+ }
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!foldOffsetIntoAddress(Val, AM))
+ return false;
+ break;
+ }
+
+ case X86ISD::Wrapper:
+ case X86ISD::WrapperRIP:
+ if (!matchWrapper(N, AM))
+ return false;
+ break;
+
+ case ISD::LOAD:
+ if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
+ return false;
+ break;
+
+ case ISD::FrameIndex:
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
+ AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+ AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::SHL:
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
+ break;
+
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ unsigned Val = CN->getZExtValue();
+ // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
+ // that the base operand remains free for further matching. If
+ // the base doesn't end up getting used, a post-processing step
+ // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
+ if (Val == 1 || Val == 2 || Val == 3) {
+ AM.Scale = 1 << Val;
+ SDValue ShVal = N.getNode()->getOperand(0);
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (CurDAG->isBaseWithConstantOffset(ShVal)) {
+ AM.IndexReg = ShVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
+ if (!foldOffsetIntoAddress(Disp, AM))
+ return false;
+ }
+
+ AM.IndexReg = ShVal;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SRL: {
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+ SDValue And = N.getOperand(0);
+ if (And.getOpcode() != ISD::AND) break;
+ SDValue X = And.getOperand(0);
+
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ if (X.getSimpleValueType().getSizeInBits() > 64) break;
+
+ // The mask used for the transform is expected to be post-shift, but we
+ // found the shift first so just apply the shift to the mask before passing
+ // it down.
+ if (!isa<ConstantSDNode>(N.getOperand(1)) ||
+ !isa<ConstantSDNode>(And.getOperand(1)))
+ break;
+ uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into the scale, and return false if we
+ // succeed.
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
+ return false;
+ break;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI:
+ // A mul_lohi where we need the low part can be folded as a plain multiply.
+ if (N.getResNo() != 0) break;
+ LLVM_FALLTHROUGH;
+ case ISD::MUL:
+ case X86ISD::MUL_IMM:
+ // X*[3,5,9] -> X+X*[2,4,8]
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ AM.IndexReg.getNode() == nullptr) {
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+ CN->getZExtValue() == 9) {
+ AM.Scale = unsigned(CN->getZExtValue())-1;
+
+ SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue Reg;
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+ isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
+ Reg = MulVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
+ if (foldOffsetIntoAddress(Disp, AM))
+ Reg = N.getNode()->getOperand(0);
+ } else {
+ Reg = N.getNode()->getOperand(0);
+ }
+
+ AM.IndexReg = AM.Base_Reg = Reg;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SUB: {
+ // Given A-B, if A can be completely folded into the address and
+ // the index field with the index field unused, use -B as the index.
+ // This is a win if a has multiple parts that can be folded into
+ // the address. Also, this saves a mov if the base register has
+ // other uses, since it avoids a two-address sub instruction, however
+ // it costs an additional mov if the index register has other uses.
+
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ // Test if the LHS of the sub can be folded.
+ X86ISelAddressMode Backup = AM;
+ if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+ AM = Backup;
+ break;
+ }
+ // Test if the index field is free for use.
+ if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
+ AM = Backup;
+ break;
+ }
+
+ int Cost = 0;
+ SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+ // If the RHS involves a register with multiple uses, this
+ // transformation incurs an extra mov, due to the neg instruction
+ // clobbering its operand.
+ if (!RHS.getNode()->hasOneUse() ||
+ RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+ RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+ RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+ (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ ++Cost;
+ // If the base is a register with multiple uses, this
+ // transformation may save a mov.
+ if ((AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() &&
+ !AM.Base_Reg.getNode()->hasOneUse()) ||
+ AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ --Cost;
+ // If the folded LHS was interesting, this transformation saves
+ // address arithmetic.
+ if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+ ((AM.Disp != 0) && (Backup.Disp == 0)) +
+ (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+ --Cost;
+ // If it doesn't look like it may be an overall win, don't do it.
+ if (Cost >= 0) {
+ AM = Backup;
+ break;
+ }
+
+ // Ok, the transformation is legal and appears profitable. Go for it.
+ SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType());
+ SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+ AM.IndexReg = Neg;
+ AM.Scale = 1;
+
+ // Insert the new nodes into the topological ordering.
+ insertDAGNode(*CurDAG, N, Zero);
+ insertDAGNode(*CurDAG, N, Neg);
+ return false;
+ }
+
+ case ISD::ADD:
+ if (!matchAdd(N, AM, Depth))
+ return false;
+ break;
+
+ case ISD::OR:
+ // We want to look through a transform in InstCombine and DAGCombiner that
+ // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
+ // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
+ // An 'lea' can then be used to match the shift (multiply) and add:
+ // and $1, %esi
+ // lea (%rsi, %rdi, 8), %rax
+ if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
+ !matchAdd(N, AM, Depth))
+ return false;
+ break;
+
+ case ISD::AND: {
+ // Perform some heroic transforms on an and of a constant-count shift
+ // with a constant to enable use of the scaled offset field.
+
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+ SDValue Shift = N.getOperand(0);
+ if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
+ SDValue X = Shift.getOperand(0);
+
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ if (X.getSimpleValueType().getSizeInBits() > 64) break;
+
+ if (!isa<ConstantSDNode>(N.getOperand(1)))
+ break;
+ uint64_t Mask = N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into an extract and scale.
+ if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to fold the mask and shift directly into the scale.
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to swap the mask and shift to place shifts which can be done as
+ // a scale on the outside of the mask.
+ if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+ break;
+ }
+ }
+
+ return matchAddressBase(N, AM);
+}
+
+/// Fallback for the address matcher: place N into the addressing mode as a
+/// plain operand, without looking through N any further.
+///
+/// N becomes the base register when that slot is free; otherwise it becomes
+/// the index register with a scale of 1. Returns false when N was placed and
+/// true when neither slot is available.
+bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+  const bool BaseSlotFree =
+      AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode();
+
+  // Preferred placement: generate N as the base register.
+  if (BaseSlotFree) {
+    AM.BaseType = X86ISelAddressMode::RegBase;
+    AM.Base_Reg = N;
+    return false;
+  }
+
+  // The base is occupied; if the index is taken too we cannot select N.
+  if (AM.IndexReg.getNode())
+    return true;
+
+  // Otherwise use the unscaled index slot.
+  AM.IndexReg = N;
+  AM.Scale = 1;
+  return false;
+}
+
+/// Produce the five memory operands (Base, Scale, Index, Disp, Segment) for a
+/// masked gather/scatter node.
+///
+/// Returns false when Parent is not a MaskedGatherScatterSDNode. Otherwise the
+/// operands are taken directly from the node: the scale is the element size
+/// in bytes (or 1 when the base is a constant), the displacement is 0, and
+/// the segment is derived from the pointer's address space.
+bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                                       SDValue &Scale, SDValue &Index,
+                                       SDValue &Disp, SDValue &Segment) {
+  auto *GatherScatter = dyn_cast<MaskedGatherScatterSDNode>(Parent);
+  if (!GatherScatter)
+    return false;
+
+  X86ISelAddressMode AM;
+  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
+  switch (GatherScatter->getPointerInfo().getAddrSpace()) {
+  case 256:
+    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+    break;
+  case 257:
+    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+    break;
+  case 258:
+    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
+    break;
+  default:
+    break;
+  }
+
+  SDLoc DL(N);
+  Base = GatherScatter->getBasePtr();
+  Index = GatherScatter->getIndex();
+  const unsigned ElemBits =
+      GatherScatter->getValue().getScalarValueSizeInBits();
+  Scale = getI8Imm(ElemBits / 8, DL);
+
+  // A constant base must be 0: the whole address then lives in the index, so
+  // the scale is 1 and no base register is used.
+  if (isa<ConstantSDNode>(Base)) {
+    assert(cast<ConstantSDNode>(Base)->isNullValue() &&
+           "Unexpected base in gather/scatter");
+    Scale = getI8Imm(1, DL);
+    Base = CurDAG->getRegister(0, MVT::i32);
+  }
+
+  Segment = AM.Segment.getNode() ? AM.Segment
+                                 : CurDAG->getRegister(0, MVT::i32);
+  Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  return true;
+}
+
+/// Try to pattern match N as an addressing mode. On success returns true and
+/// hands back, by reference, the operands of the maximal addressing mode that
+/// could be matched.
+///
+/// Parent is the parent node of the addr operand that is being matched. It
+/// is always a load, store, atomic node, or null. It is only null when
+/// checking memory operands for inline asm nodes.
+bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                                 SDValue &Scale, SDValue &Index,
+                                 SDValue &Disp, SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  if (Parent) {
+    switch (Parent->getOpcode()) {
+    // This list of opcodes are all the nodes that have an "addr:$ptr" operand
+    // that are not a MemSDNode, and thus don't have proper addrspace info.
+    case ISD::INTRINSIC_W_CHAIN:  // unaligned loads, fixme
+    case ISD::INTRINSIC_VOID:     // nontemporal stores
+    case X86ISD::TLSCALL:         // Fixme
+    case X86ISD::EH_SJLJ_SETJMP:  // setjmp
+    case X86ISD::EH_SJLJ_LONGJMP: // longjmp
+      break;
+    default:
+      // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
+      switch (cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace()) {
+      case 256:
+        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+        break;
+      case 257:
+        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+        break;
+      case 258:
+        AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
+        break;
+      default:
+        break;
+      }
+      break;
+    }
+  }
+
+  // matchAddress reports failure by returning true.
+  if (matchAddress(N, AM))
+    return false;
+
+  // Fill any slot the matcher left empty with the "no register" placeholder.
+  const MVT VT = N.getSimpleValueType();
+  if (AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode())
+    AM.Base_Reg = CurDAG->getRegister(0, VT);
+  if (!AM.IndexReg.getNode())
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// Match a scalar SSE load. In particular, we want to match a load whose top
+/// elements are either undef or zeros. The load flavor is derived from the
+/// type of N, which is either v4f32 or v2f64.
+///
+/// Four shapes are tried in order: a plain non-extending load, an
+/// X86ISD::VZEXT_LOAD, a single-use SCALAR_TO_VECTOR of a load, and a
+/// single-use X86ISD::VZEXT_MOVL of such a SCALAR_TO_VECTOR.
+///
+/// We also return:
+///   PatternNodeWithChain: this is the matched node that has a chain input
+///   and output. It is written as soon as a shape is recognised, which can be
+///   before the profitability/legality checks reject the fold.
+bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
+                                          SDValue N, SDValue &Base,
+                                          SDValue &Scale, SDValue &Index,
+                                          SDValue &Disp, SDValue &Segment,
+                                          SDValue &PatternNodeWithChain) {
+  // Shared tail for the three shapes that end in an ordinary load.
+  auto SelectLoadAddr = [&](SDValue LoadVal) {
+    LoadSDNode *LD = cast<LoadSDNode>(LoadVal);
+    return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
+                      Segment);
+  };
+
+  // We can allow a full vector load here since narrowing a load is ok.
+  if (ISD::isNON_EXTLoad(N.getNode())) {
+    PatternNodeWithChain = N;
+    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+        IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel))
+      return SelectLoadAddr(PatternNodeWithChain);
+  }
+
+  // We can also match the special zero extended load opcode.
+  if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
+    PatternNodeWithChain = N;
+    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+        IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) {
+      auto *MemIntr = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
+      return selectAddr(MemIntr, MemIntr->getBasePtr(), Base, Scale, Index,
+                        Disp, Segment);
+    }
+  }
+
+  // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
+  // once. Otherwise the load might get duplicated and the chain output of the
+  // duplicate load will not be observed by all dependencies.
+  const bool IsSingleUseScalarToVector =
+      N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse();
+  if (IsSingleUseScalarToVector) {
+    PatternNodeWithChain = N.getOperand(0);
+    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel))
+      return SelectLoadAddr(PatternNodeWithChain);
+  }
+
+  // Also handle the case where we explicitly require zeros in the top
+  // elements. This is a vector shuffle from the zero vector.
+  const bool IsSingleUseZextMovl =
+      N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+      // Check to see if the top elements are all zeros (or bitcast of zeros).
+      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      N.getOperand(0).getNode()->hasOneUse();
+  if (IsSingleUseZextMovl) {
+    PatternNodeWithChain = N.getOperand(0).getOperand(0);
+    // Okay, this is a zero extending load. Fold it if allowed.
+    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel))
+      return SelectLoadAddr(PatternNodeWithChain);
+  }
+
+  return false;
+}
+
+
+/// Decide whether N can be used as the 32-bit immediate of a 64-bit move.
+///
+/// A constant qualifies when its zero-extended value fits in 32 bits. For an
+/// X86ISD::Wrapper, TLS addresses are rejected; a global with a known
+/// absolute symbol range qualifies when the range's unsigned maximum is below
+/// 2^32; everything else qualifies only under the small code model.
+/// Imm receives the selected operand.
+bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
+  if (const auto *CN = dyn_cast<ConstantSDNode>(N)) {
+    const uint64_t ImmVal = CN->getZExtValue();
+    // Reject values that would lose bits when narrowed to 32 bits.
+    if (static_cast<uint32_t>(ImmVal) != ImmVal)
+      return false;
+
+    Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
+    return true;
+  }
+
+  // In static codegen with small code model, we can get the address of a label
+  // into a register with 'movl'. TableGen has already made sure we're looking
+  // at a label of some kind.
+  assert(N->getOpcode() == X86ISD::Wrapper &&
+         "Unexpected node type for MOV32ri64");
+  N = N.getOperand(0);
+
+  // At least GNU as does not accept 'movl' for TPOFF relocations.
+  // FIXME: We could use 'movl' when we know we are targeting MC.
+  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
+    return false;
+
+  Imm = N;
+
+  // A global with an absolute symbol range is decided by that range alone.
+  if (N->getOpcode() == ISD::TargetGlobalAddress) {
+    Optional<ConstantRange> CR =
+        cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
+    if (CR)
+      return CR->getUnsignedMax().ult(1ull << 32);
+  }
+
+  return TM.getCodeModel() == CodeModel::Small;
+}
+
+/// Match an LEA address via selectLEAAddr and then rewrite the base and index
+/// operands as 64-bit values: the "no register" placeholder is re-created as
+/// an i64 register 0, and 32-bit values are wrapped in SUBREG_TO_REG.
+/// Returns false when selectLEAAddr rejects N.
+bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
+                                         SDValue &Scale, SDValue &Index,
+                                         SDValue &Disp, SDValue &Segment) {
+  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
+  SDLoc DL(N);
+
+  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
+    return false;
+
+  // Wrap a 32-bit value in SUBREG_TO_REG so it can be used as a 64-bit
+  // operand.
+  auto WidenTo64 = [&](SDValue Val32) {
+    return SDValue(CurDAG->getMachineNode(
+                       TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+                       CurDAG->getTargetConstant(0, DL, MVT::i64),
+                       Val32,
+                       CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
+                   0);
+  };
+  auto IsNoRegister = [](SDValue V) {
+    RegisterSDNode *RN = dyn_cast<RegisterSDNode>(V);
+    return RN && RN->getReg() == 0;
+  };
+
+  if (IsNoRegister(Base)) {
+    Base = CurDAG->getRegister(0, MVT::i64);
+  } else if (Base.getValueType() == MVT::i32 &&
+             !isa<FrameIndexSDNode>(Base)) {
+    // Base could already be %rip, particularly in the x32 ABI.
+    Base = WidenTo64(Base);
+  }
+
+  if (IsNoRegister(Index)) {
+    Index = CurDAG->getRegister(0, MVT::i64);
+  } else {
+    assert(Index.getValueType() == MVT::i32 &&
+           "Expect to be extending 32-bit registers for use in LEA");
+    Index = WidenTo64(Index);
+  }
+
+  return true;
+}
+
+/// Match the maximal addressing mode for N and decide whether it can be cost
+/// effectively emitted as an LEA instruction.
+///
+/// A complexity score is accumulated from the matched components (base,
+/// index, scale, symbolic displacement, constant displacement); the match is
+/// rejected unless the score exceeds 2.
+bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
+                                    SDValue &Base, SDValue &Scale,
+                                    SDValue &Index, SDValue &Disp,
+                                    SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  // Save the DL and VT before calling matchAddress, it can invalidate N.
+  SDLoc DL(N);
+  const MVT VT = N.getSimpleValueType();
+
+  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
+  // segments.
+  SDValue SavedSegment = AM.Segment;
+  SDValue T = CurDAG->getRegister(0, MVT::i32);
+  AM.Segment = T;
+  if (matchAddress(N, AM))
+    return false;
+  assert (T == AM.Segment);
+  AM.Segment = SavedSegment;
+
+  unsigned Complexity = 0;
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (AM.Base_Reg.getNode()) {
+      Complexity = 1;
+    } else {
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+    }
+  } else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) {
+    Complexity = 4;
+  }
+
+  if (AM.IndexReg.getNode()) {
+    ++Complexity;
+  } else {
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+  }
+
+  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
+  // a simple shift.
+  if (AM.Scale > 1)
+    ++Complexity;
+
+  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+  // to a LEA. This is determined with some experimentation but is by no means
+  // optimal (especially for code size consideration). LEA is nice because of
+  // its three-address nature. Tweak the cost function again when we can run
+  // convertToThreeAddress() at register allocation time.
+  if (AM.hasSymbolicDisplacement()) {
+    // For X86-64, always use LEA to materialize RIP-relative addresses.
+    if (Subtarget->is64Bit()) {
+      Complexity = 4;
+    } else {
+      Complexity += 2;
+    }
+  }
+
+  const bool HasBaseOrIndexNode =
+      AM.Base_Reg.getNode() || AM.IndexReg.getNode();
+  if (AM.Disp && HasBaseOrIndexNode)
+    ++Complexity;
+
+  // If it isn't worth using an LEA, reject it.
+  if (Complexity <= 2)
+    return false;
+
+  getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// Build the address operands for a TargetGlobalTLSAddress node (the only
+/// kind of node this is run on).
+///
+/// The global, its offset and its target flags go into the displacement; no
+/// base register is used. For an i32 address the index is EBX with scale 1,
+/// otherwise there is no index register. Always returns true.
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
+                                        SDValue &Scale, SDValue &Index,
+                                        SDValue &Disp, SDValue &Segment) {
+  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+  const auto *GA = cast<GlobalAddressSDNode>(N);
+  const bool Is32Bit = N.getValueType() == MVT::i32;
+
+  X86ISelAddressMode AM;
+  AM.GV = GA->getGlobal();
+  AM.Disp += GA->getOffset();
+  AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
+  AM.SymbolFlags = GA->getTargetFlags();
+
+  if (Is32Bit) {
+    AM.Scale = 1;
+    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+  } else {
+    AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
+  }
+
+  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// Select N as an immediate operand that may need a relocation.
+///
+/// Constants become target constants. Otherwise N must be an X86ISD::Wrapper,
+/// optionally under a TRUNCATE. An untruncated wrapper's operand is selected
+/// directly. A truncated one is accepted only for a global whose absolute
+/// symbol range fits in the truncated type, in which case a narrow global
+/// address is created. Note that Op is also written on one rejecting path
+/// (truncated non-global).
+bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
+  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
+    Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
+                                   N.getValueType());
+    return true;
+  }
+
+  // Keep track of the original value type and whether this value was
+  // truncated. If we see a truncation from pointer type to VT that truncates
+  // bits that are known to be zero, we can use a narrow reference.
+  const EVT VT = N.getValueType();
+  const bool WasTruncated = N.getOpcode() == ISD::TRUNCATE;
+  if (WasTruncated)
+    N = N.getOperand(0);
+
+  if (N.getOpcode() != X86ISD::Wrapper)
+    return false;
+
+  // If the address was not truncated, we can select the wrapped operand
+  // directly, whatever it is.
+  if (!WasTruncated) {
+    Op = N.getOperand(0);
+    return true;
+  }
+
+  // We can only use non-GlobalValues as immediates if they were not truncated,
+  // as we do not have any range information.
+  if (N.getOperand(0)->getOpcode() != ISD::TargetGlobalAddress) {
+    Op = N.getOperand(0);
+    return false;
+  }
+
+  // Check that the global's range fits into VT.
+  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
+  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
+  if (!CR)
+    return false;
+  if (CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
+    return false;
+
+  // Okay, we can use a narrow reference.
+  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
+                                      GA->getOffset(), GA->getTargetFlags());
+  return true;
+}
+
+/// Try to fold the load N into its user P. N must be a non-extending load
+/// that is both profitable and legal to fold; if so, the load's address
+/// (operand 1) is matched with selectAddr and that result is returned.
+bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N,
+                                  SDValue &Base, SDValue &Scale,
+                                  SDValue &Index, SDValue &Disp,
+                                  SDValue &Segment) {
+  SDNode *LoadNode = N.getNode();
+
+  if (!ISD::isNON_EXTLoad(LoadNode))
+    return false;
+  if (!IsProfitableToFold(N, P, P))
+    return false;
+  if (!IsLegalToFold(N, P, P, OptLevel))
+    return false;
+
+  SDValue Address = N.getOperand(1);
+  return selectAddr(LoadNode, Address, Base, Scale, Index, Disp, Segment);
+}
+
+/// Return an SDNode that yields the value of the global base register.
+/// The register is obtained from the instruction info, which outputs the
+/// instructions required to initialize it, if necessary.
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+  const unsigned BaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+  const auto &Layout = MF->getDataLayout();
+  SDValue RegValue = CurDAG->getRegister(BaseReg, TLI->getPointerTy(Layout));
+  return RegValue.getNode();
+}
+
+/// Return true if no use of the given X86ISD::CMP node requires the SF or OF
+/// bits to be accurate.
+///
+/// The answer is conservative: every user of N must be a CopyToReg into
+/// EFLAGS, and every consumer of that copy's flag result must be one of the
+/// already-selected machine instructions listed below. Anything else makes
+/// the function return false.
+static bool hasNoSignedComparisonUses(SDNode *N) {
+  for (SDNode::use_iterator CopyIt = N->use_begin(), CopyEnd = N->use_end();
+       CopyIt != CopyEnd; ++CopyIt) {
+    // Only CopyToReg users are understood.
+    if (CopyIt->getOpcode() != ISD::CopyToReg)
+      return false;
+    // And only those that copy to EFLAGS.
+    const unsigned DestReg =
+        cast<RegisterSDNode>(CopyIt->getOperand(1))->getReg();
+    if (DestReg != X86::EFLAGS)
+      return false;
+
+    // Walk the consumers of the CopyToReg itself.
+    for (SDNode::use_iterator FlagIt = CopyIt->use_begin(),
+                              FlagEnd = CopyIt->use_end();
+         FlagIt != FlagEnd; ++FlagIt) {
+      // Only the Flag result (result #1) matters.
+      if (FlagIt.getUse().getResNo() != 1)
+        continue;
+      // Anything unusual: assume conservatively.
+      if (!FlagIt->isMachineOpcode())
+        return false;
+
+      switch (FlagIt->getMachineOpcode()) {
+      // These comparisons don't treat the most significant bit specially.
+      case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
+      case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
+      case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
+      case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
+      case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1:
+      case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1:
+      case X86::CMOVA16rr: case X86::CMOVA16rm:
+      case X86::CMOVA32rr: case X86::CMOVA32rm:
+      case X86::CMOVA64rr: case X86::CMOVA64rm:
+      case X86::CMOVAE16rr: case X86::CMOVAE16rm:
+      case X86::CMOVAE32rr: case X86::CMOVAE32rm:
+      case X86::CMOVAE64rr: case X86::CMOVAE64rm:
+      case X86::CMOVB16rr: case X86::CMOVB16rm:
+      case X86::CMOVB32rr: case X86::CMOVB32rm:
+      case X86::CMOVB64rr: case X86::CMOVB64rm:
+      case X86::CMOVBE16rr: case X86::CMOVBE16rm:
+      case X86::CMOVBE32rr: case X86::CMOVBE32rm:
+      case X86::CMOVBE64rr: case X86::CMOVBE64rm:
+      case X86::CMOVE16rr: case X86::CMOVE16rm:
+      case X86::CMOVE32rr: case X86::CMOVE32rm:
+      case X86::CMOVE64rr: case X86::CMOVE64rm:
+      case X86::CMOVNE16rr: case X86::CMOVNE16rm:
+      case X86::CMOVNE32rr: case X86::CMOVNE32rm:
+      case X86::CMOVNE64rr: case X86::CMOVNE64rm:
+      case X86::CMOVNP16rr: case X86::CMOVNP16rm:
+      case X86::CMOVNP32rr: case X86::CMOVNP32rm:
+      case X86::CMOVNP64rr: case X86::CMOVNP64rm:
+      case X86::CMOVP16rr: case X86::CMOVP16rm:
+      case X86::CMOVP32rr: case X86::CMOVP32rm:
+      case X86::CMOVP64rr: case X86::CMOVP64rm:
+        // Acceptable consumer; move on to the next one.
+        break;
+      // Anything else: assume conservatively.
+      default:
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// Check whether or not the chain ending in StoreNode is suitable for doing
+/// the {load; increment or decrement; store} to modify transformation.
+///
+/// \param StoreNode  the store that would be fused.
+/// \param Opc        opcode of the node producing StoredVal; only
+///                   X86ISD::INC and X86ISD::DEC are accepted.
+/// \param StoredVal  the value being stored.
+/// \param CurDAG     DAG used to build a replacement TokenFactor if needed.
+/// \param LoadNode   [out] the matched load. It is written as soon as the
+///                   operand of StoredVal is recognised as a normal load,
+///                   which can be before a later check rejects the pattern.
+/// \param InputChain [out] the chain the fused node should use; written only
+///                   on paths that return true.
+/// \returns true if the pattern can be fused.
+static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
+                                SDValue StoredVal, SelectionDAG *CurDAG,
+                                LoadSDNode* &LoadNode, SDValue &InputChain) {
+
+  // is the value stored the result of a DEC or INC?
+  if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
+
+  // is the stored value result 0 of the operation?
+  if (StoredVal.getResNo() != 0) return false;
+
+  // result 0 of the inc/dec must have exactly one use.
+  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
+
+  // is the store non-extending and non-indexed?
+  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+    return false;
+
+  SDValue Load = StoredVal->getOperand(0);
+  // Is the stored value a non-extending and non-indexed load?
+  if (!ISD::isNormalLoad(Load.getNode())) return false;
+
+  // Return LoadNode by reference.
+  LoadNode = cast<LoadSDNode>(Load);
+  // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
+  EVT LdVT = LoadNode->getMemoryVT();
+  if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
+      LdVT != MVT::i8)
+    return false;
+
+  // Is store the only read of the loaded value?
+  if (!Load.hasOneUse())
+    return false;
+
+  // Is the address of the store the same as the load?
+  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+      LoadNode->getOffset() != StoreNode->getOffset())
+    return false;
+
+  // Check if the chain is produced by the load or is a TokenFactor with
+  // the load output chain as an operand. Return InputChain by reference.
+  SDValue Chain = StoreNode->getChain();
+
+  bool ChainCheck = false;
+  if (Chain == Load.getValue(1)) {
+    ChainCheck = true;
+    InputChain = LoadNode->getChain();
+  } else if (Chain.getOpcode() == ISD::TokenFactor) {
+    SmallVector<SDValue, 4> ChainOps;
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+      SDValue Op = Chain.getOperand(i);
+      if (Op == Load.getValue(1)) {
+        ChainCheck = true;
+        continue;
+      }
+
+      // Make sure using Op as part of the chain would not cause a cycle here.
+      // In theory, we could check whether the chain node is a predecessor of
+      // the load. But that can be very expensive. Instead visit the uses and
+      // make sure they all have smaller node id than the load.
+      int LoadId = LoadNode->getNodeId();
+      // Take both iterators from Op's own node. The end iterator used to be
+      // obtained by dereferencing the begin iterator, i.e. from Op's first
+      // user rather than from Op.
+      SDNode *OpNode = Op.getNode();
+      for (SDNode::use_iterator UI = OpNode->use_begin(),
+                                UE = OpNode->use_end();
+           UI != UE; ++UI) {
+        if (UI.getUse().getResNo() != 0)
+          continue;
+        if (UI->getNodeId() > LoadId)
+          return false;
+      }
+
+      ChainOps.push_back(Op);
+    }
+
+    if (ChainCheck)
+      // Make a new TokenFactor with all the other input chains except
+      // for the load.
+      InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
+                                   MVT::Other, ChainOps);
+  }
+
+  // The pattern is usable only if the store's chain was tied to the load.
+  return ChainCheck;
+}
+
+/// Map a memory value type and an X86ISD::DEC / X86ISD::INC opcode to the
+/// X86 opcode of the matching in-memory decrement or increment.
+/// LdVT must be one of i64, i32, i16 or i8.
+static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
+  const bool IsDec = Opc == X86ISD::DEC;
+  if (!IsDec)
+    assert(Opc == X86ISD::INC && "unrecognized opcode");
+
+  // Dispatch on the operand width, then pick the DEC or INC flavour.
+  if (LdVT == MVT::i64)
+    return IsDec ? X86::DEC64m : X86::INC64m;
+  if (LdVT == MVT::i32)
+    return IsDec ? X86::DEC32m : X86::INC32m;
+  if (LdVT == MVT::i16)
+    return IsDec ? X86::DEC16m : X86::INC16m;
+  if (LdVT == MVT::i8)
+    return IsDec ? X86::DEC8m : X86::INC8m;
+
+  llvm_unreachable("unrecognized size for LdVT");
+}
+
+/// Customized ISel for GATHER operations: replace the intrinsic node Node
+/// with a machine node of opcode Opc.
+///
+/// Returns false, leaving Node untouched, when the scale operand is not a
+/// constant. Otherwise Node's uses are redirected to the new machine node,
+/// Node is removed, and true is returned.
+bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) {
+  // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
+  // (operand 0 is the chain; the gather operands start at operand 2).
+  SDValue Chain = Node->getOperand(0);
+  SDValue VSrc = Node->getOperand(2);
+  SDValue Base = Node->getOperand(3);
+  SDValue VIdx = Node->getOperand(4);
+  SDValue VMask = Node->getOperand(5);
+  auto *ScaleCst = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+  if (!ScaleCst)
+    return false;
+
+  const EVT VecVT = VSrc.getValueType();
+  SDVTList VTs = CurDAG->getVTList(VecVT, VecVT, MVT::Other);
+
+  SDLoc DL(Node);
+
+  // Memory Operands: Base, Scale, Index, Disp, Segment
+  SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+  SDValue ScaleImm = getI8Imm(ScaleCst->getSExtValue(), DL);
+  const SDValue Ops[] = { VSrc, Base, ScaleImm, VIdx,
+                          Disp, Segment, VMask, Chain };
+  SDNode *Gather = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+
+  // Node has 2 outputs: VDst and MVT::Other.
+  // Gather has 3 outputs: VDst, VMask_wb, and MVT::Other.
+  // VDst maps to VDst and the chain maps to the chain; VMask_wb has no
+  // counterpart on Node.
+  ReplaceUses(SDValue(Node, 0), SDValue(Gather, 0));
+  ReplaceUses(SDValue(Node, 1), SDValue(Gather, 2));
+  CurDAG->RemoveDeadNode(Node);
+  return true;
+}
+
+void X86DAGToDAGISel::Select(SDNode *Node) {
+ MVT NVT = Node->getSimpleValueType(0);
+ unsigned Opc, MOpc;
+ unsigned Opcode = Node->getOpcode();
+ SDLoc dl(Node);
+
+ DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+ if (Node->isMachineOpcode()) {
+ DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ Node->setNodeId(-1);
+ return; // Already selected.
+ }
+
+ switch (Opcode) {
+ default: break;
+ case ISD::BRIND: {
+ if (Subtarget->isTargetNaCl())
+ // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
+ // leave the instruction alone.
+ break;
+ if (Subtarget->isTarget64BitILP32()) {
+ // Converts a 32-bit register to a 64-bit, zero-extended version of
+ // it. This is needed because x86-64 can do many things, but jmp %r32
+ // ain't one of them.
+ const SDValue &Target = Node->getOperand(1);
+ assert(Target.getSimpleValueType() == llvm::MVT::i32);
+ SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
+ SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
+ Node->getOperand(0), ZextTarget);
+ ReplaceNode(Node, Brind.getNode());
+ SelectCode(ZextTarget.getNode());
+ SelectCode(Brind.getNode());
+ return;
+ }
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_avx2_gather_d_pd:
+ case Intrinsic::x86_avx2_gather_d_pd_256:
+ case Intrinsic::x86_avx2_gather_q_pd:
+ case Intrinsic::x86_avx2_gather_q_pd_256:
+ case Intrinsic::x86_avx2_gather_d_ps:
+ case Intrinsic::x86_avx2_gather_d_ps_256:
+ case Intrinsic::x86_avx2_gather_q_ps:
+ case Intrinsic::x86_avx2_gather_q_ps_256:
+ case Intrinsic::x86_avx2_gather_d_q:
+ case Intrinsic::x86_avx2_gather_d_q_256:
+ case Intrinsic::x86_avx2_gather_q_q:
+ case Intrinsic::x86_avx2_gather_q_q_256:
+ case Intrinsic::x86_avx2_gather_d_d:
+ case Intrinsic::x86_avx2_gather_d_d_256:
+ case Intrinsic::x86_avx2_gather_q_d:
+ case Intrinsic::x86_avx2_gather_q_d_256: {
+ if (!Subtarget->hasAVX2())
+ break;
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break;
+ case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+ case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break;
+ case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break;
+ case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break;
+ case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break;
+ case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break;
+ case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
+ case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
+ }
+ if (tryGather(Node, Opc))
+ return;
+ break;
+ }
+ }
+ break;
+ }
+ case X86ISD::GlobalBaseReg:
+ ReplaceNode(Node, getGlobalBaseReg());
+ return;
+
+ case X86ISD::SHRUNKBLEND: {
+ // SHRUNKBLEND selects like a regular VSELECT.
+ SDValue VSelect = CurDAG->getNode(
+ ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2));
+ ReplaceUses(SDValue(Node, 0), VSelect);
+ SelectCode(VSelect.getNode());
+ // We already called ReplaceUses.
+ return;
+ }
+
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ // For operations of the form (x << C1) op C2, check if we can use a smaller
+ // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+ break;
+
+ // i8 is unshrinkable, i16 should be promoted to i32.
+ if (NVT != MVT::i32 && NVT != MVT::i64)
+ break;
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!Cst || !ShlCst)
+ break;
+
+ int64_t Val = Cst->getSExtValue();
+ uint64_t ShlVal = ShlCst->getZExtValue();
+
+ // Make sure that we don't change the operation by removing bits.
+ // This only matters for OR and XOR, AND is unaffected.
+ uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+ if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+ break;
+
+ unsigned ShlOp, AddOp, Op;
+ MVT CstVT = NVT;
+
+ // Check the minimum bitwidth for the new constant.
+ // TODO: AND32ri is the same as AND64ri32 with zext imm.
+ // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
+ // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+ if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
+ CstVT = MVT::i8;
+ else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
+ CstVT = MVT::i32;
+
+ // Bail if there is no smaller encoding.
+ if (NVT == CstVT)
+ break;
+
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i32:
+ assert(CstVT == MVT::i8);
+ ShlOp = X86::SHL32ri;
+ AddOp = X86::ADD32rr;
+
+ switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
+ case ISD::AND: Op = X86::AND32ri8; break;
+ case ISD::OR: Op = X86::OR32ri8; break;
+ case ISD::XOR: Op = X86::XOR32ri8; break;
+ }
+ break;
+ case MVT::i64:
+ assert(CstVT == MVT::i8 || CstVT == MVT::i32);
+ ShlOp = X86::SHL64ri;
+ AddOp = X86::ADD64rr;
+
+ switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
+ case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
+ case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
+ case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+ }
+ break;
+ }
+
+ // Emit the smaller op and the shift.
+ SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
+ SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+ if (ShlVal == 1)
+ CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
+ SDValue(New, 0));
+ else
+ CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+ getI8Imm(ShlVal, dl));
+ return;
+ }
+ case X86ISD::UMUL8:
+ case X86ISD::SMUL8: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
+ N0, SDValue()).getValue(1);
+
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32);
+ SDValue Ops[] = {N1, InFlag};
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+
+ ReplaceNode(Node, CNode);
+ return;
+ }
+
+ case X86ISD::UMUL: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ unsigned LoReg;
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break;
+ case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break;
+ case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
+ case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ N0, SDValue()).getValue(1);
+
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
+ SDValue Ops[] = {N1, InFlag};
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+
+ ReplaceNode(Node, CNode);
+ return;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SMUL_LOHI;
+ bool hasBMI2 = Subtarget->hasBMI2();
+ if (!isSigned) {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
+ case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+ case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
+ MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
+ case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
+ MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
+ }
+ } else {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
+ case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+ case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+ case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+ }
+ }
+
+ unsigned SrcReg, LoReg, HiReg;
+ switch (Opc) {
+ default: llvm_unreachable("Unknown MUL opcode!");
+ case X86::IMUL8r:
+ case X86::MUL8r:
+ SrcReg = LoReg = X86::AL; HiReg = X86::AH;
+ break;
+ case X86::IMUL16r:
+ case X86::MUL16r:
+ SrcReg = LoReg = X86::AX; HiReg = X86::DX;
+ break;
+ case X86::IMUL32r:
+ case X86::MUL32r:
+ SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+ break;
+ case X86::IMUL64r:
+ case X86::MUL64r:
+ SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+ break;
+ case X86::MULX32rr:
+ SrcReg = X86::EDX; LoReg = HiReg = 0;
+ break;
+ case X86::MULX64rr:
+ SrcReg = X86::RDX; LoReg = HiReg = 0;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ // Multiply is commutative.
+ if (!foldedLoad) {
+ foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ if (foldedLoad)
+ std::swap(N0, N1);
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
+ N0, SDValue()).getValue(1);
+ SDValue ResHi, ResLo;
+
+ if (foldedLoad) {
+ SDValue Chain;
+ MachineSDNode *CNode = nullptr;
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ Chain = SDValue(CNode, 2);
+ InFlag = SDValue(CNode, 3);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ Chain = SDValue(CNode, 0);
+ InFlag = SDValue(CNode, 1);
+ }
+
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), Chain);
+ // Record the mem-refs
+ LoadSDNode *LoadNode = cast<LoadSDNode>(N1);
+ if (LoadNode) {
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = LoadNode->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
+ }
+ } else {
+ SDValue Ops[] = { N1, InFlag };
+ if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ InFlag = SDValue(CNode, 2);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 0);
+ }
+ }
+
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ if (HiReg == X86::AH && Subtarget->is64Bit() &&
+ !SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ // Get the low part if needed. Don't use getCopyFromReg for aliasing
+ // registers.
+ if (!SDValue(Node, 0).use_empty())
+ ReplaceUses(SDValue(Node, 1),
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+ // Shift AX down 8 bits.
+ Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, dl, MVT::i8)),
+ 0);
+ // Then truncate it down to i8.
+ ReplaceUses(SDValue(Node, 1),
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+ }
+ // Copy the low half of the result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ if (!ResLo.getNode()) {
+ assert(LoReg && "Register for low half is not defined!");
+ ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
+ InFlag);
+ InFlag = ResLo.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 0), ResLo);
+ DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the high half of the result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ if (!ResHi.getNode()) {
+ assert(HiReg && "Register for high half is not defined!");
+ ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
+ InFlag);
+ InFlag = ResHi.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 1), ResHi);
+ DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+
+ return;
+ }
+
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ case X86ISD::SDIVREM8_SEXT_HREG:
+ case X86ISD::UDIVREM8_ZEXT_HREG: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = (Opcode == ISD::SDIVREM ||
+ Opcode == X86ISD::SDIVREM8_SEXT_HREG);
+ if (!isSigned) {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ }
+ } else {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ }
+ }
+
+ unsigned LoReg, HiReg, ClrReg;
+ unsigned SExtOpcode;
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL; ClrReg = HiReg = X86::AH;
+ SExtOpcode = X86::CBW;
+ break;
+ case MVT::i16:
+ LoReg = X86::AX; HiReg = X86::DX;
+ ClrReg = X86::DX;
+ SExtOpcode = X86::CWD;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
+ SExtOpcode = X86::CDQ;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
+ SExtOpcode = X86::CQO;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+ SDValue InFlag;
+ if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+ // Special case for div8, just use a move with zero extension to AX to
+ // clear the upper 8 bits (AH).
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+ if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
+ MVT::Other, Ops), 0);
+ Chain = Move.getValue(1);
+ ReplaceUses(N0.getValue(1), Chain);
+ } else {
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
+ Chain = CurDAG->getEntryNode();
+ }
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
+ InFlag = Chain.getValue(1);
+ } else {
+ InFlag =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+ LoReg, N0, SDValue()).getValue(1);
+ if (isSigned && !signBitIsZero) {
+ // Sign extend the low part into the high part.
+ InFlag =
+ SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
+ } else {
+ // Zero out the high part, effectively zero extending the input.
+ SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
+ switch (NVT.SimpleTy) {
+ case MVT::i16:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
+ CurDAG->getTargetConstant(X86::sub_16bit, dl,
+ MVT::i32)),
+ 0);
+ break;
+ case MVT::i32:
+ break;
+ case MVT::i64:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl,
+ MVT::i32)),
+ 0);
+ break;
+ default:
+ llvm_unreachable("Unexpected division source");
+ }
+
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
+ ClrNode, InFlag).getValue(1);
+ }
+ }
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
+ }
+
+ // Prevent use of AH in a REX instruction by explicitly copying it to
+ // an ABCD_L register.
+ //
+ // The current assumption of the register allocator is that isel
+ // won't generate explicit references to the GR8_ABCD_H registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
+ if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
+ SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
+ unsigned AHExtOpcode =
+ isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
+
+ SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
+ MVT::Glue, AHCopy, InFlag);
+ SDValue Result(RNode, 0);
+ InFlag = SDValue(RNode, 1);
+
+ if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
+ Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
+ if (Node->getValueType(1) == MVT::i64) {
+ // It's not possible to directly movsx AH to a 64bit register, because
+ // the latter needs the REX prefix, but the former can't have it.
+ assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG &&
+ "Unexpected i64 sext of h-register");
+ Result =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), Result,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl,
+ MVT::i32)),
+ 0);
+ }
+ } else {
+ Result =
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
+ }
+ ReplaceUses(SDValue(Node, 1), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the division (low) result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 0), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the remainder (high) result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 1), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ return;
+ }
+
+ case X86ISD::CMP:
+ case X86ISD::SUB: {
+ // Sometimes a SUB is used to perform comparison.
+ if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
+ // This node is not a CMP.
+ break;
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+ hasNoSignedComparisonUses(Node))
+ N0 = N0.getOperand(0);
+
+ // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+ // use a smaller encoding.
+ // Look past the truncate if CMP is the only use of it.
+ if ((N0.getNode()->getOpcode() == ISD::AND ||
+ (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
+ N0.getNode()->hasOneUse() &&
+ N0.getValueType() != MVT::i8 &&
+ X86::isZeroNode(N1)) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+ if (!C) break;
+
+ // For example, convert "testl %eax, $8" to "testb %al, $8"
+ if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
+ (!(C->getZExtValue() & 0x80) ||
+ hasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // On x86-32, only the ABCD registers have 8-bit subregisters.
+ if (!Subtarget->is64Bit()) {
+ const TargetRegisterClass *TRC;
+ switch (N0.getSimpleValueType().SimpleTy) {
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+ }
+
+ // Extract the l-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+ Subreg, Imm);
+ // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+ // one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return;
+ }
+
+ // For example, "testl %eax, $2048" to "testb %ah, $8".
+ if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ hasNoSignedComparisonUses(Node))) {
+ // Shift the immediate right by 8 bits.
+ SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
+ dl, MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Put the value in an ABCD register.
+ const TargetRegisterClass *TRC;
+ switch (N0.getSimpleValueType().SimpleTy) {
+ case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+
+ // Extract the h-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
+ // target GR8_NOREX registers, so make sure the register class is
+ // forced.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
+ MVT::i32, Subreg, ShiftedImm);
+ // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+ // one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return;
+ }
+
+ // For example, "testl %eax, $32776" to "testw %ax, $32776".
+ if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+ N0.getValueType() != MVT::i16 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ hasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
+ MVT::i16);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 16-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
+ MVT::i16, Reg);
+
+ // Emit a testw.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+ Subreg, Imm);
+ // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+ // one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return;
+ }
+
+ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+ if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+ N0.getValueType() == MVT::i64 &&
+ (!(C->getZExtValue() & 0x80000000) ||
+ hasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
+ MVT::i32);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 32-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
+ MVT::i32, Reg);
+
+ // Emit a testl.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+ Subreg, Imm);
+ // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+ // one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return;
+ }
+ }
+ break;
+ }
+ case ISD::STORE: {
+ // Change a chain of {load; incr or dec; store} of the same value into
+ // a simple increment or decrement through memory of that value, if the
+ // uses of the modified value and its address are suitable.
+ // The DEC64m tablegen pattern is currently not able to match the case where
+ // the EFLAGS on the original DEC are used. (This also applies to
+ // {INC,DEC}X{64,32,16,8}.)
+ // We'll need to improve tablegen to allow flags to be transferred from a
+ // node in the pattern to the result node. probably with a new keyword
+ // for example, we have this
+ // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ // (implicit EFLAGS)]>;
+ // but maybe need something like this
+ // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ // (transferrable EFLAGS)]>;
+
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+ SDValue StoredVal = StoreNode->getOperand(1);
+ unsigned Opc = StoredVal->getOpcode();
+
+ LoadSDNode *LoadNode = nullptr;
+ SDValue InputChain;
+ if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
+ LoadNode, InputChain))
+ break;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectAddr(LoadNode, LoadNode->getBasePtr(),
+ Base, Scale, Index, Disp, Segment))
+ break;
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+ MemOp[0] = StoreNode->getMemOperand();
+ MemOp[1] = LoadNode->getMemOperand();
+ const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
+ EVT LdVT = LoadNode->getMemoryVT();
+ unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
+ MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
+ SDLoc(Node),
+ MVT::i32, MVT::Other, Ops);
+ Result->setMemRefs(MemOp, MemOp + 2);
+
+ ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ }
+
+ SelectCode(Node);
+}
+
+bool X86DAGToDAGISel:: // Selects the address for an inline-asm memory operand.
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) { // Returns true on failure, false on success.
+ SDValue Op0, Op1, Op2, Op3, Op4; // Filled by selectAddr; presumably base, scale, index, disp, segment (TODO confirm).
+ switch (ConstraintID) {
+ default:
+ llvm_unreachable("Unexpected asm memory constraint");
+ case InlineAsm::Constraint_i:
+ // FIXME: It seems strange that 'i' is needed here since it's supposed to
+ // be an immediate and not a memory constraint.
+ LLVM_FALLTHROUGH;
+ case InlineAsm::Constraint_o: // offsetable ??
+ case InlineAsm::Constraint_v: // not offsetable ??
+ case InlineAsm::Constraint_m: // memory
+ case InlineAsm::Constraint_X: // All five constraints share the selectAddr path below.
+ if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) // nullptr goes where other call sites pass the owning node.
+ return true; // selectAddr rejected Op; OutOps is left untouched.
+ break;
+ }
+
+ OutOps.push_back(Op0); // Append the five selected operands, in order.
+ OutOps.push_back(Op1);
+ OutOps.push_back(Op2);
+ OutOps.push_back(Op3);
+ OutOps.push_back(Op4);
+ return false;
+}
+
+/// Allocates the pass that converts a legalized DAG into an X86-specific DAG,
+/// ready for instruction scheduling; TM and OptLevel go to its constructor.
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new X86DAGToDAGISel(TM, OptLevel); // Returned as a raw FunctionPass pointer.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 000000000000..b293dfa98f82
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,34395 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "X86CallingConv.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86IntrinsicsInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <bitset>
+#include <cctype>
+#include <numeric>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls"); // Declares the NumTailCalls statistic counter.
+
+static cl::opt<bool> ExperimentalVectorWideningLegalization( // Boolean command-line option.
+ "x86-experimental-vector-widening-legalization", cl::init(false), // Off unless set explicitly.
+ cl::desc("Enable an experimental vector type legalization through widening "
+ "rather than promotion."),
+ cl::Hidden); // Hidden option: not listed by plain -help (see LLVM CommandLine docs).
+
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI)
+ : TargetLowering(TM), Subtarget(STI) {
+ bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
+ X86ScalarSSEf64 = Subtarget.hasSSE2();
+ X86ScalarSSEf32 = Subtarget.hasSSE1();
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+
+ // Set up the TargetLowering object.
+
+ // X86 is weird. It always uses i8 for shift amounts and setcc results.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // For 64-bit, since we have so many registers, use the ILP scheduler.
+ // For 32-bit, use the register pressure specific scheduling.
+ // For Atom, always use ILP scheduling.
+ if (Subtarget.isAtom())
+ setSchedulingPreference(Sched::ILP);
+ else if (Subtarget.is64Bit())
+ setSchedulingPreference(Sched::ILP);
+ else
+ setSchedulingPreference(Sched::RegPressure);
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+ // Bypass expensive divides on Atom when compiling with O2.
+ if (TM.getOptLevel() >= CodeGenOpt::Default) {
+ if (Subtarget.hasSlowDivide32())
+ addBypassSlowDiv(32, 8);
+ if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
+ addBypassSlowDiv(64, 16);
+ }
+
+ if (Subtarget.isTargetKnownWindowsMSVC() ||
+ Subtarget.isTargetWindowsItanium()) {
+ // Setup Windows compiler runtime calls.
+ setLibcallName(RTLIB::SDIV_I64, "_alldiv");
+ setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
+ setLibcallName(RTLIB::SREM_I64, "_allrem");
+ setLibcallName(RTLIB::UREM_I64, "_aullrem");
+ setLibcallName(RTLIB::MUL_I64, "_allmul");
+ setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
+ }
+
+ if (Subtarget.isTargetDarwin()) {
+ // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(false);
+ setUseUnderscoreLongJmp(false);
+ } else if (Subtarget.isTargetWindowsGNU()) {
+ // MS runtime is weird: it exports _setjmp, but longjmp!
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(false);
+ } else {
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+ }
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, &X86::GR8RegClass);
+ addRegisterClass(MVT::i16, &X86::GR16RegClass);
+ addRegisterClass(MVT::i32, &X86::GR32RegClass);
+ if (Subtarget.is64Bit())
+ addRegisterClass(MVT::i64, &X86::GR64RegClass);
+
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+ // We don't accept any truncstore of integer registers.
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // SETOEQ and SETUNE require checking two conditions.
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
+
+ if (Subtarget.is64Bit()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
+ // f32/f64 are legal, f80 is custom.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ else
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ } else if (!Subtarget.useSoftFloat()) {
+ // We have an algorithm for SSE2->double, and we turn this into a
+ // 64-bit FILD followed by conditional FADD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ // We have an algorithm for SSE2, and we turn this into a 64-bit
+ // FILD or VCVTUSI2SS/SD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ }
+
+ // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
+
+ if (!Subtarget.useSoftFloat()) {
+ // SSE has no i16 to fp conversion, only i32.
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ }
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
+ }
+
+ // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
+
+ if (!Subtarget.useSoftFloat()) {
+ // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
+ // are Legal, f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
+
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ }
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
+ }
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
+
+ if (Subtarget.is64Bit()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
+ // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
+ }
+ } else if (!Subtarget.useSoftFloat()) {
+ // Since AVX is a superset of SSE3, only check for SSE here.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
+ // Expand FP_TO_UINT into a select.
+ // FIXME: We would like to use a Custom expander here eventually to do
+ // the optimal thing for SSE vs. the default expansion in the legalizer.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
+ else
+ // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
+ // With SSE3 we can use fisttpll to convert to a signed i64; without
+ // SSE, we're stuck with a fistpll.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ }
+
+ // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+ if (!X86ScalarSSEf64) {
+ setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
+ setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
+ // Without SSE, i64->f64 goes through memory.
+ setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
+ }
+ } else if (!Subtarget.is64Bit())
+ setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
+
+ // Scalar integer divide and remainder are lowered to use operations that
+ // produce two results, to match the available instructions. This exposes
+ // the two-result form to trivial CSE, which is able to combine x/y and x%y
+ // into a single instruction.
+ //
+ // Scalar integer multiply-high is also lowered to use two-result
+ // operations, to match the available instructions. However, plain multiply
+ // (low) operations are left as Legal, as there are single-result
+ // instructions for this in x86. Using the two-result multiply instructions
+ // when both high and low results are needed must be arranged by dagcombine.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ }
+
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
+ setOperationAction(ISD::ADDC, VT, Custom);
+ setOperationAction(ISD::ADDE, VT, Custom);
+ setOperationAction(ISD::SUBC, VT, Custom);
+ setOperationAction(ISD::SUBE, VT, Custom);
+ }
+
+ setOperationAction(ISD::BR_JT , MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND , MVT::Other, Custom);
+ for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
+ MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::BR_CC, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ }
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+ setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
+
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f64 , Expand);
+ setOperationAction(ISD::FREM , MVT::f80 , Expand);
+ setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+
+ // Promote the i8 variants and force them on up to i32 which has a shorter
+ // encoding.
+ setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ if (!Subtarget.hasBMI()) {
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
+ }
+ }
+
+ if (Subtarget.hasLZCNT()) {
+ // When promoting the i8 variants, force them to i32 for a shorter
+ // encoding.
+ setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ } else {
+ setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ }
+ }
+
+ // Special handling for half-precision floating point conversions.
+ // If we don't have F16C support, then lower half float conversions
+ // into library calls.
+ if (Subtarget.useSoftFloat() ||
+ (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ }
+
+ // There's never any support for operations beyond MVT::f32.
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+
+ if (Subtarget.hasPOPCNT()) {
+ setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
+ } else {
+ setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ }
+
+ setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
+
+ if (!Subtarget.hasMOVBE())
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ // These should be promoted to a larger select which is supported.
+ setOperationAction(ISD::SELECT , MVT::i1 , Promote);
+ // X86 wants to expand cmov itself.
+ for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ }
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SETCCE, VT, Custom);
+ }
+ setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+ // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+ // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+ // support continuation, user-level threading, etc. As a result, no
+ // other SjLj exception interfaces are implemented and please don't build
+ // your own exception handling based on them.
+ // LLVM/Clang supports zero-cost DWARF exception handling.
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+ if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
+ setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+
+ // Darwin ABI issue.
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::ConstantPool , VT, Custom);
+ setOperationAction(ISD::JumpTable , VT, Custom);
+ setOperationAction(ISD::GlobalAddress , VT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
+ setOperationAction(ISD::ExternalSymbol , VT, Custom);
+ setOperationAction(ISD::BlockAddress , VT, Custom);
+ }
+ // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::SHL_PARTS, VT, Custom);
+ setOperationAction(ISD::SRA_PARTS, VT, Custom);
+ setOperationAction(ISD::SRL_PARTS, VT, Custom);
+ }
+
+ if (Subtarget.hasSSE1())
+ setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
+
+ setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
+
+ // Expand certain atomics
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+ }
+
+ if (Subtarget.hasCmpxchg16b()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
+ }
+
+ // FIXME - use subtarget debug flags
+ if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
+ !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
+ TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ bool Is64Bit = Subtarget.is64Bit();
+ setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+
+ // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
+ setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
+ setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
+
+ if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
+ // f32 and f64 use SSE.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
+ : &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
+ : &X86::FR64RegClass);
+
+ for (auto VT : { MVT::f32, MVT::f64 }) {
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS, VT, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG, VT, Custom);
+
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , VT, Expand);
+ setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ }
+
+ // Lower this to MOVMSK plus an AND.
+ setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+ setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
+ // Expand FP immediates into loads from the stack, except for the special
+ // cases we handle.
+ addLegalFPImmediate(APFloat(+0.0)); // xorpd
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ } else if (UseX87 && X86ScalarSSEf32) {
+ // Use SSE for f32, x87 for f64.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
+ : &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+
+ // Use ANDPS to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+
+ // Use ANDPS and ORPS to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+ // Special cases we handle for FP constants.
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ }
+ } else if (UseX87) {
+ // f32 and f64 in x87.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+ addRegisterClass(MVT::f32, &X86::RFP32RegClass);
+
+ for (auto VT : { MVT::f32, MVT::f64 }) {
+ setOperationAction(ISD::UNDEF, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , VT, Expand);
+ setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ }
+ }
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+ addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+ }
+
+ // We don't support FMA.
+ setOperationAction(ISD::FMA, MVT::f64, Expand);
+ setOperationAction(ISD::FMA, MVT::f32, Expand);
+
+ // Long double always uses X87, except f128 in MMX.
+ if (UseX87) {
+ if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
+ addRegisterClass(MVT::f128, &X86::FR128RegClass);
+ ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+ setOperationAction(ISD::FABS , MVT::f128, Custom);
+ setOperationAction(ISD::FNEG , MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+ }
+
+ addRegisterClass(MVT::f80, &X86::RFP80RegClass);
+ setOperationAction(ISD::UNDEF, MVT::f80, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+ {
+ APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
+ addLegalFPImmediate(TmpFlt); // FLD0
+ TmpFlt.changeSign();
+ addLegalFPImmediate(TmpFlt); // FLD0/FCHS
+
+ bool ignored;
+ APFloat TmpFlt2(+1.0);
+ TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt2); // FLD1
+ TmpFlt2.changeSign();
+ addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
+ }
+
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f80, Expand);
+ setOperationAction(ISD::FCOS , MVT::f80, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
+ }
+
+ setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f80, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+ setOperationAction(ISD::FRINT, MVT::f80, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
+ setOperationAction(ISD::FMA, MVT::f80, Expand);
+ }
+
+ // Always use a library call for pow.
+ setOperationAction(ISD::FPOW , MVT::f32 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f64 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f80 , Expand);
+
+ setOperationAction(ISD::FLOG, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+ setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
+
+ // Some FP actions are always expanded for vector types.
+ for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
+ MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ }
+
+ // First set operation action for all vector types to either promote
+ // (for widening) or expand (for scalarization). Then we will selectively
+ // turn on ones that can be effectively codegen'd.
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::SETCC, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+ setOperationAction(ISD::TRUNCATE, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
+ setOperationAction(ISD::ANY_EXTEND, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(InnerVT, VT, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
+
+ // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
+ // types, we have to deal with them whether we ask for Expansion or not.
+ // Setting Expand causes its own optimisation problems though, so leave
+ // them legal.
+ if (VT.getVectorElementType() == MVT::i1)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+
+ // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
+ // split/scalarized right now.
+ if (VT.getVectorElementType() == MVT::f16)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+ }
+ }
+
+ // FIXME: In order to prevent SSE instructions being expanded to MMX ones
+ // with -msoft-float, disable use of MMX as well.
+ if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
+ addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
+ // No operations on x86mmx supported, everything uses intrinsics.
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
+ addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+
+ setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
+ addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+
+ // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
+ // registers cannot be used even for integer operations.
+ addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+ addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+ addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+ addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+
+ setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
+
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
+
+ // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+
+ // We support custom legalizing of sext and anyext loads for specific
+ // memory vector types which we can load as a scalar (or sequence of
+ // scalars) and extend in-register to a legal 128-bit vector type. For sext
+ // loads these must work with a single scalar load.
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
+ }
+
+ for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+
+ if (VT == MVT::v2i64 && !Subtarget.is64Bit())
+ continue;
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+
+ // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
+ }
+
+ // Custom lower v2i64 and v2f64 selects.
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+
+ // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
+
+ setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
+
+ for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
+ for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ }
+
+ setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
+
+ // FIXME: Do we need to handle scalar-to-vector here?
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+
+ // We directly match byte blends in the backend as they match the VSELECT
+ // condition form.
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+
+ // SSE41 brings specific instructions for doing vector sign extend even in
+ // cases where we don't have SRA.
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+ }
+
+ // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+ // i8 vectors are custom because the source register and source
+ // memory operand types are not the same width.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::ROTL, VT, Custom);
+
+ // XOP can efficiently perform BITREVERSE with VPPERM.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
+ bool HasInt256 = Subtarget.hasInt256();
+
+ addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+
+ for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ }
+
+ // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
+ // even though v8i16 is a legal type.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
+
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+
+ setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ }
+
+ if (Subtarget.hasAnyFMA()) {
+ for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
+ MVT::v2f64, MVT::v4f64 })
+ setOperationAction(ISD::FMA, VT, Legal);
+ }
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
+ }
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
+ }
+
+ if (HasInt256) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
+
+ // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
+ // when we have a 256bit-wide blend with immediate.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+
+ // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+ }
+
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
+
+ // Extract subvector is special because the value type
+ // (result) is 128-bit but the source is 256-bit wide.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v4f32, MVT::v2f64 }) {
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ }
+
+ // Custom lower several nodes for 256-bit types.
+ for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v8f32, MVT::v4f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ }
+
+ if (HasInt256)
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+ // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
+ }
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
+ addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+
+ addRegisterClass(MVT::i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
+ addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
+
+ for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+ setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
+ setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
+ }
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SETCC, MVT::i1, Custom);
+ setOperationAction(ISD::SETCCE, MVT::i1, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::XOR, MVT::i1, Legal);
+ setOperationAction(ISD::OR, MVT::i1, Legal);
+ setOperationAction(ISD::AND, MVT::i1, Legal);
+ setOperationAction(ISD::SUB, MVT::i1, Custom);
+ setOperationAction(ISD::ADD, MVT::i1, Custom);
+ setOperationAction(ISD::MUL, MVT::i1, Custom);
+
+ for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
+ MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
+ MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
+ setTruncStoreAction(VT, MaskVT, Custom);
+ }
+
+ for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ }
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
+
+ setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+ if (Subtarget.hasVLX()){
+ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+ } else {
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ }
+ }
+ setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
+ if (Subtarget.hasDQI()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+
+ if (Subtarget.hasVLX()) {
+ // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ }
+ }
+ if (Subtarget.hasVLX()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
+
+ // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+ }
+
+ setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+ if (Subtarget.hasDQI()) {
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
+ }
+ for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ }
+
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
+
+ // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
+
+ setOperationAction(ISD::ADD, MVT::v8i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v16i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v8i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v16i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v8i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v16i1, Expand);
+
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+
+ for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ }
+
+ // Need to promote to 64-bit even though we have 32-bit masked instructions
+ // because the IR optimizers rearrange bitcasts around logic ops leaving
+ // too many variations to handle if we don't promote them.
+ setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
+ setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
+ setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
+
+ if (Subtarget.hasCDI()) {
+ setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
+
+ if (Subtarget.hasVLX()) {
+ setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
+ } else {
+ setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ }
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ } // Subtarget.hasCDI()
+
+ if (Subtarget.hasDQI()) {
+ // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i64, Legal);
+ }
+
+ // Custom lower several nodes.
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+ // Extract subvector is special because the value type
+ // (result) is 256-bit but the source is 512-bit wide.
+ // 128-bit was made Custom under AVX1.
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v8f32, MVT::v4f64 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
+ MVT::v16i1, MVT::v32i1, MVT::v64i1 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+ for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Legal);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
+ setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
+ setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
+ }
+ }// has AVX-512
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
+
+ addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
+ addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
+
+ setOperationAction(ISD::ADD, MVT::v32i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v64i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v32i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v64i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v32i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v64i1, Expand);
+
+ setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
+ setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
+ setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
+ setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
+
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
+
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
+ if (Subtarget.hasVLX()) {
+ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+ }
+
+ LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
+ for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
+ setOperationAction(ISD::MLOAD, VT, Action);
+ setOperationAction(ISD::MSTORE, VT, Action);
+ }
+
+ if (Subtarget.hasCDI()) {
+ setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
+ }
+
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+
+ setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
+ setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
+ setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
+ }
+
+ for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
+ if (Subtarget.hasVLX()) {
+ // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
+ setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
+ }
+ }
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
+ addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
+ addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+
+ for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
+ setOperationAction(ISD::ADD, VT, Expand);
+ setOperationAction(ISD::SUB, VT, Expand);
+ setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ }
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
+
+ for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ }
+ }
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ if (!Subtarget.is64Bit()) {
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ }
+
+ // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
+ // handle type legalization for these operations here.
+ //
+ // FIXME: We really should do custom legalization for addition and
+ // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
+ // than generic legalization for 64-bit multiplication-with-overflow, though.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ // Add/Sub/Mul with overflow operations are custom lowered.
+ setOperationAction(ISD::SADDO, VT, Custom);
+ setOperationAction(ISD::UADDO, VT, Custom);
+ setOperationAction(ISD::SSUBO, VT, Custom);
+ setOperationAction(ISD::USUBO, VT, Custom);
+ setOperationAction(ISD::SMULO, VT, Custom);
+ setOperationAction(ISD::UMULO, VT, Custom);
+ }
+
+ if (!Subtarget.is64Bit()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ }
+
+ // Combine sin / cos into one node or libcall if possible.
+ if (Subtarget.hasSinCos()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ if (Subtarget.isTargetDarwin()) {
+ // For MacOSX, we don't want the normal expansion of a libcall to sincos.
+ // We want to issue a libcall to __sincos_stret to avoid memory traffic.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ }
+ }
+
+ if (Subtarget.isTargetWin64()) {
+ setOperationAction(ISD::SDIV, MVT::i128, Custom);
+ setOperationAction(ISD::UDIV, MVT::i128, Custom);
+ setOperationAction(ISD::SREM, MVT::i128, Custom);
+ setOperationAction(ISD::UREM, MVT::i128, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+ }
+
+ // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
+ // is. We should promote the value to 64-bits to solve this.
+ // This is what the CRT headers do - `fmodf` is an inline header
+ // function casting to f64 and calling `fmod`.
+ if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
+ Subtarget.isTargetWindowsItanium()))
+ for (ISD::NodeType Op :
+ {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
+ ISD::FLOG10, ISD::FPOW, ISD::FSIN})
+ if (isOperationExpand(Op, MVT::f32))
+ setOperationAction(Op, MVT::f32, Promote);
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
+ setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::FMINNUM);
+ setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::MLOAD);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::MSTORE);
+ setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::MSCATTER);
+ setTargetDAGCombine(ISD::MGATHER);
+
+ computeRegisterProperties(Subtarget.getRegisterInfo());
+
+ MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = 4;
+ MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = 4;
+ setPrefLoopAlignment(4); // 2^4 bytes.
+
+ // An out-of-order CPU can speculatively execute past a predictable branch,
+ // but a conditional move could be stalled by an expensive earlier operation.
+ PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
+ EnableExtLdPromotion = true;
+ setPrefFunctionAlignment(4); // 2^4 bytes.
+
+ verifyIntrinsicTables();
+}
+
+/// Reports whether the load-stack-guard node should be used.
+/// This has so far only been implemented for 64-bit MachO, so the answer is
+/// true only when the subtarget is both MachO and 64-bit.
+bool X86TargetLowering::useLoadStackGuardNode() const {
+  if (!Subtarget.isTargetMachO())
+    return false;
+  return Subtarget.is64Bit();
+}
+
+/// Selects how an illegal vector type \p VT should be legalized.
+///
+/// When ExperimentalVectorWideningLegalization is set, every vector that does
+/// not have exactly one element and whose element type is not i1 is widened;
+/// all other cases defer to the TargetLoweringBase default.
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+  const bool PreferWidening =
+      ExperimentalVectorWideningLegalization &&
+      VT.getVectorNumElements() != 1 &&
+      VT.getVectorElementType().getSimpleVT() != MVT::i1;
+  if (!PreferWidening)
+    return TargetLoweringBase::getPreferredVectorAction(VT);
+
+  return TypeWidenVector;
+}
+
+/// Chooses the value type produced by a SETCC whose operands have type \p VT.
+///
+/// Scalars compare to i1 when the subtarget has AVX-512 and to i8 otherwise.
+/// Simple vector types may map to a vXi1 mask type depending on the
+/// hasAVX512() / hasBWI() / hasVLX() features of the subtarget; every
+/// remaining vector type falls back to the integer vector of the same shape.
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
+                                          LLVMContext& Context,
+                                          EVT VT) const {
+  if (!VT.isVector()) {
+    if (Subtarget.hasAVX512())
+      return MVT::i1;
+    return MVT::i8;
+  }
+
+  // Extended (non-simple) vector types never get a mask type.
+  if (!VT.isSimple())
+    return VT.changeVectorElementTypeToInteger();
+
+  const MVT SimpleVT = VT.getSimpleVT();
+  const unsigned EltCount = SimpleVT.getVectorNumElements();
+  MVT ScalarVT = SimpleVT.getVectorElementType();
+
+  if (SimpleVT.is512BitVector()) {
+    // 512-bit vectors of 32/64-bit elements need hasAVX512().
+    if (Subtarget.hasAVX512() &&
+        (ScalarVT == MVT::i32 || ScalarVT == MVT::i64 ||
+         ScalarVT == MVT::f32 || ScalarVT == MVT::f64)) {
+      if (EltCount == 8)
+        return MVT::v8i1;
+      if (EltCount == 16)
+        return MVT::v16i1;
+    }
+    // 512-bit vectors of 8/16-bit elements need hasBWI().
+    if (Subtarget.hasBWI() &&
+        (ScalarVT == MVT::i8 || ScalarVT == MVT::i16)) {
+      if (EltCount == 32)
+        return MVT::v32i1;
+      if (EltCount == 64)
+        return MVT::v64i1;
+    }
+  }
+
+  if (Subtarget.hasBWI() && Subtarget.hasVLX())
+    return MVT::getVectorVT(MVT::i1, EltCount);
+
+  // For an illegal type that will be integer-promoted, judge the element
+  // width by the type it is transformed to.
+  if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
+    const EVT PromotedVT = getTypeToTransformTo(Context, VT);
+    ScalarVT = PromotedVT.getVectorElementType().getSimpleVT();
+  }
+
+  if (Subtarget.hasVLX() && ScalarVT.getSizeInBits() >= 32) {
+    if (EltCount == 2)
+      return MVT::v2i1;
+    if (EltCount == 4)
+      return MVT::v4i1;
+    if (EltCount == 8)
+      return MVT::v8i1;
+  }
+
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// Helper for getByValTypeAlignment: raises \p MaxAlign to 16 when \p Ty is,
+/// or (through arrays and structs) contains, a 128-bit vector.
+///
+/// \p MaxAlign is only ever increased; once it reaches 16 the search stops.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+  if (MaxAlign == 16)
+    return;
+
+  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+    if (VecTy->getBitWidth() == 128)
+      MaxAlign = 16;
+    return;
+  }
+
+  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+    // An array is as aligned as its element type.
+    unsigned ElemAlign = 0;
+    getMaxByValAlign(ArrTy->getElementType(), ElemAlign);
+    if (ElemAlign > MaxAlign)
+      MaxAlign = ElemAlign;
+    return;
+  }
+
+  StructType *StructTy = dyn_cast<StructType>(Ty);
+  if (!StructTy)
+    return;
+
+  // A struct takes the largest alignment found among its members.
+  for (auto *MemberTy : StructTy->elements()) {
+    unsigned MemberAlign = 0;
+    getMaxByValAlign(MemberTy, MemberAlign);
+    if (MemberAlign > MaxAlign)
+      MaxAlign = MemberAlign;
+    if (MaxAlign == 16)
+      return;
+  }
+}
+
+/// Computes the alignment used for a ByVal aggregate argument of type \p Ty in
+/// the caller's parameter area.
+///
+/// On 64-bit subtargets this is the larger of 8 and the ABI alignment of the
+/// type. Otherwise the result is 4, raised to 16 when SSE1 is available and
+/// the aggregate contains a 128-bit vector.
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
+                                                  const DataLayout &DL) const {
+  if (!Subtarget.is64Bit()) {
+    unsigned Result = 4;
+    if (Subtarget.hasSSE1())
+      getMaxByValAlign(Ty, Result);
+    return Result;
+  }
+
+  // Max of 8 and alignment of type.
+  const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
+  return ABIAlign > 8 ? ABIAlign : 8u;
+}
+
+/// Returns the target-specific optimal type for the load and store operations
+/// produced when lowering memset, memcpy, and memmove.
+///
+/// If \p DstAlign is zero, the destination alignment can satisfy any
+/// constraint. Similarly, if \p SrcAlign is zero there is no need to check it
+/// against an alignment requirement, probably because the source does not
+/// need to be loaded. \p IsMemset is true when expanding a memset, and
+/// \p ZeroMemset is true when that memset stores zero. \p MemcpyStrSrc
+/// indicates that the memcpy source is constant and so does not need to be
+/// loaded.
+///
+/// Every path through this implementation returns a concrete type: a vector
+/// or f64 type when implicit floating point is permitted and the size,
+/// alignment and subtarget features allow it, otherwise i64 or i32.
+EVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+                                       unsigned DstAlign, unsigned SrcAlign,
+                                       bool IsMemset, bool ZeroMemset,
+                                       bool MemcpyStrSrc,
+                                       MachineFunction &MF) const {
+  const Function *Fn = MF.getFunction();
+  const bool ImplicitFloatAllowed =
+      !Fn->hasFnAttribute(Attribute::NoImplicitFloat);
+
+  if (ImplicitFloatAllowed) {
+    const bool DstAligned16 = DstAlign == 0 || DstAlign >= 16;
+    const bool SrcAligned16 = SrcAlign == 0 || SrcAlign >= 16;
+    const bool Use16ByteOps =
+        Size >= 16 &&
+        (!Subtarget.isUnalignedMem16Slow() || (DstAligned16 && SrcAligned16));
+
+    if (Use16ByteOps) {
+      // FIXME: Check if unaligned 32-byte accesses are slow.
+      // Although v32i8 isn't a well-supported type for AVX1, we'll let
+      // legalization and shuffle lowering produce the optimal codegen. If we
+      // choose an optimal type with a vector element larger than a byte,
+      // getMemsetStores() may create an intermediate splat (using an integer
+      // multiply) before we splat as a vector.
+      if (Size >= 32 && Subtarget.hasAVX())
+        return MVT::v32i8;
+      if (Subtarget.hasSSE2())
+        return MVT::v16i8;
+      // TODO: Can SSE1 handle a byte vector?
+      if (Subtarget.hasSSE1())
+        return MVT::v4f32;
+    } else {
+      // Do not use f64 to lower memcpy if source is string constant. It's
+      // better to use i32 to avoid the loads.
+      // Also, do not use f64 to lower memset unless this is a memset of zeros.
+      // The gymnastics of splatting a byte value into an XMM register and then
+      // only using 8-byte stores (because this is a CPU with slow unaligned
+      // 16-byte accesses) makes that a loser.
+      const bool MemsetOk = !IsMemset || ZeroMemset;
+      if (MemsetOk && !MemcpyStrSrc && Size >= 8 &&
+          !Subtarget.is64Bit() && Subtarget.hasSSE2())
+        return MVT::f64;
+    }
+  }
+
+  // This is a compromise. If we reach here, unaligned accesses may be slow on
+  // this target. However, creating smaller, aligned accesses could be even
+  // slower and would certainly be a lot more code.
+  const bool UseI64 = Subtarget.is64Bit() && Size >= 8;
+  if (UseI64)
+    return MVT::i64;
+  return MVT::i32;
+}
+
+/// Reports whether \p VT may be used for memory-operation lowering.
+/// f32 and f64 are safe only when X86ScalarSSEf32 / X86ScalarSSEf64 is set
+/// respectively; every other type is considered safe.
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+  switch (VT.SimpleTy) {
+  case MVT::f32:
+    return X86ScalarSSEf32;
+  case MVT::f64:
+    return X86ScalarSSEf64;
+  default:
+    return true;
+  }
+}
+
+/// Misaligned accesses of any size are always allowed, so this always returns
+/// true. When \p Fast is non-null it additionally receives whether such an
+/// access of \p VT's size is fast on this subtarget.
+bool
+X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                  unsigned,
+                                                  unsigned,
+                                                  bool *Fast) const {
+  if (!Fast)
+    return true;
+
+  const unsigned SizeInBits = VT.getSizeInBits();
+  if (SizeInBits == 128) {
+    *Fast = !Subtarget.isUnalignedMem16Slow();
+  } else if (SizeInBits == 256) {
+    *Fast = !Subtarget.isUnalignedMem32Slow();
+  } else {
+    // 8-byte and under are always assumed to be fast.
+    // TODO: What about AVX-512 (512-bit) accesses?
+    *Fast = true;
+  }
+  return true;
+}
+
+/// Returns the entry encoding used for jump tables in the current function,
+/// as a member of the MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+  // symbol, which needs the custom 32-bit encoding.
+  const bool UsesGOTOFFEntries =
+      isPositionIndependent() && Subtarget.isPICStyleGOT();
+  if (!UsesGOTOFFEntries) {
+    // Otherwise, use the normal jump table encoding heuristics.
+    return TargetLowering::getJumpTableEncoding();
+  }
+
+  return MachineJumpTableInfo::EK_Custom32;
+}
+
+/// Forwards the subtarget's soft-float setting.
+bool X86TargetLowering::useSoftFloat() const {
+  const bool SoftFloat = Subtarget.useSoftFloat();
+  return SoftFloat;
+}
+
+/// Builds the expression for one custom-encoded jump table entry: a @GOTOFF
+/// reference to the symbol of \p MBB. Only valid for position-independent
+/// code with the GOT PIC style; \p MJTI and \p uid are not used.
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                             const MachineBasicBlock *MBB,
+                                             unsigned uid,MCContext &Ctx) const{
+  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
+
+  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+  // entries.
+  auto *BlockSymbol = MBB->getSymbol();
+  return MCSymbolRefExpr::create(BlockSymbol, MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
+
+/// Returns the relocation base for the given PIC jump table: \p Table itself
+/// on 64-bit subtargets, otherwise a GlobalBaseReg node of pointer type.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                    SelectionDAG &DAG) const {
+  if (Subtarget.is64Bit())
+    return Table;
+
+  // This doesn't have SDLoc associated with it, but is not really the
+  // same as a Register.
+  const auto PtrVT = getPointerTy(DAG.getDataLayout());
+  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
+}
+
+/// Returns the relocation base for the given PIC jump table, the same as
+/// getPICJumpTableRelocBase, but expressed as an MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+                             MCContext &Ctx) const {
+  if (!Subtarget.isPICStyleRIPRel()) {
+    // Without RIP-relative PIC, the reference is relative to the PIC base.
+    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+  }
+
+  // X86-64 uses RIP relative addressing based on the jump table label.
+  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+}
+
+/// Picks the representative register class, and its cost, for \p VT.
+///
+/// Integer scalars map to GR64 or GR32 depending on whether the subtarget is
+/// 64-bit, x86mmx maps to VR64, and the listed scalar-FP and vector types all
+/// map to VR128X. Any other type defers to the generic TargetLowering
+/// implementation. The cost is 1 for every type handled here.
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
+  const uint8_t UnitCost = 1;
+  const TargetRegisterClass *Representative = nullptr;
+
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+    if (Subtarget.is64Bit())
+      Representative = &X86::GR64RegClass;
+    else
+      Representative = &X86::GR32RegClass;
+    break;
+
+  case MVT::x86mmx:
+    Representative = &X86::VR64RegClass;
+    break;
+
+  // Scalar floating point.
+  case MVT::f32:
+  case MVT::f64:
+  // 128-bit vectors.
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v4f32:
+  case MVT::v2f64:
+  // 256-bit vectors.
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+  case MVT::v4i64:
+  case MVT::v8f32:
+  case MVT::v4f64:
+  // 512-bit vectors.
+  case MVT::v64i8:
+  case MVT::v32i16:
+  case MVT::v16i32:
+  case MVT::v8i64:
+  case MVT::v16f32:
+  case MVT::v8f64:
+    Representative = &X86::VR128XRegClass;
+    break;
+
+  default:
+    return TargetLowering::findRepresentativeClass(TRI, VT);
+  }
+
+  return std::make_pair(Representative, UnitCost);
+}
+
+/// Returns the address space number used for the fixed TLS slots accessed by
+/// the stack-guard and SafeStack helpers: 257 on 64-bit subtargets that are
+/// not using the Kernel code model, 256 in every other case.
+unsigned X86TargetLowering::getAddressSpace() const {
+  if (!Subtarget.is64Bit())
+    return 256;
+
+  const bool IsKernelModel =
+      getTargetMachine().getCodeModel() == CodeModel::Kernel;
+  return IsKernelModel ? 256 : 257;
+}
+
+/// Returns the IR value that addresses the stack guard.
+///
+/// glibc has a special slot for the stack guard in tcbhead_t, which is used
+/// instead of the usual global variable (see
+/// sysdeps/{i386,x86_64}/nptl/tls.h). Non-glibc targets defer to the generic
+/// TargetLowering implementation.
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+  if (Subtarget.isTargetGlibc()) {
+    // %fs:0x28, unless we're using a Kernel code model, in which case it's
+    // %gs: %gs:0x14 on i386
+    const unsigned SlotOffset = Subtarget.is64Bit() ? 0x28 : 0x14;
+    const unsigned SlotAddressSpace = getAddressSpace();
+
+    // Materialize the fixed offset as a pointer into the slot's address
+    // space.
+    auto *OffsetValue =
+        ConstantInt::get(Type::getInt32Ty(IRB.getContext()), SlotOffset);
+    auto *SlotPtrTy =
+        Type::getInt8PtrTy(IRB.getContext())->getPointerTo(SlotAddressSpace);
+    return ConstantExpr::getIntToPtr(OffsetValue, SlotPtrTy);
+  }
+
+  return TargetLowering::getIRStackGuard(IRB);
+}
+
+/// Inserts into \p M the declarations the stack protector needs.
+///
+/// With the MSVC CRT these are the __security_cookie global and the
+/// __security_check_cookie function. glibc targets need nothing because the
+/// guard lives in a special slot. Every other target defers to the generic
+/// TargetLowering implementation.
+void X86TargetLowering::insertSSPDeclarations(Module &M) const {
+  // MSVC CRT provides functionalities for stack protection.
+  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+    // MSVC CRT has a global variable holding security cookie.
+    M.getOrInsertGlobal("__security_cookie",
+                        Type::getInt8PtrTy(M.getContext()));
+
+    // MSVC CRT has a function to validate security cookie. It takes the
+    // cookie as its single argument, passed in a register under fastcall.
+    auto *CookieCheckFn = cast<Function>(
+        M.getOrInsertFunction("__security_check_cookie",
+                              Type::getVoidTy(M.getContext()),
+                              Type::getInt8PtrTy(M.getContext()), nullptr));
+    CookieCheckFn->setCallingConv(CallingConv::X86_FastCall);
+    CookieCheckFn->addAttribute(1, Attribute::AttrKind::InReg);
+    return;
+  }
+
+  // glibc has a special slot for the stack guard, so only the remaining
+  // targets need the generic declarations.
+  if (!Subtarget.isTargetGlibc())
+    TargetLowering::insertSSPDeclarations(M);
+}
+
+/// Returns the stack guard value used by SelectionDAG: the __security_cookie
+/// global of \p M with the MSVC CRT, otherwise whatever the generic
+/// TargetLowering implementation provides.
+Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
+  if (!Subtarget.getTargetTriple().isOSMSVCRT())
+    return TargetLowering::getSDagStackGuard(M);
+
+  // MSVC CRT has a global variable holding security cookie.
+  return M.getGlobalVariable("__security_cookie");
+}
+
+/// Returns the function that validates the stack guard: the
+/// __security_check_cookie function of \p M with the MSVC CRT, otherwise
+/// whatever the generic TargetLowering implementation provides.
+Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+  if (!Subtarget.getTargetTriple().isOSMSVCRT())
+    return TargetLowering::getSSPStackGuardCheck(M);
+
+  // MSVC CRT has a function to validate security cookie.
+  return M.getFunction("__security_check_cookie");
+}
+
+/// Returns the IR value that addresses the SafeStack unsafe-stack pointer.
+///
+/// Contiki uses the default location, Android uses a fixed TLS slot, and all
+/// other targets defer to the generic TargetLowering implementation.
+Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+  if (Subtarget.getTargetTriple().isOSContiki())
+    return getDefaultSafeStackPointerLocation(IRB, false);
+
+  if (!Subtarget.isTargetAndroid())
+    return TargetLowering::getSafeStackPointerLocation(IRB);
+
+  // Android provides a fixed TLS slot for the SafeStack pointer. See the
+  // definition of TLS_SLOT_SAFESTACK in
+  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+  //
+  // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+  // %gs:0x24 on i386
+  const unsigned SlotOffset = Subtarget.is64Bit() ? 0x48 : 0x24;
+  const unsigned SlotAddressSpace = getAddressSpace();
+
+  auto *OffsetValue =
+      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), SlotOffset);
+  auto *SlotPtrTy =
+      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(SlotAddressSpace);
+  return ConstantExpr::getIntToPtr(OffsetValue, SlotPtrTy);
+}
+
+/// Reports whether a cast between two distinct address spaces is a no-op.
+/// That is the case exactly when both address space numbers are below 256.
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  assert(SrcAS != DestAS && "Expected different address spaces!");
+
+  const bool EitherAtOrAbove256 = SrcAS >= 256 || DestAS >= 256;
+  return !EitherAtOrAbove256;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+/// Reports whether the return values described by \p Outs can be assigned
+/// locations by the RetCC_X86 return calling convention.
+bool X86TargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> ReturnLocs;
+  CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
+  const bool Lowerable = ReturnState.CheckReturn(Outs, RetCC_X86);
+  return Lowerable;
+}
+
+/// Returns the zero-terminated list of scratch registers. The list holds only
+/// R11 and is the same for every calling convention.
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const MCPhysReg R11Only[] = { X86::R11, 0 };
+  return &R11Only[0];
+}
+
+/// Lowers a mask value (v*i1) to the register type \p ValLoc it is passed in.
+///
+/// v8i1 / v16i1 are bitcast to i8 / i16 and, when the location is i32,
+/// any-extended afterwards. v32i1 -> i32 and v64i1 -> i64 are plain bitcasts.
+/// Every other combination is sign-extended to \p ValLoc.
+/// \returns DAG node after lowering to register type
+static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
+                               const SDLoc &Dl, SelectionDAG &DAG) {
+  const EVT MaskVT = ValArg.getValueType();
+
+  const bool SmallMaskV8 =
+      MaskVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32);
+  const bool SmallMaskV16 =
+      MaskVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32);
+
+  if (SmallMaskV8 || SmallMaskV16) {
+    // Two stage lowering might be required
+    // bitcast:   v8i1 -> i8 / v16i1 -> i16
+    // anyextend: i8   -> i32 / i16   -> i32
+    const EVT SameWidthVT = MaskVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
+    SDValue Lowered = DAG.getBitcast(SameWidthVT, ValArg);
+    if (ValLoc != MVT::i32)
+      return Lowered;
+    return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, Lowered);
+  }
+
+  const bool ExactWidthMask = (MaskVT == MVT::v32i1 && ValLoc == MVT::i32) ||
+                              (MaskVT == MVT::v64i1 && ValLoc == MVT::i64);
+  if (ExactWidthMask) {
+    // One stage lowering is required
+    // bitcast:   v32i1 -> i32 / v64i1 -> i64
+    return DAG.getBitcast(ValLoc, ValArg);
+  }
+
+  return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
+}
+
+/// Breaks a v64i1 value into two i32 halves and queues each half for its own
+/// register.
+///
+/// \param Dl          Debug location for the new nodes.
+/// \param DAG         The DAG the new nodes are added to.
+/// \param Chain       Not used by this helper.
+/// \param [in,out] Arg The 64-bit value to split; it is replaced by its
+///                    bitcast to i64.
+/// \param [out] RegsToPass Receives two (register, value) pairs: the low half
+///                    for \p VA's register first, then the high half for
+///                    \p NextVA's register.
+/// \param VA          Location of the low 32 bits; must be a register.
+/// \param NextVA      Location of the high 32 bits; must be a register.
+/// \param Subtarget   Must be a 32-bit target with AVX512BW.
+static void Passv64i1ArgInRegs(
+    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
+    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
+    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
+  // 64-bit masks only exist with AVX512BW. BMI is an unrelated scalar
+  // bit-manipulation extension and must not satisfy this check; the matching
+  // reader, getv64i1Argument, requires hasBWI() as well.
+  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
+  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
+  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+         "The value should reside in two registers");
+
+  // Before splitting the value we cast it to i64
+  Arg = DAG.getBitcast(MVT::i64, Arg);
+
+  // Splitting the value into two i32 types
+  SDValue Lo, Hi;
+  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+                   DAG.getConstant(0, Dl, MVT::i32));
+  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+                   DAG.getConstant(1, Dl, MVT::i32));
+
+  // Attach the two i32 types into corresponding registers
+  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
+  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
+}
+
+/// Lowers a function return.
+///
+/// Each outgoing value in \p OutVals is converted to the location type chosen
+/// by RetCC_X86 and copied into its register. Values assigned to FP0/FP1 are
+/// instead appended directly as operands of the return node. If an sret
+/// return register was recorded for the function, its value is additionally
+/// copied into RAX/EAX. The result is an X86ISD::RET_FLAG node, or
+/// X86ISD::IRET for the X86_INTR calling convention.
+///
+/// Reports a fatal error when an X86_INTR function returns a value, or when a
+/// value would need SSE/SSE2 registers that are disabled on a 64-bit target.
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               const SDLoc &dl, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
+    report_fatal_error("X86 interrupts may not return any value");
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+  SDValue Flag;
+  SmallVector<SDValue, 6> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  // Operand #1 = Bytes To Pop
+  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
+                   MVT::i32));
+
+  // Copy the result values into the output registers.
+  // A custom (split v64i1) value consumes two entries of RVLocs but only one
+  // entry of OutVals, hence the two separate indices.
+  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
+       ++I, ++OutsIndex) {
+    CCValAssign &VA = RVLocs[I];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue ValToCopy = OutVals[OutsIndex];
+    EVT ValVT = ValToCopy.getValueType();
+
+    // Promote values to the appropriate types.
+    if (VA.getLocInfo() == CCValAssign::SExt)
+      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::ZExt)
+      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::AExt) {
+      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
+        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
+      else
+        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    }
+    else if (VA.getLocInfo() == CCValAssign::BCvt)
+      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
+
+    assert(VA.getLocInfo() != CCValAssign::FPExt &&
+           "Unexpected FP-extend for return value.");
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values,
+    // or SSE or MMX vectors.
+    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
+         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
+          (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
+    }
+    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
+    // llvm-gcc has never done it right and no one has noticed, so this
+    // should be OK for now.
+    if (ValVT == MVT::f64 &&
+        (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
+      report_fatal_error("SSE2 register return with SSE2 disabled");
+
+    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+    // the RET instruction and handled by the FP Stackifier.
+    if (VA.getLocReg() == X86::FP0 ||
+        VA.getLocReg() == X86::FP1) {
+      // If this is a copy from an xmm register to ST(0), use an FPExtend to
+      // change the value to the FP stack register class.
+      if (isScalarFPTypeInSSEReg(VA.getValVT()))
+        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+      RetOps.push_back(ValToCopy);
+      // Don't emit a copytoreg.
+      continue;
+    }
+
+    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+    // which is returned in RAX / RDX.
+    if (Subtarget.is64Bit()) {
+      if (ValVT == MVT::x86mmx) {
+        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
+          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                                  ValToCopy);
+          // If we don't have SSE2 available, convert to v4f32 so the generated
+          // register is legal.
+          if (!Subtarget.hasSSE2())
+            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
+        }
+      }
+    }
+
+    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::v64i1 &&
+             "Currently the only custom case is when we split v64i1 to 2 regs");
+
+      // The second half lives in the next location, which is consumed here.
+      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
+                         Subtarget);
+
+      assert(2 == RegsToPass.size() &&
+             "Expecting two registers after Passv64i1ArgInRegs");
+    } else {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
+    }
+
+    // Add nodes to the DAG and add the values into the RetOps list
+    for (auto &Reg : RegsToPass) {
+      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
+      Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+    }
+  }
+
+  // Swift calling convention does not require we copy the sret argument
+  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
+
+  // All x86 ABIs require that for returning structs by value we copy
+  // the sret argument into %rax/%eax (depending on ABI) for the return.
+  // We saved the argument into a virtual register in the entry block,
+  // so now we copy the value out and into %rax/%eax.
+  //
+  // Checking Function.hasStructRetAttr() here is insufficient because the IR
+  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+  // false, then an sret argument may be implicitly inserted in the SelDAG. In
+  // either case FuncInfo->setSRetReturnReg() will have been called.
+  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+    // When we have both sret and another return value, we should use the
+    // original Chain stored in RetOps[0], instead of the current Chain updated
+    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
+
+    // For the case of sret and another return value, we have
+    //   Chain_0 at the function entry
+    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
+    // If we use Chain_1 in getCopyFromReg, we will have
+    //   Val = getCopyFromReg(Chain_1)
+    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
+
+    // getCopyToReg(Chain_0) will be glued together with
+    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
+    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
+    //   Data dependency from Unit B to Unit A due to usage of Val in
+    //     getCopyToReg(Chain_1, Val)
+    //   Chain dependency from Unit A to Unit B
+
+    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
+    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
+                                     getPointerTy(MF.getDataLayout()));
+
+    unsigned RetValReg
+        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
+          X86::RAX : X86::EAX;
+    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
+    Flag = Chain.getValue(1);
+
+    // RAX/EAX now acts like a return value.
+    RetOps.push_back(
+        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+  }
+
+  // Callee-saved registers that are preserved via copy are added as extra
+  // operands of the return node; only GR64 registers are expected here.
+  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const MCPhysReg *I =
+      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+  if (I) {
+    for (; *I; ++I) {
+      if (X86::GR64RegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
+  if (CallConv == CallingConv::X86_INTR)
+    opcode = X86ISD::IRET;
+  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
+}
+
+/// Reports whether the single result of \p N is used only to feed a return,
+/// either through a glue-free CopyToReg or through an FP_EXTEND.
+///
+/// On success \p Chain is updated: for the CopyToReg case it becomes that
+/// copy's chain operand, otherwise it keeps its incoming value. On failure
+/// \p Chain is left untouched.
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDNode *User = *N->use_begin();
+  SDValue NewChain = Chain;
+
+  switch (User->getOpcode()) {
+  case ISD::CopyToReg: {
+    // If the copy has a glue operand, we conservatively assume it isn't safe
+    // to perform a tail call.
+    const unsigned LastOperand = User->getNumOperands() - 1;
+    if (User->getOperand(LastOperand).getValueType() == MVT::Glue)
+      return false;
+    NewChain = User->getOperand(0);
+    break;
+  }
+  case ISD::FP_EXTEND:
+    break;
+  default:
+    return false;
+  }
+
+  bool SawReturn = false;
+  for (SDNode::use_iterator It = User->use_begin(), End = User->use_end();
+       It != End; ++It) {
+    if (It->getOpcode() != X86ISD::RET_FLAG)
+      return false;
+
+    // If we are returning more than one value, we can definitely
+    // not make a tail call see PR19530
+    const unsigned OperandCount = It->getNumOperands();
+    if (OperandCount > 4)
+      return false;
+    // With exactly four operands the last one has to be the glue.
+    if (OperandCount == 4 &&
+        It->getOperand(OperandCount - 1).getValueType() != MVT::Glue)
+      return false;
+
+    SawReturn = true;
+  }
+
+  if (!SawReturn)
+    return false;
+
+  Chain = NewChain;
+  return true;
+}
+
+/// Returns the type an integer return value of type \p VT is extended to.
+///
+/// The minimum is the register type for i8 when \p VT is i1, or when \p VT is
+/// i8/i16 on a non-Darwin target, and the register type for i32 otherwise.
+/// \p VT is returned unchanged when it is not narrower than that minimum.
+/// \p ExtendKind is not consulted.
+EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+                                           ISD::NodeType ExtendKind) const {
+  const bool IsDarwin = Subtarget.getTargetTriple().isOSDarwin();
+
+  // The ABI does not require i1, i8 or i16 to be extended.
+  //
+  // On Darwin, there is code in the wild relying on Clang's old behaviour of
+  // always extending i8/i16 return values, so keep doing that for now.
+  // (PR26665).
+  const bool ExtensionNotRequired =
+      VT == MVT::i1 || (!IsDarwin && (VT == MVT::i8 || VT == MVT::i16));
+  const MVT ReturnMVT = ExtensionNotRequired ? MVT::i8 : MVT::i32;
+
+  EVT MinVT = getRegisterType(Context, ReturnMVT);
+  if (VT.bitsLT(MinVT))
+    return MinVT;
+  return VT;
+}
+
+/// Reads two 32 bit registers and assembles them into one v64i1 mask value.
+/// \param VA The location holding the low 32 bits.
+/// \param NextVA The location holding the high 32 bits.
+/// \param Root The parent DAG node.
+/// \param [in,out] InFlag Glue of the parent DAG node, or nullptr. When it is
+///                        given, the physical registers are read directly and
+///                        both reads are glued onto it; when it is null, the
+///                        registers are first made live-in through
+///                        intermediate virtual registers.
+/// \return a new SDValue of type v64i1.
+static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
+                                SDValue &Root, SelectionDAG &DAG,
+                                const SDLoc &Dl, const X86Subtarget &Subtarget,
+                                SDValue *InFlag = nullptr) {
+  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
+  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+  assert(VA.getValVT() == MVT::v64i1 &&
+         "Expecting first location of 64 bit width type");
+  assert(NextVA.getValVT() == VA.getValVT() &&
+         "The locations should have the same type");
+  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+         "The values should reside in two registers");
+
+  // Read a 32 bit value from each of the two registers.
+  SDValue LoHalf, HiHalf;
+  if (InFlag) {
+    // When a physical register is available read the value from it and glue
+    // the reads together.
+    LoHalf = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
+    *InFlag = LoHalf.getValue(2);
+    HiHalf =
+        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
+    *InFlag = HiHalf.getValue(2);
+  } else {
+    // When no physical register is present,
+    // create an intermediate virtual register
+    MachineFunction &MF = DAG.getMachineFunction();
+    const TargetRegisterClass *GR32 = &X86::GR32RegClass;
+
+    unsigned LoVReg = MF.addLiveIn(VA.getLocReg(), GR32);
+    LoHalf = DAG.getCopyFromReg(Root, Dl, LoVReg, MVT::i32);
+    unsigned HiVReg = MF.addLiveIn(NextVA.getLocReg(), GR32);
+    HiHalf = DAG.getCopyFromReg(Root, Dl, HiVReg, MVT::i32);
+  }
+
+  // Reinterpret each i32 as a v32i1 mask.
+  SDValue LoMask = DAG.getBitcast(MVT::v32i1, LoHalf);
+  SDValue HiMask = DAG.getBitcast(MVT::v32i1, HiHalf);
+
+  // Concatenate the two halves, low half first.
+  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LoMask, HiMask);
+}
+
+/// Lowers a register value of various sizes (8/16/32/64 bit) to a mask value
+/// of the expected type (v8i1/v16i1/v32i1/v64i1).
+///
+/// v64i1 is a plain bitcast from an i64 location. The narrower masks are
+/// first truncated to the integer of matching width and then bitcast.
+/// \returns a DAG node that contains the operand after lowering to mask type.
+static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
+                               const EVT &ValLoc, const SDLoc &Dl,
+                               SelectionDAG &DAG) {
+  if (ValVT == MVT::v64i1) {
+    // In 32 bit machine, this case is handled by getv64i1Argument
+    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
+    // In 64 bit machine, there is no need to truncate the value, only bitcast
+    return DAG.getBitcast(ValVT, ValArg);
+  }
+
+  // Pick the integer type that has exactly as many bits as the mask.
+  MVT SameWidthInt;
+  switch (ValVT.getSimpleVT().SimpleTy) {
+  case MVT::v8i1:
+    SameWidthInt = MVT::i8;
+    break;
+  case MVT::v16i1:
+    SameWidthInt = MVT::i16;
+    break;
+  case MVT::v32i1:
+    SameWidthInt = MVT::i32;
+    break;
+  default:
+    llvm_unreachable("Expecting a vector of i1 types");
+  }
+
+  SDValue Truncated = DAG.getNode(ISD::TRUNCATE, Dl, SameWidthInt, ValArg);
+  return DAG.getBitcast(ValVT, Truncated);
+}
+
+/// Lowers the result values of a call into the appropriate copies out of the
+/// physical registers assigned by RetCC_X86.
+///
+/// One value per entry of \p Ins is appended to \p InVals. The returned chain
+/// and the (by-value) \p InFlag are threaded through the register copies.
+/// Reports a fatal error when a floating-point result needs SSE or X87
+/// registers that are disabled.
+SDValue X86TargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  const bool Is64Bit = Subtarget.is64Bit();
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> ResultLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ResultLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+  // Copy all of the result registers out of their specified physreg.
+  // A custom (split v64i1) result consumes two locations but only one entry
+  // of Ins, hence the two separate indices.
+  const unsigned NumLocs = ResultLocs.size();
+  for (unsigned LocIdx = 0, InsIdx = 0; LocIdx != NumLocs;
+       ++LocIdx, ++InsIdx) {
+    CCValAssign &Loc = ResultLocs[LocIdx];
+    EVT CopyVT = Loc.getLocVT();
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values
+    const bool IsFPCopy =
+        CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128;
+    if (IsFPCopy && (Is64Bit || Ins[InsIdx].Flags.isInReg()) &&
+        !Subtarget.hasSSE1())
+      report_fatal_error("SSE register return with SSE disabled");
+
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    bool RoundAfterCopy = false;
+    const bool IsInX87Reg =
+        Loc.getLocReg() == X86::FP0 || Loc.getLocReg() == X86::FP1;
+    if (IsInX87Reg && isScalarFPTypeInSSEReg(Loc.getValVT())) {
+      if (!Subtarget.hasX87())
+        report_fatal_error("X87 register return with X87 disabled");
+      CopyVT = MVT::f80;
+      RoundAfterCopy = (CopyVT != Loc.getLocVT());
+    }
+
+    SDValue Result;
+    if (Loc.needsCustom()) {
+      assert(Loc.getValVT() == MVT::v64i1 &&
+             "Currently the only custom case is when we split v64i1 to 2 regs");
+      // The high half lives in the next location, which is consumed here.
+      CCValAssign &HiLoc = ResultLocs[++LocIdx];
+      Result = getv64i1Argument(Loc, HiLoc, Chain, DAG, dl, Subtarget, &InFlag);
+    } else {
+      // The copy yields the value (0), the new chain (1) and the glue (2).
+      SDValue Copy =
+          DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(), CopyVT, InFlag);
+      Chain = Copy.getValue(1);
+      Result = Copy.getValue(0);
+      InFlag = Copy.getValue(2);
+    }
+
+    if (RoundAfterCopy) {
+      // This truncation won't change the value.
+      Result = DAG.getNode(ISD::FP_ROUND, dl, Loc.getValVT(), Result,
+                           DAG.getIntPtrConstant(1, dl));
+    }
+
+    if (Loc.isExtInLoc() && (Loc.getValVT().getScalarType() == MVT::i1)) {
+      const bool IsMaskInIntReg =
+          Loc.getValVT().isVector() &&
+          ((Loc.getLocVT() == MVT::i64) || (Loc.getLocVT() == MVT::i32) ||
+           (Loc.getLocVT() == MVT::i16) || (Loc.getLocVT() == MVT::i8));
+      if (IsMaskInIntReg) {
+        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
+        Result =
+            lowerRegToMasks(Result, Loc.getValVT(), Loc.getLocVT(), dl, DAG);
+      } else {
+        Result = DAG.getNode(ISD::TRUNCATE, dl, Loc.getValVT(), Result);
+      }
+    }
+
+    InVals.push_back(Result);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+// The StdCall calling convention is the standard for many Windows API
+// routines. It differs from the C calling convention only slightly: the
+// callee, not the caller, cleans up the stack. Symbols are also decorated
+// (name-mangled) in a convention-specific way. It does not support vector
+// arguments. For information on the fast calling convention, see the Fast
+// Calling Convention (tail call) implementation, LowerX86_32FastCCCallTo.
+
+/// Classification of struct-return (sret) handling. callIsStructReturn
+/// computes it for an outgoing call and argsAreStructReturn computes it for
+/// the incoming arguments of a function.
+enum StructReturnType {
+ NotStructReturn, // No arguments, or the first argument is not sret.
+ RegStructReturn, // First argument is sret and is inreg (or target is MCU).
+ StackStructReturn // First argument is sret, not inreg, and not MCU.
+};
+/// Classify the struct-return convention of an outgoing call by inspecting
+/// the flags of its first outgoing argument.
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
+  // Only the first argument is examined for the sret flag.
+  if (Outs.empty() || !Outs[0].Flags.isSRet())
+    return NotStructReturn;
+
+  // An inreg sret pointer (or any sret on an MCU target) is a register struct
+  // return; every other sret is a stack struct return.
+  const bool PassedInReg = Outs[0].Flags.isInReg() || IsMCU;
+  return PassedInReg ? RegStructReturn : StackStructReturn;
+}
+
+/// Classify how the current function receives its struct-return pointer, based
+/// on the flags of its first incoming argument.
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
+  // No arguments at all, or a first argument without sret: not a struct return.
+  const bool HasSRet = !Ins.empty() && Ins[0].Flags.isSRet();
+  if (!HasSRet)
+    return NotStructReturn;
+
+  // An sret pointer marked inreg (or any sret on an MCU target) is classified
+  // as a register struct return; otherwise it is a stack struct return.
+  if (IsMCU || Ins[0].Flags.isInReg())
+    return RegStructReturn;
+  return StackStructReturn;
+}
+
+/// Emit an always-inlined, non-volatile memcpy that copies the aggregate at
+/// "Src" to "Dst". The byte count and alignment come from the byval
+/// information recorded in "Flags". The copy will be passed as a byval
+/// function parameter.
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+                                         SDValue Chain, ISD::ArgFlagsTy Flags,
+                                         SelectionDAG &DAG, const SDLoc &dl) {
+  // The byte count is materialized as an i32 constant node.
+  SDValue ByteCount = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
+  const unsigned Alignment = Flags.getByValAlign();
+
+  SDValue Copy = DAG.getMemcpy(Chain, dl, Dst, Src, ByteCount, Alignment,
+                               /*isVolatile*/false, /*AlwaysInline=*/true,
+                               /*isTailCall*/false,
+                               MachinePointerInfo(), MachinePointerInfo());
+  return Copy;
+}
+
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::Fast:
+  case CallingConv::GHC:
+  case CallingConv::X86_RegCall:
+  case CallingConv::HiPE:
+  case CallingConv::HHVM:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  // C calling conventions.
+  const bool IsCFamily = CC == CallingConv::C ||
+                         CC == CallingConv::X86_64_Win64 ||
+                         CC == CallingConv::X86_64_SysV;
+  // Callee pop conventions.
+  const bool IsCalleePop = CC == CallingConv::X86_ThisCall ||
+                           CC == CallingConv::X86_StdCall ||
+                           CC == CallingConv::X86_VectorCall ||
+                           CC == CallingConv::X86_FastCall;
+  if (IsCFamily || IsCalleePop)
+    return true;
+
+  // Any other convention qualifies only if TCO can be guaranteed for it.
+  return canGuaranteeTCO(CC);
+}
+
+/// Return true if the function is being made into a tailcall target by
+/// changing its ABI, i.e. guaranteed tail call optimization is enabled and the
+/// calling convention is one we can guarantee TCO for.
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
+  if (!GuaranteedTailCallOpt)
+    return false;
+  return canGuaranteeTCO(CC);
+}
+
+/// Return true if the call instruction \p CI may be emitted as a tail call:
+/// it must be marked as a tail call, the enclosing function must not set
+/// "disable-tail-calls" to "true", and the callee's calling convention must be
+/// one for which tail calls are ever performed.
+bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  const Function *Caller = CI->getParent()->getParent();
+  auto DisableAttr = Caller->getFnAttribute("disable-tail-calls");
+  const bool TailCallsDisabled = DisableAttr.getValueAsString() == "true";
+  if (TailCallsDisabled || !CI->isTailCall())
+    return false;
+
+  CallSite CS(CI);
+  return mayTailCallThisCC(CS.getCallingConv());
+}
+
+/// Materialize an incoming argument that was assigned a stack location. A
+/// fixed frame object is created for the argument's slot; for a byval argument
+/// the address of that object is returned, otherwise a load from it (truncated
+/// back to the value type when an i1-based value was extended in memory).
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    const SDLoc &dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    MachineFrameInfo &MFI, unsigned i) const {
+  const ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+  const bool IsByVal = ArgFlags.isByVal();
+  const bool IsInterruptCC = CallConv == CallingConv::X86_INTR;
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can
+  // be changed with more analysis.
+  // In case of tail call optimization mark all arguments mutable. Since they
+  // could be overwritten by lowering of arguments in case of a tail call.
+  const bool ForceMutable = shouldGuaranteeTCO(
+      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
+  const bool IsImmutableSlot = !(ForceMutable || IsByVal);
+
+  // If value is passed by pointer we have address passed instead of the value
+  // itself. No need to extend if the mask value and location share the same
+  // absolute size.
+  const bool ExtendedInMem =
+      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
+      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
+
+  // The slot holds the location type for indirect or memory-extended values,
+  // and the value type otherwise.
+  EVT SlotVT = VA.getValVT();
+  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
+    SlotVT = VA.getLocVT();
+
+  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
+  // taken by a return address.
+  int SPOffset = 0;
+  if (IsInterruptCC) {
+    const X86Subtarget &ST =
+        static_cast<const X86Subtarget &>(DAG.getSubtarget());
+    // X86 interrupts may take one or two arguments.
+    // On the stack there will be no return address as in regular call.
+    // Offset of last argument need to be set to -4/-8 bytes.
+    // Where offset of the first argument out of two, should be set to 0 bytes.
+    const int SlotBytes = ST.is64Bit() ? 8 : 4;
+    SPOffset = SlotBytes * ((i + 1) % Ins.size() - 1);
+  }
+
+  if (IsByVal) {
+    // Don't create zero-sized stack objects.
+    const unsigned ByValBytes = ArgFlags.getByValSize();
+    const unsigned ObjectBytes = ByValBytes == 0 ? 1 : ByValBytes;
+    const int FI = MFI.CreateFixedObject(ObjectBytes, VA.getLocMemOffset(),
+                                         IsImmutableSlot);
+    // Adjust SP offset of interrupt parameter.
+    if (IsInterruptCC)
+      MFI.setObjectOffset(FI, SPOffset);
+    return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+  }
+
+  const int FI = MFI.CreateFixedObject(SlotVT.getSizeInBits() / 8,
+                                       VA.getLocMemOffset(), IsImmutableSlot);
+
+  // Set SExt or ZExt flag.
+  switch (VA.getLocInfo()) {
+  case CCValAssign::ZExt:
+    MFI.setObjectZExt(FI, true);
+    break;
+  case CCValAssign::SExt:
+    MFI.setObjectSExt(FI, true);
+    break;
+  default:
+    break;
+  }
+
+  // Adjust SP offset of interrupt parameter.
+  if (IsInterruptCC)
+    MFI.setObjectOffset(FI, SPOffset);
+
+  // Create the nodes corresponding to a load from this parameter slot.
+  SDValue SlotAddr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+  SDValue Loaded = DAG.getLoad(
+      SlotVT, dl, Chain, SlotAddr,
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+  if (!ExtendedInMem)
+    return Loaded;
+  return DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Loaded);
+}
+
+// FIXME: Get this from tablegen.
+/// Return the general-purpose registers used for integer argument passing in
+/// 64-bit mode: the four Win64 registers when \p CallConv is a Win64
+/// convention, otherwise the six-register list.
+static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
+                                                const X86Subtarget &Subtarget) {
+  assert(Subtarget.is64Bit());
+
+  static const MCPhysReg GPR64ArgRegsWin64[] = {
+    X86::RCX, X86::RDX, X86::R8, X86::R9
+  };
+  static const MCPhysReg GPR64ArgRegs64Bit[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+  };
+
+  const bool UseWin64Regs = Subtarget.isCallingConvWin64(CallConv);
+  if (UseWin64Regs)
+    return makeArrayRef(std::begin(GPR64ArgRegsWin64),
+                        std::end(GPR64ArgRegsWin64));
+  return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
+                      std::end(GPR64ArgRegs64Bit));
+}
+
+// FIXME: Get this from tablegen.
+/// Return the XMM registers that may carry vararg parameters in 64-bit mode,
+/// or an empty list for Win64 conventions and whenever soft-float,
+/// noimplicitfloat, or a missing SSE1 rules out XMM argument registers.
+static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
+                                                CallingConv::ID CallConv,
+                                                const X86Subtarget &Subtarget) {
+  assert(Subtarget.is64Bit());
+
+  // The XMM registers which might contain var arg parameters are shadowed
+  // in their paired GPR. So we only need to save the GPR to their home
+  // slots.
+  // TODO: __vectorcall will change this.
+  if (Subtarget.isCallingConvWin64(CallConv))
+    return None;
+
+  const bool ImplicitFloatDisabled =
+      MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+  const bool UsesSoftFloat = Subtarget.useSoftFloat();
+  assert((!UsesSoftFloat || !ImplicitFloatDisabled) &&
+         "SSE register cannot be used when SSE is disabled!");
+
+  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
+  // registers.
+  const bool XMMUnavailable =
+      UsesSoftFloat || ImplicitFloatDisabled || !Subtarget.hasSSE1();
+  if (XMMUnavailable)
+    return None;
+
+  static const MCPhysReg XMMArgRegs64Bit[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
+}
+
+/// Return true if the value numbers of \p ArgLocs never decrease from one
+/// entry to the next (equal neighbours are allowed).
+static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
+  for (unsigned Idx = 1, End = ArgLocs.size(); Idx < End; ++Idx) {
+    // An entry whose value number is below its predecessor's breaks the order.
+    if (ArgLocs[Idx].getValNo() < ArgLocs[Idx - 1].getValNo())
+      return false;
+  }
+  return true;
+}
+
+/// Lower the incoming (formal) arguments of the current function.
+///
+/// Assigns a location to every entry of \p Ins using CC_X86, appends the
+/// lowered argument values (register copies or stack loads) to \p InVals, and
+/// records vararg, sret, callee-pop and argument-stack-size bookkeeping in the
+/// function's X86MachineFunctionInfo. Returns the updated chain.
+SDValue X86TargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+
+ // An externally visible "main" on Cygwin/MinGW targets always gets a frame
+ // pointer.
+ const Function *Fn = MF.getFunction();
+ if (Fn->hasExternalLinkage() &&
+ Subtarget.isTargetCygMing() &&
+ Fn->getName() == "main")
+ FuncInfo->setForceFramePointer(true);
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool Is64Bit = Subtarget.is64Bit();
+ bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+
+ assert(
+ !(isVarArg && canGuaranteeTCO(CallConv)) &&
+ "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
+
+ // X86_INTR handlers take one argument, or two when the second has type i64
+ // (64-bit) or i32 (32-bit); anything else is rejected.
+ if (CallConv == CallingConv::X86_INTR) {
+ bool isLegal = Ins.size() == 1 ||
+ (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
+ (!Is64Bit && Ins[1].VT == MVT::i32)));
+ if (!isLegal)
+ report_fatal_error("X86 interrupts may take one or two arguments");
+ }
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64.
+ if (IsWin64)
+ CCInfo.AllocateStack(32, 8);
+
+ CCInfo.AnalyzeArguments(Ins, CC_X86);
+
+ // In vectorcall calling convention a second pass is required for the HVA
+ // types.
+ if (CallingConv::X86_VectorCall == CallConv) {
+ CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
+ }
+
+ // The next loop assumes that the locations are in the same order of the
+ // input arguments.
+ if (!isSortedByValueNo(ArgLocs))
+ llvm_unreachable("Argument Location list must be sorted before lowering");
+
+ // I indexes ArgLocs and InsIndex indexes Ins. The custom (split v64i1) case
+ // below consumes an extra ArgLocs entry via ++I, so the two can diverge.
+ SDValue ArgValue;
+ for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
+ ++I, ++InsIndex) {
+ assert(InsIndex < Ins.size() && "Invalid Ins index");
+ CCValAssign &VA = ArgLocs[I];
+
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+ if (VA.needsCustom()) {
+ assert(
+ VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+
+ // v64i1 values, in regcall calling convention, that are
+ // compiled to 32 bit arch, are splited up into two registers.
+ ArgValue =
+ getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
+ } else {
+ // Pick the register class that matches the location type.
+ const TargetRegisterClass *RC;
+ if (RegVT == MVT::i32)
+ RC = &X86::GR32RegClass;
+ else if (Is64Bit && RegVT == MVT::i64)
+ RC = &X86::GR64RegClass;
+ else if (RegVT == MVT::f32)
+ RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
+ else if (RegVT == MVT::f64)
+ RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
+ else if (RegVT == MVT::f80)
+ RC = &X86::RFP80RegClass;
+ else if (RegVT == MVT::f128)
+ RC = &X86::FR128RegClass;
+ else if (RegVT.is512BitVector())
+ RC = &X86::VR512RegClass;
+ else if (RegVT.is256BitVector())
+ RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
+ else if (RegVT.is128BitVector())
+ RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
+ else if (RegVT == MVT::x86mmx)
+ RC = &X86::VR64RegClass;
+ else if (RegVT == MVT::i1)
+ RC = &X86::VK1RegClass;
+ else if (RegVT == MVT::v8i1)
+ RC = &X86::VK8RegClass;
+ else if (RegVT == MVT::v16i1)
+ RC = &X86::VK16RegClass;
+ else if (RegVT == MVT::v32i1)
+ RC = &X86::VK32RegClass;
+ else if (RegVT == MVT::v64i1)
+ RC = &X86::VK64RegClass;
+ else
+ llvm_unreachable("Unknown argument type!");
+
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+ }
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
+
+ if (VA.isExtInLoc()) {
+ // Handle MMX values passed in XMM regs.
+ if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
+ ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
+ else if (VA.getValVT().isVector() &&
+ VA.getValVT().getScalarType() == MVT::i1 &&
+ ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+ (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+ // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
+ ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
+ } else
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ }
+ } else {
+ assert(VA.isMemLoc());
+ ArgValue =
+ LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
+ }
+
+ // If value is passed via pointer - do a load.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ArgValue =
+ DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
+
+ InVals.push_back(ArgValue);
+ }
+
+ // Only the first sret argument found is handled; the loop breaks after it.
+ for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
+ // Swift calling convention does not require we copy the sret argument
+ // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
+ if (CallConv == CallingConv::Swift)
+ continue;
+
+ // All x86 ABIs require that for returning structs by value we copy the
+ // sret argument into %rax/%eax (depending on ABI) for the return. Save
+ // the argument into a virtual register so that we can access it from the
+ // return points.
+ if (Ins[I].Flags.isSRet()) {
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ MVT PtrTy = getPointerTy(DAG.getDataLayout());
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ break;
+ }
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+ // Align stack specially for tail calls.
+ if (shouldGuaranteeTCO(CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt))
+ StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start. We
+ // can skip this if there are no va_start calls.
+ if (MFI.hasVAStart() &&
+ (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+ CallConv != CallingConv::X86_ThisCall))) {
+ FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
+ }
+
+ // Figure out if XMM registers are in use.
+ assert(!(Subtarget.useSoftFloat() &&
+ Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ // 64-bit calling conventions support varargs and register parameters, so we
+ // have to do extra work to spill them in the prologue.
+ if (Is64Bit && isVarArg && MFI.hasVAStart()) {
+ // Find the first unallocated argument registers.
+ ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+ ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
+ assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ // Gather all the live in physical registers.
+ SmallVector<SDValue, 6> LiveGPRs;
+ SmallVector<SDValue, 8> LiveXMMRegs;
+ SDValue ALVal;
+ for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+ unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
+ LiveGPRs.push_back(
+ DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
+ }
+ if (!ArgXMMs.empty()) {
+ unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+ ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
+ for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
+ unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
+ LiveXMMRegs.push_back(
+ DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
+ }
+ }
+
+ if (IsWin64) {
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
+ MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+ } else {
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by dereferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
+ ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy(DAG.getDataLayout()));
+ // Each live GPR goes to its own 8-byte slot, starting at the GP offset.
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (SDValue Val : LiveGPRs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ RSFIN, DAG.getIntPtrConstant(Offset, dl));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ FuncInfo->getRegSaveFrameIndex(), Offset));
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
+ // Now store the XMM (fp + vector) parameter registers.
+ // Operands: chain, AL value, register save frame index, FP offset, then
+ // the live XMM register values.
+ SmallVector<SDValue, 12> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+ SaveXMMOps.push_back(ALVal);
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getRegSaveFrameIndex(), dl));
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getVarArgsFPOffset(), dl));
+ SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+ LiveXMMRegs.end());
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+ MVT::Other, SaveXMMOps));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ }
+
+ if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
+ // Find the largest legal vector type.
+ MVT VecVT = MVT::Other;
+ // FIXME: Only some x86_32 calling conventions support AVX512.
+ if (Subtarget.hasAVX512() &&
+ (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
+ CallConv == CallingConv::Intel_OCL_BI)))
+ VecVT = MVT::v16f32;
+ else if (Subtarget.hasAVX())
+ VecVT = MVT::v8f32;
+ else if (Subtarget.hasSSE2())
+ VecVT = MVT::v4f32;
+
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
+ RegParmTypes.push_back(IntVT);
+ if (VecVT != MVT::Other)
+ RegParmTypes.push_back(VecVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+ // Conservatively forward AL on x86_64, since it might be used for varargs.
+ if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+ unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+ Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+ }
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &F : Forwards) {
+ // FIXME: Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+ F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
+ Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
+ }
+ }
+
+ // Some CCs need callee pop.
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
+ FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+ } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
+ // X86 interrupts must pop the error code if present
+ FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
+ } else {
+ FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
+ // If this is an sret function, the return should pop the hidden pointer.
+ if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
+ !Subtarget.getTargetTriple().isOSMSVCRT() &&
+ argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
+ FuncInfo->setBytesToPopOnReturn(4);
+ }
+
+ if (!Is64Bit) {
+ // RegSaveFrameIndex is X86-64 only.
+ // NOTE(review): 0xAAAAAAA looks like a deliberately bogus placeholder
+ // index - confirm.
+ FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+ if (CallConv == CallingConv::X86_FastCall ||
+ CallConv == CallingConv::X86_ThisCall)
+ // fastcc functions can't have varargs.
+ FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+ }
+
+ FuncInfo->setArgumentStackSize(StackSize);
+
+ if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
+ EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
+ if (Personality == EHPersonality::CoreCLR) {
+ assert(Is64Bit);
+ // TODO: Add a mechanism to frame lowering that will allow us to indicate
+ // that we'd prefer this slot be allocated towards the bottom of the frame
+ // (i.e. near the stack pointer after allocating the frame). Every
+ // funclet needs a copy of this slot in its (mostly empty) frame, and the
+ // offset from the bottom of this and each funclet's frame must be the
+ // same, so the size of funclets' (mostly empty) frames is dictated by
+ // how far this slot is from the bottom (since they allocate just enough
+ // space to accommodate holding this slot at the correct offset).
+ int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
+ EHInfo->PSPSymFrameIdx = PSPSymFI;
+ }
+ }
+
+ return Chain;
+}
+
+/// Emit the node that places outgoing argument \p Arg in its assigned stack
+/// location (\p StackPtr plus the location's memory offset): a byval copy when
+/// \p Flags marks the argument byval, otherwise a plain store.
+SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+                                            SDValue Arg, const SDLoc &dl,
+                                            SelectionDAG &DAG,
+                                            const CCValAssign &VA,
+                                            ISD::ArgFlagsTy Flags) const {
+  const unsigned StackOffset = VA.getLocMemOffset();
+
+  // Address of the argument's slot: StackPtr + StackOffset.
+  SDValue OffsetNode = DAG.getIntPtrConstant(StackOffset, dl);
+  SDValue SlotAddr = DAG.getNode(
+      ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, OffsetNode);
+
+  if (!Flags.isByVal())
+    return DAG.getStore(
+        Chain, dl, Arg, SlotAddr,
+        MachinePointerInfo::getStack(DAG.getMachineFunction(), StackOffset));
+
+  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
+}
+
+/// Emit a load of the "old" return address from the return-address frame
+/// index, for use when tail call optimization is performed and it is required.
+/// The loaded value is written to \p OutRetAddr and the load's chain result is
+/// returned.
+SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
+    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
+    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
+  // Address of the return address stack slot.
+  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);
+
+  // Load the "old" Return address as a pointer-typed value.
+  SDValue OldRetAddr = DAG.getLoad(getPointerTy(DAG.getDataLayout()), dl,
+                                   Chain, RetAddrSlot, MachinePointerInfo());
+  OutRetAddr = OldRetAddr;
+
+  // Result number 1 of the load node is handed back as the new chain.
+  return OldRetAddr.getValue(1);
+}
+
+/// Emit a store of the return address \p RetAddrFrIdx into a new fixed stack
+/// slot at offset FPDiff - SlotSize. This is only needed when tail call
+/// optimization moves the return address (FPDiff != 0); otherwise \p Chain is
+/// returned unchanged.
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+                                        SDValue Chain, SDValue RetAddrFrIdx,
+                                        EVT PtrVT, unsigned SlotSize,
+                                        int FPDiff, const SDLoc &dl) {
+  // Nothing to do when the return address does not move.
+  if (FPDiff == 0)
+    return Chain;
+
+  // Calculate the new stack slot for the return address.
+  const int64_t NewSlotOffset = static_cast<int64_t>(FPDiff) - SlotSize;
+  const int NewRetAddrFI =
+      MF.getFrameInfo().CreateFixedObject(SlotSize, NewSlotOffset, false);
+
+  // Store the return address to the appropriate stack slot.
+  SDValue NewSlotAddr = DAG.getFrameIndex(NewRetAddrFI, PtrVT);
+  return DAG.getStore(Chain, dl, RetAddrFrIdx, NewSlotAddr,
+                      MachinePointerInfo::getFixedStack(
+                          DAG.getMachineFunction(), NewRetAddrFI));
+}
+
+/// Build a vector shuffle of \p V1 and \p V2 for a movs{s|d}, movd operation
+/// of the specified width. The mask is <NumElems, 1, 2, ..., NumElems-1>:
+/// mask index NumElems in position 0, and identity indices in every other
+/// position.
+static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
+                       SDValue V2) {
+  const unsigned Width = VT.getVectorNumElements();
+
+  SmallVector<int, 8> ShuffleMask;
+  ShuffleMask.push_back(Width);
+  unsigned Lane = 1;
+  while (Lane != Width) {
+    ShuffleMask.push_back(Lane);
+    ++Lane;
+  }
+
+  return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
+}
+
+SDValue
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool &isTailCall = CLI.IsTailCall;
+ bool isVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool Is64Bit = Subtarget.is64Bit();
+ bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+ StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
+ bool IsSibcall = false;
+ X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+ auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+
+ if (CallConv == CallingConv::X86_INTR)
+ report_fatal_error("X86 interrupts may not be called directly");
+
+ if (Attr.getValueAsString() == "true")
+ isTailCall = false;
+
+ if (Subtarget.isPICStyleGOT() &&
+ !MF.getTarget().Options.GuaranteedTailCallOpt) {
+ // If we are using a GOT, disable tail calls to external symbols with
+ // default visibility. Tail calling such a symbol requires using a GOT
+ // relocation, which forces early binding of the symbol. This breaks code
+ // that require lazy function symbol resolution. Using musttail or
+ // GuaranteedTailCallOpt will override this.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (!G || (!G->getGlobal()->hasLocalLinkage() &&
+ G->getGlobal()->hasDefaultVisibility()))
+ isTailCall = false;
+ }
+
+ bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+ if (IsMustTail) {
+ // Force this to be a tail call. The verifier rules are enough to ensure
+ // that we can lower this successfully without moving the return address
+ // around.
+ isTailCall = true;
+ } else if (isTailCall) {
+ // Check if it's really possible to do a tail call.
+ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ isVarArg, SR != NotStructReturn,
+ MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
+ Outs, OutVals, Ins, DAG);
+
+ // Sibcalls are automatically detected tailcalls which do not require
+ // ABI changes.
+ if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+ IsSibcall = true;
+
+ if (isTailCall)
+ ++NumTailCalls;
+ }
+
+ assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
+ "Var args not supported with calling convention fastcc, ghc or hipe");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64.
+ if (IsWin64)
+ CCInfo.AllocateStack(32, 8);
+
+ CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+ // In vectorcall calling convention a second pass is required for the HVA
+ // types.
+ if (CallingConv::X86_VectorCall == CallConv) {
+ CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+ }
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+ if (IsSibcall)
+ // This is a sibcall. The memory operands are available in caller's
+ // own caller's stack.
+ NumBytes = 0;
+ else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+ canGuaranteeTCO(CallConv))
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+ int FPDiff = 0;
+ if (isTailCall && !IsSibcall && !IsMustTail) {
+ // Lower arguments at fp - stackoffset + fpdiff.
+ unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
+ FPDiff = NumBytesCallerPushed - NumBytes;
+
+ // Set the delta of movement of the returnaddr stackslot.
+ // But only set if delta is greater than previous delta.
+ if (FPDiff < X86Info->getTCReturnAddrDelta())
+ X86Info->setTCReturnAddrDelta(FPDiff);
+ }
+
+ unsigned NumBytesToPush = NumBytes;
+ unsigned NumBytesToPop = NumBytes;
+
+ // If we have an inalloca argument, all stack space has already been allocated
+ // for us and be right at the top of the stack. We don't support multiple
+ // arguments passed in memory when using inalloca.
+ if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+ NumBytesToPush = 0;
+ if (!ArgLocs.back().isMemLoc())
+ report_fatal_error("cannot use inalloca attribute on a register "
+ "parameter");
+ if (ArgLocs.back().getLocMemOffset() != 0)
+ report_fatal_error("any parameter with the inalloca attribute must be "
+ "the only memory argument");
+ }
+
+ if (!IsSibcall)
+ Chain = DAG.getCALLSEQ_START(
+ Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
+
+ SDValue RetAddrFrIdx;
+ // Load return address for tail calls.
+ if (isTailCall && FPDiff)
+ Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+ Is64Bit, FPDiff, dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+
+ // The next loop assumes that the locations are in the same order of the
+ // input arguments.
+ if (!isSortedByValueNo(ArgLocs))
+ llvm_unreachable("Argument Location list must be sorted before lowering");
+
+ // Walk the register/memloc assignments, inserting copies/loads. In the case
+ // of tail call optimization arguments are handle later.
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
+ ++I, ++OutIndex) {
+ assert(OutIndex < Outs.size() && "Invalid Out index");
+ // Skip inalloca arguments, they have already been written.
+ ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
+ if (Flags.isInAlloca())
+ continue;
+
+ CCValAssign &VA = ArgLocs[I];
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = OutVals[OutIndex];
+ bool isByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::AExt:
+ if (Arg.getValueType().isVector() &&
+ Arg.getValueType().getVectorElementType() == MVT::i1)
+ Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
+ else if (RegVT.is128BitVector()) {
+ // Special case: passing MMX values in XMM registers.
+ Arg = DAG.getBitcast(MVT::i64, Arg);
+ Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+ Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+ } else
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getBitcast(RegVT, Arg);
+ break;
+ case CCValAssign::Indirect: {
+ // Store the argument.
+ SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ Chain = DAG.getStore(
+ Chain, dl, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ Arg = SpillSlot;
+ break;
+ }
+ }
+
+ if (VA.needsCustom()) {
+ assert(VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+ // Split v64i1 value into two registers
+ Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
+ Subtarget);
+ } else if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ if (isVarArg && IsWin64) {
+ // Win64 ABI requires argument XMM reg to be copied to the corresponding
+ // shadow reg if callee is a varargs function.
+ unsigned ShadowReg = 0;
+ switch (VA.getLocReg()) {
+ case X86::XMM0: ShadowReg = X86::RCX; break;
+ case X86::XMM1: ShadowReg = X86::RDX; break;
+ case X86::XMM2: ShadowReg = X86::R8; break;
+ case X86::XMM3: ShadowReg = X86::R9; break;
+ }
+ if (ShadowReg)
+ RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+ }
+ } else if (!IsSibcall && (!isTailCall || isByVal)) {
+ assert(VA.isMemLoc());
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ if (Subtarget.isPICStyleGOT()) {
+ // ELF / PIC requires GOT in the EBX register before function calls via PLT
+ // GOT pointer.
+ if (!isTailCall) {
+ RegsToPass.push_back(std::make_pair(
+ unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()))));
+ } else {
+ // If we are tail calling and generating PIC/GOT style code load the
+ // address of the callee into ECX. The value in ecx is used as target of
+ // the tail jump. This is done to circumvent the ebx/callee-saved problem
+ // for tail calls on PIC/GOT architectures. Normally we would just put the
+ // address of GOT into ebx and then call target@PLT. But for tail calls
+ // ebx would be restored (since ebx is callee saved) before jumping to the
+ // target@PLT.
+
+ // Note: The actual moving to ECX is done further down.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (G && !G->getGlobal()->hasLocalLinkage() &&
+ G->getGlobal()->hasDefaultVisibility())
+ Callee = LowerGlobalAddress(Callee, DAG);
+ else if (isa<ExternalSymbolSDNode>(Callee))
+ Callee = LowerExternalSymbol(Callee, DAG);
+ }
+ }
+
+ if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+ // the number of registers, but must be an ubound on the number of SSE
+ // registers used and is in the range 0 - 8 inclusive.
+
+ // Count the number of XMM registers allocated.
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
+ assert((Subtarget.hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+
+ RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ DAG.getConstant(NumXMMRegs, dl,
+ MVT::i8)));
+ }
+
+ if (isVarArg && IsMustTail) {
+ const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+ for (const auto &F : Forwards) {
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+ RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+ }
+ }
+
+ // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+ // don't need this because the eligibility check rejects calls that require
+ // shuffling arguments passed in memory.
+ if (!IsSibcall && isTailCall) {
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+ SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+ SmallVector<SDValue, 8> MemOpChains2;
+ SDValue FIN;
+ int FI = 0;
+ for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
+ ++I, ++OutsIndex) {
+ CCValAssign &VA = ArgLocs[I];
+
+ if (VA.isRegLoc()) {
+ if (VA.needsCustom()) {
+ assert((CallConv == CallingConv::X86_RegCall) &&
+ "Expecting custome case only in regcall calling convention");
+ // This means that we are in special case where one argument was
+ // passed through two register locations - Skip the next location
+ ++I;
+ }
+
+ continue;
+ }
+
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[OutsIndex];
+ ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
+ // Skip inalloca arguments. They don't require any work.
+ if (Flags.isInAlloca())
+ continue;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(DAG.getStore(
+ ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+ getPointerTy(DAG.getDataLayout()),
+ RegInfo->getSlotSize(), FPDiff, dl);
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
+ assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+ // In the 64-bit large code model, we have to make all calls
+ // through a register, since the call instruction's 32-bit
+ // pc-relative offset may not be large enough to hold the whole
+ // address.
+ } else if (Callee->getOpcode() == ISD::GlobalAddress) {
+ // If the callee is a GlobalAddress node (quite common, every direct call
+ // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+ // it.
+ GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
+
+ // We should use extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ const GlobalValue *GV = G->getGlobal();
+ if (!GV->hasDLLImportStorageClass()) {
+ unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
+
+ Callee = DAG.getTargetGlobalAddress(
+ GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
+
+ if (OpFlags == X86II::MO_GOTPCREL) {
+ // Add a wrapper.
+ Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
+ getPointerTy(DAG.getDataLayout()), Callee);
+ // Add extra indirection
+ Callee = DAG.getLoad(
+ getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ unsigned char OpFlags =
+ Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
+
+ Callee = DAG.getTargetExternalSymbol(
+ S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
+ } else if (Subtarget.isTarget64BitILP32() &&
+ Callee->getValueType(0) == MVT::i32) {
+ // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
+ Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+
+ if (!IsSibcall && isTailCall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ InFlag = Chain.getValue(1);
+ }
+
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (isTailCall)
+ Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+
+ // If this is an invoke in a 32-bit function using a funclet-based
+ // personality, assume the function clobbers all registers. If an exception
+ // is thrown, the runtime will not restore CSRs.
+ // FIXME: Model this more precisely so that we can register allocate across
+ // the normal edge and spill and fill across the exceptional edge.
+ if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
+ const Function *CallerFn = MF.getFunction();
+ EHPersonality Pers =
+ CallerFn->hasPersonalityFn()
+ ? classifyEHPersonality(CallerFn->getPersonalityFn())
+ : EHPersonality::Unknown;
+ if (isFuncletEHPersonality(Pers))
+ Mask = RegInfo->getNoPreservedMask();
+ }
+
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ if (isTailCall) {
+ // We used to do:
+ //// If this is the first return lowered for this function, add the regs
+ //// to the liveout set for the function.
+ // This isn't right, although it's probably harmless on x86; liveouts
+ // should be computed from returns not tail calls. Consider a void
+ // function making a tail call to a function returning int.
+ MF.getFrameInfo().setHasTailCall();
+ return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+ }
+
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPop;
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ DAG.getTarget().Options.GuaranteedTailCallOpt))
+ NumBytesForCalleeToPop = NumBytes; // Callee pops everything
+ else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
+ !Subtarget.getTargetTriple().isOSMSVCRT() &&
+ SR == StackStructReturn)
+ // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ // This is common for Darwin/X86, Linux & Mingw32 targets.
+ // For MSVC Win32 targets, the caller pops the hidden struct pointer.
+ NumBytesForCalleeToPop = 4;
+ else
+ NumBytesForCalleeToPop = 0; // Callee pops nothing.
+
+ if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
+ // No need to reset the stack after the call if the call doesn't return. To
+ // make the MI verify, we'll pretend the callee does it for us.
+ NumBytesForCalleeToPop = NumBytes;
+ }
+
+ // Returns a flag for retval copy to use.
+ if (!IsSibcall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
+ true),
+ InFlag, dl);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like stdcall, the callee cleans up its arguments, except that ECX is
+// reserved for storing the address of the tail-called function. Only 2
+// registers are free for argument passing (inreg). Tail call optimization is
+// performed provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On X86_64 architecture with GOT-style position independent code only local
+// (within module) calls are supported at the moment.
+// To keep the stack aligned according to platform abi the function
+// GetAlignedArgumentStackSize ensures that argument delta is always multiples
+// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
+// If a tail called function callee has more arguments than the caller the
+// caller needs to make sure that there is room to move the RETADDR to. This is
+// achieved by reserving an area the size of the argument delta right after the
+// original RETADDR, but before the saved framepointer or the spilled registers
+// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// Round \p StackSize up to the nearest value whose remainder modulo the
+/// stack alignment is (StackAlignment - SlotSize), e.g. 16n + 12 for a
+/// 16-byte alignment requirement with 4-byte slots.
+unsigned
+X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+                                               SelectionDAG& DAG) const {
+  const unsigned Alignment = Subtarget.getFrameLowering()->getStackAlignment();
+  const unsigned Slot = Subtarget.getRegisterInfo()->getSlotSize();
+  const uint64_t LowBits = Alignment - 1;
+  // Remainder (modulo the alignment) that the result must have.
+  const uint64_t WantedResidue = Alignment - Slot;
+  const uint64_t Residue = StackSize & LowBits;
+
+  uint64_t Size = StackSize;
+  if (Residue > WantedResidue) {
+    // Already past the wanted remainder: clear the low bits, advance by one
+    // full alignment unit and then add the wanted remainder.
+    Size = (Size & ~LowBits) + Alignment + WantedResidue;
+  } else {
+    // At or below the wanted remainder: just pad up to it.
+    Size += WantedResidue - Residue;
+  }
+  return Size;
+}
+
+/// Return true if the outgoing stack argument \p Arg is already available at the same (relative) position in the caller's incoming argument stack, i.e. it resolves to a fixed frame object whose offset equals \p Offset and whose size equals the argument's size in bytes.
+/// \p Flags supplies the byval and zext/sext attributes of the argument; \p VA is its assigned location, whose width decides whether the extension flags must also match the frame object.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+ MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
+ const X86InstrInfo *TII, const CCValAssign &VA) {
+ unsigned Bytes = Arg.getValueSizeInBits() / 8; // Size of the value as passed, taken before looking through any extension below.
+
+ for (;;) {
+ // Look through nodes that don't alter the bits of the incoming value.
+ unsigned Op = Arg.getOpcode();
+ if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
+ Arg = Arg.getOperand(0);
+ continue;
+ }
+ if (Op == ISD::TRUNCATE) { // A truncate is only looked through when it undoes an AssertZext of the same type.
+ const SDValue &TruncInput = Arg.getOperand(0);
+ if (TruncInput.getOpcode() == ISD::AssertZext &&
+ cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
+ Arg.getValueType()) {
+ Arg = TruncInput.getOperand(0);
+ continue;
+ }
+ }
+ break;
+ }
+
+ int FI = INT_MAX; // Sentinel; every path that does not return false below assigns a real frame index.
+ if (Arg.getOpcode() == ISD::CopyFromReg) { // Value arrives in a register: inspect the instruction that defines it.
+ unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(VR))
+ return false;
+ MachineInstr *Def = MRI->getVRegDef(VR);
+ if (!Def)
+ return false;
+ if (!Flags.isByVal()) { // Non-byval: the defining instruction must be a load from a stack slot.
+ if (!TII->isLoadFromStackSlot(*Def, FI))
+ return false;
+ } else { // Byval: the defining instruction must be an LEA whose operand 1 is a frame index.
+ unsigned Opcode = Def->getOpcode();
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r) &&
+ Def->getOperand(1).isFI()) {
+ FI = Def->getOperand(1).getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+ }
+ } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { // Value is a load: accept only a load directly from a frame index.
+ if (Flags.isByVal())
+ // ByVal argument is passed in as a pointer but it's now being
+ // dereferenced. e.g.
+ // define @foo(%struct.X* %A) {
+ // tail call @bar(%struct.X* byval %A)
+ // }
+ return false;
+ SDValue Ptr = Ld->getBasePtr();
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+ if (!FINode)
+ return false;
+ FI = FINode->getIndex();
+ } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { // Byval argument given directly as a frame index.
+ FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+ FI = FINode->getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+
+ assert(FI != INT_MAX);
+ if (!MFI.isFixedObjectIndex(FI)) // Only fixed frame objects are accepted.
+ return false;
+
+ if (Offset != MFI.getObjectOffset(FI)) // The object must sit exactly at the requested offset.
+ return false;
+
+ if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
+ // If the argument location is wider than the argument type, check that any
+ // extension flags match.
+ if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
+ Flags.isSExt() != MFI.isObjectSExt(FI)) {
+ return false;
+ }
+ }
+
+ return Bytes == MFI.getObjectSize(FI); // Finally the sizes must agree.
+}
+
+/// Check whether the call is eligible for tail call optimization. Targets
+/// that want to do tail call optimization should implement this function. With GuaranteedTailCallOpt set the answer depends only on the calling conventions; otherwise a series of sibcall safety checks (stack realignment, struct return, varargs, x87 results, preserved registers, stack argument layout, callee-pop) must all pass.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (!mayTailCallThisCC(CalleeCC))
+ return false;
+
+ // Fetch the calling function; its return type and calling convention are compared with the callee's below.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *CallerF = MF.getFunction();
+
+ // If the function return type is x86_fp80 and the callee return type is not,
+ // then the FP_EXTEND of the call result is not a nop. It's not safe to
+ // perform a tailcall optimization here.
+ if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+ return false;
+
+ CallingConv::ID CallerCC = CallerF->getCallingConv();
+ bool CCMatch = CallerCC == CalleeCC;
+ bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
+ bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
+
+ // Win64 functions have extra shadow space for argument homing. Don't do the
+ // sibcall if the caller and callee have mismatched expectations for this
+ // space.
+ if (IsCalleeWin64 != IsCallerWin64)
+ return false;
+
+ if (DAG.getTarget().Options.GuaranteedTailCallOpt) { // Guaranteed TCO mode: decided purely by the calling conventions.
+ if (canGuaranteeTCO(CalleeCC) && CCMatch)
+ return true;
+ return false;
+ }
+
+ // Look for obvious safe cases to perform tail call optimization that do not
+ // require ABI changes. This is what gcc calls sibcall.
+
+ // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+ // emit a special epilogue.
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ if (RegInfo->needsStackRealignment(MF))
+ return false;
+
+ // Also avoid sibcall optimization if either caller or callee uses struct
+ // return semantics.
+ if (isCalleeStructRet || isCallerStructRet)
+ return false;
+
+ // Do not sibcall optimize vararg calls unless all arguments are passed via
+ // registers.
+ LLVMContext &C = *DAG.getContext();
+ if (isVarArg && !Outs.empty()) {
+ // Optimizing for varargs on Win64 is unlikely to be safe without
+ // additional testing.
+ if (IsCalleeWin64 || IsCallerWin64)
+ return false;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+ if (!ArgLocs[i].isRegLoc())
+ return false;
+ }
+
+ // If the call result is in ST0 / ST1, it needs to be popped off the x87
+ // stack. Therefore, if it's not used by the call it is not safe to optimize
+ // this into a sibcall.
+ bool Unused = false;
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ if (!Ins[i].Used) {
+ Unused = true;
+ break;
+ }
+ }
+ if (Unused) { // At least one result is unused: reject if any result is assigned to FP0 or FP1.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
+ return false;
+ }
+ }
+
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ RetCC_X86, RetCC_X86))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ if (!CCMatch) { // With identical conventions the preserved masks are the same, so only a mismatch needs checking.
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+ }
+
+ unsigned StackArgsSize = 0; // Bytes of stack used by the outgoing arguments; stays 0 when there are none.
+
+ // If the callee takes no arguments, skip straight to the callee-pop check
+ // at the end of the function.
+ if (!Outs.empty()) {
+ // Analyze the outgoing arguments. Stack-passed arguments are only accepted
+ // when they already sit at the matching offset in the caller's fixed stack objects.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+
+ // Allocate shadow area for Win64
+ if (IsCalleeWin64)
+ CCInfo.AllocateStack(32, 8);
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ StackArgsSize = CCInfo.getNextStackOffset();
+
+ if (CCInfo.getNextStackOffset()) {
+ // Check if the arguments are already laid out in the right way as
+ // the caller's fixed stack objects.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (VA.getLocInfo() == CCValAssign::Indirect) // Indirectly passed arguments are never accepted here.
+ return false;
+ if (!VA.isRegLoc()) {
+ if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+ MFI, MRI, TII, VA))
+ return false;
+ }
+ }
+ }
+
+ bool PositionIndependent = isPositionIndependent();
+ // If the tailcall address may be in a register, then make sure it's
+ // possible to register allocate for it. In 32-bit, the call address can
+ // only target EAX, EDX, or ECX since the tail call must be scheduled after
+ // callee-saved registers are restored. These happen to be the same
+ // registers used to pass 'inreg' arguments so watch out for those.
+ if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) ||
+ PositionIndependent)) {
+ unsigned NumInRegs = 0;
+ // In PIC we need an extra register to formulate the address computation
+ // for the callee.
+ unsigned MaxInRegs = PositionIndependent ? 2 : 3;
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc())
+ continue;
+ unsigned Reg = VA.getLocReg();
+ switch (Reg) {
+ default: break;
+ case X86::EAX: case X86::EDX: case X86::ECX:
+ if (++NumInRegs == MaxInRegs) // Arguments occupy MaxInRegs of EAX/EDX/ECX: reject the sibcall.
+ return false;
+ break;
+ }
+ }
+ }
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
+ }
+
+ bool CalleeWillPop =
+ X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt);
+
+ if (unsigned BytesToPop =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
+ // If we have bytes to pop, the callee must pop them.
+ bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
+ if (!CalleePopMatches)
+ return false;
+ } else if (CalleeWillPop && StackArgsSize > 0) {
+ // If we don't have bytes to pop, make sure the callee doesn't pop any.
+ return false;
+ }
+
+ return true;
+}
+
+/// Create the fast instruction selector for this target by forwarding both
+/// arguments to X86::createFastISel.
+FastISel *X86TargetLowering::createFastISel(
+    FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const {
+  FastISel *Selector = X86::createFastISel(funcInfo, libInfo);
+  return Selector;
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+/// Return true if \p Op has a single use and its node is a normal load
+/// (as decided by ISD::isNormalLoad).
+static bool MayFoldLoad(SDValue Op) {
+  if (!Op.hasOneUse())
+    return false;
+  return ISD::isNormalLoad(Op.getNode());
+}
+
+/// Return true if \p Op has a single use and the first user of its node is a
+/// normal store (as decided by ISD::isNormalStore).
+static bool MayFoldIntoStore(SDValue Op) {
+  if (!Op.hasOneUse())
+    return false;
+  return ISD::isNormalStore(*Op.getNode()->use_begin());
+}
+
+/// Return true if \p Op has a single use and the first user of its node is an
+/// ISD::ZERO_EXTEND.
+static bool MayFoldIntoZeroExtend(SDValue Op) {
+  if (!Op.hasOneUse())
+    return false;
+
+  const unsigned UserOpcode = Op.getNode()->use_begin()->getOpcode();
+  return UserOpcode == ISD::ZERO_EXTEND;
+}
+
+/// Return true if \p Opcode is one of the X86ISD opcodes listed below, which
+/// this file treats as target shuffles.
+static bool isTargetShuffle(unsigned Opcode) {
+  switch (Opcode) {
+  case X86ISD::BLENDI:     case X86ISD::PSHUFB:     case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:    case X86ISD::PSHUFLW:    case X86ISD::SHUFP:
+  case X86ISD::INSERTPS:   case X86ISD::PALIGNR:    case X86ISD::VSHLDQ:
+  case X86ISD::VSRLDQ:     case X86ISD::MOVLHPS:    case X86ISD::MOVLHPD:
+  case X86ISD::MOVHLPS:    case X86ISD::MOVLPS:     case X86ISD::MOVLPD:
+  case X86ISD::MOVSHDUP:   case X86ISD::MOVSLDUP:   case X86ISD::MOVDDUP:
+  case X86ISD::MOVSS:      case X86ISD::MOVSD:      case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:     case X86ISD::VBROADCAST: case X86ISD::VPERMILPI:
+  case X86ISD::VPERMILPV:  case X86ISD::VPERM2X128: case X86ISD::VPERMIL2:
+  case X86ISD::VPERMI:     case X86ISD::VPPERM:     case X86ISD::VPERMV:
+  case X86ISD::VPERMV3:    case X86ISD::VPERMIV3:   case X86ISD::VZEXT_MOVL:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true if \p Opcode is one of the variable-mask shuffle opcodes
+/// listed below; ISD::AND is accepted as well as a 'faux' target shuffle.
+static bool isTargetShuffleVariableMask(unsigned Opcode) {
+  switch (Opcode) {
+  // Target Shuffles.
+  case X86ISD::PSHUFB:
+  case X86ISD::VPERMILPV:
+  case X86ISD::VPERMIL2:
+  case X86ISD::VPPERM:
+  case X86ISD::VPERMV:
+  case X86ISD::VPERMV3:
+  case X86ISD::VPERMIV3:
+  // 'Faux' Target Shuffles.
+  case ISD::AND:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return a FrameIndex node for the return address. The frame object is
+/// created the first time this is called for a function (while the recorded
+/// index is still 0) and its index is then stored in X86MachineFunctionInfo.
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+
+  int RAIndex = X86FI->getRAIndex();
+  if (RAIndex == 0) {
+    // No object yet: create one slot-sized fixed object at offset -SlotSize.
+    const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
+    RAIndex = MF.getFrameInfo().CreateFixedObject(
+        SlotSize, -static_cast<int64_t>(SlotSize), false);
+    X86FI->setRAIndex(RAIndex);
+  }
+
+  return DAG.getFrameIndex(RAIndex, getPointerTy(DAG.getDataLayout()));
+}
+
+/// Return true if \p Offset may be used as a displacement under code model
+/// \p M. \p hasSymbolicDisplacement says whether a symbol is also part of the
+/// address; without one, any offset that fits in 32 bits is accepted.
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                       bool hasSymbolicDisplacement) {
+  // The offset has to fit into the 32-bit immediate field.
+  if (!isInt<32>(Offset))
+    return false;
+
+  // Without a symbolic displacement there are no further restrictions.
+  if (!hasSymbolicDisplacement)
+    return true;
+
+  switch (M) {
+  case CodeModel::Small:
+    // For the small code model we assume that the last object ends 16MB
+    // before the 31-bit boundary. Fairly large negative constants are also
+    // accepted, since all objects are in the positive half of the address
+    // space.
+    return Offset < 16*1024*1024;
+  case CodeModel::Kernel:
+    // For the kernel code model all objects reside in the negative half of
+    // the 32-bit address space. Negative offsets are rejected because they
+    // may be just off; fairly large positive ones are accepted.
+    return Offset >= 0;
+  default:
+    // FIXME: Some tweaks might be needed for medium code model.
+    return false;
+  }
+}
+
+/// Decide whether a callee using \p CallingConv pops its own arguments. This
+/// holds for non-vararg calls for which shouldGuaranteeTCO() is true and,
+/// when \p is64Bit is false, for the stdcall, fastcall, thiscall and
+/// vectorcall conventions.
+bool X86::isCalleePop(CallingConv::ID CallingConv,
+                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
+  // If GuaranteeTCO is true, some calls are forced to be callee pop so that
+  // TCO can be guaranteed.
+  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
+    return true;
+
+  bool PopsOutside64Bit = false;
+  switch (CallingConv) {
+  case CallingConv::X86_StdCall:
+  case CallingConv::X86_FastCall:
+  case CallingConv::X86_ThisCall:
+  case CallingConv::X86_VectorCall:
+    PopsOutside64Bit = true;
+    break;
+  default:
+    break;
+  }
+  return PopsOutside64Bit && !is64Bit;
+}
+
+/// \brief Classify an integer X86 condition code as unsigned or signed.
+/// Returns true for COND_E, COND_NE, COND_B, COND_A, COND_BE and COND_AE and
+/// false for COND_G, COND_GE, COND_L and COND_LE. Any other condition code
+/// ends in llvm_unreachable.
+static bool isX86CCUnsigned(unsigned X86CC) {
+  switch (X86CC) {
+  case X86::COND_G:
+  case X86::COND_GE:
+  case X86::COND_L:
+  case X86::COND_LE:
+    return false;
+  case X86::COND_E:
+  case X86::COND_NE:
+  case X86::COND_B:
+  case X86::COND_A:
+  case X86::COND_BE:
+  case X86::COND_AE:
+    return true;
+  default:
+    llvm_unreachable("Invalid integer condition!");
+  }
+}
+
+/// Map an integer ISD::CondCode onto the corresponding X86 condition code.
+/// Condition codes other than the ten listed end in llvm_unreachable.
+static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
+  switch (SetCCOpcode) {
+  // Equality.
+  case ISD::SETEQ:  return X86::COND_E;
+  case ISD::SETNE:  return X86::COND_NE;
+  // Signed orderings.
+  case ISD::SETLT:  return X86::COND_L;
+  case ISD::SETLE:  return X86::COND_LE;
+  case ISD::SETGT:  return X86::COND_G;
+  case ISD::SETGE:  return X86::COND_GE;
+  // Unsigned orderings.
+  case ISD::SETULT: return X86::COND_B;
+  case ISD::SETULE: return X86::COND_BE;
+  case ISD::SETUGT: return X86::COND_A;
+  case ISD::SETUGE: return X86::COND_AE;
+  default:
+    llvm_unreachable("Invalid integer condition!");
+  }
+}
+
+/// Translate an ISD::CondCode into a single X86 condition code. \p LHS and
+/// \p RHS are in/out operands: they may be swapped, and \p RHS may be
+/// replaced by a zero constant, so the caller must emit the comparison with
+/// the operands as they are on return. For floating point, SETOEQ and SETUNE
+/// yield X86::COND_INVALID.
+static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
+                                    bool isFP, SDValue &LHS, SDValue &RHS,
+                                    SelectionDAG &DAG) {
+  if (!isFP) {
+    // A few comparisons against constants can use the sign flag or a
+    // comparison against zero instead.
+    if (auto *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+      switch (SetCCOpcode) {
+      default:
+        break;
+      case ISD::SETGT:
+        if (RHSC->isAllOnesValue()) {
+          // X > -1  ->  compare X with 0, jump if the sign is clear.
+          RHS = DAG.getConstant(0, DL, RHS.getValueType());
+          return X86::COND_NS;
+        }
+        break;
+      case ISD::SETLT:
+        // X < 0  ->  jump on sign.
+        if (RHSC->isNullValue())
+          return X86::COND_S;
+        if (RHSC->getZExtValue() == 1) {
+          // X < 1  ->  X <= 0
+          RHS = DAG.getConstant(0, DL, RHS.getValueType());
+          return X86::COND_LE;
+        }
+        break;
+      }
+    }
+
+    return TranslateIntegerX86CC(SetCCOpcode);
+  }
+
+  // Floating point from here on. First decide whether the operands have to
+  // be, or profitably can be, flipped.
+
+  // If LHS is a foldable load but RHS is not, flip the condition.
+  const bool LHSIsLoad = ISD::isNON_EXTLoad(LHS.getNode());
+  if (LHSIsLoad && !ISD::isNON_EXTLoad(RHS.getNode())) {
+    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+    std::swap(LHS, RHS);
+  }
+
+  // These four conditions are handled by swapping the operands and using the
+  // condition of their mirror image (marked "flipped" in the table below).
+  const bool FlipOperands =
+      SetCCOpcode == ISD::SETOLT || SetCCOpcode == ISD::SETOLE ||
+      SetCCOpcode == ISD::SETUGT || SetCCOpcode == ISD::SETUGE;
+  if (FlipOperands)
+    std::swap(LHS, RHS);
+
+  // On a floating point condition, the flags are set as follows:
+  // ZF  PF  CF   op
+  //  0 | 0 | 0 | X > Y
+  //  0 | 0 | 1 | X < Y
+  //  1 | 0 | 0 | X == Y
+  //  1 | 1 | 1 | unordered
+  switch (SetCCOpcode) {
+  case ISD::SETUEQ:
+  case ISD::SETEQ:
+    return X86::COND_E;
+  case ISD::SETOLT: // flipped
+  case ISD::SETOGT:
+  case ISD::SETGT:
+    return X86::COND_A;
+  case ISD::SETOLE: // flipped
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    return X86::COND_AE;
+  case ISD::SETUGT: // flipped
+  case ISD::SETULT:
+  case ISD::SETLT:
+    return X86::COND_B;
+  case ISD::SETUGE: // flipped
+  case ISD::SETULE:
+  case ISD::SETLE:
+    return X86::COND_BE;
+  case ISD::SETONE:
+  case ISD::SETNE:
+    return X86::COND_NE;
+  case ISD::SETUO:
+    return X86::COND_P;
+  case ISD::SETO:
+    return X86::COND_NP;
+  case ISD::SETOEQ:
+  case ISD::SETUNE:
+    return X86::COND_INVALID;
+  default:
+    llvm_unreachable("Condcode should be pre-legalized away");
+  }
+}
+
+/// Return true if a floating point cmov exists for the X86 condition code
+/// \p X86CC. The current x86 ISA includes the following FP cmov instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+  switch (X86CC) {
+  case X86::COND_B:  case X86::COND_AE:
+  case X86::COND_BE: case X86::COND_A:
+  case X86::COND_E:  case X86::COND_NE:
+  case X86::COND_P:  case X86::COND_NP:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+ /// Fill \p Info with the memory access description of a chain-carrying
+ /// intrinsic. Returns false when getIntrinsicWithChain() has no entry for
+ /// \p Intrinsic or when the entry's type is not one of the memory forms
+ /// handled below; returns true once \p Info has been populated.
+ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                            const CallInst &I,
+                                            unsigned Intrinsic) const {
+   const IntrinsicData *IntrData = getIntrinsicWithChain(Intrinsic);
+   if (!IntrData)
+     return false;
+
+   // Defaults shared by every handled intrinsic.
+   Info.opc = ISD::INTRINSIC_W_CHAIN;
+   Info.readMem = false;
+   Info.writeMem = false;
+   Info.vol = false;
+   Info.offset = 0;
+
+   switch (IntrData->Type) {
+   default:
+     return false;
+
+   case EXPAND_FROM_MEM:
+     // The memory type is the type of the call's result.
+     Info.memVT = MVT::getVT(I.getType());
+     Info.readMem = true;
+     break;
+
+   case COMPRESS_TO_MEM:
+     // The memory type is the type of argument 1.
+     Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
+     Info.writeMem = true;
+     break;
+
+   case TRUNCATE_TO_MEM_VI8:
+   case TRUNCATE_TO_MEM_VI16:
+   case TRUNCATE_TO_MEM_VI32: {
+     // Same element count as argument 1, with the element type named by the
+     // intrinsic kind.
+     MVT SrcVT = MVT::getVT(I.getArgOperand(1)->getType());
+     MVT MemEltVT = MVT::i32;
+     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
+       MemEltVT = MVT::i8;
+     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
+       MemEltVT = MVT::i16;
+     Info.memVT = MVT::getVectorVT(MemEltVT, SrcVT.getVectorNumElements());
+     Info.writeMem = true;
+     break;
+   }
+   }
+
+   // Every handled form takes its pointer in argument 0 and uses alignment 1.
+   Info.ptrVal = I.getArgOperand(0);
+   Info.align = 1;
+   return true;
+ }
+
+ /// Returns true if the target can instruction select the specified FP
+ /// immediate natively, i.e. \p Imm is bitwise equal to one of the entries in
+ /// LegalFPImmediates. If false, the legalizer will materialize the FP
+ /// immediate as a load from a constant pool.
+ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+   for (const APFloat &Legal : LegalFPImmediates)
+     if (Imm.bitwiseIsEqual(Legal))
+       return true;
+   return false;
+ }
+
+ /// Decide whether the width of \p Load may be reduced. The answer is true
+ /// unless the load's base pointer is a WrapperRIP around a global address
+ /// flagged MO_GOTTPOFF.
+ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+                                               ISD::LoadExtType ExtTy,
+                                               EVT NewVT) const {
+   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
+   if (BasePtr.getOpcode() != X86ISD::WrapperRIP)
+     return true;
+
+   const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0));
+   if (!GA)
+     return true;
+
+   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
+   // relocation target a movq or addq instruction: don't let the load shrink.
+   return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+ }
+
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself. That is the case for integer types whose
+ /// primitive size is between 1 and 64 bits inclusive.
+ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                           Type *Ty) const {
+   assert(Ty->isIntegerTy());
+
+   const unsigned NumBits = Ty->getPrimitiveSizeInBits();
+   return NumBits != 0 && NumBits <= 64;
+ }
+
+ /// Returns true if EXTRACT_SUBVECTOR is legal or custom for \p ResVT and
+ /// \p Index is either 0 or equal to the number of elements of \p ResVT.
+ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                 unsigned Index) const {
+   if (isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) {
+     if (Index == 0)
+       return true;
+     return Index == ResVT.getVectorNumElements();
+   }
+   return false;
+ }
+
+ /// Returns the subtarget's hasBMI() answer: cttz is considered cheap to
+ /// speculate exactly when BMI is reported as available.
+ bool X86TargetLowering::isCheapToSpeculateCttz() const {
+ // Speculate cttz only if we can directly use TZCNT.
+ return Subtarget.hasBMI();
+ }
+
+ /// Returns the subtarget's hasLZCNT() answer: ctlz is considered cheap to
+ /// speculate exactly when LZCNT is reported as available.
+ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+ // Speculate ctlz only if we can directly use LZCNT.
+ return Subtarget.hasLZCNT();
+ }
+
+ /// Returns the subtarget's hasFastLZCNT() answer.
+ bool X86TargetLowering::isCtlzFast() const {
+ return Subtarget.hasFastLZCNT();
+ }
+
+ /// Returns true when the subtarget has BMI and the type of \p Y is i32 or
+ /// i64, the only widths for which 'andn' has a form.
+ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
+   if (Subtarget.hasBMI()) {
+     // There are only 32-bit and 64-bit forms for 'andn'.
+     EVT VT = Y.getValueType();
+     return VT == MVT::i32 || VT == MVT::i64;
+   }
+   return false;
+ }
+
+ /// Returns true if \p Val is the undef sentinel or equals \p CmpVal.
+ static bool isUndefOrEqual(int Val, int CmpVal) {
+   if (Val == SM_SentinelUndef)
+     return true;
+   return Val == CmpVal;
+ }
+
+ /// Returns true if \p Val is the undef sentinel or the zero sentinel.
+ static bool isUndefOrZero(int Val) {
+   if (Val == SM_SentinelUndef)
+     return true;
+   return Val == SM_SentinelZero;
+ }
+
+ /// Return true if every one of the \p Size mask elements starting at
+ /// position \p Pos (i.e. indices [Pos, Pos+Size)) is the undef sentinel.
+ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+   // Walk forward from Pos, consuming one element per remaining count.
+   while (Size--)
+     if (Mask[Pos++] != SM_SentinelUndef)
+       return false;
+   return true;
+ }
+
+ /// Return true if \p Val is undef or if its value falls within the
+ /// half-open range [Low, Hi).
+ static bool isUndefOrInRange(int Val, int Low, int Hi) {
+   if (Val == SM_SentinelUndef)
+     return true;
+   return Low <= Val && Val < Hi;
+ }
+
+ /// Return true if every element in \p Mask is undef or has a value within
+ /// the half-open range [Low, Hi).
+ static bool isUndefOrInRange(ArrayRef<int> Mask,
+                              int Low, int Hi) {
+   for (int Elt : Mask) {
+     if (Elt == SM_SentinelUndef)
+       continue;
+     if (Elt < Low || Elt >= Hi)
+       return false;
+   }
+   return true;
+ }
+
+ /// Return true if \p Val is undef, zero, or has a value within the
+ /// half-open range [Low, Hi).
+ static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
+   if (isUndefOrZero(Val))
+     return true;
+   return Low <= Val && Val < Hi;
+ }
+
+ /// Return true if every element in \p Mask is undef, zero, or has a value
+ /// within the half-open range [Low, Hi).
+ static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+   for (int Elt : Mask) {
+     if (isUndefOrZero(Elt))
+       continue;
+     if (Elt < Low || Elt >= Hi)
+       return false;
+   }
+   return true;
+ }
+
+ /// Return true if, for the \p Size mask elements at indices [Pos, Pos+Size),
+ /// each element is either undef or equal to the matching value of the
+ /// sequence Low, Low+1, ..., Low+Size-1.
+ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
+                                        unsigned Pos, unsigned Size, int Low) {
+   int Expected = Low;
+   for (unsigned Idx = Pos, End = Pos + Size; Idx != End; ++Idx) {
+     const int Elt = Mask[Idx];
+     if (Elt != SM_SentinelUndef && Elt != Expected)
+       return false;
+     ++Expected;
+   }
+   return true;
+ }
+
+ /// Return true if, for the \p Size mask elements at indices [Pos, Pos+Size),
+ /// each element is undef, zero, or equal to the matching value of the
+ /// sequence Low, Low+1, ..., Low+Size-1.
+ static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
+                                              unsigned Size, int Low) {
+   int Expected = Low;
+   for (unsigned Idx = Pos, End = Pos + Size; Idx != End; ++Idx) {
+     const int Elt = Mask[Idx];
+     const bool Acceptable = isUndefOrZero(Elt) || Elt == Expected;
+     if (!Acceptable)
+       return false;
+     ++Expected;
+   }
+   return true;
+ }
+
+ /// Return true if every one of the \p Size mask elements starting at
+ /// position \p Pos (i.e. indices [Pos, Pos+Size)) is undef or zero.
+ static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
+                                  unsigned Size) {
+   // Walk forward from Pos, consuming one element per remaining count.
+   while (Size--)
+     if (!isUndefOrZero(Mask[Pos++]))
+       return false;
+   return true;
+ }
+
+ /// \brief Helper function to test whether a shuffle mask could be
+ /// simplified by widening the elements being shuffled.
+ ///
+ /// On success \p WidenedMask holds one entry per pair of input elements and
+ /// the function returns true. On failure it returns false and leaves
+ /// \p WidenedMask in an unspecified state.
+ ///
+ /// NOTE: This must handle normal vector shuffle masks and *target* vector
+ /// shuffle masks. The latter have the special property of a '-2' representing
+ /// a zero-ed lane of a vector.
+ static bool canWidenShuffleElements(ArrayRef<int> Mask,
+                                     SmallVectorImpl<int> &WidenedMask) {
+   WidenedMask.assign(Mask.size() / 2, 0);
+   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+     const int M0 = Mask[i];
+     const int M1 = Mask[i + 1];
+     const bool Undef0 = (M0 == SM_SentinelUndef);
+     const bool Undef1 = (M1 == SM_SentinelUndef);
+
+     // A fully undef pair widens to undef.
+     if (Undef0 && Undef1) {
+       WidenedMask[i / 2] = SM_SentinelUndef;
+       continue;
+     }
+
+     // One half undef: the other half decides, provided it sits in the slot
+     // of a pair that matches its position (odd for the high half, even for
+     // the low half).
+     if (Undef0 && M1 >= 0 && (M1 % 2) == 1) {
+       WidenedMask[i / 2] = M1 / 2;
+       continue;
+     }
+     if (Undef1 && M0 >= 0 && (M0 % 2) == 0) {
+       WidenedMask[i / 2] = M0 / 2;
+       continue;
+     }
+
+     // Zeroing only widens when both halves are zero or undef.
+     const bool Zero0 = (M0 == SM_SentinelZero);
+     const bool Zero1 = (M1 == SM_SentinelZero);
+     if (Zero0 || Zero1) {
+       if ((Zero0 || Undef0) && (Zero1 || Undef1)) {
+         WidenedMask[i / 2] = SM_SentinelZero;
+         continue;
+       }
+       return false;
+     }
+
+     // Both halves defined: they must form an aligned, adjacent pair.
+     if (!Undef0 && (M0 % 2) == 0 && M0 + 1 == M1) {
+       WidenedMask[i / 2] = M0 / 2;
+       continue;
+     }
+
+     // Anything else cannot be expressed with wider elements.
+     return false;
+   }
+   assert(WidenedMask.size() == Mask.size() / 2 &&
+          "Incorrect size of mask after widening the elements!");
+
+   return true;
+ }
+
+ /// Helper function to scale a shuffle or target shuffle mask: each mask
+ /// index is replaced by \p Scale sequential indices of an equivalent mask
+ /// over narrower elements. Negative (sentinel) entries are repeated
+ /// unchanged. This is the reverse process to canWidenShuffleElements, but
+ /// can always succeed.
+ static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
+                              SmallVectorImpl<int> &ScaledMask) {
+   assert(0 < Scale && "Unexpected scaling factor");
+   int NumElts = Mask.size();
+   ScaledMask.assign(NumElts * Scale, -1);
+
+   for (int Elt = 0; Elt != NumElts; ++Elt) {
+     const int M = Mask[Elt];
+     const int Base = Scale * Elt;
+     // Sentinels are copied as-is; real indices expand to Scale*M + offset.
+     for (int Sub = 0; Sub != Scale; ++Sub)
+       ScaledMask[Base + Sub] = (M < 0) ? M : (Scale * M) + Sub;
+   }
+ }
+
+ /// Return true if operand 1 of \p N (the EXTRACT_SUBVECTOR index) is a
+ /// constant whose bit offset is a multiple of \p vecWidth, making the
+ /// extract suitable for instructions that extract 128 or 256 bit vectors.
+ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
+   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+   auto *IdxNode = dyn_cast<ConstantSDNode>(N->getOperand(1).getNode());
+   if (!IdxNode)
+     return false;
+
+   // The index should be aligned on a vecWidth-bit boundary.
+   uint64_t Index = IdxNode->getZExtValue();
+   unsigned EltBits = N->getSimpleValueType(0).getScalarSizeInBits();
+   return (Index * EltBits) % vecWidth == 0;
+ }
+
+ /// Return true if operand 2 of \p N (the INSERT_SUBVECTOR index) is a
+ /// constant whose bit offset is a multiple of \p vecWidth, making the
+ /// insert suitable for insertion of 128 or 256-bit subvectors.
+ static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
+   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+   auto *IdxNode = dyn_cast<ConstantSDNode>(N->getOperand(2).getNode());
+   if (!IdxNode)
+     return false;
+
+   // The index should be aligned on a vecWidth-bit boundary.
+   uint64_t Index = IdxNode->getZExtValue();
+   unsigned EltBits = N->getSimpleValueType(0).getScalarSizeInBits();
+   return (Index * EltBits) % vecWidth == 0;
+ }
+
+ /// Returns isVINSERTIndex(N, 128): true when the insert index of \p N is a
+ /// constant aligned on a 128-bit boundary.
+ bool X86::isVINSERT128Index(SDNode *N) {
+ return isVINSERTIndex(N, 128);
+ }
+
+ /// Returns isVINSERTIndex(N, 256): true when the insert index of \p N is a
+ /// constant aligned on a 256-bit boundary.
+ bool X86::isVINSERT256Index(SDNode *N) {
+ return isVINSERTIndex(N, 256);
+ }
+
+ /// Returns isVEXTRACTIndex(N, 128): true when the extract index of \p N is a
+ /// constant aligned on a 128-bit boundary.
+ bool X86::isVEXTRACT128Index(SDNode *N) {
+ return isVEXTRACTIndex(N, 128);
+ }
+
+ /// Returns isVEXTRACTIndex(N, 256): true when the extract index of \p N is a
+ /// constant aligned on a 256-bit boundary.
+ bool X86::isVEXTRACT256Index(SDNode *N) {
+ return isVEXTRACTIndex(N, 256);
+ }
+
+ /// Compute the chunk number addressed by the constant index (operand 1) of
+ /// \p N: the index divided by the number of elements of the source vector
+ /// (operand 0) that fit in \p vecWidth bits.
+ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
+   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
+   assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
+          "Illegal extract subvector for VEXTRACT");
+
+   auto *IdxNode = cast<ConstantSDNode>(N->getOperand(1).getNode());
+   uint64_t Index = IdxNode->getZExtValue();
+
+   MVT SrcEltVT = N->getOperand(0).getSimpleValueType().getVectorElementType();
+   unsigned EltsPerChunk = vecWidth / SrcEltVT.getSizeInBits();
+   return Index / EltsPerChunk;
+ }
+
+ /// Compute the chunk number addressed by the constant index (operand 2) of
+ /// \p N: the index divided by the number of elements of the result vector
+ /// that fit in \p vecWidth bits.
+ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
+   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
+   assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
+          "Illegal insert subvector for VINSERT");
+
+   auto *IdxNode = cast<ConstantSDNode>(N->getOperand(2).getNode());
+   uint64_t Index = IdxNode->getZExtValue();
+
+   MVT ResEltVT = N->getSimpleValueType(0).getVectorElementType();
+   unsigned EltsPerChunk = vecWidth / ResEltVT.getSizeInBits();
+   return Index / EltsPerChunk;
+ }
+
+ /// Return the appropriate immediate to extract the specified
+ /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
+ /// Forwards to getExtractVEXTRACTImmediate with a width of 128.
+ unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
+ return getExtractVEXTRACTImmediate(N, 128);
+ }
+
+ /// Return the appropriate immediate to extract the specified
+ /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
+ /// Forwards to getExtractVEXTRACTImmediate with a width of 256.
+ unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
+ return getExtractVEXTRACTImmediate(N, 256);
+ }
+
+ /// Return the appropriate immediate to insert at the specified
+ /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
+ /// Forwards to getInsertVINSERTImmediate with a width of 128.
+ unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
+ return getInsertVINSERTImmediate(N, 128);
+ }
+
+ /// Return the appropriate immediate to insert at the specified
+ /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
+ /// Forwards to getInsertVINSERTImmediate with a width of 256.
+ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
+ return getInsertVINSERTImmediate(N, 256);
+ }
+
+ /// Returns true if \p Elt is accepted by isNullConstant() (a constant zero)
+ /// or by isNullFPConstant() (a floating point constant +0.0).
+ bool X86::isZeroNode(SDValue Elt) {
+   if (isNullConstant(Elt))
+     return true;
+   return isNullFPConstant(Elt);
+ }
+
+ // Build a vector of constants from the first VT.getVectorNumElements()
+ // entries of Values.
+ // When IsMask is set, a negative entry becomes an UNDEF node.
+ // Split 64-bit constants in the 32-bit mode: when i64 is not a legal type
+ // and VT has i64 elements, the vector is built from i32 halves (value in the
+ // low half, 0 or UNDEF in the high half) and bitcast back to VT.
+ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
+                               const SDLoc &dl, bool IsMask = false) {
+   const unsigned NumElts = VT.getVectorNumElements();
+   const bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+   const bool Split = !In64BitMode && VT.getVectorElementType() == MVT::i64;
+
+   MVT BuildVT = VT;
+   if (Split)
+     BuildVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+   const MVT EltVT = BuildVT.getVectorElementType();
+
+   SmallVector<SDValue, 32> Ops;
+   for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+     const bool IsUndef = IsMask && Values[Idx] < 0;
+     if (IsUndef)
+       Ops.push_back(DAG.getUNDEF(EltVT));
+     else
+       Ops.push_back(DAG.getConstant(Values[Idx], dl, EltVT));
+
+     if (!Split)
+       continue;
+
+     // High half of a split 64-bit element.
+     if (IsUndef)
+       Ops.push_back(DAG.getUNDEF(EltVT));
+     else
+       Ops.push_back(DAG.getConstant(0, dl, EltVT));
+   }
+
+   SDValue Built = DAG.getBuildVector(BuildVT, dl, Ops);
+   return Split ? DAG.getBitcast(VT, Built) : Built;
+ }
+
+ /// Build a constant vector of type \p VT from raw element bits. Element i is
+ /// UNDEF when \p Undefs[i] is set, otherwise it is the value \p Bits[i]
+ /// (emitted as an FP constant for f32/f64 element types). When i64 is not a
+ /// legal type and \p VT has i64 elements, each element is emitted as two i32
+ /// halves (low half first) and the result is bitcast back to \p VT.
+ static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
+                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+   assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
+
+   const unsigned NumElts = VT.getVectorNumElements();
+   const bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+   const bool Split = !In64BitMode && VT.getVectorElementType() == MVT::i64;
+
+   MVT BuildVT = VT;
+   if (Split)
+     BuildVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+   const MVT EltVT = BuildVT.getVectorElementType();
+   const unsigned OpsPerElt = Split ? 2 : 1;
+
+   SmallVector<SDValue, 32> Ops;
+   for (unsigned Idx = 0, End = Bits.size(); Idx != End; ++Idx) {
+     if (Undefs[Idx]) {
+       Ops.append(OpsPerElt, DAG.getUNDEF(EltVT));
+       continue;
+     }
+
+     const APInt &Val = Bits[Idx];
+     assert(Val.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
+
+     if (Split) {
+       // Low 32 bits first, then the high 32 bits.
+       Ops.push_back(DAG.getConstant(Val.trunc(32), dl, EltVT));
+       Ops.push_back(DAG.getConstant(Val.lshr(32).trunc(32), dl, EltVT));
+       continue;
+     }
+     if (EltVT == MVT::f32) {
+       Ops.push_back(
+           DAG.getConstantFP(APFloat(APFloat::IEEEsingle(), Val), dl, EltVT));
+       continue;
+     }
+     if (EltVT == MVT::f64) {
+       Ops.push_back(
+           DAG.getConstantFP(APFloat(APFloat::IEEEdouble(), Val), dl, EltVT));
+       continue;
+     }
+     Ops.push_back(DAG.getConstant(Val, dl, EltVT));
+   }
+
+   SDValue Built = DAG.getBuildVector(BuildVT, dl, Ops);
+   return DAG.getBitcast(VT, Built);
+ }
+
+ /// Returns a vector of specified type with all zero elements.
+ ///
+ /// SSE/AVX zero vectors are built as <N x i32> and bitcast to \p VT so that
+ /// they get CSE'd. Without SSE2 a 128-bit zero is built as a v4f32 +0.0
+ /// instead, and i1 vectors are built directly in \p VT.
+ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
+                              SelectionDAG &DAG, const SDLoc &dl) {
+   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
+           VT.getVectorElementType() == MVT::i1) &&
+          "Unexpected vector type");
+
+   // The integer type is not available: use a floating-point +0.0.
+   if (!Subtarget.hasSSE2() && VT.is128BitVector())
+     return DAG.getBitcast(VT, DAG.getConstantFP(+0.0, dl, MVT::v4f32));
+
+   if (VT.getVectorElementType() != MVT::i1) {
+     unsigned Num32BitElts = VT.getSizeInBits() / 32;
+     MVT IntVT = MVT::getVectorVT(MVT::i32, Num32BitElts);
+     return DAG.getBitcast(VT, DAG.getConstant(0, dl, IntVT));
+   }
+
+   // Mask (i1) vectors.
+   assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
+          "Unexpected vector type");
+   assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
+          "Unexpected vector type");
+   return DAG.getBitcast(VT, DAG.getConstant(0, dl, VT));
+ }
+
+ /// Extract the \p vectorWidth-bit chunk of \p Vec that contains element
+ /// \p IdxVal. The index is rounded down to the first element of its chunk.
+ /// An undef source yields undef, a BUILD_VECTOR source yields a smaller
+ /// BUILD_VECTOR, and anything else yields an EXTRACT_SUBVECTOR node.
+ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
+                                 const SDLoc &dl, unsigned vectorWidth) {
+   EVT SrcVT = Vec.getValueType();
+   EVT EltVT = SrcVT.getVectorElementType();
+   unsigned NumChunks = SrcVT.getSizeInBits() / vectorWidth;
+   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+                                   SrcVT.getVectorNumElements() / NumChunks);
+
+   // Extract from UNDEF is UNDEF.
+   if (Vec.isUndef())
+     return DAG.getUNDEF(ResultVT);
+
+   unsigned ChunkElts = vectorWidth / EltVT.getSizeInBits();
+   assert(isPowerOf2_32(ChunkElts) && "Elements per chunk not power of 2");
+
+   // First element of the chunk holding IdxVal. ChunkElts is a power of 2, so
+   // clearing the low bits is enough.
+   unsigned ChunkStart = IdxVal & ~(ChunkElts - 1);
+
+   if (Vec.getOpcode() != ISD::BUILD_VECTOR) {
+     SDValue VecIdx = DAG.getIntPtrConstant(ChunkStart, dl);
+     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
+   }
+
+   // If the input is a buildvector just emit a smaller one.
+   return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+                      makeArrayRef(Vec->op_begin() + ChunkStart, ChunkElts));
+ }
+
+ /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+ /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+ /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+ /// instructions or a simple subregister reference. Idx is an index in the
+ /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
+ /// lowering EXTRACT_VECTOR_ELT operations easier.
+ ///
+ /// Asserts that \p Vec is a 256-bit or 512-bit vector, then forwards to
+ /// extractSubVector with a width of 128.
+ static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert((Vec.getValueType().is256BitVector() ||
+ Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+ return extractSubVector(Vec, IdxVal, DAG, dl, 128);
+ }
+
+ /// Generate a DAG to grab 256-bits from a 512-bit vector.
+ ///
+ /// Asserts that \p Vec is a 512-bit vector, then forwards to
+ /// extractSubVector with a width of 256.
+ static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+ return extractSubVector(Vec, IdxVal, DAG, dl, 256);
+ }
+
+ /// Insert \p Vec into \p Result at the \p vectorWidth-bit chunk containing
+ /// element \p IdxVal. The index is rounded down to the first element of its
+ /// chunk. Inserting an undef \p Vec returns \p Result unchanged; otherwise
+ /// an INSERT_SUBVECTOR node of \p Result's type is produced.
+ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                SelectionDAG &DAG, const SDLoc &dl,
+                                unsigned vectorWidth) {
+   assert((vectorWidth == 128 || vectorWidth == 256) &&
+          "Unsupported vector width");
+   // Inserting UNDEF is Result
+   if (Vec.isUndef())
+     return Result;
+
+   EVT EltVT = Vec.getValueType().getVectorElementType();
+   unsigned ChunkElts = vectorWidth / EltVT.getSizeInBits();
+   assert(isPowerOf2_32(ChunkElts) && "Elements per chunk not power of 2");
+
+   // First element of the chunk holding IdxVal. ChunkElts is a power of 2, so
+   // clearing the low bits is enough.
+   unsigned ChunkStart = IdxVal & ~(ChunkElts - 1);
+
+   SDValue VecIdx = DAG.getIntPtrConstant(ChunkStart, dl);
+   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, Result.getValueType(), Result,
+                      Vec, VecIdx);
+ }
+
+ /// Generate a DAG to put 128-bits into a vector > 128 bits. This
+ /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+ /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+ /// simple superregister reference. Idx is an index in the 128 bits
+ /// we want. It need not be aligned to a 128-bit boundary. That makes
+ /// lowering INSERT_VECTOR_ELT operations easier.
+ ///
+ /// Insertion at index 0 of a non-undef 256-bit \p Result is emitted as a
+ /// BLENDI with an immediate mask instead of an INSERT_SUBVECTOR.
+ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, const SDLoc &dl) {
+   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+   // Only the low half of a 256-bit vector takes the blend path. The blend
+   // path itself builds an INSERT_SUBVECTOR into undef to widen Vec, so an
+   // undef Result must take the generic path to avoid recursing on that node.
+   const bool UseBlend = IdxVal == 0 &&
+                         Result.getValueType().is256BitVector() &&
+                         !Result.isUndef();
+   if (!UseBlend)
+     return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+
+   // Widen Vec to the result type by inserting it into undef at index 0.
+   EVT ResultVT = Result.getValueType();
+   SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
+   SDValue Undef = DAG.getUNDEF(ResultVT);
+   SDValue Widened = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+                                 Vec, ZeroIndex);
+
+   // The blend instruction, and therefore its mask, depend on the data type.
+   MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
+   if (ScalarType.isFloatingPoint()) {
+     // Choose either vblendps (float) or vblendpd (double).
+     unsigned ScalarSize = ScalarType.getSizeInBits();
+     assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+     unsigned MaskVal = 0x0f;
+     if (ScalarSize == 64)
+       MaskVal = 0x03;
+     SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
+     return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Widened, Mask);
+   }
+
+   const X86Subtarget &Subtarget =
+       static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+   // AVX2 is needed for 256-bit integer blend support.
+   // Integers must be cast to 32-bit because there is only vpblendd;
+   // vpblendw can't be used for this because it has a handicapped mask.
+   //
+   // If we don't have AVX2, then cast to float. Using a wrong domain blend
+   // is still more efficient than using the wrong domain vinsertf128 that
+   // will be created by insertSubVector().
+   MVT CastVT = MVT::v8f32;
+   if (Subtarget.hasAVX2())
+     CastVT = MVT::v8i32;
+
+   SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
+   SDValue CastResult = DAG.getBitcast(CastVT, Result);
+   SDValue CastWidened = DAG.getBitcast(CastVT, Widened);
+   SDValue Blend = DAG.getNode(X86ISD::BLENDI, dl, CastVT, CastResult,
+                               CastWidened, Mask);
+   return DAG.getBitcast(ResultVT, Blend);
+ }
+
+ /// Insert the 256-bit vector \p Vec into \p Result at the 256-bit chunk
+ /// containing element \p IdxVal. Asserts that \p Vec is a 256-bit vector,
+ /// then forwards to insertSubVector with a width of 256.
+ static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+ return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+ }
+
+ /// Insert i1-subvector to i1-vector.
+ ///
+ /// \p Op is the insert node: operand 0 is the destination vector, operand 1
+ /// the subvector and operand 2 the insertion index.
+ ///
+ /// Returns an empty SDValue when the index is not a constant, \p Op itself
+ /// when inserting at index 0 into undef, and otherwise a replacement built
+ /// from VSHLI/VSRLI shifts and OR (or a vector shuffle for an insertion in
+ /// the middle). Shift-based forms operate in a type widened to v8i1 (with
+ /// DQI) or v16i1 when the operation type is narrower than that.
+ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+
+   SDLoc dl(Op);
+   SDValue Vec = Op.getOperand(0);
+   SDValue SubVec = Op.getOperand(1);
+   SDValue Idx = Op.getOperand(2);
+
+   if (!isa<ConstantSDNode>(Idx))
+     return SDValue();
+
+   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+   if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
+     return Op;
+
+   MVT OpVT = Op.getSimpleValueType();
+   MVT SubVecVT = SubVec.getSimpleValueType();
+   unsigned NumElems = OpVT.getVectorNumElements();
+   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
+
+   assert(IdxVal + SubVecNumElems <= NumElems &&
+          IdxVal % SubVecVT.getSizeInBits() == 0 &&
+          "Unexpected index value in INSERT_SUBVECTOR");
+
+   // There are 3 possible cases:
+   // 1. Subvector should be inserted in the lower part (IdxVal == 0)
+   // 2. Subvector should be inserted in the upper part
+   //    (IdxVal + SubVecNumElems == NumElems)
+   // 3. Subvector should be inserted in the middle (for example v2i1
+   //    to v16i1, index 2)
+
+   // extend to natively supported kshift
+   MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+   MVT WideOpVT = OpVT;
+   if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
+     WideOpVT = MinVT;
+
+   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+   SDValue Undef = DAG.getUNDEF(WideOpVT);
+   SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+                                    Undef, SubVec, ZeroIdx);
+
+   // Extract sub-vector if required (i.e. when the operation was widened).
+   auto ExtractSubVec = [&](SDValue V) {
+     return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
+                                                 OpVT, V, ZeroIdx);
+   };
+
+   if (Vec.isUndef()) {
+     // Nothing to preserve in Vec: just move the subvector into position.
+     if (IdxVal != 0) {
+       SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
+       WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
+     }
+     return ExtractSubVec(WideSubVec);
+   }
+
+   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+     // Shift the subvector to the top of the wide type (clearing everything
+     // below it), then back down so it lands at IdxVal with zeros around it.
+     NumElems = WideOpVT.getVectorNumElements();
+     unsigned ShiftLeft = NumElems - SubVecNumElems;
+     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+                       DAG.getConstant(ShiftLeft, dl, MVT::i8));
+     Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
+                                    DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
+     return ExtractSubVec(Vec);
+   }
+
+   if (IdxVal == 0) {
+     // Zero lower bits of the Vec
+     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+     Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+     // Merge them together, SubVec should be zero extended.
+     WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+                              getZeroVector(WideOpVT, Subtarget, DAG, dl),
+                              SubVec, ZeroIdx);
+     Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
+     return ExtractSubVec(Vec);
+   }
+
+   // Simple case when we put subvector in the upper part
+   if (IdxVal + SubVecNumElems == NumElems) {
+     WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+                              DAG.getConstant(IdxVal, dl, MVT::i8));
+     // Zero upper bits of the Vec: every lane at or above IdxVal must be
+     // cleared before the OR. The shift amount is computed from the widened
+     // type; shifting by only SubVecNumElems would leave the lanes being
+     // overwritten intact whenever WideOpVT is wider than OpVT. When no
+     // widening happened the two amounts are equal.
+     unsigned WideNumElems = WideOpVT.getVectorNumElements();
+     SDValue ShiftBits = DAG.getConstant(WideNumElems - IdxVal, dl, MVT::i8);
+     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+     Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+     Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
+     return ExtractSubVec(Vec);
+   }
+
+   // Subvector should be inserted in the middle - use shuffle.
+   // The shuffle works in OpVT, so the subvector is widened into an OpVT
+   // undef (not the WideOpVT one) to keep the node's types consistent.
+   SDValue SubVecInOpVT = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+                                      DAG.getUNDEF(OpVT), SubVec, ZeroIdx);
+   SmallVector<int, 64> Mask;
+   for (unsigned i = 0; i < NumElems; ++i) {
+     // The subvector occupies lanes [0, SubVecNumElems) of the first shuffle
+     // operand, so result lane i inside the insertion window reads lane
+     // i - IdxVal of it; all other lanes come from Vec (second operand).
+     bool InWindow = i >= IdxVal && i < IdxVal + SubVecNumElems;
+     Mask.push_back(InWindow ? i - IdxVal : i + NumElems);
+   }
+   return DAG.getVectorShuffle(OpVT, dl, SubVecInOpVT, Vec, Mask);
+ }
+
+ /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+ /// instructions. This is used because creating CONCAT_VECTOR nodes of
+ /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+ /// large BUILD_VECTORS.
+ ///
+ /// \p V1 is inserted at element 0 of an undef \p VT vector and \p V2 at
+ /// element \p NumElems / 2.
+ static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+                                    unsigned NumElems, SelectionDAG &DAG,
+                                    const SDLoc &dl) {
+   SDValue Base = DAG.getUNDEF(VT);
+   SDValue WithLow = insert128BitVector(Base, V1, 0, DAG, dl);
+   const unsigned HighStart = NumElems / 2;
+   return insert128BitVector(WithLow, V2, HighStart, DAG, dl);
+ }
+
+ /// Concat two 256-bit vectors: \p V1 is inserted at element 0 of an undef
+ /// \p VT vector and \p V2 at element \p NumElems / 2.
+ static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+                                    unsigned NumElems, SelectionDAG &DAG,
+                                    const SDLoc &dl) {
+   SDValue Base = DAG.getUNDEF(VT);
+   SDValue WithLow = insert256BitVector(Base, V1, 0, DAG, dl);
+   const unsigned HighStart = NumElems / 2;
+   return insert256BitVector(WithLow, V2, HighStart, DAG, dl);
+ }
+
+ /// Returns a vector of specified type with all bits set.
+ /// The ones vector is always built from i32 elements and bitcast to \p VT,
+ /// ensuring it gets CSE'd. For 256-bit types without 256-bit integer
+ /// support, two <4 x i32> halves are inserted into a <8 x i32> instead.
+ static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
+                              SelectionDAG &DAG, const SDLoc &dl) {
+   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+          "Expected a 128/256/512-bit vector type");
+
+   const APInt AllOnes = APInt::getAllOnesValue(32);
+   const unsigned Num32BitElts = VT.getSizeInBits() / 32;
+   const bool BuildFromHalves = Num32BitElts == 8 && !Subtarget.hasInt256();
+
+   if (!BuildFromHalves) {
+     MVT IntVT = MVT::getVectorVT(MVT::i32, Num32BitElts);
+     return DAG.getBitcast(VT, DAG.getConstant(AllOnes, dl, IntVT));
+   }
+
+   SDValue Half = DAG.getConstant(AllOnes, dl, MVT::v4i32);
+   SDValue Whole = concat128BitVectors(Half, Half, MVT::v8i32, 8, DAG, dl);
+   return DAG.getBitcast(VT, Whole);
+ }
+
+ /// Generate unpacklo/unpackhi shuffle mask.
+ ///
+ /// Appends one index per element of \p VT to the (initially empty) \p Mask.
+ /// The pattern is computed independently within each 128-bit lane. \p Lo
+ /// selects the low half of each lane, otherwise the high half is used.
+ /// Unless \p Unary is set, odd result elements refer to the second shuffle
+ /// operand (their index is offset by the element count).
+ static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+                                     bool Unary) {
+   assert(Mask.empty() && "Expected an empty shuffle mask vector");
+   const int NumElts = VT.getVectorNumElements();
+   const int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+   const int HalfLane = NumEltsInLane / 2;
+
+   for (int Elt = 0; Elt < NumElts; ++Elt) {
+     const int InLane = Elt % NumEltsInLane;
+     const int LaneStart = Elt - InLane;
+     int Pos = LaneStart + InLane / 2;
+     if (!Unary && (Elt % 2) != 0)
+       Pos += NumElts;
+     if (!Lo)
+       Pos += HalfLane;
+     Mask.push_back(Pos);
+   }
+ }
+
+ /// Returns a vector_shuffle node for an unpackl operation.
+ /// The mask comes from createUnpackShuffleMask with Lo = true and
+ /// Unary = false, and is applied to \p V1 and \p V2.
+ static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+ SDValue V1, SDValue V2) {
+ SmallVector<int, 8> Mask;
+ createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
+ }
+
+ /// Returns a vector_shuffle node for an unpackh operation.
+ /// The mask comes from createUnpackShuffleMask with Lo = false and
+ /// Unary = false, and is applied to \p V1 and \p V2.
+ static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+ SDValue V1, SDValue V2) {
+ SmallVector<int, 8> Mask;
+ createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
+ }
+
+ /// Return a vector_shuffle of a zero vector (\p IsZero) or an undef vector
+ /// with \p V2, where the low element of V2 is swizzled into the zero/undef
+ /// vector, landing at element \p Idx.
+ /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
+ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
+                                            bool IsZero,
+                                            const X86Subtarget &Subtarget,
+                                            SelectionDAG &DAG) {
+   MVT VT = V2.getSimpleValueType();
+
+   SDValue V1;
+   if (IsZero)
+     V1 = getZeroVector(VT, Subtarget, DAG, SDLoc(V2));
+   else
+     V1 = DAG.getUNDEF(VT);
+
+   // Start from the identity over V1, then redirect the insertion slot (when
+   // it names an existing element) to the low element of V2.
+   int NumElems = VT.getVectorNumElements();
+   SmallVector<int, 16> MaskVec(NumElems);
+   for (int Elt = 0; Elt != NumElems; ++Elt)
+     MaskVec[Elt] = Elt;
+   if (Idx >= 0 && Idx < NumElems)
+     MaskVec[Idx] = NumElems;
+
+   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
+ }
+
+ /// Strip any chain of BITCAST nodes from \p V and return the first value
+ /// that is not a bitcast (or \p V itself if it has no node).
+ static SDValue peekThroughBitcasts(SDValue V) {
+   for (;;) {
+     if (!V.getNode() || V.getOpcode() != ISD::BITCAST)
+       return V;
+     V = V.getOperand(0);
+   }
+ }
+
+ /// Strip BITCAST nodes from \p V for as long as the bitcast's operand has a
+ /// single use, returning the value at which stripping stopped.
+ static SDValue peekThroughOneUseBitcasts(SDValue V) {
+   for (;;) {
+     if (!V.getNode() || V.getOpcode() != ISD::BITCAST)
+       return V;
+     if (!V.getOperand(0).hasOneUse())
+       return V;
+     V = V.getOperand(0);
+   }
+ }
+
+ /// If \p Op (looking through bitcasts) is a load whose base pointer is a
+ /// constant pool node, optionally wrapped in X86ISD::Wrapper or
+ /// X86ISD::WrapperRIP, return the pool's Constant. Returns nullptr for
+ /// anything else, including machine constant pool entries.
+ static const Constant *getTargetConstantFromNode(SDValue Op) {
+   SDValue Src = peekThroughBitcasts(Op);
+
+   auto *Load = dyn_cast<LoadSDNode>(Src);
+   if (!Load)
+     return nullptr;
+
+   // Look through the address wrapper, if any.
+   SDValue Ptr = Load->getBasePtr();
+   const unsigned PtrOpc = Ptr->getOpcode();
+   if (PtrOpc == X86ISD::Wrapper || PtrOpc == X86ISD::WrapperRIP)
+     Ptr = Ptr->getOperand(0);
+
+   if (auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr))
+     if (!CNode->isMachineConstantPoolEntry())
+       return dyn_cast<Constant>(CNode->getConstVal());
+
+   return nullptr;
+ }
+
+// Extract raw constant bits from constant pools.
+//
+// Looks through bitcasts on \p Op. If the result is a load of a constant pool
+// vector, or an X86ISD::VBROADCAST of a constant pool scalar, the constant
+// data is gathered into one wide bitset and re-split into elements of
+// \p EltSizeInBits bits, lowest bits first.
+//
+// On success returns true with:
+//   - \p EltBits holding one APInt of width \p EltSizeInBits per element;
+//   - \p UndefElts marking the elements whose bits are all undef (the matching
+//     EltBits entry is left as zero).
+// Elements that are only partly undef are reported as defined, with the undef
+// bits read as zero.
+//
+// Both output containers must be empty on entry. Returns false, leaving them
+// untouched, when \p Op is not a recognised constant.
+static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
+                                          SmallBitVector &UndefElts,
+                                          SmallVectorImpl<APInt> &EltBits) {
+  assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+  assert(EltBits.empty() && "Expected an empty EltBits vector");
+
+  Op = peekThroughBitcasts(Op);
+
+  EVT VT = Op.getValueType();
+  unsigned SizeInBits = VT.getSizeInBits();
+  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
+  unsigned NumElts = SizeInBits / EltSizeInBits;
+
+  // Extract all the undef/constant element data and pack into single bitsets.
+  APInt UndefBits(SizeInBits, 0);
+  APInt MaskBits(SizeInBits, 0);
+
+  // Split the undef/constant single bitset data into the target elements.
+  auto SplitBitData = [&]() {
+    UndefElts = SmallBitVector(NumElts, false);
+    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
+
+    for (unsigned i = 0; i != NumElts; ++i) {
+      APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
+      UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+
+      // Only treat an element as UNDEF if all bits are UNDEF, otherwise
+      // treat it as zero.
+      if (UndefEltBits.isAllOnesValue()) {
+        UndefElts[i] = true;
+        continue;
+      }
+
+      // Keep the element as an APInt: going through getZExtValue() would
+      // assert for elements wider than 64 bits.
+      APInt Bits = MaskBits.lshr(i * EltSizeInBits);
+      EltBits[i] = Bits.zextOrTrunc(EltSizeInBits);
+    }
+    return true;
+  };
+
+  // Convert one scalar constant into value bits (Mask) and undef bits
+  // (Undefs), both SizeInBits wide and positioned at bit 0. Fails for a null
+  // constant and for any kind other than undef / integer / floating point.
+  auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
+                                          APInt &Undefs) {
+    if (!Cst)
+      return false;
+    unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
+    if (isa<UndefValue>(Cst)) {
+      Mask = APInt::getNullValue(SizeInBits);
+      Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
+      return true;
+    }
+    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
+      Mask = CInt->getValue().zextOrTrunc(SizeInBits);
+      Undefs = APInt::getNullValue(SizeInBits);
+      return true;
+    }
+    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
+      Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
+      Undefs = APInt::getNullValue(SizeInBits);
+      return true;
+    }
+    return false;
+  };
+
+  // Extract constant bits from constant pool vector.
+  if (auto *Cst = getTargetConstantFromNode(Op)) {
+    Type *CstTy = Cst->getType();
+    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
+      return false;
+
+    unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+    for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
+      APInt Bits, Undefs;
+      if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
+        return false;
+      MaskBits |= Bits.shl(i * CstEltSizeInBits);
+      UndefBits |= Undefs.shl(i * CstEltSizeInBits);
+    }
+
+    return SplitBitData();
+  }
+
+  // Extract constant bits from a broadcasted constant pool scalar.
+  if (Op.getOpcode() == X86ISD::VBROADCAST &&
+      EltSizeInBits <= Op.getScalarValueSizeInBits()) {
+    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
+      APInt Bits, Undefs;
+      if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
+        // Replicate the scalar into every broadcast-sized slot of the vector.
+        unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
+        unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
+        for (unsigned i = 0; i != NumBroadcastElts; ++i) {
+          MaskBits |= Bits.shl(i * NumBroadcastBits);
+          UndefBits |= Undefs.shl(i * NumBroadcastBits);
+        }
+        return SplitBitData();
+      }
+    }
+  }
+
+  return false;
+}
+
+// Decode a constant shuffle-mask operand into raw per-element indices.
+//
+// \p MaskNode (looked at through bitcasts) is split into pieces of
+// \p MaskEltSizeInBits bits, and each piece is appended to \p RawMask as an
+// unsigned value, lowest piece first. Recognised forms are:
+//   - X86ISD::VBROADCAST of a constant scalar whose element size equals
+//     MaskEltSizeInBits;
+//   - X86ISD::VZEXT_MOVL of a SCALAR_TO_VECTOR of a constant;
+//   - ISD::BUILD_VECTOR that is all zeros, or made only of integer / FP
+//     constants.
+// Returns true when the mask was decoded. A BUILD_VECTOR that mixes constant
+// and non-constant operands returns false after possibly appending a partial
+// prefix to \p RawMask, so callers should discard RawMask on failure.
+//
+// TODO: Merge more of this with getTargetConstantBitsFromNode.
+static bool getTargetShuffleMaskIndices(SDValue MaskNode,
+                                        unsigned MaskEltSizeInBits,
+                                        SmallVectorImpl<uint64_t> &RawMask) {
+  MaskNode = peekThroughBitcasts(MaskNode);
+
+  MVT VT = MaskNode.getSimpleValueType();
+  assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
+  unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
+
+  // Split an APInt element into MaskEltSizeInBits sized pieces and
+  // insert into the shuffle mask.
+  auto SplitElementToMask = [&](APInt Element) {
+    // Note that this is x86 and so always little endian: the low byte is
+    // the first byte of the mask.
+    int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+    for (int i = 0; i < Split; ++i) {
+      APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
+      Element = Element.lshr(MaskEltSizeInBits);
+      RawMask.push_back(RawElt.getZExtValue());
+    }
+  };
+
+  if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
+    // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+    // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
+    if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
+      return false;
+
+    auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0));
+    if (!CN)
+      return false;
+
+    // Every mask element is the same broadcast constant. Report success so
+    // the decoded mask is actually used; previously this path filled RawMask
+    // and then returned false.
+    uint64_t RawElt =
+        CN->getAPIntValue().getLoBits(MaskEltSizeInBits).getZExtValue();
+    RawMask.append(VT.getVectorNumElements(), RawElt);
+    return true;
+  }
+
+  if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
+      MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
+    if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
+      // Mask elements are at least as wide as the vector elements: the
+      // constant is the first mask element and everything else is zero.
+      if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
+        RawMask.push_back(CN->getZExtValue());
+        RawMask.append(NumMaskElts - 1, 0);
+        return true;
+      }
+
+      // Mask elements are narrower: split the constant, then zero-fill the
+      // pieces that belong to the remaining (zeroed) vector elements.
+      if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
+        unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+        SplitElementToMask(CN->getAPIntValue());
+        RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  // We can always decode if the buildvector is all zero constants,
+  // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
+  if (all_of(MaskNode->ops(), X86::isZeroNode)) {
+    RawMask.append(NumMaskElts, 0);
+    return true;
+  }
+
+  // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+  if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
+    return false;
+
+  for (SDValue Op : MaskNode->ops()) {
+    if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
+      SplitElementToMask(CN->getAPIntValue());
+    else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
+      SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
+    else
+      return false;
+  }
+
+  return true;
+}
+
+/// Calculates the shuffle mask corresponding to the target-specific opcode.
+/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
+/// operands in \p Ops, and returns true.
+/// Sets \p IsUnary to true if only one source is used. Note that this will set
+/// IsUnary for shuffles which use a single input multiple times, and in those
+/// cases it will adjust the mask to only have indices within that single input.
+/// It is an error to call this with non-empty Mask/Ops vectors. Returns false when the mask cannot be decoded, or when it contains SM_SentinelZero and \p AllowSentinelZero is false.
+static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
+ SmallVectorImpl<SDValue> &Ops,
+ SmallVectorImpl<int> &Mask, bool &IsUnary) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SDValue ImmN; // Immediate control operand, for the opcodes that have one.
+
+ assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
+ assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
+
+ IsUnary = false;
+ bool IsFakeUnary = false; // Set when a two-input opcode has the same node as both inputs.
+ switch(N->getOpcode()) {
+ case X86ISD::BLENDI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ break;
+ case X86ISD::SHUFP:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::INSERTPS:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::UNPCKH:
+ DecodeUNPCKHMask(VT, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::UNPCKL:
+ DecodeUNPCKLMask(VT, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::MOVHLPS:
+ DecodeMOVHLPSMask(NumElems, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::MOVLHPS:
+ DecodeMOVLHPSMask(NumElems, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::PALIGNR:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ Ops.push_back(N->getOperand(1)); // PALIGNR reports its inputs swapped: operand 1 first, then operand 0.
+ Ops.push_back(N->getOperand(0));
+ break;
+ case X86ISD::VSHLDQ:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ ImmN = N->getOperand(N->getNumOperands() - 1);
+ DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VSRLDQ:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ ImmN = N->getOperand(N->getNumOperands() - 1);
+ DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFD:
+ case X86ISD::VPERMILPI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFHW:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFLW:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VZEXT_MOVL:
+ DecodeZeroMoveLowMask(VT, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VBROADCAST: {
+ // We only decode broadcasts of same-sized vectors at the moment.
+ if (N->getOperand(0).getValueType() == VT) {
+ DecodeVectorBroadcast(VT, Mask);
+ IsUnary = true;
+ break;
+ }
+ return false; // Source type differs from VT: not decoded.
+ }
+ case X86ISD::VPERMILPV: {
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(1);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { // First try a mask built from DAG constants...
+ DecodeVPERMILPMask(VT, RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetConstantFromNode(MaskNode)) { // ...then a mask loaded from the constant pool.
+ DecodeVPERMILPMask(C, MaskEltSize, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::PSHUFB: {
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(1);
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
+ DecodePSHUFBMask(RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
+ DecodePSHUFBMask(C, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
+ break; // IsUnary stays false, so both operands end up in Ops.
+ case X86ISD::VPERM2X128:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::MOVSLDUP:
+ DecodeMOVSLDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVSHDUP:
+ DecodeMOVSHDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVDDUP:
+ DecodeMOVDDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVLPS:
+ // Not yet implemented
+ return false;
+ case X86ISD::VPERMIL2: {
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ SDValue MaskNode = N->getOperand(2);
+ SDValue CtrlNode = N->getOperand(3);
+ if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) { // Decoding needs a constant control operand.
+ unsigned CtrlImm = CtrlOp->getZExtValue();
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+ DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
+ DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
+ break;
+ }
+ }
+ return false;
+ }
+ case X86ISD::VPPERM: {
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ SDValue MaskNode = N->getOperand(2);
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
+ DecodeVPPERMMask(RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
+ DecodeVPPERMMask(C, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMV: {
+ IsUnary = true;
+ // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
+ Ops.push_back(N->getOperand(1));
+ SDValue MaskNode = N->getOperand(0);
+ SmallVector<uint64_t, 32> RawMask;
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+ DecodeVPERMVMask(RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
+ DecodeVPERMVMask(C, MaskEltSize, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMV3: {
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
+ // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
+ Ops.push_back(N->getOperand(0));
+ Ops.push_back(N->getOperand(2));
+ SDValue MaskNode = N->getOperand(1);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ if (auto *C = getTargetConstantFromNode(MaskNode)) { // Only constant-pool masks are decoded for this opcode.
+ DecodeVPERMV3Mask(C, MaskEltSize, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMIV3: {
+ IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
+ // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(2));
+ SDValue MaskNode = N->getOperand(0);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ if (auto *C = getTargetConstantFromNode(MaskNode)) { // Only constant-pool masks are decoded for this opcode.
+ DecodeVPERMV3Mask(C, MaskEltSize, Mask);
+ break;
+ }
+ return false;
+ }
+ default: llvm_unreachable("unknown target shuffle node");
+ }
+
+ // Empty mask indicates the decode failed.
+ if (Mask.empty())
+ return false;
+
+ // Check if we're getting a shuffle mask with zero'd elements.
+ if (!AllowSentinelZero)
+ if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+ return false;
+
+ // If we have a fake unary shuffle, the shuffle mask is spread across two
+ // inputs that are actually the same node. Re-map the mask to always point
+ // into the first input.
+ if (IsFakeUnary)
+ for (int &M : Mask)
+ if (M >= (int)Mask.size()) // Negative sentinel entries never satisfy this and are left alone.
+ M -= Mask.size();
+
+ // If we didn't already add operands in the opcode-specific code, default to
+ // adding 1 or 2 operands starting at 0.
+ if (Ops.empty()) {
+ Ops.push_back(N->getOperand(0));
+ if (!IsUnary || IsFakeUnary) // A fake-unary shuffle still reports both (identical) operands.
+ Ops.push_back(N->getOperand(1));
+ }
+
+ return true;
+}
+
+/// Decode the target shuffle \p N and refine its mask using what is known
+/// about the inputs: mask entries that read from an UNDEF input, or from
+/// UNDEF / zero operands of a BUILD_VECTOR input, are rewritten to
+/// SM_SentinelUndef / SM_SentinelZero. Only elements known to be zero from
+/// the inputs are marked (not merely zeroable ones).
+/// Returns true if the target shuffle mask was decoded; \p Mask and \p Ops
+/// are then filled in.
+static bool setTargetShuffleZeroElements(SDValue N,
+                                         SmallVectorImpl<int> &Mask,
+                                         SmallVectorImpl<SDValue> &Ops) {
+  if (!isTargetShuffle(N.getOpcode()))
+    return false;
+
+  const MVT VT = N.getSimpleValueType();
+  bool IsUnary;
+  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
+    return false;
+
+  // Inspect the real sources, ignoring any bitcasts wrapped around them.
+  const SDValue Src0 = peekThroughBitcasts(Ops[0]);
+  const SDValue Src1 = IsUnary ? Src0 : peekThroughBitcasts(Ops[1]);
+
+  const int Size = Mask.size();
+  for (int Lane = 0; Lane != Size; ++Lane) {
+    int M = Mask[Lane];
+
+    // Sentinel entries (SM_SentinelZero / SM_SentinelUndef) are final.
+    if (M < 0)
+      continue;
+
+    // Pick the input this entry reads and make the index input-relative.
+    const SDValue Src = (M < Size) ? Src0 : Src1;
+    M %= Size;
+
+    // Reading from an UNDEF input yields UNDEF.
+    if (Src.isUndef()) {
+      Mask[Lane] = SM_SentinelUndef;
+      continue;
+    }
+
+    // Beyond that, only BUILD_VECTOR inputs can be searched for UNDEF/ZERO.
+    if (Src.getOpcode() != ISD::BUILD_VECTOR)
+      continue;
+
+    const unsigned NumSrcOps = Src.getNumOperands();
+
+    if ((Size % NumSrcOps) == 0) {
+      // The BUILD_VECTOR has no more elements than the mask, so one (larger)
+      // source element covers this lane and must itself be UNDEF/ZERO.
+      // TODO: Is it worth testing the individual bits of a constant?
+      const int Scale = Size / NumSrcOps;
+      const SDValue Elt = Src.getOperand(M / Scale);
+      if (Elt.isUndef())
+        Mask[Lane] = SM_SentinelUndef;
+      else if (X86::isZeroNode(Elt))
+        Mask[Lane] = SM_SentinelZero;
+    } else if ((NumSrcOps % Size) == 0) {
+      // The BUILD_VECTOR has more elements than the mask, so every (smaller)
+      // source element under this lane must be UNDEF, or every one ZERO.
+      const int Scale = NumSrcOps / Size;
+      bool EveryUndef = true;
+      bool EveryZero = true;
+      for (int Sub = M * Scale, End = Sub + Scale; Sub != End; ++Sub) {
+        const SDValue Elt = Src.getOperand(Sub);
+        if (!Elt.isUndef())
+          EveryUndef = false;
+        if (!X86::isZeroNode(Elt))
+          EveryZero = false;
+      }
+      if (EveryUndef)
+        Mask[Lane] = SM_SentinelUndef;
+      else if (EveryZero)
+        Mask[Lane] = SM_SentinelZero;
+    }
+  }
+
+  assert(VT.getVectorNumElements() == Mask.size() &&
+         "Different mask size from vector size!");
+  return true;
+}
+
+// Try to express a non-shuffle node as a shuffle mask plus inputs.
+// Handles ISD::AND with a per-byte all-ones/all-zeros constant, whole-byte
+// X86ISD::VSHLI / X86ISD::VSRLI shifts, and same-width X86ISD::VZEXT.
+// The decoded shuffle mask may contain a different number of elements to the
+// destination value type. \p Mask and \p Ops are cleared on entry; returns
+// false when the node cannot be decoded.
+static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
+                               SmallVectorImpl<SDValue> &Ops) {
+  Mask.clear();
+  Ops.clear();
+
+  const MVT VT = N.getSimpleValueType();
+  const unsigned NumElts = VT.getVectorNumElements();
+  const unsigned NumSizeInBits = VT.getSizeInBits();
+  const unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
+         "Expected byte aligned value types");
+
+  const unsigned Opcode = N.getOpcode();
+
+  if (Opcode == ISD::AND) {
+    // Attempt to decode as a per-byte mask.
+    SmallBitVector UndefBytes;
+    SmallVector<APInt, 32> ByteVals;
+    if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefBytes,
+                                       ByteVals))
+      return false;
+
+    for (int Byte = 0, End = (int)ByteVals.size(); Byte != End; ++Byte) {
+      int Entry;
+      if (UndefBytes[Byte]) {
+        Entry = SM_SentinelUndef;
+      } else {
+        // Each byte must either pass through (0xFF) or be cleared (0x00).
+        const uint64_t Bits = ByteVals[Byte].getZExtValue();
+        if (Bits == 0)
+          Entry = SM_SentinelZero;
+        else if (Bits == 255)
+          Entry = Byte;
+        else
+          return false;
+      }
+      Mask.push_back(Entry);
+    }
+    Ops.push_back(N.getOperand(0));
+    return true;
+  }
+
+  if (Opcode == X86ISD::VSHLI || Opcode == X86ISD::VSRLI) {
+    const uint64_t ShiftVal = N.getConstantOperandVal(1);
+
+    // Out of range bit shifts are guaranteed to be zero.
+    if (NumBitsPerElt <= ShiftVal) {
+      Mask.append(NumElts, SM_SentinelZero);
+      return true;
+    }
+
+    // We can only decode 'whole byte' bit shifts as shuffles.
+    if ((ShiftVal % 8) != 0)
+      return false;
+
+    const unsigned ByteShift = ShiftVal / 8;
+    const unsigned NumBytes = NumSizeInBits / 8;
+    const unsigned NumBytesPerElt = NumBitsPerElt / 8;
+    const bool IsLeftShift = (Opcode == X86ISD::VSHLI);
+    Ops.push_back(N.getOperand(0));
+
+    // Start from an all-zero byte mask, then fill in the bytes that survive
+    // the shift within each element.
+    Mask.append(NumBytes, SM_SentinelZero);
+    for (unsigned EltBase = 0; EltBase != NumBytes; EltBase += NumBytesPerElt) {
+      for (unsigned Off = ByteShift; Off != NumBytesPerElt; ++Off) {
+        const unsigned High = EltBase + Off;
+        const unsigned Low = High - ByteShift;
+        if (IsLeftShift)
+          Mask[High] = Low;
+        else
+          Mask[Low] = High;
+      }
+    }
+    return true;
+  }
+
+  if (Opcode == X86ISD::VZEXT) {
+    // TODO - add support for VPMOVZX with smaller input vector types.
+    const SDValue Src = N.getOperand(0);
+    const MVT SrcVT = Src.getSimpleValueType();
+    if (NumSizeInBits != SrcVT.getSizeInBits())
+      return false;
+    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
+    Ops.push_back(Src);
+    return true;
+  }
+
+  return false;
+}
+
+/// Decode \p Op into a shuffle mask with SM_SentinelUndef / SM_SentinelZero
+/// resolved, trying setTargetShuffleZeroElements first and getFauxShuffleMask
+/// second. \p Op0 / \p Op1 receive only the inputs that the final mask still
+/// references (an unused input is returned as an empty SDValue). If only the
+/// second input is referenced, the mask is commuted so that it becomes Op0.
+/// Returns true if the target shuffle mask was decoded.
+static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
+                                       SmallVectorImpl<int> &Mask) {
+  SmallVector<SDValue, 2> Inputs;
+  const bool Decoded = setTargetShuffleZeroElements(Op, Mask, Inputs) ||
+                       getFauxShuffleMask(Op, Mask, Inputs);
+  if (!Decoded)
+    return false;
+
+  // Work out which of the two inputs the mask actually reads from.
+  const int NumElts = Mask.size();
+  bool ReadsFirst = false;
+  bool ReadsSecond = false;
+  for (int Idx : Mask) {
+    if (Idx >= NumElts)
+      ReadsSecond = true;
+    else if (Idx >= 0)
+      ReadsFirst = true;
+  }
+
+  Op0 = ReadsFirst ? Inputs[0] : SDValue();
+  Op1 = ReadsSecond ? Inputs[1] : SDValue();
+
+  // We're only using Op1 - commute the mask and inputs.
+  if (ReadsSecond && !ReadsFirst) {
+    for (int &M : Mask) {
+      if (M >= NumElts)
+        M -= NumElts;
+    }
+    Op0 = Op1;
+    Op1 = SDValue();
+  }
+
+  return true;
+}
+
+/// Find the scalar value that ends up in element \p Index of the vector
+/// produced by \p N, walking back through generic and target shuffles.
+/// Returns an empty SDValue when the element cannot be traced, or once the
+/// recursion reaches a depth of 6.
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
+                                   unsigned Depth) {
+  // Limit search depth.
+  if (Depth == 6)
+    return SDValue();
+
+  SDValue Cur = SDValue(N, 0);
+  const EVT CurVT = Cur.getValueType();
+
+  // Generic ISD::VECTOR_SHUFFLE: follow the selected lane into its source.
+  if (const auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N)) {
+    const int Lane = Shuf->getMaskElt(Index);
+    if (Lane < 0)
+      return DAG.getUNDEF(CurVT.getVectorElementType());
+
+    const unsigned LaneCount = CurVT.getVectorNumElements();
+    const unsigned SrcNo = (Lane < (int)LaneCount) ? 0 : 1;
+    SDValue Src = Shuf->getOperand(SrcNo);
+    return getShuffleScalarElt(Src.getNode(), Lane % LaneCount, DAG,
+                               Depth + 1);
+  }
+
+  // Target specific shuffle: decode its mask and recurse the same way.
+  if (isTargetShuffle(Cur.getOpcode())) {
+    const MVT ShufVT = Cur.getSimpleValueType();
+    const MVT LaneVT = ShufVT.getVectorElementType();
+    const int LaneCount = (int)ShufVT.getVectorNumElements();
+    SmallVector<int, 16> DecodedMask;
+    SmallVector<SDValue, 16> Inputs;
+    bool IsUnary;
+
+    if (!getTargetShuffleMask(N, ShufVT, true, Inputs, DecodedMask, IsUnary))
+      return SDValue();
+
+    const int Lane = DecodedMask[Index];
+    if (Lane == SM_SentinelZero) {
+      if (LaneVT.isInteger())
+        return DAG.getConstant(0, SDLoc(N), LaneVT);
+      return DAG.getConstantFP(+0.0, SDLoc(N), LaneVT);
+    }
+    if (Lane == SM_SentinelUndef)
+      return DAG.getUNDEF(LaneVT);
+
+    assert(0 <= Lane && Lane < (2*LaneCount) && "Shuffle index out of range");
+    SDValue Src = (Lane < LaneCount) ? Inputs[0] : Inputs[1];
+    return getShuffleScalarElt(Src.getNode(), Lane % LaneCount, DAG,
+                               Depth + 1);
+  }
+
+  // A bitcast is only looked through when the source is a vector with the
+  // same number of elements.
+  if (Cur.getOpcode() == ISD::BITCAST) {
+    Cur = Cur.getOperand(0);
+    const EVT SrcVT = Cur.getValueType();
+    const unsigned LaneCount = CurVT.getVectorNumElements();
+
+    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != LaneCount)
+      return SDValue();
+  }
+
+  // Nodes that hold the scalar elements directly.
+  switch (Cur.getOpcode()) {
+  case ISD::SCALAR_TO_VECTOR:
+    if (Index == 0)
+      return Cur.getOperand(0);
+    return DAG.getUNDEF(CurVT.getVectorElementType());
+  case ISD::BUILD_VECTOR:
+    return Cur.getOperand(Index);
+  default:
+    return SDValue();
+  }
+}
+
+/// Custom lower build_vector of v16i8.
+///
+/// Bit i of \p NonZeros selects operand i of \p Op for insertion. The
+/// insertions start from a zero vector when \p NumZero is non-zero and from
+/// undef otherwise. Gives up (empty SDValue) when \p NumNonZero exceeds 8.
+static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+                                     unsigned NumNonZero, unsigned NumZero,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget,
+                                     const TargetLowering &TLI) {
+  if (NumNonZero > 8)
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue Vec;
+  bool NeedInit = true;
+
+  // SSE4.1 - use PINSRB to insert each byte directly.
+  if (Subtarget.hasSSE41()) {
+    for (unsigned Byte = 0; Byte < 16; ++Byte) {
+      if ((NonZeros & (1 << Byte)) == 0)
+        continue;
+
+      if (NeedInit) {
+        Vec = NumZero ? getZeroVector(MVT::v16i8, Subtarget, DAG, dl)
+                      : DAG.getUNDEF(MVT::v16i8);
+        NeedInit = false;
+      }
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, Vec,
+                        Op.getOperand(Byte), DAG.getIntPtrConstant(Byte, dl));
+    }
+
+    return Vec;
+  }
+
+  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
+  for (unsigned Word = 0; Word < 8; ++Word) {
+    const unsigned LoIdx = 2 * Word;
+    const unsigned HiIdx = LoIdx + 1;
+    const bool LoNonZero = (NonZeros & (1 << LoIdx)) != 0;
+    const bool HiNonZero = (NonZeros & (1 << HiIdx)) != 0;
+
+    // Nothing to insert for this 16-bit lane.
+    if (!LoNonZero && !HiNonZero)
+      continue;
+
+    if (NeedInit) {
+      Vec = NumZero ? getZeroVector(MVT::v8i16, Subtarget, DAG, dl)
+                    : DAG.getUNDEF(MVT::v8i16);
+      NeedInit = false;
+    }
+
+    // Combine the pair as (Hi << 8) | Lo, skipping whichever half is zero.
+    SDValue LoExt;
+    if (LoNonZero)
+      LoExt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16,
+                          Op.getOperand(LoIdx));
+
+    SDValue Merged;
+    if (HiNonZero) {
+      Merged = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16,
+                           Op.getOperand(HiIdx));
+      Merged = DAG.getNode(ISD::SHL, dl, MVT::i16, Merged,
+                           DAG.getConstant(8, dl, MVT::i8));
+      if (LoNonZero)
+        Merged = DAG.getNode(ISD::OR, dl, MVT::i16, Merged, LoExt);
+    } else {
+      Merged = LoExt;
+    }
+
+    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, Vec, Merged,
+                      DAG.getIntPtrConstant(Word, dl));
+  }
+
+  return DAG.getBitcast(MVT::v16i8, Vec);
+}
+
+/// Custom lower build_vector of v8i16.
+///
+/// Inserts every operand of \p Op whose bit is set in \p NonZeros into a zero
+/// vector (when \p NumZero is non-zero) or an undef vector. Gives up (empty
+/// SDValue) when \p NumNonZero exceeds 4.
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+                                     unsigned NumNonZero, unsigned NumZero,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget,
+                                     const TargetLowering &TLI) {
+  if (NumNonZero > 4)
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue Vec;
+  bool NeedInit = true;
+  for (unsigned Lane = 0; Lane < 8; ++Lane) {
+    if ((NonZeros & (1 << Lane)) == 0)
+      continue;
+
+    // Create the starting vector lazily, on the first inserted element.
+    if (NeedInit) {
+      Vec = NumZero ? getZeroVector(MVT::v8i16, Subtarget, DAG, dl)
+                    : DAG.getUNDEF(MVT::v8i16);
+      NeedInit = false;
+    }
+    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, Vec,
+                      Op.getOperand(Lane), DAG.getIntPtrConstant(Lane, dl));
+  }
+
+  return Vec;
+}
+
+/// Custom lower build_vector of v4i32 or v4f32. Tries a blend-with-zero shuffle first and an INSERTPS second; returns an empty SDValue when neither form applies.
+static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ const TargetLowering &TLI) {
+ // Find all zeroable elements.
+ std::bitset<4> Zeroable;
+ for (int i=0; i < 4; ++i) {
+ SDValue Elt = Op->getOperand(i);
+ Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
+ }
+ assert(Zeroable.size() - Zeroable.count() > 1 &&
+ "We expect at least two non-zero elements!");
+
+ // We only know how to deal with build_vector nodes where elements are either
+ // zeroable or extract_vector_elt with constant index.
+ SDValue FirstNonZero;
+ unsigned FirstNonZeroIdx; // Only meaningful once FirstNonZero has been set.
+ for (unsigned i=0; i < 4; ++i) {
+ if (Zeroable[i])
+ continue;
+ SDValue Elt = Op->getOperand(i);
+ if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Elt.getOperand(1)))
+ return SDValue();
+ // Make sure that this node is extracting from a 128-bit vector.
+ MVT VT = Elt.getOperand(0).getSimpleValueType();
+ if (!VT.is128BitVector())
+ return SDValue();
+ if (!FirstNonZero.getNode()) {
+ FirstNonZero = Elt;
+ FirstNonZeroIdx = i;
+ }
+ }
+
+ assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
+ SDValue V1 = FirstNonZero.getOperand(0); // Candidate base vector: the source of the first non-zeroable element.
+ MVT VT = V1.getSimpleValueType();
+
+ // See if this build_vector can be lowered as a blend with zero.
+ SDValue Elt;
+ unsigned EltMaskIdx, EltIdx; // EltMaskIdx is assigned only when a non-zeroable element is visited below.
+ int Mask[4];
+ for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
+ if (Zeroable[EltIdx]) {
+ // The zero vector will be on the right hand side.
+ Mask[EltIdx] = EltIdx+4;
+ continue;
+ }
+
+ Elt = Op->getOperand(EltIdx);
+ // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
+ EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+ if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
+ break; // Elt/EltIdx/EltMaskIdx now describe the first element that is not an in-place lane of V1.
+ Mask[EltIdx] = EltIdx;
+ }
+
+ if (EltIdx == 4) {
+ // Let the shuffle legalizer deal with blend operations.
+ SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+ if (V1.getSimpleValueType() != VT) // NOTE(review): VT was read from V1 above, so this looks like a no-op.
+ V1 = DAG.getBitcast(VT, V1);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
+ }
+
+ // See if we can lower this build_vector to a INSERTPS.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ SDValue V2 = Elt.getOperand(0); // Vector that supplies the single inserted element.
+ if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
+ V1 = SDValue(); // The inserted element is the first non-zeroable one: the base vector is picked in the loop below.
+
+ bool CanFold = true;
+ for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { // Every remaining non-zeroable element must be lane i of the same base vector.
+ if (Zeroable[i])
+ continue;
+
+ SDValue Current = Op->getOperand(i);
+ SDValue SrcVector = Current->getOperand(0);
+ if (!V1.getNode())
+ V1 = SrcVector;
+ CanFold = SrcVector == V1 &&
+ cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
+ }
+
+ if (!CanFold)
+ return SDValue();
+
+ assert(V1.getNode() && "Expected at least two non-zero elements!");
+ if (V1.getSimpleValueType() != MVT::v4f32)
+ V1 = DAG.getBitcast(MVT::v4f32, V1);
+ if (V2.getSimpleValueType() != MVT::v4f32)
+ V2 = DAG.getBitcast(MVT::v4f32, V2);
+
+ // Ok, we can emit an INSERTPS instruction.
+ unsigned ZMask = Zeroable.to_ulong();
+
+ unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; // Bits [7:6] = source lane in V2, [5:4] = destination lane, [3:0] = zeroed lanes.
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ SDLoc DL(Op);
+ SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getIntPtrConstant(InsertPSMask, DL));
+ return DAG.getBitcast(VT, Result);
+}
+
+/// Build a whole-vector logical byte shift of \p SrcOp by \p NumBits bits
+/// (left when \p isLeft, right otherwise). The source is bitcast to v16i8,
+/// shifted with X86ISD::VSHLDQ / X86ISD::VSRLDQ, and bitcast back to \p VT.
+/// \p VT must be a 128-bit vector and \p NumBits a multiple of 8.
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
+                         SelectionDAG &DAG, const TargetLowering &TLI,
+                         const SDLoc &dl) {
+  assert(VT.is128BitVector() && "Unknown type for VShift");
+  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
+
+  const MVT ByteVT = MVT::v16i8;
+  const MVT AmtVT = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+
+  SDValue Bytes = DAG.getBitcast(ByteVT, SrcOp);
+  SDValue ByteAmt = DAG.getConstant(NumBits / 8, dl, AmtVT);
+
+  SDValue Shifted;
+  if (isLeft)
+    Shifted = DAG.getNode(X86ISD::VSHLDQ, dl, ByteVT, Bytes, ByteAmt);
+  else
+    Shifted = DAG.getNode(X86ISD::VSRLDQ, dl, ByteVT, Bytes, ByteAmt);
+
+  return DAG.getBitcast(VT, Shifted);
+}
+
+/// Try to turn a splat of the scalar load \p SrcOp into a full vector load
+/// followed by a splat shuffle of the loaded element.
+///
+/// Only non-volatile normal loads of i32 / f32 from a frame index, or from a
+/// frame index plus a constant offset, are handled. The stack object's
+/// alignment is raised to the vector size when needed (not possible for fixed
+/// objects). Returns an empty SDValue when the transformation does not apply.
+static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
+                                      SelectionDAG &DAG) {
+  // Check if the scalar load can be widened into a vector load. And if
+  // the address is "base + cst" see if the cst can be "absorbed" into
+  // the shuffle mask.
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp);
+  if (!LD)
+    return SDValue();
+
+  if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+    return SDValue();
+
+  EVT PVT = LD->getValueType(0);
+  if (PVT != MVT::i32 && PVT != MVT::f32)
+    return SDValue();
+
+  // The address must be a frame index, optionally plus a constant offset.
+  SDValue Ptr = LD->getBasePtr();
+  int FI = -1;
+  int64_t Offset = 0;
+  if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+    FI = FINode->getIndex();
+  } else if (DAG.isBaseWithConstantOffset(Ptr) &&
+             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+    Offset = Ptr.getConstantOperandVal(1);
+    Ptr = Ptr.getOperand(0);
+    FI = cast<FrameIndexSDNode>(Ptr)->getIndex();
+  } else {
+    return SDValue();
+  }
+
+  // FIXME: 256-bit vector instructions don't require a strict alignment,
+  // improve this code to support it better.
+  const unsigned RequiredAlign = VT.getSizeInBits() / 8;
+  SDValue Chain = LD->getChain();
+
+  // Make sure the stack object alignment is at least 16 or 32.
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+    // Can't change the alignment of a fixed object. FIXME: It's possible to
+    // compute the exact stack offset and reference FI + adjust offset
+    // instead, if someone *really* cares about this.
+    if (MFI.isFixedObjectIndex(FI))
+      return SDValue();
+    MFI.setObjectAlignment(FI, RequiredAlign);
+  }
+
+  // (Offset % 16 or 32) must be a multiple of 4. The vector load address is
+  // then Ptr + (Offset & ~15).
+  if (Offset < 0)
+    return SDValue();
+  if ((Offset % RequiredAlign) & 3)
+    return SDValue();
+
+  const int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
+  if (StartOffset) {
+    SDLoc DL(Ptr);
+    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
+  }
+
+  // Index of the scalar inside the widened load, in 4-byte elements.
+  const int EltNo = (Offset - StartOffset) >> 2;
+  const unsigned NumElems = VT.getVectorNumElements();
+
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+  SDValue WideLoad =
+      DAG.getLoad(NVT, dl, Chain, Ptr,
+                  LD->getPointerInfo().getWithOffset(StartOffset));
+
+  // Splat the chosen element across every lane.
+  SmallVector<int, 8> SplatMask(NumElems, EltNo);
+  return DAG.getVectorShuffle(NVT, dl, WideLoad, DAG.getUNDEF(NVT), SplatMask);
+}
+
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
+/// Returns the replacement value, or an empty SDValue if no pattern applies.
+/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
+ SDLoc &DL, SelectionDAG &DAG,
+ bool isAfterLegalize) {
+ unsigned NumElems = Elts.size();
+
+ int LastLoadedElt = -1; // Index of the last element that is a load.
+ SmallBitVector LoadMask(NumElems, false);
+ SmallBitVector ZeroMask(NumElems, false);
+ SmallBitVector UndefMask(NumElems, false);
+
+ // Classify each element of the initializer as a load, a zero or an undef;
+ // give up on anything else.
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ if (!Elt.getNode())
+ return SDValue();
+
+ if (Elt.isUndef())
+ UndefMask[i] = true;
+ else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
+ ZeroMask[i] = true;
+ else if (ISD::isNON_EXTLoad(Elt.getNode())) {
+ LoadMask[i] = true;
+ LastLoadedElt = i;
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+ return SDValue();
+ } else
+ return SDValue();
+ }
+ assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
+ "Incomplete element masks");
+
+ // Handle Special Cases - all undef or undef/zero.
+ if (UndefMask.count() == NumElems)
+ return DAG.getUNDEF(VT);
+
+ // FIXME: Should we return this as a BUILD_VECTOR instead?
+ if ((ZeroMask | UndefMask).count() == NumElems)
+ return VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ int FirstLoadedElt = LoadMask.find_first(); // At least one load exists past the early returns above.
+ SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
+ LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
+ EVT LDBaseVT = EltBase.getValueType();
+
+ // Consecutive loads can contain UNDEFS but not ZERO elements.
+ // Consecutive loads with UNDEF and ZERO elements require
+ // an additional shuffle stage to clear the ZERO elements.
+ bool IsConsecutiveLoad = true;
+ bool IsConsecutiveLoadWithZeros = true;
+ for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
+ if (LoadMask[i]) {
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ LoadSDNode *LD = cast<LoadSDNode>(Elt);
+ if (!DAG.areNonVolatileConsecutiveLoads(
+ LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
+ i - FirstLoadedElt)) {
+ IsConsecutiveLoad = false;
+ IsConsecutiveLoadWithZeros = false;
+ break;
+ }
+ } else if (ZeroMask[i]) {
+ IsConsecutiveLoad = false; // A zero inside the loaded range rules out a plain load.
+ }
+ }
+
+ auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) { // Emit one VT-typed load from LDBase's address.
+ auto MMOFlags = LDBase->getMemOperand()->getFlags();
+ assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
+ "Cannot merge volatile loads.");
+ SDValue NewLd =
+ DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
+
+ if (LDBase->hasAnyUseOfValue(1)) { // Users of LDBase's chain must also depend on the new load.
+ SDValue NewChain =
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), // Reset NewChain's operands after the RAUW above.
+ SDValue(NewLd.getNode(), 1));
+ }
+
+ return NewLd;
+ };
+
+ // LOAD - all consecutive load/undefs (must start/end with a load).
+ // If we have found an entire vector of loads and undefs, then return a large
+ // load of the entire vector width starting at the base pointer.
+ // If the vector contains zeros, then attempt to shuffle those elements.
+ if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
+ (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ EVT EltVT = LDBase->getValueType(0);
+ // Ensure that the input vector size for the merged loads matches the
+ // cumulative size of the input elements.
+ if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+ return SDValue();
+
+ if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
+ return SDValue();
+
+ if (IsConsecutiveLoad)
+ return CreateLoad(VT, LDBase);
+
+ // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
+ // vector and a zero vector to clear out the zero elements.
+ if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
+ SmallVector<int, 4> ClearMask(NumElems, -1); // -1 leaves undef lanes undefined.
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (ZeroMask[i])
+ ClearMask[i] = i + NumElems; // Take lane i of the zero vector (second shuffle operand).
+ else if (LoadMask[i])
+ ClearMask[i] = i; // Keep lane i of the wide load.
+ }
+ SDValue V = CreateLoad(VT, LDBase);
+ SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
+ }
+ }
+
+ int LoadSize =
+ (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); // Bits spanned from the first to the last loaded element.
+
+ // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
+ if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
+ (LoadSize == 32 || LoadSize == 64) &&
+ ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
+ : MVT::getIntegerVT(LoadSize);
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
+ if (TLI.isTypeLegal(VecVT)) {
+ SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+ SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
+ LDBase->getPointerInfo(),
+ LDBase->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
+
+ // Make sure the newly-created LOAD is in the same position as LDBase in
+ // terms of dependency. We create a TokenFactor for LDBase and ResNode,
+ // and update uses of LDBase's output chain to use the TokenFactor.
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain =
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
+
+ return DAG.getBitcast(VT, ResNode);
+ }
+ }
+
+ return SDValue();
+}
+
+/// Build a constant vector out of the low \p SplatBitSize bits of
+/// \p SplatValue. The bits are cut into chunks as wide as the scalar type of
+/// \p VT, lowest chunk first, and each chunk becomes one element (a float or
+/// double constant for floating-point \p VT, an integer constant otherwise).
+static Constant *getConstantVector(MVT VT, APInt SplatValue,
+                                   unsigned SplatBitSize, LLVMContext &C) {
+  const unsigned EltBits = VT.getScalarSizeInBits();
+  const unsigned EltCount = SplatBitSize / EltBits;
+  const bool IsFP = VT.isFloatingPoint();
+
+  SmallVector<Constant *, 32> Elts;
+  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
+    // Slice out the Idx-th chunk, counting from the least significant bits.
+    APInt Chunk = SplatValue.lshr(EltBits * Idx).trunc(EltBits);
+
+    if (!IsFP) {
+      Elts.push_back(
+          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Chunk));
+      continue;
+    }
+
+    assert((EltBits == 32 || EltBits == 64) &&
+           "Unsupported floating point scalar size");
+    Elts.push_back(
+        EltBits == 32
+            ? ConstantFP::get(Type::getFloatTy(C), Chunk.bitsToFloat())
+            : ConstantFP::get(Type::getDoubleTy(C), Chunk.bitsToDouble()));
+  }
+  return ConstantVector::get(Elts);
+}
+
+/// Return true if any user of \p N is a target shuffle node, looking through
+/// users that are BITCASTs (recursively).
+static bool isUseOfShuffle(SDNode *N) {
+  for (auto *U : N->uses()) {
+    if (isTargetShuffle(U->getOpcode()))
+      return true;
+    // Look through bitcasts, but keep scanning the remaining users when this
+    // bitcast does not feed a shuffle. Returning the recursive result
+    // unconditionally would skip every later user and make the answer depend
+    // on use-list order.
+    if (U->getOpcode() == ISD::BITCAST && isUseOfShuffle(U))
+      return true;
+  }
+  return false;
+}
+
+/// Attempt to use the vbroadcast instruction to generate a splat value for the
+/// following cases:
+/// 1. A splat BUILD_VECTOR which uses:
+/// a. A single scalar load, or a constant.
+/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+///
+/// The broadcast node (VBROADCAST or SUBV_BROADCAST) is returned when a pattern
+/// is found, or SDValue() otherwise.
+static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // VBROADCAST requires AVX.
+ // TODO: Splats could be generated for non-AVX CPUs using SSE
+ // instructions, but there's less potential gain for only 128-bit vectors.
+ if (!Subtarget.hasAVX())
+ return SDValue();
+
+ MVT VT = BVOp->getSimpleValueType(0);
+ SDLoc dl(BVOp);
+
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+ "Unsupported vector type for broadcast.");
+
+ BitVector UndefElements;
+ SDValue Ld = BVOp->getSplatValue(&UndefElements);
+
+ // We need a splat of a single value to use broadcast, and it doesn't
+ // make any sense if the value is only in one element of the vector.
+ if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+ APInt SplatValue, Undef;
+ unsigned SplatBitSize;
+ bool HasUndef;
+ // Check if this is a repeated constant pattern suitable for broadcasting.
+ if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
+ SplatBitSize > VT.getScalarSizeInBits() &&
+ SplatBitSize < VT.getSizeInBits()) {
+ // Avoid replacing with broadcast when it's a use of a shuffle
+ // instruction to preserve the present custom lowering of shuffles.
+ if (isUseOfShuffle(BVOp) || BVOp->hasOneUse()) // Also bails when the BUILD_VECTOR has a single use.
+ return SDValue();
+ // Replace BUILD_VECTOR with broadcast of the repeated constants.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ LLVMContext *Ctx = DAG.getContext();
+ MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (Subtarget.hasAVX()) { // Always true here: non-AVX targets returned at the top.
+ if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
+ !(SplatBitSize == 64 && Subtarget.is32Bit())) {
+ // Splatted value can fit in one INTEGER constant in constant pool.
+ // Load the constant and broadcast it.
+ MVT CVT = MVT::getIntegerVT(SplatBitSize);
+ Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
+ Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
+ SDValue CP = DAG.getConstantPool(C, PVT);
+ unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ Ld = DAG.getLoad(
+ CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+ MVT::getVectorVT(CVT, Repeat), Ld);
+ return DAG.getBitcast(VT, Brdcst);
+ } else if (SplatBitSize == 32 || SplatBitSize == 64) {
+ // Splatted value can fit in one FLOAT constant in constant pool.
+ // Load the constant and broadcast it.
+ // AVX supports 32-bit and 64-bit broadcasts for floats only.
+ // No 64-bit integer on a 32-bit subtarget.
+ MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
+ Constant *C = SplatBitSize == 32
+ ? ConstantFP::get(Type::getFloatTy(*Ctx),
+ SplatValue.bitsToFloat())
+ : ConstantFP::get(Type::getDoubleTy(*Ctx),
+ SplatValue.bitsToDouble());
+ SDValue CP = DAG.getConstantPool(C, PVT);
+ unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ Ld = DAG.getLoad(
+ CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+ MVT::getVectorVT(CVT, Repeat), Ld);
+ return DAG.getBitcast(VT, Brdcst);
+ } else if (SplatBitSize > 64) {
+ // Load the vector of constants and broadcast it.
+ MVT CVT = VT.getScalarType();
+ Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
+ *Ctx);
+ SDValue VCP = DAG.getConstantPool(VecC, PVT);
+ unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+ unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+ Ld = DAG.getLoad(
+ MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
+ return DAG.getBitcast(VT, Brdcst);
+ }
+ }
+ }
+ return SDValue();
+ }
+
+ bool ConstSplatVal =
+ (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
+
+ // Make sure that all of the users of a non-constant load are from the
+ // BUILD_VECTOR node.
+ if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
+ return SDValue();
+
+ unsigned ScalarSize = Ld.getValueSizeInBits();
+ bool IsGE256 = (VT.getSizeInBits() >= 256);
+
+ // When optimizing for size, generate up to 5 extra bytes for a broadcast
+ // instruction to save 8 or more bytes of constant pool data.
+ // TODO: If multiple splats are generated to load the same constant,
+ // it may be detrimental to overall size. There needs to be a way to detect
+ // that condition to know if this is truly a size win.
+ bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
+
+ // Handle broadcasting a single constant scalar from the constant pool
+ // into a vector.
+ // On Sandybridge (no AVX2), it is still better to load a constant vector
+ // from the constant pool and not to broadcast it from a scalar.
+ // But override that restriction when optimizing for size.
+ // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+ if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
+ EVT CVT = Ld.getValueType();
+ assert(!CVT.isVector() && "Must not broadcast a vector type");
+
+ // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+ // For size optimization, also splat v2f64 and v2i64, and for size opt
+ // with AVX2, also splat i8 and i16.
+ // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
+ const Constant *C = nullptr;
+ if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
+ C = CI->getConstantIntValue();
+ else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
+ C = CF->getConstantFPValue();
+
+ assert(C && "Invalid constant type");
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue CP =
+ DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ Ld = DAG.getLoad(
+ CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ }
+ }
+
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
+
+ // Handle AVX2 in-register broadcasts.
+ if (!IsLoad && Subtarget.hasInt256() &&
+ (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+ // The scalar source must be a normal load.
+ if (!IsLoad)
+ return SDValue();
+
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (Subtarget.hasVLX() && ScalarSize == 64))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+ // The integer check is needed for the 64-bit into 128-bit case so that it
+ // doesn't match double, since there is no vbroadcastsd xmm.
+ if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
+ if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ }
+
+ // Unsupported broadcast.
+ return SDValue();
+}
+
+/// \brief Resolve a constant-index EXTRACT_VECTOR_ELT to the vector and lane it
+/// actually reads, looking through one shuffle of the source vector.
+///
+/// When \p ExtractedFromVec is a shuffle and the mask entry for the extracted
+/// lane is undef or selects a lane of the shuffle's first operand,
+/// \p ExtractedFromVec is replaced by that operand and the mask entry is
+/// returned. Otherwise \p ExtractedFromVec is left untouched and the original
+/// index from \p ExtIdx is returned.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+                                         SDValue ExtIdx) {
+  const int OrigIdx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+
+  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ExtractedFromVec);
+  if (!Shuffle)
+    return OrigIdx;
+
+  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may already have turned
+  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+  // into
+  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
+  //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
+  //                           undef)
+  //                       Constant<0>)
+  // Here the real vector is the extract_subvector expression and the real
+  // index is 2, as given by the shuffle mask.
+  SDValue Src = Shuffle->getOperand(0);
+  const MVT SrcVT = Src.getSimpleValueType();
+  assert(SrcVT.getVectorElementType() ==
+         ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+  const int MaskIdx = Shuffle->getMaskElt(OrigIdx);
+  if (!isUndefOrInRange(MaskIdx, 0, SrcVT.getVectorNumElements()))
+    return OrigIdx;
+
+  ExtractedFromVec = Src;
+  return MaskIdx;
+}
+
+/// Try to lower a BUILD_VECTOR whose operands are mostly constant-index
+/// EXTRACT_VECTOR_ELTs taken from at most two vectors of the result type.
+/// One vector shuffle of those sources is emitted, and the remaining operands
+/// (at most two) are patched in with INSERT_VECTOR_ELT. Returns an empty
+/// SDValue when the pattern does not apply.
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+  const MVT VT = Op.getSimpleValueType();
+
+  // The fix-up step at the end needs INSERT_VECTOR_ELT to be available.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+    return SDValue();
+
+  SDLoc DL(Op);
+  const unsigned NumElems = Op.getNumOperands();
+
+  SDValue SrcA;
+  SDValue SrcB;
+  SmallVector<unsigned, 4> ToInsert;
+  SmallVector<int, 8> Mask(NumElems, -1);
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    const SDValue Elt = Op.getOperand(i);
+    const unsigned Opc = Elt.getOpcode();
+
+    // Undef lanes keep their -1 mask entry.
+    if (Opc == ISD::UNDEF)
+      continue;
+
+    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+      // Any other operand must be inserted afterwards. Two such operands are
+      // tolerated; bail out when a third one shows up.
+      if (ToInsert.size() > 1)
+        return SDValue();
+
+      ToInsert.push_back(i);
+      continue;
+    }
+
+    SDValue SrcVec = Elt.getOperand(0);
+    SDValue ExtIdx = Elt.getOperand(1);
+    // Only constant extract indices are handled.
+    if (!isa<ConstantSDNode>(ExtIdx))
+      return SDValue();
+    const int Lane = getUnderlyingExtractedFromVec(SrcVec, ExtIdx);
+
+    // The (possibly looked-through) source must have the result type.
+    if (SrcVec.getValueType() != VT)
+      return SDValue();
+
+    // The first source seen becomes shuffle operand A.
+    if (!SrcA.getNode())
+      SrcA = SrcVec;
+    if (SrcVec == SrcA) {
+      Mask[i] = Lane;
+      continue;
+    }
+
+    // The next distinct source becomes operand B; a third one is too many.
+    if (!SrcB.getNode())
+      SrcB = SrcVec;
+    if (SrcVec != SrcB)
+      return SDValue();
+    Mask[i] = Lane + NumElems;
+  }
+
+  // Nothing was extracted from any vector, so there is nothing to shuffle.
+  if (!SrcA.getNode())
+    return SDValue();
+
+  if (!SrcB.getNode())
+    SrcB = DAG.getUNDEF(VT);
+  SDValue Result = DAG.getVectorShuffle(VT, DL, SrcA, SrcB, Mask);
+  for (unsigned InsertIdx : ToInsert)
+    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result,
+                         Op.getOperand(InsertIdx),
+                         DAG.getIntPtrConstant(InsertIdx, DL));
+
+  return Result;
+}
+
+/// Pack a constant BUILD_VECTOR of i1 elements into an integer immediate.
+/// Bit \c idx of the result holds element \c idx; undef elements contribute 0.
+/// The returned constant has the integer type as wide as the vector, but at
+/// least 8 bits.
+static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
+  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+         Op.getScalarValueSizeInBits() == 1 &&
+         "Can not convert non-constant vector");
+  uint64_t Immediate = 0;
+  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+    SDValue In = Op.getOperand(idx);
+    // BUILD_VECTOR integer operands may be wider than the i1 element type and
+    // are implicitly truncated, so only bit 0 is meaningful. Mask it so that
+    // stray upper bits cannot leak into neighbouring elements.
+    if (!In.isUndef())
+      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
+  }
+  SDLoc dl(Op);
+  MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
+  return DAG.getConstant(Immediate, dl, VT);
+}
+// Lower BUILD_VECTOR operation for vectors of i1 elements (e.g. v8i1 and
+// v16i1).
+//
+// All-zeros, all-ones and fully constant vectors become constants. A splat of
+// one value becomes a select between all-ones and all-zeros. Otherwise the
+// constant lanes are materialized as an immediate and the non-constant lanes
+// are inserted one by one.
+SDValue
+X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
+
+  MVT VT = Op.getSimpleValueType();
+  assert((VT.getVectorElementType() == MVT::i1) &&
+         "Unexpected type in LowerBUILD_VECTORvXi1!");
+
+  SDLoc dl(Op);
+  if (ISD::isBuildVectorAllZeros(Op.getNode()))
+    return DAG.getTargetConstant(0, dl, VT);
+
+  if (ISD::isBuildVectorAllOnes(Op.getNode()))
+    return DAG.getTargetConstant(1, dl, VT);
+
+  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
+    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+      return DAG.getBitcast(VT, Imm);
+    // The immediate is at least 8 bits wide; narrower vectors are taken from
+    // the low lanes of a v8i1.
+    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                       DAG.getIntPtrConstant(0, dl));
+  }
+
+  // Vector has one or more non-const elements
+  uint64_t Immediate = 0;
+  SmallVector<unsigned, 16> NonConstIdx;
+  bool IsSplat = true;
+  bool HasConstElts = false;
+  int SplatIdx = -1;
+  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+    SDValue In = Op.getOperand(idx);
+    if (In.isUndef())
+      continue;
+    if (!isa<ConstantSDNode>(In))
+      NonConstIdx.push_back(idx);
+    else {
+      // BUILD_VECTOR integer operands may be wider than i1 and are implicitly
+      // truncated; keep only bit 0 so upper bits cannot corrupt other lanes.
+      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
+      HasConstElts = true;
+    }
+    // Remember the first defined element and check that every later defined
+    // element is the same value.
+    if (SplatIdx < 0)
+      SplatIdx = idx;
+    else if (In != Op.getOperand(SplatIdx))
+      IsSplat = false;
+  }
+
+  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
+  if (IsSplat)
+    return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
+                       DAG.getConstant(1, dl, VT),
+                       DAG.getConstant(0, dl, VT));
+
+  // insert elements one by one
+  SDValue DstVec;
+  SDValue Imm;
+  if (Immediate) {
+    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
+    Imm = DAG.getConstant(Immediate, dl, ImmVT);
+  }
+  else if (HasConstElts)
+    Imm = DAG.getConstant(0, dl, VT);
+  else
+    Imm = DAG.getUNDEF(VT);
+  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+    DstVec = DAG.getBitcast(VT, Imm);
+  else {
+    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
+    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                         DAG.getIntPtrConstant(0, dl));
+  }
+
+  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
+    unsigned InsertIdx = NonConstIdx[i];
+    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+                         Op.getOperand(InsertIdx),
+                         DAG.getIntPtrConstant(InsertIdx, dl));
+  }
+  return DstVec;
+}
+
+/// \brief Return true if \p N implements a horizontal binop and return the
+/// operands for the horizontal binop into V0 and V1.
+///
+/// This is a helper function of LowerToHorizontalOp().
+/// This function checks that the build_vector \p N in input implements a
+/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
+/// operation to match.
+/// For example, if \p Opcode is equal to ISD::ADD, then this function
+/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
+/// is equal to ISD::SUB, then this function checks if this is a horizontal
+/// arithmetic sub.
+/// \p V0 and \p V1 are reset to UNDEF on entry and may be set even on failure.
+/// This function only analyzes elements of \p N whose indices are
+/// in range [BaseIdx, LastIdx).
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
+ SelectionDAG &DAG,
+ unsigned BaseIdx, unsigned LastIdx,
+ SDValue &V0, SDValue &V1) {
+ EVT VT = N->getValueType(0);
+
+ assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
+ assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
+ "Invalid Vector in input!");
+
+ bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); // Only these may match with swapped extract operands.
+ bool CanFold = true;
+ unsigned ExpectedVExtractIdx = BaseIdx; // Lane the next element's first extract must read.
+ unsigned NumElts = LastIdx - BaseIdx;
+ V0 = DAG.getUNDEF(VT);
+ V1 = DAG.getUNDEF(VT);
+
+ // Check if N implements a horizontal binop.
+ for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
+ SDValue Op = N->getOperand(i + BaseIdx);
+
+ // Skip UNDEFs.
+ if (Op->isUndef()) {
+ // Update the expected vector extract index.
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx;
+ ExpectedVExtractIdx += 2;
+ continue;
+ }
+
+ CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); // Must be a single-use node of the requested opcode.
+
+ if (!CanFold)
+ break;
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
+ CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0) == Op1.getOperand(0) &&
+ isa<ConstantSDNode>(Op0.getOperand(1)) &&
+ isa<ConstantSDNode>(Op1.getOperand(1)));
+ if (!CanFold)
+ break;
+
+ unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+
+ if (i * 2 < NumElts) { // First half of the range reads from V0, second half from V1.
+ if (V0.isUndef()) {
+ V0 = Op0.getOperand(0);
+ if (V0.getValueType() != VT)
+ return false;
+ }
+ } else {
+ if (V1.isUndef()) {
+ V1 = Op0.getOperand(0);
+ if (V1.getValueType() != VT)
+ return false;
+ }
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx; // Entering the second half: restart the expected lane.
+ }
+
+ SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
+ if (I0 == ExpectedVExtractIdx)
+ CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
+ else if (IsCommutable && I1 == ExpectedVExtractIdx) {
+ // Try to match the following dag sequence:
+ // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
+ CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
+ } else
+ CanFold = false;
+
+ ExpectedVExtractIdx += 2; // Each element consumes one adjacent pair of source lanes.
+ }
+
+ return CanFold;
+}
+
+/// \brief Lower a 256-bit horizontal add/sub as two 128-bit horizontal binops
+/// whose results are joined by a concat_vectors.
+///
+/// This is a helper function of LowerToHorizontalOp().
+/// \p V0 and \p V1 must be 256-bit vectors of the same type. Each is split
+/// into its lower and upper 128-bit halves, and \p X86Opcode is applied twice
+/// to pairs of those halves.
+///
+/// \p Mode selects how the halves are paired.
+/// When Mode is set, each binop combines the two halves of one input:
+///   HADD V0_LO, V0_HI
+///   HADD V1_LO, V1_HI
+/// Otherwise each binop combines the matching halves of both inputs:
+///   HADD V0_LO, V1_LO
+///   HADD V0_HI, V1_HI
+///
+/// If \p isUndefLO is set, the lower 128 bits of the result are UNDEF; if
+/// \p isUndefHI is set, the upper 128 bits of the result are UNDEF. A half is
+/// also left UNDEF when the inputs it would be computed from are UNDEF.
+static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
+                                     const SDLoc &DL, SelectionDAG &DAG,
+                                     unsigned X86Opcode, bool Mode,
+                                     bool isUndefLO, bool isUndefHI) {
+  const MVT WideVT = V0.getSimpleValueType();
+  assert(WideVT.is256BitVector() && WideVT == V1.getSimpleValueType() &&
+         "Invalid nodes in input!");
+
+  const unsigned HalfElts = WideVT.getVectorNumElements() / 2;
+  SDValue Lo0 = extract128BitVector(V0, 0, DAG, DL);
+  SDValue Hi0 = extract128BitVector(V0, HalfElts, DAG, DL);
+  SDValue Lo1 = extract128BitVector(V1, 0, DAG, DL);
+  SDValue Hi1 = extract128BitVector(V1, HalfElts, DAG, DL);
+  const MVT HalfVT = Lo0.getSimpleValueType();
+
+  // The low binop always starts with V0_LO and the high binop always ends
+  // with V1_HI; only the two middle operands depend on Mode.
+  SDValue LoRHS = Mode ? Hi0 : Lo1;
+  SDValue HiLHS = Mode ? Lo1 : Hi0;
+
+  // Don't emit a horizontal binop whose result is expected to be UNDEF.
+  const bool LoHasInput =
+      Mode ? !V0->isUndef() : !(Lo0->isUndef() && Lo1->isUndef());
+  const bool HiHasInput =
+      Mode ? !V1->isUndef() : !(Hi0->isUndef() && Hi1->isUndef());
+
+  SDValue ResLo = DAG.getUNDEF(HalfVT);
+  SDValue ResHi = DAG.getUNDEF(HalfVT);
+  if (!isUndefLO && LoHasInput)
+    ResLo = DAG.getNode(X86Opcode, DL, HalfVT, Lo0, LoRHS);
+  if (!isUndefHI && HiHasInput)
+    ResHi = DAG.getNode(X86Opcode, DL, HalfVT, HiLHS, Hi1);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, ResLo, ResHi);
+}
+
+/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
+/// node. Returns an empty SDValue if the build_vector does not match.
+static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ MVT VT = BV->getSimpleValueType(0);
+ if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ return SDValue();
+
+ SDLoc DL(BV);
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue InVec0 = DAG.getUNDEF(VT);
+ SDValue InVec1 = DAG.getUNDEF(VT);
+
+ assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+ VT == MVT::v2f64) && "build_vector with an invalid type found!");
+
+ // Odd-numbered elements in the input build vector are obtained from
+ // adding two floating-point elements (FADD).
+ // Even-numbered elements in the input build vector are obtained from
+ // subtracting two floating-point elements (FSUB).
+ unsigned ExpectedOpcode = ISD::FSUB;
+ unsigned NextExpectedOpcode = ISD::FADD;
+ bool AddFound = false;
+ bool SubFound = false;
+
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ SDValue Op = BV->getOperand(i);
+
+ // Skip 'undef' values.
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::UNDEF) {
+ std::swap(ExpectedOpcode, NextExpectedOpcode); // An undef lane still advances the alternating FSUB/FADD pattern.
+ continue;
+ }
+
+ // Early exit if we found an unexpected opcode.
+ if (Opcode != ExpectedOpcode)
+ return SDValue();
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
+ // Early exit if we cannot match that sequence.
+ if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+ !isa<ConstantSDNode>(Op1.getOperand(1)) ||
+ Op0.getOperand(1) != Op1.getOperand(1))
+ return SDValue();
+
+ unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ if (I0 != i) // Result lane i must be computed from lane i of both inputs.
+ return SDValue();
+
+ // We found a valid add/sub node. Update the information accordingly.
+ if (i & 1)
+ AddFound = true;
+ else
+ SubFound = true;
+
+ // Update InVec0 and InVec1.
+ if (InVec0.isUndef()) {
+ InVec0 = Op0.getOperand(0);
+ if (InVec0.getSimpleValueType() != VT)
+ return SDValue();
+ }
+ if (InVec1.isUndef()) {
+ InVec1 = Op1.getOperand(0);
+ if (InVec1.getSimpleValueType() != VT)
+ return SDValue();
+ }
+
+ // Make sure that operands in input to each add/sub node always
+ // come from the same pair of vectors.
+ if (InVec0 != Op0.getOperand(0)) {
+ if (ExpectedOpcode == ISD::FSUB) // FSUB is not commutable, so the operand order is fixed.
+ return SDValue();
+
+ // FADD is commutable. Try to commute the operands
+ // and then test again.
+ std::swap(Op0, Op1);
+ if (InVec0 != Op0.getOperand(0))
+ return SDValue();
+ }
+
+ if (InVec1 != Op1.getOperand(0))
+ return SDValue();
+
+ // Update the pair of expected opcodes.
+ std::swap(ExpectedOpcode, NextExpectedOpcode);
+ }
+
+ // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
+ if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef()) // Requires at least one FADD lane and one FSUB lane.
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
+
+ return SDValue();
+}
+
+/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
+///
+/// The operands of \p BV are matched (via isHorizontalBinOp) against the
+/// horizontal add/sub patterns available for the vector type and subtarget
+/// features. On a match an X86ISD::{F}HADD / {F}HSUB node (or the result of
+/// ExpandHorizontalBinOp) is returned; otherwise an empty SDValue is returned.
+static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = BV->getSimpleValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ // Undef operand counts for the low and the high half of the build_vector.
+ unsigned NumUndefsLO = 0;
+ unsigned NumUndefsHI = 0;
+ unsigned Half = NumElts/2;
+
+ // Count the number of UNDEF operands in the build_vector in input.
+ for (unsigned i = 0, e = Half; i != e; ++i)
+ if (BV->getOperand(i)->isUndef())
+ NumUndefsLO++;
+
+ for (unsigned i = Half, e = NumElts; i != e; ++i)
+ if (BV->getOperand(i)->isUndef())
+ NumUndefsHI++;
+
+ // Early exit if this is either a build_vector of all UNDEFs or all the
+ // operands but one are UNDEF.
+ if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
+ return SDValue();
+
+ SDLoc DL(BV);
+ // InVec0/InVec1 are out-parameters filled in by isHorizontalBinOp; each call
+ // below may overwrite them, so the order of the calls matters.
+ SDValue InVec0, InVec1;
+ if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
+ // Try to match an SSE3 float HADD/HSUB.
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+ } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
+ // Try to match an SSSE3 integer HADD/HSUB.
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
+ }
+
+ // Everything below requires AVX; the remaining cases only look at the
+ // 256-bit types v8f32, v4f64, v8i32 and v16i16.
+ if (!Subtarget.hasAVX())
+ return SDValue();
+
+ if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
+ // Try to match an AVX horizontal add/sub of packed single/double
+ // precision floating point values from 256-bit vectors.
+ // Each half of the build_vector is matched separately; the inputs found for
+ // the two halves must agree (or be undef on one side).
+ SDValue InVec2, InVec3;
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+ return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+ return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+ } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
+ // Try to match an AVX2 horizontal add/sub of signed integers.
+ SDValue InVec2, InVec3;
+ // X86Opcode is only read when CanFold stays true, i.e. after one of the two
+ // matches below has assigned it.
+ unsigned X86Opcode;
+ bool CanFold = true;
+
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HSUB;
+ else
+ CanFold = false;
+
+ if (CanFold) {
+ // Fold this build_vector into a single horizontal add/sub.
+ // Do this only if the target has AVX2.
+ if (Subtarget.hasAVX2())
+ return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
+
+ // Do not try to expand this build_vector into a pair of horizontal
+ // add/sub if we can emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into a pair of horizontal binop followed by
+ // a concat vector.
+ // NOTE(review): the literal 'false' selects a mode of ExpandHorizontalBinOp
+ // that is defined outside this chunk - confirm its meaning there.
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
+ isUndefLO, isUndefHI);
+ }
+ }
+
+ // NOTE: the hasAVX() test in the condition below is always true at this
+ // point because of the early return above when AVX is unavailable.
+ if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
+ VT == MVT::v16i16) && Subtarget.hasAVX()) {
+ // Last attempt: match the horizontal pattern across the whole vector
+ // (operands 0..NumElts) instead of per half.
+ unsigned X86Opcode;
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::HSUB;
+ else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::FHADD;
+ else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::FHSUB;
+ else
+ return SDValue();
+
+ // Don't try to expand this build_vector into a pair of horizontal add/sub
+ // if we can simply emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into two horizontal add/sub followed by
+ // a concat vector.
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
+ isUndefLO, isUndefHI);
+ }
+
+ return SDValue();
+}
+
+/// Lower a BUILD_VECTOR whose elements are all produced by the same bitwise
+/// operation (AND, XOR or OR) with a constant right-hand operand. The result
+/// is that operation applied once to two BUILD_VECTORs: one built from the
+/// left-hand operands and one built from the constants.
+/// Returns an empty SDValue when the pattern does not match or the vector
+/// operation is neither legal nor promotable for the type.
+/// NOTE: It's not in our interest to start making a general purpose
+/// vectorizer from this, but enough scalar bit operations are created from
+/// the later legalization + scalarization stages to need basic support.
+static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+                                       SelectionDAG &DAG) {
+  MVT VecVT = Op->getSimpleValueType(0);
+  unsigned EltCount = VecVT.getVectorNumElements();
+
+  // Every element has to be produced by the same opcode as element 0.
+  // TODO: Should we allow UNDEFS and if so how many?
+  unsigned BitOpc = Op->getOperand(0).getOpcode();
+  for (unsigned Idx = 1; Idx < EltCount; ++Idx) {
+    if (Op->getOperand(Idx).getOpcode() != BitOpc)
+      return SDValue();
+  }
+
+  // Only the plain bitwise operations are handled.
+  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+  bool IsBitwise =
+      BitOpc == ISD::AND || BitOpc == ISD::XOR || BitOpc == ISD::OR;
+  if (!IsBitwise)
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrPromote(BitOpc, VecVT))
+    return SDValue();
+
+  // Split every element into its variable and its constant operand.
+  SmallVector<SDValue, 4> Variables, Constants;
+  for (SDValue Elt : Op->ops()) {
+    SDValue Var = Elt.getOperand(0);
+    SDValue Cst = Elt.getOperand(1);
+
+    // We expect the canonicalized RHS operand to be the constant.
+    if (!isa<ConstantSDNode>(Cst))
+      return SDValue();
+
+    Variables.push_back(Var);
+    Constants.push_back(Cst);
+  }
+
+  SDLoc DL(Op);
+  SDValue VarVec = DAG.getBuildVector(VecVT, DL, Variables);
+  SDValue CstVec = DAG.getBuildVector(VecVT, DL, Constants);
+  return DAG.getNode(BitOpc, DL, VecVT, VarVec, CstVec);
+}
+
+/// Create a vector constant without a load. SSE/AVX provide the bare minimum
+/// functionality to do this, so only all-zeros and all-ones build vectors are
+/// handled here. Returns \p Op itself when it already has the canonical type,
+/// a freshly built zero/ones vector otherwise, and an empty SDValue when
+/// \p Op is neither all zeros nor (with SSE2) all ones.
+static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
+                                         const X86Subtarget &Subtarget) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDNode *N = Op.getNode();
+
+  // Vectors containing all zeros can be matched by pxor and xorps.
+  if (ISD::isBuildVectorAllZeros(N)) {
+    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+    bool AlreadyCanonical =
+        VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32;
+    if (!AlreadyCanonical)
+      return getZeroVector(VT, Subtarget, DAG, DL);
+    return Op;
+  }
+
+  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+  // vpcmpeqd on 256-bit vectors.
+  if (!Subtarget.hasSSE2() || !ISD::isBuildVectorAllOnes(N))
+    return SDValue();
+
+  bool KeepAsIs = VT == MVT::v4i32 || VT == MVT::v16i32 ||
+                  (VT == MVT::v8i32 && Subtarget.hasInt256());
+  if (KeepAsIs)
+    return Op;
+
+  return getOnesVector(VT, Subtarget, DAG, DL);
+}
+
+/// Custom lowering of a BUILD_VECTOR node.
+///
+/// A sequence of strategies is tried in order and the first one that yields a
+/// node wins: predicate (i1) vectors, all-zeros/all-ones constants, ADDSUB,
+/// horizontal ops, broadcasts, bitwise-op folding, single non-zero element
+/// insertion, splats, consecutive loads, splitting 256/512-bit vectors into
+/// halves, and finally element-width specific insert/shuffle/unpack sequences.
+/// An empty SDValue is returned when none of them applies (the existing
+/// comments indicate the legalizer / a constant-pool load handles those cases).
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+
+ MVT VT = Op.getSimpleValueType();
+ MVT ExtVT = VT.getVectorElementType();
+ unsigned NumElems = Op.getNumOperands();
+
+ // Generate vectors for predicate vectors.
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
+ return LowerBUILD_VECTORvXi1(Op, DAG);
+
+ if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
+ return VectorConstant;
+
+ // Pattern-based lowerings; each helper returns an empty SDValue on no match.
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
+ if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
+ return AddSub;
+ if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
+ return HorizontalOp;
+ if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
+ return Broadcast;
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
+ return BitOp;
+
+ unsigned EVTBits = ExtVT.getSizeInBits();
+
+ // Classify the operands. Undef operands are skipped entirely. NonZeros gets
+ // bit i set when operand i is neither undef nor zero; Values collects the
+ // distinct non-undef operands.
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ uint64_t NonZeros = 0;
+ bool IsAllConstants = true;
+ SmallSet<SDValue, 8> Values;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (Elt.isUndef())
+ continue;
+ Values.insert(Elt);
+ if (Elt.getOpcode() != ISD::Constant &&
+ Elt.getOpcode() != ISD::ConstantFP)
+ IsAllConstants = false;
+ if (X86::isZeroNode(Elt))
+ NumZero++;
+ else {
+ assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
+ NonZeros |= ((uint64_t)1 << i);
+ NumNonZero++;
+ }
+ }
+
+ // All undef vector. Return an UNDEF. All zero vectors were handled above.
+ if (NumNonZero == 0)
+ return DAG.getUNDEF(VT);
+
+ // Special case for single non-zero, non-undef, element.
+ if (NumNonZero == 1) {
+ // Index of the only set bit in NonZeros, i.e. of the single non-zero operand.
+ unsigned Idx = countTrailingZeros(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+
+ // If this is an insertion of an i64 value on x86-32, and if the top bits of
+ // the value are obviously zero, truncate the value to i32 and do the
+ // insertion that way. Only do this if the value is non-constant or if the
+ // value is a constant being inserted into element 0. It is cheaper to do
+ // a constant pool load than it is to do a movd + shuffle.
+ if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
+ (!IsAllConstants || Idx == 0)) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ // Handle SSE only.
+ assert(VT == MVT::v2i64 && "Expected an SSE value type!");
+ MVT VecVT = MVT::v4i32;
+
+ // Truncate the value (which may itself be a constant) to i32, and
+ // convert it to a vector with movd (S2V+shuffle to zero extend).
+ Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+ // Idx * 2: each i64 element covers two i32 elements of the v4i32 vector.
+ return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
+ Item, Idx * 2, true, Subtarget, DAG));
+ }
+ }
+
+ // If we have a constant or non-constant insertion into the low element of
+ // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+ // the rest of the elements. This will be matched as movd/movq/movss/movsd
+ // depending on what the source datatype is.
+ if (Idx == 0) {
+ // No explicit zero operands: every other element is undef.
+ if (NumZero == 0)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+ if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+ (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
+ assert((VT.is128BitVector() || VT.is256BitVector() ||
+ VT.is512BitVector()) &&
+ "Expected an SSE value type!");
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+ return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ }
+
+ // We can't directly insert an i8 or i16 into a vector, so zero extend
+ // it to i32 first.
+ if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+ Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+ if (VT.getSizeInBits() >= 256) {
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ if (Subtarget.hasAVX()) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ } else {
+ // Without AVX, we need to extend to a 128-bit vector and then
+ // insert into the 256-bit vector.
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+ SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
+ Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+ }
+ } else {
+ assert(VT.is128BitVector() && "Expected an SSE value type!");
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ }
+ return DAG.getBitcast(VT, Item);
+ }
+ }
+
+ // Is it a vector logical left shift?
+ if (NumElems == 2 && Idx == 1 &&
+ X86::isZeroNode(Op.getOperand(0)) &&
+ !X86::isZeroNode(Op.getOperand(1))) {
+ unsigned NumBits = VT.getSizeInBits();
+ return getVShift(true, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ VT, Op.getOperand(1)),
+ NumBits/2, DAG, *this, dl);
+ }
+
+ if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+ return SDValue();
+
+ // Otherwise, if this is a vector with i32 or f32 elements, and the element
+ // is a non-constant being inserted into an element other than the low one,
+ // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
+ // movd/movss) to move this into the low element, then shuffle it into
+ // place.
+ if (EVTBits == 32) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
+ }
+ }
+
+ // Splat is obviously ok. Let legalizer expand it to a shuffle.
+ if (Values.size() == 1) {
+ if (EVTBits == 32) {
+ // Instead of a shuffle like this:
+ // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+ // Check if it's possible to issue this instead.
+ // shuffle (vload ptr)), undef, <1, 1, 1, 1>
+ unsigned Idx = countTrailingZeros(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+ if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+ return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+ }
+ return SDValue();
+ }
+
+ // A vector full of immediates; various special cases are already
+ // handled, so this is best done with a single constant-pool load.
+ if (IsAllConstants)
+ return SDValue();
+
+ // See if we can use a vector load to get all of the elements.
+ if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
+ SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
+ return LD;
+ }
+
+ // For AVX-length vectors, build the individual 128-bit pieces and use
+ // shuffles to put them in place.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
+
+ // Half-width vector type with the same element type.
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+ // Build both the lower and upper subvector.
+ SDValue Lower =
+ DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
+ SDValue Upper = DAG.getBuildVector(
+ HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
+
+ // Recreate the wider vector with the lower and upper part.
+ if (VT.is256BitVector())
+ return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ }
+
+ // Let legalizer expand 2-wide build_vectors.
+ if (EVTBits == 64) {
+ if (NumNonZero == 1) {
+ // One half is zero or undef.
+ unsigned Idx = countTrailingZeros(NonZeros);
+ SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+ Op.getOperand(Idx));
+ return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
+ }
+ return SDValue();
+ }
+
+ // If element VT is < 32 bits, convert it to inserts into a zero vector.
+ if (EVTBits == 8 && NumElems == 16)
+ if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
+ DAG, Subtarget, *this))
+ return V;
+
+ if (EVTBits == 16 && NumElems == 8)
+ if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
+ DAG, Subtarget, *this))
+ return V;
+
+ // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+ if (EVTBits == 32 && NumElems == 4)
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
+ return V;
+
+ // If element VT is == 32 bits, turn it into a number of shuffles.
+ if (NumElems == 4 && NumZero > 0) {
+ SmallVector<SDValue, 8> Ops(NumElems);
+ // Start with one full vector per element: a zero vector for zero/undef
+ // operands (bit clear in NonZeros), a SCALAR_TO_VECTOR otherwise.
+ for (unsigned i = 0; i < 4; ++i) {
+ bool isZero = !(NonZeros & (1ULL << i));
+ if (isZero)
+ Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
+ else
+ Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ }
+
+ // Combine each pair of elements (0,1) and (2,3) into Ops[0] and Ops[1]; the
+ // switch value is the two NonZeros bits belonging to the pair.
+ for (unsigned i = 0; i < 2; ++i) {
+ switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+ default: break;
+ case 0:
+ Ops[i] = Ops[i*2]; // Must be a zero vector.
+ break;
+ case 1:
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
+ break;
+ case 2:
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
+ break;
+ case 3:
+ Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
+ break;
+ }
+ }
+
+ // When only the odd element of a pair is non-zero (pair bits == 2), the two
+ // lanes of that pair are swapped in the final shuffle mask.
+ bool Reverse1 = (NonZeros & 0x3) == 2;
+ bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+ int MaskVec[] = {
+ Reverse1 ? 1 : 0,
+ Reverse1 ? 0 : 1,
+ static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
+ static_cast<int>(Reverse2 ? NumElems : NumElems+1)
+ };
+ return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
+ }
+
+ if (Values.size() > 1 && VT.is128BitVector()) {
+ // Check for a build vector from mostly shuffle plus few inserting.
+ if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
+ return Sh;
+
+ // For SSE 4.1, use insertps to put the high elements into the low element.
+ if (Subtarget.hasSSE41()) {
+ SDValue Result;
+ if (!Op.getOperand(0).isUndef())
+ Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+ else
+ Result = DAG.getUNDEF(VT);
+
+ // Insert every remaining non-undef operand at its own index.
+ for (unsigned i = 1; i < NumElems; ++i) {
+ if (Op.getOperand(i).isUndef()) continue;
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
+ }
+ return Result;
+ }
+
+ // Otherwise, expand into a number of unpckl*, start by extending each of
+ // our (non-undef) elements to the full vector width with the element in the
+ // bottom slot of the vector (which generates no code for SSE).
+ SmallVector<SDValue, 8> Ops(NumElems);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (!Op.getOperand(i).isUndef())
+ Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ else
+ Ops[i] = DAG.getUNDEF(VT);
+ }
+
+ // Next, we iteratively mix elements, e.g. for v4f32:
+ // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+ // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+ // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
+ unsigned EltStride = NumElems >> 1;
+ while (EltStride != 0) {
+ for (unsigned i = 0; i < EltStride; ++i) {
+ // If Ops[i+EltStride] is undef and this is the first round of mixing,
+ // then it is safe to just drop this shuffle: V[i] is already in the
+ // right place, the one element (since it's the first round) being
+ // inserted as undef can be dropped. This isn't safe for successive
+ // rounds because they will permute elements within both vectors.
+ if (Ops[i+EltStride].isUndef() &&
+ EltStride == NumElems/2)
+ continue;
+
+ Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
+ }
+ EltStride >>= 1;
+ }
+ return Ops[0];
+ }
+ return SDValue();
+}
+
+// Lower a CONCAT_VECTORS with a 256-bit or 512-bit result.
+// 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones. A 512-bit result is
+// built either from two operands directly or, when there are four operands,
+// from two intermediate half-width concatenations.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  MVT ResVT = Op.getSimpleValueType();
+
+  assert((ResVT.is256BitVector() ||
+          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
+
+  SDLoc dl(Op);
+  unsigned NumElems = ResVT.getVectorNumElements();
+  SDValue First = Op.getOperand(0);
+  SDValue Second = Op.getOperand(1);
+
+  if (ResVT.is256BitVector())
+    return concat128BitVectors(First, Second, ResVT, NumElems, DAG, dl);
+
+  // 512-bit result made of two operands.
+  if (Op.getNumOperands() != 4)
+    return concat256BitVectors(First, Second, ResVT, NumElems, DAG, dl);
+
+  // 512-bit result made of four operands: pair them up first.
+  unsigned HalfElems = NumElems / 2;
+  MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
+                                ResVT.getVectorNumElements() / 2);
+  SDValue Third = Op.getOperand(2);
+  SDValue Fourth = Op.getOperand(3);
+  SDValue LoHalf = concat128BitVectors(First, Second, HalfVT, HalfElems, DAG, dl);
+  SDValue HiHalf = concat128BitVectors(Third, Fourth, HalfVT, HalfElems, DAG, dl);
+  return concat256BitVectors(LoHalf, HiHalf, ResVT, NumElems, DAG, dl);
+}
+
+/// Lower a CONCAT_VECTORS node whose result is a vector of i1 elements.
+///
+/// More than two operands: if all operands are undef the result is UNDEF; if
+/// exactly one is defined it is inserted into an undef vector at its offset;
+/// otherwise the operands are split into two half-width CONCAT_VECTORS nodes
+/// which are concatenated again.
+///
+/// Two operands: results of 16 bits or more are returned unchanged. Narrower
+/// results are expressed with INSERT_SUBVECTOR nodes into a zero or undef
+/// vector, depending on which operands are zero or undef.
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT ResVT = Op.getSimpleValueType();
+  unsigned NumOfOperands = Op.getNumOperands();
+
+  assert(isPowerOf2_32(NumOfOperands) &&
+         "Unexpected number of operands in CONCAT_VECTORS");
+
+  SDValue Undef = DAG.getUNDEF(ResVT);
+  if (NumOfOperands > 2) {
+    // Specialize the cases when all, or all but one, of the operands are undef.
+    unsigned NumOfDefinedOps = 0;
+    unsigned OpIdx = 0; // Index of the last defined operand seen.
+    for (unsigned i = 0; i < NumOfOperands; i++)
+      if (!Op.getOperand(i).isUndef()) {
+        NumOfDefinedOps++;
+        OpIdx = i;
+      }
+    if (NumOfDefinedOps == 0)
+      return Undef;
+    if (NumOfDefinedOps == 1) {
+      unsigned SubVecNumElts =
+          Op.getOperand(OpIdx).getValueType().getVectorNumElements();
+      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
+                         Op.getOperand(OpIdx), IdxVal);
+    }
+
+    // General case: concatenate each half of the operand list separately and
+    // then concatenate the two halves.
+    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
+                                  ResVT.getVectorNumElements()/2);
+    SmallVector<SDValue, 2> Ops;
+    for (unsigned i = 0; i < NumOfOperands/2; i++)
+      Ops.push_back(Op.getOperand(i));
+    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+    Ops.clear();
+    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
+      Ops.push_back(Op.getOperand(i));
+    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+  }
+
+  // 2 operands
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = ResVT.getVectorNumElements();
+  assert(V1.getValueType() == V2.getValueType() &&
+         V1.getValueType().getVectorNumElements() == NumElems/2 &&
+         "Unexpected operands in CONCAT_VECTORS");
+
+  if (ResVT.getSizeInBits() >= 16)
+    return Op; // The operation is legal with KUNPCK
+
+  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
+  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
+  if (IsZeroV1 && IsZeroV2)
+    return ZeroVec;
+
+  // Upper half undef or zero: only V1 has to be placed, at element 0.
+  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+  if (V2.isUndef())
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+  if (IsZeroV2)
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
+
+  // Lower half undef or zero: only V2 has to be placed, at the upper half.
+  // The undef case must return here. Falling through with V2 rewritten to a
+  // full-width vector would later insert a ResVT-sized value as a subvector at
+  // index NumElems/2, which does not fit into the result.
+  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
+  if (V1.isUndef())
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
+
+  if (IsZeroV1)
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
+
+  // Both halves are meaningful: place V1 at the bottom, then V2 on top.
+  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
+}
+
+/// Entry point for CONCAT_VECTORS lowering: vectors of i1 are dispatched to
+/// LowerCONCAT_VECTORSvXi1, every other type to LowerAVXCONCAT_VECTORS.
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
+                                   const X86Subtarget &Subtarget,
+                                   SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  bool IsMaskVector = VT.getVectorElementType() == MVT::i1;
+  if (IsMaskVector)
+    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
+
+  // Only 256-bit results with 2 operands and 512-bit results with 2 or 4
+  // operands are expected here.
+  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
+         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
+          Op.getNumOperands() == 4)));
+
+  // AVX can use the vinsertf128 instruction to create 256-bit vectors
+  // from two other 128-bit ones.
+  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
+  SDValue Lowered = LowerAVXCONCAT_VECTORS(Op, DAG);
+  return Lowered;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector shuffle lowering
+//
+// This is an experimental code path for lowering vector shuffles on x86. It is
+// designed to handle arbitrary vector shuffles and blends, gracefully
+// degrading performance as necessary. It works hard to recognize idiomatic
+// shuffles and lower them to optimal instruction patterns without leaving
+// a framework that allows reasonably efficient handling of all vector shuffle
+// patterns.
+//===----------------------------------------------------------------------===//
+
+/// \brief Identify a shuffle mask that does not move anything.
+///
+/// The input is assumed to be a single-input shuffle mask of the kind used by
+/// the X86 shuffle instructions (not a fully general ShuffleVectorSDNode
+/// mask). An element counts as a no-op when it is undef (negative) or selects
+/// its own position; the mask is a no-op when every element is.
+static bool isNoopShuffleMask(ArrayRef<int> Mask) {
+  const int Size = Mask.size();
+  int i = 0;
+  while (i < Size) {
+    assert(Mask[i] >= -1 && "Out of bound mask element!");
+    bool StaysInPlace = Mask[i] < 0 || Mask[i] == i;
+    if (!StaysInPlace)
+      return false;
+    ++i;
+  }
+  return true;
+}
+
+/// \brief Test whether any element of this shuffle mask crosses a 128-bit
+/// lane.
+///
+/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
+/// and we routinely test for these. Undef (negative) mask elements never
+/// count as crossing.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
+  const int NumElts = Mask.size();
+  for (int Pos = 0; Pos != NumElts; ++Pos) {
+    const int M = Mask[Pos];
+    if (M < 0)
+      continue;
+    // Compare the lane the element is read from (second-input indices are
+    // folded back with % NumElts) with the lane it is written to.
+    int SrcLane = (M % NumElts) / EltsPerLane;
+    int DstLane = Pos / EltsPerLane;
+    if (SrcLane != DstLane)
+      return true;
+  }
+  return false;
+}
+
+/// \brief Test whether a shuffle mask performs the same lane-relative shuffle
+/// in every sub-lane of \p LaneSizeInBits bits.
+///
+/// A repeated mask is necessarily not lane-crossing, but it may blend in
+/// elements from the same lane of a second vector.
+///
+/// On success \p RepeatedMask holds the per-lane mask (it is non-trivial to
+/// compute in the face of undef lanes). Entries taken from the second vector
+/// are remapped to [LaneSize, 2*LaneSize) so the result is suitable for use
+/// with existing single-lane shuffles. \p RepeatedMask is overwritten even
+/// when the function returns false.
+static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
+                                  ArrayRef<int> Mask,
+                                  SmallVectorImpl<int> &RepeatedMask) {
+  const int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+  const int Size = Mask.size();
+  RepeatedMask.assign(LaneSize, -1);
+
+  for (int i = 0; i != Size; ++i) {
+    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
+    const int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // This entry crosses lanes, so there is no way to model this shuffle.
+    if ((M % Size) / LaneSize != i / LaneSize)
+      return false;
+
+    // Lane-relative index; second vector indices start at LaneSize instead
+    // of Size.
+    int LocalM = M % LaneSize;
+    if (M >= Size)
+      LocalM += LaneSize;
+
+    int &Slot = RepeatedMask[i % LaneSize];
+    if (Slot < 0) {
+      // This is the first non-undef entry in this slot of a lane.
+      Slot = LocalM;
+      continue;
+    }
+    // Found a mismatch with the repeated mask.
+    if (Slot != LocalM)
+      return false;
+  }
+  return true;
+}
+
+/// Test whether a shuffle mask is equivalent within each 128-bit lane.
+///
+/// Forwards to isRepeatedShuffleMask with a lane width of 128 bits;
+/// \p RepeatedMask is filled in by that call.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
+}
+
+/// Test whether a shuffle mask is equivalent within each 256-bit lane.
+///
+/// Forwards to isRepeatedShuffleMask with a lane width of 256 bits;
+/// \p RepeatedMask is filled in by that call.
+static bool
+is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
+}
+
+/// Test whether a target shuffle mask is equivalent within each sub-lane of
+/// \p LaneSizeInBits bits.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero: a zero
+/// entry is only compatible with a slot that is still undef or already zero.
+/// \p RepeatedMask receives the per-lane mask, with second-vector entries
+/// remapped to start at LaneSize; it is overwritten even on failure.
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+                                        ArrayRef<int> Mask,
+                                        SmallVectorImpl<int> &RepeatedMask) {
+  const int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+  const int Size = Mask.size();
+  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
+
+  for (int i = 0; i != Size; ++i) {
+    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
+    const int M = Mask[i];
+    if (M == SM_SentinelUndef)
+      continue;
+
+    int &Slot = RepeatedMask[i % LaneSize];
+
+    if (M == SM_SentinelZero) {
+      // A zero entry conflicts with a slot already bound to a real index.
+      if (!isUndefOrZero(Slot))
+        return false;
+      Slot = SM_SentinelZero;
+      continue;
+    }
+
+    // This entry crosses lanes, so there is no way to model this shuffle.
+    if ((M % Size) / LaneSize != i / LaneSize)
+      return false;
+
+    // Lane-relative index; second vector indices start at LaneSize instead
+    // of Size.
+    int LocalM = M % LaneSize;
+    if (M >= Size)
+      LocalM += LaneSize;
+
+    if (Slot == SM_SentinelUndef) {
+      // This is the first non-undef entry in this slot of a lane.
+      Slot = LocalM;
+      continue;
+    }
+    // Found a mismatch with the repeated mask.
+    if (Slot != LocalM)
+      return false;
+  }
+  return true;
+}
+
+/// \brief Checks whether a shuffle mask is equivalent to an expected mask.
+///
+/// This is a fast way to test a shuffle mask against a fixed pattern:
+///
+///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
+///
+/// It returns true if the mask is exactly as wide as the expected mask, and
+/// each element of the mask is either -1 (signifying undef), equal to the
+/// expected value, or - when the referenced inputs are build vectors - refers
+/// to the same build-vector operand as the expected value does.
+static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
+                                ArrayRef<int> ExpectedMask) {
+  if (Mask.size() != ExpectedMask.size())
+    return false;
+
+  const int Size = Mask.size();
+
+  // If the values are build vectors, we can look through them to find
+  // equivalent inputs that make the shuffles equivalent.
+  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
+  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+
+  for (int i = 0; i < Size; ++i) {
+    assert(Mask[i] >= -1 && "Out of bound mask element!");
+    // Undef and exactly matching elements need no further inspection.
+    if (Mask[i] < 0 || Mask[i] == ExpectedMask[i])
+      continue;
+
+    // The indices differ: this is only acceptable when both of them select
+    // the very same operand of a build vector.
+    auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+    auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+    if (!MaskBV || !ExpectedBV)
+      return false;
+    if (MaskBV->getOperand(Mask[i] % Size) !=
+        ExpectedBV->getOperand(ExpectedMask[i] % Size))
+      return false;
+  }
+
+  return true;
+}
+
+/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
+///
+/// The masks must be exactly the same width.
+///
+/// Where Mask holds SM_SentinelUndef (-1) any value in ExpectedMask is
+/// accepted. Every other element must be equal in both masks.
+///
+/// SM_SentinelZero is the only other negative value allowed in Mask, and it
+/// must appear at the same position in ExpectedMask.
+static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
+                                      ArrayRef<int> ExpectedMask) {
+  const int Size = Mask.size();
+  if (Size != (int)ExpectedMask.size())
+    return false;
+
+  for (int i = 0; i < Size; ++i) {
+    const int M = Mask[i];
+    if (M == SM_SentinelUndef)
+      continue;
+
+    bool UnknownSentinel = M < 0 && M != SM_SentinelZero;
+    if (UnknownSentinel || M != ExpectedMask[i])
+      return false;
+  }
+
+  return true;
+}
+
+/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+///
+/// Produces the 8-bit immediate of the ubiquitous x86 encoding for shuffling
+/// 4 lanes: lane i of the result is selected by bits [2*i+1 : 2*i]. It can be
+/// used with most of the PSHUF instructions for example.
+///
+/// NB: We rely heavily on "undef" masks preserving the input lane, so an
+/// undef element i is encoded as i.
+static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
+  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
+  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
+  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
+  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
+  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
+
+  unsigned Imm = 0;
+  for (int Lane = 0; Lane != 4; ++Lane) {
+    int Selected = Mask[Lane] < 0 ? Lane : Mask[Lane];
+    Imm |= Selected << (2 * Lane);
+  }
+  return Imm;
+}
+
+/// Build the i8 constant node holding the 4-lane shuffle immediate that
+/// getV4X86ShuffleImm computes for \p Mask.
+///
+/// \p DL is taken by const reference, like the other helpers in this file,
+/// so that no SDLoc copy is made per call.
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
+                                          SelectionDAG &DAG) {
+  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
+}
+
+/// \brief Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero:
+/// the mask element is undef, or the input element it references is undef or
+/// known to be zero. Many x86 shuffles can zero lanes cheaply and we often
+/// want to handle as many lanes with this technique as possible to simplify
+/// the remaining shuffle.
+///
+/// Both inputs are inspected after looking through bitcasts. Apart from
+/// all-zero inputs, only BUILD_VECTOR inputs are searched.
+static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
+                                                     SDValue V1, SDValue V2) {
+  SmallBitVector Zeroable(Mask.size(), false);
+  V1 = peekThroughBitcasts(V1);
+  V2 = peekThroughBitcasts(V2);
+
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  int VectorSizeInBits = V1.getValueSizeInBits();
+  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
+  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
+  // True when the ScalarSizeInBits-wide chunk number ChunkIdx (counted from
+  // the low bits) of a constant is zero.
+  auto IsZeroChunk = [ScalarSizeInBits](APInt Bits, int ChunkIdx) {
+    Bits = Bits.lshr(ChunkIdx * ScalarSizeInBits);
+    Bits = Bits.getLoBits(ScalarSizeInBits);
+    return Bits == 0;
+  };
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    int M = Mask[i];
+
+    // Handle the easy cases: undef mask element, or an all-zero input.
+    bool ReadsV1 = M < Size;
+    if (M < 0 || (ReadsV1 && V1IsZero) || (!ReadsV1 && V2IsZero)) {
+      Zeroable[i] = true;
+      continue;
+    }
+
+    // Determine shuffle input and normalize the mask.
+    SDValue V = ReadsV1 ? V1 : V2;
+    M %= Size;
+
+    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+    if (V.getOpcode() != ISD::BUILD_VECTOR)
+      continue;
+
+    if ((Size % V.getNumOperands()) == 0) {
+      // The BUILD_VECTOR has fewer (or as many) elements: the bitcasted
+      // portion of the (larger) source element must be UNDEF/ZERO.
+      int Scale = Size / V->getNumOperands();
+      SDValue Op = V.getOperand(M / Scale);
+      if (Op.isUndef() || X86::isZeroNode(Op))
+        Zeroable[i] = true;
+      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op))
+        Zeroable[i] = IsZeroChunk(Cst->getAPIntValue(), M % Scale);
+      else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op))
+        Zeroable[i] =
+            IsZeroChunk(Cst->getValueAPF().bitcastToAPInt(), M % Scale);
+    } else if ((V.getNumOperands() % Size) == 0) {
+      // The BUILD_VECTOR has more elements: all the (smaller) source
+      // elements covered by this mask element must be UNDEF or ZERO.
+      int Scale = V->getNumOperands() / Size;
+      bool AllZeroable = true;
+      for (int j = 0; j < Scale; ++j) {
+        SDValue Op = V.getOperand((M * Scale) + j);
+        if (!Op.isUndef() && !X86::isZeroNode(Op)) {
+          AllZeroable = false;
+          break;
+        }
+      }
+      Zeroable[i] = AllZeroable;
+    }
+  }
+
+  return Zeroable;
+}
+
+/// Attempt to lower a shuffle as one PSHUFB applied to either V1 or V2.
+///
+/// A per-byte control vector is built: undef bytes for undef mask entries,
+/// 0x80 (sign bit set, i.e. zero the byte) for zeroable entries, and an
+/// in-lane byte index otherwise. An empty SDValue is returned when the
+/// non-zeroable elements come from both inputs or when an element would have
+/// to move across a 128-bit lane.
+static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2,
+                                            const SmallBitVector &Zeroable,
+                                            const X86Subtarget &Subtarget,
+                                            SelectionDAG &DAG) {
+  int NumElts = Mask.size();
+  int EltsPerLane = 128 / VT.getScalarSizeInBits();
+  const int NumBytes = VT.getSizeInBits() / 8;
+  const int NumEltBytes = VT.getScalarSizeInBits() / 8;
+
+  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
+         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
+         (Subtarget.hasBWI() && VT.is512BitVector()));
+
+  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
+  // A control byte with its sign bit set zeroes the destination byte.
+  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
+
+  SDValue V;
+  for (int ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) {
+    const int EltIdx = ByteIdx / NumEltBytes;
+    int M = Mask[EltIdx];
+    if (M < 0) {
+      PSHUFBMask[ByteIdx] = DAG.getUNDEF(MVT::i8);
+      continue;
+    }
+    if (Zeroable[EltIdx]) {
+      PSHUFBMask[ByteIdx] = ZeroMask;
+      continue;
+    }
+
+    // Every remaining element has to come from the same single input.
+    SDValue SrcV = (M >= NumElts ? V2 : V1);
+    if (V && V != SrcV)
+      return SDValue();
+    V = SrcV;
+    M %= NumElts;
+
+    // PSHUFB cannot move data between lanes; give up if that is required.
+    if ((M / EltsPerLane) != (EltIdx / EltsPerLane))
+      return SDValue();
+
+    // Translate the in-lane element index into an in-lane byte index.
+    const int ByteInLane =
+        (M % EltsPerLane) * NumEltBytes + (ByteIdx % NumEltBytes);
+    PSHUFBMask[ByteIdx] = DAG.getConstant(ByteInLane, DL, MVT::i8);
+  }
+  assert(V && "Failed to find a source input");
+
+  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
+  return DAG.getBitcast(
+      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
+                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
+}
+
+// Lower a two-input shuffle to a single X86ISD::UNPCKL or X86ISD::UNPCKH node
+// when the mask is equivalent to the binary unpack pattern for VT. The inputs
+// are tried in the given order first and then swapped. An empty SDValue is
+// returned if no form matches.
+static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
+                                           ArrayRef<int> Mask, SDValue V1,
+                                           SDValue V2, SelectionDAG &DAG) {
+  auto Matches = [&](ArrayRef<int> Expected) {
+    return isShuffleEquivalent(V1, V2, Mask, Expected);
+  };
+
+  SmallVector<int, 8> LoPattern;
+  createUnpackShuffleMask(VT, LoPattern, /* Lo = */ true, /* Unary = */ false);
+  if (Matches(LoPattern))
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+
+  SmallVector<int, 8> HiPattern;
+  createUnpackShuffleMask(VT, HiPattern, /* Lo = */ false, /* Unary = */ false);
+  if (Matches(HiPattern))
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+  // Nothing matched as given: commute the patterns and swap the operands.
+  ShuffleVectorSDNode::commuteMask(LoPattern);
+  if (Matches(LoPattern))
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
+
+  ShuffleVectorSDNode::commuteMask(HiPattern);
+  if (Matches(HiPattern))
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
+
+  return SDValue();
+}
+
+/// \brief Try to lower a shuffle as an AND with a constant lane mask.
+///
+/// This applies when every lane that is not zeroable keeps its position and
+/// all such lanes come from the same input, so the blend is exactly that
+/// input with the zeroable lanes masked off. Returns an empty SDValue when
+/// the mask moves elements, mixes both inputs, or has no non-zeroable lane.
+static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef<int> Mask,
+                                           const SmallBitVector &Zeroable,
+                                           SelectionDAG &DAG) {
+  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
+  MVT EltVT = VT.getVectorElementType();
+  SDValue Zero = DAG.getConstant(0, DL, EltVT);
+  SDValue AllOnes =
+      DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
+
+  // Start from an all-zero mask and open up the lanes that must pass through.
+  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+  SDValue Passed;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Zeroable[i])
+      continue;
+
+    const int M = Mask[i];
+    if (M % Size != i)
+      return SDValue(); // Not a blend.
+
+    SDValue Src = M < Size ? V1 : V2;
+    if (Passed && Passed != Src)
+      return SDValue(); // Can only let one input through the mask.
+    Passed = Src;
+
+    VMaskOps[i] = AllOnes;
+  }
+  if (!Passed)
+    return SDValue(); // No non-zeroable elements!
+
+  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
+  return DAG.getNode(ISD::AND, DL, VT, Passed, VMask);
+}
+
+/// \brief Try to lower a blend with AND / ANDNP / OR bit math.
+///
+/// Fallback for when first class blend instructions are unavailable. Only
+/// integer vectors are handled (this could be generalized to floating point
+/// if desirable). The result is (V1 & M) | (~M & V2), where M has all-ones
+/// lanes wherever the mask takes V1 (or is undef) and zero lanes wherever it
+/// takes V2. Returns an empty SDValue if any element changes position.
+static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            SelectionDAG &DAG) {
+  assert(VT.isInteger() && "Only supports integer vector types!");
+  MVT EltVT = VT.getVectorElementType();
+  int NumEltBits = EltVT.getSizeInBits();
+  SDValue Zero = DAG.getConstant(0, DL, EltVT);
+  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
+                                    EltVT);
+
+  SmallVector<SDValue, 16> MaskOps;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    const int M = Mask[i];
+    const bool TakesV2 = (M == i + Size);
+    if (M >= 0 && M != i && !TakesV2)
+      return SDValue(); // Shuffled input!
+    // Undef lanes and in-place V1 lanes both keep V1's bits.
+    MaskOps.push_back(TakesV2 ? Zero : AllOnes);
+  }
+
+  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
+  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
+  // ANDNP is emitted on an i64 vector, so V2 is cast there and back.
+  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
+                                      DAG.getBitcast(MaskVT, V1Mask),
+                                      DAG.getBitcast(MaskVT, V2)));
+  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
+}
+
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't check for the availability of blend instructions for these values (it only asserts AVX2 for the 256-bit integer types).
+/// It relies on the X86ISD::BLENDI pattern (or ISD::VSELECT for the byte-blend path) being matched in the backend
+/// with the type given. What it does check is that every defined mask element i selects lane i of V1 or of V2,
+/// or is zeroable while that input is an all-zeros build vector; otherwise an empty SDValue is returned.
+static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const SmallBitVector &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+ SmallVector<int, 8> Mask(Original.begin(), Original.end()); // Local copy: zeroable lanes are rewritten below.
+ bool ForceV1Zero = false, ForceV2Zero = false;
+
+ // Attempt to generate the binary blend mask (bit i set means lane i comes
+ // from V2). If an input is zero then we can use any lane of it.
+ // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
+ unsigned BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
+ if (M < 0) // Undef lane: leave its blend bit clear.
+ continue;
+ if (M == i) // Lane i of V1, already in place.
+ continue;
+ if (M == i + Size) { // Lane i of V2, already in place.
+ BlendMask |= 1u << i;
+ continue;
+ }
+ if (Zeroable[i]) { // Out-of-place but zeroable: redirect to lane i of a zero input.
+ if (V1IsZero) {
+ ForceV1Zero = true;
+ Mask[i] = i;
+ continue;
+ }
+ if (V2IsZero) {
+ ForceV2Zero = true;
+ BlendMask |= 1u << i;
+ Mask[i] = i + Size;
+ continue;
+ }
+ }
+ return SDValue(); // Shuffled input!
+ }
+
+ // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+ if (ForceV1Zero)
+ V1 = getZeroVector(VT, Subtarget, DAG, DL);
+ if (ForceV2Zero)
+ V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
+ auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { // Repeat each of the Size mask bits Scale times.
+ unsigned ScaledMask = 0;
+ for (int i = 0; i != Size; ++i)
+ if (BlendMask & (1u << i))
+ for (int j = 0; j != Scale; ++j)
+ ScaledMask |= 1u << (i * Scale + j);
+ return ScaledMask;
+ };
+
+ switch (VT.SimpleTy) {
+ case MVT::v2f64:
+ case MVT::v4f32:
+ case MVT::v4f64:
+ case MVT::v8f32:
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)); // Floating point types use BlendMask unscaled.
+
+ case MVT::v4i64:
+ case MVT::v8i32:
+ assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+ LLVM_FALLTHROUGH;
+ case MVT::v2i64:
+ case MVT::v4i32:
+ // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+ // that instruction.
+ if (Subtarget.hasAVX2()) {
+ // Scale the blend by the number of 32-bit dwords per element.
+ int Scale = VT.getScalarSizeInBits() / 32;
+ BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+ V1 = DAG.getBitcast(BlendVT, V1);
+ V2 = DAG.getBitcast(BlendVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
+ }
+ LLVM_FALLTHROUGH; // Without AVX2 the 128-bit types are blended as v8i16 below.
+ case MVT::v8i16: {
+ // For integer shuffles we need to expand the mask and cast the inputs to
+ // v8i16s prior to blending.
+ int Scale = 8 / VT.getVectorNumElements();
+ BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = DAG.getBitcast(MVT::v8i16, V2);
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
+ }
+
+ case MVT::v16i16: {
+ assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+ assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+ BlendMask = 0; // Rebuilt from the 8-element repeated mask.
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 8)
+ BlendMask |= 1u << i;
+ return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
+ }
+ LLVM_FALLTHROUGH; // Mask differs between lanes: use the byte blend below.
+ }
+ case MVT::v16i8:
+ case MVT::v32i8: {
+ assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
+ "256-bit byte-blends require AVX2 support!");
+
+ // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+ return Masked;
+
+ // Scale the blend by the number of bytes per element.
+ int Scale = VT.getScalarSizeInBits() / 8;
+
+ // This form of blend is always done on bytes. Compute the byte vector
+ // type.
+ MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+
+ // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+ // mix of LLVM's code generator and the x86 backend. We tell the code
+ // generator that boolean values in the elements of an x86 vector register
+ // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+ // mapping a select to operand #1, and 'false' mapping to operand #2. The
+ // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+ // of the element (the remaining are ignored) and 0 in that high bit would
+ // mean operand #1 while 1 in the high bit would mean operand #2. So while
+ // the LLVM model for boolean values in vector elements gets the relevant
+ // bit set, it is set backwards and over constrained relative to x86's
+ // actual model.
+ SmallVector<SDValue, 32> VSELECTMask;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ for (int j = 0; j < Scale; ++j) // One control byte per byte of element i.
+ VSELECTMask.push_back(
+ Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+ : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
+ MVT::i8)); // -1 selects V1, 0 selects V2.
+
+ V1 = DAG.getBitcast(BlendVT, V1);
+ V2 = DAG.getBitcast(BlendVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
+ DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
+ }
+
+ default:
+ llvm_unreachable("Not a supported integer vector type!");
+ }
+}
+
+/// \brief Try to lower as a two-input blend followed by a single-input
+/// permutation.
+///
+/// Each referenced element is first blended into the lane it already occupies
+/// in its source (index modulo the vector size), and the blended vector is
+/// then permuted into the requested order. Returns an empty SDValue when two
+/// different elements would need the same blend lane.
+static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
+                                                   SDValue V1, SDValue V2,
+                                                   ArrayRef<int> Mask,
+                                                   SelectionDAG &DAG) {
+  // Both masks are filled in while checking that a blend is viable at all.
+  SmallVector<int, 32> BlendMask(Mask.size(), -1);
+  SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    const int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+    const int Lane = M % Size;
+    int &Slot = BlendMask[Lane];
+    if (Slot < 0)
+      Slot = M;
+    else if (Slot != M)
+      return SDValue(); // Can't blend in the needed input!
+
+    PermuteMask[i] = Lane;
+  }
+
+  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
+/// \brief Generic routine to decompose a shuffle and blend into independent
+/// blends and permutes.
+///
+/// This covers the very common combined shuffle+blend pattern on newer X86
+/// ISAs, where blends are fast. Two arrangements are considered: blend first
+/// and permute the result, or permute each input separately and blend the two
+/// permuted vectors.
+static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
+                                                          MVT VT, SDValue V1,
+                                                          SDValue V2,
+                                                          ArrayRef<int> Mask,
+                                                          SelectionDAG &DAG) {
+  // Per-input masks that move each element into its final lane, plus the
+  // blend mask that then picks the right input for every lane.
+  SmallVector<int, 32> V1Mask(Mask.size(), -1);
+  SmallVector<int, 32> V2Mask(Mask.size(), -1);
+  SmallVector<int, 32> BlendMask(Mask.size(), -1);
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    const int M = Mask[i];
+    if (M < 0)
+      continue;
+    if (M < Size) {
+      V1Mask[i] = M;
+      BlendMask[i] = i;
+    } else {
+      V2Mask[i] = M - Size;
+      BlendMask[i] = i + Size;
+    }
+  }
+
+  // Shuffling the inputs is preferred when one of the input shuffles is a
+  // no-op, since a shuffle may fold with a load or bring some other benefit.
+  // When both inputs would really have to be shuffled, that costs twice as
+  // many shuffles, so the simpler blend-first strategy is tried instead.
+  const bool EitherInputIsNoop =
+      isNoopShuffleMask(V1Mask) || isNoopShuffleMask(V2Mask);
+  if (!EitherInputIsNoop)
+    if (SDValue BlendPerm =
+            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+      return BlendPerm;
+
+  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+}
+
+/// \brief Try to match a vector shuffle mask as an element rotation.
+///
+/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
+///
+/// \returns the rotation amount in elements, or -1 if the mask is not a
+/// rotation. On success \p V1 and \p V2 are overwritten with the low and high
+/// inputs of the rotation; on failure they are left untouched.
+static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
+                                      ArrayRef<int> Mask) {
+  const int NumElts = Mask.size();
+
+  // A rotation can be spelled in several ways, all of which must be found:
+  //   [11, 12, 13, 14, 15,  0,  1,  2]
+  //   [-1, 12, 13, 14, -1, -1,  1, -1]
+  //   [-1, -1, -1, -1, -1, -1,  1,  2]
+  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
+  //   [-1,  4,  5,  6, -1, -1,  9, -1]
+  //   [-1,  4,  5,  6, -1, -1, -1, -1]
+  int Rotation = 0;
+  SDValue Lo, Hi;
+  for (int i = 0; i < NumElts; ++i) {
+    const int M = Mask[i];
+    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
+           "Unexpected mask index.");
+    if (M < 0)
+      continue;
+
+    // Where would a rotated vector containing this element have started?
+    const int StartIdx = i - (M % NumElts);
+    if (StartIdx == 0)
+      return -1; // The identity rotation isn't interesting, stop.
+
+    // A negative start means we are looking at the tail of a vector, and the
+    // rotation is the missing front. Otherwise we are looking at the head,
+    // and the rotation is how much of the head is present.
+    const bool SawTail = StartIdx < 0;
+    const int CandidateRotation = SawTail ? -StartIdx : NumElts - StartIdx;
+
+    // Every defined element has to agree on a single rotation amount.
+    if (Rotation != 0 && Rotation != CandidateRotation)
+      return -1;
+    Rotation = CandidateRotation;
+
+    // The input this element is read from.
+    SDValue MaskV = M < NumElts ? V1 : V2;
+
+    // Tail elements belong to the high input, head elements to the low one.
+    SDValue &TargetV = SawTail ? Hi : Lo;
+
+    // Record the input on first sight, afterwards insist it stays the same.
+    if (!TargetV)
+      TargetV = MaskV;
+    else if (TargetV != MaskV)
+      return -1; // Possibly a rotation, but with unsupported interleaving.
+  }
+
+  // Check that we successfully analyzed the mask, and normalize the results.
+  assert(Rotation != 0 && "Failed to locate a viable rotation!");
+  assert((Lo || Hi) && "Failed to find a rotated input vector!");
+  if (!Lo)
+    Lo = Hi;
+  else if (!Hi)
+    Hi = Lo;
+
+  V1 = Lo;
+  V2 = Hi;
+
+  return Rotation;
+}
+
+/// \brief Try to match a vector shuffle as a byte rotation.
+///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine only
+/// decides whether the mask is valid to lower through such a pattern, not
+/// whether doing so is profitable. It matches shuffle vectors that look like:
+///
+///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+///
+/// \returns the rotation in bytes, or -1 if the mask does not match. \p V1 and
+/// \p V2 are updated by matchVectorShuffleAsRotate on a successful match.
+static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+                                          ArrayRef<int> Mask) {
+  // Shuffles with explicit zero elements are not accepted.
+  for (int M : Mask)
+    if (M == SM_SentinelZero)
+      return -1;
+
+  // PALIGNR works on 128-bit lanes, so every lane must use the same pattern.
+  SmallVector<int, 16> RepeatedMask;
+  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
+    return -1;
+
+  const int EltRotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
+  if (EltRotation <= 0)
+    return -1;
+
+  // PALIGNR rotates bytes, so convert the element rotation using the number
+  // of bytes per element of the 16-byte lane.
+  const int NumElts = RepeatedMask.size();
+  const int BytesPerElt = 16 / NumElts;
+  return EltRotation * BytesPerElt;
+}
+
+/// Lower a shuffle matched by matchVectorShuffleAsByteRotate.
+///
+/// With SSSE3 a single X86ISD::PALIGNR on the byte vector type is emitted.
+/// Otherwise (128-bit vectors only) the rotation is built from a VSHLDQ of
+/// the low input, a VSRLDQ of the high input and an OR of the two. Returns an
+/// empty SDValue when the mask is not a byte rotation.
+static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
+                                              SDValue V1, SDValue V2,
+                                              ArrayRef<int> Mask,
+                                              const X86Subtarget &Subtarget,
+                                              SelectionDAG &DAG) {
+  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+  SDValue Lo = V1, Hi = V2;
+  const int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
+  if (ByteRotation <= 0)
+    return SDValue();
+
+  // PALIGNR and PSLLDQ/PSRLDQ all work on i8 vectors of the full width.
+  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+  Lo = DAG.getBitcast(ByteVT, Lo);
+  Hi = DAG.getBitcast(ByteVT, Hi);
+
+  // SSSE3 targets can use the palignr instruction.
+  if (Subtarget.hasSSSE3()) {
+    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
+           "512-bit PALIGNR requires BWI instructions");
+    SDValue Amt = DAG.getConstant(ByteRotation, DL, MVT::i8);
+    SDValue Rotated = DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, Amt);
+    return DAG.getBitcast(VT, Rotated);
+  }
+
+  assert(VT.is128BitVector() &&
+         "Rotate-based lowering only supports 128-bit lowering!");
+  assert(Mask.size() <= 16 &&
+         "Can shuffle at most 16 bytes in a 128-bit vector!");
+  assert(ByteVT == MVT::v16i8 &&
+         "SSE2 rotate lowering only needed for v16i8!");
+
+  // Default SSE2 implementation: shift each half into place and merge them.
+  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
+                                DAG.getConstant(16 - ByteRotation, DL, MVT::i8));
+  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
+                                DAG.getConstant(ByteRotation, DL, MVT::i8));
+  SDValue Merged = DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift);
+  return DAG.getBitcast(VT, Merged);
+}
+
+/// \brief Try to lower a vector shuffle as a dword/qword rotation.
+///
+/// AVX512 has VALIGND/VALIGNQ instructions that do an arbitrary rotation of
+/// the concatenation of two vectors; this routine tries to lower a shuffle
+/// through such a pattern and emits a single X86ISD::VALIGN node.
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+///
+/// \returns the VALIGN node, or an empty SDValue if the mask is no rotation.
+static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
+                                          SDValue V1, SDValue V2,
+                                          ArrayRef<int> Mask,
+                                          const X86Subtarget &Subtarget,
+                                          SelectionDAG &DAG) {
+  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+         "Only 32-bit and 64-bit elements are supported!");
+
+  // 128/256-bit vectors are only supported with VLX.
+  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
+         && "VLX required for 128/256-bit vectors");
+
+  // The matcher rewrites its two operands into the inputs to rotate.
+  SDValue LoInput = V1, HiInput = V2;
+  const int NumEltsRotated = matchVectorShuffleAsRotate(LoInput, HiInput, Mask);
+  if (NumEltsRotated <= 0)
+    return SDValue();
+
+  SDValue Amt = DAG.getConstant(NumEltsRotated, DL, MVT::i8);
+  return DAG.getNode(X86ISD::VALIGN, DL, VT, LoInput, HiInput, Amt);
+}
+
+/// \brief Try to match a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
+/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
+/// matches elements from one of the input vectors shuffled to the left or
+/// right with zeroable elements 'shifted in'. It handles both the strictly
+/// bit-wise element shifts and the byte shift across an entire 128-bit double
+/// quad word lane.
+/// On success it returns the shift amount (in bits, or in bytes for a byte shift) and sets \p ShiftVT and \p Opcode; it returns -1 when nothing matches. \p MaskOffset is added to the expected source indices.
+/// PSLL : (little-endian) left bit shift.
+/// [ zz, 0, zz, 2 ]
+/// [ -1, 4, zz, -1 ]
+/// PSRL : (little-endian) right bit shift.
+/// [ 1, zz, 3, zz]
+/// [ -1, -1, 7, zz]
+/// PSLLDQ : (little-endian) left byte shift
+/// [ zz, 0, 1, 2, 3, 4, 5, 6]
+/// [ zz, zz, -1, -1, 2, 3, 4, -1]
+/// [ zz, zz, zz, zz, zz, zz, -1, 1]
+/// PSRLDQ : (little-endian) right byte shift
+/// [ 5, 6, 7, zz, zz, zz, zz, zz]
+/// [ -1, 5, 6, 7, zz, zz, zz, zz]
+/// [ 1, 2, -1, -1, -1, -1, zz, zz]
+static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+ unsigned ScalarSizeInBits,
+ ArrayRef<int> Mask, int MaskOffset,
+ const SmallBitVector &Zeroable,
+ const X86Subtarget &Subtarget) {
+ int Size = Mask.size();
+ unsigned SizeInBits = Size * ScalarSizeInBits;
+
+ auto CheckZeros = [&](int Shift, int Scale, bool Left) { // In every Scale-wide group, the Shift positions the shift would fill must be zeroable.
+ for (int i = 0; i < Size; i += Scale)
+ for (int j = 0; j < Shift; ++j)
+ if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) // Low positions for a left shift, high positions for a right shift.
+ return false;
+
+ return true;
+ };
+
+ auto MatchShift = [&](int Shift, int Scale, bool Left) { // Returns the shift amount and sets Opcode/ShiftVT, or -1 on mismatch.
+ for (int i = 0; i != Size; i += Scale) {
+ unsigned Pos = Left ? i + Shift : i; // First result position holding a surviving element.
+ unsigned Low = Left ? i : i + Shift; // First source element expected at Pos.
+ unsigned Len = Scale - Shift; // Number of surviving elements per group.
+ if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
+ return -1;
+ }
+
+ int ShiftEltBits = ScalarSizeInBits * Scale;
+ bool ByteShift = ShiftEltBits > 64; // Wider than 64 bits: use the whole-lane byte shift opcodes.
+ Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+ : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+ int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); // Bytes for a byte shift, bits otherwise.
+
+ // Normalize the scale for byte shifts to still produce an i64 element
+ // type.
+ Scale = ByteShift ? Scale / 2 : Scale;
+
+ // We need to round trip through the appropriate type for the shift.
+ MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
+ ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
+ : MVT::getVectorVT(ShiftSVT, Size / Scale);
+ return (int)ShiftAmt;
+ };
+
+ // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+ // keep doubling the size of the integer elements up to that. We can
+ // then shift the elements of the integer vector by whole multiples of
+ // their width within the elements of the larger integer vector. Test each
+ // multiple to see if we can find a match with the moved element indices
+ // and that the shifted in elements are all zeroable.
+ unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128); // 128-wide (byte shift) groups are skipped for 512-bit vectors without BWI.
+ for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; ++Shift)
+ for (bool Left : {true, false})
+ if (CheckZeros(Shift, Scale, Left)) {
+ int ShiftAmt = MatchShift(Shift, Scale, Left);
+ if (0 < ShiftAmt) // The first match wins.
+ return ShiftAmt;
+ }
+
+ // no match
+ return -1;
+}
+
+/// Lower a shuffle as a single logical shift of V1 or V2.
+///
+/// The mask is matched against V1 first and, if that fails, against V2. On a
+/// match the chosen input is bitcast to the shift type reported by the
+/// matcher, shifted by the reported immediate and bitcast back to \p VT.
+/// Returns an empty SDValue when neither input matches.
+static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
+                                         SDValue V2, ArrayRef<int> Mask,
+                                         const SmallBitVector &Zeroable,
+                                         const X86Subtarget &Subtarget,
+                                         SelectionDAG &DAG) {
+  int Size = Mask.size();
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  MVT ShiftVT;
+  unsigned Opcode;
+
+  // First attempt: element indices are relative to V1 (mask offset 0).
+  SDValue Shifted = V1;
+  int Amount = matchVectorShuffleAsShift(
+      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
+
+  // Second attempt: element indices are relative to V2 (mask offset Size).
+  if (Amount < 0) {
+    Amount =
+        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+                                  Mask, Size, Zeroable, Subtarget);
+    Shifted = V2;
+  }
+
+  if (Amount < 0)
+    return SDValue();
+
+  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+         "Illegal integer vector type");
+  Shifted = DAG.getBitcast(ShiftVT, Shifted);
+  Shifted = DAG.getNode(Opcode, DL, ShiftVT, Shifted,
+                        DAG.getConstant(Amount, DL, MVT::i8));
+  return DAG.getBitcast(VT, Shifted);
+}
+
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ; the upper half of the mask must be undef, EXTRQ is tried before INSERTQ, and an empty SDValue is returned if neither form matches.
+static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SelectionDAG &DAG) {
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+ assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+ // Upper half must be undefined.
+ if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ return SDValue();
+
+ // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+ // Remainder of lower half result is zero and upper half is all undef.
+ auto LowerAsEXTRQ = [&]() {
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len > 0; --Len) // Trim trailing zeroable elements of the lower half.
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
+
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1; // Source start index; -1 until the first defined element is seen.
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
+
+ // The extracted elements must start at a valid index and all mask
+ // elements must be in the lower half.
+ if (i > M || M >= HalfSize)
+ return SDValue();
+
+ if (Idx < 0 || (Src == V && Idx == (M - i))) { // First defined element, or consistent with the ones before.
+ Src = V;
+ Idx = M - i;
+ continue;
+ }
+ return SDValue();
+ }
+
+ if (Idx < 0) // No defined element in the first Len positions.
+ return SDValue();
+
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; // Only the low 6 bits are kept, so a length of 64 becomes 0.
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ };
+
+ if (SDValue ExtrQ = LowerAsEXTRQ())
+ return ExtrQ;
+
+ // INSERTQ: Extract lowest Len elements from lower half of second source and
+ // insert over first source, starting at Idx.
+ // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+ auto LowerAsInsertQ = [&]() {
+ for (int Idx = 0; Idx != HalfSize; ++Idx) { // Try every insertion point in the lower half.
+ SDValue Base;
+
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // Extend the extraction length looking to match both the insertion of
+ // the second source and the remaining elements of the first.
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { // Hi is one past the last inserted element.
+ SDValue Insert;
+ int Len = Hi - Idx;
+
+ // Match insertion.
+ if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+ Insert = V1;
+ } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+ Insert = V2;
+ } else {
+ continue;
+ }
+
+ // Match the remaining elements of the lower half.
+ if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+ /* EMPTY */
+ } else if ((!Base || (Base == V1)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+ Base = V1;
+ } else if ((!Base || (Base == V2)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+ Size + Hi)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // We may not have a base (first source) - this can safely be undefined.
+ if (!Base)
+ Base = DAG.getUNDEF(VT);
+
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; // Only the low 6 bits are kept, as in the EXTRQ case.
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ }
+ }
+
+ return SDValue();
+ };
+
+ if (SDValue InsertQ = LowerAsInsertQ())
+ return InsertQ;
+
+ return SDValue();
+}
+
+/// \brief Lower a vector shuffle as a zero or any extension.
+///
+/// Given a specific number of elements, element bit width, and extension
+/// stride, produce either a zero or any extension based on the available
+/// features of the subtarget. The extended elements are consecutive and
+/// can start from an offset element index in the input; to
+/// avoid excess shuffling the offset must either be in the bottom lane
+/// or at the start of a higher lane. All extended elements must be from
+/// the same lane.
+static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+ const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
+ ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(Scale > 1 && "Need a scale to extend.");
+ int EltBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = 128 / EltBits;
+ int OffsetLane = Offset / NumEltsPerLane;
+ assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+ "Only 8, 16, and 32 bit elements can be extended.");
+ assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+ assert(0 <= Offset && "Extension offset must be positive.");
+ assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
+ "Extension offset must be in the first lane or start an upper lane.");
+
+ // Check that an index is in same lane as the base offset.
+ auto SafeOffset = [&](int Idx) {
+ return OffsetLane == (Idx / NumEltsPerLane);
+ };
+
+ // Shift along an input so that the offset base moves to the first element.
+ auto ShuffleOffset = [&](SDValue V) {
+ if (!Offset)
+ return V;
+
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = 0; i * Scale < NumElements; ++i) {
+ int SrcIdx = i + Offset;
+ ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
+ }
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
+ };
+
+ // Found a valid zext mask! Try various lowering strategies based on the
+ // input type and available ISA extensions.
+ if (Subtarget.hasSSE41()) {
+ // Not worth offseting 128-bit vectors if scale == 2, a pattern using
+ // PUNPCK will catch this in a later shuffle match.
+ if (Offset && Scale == 2 && VT.is128BitVector())
+ return SDValue();
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
+ NumElements / Scale);
+ InputV = ShuffleOffset(InputV);
+
+ // For 256-bit vectors, we only need the lower (128-bit) input half.
+ // For 512-bit vectors, we only need the lower input half or quarter.
+ if (VT.getSizeInBits() > 128)
+ InputV = extractSubVector(InputV, 0, DAG, DL,
+ std::max(128, (int)VT.getSizeInBits() / Scale));
+
+ InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
+ return DAG.getBitcast(VT, InputV);
+ }
+
+ assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+
+ // For any extends we can cheat for larger element sizes and use shuffle
+ // instructions that can fold with a load and/or copy.
+ if (AnyExt && EltBits == 32) {
+ int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
+ -1};
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+ if (AnyExt && EltBits == 16 && Scale > 2) {
+ int PSHUFDMask[4] = {Offset / 2, -1,
+ SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
+ InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
+ int PSHUFWMask[4] = {1, -1, -1, -1};
+ unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
+ return DAG.getBitcast(
+ VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v8i16, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
+ }
+
+ // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+ // to 64-bits.
+ if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
+ assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+ assert(VT.is128BitVector() && "Unexpected vector width!");
+
+ int LoIdx = Offset * EltBits;
+ SDValue Lo = DAG.getBitcast(
+ MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(LoIdx, DL, MVT::i8)));
+
+ if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
+ !SafeOffset(Offset + 1))
+ return DAG.getBitcast(VT, Lo);
+
+ int HiIdx = (Offset + 1) * EltBits;
+ SDValue Hi = DAG.getBitcast(
+ MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(HiIdx, DL, MVT::i8)));
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+ }
+
+ // If this would require more than 2 unpack instructions to expand, use
+ // pshufb when available. We can only use more than 2 unpack instructions
+ // when zero extending i8 elements which also makes it easier to use pshufb.
+ if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
+ assert(NumElements == 16 && "Unexpected byte vector width!");
+ SDValue PSHUFBMask[16];
+ for (int i = 0; i < 16; ++i) {
+ int Idx = Offset + (i / Scale);
+ PSHUFBMask[i] = DAG.getConstant(
+ (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
+ }
+ InputV = DAG.getBitcast(MVT::v16i8, InputV);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+ DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
+ }
+
+ // If we are extending from an offset, ensure we start on a boundary that
+ // we can unpack from.
+ int AlignToUnpack = Offset % (NumElements / Scale);
+ if (AlignToUnpack) {
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = AlignToUnpack; i < NumElements; ++i)
+ ShMask[i - AlignToUnpack] = i;
+ InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
+ Offset -= AlignToUnpack;
+ }
+
+ // Otherwise emit a sequence of unpacks.
+ do {
+ unsigned UnpackLoHi = X86ISD::UNPCKL;
+ if (Offset >= (NumElements / 2)) {
+ UnpackLoHi = X86ISD::UNPCKH;
+ Offset -= (NumElements / 2);
+ }
+
+ MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+ SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+ : getZeroVector(InputVT, Subtarget, DAG, DL);
+ InputV = DAG.getBitcast(InputVT, InputV);
+ InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
+ Scale /= 2;
+ EltBits *= 2;
+ NumElements /= 2;
+ } while (Scale > 1);
+ return DAG.getBitcast(VT, InputV);
+}
+
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
+///
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering, it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
+///
+/// \param Mask      Shuffle mask with one entry per element of \p VT; entries
+///                  >= NumElements refer to \p V2, negative entries are undef.
+/// \param Zeroable  Per-mask-element flags, indexed like \p Mask, marking
+///                  result elements that may be treated as zero.
+/// \returns The lowered value, or an empty SDValue if no extension pattern
+///          (nor the 128-bit low-half MOVQ pattern) matches.
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ int Bits = VT.getSizeInBits();
+ int NumLanes = Bits / 128;
+ int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = NumElements / NumLanes;
+ assert(VT.getScalarSizeInBits() <= 32 &&
+ "Exceeds 32-bit integer zero extension limit");
+ assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
+
+ // Define a helper function to check a particular ext-scale and lower to it if
+ // valid.
+ auto Lower = [&](int Scale) -> SDValue {
+ SDValue InputV;
+ bool AnyExt = true;
+ int Offset = 0;
+ // Number of defined base elements that matched the pattern.
+ int Matches = 0;
+ for (int i = 0; i < NumElements; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue; // Valid anywhere but doesn't tell us anything.
+ if (i % Scale != 0) {
+ // Each of the extended elements need to be zeroable.
+ if (!Zeroable[i])
+ return SDValue();
+
+ // We no longer are in the anyext case.
+ AnyExt = false;
+ continue;
+ }
+
+ // Each of the base elements needs to be consecutive indices into the
+ // same input vector.
+ SDValue V = M < NumElements ? V1 : V2;
+ M = M % NumElements;
+ if (!InputV) {
+ // The first defined base element fixes the input and the offset.
+ InputV = V;
+ Offset = M - (i / Scale);
+ } else if (InputV != V)
+ return SDValue(); // Flip-flopping inputs.
+
+ // Offset must start in the lowest 128-bit lane or at the start of an
+ // upper lane.
+ // FIXME: Is it ever worth allowing a negative base offset?
+ if (!((0 <= Offset && Offset < NumEltsPerLane) ||
+ (Offset % NumEltsPerLane) == 0))
+ return SDValue();
+
+ // If we are offsetting, all referenced entries must come from the same
+ // lane.
+ if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
+ return SDValue();
+
+ if ((M % NumElements) != (Offset + (i / Scale)))
+ return SDValue(); // Non-consecutive strided elements.
+ Matches++;
+ }
+
+ // If we fail to find an input, we have a zero-shuffle which should always
+ // have already been handled.
+ // FIXME: Maybe handle this here in case during blending we end up with one?
+ if (!InputV)
+ return SDValue();
+
+ // If we are offsetting, don't extend if we only match a single input, we
+ // can always do better by using a basic PSHUF or PUNPCK.
+ if (Offset != 0 && Matches < 2)
+ return SDValue();
+
+ return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+ DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
+ };
+
+ // The widest scale possible for extending is to a 64-bit integer.
+ assert(Bits % 64 == 0 &&
+ "The number of bits in a vector must be divisible by 64 on x86!");
+ int NumExtElements = Bits / 64;
+
+ // Each iteration, try extending the elements half as much, but into twice as
+ // many elements.
+ for (; NumExtElements < NumElements; NumExtElements *= 2) {
+ assert(NumElements % NumExtElements == 0 &&
+ "The input vector size must be divisible by the extended size.");
+ if (SDValue V = Lower(NumElements / NumExtElements))
+ return V;
+ }
+
+ // General extends failed, but 128-bit vectors may be able to use MOVQ.
+ if (Bits != 128)
+ return SDValue();
+
+ // Returns one of the source operands if the shuffle can be reduced to a
+ // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
+ auto CanZExtLowHalf = [&]() {
+ // The whole upper half of the result must be zeroable.
+ for (int i = NumElements / 2; i != NumElements; ++i)
+ if (!Zeroable[i])
+ return SDValue();
+ // The lower half must be the in-order lower half of V1 or of V2.
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+ return V1;
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+ return V2;
+ return SDValue();
+ };
+
+ if (SDValue V = CanZExtLowHalf()) {
+ V = DAG.getBitcast(MVT::v2i64, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
+ return DAG.getBitcast(VT, V);
+ }
+
+ // No viable ext lowering found.
+ return SDValue();
+}
+
+/// \brief Find the scalar that feeds one element of a vector, if any.
+///
+/// Bitcasts on \p V are looked through first. A scalar is only returned when
+/// the underlying node is a BUILD_VECTOR, or a SCALAR_TO_VECTOR queried at
+/// index 0, and the operand is exactly as wide as the requested element.
+///
+/// \returns The operand bitcast to the element type of \p V, or an empty
+///          SDValue when no equivalent scalar can be found.
+static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
+                                              SelectionDAG &DAG) {
+  const MVT OrigVT = V.getSimpleValueType();
+  const MVT ScalarVT = OrigVT.getVectorElementType();
+  V = peekThroughBitcasts(V);
+
+  // A bitcast that changed the element width means element Idx of the source
+  // no longer lines up with element Idx of the original value.
+  const MVT SrcVT = V.getSimpleValueType();
+  if (!SrcVT.isVector())
+    return SDValue();
+  if (SrcVT.getScalarSizeInBits() != OrigVT.getScalarSizeInBits())
+    return SDValue();
+
+  const unsigned Opc = V.getOpcode();
+  const bool IsBuildVector = Opc == ISD::BUILD_VECTOR;
+  const bool IsLowScalarToVector = Idx == 0 && Opc == ISD::SCALAR_TO_VECTOR;
+  if (!IsBuildVector && !IsLowScalarToVector)
+    return SDValue();
+
+  // Only accept an operand whose size equals the destination element size.
+  // FIXME: Add support for scalar truncation where possible.
+  SDValue Scalar = V.getOperand(Idx);
+  if (ScalarVT.getSizeInBits() != Scalar.getSimpleValueType().getSizeInBits())
+    return SDValue();
+  return DAG.getBitcast(ScalarVT, Scalar);
+}
+
+/// \brief Report whether a value is a load that x86 shuffles can fold.
+///
+/// The choice of shuffle instruction depends heavily on whether an operand
+/// comes straight from memory, so callers use this to pick between forms.
+/// Bitcasts are looked through before testing for a non-extending load.
+static bool isShuffleFoldableLoad(SDValue V) {
+  SDValue Src = peekThroughBitcasts(V);
+  return ISD::isNON_EXTLoad(Src.getNode());
+}
+
+/// \brief Try to lower insertion of a single element into a zero vector.
+///
+/// This is a common pattern that we have especially efficient patterns to lower
+/// across all subtarget feature sets.
+///
+/// \param Mask      Shuffle mask; the first entry >= Mask.size() identifies
+///                  the element taken from \p V2.
+/// \param Zeroable  Per-mask-element flags, indexed like \p Mask, marking
+///                  result elements that may be treated as zero.
+/// \returns The lowered value, or an empty SDValue when none of the cheap
+///          insertion patterns applies.
+static SDValue lowerVectorShuffleAsElementInsertion(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT ExtVT = VT;
+ MVT EltVT = VT.getVectorElementType();
+
+ // Position in the result of the first element sourced from V2.
+ // NOTE(review): Mask[V2Index] is read below without a bounds check, so
+ // callers presumably guarantee the mask references V2 - confirm.
+ int V2Index =
+ find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
+ Mask.begin();
+ // V1 counts as zero if every result element other than V2Index is zeroable.
+ bool IsV1Zeroable = true;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (i != V2Index && !Zeroable[i]) {
+ IsV1Zeroable = false;
+ break;
+ }
+
+ // Check for a single input from a SCALAR_TO_VECTOR node.
+ // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+ // all the smarts here sunk into that routine. However, the current
+ // lowering of BUILD_VECTOR makes that nearly impossible until the old
+ // vector shuffle lowering is dead.
+ SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
+ DAG);
+ if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
+ // We need to zext the scalar if it is smaller than an i32.
+ V2S = DAG.getBitcast(EltVT, V2S);
+ if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+ // Using zext to expand a narrow element won't work for non-zero
+ // insertions.
+ if (!IsV1Zeroable)
+ return SDValue();
+
+ // Zero-extend directly to i32.
+ ExtVT = MVT::v4i32;
+ V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+ }
+ // Rebuild V2 as a vector whose low element is the scalar.
+ V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
+ } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
+ EltVT == MVT::i16) {
+ // Either not inserting from the low element of the input or the input
+ // element size is too small to use VZEXT_MOVL to clear the high bits.
+ return SDValue();
+ }
+
+ if (!IsV1Zeroable) {
+ // If V1 can't be treated as a zero vector we have fewer options to lower
+ // this. We can't support integer vectors or non-zero targets cheaply, and
+ // the V1 elements can't be permuted in any way.
+ assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
+ if (!VT.isFloatingPoint() || V2Index != 0)
+ return SDValue();
+ // Ignoring the inserted slot, V1's elements must already be in place.
+ SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
+ V1Mask[V2Index] = -1;
+ if (!isNoopShuffleMask(V1Mask))
+ return SDValue();
+ // This is essentially a special case blend operation, but if we have
+ // general purpose blend operations, they are always faster. Bail and let
+ // the rest of the lowering handle these as blends.
+ if (Subtarget.hasSSE41())
+ return SDValue();
+
+ // Otherwise, use MOVSD or MOVSS.
+ assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
+ "Only two types of floating point element types to handle!");
+ return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
+ ExtVT, V1, V2);
+ }
+
+ // This lowering only works for the low element with floating point vectors.
+ if (VT.isFloatingPoint() && V2Index != 0)
+ return SDValue();
+
+ // Wrap V2 in a VZEXT_MOVL node, then return to the requested type if the
+ // scalar was widened above.
+ V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
+ if (ExtVT != VT)
+ V2 = DAG.getBitcast(VT, V2);
+
+ if (V2Index != 0) {
+ // If we have 4 or fewer lanes we can cheaply shuffle the element into
+ // the desired position. Otherwise it is more efficient to do a vector
+ // shift left. We know that we can do a vector shift left because all
+ // the inputs are zero.
+ if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
+ // Every slot reads element 1 except the target slot, which reads
+ // element 0.
+ SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+ V2Shuffle[V2Index] = 0;
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+ } else {
+ // Byte-shift the whole vector left by the byte offset of slot V2Index.
+ V2 = DAG.getBitcast(MVT::v16i8, V2);
+ V2 = DAG.getNode(
+ X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
+ DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
+ DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
+ DAG.getDataLayout(), VT)));
+ V2 = DAG.getBitcast(VT, V2);
+ }
+ }
+ return V2;
+}
+
+/// Try to lower the broadcast of one integer element that is a truncated
+/// piece of a wider scalar, where \p V0 is the scalar_to_vector/build_vector
+/// node holding the wider elements.
+///
+/// \p BroadcastIdx is expressed in elements of \p VT. It is mapped onto the
+/// wider operand of \p V0 that contains it, and that operand is shifted right
+/// when the wanted piece is not its least-significant part.
+///
+/// This assumes we have AVX2.
+///
+/// \returns A VBROADCAST of the truncated scalar, or an empty SDValue when
+///          \p V0 is not a suitable wider integer source.
+static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
+                                                  SDValue V0, int BroadcastIdx,
+                                                  const X86Subtarget &Subtarget,
+                                                  SelectionDAG &DAG) {
+  assert(Subtarget.hasAVX2() &&
+         "We can only lower integer broadcasts with AVX2!");
+
+  EVT DstEltVT = VT.getVectorElementType();
+  EVT SrcVT = V0.getValueType();
+
+  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
+  assert(SrcVT.isVector() && "Unexpected non-vector vector-sized value!");
+
+  EVT SrcEltVT = SrcVT.getVectorElementType();
+  if (!SrcEltVT.isInteger())
+    return SDValue();
+
+  const unsigned DstBits = DstEltVT.getSizeInBits();
+  const unsigned SrcBits = SrcEltVT.getSizeInBits();
+
+  // Without a strictly wider source element there is nothing to truncate.
+  if (SrcBits <= DstBits)
+    return SDValue();
+
+  assert(((SrcBits % DstBits) == 0) &&
+         "Scalar type sizes must all be powers of 2 on x86!");
+
+  // Ratio destination elements fit in each source element.
+  const unsigned Ratio = SrcBits / DstBits;
+  const unsigned SrcIdx = BroadcastIdx / Ratio;
+  const unsigned SrcOpc = V0.getOpcode();
+
+  // Accept any BUILD_VECTOR operand, but only operand 0 of SCALAR_TO_VECTOR.
+  const bool IsBuildVector = SrcOpc == ISD::BUILD_VECTOR;
+  const bool IsLowScalarToVector =
+      SrcOpc == ISD::SCALAR_TO_VECTOR && SrcIdx == 0;
+  if (!IsBuildVector && !IsLowScalarToVector)
+    return SDValue();
+
+  SDValue Scalar = V0.getOperand(SrcIdx);
+
+  // When the wanted piece is not in the low bits, shift it down so that the
+  // truncate below picks it up. Hopefully the trunc/srl/load folds into the
+  // broadcast; even when it does not (and !isShuffleFoldableLoad(Scalar)),
+  // vpbroadcast+vmovd+shr is preferred to vpshufb(m)+vmovd.
+  const int SubIdx = BroadcastIdx % Ratio;
+  if (SubIdx != 0) {
+    EVT ScalarVT = Scalar.getValueType();
+    SDValue ShiftAmt = DAG.getConstant(SubIdx * DstBits, DL, ScalarVT);
+    Scalar = DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmt);
+  }
+
+  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, DstEltVT, Scalar);
+  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Narrowed);
+}
+
+/// \brief Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
+///
+/// \param Mask  Shuffle mask; it must be equivalent to a splat of a single
+///              index for the lowering to apply.
+/// \returns A MOVDDUP or VBROADCAST based value bitcast to \p VT, or an empty
+///          SDValue when the subtarget, mask or source do not allow it.
+static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // Feature gate: v2f64 needs SSE3, other floating point types need AVX and
+ // integer types need AVX2.
+ if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
+ (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
+ (Subtarget.hasAVX2() && VT.isInteger())))
+ return SDValue();
+
+ // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
+ // we can only broadcast from a register with AVX2.
+ unsigned NumElts = Mask.size();
+ unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+ bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
+
+ // Check that the mask is a broadcast.
+ // Try each candidate index i against an all-i splat mask.
+ int BroadcastIdx = -1;
+ for (int i = 0; i != (int)NumElts; ++i) {
+ SmallVector<int, 8> BroadcastMask(NumElts, i);
+ if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
+ BroadcastIdx = i;
+ break;
+ }
+ }
+
+ if (BroadcastIdx < 0)
+ return SDValue();
+ assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+ "a sorted mask where the broadcast "
+ "comes from V1.");
+
+ // Go up the chain of (vector) values to find a scalar load that we can
+ // combine with the broadcast.
+ SDValue V = V1;
+ for (;;) {
+ switch (V.getOpcode()) {
+ case ISD::BITCAST: {
+ // Only look through bitcasts that keep the element width, so that
+ // BroadcastIdx stays meaningful.
+ SDValue VSrc = V.getOperand(0);
+ MVT SrcVT = VSrc.getSimpleValueType();
+ if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
+ break;
+ V = VSrc;
+ continue;
+ }
+ case ISD::CONCAT_VECTORS: {
+ // Descend into the operand that holds the element and rebase the index.
+ int OperandSize = Mask.size() / V.getNumOperands();
+ V = V.getOperand(BroadcastIdx / OperandSize);
+ BroadcastIdx %= OperandSize;
+ continue;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+ auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
+ if (!ConstantIdx)
+ break;
+
+ // Follow the inserted subvector if it covers the element, otherwise
+ // continue with the vector that was inserted into.
+ int BeginIdx = (int)ConstantIdx->getZExtValue();
+ int EndIdx =
+ BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
+ if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
+ BroadcastIdx -= BeginIdx;
+ V = VInner;
+ } else {
+ V = VOuter;
+ }
+ continue;
+ }
+ }
+ // Reached for any other opcode, or when a case above used 'break'.
+ break;
+ }
+
+ // Check if this is a broadcast of a scalar. We special case lowering
+ // for scalars so that we can more effectively fold with loads.
+ // First, look through bitcast: if the original value has a larger element
+ // type than the shuffle, the broadcast element is in essence truncated.
+ // Make that explicit to ease folding.
+ if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
+ if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
+ DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
+ return TruncBroadcast;
+
+ MVT BroadcastVT = VT;
+
+ // Peek through any bitcast (only useful for loads).
+ SDValue BC = peekThroughBitcasts(V);
+
+ // Also check the simpler case, where we can directly reuse the scalar.
+ if (V.getOpcode() == ISD::BUILD_VECTOR ||
+ (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+ V = V.getOperand(BroadcastIdx);
+
+ // If we can't broadcast from a register, check that the input is a load.
+ if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
+ return SDValue();
+ } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
+ // 32-bit targets need to load i64 as a f64 and then bitcast the result.
+ if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
+ BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
+ Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
+ }
+
+ // If we are broadcasting a load that is only used by the shuffle
+ // then we can reduce the vector load to the broadcasted scalar load.
+ LoadSDNode *Ld = cast<LoadSDNode>(BC);
+ SDValue BaseAddr = Ld->getOperand(1);
+ EVT SVT = BroadcastVT.getScalarType();
+ // Byte offset of the broadcast element from the start of the vector load.
+ unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+ SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
+ V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+
+ // Make sure the newly-created LOAD is in the same position as Ld in
+ // terms of dependency. We create a TokenFactor for Ld and V,
+ // and update uses of Ld's output chain to use the TokenFactor.
+ if (Ld->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(Ld, 1), SDValue(V.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+ // Re-set the TokenFactor's own operands after the replace-all-uses call
+ // above.
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
+ SDValue(V.getNode(), 1));
+ }
+ } else if (!BroadcastFromReg) {
+ // We can't broadcast from a vector register.
+ return SDValue();
+ } else if (BroadcastIdx != 0) {
+ // We can only broadcast from the zero-element of a vector register,
+ // but it can be advantageous to broadcast from the zero-element of a
+ // subvector.
+ if (!VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64)
+ return SDValue();
+
+ // Only broadcast the zero-element of a 128-bit subvector.
+ unsigned EltSize = VT.getScalarSizeInBits();
+ if (((BroadcastIdx * EltSize) % 128) != 0)
+ return SDValue();
+
+ // Extract the 128-bit subvector that starts at the broadcast element.
+ MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
+ V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
+ DAG.getIntPtrConstant(BroadcastIdx, DL));
+ }
+
+ // For MOVDDUP a scalar source is first wrapped into a v2f64 vector.
+ if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+ DAG.getBitcast(MVT::f64, V));
+
+ // Bitcast back to the same scalar type as BroadcastVT.
+ MVT SrcVT = V.getSimpleValueType();
+ if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
+ assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ "Unexpected vector element size");
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
+ } else {
+ SrcVT = BroadcastVT.getScalarType();
+ }
+ V = DAG.getBitcast(SrcVT, V);
+ }
+
+ // 32-bit targets need to load i64 as a f64 and then bitcast the result.
+ if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
+ V = DAG.getBitcast(MVT::f64, V);
+ unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
+ BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
+ }
+
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
+}
+
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
+//
+// On success this returns true and overwrites V1, V2 and InsertPSMask with
+// the operands and immediate to use. On failure it returns false and leaves
+// them untouched. Zeroable is indexed like the 4-element Mask.
+static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+ unsigned &InsertPSMask,
+ const SmallBitVector &Zeroable,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
+ assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ // Attempt to match INSERTPS with one element from VA or VB being
+ // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
+ // are updated.
+ auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
+ ArrayRef<int> CandidateMask) {
+ unsigned ZMask = 0;
+ // Destination slot of the single out-of-place element, by source vector.
+ int VADstIndex = -1;
+ int VBDstIndex = -1;
+ bool VAUsedInPlace = false;
+
+ for (int i = 0; i < 4; ++i) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
+
+ // Flag if we use any VA inputs in place.
+ if (i == CandidateMask[i]) {
+ VAUsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (VADstIndex >= 0 || VBDstIndex >= 0)
+ return false;
+
+ if (CandidateMask[i] < 4) {
+ // VA input out of place for insertion.
+ VADstIndex = i;
+ } else {
+ // VB input for insertion.
+ VBDstIndex = i;
+ }
+ }
+
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (VADstIndex < 0 && VBDstIndex < 0)
+ return false;
+
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned VBSrcIndex = 0;
+ if (VADstIndex >= 0) {
+ // If we have a VA input out of place, we use VA as the V2 element
+ // insertion and don't use the original V2 at all.
+ VBSrcIndex = CandidateMask[VADstIndex];
+ VBDstIndex = VADstIndex;
+ VB = VA;
+ } else {
+ VBSrcIndex = CandidateMask[VBDstIndex] - 4;
+ }
+
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!VAUsedInPlace)
+ VA = DAG.getUNDEF(MVT::v4f32);
+
+ // Update V1, V2 and InsertPSMask accordingly.
+ V1 = VA;
+ V2 = VB;
+
+ // Insert the V2 element into the desired position.
+ // Immediate layout: source index in bits 7:6, destination index in
+ // bits 5:4, zero mask in bits 3:0.
+ InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ return true;
+ };
+
+ if (matchAsInsertPS(V1, V2, Mask))
+ return true;
+
+ // Commute and try again.
+ SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+ if (matchAsInsertPS(V2, V1, CommutedMask))
+ return true;
+
+ return false;
+}
+
+/// \brief Lower a v4f32 shuffle to a single INSERTPS node when it matches.
+///
+/// The matching itself is delegated to matchVectorShuffleAsInsertPS, which
+/// may replace the local copies of \p V1 and \p V2 and produces the immediate.
+///
+/// \returns The INSERTPS node, or an empty SDValue when the mask does not
+///          match.
+static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            const SmallBitVector &Zeroable,
+                                            SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+
+  // On a successful match V1, V2 and Imm hold the INSERTPS operands.
+  unsigned Imm;
+  const bool Matched =
+      matchVectorShuffleAsInsertPS(V1, V2, Imm, Zeroable, Mask, DAG);
+  if (Matched) {
+    SDValue ImmOp = DAG.getConstant(Imm, DL, MVT::i8);
+    return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, ImmOp);
+  }
+
+  return SDValue();
+}
+
+/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up with alternating between
+/// the two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
+///
+/// \param Mask  Shuffle mask; entries >= Mask.size() refer to \p V2 and
+///              negative entries are undef.
+/// \returns The lowered value, or an empty SDValue when neither the
+///          permute-then-unpack nor the unpack-then-permute form applies.
+static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!VT.isFloatingPoint() &&
+ "This routine only supports integer vectors.");
+ assert(VT.is128BitVector() &&
+ "This routine only works on 128-bit vectors.");
+ assert(!V2.isUndef() &&
+ "This routine should only be used when blending two inputs.");
+ assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+ int Size = Mask.size();
+
+ // Count mask entries that read the low half, respectively the high half, of
+ // either input. Undef (-1) entries are excluded from the low count by the
+ // explicit M >= 0 test and from the high count because -1 % Size is
+ // negative.
+ int NumLoInputs =
+ count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
+ int NumHiInputs =
+ count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
+
+ // Unpack from whichever half supplies at least as many elements.
+ bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+ // Try to permute each input so that a single unpack with ScalarSize-bit
+ // elements (each spanning Scale original elements) produces the result.
+ auto TryUnpack = [&](int ScalarSize, int Scale) {
+ SmallVector<int, 16> V1Mask((unsigned)Size, -1);
+ SmallVector<int, 16> V2Mask((unsigned)Size, -1);
+
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ // Each element of the unpack contains Scale elements from this mask.
+ int UnpackIdx = i / Scale;
+
+ // We only handle the case where V1 feeds the first slots of the unpack.
+ // We rely on canonicalization to ensure this is the case.
+ if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+ return SDValue();
+
+ // Setup the mask for this input. The indexing is tricky as we have to
+ // handle the unpack stride.
+ SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+ VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+ Mask[i] % Size;
+ }
+
+ // If we will have to shuffle both inputs to use the unpack, check whether
+ // we can just unpack first and shuffle the result. If so, skip this unpack.
+ if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
+ !isNoopShuffleMask(V2Mask))
+ return SDValue();
+
+ // Shuffle the inputs into place.
+ // This assigns to the enclosing function's V1 and V2, which the lambda
+ // captures by reference.
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+ // Cast the inputs to the type we will use to unpack them.
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
+ V1 = DAG.getBitcast(UnpackVT, V1);
+ V2 = DAG.getBitcast(UnpackVT, V2);
+
+ // Unpack the inputs and cast the result back to the desired type.
+ return DAG.getBitcast(
+ VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ UnpackVT, V1, V2));
+ };
+
+ // We try each unpack from the largest to the smallest to try and find one
+ // that fits this mask.
+ int OrigScalarSize = VT.getScalarSizeInBits();
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
+ if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
+ return Unpack;
+
+ // If none of the unpack-rooted lowerings worked (or were profitable) try an
+ // initial unpack.
+ if (NumLoInputs == 0 || NumHiInputs == 0) {
+ assert((NumLoInputs > 0 || NumHiInputs > 0) &&
+ "We have to have *some* inputs!");
+ // Index of the first element of the half that all inputs come from.
+ int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
+
+ // FIXME: We could consider the total complexity of the permute of each
+ // possible unpacking. Or at the least we should consider how many
+ // half-crossings are created.
+ // FIXME: We could consider commuting the unpacks.
+
+ SmallVector<int, 32> PermMask((unsigned)Size, -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
+
+ // After the unpack, element k of the chosen half of V1 sits at index 2k
+ // and the matching V2 element at index 2k + 1.
+ PermMask[i] =
+ 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
+ }
+ return DAG.getVectorShuffle(
+ VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
+ DL, VT, V1, V2),
+ DAG.getUNDEF(VT), PermMask);
+ }
+
+ return SDValue();
+}
+
+/// \brief Lower a two-element f64 vector shuffle.
+///
+/// Floating point shuffles are fully supported for this width while integer
+/// ones are not, so this routine is the foundation for every 2-lane 64-bit
+/// shuffle. Integer vectors should still avoid being routed through here when
+/// they can, because on some chips these instructions carry a domain crossing
+/// penalty.
+static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+  if (V2.isUndef()) {
+    // A splat of one element is best served by a broadcast.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
+    // Unary shuffle. Bit N of the immediate is set when result lane N reads
+    // element 1 of the input.
+    unsigned UnaryImm = 0;
+    if (Mask[0] == 1)
+      UnaryImm |= 1u;
+    if (Mask[1] == 1)
+      UnaryImm |= 2u;
+
+    // With AVX, use VPERMILPD (the v2f64 form of VPERMILPI): it reads a
+    // single source, which allows folding a load into the shuffle.
+    if (Subtarget.hasAVX())
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
+                         DAG.getConstant(UnaryImm, DL, MVT::i8));
+
+    // Otherwise simulate the unary shuffle by handing V1 to SHUFPD as both
+    // inputs, substituting undef for an input whose lane is undefined.
+    SDValue LoSrc =
+        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1;
+    SDValue HiSrc =
+        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1;
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, LoSrc, HiSrc,
+                       DAG.getConstant(UnaryImm, DL, MVT::i8));
+  }
+  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
+  assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+
+  // When exactly one lane reads the second input, try to insert that single
+  // element into V1 cheaply.
+  const bool Lane0FromV2 = Mask[0] >= 2;
+  const bool Lane1FromV2 = Mask[1] >= 2;
+  if (Lane0FromV2 != Lane1FromV2) {
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
+      return Insertion;
+
+    // The mask cannot be reliably sorted one way or the other, and flipping
+    // a v2 mask is trivial, so also try the insertion with the operands
+    // exchanged (xor with 2 moves an index to the other input).
+    int InverseMask[2];
+    for (int Lane = 0; Lane != 2; ++Lane)
+      InverseMask[Lane] = Mask[Lane] < 0 ? -1 : (Mask[Lane] ^ 2);
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
+      return Insertion;
+  }
+
+  // Two common blend shapes have dedicated instructions; use them when the
+  // zero-blend attempt above did not succeed.
+  const bool KeepsHighOfV2 = isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+                             isShuffleEquivalent(V1, V2, Mask, {1, 3});
+  if (KeepsHighOfV2)
+    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) {
+      // Either load over the low double (when the scalar is a foldable load)
+      // or move just the low double.
+      unsigned MoveOpc =
+          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD;
+      return DAG.getNode(
+          MoveOpc, DL, MVT::v2f64, V2,
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
+    }
+
+  if (Subtarget.hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Blend;
+
+  // Masks that match an unpack pattern get the dedicated instruction.
+  if (SDValue Unpck =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+    return Unpck;
+
+  // General two-input case: bit 0 picks element 1 of V1 for lane 0, bit 1
+  // picks element 1 of V2 (mask value 3) for lane 1.
+  unsigned BinaryImm = 0;
+  if (Mask[0] == 1)
+    BinaryImm |= 1u;
+  if (Mask[1] == 3)
+    BinaryImm |= 2u;
+  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
+                     DAG.getConstant(BinaryImm, DL, MVT::i8));
+}
+
+/// \brief Lower a 2-lane 64-bit integer (v2i64) shuffle of V1 and V2 by Mask.
+///
+/// Integer-domain lowerings (PSHUFD, merged PACKUS, shifts, element insertion,
+/// blends, unpacks, byte rotates) are tried in order to avoid domain crossing
+/// penalties; when none applies the inputs are bitcast to v2f64, shuffled as
+/// floating point, and the result is bitcast back to v2i64.
+static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+ if (V2.isUndef()) {
+ // Single-input case: first see whether this is a splat of one element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Straight shuffle of a single input vector, done with PSHUFD on the
+ // v4i32 view of V1: each 64-bit lane index M expands to the 32-bit pair
+ // (2*M, 2*M+1). Undef (-1) lanes are clamped to 0 by std::max first.
+ V1 = DAG.getBitcast(MVT::v4i32, V1);
+ int WidenedMask[4] = {
+ std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
+ std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+ return DAG.getBitcast(
+ MVT::v2i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
+ }
+ assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+ assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+
+ // If both inputs are (possibly bitcast) PACKUS nodes of the same type, build
+ // one PACKUS from the operand of each that the mask selects: Mask[0] picks
+ // operand 0 or 1 of V1's pack, Mask[1] picks operand 0 or 1 of V2's pack.
+ // This is particularly important as it merges shuffles this routine creates.
+ auto GetPackNode = [](SDValue V) {
+ V = peekThroughBitcasts(V);
+ return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
+ };
+ if (SDValue V1Pack = GetPackNode(V1))
+ if (SDValue V2Pack = GetPackNode(V2)) {
+ EVT PackVT = V1Pack.getValueType();
+ if (PackVT == V2Pack.getValueType())
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(X86ISD::PACKUS, DL, PackVT,
+ Mask[0] == 0 ? V1Pack.getOperand(0)
+ : V1Pack.getOperand(1),
+ Mask[1] == 2 ? V2Pack.getOperand(0)
+ : V2Pack.getOperand(1)));
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // When loading a scalar and then shuffling it into a vector we can often do
+ // the insertion cheaply.
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+ // Also try the insertion with the inputs exchanged: xor with 2 moves each
+ // index to the other input, and the mask cannot be reliably sorted anyway.
+ int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
+ return Insertion;
+
+ // There are several blend lowering paths below; they must all be guarded by
+ // the *exact* same predicate, so it is computed once here.
+ bool IsBlendSupported = Subtarget.hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte rotation instructions.
+ // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
+ if (Subtarget.hasSSSE3())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
+ Mask, DAG);
+
+ // Last resort: shuffle in the floating point domain (SHUFPD) between
+ // bitcasts. This will likely incur 2 cycles of stall for integer vectors on
+ // Nehalem and older chips, but all the alternatives are still more cycles
+ // and newer chips don't have this problem.
+ V1 = DAG.getBitcast(MVT::v2f64, V1);
+ V2 = DAG.getBitcast(MVT::v2f64, V2);
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+}
+
+/// \brief Check whether a 4-element mask can be lowered with one SHUFPS.
+///
+/// Callers use the answer to skip more specialized lowerings whenever the
+/// plain SHUFPS lowering already happens to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+  // Only the 128-bit (four element) form of SHUFPS is handled here.
+  assert(Mask.size() == 4 && "Unsupported mask size!");
+  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
+  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
+  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
+  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
+
+  // Two defined elements clash when one selects from the first input
+  // (index below 4) and the other selects from the second input.
+  auto MixesInputs = [](int A, int B) {
+    return A >= 0 && B >= 0 && (A < 4) != (B < 4);
+  };
+
+  // A single SHUFPS needs the low half and the high half to each require
+  // only one input.
+  return !MixesInputs(Mask[0], Mask[1]) && !MixesInputs(Mask[2], Mask[3]);
+}
+
+/// \brief Build a 4-element shuffle out of SHUFPS nodes.
+///
+/// This helper does not judge whether SHUFPS is the *best* lowering for the
+/// mask; it simply produces one, using a preparatory SHUFPS where a half of
+/// the result needs elements from both inputs.
+static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
+  // Mask values of 4 and above refer to elements of V2.
+  auto IsV2Elt = [](int M) { return M >= 4; };
+
+  // Operands feeding the low and high halves of the final SHUFPS, and the
+  // mask that final node will use.
+  SDValue LoSrc = V1;
+  SDValue HiSrc = V2;
+  int FinalMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+
+  const int V2Count = count_if(Mask, IsV2Elt);
+
+  if (V2Count == 1) {
+    const int V2Pos = find_if(Mask, IsV2Elt) - Mask.begin();
+    // Toggling the low bit yields the other slot in the same half.
+    const int NeighborPos = V2Pos ^ 1;
+    const bool V2InLowHalf = V2Pos < 2;
+
+    if (Mask[NeighborPos] < 0) {
+      // The lone V2 element sits next to an undef, so its half can come
+      // entirely from V2. This will only ever happen in the high lanes
+      // because the vector is commuted otherwise.
+      if (V2InLowHalf)
+        std::swap(LoSrc, HiSrc);
+      FinalMask[V2Pos] -= 4;
+    } else {
+      // The V2 element shares its half with a V1 element. First blend the
+      // two into one vector: V2's element lands in slot 0 and V1's element
+      // in slot 2.
+      int BlendMask[4] = {Mask[V2Pos] - 4, 0, Mask[NeighborPos], 0};
+      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
+
+      // The blended vector now supplies whichever half contained the V2
+      // element; V1 supplies the other half.
+      if (V2InLowHalf) {
+        LoSrc = V2;
+        HiSrc = V1;
+      } else {
+        HiSrc = V2;
+      }
+      FinalMask[NeighborPos] = 2; // The V1 element was placed in V2[2].
+      FinalMask[V2Pos] = 0;       // The V2 element was placed in V2[0].
+    }
+  } else if (V2Count == 2) {
+    const bool LowHalfFromV1 = Mask[0] < 4 && Mask[1] < 4;
+    const bool HighHalfFromV1 = Mask[2] < 4 && Mask[3] < 4;
+
+    if (LowHalfFromV1) {
+      // Easy case: V1 fills the low lanes and V2 the high lanes.
+      FinalMask[2] -= 4;
+      FinalMask[3] -= 4;
+    } else if (HighHalfFromV1) {
+      // Reversed case. It is handled here because this utility may be called
+      // once a SHUFPS pattern is detected but the shuffle cannot easily be
+      // commuted into the usual orientation.
+      FinalMask[0] -= 4;
+      FinalMask[1] -= 4;
+      HiSrc = V1;
+      LoSrc = V2;
+    } else {
+      // Both halves mix V1 and V2. Rather than placing elements directly,
+      // gather them with one SHUFPS and let the final shuffle place them.
+      const bool Lane0FromV1 = Mask[0] < 4;
+      const bool Lane2FromV1 = Mask[2] < 4;
+
+      // Slots 0-1 hold the V1 elements (for the low, then the high half);
+      // slots 2-3 hold the V2 elements in the same order.
+      int BlendMask[4] = {Lane0FromV1 ? Mask[0] : Mask[1],
+                          Lane2FromV1 ? Mask[2] : Mask[3],
+                          (Lane0FromV1 ? Mask[1] : Mask[0]) - 4,
+                          (Lane2FromV1 ? Mask[3] : Mask[2]) - 4};
+      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
+
+      // Finish with a shuffle of the gathered vector, passing it as both
+      // operands.
+      LoSrc = HiSrc = V1;
+      FinalMask[0] = Lane0FromV1 ? 0 : 2;
+      FinalMask[1] = Lane0FromV1 ? 2 : 0;
+      FinalMask[2] = Lane2FromV1 ? 1 : 3;
+      FinalMask[3] = Lane2FromV1 ? 3 : 1;
+    }
+  }
+
+  return DAG.getNode(X86ISD::SHUFP, DL, VT, LoSrc, HiSrc,
+                     getV4X86ShuffleImm8ForMask(FinalMask, DL, DAG));
+}
+
+/// \brief Lower a 4-lane 32-bit floating point (v4f32) shuffle.
+///
+/// Only floating point unit instructions are emitted, which keeps domain
+/// crossing penalties to a minimum; they are sufficient to implement every
+/// v4f32 shuffle.
+static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  // Mask values of 4 and above refer to elements of V2.
+  const int V2EltCount = count_if(Mask, [](int M) { return M >= 4; });
+
+  if (V2EltCount == 0) {
+    // A splat of one element is best served by a broadcast.
+    if (SDValue Splat = lowerVectorShuffleAsBroadcast(
+            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
+      return Splat;
+
+    // SSE3 has dedicated instructions for duplicating the even or the odd
+    // elements.
+    if (Subtarget.hasSSE3()) {
+      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
+        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+    }
+
+    SDValue UnaryImm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);
+
+    // With AVX, VPERMILPS reads a single source, which allows folding a load
+    // into the shuffle.
+    if (Subtarget.hasAVX())
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, UnaryImm);
+
+    // Otherwise simulate the unary shuffle by giving SHUFPS the same vector
+    // as both operands.
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, UnaryImm);
+  }
+
+  // Some single-element blends have special lowerings. More complex
+  // single-element blends have custom handling further down that is used if
+  // both this and BLENDPS fail to match, so only take this route when the V2
+  // element targets element 0 of the mask -- that is the fast case here.
+  const bool V2FeedsLaneZeroOnly = V2EltCount == 1 && Mask[0] >= 4;
+  if (V2FeedsLaneZeroOnly)
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+      return Insertion;
+
+  if (Subtarget.hasSSE41()) {
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Blend;
+
+    // INSERTPS is used when it can complete the shuffle efficiently.
+    if (SDValue InsertPS =
+            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
+      return InsertPS;
+
+    // Blend-then-permute is only attempted when one SHUFPS would not do.
+    if (!isSingleSHUFPSMask(Mask))
+      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+              DL, MVT::v4f32, V1, V2, Mask, DAG))
+        return BlendPerm;
+  }
+
+  // Low/high move instructions cover two fixed half-combining patterns.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
+    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
+    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+
+  // Masks that match an unpack pattern get the dedicated instruction.
+  if (SDValue Unpck =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+    return Unpck;
+
+  // Everything else goes through the general SHUFPS strategy.
+  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
+}
+
+/// \brief Lower a 4-lane 32-bit integer (v4i32) shuffle of V1 and V2 by Mask.
+///
+/// Integer-domain lowerings are tried first, in the order written below; if
+/// none applies, the shuffle is done as a v4f32 shuffle between bitcasts.
+static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); // mask values >= 4 select from V2
+
+ if (NumV2Elements == 0) {
+ // Single-input case: first see whether this is a splat of one element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Straight shuffle of a single input vector. For everything from SSE2
+ // onward this has a single fast instruction with no scary immediates.
+ // Masks equivalent to an unpack pattern are replaced by the exact UNPCK
+ // mask, but PSHUFD is still what gets emitted because using UNPCK itself
+ // would prevent folding a load into this instruction or making a copy.
+ const int UnpackLoMask[] = {0, 0, 1, 1};
+ const int UnpackHiMask[] = {2, 2, 3, 3};
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
+ Mask = UnpackLoMask;
+ else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
+ Mask = UnpackHiMask;
+
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Elements == 1)
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return V;
+
+ // There are several blend lowering paths below; they must all be guarded by
+ // the *exact* same predicate, so it is computed once here.
+ bool IsBlendSupported = Subtarget.hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte rotation instructions.
+ // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
+ if (Subtarget.hasSSSE3())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Assume that a single SHUFPS is faster than an alternative sequence of
+ // multiple instructions (even if the CPU has a domain penalty).
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+ if (!isSingleSHUFPSMask(Mask)) {
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
+ Mask, DAG);
+
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+ DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Unpack;
+ }
+
+ // We implement this with SHUFPS because it can blend from two vectors.
+ // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
+ // up the inputs, bypassing domain shift penalties that we would incur if we
+ // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
+ // relevant.
+ SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
+ SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
+ return DAG.getBitcast(MVT::v4i32, ShufPS);
+}
+
+/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// shuffle lowering, and the most complex part.
+///
+/// The lowering strategy is to try to form pairs of input lanes which are
+/// targeted at the same half of the final vector, and then use a dword shuffle
+/// to place them onto the right half, and finally unpack the paired lanes into
+/// their final position.
+///
+/// The exact breakdown of how to form these dword pairs and align them on the
+/// correct sides is really tricky. See the comments within the function for
+/// more of the details.
+///
+/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
+/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
+/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
+/// vector, form the analogous 128-bit 8-element Mask.
+static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+ const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
+ MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+
+ assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
+ MutableArrayRef<int> LoMask = Mask.slice(0, 4);
+ MutableArrayRef<int> HiMask = Mask.slice(4, 4);
+
+ SmallVector<int, 4> LoInputs;
+ std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
+ [](int M) { return M >= 0; });
+ std::sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
+ [](int M) { return M >= 0; });
+ std::sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
+ int NumLToL =
+ std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+ int NumHToL = LoInputs.size() - NumLToL;
+ int NumLToH =
+ std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+ int NumHToH = HiInputs.size() - NumLToH;
+ MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
+ MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
+ MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
+ MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+
+ // If we are splatting two values from one half - one to each half, then
+ // we can shuffle that half so each is splatted to a dword, then splat those
+ // to their respective halves.
+ auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
+ int DOffset) {
+ int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
+ int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
+ V = DAG.getNode(ShufWOp, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+ V = DAG.getBitcast(PSHUFDVT, V);
+ V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
+ return DAG.getBitcast(VT, V);
+ };
+
+ if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
+ return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
+ if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
+ return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
+
+ // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
+ // such inputs we can swap two of the dwords across the half mark and end up
+ // with <=2 inputs to each half in each half. Once there, we can fall through
+ // to the generic code below. For example:
+ //
+ // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+ // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
+ //
+ // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
+ // and an existing 2-into-2 on the other half. In this case we may have to
+ // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
+ // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
+ // Fortunately, we don't have to handle anything but a 2-into-2 pattern
+ // because any other situation (including a 3-into-1 or 1-into-3 in the other
+ // half than the one we target for fixing) will be fixed when we re-enter this
+ // path. We will also combine away any sequence of PSHUFD instructions that
+ // result into a single instruction. Here is an example of the tricky case:
+ //
+ // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+ // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
+ //
+ // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
+ //
+ // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
+ // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
+ //
+ // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
+ // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
+ //
+ // The result is fine to be handled by the generic logic.
+ auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
+ ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
+ int AOffset, int BOffset) {
+ assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
+ "Must call this with A having 3 or 1 inputs from the A half.");
+ assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
+ "Must call this with B having 1 or 3 inputs from the B half.");
+ assert(AToAInputs.size() + BToAInputs.size() == 4 &&
+ "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
+
+ bool ThreeAInputs = AToAInputs.size() == 3;
+
+ // Compute the index of dword with only one word among the three inputs in
+ // a half by taking the sum of the half with three inputs and subtracting
+ // the sum of the actual three inputs. The difference is the remaining
+ // slot.
+ int ADWord, BDWord;
+ int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
+ int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
+ int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
+ ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
+ int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
+ int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
+ int TripleNonInputIdx =
+ TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
+ TripleDWord = TripleNonInputIdx / 2;
+
+ // We use xor with one to compute the adjacent DWord to whichever one the
+ // OneInput is in.
+ OneInputDWord = (OneInput / 2) ^ 1;
+
+ // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
+ // and BToA inputs. If there is also such a problem with the BToB and AToB
+ // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
+ // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
+ // is essential that we don't *create* a 3<-1 as then we might oscillate.
+ if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
+ // Compute how many inputs will be flipped by swapping these DWords. We
+ // need
+ // to balance this to ensure we don't form a 3-1 shuffle in the other
+ // half.
+ int NumFlippedAToBInputs =
+ std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
+ std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
+ int NumFlippedBToBInputs =
+ std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
+ std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
+ if ((NumFlippedAToBInputs == 1 &&
+ (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
+ (NumFlippedBToBInputs == 1 &&
+ (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
+ // We choose whether to fix the A half or B half based on whether that
+ // half has zero flipped inputs. At zero, we may not be able to fix it
+ // with that half. We also bias towards fixing the B half because that
+ // will more commonly be the high half, and we have to bias one way.
+ auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
+ ArrayRef<int> Inputs) {
+ int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
+ bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
+ // Determine whether the free index is in the flipped dword or the
+ // unflipped dword based on where the pinned index is. We use this bit
+ // in an xor to conditionally select the adjacent dword.
+ int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
+ bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
+ if (IsFixIdxInput == IsFixFreeIdxInput)
+ FixFreeIdx += 1;
+ IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
+ assert(IsFixIdxInput != IsFixFreeIdxInput &&
+ "We need to be changing the number of flipped inputs!");
+ int PSHUFHalfMask[] = {0, 1, 2, 3};
+ std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
+ V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
+ MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+
+ for (int &M : Mask)
+ if (M >= 0 && M == FixIdx)
+ M = FixFreeIdx;
+ else if (M >= 0 && M == FixFreeIdx)
+ M = FixIdx;
+ };
+ if (NumFlippedBToBInputs != 0) {
+ int BPinnedIdx =
+ BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
+ FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
+ } else {
+ assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
+ int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
+ FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
+ }
+ }
+ }
+
+ int PSHUFDMask[] = {0, 1, 2, 3};
+ PSHUFDMask[ADWord] = BDWord;
+ PSHUFDMask[BDWord] = ADWord;
+ V = DAG.getBitcast(
+ VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+
+ // Adjust the mask to match the new locations of A and B.
+ for (int &M : Mask)
+ if (M >= 0 && M/2 == ADWord)
+ M = 2 * BDWord + M % 2;
+ else if (M >= 0 && M/2 == BDWord)
+ M = 2 * ADWord + M % 2;
+
+ // Recurse back into this routine to re-compute state now that this isn't
+ // a 3 and 1 problem.
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
+ DAG);
+ };
+ if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
+ return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
+ else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+ return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
+
+ // At this point there are at most two inputs to the low and high halves from
+ // each half. That means the inputs can always be grouped into dwords and
+ // those dwords can then be moved to the correct half with a dword shuffle.
+ // We use at most one low and one high word shuffle to collect these paired
+ // inputs into dwords, and finally a dword shuffle to place them.
+ int PSHUFLMask[4] = {-1, -1, -1, -1};
+ int PSHUFHMask[4] = {-1, -1, -1, -1};
+ int PSHUFDMask[4] = {-1, -1, -1, -1};
+
+ // First fix the masks for all the inputs that are staying in their
+ // original halves. This will then dictate the targets of the cross-half
+ // shuffles.
+ auto fixInPlaceInputs =
+ [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
+ MutableArrayRef<int> SourceHalfMask,
+ MutableArrayRef<int> HalfMask, int HalfOffset) {
+ if (InPlaceInputs.empty())
+ return;
+ if (InPlaceInputs.size() == 1) {
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+ return;
+ }
+ if (IncomingInputs.empty()) {
+ // Just fix all of the in place inputs.
+ for (int Input : InPlaceInputs) {
+ SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+ PSHUFDMask[Input / 2] = Input / 2;
+ }
+ return;
+ }
+
+ assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ // Put the second input next to the first so that they are packed into
+ // a dword. We find the adjacent index by toggling the low bit.
+ int AdjIndex = InPlaceInputs[0] ^ 1;
+ SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+ std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
+ PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+ };
+ fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
+ fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
+
+ // Now gather the cross-half inputs and place them into a free dword of
+ // their target half.
+ // FIXME: This operation could almost certainly be simplified dramatically to
+ // look more like the 3-1 fixing operation.
+ auto moveInputsToRightHalf = [&PSHUFDMask](
+ MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
+ MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
+ MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
+ int DestOffset) {
+ auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
+ return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
+ };
+ auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
+ int Word) {
+ int LowWord = Word & ~1;
+ int HighWord = Word | 1;
+ return isWordClobbered(SourceHalfMask, LowWord) ||
+ isWordClobbered(SourceHalfMask, HighWord);
+ };
+
+ if (IncomingInputs.empty())
+ return;
+
+ if (ExistingInputs.empty()) {
+ // Map any dwords with inputs from them into the right half.
+ for (int Input : IncomingInputs) {
+ // If the source half mask maps over the inputs, turn those into
+ // swaps and use the swapped lane.
+ if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
+ if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
+ SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
+ Input - SourceOffset;
+ // We have to swap the uses in our half mask in one sweep.
+ for (int &M : HalfMask)
+ if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
+ M = Input;
+ else if (M == Input)
+ M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ } else {
+ assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
+ Input - SourceOffset &&
+ "Previous placement doesn't match!");
+ }
+ // Note that this correctly re-maps both when we do a swap and when
+ // we observe the other side of the swap above. We rely on that to
+ // avoid swapping the members of the input list directly.
+ Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ }
+
+ // Map the input's dword into the correct half.
+ if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
+ PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
+ else
+ assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
+ Input / 2 &&
+ "Previous placement doesn't match!");
+ }
+
+ // And just directly shift any other-half mask elements to be same-half
+ // as we will have mirrored the dword containing the element into the
+ // same position within that half.
+ for (int &M : HalfMask)
+ if (M >= SourceOffset && M < SourceOffset + 4) {
+ M = M - SourceOffset + DestOffset;
+ assert(M >= 0 && "This should never wrap below zero!");
+ }
+ return;
+ }
+
+ // Ensure we have the input in a viable dword of its current half. This
+ // is particularly tricky because the original position may be clobbered
+ // by inputs being moved and *staying* in that half.
+ if (IncomingInputs.size() == 1) {
+ if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
+ SourceOffset;
+ SourceHalfMask[InputFixed - SourceOffset] =
+ IncomingInputs[0] - SourceOffset;
+ std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
+ InputFixed);
+ IncomingInputs[0] = InputFixed;
+ }
+ } else if (IncomingInputs.size() == 2) {
+ if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
+ isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ // We have two non-adjacent or clobbered inputs we need to extract from
+ // the source half. To do this, we need to map them into some adjacent
+ // dword slot in the source mask.
+ int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
+ IncomingInputs[1] - SourceOffset};
+
+ // If there is a free slot in the source half mask adjacent to one of
+ // the inputs, place the other input in it. We use (Index XOR 1) to
+ // compute an adjacent index.
+ if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
+ SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
+ SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
+ SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+ InputsFixed[1] = InputsFixed[0] ^ 1;
+ } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
+ SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
+ SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
+ SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
+ InputsFixed[0] = InputsFixed[1] ^ 1;
+ } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
+ // The two inputs are in the same DWord but it is clobbered and the
+ // adjacent DWord isn't used at all. Move both inputs to the free
+ // slot.
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
+ InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
+ InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
+ } else {
+ // The only way we hit this point is if there is no clobbering
+ // (because there are no off-half inputs to this half) and there is no
+ // free slot adjacent to one of the inputs. In this case, we have to
+ // swap an input with a non-input.
+ for (int i = 0; i < 4; ++i)
+ assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
+ "We can't handle any clobbers here!");
+ assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
+ "Cannot have adjacent inputs here!");
+
+ SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+ SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
+
+ // We also have to update the final source mask in this case because
+ // it may need to undo the above swap.
+ for (int &M : FinalSourceHalfMask)
+ if (M == (InputsFixed[0] ^ 1) + SourceOffset)
+ M = InputsFixed[1] + SourceOffset;
+ else if (M == InputsFixed[1] + SourceOffset)
+ M = (InputsFixed[0] ^ 1) + SourceOffset;
+
+ InputsFixed[1] = InputsFixed[0] ^ 1;
+ }
+
+ // Point everything at the fixed inputs.
+ for (int &M : HalfMask)
+ if (M == IncomingInputs[0])
+ M = InputsFixed[0] + SourceOffset;
+ else if (M == IncomingInputs[1])
+ M = InputsFixed[1] + SourceOffset;
+
+ IncomingInputs[0] = InputsFixed[0] + SourceOffset;
+ IncomingInputs[1] = InputsFixed[1] + SourceOffset;
+ }
+ } else {
+ llvm_unreachable("Unhandled input size!");
+ }
+
+ // Now hoist the DWord down to the right half.
+ int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
+ assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
+ PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
+ for (int &M : HalfMask)
+ for (int Input : IncomingInputs)
+ if (M == Input)
+ M = FreeDWord * 2 + Input % 2;
+ };
+ moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
+ /*SourceOffset*/ 4, /*DestOffset*/ 0);
+ moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
+ /*SourceOffset*/ 0, /*DestOffset*/ 4);
+
+ // Now enact all the shuffles we've computed to move the inputs into their
+ // target half.
+ if (!isNoopShuffleMask(PSHUFLMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
+ if (!isNoopShuffleMask(PSHUFHMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
+ if (!isNoopShuffleMask(PSHUFDMask))
+ V = DAG.getBitcast(
+ VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+
+ // At this point, each half should contain all its inputs, and we can then
+ // just shuffle them into their final position.
+ assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
+ "Failed to lift all the high half inputs to the low mask!");
+ assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
+ "Failed to lift all the low half inputs to the high mask!");
+
+ // Do a half shuffle for the low mask.
+ if (!isNoopShuffleMask(LoMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
+
+ // Do a half shuffle with the high mask after shifting its values down.
+ for (int &M : HiMask)
+ if (M >= 0)
+ M -= 4;
+ if (!isNoopShuffleMask(HiMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
+
+ return V;
+}
+
+/// Lowers a shuffle as one PSHUFB per contributing input, combining the two
+/// results with an OR only when both inputs actually supply bytes.
+///
+/// \p Mask is expanded to byte granularity (16 control bytes per input).
+/// Undef mask elements become undef control bytes in both PSHUFB masks. For
+/// every other byte, the input that does not supply it -- or both inputs when
+/// the element is flagged in \p Zeroable -- receives the 0x80 control byte.
+///
+/// \p V1InUse and \p V2InUse are outputs: each ends up true iff the matching
+/// control mask holds at least one byte other than 0x80.
+///
+/// \returns the shuffled (and, if needed, OR-ed) value bitcast to \p VT.
+static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
+    bool &V2InUse) {
+  // Control byte handed to the input that must not supply a given byte.
+  const int ZeroByte = 0x80;
+  const int NumElts = Mask.size();
+  const int BytesPerElt = 16 / NumElts;
+
+  V1InUse = false;
+  V2InUse = false;
+
+  SDValue V1Mask[16];
+  SDValue V2Mask[16];
+  for (int Byte = 0; Byte != 16; ++Byte) {
+    const int Elt = Byte / BytesPerElt;
+    const int M = Mask[Elt];
+    if (M < 0) {
+      V1Mask[Byte] = V2Mask[Byte] = DAG.getUNDEF(MVT::i8);
+      continue;
+    }
+
+    // Start with both inputs masked off and enable only the real source,
+    // unless the whole element is known to be zeroable.
+    int V1Idx = ZeroByte;
+    int V2Idx = ZeroByte;
+    if (!Zeroable[Elt]) {
+      const int ByteInElt = Byte % BytesPerElt;
+      if (M < NumElts)
+        V1Idx = M * BytesPerElt + ByteInElt;
+      else
+        V2Idx = (M - NumElts) * BytesPerElt + ByteInElt;
+    }
+
+    V1Mask[Byte] = DAG.getConstant(V1Idx, DL, MVT::i8);
+    V2Mask[Byte] = DAG.getConstant(V2Idx, DL, MVT::i8);
+    V1InUse |= (V1Idx != ZeroByte);
+    V2InUse |= (V2Idx != ZeroByte);
+  }
+
+  // Only emit a PSHUFB for an input that contributes at least one byte.
+  if (V1InUse)
+    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+                     DAG.getBitcast(MVT::v16i8, V1),
+                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
+  if (V2InUse)
+    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+                     DAG.getBitcast(MVT::v16i8, V2),
+                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
+
+  // Pick the single shuffled input, or OR the two when both are needed.
+  SDValue Blended;
+  if (!V1InUse)
+    Blended = V2;
+  else if (!V2InUse)
+    Blended = V1;
+  else
+    Blended = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+
+  // Hand the result back in the type the caller asked for.
+  return DAG.getBitcast(VT, Blended);
+}
+
+/// \brief Generic lowering of 8-lane i16 shuffles.
+///
+/// Covers both single-input shuffles and two-input shuffle/blends. A zero/any
+/// extend lowering is always tried first. When no mask element refers to V2
+/// the routine tries broadcast, shift, UNPCK and byte-rotate lowerings and
+/// otherwise hands a mutable copy of the mask to the dedicated single-input
+/// routine.
+///
+/// Two-input shuffles walk a fixed list of strategies (shift, SSE4A,
+/// element insertion, blend, bit mask, UNPCK, byte rotate, bit blend,
+/// permute+unpack, PSHUFB blend) and finally fall back to decomposing the
+/// shuffle into single-input permutes plus a blend.
+static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  const MVT VT = MVT::v8i16;
+  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  // A zext is strictly faster than any alternative, so try it before
+  // anything else.
+  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
+    return Extended;
+  }
+
+  const int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
+
+  if (NumV2Inputs == 0) {
+    // Single-input path.
+
+    // A splat of one element?
+    if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+                                                      Subtarget, DAG)) {
+      return Splat;
+    }
+
+    // A plain shift of V1?
+    if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V1, Mask,
+                                                    Zeroable, Subtarget, DAG)) {
+      return Shifted;
+    }
+
+    // A mask matching one of the unpack patterns?
+    if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2,
+                                                       DAG)) {
+      return Unpacked;
+    }
+
+    // A byte rotation of V1?
+    if (SDValue Rotated = lowerVectorShuffleAsByteRotate(DL, VT, V1, V1, Mask,
+                                                         Subtarget, DAG)) {
+      return Rotated;
+    }
+
+    // The general single-input routine rewrites the mask, so give it its own
+    // copy.
+    SmallVector<int, 8> MaskCopy(Mask.begin(), Mask.end());
+    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V1, MaskCopy,
+                                                     Subtarget, DAG);
+  }
+
+  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
+         "All single-input shuffles should be canonicalized to be V1-input "
+         "shuffles.");
+
+  // A shift across the two inputs?
+  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG)) {
+    return Shifted;
+  }
+
+  // SSE4A extraction / insertion, when the subtarget has it.
+  if (Subtarget.hasSSE4A()) {
+    if (SDValue Inserted = lowerVectorShuffleWithSSE4A(DL, VT, V1, V2, Mask,
+                                                       Zeroable, DAG)) {
+      return Inserted;
+    }
+  }
+
+  // Blends of exactly one V2 element have special lowerings.
+  if (NumV2Inputs == 1) {
+    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
+            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
+      return Inserted;
+    }
+  }
+
+  // Every blend-related decision below must use this *exact* same predicate.
+  const bool IsBlendSupported = Subtarget.hasSSE41();
+  if (IsBlendSupported) {
+    if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
+                                                    Zeroable, Subtarget, DAG)) {
+      return Blended;
+    }
+  }
+
+  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask,
+                                                   Zeroable, DAG)) {
+    return Masked;
+  }
+
+  // A mask matching one of the unpack patterns?
+  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2,
+                                                     DAG)) {
+    return Unpacked;
+  }
+
+  // A byte rotation across the two inputs?
+  if (SDValue Rotated = lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask,
+                                                       Subtarget, DAG)) {
+    return Rotated;
+  }
+
+  if (SDValue BitBlended = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask,
+                                                        DAG)) {
+    return BitBlended;
+  }
+
+  // Permute the inputs so that a single unpack finishes the job.
+  if (SDValue Unpacked = lowerVectorShuffleAsPermuteAndUnpack(DL, VT, V1, V2,
+                                                              Mask, DAG)) {
+    return Unpacked;
+  }
+
+  // Without a direct blend, PSHUFB is preferable when available because it
+  // both shuffles and prepares the (otherwise inefficient) blend.
+  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
+    bool UsesV1, UsesV2;
+    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, VT, V1, V2, Mask, Zeroable,
+                                              DAG, UsesV1, UsesV2);
+  }
+
+  // Last resort: single-input permutes followed by a blend; a bit-blend is
+  // always possible.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
+/// \brief Decide whether a shuffle is a "take every 2^N-th element"
+/// compaction and, if so, report N.
+///
+/// Each defined mask entry at position i must equal (i * 2^N) modulo the
+/// number of addressable input elements (the mask size for a single input,
+/// twice that for two inputs). Example masks:
+///
+///   N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
+///   N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+///   N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
+///   N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
+///   N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
+///   N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
+///
+/// Undef (negative) entries match any pattern.
+///
+/// Only N <= 3 is supported.
+/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
+/// for larger N.
+///
+/// \returns the smallest N in [1, 3] consistent with every defined entry, or
+/// zero when no such N exists.
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
+                                          bool IsSingleInput) {
+  const int NumElts = Mask.size();
+
+  // Mask entries index into one input or into the concatenation of both.
+  const int ShuffleModulus = IsSingleInput ? NumElts : NumElts * 2;
+  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+         "We should only be called with masks with a power-of-2 size!");
+
+  const uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+  // All three strides (2^1, 2^2, 2^3) are tracked at once because undef
+  // lanes can leave the choice ambiguous until later entries are seen.
+  bool ViableForN[3] = {true, true, true};
+  const unsigned NumStrides = array_lengthof(ViableForN);
+
+  for (int Idx = 0; Idx < NumElts; ++Idx) {
+    const int M = Mask[Idx];
+
+    // Undef lanes are optimistically assumed to fit.
+    if (M < 0)
+      continue;
+
+    unsigned NumStillViable = 0;
+    for (unsigned S = 0; S != NumStrides; ++S) {
+      if (!ViableForN[S])
+        continue;
+
+      // The entry has to equal (Idx * 2^(S+1)) % ShuffleModulus.
+      const uint64_t Expected = ((uint64_t)Idx << (S + 1)) & ModMask;
+      if ((uint64_t)M == Expected)
+        ++NumStillViable;
+      else
+        ViableForN[S] = false;
+    }
+
+    // Every stride has been ruled out; nothing further can match.
+    if (NumStillViable == 0)
+      return 0;
+  }
+
+  // Report the smallest stride exponent that survived.
+  for (unsigned S = 0; S != NumStrides; ++S) {
+    if (ViableForN[S])
+      return S + 1;
+  }
+
+  // No power-of-two stride fits the mask.
+  return 0;
+}
+
+/// \brief Generic lowering of v16i8 shuffles (\p Mask has 16 entries).
+///
+/// Tries, in order: shift, byte rotate, zero/any-extend, SSE4A (if available),
+/// single-input broadcast and i16 widening via byte duplication, bit mask,
+/// UNPCK, PSHUFB (SSSE3), element insertion, bit blend, PACKUS compaction and
+/// decomposed blends. The final single-input fallback spreads the bytes over
+/// two v8i16 vectors, shuffles those, and PACKUS-es them back together.
+static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to use a zext lowering.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget.hasSSE4A())
+ if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, DAG))
+ return V;
+
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
+
+ // For single-input shuffles, there are some nicer lowering tricks we can use.
+ if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+ // Notably, this handles splat and partial-splat shuffles more efficiently.
+ // However, it only makes sense if the pre-duplication shuffle simplifies
+ // things significantly. Currently, this means we need to be able to
+ // express the pre-duplication shuffle as an i16 shuffle.
+ //
+ // FIXME: We should check for other patterns which can be widened into an
+ // i16 shuffle as well.
+ auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+ for (int i = 0; i < 16; i += 2) // Each byte pair must agree unless one is undef.
+ if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
+ return false;
+
+ return true;
+ };
+ auto tryToWidenViaDuplication = [&]() -> SDValue {
+ if (!canWidenViaDuplication(Mask))
+ return SDValue();
+ SmallVector<int, 4> LoInputs; // Distinct source bytes taken from bytes 0-7.
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
+ std::sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+ LoInputs.end());
+ SmallVector<int, 4> HiInputs; // Distinct source bytes with index >= 8.
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+ [](int M) { return M >= 8; });
+ std::sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+ HiInputs.end());
+
+ bool TargetLo = LoInputs.size() >= HiInputs.size(); // Gather into the half that already holds at least as many inputs.
+ ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+ ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+ int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ SmallDenseMap<int, int, 8> LaneMap; // Source byte index -> byte index after PreDupI16Shuffle.
+ for (int I : InPlaceInputs) {
+ PreDupI16Shuffle[I/2] = I/2;
+ LaneMap[I] = I;
+ }
+ int j = TargetLo ? 0 : 4, je = j + 4; // Candidate i16 slots [j, je) of the target half.
+ for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+ // Check if j is already a shuffle of this input. This happens when
+ // there are two adjacent bytes after we move the low one.
+ if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+ // If we haven't yet mapped the input, search for a slot into which
+ // we can map it.
+ while (j < je && PreDupI16Shuffle[j] >= 0)
+ ++j;
+
+ if (j == je)
+ // No free i16 slot is left in the target half, so give up on this lowering.
+ return SDValue();
+
+ // Map this input with the i16 shuffle.
+ PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+ }
+
+ // Update the lane map based on the mapping we ended up with.
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
+ }
+ V1 = DAG.getBitcast(
+ MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
+
+ // Unpack the bytes to form the i16s that will be shuffled into place.
+ V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ MVT::v16i8, V1, V1);
+
+ int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] >= 0) {
+ int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); // Rebase the byte index onto the half that was unpacked.
+ assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
+ if (PostDupI16Shuffle[i / 2] < 0)
+ PostDupI16Shuffle[i / 2] = MappedMask;
+ else
+ assert(PostDupI16Shuffle[i / 2] == MappedMask &&
+ "Conflicting entrties in the original shuffle!");
+ }
+ return DAG.getBitcast(
+ MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+ };
+ if (SDValue V = tryToWidenViaDuplication())
+ return V;
+ }
+
+ if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+ // with PSHUFB. It is important to do this before we attempt to generate any
+ // blends but after all of the single-input lowerings. If the single input
+ // lowerings can find an instruction sequence that is faster than a PSHUFB, we
+ // want to preserve that and we can DAG combine any longer sequences into
+ // a PSHUFB in the end. But once we start blending from multiple inputs,
+ // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+ // and there are *very* few patterns that would actually be faster than the
+ // PSHUFB approach because of its ability to zero lanes.
+ //
+ // FIXME: The only exceptions to the above are blends which are exact
+ // interleavings with direct instructions supporting them. We currently don't
+ // handle those well here.
+ if (Subtarget.hasSSSE3()) {
+ bool V1InUse = false;
+ bool V2InUse = false;
+
+ SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
+
+ // If both V1 and V2 are in use and we can use a direct blend or an unpack,
+ // do so. This avoids using them to handle blends-with-zero which is
+ // important as a single pshufb is significantly faster for that.
+ if (V1InUse && V2InUse) {
+ if (Subtarget.hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // We can use an unpack to do the blending rather than an or in some
+ // cases. Even though the or may be (very minorly) more efficient, we
+ // prefer this lowering because there are common cases where part of
+ // the complexity of the shuffles goes away when we do the final blend as
+ // an unpack.
+ // FIXME: It might be worth trying to detect if the unpack-feeding
+ // shuffles will both be pshufb, in which case we shouldn't bother with
+ // this.
+ if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+ DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Unpack;
+ }
+
+ return PSHUFB;
+ }
+
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Elements == 1)
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return V;
+
+ if (SDValue BitBlend =
+ lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return BitBlend;
+
+ // Check whether a compaction lowering can be done. This handles shuffles
+ // which take every 2nd, 4th or 8th element. See the helper function for
+ // details.
+ //
+ // We special case these as they can be particularly efficiently handled with
+ // the PACKUS operation on x86 and they show up in common patterns of
+ // rearranging bytes to truncate wide elements.
+ bool IsSingleInput = V2.isUndef();
+ if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
+ // NumEvenDrops is the power of two stride of the elements. Another way of
+ // thinking about it is that we need to drop the even elements this many
+ // times to get the original input.
+
+ // First we need to zero all the dropped bytes.
+ assert(NumEvenDrops <= 3 &&
+ "No support for dropping even elements more than 3 times.");
+ // We use the mask type to pick which bytes are preserved based on how many
+ // elements are dropped.
+ MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; // Indexed by NumEvenDrops - 1.
+ SDValue ByteClearMask = DAG.getBitcast(
+ MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
+ if (!IsSingleInput)
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
+
+ // Now pack things back together.
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
+ for (int i = 1; i < NumEvenDrops; ++i) { // One extra self-pack per additional drop.
+ Result = DAG.getBitcast(MVT::v8i16, Result);
+ Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
+ }
+
+ return Result;
+ }
+
+ // Handle multi-input cases by blending single-input shuffles.
+ if (NumV2Elements > 0)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
+ Mask, DAG);
+
+ // The fallback path for single-input shuffles widens this into two v8i16
+ // vectors with unpacks, shuffles those, and then pulls them back together
+ // with a pack.
+ SDValue V = V1;
+
+ std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
+ std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
+ for (int i = 0; i < 16; ++i) // Split the mask into its low and high eight lanes.
+ if (Mask[i] >= 0)
+ (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
+
+ SDValue VLoHalf, VHiHalf;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ VLoHalf = DAG.getBitcast(MVT::v8i16, V);
+ VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
+ DAG.getConstant(0x00FF, DL, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
+ VHiHalf = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into VLoHalf.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+ // Otherwise just unpack the low half of V into VLoHalf and the high half into
+ // VHiHalf so that we can blend them as i16s.
+ SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
+
+ VLoHalf = DAG.getBitcast(
+ MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ VHiHalf = DAG.getBitcast(
+ MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
+
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
+
+ return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); // Pack the two i16 shuffles back into one v16i8.
+}
+
+/// \brief Entry point for lowering 128-bit x86 vector shuffles.
+///
+/// Selects the element-type-specific lowering routine from \p VT and forwards
+/// all arguments to it. Handles v2i64, v2f64, v4i32, v4f32, v8i16 and v16i8;
+/// any other type is treated as unreachable.
+static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                        MVT VT, SDValue V1, SDValue V2,
+                                        const SmallBitVector &Zeroable,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  const auto Ty = VT.SimpleTy;
+
+  // 64-bit elements.
+  if (Ty == MVT::v2i64)
+    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  if (Ty == MVT::v2f64)
+    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+  // 32-bit elements.
+  if (Ty == MVT::v4i32)
+    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  if (Ty == MVT::v4f32)
+    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+  // 16- and 8-bit elements.
+  if (Ty == MVT::v8i16)
+    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  if (Ty == MVT::v16i8)
+    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+  llvm_unreachable("Unimplemented!");
+}
+
+/// \brief Generic routine to split vector shuffle into half-sized shuffles.
+///
+/// This routine just extracts two subvectors, shuffles them independently, and
+/// then concatenates them back together. This should work effectively with all
+/// AVX vector shuffle types.
+static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT.getSizeInBits() >= 256 &&
+ "Only for 256-bit or wider vector shuffles!");
+ assert(V1.getSimpleValueType() == VT && "Bad operand type!");
+ assert(V2.getSimpleValueType() == VT && "Bad operand type!");
+
+ ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
+ ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
+
+ int NumElements = VT.getVectorNumElements();
+ int SplitNumElements = NumElements / 2;
+ MVT ScalarVT = VT.getVectorElementType();
+ MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
+
+ // Rather than splitting build-vectors, just build two narrower build
+ // vectors. This helps shuffling with splats and zeros.
+ auto SplitVector = [&](SDValue V) {
+ V = peekThroughBitcasts(V);
+
+ MVT OrigVT = V.getSimpleValueType();
+ int OrigNumElements = OrigVT.getVectorNumElements();
+ int OrigSplitNumElements = OrigNumElements / 2;
+ MVT OrigScalarVT = OrigVT.getVectorElementType();
+ MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
+
+ SDValue LoV, HiV;
+
+ auto *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV) {
+ LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(0, DL));
+ HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(OrigSplitNumElements, DL));
+ } else {
+
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (int i = 0; i < OrigSplitNumElements; ++i) {
+ LoOps.push_back(BV->getOperand(i));
+ HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
+ }
+ LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
+ HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
+ }
+ return std::make_pair(DAG.getBitcast(SplitVT, LoV),
+ DAG.getBitcast(SplitVT, HiV));
+ };
+
+ SDValue LoV1, HiV1, LoV2, HiV2;
+ std::tie(LoV1, HiV1) = SplitVector(V1);
+ std::tie(LoV2, HiV2) = SplitVector(V2);
+
+ // Now create two 4-way blends of these half-width vectors.
+ auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+ bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
+ SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+ for (int i = 0; i < SplitNumElements; ++i) {
+ int M = HalfMask[i];
+ if (M >= NumElements) {
+ if (M >= NumElements + SplitNumElements)
+ UseHiV2 = true;
+ else
+ UseLoV2 = true;
+ V2BlendMask[i] = M - NumElements;
+ BlendMask[i] = SplitNumElements + i;
+ } else if (M >= 0) {
+ if (M >= SplitNumElements)
+ UseHiV1 = true;
+ else
+ UseLoV1 = true;
+ V1BlendMask[i] = M;
+ BlendMask[i] = i;
+ }
+ }
+
+ // Because the lowering happens after all combining takes place, we need to
+ // manually combine these blend masks as much as possible so that we create
+ // a minimal number of high-level vector shuffle nodes.
+
+ // First try just blending the halves of V1 or V2.
+ if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
+ return DAG.getUNDEF(SplitVT);
+ if (!UseLoV2 && !UseHiV2)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ if (!UseLoV1 && !UseHiV1)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+
+ SDValue V1Blend, V2Blend;
+ if (UseLoV1 && UseHiV1) {
+ V1Blend =
+ DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ } else {
+ // We only use half of V1 so map the usage down into the final blend mask.
+ V1Blend = UseLoV1 ? LoV1 : HiV1;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
+ BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
+ }
+ if (UseLoV2 && UseHiV2) {
+ V2Blend =
+ DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+ } else {
+ // We only use half of V2 so map the usage down into the final blend mask.
+ V2Blend = UseLoV2 ? LoV2 : HiV2;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= SplitNumElements)
+ BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
+ }
+ return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
+ };
+ SDValue Lo = HalfBlend(LoMask);
+ SDValue Hi = HalfBlend(HiMask);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+}
+
+/// \brief Choose between splitting a two-input shuffle in half and decomposing
+/// it into per-input shuffles followed by a blend.
+///
+/// This is a generic fallback for non-single-input shuffles of vectors wider
+/// than one 128-bit lane. The decision is made in this order:
+///  - if all referenced V1 elements are one index and all referenced V2
+///    elements are one index (two broadcasts and a blend), decompose;
+///  - otherwise, if each input contributes elements from at most one 128-bit
+///    lane, split the shuffle into halves;
+///  - otherwise, decompose into single-input shuffles and a blend.
+///
+/// V2 must not be undef (asserted), since the decomposed single-input shuffles
+/// must not come back to this routine.
+static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
+                                                SDValue V1, SDValue V2,
+                                                ArrayRef<int> Mask,
+                                                SelectionDAG &DAG) {
+  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
+                          "shuffles as it could then recurse on itself.");
+  const int NumMaskElts = Mask.size();
+
+  // Detect the "broadcast one element of each input, then blend" shape. That
+  // lowering is preferred because broadcasts can often fold with memory
+  // operands. BroadcastIdx[0] / BroadcastIdx[1] hold the single element index
+  // seen so far for V1 / V2 (or -1 when that input is not yet referenced).
+  bool IsTwoBroadcastBlend = true;
+  int BroadcastIdx[2] = {-1, -1};
+  for (int M : Mask) {
+    if (M < 0)
+      continue;
+    const bool FromV2 = M >= NumMaskElts;
+    const int SrcElt = FromV2 ? M - NumMaskElts : M;
+    int &SeenIdx = BroadcastIdx[FromV2 ? 1 : 0];
+    if (SeenIdx < 0) {
+      SeenIdx = SrcElt;
+    } else if (SeenIdx != SrcElt) {
+      IsTwoBroadcastBlend = false;
+      break;
+    }
+  }
+  if (IsTwoBroadcastBlend)
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                      DAG);
+
+  // Record which 128-bit lanes of each input are referenced. When each input
+  // only supplies elements from a single lane, splitting decomposes to
+  // unusually few instructions, so prefer it over blending.
+  const int NumLanes = VT.getSizeInBits() / 128;
+  const int EltsPerLane = NumMaskElts / NumLanes;
+  SmallBitVector UsedLanes[2];
+  for (SmallBitVector &Lanes : UsedLanes)
+    Lanes.resize(NumLanes, false);
+  for (int M : Mask) {
+    if (M < 0)
+      continue;
+    UsedLanes[M / NumMaskElts][(M % NumMaskElts) / EltsPerLane] = true;
+  }
+  const bool AtMostOneLanePerInput =
+      UsedLanes[0].count() <= 1 && UsedLanes[1].count() <= 1;
+  if (AtMostOneLanePerInput)
+    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+  // Everything else goes through decomposed shuffles and a blend. This relies
+  // on the decomposed single-input shuffles not ending up here again.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
+/// \brief Lower a 256-bit vector shuffle crossing 128-bit lanes as
+/// a permutation and blend of those lanes.
+///
+/// This essentially blends the out-of-lane inputs to each lane into the lane
+/// from a permuted copy of the vector. This lowering strategy results in four
+/// instructions in the worst case for a single-input cross lane shuffle which
+/// is lower than any other fully general cross-lane shuffle strategy known to
+/// the original author. Special cases for each particular shuffle pattern should be handled
+/// prior to trying this lowering. Unless elements cross out of both lanes the shuffle is split instead; past that point V2 must be undef (asserted).
+static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // FIXME: This should probably be generalized for 512-bit vectors as well.
+ assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
+ int Size = Mask.size();
+ int LaneSize = Size / 2; // Mask elements per half (one 128-bit lane) of the 256-bit vector.
+
+ // If there are only inputs from one 128-bit lane, splitting will in fact be
+ // less expensive. LaneCrossing[L] is set when some element sourced from lane
+ // L (of either input) is placed in the other lane of the result.
+ bool LaneCrossing[2] = {false, false};
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+ if (!LaneCrossing[0] || !LaneCrossing[1])
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+ assert(V2.isUndef() &&
+ "This last part of this routine only works on single input shuffles");
+
+ SmallVector<int, 32> FlippedBlendMask(Size); // In-lane elements keep their index; lane-crossing ones pick the same in-lane slot from the second shuffle operand (+ Size).
+ for (int i = 0; i < Size; ++i)
+ FlippedBlendMask[i] =
+ Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
+ ? Mask[i]
+ : Mask[i] % LaneSize +
+ (i / LaneSize) * LaneSize + Size);
+
+ // Flip the vector, and blend the results which should now be in-lane. The
+ // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
+ // 5 for the high source. The value 3 selects the high half of source 2 and
+ // the value 2 selects the low half of source 2. We only use source 2 to
+ // allow folding it into a memory operand.
+ unsigned PERMMask = 3 | 2 << 4; // Low result half <- high half of V1, high result half <- low half of V1.
+ SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
+ V1, DAG.getConstant(PERMMask, DL, MVT::i8));
+ return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
+}
+
+/// \brief Handle lowering 2-lane 128-bit shuffles. Tries, in order: a blend, a 128-bit subvector insert (EXTRACT_SUBVECTOR + CONCAT_VECTORS), and finally a VPERM2X128 node; returns SDValue() for the insert patterns on AVX2 when V2 is undef. Mask is indexed as a 4-element mask below.
+static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // TODO: If minimizing size and one of the inputs is a zero vector and the
+ // zero vector has only one use, we could use a VPERM2X128 to save the
+ // instruction bytes needed to explicitly generate the zero vector.
+
+ // Blends are faster and handle all the non-lane-crossing cases.
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // If either input operand is a zero vector, use VPERM2X128 because its mask
+ // allows us to replace the zero input with an implicit zero. The subvector-insert patterns are therefore only tried when neither input is zero.
+ if (!IsV1Zero && !IsV2Zero) {
+ // Check for patterns which can be matched with a single insert of a 128-bit
+ // subvector.
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
+ if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
+ // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
+ if (Subtarget.hasAVX2() && V2.isUndef())
+ return SDValue();
+
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
+ }
+
+ // Otherwise form a 128-bit permutation. After accounting for undefs,
+ // convert the 64-bit shuffle mask selection values into 128-bit
+ // selection bits by dividing the indexes by 2 and shifting into positions
+ // defined by a vperm2*128 instruction's immediate control byte.
+
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
+
+ int MaskLO = Mask[0]; // Representative element for the low result half; falls back to Mask[1], then to 0 if both are undef.
+ if (MaskLO == SM_SentinelUndef)
+ MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
+
+ int MaskHI = Mask[2]; // Same for the high result half, using Mask[2] then Mask[3].
+ if (MaskHI == SM_SentinelUndef)
+ MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
+
+ unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
+
+ // If either input is a zero vector, replace it with an undef input.
+ // Shuffle mask values < 4 are selecting elements of V1.
+ // Shuffle mask values >= 4 are selecting elements of V2.
+ // Adjust each half of the permute mask by clearing the half that was
+ // selecting the zero vector and setting the zero mask bit.
+ if (IsV1Zero) {
+ V1 = DAG.getUNDEF(VT);
+ if (MaskLO < 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI < 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
+ }
+ if (IsV2Zero) {
+ V2 = DAG.getUNDEF(VT);
+ if (MaskLO >= 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI >= 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
+ }
+
+ return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
+ DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
+/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// shuffling each lane.
+///
+/// This will only succeed when the result of fixing the 128-bit lanes results
+/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
+/// each 128-bit lane. This handles many cases where we can quickly blend away
+/// the lane crosses early and then use simpler shuffles within each lane. Returns SDValue() when the mask does not fit this form.
+///
+/// FIXME: It might be worthwhile at some point to support this without
+/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
+/// in x86 only floating point has interesting non-repeating shuffles, and even
+/// those are still *marginally* more expensive.
+static SDValue lowerVectorShuffleByMerging128BitLanes(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(!V2.isUndef() && "This is only useful with multiple inputs.");
+
+ int Size = Mask.size();
+ int LaneSize = 128 / VT.getScalarSizeInBits(); // Elements per 128-bit lane.
+ int NumLanes = Size / LaneSize;
+ assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
+
+ // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
+ // check whether the in-128-bit lane shuffles share a repeating pattern.
+ SmallVector<int, 4> Lanes((unsigned)NumLanes, -1); // Lanes[j]: source lane (counted across V1 then V2) feeding result lane j, or -1 if unset.
+ SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1); // InLaneMask[k]: in-lane source offset shared by slot k of every lane, or -1 if unset.
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ int j = i / LaneSize; // Result lane holding element i.
+
+ if (Lanes[j] < 0) {
+ // First entry we've seen for this lane.
+ Lanes[j] = Mask[i] / LaneSize;
+ } else if (Lanes[j] != Mask[i] / LaneSize) {
+ // This doesn't match the lane selected previously!
+ return SDValue();
+ }
+
+ // Check that within each lane we have a consistent shuffle mask.
+ int k = i % LaneSize; // Slot of element i within its lane.
+ if (InLaneMask[k] < 0) {
+ InLaneMask[k] = Mask[i] % LaneSize;
+ } else if (InLaneMask[k] != Mask[i] % LaneSize) {
+ // This doesn't fit a repeating in-lane mask.
+ return SDValue();
+ }
+ }
+
+ // First shuffle the lanes into place, using a 64-bit element type so that each 128-bit lane is a pair of elements.
+ MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
+ VT.getSizeInBits() / 64);
+ SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
+ for (int i = 0; i < NumLanes; ++i)
+ if (Lanes[i] >= 0) {
+ LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
+ LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
+ }
+
+ V1 = DAG.getBitcast(LaneVT, V1);
+ V2 = DAG.getBitcast(LaneVT, V2);
+ SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
+
+ // Cast it back to the type we actually want.
+ LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
+
+ // Now do a simple single-input shuffle of LaneShuffle that isn't lane crossing.
+ SmallVector<int, 8> NewMask((unsigned)Size, -1);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
+ assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
+ "Must not introduce lane crosses at this point!");
+
+ return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
+}
+
+/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently. Returns SDValue() when neither half of the mask is undef or when a half-width shuffle is judged not worthwhile.
+static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(VT.is256BitVector() && "Expected 256-bit vector");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+ bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+ bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+ if (!UndefLower && !UndefUpper)
+ return SDValue();
+
+ // Upper half is undef and lower half is whole upper subvector.
+ // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ if (UndefUpper &&
+ isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Lower half is undef and upper half is whole lower subvector.
+ // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ // NOTE: despite its name, 'Hi' below holds the lower half of V1 (extracted at index 0).
+ if (UndefLower &&
+ isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ }
+
+ // If the shuffle only uses two of the four halves of the input operands,
+ // then extract them and perform the 'half' shuffle at half width.
+ // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
+ int HalfIdx1 = -1, HalfIdx2 = -1; // The (up to two) source halves used, numbered 0..3 as described below; -1 means unused.
+ SmallVector<int, 8> HalfMask(HalfNumElts);
+ unsigned Offset = UndefLower ? HalfNumElts : 0; // Start of the mask half that is read (the upper half when the lower half is undef).
+ for (unsigned i = 0; i != HalfNumElts; ++i) {
+ int M = Mask[i + Offset];
+ if (M < 0) {
+ HalfMask[i] = M;
+ continue;
+ }
+
+ // Determine which of the 4 half vectors this element is from.
+ // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
+ int HalfIdx = M / HalfNumElts;
+
+ // Determine the element index into its half vector source.
+ int HalfElt = M % HalfNumElts;
+
+ // We can shuffle with up to 2 half vectors, set the new 'half'
+ // shuffle mask accordingly.
+ if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
+ HalfMask[i] = HalfElt;
+ HalfIdx1 = HalfIdx;
+ continue;
+ }
+ if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
+ HalfMask[i] = HalfElt + HalfNumElts;
+ HalfIdx2 = HalfIdx;
+ continue;
+ }
+
+ // Too many half vectors referenced.
+ return SDValue();
+ }
+ assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+ // Only shuffle the halves of the inputs when useful. Count how many of the selected halves are lower (idx 0/2) and upper (idx 1/3) halves.
+ int NumLowerHalves =
+ (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
+ int NumUpperHalves =
+ (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
+
+ // uuuuXXXX - don't extract uppers just to insert again.
+ if (UndefLower && NumUpperHalves != 0)
+ return SDValue();
+
+ // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
+ if (UndefUpper && NumUpperHalves == 2)
+ return SDValue();
+
+ // AVX2 - XXXXuuuu - always extract lowers.
+ if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
+ // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64)
+ return SDValue();
+ // AVX2 supports variable 32-bit element cross-lane shuffles.
+ if (VT == MVT::v8f32 || VT == MVT::v8i32) {
+ // XXXXuuuu - don't extract lowers and uppers.
+ if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
+ return SDValue();
+ }
+ }
+
+ auto GetHalfVector = [&](int HalfIdx) { // Returns the requested half (0..3) of V1/V2 as a HalfVT value, or UNDEF for a negative index.
+ if (HalfIdx < 0)
+ return DAG.getUNDEF(HalfVT);
+ SDValue V = (HalfIdx < 2 ? V1 : V2);
+ HalfIdx = (HalfIdx % 2) * HalfNumElts;
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+ DAG.getIntPtrConstant(HalfIdx, DL));
+ };
+
+ SDValue Half1 = GetHalfVector(HalfIdx1);
+ SDValue Half2 = GetHalfVector(HalfIdx2);
+ SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+ DAG.getIntPtrConstant(Offset, DL));
+}
+
+/// \brief Check whether every element taken from the given input (0 or 1)
+/// already sits in the slot the mask asks for.
+///
+/// Returns true when no mask entry that selects from \p Input would move that
+/// element to a different position, i.e. the input needs no permutation and
+/// can simply be blended in place. Undef (negative) entries and entries that
+/// select from the other input are ignored.
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
+  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+  const int NumElts = Mask.size();
+  int Slot = 0;
+  for (int M : Mask) {
+    // Only defined entries that come from the requested input matter.
+    const bool FromInput = M >= 0 && M / NumElts == Input;
+    if (FromInput && M % NumElts != Slot)
+      return false;
+    ++Slot;
+  }
+  return true;
+}
+
+/// Handle case where shuffle sources are coming from the same 128-bit lane and
+/// every lane can be represented as the same repeating mask - allowing us to
+/// shuffle the sources with the repeating shuffle and then permute the result
+/// to the destination lanes. Emits two generic vector shuffles (a repeated in-lane shuffle, then a lane/sub-lane permute), or returns SDValue() if the mask does not fit.
+static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+
+ // On AVX2 we may be able to just shuffle the lowest elements and then
+ // broadcast the result.
+ if (Subtarget.hasAVX2()) {
+ for (unsigned BroadcastSize : {16, 32, 64}) {
+ if (BroadcastSize <= VT.getScalarSizeInBits())
+ continue;
+ int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
+
+ // Attempt to match a pattern that repeats every NumBroadcastElts,
+ // accounting for UNDEFs, and that only references the lowest 128-bit
+ // lane of the inputs. On success RepeatMask[0..NumBroadcastElts) holds the pattern.
+ auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
+ for (int i = 0; i != NumElts; i += NumBroadcastElts)
+ for (int j = 0; j != NumBroadcastElts; ++j) {
+ int M = Mask[i + j];
+ if (M < 0)
+ continue;
+ int &R = RepeatMask[j];
+ if (0 != ((M % NumElts) / NumLaneElts))
+ return false;
+ if (0 <= R && R != M)
+ return false;
+ R = M;
+ }
+ return true;
+ };
+
+ SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
+ if (!FindRepeatingBroadcastMask(RepeatMask))
+ continue;
+
+ // Shuffle the (lowest) repeated elements in place for broadcast.
+ SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
+
+ // Shuffle the actual broadcast: every group of NumBroadcastElts reads elements 0..NumBroadcastElts-1 of RepeatShuf.
+ SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumBroadcastElts)
+ for (int j = 0; j != NumBroadcastElts; ++j)
+ BroadcastMask[i + j] = j;
+ return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
+ BroadcastMask);
+ }
+ }
+
+ // Bail if the shuffle mask doesn't cross 128-bit lanes.
+ if (!is128BitLaneCrossingShuffleMask(VT, Mask))
+ return SDValue();
+
+ // Bail if we already have a repeated lane shuffle mask.
+ SmallVector<int, 8> RepeatedShuffleMask;
+ if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
+ return SDValue();
+
+ // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
+ // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
+ int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
+ int NumSubLanes = NumLanes * SubLaneScale;
+ int NumSubLaneElts = NumLaneElts / SubLaneScale;
+
+ // Check that all the sources are coming from the same lane and see if we can
+ // form a repeating shuffle mask (local to each sub-lane). At the same time,
+ // determine the source sub-lane for each destination sub-lane.
+ int TopSrcSubLane = -1;
+ SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
+ SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
+
+ for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
+ // Extract the sub-lane mask, check that it all comes from the same lane
+ // and normalize the mask entries to come from the first lane.
+ int SrcLane = -1;
+ SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
+ if (M < 0)
+ continue;
+ int Lane = (M % NumElts) / NumLaneElts;
+ if ((0 <= SrcLane) && (SrcLane != Lane))
+ return SDValue();
+ SrcLane = Lane;
+ int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); // In-lane offset, keeping the V1/V2 distinction via + NumElts.
+ SubLaneMask[Elt] = LocalM;
+ }
+
+ // Whole sub-lane is UNDEF.
+ if (SrcLane < 0)
+ continue;
+
+ // Attempt to match against the candidate repeated sub-lane masks.
+ for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
+ auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) { // True if the two masks agree wherever both are defined.
+ for (int i = 0; i != NumSubLaneElts; ++i) {
+ if (M1[i] < 0 || M2[i] < 0)
+ continue;
+ if (M1[i] != M2[i])
+ return false;
+ }
+ return true;
+ };
+
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
+ if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
+ continue;
+
+ // Merge the sub-lane mask into the matching repeated sub-lane mask.
+ for (int i = 0; i != NumSubLaneElts; ++i) {
+ int M = SubLaneMask[i];
+ if (M < 0)
+ continue;
+ assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
+ "Unexpected mask element");
+ RepeatedSubLaneMask[i] = M;
+ }
+
+ // Track the top most source sub-lane - by setting the remaining to UNDEF
+ // we can greatly simplify shuffle matching.
+ int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
+ TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
+ Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
+ break;
+ }
+
+ // Bail if we failed to find a matching repeated sub-lane mask.
+ if (Dst2SrcSubLanes[DstSubLane] < 0)
+ return SDValue();
+ }
+ assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
+ "Unexpected source lane");
+
+ // Create a repeating shuffle mask for the entire vector; sub-lanes above TopSrcSubLane are left undef.
+ SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
+ for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
+ int Lane = SubLane / SubLaneScale;
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = RepeatedSubLaneMask[Elt];
+ if (M < 0)
+ continue;
+ int Idx = (SubLane * NumSubLaneElts) + Elt;
+ RepeatedMask[Idx] = M + (Lane * NumLaneElts);
+ }
+ }
+ SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
+
+ // Shuffle each source sub-lane to its destination.
+ SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumSubLaneElts) {
+ int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
+ if (SrcSubLane < 0)
+ continue;
+ for (int j = 0; j != NumSubLaneElts; ++j)
+ SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
+ }
+
+ return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
+ SubLaneMask);
+}
+
+/// Try to express \p Mask as a SHUFPD-style shuffle of f64 elements.
+///
+/// Result element i must come from the same element pair as i: from the first
+/// operand for even i and from the second operand for odd i. If the mask only
+/// fits with the operands exchanged, \p V1 and \p V2 are swapped in place.
+///
+/// \param ShuffleImm receives one bit per result element (bit i is the parity
+///        of Mask[i]); it is reset to 0 on entry and is only meaningful when
+///        the function returns true.
+/// \returns true if the mask matches (directly or after swapping the
+///          operands), false otherwise, including when the mask contains a
+///          negative entry other than SM_SentinelUndef.
+static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
+                                         unsigned &ShuffleImm,
+                                         ArrayRef<int> Mask) {
+  int NumElts = VT.getVectorNumElements();
+  assert(VT.getScalarType() == MVT::f64 &&
+         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
+         "Unexpected data type for VSHUFPD");
+
+  // Accepted (direct) mask values per position:
+  //   V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
+  //   V4F64: 0/1, 4/5, 2/3, 6/7
+  ShuffleImm = 0;
+  bool MatchesDirect = true;
+  bool MatchesCommuted = true;
+  for (int i = 0; i < NumElts; ++i) {
+    const int M = Mask[i];
+    if (M == SM_SentinelUndef)
+      continue;
+    // Any other negative sentinel cannot be matched.
+    if (M < 0)
+      return false;
+
+    const int Odd = i & 1;
+    // Lowest acceptable index with the operands as given, and with the
+    // operands exchanged.
+    const int DirectLo = (i & 6) + NumElts * Odd;
+    const int CommutedLo = (i & 0xe) + NumElts * (Odd ^ 1);
+    MatchesDirect = MatchesDirect && (DirectLo <= M && M <= DirectLo + 1);
+    MatchesCommuted =
+        MatchesCommuted && (CommutedLo <= M && M <= CommutedLo + 1);
+    ShuffleImm |= (M % 2) << i;
+  }
+
+  if (MatchesDirect)
+    return true;
+  if (!MatchesCommuted)
+    return false;
+
+  // Only the commuted form fits: exchange the operands for the caller.
+  std::swap(V1, V2);
+  return true;
+}
+
+/// Lower a shuffle to an X86ISD::SHUFP node when the mask fits the SHUFPD
+/// pattern accepted by matchVectorShuffleWithSHUFPD.
+///
+/// The matcher may swap the local copies of \p V1 and \p V2 so that the
+/// operands are emitted in the order the immediate expects. Returns an empty
+/// SDValue when the mask does not match.
+static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
+  unsigned Imm = 0;
+  if (matchVectorShuffleWithSHUFPD(VT, V1, V2, Imm, Mask))
+    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                       DAG.getConstant(Imm, DL, MVT::i8));
+
+  return SDValue();
+}
+
+/// Lower a shuffle to a variable-permute node whose indices are given by a
+/// constant vector built from \p Mask.
+///
+/// The index vector uses integer elements of the same bit width and count as
+/// \p VT. A single-input shuffle (\p V2 undef) becomes X86ISD::VPERMV; a
+/// two-input shuffle becomes X86ISD::VPERMV3. Note the differing operand
+/// order of the two nodes.
+static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
+                                           ArrayRef<int> Mask, SDValue V1,
+                                           SDValue V2, SelectionDAG &DAG) {
+  const unsigned EltBits = VT.getScalarSizeInBits();
+  MVT IndexVecVT =
+      MVT::getVectorVT(MVT::getIntegerVT(EltBits), VT.getVectorNumElements());
+  SDValue Indices = getConstVector(Mask, IndexVecVT, DAG, DL, true);
+
+  return V2.isUndef()
+             ? DAG.getNode(X86ISD::VPERMV, DL, VT, Indices, V1)
+             : DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, Indices, V2);
+}
+
+/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
+/// isn't available. Strategies are tried in the order written below; the first one that yields a node wins.
+static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SmallVector<int, 4> WidenedMask; // Only used as a widenability test; the original Mask is what gets passed on.
+ if (canWidenShuffleElements(Mask, WidenedMask))
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return V;
+
+ if (V2.isUndef()) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
+ if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
+ // Non-half-crossing single input shuffles can be lowered with an
+ // interleaved permutation. Bit i of the immediate is set when result element i takes the odd element of its lane.
+ unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
+ DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+ }
+
+ // With AVX2 we have direct support for this permutation.
+ if (Subtarget.hasAVX2())
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle
+ // the results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // Otherwise, fall back.
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
+ DAG);
+ }
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ return V;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Check if the blend happens to exactly fit that of SHUFPD.
+ if (SDValue Op =
+ lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ return Op;
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle
+ // the results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle. However, if we have AVX2 and either input is already in place,
+ // we will be able to shuffle the other input, even across lanes, in a single
+ // instruction so skip this pattern.
+ if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // If we have AVX2 then we always want to lower with a blend because on a v4
+ // type we can fully permute the elements.
+ if (Subtarget.hasAVX2())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
+ Mask, DAG);
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 (asserted below) and thus a
+/// reasonable instruction set for v4i64 shuffling. Strategies are tried in the order written; the first one that yields a node wins.
+static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+ assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
+
+ SmallVector<int, 4> WidenedMask; // Only used as a widenability test; the original Mask is what gets passed on.
+ if (canWidenShuffleElements(Mask, WidenedMask))
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return V;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ if (V2.isUndef()) {
+ // When the shuffle is mirrored between the 128-bit lanes of the unit, we
+ // can use lower latency instructions that will operate on both lanes.
+ SmallVector<int, 2> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
+ SmallVector<int, 4> PSHUFDMask;
+ scaleShuffleMask(2, RepeatedMask, PSHUFDMask); // Scale by 2 so the mask addresses the 32-bit elements of the v8i32 bitcast used below.
+ return DAG.getBitcast(
+ MVT::v4i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
+ DAG.getBitcast(MVT::v8i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+
+ // AVX2 provides a direct instruction for permuting a single input across
+ // lanes.
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // If we have VLX support, we can use VALIGN.
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to use PALIGNR.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle. However, since we have AVX2, if either input is already in place
+ // we will be able to shuffle the other input, even across lanes, in a single
+ // instruction so skip this pattern.
+ if (!isShuffleMaskInputInPlace(0, Mask) &&
+ !isShuffleMaskInputInPlace(1, Mask))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 32-bit floating point (v8f32) shuffles.
+///
+/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
+/// isn't available, since those are bitcast to the floating point domain.
+static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Check whether this can be lowered as a broadcast of a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // If the shuffle mask is repeated in each 128-bit lane, we have many more
+ // options to efficiently lower the shuffle. This branch always returns.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 &&
+ "Repeated masks must be half the mask width!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
+ return V;
+
+ // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+ // have already handled any direct blends.
+ return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
+ }
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // If we have a single input shuffle with different patterns in the two
+ // 128-bit lanes, use a variable mask (VPERMILPV in-lane, VPERMV with AVX2).
+ if (V2.isUndef()) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
+ return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
+
+ if (Subtarget.hasAVX2())
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
+
+ // Otherwise, fall back to a lane permute followed by a blend.
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
+ DAG);
+ }
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // If we have AVX2 then we always want to lower with a blend because at v8 we
+ // can fully permute the elements.
+ if (Subtarget.hasAVX2())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG);
+
+ // Otherwise fall back on generic lowering (split or blend).
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 32-bit integer (v8i32) shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v8i32 shuffling.
+static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Check whether this can be lowered as a broadcast of a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // If the shuffle mask is repeated in each 128-bit lane we can use more
+ // efficient instructions that mirror the shuffles across the two 128-bit
+ // lanes. The result of this test is reused for the SHUFPS check below.
+ SmallVector<int, 4> RepeatedMask;
+ bool Is128BitLaneRepeatedShuffle =
+ is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
+ if (Is128BitLaneRepeatedShuffle) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ return V;
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // If we have VLX support, we can try to use VALIGN (element rotation).
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // If the shuffle patterns aren't repeated but it is a single input, directly
+ // generate a cross-lane VPERMD instruction (VPERMV with a constant mask).
+ if (V2.isUndef()) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
+ }
+
+ // Assume that a single SHUFPS is faster than an alternative sequence of
+ // multiple instructions (even if the CPU has a domain penalty).
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+ if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+ SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
+ SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
+ CastV1, CastV2, DAG);
+ return DAG.getBitcast(MVT::v8i32, ShufPS);
+ }
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on the generic decomposed shuffle-and-blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 16-bit integer (v16i16) shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v16i16 shuffling.
+static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Check whether this can be lowered as a broadcast of a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ if (V2.isUndef()) {
+ // There are no generalized cross-lane shuffle operations available on i16
+ // element types, so permute the lanes first and blend afterwards.
+ if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+ Mask, DAG);
+
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v16 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+ }
+
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+ DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ return PSHUFB;
+
+ // AVX512BWVL can lower to VPERMW; with both features this always returns.
+ if (Subtarget.hasBWI() && Subtarget.hasVLX())
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic lowering (split or blend).
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 32-lane 8-bit integer (v32i8) shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v32i8 shuffling.
+static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+ assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Check whether this can be lowered as a broadcast of a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // There are no generalized cross-lane shuffle operations available on i8
+ // element types, so a single-input lane-crossing mask is permuted and blended.
+ if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
+ DAG);
+
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+ DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ return PSHUFB;
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic lowering (split or blend).
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
+}
+
+/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
+///
+/// This routine either breaks down the specific type of a 256-bit x86 vector
+/// shuffle or splits it into two 128-bit shuffles and fuses the results back
+/// together based on the available instructions.
+static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const SmallBitVector &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // If exactly one element comes from V2 and it lands in element zero, try to
+ // insert it into V1 directly if we can do so cheaply.
+ int NumElts = VT.getVectorNumElements();
+ int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
+
+ if (NumV2Elements == 1 && Mask[0] >= NumElts)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+
+ // Handle special cases where the lower or upper half is UNDEF.
+ if (SDValue V =
+ lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // There is a really nice hard cut-over between AVX1 and AVX2 that means we
+ // can check for those subtargets here and avoid much of the subtarget
+ // querying in the per-vector-type lowering routines. With AVX1 we have
+ // essentially *zero* ability to manipulate a 256-bit vector with integer
+ // types. Since we'll use floating point types there eventually, just
+ // immediately cast everything to a float and operate entirely in that domain.
+ if (VT.isInteger() && !Subtarget.hasAVX2()) {
+ int ElementBits = VT.getScalarSizeInBits();
+ if (ElementBits < 32) {
+ // No floating point type is available for elements this narrow; if we
+ // can't use bit operations to mask/blend, decompose into 128-bit vectors.
+ if (SDValue V =
+ lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+ return V;
+ if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return V;
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
+
+ MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
+ VT.getVectorNumElements());
+ V1 = DAG.getBitcast(FpVT, V1);
+ V2 = DAG.getBitcast(FpVT, V2);
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
+ }
+
+ switch (VT.SimpleTy) {
+ case MVT::v4f64:
+ return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v4i64:
+ return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v8f32:
+ return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v8i32:
+ return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v16i16:
+ return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v32i8:
+ return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Not a valid 256-bit x86 vector type!");
+ }
+}
+
+/// \brief Try to lower a 512-bit shuffle of 64-bit elements as a single
+/// shuffle of whole 128-bit lanes (X86ISD::SHUF128).
+///
+/// The mask is first widened to 128-bit granularity. The lowering applies only
+/// when no widened element is a zero sentinel and when, within each half of
+/// the result, every defined lane is taken from the same operand. An empty
+/// SDValue is returned when the shuffle does not fit this form.
+static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
+                                        ArrayRef<int> Mask, SDValue V1,
+                                        SDValue V2, SelectionDAG &DAG) {
+  assert(VT.getScalarSizeInBits() == 64 &&
+         "Unexpected element type size for 128bit shuffle.");
+
+  // Handling 256-bit vectors would require VLX, and lowerV2X128VectorShuffle()
+  // is most probably the better solution for them anyway.
+  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
+
+  SmallVector<int, 4> LaneMask;
+  if (!canWidenShuffleElements(Mask, LaneMask))
+    return SDValue();
+
+  const int NumLanes = LaneMask.size();
+  // Widened indices above this value refer to V2 rather than V1.
+  const int LastV1Lane = VT.getVectorNumElements() / 2 - 1;
+
+  // Ensure that all lanes of each result half come from the same operand.
+  SDValue Sources[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
+  for (int Lane = 0; Lane != NumLanes; ++Lane) {
+    const int Sel = LaneMask[Lane];
+    if (Sel == SM_SentinelZero)
+      return SDValue();
+    if (Sel == SM_SentinelUndef)
+      continue;
+
+    SDValue Src = Sel > LastV1Lane ? V2 : V1;
+    SDValue &Slot = Sources[Lane < NumLanes / 2 ? 0 : 1];
+    if (Slot.isUndef())
+      Slot = Src;
+    else if (Slot != Src)
+      return SDValue();
+  }
+
+  // Form the 128-bit permutation: convert the widened selection values into
+  // the per-lane selection bits of a vshuf64x2 immediate control byte.
+  const unsigned BitsPerLane = NumLanes / 2;
+  unsigned Imm8 = 0;
+  for (int Lane = 0; Lane != NumLanes; ++Lane) {
+    // An undef lane may read anything, so use the first element for it.
+    unsigned Sel = LaneMask[Lane] == SM_SentinelUndef ? 0 : LaneMask[Lane];
+    Imm8 |= (Sel % NumLanes) << (Lane * BitsPerLane);
+  }
+
+  return DAG.getNode(X86ISD::SHUF128, DL, VT, Sources[0], Sources[1],
+                     DAG.getConstant(Imm8, DL, MVT::i8));
+}
+
+/// \brief Handle lowering of 8-lane 64-bit floating point (v8f64) shuffles.
+///
+/// Single-input shuffles are matched against MOVDDUP, an in-lane VPERMILPI and
+/// a 256-bit-repeated VPERMI first. After that the 128-bit lane shuffle, UNPCK
+/// and SHUFPD forms are tried in turn, with a PERMV-based lowering as the
+/// final fallback.
+static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  const MVT VT = MVT::v8f64;
+
+  if (V2.isUndef()) {
+    // Use low duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+      return DAG.getNode(X86ISD::MOVDDUP, DL, VT, V1);
+
+    if (!is128BitLaneCrossingShuffleMask(VT, Mask)) {
+      // Non-lane-crossing single input shuffles can be lowered with an
+      // interleaved permutation. Bit I of the immediate is set exactly when
+      // result element I reads the odd element of its pair, i.e. index I | 1.
+      unsigned PermilImm = 0;
+      for (int I = 0; I != 8; ++I)
+        if (Mask[I] == (I | 1))
+          PermilImm |= 1u << I;
+      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
+                         DAG.getConstant(PermilImm, DL, MVT::i8));
+    }
+
+    // A pattern repeated in both 256-bit halves fits an immediate VPERMI.
+    SmallVector<int, 4> Repeated256Mask;
+    if (is256BitLaneRepeatedShuffleMask(VT, Mask, Repeated256Mask))
+      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
+                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
+  }
+
+  // Try a shuffle of whole 128-bit lanes.
+  if (SDValue Lanes = lowerV4X128VectorShuffle(DL, VT, Mask, V1, V2, DAG))
+    return Lanes;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue Unpack = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
+    return Unpack;
+
+  // Check if the blend happens to exactly fit that of SHUFPD.
+  if (SDValue ShufPD = lowerVectorShuffleWithSHUFPD(DL, VT, Mask, V1, V2, DAG))
+    return ShufPD;
+
+  // Nothing cheaper matched; use the PERMV-based lowering.
+  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 32-bit floating point (v16f32) shuffles.
+///
+/// When the mask repeats in every 128-bit lane this tries, in order,
+/// MOVSLDUP/MOVSHDUP, a single-input VPERMILPI, UNPCK and finally a SHUFPS
+/// sequence. Any other mask is lowered with a PERMV-based shuffle.
+///
+/// \param DL        Debug location attached to the nodes that are created.
+/// \param Mask      The 16-element shuffle mask.
+/// \param V1        First v16f32 shuffle input.
+/// \param V2        Second v16f32 shuffle input (may be undef).
+/// \param Subtarget Currently unused by this routine.
+/// \param DAG       The DAG in which the replacement nodes are built.
+/// \returns the lowered shuffle.
+static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                        SDValue V1, SDValue V2,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+  // If the shuffle mask is repeated in each 128-bit lane, we have many more
+  // options to efficiently lower the shuffle.
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
+    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+
+    // Use even/odd duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
+      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
+    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
+      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
+
+    // A single-input repeated mask fits the immediate form of VPERMILPS.
+    if (V2.isUndef())
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (SDValue Unpck =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+      return Unpck;
+
+    // Otherwise, fall back to a SHUFPS sequence.
+    return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2,
+                                        DAG);
+  }
+
+  // The mask differs between lanes; use the PERMV-based lowering.
+  return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 64-bit integer (v8i64) shuffles.
+static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Shuf128;
+
+ if (V2.isUndef()) {
+ // When the shuffle is mirrored between the 128-bit lanes of the unit, we
+ // can use a lower latency PSHUFD (on the v16i32 bitcast) that operates on
+ // all four 128-bit lanes.
+ SmallVector<int, 2> Repeated128Mask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
+ SmallVector<int, 4> PSHUFDMask;
+ scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
+ return DAG.getBitcast(
+ MVT::v8i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
+ DAG.getBitcast(MVT::v16i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+
+ SmallVector<int, 4> Repeated256Mask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
+ getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use VALIGN (element rotation).
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to use PALIGNR (byte rotation).
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Unpck;
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 32-bit integer (v16i32) shuffles.
+static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // If the shuffle mask is repeated in each 128-bit lane we can use more
+ // efficient instructions that mirror the shuffles across the four 128-bit
+ // lanes. The result of this test is reused for the SHUFPS check below.
+ SmallVector<int, 4> RepeatedMask;
+ bool Is128BitLaneRepeatedShuffle =
+ is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
+ if (Is128BitLaneRepeatedShuffle) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+ return V;
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use VALIGN (element rotation).
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to use byte rotation instructions; only attempted when BWI is present.
+ if (Subtarget.hasBWI())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Assume that a single SHUFPS is faster than using a permv shuffle.
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+ if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+ SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
+ SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
+ CastV1, CastV2, DAG);
+ return DAG.getBitcast(MVT::v16i32, ShufPS);
+ }
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 32-lane 16-bit integer (v32i16) shuffles.
+static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+ assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+ assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (V2.isUndef()) {
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v32 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+ }
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 64-lane 8-bit integer (v64i8) shuffles.
+static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+ assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
+ assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+ DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ return PSHUFB;
+
+ // VBMI can use VPERMV/VPERMV3 byte shuffles.
+ if (Subtarget.hasVBMI())
+ return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+
+ // FIXME: Implement direct support for this type! For now, split the shuffle.
+ return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+}
+
+/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
+///
+/// This routine first tries an element insertion and a broadcast, and then
+/// dispatches on the specific type of the 512-bit x86 vector shuffle; the
+/// per-type routines pick a lowering based on the available instructions.
+static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const SmallBitVector &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX512() &&
+ "Cannot lower 512-bit vectors w/ basic ISA!");
+
+ // If exactly one element comes from V2 and it lands in element zero, try to
+ // insert it into V1 directly if we can do so cheaply.
+ int NumElts = Mask.size();
+ int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
+
+ if (NumV2Elements == 1 && Mask[0] >= NumElts)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+
+ // Check whether this can be lowered as a broadcast of a single element.
+ if (SDValue Broadcast =
+ lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Dispatch to each element type for lowering. If we don't have support for
+ // specific element type shuffles at 512 bits, immediately split them and
+ // lower them. Each lowering routine of a given type is allowed to assume that
+ // the requisite ISA extensions for that element type are available.
+ switch (VT.SimpleTy) {
+ case MVT::v8f64:
+ return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ case MVT::v16f32:
+ return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ case MVT::v8i64:
+ return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v16i32:
+ return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v32i16:
+ return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v64i8:
+ return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Not a valid 512-bit x86 vector type!");
+ }
+}
+
+// Lower vXi1 vector shuffles.
+// There is no dedicated instruction on AVX-512 that shuffles the masks.
+// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
+// vector, shuffle it, and then truncate it back.
+static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX512() &&
+ "Cannot lower 512-bit vectors w/o basic ISA!");
+ MVT ExtVT; // Integer vector type with the same element count used for the shuffle.
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("Expected a vector of i1 elements");
+ case MVT::v2i1:
+ ExtVT = MVT::v2i64;
+ break;
+ case MVT::v4i1:
+ ExtVT = MVT::v4i32;
+ break;
+ case MVT::v8i1:
+ ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
+ break;
+ case MVT::v16i1:
+ ExtVT = MVT::v16i32;
+ break;
+ case MVT::v32i1:
+ ExtVT = MVT::v32i16;
+ break;
+ case MVT::v64i1:
+ ExtVT = MVT::v64i8;
+ break;
+ }
+ // Widen V1: all-zeros/all-ones build vectors become the matching ExtVT constant; anything else is sign-extended.
+ if (ISD::isBuildVectorAllZeros(V1.getNode()))
+ V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+ else if (ISD::isBuildVectorAllOnes(V1.getNode()))
+ V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ else
+ V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
+ // Widen V2 the same way, except that an undef V2 stays undef in the wide type.
+ if (V2.isUndef())
+ V2 = DAG.getUNDEF(ExtVT);
+ else if (ISD::isBuildVectorAllZeros(V2.getNode()))
+ V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+ else if (ISD::isBuildVectorAllOnes(V2.getNode()))
+ V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ else
+ V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
+ // Shuffle in the widened type, reusing the original mask unchanged.
+ SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
+ // i1 was sign extended, so we can use X86ISD::CVT2MASK.
+ int NumElems = VT.getVectorNumElements();
+ if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
+ (Subtarget.hasDQI() && (NumElems < 32)))
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
+ // Without the matching feature (BWI for >= 32 elements, DQI otherwise), truncate back to VT.
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
+}
+
+/// Helper function that returns true if the shuffle mask should be
+/// commuted to improve canonicalization.
+static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
+ int NumElements = Mask.size();
+ // Classify every mask entry: negative (sentinel), V1 index (< NumElements), or V2 index.
+ int NumV1Elements = 0, NumV2Elements = 0, NumSentinelElements = 0;
+ for (int M : Mask)
+ if (M < 0)
+ ++NumSentinelElements;
+ else if (M < NumElements)
+ ++NumV1Elements;
+ else
+ ++NumV2Elements;
+
+ // Commute the shuffle as needed such that more elements come from V1 than
+ // V2. This allows us to match the shuffle pattern strictly on how many
+ // elements come from V1 without handling the symmetric cases.
+ if (NumV2Elements > NumV1Elements)
+ return true;
+ // From here on V1 supplies at least as many elements as V2.
+ assert(NumV1Elements > 0 && "No V1 indices");
+ // A mask that never reads V2 needs no commute.
+ if (NumV2Elements == 0)
+ return false;
+
+ // When the number of V1 and V2 elements are the same, try to minimize the
+ // number of uses of V2 in the low half of the vector. When that is tied,
+ // ensure that the sum of positions taken from V1 is equal to or lower than
+ // the sum of positions taken from V2. When those are equal, try to ensure that
+ // the number of odd positions for V1 is lower than the number for V2.
+ if (NumV1Elements == NumV2Elements) {
+ int LowV1Elements = 0, LowV2Elements = 0;
+ for (int M : Mask.slice(0, NumElements / 2))
+ if (M >= NumElements)
+ ++LowV2Elements;
+ else if (M >= 0)
+ ++LowV1Elements;
+ if (LowV2Elements > LowV1Elements)
+ return true;
+ if (LowV2Elements == LowV1Elements) {
+ int SumV1Indices = 0, SumV2Indices = 0; // Sums of mask positions i, not of mask values.
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
+ SumV2Indices += i;
+ else if (Mask[i] >= 0)
+ SumV1Indices += i;
+ if (SumV2Indices < SumV1Indices)
+ return true;
+ if (SumV2Indices == SumV1Indices) {
+ int NumV1OddIndices = 0, NumV2OddIndices = 0; // Counts of odd positions i per source.
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
+ NumV2OddIndices += i % 2;
+ else if (Mask[i] >= 0)
+ NumV1OddIndices += i % 2;
+ if (NumV2OddIndices < NumV1OddIndices)
+ return true;
+ }
+ }
+ }
+ // No tie-breaker asked for a commute.
+ return false;
+}
+
+/// \brief Top-level lowering for x86 vector shuffles.
+///
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// s.t. only one of the two inputs needs to be tested, etc.
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ MVT VT = Op.getSimpleValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDLoc DL(Op);
+ bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
+ // A 64-bit type is only accepted here when its elements are i1.
+ assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
+ "Can't lower MMX shuffles");
+ // A shuffle of two undef inputs is itself undef.
+ bool V1IsUndef = V1.isUndef();
+ bool V2IsUndef = V2.isUndef();
+ if (V1IsUndef && V2IsUndef)
+ return DAG.getUNDEF(VT);
+
+ // When we create a shuffle node we put the UNDEF node to second operand,
+ // but in some cases the first operand may be transformed to UNDEF.
+ // In this case we should just commute the node.
+ if (V1IsUndef)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+
+ // Check for non-undef masks pointing at an undef vector and make the masks
+ // undef as well. This makes it easier to match the shuffle based solely on
+ // the mask.
+ if (V2IsUndef)
+ for (int M : Mask)
+ if (M >= NumElements) {
+ SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ for (int &M : NewMask)
+ if (M >= NumElements)
+ M = -1;
+ return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+ }
+
+ // Check for illegal shuffle mask element index values. MaskUpperLimit is only read by the assert below.
+ int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
+ assert(llvm::all_of(Mask,
+ [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
+ "Out of bounds shuffle index");
+
+ // We actually see shuffles that are entirely re-arrangements of a set of
+ // zero inputs. This mostly happens while decomposing complex shuffles into
+ // simple ones. Directly lower these as a buildvector of zeros.
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ if (Zeroable.all())
+ return getZeroVector(VT, Subtarget, DAG, DL);
+
+ // Try to collapse shuffles into using a vector type with fewer elements but
+ // wider element types. We cap this to not form integers or floating point
+ // elements wider than 64 bits, but it might be interesting to form i128
+ // integers to handle flipping the low and high halves of AVX 256-bit vectors.
+ SmallVector<int, 16> WidenedMask;
+ if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
+ canWidenShuffleElements(Mask, WidenedMask)) {
+ MVT NewEltVT = VT.isFloatingPoint()
+ ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
+ : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
+ MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+ // Make sure that the new vector type is legal. For example, v2f64 isn't
+ // legal on SSE1.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+ V1 = DAG.getBitcast(NewVT, V1);
+ V2 = DAG.getBitcast(NewVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
+ }
+ }
+
+ // Commute the shuffle if it will improve canonicalization.
+ if (canonicalizeShuffleMaskWithCommute(Mask))
+ return DAG.getCommutedVectorShuffle(*SVOp);
+
+ // For each vector width, delegate to a specialized lowering routine.
+ if (VT.is128BitVector())
+ return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+ DAG);
+
+ if (VT.is256BitVector())
+ return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+ DAG);
+
+ if (VT.is512BitVector())
+ return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+ DAG);
+ // i1 vectors that matched none of the width checks above go to the mask-shuffle path.
+ if (Is1BitVector)
+ return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
+ // Every supported type has returned by now.
+ llvm_unreachable("Unimplemented!");
+}
+
+/// \brief Try to lower a VSELECT instruction to a vector shuffle. Returns a null SDValue when the condition is not a build vector of constants.
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Cond = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ // Only a condition that is a build vector of constant nodes can become a shuffle mask.
+ if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+ auto *CondBV = cast<BuildVectorSDNode>(Cond);
+
+ // Only non-legal VSELECTs reach this lowering, convert those into generic
+ // shuffles and re-use the shuffle lowering path for blends.
+ SmallVector<int, 32> Mask; // Lane i: i (LHS), i + Size (RHS), or -1.
+ for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
+ SDValue CondElt = CondBV->getOperand(i);
+ Mask.push_back( // RHS is chosen when isNullConstant(CondElt) holds; a non-ConstantSDNode element gives -1.
+ isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
+ : -1);
+ }
+ return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { // Returns Op, a replacement shuffle, or a null SDValue (see the comments below).
+ // A vselect where all conditions and data are constants can be optimized into
+ // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
+ return SDValue();
+
+ // Try to lower this to a blend-style vector shuffle. This can handle all
+ // constant condition cases.
+ if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
+ return BlendOp;
+
+ // Variable blends are only legal from SSE4.1 onward.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ // Only some types will be legal on some subtargets. If we can emit a legal
+ // VSELECT-matching blend, return Op, but if we need to expand, return
+ // a null value.
+ switch (Op.getSimpleValueType().SimpleTy) {
+ default:
+ // Most of the vector types have blends past SSE4.1.
+ return Op;
+
+ case MVT::v32i8:
+ // The byte blends for AVX vectors were introduced only in AVX2.
+ if (Subtarget.hasAVX2())
+ return Op;
+ // No AVX2: request expansion.
+ return SDValue();
+
+ case MVT::v8i16:
+ case MVT::v16i16:
+ // AVX-512 BWI and VLX features support VSELECT with i16 elements.
+ if (Subtarget.hasBWI() && Subtarget.hasVLX())
+ return Op;
+
+ // FIXME: We should custom lower this by fixing the condition and using i8
+ // blends.
+ return SDValue();
+ }
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { // Returns a null SDValue when no case below applies.
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ // This helper only handles extraction from 128-bit vectors.
+ if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
+ return SDValue();
+ // 8-bit result: PEXTRB yields an i32, which is marked zero-extended from VT and then truncated.
+ if (VT.getSizeInBits() == 8) {
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ }
+
+ if (VT == MVT::f32) {
+ // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+ // the result back to FR32 register. It's only worth matching if the
+ // result has a single use which is a store or a bitcast to i32. And in
+ // the case of a store, it's not worth it if the index is a constant 0,
+ // because a MOVSSmr can be used instead, which is smaller and faster.
+ if (!Op.hasOneUse())
+ return SDValue();
+ SDNode *User = *Op.getNode()->use_begin();
+ if ((User->getOpcode() != ISD::STORE ||
+ isNullConstant(Op.getOperand(1))) &&
+ (User->getOpcode() != ISD::BITCAST ||
+ User->getValueType(0) != MVT::i32))
+ return SDValue();
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, // Extract as i32 from the v4i32 view, then bitcast back to f32.
+ DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+ Op.getOperand(1));
+ return DAG.getBitcast(MVT::f32, Extract);
+ }
+
+ if (VT == MVT::i32 || VT == MVT::i64) {
+ // ExtractPS/pextrq works with constant index.
+ if (isa<ConstantSDNode>(Op.getOperand(1)))
+ return Op;
+ }
+ // Nothing matched.
+ return SDValue();
+}
+
+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDLoc dl(Vec);
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+ MVT EltVT = Op.getSimpleValueType();
+ // Only i1 results are handled; vectors with more than 16 elements require BWI.
+ assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
+ "Unexpected vector type in ExtractBitFromMaskVector");
+
+ // A variable index can't be handled in mask registers, so extend the vector to VR512.
+ // NOTE(review): ExtVT is v16i32 for every type other than v8i1; confirm other element counts never reach this path.
+ if (!isa<ConstantSDNode>(Idx)) {
+ MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ ExtVT.getVectorElementType(), Ext, Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ }
+ // Constant index: move the requested bit to the top position, shift it down to position 0, then extract element 0.
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
+ (VecVT.getVectorNumElements() < 8)) {
+ // Use kshiftlw/rw instruction. The vector is first widened to v16i1 by inserting it at index 0 of an undef vector.
+ VecVT = MVT::v16i1;
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
+ DAG.getUNDEF(VecVT),
+ Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ unsigned MaxSift = VecVT.getVectorNumElements() - 1; // Index of the last element (the maximum shift amount).
+ if (MaxSift - IdxVal) // Skip the left shift when the bit is already in the last position.
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift, dl, MVT::i8));
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const { // Returns a null SDValue when none of the cases below apply.
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+ // i1 results are extracted from mask vectors and take a separate path.
+ if (Op.getSimpleValueType() == MVT::i1)
+ return ExtractBitFromMaskVector(Op, DAG);
+ // Variable index: handled only for 512-bit vectors, or 256-bit vectors of 32-bit elements when hasInt256().
+ if (!isa<ConstantSDNode>(Idx)) {
+ if (VecVT.is512BitVector() ||
+ (VecVT.is256BitVector() && Subtarget.hasInt256() &&
+ VecVT.getScalarSizeInBits() == 32)) {
+ // Build an integer vector holding Idx in lane 0 (zeros elsewhere), permute Vec by it, then read lane 0.
+ MVT MaskEltVT =
+ MVT::getIntegerVT(VecVT.getScalarSizeInBits());
+ MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+ MaskEltVT.getSizeInBits());
+
+ Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+ getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
+ DAG.getConstant(0, dl, PtrVT));
+ SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
+ DAG.getConstant(0, dl, PtrVT));
+ }
+ return SDValue();
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+ // If this is a 256-bit or 512-bit vector, first extract the 128-bit vector
+ // and then extract the element from the 128-bit vector.
+ if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
+ // Get the 128-bit vector.
+ Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
+ MVT EltVT = VecVT.getVectorElementType();
+
+ unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+ // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
+ // this can be done with a mask.
+ IdxVal &= ElemsPerChunk - 1;
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+ DAG.getConstant(IdxVal, dl, MVT::i32));
+ }
+
+ assert(VecVT.is128BitVector() && "Unexpected vector length");
+
+ MVT VT = Op.getSimpleValueType();
+
+ if (VT.getSizeInBits() == 16) {
+ // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
+ // we're going to zero extend the register or fold the store (SSE41 only).
+ if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
+ !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec), Idx));
+
+ // Transform it so it matches pextrw, which produces a 32-bit result.
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ }
+ // With SSE4.1, try the dedicated helper first and fall through if it returns null.
+ if (Subtarget.hasSSE41())
+ if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
+ return Res;
+
+ // TODO: handle v16i8.
+
+ if (VT.getSizeInBits() == 32) {
+ if (IdxVal == 0)
+ return Op;
+
+ // SHUFPS the element to the lowest double word, then movss.
+ int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
+ Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ if (VT.getSizeInBits() == 64) {
+ // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+ // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+ // to match extract_elt for f64.
+ if (IdxVal == 0)
+ return Op;
+
+ // UNPCKHPD the element to the lowest double word, then movsd.
+ // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+ // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+ int Mask[2] = { 1, -1 };
+ Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ // Remaining element sizes are not handled here.
+ return SDValue();
+}
+
+/// Insert one bit to mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Elt = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (!isa<ConstantSDNode>(Idx)) {
+ // Non constant index. Extend source and destination,
+ // insert element and then truncate the result. NOTE(review): any type other than v8i1 is widened to v16i32; confirm other element counts never reach here.
+ MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
+ SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+ unsigned NumElems = VecVT.getVectorNumElements();
+ // Inserting into undef: shift the new bit into position (no shift for index 0) and return it.
+ if(Vec.isUndef()) {
+ if (IdxVal)
+ EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ return EltInVec;
+ }
+
+ // Insertion of one bit into first or last position
+ // can be done with two SHIFTs + OR.
+ if (IdxVal == 0 ) {
+ // EltInVec already at correct index and other bits are 0.
+ // Clean the first bit in source vector.
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ DAG.getConstant(1 , dl, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(1, dl, MVT::i8));
+
+ return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+ }
+ if (IdxVal == NumElems -1) {
+ // Move the bit to the last position inside the vector.
+ EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ // Clean the last bit in the source vector.
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(1, dl, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ DAG.getConstant(1 , dl, MVT::i8));
+
+ return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+ }
+
+ // Use shuffle to insert element: lane IdxVal reads element 0 of EltInVec (mask value NumElems), every other lane keeps Vec.
+ SmallVector<int, 64> MaskVec(NumElems);
+ for (unsigned i = 0; i != NumElems; ++i)
+ MaskVec[i] = (i == IdxVal) ? NumElems : i;
+
+ return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
+}
+
+SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const { // Returns a null SDValue when none of the cases below apply.
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ // i1 element vectors are handled by InsertBitToMaskVector.
+ if (EltVT == MVT::i1)
+ return InsertBitToMaskVector(Op, DAG);
+
+ SDLoc dl(Op);
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+ if (!isa<ConstantSDNode>(N2)) // Variable insertion indices are not handled here.
+ return SDValue();
+ auto *N2C = cast<ConstantSDNode>(N2);
+ unsigned IdxVal = N2C->getZExtValue();
+
+ // If we are clearing out an element, we do this more efficiently with a
+ // blend shuffle than a costly integer insertion.
+ // TODO: would other rematerializable values (e.g. allbits) benefit as well?
+ // TODO: pre-SSE41 targets will tend to use bit masking - this could still
+ // be beneficial if we are inserting several zeros and can combine the masks.
+ if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
+ SmallVector<int, 8> ClearMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ ClearMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+ }
+
+ // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
+ // into that, and then insert the subvector back into the result.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ // With a 256-bit vector, we can insert into the zero element efficiently
+ // using a blend if we have AVX or AVX2 and the right data type.
+ if (VT.is256BitVector() && IdxVal == 0) {
+ // TODO: It is worthwhile to cast integer to floating point and back
+ // and incur a domain crossing penalty if that's what we'll end up
+ // doing anyway after extracting to a 128-bit vector.
+ if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+ (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
+ SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ N2 = DAG.getIntPtrConstant(1, dl);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
+ }
+ }
+
+ // Get the desired 128-bit vector chunk.
+ SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
+
+ // Insert the element into the desired chunk.
+ unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
+ assert(isPowerOf2_32(NumEltsIn128));
+ // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
+ unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
+
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+ DAG.getConstant(IdxIn128, dl, MVT::i32));
+
+ // Insert the changed part back into the bigger vector
+ return insert128BitVector(N0, V, IdxVal, DAG, dl);
+ }
+ assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
+
+ if (Subtarget.hasSSE41()) {
+ if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
+ unsigned Opc;
+ if (VT == MVT::v8i16) {
+ Opc = X86ISD::PINSRW;
+ } else {
+ assert(VT == MVT::v16i8);
+ Opc = X86ISD::PINSRB;
+ }
+
+ // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
+ // argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ }
+
+ if (EltVT == MVT::f32) {
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into
+ // these bits. For example (insert (extract, 3), 2) could be matched by
+ // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // combine either bitwise AND or insert of float 0.0 to set these bits.
+
+ bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+ if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ // If this is an insertion of 32-bits into the low 32-bits of
+ // a vector, we prefer to generate a blend with immediate rather
+ // than an insertps. Blends are simpler operations in hardware and so
+ // will always have equal or better performance than insertps.
+ // But if optimizing for size and there's a load folding opportunity,
+ // generate insertps because blendps does not have a 32-bit memory
+ // operand form.
+ N2 = DAG.getIntPtrConstant(1, dl);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ }
+ N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Destination select lives in bits [5:4] (see above).
+ // Create this as a scalar to vector.
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+ }
+
+ if (EltVT == MVT::i32 || EltVT == MVT::i64) {
+ // PINSR* works with constant index.
+ return Op;
+ }
+ }
+ // i8 elements only get here without SSE4.1 (the SSE4.1 path above returns for 8-bit elements).
+ if (EltVT == MVT::i8)
+ return SDValue();
+
+ if (EltVT.getSizeInBits() == 16) {
+ // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
+ // as its second argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+ }
+ return SDValue();
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT OpVT = Op.getSimpleValueType();
+
+ // If the result is not a 128-bit vector (e.g. 256-bit), first insert into a
+ // 128-bit vector and then insert that into the wider vector.
+ if (!OpVT.is128BitVector()) {
+ // Insert into a 128-bit vector.
+ unsigned SizeFactor = OpVT.getSizeInBits()/128;
+ MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
+ OpVT.getVectorNumElements() / SizeFactor);
+
+ Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
+
+ // Insert the 128-bit vector at index 0 of an undef vector of the result type.
+ return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+ }
+ // 128-bit result: any-extend the scalar to i32, build a v4i32 from it, and bitcast to OpVT.
+ SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+ return DAG.getBitcast(
+ OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
+}
+
+// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
+// a simple subregister reference or explicit instructions to grab
+// upper bits of a vector.
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
+ // The index operand must be a ConstantSDNode (cast<> below).
+ SDLoc dl(Op);
+ SDValue In = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT ResVT = Op.getSimpleValueType();
+
+ assert((In.getSimpleValueType().is256BitVector() ||
+ In.getSimpleValueType().is512BitVector()) &&
+ "Can only extract from 256-bit or 512-bit vectors");
+ // Dispatch on the width of the requested result.
+ if (ResVT.is128BitVector())
+ return extract128BitVector(In, IdxVal, DAG, dl);
+ if (ResVT.is256BitVector())
+ return extract256BitVector(In, IdxVal, DAG, dl);
+ // Only 128-bit and 256-bit results are supported.
+ llvm_unreachable("Unimplemented!");
+}
+
+static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) { // True iff every user of N is the node of some value in ValidUsers.
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
+ if (llvm::all_of(ValidUsers,
+ [&I](SDValue V) { return V.getNode() != *I; }))
+ return false; // This user matches none of ValidUsers.
+ return true;
+}
+
+// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
+// simple superregister reference or explicit instructions to insert
+// the upper bits of a vector.
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
+
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ // The index operand must be a ConstantSDNode (cast<> below).
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = Op.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+ // i1 element vectors are delegated to insert1BitVector.
+ if (OpVT.getVectorElementType() == MVT::i1)
+ return insert1BitVector(Op, DAG, Subtarget);
+
+ assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+ "Can only insert into 256-bit or 512-bit vectors");
+
+ // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+ // load:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr + 16), Elts/2)
+ // --> load32 addr
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr + 32), Elts/2)
+ // --> load64 addr
+ // or a 16-byte or 32-byte broadcast:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr), Elts/2)
+ // --> X86SubVBroadcast(load16 addr)
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr), Elts/2)
+ // --> X86SubVBroadcast(load32 addr)
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
+ auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+ if (Idx2 && Idx2->getZExtValue() == 0) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
+ bool Fast;
+ unsigned Alignment = FirstLd->getAlignment();
+ unsigned AS = FirstLd->getAddressSpace();
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ OpVT, AS, Alignment, &Fast) && Fast) {
+ SDValue Ops[] = {SubVec2, SubVec};
+ if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+ return Ld;
+ }
+ }
+ // If lower/upper loads are the same and the only users of the load, then
+ // lower to a VBROADCASTF128/VBROADCASTI128/etc.
+ if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
+ if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
+ areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
+ }
+ }
+ // If this is subv_broadcast insert into both halves, use a larger
+ // subv_broadcast.
+ if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
+ SubVec.getOperand(0));
+ }
+ }
+ }
+ // No fold applied: do a plain insert chosen by the width of the subvector.
+ if (SubVecVT.is128BitVector())
+ return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+ if (SubVecVT.is256BitVector())
+ return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ // Only 128-bit and 256-bit subvectors are supported.
+ llvm_unreachable("Unimplemented!");
+}
+
+// Returns the appropriate wrapper opcode for a global reference. GV may be null, in which case the absolute-symbol check is skipped.
+unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
+ // References to absolute symbols are never PC-relative.
+ if (GV && GV->isAbsoluteSymbolRef())
+ return X86ISD::Wrapper;
+ // WrapperRIP is used only with RIP-relative PIC style and the small or kernel code model.
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+ if (Subtarget.isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ return X86ISD::WrapperRIP;
+ // Everything else uses the plain wrapper.
+ return X86ISD::Wrapper;
+}
+
+ // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol nodes are lowered
+ // to their target counterpart wrapped in an X86ISD::Wrapper node. The wrapping
+ // is required because Select(N) would otherwise simply return N for such a
+ // node, which means the raw TargetGlobalAddress (etc.) nodes can only be used
+ // to form addressing modes. The wrapped nodes are selected into MOV32ri.
+ SDValue
+ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+   ConstantPoolSDNode *PoolNode = cast<ConstantPoolSDNode>(Op);
+   const auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+   // In PIC mode (RIP-relative PIC excepted) the reference is an offset from
+   // the global base register; a non-zero flag selects that form.
+   const unsigned char RefFlag = Subtarget.classifyLocalReference(nullptr);
+
+   SDValue TargetCP = DAG.getTargetConstantPool(
+       PoolNode->getConstVal(), PtrVT, PoolNode->getAlignment(),
+       PoolNode->getOffset(), RefFlag);
+   SDLoc DL(PoolNode);
+   SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetCP);
+   if (!RefFlag)
+     return Wrapped;
+
+   // With PIC, the address is actually $g + Offset.
+   SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
+   return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Wrapped);
+ }
+
+ /// Lowers a JumpTable node to a wrapped TargetJumpTable, adding the global
+ /// base register when the local reference is classified with a non-zero flag.
+ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+   JumpTableSDNode *TableNode = cast<JumpTableSDNode>(Op);
+   const auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+   // In PIC mode (RIP-relative PIC excepted) the reference is an offset from
+   // the global base register; a non-zero flag selects that form.
+   const unsigned char RefFlag = Subtarget.classifyLocalReference(nullptr);
+
+   SDValue TargetJT =
+       DAG.getTargetJumpTable(TableNode->getIndex(), PtrVT, RefFlag);
+   SDLoc DL(TableNode);
+   SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetJT);
+   if (!RefFlag)
+     return Wrapped;
+
+   // With PIC, the address is actually $g + Offset.
+   SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
+   return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Wrapped);
+ }
+
+ /// Lowers an ExternalSymbol node to a wrapped TargetExternalSymbol, adding the
+ /// PIC base in 32-bit PIC mode and a GOT load when the reference goes through
+ /// a stub.
+ SDValue
+ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
+   MachineFunction &MF = DAG.getMachineFunction();
+   const auto PtrVT = getPointerTy(DAG.getDataLayout());
+   const char *SymName = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+
+   // There is no GlobalValue behind an external symbol, so the reference is
+   // classified with a null global against the enclosing module.
+   const unsigned char RefFlag = Subtarget.classifyGlobalReference(
+       nullptr, *MF.getFunction()->getParent());
+
+   SDValue Addr = DAG.getTargetExternalSymbol(SymName, PtrVT, RefFlag);
+   SDLoc DL(Op);
+   Addr = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Addr);
+
+   // With PIC, the address is actually $g + Offset.
+   const bool NeedsPICBase = isPositionIndependent() && !Subtarget.is64Bit();
+   if (NeedsPICBase) {
+     SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
+     Addr = DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Addr);
+   }
+
+   // Symbols whose address lives in a stub need a load to fetch it.
+   if (isGlobalStubReference(RefFlag))
+     Addr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr,
+                        MachinePointerInfo::getGOT(MF));
+
+   return Addr;
+ }
+
+ /// Lowers a BlockAddress node to a wrapped TargetBlockAddress, adding the
+ /// global base register when the reference is relative to the PIC base.
+ SDValue
+ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+   auto *BANode = cast<BlockAddressSDNode>(Op);
+   const unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
+   SDLoc dl(Op);
+   const auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+   // Create the TargetBlockAddress node and wrap it.
+   SDValue Addr = DAG.getTargetBlockAddress(BANode->getBlockAddress(), PtrVT,
+                                            BANode->getOffset(), OpFlags);
+   Addr = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Addr);
+   if (!isGlobalRelativeToPICBase(OpFlags))
+     return Addr;
+
+   // With PIC, the address is actually $g + Offset.
+   SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
+   return DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Addr);
+ }
+
+ /// Builds the address of \p GV plus \p Offset: a wrapped TargetGlobalAddress,
+ /// optionally rebased on the PIC base register, optionally loaded from a
+ /// stub, and finally adjusted by any offset that could not be folded.
+ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
+                                               const SDLoc &dl, int64_t Offset,
+                                               SelectionDAG &DAG) const {
+   const unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
+   const auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+   // The constant offset is folded into the TargetGlobalAddress only for a
+   // direct static reference and only if the code model can encode it.
+   const bool FoldOffset =
+       OpFlags == X86II::MO_NO_FLAG &&
+       X86::isOffsetSuitableForCodeModel(Offset,
+                                         DAG.getTarget().getCodeModel());
+   const int64_t Unfolded = FoldOffset ? 0 : Offset;
+
+   SDValue Addr = FoldOffset
+                      ? DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset)
+                      : DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
+   Addr = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Addr);
+
+   // With PIC, the address is actually $g + Offset.
+   if (isGlobalRelativeToPICBase(OpFlags)) {
+     SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
+     Addr = DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Addr);
+   }
+
+   // Globals whose address lives in a stub need a load to fetch it.
+   if (isGlobalStubReference(OpFlags))
+     Addr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr,
+                        MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+
+   // Any non-zero offset that was not folded becomes an explicit addition.
+   if (Unfolded != 0) {
+     SDValue OffsetNode = DAG.getConstant(Unfolded, dl, PtrVT);
+     Addr = DAG.getNode(ISD::ADD, dl, PtrVT, Addr, OffsetNode);
+   }
+
+   return Addr;
+ }
+
+ /// Node-based entry point: unpacks the global and offset from the
+ /// GlobalAddress node and defers to the GlobalValue-based overload.
+ SDValue
+ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+   auto *GANode = cast<GlobalAddressSDNode>(Op);
+   return LowerGlobalAddress(GANode->getGlobal(), SDLoc(Op),
+                             GANode->getOffset(), DAG);
+ }
+
+ /// Emits a TLSADDR (or TLSBASEADDR when \p LocalDynamic is set) node for
+ /// \p GA on \p Chain, optionally glued to \p InFlag, and returns the value
+ /// copied out of \p ReturnReg afterwards.
+ static SDValue
+ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+            unsigned char OperandFlags, bool LocalDynamic = false) {
+   MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
+   SDVTList ResultTys = DAG.getVTList(MVT::Other, MVT::Glue);
+   SDLoc dl(GA);
+   SDValue TGA =
+       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                  GA->getOffset(), OperandFlags);
+
+   const unsigned Opcode =
+       LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR;
+
+   // The incoming glue, when present, is appended as the last operand.
+   SmallVector<SDValue, 3> Operands;
+   Operands.push_back(Chain);
+   Operands.push_back(TGA);
+   if (InFlag)
+     Operands.push_back(*InFlag);
+   Chain = DAG.getNode(Opcode, dl, ResultTys, Operands);
+
+   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
+   FrameInfo.setAdjustsStack(true);
+   FrameInfo.setHasCalls(true);
+
+   SDValue OutGlue = Chain.getValue(1);
+   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, OutGlue);
+ }
+
+ // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
+ // The global base register is copied into EBX before the TLSADDR node, and
+ // the result is read from EAX.
+ static SDValue
+ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                 const EVT PtrVT) {
+   SDLoc dl(GA); // ? function entry point might be better
+   SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
+   SDValue Chain =
+       DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, PICBase, SDValue());
+   SDValue Glue = Chain.getValue(1);
+
+   return GetTLSADDR(DAG, Chain, GA, &Glue, PtrVT, X86::EAX, X86II::MO_TLSGD);
+ }
+
+ // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
+ // No glue is passed in; the result is read from RAX.
+ static SDValue
+ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                 const EVT PtrVT) {
+   SDValue EntryChain = DAG.getEntryNode();
+   return GetTLSADDR(DAG, EntryChain, GA, /*InFlag=*/nullptr, PtrVT, X86::RAX,
+                     X86II::MO_TLSGD);
+ }
+
+ /// Lower ISD::GlobalTLSAddress using the "local dynamic" model: obtain the
+ /// base of the module's TLS block and add the variable's x@dtpoff offset.
+ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+                                            SelectionDAG &DAG,
+                                            const EVT PtrVT,
+                                            bool is64Bit) {
+   SDLoc dl(GA);
+
+   // Count this access on the function's X86MachineFunctionInfo.
+   DAG.getMachineFunction()
+       .getInfo<X86MachineFunctionInfo>()
+       ->incNumLocalDynamicTLSAccesses();
+
+   // Get the start address of the TLS block for this module.
+   SDValue Base;
+   if (!is64Bit) {
+     // 32-bit: the global base register goes into EBX first.
+     SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
+     SDValue Chain =
+         DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, PICBase, SDValue());
+     SDValue Glue = Chain.getValue(1);
+     Base = GetTLSADDR(DAG, Chain, GA, &Glue, PtrVT, X86::EAX,
+                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+   } else {
+     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
+                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
+   }
+
+   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+   // of Base.
+
+   // Build x@dtpoff.
+   SDValue TGA =
+       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                  GA->getOffset(), X86II::MO_DTPOFF);
+   SDValue DtpOff = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
+
+   // Add x@dtpoff with the base.
+   return DAG.getNode(ISD::ADD, dl, PtrVT, DtpOff, Base);
+ }
+
+ // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
+ // The result is the thread pointer plus the variable's offset; for initial
+ // exec the offset itself is first loaded from the GOT.
+ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                    const EVT PtrVT, TLSModel::Model model,
+                                    bool is64Bit, bool isPIC) {
+   SDLoc dl(GA);
+
+   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+   const unsigned SegmentAS = is64Bit ? 257 : 256;
+   Value *Ptr = Constant::getNullValue(
+       Type::getInt8PtrTy(*DAG.getContext(), SegmentAS));
+
+   SDValue ThreadPointer =
+       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
+                   MachinePointerInfo(Ptr));
+
+   // Most TLS accesses are not RIP relative, even on x86-64. One exception is
+   // initialexec.
+   unsigned char OperandFlags = 0;
+   unsigned WrapperKind = X86ISD::Wrapper;
+   switch (model) {
+   case TLSModel::LocalExec:
+     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+     break;
+   case TLSModel::InitialExec:
+     if (is64Bit) {
+       OperandFlags = X86II::MO_GOTTPOFF;
+       WrapperKind = X86ISD::WrapperRIP;
+     } else if (isPIC) {
+       OperandFlags = X86II::MO_GOTNTPOFF;
+     } else {
+       OperandFlags = X86II::MO_INDNTPOFF;
+     }
+     break;
+   case TLSModel::GeneralDynamic:
+   case TLSModel::LocalDynamic:
+     llvm_unreachable("Unexpected model");
+   }
+
+   // emit "addl x@ntpoff,%eax" (local exec)
+   // or "addl x@indntpoff,%eax" (initial exec)
+   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
+   SDValue TGA =
+       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                  GA->getOffset(), OperandFlags);
+   SDValue VarOffset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+   if (model == TLSModel::InitialExec) {
+     // 32-bit PIC addresses the GOT slot relative to the global base register.
+     if (isPIC && !is64Bit) {
+       SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
+       VarOffset = DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, VarOffset);
+     }
+
+     VarOffset =
+         DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), VarOffset,
+                     MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+   }
+
+   // The address of the thread local variable is the add of the thread
+   // pointer with the offset of the variable.
+   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, VarOffset);
+ }
+
+ /// Lowers a GlobalTLSAddress node. Dispatch order: emulated TLS (when the
+ /// target option is set), then ELF (one helper per TLS model), then Darwin
+ /// (a TLSCALL bracketed by a call sequence), then the Windows targets
+ /// (implicit TLS through the per-thread TLS array). Any other target is
+ /// unreachable.
+ SDValue
+ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+
+ const GlobalValue *GV = GA->getGlobal();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ bool PositionIndependent = isPositionIndependent();
+
+ if (Subtarget.isTargetELF()) {
+ // The target machine picks the TLS model for this global; each model has
+ // its own lowering helper.
+ TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ if (Subtarget.is64Bit())
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
+ case TLSModel::LocalDynamic:
+ return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
+ Subtarget.is64Bit());
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
+ PositionIndependent);
+ }
+ llvm_unreachable("Unknown TLS model.");
+ }
+
+ if (Subtarget.isTargetDarwin()) {
+ // Darwin only has one model of TLS. Lower to that.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
+ X86ISD::WrapperRIP : X86ISD::Wrapper;
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
+ if (PIC32)
+ OpFlag = X86II::MO_TLVP_PIC_BASE;
+ else
+ OpFlag = X86II::MO_TLVP;
+ SDLoc DL(Op);
+ SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+ GA->getValueType(0),
+ GA->getOffset(), OpFlag);
+ SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+
+ // With PIC32, the address is actually $g + Offset.
+ if (PIC32)
+ Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+ Offset);
+
+ // Lowering the machine isd will make sure everything is in the right
+ // location.
+ // The TLSCALL is bracketed by a zero-sized CALLSEQ_START/CALLSEQ_END pair;
+ // the glue result of TLSCALL is fed into CALLSEQ_END.
+ SDValue Chain = DAG.getEntryNode();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+ SDValue Args[] = { Chain, Offset };
+ Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+ DAG.getIntPtrConstant(0, DL, true),
+ Chain.getValue(1), DL);
+
+ // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setAdjustsStack(true);
+
+ // And our return value (tls address) is in the standard call return value
+ // location.
+ unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
+ }
+
+ if (Subtarget.isTargetKnownWindowsMSVC() ||
+ Subtarget.isTargetWindowsItanium() ||
+ Subtarget.isTargetWindowsGNU()) {
+ // Just use the implicit TLS architecture
+ // Need to generate something similar to:
+ // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+ // ; from TEB
+ // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
+ // mov rcx, qword [rdx+rcx*8]
+ // mov eax, .tls$:tlsvar
+ // [rax+rcx] contains the address
+ // Windows 64bit: gs:0x58
+ // Windows 32bit: fs:__tls_array
+
+ SDLoc dl(GA);
+ SDValue Chain = DAG.getEntryNode();
+
+ // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+ // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+ // use its literal value of 0x2C.
+ Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
+ ? Type::getInt8PtrTy(*DAG.getContext(),
+ 256)
+ : Type::getInt32PtrTy(*DAG.getContext(),
+ 257));
+
+ SDValue TlsArray = Subtarget.is64Bit()
+ ? DAG.getIntPtrConstant(0x58, dl)
+ : (Subtarget.isTargetWindowsGNU()
+ ? DAG.getIntPtrConstant(0x2C, dl)
+ : DAG.getExternalSymbol("_tls_array", PtrVT));
+
+ SDValue ThreadPointer =
+ DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
+
+ SDValue res;
+ if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
+ // Local-exec globals skip the _tls_index lookup and use the first entry
+ // of the TLS array directly.
+ res = ThreadPointer;
+ } else {
+ // Load the _tls_index variable
+ SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
+ if (Subtarget.is64Bit())
+ IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
+ MachinePointerInfo(), MVT::i32);
+ else
+ IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
+
+ // Scale the index by the pointer size (as a shift amount) to get the
+ // byte offset of this module's slot in the TLS array.
+ auto &DL = DAG.getDataLayout();
+ SDValue Scale =
+ DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
+ IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
+
+ res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
+ }
+
+ // Load the pointer stored in the selected TLS array slot.
+ res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
+
+ // Get the offset of start of .tls section
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), X86II::MO_SECREL);
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
+ }
+
+ llvm_unreachable("TLS not implemented for this target.");
+ }
+
+ /// Lower SHL_PARTS/SRA_PARTS/SRL_PARTS: each takes a value split into a low
+ /// and a high part plus a shift amount, and produces the shifted low and high
+ /// parts as a merged pair.
+ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
+   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+   SDLoc dl(Op);
+   const MVT VT = Op.getSimpleValueType();
+   const unsigned VTBits = VT.getSizeInBits();
+   const bool IsSHL = Op.getOpcode() == ISD::SHL_PARTS;
+   const bool IsSRA = Op.getOpcode() == ISD::SRA_PARTS;
+   SDValue LoPart = Op.getOperand(0);
+   SDValue HiPart = Op.getOperand(1);
+   SDValue Amt = Op.getOperand(2);
+
+   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+   // during isel.
+   SDValue MaskedAmt = DAG.getNode(ISD::AND, dl, MVT::i8, Amt,
+                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
+
+   // Replacement for the part that is completely shifted out on a large
+   // shift: the sign of the high part for SRA, zero otherwise.
+   SDValue Fill;
+   if (IsSRA)
+     Fill = DAG.getNode(ISD::SRA, dl, VT, HiPart,
+                        DAG.getConstant(VTBits - 1, dl, MVT::i8));
+   else
+     Fill = DAG.getConstant(0, dl, VT);
+
+   // Funnel is the double-width shift across both parts; Single is the plain
+   // shift of the part that feeds bits into the other one.
+   SDValue Funnel, Single;
+   if (IsSHL) {
+     Funnel = DAG.getNode(X86ISD::SHLD, dl, VT, HiPart, LoPart, Amt);
+     Single = DAG.getNode(ISD::SHL, dl, VT, LoPart, MaskedAmt);
+   } else {
+     Funnel = DAG.getNode(X86ISD::SHRD, dl, VT, LoPart, HiPart, Amt);
+     Single = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, dl, VT, HiPart,
+                          MaskedAmt);
+   }
+
+   // If the shift amount is larger or equal than the width of a part we can't
+   // rely on the results of shld/shrd. Insert a test and select the appropriate
+   // values for large shift amounts.
+   SDValue LargeBit = DAG.getNode(ISD::AND, dl, MVT::i8, Amt,
+                                  DAG.getConstant(VTBits, dl, MVT::i8));
+   SDValue Flags = DAG.getNode(X86ISD::CMP, dl, MVT::i32, LargeBit,
+                               DAG.getConstant(0, dl, MVT::i8));
+   SDValue CondNE = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+
+   SDValue NearOps[4] = { Funnel, Single, CondNE, Flags };
+   SDValue FarOps[4] = { Single, Fill, CondNE, Flags };
+   SDValue Near = DAG.getNode(X86ISD::CMOV, dl, VT, NearOps);
+   SDValue Far = DAG.getNode(X86ISD::CMOV, dl, VT, FarOps);
+
+   // For a left shift the funnel result is the high part; for right shifts it
+   // is the low part.
+   SDValue Parts[2] = { IsSHL ? Far : Near, IsSHL ? Near : Far };
+   return DAG.getMergeValues(Parts, dl);
+ }
+
+ /// Custom lowering for SINT_TO_FP. Vectors are handled for v2i32->v2f64 and
+ /// i1 element sources only; scalars that are not directly legal are spilled
+ /// to a stack slot and converted with FILD.
+ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+                                            SelectionDAG &DAG) const {
+   SDLoc dl(Op);
+   SDValue Src = Op.getOperand(0);
+   const MVT SrcVT = Src.getSimpleValueType();
+   const MVT VT = Op.getSimpleValueType();
+
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+   if (SrcVT.isVector()) {
+     // v2i32 -> v2f64: widen the source to v4i32 with an undef upper half.
+     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+       SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+                                     DAG.getUNDEF(SrcVT));
+       return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Widened);
+     }
+
+     // Every other vector source except i1 elements is left alone.
+     if (SrcVT.getVectorElementType() != MVT::i1)
+       return SDValue();
+
+     // Mask vectors are sign-extended first: a legal v2i1 goes to v2i64,
+     // anything else to i32 elements.
+     MVT ExtVT = MVT::v2i64;
+     if (!(SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT)))
+       ExtVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+     SDValue Extended = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Src);
+     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), Extended);
+   }
+
+   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
+          "Unknown SINT_TO_FP to lower!");
+
+   // These are really Legal; return the operand so the caller accepts it as
+   // Legal.
+   const bool ResultInSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+   if (ResultInSSE) {
+     if (SrcVT == MVT::i32)
+       return Op;
+     if (SrcVT == MVT::i64 && Subtarget.is64Bit())
+       return Op;
+   }
+
+   // Bitcasting to f64 here allows us to do a single 64-bit store from
+   // an SSE register, avoiding the store forwarding penalty that would come
+   // with two 32-bit stores.
+   SDValue ValueToStore = Op.getOperand(0);
+   if (SrcVT == MVT::i64 && ResultInSSE && !Subtarget.is64Bit())
+     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+
+   // Spill the integer to a fresh stack slot and convert it with FILD.
+   const unsigned SlotBytes = SrcVT.getSizeInBits() / 8;
+   MachineFunction &MF = DAG.getMachineFunction();
+   auto PtrVT = getPointerTy(MF.getDataLayout());
+   const int SlotFI =
+       MF.getFrameInfo().CreateStackObject(SlotBytes, SlotBytes, false);
+   SDValue StackSlot = DAG.getFrameIndex(SlotFI, PtrVT);
+   SDValue StoreChain = DAG.getStore(
+       DAG.getEntryNode(), dl, ValueToStore, StackSlot,
+       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SlotFI));
+   return BuildFILD(Op, SrcVT, StoreChain, StackSlot, DAG);
+ }
+
+ /// Emits an FILD that reads an integer of type \p SrcVT from \p StackSlot
+ /// (a frame index, or a load whose address is reused). When the result type
+ /// lives in an SSE register, the FILD result is additionally stored to a new
+ /// stack slot with FST and reloaded from there.
+ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+                                      SDValue StackSlot,
+                                      SelectionDAG &DAG) const {
+   SDLoc DL(Op);
+   const bool ViaSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+
+   // The SSE flavour yields f64 plus a glue result used to tie the FST to it.
+   SDVTList LoadTys = ViaSSE
+                          ? DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue)
+                          : DAG.getVTList(Op.getValueType(), MVT::Other);
+
+   const unsigned ByteSize = SrcVT.getSizeInBits() / 8;
+
+   // Describe the memory being read: either the fixed stack slot itself, or
+   // the memory operand of the load that was passed in (in which case the
+   // load's address operand becomes the slot).
+   MachineMemOperand *LoadMMO;
+   if (auto *FI = dyn_cast<FrameIndexSDNode>(StackSlot)) {
+     const int LoadFI = FI->getIndex();
+     LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), LoadFI),
+         MachineMemOperand::MOLoad, ByteSize, ByteSize);
+   } else {
+     LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+     StackSlot = StackSlot.getOperand(1);
+   }
+
+   SDValue LoadOps[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+   SDValue Result = DAG.getMemIntrinsicNode(
+       ViaSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, LoadTys, LoadOps, SrcVT,
+       LoadMMO);
+   if (!ViaSSE)
+     return Result;
+
+   Chain = Result.getValue(1);
+   SDValue Glue = Result.getValue(2);
+
+   // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+   // shouldn't be necessary except that RFP cannot be live across
+   // multiple blocks. When stackifier is fixed, they can be uncoupled.
+   MachineFunction &MF = DAG.getMachineFunction();
+   const unsigned StoreBytes = Op.getValueSizeInBits() / 8;
+   const int StoreFI =
+       MF.getFrameInfo().CreateStackObject(StoreBytes, StoreBytes, false);
+   auto PtrVT = getPointerTy(MF.getDataLayout());
+   SDValue StoreSlot = DAG.getFrameIndex(StoreFI, PtrVT);
+   SDVTList StoreTys = DAG.getVTList(MVT::Other);
+   SDValue StoreOps[] = {
+     Chain, Result, StoreSlot, DAG.getValueType(Op.getValueType()), Glue
+   };
+   MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), StoreFI),
+       MachineMemOperand::MOStore, StoreBytes, StoreBytes);
+
+   Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, StoreTys, StoreOps,
+                                   Op.getValueType(), StoreMMO);
+   return DAG.getLoad(
+       Op.getValueType(), DL, Chain, StoreSlot,
+       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), StoreFI));
+ }
+
+ /// 64-bit unsigned integer to double expansion.
+ ///
+ /// The two 32-bit halves of the input are interleaved with the exponent
+ /// words from constant c0, reinterpreted as two doubles, the matching biases
+ /// from constant c1 are subtracted, and the two lanes are summed. Lane 0 of
+ /// the sum is returned as f64.
+ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
+ SelectionDAG &DAG) const {
+ // This algorithm is not obvious. Here it is what we're trying to output:
+ /*
+ movq %rax, %xmm0
+ punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+ subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+ #ifdef __SSE3__
+ haddpd %xmm0, %xmm0
+ #else
+ pshufd $0x4e, %xmm0, %xmm1
+ addpd %xmm1, %xmm0
+ #endif
+ */
+
+ SDLoc dl(Op);
+ LLVMContext *Context = DAG.getContext();
+
+ // Build some magic constants.
+ // c0: the upper 32-bit words placed above the low and high input halves.
+ static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+ Constant *C0 = ConstantDataVector::get(*Context, CV0);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
+
+ // c1: the two doubles subtracted from the lanes built with c0.
+ SmallVector<Constant*,2> CV1;
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+ APInt(64, 0x4330000000000000ULL))));
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+ APInt(64, 0x4530000000000000ULL))));
+ Constant *C1 = ConstantVector::get(CV1);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
+
+ // Load the 64-bit value into an XMM register.
+ SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Op.getOperand(0));
+ SDValue CLod0 =
+ DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ /* Alignment = */ 16);
+ // Interleave the input's 32-bit words with c0 (the punpckldq above).
+ SDValue Unpck1 =
+ getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
+
+ // The c1 load is chained after the c0 load.
+ SDValue CLod1 =
+ DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ /* Alignment = */ 16);
+ SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ SDValue Result;
+
+ // Sum the two lanes: horizontal add with SSE3, otherwise swap the 64-bit
+ // halves with a shuffle and add.
+ if (Subtarget.hasSSE3()) {
+ // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+ Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+ } else {
+ SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
+ SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+ DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
+ }
+
+ // The converted value is lane 0 of the summed vector.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ /// 32-bit unsigned integer to float expansion.
+ ///
+ /// The zero-extended input is OR'ed into the bit pattern of a bias double,
+ /// the bias is subtracted again, and the f64 result is rounded or extended to
+ /// the destination type if that differs in size.
+ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
+                                                SelectionDAG &DAG) const {
+   SDLoc dl(Op);
+
+   // FP constant to bias correct the final result.
+   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
+                                    MVT::f64);
+
+   // Load the 32-bit value into an XMM register.
+   SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+                             Op.getOperand(0));
+
+   // Zero out the upper parts of the register.
+   Vec = getShuffleVectorZeroOrUndef(Vec, 0, true, Subtarget, DAG);
+
+   // Reinterpret the low 64 bits as a double.
+   SDValue LowDouble = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                                   DAG.getBitcast(MVT::v2f64, Vec),
+                                   DAG.getIntPtrConstant(0, dl));
+
+   // Or the load with the bias.
+   SDValue Biased = DAG.getNode(
+       ISD::OR, dl, MVT::v2i64,
+       DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                              MVT::v2f64, LowDouble)),
+       DAG.getBitcast(MVT::v2i64,
+                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
+   Biased = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                        DAG.getBitcast(MVT::v2f64, Biased),
+                        DAG.getIntPtrConstant(0, dl));
+
+   // Subtract the bias.
+   // TODO: Are there any fast-math-flags to propagate here?
+   SDValue Converted = DAG.getNode(ISD::FSUB, dl, MVT::f64, Biased, Bias);
+
+   // Handle final rounding: adapt the f64 result to the destination width.
+   const MVT DestVT = Op.getSimpleValueType();
+   if (DestVT.bitsGT(MVT::f64))
+     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Converted);
+   if (DestVT.bitsLT(MVT::f64))
+     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Converted,
+                        DAG.getIntPtrConstant(0, dl));
+
+   return Converted;
+ }
+
+ /// Lowers UINT_TO_FP from v2i32 to v2f64; any other result type is declined
+ /// with an empty SDValue. Without AVX-512 the input is split into 16-bit
+ /// halves that are converted with the signed conversion and recombined.
+ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget, SDLoc &DL) {
+   if (Op.getSimpleValueType() != MVT::v2f64)
+     return SDValue();
+
+   SDValue Src = Op.getOperand(0);
+   assert(Src.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
+
+   // Legalize to v4i32 type.
+   SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
+                              DAG.getUNDEF(MVT::v2i32));
+
+   if (Subtarget.hasAVX512())
+     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, Wide);
+
+   // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
+   // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
+   SDValue ShiftBy16 = DAG.getConstant(16, DL, MVT::v4i32);
+   SDValue Low16Mask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
+
+   // Two to the power of half-word-size.
+   SDValue TwoPow16 = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
+
+   // Clear upper part of LO, lower HI.
+   SDValue HiHalf = DAG.getNode(ISD::SRL, DL, MVT::v4i32, Wide, ShiftBy16);
+   SDValue LoHalf = DAG.getNode(ISD::AND, DL, MVT::v4i32, Wide, Low16Mask);
+
+   // Both halves are now non-negative, so the signed conversion is exact.
+   SDValue HiFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HiHalf);
+   SDValue HiScaled = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, HiFP, TwoPow16);
+   SDValue LoFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LoHalf);
+
+   // Add the two halves.
+   return DAG.getNode(ISD::FADD, DL, MVT::v2f64, HiScaled, LoFP);
+ }
+
+ /// Lowers UINT_TO_FP for v4i32 -> v4f32 and v8i32 -> v8f32. Returns an empty
+ /// SDValue when unsafe-fp-math is enabled or when the result type is not the
+ /// float vector with the same element count as the source.
+ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // The algorithm is the following:
+ // #ifdef __SSE4_1__
+ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+ // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+ // (uint4) 0x53000000, 0xaa);
+ // #else
+ // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+ // uint4 hi = (v >> 16) | (uint4) 0x53000000;
+ // #endif
+ // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+ // return (float4) lo + fhi;
+
+ // We shouldn't use it when unsafe-fp-math is enabled though: we might later
+ // reassociate the two FADDs, and if we do that, the algorithm fails
+ // spectacularly (PR24512).
+ // FIXME: If we ever have some kind of Machine FMF, this should be marked
+ // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
+ // there's also the MachineCombiner reassociations happening on Machine IR.
+ if (DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue V = Op->getOperand(0);
+ MVT VecIntVT = V.getSimpleValueType();
+ bool Is128 = VecIntVT == MVT::v4i32;
+ MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+ // If we convert to something else than the supported type, e.g., to v4f64,
+ // abort early.
+ if (VecFloatVT != Op->getSimpleValueType(0))
+ return SDValue();
+
+ assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+ "Unsupported custom type");
+
+ // In the #ifdef/#else code, we have in common:
+ // - The vector of constants:
+ // -- 0x4b000000
+ // -- 0x53000000
+ // - A shift:
+ // -- v >> 16
+
+ // Create the splat vector for 0x4b000000.
+ SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
+ // Create the splat vector for 0x53000000.
+ SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
+
+ // Create the right shift.
+ SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
+ SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
+
+ SDValue Low, High;
+ if (Subtarget.hasSSE41()) {
+ // SSE4.1 path: one 16-bit blend per half instead of AND/OR. The 0xaa mask
+ // takes the odd 16-bit lanes from the constant operand.
+ MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
+ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+ SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
+ SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
+ // Low will be bitcasted right away, so do not bother bitcasting back to its
+ // original type.
+ Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
+ VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+ // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+ // (uint4) 0x53000000, 0xaa);
+ SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
+ SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
+ // High will be bitcasted right away, so do not bother bitcasting back to
+ // its original type.
+ High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
+ VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+ } else {
+ // Fallback path: mask and OR on the 32-bit integer vector.
+ SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
+ // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+ SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
+ Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
+
+ // uint4 hi = (v >> 16) | (uint4) 0x53000000;
+ High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
+ }
+
+ // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
+ SDValue VecCstFAdd = DAG.getConstantFP(
+ APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
+
+ // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+ // The subtraction is emitted as an FADD of the negated constant.
+ SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue FHigh =
+ DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
+ // return (float4) lo + fhi;
+ SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
+ return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
+ }
+
+ /// Custom lowering of vector UINT_TO_FP, dispatched on the source vector
+ /// type. Source types not listed below are unreachable.
+ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+                                                SelectionDAG &DAG) const {
+   SDValue Src = Op.getOperand(0);
+   const MVT SrcVT = Src.getSimpleValueType();
+   SDLoc dl(Op);
+
+   // Mask vectors are zero-extended first: v2i1 goes to v2i64, everything
+   // else to i32 elements.
+   if (SrcVT.getVectorElementType() == MVT::i1) {
+     MVT ExtVT = MVT::v2i64;
+     if (SrcVT != MVT::v2i1)
+       ExtVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Src);
+     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), Extended);
+   }
+
+   switch (SrcVT.SimpleTy) {
+   case MVT::v4i8:
+   case MVT::v4i16:
+   case MVT::v8i8:
+   case MVT::v8i16: {
+     // After zero-extension to i32 the value is non-negative, so the signed
+     // conversion can be used.
+     MVT WideVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Src);
+     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), Extended);
+   }
+   case MVT::v2i32:
+     return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
+   case MVT::v4i32:
+   case MVT::v8i32:
+     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
+   case MVT::v16i8:
+   case MVT::v16i16: {
+     assert(Subtarget.hasAVX512());
+     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, Src);
+     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), Extended);
+   }
+   default:
+     break;
+   }
+
+   llvm_unreachable("Custom UINT_TO_FP is not supported!");
+ }
+
+/// Custom lowering of UINT_TO_FP. Vector operations are delegated to
+/// lowerUINT_TO_FP_vec; scalar i32/i64 sources are either left alone,
+/// delegated to the SSE helpers, or converted through a 64-bit stack buffer
+/// and an x87 FILD.
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Src = Op.getOperand(0);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // UINT_TO_FP is marked custom (and therefore counts as legal), so the DAG
+  // combiner will not turn it into SINT_TO_FP when the sign bit is known to
+  // be zero. Do that optimization here instead.
+  if (DAG.SignBitIsZero(Src))
+    return DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), Src);
+
+  if (Op.getSimpleValueType().isVector())
+    return lowerUINT_TO_FP_vec(Op, DAG);
+
+  MVT SrcVT = Src.getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
+  const bool SrcIsI32 = SrcVT == MVT::i32;
+  const bool SrcIsI64 = SrcVT == MVT::i64;
+
+  // With AVX-512, unsigned i32 -> f32/f64 is legal via VCVTUSI2SS/SD, and so
+  // is unsigned i64 in 64-bit mode: keep the node unchanged.
+  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
+      (SrcIsI32 || (SrcIsI64 && Subtarget.is64Bit())))
+    return Op;
+
+  if (SrcIsI64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i64(Op, DAG);
+  if (SrcIsI32 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i32(Op, DAG);
+  // i64 -> f32 on a 64-bit target is not handled here; hand back an empty
+  // SDValue.
+  if (SrcIsI64 && DstVT == MVT::f32 && Subtarget.is64Bit())
+    return SDValue();
+
+  // Everything else goes through a 64-bit stack buffer that feeds an FILD.
+  SDValue Slot = DAG.CreateStackTemporary(MVT::i64);
+  if (SrcIsI32) {
+    // Store the 32-bit value at offset 0 and a zero at offset 4, so the
+    // buffer holds the zero-extended 64-bit value.
+    SDValue HighHalf = DAG.getMemBasePlusOffset(Slot, 4, DL);
+    SDValue StoreLo = DAG.getStore(DAG.getEntryNode(), DL, Src, Slot,
+                                   MachinePointerInfo());
+    SDValue StoreHi =
+        DAG.getStore(StoreLo, DL, DAG.getConstant(0, DL, MVT::i32), HighHalf,
+                     MachinePointerInfo());
+    return BuildFILD(Op, MVT::i64, StoreHi, Slot, DAG);
+  }
+
+  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+  SDValue ValueToStore = Src;
+  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
+    // Bitcasting to f64 allows a single 64-bit store from an SSE register,
+    // which avoids the store-forwarding penalty of two 32-bit stores.
+    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+  }
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, ValueToStore, Slot,
+                               MachinePointerInfo());
+
+  // FILD interprets the buffer as a signed i64, so when the input had its top
+  // bit set the appropriate power of 2 has to be added back. This mirrors the
+  // optimization in DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP; for it to be
+  // safe here the arithmetic must be done in x87 extended precision, not in
+  // SSE. (The generic code can't know it's OK to do this, or how to.)
+  int SlotFI = cast<FrameIndexSDNode>(Slot)->getIndex();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, SlotFI),
+      MachineMemOperand::MOLoad, 8, 8);
+
+  SDVTList FildVTs = DAG.getVTList(MVT::f80, MVT::Other);
+  SDValue FildOps[] = { Store, Slot, DAG.getValueType(MVT::i64) };
+  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, DL, FildVTs, FildOps,
+                                         MVT::i64, LoadMMO);
+
+  // 0x5F800000 is the IEEE single-precision bit pattern of 2^64.
+  APInt FF(32, 0x5F800000ULL);
+
+  // Is the sign bit of the original integer set?
+  SDValue SignSet = DAG.getSetCC(
+      DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+      Src, DAG.getConstant(0, DL, MVT::i64), ISD::SETLT);
+
+  // Put the 64-bit pair (0, FF) in the constant pool, FF in the low bits.
+  SDValue FudgePtr = DAG.getConstantPool(
+      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
+
+  // Point at FF (offset 0) when the sign bit was set, at 0 (offset 4)
+  // otherwise.
+  SDValue Zero = DAG.getIntPtrConstant(0, DL);
+  SDValue Four = DAG.getIntPtrConstant(4, DL);
+  SDValue Offset = DAG.getNode(ISD::SELECT, DL, Zero.getValueType(), SignSet,
+                               Zero, Four);
+  FudgePtr = DAG.getNode(ISD::ADD, DL, PtrVT, FudgePtr, Offset);
+
+  // Load the fudge factor, extending it from f32 to f80.
+  // FIXME: Avoid the extend by constructing the right constant pool?
+  SDValue Fudge = DAG.getExtLoad(
+      ISD::EXTLOAD, DL, MVT::f80, DAG.getEntryNode(), FudgePtr,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
+      /* Alignment = */ 4);
+
+  // Do the addition in 80 bits to force it onto x87, then round to the
+  // requested result type.
+  // TODO: Are there any fast-math-flags to propagate here?
+  SDValue Sum = DAG.getNode(ISD::FADD, DL, MVT::f80, Fild, Fudge);
+  return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Sum,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
+// Shared helper for lowering FP_TO_SINT (IsSigned) and FP_TO_UINT (!IsSigned).
+//
+// The returned pair encodes the outcome:
+//   <SDValue(), SDValue()>  nothing was lowered: the operation is legal, or
+//                           the source is fp128 (which does not use this
+//                           lowering) or f16 (which must be promoted to f32
+//                           first).
+//   <result, SDValue()>     the final integer result was produced directly.
+//   <FIST, StackSlot>       the sequence ends with a FIST; the caller is
+//                           responsible for loading the final integer result
+//                           from StackSlot.
+//
+// Apart from the bail-outs, the conversion is assumed to go from one of f32,
+// f64 or f80 to i16, i32 or i64.
+std::pair<SDValue,SDValue>
+X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                   bool IsSigned, bool IsReplace) const {
+  SDLoc DL(Op);
+
+  EVT DstTy = Op.getValueType();
+  SDValue Value = Op.getOperand(0);
+  EVT SrcTy = Value.getValueType();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // f16 must be promoted before using the lowering in this routine, and
+  // fp128 does not use this lowering.
+  if (SrcTy != MVT::f32 && SrcTy != MVT::f64 && SrcTy != MVT::f80)
+    return std::make_pair(SDValue(), SDValue());
+
+  const bool SrcInSSE = isScalarFPTypeInSSEReg(SrcTy);
+
+  // Computing an unsigned i64 with FIST needs a fixup for values above the
+  // maximum signed i64. A FIST is always used on the 32-bit subtarget, and
+  // also for f80 on a 64-bit target.
+  bool UnsignedFixup =
+      !IsSigned && DstTy == MVT::i64 && (!Subtarget.is64Bit() || !SrcInSSE);
+
+  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
+    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST; the low
+    // 32 bits of the FIST result hold the correct uint32 value.
+    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+    DstTy = MVT::i64;
+  }
+
+  assert(DstTy.getSimpleVT() <= MVT::i64 &&
+         DstTy.getSimpleVT() >= MVT::i16 &&
+         "Unknown FP_TO_INT to lower!");
+
+  // These are really Legal: SSE source to i32, or to i64 in 64-bit mode.
+  if (SrcInSSE &&
+      (DstTy == MVT::i32 || (DstTy == MVT::i64 && Subtarget.is64Bit())))
+    return std::make_pair(SDValue(), SDValue());
+
+  // FP->int is lowered to a FIST into a temporary stack slot; the integer is
+  // then read back from that slot.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MemSize = DstTy.getSizeInBits()/8;
+  int SlotFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
+  SDValue Slot = DAG.getFrameIndex(SlotFI, PtrVT);
+
+  unsigned FistOpc;
+  switch (DstTy.getSimpleVT().SimpleTy) {
+  case MVT::i16: FistOpc = X86ISD::FP_TO_INT16_IN_MEM; break;
+  case MVT::i32: FistOpc = X86ISD::FP_TO_INT32_IN_MEM; break;
+  case MVT::i64: FistOpc = X86ISD::FP_TO_INT64_IN_MEM; break;
+  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+  }
+
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
+
+  if (UnsignedFixup) {
+    // Conversion to unsigned i64 is implemented with a select that depends on
+    // whether the source fits in the range of a signed i64. With Thresh being
+    // the FP equivalent of 0x8000000000000000ULL:
+    //
+    //   Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
+    //   FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
+    //   Fist-to-mem64 FistSrc
+    //   Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent to
+    //   XOR'ing the high 32 bits with Adjust.
+    //
+    // Being a power of 2, Thresh is exactly representable in all FP formats.
+    // For X87 the smallest FP type would be preferable for this constant, but
+    // for DAG type consistency it has to match the FP operand type.
+
+    // 0x5f000000 is the IEEE single-precision bit pattern of 2^63.
+    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
+    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
+    bool LosesInfo = false;
+    // The rounding mode is irrelevant as the conversion should be exact.
+    if (SrcTy == MVT::f64) {
+      Status = Thresh.convert(APFloat::IEEEdouble(),
+                              APFloat::rmNearestTiesToEven, &LosesInfo);
+    } else if (SrcTy == MVT::f80) {
+      Status = Thresh.convert(APFloat::x87DoubleExtended(),
+                              APFloat::rmNearestTiesToEven, &LosesInfo);
+    }
+
+    assert(Status == APFloat::opOK && !LosesInfo &&
+           "FP conversion should have been exact");
+
+    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, SrcTy);
+
+    // The same comparison drives both selects below.
+    SDValue BelowThresh = DAG.getSetCC(
+        DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcTy),
+        Value, ThreshVal, ISD::SETLT);
+    Adjust = DAG.getSelect(DL, MVT::i32, BelowThresh,
+                           DAG.getConstant(0, DL, MVT::i32),
+                           DAG.getConstant(0x80000000, DL, MVT::i32));
+    SDValue Reduced = DAG.getNode(ISD::FSUB, DL, SrcTy, Value, ThreshVal);
+    Value = DAG.getSelect(DL, SrcTy, BelowThresh, Value, Reduced);
+  }
+
+  // FIXME This causes a redundant load/store if the SSE-class value is already
+  // in memory, such as if it is on the callstack.
+  if (SrcInSSE) {
+    // Spill the SSE value to the slot and reload it with an FLD so that the
+    // FIST below can consume it, then allocate a fresh slot for the FIST
+    // result.
+    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+    Chain = DAG.getStore(Chain, DL, Value, Slot,
+                         MachinePointerInfo::getFixedStack(MF, SlotFI));
+    SDVTList FldVTs = DAG.getVTList(SrcTy, MVT::Other);
+    SDValue FldOps[] = { Chain, Slot, DAG.getValueType(SrcTy) };
+
+    MachineMemOperand *LoadMMO =
+        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SlotFI),
+                                MachineMemOperand::MOLoad, MemSize, MemSize);
+    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, FldVTs, FldOps, DstTy,
+                                    LoadMMO);
+    Chain = Value.getValue(1);
+    SlotFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
+    Slot = DAG.getFrameIndex(SlotFI, PtrVT);
+  }
+
+  MachineMemOperand *StoreMMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SlotFI),
+                              MachineMemOperand::MOStore, MemSize, MemSize);
+
+  // Build the FP_TO_INT*_IN_MEM node; both outcomes below start from it.
+  SDValue FistOps[] = { Chain, Value, Slot };
+  SDValue FIST = DAG.getMemIntrinsicNode(FistOpc, DL,
+                                         DAG.getVTList(MVT::Other), FistOps,
+                                         DstTy, StoreMMO);
+
+  // Without the unsigned fixup the caller loads the result from the slot.
+  if (!UnsignedFixup)
+    return std::make_pair(FIST, Slot);
+
+  // Load the FIST result as two i32's and XOR the high i32 with Adjust.
+  SDValue Low32 =
+      DAG.getLoad(MVT::i32, DL, FIST, Slot, MachinePointerInfo());
+  SDValue HighAddr = DAG.getMemBasePlusOffset(Slot, 4, DL);
+
+  SDValue High32 =
+      DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
+  High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
+
+  if (Subtarget.is64Bit()) {
+    // Join the halves into a 64-bit result: (High32 << 32) | Low32.
+    Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
+    High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
+    High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
+                         DAG.getConstant(32, DL, MVT::i8));
+    SDValue Joined = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
+    return std::make_pair(Joined, SDValue());
+  }
+
+  // 32-bit target: return the two halves, as a BUILD_PAIR when IsReplace is
+  // set and as merged values otherwise.
+  SDValue Halves[] = { Low32, High32 };
+  SDValue Merged = IsReplace
+                       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves)
+                       : DAG.getMergeValues(Halves, DL);
+  return std::make_pair(Merged, SDValue());
+}
+
+/// Lower a vector extension (Op is checked against ISD::ZERO_EXTEND to decide
+/// whether the new high bits must be zero) for AVX targets. Returns an empty
+/// SDValue for type combinations that are not handled here.
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget) {
+  SDLoc DL(Op);
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  // 512-bit results and i1 sources are emitted as a plain ZERO_EXTEND node.
+  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, In);
+
+  // Optimize vectors in AVX mode:
+  //
+  //   v8i16 -> v8i32
+  //     vpunpcklwd extends the 4 lower elements (v8i16 -> v4i32),
+  //     vpunpckhwd extends the 4 upper elements (v8i16 -> v4i32),
+  //     then the two halves are concatenated.
+  //
+  //   v4i32 -> v4i64
+  //     vpunpckldq extends the 2 lower elements (v4i32 -> v2i64),
+  //     vpunpckhdq extends the 2 upper elements (v4i32 -> v2i64),
+  //     then the two halves are concatenated.
+  //
+  // v16i8 -> v16i16 is handled the same way.
+  const bool IsHandledPair = (VT == MVT::v16i16 && InVT == MVT::v16i8) ||
+                             (VT == MVT::v8i32 && InVT == MVT::v8i16) ||
+                             (VT == MVT::v4i64 && InVT == MVT::v4i32);
+  if (!IsHandledPair)
+    return SDValue();
+
+  // With 256-bit integer support a single VZEXT node is emitted.
+  if (Subtarget.hasInt256())
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  // Otherwise interleave the input with zeros (zero-extend) or undef (any
+  // other opcode), once for the low half and once for the high half.
+  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, DL);
+  SDValue Undef = DAG.getUNDEF(InVT);
+  const bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+  SDValue Filler = NeedZero ? ZeroVec : Undef;
+  SDValue LoHalf = getUnpackl(DAG, DL, InVT, In, Filler);
+  SDValue HiHalf = getUnpackh(DAG, DL, InVT, In, Filler);
+
+  // Reinterpret each unpacked half as half of the result vector and glue the
+  // halves together.
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                VT.getVectorNumElements()/2);
+  LoHalf = DAG.getBitcast(HalfVT, LoHalf);
+  HiHalf = DAG.getBitcast(HalfVT, HiHalf);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoHalf, HiHalf);
+}
+
+/// Lower a vector ZERO_EXTEND for AVX-512: either a 512-bit result from a
+/// non-mask source (emitted as VZEXT) or an extension of an i1 mask vector
+/// (emitted as a select between 1 and 0). Returns an empty SDValue when the
+/// element count is neither 8 nor 16 and BWI is unavailable.
+static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
+                   const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  const unsigned NumElts = VT.getVectorNumElements();
+  const bool IsEightOrSixteen = NumElts == 8 || NumElts == 16;
+  if (!IsEightOrSixteen && !Subtarget.hasBWI())
+    return SDValue();
+
+  const bool Is512 = VT.is512BitVector();
+  if (Is512 && InVT.getVectorElementType() != MVT::i1)
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  assert(InVT.getVectorElementType() == MVT::i1);
+
+  // For a 128- or 256-bit result without VLX, do the select on a 512-bit
+  // vector with the same element count and truncate afterwards.
+  MVT ExtVT = VT;
+  if (!Is512 && !Subtarget.hasVLX())
+    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+
+  const unsigned ExtEltBits = ExtVT.getScalarSizeInBits();
+  SDValue One = DAG.getConstant(APInt(ExtEltBits, 1), DL, ExtVT);
+  SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtEltBits), DL, ExtVT);
+
+  // Each mask bit picks 1 or 0 for its lane.
+  SDValue Selected = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+  if (VT == ExtVT)
+    return Selected;
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Selected);
+}
+
+/// Lower ANY_EXTEND through LowerAVXExtend when 256-bit FP support is
+/// available; otherwise (or when that helper declines) return an empty
+/// SDValue.
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+                               SelectionDAG &DAG) {
+  if (!Subtarget.hasFp256())
+    return SDValue();
+
+  // LowerAVXExtend already yields an empty SDValue for unhandled types.
+  return LowerAVXExtend(Op, DAG, Subtarget);
+}
+
+/// Lower a vector ZERO_EXTEND. 512-bit results and i1 sources are handled by
+/// LowerZERO_EXTEND_AVX512; otherwise LowerAVXExtend is tried when 256-bit FP
+/// support is available. Returns an empty SDValue if neither path produced a
+/// lowering.
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getSimpleValueType();
+
+  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
+    return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
+
+  if (Subtarget.hasFp256())
+    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
+      return Res;
+
+  // A 128-bit -> 256-bit extension with matching element counts is expected
+  // to have been handled above.
+  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
+         VT.getVectorNumElements() != SVT.getVectorNumElements());
+  return SDValue();
+}
+
+/// Recursively halve the element width of a vector with PACKSS until it has
+/// type DstVT.
+/// This relies on vector comparison results being all-zeros or all-ones per
+/// element, which is why (vXi8 PACKSS(vYi16, vYi16)) can be used regardless
+/// of the actual element types.
+/// AVX2 (Int256) sub-targets need extra shuffling because PACKSS operates
+/// within each 128-bit lane.
+/// Returns an empty SDValue when the truncation is not supported here.
+static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
+                                               const SDLoc &DL,
+                                               SelectionDAG &DAG,
+                                               const X86Subtarget &Subtarget) {
+  // Requires SSE2 but AVX512 has fast truncate.
+  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+    return SDValue();
+
+  EVT SrcVT = In.getValueType();
+
+  // Nothing left to truncate; recursive calls can end up here.
+  if (SrcVT == DstVT)
+    return In;
+
+  // Only truncation to a multiple of 128 bits from a multiple of 256 bits is
+  // supported.
+  if ((DstVT.getSizeInBits() % 128) != 0 ||
+      (SrcVT.getSizeInBits() % 256) != 0)
+    return SDValue();
+
+  unsigned NumElems = SrcVT.getVectorNumElements();
+  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
+  assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
+
+  // Element type after one packing step: half the source element width.
+  EVT HalfEltVT =
+      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
+
+  // Split the source into its lower and upper halves.
+  const unsigned HalfElems = NumElems / 2;
+  const unsigned HalfBits = SrcVT.getSizeInBits() / 2;
+  SDValue Lo = extractSubVector(In, 0 * HalfElems, DAG, DL, HalfBits);
+  SDValue Hi = extractSubVector(In, 1 * HalfElems, DAG, DL, HalfBits);
+
+  // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
+  if (SrcVT.is256BitVector()) {
+    Lo = DAG.getBitcast(MVT::v8i16, Lo);
+    Hi = DAG.getBitcast(MVT::v8i16, Hi);
+    SDValue Packed = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
+    return DAG.getBitcast(DstVT, Packed);
+  }
+
+  // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
+  // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
+  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
+    Lo = DAG.getBitcast(MVT::v16i16, Lo);
+    Hi = DAG.getBitcast(MVT::v16i16, Hi);
+    SDValue Packed = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
+
+    // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
+    // so shuffle the 64-bit quarters to get ((LO0,HI0),(LO1,HI1)).
+    Packed = DAG.getBitcast(MVT::v4i64, Packed);
+    Packed =
+        DAG.getVectorShuffle(MVT::v4i64, DL, Packed, Packed, {0, 2, 1, 3});
+
+    if (DstVT.is256BitVector())
+      return DAG.getBitcast(DstVT, Packed);
+
+    // 512bit -> 128bit needs one more packing stage.
+    EVT StageVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
+    Packed = DAG.getBitcast(StageVT, Packed);
+    return truncateVectorCompareWithPACKSS(DstVT, Packed, DL, DAG, Subtarget);
+  }
+
+  // General case: pack each half recursively, concatenate the results and
+  // pack again.
+  assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
+  EVT HalfPackedVT =
+      EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems / 2);
+  Lo = truncateVectorCompareWithPACKSS(HalfPackedVT, Lo, DL, DAG, Subtarget);
+  Hi = truncateVectorCompareWithPACKSS(HalfPackedVT, Hi, DL, DAG, Subtarget);
+
+  EVT FullPackedVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
+  SDValue Joined = DAG.getNode(ISD::CONCAT_VECTORS, DL, FullPackedVT, Lo, Hi);
+  return truncateVectorCompareWithPACKSS(DstVT, Joined, DL, DAG, Subtarget);
+}
+
+/// Lower a vector truncation whose result element type is i1. The least
+/// significant bit of every source element is shifted into the most
+/// significant position and then turned into a mask, either with CVT2MASK
+/// (VPMOVB2M / VPMOVW2M, needs BWI) or with TESTM (TESTD/Q).
+static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
+
+  // Shift amount that moves the LSB into the MSB of each element.
+  const unsigned InEltBits = InVT.getScalarSizeInBits();
+  unsigned ShiftInx = InEltBits - 1;
+
+  if (InEltBits <= 16) {
+    if (Subtarget.hasBWI()) {
+      // Legal; selects to VPMOVB2M / VPMOVW2M. Packed byte shifts are not
+      // supported natively, so the shift is done on a vector of words.
+      MVT WordVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+      SDValue Shifted = DAG.getNode(ISD::SHL, DL, WordVT,
+                                    DAG.getBitcast(WordVT, In),
+                                    DAG.getConstant(ShiftInx, DL, WordVT));
+      Shifted = DAG.getBitcast(InVT, Shifted);
+      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shifted);
+    }
+
+    // No BWI: sign-extend to a 512-bit vector of dwords/qwords and fall
+    // through to the TESTD/Q path with the wider element size.
+    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
+           "Unexpected vector type.");
+    const unsigned NumElts = InVT.getVectorNumElements();
+    MVT WideVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+    In = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, In);
+    InVT = WideVT;
+    ShiftInx = InVT.getScalarSizeInBits() - 1;
+  }
+
+  SDValue Shifted = DAG.getNode(ISD::SHL, DL, InVT, In,
+                                DAG.getConstant(ShiftInx, DL, InVT));
+  return DAG.getNode(X86ISD::TESTM, DL, VT, Shifted, Shifted);
+}
+
+/// Custom lowering of TRUNCATE: scalar truncation to i1, vector truncation to
+/// i1 masks, AVX-512 VTRUNC, PACKSS-based truncation of comparison results,
+/// and shuffle-based 256-bit -> 128-bit truncation. Returns an empty SDValue
+/// when no lowering applies.
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  // Scalar truncation to i1: sources narrower than 32 bits are first
+  // any-extended to i32; wider sources are not touched here.
+  if (VT == MVT::i1) {
+    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
+           "Invalid scalar TRUNCATE operation");
+    if (InVT.getSizeInBits() >= 32)
+      return SDValue();
+    SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, Widened);
+  }
+  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+         "Invalid TRUNCATE operation");
+
+  if (VT.getVectorElementType() == MVT::i1)
+    return LowerTruncateVecI1(Op, DAG, Subtarget);
+
+  // vpmovqb/w/d, vpmovdb/w, vpmovwb
+  if (Subtarget.hasAVX512()) {
+    // Word to byte is only available under BWI; without it, v16i16 -> v16i8
+    // goes through a sign-extension to v16i32 first.
+    SDValue TruncSrc = In;
+    if (InVT == MVT::v16i16 && !Subtarget.hasBWI())
+      TruncSrc = DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In);
+    return DAG.getNode(X86ISD::VTRUNC, DL, VT, TruncSrc);
+  }
+
+  // Truncate with PACKSS if we are truncating a vector comparison result.
+  // TODO: We should be able to support other operations as long as we
+  // are saturating+packing zero/all bits only.
+  auto IsPackableComparison = [](SDValue V) {
+    switch (V.getOpcode()) {
+    case X86ISD::PCMPGT:
+    case X86ISD::PCMPEQ:
+    case X86ISD::CMPP:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  const bool IsConcatOfComparisons =
+      In.getOpcode() == ISD::CONCAT_VECTORS &&
+      all_of(In->ops(), IsPackableComparison);
+  if (IsPackableComparison(In) || IsConcatOfComparisons) {
+    if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
+      return V;
+  }
+
+  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
+    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+    if (Subtarget.hasInt256()) {
+      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+      SDValue AsV8 = DAG.getBitcast(MVT::v8i32, In);
+      SDValue Permuted = DAG.getVectorShuffle(
+          MVT::v8i32, DL, AsV8, DAG.getUNDEF(MVT::v8i32), ShufMask);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Permuted,
+                         DAG.getIntPtrConstant(0, DL));
+    }
+
+    // Otherwise split into two v2i64 halves, view them as v4i32 and pick the
+    // even (low) dwords of both halves.
+    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                                 DAG.getIntPtrConstant(0, DL));
+    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                                 DAG.getIntPtrConstant(2, DL));
+    LoHalf = DAG.getBitcast(MVT::v4i32, LoHalf);
+    HiHalf = DAG.getBitcast(MVT::v4i32, HiHalf);
+    static const int ShufMask[] = {0, 2, 4, 6};
+    return DAG.getVectorShuffle(VT, DL, LoHalf, HiHalf, ShufMask);
+  }
+
+  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
+    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
+    if (Subtarget.hasInt256()) {
+      SDValue AsBytes = DAG.getBitcast(MVT::v32i8, In);
+
+      // Per 128-bit lane: keep the low two bytes of every dword in the lower
+      // 8 bytes, and fill the upper 8 bytes with mask value 0x80.
+      static const unsigned KeptBytes[] = {0x0, 0x1, 0x4, 0x5,
+                                           0x8, 0x9, 0xc, 0xd};
+      SmallVector<SDValue,32> pshufbMask;
+      for (unsigned Lane = 0; Lane < 2; ++Lane) {
+        for (unsigned Byte : KeptBytes)
+          pshufbMask.push_back(DAG.getConstant(Byte, DL, MVT::i8));
+        for (unsigned Pad = 0; Pad < 8; ++Pad)
+          pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
+      }
+      SDValue MaskVec = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
+      SDValue Shuffled =
+          DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, AsBytes, MaskVec);
+      Shuffled = DAG.getBitcast(MVT::v4i64, Shuffled);
+
+      // Bring the two packed qwords (elements 0 and 2) next to each other
+      // and take the low 128 bits.
+      static const int ShufMask[] = {0, 2, -1, -1};
+      Shuffled = DAG.getVectorShuffle(MVT::v4i64, DL, Shuffled,
+                                      DAG.getUNDEF(MVT::v4i64), ShufMask);
+      Shuffled = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, Shuffled,
+                             DAG.getIntPtrConstant(0, DL));
+      return DAG.getBitcast(VT, Shuffled);
+    }
+
+    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                                 DAG.getIntPtrConstant(0, DL));
+
+    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                                 DAG.getIntPtrConstant(4, DL));
+
+    LoHalf = DAG.getBitcast(MVT::v16i8, LoHalf);
+    HiHalf = DAG.getBitcast(MVT::v16i8, HiHalf);
+
+    // The PSHUFB mask: low two bytes of every dword, rest undefined.
+    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9,  12, 13,
+                                    -1, -1, -1, -1, -1, -1, -1, -1};
+
+    SDValue UndefBytes = DAG.getUNDEF(MVT::v16i8);
+    LoHalf =
+        DAG.getVectorShuffle(MVT::v16i8, DL, LoHalf, UndefBytes, ShufMask1);
+    HiHalf =
+        DAG.getVectorShuffle(MVT::v16i8, DL, HiHalf, UndefBytes, ShufMask1);
+
+    LoHalf = DAG.getBitcast(MVT::v4i32, LoHalf);
+    HiHalf = DAG.getBitcast(MVT::v4i32, HiHalf);
+
+    // The MOVLHPS mask: low 64 bits of each half.
+    static const int ShufMask2[] = {0, 1, 4, 5};
+    SDValue Combined =
+        DAG.getVectorShuffle(MVT::v4i32, DL, LoHalf, HiHalf, ShufMask2);
+    return DAG.getBitcast(MVT::v8i16, Combined);
+  }
+
+  // Handle truncation of V256 to V128 using shuffles.
+  if (!VT.is128BitVector() || !InVT.is256BitVector())
+    return SDValue();
+
+  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
+
+  // View the input as twice as many result-sized elements, gather the even
+  // elements into the low half and extract that half.
+  const unsigned NumElems = VT.getVectorNumElements();
+  MVT WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
+
+  SmallVector<int, 16> TruncMask(NumElems * 2, -1);
+  for (unsigned Idx = 0; Idx != NumElems; ++Idx)
+    TruncMask[Idx] = Idx * 2;
+  SDValue Gathered = DAG.getVectorShuffle(
+      WideVT, DL, DAG.getBitcast(WideVT, In), DAG.getUNDEF(WideVT), TruncMask);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Gathered,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
+/// Custom lowering of FP_TO_SINT / FP_TO_UINT. The only vector case handled
+/// is v2f32 -> v2i64; scalars go through FP_TO_INTHelper.
+SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
+                                          const X86Subtarget &Subtarget,
+                                          SelectionDAG &DAG) const {
+  const bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
+  MVT VT = Op.getSimpleValueType();
+
+  if (VT.isVector()) {
+    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
+    SDValue Src = Op.getOperand(0);
+    if (VT != MVT::v2i64 || Src.getSimpleValueType() != MVT::v2f32)
+      return SDValue();
+
+    // Widen the v2f32 source to v4f32 with undefined upper elements and
+    // convert with the truncating packed conversion node.
+    SDLoc DL(Op);
+    const unsigned CvtOpc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+    SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
+                                  DAG.getUNDEF(MVT::v2f32));
+    return DAG.getNode(CvtOpc, DL, VT, Widened);
+  }
+
+  assert(!VT.isVector());
+
+  std::pair<SDValue,SDValue> Lowered =
+      FP_TO_INTHelper(Op, DAG, IsSigned, /*IsReplace=*/ false);
+  SDValue FIST = Lowered.first;
+  SDValue StackSlot = Lowered.second;
+
+  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+  if (!FIST.getNode())
+    return Op;
+
+  // Without a stack slot the first value already is the result.
+  if (!StackSlot.getNode())
+    return FIST;
+
+  // Otherwise load the result from the slot, chained on the FIST.
+  return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
+}
+
+static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getSimpleValueType();
+
+ assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+ In, DAG.getUNDEF(SVT)));
+}
+
+/// Lower FABS or FNEG as a bitwise FP logic op with a constant mask.
+/// The two differ only in the mask and the logic op: FABS ANDs with 0x7f...,
+/// FNEG XORs with 0x80.... FNEG additionally folds FNEG(FABS(x)) into a
+/// single OR with the sign mask.
+static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
+  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
+         "Wrong opcode for lowering FABS or FNEG.");
+
+  const bool IsFABS = (Op.getOpcode() == ISD::FABS);
+
+  // A FABS with an FNEG user is left alone for now so the pair can be folded
+  // into an FNABS; the FABS is lowered afterwards if it is still in use.
+  if (IsFABS) {
+    for (SDNode *User : Op->uses()) {
+      if (User->getOpcode() == ISD::FNEG)
+        return Op;
+    }
+  }
+
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  const bool IsF128 = (VT == MVT::f128);
+  // Vectors and f128 are operated on directly; other scalars are wrapped in
+  // a 128-bit vector below.
+  const bool IsDirect = VT.isVector() || IsF128;
+
+  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
+  // decide if we should generate a 16-byte constant mask when we only need 4 or
+  // 8 bytes for the scalar case.
+
+  // There are no scalar bitwise logical SSE/AVX instructions, so even the
+  // scalar case uses a 16-byte vector constant and logic op. A 16-byte mask
+  // allows folding the mask load into the logic op, which can save (~4 bytes)
+  // of code size. f128 uses SSE instructions on the f128 type itself.
+  MVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+  MVT LogicVT = VT;
+  if (!IsDirect)
+    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+
+  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
+  const unsigned EltBits = EltVT.getSizeInBits();
+  APInt MaskElt =
+      IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
+  const fltSemantics &Sem =
+      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
+          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), DL, LogicVT);
+
+  // FNEG(FABS(x)) is an FNABS: OR the sign bit into x itself.
+  SDValue Op0 = Op.getOperand(0);
+  const bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+
+  unsigned LogicOp = X86ISD::FXOR;
+  if (IsFABS)
+    LogicOp = X86ISD::FAND;
+  else if (IsFNABS)
+    LogicOp = X86ISD::FOR;
+
+  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
+
+  if (IsDirect)
+    return DAG.getNode(LogicOp, DL, LogicVT, Operand, Mask);
+
+  // Scalar case: move the value into a 128-bit vector, apply the logic op,
+  // and extract element 0 again.
+  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, LogicVT, Operand);
+  SDValue LogicNode = DAG.getNode(LogicOp, DL, LogicVT, Operand, Mask);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, LogicNode,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
+/// Lower FCOPYSIGN(Mag, Sign) as
+///   (Mag & ~SignBitMask) | (Sign & SignBitMask)
+/// using the X86 FP logic nodes. Scalars other than f128 are processed in a
+/// 128-bit vector and extracted at the end.
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue Mag = Op.getOperand(0);
+  SDValue Sign = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+
+  // Bring the sign operand to the result type: extend it when it is smaller,
+  // round it when it is bigger.
+  MVT SignVT = Sign.getSimpleValueType();
+  if (SignVT.bitsLT(VT))
+    Sign = DAG.getNode(ISD::FP_EXTEND, DL, VT, Sign);
+  else if (SignVT.bitsGT(VT))
+    Sign = DAG.getNode(ISD::FP_ROUND, DL, VT, Sign,
+                       DAG.getIntPtrConstant(1, DL));
+
+  // At this point the operands and the result should have the same
+  // type, and that won't be f80 since that is not custom lowered.
+  bool IsF128 = (VT == MVT::f128);
+  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
+         "Unexpected type in LowerFCOPYSIGN");
+
+  MVT EltVT = VT.getScalarType();
+  const fltSemantics &Sem =
+      EltVT == MVT::f64 ? APFloat::IEEEdouble()
+                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+
+  // Perform all scalar logic operations as 16-byte vectors because there are no
+  // scalar FP logic instructions in SSE.
+  // TODO: This isn't necessary. If we used scalar types, we might avoid some
+  // unnecessary splats, but we might miss load folding opportunities. Should
+  // this decision be based on OptimizeForSize?
+  const bool IsFakeVector = !VT.isVector() && !IsF128;
+  MVT LogicVT = VT;
+  if (IsFakeVector)
+    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+
+  // The mask constants are automatically splatted for vector types.
+  const unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  const APInt SignBitOnly = APInt::getSignBit(EltSizeInBits);
+  SDValue SignMask = DAG.getConstantFP(APFloat(Sem, SignBitOnly), DL, LogicVT);
+  SDValue MagMask = DAG.getConstantFP(APFloat(Sem, ~SignBitOnly), DL, LogicVT);
+
+  // Step 1: keep only the sign bit of the sign operand.
+  if (IsFakeVector)
+    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, LogicVT, Sign);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, DL, LogicVT, Sign, SignMask);
+
+  // Step 2: clear the sign bit of the magnitude operand.
+  // TODO: If we had general constant folding for FP logic ops, this check
+  // wouldn't be necessary.
+  SDValue MagBits;
+  if (ConstantFPSDNode *MagCN = dyn_cast<ConstantFPSDNode>(Mag)) {
+    // Constant magnitude: clear the sign directly on the constant.
+    APFloat Cleared = MagCN->getValueAPF();
+    Cleared.clearSign();
+    MagBits = DAG.getConstantFP(Cleared, DL, LogicVT);
+  } else {
+    // Non-constant magnitude: AND out the sign.
+    if (IsFakeVector)
+      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, LogicVT, Mag);
+    MagBits = DAG.getNode(X86ISD::FAND, DL, LogicVT, Mag, MagMask);
+  }
+
+  // Step 3: OR the magnitude value with the sign bit.
+  SDValue Or = DAG.getNode(X86ISD::FOR, DL, LogicVT, MagBits, SignBit);
+  if (!IsFakeVector)
+    return Or;
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Or,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
+/// Custom-lower ISD::FGETSIGN on an f32/f64 operand to
+/// (AND (X86ISD::MOVMSK (scalar_to_vector x)), 1), resized to the result type.
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  SDValue Src = Op.getOperand(0);
+  MVT ResVT = Op.getSimpleValueType();
+  MVT SrcVT = Src.getSimpleValueType();
+  assert((SrcVT == MVT::f32 || SrcVT == MVT::f64) &&
+         "Unexpected type for FGETSIGN");
+
+  // Place the scalar in a 128-bit vector so MOVMSK can read its sign bit.
+  MVT VecVT = MVT::v2f64;
+  if (SrcVT == MVT::f32)
+    VecVT = MVT::v4f32;
+
+  SDValue AsVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Src);
+  SDValue SignBits = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, AsVector);
+  SDValue Resized = DAG.getZExtOrTrunc(SignBits, dl, ResVT);
+  // Only the bit for element 0 is of interest.
+  return DAG.getNode(ISD::AND, dl, ResVT, Resized,
+                     DAG.getConstant(1, dl, ResVT));
+}
+
+// Check whether an OR'd tree is PTEST-able.
+//
+// \p Op must be an ISD::OR node. The tree of ORs rooted at it is walked
+// breadth-first; every leaf must be an EXTRACT_VECTOR_ELT with a constant
+// index, all source vectors must share one 128-bit or 256-bit type, and every
+// element of every source vector must be extracted. When that holds, the
+// source vectors are bitcast to v2i64/v4i64, OR'd together and fed to
+// X86ISD::PTEST, whose i32 result is returned. Otherwise an empty SDValue is
+// returned.
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+  if (!Subtarget.hasSSE41())
+    return SDValue();
+
+  if (!Op->hasOneUse())
+    return SDValue();
+
+  SDNode *N = Op.getNode();
+  SDLoc DL(N);
+
+  SmallVector<SDValue, 8> Opnds;
+  // For each source vector, a bitmask of the element indices seen so far. The
+  // mask is 64 bits wide because a 256-bit vector can have 32 elements
+  // (v32i8), for which a 32-bit "1U << NumElts" would be an undefined shift.
+  DenseMap<SDValue, uint64_t> VecInMap;
+  SmallVector<SDValue, 8> VecIns;
+  EVT VT = MVT::Other;
+
+  // Recognize a special case where a vector is casted into wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  // BFS traverse all OR'd operands. Opnds grows while we iterate, so the bound
+  // is re-read every iteration and the current operand is copied out by value:
+  // push_back may reallocate the storage and invalidate iterators into Opnds.
+  for (unsigned Slot = 0; Slot < Opnds.size(); ++Slot) {
+    SDValue Cur = Opnds[Slot];
+    if (Cur.getOpcode() == ISD::OR) {
+      Opnds.push_back(Cur.getOperand(0));
+      Opnds.push_back(Cur.getOperand(1));
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (Cur.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    SDValue Idx = Cur.getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    SDValue ExtractedFromVec = Cur.getOperand(0);
+    DenseMap<SDValue, uint64_t>::iterator M = VecInMap.find(ExtractedFromVec);
+    if (M == VecInMap.end()) {
+      VT = ExtractedFromVec.getValueType();
+      // Quit if not 128/256-bit vector.
+      if (!VT.is128BitVector() && !VT.is256BitVector())
+        return SDValue();
+      // Quit if the element count does not fit the 64-bit index mask.
+      if (VT.getVectorNumElements() > 64)
+        return SDValue();
+      // Quit if not the same type.
+      if (VecInMap.begin() != VecInMap.end() &&
+          VT != VecInMap.begin()->first.getValueType())
+        return SDValue();
+      M = VecInMap.insert(std::make_pair(ExtractedFromVec, uint64_t(0))).first;
+      VecIns.push_back(ExtractedFromVec);
+    }
+
+    // All recorded vectors share VT, so its element count bounds the index.
+    // Quit on an out-of-range index instead of shifting by it.
+    uint64_t EltIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
+    if (EltIdx >= VT.getVectorNumElements())
+      return SDValue();
+    M->second |= uint64_t(1) << EltIdx;
+  }
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Not extracted from 128-/256-bit vector.");
+
+  // Mask with one bit set per vector element; a shift by 64 would be
+  // undefined, so that case is spelled out.
+  unsigned NumElts = VT.getVectorNumElements();
+  uint64_t FullMask =
+      NumElts >= 64 ? ~uint64_t(0) : ((uint64_t(1) << NumElts) - 1);
+
+  for (DenseMap<SDValue, uint64_t>::const_iterator
+        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+    // Quit if not all elements are used.
+    if (I->second != FullMask)
+      return SDValue();
+  }
+
+  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+  // Cast all vectors into TestVT for PTEST.
+  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
+
+  // If more than one full vectors are evaluated, OR them first before PTEST.
+  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+    // Each iteration will OR 2 nodes and append the result until there is only
+    // 1 node left, i.e. the final OR'd value of all vectors. LHS/RHS are
+    // copied out before push_back so reallocation cannot affect them.
+    SDValue LHS = VecIns[Slot];
+    SDValue RHS = VecIns[Slot + 1];
+    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+  }
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+                     VecIns.back(), VecIns.back());
+}
+
+/// \brief Return true if \c Op has at least one user other than BRCOND, SETCC
+/// or the condition operand (operand 0) of a SELECT. A single-use TRUNCATE
+/// user is looked through to its own user.
+static bool hasNonFlagsUse(SDValue Op) {
+  SDNode::use_iterator UI = Op->use_begin();
+  SDNode::use_iterator UE = Op->use_end();
+  for (; UI != UE; ++UI) {
+    SDNode *Consumer = *UI;
+    unsigned OperandIdx = UI.getOperandNo();
+
+    // Look past a truncate that has exactly one user.
+    if (Consumer->getOpcode() == ISD::TRUNCATE && Consumer->hasOneUse()) {
+      OperandIdx = Consumer->use_begin().getOperandNo();
+      Consumer = *Consumer->use_begin();
+    }
+
+    unsigned Opc = Consumer->getOpcode();
+    bool IsAcceptedUse = Opc == ISD::BRCOND || Opc == ISD::SETCC ||
+                         (Opc == ISD::SELECT && OperandIdx == 0);
+    if (!IsAcceptedUse)
+      return true;
+  }
+  return false;
+}
+
+// Emit KTEST instruction for bit vectors on AVX-512.
+//
+// Applies only when \p Op is a BITCAST whose source is a vector of i1 with a
+// size the subtarget can handle: 8/16 bits with DQI, 32/64 bits with BWI.
+// Returns an empty SDValue in every other case.
+static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
+                         const X86Subtarget &Subtarget) {
+  if (Op.getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  SDValue Src = Op.getOperand(0);
+  MVT SrcVT = Src.getValueType().getSimpleVT();
+  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  unsigned Bits = SrcVT.getSizeInBits();
+  bool SupportedWithDQI = Subtarget.hasDQI() && (Bits == 8 || Bits == 16);
+  bool SupportedWithBWI = Subtarget.hasBWI() && (Bits == 32 || Bits == 64);
+  if (!SupportedWithDQI && !SupportedWithBWI)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::KTEST, SDLoc(Op), SrcVT, Src, Src);
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something equivalent, i.e. compare \p Op against zero and return the value carrying the resulting flags.
+/// \p X86CC is the condition that will consume the flags; it decides whether CF/OF must be valid. Returns a CMP-with-0 node, a KTEST node, a PTEST node, or result #1 of an X86ISD arithmetic node.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ if (Op.getValueType() == MVT::i1) { // i1 operand: zero-extend to i8 and compare with 0.
+ SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
+ DAG.getConstant(0, dl, MVT::i8));
+ }
+ // CF and OF aren't always set the way we want. Determine which
+ // of these we need.
+ bool NeedCF = false;
+ bool NeedOF = false;
+ switch (X86CC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ NeedCF = true;
+ break;
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO: {
+ // Check if we really need to set the
+ // Overflow flag. If NoSignedWrap is present
+ // that is not actually needed.
+ switch (Op->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SHL: {
+ const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
+ if (BinNode->Flags.hasNoSignedWrap())
+ break;
+ } // No nsw flag: deliberately falls through to 'default' and requests OF.
+ default:
+ NeedOF = true;
+ break;
+ }
+ break;
+ }
+ }
+ // See if we can use the EFLAGS value from the operand instead of
+ // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+ // we prove that the arithmetic won't overflow, we can't use OF or CF.
+ if (Op.getResNo() != 0 || NeedOF || NeedCF) {
+ // Emit KTEST for bit vectors
+ if (auto Node = EmitKTEST(Op, DAG, Subtarget))
+ return Node;
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, Op.getValueType()));
+ }
+ unsigned Opcode = 0; // X86ISD opcode to emit; 0 means "fall back to CMP with 0".
+ unsigned NumOperands = 0; // Number of leading operands of Op copied to the new node.
+
+ // Truncate operations may prevent the merge of the SETCC instruction
+ // and the arithmetic instruction before it. Attempt to truncate the operands
+ // of the arithmetic instruction and use a reduced bit-width instruction.
+ bool NeedTruncation = false;
+ SDValue ArithOp = Op;
+ if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+ SDValue Arith = Op->getOperand(0);
+ // Both the trunc and the arithmetic op need to have one user each.
+ if (Arith->hasOneUse())
+ switch (Arith.getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ NeedTruncation = true;
+ ArithOp = Arith;
+ }
+ }
+ }
+
+ // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
+ // which may be the result of a CAST. We use the variable 'Op', which is the
+ // non-casted variable when we check for possible users.
+ switch (ArithOp.getOpcode()) {
+ case ISD::ADD:
+ // Due to an isel shortcoming, be conservative if this add is likely to be
+ // selected as part of a load-modify-store instruction. When the root node
+ // in a match is a store, isel doesn't know how to remap non-chain non-flag
+ // uses of other nodes in the match, such as the ADD in this case. This
+ // leads to the ADD being left around and reselected, with the result being
+ // two adds in the output. Alas, even if none of our users are stores, that
+ // doesn't prove we're O.K. Ergo, if we have any parents that aren't
+ // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
+ // climbing the DAG back to the root, and it doesn't seem to be worth the
+ // effort.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::CopyToReg &&
+ UI->getOpcode() != ISD::SETCC &&
+ UI->getOpcode() != ISD::STORE) // NOTE(review): STORE users are tolerated here although the comment above names only CopyToReg/SETCC - confirm intent.
+ goto default_case;
+
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
+ // An add of one will be selected as an INC.
+ if (C->isOne() && !Subtarget.slowIncDec()) {
+ Opcode = X86ISD::INC;
+ NumOperands = 1;
+ break;
+ }
+
+ // An add of negative one (subtract of one) will be selected as a DEC.
+ if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
+ Opcode = X86ISD::DEC;
+ NumOperands = 1;
+ break;
+ }
+ }
+
+ // Otherwise use a regular EFLAGS-setting add.
+ Opcode = X86ISD::ADD;
+ NumOperands = 2;
+ break;
+ case ISD::SHL:
+ case ISD::SRL:
+ // If we have a constant logical shift that's only used in a comparison
+ // against zero turn it into an equivalent AND. This allows turning it into
+ // a TEST instruction later. Opcode stays 0, so the rewritten Op is compared with 0 below.
+ if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
+ isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned ShAmt = Op->getConstantOperandVal(1);
+ if (ShAmt >= BitWidth) // Avoid undefined shifts.
+ break;
+ APInt Mask = ArithOp.getOpcode() == ISD::SRL
+ ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+ : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+ if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+ break;
+ Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+ DAG.getConstant(Mask, dl, VT));
+ }
+ break;
+
+ case ISD::AND:
+ // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
+ // because a TEST instruction will be better.
+ if (!hasNonFlagsUse(Op)) {
+ SDValue Op0 = ArithOp->getOperand(0);
+ SDValue Op1 = ArithOp->getOperand(1);
+ EVT VT = ArithOp.getValueType();
+ bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
+ bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+
+ // But if we can combine this into an ANDN operation, then create an AND
+ // now and allow it to be pattern matched into an ANDN.
+ if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case ISD::SUB:
+ case ISD::OR:
+ case ISD::XOR:
+ // Due to the ISEL shortcoming noted above, be conservative if this op is
+ // likely to be selected as part of a load-modify-store instruction.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() == ISD::STORE)
+ goto default_case;
+
+ // Otherwise use a regular EFLAGS-setting instruction.
+ switch (ArithOp.getOpcode()) {
+ default: llvm_unreachable("unexpected operator!");
+ case ISD::SUB: Opcode = X86ISD::SUB; break;
+ case ISD::XOR: Opcode = X86ISD::XOR; break;
+ case ISD::AND: Opcode = X86ISD::AND; break;
+ case ISD::OR: {
+ if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { // Equality test of an OR tree: try the PTEST lowering first.
+ if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
+ return EFLAGS;
+ }
+ Opcode = X86ISD::OR;
+ break;
+ }
+ }
+
+ NumOperands = 2;
+ break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ return SDValue(Op.getNode(), 1); // Already an X86ISD arithmetic node: hand back its result #1 (presumably the flags result, as in the VT list built at the end).
+ default:
+ default_case: // Also reached via 'goto' from the use checks above; leaves Opcode == 0.
+ break;
+ }
+
+ // If we found that truncation is beneficial, perform the truncation and
+ // update 'Op'.
+ if (NeedTruncation) {
+ EVT VT = Op.getValueType();
+ SDValue WideVal = Op->getOperand(0);
+ EVT WideVT = WideVal.getValueType();
+ unsigned ConvertedOp = 0;
+ // Use a target machine opcode to prevent further DAGCombine
+ // optimizations that may separate the arithmetic operations
+ // from the setcc node.
+ switch (WideVal.getOpcode()) {
+ default: break;
+ case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+ case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+ case ISD::AND: ConvertedOp = X86ISD::AND; break;
+ case ISD::OR: ConvertedOp = X86ISD::OR; break;
+ case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+ }
+
+ if (ConvertedOp) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+ SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+ SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+ Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+ }
+ }
+ }
+
+ if (Opcode == 0) {
+ // Emit KTEST for bit vectors
+ if (auto Node = EmitKTEST(Op, DAG, Subtarget))
+ return Node;
+
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, Op.getValueType()));
+ }
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); // Two results: the arithmetic value and an i32 second result.
+ SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
+
+ SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
+ DAG.ReplaceAllUsesWith(Op, New); // Redirect users of the old node to the new two-result node.
+ return SDValue(New.getNode(), 1); // Return result #1 (the i32 one) of the new node.
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent. A comparison against a null constant is forwarded to EmitTest.
+/// For i8/i16/i32/i64 operands an X86ISD::SUB is emitted and its second result
+/// returned; everything else becomes an X86ISD::CMP.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+                                   const SDLoc &dl, SelectionDAG &DAG) const {
+  if (isNullConstant(Op1))
+    return EmitTest(Op0, X86CC, dl, DAG);
+
+  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
+         "Unexpected comparison operation for MVT::i1 operands");
+
+  EVT CmpVT = Op0.getValueType();
+  bool IsScalarInt = CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
+                     CmpVT == MVT::i32 || CmpVT == MVT::i64;
+  if (!IsScalarInt)
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+
+  // Only promote the compare up to I32 if it is a 16 bit operation
+  // with an immediate. 16 bit immediates are to be avoided.
+  bool HasConstantOperand =
+      isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1);
+  if (CmpVT == MVT::i16 && HasConstantOperand &&
+      !DAG.getMachineFunction().getFunction()->optForMinSize() &&
+      !Subtarget.isAtom()) {
+    // Extend according to the signedness of the condition code.
+    unsigned ExtendOp = ISD::SIGN_EXTEND;
+    if (isX86CCUnsigned(X86CC))
+      ExtendOp = ISD::ZERO_EXTEND;
+    Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
+    Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+  }
+
+  // Use SUB instead of CMP to enable CSE between SUB and CMP.
+  SDVTList ResultVTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+  SDValue Diff = DAG.getNode(X86ISD::SUB, dl, ResultVTs, Op0, Op1);
+  return SDValue(Diff.getNode(), 1);
+}
+
+/// Convert a comparison if required by the subtarget. Only an X86ISD::CMP of
+/// two floating-point operands on a subtarget without CMOV is rewritten; any
+/// other \p Cmp is returned unchanged.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+                                                 SelectionDAG &DAG) const {
+  // If the subtarget does not support the FUCOMI instruction, floating-point
+  // comparisons have to be converted.
+  if (Subtarget.hasCMov())
+    return Cmp;
+  if (Cmp.getOpcode() != X86ISD::CMP)
+    return Cmp;
+  if (!Cmp.getOperand(0).getValueType().isFloatingPoint())
+    return Cmp;
+  if (!Cmp.getOperand(1).getValueType().isFloatingPoint())
+    return Cmp;
+
+  // The instruction selector will select an FUCOM instruction instead of
+  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
+  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+  SDLoc dl(Cmp);
+  SDValue StatusIn = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+  SDValue StatusWord = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, StatusIn);
+  SDValue ShiftAmt = DAG.getConstant(8, dl, MVT::i8);
+  SDValue HighByte = DAG.getNode(ISD::SRL, dl, MVT::i16, StatusWord, ShiftAmt);
+  SDValue FlagsByte = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, HighByte);
+
+  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
+  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
+  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, FlagsByte);
+}
+
+/// Check if replacement of SQRT with RSQRT should be disabled. Returns false
+/// when an FRSQRT node for \p Op already exists; otherwise reports whether
+/// the subtarget has fast vector or scalar FSQRT, depending on the type.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+  EVT OpVT = Op.getValueType();
+
+  // We never want to use both SQRT and RSQRT instructions for the same input.
+  bool HasExistingRsqrt =
+      DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(OpVT), Op) != nullptr;
+  if (HasExistingRsqrt)
+    return false;
+
+  return OpVT.isVector() ? Subtarget.hasFastVectorFSQRT()
+                         : Subtarget.hasFastScalarFSQRT();
+}
+
+/// Produce an X86ISD::FRSQRT estimate node for \p Op, or an empty SDValue
+/// when the type/subtarget combination is not handled.
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
+                                           SelectionDAG &DAG, int Enabled,
+                                           int &RefinementSteps,
+                                           bool &UseOneConstNR,
+                                           bool Reciprocal) const {
+  EVT VT = Op.getValueType();
+
+  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+  // instructions: convert to single, rsqrtss, convert back to double, refine
+  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  bool Is128OrScalar = VT == MVT::f32 || VT == MVT::v4f32;
+  bool HasEstimate = (Is128OrScalar && Subtarget.hasSSE1()) ||
+                     (VT == MVT::v8f32 && Subtarget.hasAVX());
+  if (!HasEstimate)
+    return SDValue();
+
+  // Default to a single refinement step when the caller did not choose.
+  if (RefinementSteps == ReciprocalEstimate::Unspecified)
+    RefinementSteps = 1;
+  UseOneConstNR = false;
+
+  return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+}
+
+/// Produce an X86ISD::FRCP estimate node for \p Op, or an empty SDValue when
+/// the type/subtarget combination is not handled or the estimate is not
+/// enabled for scalar f32.
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
+                                            int Enabled,
+                                            int &RefinementSteps) const {
+  EVT VT = Op.getValueType();
+
+  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // reciprocal estimate with refinement on x86 prior to FMA requires
+  // 15 instructions: convert to single, rcpss, convert back to double, refine
+  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  bool Is128OrScalar = VT == MVT::f32 || VT == MVT::v4f32;
+  bool HasEstimate = (Is128OrScalar && Subtarget.hasSSE1()) ||
+                     (VT == MVT::v8f32 && Subtarget.hasAVX());
+  if (!HasEstimate)
+    return SDValue();
+
+  // Enable estimate codegen with 1 refinement step for vector division.
+  // Scalar division estimates are disabled because they break too much
+  // real-world code. These defaults are intended to match GCC behavior.
+  bool ScalarNotRequested =
+      VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified;
+  if (ScalarNotRequested)
+    return SDValue();
+
+  if (RefinementSteps == ReciprocalEstimate::Unspecified)
+    RefinementSteps = 1;
+
+  return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+}
+
+/// If we have at least two divisions that use the same divisor, convert to
+/// multiplication by a reciprocal. This may need to be adjusted for a given
+/// CPU if a division's cost is not at least twice the cost of a multiplication.
+/// This is because we still need one division to calculate the reciprocal and
+/// then we need two multiplies by that reciprocal as replacements for the
+/// original divisions.
+unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
+  // Number of divisions by a common divisor required before the conversion
+  // is applied.
+  const unsigned MinRepeatedDivisions = 2;
+  return MinRepeatedDivisions;
+}
+
+/// Helper for creating a X86ISD::SETCC node: an i8 value computed from the
+/// condition code \p Cond and the flags value \p EFLAGS.
+static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
+                        SelectionDAG &DAG) {
+  SDValue CondOperand = DAG.getConstant(Cond, dl, MVT::i8);
+  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CondOperand, EFLAGS);
+}
+
+/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
+/// according to equal/not-equal condition code \p CC. The result is the
+/// X86ISD::SETCC built on top of the BT node.
+static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
+                                   const SDLoc &dl, SelectionDAG &DAG) {
+  // There is no i8 BT instruction, so i8 sources are any-extended to i32.
+  // Since the shift amount is in-range-or-undefined, testing the bit on the
+  // i32 value is ok. i16 sources are widened as well: the i16 encoding is
+  // larger than the i32 one, and i32 is preferable for performance.
+  EVT SrcVT = Src.getValueType();
+  if (SrcVT == MVT::i8 || SrcVT == MVT::i16)
+    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
+
+  // Prefer the shorter 32-bit encoding over the 64-bit one when possible. The
+  // 32-bit form takes BitNo modulo 32 and the 64-bit form modulo 64, so the
+  // narrowing is only valid if the bit with value 32 of BitNo is known zero.
+  if (Src.getValueType() == MVT::i64) {
+    APInt Bit32(BitNo.getValueSizeInBits(), 32);
+    if (DAG.MaskedValueIsZero(BitNo, Bit32))
+      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+  }
+
+  // If the operand types disagree, extend the shift amount to match. Since
+  // BT ignores high bits (like shifts) we can use anyextend.
+  EVT FinalSrcVT = Src.getValueType();
+  if (FinalSrcVT != BitNo.getValueType())
+    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, FinalSrcVT, BitNo);
+
+  SDValue BitTest = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
+
+  // SETEQ maps to COND_AE, every other condition code to COND_B.
+  X86::CondCode X86Cond = X86::COND_B;
+  if (CC == ISD::SETEQ)
+    X86Cond = X86::COND_AE;
+  return getSETCC(X86Cond, BitTest, dl, DAG);
+}
+
+/// Result of 'and' is compared against zero. Change to a BT node if possible.
+/// Handles (and X, (shl 1, N)), (and (srl X, N), 1) and an 'and' with a
+/// power-of-two constant that does not fit in 32 bits. Returns an empty
+/// SDValue when none of these patterns applies.
+static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
+                            const SDLoc &dl, SelectionDAG &DAG) {
+  // Fetch both operands, looking through a truncate on either side.
+  SDValue First = And.getOperand(0);
+  if (First.getOpcode() == ISD::TRUNCATE)
+    First = First.getOperand(0);
+  SDValue Second = And.getOperand(1);
+  if (Second.getOpcode() == ISD::TRUNCATE)
+    Second = Second.getOperand(0);
+
+  // Canonicalize a shift-left operand into the first position.
+  if (Second.getOpcode() == ISD::SHL)
+    std::swap(First, Second);
+
+  if (First.getOpcode() == ISD::SHL) {
+    // Only (shl 1, N) selects a single bit.
+    if (!isOneConstant(First.getOperand(0)))
+      return SDValue();
+
+    // If we looked past a truncate, check that it's only truncating away
+    // known zeros.
+    unsigned ShlWidth = First.getValueSizeInBits();
+    unsigned AndWidth = And.getValueSizeInBits();
+    if (ShlWidth > AndWidth) {
+      APInt KnownZero, KnownOne;
+      DAG.computeKnownBits(First, KnownZero, KnownOne);
+      if (KnownZero.countLeadingOnes() < ShlWidth - AndWidth)
+        return SDValue();
+    }
+    return getBitTestCondition(Second, First.getOperand(1), CC, dl, DAG);
+  }
+
+  if (Second.getOpcode() != ISD::Constant)
+    return SDValue();
+
+  uint64_t MaskVal = cast<ConstantSDNode>(Second)->getZExtValue();
+
+  // (and (srl X, N), 1): test bit N of X.
+  if (MaskVal == 1 && First.getOpcode() == ISD::SRL)
+    return getBitTestCondition(First.getOperand(0), First.getOperand(1), CC,
+                               dl, DAG);
+
+  // Use BT if the immediate can't be encoded in a TEST instruction.
+  if (!isUInt<32>(MaskVal) && isPowerOf2_64(MaskVal)) {
+    SDValue BitIndex =
+        DAG.getConstant(Log2_64_Ceil(MaskVal), dl, First.getValueType());
+    return getBitTestCondition(First, BitIndex, CC, dl, DAG);
+  }
+
+  return SDValue();
+}
+
+// Convert (truncate (srl X, N) to i1) to (bt X, N). Returns an empty SDValue
+// when the truncated value is not a logical shift right.
+static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
+                                 const SDLoc &dl, SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
+         "Expected TRUNCATE to i1 node");
+
+  SDValue Shifted = Op.getOperand(0);
+  if (Shifted.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  SDValue Source = Shifted.getOperand(0);
+  SDValue BitIndex = Shifted.getOperand(1);
+  return getBitTestCondition(Source, BitIndex, CC, dl, DAG);
+}
+
+/// Result of 'and' or 'trunc to i1' is compared against zero.
+/// Change to a BT node if possible; otherwise return an empty SDValue.
+SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
+                                     const SDLoc &dl, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::AND:
+    return LowerAndToBT(Op, CC, dl, DAG);
+  case ISD::TRUNCATE:
+    // Only a truncate down to i1 is handled.
+    if (Op.getValueType() == MVT::i1)
+      return LowerTruncateToBT(Op, CC, dl, DAG);
+    break;
+  default:
+    break;
+  }
+  return SDValue();
+}
+
+/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
+/// CMPs. \p Op0 and \p Op1 are swapped in place for the conditions that are
+/// expressed through the mirrored predicate. SETUEQ and SETONE have no single
+/// predicate and yield the value 8.
+static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
+                              SDValue &Op1) {
+  // SSE Condition code mapping:
+  //  0 - EQ
+  //  1 - LT
+  //  2 - LE
+  //  3 - UNORD
+  //  4 - NEQ
+  //  5 - NLT
+  //  6 - NLE
+  //  7 - ORD
+  unsigned Predicate;
+  bool SwapOperands = false;
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETOEQ:
+  case ISD::SETEQ:
+    Predicate = 0;
+    break;
+  case ISD::SETOGT:
+  case ISD::SETGT:
+    SwapOperands = true;
+    Predicate = 1;
+    break;
+  case ISD::SETLT:
+  case ISD::SETOLT:
+    Predicate = 1;
+    break;
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    SwapOperands = true;
+    Predicate = 2;
+    break;
+  case ISD::SETLE:
+  case ISD::SETOLE:
+    Predicate = 2;
+    break;
+  case ISD::SETUO:
+    Predicate = 3;
+    break;
+  case ISD::SETUNE:
+  case ISD::SETNE:
+    Predicate = 4;
+    break;
+  case ISD::SETULE:
+    SwapOperands = true;
+    Predicate = 5;
+    break;
+  case ISD::SETUGE:
+    Predicate = 5;
+    break;
+  case ISD::SETULT:
+    SwapOperands = true;
+    Predicate = 6;
+    break;
+  case ISD::SETUGT:
+    Predicate = 6;
+    break;
+  case ISD::SETO:
+    Predicate = 7;
+    break;
+  case ISD::SETUEQ:
+  case ISD::SETONE:
+    Predicate = 8;
+    break;
+  }
+
+  if (SwapOperands)
+    std::swap(Op0, Op1);
+
+  return Predicate;
+}
+
+/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
+/// concatenate the result back.
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
+         "Unsupported value type for operation");
+
+  SDLoc dl(Op);
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned HalfElems = NumElems / 2;
+  SDValue CondCode = Op.getOperand(2);
+
+  // Split the left-hand operand into its low and high 128-bit halves.
+  SDValue Lhs = Op.getOperand(0);
+  SDValue LhsLo = extract128BitVector(Lhs, 0, DAG, dl);
+  SDValue LhsHi = extract128BitVector(Lhs, HalfElems, DAG, dl);
+
+  // Same for the right-hand operand.
+  SDValue Rhs = Op.getOperand(1);
+  SDValue RhsLo = extract128BitVector(Rhs, 0, DAG, dl);
+  SDValue RhsHi = extract128BitVector(Rhs, HalfElems, DAG, dl);
+
+  // Compare each half with the half-width result type, then glue the two
+  // results back together.
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElems);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, HalfVT, LhsLo, RhsLo,
+                                 CondCode),
+                     DAG.getNode(Op.getOpcode(), dl, HalfVT, LhsHi, RhsHi,
+                                 CondCode));
+}
+
+/// Lower a SETCC whose operands are vectors of i1 into plain bitwise logic
+/// (XOR/AND/OR) on the operands and their complements.
+static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+
+  assert(X.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+         "Unexpected type for boolean compare operation");
+  ISD::CondCode Pred = cast<CondCodeSDNode>(CC)->get();
+
+  // Bitwise complement, expressed as XOR with all-ones.
+  auto MakeNot = [&](SDValue V) {
+    return DAG.getNode(ISD::XOR, dl, VT, V, DAG.getConstant(-1, dl, VT));
+  };
+
+  // Both complements are built up front, whether or not the selected case
+  // ends up using them.
+  SDValue NotX = MakeNot(X);
+  SDValue NotY = MakeNot(Y);
+
+  switch (Pred) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETEQ:
+    // (x == y) -> ~(x ^ y)
+    return DAG.getNode(ISD::XOR, dl, VT,
+                       DAG.getNode(ISD::XOR, dl, VT, X, Y),
+                       DAG.getConstant(-1, dl, VT));
+  case ISD::SETNE:
+    // (x != y) -> (x ^ y)
+    return DAG.getNode(ISD::XOR, dl, VT, X, Y);
+  case ISD::SETUGT:
+  case ISD::SETGT:
+    // (x > y) -> (x & ~y)
+    return DAG.getNode(ISD::AND, dl, VT, X, NotY);
+  case ISD::SETULT:
+  case ISD::SETLT:
+    // (x < y) -> (~x & y)
+    return DAG.getNode(ISD::AND, dl, VT, NotX, Y);
+  case ISD::SETULE:
+  case ISD::SETLE:
+    // (x <= y) -> (~x | y)
+    return DAG.getNode(ISD::OR, dl, VT, NotX, Y);
+  case ISD::SETUGE:
+  case ISD::SETGE:
+    // (x >= y) -> (x | ~y)
+    return DAG.getNode(ISD::OR, dl, VT, X, NotY);
+  }
+}
+
+/// Lower an integer vector SETCC with an i1-vector result to an AVX-512 mask
+/// compare: PCMPEQM/PCMPGTM where a dedicated node exists, otherwise
+/// CMPM/CMPMU with an immediate predicate.
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue Lhs = Op.getOperand(0);
+  SDValue Rhs = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+
+  assert(VT.getVectorElementType() == MVT::i1 &&
+         "Cannot set masked compare for this operation");
+
+  ISD::CondCode Pred = cast<CondCodeSDNode>(CC)->get();
+
+  unsigned DedicatedOpc = 0; // Non-zero when PCMPEQM/PCMPGTM can be used.
+  unsigned Imm = 0;          // Immediate predicate for CMPM/CMPMU.
+  bool IsUnsigned = false;
+  bool SwapOperands = false;
+  switch (Pred) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:
+    Imm = 4;
+    break;
+  case ISD::SETEQ:
+    DedicatedOpc = X86ISD::PCMPEQM;
+    break;
+  case ISD::SETUGT:
+    Imm = 6;
+    IsUnsigned = true;
+    break;
+  case ISD::SETLT:
+    // x < y is y > x.
+    SwapOperands = true;
+    DedicatedOpc = X86ISD::PCMPGTM;
+    break;
+  case ISD::SETGT:
+    DedicatedOpc = X86ISD::PCMPGTM;
+    break;
+  case ISD::SETULT:
+    Imm = 1;
+    IsUnsigned = true;
+    break;
+  case ISD::SETUGE:
+    Imm = 5; // NLT
+    IsUnsigned = true;
+    break;
+  case ISD::SETGE:
+    // LE + swap
+    SwapOperands = true;
+    Imm = 2;
+    break;
+  case ISD::SETULE:
+    IsUnsigned = true;
+    Imm = 2;
+    break;
+  case ISD::SETLE:
+    Imm = 2;
+    break;
+  }
+
+  if (SwapOperands)
+    std::swap(Lhs, Rhs);
+
+  if (DedicatedOpc)
+    return DAG.getNode(DedicatedOpc, dl, VT, Lhs, Rhs);
+
+  unsigned CmpOpc = X86ISD::CMPM;
+  if (IsUnsigned)
+    CmpOpc = X86ISD::CMPMU;
+  return DAG.getNode(CmpOpc, dl, VT, Lhs, Rhs,
+                     DAG.getConstant(Imm, dl, MVT::i8));
+}
+
+/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
+/// operand \p Op1, i.e. build a vector holding every constant element minus
+/// one. If non-trivial (for example because it's not constant, or an element
+/// is zero) return an empty value.
+static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
+                                      SelectionDAG &DAG) {
+  BuildVectorSDNode *BuildVec = dyn_cast<BuildVectorSDNode>(Op1.getNode());
+  if (BuildVec == nullptr)
+    return SDValue();
+
+  MVT VecVT = Op1.getSimpleValueType();
+  MVT EltVT = VecVT.getVectorElementType();
+  unsigned NumElts = VecVT.getVectorNumElements();
+
+  SmallVector<SDValue, 8> Decremented;
+  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+    // Every element must be a plain, non-opaque constant of the element type.
+    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(BuildVec->getOperand(Idx));
+    if (!Cst)
+      return SDValue();
+    if (Cst->isOpaque() || Cst->getSimpleValueType(0) != EltVT)
+      return SDValue();
+
+    // Avoid underflow.
+    APInt Value = Cst->getAPIntValue();
+    if (Value == 0)
+      return SDValue();
+
+    Decremented.push_back(DAG.getConstant(Value - 1, dl, EltVT));
+  }
+
+  return DAG.getBuildVector(VecVT, dl, Decremented);
+}
+/// Custom-lower a vector SETCC: FP compares map to CMPP/CMPM; integer compares map to XOP/AVX-512 forms or to PCMPEQ/PCMPGT (optionally via UMIN/UMAX or SUBUS) plus swap, sign-flip and invert fixups.
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getSimpleValueType();
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); // Decided from the operand type, not the result type.
+ SDLoc dl(Op);
+
+ if (isFP) {
+#ifndef NDEBUG
+ MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
+
+ unsigned Opc;
+ if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+ assert(VT.getVectorNumElements() <= 16);
+ Opc = X86ISD::CMPM;
+ } else {
+ Opc = X86ISD::CMPP;
+ // The SSE/AVX packed FP comparison nodes are defined with a
+ // floating-point vector result that matches the operand type. This allows
+ // them to work with an SSE1 target (integer vector types are not legal).
+ VT = Op0.getSimpleValueType();
+ }
+
+ // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+ // emit two comparisons and a logic op to tie them together.
+ // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
+ // available.
+ SDValue Cmp;
+ unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
+ if (SSECC == 8) { // 8 flags the predicates that need two compares.
+ // LLVM predicate is SETUEQ or SETONE.
+ unsigned CC0, CC1;
+ unsigned CombineOpc;
+ if (SetCCOpcode == ISD::SETUEQ) {
+ CC0 = 3; // UNORD
+ CC1 = 0; // EQ
+ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
+ static_cast<unsigned>(ISD::OR);
+ } else {
+ assert(SetCCOpcode == ISD::SETONE);
+ CC0 = 7; // ORD
+ CC1 = 4; // NEQ
+ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
+ static_cast<unsigned>(ISD::AND);
+ }
+
+ SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(CC0, dl, MVT::i8));
+ SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(CC1, dl, MVT::i8));
+ Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ } else {
+ // Handle all other FP comparisons here.
+ Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, dl, MVT::i8));
+ }
+
+ // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
+ // result type of SETCC. The bitcast is expected to be optimized away
+ // during combining/isel.
+ if (Opc == X86ISD::CMPP)
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+
+ return Cmp;
+ }
+
+ MVT VTOp0 = Op0.getSimpleValueType();
+ assert(VTOp0 == Op1.getSimpleValueType() &&
+ "Expected operands with same type!");
+ assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
+ "Invalid number of packed elements for source and destination!");
+
+ if (VT.is128BitVector() && VTOp0.is256BitVector()) {
+ // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
+ // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
+ // legalizer firstly checks if the first operand in input to the setcc has
+ // a legal type. If so, then it promotes the return type to that same type.
+ // Otherwise, the return type is promoted to the 'next legal type' which,
+ // for a vector of MVT::i1 is always a 128-bit integer vector type.
+ //
+ // We reach this code only if the following two conditions are met:
+ // 1. Both return type and operand type have been promoted to wider types
+ // by the type legalizer.
+ // 2. The original operand type has been promoted to a 256-bit vector.
+ //
+ // Note that condition 2. only applies for AVX targets.
+ SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
+ return DAG.getZExtOrTrunc(NewOp, dl, VT);
+ }
+
+ // The non-AVX512 code below works under the assumption that source and
+ // destination types are the same.
+ assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
+ "Value types for source and destination must be the same!");
+
+ // Break 256-bit integer vector compare into smaller ones.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntVSETCC(Op, DAG);
+
+ // Operands are boolean (vectors of i1)
+ MVT OpVT = Op1.getSimpleValueType();
+ if (OpVT.getVectorElementType() == MVT::i1)
+ return LowerBoolVSETCC_AVX512(Op, DAG);
+
+ // The result is boolean, but operands are int/float
+ if (VT.getVectorElementType() == MVT::i1) {
+ // In AVX-512 architecture setcc returns mask with i1 elements,
+ // But there is no compare instruction for i8 and i16 elements in KNL.
+ // In this case use SSE compare
+ bool UseAVX512Inst =
+ (OpVT.is512BitVector() ||
+ OpVT.getScalarSizeInBits() >= 32 ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX()));
+
+ if (UseAVX512Inst)
+ return LowerIntVSETCC_AVX512(Op, DAG);
+
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); // Compare in the operand type, then truncate to the i1 vector.
+ }
+
+ // Lower using XOP integer comparisons.
+ if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
+ VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
+ // Translate compare code to XOP PCOM compare mode.
+ unsigned CmpMode = 0;
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETULT:
+ case ISD::SETLT: CmpMode = 0x00; break;
+ case ISD::SETULE:
+ case ISD::SETLE: CmpMode = 0x01; break;
+ case ISD::SETUGT:
+ case ISD::SETGT: CmpMode = 0x02; break;
+ case ISD::SETUGE:
+ case ISD::SETGE: CmpMode = 0x03; break;
+ case ISD::SETEQ: CmpMode = 0x04; break;
+ case ISD::SETNE: CmpMode = 0x05; break;
+ }
+
+ // Are we comparing unsigned or signed integers?
+ unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
+ ? X86ISD::VPCOMU : X86ISD::VPCOM;
+
+ return DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(CmpMode, dl, MVT::i8));
+ }
+
+ // We are handling one of the integer comparisons here. Since SSE only has
+ // GT and EQ comparisons for integer, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc;
+ bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
+ bool Subus = false;
+
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETNE: Invert = true; // fall through: NE is an inverted EQ
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
+ case ISD::SETLT: Swap = true; // fall through: LT is GT with swapped operands
+ case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
+ case ISD::SETGE: Swap = true; // fall through: GE is LE with swapped operands
+ case ISD::SETLE: Opc = X86ISD::PCMPGT;
+ Invert = true; break;
+ case ISD::SETULT: Swap = true; // fall through: ULT is UGT with swapped operands
+ case ISD::SETUGT: Opc = X86ISD::PCMPGT;
+ FlipSigns = true; break;
+ case ISD::SETUGE: Swap = true; // fall through: UGE is ULE with swapped operands
+ case ISD::SETULE: Opc = X86ISD::PCMPGT;
+ FlipSigns = true; Invert = true; break;
+ }
+
+ // Special case: Use min/max operations for SETULE/SETUGE
+ MVT VET = VT.getVectorElementType();
+ bool hasMinMax =
+ (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
+ || (Subtarget.hasSSE2() && (VET == MVT::i8));
+
+ if (hasMinMax) {
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
+ case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
+ }
+
+ if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } // The min/max form needs none of the PCMPGT fixups.
+ }
+
+ bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ if (!MinMax && hasSubus) {
+ // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+ // Op0 u<= Op1:
+ // t = psubus Op0, Op1
+ // pcmpeq t, <0..0>
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETULT: {
+ // If the comparison is against a constant we can turn this into a
+ // setule. With psubus, setule does not require a swap. This is
+ // beneficial because the constant in the register is no longer
+ // destructed as the destination so it can be hoisted out of a loop.
+ // Only do this pre-AVX since vpcmp* is no longer destructive.
+ if (Subtarget.hasAVX())
+ break;
+ if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
+ Op1 = ULEOp1;
+ Subus = true; Invert = false; Swap = false;
+ }
+ break;
+ }
+ // Psubus is better than flip-sign because it requires no inversion.
+ case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
+ case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
+ }
+
+ if (Subus) {
+ Opc = X86ISD::SUBUS;
+ FlipSigns = false;
+ }
+ }
+
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // Check that the operation in question is available (most are plain SSE2,
+ // but PCMPGTQ and PCMPEQQ have different requirements).
+ if (VT == MVT::v2i64) {
+ if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
+ assert(Subtarget.hasSSE2() && "Don't know how to lower!");
+
+ // First cast everything to the right type.
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations. The lower
+ // compare is always unsigned.
+ SDValue SB;
+ if (FlipSigns) {
+ SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
+ } else {
+ SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
+ SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
+ SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero}); // Flip only the low 32-bit half of each 64-bit lane.
+ }
+ Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
+
+ // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
+
+ // Create masks for only the low parts/high parts of the 64 bit integers.
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ static const int MaskLo[] = { 0, 0, 2, 2 };
+ SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
+ SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
+ SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
+ Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getBitcast(VT, Result);
+ }
+
+ if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
+ // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
+ // pcmpeqd + pshufd + pand.
+ assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+ // First cast everything to the right type.
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ // Do the compare.
+ SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+ // Make sure the lower and upper halves are both all-ones.
+ static const int Mask[] = { 1, 0, 3, 2 };
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+ Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getBitcast(VT, Result);
+ }
+ }
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations.
+ if (FlipSigns) {
+ MVT EltVT = VT.getVectorElementType();
+ SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
+ VT);
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
+ }
+
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ if (MinMax) // Op0 u<= Op1 <=> umin(Op0, Op1) == Op0 (umax for u>=).
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+
+ if (Subus) // The saturating difference Op0 - Op1 is zero exactly when Op0 u<= Op1.
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+ getZeroVector(VT, Subtarget, DAG, dl));
+
+ return Result;
+}
+/// Custom-lower a SETCC node. Vector results go to LowerVSETCC; scalar results try BT and setcc-of-setcc shortcuts, then emit a compare plus X86 SETCC. Returns an empty SDValue when no X86 condition code fits.
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ MVT VT = Op.getSimpleValueType();
+
+ if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+
+ assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
+ && "SetCC type must be 8-bit or 1-bit integer");
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDLoc dl(Op);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ // Optimize to BT if possible.
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+ // Lower (trunc (X >> N) to i1) to BT(X, N).
+ if (Op0.hasOneUse() && isNullConstant(Op1) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
+ return NewSetCC;
+ }
+ }
+
+ // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
+ // these.
+ if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+ // If the input is a setcc, then reuse the input setcc or use a new one with
+ // the inverted condition.
+ if (Op0.getOpcode() == X86ISD::SETCC) {
+ X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+ bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); // True for (setcc != 1) and (setcc == 0).
+ if (!Invert)
+ return Op0; // NOTE(review): unlike the other return paths, no TRUNCATE is applied here when VT == MVT::i1 - confirm the types agree.
+
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ return SetCC;
+ }
+ }
+ if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // i1 operands: rewrite every compare as a compare against zero.
+ if (isOneConstant(Op1)) {
+ ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
+ return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
+ }
+ if (!isNullConstant(Op1)) {
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
+ return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
+ }
+ }
+
+ bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
+ X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
+ if (X86CC == X86::COND_INVALID)
+ return SDValue(); // No usable X86 condition code; hand back an empty value.
+
+ SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
+ EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+ SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ return SetCC;
+}
+
+/// Lower an integer SETCCE node: subtract the operands together with the
+/// incoming carry using X86ISD::SBB and evaluate the requested condition on
+/// the second (i32) result of that node.
+SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue CarryIn = Op.getOperand(2);
+  SDValue CondNode = Op.getOperand(3);
+  SDLoc DL(Op);
+
+  assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+  const X86::CondCode X86CC =
+      TranslateIntegerX86CC(cast<CondCodeSDNode>(CondNode)->get());
+
+  assert(CarryIn.getOpcode() != ISD::CARRY_FALSE);
+
+  // The SBB node produces the difference and, as result #1, an i32 value
+  // that is handed to getSETCC.
+  SDVTList SbbVTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+  SDValue Sbb = DAG.getNode(X86ISD::SBB, DL, SbbVTs, LHS, RHS, CarryIn);
+  SDValue Res = getSETCC(X86CC, Sbb.getValue(1), DL, DAG);
+
+  // Only an i1 result needs narrowing.
+  if (Op.getSimpleValueType() != MVT::i1)
+    return Res;
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Res);
+}
+
+/// Return true if \p Op is an X86 logical comparison: either a dedicated
+/// compare node, or result #1 of one of the arithmetic/logic nodes listed
+/// below (result #2 is accepted as well for UMUL).
+static bool isX86LogicalCmp(SDValue Op) {
+  const unsigned ResNo = Op.getResNo();
+
+  switch (Op.getOpcode()) {
+  // Dedicated compare nodes qualify whichever result is referenced.
+  case X86ISD::CMP:
+  case X86ISD::COMI:
+  case X86ISD::UCOMI:
+  case X86ISD::SAHF:
+    return true;
+
+  // UMUL qualifies through its result #1 or its result #2.
+  case X86ISD::UMUL:
+    return ResNo == 1 || ResNo == 2;
+
+  // Every other listed node qualifies only through its result #1.
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::ADC:
+  case X86ISD::SBB:
+  case X86ISD::SMUL:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    return ResNo == 1;
+
+  default:
+    return false;
+  }
+}
+
+/// Return true if \p V is a TRUNCATE whose source is known to have zeros in
+/// every bit that the truncation discards.
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+  if (V.getOpcode() == ISD::TRUNCATE) {
+    SDValue Src = V.getOperand(0);
+    unsigned SrcBits = Src.getValueSizeInBits();
+    unsigned DstBits = V.getValueSizeInBits();
+    // Mask covering exactly the bits dropped by the truncate.
+    APInt DroppedBits = APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
+    return DAG.MaskedValueIsZero(Src, DroppedBits);
+  }
+  return false;
+}
+/// Custom-lower a SELECT node: tries SSE/AVX FP-select sequences, AVX-512 mask selects and several SETCC_CARRY shortcuts before falling back to an X86ISD::CMOV.
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+ bool AddTest = true; // Cleared once Cond has been replaced by a node that is used directly as the CMOV condition.
+ SDValue Cond = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ SDLoc DL(Op);
+ MVT VT = Op1.getSimpleValueType();
+ SDValue CC;
+
+ // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+ // are available or VBLENDV if AVX is available.
+ // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
+ if (Cond.getOpcode() == ISD::SETCC &&
+ ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+ (Subtarget.hasSSE1() && VT == MVT::f32)) &&
+ VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
+ SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
+ int SSECC = translateX86FSETCC(
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+
+ if (SSECC != 8) { // 8 marks a predicate without a single SSE compare encoding; those take the generic path below.
+ if (Subtarget.hasAVX512()) {
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
+ CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
+ return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
+ DL, VT, Cmp, Op1, Op2);
+ }
+
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
+ DAG.getConstant(SSECC, DL, MVT::i8));
+
+ // If we have AVX, we can use a variable vector select (VBLENDV) instead
+ // of 3 logic instructions for size savings and potentially speed.
+ // Unfortunately, there is no scalar form of VBLENDV.
+
+ // If either operand is a constant, don't try this. We can expect to
+ // optimize away at least one of the logic instructions later in that
+ // case, so that sequence would be faster than a variable blend.
+
+ // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
+ // uses XMM0 as the selection register. That may need just as many
+ // instructions as the AND/ANDN/OR sequence due to register moves, so
+ // don't bother.
+
+ if (Subtarget.hasAVX() &&
+ !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
+
+ // Convert to vectors, do a VSELECT, and convert back to scalar.
+ // All of the conversions should be optimized away.
+
+ MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+ SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
+ SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
+ SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
+
+ MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+ VCmp = DAG.getBitcast(VCmpVT, VCmp);
+
+ SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ VSel, DAG.getIntPtrConstant(0, DL));
+ }
+ SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); // Presumably ~Cmp & Op2 (ANDN semantics) - confirm.
+ SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
+ return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
+ }
+ }
+
+ // AVX512 fallback is to lower selects of scalar floats to masked moves.
+ if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
+ Subtarget.hasAVX512())
+ return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
+
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { // i1-vector select: try to select between scalar integer forms of the operands instead.
+ SDValue Op1Scalar;
+ if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
+ Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
+ else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
+ Op1Scalar = Op1.getOperand(0);
+ SDValue Op2Scalar;
+ if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
+ Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
+ else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
+ Op2Scalar = Op2.getOperand(0);
+ if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
+ SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
+ Op1Scalar.getValueType(),
+ Cond, Op1Scalar, Op2Scalar);
+ if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, newSelect);
+ SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); // Sizes differ: reinterpret as v8i1 (presumably the scalar is 8 bits wide - confirm) and take the subvector at index 0.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+
+ if (VT == MVT::v4i1 || VT == MVT::v2i1) { // Widen both operands to v8i1, select there, then extract the original width.
+ SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
+ Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
+ Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
+ SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
+ Cond, Op1, Op2);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
+ }
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ if (SDValue NewCond = LowerSETCC(Cond, DAG)) // The original Cond is kept when lowering returns an empty value.
+ Cond = NewCond;
+ }
+
+ // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
+ // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
+ // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
+ // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+ if (Cond.getOpcode() == X86ISD::SETCC &&
+ Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
+ isNullConstant(Cond.getOperand(1).getOperand(1))) {
+ SDValue Cmp = Cond.getOperand(1);
+
+ unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+
+ if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
+ SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; // Y is the select operand that is not the all-ones constant.
+
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ // Apply further optimizations for special cases
+ // (select (x != 0), -1, 0) -> neg & sbb
+ // (select (x == 0), 0, -1) -> neg & sbb
+ if (isNullConstant(Y) &&
+ (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+ DAG.getConstant(0, DL,
+ CmpOp0.getValueType()),
+ CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
+
+ Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+
+ SDValue Res = // Res = 0 or -1.
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
+
+ if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
+ Res = DAG.getNOT(DL, Res, Res.getValueType());
+
+ if (!isNullConstant(Op2)) // NOTE(review): tests Op2 rather than Y, so a zero Y that came from Op1 is still OR'ed in.
+ Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
+ return Res;
+ }
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ MVT VT = Op.getSimpleValueType(); // Shadows the outer VT, which was taken from Op1.
+
+ bool IllegalFPCMov = false;
+ if (VT.isFloatingPoint() && !VT.isVector() &&
+ !isScalarFPTypeInSSEReg(VT)) // FPStack?
+ IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+ if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+ Opc == X86ISD::BT) { // FIXME
+ Cond = Cmp;
+ AddTest = false;
+ }
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (CondOpcode == ISD::UMULO) // UMUL is built with three results, so the i32 result is #2 instead of #1.
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, DL, MVT::i8);
+ AddTest = false;
+ }
+
+ if (AddTest) {
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ AddTest = false;
+ }
+ }
+ }
+
+ if (AddTest) { // Nothing usable was found: test Cond against zero explicitly.
+ CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
+ }
+
+ // a < b ? -1 : 0 -> RES = ~setcc_carry
+ // a < b ? 0 : -1 -> RES = setcc_carry
+ // a >= b ? -1 : 0 -> RES = setcc_carry
+ // a >= b ? 0 : -1 -> RES = ~setcc_carry
+ if (Cond.getOpcode() == X86ISD::SUB) {
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
+ unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+
+ if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
+ (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ (isNullConstant(Op1) || isNullConstant(Op2))) {
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ Cond);
+ if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
+ return DAG.getNOT(DL, Res, Res.getValueType());
+ return Res;
+ }
+ }
+
+ // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
+ // widen the cmov and push the truncate through. This avoids introducing a new
+ // branch during isel and doesn't add any extensions.
+ if (Op.getValueType() == MVT::i8 &&
+ Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ // Blacklist CopyFromReg to avoid partial register stalls.
+ T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
+ SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); // Same operand order as the final CMOV below: the Op2 side first.
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+ }
+
+ // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+ // condition is true.
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ SDValue Ops[] = { Op2, Op1, CC, Cond };
+ return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
+}
+
+/// AVX-512 lowering of a vector SIGN_EXTEND. An i1-element input is turned
+/// into a single VSEXT when the subtarget features cover the result's element
+/// width and vector size; otherwise it is selected between all-ones and zero
+/// in a 512-bit type and truncated if needed. A non-i1 input extended to
+/// 512 bits becomes a VSEXT/VZEXT. Returns an empty SDValue when the element
+/// count is neither 8 nor 16 and BWI is unavailable.
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  const bool InIsMask = InVT.getVectorElementType() == MVT::i1;
+  const bool Is512 = VT.is512BitVector();
+
+  // SKX processor
+  if (InIsMask) {
+    const unsigned EltBits = VT.getVectorElementType().getSizeInBits();
+    // 512-bit results need no VLX; results of at most 256 bits do.
+    const bool SizeOK =
+        Is512 || (Subtarget.hasVLX() && VT.getSizeInBits() <= 256);
+    // Elements of at most 16 bits need BWI, elements of 32+ bits need DQI.
+    const bool EltOK = (EltBits <= 16 && Subtarget.hasBWI()) ||
+                       (EltBits >= 32 && Subtarget.hasDQI());
+    if (SizeOK && EltOK)
+      return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+  }
+
+  const unsigned NumElts = VT.getVectorNumElements();
+  if (!Subtarget.hasBWI() && NumElts != 8 && NumElts != 16)
+    return SDValue();
+
+  if (Is512 && !InIsMask) {
+    // Fold an extend of an extend into one node with the inner opcode.
+    const unsigned InOpc = In.getOpcode();
+    if (InOpc == X86ISD::VSEXT || InOpc == X86ISD::VZEXT)
+      return DAG.getNode(InOpc, dl, VT, In.getOperand(0));
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+  }
+
+  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+
+  // Select between -1 and 0 in a 512-bit vector with NumElts elements.
+  MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512 / NumElts), NumElts);
+  const unsigned ExtEltBits = ExtVT.getScalarSizeInBits();
+  SDValue NegOne =
+      DAG.getConstant(APInt::getAllOnesValue(ExtEltBits), dl, ExtVT);
+  SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtEltBits), dl, ExtVT);
+  SDValue Wide = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+
+  if (!Is512)
+    return DAG.getNode(X86ISD::VTRUNC, dl, VT, Wide);
+  return Wide;
+}
+
+// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
+// For sign extend this needs to handle all vector sizes and SSE4.1 and
+// non-SSE4.1 targets. For zero extend this should only handle inputs of
+// MVT::v64i8 when BWI is not supported, but AVX512 is.
+// Returns an empty SDValue for element-type, vector-size or subtarget combinations it does not handle.
+static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = Op->getOperand(0);
+ MVT VT = Op->getSimpleValueType(0);
+ MVT InVT = In.getSimpleValueType();
+ assert(VT.getSizeInBits() == InVT.getSizeInBits()); // In-register extend: input and result have the same total width.
+
+ MVT SVT = VT.getVectorElementType();
+ MVT InSVT = InVT.getVectorElementType();
+ assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
+
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
+ return SDValue();
+ if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+ return SDValue();
+ if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
+ !(VT.is256BitVector() && Subtarget.hasInt256()) &&
+ !(VT.is512BitVector() && Subtarget.hasAVX512()))
+ return SDValue();
+
+ SDLoc dl(Op);
+
+ // For 256-bit vectors, we only need the lower (128-bit) half of the input.
+ // For 512-bit vectors, we need 128-bits or 256-bits.
+ if (VT.getSizeInBits() > 128) {
+ // Input needs to be at least the same number of elements as output, and
+ // at least 128-bits.
+ int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
+ In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); // Note: InVT is not updated to the narrowed type.
+ }
+
+ assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
+ InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
+
+ // SSE41 targets can use the pmovsx* instructions directly.
+ unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+ X86ISD::VSEXT : X86ISD::VZEXT;
+ if (Subtarget.hasSSE41())
+ return DAG.getNode(ExtOpc, dl, VT, In);
+
+ // We should only get here for sign extend.
+ assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
+ "Unexpected opcode!");
+
+ // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
+ SDValue Curr = In;
+ MVT CurrVT = InVT; // NOTE(review): assumes In still has type InVT, i.e. the >128-bit narrowing above never reaches this path - presumably because those subtargets imply SSE4.1; confirm.
+
+ // As SRAI is only available on i16/i32 types, we expand only up to i32
+ // and handle i64 separately.
+ while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
+ Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); // Presumably leaves each source element in the upper half of a double-width element - confirm against UNPCKL semantics.
+ MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
+ CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
+ Curr = DAG.getBitcast(CurrVT, Curr);
+ }
+
+ SDValue SignExt = Curr;
+ if (CurrVT != InVT) { // At least one unpack happened: shift right arithmetically by the number of bits gained.
+ unsigned SignExtShift =
+ CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
+ SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+ DAG.getConstant(SignExtShift, dl, MVT::i8));
+ }
+
+ if (CurrVT == VT)
+ return SignExt;
+
+ if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
+ SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+ DAG.getConstant(31, dl, MVT::i8));
+ SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); // Interleave {value0, sign0, value1, sign1}; the bitcast below views them as two i64 elements.
+ return DAG.getBitcast(VT, Ext);
+ }
+
+ return SDValue();
+}
+
+/// Custom-lower a vector SIGN_EXTEND. 512-bit results and i1-element inputs
+/// are delegated to LowerSIGN_EXTEND_AVX512. Of the remaining cases only
+/// v4i32->v4i64, v8i16->v8i32 and v16i8->v16i16 are handled: a single VSEXT
+/// when 256-bit integer operations are available, otherwise the two halves
+/// of the input are extended separately and concatenated. Any other type
+/// pair yields an empty SDValue.
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
+
+  const bool IsHandledPair = (VT == MVT::v4i64 && InVT == MVT::v4i32) ||
+                             (VT == MVT::v8i32 && InVT == MVT::v8i16) ||
+                             (VT == MVT::v16i16 && InVT == MVT::v16i8);
+  if (!IsHandledPair)
+    return SDValue();
+
+  if (Subtarget.hasInt256())
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+  // Without 256-bit integer ops: move each half of the input into the low
+  // elements of its own vector (e.g. for v4i32 the masks are {0, 1, -1, -1}
+  // and {2, 3, -1, -1}), sign-extend each to half of the result type, and
+  // concatenate the two halves back into VT.
+  const unsigned NumElems = InVT.getVectorNumElements();
+  const unsigned HalfElems = NumElems / 2;
+  SDValue Undef = DAG.getUNDEF(InVT);
+
+  SmallVector<int, 8> LoMask(NumElems, -1);
+  SmallVector<int, 8> HiMask(NumElems, -1);
+  for (unsigned i = 0; i != HalfElems; ++i) {
+    LoMask[i] = i;
+    HiMask[i] = i + HalfElems;
+  }
+
+  SDValue LoIn = DAG.getVectorShuffle(InVT, dl, In, Undef, LoMask);
+  SDValue HiIn = DAG.getVectorShuffle(InVT, dl, In, Undef, HiMask);
+
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                VT.getVectorNumElements() / 2);
+
+  SDValue LoExt = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, LoIn);
+  SDValue HiExt = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, HiIn);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LoExt, HiExt);
+}
+
+/// Custom-lower a truncating store whose memory type is a vXi1 vector.
+///
+/// Three strategies are used, in this order:
+///  - VLX+BWI+DQI available, or a 16-element value: truncate to the memory
+///    type (padded to v8i1 when it is narrower than 8 bits) and store.
+///  - at most 8 elements: widen the value to 8 elements, truncate to v8i1 and
+///    store.
+///  - otherwise the value must be v32i8: store two v16i1 halves, the second
+///    one 2 bytes past the base pointer, and join the chains.
+static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  StoreSDNode *Store = cast<StoreSDNode>(StOp.getNode());
+  SDLoc DL(Store);
+  EVT MemVT = Store->getMemoryVT();
+  assert(Store->isTruncatingStore() && "We only custom truncating store.");
+  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
+         "Expected truncstore of i1 vector");
+
+  SDValue Val = Store->getValue();
+  MVT ValVT = Val.getValueType().getSimpleVT();
+  const unsigned NumElts = ValVT.getVectorNumElements();
+
+  const bool HasAllMaskFeatures =
+      Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI();
+  if (HasAllMaskFeatures || NumElts == 16) {
+    // Truncate and store - everything is legal.
+    Val = DAG.getNode(ISD::TRUNCATE, DL, MemVT, Val);
+    if (MemVT.getSizeInBits() < 8)
+      Val = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+                        DAG.getUNDEF(MVT::v8i1), Val,
+                        DAG.getIntPtrConstant(0, DL));
+    return DAG.getStore(Store->getChain(), DL, Val, Store->getBasePtr(),
+                        Store->getMemOperand());
+  }
+
+  // From here on assume that only AVX-512F is available.
+  if (NumElts <= 8) {
+    if (NumElts != 8) {
+      // Pad the value with undef up to 8 elements.
+      MVT WideVT = MVT::getVectorVT(ValVT.getScalarType(), 8);
+      Val = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
+                        DAG.getUNDEF(WideVT), Val,
+                        DAG.getIntPtrConstant(0, DL));
+    }
+    Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Val);
+    return DAG.getStore(Store->getChain(), DL, Val, Store->getBasePtr(),
+                        Store->getMemOperand());
+  }
+
+  // Only v32i8 is left: split it into two halves and store each separately.
+  assert(ValVT == MVT::v32i8 && "Unexpected operand type");
+
+  // Extract the 16 elements starting at FirstElt and truncate them to v16i1.
+  auto TruncHalf = [&](unsigned FirstElt) {
+    SDValue Half = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, Val,
+                               DAG.getIntPtrConstant(FirstElt, DL));
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i1, Half);
+  };
+
+  SDValue LoMask = TruncHalf(0);
+  SDValue LoPtr = Store->getBasePtr();
+  SDValue LoStore = DAG.getStore(Store->getChain(), DL, LoMask, LoPtr,
+                                 Store->getMemOperand());
+
+  SDValue HiMask = TruncHalf(16);
+  // The upper 16 mask bits live 2 bytes after the lower ones.
+  SDValue HiPtr = DAG.getNode(ISD::ADD, DL, LoPtr.getValueType(), LoPtr,
+                              DAG.getConstant(2, DL, LoPtr.getValueType()));
+  SDValue HiStore = DAG.getStore(Store->getChain(), DL, HiMask, HiPtr,
+                                 Store->getMemOperand());
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoStore, HiStore);
+}
+
+/// Custom-lower an extending load whose memory type is a vXi1 vector.
+///
+/// The mask bits are loaded (as a vXi1 value, or as an integer that is then
+/// bitcast to a mask vector) and extended with ISD::ZERO_EXTEND for a ZEXTLOAD
+/// or ISD::SIGN_EXTEND for every other extension kind. Users of the original
+/// load's chain result are redirected to the chain of the replacement load(s).
+static SDValue LowerExtended1BitVectorLoad(SDValue Op,
+                                           const X86Subtarget &Subtarget,
+                                           SelectionDAG &DAG) {
+  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+  SDLoc DL(Ld);
+  EVT MemVT = Ld->getMemoryVT();
+  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
+         "Expected i1 vector load");
+
+  unsigned ExtOpcode = ISD::SIGN_EXTEND;
+  if (Ld->getExtensionType() == ISD::ZEXTLOAD)
+    ExtOpcode = ISD::ZERO_EXTEND;
+
+  MVT VT = Op.getValueType().getSimpleVT();
+  const unsigned NumElts = VT.getVectorNumElements();
+
+  // Point the users of the old load's chain at the chain of NewLoad.
+  auto ForwardChain = [&](SDValue NewLoad) {
+    assert(NewLoad->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLoad.getValue(1));
+  };
+
+  // Extend an 8-element mask to 8 result-typed elements and keep only the
+  // leading part that forms VT.
+  auto ExtendAndNarrow = [&](SDValue Mask) {
+    MVT WideVT = MVT::getVectorVT(VT.getScalarType(), 8);
+    SDValue Wide = DAG.getNode(ExtOpcode, DL, WideVT, Mask);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Wide,
+                       DAG.getIntPtrConstant(0, DL));
+  };
+
+  const bool HasAllMaskFeatures =
+      Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI();
+  if (HasAllMaskFeatures || NumElts == 16) {
+    // Load and extend - everything is legal. Masks with fewer than 8 elements
+    // are loaded as v8i1 and narrowed after the extension.
+    const bool IsNarrow = NumElts < 8;
+    EVT LoadVT = IsNarrow ? EVT(MVT::v8i1) : MemVT;
+    SDValue Load = DAG.getLoad(LoadVT, DL, Ld->getChain(), Ld->getBasePtr(),
+                               Ld->getMemOperand());
+    ForwardChain(Load);
+
+    if (IsNarrow)
+      return ExtendAndNarrow(Load);
+
+    // Finally, do a normal extend to the desired register.
+    return DAG.getNode(ExtOpcode, DL, Op.getValueType(), Load);
+  }
+
+  if (NumElts <= 8) {
+    // From here on assume that only AVX-512F is available: load the bits as
+    // a scalar integer and reinterpret them as a mask vector.
+    const unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+    MVT ScalarVT = MVT::getIntegerVT(NumBitsToLoad);
+    SDValue Load = DAG.getLoad(ScalarVT, DL, Ld->getChain(), Ld->getBasePtr(),
+                               Ld->getMemOperand());
+    ForwardChain(Load);
+
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
+    SDValue Mask = DAG.getBitcast(MaskVT, Load);
+
+    if (NumElts == 8)
+      return DAG.getNode(ExtOpcode, DL, VT, Mask);
+
+    // v4i1 and v2i1 need the extend-then-extract treatment.
+    return ExtendAndNarrow(Mask);
+  }
+
+  // Only v32i8 is left: load two v16i1 halves, the second one 2 bytes past
+  // the base pointer, and extend each of them to v16i8.
+  assert(VT == MVT::v32i8 && "Unexpected extload type");
+
+  SDValue LoPtr = Ld->getBasePtr();
+  SDValue LoLoad = DAG.getLoad(MVT::v16i1, DL, Ld->getChain(),
+                               Ld->getBasePtr(), Ld->getMemOperand());
+
+  SDValue HiPtr = DAG.getNode(ISD::ADD, DL, LoPtr.getValueType(), LoPtr,
+                              DAG.getConstant(2, DL, LoPtr.getValueType()));
+  SDValue HiLoad = DAG.getLoad(MVT::v16i1, DL, Ld->getChain(), HiPtr,
+                               Ld->getMemOperand());
+
+  SmallVector<SDValue, 2> HalfChains;
+  HalfChains.push_back(LoLoad.getValue(1));
+  HalfChains.push_back(HiLoad.getValue(1));
+  SDValue JoinedChain =
+      DAG.getNode(ISD::TokenFactor, DL, MVT::Other, HalfChains);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), JoinedChain);
+
+  SDValue LoExt = DAG.getNode(ExtOpcode, DL, MVT::v16i8, LoLoad);
+  SDValue HiExt = DAG.getNode(ExtOpcode, DL, MVT::v16i8, HiLoad);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v32i8, LoExt, HiExt);
+}
+
+/// Lower vector extended loads (anyext and sext only) using a shuffle.
+///
+/// If SSSE3 is not available we may emit an illegal shuffle but the expansion
+/// is still better than scalar code. We generate X86ISD::VSEXT for SEXTLOADs
+/// if it's available, otherwise we emit SIGN_EXTEND_VECTOR_INREG.
+///
+/// vXi1 memory types are forwarded to LowerExtended1BitVectorLoad. In every
+/// path the users of the original load's chain result are redirected to the
+/// chain (or TokenFactor of chains) of the replacement load(s).
+///
+/// \param Op        the extending LoadSDNode being lowered.
+/// \param Subtarget used to query SSE2 / SSE4.1 / AVX2 availability.
+/// \param DAG       the DAG in which replacement nodes are created.
+/// \returns the extended value of type Op.getSimpleValueType().
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
+// TODO: It is possible to support ZExt by zeroing the undef values during
+// the shuffle phase or after the shuffle.
+static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
+                                 SelectionDAG &DAG) {
+  MVT RegVT = Op.getSimpleValueType();
+  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
+  assert(RegVT.isInteger() &&
+         "We only custom lower integer vector sext loads.");
+
+  // Nothing useful we can do without SSE2 shuffles.
+  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
+
+  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+  SDLoc dl(Ld);
+  EVT MemVT = Ld->getMemoryVT();
+  if (MemVT.getScalarType() == MVT::i1)
+    return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned RegSz = RegVT.getSizeInBits();
+
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+
+  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
+         && "Only anyext and sext are currently implemented.");
+  assert(MemVT != RegVT && "Cannot extend to the same type");
+  assert(MemVT.isVector() && "Must load a vector from memory");
+
+  unsigned NumElems = RegVT.getVectorNumElements();
+  unsigned MemSz = MemVT.getSizeInBits();
+  assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
+    // The only way in which we have a legal 256-bit vector result but not the
+    // integer 256-bit operations needed to directly lower a sextload is if we
+    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
+    // a 128-bit vector and a normal sign_extend to 256-bits that should get
+    // correctly legalized. We do this late to allow the canonical form of
+    // sextload to persist throughout the rest of the DAG combiner -- it wants
+    // to fold together any extensions it can, and so will fuse a sign_extend
+    // of an sextload into a sextload targeting a wider value.
+    SDValue Load;
+    if (MemSz == 128) {
+      // Just switch this to a normal load.
+      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
+                                       "it must be a legal 128-bit vector "
+                                       "type!");
+      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
+                         Ld->getPointerInfo(), Ld->getAlignment(),
+                         Ld->getMemOperand()->getFlags());
+    } else {
+      assert(MemSz < 128 &&
+             "Can't extend a type wider than 128 bits to a 256 bit vector!");
+      // Do an sext load to a 128-bit vector type. We want to use the same
+      // number of elements, but elements half as wide. This will end up being
+      // recursively lowered by this routine, but will succeed as we definitely
+      // have all the necessary features if we're using AVX1.
+      EVT HalfEltVT =
+          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
+      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
+      Load =
+          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
+                         Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+                         Ld->getMemOperand()->getFlags());
+    }
+
+    // Replace chain users with the new chain.
+    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+    // Finally, do a normal sign-extend to the desired register.
+    return DAG.getSExtOrTrunc(Load, dl, RegVT);
+  }
+
+  // All sizes must be a power of two.
+  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
+         "Non-power-of-two elements are not custom lowered!");
+
+  // Attempt to load the original value using scalar loads.
+  // Find the largest scalar type that divides the total loaded size.
+  MVT SclrLoadTy = MVT::i8;
+  for (MVT Tp : MVT::integer_valuetypes()) {
+    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+      SclrLoadTy = Tp;
+    }
+  }
+
+  // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+      (64 <= MemSz))
+    SclrLoadTy = MVT::f64;
+
+  // Calculate the number of scalar loads that we need to perform
+  // in order to load our vector from memory.
+  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
+         "Can only lower sext loads with a single scalar load!");
+
+  // A sext load into a 256-bit (or wider) register is assembled in a 128-bit
+  // vector first.
+  unsigned LoadRegSize = RegSz;
+  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
+    LoadRegSize = 128;
+
+  // Represent our vector as a sequence of elements which are the
+  // largest scalar that we can load.
+  EVT LoadUnitVecVT = EVT::getVectorVT(
+      *DAG.getContext(), SclrLoadTy, LoadRegSize / SclrLoadTy.getSizeInBits());
+
+  // Represent the data using the same element type that is stored in
+  // memory. In practice, we ''widen'' MemVT.
+  EVT WideVecVT =
+      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       LoadRegSize / MemVT.getScalarSizeInBits());
+
+  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+         "Invalid vector type");
+
+  // We can't shuffle using an illegal type.
+  assert(TLI.isTypeLegal(WideVecVT) &&
+         "We only lower types that form legal widened vector types");
+
+  SmallVector<SDValue, 8> Chains;
+  SDValue Ptr = Ld->getBasePtr();
+  const unsigned SclrLoadBytes = SclrLoadTy.getSizeInBits() / 8;
+  SDValue Increment = DAG.getConstant(SclrLoadBytes, dl,
+                                      TLI.getPointerTy(DAG.getDataLayout()));
+  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    // Perform a single load. Each load reads from Ptr, which is advanced by
+    // SclrLoadBytes per iteration, so its memory operand must carry the
+    // matching byte offset and an alignment that is still valid at that
+    // offset. Reusing the base pointer info and alignment unchanged would
+    // describe the wrong location for every load after the first one.
+    const unsigned Offset = i * SclrLoadBytes;
+    SDValue ScalarLoad =
+        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
+                    Ld->getPointerInfo().getWithOffset(Offset),
+                    MinAlign(Ld->getAlignment(), Offset),
+                    Ld->getMemOperand()->getFlags());
+    Chains.push_back(ScalarLoad.getValue(1));
+    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+    // another round of DAGCombining.
+    if (i == 0)
+      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+    else
+      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+                        ScalarLoad, DAG.getIntPtrConstant(i, dl));
+
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+  }
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+
+  // Bitcast the loaded value to a vector of the original element type, in
+  // the size of the target vector type.
+  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
+  unsigned SizeRatio = RegSz / MemSz;
+
+  if (Ext == ISD::SEXTLOAD) {
+    // If we have SSE4.1, we can directly emit a VSEXT node.
+    if (Subtarget.hasSSE41()) {
+      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+      return Sext;
+    }
+
+    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
+    // lanes.
+    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
+           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
+
+    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+    return Shuff;
+  }
+
+  // Redistribute the loaded elements into the different locations: element i
+  // of the memory vector lands in slot i * SizeRatio, all other slots stay
+  // undefined (anyext).
+  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i != NumElems; ++i)
+    ShuffleVec[i * SizeRatio] = i;
+
+  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                       DAG.getUNDEF(WideVecVT), ShuffleVec);
+
+  // Bitcast to the requested type.
+  Shuff = DAG.getBitcast(RegVT, Shuff);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+  return Shuff;
+}
+
+/// Check whether \p Op is an ISD::AND or ISD::OR whose two operands are both
+/// X86ISD::SETCC nodes with exactly one use each.
+/// \p Opc always receives Op's opcode, whether or not the check succeeds.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+  Opc = Op.getOpcode();
+  switch (Opc) {
+  case ISD::AND:
+  case ISD::OR:
+    break;
+  default:
+    return false;
+  }
+
+  auto IsSingleUseSetCC = [](SDValue V) {
+    return V.getOpcode() == X86ISD::SETCC && V.hasOneUse();
+  };
+  return IsSingleUseSetCC(Op.getOperand(0)) &&
+         IsSingleUseSetCC(Op.getOperand(1));
+}
+
+/// Check whether \p Op is an ISD::XOR whose second operand is the constant 1
+/// and whose first operand is an X86ISD::SETCC with a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+  if (Op.getOpcode() != ISD::XOR || !isOneConstant(Op.getOperand(1)))
+    return false;
+
+  SDValue SetCC = Op.getOperand(0);
+  return SetCC.getOpcode() == X86ISD::SETCC && SetCC.hasOneUse();
+}
+/* Custom lowering for a conditional branch node.
+ * Operand 0 is the chain, operand 1 the condition and operand 2 the
+ * destination. The condition is inspected for several recognised shapes
+ * (X86ISD::SETCC / SETCC_CARRY, the [su]{add,sub,mul}o overflow nodes,
+ * AND/OR of two SETCCs, XOR of a SETCC with 1, SETOEQ / SETUNE compares) so
+ * that the branch can consume an existing flag-producing node directly. When
+ * none of them applies (addTest stays true) the condition is matched to a BT
+ * via LowerToBT or, failing that, tested with EmitTest. The result is an
+ * X86ISD::BRCOND node; some paths emit an additional X86ISD::BRCOND on the
+ * chain and retarget a following ISD::BR.
+ */
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ bool addTest = true;
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+ SDLoc dl(Op);
+ SDValue CC;
+ bool Inverted = false;
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ // Check for setcc([su]{add,sub,mul}o == 0).
+ if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ isNullConstant(Cond.getOperand(1)) &&
+ Cond.getOperand(0).getResNo() == 1 &&
+ (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::UADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::USUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::SMULO ||
+ Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
+ Inverted = true;
+ Cond = Cond.getOperand(0);
+ } else {
+ if (SDValue NewCond = LowerSETCC(Cond, DAG))
+ Cond = NewCond;
+ }
+ }
+#if 0
+ // FIXME: LowerXALUO doesn't handle these!!
+ else if (Cond.getOpcode() == X86ISD::ADD ||
+ Cond.getOpcode() == X86ISD::SUB ||
+ Cond.getOpcode() == X86ISD::SMUL ||
+ Cond.getOpcode() == X86ISD::UMUL)
+ Cond = LowerXALUO(Cond, DAG);
+#endif
+
+ // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+ if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+ Cond = Cmp;
+ addTest = false;
+ } else {
+ switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+ default: break;
+ case X86::COND_O:
+ case X86::COND_B:
+ // These can only come from an arithmetic instruction with overflow,
+ // e.g. SADDO, UADDO.
+ Cond = Cond.getOperand(1);
+ addTest = false;
+ break;
+ }
+ }
+ }
+ CondOpcode = Cond.getOpcode();
+ if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
+ // Keep this in sync with LowerXALUO, otherwise we might create redundant
+ // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
+ // X86ISD::INC).
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO:
+ if (isOneConstant(RHS)) {
+ X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
+ break;
+ }
+ X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO:
+ if (isOneConstant(RHS)) {
+ X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
+ break;
+ }
+ X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (Inverted)
+ X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ addTest = false;
+ } else {
+ unsigned CondOpc;
+ if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+ SDValue Cmp = Cond.getOperand(0).getOperand(1);
+ if (CondOpc == ISD::OR) {
+ // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+ // two branches instead of an explicit OR instruction with a
+ // separate test.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp)) {
+ CC = Cond.getOperand(0).getOperand(0);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = Cond.getOperand(1).getOperand(0);
+ Cond = Cmp;
+ addTest = false;
+ }
+ } else { // ISD::AND
+ // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp) &&
+ Op.getNode()->hasOneUse()) {
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, dl, MVT::i8);
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, dl, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ }
+ } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+ // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
+ // It should be transformed during dag combiner except when the condition
+ // is set by an arithmetic-with-overflow node.
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, dl, MVT::i8);
+ Cond = Cond.getOperand(0).getOperand(1);
+ addTest = false;
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+ // For FCMP_OEQ, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+ CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+ // For FCMP_UNE, we can emit
+ // two branches instead of an explicit OR instruction with a
+ // separate test: COND_NE branches to the original destination and
+ // COND_NP to the false block. We only do this when an unconditional
+ // branch follows, because that branch is retargeted to the destination.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_UNE.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+ CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ Dest = FalseBB;
+ }
+ }
+ }
+ }
+
+ if (addTest) {
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // We know the result is compared against zero. Try to match it to BT.
+ if (Cond.hasOneUse()) {
+ if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ addTest = false;
+ }
+ }
+ }
+
+ if (addTest) {
+ X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
+ CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ Cond = EmitTest(Cond, X86Cond, dl, DAG);
+ }
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
+ return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cond);
+}
+
+/// Lower a dynamic stack allocation node (chain, size, alignment).
+///
+/// One of three strategies is chosen:
+///  - split (segmented) stacks: emit X86ISD::SEG_ALLOCA on a virtual register
+///    holding the size;
+///  - Windows, non-MachO targets: emit X86ISD::WIN_ALLOCA. Calls to _alloca
+///    are needed to probe the stack when allocating more than 4k bytes in one
+///    go; touching the stack at 4K increments is necessary to ensure that the
+///    guard pages used by the OS virtual memory manager are allocated in
+///    correct sequence;
+///  - everything else: subtract the size from the stack pointer inline and
+///    re-align the result if the request exceeds the stack alignment.
+///
+/// The whole sequence is wrapped in CALLSEQ_START / CALLSEQ_END and the
+/// result is a merge of {allocated address, output chain}.
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const bool UseSegmentedStack = MF.shouldSplitStack();
+  const bool UseWinAlloca = !UseSegmentedStack && Subtarget.isOSWindows() &&
+                            !Subtarget.isTargetMachO();
+  SDLoc DL(Op);
+
+  // Get the inputs.
+  SDNode *N = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue AllocSize = Op.getOperand(1);
+  unsigned Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  EVT ResVT = N->getValueType(0);
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+
+  MVT SPTy = getPointerTy(DAG.getDataLayout());
+
+  SDValue Result;
+  if (UseSegmentedStack) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    if (Subtarget.is64Bit()) {
+      // The 64 bit implementation of segmented stacks needs to clobber both
+      // r10 r11. This makes it impossible to use it along with nested
+      // parameters.
+      for (const auto &Arg : MF.getFunction()->args()) {
+        if (Arg.hasNestAttr())
+          report_fatal_error("Cannot use segmented stacks with functions that "
+                             "have nested arguments.");
+      }
+    }
+
+    const TargetRegisterClass *PtrRC = getRegClassFor(SPTy);
+    unsigned SizeReg = MRI.createVirtualRegister(PtrRC);
+    Chain = DAG.getCopyToReg(Chain, DL, SizeReg, AllocSize);
+    Result = DAG.getNode(X86ISD::SEG_ALLOCA, DL, SPTy, Chain,
+                         DAG.getRegister(SizeReg, SPTy));
+  } else if (UseWinAlloca) {
+    SDVTList AllocaVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, DL, AllocaVTs, Chain, AllocSize);
+    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
+
+    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+    unsigned SPReg = RegInfo->getStackRegister();
+    SDValue NewSP = DAG.getCopyFromReg(Chain, DL, SPReg, SPTy);
+    Chain = NewSP.getValue(1);
+
+    // A non-zero alignment request is applied by masking the stack pointer
+    // and writing it back.
+    if (Alignment) {
+      NewSP = DAG.getNode(ISD::AND, DL, ResVT, NewSP.getValue(0),
+                          DAG.getConstant(-(uint64_t)Alignment, DL, ResVT));
+      Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+    }
+
+    Result = NewSP;
+  } else {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+                    " not tell us which reg is the stack pointer!");
+
+    SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, ResVT);
+    Chain = OldSP.getValue(1);
+    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+    unsigned StackAlign = TFI.getStackAlignment();
+    Result = DAG.getNode(ISD::SUB, DL, ResVT, OldSP, AllocSize); // Value
+    // Only re-align when the request is stricter than the stack alignment.
+    if (Alignment > StackAlign)
+      Result = DAG.getNode(ISD::AND, DL, ResVT, Result,
+                           DAG.getConstant(-(uint64_t)Alignment, DL, ResVT));
+    Chain = DAG.getCopyToReg(Chain, DL, SPReg, Result); // Output chain
+  }
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+                             DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
+
+  SDValue Results[2] = {Result, Chain};
+  return DAG.getMergeValues(Results, DL);
+}
+
+/// Lower a va_start node (chain, va_list pointer, source value).
+///
+/// On 32-bit targets and for the Win64 calling convention a single store of
+/// the VarArgsFrameIndex address is emitted. Otherwise the four fields of the
+/// __va_list_tag structure are stored individually and their chains joined
+/// with a TokenFactor.
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto PtrVT = getPointerTy(MF.getDataLayout());
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SDLoc DL(Op);
+
+  const bool IsSimpleVAList =
+      !Subtarget.is64Bit() ||
+      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+  if (IsSimpleVAList) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDValue VarArgsSlot =
+        DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+    return DAG.getStore(Op.getOperand(0), DL, VarArgsSlot, Op.getOperand(1),
+                        MachinePointerInfo(SV));
+  }
+
+  // __va_list_tag:
+  //   gp_offset         (0 - 6 * 8)
+  //   fp_offset         (48 - 48 + 8 * 16)
+  //   overflow_arg_area (point to parameters coming in memory).
+  //   reg_save_area
+  const bool IsLP64 = Subtarget.isTarget64BitLP64();
+  SDValue InChain = Op.getOperand(0);
+  SDValue FieldPtr = Op.getOperand(1);
+  SmallVector<SDValue, 8> FieldStores;
+
+  // Store gp_offset at byte 0.
+  SDValue GPOffset =
+      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32);
+  FieldStores.push_back(
+      DAG.getStore(InChain, DL, GPOffset, FieldPtr, MachinePointerInfo(SV)));
+
+  // Store fp_offset at byte 4.
+  FieldPtr = DAG.getMemBasePlusOffset(FieldPtr, 4, DL);
+  SDValue FPOffset =
+      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32);
+  FieldStores.push_back(
+      DAG.getStore(InChain, DL, FPOffset, FieldPtr, MachinePointerInfo(SV, 4)));
+
+  // Store ptr to overflow_arg_area at byte 8.
+  FieldPtr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldPtr,
+                         DAG.getIntPtrConstant(4, DL));
+  SDValue OverflowArea =
+      DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+  FieldStores.push_back(DAG.getStore(InChain, DL, OverflowArea, FieldPtr,
+                                     MachinePointerInfo(SV, 8)));
+
+  // Store ptr to reg_save_area; it follows a pointer that is 8 bytes wide
+  // under LP64 and 4 bytes wide otherwise.
+  FieldPtr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldPtr,
+                         DAG.getIntPtrConstant(IsLP64 ? 8 : 4, DL));
+  SDValue RegSaveArea =
+      DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
+  FieldStores.push_back(
+      DAG.getStore(InChain, DL, RegSaveArea, FieldPtr,
+                   MachinePointerInfo(SV, IsLP64 ? 16 : 12)));
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, FieldStores);
+}
+
+/// Lower a 64-bit va_arg node (chain, va_list pointer, source value, align).
+///
+/// For the Win64 calling convention the generic expansion is used. Otherwise
+/// an X86ISD::VAARG_64 memory intrinsic node is emitted that yields the
+/// address of the next argument, and the argument is then loaded from it.
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget.is64Bit() &&
+         "LowerVAARG only handles 64-bit va_arg!");
+  assert(Op.getNumOperands() == 4);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  // The Win64 ABI uses char* instead of a structure.
+  if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
+    return DAG.expandVAArg(Op.getNode());
+
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue VAListPtr = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  unsigned ArgAlign = Op.getConstantOperandVal(3);
+
+  EVT ArgVT = Op.getNode()->getValueType(0);
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
+
+  // Decide which area this value should be read from.
+  // TODO: Implement the AMD64 ABI in its entirety. This simple
+  // selection mechanism works only for the basic types.
+  if (ArgVT == MVT::f80)
+    llvm_unreachable("va_arg for f80 not yet implemented");
+
+  uint8_t ArgMode;
+  if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/)
+    ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
+  else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/)
+    ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
+  else
+    llvm_unreachable("Unhandled argument type in LowerVAARG");
+
+  if (ArgMode == 2) {
+    // Sanity Check: Make sure using fp_offset makes sense.
+    assert(!Subtarget.useSoftFloat() &&
+           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+           Subtarget.hasSSE1());
+  }
+
+  // Insert VAARG_64 node into the DAG.
+  // VAARG_64 returns two values: Variable Argument Address, Chain.
+  SDValue SizeOp = DAG.getConstant(ArgSize, DL, MVT::i32);
+  SDValue ModeOp = DAG.getConstant(ArgMode, DL, MVT::i8);
+  SDValue AlignOp = DAG.getConstant(ArgAlign, DL, MVT::i32);
+  SDValue VAArgOps[] = {Chain, VAListPtr, SizeOp, ModeOp, AlignOp};
+  SDVTList ResultVTs =
+      DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
+  SDValue ArgAddr = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, DL,
+                                            ResultVTs, VAArgOps, MVT::i64,
+                                            MachinePointerInfo(SV),
+                                            /*Align=*/0,
+                                            /*Volatile=*/false,
+                                            /*ReadMem=*/true,
+                                            /*WriteMem=*/true);
+  Chain = ArgAddr.getValue(1);
+
+  // Load the next argument and return it.
+  return DAG.getLoad(ArgVT, DL, Chain, ArgAddr, MachinePointerInfo());
+}
+
+/// Lower a 64-bit va_copy node (chain, dst pointer, src pointer, dst source
+/// value, src source value).
+///
+/// X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
+/// where a va_list is still an i8*. The Win64 case goes through the generic
+/// expansion; otherwise the 24-byte structure is copied with a memcpy.
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
+
+  const Function *Fn = DAG.getMachineFunction().getFunction();
+  // Probably a Win64 va_copy.
+  if (Subtarget.isCallingConvWin64(Fn->getCallingConv()))
+    return DAG.expandVACopy(Op.getNode());
+
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Dst = Op.getOperand(1);
+  SDValue Src = Op.getOperand(2);
+  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  // Copy 24 bytes with alignment 8.
+  SDValue CopySize = DAG.getIntPtrConstant(24, DL);
+  return DAG.getMemcpy(Chain, DL, Dst, Src, CopySize, 8,
+                       /*isVolatile*/false, false, false,
+                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+/// Build a packed vector shift of \p SrcOp by the constant \p ShiftAmt.
+///
+/// \p Opc must be one of the immediate shift opcodes (X86ISD::VSHLI,
+/// X86ISD::VSRLI or X86ISD::VSRAI). The result is simplified where possible:
+///  - a shift by zero returns \p SrcOp unchanged;
+///  - a logical shift by at least the element width returns a zero constant,
+///    while an arithmetic right shift is clamped to (element width - 1);
+///  - a build vector of constants/undefs of type \p VT is folded element by
+///    element into a new build vector.
+/// Otherwise a node with opcode \p Opc and an i8 shift amount is created.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
+                                          SDValue SrcOp, uint64_t ShiftAmt,
+                                          SelectionDAG &DAG) {
+  MVT EltVT = VT.getVectorElementType();
+
+  // Shifting by nothing leaves the source untouched.
+  if (ShiftAmt == 0)
+    return SrcOp;
+
+  // Out-of-range amounts: logical shifts produce zero, arithmetic right
+  // shifts saturate at the largest in-range amount.
+  if (ShiftAmt >= EltVT.getSizeInBits()) {
+    if (Opc != X86ISD::VSRAI)
+      return DAG.getConstant(0, dl, VT);
+    ShiftAmt = EltVT.getSizeInBits() - 1;
+  }
+
+  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
+         && "Unknown target vector shift-by-constant node");
+
+  // Only a same-typed build vector of constants (or undefs) can be folded;
+  // anything else becomes a real shift node.
+  const bool IsFoldable =
+      VT == SrcOp.getSimpleValueType() &&
+      ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode());
+  if (!IsFoldable)
+    return DAG.getNode(Opc, dl, VT, SrcOp,
+                       DAG.getConstant(ShiftAmt, dl, MVT::i8));
+
+  // Applies the shift selected by Opc to a single constant element.
+  auto ShiftElement = [Opc, ShiftAmt](const APInt &C) -> APInt {
+    switch (Opc) {
+    default: llvm_unreachable("Unknown opcode!");
+    case X86ISD::VSHLI: return C.shl(ShiftAmt);
+    case X86ISD::VSRLI: return C.lshr(ShiftAmt);
+    case X86ISD::VSRAI: return C.ashr(ShiftAmt);
+    }
+  };
+
+  SmallVector<SDValue, 8> FoldedElts;
+  for (unsigned Idx = 0, End = SrcOp->getNumOperands(); Idx != End; ++Idx) {
+    SDValue Elt = SrcOp->getOperand(Idx);
+    // Undef elements are passed through unchanged.
+    if (Elt->isUndef()) {
+      FoldedElts.push_back(Elt);
+      continue;
+    }
+    const APInt &C = cast<ConstantSDNode>(Elt)->getAPIntValue();
+    FoldedElts.push_back(DAG.getConstant(ShiftElement(C), dl, EltVT));
+  }
+
+  return DAG.getBuildVector(VT, dl, FoldedElts);
+}
+
+/// Build a packed vector shift of \p SrcOp by \p ShAmt, which may or may not
+/// be a constant.
+///
+/// \p Opc is the immediate form of the shift (X86ISD::VSHLI / VSRLI / VSRAI)
+/// and \p ShAmt must be an i32 or i64 value. A constant amount is forwarded
+/// to getTargetVShiftByConstNode; otherwise the opcode is switched to its
+/// non-immediate counterpart and the amount is placed in a 128-bit vector
+/// that is bitcast to the element type of \p VT.
+static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
+                                   SDValue SrcOp, SDValue ShAmt,
+                                   SelectionDAG &DAG) {
+  MVT AmtVT = ShAmt.getSimpleValueType();
+  assert((AmtVT == MVT::i32 || AmtVT == MVT::i64) && "Unexpected value type!");
+
+  // Constant amounts have a dedicated (and foldable) lowering.
+  if (auto *ConstAmt = dyn_cast<ConstantSDNode>(ShAmt))
+    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
+                                      ConstAmt->getZExtValue(), DAG);
+
+  // Map the immediate opcode onto the variable-amount opcode.
+  if (Opc == X86ISD::VSHLI)
+    Opc = X86ISD::VSHL;
+  else if (Opc == X86ISD::VSRLI)
+    Opc = X86ISD::VSRL;
+  else if (Opc == X86ISD::VSRAI)
+    Opc = X86ISD::VSRA;
+  else
+    llvm_unreachable("Unknown target vector shift node");
+
+  const auto &Subtarget =
+      static_cast<const X86Subtarget &>(DAG.getSubtarget());
+  const bool IsZExtFromI16 =
+      ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+      ShAmt.getOperand(0).getSimpleValueType() == MVT::i16;
+
+  if (Subtarget.hasSSE41() && IsZExtFromI16) {
+    // Let the shuffle legalizer expand this shift amount node.
+    SDValue Narrow = ShAmt.getOperand(0);
+    Narrow =
+        DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Narrow), MVT::v8i16, Narrow);
+    ShAmt = getShuffleVectorZeroOrUndef(Narrow, 0, true, Subtarget, DAG);
+  } else if (AmtVT == MVT::i32) {
+    // SSE/AVX packed shifts only use the lower 64 bits of the shift count, so
+    // an i32 amount is followed by an explicit zero and the rest is undef.
+    SDValue Elts[] = {ShAmt, DAG.getConstant(0, dl, AmtVT),
+                      DAG.getUNDEF(AmtVT), DAG.getUNDEF(AmtVT)};
+    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, Elts);
+  } else {
+    // An i64 amount already fills the low 64 bits; the high half is undef.
+    SDValue Elts[] = {ShAmt, DAG.getUNDEF(AmtVT)};
+    ShAmt = DAG.getBuildVector(MVT::v2i64, dl, Elts);
+  }
+
+  // The shift-amount operand has to be a 128-bit vector with the same element
+  // type as the value being shifted.
+  MVT EltVT = VT.getVectorElementType();
+  MVT AmtVecVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
+
+  return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getBitcast(AmtVecVT, ShAmt));
+}
+
+/// \brief Convert the scalar integer \p Mask into a value of the vXi1 type
+/// \p MaskVT when lowering masking intrinsics.
+///
+/// All-ones and zero masks become target constants of type \p MaskVT. Any
+/// other mask is any-extended if it is narrower than \p MaskVT and then
+/// reinterpreted as a vector of i1, with special handling for an i64 mask on
+/// a 32-bit subtarget.
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl) {
+  // Trivial masks map straight to constants.
+  if (isAllOnesConstant(Mask))
+    return DAG.getTargetConstant(1, dl, MaskVT);
+  if (X86::isZeroNode(Mask))
+    return DAG.getTargetConstant(0, dl, MaskVT);
+
+  // Widen the mask when it has fewer bits than the requested mask type.
+  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
+    MVT WideVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
+    Mask = DAG.getNode(ISD::ANY_EXTEND, dl, WideVT, Mask);
+  }
+
+  const bool IsI64MaskOn32Bit =
+      Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit();
+
+  if (!IsI64MaskOn32Bit) {
+    // Common case: bitcast the integer to a vXi1 of the same bit width and
+    // take the low MaskVT elements. When MaskVT is v2i1 or v4i1 this
+    // extracts the low 2 or 4 elements.
+    MVT BitcastVT = MVT::getVectorVT(
+        MVT::i1, Mask.getSimpleValueType().getSizeInBits());
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                       DAG.getBitcast(BitcastVT, Mask),
+                       DAG.getIntPtrConstant(0, dl));
+  }
+
+  if (MaskVT != MVT::v64i1) {
+    // Fewer than 64 mask bits are needed: truncate the mask (should succeed
+    // in any case) and bitcast.
+    MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
+    return DAG.getBitcast(MaskVT,
+                          DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
+  }
+
+  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
+  // In 32-bit mode a bitcast of i64 is illegal, so split the mask into two
+  // i32 halves, bitcast each to v32i1 and concatenate them.
+  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+                               DAG.getConstant(0, dl, MVT::i32));
+  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+                               DAG.getConstant(1, dl, MVT::i32));
+
+  LoHalf = DAG.getBitcast(MVT::v32i1, LoHalf);
+  HiHalf = DAG.getBitcast(MVT::v32i1, HiHalf);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
+}
+
+/// \brief Apply the write mask \p Mask to the vector operation \p Op when
+/// lowering masking intrinsics.
+///
+/// An all-ones mask returns \p Op unchanged. Otherwise the mask is converted
+/// with getMaskNode and combined with \p Op depending on its opcode:
+///  - compare nodes produce (and \p Op, mask);
+///  - VFPCLASS / VFPCLASSS produce (or \p Op, mask);
+///  - everything else produces a select between \p Op and \p PreservedSrc,
+///    where an undef \p PreservedSrc is replaced by a zero vector.
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+  SDLoc dl(Op);
+
+  // Nothing is masked off, so the operation itself is the result.
+  if (isAllOnesConstant(Mask))
+    return Op;
+
+  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+  unsigned SelectOpc;
+  switch (Op.getOpcode()) {
+  case X86ISD::PCMPEQM:
+  case X86ISD::PCMPGTM:
+  case X86ISD::CMPM:
+  case X86ISD::CMPMU:
+    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+  case X86ISD::VFPCLASS:
+  case X86ISD::VFPCLASSS:
+    // NOTE(review): combining with OR rather than AND looks unusual for a
+    // write mask - confirm against the matching instruction patterns.
+    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
+  case X86ISD::VTRUNC:
+  case X86ISD::VTRUNCS:
+  case X86ISD::VTRUNCUS:
+  case X86ISD::CVTPS2PH:
+    // We can't use ISD::VSELECT here because it is not always "Legal"
+    // for the destination type. For example vpmovqb requires only AVX512,
+    // while a vselect that can operate on byte elements requires BWI.
+    SelectOpc = X86ISD::SELECT;
+    break;
+  default:
+    SelectOpc = ISD::VSELECT;
+    break;
+  }
+
+  // Masked-off lanes fall back to zero when no pass-through value was given.
+  if (PreservedSrc.isUndef())
+    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+  return DAG.getNode(SelectOpc, dl, VT, VMask, Op, PreservedSrc);
+}
+
+/// \brief Apply the write mask \p Mask to the scalar operation \p Op.
+/// \returns (X86selects \p Mask, \p Op, \p PreservedSrc) in the general case.
+///
+/// The mask arrives as MVT::i8 and is truncated to MVT::i1. An all-ones mask
+/// returns \p Op unchanged; FSETCCM / FSETCCM_RND are combined with the mask
+/// by AND, and VFPCLASS / VFPCLASSS by OR. An undef \p PreservedSrc is
+/// replaced by a zero vector.
+/// Unlike getVectorMaskingNode this uses "X86selects" rather than "vselect",
+/// because a "vselect" node cannot be created for a scalar instruction.
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  // Nothing is masked off, so the operation itself is the result.
+  if (isAllOnesConstant(Mask))
+    return Op;
+
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  // Only the low bit of the incoming mask is meaningful.
+  SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+  switch (Op.getOpcode()) {
+  case X86ISD::FSETCCM:
+  case X86ISD::FSETCCM_RND:
+    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
+  case X86ISD::VFPCLASS:
+  case X86ISD::VFPCLASSS:
+    // NOTE(review): combining with OR rather than AND looks unusual for a
+    // write mask - confirm against the matching instruction patterns.
+    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
+  default:
+    break;
+  }
+
+  SDValue Fallback = PreservedSrc.isUndef()
+                         ? getZeroVector(VT, Subtarget, DAG, dl)
+                         : PreservedSrc;
+  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, Fallback);
+}
+
+/// Return the size in bytes of the EH registration node used by \p Fn.
+///
+/// The node is 6 32-bit words (24 bytes) for the MSVC x86 SEH personality and
+/// 4 words (16 bytes) for the MSVC C++ EH personality. See WinEHStatePass for
+/// the full struct definition. A function without a personality, or with any
+/// other personality, is a fatal error.
+static int getSEHRegistrationNodeSize(const Function *Fn) {
+  if (!Fn->hasPersonalityFn())
+    report_fatal_error(
+        "querying registration node size for function without personality");
+
+  const EHPersonality Personality =
+      classifyEHPersonality(Fn->getPersonalityFn());
+  if (Personality == EHPersonality::MSVC_X86SEH)
+    return 24;
+  if (Personality == EHPersonality::MSVC_CXX)
+    return 16;
+
+  report_fatal_error(
+      "can only recover FP for 32-bit MSVC EH personality functions");
+}
+
+/// Recover the parent function's frame pointer from the incoming EBP.
+///
+/// When the MSVC runtime transfers control to us, either to an outlined
+/// function or when returning to a parent frame after catching an exception,
+/// the parent frame pointer is obtained by arithmetic on \p EntryEBP.
+/// On 32-bit targets:
+///   RegNodeBase = EntryEBP - RegNodeSize
+///   ParentFP = RegNodeBase - ParentFrameOffset
+/// Subtracting RegNodeSize takes us to the offset of the registration node,
+/// and subtracting the offset (negative on x86) takes us back to the parent
+/// FP. On 64-bit targets the result is EntryEBP + ParentFrameOffset.
+/// If \p Fn has no personality function, \p EntryEBP is returned unchanged.
+static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
+                                   SDValue EntryEBP) {
+  // It's possible that the parent function no longer has a personality
+  // function if the exceptional code was optimized away, in which case the
+  // incoming EBP is returned as is.
+  if (!Fn->hasPersonalityFn())
+    return EntryEBP;
+
+  SDLoc dl;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+
+  // The parent frame offset is only known later, so refer to it through an
+  // MCSymbol that will ultimately resolve to the frame offset of the EH
+  // registration.
+  MCSymbol *FrameOffsetSym =
+      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
+          GlobalValue::getRealLinkageName(Fn->getName()));
+  SDValue FrameOffsetSymVal = DAG.getMCSymbol(FrameOffsetSym, PtrVT);
+  SDValue ParentFrameOffset =
+      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, FrameOffsetSymVal);
+
+  const auto &Subtarget =
+      static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+  // x64: EntryEBP + ParentFrameOffset. This adjusts from RSP after the
+  // prologue to RBP in the parent function.
+  if (Subtarget.is64Bit())
+    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
+
+  // x86: step back over the registration node first, then over the offset.
+  SDValue RegNodeBase =
+      DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
+                  DAG.getConstant(getSEHRegistrationNodeSize(Fn), dl, PtrVT));
+  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
+}
+
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // Helper to detect if the operand is CUR_DIRECTION rounding mode.
+ auto isRoundModeCurDirection = [](SDValue Rnd) {
+ if (!isa<ConstantSDNode>(Rnd))
+ return false;
+
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ };
+
+ SDLoc dl(Op);
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ MVT VT = Op.getSimpleValueType();
+ const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+ if (IntrData) {
+ switch(IntrData->Type) {
+ case INTR_TYPE_1OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+ case INTR_TYPE_2OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ case INTR_TYPE_3OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
+ case INTR_TYPE_4OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
+ case INTR_TYPE_1OP_MASK_RM: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue RoundingMode;
+ // We always add rounding mode to the Node.
+ // If the rounding mode is not specified, we add the
+ // "current direction" mode.
+ if (Op.getNumOperands() == 4)
+ RoundingMode =
+ DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ else
+ RoundingMode = Op.getOperand(4);
+ assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
+ RoundingMode),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_1OP_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ // We add rounding mode to the Node when
+ // - RM Opcode is specified and
+ // - RM is not "current direction".
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
+ Mask, passThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src0 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ // There are 2 kinds of intrinsics in this group:
+ // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
+ // (2) With rounding mode and sae - 7 operands.
+ if (Op.getNumOperands() == 6) {
+ SDValue Sae = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ RoundingMode, Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ case INTR_TYPE_2OP_MASK:
+ case INTR_TYPE_2OP_IMM8_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+
+ if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
+ Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ // TODO: Intrinsics should have fast-math-flags to propagate.
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_2OP_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ // We specify 2 possible modes for intrinsics, with/without rounding
+ // modes.
+ // First, we check if the intrinsic have rounding mode (6 operands),
+ // if not, we set rounding mode to "current".
+ SDValue Rnd;
+ if (Op.getNumOperands() == 6)
+ Rnd = Op.getOperand(5);
+ else
+ Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_SCALAR_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ Src2, Src3, Sae),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Imm = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ // We specify 2 possible modes for intrinsics, with/without rounding
+ // modes.
+ // First, we check if the intrinsic have rounding mode (7 operands),
+ // if not, we set rounding mode to "current".
+ SDValue Rnd;
+ if (Op.getNumOperands() == 7)
+ Rnd = Op.getOperand(6);
+ else
+ Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Imm, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_IMM8_MASK:
+ case INTR_TYPE_3OP_MASK:
+ case INSERT_SUBVEC: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+
+ if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
+ Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
+ else if (IntrData->Type == INSERT_SUBVEC) {
+ // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
+ assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
+ unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
+ Imm *= Src2.getSimpleValueType().getVectorNumElements();
+ Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
+ }
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(6);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case VPERM_2OP_MASK : {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+
+ // Swap Src1 and Src2 in the node creation
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case VPERM_3OP_MASKZ:
+ case VPERM_3OP_MASK:{
+ MVT VT = Op.getSimpleValueType();
+ // Src2 is the PassThru
+ SDValue Src1 = Op.getOperand(1);
+ // PassThru needs to be the same type as the destination in order
+ // to pattern match correctly.
+ SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ SDValue PassThru = SDValue();
+
+ // set PassThru element
+ if (IntrData->Type == VPERM_3OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+ else
+ PassThru = Src2;
+
+ // Swap Src1 and Src2 in the node creation
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src2, Src1, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FMA_OP_MASK3:
+ case FMA_OP_MASKZ:
+ case FMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = SDValue();
+
+ // set PassThru element
+ if (IntrData->Type == FMA_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+ else if (IntrData->Type == FMA_OP_MASK3)
+ PassThru = Src3;
+ else
+ PassThru = Src1;
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FMA_OP_SCALAR_MASK:
+ case FMA_OP_SCALAR_MASK3:
+ case FMA_OP_SCALAR_MASKZ: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = SDValue();
+
+ // set PassThru element
+ if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+ else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
+ PassThru = Src3;
+ else
+ PassThru = Src1;
+
+ SDValue Rnd = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
+ Op.getValueType(), Src1, Src2,
+ Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case TERLOG_OP_MASK:
+ case TERLOG_OP_MASKZ: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
+ SDValue Mask = Op.getOperand(5);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = Src1;
+ // Set PassThru element.
+ if (IntrData->Type == TERLOG_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Src4),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case CVTPD2PS:
+ // ISD::FP_ROUND has a second argument that indicates if the truncation
+ // does not change the value. Set it to 0 since it can change.
+ return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+ DAG.getIntPtrConstant(0, dl));
+ case CVTPD2PS_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ // We add rounding mode to the Node when
+ // - RM Opcode is specified and
+ // - RM is not "current direction".
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
+ // ISD::FP_ROUND has a second argument that indicates if the truncation
+ // does not change the value. Set it to 0 since it can change.
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
+ DAG.getIntPtrConstant(0, dl)),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FPCLASS: {
+ // FPclass intrinsics with mask
+ SDValue Src1 = Op.getOperand(1);
+ MVT VT = Src1.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Imm = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
+ SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
+ DAG.getTargetConstant(0, dl, MaskVT),
+ Subtarget, DAG);
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), FPclassMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
+ case FPCLASSS: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Imm = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
+ SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
+ DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
+ }
+ case CMP_MASK:
+ case CMP_MASK_CC: {
+ // Comparison intrinsics with masks.
+ // Example of transformation:
+ // (i8 (int_x86_avx512_mask_pcmpeq_q_128
+ // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
+ // (i8 (bitcast
+ // (v8i1 (insert_subvector undef,
+ // (v2i1 (and (PCMPEQM %a, %b),
+ // (extract_subvector
+ // (v8i1 (bitcast %mask)), 0))), 0))))
+ MVT VT = Op.getOperand(1).getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ SDValue Cmp;
+ if (IntrData->Type == CMP_MASK_CC) {
+ SDValue CC = Op.getOperand(3);
+ CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC, Rnd);
+ }
+ //default rounding mode
+ if(!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC);
+
+ } else {
+ assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2));
+ }
+ SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
+ DAG.getTargetConstant(0, dl,
+ MaskVT),
+ Subtarget, DAG);
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), CmpMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
+ case CMP_MASK_SCALAR_CC: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
+ SDValue Mask = Op.getOperand(4);
+
+ SDValue Cmp;
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+ }
+ //default rounding mode
+ if(!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+
+ SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
+ DAG.getTargetConstant(0, dl,
+ MVT::i1),
+ Subtarget, DAG);
+
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
+ }
+ case COMI: { // Comparison intrinsics
+ ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
+ SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
+ SDValue SetCC;
+ switch (CC) {
+ case ISD::SETEQ: { // (ZF = 0 and PF = 0)
+ SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
+ SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
+ SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
+ break;
+ }
+ case ISD::SETNE: { // (ZF = 1 or PF = 1)
+ SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
+ SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
+ SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
+ break;
+ }
+ case ISD::SETGT: // (CF = 0 and ZF = 0)
+ SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
+ break;
+ case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
+ SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
+ break;
+ }
+ case ISD::SETGE: // CF = 0
+ SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
+ break;
+ case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
+ SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
+ break;
+ default:
+ llvm_unreachable("Unexpected illegal condition!");
+ }
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+ case COMI_RM: { // Comparison intrinsics with Sae
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ SDValue Sae = Op.getOperand(4);
+
+ SDValue FCmp;
+ if (isRoundModeCurDirection(Sae))
+ FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8));
+ else
+ FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
+ return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
+ }
+ case VSHIFT:
+ return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG);
+ case COMPRESS_EXPAND_IN_REG: {
+ SDValue Mask = Op.getOperand(3);
+ SDValue DataToCompress = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ if (isAllOnesConstant(Mask)) // return data as is
+ return Op.getOperand(1);
+
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ DataToCompress),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case BROADCASTM: {
+ SDValue Mask = Op.getOperand(1);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ Mask = DAG.getBitcast(MaskVT, Mask);
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
+ }
+ case KUNPCK: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ // Arguments should be swapped.
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl,
+ MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
+ Src2, Src1);
+ return DAG.getBitcast(VT, Res);
+ }
+ case FIXUPIMMS:
+ case FIXUPIMMS_MASKZ:
+ case FIXUPIMM:
+ case FIXUPIMM_MASKZ:{
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Imm = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
+ Src1 : getZeroVector(VT, Subtarget, DAG, dl);
+ // We specify 2 possible modes for intrinsics, with/without rounding
+ // modes.
+ // First, we check if the intrinsic have rounding mode (7 operands),
+ // if not, we set rounding mode to "current".
+ SDValue Rnd;
+ if (Op.getNumOperands() == 7)
+ Rnd = Op.getOperand(6);
+ else
+ Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Imm, Rnd),
+ Mask, Passthru, Subtarget, DAG);
+ else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Imm, Rnd),
+ Mask, Passthru, Subtarget, DAG);
+ }
+ case CONVERT_TO_MASK: {
+ MVT SrcVT = Op.getOperand(1).getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+ SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
+ Op.getOperand(1));
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), CvtMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
+ case CONVERT_MASK_TO_VEC: {
+ SDValue Mask = Op.getOperand(1);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
+ }
+ case BRCST_SUBVEC_TO_VEC: {
+ SDValue Src = Op.getOperand(1);
+ SDValue Passthru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ EVT resVT = Passthru.getValueType();
+ SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
+ DAG.getUNDEF(resVT), Src,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue immVal;
+ if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
+ immVal = DAG.getConstant(0x44, dl, MVT::i8);
+ else
+ immVal = DAG.getConstant(0, dl, MVT::i8);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ subVec, subVec, immVal),
+ Mask, Passthru, Subtarget, DAG);
+ }
+ case BRCST32x2_TO_VEC: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+
+ assert((VT.getScalarType() == MVT::i32 ||
+ VT.getScalarType() == MVT::f32) && "Unexpected type!");
+ //bitcast Src to packed 64
+ MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
+ MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
+ Src = DAG.getBitcast(BitcastVT, Src);
+
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ default:
+ break;
+ }
+ }
+
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ // Operands intentionally swapped. Mask is last operand to intrinsic,
+ // but second operand for node/instruction.
+ return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(1));
+
+ // ptest and testp intrinsics. The intrinsic these come from are designed to
+ // return an integer value, not just an instruction so lower it to the ptest
+ // or testp pattern and a setcc for the result.
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_sse41_ptestnzc:
+ case Intrinsic::x86_avx_ptestz_256:
+ case Intrinsic::x86_avx_ptestc_256:
+ case Intrinsic::x86_avx_ptestnzc_256:
+ case Intrinsic::x86_avx_vtestz_ps:
+ case Intrinsic::x86_avx_vtestc_ps:
+ case Intrinsic::x86_avx_vtestnzc_ps:
+ case Intrinsic::x86_avx_vtestz_pd:
+ case Intrinsic::x86_avx_vtestc_pd:
+ case Intrinsic::x86_avx_vtestnzc_pd:
+ case Intrinsic::x86_avx_vtestz_ps_256:
+ case Intrinsic::x86_avx_vtestc_ps_256:
+ case Intrinsic::x86_avx_vtestnzc_ps_256:
+ case Intrinsic::x86_avx_vtestz_pd_256:
+ case Intrinsic::x86_avx_vtestc_pd_256:
+ case Intrinsic::x86_avx_vtestnzc_pd_256: {
+ bool IsTestPacked = false;
+ X86::CondCode X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+ case Intrinsic::x86_avx_vtestz_ps:
+ case Intrinsic::x86_avx_vtestz_pd:
+ case Intrinsic::x86_avx_vtestz_ps_256:
+ case Intrinsic::x86_avx_vtestz_pd_256:
+ IsTestPacked = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_avx_ptestz_256:
+ // ZF = 1
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_avx_vtestc_ps:
+ case Intrinsic::x86_avx_vtestc_pd:
+ case Intrinsic::x86_avx_vtestc_ps_256:
+ case Intrinsic::x86_avx_vtestc_pd_256:
+ IsTestPacked = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_avx_ptestc_256:
+ // CF = 1
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_avx_vtestnzc_ps:
+ case Intrinsic::x86_avx_vtestnzc_pd:
+ case Intrinsic::x86_avx_vtestnzc_ps_256:
+ case Intrinsic::x86_avx_vtestnzc_pd_256:
+ IsTestPacked = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse41_ptestnzc:
+ case Intrinsic::x86_avx_ptestnzc_256:
+ // ZF and CF = 0
+ X86CC = X86::COND_A;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
+ SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+ case Intrinsic::x86_avx512_kortestz_w:
+ case Intrinsic::x86_avx512_kortestc_w: {
+ X86::CondCode X86CC =
+ (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ case Intrinsic::x86_sse42_pcmpistria128:
+ case Intrinsic::x86_sse42_pcmpestria128:
+ case Intrinsic::x86_sse42_pcmpistric128:
+ case Intrinsic::x86_sse42_pcmpestric128:
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ case Intrinsic::x86_sse42_pcmpistris128:
+ case Intrinsic::x86_sse42_pcmpestris128:
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ case Intrinsic::x86_sse42_pcmpestriz128: {
+ unsigned Opcode;
+ X86::CondCode X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse42_pcmpistria128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpestria128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpistric128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpestric128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpistris128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpestris128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_sse42_pcmpestriz128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_E;
+ break;
+ }
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
+ SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ case Intrinsic::x86_sse42_pcmpistri128:
+ case Intrinsic::x86_sse42_pcmpestri128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+ Opcode = X86ISD::PCMPISTRI;
+ else
+ Opcode = X86ISD::PCMPESTRI;
+
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps);
+ }
+
+ case Intrinsic::eh_sjlj_lsda: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ auto &Context = MF.getMMI().getContext();
+ MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
+ Twine(MF.getFunctionNumber()));
+ return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
+ }
+
+ case Intrinsic::x86_seh_lsda: {
+ // Compute the symbol for the LSDA. We know it'll get emitted later.
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Op1 = Op.getOperand(1);
+ auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
+ MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
+ GlobalValue::getRealLinkageName(Fn->getName()));
+
+ // Generate a simple absolute symbol reference. This intrinsic is only
+ // supported on 32-bit Windows, which isn't PIC.
+ SDValue Result = DAG.getMCSymbol(LSDASym, VT);
+ return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
+ }
+
+ case Intrinsic::x86_seh_recoverfp: {
+ SDValue FnOp = Op.getOperand(1);
+ SDValue IncomingFPOp = Op.getOperand(2);
+ GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
+ auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
+ if (!Fn)
+ report_fatal_error(
+ "llvm.x86.seh.recoverfp must take a function as the first argument");
+ return recoverFramePointer(DAG, Fn, IncomingFPOp);
+ }
+
+ case Intrinsic::localaddress: {
+ // Returns one of the stack, base, or frame pointer registers, depending on
+ // which is used to reference local variables.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ unsigned Reg;
+ if (RegInfo->hasBasePointer(MF))
+ Reg = RegInfo->getBaseRegister();
+ else // This function handles the SP or FP case.
+ Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+ }
+ }
+}
+
+ /// Build the machine node \p Opc for a gather intrinsic and return a merged
+ /// {gathered value, chain} pair.
+ ///
+ /// \p ScaleOp must be a constant node; it becomes an i8 target constant. The
+ /// mask is converted to a vXi1 vector with one bit per lane of \p Index. An
+ /// undef \p Src is replaced by an all-zero vector of the result type.
+ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                              SDValue Src, SDValue Mask, SDValue Base,
+                              SDValue Index, SDValue ScaleOp, SDValue Chain,
+                              const X86Subtarget &Subtarget) {
+   SDLoc Loc(Op);
+
+   // Memory-operand pieces: scale immediate first, then the lane mask.
+   const auto ScaleImm = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
+   SDValue ScaleC = DAG.getTargetConstant(ScaleImm, Loc, MVT::i8);
+   const unsigned NumLanes = Index.getSimpleValueType().getVectorNumElements();
+   MVT KMaskVT = MVT::getVectorVT(MVT::i1, NumLanes);
+   SDValue KMask = getMaskNode(Mask, KMaskVT, Subtarget, DAG, Loc);
+
+   // The node yields the gathered vector, a mask-typed value and a chain.
+   SDVTList ResultTys = DAG.getVTList(Op.getValueType(), KMaskVT, MVT::Other);
+   SDValue ZeroDisp = DAG.getTargetConstant(0, Loc, MVT::i32);
+   SDValue NoSegment = DAG.getRegister(0, MVT::i32);
+
+   SDValue Init = Src;
+   if (Init.isUndef())
+     Init = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, Loc);
+
+   SDValue NodeOps[] = {Init,  KMask,    Base,      ScaleC,
+                        Index, ZeroDisp, NoSegment, Chain};
+   SDNode *Gather = DAG.getMachineNode(Opc, Loc, ResultTys, NodeOps);
+
+   // Result 1 (the mask-typed value) is not handed back to the caller.
+   SDValue ValueAndChain[] = {SDValue(Gather, 0), SDValue(Gather, 2)};
+   return DAG.getMergeValues(ValueAndChain, Loc);
+ }
+
+ /// Build the machine node \p Opc for a scatter intrinsic and return its chain
+ /// result (result 1 of the node).
+ ///
+ /// \p ScaleOp must be a constant node; it becomes an i8 target constant. The
+ /// mask is converted to a vXi1 vector with one bit per lane of \p Index.
+ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                               SDValue Src, SDValue Mask, SDValue Base,
+                               SDValue Index, SDValue ScaleOp, SDValue Chain,
+                               const X86Subtarget &Subtarget) {
+   SDLoc Loc(Op);
+
+   // Address pieces: scale immediate, zero displacement, no segment register.
+   const auto ScaleImm = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
+   SDValue ScaleC = DAG.getTargetConstant(ScaleImm, Loc, MVT::i8);
+   SDValue ZeroDisp = DAG.getTargetConstant(0, Loc, MVT::i32);
+   SDValue NoSegment = DAG.getRegister(0, MVT::i32);
+
+   const unsigned NumLanes = Index.getSimpleValueType().getVectorNumElements();
+   MVT KMaskVT = MVT::getVectorVT(MVT::i1, NumLanes);
+   SDValue KMask = getMaskNode(Mask, KMaskVT, Subtarget, DAG, Loc);
+
+   // The node yields a mask-typed value followed by the chain.
+   SDVTList ResultTys = DAG.getVTList(KMaskVT, MVT::Other);
+   SDValue NodeOps[] = {Base,      ScaleC, Index, ZeroDisp,
+                        NoSegment, KMask,  Src,   Chain};
+   SDNode *Scatter = DAG.getMachineNode(Opc, Loc, ResultTys, NodeOps);
+   return SDValue(Scatter, 1);
+ }
+
+ /// Build the machine node \p Opc for a masked gather/scatter prefetch and
+ /// return its only result, the chain.
+ ///
+ /// \p ScaleOp must be a constant node; it becomes an i8 target constant. The
+ /// mask is converted to a vXi1 vector with one bit per lane of \p Index.
+ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                                SDValue Mask, SDValue Base, SDValue Index,
+                                SDValue ScaleOp, SDValue Chain,
+                                const X86Subtarget &Subtarget) {
+   SDLoc Loc(Op);
+
+   // Address pieces: scale immediate, zero displacement, no segment register.
+   const auto ScaleImm = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
+   SDValue ScaleC = DAG.getTargetConstant(ScaleImm, Loc, MVT::i8);
+   SDValue ZeroDisp = DAG.getTargetConstant(0, Loc, MVT::i32);
+   SDValue NoSegment = DAG.getRegister(0, MVT::i32);
+
+   const unsigned NumLanes = Index.getSimpleValueType().getVectorNumElements();
+   MVT KMaskVT = MVT::getVectorVT(MVT::i1, NumLanes);
+   SDValue KMask = getMaskNode(Mask, KMaskVT, Subtarget, DAG, Loc);
+
+   SDValue NodeOps[] = {KMask, Base, ScaleC, Index, ZeroDisp, NoSegment, Chain};
+   SDNode *Prefetch = DAG.getMachineNode(Opc, Loc, MVT::Other, NodeOps);
+   return SDValue(Prefetch, 0);
+ }
+
+ /// Lower the builtin intrinsic that returns the value of an extended control
+ /// register (XGETBV).
+ ///
+ /// Appends two entries to \p Results: the 64-bit register value followed by
+ /// the output chain.
+ static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
+                                        SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget,
+                                        SmallVectorImpl<SDValue> &Results) {
+   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+   // The result halves are read from RAX/RDX as i64 on 64-bit subtargets and
+   // from EAX/EDX as i32 otherwise.
+   const bool Is64Bit = Subtarget.is64Bit();
+   const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
+   const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
+   const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;
+
+   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+   // ECX selects which XCR register is read.
+   SDValue SetIndex =
+       DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
+   SDNode *XGetBV = DAG.getMachineNode(X86::XGETBV, DL, NodeTys, SetIndex);
+
+   // The XCR content comes back in EDX:EAX; the copies are glued to the read.
+   SDValue Lo = DAG.getCopyFromReg(SDValue(XGetBV, 0), DL, LoReg, RegVT,
+                                   SDValue(XGetBV, 1));
+   SDValue Hi =
+       DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, RegVT, Lo.getValue(2));
+   SDValue OutChain = Hi.getValue(1);
+
+   SDValue Merged;
+   if (Is64Bit) {
+     // Merge the two 32-bit values into a 64-bit one: Lo | (Hi << 32).
+     SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
+     SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, ShiftAmt);
+     Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
+   } else {
+     // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+     Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+   }
+
+   Results.push_back(Merged);
+   Results.push_back(OutChain);
+ }
+
+ /// Lower the builtin intrinsics that read performance monitor counters
+ /// (x86_rdpmc).
+ ///
+ /// Appends two entries to \p Results: the 64-bit counter value followed by
+ /// the output chain.
+ static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
+                                       SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget,
+                                       SmallVectorImpl<SDValue> &Results) {
+   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+   // The result halves are read from RAX/RDX as i64 on 64-bit subtargets and
+   // from EAX/EDX as i32 otherwise.
+   const bool Is64Bit = Subtarget.is64Bit();
+   const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
+   const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
+   const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;
+
+   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+   // ECX selects the index of the performance counter to read.
+   SDValue SetIndex =
+       DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
+   SDValue Read = DAG.getNode(X86ISD::RDPMC_DAG, DL, NodeTys, SetIndex);
+
+   // The 64-bit counter comes back in EDX:EAX; the copies are glued to the
+   // read.
+   SDValue Lo = DAG.getCopyFromReg(Read, DL, LoReg, RegVT, Read.getValue(1));
+   SDValue Hi =
+       DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, RegVT, Lo.getValue(2));
+   SDValue OutChain = Hi.getValue(1);
+
+   SDValue Counter;
+   if (Is64Bit) {
+     // EAX holds the low-order 32 bits and EDX the supported high-order bits
+     // of the counter: combine them as Lo | (Hi << 32).
+     SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
+     SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, ShiftAmt);
+     Counter = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
+   } else {
+     // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+     Counter = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+   }
+
+   Results.push_back(Counter);
+   Results.push_back(OutChain);
+ }
+
+ /// Lower the builtin intrinsics that read the time stamp counter (x86_rdtsc
+ /// and x86_rdtscp). Also used to custom lower READCYCLECOUNTER nodes.
+ ///
+ /// Appends two entries to \p Results: the 64-bit counter value followed by
+ /// the output chain. For X86ISD::RDTSCP_DAG the content of ECX is additionally
+ /// stored to the address given by operand 2 of \p N.
+ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget,
+                                     SmallVectorImpl<SDValue> &Results) {
+   // The result halves are read from RAX/RDX as i64 on 64-bit subtargets and
+   // from EAX/EDX as i32 otherwise.
+   const bool Is64Bit = Subtarget.is64Bit();
+   const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
+   const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
+   const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;
+
+   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+   SDValue Read = DAG.getNode(Opcode, DL, NodeTys, N->getOperand(0));
+
+   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
+   // EDX:EAX registers: EDX gets the high-order 32 bits of the MSR and EAX the
+   // low-order 32 bits.
+   SDValue Lo = DAG.getCopyFromReg(Read, DL, LoReg, RegVT, Read.getValue(1));
+   SDValue Hi =
+       DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, RegVT, Lo.getValue(2));
+   SDValue OutChain = Hi.getValue(1);
+
+   if (Opcode == X86ISD::RDTSCP_DAG) {
+     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
+     // the ECX register. Add 'ecx' explicitly to the chain.
+     SDValue Aux =
+         DAG.getCopyFromReg(OutChain, DL, X86::ECX, MVT::i32, Hi.getValue(2));
+     // Explicitly store the content of ECX at the location passed in input
+     // to the 'rdtscp' intrinsic.
+     OutChain = DAG.getStore(Aux.getValue(1), DL, Aux, N->getOperand(2),
+                             MachinePointerInfo());
+   }
+
+   SDValue Counter;
+   if (Is64Bit) {
+     // Combine the halves as Lo | (Hi << 32).
+     SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
+     SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, ShiftAmt);
+     Counter = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
+   } else {
+     // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+     Counter = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+   }
+
+   Results.push_back(Counter);
+   Results.push_back(OutChain);
+ }
+
+ /// Custom-lower a READCYCLECOUNTER node as an X86ISD::RDTSC_DAG read and
+ /// return the merged values produced by getReadTimeStampCounter.
+ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+   SDLoc Loc(Op);
+   SmallVector<SDValue, 2> ValueAndChain;
+   getReadTimeStampCounter(Op.getNode(), Loc, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                           ValueAndChain);
+   return DAG.getMergeValues(ValueAndChain, Loc);
+ }
+
+ /// Record the frame index of the EH registration node (operand 2 of \p Op) in
+ /// the function's WinEHFuncInfo and return the incoming chain unchanged.
+ ///
+ /// Reports a fatal error if the function has no WinEH info or if the operand
+ /// is not a frame index.
+ static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
+   MachineFunction &MF = DAG.getMachineFunction();
+   SDValue InChain = Op.getOperand(0);
+   SDValue RegNodeOp = Op.getOperand(2);
+
+   WinEHFuncInfo *WinEH = MF.getWinEHFuncInfo();
+   if (WinEH == nullptr)
+     report_fatal_error("EH registrations only live in functions using WinEH");
+
+   // The operand must be a static alloca, i.e. a frame index node; remember
+   // its index.
+   auto *FrameSlot = dyn_cast<FrameIndexSDNode>(RegNodeOp);
+   if (FrameSlot == nullptr)
+     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
+   WinEH->EHRegNodeFrameIndex = FrameSlot->getIndex();
+
+   // No DAG nodes are created; the intrinsic simply forwards its chain.
+   return InChain;
+ }
+
+ /// Record the frame index of the EH guard slot (operand 2 of \p Op) in the
+ /// function's WinEHFuncInfo and return the incoming chain unchanged.
+ ///
+ /// Reports a fatal error if the function has no WinEH info or if the operand
+ /// is not a frame index.
+ static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
+   MachineFunction &MF = DAG.getMachineFunction();
+   SDValue InChain = Op.getOperand(0);
+   SDValue GuardOp = Op.getOperand(2);
+
+   WinEHFuncInfo *WinEH = MF.getWinEHFuncInfo();
+   if (WinEH == nullptr)
+     report_fatal_error("EHGuard only live in functions using WinEH");
+
+   // The operand must be a static alloca, i.e. a frame index node; remember
+   // its index.
+   auto *FrameSlot = dyn_cast<FrameIndexSDNode>(GuardOp);
+   if (FrameSlot == nullptr)
+     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
+   WinEH->EHGuardFrameIndex = FrameSlot->getIndex();
+
+   // No DAG nodes are created; the intrinsic simply forwards its chain.
+   return InChain;
+ }
+
+ /// Emit a truncating store with signed (\p SignedSat true) or unsigned
+ /// saturation. The node operands are {chain, value, pointer, undef}, where the
+ /// undef has the pointer's value type.
+ static SDValue
+ EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
+                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
+                 SelectionDAG &DAG) {
+   SDVTList ChainOnly = DAG.getVTList(MVT::Other);
+   SDValue UndefOffset = DAG.getUNDEF(Ptr.getValueType());
+   SDValue StoreOps[] = {Chain, Val, Ptr, UndefOffset};
+
+   if (SignedSat)
+     return DAG.getTargetMemSDNode<TruncSStoreSDNode>(ChainOnly, StoreOps, Dl,
+                                                      MemVT, MMO);
+   return DAG.getTargetMemSDNode<TruncUSStoreSDNode>(ChainOnly, StoreOps, Dl,
+                                                     MemVT, MMO);
+ }
+
+ /// Emit a masked truncating store with signed (\p SignedSat true) or unsigned
+ /// saturation. The node operands are {chain, pointer, mask, value}.
+ static SDValue
+ EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
+                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
+                       MachineMemOperand *MMO, SelectionDAG &DAG) {
+   SDVTList ChainOnly = DAG.getVTList(MVT::Other);
+   SDValue StoreOps[] = {Chain, Ptr, Mask, Val};
+
+   if (SignedSat)
+     return DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(ChainOnly, StoreOps,
+                                                            Dl, MemVT, MMO);
+   return DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(ChainOnly, StoreOps,
+                                                           Dl, MemVT, MMO);
+ }
+
+ /// Custom-lower an X86 intrinsic that carries a chain.
+ ///
+ /// Operand 0 of \p Op is the chain and operand 1 the intrinsic ID. Intrinsics
+ /// found by getIntrinsicWithChain are expanded according to their
+ /// IntrinsicData type; a few others are special-cased up front. An empty
+ /// SDValue is returned when nothing is lowered here.
+ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
+   if (!IntrData) {
+     if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
+       return MarkEHRegistrationNode(Op, DAG);
+     if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
+       return MarkEHGuard(Op, DAG);
+     if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
+         IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
+         IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
+         IntNo == llvm::Intrinsic::x86_flags_write_u64) {
+       // We need a frame pointer because this will get lowered to a PUSH/POP
+       // sequence.
+       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+       MFI.setHasCopyImplyingStackAdjustment(true);
+       // Don't do anything here, we will expand these intrinsics out later
+       // during ExpandISelPseudos in EmitInstrWithCustomInserter.
+       return SDValue();
+     }
+     return SDValue();
+   }
+
+   SDLoc dl(Op);
+   switch(IntrData->Type) {
+   default: llvm_unreachable("Unknown Intrinsic Type");
+   case RDSEED:
+   case RDRAND: {
+     // Emit the node with the right value type.
+     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
+
+     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
+     // Otherwise return the value from Rand, which is always 0, cast to i32.
+     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+                       DAG.getConstant(1, dl, Op->getValueType(1)),
+                       DAG.getConstant(X86::COND_B, dl, MVT::i32),
+                       SDValue(Result.getNode(), 1) };
+     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
+                                   Ops);
+
+     // Return { result, isValid, chain }.
+     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+                        SDValue(Result.getNode(), 2));
+   }
+   case GATHER: {
+     // Intrinsic operands after the ID: src, base, index, mask, scale.
+     SDValue Chain = Op.getOperand(0);
+     SDValue Src   = Op.getOperand(2);
+     SDValue Base  = Op.getOperand(3);
+     SDValue Index = Op.getOperand(4);
+     SDValue Mask  = Op.getOperand(5);
+     SDValue Scale = Op.getOperand(6);
+     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
+                          Chain, Subtarget);
+   }
+   case SCATTER: {
+     // Intrinsic operands after the ID: base, mask, index, src, scale.
+     SDValue Chain = Op.getOperand(0);
+     SDValue Base  = Op.getOperand(2);
+     SDValue Mask  = Op.getOperand(3);
+     SDValue Index = Op.getOperand(4);
+     SDValue Src   = Op.getOperand(5);
+     SDValue Scale = Op.getOperand(6);
+     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+                           Scale, Chain, Subtarget);
+   }
+   case PREFETCH: {
+     // The hint (operand 6) picks between the two opcodes of the table entry.
+     SDValue Hint = Op.getOperand(6);
+     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
+     assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
+     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
+     SDValue Chain = Op.getOperand(0);
+     SDValue Mask  = Op.getOperand(2);
+     SDValue Index = Op.getOperand(3);
+     SDValue Base  = Op.getOperand(4);
+     SDValue Scale = Op.getOperand(5);
+     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
+                            Subtarget);
+   }
+   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+   case RDTSC: {
+     SmallVector<SDValue, 2> Results;
+     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
+                             Results);
+     return DAG.getMergeValues(Results, dl);
+   }
+   // Read Performance Monitoring Counters.
+   case RDPMC: {
+     SmallVector<SDValue, 2> Results;
+     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
+     return DAG.getMergeValues(Results, dl);
+   }
+   // Get Extended Control Register.
+   case XGETBV: {
+     SmallVector<SDValue, 2> Results;
+     getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
+     return DAG.getMergeValues(Results, dl);
+   }
+   // XTEST intrinsics.
+   case XTEST: {
+     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
+
+     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
+     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
+     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+                        Ret, SDValue(InTrans.getNode(), 1));
+   }
+   // ADC/ADCX/SBB
+   case ADX: {
+     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
+     // Adding -1 to the incoming carry byte regenerates the carry flag.
+     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
+                                 DAG.getConstant(-1, dl, MVT::i8));
+     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
+                               Op.getOperand(4), GenCF.getValue(1));
+     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
+                                  Op.getOperand(5), MachinePointerInfo());
+     SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
+     SDValue Results[] = { SetCC, Store };
+     return DAG.getMergeValues(Results, dl);
+   }
+   case COMPRESS_TO_MEM: {
+     SDValue Mask = Op.getOperand(4);
+     SDValue DataToCompress = Op.getOperand(3);
+     SDValue Addr = Op.getOperand(2);
+     SDValue Chain = Op.getOperand(0);
+     MVT VT = DataToCompress.getSimpleValueType();
+
+     // cast<> asserts on a type mismatch, which is the check the separate
+     // dyn_cast + assert used to perform.
+     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+     if (isAllOnesConstant(Mask)) // return just a store
+       return DAG.getStore(Chain, dl, DataToCompress, Addr,
+                           MemIntr->getMemOperand());
+
+     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+     return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
+                               MemIntr->getMemOperand(),
+                               false /* truncating */, true /* compressing */);
+   }
+   case TRUNCATE_TO_MEM_VI8:
+   case TRUNCATE_TO_MEM_VI16:
+   case TRUNCATE_TO_MEM_VI32: {
+     SDValue Mask = Op.getOperand(4);
+     SDValue DataToTruncate = Op.getOperand(3);
+     SDValue Addr = Op.getOperand(2);
+     SDValue Chain = Op.getOperand(0);
+
+     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+     EVT MemVT  = MemIntr->getMemoryVT();
+
+     // The table opcode selects plain, signed-saturating or
+     // unsigned-saturating truncation.
+     uint16_t TruncationOp = IntrData->Opc0;
+     switch (TruncationOp) {
+     case X86ISD::VTRUNC: {
+       if (isAllOnesConstant(Mask)) // return just a truncate store
+         return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
+                                  MemIntr->getMemOperand());
+
+       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
+       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+       return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
+                                 MemIntr->getMemOperand(), true /* truncating */);
+     }
+     case X86ISD::VTRUNCUS:
+     case X86ISD::VTRUNCS: {
+       bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
+       if (isAllOnesConstant(Mask))
+         return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
+                                MemIntr->getMemOperand(), DAG);
+
+       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
+       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+       return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
+                                    VMask, MemVT, MemIntr->getMemOperand(), DAG);
+     }
+     default:
+       llvm_unreachable("Unsupported truncstore intrinsic");
+     }
+   }
+
+   case EXPAND_FROM_MEM: {
+     SDValue Mask = Op.getOperand(4);
+     SDValue PassThru = Op.getOperand(3);
+     SDValue Addr = Op.getOperand(2);
+     SDValue Chain = Op.getOperand(0);
+     MVT VT = Op.getSimpleValueType();
+
+     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+     if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
+       return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
+     if (X86::isZeroNode(Mask)) {
+       // No lane is loaded, so every result lane comes from the pass-through
+       // operand. Hand it back together with the incoming chain so that both
+       // results of the intrinsic node (value and chain) are provided.
+       SDValue PassThruAndChain[] = { PassThru, Chain };
+       return DAG.getMergeValues(PassThruAndChain, dl);
+     }
+
+     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+     return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
+                              MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
+                              true /* expanding */);
+   }
+   }
+ }
+
+ /// Lower RETURNADDR: load the return address of the current frame (depth 0)
+ /// or of an outer frame (depth > 0). Marks the return address as taken and
+ /// returns an empty SDValue if the depth argument fails verification.
+ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
+
+   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+     return SDValue();
+
+   const unsigned Depth =
+       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+   SDLoc Loc(Op);
+   EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+   SDValue RetAddrSlot;
+   if (Depth == 0) {
+     // Current frame: the return address has its own frame index.
+     RetAddrSlot = getReturnAddressFrameIndex(DAG);
+   } else {
+     // Outer frame: the return address sits one slot above that frame's
+     // frame address.
+     SDValue OuterFrame = LowerFRAMEADDR(Op, DAG);
+     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+     SDValue SlotOffset = DAG.getConstant(RegInfo->getSlotSize(), Loc, PtrVT);
+     RetAddrSlot = DAG.getNode(ISD::ADD, Loc, PtrVT, OuterFrame, SlotOffset);
+   }
+
+   return DAG.getLoad(PtrVT, Loc, DAG.getEntryNode(), RetAddrSlot,
+                      MachinePointerInfo());
+ }
+
+ /// Lower ADDROFRETURNADDR: mark the return address as taken and return the
+ /// frame index node of the return-address slot.
+ SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+   MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
+   FrameInfo.setReturnAddressIsTaken(true);
+   return getReturnAddressFrameIndex(DAG);
+ }
+
+ /// Lower FRAMEADDR. On targets using Windows CFI a (lazily created, cached)
+ /// fixed frame object is returned and the depth operand is not consulted;
+ /// otherwise the frame register is read and dereferenced once per requested
+ /// level of depth.
+ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+   MachineFunction &MF = DAG.getMachineFunction();
+   MachineFrameInfo &MFI = MF.getFrameInfo();
+   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+   EVT VT = Op.getValueType();
+
+   MFI.setFrameAddressIsTaken(true);
+
+   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+     // Depth > 0 makes no sense on targets which use Windows unwind codes. It
+     // is not possible to crawl up the stack without looking at the unwind codes
+     // simultaneously.
+     int FAIndex = FuncInfo->getFAIndex();
+     if (FAIndex == 0) {
+       // First request in this function: create the fixed frame object used
+       // for the frame address and cache its index.
+       const unsigned SlotSize = RegInfo->getSlotSize();
+       FAIndex = MFI.CreateFixedObject(SlotSize, /*Offset=*/0,
+                                       /*IsImmutable=*/false);
+       FuncInfo->setFAIndex(FAIndex);
+     }
+     return DAG.getFrameIndex(FAIndex, VT);
+   }
+
+   const unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
+   SDLoc Loc(Op); // FIXME probably not meaningful
+   const unsigned Depth =
+       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+           (FrameReg == X86::EBP && VT == MVT::i32)) &&
+          "Invalid Frame Register!");
+
+   // Start from the current frame register and follow the saved frame pointer
+   // once for every level of depth.
+   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), Loc, FrameReg, VT);
+   for (unsigned Level = 0; Level != Depth; ++Level)
+     Frame = DAG.getLoad(VT, Loc, DAG.getEntryNode(), Frame,
+                         MachinePointerInfo());
+   return Frame;
+ }
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+/// Map a register name ("esp", "rsp", "ebp" or "rbp") to its X86 register
+/// number. Any other name is a fatal error, and so is asking for ebp/rbp in
+/// a function that has no frame pointer.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                              SelectionDAG &DAG) const {
+  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+  const MachineFunction &MF = DAG.getMachineFunction();
+
+  const unsigned Reg = StringSwitch<unsigned>(RegName)
+                           .Case("esp", X86::ESP)
+                           .Case("rsp", X86::RSP)
+                           .Case("ebp", X86::EBP)
+                           .Case("rbp", X86::RBP)
+                           .Default(0);
+
+  // Names outside the table map to 0 and are rejected outright.
+  if (Reg == 0)
+    report_fatal_error("Invalid register name global variable");
+
+  const bool IsFramePointer = Reg == X86::EBP || Reg == X86::RBP;
+  if (IsFramePointer && !TFI.hasFP(MF))
+    report_fatal_error("register " + StringRef(RegName) +
+                       " is allocatable: function has no frame pointer");
+
+#ifndef NDEBUG
+  // Debug builds additionally verify that the frame register is EBP or RBP.
+  if (IsFramePointer) {
+    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+    unsigned FrameReg =
+        RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+    assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
+           "Invalid Frame Register!");
+  }
+#endif
+
+  return Reg;
+}
+
+/// Lower FRAME_TO_ARGS_OFFSET to a pointer-sized constant equal to two
+/// register slots.
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
+  return DAG.getIntPtrConstant(2 * SlotSize, dl);
+}
+
+/// Register carrying the exception pointer: RDX/EDX for the CoreCLR
+/// personality, RAX/EAX for every other personality. The 64-bit name is
+/// chosen when the subtarget is 64-bit LP64.
+unsigned X86TargetLowering::getExceptionPointerRegister(
+    const Constant *PersonalityFn) const {
+  const bool IsLP64 = Subtarget.isTarget64BitLP64();
+  const bool IsCoreCLR =
+      classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR;
+
+  if (IsCoreCLR) {
+    if (IsLP64)
+      return X86::RDX;
+    return X86::EDX;
+  }
+
+  if (IsLP64)
+    return X86::RAX;
+  return X86::EAX;
+}
+
+/// Register carrying the exception selector: RDX on 64-bit LP64 subtargets,
+/// EDX otherwise.
+unsigned X86TargetLowering::getExceptionSelectorRegister(
+    const Constant *PersonalityFn) const {
+  // Funclet personalities have no selector; the runtime does the selection.
+  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
+  if (Subtarget.isTarget64BitLP64())
+    return X86::RDX;
+  return X86::EDX;
+}
+
+/// Fixed catch objects are needed exactly when the subtarget is Win64.
+bool X86TargetLowering::needsFixedCatchObjects() const {
+  const bool IsWin64 = Subtarget.isTargetWin64();
+  return IsWin64;
+}
+
+/// Lower EH_RETURN. The handler (operand 2) is stored at
+/// frame register + slot size + offset (operand 1); that same address is
+/// copied into RCX/ECX and passed to the X86ISD::EH_RETURN node.
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue InChain = Op.getOperand(0);
+  SDValue AdjustBy = Op.getOperand(1);
+  SDValue HandlerAddr = Op.getOperand(2);
+  SDLoc dl(Op);
+
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
+          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
+         "Invalid Frame Register!");
+
+  SDValue FramePtr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
+
+  // The address register matches the pointer width.
+  unsigned AddrReg = X86::ECX;
+  if (PtrVT == MVT::i64)
+    AddrReg = X86::RCX;
+
+  // Address = frame pointer + one slot + caller-supplied offset.
+  SDValue SlotSizeC = DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl);
+  SDValue PastSlot = DAG.getNode(ISD::ADD, dl, PtrVT, FramePtr, SlotSizeC);
+  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, PastSlot, AdjustBy);
+
+  SDValue StoreChain =
+      DAG.getStore(InChain, dl, HandlerAddr, StoreAddr, MachinePointerInfo());
+  SDValue CopyChain = DAG.getCopyToReg(StoreChain, dl, AddrReg, StoreAddr);
+
+  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, CopyChain,
+                     DAG.getRegister(AddrReg, PtrVT));
+}
+
+/// Lower EH_SJLJ_SETJMP to X86ISD::EH_SJLJ_SETJMP, which produces an i32
+/// value and a chain from the node's first two operands.
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  // On non-64-bit subtargets the global base register may be needed when the
+  // pseudo is expanded after isel, i.e. after the CGBR pass has run. Request
+  // it now so that pass inserts the defining code; otherwise we could end up
+  // referencing a virtual register that is never defined.
+  if (!Subtarget.is64Bit())
+    (void)Subtarget.getInstrInfo()->getGlobalBaseReg(&DAG.getMachineFunction());
+
+  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
+  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, ResultTys, Op.getOperand(0),
+                     Op.getOperand(1));
+}
+
+/// Lower EH_SJLJ_LONGJMP to X86ISD::EH_SJLJ_LONGJMP, forwarding the node's
+/// first two operands unchanged.
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue First = Op.getOperand(0);
+  SDValue Second = Op.getOperand(1);
+  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, First, Second);
+}
+
+/// Lower EH_SJLJ_SETUP_DISPATCH to X86ISD::EH_SJLJ_SETUP_DISPATCH, forwarding
+/// the node's first operand unchanged.
+SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+                                                       SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue First = Op.getOperand(0);
+  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, First);
+}
+
+/// Lower ADJUST_TRAMPOLINE: no adjustment is made, the node's first operand
+/// is returned as-is.
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
+  SDValue Unadjusted = Op.getOperand(0);
+  return Unadjusted;
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const { // Emits the stores that write the trampoline's machine code at Trmp; returns a TokenFactor over those stores.
+ SDValue Root = Op.getOperand(0); // incoming chain; every store below hangs off it
+ SDValue Trmp = Op.getOperand(1); // trampoline
+ SDValue FPtr = Op.getOperand(2); // nested function
+ SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+ SDLoc dl (Op);
+
+ const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); // IR value used as the base of each store's MachinePointerInfo
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ if (Subtarget.is64Bit()) { // 64-bit layout: six stores at offsets 0, 2, 10, 12, 20 and 22
+ SDValue OutChains[6];
+
+ // Large code-model.
+ const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
+ const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
+
+ const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; // low three bits of the register encoding
+ const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
+
+ const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
+
+ // Load the pointer to the nested function into R11.
+ unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+ SDValue Addr = Trmp;
+ OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), // stored as one i16: REX_WB is the low byte, the opcode the high byte
+ Addr, MachinePointerInfo(TrmpAddr));
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(2, dl, MVT::i64));
+ OutChains[1] =
+ DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), // the immediate of the first mov: the nested function pointer
+ /* Alignment = */ 2);
+
+ // Load the 'nest' parameter value into R10.
+ // R10 is specified in X86CallingConv.td
+ OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(10, dl, MVT::i64));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr, 10));
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(12, dl, MVT::i64));
+ OutChains[3] =
+ DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), // the immediate of the second mov: the 'nest' value
+ /* Alignment = */ 2);
+
+ // Jump to the nested function.
+ OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(20, dl, MVT::i64));
+ OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr, 20));
+
+ unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(22, dl, MVT::i64));
+ OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), // final byte of the trampoline, at offset 22
+ Addr, MachinePointerInfo(TrmpAddr, 22));
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ } else { // 32-bit layout: four stores at offsets 0, 1, 5 and 6
+ const Function *Func =
+ cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); // operand 5 carries the nested function as an IR Function
+ CallingConv::ID CC = Func->getCallingConv();
+ unsigned NestReg; // assigned by every case of the switch that returns normally
+
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::X86_StdCall: {
+ // Pass 'nest' parameter in ECX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::ECX;
+
+ // Check that ECX wasn't needed by an 'inreg' parameter.
+ FunctionType *FTy = Func->getFunctionType();
+ const AttributeSet &Attrs = Func->getAttributes();
+
+ if (!Attrs.isEmpty() && !Func->isVarArg()) {
+ unsigned InRegCount = 0; // number of 32-bit units taken by 'inreg' parameters
+ unsigned Idx = 1; // attribute index, starts at 1 for the first parameter
+
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I, ++Idx)
+ if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
+ auto &DL = DAG.getDataLayout();
+ // FIXME: should only count parameters that are lowered to integers.
+ InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; // size rounded up to whole 32-bit units
+ }
+
+ if (InRegCount > 2) { // more than two units are treated as a conflict with the nest register
+ report_fatal_error("Nest register in use - reduce number of inreg"
+ " parameters!");
+ }
+ }
+ break;
+ }
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::Fast:
+ // Pass 'nest' parameter in EAX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::EAX;
+ break;
+ }
+
+ SDValue OutChains[4];
+ SDValue Addr, Disp;
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(10, dl, MVT::i32)); // Trmp + 10: first address past the jmp (opcode at offset 5, i32 displacement at offset 6)
+ Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); // jump displacement: nested function address minus that end address
+
+ // This is storing the opcode for MOV32ri.
+ const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
+ const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
+ OutChains[0] =
+ DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
+ Trmp, MachinePointerInfo(TrmpAddr));
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(1, dl, MVT::i32));
+ OutChains[1] =
+ DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), // the mov's immediate: the 'nest' value
+ /* Alignment = */ 1);
+
+ const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(5, dl, MVT::i32));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
+ Addr, MachinePointerInfo(TrmpAddr, 5),
+ /* Alignment = */ 1);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(6, dl, MVT::i32));
+ OutChains[3] =
+ DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), // the jmp's 32-bit displacement
+ /* Alignment = */ 1);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ }
+}
+
+/// Lower FLT_ROUNDS_ by storing the FP control word to a stack slot, loading
+/// it back and remapping its rounding-mode bits.
+///
+/// The rounding mode sits in bits 11:10 of FPSR with this encoding:
+///   00 Round to nearest
+///   01 Round to -inf
+///   10 Round to +inf
+///   11 Round to 0
+///
+/// FLT_ROUNDS uses a different numbering:
+///   -1 Undefined
+///    0 Round to 0
+///    1 Round to nearest
+///    2 Round to +inf
+///    3 Round to -inf
+///
+/// The conversion applied is:
+///   (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const unsigned StackAlignment =
+      Subtarget.getFrameLowering()->getStackAlignment();
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+
+  // Reserve a 2-byte stack slot and store the FP control word into it.
+  const int SlotFI =
+      MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
+  SDValue SlotAddr =
+      DAG.getFrameIndex(SlotFI, getPointerTy(DAG.getDataLayout()));
+  MachineMemOperand *StoreMMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SlotFI),
+                              MachineMemOperand::MOStore, 2, 2);
+  SDValue StoreOps[] = { DAG.getEntryNode(), SlotAddr };
+  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+                                          DAG.getVTList(MVT::Other),
+                                          StoreOps, MVT::i16, StoreMMO);
+
+  // Read the control word back from the slot.
+  SDValue CWD =
+      DAG.getLoad(MVT::i16, DL, Chain, SlotAddr, MachinePointerInfo());
+
+  // Bit 11 is moved down to bit 0.
+  SDValue Bit11 = DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
+                              DAG.getConstant(0x800, DL, MVT::i16));
+  SDValue LowBit = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit11,
+                               DAG.getConstant(11, DL, MVT::i8));
+
+  // Bit 10 is moved down to bit 1.
+  SDValue Bit10 = DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
+                              DAG.getConstant(0x400, DL, MVT::i16));
+  SDValue HighBit = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit10,
+                                DAG.getConstant(9, DL, MVT::i8));
+
+  // Combine the two bits, add one and wrap modulo four.
+  SDValue Swapped = DAG.getNode(ISD::OR, DL, MVT::i16, LowBit, HighBit);
+  SDValue PlusOne = DAG.getNode(ISD::ADD, DL, MVT::i16, Swapped,
+                                DAG.getConstant(1, DL, MVT::i16));
+  SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, PlusOne,
+                               DAG.getConstant(3, DL, MVT::i16));
+
+  // Resize the i16 result to the requested type.
+  if (VT.getSizeInBits() < 16)
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, RetVal);
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, RetVal);
+}
+
+/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
+/// instruction.
+//
+// 1. i32/i64 elements in 128/256-bit vectors (native support requires VLX)
+//    are widened to a 512-bit vector.
+// 2. i8/i16 elements are implemented with the dword LZCNT vector instruction
+//    ( sub(trunc(lzcnt(zext32(x)))) ). Vectors with more than 16 elements are
+//    first split; CTLZ is applied to the Lo and Hi parts and the results are
+//    concatenated.
+static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::CTLZ);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  SDValue Src = Op.getOperand(0);
+
+  const bool IsDwordOrQword = EltVT == MVT::i64 || EltVT == MVT::i32;
+  if (IsDwordOrQword) {
+    // Widen to a 512-bit vector, count there, and take the low part back.
+    assert((VT.is256BitVector() || VT.is128BitVector()) &&
+           "Unsupported value type for operation");
+
+    MVT WideVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
+    SDValue Wide = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT,
+                               DAG.getUNDEF(WideVT), Src,
+                               DAG.getIntPtrConstant(0, dl));
+    SDValue WideCtlz = DAG.getNode(ISD::CTLZ, dl, WideVT, Wide);
+
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, WideCtlz,
+                       DAG.getIntPtrConstant(0, dl));
+  }
+
+  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
+         "Unsupported element type");
+
+  if (NumElems > 16) {
+    // Too many elements: split, and let each half be lowered on its own.
+    auto Halves = DAG.SplitVector(Src, dl);
+    MVT HalfVT = MVT::getVectorVT(EltVT, NumElems / 2);
+
+    SDValue LoCtlz = DAG.getNode(ISD::CTLZ, dl, HalfVT, Halves.first);
+    SDValue HiCtlz = DAG.getNode(ISD::CTLZ, dl, HalfVT, Halves.second);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LoCtlz, HiCtlz);
+  }
+
+  MVT DwordVT = MVT::getVectorVT(MVT::i32, NumElems);
+
+  assert((DwordVT.is256BitVector() || DwordVT.is512BitVector()) &&
+         "Unsupported value type for operation");
+
+  // Use the natively supported vector instruction vplzcntd: zero-extend to
+  // i32 elements, count, truncate back, and subtract the leading zeros that
+  // the extension added.
+  SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, dl, DwordVT, Src);
+  SDValue DwordCtlz = DAG.getNode(ISD::CTLZ, dl, DwordVT, Extended);
+  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, dl, VT, DwordCtlz);
+  SDValue ExtraZeros = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
+
+  return DAG.getNode(ISD::SUB, dl, VT, Narrowed, ExtraZeros);
+}
+
+// Lower CTLZ using a PSHUFB lookup table implementation.
+static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ int NumElts = VT.getVectorNumElements();
+ int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8); // total width of VT in bytes
+ MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes); // byte vector of the same total width; widened step by step below
+
+ // Per-nibble leading zero PSHUFB lookup table.
+ const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
+ /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
+ /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
+
+ SmallVector<SDValue, 64> LUTVec;
+ for (int i = 0; i < NumBytes; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); // the 16-entry table repeats every 16 bytes
+ SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
+
+ // Begin by bitcasting the input to byte vector, then split those bytes
+ // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
+ // If the hi input nibble is zero then we add both results together, otherwise
+ // we just take the hi result (by masking the lo result to zero before the
+ // add).
+ SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
+ SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
+
+ SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
+ SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
+ SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask); // low nibble of every byte
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); // high nibble of every byte
+ SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); // per-byte compare: is the high nibble zero?
+
+ Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo); // table lookup indexed by the low nibbles
+ Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi); // table lookup indexed by the high nibbles
+ Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ); // masks the low-nibble count with the compare result
+ SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi); // per-byte leading-zero count
+
+ // Merge result back from vXi8 back to VT, working on the lo/hi halves
+ // of the current vector width in the same way we did for the nibbles.
+ // If the upper half of the input element is zero then add the halves'
+ // leading zero counts together, otherwise just use the upper half's.
+ // Double the width of the result until we are at target width.
+ while (CurrVT != VT) {
+ int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
+ int CurrNumElts = CurrVT.getVectorNumElements();
+ MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
+ MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2); // half as many elements, each twice as wide
+ SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
+
+ // Check if the upper half of the input element is zero.
+ SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), // shadows the outer HiZ; compares the input at the current element width
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ HiZ = DAG.getBitcast(NextVT, HiZ);
+
+ // Move the upper/lower halves to the lower bits as we'll be extending to
+ // NextVT. Mask the lower result to zero if HiZ is true and add the results
+ // together.
+ SDValue ResNext = Res = DAG.getBitcast(NextVT, Res); // Res and ResNext both hold the reinterpreted counts
+ SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift); // upper half's count moved into the low bits
+ SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift); // upper half's compare result moved into the low bits
+ R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1); // lower half's count, masked by that compare result
+ Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
+ CurrVT = NextVT;
+ }
+
+ return Res;
+}
+
+/// Dispatch a vector CTLZ: AVX-512 subtargets use LowerVectorCTLZ_AVX512,
+/// 256-bit vectors without Int256 support are split into two 128-bit CTLZ
+/// nodes, and everything else goes through the PSHUFB lookup-table lowering.
+static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
+                               const X86Subtarget &Subtarget,
+                               SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDValue Src = Op.getOperand(0);
+
+  if (Subtarget.hasAVX512())
+    return LowerVectorCTLZ_AVX512(Op, DAG);
+
+  const bool MustSplit = VT.is256BitVector() && !Subtarget.hasInt256();
+  if (!MustSplit) {
+    assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
+    return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
+  }
+
+  // Decompose the 256-bit op: extract each 128-bit half, perform ctlz on it
+  // and concatenate the two results.
+  const unsigned HalfElems = VT.getVectorNumElements() / 2;
+  SDValue LoHalf = extract128BitVector(Src, 0, DAG, DL);
+  SDValue HiHalf = extract128BitVector(Src, HalfElems, DAG, DL);
+
+  SDValue LoCtlz = DAG.getNode(ISD::CTLZ, DL, LoHalf.getValueType(), LoHalf);
+  SDValue HiCtlz = DAG.getNode(ISD::CTLZ, DL, HiHalf.getValueType(), HiHalf);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoCtlz, HiCtlz);
+}
+
+/// Lower CTLZ. Vector types are handed to LowerVectorCTLZ. Scalars use BSR:
+/// the bit index it produces is XORed with NumBits-1 to obtain the count of
+/// leading zeros. For ISD::CTLZ a CMOV first replaces the BSR result with
+/// 2*NumBits-1 when the source is zero, so the XOR then yields NumBits.
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
+                         SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  unsigned NumBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  const unsigned Opc = Op.getOpcode();
+
+  if (VT.isVector())
+    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
+
+  const bool IsByte = VT == MVT::i8;
+  MVT OpVT = VT;
+  SDValue Src = Op.getOperand(0);
+  if (IsByte) {
+    // There is no i8 bsr, so the operand is zero extended to i32.
+    OpVT = MVT::i32;
+    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Src);
+  }
+
+  // bsr (scan bits in reverse) yields the value and also sets EFLAGS.
+  SDVTList ScanTys = DAG.getVTList(OpVT, MVT::i32);
+  SDValue Result = DAG.getNode(X86ISD::BSR, dl, ScanTys, Src);
+
+  if (Opc == ISD::CTLZ) {
+    // Zero source (bsr sets ZF): substitute the value that the final xor
+    // turns into NumBits.
+    SDValue CMovOps[] = {
+      Result,
+      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
+      DAG.getConstant(X86::COND_E, dl, MVT::i8),
+      Result.getValue(1)
+    };
+    Result = DAG.getNode(X86ISD::CMOV, dl, OpVT, CMovOps);
+  }
+
+  // Convert the bit index into a leading-zero count.
+  SDValue TopIndex = DAG.getConstant(NumBits - 1, dl, OpVT);
+  Result = DAG.getNode(ISD::XOR, dl, OpVT, Result, TopIndex);
+
+  if (!IsByte)
+    return Result;
+  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Result);
+}
+
+/// Lower CTTZ / CTTZ_ZERO_UNDEF. Scalars (CTTZ only) use BSF plus a CMOV
+/// that yields NumBits when the source is zero. Vectors isolate the lowest
+/// set bit and then derive the count from CTLZ (zero-undef form) or CTPOP.
+static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  unsigned NumBits = VT.getScalarSizeInBits();
+  SDLoc dl(Op);
+
+  if (!VT.isVector()) {
+    assert(Op.getOpcode() == ISD::CTTZ &&
+           "Only scalar CTTZ requires custom lowering");
+
+    // bsf (scan bits forward) yields the value and also sets EFLAGS.
+    SDVTList ScanTys = DAG.getVTList(VT, MVT::i32);
+    SDValue Scan = DAG.getNode(X86ISD::BSF, dl, ScanTys, Op.getOperand(0));
+
+    // Zero source (bsf sets ZF): the result is NumBits.
+    SDValue CMovOps[] = {
+      Scan,
+      DAG.getConstant(NumBits, dl, VT),
+      DAG.getConstant(X86::COND_E, dl, MVT::i8),
+      Scan.getValue(1)
+    };
+    return DAG.getNode(X86ISD::CMOV, dl, VT, CMovOps);
+  }
+
+  SDValue Src = Op.getOperand(0);
+  SDValue Zero = DAG.getConstant(0, dl, VT);
+
+  // lsb(x) = (x & -x)
+  SDValue Negated = DAG.getNode(ISD::SUB, dl, VT, Zero, Src);
+  SDValue LSB = DAG.getNode(ISD::AND, dl, VT, Src, Negated);
+
+  if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
+    // cttz_undef(x) = (width - 1) - ctlz(lsb)
+    SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
+    SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
+    return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, LeadingZeros);
+  }
+
+  // cttz(x) = ctpop(lsb - 1)
+  SDValue One = DAG.getConstant(1, dl, VT);
+  SDValue BelowLSB = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
+  return DAG.getNode(ISD::CTPOP, dl, VT, BelowLSB);
+}
+
+/// Split a two-operand 256-bit integer vector operation into the same
+/// operation on the lower and on the upper 128-bit halves of both operands,
+/// then concatenate the two partial results.
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT.is256BitVector() && VT.isInteger() &&
+         "Unsupported value type for operation");
+
+  SDLoc dl(Op);
+  const unsigned Opc = Op.getOpcode();
+  const unsigned HalfElems = VT.getVectorNumElements() / 2;
+
+  // Halves of the left-hand operand.
+  SDValue Left = Op.getOperand(0);
+  SDValue LeftLo = extract128BitVector(Left, 0, DAG, dl);
+  SDValue LeftHi = extract128BitVector(Left, HalfElems, DAG, dl);
+
+  // Halves of the right-hand operand.
+  SDValue Right = Op.getOperand(1);
+  SDValue RightLo = extract128BitVector(Right, 0, DAG, dl);
+  SDValue RightHi = extract128BitVector(Right, HalfElems, DAG, dl);
+
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElems);
+  SDValue ResLo = DAG.getNode(Opc, dl, HalfVT, LeftLo, RightLo);
+  SDValue ResHi = DAG.getNode(Opc, dl, HalfVT, LeftHi, RightHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, ResLo, ResHi);
+}
+
+/// Split a two-operand 512-bit integer vector operation into the same
+/// operation on the lower and on the upper 256-bit halves of both operands,
+/// then concatenate the two partial results.
+static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT.is512BitVector() && VT.isInteger() &&
+         "Unsupported value type for operation");
+
+  SDLoc dl(Op);
+  const unsigned Opc = Op.getOpcode();
+  const unsigned HalfElems = VT.getVectorNumElements() / 2;
+
+  // Halves of the left-hand operand.
+  SDValue Left = Op.getOperand(0);
+  SDValue LeftLo = extract256BitVector(Left, 0, DAG, dl);
+  SDValue LeftHi = extract256BitVector(Left, HalfElems, DAG, dl);
+
+  // Halves of the right-hand operand.
+  SDValue Right = Op.getOperand(1);
+  SDValue RightLo = extract256BitVector(Right, 0, DAG, dl);
+  SDValue RightHi = extract256BitVector(Right, HalfElems, DAG, dl);
+
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElems);
+  SDValue ResLo = DAG.getNode(Opc, dl, HalfVT, LeftLo, RightLo);
+  SDValue ResHi = DAG.getNode(Opc, dl, HalfVT, LeftHi, RightHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, ResLo, ResHi);
+}
+
+/// Lower ADD: an i1 add becomes an XOR of the two operands; the only other
+/// case handled is a 256-bit integer vector, which is split into 128-bit
+/// halves.
+static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  if (VT == MVT::i1) {
+    SDValue A = Op.getOperand(0);
+    SDValue B = Op.getOperand(1);
+    return DAG.getNode(ISD::XOR, SDLoc(Op), VT, A, B);
+  }
+
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+/// Lower SUB: an i1 subtract becomes an XOR of the two operands; the only
+/// other case handled is a 256-bit integer vector, which is split into
+/// 128-bit halves.
+static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  if (VT == MVT::i1) {
+    SDValue A = Op.getOperand(0);
+    SDValue B = Op.getOperand(1);
+    return DAG.getNode(ISD::XOR, SDLoc(Op), VT, A, B);
+  }
+
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+/// Lower a min/max node on a 256-bit integer vector by splitting it into
+/// 128-bit halves; no other type is expected here.
+static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  SDValue Split = Lower256IntArith(Op, DAG);
+  return Split;
+}
+
+static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) { // Custom lowering of MUL for i1, i8-element vectors, v4i32 and 64-bit-element vectors; see the per-path comments below.
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ if (VT == MVT::i1) // an i1 multiply is lowered to an AND of the operands
+ return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntArith(Op, DAG);
+
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
+ // vector pairs, multiply and truncate.
+ if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
+ if (Subtarget.hasInt256()) {
+ // For 512-bit vectors, split into 256-bit vectors to allow the
+ // sign-extension to occur.
+ if (VT == MVT::v64i8)
+ return Lower512IntArith(Op, DAG);
+
+ // For 256-bit vectors, split into 128-bit vectors to allow the
+ // sign-extension to occur. We don't need this on AVX512BW as we can
+ // safely sign-extend to v32i16.
+ if (VT == MVT::v32i8 && !Subtarget.hasBWI())
+ return Lower256IntArith(Op, DAG);
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); // same element count, i16 elements
+ return DAG.getNode(
+ ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::MUL, dl, ExVT,
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
+ }
+
+ assert(VT == MVT::v16i8 &&
+ "Pre-AVX2 support only supports v16i8 multiplication");
+ MVT ExVT = MVT::v8i16;
+
+ // Extract the lo parts and sign extend to i16
+ SDValue ALo, BLo;
+ if (Subtarget.hasSSE41()) {
+ ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
+ BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+ } else {
+ const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, // source bytes 0..7 go to the odd byte positions; even positions are undef
+ -1, 4, -1, 5, -1, 6, -1, 7};
+ ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ ALo = DAG.getBitcast(ExVT, ALo);
+ BLo = DAG.getBitcast(ExVT, BLo);
+ ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); // arithmetic shift right by 8 within each i16 lane
+ BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Extract the hi parts and sign extend to i16
+ SDValue AHi, BHi;
+ if (Subtarget.hasSSE41()) {
+ const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, // move source bytes 8..15 into the low eight positions before extending
+ -1, -1, -1, -1, -1, -1, -1, -1};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
+ BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+ } else {
+ const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, // same scheme as the lo parts, using source bytes 8..15
+ -1, 12, -1, 13, -1, 14, -1, 15};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getBitcast(ExVT, AHi);
+ BHi = DAG.getBitcast(ExVT, BHi);
+ AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
+ BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Multiply, mask the lower 8bits of the lo/hi results and pack
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); // keep only the low 8 bits of each i16 product
+ RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ }
+
+ // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+ if (VT == MVT::v4i32) {
+ assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
+ "Should not custom lower when pmuldq is available!"); // NOTE(review): the native v4i32 multiply meant here is presumably pmulld, not pmuldq - confirm before rewording the message
+
+ // Extract the odd parts.
+ static const int UnpackMask[] = { 1, -1, 3, -1 }; // elements 1 and 3 move to positions 0 and 2
+ SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+ SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+ // Multiply the even parts.
+ SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+ // Now multiply odd parts.
+ SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+ Evens = DAG.getBitcast(VT, Evens);
+ Odds = DAG.getBitcast(VT, Odds);
+
+ // Merge the two vectors back together with a shuffle. This expands into 2
+ // shuffles.
+ static const int ShufMask[] = { 0, 4, 2, 6 }; // elements 0 and 2 of Evens interleaved with elements 0 and 2 of Odds
+ return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+ }
+
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
+ "Only know how to lower V2I64/V4I64/V8I64 multiply");
+
+ // 32-bit vector types used for MULDQ/MULUDQ.
+ MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
+
+ // MULDQ returns the 64-bit result of the signed multiplication of the lower
+ // 32-bits. We can lower with this if the sign bits stretch that far.
+ if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
+ DAG.ComputeNumSignBits(B) > 32) {
+ return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
+ DAG.getBitcast(MulVT, B));
+ }
+
+ // Ahi = psrlqi(a, 32);
+ // Bhi = psrlqi(b, 32);
+ //
+ // AloBlo = pmuludq(a, b);
+ // AloBhi = pmuludq(a, Bhi);
+ // AhiBlo = pmuludq(Ahi, b);
+ //
+ // Hi = psllqi(AloBhi + AhiBlo, 32);
+ // return AloBlo + Hi;
+ APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
+ bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask); // low 32 bits of every element known to be zero
+ bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
+
+ APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
+ bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask); // high 32 bits of every element known to be zero
+ bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
+
+ // Bit cast to 32-bit vectors for MULUDQ.
+ SDValue Alo = DAG.getBitcast(MulVT, A);
+ SDValue Blo = DAG.getBitcast(MulVT, B);
+
+ SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+
+ // Only multiply lo/hi halves that aren't known to be zero.
+ SDValue AloBlo = Zero;
+ if (!ALoIsZero && !BLoIsZero)
+ AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
+
+ SDValue AloBhi = Zero;
+ if (!ALoIsZero && !BHiIsZero) {
+ SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); // shift B's high halves down into the low 32 bits
+ Bhi = DAG.getBitcast(MulVT, Bhi);
+ AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
+ }
+
+ SDValue AhiBlo = Zero;
+ if (!AHiIsZero && !BLoIsZero) {
+ SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); // shift A's high halves down into the low 32 bits
+ Ahi = DAG.getBitcast(MulVT, Ahi);
+ AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
+ }
+
+ SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); // sum of the two cross products
+ Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG); // moved up into the high 32 bits
+
+ return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
+}
+
+static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) { // Custom lowering for a vector multiply-high node.
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ /* Overview: ISD::MULHU is handled as unsigned (VZEXT / logical shift); any
+  * other opcode reaching this function is handled as signed (VSEXT /
+  * arithmetic shift). The i8 elements are widened to i16, multiplied, the
+  * products are shifted right by 8 and packed back to VT with
+  * X86ISD::PACKUS. 256-bit vectors on subtargets without hasInt256() are
+  * delegated to Lower256IntArith instead. */
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntArith(Op, DAG);
+
+ // Only i8 vectors should need custom lowering after this.
+ assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
+ "Unsupported vector type");
+
+ // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
+ // logical shift down the upper half and pack back to i8.
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
+ // and then ashr/lshr the upper bits down to the lower bits before multiply.
+ unsigned Opcode = Op.getOpcode();
+ unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
+ unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
+
+ // AVX2 implementations - extend xmm subvectors to ymm.
+ if (Subtarget.hasInt256()) {
+ SDValue Lo = DAG.getIntPtrConstant(0, dl); // subvector index of the low half
+ SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); // ... and of the high half
+
+ if (VT == MVT::v32i8) {
+ SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
+ SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
+ SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
+ SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
+ ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
+ BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
+ AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
+ BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
+ Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16, // Lo/Hi are reused for the shifted products
+ DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
+ DAG.getConstant(8, dl, MVT::v16i16));
+ Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
+ DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
+ DAG.getConstant(8, dl, MVT::v16i16));
+ // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
+ // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
+ const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
+ 16, 17, 18, 19, 20, 21, 22, 23};
+ const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ 24, 25, 26, 27, 28, 29, 30, 31};
+ return DAG.getNode(X86ISD::PACKUS, dl, VT,
+ DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
+ DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
+ }
+ // v16i8 with hasInt256(): extend the whole vector to v16i16 in one step.
+ SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
+ SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
+ SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
+ DAG.getConstant(8, dl, MVT::v16i16));
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+ }
+
+ assert(VT == MVT::v16i8 &&
+ "Pre-AVX2 support only supports v16i8 multiplication");
+ MVT ExVT = MVT::v8i16;
+
+ // Extract the lo parts and zero/sign extend to i16.
+ SDValue ALo, BLo;
+ if (Subtarget.hasSSE41()) {
+ ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
+ BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+ } else { // Pre-SSE41: move source bytes 0..7 into the odd byte positions, then shift by 8.
+ const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
+ -1, 4, -1, 5, -1, 6, -1, 7};
+ ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ ALo = DAG.getBitcast(ExVT, ALo);
+ BLo = DAG.getBitcast(ExVT, BLo);
+ ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
+ BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Extract the hi parts and zero/sign extend to i16.
+ SDValue AHi, BHi;
+ if (Subtarget.hasSSE41()) { // Move bytes 8..15 to the bottom first, then extend them.
+ const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
+ BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+ } else { // Pre-SSE41: same byte placement as for the lo parts, using source bytes 8..15.
+ const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
+ -1, 12, -1, 13, -1, 14, -1, 15};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getBitcast(ExVT, AHi);
+ BHi = DAG.getBitcast(ExVT, BHi);
+ AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
+ BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
+ // pack back to v16i8.
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
+ RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+}
+
+// Lowers a 128-bit integer divide/remainder node on Win64 to a runtime
+// library call.
+//
+// Every 128-bit operand is stored to its own 16-byte aligned stack temporary
+// and handed to the callee by pointer. The call is set up with a v2i64 result
+// type, and that result is bitcast back to the node's 128-bit integer type.
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget.isTargetWin64() && "Unexpected target");
+  EVT ResultVT = Op.getValueType();
+  assert(ResultVT.isInteger() && ResultVT.getSizeInBits() == 128 &&
+         "Unexpected return type for lowering");
+
+  // Select the libcall and its signedness from the node's opcode.
+  RTLIB::Libcall Libcall;
+  bool Signed;
+  switch (Op->getOpcode()) {
+  case ISD::SDIV:
+    Libcall = RTLIB::SDIV_I128;
+    Signed = true;
+    break;
+  case ISD::UDIV:
+    Libcall = RTLIB::UDIV_I128;
+    Signed = false;
+    break;
+  case ISD::SREM:
+    Libcall = RTLIB::SREM_I128;
+    Signed = true;
+    break;
+  case ISD::UREM:
+    Libcall = RTLIB::UREM_I128;
+    Signed = false;
+    break;
+  case ISD::SDIVREM:
+    Libcall = RTLIB::SDIVREM_I128;
+    Signed = true;
+    break;
+  case ISD::UDIVREM:
+    Libcall = RTLIB::UDIVREM_I128;
+    Signed = false;
+    break;
+  default:
+    llvm_unreachable("Unexpected request for libcall!");
+  }
+
+  SDLoc Loc(Op);
+  SDValue Chain = DAG.getEntryNode();
+
+  // Spill each operand and record a pointer argument for it; the stores are
+  // threaded onto a single chain that the call will depend on.
+  TargetLowering::ArgListTy CallArgs;
+  const unsigned NumOperands = Op->getNumOperands();
+  for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+    SDValue Operand = Op->getOperand(Idx);
+    EVT OperandVT = Operand.getValueType();
+    assert(OperandVT.isInteger() && OperandVT.getSizeInBits() == 128 &&
+           "Unexpected argument type for lowering");
+
+    SDValue Slot = DAG.CreateStackTemporary(OperandVT, 16);
+    Chain = DAG.getStore(Chain, Loc, Operand, Slot, MachinePointerInfo(),
+                         /* Alignment = */ 16);
+
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = Slot;
+    Entry.Ty = PointerType::get(OperandVT.getTypeForEVT(*DAG.getContext()), 0);
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    CallArgs.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(Libcall),
+                                         getPointerTy(DAG.getDataLayout()));
+
+  // The call is described as returning v2i64 in a register.
+  Type *RetTy = static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext());
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(Loc)
+      .setChain(Chain)
+      .setCallee(getLibcallCallingConv(Libcall), RetTy, Callee,
+                 std::move(CallArgs))
+      .setInRegister()
+      .setSExtResult(Signed)
+      .setZExtResult(!Signed);
+
+  // Only the returned value (first of the pair) is used here.
+  return DAG.getBitcast(ResultVT, LowerCallTo(CLI).first);
+}
+/* Custom lowering for a vector [SU]MUL_LOHI node.
+ *
+ * Returns a merged two-result value: the low halves of the products first,
+ * the high halves second. 256-bit vectors on subtargets without hasInt256()
+ * are split into two 128-bit halves, lowered with the same opcode and
+ * concatenated. Otherwise (VT asserted to be v4i32 or v8i32) the even and odd
+ * lanes are multiplied separately with PMULUDQ/PMULDQ and the 64-bit products
+ * are shuffled back into lane order. */
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+ MVT VT = Op0.getSimpleValueType();
+ SDLoc dl(Op);
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ unsigned Opcode = Op.getOpcode();
+ unsigned NumElems = VT.getVectorNumElements();
+ MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
+ SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
+ SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
+ SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
+ SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
+ SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
+ SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
+ SDValue Ops[] = { // result 0 and result 1 of the halves, each re-joined to VT
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
+ };
+ return DAG.getMergeValues(Ops, dl);
+ }
+
+ assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget.hasInt256()));
+
+ // PMULxD operations multiply each even value (starting at 0) of LHS with
+ // the related value of RHS and produce a widened result.
+ // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+ // => <2 x i64> <ae|cg>
+ //
+ // In other words, to have all the results, we need to perform two PMULxD:
+ // 1. one with the even values.
+ // 2. one with the odd values.
+ // To achieve #2, we need to place the odd values at an even position.
+ //
+ // Place the odd value at an even position (basically, shift all values 1
+ // step to the left):
+ const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; // only the first NumElements entries are used
+ // <a|b|c|d> => <b|undef|d|undef>
+ SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
+ makeArrayRef(&Mask[0], VT.getVectorNumElements()));
+ // <e|f|g|h> => <f|undef|h|undef>
+ SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
+ makeArrayRef(&Mask[0], VT.getVectorNumElements()));
+
+ // Emit two multiplies, one for the even lanes and one for the odd lanes.
+ // The signed PMULDQ is only used when SSE41 is available.
+ MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+ bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
+ unsigned Opcode =
+ (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+ // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+ // => <2 x i64> <ae|cg>
+ SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+ // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+ // => <2 x i64> <bf|dh>
+ SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+
+ // Shuffle it back into the right order.
+ SDValue Highs, Lows;
+ if (VT == MVT::v8i32) {
+ const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+ Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+ Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ } else {
+ const int HighMask[] = {1, 5, 3, 7};
+ Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 4, 2, 6};
+ Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ }
+
+ // If we have a signed multiply but no PMULDQ, fix up the high parts of an
+ // unsigned multiply: Highs -= ((Op0 >>s 31) & Op1) + ((Op1 >>s 31) & Op0).
+ if (IsSigned && !Subtarget.hasSSE41()) {
+ SDValue ShAmt = DAG.getConstant(
+ 31, dl,
+ DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
+ SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+ SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+ SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+ Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
+ }
+
+ // The first result of MUL_LOHI is actually the low value, followed by the
+ // high value.
+ SDValue Ops[] = {Lows, Highs};
+ return DAG.getMergeValues(Ops, dl);
+}
+
+// Reports whether the subtarget natively supports the shift-by-immediate form
+// for vector type VT. Opcode picks the rule set: ISD::SRA uses the
+// arithmetic-shift rules, every other opcode the logical-shift rules.
+static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
+                                        unsigned Opcode) {
+  const unsigned EltBits = VT.getScalarSizeInBits();
+
+  // Elements narrower than 16 bits are never supported.
+  if (EltBits < 16)
+    return false;
+
+  // 512-bit vectors: elements wider than 16 bits always qualify, 16-bit
+  // elements only with BWI.
+  if (VT.is512BitVector()) {
+    if (EltBits > 16 || Subtarget.hasBWI())
+      return true;
+  }
+
+  // Logical shifts: any 128-bit vector, or a 256-bit vector with Int256.
+  bool HasLogical = VT.is128BitVector();
+  if (!HasLogical)
+    HasLogical = VT.is256BitVector() && Subtarget.hasInt256();
+
+  if (Opcode != ISD::SRA)
+    return HasLogical;
+
+  // Arithmetic shifts have the same baseline, but v2i64/v4i64 additionally
+  // require VLX.
+  if (!HasLogical)
+    return false;
+  if (Subtarget.hasVLX())
+    return true;
+  return VT != MVT::v2i64 && VT != MVT::v4i64;
+}
+
+// Support query for shifts whose amount is a run-time value shared by all
+// vector lanes. Those instructions are defined together with the
+// shift-by-immediate forms, so the answer is the same as for an immediate.
+static bool SupportedVectorShiftWithBaseAmnt(MVT VT,
+                                             const X86Subtarget &Subtarget,
+                                             unsigned Opcode) {
+  const bool SameAsImmediate =
+      SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
+  return SameAsImmediate;
+}
+
+// Reports whether the subtarget natively supports the per-lane variable shift
+// form for vector type VT. Opcode picks the rule set: ISD::SRA uses the
+// arithmetic-shift rules, every other opcode the logical-shift rules.
+static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
+                                    unsigned Opcode) {
+  const unsigned EltBits = VT.getScalarSizeInBits();
+
+  // Nothing is available without Int256 or for elements below 16 bits.
+  if (EltBits < 16 || !Subtarget.hasInt256())
+    return false;
+
+  // vXi16 is supported only with BWI.
+  if (EltBits == 16 && !Subtarget.hasBWI())
+    return false;
+
+  // 512-bit vectors, and any vector once VLX is present, are fully covered.
+  if (Subtarget.hasVLX() || VT.is512BitVector())
+    return true;
+
+  const bool HasLogical = VT.is128BitVector() || VT.is256BitVector();
+  if (Opcode != ISD::SRA)
+    return HasLogical;
+
+  // Arithmetic shifts exclude the 64-bit element types here.
+  const bool Is64BitElt = VT == MVT::v2i64 || VT == MVT::v4i64;
+  return HasLogical && !Is64BitElt;
+}
+
+// Try to lower a vector shift whose amount is the same constant in every lane.
+//
+// Two shapes of shift amount are recognised:
+//  1. a BUILD_VECTOR that is a constant splat;
+//  2. in 32-bit mode (without XOP), a 64-bit element amount that appears as
+//     BITCAST(BUILD_VECTOR) of narrower constants, optionally behind a splat
+//     shuffle.
+//
+// Returns the lowered node, UNDEF when a constant splat amount is not smaller
+// than the element width (such a shift has no defined result), or an empty
+// SDValue when no lowering applies and the caller has to try something else.
+static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
+                                         const X86Subtarget &Subtarget) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+
+  // X86 shift-by-immediate opcode matching the generic shift opcode.
+  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
+    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+
+  // Emulates an arithmetic right shift of 64-bit elements by ShiftAmt with
+  // 32-bit shifts on the bitcast vector followed by a shuffle.
+  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
+    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
+    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+    SDValue Ex = DAG.getBitcast(ExVT, R);
+
+    if (ShiftAmt >= 32) {
+      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+      SDValue Upper =
+          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
+      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt - 32, DAG);
+      if (VT == MVT::v2i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
+      if (VT == MVT::v4i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+                                  {9, 1, 11, 3, 13, 5, 15, 7});
+    } else {
+      // SRA upper i32, SRL whole i64 and select lower i32.
+      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt, DAG);
+      SDValue Lower =
+          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
+      Lower = DAG.getBitcast(ExVT, Lower);
+      if (VT == MVT::v2i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
+      if (VT == MVT::v4i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+                                  {8, 1, 10, 3, 12, 5, 14, 7});
+    }
+    return DAG.getBitcast(VT, Ex);
+  };
+
+  // Optimize shl/srl/sra with constant shift amount.
+  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
+      uint64_t ShiftAmt = ShiftConst->getZExtValue();
+
+      // A shift by at least the element width has no defined result. Bail out
+      // with UNDEF so that the host-side mask computations further down
+      // (-1U << ShiftAmt, 128 >> ShiftAmt) can never shift by an amount that
+      // is itself undefined behaviour in C++.
+      if (ShiftAmt >= VT.getScalarSizeInBits())
+        return DAG.getUNDEF(VT);
+
+      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+      // i64 SRA needs to be performed as partial shifts.
+      if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
+          Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
+        return ArithmeticShiftRight64(ShiftAmt);
+
+      if (VT == MVT::v16i8 ||
+          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
+          VT == MVT::v64i8) {
+        unsigned NumElts = VT.getVectorNumElements();
+        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+
+        // Simple i8 add case
+        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
+          return DAG.getNode(ISD::ADD, dl, VT, R, R);
+
+        // ashr(R, 7) === cmp_slt(R, 0)
+        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
+          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+          if (VT.is512BitVector()) {
+            assert(VT == MVT::v64i8 && "Unexpected element type!");
+            SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
+            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
+          }
+          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+        }
+
+        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
+        if (VT == MVT::v16i8 && Subtarget.hasXOP())
+          return SDValue();
+
+        if (Op.getOpcode() == ISD::SHL) {
+          // Make a large shift.
+          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
+                                                   R, ShiftAmt, DAG);
+          SHL = DAG.getBitcast(VT, SHL);
+          // Zero out the rightmost bits.
+          return DAG.getNode(ISD::AND, dl, VT, SHL,
+                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
+        }
+        if (Op.getOpcode() == ISD::SRL) {
+          // Make a large shift.
+          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
+                                                   R, ShiftAmt, DAG);
+          SRL = DAG.getBitcast(VT, SRL);
+          // Zero out the leftmost bits.
+          return DAG.getNode(ISD::AND, dl, VT, SRL,
+                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
+        }
+        if (Op.getOpcode() == ISD::SRA) {
+          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
+          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+
+          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
+          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+          return Res;
+        }
+        llvm_unreachable("Unknown shift opcode.");
+      }
+    }
+  }
+
+  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+  if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
+      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
+       (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
+
+    // Peek through any splat that was introduced for i64 shift vectorization.
+    int SplatIndex = -1;
+    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
+      if (SVN->isSplat()) {
+        SplatIndex = SVN->getSplatIndex();
+        Amt = Amt.getOperand(0);
+        assert(SplatIndex < (int)VT.getVectorNumElements() &&
+               "Splat shuffle referencing second operand");
+      }
+
+    if (Amt.getOpcode() != ISD::BITCAST ||
+        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
+      return SDValue();
+
+    // Reassemble the 64-bit amount from Ratio narrower BUILD_VECTOR operands.
+    Amt = Amt.getOperand(0);
+    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
+                     VT.getVectorNumElements();
+    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
+    uint64_t ShiftAmt = 0;
+    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
+    for (unsigned i = 0; i != Ratio; ++i) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
+      if (!C)
+        return SDValue();
+      // 6 == Log2(64)
+      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
+    }
+
+    // Check remaining shift amounts (if not a splat).
+    if (SplatIndex < 0) {
+      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+        uint64_t ShAmt = 0;
+        for (unsigned j = 0; j != Ratio; ++j) {
+          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
+          if (!C)
+            return SDValue();
+          // 6 == Log2(64)
+          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
+        }
+        if (ShAmt != ShiftAmt)
+          return SDValue();
+      }
+    }
+
+    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+    if (Op.getOpcode() == ISD::SRA)
+      return ArithmeticShiftRight64(ShiftAmt);
+  }
+
+  return SDValue();
+}
+/* Try to lower a vector shift whose amount is the same scalar in every lane.
+ *
+ * When the subtarget supports shifting by a common base amount, the amount is
+ * looked for in a BUILD_VECTOR splat or a splat shuffle (optionally behind an
+ * EXTRACT_SUBVECTOR); if found, the shift is emitted via getTargetVShiftNode.
+ * A second path handles v2i64 amounts that appear as BITCAST(BUILD_VECTOR) in
+ * 32-bit mode. Returns an empty SDValue when neither path applies. */
+static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : // shift-by-immediate opcodes
+ (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+
+ unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL : // shift-by-vector-amount opcodes
+ (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
+
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
+ SDValue BaseShAmt; // stays empty unless a common shift amount is found
+ MVT EltVT = VT.getVectorElementType();
+
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
+ // Check if this build_vector node is doing a splat.
+ // If so, then set BaseShAmt equal to the splat value.
+ BaseShAmt = BV->getSplatValue();
+ if (BaseShAmt && BaseShAmt.isUndef())
+ BaseShAmt = SDValue();
+ } else {
+ if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ Amt = Amt.getOperand(0);
+
+ ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
+ if (SVN && SVN->isSplat()) {
+ unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
+ SDValue InVec = Amt.getOperand(0);
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
+ "Unexpected shuffle index found!");
+ BaseShAmt = InVec.getOperand(SplatIdx);
+ } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
+ if (C->getZExtValue() == SplatIdx)
+ BaseShAmt = InVec.getOperand(1);
+ }
+ }
+
+ if (!BaseShAmt)
+ // Avoid introducing an extract element from a shuffle: extract the
+ // splatted lane from the shuffle's first input instead.
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
+ DAG.getIntPtrConstant(SplatIdx, dl));
+ }
+ }
+
+ if (BaseShAmt.getNode()) {
+ assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
+ if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) // between 32 and 64 bits: widen to i64
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
+ else if (EltVT.bitsLT(MVT::i32)) // narrower than 32 bits: widen to i32
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
+
+ return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
+ }
+ }
+
+ // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
+ Amt.getOpcode() == ISD::BITCAST &&
+ Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ Amt = Amt.getOperand(0);
+ unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
+ VT.getVectorNumElements();
+ std::vector<SDValue> Vals(Ratio); // the first group of Ratio operands, used as the reference
+ for (unsigned i = 0; i != Ratio; ++i)
+ Vals[i] = Amt.getOperand(i);
+ for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { // every later group must match it
+ for (unsigned j = 0; j != Ratio; ++j)
+ if (Vals[j] != Amt.getOperand(i + j))
+ return SDValue();
+ }
+
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
+ return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
+ }
+ return SDValue();
+}
+
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+ bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+
+ assert(VT.isVector() && "Custom lowering only for vector shifts!");
+ assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
+
+ if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
+ return V;
+
+ if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
+ return Op;
+
+ // XOP has 128-bit variable logical/arithmetic shifts.
+ // +ve/-ve Amt = shift left/right.
+ if (Subtarget.hasXOP() &&
+ (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v8i16 || VT == MVT::v16i8)) {
+ if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
+ SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+ Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
+ }
+ if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
+ if (Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
+ }
+
+ // 2i64 vector logical shifts can efficiently avoid scalarization - do the
+ // shifts per-lane and then shuffle the partial results back together.
+ if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
+ // Splat the shift amounts so the scalar shifts above will catch it.
+ SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+ SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+ SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
+ SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
+ return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
+ }
+
+ // i64 vector arithmetic shift can be emulated with the transform:
+ // M = lshr(SIGN_BIT, Amt)
+ // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
+ if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
+ Op.getOpcode() == ISD::SRA) {
+ SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
+ SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
+ R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ R = DAG.getNode(ISD::XOR, dl, VT, R, M);
+ R = DAG.getNode(ISD::SUB, dl, VT, R, M);
+ return R;
+ }
+
+ // If possible, lower this packed shift into a vector multiply instead of
+ // expanding it into a sequence of scalar shifts.
+ // Do this only if the vector shift count is a constant build_vector.
+ if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
+ (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (Subtarget.hasInt256() && VT == MVT::v16i16))) {
+ SmallVector<SDValue, 8> Elts;
+ MVT SVT = VT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ APInt One(SVTBits, 1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ for (unsigned i=0; i !=NumElems; ++i) {
+ SDValue Op = Amt->getOperand(i);
+ if (Op->isUndef()) {
+ Elts.push_back(Op);
+ continue;
+ }
+
+ ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+ APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+ if (ShAmt >= SVTBits) {
+ Elts.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
+ }
+ SDValue BV = DAG.getBuildVector(VT, dl, Elts);
+ return DAG.getNode(ISD::MUL, dl, VT, R, BV);
+ }
+
+ // Lower SHL with variable shift amount.
+ if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
+ Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
+
+ Op = DAG.getNode(ISD::ADD, dl, VT, Op,
+ DAG.getConstant(0x3f800000U, dl, VT));
+ Op = DAG.getBitcast(MVT::v4f32, Op);
+ Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+ return DAG.getNode(ISD::MUL, dl, VT, Op, R);
+ }
+
+ // If possible, lower this shift as a sequence of two shifts by
+ // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
+ // Example:
+ // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
+ //
+ // Could be rewritten as:
+ // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
+ //
+ // The advantage is that the two shifts from the example would be
+ // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
+ // the vector shift into four scalar shifts plus four pairs of vector
+ // insert/extract.
+ if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
+ unsigned TargetOpcode = X86ISD::MOVSS;
+ bool CanBeSimplified;
+ // The splat value for the first packed shift (the 'X' from the example).
+ SDValue Amt1 = Amt->getOperand(0);
+ // The splat value for the second packed shift (the 'Y' from the example).
+ SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
+
+ // See if it is possible to replace this node with a sequence of
+ // two shifts followed by a MOVSS/MOVSD/PBLEND.
+ if (VT == MVT::v4i32) {
+ // Check if it is legal to use a MOVSS.
+ CanBeSimplified = Amt2 == Amt->getOperand(2) &&
+ Amt2 == Amt->getOperand(3);
+ if (!CanBeSimplified) {
+ // Otherwise, check if we can still simplify this node using a MOVSD.
+ CanBeSimplified = Amt1 == Amt->getOperand(1) &&
+ Amt->getOperand(2) == Amt->getOperand(3);
+ TargetOpcode = X86ISD::MOVSD;
+ Amt2 = Amt->getOperand(2);
+ }
+ } else {
+ // Do similar checks for the case where the machine value type
+ // is MVT::v8i16.
+ CanBeSimplified = Amt1 == Amt->getOperand(1);
+ for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
+ CanBeSimplified = Amt2 == Amt->getOperand(i);
+
+ if (!CanBeSimplified) {
+ TargetOpcode = X86ISD::MOVSD;
+ CanBeSimplified = true;
+ Amt2 = Amt->getOperand(4);
+ for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
+ CanBeSimplified = Amt1 == Amt->getOperand(i);
+ for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
+ CanBeSimplified = Amt2 == Amt->getOperand(j);
+ }
+ }
+
+ if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
+ isa<ConstantSDNode>(Amt2)) {
+ // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
+ MVT CastVT = MVT::v4i32;
+ SDValue Splat1 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
+ SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
+ SDValue Splat2 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
+ SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
+ SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
+ SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
+ if (TargetOpcode == X86ISD::MOVSD)
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+ BitCast2, {0, 1, 6, 7}));
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+ BitCast2, {0, 5, 6, 7}));
+ }
+ }
+
+ // v4i32 Non Uniform Shifts.
+ // If the shift amount is constant we can shift each lane using the SSE2
+ // immediate shifts, else we need to zero-extend each lane to the lower i64
+ // and shift using the SSE2 variable shifts.
+ // The separate results can then be blended together.
+ if (VT == MVT::v4i32) {
+ unsigned Opc = Op.getOpcode();
+ SDValue Amt0, Amt1, Amt2, Amt3;
+ if (ConstantAmt) {
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
+ } else {
+ // ISD::SHL is handled above but we include it here for completeness.
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unknown target vector shift node");
+ case ISD::SHL:
+ Opc = X86ISD::VSHL;
+ break;
+ case ISD::SRL:
+ Opc = X86ISD::VSRL;
+ break;
+ case ISD::SRA:
+ Opc = X86ISD::VSRA;
+ break;
+ }
+ // The SSE2 shifts use the lower i64 as the same shift amount for
+ // all lanes and the upper i64 is ignored. These shuffle masks
+ // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+ }
+
+ SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
+ SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
+ SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
+ SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
+ SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
+ SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
+ return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+ }
+
+ if (VT == MVT::v16i8 ||
+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+ unsigned ShiftOpcode = Op->getOpcode();
+
+ auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
+ // On SSE41 targets we make use of the fact that VSELECT lowers
+ // to PBLENDVB which selects bytes based just on the sign bit.
+ if (Subtarget.hasSSE41()) {
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ }
+ // On pre-SSE41 targets we test for the sign bit by comparing to
+ // zero - a negative value will set all bits of the lanes to true
+ // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
+ SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
+ SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
+ return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
+ };
+
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
+ // We can safely do this using i16 shifts as we're only interested in
+ // the 3 lower bits of each byte.
+ Amt = DAG.getBitcast(ExtVT, Amt);
+ Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
+ Amt = DAG.getBitcast(VT, Amt);
+
+ if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
+ // r = VSELECT(r, shift(r, 4), a);
+ SDValue M =
+ DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // r = VSELECT(r, shift(r, 2), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // return VSELECT(r, shift(r, 1), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
+ R = SignBitSelect(VT, Amt, M, R);
+ return R;
+ }
+
+ if (Op->getOpcode() == ISD::SRA) {
+ // For SRA we need to unpack each byte to the higher byte of a i16 vector
+ // so we can correctly sign extend. We don't care what happens to the
+ // lower byte.
+ SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
+ SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
+ SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
+ SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
+ ALo = DAG.getBitcast(ExtVT, ALo);
+ AHi = DAG.getBitcast(ExtVT, AHi);
+ RLo = DAG.getBitcast(ExtVT, RLo);
+ RHi = DAG.getBitcast(ExtVT, RHi);
+
+ // r = VSELECT(r, shift(r, 4), a);
+ SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+ DAG.getConstant(4, dl, ExtVT));
+ SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+ DAG.getConstant(4, dl, ExtVT));
+ RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+ RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+ // a += a
+ ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+ AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+ // r = VSELECT(r, shift(r, 2), a);
+ MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+ DAG.getConstant(2, dl, ExtVT));
+ MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+ DAG.getConstant(2, dl, ExtVT));
+ RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+ RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+ // a += a
+ ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+ AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+ // r = VSELECT(r, shift(r, 1), a);
+ MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+ DAG.getConstant(1, dl, ExtVT));
+ MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+ DAG.getConstant(1, dl, ExtVT));
+ RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+ RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+ // Logical shift the result back to the lower byte, leaving a zero upper
+ // byte
+ // meaning that we can safely pack with PACKUSWB.
+ RLo =
+ DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
+ RHi =
+ DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ }
+ }
+
+ // It's worth extending once and using the v8i32 shifts for 16-bit types, but
+ // the extra overheads to get from v16i8 to v8i32 make the existing SSE
+ // solution better.
+ if (Subtarget.hasInt256() && VT == MVT::v8i16) {
+ MVT ExtVT = MVT::v8i32;
+ unsigned ExtOpc =
+ Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+ Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
+ }
+
+ if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
+ MVT ExtVT = MVT::v8i32;
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
+ SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
+ SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
+ SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
+ ALo = DAG.getBitcast(ExtVT, ALo);
+ AHi = DAG.getBitcast(ExtVT, AHi);
+ RLo = DAG.getBitcast(ExtVT, RLo);
+ RHi = DAG.getBitcast(ExtVT, RHi);
+ SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
+ SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
+ Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
+ Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+ }
+
+ if (VT == MVT::v8i16) {
+ unsigned ShiftOpcode = Op->getOpcode();
+
+ // If we have a constant shift amount, the non-SSE41 path is best as
+ // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
+ bool UseSSE41 = Subtarget.hasSSE41() &&
+ !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+
+ auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
+ // On SSE41 targets we make use of the fact that VSELECT lowers
+ // to PBLENDVB which selects bytes based just on the sign bit.
+ if (UseSSE41) {
+ MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
+ V0 = DAG.getBitcast(ExtVT, V0);
+ V1 = DAG.getBitcast(ExtVT, V1);
+ Sel = DAG.getBitcast(ExtVT, Sel);
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
+ }
+ // On pre-SSE41 targets we splat the sign bit - a negative value will
+ // set all bits of the lanes to true and VSELECT uses that in
+ // its OR(AND(V0,C),AND(V1,~C)) lowering.
+ SDValue C =
+ DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
+ return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
+ };
+
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
+ if (UseSSE41) {
+ // On SSE41 targets we need to replicate the shift mask in both
+ // bytes for PBLENDVB.
+ Amt = DAG.getNode(
+ ISD::OR, dl, VT,
+ DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
+ DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
+ } else {
+ Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
+ }
+
+ // r = VSELECT(r, shift(r, 8), a);
+ SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
+ R = SignBitSelect(Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // r = VSELECT(r, shift(r, 4), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
+ R = SignBitSelect(Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // r = VSELECT(r, shift(r, 2), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
+ R = SignBitSelect(Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // return VSELECT(r, shift(r, 1), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
+ R = SignBitSelect(Amt, M, R);
+ return R;
+ }
+
+ // Decompose 256-bit shifts into smaller 128-bit shifts.
+ if (VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
+
+ return SDValue();
+}
+
+/// Custom-lower a vector ISD::ROTL on XOP targets.
+///
+/// XOP has 128-bit vector rotates by variable and by immediate amounts, where
+/// a +ve/-ve amount rotates left/right. 256-bit rotates are handed to
+/// Lower256IntArith. A 128-bit rotate becomes X86ISD::VPROTI when the amount
+/// is a constant splat build vector and X86ISD::VPROT otherwise.
+static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  const MVT VecTy = Op.getSimpleValueType();
+
+  assert(VecTy.isVector() && "Custom lowering only for vector rotates!");
+  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
+  assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
+
+  // 256-bit integers are split before anything else is inspected.
+  if (VecTy.is256BitVector())
+    return Lower256IntArith(Op, DAG);
+
+  assert(VecTy.is128BitVector() && "Only rotate 128-bit vectors!");
+
+  SDLoc Loc(Op);
+  SDValue Src = Op.getOperand(0);
+  SDValue RotAmt = Op.getOperand(1);
+
+  // A build vector that splats one constant can use the immediate form.
+  auto *AmtBV = dyn_cast<BuildVectorSDNode>(RotAmt);
+  ConstantSDNode *Splat = AmtBV ? AmtBV->getConstantSplatNode() : nullptr;
+  if (Splat) {
+    uint64_t Imm = Splat->getAPIntValue().getZExtValue();
+    assert(Imm < VecTy.getScalarSizeInBits() && "Rotation out of range");
+    SDValue ImmV = DAG.getConstant(Imm, Loc, MVT::i8);
+    return DAG.getNode(X86ISD::VPROTI, Loc, VecTy, Src, ImmV);
+  }
+
+  // Otherwise fall back to the general per-element variable rotate.
+  return DAG.getNode(X86ISD::VPROT, Loc, VecTy, Src, RotAmt);
+}
+
+/// Lower the "add/sub/mul with overflow" nodes into a regular X86 arithmetic
+/// node that also produces EFLAGS, plus a SETCC that reads the relevant flag.
+/// The "brcond" lowering looks for this combo and may remove the SETCC if it
+/// has only one use.
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  SDValue A = Node->getOperand(0);
+  SDValue B = Node->getOperand(1);
+  SDLoc Loc(Op);
+
+  // Emits the arithmetic node, tests condition CC on result number FlagsIdx
+  // of that node, truncates the test to i1 when the overflow result is i1,
+  // and merges both values into the result list of the original node.
+  auto Emit = [&](unsigned Opc, SDVTList ResultTys, unsigned FlagsIdx,
+                  X86::CondCode CC) {
+    SDValue Arith = DAG.getNode(Opc, Loc, ResultTys, A, B);
+    SDValue Ovf = getSETCC(CC, SDValue(Arith.getNode(), FlagsIdx), Loc, DAG);
+    if (Node->getValueType(1) == MVT::i1)
+      Ovf = DAG.getNode(ISD::TRUNCATE, Loc, MVT::i1, Ovf);
+    return DAG.getNode(ISD::MERGE_VALUES, Loc, Node->getVTList(), Arith, Ovf);
+  };
+
+  unsigned ArithOpc = 0;
+  X86::CondCode OvfCC;
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Unknown ovf instruction!");
+  case ISD::SADDO:
+    // An add of one will be selected as an INC. INC doesn't set CF, so the
+    // same substitution is not done for UADDO.
+    ArithOpc = isOneConstant(B) ? X86ISD::INC : X86ISD::ADD;
+    OvfCC = X86::COND_O;
+    break;
+  case ISD::UADDO:
+    ArithOpc = X86ISD::ADD;
+    OvfCC = X86::COND_B;
+    break;
+  case ISD::SSUBO:
+    // A subtract of one will be selected as a DEC. DEC doesn't set CF, so the
+    // same substitution is not done for USUBO.
+    ArithOpc = isOneConstant(B) ? X86ISD::DEC : X86ISD::SUB;
+    OvfCC = X86::COND_O;
+    break;
+  case ISD::USUBO:
+    ArithOpc = X86ISD::SUB;
+    OvfCC = X86::COND_B;
+    break;
+  case ISD::SMULO:
+    ArithOpc = Node->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
+    OvfCC = X86::COND_O;
+    break;
+  case ISD::UMULO:
+    if (Node->getValueType(0) == MVT::i8) {
+      ArithOpc = X86ISD::UMUL8;
+      OvfCC = X86::COND_O;
+      break;
+    }
+    // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+    // The wide unsigned multiply has two value results, so its flags are
+    // result number 2 rather than 1.
+    return Emit(X86ISD::UMUL,
+                DAG.getVTList(Node->getValueType(0), Node->getValueType(0),
+                              MVT::i32),
+                /*FlagsIdx=*/2, X86::COND_O);
+  }
+
+  // The second result (i32) of the arithmetic node is EFLAGS.
+  return Emit(ArithOpc, DAG.getVTList(Node->getValueType(0), MVT::i32),
+              /*FlagsIdx=*/1, OvfCC);
+}
+
+/// Decides whether an atomic access of type \p MemType should be expanded to
+/// use cmpxchg8b/cmpxchg16b (otherwise such operations are left alone to
+/// become __sync_fetch_and_... calls).
+///
+/// A 64-bit type qualifies when the subtarget is not 64-bit; a 128-bit type
+/// qualifies when the subtarget reports cmpxchg16b. Every other width yields
+/// false.
+bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
+  switch (MemType->getPrimitiveSizeInBits()) {
+  case 64:
+    // FIXME this should be Subtarget.hasCmpxchg8b
+    return !Subtarget.is64Bit();
+  case 128:
+    return Subtarget.hasCmpxchg16b();
+  default:
+    return false;
+  }
+}
+
+/// An atomic store is expanded in IR exactly when the type of the stored
+/// value satisfies needsCmpXchgNb.
+bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  Type *StoredTy = SI->getValueOperand()->getType();
+  return needsCmpXchgNb(StoredTy);
+}
+
+// Atomic loads whose pointee type satisfies needsCmpXchgNb are expanded via
+// cmpxchg (i.e. large loads turn into lock cmpxchg8b/16b); all other loads are
+// left untouched.
+// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  Type *PtrTy = LI->getPointerOperand()->getType();
+  Type *LoadedTy = cast<PointerType>(PtrTy)->getElementType();
+  if (needsCmpXchgNb(LoadedTy))
+    return AtomicExpansionKind::CmpXChg;
+  return AtomicExpansionKind::None;
+}
+
+/// Chooses how AtomicExpand should treat an atomicrmw instruction.
+///
+/// Accesses wider than the native width (64 bits on 64-bit subtargets, 32
+/// otherwise) use a cmpxchg loop only when needsCmpXchgNb accepts the type.
+/// For native-width accesses the decision depends on the operation.
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  Type *MemType = AI->getType();
+  const unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+
+  // If the operand is too big, we must see if cmpxchg8/16b is available and
+  // default to library calls otherwise.
+  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
+    if (needsCmpXchgNb(MemType))
+      return AtomicExpansionKind::CmpXChg;
+    return AtomicExpansionKind::None;
+  }
+
+  switch (AI->getOperation()) {
+  case AtomicRMWInst::Xchg:
+  case AtomicRMWInst::Add:
+  case AtomicRMWInst::Sub:
+    // Never expanded: it's better to use xadd, xsub or xchg for these in all
+    // cases.
+    return AtomicExpansionKind::None;
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::And:
+  case AtomicRMWInst::Xor:
+    // When the result is unused, a "lock" prefix on the normal instruction is
+    // enough; a used result needs the cmpxchg loop.
+    if (AI->use_empty())
+      return AtomicExpansionKind::None;
+    return AtomicExpansionKind::CmpXChg;
+  case AtomicRMWInst::Nand:
+  case AtomicRMWInst::Max:
+  case AtomicRMWInst::Min:
+  case AtomicRMWInst::UMax:
+  case AtomicRMWInst::UMin:
+    // These always require a non-trivial set of data operations on x86, so a
+    // cmpxchg loop is mandatory.
+    return AtomicExpansionKind::CmpXChg;
+  default:
+    llvm_unreachable("Unknown atomic operation");
+  }
+}
+
+/// Replace the atomic RMW \p AI with an mfence followed by an atomic load of
+/// the same address. This function does not itself check that \p AI is
+/// idempotent; that is presumably established by the caller.
+///
+/// \returns the new load, after all uses of \p AI have been redirected to it
+/// and \p AI has been erased; or nullptr (leaving \p AI untouched) when the
+/// access is wider than the native width, the synchronization scope is
+/// SingleThread, or the subtarget has no mfence.
+LoadInst *
+X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+  Type *MemType = AI->getType();
+  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
+  // there is no benefit in turning such RMWs into loads, and it is actually
+  // harmful as it introduces a mfence.
+  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
+    return nullptr;
+
+  IRBuilder<> Builder(AI);
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  auto SynchScope = AI->getSynchScope();
+  // We must restrict the ordering to avoid generating loads with Release or
+  // AcquireRelease orderings.
+  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+  auto Ptr = AI->getPointerOperand();
+
+  // Before the load we need a fence. Here is an example lifted from
+  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
+  // is required:
+  // Thread 0:
+  //   x.store(1, relaxed);
+  //   r1 = y.fetch_add(0, release);
+  // Thread 1:
+  //   y.fetch_add(42, acquire);
+  //   r2 = x.load(relaxed);
+  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
+  // lowered to just a load without a fence. A mfence flushes the store buffer,
+  // making the optimization clearly correct.
+  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
+  // otherwise, we might be able to be more aggressive on relaxed idempotent
+  // rmw. In practice, they do not look useful, so we don't try to be
+  // especially clever.
+  if (SynchScope == SingleThread)
+    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
+    // the IR level, so we must wrap it in an intrinsic.
+    return nullptr;
+
+  if (!Subtarget.hasMFence())
+    // FIXME: it might make sense to use a locked operation here but on a
+    // different cache-line to prevent cache-line bouncing. In practice it
+    // is probably a small win, and x86 processors without mfence are rare
+    // enough that we do not bother.
+    return nullptr;
+
+  Function *MFence =
+      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
+  Builder.CreateCall(MFence, {});
+
+  // Finally we can emit the atomic load. CreateAlignedLoad takes its
+  // alignment in bytes, so convert the bit width of the accessed type;
+  // passing the bit count directly would claim an alignment eight times
+  // larger than the natural alignment of the access.
+  unsigned AlignInBytes = MemType->getPrimitiveSizeInBits() / 8;
+  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AlignInBytes);
+  Loaded->setAtomic(Order, SynchScope);
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+  return Loaded;
+}
+
+/// Lower an atomic fence node.
+///
+/// Only a sequentially-consistent, cross-thread fence needs an instruction:
+/// X86ISD::MFENCE when the subtarget has mfence, otherwise an OR32mrLocked of
+/// zero into the memory addressed by ESP. Every other fence becomes
+/// X86ISD::MEMBARRIER.
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
+                                 SelectionDAG &DAG) {
+  SDLoc Loc(Op);
+  const auto Ordering = static_cast<AtomicOrdering>(
+      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+  const auto Scope = static_cast<SynchronizationScope>(
+      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+  if (Ordering != AtomicOrdering::SequentiallyConsistent ||
+      Scope != CrossThread)
+    return DAG.getNode(X86ISD::MEMBARRIER, Loc, MVT::Other, Op.getOperand(0));
+
+  if (Subtarget.hasMFence())
+    return DAG.getNode(X86ISD::MFENCE, Loc, MVT::Other, Op.getOperand(0));
+
+  // No mfence available: build the locked OR by hand. The five address
+  // operands come first, followed by the zero source operand and the chain.
+  SDValue InChain = Op.getOperand(0);
+  SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
+  SDValue Base = DAG.getRegister(X86::ESP, MVT::i32);
+  SDValue Scale = DAG.getTargetConstant(1, Loc, MVT::i8);
+  SDValue Index = DAG.getRegister(0, MVT::i32);
+  SDValue Disp = DAG.getTargetConstant(0, Loc, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  SDValue MemOps[] = {Base, Scale, Index, Disp, Segment, Zero, InChain};
+
+  SDNode *LockedOr =
+      DAG.getMachineNode(X86::OR32mrLocked, Loc, MVT::Other, MemOps);
+  return SDValue(LockedOr, 0);
+}
+
+/// Lower an atomic compare-and-swap node (the node is cast to AtomicSDNode
+/// below) into X86ISD::LCMPXCHG_DAG.
+///
+/// The three results of \p Op are rewired in place via
+/// ReplaceAllUsesOfValueWith, so the function always returns an empty SDValue.
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT T = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ unsigned Reg = 0;
+ unsigned size = 0;
+ // Pick the accumulator register matching the value type, and the operand
+ // size in bytes that is passed to the LCMPXCHG_DAG node.
+ switch(T.SimpleTy) {
+ default: llvm_unreachable("Invalid value type!");
+ case MVT::i8: Reg = X86::AL; size = 1; break;
+ case MVT::i16: Reg = X86::AX; size = 2; break;
+ case MVT::i32: Reg = X86::EAX; size = 4; break;
+ case MVT::i64:
+ assert(Subtarget.is64Bit() && "Node not type legal!");
+ Reg = X86::RAX; size = 8;
+ break;
+ }
+ // Copy operand 2 into the accumulator register (presumably the expected
+ // value - confirm against the node's operand layout). The copy also
+ // produces glue that ties it to the cmpxchg node.
+ SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
+ Op.getOperand(2), SDValue());
+ // Operands: chain from the copy, operand 1, operand 3, size in bytes, glue.
+ SDValue Ops[] = { cpIn.getValue(0),
+ Op.getOperand(1),
+ Op.getOperand(3),
+ DAG.getTargetConstant(size, DL, MVT::i8),
+ cpIn.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
+ Ops, T, MMO);
+
+ // Read the accumulator register back, then EFLAGS, chaining and gluing each
+ // copy to the previous node.
+ SDValue cpOut =
+ DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
+ MVT::i32, cpOut.getValue(2));
+ // The success result is the COND_E test on the EFLAGS copy.
+ SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
+
+ // Result 0 -> register copy, result 1 -> success flag, result 2 -> the
+ // chain of the EFLAGS copy.
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
+ return SDValue();
+}
+
+/// Custom lowering for BITCAST.
+///
+/// For a v2i32, v4i16, v8i8 or i64 source, only a bitcast to f64 is handled:
+/// the source elements are placed in a build vector of twice the element
+/// count (upper half undef), which is bitcast to v2f64 and whose element 0 is
+/// extracted. Any other destination returns an empty SDValue.
+///
+/// For all remaining source types the node is returned unchanged when it is
+/// an i64 <=> vector or vector <=> vector conversion, and an empty SDValue is
+/// returned otherwise.
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
+
+ if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
+ SrcVT == MVT::i64) {
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ if (DstVT != MVT::f64)
+ // This conversion needs to be expanded.
+ return SDValue();
+
+ SDValue Op0 = Op->getOperand(0);
+ SmallVector<SDValue, 16> Elts;
+ SDLoc dl(Op);
+ unsigned NumElts;
+ MVT SVT;
+ if (SrcVT.isVector()) {
+ NumElts = SrcVT.getVectorNumElements();
+ SVT = SrcVT.getVectorElementType();
+
+ // Widen the vector in input in the case of MVT::v2i32.
+ // Example: from MVT::v2i32 to MVT::v4i32.
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
+ DAG.getIntPtrConstant(i, dl)));
+ } else {
+ assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
+ "Unexpected source type in LowerBITCAST");
+ // Split the i64 into its two i32 parts (EXTRACT_ELEMENT 0 and 1).
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+ DAG.getIntPtrConstant(0, dl)));
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+ DAG.getIntPtrConstant(1, dl)));
+ NumElts = 2;
+ SVT = MVT::i32;
+ }
+ // Explicitly mark the extra elements as Undef.
+ Elts.append(NumElts, DAG.getUNDEF(SVT));
+
+ // Build the double-length vector, reinterpret it as v2f64 and take lane 0.
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+ SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
+ SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
+ Subtarget.hasMMX() && "Unexpected custom BITCAST");
+ assert((DstVT == MVT::i64 ||
+ (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
+ "Unexpected custom BITCAST");
+ // i64 <=> MMX conversions are Legal.
+ if (SrcVT==MVT::i64 && DstVT.isVector())
+ return Op;
+ if (DstVT==MVT::i64 && SrcVT.isVector())
+ return Op;
+ // MMX <=> MMX conversions are Legal.
+ if (SrcVT.isVector() && DstVT.isVector())
+ return Op;
+ // All other conversions need to be expanded.
+ return SDValue();
+}
+
+/// Compute the horizontal sum of bytes in V for the elements of VT.
+///
+/// Requires V to be a byte vector and VT to be an integer vector type with
+/// wider elements than V's type. The width of the elements of VT determines
+/// how many bytes of V are summed horizontally to produce each element of the
+/// result.
+///
+/// \param V  byte vector to sum; must have the same total bit width as \p VT.
+/// \param VT result vector type; its element type must be i64, i32 or i16.
+/// \returns a value of type \p VT holding the per-element byte sums.
+static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(V);
+ MVT ByteVecVT = V.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
+ "Expected value to have byte element type.");
+ assert(EltVT != MVT::i8 &&
+ "Horizontal byte sum only makes sense for wider elements!");
+ unsigned VecSize = VT.getSizeInBits();
+ assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
+
+ // The PSADBW instruction horizontally adds all bytes and leaves the result
+ // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
+ if (EltVT == MVT::i64) {
+ SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+ MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+ V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
+ return DAG.getBitcast(VT, V);
+ }
+
+ if (EltVT == MVT::i32) {
+ // We unpack the low half and high half into i32s interleaved with zeros so
+ // that we can use PSADBW to horizontally sum them. The most useful part of
+ // this is that it lines up the results of two PSADBW instructions to be
+ // two v2i64 vectors which concatenated are the 4 population counts. We can
+ // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
+ SDValue V32 = DAG.getBitcast(VT, V);
+ SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
+ SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
+
+ // Do the horizontal sums into two v2i64s.
+ Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+ MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+ Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
+ DAG.getBitcast(ByteVecVT, Low), Zeros);
+ High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
+ DAG.getBitcast(ByteVecVT, High), Zeros);
+
+ // Merge them together.
+ MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
+ V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
+ DAG.getBitcast(ShortVecVT, Low),
+ DAG.getBitcast(ShortVecVT, High));
+
+ return DAG.getBitcast(VT, V);
+ }
+
+ // The only element type left is i16.
+ assert(EltVT == MVT::i16 && "Unknown how to handle type");
+
+ // To obtain pop count for each i16 element starting from the pop count for
+ // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
+ // right by 8. It is important to shift as i16s as i8 vector shift isn't
+ // directly supported.
+ SDValue ShifterV = DAG.getConstant(8, DL, VT);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
+ V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
+ DAG.getBitcast(ByteVecVT, V));
+ return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
+}
+
+/// Compute the per-element population count of vector \p Op with two PSHUFB
+/// lookups into a 16-entry nibble pop count table.
+///
+/// Returns the byte-wise counts directly when the element type is i8, and
+/// otherwise the result of LowerHorizontalByteSum over those byte counts.
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned VecSize = VT.getSizeInBits();
+
+ // Implement a lookup table in register by using an algorithm based on:
+ // http://wm.ite.pl/articles/sse-popcount.html
+ //
+ // The general idea is that every lower byte nibble in the input vector is an
+ // index into an in-register pre-computed pop count table. We then split up
+ // the input vector in two new ones: (1) a vector with only the shifted-right
+ // higher nibbles for each byte and (2) a vector with the lower nibbles (and
+ // masked out higher ones) for each byte. PSHUFB is used separately with both
+ // to index the in-register table. Next, both are added and the result is a
+ // i8 vector where each element contains the pop count for input byte.
+ //
+ // To obtain the pop count for elements != i8, we follow up with the same
+ // approach and use additional tricks as described below.
+ //
+ const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
+ /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
+ /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
+
+ int NumByteElts = VecSize / 8;
+ MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
+ SDValue In = DAG.getBitcast(ByteVecVT, Op);
+ // Repeat the 16-entry table across every 16 bytes of the vector.
+ SmallVector<SDValue, 64> LUTVec;
+ for (int i = 0; i < NumByteElts; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+ SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
+ SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
+
+ // High nibbles
+ SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
+ SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+
+ // Low nibbles
+ SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+
+ // The input vector is used as the shuffle mask that index elements into the
+ // LUT. After counting low and high nibbles, add the vector to obtain the
+ // final pop count per i8 element.
+ SDValue HighPopCnt =
+ DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
+ SDValue LowPopCnt =
+ DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
+ SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
+
+ if (EltVT == MVT::i8)
+ return PopCnt;
+
+ return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+}
+
+/// Compute the per-element population count of the 128-bit vector \p Op using
+/// only shifts, masks, adds and subtracts.
+///
+/// Returns the byte-wise counts directly when the element type is i8, and
+/// otherwise the result of LowerHorizontalByteSum over those byte counts.
+static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ assert(VT.is128BitVector() &&
+ "Only 128-bit vector bitmath lowering supported.");
+
+ int VecSize = VT.getSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+ int Len = EltVT.getSizeInBits();
+
+ // This is the vectorized version of the "best" algorithm from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ // with a minor tweak to use a series of adds + shifts instead of vector
+ // multiplications. Implemented for all integer vector types. We only use
+ // this when we don't have SSSE3 which allows a LUT-based lowering that is
+ // much faster, even faster than using native popcnt instructions.
+
+ // Shift every element of V by the splat constant Shifter using OpCode.
+ auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
+ MVT VT = V.getSimpleValueType();
+ SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
+ return DAG.getNode(OpCode, DL, VT, V, ShifterV);
+ };
+ // AND every element of V with the splat constant Mask.
+ auto GetMask = [&](SDValue V, APInt Mask) {
+ MVT VT = V.getSimpleValueType();
+ SDValue MaskV = DAG.getConstant(Mask, DL, VT);
+ return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
+ };
+
+ // We don't want to incur the implicit masks required to SRL vNi8 vectors on
+ // x86, so set the SRL type to have elements at least i16 wide. This is
+ // correct because all of our SRLs are followed immediately by a mask anyways
+ // that handles any bits that sneak into the high bits of the byte elements.
+ MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
+
+ SDValue V = Op;
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ SDValue Srl =
+ DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
+ SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
+ V = DAG.getNode(ISD::SUB, DL, VT, V, And);
+
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
+ Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
+ SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
+ V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
+
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
+ V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
+
+ // At this point, V contains the byte-wise population count, and we are
+ // merely doing a horizontal sum if necessary to get the wider element
+ // counts.
+ if (EltVT == MVT::i8)
+ return V;
+
+ return LowerHorizontalByteSum(
+ DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
+ DAG);
+}
+
+// Dispatches a vector CTPOP to the bitmath lowering (no SSSE3) or to the
+// in-register LUT lowering, splitting the input into halves first when the
+// subtarget lacks the feature checked for that width.
+//
+// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
+// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
+static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+                                SelectionDAG &DAG) {
+  const MVT VecTy = Op.getSimpleValueType();
+  assert((VecTy.is512BitVector() || VecTy.is256BitVector() ||
+          VecTy.is128BitVector()) &&
+         "Unknown CTPOP type to handle");
+  SDLoc Loc(Op.getNode());
+  SDValue Src = Op.getOperand(0);
+
+  // We can't use the fast LUT approach without SSSE3, so fall back on
+  // vectorized bitmath.
+  if (!Subtarget.hasSSSE3()) {
+    assert(VecTy.is128BitVector() && "Only 128-bit vectors supported in SSE!");
+    return LowerVectorCTPOPBitmath(Src, Loc, Subtarget, DAG);
+  }
+
+  // 256-bit vectors are split into 128-bit halves without Int256; 512-bit
+  // vectors are split into 256-bit halves without BWI.
+  const bool SplitInto128 = VecTy.is256BitVector() && !Subtarget.hasInt256();
+  const bool SplitInto256 = VecTy.is512BitVector() && !Subtarget.hasBWI();
+  if (!SplitInto128 && !SplitInto256)
+    return LowerVectorCTPOPInRegLUT(Src, Loc, Subtarget, DAG);
+
+  // Extract each half, compute its pop count and concat the results.
+  const unsigned HalfIdx = VecTy.getVectorNumElements() / 2;
+  SDValue LoHalf = SplitInto128 ? extract128BitVector(Src, 0, DAG, Loc)
+                                : extract256BitVector(Src, 0, DAG, Loc);
+  SDValue HiHalf = SplitInto128 ? extract128BitVector(Src, HalfIdx, DAG, Loc)
+                                : extract256BitVector(Src, HalfIdx, DAG, Loc);
+  SDValue LoCnt = LowerVectorCTPOPInRegLUT(LoHalf, Loc, Subtarget, DAG);
+  SDValue HiCnt = LowerVectorCTPOPInRegLUT(HiHalf, Loc, Subtarget, DAG);
+  return DAG.getNode(ISD::CONCAT_VECTORS, Loc, VecTy, LoCnt, HiCnt);
+}
+
+/// Entry point for custom CTPOP lowering; only vector types are expected and
+/// they are forwarded to LowerVectorCTPOP.
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+                          SelectionDAG &DAG) {
+  const bool IsVector = Op.getSimpleValueType().isVector();
+  assert(IsVector &&
+         "We only do custom lowering for vector population count.");
+  (void)IsVector;
+  return LowerVectorCTPOP(Op, Subtarget, DAG);
+}
+
+/// Lower BITREVERSE using X86ISD::VPPERM.
+///
+/// Scalars are moved into a 128-bit vector, reversed there and extracted
+/// again. 256-bit vectors are split into two 128-bit BITREVERSE nodes that are
+/// concatenated. 128-bit vectors use a single VPPERM whose mask both reverses
+/// the bits of every byte and reverses the byte order within each element.
+static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ // For scalars, it's still beneficial to transfer to/from the SIMD unit to
+ // perform the BITREVERSE.
+ if (!VT.isVector()) {
+ MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
+ Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ MVT SVT = VT.getVectorElementType();
+ int NumElts = VT.getVectorNumElements();
+ int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector()) {
+ SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
+
+ MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
+ DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
+ }
+
+ assert(VT.is128BitVector() &&
+ "Only 128-bit vector bitreverse lowering supported.");
+
+ // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
+ // perform the BSWAP in the shuffle.
+ // It's best to shuffle using the second operand as this will implicitly
+ // allow memory folding for multiple vectors.
+ SmallVector<SDValue, 16> MaskElts;
+ for (int i = 0; i != NumElts; ++i) {
+ // Walk the bytes of element i from last to first so the byte order inside
+ // the element is swapped.
+ for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
+ // Byte indices start at 16 to select from the second source operand.
+ int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
+ int PermuteByte = SourceByte | (2 << 5);
+ MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
+ }
+ }
+
+ SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
+ SDValue Res = DAG.getBitcast(MVT::v16i8, In);
+ Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
+ Res, Mask);
+ return DAG.getBitcast(VT, Res);
+}
+
+/// Lower BITREVERSE.
+///
+/// XOP subtargets are delegated to LowerBITREVERSE_XOP. Otherwise SSSE3 is
+/// required and only byte vectors are handled: 256-bit vectors are split into
+/// two 128-bit BITREVERSE nodes on subtargets without Int256, and the
+/// remaining cases use two PSHUFB nibble lookups whose results are ORed.
+static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ if (Subtarget.hasXOP())
+ return LowerBITREVERSE_XOP(Op, DAG);
+
+ assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
+
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(VT.getScalarType() == MVT::i8 &&
+ "Only byte vector BITREVERSE supported");
+
+ // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
+ SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
+ Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
+ Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
+ // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
+ // two nibbles and a PSHUFB lookup to find the bitreverse of each
+ // 0-15 value (moved to the other nibble).
+ SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
+ SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
+
+ // LoLUT maps a low nibble to its bit-reversed value placed in the high
+ // nibble of the result byte.
+ const int LoLUT[16] = {
+ /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
+ /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
+ /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
+ /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
+ // HiLUT maps a high nibble to its bit-reversed value placed in the low
+ // nibble of the result byte.
+ const int HiLUT[16] = {
+ /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
+ /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
+ /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
+ /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
+
+ // Repeat both 16-entry tables across every 16 bytes of the vector.
+ SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
+ HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
+ }
+
+ SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
+ SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
+ Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
+ Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
+ return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
+}
+
+ /// Rebuild an ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node as the corresponding
+ /// X86ISD::L{ADD,SUB,OR,XOR,AND} memory-intrinsic node.
+ ///
+ /// The new node takes the first three operands of \p N, reuses its memory
+ /// operand, and produces (i32, chain). Any other opcode is unreachable.
+ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
+   const auto LockedOpcodeFor = [](unsigned AtomicOpc) -> unsigned {
+     switch (AtomicOpc) {
+     case ISD::ATOMIC_LOAD_ADD:
+       return X86ISD::LADD;
+     case ISD::ATOMIC_LOAD_SUB:
+       return X86ISD::LSUB;
+     case ISD::ATOMIC_LOAD_OR:
+       return X86ISD::LOR;
+     case ISD::ATOMIC_LOAD_XOR:
+       return X86ISD::LXOR;
+     case ISD::ATOMIC_LOAD_AND:
+       return X86ISD::LAND;
+     default:
+       llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
+     }
+   };
+
+   const unsigned LockedOpc = LockedOpcodeFor(N->getOpcode());
+   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
+   SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
+   return DAG.getMemIntrinsicNode(
+       LockedOpc, SDLoc(N), ResultTys,
+       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
+       /*MemVT=*/N->getSimpleValueType(0), MemOp);
+ }
+
+ /// Lower atomic_load_ops into LOCK-prefixed operations.
+ ///
+ /// If the loaded value (result 0) has no users, the node is re-created through
+ /// lowerAtomicArithWithLOCK, the chain users are redirected to the new node and
+ /// an empty SDValue is returned. If the value is used, atomic_load_sub is
+ /// rewritten as atomic_load_add of the negated operand and atomic_load_add is
+ /// returned unchanged; other opcodes are asserted not to reach this point.
+ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+   if (!N->hasAnyUseOfValue(0)) {
+     SDValue Locked = lowerAtomicArithWithLOCK(N, DAG);
+     // RAUW the chain, but don't worry about the result, as it's unused.
+     assert(!N->hasAnyUseOfValue(0));
+     DAG.ReplaceAllUsesOfValueWith(N.getValue(1), Locked.getValue(1));
+     return SDValue();
+   }
+
+   // The result is used. We can lower atomic_load_add into LXADD; any other
+   // atomicrmw op with a used result should already have been turned into a
+   // cmpxchg loop in AtomicExpand.
+   const unsigned Opc = N->getOpcode();
+   if (Opc != ISD::ATOMIC_LOAD_SUB) {
+     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
+            "Used AtomicRMW ops other than Add should have been expanded!");
+     return N;
+   }
+
+   // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
+   // select LXADD if LOCK_SUB can't be selected.
+   SDLoc DL(N);
+   MVT VT = N->getSimpleValueType(0);
+   SDValue Chain = N->getOperand(0);
+   SDValue Ptr = N->getOperand(1);
+   AtomicSDNode *Atomic = cast<AtomicSDNode>(N.getNode());
+   SDValue Negated = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+                                 N->getOperand(2));
+   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, Ptr, Negated,
+                        Atomic->getMemOperand());
+ }
+
+ /// Lower ISD::ATOMIC_STORE.
+ ///
+ /// A sequentially consistent store, or a store whose memory type is not
+ /// legal, is rebuilt as an ATOMIC_SWAP and the swap's chain (result 1) is
+ /// returned. Every other atomic store is returned unchanged.
+ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+   AtomicSDNode *Store = cast<AtomicSDNode>(Op.getNode());
+   EVT MemVT = Store->getMemoryVT();
+
+   // Convert seq_cst store -> xchg
+   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+   // FIXME: On 32-bit, store -> fist or movq would be more efficient
+   //        (The only way to get a 16-byte store is cmpxchg16b)
+   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+   const bool NeedsSwap =
+       Store->getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+       !DAG.getTargetLoweringInfo().isTypeLegal(MemVT);
+
+   // Other atomic stores have a simple pattern.
+   if (!NeedsSwap)
+     return Op;
+
+   SDLoc DL(Op);
+   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, DL, MemVT,
+                                Store->getOperand(0), Store->getOperand(1),
+                                Store->getOperand(2), Store->getMemOperand());
+   return Swap.getValue(1);
+ }
+
+ /// Lower ISD::ADDC/ADDE/SUBC/SUBE to X86ISD::ADD/ADC/SUB/SBB.
+ ///
+ /// The new node produces (VT, i32). ADDE and SUBE forward their third operand
+ /// as well. Returns an empty SDValue when the result type is not legal yet.
+ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+   MVT VT = Op.getNode()->getSimpleValueType(0);
+
+   // Let legalize expand this if it isn't a legal type yet.
+   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+     return SDValue();
+
+   SDVTList ResultTys = DAG.getVTList(VT, MVT::i32);
+
+   unsigned X86Opc;
+   bool TakesThirdOperand;
+   switch (Op.getOpcode()) {
+   case ISD::ADDC:
+     X86Opc = X86ISD::ADD;
+     TakesThirdOperand = false;
+     break;
+   case ISD::ADDE:
+     X86Opc = X86ISD::ADC;
+     TakesThirdOperand = true;
+     break;
+   case ISD::SUBC:
+     X86Opc = X86ISD::SUB;
+     TakesThirdOperand = false;
+     break;
+   case ISD::SUBE:
+     X86Opc = X86ISD::SBB;
+     TakesThirdOperand = true;
+     break;
+   default:
+     llvm_unreachable("Invalid code");
+   }
+
+   if (TakesThirdOperand)
+     return DAG.getNode(X86Opc, SDLoc(Op), ResultTys, Op.getOperand(0),
+                        Op.getOperand(1), Op.getOperand(2));
+   return DAG.getNode(X86Opc, SDLoc(Op), ResultTys, Op.getOperand(0),
+                      Op.getOperand(1));
+ }
+
+ /// Lower ISD::FSINCOS to a call to __sincos_stret (f64 argument) or
+ /// __sincosf_stret (any other argument type).
+ ///
+ /// Only 64-bit Darwin is supported (asserted). For f64 the call result is
+ /// returned directly; otherwise the call is typed as returning a 4-element
+ /// vector and elements 0 and 1 are extracted and merged into a two-value
+ /// result.
+ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
+                             SelectionDAG &DAG) {
+   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
+
+   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
+   // which returns the values as { float, float } (in XMM0) or
+   // { double, double } (which is returned in XMM0, XMM1).
+   SDLoc dl(Op);
+   SDValue Arg = Op.getOperand(0);
+   EVT ArgVT = Arg.getValueType();
+   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+   // The single call argument, passed without sign or zero extension.
+   TargetLowering::ArgListEntry ArgEntry;
+   ArgEntry.Node = Arg;
+   ArgEntry.Ty = ArgTy;
+   ArgEntry.isSExt = false;
+   ArgEntry.isZExt = false;
+   TargetLowering::ArgListTy CallArgs;
+   CallArgs.push_back(ArgEntry);
+
+   const bool IsDouble = ArgVT == MVT::f64;
+   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
+   // the small struct {f32, f32} is returned in (eax, edx). For f64,
+   // the results are returned via SRet in memory.
+   const char *LibcallName = IsDouble ? "__sincos_stret" : "__sincosf_stret";
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+   SDValue Callee =
+       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
+
+   Type *RetTy;
+   if (IsDouble)
+     RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+   else
+     RetTy = VectorType::get(ArgTy, 4);
+
+   TargetLowering::CallLoweringInfo CLI(DAG);
+   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+       .setCallee(CallingConv::C, RetTy, Callee, std::move(CallArgs));
+
+   SDValue CallValue = TLI.LowerCallTo(CLI).first;
+
+   // Returned in xmm0 and xmm1.
+   if (IsDouble)
+     return CallValue;
+
+   // Returned in bits 0:31 and 32:63 of xmm0.
+   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallValue,
+                                DAG.getIntPtrConstant(0, dl));
+   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallValue,
+                                DAG.getIntPtrConstant(1, dl));
+   return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ArgVT, ArgVT),
+                      SinVal, CosVal);
+ }
+
+ /// Widen a vector input to a vector of NVT. The
+ /// input vector must have the same element type as NVT.
+ ///
+ /// \param InOp           vector to widen; returned as is when it already has
+ ///                       type NVT, and mapped to UNDEF of NVT when undef.
+ /// \param NVT            wider result type; its element count must be a
+ ///                       larger multiple of the input's (asserted).
+ /// \param FillWithZeroes pad the new elements with zero instead of undef.
+ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
+                             bool FillWithZeroes = false) {
+   // Check if InOp already has the right width.
+   MVT SrcVT = InOp.getSimpleValueType();
+   if (SrcVT == NVT)
+     return InOp;
+
+   if (InOp.isUndef())
+     return DAG.getUNDEF(NVT);
+
+   assert(SrcVT.getVectorElementType() == NVT.getVectorElementType() &&
+          "input and widen element type must match");
+
+   unsigned SrcElts = SrcVT.getVectorNumElements();
+   const unsigned DstElts = NVT.getVectorNumElements();
+   assert(DstElts > SrcElts && DstElts % SrcElts == 0 &&
+          "Unexpected request for vector widening");
+
+   EVT EltVT = NVT.getVectorElementType();
+
+   // The debug location is taken from the incoming value, before it may be
+   // replaced by its first concat operand below.
+   SDLoc dl(InOp);
+
+   // A two-operand concat whose upper half is undef (or all zeroes, when
+   // zero-filling was requested) is equivalent to widening its lower half.
+   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
+       InOp.getNumOperands() == 2) {
+     SDValue UpperHalf = InOp.getOperand(1);
+     if ((ISD::isBuildVectorAllZeros(UpperHalf.getNode()) && FillWithZeroes) ||
+         UpperHalf.isUndef()) {
+       InOp = InOp.getOperand(0);
+       SrcVT = InOp.getSimpleValueType();
+       SrcElts = SrcVT.getVectorNumElements();
+     }
+   }
+
+   // Constant build vectors are rebuilt directly with the padding appended.
+   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
+       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
+     SmallVector<SDValue, 16> Elts;
+     for (unsigned Idx = 0; Idx != SrcElts; ++Idx)
+       Elts.push_back(InOp.getOperand(Idx));
+
+     SDValue PadElt = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+                                       DAG.getUNDEF(EltVT);
+     Elts.append(DstElts - SrcElts, PadElt);
+     return DAG.getBuildVector(NVT, dl, Elts);
+   }
+
+   // Otherwise insert the input at index 0 of a zero or undef wide vector.
+   SDValue Background = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
+                                         DAG.getUNDEF(NVT);
+   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, Background, InOp,
+                      DAG.getIntPtrConstant(0, dl));
+ }
+
+ /// Custom-lower ISD::MSCATTER (AVX-512 is asserted).
+ ///
+ /// The scatter is rebuilt with two results, (i1 mask vector, chain), because
+ /// the X86 scatter kills its mask register; a node that already has two
+ /// results is returned untouched. Before rebuilding, a v2i32 value that was
+ /// promoted to v2i64 is re-widened to v4i32, and without VLX operands that are
+ /// not 512 bits wide are widened (or, for a v8i32 index, only the index is
+ /// sign-extended). All uses of the old node are redirected to the chain of
+ /// the new one, which is also the returned value.
+ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
+                              SelectionDAG &DAG) {
+   assert(Subtarget.hasAVX512() &&
+          "MGATHER/MSCATTER are supported on AVX-512 arch only");
+
+   // X86 scatter kills mask register, so its type should be added to
+   // the list of return values.
+   // If the "scatter" has 2 return values, it is already handled.
+   if (Op.getNode()->getNumValues() == 2)
+     return Op;
+
+   MaskedScatterSDNode *Scatter = cast<MaskedScatterSDNode>(Op.getNode());
+   SDValue Src = Scatter->getValue();
+   MVT VT = Src.getSimpleValueType();
+   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
+   SDLoc dl(Op);
+
+   SDValue Index = Scatter->getIndex();
+   SDValue Mask = Scatter->getMask();
+   SDValue Chain = Scatter->getChain();
+   SDValue BasePtr = Scatter->getBasePtr();
+   MVT MemVT = Scatter->getMemoryVT().getSimpleVT();
+   // IndexVT and MaskVT keep describing the ORIGINAL operands, even after
+   // Index and Mask are rewritten below.
+   MVT IndexVT = Index.getSimpleValueType();
+   MVT MaskVT = Mask.getSimpleValueType();
+
+   if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
+     // The v2i32 value was promoted to v2i64.
+     // Now we "redo" the type legalizer's work and widen the original
+     // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
+     // with a shuffle.
+     assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
+            "Unexpected memory type");
+     int ShuffleMask[] = {0, 2, -1, -1};
+     Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
+                                DAG.getUNDEF(MVT::v4i32), ShuffleMask);
+
+     // Now we have 4 elements instead of 2: expand the index to match.
+     MVT WideIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
+     Index = ExtendToType(Index, WideIndexVT, DAG);
+
+     // Expand the mask with zeroes.
+     // Mask may be <2 x i64> or <2 x i1> at this moment.
+     assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
+            "Unexpected mask type");
+     MVT WideMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
+     Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+     VT = MVT::v4i32;
+   }
+
+   unsigned NumElts = VT.getVectorNumElements();
+   const bool NeedsAVX512FWidening =
+       !Subtarget.hasVLX() && !VT.is512BitVector() &&
+       !Index.getSimpleValueType().is512BitVector();
+   if (NeedsAVX512FWidening) {
+     // AVX512F supports only 512-bit vectors: either the data or the index
+     // should be 512 bits wide. If both index and data are 256-bit but the
+     // vector contains 8 elements, we just sign-extend the index.
+     if (IndexVT != MVT::v8i32) {
+       // The minimal number of elts in scatter is 8.
+       NumElts = 8;
+
+       // Index: use the original index here, do not modify the index twice.
+       MVT WideIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+       Index = ExtendToType(Scatter->getIndex(), WideIndexVT, DAG);
+       if (IndexVT.getScalarType() == MVT::i32)
+         Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+       // Mask: at this point we have a promoted mask operand. Use the original
+       // mask here, do not modify the mask twice.
+       assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+       MVT WideMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+       Mask = ExtendToType(Scatter->getMask(), WideMaskVT, DAG, true);
+
+       // The value that should be stored.
+       MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+       Src = ExtendToType(Src, WideDataVT, DAG);
+     } else {
+       // Just extend the index.
+       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+     }
+   }
+
+   // If the mask is "wide" at this point - truncate it to i1 vector.
+   MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+   Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
+
+   // The mask is killed by scatter, add it to the values.
+   SDVTList ResultTys = DAG.getVTList(BitMaskVT, MVT::Other);
+   SDValue ScatterOps[] = {Chain, Src, Mask, BasePtr, Index};
+   SDValue NewScatter = DAG.getMaskedScatter(
+       ResultTys, Scatter->getMemoryVT(), dl, ScatterOps,
+       Scatter->getMemOperand());
+   DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+   return SDValue(NewScatter.getNode(), 1);
+ }
+
+ /// Custom-lower ISD::MLOAD.
+ ///
+ /// Non-expanding loads of at most 4 elements are returned unchanged. Anything
+ /// else is expected to be an AVX-512 target without VLX and a vector narrower
+ /// than 512 bits (asserted): pass-through value and mask are widened to 512
+ /// bits worth of elements, a wide masked load is emitted, and the original
+ /// width is extracted again. Returns the (value, chain) pair as merged values.
+ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+   MaskedLoadSDNode *Load = cast<MaskedLoadSDNode>(Op.getNode());
+   SDLoc dl(Op);
+   MVT VT = Op.getSimpleValueType();
+   MVT EltVT = VT.getScalarType();
+   SDValue Mask = Load->getMask();
+
+   assert((!Load->isExpandingLoad() || Subtarget.hasAVX512()) &&
+          "Expanding masked load is supported on AVX-512 target only!");
+
+   assert((!Load->isExpandingLoad() || EltVT.getSizeInBits() >= 32) &&
+          "Expanding masked load is supported for 32 and 64-bit types only!");
+
+   // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
+   // VLX. These types for exp-loads are handled here.
+   if (!Load->isExpandingLoad() && VT.getVectorNumElements() <= 4)
+     return Op;
+
+   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+          "Cannot lower masked load op.");
+
+   assert((EltVT.getSizeInBits() >= 32 ||
+           (Subtarget.hasBWI() &&
+            (EltVT == MVT::i8 || EltVT == MVT::i16))) &&
+          "Unsupported masked load op.");
+
+   // This operation is legal for targets with VLX, but without
+   // VLX the vector should be widened to 512 bit.
+   const unsigned WideNumElts = 512 / VT.getScalarSizeInBits();
+   MVT WideDataVT = MVT::getVectorVT(EltVT, WideNumElts);
+   SDValue PassThru = ExtendToType(Load->getSrc0(), WideDataVT, DAG);
+
+   // Mask element has to be i1.
+   MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+          "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+
+   // Widen the mask with zeroes, then narrow its elements to i1 if they are
+   // not i1 already.
+   MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, WideNumElts);
+   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+   if (MaskEltTy != MVT::i1) {
+     MVT WideBitMaskVT = MVT::getVectorVT(MVT::i1, WideNumElts);
+     Mask = DAG.getNode(ISD::TRUNCATE, dl, WideBitMaskVT, Mask);
+   }
+
+   SDValue WideLoad = DAG.getMaskedLoad(WideDataVT, dl, Load->getChain(),
+                                        Load->getBasePtr(), Mask, PassThru,
+                                        Load->getMemoryVT(),
+                                        Load->getMemOperand(),
+                                        Load->getExtensionType(),
+                                        Load->isExpandingLoad());
+
+   // Hand back the low part that has the original type, plus the new chain.
+   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                 WideLoad.getValue(0),
+                                 DAG.getIntPtrConstant(0, dl));
+   SDValue Results[] = {Extract, WideLoad.getValue(1)};
+   return DAG.getMergeValues(Results, dl);
+ }
+
+ /// Custom-lower ISD::MSTORE.
+ ///
+ /// Non-compressing stores of at most 4 elements are returned unchanged.
+ /// Anything else is expected to be an AVX-512 target without VLX and a vector
+ /// narrower than 512 bits (asserted): the stored value and the mask are widened
+ /// to 512 bits worth of elements (the mask is padded with zeroes) and a wide
+ /// masked store is emitted in place of the original one.
+ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
+                            SelectionDAG &DAG) {
+   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
+   SDValue DataToStore = N->getValue();
+   MVT VT = DataToStore.getSimpleValueType();
+   MVT ScalarVT = VT.getScalarType();
+   SDValue Mask = N->getMask();
+   SDLoc dl(Op);
+
+   // These two checks are about compressing stores; the messages previously
+   // talked about expanding loads (copied from LowerMLOAD).
+   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
+          "Compressing masked store is supported on AVX-512 target only!");
+
+   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
+          "Compressing masked store is supported for 32 and 64-bit types only!");
+
+   // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of
+   // VLX.
+   if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
+     return Op;
+
+   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+          "Cannot lower masked store op.");
+
+   assert((ScalarVT.getSizeInBits() >= 32 ||
+           (Subtarget.hasBWI() &&
+            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+          "Unsupported masked store op.");
+
+   // This operation is legal for targets with VLX, but without
+   // VLX the vector should be widened to 512 bit
+   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
+   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
+
+   // Mask element has to be i1.
+   MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+          "We handle 4x32, 4x64 and 2x64 vectors only in this case");
+
+   MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+
+   // The extra data lanes may be undef; the extra mask lanes must be zero so
+   // that nothing is stored for them.
+   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+   if (MaskEltTy != MVT::i1)
+     Mask = DAG.getNode(ISD::TRUNCATE, dl,
+                        MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
+   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+                             Mask, N->getMemoryVT(), N->getMemOperand(),
+                             N->isTruncatingStore(), N->isCompressingStore());
+ }
+
+ /// Custom-lower ISD::MGATHER (AVX-512 is asserted).
+ ///
+ /// With VLX, or when the data or the index is already 512 bits wide, the node
+ /// is returned unchanged. Otherwise an 8-element gather only gets its index
+ /// sign-extended to v8i64 in place, and a narrower gather is rebuilt with
+ /// index, mask and pass-through value widened to 8 elements; the original
+ /// width is then extracted from the wide result and returned together with
+ /// the new chain.
+ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
+                             SelectionDAG &DAG) {
+   assert(Subtarget.hasAVX512() &&
+          "MGATHER/MSCATTER are supported on AVX-512 arch only");
+
+   MaskedGatherSDNode *Gather = cast<MaskedGatherSDNode>(Op.getNode());
+   SDLoc dl(Op);
+   MVT VT = Op.getSimpleValueType();
+   SDValue Index = Gather->getIndex();
+   SDValue Mask = Gather->getMask();
+   SDValue PassThru = Gather->getValue();
+   MVT IndexVT = Index.getSimpleValueType();
+   MVT MaskVT = Mask.getSimpleValueType();
+
+   unsigned NumElts = VT.getVectorNumElements();
+   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
+
+   // Nothing to do with VLX, or when data or index is already 512 bits wide.
+   if (Subtarget.hasVLX() || VT.is512BitVector() ||
+       Index.getSimpleValueType().is512BitVector())
+     return Op;
+
+   // AVX512F supports only 512-bit vectors: either the data or the index
+   // should be 512 bits wide. If both index and data are 256-bit but the
+   // vector contains 8 elements, we just sign-extend the index.
+   if (NumElts == 8) {
+     Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+     SDValue NewOps[] = { Gather->getOperand(0), Gather->getOperand(1),
+                          Gather->getOperand(2), Gather->getOperand(3), Index };
+     DAG.UpdateNodeOperands(Gather, NewOps);
+     return Op;
+   }
+
+   // Minimal number of elements in Gather.
+   NumElts = 8;
+
+   // Index: widen, and sign-extend 32-bit indices to 64 bits.
+   MVT WideIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+   Index = ExtendToType(Index, WideIndexVT, DAG);
+   if (IndexVT.getScalarType() == MVT::i32)
+     Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+   // Mask: at this point we have a promoted mask operand. Widen it with
+   // zeroes and truncate the elements to i1.
+   MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+   assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+   MVT WideMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+   Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
+
+   // The pass-thru value.
+   MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+   PassThru = ExtendToType(PassThru, WideDataVT, DAG);
+
+   SDValue GatherOps[] = { Gather->getChain(), PassThru, Mask,
+                           Gather->getBasePtr(), Index };
+   SDValue WideGather = DAG.getMaskedGather(
+       DAG.getVTList(WideDataVT, MVT::Other), Gather->getMemoryVT(), dl,
+       GatherOps, Gather->getMemOperand());
+   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                 WideGather.getValue(0),
+                                 DAG.getIntPtrConstant(0, dl));
+   SDValue Results[] = {Extract, WideGather.getValue(1)};
+   return DAG.getMergeValues(Results, dl);
+ }
+
+ /// Lower ISD::GC_TRANSITION_START to an X86::NOOP machine node producing
+ /// (Other, Glue); result 0 of that node is returned.
+ SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+   // TODO: Eventually, the lowering of these nodes should be informed by or
+   // deferred to the GC strategy for the function in which they appear. For
+   // now, however, they must be lowered to something. Since they are logically
+   // no-ops in the case of a null GC strategy (or a GC strategy which does not
+   // require special handling for these nodes), lower them as literal NOOPs for
+   // the time being.
+   SDLoc DL(Op);
+
+   // Forward operand 0 and, when the node has a glued node, its last operand.
+   SmallVector<SDValue, 2> NoopOps;
+   NoopOps.push_back(Op.getOperand(0));
+   if (Op->getGluedNode())
+     NoopOps.push_back(Op->getOperand(Op->getNumOperands() - 1));
+
+   SDVTList ResultTys = DAG.getVTList(MVT::Other, MVT::Glue);
+   return SDValue(DAG.getMachineNode(X86::NOOP, DL, ResultTys, NoopOps), 0);
+ }
+
+ /// Lower ISD::GC_TRANSITION_END to an X86::NOOP machine node producing
+ /// (Other, Glue); result 0 of that node is returned.
+ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+   // TODO: Eventually, the lowering of these nodes should be informed by or
+   // deferred to the GC strategy for the function in which they appear. For
+   // now, however, they must be lowered to something. Since they are logically
+   // no-ops in the case of a null GC strategy (or a GC strategy which does not
+   // require special handling for these nodes), lower them as literal NOOPs for
+   // the time being.
+   SDLoc DL(Op);
+
+   // Forward operand 0 and, when the node has a glued node, its last operand.
+   SmallVector<SDValue, 2> NoopOps;
+   NoopOps.push_back(Op.getOperand(0));
+   if (Op->getGluedNode())
+     NoopOps.push_back(Op->getOperand(Op->getNumOperands() - 1));
+
+   SDVTList ResultTys = DAG.getVTList(MVT::Other, MVT::Glue);
+   return SDValue(DAG.getMachineNode(X86::NOOP, DL, ResultTys, NoopOps), 0);
+ }
+
+ /// Provide custom lowering hooks for some operations.
+ ///
+ /// Dispatches on the opcode of \p Op to the matching Lower* helper and returns
+ /// whatever that helper returns. An opcode without an entry here is
+ /// unreachable.
+ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+   switch (Op.getOpcode()) {
+   case ISD::ATOMIC_FENCE:
+     return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+     return LowerCMP_SWAP(Op, Subtarget, DAG);
+   case ISD::CTPOP:
+     return LowerCTPOP(Op, Subtarget, DAG);
+   case ISD::ATOMIC_LOAD_ADD:
+   case ISD::ATOMIC_LOAD_SUB:
+   case ISD::ATOMIC_LOAD_OR:
+   case ISD::ATOMIC_LOAD_XOR:
+   case ISD::ATOMIC_LOAD_AND:
+     return lowerAtomicArith(Op, DAG, Subtarget);
+   case ISD::ATOMIC_STORE:
+     return LowerATOMIC_STORE(Op, DAG);
+   case ISD::BITREVERSE:
+     return LowerBITREVERSE(Op, Subtarget, DAG);
+   case ISD::BUILD_VECTOR:
+     return LowerBUILD_VECTOR(Op, DAG);
+   case ISD::CONCAT_VECTORS:
+     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
+   case ISD::VECTOR_SHUFFLE:
+     return lowerVectorShuffle(Op, Subtarget, DAG);
+   case ISD::VSELECT:
+     return LowerVSELECT(Op, DAG);
+   case ISD::EXTRACT_VECTOR_ELT:
+     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+   case ISD::INSERT_VECTOR_ELT:
+     return LowerINSERT_VECTOR_ELT(Op, DAG);
+   case ISD::EXTRACT_SUBVECTOR:
+     return LowerEXTRACT_SUBVECTOR(Op, Subtarget, DAG);
+   case ISD::INSERT_SUBVECTOR:
+     return LowerINSERT_SUBVECTOR(Op, Subtarget, DAG);
+   case ISD::SCALAR_TO_VECTOR:
+     return LowerSCALAR_TO_VECTOR(Op, DAG);
+   case ISD::ConstantPool:
+     return LowerConstantPool(Op, DAG);
+   case ISD::GlobalAddress:
+     return LowerGlobalAddress(Op, DAG);
+   case ISD::GlobalTLSAddress:
+     return LowerGlobalTLSAddress(Op, DAG);
+   case ISD::ExternalSymbol:
+     return LowerExternalSymbol(Op, DAG);
+   case ISD::BlockAddress:
+     return LowerBlockAddress(Op, DAG);
+   case ISD::SHL_PARTS:
+   case ISD::SRA_PARTS:
+   case ISD::SRL_PARTS:
+     return LowerShiftParts(Op, DAG);
+   case ISD::SINT_TO_FP:
+     return LowerSINT_TO_FP(Op, DAG);
+   case ISD::UINT_TO_FP:
+     return LowerUINT_TO_FP(Op, DAG);
+   case ISD::TRUNCATE:
+     return LowerTRUNCATE(Op, DAG);
+   case ISD::ZERO_EXTEND:
+     return LowerZERO_EXTEND(Op, Subtarget, DAG);
+   case ISD::SIGN_EXTEND:
+     return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+   case ISD::ANY_EXTEND:
+     return LowerANY_EXTEND(Op, Subtarget, DAG);
+   case ISD::ZERO_EXTEND_VECTOR_INREG:
+   case ISD::SIGN_EXTEND_VECTOR_INREG:
+     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
+   case ISD::FP_TO_SINT:
+   case ISD::FP_TO_UINT:
+     return LowerFP_TO_INT(Op, Subtarget, DAG);
+   case ISD::FP_EXTEND:
+     return LowerFP_EXTEND(Op, DAG);
+   case ISD::LOAD:
+     return LowerExtendedLoad(Op, Subtarget, DAG);
+   case ISD::FABS:
+   case ISD::FNEG:
+     return LowerFABSorFNEG(Op, DAG);
+   case ISD::FCOPYSIGN:
+     return LowerFCOPYSIGN(Op, DAG);
+   case ISD::FGETSIGN:
+     return LowerFGETSIGN(Op, DAG);
+   case ISD::SETCC:
+     return LowerSETCC(Op, DAG);
+   case ISD::SETCCE:
+     return LowerSETCCE(Op, DAG);
+   case ISD::SELECT:
+     return LowerSELECT(Op, DAG);
+   case ISD::BRCOND:
+     return LowerBRCOND(Op, DAG);
+   case ISD::JumpTable:
+     return LowerJumpTable(Op, DAG);
+   case ISD::VASTART:
+     return LowerVASTART(Op, DAG);
+   case ISD::VAARG:
+     return LowerVAARG(Op, DAG);
+   case ISD::VACOPY:
+     return LowerVACOPY(Op, Subtarget, DAG);
+   case ISD::INTRINSIC_WO_CHAIN:
+     return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
+   case ISD::INTRINSIC_VOID:
+   case ISD::INTRINSIC_W_CHAIN:
+     return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
+   case ISD::RETURNADDR:
+     return LowerRETURNADDR(Op, DAG);
+   case ISD::ADDROFRETURNADDR:
+     return LowerADDROFRETURNADDR(Op, DAG);
+   case ISD::FRAMEADDR:
+     return LowerFRAMEADDR(Op, DAG);
+   case ISD::FRAME_TO_ARGS_OFFSET:
+     return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+   case ISD::DYNAMIC_STACKALLOC:
+     return LowerDYNAMIC_STACKALLOC(Op, DAG);
+   case ISD::EH_RETURN:
+     return LowerEH_RETURN(Op, DAG);
+   case ISD::EH_SJLJ_SETJMP:
+     return lowerEH_SJLJ_SETJMP(Op, DAG);
+   case ISD::EH_SJLJ_LONGJMP:
+     return lowerEH_SJLJ_LONGJMP(Op, DAG);
+   case ISD::EH_SJLJ_SETUP_DISPATCH:
+     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
+   case ISD::INIT_TRAMPOLINE:
+     return LowerINIT_TRAMPOLINE(Op, DAG);
+   case ISD::ADJUST_TRAMPOLINE:
+     return LowerADJUST_TRAMPOLINE(Op, DAG);
+   case ISD::FLT_ROUNDS_:
+     return LowerFLT_ROUNDS_(Op, DAG);
+   case ISD::CTLZ:
+   case ISD::CTLZ_ZERO_UNDEF:
+     return LowerCTLZ(Op, Subtarget, DAG);
+   case ISD::CTTZ:
+   case ISD::CTTZ_ZERO_UNDEF:
+     return LowerCTTZ(Op, DAG);
+   case ISD::MUL:
+     return LowerMUL(Op, Subtarget, DAG);
+   case ISD::MULHS:
+   case ISD::MULHU:
+     return LowerMULH(Op, Subtarget, DAG);
+   case ISD::UMUL_LOHI:
+   case ISD::SMUL_LOHI:
+     return LowerMUL_LOHI(Op, Subtarget, DAG);
+   case ISD::ROTL:
+     return LowerRotate(Op, Subtarget, DAG);
+   case ISD::SRA:
+   case ISD::SRL:
+   case ISD::SHL:
+     return LowerShift(Op, Subtarget, DAG);
+   case ISD::SADDO:
+   case ISD::UADDO:
+   case ISD::SSUBO:
+   case ISD::USUBO:
+   case ISD::SMULO:
+   case ISD::UMULO:
+     return LowerXALUO(Op, DAG);
+   case ISD::READCYCLECOUNTER:
+     return LowerREADCYCLECOUNTER(Op, Subtarget, DAG);
+   case ISD::BITCAST:
+     return LowerBITCAST(Op, Subtarget, DAG);
+   case ISD::ADDC:
+   case ISD::ADDE:
+   case ISD::SUBC:
+   case ISD::SUBE:
+     return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+   case ISD::ADD:
+     return LowerADD(Op, DAG);
+   case ISD::SUB:
+     return LowerSUB(Op, DAG);
+   case ISD::SMAX:
+   case ISD::SMIN:
+   case ISD::UMAX:
+   case ISD::UMIN:
+     return LowerMINMAX(Op, DAG);
+   case ISD::FSINCOS:
+     return LowerFSINCOS(Op, Subtarget, DAG);
+   case ISD::MLOAD:
+     return LowerMLOAD(Op, Subtarget, DAG);
+   case ISD::MSTORE:
+     return LowerMSTORE(Op, Subtarget, DAG);
+   case ISD::MGATHER:
+     return LowerMGATHER(Op, Subtarget, DAG);
+   case ISD::MSCATTER:
+     return LowerMSCATTER(Op, Subtarget, DAG);
+   case ISD::GC_TRANSITION_START:
+     return LowerGC_TRANSITION_START(Op, DAG);
+   case ISD::GC_TRANSITION_END:
+     return LowerGC_TRANSITION_END(Op, DAG);
+   case ISD::STORE:
+     return LowerTruncatingStore(Op, Subtarget, DAG);
+   default:
+     llvm_unreachable("Should not custom lower this!");
+   }
+ }
+
+ /// Places new result values for the node in Results (their number
+ /// and types must exactly match those of the original return values of
+ /// the node), or leaves Results empty, which indicates that the node is not
+ /// to be custom lowered after all.
+ ///
+ /// The values come from LowerOperation applied to result 0 of \p N.
+ void X86TargetLowering::LowerOperationWrapper(SDNode *N,
+                                               SmallVectorImpl<SDValue> &Results,
+                                               SelectionDAG &DAG) const {
+   SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);
+
+   // A null result means "not custom lowered": leave Results untouched.
+   if (!Lowered.getNode())
+     return;
+
+   const unsigned NumResults = N->getNumValues();
+   assert((NumResults <= Lowered->getNumValues()) &&
+          "Lowering returned the wrong number of results!");
+
+   // Place new result values based on N's result count.
+   // In some cases (LowerSINT_TO_FP for example) the lowered node has more
+   // result values than the original node; the chain (last value) is dropped.
+   for (unsigned ResNo = 0; ResNo != NumResults; ++ResNo)
+     Results.push_back(Lowered.getValue(ResNo));
+ }
+
+/// Replace a node with an illegal result type with a new node built out of
+/// custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const {
+ SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case X86ISD::AVG: {
+ // Legalize types for X86ISD::AVG by expanding vectors.
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+
+ auto InVT = N->getValueType(0);
+ auto InVTSize = InVT.getSizeInBits();
+ const unsigned RegSize =
+ (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
+ assert((Subtarget.hasBWI() || RegSize < 512) &&
+ "512-bit vector requires AVX512BW");
+ assert((Subtarget.hasAVX2() || RegSize < 256) &&
+ "256-bit vector requires AVX2");
+
+ auto ElemVT = InVT.getVectorElementType();
+ auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+ RegSize / ElemVT.getSizeInBits());
+ assert(RegSize % InVT.getSizeInBits() == 0);
+ unsigned NumConcat = RegSize / InVT.getSizeInBits();
+
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+ Ops[0] = N->getOperand(0);
+ SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+ Ops[0] = N->getOperand(1);
+ SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+
+ SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
+ Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+ DAG.getIntPtrConstant(0, dl)));
+ return;
+ }
+ // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
+ case X86ISD::FMINC:
+ case X86ISD::FMIN:
+ case X86ISD::FMAXC:
+ case X86ISD::FMAX: {
+ EVT VT = N->getValueType(0);
+ assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
+ SDValue UNDEF = DAG.getUNDEF(VT);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
+ return;
+ }
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+ Results.push_back(V);
+ return;
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+ if (N->getValueType(0) == MVT::v2i32) {
+ assert((IsSigned || Subtarget.hasAVX512()) &&
+ "Can only handle signed conversion without AVX512");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ SDValue Src = N->getOperand(0);
+ if (Src.getValueType() == MVT::v2f64) {
+ SDValue Idx = DAG.getIntPtrConstant(0, dl);
+ SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
+ : X86ISD::CVTTP2UI,
+ dl, MVT::v4i32, Src);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
+ Results.push_back(Res);
+ return;
+ }
+ if (Src.getValueType() == MVT::v2f32) {
+ SDValue Idx = DAG.getIntPtrConstant(0, dl);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
+ Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
+ : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
+ Results.push_back(Res);
+ return;
+ }
+
+ // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
+ // so early out here.
+ return;
+ }
+
+ std::pair<SDValue,SDValue> Vals =
+ FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ if (FIST.getNode()) {
+ EVT VT = N->getValueType(0);
+ // Return a load from the stack slot.
+ if (StackSlot.getNode())
+ Results.push_back(
+ DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
+ else
+ Results.push_back(FIST);
+ }
+ return;
+ }
+ case ISD::SINT_TO_FP: {
+ assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
+ SDValue Src = N->getOperand(0);
+ if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
+ return;
+ Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
+ return;
+ }
+ case ISD::UINT_TO_FP: {
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2f32)
+ return;
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
+ Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
+ return;
+ }
+ if (SrcVT != MVT::v2i32)
+ return;
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
+ SDValue VBias =
+ DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+ DAG.getBitcast(MVT::v2i64, VBias));
+ Or = DAG.getBitcast(MVT::v2f64, Or);
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+ Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ return;
+ }
+ case ISD::FP_ROUND: {
+ if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+ return;
+ SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ Results.push_back(V);
+ return;
+ }
+ case ISD::FP_EXTEND: {
+ // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
+ // No other ValueType for FP_EXTEND should reach this point.
+ assert(N->getValueType(0) == MVT::v2f32 &&
+ "Do not know how to legalize this Node");
+ return;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default : llvm_unreachable("Do not know how to custom type "
+ "legalize this intrinsic operation!");
+ case Intrinsic::x86_rdtsc:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ case Intrinsic::x86_rdtscp:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+ Results);
+ case Intrinsic::x86_rdpmc:
+ return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
+
+ case Intrinsic::x86_xgetbv:
+ return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
+ }
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
+ Results.push_back(V);
+ return;
+ }
+ case ISD::READCYCLECOUNTER: {
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ }
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+ EVT T = N->getValueType(0);
+ assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+ bool Regs64bit = T == MVT::i128;
+ MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
+ SDValue cpInL, cpInH;
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(0, dl, HalfT));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(1, dl, HalfT));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ cpInH, cpInL.getValue(1));
+ SDValue swapInL, swapInH;
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(0, dl, HalfT));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(1, dl, HalfT));
+ swapInH =
+ DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, cpInH.getValue(1));
+ // If the current function needs the base pointer, RBX,
+ // we shouldn't use cmpxchg directly.
+ // Indeed the lowering of that instruction will clobber
+ // that register and since RBX will be a reserved register
+ // the register allocator will not make sure its value will
+ // be properly saved and restored around this live-range.
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ SDValue Result;
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned BasePtr = TRI->getBaseRegister();
+ MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+ if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
+ (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
+ // ISel prefers the LCMPXCHG64 variant.
+ // If that assert breaks, that means it is not the case anymore,
+ // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
+ // not just EBX. This is a matter of accepting i64 input for that
+ // pseudo, and restoring into the register of the right wide
+ // in expand pseudo. Everything else should just work.
+ assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
+ "Saving only half of the RBX");
+ unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
+ : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
+ SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX,
+ HalfT, swapInH.getValue(1));
+ SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
+ RBXSave,
+ /*Glue*/ RBXSave.getValue(2)};
+ Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ } else {
+ unsigned Opcode =
+ Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
+ swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX, swapInL,
+ swapInH.getValue(1));
+ SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
+ swapInL.getValue(1)};
+ Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ }
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ HalfT, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ HalfT, cpOutL.getValue(2));
+ SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
+ MVT::i32, cpOutH.getValue(2));
+ SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
+ Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
+
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
+ Results.push_back(Success);
+ Results.push_back(EFLAGS.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD: {
+ // Delegate to generic TypeLegalization. Situations we can really handle
+ // should have already been dealt with by AtomicExpandPass.cpp.
+ break;
+ }
+ case ISD::BITCAST: {
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ EVT DstVT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0)->getValueType(0);
+
+ if (SrcVT != MVT::f64 ||
+ (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+ return;
+
+ unsigned NumElts = DstVT.getVectorNumElements();
+ EVT SVT = DstVT.getVectorElementType();
+ EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+ SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, N->getOperand(0));
+ SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
+
+ if (ExperimentalVectorWideningLegalization) {
+ // If we are legalizing vectors by widening, we already have the desired
+ // legal vector type, just return it.
+ Results.push_back(ToVecInt);
+ return;
+ }
+
+ SmallVector<SDValue, 8> Elts;
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+ ToVecInt, DAG.getIntPtrConstant(i, dl)));
+
+ Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
+ }
+ }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch ((X86ISD::NodeType)Opcode) {
+ case X86ISD::FIRST_NUMBER: break;
+ case X86ISD::BSF: return "X86ISD::BSF";
+ case X86ISD::BSR: return "X86ISD::BSR";
+ case X86ISD::SHLD: return "X86ISD::SHLD";
+ case X86ISD::SHRD: return "X86ISD::SHRD";
+ case X86ISD::FAND: return "X86ISD::FAND";
+ case X86ISD::FANDN: return "X86ISD::FANDN";
+ case X86ISD::FOR: return "X86ISD::FOR";
+ case X86ISD::FXOR: return "X86ISD::FXOR";
+ case X86ISD::FILD: return "X86ISD::FILD";
+ case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
+ case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+ case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+ case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FLD: return "X86ISD::FLD";
+ case X86ISD::FST: return "X86ISD::FST";
+ case X86ISD::CALL: return "X86ISD::CALL";
+ case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
+ case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
+ case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
+ case X86ISD::BT: return "X86ISD::BT";
+ case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::COMI: return "X86ISD::COMI";
+ case X86ISD::UCOMI: return "X86ISD::UCOMI";
+ case X86ISD::CMPM: return "X86ISD::CMPM";
+ case X86ISD::CMPMU: return "X86ISD::CMPMU";
+ case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
+ case X86ISD::SETCC: return "X86ISD::SETCC";
+ case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
+ case X86ISD::FSETCC: return "X86ISD::FSETCC";
+ case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
+ case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
+ case X86ISD::CMOV: return "X86ISD::CMOV";
+ case X86ISD::BRCOND: return "X86ISD::BRCOND";
+ case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
+ case X86ISD::IRET: return "X86ISD::IRET";
+ case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
+ case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
+ case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
+ case X86ISD::Wrapper: return "X86ISD::Wrapper";
+ case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
+ case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
+ case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
+ case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
+ case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
+ case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
+ case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
+ case X86ISD::PINSRB: return "X86ISD::PINSRB";
+ case X86ISD::PINSRW: return "X86ISD::PINSRW";
+ case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
+ case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
+ case X86ISD::ANDNP: return "X86ISD::ANDNP";
+ case X86ISD::BLENDI: return "X86ISD::BLENDI";
+ case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
+ case X86ISD::ADDUS: return "X86ISD::ADDUS";
+ case X86ISD::SUBUS: return "X86ISD::SUBUS";
+ case X86ISD::HADD: return "X86ISD::HADD";
+ case X86ISD::HSUB: return "X86ISD::HSUB";
+ case X86ISD::FHADD: return "X86ISD::FHADD";
+ case X86ISD::FHSUB: return "X86ISD::FHSUB";
+ case X86ISD::ABS: return "X86ISD::ABS";
+ case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
+ case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
+ case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
+ case X86ISD::FMAXC: return "X86ISD::FMAXC";
+ case X86ISD::FMINC: return "X86ISD::FMINC";
+ case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
+ case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
+ case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::FRCPS: return "X86ISD::FRCPS";
+ case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
+ case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
+ case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
+ case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
+ case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
+ case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
+ case X86ISD::EH_SJLJ_SETUP_DISPATCH:
+ return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
+ case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
+ case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
+ case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
+ case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
+ case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
+ case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
+ case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
+ case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
+ return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
+ case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
+ return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
+ case X86ISD::LADD: return "X86ISD::LADD";
+ case X86ISD::LSUB: return "X86ISD::LSUB";
+ case X86ISD::LOR: return "X86ISD::LOR";
+ case X86ISD::LXOR: return "X86ISD::LXOR";
+ case X86ISD::LAND: return "X86ISD::LAND";
+ case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
+ case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VZEXT: return "X86ISD::VZEXT";
+ case X86ISD::VSEXT: return "X86ISD::VSEXT";
+ case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
+ case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
+ case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
+ case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
+ case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
+ case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
+ case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
+ case X86ISD::VINSERT: return "X86ISD::VINSERT";
+ case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
+ case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
+ case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
+ case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
+ case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
+ case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
+ case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
+ case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
+ case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
+ case X86ISD::VSHL: return "X86ISD::VSHL";
+ case X86ISD::VSRL: return "X86ISD::VSRL";
+ case X86ISD::VSRA: return "X86ISD::VSRA";
+ case X86ISD::VSHLI: return "X86ISD::VSHLI";
+ case X86ISD::VSRLI: return "X86ISD::VSRLI";
+ case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::VSRAV: return "X86ISD::VSRAV";
+ case X86ISD::VROTLI: return "X86ISD::VROTLI";
+ case X86ISD::VROTRI: return "X86ISD::VROTRI";
+ case X86ISD::VPPERM: return "X86ISD::VPPERM";
+ case X86ISD::CMPP: return "X86ISD::CMPP";
+ case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
+ case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
+ case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
+ case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
+ case X86ISD::ADD: return "X86ISD::ADD";
+ case X86ISD::SUB: return "X86ISD::SUB";
+ case X86ISD::ADC: return "X86ISD::ADC";
+ case X86ISD::SBB: return "X86ISD::SBB";
+ case X86ISD::SMUL: return "X86ISD::SMUL";
+ case X86ISD::UMUL: return "X86ISD::UMUL";
+ case X86ISD::SMUL8: return "X86ISD::SMUL8";
+ case X86ISD::UMUL8: return "X86ISD::UMUL8";
+ case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
+ case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
+ case X86ISD::INC: return "X86ISD::INC";
+ case X86ISD::DEC: return "X86ISD::DEC";
+ case X86ISD::OR: return "X86ISD::OR";
+ case X86ISD::XOR: return "X86ISD::XOR";
+ case X86ISD::AND: return "X86ISD::AND";
+ case X86ISD::BEXTR: return "X86ISD::BEXTR";
+ case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
+ case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
+ case X86ISD::PTEST: return "X86ISD::PTEST";
+ case X86ISD::TESTP: return "X86ISD::TESTP";
+ case X86ISD::TESTM: return "X86ISD::TESTM";
+ case X86ISD::TESTNM: return "X86ISD::TESTNM";
+ case X86ISD::KORTEST: return "X86ISD::KORTEST";
+ case X86ISD::KTEST: return "X86ISD::KTEST";
+ case X86ISD::PACKSS: return "X86ISD::PACKSS";
+ case X86ISD::PACKUS: return "X86ISD::PACKUS";
+ case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
+ case X86ISD::VALIGN: return "X86ISD::VALIGN";
+ case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
+ case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
+ case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
+ case X86ISD::SHUFP: return "X86ISD::SHUFP";
+ case X86ISD::SHUF128: return "X86ISD::SHUF128";
+ case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
+ case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
+ case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
+ case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
+ case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
+ case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
+ case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
+ case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
+ case X86ISD::MOVSD: return "X86ISD::MOVSD";
+ case X86ISD::MOVSS: return "X86ISD::MOVSS";
+ case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
+ case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
+ case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
+ case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
+ case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
+ case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
+ case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
+ case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
+ case X86ISD::VPERMV: return "X86ISD::VPERMV";
+ case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
+ case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
+ case X86ISD::VPERMI: return "X86ISD::VPERMI";
+ case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
+ case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
+ case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
+ case X86ISD::VRANGE: return "X86ISD::VRANGE";
+ case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
+ case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
+ case X86ISD::PSADBW: return "X86ISD::PSADBW";
+ case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
+ case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
+ case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
+ case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
+ case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
+ case X86ISD::MFENCE: return "X86ISD::MFENCE";
+ case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
+ case X86ISD::SAHF: return "X86ISD::SAHF";
+ case X86ISD::RDRAND: return "X86ISD::RDRAND";
+ case X86ISD::RDSEED: return "X86ISD::RDSEED";
+ case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
+ case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
+ case X86ISD::VPROT: return "X86ISD::VPROT";
+ case X86ISD::VPROTI: return "X86ISD::VPROTI";
+ case X86ISD::VPSHA: return "X86ISD::VPSHA";
+ case X86ISD::VPSHL: return "X86ISD::VPSHL";
+ case X86ISD::VPCOM: return "X86ISD::VPCOM";
+ case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
+ case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
+ case X86ISD::FMADD: return "X86ISD::FMADD";
+ case X86ISD::FMSUB: return "X86ISD::FMSUB";
+ case X86ISD::FNMADD: return "X86ISD::FNMADD";
+ case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
+ case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
+ case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
+ case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
+ case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
+ case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
+ case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
+ case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
+ case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
+ case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
+ case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
+ case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
+ case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
+ case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
+ case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
+ case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
+ case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
+ case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
+ case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
+ case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
+ case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
+ case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
+ case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
+ case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
+ case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
+ case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
+ case X86ISD::XTEST: return "X86ISD::XTEST";
+ case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
+ case X86ISD::EXPAND: return "X86ISD::EXPAND";
+ case X86ISD::SELECT: return "X86ISD::SELECT";
+ case X86ISD::SELECTS: return "X86ISD::SELECTS";
+ case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::RCP28S: return "X86ISD::RCP28S";
+ case X86ISD::EXP2: return "X86ISD::EXP2";
+ case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
+ case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
+ case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
+ case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
+ case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
+ case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
+ case X86ISD::SCALEF: return "X86ISD::SCALEF";
+ case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
+ case X86ISD::ADDS: return "X86ISD::ADDS";
+ case X86ISD::SUBS: return "X86ISD::SUBS";
+ case X86ISD::AVG: return "X86ISD::AVG";
+ case X86ISD::MULHRS: return "X86ISD::MULHRS";
+ case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
+ case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
+ case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
+ case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
+ case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
+ case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
+ case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
+ case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
+ case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
+ case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
+ case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
+ case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
+ case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
+ case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
+ case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
+ case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
+ case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
+ case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
+ case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
+ case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
+ case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
+ case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
+ case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
+ }
+ return nullptr;
+}
+
+/// Decide whether the address described by \p AM can be used directly as an
+/// x86 memory operand.
+///
+/// \p DL, \p Ty and \p AS are not consulted by this implementation; the
+/// answer depends only on \p AM, the code model and the subtarget.
+bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                              const AddrMode &AM, Type *Ty,
+                                              unsigned AS) const {
+  const CodeModel::Model CM = getTargetMachine().getCodeModel();
+  const bool HasGlobal = AM.BaseGV != nullptr;
+
+  // The displacement is a sign-extended 32-bit immediate, so the offset has
+  // to be acceptable for the active code model.
+  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, CM, HasGlobal))
+    return false;
+
+  if (HasGlobal) {
+    const unsigned RefFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
+
+    // A global reached through a stub needs an extra load; it cannot fold.
+    if (isGlobalStubReference(RefFlags))
+      return false;
+
+    // The PIC base already occupies the base register slot.
+    if (AM.HasBaseReg && isGlobalRelativeToPICBase(RefFlags))
+      return false;
+
+    // Without the lower 4G we must address rip-relative, which leaves no
+    // room for an extra offset or a scaled index.
+    const bool Low4GUnavailable =
+        CM != CodeModel::Small || isPositionIndependent();
+    if (Low4GUnavailable && Subtarget.is64Bit() &&
+        (AM.BaseOffs || AM.Scale > 1))
+      return false;
+  }
+
+  switch (AM.Scale) {
+  case 0:
+  case 1:
+  case 2:
+  case 4:
+  case 8:
+    // Directly encodable scales.
+    return true;
+  case 3:
+  case 5:
+  case 9:
+    // Formed as basereg + scalereg, so the base register must still be free.
+    return !AM.HasBaseReg;
+  default:
+    // No other scale can be represented.
+    return false;
+  }
+}
+
+/// Report whether shifting a vector of type \p Ty by a scalar amount is
+/// significantly cheaper than shifting it by a fully general vector.
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+  switch (Ty->getScalarSizeInBits()) {
+  case 8:
+    // 8-bit shifts are always expensive; a scalar amount does not make them
+    // particularly cheaper.
+    return false;
+  case 32:
+  case 64:
+    // AVX2 adds vpsllv[dq] (and other shifts) that make variable shifts just
+    // as cheap as scalar ones.
+    return !Subtarget.hasInt256();
+  default:
+    // Everything else is much cheaper with a scalar shift amount.
+    return true;
+  }
+}
+
+/// Truncation is reported as free only between two integer types where
+/// \p Ty1 is strictly wider than \p Ty2.
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
+  return BothIntegers &&
+         Ty1->getPrimitiveSizeInBits() > Ty2->getPrimitiveSizeInBits();
+}
+
+/// Report whether truncating from \p Ty1 to \p Ty2 is acceptable for a tail
+/// call: both must be integer types and \p Ty1 must be a legal type.
+bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
+  if (!BothIntegers || !isTypeLegal(EVT::getEVT(Ty1)))
+    return false;
+
+  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+  // Provided the caller has no zeroext/signext return parameter, truncating
+  // all the way down to i1 is valid.
+  return true;
+}
+
+/// An integer-compare immediate is legal when \p Imm satisfies isInt<32>.
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  const bool FitsImm32 = isInt<32>(Imm);
+  return FitsImm32;
+}
+
+/// An add immediate is legal when \p Imm satisfies isInt<32>; negated
+/// immediates can be handled with sub instead.
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  const bool FitsImm32 = isInt<32>(Imm);
+  return FitsImm32;
+}
+
+/// Truncation is reported as free only between two integer value types where
+/// \p VT1 is strictly wider than \p VT2.
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  const bool BothIntegers = VT1.isInteger() && VT2.isInteger();
+  return BothIntegers && VT1.getSizeInBits() > VT2.getSizeInBits();
+}
+
+/// Zero extension is free only for i32 -> i64 on a 64-bit subtarget, where
+/// 32-bit results are implicitly zero-extended in 64-bit registers.
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy(32))
+    return false;
+  if (!Ty2->isIntegerTy(64))
+    return false;
+  return Subtarget.is64Bit();
+}
+
+/// Zero extension is free only for i32 -> i64 on a 64-bit subtarget, where
+/// 32-bit results are implicitly zero-extended in 64-bit registers.
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  const bool IsI32ToI64 = VT1 == MVT::i32 && VT2 == MVT::i64;
+  return IsI32ToI64 && Subtarget.is64Bit();
+}
+
+/// Report whether zero-extending \p Val to \p VT2 costs nothing.
+///
+/// That holds when the type-only overload already says so, or when \p Val is
+/// a load of a simple i8/i16/i32 integer being extended to a simple integer
+/// type.
+bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  const EVT SrcVT = Val.getValueType();
+  if (isZExtFree(SrcVT, VT2))
+    return true;
+
+  // Beyond that, only loads can extend for free.
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  const bool SimpleIntegers = SrcVT.isSimple() && SrcVT.isInteger() &&
+                              VT2.isSimple() && VT2.isInteger();
+  if (!SimpleIntegers)
+    return false;
+
+  // X86 has 8, 16, and 32-bit zero-extending loads.
+  const MVT::SimpleValueType SrcTy = SrcVT.getSimpleVT().SimpleTy;
+  return SrcTy == MVT::i8 || SrcTy == MVT::i16 || SrcTy == MVT::i32;
+}
+
+/// Unconditionally reports vector load extension as desirable; the value
+/// argument is ignored.
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const {
+  return true;
+}
+
+/// Report FMA as faster than separate multiply and add when the subtarget
+/// has any FMA support and the scalar type of \p VT is f32 or f64.
+bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  if (!Subtarget.hasAnyFMA())
+    return false;
+
+  // Only the element type matters.
+  const EVT EltVT = VT.getScalarType();
+  if (!EltVT.isSimple())
+    return false;
+
+  const MVT::SimpleValueType EltTy = EltVT.getSimpleVT().SimpleTy;
+  return EltTy == MVT::f32 || EltTy == MVT::f64;
+}
+
+/// Narrowing is considered profitable for every pair except i32 -> i16:
+/// i16 instructions are longer (0x66 prefix) and potentially slower.
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
+  return VT1 != MVT::i32 || VT2 != MVT::i16;
+}
+
+/// Hook that lets a target restrict VECTOR_SHUFFLE to specific masks.
+///
+/// Here the mask \p M itself is never inspected: the answer depends only on
+/// \p VT. Non-simple types, i1 vectors and 64-bit vectors are rejected;
+/// any other type is accepted exactly when it is legal.
+bool X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                           EVT VT) const {
+  if (!VT.isSimple())
+    return false;
+
+  const MVT SimpleVT = VT.getSimpleVT();
+
+  // i1 vectors are excluded, and very little shuffling can be done for
+  // 64-bit vectors right now.
+  if (SimpleVT.getScalarType() == MVT::i1 || SimpleVT.getSizeInBits() == 64)
+    return false;
+
+  // For a legal type the lowering can handle any mask that results.
+  return isTypeLegal(SimpleVT);
+}
+
+/// Clear masks get no special treatment: the answer is whatever
+/// isShuffleMaskLegal gives for the same \p Mask and \p VT.
+bool X86TargetLowering::isVectorClearMaskLegal(
+    const SmallVectorImpl<int> &Mask, EVT VT) const {
+  const bool Legal = isShuffleMaskLegal(Mask, VT);
+  return Legal;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+/// Expand the pseudo \p MI that yields the value of xbegin (the start of an
+/// RTM region) into explicit control flow.
+///
+/// \p MBB is split after \p MI: two new blocks are inserted right behind it,
+/// the instructions following \p MI and the successor edges move to the last
+/// one, and \p MI is erased. Returns that last block.
+static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
+                                     const TargetInstrInfo *TII) {
+  const DebugLoc Loc = MI.getDebugLoc();
+  const BasicBlock *IRBlock = MBB->getBasicBlock();
+  MachineFunction *Fn = MBB->getParent();
+  MachineFunction::iterator InsertPt = ++MBB->getIterator();
+
+  // Layout produced for `v = xbegin()`:
+  //
+  //   MBB:      xbegin TailMBB
+  //   InitMBB:  eax = -1
+  //   TailMBB:  v = eax
+  MachineBasicBlock *InitMBB = Fn->CreateMachineBasicBlock(IRBlock);
+  MachineBasicBlock *TailMBB = Fn->CreateMachineBasicBlock(IRBlock);
+  Fn->insert(InsertPt, InitMBB);
+  Fn->insert(InsertPt, TailMBB);
+
+  // Whatever followed MI, together with the outgoing edges, now belongs to
+  // TailMBB.
+  TailMBB->splice(TailMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  TailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // MBB: xbegin falls through to InitMBB and targets TailMBB on abort.
+  BuildMI(MBB, Loc, TII->get(X86::XBEGIN_4)).addMBB(TailMBB);
+  MBB->addSuccessor(InitMBB);
+  MBB->addSuccessor(TailMBB);
+
+  // InitMBB: EAX = -1
+  BuildMI(InitMBB, Loc, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+  InitMBB->addSuccessor(TailMBB);
+
+  // TailMBB: EAX is live in and is copied into the pseudo's result register.
+  TailMBB->addLiveIn(X86::EAX);
+  BuildMI(*TailMBB, TailMBB->begin(), Loc, TII->get(TargetOpcode::COPY),
+          MI.getOperand(0).getReg())
+      .addReg(X86::EAX);
+
+  MI.eraseFromParent();
+  return TailMBB;
+}
+
+// FIXME: Once size-specific XMM0 registers exist (e.g. XMM0_V16I8, or
+// XMM0_V32I8 for AVX), this whole expansion can move into the .td file.
+//
+// Replaces a PCMP[IE]STRM128 pseudo with the real instruction followed by a
+// copy of XMM0 into the pseudo's result register (operand 0). Returns \p BB.
+static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
+  // Pick the real opcode that corresponds to the pseudo.
+  unsigned RealOpc;
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRM128REG:
+    RealOpc = X86::PCMPISTRM128rr;
+    break;
+  case X86::VPCMPISTRM128REG:
+    RealOpc = X86::VPCMPISTRM128rr;
+    break;
+  case X86::PCMPISTRM128MEM:
+    RealOpc = X86::PCMPISTRM128rm;
+    break;
+  case X86::VPCMPISTRM128MEM:
+    RealOpc = X86::VPCMPISTRM128rm;
+    break;
+  case X86::PCMPESTRM128REG:
+    RealOpc = X86::PCMPESTRM128rr;
+    break;
+  case X86::VPCMPESTRM128REG:
+    RealOpc = X86::VPCMPESTRM128rr;
+    break;
+  case X86::PCMPESTRM128MEM:
+    RealOpc = X86::PCMPESTRM128rm;
+    break;
+  case X86::VPCMPESTRM128MEM:
+    RealOpc = X86::VPCMPESTRM128rm;
+    break;
+  }
+
+  const DebugLoc Loc = MI.getDebugLoc();
+  MachineInstrBuilder Builder = BuildMI(*BB, MI, Loc, TII->get(RealOpc));
+
+  // Forward every operand after the result, leaving out implicit registers.
+  for (unsigned Idx = 1, End = MI.getNumOperands(); Idx < End; ++Idx) {
+    MachineOperand &Operand = MI.getOperand(Idx);
+    if (Operand.isReg() && Operand.isImplicit())
+      continue;
+    Builder.addOperand(Operand);
+  }
+  if (MI.hasOneMemOperand())
+    Builder->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  // The result of the real instruction is read back from XMM0.
+  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY),
+          MI.getOperand(0).getReg())
+      .addReg(X86::XMM0);
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+// FIXME: This is expanded by hand because TableGen cannot express multiple
+// implicit defs in an instruction pattern.
+//
+// Replaces a PCMP[IE]STRI pseudo with the real instruction followed by a
+// copy of ECX into the pseudo's result register (operand 0). Returns \p BB.
+static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
+  // Pick the real opcode that corresponds to the pseudo.
+  unsigned RealOpc;
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRIREG:
+    RealOpc = X86::PCMPISTRIrr;
+    break;
+  case X86::VPCMPISTRIREG:
+    RealOpc = X86::VPCMPISTRIrr;
+    break;
+  case X86::PCMPISTRIMEM:
+    RealOpc = X86::PCMPISTRIrm;
+    break;
+  case X86::VPCMPISTRIMEM:
+    RealOpc = X86::VPCMPISTRIrm;
+    break;
+  case X86::PCMPESTRIREG:
+    RealOpc = X86::PCMPESTRIrr;
+    break;
+  case X86::VPCMPESTRIREG:
+    RealOpc = X86::VPCMPESTRIrr;
+    break;
+  case X86::PCMPESTRIMEM:
+    RealOpc = X86::PCMPESTRIrm;
+    break;
+  case X86::VPCMPESTRIMEM:
+    RealOpc = X86::VPCMPESTRIrm;
+    break;
+  }
+
+  const DebugLoc Loc = MI.getDebugLoc();
+  MachineInstrBuilder Builder = BuildMI(*BB, MI, Loc, TII->get(RealOpc));
+
+  // Operand 0 is the result and is dropped; of the rest, implicit register
+  // operands are left out.
+  for (unsigned Idx = 1, End = MI.getNumOperands(); Idx < End; ++Idx) {
+    MachineOperand &Operand = MI.getOperand(Idx);
+    if (Operand.isReg() && Operand.isImplicit())
+      continue;
+    Builder.addOperand(Operand);
+  }
+  if (MI.hasOneMemOperand())
+    Builder->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  // The result of the real instruction is read back from ECX.
+  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY),
+          MI.getOperand(0).getReg())
+      .addReg(X86::ECX);
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+/// Expand the WRPKRU pseudo \p MI in place: copy its operand 0 into EAX,
+/// zero ECX and EDX, emit WRPKRUr, then erase the pseudo. Returns \p BB.
+static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
+                                     const X86Subtarget &Subtarget) {
+  const TargetInstrInfo *InstrInfo = Subtarget.getInstrInfo();
+  const DebugLoc Loc = MI.getDebugLoc();
+  const unsigned ValueReg = MI.getOperand(0).getReg();
+
+  // EAX <- input value
+  BuildMI(*BB, MI, Loc, InstrInfo->get(TargetOpcode::COPY), X86::EAX)
+      .addReg(ValueReg);
+
+  // ECX <- 0
+  BuildMI(*BB, MI, Loc, InstrInfo->get(X86::MOV32r0), X86::ECX);
+
+  // EDX <- 0
+  BuildMI(*BB, MI, Loc, InstrInfo->get(X86::MOV32r0), X86::EDX);
+
+  // The instruction itself takes no explicit operands here.
+  BuildMI(*BB, MI, Loc, InstrInfo->get(X86::WRPKRUr));
+
+  // The pseudo has been fully replaced.
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Expands the RDPKRU pseudo in front of MI: ECX is zeroed with MOV32r0,
+// RDPKRUr is emitted, and EAX is copied into the pseudo's result register
+// (operand 0). The pseudo is then erased and BB is returned unchanged.
+static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
+                                     const X86Subtarget &Subtarget) {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc Loc = MI.getDebugLoc();
+
+  // ECX <- 0.
+  BuildMI(*BB, MI, Loc, TII->get(X86::MOV32r0), X86::ECX);
+
+  // The instruction itself carries no explicit operands.
+  BuildMI(*BB, MI, Loc, TII->get(X86::RDPKRUr));
+
+  // result <- EAX.
+  const unsigned ResultReg = MI.getOperand(0).getReg();
+  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY), ResultReg)
+      .addReg(X86::EAX);
+
+  // Drop the pseudo now that its expansion is in place.
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Expands a pseudo whose real instruction (Opc) takes no explicit operands:
+// the address described by MI's leading memory operands is materialised into
+// RAX (64-bit) or EAX with an LEA, the two register operands that follow are
+// copied into ECX and EDX, and finally Opc itself is emitted.
+static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
+                                      const X86Subtarget &Subtarget,
+                                      unsigned Opc) {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc Loc = MI.getDebugLoc();
+
+  // Choose the LEA flavour and the address register for the current mode.
+  unsigned LeaOpc = X86::LEA32r;
+  unsigned AddrReg = X86::EAX;
+  if (Subtarget.is64Bit()) {
+    LeaOpc = X86::LEA64r;
+    AddrReg = X86::RAX;
+  }
+
+  // RAX/EAX <- effective address built from the memory operands.
+  MachineInstrBuilder Lea = BuildMI(*BB, MI, Loc, TII->get(LeaOpc), AddrReg);
+  for (int Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
+    Lea.addOperand(MI.getOperand(Idx));
+
+  // The two value operands sit right after the address operands.
+  const unsigned FirstValIdx = X86::AddrNumOperands;
+  const unsigned SecondValIdx = FirstValIdx + 1;
+  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY), X86::ECX)
+      .addReg(MI.getOperand(FirstValIdx).getReg());
+  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY), X86::EDX)
+      .addReg(MI.getOperand(SecondValIdx).getReg());
+
+  // The real instruction is emitted without any operands.
+  BuildMI(*BB, MI, Loc, TII->get(Opc));
+
+  // Drop the pseudo now that its expansion is in place.
+  MI.eraseFromParent();
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ // Expand the VAARG_64 pseudo MI on X86-64 into explicit va_list accesses.
+ // Returns the block that ends up holding the code that followed MI.
+ // Operands to this pseudo-instruction:
+ // 0 ) Output : destination address (reg)
+ // 1-5) Input : va_list address (addr, i64mem)
+ // 6 ) ArgSize : Size (in bytes) of vararg type
+ // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
+ // 8 ) Align : Alignment of type
+ // 9 ) EFLAGS (implicit-def)
+
+ assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
+ static_assert(X86::AddrNumOperands == 5,
+ "VAARG_64 assumes 5 address operands");
+
+ unsigned DestReg = MI.getOperand(0).getReg();
+ MachineOperand &Base = MI.getOperand(1);
+ MachineOperand &Scale = MI.getOperand(2);
+ MachineOperand &Index = MI.getOperand(3);
+ MachineOperand &Disp = MI.getOperand(4);
+ MachineOperand &Segment = MI.getOperand(5);
+ unsigned ArgSize = MI.getOperand(6).getImm();
+ unsigned ArgMode = MI.getOperand(7).getImm();
+ unsigned Align = MI.getOperand(8).getImm();
+
+ // Memory Reference (reused on every load/store emitted below)
+ assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+ // Machine Information
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
+ const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
+ DebugLoc DL = MI.getDebugLoc();
+
+ // struct va_list {
+ // i32 gp_offset
+ // i32 fp_offset
+ // i64 overflow_area (address)
+ // i64 reg_save_area (address)
+ // }
+ // sizeof(va_list) = 24
+ // alignment(va_list) = 8
+
+ unsigned TotalNumIntRegs = 6;
+ unsigned TotalNumXMMRegs = 8;
+ bool UseGPOffset = (ArgMode == 1);
+ bool UseFPOffset = (ArgMode == 2);
+ unsigned MaxOffset = TotalNumIntRegs * 8 +
+ (UseFPOffset ? TotalNumXMMRegs * 16 : 0); // 48, or 176 when fp_offset is used
+
+ /* Round ArgSize up to a multiple of 8 */
+ unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
+ bool NeedsAlign = (Align > 8); // overflow addresses are only kept 8-byte aligned
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *overflowMBB;
+ MachineBasicBlock *offsetMBB;
+ MachineBasicBlock *endMBB;
+
+ unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
+ unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
+ unsigned OffsetReg = 0; // gp_offset/fp_offset value loaded in thisMBB
+
+ if (!UseGPOffset && !UseFPOffset) {
+ // If we only pull from the overflow region, we don't create a branch.
+ // We don't need to alter control flow.
+ OffsetDestReg = 0; // unused
+ OverflowDestReg = DestReg;
+
+ offsetMBB = nullptr;
+ overflowMBB = thisMBB;
+ endMBB = thisMBB;
+ } else {
+ // First emit code to check if gp_offset (or fp_offset) is below the bound.
+ // If so, pull the argument from reg_save_area. (branch to offsetMBB)
+ // If not, pull from overflow_area. (branch to overflowMBB)
+ //
+ // thisMBB
+ // | .
+ // | .
+ // offsetMBB overflowMBB
+ // | .
+ // | .
+ // endMBB
+
+ // Registers for the PHI in endMBB
+ OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
+ OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
+
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction *MF = MBB->getParent();
+ overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+
+ // Insert the new basic blocks (layout: offsetMBB, overflowMBB, endMBB)
+ MF->insert(MBBIter, offsetMBB);
+ MF->insert(MBBIter, overflowMBB);
+ MF->insert(MBBIter, endMBB);
+
+ // Transfer the remainder of MBB and its successor edges to endMBB.
+ endMBB->splice(endMBB->begin(), thisMBB,
+ std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
+ endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
+
+ // Make offsetMBB and overflowMBB successors of thisMBB
+ thisMBB->addSuccessor(offsetMBB);
+ thisMBB->addSuccessor(overflowMBB);
+
+ // endMBB is a successor of both offsetMBB and overflowMBB
+ offsetMBB->addSuccessor(endMBB);
+ overflowMBB->addSuccessor(endMBB);
+
+ // Load the offset value into a register
+ OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0) // fp_offset is 4 bytes into va_list, gp_offset at 0
+ .addOperand(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // Check if there is enough room left to pull this argument.
+ BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
+ .addReg(OffsetReg)
+ .addImm(MaxOffset + 8 - ArgSizeA8);
+
+ // Branch to "overflowMBB" if offset >= max
+ // Fall through to "offsetMBB" otherwise
+ BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
+ .addMBB(overflowMBB);
+ }
+
+ // In offsetMBB, emit code to use the reg_save_area.
+ if (offsetMBB) {
+ assert(OffsetReg != 0);
+
+ // Read the reg_save_area address.
+ unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, 16) // reg_save_area field
+ .addOperand(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // Zero-extend the offset
+ unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+ .addImm(0)
+ .addReg(OffsetReg)
+ .addImm(X86::sub_32bit);
+
+ // Add the offset to the reg_save_area to get the final address.
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+ .addReg(OffsetReg64)
+ .addReg(RegSaveReg);
+
+ // Compute the offset for the next argument
+ unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
+ .addReg(OffsetReg)
+ .addImm(UseFPOffset ? 16 : 8); // advance by one 16-byte or one 8-byte slot
+
+ // Store it back into the va_list.
+ BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0) // same field that was loaded in thisMBB
+ .addOperand(Segment)
+ .addReg(NextOffsetReg)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // Jump to endMBB
+ BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
+ .addMBB(endMBB);
+ }
+
+ //
+ // Emit code to use overflow area
+ //
+
+ // Load the overflow_area address into a register.
+ unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, 8) // overflow_area field
+ .addOperand(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // If we need to align it, do so. Otherwise, just copy the address
+ // to OverflowDestReg.
+ if (NeedsAlign) {
+ // Align the overflow address
+ assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
+ unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+
+ // aligned_addr = (addr + (align-1)) & ~(align-1)
+ BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
+ .addReg(OverflowAddrReg)
+ .addImm(Align-1);
+
+ BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
+ .addReg(TmpReg)
+ .addImm(~(uint64_t)(Align-1));
+ } else {
+ BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
+ .addReg(OverflowAddrReg);
+ }
+
+ // Compute the next overflow address after this argument.
+ // (the overflow address should be kept 8-byte aligned)
+ unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
+ .addReg(OverflowDestReg)
+ .addImm(ArgSizeA8);
+
+ // Store the new overflow address.
+ BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, 8) // overflow_area field
+ .addOperand(Segment)
+ .addReg(NextAddrReg)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // If we branched, emit the PHI to the front of endMBB.
+ if (offsetMBB) {
+ BuildMI(*endMBB, endMBB->begin(), DL,
+ TII->get(X86::PHI), DestReg)
+ .addReg(OffsetDestReg).addMBB(offsetMBB)
+ .addReg(OverflowDestReg).addMBB(overflowMBB);
+ }
+
+ // Erase the pseudo instruction
+ MI.eraseFromParent();
+
+ return endMBB; // same block as thisMBB when no branch was created
+}
+
+// Custom inserter that stores the XMM argument registers listed on MI into
+// the register save area.
+//
+// Operands read from MI:
+//   0        count register (%al); tested for zero unless the function uses
+//            the Win64 calling convention
+//   1        frame index of the register save area (immediate)
+//   2        byte offset of the first XMM slot within that area (immediate)
+//   3..N-2   XMM registers to store, 16 bytes apart
+//   N-1      expected to be EFLAGS (never stored)
+//
+// MBB is split in three: MBB itself (ending in the optional test-and-branch),
+// a block holding all the stores, and an end block that receives the
+// instructions that followed MI. The end block is returned.
+MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+    MachineInstr &MI, MachineBasicBlock *MBB) const {
+  // Emit code to save XMM registers to the stack. The ABI says that the
+  // number of registers to save is given in %al, so it's theoretically
+  // possible to do an indirect jump trick to avoid saving all of them,
+  // however this code takes a simpler approach and just executes all
+  // of the stores if %al is non-zero. It's less code, and it's probably
+  // easier on the hardware branch predictor, and stores aren't all that
+  // expensive anyway.
+
+  // Create the new basic blocks. One block contains all the XMM stores,
+  // and one block is the final destination regardless of whether any
+  // stores were performed.
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction *F = MBB->getParent();
+  MachineFunction::iterator MBBIter = ++MBB->getIterator();
+  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(MBBIter, XMMSaveMBB);
+  F->insert(MBBIter, EndMBB);
+
+  // Transfer the remainder of MBB and its successor edges to EndMBB.
+  EndMBB->splice(EndMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // The original block will now fall through to the XMM save block.
+  MBB->addSuccessor(XMMSaveMBB);
+  // The XMMSaveMBB will fall through to the end block.
+  XMMSaveMBB->addSuccessor(EndMBB);
+
+  // Now add the instructions.
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned CountReg = MI.getOperand(0).getReg();
+  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
+  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
+
+  if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
+    // If %al is 0, branch around the XMM save block.
+    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
+    MBB->addSuccessor(EndMBB);
+  }
+
+  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
+  // that was just emitted, but clearly shouldn't be "saved".
+  assert((MI.getNumOperands() <= 3 ||
+          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
+          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
+         "Expected last argument to be EFLAGS");
+  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+  // In the XMM save block, save all the XMM argument registers.
+  //
+  // The bound is checked with '<' rather than '!=': the assert above accepts
+  // a pseudo with three or fewer operands, for which 'e' is below the start
+  // index. With '!=' such an instruction would index past its operand list;
+  // with '<' it simply produces no stores.
+  for (int i = 3, e = MI.getNumOperands() - 1; i < e; ++i) {
+    // Slot for the (i - 3)-th XMM register; slots are 16 bytes apart.
+    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+    MachineMemOperand *MMO = F->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
+        MachineMemOperand::MOStore,
+        /*Size=*/16, /*Align=*/16);
+    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
+        .addFrameIndex(RegSaveFrameIndex)
+        .addImm(/*Scale=*/1)
+        .addReg(/*IndexReg=*/0)
+        .addImm(/*Disp=*/Offset)
+        .addReg(/*Segment=*/0)
+        .addReg(MI.getOperand(i).getReg())
+        .addMemOperand(MMO);
+  }
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  return EndMBB;
+}
+
+// ISel may leave the EFLAGS operand of SelectItr without a kill marker when
+// EFLAGS had several uses and it could not tell which one was last. This
+// helper decides whether SelectItr is in fact the last reader of EFLAGS:
+//   - a later instruction in BB reads EFLAGS          -> not a kill
+//   - a later instruction in BB redefines EFLAGS first -> kill
+//   - the block ends and a successor has EFLAGS live-in -> not a kill
+//   - otherwise                                         -> kill
+// When it is a kill, the marker is added to SelectItr. Returns the correct
+// kill marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+                                     MachineBasicBlock* BB,
+                                     const TargetRegisterInfo* TRI) {
+  // Walk the instructions after SelectItr until EFLAGS is read or redefined.
+  bool RedefinedInBlock = false;
+  const MachineBasicBlock::iterator BlockEnd = BB->end();
+  for (MachineBasicBlock::iterator Scan = std::next(SelectItr);
+       Scan != BlockEnd; ++Scan) {
+    const MachineInstr &Inst = *Scan;
+    if (Inst.readsRegister(X86::EFLAGS))
+      return false;
+    if (Inst.definesRegister(X86::EFLAGS)) {
+      RedefinedInBlock = true;
+      break;
+    }
+  }
+
+  // Reaching the end of the block without a redefinition means EFLAGS may
+  // still be consumed by a successor.
+  if (!RedefinedInBlock) {
+    MachineBasicBlock::succ_iterator SuccIt = BB->succ_begin();
+    const MachineBasicBlock::succ_iterator SuccEnd = BB->succ_end();
+    while (SuccIt != SuccEnd) {
+      if ((*SuccIt)->isLiveIn(X86::EFLAGS))
+        return false;
+      ++SuccIt;
+    }
+  }
+
+  // Either a def was found, or the block ended and EFLAGS is not live out:
+  // SelectItr should carry the kill flag on EFLAGS.
+  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+  return true;
+}
+
+// Tells whether MI is one of the CMOV pseudo-opcodes that may be cascaded
+// with other CMOV pseudos into a single basic block guarded by one
+// conditional jump.
+static bool isCMOVPseudo(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  // FR* and GR* forms.
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_GR8:
+  case X86::CMOV_GR16:
+  case X86::CMOV_GR32:
+  // RFP* forms.
+  case X86::CMOV_RFP32:
+  case X86::CMOV_RFP64:
+  case X86::CMOV_RFP80:
+  // Vector value forms.
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
+  // i1 vector forms.
+  case X86::CMOV_V8I1:
+  case X86::CMOV_V16I1:
+  case X86::CMOV_V32I1:
+  case X86::CMOV_V64I1:
+    return true;
+  }
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ // Lower the CMOV pseudo MI (and any CMOVs merged with it) into branches and PHIs.
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+
+ // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+ // as described above, by inserting a BB, and then making a PHI at the join
+ // point to select the true and false operands of the CMOV in the PHI.
+ //
+ // The code also handles two different cases of multiple CMOV opcodes
+ // in a row.
+ //
+ // Case 1:
+ // In this case, there are multiple CMOVs in a row, all which are based on
+ // the same condition setting (or the exact opposite condition setting).
+ // In this case we can lower all the CMOVs using a single inserted BB, and
+ // then make a number of PHIs at the join point to model the CMOVs. The only
+ // trickiness here, is that in a case like:
+ //
+ // t2 = CMOV cond1 t1, f1
+ // t3 = CMOV cond1 t2, f2
+ //
+ // when rewriting this into PHIs, we have to perform some renaming on the
+ // temps since you cannot have a PHI operand refer to a PHI result earlier
+ // in the same block. The "simple" but wrong lowering would be:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t2(BB1), f2(BB2)
+ //
+ // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+ // renaming is to note that on the path through BB1, t2 is really just a
+ // copy of t1, and do that renaming, properly generating:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t1(BB1), f2(BB2)
+ //
+ // Case 2, we lower cascaded CMOVs such as
+ //
+ // (CMOV (CMOV F, T, cc1), T, cc2)
+ //
+ // to two successive branches. For that, we look for another CMOV as the
+ // following instruction.
+ //
+ // Without this, we would add a PHI between the two jumps, which ends up
+ // creating a few copies all around. For instance, for
+ //
+ // (sitofp (zext (fcmp une)))
+ //
+ // we would generate:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // movaps %xmm0, %xmm1
+ // jne .LBB5_2
+ // xorps %xmm1, %xmm1
+ // .LBB5_2:
+ // jp .LBB5_4
+ // movaps %xmm1, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ // because this custom-inserter would have generated:
+ //
+ // A
+ // | \
+ // | B
+ // | /
+ // C
+ // | \
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // B: empty
+ // C: Z = PHI [X, A], [Y, B]
+ // D: empty
+ // E: PHI [X, C], [Z, D]
+ //
+ // If we lower both CMOVs in a single step, we can instead generate:
+ //
+ // A
+ // | \
+ // | C
+ // | /|
+ // |/ |
+ // | |
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // D: empty
+ // E: PHI [X, A], [X, C], [Y, D]
+ //
+ // Which, in our sitofp/fcmp example, gives us something like:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // jne .LBB5_4
+ // jp .LBB5_4
+ // xorps %xmm0, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ MachineInstr *CascadedCMOV = nullptr;
+ MachineInstr *LastCMOV = &MI;
+ X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); // condition code is operand 3
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+ MachineBasicBlock::iterator NextMIIt =
+ std::next(MachineBasicBlock::iterator(MI));
+
+ // Check for case 1, where there are multiple CMOVs with the same condition
+ // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
+ // number of jumps the most.
+
+ if (isCMOVPseudo(MI)) {
+ // See if we have a string of CMOVS with the same condition.
+ while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
+ (NextMIIt->getOperand(3).getImm() == CC ||
+ NextMIIt->getOperand(3).getImm() == OppCC)) {
+ LastCMOV = &*NextMIIt;
+ ++NextMIIt;
+ }
+ }
+
+ // This checks for case 2, but only do this if we didn't already find
+ // case 1, as indicated by LastCMOV == MI.
+ if (LastCMOV == &MI && NextMIIt != BB->end() &&
+ NextMIIt->getOpcode() == MI.getOpcode() &&
+ NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
+ NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
+ NextMIIt->getOperand(1).isKill()) {
+ CascadedCMOV = &*NextMIIt;
+ }
+
+ MachineBasicBlock *jcc1MBB = nullptr;
+
+ // If we have a cascaded CMOV, we lower it to two successive branches to
+ // the same block. EFLAGS is used by both, so mark it as live in the second.
+ if (CascadedCMOV) {
+ jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, jcc1MBB);
+ jcc1MBB->addLiveIn(X86::EFLAGS);
+ }
+
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // If the EFLAGS register isn't dead in the terminator, then claim that it's
+ // live into the sink and copy blocks.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
+ if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
+ copy0MBB->addLiveIn(X86::EFLAGS);
+ sinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add the true and fallthrough blocks as its successors.
+ if (CascadedCMOV) {
+ // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
+ BB->addSuccessor(jcc1MBB);
+
+ // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
+ // jump to the sinkMBB.
+ jcc1MBB->addSuccessor(copy0MBB);
+ jcc1MBB->addSuccessor(sinkMBB);
+ } else {
+ BB->addSuccessor(copy0MBB);
+ }
+
+ // The true block target of the first (or only) branch is always sinkMBB.
+ BB->addSuccessor(sinkMBB);
+
+ // Create the conditional branch instruction.
+ unsigned Opc = X86::GetCondBranchFromCond(CC);
+ BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+
+ if (CascadedCMOV) {
+ unsigned Opc2 = X86::GetCondBranchFromCond(
+ (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
+ BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
+ }
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ copy0MBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; // PHI dest -> (copy0MBB input, thisMBB input)
+ MachineInstrBuilder MIB;
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from earlier PHI's
+ // destination registers, and the registers that went into the PHI.
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ unsigned DestReg = MIIt->getOperand(0).getReg();
+ unsigned Op1Reg = MIIt->getOperand(1).getReg();
+ unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+ // If this CMOV we are generating is the opposite condition from
+ // the jump we generated, then we have to swap the operands for the
+ // PHI that is going to be generated.
+ if (MIIt->getOperand(3).getImm() == OppCC)
+ std::swap(Op1Reg, Op2Reg);
+
+ if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+ Op1Reg = RegRewriteTable[Op1Reg].first;
+
+ if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+ Op2Reg = RegRewriteTable[Op2Reg].second;
+
+ MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
+ TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg).addMBB(copy0MBB)
+ .addReg(Op2Reg).addMBB(thisMBB);
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ // If we have a cascaded CMOV, the second Jcc provides the same incoming
+ // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
+ if (CascadedCMOV) {
+ MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
+ // Copy the PHI result to the register defined by the second CMOV.
+ BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
+ DL, TII->get(TargetOpcode::COPY),
+ CascadedCMOV->getOperand(0).getReg())
+ .addReg(MI.getOperand(0).getReg());
+ CascadedCMOV->eraseFromParent();
+ }
+
+ // Now remove the CMOV(s).
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
+ (MIIt++)->eraseFromParent();
+
+ return sinkMBB; // the code that followed the CMOV(s) was spliced into this block
+}
+
+// Lowers the RELEASE_FADD32mr / RELEASE_FADD64mr pseudos, which stand for the
+// atomic floating-point modification pattern
+//   a.store(reg OP a.load(acquire), release)
+// into the pair
+//   OPss (%gpr), %xmm
+//   movss %xmm, (%gpr)
+// (or the sd equivalents for 64-bit operations). Both instructions are
+// inserted in front of MI, which is then erased; BB is returned unchanged.
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
+                                       MachineBasicBlock *BB) const {
+  // Select the arithmetic (load + op) and the store opcode for this width.
+  unsigned ArithOpc, StoreOpc;
+  switch (MI.getOpcode()) {
+  case X86::RELEASE_FADD32mr:
+    ArithOpc = X86::ADDSSrm;
+    StoreOpc = X86::MOVSSmr;
+    break;
+  case X86::RELEASE_FADD64mr:
+    ArithOpc = X86::ADDSDrm;
+    StoreOpc = X86::MOVSDmr;
+    break;
+  default:
+    llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
+  }
+
+  const X86InstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // The value operand comes right after the address operands; the result
+  // gets a fresh virtual register of the same class.
+  unsigned SrcReg = MI.getOperand(X86::AddrNumOperands).getReg();
+  unsigned ResultReg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+
+  // ResultReg = SrcReg OP [addr]
+  MachineInstrBuilder ArithMI =
+      BuildMI(*BB, MI, DL, TII->get(ArithOpc), ResultReg).addReg(SrcReg);
+  for (int Idx = 0; Idx < X86::AddrNumOperands; ++Idx) {
+    MachineOperand &AddrOp = MI.getOperand(Idx);
+    // The store below reuses the same address operands, so none of the
+    // registers may stay marked as killed here.
+    if (AddrOp.isReg())
+      AddrOp.setIsKill(false);
+    ArithMI.addOperand(AddrOp);
+  }
+
+  // [addr] = ResultReg; this is the last use of the temporary.
+  MachineInstrBuilder StoreMI = BuildMI(*BB, MI, DL, TII->get(StoreOpc));
+  for (int Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
+    StoreMI.addOperand(MI.getOperand(Idx));
+  StoreMI.addReg(ResultReg, RegState::Kill);
+
+  // Drop the pseudo now that its expansion is in place.
+  MI.eraseFromParent();
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+ assert(MF->shouldSplitStack());
+
+ const bool Is64Bit = Subtarget.is64Bit();
+ const bool IsLP64 = Subtarget.isTarget64BitLP64();
+
+ const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+ const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // NOTE(review): presumably the TLS slot holding the stack limit - confirm
+ // Split BB around MI into the blocks sketched below; continueMBB is returned.
+ // BB:
+ // ... [Till the alloca]
+ // If stacklet is not large enough, jump to mallocMBB
+ //
+ // bumpMBB:
+ // Allocate by subtracting from RSP
+ // Jump to continueMBB
+ //
+ // mallocMBB:
+ // Allocate by call to runtime
+ //
+ // continueMBB:
+ // ...
+ // [rest of original BB]
+ //
+
+ MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(getPointerTy(MF->getDataLayout()));
+
+ unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+ sizeVReg = MI.getOperand(1).getReg(),
+ physSPReg =
+ IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
+
+ MachineFunction::iterator MBBIter = ++BB->getIterator();
+
+ MF->insert(MBBIter, bumpMBB);
+ MF->insert(MBBIter, mallocMBB);
+ MF->insert(MBBIter, continueMBB);
+
+ continueMBB->splice(continueMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add code to the main basic block to check if the stack limit has been hit,
+ // and if so, jump to mallocMBB otherwise to bumpMBB.
+ BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+ .addReg(tmpSPVReg).addReg(sizeVReg); // SPLimitVReg = SP - size
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+ .addReg(SPLimitVReg);
+ BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
+
+ // bumpMBB simply decreases the stack pointer, since we know the current
+ // stacklet has enough space.
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+ .addReg(SPLimitVReg);
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+ .addReg(SPLimitVReg);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
+
+ // Calls into a routine in libgcc to allocate more space from the heap.
+ const uint32_t *RegMask =
+ Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
+ if (IsLP64) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::RDI, RegState::Implicit)
+ .addReg(X86::RAX, RegState::ImplicitDefine);
+ } else if (Is64Bit) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EDI, RegState::Implicit)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
+ } else {
+ BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+ .addImm(12); // 12 + the 4-byte push below = the 16 bytes added back after the call
+ BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
+ }
+
+ if (!Is64Bit)
+ BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+ .addImm(16);
+
+ BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+ .addReg(IsLP64 ? X86::RAX : X86::EAX);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
+
+ // Set up the CFG correctly.
+ BB->addSuccessor(bumpMBB);
+ BB->addSuccessor(mallocMBB);
+ mallocMBB->addSuccessor(continueMBB);
+ bumpMBB->addSuccessor(continueMBB);
+
+ // Take care of the PHI nodes: the result (operand 0) merges both paths.
+ BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+ MI.getOperand(0).getReg())
+ .addReg(mallocPtrVReg)
+ .addMBB(mallocMBB)
+ .addReg(bumpSPPtrVReg)
+ .addMBB(bumpMBB);
+
+ // Delete the original pseudo instruction.
+ MI.eraseFromParent();
+
+ // And we're done.
+ return continueMBB;
+}
+
+// Custom inserter for a catchret pseudo (MI). Outside 32-bit mode nothing is
+// changed. In 32-bit mode a new block is placed right after BB that runs
+// EH_RESTORE and then jumps (JMP_4) to the original return destination; MI is
+// retargeted at that new block. BB is returned in both cases.
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
+                                       MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+
+  assert(!isAsynchronousEHPersonality(
+             classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
+         "SEH does not use catchret!");
+
+  // Only 32-bit EH needs to worry about manually restoring stack pointers.
+  if (!Subtarget.is32Bit())
+    return BB;
+
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Remember the real destination before MI is pointed somewhere else.
+  MachineBasicBlock *ReturnDest = MI.getOperand(0).getMBB();
+
+  // C++ EH creates a new target block to hold the restore code, and wires up
+  // the new block to the return destination with a normal JMP_4.
+  MachineBasicBlock *RestoreBlock =
+      MF->CreateMachineBasicBlock(BB->getBasicBlock());
+  assert(BB->succ_size() == 1);
+  MF->insert(std::next(BB->getIterator()), RestoreBlock);
+
+  // BB -> RestoreBlock -> (former successors of BB).
+  RestoreBlock->transferSuccessorsAndUpdatePHIs(BB);
+  BB->addSuccessor(RestoreBlock);
+  MI.getOperand(0).setMBB(RestoreBlock);
+
+  // Fill the new block: restore first, then the jump to the destination.
+  auto InsertPt = RestoreBlock->begin();
+  BuildMI(*RestoreBlock, InsertPt, DL, TII.get(X86::EH_RESTORE));
+  BuildMI(*RestoreBlock, InsertPt, DL, TII.get(X86::JMP_4))
+      .addMBB(ReturnDest);
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
+                                       MachineBasicBlock *BB) const {
+  // Custom inserter for CATCHPAD: the pseudo itself is always removed. For
+  // 32-bit SEH personalities an EH_RESTORE is emitted in its place first.
+  const Constant *Personality =
+      BB->getParent()->getFunction()->getPersonalityFn();
+  const bool UsesSEH =
+      isAsynchronousEHPersonality(classifyEHPersonality(Personality));
+
+  if (Subtarget.is32Bit() && UsesSEH)
+    BuildMI(*BB, MI, MI.getDebugLoc(),
+            Subtarget.getInstrInfo()->get(X86::EH_RESTORE));
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
+                                      MachineBasicBlock *BB) const {
+  // TLSADDR only turns into a call inside MC, so it is bracketed here as
+  //   adjust_stackdown -> TLSADDR -> adjust_stackup
+  // Without these two markers shrink-wrapping may push the prologue/epilogue
+  // past it. The pseudo itself is kept in place, not erased.
+  const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
+  DebugLoc Loc = MI.getDebugLoc();
+  const MachineBasicBlock::iterator Here(MI);
+
+  // The call-frame setup marker goes immediately in front of the instruction...
+  BuildMI(*BB, Here, Loc, InstrInfo.get(InstrInfo.getCallFrameSetupOpcode()))
+      .addImm(0)
+      .addImm(0);
+
+  // ...and the call-frame destroy marker immediately behind it.
+  BuildMI(*BB, std::next(Here), Loc,
+          InstrInfo.get(InstrInfo.getCallFrameDestroyOpcode()))
+      .addImm(0)
+      .addImm(0);
+
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
+                                      MachineBasicBlock *BB) const {
+  // Expands the Darwin TLS call pseudo into two instructions placed in front
+  // of it: a load of the value named by operand 3 into RDI (x86-64) or EAX
+  // (32-bit), followed by an indirect call through that register. The result
+  // is modelled as an implicit def of RAX/EAX. The pseudo is then erased.
+  MachineFunction *Fn = BB->getParent();
+  const X86InstrInfo *InstrInfo = Subtarget.getInstrInfo();
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  DebugLoc Loc = MI.getDebugLoc();
+
+  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
+  assert(MI.getOperand(3).isGlobal() && "This should be a global");
+
+  const bool Is64 = Subtarget.is64Bit();
+  const MachineOperand &Sym = MI.getOperand(3);
+
+  // Get a register mask for the lowered call.
+  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+  // proper register mask.
+  const uint32_t *Preserved;
+  if (Is64)
+    Preserved = RegInfo->getDarwinTLSCallPreservedMask();
+  else
+    Preserved = RegInfo->getCallPreservedMask(*Fn, CallingConv::C);
+
+  // Per-mode choices. The address base is RIP on x86-64, the global base
+  // register for 32-bit PIC, and no register at all for 32-bit non-PIC.
+  unsigned LoadOpc, CallOpc, PtrReg, ResultReg, BaseReg;
+  if (Is64) {
+    LoadOpc = X86::MOV64rm;
+    CallOpc = X86::CALL64m;
+    PtrReg = X86::RDI;
+    ResultReg = X86::RAX;
+    BaseReg = X86::RIP;
+  } else {
+    LoadOpc = X86::MOV32rm;
+    CallOpc = X86::CALL32m;
+    PtrReg = X86::EAX;
+    ResultReg = X86::EAX;
+    BaseReg = isPositionIndependent() ? InstrInfo->getGlobalBaseReg(Fn) : 0u;
+  }
+
+  BuildMI(*BB, MI, Loc, InstrInfo->get(LoadOpc), PtrReg)
+      .addReg(BaseReg)
+      .addImm(0)
+      .addReg(0)
+      .addGlobalAddress(Sym.getGlobal(), 0, Sym.getTargetFlags())
+      .addReg(0);
+
+  MachineInstrBuilder Call = BuildMI(*BB, MI, Loc, InstrInfo->get(CallOpc));
+  addDirectMem(Call, PtrReg);
+  Call.addReg(ResultReg, RegState::ImplicitDefine).addRegMask(Preserved);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+ /* Custom inserter for the EH_SjLj_SetJmp pseudo.
+ *
+ * Operand 0 of MI is the i32 destination register; the memory operands that
+ * follow it address the buffer. The expansion stores the address of a new
+ * block (restoreMBB) into the buffer slot at LabelOffset, emits
+ * EH_SjLj_Setup, and splits the code after MI into sinkMBB. The result is a
+ * PHI in sinkMBB of 0 (via mainMBB) and 1 (via restoreMBB). MI is erased and
+ * sinkMBB is returned.
+ */
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = ++MBB->getIterator(); // Insertion point: right after MBB.
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+ unsigned DstReg;
+ unsigned MemOpndSlot = 0;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI.getOperand(CurOp++).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+ MemOpndSlot = CurOp; // Index of the first buffer-address operand.
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ // For v = setjmp(buf), we generate
+ //
+ // thisMBB:
+ // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
+ // SjLjSetup restoreMBB
+ //
+ // mainMBB:
+ // v_main = 0
+ //
+ // sinkMBB:
+ // v = phi(main, restore)
+ //
+ // restoreMBB:
+ // if base pointer being used, load it from frame
+ // v_restore = 1
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+ MF->push_back(restoreMBB); // restoreMBB goes at the very end of the function.
+ restoreMBB->setHasAddressTaken();
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ unsigned PtrStoreOpc = 0;
+ unsigned LabelReg = 0;
+ const int64_t LabelOffset = 1 * PVT.getStoreSize(); // One pointer-sized slot into the buffer.
+ bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+ !isPositionIndependent();
+
+ // Prepare IP either in reg or imm.
+ if (!UseImmLabel) {
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ LabelReg = MRI.createVirtualRegister(PtrRC);
+ if (Subtarget.is64Bit()) {
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB)
+ .addReg(0);
+ } else {
+ const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+ .addReg(XII->getGlobalBaseReg(MF))
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
+ .addReg(0);
+ }
+ } else
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+ // Store IP
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp) // Only the displacement is adjusted; other address parts are copied.
+ MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
+ else
+ MIB.addOperand(MI.getOperand(MemOpndSlot + i));
+ }
+ if (!UseImmLabel)
+ MIB.addReg(LabelReg);
+ else
+ MIB.addMBB(restoreMBB);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Setup
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+ .addMBB(restoreMBB);
+
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ MIB.addRegMask(RegInfo->getNoPreservedMask()); // Attached to the EH_SjLj_Setup built just above.
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(restoreMBB);
+
+ // mainMBB:
+ // EAX = 0
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(restoreDstReg).addMBB(restoreMBB);
+
+ // restoreMBB:
+ if (RegInfo->hasBasePointer(*MF)) {
+ const bool Uses64BitFramePtr =
+ Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ X86FI->setRestoreBasePointer(MF);
+ unsigned FramePtr = RegInfo->getFrameRegister(*MF);
+ unsigned BasePtr = RegInfo->getBaseRegister();
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+ addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+ FramePtr, true, X86FI->getRestoreBasePointerOffset())
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+ BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
+ restoreMBB->addSuccessor(sinkMBB);
+
+ MI.eraseFromParent();
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
+                                     MachineBasicBlock *MBB) const {
+  // Expands the EH_SjLj_LongJmp pseudo. The pseudo's leading address operands
+  // name the buffer; three pointer-sized slots are reloaded from it (frame
+  // pointer at slot 0, resume address at slot 1, stack pointer at slot 2) and
+  // control is transferred with an indirect jump. The pseudo is then erased.
+  DebugLoc Loc = MI.getDebugLoc();
+  MachineFunction *Fn = MBB->getParent();
+  const TargetInstrInfo *InstrInfo = Subtarget.getInstrInfo();
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  // Memory references of the pseudo, reused on every reload.
+  MachineInstr::mmo_iterator MemBegin = MI.memoperands_begin();
+  MachineInstr::mmo_iterator MemEnd = MI.memoperands_end();
+
+  MVT PtrVT = getPointerTy(Fn->getDataLayout());
+  assert((PtrVT == MVT::i64 || PtrVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+  const bool Ptr64 = (PtrVT == MVT::i64);
+
+  const TargetRegisterClass *PtrRC =
+      Ptr64 ? &X86::GR64RegClass : &X86::GR32RegClass;
+  unsigned ResumeReg = Fn->getRegInfo().createVirtualRegister(PtrRC);
+
+  // Since FP is only updated here but NOT referenced, it's treated as GPR.
+  unsigned FrameReg = Ptr64 ? X86::RBP : X86::EBP;
+  unsigned StackReg = RegInfo->getStackRegister();
+
+  unsigned LoadOpc = Ptr64 ? X86::MOV64rm : X86::MOV32rm;
+  unsigned JumpOpc = Ptr64 ? X86::JMP64r : X86::JMP32r;
+
+  const int64_t SlotSize = PtrVT.getStoreSize();
+
+  // Emits "Dest = load [buf + Adjust]" in front of MI. A zero Adjust copies
+  // the displacement operand unchanged instead of rebuilding it.
+  auto ReloadSlot = [&](unsigned Dest, int64_t Adjust) {
+    MachineInstrBuilder Load =
+        BuildMI(*MBB, MI, Loc, InstrInfo->get(LoadOpc), Dest);
+    for (unsigned Idx = 0; Idx != X86::AddrNumOperands; ++Idx) {
+      const MachineOperand &AddrOp = MI.getOperand(Idx);
+      if (Idx == X86::AddrDisp && Adjust != 0)
+        Load.addDisp(AddrOp, Adjust);
+      else
+        Load.addOperand(AddrOp);
+    }
+    Load.setMemRefs(MemBegin, MemEnd);
+  };
+
+  ReloadSlot(FrameReg, 0);             // Reload FP
+  ReloadSlot(ResumeReg, 1 * SlotSize); // Reload IP
+  ReloadSlot(StackReg, 2 * SlotSize);  // Reload SP
+
+  // Jump
+  BuildMI(*MBB, MI, Loc, InstrInfo->get(JumpOpc)).addReg(ResumeReg);
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+/// Emits, in front of \p MI, the code that records the address of
+/// \p DispatchBB in the SjLj function context held in frame index \p FI.
+///
+/// With the small code model and no PIC the block address is stored as an
+/// immediate; otherwise it is first materialized into a virtual register with
+/// LEA and then stored from that register.
+///
+/// The store targets the second jump-buffer slot (jbuf[1]) of the function
+/// context. NOTE(review): the offsets used below assume the context layout
+/// built by SjLjEHPrepare,
+///   { i8* prev, i32 call_site, [4 x i32] data, i8* personality, i8* lsda,
+///     [5 x i8*] jbuf },
+/// which places jbuf[1] at byte 36 with 4-byte pointers and at byte 56 with
+/// 8-byte pointers. The previous code used 36 unconditionally.
+void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
+                                               MachineBasicBlock *MBB,
+                                               MachineBasicBlock *DispatchBB,
+                                               int FI) const {
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
+  const bool Is64BitPtr = (PVT == MVT::i64);
+
+  unsigned Op = 0;
+  unsigned VR = 0;
+
+  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+                     !isPositionIndependent();
+
+  if (UseImmLabel) {
+    Op = Is64BitPtr ? X86::MOV64mi32 : X86::MOV32mi;
+  } else {
+    const TargetRegisterClass *TRC =
+        Is64BitPtr ? &X86::GR64RegClass : &X86::GR32RegClass;
+    VR = MRI->createVirtualRegister(TRC);
+    Op = Is64BitPtr ? X86::MOV64mr : X86::MOV32mr;
+
+    if (Subtarget.is64Bit())
+      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
+          .addReg(X86::RIP)
+          .addImm(1)
+          .addReg(0)
+          .addMBB(DispatchBB)
+          .addReg(0);
+    else
+      // The 32-bit form deliberately uses no base register here; the global
+      // base register (XII->getGlobalBaseReg(MF)) is not used.
+      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
+          .addReg(0)
+          .addImm(1)
+          .addReg(0)
+          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
+          .addReg(0);
+  }
+
+  // Byte offset of jbuf[1] inside the function context; it depends on the
+  // pointer size because three pointer fields precede the jump buffer.
+  const int JBufSlot1Offset = Is64BitPtr ? 56 : 36;
+
+  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
+  addFrameReference(MIB, FI, JBufSlot1Offset);
+  if (UseImmLabel)
+    MIB.addMBB(DispatchBB);
+  else
+    MIB.addReg(VR);
+}
+
+/// Custom inserter for Int_eh_sjlj_setup_dispatch.
+///
+/// Collects every landing pad of the function together with its call-site
+/// numbers, builds a jump table over them, and creates three new blocks at the
+/// end of the function:
+///   - DispatchBB: the single remaining EH pad; loads the call-site index from
+///     the function context and range-checks it,
+///   - DispContBB: jumps through the table to the selected landing pad,
+///   - TrapBB:     reached when the index is out of range.
+/// All invoke blocks are rewired to DispatchBB, the old landing pads lose
+/// their EH-pad flag, \p MI is erased and \p BB is returned.
+///
+/// NOTE(review): the call_site field is read at byte 4 with 4-byte pointers
+/// and byte 8 with 8-byte pointers, assuming the SjLjEHPrepare context layout
+/// { i8* prev, i32 call_site, ... }. The previous code used 4 unconditionally.
+MachineBasicBlock *
+X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
+                                         MachineBasicBlock *BB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = BB->getParent();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  int FI = MFI.getFunctionContextIndex();
+
+  // Get a mapping of the call site numbers to all of the landing pads they're
+  // associated with.
+  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
+  unsigned MaxCSNum = 0;
+  for (auto &MBB : *MF) {
+    if (!MBB.isEHPad())
+      continue;
+
+    // The first non-debug instruction of an EH pad is expected to be its
+    // EH_LABEL.
+    MCSymbol *Sym = nullptr;
+    for (const auto &Inst : MBB) {
+      if (Inst.isDebugValue())
+        continue;
+
+      assert(Inst.isEHLabel() && "expected EH_LABEL");
+      Sym = Inst.getOperand(0).getMCSymbol();
+      break;
+    }
+
+    if (!MF->hasCallSiteLandingPad(Sym))
+      continue;
+
+    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
+      CallSiteNumToLPad[CSI].push_back(&MBB);
+      MaxCSNum = std::max(MaxCSNum, CSI);
+    }
+  }
+
+  // Get an ordered list of the machine basic blocks for the jump table.
+  std::vector<MachineBasicBlock *> LPadList;
+  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
+  LPadList.reserve(CallSiteNumToLPad.size());
+
+  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
+    for (auto &LP : CallSiteNumToLPad[CSI]) {
+      LPadList.push_back(LP);
+      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
+    }
+  }
+
+  assert(!LPadList.empty() &&
+         "No landing pad destinations for the dispatch jump table!");
+
+  // Create the MBBs for the dispatch code.
+
+  // Shove the dispatch's address into the return slot in the function context.
+  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+  DispatchBB->setIsEHPad(true);
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
+  DispatchBB->addSuccessor(TrapBB);
+
+  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+  DispatchBB->addSuccessor(DispContBB);
+
+  // Insert MBBs.
+  MF->push_back(DispatchBB);
+  MF->push_back(DispContBB);
+  MF->push_back(TrapBB);
+
+  // Insert code into the entry block that creates and registers the function
+  // context.
+  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
+
+  // Create the jump table and associated information
+  MachineJumpTableInfo *JTI =
+      MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
+  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+
+  const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
+  const X86RegisterInfo &RI = XII->getRegisterInfo();
+
+  // Add a register mask with no preserved registers. This results in all
+  // registers being marked as clobbered. When a base pointer is in use the
+  // mask rides on the instruction that reloads it; otherwise on a NOOP.
+  if (RI.hasBasePointer(*MF)) {
+    const bool FPIs64Bit =
+        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+    X86FI->setRestoreBasePointer(MF);
+
+    unsigned FP = RI.getFrameRegister(*MF);
+    unsigned BP = RI.getBaseRegister();
+    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
+    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
+                 X86FI->getRestoreBasePointerOffset())
+        .addRegMask(RI.getNoPreservedMask());
+  } else {
+    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
+        .addRegMask(RI.getNoPreservedMask());
+  }
+
+  // Byte offset of the i32 call_site field: it follows one pointer-sized
+  // field, so the offset equals the pointer size.
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  const int CallSiteOffset = (PVT == MVT::i64) ? 8 : 4;
+
+  // Load the call-site index and trap if it is larger than the table.
+  unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
+                    CallSiteOffset);
+  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
+      .addReg(IReg)
+      .addImm(LPadList.size());
+  BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
+
+  // The index is 1-based; convert it to a 0-based table index and jump.
+  unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+  BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
+      .addReg(IReg)
+      .addImm(1);
+  BuildMI(DispContBB, DL,
+          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
+      .addReg(0)
+      .addImm(Subtarget.is64Bit() ? 8 : 4)
+      .addReg(JReg)
+      .addJumpTableIndex(MJTI)
+      .addReg(0);
+
+  // Add the jump table entries as successors to the MBB.
+  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
+  for (auto &LP : LPadList)
+    if (SeenMBBs.insert(LP).second)
+      DispContBB->addSuccessor(LP);
+
+  // N.B. the order the invoke BBs are processed in doesn't matter here.
+  SmallVector<MachineBasicBlock *, 64> MBBLPads;
+  const MCPhysReg *SavedRegs =
+      Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
+  for (MachineBasicBlock *MBB : InvokeBBs) {
+    // Remove the landing pad successor from the invoke block and replace it
+    // with the new dispatch block.
+    // Keep a copy of Successors since it's modified inside the loop.
+    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
+                                                   MBB->succ_rend());
+    // FIXME: Avoid quadratic complexity.
+    for (auto MBBS : Successors) {
+      if (MBBS->isEHPad()) {
+        MBB->removeSuccessor(MBBS);
+        MBBLPads.push_back(MBBS);
+      }
+    }
+
+    MBB->addSuccessor(DispatchBB);
+
+    // Find the invoke call and mark all of the callee-saved registers as
+    // 'implicit defined' so that they're spilled. This prevents code from
+    // moving instructions to before the EH block, where they will never be
+    // executed.
+    for (auto &II : reverse(*MBB)) {
+      if (!II.isCall())
+        continue;
+
+      // Registers the call already mentions must not be added a second time.
+      DenseMap<unsigned, bool> DefRegs;
+      for (auto &MOp : II.operands())
+        if (MOp.isReg())
+          DefRegs[MOp.getReg()] = true;
+
+      MachineInstrBuilder MIB(*MF, &II);
+      for (unsigned Idx = 0; SavedRegs[Idx]; ++Idx) {
+        unsigned Reg = SavedRegs[Idx];
+        if (!DefRegs[Reg])
+          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+      }
+
+      // Only the last call in the block is annotated.
+      break;
+    }
+  }
+
+  // Mark all former landing pads as non-landing pads. The dispatch is the only
+  // landing pad now.
+  for (auto &LP : MBBLPads)
+    LP->setIsEHPad(false);
+
+  // The instruction is gone now.
+  MI.eraseFromParent();
+  return BB;
+}
+ /* Entry point for custom insertion of pseudo instructions.
+ *
+ * Dispatches on MI's opcode. Most opcodes are forwarded to a dedicated
+ * expansion routine whose result is returned directly; a few small
+ * expansions (RDFLAGS/WRFLAGS, the FP-to-integer stores, and the CMPXCHG8B
+ * address fix-up) are done inline. Opcodes that need no rewriting return BB
+ * unchanged. An opcode with no case is a programming error
+ * (llvm_unreachable).
+ */
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unexpected instr type to insert");
+ case X86::TAILJMPd64:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
+ llvm_unreachable("TAILJMP64 would not be touched here.");
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ return BB; // Left as-is by this hook.
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ return EmitLoweredTLSAddr(MI, BB);
+ case X86::CATCHRET:
+ return EmitLoweredCatchRet(MI, BB);
+ case X86::CATCHPAD:
+ return EmitLoweredCatchPad(MI, BB);
+ case X86::SEG_ALLOCA_32:
+ case X86::SEG_ALLOCA_64:
+ return EmitLoweredSegAlloca(MI, BB);
+ case X86::TLSCall_32:
+ case X86::TLSCall_64:
+ return EmitLoweredTLSCall(MI, BB);
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_FR128:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
+ case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V8F64:
+ case X86::CMOV_V8I64:
+ case X86::CMOV_V8I1:
+ case X86::CMOV_V16I1:
+ case X86::CMOV_V32I1:
+ case X86::CMOV_V64I1:
+ return EmitLoweredSelect(MI, BB);
+
+ case X86::RDFLAGS32:
+ case X86::RDFLAGS64: { // Expanded to PUSHF followed by POP into the destination.
+ unsigned PushF =
+ MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+ unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+ MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
+ // Permit reads of the FLAGS register without it being defined.
+ // This intrinsic exists to read external processor state in flags, such as
+ // the trap flag, interrupt flag, and direction flag, none of which are
+ // modeled by the backend.
+ Push->getOperand(2).setIsUndef();
+ BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+
+ case X86::WRFLAGS32:
+ case X86::WRFLAGS64: { // Expanded to PUSH of the source followed by POPF.
+ unsigned Push =
+ MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+ unsigned PopF =
+ MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+ BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(PopF));
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+
+ case X86::RELEASE_FADD32mr:
+ case X86::RELEASE_FADD64mr:
+ return EmitLoweredAtomicFP(MI, BB);
+
+ case X86::FP32_TO_INT16_IN_MEM:
+ case X86::FP32_TO_INT32_IN_MEM:
+ case X86::FP32_TO_INT64_IN_MEM:
+ case X86::FP64_TO_INT16_IN_MEM:
+ case X86::FP64_TO_INT32_IN_MEM:
+ case X86::FP64_TO_INT64_IN_MEM:
+ case X86::FP80_TO_INT16_IN_MEM:
+ case X86::FP80_TO_INT32_IN_MEM:
+ case X86::FP80_TO_INT64_IN_MEM: {
+ // Change the floating point control register to use "round towards zero"
+ // mode when truncating to an integer value.
+ int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+ // Load the old value of the high byte of the control word...
+ unsigned OldCW =
+ MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
+ CWFrameIdx);
+
+ // Set the high part to be round to zero...
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+ .addImm(0xC7F);
+
+ // Reload the modified control word now...
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ // Restore the memory image of control word to original value
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+ .addReg(OldCW);
+
+ // Get the X86 opcode to use.
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+ case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+ case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+ case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+ case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+ case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+ case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+ case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+ case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+ }
+
+ X86AddressMode AM = getAddressFromInstr(&MI, 0); // Destination address starts at operand 0; the value operand follows it.
+ addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
+ .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
+
+ // Reload the original control word now.
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+ }
+ // String/text processing lowering.
+ case X86::PCMPISTRM128REG:
+ case X86::VPCMPISTRM128REG:
+ case X86::PCMPISTRM128MEM:
+ case X86::VPCMPISTRM128MEM:
+ case X86::PCMPESTRM128REG:
+ case X86::VPCMPESTRM128REG:
+ case X86::PCMPESTRM128MEM:
+ case X86::VPCMPESTRM128MEM:
+ assert(Subtarget.hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
+
+ // String/text processing lowering.
+ case X86::PCMPISTRIREG:
+ case X86::VPCMPISTRIREG:
+ case X86::PCMPISTRIMEM:
+ case X86::VPCMPISTRIMEM:
+ case X86::PCMPESTRIREG:
+ case X86::VPCMPESTRIREG:
+ case X86::PCMPESTRIMEM:
+ case X86::VPCMPESTRIMEM:
+ assert(Subtarget.hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
+
+ // Thread synchronization.
+ case X86::MONITOR:
+ return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
+ case X86::MONITORX:
+ return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
+ // PKU feature
+ case X86::WRPKRU:
+ return emitWRPKRU(MI, BB, Subtarget);
+ case X86::RDPKRU:
+ return emitRDPKRU(MI, BB, Subtarget);
+ // xbegin
+ case X86::XBEGIN:
+ return emitXBegin(MI, BB, Subtarget.getInstrInfo());
+
+ case X86::VASTART_SAVE_XMM_REGS:
+ return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+
+ case X86::VAARG_64:
+ return EmitVAARG64WithCustomInserter(MI, BB);
+
+ case X86::EH_SjLj_SetJmp32:
+ case X86::EH_SjLj_SetJmp64:
+ return emitEHSjLjSetJmp(MI, BB);
+
+ case X86::EH_SjLj_LongJmp32:
+ case X86::EH_SjLj_LongJmp64:
+ return emitEHSjLjLongJmp(MI, BB);
+
+ case X86::Int_eh_sjlj_setup_dispatch:
+ return EmitSjLjDispatchBlock(MI, BB);
+
+ case TargetOpcode::STATEPOINT:
+ // As an implementation detail, STATEPOINT shares the STACKMAP format at
+ // this point in the process. We diverge later.
+ return emitPatchPoint(MI, BB);
+
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, BB);
+
+ case X86::LCMPXCHG8B: {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
+ // requires a memory operand. If it happens that current architecture is
+ // i686 and for current function we need a base pointer
+ // - which is ESI for i686 - register allocator would not be able to
+ // allocate registers for an address in form of X(%reg, %reg, Y)
+ // - there never would be enough unreserved registers during regalloc
+ // (without the need for base ptr the only option would be X(%edi, %esi, Y).
+ // We are giving a hand to register allocator by precomputing the address in
+ // a new vreg using LEA.
+
+ // If it is not i686 or there is no base pointer - nothing to do here.
+ if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
+ return BB;
+
+ // Even though this code does not necessarily need the base pointer to
+ // be ESI, we check for that. The reason: if this assert fails, there are
+ // some changes happened in the compiler base pointer handling, which most
+ // probably have to be addressed somehow here.
+ assert(TRI->getBaseRegister() == X86::ESI &&
+ "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
+ "base pointer in mind");
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MVT SPTy = getPointerTy(MF->getDataLayout());
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+ unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
+
+ X86AddressMode AM = getAddressFromInstr(&MI, 0);
+ // Regalloc does not need any help when the memory operand of CMPXCHG8B
+ // does not use index register.
+ if (AM.IndexReg == X86::NoRegister)
+ return BB;
+
+ // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
+ // four operand definitions that are E[ABCD] registers. We skip them and
+ // then insert the LEA.
+ MachineBasicBlock::iterator MBBI(MI);
+ while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
+ MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
+ --MBBI; // Walk backwards past instructions defining E[ABCD]X.
+ addFullAddress(
+ BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
+
+ setDirectAddressInInstr(&MI, 0, computedAddrVReg); // MI now addresses memory through the LEA result.
+
+ return BB;
+ }
+ case X86::LCMPXCHG16B:
+ return BB; // No rewriting is done for the 16-byte form here.
+ case X86::LCMPXCHG8B_SAVE_EBX:
+ case X86::LCMPXCHG16B_SAVE_RBX: {
+ unsigned BasePtr =
+ MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
+ if (!BB->isLiveIn(BasePtr)) // Record EBX/RBX as live-in to this block if not already.
+ BB->addLiveIn(BasePtr);
+ return BB;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
+  // Reports which bits of a target-specific node are known to be zero or one.
+  // Both outputs are first reset to "nothing known" at KnownZero's incoming
+  // width and then refined for the opcodes handled below.
+  const unsigned Opcode = Op.getOpcode();
+  const unsigned Width = KnownZero.getBitWidth();
+  assert((Opcode >= ISD::BUILTIN_OP_END ||
+          Opcode == ISD::INTRINSIC_WO_CHAIN ||
+          Opcode == ISD::INTRINSIC_W_CHAIN ||
+          Opcode == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
+
+  KnownOne = APInt(Width, 0); // Don't know anything.
+  KnownZero = KnownOne;
+
+  // A boolean result can only have its lowest bit set.
+  auto MarkAsBoolean = [&] {
+    KnownZero |= APInt::getHighBitsSet(Width, Width - 1);
+  };
+
+  switch (Opcode) {
+  case X86ISD::SETCC:
+    MarkAsBoolean();
+    return;
+
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::ADC:
+  case X86ISD::SBB:
+  case X86ISD::SMUL:
+  case X86ISD::UMUL:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    // These nodes' second result is a boolean; nothing is known about the
+    // first one.
+    if (Op.getResNo() != 0)
+      MarkAsBoolean();
+    return;
+
+  case X86ISD::MOVMSK: {
+    // One result bit per source vector element; everything above is zero.
+    unsigned SrcElts = Op.getOperand(0).getValueType().getVectorNumElements();
+    KnownZero = APInt::getHighBitsSet(Width, Width - SrcElts);
+    return;
+  }
+
+  case X86ISD::VZEXT: {
+    SDValue Src = Op.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+    unsigned DstElts = Op.getValueType().getVectorNumElements();
+    unsigned SrcElts = SrcVT.getVectorNumElements();
+    unsigned SrcBits = SrcVT.getScalarSizeInBits();
+
+    // Analyse the demanded low source elements at the source width, then
+    // zero-extend the result and mark the new high bits as zero.
+    KnownZero = KnownOne = APInt(SrcBits, 0);
+    APInt Demanded = APInt::getLowBitsSet(SrcElts, DstElts);
+    DAG.computeKnownBits(Src, KnownZero, KnownOne, Demanded, Depth + 1);
+    KnownOne = KnownOne.zext(Width);
+    KnownZero = KnownZero.zext(Width);
+    KnownZero |= APInt::getHighBitsSet(Width, Width - SrcBits);
+    return;
+  }
+
+  default:
+    return;
+  }
+}
+
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+    SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
+  // Number of known sign bits for the target nodes handled here; every other
+  // node gets the conservative answer of 1.
+  switch (Op.getOpcode()) {
+  case X86ISD::SETCC_CARRY:
+    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+    return Op.getScalarValueSizeInBits();
+
+  case X86ISD::VSEXT: {
+    // The source's sign bits plus the bits added by the extension.
+    SDValue Src = Op.getOperand(0);
+    unsigned SignBits = DAG.ComputeNumSignBits(Src, Depth + 1);
+    SignBits += Op.getValueType().getScalarSizeInBits() -
+                Src.getValueType().getScalarSizeInBits();
+    return SignBits;
+  }
+
+  default:
+    // Fallback case.
+    return 1;
+  }
+}
+
+/// If \p N is an X86ISD::Wrapper whose operand is a global address, reports
+/// that global and its offset through \p GA and \p Offset and returns true.
+/// Every other node is deferred to the generic TargetLowering implementation.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+                                       const GlobalValue* &GA,
+                                       int64_t &Offset) const {
+  if (N->getOpcode() != X86ISD::Wrapper)
+    return TargetLowering::isGAPlusOffset(N, GA, Offset);
+
+  const auto *Wrapped = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
+  if (!Wrapped)
+    return TargetLowering::isGAPlusOffset(N, GA, Offset);
+
+  GA = Wrapped->getGlobal();
+  Offset = Wrapped->getOffset();
+  return true;
+}
+
+// Attempt to match a combined shuffle mask against supported unary shuffle
+// instructions (those that take a single source and no immediate).
+//
+// MaskVT is the vector type the mask is expressed in; Mask holds one entry per
+// element. On a match, returns true and sets Shuffle to the X86ISD opcode,
+// SrcVT to the type the source must be bitcast to, and DstVT to the result
+// type of the new node. Returns false (leaving the outputs untouched) when no
+// pattern matches. The checks are tried in order, so earlier patterns take
+// priority over later ones.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
+ // 256-bit vectors are treated as floating point when AVX2 is unavailable.
+ bool FloatDomain = MaskVT.isFloatingPoint() ||
+ (!Subtarget.hasAVX2() && MaskVT.is256BitVector());
+
+ // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
+ // Element 0 must be kept (or undef) and every other element undef/zero.
+ if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+ isUndefOrEqual(Mask[0], 0) &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+
+ // Match against a VZEXT instruction.
+ // TODO: Add 256/512-bit vector support.
+ if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+ // Destination elements are at most 64 bits wide.
+ unsigned MaxScale = 64 / MaskEltSize;
+ // Try each power-of-two widening factor in turn, smallest first.
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
+ bool Match = true;
+ unsigned NumDstElts = NumMaskElts / Scale;
+ // Each group of Scale mask elements must be source element i (or undef)
+ // followed by Scale - 1 undef/zero elements.
+ for (unsigned i = 0; i != NumDstElts && Match; ++i) {
+ Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
+ Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+ }
+ if (Match) {
+ SrcVT = MaskVT;
+ DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
+ DstVT = MVT::getVectorVT(DstVT, NumDstElts);
+ Shuffle = X86ISD::VZEXT;
+ return true;
+ }
+ }
+ }
+
+ // Check if we have SSE3 which will let us use MOVDDUP etc. These
+ // instructions are no slower than UNPCKLPD but have the option to
+ // fold the input operand into even an unaligned memory load.
+ if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+ Shuffle = X86ISD::MOVDDUP;
+ SrcVT = DstVT = MVT::v2f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+ Shuffle = X86ISD::MOVSLDUP;
+ SrcVT = DstVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
+ Shuffle = X86ISD::MOVSHDUP;
+ SrcVT = DstVT = MVT::v4f32;
+ return true;
+ }
+ }
+
+ // The same duplication patterns for 256-bit float-domain vectors.
+ if (MaskVT.is256BitVector() && FloatDomain) {
+ assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+ Shuffle = X86ISD::MOVDDUP;
+ SrcVT = DstVT = MVT::v4f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+ Shuffle = X86ISD::MOVSLDUP;
+ SrcVT = DstVT = MVT::v8f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
+ Shuffle = X86ISD::MOVSHDUP;
+ SrcVT = DstVT = MVT::v8f32;
+ return true;
+ }
+ }
+
+ // The same duplication patterns for 512-bit float-domain vectors.
+ if (MaskVT.is512BitVector() && FloatDomain) {
+ assert(Subtarget.hasAVX512() &&
+ "AVX512 required for 512-bit vector shuffles");
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+ Shuffle = X86ISD::MOVDDUP;
+ SrcVT = DstVT = MVT::v8f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(
+ Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
+ Shuffle = X86ISD::MOVSLDUP;
+ SrcVT = DstVT = MVT::v16f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(
+ Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
+ Shuffle = X86ISD::MOVSHDUP;
+ SrcVT = DstVT = MVT::v16f32;
+ return true;
+ }
+ }
+
+ // Attempt to match against broadcast-from-vector.
+ if (Subtarget.hasAVX2()) {
+ // A broadcast mask selects source element 0 for every result element.
+ SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
+ if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+ SrcVT = DstVT = MaskVT;
+ Shuffle = X86ISD::VBROADCAST;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Attempt to match a combined shuffle mask against supported unary immediate
+// permute instructions.
+//
+// MaskVT is the vector type the mask is expressed in and Mask holds one entry
+// per element (a source index, SM_SentinelUndef or SM_SentinelZero). On a
+// match this returns true with Shuffle set to the X86ISD opcode, ShuffleVT to
+// the type the node must be built with and PermuteImm to its immediate
+// operand. The outputs are only meaningful when true is returned.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+                                           const X86Subtarget &Subtarget,
+                                           unsigned &Shuffle, MVT &ShuffleVT,
+                                           unsigned &PermuteImm) {
+  unsigned NumMaskElts = Mask.size();
+  bool FloatDomain = MaskVT.isFloatingPoint();
+
+  // Record which elements may be zeroed (undef or zero) and whether any
+  // element is explicitly required to be zero.
+  bool ContainsZeros = false;
+  SmallBitVector Zeroable(NumMaskElts, false);
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int M = Mask[i];
+    Zeroable[i] = isUndefOrZero(M);
+    ContainsZeros |= (M == SM_SentinelZero);
+  }
+
+  // Attempt to match against byte/bit shifts.
+  // FIXME: Add 512-bit support.
+  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+                                             MaskVT.getScalarSizeInBits(), Mask,
+                                             0, Zeroable, Subtarget);
+    if (0 < ShiftAmt) {
+      PermuteImm = (unsigned)ShiftAmt;
+      return true;
+    }
+  }
+
+  // Ensure we don't contain any zero elements: none of the permutes below can
+  // produce zeros.
+  if (ContainsZeros)
+    return false;
+
+  assert(llvm::all_of(Mask, [&](int M) {
+    return SM_SentinelUndef <= M && M < (int)NumMaskElts;
+  }) && "Expected unary shuffle");
+
+  unsigned InputSizeInBits = MaskVT.getSizeInBits();
+  unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
+  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+
+  // Handle PSHUFLW/PSHUFHW repeated patterns.
+  if (MaskScalarSizeInBits == 16) {
+    SmallVector<int, 4> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+      // Build the immediate from the per-lane repeated mask, not from the
+      // first lane of Mask. For vectors wider than 128 bits the first lane
+      // may be undef where a later lane is defined; reading Mask directly
+      // would then encode an identity selector and leave the later lane
+      // unpermuted.
+      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
+      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
+
+      // PSHUFLW: permute lower 4 elements only.
+      if (isUndefOrInRange(LoMask, 0, 4) &&
+          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+        Shuffle = X86ISD::PSHUFLW;
+        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
+        PermuteImm = getV4X86ShuffleImm(LoMask);
+        return true;
+      }
+
+      // PSHUFHW: permute upper 4 elements only.
+      if (isUndefOrInRange(HiMask, 4, 8) &&
+          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+        // Offset the HiMask so that we can create the shuffle immediate.
+        int OffsetHiMask[4];
+        for (int i = 0; i != 4; ++i)
+          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
+
+        Shuffle = X86ISD::PSHUFHW;
+        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
+        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
+        return true;
+      }
+    }
+    // No other 16-bit element permute is handled here.
+    return false;
+  }
+
+  // We only support permutation of 32/64 bit elements after this.
+  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
+    return false;
+
+  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
+  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+  if (FloatDomain && !Subtarget.hasAVX())
+    return false;
+
+  // Pre-AVX2 we must use float shuffles on 256-bit vectors.
+  if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
+    FloatDomain = true;
+
+  // Check for lane crossing permutes.
+  if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
+    // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
+    if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
+      Shuffle = X86ISD::VPERMI;
+      ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+      PermuteImm = getV4X86ShuffleImm(Mask);
+      return true;
+    }
+    // For 512-bit vectors the same 4-element permute must repeat in both
+    // 256-bit halves.
+    if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
+      SmallVector<int, 4> RepeatedMask;
+      if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
+        Shuffle = X86ISD::VPERMI;
+        ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+        PermuteImm = getV4X86ShuffleImm(RepeatedMask);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // VPERMILPD can permute with a non-repeating shuffle.
+  if (FloatDomain && MaskScalarSizeInBits == 64) {
+    Shuffle = X86ISD::VPERMILPI;
+    ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+    PermuteImm = 0;
+    // One immediate bit per element, choosing the low or high f64 of the pair
+    // that the element belongs to.
+    for (int i = 0, e = Mask.size(); i != e; ++i) {
+      int M = Mask[i];
+      if (M == SM_SentinelUndef)
+        continue;
+      assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+      PermuteImm |= (M & 1) << i;
+    }
+    return true;
+  }
+
+  // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
+  SmallVector<int, 4> RepeatedMask;
+  if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
+    return false;
+
+  // Narrow the repeated mask for 32-bit element permutes.
+  SmallVector<int, 4> WordMask = RepeatedMask;
+  if (MaskScalarSizeInBits == 64)
+    scaleShuffleMask(2, RepeatedMask, WordMask);
+
+  Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
+  ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
+  ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
+  PermuteImm = getV4X86ShuffleImm(WordMask);
+  return true;
+}
+
+// Attempt to match a combined unary shuffle mask against supported binary
+// shuffle instructions (two vector operands, no immediate).
+//
+// MaskVT is the vector type the mask is expressed in; Mask holds one entry per
+// element. IsUnary tells the unpack matching that only V1 is a real input.
+// On a match, returns true, sets Shuffle to the X86ISD opcode and ShuffleVT to
+// the type to build the node with, and may rewrite V1/V2 (duplicating V1 into
+// V2 or swapping the two) so that they are the operands, in order, of the new
+// node. Returns false when no pattern matches. Patterns are tried in order, so
+// earlier ones take priority.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ SDValue &V1, SDValue &V2,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ bool IsUnary) {
+ bool FloatDomain = MaskVT.isFloatingPoint();
+ unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+
+ // 128-bit only patterns: MOVLHPS / MOVHLPS / MOVSD / MOVSS.
+ if (MaskVT.is128BitVector()) {
+ // Duplicate the low 64-bit half of V1 into both halves.
+ if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+ V2 = V1;
+ Shuffle = X86ISD::MOVLHPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ // Duplicate the high 64-bit half of V1 into both halves.
+ if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+ V2 = V1;
+ Shuffle = X86ISD::MOVHLPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ // Low element from V1, high element from V2; the operands are swapped to
+ // express this as MOVSD.
+ if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
+ (FloatDomain || !Subtarget.hasSSE41())) {
+ std::swap(V1, V2);
+ Shuffle = X86ISD::MOVSD;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ // Low element from V2, remaining elements from V1.
+ if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
+ (FloatDomain || !Subtarget.hasSSE41())) {
+ Shuffle = X86ISD::MOVSS;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ }
+
+ // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
+ if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
+ // Without AVX2, 256-bit unpacks are emitted on the floating point type of
+ // the same element width.
+ MVT LegalVT = MaskVT;
+ if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
+ LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+
+ SmallVector<int, 64> Unpckl, Unpckh;
+ if (IsUnary) {
+ // Unary form: both unpack operands are V1.
+ createUnpackShuffleMask(MaskVT, Unpckl, true, true);
+ if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+ V2 = V1;
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+
+ createUnpackShuffleMask(MaskVT, Unpckh, false, true);
+ if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+ V2 = V1;
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+ } else {
+ // Binary form: first try the operands in their given order.
+ createUnpackShuffleMask(MaskVT, Unpckl, true, false);
+ if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+
+ createUnpackShuffleMask(MaskVT, Unpckh, false, false);
+ if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+
+ // Then retry against the commuted masks, swapping V1 and V2 on a match.
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+ std::swap(V1, V2);
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+ std::swap(V1, V2);
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+// Attempt to match a combined shuffle mask against supported binary shuffle
+// instructions that take an immediate operand (PALIGNR, BLENDI, INSERTPS,
+// SHUFPD/SHUFPS).
+//
+// MaskVT is the vector type the mask is expressed in; Mask holds one entry per
+// element. On a match, returns true and sets Shuffle to the X86ISD opcode,
+// ShuffleVT to the type to build the node with and PermuteImm to the
+// immediate. V1/V2 are in/out: on a match they hold the two operands of the
+// new node and may have been replaced (e.g. by a zero or undef vector created
+// through DAG at location DL). Returns false when nothing matches.
+// NOTE(review): the matchVectorShuffleAs*/With* helpers receive V1/V2 and may
+// update them even on failure - confirm against their definitions.
+static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ SDValue &V1, SDValue &V2,
+ SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &PermuteImm) {
+ unsigned NumMaskElts = Mask.size();
+ bool FloatDomain = MaskVT.isFloatingPoint();
+
+ // Attempt to match against PALIGNR byte rotate.
+ if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
+ if (0 < ByteRotation) {
+ Shuffle = X86ISD::PALIGNR;
+ ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
+ PermuteImm = ByteRotation;
+ return true;
+ }
+ }
+
+ // Attempt to combine to X86ISD::BLENDI.
+ if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+ (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
+ // Determine a type compatible with X86ISD::BLENDI.
+ // TODO - add 16i16 support (requires lane duplication).
+ MVT BlendVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ // With AVX2, 64-bit integer elements are blended as pairs of i32.
+ if (BlendVT == MVT::v4i64)
+ BlendVT = MVT::v8i32;
+ else if (BlendVT == MVT::v2i64)
+ BlendVT = MVT::v4i32;
+ } else {
+ // Without AVX2, 128-bit integers blend as v8i16 and 256-bit integers
+ // blend in the floating point type of the same element width.
+ if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
+ BlendVT = MVT::v8i16;
+ else if (BlendVT == MVT::v4i64)
+ BlendVT = MVT::v4f64;
+ else if (BlendVT == MVT::v8i32)
+ BlendVT = MVT::v8f32;
+ }
+
+ // MaskRatio is the number of blend elements covered by one mask element.
+ unsigned BlendSize = BlendVT.getVectorNumElements();
+ unsigned MaskRatio = BlendSize / NumMaskElts;
+
+ // Can we blend with zero?
+ if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
+ /*Low*/ 0) &&
+ NumMaskElts <= BlendVT.getVectorNumElements()) {
+ // Set the immediate bit (select V2, the zero vector) for every blend
+ // element whose mask entry is a sentinel (undef or zero).
+ PermuteImm = 0;
+ for (unsigned i = 0; i != BlendSize; ++i)
+ if (Mask[i / MaskRatio] < 0)
+ PermuteImm |= 1u << i;
+
+ V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = BlendVT;
+ return true;
+ }
+
+ // Attempt to match as a binary blend.
+ if (NumMaskElts <= BlendVT.getVectorNumElements()) {
+ // Every defined element must stay in its own position, taken from
+ // either V1 (index i) or V2 (index i + NumMaskElts); zeros disqualify.
+ bool MatchBlend = true;
+ for (int i = 0; i != (int)NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ else if (M == SM_SentinelZero)
+ MatchBlend = false;
+ else if ((M != i) && (M != (i + (int)NumMaskElts)))
+ MatchBlend = false;
+ }
+
+ if (MatchBlend) {
+ // Set the immediate bit for every blend element that comes from V2.
+ PermuteImm = 0;
+ for (unsigned i = 0; i != BlendSize; ++i)
+ if ((int)NumMaskElts <= Mask[i / MaskRatio])
+ PermuteImm |= 1u << i;
+
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = BlendVT;
+ return true;
+ }
+ }
+ }
+
+ // Attempt to combine to INSERTPS.
+ if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
+ // Any sentinel entry (undef or zero) is treated as zeroable here.
+ SmallBitVector Zeroable(4, false);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] < 0)
+ Zeroable[i] = true;
+
+ if (Zeroable.any() &&
+ matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ Shuffle = X86ISD::INSERTPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ }
+
+ // Attempt to combine to SHUFPD.
+ if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
+ (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
+ if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
+ Shuffle = X86ISD::SHUFP;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ }
+
+ // Attempt to combine to SHUFPS.
+ if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+ SmallVector<int, 4> RepeatedMask;
+ if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+ // Match one half (two elements starting at Offset) of the repeated mask.
+ // Returns the vector that half must be read from - undef, a zero vector,
+ // V1 or V2 - and writes the two element selectors to S0/S1 (left
+ // untouched for an all-undef half). Returns an empty SDValue if the two
+ // elements cannot be taken from a single source.
+ auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
+ int M0 = RepeatedMask[Offset];
+ int M1 = RepeatedMask[Offset + 1];
+
+ if (isUndefInRange(RepeatedMask, Offset, 2)) {
+ return DAG.getUNDEF(MaskVT);
+ } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : 0);
+ S1 = (SM_SentinelUndef == M1 ? -1 : 1);
+ return getZeroVector(MaskVT, Subtarget, DAG, DL);
+ } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+ S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+ return V1;
+ } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+ S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+ return V2;
+ }
+
+ return SDValue();
+ };
+
+ // The low half of the result reads from the first operand and the high
+ // half from the second, so each half is matched independently.
+ int ShufMask[4] = {-1, -1, -1, -1};
+ SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
+ SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
+
+ if (Lo && Hi) {
+ V1 = Lo;
+ V2 = Hi;
+ Shuffle = X86ISD::SHUFP;
+ ShuffleVT = MaskVT;
+ PermuteImm = getV4X86ShuffleImm(ShufMask);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/// \brief Combine an arbitrary chain of shuffles into a single instruction if
+/// possible.
+///
+/// This is the leaf of the recursive combine below. When we have found some
+/// chain of single-use x86 shuffle instructions and accumulated the combined
+/// shuffle mask represented by them, this will try to pattern match that mask
+/// into either a single instruction if there is a special purpose instruction
+/// for this operation, or into a PSHUFB instruction which is a fully general
+/// instruction but should only be used to replace chains over a certain depth.
+static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
+ ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
+ assert((Inputs.size() == 1 || Inputs.size() == 2) &&
+ "Unexpected number of shuffle inputs!");
+
+ // Find the inputs that enter the chain. Note that multiple uses are OK
+ // here, we're not going to remove the operands we find.
+ bool UnaryShuffle = (Inputs.size() == 1);
+ SDValue V1 = peekThroughBitcasts(Inputs[0]);
+ SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
+
+ MVT VT1 = V1.getSimpleValueType();
+ MVT VT2 = V2.getSimpleValueType();
+ MVT RootVT = Root.getSimpleValueType();
+ assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
+ VT2.getSizeInBits() == RootVT.getSizeInBits() &&
+ "Vector size mismatch");
+
+ SDLoc DL(Root);
+ SDValue Res;
+
+ unsigned NumBaseMaskElts = BaseMask.size();
+ if (NumBaseMaskElts == 1) {
+ assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned NumRootElts = RootVT.getVectorNumElements();
+ unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
+ bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
+ (RootVT.is256BitVector() && !Subtarget.hasAVX2());
+
+ // Don't combine if we are a AVX512/EVEX target and the mask element size
+ // is different from the root element size - this would prevent writemasks
+ // from being reused.
+ // TODO - this currently prevents all lane shuffles from occurring.
+ // TODO - check for writemasks usage instead of always preventing combining.
+ // TODO - attempt to narrow Mask back to writemask size.
+ bool IsEVEXShuffle =
+ RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+ if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
+ return false;
+
+ // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
+
+ // Handle 128-bit lane shuffles of 256-bit vectors.
+ // TODO - this should support binary shuffles.
+ if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
+ !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
+ return false; // Nothing to do!
+ MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+ PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
+ DAG.getUNDEF(ShuffleVT),
+ DAG.getConstant(PermMask, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // For masks that have been widened to 128-bit elements or more,
+ // narrow back down to 64-bit elements.
+ SmallVector<int, 64> Mask;
+ if (BaseMaskEltSizeInBits > 64) {
+ assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
+ int MaskScale = BaseMaskEltSizeInBits / 64;
+ scaleShuffleMask(MaskScale, BaseMask, Mask);
+ } else {
+ Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
+ }
+
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
+
+ // Determine the effective mask value type.
+ FloatDomain &= (32 <= MaskEltSizeInBits);
+ MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
+ : MVT::getIntegerVT(MaskEltSizeInBits);
+ MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
+
+ // Only allow legal mask types.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
+ return false;
+
+ // Attempt to match the mask against known shuffle patterns.
+ MVT ShuffleSrcVT, ShuffleVT;
+ unsigned Shuffle, PermuteImm;
+
+ if (UnaryShuffle) {
+ // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
+ // directly if we don't shuffle the lower element and we shuffle the upper
+ // (zero) elements within themselves.
+ if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
+ (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
+ unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+ ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
+ if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
+ isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
+ /*AddTo*/ true);
+ return true;
+ }
+ }
+
+ if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+ return false; // AVX512 Writemask clash.
+ Res = DAG.getBitcast(ShuffleSrcVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle,
+ ShuffleVT, PermuteImm)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+ return false; // AVX512 Writemask clash.
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
+ DAG.getConstant(PermuteImm, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+ }
+
+ if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle,
+ ShuffleVT, UnaryShuffle)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+ return false; // AVX512 Writemask clash.
+ V1 = DAG.getBitcast(ShuffleVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(ShuffleVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, V1, V2, DL, DAG, Subtarget,
+ Shuffle, ShuffleVT, PermuteImm)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+ return false; // AVX512 Writemask clash.
+ V1 = DAG.getBitcast(ShuffleVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(ShuffleVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
+ DAG.getConstant(PermuteImm, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // Don't try to re-form single instruction chains under any circumstances now
+ // that we've done encoding canonicalization for them.
+ if (Depth < 2)
+ return false;
+
+ bool MaskContainsZeros =
+ any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+
+ if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
+ // If we have a single input lane-crossing shuffle then lower to VPERMV.
+ if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+ ((Subtarget.hasAVX2() &&
+ (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+ MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+ SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ DCI.AddToWorklist(VPermMask.getNode());
+ Res = DAG.getBitcast(MaskVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
+ // vector as the second source.
+ if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasVLX() &&
+ (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+ // Adjust shuffle mask - replace SM_SentinelZero with second source index.
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] == SM_SentinelZero)
+ Mask[i] = NumMaskElts + i;
+
+ MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+ SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ DCI.AddToWorklist(VPermMask.getNode());
+ Res = DAG.getBitcast(MaskVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
+ DCI.AddToWorklist(Zero.getNode());
+ Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
+ if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasVLX() &&
+ (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+ MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+ SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ DCI.AddToWorklist(VPermMask.getNode());
+ V1 = DAG.getBitcast(MaskVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(MaskVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+ return false;
+ }
+
+ // See if we can combine a single input shuffle with zeros to a bit-mask,
+ // which is much simpler than any shuffle.
+ if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
+ isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
+ APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
+ APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
+ SmallBitVector UndefElts(NumMaskElts, false);
+ SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ UndefElts[i] = true;
+ continue;
+ }
+ if (M == SM_SentinelZero)
+ continue;
+ EltBits[i] = AllOnes;
+ }
+ SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
+ DCI.AddToWorklist(BitMask.getNode());
+ Res = DAG.getBitcast(MaskVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ unsigned AndOpcode =
+ FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
+ Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // If we have a single input shuffle with different shuffle patterns in the
+ // the 128-bit lanes use the variable mask to VPERMILPS.
+ // TODO Combine other mask types at higher depths.
+ if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
+ ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
+ SmallVector<SDValue, 16> VPermIdx;
+ for (int M : Mask) {
+ SDValue Idx =
+ M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
+ VPermIdx.push_back(Idx);
+ }
+ MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
+ SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+ DCI.AddToWorklist(VPermMask.getNode());
+ Res = DAG.getBitcast(MaskVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
+ // to VPERMIL2PD/VPERMIL2PS.
+ if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
+ MaskVT == MVT::v8f32)) {
+ // VPERMIL2 Operation.
+ // Bits[3] - Match Bit.
+ // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+ // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+ unsigned NumLanes = MaskVT.getSizeInBits() / 128;
+ unsigned NumEltsPerLane = NumMaskElts / NumLanes;
+ SmallVector<int, 8> VPerm2Idx;
+ MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
+ MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
+ unsigned M2ZImm = 0;
+ for (int M : Mask) {
+ if (M == SM_SentinelUndef) {
+ VPerm2Idx.push_back(-1);
+ continue;
+ }
+ if (M == SM_SentinelZero) {
+ M2ZImm = 2;
+ VPerm2Idx.push_back(8);
+ continue;
+ }
+ int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
+ Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
+ VPerm2Idx.push_back(Index);
+ }
+ V1 = DAG.getBitcast(MaskVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(MaskVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
+ DCI.AddToWorklist(VPerm2MaskOp.getNode());
+ Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
+ DAG.getConstant(M2ZImm, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // If we have 3 or more shuffle instructions or a chain involving a variable
+ // mask, we can replace them with a single PSHUFB instruction profitably.
+ // Intel's manuals suggest only using PSHUFB if doing so replacing 5
+ // instructions, but in practice PSHUFB tends to be *very* fast so we're
+ // more aggressive.
+ if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+ ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
+ SmallVector<SDValue, 16> PSHUFBMask;
+ int NumBytes = RootVT.getSizeInBits() / 8;
+ int Ratio = NumBytes / NumMaskElts;
+ for (int i = 0; i < NumBytes; ++i) {
+ int M = Mask[i / Ratio];
+ if (M == SM_SentinelUndef) {
+ PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
+ continue;
+ }
+ if (M == SM_SentinelZero) {
+ PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
+ continue;
+ }
+ M = Ratio * M + i % Ratio;
+ assert ((M / 16) == (i / 16) && "Lane crossing detected");
+ PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
+ }
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
+ Res = DAG.getBitcast(ByteVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
+ DCI.AddToWorklist(PSHUFBMaskOp.getNode());
+ Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // With XOP, if we have a 128-bit binary input shuffle we can always combine
+ // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
+ // slower than PSHUFB on targets that support both.
+ if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
+ Subtarget.hasXOP()) {
+ // VPPERM Mask Operation
+ // Bits[4:0] - Byte Index (0 - 31)
+ // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
+ SmallVector<SDValue, 16> VPPERMMask;
+ int NumBytes = 16;
+ int Ratio = NumBytes / NumMaskElts;
+ for (int i = 0; i < NumBytes; ++i) {
+ int M = Mask[i / Ratio];
+ if (M == SM_SentinelUndef) {
+ VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
+ continue;
+ }
+ if (M == SM_SentinelZero) {
+ VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
+ continue;
+ }
+ M = Ratio * M + i % Ratio;
+ VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
+ }
+ MVT ByteVT = MVT::v16i8;
+ V1 = DAG.getBitcast(ByteVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(ByteVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
+ DCI.AddToWorklist(VPPERMMaskOp.getNode());
+ Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // Failed to find any combines.
+ return false;
+}
+
+// Try to constant fold a shuffle whose source ops are all constants.
+// Every op in Ops must yield raw constant bits at the mask element width;
+// the bits are then permuted according to Mask and Root is replaced by the
+// resulting constant vector.
+// Returns true if the entire shuffle is folded to a constant.
+// TODO: Extend this to merge multiple constant Ops and update the mask.
+static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+                                        ArrayRef<int> Mask, SDValue Root,
+                                        bool HasVariableMask, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget &Subtarget) {
+  MVT RootVT = Root.getSimpleValueType();
+
+  unsigned NumElts = Mask.size();
+  unsigned EltSizeInBits = RootVT.getSizeInBits() / NumElts;
+  unsigned NumSrcs = Ops.size();
+
+  // Collect the raw constant bits of every source; give up as soon as one of
+  // them turns out not to be a constant.
+  bool AnySingleUseSrc = false;
+  SmallVector<SmallBitVector, 4> SrcUndefs(NumSrcs);
+  SmallVector<SmallVector<APInt, 8>, 4> SrcBits(NumSrcs);
+  for (unsigned Src = 0; Src != NumSrcs; ++Src) {
+    SDValue SrcOp = Ops[Src];
+    AnySingleUseSrc |= SrcOp.hasOneUse();
+    bool IsConstant = getTargetConstantBitsFromNode(
+        SrcOp, EltSizeInBits, SrcUndefs[Src], SrcBits[Src]);
+    if (!IsConstant)
+      return false;
+  }
+
+  // To avoid constant pool bloat, fold only when at least one constant has a
+  // single use or the combined chain contained a variable mask shuffle.
+  if (!(AnySingleUseSrc || HasVariableMask))
+    return false;
+
+  // Permute the constant bits, classifying each result element as undef,
+  // zero or a real constant.
+  SmallBitVector UndefElts(NumElts, false);
+  SmallBitVector ZeroElts(NumElts, false);
+  SmallBitVector ConstantElts(NumElts, false);
+  SmallVector<APInt, 8> FoldedBits(NumElts,
+                                   APInt::getNullValue(EltSizeInBits));
+  for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
+    int M = Mask[Elt];
+    if (M == SM_SentinelUndef) {
+      UndefElts[Elt] = true;
+      continue;
+    }
+    if (M == SM_SentinelZero) {
+      ZeroElts[Elt] = true;
+      continue;
+    }
+    assert(0 <= M && M < (int)(NumElts * NumSrcs));
+
+    // Split the flat mask index into (source op, element within that op).
+    unsigned SrcIdx = (unsigned)M / NumElts;
+    unsigned SrcElt = (unsigned)M % NumElts;
+
+    if (SrcUndefs[SrcIdx][SrcElt]) {
+      UndefElts[Elt] = true;
+      continue;
+    }
+
+    APInt &EltBits = SrcBits[SrcIdx][SrcElt];
+    if (!EltBits) {
+      ZeroElts[Elt] = true;
+      continue;
+    }
+
+    ConstantElts[Elt] = true;
+    FoldedBits[Elt] = EltBits;
+  }
+  assert((UndefElts | ZeroElts | ConstantElts).count() == NumElts);
+
+  // Build the constant in the floating point domain only when the root is
+  // floating point and the element width has an FP type (32 or 64 bits).
+  bool UseFPElts =
+      RootVT.isFloatingPoint() && (EltSizeInBits == 32 || EltSizeInBits == 64);
+  MVT EltVT = UseFPElts ? MVT::getFloatingPointVT(EltSizeInBits)
+                        : MVT::getIntegerVT(EltSizeInBits);
+  MVT CstVT = MVT::getVectorVT(EltVT, NumElts);
+
+  SDLoc DL(Root);
+  SDValue CstOp = getConstVector(FoldedBits, UndefElts, CstVT, DAG, DL);
+  DCI.AddToWorklist(CstOp.getNode());
+  DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, CstOp));
+  return true;
+}
+
+/// \brief Fully generic combining of x86 shuffle instructions.
+///
+/// This should be the last combine run over the x86 shuffle instructions. Once
+/// they have been fully optimized, this will recursively consider all chains
+/// of single-use shuffle instructions, build a generic model of the cumulative
+/// shuffle operation, and check for simpler instructions which implement this
+/// operation. We use this primarily for two purposes:
+///
+/// 1) Collapse generic shuffles to specialized single instructions when
+/// equivalent. In most cases, this is just an encoding size win, but
+/// sometimes we will collapse multiple generic shuffles into a single
+/// special-purpose shuffle.
+/// 2) Look for sequences of shuffle instructions with 3 or more total
+/// instructions, and replace them with the slightly more expensive SSSE3
+/// PSHUFB instruction if available. We do this as the last combining step
+/// to ensure we avoid using PSHUFB if we can implement the shuffle with
+/// a suitable short sequence of other instructions. The PSHUFB will either
+/// use a register or have to read from memory and so is slightly (but only
+/// slightly) more expensive than the other shuffle instructions.
+///
+/// Because this is inherently a quadratic operation (for each shuffle in
+/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
+/// This should never be an issue in practice as the shuffle lowering doesn't
+/// produce sequences of more than 8 instructions.
+///
+/// FIXME: We will currently miss some cases where the redundant shuffling
+/// would simplify under the threshold for PSHUFB formation because of
+/// combine-ordering. To fix this, we should do the redundant instruction
+/// combining in this recursive walk.
+///
+/// \param SrcOps The shuffle inputs accumulated so far. RootMask indexes into
+/// them as if they were concatenated, one mask-width per op.
+/// \param SrcOpIndex Index into SrcOps of the operand looked through on this
+/// step.
+/// \param Root The node that is replaced when a combine succeeds.
+/// \param RootMask Accumulated mask from the elements of Root to the elements
+/// of SrcOps; negative entries are the undef/zero sentinels.
+/// \param Depth Recursion depth; the walk gives up once it exceeds 8.
+/// \param HasVariableMask True if a variable-mask target shuffle has already
+/// been merged into the chain.
+/// \returns true once Root has been combined (the success paths visible here
+/// all replace Root through DCI.CombineTo), false otherwise.
+static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
+ int SrcOpIndex, SDValue Root,
+ ArrayRef<int> RootMask,
+ int Depth, bool HasVariableMask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ // Bound the depth of our recursive combine because this is ultimately
+ // quadratic in nature.
+ if (Depth > 8)
+ return false;
+
+ // Directly rip through bitcasts to find the underlying operand.
+ SDValue Op = SrcOps[SrcOpIndex];
+ Op = peekThroughOneUseBitcasts(Op);
+
+ MVT VT = Op.getSimpleValueType();
+ if (!VT.isVector())
+ return false; // Bail if we hit a non-vector.
+
+ assert(Root.getSimpleValueType().isVector() &&
+ "Shuffles operate on vector types!");
+ assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
+ "Can only combine shuffles of the same vector register size.");
+
+ // Extract target shuffle mask and resolve sentinels and inputs.
+ SDValue Input0, Input1;
+ SmallVector<int, 16> OpMask;
+ if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
+ return false;
+
+ // Add the inputs to the Ops list, avoiding duplicates.
+ SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
+
+ // Both sides are compared through bitcasts, so an input that is already in
+ // the list under a different type is still recognised as a duplicate.
+ int InputIdx0 = -1, InputIdx1 = -1;
+ for (int i = 0, e = Ops.size(); i < e; ++i) {
+ SDValue BC = peekThroughBitcasts(Ops[i]);
+ if (Input0 && BC == peekThroughBitcasts(Input0))
+ InputIdx0 = i;
+ if (Input1 && BC == peekThroughBitcasts(Input1))
+ InputIdx1 = i;
+ }
+
+ // A new first input takes over the slot of the op we just looked through; a
+ // new second input is appended to the end of the list.
+ if (Input0 && InputIdx0 < 0) {
+ InputIdx0 = SrcOpIndex;
+ Ops[SrcOpIndex] = Input0;
+ }
+ if (Input1 && InputIdx1 < 0) {
+ InputIdx1 = Ops.size();
+ Ops.push_back(Input1);
+ }
+
+ assert(((RootMask.size() > OpMask.size() &&
+ RootMask.size() % OpMask.size() == 0) ||
+ (OpMask.size() > RootMask.size() &&
+ OpMask.size() % RootMask.size() == 0) ||
+ OpMask.size() == RootMask.size()) &&
+ "The smaller number of elements must divide the larger.");
+ // The two masks may have different element counts. MaskWidth is the finer
+ // of the two granularities and the ratios scale each mask up to it; at most
+ // one of the ratios is greater than 1.
+ int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
+ int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
+ int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
+ assert(((RootRatio == 1 && OpRatio == 1) ||
+ (RootRatio == 1) != (OpRatio == 1)) &&
+ "Must not have a ratio for both incoming and op masks!");
+
+ SmallVector<int, 16> Mask;
+ Mask.reserve(MaskWidth);
+
+ // Merge this shuffle operation's mask into our accumulated mask. Note that
+ // this shuffle's mask will be the first applied to the input, followed by the
+ // root mask to get us all the way to the root value arrangement. The reason
+ // for this order is that we are recursing up the operation chain.
+ for (int i = 0; i < MaskWidth; ++i) {
+ int RootIdx = i / RootRatio;
+ if (RootMask[RootIdx] < 0) {
+ // This is a zero or undef lane, we're done.
+ Mask.push_back(RootMask[RootIdx]);
+ continue;
+ }
+
+ int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+
+ // Just insert the scaled root mask value if it references an input other
+ // than the SrcOp we're currently inserting.
+ if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
+ (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
+ Mask.push_back(RootMaskedIdx);
+ continue;
+ }
+
+ // Reduce to an element index local to the op being looked through.
+ RootMaskedIdx %= MaskWidth;
+
+ int OpIdx = RootMaskedIdx / OpRatio;
+ if (OpMask[OpIdx] < 0) {
+ // The incoming lanes are zero or undef, it doesn't matter which ones we
+ // are using.
+ Mask.push_back(OpMask[OpIdx]);
+ continue;
+ }
+
+ // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
+ int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
+ OpMaskedIdx %= MaskWidth;
+
+ // Op mask values below OpMask.size() select from the first input, the
+ // rest from the second; rebase onto that input's position in Ops.
+ if (OpMask[OpIdx] < (int)OpMask.size()) {
+ assert(0 <= InputIdx0 && "Unknown target shuffle input");
+ OpMaskedIdx += InputIdx0 * MaskWidth;
+ } else {
+ assert(0 <= InputIdx1 && "Unknown target shuffle input");
+ OpMaskedIdx += InputIdx1 * MaskWidth;
+ }
+
+ Mask.push_back(OpMaskedIdx);
+ }
+
+ // Handle the all undef/zero cases early.
+ if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
+ DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
+ return true;
+ }
+ if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
+ // TODO - should we handle the mixed zero/undef case as well? Just returning
+ // a zero mask will lose information on undef elements possibly reducing
+ // future combine possibilities.
+ DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
+ Subtarget, DAG, SDLoc(Root)));
+ return true;
+ }
+
+ // Remove unused shuffle source ops.
+ // The range [lo, hi) is derived from UsedOps.size() rather than from i
+ // because Mask has already been renumbered for every op dropped so far.
+ SmallVector<SDValue, 8> UsedOps;
+ for (int i = 0, e = Ops.size(); i < e; ++i) {
+ int lo = UsedOps.size() * MaskWidth;
+ int hi = lo + MaskWidth;
+ if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ UsedOps.push_back(Ops[i]);
+ continue;
+ }
+ // Nothing references this op: shift every later index down by one op.
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
+ }
+ assert(!UsedOps.empty() && "Shuffle with no inputs detected");
+ Ops = UsedOps;
+
+ HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
+
+ // See if we can recurse into each shuffle source op (if it's a target shuffle).
+ // Only ops with a single use, or whose only user is Op, are looked through.
+ for (int i = 0, e = Ops.size(); i < e; ++i)
+ if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
+ if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
+ HasVariableMask, DAG, DCI, Subtarget))
+ return true;
+
+ // Attempt to constant fold all of the constant source ops.
+ if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
+ Subtarget))
+ return true;
+
+ // We can only combine unary and binary shuffle mask cases.
+ if (Ops.size() > 2)
+ return false;
+
+ // Minor canonicalization of the accumulated shuffle mask to make it easier
+ // to match below. All this does is detect masks with sequential pairs of
+ // elements, and shrink them to the half-width mask. It does this in a loop
+ // so it will reduce the size of the mask to the minimal width mask which
+ // performs an equivalent shuffle.
+ SmallVector<int, 16> WidenedMask;
+ while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+ Mask = std::move(WidenedMask);
+ }
+
+ // Canonicalization of binary shuffle masks to improve pattern matching by
+ // commuting the inputs.
+ if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ }
+
+ return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
+ DCI, Subtarget);
+}
+
+/// \brief Get the PSHUF-style mask from PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
+/// PSHUF-style masks that can be reused with such instructions. The result
+/// always has four entries in the range the instruction's immediate encodes:
+/// PSHUFD returns the (first lane's) dword mask, PSHUFLW the low four word
+/// indices, and PSHUFHW the high four word indices rebased to start at zero.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+  MVT VT = N.getSimpleValueType();
+  SmallVector<int, 4> ShufMask;
+  SmallVector<SDValue, 2> ShufOps;
+  bool IsUnary;
+  bool GotMask =
+      getTargetShuffleMask(N.getNode(), VT, false, ShufOps, ShufMask, IsUnary);
+  (void)GotMask;
+  assert(GotMask);
+
+  // For vectors wider than 128 bits only the first lane's mask is kept; the
+  // remaining lanes are expected to repeat it (checked in debug builds).
+  if (VT.getSizeInBits() > 128) {
+    int EltsPerLane = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+    int LaneCount = VT.getSizeInBits() / 128;
+    for (int Lane = 1; Lane < LaneCount; ++Lane)
+      for (int Elt = 0; Elt < EltsPerLane; ++Elt)
+        assert(ShufMask[Elt] ==
+                   ShufMask[Lane * EltsPerLane + Elt] - (EltsPerLane * Lane) &&
+               "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+    ShufMask.resize(EltsPerLane);
+  }
+
+  unsigned Opc = N.getOpcode();
+  if (Opc == X86ISD::PSHUFD)
+    return ShufMask;
+
+  if (Opc == X86ISD::PSHUFLW) {
+    // Only the low four words are shuffled.
+    ShufMask.resize(4);
+    return ShufMask;
+  }
+
+  if (Opc == X86ISD::PSHUFHW) {
+    // Drop the untouched low words and rebase the high word indices to 0..3.
+    ShufMask.erase(ShufMask.begin(), ShufMask.begin() + 4);
+    for (unsigned i = 0, e = ShufMask.size(); i != e; ++i)
+      ShufMask[i] -= 4;
+    return ShufMask;
+  }
+
+  llvm_unreachable("No valid shuffle instruction found!");
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+///
+/// \param N The PSHUFD node at the bottom of the chain (asserted).
+/// \param Mask The 4-element mask of N. It is rewritten in place with the
+/// merged mask once a combinable shuffle has been found.
+/// \param DAG Used to create the replacement nodes.
+/// \param DCI Not referenced by the body of this function.
+/// \returns the value of the rebuilt chain that should replace N, or an empty
+/// SDValue if nothing could be combined.
+static SDValue
+combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N.getOpcode() == X86ISD::PSHUFD &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+
+ // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
+ // of the shuffles in the chain so that we can form a fresh chain to replace
+ // this one.
+ // Inside the switch, 'continue' moves on to the next node up the chain and
+ // 'break' means V is the shuffle to merge with.
+ SmallVector<SDValue, 8> Chain;
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return SDValue(); // Nothing combined!
+
+ case ISD::BITCAST:
+ // Skip bitcasts as we always know the type for the target specific
+ // instructions.
+ continue;
+
+ case X86ISD::PSHUFD:
+ // Found another dword shuffle.
+ break;
+
+ case X86ISD::PSHUFLW:
+ // Check that the low words (being shuffled) are the identity in the
+ // dword shuffle, and the high words are self-contained.
+ if (Mask[0] != 0 || Mask[1] != 1 ||
+ !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+ return SDValue();
+
+ Chain.push_back(V);
+ continue;
+
+ case X86ISD::PSHUFHW:
+ // Check that the high words (being shuffled) are the identity in the
+ // dword shuffle, and the low words are self-contained.
+ if (Mask[2] != 2 || Mask[3] != 3 ||
+ !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+ return SDValue();
+
+ Chain.push_back(V);
+ continue;
+
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+ // shuffle into a preceding word shuffle.
+ if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
+ V.getSimpleValueType().getVectorElementType() != MVT::i16)
+ return SDValue();
+
+ // Search for a half-shuffle which we can combine with.
+ // UNPCKL pairs with PSHUFLW and UNPCKH with PSHUFHW. The unpack must
+ // use the same value for both operands and be that value's only user.
+ unsigned CombineOp =
+ V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ if (V.getOperand(0) != V.getOperand(1) ||
+ !V->isOnlyUserOf(V.getOperand(0).getNode()))
+ return SDValue();
+ Chain.push_back(V);
+ V = V.getOperand(0);
+ // Inner walk: half shuffles of the other kind are pushed onto Chain and
+ // stepped over together with bitcasts, until CombineOp is found or V
+ // has more than one use.
+ do {
+ switch (V.getOpcode()) {
+ default:
+ return SDValue(); // Nothing to combine.
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOp)
+ break;
+
+ Chain.push_back(V);
+
+ LLVM_FALLTHROUGH;
+ case ISD::BITCAST:
+ V = V.getOperand(0);
+ continue;
+ }
+ break;
+ } while (V.hasOneUse());
+ break;
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return SDValue();
+
+ // Merge this node's mask and our incoming mask.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+ // Rebuild the chain around this new shuffle.
+ // Chain is popped from the back, so the skipped shuffles are re-applied
+ // starting with the one nearest to the merged node.
+ while (!Chain.empty()) {
+ SDValue W = Chain.pop_back_val();
+
+ // Bitcasts skipped on the way up are re-created on demand.
+ if (V.getValueType() != W.getOperand(0).getValueType())
+ V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
+
+ switch (W.getOpcode()) {
+ default:
+ llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
+
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
+ break;
+
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
+ break;
+ }
+ }
+ if (V.getValueType() != N.getValueType())
+ V = DAG.getBitcast(N.getValueType(), V);
+
+ // Return the new chain to replace N.
+ return V;
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// pshufhw.
+///
+/// We walk up the chain, skipping shuffles of the other half and looking
+/// through shuffles which switch halves trying to find a shuffle of the same
+/// pair of dwords.
+///
+/// \param N The PSHUFLW or PSHUFHW node at the bottom of the chain (asserted).
+/// \param Mask The 4-element mask of N. It is rewritten in place with the
+/// merged mask once a combinable shuffle has been found.
+/// \param DAG Used to create the merged shuffle node.
+/// \param DCI Used to replace N, and the shuffle it was merged into, in place.
+/// \returns true if N was combined away (all replacements have then already
+/// been made through DCI), false if no combinable shuffle was found.
+static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                        SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  assert(
+      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
+      "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+  unsigned CombineOpcode = N.getOpcode();
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  // Inside the switch, 'continue' moves on to the next node up the chain and
+  // 'break' means V is the shuffle to merge with.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFLW:
+    case X86ISD::PSHUFHW:
+      if (V.getOpcode() == CombineOpcode)
+        break;
+
+      // Other-half shuffles are no-ops.
+      continue;
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Combine away the bottom node as its shuffle will be accumulated into
+  // a preceding shuffle.
+  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+  // Record the old value.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask (adjusted to account for all
+  // the pshufd instructions encountered).
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  // Rebuild the shuffle with the type of the node it replaces. A hard-coded
+  // v8i16 is only right for 128-bit vectors; for a wider word shuffle it would
+  // disagree with both the operand and the node being replaced below.
+  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+  // Check that the shuffles didn't cancel each other out. If not, we need to
+  // combine to the new one.
+  if (Old != V)
+    // Replace the combinable shuffle with the combined one, updating all users
+    // so that we re-evaluate the chain here.
+    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+
+  return true;
+}
+
+/// \brief Try to combine x86 target specific shuffles.
+///
+/// PSHUFD, PSHUFLW and PSHUFHW have their 4-element mask extracted by the
+/// first switch and are then handled by the shared code that follows it.
+/// UNPCKL, BLENDI, MOVSD/MOVSS and INSERTPS are handled entirely inside their
+/// own case and always return from there; every other opcode is left alone.
+///
+/// \returns the replacement value for N, or an empty SDValue when nothing
+/// changed or when the replacement was already performed through DCI.
+static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+ SmallVector<int, 4> Mask;
+
+ unsigned Opcode = N.getOpcode();
+ switch (Opcode) {
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ Mask = getPSHUFShuffleMask(N);
+ assert(Mask.size() == 4);
+ break;
+ case X86ISD::UNPCKL: {
+ auto Op0 = N.getOperand(0);
+ auto Op1 = N.getOperand(1);
+ unsigned Opcode0 = Op0.getOpcode();
+ unsigned Opcode1 = Op1.getOpcode();
+
+ // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
+ // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
+ // TODO: Add other horizontal operations as required.
+ if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
+ return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
+
+ // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
+ // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
+ // moves upper half elements into the lower half part. For example:
+ //
+ // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
+ // undef:v16i8
+ // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
+ //
+ // will be combined to:
+ //
+ // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
+
+ // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
+ // happen due to advanced instructions.
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
+ // Note: this local Mask shadows the function-level Mask declared above.
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
+
+ // Expected mask: the upper half indices in the low half, the rest undef.
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 8> ExpectedMask(NumElts, -1);
+ std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
+ NumElts / 2);
+
+ auto ShufOp = Op1.getOperand(0);
+ if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
+ }
+ return SDValue();
+ }
+ case X86ISD::BLENDI: {
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = N->getOperand(1);
+ assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
+ "Unexpected input vector types");
+
+ // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+ // operands and changing the mask to 1. This saves us a bunch of
+ // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+ // x86InstrInfo knows how to commute this back after instruction selection
+ // if it would help register allocation.
+
+ // TODO: If optimizing for size or a processor that doesn't suffer from
+ // partial register update stalls, this should be transformed into a MOVSD
+ // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+ if (VT == MVT::v2f64)
+ if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+ SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+ }
+
+ return SDValue();
+ }
+ case X86ISD::MOVSD:
+ case X86ISD::MOVSS: {
+ bool isFloat = VT.isFloatingPoint();
+ SDValue V0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue V1 = peekThroughBitcasts(N->getOperand(1));
+ bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+ bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
+ bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
+ bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
+ assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+
+ // We often lower to MOVSD/MOVSS from integer as well as native float
+ // types; remove unnecessary domain-crossing bitcasts if we can to make it
+ // easier to combine shuffles later on. We've already accounted for the
+ // domain switching cost when we decided to lower with it.
+ // The node is rebuilt in the opposite domain when each source is either
+ // all-zeros or of the other domain once its bitcasts are peeled off.
+ if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
+ MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
+ : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
+ V0 = DAG.getBitcast(NewVT, V0);
+ V1 = DAG.getBitcast(NewVT, V1);
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
+ }
+
+ return SDValue();
+ }
+ case X86ISD::INSERTPS: {
+ assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
+ SDValue Op2 = N.getOperand(2);
+ // Immediate layout as decoded here: bits [7:6] source element,
+ // bits [5:4] destination element, bits [3:0] zero mask.
+ unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
+ unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
+ unsigned ZeroMask = InsertPSMask & 0xF;
+
+ // If we zero out all elements from Op0 then we don't need to reference it.
+ if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+ // If we zero out the element from Op1 then we don't need to reference it.
+ if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+ // Attempt to merge insertps Op1 with an inner target shuffle node.
+ SmallVector<int, 8> TargetMask1;
+ SmallVector<SDValue, 2> Ops1;
+ if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
+ int M = TargetMask1[SrcIdx];
+ if (isUndefOrZero(M)) {
+ // Zero/UNDEF insertion - zero out element and remove dependency.
+ InsertPSMask |= (1u << DstIdx);
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ }
+ // Update insertps mask srcidx and reference the source input directly.
+ assert(0 <= M && M < 8 && "Shuffle index out of range");
+ InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
+ Op1 = Ops1[M < 4 ? 0 : 1];
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ }
+
+ // Attempt to merge insertps Op0 with an inner target shuffle node.
+ SmallVector<int, 8> TargetMask0;
+ SmallVector<SDValue, 2> Ops0;
+ if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
+ return SDValue();
+
+ bool Updated = false;
+ bool UseInput00 = false;
+ bool UseInput01 = false;
+ for (int i = 0; i != 4; ++i) {
+ int M = TargetMask0[i];
+ if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+ // No change if element is already zero or the inserted element.
+ continue;
+ } else if (isUndefOrZero(M)) {
+ // If the target mask is undef/zero then we must zero the element.
+ InsertPSMask |= (1u << i);
+ Updated = true;
+ continue;
+ }
+
+ // The input vector element must be inline.
+ if (M != i && M != (i + 4))
+ return SDValue();
+
+ // Determine which inputs of the target shuffle we're using.
+ UseInput00 |= (0 <= M && M < 4);
+ UseInput01 |= (4 <= M);
+ }
+
+ // If we're not using both inputs of the target shuffle then use the
+ // referenced input directly.
+ if (UseInput00 && !UseInput01) {
+ Updated = true;
+ Op0 = Ops0[0];
+ } else if (!UseInput00 && UseInput01) {
+ Updated = true;
+ Op0 = Ops0[1];
+ }
+
+ if (Updated)
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+ return SDValue();
+ }
+ default:
+ return SDValue();
+ }
+
+ // Only PSHUFD/PSHUFLW/PSHUFHW reach this point; Mask holds their 4 entries.
+
+ // Nuke no-op shuffles that show up after combining.
+ if (isNoopShuffleMask(Mask))
+ return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+ // Look for simplifications involving one or two shuffle instructions.
+ SDValue V = N.getOperand(0);
+ switch (N.getOpcode()) {
+ default:
+ break;
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
+
+ if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+ return SDValue(); // We combined away this shuffle, so we're done.
+
+ // See if this reduces to a PSHUFD which is no more expensive and can
+ // combine with more operations. Note that it has to at least flip the
+ // dwords as otherwise it would have been removed as a no-op.
+ if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
+ // Swap the two dwords of the affected half: dwords 0/1 for PSHUFLW,
+ // dwords 2/3 for PSHUFHW.
+ int DMask[] = {0, 1, 2, 3};
+ int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+ DMask[DOffset + 0] = DOffset + 1;
+ DMask[DOffset + 1] = DOffset + 0;
+ MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ V = DAG.getBitcast(DVT, V);
+ DCI.AddToWorklist(V.getNode());
+ V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
+ getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getBitcast(VT, V);
+ }
+
+ // Look for shuffle patterns which can be implemented as a single unpack.
+ // FIXME: This doesn't handle the location of the PSHUFD generically, and
+ // only works when we have a PSHUFD followed by two half-shuffles.
+ if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+ (V.getOpcode() == X86ISD::PSHUFLW ||
+ V.getOpcode() == X86ISD::PSHUFHW) &&
+ V.getOpcode() != N.getOpcode() &&
+ V.hasOneUse()) {
+ SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
+ if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+ int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ // N and V shuffle opposite halves, so together they fill all 8 words.
+ int WordMask[8];
+ for (int i = 0; i < 4; ++i) {
+ WordMask[i + NOffset] = Mask[i] + NOffset;
+ WordMask[i + VOffset] = VMask[i] + VOffset;
+ }
+ // Map the word mask through the DWord mask.
+ int MappedMask[8];
+ for (int i = 0; i < 8; ++i)
+ MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+ if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
+ // We can replace all three shuffles with an unpack.
+ V = DAG.getBitcast(VT, D.getOperand(0));
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+ : X86ISD::UNPCKH,
+ DL, VT, V, V);
+ }
+ }
+ }
+
+ break;
+
+ case X86ISD::PSHUFD:
+ if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+ return NewN;
+
+ break;
+ }
+
+ return SDValue();
+}
+
+/// \brief Try to combine a shuffle into a target-specific add-sub node.
+///
+/// The match is done directly on the generic vector shuffle node: a blend
+/// that takes the even elements from an FSUB and the odd elements from an
+/// FADD of the same two operands becomes a single X86ISD::ADDSUB.
+///
+/// \returns the ADDSUB node, or an empty SDValue if the pattern does not
+/// match or the type is not supported by the subtarget.
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  // 128-bit forms need SSE3, 256-bit forms need AVX.
+  bool IsLegal128 =
+      Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64);
+  bool IsLegal256 =
+      Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64);
+  if (!IsLegal128 && !IsLegal256)
+    return SDValue();
+
+  // We only handle target-independent shuffles.
+  // FIXME: It would be easy and harmless to use the target shuffle mask
+  // extraction tool to support more.
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
+  SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end());
+
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+
+  // Normalise so that V1 is the FSUB and V2 is the FADD, commuting the mask
+  // when the operands arrive the other way round.
+  unsigned Opc1 = V1.getOpcode();
+  unsigned Opc2 = V2.getOpcode();
+  if (Opc1 == ISD::FADD && Opc2 == ISD::FSUB) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(V1, V2);
+  } else if (!(Opc1 == ISD::FSUB && Opc2 == ISD::FADD)) {
+    return SDValue();
+  }
+
+  // Both operations get folded away, so neither may have another user.
+  if (!(V1->hasOneUse() && V2->hasOneUse()))
+    return SDValue();
+
+  // The FADD must use the same two operands as the FSUB; since FADD is
+  // commutative either operand order is accepted for it.
+  SDValue LHS = V1->getOperand(0);
+  SDValue RHS = V1->getOperand(1);
+  bool SameOrder = V2->getOperand(0) == LHS && V2->getOperand(1) == RHS;
+  bool SwappedOrder = V2->getOperand(0) == RHS && V2->getOperand(1) == LHS;
+  if (!SameOrder && !SwappedOrder)
+    return SDValue();
+
+  // Finally the shuffle itself must be the expected blend: even elements from
+  // the FSUB, odd elements from the FADD, for 2, 4 or 8 element vectors.
+  bool IsAddSubBlend =
+      isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+      isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
+      isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15});
+  if (!IsAddSubBlend)
+    return SDValue();
+
+  SDLoc DL(N);
+  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
+}
+
+// Rewrite a two-source shuffle whose sources are both (concat_vectors X, undef)
+// into a single-source shuffle of (concat_vectors X0, X1). The sources are
+// half the width of the result. AVX2 has VPERMD/Q, so a single-source form is
+// preferable when it can be expressed.
+static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
+                                           const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX2())
+    return SDValue();
+
+  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N);
+  if (!Shuf)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // Restrict to 128/256-bit vectors ...
+  if (!(VT.is128BitVector() || VT.is256BitVector()))
+    return SDValue();
+
+  // ... whose elements are 32 or 64 bits wide.
+  EVT EltVT = VT.getVectorElementType();
+  const bool Is32Or64BitElt = EltVT == MVT::i32 || EltVT == MVT::i64 ||
+                              EltVT == MVT::f32 || EltVT == MVT::f64;
+  if (!Is32Or64BitElt)
+    return SDValue();
+
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+
+  // Both sources must be two-operand concats whose upper half is undef.
+  const bool BothConcats = Src0.getOpcode() == ISD::CONCAT_VECTORS &&
+                           Src1.getOpcode() == ISD::CONCAT_VECTORS;
+  const bool BothBinary =
+      BothConcats && Src0.getNumOperands() == 2 && Src1.getNumOperands() == 2;
+  if (!BothBinary || !Src0.getOperand(1).isUndef() ||
+      !Src1.getOperand(1).isUndef())
+    return SDValue();
+
+  // Build the new mask: indices into the first source are kept, indices into
+  // the second source move down by half a vector because the undef half of
+  // the first source no longer sits in between.
+  const int NumElts = VT.getVectorNumElements();
+  const int HalfElts = NumElts / 2;
+  ArrayRef<int> OldMask = Shuf->getMask();
+  SmallVector<int, 8> NewMask;
+  NewMask.reserve(OldMask.size());
+  for (int M : OldMask) {
+    if (M >= NumElts)
+      M -= HalfElts;
+    NewMask.push_back(M);
+  }
+
+  SDLoc DL(N);
+  SDValue Joined = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src0.getOperand(0),
+                               Src1.getOperand(0));
+  return DAG.getVectorShuffle(VT, DL, Joined, DAG.getUNDEF(VT), NewMask);
+}
+
+/// Top-level DAG combine for shuffle nodes.
+///
+/// Tries the following in order and returns the first replacement found:
+///  1. fuse a blend of FADD/FSUB into ADDSUB (only when VT is legal),
+///  2. push a bitcast through a single-use binop feeding the shuffle,
+///  3. merge consecutive element loads into one vector load,
+///  4. fold a shuffle of two (concat_vectors X, undef) sources,
+///  5. for target shuffles, run the target-specific shuffle combines.
+///
+/// Returns an empty SDValue when nothing matched, and also when the recursive
+/// shuffle combiner has already replaced N itself.
+static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ // Don't create instructions with illegal types after legalize types has run.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
+ return SDValue();
+
+ // If we have legalized the vector types, look for blends of FADD and FSUB
+ // nodes that we can fuse into an ADDSUB node.
+ if (TLI.isTypeLegal(VT))
+ if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
+ return AddSub;
+
+ // During Type Legalization, when promoting illegal vector types,
+ // the backend might introduce new shuffle dag nodes and bitcasts.
+ //
+ // This code performs the following transformation:
+ // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+ // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+ //
+ // We do this only if both the bitcast and the BINOP dag nodes have
+ // one use. Also, perform this transformation only if the new binary
+ // operation is legal. This is to avoid introducing dag nodes that
+ // potentially need to be further expanded (or custom lowered) into a
+ // less optimal sequence of dag nodes.
+ if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N->getOperand(0).getOpcode() == ISD::BITCAST &&
+ N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // BC0 is the value underneath the bitcast, i.e. the candidate binop.
+ SDValue BC0 = N0.getOperand(0);
+ EVT SVT = BC0.getValueType();
+ unsigned Opcode = BC0.getOpcode();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // The source vector must have exactly half as many elements as the shuffle
+ // result, and the operation must be legal on the result type.
+ if (BC0.hasOneUse() && SVT.isVector() &&
+ SVT.getVectorNumElements() * 2 == NumElts &&
+ TLI.isOperationLegal(Opcode, VT)) {
+ bool CanFold = false;
+ switch (Opcode) {
+ default : break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ // isOperationLegal lies for integer ops on floating point types.
+ CanFold = VT.isInteger();
+ break;
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ // isOperationLegal lies for floating point ops on integer types.
+ CanFold = VT.isFloatingPoint();
+ break;
+ }
+
+ // The mask must select every even element into the low half (element i
+ // reads 2*i) and leave the whole upper half undefined.
+ unsigned SVTNumElts = SVT.getVectorNumElements();
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+ for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) < 0;
+
+ if (CanFold) {
+ // Rebuild the binop directly in VT on bitcasted operands and shuffle that.
+ SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
+ SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
+ SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+ return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
+ }
+ }
+ }
+
+ // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+ // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+ // consecutive, non-overlapping, and in the right order.
+ SmallVector<SDValue, 16> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+ Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
+ return LD;
+
+ // For AVX2, we sometimes want to combine
+ // (vector_shuffle <mask> (concat_vectors t1, undef)
+ // (concat_vectors t2, undef))
+ // Into:
+ // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
+ // Since the latter can be efficiently lowered with VPERMD/VPERMQ
+ if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
+ return ShufConcat;
+
+ if (isTargetShuffle(N->getOpcode())) {
+ SDValue Op(N, 0);
+ if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
+ return Shuffle;
+
+ // Try recursively combining arbitrary sequences of x86 shuffle
+ // instructions into higher-order shuffles. We do this after combining
+ // specific PSHUF instruction sequences into their minimal form so that we
+ // can evaluate how many specialized shuffle instructions are involved in
+ // a particular chain.
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget))
+ return SDValue(); // This routine will use CombineTo to replace N.
+ }
+
+ return SDValue();
+}
+
+/// Check if a vector extract from a target-specific shuffle of a load can be
+/// folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// shuffles have been custom lowered so we need to handle those here.
+///
+/// \param N   the EXTRACT_VECTOR_ELT node; operand 0 is the vector and
+///            operand 1 the element index (must be a constant to match).
+/// \returns a zero/undef constant when the selected lane is known to be
+///          zero/undef, an extract from a rebuilt generic VECTOR_SHUFFLE when
+///          all checks pass, or an empty SDValue otherwise.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue InVec = N->getOperand(0);
+  SDValue EltNo = N->getOperand(1);
+  EVT EltVT = N->getValueType(0);
+
+  if (!isa<ConstantSDNode>(EltNo))
+    return SDValue();
+
+  EVT OriginalVT = InVec.getValueType();
+
+  // Look through a single-use bitcast, provided it keeps the element count so
+  // that the extract index still names the same lane.
+  if (InVec.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!InVec.hasOneUse())
+      return SDValue();
+    EVT BCVT = InVec.getOperand(0).getValueType();
+    if (!BCVT.isVector() ||
+        BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+      return SDValue();
+    InVec = InVec.getOperand(0);
+  }
+
+  EVT CurrentVT = InVec.getValueType();
+
+  if (!isTargetShuffle(InVec.getOpcode()))
+    return SDValue();
+
+  // Don't duplicate a load with other uses.
+  if (!InVec.hasOneUse())
+    return SDValue();
+
+  SmallVector<int, 16> ShuffleMask;
+  SmallVector<SDValue, 2> ShuffleOps;
+  bool UnaryShuffle;
+  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
+                            ShuffleOps, ShuffleMask, UnaryShuffle))
+    return SDValue();
+
+  // Select the input vector, guarding against out of range extract vector.
+  // Valid lanes are [0, NumElems); the comparison is done unsigned and with
+  // '>=' so that an index equal to NumElems (or one that would wrap when
+  // narrowed to int) is treated as undef instead of reading past the mask.
+  unsigned NumElems = CurrentVT.getVectorNumElements();
+  uint64_t Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  int Idx = (Elt >= NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
+
+  if (Idx == SM_SentinelZero)
+    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
+                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
+  if (Idx == SM_SentinelUndef)
+    return DAG.getUNDEF(EltVT);
+
+  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
+  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
+                                         : ShuffleOps[1];
+
+  // If inputs to shuffle are the same for both ops, then allow 2 uses
+  unsigned AllowedUses =
+      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
+
+  if (LdNode.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+      return SDValue();
+
+    AllowedUses = 1; // only allow 1 load use if we have a bitcast
+    LdNode = LdNode.getOperand(0);
+  }
+
+  if (!ISD::isNormalLoad(LdNode.getNode()))
+    return SDValue();
+
+  // cast<> never yields null, so only the use count and volatility need
+  // checking here.
+  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
+
+  if (!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+    return SDValue();
+
+  // If there's a bitcast before the shuffle, check if the load type and
+  // alignment is valid.
+  unsigned Align = LN0->getAlignment();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
+      EltVT.getTypeForEVT(*DAG.getContext()));
+
+  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
+    return SDValue();
+
+  // All checks match so transform back to vector_shuffle so that DAG combiner
+  // can finish the job
+  SDLoc dl(N);
+
+  // Create shuffle node taking into account the case that its a unary shuffle
+  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
+  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
+                                 ShuffleMask);
+  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+                     EltNo);
+}
+
+/// DAG combine for BITCAST nodes.
+///
+/// Handles two patterns:
+///  - a v2i32 build_vector with a zero upper element bitcast to x86mmx, which
+///    becomes MMX_MOVW2D of the low i32;
+///  - a scalar f32/f64 bitcast of an integer AND/OR/XOR with one operand that
+///    is itself a bitcast from that FP type, which becomes FAND/FOR/FXOR.
+///
+/// Returns the replacement value, or an empty SDValue if nothing matched.
+static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget) {
+  SDValue Src = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
+  // special and don't usually play with other vector types, it's better to
+  // handle them early to be sure we emit efficient code by avoiding
+  // store-load conversions.
+  if (VT == MVT::x86mmx && Src.getOpcode() == ISD::BUILD_VECTOR &&
+      Src.getValueType() == MVT::v2i32 && isNullConstant(Src.getOperand(1))) {
+    SDValue LowElt = Src->getOperand(0);
+    if (LowElt.getValueType() == MVT::i32)
+      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(LowElt), VT, LowElt);
+  }
+
+  // Convert a bitcasted integer logic operation that has one bitcasted
+  // floating-point operand into a floating-point logic operation. This may
+  // create a load of a constant, but that is cheaper than materializing the
+  // constant in an integer register and transferring it to an SSE register or
+  // transferring the SSE operand to integer register and back.
+  unsigned FPOpcode;
+  const unsigned SrcOpcode = Src.getOpcode();
+  if (SrcOpcode == ISD::AND)
+    FPOpcode = X86ISD::FAND;
+  else if (SrcOpcode == ISD::OR)
+    FPOpcode = X86ISD::FOR;
+  else if (SrcOpcode == ISD::XOR)
+    FPOpcode = X86ISD::FXOR;
+  else
+    return SDValue();
+
+  const bool F32WithSSE1 = Subtarget.hasSSE1() && VT == MVT::f32;
+  const bool F64WithSSE2 = Subtarget.hasSSE2() && VT == MVT::f64;
+  if (!F32WithSSE1 && !F64WithSSE2)
+    return SDValue();
+
+  SDValue LogicOp0 = Src.getOperand(0);
+  SDValue LogicOp1 = Src.getOperand(1);
+  SDLoc LogicDL(Src);
+
+  // Given that 'Casted' is a single-use bitcast from a non-constant value of
+  // type VT, emit  logic'(X, bitcast(Other))  where X is the value under the
+  // cast. Returns an empty SDValue if the preconditions do not hold.
+  auto FoldThroughCast = [&](SDValue Casted, SDValue Other) -> SDValue {
+    if (!Src.hasOneUse() || Casted.getOpcode() != ISD::BITCAST ||
+        !Casted.hasOneUse())
+      return SDValue();
+    SDValue FPVal = Casted.getOperand(0);
+    if (FPVal.getValueType() != VT || isa<ConstantSDNode>(FPVal))
+      return SDValue();
+    SDValue OtherAsFP = DAG.getBitcast(VT, Other);
+    return DAG.getNode(FPOpcode, LogicDL, VT, FPVal, OtherAsFP);
+  };
+
+  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
+  if (SDValue Folded = FoldThroughCast(LogicOp0, LogicOp1))
+    return Folded;
+
+  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
+  return FoldThroughCast(LogicOp1, LogicOp0);
+}
+
+// Recognize a horizontal reduction written as a pyramid of
+// (BinOp X, (shuffle X, undef, <mask>)) nodes feeding an extract of element 0.
+// Returns the vector that is being reduced, or SDValue() if the pattern was
+// not matched.
+static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
+  // The pattern must end in an extract from index 0.
+  if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  if (!isNullConstant(Extract->getOperand(1)))
+    return SDValue();
+
+  SDValue Cur = Extract->getOperand(0);
+  const unsigned NumStages =
+      Log2_32(Cur.getValueType().getVectorNumElements());
+
+  // Walk from the extract towards the reduced vector. At each stage we expect
+  //   %s = shufflevector <8 x i32> %op, <8 x i32> undef, <mask>
+  //   %a = binop <8 x i32> %op, %s
+  // where, seen from the reduced vector downwards, a 3-stage pyramid uses
+  //   <4,5,6,7,u,u,u,u>
+  //   <2,3,u,u,u,u,u,u>
+  //   <1,u,u,u,u,u,u,u>
+  // so the stage nearest the extract has the narrowest mask.
+  for (unsigned Stage = 0; Stage != NumStages; ++Stage) {
+    if (Cur.getOpcode() != BinOp)
+      return SDValue();
+
+    SDValue First = Cur.getOperand(0);
+    SDValue Second = Cur.getOperand(1);
+
+    // Either operand may be the shuffle; the other one is followed upwards.
+    auto *Shuf = dyn_cast<ShuffleVectorSDNode>(First.getNode());
+    if (Shuf) {
+      Cur = Second;
+    } else {
+      Shuf = dyn_cast<ShuffleVectorSDNode>(Second.getNode());
+      Cur = First;
+    }
+
+    // The shuffle must read from the very value it is combined with.
+    if (!Shuf || Shuf->getOperand(0) != Cur)
+      return SDValue();
+
+    // Check the leading mask entries expected at this level of the pyramid.
+    const int Width = 1 << Stage;
+    for (int Lane = 0; Lane != Width; ++Lane)
+      if (Shuf->getMaskElt(Lane) != Width + Lane)
+        return SDValue();
+  }
+
+  return Cur;
+}
+
+// Given a select, detect the following pattern:
+// 1: %2 = zext <N x i8> %0 to <N x i32>
+// 2: %3 = zext <N x i8> %1 to <N x i32>
+// 3: %4 = sub nsw <N x i32> %2, %3
+// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
+// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
+// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+// This is useful as it is the input into a SAD pattern.
+//
+// On success Op0 and Op1 receive the two zero-extend nodes (%2 and %3). They
+// are also written when everything but the final zero-extend check matched.
+static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
+                              SDValue &Op1) {
+  // The select condition has to be a signed greater-than comparison.
+  SDValue Cmp = Select->getOperand(0);
+  if (Cmp.getOpcode() != ISD::SETCC)
+    return false;
+  if (cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETGT)
+    return false;
+
+  SDValue Diff = Select->getOperand(1);
+  SDValue NegDiff = Select->getOperand(2);
+
+  // The false value must be the negation of the true value, spelled 0 - Diff.
+  const bool IsNegation =
+      NegDiff.getOpcode() == ISD::SUB &&
+      ISD::isBuildVectorAllZeros(NegDiff.getOperand(0).getNode()) &&
+      NegDiff.getOperand(1) == Diff;
+  if (!IsNegation)
+    return false;
+
+  // The comparison must test that same difference ...
+  if (Cmp.getOperand(0) != Diff)
+    return false;
+
+  // ... against either all-zeros or all-ones.
+  SDNode *CmpRHS = Cmp.getOperand(1).getNode();
+  if (!ISD::isBuildVectorAllZeros(CmpRHS) && !ISD::isBuildVectorAllOnes(CmpRHS))
+    return false;
+
+  // The true value must itself be a subtraction of the two inputs.
+  if (Diff.getOpcode() != ISD::SUB)
+    return false;
+
+  Op0 = Diff.getOperand(0);
+  Op1 = Diff.getOperand(1);
+
+  // Both inputs of the subtraction must be zero-extends from i8 vectors.
+  auto IsZextFromI8Vector = [](const SDValue &V) {
+    return V.getOpcode() == ISD::ZERO_EXTEND &&
+           V.getOperand(0).getValueType().getVectorElementType() == MVT::i8;
+  };
+  return IsZextFromI8Vector(Op0) && IsZextFromI8Vector(Op1);
+}
+
+// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
+// to these zexts. Inputs narrower than 128 bits are padded with zero vectors
+// up to 128 bits first; the result type is a vector of i64 of the same total
+// width as the padded inputs.
+static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
+                            const SDValue &Zext1, const SDLoc &DL) {
+  // Pick the operand width for the PSADBW: at least 128 bits.
+  EVT InVT = Zext0.getOperand(0).getValueType();
+  const unsigned InBits = InVT.getSizeInBits();
+  const unsigned RegSize = std::max(128u, InBits);
+
+  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
+  // fill in the missing vector elements with 0 by concatenating zero vectors
+  // after the real input.
+  const unsigned NumPieces = RegSize / InBits;
+  SDValue ZeroPiece = DAG.getConstant(0, DL, InVT);
+  SmallVector<SDValue, 16> Pieces(NumPieces, ZeroPiece);
+  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
+
+  Pieces[0] = Zext0.getOperand(0);
+  SDValue Wide0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ByteVecVT, Pieces);
+
+  Pieces[0] = Zext1.getOperand(0);
+  SDValue Wide1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ByteVecVT, Pieces);
+
+  // Actually build the SAD.
+  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
+  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, Wide0, Wide1);
+}
+
+/// Try to turn an extract that is the root of an add-reduction over
+/// |zext(a) - zext(b)| (with i8 inputs extended to i32) into a PSADBW followed
+/// by a short reduction over the PSADBW result.
+///
+/// Returns the i32 replacement for \p Extract, or an empty SDValue if the
+/// pattern does not match or the subtarget/type is not supported.
+static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  // PSADBW is only supported on SSE2 and up.
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  // Verify the type we're extracting from is appropriate
+  // TODO: There's nothing special about i32, any integer type above i16 should
+  // work just as well.
+  EVT VT = Extract->getOperand(0).getValueType();
+  if (!VT.isSimple() || VT.getVectorElementType() != MVT::i32)
+    return SDValue();
+
+  const unsigned RegSize =
+      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);
+
+  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // TODO: We should be able to handle larger vectors by splitting them before
+  // feeding them into several SADs, and then reducing over those.
+  if (VT.getSizeInBits() / 4 > RegSize)
+    return SDValue();
+
+  // Match shuffle + add pyramid; its root must be the select of an abs-diff
+  // pattern.
+  SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+  if (!Root || Root.getOpcode() != ISD::VSELECT)
+    return SDValue();
+
+  // Check whether we have an abs-diff pattern feeding into the select.
+  SDValue Zext0, Zext1;
+  if (!detectZextAbsDiff(Root, Zext0, Zext1))
+    return SDValue();
+
+  // Create the SAD instruction
+  SDLoc DL(Extract);
+  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
+  MVT SadVT = SAD.getSimpleValueType();
+
+  // If the original vector was wider than 8 elements, sum over the results
+  // in the SAD vector with a shuffle + add pyramid, halving the number of
+  // live lanes each step.
+  const unsigned Stages = Log2_32(VT.getVectorNumElements());
+  if (Stages > 3) {
+    const unsigned SadElems = SadVT.getVectorNumElements();
+
+    for (unsigned Half = 1u << (Stages - 4); Half != 0; Half >>= 1) {
+      SmallVector<int, 16> Mask(SadElems, -1);
+      for (unsigned Lane = 0; Lane != Half; ++Lane)
+        Mask[Lane] = Half + Lane;
+
+      SDValue Upper =
+          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
+      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Upper);
+    }
+  }
+
+  // Return the lowest i32.
+  MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+  SDValue AsI32Vec = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AsI32Vec,
+                     Extract->getOperand(1));
+}
+
+/// DAG combine for EXTRACT_VECTOR_ELT.
+///
+/// Tries several folds in order: extract-of-shuffle-of-load, mmx to i32
+/// conversion, constant folding of an i1 extract from a bitcast constant, and
+/// the sum-of-absolute-differences pattern.
+///
+/// Finally: detect vector gather/scatter index generation and convert it from
+/// being a bunch of shuffles and extracts into a somewhat faster sequence.
+/// For i686, the best sequence is apparently storing the value and loading
+/// scalars back, while for x64 we should use 64-bit extracts and shifts.
+/// That last transform rewrites all extracts of the vector in place and
+/// therefore returns an empty SDValue.
+static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
+ return NewOp;
+
+ SDValue InputVector = N->getOperand(0);
+ SDLoc dl(InputVector);
+ // Detect mmx to i32 conversion through a v2i32 elt extract.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ N->getValueType(0) == MVT::i32 &&
+ InputVector.getValueType() == MVT::v2i32 &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ N->getConstantOperandVal(1) == 0) {
+ SDValue MMXSrc = InputVector.getOperand(0);
+
+ // The bitcast source is a direct mmx result.
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
+ }
+
+ EVT VT = N->getValueType(0);
+
+ // Constant-fold an i1 extract at a constant index from a bitcast of an
+ // integer constant: the result is the bit of the constant at that index.
+ if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
+ InputVector.getOpcode() == ISD::BITCAST &&
+ isa<ConstantSDNode>(InputVector.getOperand(0))) {
+ uint64_t ExtractedElt =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ uint64_t InputValue =
+ cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+ uint64_t Res = (InputValue >> ExtractedElt) & 1;
+ return DAG.getConstant(Res, dl, MVT::i1);
+ }
+
+ // Check whether this extract is the root of a sum of absolute differences
+ // pattern. This has to be done here because we really want it to happen
+ // pre-legalization.
+ if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
+ return SAD;
+
+ // Only operate on vectors of 4 elements, where the alternative shuffling
+ // gets to be more expensive.
+ if (InputVector.getValueType() != MVT::v4i32)
+ return SDValue();
+
+ // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+ // single use which is a sign-extend or zero-extend, and all elements are
+ // used.
+ SmallVector<SDNode *, 4> Uses;
+ // Bitmask of the lanes seen so far; bit i is set once lane i is extracted.
+ unsigned ExtractedElements = 0;
+ for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+ UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+ // Bail out if some other result of the defining node is used.
+ if (UI.getUse().getResNo() != InputVector.getResNo())
+ return SDValue();
+
+ SDNode *Extract = *UI;
+ if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ if (Extract->getValueType(0) != MVT::i32)
+ return SDValue();
+ if (!Extract->hasOneUse())
+ return SDValue();
+ if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+ Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+ if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+ return SDValue();
+
+ // Record which element was extracted.
+ ExtractedElements |=
+ 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+ Uses.push_back(Extract);
+ }
+
+ // If not all the elements were used, this may not be worthwhile.
+ // (15 == 0b1111, i.e. exactly lanes 0..3 were extracted.)
+ if (ExtractedElements != 15)
+ return SDValue();
+
+ // Ok, we've now decided to do the transformation.
+ // If 64-bit shifts are legal, use the extract-shift sequence,
+ // otherwise bounce the vector off the cache.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Vals[i] will hold the scalar replacement for lane i.
+ SDValue Vals[4];
+
+ if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
+ // View the vector as two i64 halves; each half yields two i32 lanes, the
+ // low one by truncation and the high one by shifting right by 32 first.
+ SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
+ auto &DL = DAG.getDataLayout();
+ EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
+ SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+ DAG.getConstant(0, dl, VecIdxTy));
+ SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+ DAG.getConstant(1, dl, VecIdxTy));
+
+ SDValue ShAmt = DAG.getConstant(
+ 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
+ Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
+ Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
+ Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
+ Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
+ } else {
+ // Store the value to a temporary stack slot.
+ SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+ MachinePointerInfo());
+
+ EVT ElementType = InputVector.getValueType().getVectorElementType();
+ unsigned EltSize = ElementType.getSizeInBits() / 8;
+
+ // Replace each use (extract) with a load of the appropriate element.
+ for (unsigned i = 0; i < 4; ++i) {
+ uint64_t Offset = EltSize * i;
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
+
+ SDValue ScalarAddr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
+
+ // Load the scalar.
+ Vals[i] =
+ DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
+ }
+ }
+
+ // Replace the extracts
+ for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+ UE = Uses.end(); UI != UE; ++UI) {
+ SDNode *Extract = *UI;
+
+ SDValue Idx = Extract->getOperand(1);
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
+ }
+
+ // The replacement was made in place; don't return anything.
+ return SDValue();
+}
+
+/// If a vector select has an operand that is -1 or 0, simplify the select to a
+/// bitwise logic operation.
+///
+/// With AVX-512 and an i1-element mask, a select whose true value is all
+/// zeros is instead canonicalized to a select on the inverted mask with the
+/// arms swapped.
+///
+/// \param N  the node to combine; anything other than VSELECT is rejected.
+/// \returns the simplified value, or an empty SDValue if no fold applies.
+static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
+                                                const X86Subtarget &Subtarget) {
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  EVT VT = LHS.getValueType();
+  EVT CondVT = Cond.getValueType();
+  SDLoc DL(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (N->getOpcode() != ISD::VSELECT)
+    return SDValue();
+
+  // Checked up front because the code below queries the vector element type.
+  assert(CondVT.isVector() && "Vector select expects a vector selector!");
+
+  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+  // Check if the first operand is all zeros. This situation only applies to
+  // avx512, and only to i1-element masks: XOR with a splat of 1 is a logical
+  // NOT for i1 elements, but for wider 0/-1 mask elements it would merely
+  // flip the low bit and leave the mask effectively un-inverted while the
+  // select arms are swapped.
+  if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
+      CondVT.getVectorElementType() == MVT::i1) {
+    // Invert the cond to not(cond) : xor(op,allones)=not(op)
+    SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
+                                  DAG.getConstant(1, DL, CondVT));
+    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+    return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+  }
+
+  // To use the condition operand as a bitwise mask, it must have elements that
+  // are the same size as the select elements. Ie, the condition operand must
+  // have already been promoted from the IR select condition type <N x i1>.
+  // Don't check if the types themselves are equal because that excludes
+  // vector floating-point selects.
+  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+    return SDValue();
+
+  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+  // Try to invert the condition if true value is not all 1s and false value is
+  // not all 0s.
+  if (!TValIsAllOnes && !FValIsAllZeros &&
+      // Check if the selector will be produced by CMPP*/PCMP*.
+      Cond.getOpcode() == ISD::SETCC &&
+      // Check if SETCC has already been promoted.
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
+          CondVT) {
+    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+    if (TValIsAllZeros || FValIsAllOnes) {
+      // Rebuild the compare with the inverse predicate and swap the arms so
+      // that the all-ones/all-zeros operand ends up in the position the
+      // folds below look at.
+      SDValue CC = Cond.getOperand(2);
+      ISD::CondCode NewCC =
+          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                               Cond.getOperand(0).getValueType().isInteger());
+      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
+                          NewCC);
+      std::swap(LHS, RHS);
+      TValIsAllOnes = FValIsAllOnes;
+      FValIsAllZeros = TValIsAllZeros;
+    }
+  }
+
+  if (!TValIsAllOnes && !FValIsAllZeros)
+    return SDValue();
+
+  // vselect Cond, -1,  0 --> Cond
+  // vselect Cond, -1,  X --> or  Cond, X
+  // vselect Cond,  X,  0 --> and Cond, X
+  SDValue Ret;
+  if (TValIsAllOnes && FValIsAllZeros)
+    Ret = Cond;
+  else if (TValIsAllOnes)
+    Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
+  else
+    Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getBitcast(CondVT, LHS));
+
+  return DAG.getBitcast(VT, Ret);
+}
+
+/// Fold a select between two integer constants into arithmetic on the
+/// zero-extended condition:
+///   C ? pow2 : 0      -->  zext(C) << log2(pow2)
+///   C ? cst+1 : cst   -->  zext(C) + cst
+///   C ? a : b         -->  zext(C) * (a - b) + b   for i32/i64 when (a - b)
+///                          is one of 1, 2, 3, 4, 5, 8, 9 (LEA-friendly)
+/// Returns an empty SDValue when neither operand pair nor type qualifies.
+static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  SDLoc DL(N);
+
+  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
+  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
+  if (!TrueC || !FalseC)
+    return SDValue();
+
+  // Don't do this for crazy integer types.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
+    return SDValue();
+
+  // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+  // so that TrueC (the true value) is larger than FalseC.
+  const bool CondIsInvertible =
+      Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+      (Cond.getOpcode() == ISD::XOR &&  // xor(X, C) -> invertible.
+       isa<ConstantSDNode>(Cond.getOperand(1)));
+  bool NeedsCondInvert = false;
+  if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+      CondIsInvertible) {
+    NeedsCondInvert = true;
+    std::swap(TrueC, FalseC);
+  }
+
+  // Produce the condition, inverted if the constants were swapped above, and
+  // zero-extended to ExtVT.
+  auto GetExtendedCond = [&](EVT ExtVT) -> SDValue {
+    SDValue C = Cond;
+    if (NeedsCondInvert)
+      C = DAG.getNode(ISD::XOR, DL, C.getValueType(), C,
+                      DAG.getConstant(1, DL, C.getValueType()));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, C);
+  };
+
+  const APInt &TrueVal = TrueC->getAPIntValue();
+  const APInt &FalseVal = FalseC->getAPIntValue();
+
+  // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
+  if (FalseVal == 0 && TrueVal.isPowerOf2()) {
+    Cond = GetExtendedCond(LHS.getValueType());
+    unsigned ShAmt = TrueVal.logBase2();
+    return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+                       DAG.getConstant(ShAmt, DL, MVT::i8));
+  }
+
+  // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
+  if (FalseVal + 1 == TrueVal) {
+    Cond = GetExtendedCond(FalseC->getValueType(0));
+    return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                       SDValue(FalseC, 0));
+  }
+
+  // Optimize cases that will turn into an LEA instruction. This requires
+  // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+  const bool IsI32 = N->getValueType(0) == MVT::i32;
+  const bool IsI64 = N->getValueType(0) == MVT::i64;
+  if (!IsI32 && !IsI64)
+    return SDValue();
+
+  uint64_t RawDiff = TrueC->getZExtValue() - FalseC->getZExtValue();
+  if (IsI32)
+    RawDiff = (unsigned)RawDiff;
+
+  //   1: result = add base, cond
+  //   2: result = lea base(    , cond*2)
+  //   3: result = lea base(cond, cond*2)
+  //   4: result = lea base(    , cond*4)
+  //   5: result = lea base(cond, cond*4)
+  //   8: result = lea base(    , cond*8)
+  //   9: result = lea base(cond, cond*8)
+  const bool IsFastMultiplier =
+      RawDiff >= 1 && RawDiff <= 9 && RawDiff != 6 && RawDiff != 7;
+  if (!IsFastMultiplier)
+    return SDValue();
+
+  APInt Scale = TrueVal - FalseVal;
+  Cond = GetExtendedCond(FalseC->getValueType(0));
+
+  // Scale the condition by the difference.
+  if (Scale != 1)
+    Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                       DAG.getConstant(Scale, DL, Cond.getValueType()));
+
+  // Add the base if non-zero.
+  if (FalseVal != 0)
+    Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                       SDValue(FalseC, 0));
+  return Cond;
+}
+
+// Called for an operand of a vselect. When that operand is a bitcast of a
+// single-use shuffle-like node that can equally be expressed in the bitcast's
+// result type, rebuild the node in that type with the bitcasts pushed onto its
+// two vector inputs. This allows more opportunities for pattern matching
+// masked instructions.
+//
+// Returns true if OrigOp was replaced (through DCI.CombineTo), false if
+// nothing was changed.
+static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  // Only bitcasts are interesting here.
+  if (OrigOp.getOpcode() != ISD::BITCAST)
+    return false;
+
+  // The node underneath must feed nothing but this bitcast; otherwise the
+  // rewrite would replicate the operation.
+  SDValue Src = OrigOp.getOperand(0);
+  if (!Src.hasOneUse())
+    return false;
+
+  MVT VT = OrigOp.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  SDLoc DL(Src.getNode());
+
+  // Bitcast both vector inputs to VT, queue them for further combining and
+  // replace OrigOp with the node rebuilt in VT. The third operand is passed
+  // through untouched.
+  auto RebuildInVT = [&](unsigned NewOpc, SDValue In0, SDValue In1,
+                         SDValue In2) {
+    In0 = DAG.getBitcast(VT, In0);
+    DCI.AddToWorklist(In0.getNode());
+    In1 = DAG.getBitcast(VT, In1);
+    DCI.AddToWorklist(In1.getNode());
+    SDValue Rebuilt = DAG.getNode(NewOpc, DL, VT, In0, In1, In2);
+    DCI.CombineTo(OrigOp.getNode(), Rebuilt);
+    return true;
+  };
+
+  unsigned Opc = Src.getOpcode();
+
+  // PALIGNR can be converted to VALIGND/Q for 128-bit vectors; from there on
+  // it is handled exactly like a VALIGN.
+  if (Opc == X86ISD::PALIGNR) {
+    if (!VT.is128BitVector())
+      return false;
+    Opc = X86ISD::VALIGN;
+  }
+
+  if (Opc == X86ISD::VALIGN) {
+    if (!(EltVT == MVT::i32 || EltVT == MVT::i64))
+      return false;
+    uint64_t SrcImm = cast<ConstantSDNode>(Src.getOperand(2))->getZExtValue();
+    MVT SrcEltVT = Src.getSimpleValueType().getVectorElementType();
+    unsigned ShiftBits = SrcImm * SrcEltVT.getSizeInBits();
+    unsigned DstEltBits = EltVT.getSizeInBits();
+    // The shift must be a whole number of elements in the new type.
+    if ((ShiftBits % DstEltBits) != 0)
+      return false;
+    uint64_t DstImm = ShiftBits / DstEltBits;
+    SDValue ImmOp = DAG.getConstant(DstImm, DL, MVT::i8);
+    return RebuildInVT(Opc, Src.getOperand(0), Src.getOperand(1), ImmOp);
+  }
+
+  if (Opc == X86ISD::SHUF128) {
+    const unsigned DstEltBits = EltVT.getSizeInBits();
+    if (DstEltBits != 32 && DstEltBits != 64)
+      return false;
+    // Only change element size, not type.
+    if (VT.isInteger() != Src.getSimpleValueType().isInteger())
+      return false;
+    return RebuildInVT(Opc, Src.getOperand(0), Src.getOperand(1),
+                       Src.getOperand(2));
+  }
+
+  return false;
+}
+
+/// Do target-specific dag combines on SELECT and VSELECT nodes.
+///
+/// The combines are attempted in this order; the first one that applies wins:
+///  1. A floating-point select whose arms are the operands of its own SETCC
+///     condition becomes X86ISD::FMIN / X86ISD::FMAX when the NaN and signed
+///     zero behaviour allows it.
+///  2. With AVX-512 but without both BWI and VLX, a vXi1-conditioned select of
+///     128/256-bit i8/i16 vectors gets its condition sign-extended to VT.
+///  3. combineSelectOfTwoConstants.
+///  4. A scalar SELECT on SETLT/SETGT of its own arms is canonicalized to
+///     SETLE/SETGE.
+///  5. A VSELECT against a zero vector is matched to X86ISD::SUBUS.
+///  6. combineVSelectWithAllOnesOrZeros.
+///  7. A dynamic VSELECT has its condition simplified to the high bit of each
+///     element and is turned into X86ISD::SHRUNKBLEND.
+///  8. Bitcasts feeding a vXi1-conditioned VSELECT are pushed into the inputs
+///     of the bitcasted operation (combineBitcastForMaskedOp).
+///
+/// \returns the replacement value, SDValue(N, 0) when step 8 already updated
+/// the DAG through \p DCI, or an empty SDValue when nothing was done (this
+/// includes step 7, which rewrites uses in place and then returns empty).
+static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  SDValue Cond = N->getOperand(0);
+  // Get the LHS/RHS of the select.
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  EVT VT = LHS.getValueType();
+  EVT CondVT = Cond.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+  // instructions match the semantics of the common C idiom x<y?x:y but not
+  // x<=y?x:y, because of how they handle negative zero (which can be
+  // ignored in unsafe-math mode).
+  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
+  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+      VT != MVT::f80 && VT != MVT::f128 &&
+      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+      (Subtarget.hasSSE2() ||
+       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    unsigned Opcode = 0;
+    // Check for x CC y ? x : y.
+    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+      switch (CC) {
+      default: break;
+      case ISD::SETULT:
+        // Converting this to a min would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+          if (!DAG.getTarget().Options.UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETOLE:
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETULE:
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+        LLVM_FALLTHROUGH;
+      case ISD::SETOLT:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETOGE:
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETUGT:
+        // Converting this to a max would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+          if (!DAG.getTarget().Options.UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETUGE:
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+        LLVM_FALLTHROUGH;
+      case ISD::SETOGT:
+      case ISD::SETGT:
+      case ISD::SETGE:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    // Check for x CC y ? y : x -- a min/max with reversed arms.
+    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+      switch (CC) {
+      default: break;
+      case ISD::SETOGE:
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETUGT:
+        // Converting this to a min would handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+          break;
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETUGE:
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+        LLVM_FALLTHROUGH;
+      case ISD::SETOGT:
+      case ISD::SETGT:
+      case ISD::SETGE:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETULT:
+        // Converting this to a max would handle NaNs incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+          break;
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETOLE:
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETULE:
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+        LLVM_FALLTHROUGH;
+      case ISD::SETOLT:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    }
+
+    if (Opcode)
+      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+  }
+
+  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+  // lowering on KNL. In this case we convert it to
+  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
+  // The same situation for all 128 and 256-bit vectors of i8 and i16.
+  // Since SKX these selects have a proper lowering.
+  if (Subtarget.hasAVX512() && CondVT.isVector() &&
+      CondVT.getVectorElementType() == MVT::i1 &&
+      (VT.is128BitVector() || VT.is256BitVector()) &&
+      (VT.getVectorElementType() == MVT::i8 ||
+       VT.getVectorElementType() == MVT::i16) &&
+      !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
+    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+    DCI.AddToWorklist(Cond.getNode());
+    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
+  }
+
+  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
+    return V;
+
+  // Canonicalize max and min:
+  // (x > y) ? x : y -> (x >= y) ? x : y
+  // (x < y) ? x : y -> (x <= y) ? x : y
+  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+  // the need for an extra compare
+  // against zero. e.g.
+  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
+  // subl   %esi, %edi
+  // testl  %edi, %edi
+  // movl   $0, %eax
+  // cmovgl %edi, %eax
+  // =>
+  // xorl   %eax, %eax
+  // subl   %esi, %edi
+  // cmovsl %eax, %edi
+  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    switch (CC) {
+    default: break;
+    case ISD::SETLT:
+    case ISD::SETGT: {
+      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
+      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+    }
+    }
+  }
+
+  // Early exit check
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
+  // Match VSELECTs into subs with unsigned saturation.
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+       (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+    // left side invert the predicate to simplify logic below.
+    SDValue Other;
+    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+      Other = RHS;
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+      Other = LHS;
+    }
+
+    if (Other.getNode() && Other->getNumOperands() == 2 &&
+        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+      SDValue CondRHS = Cond->getOperand(1);
+
+      // Look for a general sub with unsigned saturation first.
+      // x >= y ? x-y : 0 --> subus x, y
+      // x > y ? x-y : 0 --> subus x, y
+      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
+        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
+          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
+            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
+              // If the RHS is a constant we have to reverse the const
+              // canonicalization.
+              // x > C-1 ? x+-C : 0 --> subus x, C
+              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+                  CondRHSConst->getAPIntValue() ==
+                      (-OpRHSConst->getAPIntValue() - 1))
+                return DAG.getNode(
+                    X86ISD::SUBUS, DL, VT, OpLHS,
+                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
+
+          // Another special case: If C was a sign bit, the sub has been
+          // canonicalized into a xor.
+          // FIXME: Would it be better to use computeKnownBits to determine
+          // whether it's safe to decanonicalize the xor?
+          // x s< 0 ? x^C : 0 --> subus x, C
+          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+              OpRHSConst->getAPIntValue().isSignBit())
+            // Note that we have to rebuild the RHS constant here to ensure we
+            // don't rely on particular values of undef lanes.
+            return DAG.getNode(
+                X86ISD::SUBUS, DL, VT, OpLHS,
+                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
+        }
+    }
+  }
+
+  if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, Subtarget))
+    return V;
+
+  // If this is a *dynamic* select (non-constant condition) and we can match
+  // this node with one of the variable blend instructions, restructure the
+  // condition so that the blends can use the high bit of each element and use
+  // SimplifyDemandedBits to simplify the condition operand.
+  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+      !DCI.isBeforeLegalize() &&
+      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
+    unsigned BitWidth = Cond.getScalarValueSizeInBits();
+
+    // Don't optimize vector selects that map to mask-registers.
+    if (BitWidth == 1)
+      return SDValue();
+
+    // We can only handle the cases where VSELECT is directly legal on the
+    // subtarget. We custom lower VSELECT nodes with constant conditions and
+    // this makes it hard to see whether a dynamic VSELECT will correctly
+    // lower, so we both check the operation's status and explicitly handle the
+    // cases where a *dynamic* blend will fail even though a constant-condition
+    // blend could be custom lowered.
+    // FIXME: We should find a better way to handle this class of problems.
+    // Potentially, we should combine constant-condition vselect nodes
+    // pre-legalization into shuffles and not mark as many types as custom
+    // lowered.
+    if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+      return SDValue();
+    // FIXME: We don't support i16-element blends currently. We could and
+    // should support them by making *all* the bits in the condition be set
+    // rather than just the high bit and using an i8-element blend.
+    if (VT.getVectorElementType() == MVT::i16)
+      return SDValue();
+    // Dynamic blending was only available from SSE4.1 onward.
+    if (VT.is128BitVector() && !Subtarget.hasSSE41())
+      return SDValue();
+    // Byte blends are only available in AVX2
+    if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
+      return SDValue();
+
+    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+                                          DCI.isBeforeLegalizeOps());
+    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+                                 TLO)) {
+      // If we changed the computation somewhere in the DAG, this change
+      // will affect all users of Cond.
+      // Make sure it is fine and update all the nodes so that we do not
+      // use the generic VSELECT anymore. Otherwise, we may perform
+      // wrong optimizations as we messed up with the actual expectation
+      // for the vector boolean values.
+      if (Cond != TLO.Old) {
+        // Check all uses of the condition operand to see whether it will be
+        // consumed by non-BLEND instructions, which may depend on all bits
+        // being set properly.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          if (I->getOpcode() != ISD::VSELECT)
+            // TODO: Add other opcodes eventually lowered into BLEND.
+            return SDValue();
+
+        // Update all the users of the condition, before committing the change,
+        // so that the VSELECT optimizations that expect the correct vector
+        // boolean value will not be triggered.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          DAG.ReplaceAllUsesOfValueWith(
+              SDValue(*I, 0),
+              DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
+                          Cond, I->getOperand(1), I->getOperand(2)));
+        DCI.CommitTargetLoweringOpt(TLO);
+        return SDValue();
+      }
+      // At this point, only Cond is changed. Change the condition
+      // just for N to keep the opportunity to optimize all other
+      // users their own way.
+      DAG.ReplaceAllUsesOfValueWith(
+          SDValue(N, 0),
+          DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
+                      TLO.New, N->getOperand(1), N->getOperand(2)));
+      return SDValue();
+    }
+  }
+
+  // Look for vselects with LHS/RHS being bitcasted from an operation that
+  // can be executed on another type. Push the bitcast to the inputs of
+  // the operation. This exposes opportunities for using masking instructions.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
+      CondVT.getVectorElementType() == MVT::i1) {
+    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
+      return SDValue(N, 0);
+    if (combineBitcastForMaskedOp(RHS, DAG, DCI))
+      return SDValue(N, 0);
+  }
+
+  return SDValue();
+}
+
+/// Reuse the EFLAGS of a LOCKed add/sub instead of comparing its result.
+///
+/// Rewrites
+///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
+/// into
+///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
+/// Only some op/cc combinations are legal, see the table in the body.
+///
+/// \param Cmp the EFLAGS-producing node to inspect.
+/// \param CC  the condition code it is used with; updated on success only.
+/// \returns the node from lowerAtomicArithWithLOCK, or an empty SDValue if
+/// the pattern does not match.
+static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
+                                       SelectionDAG &DAG) {
+  // Accept a CMP, or a SUB whose arithmetic result is unused.
+  const bool IsCmpLike =
+      Cmp.getOpcode() == X86ISD::CMP ||
+      (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0));
+  if (!IsCmpLike)
+    return SDValue();
+
+  // This only applies to variations of the common case:
+  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
+  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
+  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
+  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
+  // Using the proper condcodes (see below), overflow is checked for.
+
+  // FIXME: We can generalize both constraints:
+  // - XOR/OR/AND (if they were made to survive AtomicExpand)
+  // - LHS != 1
+  // if the result is compared.
+
+  SDValue AtomicOp = Cmp.getOperand(0);
+  SDValue CmpAgainst = Cmp.getOperand(1);
+
+  // The atomic result may only feed this compare.
+  if (!AtomicOp.hasOneUse())
+    return SDValue();
+
+  // The comparison has to be against the constant zero.
+  auto *CmpAgainstC = dyn_cast<ConstantSDNode>(CmpAgainst);
+  if (!CmpAgainstC || CmpAgainstC->getZExtValue() != 0)
+    return SDValue();
+
+  const unsigned AtomicOpc = AtomicOp.getOpcode();
+  const bool IsAtomicSub = AtomicOpc == ISD::ATOMIC_LOAD_SUB;
+  if (AtomicOpc != ISD::ATOMIC_LOAD_ADD && !IsAtomicSub)
+    return SDValue();
+
+  // The amount added/subtracted must be a constant.
+  auto *AmountC = dyn_cast<ConstantSDNode>(AtomicOp.getOperand(2));
+  if (!AmountC)
+    return SDValue();
+
+  // Express a subtraction as an addition of the negated amount.
+  APInt Addend = AmountC->getAPIntValue();
+  if (IsAtomicSub)
+    Addend = -Addend;
+
+  // NOTE(review): these are APInt-vs-uint64_t comparisons, so "-1" means
+  // uint64_t(-1); for operands narrower than 64 bits that presumably never
+  // matches. Confirm whether an all-ones test was intended.
+  const bool AddsOne = Addend == 1;
+  const bool AddsMinusOne = Addend == -1;
+
+  // Pick the condition that tests the pre-increment value through the flags
+  // of the incremented one.
+  X86::CondCode NewCC;
+  switch (CC) {
+  default:
+    return SDValue();
+  case X86::COND_S:
+    if (!AddsOne)
+      return SDValue();
+    NewCC = X86::COND_LE;
+    break;
+  case X86::COND_NS:
+    if (!AddsOne)
+      return SDValue();
+    NewCC = X86::COND_G;
+    break;
+  case X86::COND_G:
+    if (!AddsMinusOne)
+      return SDValue();
+    NewCC = X86::COND_GE;
+    break;
+  case X86::COND_LE:
+    if (!AddsMinusOne)
+      return SDValue();
+    NewCC = X86::COND_L;
+    break;
+  }
+  CC = NewCC;
+
+  // Replace the atomic node: its value result becomes undef and its second
+  // result is rerouted to the second result of the LOCKed operation.
+  SDValue LockOp = lowerAtomicArithWithLOCK(AtomicOp, DAG);
+  SDValue Undef = DAG.getUNDEF(AtomicOp.getValueType());
+  DAG.ReplaceAllUsesOfValueWith(AtomicOp.getValue(0), Undef);
+  DAG.ReplaceAllUsesOfValueWith(AtomicOp.getValue(1), LockOp.getValue(1));
+  return LockOp;
+}
+
+// Look through a boolean test of a value produced by X86ISD::SETCC (or a
+// SETCC_CARRY / 0-1 CMOV) and hand back the flags that value was computed
+// from, together with the condition code to test them with.
+//
+// Patterns simplified:
+//   (Op (CMP (SETCC Cond EFLAGS) 1) EQ)  or
+//   (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)   -->  (Op EFLAGS Cond)
+//
+//   (Op (CMP (SETCC Cond EFLAGS) 0) EQ)  or
+//   (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)   -->  (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+// CC is only updated when a non-empty value is returned.
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+  // Accept a CMP, or a SUB whose arithmetic result is unused.
+  const bool IsCmpLike =
+      Cmp.getOpcode() == X86ISD::CMP ||
+      (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0));
+  if (!IsCmpLike)
+    return SDValue();
+
+  // Only equality tests treat the compared value as a boolean.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  // One side of the compare has to be a constant; the other side is the
+  // candidate SETCC (possibly extended/truncated).
+  SDValue CmpOp0 = Cmp.getOperand(0);
+  SDValue CmpOp1 = Cmp.getOperand(1);
+  SDValue SetCC;
+  const ConstantSDNode *C = dyn_cast<ConstantSDNode>(CmpOp0);
+  if (C) {
+    SetCC = CmpOp1;
+  } else {
+    C = dyn_cast<ConstantSDNode>(CmpOp1);
+    if (!C)
+      return SDValue();
+    SetCC = CmpOp0;
+  }
+
+  // The constant has to be 0 or 1.
+  const uint64_t CmpVal = C->getZExtValue();
+  if (CmpVal > 1)
+    return SDValue();
+  const bool checkAgainstTrue = CmpVal == 1; // Is it a comparison against 1?
+  // "== 0" and "!= 1" both test the negated condition.
+  bool needOppositeCond = (CC == X86::COND_E) != checkAgainstTrue;
+
+  // Peel off (zext $x), (trunc $x) and (and $x, 1) wrappers.
+  bool truncatedToBoolWithAnd = false;
+  for (;;) {
+    const unsigned WrapOpc = SetCC.getOpcode();
+    if (WrapOpc == ISD::ZERO_EXTEND || WrapOpc == ISD::TRUNCATE) {
+      SetCC = SetCC.getOperand(0);
+      continue;
+    }
+    if (WrapOpc != ISD::AND)
+      break;
+
+    // For an AND, continue with the operand that is not the constant 1.
+    int OtherIdx = -1;
+    if (isOneConstant(SetCC.getOperand(0)))
+      OtherIdx = 1;
+    if (isOneConstant(SetCC.getOperand(1)))
+      OtherIdx = 0;
+    if (OtherIdx < 0)
+      break;
+    SetCC = SetCC.getOperand(OtherIdx);
+    truncatedToBoolWithAnd = true;
+  }
+
+  switch (SetCC.getOpcode()) {
+  case X86ISD::SETCC_CARRY:
+    // SETCC_CARRY yields R = CF ? ~0 : 0. Comparing that against 1 is only
+    // meaningful if the value was first reduced to 0/1 with an 'and'.
+    if (checkAgainstTrue && !truncatedToBoolWithAnd)
+      break;
+    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
+           "Invalid use of SETCC_CARRY!");
+    LLVM_FALLTHROUGH;
+  case X86ISD::SETCC:
+    // Report the SETCC's own condition, inverted if required.
+    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(1);
+  case X86ISD::CMOV: {
+    // The CMOV must select between the canonical booleans 0 and 1.
+    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+    // The true value must be a constant.
+    if (!TVal)
+      return SDValue();
+    if (!FVal) {
+      // A non-constant false value is only accepted for rdrand/rdseed,
+      // where 0 is set if false cond is found.
+      SDValue Src = SetCC.getOperand(0);
+      // Skip 'zext' or 'trunc' node.
+      if (Src.getOpcode() == ISD::ZERO_EXTEND ||
+          Src.getOpcode() == ISD::TRUNCATE)
+        Src = Src.getOperand(0);
+      const bool IsRandom = Src.getOpcode() == X86ISD::RDRAND ||
+                            Src.getOpcode() == X86ISD::RDSEED;
+      if (!IsRandom || Src.getResNo() != 0)
+        return SDValue();
+    }
+    // A constant false value other than 0 has to be exactly 1, and then the
+    // sense of the test is flipped.
+    const bool FValIsNonZero = FVal && FVal->getZExtValue() != 0;
+    if (FValIsNonZero) {
+      if (FVal->getZExtValue() != 1)
+        return SDValue();
+      needOppositeCond = !needOppositeCond;
+    }
+    // The true value has to be the other boolean.
+    const uint64_t ExpectedTVal = FValIsNonZero ? 0 : 1;
+    if (TVal->getZExtValue() != ExpectedTVal)
+      return SDValue();
+    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(3);
+  }
+  }
+
+  return SDValue();
+}
+
+/// Recognize an AND/OR of two X86ISD::SETCC nodes that read the same EFLAGS.
+/// Matches:
+///   (X86or (X86setcc) (X86setcc))
+///   (X86cmp (and (X86setcc) (X86setcc)), 0)
+///
+/// \param Cond  the node to inspect; a CMP against zero is looked through.
+/// \param CC0   [out] condition code of the first SETCC.
+/// \param CC1   [out] condition code of the second SETCC.
+/// \param Flags [out] the flags value shared by both SETCCs.
+/// \param isAnd [out] true for AND, false for OR. Written once the optional
+///              CMP has been accepted, even if the match fails afterwards.
+/// \returns true on a match; CC0, CC1 and Flags are only written then.
+static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
+                                           X86::CondCode &CC1, SDValue &Flags,
+                                           bool &isAnd) {
+  // Look through a compare against zero.
+  if (Cond->getOpcode() == X86ISD::CMP) {
+    if (!isNullConstant(Cond->getOperand(1)))
+      return false;
+
+    Cond = Cond->getOperand(0);
+  }
+
+  // Classify the logic operation; both the generic and the X86 flavours
+  // are accepted.
+  const unsigned LogicOpc = Cond->getOpcode();
+  const bool IsAndNode = LogicOpc == ISD::AND || LogicOpc == X86ISD::AND;
+  const bool IsOrNode = LogicOpc == ISD::OR || LogicOpc == X86ISD::OR;
+
+  isAnd = false;
+  if (!IsAndNode && !IsOrNode)
+    return false;
+  isAnd = IsAndNode;
+
+  SDValue First = Cond->getOperand(0);
+  SDValue Second = Cond->getOperand(1);
+
+  // Both operands have to be SETCCs ...
+  if (First.getOpcode() != X86ISD::SETCC ||
+      Second.getOpcode() != X86ISD::SETCC)
+    return false;
+  // ... that use the same flags value.
+  if (First->getOperand(1) != Second->getOperand(1))
+    return false;
+
+  CC0 = (X86::CondCode)First->getConstantOperandVal(0);
+  CC1 = (X86::CondCode)Second->getConstantOperandVal(0);
+  Flags = First->getOperand(1);
+  return true;
+}
+
+/// Try to replace an EFLAGS definition, as consumed under condition code
+/// \p CC, by a simpler EFLAGS value. The boolean-test-of-SETCC simplification
+/// is tried first and the atomic-arithmetic flag reuse second. Either helper
+/// may update \p CC; the second one may also replace uses of chain values.
+///
+/// \returns the new flags value, or an empty SDValue if neither applies.
+static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
+                                  SelectionDAG &DAG) {
+  SDValue Simplified = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (!Simplified)
+    Simplified = combineSetCCAtomicArith(EFLAGS, CC, DAG);
+  return Simplified;
+}
+
+/// Target combine for X86ISD::CMOV, whose operands are
+/// [false value, true value, CONDCODE (e.g. X86::COND_NE), CONDVAL].
+///
+/// Tried in order:
+///  - fold away an E/NE test on BSR/BSF flags of a known non-zero input;
+///  - simplify the flags/condition-code pair via combineSetCCEFLAGS;
+///  - turn a select between two integer constants into setcc arithmetic;
+///  - (late) replace a constant arm by the register it was compared against;
+///  - split a test of an and/or of two setccs into two chained CMOVs.
+///
+/// \returns the replacement value, or an empty SDValue if nothing applies.
+static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+
+  // A CMOV whose flag result still has users must be left untouched.
+  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+    return SDValue();
+
+  SDValue FalseOp = N->getOperand(0);
+  SDValue TrueOp = N->getOperand(1);
+  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+  SDValue Cond = N->getOperand(3);
+
+  // If the operand of BSR / BSF is proven never zero, ZF cannot be set, so
+  // an E/NE select on those flags has a fixed outcome.
+  const unsigned FlagsOpc = Cond.getOpcode();
+  if ((CC == X86::COND_E || CC == X86::COND_NE) &&
+      (FlagsOpc == X86ISD::BSR || FlagsOpc == X86ISD::BSF) &&
+      DAG.isKnownNeverZero(Cond.getOperand(0)))
+    return (CC == X86::COND_E) ? FalseOp : TrueOp;
+
+  // Try to simplify the EFLAGS and condition code operands.
+  // We can't always do this as FCMOV only supports a subset of X86 cond.
+  // NOTE(review): CC is passed by reference, so it may already have been
+  // rewritten when the f80 check below rejects the result - confirm the
+  // later folds are fine with that.
+  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
+    const bool CondIsUsable =
+        FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC);
+    if (CondIsUsable) {
+      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
+                       Flags};
+      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+    }
+  }
+
+  // Hands back the value that replaces N. When N also produces a (dead)
+  // flag result, the replacement goes through DCI.CombineTo with an empty
+  // value for that second result.
+  auto ReplaceCMovWith = [&](SDValue V) -> SDValue {
+    if (N->getNumValues() == 2) // Dead flag value?
+      return DCI.CombineTo(N, V, SDValue());
+    return V;
+  };
+
+  // If this is a select between two integer constants, try to do some
+  // optimizations. Note that the operands are ordered the opposite of SELECT
+  // operands.
+  auto *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
+  auto *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
+  if (TrueC && FalseC) {
+    // Canonicalize so that the true value is the (unsigned) larger one.
+    if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+      CC = X86::GetOppositeBranchCondition(CC);
+      std::swap(TrueC, FalseC);
+      std::swap(TrueOp, FalseOp);
+    }
+    const APInt &TrueVal = TrueC->getAPIntValue();
+    const APInt &FalseVal = FalseC->getAPIntValue();
+
+    // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
+    // This is efficient for any integer data type (including i8/i16) and
+    // shift amount.
+    if (FalseVal == 0 && TrueVal.isPowerOf2()) {
+      SDValue Res = getSETCC(CC, Cond, DL, DAG);
+
+      // Zero extend the condition if needed.
+      Res = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Res);
+
+      const unsigned ShAmt = TrueVal.logBase2();
+      Res = DAG.getNode(ISD::SHL, DL, Res.getValueType(), Res,
+                        DAG.getConstant(ShAmt, DL, MVT::i8));
+      return ReplaceCMovWith(Res);
+    }
+
+    // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
+    // for any integer data type, including i8/i16.
+    if (FalseVal + 1 == TrueVal) {
+      SDValue Res = getSETCC(CC, Cond, DL, DAG);
+
+      // Zero extend the condition if needed.
+      Res = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Res);
+      Res = DAG.getNode(ISD::ADD, DL, Res.getValueType(), Res,
+                        SDValue(FalseC, 0));
+      return ReplaceCMovWith(Res);
+    }
+
+    // Optimize cases that will turn into an LEA instruction. This requires
+    // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+    const EVT ResVT = N->getValueType(0);
+    if (ResVT == MVT::i32 || ResVT == MVT::i64) {
+      uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
+      if (ResVT == MVT::i32)
+        Diff = (unsigned)Diff;
+
+      //   1: result = add base, cond
+      //   2: result = lea base(    , cond*2)
+      //   3: result = lea base(cond, cond*2)
+      //   4: result = lea base(    , cond*4)
+      //   5: result = lea base(cond, cond*4)
+      //   8: result = lea base(    , cond*8)
+      //   9: result = lea base(cond, cond*8)
+      const bool IsFastMultiplier = Diff == 1 || Diff == 2 || Diff == 3 ||
+                                    Diff == 4 || Diff == 5 || Diff == 8 ||
+                                    Diff == 9;
+
+      if (IsFastMultiplier) {
+        APInt Scale = TrueVal - FalseVal;
+        SDValue Res = getSETCC(CC, Cond, DL, DAG);
+        // Zero extend the condition if needed.
+        Res = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Res);
+        // Scale the condition by the difference.
+        if (Scale != 1)
+          Res = DAG.getNode(ISD::MUL, DL, Res.getValueType(), Res,
+                            DAG.getConstant(Scale, DL, Res.getValueType()));
+
+        // Add the base if non-zero.
+        if (FalseVal != 0)
+          Res = DAG.getNode(ISD::ADD, DL, Res.getValueType(), Res,
+                            SDValue(FalseC, 0));
+        return ReplaceCMovWith(Res);
+      }
+    }
+  }
+
+  // Handle these cases:
+  //   (select (x != c), e, c) -> select (x != c), e, x),
+  //   (select (x == c), c, e) -> select (x == c), x, e)
+  // where the c is an integer constant, and the "select" is the combination
+  // of CMOV and CMP.
+  //
+  // The rationale for this change is that the conditional-move from a constant
+  // needs two instructions, however, conditional-move from a register needs
+  // only one instruction.
+  //
+  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
+  //  some instruction-combining opportunities. This opt needs to be
+  //  postponed as late as possible, which is what the DCI checks are for.
+  //
+  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+    ConstantSDNode *CmpAgainst = nullptr;
+    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+        !isa<ConstantSDNode>(Cond.getOperand(0))) {
+
+      // Bring the "!=" form into the "==" form by swapping the arms.
+      if (CC == X86::COND_NE &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      // On equality the compared register holds the constant, so select the
+      // register instead.
+      if (CC == X86::COND_E &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+        SDValue Ops[] = {FalseOp, Cond.getOperand(0),
+                         DAG.getConstant(CC, DL, MVT::i8), Cond};
+        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+      }
+    }
+  }
+
+  // Fold and/or of setcc's to double CMOV:
+  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
+  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
+  //
+  // This combine lets us generate:
+  //   cmovcc1 (jcc1 if we don't have CMOV)
+  //   cmovcc2 (same)
+  // instead of:
+  //   setcc1
+  //   setcc2
+  //   and/or
+  //   cmovne (jne if we don't have CMOV)
+  // When we can't use the CMOV instruction, it might increase branch
+  // mispredicts.
+  // When we can use CMOV, or when there is no mispredict, this improves
+  // throughput and reduces register pressure.
+  //
+  if (CC == X86::COND_NE) {
+    SDValue Flags;
+    X86::CondCode CC0, CC1;
+    bool isAndSetCC;
+    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
+      // The AND form is handled as the OR form of the inverted conditions
+      // with the arms exchanged.
+      if (isAndSetCC) {
+        std::swap(FalseOp, TrueOp);
+        CC0 = X86::GetOppositeBranchCondition(CC0);
+        CC1 = X86::GetOppositeBranchCondition(CC1);
+      }
+
+      SDValue InnerOps[] = {FalseOp, TrueOp,
+                            DAG.getConstant(CC0, DL, MVT::i8), Flags};
+      SDValue InnerCMOV =
+          DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), InnerOps);
+      SDValue OuterOps[] = {InnerCMOV, TrueOp,
+                            DAG.getConstant(CC1, DL, MVT::i8), Flags};
+      SDValue OuterCMOV =
+          DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), OuterOps);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1),
+                                    SDValue(OuterCMOV.getNode(), 1));
+      return OuterCMOV;
+    }
+  }
+
+  return SDValue();
+}
+
+/// Narrowing strategies available when shrinking a vector multiply whose
+/// elements are 32 bits wide.
+enum ShrinkMode {
+  MULS8,  ///< Operand values lie in the signed 8-bit range (-128 ~ 127).
+  MULU8,  ///< Operand values lie in the unsigned 8-bit range (0 ~ 255).
+  MULS16, ///< Operand values lie in the signed 16-bit range (-32768 ~ 32767).
+  MULU16  ///< Operand values lie in the unsigned 16-bit range (0 ~ 65535).
+};
+
+/// Determine whether the vector multiply \p N, whose elements are 32 bits
+/// wide, has operands narrow enough to be multiplied in 8- or 16-bit lanes.
+///
+/// \param N    The multiply node; it must have exactly two operands.
+/// \param DAG  Used to query the number of known sign bits of an operand.
+/// \param Mode Receives the selected narrowing strategy on success; it is
+///             left untouched when the function returns false.
+/// \returns true if a narrowing strategy was selected.
+static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
+  if (N->getOperand(0).getValueType().getScalarSizeInBits() != 32)
+    return false;
+
+  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
+  unsigned NumSignBits[2] = {1, 1};
+  bool NonNegative[2] = {false, false};
+  for (unsigned Idx = 0; Idx != 2; ++Idx) {
+    SDValue Operand = N->getOperand(Idx);
+
+    switch (Operand.getOpcode()) {
+    case ISD::ANY_EXTEND: {
+      // DAG.ComputeNumSignBits reports 1 for ISD::ANY_EXTEND, so the count is
+      // derived from the source element type instead. For an any-extend it is
+      // safe to assume an appropriate number of leading sign/zero bits.
+      EVT SrcEltVT =
+          Operand.getOperand(0).getValueType().getVectorElementType();
+      if (SrcEltVT == MVT::i8)
+        NumSignBits[Idx] = 25;
+      else if (SrcEltVT == MVT::i16)
+        NumSignBits[Idx] = 17;
+      else
+        return false;
+      NonNegative[Idx] = true;
+      break;
+    }
+    case ISD::BUILD_VECTOR: {
+      // Every defined element has to be an integer constant. Track the
+      // smallest value range that still covers all of them.
+      unsigned FewestSignBits = 32;
+      bool SawNegative = false;
+      for (const SDValue &Elt : Operand.getNode()->op_values()) {
+        if (Elt.isUndef())
+          continue;
+        auto *EltC = dyn_cast<ConstantSDNode>(Elt);
+        if (!EltC)
+          return false;
+        const APInt &EltVal = EltC->getAPIntValue();
+        SawNegative |= EltVal.isNegative();
+        FewestSignBits = std::min(FewestSignBits, EltVal.getNumSignBits());
+      }
+      NumSignBits[Idx] = FewestSignBits;
+      NonNegative[Idx] = !SawNegative;
+      break;
+    }
+    default:
+      NumSignBits[Idx] = DAG.ComputeNumSignBits(Operand);
+      NonNegative[Idx] = Operand.getOpcode() == ISD::ZERO_EXTEND;
+      break;
+    }
+  }
+
+  const bool BothNonNegative = NonNegative[0] && NonNegative[1];
+  const unsigned CommonSignBits = std::min(NumSignBits[0], NumSignBits[1]);
+
+  // Ranges from -128 ~ 127: MULS8 mode.
+  if (CommonSignBits >= 25) {
+    Mode = MULS8;
+    return true;
+  }
+  // Ranges from 0 ~ 255: MULU8 mode.
+  if (BothNonNegative && CommonSignBits >= 24) {
+    Mode = MULU8;
+    return true;
+  }
+  // Ranges from -32768 ~ 32767: MULS16 mode.
+  if (CommonSignBits >= 17) {
+    Mode = MULS16;
+    return true;
+  }
+  // Ranges from 0 ~ 65535: MULU16 mode.
+  if (BothNonNegative && CommonSignBits >= 16) {
+    Mode = MULU16;
+    return true;
+  }
+  return false;
+}
+
+/// When the operands of vector mul are extended from smaller size values,
+/// like i8 and i16, the type of mul may be shrunk to generate more
+/// efficient code. Two typical patterns are handled:
+/// Pattern1:
+/// %2 = sext/zext <N x i8> %1 to <N x i32>
+/// %4 = sext/zext <N x i8> %3 to <N x i32>
+/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+/// %5 = mul <N x i32> %2, %4
+///
+/// Pattern2:
+/// %2 = zext/sext <N x i16> %1 to <N x i32>
+/// %4 = zext/sext <N x i16> %3 to <N x i32>
+/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+/// %5 = mul <N x i32> %2, %4
+///
+/// There are four mul shrinking modes:
+/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
+/// generate pmullw+sext32 for it (MULS8 mode).
+/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
+/// generate pmullw+zext32 for it (MULU8 mode).
+/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
+/// generate pmullw+pmulhw for it (MULS16 mode).
+/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
+/// generate pmullw+pmulhuw for it (MULU16 mode).
+///
+/// Returns the replacement value, or an empty SDValue when the subtarget
+/// lacks SSE2, when pmulld is considered the better choice, or when
+/// canReduceVMulWidth finds no applicable shrinking mode.
+static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Check for legality
+ // pmullw/pmulhw are not supported by SSE.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ // Check for profitability
+ // pmulld is supported since SSE41. It is better to use pmulld
+ // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
+ // the expansion.
+ bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+ if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(N, DAG, Mode))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getOperand(0).getValueType();
+ // OpsVT is the i16 vector with RegSize / 16 lanes; ReducedVT is the i16
+ // vector with the same number of lanes as the original multiply.
+ unsigned RegSize = 128;
+ MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
+ EVT ReducedVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
+ // Shrink the operands of mul.
+ SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
+ SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
+
+ if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
+ // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+ // lower part is needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+ if (Mode == MULU8 || Mode == MULS8) {
+ return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ DL, VT, MulLo);
+ } else {
+ // Each half of the result is an i32 vector with half as many lanes as VT.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+ // the higher part is also needed.
+ SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ ReducedVT, NewN0, NewN1);
+
+ // Repack the lower part and higher part result of mul into a wider
+ // result.
+ // Generate shuffle functioning as punpcklwd.
+ SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+ for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+ ShuffleMask[2 * i] = i;
+ ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
+ }
+ SDValue ResLo =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
+ // Generate shuffle functioning as punpckhwd.
+ for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+ ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
+ ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
+ }
+ SDValue ResHi =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+ }
+ } else {
+ // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
+ // to legalize the mul explicitly because implicit legalization for type
+ // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
+ // instructions which will not exist when we explicitly legalize it by
+ // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
+ // <4 x i16> undef).
+ //
+ // Legalize the operands of mul.
+ // FIXME: We may be able to handle non-concatenated vectors by insertion.
+ unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
+ // Give up unless a whole number of ReducedVT pieces fills RegSize bits.
+ if ((RegSize % ReducedSizeInBits) != 0)
+ return SDValue();
+
+ // Ops starts as all-undef pieces; slot 0 is overwritten with each narrowed
+ // operand in turn before concatenating to OpsVT.
+ SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
+ DAG.getUNDEF(ReducedVT));
+ Ops[0] = NewN0;
+ NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+ Ops[0] = NewN1;
+ NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+
+ if (Mode == MULU8 || Mode == MULS8) {
+ // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
+ // part is needed.
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+
+ // convert the type of mul result to VT.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
+ : ISD::SIGN_EXTEND_VECTOR_INREG,
+ DL, ResVT, Mul);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ } else {
+ // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
+ // MULU16/MULS16, both parts are needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+ SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ OpsVT, NewN0, NewN1);
+
+ // Repack the lower part and higher part result of mul into a wider
+ // result. Make sure the type of mul result is VT.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+ Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+}
+
+/// Optimize a single multiply with constant into two operations in order to
+/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
+///
+/// Vector multiplies seen before legalization are handed to reduceVMULWidth
+/// and its result is returned directly. For scalar i32/i64 multiplies by a
+/// constant, a cheaper sequence (when one is found) is installed through
+/// DCI.CombineTo and an empty SDValue is returned.
+static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (DCI.isBeforeLegalize() && VT.isVector())
+ return reduceVMULWidth(N, DAG, Subtarget);
+
+ // An imul is usually smaller than the alternative sequence.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ if (VT != MVT::i64 && VT != MVT::i32)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ uint64_t MulAmt = C->getZExtValue();
+ // Powers of two and the factors 3, 5 and 9 are left untouched here.
+ if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ return SDValue();
+
+ // Try to split MulAmt into MulAmt1 * MulAmt2 with MulAmt1 in {9, 5, 3};
+ // both stay 0 when none of them divides MulAmt.
+ uint64_t MulAmt1 = 0;
+ uint64_t MulAmt2 = 0;
+ if ((MulAmt % 9) == 0) {
+ MulAmt1 = 9;
+ MulAmt2 = MulAmt / 9;
+ } else if ((MulAmt % 5) == 0) {
+ MulAmt1 = 5;
+ MulAmt2 = MulAmt / 5;
+ } else if ((MulAmt % 3) == 0) {
+ MulAmt1 = 3;
+ MulAmt2 = MulAmt / 3;
+ }
+
+ SDLoc DL(N);
+ SDValue NewMul;
+ if (MulAmt2 &&
+ (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+
+ if (isPowerOf2_64(MulAmt2) &&
+ !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+ // If second multiplier is pow2, issue it first. We want the multiply by
+ // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+ // is an add.
+ std::swap(MulAmt1, MulAmt2);
+
+ // First factor: a shift for a power of two, otherwise X86ISD::MUL_IMM.
+ if (isPowerOf2_64(MulAmt1))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(MulAmt1, DL, VT));
+
+ // Second factor, applied to the result of the first.
+ if (isPowerOf2_64(MulAmt2))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+ DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+ DAG.getConstant(MulAmt2, DL, VT));
+ }
+
+ // No two-factor split was usable: try a shift combined with one add/sub.
+ if (!NewMul) {
+ assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
+ && "Both cases that could cause potential overflows should have "
+ "already been handled.");
+ if (isPowerOf2_64(MulAmt - 1))
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt - 1), DL,
+ MVT::i8)));
+
+ else if (isPowerOf2_64(MulAmt + 1))
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
+ N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt + 1),
+ DL, MVT::i8)), N->getOperand(0));
+ }
+
+ if (NewMul)
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, NewMul, false);
+
+ return SDValue();
+}
+
+/// Target-specific folds for ISD::SHL.
+///
+/// Two rewrites are attempted, in this order:
+///  * (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) for scalar
+///    integers, relying on setcc_c being all zeros or all ones;
+///  * (shl V, 1) -> (add V, V) when the shift amount is a constant splat.
+///
+/// Returns the replacement value, or an empty SDValue if neither applies.
+static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *ShAmtC = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  const bool IsScalarInt = VT.isInteger() && !VT.isVector();
+  if (IsScalarInt && ShAmtC && N0.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue Inner = N0.getOperand(0);
+    const APInt ShiftedMask =
+        cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue().shl(
+            ShAmtC->getAPIntValue());
+
+    // Bit-widening wrappers around setcc_c can be handled as long as the
+    // shifted mask is checked so that semantics are preserved. The rewrite
+    // is unsafe when the setcc_c was zero extended and C1 << C2 exceeds the
+    // bit width of the underlying setcc_c. For example:
+    //   zext(setcc_c)                 -> i32 0x0000FFFF
+    //   c1                            -> i32 0x0000FFFF
+    //   c2                            -> i32 0x00000001
+    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
+    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
+    bool MaskIsSafe = false;
+    switch (Inner.getOpcode()) {
+    case X86ISD::SETCC_CARRY:
+      MaskIsSafe = true;
+      break;
+    case ISD::SIGN_EXTEND:
+      MaskIsSafe = Inner.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY;
+      break;
+    case ISD::ZERO_EXTEND:
+    case ISD::ANY_EXTEND:
+      MaskIsSafe =
+          Inner.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+          ShiftedMask.isIntN(Inner.getOperand(0).getValueSizeInBits());
+      break;
+    default:
+      break;
+    }
+
+    if (MaskIsSafe && ShiftedMask != 0) {
+      SDLoc DL(N);
+      return DAG.getNode(ISD::AND, DL, VT, Inner,
+                         DAG.getConstant(ShiftedMask, DL, VT));
+    }
+  }
+
+  // Hardware support for vector shifts is sparse, which leads to vector
+  // operations being scalarized in many cases. Also, on sandybridge ADD is
+  // faster than shl.
+  //   (shl V, 1) -> add V,V
+  auto *AmtBV = dyn_cast<BuildVectorSDNode>(N1);
+  if (!AmtBV)
+    return SDValue();
+  ConstantSDNode *SplatAmt = AmtBV->getConstantSplatNode();
+  if (!SplatAmt)
+    return SDValue();
+
+  assert(N0.getValueType().isVector() && "Invalid vector shift type");
+  // Every element is shifted by one; this is better expressed as an ADD of
+  // the value with itself.
+  if (SplatAmt->getAPIntValue() == 1)
+    return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+
+  return SDValue();
+}
+
+/// Target-specific fold for ISD::SRA of a single-use constant ISD::SHL.
+///
+/// fold (sra (shl a, [56,48,32,24,16]), SarConst)
+///   into (shl (sext_inreg a), [56,48,32,24,16] - SarConst) or
+///   into (sra (sext_inreg a), SarConst - [56,48,32,24,16])
+/// depending on the sign of (SarConst - [56,48,32,24,16]); when the
+/// difference is zero the sign-extension alone is returned.
+///
+/// sexts in X86 are MOVs. The MOVs have the same code size as the SHIFTs
+/// above (only SHIFT by 1 has lower code size). However the MOVs have two
+/// advantages over a SHIFT:
+///  1. MOVs can write to a register that differs from the source
+///  2. MOVs accept memory operands
+///
+/// Returns the replacement value, or an empty SDValue if the pattern does
+/// not match.
+static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
+  SDValue ShlNode = N->getOperand(0);
+  SDValue SarAmtOp = N->getOperand(1);
+  EVT VT = ShlNode.getValueType();
+  unsigned Size = VT.getSizeInBits();
+
+  const bool IsScalarInt = VT.isInteger() && !VT.isVector();
+  if (!IsScalarInt || SarAmtOp.getOpcode() != ISD::Constant)
+    return SDValue();
+  if (ShlNode.getOpcode() != ISD::SHL || !ShlNode.hasOneUse() ||
+      ShlNode.getOperand(1).getOpcode() != ISD::Constant)
+    return SDValue();
+
+  SDValue Src = ShlNode.getOperand(0);
+  const APInt ShlAmt =
+      cast<ConstantSDNode>(ShlNode.getOperand(1))->getAPIntValue();
+  const APInt SarAmt = cast<ConstantSDNode>(SarAmtOp)->getAPIntValue();
+  EVT AmtVT = SarAmtOp.getValueType();
+
+  if (SarAmt.isNegative())
+    return SDValue();
+
+  for (MVT NarrowVT : MVT::integer_valuetypes()) {
+    unsigned NarrowBits = NarrowVT.getSizeInBits();
+    // Skip types without a corresponding sext/zext, and any ShlAmt that is
+    // not one of [56,48,32,24,16].
+    const bool HasExtend = NarrowBits >= 8 && NarrowBits <= 64;
+    if (!HasExtend || ShlAmt != Size - NarrowBits)
+      continue;
+
+    SDLoc DL(N);
+    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Src,
+                               DAG.getValueType(NarrowVT));
+    const APInt Remaining = SarAmt - (Size - NarrowBits);
+    if (Remaining == 0)
+      return SExt;
+    if (Remaining.isNegative())
+      return DAG.getNode(ISD::SHL, DL, VT, SExt,
+                         DAG.getConstant(-Remaining, DL, AmtVT));
+    return DAG.getNode(ISD::SRA, DL, VT, SExt,
+                       DAG.getConstant(Remaining, DL, AmtVT));
+  }
+  return SDValue();
+}
+
+/// \brief Fold a vector logical shift into a vector of 0s when it shifts by
+/// a constant splat amount known to be bigger than or equal to the vector
+/// element size in bits.
+///
+/// Only v2i64/v4i32/v8i16 are considered, plus v4i64/v8i32/v16i16 when the
+/// subtarget reports hasInt256(). Returns an empty SDValue otherwise.
+static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+
+  const bool Is128BitCandidate =
+      VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16;
+  const bool Is256BitCandidate =
+      VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16;
+  if (!Is128BitCandidate && !(Subtarget.hasInt256() && Is256BitCandidate))
+    return SDValue();
+
+  SDValue Amt = N->getOperand(1);
+  auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt);
+  if (!AmtBV)
+    return SDValue();
+  ConstantSDNode *AmtSplat = AmtBV->getConstantSplatNode();
+  if (!AmtSplat)
+    return SDValue();
+
+  // SSE2/AVX2 logical shifts always return a vector of 0s if the shift
+  // amount is bigger than or equal to the element size. The constant shift
+  // amount will be encoded as a 8-bit immediate, hence the truncation.
+  const unsigned EltBits = VT.getSimpleVT().getScalarSizeInBits();
+  if (!AmtSplat->getAPIntValue().trunc(8).uge(EltBits))
+    return SDValue();
+
+  return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+}
+
+/// Dispatch shift combines by opcode.
+///
+/// ISD::SRA is only offered to combineShiftRightAlgebraic. ISD::SHL is first
+/// offered to combineShiftLeft; it and every other opcode reaching this
+/// function are then offered to performShiftToAllZeros. Returns the first
+/// non-empty result, or an empty SDValue.
+static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            const X86Subtarget &Subtarget) {
+  const unsigned Opc = N->getOpcode();
+
+  // Arithmetic shifts never take the all-zeros fold below.
+  if (Opc == ISD::SRA)
+    return combineShiftRightAlgebraic(N, DAG);
+
+  if (Opc == ISD::SHL)
+    if (SDValue Folded = combineShiftLeft(N, DAG))
+      return Folded;
+
+  // Try to fold this logical shift into a zero vector.
+  return performShiftToAllZeros(N, DAG, Subtarget);
+}
+
+/// Simplify X86ISD::VSHLI / X86ISD::VSRLI (vector logical shift by an
+/// immediate).
+///
+/// Folds out-of-range shifts and shifts of an all-zeros vector to a zero
+/// vector, and a shift by zero to its input. For whole-byte shift amounts the
+/// node is additionally offered to combineX86ShufflesRecursively, which
+/// replaces N through CombineTo itself; an empty SDValue is returned in that
+/// case regardless of the outcome.
+static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget &Subtarget) {
+  assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
+         "Unexpected opcode");
+  EVT VT = N->getValueType(0);
+  unsigned EltBits = VT.getScalarSizeInBits();
+
+  // This fails for mask register (vXi1) shifts.
+  if ((EltBits % 8) != 0)
+    return SDValue();
+
+  SDValue Src = N->getOperand(0);
+  APInt ShAmt = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+
+  // Out of range logical bit shifts are guaranteed to be zero.
+  if (ShAmt.zextOrTrunc(8).uge(EltBits))
+    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+
+  // Shift by zero -> the unshifted input.
+  if (ShAmt == 0)
+    return Src;
+
+  // Shift zero -> zero.
+  if (ISD::isBuildVectorAllZeros(Src.getNode()))
+    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+
+  // 'Whole byte' logical bit shifts can be decoded as shuffles.
+  if ((ShAmt.getZExtValue() % 8) == 0) {
+    SDValue Root(N, 0);
+    SmallVector<int, 1> PlaceholderMask; // Just a placeholder.
+    PlaceholderMask.push_back(0);
+    // On success the routine has already used CombineTo to replace N, so its
+    // result does not change what is returned here.
+    (void)combineX86ShufflesRecursively({Root}, 0, Root, PlaceholderMask,
+                                        /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+                                        DCI, Subtarget);
+  }
+
+  return SDValue();
+}
+
+/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
+/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
+/// OR -> CMPNEQSS.
+///
+/// Returns the replacement value, or an empty SDValue when the pattern does
+/// not match, the subtarget lacks SSE2, or a user of N is treated as needing
+/// flags.
+static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned opcode;
+
+ // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+ // we're requiring SSE2 for both.
+ if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CMP0 = N0->getOperand(1);
+ SDValue CMP1 = N1->getOperand(1);
+ SDLoc DL(N);
+
+ // The SETCCs should both refer to the same CMP.
+ if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+ return SDValue();
+
+ SDValue CMP00 = CMP0->getOperand(0);
+ SDValue CMP01 = CMP0->getOperand(1);
+ EVT VT = CMP00.getValueType();
+
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ bool ExpectingFlags = false;
+ // Check for any users that want flags:
+ // Any user opcode not listed in the second group below (including the
+ // default case) is treated as expecting flags.
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ !ExpectingFlags && UI != UE; ++UI)
+ switch (UI->getOpcode()) {
+ default:
+ case ISD::BR_CC:
+ case ISD::BRCOND:
+ case ISD::SELECT:
+ ExpectingFlags = true;
+ break;
+ case ISD::CopyToReg:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ break;
+ }
+
+ if (!ExpectingFlags) {
+ enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+ enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+ // Normalize the pair: if cc1 is the E/NE condition, swap so that it ends
+ // up in cc0.
+ if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
+ X86::CondCode tmp = cc0;
+ cc0 = cc1;
+ cc1 = tmp;
+ }
+
+ if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
+ (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+ // FIXME: need symbolic constants for these magic numbers.
+ // See X86ATTInstPrinter.cpp:printSSECC().
+ // 0 is used when cc0 is COND_E, 4 otherwise (i.e. for COND_NE).
+ unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+ if (Subtarget.hasAVX512()) {
+ // With AVX512 the compare yields an i1 directly; zero-extend it if N
+ // has a different result type.
+ SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
+ CMP01,
+ DAG.getConstant(x86cc, DL, MVT::i8));
+ if (N->getValueType(0) != MVT::i1)
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
+ FSetCC);
+ return FSetCC;
+ }
+ SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
+ CMP00.getValueType(), CMP00, CMP01,
+ DAG.getConstant(x86cc, DL,
+ MVT::i8));
+
+ bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+ MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
+
+ if (is64BitFP && !Subtarget.is64Bit()) {
+ // On a 32-bit target, we cannot bitcast the 64-bit float to a
+ // 64-bit integer, since that's not a legal type. Since
+ // OnesOrZeroesF is all ones or all zeroes, we don't need all the
+ // bits, but can do this little dance to extract the lowest 32 bits
+ // and work with those going forward.
+ SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+ OnesOrZeroesF);
+ SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
+ OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+ Vector32, DAG.getIntPtrConstant(0, DL));
+ IntVT = MVT::i32;
+ }
+
+ // Reinterpret the FP mask as an integer, keep only bit 0, and narrow the
+ // result to i8.
+ SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
+ SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
+ DAG.getConstant(1, DL, IntVT));
+ SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+ ANDed);
+ return OneBitOfTruth;
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
+///
+/// Only v2i64, v4i64 and v8i64 are handled. The all-ones operand of the XOR
+/// is looked through bitcasts and may be either a direct all-ones build
+/// vector or, for 256-bit types, an all-ones vector assembled from two
+/// insert_subvector nodes. Returns an empty SDValue if nothing matches.
+static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::AND);
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
+    return SDValue();
+
+  // Canonicalize XOR to the left.
+  SDValue NotSide = N->getOperand(0);
+  SDValue OtherSide = N->getOperand(1);
+  if (OtherSide.getOpcode() == ISD::XOR)
+    std::swap(NotSide, OtherSide);
+
+  if (NotSide.getOpcode() != ISD::XOR)
+    return SDValue();
+
+  auto IsAllOnesBV = [](SDValue V) {
+    return ISD::isBuildVectorAllOnes(V.getNode());
+  };
+
+  SDValue X = NotSide->getOperand(0);
+  SDValue Ones = peekThroughBitcasts(NotSide->getOperand(1));
+
+  // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
+  // insert_subvector building a 256-bit AllOnes vector.
+  if (!IsAllOnesBV(Ones)) {
+    if (!VT.is256BitVector() || Ones->getOpcode() != ISD::INSERT_SUBVECTOR)
+      return SDValue();
+
+    SDValue Base = Ones->getOperand(0);
+    SDValue Inserted = Ones->getOperand(1);
+    const bool BaseIsOnesIntoUndef =
+        Base.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        Base.getOperand(0).isUndef() && IsAllOnesBV(Base.getOperand(1));
+    if (!BaseIsOnesIntoUndef || !IsAllOnesBV(Inserted))
+      return SDValue();
+  }
+
+  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, OtherSide);
+}
+
+// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
+// register. In most cases we actually compare or select YMM-sized registers
+// and mixing the two types creates horrible code. This method optimizes
+// some of the transition sequences.
+//
+// N must be an ANY_EXTEND, ZERO_EXTEND or SIGN_EXTEND producing a 256-bit
+// vector from a 128-bit XOR/AND/OR whose left operand is a TRUNCATE from the
+// result type and whose right operand is either a TRUNCATE or a constant
+// splat. The logic op is redone at the wide type and the extension is then
+// re-applied in-register. Returns an empty SDValue if the pattern does not
+// match.
+static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.is256BitVector())
+    return SDValue();
+
+  const unsigned ExtOpc = N->getOpcode();
+  assert((ExtOpc == ISD::ANY_EXTEND ||
+          ExtOpc == ISD::ZERO_EXTEND ||
+          ExtOpc == ISD::SIGN_EXTEND) && "Invalid Node");
+
+  SDValue Narrow = N->getOperand(0);
+  EVT NarrowVT = Narrow->getValueType(0);
+  if (!NarrowVT.is128BitVector())
+    return SDValue();
+
+  // Only bitwise logic is widened.
+  const unsigned LogicOpc = Narrow->getOpcode();
+  switch (LogicOpc) {
+  case ISD::XOR:
+  case ISD::AND:
+  case ISD::OR:
+    break;
+  default:
+    return SDValue();
+  }
+
+  SDValue LHS = Narrow->getOperand(0);
+  SDValue RHS = Narrow->getOperand(1);
+  SDLoc DL(Narrow);
+
+  // The left side has to be a trunc.
+  if (LHS.getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  // The type of the truncated inputs must be the type being extended to.
+  EVT WideVT = LHS->getOperand(0)->getValueType(0);
+  if (WideVT != VT)
+    return SDValue();
+
+  // The right side has to be a 'trunc' or a constant vector.
+  const bool RHSIsTrunc = RHS.getOpcode() == ISD::TRUNCATE;
+  ConstantSDNode *RHSSplat = nullptr;
+  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(RHS))
+    RHSSplat = RHSBV->getConstantSplatNode();
+  if (!RHSIsTrunc && !RHSSplat)
+    return SDValue();
+
+  if (!DAG.getTargetLoweringInfo().isOperationLegalOrPromote(LogicOpc, WideVT))
+    return SDValue();
+
+  // Collect the inputs of the new wide operation.
+  SDValue WideLHS = LHS->getOperand(0);
+  SDValue WideRHS = RHS;
+  if (RHSSplat) {
+    SDValue WideElt =
+        DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
+                    SDValue(RHSSplat, 0));
+    WideRHS = DAG.getSplatBuildVector(WideVT, DL, WideElt);
+  } else if (RHSIsTrunc) {
+    WideRHS = RHS->getOperand(0);
+  }
+
+  // Generate the wide operation, then re-apply the extension in-register.
+  SDValue WideOp = DAG.getNode(LogicOpc, DL, WideVT, WideLHS, WideRHS);
+
+  if (ExtOpc == ISD::ANY_EXTEND)
+    return WideOp;
+
+  if (ExtOpc == ISD::ZERO_EXTEND) {
+    // Keep only the bits that existed at the narrow element width.
+    unsigned NarrowEltBits = NarrowVT.getScalarSizeInBits();
+    APInt LowBits =
+        APInt::getAllOnesValue(NarrowEltBits).zext(VT.getScalarSizeInBits());
+    return DAG.getNode(ISD::AND, DL, VT,
+                       WideOp, DAG.getConstant(LowBits, DL, VT));
+  }
+
+  if (ExtOpc == ISD::SIGN_EXTEND)
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
+                       WideOp, DAG.getValueType(NarrowVT));
+
+  llvm_unreachable("Unexpected opcode");
+}
+
+/// If both input operands of a logic op are being cast from floating point
+/// types, try to convert this into a floating point logic node to avoid
+/// unnecessary moves from SSE to integer registers.
+///
+/// N must be an ISD::AND, ISD::OR or ISD::XOR. The rewrite applies to i32
+/// results with SSE1 and to i64 results with SSE2; otherwise an empty SDValue
+/// is returned.
+static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  unsigned FPOpcode = ISD::DELETED_NODE;
+  switch (N->getOpcode()) {
+  case ISD::AND:
+    FPOpcode = X86ISD::FAND;
+    break;
+  case ISD::OR:
+    FPOpcode = X86ISD::FOR;
+    break;
+  case ISD::XOR:
+    FPOpcode = X86ISD::FXOR;
+    break;
+  default:
+    break;
+  }
+
+  assert(FPOpcode != ISD::DELETED_NODE &&
+         "Unexpected input node for FP logic conversion");
+
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Both sides must be bitcasts ...
+  if (LHS.getOpcode() != ISD::BITCAST || RHS.getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  // ... and the integer width must have FP logic available.
+  const bool WidthSupported = (Subtarget.hasSSE1() && VT == MVT::i32) ||
+                              (Subtarget.hasSSE2() && VT == MVT::i64);
+  if (!WidthSupported)
+    return SDValue();
+
+  SDValue LHSSrc = LHS.getOperand(0);
+  SDValue RHSSrc = RHS.getOperand(0);
+  EVT LHSSrcVT = LHSSrc.getValueType();
+  EVT RHSSrcVT = RHSSrc.getValueType();
+  if (!LHSSrcVT.isFloatingPoint() || !RHSSrcVT.isFloatingPoint())
+    return SDValue();
+
+  // Do the logic in the FP domain and cast the result back.
+  SDLoc DL(N);
+  SDValue FPLogic = DAG.getNode(FPOpcode, DL, LHSSrcVT, LHSSrc, RHSSrc);
+  return DAG.getBitcast(VT, FPLogic);
+}
+
+/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
+/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
+/// eliminate loading the vector constant mask value. This relies on the fact
+/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
+///
+/// Only operand 0 (looked through bitcasts) is checked for the compare, and
+/// operand 1 for the splat of 1. Returns an empty SDValue if there is no
+/// match.
+static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+  SDValue Cmp = peekThroughBitcasts(N->getOperand(0));
+  SDValue SplatOp = peekThroughBitcasts(N->getOperand(1));
+
+  // TODO: Use AssertSext to mark any nodes that have the property of producing
+  // all-ones or all-zeros. Then check for that node rather than particular
+  // opcodes.
+  switch (Cmp.getOpcode()) {
+  case X86ISD::PCMPEQ:
+  case X86ISD::PCMPGT:
+    break;
+  default:
+    return SDValue();
+  }
+
+  // The existence of the PCMP node guarantees that we have the required SSE2 or
+  // AVX2 for a shift of this vector type, but there is no vector shift by
+  // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
+  // masked compare nodes, so they should not make it here.
+  EVT CmpVT = Cmp.getValueType();
+  unsigned EltBits = CmpVT.getScalarSizeInBits();
+  if (CmpVT != SplatOp.getValueType() || EltBits == 8)
+    return SDValue();
+
+  assert(CmpVT.getSizeInBits() == 128 || CmpVT.getSizeInBits() == 256);
+
+  APInt SplatVal;
+  if (!ISD::isConstantSplatVector(SplatOp.getNode(), SplatVal) ||
+      SplatVal != 1)
+    return SDValue();
+
+  // Shifting the all-ones/all-zeros element right by (width - 1) leaves
+  // exactly 1 or 0.
+  SDLoc DL(N);
+  SDValue Shifted = DAG.getNode(X86ISD::VSRLI, DL, CmpVT, Cmp,
+                                DAG.getConstant(EltBits - 1, DL, MVT::i8));
+  return DAG.getBitcast(N->getValueType(0), Shifted);
+}
+
+/// Target-specific combines for ISD::AND; nothing is done before operation
+/// legalization.
+///
+/// The helpers are tried in a fixed order: combineCompareEqual,
+/// convertIntLogicToFPLogic, combineANDXORWithAllOnesIntoANDNP and
+/// combinePCMPAnd1. Vector ANDs with byte-multiple elements are then offered
+/// to the recursive shuffle combiner. Finally, a scalar i32/i64
+/// ((X >> imm) & mask) is turned into X86ISD::BEXTR when BMI or TBM is
+/// available.
+static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
+                          TargetLowering::DAGCombinerInfo &DCI,
+                          const X86Subtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (SDValue V = combineCompareEqual(N, DAG, DCI, Subtarget))
+    return V;
+
+  if (SDValue V = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return V;
+
+  if (SDValue V = combineANDXORWithAllOnesIntoANDNP(N, DAG))
+    return V;
+
+  if (SDValue V = combinePCMPAnd1(N, DAG))
+    return V;
+
+  EVT VT = N->getValueType(0);
+  SDValue ShiftOp = N->getOperand(0);
+  SDValue MaskOp = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Attempt to recursively combine a bitmask AND with shuffles.
+  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+    SDValue Root(N, 0);
+    SmallVector<int, 1> PlaceholderMask; // Just a placeholder.
+    PlaceholderMask.push_back(0);
+    if (combineX86ShufflesRecursively({Root}, 0, Root, PlaceholderMask,
+                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+                                      DCI, Subtarget))
+      return SDValue(); // This routine will use CombineTo to replace N.
+  }
+
+  // Create BEXTR instructions
+  // BEXTR is ((X >> imm) & (2**size-1))
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
+    return SDValue();
+  if (ShiftOp.getOpcode() != ISD::SRA && ShiftOp.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // Both the mask and the shift amount have to be constants.
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
+  ConstantSDNode *ShiftC = dyn_cast<ConstantSDNode>(ShiftOp.getOperand(1));
+  if (!MaskC || !ShiftC)
+    return SDValue();
+
+  uint64_t Mask = MaskC->getZExtValue();
+  uint64_t Shift = ShiftC->getZExtValue();
+  if (!isMask_64(Mask))
+    return SDValue();
+
+  // The extracted field must lie entirely inside the value.
+  uint64_t MaskSize = countPopulation(Mask);
+  if (Shift + MaskSize > VT.getSizeInBits())
+    return SDValue();
+
+  // The control operand packs the start bit in bits 0-7 and the field length
+  // from bit 8 upwards.
+  return DAG.getNode(X86ISD::BEXTR, DL, VT, ShiftOp.getOperand(0),
+                     DAG.getConstant(Shift | (MaskSize << 8), DL,
+                                     VT));
+}
+
+// Try to fold:
+// (or (and (m, y), (pandn m, x)))
+// into:
+// (vselect m, y, x), i.e. y where the mask m is set and x where it is clear.
+// As a special case, try to fold:
+// (or (and (m, (sub 0, x)), (pandn m, x)))
+// into:
+// (sub (xor X, M), M). Returns an empty SDValue when no fold applies.
+static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+ return SDValue(); // Only v2i64, or v4i64 with hasInt256(), is handled.
+ assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
+
+ // Canonicalize the pandn (X86ISD::ANDNP) operand to the RHS.
+ if (N0.getOpcode() == X86ISD::ANDNP)
+ std::swap(N0, N1);
+
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
+ return SDValue();
+
+ SDValue Mask = N1.getOperand(0); // The 'm' of (pandn m, x).
+ SDValue X = N1.getOperand(1); // The 'x' of (pandn m, x).
+ SDValue Y;
+ if (N0.getOperand(0) == Mask)
+ Y = N0.getOperand(1);
+ if (N0.getOperand(1) == Mask)
+ Y = N0.getOperand(0);
+
+ // Check that the mask appeared in both the AND and ANDNP; Y is null otherwise.
+ if (!Y.getNode())
+ return SDValue();
+
+ // Look through any bitcasts on X, Y, and Mask.
+ Mask = peekThroughBitcasts(Mask);
+ X = peekThroughBitcasts(X);
+ Y = peekThroughBitcasts(Y);
+
+ EVT MaskVT = Mask.getValueType();
+
+ // Validate that the Mask operand is a vector sra node shifting by EltBits - 1.
+ // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+ // there is no psrai.b
+ unsigned EltBits = MaskVT.getScalarSizeInBits();
+ unsigned SraAmt = ~0; // Sentinel: stays ~0 if no constant shift amount is found.
+ if (Mask.getOpcode() == ISD::SRA) {
+ if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+ if (auto *AmtConst = AmtBV->getConstantSplatNode())
+ SraAmt = AmtConst->getZExtValue();
+ } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+ SDValue SraC = Mask.getOperand(1);
+ SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+ }
+ if ((SraAmt + 1) != EltBits) // Also rejects the ~0 sentinel (wraps to 0).
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // Try to match:
+ // (or (and (M, (sub 0, X)), (pandn M, X)))
+ // which is a special case of vselect:
+ // (vselect M, (sub 0, X), X)
+ // Per:
+ // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+ // We know that, if fNegate is 0 or 1:
+ // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+ //
+ // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+ // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+ // ( M ? -X : X) == ((X ^ M ) + (M & 1))
+ // This lets us transform our vselect to:
+ // (add (xor X, M), (and M, 1))
+ // And further to:
+ // (sub (xor X, M), M)
+ if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ auto IsNegV = [](SDNode *N, SDValue V) { // True if N is (sub zero-vector, V).
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+
+ if (V) {
+ assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+ // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
+ // but if the pattern matched was
+ // (vselect M, X, (sub (0, X))), that is really negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ return DAG.getBitcast(VT,
+ DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
+ }
+ }
+
+ // PBLENDVB is only available on SSE 4.1.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; // Blend as bytes.
+
+ X = DAG.getBitcast(BlendVT, X);
+ Y = DAG.getBitcast(BlendVT, Y);
+ Mask = DAG.getBitcast(BlendVT, Mask);
+ Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); // 'Mask' now holds the select.
+ return DAG.getBitcast(VT, Mask);
+}
+
+// Helper function for combineOrCmpEqZeroToCtlzSrl.
+// Transforms:
+//   seteq(cmp x, 0)
+// into:
+//   srl(ctlz x), log2(bitsize(x))
+// and zero-extends or truncates the result to ExtTy.
+// For a power-of-two bit width, ctlz(x) equals bitsize(x) only when x is zero,
+// so bit log2(bitsize(x)) of the count is set exactly in that case.
+// Op is the X86ISD::SETCC node; its operand 1 is the compare whose operand 0
+// is x. The input pattern is checked by the caller.
+static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
+                                          SelectionDAG &DAG) {
+  SDValue Cmp = Op.getOperand(1);
+  EVT VT = Cmp.getOperand(0).getValueType();
+  unsigned Log2b = Log2_32(VT.getSizeInBits());
+  SDLoc dl(Op);
+  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
+  // The result of the shift is true or false, and on X86, the 32-bit
+  // encoding of shr and lzcnt is more desirable.
+  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
+  // This runs after legalization, where scalar shift amounts are i8 on X86.
+  // Do not type the amount with VT: it may be i64 while the shift is i32.
+  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
+                            DAG.getConstant(Log2b, dl, MVT::i8));
+  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
+}
+
+// Try to transform:
+//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
+// into:
+//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
+// Will also attempt to match more generic cases, eg:
+//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
+// Only applies if the target supports the FastLZCNT feature.
+// N is the zero-extend node. Returns the replacement value, or an empty
+// SDValue when the pattern does not match.
+static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           const X86Subtarget &Subtarget) {
+  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
+    return SDValue();
+
+  auto isORCandidate = [](SDValue N) {
+    return (N->getOpcode() == ISD::OR && N->hasOneUse());
+  };
+
+  // Check the zero extend is extending to 32-bit or more. The code generated by
+  // srl(ctlz) for 16-bit or less variants of the pattern would require extra
+  // instructions to clear the upper bits.
+  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
+      !isORCandidate(N->getOperand(0)))
+    return SDValue();
+
+  // Check the node matches: setcc(eq, cmp 0).
+  // isNullConstant is used for the compare's second operand because it may be
+  // a non-constant value; getConstantOperandVal would assert on that.
+  auto isSetCCCandidate = [](SDValue N) {
+    return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
+           X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
+           N->getOperand(1).getOpcode() == X86ISD::CMP &&
+           isNullConstant(N->getOperand(1).getOperand(1)) &&
+           N->getOperand(1).getValueType().bitsGE(MVT::i32);
+  };
+
+  SDNode *OR = N->getOperand(0).getNode();
+  SDValue LHS = OR->getOperand(0);
+  SDValue RHS = OR->getOperand(1);
+
+  // Save nodes matching or(or, setcc(eq, cmp 0)), walking down the OR chain.
+  SmallVector<SDNode *, 2> ORNodes;
+  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
+          (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
+    ORNodes.push_back(OR);
+    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
+    LHS = OR->getOperand(0);
+    RHS = OR->getOperand(1);
+  }
+
+  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
+  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
+      !isORCandidate(SDValue(OR, 0)))
+    return SDValue();
+
+  // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
+  // to
+  // or(srl(ctlz),srl(ctlz)).
+  // The dag combiner can then fold it into:
+  // srl(or(ctlz, ctlz)).
+  EVT VT = OR->getValueType(0);
+  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
+  if (!NewLHS)
+    return SDValue();
+  SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
+  if (!NewRHS)
+    return SDValue();
+  SDValue Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
+
+  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern,
+  // innermost first.
+  while (!ORNodes.empty()) {
+    OR = ORNodes.pop_back_val();
+    LHS = OR->getOperand(0);
+    RHS = OR->getOperand(1);
+    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
+    if (RHS->getOpcode() == ISD::OR)
+      std::swap(LHS, RHS);
+    VT = OR->getValueType(0);
+    NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
+    if (!NewRHS)
+      return SDValue();
+    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
+  }
+
+  return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
+}
+
+/// Combines for ISD::OR. After trying combineCompareEqual,
+/// convertIntLogicToFPLogic and combineLogicBlendIntoPBLENDV, this attempts to
+/// fold an OR of opposite shifts on i16/i32/i64 into X86ISD::SHLD/SHRD:
+///   OR( SHL( X, C ), SRL( Y, Bits - C ) )                   -> SHLD( X, Y, C )
+///   OR( SRL( X, C ), SHL( Y, Bits - C ) )                   -> SHRD( X, Y, C )
+///   OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, Bits - 1 ) ) ) -> SHLD( X, Y, C )
+///   OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, Bits - 1 ) ) ) -> SHRD( X, Y, C )
+/// Returns an empty SDValue when nothing applies.
+static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
+                         TargetLowering::DAGCombinerInfo &DCI,
+                         const X86Subtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (SDValue Cmp = combineCompareEqual(N, DAG, DCI, Subtarget))
+    return Cmp;
+
+  if (SDValue FP = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FP;
+
+  if (SDValue Blend = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
+    return Blend;
+
+  SDValue ShlSide = N->getOperand(0);
+  SDValue SrlSide = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  const bool IsDoubleShiftVT =
+      VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64;
+  if (!IsDoubleShiftVT)
+    return SDValue();
+
+  // SHLD/SHRD need fewer registers, but on some platforms they are slower than
+  // the shift/shift/or sequence. Only form them on such targets when
+  // optimizing for size.
+  const bool OptForSize =
+      DAG.getMachineFunction().getFunction()->optForSize();
+  if (Subtarget.isSHLDSlow() && !OptForSize)
+    return SDValue();
+
+  // Canonicalize to (SHL, SRL) operand order.
+  if (ShlSide.getOpcode() == ISD::SRL && SrlSide.getOpcode() == ISD::SHL)
+    std::swap(ShlSide, SrlSide);
+  if (ShlSide.getOpcode() != ISD::SHL || SrlSide.getOpcode() != ISD::SRL)
+    return SDValue();
+  if (!ShlSide.hasOneUse() || !SrlSide.hasOneUse())
+    return SDValue();
+
+  // Returns the source of a TRUNCATE, or the value itself otherwise.
+  auto StripTrunc = [](SDValue V) {
+    return V.getOpcode() == ISD::TRUNCATE ? V.getOperand(0) : V;
+  };
+
+  // Both shift amounts must be i8; then look through any truncate.
+  SDValue AmtA = ShlSide.getOperand(1);
+  if (AmtA.getValueType() != MVT::i8)
+    return SDValue();
+  SDValue AmtB = SrlSide.getOperand(1);
+  if (AmtB.getValueType() != MVT::i8)
+    return SDValue();
+  AmtA = StripTrunc(AmtA);
+  AmtB = StripTrunc(AmtB);
+
+  SDLoc DL(N);
+  unsigned Opc = X86ISD::SHLD;
+  SDValue Op0 = ShlSide.getOperand(0);
+  SDValue Op1 = SrlSide.getOperand(0);
+  // If the SHL amount is the derived one (SUB/XOR), the SRL carries the plain
+  // amount C, so this is the SHRD form: swap roles.
+  const unsigned AmtAOpc = AmtA.getOpcode();
+  if (AmtAOpc == ISD::SUB || AmtAOpc == ISD::XOR) {
+    Opc = X86ISD::SHRD;
+    std::swap(Op0, Op1);
+    std::swap(AmtA, AmtB);
+  }
+
+  // Emits Opc(A, B, trunc-to-i8(AmtA)).
+  auto BuildDoubleShift = [&](SDValue A, SDValue B) {
+    return DAG.getNode(Opc, DL, VT, A, B,
+                       DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AmtA));
+  };
+
+  const unsigned Bits = VT.getSizeInBits();
+
+  // Second amount is (Bits - C).
+  if (AmtB.getOpcode() == ISD::SUB) {
+    auto *SumC = dyn_cast<ConstantSDNode>(AmtB.getOperand(0));
+    if (!SumC)
+      return SDValue();
+    SDValue Subtrahend = StripTrunc(AmtB.getOperand(1));
+    if (SumC->getSExtValue() == Bits && Subtrahend == AmtA)
+      return BuildDoubleShift(Op0, Op1);
+    return SDValue();
+  }
+
+  // Both amounts are constants that add up to Bits.
+  if (auto *AmtBC = dyn_cast<ConstantSDNode>(AmtB)) {
+    auto *AmtAC = dyn_cast<ConstantSDNode>(AmtA);
+    if (AmtAC && (AmtAC->getSExtValue() + AmtBC->getSExtValue()) == Bits)
+      return BuildDoubleShift(ShlSide.getOperand(0), SrlSide.getOperand(0));
+    return SDValue();
+  }
+
+  // Second amount is XOR(C, Bits - 1), applied to a value pre-shifted by one.
+  if (AmtB.getOpcode() != ISD::XOR)
+    return SDValue();
+  auto *MaskC = dyn_cast<ConstantSDNode>(AmtB.getOperand(1));
+  if (!MaskC)
+    return SDValue();
+
+  const unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
+  SDValue XorLHS = StripTrunc(AmtB.getOperand(0));
+  if (MaskC->getSExtValue() != (Bits - 1) || XorLHS != AmtA)
+    return SDValue();
+
+  if (Op1.getOpcode() == InnerShift &&
+      isa<ConstantSDNode>(Op1.getOperand(1)) &&
+      Op1.getConstantOperandVal(1) == 1)
+    return BuildDoubleShift(Op0, Op1.getOperand(0));
+
+  // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
+  if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
+      Op1.getOperand(0) == Op1.getOperand(1))
+    return BuildDoubleShift(Op0, Op1.getOperand(0));
+
+  return SDValue();
+}
+
+/// Rewrite the integer-abs idiom XOR(ADD(X, Y), Y), where Y is
+/// SRA(X, size(X)-1), as an X86ISD::SUB computing 0 - X followed by an
+/// X86ISD::CMOV on COND_GE. Returns an empty SDValue if \p N does not match.
+static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  // X86 has no CMOV for 8-bit integers, so 8-bit abs is left alone.
+  if (VT.isInteger() && VT.getSizeInBits() == 8)
+    return SDValue();
+
+  SDValue AddNode = N->getOperand(0);
+  SDValue SignSmear = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Match XOR(ADD(X, Y), Y) with Y == SRA(X, <amount>).
+  if (!VT.isInteger() || N->getOpcode() != ISD::XOR)
+    return SDValue();
+  if (AddNode.getOpcode() != ISD::ADD || AddNode.getOperand(1) != SignSmear)
+    return SDValue();
+  if (SignSmear.getOpcode() != ISD::SRA ||
+      SignSmear.getOperand(0) != AddNode.getOperand(0))
+    return SDValue();
+
+  // The shift amount must be the constant size(X)-1.
+  auto *ShAmtC = dyn_cast<ConstantSDNode>(SignSmear.getOperand(1));
+  if (!ShAmtC || ShAmtC->getAPIntValue() != VT.getSizeInBits() - 1)
+    return SDValue();
+
+  // Generate SUB & CMOV. Result 1 of the SUB is passed as the CMOV's last
+  // operand.
+  SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+                            DAG.getConstant(0, DL, VT), AddNode.getOperand(0));
+  SDValue CMovOps[] = {AddNode.getOperand(0), Neg,
+                       DAG.getConstant(X86::COND_GE, DL, MVT::i8),
+                       SDValue(Neg.getNode(), 1)};
+  return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), CMovOps);
+}
+
+/// Try to turn tests against the signbit in the form of:
+///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
+/// into:
+///   SETGT(X, -1)
+/// Returns an empty SDValue if \p N does not have that shape.
+static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
+  // This is only worth doing if the output type is i8 or i1.
+  EVT ResultType = N->getValueType(0);
+  const bool IsNarrowResult = ResultType == MVT::i8 || ResultType == MVT::i1;
+  if (!IsNarrowResult)
+    return SDValue();
+
+  SDValue Trunc = N->getOperand(0);
+  SDValue XorRHS = N->getOperand(1);
+
+  // The xor must be of a single-use truncate against the constant one.
+  if (Trunc.getOpcode() != ISD::TRUNCATE || !Trunc.hasOneUse())
+    return SDValue();
+  if (!isOneConstant(XorRHS))
+    return SDValue();
+
+  // SetCC on x86 zero extends so only act on this if it's a logical shift.
+  SDValue Srl = Trunc.getOperand(0);
+  if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse())
+    return SDValue();
+
+  // The truncate source must be one of i16, i32 or i64.
+  EVT SrlVT = Srl.getValueType();
+  const bool IsWideSource =
+      SrlVT == MVT::i16 || SrlVT == MVT::i32 || SrlVT == MVT::i64;
+  if (!IsWideSource)
+    return SDValue();
+
+  // The shift must move the sign bit down to bit 0.
+  auto *AmtC = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
+  if (!AmtC || AmtC->getZExtValue() != SrlVT.getSizeInBits() - 1)
+    return SDValue();
+
+  // Create a greater-than comparison against -1.
+  // N.B. Using SETGE against 0 works but we want a canonical looking
+  // comparison, using SETGT matches up with what TranslateX86CC.
+  SDLoc DL(N);
+  SDValue Src = Srl.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                    ResultType);
+  SDValue MinusOne = DAG.getConstant(-1, DL, SrcVT);
+  SDValue Cond = DAG.getSetCC(DL, CCVT, Src, MinusOne, ISD::SETGT);
+  if (CCVT == ResultType)
+    return Cond;
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
+}
+
+/// Turn vector tests of the signbit in the form of:
+///   xor (sra X, elt_size(X)-1), -1
+/// into:
+///   pcmpgt X, -1
+///
+/// This should be called before type legalization because the pattern may not
+/// persist after that.
+static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+                                         const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isSimple())
+    return SDValue();
+
+  // Each supported vector type requires a minimum subtarget feature.
+  bool HasCompare = false;
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+    HasCompare = Subtarget.hasSSE2();
+    break;
+  case MVT::v2i64:
+    HasCompare = Subtarget.hasSSE42();
+    break;
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+  case MVT::v4i64:
+    HasCompare = Subtarget.hasAVX2();
+    break;
+  default:
+    break;
+  }
+  if (!HasCompare)
+    return SDValue();
+
+  // The xor must be a 'not' (all-ones RHS) of a single-use arithmetic shift
+  // right.
+  SDValue Sra = N->getOperand(0);
+  SDValue AllOnes = N->getOperand(1);
+  if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse() ||
+      !ISD::isBuildVectorAllOnes(AllOnes.getNode()))
+    return SDValue();
+
+  // The shift amount must be a constant splat of elt_size - 1, i.e. the shift
+  // smears the sign bit across each element.
+  auto *AmtBV = dyn_cast<BuildVectorSDNode>(Sra.getOperand(1));
+  if (!AmtBV)
+    return SDValue();
+
+  const unsigned EltBits =
+      Sra.getValueType().getVectorElementType().getSizeInBits();
+  auto *SplatAmt = AmtBV->getConstantSplatNode();
+  if (!SplatAmt || SplatAmt->getZExtValue() != EltBits - 1)
+    return SDValue();
+
+  // Create a greater-than comparison against -1. We don't use the more obvious
+  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
+  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Sra.getOperand(0), AllOnes);
+}
+
+/// Detect the rounding-average pattern c = (a + b + 1) / 2 on vectors of
+/// unsigned i8/i16 (computed in a wider element type) and replace it with an
+/// X86ISD::AVG node of type VT. Returns an empty SDValue when In does not match.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ const SDLoc &DL) {
+ if (!VT.isVector() || !VT.isSimple())
+ return SDValue();
+ EVT InVT = In.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ EVT ScalarVT = VT.getVectorElementType();
+ if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+ isPowerOf2_32(NumElems)))
+ return SDValue();
+
+ // InScalarVT is the intermediate type in AVG pattern and it should be greater
+ // than the original input type (i8/i16).
+ EVT InScalarVT = InVT.getVectorElementType();
+ if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+ return SDValue();
+
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+ if (Subtarget.hasBWI()) { // Widest accepted VT: 512 bits with BWI, 256 with AVX2, else 128.
+ if (VT.getSizeInBits() > 512)
+ return SDValue();
+ } else if (Subtarget.hasAVX2()) {
+ if (VT.getSizeInBits() > 256)
+ return SDValue();
+ } else {
+ if (VT.getSizeInBits() > 128)
+ return SDValue();
+ }
+
+ // Detect the following pattern:
+ //
+ // %1 = zext <N x i8> %a to <N x i32>
+ // %2 = zext <N x i8> %b to <N x i32>
+ // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+ // %4 = add nuw nsw <N x i32> %3, %2
+ // %5 = lshr <N x i32> %4, <i32 1 x N>
+ // %6 = trunc <N x i32> %5 to <N x i8>
+ //
+ // In AVX512, the last instruction can also be a trunc store.
+
+ if (In.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ // A lambda checking the given SDValue is a constant vector and each element
+ // is in the range [Min, Max].
+ auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+ BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV || !BV->isConstant())
+ return false;
+ for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+ if (!C)
+ return false;
+ uint64_t Val = C->getZExtValue();
+ if (Val < Min || Val > Max)
+ return false;
+ }
+ return true;
+ };
+
+ // Check that each element of the vector is logically right-shifted by one.
+ auto LHS = In.getOperand(0);
+ auto RHS = In.getOperand(1);
+ if (!IsConstVectorInRange(RHS, 1, 1))
+ return SDValue();
+ if (LHS.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // Detect a pattern of a + b + 1 where the order doesn't matter.
+ SDValue Operands[3];
+ Operands[0] = LHS.getOperand(0);
+ Operands[1] = LHS.getOperand(1);
+
+ // Take care of the case when one of the operands is a constant vector whose
+ // elements are in the range [1, 256] for i8 or [1, 65536] for i16.
+ if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+ Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+ Operands[0].getOperand(0).getValueType() == VT) {
+ // The pattern is detected. Subtract one from the constant vector, then
+ // demote it and emit X86ISD::AVG instruction.
+ SDValue VecOnes = DAG.getConstant(1, DL, InVT);
+ Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
+ Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+ return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+ Operands[1]);
+ }
+
+ if (Operands[0].getOpcode() == ISD::ADD)
+ std::swap(Operands[0], Operands[1]);
+ else if (Operands[1].getOpcode() != ISD::ADD)
+ return SDValue();
+ Operands[2] = Operands[1].getOperand(0); // Split the inner add into its two addends.
+ Operands[1] = Operands[1].getOperand(1);
+
+ // Now we have three operands of two additions. Check that one of them is a
+ // constant vector with ones, and the other two are promoted from i8/i16.
+ for (int i = 0; i < 3; ++i) {
+ if (!IsConstVectorInRange(Operands[i], 1, 1))
+ continue;
+ std::swap(Operands[i], Operands[2]); // Park the all-ones constant in slot 2.
+
+ // Check if Operands[0] and Operands[1] are results of type promotion.
+ for (int j = 0; j < 2; ++j)
+ if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+ Operands[j].getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ // The pattern is detected, emit X86ISD::AVG instruction.
+ return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+ Operands[1].getOperand(0));
+ }
+
+ return SDValue();
+}
+
+/// Combine for ISD::LOAD: split a non-extending 256-bit vector load that the
+/// target allows but reports as not fast into two 16-byte loads whose results
+/// are inserted into the halves of a 256-bit vector. The two output chains
+/// are joined with a TokenFactor. Returns an empty SDValue when the load is
+/// left alone.
+static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           const X86Subtarget &Subtarget) {
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  EVT RegVT = Ld->getValueType(0);
+  EVT MemVT = Ld->getMemoryVT();
+  SDLoc dl(Ld);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+  // into two 16-byte operations.
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+  bool Fast;
+  unsigned AddressSpace = Ld->getAddressSpace();
+  unsigned Alignment = Ld->getAlignment();
+  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
+      Ext == ISD::NON_EXTLOAD &&
+      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
+                             AddressSpace, Alignment, &Fast) && !Fast) {
+    unsigned NumElems = RegVT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Ptr = Ld->getBasePtr();
+
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                                  NumElems/2);
+    // Low half: same address, pointer info and alignment as the original.
+    SDValue Load1 =
+        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+                    Alignment, Ld->getMemOperand()->getFlags());
+
+    // High half: 16 bytes further on. The pointer info must carry the same
+    // offset as the address; otherwise this load is described as accessing
+    // the first 16 bytes again.
+    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
+    SDValue Load2 =
+        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                    Ld->getPointerInfo().getWithOffset(16),
+                    std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             Load1.getValue(1),
+                             Load2.getValue(1));
+
+    SDValue NewVec = DAG.getUNDEF(RegVT);
+    NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
+    NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
+    return DCI.CombineTo(N, NewVec, TF, true);
+  }
+
+  return SDValue();
+}
+
+/// Return the operand index of the single true element of \p V when V is a
+/// build vector of i1 constants with exactly one all-ones element (undef
+/// elements are ignored). Return -1 in every other case.
+static int getOneTrueElt(SDValue V) {
+  // This needs to be a build vector of booleans.
+  // TODO: Checking for the i1 type matches the IR definition for the mask,
+  // but the mask check could be loosened to i8 or other types. That might
+  // also require checking more than 'allOnesValue'; eg, the x86 HW
+  // instructions only require that the MSB is set for each mask element.
+  // The ISD::MSTORE comments/definition do not specify how the mask operand
+  // is formatted.
+  auto *MaskBV = dyn_cast<BuildVectorSDNode>(V);
+  if (!MaskBV)
+    return -1;
+  EVT MaskVT = MaskBV->getValueType(0);
+  if (MaskVT.getVectorElementType() != MVT::i1)
+    return -1;
+
+  int FoundAt = -1;
+  for (unsigned Idx = 0, E = MaskVT.getVectorNumElements(); Idx != E; ++Idx) {
+    const SDValue &Elt = MaskBV->getOperand(Idx);
+    if (Elt.isUndef())
+      continue;
+    // Any non-constant element disqualifies the mask.
+    auto *EltC = dyn_cast<ConstantSDNode>(Elt);
+    if (!EltC)
+      return -1;
+    if (!EltC->getAPIntValue().isAllOnesValue())
+      continue;
+    // A second true element is one too many.
+    if (FoundAt >= 0)
+      return -1;
+    FoundAt = Idx;
+  }
+  return FoundAt;
+}
+
+/// For a masked load/store whose mask has exactly one true element (as
+/// determined by getOneTrueElt), fill in \p Addr with the address of that
+/// scalar element, \p Index with its vector index as a pointer-sized constant,
+/// and \p Alignment with the alignment for the scalar access, then return
+/// true. Return false, leaving the outputs untouched, otherwise.
+static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
+                                         SelectionDAG &DAG, SDValue &Addr,
+                                         SDValue &Index, unsigned &Alignment) {
+  const int OneIdx = getOneTrueElt(MaskedOp->getMask());
+  if (OneIdx < 0)
+    return false;
+
+  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
+
+  // The element lives OneIdx store-sized slots past the base pointer; element
+  // zero needs no address arithmetic.
+  Addr = MaskedOp->getBasePtr();
+  if (OneIdx != 0) {
+    unsigned ByteOffset = OneIdx * EltVT.getStoreSize();
+    Addr = DAG.getMemBasePlusOffset(Addr, ByteOffset, SDLoc(MaskedOp));
+  }
+
+  Index = DAG.getIntPtrConstant(OneIdx, SDLoc(MaskedOp));
+  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
+  return true;
+}
+
+/// If exactly one element of the mask is set for a non-extending masked load,
+/// it is a scalar load and vector insert.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
+/// Returns the result of DCI.CombineTo on success, or an empty SDValue when
+/// the mask does not have exactly one true element.
+static SDValue
+reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI) {
+  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+  // However, some target hooks may need to be added to know when the transform
+  // is profitable. Endianness would also have to be considered.
+
+  SDValue Addr, VecIndex;
+  unsigned Alignment;
+  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
+    return SDValue();
+
+  // Load the one scalar element that is specified by the mask using the
+  // appropriate offset from the base pointer.
+  SDLoc DL(ML);
+  EVT VT = ML->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+
+  // Addr was advanced by (index * element store size) bytes from the base
+  // pointer; the pointer info must be advanced by the same amount so that it
+  // describes the bytes actually loaded. VecIndex is the constant built by
+  // getParamsForOneTrueMaskedElt.
+  uint64_t ByteOffset =
+      cast<ConstantSDNode>(VecIndex)->getZExtValue() *
+      ML->getMemoryVT().getVectorElementType().getStoreSize();
+  SDValue Load =
+      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
+                  ML->getPointerInfo().getWithOffset(ByteOffset), Alignment,
+                  ML->getMemOperand()->getFlags());
+
+  // Insert the loaded element into the appropriate place in the vector.
+  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
+                               Load, VecIndex);
+  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
+}
+
+/// Simplify a masked load whose mask is a build vector of constants. If the
+/// first and last mask elements are both non-zero, use a full vector load plus
+/// a select; otherwise re-emit the masked load with an undef pass-through and
+/// select against the original pass-through. Returns an empty SDValue when
+/// the mask is not constant or the pass-through is already undef.
+static SDValue
+combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+                              TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue MaskVal = ML->getMask();
+  if (!ISD::isBuildVectorOfConstantSDNodes(MaskVal.getNode()))
+    return SDValue();
+
+  SDLoc DL(ML);
+  EVT VT = ML->getValueType(0);
+  SDValue PassThru = ML->getSrc0();
+
+  // If we are loading the first and last elements of a vector, it is safe and
+  // always faster to load the whole vector. Replace the masked load with a
+  // vector load and select.
+  const unsigned LastIdx = VT.getVectorNumElements() - 1;
+  auto *MaskBV = cast<BuildVectorSDNode>(MaskVal);
+  const bool TouchesFirst = !isNullConstant(MaskBV->getOperand(0));
+  const bool TouchesLast = !isNullConstant(MaskBV->getOperand(LastIdx));
+  if (TouchesFirst && TouchesLast) {
+    SDValue FullLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+                                 ML->getMemOperand());
+    SDValue Sel = DAG.getSelect(DL, VT, MaskVal, FullLd, PassThru);
+    return DCI.CombineTo(ML, Sel, FullLd.getValue(1), true);
+  }
+
+  // Otherwise convert to a masked load and a select. This allows the select
+  // operation to use a faster kind of select instruction (for example,
+  // vblendvps -> vblendps).
+
+  // An undef pass-through is exactly what this transform produces; doing it
+  // again would loop forever.
+  if (PassThru.isUndef())
+    return SDValue();
+
+  // The new masked load has an undef pass-through operand. The select uses the
+  // original pass-through operand.
+  SDValue UndefLd = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+                                      MaskVal, DAG.getUNDEF(VT),
+                                      ML->getMemoryVT(), ML->getMemOperand(),
+                                      ML->getExtensionType());
+  SDValue Sel = DAG.getSelect(DL, VT, MaskVal, UndefLd, PassThru);
+
+  return DCI.CombineTo(ML, Sel, UndefLd.getValue(1), true);
+}
+
+/// Combine for masked loads. Expanding loads are left alone. Non-extending
+/// loads are offered to reduceMaskedLoadToScalarLoad and, without AVX512, to
+/// combineMaskedLoadConstantMask. Sign-extending loads are rewritten as a
+/// non-extending masked load of a same-sized vector of the narrow element
+/// type followed by X86ISD::VSEXT. Everything else returns an empty SDValue.
+static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget &Subtarget) {
+  auto *ML = cast<MaskedLoadSDNode>(N);
+
+  // TODO: Expanding load with constant mask may be optimized as well.
+  if (ML->isExpandingLoad())
+    return SDValue();
+
+  const ISD::LoadExtType ExtKind = ML->getExtensionType();
+  if (ExtKind == ISD::NON_EXTLOAD) {
+    if (SDValue Scalar = reduceMaskedLoadToScalarLoad(ML, DAG, DCI))
+      return Scalar;
+    // TODO: Do some AVX512 subsets benefit from this transform?
+    if (!Subtarget.hasAVX512())
+      if (SDValue Blend = combineMaskedLoadConstantMask(ML, DAG, DCI))
+        return Blend;
+  }
+
+  if (ExtKind != ISD::SEXTLOAD)
+    return SDValue();
+
+  // Resolve extending loads.
+  EVT VT = ML->getValueType(0);
+  EVT MemVT = ML->getMemoryVT();
+  SDLoc dl(ML);
+  const unsigned NumElems = VT.getVectorNumElements();
+
+  assert(MemVT != VT && "Cannot extend to the same type");
+  const unsigned ToSz = VT.getScalarSizeInBits();
+  const unsigned FromSz = MemVT.getScalarSizeInBits();
+  // From/To sizes and ElemCount must be pow of two.
+  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+    "Unexpected size for extending masked load");
+
+  const unsigned SizeRatio = ToSz / FromSz;
+  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+  // The wide type has the narrow (memory) element type but as many bits as VT.
+  const unsigned WideNumElts = NumElems * SizeRatio;
+  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+                                   MemVT.getScalarType(), WideNumElts);
+  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+  // Builds a shuffle mask that moves wide element i * SizeRatio to position i
+  // for the first NumElems lanes and sets every remaining lane to Fill.
+  auto MakeCompactingMask = [&](int Fill) {
+    SmallVector<int, 16> Lanes(WideNumElts, Fill);
+    for (unsigned i = 0; i != NumElems; ++i)
+      Lanes[i] = i * SizeRatio;
+    return Lanes;
+  };
+
+  // Convert Src0 value; remaining lanes are undef (-1).
+  SDValue PassThru = ML->getSrc0();
+  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, PassThru);
+  if (!PassThru.isUndef()) {
+    // Can't shuffle using an illegal type.
+    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+           "WideVecVT should be legal");
+    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
+                                    DAG.getUNDEF(WideVecVT),
+                                    MakeCompactingMask(-1));
+  }
+
+  // Prepare the new mask.
+  SDValue Mask = ML->getMask();
+  SDValue NewMask;
+  if (Mask.getValueType() == VT) {
+    // Mask and original value have the same type. Compact it the same way;
+    // remaining lanes select element 0 of the second (all-zero) input.
+    NewMask = DAG.getBitcast(WideVecVT, Mask);
+    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+                                   DAG.getConstant(0, dl, WideVecVT),
+                                   MakeCompactingMask(WideNumElts));
+  } else {
+    // An i1 mask is widened by concatenating it with all-zero mask vectors.
+    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     WideNumElts);
+
+    const unsigned NumConcat = WideNumElts / VT.getVectorNumElements();
+    SmallVector<SDValue, 16> Parts(NumConcat,
+                                   DAG.getConstant(0, dl, Mask.getValueType()));
+    Parts[0] = Mask;
+
+    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Parts);
+  }
+
+  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, ML->getChain(),
+                                     ML->getBasePtr(), NewMask, WideSrc0,
+                                     ML->getMemoryVT(), ML->getMemOperand(),
+                                     ISD::NON_EXTLOAD);
+  SDValue Extended = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+  return DCI.CombineTo(N, Extended, WideLd.getValue(1), true);
+}
+
+/// If exactly one element of the mask is set for a non-truncating masked store,
+/// it is a vector extract and scalar store.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
+/// Returns the scalar store, or an empty SDValue when the mask does not have
+/// exactly one true element.
+static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
+                                              SelectionDAG &DAG) {
+  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+  // However, some target hooks may need to be added to know when the transform
+  // is profitable. Endianness would also have to be considered.
+
+  SDValue Addr, VecIndex;
+  unsigned Alignment;
+  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
+    return SDValue();
+
+  // Extract the one scalar element that is actually being stored.
+  SDLoc DL(MS);
+  EVT VT = MS->getValue().getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+                                MS->getValue(), VecIndex);
+
+  // Addr was advanced by (index * element store size) bytes from the base
+  // pointer; the pointer info must be advanced by the same amount so that it
+  // describes the bytes actually written. VecIndex is the constant built by
+  // getParamsForOneTrueMaskedElt.
+  uint64_t ByteOffset =
+      cast<ConstantSDNode>(VecIndex)->getZExtValue() *
+      MS->getMemoryVT().getVectorElementType().getStoreSize();
+
+  // Store that element at the appropriate offset from the base pointer.
+  return DAG.getStore(MS->getChain(), DL, Extract, Addr,
+                      MS->getPointerInfo().getWithOffset(ByteOffset),
+                      Alignment, MS->getMemOperand()->getFlags());
+}
+
+/// DAG combine for masked stores.
+///  - Compressing stores are left untouched.
+///  - Non-truncating stores are handed to reduceMaskedStoreToScalarStore().
+///  - Truncating stores that the target does not report as legal are turned
+///    into a shuffle that packs the narrow elements into the low lanes of the
+///    register, followed by a non-truncating masked store whose mask has been
+///    widened to the new element count.
+static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+
+  if (Mst->isCompressingStore())
+    return SDValue();
+
+  if (!Mst->isTruncatingStore())
+    return reduceMaskedStoreToScalarStore(Mst, DAG);
+
+  // Everything below resolves a truncating store.
+  SDValue StoredVal = Mst->getValue();
+  EVT VT = StoredVal.getValueType();
+  EVT StVT = Mst->getMemoryVT();
+  unsigned EltCount = VT.getVectorNumElements();
+  SDLoc dl(Mst);
+
+  assert(StVT != VT && "Cannot truncate to the same type");
+  unsigned SrcEltBits = VT.getScalarSizeInBits();
+  unsigned DstEltBits = StVT.getScalarSizeInBits();
+
+  // The truncating store is legal in some cases. For example
+  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
+  // are designated for truncate store.
+  // In this case we don't need any further transformations.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.isTruncStoreLegal(VT, StVT))
+    return SDValue();
+
+  // From/To sizes and ElemCount must be pow of two.
+  assert (isPowerOf2_32(EltCount * SrcEltBits * DstEltBits) &&
+    "Unexpected size for truncating masked store");
+  // We are going to use the original vector elt for storing.
+  // Accumulated smaller vector elements must be a multiple of the store size.
+  assert (((EltCount * SrcEltBits) % DstEltBits) == 0 &&
+    "Unexpected ratio for truncating masked store");
+
+  unsigned Ratio = SrcEltBits / DstEltBits;
+  unsigned WideEltCount = EltCount * Ratio;
+  assert(Ratio * EltCount * DstEltBits == VT.getSizeInBits());
+
+  // Same total width as VT, but counted in elements of the memory type; this
+  // is the type the shuffles operate on.
+  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+                                   WideEltCount);
+  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+  SDValue WideVec = DAG.getBitcast(WideVecVT, StoredVal);
+
+  // Lane i of the result takes narrow element i * Ratio; the rest is undef.
+  SmallVector<int, 16> PackMask(WideEltCount, -1);
+  for (unsigned Elt = 0; Elt != EltCount; ++Elt)
+    PackMask[Elt] = Elt * Ratio;
+
+  // Can't shuffle using an illegal type.
+  assert(TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");
+
+  SDValue TruncatedVal = DAG.getVectorShuffle(
+      WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), PackMask);
+
+  SDValue Mask = Mst->getMask();
+  SDValue NewMask;
+  if (Mask.getValueType() == VT) {
+    // Mask and original value have the same type: pack the mask with the same
+    // low-lane selection as the data, and point every remaining lane at the
+    // first element of the all-zero second shuffle operand.
+    SDValue WideMask = DAG.getBitcast(WideVecVT, Mask);
+    for (unsigned Elt = EltCount; Elt != WideEltCount; ++Elt)
+      PackMask[Elt] = WideEltCount;
+    NewMask = DAG.getVectorShuffle(WideVecVT, dl, WideMask,
+                                   DAG.getConstant(0, dl, WideVecVT),
+                                   PackMask);
+  } else {
+    // i1 mask: widen it by concatenating all-zero mask vectors after it.
+    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+    EVT NewMaskVT =
+        EVT::getVectorVT(*DAG.getContext(), MVT::i1, WideEltCount);
+
+    unsigned NumConcat = WideEltCount / EltCount;
+    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
+    SmallVector<SDValue, 16> Pieces(NumConcat, ZeroVal);
+    Pieces[0] = Mask;
+
+    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Pieces);
+  }
+
+  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
+                            Mst->getBasePtr(), NewMask, StVT,
+                            Mst->getMemOperand(), false);
+}
+
+/// Target-specific DAG combine for ISD::STORE nodes.
+///
+/// The following rewrites are tried in order:
+///  1. A 256-bit vector store that the target allows but reports as not fast
+///     is split into two 128-bit stores.
+///  2. A truncating vector store that is neither legal nor custom is replaced
+///     by an AVG-based store (if detectAVGPattern() matches) or by a shuffle
+///     that packs the narrow elements followed by one or more wide stores.
+///  3. A 64-bit load feeding directly into this store (MMX vector types, or
+///     i64 on a 32-bit target where f64 is usable) is re-issued as an
+///     i64/f64 load/store pair, or as two i32 load/store pairs.
+///  4. An i64 store of an EXTRACT_VECTOR_ELT on a 32-bit target with usable
+///     f64 is re-issued as an f64 extract and store.
+///
+/// Every store created at a non-zero offset from the original address gets a
+/// MachinePointerInfo shifted by that offset and an alignment clamped with
+/// MinAlign, so alias information and alignment stay truthful.
+///
+/// \returns the replacement value, or an empty SDValue if nothing applies.
+static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
+                            const X86Subtarget &Subtarget) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT VT = St->getValue().getValueType();
+  EVT StVT = St->getMemoryVT();
+  SDLoc dl(St);
+  SDValue StoredVal = St->getOperand(1);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we are saving a concatenation of two XMM registers and 32-byte stores
+  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
+  bool Fast;
+  unsigned AddressSpace = St->getAddressSpace();
+  unsigned Alignment = St->getAlignment();
+  if (VT.is256BitVector() && StVT == VT &&
+      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                             AddressSpace, Alignment, &Fast) &&
+      !Fast) {
+    unsigned NumElems = VT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
+    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
+
+    SDValue Ptr0 = St->getBasePtr();
+    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
+
+    SDValue Ch0 =
+        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
+                     Alignment, St->getMemOperand()->getFlags());
+    // The upper half lives 16 bytes past the base, so its pointer info must
+    // carry that offset as well.
+    SDValue Ch1 =
+        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+                     St->getPointerInfo().getWithOffset(16),
+                     std::min(16U, Alignment), St->getMemOperand()->getFlags());
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+  }
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.
+  // First, pack all of the elements in one place. Next, store to memory
+  // in fewer chunks.
+  if (St->isTruncatingStore() && VT.isVector()) {
+    // Check if we can detect an AVG pattern from the truncation. If yes,
+    // replace the trunc store by a normal store with the result of X86ISD::AVG
+    // instruction.
+    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
+                                       Subtarget, dl))
+      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+                          St->getPointerInfo(), St->getAlignment(),
+                          St->getMemOperand()->getFlags());
+
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromSz = VT.getScalarSizeInBits();
+    unsigned ToSz = StVT.getScalarSizeInBits();
+
+    // The truncating store is legal in some cases. For example
+    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
+    // are designated for truncate store.
+    // In this case we don't need any further transformations.
+    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
+      return SDValue();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+    unsigned SizeRatio  = FromSz / ToSz;
+
+    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StVT.getScalarType(), NumElems*SizeRatio);
+
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                         DAG.getUNDEF(WideVecVT),
+                                         ShuffleVec);
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (MVT Tp : MVT::integer_valuetypes()) {
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
+        StoreType = Tp;
+    }
+
+    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+        (64 <= NumElems * ToSz))
+      StoreType = MVT::f64;
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Ptr = St->getBasePtr();
+
+    // Perform one or more big stores into memory. Chunk i is written at byte
+    // offset i * StoreBytes from the base, so both the pointer info and the
+    // alignment have to be derived from that offset (MinAlign(A, 0) == A, so
+    // the first chunk keeps the original alignment).
+    unsigned StoreBytes = StoreType.getStoreSize();
+    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
+      unsigned Offset = i * StoreBytes;
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(i, dl));
+      SDValue Ch =
+          DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+                       St->getPointerInfo().getWithOffset(Offset),
+                       MinAlign(St->getAlignment(), Offset),
+                       St->getMemOperand()->getFlags());
+      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreBytes, dl);
+      Chains.push_back(Ch);
+    }
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+  }
+
+  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
+  // the FP state in cases where an emms may be missing.
+  // A preferable solution to the general problem is to figure out the right
+  // places to insert EMMS.  This qualifies as a quick hack.
+
+  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
+  if (VT.getSizeInBits() != 64)
+    return SDValue();
+
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
+  bool F64IsLegal =
+      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
+  if ((VT.isVector() ||
+       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
+      isa<LoadSDNode>(St->getValue()) &&
+      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
+      St->getChain().hasOneUse() && !St->isVolatile()) {
+    SDNode* LdVal = St->getValue().getNode();
+    LoadSDNode *Ld = nullptr;
+    int TokenFactorIndex = -1;
+    SmallVector<SDValue, 8> Ops;
+    SDNode* ChainVal = St->getChain().getNode();
+    // Must be a store of a load.  We currently handle two cases:  the load
+    // is a direct child, and it's under an intervening TokenFactor.  It is
+    // possible to dig deeper under nested TokenFactors.
+    if (ChainVal == LdVal)
+      Ld = cast<LoadSDNode>(St->getChain());
+    else if (St->getValue().hasOneUse() &&
+             ChainVal->getOpcode() == ISD::TokenFactor) {
+      // Remember every TokenFactor operand except the load itself; the new
+      // load chain(s) are appended to this list below.
+      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
+        if (ChainVal->getOperand(i).getNode() == LdVal) {
+          TokenFactorIndex = i;
+          Ld = cast<LoadSDNode>(St->getValue());
+        } else
+          Ops.push_back(ChainVal->getOperand(i));
+      }
+    }
+
+    if (!Ld || !ISD::isNormalLoad(Ld))
+      return SDValue();
+
+    // If this is not the MMX case, i.e. we are just turning i64 load/store
+    // into f64 load/store, avoid the transformation if there are multiple
+    // uses of the loaded value.
+    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+      return SDValue();
+
+    SDLoc LdDL(Ld);
+    SDLoc StDL(N);
+    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+    // pair instead.
+    if (Subtarget.is64Bit() || F64IsLegal) {
+      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
+      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
+                                  Ld->getPointerInfo(), Ld->getAlignment(),
+                                  Ld->getMemOperand()->getFlags());
+      SDValue NewChain = NewLd.getValue(1);
+      if (TokenFactorIndex >= 0) {
+        Ops.push_back(NewChain);
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
+      }
+      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
+                          St->getPointerInfo(), St->getAlignment(),
+                          St->getMemOperand()->getFlags());
+    }
+
+    // Otherwise, lower to two pairs of 32-bit loads / stores.
+    SDValue LoAddr = Ld->getBasePtr();
+    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
+
+    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+                               Ld->getPointerInfo(), Ld->getAlignment(),
+                               Ld->getMemOperand()->getFlags());
+    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+                               Ld->getPointerInfo().getWithOffset(4),
+                               MinAlign(Ld->getAlignment(), 4),
+                               Ld->getMemOperand()->getFlags());
+
+    // Both stores must be ordered after both loads. Join the chain results
+    // (value #1) of the two loads, together with any operands of the original
+    // TokenFactor collected in Ops (Ops is empty when the load was the direct
+    // chain of the store).
+    Ops.push_back(LoLd.getValue(1));
+    Ops.push_back(HiLd.getValue(1));
+    SDValue NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
+
+    LoAddr = St->getBasePtr();
+    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
+
+    SDValue LoSt =
+        DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
+                     St->getAlignment(), St->getMemOperand()->getFlags());
+    SDValue HiSt = DAG.getStore(
+        NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
+        MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
+    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+  }
+
+  // This is similar to the above case, but here we handle a scalar 64-bit
+  // integer store that is extracted from a vector on a 32-bit target.
+  // If we have SSE2, then we can treat it like a floating-point double
+  // to get past legalization. The execution dependencies fixup pass will
+  // choose the optimal machine instruction for the store if this really is
+  // an integer or v2f32 rather than an f64.
+  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
+      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue OldExtract = St->getOperand(1);
+    SDValue ExtOp0 = OldExtract.getOperand(0);
+    unsigned VecSize = ExtOp0.getValueSizeInBits();
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
+    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
+    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                                     BitCast, OldExtract.getOperand(1));
+    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags());
+  }
+
+  return SDValue();
+}
+
+/// Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS. A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector. For example, if
+/// A = < float a0, float a1, float a2, float a3 >
+/// and
+/// B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+///
+/// \p LHS and \p RHS are in/out parameters; they are only overwritten on the
+/// path that returns 'true'.
+/// \p IsCommutative additionally accepts each pair of mask indices in swapped
+/// order (LHS picks the odd element, RHS the even one).
+/// Only 128-bit and 256-bit simple vector types are supported (asserted).
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+ // Look for the following pattern: if
+ // A = < float a0, float a1, float a2, float a3 >
+ // B = < float b0, float b1, float b2, float b3 >
+ // and
+ // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+ // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+ // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+ // which is A horizontal-op B.
+
+ // At least one of the operands should be a vector shuffle.
+ if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+ RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return false;
+
+ MVT VT = LHS.getSimpleValueType();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for horizontal add/sub");
+
+ // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
+ // operate independently on 128-bit lanes.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+ assert((NumLaneElts % 2 == 0) &&
+ "Vector type should have an even number of elements in each lane");
+ unsigned HalfLaneElts = NumLaneElts/2;
+
+ // View LHS in the form
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // If LHS is not a shuffle then pretend it is the shuffle
+ // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+ // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+ // type VT.
+ SDValue A, B;
+ SmallVector<int, 16> LMask(NumElts);
+ if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (!LHS.getOperand(0).isUndef())
+ A = LHS.getOperand(0);
+ if (!LHS.getOperand(1).isUndef())
+ B = LHS.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), LMask.begin());
+ } else {
+ if (!LHS.isUndef())
+ A = LHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ LMask[i] = i;
+ }
+
+ // Likewise, view RHS in the form
+ // RHS = VECTOR_SHUFFLE C, D, RMask
+ SDValue C, D;
+ SmallVector<int, 16> RMask(NumElts);
+ if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (!RHS.getOperand(0).isUndef())
+ C = RHS.getOperand(0);
+ if (!RHS.getOperand(1).isUndef())
+ D = RHS.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), RMask.begin());
+ } else {
+ if (!RHS.isUndef())
+ C = RHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ RMask[i] = i;
+ }
+
+ // Check that the shuffles are both shuffling the same vectors.
+ if (!(A == C && B == D) && !(A == D && B == C))
+ return false;
+
+ // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+ if (!A.getNode() && !B.getNode())
+ return false;
+
+ // If A and B occur in reverse order in RHS, then "swap" them (which means
+ // rewriting the mask).
+ if (A != C)
+ ShuffleVectorSDNode::commuteMask(RMask);
+
+ // At this point LHS and RHS are equivalent to
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // RHS = VECTOR_SHUFFLE A, B, RMask
+ // Check that the masks correspond to performing a horizontal operation.
+ // 'l' is the index of the first element of the current 128-bit lane and 'i'
+ // is the position inside that lane.
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ int LIdx = LMask[i+l], RIdx = RMask[i+l];
+
+ // Ignore any UNDEF components.
+ // That covers negative mask entries as well as entries that select from
+ // an operand (A or B) that is itself UNDEF.
+ if (LIdx < 0 || RIdx < 0 ||
+ (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+ (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+ continue;
+
+ // Check that successive elements are being operated on. If not, this is
+ // not a horizontal operation.
+ // Src is 0 for the first half of the lane (indices into A) and 1 for the
+ // second half (indices offset by NumElts, i.e. into B). Index is the even
+ // element of the expected pair; its partner is Index + 1.
+ unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
+ int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
+ if (!(LIdx == Index && RIdx == Index + 1) &&
+ !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
+ return false;
+ }
+ }
+
+ LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+ RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+ return true;
+}
+
+/// Target-specific DAG combine for FADD/FSUB: when the subtarget supports
+/// horizontal FP ops for the result type and isHorizontalBinOp() recognizes
+/// the operands, emit X86ISD::FHADD (for FADD) or X86ISD::FHSUB (for FSUB).
+/// Returns an empty SDValue otherwise.
+static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  const bool IsFadd = N->getOpcode() == ISD::FADD;
+  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
+
+  // Horizontal ops exist for 128-bit FP vectors with SSE3 and for 256-bit FP
+  // vectors when 256-bit FP is available.
+  EVT VT = N->getValueType(0);
+  const bool Ok128 =
+      Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64);
+  const bool Ok256 =
+      !Ok128 && Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64);
+  if (!Ok128 && !Ok256)
+    return SDValue();
+
+  // Try to synthesize horizontal add/sub from adds/subs of shuffles. FADD is
+  // the commutative case; on success the operands are replaced by the shuffle
+  // sources.
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (!isHorizontalBinOp(Op0, Op1, IsFadd))
+    return SDValue();
+
+  unsigned HorizOpc = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
+  return DAG.getNode(HorizOpc, SDLoc(N), VT, Op0, Op1);
+}
+
+/// Truncate a group of 128-bit registers (v4i32 or v2i64, as asserted below)
+/// to i8 or i16 elements using X86ISD::PACKUS.
+/// \p Regs is modified in place; the result has the type of \p N.
+static SDValue
+combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
+                             Regs[0].getValueType() == MVT::v2i64));
+  EVT OutVT = N->getValueType(0);
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InVT = Regs[0].getValueType();
+  EVT InSVT = InVT.getVectorElementType();
+  SDLoc DL(N);
+
+  // First, use mask to unset all bits that won't appear in the result.
+  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
+         "OutSVT can only be either i8 or i16.");
+  const unsigned InEltBits = InSVT.getSizeInBits();
+  const unsigned OutEltBits = OutSVT.getSizeInBits();
+  SDValue KeepLowBits =
+      DAG.getConstant(APInt::getLowBitsSet(InEltBits, OutEltBits), DL, InVT);
+  for (SDValue &R : Regs)
+    R = DAG.getNode(ISD::AND, DL, InVT, KeepLowBits, R);
+
+  // Each PACKUS step consumes two UnpackedVT registers and yields a PackedVT.
+  const bool ByteResult = OutSVT == MVT::i8;
+  MVT UnpackedVT = ByteResult ? MVT::v8i16 : MVT::v4i32;
+  MVT PackedVT = ByteResult ? MVT::v16i8 : MVT::v8i16;
+
+  // In each iteration, truncate the type by a half size: the live registers
+  // are reinterpreted as UnpackedVT and packed pairwise, which halves their
+  // number.
+  auto Live = Regs.size();
+  for (unsigned Step = 1, End = InEltBits / OutEltBits; Step < End;
+       Step *= 2) {
+    for (unsigned Idx = 0; Idx < Live; ++Idx)
+      Regs[Idx] = DAG.getBitcast(UnpackedVT, Regs[Idx]);
+    Live /= 2;
+    for (unsigned Idx = 0; Idx < Live; ++Idx)
+      Regs[Idx] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[2 * Idx],
+                              Regs[2 * Idx + 1]);
+  }
+
+  // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
+  // then extract a subvector as the result since v8i8 is not a legal type.
+  if (OutVT == MVT::v8i8) {
+    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+                          DAG.getIntPtrConstant(0, DL));
+    return Regs[0];
+  }
+
+  // A single surviving register already is the result.
+  if (Live <= 1)
+    return Regs[0];
+
+  // Otherwise glue the surviving registers together.
+  Regs.resize(Live);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+}
+
+/// Truncate a group of v4i32 registers into v8i16 using X86ISD::PACKSS.
+/// \p Regs is modified in place; the result has the type of \p N.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+  EVT OutVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
+  SDValue Sixteen = DAG.getConstant(16, DL, MVT::i32);
+  for (SDValue &R : Regs) {
+    SDValue Shl =
+        getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, R, Sixteen, DAG);
+    R = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Shl, Sixteen, DAG);
+  }
+
+  // Pack neighbouring registers pairwise into the front of the list.
+  const unsigned NumPairs = Regs.size() / 2;
+  for (unsigned Pair = 0; Pair < NumPairs; ++Pair)
+    Regs[Pair] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[2 * Pair],
+                             Regs[2 * Pair + 1]);
+
+  // With at most two inputs the first slot already holds the result.
+  if (Regs.size() <= 2)
+    return Regs[0];
+
+  Regs.resize(Regs.size() / 2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+}
+
+/// Lower a vector truncation from vXi32/vXi64 to vXi8/vXi16 to
+/// X86ISD::PACKUS / X86ISD::PACKSS sequences. This is done as a combine
+/// because after type legalization the truncation will be translated into a
+/// BUILD_VECTOR with each element that is extracted from a vector and then
+/// truncated, and it is difficult to do this optimization based on them.
+/// Returns an empty SDValue when the truncation is not handled here.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  EVT InVT = In.getValueType();
+  if (!InVT.isSimple())
+    return SDValue();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
+  const bool SubtargetOk = Subtarget.hasSSE2() && !Subtarget.hasAVX2();
+  if (!SubtargetOk)
+    return SDValue();
+
+  const unsigned NumElems = OutVT.getVectorNumElements();
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  const bool From32 = InSVT == MVT::i32;
+  const bool From64 = InSVT == MVT::i64;
+  const bool To8 = OutSVT == MVT::i8;
+  const bool To16 = OutSVT == MVT::i16;
+
+  // Only i32/i64 -> i8/i16 with a power-of-two element count of at least 8.
+  if (!(From32 || From64) || !(To8 || To16) || !isPowerOf2_32(NumElems) ||
+      NumElems < 8)
+    return SDValue();
+
+  // SSSE3's pshufb results in less instructions in the cases below.
+  const bool PshufbWins = (To8 && !From64) || (From32 && To16);
+  if (Subtarget.hasSSSE3() && NumElems == 8 && PshufbWins)
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
+  const unsigned NumRegs = InVT.getSizeInBits() / 128;
+  const unsigned EltsPerReg = 128 / InSVT.getSizeInBits();
+  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, EltsPerReg);
+
+  SmallVector<SDValue, 8> SubVec;
+  for (unsigned Reg = 0; Reg < NumRegs; ++Reg)
+    SubVec.push_back(
+        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
+                    DAG.getIntPtrConstant(Reg * EltsPerReg, DL)));
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
+  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
+  // truncate 2 x v4i32 to v8i16.
+  if (Subtarget.hasSSE41() || To8)
+    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+  if (From32)
+    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+  return SDValue();
+}
+
+/// Truncate a vector whose elements are each entirely copies of their sign
+/// bit ('all or none' bits values) from vXi16/vXi32/vXi64 to
+/// vXi8/vXi16/vXi32 via truncateVectorCompareWithPACKSS().
+/// Returns an empty SDValue when the node does not qualify.
+static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
+                                               SelectionDAG &DAG,
+                                               const X86Subtarget &Subtarget) {
+  // Requires SSE2 but AVX512 has fast truncate.
+  if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
+    return SDValue();
+
+  EVT ResVT = N->getValueType(0);
+  if (!(ResVT.isVector() && ResVT.isSimple()))
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  MVT VT = ResVT.getSimpleVT();
+  MVT SVT = VT.getScalarType();
+
+  MVT InVT = In.getValueType().getSimpleVT();
+  MVT InSVT = InVT.getScalarType();
+
+  // Use PACKSS if the input is a splatted sign bit.
+  // e.g. Comparison result, sext_in_reg, etc.
+  if (DAG.ComputeNumSignBits(In) != InSVT.getSizeInBits())
+    return SDValue();
+
+  // Check we have a truncation suited for PACKSS.
+  const bool WidthOk = VT.is128BitVector() || VT.is256BitVector();
+  const bool DstEltOk =
+      SVT == MVT::i8 || SVT == MVT::i16 || SVT == MVT::i32;
+  const bool SrcEltOk =
+      InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64;
+  if (!WidthOk || !DstEltOk || !SrcEltOk)
+    return SDValue();
+
+  return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
+}
+
+/// Target-specific DAG combine for truncate nodes. Tries, in this order: the
+/// AVG pattern, an i32 read of a bitcast x86mmx value, a PACKSS-based
+/// truncation of sign-splatted vectors, and finally the generic
+/// PACKUS/PACKSS vector truncation.
+static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  SDLoc DL(N);
+
+  // Try to detect AVG pattern first.
+  SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL);
+  if (Avg)
+    return Avg;
+
+  // The bitcast source is a direct mmx result.
+  // Detect bitcasts between i32 to x86mmx
+  const bool IsI32OfBitcast = Src.getOpcode() == ISD::BITCAST && VT == MVT::i32;
+  if (IsI32OfBitcast) {
+    SDValue MMXVal = Src.getOperand(0);
+    if (MMXVal.getValueType() == MVT::x86mmx)
+      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, MMXVal);
+  }
+
+  // Try to truncate extended sign bits with PACKSS.
+  SDValue Packed = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget);
+  if (Packed)
+    return Packed;
+
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
+
+/// Returns the negated value if the node \p N flips sign of FP value, and an
+/// empty SDValue otherwise.
+///
+/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
+/// AVX512F does not have FXOR, so FNEG is lowered as
+/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
+/// In this case we go though all bitcasts.
+static SDValue isFNEG(SDNode *N) {
+  if (N->getOpcode() == ISD::FNEG)
+    return N->getOperand(0);
+
+  SDValue Xor = peekThroughBitcasts(SDValue(N, 0));
+  const unsigned XorOpc = Xor.getOpcode();
+  if (XorOpc != X86ISD::FXOR && XorOpc != ISD::XOR)
+    return SDValue();
+
+  SDValue SignOp = peekThroughBitcasts(Xor.getOperand(1));
+  if (!SignOp.getValueType().isFloatingPoint())
+    return SDValue();
+
+  SDValue Negated = peekThroughBitcasts(Xor.getOperand(0));
+
+  // There is more than one way to represent the same constant on
+  // the different X86 targets. The type of the node may also depend on size.
+  //  - load scalar value and broadcast
+  //  - BUILD_VECTOR node
+  //  - load from a constant pool.
+  // We check all variants here and collect the (splat) FP constant, if any.
+  const ConstantFP *SplatFP = nullptr;
+  if (SignOp.getOpcode() == X86ISD::VBROADCAST) {
+    if (auto *C = getTargetConstantFromNode(SignOp.getOperand(0)))
+      SplatFP = cast<ConstantFP>(C);
+  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(SignOp)) {
+    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
+      SplatFP = CN->getConstantFPValue();
+  } else if (auto *C = getTargetConstantFromNode(SignOp)) {
+    if (C->getType()->isVectorTy()) {
+      if (auto *SplatV = C->getSplatValue())
+        SplatFP = cast<ConstantFP>(SplatV);
+    } else {
+      SplatFP = dyn_cast<ConstantFP>(C);
+    }
+  }
+
+  if (!SplatFP)
+    return SDValue();
+
+  // The constant must consist of exactly the sign bit of one element.
+  const unsigned EltBits = SignOp.getScalarValueSizeInBits();
+  const bool IsSignBit =
+      SplatFP->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
+  return IsSignBit ? Negated : SDValue();
+}
+
+/// Target-specific DAG combine for floating point negations. \p N must be a
+/// node for which isFNEG() returns a value (asserted). A negated FMUL is
+/// turned into X86ISD::FNMSUB against zero when FMA is available and signed
+/// zeros may be ignored; a negated single-use FMA-family node gets the
+/// negation folded into its opcode. Returns an empty SDValue otherwise.
+static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
+                           const X86Subtarget &Subtarget) {
+  EVT OrigVT = N->getValueType(0);
+  SDValue Arg = isFNEG(N);
+  assert(Arg.getNode() && "N is expected to be an FNEG node");
+
+  EVT VT = Arg.getValueType();
+  EVT SVT = VT.getScalarType();
+  SDLoc DL(N);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
+  // If we're negating a FMUL node on a target with FMA, then we can avoid the
+  // use of a constant by performing (-0 - A*B) instead.
+  // FIXME: Check rounding control flags as well once it becomes available.
+  // Note: the opcode test must stay ahead of the flags query in this chain.
+  const bool IsF32OrF64 = SVT == MVT::f32 || SVT == MVT::f64;
+  if (Arg.getOpcode() == ISD::FMUL && IsF32OrF64 &&
+      Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
+    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+    SDValue Fused = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                                Arg.getOperand(1), Zero);
+    return DAG.getBitcast(OrigVT, Fused);
+  }
+
+  // If we're negating an FMA node, then we can adjust the
+  // instruction to include the extra negation. This is only done when the
+  // FMA has no other user.
+  if (!Arg.hasOneUse())
+    return SDValue();
+
+  unsigned NegatedOpc;
+  switch (Arg.getOpcode()) {
+  default:
+    // We can't handle scalar intrinsic node here because it would only
+    // invert one element and not the whole vector. But we could try to handle
+    // a negation of the lower element only.
+    return SDValue();
+  case X86ISD::FMADD:      NegatedOpc = X86ISD::FNMSUB;     break;
+  case X86ISD::FMSUB:      NegatedOpc = X86ISD::FNMADD;     break;
+  case X86ISD::FNMADD:     NegatedOpc = X86ISD::FMSUB;      break;
+  case X86ISD::FNMSUB:     NegatedOpc = X86ISD::FMADD;      break;
+  case X86ISD::FMADD_RND:  NegatedOpc = X86ISD::FNMSUB_RND; break;
+  case X86ISD::FMSUB_RND:  NegatedOpc = X86ISD::FNMADD_RND; break;
+  case X86ISD::FNMADD_RND: NegatedOpc = X86ISD::FMSUB_RND;  break;
+  case X86ISD::FNMSUB_RND: NegatedOpc = X86ISD::FMADD_RND;  break;
+  }
+
+  SDValue Replacement =
+      DAG.getNode(NegatedOpc, DL, VT, Arg.getNode()->ops());
+  return DAG.getBitcast(OrigVT, Replacement);
+}
+
+/// Rewrite a vector X86ISD::FOR/FXOR/FAND/FANDN node as the corresponding
+/// integer logic op on a vector of i64 of the same total width, bitcasting the
+/// operands and the result. Only done for vector types when SSE2 is
+/// available; otherwise an empty SDValue is returned.
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+  MVT VT = N->getSimpleValueType(0);
+
+  // If we have integer vector types available, use the integer opcodes.
+  if (!VT.isVector() || !Subtarget.hasSSE2())
+    return SDValue();
+
+  SDLoc dl(N);
+  MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+
+  SDValue IntLHS = DAG.getBitcast(IntVT, N->getOperand(0));
+  SDValue IntRHS = DAG.getBitcast(IntVT, N->getOperand(1));
+
+  // Map the FP logic opcode onto its integer counterpart.
+  unsigned IntOpcode;
+  switch (N->getOpcode()) {
+  case X86ISD::FAND:  IntOpcode = ISD::AND;       break;
+  case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP;  break;
+  case X86ISD::FOR:   IntOpcode = ISD::OR;        break;
+  case X86ISD::FXOR:  IntOpcode = ISD::XOR;       break;
+  default: llvm_unreachable("Unexpected FP logic op");
+  }
+
+  SDValue IntResult = DAG.getNode(IntOpcode, dl, IntVT, IntLHS, IntRHS);
+  return DAG.getBitcast(VT, IntResult);
+}
+
+/// Target-specific DAG combine for XOR nodes. The xor+shift-into-compare fold
+/// is tried unconditionally; every other fold only runs once operation
+/// legalization has started.
+static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
+                          TargetLowering::DAGCombinerInfo &DCI,
+                          const X86Subtarget &Subtarget) {
+  SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
+  if (Cmp)
+    return Cmp;
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue TruncShiftCmp = foldXorTruncShiftIntoCmp(N, DAG);
+  if (TruncShiftCmp)
+    return TruncShiftCmp;
+
+  // The integer-abs fold is only attempted when CMOV is available.
+  if (Subtarget.hasCMov()) {
+    SDValue Abs = combineIntegerAbs(N, DAG);
+    if (Abs)
+      return Abs;
+  }
+
+  SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget);
+  if (FPLogic)
+    return FPLogic;
+
+  // An XOR that flips an FP sign bit is handled like an FNEG.
+  return isFNEG(N) ? combineFneg(N, DAG, Subtarget) : SDValue();
+}
+
+
+/// Return true if \p V is accepted by isNullFPConstant, or if its node is
+/// accepted by ISD::isBuildVectorAllZeros.
+static bool isNullFPScalarOrVectorConst(SDValue V) {
+  if (isNullFPConstant(V))
+    return true;
+  return ISD::isBuildVectorAllZeros(V.getNode());
+}
+
+/// If \p V is a scalar FP zero or a vector FP zero (possibly containing
+/// undefined elements), return a zero constant that can be used to fold the
+/// value away; otherwise return an empty SDValue. For vectors a fresh zero
+/// vector is built, so the result has no undefined elements even if \p V
+/// does. That makes it usable as a replacement operand for operations (e.g.
+/// bitwise-and) where an undef should not propagate.
+static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  if (!isNullFPScalarOrVectorConst(V))
+    return SDValue();
+
+  // A scalar zero can be reused as is.
+  if (!V.getValueType().isVector())
+    return V;
+
+  return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
+}
+
+/// Fold a scalar X86ISD::FAND whose operand is (fxor X, all-ones) into
+/// X86ISD::FANDN. Only f32 (with SSE1) and f64 (with SSE2) are handled here.
+static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
+  const bool IsF32WithSSE1 = VT == MVT::f32 && Subtarget.hasSSE1();
+  const bool IsF64WithSSE2 = VT == MVT::f64 && Subtarget.hasSSE2();
+  if (!IsF32WithSSE1 && !IsF64WithSSE2)
+    return SDValue();
+
+  // Matches (fxor X, C) where C is an FP constant reported as all-ones.
+  auto IsFNot = [](SDValue V) {
+    if (V.getOpcode() != X86ISD::FXOR)
+      return false;
+    auto *C = dyn_cast<ConstantFPSDNode>(V.getOperand(1));
+    return C && C->getConstantFPValue()->isAllOnesValue();
+  };
+
+  // fand (fxor X, -1), Y --> fandn X, Y
+  if (IsFNot(LHS))
+    return DAG.getNode(X86ISD::FANDN, DL, VT, LHS.getOperand(0), RHS);
+
+  // fand X, (fxor Y, -1) --> fandn Y, X
+  if (IsFNot(RHS))
+    return DAG.getNode(X86ISD::FANDN, DL, VT, RHS.getOperand(0), LHS);
+
+  return SDValue();
+}
+
+/// Target-specific DAG combines for X86ISD::FAND nodes.
+static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
+                           const X86Subtarget &Subtarget) {
+  // FAND(0.0, x) -> 0.0 and FAND(x, 0.0) -> 0.0, checking operand 0 first.
+  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx)
+    if (SDValue Zero =
+            getNullFPConstForNullVal(N->getOperand(OpIdx), DAG, Subtarget))
+      return Zero;
+
+  // Next try to form an FANDN from an inverted operand.
+  SDValue AndN = combineFAndFNotToFAndn(N, DAG, Subtarget);
+  if (AndN)
+    return AndN;
+
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Target-specific DAG combines for X86ISD::FANDN nodes.
+static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
+                            const X86Subtarget &Subtarget) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // FANDN(0.0, x) -> x
+  if (isNullFPScalarOrVectorConst(Op0))
+    return Op1;
+
+  // FANDN(x, 0.0) -> 0.0
+  SDValue Zero = getNullFPConstForNullVal(Op1, DAG, Subtarget);
+  if (Zero)
+    return Zero;
+
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Target-specific DAG combines shared by X86ISD::FOR and X86ISD::FXOR nodes.
+static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
+                          const X86Subtarget &Subtarget) {
+  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // F[X]OR(0.0, x) -> x
+  if (isNullFPScalarOrVectorConst(Op0))
+    return Op1;
+
+  // F[X]OR(x, 0.0) -> x
+  if (isNullFPScalarOrVectorConst(Op1))
+    return Op0;
+
+  // If the node matches isFNEG, give the negation combine a chance; fall
+  // through when it produces nothing.
+  if (isFNEG(N)) {
+    SDValue Negated = combineFneg(N, DAG, Subtarget);
+    if (Negated)
+      return Negated;
+  }
+
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Target-specific DAG combines for X86ISD::FMIN and X86ISD::FMAX nodes.
+/// With UnsafeFPMath enabled, the node is replaced by the commutative
+/// FMINC/FMAXC form; otherwise nothing is done.
+static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+  // The rewrite is only performed in unsafe-math mode.
+  const bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
+  if (!UnsafeMath)
+    return SDValue();
+
+  // Map FMIN/FMAX onto FMINC/FMAXC, which are commutative operations.
+  unsigned CommutativeOpc = 0;
+  switch (N->getOpcode()) {
+  case X86ISD::FMAX:
+    CommutativeOpc = X86ISD::FMAXC;
+    break;
+  case X86ISD::FMIN:
+    CommutativeOpc = X86ISD::FMINC;
+    break;
+  default:
+    llvm_unreachable("unknown opcode");
+  }
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  return DAG.getNode(CommutativeOpc, SDLoc(N), N->getValueType(0), Op0, Op1);
+}
+
+/// Expand an FMAXNUM/FMINNUM node into an X86ISD::FMAX/FMIN followed by a
+/// select that replaces a NaN first operand with the second operand.
+/// Returns an empty SDValue for soft-float, for unsupported types, and for
+/// scalars in functions optimized for minimum size.
+static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  if (Subtarget.useSoftFloat())
+    return SDValue();
+
+  // TODO: Check for global or instruction-level "nnan". In that case, we
+  //       should be able to lower to FMAX/FMIN alone.
+  // TODO: If an operand is already known to be a NaN or not a NaN, this
+  //       should be an optional swap and FMAX/FMIN.
+
+  EVT VT = N->getValueType(0);
+  const bool SSE1Type =
+      Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32);
+  const bool SSE2Type =
+      Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64);
+  const bool AVXType =
+      Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64);
+  if (!SSE1Type && !SSE2Type && !AVXType)
+    return SDValue();
+
+  // The expansion takes at least 3 instructions, so prefer a library call for
+  // scalars when minimizing code size.
+  if (!VT.isVector() &&
+      DAG.getMachineFunction().getFunction()->optForMinSize())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDLoc DL(N);
+  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
+      DAG.getDataLayout(), *DAG.getContext(), VT);
+
+  // Required results for the four NaN/number input combinations:
+  //
+  //                   Op1
+  //               Num     NaN
+  //            ----------------
+  //       Num  |  Max  |  Op0 |
+  // Op0        ----------------
+  //       NaN  |  Op1  |  NaN |
+  //            ----------------
+  //
+  // The SSE FP max/min instructions were not designed for this; they
+  // implement:
+  //   Min = Op1 < Op0 ? Op1 : Op0
+  //   Max = Op1 > Op0 ? Op1 : Op0
+  //
+  // so they always return Op0 when either input is a NaN. They can still be
+  // used for fmaxnum/fminnum by selecting away a NaN input afterwards.
+
+  // The operands are passed swapped so that Op0 is the 2nd source operand,
+  // i.e. the one passed through when either operand is NaN.
+  const bool IsMax = N->getOpcode() == ISD::FMAXNUM;
+  auto MinMaxOp = IsMax ? X86ISD::FMAX : X86ISD::FMIN;
+  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
+
+  // An unordered self-compare is true exactly when Op0 is NaN.
+  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
+
+  // Select Op1 when Op0 is NaN, the min/max otherwise. If both operands are
+  // NaN, the NaN value of Op1 is the result.
+  auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+  return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
+}
+
+/// Simplify the bit-index operand of a BT node using the fact that BT ignores
+/// the high bits of that operand. Any simplification is committed through
+/// \p DCI; the function itself always returns an empty SDValue.
+static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
+                         TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue BitIdx = N->getOperand(1);
+
+  // Only attempt the simplification when the index has a single use.
+  if (!BitIdx.hasOneUse())
+    return SDValue();
+
+  // Only the low Log2(BitWidth) bits of the index are demanded.
+  unsigned BitWidth = BitIdx.getValueSizeInBits();
+  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                        !DCI.isBeforeLegalizeOps());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // SimplifyDemandedBits is only tried if shrinking the constant did nothing.
+  const bool Changed =
+      TLO.ShrinkDemandedConstant(BitIdx, DemandedMask) ||
+      TLI.SimplifyDemandedBits(BitIdx, DemandedMask, KnownZero, KnownOne, TLO);
+  if (Changed)
+    DCI.CommitTargetLoweringOpt(TLO);
+
+  return SDValue();
+}
+
+/// Combine a vector SIGN_EXTEND_INREG producing v4i64 whose input is an
+/// any/sign extension of a v4i32 value:
+///   (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
+///   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
+static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue Src = N->getOperand(0);
+  SDValue TypeOp = N->getOperand(1);
+  EVT ExtraVT = cast<VTSDNode>(TypeOp)->getVT();
+  SDLoc dl(N);
+
+  // SIGN_EXTEND_INREG to v4i64 is expensive on both SSE and AVX2 because
+  // there is no sign-extending shift right on vectors with 64-bit elements,
+  // so do the in-register extension on v4i32 and sign-extend afterwards.
+  if (VT != MVT::v4i64)
+    return SDValue();
+  const unsigned SrcOpc = Src.getOpcode();
+  if (SrcOpc != ISD::ANY_EXTEND && SrcOpc != ISD::SIGN_EXTEND)
+    return SDValue();
+
+  SDValue Inner = Src.getOperand(0);
+
+  // EXTLOAD has a better solution on AVX2: it may be replaced with an
+  // X86ISD::VSEXT node, so leave non-normal loads alone there.
+  if (Inner.getOpcode() == ISD::LOAD && Subtarget.hasInt256() &&
+      !ISD::isNormalLoad(Inner.getNode()))
+    return SDValue();
+
+  if (Inner.getValueType() != MVT::v4i32 || ExtraVT.getSizeInBits() >= 128)
+    return SDValue();
+
+  SDValue Narrow =
+      DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Inner, TypeOp);
+  return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Narrow);
+}
+
+/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
+/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
+/// Hoisting a sign/zero extension above a non-wrapping 'add' exposes
+/// opportunities to combine math ops, use an LEA, or use a complex addressing
+/// mode, which can eliminate extend, add, and shift instructions.
+/// Only i64 results with a constant second 'add' operand are handled.
+static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  const unsigned ExtOpc = Ext->getOpcode();
+  if (ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND)
+    return SDValue();
+
+  // TODO: This should be valid for other integer types.
+  EVT VT = Ext->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+
+  SDValue Add = Ext->getOperand(0);
+  if (Add.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  bool Sext = ExtOpc == ISD::SIGN_EXTEND;
+  bool NSW = Add->getFlags()->hasNoSignedWrap();
+  bool NUW = Add->getFlags()->hasNoUnsignedWrap();
+
+  // A 'sext' needs an 'add nsw' below it and a 'zext' needs an 'add nuw'.
+  const bool WrapFlagMatches = Sext ? NSW : NUW;
+  if (!WrapFlagMatches)
+    return SDValue();
+
+  // Requiring a constant operand on the 'add' ensures the instruction count
+  // does not grow, because the constant is extended for free below. A
+  // constant can also become the displacement field of an LEA.
+  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+  if (!AddOp1)
+    return SDValue();
+
+  // Don't widen the 'add' unless some user of the extension is an 'add' or a
+  // 'shl' that it could be combined with.
+  // TODO: It may be profitable to generate simpler LEA instructions in place
+  //       of single 'add' instructions, but the cost model for selecting an
+  //       LEA currently has a high threshold.
+  bool HasLEAPotential = false;
+  for (auto *User : Ext->uses()) {
+    const unsigned UserOpc = User->getOpcode();
+    HasLEAPotential = UserOpc == ISD::ADD || UserOpc == ISD::SHL;
+    if (HasLEAPotential)
+      break;
+  }
+  if (!HasLEAPotential)
+    return SDValue();
+
+  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'. The
+  // constant is widened with the same kind of extension as the other operand.
+  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
+  SDValue AddOp0 = Add.getOperand(0);
+  SDValue NewExt = DAG.getNode(ExtOpc, SDLoc(Ext), VT, AddOp0);
+  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
+
+  // Carry the original no-wrap flags over to the wider add.
+  SDNodeFlags Flags;
+  Flags.setNoSignedWrap(NSW);
+  Flags.setNoUnsignedWrap(NUW);
+  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
+}
+
+/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
+/// (i8,i32 ({s/u}divrem_{s/z}ext_hreg (i8 x, i8 y)))
+/// This exposes the {s/z}ext to the divrem lowering, so that it directly
+/// extends from AH (which we otherwise need to do contortions to access).
+/// Applies only when result 1 of an i8 divrem is extended to i32; the users
+/// of result 0 of the old node are redirected to the new node.
+static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
+  SDValue DivRem = N->getOperand(0);
+  auto ExtOpc = N->getOpcode();
+  auto DivRemOpc = DivRem.getOpcode();
+
+  // The signedness of the extension must match that of the divrem.
+  const bool IsSigned =
+      ExtOpc == ISD::SIGN_EXTEND && DivRemOpc == ISD::SDIVREM;
+  const bool IsUnsigned =
+      ExtOpc == ISD::ZERO_EXTEND && DivRemOpc == ISD::UDIVREM;
+  if (!IsSigned && !IsUnsigned)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  EVT InVT = DivRem.getValueType();
+  if (DivRem.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
+    return SDValue();
+
+  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+  auto HRegOpc = DivRemOpc == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
+                                           : X86ISD::UDIVREM8_ZEXT_HREG;
+  SDValue R = DAG.getNode(HRegOpc, SDLoc(N), NodeTys, DivRem.getOperand(0),
+                          DivRem.getOperand(1));
+
+  // Redirect users of the old node's result 0, then hand back result 1 of
+  // the new node as the replacement for the extension.
+  DAG.ReplaceAllUsesOfValueWith(DivRem.getValue(0), R.getValue(0));
+  return R.getValue(1);
+}
+
+/// Convert a vector SIGN_EXTEND or ZERO_EXTEND into SIGN_EXTEND_VECTOR_INREG /
+/// ZERO_EXTEND_VECTOR_INREG nodes. The input is split, or concatenated with
+/// UNDEFs, into vectors of the same size as the target type, whose lowest
+/// elements are then extended. Runs only before operation legalization and
+/// only with SSE2.
+static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const X86Subtarget &Subtarget) {
+  const unsigned Opcode = N->getOpcode();
+  const bool IsSigned = Opcode == ISD::SIGN_EXTEND;
+  if (!IsSigned && Opcode != ISD::ZERO_EXTEND)
+    return SDValue();
+  if (!DCI.isBeforeLegalizeOps() || !Subtarget.hasSSE2())
+    return SDValue();
+
+  SDValue Src = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  EVT InVT = Src.getValueType();
+  EVT InSVT = InVT.getScalarType();
+
+  // The result must be a vector, extending i8/i16/i32 elements to
+  // i16/i32/i64 elements.
+  if (!VT.isVector())
+    return SDValue();
+  const bool DstEltOK =
+      SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16;
+  if (!DstEltOK)
+    return SDValue();
+  const bool SrcEltOK =
+      InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8;
+  if (!SrcEltOK)
+    return SDValue();
+
+  // On AVX2+ targets, if the input/output types are both legal then
+  // SIGN_EXTEND/ZERO_EXTEND can be used directly.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (Subtarget.hasInt256() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(InVT))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Widen Vec to Bits bits by concatenating it with UNDEF vectors of its own
+  // type; Vec becomes the lowest part of the result.
+  auto PadWithUndef = [&DAG](const SDLoc &Loc, SDValue Vec, unsigned Bits) {
+    EVT VecVT = Vec.getValueType();
+    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), VecVT.getScalarType(),
+                                  Bits / VecVT.getScalarSizeInBits());
+    SmallVector<SDValue, 8> Parts(Bits / VecVT.getSizeInBits(),
+                                  DAG.getUNDEF(VecVT));
+    Parts[0] = Vec;
+    return DAG.getNode(ISD::CONCAT_VECTORS, Loc, WideVT, Parts);
+  };
+
+  // Emit the *_EXTEND_VECTOR_INREG node matching the original opcode.
+  auto ExtendInReg = [&](SDValue Vec, EVT ResVT) {
+    return IsSigned ? DAG.getSignExtendVectorInReg(Vec, DL, ResVT)
+                    : DAG.getZeroExtendVectorInReg(Vec, DL, ResVT);
+  };
+
+  // If the target size is below 128 bits, widen the input so that extending
+  // it yields 128 bits, extend that, and extract the original target vector.
+  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
+    unsigned Scale = 128 / VT.getSizeInBits();
+    EVT ExVT =
+        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
+    SDValue Padded = PadWithUndef(DL, Src, Scale * InVT.getSizeInBits());
+    SDValue Extended = DAG.getNode(Opcode, DL, ExVT, Padded);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Extended,
+                       DAG.getIntPtrConstant(0, DL));
+  }
+
+  // If the target size is 128 bits (or 256 bits on an AVX2 target, or 512
+  // bits with AVX512), convert to ISD::*_EXTEND_VECTOR_INREG, which ensures
+  // lowering to X86ISD::V*EXT. Also do this without SSE41 so that the
+  // legalizer can do its job.
+  const bool SingleNode =
+      !Subtarget.hasSSE41() || VT.is128BitVector() ||
+      (VT.is256BitVector() && Subtarget.hasInt256()) ||
+      (VT.is512BitVector() && Subtarget.hasAVX512());
+  if (SingleNode) {
+    SDValue Padded = PadWithUndef(DL, Src, VT.getSizeInBits());
+    return ExtendInReg(Padded, VT);
+  }
+
+  // Split the result into SplitSize-bit pieces, extend each piece in
+  // register, and concatenate the pieces again.
+  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
+    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
+    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
+    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
+    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
+
+    SmallVector<SDValue, 8> Pieces;
+    unsigned Offset = 0;
+    for (unsigned i = 0; i != NumVecs; ++i) {
+      SDValue Piece = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, Src,
+                                  DAG.getIntPtrConstant(Offset, DL));
+      Piece = PadWithUndef(DL, Piece, SplitSize);
+      Piece = ExtendInReg(Piece, SubVT);
+      Pieces.push_back(Piece);
+      Offset += NumSubElts;
+    }
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Pieces);
+  };
+
+  // On pre-AVX2 targets, split into 128-bit nodes of
+  // ISD::*_EXTEND_VECTOR_INREG.
+  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
+    return SplitAndExtendInReg(128);
+
+  // On pre-AVX512 targets, split into 256-bit nodes of
+  // ISD::*_EXTEND_VECTOR_INREG.
+  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
+    return SplitAndExtendInReg(256);
+
+  return SDValue();
+}
+
+/// Target-specific DAG combines for SIGN_EXTEND nodes. The divrem8 fold is
+/// tried at every stage; once operations are legalized, the only other fold
+/// is turning a sext from i1 into a select between all-ones and zero.
+static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           const X86Subtarget &Subtarget) {
+  SDValue Src = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT InVT = Src.getValueType();
+  SDLoc DL(N);
+
+  SDValue Res = getDivRem8(N, DAG);
+  if (Res)
+    return Res;
+
+  if (!DCI.isBeforeLegalizeOps()) {
+    if (InVT != MVT::i1)
+      return SDValue();
+
+    // sext i1 X --> select X, -1, 0
+    SDValue Zero = DAG.getConstant(0, DL, VT);
+    SDValue AllOnes =
+        DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+    return DAG.getNode(ISD::SELECT, DL, VT, Src, AllOnes, Zero);
+  }
+
+  Res = combineToExtendVectorInReg(N, DAG, DCI, Subtarget);
+  if (Res)
+    return Res;
+
+  // Mask-arithmetic widening is only tried for 256-bit vectors with AVX.
+  if (Subtarget.hasAVX() && VT.is256BitVector()) {
+    Res = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+    if (Res)
+      return Res;
+  }
+
+  // Last resort; yields an empty SDValue when it does not apply.
+  return promoteExtBeforeAdd(N, DAG, Subtarget);
+}
+
+/// Fold negations of the operands of an FMA-style node into the opcode,
+/// selecting among the FMADD/FMSUB/FNMADD/FNMSUB variants of the same family
+/// (plain, _RND, S1_RND or S3_RND) as the original node.
+static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
+                          const X86Subtarget &Subtarget) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  EVT ScalarVT = VT.getScalarType();
+  const bool IsF32OrF64 = ScalarVT == MVT::f32 || ScalarVT == MVT::f64;
+  if (!IsF32OrF64 || !Subtarget.hasAnyFMA())
+    return SDValue();
+
+  SDValue A = N->getOperand(0);
+  SDValue B = N->getOperand(1);
+  SDValue C = N->getOperand(2);
+
+  // If V matches isFNEG, replace it with the value isFNEG returned and
+  // report that a negation was stripped.
+  auto StripNegation = [](SDValue &V) {
+    SDValue NegVal = isFNEG(V.getNode());
+    if (!NegVal)
+      return false;
+    V = NegVal;
+    return true;
+  };
+
+  // Do not convert the passthru input of scalar intrinsics.
+  // FIXME: We could allow negations of the lower element only.
+  const unsigned OrigOpc = N->getOpcode();
+  bool NegA = OrigOpc != X86ISD::FMADDS1_RND && StripNegation(A);
+  bool NegB = StripNegation(B);
+  bool NegC = OrigOpc != X86ISD::FMADDS3_RND && StripNegation(C);
+
+  // The product is negated when exactly one of A and B was negated.
+  bool NegMul = (NegA != NegB);
+
+  // Choose one opcode out of a family according to NegMul and NegC.
+  auto Pick = [&](unsigned MAdd, unsigned MSub, unsigned NMAdd,
+                  unsigned NMSub) {
+    if (NegMul)
+      return NegC ? NMSub : NMAdd;
+    return NegC ? MSub : MAdd;
+  };
+
+  unsigned NewOpcode;
+  switch (OrigOpc) {
+  case X86ISD::FMADD_RND:
+    NewOpcode = Pick(X86ISD::FMADD_RND, X86ISD::FMSUB_RND,
+                     X86ISD::FNMADD_RND, X86ISD::FNMSUB_RND);
+    break;
+  case X86ISD::FMADDS1_RND:
+    NewOpcode = Pick(X86ISD::FMADDS1_RND, X86ISD::FMSUBS1_RND,
+                     X86ISD::FNMADDS1_RND, X86ISD::FNMSUBS1_RND);
+    break;
+  case X86ISD::FMADDS3_RND:
+    NewOpcode = Pick(X86ISD::FMADDS3_RND, X86ISD::FMSUBS3_RND,
+                     X86ISD::FNMADDS3_RND, X86ISD::FNMSUBS3_RND);
+    break;
+  default:
+    // The plain forms have only the three arithmetic operands.
+    assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
+           "Unexpected opcode!");
+    NewOpcode = Pick(X86ISD::FMADD, X86ISD::FMSUB, X86ISD::FNMADD,
+                     X86ISD::FNMSUB);
+    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+  }
+
+  // The _RND forms forward their fourth operand unchanged.
+  return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+}
+
+/// Target-specific DAG combines for ZERO_EXTEND nodes.
+static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           const X86Subtarget &Subtarget) {
+  SDLoc dl(N);
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // Rebuild a SETCC_CARRY node directly in the result type and mask it to
+  // its low bit.
+  auto WidenSetCCCarry = [&](SDValue Carry) {
+    SDValue Wide = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+                               Carry.getOperand(0), Carry.getOperand(1));
+    return DAG.getNode(ISD::AND, dl, VT, Wide, DAG.getConstant(1, dl, VT));
+  };
+
+  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
+  //   (and (i32 x86isd::setcc_carry), 1)
+  // and likewise for (zext (truncate (x86isd::setcc_carry))).
+  // This eliminates the zext. This transformation is necessary because
+  // ISD::SETCC is always legalized to i8.
+  const unsigned SrcOpc = N0.getOpcode();
+  if ((SrcOpc == ISD::AND || SrcOpc == ISD::TRUNCATE) && N0.hasOneUse() &&
+      N0.getOperand(0).hasOneUse()) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+      // For the AND form the mask must be exactly 1; otherwise give up on
+      // this node entirely.
+      if (SrcOpc == ISD::AND && !isOneConstant(N0.getOperand(1)))
+        return SDValue();
+      return WidenSetCCCarry(N00);
+    }
+  }
+
+  SDValue Res = combineToExtendVectorInReg(N, DAG, DCI, Subtarget);
+  if (Res)
+    return Res;
+
+  if (VT.is256BitVector()) {
+    Res = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+    if (Res)
+      return Res;
+  }
+
+  Res = getDivRem8(N, DAG);
+  if (Res)
+    return Res;
+
+  Res = promoteExtBeforeAdd(N, DAG, Subtarget);
+  if (Res)
+    return Res;
+
+  // Last resort; yields an empty SDValue when it does not apply.
+  return combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget);
+}
+
+/// Do target-specific dag combines on ISD::SETCC nodes:
+///  - x == -y --> x+y == 0 and x != -y --> x+y != 0, with the negation
+///    (a single-use 'sub 0, y') on either side of the compare;
+///  - an i1-element compare of (sext i1 X) against an all-zeros build vector
+///    folds to a constant, to X, or to (not X);
+///  - on SSE1-only targets, a v4f32 compare producing v4i32 is lowered to
+///    X86ISD::CMPP early.
+static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
+                            const X86Subtarget &Subtarget) {
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  const bool IsEquality = CC == ISD::SETNE || CC == ISD::SETEQ;
+
+  // (0 - y) ==/!= x --> (x + y) ==/!= 0
+  if (IsEquality && LHS.getOpcode() == ISD::SUB)
+    if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
+      SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
+                                 LHS.getOperand(1));
+      return DAG.getSetCC(DL, VT, addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }
+  // x ==/!= (0 - y) --> (x + y) ==/!= 0
+  if (IsEquality && RHS.getOpcode() == ISD::SUB)
+    if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
+      SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
+                                 RHS.getOperand(1));
+      return DAG.getSetCC(DL, VT, addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }
+
+  if (VT.getScalarType() == MVT::i1 &&
+      (IsEquality || ISD::isSignedIntSetCC(CC))) {
+    // True if V is a sign extension from a type with i1 elements.
+    auto IsSExtFromI1 = [](SDValue V) {
+      return (V.getOpcode() == ISD::SIGN_EXTEND) &&
+             (V.getOperand(0).getValueType().getScalarType() == MVT::i1);
+    };
+
+    bool IsSEXT0 = IsSExtFromI1(LHS);
+    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+    if (!IsSEXT0 || !IsVZero1) {
+      // Swap the operands and update the condition code, then look for the
+      // same pattern the other way round.
+      std::swap(LHS, RHS);
+      CC = ISD::getSetCCSwappedOperands(CC);
+
+      IsSEXT0 = IsSExtFromI1(LHS);
+      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+    }
+
+    if (IsSEXT0 && IsVZero1) {
+      assert(VT == LHS.getOperand(0).getValueType() &&
+             "Unexpected operand type");
+      // Each lane of (sext i1 X) is 0 or -1, so a signed compare against 0
+      // is either constant or depends only on X.
+      if (CC == ISD::SETGT)
+        return DAG.getConstant(0, DL, VT);
+      if (CC == ISD::SETLE)
+        return DAG.getConstant(1, DL, VT);
+      if (CC == ISD::SETEQ || CC == ISD::SETGE)
+        return DAG.getNOT(DL, LHS.getOperand(0), VT);
+
+      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+             "Unexpected condition code!");
+      return LHS.getOperand(0);
+    }
+  }
+
+  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
+  // to avoid scalarization via legalization because v4i32 is not a legal type.
+  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
+      LHS.getValueType() == MVT::v4f32)
+    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
+
+  return SDValue();
+}
+
+/// Strip a SIGN_EXTEND_INREG from the mask operand (operand 2) of a gather or
+/// scatter node. The node is updated in place and an empty SDValue is always
+/// returned.
+static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  // Gather and Scatter instructions use k-registers for masks. The type of
+  // the masks is v*i1, so the mask will be truncated anyway and the
+  // SIGN_EXTEND_INREG may be dropped.
+  SDValue Mask = N->getOperand(2);
+  if (Mask.getOpcode() != ISD::SIGN_EXTEND_INREG)
+    return SDValue();
+
+  SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+  NewOps[2] = Mask.getOperand(0);
+  DAG.UpdateNodeOperands(N, NewOps);
+  return SDValue();
+}
+
+// Helper function of combineX86SetCC. It materializes "setb reg" as
+// "sbb reg,reg" (an i8 X86ISD::SETCC_CARRY on COND_B), since that can be
+// extended without a zext and produces an all-ones bit pattern, which is
+// more useful than 0/1 in some cases.
+//
+// VT must be i8 or i1: for i8 the carry value is masked with 1, for i1 it is
+// truncated.
+static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
+                               SelectionDAG &DAG, MVT VT) {
+  assert((VT == MVT::i8 || VT == MVT::i1) && "Unexpected type for SETCC node");
+
+  // Both result types start from the same i8 SETCC_CARRY node.
+  SDValue Carry = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                              DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                              EFLAGS);
+
+  if (VT == MVT::i8)
+    return DAG.getNode(ISD::AND, DL, VT, Carry, DAG.getConstant(1, DL, VT));
+
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Carry);
+}
+
+// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
+                               TargetLowering::DAGCombinerInfo &DCI,
+                               const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+  SDValue EFLAGS = N->getOperand(1);
+  MVT ResVT = N->getSimpleValueType(0);
+
+  // Try to convert COND_A into COND_B in an attempt to facilitate
+  // materializing "setb reg": the flags must come from a single-use integer
+  // X86ISD::SUB, whose operands are swapped.
+  //
+  // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
+  // cannot take an immediate as its first operand.
+  if (CC == X86::COND_A && EFLAGS.getOpcode() == X86ISD::SUB &&
+      EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() &&
+      !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+    SDValue SwappedSub =
+        DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+    SDValue SwappedFlags = SDValue(SwappedSub.getNode(), EFLAGS.getResNo());
+    return MaterializeSETB(DL, SwappedFlags, DAG, ResVT);
+  }
+
+  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+  // a zext and produces an all-ones bit which is more useful than 0/1 in some
+  // cases.
+  if (CC == X86::COND_B)
+    return MaterializeSETB(DL, EFLAGS, DAG, ResVT);
+
+  // Try to simplify the EFLAGS and condition code operands.
+  SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG);
+  if (!Flags)
+    return SDValue();
+  return getSETCC(CC, Flags, DL, DAG);
+}
+
+/// Optimize branch condition evaluation by simplifying the EFLAGS and
+/// condition code operands of a branch node.
+static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  SDValue EFLAGS = N->getOperand(3);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+  // Try to simplify the EFLAGS and condition code operands.
+  SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG);
+  if (!Flags)
+    return SDValue();
+
+  // Operands 0 and 1 are only read from N at this point, not cached earlier,
+  // because combineSetCCEFLAGS can RAUW them under us.
+  SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
+  return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
+                     N->getOperand(1), Cond, Flags);
+}
+
+/// Take advantage of vector comparisons producing 0 or -1 in each lane to
+/// apply a unary operation to a constant instead of to a masked value:
+///   UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+///     AND(VECTOR_CMP(x,y), constant2),  constant2 = UNARYOP(constant)
+static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
+                                                  SelectionDAG &DAG) {
+  // Bail out unless this is a vector operation on a bitwise AND of a SETCC,
+  // with input and output of the same total size.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue And = N->getOperand(0);
+  if (And->getOpcode() != ISD::AND ||
+      And->getOperand(0)->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != And->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // The other operand of the AND must be a constant build vector. The
+  // transformation could be done for non-constant splats as well, but it's
+  // unclear that would be a benefit: it would not eliminate any operations,
+  // just perform one more step in scalar code before moving to the vector
+  // unit.
+  auto *BV = dyn_cast<BuildVectorSDNode>(And->getOperand(1));
+  if (!BV || !BV->isConstant())
+    return SDValue();
+
+  // Everything checks out. Apply the unary operation to the constant.
+  SDLoc DL(N);
+  EVT IntVT = BV->getValueType(0);
+  SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+
+  // The AND stays in the integer vector type, so bitcast the new constant
+  // into it and bitcast the result back out.
+  SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
+  SDValue NewAnd =
+      DAG.getNode(ISD::AND, DL, IntVT, And->getOperand(0), MaskConst);
+  return DAG.getBitcast(VT, NewAnd);
+}
+
+/// Target-specific DAG combines for UINT_TO_FP nodes.
+static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  SDValue Src = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT InVT = Src.getValueType();
+  EVT InSVT = InVT.getScalarType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // UINT_TO_FP(vXi8)  -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
+  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
+  // UINT_TO_FP is kept instead when it is legal for the vXi32 type.
+  const bool IsNarrowElt = InSVT == MVT::i8 || InSVT == MVT::i16;
+  if (InVT.isVector() && IsNarrowElt) {
+    SDLoc dl(N);
+    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                 InVT.getVectorNumElements());
+    SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
+
+    const unsigned ConvOpc = TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)
+                                 ? ISD::UINT_TO_FP
+                                 : ISD::SINT_TO_FP;
+    return DAG.getNode(ConvOpc, dl, VT, Widened);
+  }
+
+  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+  // the optimization here.
+  if (!DAG.SignBitIsZero(Src))
+    return SDValue();
+  return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Src);
+}
+
+/// Target-specific DAG combines for SINT_TO_FP nodes.
+static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  SDValue Folded = combineVectorCompareAndMaskUnaryOp(N, DAG);
+  if (Folded)
+    return Folded;
+
+  // Now move on to more general possibilities.
+  SDValue Src = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT InVT = Src.getValueType();
+  EVT InSVT = InVT.getScalarType();
+
+  // SINT_TO_FP(vXi1)  -> SINT_TO_FP(SEXT(vXi1 to vXi32))
+  // SINT_TO_FP(vXi8)  -> SINT_TO_FP(SEXT(vXi8 to vXi32))
+  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
+  // The vXi1 case only applies when that type is not legal.
+  if (InVT.isVector() &&
+      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
+       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
+    SDLoc dl(N);
+    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                 InVT.getVectorNumElements());
+    SDValue Widened = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
+    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Widened);
+  }
+
+  // Without AVX512DQ we only support i64 to float scalar conversion. For both
+  // vectors and scalars, see if we know that the upper bits are all the sign
+  // bit, in which case we can truncate the input to i32 and convert from that.
+  const unsigned BitWidth = InVT.getScalarSizeInBits();
+  if (BitWidth > 32 && !Subtarget.hasDQI()) {
+    unsigned NumSignBits = DAG.ComputeNumSignBits(Src);
+    if (NumSignBits >= (BitWidth - 31)) {
+      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
+      if (InVT.isVector())
+        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
+                                   InVT.getVectorNumElements());
+      SDLoc dl(N);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Src);
+      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
+    }
+  }
+
+  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+  // a 32-bit target where SSE doesn't support i64->FP operations. This only
+  // applies to loaded inputs and never with soft-float.
+  if (Subtarget.useSoftFloat() || Src.getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  LoadSDNode *Ld = cast<LoadSDNode>(Src.getNode());
+  EVT LdVT = Ld->getValueType(0);
+
+  // This transformation is not supported if the result type is f16 or f128.
+  if (VT == MVT::f16 || VT == MVT::f128)
+    return SDValue();
+
+  const bool CanUseFILD = !Ld->isVolatile() && !VT.isVector() &&
+                          ISD::isNON_EXTLoad(Src.getNode()) &&
+                          Src.hasOneUse() && !Subtarget.is64Bit() &&
+                          LdVT == MVT::i64;
+  if (!CanUseFILD)
+    return SDValue();
+
+  SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
+      SDValue(N, 0), LdVT, Ld->getChain(), Src, DAG);
+  // Redirect users of result 1 of the load to result 1 of the FILD.
+  DAG.ReplaceAllUsesOfValueWith(Src.getValue(1), FILDChain.getValue(1));
+  return FILDChain;
+}
+
+/// Simplify `RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS` when both addends are
+/// zero and nobody reads the flag result.
+///
+/// In that situation the sum cannot overflow and is simply the incoming carry
+/// (0 or 1), so the node is replaced by SETCC_CARRY(COND_B) & 1 together with
+/// a constant-zero flag value.
+///
+/// \returns the result of DCI.CombineTo on success, otherwise an empty SDValue.
+static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
+                          X86TargetLowering::DAGCombinerInfo &DCI) {
+  const bool BothAddendsZero =
+      X86::isZeroNode(N->getOperand(0)) && X86::isZeroNode(N->getOperand(1));
+  if (!BothAddendsZero)
+    return SDValue();
+
+  // There is no good way to rewrite a user of the EFLAGS result, so only
+  // proceed while that result is dead.
+  if (!SDValue(N, 1).use_empty())
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT ResVT = N->getValueType(0);
+
+  // Replacement for the (unused) flag output.
+  SDValue ZeroFlags = DAG.getConstant(0, DL, N->getValueType(1));
+
+  // "Set on carry": materialise the incoming carry and keep only bit 0.
+  SDValue CondB = DAG.getConstant(X86::COND_B, DL, MVT::i8);
+  SDValue CarryMask =
+      DAG.getNode(X86ISD::SETCC_CARRY, DL, ResVT, CondB, N->getOperand(2));
+  SDValue One = DAG.getConstant(1, DL, ResVT);
+  SDValue CarryBit = DAG.getNode(ISD::AND, DL, ResVT, CarryMask, One);
+
+  return DCI.CombineTo(N, CarryBit, ZeroFlags);
+}
+
+/// Fold a conditional increment/decrement into ADC/SBB.
+///
+/// N is an ADD or SUB. One fixed operand (operand 0 for ADD, operand 1 for
+/// SUB) must be a single-use zext of a single-use X86ISD::SETCC with
+/// condition E or NE whose flags come from a single-use X86ISD::CMP of an
+/// integer against zero. The compare is re-issued against 1 and N becomes:
+///   ADD, sete  -> adc Other, 0        ADD, setne -> sbb Other, -1
+///   SUB, sete  -> sbb Other, 0        SUB, setne -> adc Other, -1
+///
+/// \returns the new ADC/SBB node, or an empty SDValue if the pattern is absent.
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+  const bool IsSub = N->getOpcode() == ISD::SUB;
+  SDLoc DL(N);
+
+  // Look through the zero extension.
+  SDValue Ext = N->getOperand(IsSub ? 1 : 0);
+  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
+    return SDValue();
+
+  SDValue SetCC = Ext.getOperand(0);
+  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+    return SDValue();
+
+  // Only equality / inequality conditions are handled.
+  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+  const bool IsNE = CC == X86::COND_NE;
+  if (!IsNE && CC != X86::COND_E)
+    return SDValue();
+
+  // The flags must come from a single-use integer compare against zero.
+  SDValue Cmp = SetCC.getOperand(1);
+  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse())
+    return SDValue();
+  if (!X86::isZeroNode(Cmp.getOperand(1)))
+    return SDValue();
+  SDValue Tested = Cmp.getOperand(0);
+  if (!Tested.getValueType().isInteger())
+    return SDValue();
+
+  // Compare against 1 instead of 0; the ADC/SBB consumes the resulting flags.
+  SDValue NewCmp =
+      DAG.getNode(X86ISD::CMP, DL, MVT::i32, Tested,
+                  DAG.getConstant(1, DL, Tested.getValueType()));
+
+  SDValue OtherVal = N->getOperand(IsSub ? 0 : 1);
+  EVT OtherVT = OtherVal.getValueType();
+
+  unsigned NewOpc;
+  SDValue Imm;
+  if (IsNE) {
+    NewOpc = IsSub ? X86ISD::ADC : X86ISD::SBB;
+    Imm = DAG.getConstant(-1ULL, DL, OtherVT);
+  } else {
+    NewOpc = IsSub ? X86ISD::SBB : X86ISD::ADC;
+    Imm = DAG.getConstant(0, DL, OtherVT);
+  }
+  return DAG.getNode(NewOpc, DL, OtherVT, OtherVal, Imm, NewCmp);
+}
+
+/// Try to turn a vector-reduction ADD into a PSADBW-based sum of absolute
+/// differences.
+///
+/// N is expected to be an ADD whose result is a simple vector of i32 and where
+/// one operand is an ISD::VSELECT accepted by detectZextAbsDiff; the other
+/// operand is treated as the running reduction value ("Phi").
+///
+/// \returns the replacement for N, or an empty SDValue if the pattern is not
+/// matched or the vector is too wide for the available registers.
+static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // TODO: There's nothing special about i32, any integer type above i16 should
+ // work just as well.
+ if (!VT.isVector() || !VT.isSimple() ||
+ !(VT.getVectorElementType() == MVT::i32))
+ return SDValue();
+
+ // Widest vector register assumed available: 512 bits with BWI, 256 with
+ // AVX2, otherwise 128.
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+
+ // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ // TODO: We should be able to handle larger vectors by splitting them before
+ // feeding them into several SADs, and then reducing over those.
+ if (VT.getSizeInBits() / 4 > RegSize)
+ return SDValue();
+
+ // We know N is a reduction add, which means one of its operands is a phi.
+ // To match SAD, we need the other operand to be a vector select.
+ SDValue SelectOp, Phi;
+ if (Op0.getOpcode() == ISD::VSELECT) {
+ SelectOp = Op0;
+ Phi = Op1;
+ } else if (Op1.getOpcode() == ISD::VSELECT) {
+ SelectOp = Op1;
+ Phi = Op0;
+ } else
+ return SDValue();
+
+ // Check whether we have an abs-diff pattern feeding into the select.
+ // NOTE(review): Op0/Op1 are presumably overwritten by detectZextAbsDiff with
+ // the two abs-diff inputs (they are fed to createPSADBW below) - confirm
+ // against its definition.
+ if(!detectZextAbsDiff(SelectOp, Op0, Op1))
+ return SDValue();
+
+ // SAD pattern detected. Now build a SAD instruction and an addition for
+ // reduction. Note that the number of elements of the result of SAD is less
+ // than the number of elements of its input. Therefore, we could only update
+ // part of elements in the reduction vector.
+ SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
+
+ // The output of PSADBW is a vector of i64.
+ // We need to turn the vector of i64 into a vector of i32.
+ // If the reduction vector is at least as wide as the psadbw result, just
+ // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
+ // anyway.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
+ if (VT.getSizeInBits() >= ResVT.getSizeInBits())
+ Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
+ else
+ Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
+
+ if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
+ // Update part of elements of the reduction vector. This is done by first
+ // extracting a sub-vector from it, updating this sub-vector, and inserting
+ // it back.
+ SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
+ DAG.getIntPtrConstant(0, DL));
+ } else
+ // Same width (or Sad was truncated to VT): add directly.
+ return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
+}
+
+/// DAG combine for ISD::ADD.
+///
+/// Tries, in order: the loop SAD pattern (only for adds flagged as vector
+/// reductions), forming an X86ISD::HADD, and finally the conditional
+/// increment/decrement fold.
+static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
+                          const X86Subtarget &Subtarget) {
+  // Reduction adds may be the tail of a sum-of-absolute-differences loop.
+  if (cast<BinaryWithFlagsSDNode>(N)->Flags.hasVectorReduction())
+    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
+      return Sad;
+
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Horizontal adds exist for 128-bit vectors with SSSE3 and for 256-bit
+  // vectors with Int256, in i16 and i32 element widths.
+  const bool Has128BitHAdd =
+      Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32);
+  const bool Has256BitHAdd =
+      Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32);
+
+  // Try to synthesize a horizontal add from an add of shuffles.
+  if ((Has128BitHAdd || Has256BitHAdd) && isHorizontalBinOp(LHS, RHS, true))
+    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, LHS, RHS);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+/// DAG combine for ISD::SUB.
+///
+/// Tries, in order:
+///   1. (sub C, (xor X, C2)) with a single-use xor -> (add (xor X, ~C2), C+1),
+///      because x86 cannot encode an immediate as the LHS of a sub.
+///   2. Forming an X86ISD::HSUB from a sub of shuffles.
+///   3. The conditional increment/decrement fold.
+///
+/// \returns the replacement value, or an empty SDValue when nothing applies.
+static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
+                          const X86Subtarget &Subtarget) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // X86 can't encode an immediate LHS of a sub. See if we can push the
+  // negation into a preceding instruction.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+    // If the RHS of the sub is a XOR with one use and a constant, invert the
+    // immediate. Then add one to the LHS of the sub so we can turn
+    // X-Y -> X+~Y+1, saving one register.
+    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+        isa<ConstantSDNode>(Op1.getOperand(1))) {
+      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+      EVT VT = Op0.getValueType();
+      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
+                                   Op1.getOperand(0),
+                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
+      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
+                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
+    }
+  }
+
+  // Try to synthesize horizontal subs from subs of shuffles.
+  //
+  // Unlike ADD, SUB is not commutative: matching the shuffle pair with its
+  // operands swapped would yield an HSUB that computes the negated result.
+  // The matcher must therefore not be allowed to treat the operands as
+  // interchangeable, hence `false` here (combineAdd passes `true`).
+  // NOTE(review): the last argument is understood to be isHorizontalBinOp's
+  // "operation is commutative" flag - confirm against its definition.
+  EVT VT = N->getValueType(0);
+  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, false))
+    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+/// DAG combine for X86ISD::VZEXT / X86ISD::VSEXT nodes.
+///
+/// Handles three cases: constant folding of a build-vector-of-constants
+/// operand (both opcodes), collapsing a vzext of a bitcast of a vzext, and
+/// (vzext only) bypassing an extract-element-0 / scalar_to_vector round trip.
+///
+/// \returns the replacement value, or an empty SDValue when nothing applies.
+static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ unsigned Opcode = N->getOpcode();
+ MVT VT = N->getSimpleValueType(0);
+ MVT SVT = VT.getVectorElementType();
+ SDValue Op = N->getOperand(0);
+ MVT OpVT = Op.getSimpleValueType();
+ MVT OpEltVT = OpVT.getVectorElementType();
+ // Source element width times the number of result elements.
+ unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+
+ // Perform any constant folding.
+ // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ unsigned NumDstElts = VT.getVectorNumElements();
+ SmallBitVector Undefs(NumDstElts, false);
+ SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
+ // Only the first NumDstElts source operands are read.
+ for (unsigned i = 0; i != NumDstElts; ++i) {
+ SDValue OpElt = Op.getOperand(i);
+ if (OpElt.getOpcode() == ISD::UNDEF) {
+ Undefs[i] = true;
+ continue;
+ }
+ APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
+ Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
+ : Cst.sextOrTrunc(SVT.getSizeInBits());
+ }
+ return getConstVector(Vals, Undefs, VT, DAG, DL);
+ }
+
+ // (vzext (bitcast (vzext (x)) -> (vzext x)
+ // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
+ SDValue V = peekThroughBitcasts(Op);
+ if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
+ MVT InnerVT = V.getSimpleValueType();
+ MVT InnerEltVT = InnerVT.getVectorElementType();
+
+ // If the element sizes match exactly, we can just do one larger vzext. This
+ // is always an exact type match as vzext operates on integer types.
+ if (OpEltVT == InnerEltVT) {
+ assert(OpVT == InnerVT && "Types must match for vzext!");
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
+ }
+
+ // The only other way we can combine them is if only a single element of the
+ // inner vzext is used in the input to the outer vzext.
+ if (InnerEltVT.getSizeInBits() < InputBits)
+ return SDValue();
+
+ // In this case, the inner vzext is completely dead because we're going to
+ // only look at bits inside of the low element. Just do the outer vzext on
+ // a bitcast of the input to the inner.
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
+ }
+
+ // Check if we can bypass extracting and re-inserting an element of an input
+ // vector. Essentially:
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+ // TODO: Add X86ISD::VSEXT support
+ if (Opcode == X86ISD::VZEXT &&
+ V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
+ SDValue ExtractedV = V.getOperand(0);
+ SDValue OrigV = ExtractedV.getOperand(0);
+ // Only element 0 of the original vector can be forwarded this way.
+ if (isNullConstant(ExtractedV.getOperand(1))) {
+ MVT OrigVT = OrigV.getSimpleValueType();
+ // Extract a subvector if necessary...
+ if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
+ int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
+ OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
+ OrigVT.getVectorNumElements() / Ratio);
+ OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ Op = DAG.getBitcast(OpVT, OrigV);
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
+ }
+ }
+
+ return SDValue();
+}
+
+/// Rewrite (LSUB p, 1) into its canonical form (LADD p, -1).
+///
+/// Operand 0 of N is the chain, operand 1 the address and operand 2 the
+/// amount. Any amount other than the constant 1 is left untouched.
+static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget) {
+  SDValue Amount = N->getOperand(2);
+  MVT AmountVT = Amount.getSimpleValueType();
+
+  auto *AmountC = dyn_cast<ConstantSDNode>(Amount);
+  const bool SubtractsOne = AmountC && AmountC->getZExtValue() == 1;
+  if (!SubtractsOne)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue MinusOne = DAG.getConstant(-1, DL, AmountVT);
+  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+
+  // Same chain and address, negated amount, and the original memory operand.
+  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
+                                 DAG.getVTList(MVT::i32, MVT::Other),
+                                 {N->getOperand(0), N->getOperand(1), MinusOne},
+                                 AmountVT, MMO);
+}
+
+/// Fold TESTM (AND a, b), (AND a, b) -> TESTM a, b.
+///
+/// Applies only when both operands of N are the very same AND value.
+static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  const bool SameAndOnBothSides = LHS == RHS && RHS->getOpcode() == ISD::AND;
+  if (!SameAndOnBothSides)
+    return SDValue();
+
+  // Test the AND's inputs directly instead of its result.
+  return DAG.getNode(X86ISD::TESTM, SDLoc(N), N->getValueType(0),
+                     LHS->getOperand(0), LHS->getOperand(1));
+}
+
+/// Fold a vector compare of a value against itself:
+/// PCMPEQ x, x becomes the all-ones vector and PCMPGT x, x the zero vector.
+static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  MVT VT = N->getSimpleValueType(0);
+  SDLoc DL(N);
+
+  // Nothing to fold unless both operands are identical.
+  if (N->getOperand(0) != N->getOperand(1))
+    return SDValue();
+
+  switch (N->getOpcode()) {
+  case X86ISD::PCMPEQ:
+    return getOnesVector(VT, Subtarget, DAG, DL);
+  case X86ISD::PCMPGT:
+    return getZeroVector(VT, Subtarget, DAG, DL);
+  default:
+    return SDValue();
+  }
+}
+
+
+/// Dispatch a DAG-combine request to the helper that handles N's opcode.
+///
+/// Each listed opcode (generic ISD or X86ISD) is forwarded to one of the
+/// file-local combine* functions; several opcodes may share a helper.
+///
+/// \returns whatever the selected helper returns, or an empty SDValue for
+/// opcodes that are not listed in the switch.
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ case ISD::VSELECT:
+ case ISD::SELECT:
+ case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
+ case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
+ case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
+ case ISD::ADD: return combineAdd(N, DAG, Subtarget);
+ case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case X86ISD::ADC: return combineADC(N, DAG, DCI);
+ case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
+ case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
+ case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
+ case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
+ case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
+ case ISD::STORE: return combineStore(N, DAG, Subtarget);
+ case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
+ case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
+ case ISD::FADD:
+ case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
+ case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
+ case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
+ case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
+ case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
+ case X86ISD::FXOR:
+ case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
+ case X86ISD::FMIN:
+ case X86ISD::FMAX: return combineFMinFMax(N, DAG);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
+ case X86ISD::BT: return combineBT(N, DAG, DCI);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
+ case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
+ case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
+ case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
+ case X86ISD::VSEXT:
+ case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
+ case X86ISD::SHUFP: // Handle all target specific shuffles
+ case X86ISD::INSERTPS:
+ case X86ISD::PALIGNR:
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
+ case X86ISD::BLENDI:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL:
+ case X86ISD::MOVHLPS:
+ case X86ISD::MOVLHPS:
+ case X86ISD::PSHUFB:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMI:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
+ case X86ISD::VPERMIV3:
+ case X86ISD::VPERMIL2:
+ case X86ISD::VPERMILPI:
+ case X86ISD::VPERMILPV:
+ case X86ISD::VPERM2X128:
+ case X86ISD::VZEXT_MOVL:
+ case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
+ case X86ISD::FMADD:
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMADDS1_RND:
+ case X86ISD::FMADDS3_RND:
+ case ISD::FMA: return combineFMA(N, DAG, Subtarget);
+ case ISD::MGATHER:
+ case ISD::MSCATTER: return combineGatherScatter(N, DAG);
+ case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
+ case X86ISD::TESTM: return combineTestM(N, DAG);
+ case X86ISD::PCMPEQ:
+ case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
+ }
+
+ // No combine registered for this opcode.
+ return SDValue();
+}
+
+/// Tell whether \p VT is both legal for the target and worth using for an
+/// operation with opcode \p Opc.
+///
+/// On x86 i16 is legal but its instruction encodings are longer and some i16
+/// instructions are slow, so a fixed set of i16 operations is reported as
+/// undesirable. Every other legal type is desirable for every operation.
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+  // An illegal type is never desirable.
+  if (!isTypeLegal(VT))
+    return false;
+
+  // Only i16 needs the per-opcode check below.
+  const bool IsI16 = VT == MVT::i16;
+  if (!IsI16)
+    return true;
+
+  bool Desirable = true;
+  switch (Opc) {
+  case ISD::LOAD:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SUB:
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    Desirable = false;
+    break;
+  default:
+    break;
+  }
+  return Desirable;
+}
+
+/// Report whether any instruction that references EFLAGS in \p MF is a COPY.
+///
+/// Lowering a COPY of EFLAGS has to go through the stack; if the stack is not
+/// adjusted for that, the first frame index gets clobbered.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+    MachineFunction *MF) const {
+  const MachineRegisterInfo &RegInfo = MF->getRegInfo();
+
+  for (const MachineInstr &FlagUser : RegInfo.reg_instructions(X86::EFLAGS))
+    if (FlagUser.isCopy())
+      return true;
+
+  return false;
+}
+
+/// Ask whether the DAG combiner should promote the i16 operation \p Op.
+///
+/// Extensions are always promoted. Shifts and the listed arithmetic/logic
+/// operations are promoted unless doing so would block folding a load (or a
+/// load-op-store sequence) into the instruction.
+///
+/// \param Op  the node under consideration; only i16 results are candidates.
+/// \param PVT set to MVT::i32 whenever the function reaches its final return,
+///            including the case where the answer is false for an opcode that
+///            is not listed; left untouched on the early `return false` paths.
+/// \returns true if promotion is desirable.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+  if (Op.getValueType() != MVT::i16)
+    return false;
+
+  const unsigned Opc = Op.getOpcode();
+  bool ShouldPromote = false;
+
+  switch (Opc) {
+  default:
+    break;
+
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    ShouldPromote = true;
+    break;
+
+  case ISD::SHL:
+  case ISD::SRL:
+    // Look out for (store (shl (load), x)).
+    if (MayFoldLoad(Op.getOperand(0)) && MayFoldIntoStore(Op))
+      return false;
+    ShouldPromote = true;
+    break;
+
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::SUB: {
+    // Everything in this group except SUB is commutative.
+    const bool IsCommutative = Opc != ISD::SUB;
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+
+    if (!IsCommutative && MayFoldLoad(RHS))
+      return false;
+
+    // Avoid disabling potential load folding opportunities.
+    const bool FoldsIntoStore = MayFoldLoad(LHS) || MayFoldLoad(RHS)
+                                    ? MayFoldIntoStore(Op)
+                                    : false;
+    if (MayFoldLoad(LHS) && (!isa<ConstantSDNode>(RHS) || FoldsIntoStore))
+      return false;
+    if (MayFoldLoad(RHS) && (!isa<ConstantSDNode>(LHS) || FoldsIntoStore))
+      return false;
+
+    ShouldPromote = true;
+    break;
+  }
+  }
+
+  PVT = MVT::i32;
+  return ShouldPromote;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// Match \p S against a sequence of whitespace-separated \p Pieces.
+///
+/// Leading blanks (spaces and tabs) are ignored. Each piece must appear in
+/// order and be followed either by at least one blank or by the end of the
+/// string, so a piece never matches as a mere prefix of a longer token.
+/// Nothing may remain after the last piece.
+static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
+  const char *const Blanks = " \t";
+
+  // Drop leading whitespace.
+  S = S.substr(S.find_first_not_of(Blanks));
+
+  for (const char *Expected : Pieces) {
+    StringRef Piece(Expected);
+    if (!S.startswith(Piece))
+      return false;
+    S = S.substr(Piece.size());
+
+    // A non-blank character directly after the piece means we only matched a
+    // prefix of a longer token.
+    StringRef::size_type NextToken = S.find_first_not_of(Blanks);
+    if (NextToken == 0)
+      return false;
+
+    S = S.substr(NextToken);
+  }
+
+  return S.empty();
+}
+
+/// Check that a split clobber list is exactly the flag-register set.
+///
+/// Accepts a list of three entries containing "~{cc}", "~{flags}" and
+/// "~{fpsr}", or a list of four entries that additionally contains
+/// "~{dirflag}". Anything else is rejected.
+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+  const size_t NumPieces = AsmPieces.size();
+  if (NumPieces != 3 && NumPieces != 4)
+    return false;
+
+  auto Mentions = [&AsmPieces](const char *Clobber) {
+    return std::count(AsmPieces.begin(), AsmPieces.end(), Clobber) != 0;
+  };
+
+  if (!Mentions("~{cc}") || !Mentions("~{flags}") || !Mentions("~{fpsr}"))
+    return false;
+
+  // With four entries the extra one has to be the direction flag.
+  return NumPieces == 3 || Mentions("~{dirflag}");
+}
+
+/// Recognize a few inline-asm byte-swap idioms and replace the call with a
+/// byte swap through IntrinsicLowering::LowerToByteSwap.
+///
+/// Only calls whose result is an integer with a bit width that is a multiple
+/// of 16 are considered. The asm string is split on ';' and newlines and then
+/// matched piece by piece.
+///
+/// \returns the result of LowerToByteSwap when an idiom is recognized, false
+/// otherwise.
+bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+
+ const std::string &AsmStr = IA->getAsmString();
+
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
+ // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+ SmallVector<StringRef, 4> AsmPieces;
+ SplitString(AsmStr, AsmPieces, ";\n");
+
+ // Dispatch on the number of asm statements.
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ // FIXME: this should verify that we are targeting a 486 or better. If not,
+ // we will turn this bswap into something that will be lowered to logical
+ // ops instead of emitting the bswap asm. For now, we don't support 486 or
+ // lower so don't worry about this.
+ // bswap $0
+ if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
+ // No need to check constraints, nothing other than the equivalent of
+ // "=r,0" would be valid here.
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+
+ // rorw $$8, ${0:w} --> llvm.bswap.i16
+ if (CI->getType()->isIntegerTy(16) &&
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
+ matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
+ // Reuse AsmPieces for the constraints that follow the "=r,0," prefix and
+ // require them to be exactly the flag-register clobbers.
+ AsmPieces.clear();
+ StringRef ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+ if (clobbersFlagRegisters(AsmPieces))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+ break;
+ case 3:
+ // rorw $$8, ${0:w}; rorl $$16, $0; rorw $$8, ${0:w} on an i32 result.
+ if (CI->getType()->isIntegerTy(32) &&
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
+ matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
+ matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
+ // As above: AsmPieces now holds the trailing constraints, not the asm.
+ AsmPieces.clear();
+ StringRef ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+ if (clobbersFlagRegisters(AsmPieces))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+
+ if (CI->getType()->isIntegerTy(64)) {
+ InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+ // The first constraint must be exactly "A" and the second exactly "0".
+ if (Constraints.size() >= 2 &&
+ Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+ Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+ // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
+ if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
+ matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
+ matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+/// Classify an inline-asm constraint string for this target.
+///
+/// Single letters are mapped to a register class, a specific register or an
+/// "other" (immediate-like) constraint; the two-letter constraint "Yk" names a
+/// register. Everything else is delegated to the generic TargetLowering
+/// implementation.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(StringRef Constraint) const {
+  switch (Constraint.size()) {
+  case 1:
+    switch (Constraint[0]) {
+    // Register classes.
+    case 'R':
+    case 'q':
+    case 'Q':
+    case 'f':
+    case 't':
+    case 'u':
+    case 'y':
+    case 'x':
+    case 'v':
+    case 'Y':
+    case 'l':
+      return C_RegisterClass;
+    // Specific registers.
+    case 'k': // AVX512 masking registers.
+    case 'a':
+    case 'b':
+    case 'c':
+    case 'd':
+    case 'S':
+    case 'D':
+    case 'A':
+      return C_Register;
+    // Immediates and other non-register operands.
+    case 'I':
+    case 'J':
+    case 'K':
+    case 'L':
+    case 'M':
+    case 'N':
+    case 'G':
+    case 'C':
+    case 'e':
+    case 'Z':
+      return C_Other;
+    default:
+      break;
+    }
+    break;
+
+  case 2:
+    // "Yk" is the only two-letter constraint handled here.
+    if (Constraint[0] == 'Y' && Constraint[1] == 'k')
+      return C_Register;
+    break;
+
+  default:
+    break;
+  }
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+///
+/// \param info       operand description; only info.CallOperandVal is read
+///                   here (it is also forwarded to the generic implementation).
+/// \param constraint constraint string; the first character selects the case,
+///                   and for 'Y' the second character is inspected as well.
+/// \returns CW_Default when there is no operand value, otherwise the weight
+///          chosen below (CW_Invalid if the operand does not fit).
+TargetLowering::ConstraintWeight
+X86TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    // Existing behaviour, now made explicit: letters not handled below start
+    // from the generic weight and then share the integer-register check, so an
+    // integer operand still ends up as CW_SpecificReg.
+    LLVM_FALLTHROUGH;
+  case 'R':
+  case 'q':
+  case 'Q':
+  case 'a':
+  case 'b':
+  case 'c':
+  case 'd':
+  case 'S':
+  case 'D':
+  case 'A':
+    if (type->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'f':
+  case 't':
+  case 'u':
+    if (type->isFloatingPointTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'y':
+    if (type->isX86_MMXTy() && Subtarget.hasMMX())
+      weight = CW_SpecificReg;
+    break;
+  case 'Y':
+    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
+    if (constraint[1] == 'k') {
+      // Support for 'Yk' (similarly to the 'k' variant below).
+      weight = CW_SpecificReg;
+      break;
+    }
+    // Else fall through (handle "Y" constraint).
+    LLVM_FALLTHROUGH;
+  case 'v':
+    // 512-bit operands need AVX512; narrower ones are checked under 'x'.
+    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
+      weight = CW_Register;
+    LLVM_FALLTHROUGH;
+  case 'x':
+    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
+      weight = CW_Register;
+    break;
+  case 'k':
+    // Enable conditional vector operations using %k<#> registers.
+    weight = CW_SpecificReg;
+    break;
+  case 'I': // Constant in [0, 31].
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 31)
+        weight = CW_Constant;
+    }
+    break;
+  case 'J': // Constant in [0, 63].
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 63)
+        weight = CW_Constant;
+    }
+    break;
+  case 'K': // Signed 8-bit constant.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+        weight = CW_Constant;
+    }
+    break;
+  case 'L': // Exactly 0xff or 0xffff.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+        weight = CW_Constant;
+    }
+    break;
+  case 'M': // Constant in [0, 3].
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 3)
+        weight = CW_Constant;
+    }
+    break;
+  case 'N': // Unsigned 8-bit constant.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xff)
+        weight = CW_Constant;
+    }
+    break;
+  case 'G':
+  case 'C': // Any floating-point constant.
+    if (isa<ConstantFP>(CallOperandVal)) {
+      weight = CW_Constant;
+    }
+    break;
+  case 'e': // Signed 32-bit constant.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80000000LL) &&
+          (C->getSExtValue() <= 0x7fffffffLL))
+        weight = CW_Constant;
+    }
+    break;
+  case 'Z': // Unsigned 32-bit constant.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xffffffff)
+        weight = CW_Constant;
+    }
+    break;
+  }
+  return weight;
+}
+
+/// Replace the catch-all "X" constraint with something more specific, chosen
+/// from the type of the corresponding operand.
+///
+/// Floating-point operands get "Y" when SSE2 is available and "x" when only
+/// SSE1 is; every other situation is left to the generic implementation.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+  if (!ConstraintVT.isFloatingPoint())
+    return TargetLowering::LowerXConstraint(ConstraintVT);
+
+  // Prefer SSE registers for FP values, best feature level first.
+  if (Subtarget.hasSSE2())
+    return "Y";
+  if (Subtarget.hasSSE1())
+    return "x";
+
+  // No SSE at all: use whatever the generic code picks.
+  return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+/// Lower the specified operand into the Ops vector.
+/// If it is invalid, don't add anything to Ops.
+///
+/// Only single-letter constraints are handled. In the switch below a `break`
+/// means "Result may have been set, fall out to the common tail", while a
+/// bare `return` rejects the operand without touching Ops. Letters that are
+/// not listed are forwarded to the generic TargetLowering implementation.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result;
+
+ // Only support length 1 constraints for now.
+ if (Constraint.length() > 1) return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default: break;
+ case 'I':
+ // Constant in [0, 31].
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 31) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'J':
+ // Constant in [0, 63].
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 63) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'K':
+ // Constant that fits in a signed 8-bit integer.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (isInt<8>(C->getSExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'L':
+ // Exactly 0xff or 0xffff, plus 0xffffffff on 64-bit subtargets.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
+ (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'M':
+ // Constant in [0, 3].
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 3) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'N':
+ // Constant in [0, 255].
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 255) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'O':
+ // Constant in [0, 127].
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 127) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'e': {
+ // 32-bit signed value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getSExtValue())) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
+ break;
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ }
+ return;
+ }
+ case 'Z': {
+ // 32-bit unsigned value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getZExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ return;
+ }
+ case 'i': {
+ // Literal immediates are always ok.
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
+ break;
+ }
+
+ // In any sort of PIC mode addresses need to be computed at runtime by
+ // adding in a register or some sort of table lookup. These can't
+ // be used as immediates.
+ if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
+ return;
+
+ // If we are in non-pic codegen mode, we allow the address of a global (with
+ // an optional displacement) to be used with 'i'.
+ GlobalAddressSDNode *GA = nullptr;
+ int64_t Offset = 0;
+
+ // Match either (GA), (GA+C), (GA+C1+C2), etc.
+ // Each iteration peels one constant ADD/SUB off Op and accumulates it into
+ // Offset; the loop exits via `break` once the global address is reached or
+ // via `return` when the shape is not recognized.
+ while (1) {
+ if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
+ Offset += GA->getOffset();
+ break;
+ } else if (Op.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ } else if (Op.getOpcode() == ISD::SUB) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += -C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ }
+
+ // Otherwise, this isn't something we can handle, reject it.
+ return;
+ }
+
+ const GlobalValue *GV = GA->getGlobal();
+ // If we require an extra load to get this address, as in PIC mode, we
+ // can't accept it.
+ if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
+ return;
+
+ Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
+ GA->getValueType(0), Offset);
+ break;
+ }
+ }
+
+ // Common tail: emit the operand we built, or defer to the generic lowering
+ // when no case above produced one.
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+ /// Tell whether \p RC belongs to the general purpose register family, i.e.
+ /// whether hasSuperClassEq() holds for one of the GR8/GR16/GR32/GR64 classes
+ /// or for LOW32_ADDR_ACCESS_RBP.
+ static bool isGRClass(const TargetRegisterClass &RC) {
+   // Probed in this order; the first hit wins.
+   const TargetRegisterClass *const GPRBases[] = {
+       &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
+       &X86::GR64RegClass, &X86::LOW32_ADDR_ACCESS_RBPRegClass};
+   for (const TargetRegisterClass *Base : GPRBases)
+     if (RC.hasSuperClassEq(Base))
+       return true;
+   return false;
+ }
+
+ /// Tell whether \p RC belongs to the vector register family, i.e. whether
+ /// hasSuperClassEq() holds for one of FR32X, FR64X, VR128X, VR256X or VR512.
+ static bool isFRClass(const TargetRegisterClass &RC) {
+   // Probed in this order; the first hit wins.
+   const TargetRegisterClass *const VecBases[] = {
+       &X86::FR32XRegClass, &X86::FR64XRegClass, &X86::VR128XRegClass,
+       &X86::VR256XRegClass, &X86::VR512RegClass};
+   for (const TargetRegisterClass *Base : VecBases)
+     if (RC.hasSuperClassEq(Base))
+       return true;
+   return false;
+ }
+
+ /// Resolve an inline-asm register constraint to a (register, register class)
+ /// pair for the value type \p VT.
+ ///
+ /// Single-letter GCC constraints ('k', 'q', 'Q', 'r', 'l', 'R', 'f', 'y', 'Y',
+ /// 'v', 'x') and the two-letter "Yk" constraint are mapped straight to a
+ /// register class with a zero register number. Everything else is handed to
+ /// TargetLowering::getRegForInlineAsmConstraint; if that finds nothing, the
+ /// x87 stack names ("{st}", "{st(N)}"), "{flags}" and "A" are recognised here.
+ /// A register that was found in a class whose type does not match \p VT is
+ /// then remapped to the same-sized GPR or to the matching FR/VR class.
+ ///
+ /// \param TRI        register info forwarded to the generic implementation.
+ /// \param Constraint the constraint string to resolve.
+ /// \param VT         value type of the operand; MVT::Other denotes a clobber.
+ /// \returns the selected register and class; both members are cleared
+ /// (0 / nullptr) when a named register cannot be matched to \p VT.
+ std::pair<unsigned, const TargetRegisterClass *>
+ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                 StringRef Constraint,
+                                                 MVT VT) const {
+   // First, see if this is a constraint that directly corresponds to an LLVM
+   // register class.
+   if (Constraint.size() == 1) {
+     // GCC Constraint Letters
+     switch (Constraint[0]) {
+     default: break;
+       // TODO: Slight differences here in allocation order and leaving
+       // RIP in the class. Do they matter any more here than they do
+       // in the normal allocation?
+     case 'k':
+       if (Subtarget.hasAVX512()) {
+         // Only supported in AVX512 or later.
+         switch (VT.SimpleTy) {
+         default: break;
+         case MVT::i32:
+           return std::make_pair(0U, &X86::VK32RegClass);
+         case MVT::i16:
+           return std::make_pair(0U, &X86::VK16RegClass);
+         case MVT::i8:
+           return std::make_pair(0U, &X86::VK8RegClass);
+         case MVT::i1:
+           return std::make_pair(0U, &X86::VK1RegClass);
+         case MVT::i64:
+           return std::make_pair(0U, &X86::VK64RegClass);
+         }
+       }
+       break;
+     case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+       if (Subtarget.is64Bit()) {
+         if (VT == MVT::i32 || VT == MVT::f32)
+           return std::make_pair(0U, &X86::GR32RegClass);
+         if (VT == MVT::i16)
+           return std::make_pair(0U, &X86::GR16RegClass);
+         if (VT == MVT::i8 || VT == MVT::i1)
+           return std::make_pair(0U, &X86::GR8RegClass);
+         if (VT == MVT::i64 || VT == MVT::f64)
+           return std::make_pair(0U, &X86::GR64RegClass);
+         break;
+       }
+       // In 32-bit mode 'q' is handled exactly like 'Q'; annotate the
+       // intentional fallthrough the same way the 'Y' case below does.
+       LLVM_FALLTHROUGH;
+     case 'Q': // Q_REGS
+       if (VT == MVT::i32 || VT == MVT::f32)
+         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+       if (VT == MVT::i16)
+         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+       if (VT == MVT::i8 || VT == MVT::i1)
+         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+       if (VT == MVT::i64)
+         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
+       break;
+     case 'r': // GENERAL_REGS
+     case 'l': // INDEX_REGS
+       if (VT == MVT::i8 || VT == MVT::i1)
+         return std::make_pair(0U, &X86::GR8RegClass);
+       if (VT == MVT::i16)
+         return std::make_pair(0U, &X86::GR16RegClass);
+       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+         return std::make_pair(0U, &X86::GR32RegClass);
+       return std::make_pair(0U, &X86::GR64RegClass);
+     case 'R': // LEGACY_REGS
+       if (VT == MVT::i8 || VT == MVT::i1)
+         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
+       if (VT == MVT::i16)
+         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
+       if (VT == MVT::i32 || !Subtarget.is64Bit())
+         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+     case 'f': // FP Stack registers.
+       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+       // value to the correct fpstack register class.
+       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+         return std::make_pair(0U, &X86::RFP32RegClass);
+       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+         return std::make_pair(0U, &X86::RFP64RegClass);
+       return std::make_pair(0U, &X86::RFP80RegClass);
+     case 'y': // MMX_REGS if MMX allowed.
+       if (!Subtarget.hasMMX()) break;
+       return std::make_pair(0U, &X86::VR64RegClass);
+     case 'Y': // SSE_REGS if SSE2 allowed
+       if (!Subtarget.hasSSE2()) break;
+       LLVM_FALLTHROUGH;
+     case 'v':
+     case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+       if (!Subtarget.hasSSE1()) break;
+       // Only 'v' may pick the extended (*X) register classes below.
+       bool VConstraint = (Constraint[0] == 'v');
+
+       switch (VT.SimpleTy) {
+       default: break;
+       // Scalar SSE types.
+       case MVT::f32:
+       case MVT::i32:
+         if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
+           return std::make_pair(0U, &X86::FR32XRegClass);
+         return std::make_pair(0U, &X86::FR32RegClass);
+       case MVT::f64:
+       case MVT::i64:
+         if (VConstraint && Subtarget.hasVLX())
+           return std::make_pair(0U, &X86::FR64XRegClass);
+         return std::make_pair(0U, &X86::FR64RegClass);
+       // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
+       // Vector types.
+       case MVT::v16i8:
+       case MVT::v8i16:
+       case MVT::v4i32:
+       case MVT::v2i64:
+       case MVT::v4f32:
+       case MVT::v2f64:
+         if (VConstraint && Subtarget.hasVLX())
+           return std::make_pair(0U, &X86::VR128XRegClass);
+         return std::make_pair(0U, &X86::VR128RegClass);
+       // AVX types.
+       case MVT::v32i8:
+       case MVT::v16i16:
+       case MVT::v8i32:
+       case MVT::v4i64:
+       case MVT::v8f32:
+       case MVT::v4f64:
+         if (VConstraint && Subtarget.hasVLX())
+           return std::make_pair(0U, &X86::VR256XRegClass);
+         return std::make_pair(0U, &X86::VR256RegClass);
+       case MVT::v8f64:
+       case MVT::v16f32:
+       case MVT::v16i32:
+       case MVT::v8i64:
+         return std::make_pair(0U, &X86::VR512RegClass);
+       }
+       break;
+     }
+   } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
+     switch (Constraint[1]) {
+     default:
+       break;
+     case 'k':
+       // This register class doesn't allocate k0 for masked vector operation.
+       if (Subtarget.hasAVX512()) { // Only supported in AVX512.
+         switch (VT.SimpleTy) {
+         default: break;
+         case MVT::i32:
+           return std::make_pair(0U, &X86::VK32WMRegClass);
+         case MVT::i16:
+           return std::make_pair(0U, &X86::VK16WMRegClass);
+         case MVT::i8:
+           return std::make_pair(0U, &X86::VK8WMRegClass);
+         case MVT::i1:
+           return std::make_pair(0U, &X86::VK1WMRegClass);
+         case MVT::i64:
+           return std::make_pair(0U, &X86::VK64WMRegClass);
+         }
+       }
+       break;
+     }
+   }
+
+   // Use the default implementation in TargetLowering to convert the register
+   // constraint into a member of a register class.
+   std::pair<unsigned, const TargetRegisterClass*> Res;
+   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+   // Not found as a standard register?
+   if (!Res.second) {
+     // Map st(0) -> st(7) -> ST0
+     if (Constraint.size() == 7 && Constraint[0] == '{' &&
+         tolower(Constraint[1]) == 's' &&
+         tolower(Constraint[2]) == 't' &&
+         Constraint[3] == '(' &&
+         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+         Constraint[5] == ')' &&
+         Constraint[6] == '}') {
+
+       Res.first = X86::FP0+Constraint[4]-'0';
+       Res.second = &X86::RFP80RegClass;
+       return Res;
+     }
+
+     // GCC allows "st(0)" to be called just plain "st".
+     if (StringRef("{st}").equals_lower(Constraint)) {
+       Res.first = X86::FP0;
+       Res.second = &X86::RFP80RegClass;
+       return Res;
+     }
+
+     // flags -> EFLAGS
+     if (StringRef("{flags}").equals_lower(Constraint)) {
+       Res.first = X86::EFLAGS;
+       Res.second = &X86::CCRRegClass;
+       return Res;
+     }
+
+     // 'A' means EAX + EDX.
+     if (Constraint == "A") {
+       Res.first = X86::EAX;
+       Res.second = &X86::GR32_ADRegClass;
+       return Res;
+     }
+     return Res;
+   }
+
+   // Otherwise, check to see if this is a register class of the wrong value
+   // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+   // turn into {ax},{dx}.
+   // MVT::Other is used to specify clobber names.
+   if (Res.second->hasType(VT) || VT == MVT::Other)
+     return Res; // Correct type already, nothing to do.
+
+   // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
+   // return "eax". This should even work for things like getting 64bit integer
+   // registers when given an f64 type.
+   const TargetRegisterClass *Class = Res.second;
+   // The generic code will match the first register class that contains the
+   // given register. Thus, based on the ordering of the tablegened file,
+   // the "plain" GR classes might not come first.
+   // Therefore, use a helper method.
+   if (isGRClass(*Class)) {
+     unsigned Size = VT.getSizeInBits();
+     // An i1 is looked up as an 8-bit register.
+     if (Size == 1) Size = 8;
+     unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
+     if (DestReg > 0) {
+       Res.first = DestReg;
+       Res.second = Size == 8 ? &X86::GR8RegClass
+                  : Size == 16 ? &X86::GR16RegClass
+                  : Size == 32 ? &X86::GR32RegClass
+                  : &X86::GR64RegClass;
+       assert(Res.second->contains(Res.first) && "Register in register class");
+     } else {
+       // No register found/type mismatch.
+       Res.first = 0;
+       Res.second = nullptr;
+     }
+   } else if (isFRClass(*Class)) {
+     // Handle references to XMM physical registers that got mapped into the
+     // wrong class. This can happen with constraints like {xmm0} where the
+     // target independent register mapper will just pick the first match it can
+     // find, ignoring the required type.
+
+     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
+     if (VT == MVT::f32 || VT == MVT::i32)
+       Res.second = &X86::FR32RegClass;
+     else if (VT == MVT::f64 || VT == MVT::i64)
+       Res.second = &X86::FR64RegClass;
+     else if (X86::VR128RegClass.hasType(VT))
+       Res.second = &X86::VR128RegClass;
+     else if (X86::VR256RegClass.hasType(VT))
+       Res.second = &X86::VR256RegClass;
+     else if (X86::VR512RegClass.hasType(VT))
+       Res.second = &X86::VR512RegClass;
+     else {
+       // Type mismatch and not a clobber: Return an error;
+       Res.first = 0;
+       Res.second = nullptr;
+     }
+   }
+
+   return Res;
+ }
+
+ /// Price the scaling factor of addressing mode \p AM.
+ ///
+ /// \returns -1 when \p AM is not a legal addressing mode for \p Ty in address
+ /// space \p AS, 0 when it is legal and uses no scaled register, and 1 when it
+ /// is legal and AM.Scale is non-zero.
+ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
+                                             const AddrMode &AM, Type *Ty,
+                                             unsigned AS) const {
+   // Why a scaled index is never free:
+   // an indexed folded instruction, i.e., inst (reg1, reg2, scale), takes
+   // 2 allocations in the out of order engine instead of the single one a
+   // plain addressing mode, i.e. inst (reg1), needs.
+   // E.g.,
+   //   vaddps (%rsi,%rdx), %ymm0, %ymm1
+   // requires two allocations (one for the load, one for the computation),
+   // whereas
+   //   vaddps (%rsi), %ymm0, %ymm1
+   // requires just 1 allocation, i.e., it frees allocations for other
+   // operations and leaves fewer micro operations to execute.
+   //
+   // On some X86 architectures it is even worse: for stores, for instance, the
+   // complex addressing mode forces the instruction onto the "load" ports
+   // instead of the dedicated "store" port.
+   // E.g., on Haswell:
+   //   vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+   //   vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+   if (!isLegalAddressingMode(DL, AM, Ty, AS))
+     return -1;
+
+   // Scale stands for reg2 * scale, so charge 1 as soon as a second register
+   // takes part in the address.
+   return AM.Scale == 0 ? 0 : 1;
+ }
+
+ /// Decide whether an integer division of type \p VT should be kept as a
+ /// division given the function attributes \p Attr.
+ ///
+ /// \returns true only for non-vector types in functions carrying the MinSize
+ /// attribute.
+ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+   // x86 has no vector integer division: a vector division would have to be
+   // scalarized, which loses even in terms of size, while the alternative
+   // sequence can stay in vector form.
+   if (VT.isVector())
+     return false;
+
+   // Scalar division is expensive on x86, but when aggressively optimizing
+   // for code size a div instruction is usually smaller than the alternative
+   // sequence, so prefer it under MinSize.
+   return Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ }
+
+ /// Flag the function that owns \p Entry as using split callee-saved-register
+ /// handling. Nothing is done unless the subtarget is 64-bit.
+ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+   if (Subtarget.is64Bit()) {
+     // Record IsSplitCSR in the function's X86MachineFunctionInfo.
+     auto *FuncInfo = Entry->getParent()->getInfo<X86MachineFunctionInfo>();
+     FuncInfo->setIsSplitCSR(true);
+   }
+ }
+
+ /// Preserve the callee-saved registers that are handled via copies.
+ ///
+ /// For every register returned by getCalleeSavedRegsViaCopy() a COPY into a
+ /// fresh virtual register is emitted at the start of \p Entry, and a COPY
+ /// back into the physical register is emitted before the first terminator of
+ /// every block in \p Exits. Returns early when there is no such register list.
+ void X86TargetLowering::insertCopiesSplitCSR(
+     MachineBasicBlock *Entry,
+     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+   auto *MF = Entry->getParent();
+   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+   const MCPhysReg *CSRList = RegInfo->getCalleeSavedRegsViaCopy(MF);
+   if (CSRList == nullptr)
+     return;
+
+   const TargetInstrInfo *InstrInfo = Subtarget.getInstrInfo();
+   MachineRegisterInfo &VRegInfo = MF->getRegInfo();
+   // Captured once: every save copy lands in front of the instruction that
+   // was first in the entry block when we started.
+   MachineBasicBlock::iterator InsertPt = Entry->begin();
+
+   // The list is terminated by a zero register.
+   for (const MCPhysReg *CSR = CSRList; *CSR != 0; ++CSR) {
+     const MCPhysReg PhysReg = *CSR;
+     if (!X86::GR64RegClass.contains(PhysReg))
+       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+     unsigned SavedVR = VRegInfo.createVirtualRegister(&X86::GR64RegClass);
+     // Copy the CSR into the virtual register.
+     // FIXME: no CFI pseudo-instructions are emitted here. That is fine for
+     // CXX_FAST_TLS because the C++-style TLS access functions should be
+     // nounwind; generalizing this later may require emitting them.
+     assert(MF->getFunction()->hasFnAttribute(Attribute::NoUnwind) &&
+            "Function should be nounwind in insertCopiesSplitCSR!");
+     Entry->addLiveIn(PhysReg);
+     BuildMI(*Entry, InsertPt, DebugLoc(), InstrInfo->get(TargetOpcode::COPY),
+             SavedVR)
+         .addReg(PhysReg);
+
+     // Restore the CSR right before the terminator of each exit block.
+     for (MachineBasicBlock *Exit : Exits)
+       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+               InstrInfo->get(TargetOpcode::COPY), PhysReg)
+           .addReg(SavedVR);
+   }
+ }
+
+ /// Report swifterror support: it is available exactly when the subtarget is
+ /// 64-bit.
+ bool X86TargetLowering::supportSwiftError() const {
+ return Subtarget.is64Bit();
+ }
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 000000000000..37f9353042b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,1382 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+namespace llvm {
+ class X86Subtarget;
+ class X86TargetMachine;
+
+ namespace X86ISD {
+ // X86 Specific DAG Nodes
+ enum NodeType : unsigned {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// Bit scan forward.
+ BSF,
+ /// Bit scan reverse.
+ BSR,
+
+ /// Double shift instructions. These correspond to
+ /// X86::SHLDxx and X86::SHRDxx instructions.
+ SHLD,
+ SHRD,
+
+ /// Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+ /// Bitwise logical ANDNOT of floating point values. This
+ /// corresponds to X86::ANDNPS or X86::ANDNPD.
+ FANDN,
+
+ /// These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+ /// operands of these nodes are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ CALL,
+
+ /// This operation implements the lowering for readcyclecounter.
+ RDTSC_DAG,
+
+ /// X86 Read Time-Stamp Counter and Processor ID.
+ RDTSCP_DAG,
+
+ /// X86 Read Performance Monitoring Counters.
+ RDPMC_DAG,
+
+ /// X86 compare and logical compare instructions.
+ CMP, COMI, UCOMI,
+
+ /// X86 bit-test instructions.
+ BT,
+
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+ /// operand, usually produced by a CMP instruction.
+ SETCC,
+
+ /// X86 Select
+ SELECT, SELECTS,
+
+ // Same as SETCC except it's materialized with a sbb and the value is all
+ // one's or all zero's.
+ SETCC_CARRY, // R = carry_bit ? ~0 : 0
+
+ /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+ /// Operands are two FP values to compare; result is a mask of
+ /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
+ FSETCC,
+
+ /// X86 FP SETCC, similar to above, but with output as an i1 mask and
+ /// with optional rounding mode.
+ FSETCCM, FSETCCM_RND,
+
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction. It also writes a
+ /// flag result.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// Return with a flag operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// Return from interrupt. Operand 0 is the number of bytes to pop.
+ IRET,
+
+ /// Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// A wrapper node for TargetConstantPool, TargetJumpTable,
+ /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
+ /// MCSymbol and TargetBlockAddress.
+ Wrapper,
+
+ /// Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// Copies a 64-bit value from the low word of an XMM vector
+ /// to an MMX vector. If you think this is too close to the previous
+ /// mnemonic, so do I; blame Intel.
+ MOVDQ2Q,
+
+ /// Copies a 32-bit value from the low word of a MMX
+ /// vector to a GPR.
+ MMX_MOVD2W,
+
+ /// Copies a GPR into the low 32-bit word of a MMX vector
+ /// and zero out the high word.
+ MMX_MOVW2D,
+
+ /// Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// Insert any element of a 4 x float vector into any element
+ /// of a destination 4 x float vector.
+ INSERTPS,
+
+ /// Insert the lower 8-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// Insert the lower 16-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW, MMX_PINSRW,
+
+ /// Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// Compute Sum of Absolute Differences.
+ PSADBW,
+ /// Compute Double Block Packed Sum-Absolute-Differences
+ DBPSADBW,
+
+ /// Bitwise Logical AND NOT of Packed FP values.
+ ANDNP,
+
+ /// Blend where the selector is an immediate.
+ BLENDI,
+
+ /// Blend where the condition has been shrunk.
+ /// This is used to emphasize that the condition mask is
+ /// no longer valid for generic VSELECT optimizations.
+ SHRUNKBLEND,
+
+ /// Combined add and sub on an FP vector.
+ ADDSUB,
+
+ // FP vector ops with rounding mode.
+ FADD_RND,
+ FSUB_RND,
+ FMUL_RND,
+ FDIV_RND,
+ FMAX_RND,
+ FMIN_RND,
+ FSQRT_RND, FSQRTS_RND,
+
+ // FP vector get exponent.
+ FGETEXP_RND, FGETEXPS_RND,
+ // Extract Normalized Mantissas.
+ VGETMANT, VGETMANTS,
+ // FP Scale.
+ SCALEF,
+ SCALEFS,
+
+ // Integer add/sub with unsigned saturation.
+ ADDUS,
+ SUBUS,
+
+ // Integer add/sub with signed saturation.
+ ADDS,
+ SUBS,
+
+ // Unsigned Integer average.
+ AVG,
+
+ /// Integer horizontal add/sub.
+ HADD,
+ HSUB,
+
+ /// Floating point horizontal add/sub.
+ FHADD,
+ FHSUB,
+
+ // Integer absolute value
+ ABS,
+
+ // Detect Conflicts Within a Vector
+ CONFLICT,
+
+ /// Floating point max and min.
+ FMAX, FMIN,
+
+ /// Commutative FMIN and FMAX.
+ FMAXC, FMINC,
+
+ /// Floating point reciprocal-sqrt and reciprocal approximation.
+ /// Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT, FRCP,
+ FRSQRTS, FRCPS,
+
+ // Thread Local Storage.
+ TLSADDR,
+
+ // Thread Local Storage. A call to get the start address
+ // of the TLS block for the current module.
+ TLSBASEADDR,
+
+ // Thread Local Storage. When calling to an OS provided
+ // thunk at the address from an earlier relocation.
+ TLSCALL,
+
+ // Exception Handling helpers.
+ EH_RETURN,
+
+ // SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
+ // SjLj exception handling dispatch.
+ EH_SJLJ_SETUP_DISPATCH,
+
+ /// Tail call return. See X86TargetLowering::LowerCall for
+ /// the list of operands.
+ TC_RETURN,
+
+ // Vector move to low scalar and zero higher vector elements.
+ VZEXT_MOVL,
+
+ // Vector integer zero-extend.
+ VZEXT,
+ // Vector integer signed-extend.
+ VSEXT,
+
+ // Vector integer truncate.
+ VTRUNC,
+ // Vector integer truncate with unsigned/signed saturation.
+ VTRUNCUS, VTRUNCS,
+
+ // Vector FP extend.
+ VFPEXT, VFPEXT_RND, VFPEXTS_RND,
+
+ // Vector FP round.
+ VFPROUND, VFPROUND_RND, VFPROUNDS_RND,
+
+ // Convert a vector to mask, set bits based on MSB.
+ CVT2MASK,
+
+ // 128-bit vector logical left / right shift
+ VSHLDQ, VSRLDQ,
+
+ // Vector shift elements
+ VSHL, VSRL, VSRA,
+
+ // Vector variable shift right arithmetic.
+ // Unlike ISD::SRA, in case shift count greater than element size
+ // use sign bit to fill destination data element.
+ VSRAV,
+
+ // Vector shift elements by immediate
+ VSHLI, VSRLI, VSRAI,
+
+ // Bit rotate by immediate
+ VROTLI, VROTRI,
+
+ // Vector packed double/float comparison.
+ CMPP,
+
+ // Vector integer comparisons.
+ PCMPEQ, PCMPGT,
+ // Vector integer comparisons, the result is in a mask vector.
+ PCMPEQM, PCMPGTM,
+
+ MULTISHIFT,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ CMPM,
+ CMPMU,
+ // Vector comparison with rounding mode for FP values
+ CMPM_RND,
+
+ // Arithmetic operations with FLAGS results.
+ ADD, SUB, ADC, SBB, SMUL,
+ INC, DEC, OR, XOR, AND,
+
+ // Bit field extract.
+ BEXTR,
+
+ // LOW, HI, FLAGS = umul LHS, RHS.
+ UMUL,
+
+ // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS.
+ SMUL8, UMUL8,
+
+ // 8-bit divrem that zero-/sign-extend the high result (AH).
+ UDIVREM8_ZEXT_HREG,
+ SDIVREM8_SEXT_HREG,
+
+ // X86-specific multiply by immediate.
+ MUL_IMM,
+
+ // Vector sign bit extraction.
+ MOVMSK,
+
+ // Vector bitwise comparisons.
+ PTEST,
+
+ // Vector packed fp sign bitwise comparisons.
+ TESTP,
+
+ // Vector "test" in AVX-512, the result is in a mask vector.
+ TESTM,
+ TESTNM,
+
+ // OR/AND test for masks.
+ KORTEST,
+ KTEST,
+
+ // Several flavors of instructions with vector shuffle behaviors.
+ // Saturated signed/unsigned packing.
+ PACKSS,
+ PACKUS,
+ // Intra-lane alignr.
+ PALIGNR,
+ // AVX512 inter-lane alignr.
+ VALIGN,
+ PSHUFD,
+ PSHUFHW,
+ PSHUFLW,
+ SHUFP,
+ // Shuffle Packed Values at 128-bit granularity.
+ SHUF128,
+ MOVDDUP,
+ MOVSHDUP,
+ MOVSLDUP,
+ MOVLHPS,
+ MOVLHPD,
+ MOVHLPS,
+ MOVLPS,
+ MOVLPD,
+ MOVSD,
+ MOVSS,
+ UNPCKL,
+ UNPCKH,
+ VPERMILPV,
+ VPERMILPI,
+ VPERMI,
+ VPERM2X128,
+
+ // Variable Permute (VPERM).
+ // Res = VPERMV MaskV, V0
+ VPERMV,
+
+ // 3-op Variable Permute (VPERMT2).
+ // Res = VPERMV3 V0, MaskV, V1
+ VPERMV3,
+
+ // 3-op Variable Permute overwriting the index (VPERMI2).
+ // Res = VPERMIV3 V0, MaskV, V1
+ VPERMIV3,
+
+ // Bitwise ternary logic.
+ VPTERNLOG,
+ // Fix Up Special Packed Float32/64 values.
+ VFIXUPIMM,
+ VFIXUPIMMS,
+ // Range Restriction Calculation For Packed Pairs of Float32/64 values.
+ VRANGE,
+ // Reduce - Perform Reduction Transformation on scalar/packed FP.
+ VREDUCE, VREDUCES,
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ VRNDSCALE, VRNDSCALES,
+ // Tests the types of FP values, for packed types.
+ VFPCLASS,
+ // Tests the types of FP values, for scalar types.
+ VFPCLASSS,
+
+ // Broadcast scalar to vector.
+ VBROADCAST,
+ // Broadcast mask to vector.
+ VBROADCASTM,
+ // Broadcast subvector to vector.
+ SUBV_BROADCAST,
+
+ // Insert/Extract vector element.
+ VINSERT,
+ VEXTRACT,
+
+ /// SSE4A Extraction and Insertion.
+ EXTRQI, INSERTQI,
+
+ // XOP variable/immediate rotations.
+ VPROT, VPROTI,
+ // XOP arithmetic/logical shifts.
+ VPSHA, VPSHL,
+ // XOP signed/unsigned integer comparisons.
+ VPCOM, VPCOMU,
+ // XOP packed permute bytes.
+ VPPERM,
+ // XOP two source permutation.
+ VPERMIL2,
+
+ // Vector multiply packed unsigned doubleword integers.
+ PMULUDQ,
+ // Vector multiply packed signed doubleword integers.
+ PMULDQ,
+ // Vector multiply packed integers with round and scale.
+ MULHRS,
+
+ // Multiply and Add Packed Integers.
+ VPMADDUBSW, VPMADDWD,
+ VPMADD52L, VPMADD52H,
+
+ // FMA nodes.
+ FMADD,
+ FNMADD,
+ FMSUB,
+ FNMSUB,
+ FMADDSUB,
+ FMSUBADD,
+
+ // FMA with rounding mode.
+ FMADD_RND,
+ FNMADD_RND,
+ FMSUB_RND,
+ FNMSUB_RND,
+ FMADDSUB_RND,
+ FMSUBADD_RND,
+
+ // Scalar intrinsic FMA with rounding mode.
+ // Two versions, passthru bits on op1 or op3.
+ FMADDS1_RND, FMADDS3_RND,
+ FNMADDS1_RND, FNMADDS3_RND,
+ FMSUBS1_RND, FMSUBS3_RND,
+ FNMSUBS1_RND, FNMSUBS3_RND,
+
+ // Compress and expand.
+ COMPRESS,
+ EXPAND,
+
+ // Convert Signed/Unsigned Integer to Floating-Point Value with rounding
+ // mode.
+ SINT_TO_FP_RND, UINT_TO_FP_RND,
+ SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
+
+ // Vector float/double to signed/unsigned integer.
+ CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
+ // Scalar float/double to signed/unsigned integer.
+ CVTS2SI_RND, CVTS2UI_RND,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND,
+ // Scalar float/double to signed/unsigned integer with truncation.
+ CVTTS2SI_RND, CVTTS2UI_RND,
+
+ // Vector signed/unsigned integer to float/double.
+ CVTSI2P, CVTUI2P,
+
+ // Save xmm argument registers to the stack, according to %al. An operator
+ // is needed so that this can be expanded with control flow.
+ VASTART_SAVE_XMM_REGS,
+
+ // Windows's _chkstk call to do stack probing.
+ WIN_ALLOCA,
+
+ // For allocating variable amounts of stack space when using
+ // segmented stacks. Check if the current stacklet has enough space, and
+ // falls back to heap allocation if not.
+ SEG_ALLOCA,
+
+ // Memory barriers.
+ MEMBARRIER,
+ MFENCE,
+
+ // Store FP status word into i16 register.
+ FNSTSW16r,
+
+ // Store contents of %ah into %eflags.
+ SAHF,
+
+ // Get a random integer and indicate whether it is valid in CF.
+ RDRAND,
+
+ // Get a NIST SP800-90B & C compliant random integer and
+ // indicate whether it is valid in CF.
+ RDSEED,
+
+ // SSE42 string comparisons.
+ PCMPISTRI,
+ PCMPESTRI,
+
+ // Test if in transactional execution.
+ XTEST,
+
+ // ERI instructions.
+ RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
+
+ // Conversions between float and half-float.
+ CVTPS2PH, CVTPH2PS,
+
+ // Numbering restarts at ISD::FIRST_TARGET_MEMORY_OPCODE here: this and
+ // every later enumerator is treated as a target memory op (see the
+ // WARNING at the end of the enum).
+ // Compare and swap.
+ LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LCMPXCHG8_DAG,
+ LCMPXCHG16_DAG,
+ LCMPXCHG8_SAVE_EBX_DAG,
+ LCMPXCHG16_SAVE_RBX_DAG,
+
+ /// LOCK-prefixed arithmetic read-modify-write instructions.
+ /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+ LADD, LSUB, LOR, LXOR, LAND,
+
+ // Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+ // Store FP control word into i16 memory.
+ FNSTCW16m,
+
+ /// This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain).
+ FP_TO_INT16_IN_MEM,
+ FP_TO_INT32_IN_MEM,
+ FP_TO_INT64_IN_MEM,
+
+ /// This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has three inputs (token chain, address,
+ /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+ /// also produces a flag.
+ FILD,
+ FILD_FLAG,
+
+ /// This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, ptr to load from, and a ValueType node indicating the type
+ /// to load to.
+ FLD,
+
+ /// This instruction implements a truncating store to FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and a ValueType to store it
+ /// as.
+ FST,
+
+ /// This instruction grabs the address of the next argument
+ /// from a va_list. (reads and modifies the va_list in memory)
+ VAARG_64,
+
+ // Vector truncating store with unsigned/signed saturation
+ VTRUNCSTOREUS, VTRUNCSTORES,
+ // Vector truncating masked store with unsigned/signed saturation
+ VMTRUNCSTOREUS, VMTRUNCSTORES
+
+ // WARNING: Do not add anything at the end unless you want the node to
+ // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
+ // opcodes will be thought of as target memory ops!
+ };
+ } // end namespace X86ISD
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+ /// Return true if the specified
+ /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+ /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions.
+ bool isVEXTRACT128Index(SDNode *N);
+
+ /// Return true if the specified
+ /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+ /// suitable for input to VINSERTF128, VINSERTI128 instructions.
+ bool isVINSERT128Index(SDNode *N);
+
+ /// Return true if the specified
+ /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+ /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions.
+ bool isVEXTRACT256Index(SDNode *N);
+
+ /// Return true if the specified
+ /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+ /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions.
+ bool isVINSERT256Index(SDNode *N);
+
+ /// Return the appropriate
+ /// immediate to extract the specified EXTRACT_SUBVECTOR index
+ /// with VEXTRACTF128, VEXTRACTI128 instructions.
+ unsigned getExtractVEXTRACT128Immediate(SDNode *N);
+
+ /// Return the appropriate
+ /// immediate to insert at the specified INSERT_SUBVECTOR index
+ /// with VINSERTF128, VINSERT128 instructions.
+ unsigned getInsertVINSERT128Immediate(SDNode *N);
+
+ /// Return the appropriate
+ /// immediate to extract the specified EXTRACT_SUBVECTOR index
+ /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions.
+ unsigned getExtractVEXTRACT256Immediate(SDNode *N);
+
+ /// Return the appropriate
+ /// immediate to insert at the specified INSERT_SUBVECTOR index
+ /// with VINSERTF64x4, VINSERTI64x4 instructions.
+ unsigned getInsertVINSERT256Immediate(SDNode *N);
+
+ /// Returns true if Elt is a constant zero or floating point constant +0.0.
+ bool isZeroNode(SDValue Elt);
+
+ /// Returns true if the given offset can
+ /// fit into the displacement field of the instruction.
+ bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement = true);
+
+ /// Determines whether the callee is required to pop its
+ /// own arguments. Callee pop is necessary to support tail calls.
+ bool isCalleePop(CallingConv::ID CallingConv,
+ bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
+
+ } // end namespace X86
+
+ //===--------------------------------------------------------------------===//
+ // X86 Implementation of the TargetLowering interface
+ class X86TargetLowering final : public TargetLowering {
+ public:
+ explicit X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI);
+
+ unsigned getJumpTableEncoding() const override;
+ bool useSoftFloat() const override;
+
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i8;
+ }
+
+ const MCExpr *
+ LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB, unsigned uid,
+ MCContext &Ctx) const override;
+
+ /// Returns relocation base for the given PIC jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const override;
+ const MCExpr *
+ getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI, MCContext &Ctx) const override;
+
+ /// Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. For X86, aggregates
+ /// that contain SSE vectors are placed at 16-byte boundaries while the rest
+ /// are at 4-byte boundaries.
+ unsigned getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const override;
+
+ /// Returns the target specific optimal type for load
+ /// and store operations as a result of memset, memcpy, and memmove
+ /// lowering. If DstAlign is zero that means the destination
+ /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+ /// means there isn't a need to check it against alignment requirement,
+ /// probably because the source does not need to be loaded. If 'IsMemset' is
+ /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+ /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+ /// source is constant so it does not need to be loaded.
+ /// It returns EVT::Other if the type should be determined using generic
+ /// target-independent logic.
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ /// Returns true if it's safe to use load / store of the
+ /// specified type to expand memcpy / memset inline. This is mostly true
+ /// for all types except for some special cases. For example, on X86
+ /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+ /// also does type conversion. Note the specified type doesn't have to be
+ /// legal as the hook is used before type legalization.
+ bool isSafeMemOpType(MVT VT) const override;
+
+ /// Returns true if the target allows unaligned memory accesses of the
+ /// specified type. Returns whether it is "fast" in the last argument.
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+ bool *Fast) const override;
+
+ /// Provide custom lowering hooks for some operations.
+ ///
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// Places new result values for the node in Results (their number
+ /// and types must exactly match those of the original return values of
+ /// the node), or leaves Results empty, which indicates that the node is not
+ /// to be custom lowered after all.
+ void LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ /// Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ /// Return true if the target has native support for
+ /// the specified value type and it is 'desirable' to use the type for the
+ /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+ /// instruction encodings are longer and some i16 instructions are slow.
+ bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
+ /// Return true if the target has native support for the
+ /// specified value type and it is 'desirable' to use the type. e.g. On x86
+ /// i16 is legal, but undesirable since i16 instruction encodings are longer
+ /// and some i16 instructions are slow.
+ bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+
+ /// Return true if the MachineFunction contains a COPY which would imply
+ /// HasOpaqueSPAdjustment.
+ bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ /// This method returns the name of a target specific DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ bool isCheapToSpeculateCttz() const override;
+
+ bool isCheapToSpeculateCtlz() const override;
+
+ bool isCtlzFast() const override;
+
+ bool hasBitPreservingFPLogic(EVT VT) const override {
+ return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
+ }
+
+ bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
+ // If the pair to store is a mixture of float and int values, we will
+ // save two bitwise instructions and one float-to-int instruction and
+ // add one store instruction. There is potentially a more
+ // significant benefit because it avoids the float->int domain switch
+ // for the input value. So it is more likely a win.
+ if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
+ (LTy.isInteger() && HTy.isFloatingPoint()))
+ return true;
+ // If the pair only contains int values, we will save two bitwise
+ // instructions and add one store instruction (costing one more
+ // store buffer). Since the benefit is less clear, we leave
+ // such pairs out until we get a testcase to prove it is a win.
+ return false;
+ }
+
+ bool hasAndNotCompare(SDValue Y) const override;
+
+ /// Return the value type to use for ISD::SETCC.
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ /// Determine which bits of Op are known to be either
+ /// zero or one and return them in the KnownZero/KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ /// Determine the number of bits in the operation that are sign bits.
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
+ bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
+ int64_t &Offset) const override;
+
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+ bool ExpandInlineAsm(CallInst *CI) const override;
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
+
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
+
+ /// Lower the specified operand into the Ops vector. If it is invalid, don't
+ /// add anything to Ops. NOTE(review): older text described a 'hasMemory'
+ /// flag (an 'm' constraint present); it is not a parameter here.
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "i")
+ return InlineAsm::Constraint_i;
+ else if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
+ else if (ConstraintCode == "v")
+ return InlineAsm::Constraint_v;
+ else if (ConstraintCode == "X")
+ return InlineAsm::Constraint_X;
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ /// Given a physical register constraint
+ /// (e.g. {edx}), return the register number and the register class for the
+ /// register. This should only be used for C_Register constraints. On
+ /// error, this returns a register number of 0.
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ /// Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
+
+ /// Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+
+ /// Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can
+ /// add a register and the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalAddImmediate(int64_t Imm) const override;
+
+ /// \brief Return the cost of the scaling factor used in the addressing
+ /// mode represented by AM for this target, for a load/store
+ /// of the specified type.
+ /// If the AM is supported, the return value must be >= 0.
+ /// If the AM is not supported, it returns a negative value.
+ int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ bool isVectorShiftByScalarCheap(Type *Ty) const override;
+
+ /// Return true if it's free to truncate a value of
+ /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
+ /// register EAX to i16 by referencing its sub-register AX.
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+ /// Return true if any actual instruction that defines a
+ /// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
+ /// register. This does not necessarily include registers defined in
+ /// unknown ways, such as incoming arguments, or copies from unknown
+ /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+ /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+ /// all instructions that define 32-bit values implicitly zero-extend the
+ /// result out to 64 bits.
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+ /// extend node) is profitable.
+ bool isVectorLoadExtDesirable(SDValue) const override;
+
+ /// Return true if an FMA operation is faster than a pair of fmul and fadd
+ /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+ /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+ /// Return true if it's profitable to narrow
+ /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+ /// from i32 to i8 but not from i32 to i16.
+ bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+ /// Given an intrinsic, checks if on the target the intrinsic will need to map
+ /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
+ /// true and stores the intrinsic information into the IntrinsicInfo that was
+ /// passed to the function.
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ unsigned Intrinsic) const override;
+
+ /// Returns true if the target can instruction select the
+ /// specified FP immediate natively. If false, the legalizer will
+ /// materialize the FP immediate as a load from a constant pool.
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ /// Targets can use this to indicate that they only support *some*
+ /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
+ /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
+ /// be legal.
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const override;
+
+ /// Similar to isShuffleMaskLegal. Targets can use this to
+ /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to
+ /// replace a VAND with a constant pool entry.
+ bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const override;
+
+ /// If true, then instruction selection should
+ /// seek to shrink the FP constant of the specified type to a smaller type
+ /// in order to save space and / or reduce runtime.
+ bool ShouldShrinkFPConstant(EVT VT) const override {
+ // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+ // expensive than a straight movsd. On the other hand, it's important to
+ // shrink long double fp constant since fldt is very slow.
+ return !X86ScalarSSEf64 || VT == MVT::f80;
+ }
+
+ /// Return true if we believe it is correct and profitable to reduce the
+ /// load node to a smaller type.
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+ EVT NewVT) const override;
+
+ /// Return true if the specified scalar FP type is computed in an SSE
+ /// register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
+ return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
+ }
+
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+ /// with this index.
+ bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
+ /// Intel processors have a unified instruction and data cache
+ const char * getClearCacheBuiltinName() const override {
+ return nullptr; // nothing to do, move along.
+ }
+
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+ virtual bool needsFixedCatchObjects() const override;
+
+ /// This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
+
+ /// If the target has a standard location for the stack protector cookie,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+
+ bool useLoadStackGuardNode() const override;
+ void insertSSPDeclarations(Module &M) const override;
+ Value *getSDagStackGuard(const Module &M) const override;
+ Value *getSSPStackGuardCheck(const Module &M) const override;
+
+ /// For targets that store the SafeStack pointer at a fixed offset in some
+ /// non-standard address space, return that location as a Value.
+ /// NOTE(review): older text said "Return true"; the result is a Value *.
+ Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
+ SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+ SelectionDAG &DAG) const;
+
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+ /// \brief Customize the preferred legalization strategy for certain types.
+ LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+
+ bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
+ bool supportSwiftError() const override;
+
+ unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+ /// \brief Lower interleaved load(s) into target specific
+ /// instructions/intrinsics.
+ bool lowerInterleavedLoad(LoadInst *LI,
+ ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices,
+ unsigned Factor) const override;
+ protected:
+ std::pair<const TargetRegisterClass *, uint8_t>
+ findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const override;
+
+ private:
+ /// Keep a reference to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget &Subtarget;
+
+ /// Select between SSE or x87 floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf32;
+ bool X86ScalarSSEf64;
+
+ /// A list of legal FP immediates.
+ std::vector<APFloat> LegalFPImmediates;
+
+ /// Indicate that this x86 target can instruction
+ /// select the specified FP immediate natively.
+ void addLegalFPImmediate(const APFloat& Imm) {
+ LegalFPImmediates.push_back(Imm);
+ }
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo &MFI,
+ unsigned i) const;
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const;
+
+ // Call lowering helpers.
+
+ /// Check whether the call is eligible for tail call optimization. Targets
+ /// that want to do tail call optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ bool isCalleeStructRet,
+ bool isCallerStructRet,
+ Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
+ SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+ SDValue Chain, bool IsTailCall,
+ bool Is64Bit, int FPDiff,
+ const SDLoc &dl) const;
+
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG &DAG) const;
+
+ unsigned getAddressSpace(void) const;
+
+ std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool isSigned,
+ bool isReplace) const;
+
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+
+ unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
+ int64_t Offset, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) const;
+ SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ bool supportSplitCSR(MachineFunction *MF) const override {
+ return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+ }
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+
+ EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const override;
+
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+ LoadInst *
+ lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+
+ bool needsCmpXchgNb(Type *MemType) const;
+
+ void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB, int FI) const;
+
+ // Utility function to emit the low-level va_arg code for X86-64.
+ MachineBasicBlock *
+ EmitVAARG64WithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Utility function to emit the xmm reg save portion of va_start.
+ MachineBasicBlock *
+ EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Emit nodes that will be selected as "test Op0,Op0", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitTest(SDValue Op0, unsigned X86CC, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+
+ /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+
+ /// Convert a comparison if required by the subtarget.
+ SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+
+ /// Check if replacement of SQRT with RSQRT should be disabled.
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
+ /// Use rsqrt* to speed up sqrt calculations.
+ SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps, bool &UseOneConstNR,
+ bool Reciprocal) const override;
+
+ /// Use rcp* to speed up fdiv calculations.
+ SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps) const override;
+
+ /// Reassociate floating point divisions into multiply by reciprocal.
+ unsigned combineRepeatedFPDivisors() const override;
+ };
+
+ namespace X86 {
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+ } // end namespace X86
+
+ // Base class for all X86 non-masked store operations.
+ class X86StoreSDNode : public MemSDNode {
+ public:
+ X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
+ const SDValue &getValue() const { return getOperand(1); }
+ const SDValue &getBasePtr() const { return getOperand(2); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VTRUNCSTORES ||
+ N->getOpcode() == X86ISD::VTRUNCSTOREUS;
+ }
+ };
+
+ // Base class for all X86 masked store operations.
+ // The class has the same order of operands as MaskedStoreSDNode for
+ // convenience.
+ class X86MaskedStoreSDNode : public MemSDNode {
+ public:
+ X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
+
+ const SDValue &getBasePtr() const { return getOperand(1); }
+ const SDValue &getMask() const { return getOperand(2); }
+ const SDValue &getValue() const { return getOperand(3); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
+ N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
+ }
+ };
+
+ // X86 Truncating Store with Signed saturation.
+ class TruncSStoreSDNode : public X86StoreSDNode {
+ public:
+ TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+ : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VTRUNCSTORES;
+ }
+ };
+
+ // X86 Truncating Store with Unsigned saturation.
+ class TruncUSStoreSDNode : public X86StoreSDNode {
+ public:
+ TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+ : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
+ }
+ };
+
+ // X86 Truncating Masked Store with Signed saturation.
+ class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
+ public:
+ MaskedTruncSStoreSDNode(unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VMTRUNCSTORES;
+ }
+ };
+
+ // X86 Truncating Masked Store with Unsigned saturation.
+ class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
+ public:
+ MaskedTruncUSStoreSDNode(unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
+ }
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
new file mode 100644
index 000000000000..ba1aede3c1a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -0,0 +1,103 @@
+//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the 3DNow! instruction set, which extends MMX to support
+// floating point and also adds a few more random instructions for good measure.
+//
+//===----------------------------------------------------------------------===//
+
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+ : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
+}
+
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+ : I3DNow<o, F, (outs VR64:$dst), ins,
+ !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+ Has3DNow0F0FOpcode {
+ // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+ let isAsmParserOnly = 1;
+ let Constraints = "$src1 = $dst";
+}
+
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+ : I3DNow<o, F, (outs VR64:$dst), ins,
+ !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+ Has3DNow0F0FOpcode {
+ // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+ let isAsmParserOnly = 1;
+}
+
+multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
+ def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
+ def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
+}
+
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+ def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+ def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+}
+
+// Pattern-less register (rr) and memory (rm) forms of a 3DNow! conversion.
+// The source operand must be named $src: the assembly string built by
+// I3DNow_conv refers to "$src", exactly as I3DNow_conv_rm_int does.
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
+ def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, []>;
+ def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, []>;
+}
+
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+ def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+ def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn))
+ (bitconvert (load_mmx addr:$src))))]>;
+}
+
+defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+
+
+def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
+ [(int_x86_mmx_femms)]>;
+
+def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
+ "prefetch\t$addr",
+ [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>;
+
+def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
+ [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB,
+ Requires<[HasPrefetchW]>;
+
+// "3DNowA" instructions
+defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
+defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
+defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
+defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
new file mode 100644
index 000000000000..da7437ea0ccb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -0,0 +1,9181 @@
+//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 AVX512 instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+// Group template arguments that can be derived from the vector type (EltNum x
+// EltVT). These are things like the register class for the writemask, etc.
+// The idea is to pass one of these as the template argument rather than the
+// individual arguments.
+// The template is also used for scalar types, in this case numelts is 1.
+class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
+ string suffix = ""> {
+ RegisterClass RC = rc;
+ ValueType EltVT = eltvt;
+ int NumElts = numelts;
+
+ // Corresponding mask register class.
+ RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts);
+
+ // Corresponding write-mask register class.
+ RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
+
+ // The mask VT.
+ ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
+ "v" # NumElts # "i1"));
+
+ // The GPR register class that can hold the write mask. Use GR8 for fewer
+ // than 8 elements. Use shift-right and equal to work around the lack of
+ // !lt in tablegen.
+ RegisterClass MRC =
+ !cast<RegisterClass>("GR" #
+ !if (!eq (!srl(NumElts, 3), 0), 8, NumElts));
+
+ // Suffix used in the instruction mnemonic.
+ string Suffix = suffix;
+
+ // VTName is a string name for vector VT. For vector types it will be
+ // v # NumElts # EltVT, so for vector of 8 elements of i32 it will be v8i32
+ // It is a little bit complex for scalar types, where NumElts = 1.
+ // In this case we build v4f32 or v2f64
+ string VTName = "v" # !if (!eq (NumElts, 1),
+ !if (!eq (EltVT.Size, 32), 4,
+ !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT;
+
+ // The vector VT.
+ ValueType VT = !cast<ValueType>(VTName);
+
+ string EltTypeName = !cast<string>(EltVT);
+ // Size of the element type in bits, e.g. 32 for v16i32.
+ string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName));
+ int EltSize = EltVT.Size;
+
+ // "i" for integer types and "f" for floating-point types
+ string TypeVariantName = !subst(EltSizeName, "", EltTypeName);
+
+ // Size of RC in bits, e.g. 512 for VR512.
+ int Size = VT.Size;
+
+ // The corresponding memory operand, e.g. i512mem for VR512.
+ X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
+ X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
+
+ // Load patterns
+ // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
+ // due to load promotion during legalization
+ PatFrag LdFrag = !cast<PatFrag>("load" #
+ !if (!eq (TypeVariantName, "i"),
+ !if (!eq (Size, 128), "v2i64",
+ !if (!eq (Size, 256), "v4i64",
+ !if (!eq (Size, 512), "v8i64",
+ VTName))), VTName));
+
+ PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" #
+ !if (!eq (TypeVariantName, "i"),
+ !if (!eq (Size, 128), "v2i64",
+ !if (!eq (Size, 256), "v4i64",
+ !if (!eq (Size, 512), "v8i64",
+ VTName))), VTName));
+
+ PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
+
+ // The corresponding float type, e.g. v16f32 for v16i32
+ // Note: For EltSize < 32, FloatVT is illegal and TableGen
+ // fails to compile, so we choose FloatVT = VT
+ ValueType FloatVT = !cast<ValueType>(
+ !if (!eq (!srl(EltSize,5),0),
+ VTName,
+ !if (!eq(TypeVariantName, "i"),
+ "v" # NumElts # "f" # EltSize,
+ VTName)));
+
+ ValueType IntVT = !cast<ValueType>(
+ !if (!eq (!srl(EltSize,5),0),
+ VTName,
+ !if (!eq(TypeVariantName, "f"),
+ "v" # NumElts # "i" # EltSize,
+ VTName)));
+ // The string to specify embedded broadcast in assembly.
+ string BroadcastStr = "{1to" # NumElts # "}";
+
+ // 8-bit compressed displacement tuple/subvector format. This is only
+ // defined for NumElts <= 8.
+ CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0),
+ !cast<CD8VForm>("CD8VT" # NumElts), ?);
+
+ SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm,
+ !if (!eq (Size, 256), sub_ymm, ?));
+
+ Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle,
+ !if (!eq (EltTypeName, "f64"), SSEPackedDouble,
+ SSEPackedInt));
+
+ RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
+
+ // A vector type of the same width with element type i64. This is used to
+ // create patterns for logic ops.
+ ValueType i64VT = !cast<ValueType>("v" # !srl(Size, 6) # "i64");
+
+ // A vector type of the same width with element type i32. This is used to
+ // create the canonical constant zero node ImmAllZerosV.
+ ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
+ dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV)));
+
+ string ZSuffix = !if (!eq (Size, 128), "Z128",
+ !if (!eq (Size, 256), "Z256", "Z"));
+}
+
+def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">;
+def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">;
+def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">;
+def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">;
+def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">;
+def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">;
+
+// "x" in v32i8x_info means RC = VR256X
+def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">;
+def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">;
+def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">;
+def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">;
+def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">;
+def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">;
+
+def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">;
+def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">;
+def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">;
+def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">;
+def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">;
+def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
+
+// We map scalar types to the smallest (128-bit) vector type
+// with the appropriate element type. This allows using the same masking logic.
+def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">;
+def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">;
+def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">;
+def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">;
+
+class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256,
+ X86VectorVTInfo i128> {
+ X86VectorVTInfo info512 = i512;
+ X86VectorVTInfo info256 = i256;
+ X86VectorVTInfo info128 = i128;
+}
+
+def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info,
+ v16i8x_info>;
+def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info,
+ v8i16x_info>;
+def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info,
+ v4i32x_info>;
+def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info,
+ v2i64x_info>;
+def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
+ v4f32x_info>;
+def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
+ v2f64x_info>;
+
+// This multiclass generates the masking variants from the non-masking
+// variant. It only provides the assembly pieces for the masking variants.
+// It assumes custom ISel patterns for masking which can be provided as
+// template arguments.
+multiclass AVX512_maskable_custom<bits<8> O, Format F,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern,
+ list<dag> MaskingPattern,
+ list<dag> ZeroMaskingPattern,
+ string MaskingConstraint = "",
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0> {
+ let isCommutable = IsCommutable in
+ def NAME: AVX512<O, F, Outs, Ins,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+ "$dst, "#IntelSrcAsm#"}",
+ Pattern, itin>;
+
+ // Prefer over VMOV*rrk Pat<>
+ let AddedComplexity = 20, isCommutable = IsKCommutable in
+ def NAME#k: AVX512<O, F, Outs, MaskingIns,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+ "$dst {${mask}}, "#IntelSrcAsm#"}",
+ MaskingPattern, itin>,
+ EVEX_K {
+ // In case of the 3src subclass this is overridden with a let.
+ string Constraints = MaskingConstraint;
+ }
+
+ // Zero mask does not add any restrictions to commute operands transformation.
+ // So, it is Ok to use IsCommutable instead of IsKCommutable.
+ let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
+ def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
+ ZeroMaskingPattern,
+ itin>,
+ EVEX_KZ;
+}
+
+
+// Common base class of AVX512_maskable and AVX512_maskable_3src.
+// Builds the three ISel patterns handed to AVX512_maskable_custom:
+//   - unmasked:     (set $dst, RHS)
+//   - merge-masked: (set $dst, MaskingRHS)
+//   - zero-masked:  (set $dst, (Select $mask, RHS, all-zeros))
+// 'itin' is forwarded so that the itinerary requested by the caller is
+// attached to all three generated instruction variants.
+multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS,
+ SDNode Select = vselect,
+ string MaskingConstraint = "",
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0> :
+ AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst, MaskingRHS)],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
+ MaskingConstraint, itin, IsCommutable,
+ IsKCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction. In the masking case, the
+// preserved vector elements come from a new dummy input operand tied to $dst.
+multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS,
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0, bit IsKCommutable = 0,
+ SDNode Select = vselect> :
+ AVX512_maskable_common<O, F, _, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (Select _.KRCWM:$mask, RHS, _.RC:$src0), Select,
+ "$src0 = $dst", itin, IsCommutable, IsKCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the scalar instruction.
+multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS,
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0> :
+ AVX512_maskable_common<O, F, _, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (X86selects _.KRCWM:$mask, RHS, _.RC:$src0),
+ X86selects, "$src0 = $dst", itin, IsCommutable>;
+
+// Similar to AVX512_maskable but in this case one of the source operands
+// ($src1) is already tied to $dst so we just use that for the preserved
+// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude
+// $src1.
+multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, bit IsCommutable = 0,
+ bit IsKCommutable = 0> :
+ AVX512_maskable_common<O, F, _, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (vselect _.KRCWM:$mask, RHS, _.RC:$src1),
+ vselect, "", NoItinerary, IsCommutable, IsKCommutable>;
+
+multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, bit IsCommutable = 0,
+ bit IsKCommutable = 0> :
+ AVX512_maskable_common<O, F, _, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (X86selects _.KRCWM:$mask, RHS, _.RC:$src1),
+ X86selects, "", NoItinerary, IsCommutable,
+ IsKCommutable>;
+
+multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern> :
+ AVX512_maskable_custom<O, F, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
+ "$src0 = $dst">;
+
+
+// Instruction with mask that puts result in mask register,
+// like "compare" and "vptest"
+multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
+ dag Outs,
+ dag Ins, dag MaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern,
+ list<dag> MaskingPattern,
+ bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
+ def NAME: AVX512<O, F, Outs, Ins,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+ "$dst, "#IntelSrcAsm#"}",
+ Pattern, NoItinerary>;
+
+ def NAME#k: AVX512<O, F, Outs, MaskingIns,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+ "$dst {${mask}}, "#IntelSrcAsm#"}",
+ MaskingPattern, NoItinerary>, EVEX_K;
+}
+
+multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS,
+ bit IsCommutable = 0> :
+ AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [(set _.KRC:$dst, RHS)],
+ [(set _.KRC:$dst, MaskingRHS)], IsCommutable>;
+
+multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, bit IsCommutable = 0> :
+ AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (and _.KRCWM:$mask, RHS), IsCommutable>;
+
+multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm> :
+ AVX512_maskable_custom_cmp<O, F, Outs,
+ Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
+ AttSrcAsm, IntelSrcAsm, [],[]>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction. In the masking case, the
+// preserved vector elements come from a new dummy input operand tied to $dst.
+multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskedRHS,
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0, SDNode Select = vselect> :
+ AVX512_maskable_custom<O, F, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, MaskedRHS,
+ _.ImmAllZerosV))],
+ "$src0 = $dst", itin, IsCommutable>;
+
+// Bitcasts between 512-bit vector types. Return the original type since
+// no instruction is needed for the conversion.
+// Each of the six 512-bit types is listed once per distinct source type.
+def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;
+
+// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+ [(set VR512:$dst, (v16i32 immAllZerosV))]>;
+def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+ [(set VR512:$dst, (v16i32 immAllOnesV))]>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
+def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
+ [(set VR128X:$dst, (v4i32 immAllZerosV))]>;
+def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
+ [(set VR256X:$dst, (v8i32 immAllZerosV))]>;
+}
+
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in {
+ def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
+ [(set FR32X:$dst, fp32imm0)]>;
+ def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
+ [(set FR64X:$dst, fpimm0)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VECTOR INSERT
+//
+multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
+ PatFrag vinsert_insert> {
+ let ExeDomain = To.ExeDomain in {
+ defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
+ (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3),
+ "vinsert" # From.EltTypeName # "x" # From.NumElts,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (vinsert_insert:$src3 (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))>, AVX512AIi8Base, EVEX_4V;
+
+ defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
+ (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3),
+ "vinsert" # From.EltTypeName # "x" # From.NumElts,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (vinsert_insert:$src3 (To.VT To.RC:$src1),
+ (From.VT (bitconvert (From.LdFrag addr:$src2))),
+ (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<From.EltSize, From.CD8TupleForm>;
+ }
+}
+
+multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, PatFrag vinsert_insert,
+ SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
+ let Predicates = p in {
+ def : Pat<(vinsert_insert:$ins
+ (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rr")
+ To.RC:$src1, From.RC:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins)))>;
+
+ def : Pat<(vinsert_insert:$ins
+ (To.VT To.RC:$src1),
+ (From.VT (bitconvert (From.LdFrag addr:$src2))),
+ (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rm")
+ To.RC:$src1, addr:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins)))>;
+ }
+}
+
+multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
+ ValueType EltVT64, int Opcode256> {
+
+ let Predicates = [HasVLX] in
+ defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ vinsert128_insert>, EVEX_V256;
+
+ defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ vinsert128_insert>, EVEX_V512;
+
+ defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ vinsert256_insert>, VEX_W, EVEX_V512;
+
+ let Predicates = [HasVLX, HasDQI] in
+ defm NAME # "64x2Z256" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ vinsert128_insert>, VEX_W, EVEX_V256;
+
+ let Predicates = [HasDQI] in {
+ defm NAME # "64x2Z" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ vinsert128_insert>, VEX_W, EVEX_V512;
+
+ defm NAME # "32x8Z" : vinsert_for_size<Opcode256,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ vinsert256_insert>, EVEX_V512;
+ }
+}
+
+defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>;
+defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
+
+// Codegen pattern with the alternative types,
+// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+
+// Codegen pattern with the alternative types insert VEC128 into VEC256
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+// Codegen pattern with the alternative types insert VEC128 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+// Codegen pattern with the alternative types insert VEC256 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+
+// vinsertps - insert f32 to XMM
+let ExeDomain = SSEPackedSingle in {
+def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
+ "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ EVEX_4V;
+def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
+ "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insertps VR128X:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VECTOR EXTRACT
+//---
+
+multiclass vextract_for_size<int Opcode,
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ PatFrag vextract_extract,
+ SDNodeXForm EXTRACT_get_vextract_imm> {
+
+ let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+ // Use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
+ // vextract_extract); we are only interested in the unmasked pattern here.
+ // The masked and intrinsic patterns are generated below.
+ defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
+ (ins From.RC:$src1, i32u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts,
+ "$idx, $src1", "$src1, $idx",
+ [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
+ (iPTR imm)))]>,
+ AVX512AIi8Base, EVEX;
+ def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$idx, $src1, $dst|$dst, $src1, $idx}",
+ [(store (To.VT (vextract_extract:$idx
+ (From.VT From.RC:$src1), (iPTR imm))),
+ addr:$dst)]>, EVEX;
+
+ let mayStore = 1, hasSideEffects = 0 in
+ def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, To.KRCWM:$mask,
+ From.RC:$src1, i32u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$idx, $src1, $dst {${mask}}|"
+ "$dst {${mask}}, $src1, $idx}",
+ []>, EVEX_K, EVEX;
+ }
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (vextract_extract:$ext (From.VT From.RC:$src1),
+ (iPTR imm)),
+ To.RC:$src0)),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rrk")
+ To.RC:$src0, To.KRCWM:$mask, From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (vextract_extract:$ext (From.VT From.RC:$src1),
+ (iPTR imm)),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rrkz")
+ To.KRCWM:$mask, From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext))>;
+
+ // Intrinsic call with masking.
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+ "x" # To.NumElts # "_" # From.Size)
+ From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rrk")
+ To.RC:$src0,
+ (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
+ From.RC:$src1, imm:$idx)>;
+
+ // Intrinsic call with zero-masking.
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+ "x" # To.NumElts # "_" # From.Size)
+ From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rrkz")
+ (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
+ From.RC:$src1, imm:$idx)>;
+
+ // Intrinsic call without masking.
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+ "x" # To.NumElts # "_" # From.Size)
+ From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rr")
+ From.RC:$src1, imm:$idx)>;
+}
+
+// Codegen patterns for the alternative types.
+// For an extract of To.VT from From.VT, emits two anonymous patterns guarded by
+// the predicate list `p`:
+//  - a register form that selects the instruction named InstrStr#"rr", and
+//  - a store form that selects the instruction named InstrStr#"mr".
+// In both, the immediate operand is produced by applying
+// EXTRACT_get_vextract_imm to the matched vextract_extract node ($ext).
+multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, PatFrag vextract_extract,
+ SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> {
+ let Predicates = p in {
+ def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rr")
+ From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext)))>;
+ def : Pat<(store (To.VT (vextract_extract:$ext (From.VT From.RC:$src1),
+ (iPTR imm))), addr:$dst),
+ (!cast<Instruction>(InstrStr#"mr") addr:$dst, From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext))>;
+ }
+}
+
+// Instantiates vextract_for_size for every subvector-extract shape of one
+// element family. EltVT32/EltVT64 are the 32- and 64-bit element types.
+// Opcode128 is used for the variants whose destination is VR128X and Opcode256
+// for the variants whose destination is VR256X.
+multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
+ ValueType EltVT64, int Opcode256> {
+ // 512-bit source forms; no Predicates are set at this level for these two.
+ defm NAME # "32x4Z" : vextract_for_size<Opcode128,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ vextract128_extract,
+ EXTRACT_get_vextract128_imm>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+ defm NAME # "64x4Z" : vextract_for_size<Opcode256,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ vextract256_extract,
+ EXTRACT_get_vextract256_imm>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
+ // 256-bit source, 4 x 32-bit destination; guarded by HasVLX.
+ let Predicates = [HasVLX] in
+ defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ vextract128_extract,
+ EXTRACT_get_vextract128_imm>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+ // 256-bit source, 2 x 64-bit destination; guarded by HasVLX and HasDQI.
+ let Predicates = [HasVLX, HasDQI] in
+ defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ vextract128_extract,
+ EXTRACT_get_vextract128_imm>,
+ VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+ // 512-bit source forms 64x2 and 32x8; guarded by HasDQI.
+ let Predicates = [HasDQI] in {
+ defm NAME # "64x2Z" : vextract_for_size<Opcode128,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ vextract128_extract,
+ EXTRACT_get_vextract128_imm>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
+ defm NAME # "32x8Z" : vextract_for_size<Opcode256,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ vextract256_extract,
+ EXTRACT_get_vextract256_imm>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+ }
+}
+
+// Instantiate the floating-point (VEXTRACTF*) and integer (VEXTRACTI*) extract
+// families: opcodes 0x19/0x39 are passed as Opcode128 and 0x1b/0x3b as
+// Opcode256.
+defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>;
+defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
+
+// extract_subvector codegen patterns with the alternative types.
+// Only add these if 64x2 and its friends are not supported natively via
+// AVX512DQ (hence the NoDQI predicate): 128-bit extracts of 64-bit elements
+// reuse the 32x4 instructions, and 256-bit extracts of 32-bit elements reuse
+// the 64x4 instructions.
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+
+// Codegen patterns with the alternative types: extract VEC128 from VEC256.
+// 16-bit and 8-bit element vectors are routed to the integer 32x4 instruction.
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+
+// Codegen patterns with the alternative types: extract VEC128 from VEC512.
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+// Codegen patterns with the alternative types: extract VEC256 from VEC512.
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+
+// A 128-bit subvector extract from position 0 of a 512-bit vector
+// is a subregister copy (sub_xmm) that needs no instruction.
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
+ (v8i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
+ (v16i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_xmm))>;
+
+// A 256-bit subvector extract from position 0 of a 512-bit vector
+// is a subregister copy (sub_ymm) that needs no instruction.
+def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+ (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
+def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+ (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+ (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
+def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+ (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
+def : Pat<(v16i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
+ (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm))>;
+def : Pat<(v32i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
+ (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm))>;
+
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+// A 128-bit subvector insert into position 0 of an undef 512-bit vector
+// is a subregister copy (INSERT_SUBREG into IMPLICIT_DEF at sub_xmm) that
+// needs no instruction.
+def : Pat<(v8i64 (insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v8f64 (insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v16i32 (insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v16f32 (insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v32i16 (insert_subvector undef, (v8i16 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v64i8 (insert_subvector undef, (v16i8 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+
+// A 256-bit subvector insert into position 0 of an undef 512-bit vector
+// is a subregister copy (INSERT_SUBREG into IMPLICIT_DEF at sub_ymm) that
+// needs no instruction.
+def : Pat<(v8i64 (insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v8f64 (insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v16i32 (insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v16f32 (insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v32i16 (insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v64i8 (insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+}
+
+// vextractps - extract 32 bits from XMM
+// Register form: the element of $src1 (a v4f32 viewed as v4i32 via bc_v4i32)
+// selected by the immediate $src2 is written to the GR32 destination.
+def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+ (ins VR128X:$src1, u8imm:$src2),
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+ EVEX;
+
+// Memory form: the same extract, stored to the f32mem operand $dst.
+def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
+ addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>;
+
+//===---------------------------------------------------------------------===//
+// AVX-512 BROADCAST
+//---
+// Broadcast with a scalar argument.
+// Defines three codegen-only (isCodeGenOnly = 1) register forms that match an
+// X86VBroadcast of the scalar register SrcInfo.FRC:$src into DestInfo.RC:$dst:
+//   r_s   - unmasked
+//   rk_s  - merge-masked: vselect between the broadcast and $src0, which is
+//           tied to $dst
+//   rkz_s - zero-masked: vselect between the broadcast and all zeros
+// All three require HasAVX512.
+multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
+
+ let isCodeGenOnly = 1 in {
+ def r_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.FRC:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}",
+ [(set DestInfo.RC:$dst, (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)))]>,
+ Requires<[HasAVX512]>, T8PD, EVEX;
+
+ // The asm string has no space before the '|' variant separator, matching
+ // rkz_s; a space there would become trailing whitespace in the AT&T variant.
+ let Constraints = "$src0 = $dst" in
+ def rk_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
+ (ins DestInfo.RC:$src0, DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
+ OpcodeStr#"\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ [(set DestInfo.RC:$dst,
+ (vselect DestInfo.KRCWM:$mask,
+ (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
+ DestInfo.RC:$src0))]>,
+ Requires<[HasAVX512]>, T8PD, EVEX, EVEX_K;
+
+ def rkz_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
+ (ins DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
+ OpcodeStr#"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ [(set DestInfo.RC:$dst,
+ (vselect DestInfo.KRCWM:$mask,
+ (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
+ DestInfo.ImmAllZerosV))]>,
+ Requires<[HasAVX512]>, T8PD, EVEX, EVEX_KZ;
+ } // let isCodeGenOnly = 1 in
+}
+
+// Broadcast of the low element of a vector register ("r") or of a scalar
+// memory operand ("m") into DestInfo, both built with AVX512_maskable.
+// Three extra patterns fold a scalar load wrapped in scalar_to_vector into the
+// memory forms: unmasked ("m"), merge-masked ("mk") and zero-masked ("mkz").
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
+ let ExeDomain = DestInfo.ExeDomain in {
+ defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
+ (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>,
+ T8PD, EVEX;
+ defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (DestInfo.VT (X86VBroadcast
+ (SrcInfo.ScalarLdFrag addr:$src)))>,
+ T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;
+ }
+
+ // Unmasked broadcast of (scalar_to_vector (load)) uses the memory form.
+ def : Pat<(DestInfo.VT (X86VBroadcast
+ (SrcInfo.VT (scalar_to_vector
+ (SrcInfo.ScalarLdFrag addr:$src))))),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>;
+ // Merge-masked variant. NOTE(review): AddedComplexity is presumably raised so
+ // the folded-load form wins over competing patterns - confirm.
+ let AddedComplexity = 20 in
+ def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+ (X86VBroadcast
+ (SrcInfo.VT (scalar_to_vector
+ (SrcInfo.ScalarLdFrag addr:$src)))),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
+ DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>;
+ // Zero-masked variant; it carries a higher AddedComplexity (30) than the
+ // merge-masked one (20).
+ let AddedComplexity = 30 in
+ def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+ (X86VBroadcast
+ (SrcInfo.VT (scalar_to_vector
+ (SrcInfo.ScalarLdFrag addr:$src)))),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#mkz)
+ DestInfo.KRCWM:$mask, addr:$src)>;
+}
+
+// Scalar-double style broadcast: instantiates the vector/memory forms
+// (avx512_broadcast_rm) and the scalar-register forms
+// (avx512_broadcast_scalar) for 512-bit (Z, HasAVX512) and 256-bit
+// (Z256, HasVLX) destinations. The source is always _.info128. No 128-bit
+// destination form is defined here.
+multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
+ EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
+ EVEX_V256;
+ }
+}
+
+// Scalar-single style broadcast: same structure as avx512_fp_broadcast_sd, but
+// it additionally defines a 128-bit destination form (Z128) under HasVLX.
+multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
+ EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
+ EVEX_V256;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info128, _.info128>,
+ EVEX_V128;
+ }
+}
+// vbroadcastss (opcode 0x18) and vbroadcastsd (opcode 0x19, VEX_W).
+defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss",
+ avx512vl_f32_info>;
+defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
+ avx512vl_f64_info>, VEX_W;
+
+// The 512-bit broadcast-from-memory intrinsics select the memory forms
+// directly.
+def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
+ (VBROADCASTSSZm addr:$src)>;
+def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
+ (VBROADCASTSDZm addr:$src)>;
+
+// Integer broadcast from a register of class SrcRC into the vector type `_`.
+// The mnemonic is "vpbroadcast" followed by _.Suffix; the masked variants come
+// from AVX512_maskable.
+multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
+ RegisterClass SrcRC> {
+ defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins SrcRC:$src),
+ "vpbroadcast"##_.Suffix, "$src", "$src",
+ (_.VT (X86VBroadcast SrcRC:$src))>, T8PD, EVEX;
+}
+
+// Vector-length wrapper around avx512_int_broadcast_reg: the 512-bit form (Z)
+// is guarded by `prd`; the 256-bit (Z256) and 128-bit (Z128) forms
+// additionally require HasVLX.
+multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
+ RegisterClass SrcRC, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256;
+ defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128;
+ }
+}
+
+// Byte/word broadcasts from a register. The GR8/GR16 source forms are
+// codegen-only; the *_Alt forms take a GR32 source and are asm-parser-only.
+let isCodeGenOnly = 1 in {
+defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR8,
+ HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR16,
+ HasBWI>;
+}
+let isAsmParserOnly = 1 in {
+ defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
+ GR32, HasBWI>;
+ defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
+ GR32, HasBWI>;
+}
+// Dword/qword broadcasts share opcode 0x7C; the qword form adds VEX_W.
+defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32,
+ HasAVX512>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
+ HasAVX512>, VEX_W;
+
+// Zero-extending a mask to a vector is selected as a zero-masked broadcast of
+// the constant 1 under that mask.
+def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
+ (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
+def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
+ (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
+
+// Provide patterns for a broadcast whose source is a SrcInfo-typed vector
+// register: the pattern does the extract itself by taking the sub_xmm
+// subregister of the source and feeding it to the register form
+// NAME#DestInfo.ZSuffix#"r".
+multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo> {
+ def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r")
+ (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>;
+}
+
+// Integer broadcast from the low element of an XMM register or from memory,
+// for all three vector lengths. The 512-bit form is guarded by `prd`; the
+// 256- and 128-bit forms additionally require HasVLX. The lowering patterns
+// cover broadcasts whose source is a 256-bit or 512-bit register.
+multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ avx512_int_broadcast_rm_lowering<_.info512, _.info256>,
+ EVEX_V512;
+ // Defined separately to avoid redefinition.
+ defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ avx512_int_broadcast_rm_lowering<_.info256, _.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
+ EVEX_V128;
+ }
+}
+
+// Integer element broadcasts from XMM/memory. Byte and word forms require
+// HasBWI; dword and qword forms require HasAVX512. The qword form adds VEX_W.
+defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb",
+ avx512vl_i8_info, HasBWI>;
+defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
+ avx512vl_i16_info, HasBWI>;
+defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
+ avx512vl_i32_info, HasAVX512>;
+defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
+ avx512vl_i64_info, HasAVX512>, VEX_W;
+
+// Subvector broadcast from memory: loads a _Src-typed vector (the load is
+// bitconverted to _Src.VT) and matches X86SubVBroadcast of it into _Dst.
+// Only the memory form ("rm") is defined; masked variants come from
+// AVX512_maskable.
+multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+ (_Dst.VT (X86SubVBroadcast
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+ AVX5128IBase, EVEX;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+ // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+ // This means we'll encounter truncated i32 loads; match that here.
+ // Both a plain i32 load and a zextloadi16, each truncated to i16, are folded
+ // into the word-broadcast memory forms.
+ def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWZ128m addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWZ256m addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWZ128m addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWZ256m addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST SUBVECTORS
+//
+
+// 512-bit destination subvector broadcasts from memory: 32x4 broadcasts a
+// 128-bit source and 64x4 a 256-bit source. The 64x4 forms add VEX_W.
+defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+ v16i32_info, v4i32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
+ v16f32_info, v4f32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
+ v8i64_info, v4i64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT4>;
+defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
+ v8f64_info, v4f64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT4>;
+
+let Predicates = [HasAVX512] in {
+// 16-bit and 8-bit element subvector broadcasts of a 256-bit load reuse the
+// 64x4 integer broadcast.
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
+ (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
+ (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+// The 256-bit register is placed in the low half (INSERT_SUBREG at sub_ymm) and
+// then inserted again with immediate 1.
+// NOTE(review): the v16f32 and v16i32 source patterns below also appear under
+// [HasAVX512, NoDQI] and [HasDQI] later in this file - confirm the overlap is
+// intended.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+ (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8f32 VR256X:$src), 1)>;
+def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
+ (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v4f64 VR256X:$src), 1)>;
+def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v4i64 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8i32 VR256X:$src), 1)>;
+def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v16i16 VR256X:$src), 1)>;
+def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v32i8 VR256X:$src), 1)>;
+
+// 16-bit and 8-bit element subvector broadcasts of a 128-bit load reuse the
+// 32x4 integer broadcast.
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTI32X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+// A 128-bit register is broadcast in two steps: a 32x4 insert of $src with
+// immediate 1 on top of $src in sub_xmm, then a 64x4 insert (immediate 1) of
+// the sub_ymm subregister of that same intermediate value.
+def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VINSERTF64x4Zrr
+ (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VINSERTI64x4Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+
+def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+ (VINSERTI64x4Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+ (VINSERTI64x4Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+}
+
+// 256-bit destination 32x4 subvector broadcasts and their patterns (HasVLX).
+let Predicates = [HasVLX] in {
+defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+ v8i32x_info, v4i32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
+ v8f32x_info, v4f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+
+// 16-bit and 8-bit element broadcasts of a 128-bit load reuse the integer
+// 32x4 form.
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+// The 128-bit register is placed at sub_xmm and inserted again with
+// immediate 1.
+def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+ (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v4f32 VR128X:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+ (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v4i32 VR128X:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+ (VINSERTI32x4Z256rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v8i16 VR128X:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+ (VINSERTI32x4Z256rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v16i8 VR128X:$src), 1)>;
+}
+
+// 64x2 subvector broadcasts into a 256-bit destination (HasVLX and HasDQI).
+// NOTE(review): despite the Z128 suffix in the record names, these are
+// EVEX_V256 forms whose destination info is v4i64x_info / v4f64x_info.
+let Predicates = [HasVLX, HasDQI] in {
+defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+ v4i64x_info, v2i64x_info>, VEX_W,
+ EVEX_V256, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+ v4f64x_info, v2f64x_info>, VEX_W,
+ EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VINSERTF64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2f64 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2i64 VR128X:$src), 1)>;
+}
+
+// With VLX but without DQI, 64-bit element 128-bit subvector broadcasts into a
+// 256-bit destination are selected to the 32x4 instructions.
+let Predicates = [HasVLX, NoDQI] in {
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2f64 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2i64 VR128X:$src), 1)>;
+}
+
+// Without DQI, 512-bit subvector broadcasts of the "other" element width are
+// selected to the 32x4 and 64x4 instructions.
+let Predicates = [HasAVX512, NoDQI] in {
+def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+ (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTI32X4rm addr:$src)>;
+
+// Register source, 128-bit to 512-bit: a 32x4 insert with immediate 1 followed
+// by a 64x4 insert (immediate 1) of the sub_ymm subregister of that result.
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+ (VINSERTF64x4Zrr
+ (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+ (VINSERTI64x4Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+
+// 32-bit element broadcasts of a 256-bit load reuse the 64x4 instructions.
+def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+ (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+ (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+ (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8i32 VR256X:$src), 1)>;
+}
+
+// DQI-only 512-bit subvector broadcasts (64x2 and 32x8) and their register
+// fallbacks, which use the 32x8 insert instructions.
+let Predicates = [HasDQI] in {
+defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+ v8i64_info, v2i64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8",
+ v16i32_info, v8i32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+ v8f64_info, v2f64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
+ v16f32_info, v8f32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+ (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+ (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8i32 VR256X:$src), 1)>;
+
+// Register source, 128-bit to 512-bit: a 32x4 insert with immediate 1 followed
+// by a 32x8 insert (immediate 1) of the sub_ymm subregister of that result.
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+ (VINSERTF32x8Zrr
+ (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+ (VINSERTI32x8Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+}
+
+// 32x2 broadcast forms shared by the integer and floating-point variants:
+// a 512-bit destination (Z, HasDQI) and a 256-bit destination (Z256, HasDQI
+// and HasVLX). The source is always _Src.info128.
+multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
+ let Predicates = [HasDQI] in
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info512, _Src.info128>,
+ EVEX_V512;
+ let Predicates = [HasDQI, HasVLX] in
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info256, _Src.info128>,
+ EVEX_V256;
+}
+
+// Extends avx512_common_broadcast_32x2 with a 128-bit destination form (Z128),
+// guarded by HasDQI and HasVLX.
+multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> :
+ avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {
+
+ let Predicates = [HasDQI, HasVLX] in
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info128, _Src.info128>,
+ EVEX_V128;
+}
+
+// The integer variant uses the i32x2 multiclass (which adds Z128); the
+// floating-point variant uses the common multiclass (Z and Z256 only).
+defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
+ avx512vl_i32_info, avx512vl_i64_info>;
+defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
+ avx512vl_f32_info, avx512vl_f64_info>;
+
+// Broadcasts whose source is a 512-bit or 256-bit register are selected to the
+// 512-bit register forms, fed with the sub_xmm subregister of the source.
+def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
+ (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
+ (VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
+
+def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
+ (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
+ (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST MASK TO VECTOR REGISTER
+//---
+// Register form ("rr") that matches X86VBroadcastm of a mask register of class
+// KRC into the vector type `_`.
+multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, RegisterClass KRC> {
+ def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX;
+}
+
+// Vector-length wrapper around avx512_mask_broadcastm: the 512-bit form (Z)
+// requires HasCDI; the 256- and 128-bit forms additionally require HasVLX.
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> {
+ let Predicates = [HasCDI] in
+ defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512;
+ let Predicates = [HasCDI, HasVLX] in {
+ defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256;
+ defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128;
+ }
+}
+
+// Mask-to-vector broadcasts: a VK16 mask into i32 element vectors, and a VK8
+// mask into i64 element vectors (the latter adds VEX_W).
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d",
+ avx512vl_i32_info, VK16>;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
+ avx512vl_i64_info, VK8>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// -- VPERMI2 - 3 source operands form --
+// VPERMI2 register ("rr") and full-vector memory ("rm") forms, built with
+// AVX512_maskable_3src. $src1 is tied to $dst through the Constraints string;
+// in the memory form $src3 is a load bitconverted to _.VT.
+multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ // The index operand in the pattern should really be an integer type. However,
+ // if we do that and it happens to come from a bitcast, then it becomes
+ // difficult to find the bitcast needed to convert the index to the
+ // destination type for the passthru since it will be folded with the bitcast
+ // of the index operand.
+ defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V,
+ AVX5128IBase;
+
+ defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
+ (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
+ EVEX_4V, AVX5128IBase;
+ }
+}
+// VPERMI2 broadcast-memory form ("rmb"): $src3 is a scalar memory operand that
+// is broadcast (X86VBroadcast of a scalar load). The asm operand is suffixed
+// with _.BroadcastStr and the record is tagged EVEX_B. $src1 is tied to $dst.
+multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (_.VT (X86VPermi2X _.RC:$src1,
+ _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
+ 1>, AVX5128IBase, EVEX_4V, EVEX_B;
+}
+
+// Instantiates avx512_perm_i plus the broadcast form avx512_perm_i_mb for all
+// three vector widths of VTInfo. The 512-bit records use NAME unchanged and
+// get no Predicates override here; the 128/256-bit records are named
+// NAME#128 / NAME#256 and are placed under [HasVLX].
+multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ }
+}
+
+// Like avx512_perm_i_sizes but without the broadcast (rmb) form and with an
+// explicit feature predicate: the 512-bit records require Prd, the 128/256-bit
+// records require Prd and HasVLX.
+multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ Predicate Prd> {
+ let Predicates = [Prd] in
+ defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+ let Predicates = [Prd, HasVLX] in {
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ }
+}
+
+// VPERMI2 instantiations. Dword/qword and PS/PD variants use
+// avx512_perm_i_sizes (which includes broadcast forms); word and byte variants
+// use avx512_perm_i_sizes_bw gated on HasBWI and HasVBMI respectively. The
+// 64-bit and word element variants additionally set VEX_W.
+defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d",
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q",
+ avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w",
+ avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b",
+ avx512vl_i8_info, HasVBMI>,
+ EVEX_CD8<8, CD8VF>;
+defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps",
+ avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd",
+ avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// VPERMT2
+// Register (rr) and full-vector memory (rm) forms of a VPERMT2-style
+// instruction. `_` describes the data vectors and IdxVT the register class of
+// the $src2 operand, which may differ from `_` (the FP instantiations pass an
+// integer info). $src1 is tied to $dst; both forms select
+// X86VPermt2($src1, $src2, $src3).
+multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins IdxVT.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
+ EVEX_4V, AVX5128IBase;
+
+ defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins IdxVT.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
+ (bitconvert (_.LdFrag addr:$src3)))), 1>,
+ EVEX_4V, AVX5128IBase;
+ }
+}
+// Broadcast-memory (rmb) form of a VPERMT2-style instruction: $src3 is a
+// scalar memory operand whose load is wrapped in X86VBroadcast, the assembly
+// operand gets _.BroadcastStr appended, and EVEX_B is set. $src1 is tied to
+// $dst and $src2 uses IdxVT's register class, as in avx512_perm_t.
+multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (_.VT (X86VPermt2 _.RC:$src1,
+ IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
+ 1>, AVX5128IBase, EVEX_4V, EVEX_B;
+}
+
+// Instantiates avx512_perm_t plus the broadcast form avx512_perm_t_mb for all
+// three vector widths. VTInfo supplies the data vector infos and ShuffleMask
+// the matching infos for the $src2 operand. The 512-bit records get no
+// Predicates override here; the 128/256-bit records are placed under [HasVLX].
+multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo ShuffleMask> {
+ defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>,
+ avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>,
+ avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>,
+ avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>, EVEX_V256;
+ }
+}
+
+// Like avx512_perm_t_sizes but without the broadcast (rmb) form and with an
+// explicit feature predicate: the 512-bit records require Prd, the 128/256-bit
+// records require Prd and HasVLX. Idx supplies the infos for the $src2
+// operand.
+multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx,
+ Predicate Prd> {
+ let Predicates = [Prd] in
+ defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+ Idx.info512>, EVEX_V512;
+ let Predicates = [Prd, HasVLX] in {
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+ Idx.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+ Idx.info256>, EVEX_V256;
+ }
+}
+
+// VPERMT2 instantiations. The integer variants pass the same info for data and
+// $src2; the PS/PD variants pair the FP data info with the i32/i64 info for
+// $src2. Word and byte variants are gated on HasBWI and HasVBMI respectively
+// and have no broadcast forms.
+defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d",
+ avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q",
+ avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w",
+ avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b",
+ avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
+ EVEX_CD8<8, CD8VF>;
+defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps",
+ avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
+ avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - BLEND using mask
+//
+// Register and full-vector memory forms of a mask-controlled blend for vector
+// info `_`:
+//   rr / rm     - no mask operand, empty pattern.
+//   rrk / rmk   - {mask} forms (EVEX_K); these are the only ones with a
+//                 selection pattern: vselect(mask, $src2, $src1), i.e. $src2
+//                 is the value taken where the mask bit is set.
+//   rrkz / rmkz - {mask} {z} forms (EVEX_KZ), empty pattern.
+// Pattern-less forms are explicitly marked hasSideEffects = 0 (and mayLoad = 1
+// for the memory ones) because nothing can be inferred from an empty pattern.
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ let hasSideEffects = 0 in
+ def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
+ []>, EVEX_4V;
+ def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ [(set _.RC:$dst, (vselect _.KRCWM:$mask,
+ (_.VT _.RC:$src2),
+ (_.VT _.RC:$src1)))]>, EVEX_4V, EVEX_K;
+ let hasSideEffects = 0 in
+ def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_KZ;
+ let mayLoad = 1, hasSideEffects = 0 in
+ def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ [(set _.RC:$dst, (vselect _.KRCWM:$mask,
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ (_.VT _.RC:$src1)))]>,
+ EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
+ let mayLoad = 1, hasSideEffects = 0 in
+ def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
+ }
+}
+// Broadcast-memory forms of a mask-controlled blend (EVEX_B): rmbk carries the
+// pattern vselect(mask, X86VBroadcast(scalar load), $src1); rmb has no mask
+// operand and an empty pattern, so it is marked mayLoad/hasSideEffects by hand.
+// NOTE(review): unlike avx512_blendmask, this multiclass does not set
+// ExeDomain = _.ExeDomain - confirm whether that omission is intentional.
+multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+
+ def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.RC:$dst,(vselect _.KRCWM:$mask,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1)))]>,
+ EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+
+ let mayLoad = 1, hasSideEffects = 0 in
+ def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+ []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+
+}
+
+// Instantiates avx512_blendmask plus the broadcast forms avx512_blendmask_rmb
+// at all three widths. Z (512-bit) gets no Predicates override here; Z256 and
+// Z128 are placed under [HasVLX].
+multiclass blendmask_dq <bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>,
+ avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>,
+ avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>,
+ avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ }
+}
+
+// Instantiates avx512_blendmask (no broadcast forms) at all three widths:
+// Z requires HasBWI, Z256/Z128 require HasBWI and HasVLX.
+multiclass blendmask_bw <bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasBWI] in
+ defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasBWI, HasVLX] in {
+ defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ }
+}
+
+
+// Blend instantiations: PS/PD and dword/qword use blendmask_dq (with broadcast
+// forms); byte/word use blendmask_bw. The 64-bit and word element variants set
+// VEX_W.
+defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>;
+defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W;
+defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>;
+defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W;
+defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
+defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
+
+
+// With AVX512 but no VLX, select a 256-bit vselect (v8f32 / v8i32 with a v8i1
+// mask) through the 512-bit masked blend: the mask is copied to VK16WM, each
+// 256-bit operand is inserted into the low ymm of an IMPLICIT_DEF 512-bit
+// value, and the low ymm of the result is extracted.
+// The vselect operands are passed to the instruction in swapped order
+// ($src2, then $src1) because the rrk form matches vselect(mask, <second
+// register operand>, <first register operand>).
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+ (v8f32 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+ (v8i32 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+//===----------------------------------------------------------------------===//
+// Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
+
+// Scalar FP compare into a mask register (opcode 0xC2) for vector info `_`.
+//   OpNode    - node matched by the plain forms.
+//   OpNodeRnd - node matched by the {sae} form; it takes an extra
+//               (i32 FROUND_NO_EXC) operand.
+// Three groups of records are produced:
+//   *_Int  - vector-register forms built on AVX512_maskable_cmp (rr, rm with a
+//            scalar_to_vector load, rrb with {sae} and EVEX_B).
+//   *_alt  - assembler-only forms taking the condition as an explicit u8
+//            immediate instead of a condition-code suffix.
+//   rr/rm  - isCodeGenOnly forms operating on _.FRC scalar registers.
+multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{
+
+ defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc)>, EVEX_4V;
+ defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+
+ defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc,
+ (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B;
+ // Accept explicit immediate argument form instead of comparison code.
+ // NOTE(review): rri_alt spells its output class as VK1 while every sibling
+ // form uses _.KRC - presumably the same class for the scalar infos; confirm.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs VK1:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V;
+ defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+
+ defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">,
+ EVEX_4V, EVEX_B;
+ }// let isAsmParserOnly = 1, hasSideEffects = 0
+
+ // Code-generation-only forms on scalar FP registers (_.FRC). Only rr is
+ // marked commutable.
+ // NOTE(review): rm uses the packed itinerary IIC_SSE_ALU_F32P_RM while rr
+ // uses the scalar IIC_SSE_ALU_F32S_RR - confirm this is intended.
+ let isCodeGenOnly = 1 in {
+ let isCommutable = 1 in
+ def rr : AVX512Ii8<0xC2, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+ _.FRC:$src2,
+ imm:$cc))],
+ IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+ def rm : AVX512Ii8<0xC2, MRMSrcMem,
+ (outs _.KRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+ }
+}
+
+// Scalar compare instantiations under HasAVX512: single precision over
+// f32x_info (AVX512XSIi8Base) and double precision over f64x_info
+// (AVX512XDIi8Base, VEX_W). Both use X86cmpms / X86cmpmsRnd.
+let Predicates = [HasAVX512] in {
+ defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>,
+ AVX512XSIi8Base;
+ defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>,
+ AVX512XDIi8Base, VEX_W;
+}
+
+// Packed integer compare producing a mask register, for vector info `_`.
+//   OpNode       - two-operand compare node matched by the patterns.
+//   IsCommutable - applied as isCommutable to the rr form only.
+// Forms: rr / rm (full-vector load) and their {mask} variants rrk / rmk, whose
+// patterns are and(mask, OpNode(...)).
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, bit IsCommutable> {
+ let isCommutable = IsCommutable in
+ def rr : AVX512BI<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ def rm : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2)))))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ def rrk : AVX512BI<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+ def rmk : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert
+ (_.LdFrag addr:$src2))))))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+}
+
+// Extends avx512_icmp_packed (inherited, so rr/rm/rrk/rmk are also defined)
+// with broadcast-memory forms: rmb compares against X86VBroadcast of a scalar
+// load, and rmbk is the {mask} variant with pattern and(mask, OpNode(...)).
+// Both set EVEX_B.
+multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, bit IsCommutable> :
+ avx512_icmp_packed<opc, OpcodeStr, OpNode, _, IsCommutable> {
+ def rmb : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
+ "|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2))))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+ def rmbk : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+ _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+}
+
+// Instantiates avx512_icmp_packed at all three widths: Z (512-bit) requires
+// prd; Z256/Z128 require prd and HasVLX. IsCommutable defaults to 0.
+multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd,
+ bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+// Same width/predicate layout as avx512_icmp_packed_vl, but instantiates
+// avx512_icmp_packed_rmb so the broadcast-memory forms are included.
+multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+// Equality and signed greater-than compares into mask registers. The EQ
+// variants pass IsCommutable = 1; the GT variants keep the default 0.
+// Byte/word variants require HasBWI and have no broadcast forms; dword/qword
+// variants require HasAVX512 and use the _rmb_ multiclass. The qword variants
+// additionally carry T8PD and VEX_W.
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
+ avx512vl_i8_info, HasBWI, 1>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
+ avx512vl_i16_info, HasBWI, 1>,
+ EVEX_CD8<16, CD8VF>;
+
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
+ avx512vl_i32_info, HasAVX512, 1>,
+ EVEX_CD8<32, CD8VF>;
+
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
+ avx512vl_i64_info, HasAVX512, 1>,
+ T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
+ avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
+ avx512vl_i16_info, HasBWI>,
+ EVEX_CD8<16, CD8VF>;
+
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
+ avx512vl_i32_info, HasAVX512>,
+ EVEX_CD8<32, CD8VF>;
+
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
+ avx512vl_i64_info, HasAVX512>,
+ T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// With AVX512 but no VLX, select v8i32 X86pcmpgtm / X86pcmpeqm through the
+// 512-bit compare: each 256-bit operand is inserted into the low ymm of an
+// IMPLICIT_DEF 512-bit value and the resulting mask is copied to VK8.
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (COPY_TO_REGCLASS (VPCMPGTDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (COPY_TO_REGCLASS (VPCMPEQDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
+}
+
+// Packed integer compare with a condition-code operand, producing a mask
+// register. The mnemonic is "vpcmp" + ${cc} + Suffix.
+//   OpNode - three-operand compare node (lhs, rhs, imm:$cc).
+// Forms: rri / rmi and their {mask} variants rrik / rmik, whose patterns are
+// and(mask, OpNode(...)). Only rri is marked commutable. The *_alt records are
+// assembler-only and take the condition as an explicit u8 immediate; they have
+// empty patterns, so the memory ones are marked mayLoad by hand.
+multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let isCommutable = 1 in
+ def rri : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ def rmi : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ def rrik : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
+ AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc)))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+ def rmik : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
+ AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc)))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def rri_alt : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
+ "$dst, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ let mayLoad = 1 in
+ def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
+ "$dst, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+ let mayLoad = 1 in
+ def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+ }
+}
+
+// Extends avx512_icmp_cc (inherited) with broadcast-memory forms: rmib
+// compares against X86VBroadcast of a scalar load, rmibk is its {mask} variant
+// (and(mask, ...)), and rmib_alt / rmibk_alt are the assembler-only
+// explicit-immediate spellings. All four set EVEX_B.
+multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
+ X86VectorVTInfo _> :
+ avx512_icmp_cc<opc, Suffix, OpNode, _> {
+ def rmib : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
+ AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+ def rmibk : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+ _.ScalarMemOp:$src2, AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ imm:$cc)))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
+ def rmib_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+ def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+ _.ScalarMemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+ }
+}
+
+// Instantiates avx512_icmp_cc at all three widths: Z (512-bit) requires prd;
+// Z256/Z128 require prd and HasVLX.
+multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128;
+ }
+}
+
+// Same width/predicate layout as avx512_icmp_cc_vl, but instantiates
+// avx512_icmp_cc_rmb so the broadcast-memory forms are included.
+multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+// Condition-code integer compares. Each element size has one instantiation
+// matching X86cmpm and a "u"-suffixed one matching X86cmpmu. Byte/word
+// variants require HasBWI and have no broadcast forms; dword/qword variants
+// require HasAVX512 and use the _rmb_ multiclass. Word and qword variants set
+// VEX_W.
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info,
+ HasBWI>, EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info,
+ HasBWI>, EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info,
+ HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info,
+ HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Packed FP compare into a mask register (opcode 0xC2) for vector info `_`,
+// matching X86cmpm(lhs, rhs, imm:$cc). Forms: rri (register), rmi (full-vector
+// load) and rmbi (broadcast scalar load, EVEX_B), each built on
+// AVX512_maskable_cmp, plus assembler-only *_alt spellings that take the
+// condition as an explicit u8 immediate.
+// Only rri passes a trailing `1` to AVX512_maskable_cmp (presumably its
+// commutable flag - confirm against that multiclass).
+multiclass avx512_vcmp_common<X86VectorVTInfo _> {
+
+ defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (X86cmpm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc), 1>;
+
+ defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (X86cmpm (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc)>;
+
+ defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (X86cmpm (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ imm:$cc)>,EVEX_B;
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">;
+
+ let mayLoad = 1 in {
+ defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">;
+
+ defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $cc">,EVEX_B;
+ }
+ }
+}
+
+// {sae} register forms of the packed FP compare (EVEX_B): rrib matches
+// X86cmpmRnd with an extra (i32 FROUND_NO_EXC) operand; rrib_alt is the
+// assembler-only spelling with an explicit u8 immediate.
+multiclass avx512_vcmp_sae<X86VectorVTInfo _> {
+ // comparison code form (VCMP[EQ/LT/LE/...])
+ defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (X86cmpmRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc,
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, {sae}, $src2, $src1",
+ "$src1, $src2, {sae}, $cc">, EVEX_B;
+ }
+}
+
+// Instantiates the packed FP compare at all three widths. Z (512-bit, under
+// HasAVX512) gets both the common and the {sae} forms; Z128/Z256 (under
+// HasAVX512 and HasVLX) get only the common forms.
+multiclass avx512_vcmp<AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcmp_common<_.info512>,
+ avx512_vcmp_sae<_.info512>, EVEX_V512;
+
+ }
+ let Predicates = [HasAVX512,HasVLX] in {
+ defm Z128 : avx512_vcmp_common<_.info128>, EVEX_V128;
+ defm Z256 : avx512_vcmp_common<_.info256>, EVEX_V256;
+ }
+}
+
+// Packed FP compare instantiations: double precision (AVX512PDIi8Base, VEX_W,
+// 64-bit CD8 scaling) and single precision (AVX512PSIi8Base, 32-bit CD8
+// scaling).
+defm VCMPPD : avx512_vcmp<avx512vl_f64_info>,
+ AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VCMPPS : avx512_vcmp<avx512vl_f32_info>,
+ AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+// With AVX512 but no VLX, select 256-bit condition-code compares (v8f32 and
+// v8i32 X86cmpm, v8i32 X86cmpmu) through the 512-bit compare instructions:
+// each 256-bit operand is inserted into the low ymm of an IMPLICIT_DEF 512-bit
+// value and the resulting mask is copied to VK8.
+// These widening patterns are guarded by [HasAVX512, NoVLX], like the 256-bit
+// vselect and pcmpeq/pcmpgt widening patterns earlier in this file, so that
+// they are only candidates when the native 256-bit (Z256) forms are not
+// available.
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (VCMPPSZrri
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (VPCMPDZrri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (VPCMPUDZrri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+}
+
+// ----------------------------------------------------------------
+// FPClass
+// Scalar fpclass: mask = op(reg_scalar, imm)
+//                 mask = op(mem_scalar, imm)
+// Defines rr / rm and their {mask} variants rrk / rmk for vector info `_`, all
+// under [prd]. The memory forms are given AddedComplexity = 20.
+// NOTE(review): the {mask} patterns combine the writemask with `or`, whereas
+// the masked compare patterns in this file use `and` - confirm this matches
+// both the instruction's writemask behavior and the DAG the lowering emits.
+// NOTE(review): the memory forms use _.MemOp / _.LdFrag (the full-vector
+// operand and load of `_`) rather than the scalar ones - confirm intended.
+multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in {
+ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ let AddedComplexity = 20 in {
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ }
+ }
+}
+
+// Vector fpclass: mask = fpclass(reg_vec, imm)
+//                 mask = fpclass(mem_vec, imm)
+//                 mask = fpclass(broadcast(eltVt), imm)
+// Each shape also has a {mask} variant (rrk / rmk / rmbk).
+//   mem       - string appended to the mnemonic of the full-vector memory
+//               forms.
+//   broadcast - string appended to the mnemonic of the broadcast forms.
+// NOTE(review): the {mask} patterns combine the writemask with `or`, whereas
+// the masked compare patterns in this file use `and` - confirm this matches
+// both the instruction's writemask behavior and the DAG the lowering emits.
+multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, string mem, string broadcast>{
+ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##mem#
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##mem#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ _.BroadcastStr##", $dst|$dst, ${src1}"
+ ##_.BroadcastStr##", $src2}",
+ [(set _.KRC:$dst,(OpNode
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>,EVEX_B;
+ def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
+ _.BroadcastStr##", $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>,
+ EVEX_B, EVEX_K;
+}
+
+// Instantiates avx512_vector_fpclass at all three widths, passing "{z}",
+// "{x}" and "{y}" as the memory-form mnemonic suffix for 512, 128 and 256
+// bits respectively. Z requires prd; Z128/Z256 require prd and HasVLX.
+multiclass avx512_vector_fpclass_all<string OpcodeStr,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
+ string broadcast>{
+ let Predicates = [prd] in {
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}",
+ broadcast>, EVEX_V512;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}",
+ broadcast>, EVEX_V128;
+ defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}",
+ broadcast>, EVEX_V256;
+ }
+}
+
+// Builds the four FPCLASS families from one mnemonic: packed single (PS) and
+// packed double (PD) use 'opcVec' with 'VecOpNode'; scalar single (SS) and
+// scalar double (SD) use 'opcScalar' with 'ScalarOpNode'. The 64-bit element
+// variants carry VEX_W.
+multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
+                                 bits<8> opcScalar, SDNode VecOpNode,
+                                 SDNode ScalarOpNode, Predicate prd> {
+  // Packed forms.
+  defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
+                                      VecOpNode, prd, "{l}">,
+            EVEX_CD8<32, CD8VF>;
+  defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
+                                      VecOpNode, prd, "{q}">,
+            EVEX_CD8<64, CD8VF>, VEX_W;
+
+  // Scalar forms.
+  defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+                                  f32x_info, prd>,
+            EVEX_CD8<32, CD8VT1>;
+  defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+                                  f64x_info, prd>,
+            EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+
+// VFPCLASS: vector opcode 0x66, scalar opcode 0x67, available with DQI.
+defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
+                                      X86Vfpclasss, HasDQI>,
+                AVX512AIi8Base, EVEX;
+
+//-----------------------------------------------------------------
+// Mask register copy, including
+// - copy between mask registers
+// - load/store mask registers
+// - copy from GPR to mask register and vice versa
+//
+// Mask-register move forms sharing one mnemonic:
+//   kk - mask register to mask register (no selection pattern)
+//   km - load of a 'vvt' value from memory into a mask register
+//   mk - store of a mask register to memory
+multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
+                           string OpcodeStr, RegisterClass KRC, ValueType vvt,
+                           X86MemOperand x86memop> {
+  // The brace-less 'let' covers only the pattern-less kk form.
+  let hasSideEffects = 0 in
+  def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+             OpcodeStr # "\t{$src, $dst|$dst, $src}", []>;
+
+  def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
+             OpcodeStr # "\t{$src, $dst|$dst, $src}",
+             [(set KRC:$dst, (vvt (load addr:$src)))]>;
+
+  def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
+             OpcodeStr # "\t{$src, $dst|$dst, $src}",
+             [(store KRC:$src, addr:$dst)]>;
+}
+
+// Moves between a general-purpose register class (GRC) and a mask register
+// class (KRC): 'kr' writes the mask register, 'rk' writes the GPR. Neither
+// form carries a selection pattern, so hasSideEffects is cleared explicitly.
+multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, string OpcodeStr,
+                               RegisterClass KRC, RegisterClass GRC> {
+  let hasSideEffects = 0 in {
+    def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
+               OpcodeStr # "\t{$src, $dst|$dst, $src}", []>;
+    def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
+               OpcodeStr # "\t{$src, $dst|$dst, $src}", []>;
+  }
+}
+
+// KMOVB requires DQI and uses the GR32 class for its GPR forms.
+let Predicates = [HasDQI] in
+defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
+             avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
+             VEX, PD;
+
+// KMOVW is part of baseline AVX-512.
+let Predicates = [HasAVX512] in
+defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
+             avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
+             VEX, PS;
+
+// KMOVD/KMOVQ require BWI. Their mask/memory forms and their GPR forms use
+// different prefixes, so each mnemonic is instantiated in two steps.
+let Predicates = [HasBWI] in {
+  defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32mem>,
+               VEX, PD, VEX_W;
+  defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
+               VEX, XD;
+
+  defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
+               VEX, PS, VEX_W;
+  defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
+               VEX, XD, VEX_W;
+}
+
+// Bitcasts between general-purpose registers and mask registers.
+// A plain bitconvert is selected as a register-class copy.
+
+// 16-bit masks.
+def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
+          (COPY_TO_REGCLASS GR16:$src, VK16)>;
+def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
+          (COPY_TO_REGCLASS VK16:$src, GR16)>;
+
+// 8-bit masks.
+def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+          (COPY_TO_REGCLASS GR8:$src, VK8)>;
+def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+          (COPY_TO_REGCLASS VK8:$src, GR8)>;
+
+// v16i1 widened to i32: zero-extension goes through KMOVWrk, any-extension
+// inserts the 16-bit copy into an undefined 32-bit register.
+def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
+          (KMOVWrk VK16:$src)>;
+def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
+          (i32 (INSERT_SUBREG (IMPLICIT_DEF),
+                              (i16 (COPY_TO_REGCLASS VK16:$src, GR16)),
+                              sub_16bit))>;
+
+// v8i1 widened to i32: KMOVBrk when DQI is available, MOVZX32rr8 of the
+// copied byte otherwise.
+def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+          (MOVZX32rr8 (COPY_TO_REGCLASS VK8:$src, GR8))>,
+      Requires<[NoDQI]>;
+def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+          (KMOVBrk VK8:$src)>,
+      Requires<[HasDQI]>;
+def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
+          (i32 (INSERT_SUBREG (IMPLICIT_DEF),
+                              (i8 (COPY_TO_REGCLASS VK8:$src, GR8)),
+                              sub_8bit))>;
+
+// 32-bit and 64-bit masks.
+def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
+          (COPY_TO_REGCLASS GR32:$src, VK32)>;
+def : Pat<(i32 (bitconvert (v32i1 VK32:$src))),
+          (COPY_TO_REGCLASS VK32:$src, GR32)>;
+def : Pat<(v64i1 (bitconvert (i64 GR64:$src))),
+          (COPY_TO_REGCLASS GR64:$src, VK64)>;
+def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
+          (COPY_TO_REGCLASS VK64:$src, GR64)>;
+
+// Mask register loads and stores when DQI is available: everything of eight
+// bits or fewer goes through KMOVB.
+let Predicates = [HasDQI] in {
+  // v8i1 accessed through an i8 bitcast.
+  def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
+            (KMOVBmk addr:$dst, VK8:$src)>;
+  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+            (KMOVBkm addr:$src)>;
+
+  // Narrower masks are copied into VK8 before being stored.
+  def : Pat<(store VK4:$src, addr:$dst),
+            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
+  def : Pat<(store VK2:$src, addr:$dst),
+            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
+  def : Pat<(store VK1:$src, addr:$dst),
+            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
+
+  // Narrow loads read a byte with KMOVB and copy it to the narrow class.
+  def : Pat<(v2i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
+  def : Pat<(v4i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
+}
+// Mask register loads and stores without DQI. Stores move the mask to a GPR
+// with KMOVWrk and write its low byte with MOV8mr; loads read a zero-extended
+// byte with MOVZX32rm8 and copy it into the mask class.
+let Predicates = [HasAVX512, NoDQI] in {
+  def : Pat<(store VK1:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+                    (EXTRACT_SUBREG
+                       (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+                       sub_8bit))>;
+  def : Pat<(store VK2:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+                    (EXTRACT_SUBREG
+                       (KMOVWrk (COPY_TO_REGCLASS VK2:$src, VK16)),
+                       sub_8bit))>;
+  def : Pat<(store VK4:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+                    (EXTRACT_SUBREG
+                       (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
+                       sub_8bit))>;
+  def : Pat<(store VK8:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+                    (EXTRACT_SUBREG
+                       (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+                       sub_8bit))>;
+
+  def : Pat<(v8i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
+  def : Pat<(v2i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
+  def : Pat<(v4i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
+}
+
+// 16-bit mask load/store through KMOVW, plus the i1 load, which reads a byte
+// and keeps only bit 0 before copying the value into VK1.
+let Predicates = [HasAVX512] in {
+  def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
+            (KMOVWmk addr:$dst, VK16:$src)>;
+
+  def : Pat<(i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)),
+                              VK1)>;
+
+  def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
+            (KMOVWkm addr:$src)>;
+}
+// 32-bit and 64-bit mask load/store through KMOVD/KMOVQ (BWI only).
+let Predicates = [HasBWI] in {
+  // v32i1 <-> i32 in memory.
+  def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
+            (KMOVDmk addr:$dst, VK32:$src)>;
+  def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
+            (KMOVDkm addr:$src)>;
+
+  // v64i1 <-> i64 in memory.
+  def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
+            (KMOVQmk addr:$dst, VK64:$src)>;
+  def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
+            (KMOVQkm addr:$src)>;
+}
+
+// Conversions between scalar integers and i1 values held in VK1.
+let Predicates = [HasAVX512] in {
+  // --- Truncation to i1 -----------------------------------------------------
+  // The source is AND-ed with 1 in a 32-bit GPR and moved to a mask register
+  // with KMOVWkr.
+  def : Pat<(i1 (trunc (i64 GR64:$src))),
+            (COPY_TO_REGCLASS
+               (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit), (i32 1))),
+               VK1)>;
+
+  def : Pat<(i1 (trunc (i32 GR32:$src))),
+            (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>;
+
+  // A value already asserted to be zero-extended from i1 is copied directly.
+  def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
+            (COPY_TO_REGCLASS GR32:$src, VK1)>;
+
+  // 8- and 16-bit sources are first placed in an undefined 32-bit register.
+  def : Pat<(i1 (trunc (i8 GR8:$src))),
+            (COPY_TO_REGCLASS
+               (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                                 GR8:$src, sub_8bit),
+                                  (i32 1))),
+               VK1)>;
+
+  def : Pat<(i1 (trunc (i16 GR16:$src))),
+            (COPY_TO_REGCLASS
+               (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                                 GR16:$src, sub_16bit),
+                                  (i32 1))),
+               VK1)>;
+
+  // --- Extension from i1 ----------------------------------------------------
+  // zext moves the mask to a GPR with KMOVWrk and masks it with 1; anyext is
+  // a plain register-class copy with the needed sub-register adjustment.
+  def : Pat<(i32 (zext VK1:$src)),
+            (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+
+  def : Pat<(i32 (anyext VK1:$src)),
+            (COPY_TO_REGCLASS VK1:$src, GR32)>;
+
+  def : Pat<(i8 (zext VK1:$src)),
+            (EXTRACT_SUBREG
+               (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
+               sub_8bit)>;
+
+  def : Pat<(i8 (anyext VK1:$src)),
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
+                            sub_8bit)>;
+
+  def : Pat<(i64 (zext VK1:$src)),
+            (AND64ri8
+               (SUBREG_TO_REG (i64 0),
+                              (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+                              sub_32bit),
+               (i64 1))>;
+
+  def : Pat<(i64 (anyext VK1:$src)),
+            (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                           (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
+                           sub_32bit)>;
+
+  def : Pat<(i16 (zext VK1:$src)),
+            (EXTRACT_SUBREG
+               (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
+               sub_16bit)>;
+
+  def : Pat<(i16 (anyext VK1:$src)),
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
+                            sub_16bit)>;
+}
+// scalar_to_vector of an i1 into any mask vector type is a register-class
+// copy from VK1 to the destination mask class.
+def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK16)>;
+def : Pat<(v8i1  (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK8)>;
+def : Pat<(v4i1  (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK4)>;
+def : Pat<(v2i1  (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK2)>;
+def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK32)>;
+def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK64)>;
+
+// Stores of constant i1 values become byte stores of an immediate; both
+// spellings of "true" (-1 and 1) store the byte 1.
+def : Pat<(store (i1 -1), addr:$dst),
+          (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 1), addr:$dst),
+          (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 0), addr:$dst),
+          (MOV8mi addr:$dst, (i8 0))>;
+
+// Extracting element 0 of any mask vector is a register-class copy to VK1.
+def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))),
+          (COPY_TO_REGCLASS VK64:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))),
+          (COPY_TO_REGCLASS VK32:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))),
+          (COPY_TO_REGCLASS VK16:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))),
+          (COPY_TO_REGCLASS VK8:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))),
+          (COPY_TO_REGCLASS VK4:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))),
+          (COPY_TO_REGCLASS VK2:$src, VK1)>;
+
+// Mask unary operation
+// - KNOT
+// One register-to-register unary mask instruction on class KRC, selected
+// from 'OpNode' and guarded by predicate 'prd'.
+multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                            SDPatternOperator OpNode, Predicate prd> {
+  let Predicates = [prd] in
+  def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+             OpcodeStr # "\t{$src, $dst|$dst, $src}",
+             [(set KRC:$dst, (OpNode KRC:$src))]>;
+}
+
+// Instantiates a unary mask instruction for all four mask widths by
+// appending b/w/d/q to the mnemonic. B requires DQI, W baseline AVX-512,
+// D and Q require BWI.
+multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
+                                SDPatternOperator OpNode> {
+  defm B : avx512_mask_unop<opc, OpcodeStr # "b", VK8, OpNode, HasDQI>,
+           VEX, PD;
+  defm W : avx512_mask_unop<opc, OpcodeStr # "w", VK16, OpNode, HasAVX512>,
+           VEX, PS;
+  defm D : avx512_mask_unop<opc, OpcodeStr # "d", VK32, OpNode, HasBWI>,
+           VEX, PD, VEX_W;
+  defm Q : avx512_mask_unop<opc, OpcodeStr # "q", VK64, OpNode, HasBWI>,
+           VEX, PS, VEX_W;
+}
+
+// KNOT{B,W,D,Q}: opcode 0x44, selected from 'vnot'.
+defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>;
+
+// Maps the i16 intrinsic int_x86_avx512_<IntName>_w onto the 16-bit
+// register form <InstName>Wrr: the GR16 operand is copied into VK16, the
+// instruction is applied, and the result is copied back to GR16.
+multiclass avx512_mask_unop_int<string IntName, string InstName> {
+  let Predicates = [HasAVX512] in
+  def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+               (i16 GR16:$src)),
+            (COPY_TO_REGCLASS
+               (!cast<Instruction>(InstName##"Wrr")
+                  (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))),
+               GR16)>;
+}
+
+defm : avx512_mask_unop_int<"knot", "KNOT">;
+
+// Without DQI there is no 8-bit KNOT, so a v8i1 'vnot' is promoted: the
+// operand is copied to VK16, inverted with KNOTWrr, and copied back.
+// The brace-less 'let' guards only this first pattern.
+let Predicates = [HasAVX512, NoDQI] in
+def : Pat<(vnot VK8:$src),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)),
+                            VK8)>;
+
+// v4i1 and v2i1 always take the promoted route; these two patterns carry no
+// predicate of their own.
+def : Pat<(vnot VK4:$src),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)),
+                            VK4)>;
+def : Pat<(vnot VK2:$src),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)),
+                            VK2)>;
+
+// Mask binary operation
+// - KAND, KANDN, KOR, KXNOR, KXOR
+// One register-register binary mask instruction on class KRC, selected from
+// 'OpNode', guarded by 'prd', with commutativity given by 'IsCommutable'.
+multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                             SDPatternOperator OpNode, Predicate prd,
+                             bit IsCommutable> {
+  let Predicates = [prd], isCommutable = IsCommutable in
+  def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+             OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+             [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+// Instantiates a binary mask instruction for all four mask widths by
+// appending b/w/d/q to the mnemonic. B requires DQI, D and Q require BWI;
+// the predicate of the W form is 'prdW' (HasAVX512 unless overridden).
+multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
+                                 SDPatternOperator OpNode, bit IsCommutable,
+                                 Predicate prdW = HasAVX512> {
+  defm B : avx512_mask_binop<opc, OpcodeStr # "b", VK8, OpNode,
+                             HasDQI, IsCommutable>,
+           VEX_4V, VEX_L, PD;
+  defm W : avx512_mask_binop<opc, OpcodeStr # "w", VK16, OpNode,
+                             prdW, IsCommutable>,
+           VEX_4V, VEX_L, PS;
+  defm D : avx512_mask_binop<opc, OpcodeStr # "d", VK32, OpNode,
+                             HasBWI, IsCommutable>,
+           VEX_4V, VEX_L, VEX_W, PD;
+  defm Q : avx512_mask_binop<opc, OpcodeStr # "q", VK64, OpNode,
+                             HasBWI, IsCommutable>,
+           VEX_4V, VEX_L, VEX_W, PS;
+}
+
+// Scalar fragments: and-not and xnor built from 'not'.
+def andn : PatFrag<(ops node:$i0, node:$i1),
+                   (and (not node:$i0), node:$i1)>;
+def xnor : PatFrag<(ops node:$i0, node:$i1),
+                   (not (xor node:$i0, node:$i1))>;
+// These nodes use 'vnot' instead of 'not' to support vectors.
+def vandn : PatFrag<(ops node:$i0, node:$i1),
+                    (and (vnot node:$i0), node:$i1)>;
+def vxnor : PatFrag<(ops node:$i0, node:$i1),
+                    (vnot (xor node:$i0, node:$i1))>;
+
+// Binary mask instructions. The fourth argument marks the operation as
+// commutable; only KANDN is not. KADD overrides the W-form predicate with
+// HasDQI.
+defm KAND  : avx512_mask_binop_all<0x41, "kand",  and,   1>;
+defm KOR   : avx512_mask_binop_all<0x45, "kor",   or,    1>;
+defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, 1>;
+defm KXOR  : avx512_mask_binop_all<0x47, "kxor",  xor,   1>;
+defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>;
+defm KADD  : avx512_mask_binop_all<0x4A, "kadd",  add,   1, HasDQI>;
+
+// Maps the two-operand i16 intrinsic int_x86_avx512_<IntName>_w onto the
+// 16-bit register form <InstName>Wrr: both GR16 operands are copied into
+// VK16 and the result is copied back to GR16.
+multiclass avx512_mask_binop_int<string IntName, string InstName> {
+  let Predicates = [HasAVX512] in
+  def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+               (i16 GR16:$src1), (i16 GR16:$src2)),
+            (COPY_TO_REGCLASS
+               (!cast<Instruction>(InstName##"Wrr")
+                  (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
+                  (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))),
+               GR16)>;
+}
+
+defm : avx512_mask_binop_int<"kand",  "KAND">;
+defm : avx512_mask_binop_int<"kandn", "KANDN">;
+defm : avx512_mask_binop_int<"kor",   "KOR">;
+defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
+defm : avx512_mask_binop_int<"kxor",  "KXOR">;
+
+// Lowers binary logic on mask types narrower than 16 bits by promoting the
+// operands to VK16, applying the 16-bit instruction 'Inst', and copying the
+// result back to the register class of the original type.
+//   VOpNode - operator matched for the vector mask types (v8i1/v4i1/v2i1)
+//   OpNode  - operator matched for the scalar i1 type
+//   Inst    - 16-bit mask instruction that performs the operation
+multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
+                            Instruction Inst> {
+  // With AVX512F, 8-bit mask is promoted to 16-bit mask,
+  // for the DQI set, this type is legal and KxxxB instruction is used
+  let Predicates = [NoDQI] in
+  def : Pat<(VOpNode VK8:$src1, VK8:$src2),
+            (COPY_TO_REGCLASS
+               (Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
+                     (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
+
+  // All types smaller than 8 bits require conversion anyway
+  def : Pat<(OpNode VK1:$src1, VK1:$src2),
+            (COPY_TO_REGCLASS
+               (Inst (COPY_TO_REGCLASS VK1:$src1, VK16),
+                     (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+  // The result goes back to the class of the operand type (VK2 / VK4), in
+  // line with the VK8 and VK1 cases above, rather than to VK1.
+  def : Pat<(VOpNode VK2:$src1, VK2:$src2),
+            (COPY_TO_REGCLASS
+               (Inst (COPY_TO_REGCLASS VK2:$src1, VK16),
+                     (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>;
+  def : Pat<(VOpNode VK4:$src1, VK4:$src2),
+            (COPY_TO_REGCLASS
+               (Inst (COPY_TO_REGCLASS VK4:$src1, VK16),
+                     (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>;
+}
+
+defm : avx512_binop_pat<and, and, KANDWrr>; // narrow-mask 'and' via KANDWrr
+defm : avx512_binop_pat<vandn, andn, KANDNWrr>; // vector op 'vandn', scalar op 'andn'
+defm : avx512_binop_pat<or, or, KORWrr>; // narrow-mask 'or' via KORWrr
+defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>; // vector op 'vxnor', scalar op 'xnor'
+defm : avx512_binop_pat<xor, xor, KXORWrr>; // narrow-mask 'xor' via KXORWrr
+
+// Mask unpacking
+// KUNPCK<Suffix>: defines the register form on class KRC and a pattern that
+// selects it for concat_vectors of two KRCSrc halves producing type VT.
+// Note that the pattern passes the concat operands to the instruction in
+// swapped order ($src2 first, then $src1).
+multiclass avx512_mask_unpck<string Suffix, RegisterClass KRC, ValueType VT,
+                             RegisterClass KRCSrc, Predicate prd> {
+  let Predicates = [prd] in {
+    // No pattern on the instruction itself; selection is done by the Pat
+    // below.
+    let hasSideEffects = 0 in
+    def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+               !strconcat("kunpck", Suffix,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               []>,
+             VEX_4V, VEX_L;
+
+    def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
+              (!cast<Instruction>(NAME##rr)
+                 (COPY_TO_REGCLASS KRCSrc:$src2, KRC),
+                 (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>;
+  }
+}
+
+// BW is available with baseline AVX-512; WD and DQ require BWI.
+defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W;
+
+// Mask bit testing
+// One mask test instruction on class KRC: it has no register output and
+// defines EFLAGS from (OpNode $src1, $src2).
+multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                              SDNode OpNode, Predicate prd> {
+  let Predicates = [prd], Defs = [EFLAGS] in
+  def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
+             OpcodeStr # "\t{$src2, $src1|$src1, $src2}",
+             [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+// Instantiates a mask test instruction for all four widths. B requires DQI,
+// Q and D require BWI; the W form uses 'prdW' (HasAVX512 unless overridden).
+multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                Predicate prdW = HasAVX512> {
+  defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+                              HasDQI>,
+           VEX, PD;
+  defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+                              prdW>,
+           VEX, PS;
+  defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+                              HasBWI>,
+           VEX, PS, VEX_W;
+  defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+                              HasBWI>,
+           VEX, PD, VEX_W;
+}
+
+// KORTESTW keeps the default HasAVX512 predicate; KTESTW requires DQI.
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
+defm KTEST   : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>;
+
+// Mask shift
+// One mask shift-by-immediate instruction on class KRC; the shift amount is
+// an 8-bit immediate operand.
+multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                               SDNode OpNode> {
+  let Predicates = [HasAVX512] in
+  def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
+               OpcodeStr # "\t{$imm, $src, $dst|$dst, $src, $imm}",
+               [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
+}
+
+// Instantiates a mask shift for all four widths. The W and B forms share
+// 'opc1', the Q and D forms share 'opc2'; within each pair the forms differ
+// by VEX_W. B is wrapped in a HasDQI predicate, Q and D in HasBWI.
+multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
+                                 SDNode OpNode> {
+  defm W : avx512_mask_shiftop<opc1, OpcodeStr # "w", VK16, OpNode>,
+           VEX, TAPD, VEX_W;
+
+  let Predicates = [HasDQI] in
+  defm B : avx512_mask_shiftop<opc1, OpcodeStr # "b", VK8, OpNode>,
+           VEX, TAPD;
+
+  let Predicates = [HasBWI] in {
+    defm Q : avx512_mask_shiftop<opc2, OpcodeStr # "q", VK64, OpNode>,
+             VEX, TAPD, VEX_W;
+    defm D : avx512_mask_shiftop<opc2, OpcodeStr # "d", VK32, OpNode>,
+             VEX, TAPD;
+  }
+}
+
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>;
+
+// Mask setting all 0s or 1s
+// Pseudo instruction that materialises the constant 'Val' of type VT in a
+// mask register of class KRC. It has no inputs and an empty assembly string,
+// and is marked rematerialisable and as cheap as a move.
+multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
+  let Predicates = [HasAVX512], isReMaterializable = 1, isAsCheapAsAMove = 1,
+      isPseudo = 1 in
+  def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+                 [(set KRC:$dst, (VT Val))]>;
+}
+
+// Instantiates the constant-mask pseudo for the 8/16/32/64-bit mask types.
+multiclass avx512_mask_setop_w<PatFrag Val> {
+  defm B : avx512_mask_setop<VK8,  v8i1,  Val>;
+  defm W : avx512_mask_setop<VK16, v16i1, Val>;
+  defm D : avx512_mask_setop<VK32, v32i1, Val>;
+  defm Q : avx512_mask_setop<VK64, v64i1, Val>;
+}
+
+// KSET0* produce all-zeros masks, KSET1* produce all-ones masks.
+defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
+defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+
+// Constant masks of eight bits or fewer are taken from the 16-bit pseudos
+// and copied into the narrow class.
+let Predicates = [HasAVX512] in {
+  // All-zeros vectors.
+  def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
+  def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>;
+  def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>;
+
+  // All-ones vectors.
+  def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
+  def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
+  def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
+
+  // Scalar i1 constants. "True" (spelled 1 or -1) is the 16-bit all-ones
+  // mask shifted right by 15.
+  def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
+  def : Pat<(i1 1),
+            (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+  def : Pat<(i1 -1),
+            (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+}
+
+// Patterns for kmask insert_subvector/extract_subvector to/from index=0
+// For a (sub-mask, mask) type pair, both extracting the sub-mask at index 0
+// and inserting it at index 0 of an undef mask are register-class copies.
+multiclass operation_subvector_mask_lowering<RegisterClass subRC,
+                                             ValueType subVT,
+                                             RegisterClass RC, ValueType VT> {
+  def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
+            (subVT (COPY_TO_REGCLASS RC:$src, subRC))>;
+
+  def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
+            (VT (COPY_TO_REGCLASS subRC:$src, RC))>;
+}
+
+// v2i1 within every wider mask type.
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK4,  v4i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK8,  v8i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK64, v64i1>;
+
+// v4i1 within every wider mask type.
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK8,  v8i1>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK64, v64i1>;
+
+// v8i1 within every wider mask type.
+defm : operation_subvector_mask_lowering<VK8, v8i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK8, v8i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK8, v8i1, VK64, v64i1>;
+
+// v16i1 within every wider mask type.
+defm : operation_subvector_mask_lowering<VK16, v16i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK16, v16i1, VK64, v64i1>;
+
+// v32i1 within v64i1.
+defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
+
+// Extracting the upper half of a mask: shift right by half the width, then
+// copy into the narrower class. v4i1 and v8i1 sources are promoted to VK16
+// so that the 16-bit shift can be used.
+def : Pat<(v2i1 (extract_subvector (v4i1 VK4:$src), (iPTR 2))),
+          (v2i1 (COPY_TO_REGCLASS
+                   (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), (i8 2)),
+                   VK2))>;
+def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 4))),
+          (v4i1 (COPY_TO_REGCLASS
+                   (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (i8 4)),
+                   VK4))>;
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
+          (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)),
+                                  VK8))>;
+def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
+          (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)),
+                                   VK16))>;
+def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
+          (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)),
+                                   VK32))>;
+
+
+// Patterns for kmask shift
+// Lowers immediate shifts on a narrow mask type by promoting the operand to
+// VK16, using the 16-bit shift instruction, and copying the result back to
+// the original class.
+multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> {
+  // Left shift.
+  def : Pat<(VT (X86vshli RC:$src, (i8 imm:$imm))),
+            (VT (COPY_TO_REGCLASS
+                   (KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16),
+                               (I8Imm $imm)),
+                   RC))>;
+
+  // Right shift.
+  def : Pat<(VT (X86vsrli RC:$src, (i8 imm:$imm))),
+            (VT (COPY_TO_REGCLASS
+                   (KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16),
+                               (I8Imm $imm)),
+                   RC))>;
+}
+
+// v8i1 uses the promoted form only when DQI is absent; v4i1 and v2i1 always
+// use it.
+defm : mask_shift_lowering<VK8, v8i1>, Requires<[HasAVX512, NoDQI]>;
+defm : mask_shift_lowering<VK4, v4i1>, Requires<[HasAVX512]>;
+defm : mask_shift_lowering<VK2, v2i1>, Requires<[HasAVX512]>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - Aligned and unaligned load and store
+//
+
+
+// Register-move and load forms of one AVX-512 vector move for a single
+// vector type '_':
+//   rr / rm     - unmasked register move / load
+//   rrk / rmk   - merge-masked forms; $src0 is tied to $dst
+//   rrkz / rmkz - zero-masked forms
+// 'ld_frag' is the load fragment matched by the memory forms, 'mload' the
+// masked-load fragment matched by the trailing patterns, and 'SelectOprr'
+// the select operator used in the rrk pattern (vselect unless overridden).
+multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                       PatFrag ld_frag, PatFrag mload,
+                       SDPatternOperator SelectOprr = vselect> {
+  let hasSideEffects = 0 in {
+    def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
+                      OpcodeStr # "\t{$src, $dst|$dst, $src}", [],
+                      _.ExeDomain>,
+             EVEX;
+
+    def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+                        (ins _.KRCWM:$mask, _.RC:$src),
+                        OpcodeStr # "\t{$src, ${dst} {${mask}} {z}|" #
+                                    "${dst} {${mask}} {z}, $src}",
+                        [(set _.RC:$dst,
+                              (_.VT (vselect _.KRCWM:$mask,
+                                             (_.VT _.RC:$src),
+                                             _.ImmAllZerosV)))],
+                        _.ExeDomain>,
+               EVEX, EVEX_KZ;
+
+    let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in
+    def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
+                      OpcodeStr # "\t{$src, $dst|$dst, $src}",
+                      [(set _.RC:$dst,
+                            (_.VT (bitconvert (ld_frag addr:$src))))],
+                      _.ExeDomain>,
+             EVEX;
+
+    // Merge-masking: the pass-through value shares a register with the
+    // result.
+    let Constraints = "$src0 = $dst" in {
+      def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+                         (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
+                         OpcodeStr # "\t{$src1, ${dst} {${mask}}|" #
+                                     "${dst} {${mask}}, $src1}",
+                         [(set _.RC:$dst,
+                               (_.VT (SelectOprr _.KRCWM:$mask,
+                                                 (_.VT _.RC:$src1),
+                                                 (_.VT _.RC:$src0))))],
+                         _.ExeDomain>,
+                EVEX, EVEX_K;
+
+      let SchedRW = [WriteLoad] in
+      def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
+                         (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
+                         OpcodeStr # "\t{$src1, ${dst} {${mask}}|" #
+                                     "${dst} {${mask}}, $src1}",
+                         [(set _.RC:$dst,
+                               (_.VT (vselect
+                                        _.KRCWM:$mask,
+                                        (_.VT (bitconvert
+                                                 (ld_frag addr:$src1))),
+                                        (_.VT _.RC:$src0))))],
+                         _.ExeDomain>,
+                EVEX, EVEX_K;
+    }
+
+    let SchedRW = [WriteLoad] in
+    def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
+                        (ins _.KRCWM:$mask, _.MemOp:$src),
+                        OpcodeStr # "\t{$src, ${dst} {${mask}} {z}|" #
+                                    "${dst} {${mask}} {z}, $src}",
+                        [(set _.RC:$dst,
+                              (_.VT (vselect
+                                       _.KRCWM:$mask,
+                                       (_.VT (bitconvert
+                                                (ld_frag addr:$src))),
+                                       _.ImmAllZerosV)))],
+                        _.ExeDomain>,
+               EVEX, EVEX_KZ;
+  }
+
+  // Masked loads: an undef or all-zeros pass-through selects the
+  // zero-masking form, any other pass-through the merge-masking form.
+  def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
+            (!cast<Instruction>(NAME#_.ZSuffix##rmkz)
+               _.KRCWM:$mask, addr:$ptr)>;
+
+  def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#_.ZSuffix##rmkz)
+               _.KRCWM:$mask, addr:$ptr)>;
+
+  def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))),
+            (!cast<Instruction>(NAME#_.ZSuffix##rmk)
+               _.RC:$src0, _.KRCWM:$mask, addr:$ptr)>;
+}
+
+// Aligned-load instantiation of avx512_load for all three vector lengths.
+// Each width uses its own AlignedLdFrag and width-specific aligned
+// masked-load fragment. The 256/128-bit forms additionally require HasVLX.
+multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo _, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_load<opc, OpcodeStr, _.info512,
+                       _.info512.AlignedLdFrag, masked_load_aligned512>,
+           EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_load<opc, OpcodeStr, _.info256,
+                            _.info256.AlignedLdFrag, masked_load_aligned256>,
+                EVEX_V256;
+    defm Z128 : avx512_load<opc, OpcodeStr, _.info128,
+                            _.info128.AlignedLdFrag, masked_load_aligned128>,
+                EVEX_V128;
+  }
+}
+
+// Unaligned-load instantiation of avx512_load for all three vector lengths.
+// Every width uses its LdFrag and masked_load_unaligned, and forwards
+// 'SelectOprr' to the rrk pattern. The 256/128-bit forms additionally
+// require HasVLX.
+multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
+                          AVX512VLVectorVTInfo _, Predicate prd,
+                          SDPatternOperator SelectOprr = vselect> {
+  let Predicates = [prd] in
+  defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag,
+                       masked_load_unaligned, SelectOprr>,
+           EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
+                            masked_load_unaligned, SelectOprr>,
+                EVEX_V256;
+    defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
+                            masked_load_unaligned, SelectOprr>,
+                EVEX_V128;
+  }
+}
+
+// Store forms of one AVX-512 vector move for a single vector type '_':
+//   rr_REV / rrk_REV / rrkz_REV - register-to-register forms using the
+//                                 MRMDestReg encoding, printed with a ".s"
+//                                 mnemonic suffix; none has a pattern
+//   mr                          - unmasked store, matched via 'st_frag'
+//   mrk                         - masked store, selected by the trailing Pat
+//                                 from 'mstore'
+multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                        PatFrag st_frag, PatFrag mstore> {
+
+  let hasSideEffects = 0 in {
+    def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
+                          !strconcat(OpcodeStr,
+                                     ".s\t{$src, $dst|$dst, $src}"),
+                          [], _.ExeDomain>,
+                 EVEX;
+    def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
+                           (ins _.KRCWM:$mask, _.RC:$src),
+                           !strconcat(OpcodeStr,
+                                      ".s\t{$src, ${dst} {${mask}}|",
+                                      "${dst} {${mask}}, $src}"),
+                           [], _.ExeDomain>,
+                  EVEX, EVEX_K;
+    def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
+                            (ins _.KRCWM:$mask, _.RC:$src),
+                            !strconcat(OpcodeStr,
+                                       ".s\t{$src, ${dst} {${mask}} {z}|",
+                                       "${dst} {${mask}} {z}, $src}"),
+                            [], _.ExeDomain>,
+                   EVEX, EVEX_KZ;
+  }
+
+  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
+                    OpcodeStr # "\t{$src, $dst|$dst, $src}",
+                    [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>,
+           EVEX;
+  def mrk : AVX512PI<opc, MRMDestMem, (outs),
+                     (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
+                     !strconcat(OpcodeStr,
+                       "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+                     [], _.ExeDomain>,
+            EVEX, EVEX_K;
+
+  def : Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
+            (!cast<Instruction>(NAME#_.ZSuffix##mrk)
+               addr:$ptr, _.KRCWM:$mask, _.RC:$src)>;
+}
+
+
+// Unaligned-store instantiation of avx512_store for all three vector
+// lengths, using 'store' and masked_store_unaligned at every width. The
+// 256/128-bit forms additionally require HasVLX.
+multiclass avx512_store_vl<bits<8> opc, string OpcodeStr,
+                           AVX512VLVectorVTInfo _, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_store<opc, OpcodeStr, _.info512, store,
+                        masked_store_unaligned>,
+           EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,
+                             masked_store_unaligned>,
+                EVEX_V256;
+    defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,
+                             masked_store_unaligned>,
+                EVEX_V128;
+  }
+}
+
+// Aligned-store instantiation of avx512_store for all three vector lengths.
+// Each width uses its own aligned store fragment (alignedstore512,
+// alignedstore256, alignedstore) and matching masked-store fragment. The
+// 256/128-bit forms additionally require HasVLX.
+multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
+                                  AVX512VLVectorVTInfo _, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512,
+                        masked_store_aligned512>,
+           EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256,
+                             masked_store_aligned256>,
+                EVEX_V256;
+    defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore,
+                             masked_store_aligned128>,
+                EVEX_V128;
+  }
+}
+
+// Aligned floating-point moves (load opcode 0x28, store opcode 0x29).
+defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
+                                     HasAVX512>,
+               avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
+                                      HasAVX512>,
+               PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
+                                     HasAVX512>,
+               avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
+                                      HasAVX512>,
+               PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Unaligned floating-point moves (load opcode 0x10, store opcode 0x11).
+// null_frag is passed as the select operator of the rrk form.
+defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
+                              null_frag>,
+               avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>,
+               PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
+                              null_frag>,
+               avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
+               PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Aligned integer moves (load opcode 0x6F, store opcode 0x7F).
+defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
+                                       HasAVX512>,
+                 avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
+                                        HasAVX512>,
+                 PD, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
+                                       HasAVX512>,
+                 avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
+                                        HasAVX512>,
+                 PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Unaligned integer moves. The 8- and 16-bit element forms require BWI and
+// keep the default vselect operator; the 32- and 64-bit forms pass
+// null_frag.
+defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>,
+                avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI>,
+                XD, EVEX_CD8<8, CD8VF>;
+
+defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
+                 avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
+                 XD, VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info,
+                                HasAVX512, null_frag>,
+                 avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
+                                 HasAVX512>,
+                 XS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info,
+                                HasAVX512, null_frag>,
+                 avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
+                                 HasAVX512>,
+                 XS, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Special instructions to help with spilling when we don't have VLX. We need
+// to load or store from a ZMM register instead. These are converted in
+// expandPostRAPseudos.
+// Load pseudos: no pattern and no assembly string; marked as loads that can
+// be folded and rematerialised.
+let isPseudo = 1, mayLoad = 1, hasSideEffects = 0, canFoldAsLoad = 1,
+    isReMaterializable = 1, SchedRW = [WriteLoad] in {
+  def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst),
+                              (ins f128mem:$src), "", []>;
+  def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst),
+                              (ins f256mem:$src), "", []>;
+  def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst),
+                              (ins f128mem:$src), "", []>;
+  def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst),
+                              (ins f256mem:$src), "", []>;
+}
+
+// Store pseudos: no pattern and no assembly string.
+let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+  def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs),
+                              (ins f128mem:$dst, VR128X:$src), "", []>;
+  def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs),
+                              (ins f256mem:$dst, VR256X:$src), "", []>;
+  def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs),
+                              (ins f128mem:$dst, VR128X:$src), "", []>;
+  def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs),
+                              (ins f256mem:$dst, VR256X:$src), "", []>;
+}
+
+// A vselect whose "true" operand is all zeros is selected as a zero-masked
+// move under the inverted mask. The v8i1 mask is promoted to VK16 so that
+// KNOTWrr can be used.
+def : Pat<(v8i64 (vselect VK8WM:$mask,
+                          (bc_v8i64 (v16i32 immAllZerosV)),
+                          (v8i64 VR512:$src))),
+          (VMOVDQA64Zrrkz
+             (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
+                               VK8),
+             VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask,
+                           (v16i32 immAllZerosV),
+                           (v16i32 VR512:$src))),
+          (VMOVDQA32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
+
+// These patterns exist to prevent the above patterns from introducing a second
+// mask inversion when one already exists.
+def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)),
+                          (bc_v8i64 (v16i32 immAllZerosV)),
+                          (v8i64 VR512:$src))),
+          (VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>;
+
+def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
+                           (v16i32 immAllZerosV),
+                           (v16i32 VR512:$src))),
+          (VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
+
+// With VLX but without BWI, stores of 8- and 16-bit element vectors are
+// selected as the 32-bit element moves (VMOVDQA32 / VMOVDQU32).
+let Predicates = [HasVLX, NoBWI] in {
+  // 128-bit stores: aligned first, then unaligned.
+  def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
+            (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
+            (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+  def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
+            (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+  def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
+            (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+
+  // 256-bit stores: aligned first, then unaligned.
+  def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst),
+            (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+  def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst),
+            (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+  def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
+            (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+  def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
+            (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+}
+
+// Stores of the lowest 128-bit or 256-bit subvector (extract_subvector at
+// index 0) of a wider register. The extract is folded into an EXTRACT_SUBREG
+// of the source and a plain move of the narrower width is emitted.
+// Aligned variants use the VMOVAP*/VMOVDQA* forms, unaligned variants the
+// VMOVUP*/VMOVDQU* forms; byte/word element types reuse the 32-bit-element
+// integer moves.
+let Predicates = [HasVLX] in {
+ // Special patterns for storing subvector extracts of lower 128-bits of 256.
+ // It's cheaper to just use a plain 128-bit move on the xmm subregister than
+ // an extract-to-memory instruction.
+ def : Pat<(alignedstore (v2f64 (extract_subvector
+ (v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4f32 (extract_subvector
+ (v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2f64 (extract_subvector
+ (v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v4f32 (extract_subvector
+ (v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v2i64 (extract_subvector
+ (v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+
+ // Special patterns for storing subvector extracts of lower 128-bits of 512.
+ // It's cheaper to just use a plain 128-bit move on the xmm subregister than
+ // an extract-to-memory instruction.
+ def : Pat<(alignedstore (v2f64 (extract_subvector
+ (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4f32 (extract_subvector
+ (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2f64 (extract_subvector
+ (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v4f32 (extract_subvector
+ (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v2i64 (extract_subvector
+ (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+
+ // Special patterns for storing subvector extracts of lower 256-bits of 512.
+ // It's cheaper to just use a plain 256-bit move on the ymm subregister than
+ // an extract-to-memory instruction.
+ // Every aligned pattern in this group must be guarded by alignedstore256:
+ // the selected instructions are the aligned 256-bit moves, so the 128-bit
+ // alignedstore fragment is not a sufficient guarantee.
+ def : Pat<(alignedstore256 (v4f64 (extract_subvector
+ (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore256 (v8f32 (extract_subvector
+ (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore256 (v4i64 (extract_subvector
+ (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore256 (v8i32 (extract_subvector
+ (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore256 (v16i16 (extract_subvector
+ (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore256 (v32i8 (extract_subvector
+ (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+
+ def : Pat<(store (v4f64 (extract_subvector
+ (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v8f32 (extract_subvector
+ (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v4i64 (extract_subvector
+ (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v8i32 (extract_subvector
+ (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v16i16 (extract_subvector
+ (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v32i8 (extract_subvector
+ (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+}
+
+
+// Move Int Doubleword/Quadword to Packed Int
+//
+// EVEX vmovd/vmovq forms (opcode 0x6E) that place a GPR or memory scalar in
+// the low element of an XMM register via scalar_to_vector, plus the
+// isCodeGenOnly forms that bitconvert between GR64 and the scalar class
+// FR64X (0x6E into the vector register file, 0x7E out of it).
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ EVEX;
+def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W;
+// Memory form of the 0x6E vmovq; it carries no selection pattern and is
+// flagged isCodeGenOnly with ForceDisassemble.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}", []>,
+ EVEX, VEX_W, EVEX_CD8<64, CD8VT1>;
+// i64 <-> f64 bit-pattern moves between GR64 and FR64X, including a store of
+// the f64 bits as an i64.
+let isCodeGenOnly = 1 in {
+def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set FR64X:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64X:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+ EVEX_CD8<64, CD8VT1>;
+}
+} // ExeDomain = SSEPackedInt
+
+// Move Int Doubleword to Single Scalar
+//
+// isCodeGenOnly vmovd forms (opcode 0x6E) that bitconvert a 32-bit integer,
+// taken from GR32 or loaded from memory, into the scalar class FR32X.
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, EVEX;
+
+def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move doubleword from xmm register to r/m32
+//
+// vmovd forms (opcode 0x7E) selected for extracting element 0 of a v4i32,
+// either into GR32 or directly to memory.
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+ EVEX;
+def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128X:$src),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+ EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt
+
+// Move quadword from xmm1 register to r/m64
+//
+// vmovq forms that move element 0 of a v2i64 out of an XMM register. Two
+// opcodes are used: 0x7E (GPR destination, plus a pattern-less memory form)
+// and 0xD6 (memory destination with a store pattern, plus a pattern-less
+// register-to-register form printed as "vmovq.s").
+let ExeDomain = SSEPackedInt in {
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
+ Requires<[HasAVX512, In64BitMode]>;
+
+// NOTE(review): unlike VMOVPQI2QIZmr below, this memory form carries no
+// EVEX_CD8<64, CD8VT1> annotation -- confirm whether that is intentional.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
+ Requires<[HasAVX512, In64BitMode]>;
+
+// Store of element 0 of a v2i64; this is the form selected for such stores.
+def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
+ addr:$dst)], IIC_SSE_MOVDQ>,
+ EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>,
+ Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+
+// Register-to-register form of opcode 0xD6; no selection pattern.
+let hasSideEffects = 0 in
+def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src),
+ "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
+ EVEX, VEX_W;
+} // ExeDomain = SSEPackedInt
+
+// Move Scalar Single to Double Int
+//
+// isCodeGenOnly vmovd forms (opcode 0x7E) that bitconvert an f32 held in
+// FR32X to an i32, either into GR32 or stored straight to memory.
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
+ (ins FR32X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32X:$src))],
+ IIC_SSE_MOVD_ToGP>, EVEX;
+def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, FR32X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move Quadword Int to Packed Quadword Int
+//
+// vmovq load (XS-prefixed opcode 0x7E) selected for scalar_to_vector of a
+// 64-bit integer load into a v2i64.
+// NOTE(review): the disp8 annotation is spelled EVEX_CD8<8, CD8VT8> here,
+// whereas the other 64-bit scalar memory forms in this section use
+// EVEX_CD8<64, CD8VT1> -- presumably equivalent (8 bytes); confirm.
+let ExeDomain = SSEPackedInt in {
+def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+} // ExeDomain = SSEPackedInt
+
+//===----------------------------------------------------------------------===//
+// AVX-512 MOVSS, MOVSD
+//===----------------------------------------------------------------------===//
+
+// Defines the EVEX scalar move family for one element type.
+//   asm    - mnemonic used for every variant.
+//   OpNode - DAG node matched by the register-register forms.
+//   _      - X86VectorVTInfo supplying register classes, types, memory
+//            operand, load fragment and execution domain.
+// Variants produced (suffix appended to the defm name):
+//   rr   - $src1 vector combined with scalar $src2 via OpNode.
+//   rrkz - masked form; lanes not selected by $mask are zero.
+//   rrk  - masked form; lanes not selected come from $src0 (tied to $dst).
+//   rm   - scalar load into _.FRC.
+//   rmk / rmkz - masked loads, declared without selection patterns.
+//   mr   - scalar store from _.FRC.
+//   mrk  - masked store, declared without a selection pattern.
+multiclass avx512_move_scalar<string asm, SDNode OpNode,
+ X86VectorVTInfo _> {
+ def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.FRC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
+ (scalar_to_vector _.FRC:$src2))))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
+ // Zero-masking register form: X86selects picks the moved value or zero.
+ def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ _.ImmAllZerosV)))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ;
+ // Merge-masking register form: $src0 is the pass-through and is tied to the
+ // destination register.
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT _.RC:$src0))))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K;
+ // Plain scalar load; marked foldable as a load and rematerializable.
+ let canFoldAsLoad = 1, isReMaterializable = 1 in
+ def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
+ // Masked loads have empty pattern lists, so mayLoad is stated explicitly.
+ let mayLoad = 1, hasSideEffects = 0 in {
+ let Constraints = "$src0 = $dst" in
+ def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}}|",
+ "$dst {${mask}}, $src}"),
+ [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K;
+ def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src}"),
+ [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ;
+ }
+ // Plain scalar store (opcode 0x11).
+ def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
+ EVEX;
+ // Masked scalar store; empty pattern list, so mayStore is stated explicitly.
+ let mayStore = 1, hasSideEffects = 0 in
+ def mrk: AVX512PI<0x11, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+ [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
+}
+
+// Single-precision instantiation: XS prefix, 32-bit disp8 scaling.
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+ VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
+
+// Double-precision instantiation: XD prefix, VEX_W, 64-bit disp8 scaling.
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+ VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+
+// Folds a scalar select feeding a scalar move into the masked register forms.
+//   InstrStr - instruction name prefix; "rrk"/"rrkz" are appended and cast.
+//   OpNode   - scalar move node (X86Movss / X86Movsd at the call sites).
+//   ZeroFP   - PatLeaf matching the floating-point zero of the element type.
+//   _        - X86VectorVTInfo for the 128-bit vector type.
+// The select condition is bit 0 of a GR32 (i1 trunc), copied into VK1WM.
+multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
+ PatLeaf ZeroFP, X86VectorVTInfo _> {
+
+// select(mask, $src1, $src2) moved into $src0: the false value $src2 becomes
+// the tied pass-through operand of rrk, $src0 is the vector operand, and
+// $src1 is the scalar being moved.
+def : Pat<(_.VT (OpNode _.RC:$src0,
+ (_.VT (scalar_to_vector
+ (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT _.FRC:$src1),
+ (_.EltVT _.FRC:$src2))))))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
+ (COPY_TO_REGCLASS _.FRC:$src2, _.RC),
+ (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ (_.VT _.RC:$src0),
+ (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+ _.RC)>;
+
+// Same, with a zero false value: use the zero-masking form, which needs no
+// pass-through operand.
+def : Pat<(_.VT (OpNode _.RC:$src0,
+ (_.VT (scalar_to_vector
+ (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT _.FRC:$src1),
+ (_.EltVT ZeroFP))))))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
+ (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ (_.VT _.RC:$src0),
+ (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+ _.RC)>;
+
+}
+
+// Selects the masked scalar store (InstrStr#mrk) for a 512-bit masked_store
+// whose value is a 128-bit register widened with two insert_subvector-into-
+// undef steps (128 -> 256 -> 512).
+//   InstrStr - instruction name prefix; "mrk" is appended and cast.
+//   _        - AVX512VLVectorVTInfo giving the 128/256/512-bit type infos.
+//   Mask     - DAG fragment describing how the mask is built from $mask.
+//   MaskRC   - GPR class holding $mask; it is copied into VK1WM.
+multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC> {
+
+def : Pat<(masked_store addr:$dst, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info256.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
+ (i64 0))),
+ (i64 0)))),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+}
+
+// Selects the masked scalar loads for the low 128 bits (extract_subvector at
+// index 0) of a 512-bit masked_load.
+//   InstrStr - instruction name prefix; "rmkz"/"rmk" are appended and cast.
+//   _        - AVX512VLVectorVTInfo giving the 128/256/512-bit type infos.
+//   Mask     - DAG fragment describing how the mask is built from $mask.
+//   MaskRC   - GPR class holding $mask; it is copied into VK1WM.
+multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC> {
+
+// Pass-through is all zeros: use the zero-masking load.
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (bitconvert
+ (v16i32 immAllZerosV))))),
+ (i64 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ addr:$srcAddr)>;
+
+// Pass-through is X86vzmovl of a 128-bit register widened to 512 bits: use
+// the merge-masking load with that register as the tied source.
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info256.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (i64 0))),
+ (i64 0))))),
+ (i64 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ addr:$srcAddr)>;
+
+}
+
+// Instantiate the select-folding patterns for f32 and f64.
+defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
+defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
+
+// Masked scalar stores. Each Mask dag ANDs $mask with 1, so only bit 0 of the
+// GPR survives; it is then bitconverted to the vXi1 mask type. The f32
+// variants accept the mask in GR32 (truncated to i16) or GR16, the f64
+// variant in GR8.
+defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
+defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
+defm : avx512_store_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+
+// Masked scalar loads, using the same three mask shapes as the stores.
+defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
+defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
+defm : avx512_load_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+
+// Scalar f32 select on a VK1WM mask via the merge-masking register move:
+// $src2 (false value) is the tied pass-through, $src1 (true value) is the
+// scalar moved when the mask is set. The vector operand is IMPLICIT_DEF
+// because only the low element is copied back to FR32X.
+def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
+ (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+ VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
+
+// Same selection for f64.
+def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
+ (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
+
+// The masked scalar store intrinsic maps directly onto VMOVSSZmrk; the GR8
+// mask is copied into VK1WM and the vector source into FR32X.
+def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
+ (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+ (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+
+// Register-to-register forms using the store opcode (0x11, MRMDestReg),
+// printed with a ".s" mnemonic suffix. They are defined through
+// AVX512_maskable_in_asm with an empty pattern list.
+let hasSideEffects = 0 in
+defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info,
+ (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+ "vmovss.s", "$src2, $src1", "$src1, $src2", []>,
+ XS, EVEX_4V, VEX_LIG;
+
+// NOTE(review): the name VMOVSSDrr_REV does not follow the VMOVSSZrr_REV
+// scheme (VMOVSDZrr_REV would be expected). It is left unchanged because
+// other files may reference it -- confirm before renaming.
+let hasSideEffects = 0 in
+defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
+ (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+ "vmovsd.s", "$src2, $src1", "$src1, $src2", []>,
+ XD, EVEX_4V, VEX_LIG, VEX_W;
+
+// Selection patterns that map zero-extending moves (X86vzmovl / X86vzload),
+// scalar_to_vector loads, element-0 stores and the X86Movss / X86Movsd /
+// X86Movlpd / X86Movlps shuffles onto the EVEX VMOVSS/VMOVSD instructions.
+// Wider (256/512-bit) results are built with SUBREG_TO_REG around the
+// 128-bit instruction.
+let Predicates = [HasAVX512] in {
+ let AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128X then do a
+ // MOVS{S,D} to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
+ (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
+ (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
+ (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
+ (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+ }
+
+ // Move low f32 (or i32) and clear high bits: operate on the xmm subregister
+ // and widen the result back with SUBREG_TO_REG.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4f32 (V_SET0)),
+ (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4f32 (V_SET0)),
+ (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
+
+ let AddedComplexity = 20 in {
+ // MOVSSrm zeros the high parts of the register. VMOVSSZrm yields an FR32X
+ // value, so the 128-bit results here are formed with COPY_TO_REGCLASS; the
+ // 256/512-bit forms further below use SUBREG_TO_REG.
+ // The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+
+ // MOVSDrm zeros the high parts of the register. VMOVSDZrm yields an FR64X
+ // value, so the 128-bit results here are formed with COPY_TO_REGCLASS; the
+ // 256/512-bit forms further below use SUBREG_TO_REG.
+ // The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+
+ // Represent the same patterns above but in the form they appear for
+ // 256-bit types
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+
+ // Represent the same patterns above but in the form they appear for
+ // 512-bit types
+ def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v16f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+ }
+ // 256-bit zero-extension of a scalar already in a register (or, for v4i64,
+ // loaded from memory), built from the 128-bit instruction.
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+ FR32X:$src)), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+ FR64X:$src)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+
+ // Move low f64 and clear high bits.
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (v2f64 (V_SET0)),
+ (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (v2f64 (V_SET0)),
+ (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
+
+ // Move low i64 and clear high bits, also done with VMOVSD.
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+ (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+ (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
+
+ // Extract and store: element 0 of a v4f32 is stored with the scalar store.
+ def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
+
+ // Shuffle with VMOVSS
+ def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
+ (VMOVSSZrr (v4i32 VR128X:$src1),
+ (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
+ def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
+ (VMOVSSZrr (v4f32 VR128X:$src1),
+ (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
+
+ // 256-bit variants: performed on the xmm subregisters of both operands.
+ def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ // Shuffle with VMOVSD
+ def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+
+ // 256-bit variants: performed on the xmm subregisters of both operands.
+ def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ // X86Movlpd / X86Movlps with register operands are selected to the same
+ // VMOVSD register form as X86Movsd.
+ def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+}
+
+// Register-to-register vmovq (XS-prefixed opcode 0x7E) selected for
+// X86vzmovl of a v2i64, i.e. keep the low quadword and zero the rest.
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst, (v2i64 (X86vzmovl
+ (v2i64 VR128X:$src))))],
+ IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
+
+// Zero-extending integer scalar moves: X86vzmovl / X86vzload of a 32- or
+// 64-bit integer taken from a GPR or memory is selected to the EVEX
+// vmovd/vmovq instructions, with SUBREG_TO_REG widening the 128-bit result
+// for 256- and 512-bit types.
+let Predicates = [HasAVX512] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIZrr GR32:$src)>;
+
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIZrr GR64:$src)>;
+
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+
+ def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+ }
+ // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVQI2PQIZrm addr:$src)>;
+ // Register-source v2f64 zero-extension reuses the integer vmovq form.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+ (VMOVZPQILo2PQIZrr VR128X:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVQI2PQIZrm addr:$src)>;
+ def : Pat<(v4i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+ }
+
+ // Use regular 128-bit instructions to match 256-bit and 512-bit
+ // scalar_to_vec+zext of a GR32.
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+ def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+
+ // Use regular 128-bit instructions to match 512-bit zero-extending loads.
+ def : Pat<(v16i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+}
+
+// Inserting a GPR scalar at element 0 of an all-zero 512-bit vector is a
+// vmovd/vmovq into the xmm subregister, widened with SUBREG_TO_REG.
+def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+// The same instructions are used when the vector being inserted into is
+// undef.
+def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Non-temporals
+//===----------------------------------------------------------------------===//
+// Non-temporal loads (vmovntdqa, opcode 0x2A). Each width is selected from
+// its own intrinsic: int_x86_avx512_movntdqa for 512 bits,
+// int_x86_avx2_movntdqa for 256 bits and int_x86_sse41_movntdqa for 128 bits.
+// The 256- and 128-bit EVEX forms are additionally guarded by HasVLX.
+let SchedRW = [WriteLoad] in {
+ def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],
+ SSEPackedInt>, EVEX, T8PD, EVEX_V512,
+ EVEX_CD8<64, CD8VF>;
+
+ let Predicates = [HasVLX] in {
+ def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
+ (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR256X:$dst, (int_x86_avx2_movntdqa addr:$src))],
+ SSEPackedInt>, EVEX, T8PD, EVEX_V256,
+ EVEX_CD8<64, CD8VF>;
+
+ def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
+ (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst, (int_x86_sse41_movntdqa addr:$src))],
+ SSEPackedInt>, EVEX, T8PD, EVEX_V128,
+ EVEX_CD8<64, CD8VF>;
+ }
+}
+
+// Defines one non-temporal store instruction ("mr") for a single vector type.
+//   opc       - opcode byte.
+//   OpcodeStr - mnemonic.
+//   _         - X86VectorVTInfo supplying register class, memory operand,
+//               value type, element size and execution domain.
+//   st_frag   - store fragment matched; defaults to alignednontemporalstore.
+//   itin      - itinerary class; defaults to IIC_SSE_MOVNT.
+// AddedComplexity = 400 is set on the definition, presumably so it is
+// preferred over ordinary store patterns -- confirm against the matcher.
+multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ PatFrag st_frag = alignednontemporalstore,
+ InstrItinClass itin = IIC_SSE_MOVNT> {
+ let SchedRW = [WriteStore], AddedComplexity = 400 in
+ def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(st_frag (_.VT _.RC:$src), addr:$dst)],
+ _.ExeDomain, itin>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+// Instantiates avx512_movnt for all three vector lengths of VTInfo:
+// Z (512-bit) under HasAVX512, Z256 and Z128 under HasAVX512 + HasVLX.
+multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ }
+}
+
+// Non-temporal store instantiations for i64, f64 and f32 element vectors.
+// Through avx512_movnt_vl / avx512_movnt each one yields the records
+// <NAME>Zmr, <NAME>Z256mr and <NAME>Z128mr.
+defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info>, PD;
+defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info>, PD, VEX_W;
+defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info>, PS;
+
+// Extra 512-bit selection patterns for aligned non-temporal memory accesses,
+// guarded by HasAVX512 and given AddedComplexity = 400.
+let Predicates = [HasAVX512], AddedComplexity = 400 in {
+ // Stores of the remaining integer element types (v16i32, v32i16, v64i8)
+ // all select VMOVNTDQZmr.
+ def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst),
+ (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignednontemporalstore (v32i16 VR512:$src), addr:$dst),
+ (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignednontemporalstore (v64i8 VR512:$src), addr:$dst),
+ (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+
+ // Loads of every 512-bit vector type select VMOVNTDQAZrm. The v16i32,
+ // v32i16 and v64i8 cases are matched as a bitconvert of a v8i64 load.
+ def : Pat<(v8f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v16f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v8i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v16i32 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v32i16 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v64i8 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
+ (VMOVNTDQAZrm addr:$src)>;
+}
+
+// Extra 256-bit and 128-bit selection patterns for aligned non-temporal
+// memory accesses, guarded by HasVLX and given AddedComplexity = 400.
+let Predicates = [HasVLX], AddedComplexity = 400 in {
+ // 256-bit stores of v8i32, v16i16 and v32i8 all select VMOVNTDQZ256mr.
+ def : Pat<(alignednontemporalstore (v8i32 VR256X:$src), addr:$dst),
+ (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignednontemporalstore (v16i16 VR256X:$src), addr:$dst),
+ (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignednontemporalstore (v32i8 VR256X:$src), addr:$dst),
+ (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+
+ // 256-bit loads of every vector type select VMOVNTDQAZ256rm. The v8i32,
+ // v16i16 and v32i8 cases are matched as a bitconvert of a v4i64 load: the
+ // inner type must be the 256-bit i64 vector so that the source and result
+ // of the bitconvert have the same width.
+ def : Pat<(v4f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v8f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v4i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4i64 (alignednontemporalload addr:$src)))),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4i64 (alignednontemporalload addr:$src)))),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4i64 (alignednontemporalload addr:$src)))),
+ (VMOVNTDQAZ256rm addr:$src)>;
+
+ // 128-bit stores of v4i32, v8i16 and v16i8 all select VMOVNTDQZ128mr.
+ def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
+ (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignednontemporalstore (v8i16 VR128X:$src), addr:$dst),
+ (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignednontemporalstore (v16i8 VR128X:$src), addr:$dst),
+ (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+
+ // 128-bit loads of every vector type select VMOVNTDQAZ128rm. The v4i32,
+ // v8i16 and v16i8 cases are matched as a bitconvert of a v2i64 load.
+ def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
+ (VMOVNTDQAZ128rm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//
+// Two-source integer operation with a register-register (rr) and a
+// register-memory (rm) form, both built through AVX512_maskable.
+// opc - opcode byte.
+// OpcodeStr - mnemonic.
+// OpNode - DAG node matched by both forms.
+// _ - vector type info supplying RC, VT, MemOp and LdFrag.
+// itins - supplies the .rr and .rm itineraries.
+// IsCommutable - forwarded to the rr form only.
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ itins.rr, IsCommutable>,
+ AVX512BIBase, EVEX_4V;
+
+ // The memory operand is matched as a bitconvert of _.LdFrag.
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V;
+}
+
+// Extends avx512_binop_rm (rr and rm forms) with a broadcast-memory form
+// (rmb): the second source is an X86VBroadcast of a scalar load taken through
+// _.ScalarMemOp, the assembly strings append _.BroadcastStr to $src2, and the
+// record carries EVEX_B. The rmb form uses the itins.rm itinerary.
+multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> :
+ avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1,
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+// Instantiates avx512_binop_rm (no broadcast form) for the three widths in
+// VTInfo: Z (info512) under [prd], and Z256 / Z128 under [prd, HasVLX].
+// prd is the feature predicate supplied by the caller.
+multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+// Same width fan-out as avx512_binop_rm_vl, but built on avx512_binop_rmb so
+// that every width also gets the broadcast-memory (rmb) form.
+// Z is guarded by [prd]; Z256 and Z128 by [prd, HasVLX].
+multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+// i64-element wrapper: avx512_binop_rmb_vl over avx512vl_i64_info, with VEX_W
+// and EVEX_CD8<64, CD8VF> applied. Includes the broadcast (rmb) forms.
+multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ itins, prd, IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+// i32-element wrapper: avx512_binop_rmb_vl over avx512vl_i32_info, with
+// EVEX_CD8<32, CD8VF> applied. Includes the broadcast (rmb) forms.
+multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+}
+
+// i16-element wrapper: avx512_binop_rm_vl over avx512vl_i16_info, with
+// EVEX_CD8<16, CD8VF> applied. Uses the non-broadcast multiclass, so only rr
+// and rm forms are produced.
+multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+ itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>;
+}
+
+// i8-element wrapper: avx512_binop_rm_vl over avx512vl_i8_info, with
+// EVEX_CD8<8, CD8VF> applied. Uses the non-broadcast multiclass, so only rr
+// and rm forms are produced.
+multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info,
+ itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>;
+}
+
+// Produces the dword and qword variants of one operation. The Q variant uses
+// opc_q and the mnemonic OpcodeStr with "q" appended; the D variant uses
+// opc_d and OpcodeStr with "d" appended. Both share OpNode, itins and prd.
+multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+ SDNode OpNode, OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
+ IsCommutable>;
+
+ defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
+ IsCommutable>;
+}
+
+// Produces the byte and word variants of one operation. The W variant uses
+// opc_w and the mnemonic OpcodeStr with "w" appended; the B variant uses
+// opc_b and OpcodeStr with "b" appended. Both share OpNode, itins and prd.
+multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
+ SDNode OpNode, OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd,
+ IsCommutable>;
+
+ defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd,
+ IsCommutable>;
+}
+
+// Produces all four element-size variants of one operation: the dword/qword
+// pair is instantiated under HasAVX512 and the byte/word pair under HasBWI.
+multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
+ bits<8> opc_d, bits<8> opc_q,
+ string OpcodeStr, SDNode OpNode,
+ OpndItins itins, bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
+ itins, HasAVX512, IsCommutable>,
+ avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
+ itins, HasBWI, IsCommutable>;
+}
+
+// Two-source integer operation whose source and destination vector types may
+// differ. Produces rr, rm and rmb forms through AVX512_maskable.
+// _Src - type info for both source operands (RC, VT, MemOp, LdFrag).
+// _Dst - type info for the result; also the type passed to AVX512_maskable.
+// _Brdct - type info used for the broadcast form (ScalarMemOp, ScalarLdFrag,
+// BroadcastStr, VT).
+// IsCommutable - forwarded to the rr form only.
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
+ SDNode OpNode,X86VectorVTInfo _Src,
+ X86VectorVTInfo _Dst, X86VectorVTInfo _Brdct,
+ bit IsCommutable = 0> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1","$src1, $src2",
+ (_Dst.VT (OpNode
+ (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2))),
+ itins.rr, IsCommutable>,
+ AVX512BIBase, EVEX_4V;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (bitconvert (_Src.LdFrag addr:$src2)))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V;
+
+ // Broadcast form: the scalar is loaded and broadcast as _Brdct.VT, then
+ // bitconverted before being passed to OpNode as the second source.
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
+ OpcodeStr,
+ "${src2}"##_Brdct.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_Brdct.BroadcastStr,
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Brdct.VT (X86VBroadcast
+ (_Brdct.ScalarLdFrag addr:$src2)))))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+// Packed integer add, subtract, saturating add/subtract, multiply and average
+// instantiations. The trailing 0/1 argument is IsCommutable.
+// NOTE(review): VPMULLD/VPMULLW/VPMULLQ/VPMULHW are given SSE_INTALU_ITINS_P
+// while VPMULHUW/VPMULHRSW are given SSE_INTMUL_ITINS_P - confirm whether the
+// mix of itineraries across the multiplies is intended.
+defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
+ SSE_INTALU_ITINS_P, 1>;
+defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
+ SSE_INTALU_ITINS_P, 0>;
+defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
+ SSE_INTALU_ITINS_P, HasBWI, 0>;
+defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
+ SSE_INTALU_ITINS_P, HasBWI, 0>;
+defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
+ SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD;
+defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P,
+ HasBWI, 1>;
+defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P,
+ HasBWI, 1>;
+defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P,
+ HasBWI, 1>, T8PD;
+defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+
+// Instantiates avx512_binop_rm2 for 512/256/128-bit widths with separate
+// source and destination type families. The broadcast type info is fixed to
+// the i64-element vector of each width (v8i64_info, v4i64x_info, v2i64x_info),
+// and every width carries EVEX_CD8<64, CD8VF> and VEX_W.
+// Z is guarded by [prd]; Z256 and Z128 by [HasVLX, prd].
+multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
+ AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo,
+ SDNode OpNode, Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ _SrcVTInfo.info512, _DstVTInfo.info512,
+ v8i64_info, IsCommutable>,
+ EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
+ let Predicates = [HasVLX, prd] in {
+ defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ _SrcVTInfo.info256, _DstVTInfo.info256,
+ v4i64x_info, IsCommutable>,
+ EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
+ defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ _SrcVTInfo.info128, _DstVTInfo.info128,
+ v2i64x_info, IsCommutable>,
+ EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
+ }
+}
+
+// Instantiations of avx512_binop_all. VPMULDQ and VPMULUDQ take i32-element
+// sources and produce i64-element results; VPMULTISHIFTQB uses i8-element
+// type info for both sources and result and is guarded by HasVBMI.
+// NOTE(review): VPMULDQ uses SSE_INTALU_ITINS_P while VPMULUDQ uses
+// SSE_INTMUL_ITINS_P - confirm whether the difference is intended.
+defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P,
+ avx512vl_i32_info, avx512vl_i64_info,
+ X86pmuldq, HasAVX512, 1>,T8PD;
+defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
+ avx512vl_i32_info, avx512vl_i64_info,
+ X86pmuludq, HasAVX512, 1>;
+defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P,
+ avx512vl_i8_info, avx512vl_i8_info,
+ X86multishift, HasVBMI, 0>, T8PD;
+
+// Broadcast-memory (rmb) form for pack-style operations whose source (_Src)
+// and destination (_Dst) vector types differ. The scalar is loaded and
+// broadcast as _Src.VT, then bitconverted before being passed to OpNode.
+// No itinerary argument is passed to AVX512_maskable here.
+multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _Src, X86VectorVTInfo _Dst> {
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
+ OpcodeStr,
+ "${src2}"##_Src.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_Src.BroadcastStr,
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Src.VT (X86VBroadcast
+ (_Src.ScalarLdFrag addr:$src2))))))>,
+ EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
+}
+
+// Register-register (rr) and register-memory (rm) forms for operations whose
+// source (_Src) and destination (_Dst) vector types differ.
+// The rr form passes NoItinerary explicitly so that IsCommutable can follow
+// it; the rm form passes neither argument.
+multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,X86VectorVTInfo _Src,
+ X86VectorVTInfo _Dst, bit IsCommutable = 0> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1","$src1, $src2",
+ (_Dst.VT (OpNode
+ (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2))),
+ NoItinerary, IsCommutable>,
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (bitconvert (_Src.LdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
+}
+
+// Pack from i32-element sources to i16-element results at 512/256/128 bits.
+// Each width combines avx512_packs_rm (rr, rm) with avx512_packs_rmb (rmb).
+// Z is guarded by [HasBWI]; Z256 and Z128 by [HasBWI, HasVLX].
+multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info,
+ v32i16_info>,
+ avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info,
+ v32i16_info>, EVEX_V512;
+ let Predicates = [HasBWI, HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info,
+ v16i16x_info>,
+ avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info,
+ v16i16x_info>, EVEX_V256;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info,
+ v8i16x_info>,
+ avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info,
+ v8i16x_info>, EVEX_V128;
+ }
+}
+// Pack from i16-element sources to i8-element results at 512/256/128 bits.
+// Only avx512_packs_rm is used, so there is no broadcast (rmb) form.
+// Z is guarded by [HasBWI]; Z256 and Z128 by [HasBWI, HasVLX].
+multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info,
+ v64i8_info>, EVEX_V512;
+ let Predicates = [HasBWI, HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info,
+ v32i8x_info>, EVEX_V256;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info,
+ v16i8x_info>, EVEX_V128;
+ }
+}
+
+// Multiply-add style operations with caller-supplied source (_Src) and
+// destination (_Dst) type families, built on avx512_packs_rm (rr and rm
+// forms only) at 512/256/128 bits.
+// Z is guarded by [HasBWI]; Z256 and Z128 by [HasBWI, HasVLX].
+multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, AVX512VLVectorVTInfo _Src,
+ AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> {
+ let Predicates = [HasBWI] in
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
+ _Dst.info512, IsCommutable>, EVEX_V512;
+ let Predicates = [HasBWI, HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
+ _Dst.info256, IsCommutable>, EVEX_V256;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
+ _Dst.info128, IsCommutable>, EVEX_V128;
+ }
+}
+
+// Pack instantiations. VPACKUSDW is the only one tagged AVX5128IBase; the
+// other three use AVX512BIBase.
+defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, AVX512BIBase;
+defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, AVX5128IBase;
+defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase;
+defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase;
+
+// Multiply-add instantiations: VPMADDUBSW goes from i8 to i16 elements and
+// keeps the default IsCommutable = 0; VPMADDWD goes from i16 to i32 elements
+// and is marked commutable.
+defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
+ avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
+defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
+ avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase;
+
+// Signed/unsigned integer max and min instantiations, all marked commutable.
+// Byte and word variants are guarded by HasBWI; the combined dword/qword
+// variants (VPMAXS, VPMAXU, VPMINS, VPMINU) by HasAVX512 and pass the same
+// opcode for both element sizes.
+
+// Signed max.
+defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
+ SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+// Unsigned max.
+defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
+ SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+// Signed min.
+defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
+ SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+// Unsigned min.
+defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
+ SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+// PMULLQ without VLX: implement the 256-bit and 128-bit i64 multiplies with
+// the 512-bit instruction. Each source is placed in an otherwise-undefined
+// v8i64 with INSERT_SUBREG, VPMULLQZrr is applied, and the original-width
+// result is taken back out with EXTRACT_SUBREG.
+let Predicates = [HasDQI, NoVLX] in {
+ // v4i64: widen through sub_ymm.
+ def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+
+ // v2i64: widen through sub_xmm.
+ def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Logical Instructions
+//===----------------------------------------------------------------------===//
+
+// Bitwise logic operation with register-register (rr) and register-memory
+// (rm) forms, built through AVX512_maskable_logic. Two pattern DAGs are
+// passed for each form: one typed _.i64VT with the operands bitconverted
+// from _.VT, and one typed _.VT that bitconverts the _.i64VT result of
+// OpNode.
+// NOTE(review): which of the two DAGs serves the unmasked versus the masked
+// record is decided inside AVX512_maskable_logic, which is not visible here.
+multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> {
+ defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+ (bitconvert (_.VT _.RC:$src2)))),
+ (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ _.RC:$src2)))),
+ itins.rr, IsCommutable>,
+ AVX512BIBase, EVEX_4V;
+
+ defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+ (bitconvert (_.LdFrag addr:$src2)))),
+ (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)))))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V;
+}
+
+// Extends avx512_logic_rm (rr and rm forms) with a broadcast-memory form
+// (rmb). The second source is an X86VBroadcast of a scalar load typed _.VT
+// and bitconverted; the record carries EVEX_B and uses itins.rm.
+// Unlike the rr/rm forms, the first pattern DAG here passes _.RC:$src1 to
+// OpNode directly rather than through a bitconvert from _.VT.
+multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> :
+ avx512_logic_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+ defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))),
+ (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+// Instantiates avx512_logic_rmb for the three widths in VTInfo: Z (info512)
+// under [prd], and Z256 / Z128 under [prd, HasVLX].
+multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+// i32-element wrapper: avx512_logic_rmb_vl over avx512vl_i32_info, with
+// EVEX_CD8<32, CD8VF> applied.
+multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+}
+
+// i64-element wrapper: avx512_logic_rmb_vl over avx512vl_i64_info, with VEX_W
+// and EVEX_CD8<64, CD8VF> applied.
+multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ itins, prd, IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+// Produces the dword and qword variants of one logic operation. The Q variant
+// uses opc_q and the mnemonic OpcodeStr with "q" appended; the D variant uses
+// opc_d and OpcodeStr with "d" appended.
+multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+ SDNode OpNode, OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
+ IsCommutable>;
+
+ defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
+ IsCommutable>;
+}
+
+// Integer bitwise logic instantiations under HasAVX512. The dword and qword
+// variants share one opcode. AND, OR and XOR are marked commutable; VPANDN
+// (matched from X86andnp) is not.
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+ SSE_INTALU_ITINS_P, HasAVX512, 0>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 FP arithmetic
+//===----------------------------------------------------------------------===//
+// Scalar FP binary operation. Produces four records:
+// rr_Int / rm_Int - built through AVX512_maskable_scalar on the vector
+// register class _.RC and matched from VecNode with
+// FROUND_CURRENT as the third operand.
+// rr / rm - isCodeGenOnly records on the scalar register class
+// _.FRC, matched from OpNode, guarded by HasAVX512.
+// IsCommutable is applied to the rr record only.
+multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode OpNode, SDNode VecNode, OpndItins itins,
+ bit IsCommutable> {
+ let ExeDomain = _.ExeDomain in {
+ defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 FROUND_CURRENT)),
+ itins.rr>;
+
+ // The memory operand is a scalar load wrapped in scalar_to_vector.
+ defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (VecNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT)),
+ itins.rm>;
+ let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+ def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
+ itins.rr> {
+ let isCommutable = IsCommutable;
+ }
+ def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+ }
+ }
+}
+
+// Scalar FP register-register form with an explicit rounding-control operand
+// (rrb). The extra AVX512RC:$rc input is matched as the immediate third
+// operand of VecNode, and the record carries EVEX_B and EVEX_RC.
+// itins.rr and IsCommutable are forwarded to AVX512_maskable_scalar.
+multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode VecNode, OpndItins itins, bit IsCommutable = 0> {
+ let ExeDomain = _.ExeDomain in
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 imm:$rc)), itins.rr, IsCommutable>,
+ EVEX_B, EVEX_RC;
+}
+// Scalar FP register-register {sae} form (rrb), built through
+// AVX512_maskable_scalar and matched from VecNode with FROUND_NO_EXC as the
+// third operand. The record carries EVEX_B.
+// itins.rr is forwarded as the itinerary, as in avx512_fp_scalar_round.
+// IsCommutable is accepted so the signature parallels the other scalar
+// multiclasses, but it is not forwarded to the instruction.
+multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode VecNode, OpndItins itins, bit IsCommutable> {
+ let ExeDomain = _.ExeDomain in
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 FROUND_NO_EXC)), itins.rr>, EVEX_B;
+}
+
+// Scalar single (SSZ) and double (SDZ) variants of an FP operation that
+// supports explicit rounding control. Each combines avx512_fp_scalar with
+// avx512_fp_scalar_round. "ss" / "sd" is appended to OpcodeStr, and itins.s /
+// itins.d select the per-size itineraries.
+multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode VecNode,
+ SizeItins itins, bit IsCommutable> {
+ defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
+ itins.s, IsCommutable>,
+ avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode,
+ itins.s, IsCommutable>,
+ XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
+ itins.d, IsCommutable>,
+ avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode,
+ itins.d, IsCommutable>,
+ XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+}
+
+// Scalar single (SSZ) and double (SDZ) variants of an FP operation that has a
+// {sae} form instead of rounding control. Each combines avx512_fp_scalar with
+// avx512_fp_scalar_sae. "ss" / "sd" is appended to OpcodeStr, and itins.s /
+// itins.d select the per-size itineraries.
+multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode VecNode,
+ SizeItins itins, bit IsCommutable> {
+ defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
+ itins.s, IsCommutable>,
+ avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode,
+ itins.s, IsCommutable>,
+ XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
+ itins.d, IsCommutable>,
+ avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode,
+ itins.d, IsCommutable>,
+ XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+}
+// Scalar FP arithmetic instantiations. Add, mul, sub and div use the
+// rounding-control multiclass; min and max use the {sae} multiclass.
+// The final argument is IsCommutable: set for add and mul only.
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_MUL_ITINS_S, 1>;
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_DIV_ITINS_S, 0>;
+defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 0>;
+defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 0>;
+
+// MIN/MAX nodes are commutable under "unsafe-fp-math". In that case X86fminc
+// and X86fmaxc are used instead of X86fmin and X86fmax.
+//
+// This multiclass produces the isCodeGenOnly scalar rr and rm records on
+// _.FRC for such a commutable node, guarded by HasAVX512. The rr record is
+// unconditionally marked isCommutable.
+multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
+ let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+ def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
+ itins.rr> {
+ let isCommutable = 1;
+ }
+ def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+ }
+}
+// Commutable scalar min/max instantiations, matched from X86fminc / X86fmaxc.
+// They reuse the opcodes and mnemonics of the regular scalar min/max
+// (0x5D "vminss"/"vminsd", 0x5F "vmaxss"/"vmaxsd").
+defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
+ SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+
+defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
+ SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
+
+defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
+ SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+
+defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
+ SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
+
+// Packed FP binary operation with register-register (rr), register-memory
+// (rm) and broadcast-memory (rmb) forms, built through AVX512_maskable.
+// The mnemonic is OpcodeStr with _.Suffix appended. All forms are marked
+// hasSideEffects = 0, and the two memory forms are marked mayLoad = 1.
+// OpNode is an SDPatternOperator, so callers may pass a fragment such as
+// null_frag as well as a plain SDNode.
+// IsCommutable is forwarded to the rr form only.
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable> {
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
+ defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr,
+ IsCommutable>, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), itins.rm>,
+ EVEX_4V;
+ // Broadcast form: the second source is an X86VBroadcast of a scalar load.
+ defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))),
+ itins.rm>, EVEX_4V, EVEX_B;
+ }
+ }
+}
+
+// Packed FP register-register form with an explicit rounding-control operand
+// (rb). The extra AVX512RC:$rc input is matched as the immediate third
+// operand of OpNodeRnd, and the record carries EVEX_B and EVEX_RC.
+// No itinerary argument is passed to AVX512_maskable here.
+multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
+ defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
+ EVEX_4V, EVEX_B, EVEX_RC;
+}
+
+
+// Packed FP register-register {sae} form (rb), matched from OpNodeRnd with
+// FROUND_NO_EXC as the third operand. The record carries EVEX_B.
+// No itinerary argument is passed to AVX512_maskable here.
+multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
+ defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, EVEX_B;
+}
+
+// Packed FP binary operation for single (PS*) and double (PD*) precision at
+// all three vector widths, each built on avx512_fp_packed.
+// The 512-bit records (PSZ, PDZ) are guarded by [prd]; the 128-bit and
+// 256-bit records by [prd, HasVLX]. itins.s / itins.d select the per-size
+// itineraries; the double-precision records also carry VEX_W.
+multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ Predicate prd, SizeItins itins,
+ bit IsCommutable = 0> {
+ let Predicates = [prd] in {
+ defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
+ itins.s, IsCommutable>, EVEX_V512, PS,
+ EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
+ itins.d, IsCommutable>, EVEX_V512, PD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ }
+
+ // Define only if AVX512VL feature is present.
+ let Predicates = [prd, HasVLX] in {
+ defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
+ itins.s, IsCommutable>, EVEX_V128, PS,
+ EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
+ itins.s, IsCommutable>, EVEX_V256, PS,
+ EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
+ itins.d, IsCommutable>, EVEX_V128, PD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
+ itins.d, IsCommutable>, EVEX_V256, PD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ }
+}
+
+// Rounding-control forms of a packed FP operation. Only the 512-bit single
+// (PSZ) and double (PDZ) records are instantiated, and no Predicates are set
+// in this multiclass.
+multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
+ defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
+// {sae} forms of a packed FP operation. Only the 512-bit single (PSZ) and
+// double (PDZ) records are instantiated, and no Predicates are set in this
+// multiclass.
+multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
+ defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
+// Packed FP arithmetic instantiations. Add, mul, sub and div also get
+// rounding-control forms; min and max get {sae} forms instead. Add and mul
+// are marked commutable; sub and div rely on the IsCommutable = 0 default.
+defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512,
+ SSE_ALU_ITINS_P, 1>,
+ avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512,
+ SSE_MUL_ITINS_P, 1>,
+ avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>;
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, SSE_ALU_ITINS_P>,
+ avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, SSE_DIV_ITINS_P>,
+ avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
+ SSE_ALU_ITINS_P, 0>,
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>;
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
+ SSE_ALU_ITINS_P, 0>,
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>;
+// Commutable packed min/max (X86fminc / X86fmaxc). They reuse the vmin/vmax
+// opcodes and mnemonics and are marked isCodeGenOnly.
+let isCodeGenOnly = 1 in {
+ defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
+ SSE_ALU_ITINS_P, 1>;
+ defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512,
+ SSE_ALU_ITINS_P, 1>;
+}
+// FP-domain logic instructions, guarded by HasDQI. They are instantiated with
+// null_frag as the operator, so no selection pattern is attached here.
+defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI,
+ SSE_ALU_ITINS_P, 1>;
+defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI,
+ SSE_ALU_ITINS_P, 0>;
+defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
+ SSE_ALU_ITINS_P, 1>;
+defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
+ SSE_ALU_ITINS_P, 1>;
+
+// Patterns catch floating point selects with bitcasted integer logic ops.
+//
+// For an FP vector type _, these map a vselect over a bitconverted _.i64VT
+// logic operation onto the masked variants of the integer logic instruction
+// whose record name starts with InstrStr (suffixes rrk, rrkz, rmk, rmkz, rmb,
+// rmbk, rmbkz). Patterns selecting against _.RC:$src0 use the "k" variants
+// and pass $src0 first; patterns selecting against _.ImmAllZerosV use the
+// "kz" variants. All patterns are guarded by [prd].
+multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode,
+ X86VectorVTInfo _, Predicate prd> {
+let Predicates = [prd] in {
+ // Masked register-register logical operations.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
+ _.RC:$src1, _.RC:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
+ _.RC:$src2)>;
+ // Masked register-memory logical operations.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (load addr:$src2)))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1,
+ addr:$src2)>;
+ // Register-broadcast logical operations.
+ // The first pattern is unmasked; the next two are the merge-masked and
+ // zero-masked broadcast variants.
+ def : Pat<(_.i64VT (OpNode _.RC:$src1,
+ (bitconvert (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))),
+ (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert
+ (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert (_.VT
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert
+ (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert (_.VT
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2)>;
+}
+}
+
+// Instantiates avx512_fp_logical_lowering for every FP vector type.
+// The D-suffixed instruction is used for f32 vectors and the Q-suffixed one
+// for f64 vectors; 128/256-bit variants require HasVLX, 512-bit HasAVX512.
+multiclass avx512_fp_logical_lowering_sizes<string InstrStr, SDNode OpNode> {
+ defm : avx512_fp_logical_lowering<InstrStr#DZ128, OpNode, v4f32x_info, HasVLX>;
+ defm : avx512_fp_logical_lowering<InstrStr#QZ128, OpNode, v2f64x_info, HasVLX>;
+ defm : avx512_fp_logical_lowering<InstrStr#DZ256, OpNode, v8f32x_info, HasVLX>;
+ defm : avx512_fp_logical_lowering<InstrStr#QZ256, OpNode, v4f64x_info, HasVLX>;
+ defm : avx512_fp_logical_lowering<InstrStr#DZ, OpNode, v16f32_info, HasAVX512>;
+ defm : avx512_fp_logical_lowering<InstrStr#QZ, OpNode, v8f64_info, HasAVX512>;
+}
+
+// Map the and/or/xor/X86andnp nodes onto the VPAND/VPOR/VPXOR/VPANDN
+// instruction families for masked FP selects.
+defm : avx512_fp_logical_lowering_sizes<"VPAND", and>;
+defm : avx512_fp_logical_lowering_sizes<"VPOR", or>;
+defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>;
+defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>;
+
+// Scalar X86fand/X86for/X86fxor/X86fandn on FR64X and FR32X.
+// Every pattern copies both scalar operands into VR128X, applies the 128-bit
+// packed instruction (PD for f64, PS for f32), and copies the result back to
+// the scalar register class. Requires both HasVLX and HasDQI.
+let Predicates = [HasVLX,HasDQI] in {
+ // Use packed logical operations for scalar ops.
+ def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
+ (COPY_TO_REGCLASS (VANDPDZ128rr
+ (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
+ (COPY_TO_REGCLASS (VORPDZ128rr
+ (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
+ (COPY_TO_REGCLASS (VXORPDZ128rr
+ (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
+ (COPY_TO_REGCLASS (VANDNPDZ128rr
+ (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+
+ // f32 counterparts using the PS instructions.
+ def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
+ (COPY_TO_REGCLASS (VANDPSZ128rr
+ (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
+ (COPY_TO_REGCLASS (VORPSZ128rr
+ (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
+ (COPY_TO_REGCLASS (VXORPSZ128rr
+ (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
+ (COPY_TO_REGCLASS (VANDNPSZ128rr
+ (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+}
+
+// Packed maskable forms of an FP operation whose node takes a trailing
+// (i32 FROUND_CURRENT) operand.
+//   rr  - both sources in registers.
+//   rm  - second source loaded with _.LdFrag.
+//   rmb - second source is an X86VBroadcast of a scalar load (adds EVEX_B).
+// The mnemonic is OpcodeStr followed by _.Suffix.
+multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V;
+ defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V;
+ defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
+ EVEX_4V, EVEX_B;
+}
+
+// Scalar maskable forms (rr, rm) of the same kind of FP operation.
+// The rm form loads a scalar with _.ScalarLdFrag and wraps it in
+// scalar_to_vector before passing it to OpNode.
+multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>;
+ defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1,
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT))>;
+}
+
+// All packed and scalar forms of a scalef-style instruction.
+//   opc        - opcode of the packed (PS/PD) forms.
+//   opcScaler  - opcode of the scalar (SS/SD) forms.
+//   OpcodeStr  - mnemonic stem.
+//   OpNode     - node matched by the packed forms.
+//   OpNodeScal - node matched by the scalar forms.
+// The 512-bit packed and the scalar forms also get rounding variants
+// (avx512_fp_round_packed / avx512_fp_scalar_round); the 128/256-bit packed
+// forms do not and are only defined under HasVLX.
+multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
+ defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f32x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, SSE_ALU_ITINS_S.s>,
+ EVEX_4V,EVEX_CD8<32, CD8VT1>;
+ defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f64x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, SSE_ALU_ITINS_S.d>,
+ EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v2f64x_info>,
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f64x_info>,
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ }
+}
+// VSCALEF: packed opcode 0x2C, scalar opcode 0x2D, T8PD prefix/map.
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs>, T8PD;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+// Register-register (rr) and register-memory (rm) forms of a vector test
+// that writes a mask register (_.KRC). Only the rr form is marked
+// commutable; the rm form bitcasts a full-vector load to _.VT.
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let isCommutable = 1 in
+ defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_4V;
+ defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))))>,
+ EVEX_4V,
+ EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+// Broadcast-memory (rmb) form of the vector test: the second operand is an
+// X86VBroadcast of a scalar load and the instruction is tagged EVEX_B.
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))>,
+ EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+// Use the 512-bit version to implement the 128/256-bit forms when VLX is
+// not available (NoVLX).
+// Pattern that widens a narrow vector test to the NAME#Suffix#"Zrr"
+// instruction.
+//   ExtendInfo - type info of the wide vector the operands are inserted into.
+//   _          - type info of the narrow operands and of the mask result.
+//   Suffix     - element-size letter spliced into the instruction name.
+// Both operands are placed at _.SubRegIdx of an IMPLICIT_DEF wide value, and
+// the result is copied to _.KRC.
+// NOTE(review): the lanes that come from IMPLICIT_DEF are not masked off
+// here - confirm that consumers ignore the corresponding result bits.
+multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo,
+ X86VectorVTInfo _, string Suffix> {
+ def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
+ (_.KVT (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME # Suffix # "Zrr")
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src1, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src2, _.SubRegIdx)),
+ _.KRC))>;
+}
+
+// All vector widths of a dword/qword vector test.
+//   Z          - 512-bit forms under HasAVX512.
+//   Z256/Z128  - narrower forms under HasAVX512 + HasVLX.
+//   *_Alt      - under HasAVX512 + NoVLX, patterns that widen the narrow
+//                operation to the 512-bit instruction instead.
+// Suffix is forwarded to avx512_vptest_lowering to name that instruction.
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _, string Suffix> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, _.info256>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ }
+ let Predicates = [HasAVX512, NoVLX] in {
+ defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>;
+ defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>;
+ }
+}
+
+// Dword ("d") and qword ("q") element variants of a vector test; the qword
+// variant additionally sets VEX_W.
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode,
+ avx512vl_i32_info, "D">;
+ defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode,
+ avx512vl_i64_info, "Q">, VEX_W;
+}
+
+// Word ("w") and byte ("b") element variants of a vector test.
+//   opc       - opcode shared by the byte and word forms.
+//   OpcodeStr - mnemonic stem; "w" or "b" is appended per element size.
+//   OpNode    - SDNode matched by every form.
+// Word forms carry VEX_W; byte forms do not. Only rr/rm forms are defined
+// (no broadcast form is instantiated for byte/word elements here).
+multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasBWI] in {
+ defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, v32i16_info>,
+ EVEX_V512, VEX_W;
+ defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, v64i8_info>,
+ EVEX_V512;
+ }
+ let Predicates = [HasVLX, HasBWI] in {
+
+ defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, v16i16x_info>,
+ EVEX_V256, VEX_W;
+ defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, v8i16x_info>,
+ EVEX_V128, VEX_W;
+ defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, v32i8x_info>,
+ EVEX_V256;
+ defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>,
+ EVEX_V128;
+ }
+
+ // Without VLX, widen 128/256-bit operands and use the 512-bit instruction.
+ // The 512-bit byte/word instructions (BZ/WZ above) exist only under HasBWI,
+ // so these patterns must require HasBWI too, not merely HasAVX512.
+ let Predicates = [HasBWI, NoVLX] in {
+ defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">;
+ defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">;
+ defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">;
+ defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">;
+ }
+
+}
+
+// Combines the byte/word forms (opcode opc_wb) and the dword/qword forms
+// (opcode opc_dq) of a vector test under one defm.
+multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
+ SDNode OpNode> :
+ avx512_vptest_wb <opc_wb, OpcodeStr, OpNode>,
+ avx512_vptest_dq<opc_dq, OpcodeStr, OpNode>;
+
+// VPTESTM and VPTESTNM use the same opcodes (0x26 byte/word, 0x27
+// dword/qword) and differ in the prefix class: T8PD versus T8XS.
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD;
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS;
+
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Shift instructions
+//===----------------------------------------------------------------------===//
+// Immediate-operand forms of a vector operation.
+//   ri - source vector in a register.
+//   mi - source vector loaded from memory (_.LdFrag, bitcast to _.VT).
+// Both take a u8imm second operand matched as (i8 imm). ImmFormR/ImmFormM
+// give the encoding format of the register and memory forms respectively.
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, u8imm:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
+ SSE_INTSHIFT_ITINS_P.rr>;
+ defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i8 imm:$src2))),
+ SSE_INTSHIFT_ITINS_P.rm>;
+ }
+}
+
+// Broadcast-memory immediate form (mbi): the source vector is an
+// X86VBroadcast of a scalar load, and the instruction is tagged EVEX_B.
+multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
+ defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
+ "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
+ (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
+ SSE_INTSHIFT_ITINS_P.rm>, EVEX_B;
+}
+
+// Shift forms whose count operand is a 128-bit vector.
+//   SrcVT   - value type given to the VR128X count register in the rr form.
+//   bc_frag - fragment applied to the loadv2i64 count in the rm form.
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> {
+ // src2 is always 128-bit
+ let ExeDomain = _.ExeDomain in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, VR128X:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
+ SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
+ SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
+ EVEX_4V;
+ }
+}
+
+// All vector widths of avx512_shift_rrm. The 512-bit form needs prd; the
+// 256/128-bit forms need prd and HasVLX.
+// The EVEX_CD8 tuple differs per width: CD8VQ for 512-bit, CD8VH for
+// 256-bit, CD8VF for 128-bit (the memory operand is i128mem at every width).
+multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType SrcVT, PatFrag bc_frag,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+ VTInfo.info512>, EVEX_V512,
+ EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+ VTInfo.info256>, EVEX_V256,
+ EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
+ defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+ VTInfo.info128>, EVEX_V128,
+ EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
+ }
+}
+
+// Dword, qword and word variants of a vector-count shift, each with its own
+// opcode (opcd/opcq/opcw). D and Q require HasAVX512, W requires HasBWI;
+// the Q variant also sets VEX_W.
+multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
+ string OpcodeStr, SDNode OpNode> {
+ defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32,
+ avx512vl_i32_info, HasAVX512>;
+ defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64,
+ avx512vl_i64_info, HasAVX512>, VEX_W;
+ defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16,
+ avx512vl_i16_info, HasBWI>;
+}
+
+// All vector widths of the immediate forms (ri, mi) together with the
+// broadcast form (mbi). 512-bit under HasAVX512; 256/128-bit additionally
+// require HasVLX.
+multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasAVX512] in
+ defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info512>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info512>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info256>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info256>, EVEX_V256;
+ defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info128>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info128>, EVEX_V128;
+ }
+}
+
+// Word-element immediate forms (ri, mi only; no broadcast form).
+// 512-bit under HasBWI; 256/128-bit under HasVLX + HasBWI.
+multiclass avx512_shift_rmi_w<bits<8> opcw,
+ Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ v32i16_info>, EVEX_V512;
+ let Predicates = [HasVLX, HasBWI] in {
+ defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ v16i16x_info>, EVEX_V256;
+ defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ v8i16x_info>, EVEX_V128;
+ }
+}
+
+// Dword and qword immediate forms. The qword variant sets VEX_W, which is
+// what separates it from the dword variant when opcd and opcq are equal.
+multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
+ Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode> {
+ defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
+ avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+// Immediate-count shifts and rotates. Instructions that share an opcode are
+// told apart by the MRMnr/MRMnm format argument (MRM2 = vpsrl, MRM6 = vpsll,
+// MRM4 = vpsra, MRM0 = vpror, MRM1 = vprol). Word forms use opcode 0x71.
+defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>,
+ avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>,
+ avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V;
+
+// For vpsra, vpror and vprol the dword and qword forms both use opcode 0x72.
+defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>,
+ avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V;
+
+defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V;
+defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V;
+
+// Shifts whose count is a 128-bit register or memory operand. Arguments are
+// the dword, qword and word opcodes in that order.
+defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
+defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
+defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
+
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+// Two-source vector operation where both sources have type _.VT.
+//   rr - both sources in registers.
+//   rm - second source loaded with _.LdFrag and bitcast to _.VT.
+// Besides the variable shifts, other multiclasses in this file instantiate
+// this one for permutes and byte shuffles.
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
+ SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (_.VT (bitconvert (_.LdFrag addr:$src2))))),
+ SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
+ EVEX_CD8<_.EltSize, CD8VF>;
+ }
+}
+
+// Broadcast-memory (rmb) companion of avx512_var_shift: the second source is
+// an X86VBroadcast of a scalar load and the instruction is tagged EVEX_B.
+multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))),
+ SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+}
+// 512/256/128-bit instances of avx512_var_shift plus its broadcast form.
+// 512-bit under HasAVX512; 256/128-bit additionally require HasVLX.
+multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ }
+}
+
+// Dword ("d") and qword ("q") variants sharing one opcode; the qword
+// variant sets VEX_W.
+multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode,
+ avx512vl_i32_info>;
+ defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode,
+ avx512vl_i64_info>, VEX_W;
+}
+
+// Use the 512-bit version to implement the 128/256-bit forms when VLX is
+// not available (NoVLX).
+// Under HasBWI + NoVLX, select the 512-bit NAME#"WZrr" instruction for 256-
+// and 128-bit OpNode operations. Operands are inserted into IMPLICIT_DEF
+// 512-bit values at sub_ymm / sub_xmm and the result is extracted from the
+// same sub-register.
+multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
+ let Predicates = [HasBWI, NoVLX] in {
+ def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
+ (_.info256.VT _.info256.RC:$src2))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME#"WZrr")
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+
+ def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
+ (_.info128.VT _.info128.RC:$src2))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME#"WZrr")
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+ }
+}
+
+// Word-element variable shift (rr/rm only, no broadcast form), all with
+// VEX_W. 512-bit under HasBWI; 256/128-bit under HasVLX + HasBWI.
+// OpcodeStr is used as given; no element suffix is appended here.
+multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>,
+ EVEX_V512, VEX_W;
+ let Predicates = [HasVLX, HasBWI] in {
+
+ defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>,
+ EVEX_V256, VEX_W;
+ defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>,
+ EVEX_V128, VEX_W;
+ }
+}
+
+// Per-element variable shifts matched from the generic shl/sra/srl nodes.
+// Each defm combines the dword/qword forms, the word forms, and the NoVLX
+// word lowering patterns.
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
+ avx512_var_shift_w<0x12, "vpsllvw", shl>,
+ avx512_var_shift_w_lowering<avx512vl_i16_info, shl>;
+
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
+ avx512_var_shift_w<0x11, "vpsravw", sra>,
+ avx512_var_shift_w_lowering<avx512vl_i16_info, sra>;
+
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
+ avx512_var_shift_w<0x10, "vpsrlvw", srl>,
+ avx512_var_shift_w_lowering<avx512vl_i16_info, srl>;
+// Variable rotates: dword/qword forms only.
+defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
+defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
+
+// Special handling for VPSRAV intrinsics.
+// Patterns that select InstrStr#_.ZSuffix#<form> for the X86vsrav node.
+//   InstrStr - instruction name prefix (before the Z-size suffix).
+//   _        - vector type info.
+//   p        - predicate list guarding all patterns.
+// Forms covered: unmasked rr/rm, merge-masked rrk/rmk (AddedComplexity 20)
+// and zero-masked rrkz/rmkz (AddedComplexity 30).
+multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
+ list<Predicate> p> {
+ let Predicates = p in {
+ def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
+ _.RC:$src2)>;
+ def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
+ _.RC:$src1, addr:$src2)>;
+ // Merge-masking: unselected elements come from $src0.
+ let AddedComplexity = 20 in {
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
+ _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
+ _.KRC:$mask, _.RC:$src1, addr:$src2)>;
+ }
+ // Zero-masking: unselected elements are _.ImmAllZerosV.
+ let AddedComplexity = 30 in {
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
+ _.RC:$src1, _.RC:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
+ _.RC:$src1, addr:$src2)>;
+ }
+ }
+}
+
+// Extends avx512_var_shift_int_lowering with the broadcast-memory forms
+// (rmb, rmbk, rmbkz), where the shift count is an X86VBroadcast of a scalar
+// load. The masked forms use the same AddedComplexity values (20 / 30).
+multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
+ list<Predicate> p> :
+ avx512_var_shift_int_lowering<InstrStr, _, p> {
+ let Predicates = p in {
+ def : Pat<(_.VT (X86vsrav _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
+ _.RC:$src1, addr:$src2)>;
+ let AddedComplexity = 20 in
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
+ _.KRC:$mask, _.RC:$src1, addr:$src2)>;
+ let AddedComplexity = 30 in
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask,
+ _.RC:$src1, addr:$src2)>;
+ }
+}
+
+// X86vsrav lowering per element size. Word forms use the variant without
+// broadcast patterns; dword/qword forms use the _mb variant.
+defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>;
+defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>;
+defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
+
+//===-------------------------------------------------------------------===//
+// 1-src variable permutation VPERMW/D/Q
+//===-------------------------------------------------------------------===//
+// One-source variable permute for dword/qword-sized elements, built on the
+// avx512_var_shift / avx512_var_shift_mb forms (rr, rm, rmb).
+// Only 512-bit (HasAVX512) and 256-bit (HasAVX512 + HasVLX) variants are
+// defined; there is no 128-bit variant.
+multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512;
+ }
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256;
+ }
+}
+
+// Immediate-controlled permute, built on the avx512_shift_rmi /
+// avx512_shift_rmbi forms (ri, mi, mbi). Only 512-bit and 256-bit variants
+// are defined; there is no 128-bit variant.
+multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasAVX512] in
+ defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info512>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info512>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in
+ defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info256>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info256>, EVEX_V256;
+}
+
+// Byte/word variable permute at all three widths (rr/rm only, no broadcast
+// form). prd is the feature predicate for the element size; the 256/128-bit
+// variants additionally require HasVLX.
+multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr,
+ Predicate prd, SDNode OpNode,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [prd] in
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512 ;
+ let Predicates = [HasVLX, prd] in {
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256 ;
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128 ;
+ }
+}
+
+// VPERMW (HasBWI) and VPERMB (HasVBMI) share opcode 0x8D; VPERMW adds VEX_W.
+defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv,
+ avx512vl_i16_info>, VEX_W;
+defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv,
+ avx512vl_i8_info>;
+
+// Variable-index permutes (X86VPermv). Integer forms use opcode 0x36, FP
+// forms 0x16; the 64-bit element forms add VEX_W.
+defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
+ avx512vl_i32_info>;
+defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv,
+ avx512vl_i64_info>, VEX_W;
+defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv,
+ avx512vl_f32_info>;
+defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv,
+ avx512vl_f64_info>, VEX_W;
+
+// Immediate-index permutes (X86VPermi) for 64-bit elements only.
+defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
+ X86VPermi, avx512vl_i64_info>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
+ X86VPermi, avx512vl_f64_info>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERMIL
+//===----------------------------------------------------------------------===//
+
+// Variable-control in-lane permute forms (rr, rm, rmb).
+//   _    - type info of the data vector and result.
+//   Ctrl - type info of the control vector (register class, memory operand,
+//          load fragments and value type of $src2).
+// Note that the rmb form takes its memory operand class and broadcast string
+// from _, but its load fragment and value type from Ctrl.
+multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86VectorVTInfo Ctrl> {
+ defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (Ctrl.VT Ctrl.RC:$src2)))>,
+ T8PD, EVEX_4V;
+ defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (X86VBroadcast
+ (Ctrl.ScalarLdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+// Instantiates avx512_permil_vec with the X86VPermilpv node at every vector
+// width. The 512-bit form needs HasAVX512; the 128/256-bit forms also need
+// HasVLX. _ describes the data vectors, Ctrl the control vectors.
+multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
+ AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv,
+ _.info512, Ctrl.info512>,
+ EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv,
+ _.info128, Ctrl.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv,
+ _.info256, Ctrl.info256>,
+ EVEX_V256;
+ }
+}
+
+// Both flavours of an in-lane permute under the same NAME:
+//   - variable-control forms (opcode OpcVar) via avx512_permil_vec_common;
+//   - immediate forms (opcode OpcImm, node X86VPermilpi) via
+//     avx512_shift_rmi_sizes.
+multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
+ AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+
+ defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>;
+ defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
+ X86VPermilpi, _>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
+}
+
+// VPERMILPS/VPERMILPD. Arguments are the immediate-form opcode, then the
+// variable-form opcode; the control vector type is the same-width integer
+// type. The PD variant adds VEX_W.
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
+ avx512vl_i32_info>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
+ avx512vl_i64_info>, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
+//===----------------------------------------------------------------------===//
+
+// Immediate shuffles. All three share opcode 0x70 and differ in the prefix
+// base class (AVX512BIi8Base / AVX512XSIi8Base / AVX512XDIi8Base).
+// VPSHUFD reuses the dword immediate multiclass; VPSHUFH/VPSHUFL reuse the
+// word one.
+defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
+ X86PShufd, avx512vl_i32_info>,
+ EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
+defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
+ X86PShufhw>, EVEX, AVX512XSIi8Base;
+defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
+ X86PShuflw>, EVEX, AVX512XDIi8Base;
+
+// Byte shuffle at all three widths, built on avx512_var_shift (rr/rm).
+// 512-bit under HasBWI; 256/128-bit under HasVLX + HasBWI.
+multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512;
+
+ let Predicates = [HasVLX, HasBWI] in {
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, v32i8x_info>, EVEX_V256;
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, v16i8x_info>, EVEX_V128;
+ }
+}
+
+// VPSHUFB: opcode 0x00, matched from X86pshufb.
+defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>;
+
+//===----------------------------------------------------------------------===//
+// Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+// Register-register VMOVLHPS (opcode 0x16) and VMOVHLPS (opcode 0x12) on
+// VR128X, matched for v4f32 from X86Movlhps / X86Movhlps.
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+
+// Integer-typed X86Movlhps/X86Movhlps nodes reuse the FP instructions.
+// X86Movlhps is covered for v4i32 and v2i64, X86Movhlps for v4i32 only.
+let Predicates = [HasAVX512] in {
+ // MOVLHPS patterns
+ def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+ (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+ (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+ // MOVHLPS patterns
+ def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+ (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VMOVHPS/PD VMOVLPS Instructions
+// All patterns were taken from the SSE implementation.
+//===----------------------------------------------------------------------===//
+// Load form (rm) of a move-high/move-low instruction: combines register
+// $src1 with a 64-bit memory operand. The pattern models the memory operand
+// as an f64 load placed in a v2f64 by scalar_to_vector and bitcast to _.VT.
+multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,
+ (OpNode _.RC:$src1,
+ (_.VT (bitconvert
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+}
+
+// Load forms. High moves use opcode 0x16, low moves 0x12. PS variants use
+// EVEX_CD8<32, CD8VT2> and PS; PD variants use EVEX_CD8<64, CD8VT1>, PD and
+// VEX_W.
+defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd,
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps,
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd,
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+
+// Additional DAG shapes that select the load forms defined above: the same
+// operations expressed with i64 loads, X86vzload, X86Unpckl, plain vector
+// loads, or X86Movsd.
+let Predicates = [HasAVX512] in {
+ // VMOVHPS patterns
+ def : Pat<(X86Movlhps VR128X:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128X:$src1,
+ (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
+ // VMOVHPD patterns
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
+ (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+ (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+ // VMOVLPS patterns
+ def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))),
+ (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))),
+ (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
+ // VMOVLPD patterns
+ def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+}
+
+// Store forms (MRMDestMem): each writes one f64 taken from $src to the 64-bit
+// memory operand $dst. The vmovhps/vmovhpd records (opcode 0x17) store
+// element 0 of (X86Unpckh $src, $src); the vmovlps/vmovlpd records (opcode
+// 0x13) store element 0 of $src directly. The PS records view the v4f32
+// source as v2f64 through bc_v2f64; the PD records add VEX_W.
+def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
+ (bc_v2f64 (v4f32 VR128X:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128X:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+
+// Extra selection patterns for the Z128 store forms. The VMOVLPS/VMOVLPD
+// patterns match a load / X86Movlps-or-X86Movlpd / store sequence whose load
+// and store use the same address ($src1) and replace it with a single store.
+let Predicates = [HasAVX512] in {
+ // VMOVHPD patterns
+ def : Pat<(store (f64 (extractelt
+ (v2f64 (X86VPermilpi VR128X:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDZ128mr addr:$dst, VR128X:$src)>;
+ // VMOVLPS patterns
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)),
+ addr:$src1),
+ (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1),
+ (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
+ // VMOVLPD patterns
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
+ addr:$src1),
+ (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
+ addr:$src1),
+ (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
+}
+//===----------------------------------------------------------------------===//
+// FMA - Fused Multiply Operations
+//
+
+// Packed FMA, "213" form: register (r), full-vector memory (m) and
+// scalar-broadcast memory (mb) variants, each expanded by
+// AVX512_maskable_3src. $src1 is tied to $dst; the selection patterns call
+// OpNode with the operands in the order ($src2, $src1, $src3).
+// Suff is only used to build the instruction name in the extra Pat<> below.
+// NOTE(review): the trailing `1, 1` / `1, 0` arguments are interpreted by
+// AVX512_maskable_3src, which is defined outside this section - confirm their
+// meaning there.
+multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
+ AVX512FMA3Base;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
+ AVX512FMA3Base;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (OpNode _.RC:$src2,
+ _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
+ AVX512FMA3Base, EVEX_B;
+ }
+
+ // Additional pattern for folding broadcast nodes in other orders.
+ // Here OpNode sees ($src1, $src2, broadcast) and the vselect falls back to
+ // $src1; the result is the `mbk` record of the matching vector width.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1, _.RC:$src2,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
+}
+
+// Register form of the "213" packed FMA with an explicit rounding-control
+// operand ($rc, AVX512RC). The pattern passes ($src2, $src1, $src3) plus
+// (i32 imm:$rc) to OpNode; the record is tagged EVEX_B and EVEX_RC.
+// Suff is accepted for signature parity but not used in this multiclass.
+multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC;
+}
+
+// Instantiates the "213" packed FMA for all three vector widths. The 512-bit
+// records (Z) need HasAVX512 and also get the rounding-control form; the
+// 256-bit and 128-bit records (Z256/Z128) need HasVLX as well and have no
+// rounding-control form.
+multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
+ string Suff> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
+ avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512,
+ Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+// Creates the single-precision (PS, mnemonic suffix "ps") and double-precision
+// (PD, mnemonic suffix "pd", VEX_W) families of a "213" packed FMA.
+multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+// "213" packed FMA instructions: opcode, mnemonic stem, DAG node for the
+// default-rounding forms and DAG node for the explicit-rounding form.
+defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>;
+
+
+// Packed FMA, "231" form: register (r), full-vector memory (m) and
+// scalar-broadcast memory (mb) variants, each expanded by
+// AVX512_maskable_3src. $src1 is tied to $dst; the selection patterns call
+// OpNode with the operands in the order ($src2, $src3, $src1).
+// Suff is only used to build instruction names in the extra Pat<> records.
+// NOTE(review): the trailing `1, 1` / `1, 0` arguments are interpreted by
+// AVX512_maskable_3src, which is defined outside this section - confirm their
+// meaning there.
+multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
+ AVX512FMA3Base;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
+ AVX512FMA3Base;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src2,
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B;
+ }
+
+ // Additional patterns for folding broadcast nodes in other orders.
+ // In all three the broadcast is the first OpNode operand instead of the
+ // second. They cover the unmasked (mb), vselect-with-$src1 (mbk) and
+ // vselect-with-zero (mbkz) records respectively.
+ def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mb) _.RC:$src1,
+ _.RC:$src2, addr:$src3)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbkz) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
+}
+
+// Register form of the "231" packed FMA with an explicit rounding-control
+// operand ($rc, AVX512RC). The pattern passes ($src2, $src3, $src1) plus
+// (i32 imm:$rc) to OpNode; the record is tagged EVEX_B and EVEX_RC.
+// Suff is accepted for signature parity but not used in this multiclass.
+multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC;
+}
+
+// Instantiates the "231" packed FMA for all three vector widths. The 512-bit
+// records (Z) need HasAVX512 and also get the rounding-control form; the
+// 256-bit and 128-bit records (Z256/Z128) need HasVLX as well and have no
+// rounding-control form.
+multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
+ string Suff> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
+ avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512,
+ Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+// Creates the single-precision (PS, mnemonic suffix "ps") and double-precision
+// (PD, mnemonic suffix "pd", VEX_W) families of a "231" packed FMA.
+multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+// "231" packed FMA instructions: opcode, mnemonic stem, DAG node for the
+// default-rounding forms and DAG node for the explicit-rounding form.
+defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
+
+// Packed FMA, "132" form: register (r), full-vector memory (m) and
+// scalar-broadcast memory (mb) variants, each expanded by
+// AVX512_maskable_3src. $src1 is tied to $dst; the selection patterns call
+// OpNode with the operands in the order ($src1, $src3, $src2).
+// Suff is only used to build the instruction name in the extra Pat<> below.
+// NOTE(review): the trailing `1, 1` / `1, 0` arguments are interpreted by
+// AVX512_maskable_3src, which is defined outside this section - confirm their
+// meaning there.
+multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
+ AVX512FMA3Base;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src3), _.RC:$src2)), 1, 0>,
+ AVX512FMA3Base;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1,
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B;
+ }
+
+ // Additional patterns for folding broadcast nodes in other orders.
+ // Here the broadcast is the first OpNode operand and the vselect falls back
+ // to $src1; the result is the `mbk` record of the matching vector width.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
+}
+
+// Register form of the "132" packed FMA with an explicit rounding-control
+// operand ($rc, AVX512RC). The pattern passes ($src1, $src3, $src2) plus
+// (i32 imm:$rc) to OpNode; the record is tagged EVEX_B and EVEX_RC.
+// Suff is accepted for signature parity but not used in this multiclass.
+multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC;
+}
+
+// Instantiates the "132" packed FMA for all three vector widths. The 512-bit
+// records (Z) need HasAVX512 and also get the rounding-control form; the
+// 256-bit and 128-bit records (Z256/Z128) need HasVLX as well and have no
+// rounding-control form.
+multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
+ string Suff> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
+ avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512,
+ Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+// Creates the single-precision (PS, mnemonic suffix "ps") and double-precision
+// (PD, mnemonic suffix "pd", VEX_W) families of a "132" packed FMA.
+multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+// "132" packed FMA instructions: opcode, mnemonic stem, DAG node for the
+// default-rounding forms and DAG node for the explicit-rounding form.
+defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>;
+
+// Scalar FMA
+// Common skeleton for one scalar FMA instruction. The caller supplies the
+// selection DAGs:
+//   RHS_VEC_r / RHS_VEC_m / RHS_VEC_rb - patterns for the vector-register
+//     (`_Int`) records: register, scalar-memory and rounding-control forms.
+//   RHS_r / RHS_m - complete `set` patterns for the codegen-only records that
+//     operate on the scalar register class _.FRC.
+// Every record in the multiclass has $src1 tied to $dst.
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb,
+ dag RHS_r, dag RHS_m > {
+ defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base;
+
+ defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base;
+
+ defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb, 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC;
+
+ // Scalar-register forms: $src1 is an explicit input here, and both records
+ // are marked isCodeGenOnly and isCommutable.
+ let isCodeGenOnly = 1, isCommutable = 1 in {
+ def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [RHS_r]>;
+ def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [RHS_m]>;
+ }// isCodeGenOnly = 1
+}
+}// Constraints = "$src1 = $dst"
+
+// Builds the 213, 231 and 132 variants of one scalar FMA for a single element
+// type. Record names are NAME # {213,231,132} # SUFF # "Z" and the mnemonic is
+// OpcodeStr # {"213","231","132"} # _.Suffix.
+// The vector (`_Int`) patterns use OpNodeRnds1 for 213 and 132 and
+// OpNodeRnds3 for 231; the non-rounding forms pass (i32 FROUND_CURRENT) and
+// the rounding form passes (i32 imm:$rc). The scalar-register patterns use
+// OpNode on _.FRC operands.
+multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+ SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
+
+ defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
+ // Operands for intrinsic are in 123 order to preserve passthru
+ // semantics.
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
+ (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
+ (i32 imm:$rc))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
+ _.FRC:$src3))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src3))))>;
+
+ defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
+ (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds3 _.RC:$src2,
+ (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1, (i32 FROUND_CURRENT))),
+ (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
+ (i32 imm:$rc))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
+ _.FRC:$src1))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
+
+ defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds1 _.RC:$src1,
+ (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src2, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
+ (i32 imm:$rc))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
+ _.FRC:$src2))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>;
+}
+
+// Instantiates a scalar FMA for both element types under HasAVX512:
+// f32 (suffix "SS", EVEX_CD8<32, CD8VT1>) and f64 (suffix "SD",
+// EVEX_CD8<64, CD8VT1>, VEX_W). Both are tagged VEX_LIG.
+multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+ SDNode OpNodeRnds3> {
+ let Predicates = [HasAVX512] in {
+ defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+ OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+ EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+ OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+ }
+}
+
+// Scalar FMA instructions. The three opcodes are given in the order
+// 213, 231, 132, followed by the mnemonic stem and the DAG nodes used for the
+// scalar-register and vector (`_Int`) patterns.
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
+ X86FmaddRnds3>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
+ X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
+ X86FnmaddRnds1, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
+ X86FnmsubRnds1, X86FnmsubRnds3>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
+//===----------------------------------------------------------------------===//
+// Register (r), full-vector memory (m) and scalar-broadcast memory (mb) forms
+// of a 52-bit integer multiply-add. $src1 is tied to $dst, and every pattern
+// calls OpNode with the operands in the order ($src1, $src2, $src3).
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ AVX512FMA3Base;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
+ AVX512FMA3Base;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (OpNode _.RC:$src1,
+ _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+ AVX512FMA3Base, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+// Instantiates avx512_pmadd52_rm for all three vector widths: the 512-bit
+// records (Z) need HasIFMA, the 256-bit and 128-bit records (Z256/Z128) need
+// HasVLX and HasIFMA.
+multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasIFMA] in {
+ defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasIFMA] in {
+ defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+// vpmadd52luq (opcode 0xb4) and vpmadd52huq (opcode 0xb5), both on 64-bit
+// integer element vectors (avx512vl_i64_info) with VEX_W set.
+defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l,
+ avx512vl_i64_info>, VEX_W;
+defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
+ avx512vl_i64_info>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Scalar convert from signed integer to float/double
+//===----------------------------------------------------------------------===//
+
+// Scalar integer-to-floating-point conversion, default rounding.
+//   rr / rm         - operate on the scalar register class DstVT.FRC and carry
+//                     no selection pattern of their own (hasSideEffects = 0;
+//                     rm is also mayLoad).
+//   rr_Int / rm_Int - codegen-only records on the vector register class
+//                     DstVT.RC, selected for OpNode with
+//                     (i32 FROUND_CURRENT) as the rounding operand.
+// SrcRC is the integer register class, x86memop / ld_frag describe the
+// integer memory operand and how it is loaded.
+multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+ X86VectorVTInfo DstVT, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm> {
+ let hasSideEffects = 0 in {
+ def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
+ (ins DstVT.FRC:$src1, SrcRC:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
+ (ins DstVT.FRC:$src1, x86memop:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V;
+ } // hasSideEffects = 0
+ let isCodeGenOnly = 1 in {
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, SrcRC:$src2),
+ !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ SrcRC:$src2,
+ (i32 FROUND_CURRENT)))]>, EVEX_4V;
+
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, x86memop:$src2),
+ !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ (ld_frag addr:$src2),
+ (i32 FROUND_CURRENT)))]>, EVEX_4V;
+ }//isCodeGenOnly = 1
+}
+
+// Register form of the scalar integer-to-floating-point conversion with an
+// explicit rounding-control operand ($rc, AVX512RC). The pattern passes
+// (i32 imm:$rc) to OpNode; the record is tagged EVEX_B and EVEX_RC.
+multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+ X86VectorVTInfo DstVT, string asm> {
+ def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
+ !strconcat(asm,
+ "\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ SrcRC:$src2,
+ (i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC;
+}
+
+// Combines the rounding-control record (avx512_vcvtsi_round) with the
+// default-rounding records (avx512_vcvtsi) and tags all of them VEX_LIG.
+multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+ X86VectorVTInfo DstVT, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm> {
+ defm NAME : avx512_vcvtsi_round<opc, OpNode, SrcRC, DstVT, asm>,
+ avx512_vcvtsi<opc, OpNode, SrcRC, DstVT, x86memop, ld_frag, asm>,
+ VEX_LIG;
+}
+
+// Scalar integer-to-floating-point conversions and their selection patterns,
+// all gated on HasAVX512. Signed conversions use opcode 0x2A and
+// X86SintToFpRnd; unsigned conversions use opcode 0x7B and X86UintToFpRnd.
+// The 64-bit integer source variants add VEX_W.
+let Predicates = [HasAVX512] in {
+defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+ v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
+ XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+ v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
+ XD, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+// Suffix-less assembler aliases for the 32-bit memory forms.
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+
+// sint_to_fp from memory: the pass-through operand is an IMPLICIT_DEF.
+def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+// sint_to_fp from a general-purpose register.
+def : Pat<(f32 (sint_to_fp GR32:$src)),
+ (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (sint_to_fp GR64:$src)),
+ (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (sint_to_fp GR32:$src)),
+ (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (sint_to_fp GR64:$src)),
+ (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+// Unsigned variants. VCVTUSI2SDZ is built from avx512_vcvtsi directly (with an
+// explicit VEX_LIG), so unlike its siblings it gets no rrb_Int record.
+// NOTE(review): presumably because every 32-bit unsigned value converts to f64
+// exactly, making rounding control meaningless - confirm.
+defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32,
+ v4f32x_info, i32mem, loadi32,
+ "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
+ XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info,
+ i32mem, loadi32, "cvtusi2sd{l}">,
+ XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+// Suffix-less assembler aliases for the 32-bit memory forms.
+def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+
+// uint_to_fp from memory: the pass-through operand is an IMPLICIT_DEF.
+def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
+ (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
+ (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))),
+ (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))),
+ (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+// uint_to_fp from a general-purpose register.
+def : Pat<(f32 (uint_to_fp GR32:$src)),
+ (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (uint_to_fp GR64:$src)),
+ (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (uint_to_fp GR32:$src)),
+ (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (uint_to_fp GR64:$src)),
+ (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Scalar convert from float/double to integer
+//===----------------------------------------------------------------------===//
+// Scalar floating-point-to-integer conversion (non-truncating), gated on
+// HasAVX512. The source is the vector register class SrcVT.RC.
+//   rr - register source, OpNode called with (i32 FROUND_CURRENT).
+//   rb - register source plus explicit rounding control ($rc), tagged EVEX_B
+//        and EVEX_RC.
+//   rm - scalar memory source, loaded and widened with scalar_to_vector,
+//        OpNode called with (i32 FROUND_CURRENT).
+multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT ,
+ X86VectorVTInfo DstVT, SDNode OpNode, string asm> {
+ let Predicates = [HasAVX512] in {
+ def rr : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG;
+ def rb : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
+ !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
+ EVEX, VEX_LIG, EVEX_B, EVEX_RC;
+ def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode
+ (SrcVT.VT (scalar_to_vector (SrcVT.ScalarLdFrag addr:$src))),
+ (i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG;
+ } // Predicates = [HasAVX512]
+}
+
+// Convert float/double to signed/unsigned int 32/64
+// Signed conversions use opcode 0x2D with X86cvts2si; unsigned conversions use
+// opcode 0x79 with X86cvts2usi. The i64 destination variants add VEX_W.
+defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,
+ X86cvts2si, "cvtss2si">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info,
+ X86cvts2si, "cvtss2si">,
+ XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info,
+ X86cvts2usi, "cvtss2usi">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info,
+ X86cvts2usi, "cvtss2usi">, XS, VEX_W,
+ EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info,
+ X86cvts2si, "cvtsd2si">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info,
+ X86cvts2si, "cvtsd2si">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info,
+ X86cvts2usi, "cvtsd2usi">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
+ X86cvts2usi, "cvtsd2usi">, XD, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+
+// The SSE versions of these instructions are disabled for AVX512.
+// Therefore, the SSE intrinsics are mapped to the AVX512 instructions.
+// Map the SSE/SSE2 cvtss2si / cvtsd2si intrinsics (register and folded-load
+// operands, 32-bit and 64-bit results) onto the EVEX-encoded `rr` / `rm`
+// records.
+let Predicates = [HasAVX512] in {
+ def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))),
+ (VCVTSS2SIZrr VR128X:$src)>;
+ def : Pat<(i32 (int_x86_sse_cvtss2si (sse_load_f32 addr:$src))),
+ (VCVTSS2SIZrm addr:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))),
+ (VCVTSS2SI64Zrr VR128X:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvtss2si64 (sse_load_f32 addr:$src))),
+ (VCVTSS2SI64Zrm addr:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))),
+ (VCVTSD2SIZrr VR128X:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvtsd2si (sse_load_f64 addr:$src))),
+ (VCVTSD2SIZrm addr:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))),
+ (VCVTSD2SI64Zrr VR128X:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (sse_load_f64 addr:$src))),
+ (VCVTSD2SI64Zrm addr:$src)>;
+} // HasAVX512
+
+// Map the SSE/SSE2 cvtsi2ss / cvtsi2sd intrinsics and the AVX-512 cvtusi2sd
+// intrinsic (register and folded-load integer operands) onto the vector
+// `rr_Int` / `rm_Int` records.
+let Predicates = [HasAVX512] in {
+ def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, GR32:$src2),
+ (VCVTSI2SSZrr_Int VR128X:$src1, GR32:$src2)>;
+ def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, (loadi32 addr:$src2)),
+ (VCVTSI2SSZrm_Int VR128X:$src1, addr:$src2)>;
+ def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, GR64:$src2),
+ (VCVTSI642SSZrr_Int VR128X:$src1, GR64:$src2)>;
+ def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, (loadi64 addr:$src2)),
+ (VCVTSI642SSZrm_Int VR128X:$src1, addr:$src2)>;
+ def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, GR32:$src2),
+ (VCVTSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
+ def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, (loadi32 addr:$src2)),
+ (VCVTSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
+ def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, GR64:$src2),
+ (VCVTSI642SDZrr_Int VR128X:$src1, GR64:$src2)>;
+ def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, (loadi64 addr:$src2)),
+ (VCVTSI642SDZrm_Int VR128X:$src1, addr:$src2)>;
+ def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, GR32:$src2),
+ (VCVTUSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
+ def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, (loadi32 addr:$src2)),
+ (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
+} // Predicates = [HasAVX512]
+
+// Convert float/double to signed/unsigned int 32/64 with truncation
+// Truncating scalar floating-point-to-integer conversion, gated on HasAVX512.
+//   rr / rm   - scalar register class (_SrcRC.FRC) or scalar memory source,
+//               selected for OpNode.
+//   rb        - {sae} register form with no selection pattern
+//               (hasSideEffects = 0), tagged EVEX_B.
+//   InstAlias - the same three records under the mnemonic asm # aliasStr.
+//   rr_Int / rb_Int / rm_Int - codegen-only records on the vector register
+//               class _SrcRC.RC. rr_Int and rb_Int are selected for OpNodeRnd
+//               with FROUND_CURRENT and FROUND_NO_EXC respectively; rm_Int
+//               carries no pattern (mayLoad = 1, hasSideEffects = 0).
+multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
+ X86VectorVTInfo _DstRC, SDNode OpNode,
+ SDNode OpNodeRnd, string aliasStr>{
+let Predicates = [HasAVX512] in {
+ def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX;
+ let hasSideEffects = 0 in
+ def rb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ []>, EVEX, EVEX_B;
+ def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
+ EVEX;
+
+ def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
+ def : InstAlias<asm # aliasStr # "\t\t{{sae}, $src, $dst|$dst, $src, {sae}}",
+ (!cast<Instruction>(NAME # "rb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
+ def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm") _DstRC.RC:$dst,
+ _SrcRC.ScalarMemOp:$src), 0>;
+
+ let isCodeGenOnly = 1 in {
+ def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+ (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
+ def rb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+ (i32 FROUND_NO_EXC)))]>,
+ EVEX,VEX_LIG , EVEX_B;
+ let mayLoad = 1, hasSideEffects = 0 in
+ def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
+ (ins _SrcRC.MemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ []>, EVEX, VEX_LIG;
+
+ } // isCodeGenOnly = 1
+} //HasAVX512
+}
+
+
+// Truncating conversions: opcode 0x2C with fp_to_sint / X86cvtts2IntRnd for
+// signed results, opcode 0x78 with fp_to_uint / X86cvtts2UIntRnd for unsigned
+// results. The i64 destination variants add VEX_W and use the "{q}" alias
+// suffix; the i32 variants use "{l}".
+defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
+ fp_to_sint, X86cvtts2IntRnd, "{l}">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
+ fp_to_sint, X86cvtts2IntRnd, "{q}">,
+ VEX_W, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
+ fp_to_sint, X86cvtts2IntRnd, "{l}">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
+ fp_to_sint, X86cvtts2IntRnd, "{q}">,
+ VEX_W, XD, EVEX_CD8<64, CD8VT1>;
+
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
+ fp_to_uint, X86cvtts2UIntRnd, "{l}">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
+ fp_to_uint, X86cvtts2UIntRnd, "{q}">,
+ XS,VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
+ fp_to_uint, X86cvtts2UIntRnd, "{l}">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
+ fp_to_uint, X86cvtts2UIntRnd, "{q}">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+// With AVX-512 available, select the codegen-only *_Int register (rr_Int) and
+// memory (rm_Int) forms for the SSE/SSE2 truncating-convert intrinsics. The
+// memory variants match a scalar load through sse_load_f32 / sse_load_f64.
+let Predicates = [HasAVX512] in {
+ def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
+ (VCVTTSS2SIZrr_Int VR128X:$src)>;
+ def : Pat<(i32 (int_x86_sse_cvttss2si (sse_load_f32 addr:$src))),
+ (VCVTTSS2SIZrm_Int addr:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
+ (VCVTTSS2SI64Zrr_Int VR128X:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvttss2si64 (sse_load_f32 addr:$src))),
+ (VCVTTSS2SI64Zrm_Int addr:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
+ (VCVTTSD2SIZrr_Int VR128X:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvttsd2si (sse_load_f64 addr:$src))),
+ (VCVTTSD2SIZrm_Int addr:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
+ (VCVTTSD2SI64Zrr_Int VR128X:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (sse_load_f64 addr:$src))),
+ (VCVTTSD2SI64Zrm_Int addr:$src)>;
+} // HasAVX512
+//===----------------------------------------------------------------------===//
+// AVX-512 Convert from float to double and back
+//===----------------------------------------------------------------------===//
+// Scalar FP-to-FP conversion using the current rounding mode.
+//   opc       - opcode byte
+//   OpcodeStr - assembly mnemonic
+//   _         - type info of the destination (and of $src1)
+//   _Src      - type info of the converted source operand ($src2)
+//   OpNode    - node matched with (i32 FROUND_CURRENT) as its last operand
+// Defines a maskable register form (rr) and a maskable memory form (rm).
+multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode> {
+ defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_Src.VT _Src.RC:$src2),
+ (i32 FROUND_CURRENT)))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+ // $src1 is declared with the destination register class so that the operand
+ // list agrees with the rr form and with the (_.VT _.RC:$src1) use in the
+ // pattern below; only $src2 comes from the source type.
+ defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_Src.VT (scalar_to_vector
+ (_Src.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT)))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// Scalar Conversion with SAE - suppress all exceptions
+// Defines a single maskable register form (rrb) whose pattern passes
+// (i32 FROUND_NO_EXC) to OpNodeRnd and whose assembly string carries "{sae}".
+// The form is tagged EVEX_B.
+multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeRnd (_.VT _.RC:$src1),
+ (_Src.VT _Src.RC:$src2),
+ (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, VEX_LIG, EVEX_B;
+}
+
+// Scalar Conversion with rounding control (RC)
+// Defines a single maskable register form (rrb) that takes the rounding mode
+// as an extra AVX512RC:$rc operand and forwards it to OpNodeRnd as
+// (i32 imm:$rc). The form is tagged EVEX_B and EVEX_RC.
+multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ // All operands are registers (MRMSrcReg), so use the register scheduling
+ // class that avx512_cvt_fp_scalar's rr form uses, not the load class.
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (_.VT (OpNodeRnd (_.VT _.RC:$src1),
+ (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>,
+ EVEX_B, EVEX_RC;
+}
+// Double-to-single scalar conversion: combines the default-rounding forms
+// (avx512_cvt_fp_scalar) with the rounding-control form
+// (avx512_cvt_fp_rc_scalar) under the "Z" suffix, guarded by HasAVX512.
+// Note the parameter order here is (_src, _dst) while the inner multiclasses
+// take (destination, source).
+multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
+ SDNode OpNodeRnd, X86VectorVTInfo _src,
+ X86VectorVTInfo _dst> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+ avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
+ OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
+ }
+}
+
+// Single-to-double scalar conversion: combines the default-rounding forms
+// (avx512_cvt_fp_scalar) with the SAE form (avx512_cvt_fp_sae_scalar) under
+// the "Z" suffix, guarded by HasAVX512. Unlike the sd2ss wrapper, no
+// rounding-control form is instantiated here.
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
+ SDNode OpNodeRnd, X86VectorVTInfo _src,
+ X86VectorVTInfo _dst> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+ EVEX_CD8<32, CD8VT1>, XS;
+ }
+}
+// Both scalar FP-to-FP conversions share opcode 0x5A; the wrappers above
+// supply the differing prefix and EVEX_CD8 settings.
+defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
+ X86froundRnd, f64x_info, f32x_info>;
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
+ X86fpextRnd,f32x_info, f64x_info >;
+
+// Generic scalar fpextend / fpround selection onto the AVX-512 instructions.
+// The register patterns copy the FR32X/FR64X source into VR128X and pass it as
+// both $src1 and $src2; the load patterns use IMPLICIT_DEF for $src1.
+def : Pat<(f64 (fpextend FR32X:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
+ Requires<[HasAVX512]>;
+def : Pat<(f64 (fpextend (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ Requires<[HasAVX512]>;
+
+// extloadf32 -> f64: fold the load into the convert when optimizing for size.
+def : Pat<(f64 (extloadf32 addr:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ Requires<[HasAVX512, OptForSize]>;
+
+// extloadf32 -> f64: when optimizing for speed, load with VMOVSSZrm first and
+// then use the register form of the convert.
+def : Pat<(f64 (extloadf32 addr:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
+ Requires<[HasAVX512, OptForSpeed]>;
+
+def : Pat<(f32 (fpround FR64X:$src)),
+ (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
+ Requires<[HasAVX512]>;
+//===----------------------------------------------------------------------===//
+// AVX-512 Vector convert from signed/unsigned integer to float/double
+// and from float/double to signed/unsigned integer
+//===----------------------------------------------------------------------===//
+
+// Common building block for the packed conversions in this section.
+//   opc       - opcode byte
+//   OpcodeStr - assembly mnemonic
+//   _         - destination vector type info
+//   _Src      - source vector type info
+//   OpNode    - single-operand conversion node used in all three patterns
+//   Broadcast - broadcast suffix appended to "${src}" in the rmb form
+//               (defaults to the destination's BroadcastStr)
+//   Alias     - suffix appended to the mnemonic of the rm form only
+//   MemOp     - memory operand of the rm form (defaults to _Src.MemOp)
+// Defines three maskable forms: register (rr), full-vector memory (rm) and
+// broadcast-from-scalar memory (rmb, tagged EVEX_B).
+multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode,
+ string Broadcast = _.BroadcastStr,
+ string Alias = "", X86MemOperand MemOp = _Src.MemOp> {
+
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
+ (_.VT (OpNode (_Src.VT
+ (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX;
+
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _Src.ScalarMemOp:$src), OpcodeStr,
+ "${src}"##Broadcast, "${src}"##Broadcast,
+ (_.VT (OpNode (_Src.VT
+ (X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
+ ))>, EVEX, EVEX_B;
+}
+// Conversion with SAE - suppress all exceptions
+// Defines one maskable register form (rrb) whose pattern passes
+// (i32 FROUND_NO_EXC) to OpNodeRnd and whose assembly string carries "{sae}".
+multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src), OpcodeStr,
+ "{sae}, $src", "$src, {sae}",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
+ (i32 FROUND_NO_EXC)))>,
+ EVEX, EVEX_B;
+}
+
+// Conversion with rounding control (RC)
+// Defines one maskable register form (rrb) that takes the rounding mode as an
+// AVX512RC:$rc operand and forwards it to OpNodeRnd as (i32 imm:$rc).
+multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src", "$src, $rc",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC;
+}
+
+// Extend Float to Double
+// The 512-bit form (Z) adds an SAE variant. The 128-bit form reads only two
+// elements of a v4f32 source, so it uses X86vfpext, an explicit "{1to2}"
+// broadcast string and an f64mem memory operand.
+multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fpextend>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
+ X86vfpextRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
+ X86vfpext, "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend>,
+ EVEX_V256;
+ }
+}
+
+// Truncate Double to Float
+// The 512-bit form (Z) adds a rounding-control variant. The 128- and 256-bit
+// forms both produce v4f32x_info, so their memory forms get "{x}" / "{y}"
+// mnemonic suffixes and explicit broadcast strings, and InstAliases accept the
+// suffixed mnemonics for the register and memory forms.
+multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
+ X86vfproundRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
+ X86vfpround, "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
+ "{1to4}", "{y}">, EVEX_V256;
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
+ }
+}
+
+// Packed FP-to-FP conversions; both use opcode 0x5A and differ in prefix,
+// VEX_W and EVEX_CD8 settings.
+defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">,
+ VEX_W, PD, EVEX_CD8<64, CD8VF>;
+defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">,
+ PS, EVEX_CD8<32, CD8VH>;
+
+// Select the 512-bit memory form for an extending load of v8f32 to v8f64.
+// VCVTPS2PDZrm is defined under HasAVX512, so the pattern carries the same
+// predicate instead of being left unguarded.
+def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+ (VCVTPS2PDZrm addr:$src)>, Requires<[HasAVX512]>;
+
+// VLX-only patterns: an X86vzmovl wrapped around a 128-bit X86vfpround result
+// selects the plain VCVTPD2PSZ128rr (given AddedComplexity 15), and extending
+// loads of v2f32 / v4f32 select the memory forms of VCVTPS2PD.
+let Predicates = [HasVLX] in {
+ let AddedComplexity = 15 in
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
+ (VCVTPD2PSZ128rr VR128X:$src)>;
+ def : Pat<(v2f64 (extloadv2f32 addr:$src)),
+ (VCVTPS2PDZ128rm addr:$src)>;
+ def : Pat<(v4f64 (extloadv4f32 addr:$src)),
+ (VCVTPS2PDZ256rm addr:$src)>;
+}
+
+// Convert Signed/Unsigned Doubleword to Double
+// OpNode is used for the 512- and 256-bit forms; OpNode128 is used for the
+// 128-bit form, which takes a v4i32 source but produces only v2f64 and so uses
+// an explicit "{1to2}" broadcast string and an i64mem memory operand.
+multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNode128> {
+ // No rounding in this op
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>,
+ EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
+ OpNode128, "{1to2}", "", i64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Doubleword to Float
+// The 512-bit form (Z) adds a rounding-control variant built on OpNodeRnd;
+// the VLX forms use OpNode only.
+multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
+ OpNodeRnd>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Doubleword with truncation
+// The 512-bit form (Z) adds an SAE variant built on OpNodeRnd; the VLX forms
+// use OpNode only.
+multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Doubleword
+// The 512-bit form (Z) adds a rounding-control variant built on OpNodeRnd;
+// the VLX forms use OpNode only.
+multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Doubleword with truncation
+// OpNode serves the 512- and 256-bit forms, OpNode128 the 128-bit form, and
+// OpNodeRnd the SAE variant of the 512-bit form.
+multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNode128, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ // we need "x"/"y" suffixes in order to distinguish between 128 and 256
+ // memory forms of these instructions in Asm Parser. They have the same
+ // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
+ // due to the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
+ OpNode128, "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+ "{1to4}", "{y}">, EVEX_V256;
+
+ // Accept the "x"/"y"-suffixed mnemonics for both register and memory forms.
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
+ }
+}
+
+// Convert Double to Signed/Unsigned Doubleword
+// The 512-bit form (Z) adds a rounding-control variant built on OpNodeRnd;
+// the VLX forms use OpNode for both widths.
+multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ // we need "x"/"y" suffixes in order to distinguish between 128 and 256
+ // memory forms of these instructions in Asm Parser. They have the same
+ // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
+ // due to the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
+ "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+ "{1to4}", "{y}">, EVEX_V256;
+
+ // Accept the "x"/"y"-suffixed mnemonics for both register and memory forms.
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
+ }
+}
+
+// Convert Double to Signed/Unsigned Quadword
+// Requires HasDQI (plus HasVLX for the 128/256-bit forms). The 512-bit form
+// adds a rounding-control variant built on OpNodeRnd.
+multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Quadword with truncation
+// Requires HasDQI (plus HasVLX for the 128/256-bit forms). The 512-bit form
+// adds an SAE variant built on OpNodeRnd.
+multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Quadword to Double
+// Requires HasDQI (plus HasVLX for the 128/256-bit forms). The 512-bit form
+// adds a rounding-control variant built on OpNodeRnd.
+multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Quadword
+// Requires HasDQI (plus HasVLX for the 128/256-bit forms). The 512-bit form
+// adds a rounding-control variant built on OpNodeRnd.
+multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ // Explicitly specified broadcast string, since we take only 2 elements
+ // from v4f32x_info source
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+ "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Quadword with truncation
+// OpNode serves the 512- and 256-bit forms, OpNode128 the 128-bit form, and
+// OpNodeRnd the SAE variant of the 512-bit form.
+multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNode128, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ // Explicitly specified broadcast string, since we take only 2 elements
+ // from v4f32x_info source
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode128,
+ "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Quadword to Float
+// OpNode serves the 512- and 256-bit forms, OpNode128 the 128-bit form, and
+// OpNodeRnd the rounding-control variant of the 512-bit form.
+multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNode128, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ // we need "x"/"y" suffixes in order to distinguish between 128 and 256
+ // memory forms of these instructions in Asm Parser. They have the same
+ // dest type - 'v4f32x_info'. We also specify the broadcast string explicitly
+ // due to the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128,
+ "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
+ "{1to4}", "{y}">, EVEX_V256;
+
+ // Accept the "x"/"y"-suffixed mnemonics for both register and memory forms.
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
+ }
+}
+
+// Instantiations of the packed integer<->FP conversion multiclasses above.
+// Each defm supplies the opcode, mnemonic and selection nodes, followed by the
+// prefix (PS/PD/XS/XD), optional VEX_W, and the EVEX_CD8 element size and
+// tuple type for that instruction.
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP>,
+ XS, EVEX_CD8<32, CD8VH>;
+
+defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
+ X86VSintToFpRnd>,
+ PS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
+ X86cvttp2siRnd>,
+ XS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttp2si,
+ X86cvttp2siRnd>,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
+ X86cvttp2uiRnd>, PS,
+ EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
+ X86cvttp2ui, X86cvttp2uiRnd>, PS, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86VUintToFP>,
+ XS, EVEX_CD8<32, CD8VH>;
+
+defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
+ X86VUintToFpRnd>, XD,
+ EVEX_CD8<32, CD8VF>;
+
+defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int,
+ X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VF>;
+
+defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int,
+ X86cvtp2IntRnd>, XD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt,
+ X86cvtp2UIntRnd>,
+ PS, EVEX_CD8<32, CD8VF>;
+defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt,
+ X86cvtp2UIntRnd>, VEX_W,
+ PS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int,
+ X86cvtp2IntRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int,
+ X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt,
+ X86cvtp2UIntRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
+ X86cvtp2UIntRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
+ X86cvttp2siRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, X86cvttp2si,
+ X86cvttp2siRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
+ X86cvttp2uiRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, X86cvttp2ui,
+ X86cvttp2uiRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
+ X86VSintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
+ X86VUintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP,
+ X86VSintToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP,
+ X86VUintToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
+
+// AVX-512 without VLX: 128/256-bit unsigned conversions are selected onto the
+// 512-bit register forms. Each pattern inserts the narrow source into an
+// IMPLICIT_DEF 512-bit value with INSERT_SUBREG, runs the Z-suffixed
+// instruction, and extracts the low sub_xmm / sub_ymm part of the result.
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
+ (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
+def : Pat<(v4i32 (X86cvttp2ui (v2f64 VR128X:$src))),
+ (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
+ (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
+ (v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
+ (v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+}
+
+// With VLX, an X86vzmovl wrapped (through bitconverts) around a 128-bit
+// double-to-doubleword conversion selects the plain Z128rr instruction.
+// AddedComplexity 15 is attached to these patterns.
+let Predicates = [HasAVX512, HasVLX] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
+ (VCVTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))))),
+ (VCVTPD2UDQZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
+ (VCVTTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))))),
+ (VCVTTPD2UDQZ128rr VR128X:$src)>;
+ }
+}
+
+// Fold 512-bit loads into the packed FP-to-FP conversions: fpround of a loaded
+// v8f64 and an extending load of v8f32 both select the memory forms.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
+ (VCVTPD2PSZrm addr:$src)>;
+ def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+ (VCVTPS2PDZrm addr:$src)>;
+}
+
+// With DQI and VLX, an X86vzmovl wrapped around a 128-bit quadword-to-float
+// conversion selects the plain Z128rr instruction. AddedComplexity 15 is
+// attached to these patterns.
+let Predicates = [HasDQI, HasVLX] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
+ (VCVTQQ2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
+ (VCVTUQQ2PSZ128rr VR128X:$src)>;
+ }
+}
+
+// DQI without VLX: 128/256-bit quadword conversions are selected onto the
+// 512-bit register forms. Each pattern inserts the narrow source into an
+// IMPLICIT_DEF wide value with INSERT_SUBREG, runs the Z-suffixed instruction,
+// and extracts the low sub_xmm / sub_ymm part of the result.
+let Predicates = [HasDQI, NoVLX] in {
+def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr
+ (v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr
+ (v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
+def : Pat<(v2f64 (sint_to_fp (v2i64 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (uint_to_fp (v4i64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
+def : Pat<(v2f64 (uint_to_fp (v2i64 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+// vcvtph2ps (opcode 0x13, T8PD): maskable register (rr) and memory (rm) forms.
+//   _dest    - destination vector type info
+//   _src     - source vector type info
+//   x86memop - memory operand used by the rm form
+//   ld_frag  - load fragment whose result is bitconverted to _src.VT
+// Both patterns pass (i32 FROUND_CURRENT) to X86cvtph2ps.
+multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86MemOperand x86memop, PatFrag ld_frag> {
+ defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
+ "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT _src.RC:$src),
+ (i32 FROUND_CURRENT))>, T8PD;
+ defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
+ "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
+ (i32 FROUND_CURRENT))>, T8PD;
+}
+
+// vcvtph2ps with "{sae}": a single maskable register form (rb), tagged EVEX_B,
+// whose pattern passes (i32 FROUND_NO_EXC) to X86cvtph2ps.
+multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
+ defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
+ "vcvtph2ps", "{sae}, $src", "$src, {sae}",
+ (X86cvtph2ps (_src.VT _src.RC:$src),
+ (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
+
+}
+
+// vcvtph2ps instantiations. Only the 512-bit form gets the SAE variant. The
+// 128/256-bit forms sit in a nested "let Predicates = [HasVLX]".
+// NOTE(review): the Z128 form pairs an f64mem operand with the loadv2i64
+// fragment in its rm pattern - confirm the intended load width.
+let Predicates = [HasAVX512] in {
+ defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
+ avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+ let Predicates = [HasVLX] in {
+ defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+ loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+ loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+ }
+}
+
+// vcvtps2ph (opcode 0x1D) with an 8-bit immediate operand $src2.
+//   _dest    - destination vector type info
+//   _src     - source vector type info
+//   x86memop - memory operand used by the store forms
+// Defines:
+//   rr  - maskable register-destination form (MRMDestReg)
+//   mr  - store form, matched as a store of the converted _dest.VT value
+//   mrk - masked store form; no selection pattern, flagged mayStore
+multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86MemOperand x86memop> {
+ defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph", "$src2, $src1", "$src1, $src2",
+ (X86cvtps2ph (_src.VT _src.RC:$src1),
+ (i32 imm:$src2)),
+ NoItinerary, 0, 0, X86select>, AVX512AIi8Base;
+ def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
+ (i32 imm:$src2))),
+ addr:$dst)]>;
+ let hasSideEffects = 0, mayStore = 1 in
+ def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K;
+}
+// vcvtps2ph with "{sae}": a register-destination form (rb) built from
+// AVX512_maskable_in_asm with an empty pattern list and tagged EVEX_B.
+multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
+ let hasSideEffects = 0 in
+ defm rb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
+ (outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
+ []>, EVEX_B, AVX512AIi8Base;
+}
+// vcvtps2ph instantiations. Only the 512-bit form gets the SAE variant. The
+// 128/256-bit forms sit in a nested "let Predicates = [HasVLX]".
+// NOTE(review): the Z128 form converts a v4f32x_info source yet uses the same
+// f128mem store operand as the Z256 form - confirm the intended memory width.
+let Predicates = [HasAVX512] in {
+ defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
+ avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+ let Predicates = [HasVLX] in {
+ defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
+ EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>,
+ EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+ }
+}
+
+// Patterns for matching conversions from float to half-float and vice versa.
+let Predicates = [HasVLX] in {
+ // Use MXCSR.RC for rounding instead of explicitly specifying the default
+ // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
+ // configurations we support (the default). However, falling back to MXCSR is
+ // more consistent with other instructions, which are always controlled by it.
+ // It's encoded as 0b100.
+ // float -> half: convert in a VR128X register, move the result out with
+ // VMOVPDI2DIZrr and keep the low 16 bits.
+ def : Pat<(fp_to_f16 FR32X:$src),
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (VCVTPS2PHZ128rr
+ (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), sub_16bit))>;
+
+ // half -> float: extend the GR16 value with MOVSX32rr16, copy it into
+ // VR128X and convert.
+ def : Pat<(f16_to_fp GR16:$src),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
+ (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), FR32X)) >;
+
+ // float -> half -> float round trip: chain the two conversions directly
+ // without the intermediate move out of the vector register.
+ def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
+ (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
+}
+
+// Patterns for matching float to half-float conversion when AVX512 is supported
+// but F16C isn't. In that case we have to use 512-bit vectors.
+// Each pattern inserts the 128-bit value into an IMPLICIT_DEF 512-bit value
+// (sub_xmm), uses the Z-suffixed instruction, and extracts sub_xmm from the
+// result. The immediate 4 is passed to VCVTPS2PHZrr as its rounding operand.
+let Predicates = [HasAVX512, NoVLX, NoF16C] in {
+ def : Pat<(fp_to_f16 FR32X:$src),
+ (i16 (EXTRACT_SUBREG
+ (VMOVPDI2DIZrr
+ (v8i16 (EXTRACT_SUBREG
+ (VCVTPS2PHZrr
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
+ sub_xmm), 4), sub_xmm))), sub_16bit))>;
+
+ def : Pat<(f16_to_fp GR16:$src),
+ (f32 (COPY_TO_REGCLASS
+ (v4f32 (EXTRACT_SUBREG
+ (VCVTPH2PSZrr
+ (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
+ (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
+ sub_xmm)), sub_xmm)), FR32X))>;
+
+ def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
+ (f32 (COPY_TO_REGCLASS
+ (v4f32 (EXTRACT_SUBREG
+ (VCVTPH2PSZrr
+ (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
+ sub_xmm), 4)), sub_xmm)), FR32X))>;
+}
+
+// Unordered/Ordered scalar fp compare with SAE and set EFLAGS
+// Defines a single register-register form (rb) with no outputs and an empty
+// pattern list; the assembly string carries "{sae}" and the form is tagged
+// EVEX_B.
+multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
+ string OpcodeStr> {
+ def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
+ [], IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
+ Sched<[WriteFAdd]>;
+}
+
+// SAE forms of the scalar compares: opcode 0x2E for vucomiss/vucomisd and 0x2F
+// for vcomiss/vcomisd. All are declared to define EFLAGS; the double-precision
+// variants add VEX_W.
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+ defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss">,
+ AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd">,
+ AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss">,
+ AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd">,
+ AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in { // EVEX compare forms instantiated from sse12_ord_cmp
+ defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, // scalar FR32X/FR64X forms matched from X86cmp
+ "ucomiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
+ "ucomisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ let Pattern = []<dag> in { // VCOMIS*: OpNode is undef and the pattern list is cleared
+ defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
+ "comiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
+ "comisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ }
+ let isCodeGenOnly = 1 in { // VR128X variants matched from X86ucomi / X86comi
+ defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+ defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
+ load, "comiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
+ load, "comisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ }
+}
+
+/// avx512_fp14_s - maskable scalar rr/rm forms for rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
+multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let AddedComplexity = 20 , Predicates = [HasAVX512] in {
+ defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), // register form: OpNode on two vector-typed registers
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V;
+ defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), // memory form: $src2 is a scalar load wrapped in scalar_to_vector
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V;
+}
+}
+
+defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>, // 0x4D = scalar rcp14; the SD variant adds VEX_W
+ EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
+ VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
+defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>, // 0x4F = scalar rsqrt14; the SD variant adds VEX_W
+ EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
+ VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
+
+/// avx512_fp14_p - packed r (register), m (load) and mb (broadcast load) forms for rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
+multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
+ defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.FloatVT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD;
+ defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // broadcast form: scalar load through X86VBroadcast, marked EVEX_B
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ (OpNode (_.FloatVT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, T8PD, EVEX_B;
+}
+
+multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { // 512-bit PS/PD plus HasVLX-gated 128/256-bit instances of avx512_fp14_p
+ defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+ // Define the 128-bit and 256-bit forms only if the AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, v4f32x_info>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, v8f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, v2f64x_info>,
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, v4f64x_info>,
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ }
+}
+
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; // packed rsqrt14, opcode 0x4E
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; // packed rcp14, opcode 0x4C
+
+/// avx512_fp28_s - scalar r, rb ({sae}) and m forms for rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
+multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode OpNode> {
+
+ defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), // register form, matched with FROUND_CURRENT
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 FROUND_CURRENT))>;
+
+ defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), // {sae} register form: matched with FROUND_NO_EXC, marked EVEX_B
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+
+ defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), // memory form: scalar load wrapped in scalar_to_vector
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT))>;
+}
+
+multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> { // SS (f32x_info) and SD (f64x_info, VEX_W) instances of avx512_fp28_s
+ defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>,
+ EVEX_CD8<32, CD8VT1>;
+ defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>,
+ EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+
+let Predicates = [HasERI] in { // scalar rcp28 / rsqrt28 are gated on HasERI
+ defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V;
+ defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V;
+}
+
+defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V; // scalar vgetexp reuses avx512_eri_s but sits outside the HasERI block
+/// avx512_fp28_p - packed r, m and mb (broadcast) forms, all matched with FROUND_CURRENT; used for rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
+
+multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ SDNode OpNode> {
+
+ defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
+
+ defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.FloatVT
+ (bitconvert (_.LdFrag addr:$src))),
+ (i32 FROUND_CURRENT))>;
+
+ defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // broadcast form: scalar load through X86VBroadcast, marked EVEX_B
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ (OpNode (_.FloatVT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ (i32 FROUND_CURRENT))>, EVEX_B;
+}
+multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ SDNode OpNode> { // adds only the register {sae} form (rb), matched with FROUND_NO_EXC
+ defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr,
+ "{sae}, $src", "$src, {sae}",
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> { // 512-bit PS/PD: avx512_fp28_p forms plus the avx512_fp28_p_round {sae} form
+ defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
+ avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
+ T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
+ avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
+ T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> { // 128-bit and 256-bit PS/PD instances of avx512_fp28_p (no {sae} form)
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode>,
+ EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode>,
+ EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode>,
+ EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode>,
+ EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ }
+}
+let Predicates = [HasERI] in { // packed rsqrt28 / rcp28 / exp2: 512-bit forms only, gated on HasERI
+
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX;
+ defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX;
+ defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX;
+}
+defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>, // packed vgetexp: outside HasERI, and also gets the 128/256-bit forms
+ avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd> , EVEX;
+
+multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
+ SDNode OpNodeRnd, X86VectorVTInfo _>{ // rb: register form taking an explicit AVX512RC rounding operand $rc
+ defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
+ (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _>{ // r / m / mb (broadcast) forms of a packed unary op, with no rounding operand
+ defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.FloatVT (OpNode _.RC:$src))>, EVEX;
+ defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.FloatVT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX;
+
+ defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // broadcast form: scalar load through X86VBroadcast, marked EVEX_B
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ (OpNode (_.FloatVT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, EVEX_B;
+}
+
+multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> { // PS/PD instances of avx512_sqrt_packed at 512 bits, plus 128/256 bits under HasVLX
+ defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ v8f64_info>,
+ EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+ // Define the 128-bit and 256-bit forms only if the AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, v4f32x_info>,
+ EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, v8f32x_info>,
+ EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, v2f64x_info>,
+ EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, v4f64x_info>,
+ EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+ }
+}
+
+multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
+ SDNode OpNodeRnd> { // rounding-operand forms, instantiated here only for v16f32 and v8f64
+ defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), OpNodeRnd,
+ v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), OpNodeRnd,
+ v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ string SUFF, SDNode OpNode, SDNode OpNodeRnd> { // SUFF is pasted into NAME#SUFF#Zr / NAME#SUFF#Zm in the Pats at the end
+
+ defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), // vector-register form matched from OpNodeRnd with FROUND_CURRENT
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 FROUND_CURRENT))>;
+ defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), // same, with $src2 as a scalar load wrapped in scalar_to_vector
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT))>;
+
+ defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), // explicit rounding-control form: extra AVX512RC:$rc operand
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$rc))>,
+ EVEX_B, EVEX_RC;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0 in { // pattern-less FRC (scalar register class) forms; selected through the Pats below
+ def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+ }
+
+ def : Pat<(_.EltVT (OpNode _.FRC:$src)), // scalar OpNode on a register: first operand is IMPLICIT_DEF
+ (!cast<Instruction>(NAME#SUFF#Zr)
+ (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+
+ def : Pat<(_.EltVT (OpNode (load addr:$src))), // folded-load form, only under HasAVX512 + OptForSize
+ (!cast<Instruction>(NAME#SUFF#Zm)
+ (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>;
+}
+
+multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> { // SS/SD instances: OpNode = fsqrt, OpNodeRnd = X86fsqrtRnds
+ defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
+ X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+ defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
+ X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
+}
+
+defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>, // packed forms: fsqrt patterns plus X86fsqrtRnd rounding-operand forms
+ avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>;
+
+defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG; // scalar forms, same opcode 0x51, marked VEX_LIG
+
+let Predicates = [HasAVX512] in { // scalar f32 X86frsqrt / X86frcp selected to VRSQRT14SS / VRCP14SS; load forms also require OptForSize
+ def : Pat<(f32 (X86frsqrt FR32X:$src)),
+ (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
+ def : Pat<(f32 (X86frsqrt (load addr:$src))),
+ (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ Requires<[OptForSize]>;
+ def : Pat<(f32 (X86frcp FR32X:$src)),
+ (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
+ def : Pat<(f32 (X86frcp (load addr:$src))),
+ (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ Requires<[OptForSize]>;
+}
+
+multiclass
+avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { // scalar X86RndScales r / rb ({sae}) / m forms, plus Pats for the generic rounding nodes
+
+ let ExeDomain = _.ExeDomain in {
+ defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), // register form: immediate $src3, matched with FROUND_CURRENT
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+
+ defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), // {sae} register form: matched with FROUND_NO_EXC, marked EVEX_B
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+ "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
+ (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B;
+
+ defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), // memory form: $src2 is a scalar load wrapped in scalar_to_vector
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86RndScales (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+ }
+ let Predicates = [HasAVX512] in { // immediates used below: ffloor 0x1, fceil 0x2, ftrunc 0x3, frint 0x4, fnearbyint 0xc
+ def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS // register patterns: first operand IMPLICIT_DEF, result copied back to FRC
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>;
+ def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>;
+ def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>;
+ def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>;
+ def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>;
+
+ def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS // folded-load patterns: same immediates, using the m form
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x1))), _.FRC)>;
+ def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x2))), _.FRC)>;
+ def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x3))), _.FRC)>;
+ def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x4))), _.FRC)>;
+ def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0xc))), _.FRC)>;
+ }
+}
+
+defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, // f32 scalar form, opcode 0x0A
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, // f64 scalar form, opcode 0x0B, adds VEX_W
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>;
+
+//-------------------------------------------------
+// Integer truncate and extend operations
+//-------------------------------------------------
+
+multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo,
+ X86MemOperand x86memop> { // rr register truncate (maskable), masked-select Pats, and pattern-less mr / mrk store forms
+ let ExeDomain = DestInfo.ExeDomain in
+ defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
+ EVEX, T8XS;
+
+ // For intrinsic pattern matching: an undef or all-zeros passthru selects rrkz; a register passthru ($src0) selects rrk.
+ def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ undef)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+ SrcInfo.RC:$src1)>;
+
+ def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+ SrcInfo.RC:$src1)>;
+
+ def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0,
+ DestInfo.KRCWM:$mask ,
+ SrcInfo.RC:$src1)>;
+
+ let mayStore = 1, mayLoad = 1, hasSideEffects = 0,
+ ExeDomain = DestInfo.ExeDomain in {
+ def mr : AVX512XS8I<opc, MRMDestMem, (outs), // store form; empty pattern list here
+ (ins x86memop:$dst, SrcInfo.RC:$src),
+ OpcodeStr # "\t{$src, $dst|$dst, $src}",
+ []>, EVEX;
+
+ def mrk : AVX512XS8I<opc, MRMDestMem, (outs), // masked store form (EVEX_K); empty pattern list here
+ (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ []>, EVEX, EVEX_K;
+ }//mayStore = 1, mayLoad = 1, hasSideEffects = 0
+}
+
+multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo DestInfo,
+ PatFrag truncFrag, PatFrag mtruncFrag > { // selects truncFrag / mtruncFrag stores to the NAME#ZSuffix mr / mrk instructions
+
+ def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr)
+ addr:$dst, SrcInfo.RC:$src)>;
+
+ def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask,
+ (SrcInfo.VT SrcInfo.RC:$src)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk)
+ addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
+}
+
+multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
+ X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
+ X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
+ X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag,
+ Predicate prd = HasAVX512>{ // Z128/Z256 need HasVLX and prd; Z needs prd only
+
+ let Predicates = [HasVLX, prd] in {
+ defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, // each width pairs the instruction forms with their store-lowering Pats
+ DestInfoZ128, x86memopZ128>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
+ truncFrag, mtruncFrag>, EVEX_V128;
+
+ defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256,
+ DestInfoZ256, x86memopZ256>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
+ truncFrag, mtruncFrag>, EVEX_V256;
+ }
+ let Predicates = [prd] in
+ defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512,
+ DestInfoZ, x86memopZ>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
+ truncFrag, mtruncFrag>, EVEX_V512;
+}
+
+multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> { // i64-element sources, v16i8 results at every width; stores use i16mem/i32mem/i64mem
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
+ StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
+}
+
+multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> { // i64-element sources, v8i16 results at every width; stores use i32mem/i64mem/i128mem
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
+ StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
+}
+
+multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> { // i64-element sources; v4i32 results for Z128/Z256, v8i32 for Z
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
+ StoreNode, MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
+}
+
+multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> { // i32-element sources, v16i8 results at every width; stores use i32mem/i64mem/i128mem
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
+ StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
+}
+
+multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> { // i32-element sources; v8i16 results for Z128/Z256, v16i16 for Z
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
+ StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
+}
+
+multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> { // i16-element sources; v16i8 results for Z128/Z256, v32i8 for Z; passes HasBWI as prd
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+ v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
+ StoreNode, MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
+}
+
+defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc, // every width pair below gets three variants: X86vtrunc, X86vtruncs (S), X86vtruncus (US)
+ truncstorevi8, masked_truncstorevi8>;
+defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs,
+ truncstore_s_vi8, masked_truncstore_s_vi8>;
+defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus,
+ truncstore_us_vi8, masked_truncstore_us_vi8>;
+
+defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc,
+ truncstorevi16, masked_truncstorevi16>;
+defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs,
+ truncstore_s_vi16, masked_truncstore_s_vi16>;
+defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus,
+ truncstore_us_vi16, masked_truncstore_us_vi16>;
+
+defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc,
+ truncstorevi32, masked_truncstorevi32>;
+defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs,
+ truncstore_s_vi32, masked_truncstore_s_vi32>;
+defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus,
+ truncstore_us_vi32, masked_truncstore_us_vi32>;
+
+defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc,
+ truncstorevi8, masked_truncstorevi8>;
+defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs,
+ truncstore_s_vi8, masked_truncstore_s_vi8>;
+defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus,
+ truncstore_us_vi8, masked_truncstore_us_vi8>;
+
+defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc,
+ truncstorevi16, masked_truncstorevi16>;
+defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs,
+ truncstore_s_vi16, masked_truncstore_s_vi16>;
+defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus,
+ truncstore_us_vi16, masked_truncstore_us_vi16>;
+
+defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc,
+ truncstorevi8, masked_truncstorevi8>;
+defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs,
+ truncstore_s_vi8, masked_truncstore_s_vi8>;
+defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
+ truncstore_us_vi8, masked_truncstore_us_vi8>;
+
+let Predicates = [HasAVX512, NoVLX] in { // without VLX: insert the 256-bit source into an undef 512-bit register, use the Z instruction, extract sub_xmm
+def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))),
+ (v8i16 (EXTRACT_SUBREG
+ (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src, sub_ymm)))), sub_xmm))>;
+def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))),
+ (v4i32 (EXTRACT_SUBREG
+ (v8i32 (VPMOVQDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src, sub_ymm)))), sub_xmm))>;
+}
+
+let Predicates = [HasBWI, NoVLX] in { // same widening approach for v16i16 -> v16i8, using VPMOVWBZrr
+def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
+ (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src, sub_ymm))), sub_xmm))>;
+}
+
+multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
+ X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{ // rr matches OpNode on a register; rm matches LdFrag on memory
+ let ExeDomain = DestInfo.ExeDomain in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
+ EVEX;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins x86memop:$src), OpcodeStr ,"$src", "$src",
+ (DestInfo.VT (LdFrag addr:$src))>,
+ EVEX;
+ }
+}
+
+multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { // i8 -> i16 elements; LdFrag defaults to the <ExtTy>extloadvi8 fragment
+ let Predicates = [HasVLX, HasBWI] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info,
+ v16i8x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info,
+ v16i8x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasBWI] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v32i16_info,
+ v32i8x_info, i256mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { // i8 -> i32 elements; LdFrag defaults to the <ExtTy>extloadvi8 fragment
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
+ v16i8x_info, i32mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
+ v16i8x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info,
+ v16i8x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { // i8 -> i64 elements; LdFrag defaults to the <ExtTy>extloadvi8 fragment
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+ v16i8x_info, i16mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+ v16i8x_info, i32mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
+ v16i8x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { // i16 -> i32 elements; LdFrag defaults to the <ExtTy>extloadvi16 fragment
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
+ v8i16x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
+ v8i16x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info,
+ v16i16x_info, i256mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { // i16 -> i64 elements; LdFrag defaults to the <ExtTy>extloadvi16 fragment
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+ v8i16x_info, i32mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+ v8i16x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
+ v8i16x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { // i32 -> i64 elements; LdFrag defaults to the <ExtTy>extloadvi32 fragment
+
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+ v4i32x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+ v4i32x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
+ v8i32x_info, i256mem, LdFrag, OpNode>,
+ EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512;
+ }
+}
+
+defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">; // zero-extend family: X86vzext with ExtTy "z", opcodes 0x30-0x35
+defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">;
+defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">;
+defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">;
+defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">;
+defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">;
+
+defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">; // sign-extend family: X86vsext with ExtTy "s", opcodes 0x20-0x25
+defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">;
+defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">;
+defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">;
+defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">;
+defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">;
+
+// EXTLOAD patterns, implemented using the VPMOVZX rm, rmk and rmkz instructions
+multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
+ X86VectorVTInfo From, PatFrag LdFrag> { // NOTE(review): the From parameter is not referenced in this body
+ def : Pat<(To.VT (LdFrag addr:$src)), // unmasked load
+ (!cast<Instruction>("VPMOVZX"#InstrStr#"rm") addr:$src)>;
+ def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), To.RC:$src0)), // merge-masked load with passthru $src0
+ (!cast<Instruction>("VPMOVZX"#InstrStr#"rmk") To.RC:$src0,
+ To.KRC:$mask, addr:$src)>; // NOTE(review): source pattern uses To.KRCWM for $mask, result uses To.KRC - confirm intended
+ def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), // zero-masked load (all-zeros passthru)
+ To.ImmAllZerosV)),
+ (!cast<Instruction>("VPMOVZX"#InstrStr#"rmkz") To.KRC:$mask,
+ addr:$src)>;
+}
+
+let Predicates = [HasVLX, HasBWI] in { // byte -> word, 128/256-bit: needs both VLX and BWI
+ defm : avx512_ext_lowering<"BWZ128", v8i16x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BWZ256", v16i16x_info, v16i8x_info, extloadvi8>;
+}
+let Predicates = [HasBWI] in { // byte -> word, 512-bit
+ defm : avx512_ext_lowering<"BWZ", v32i16_info, v32i8x_info, extloadvi8>;
+}
+let Predicates = [HasVLX, HasAVX512] in { // remaining width pairs, 128/256-bit
+ defm : avx512_ext_lowering<"BDZ128", v4i32x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BDZ256", v8i32x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BQZ128", v2i64x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BQZ256", v4i64x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"WDZ128", v4i32x_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"WDZ256", v8i32x_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"WQZ128", v2i64x_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"WQZ256", v4i64x_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"DQZ128", v2i64x_info, v4i32x_info, extloadvi32>;
+ defm : avx512_ext_lowering<"DQZ256", v4i64x_info, v4i32x_info, extloadvi32>;
+}
+let Predicates = [HasAVX512] in { // remaining width pairs, 512-bit
+ defm : avx512_ext_lowering<"BDZ", v16i32_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BQZ", v8i64_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"WDZ", v16i32_info, v16i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"WQZ", v8i64_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>;
+}
+
+multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy, // Load-folding patterns: ExtOp applied to a loaded vector selects the memory form OpcPrefix#<variant>rm.
+ SDNode ExtOp, PatFrag ExtLoad16> { // ExtLoad16 is only used by the 128-bit byte->qword pattern; ExtTy is not referenced anywhere in this body.
+ // 128-bit patterns
+ let Predicates = [HasVLX, HasBWI] in { // byte->word (BWZ128rm)
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ }
+ let Predicates = [HasVLX] in { // remaining 128-bit variants: BD, BQ, WD, WQ, DQ
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), // the one place the ExtLoad16 parameter is used
+ (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ }
+ // 256-bit patterns
+ let Predicates = [HasVLX, HasBWI] in { // byte->word (BWZ256rm)
+ def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+ def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+ def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+ }
+ let Predicates = [HasVLX] in { // remaining 256-bit variants
+ def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+
+ def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+ }
+ // 512-bit patterns
+ let Predicates = [HasBWI] in { // byte->word (BWZrm)
+ def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX512] in { // remaining 512-bit variants
+ def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
+
+ def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+ def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+
+ def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
+
+ def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
+
+ def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
+ }
+}
+
+defm : AVX512_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>; // X86vsext patterns select the VPMOVSX* memory forms
+defm : AVX512_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>; // X86vzext patterns select the VPMOVZX* memory forms
+
+//===----------------------------------------------------------------------===//
+// GATHER - SCATTER Operations
+
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, // Defines the single `rm` record of one masked gather form.
+ X86MemOperand memop, PatFrag GatherNode> { // memop: vector-index memory operand class; GatherNode: fragment matched by the pattern
+ let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb", // $dst is early-clobber and tied to pass-through $src1; the mask is read and written back
+ ExeDomain = _.ExeDomain in
+ def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb),
+ (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2),
+ !strconcat(OpcodeStr#_.Suffix,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ [(set _.RC:$dst, _.KRCWM:$mask_wb,
+ (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask,
+ vectoraddr:$src2))]>, EVEX, EVEX_K,
+ EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc, // Gather forms instantiated with VEX_W: D variants use dopc and an OpcodeStr+"d" mnemonic, Q variants use qopc and OpcodeStr+"q".
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { // SUFF is spliced into each record name (NAME#D/Q#SUFF#Z...)
+ defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
+ vy512mem, mgatherv8i32>, EVEX_V512, VEX_W;
+ defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
+ vz512mem, mgatherv8i64>, EVEX_V512, VEX_W;
+let Predicates = [HasVLX] in { // 256- and 128-bit forms additionally require VLX
+ defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
+ vx256xmem, mgatherv4i32>, EVEX_V256, VEX_W;
+ defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256,
+ vy256xmem, mgatherv4i64>, EVEX_V256, VEX_W;
+ defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
+ vx128xmem, mgatherv4i32>, EVEX_V128, VEX_W;
+ defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+ vx128xmem, mgatherv2i64>, EVEX_V128, VEX_W;
+}
+}
+
+multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc, // Gather forms instantiated without VEX_W; same D/Q naming scheme as avx512_gather_q_pd.
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
+ mgatherv16i32>, EVEX_V512;
+ defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz512mem, // Q form pairs the 256-bit data info with a 512-bit index operand
+ mgatherv8i64>, EVEX_V512;
+let Predicates = [HasVLX] in { // 256- and 128-bit forms additionally require VLX
+ defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
+ vy256xmem, mgatherv8i32>, EVEX_V256;
+ defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128, // Q forms below both use the 128-bit data info
+ vy128xmem, mgatherv4i64>, EVEX_V256;
+ defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
+ vx128xmem, mgatherv4i32>, EVEX_V128;
+ defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+ vx64xmem, mgatherv2i64>, EVEX_V128;
+}
+}
+
+
+defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">, // FP gathers: opcodes 0x92 (D index) / 0x93 (Q index), f64 then f32 element info
+ avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">;
+
+defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">, // integer gathers: opcodes 0x90 / 0x91, i64 then i32 element info
+ avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
+
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, // Defines the single `mr` record of one masked scatter form.
+ X86MemOperand memop, PatFrag ScatterNode> { // memop: vector-index memory operand class; ScatterNode: fragment matched by the pattern
+
+let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in // the mask register is both an input and the only output
+
+ def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb),
+ (ins memop:$dst, _.KRCWM:$mask, _.RC:$src),
+ !strconcat(OpcodeStr#_.Suffix,
+ "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+ [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src),
+ _.KRCWM:$mask, vectoraddr:$dst))]>,
+ EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc, // Scatter forms instantiated with VEX_W: D variants use dopc and OpcodeStr+"d", Q variants use qopc and OpcodeStr+"q".
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { // SUFF is spliced into each record name (NAME#D/Q#SUFF#Z...)
+ defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512,
+ vy512mem, mscatterv8i32>, EVEX_V512, VEX_W;
+ defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512,
+ vz512mem, mscatterv8i64>, EVEX_V512, VEX_W;
+let Predicates = [HasVLX] in { // 256- and 128-bit forms additionally require VLX
+ defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
+ vx256xmem, mscatterv4i32>, EVEX_V256, VEX_W;
+ defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256,
+ vy256xmem, mscatterv4i64>, EVEX_V256, VEX_W;
+ defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
+ vx128xmem, mscatterv4i32>, EVEX_V128, VEX_W;
+ defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
+ vx128xmem, mscatterv2i64>, EVEX_V128, VEX_W;
+}
+}
+
+multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc, // Scatter forms instantiated without VEX_W; same D/Q naming scheme as avx512_scatter_q_pd.
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
+ mscatterv16i32>, EVEX_V512;
+ defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz512mem, // Q form pairs the 256-bit data info with a 512-bit index operand
+ mscatterv8i64>, EVEX_V512;
+let Predicates = [HasVLX] in { // 256- and 128-bit forms additionally require VLX
+ defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
+ vy256xmem, mscatterv8i32>, EVEX_V256;
+ defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128, // Q forms below both use the 128-bit data info
+ vy128xmem, mscatterv4i64>, EVEX_V256;
+ defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
+ vx128xmem, mscatterv4i32>, EVEX_V128;
+ defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
+ vx64xmem, mscatterv2i64>, EVEX_V128;
+}
+}
+
+defm VSCATTER : avx512_scatter_q_pd<0xA2, 0xA3, avx512vl_f64_info, "vscatter", "PD">, // FP scatters: opcodes 0xA2 (D index) / 0xA3 (Q index), f64 then f32 element info
+ avx512_scatter_d_ps<0xA2, 0xA3, avx512vl_f32_info, "vscatter", "PS">;
+
+defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", "Q">, // integer scatters: opcodes 0xA0 / 0xA1, i64 then i32 element info
+ avx512_scatter_d_ps<0xA0, 0xA1, avx512vl_i32_info, "vpscatter", "D">;
+
+// Gather/scatter prefetch: one masked memory-only record `m`, no outputs and no selection pattern.
+multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
+ RegisterClass KRC, X86MemOperand memop> { // KRC: mask register class; memop: vector-index memory operand class
+ let Predicates = [HasPFI], hasSideEffects = 1 in // gated on HasPFI; hasSideEffects is set explicitly because the pattern list is empty
+ def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
+ !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"),
+ []>, EVEX, EVEX_K;
+}
+
+defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", // gather PF0 group uses format MRM1m; 0xC6 = D index, 0xC7 = Q index
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
+ VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", // the PD variants add VEX_W
+ VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", // gather PF1 group uses format MRM2m
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
+ VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
+ VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps", // scatter PF0 group uses format MRM5m
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
+ VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
+ VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps", // scatter PF1 group uses format MRM6m
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
+ VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
+ VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+// Helper fragments to match sext vXi1 to vXiY (all operate on a 512-bit VR512 source).
+def v64i1sextv64i8 : PatLeaf<(v64i8 // byte case: matched as X86vsext of the mask produced by comparing all-zeros greater-than $src
+ (X86vsext
+ (v64i1 (X86pcmpgtm
+ (bc_v64i8 (v16i32 immAllZerosV)),
+ VR512:$src))))>;
+def v32i1sextv32i16 : PatLeaf<(v32i16 (X86vsrai VR512:$src, (i8 15)))>; // word/dword/qword cases: arithmetic right shift by (element width - 1)
+def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
+def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
+
+multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { // One mask->vector record `rr`: sign-extends mask register $src into a Vec.VT vector.
+def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
+ !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"), // mnemonic is OpcodeStr with the element suffix appended
+ [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
+}
+
+multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, // Expands cvt_by_vec_width over the three vector widths of VTInfo.
+ string OpcodeStr, Predicate prd> { // prd: feature predicate required by every width
+let Predicates = [prd] in
+ defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in { // 256/128-bit forms also need VLX
+ defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+ defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+ }
+}
+
+multiclass avx512_convert_mask_to_vector<string OpcodeStr> { // Mask->vector conversions for all four element sizes: B/W use opcode 0x28 under HasBWI, D/Q use 0x38 under HasDQI.
+ defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr,
+ HasBWI>;
+ defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr, // W shares B's opcode and adds VEX_W
+ HasBWI>, VEX_W;
+ defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr,
+ HasDQI>;
+ defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, // Q shares D's opcode and adds VEX_W
+ HasDQI>, VEX_W;
+}
+
+defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; // instantiates the VPMOVM2{B,W,D,Q} families with base mnemonic "vpmovm2"
+
+multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { // One vector->mask record `rr`: X86cvt2mask of vector $src into a mask register.
+ def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
+}
+
+// Use the 512-bit (NAME#"Zrr") instruction to implement the 128/256-bit conversion when VLX is unavailable.
+multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
+ X86VectorVTInfo _> { // ExtendInfo: the wide type actually executed; _: the narrow source type being matched
+
+ def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))),
+ (_.KVT (COPY_TO_REGCLASS // result of the wide instruction is copied into the narrow type's mask class
+ (!cast<Instruction>(NAME#"Zrr")
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), // narrow source is inserted into an otherwise undefined wide register
+ _.RC:$src, _.SubRegIdx)),
+ _.KRC))>;
+}
+
+multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, // Vector->mask conversion at all three widths of VTInfo, plus a non-VLX fallback.
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> { // prd: feature predicate required by every form
+ let Predicates = [prd] in
+ defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in { // native 256/128-bit instructions
+ defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>,
+ EVEX_V256;
+ defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
+ EVEX_V128;
+ }
+ let Predicates = [prd, NoVLX] in { // pattern-only fallback that routes 256/128-bit sources through the 512-bit instruction
+ defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256>;
+ defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128>;
+ }
+}
+
+defm VPMOVB2M : avx512_convert_vector_to_mask<0x29, "vpmovb2m", // B/W: opcode 0x29, HasBWI; W adds VEX_W
+ avx512vl_i8_info, HasBWI>;
+defm VPMOVW2M : avx512_convert_vector_to_mask<0x29, "vpmovw2m",
+ avx512vl_i16_info, HasBWI>, VEX_W;
+defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m", // D/Q: opcode 0x39, HasDQI; Q adds VEX_W
+ avx512vl_i32_info, HasDQI>;
+defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
+ avx512vl_i64_info, HasDQI>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - COMPRESS and EXPAND
+//
+
+multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _, // Compress records for one vector type: maskable register form `rr` plus store forms `mr` and `mrk`.
+ string OpcodeStr> {
+ defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+ (_.VT (X86compress _.RC:$src1))>, AVX5128IBase;
+
+ let mayStore = 1, hasSideEffects = 0 in // NOTE(review): this brace-less `let` covers only `mr`; `mrk` below carries no explicit mayStore here — confirm that is intended
+ def mr : AVX5128I<opc, MRMDestMem, (outs), // unmasked store form, no selection pattern
+ (ins _.MemOp:$dst, _.RC:$src),
+ OpcodeStr # "\t{$src, $dst|$dst, $src}",
+ []>, EVEX_CD8<_.EltSize, CD8VT1>;
+
+ def mrk : AVX5128I<opc, MRMDestMem, (outs), // masked store form, empty pattern list in this record
+ (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ []>,
+ EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > { // Selects the masked-store record NAME#_.ZSuffix#mrk for an X86mCompressingStore node.
+
+ def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask,
+ (_.VT _.RC:$src)),
+ (!cast<Instruction>(NAME#_.ZSuffix##mrk)
+ addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
+}
+
+multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr, // Instantiates the compress records and their store-lowering pattern at each width of VTInfo.
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr>,
+ compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasVLX] in { // 256/128-bit forms require VLX
+ defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr>,
+ compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
+ defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr>,
+ compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
+ }
+}
+
+defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>, // integer compress: opcode 0x8B; the Q form adds VEX_W
+ EVEX;
+defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>,
+ EVEX, VEX_W;
+defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>, // FP compress: opcode 0x8A; the PD form adds VEX_W
+ EVEX;
+defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>,
+ EVEX, VEX_W;
+
+// Expand: maskable register (`rr`) and memory (`rm`) forms for one vector type.
+multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+ string OpcodeStr> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+ (_.VT (X86expand _.RC:$src1))>, AVX5128IBase;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
+ (_.VT (X86expand (_.VT (bitconvert // the memory form matches X86expand applied to a bitconverted full-vector load
+ (_.LdFrag addr:$src1)))))>,
+ AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > { // Selects masked memory records for an X86mExpandingLoad node.
+
+ def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)), // undef pass-through -> zero-masking record (rmkz)
+ (!cast<Instruction>(NAME#_.ZSuffix##rmkz)
+ _.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, // register pass-through -> merge-masking record (rmk) with $src0 as first operand
+ (_.VT _.RC:$src0))),
+ (!cast<Instruction>(NAME#_.ZSuffix##rmk)
+ _.RC:$src0, _.KRCWM:$mask, addr:$src)>;
+}
+
+multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr, // Instantiates the expand records and their load-lowering patterns at each width of VTInfo.
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>,
+ expand_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasVLX] in { // 256/128-bit forms require VLX
+ defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>,
+ expand_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
+ defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>,
+ expand_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
+ }
+}
+
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>, // integer expand: opcode 0x89; the Q form adds VEX_W
+ EVEX;
+defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>,
+ EVEX, VEX_W;
+defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, // FP expand: opcode 0x88; the PD form adds VEX_W
+ EVEX;
+defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
+ EVEX, VEX_W;
+
+// Handles unary packed FP instructions with an immediate: reg_vec1 = op(reg_vec,imm)
+// op(mem_vec,imm)
+// op(broadcast(eltVt),imm)
+// All three forms (rri, rmi, rmbi) pass FROUND_CURRENT as the rounding operand.
+multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), // register source
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // full-vector memory source
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // scalar memory source broadcast to all elements (EVEX_B)
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
+ "${src1}"##_.BroadcastStr##", $src2",
+ (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>, EVEX_B;
+ }
+}
+
+// Handles the unary {sae} form: reg_vec1 = op(reg_vec,imm),{sae}. Register-only; passes FROUND_NO_EXC.
+multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in
+ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2), // one vector source plus the immediate
+ OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
+ "$src1, {sae}, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, // Unary packed-with-immediate op at all three widths of `_`.
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ let Predicates = [prd] in { // only the 512-bit form also gets the {sae} record
+ defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512;
+ }
+ let Predicates = [prd, HasVLX] in { // 128/256-bit forms require VLX
+ defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256;
+ }
+}
+
+// Handles binary packed FP instructions with an immediate: reg_vec1 = op(reg_vec2,reg_vec3,imm)
+// op(reg_vec2,mem_vec,imm)
+// op(reg_vec2,broadcast(eltVt),imm)
+// All three forms (rri, rmi, rmbi) pass FROUND_CURRENT as the rounding operand.
+multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), // both sources in registers
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // second source is a full-vector load
+ (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // second source is a broadcast scalar load (EVEX_B)
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>, EVEX_B;
+ }
+}
+
+// Handles two-source instructions with an 8-bit immediate: reg_vec1 = op(reg_vec2,reg_vec3,imm)
+// op(reg_vec2,mem_vec,imm). Source and destination vector types may differ (SrcInfo vs DestInfo).
+multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{
+ let ExeDomain = DestInfo.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), // masking is driven by DestInfo
+ (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+ (SrcInfo.VT SrcInfo.RC:$src2),
+ (i8 imm:$src3)))>;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), // second source is a full-vector load of SrcInfo's type
+ (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+ (SrcInfo.VT (bitconvert
+ (SrcInfo.LdFrag addr:$src2))),
+ (i8 imm:$src3)))>;
+ }
+}
+
+// Handles same-type two-source instructions with an 8-bit immediate: reg_vec1 = op(reg_vec2,reg_vec3,imm)
+// op(reg_vec2,mem_vec,imm)
+// op(reg_vec2,broadcast(eltVt),imm). The rri/rmi forms are inherited from avx512_3Op_rm_imm8 with DestInfo == SrcInfo.
+multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>:
+ avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{
+
+ let ExeDomain = _.ExeDomain in
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // adds the broadcast-memory form (EVEX_B)
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ (i8 imm:$src3))>, EVEX_B;
+}
+
+// Handles scalar FP instructions with an immediate: reg_vec1 = op(reg_vec2,reg_vec3,imm)
+// op(reg_vec2,mem_scalar,imm)
+// Both forms (rri, rmi) pass FROUND_CURRENT as the rounding operand.
+multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), // both sources in registers
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
+ defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), // second source is a scalar load wrapped in scalar_to_vector
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src2))),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
+ }
+}
+
+// Handles the packed {sae} form: reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}. Register-only; passes FROUND_NO_EXC.
+multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in
+ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, {sae}, $src2, $src1",
+ "$src1, $src2, {sae}, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$src3),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+// Handles the scalar {sae} form: reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}. Register-only; passes FROUND_NO_EXC.
+multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _> {
+ defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), // NOTE(review): unlike the sibling multiclasses this spells NAME# explicitly and sets no ExeDomain — confirm both are intended
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, {sae}, $src2, $src1",
+ "$src1, $src2, {sae}, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$src3),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, // Binary packed-with-immediate op at all three widths of `_`.
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ let Predicates = [prd] in { // only the 512-bit form also gets the {sae} record
+ defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512;
+
+ }
+ let Predicates = [prd, HasVLX] in { // 128/256-bit forms require VLX
+ defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256;
+ }
+}
+
+multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr, // avx512_3Op_rm_imm8 at all three widths, with separate destination and source type families.
+ AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{
+ let Predicates = [HasBWI] in { // predicate is fixed to HasBWI here rather than passed in
+ defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512,
+ SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
+ }
+ let Predicates = [HasBWI, HasVLX] in {
+ defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128,
+ SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
+ defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256,
+ SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
+ }
+}
+
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, // avx512_3Op_imm8 (rri/rmi/rmbi) at all three widths of `_`.
+ bits<8> opc, SDNode OpNode>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in { // 128/256-bit forms require VLX
+ defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, // Scalar op with immediate: the regular (rri/rmi) records plus the {sae} record, all under the Z128 name.
+ X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ let Predicates = [prd] in {
+ defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, _>,
+ avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNode, _>;
+ }
+}
+
+multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, // PS and PD variants of a unary packed-with-immediate op; each takes its own opcode.
+ bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{
+ defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
+ opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>;
+ defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, // PD adds VEX_W and 64-bit CD8 scaling
+ opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+
+defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, // unary packed ops; VREDUCE and VGETMANT reuse one opcode for PS and PD, VRNDSCALE uses 0x08/0x09
+ X86VReduce, HasDQI>, AVX512AIi8Base, EVEX;
+defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
+ X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX;
+defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
+ X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX;
+
+
+defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, // binary packed range ops, opcode 0x50, HasDQI
+ 0x50, X86VRange, HasDQI>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
+ 0x50, X86VRange, HasDQI>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info, // scalar range ops, opcode 0x51; they reuse the X86VRange node
+ 0x51, X86VRange, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
+ 0x51, X86VRange, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, // scalar reduce ops, opcode 0x57, HasDQI
+ 0x57, X86Reduces, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
+ 0x57, X86Reduces, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, // scalar getmant ops, opcode 0x27, HasAVX512
+ 0x27, X86GetMants, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
+ 0x27, X86GetMants, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
+ bits<8> opc, SDNode OpNode = X86Shuf128>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ }
+}
+let Predicates = [HasAVX512] in {
+def : Pat<(v16f32 (ffloor VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>;
+def : Pat<(v16f32 (fnearbyint VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
+def : Pat<(v16f32 (fceil VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>;
+def : Pat<(v16f32 (frint VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
+def : Pat<(v16f32 (ftrunc VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>;
+
+def : Pat<(v8f64 (ffloor VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>;
+def : Pat<(v8f64 (fnearbyint VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
+def : Pat<(v8f64 (fceil VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>;
+def : Pat<(v8f64 (frint VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
+def : Pat<(v8f64 (ftrunc VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>;
+}
+
+defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2",avx512vl_f64_info, 0x23>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+ AVX512AIi8Base, EVEX_4V;
+}
+
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>,
+ EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_vpalignr_lowering<X86VectorVTInfo _ , list<Predicate> p>{
+ let Predicates = p in
+ def NAME#_.VTName#rri:
+ Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rri)
+ _.RC:$src1, _.RC:$src2, imm:$imm)>;
+}
+
+multiclass avx512_vpalignr_lowering_common<AVX512VLVectorVTInfo _>:
+ avx512_vpalignr_lowering<_.info512, [HasBWI]>,
+ avx512_vpalignr_lowering<_.info128, [HasBWI, HasVLX]>,
+ avx512_vpalignr_lowering<_.info256, [HasBWI, HasVLX]>;
+
+defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
+ avx512vl_i8_info, avx512vl_i8_info>,
+ avx512_vpalignr_lowering_common<avx512vl_i16_info>,
+ avx512_vpalignr_lowering_common<avx512vl_i32_info>,
+ avx512_vpalignr_lowering_common<avx512vl_f32_info>,
+ avx512_vpalignr_lowering_common<avx512vl_i64_info>,
+ avx512_vpalignr_lowering_common<avx512vl_f64_info>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
+ avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
+
+multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr,
+ "$src1", "$src1",
+ (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1), OpcodeStr,
+ "$src1", "$src1",
+ (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
+ EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> :
+ avx512_unary_rm<opc, OpcodeStr, OpNode, _> {
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1), OpcodeStr,
+ "${src1}"##_.BroadcastStr,
+ "${src1}"##_.BroadcastStr,
+ (_.VT (OpNode (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))))>,
+ EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+ SDNode OpNode, Predicate prd> {
+ defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info,
+ prd>, VEX_W;
+ defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info,
+ prd>;
+}
+
+multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
+ SDNode OpNode, Predicate prd> {
+ defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>;
+ defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>;
+}
+
+multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
+ bits<8> opc_d, bits<8> opc_q,
+ string OpcodeStr, SDNode OpNode> {
+ defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
+ HasAVX512>,
+ avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
+ HasBWI>;
+}
+
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>;
+
+def avx512_v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
+ VR128X:$src))>;
+def avx512_v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128X:$src, (i8 15)))>;
+def avx512_v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128X:$src, (i8 31)))>;
+def avx512_v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
+ VR256X:$src))>;
+def avx512_v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256X:$src, (i8 15)))>;
+def avx512_v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256X:$src, (i8 31)))>;
+
+let Predicates = [HasBWI, HasVLX] in {
+ def : Pat<(xor
+ (bc_v2i64 (avx512_v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128X:$src), (avx512_v16i1sextv16i8)))),
+ (VPABSBZ128rr VR128X:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (avx512_v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128X:$src), (avx512_v8i1sextv8i16)))),
+ (VPABSWZ128rr VR128X:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (avx512_v32i1sextv32i8)),
+ (bc_v4i64 (add (v32i8 VR256X:$src), (avx512_v32i1sextv32i8)))),
+ (VPABSBZ256rr VR256X:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (avx512_v16i1sextv16i16)),
+ (bc_v4i64 (add (v16i16 VR256X:$src), (avx512_v16i1sextv16i16)))),
+ (VPABSWZ256rr VR256X:$src)>;
+}
+let Predicates = [HasAVX512, HasVLX] in {
+ def : Pat<(xor
+ (bc_v2i64 (avx512_v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128X:$src), (avx512_v4i1sextv4i32)))),
+ (VPABSDZ128rr VR128X:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (avx512_v8i1sextv8i32)),
+ (bc_v4i64 (add (v8i32 VR256X:$src), (avx512_v8i1sextv8i32)))),
+ (VPABSDZ256rr VR256X:$src)>;
+}
+
+let Predicates = [HasAVX512] in {
+def : Pat<(xor
+ (bc_v8i64 (v16i1sextv16i32)),
+ (bc_v8i64 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
+ (VPABSDZrr VR512:$src)>;
+def : Pat<(xor
+ (bc_v8i64 (v8i1sextv8i64)),
+ (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
+ (VPABSQZrr VR512:$src)>;
+}
+let Predicates = [HasBWI] in {
+def : Pat<(xor
+ (bc_v8i64 (v64i1sextv64i8)),
+ (bc_v8i64 (add (v64i8 VR512:$src), (v64i1sextv64i8)))),
+ (VPABSBZrr VR512:$src)>;
+def : Pat<(xor
+ (bc_v8i64 (v32i1sextv32i16)),
+ (bc_v8i64 (add (v32i16 VR512:$src), (v32i1sextv32i16)))),
+ (VPABSWZrr VR512:$src)>;
+}
+
+multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
+
+ defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>;
+}
+
+defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>;
+defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>;
+
+//===---------------------------------------------------------------------===//
+// Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+ defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info,
+ HasAVX512>, XS;
+}
+
+defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>;
+defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src)))))>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+}
+
+multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo> {
+
+ defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+ defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode,
+ avx512vl_f64_info>, XD, VEX_W;
+}
+
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
+
+let Predicates = [HasVLX] in {
+def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+ (VMOVDDUPZ128rm addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VMOVDDUPZ128rm addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+ (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
+ SSE_ALU_ITINS_S>;
+defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
+ SSE_ALU_ITINS_S>;
+
+defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasBWI>;
+
+defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Extract & Insert Integer Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ def mr : AVX512Ii8<opc, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1),
+ imm:$src2)))),
+ addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst,
+ (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, TAPD;
+
+ defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
+ }
+}
+
+multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst,
+ (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, PD;
+
+ let hasSideEffects = 0 in
+ def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX, TAPD;
+
+ defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
+ }
+}
+
+multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
+ RegisterClass GRC> {
+ let Predicates = [HasDQI] in {
+ def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GRC:$dst,
+ (extractelt (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, TAPD;
+
+ def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (_.VT _.RC:$src1),
+ imm:$src2),addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD;
+ }
+}
+
+defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>;
+defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>;
+defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>;
+defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W;
+
+multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, PatFrag LdFrag> {
+ def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, PatFrag LdFrag> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V;
+
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>;
+ }
+}
+
+multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, RegisterClass GRC> {
+ let Predicates = [HasDQI] in {
+ def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, GRC:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
+ EVEX_4V, TAPD;
+
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
+ _.ScalarLdFrag>, TAPD;
+ }
+}
+
+defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info,
+ extloadi8>, TAPD;
+defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
+ extloadi16>, PD;
+defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
+defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+//===----------------------------------------------------------------------===//
+multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+ AVX512VLVectorVTInfo VTInfo_FP>{ // NOTE: VTInfo_I is unused in this body.
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>,
+ EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX_4V;
+}
+
+defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
+defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - Byte shift Left/Right
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
+ Format MRMm, string OpcodeStr, X86VectorVTInfo _>{
+ def rr : AVX512<opc, MRMr,
+ (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>;
+ def rm : AVX512<opc, MRMm,
+ (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,(_.VT (OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i8 imm:$src2))))]>;
+}
+
+multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
+ Format MRMm, string OpcodeStr, Predicate prd>{
+ let Predicates = [prd] in
+ defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v64i8_info>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v16i8x_info>, EVEX_V128;
+ }
+}
+defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
+ HasBWI>, AVX512PDIi8Base, EVEX_4V;
+defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
+ HasBWI>, AVX512PDIi8Base, EVEX_4V;
+
+
+multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
+ string OpcodeStr, X86VectorVTInfo _dst,
+ X86VectorVTInfo _src>{
+ def rr : AVX512BI<opc, MRMSrcReg,
+ (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _dst.RC:$dst,(_dst.VT
+ (OpNode (_src.VT _src.RC:$src1),
+ (_src.VT _src.RC:$src2))))]>;
+ def rm : AVX512BI<opc, MRMSrcMem,
+ (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _dst.RC:$dst,(_dst.VT
+ (OpNode (_src.VT _src.RC:$src1),
+ (_src.VT (bitconvert
+ (_src.LdFrag addr:$src2))))))]>;
+}
+
+multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
+ string OpcodeStr, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
+ v64i8_info>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info,
+ v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info,
+ v16i8x_info>, EVEX_V128;
+ }
+}
+
+defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
+ HasBWI>, EVEX_4V;
+
+multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT _.RC:$src3),
+ (i8 imm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V;
+ defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (bitconvert (_.LdFrag addr:$src3))),
+ (i8 imm:$src4)), 1, 0>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (i8 imm:$src4)), 1, 0>, EVEX_B,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ }// Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128;
+ defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256;
+ }
+}
+
+defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>;
+defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - FixupImm
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.IntVT _.RC:$src3),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>;
+ defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.IntVT (bitconvert (_.LdFrag addr:$src3))),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>;
+ defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.IntVT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>, EVEX_B;
+ } // Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _>{
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+ "$src2, $src3, {sae}, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.IntVT _.RC:$src3),
+ (i32 imm:$src4),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+ } // Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86VectorVTInfo _src3VT> {
+ let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
+ ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>;
+
+ defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+ "$src2, $src3, {sae}, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 imm:$src4),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+ defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT (scalar_to_vector
+ (_src3VT.ScalarLdFrag addr:$src3))),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>;
+ } // Constraints = "$src1 = $dst", Predicates = [HasAVX512]
+}
+
+multiclass avx512_fixupimm_packed_all<AVX512VLVectorVTInfo _Vec>{
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
+ avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info128>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V128;
+ defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info256>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V256;
+ }
+}
+
+defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+ f32x_info, v4i32x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+ f64x_info, v2i64x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VFIXUPIMMPS : avx512_fixupimm_packed_all<avx512vl_f32_info>,
+ EVEX_CD8<32, CD8VF>;
+defm VFIXUPIMMPD : avx512_fixupimm_packed_all<avx512vl_f64_info>,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+
+
+
+// Patterns used to select scalar fp arithmetic instructions from either
+// (the examples use SSE mnemonics; these patterns pick the AVX-512 forms):
+//
+// (1) a scalar fp operation followed by a blend
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// A[0] += B[0];
+// return A;
+// }
+//
+// Previously we generated:
+// addss %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// __m128 C = A + B;
+// return (__m128) {C[0], A[1], A[2], A[3]};
+// }
+//
+// Previously we generated:
+// addps %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+
+// TODO: Some canonicalization in lowering would simplify the number of
+// patterns we have to try to match.
+multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+ let Predicates = [HasAVX512] in {
+ // extracted scalar math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
+ FR32X:$src))))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
+ FR32X:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+
+ // vector math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst),
+ (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst),
+ (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
+
+ // extracted masked scalar math op with insert via movss
+ def : Pat<(X86Movss (v4f32 VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))),
+ FR32X:$src2),
+ FR32X:$src0))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X),
+ VK1WM:$mask, v4f32:$src1,
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
+ }
+}
+
+defm : AVX512_scalar_math_f32_patterns<fadd, "ADD">;
+defm : AVX512_scalar_math_f32_patterns<fsub, "SUB">;
+defm : AVX512_scalar_math_f32_patterns<fmul, "MUL">;
+defm : AVX512_scalar_math_f32_patterns<fdiv, "DIV">;
+
+multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+ let Predicates = [HasAVX512] in {
+ // extracted scalar math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
+ FR64X:$src))))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
+ FR64X:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+ // vector math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst),
+ (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst),
+ (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
+
+ // extracted masked scalar math op with insert via movsd
+ def : Pat<(X86Movsd (v2f64 VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))),
+ FR64X:$src2),
+ FR64X:$src0))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X),
+ VK1WM:$mask, v2f64:$src1,
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
+ }
+}
+
+defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">;
+defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">;
+defm : AVX512_scalar_math_f64_patterns<fmul, "MUL">;
+defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
new file mode 100644
index 000000000000..bfd21c062aa2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -0,0 +1,1375 @@
+//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the integer arithmetic instructions in the X86
+// architecture.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LEA - Load Effective Address
+let SchedRW = [WriteLEA] in {
+let hasSideEffects = 0 in
+def LEA16r : I<0x8D, MRMSrcMem, // no selection pattern
+ (outs GR16:$dst), (ins anymem:$src),
+ "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16;
+let isReMaterializable = 1 in
+def LEA32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins anymem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem, // GR32 result, 64-bit mode only
+ (outs GR32:$dst), (ins lea64_32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
+ OpSize32, Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+ "lea{q}\t{$src|$dst}, {$dst|$src}",
+ [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Fixed-Register Multiplication and Division Instructions.
+//
+
+// SchedModel info for instructions that load one value and get the second
+// (and possibly third) value from a register.
+// This is used for instructions that put the memory operand before the
+// other (register) uses.
+class SchedLoadReg<SchedWrite SW> : Sched<[SW,
+ // Memory operand: five ReadDefault entries.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Register reads (implicit or explicit): two ReadAfterLd entries.
+ ReadAfterLd, ReadAfterLd]>;
+
+// Extra precision multiplication
+
+// AL is really implied by AX, but the registers in Defs must match the
+// SDNode results (i8, i32).
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, GR8:$src)),
+ (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in
+def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
+ "mul{w}\t$src",
+ [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in
+def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
+ "mul{l}\t$src",
+ [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
+ IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+ "mul{q}\t$src",
+ [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
+ IIC_MUL64>, Sched<[WriteIMul]>;
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+ "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, (loadi8 addr:$src))),
+ (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let mayLoad = 1, hasSideEffects = 0 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+ "mul{w}\t$src",
+ [], IIC_MUL16_MEM>, OpSize16, SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+ "mul{l}\t$src",
+ [], IIC_MUL32_MEM>, OpSize32, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+ "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
+} // mayLoad, hasSideEffects
+
+let hasSideEffects = 0 in {
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [],
+ IIC_IMUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [],
+ IIC_IMUL16_RR>, OpSize16, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [],
+ IIC_IMUL32_RR>, OpSize32, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
+ IIC_IMUL64_RR>, Sched<[WriteIMul]>;
+
+let mayLoad = 1 in {
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+ "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+ "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize16,
+ SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+ "imul{l}\t$src", [], IIC_IMUL32_MEM>, OpSize32,
+ SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+ "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
+} // mayLoad
+} // hasSideEffects
+
+
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst" in {
+
+let isCommutable = 1, SchedRW = [WriteIMul] in {
+// X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
+ TB, OpSize16;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
+ TB, OpSize32;
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
+ TB;
+} // isCommutable, SchedRW
+
+// Register-Memory Signed Integer Multiply
+let SchedRW = [WriteIMulLd, ReadAfterLd] in {
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, (load addr:$src2)))],
+ IIC_IMUL16_RM>,
+ TB, OpSize16;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, (load addr:$src2)))],
+ IIC_IMUL32_RM>,
+ TB, OpSize32;
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, (load addr:$src2)))],
+ IIC_IMUL64_RM>,
+ TB;
+} // SchedRW
+} // Constraints = "$src1 = $dst"
+
+} // Defs = [EFLAGS]
+
+// Surprisingly enough, these are not two address instructions!
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteIMul] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, imm:$src2))],
+ IIC_IMUL16_RRI>, OpSize16;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
+ IIC_IMUL16_RRI>, OpSize16;
+def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, imm:$src2))],
+ IIC_IMUL32_RRI>, OpSize32;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
+ IIC_IMUL32_RRI>, OpSize32;
+def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
+ IIC_IMUL64_RRI>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
+ IIC_IMUL64_RRI>;
+} // SchedRW
+
+// Memory-Integer Signed Integer Multiply
+let SchedRW = [WriteIMulLd] in {
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1), imm:$src2))],
+ IIC_IMUL16_RMI>,
+ OpSize16;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
+ (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i16immSExt8:$src2))], IIC_IMUL16_RMI>,
+ OpSize16;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1), imm:$src2))],
+ IIC_IMUL32_RMI>, OpSize32;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
+ (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i32immSExt8:$src2))],
+ IIC_IMUL32_RMI>, OpSize32;
+def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i64immSExt32:$src2))],
+ IIC_IMUL64_RMI>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
+ (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i64immSExt8:$src2))],
+ IIC_IMUL64_RMI>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+
+
+// Unsigned division/remainder.
+let hasSideEffects = 1 in { // so that we don't speculatively execute
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "div{b}\t$src", [], IIC_DIV8_REG>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "div{w}\t$src", [], IIC_DIV16>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "div{l}\t$src", [], IIC_DIV32>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+ "div{q}\t$src", [], IIC_DIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "div{b}\t$src", [], IIC_DIV8_MEM>,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "div{w}\t$src", [], IIC_DIV16>, OpSize16,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+ "div{l}\t$src", [], IIC_DIV32>,
+ SchedLoadReg<WriteIDivLd>, OpSize32;
+// RDX:RAX/[mem64] = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+ "div{q}\t$src", [], IIC_DIV64>,
+ SchedLoadReg<WriteIDivLd>;
+} // mayLoad
+
+// Signed division/remainder.
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "idiv{b}\t$src", [], IIC_IDIV8>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+ "idiv{q}\t$src", [], IIC_IDIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "idiv{b}\t$src", [], IIC_IDIV8>,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+ "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+ "idiv{q}\t$src", [], IIC_IDIV64>,
+ SchedLoadReg<WriteIDivLd>;
+} // mayLoad
+} // hasSideEffects = 1
+
+//===----------------------------------------------------------------------===//
+// Two address Instructions.
+//
+
+// unary instructions
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "neg{b}\t$dst",
+ [(set GR8:$dst, (ineg GR8:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "neg{w}\t$dst",
+ [(set GR16:$dst, (ineg GR16:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>, OpSize16;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "neg{l}\t$dst",
+ [(set GR32:$dst, (ineg GR32:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>, OpSize32;
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
+ [(set GR64:$dst, (ineg GR64:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+// Read-modify-write negate.
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
+ "neg{b}\t$dst",
+ [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
+ "neg{w}\t$dst",
+ [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
+ "neg{l}\t$dst",
+ [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+ [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+// Note: NOT does not set EFLAGS!
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+// Match xor -1 to not. Favors these over a move imm + xor to save code size.
+let AddedComplexity = 15 in {
+def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "not{b}\t$dst",
+ [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "not{w}\t$dst",
+ [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize16;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "not{l}\t$dst",
+ [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>, OpSize32;
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
+ [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
+} // AddedComplexity
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
+ "not{b}\t$dst",
+ [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
+ "not{w}\t$dst",
+ [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ OpSize16;
+def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
+ "not{l}\t$dst",
+ [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ OpSize32;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+ [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+} // SchedRW
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "inc{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
+ IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "inc{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+ IIC_UNARY_REG>, OpSize16;
+def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "inc{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+ IIC_UNARY_REG>, OpSize32;
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
+ IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+// Short forms are not valid in 64-bit mode. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+ "inc{w}\t$dst", [], IIC_UNARY_REG>,
+ OpSize16, Requires<[Not64BitMode]>;
+def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+ "inc{l}\t$dst", [], IIC_UNARY_REG>,
+ OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
+} // Constraints = "$src1 = $dst", SchedRW
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+ def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+ def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "dec{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
+ IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "dec{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+ IIC_UNARY_REG>, OpSize16;
+def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "dec{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+ IIC_UNARY_REG>, OpSize32;
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
+ IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+// Short forms are not valid in 64-bit mode. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+ "dec{w}\t$dst", [], IIC_UNARY_REG>,
+ OpSize16, Requires<[Not64BitMode]>;
+def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+ "dec{l}\t$dst", [], IIC_UNARY_REG>,
+ OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
+} // Constraints = "$src1 = $dst", SchedRW
+
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+ def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+ def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+} // Defs = [EFLAGS]
+
+/// X86TypeInfo - This is a bunch of information that describes relevant X86
+/// information about value types. For example, it can tell you what
+/// register class and preferred load to use.
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
+ PatFrag loadnode, X86MemOperand memoperand, ImmType immkind,
+ Operand immoperand, SDPatternOperator immoperator,
+ Operand imm8operand, SDPatternOperator imm8operator,
+ bit hasOddOpcode, OperandSize opSize,
+ bit hasREX_WPrefix> {
+ /// VT - This is the value type itself.
+ ValueType VT = vt;
+
+ /// InstrSuffix - This is the suffix used on instructions with this type. For
+ /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q".
+ string InstrSuffix = instrsuffix;
+
+ /// RegClass - This is the register class associated with this type. For
+ /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64.
+ RegisterClass RegClass = regclass;
+
+ /// LoadNode - This is the load node associated with this type. For
+ /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64.
+ PatFrag LoadNode = loadnode;
+
+ /// MemOperand - This is the memory operand associated with this type. For
+ /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem.
+ X86MemOperand MemOperand = memoperand;
+
+ /// ImmEncoding - This is the encoding of an immediate of this type. For
+ /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32S
+ /// since the immediate field of i64 instructions is a 32-bit sign extended
+ /// value.
+ ImmType ImmEncoding = immkind;
+
+ /// ImmOperand - This is the operand kind of an immediate of this type. For
+ /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 ->
+ /// i64i32imm since the immediate field of i64 instructions is a 32-bit sign
+ /// extended value.
+ Operand ImmOperand = immoperand;
+
+ /// ImmOperator - This is the operator that should be used to match an
+ /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32).
+ SDPatternOperator ImmOperator = immoperator;
+
+ /// Imm8Operand - This is the operand kind to use for an imm8 of this type.
+ /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is
+ /// only used for instructions that have a sign-extended imm8 field form.
+ Operand Imm8Operand = imm8operand;
+
+ /// Imm8Operator - This is the operator that should be used to match an 8-bit
+ /// sign extended immediate of this kind in a pattern (e.g. i16immSExt8).
+ SDPatternOperator Imm8Operator = imm8operator;
+
+ /// HasOddOpcode - This bit is true if the instruction should have an odd (as
+ /// opposed to even) opcode. Operations on i8 are usually even, operations on
+ /// other datatypes are odd.
+ bit HasOddOpcode = hasOddOpcode;
+
+ /// OpSize - Selects whether the instruction needs a 0x66 prefix based on
+ /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this
+ /// to OpSize16. i32 sets this to OpSize32.
+ OperandSize OpSize = opSize;
+
+ /// HasREX_WPrefix - This bit is set to true if the instruction should have
+ /// the REX.W prefix. This is set for i64 types.
+ bit HasREX_WPrefix = hasREX_WPrefix;
+}
+
+def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
+
+
+def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
+ Imm8, i8imm, imm8_su, i8imm, invalid_node,
+ 0, OpSizeFixed, 0>;
+def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
+ Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su,
+ 1, OpSize16, 0>;
+def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
+ Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su,
+ 1, OpSize32, 0>;
+def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
+ Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su,
+ 1, OpSizeFixed, 1>;
+
+/// ITy - This instruction base class takes the type info for the instruction.
+/// Using this, it:
+/// 1. Concatenates together the instruction mnemonic with the appropriate
+/// suffix letter, a tab, and the arguments.
+/// 2. Infers whether the instruction should have a 0x66 prefix byte.
+/// 3. Infers whether the instruction should have a REX.W prefix.
+/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
+/// or 1 (for i16,i32,i64 operations).
+class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
+ string mnemonic, string args, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
+ opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
+ f, outs, ins,
+ !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
+ itin> {
+
+ // Infer instruction prefixes from type info.
+ let OpSize = typeinfo.OpSize;
+ let hasREX_WPrefix = typeinfo.HasREX_WPrefix;
+}
+
+// BinOpRR - Instructions like "add reg, reg, reg".
+class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, list<dag> pattern, InstrItinClass itin,
+ Format f = MRMDestReg>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+ Sched<[WriteALU]>;
+
+// BinOpRR_F - Instructions like "cmp reg, reg", where the pattern has
+// just EFLAGS as a result.
+class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f = MRMDestReg>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+ IIC_BIN_NONMEM, f>;
+
+// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result.
+class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+ IIC_BIN_NONMEM>;
+
+// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result, and has EFLAGS as input.
+class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
+ EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : ITy<opcode, MRMSrcReg, typeinfo,
+ (outs typeinfo.RegClass:$dst),
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>,
+ Sched<[WriteALU]> {
+ // The disassembler should know about this, but not the asmparser.
+ let isCodeGenOnly = 1;
+ let ForceDisassemble = 1;
+ let hasSideEffects = 0;
+}
+
+// BinOpRR_RFF_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
+class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+ Sched<[WriteALU]> {
+ // The disassembler should know about this, but not the asmparser.
+ let isCodeGenOnly = 1;
+ let ForceDisassemble = 1;
+ let hasSideEffects = 0;
+}
+
+// BinOpRM - Instructions like "add reg, reg, [mem]".
+class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_MEM>
+ : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+ Sched<[WriteALULd, ReadAfterLd]>;
+
+// BinOpRM_R - Like "add reg, reg, [mem]"; the pattern sets only the register.
+class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_F - Instructions like "cmp reg, [mem]".
+class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RF - Like "add reg, reg, [mem]"; the pattern sets reg and EFLAGS.
+class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RFF - Like "adc reg, reg, [mem]"; also takes EFLAGS as an input.
+class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
+ EFLAGS))], IIC_BIN_CARRY_MEM>;
+
+// BinOpRI - Instructions like "add reg, reg, imm".
+class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, dag outlist, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+ Sched<[WriteALU]> {
+ let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpRI_F - Instructions like "cmp reg, imm"; no outputs, the pattern sets only EFLAGS.
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_RF - Instructions like "add reg, reg, imm"; the pattern sets $dst and EFLAGS.
+class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+// BinOpRI_RFF - Instructions like "adc reg, reg, imm"; EFLAGS is both an input and a result.
+class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
+ EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRI8 - Instructions like "add reg, reg, imm8". Base class: outputs and pattern come from the subclass.
+class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, dag outlist, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+ Sched<[WriteALU]> {
+ let ImmT = Imm8; // Always 8-bit immediate, regardless of the operand type.
+}
+
+// BinOpRI8_F - Instructions like "cmp reg, imm8"; no outputs, the pattern sets only EFLAGS.
+class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RF - Instructions like "add reg, reg, imm8"; the pattern sets $dst and EFLAGS.
+class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8"; EFLAGS is both an input and a result.
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
+ EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpMR - Instructions like "add [mem], reg". Base class: no outputs; the pattern comes from the subclass.
+class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM>
+ : ITy<opcode, MRMDestMem, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+ Sched<[WriteALULd, WriteRMW]>;
+
+// BinOpMR_RMW - Instructions like "add [mem], reg"; load-op-store pattern on $dst, EFLAGS implicit.
+class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// BinOpMR_RMW_FF - Instructions like "adc [mem], reg"; load-op-store pattern that also reads EFLAGS.
+class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+ addr:$dst),
+ (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMR_F - Instructions like "cmp [mem], reg". NOTE(review): inherits WriteRMW from BinOpMR though the pattern has no store.
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
+
+// BinOpMI - Instructions like "add [mem], imm". Base class: no outputs; the pattern comes from the subclass.
+class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_MEM>
+ : ITy<opcode, f, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+ Sched<[WriteALULd, WriteRMW]> {
+ let ImmT = typeinfo.ImmEncoding; // Immediate encoding is taken from the type info.
+}
+
+// BinOpMI_RMW - Instructions like "add [mem], imm"; load-op-store pattern on $dst, EFLAGS implicit.
+class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI<opcode, mnemonic, typeinfo, f,
+ [(store (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+// BinOpMI_RMW_FF - Instructions like "adc [mem], imm"; load-op-store pattern that also reads EFLAGS.
+class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI<opcode, mnemonic, typeinfo, f,
+ [(store (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMI_F - Instructions like "cmp [mem], imm". NOTE(review): inherits WriteRMW from BinOpMI though the pattern has no store.
+class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI<opcode, mnemonic, typeinfo, f,
+ [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src))]>;
+
+// BinOpMI8 - Instructions like "add [mem], imm8". The opcode is fixed at 0x82 rather than a parameter.
+class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
+ Format f, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_MEM>
+ : ITy<0x82, f, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+ Sched<[WriteALULd, WriteRMW]> {
+ let ImmT = Imm8; // Always 8-bit immediate, regardless of the operand type.
+}
+
+// BinOpMI8_RMW - Instructions like "add [mem], imm8"; load-op-store pattern on $dst, EFLAGS implicit.
+class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(store (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8"; load-op-store pattern that also reads EFLAGS.
+class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(store (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMI8_F - Instructions like "cmp [mem], imm8". NOTE(review): inherits WriteRMW from BinOpMI8 though the pattern has no store.
+class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(set EFLAGS, (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src))]>;
+
+// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : ITy<opcode, RawFrm, typeinfo,
+ (outs), (ins typeinfo.ImmOperand:$src),
+ mnemonic, operands, [], itin>, Sched<[WriteALU]> { // Empty pattern list.
+ let ImmT = typeinfo.ImmEncoding;
+ let Uses = [areg]; // areg is not an explicit operand: it is implicitly read...
+ let Defs = [areg, EFLAGS]; // ...and implicitly written, together with EFLAGS.
+ let hasSideEffects = 0;
+}
+
+// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define
+// and use EFLAGS.
+class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
+ IIC_BIN_CARRY_NONMEM> {
+ let Uses = [areg, EFLAGS]; // Overrides BinOpAI's Uses to add EFLAGS as an input.
+}
+
+// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
+ let Defs = [EFLAGS]; // Overrides BinOpAI's Defs: areg is only read here, not written.
+}
+
+/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (...".
+/// BaseOpc: rr/mr forms; BaseOpc2: rr_REV/rm forms; BaseOpc4: accumulator-imm forms.
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnodeflag, SDNode opnode, // opnodeflag: register-destination forms; opnode: memory RMW forms.
+ bit CommutableRR, bit ConvertibleToThreeAddress> {
+ let Defs = [EFLAGS] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = CommutableRR in {
+ def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+ } // isConvertibleToThreeAddress
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+ def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+
+ def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
+
+ def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
+ } // isConvertibleToThreeAddress
+ } // Constraints = "$src1 = $dst"
+
+ def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+ // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+ // not in 64-bit mode.
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+ hasSideEffects = 0 in {
+ let Constraints = "$src1 = $dst" in
+ def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+ let mayLoad = 1, mayStore = 1 in
+ def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>;
+ } // Predicates = [Not64BitMode], isCodeGenOnly, ForceDisassemble
+ } // Defs = [EFLAGS]
+ // BinOpAI sets its own Uses/Defs, so the defs below sit outside the let above.
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+}
+
+/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and
+/// SBB.
+/// BaseOpc: rr/mr forms; BaseOpc2: rr_REV/rm forms; BaseOpc4: accumulator-imm forms.
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnode, bit CommutableRR,
+ bit ConvertibleToThreeAddress> {
+ let Uses = [EFLAGS], Defs = [EFLAGS] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = CommutableRR in {
+ def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+ } // isConvertibleToThreeAddress
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
+
+ def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ } // isConvertibleToThreeAddress
+ } // Constraints = "$src1 = $dst"
+
+ def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+ // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+ // not in 64-bit mode.
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+ hasSideEffects = 0 in {
+ let Constraints = "$src1 = $dst" in
+ def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+ let mayLoad = 1, mayStore = 1 in
+ def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>;
+ } // Predicates = [Not64BitMode], isCodeGenOnly, ForceDisassemble
+ } // Uses = [EFLAGS], Defs = [EFLAGS]
+ // BinOpAI_RFF sets its own Uses/Defs, so the defs below sit outside the let above.
+ def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+}
+
+/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
+/// defined with "(set EFLAGS, (...". It would be really nice to find a way
+/// to factor this with the other ArithBinOp_*.
+/// BaseOpc: rr/mr forms; BaseOpc2: rr_REV/rm forms; BaseOpc4: accumulator-imm forms.
+multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnode,
+ bit CommutableRR, bit ConvertibleToThreeAddress> {
+ let Defs = [EFLAGS] in {
+ let isCommutable = CommutableRR in {
+ def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ } // isConvertibleToThreeAddress
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+
+ def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ } // isConvertibleToThreeAddress
+
+ def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+ // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+ // not in 64-bit mode.
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+ hasSideEffects = 0 in {
+ def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+ let mayLoad = 1 in
+ def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>;
+ } // Predicates = [Not64BitMode], isCodeGenOnly, ForceDisassemble
+ } // Defs = [EFLAGS]
+ // BinOpAI_F sets its own Uses/Defs, so the defs below sit outside the let above.
+ def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+}
+
+
+defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+ X86and_flag, and, 1, 0>; // Last two args: CommutableRR, ConvertibleToThreeAddress.
+defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+ X86or_flag, or, 1, 0>;
+defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+ X86xor_flag, xor, 1, 0>;
+defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+ X86add_flag, add, 1, 1>; // The only instantiation marked convertible to three-address.
+let isCompare = 1 in {
+defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+ X86sub_flag, sub, 0, 0>;
+}
+
+// Arithmetic with EFLAGS as an additional input (ArithBinOp_RFF).
+defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+ 1, 0>;
+defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+ 0, 0>;
+
+let isCompare = 1 in {
+defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Semantically, test instructions are similar to AND, except they don't
+// generate a result. From an encoding perspective, they are very different:
+// they don't have all the usual imm8 and REV forms, and are encoded into a
+// different space.
+def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86cmp (and_su node:$lhs, node:$rhs), 0)>; // X86cmp of the and_su result against 0.
+
+let isCompare = 1 in {
+ let Defs = [EFLAGS] in {
+ let isCommutable = 1 in {
+ def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat>;
+ def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat>;
+ def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat>;
+ def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat>;
+ } // isCommutable
+
+ def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
+ def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
+ def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
+ def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+
+ def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+ def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+ def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+ def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+ def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
+ def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>;
+ def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
+ def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
+
+ // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+ // register class is constrained to GR8_NOREX. This pseudo is explicitly
+ // marked side-effect free, since it doesn't have an isel pattern like
+ // other test instructions.
+ let isPseudo = 1, hasSideEffects = 0 in
+ def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+ "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ } // Defs = [EFLAGS]
+
+ def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+} // isCompare
+
+//===----------------------------------------------------------------------===//
+// ANDN Instruction
+// The rr/rm patterns below match X86and_flag of (not $src1) with $src2.
+multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+ PatFrag ld_frag> {
+ def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
+ IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS,
+ (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>,
+ Sched<[WriteALULd, ReadAfterLd]>;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V;
+ defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W;
+}
+
+let Predicates = [HasBMI] in { // Extra patterns for plain 'and'; the defs above only match X86and_flag.
+ def : Pat<(and (not GR32:$src1), GR32:$src2),
+ (ANDN32rr GR32:$src1, GR32:$src2)>;
+ def : Pat<(and (not GR64:$src1), GR64:$src2),
+ (ANDN64rr GR64:$src1, GR64:$src2)>;
+ def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
+ (ANDN32rm GR32:$src1, addr:$src2)>;
+ def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
+ (ANDN64rm GR64:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// MULX Instruction
+// No isel patterns are attached; EDX/RDX is added as an implicit use at the defm.
+multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+ let isCommutable = 1 in
+ def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
+ !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>;
+
+ let mayLoad = 1 in
+ def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
+ !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+ let Uses = [EDX] in
+ defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>;
+ let Uses = [RDX] in
+ defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// ADCX Instruction
+// $src0 is tied to $dst; the patterns use X86adc_flag, so EFLAGS is read and written.
+let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS],
+ Constraints = "$src0 = $dst", AddedComplexity = 10 in {
+ let SchedRW = [WriteALU] in {
+ def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS,
+ (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))],
+ IIC_BIN_CARRY_NONMEM>, T8PD;
+ def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS,
+ (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))],
+ IIC_BIN_CARRY_NONMEM>, T8PD;
+ } // SchedRW
+
+ let mayLoad = 1, SchedRW = [WriteALULd] in {
+ def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS,
+ (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))],
+ IIC_BIN_CARRY_MEM>, T8PD;
+
+ def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS,
+ (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))],
+ IIC_BIN_CARRY_MEM>, T8PD;
+ } // mayLoad, SchedRW
+}
+
+//===----------------------------------------------------------------------===//
+// ADOX Instruction
+// NOTE(review): unlike ADCX, these defs have no pattern and no input tied to $dst -- confirm.
+let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS],
+ Uses = [EFLAGS] in {
+ let SchedRW = [WriteALU] in {
+ def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
+
+ def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
+ } // SchedRW
+
+ let mayLoad = 1, SchedRW = [WriteALULd] in {
+ def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
+
+ def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
+ } // mayLoad, SchedRW
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 000000000000..ba970bc2048e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,233 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call. X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense. Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement, Segment.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include <cassert>
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP and Disp being offset accordingly. The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType; // Selects which member of Base is meaningful.
+
+ union {
+ unsigned Reg;
+ int FrameIndex;
+ } Base;
+
+ unsigned Scale; // Asserted to be 1, 2, 4 or 8 when operands are built.
+ unsigned IndexReg;
+ int Disp; // Immediate displacement, or the offset given to CreateGA when GV is set.
+ const GlobalValue *GV; // When non-null, the displacement operand is a global address.
+ unsigned GVOpFlags; // Passed along with GV when the global-address operand is created.
+
+ X86AddressMode()
+ : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr),
+ GVOpFlags(0) {
+ Base.Reg = 0;
+ }
+
+ void getFullAddress(SmallVectorImpl<MachineOperand> &MO) { // Appends five operands to MO.
+ assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
+
+ if (BaseType == X86AddressMode::RegBase)
+ MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, false,
+ false, false, false, 0, false));
+ else {
+ assert(BaseType == X86AddressMode::FrameIndexBase);
+ MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
+ }
+
+ MO.push_back(MachineOperand::CreateImm(Scale));
+ MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, false, false,
+ false, false, 0, false));
+
+ if (GV)
+ MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
+ else
+ MO.push_back(MachineOperand::CreateImm(Disp));
+
+ MO.push_back(MachineOperand::CreateReg(0, false, false, false, false, false,
+ false, 0, false)); // Fifth operand: always register 0.
+ }
+};
+
+/// Compute the addressing mode from a machine instruction starting with the
+/// given operand.
+static inline X86AddressMode getAddressFromInstr(const MachineInstr *MI,
+ unsigned Operand) {
+ X86AddressMode AM;
+ const MachineOperand &Op0 = MI->getOperand(Operand);
+ if (Op0.isReg()) {
+ AM.BaseType = X86AddressMode::RegBase;
+ AM.Base.Reg = Op0.getReg();
+ } else { // Any non-register base is read as a frame index.
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = Op0.getIndex();
+ }
+
+ const MachineOperand &Op1 = MI->getOperand(Operand + 1);
+ AM.Scale = Op1.getImm();
+
+ const MachineOperand &Op2 = MI->getOperand(Operand + 2);
+ AM.IndexReg = Op2.getReg();
+
+ const MachineOperand &Op3 = MI->getOperand(Operand + 3);
+ if (Op3.isGlobal())
+ AM.GV = Op3.getGlobal(); // NOTE(review): Disp and GVOpFlags keep their defaults here.
+ else
+ AM.Disp = Op3.getImm();
+
+ return AM; // Operand + 4 is not read.
+}
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with scale 1 and no index or displacement. An example is: DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+ // Because memory references are always represented with five
+ // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction.
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
+/// Replace the address used in the instruction with the direct memory
+/// reference. Operand is the index of the first of the five address operands.
+static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand,
+ unsigned Reg) {
+ // Direct memory address is in a form of: Reg, 1 (Scale), NoReg, 0, NoReg.
+ MI->getOperand(Operand).setReg(Reg);
+ MI->getOperand(Operand + 1).setImm(1);
+ MI->getOperand(Operand + 2).setReg(0);
+ MI->getOperand(Operand + 3).setImm(0);
+ MI->getOperand(Operand + 4).setReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) { // Adds 1, NoReg, Offset, NoReg.
+ return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0); // No base operand is added here.
+}
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, const MachineOperand& Offset) { // Adds 1, NoReg, Offset, NoReg.
+ return MIB.addImm(1).addReg(0).addOperand(Offset).addReg(0); // Offset is added as an operand, unchanged.
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+/// isKill controls the kill state applied to the Reg operand.
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill, int Offset) {
+ return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg]. Reg1 is added as the base and Reg2 as the index, with scale 1.
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+ unsigned Reg1, bool isKill1,
+ unsigned Reg2, bool isKill2) {
+ return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+ .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) { // Adds AM to MIB as five operands.
+ assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+ if (AM.BaseType == X86AddressMode::RegBase)
+ MIB.addReg(AM.Base.Reg);
+ else {
+ assert(AM.BaseType == X86AddressMode::FrameIndexBase);
+ MIB.addFrameIndex(AM.Base.FrameIndex);
+ }
+
+ MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+ if (AM.GV) // A global displacement carries AM.Disp as its offset.
+ MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+ else
+ MIB.addImm(AM.Disp);
+
+ return MIB.addReg(0); // Fifth operand: always register 0.
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. The
+/// reference uses the frame index FI as its base operand until it is resolved.
+/// A constant offset may be specified as well; a memory operand describing the
+/// fixed-stack access is also attached to the instruction.
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MCInstrDesc &MCID = MI->getDesc();
+ auto Flags = MachineMemOperand::MONone; // Load/store flags follow the instruction description.
+ if (MCID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (MCID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ return addOffset(MIB.addFrameIndex(FI), Offset)
+ .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+ unsigned GlobalBaseReg, unsigned char OpFlags) {
+ // FIXME: factor this. Operands added: GlobalBaseReg, 1, NoReg, CPI, NoReg.
+ return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+ .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
+}
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
new file mode 100644
index 000000000000..c73c95019f8d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -0,0 +1,112 @@
+//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 conditional move and set on condition
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+
+// CMOV instructions.
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
+ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ isCommutable = 1, SchedRW = [WriteALU] in {
+ def NAME#16rr
+ : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR16:$dst,
+ (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
+ IIC_CMOV16_RR>, TB, OpSize16;
+ def NAME#32rr
+ : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR32:$dst,
+ (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
+ IIC_CMOV32_RR>, TB, OpSize32;
+ def NAME#64rr
+ :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR64:$dst,
+ (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))],
+ IIC_CMOV32_RR>, TB;
+ }
+
+ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ SchedRW = [WriteALULd, ReadAfterLd] in {
+ def NAME#16rm
+ : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ CondNode, EFLAGS))], IIC_CMOV16_RM>,
+ TB, OpSize16;
+ def NAME#32rm
+ : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ CondNode, EFLAGS))], IIC_CMOV32_RM>,
+ TB, OpSize32;
+ def NAME#64rm
+ :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+ } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // end multiclass
+
+
+// Conditional Moves.
+defm CMOVO : CMOV<0x40, "cmovo" , X86_COND_O>;
+defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>;
+defm CMOVB : CMOV<0x42, "cmovb" , X86_COND_B>;
+defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>;
+defm CMOVE : CMOV<0x44, "cmove" , X86_COND_E>;
+defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>;
+defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>;
+defm CMOVA : CMOV<0x47, "cmova" , X86_COND_A>;
+defm CMOVS : CMOV<0x48, "cmovs" , X86_COND_S>;
+defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>;
+defm CMOVP : CMOV<0x4A, "cmovp" , X86_COND_P>;
+defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>;
+defm CMOVL : CMOV<0x4C, "cmovl" , X86_COND_L>;
+defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>;
+defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>;
+defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>;
+
+
+// SetCC instructions.
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
+ let Uses = [EFLAGS] in {
+ def r : I<opc, MRMXr, (outs GR8:$dst), (ins),
+ !strconcat(Mnemonic, "\t$dst"),
+ [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
+ IIC_SET_R>, TB, Sched<[WriteALU]>;
+ def m : I<opc, MRMXm, (outs), (ins i8mem:$dst),
+ !strconcat(Mnemonic, "\t$dst"),
+ [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
+ IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
+ } // Uses = [EFLAGS]
+}
+
+defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set
+defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set
+defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than
+defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal
+defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to
+defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to
+defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal
+defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than
+defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is sign bit set
+defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is sign bit not set
+defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set
+defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set
+defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than
+defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal
+defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal
+defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
new file mode 100644
index 000000000000..3c27eb8077d0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -0,0 +1,1932 @@
+//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various pseudo instructions used by the compiler,
+// as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pattern Matching Support
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 32 bits.
+ return getI32Imm((unsigned)N->getZExtValue(), SDLoc(N));
+}]>;
+
+def GetLo8XForm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 8 bits.
+ return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Random Pseudo Instructions.
+
+// PIC base construction. This expands to code that looks like this:
+// call $next_inst
+// popl %destreg
+let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
+ def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+ "", []>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKDOWN",
+ []>,
+ Requires<[NotLP64]>;
+def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[NotLP64]>;
+}
+def : Pat<(X86callseq_start timm:$amt1),
+ (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKDOWN",
+ []>,
+ Requires<[IsLP64]>;
+def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[IsLP64]>;
+}
+def : Pat<(X86callseq_start timm:$amt1),
+ (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1, Defs = [EFLAGS] in {
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+ (outs),
+ (ins GR8:$al,
+ i64imm:$regsavefi, i64imm:$offset,
+ variable_ops),
+ "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+ [(X86vastart_save_xmm_regs GR8:$al,
+ imm:$regsavefi,
+ imm:$offset),
+ (implicit EFLAGS)]>;
+
+// The VAARG_64 pseudo-instruction takes the address of the va_list,
+// and places the address of the next argument into a register.
+let Defs = [EFLAGS] in
+def VAARG_64 : I<0, Pseudo,
+ (outs GR64:$dst),
+ (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+ "#VAARG_64 $dst, $ap, $size, $mode, $align",
+ [(set GR64:$dst,
+ (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
+ (implicit EFLAGS)]>;
+
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR32:$dst,
+ (X86SegAlloca GR32:$size))]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR64:$dst,
+ (X86SegAlloca GR64:$size))]>,
+ Requires<[In64BitMode]>;
+}
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets. These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having a separate instruction is the extra effects not
+// modelled by ordinary calls, such as the stack pointer change.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def WIN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size),
+ "# dynamic stack allocation",
+ [(X86WinAlloca GR32:$size)]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
+ "# dynamic stack allocation",
+ [(X86WinAlloca GR64:$size)]>,
+ Requires<[In64BitMode]>;
+
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let SchedRW = [WriteSystem] in {
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+ isCodeGenOnly = 1, isReturn = 1 in {
+ def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>;
+
+ // CATCHRET needs a custom inserter for SEH.
+ let usesCustomInserter = 1 in
+ def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from),
+ "# CATCHRET",
+ [(catchret bb:$dst, bb:$from)]>;
+}
+
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in
+def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>;
+
+// This instruction is responsible for re-establishing stack pointers after an
+// exception has been caught and we are rejoining normal control flow in the
+// parent function or funclet. It generally sets ESP and EBP, and optionally
+// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us
+// elsewhere.
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in
+def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>;
+
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in {
+ def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[Not64BitMode]>;
+ def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
+ "#EH_SJLJ_SETJMP64",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[In64BitMode]>;
+ let isTerminator = 1 in {
+ def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[Not64BitMode]>;
+ def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
+ "#EH_SJLJ_LONGJMP64",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[In64BitMode]>;
+ }
+}
+} // SchedRW
+
+let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
+ def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
+ "#EH_SjLj_Setup\t$dst", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by unwind info.
+//
+let isPseudo = 1 in {
+ def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
+ "#SEH_PushReg $reg", []>;
+ def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveReg $reg, $dst", []>;
+ def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveXMM $reg, $dst", []>;
+ def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
+ "#SEH_StackAlloc $size", []>;
+ def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
+ "#SEH_SetFrame $reg, $offset", []>;
+ def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
+ "#SEH_PushFrame $mode", []>;
+ def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
+ "#SEH_EndPrologue", []>;
+ def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
+ "#SEH_Epilogue", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by segmented stacks.
+//
+
+// This is lowered into a RET instruction by MCInstLower. We need
+// this so that we don't have to have a MachineBasicBlock which ends
+// with a RET and also has successors.
+let isPseudo = 1 in {
+def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
+ "", []>;
+
+// This instruction is lowered to a RET followed by a MOV. The two
+// instructions are not generated on a higher level since then the
+// verifier sees a MachineBasicBlock ending with a non-terminator.
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
+ "", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instruction mapping movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isPseudo = 1, AddedComplexity = 20 in
+def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// Other widths can also make use of the 32-bit xor, which may have a smaller
+// encoding and avoid partial register updates.
+def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
+def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
+ let AddedComplexity = 20;
+}
+
+let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
+ AddedComplexity = 15 in {
+ // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
+ // which only require 3 bytes compared to MOV32ri which requires 5.
+ let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
+ def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, 1)]>;
+ def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, -1)]>;
+ }
+
+ // MOV16ri is 4 bytes, so the instructions above are smaller.
+ def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
+ def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
+}
+
+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
+// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
+// FIXME: Add itinerary class and Schedule.
+def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
+ [(set GR32:$dst, i32immSExt8:$src)]>,
+ Requires<[OptForMinSize, NotWin64WithoutFP]>;
+def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
+ [(set GR64:$dst, i64immSExt8:$src)]>,
+ Requires<[OptForMinSize, NotWin64WithoutFP]>;
+}
+
+// Materialize i64 constant where top 32-bits are zero. This could theoretically
+// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
+// that would make it more difficult to rematerialize.
+let isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isPseudo = 1, hasSideEffects = 0 in
+def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>;
+
+// This 64-bit pseudo-move can be used for both a 64-bit constant that is
+// actually the zero-extension of a 32-bit constant and for labels in the
+// x86-64 small code model.
+def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>;
+
+let AddedComplexity = 1 in
+def : Pat<(i64 mov64imm32:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
+
+// Use sbb to materialize carry bit.
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+ [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+ [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+ [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU]
+
+
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C64r)>;
+
+def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C64r)>;
+
+// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" in the hope that the and
+// will be eliminated and that the sbb can be extended up to a wider type. When
+// this happens, it is great. However, if we are left with an 8-bit sbb and an
+// and, we might as well just match it as a setb.
+def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
+ (SETBr)>;
+
+// (add OP, SETB) -> (adc OP, 0)
+def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
+ (ADC8ri GR8:$op, 0)>;
+def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
+ (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
+ (ADC64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETB) -> (sbb OP, 0)
+def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+ (SBB8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+ (SBB32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+ (SBB64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETCC_CARRY) -> (adc OP, 0)
+def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
+ (ADC8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
+ (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
+ (ADC64ri8 GR64:$op, 0)>;
+
+//===----------------------------------------------------------------------===//
+// String Pseudo Instructions
+//
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ Requires<[Not64BitMode]>;
+def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+ Requires<[Not64BitMode]>;
+def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+ Requires<[Not64BitMode]>;
+}
+
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
+def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ Requires<[In64BitMode]>;
+def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+ Requires<[In64BitMode]>;
+def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+ Requires<[In64BitMode]>;
+def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+ [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
+ Requires<[In64BitMode]>;
+}
+
+// FIXME: Should use "(X86rep_stos AL)" as the pattern.
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
+ let Uses = [AL,ECX,EDI] in
+ def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+ Requires<[Not64BitMode]>;
+ let Uses = [AX,ECX,EDI] in
+ def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+ Requires<[Not64BitMode]>;
+ let Uses = [EAX,ECX,EDI] in
+ def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+ Requires<[Not64BitMode]>;
+}
+
+let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
+ let Uses = [AL,RCX,RDI] in
+ def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+ Requires<[In64BitMode]>;
+ let Uses = [AX,RCX,RDI] in
+ def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+ Requires<[In64BitMode]>;
+ let Uses = [RAX,RCX,RDI] in
+ def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+ Requires<[In64BitMode]>;
+
+ let Uses = [RAX,RCX,RDI] in
+ def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+ [(X86rep_stos i64)], IIC_REP_STOS>, REP,
+ Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// ELF TLS Support
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ usesCustomInserter = 1, Uses = [ESP] in {
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_addr32",
+ [(X86tlsaddr tls32addr:$sym)]>,
+ Requires<[Not64BitMode]>;
+def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_base_addr32",
+ [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+ Requires<[Not64BitMode]>;
+}
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ usesCustomInserter = 1, Uses = [RSP] in {
+def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_addr64",
+ [(X86tlsaddr tls64addr:$sym)]>,
+ Requires<[In64BitMode]>;
+def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_base_addr64",
+ [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
+ Requires<[In64BitMode]>;
+}
+
+// Darwin TLS Support
+// For i386, the address of the thunk is passed on the stack, on return the
+// address of the variable is in %eax. %ecx is trashed during the function
+// call. All other registers are preserved.
+let Defs = [EAX, ECX, EFLAGS],
+ Uses = [ESP],
+ usesCustomInserter = 1 in
+def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLSCall_32",
+ [(X86TLSCall addr:$sym)]>,
+ Requires<[Not64BitMode]>;
+
+// For x86_64, the address of the thunk is passed in %rdi, but the
+// pseudo directly uses the symbol, so do not add an implicit use of
+// %rdi. The lowering will do the right thing with RDI.
+// On return the address of the variable is in %rax. All other
+// registers are preserved.
+let Defs = [RAX, EFLAGS],
+ Uses = [RSP],
+ usesCustomInserter = 1 in
+def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLSCall_64",
+ [(X86TLSCall addr:$sym)]>,
+ Requires<[In64BitMode]>;
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions
+
+// CMOV* - Used to implement the SELECT DAG operation. Expanded after
+// instruction selection into a branch sequence.
+multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
+ def CMOV#NAME : I<0, Pseudo,
+ (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond),
+ "#CMOV_"#NAME#" PSEUDO!",
+ [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond,
+ EFLAGS)))]>;
+}
+
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
+ // X86 doesn't have 8-bit conditional moves. Use a customInserter to
+ // emit control flow. An alternative to this is to mark i8 SELECT as Promote,
+ // however that requires promoting the operands, and can induce additional
+ // i8 register pressure.
+ defm _GR8 : CMOVrr_PSEUDO<GR8, i8>;
+
+ let Predicates = [NoCMov] in {
+ defm _GR32 : CMOVrr_PSEUDO<GR32, i32>;
+ defm _GR16 : CMOVrr_PSEUDO<GR16, i16>;
+ } // Predicates = [NoCMov]
+
+ // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
+ // SSE1/SSE2.
+ let Predicates = [FPStackf32] in
+ defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>;
+
+ let Predicates = [FPStackf64] in
+ defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>;
+
+ defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
+
+ defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
+ defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
+ defm _FR128 : CMOVrr_PSEUDO<FR128, f128>;
+ defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>;
+ defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>;
+ defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>;
+ defm _V8F32 : CMOVrr_PSEUDO<VR256, v8f32>;
+ defm _V4F64 : CMOVrr_PSEUDO<VR256, v4f64>;
+ defm _V4I64 : CMOVrr_PSEUDO<VR256, v4i64>;
+ defm _V8I64 : CMOVrr_PSEUDO<VR512, v8i64>;
+ defm _V8F64 : CMOVrr_PSEUDO<VR512, v8f64>;
+ defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>;
+ defm _V8I1 : CMOVrr_PSEUDO<VK8, v8i1>;
+ defm _V16I1 : CMOVrr_PSEUDO<VK16, v16i1>;
+ defm _V32I1 : CMOVrr_PSEUDO<VK32, v32i1>;
+ defm _V64I1 : CMOVrr_PSEUDO<VK64, v64i1>;
+} // usesCustomInserter = 1, Uses = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// FIXME: Use normal instructions and add lock prefix dynamically.
+
+// Memory barriers
+
+// TODO: Get this to fold the constant into the instruction.
+let isCodeGenOnly = 1, Defs = [EFLAGS] in
+def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
+ "or{l}\t{$zero, $dst|$dst, $zero}", [],
+ IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK,
+ Sched<[WriteALULd, WriteRMW]>;
+
+let hasSideEffects = 1 in
+def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
+ "#MEMBARRIER",
+ [(X86MemBarrier)]>, Sched<[WriteLoad]>;
+
+// RegOpc corresponds to the mr version of the instruction
+// ImmOpc corresponds to the mi version of the instruction
+// ImmOpc8 corresponds to the mi8 version of the instruction
+// ImmMod corresponds to the instruction format of the mi and mi8 versions
+multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
+ Format ImmMod, SDPatternOperator Op, string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+ MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, GR8:$src2))],
+ IIC_ALU_NONMEM>, LOCK;
+
+def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, GR16:$src2))],
+ IIC_ALU_NONMEM>, OpSize16, LOCK;
+
+def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, GR32:$src2))],
+ IIC_ALU_NONMEM>, OpSize32, LOCK;
+
+def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, GR64:$src2))],
+ IIC_ALU_NONMEM>, LOCK;
+
+def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+ ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))],
+ IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))],
+ IIC_ALU_MEM>, OpSize16, LOCK;
+
+def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))],
+ IIC_ALU_MEM>, OpSize32, LOCK;
+
+def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))],
+ IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))],
+ IIC_ALU_MEM>, OpSize16, LOCK;
+
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))],
+ IIC_ALU_MEM>, OpSize32, LOCK;
+
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))],
+ IIC_ALU_MEM>, LOCK;
+
+}
+
+}
+
+// LOCK-prefixed memory-destination forms of the basic ALU operations.
+// Arguments presumably are <RegOpc, ImmOpc, ImmOpc8, ImmMod, Op, mnemonic>;
+// verify against the LOCK_ArithBinOp header.
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, X86lock_add, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, X86lock_sub, "sub">;
+defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, X86lock_or , "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">;
+
+// LOCK-prefixed unary read-modify-write on memory, in 8/16/32/64-bit forms.
+// Every form is selected for an X86lock_add of the constant Increment to
+// addr:$dst and defines EFLAGS. Opc8 encodes the 8-bit form; Opc is shared by
+// the wider forms. All forms are only available under NotSlowIncDec.
+multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ int Increment, string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW], Predicates = [NotSlowIncDec] in {
+def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
+ !strconcat(mnemonic, "{b}\t$dst"),
+ [(set EFLAGS, (X86lock_add addr:$dst, (i8 Increment)))],
+ IIC_UNARY_MEM>, LOCK;
+def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+ !strconcat(mnemonic, "{w}\t$dst"),
+ [(set EFLAGS, (X86lock_add addr:$dst, (i16 Increment)))],
+ IIC_UNARY_MEM>, OpSize16, LOCK;
+def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+ !strconcat(mnemonic, "{l}\t$dst"),
+ [(set EFLAGS, (X86lock_add addr:$dst, (i32 Increment)))],
+ IIC_UNARY_MEM>, OpSize32, LOCK;
+def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+ !strconcat(mnemonic, "{q}\t$dst"),
+ [(set EFLAGS, (X86lock_add addr:$dst, (i64 Increment)))],
+ IIC_UNARY_MEM>, LOCK;
+}
+}
+
+// inc/dec are matched as a locked add of +1 / -1. Both share opcodes 0xFE
+// (8-bit) and 0xFF (wider forms) and differ only in the ModRM form.
+defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, 1, "inc">;
+defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, -1, "dec">;
+
+// Atomic compare and swap.
+// LCMPXCHG_UnOp defines a single LOCK-prefixed record whose only explicit
+// operand is the memory location; its pattern is (frag addr:$ptr). Implicit
+// register Defs/Uses are expected to come from a `let` around the defm.
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+ SDPatternOperator frag, X86MemOperand x86memop,
+ InstrItinClass itin> {
+let isCodeGenOnly = 1, usesCustomInserter = 1 in {
+ def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
+ !strconcat(mnemonic, "\t$ptr"),
+ [(frag addr:$ptr)], itin>, TB, LOCK;
+}
+}
+
+// LCMPXCHG_BinOp defines 8/16/32/64-bit LOCK-prefixed compare-and-swap forms
+// taking a memory location and a $swap register. Each width both uses and
+// defines the accumulator of that width (AL/AX/EAX/RAX) and defines EFLAGS.
+// The last operand of each pattern (1/2/4/8) matches the operand width in
+// bytes.
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic, SDPatternOperator frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
+ let Defs = [AL, EFLAGS], Uses = [AL] in
+ def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+ !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+ let Defs = [AX, EFLAGS], Uses = [AX] in
+ def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+ !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
+ let Defs = [EAX, EFLAGS], Uses = [EAX] in
+ def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+ !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
+ let Defs = [RAX, EFLAGS], Uses = [RAX] in
+ def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+ !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+}
+}
+
+// cmpxchg8b takes only a memory operand; its register operands are implicit
+// (uses EAX, EBX, ECX and EDX; defines EAX, EDX and EFLAGS).
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
+ SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
+ X86cas8, i64mem,
+ IIC_CMPX_LOCK_8B>;
+}
+
+// This pseudo must be used when the frame uses RBX as
+// the base pointer. Indeed, in such a situation RBX is a reserved
+// register and the register allocator will ignore any use/def of
+// it. In other words, the register allocator will not fix the clobbering of
+// RBX that will happen when setting the arguments for the instruction.
+//
+// Unlike the actual related instruction, we mark that this one
+// defines EBX (instead of using EBX).
+// The rationale is that we will define RBX during the expansion of
+// the pseudo. The argument feeding EBX is ebx_input.
+//
+// The additional argument, $ebx_save, is a temporary register used to
+// save the value of RBX across the actual instruction.
+//
+// To make sure the register assigned to $ebx_save does not interfere with
+// the definition of the actual instruction, we use a definition $dst which
+// is tied to $ebx_save. That way, the live-range of $ebx_save spans across
+// the instruction and we are sure we will have a valid register to restore
+// the value of RBX.
+let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
+ SchedRW = [WriteALULd, WriteRMW], isCodeGenOnly = 1, isPseudo = 1,
+ Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in {
+def LCMPXCHG8B_SAVE_EBX :
+ I<0, Pseudo, (outs GR32:$dst),
+ (ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
+ !strconcat("cmpxchg8b", "\t$ptr"),
+ [(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
+ GR32:$ebx_save))],
+ IIC_CMPX_LOCK_8B>;
+}
+
+
+// cmpxchg16b: 64-bit counterpart of LCMPXCHG8B, only available when
+// HasCmpxchg16b holds. Register operands are implicit (uses RAX, RBX, RCX and
+// RDX; defines RAX, RDX and EFLAGS).
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+ Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+ X86cas16, i128mem,
+ IIC_CMPX_LOCK_16B>, REX_W;
+}
+
+// Same as LCMPXCHG8B_SAVE_EBX but for the 16-byte variant.
+let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
+ Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW],
+ isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
+ usesCustomInserter = 1 in {
+def LCMPXCHG16B_SAVE_RBX :
+ I<0, Pseudo, (outs GR64:$dst),
+ (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
+ !strconcat("cmpxchg16b", "\t$ptr"),
+ [(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
+ GR64:$rbx_save))],
+ IIC_CMPX_LOCK_16B>;
+}
+
+// Register-operand cmpxchg matching X86cas: opcode 0xB0 for the 8-bit form,
+// 0xB1 for the 16/32/64-bit forms.
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
+ X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+
+// Atomic exchange and add
+// ATOMIC_LOAD_BINOP defines 8/16/32/64-bit forms whose result $dst is tied to
+// the $val input and which define EFLAGS. `frag` is the name prefix of the
+// PatFrag to match: the width suffix (_8/_16/_32/_64) is appended and the
+// resulting name is resolved with !cast<PatFrag>. opc8 encodes the 8-bit
+// form; opc is shared by the wider forms.
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+ string frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+ let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
+ def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin8>;
+ def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize16;
+ def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>, OpSize32;
+ def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ }
+}
+
+// LOCK-prefixed xadd, matched from atomic_load_add_{8,16,32,64}: opcode 0xc0
+// for the 8-bit form, 0xc1 for the wider forms.
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
+ IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
+ TB, LOCK;
+
+/* The following multiclass tries to make sure that in code like
+ * x.store (immediate op x.load(acquire), release)
+ * and
+ * x.store (register op x.load(acquire), release)
+ * an operation directly on memory is generated instead of wasting a register.
+ * It is not automatic as atomic_store/load are only lowered to MOV instructions
+ * extremely late to prevent them from being accidentally reordered in the backend
+ * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
+ *
+ * Each record is a Pseudo whose pattern is an atomic store to addr:$dst of
+ * `op` applied to an atomic load from the same addr:$dst and $src, with $src
+ * being either an immediate (mi forms) or a register (mr forms).
+ */
+multiclass RELEASE_BINOP_MI<SDNode op> {
+ def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
+ "#BINOP "#NAME#"8mi PSEUDO!",
+ [(atomic_store_8 addr:$dst, (op
+ (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
+ def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src),
+ "#BINOP "#NAME#"8mr PSEUDO!",
+ [(atomic_store_8 addr:$dst, (op
+ (atomic_load_8 addr:$dst), GR8:$src))]>;
+ // NAME#16 is not generated as 16-bit arithmetic instructions are considered
+ // costly and avoided as far as possible by this backend anyway
+ def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
+ "#BINOP "#NAME#"32mi PSEUDO!",
+ [(atomic_store_32 addr:$dst, (op
+ (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
+ def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+ "#BINOP "#NAME#"32mr PSEUDO!",
+ [(atomic_store_32 addr:$dst, (op
+ (atomic_load_32 addr:$dst), GR32:$src))]>;
+ def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "#BINOP "#NAME#"64mi32 PSEUDO!",
+ [(atomic_store_64 addr:$dst, (op
+ (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
+ def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+ "#BINOP "#NAME#"64mr PSEUDO!",
+ [(atomic_store_64 addr:$dst, (op
+ (atomic_load_64 addr:$dst), GR64:$src))]>;
+}
+let Defs = [EFLAGS] in {
+ defm RELEASE_ADD : RELEASE_BINOP_MI<add>;
+ defm RELEASE_AND : RELEASE_BINOP_MI<and>;
+ defm RELEASE_OR : RELEASE_BINOP_MI<or>;
+ defm RELEASE_XOR : RELEASE_BINOP_MI<xor>;
+ // Note: we don't deal with sub, because subtractions of constants are
+ // optimized into additions before this code can run.
+}
+
+// Same as above, but for floating-point.
+// The integer atomic load is bitconverted to f32/f64, combined with $src by
+// `op`, and the result is bitconverted back to an integer for the atomic
+// store. The 32-bit form requires HasSSE1, the 64-bit form HasSSE2.
+// FIXME: imm version.
+// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
+// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
+let usesCustomInserter = 1 in {
+multiclass RELEASE_FP_BINOP_MI<SDNode op> {
+ def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
+ "#BINOP "#NAME#"32mr PSEUDO!",
+ [(atomic_store_32 addr:$dst,
+ (i32 (bitconvert (op
+ (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
+ FR32:$src))))]>, Requires<[HasSSE1]>;
+ def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
+ "#BINOP "#NAME#"64mr PSEUDO!",
+ [(atomic_store_64 addr:$dst,
+ (i64 (bitconvert (op
+ (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
+ FR64:$src))))]>, Requires<[HasSSE2]>;
+}
+defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
+// FIXME: Add fsub, fmul, fdiv, ...
+}
+
+// RELEASE_UNOP defines 8/16/32/64-bit Pseudos that take only a memory operand.
+// Each one matches an atomic store to addr:$dst of the caller-supplied dag for
+// that width (dag8/dag16/dag32/dag64).
+multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
+ def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
+ "#UNOP "#NAME#"8m PSEUDO!",
+ [(atomic_store_8 addr:$dst, dag8)]>;
+ def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
+ "#UNOP "#NAME#"16m PSEUDO!",
+ [(atomic_store_16 addr:$dst, dag16)]>;
+ def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
+ "#UNOP "#NAME#"32m PSEUDO!",
+ [(atomic_store_32 addr:$dst, dag32)]>;
+ def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
+ "#UNOP "#NAME#"64m PSEUDO!",
+ [(atomic_store_64 addr:$dst, dag64)]>;
+}
+
+// Increment/decrement of an atomically loaded value stored back atomically to
+// the same address, expressed as an add of 1 / -1. Both define EFLAGS and
+// require NotSlowIncDec.
+let Defs = [EFLAGS] in {
+ defm RELEASE_INC : RELEASE_UNOP<
+ (add (atomic_load_8 addr:$dst), (i8 1)),
+ (add (atomic_load_16 addr:$dst), (i16 1)),
+ (add (atomic_load_32 addr:$dst), (i32 1)),
+ (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
+ defm RELEASE_DEC : RELEASE_UNOP<
+ (add (atomic_load_8 addr:$dst), (i8 -1)),
+ (add (atomic_load_16 addr:$dst), (i16 -1)),
+ (add (atomic_load_32 addr:$dst), (i32 -1)),
+ (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
+}
+/*
+TODO: These don't work because the type inference of TableGen fails.
+TODO: find a way to fix it.
+let Defs = [EFLAGS] in {
+ defm RELEASE_NEG : RELEASE_UNOP<
+ (ineg (atomic_load_8 addr:$dst)),
+ (ineg (atomic_load_16 addr:$dst)),
+ (ineg (atomic_load_32 addr:$dst)),
+ (ineg (atomic_load_64 addr:$dst))>;
+}
+// NOT doesn't set flags.
+defm RELEASE_NOT : RELEASE_UNOP<
+ (not (atomic_load_8 addr:$dst)),
+ (not (atomic_load_16 addr:$dst)),
+ (not (atomic_load_32 addr:$dst)),
+ (not (atomic_load_64 addr:$dst))>;
+*/
+
+// Atomic stores of an immediate, matched from atomic_store_{8,16,32,64}.
+// These are Pseudos; the 64-bit form only accepts an immediate matching
+// i64immSExt32.
+def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
+ "#RELEASE_MOV8mi PSEUDO!",
+ [(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
+def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
+ "#RELEASE_MOV16mi PSEUDO!",
+ [(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
+def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
+ "#RELEASE_MOV32mi PSEUDO!",
+ [(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
+def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "#RELEASE_MOV64mi32 PSEUDO!",
+ [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
+
+// Atomic stores of a register value, matched from
+// atomic_store_{8,16,32,64}. These are Pseudos.
+def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
+ "#RELEASE_MOV8mr PSEUDO!",
+ [(atomic_store_8 addr:$dst, GR8 :$src)]>;
+def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
+ "#RELEASE_MOV16mr PSEUDO!",
+ [(atomic_store_16 addr:$dst, GR16:$src)]>;
+def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+ "#RELEASE_MOV32mr PSEUDO!",
+ [(atomic_store_32 addr:$dst, GR32:$src)]>;
+def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+ "#RELEASE_MOV64mr PSEUDO!",
+ [(atomic_store_64 addr:$dst, GR64:$src)]>;
+
+// Atomic loads into a register, matched from atomic_load_{8,16,32,64}.
+// These are Pseudos.
+def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
+ "#ACQUIRE_MOV8rm PSEUDO!",
+ [(set GR8:$dst, (atomic_load_8 addr:$src))]>;
+def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
+ "#ACQUIRE_MOV16rm PSEUDO!",
+ [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
+def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
+ "#ACQUIRE_MOV32rm PSEUDO!",
+ [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
+def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
+ "#ACQUIRE_MOV64rm PSEUDO!",
+ [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// DAG Pattern Matching Rules
+//===----------------------------------------------------------------------===//
+
+// Use AND/OR to store 0/-1 in memory when optimizing for minsize. This saves
+// binary size compared to a regular MOV, but it introduces an unnecessary
+// load, so is not suitable for regular or optsize functions.
+let Predicates = [OptForMinSize] in {
+def : Pat<(store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
+def : Pat<(store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
+def : Pat<(store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
+def : Pat<(store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
+def : Pat<(store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
+def : Pat<(store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
+}
+
+// In kernel code model, we can get the address of a label
+// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper mcsym:$dst)),
+ (MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// If we have small model and -static mode, it is safe to store global addresses
+// directly as immediates. FIXME: This is really a hack, the 'imm' predicate
+// for MOV64mi32 should handle this sort of thing.
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tconstpool:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tjumptable:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, texternalsym:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, mcsym:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tblockaddress:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+
+def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>;
+def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>;
+
+// Calls
+
+// tls has some funny stuff here...
+// This corresponds to movabs $foo@tpoff, %rax
+def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
+ (MOV64ri32 tglobaltlsaddr :$dst)>;
+// This corresponds to add $foo@tpoff, %rax
+def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
+ (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
+
+
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that is never used by the calling convention is
+// %r11. This happens when calling a vararg function with 6 arguments.
+//
+// Match an X86tcret that uses at most 6 register operands.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+ (X86tcret node:$ptr, node:$off), [{
+ // X86tcret args: (*chain, ptr, imm, regs..., glue)
+ // Count the RegisterSDNode operands from index 3 on and reject the node as
+ // soon as a seventh one is seen.
+ unsigned NumRegs = 0;
+ for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+ if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+ return false;
+ return true;
+}]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[Not64BitMode]>;
+
+// FIXME: This is disabled for 32-bit PIC mode because the global base
+// register which is part of the address mode may be assigned a
+// callee-saved register.
+def : Pat<(X86tcret (load addr:$dst), imm:$off),
+ (TCRETURNmi addr:$dst, imm:$off)>,
+ Requires<[Not64BitMode, IsNotPIC]>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
+ Requires<[NotLP64]>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>,
+ Requires<[NotLP64]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+// Don't fold loads into X86tcret requiring more than 6 regs.
+// There wouldn't be enough scratch registers for base+index.
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
+ (TCRETURNmi64 addr:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
+ Requires<[IsLP64]>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+ (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
+ Requires<[IsLP64]>;
+
+// Normal calls, with various flavors of addresses.
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+ (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+ (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+ (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(X86cmp GR8:$src1, 0),
+ (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+ (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+ (TEST32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(X86cmp GR64:$src1, 0),
+ (TEST64rr GR64:$src1, GR64:$src1)>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+// The patterns match an X86cmov whose first operand is the load; the selected
+// instruction takes the register first and the address second. Because the
+// operands are swapped, each instantiation pairs InvertedCond with the
+// instructions for the opposite condition. All patterns require HasCMov.
+multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
+ Instruction Inst64> {
+ let Predicates = [HasCMov] in {
+ def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
+ (Inst16 GR16:$src2, addr:$src1)>;
+ def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
+ (Inst32 GR32:$src2, addr:$src1)>;
+ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
+ (Inst64 GR64:$src2, addr:$src1)>;
+ }
+}
+
+defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
+defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
+defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
+defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
+defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
+defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
+defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
+defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
+defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
+defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
+defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
+defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
+defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
+defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
+defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
+defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
+
+// zextload bool -> zextload byte
+// i1 stored in one byte in zero-extended form.
+// Upper bits cleanup should be executed before Store.
+def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+// extload bool -> extload byte
+// When extloading from 16-bit and smaller memory locations into 64-bit
+// registers, use zero-extending loads so that the entire 64-bit register is
+// defined, avoiding partial-register updates.
+
+def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+def : Pat<(extloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i32 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+ (MOVZX32rr8 GR8 :$src), sub_16bit)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
+
+// Except for i16 -> i32 since isel expects i16 ops to be promoted to i32.
+def : Pat<(i32 (anyext GR16:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+def : Pat<(i64 (anyext GR8 :$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+
+// Any instruction that defines a 32-bit result zeroes the high half of the
+// 64-bit register, except for the node kinds rejected below: Truncate can be
+// lowered to EXTRACT_SUBREG, and CopyFromReg may be copying from a truncate.
+// AssertSext nodes are rejected as well. Any other 32-bit operation will
+// zero-extend up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg &&
+ N->getOpcode() != ISD::AssertSext;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
+// 3-addressified into an LEA instruction to avoid copies. However, we also
+// want to finally emit these instructions as an or at the end of the code
+// generator to make the generated code easier to read. To do this, we select
+// into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node as an 'add' if no bit can be set in both operands.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+ // Constant RHS: every bit set in the constant must be known zero in the LHS.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+ // General case: every bit must be known zero in at least one operand.
+ APInt KnownZero0, KnownOne0;
+ CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+ APInt KnownZero1, KnownOne1;
+ CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+ return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+// Try this before selecting to OR.
+let AddedComplexity = 5, SchedRW = [WriteALU] in {
+
+let isConvertibleToThreeAddress = 1,
+ Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+let isCommutable = 1 in {
+def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "", // orw/addw REG, REG
+ [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "", // orl/addl REG, REG
+ [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "", // orq/addq REG, REG
+ [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+} // isCommutable
+
+// NOTE: These are order specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
+
+def ADD16ri8_DB : I<0, Pseudo,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "", // orw/addw REG, imm8
+ [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
+def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "", // orw/addw REG, imm
+ [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
+
+def ADD32ri8_DB : I<0, Pseudo,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "", // orl/addl REG, imm8
+ [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
+def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "", // orl/addl REG, imm
+ [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
+
+
+def ADD64ri8_DB : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "", // orq/addq REG, imm8
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt8:$src2))]>;
+def ADD64ri32_DB : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "", // orq/addq REG, imm
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt32:$src2))]>;
+}
+} // AddedComplexity, SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+ (SUB16mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+ (SUB32mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+ (SUB64mi8 addr:$dst, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
+ (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// To avoid needing to materialize an immediate in a register, use a 32-bit and
+// with implicit zero-extension instead of a 64-bit and if the immediate has at
+// least 32 bits of leading zeros. If in addition the last 32 bits can be
+// represented with a sign extension of an 8-bit constant, use that.
+// This can also reduce instruction size by eliminating the need for the REX
+// prefix.
+
+// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32.
+let AddedComplexity = 1 in {
+def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
+ (SUBREG_TO_REG
+ (i64 0),
+ (AND32ri8
+ (EXTRACT_SUBREG GR64:$src, sub_32bit),
+ (i32 (GetLo8XForm imm:$imm))),
+ sub_32bit)>;
+
+def : Pat<(and GR64:$src, i64immZExt32:$imm),
+ (SUBREG_TO_REG
+ (i64 0),
+ (AND32ri
+ (EXTRACT_SUBREG GR64:$src, sub_32bit),
+ (i32 (GetLo32XForm imm:$imm))),
+ sub_32bit)>;
+} // AddedComplexity = 1
+
+
+// AddedComplexity is needed due to the increased complexity on the
+// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all
+// the MOVZX patterns keeps them together in DAGIsel tables.
+let AddedComplexity = 1 in {
+// r & (2^16-1) ==> movz
+def : Pat<(and GR32:$src1, 0xffff),
+ (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
+ GR32_ABCD)),
+ sub_8bit))>,
+ Requires<[Not64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
+ (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
+ sub_16bit)>,
+ Requires<[Not64BitMode]>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+ (SUBREG_TO_REG (i64 0),
+ (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
+ sub_32bit)>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
+ sub_32bit)>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
+ sub_32bit)>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
+ Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (EXTRACT_SUBREG (MOVZX32rr8 (i8
+ (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
+ Requires<[In64BitMode]>;
+} // AddedComplexity = 1
+
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+ (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit))>,
+ Requires<[Not64BitMode]>;
+
+def : Pat<(sext_inreg GR16:$src, i8),
+ (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
+ (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
+ sub_16bit)>,
+ Requires<[Not64BitMode]>;
+
+def : Pat<(sext_inreg GR64:$src, i32),
+ (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+ (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+ (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
+ Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+ (EXTRACT_SUBREG (MOVSX32rr8
+ (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
+ Requires<[In64BitMode]>;
+
+// sext, sext_load, zext, zext_load
+def: Pat<(i16 (sext GR8:$src)),
+ (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(sextloadi16i8 addr:$src),
+ (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
+def: Pat<(i16 (zext GR8:$src)),
+ (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(zextloadi16i8 addr:$src),
+ (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+
+// h-register tricks
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc (srl_su (i32 (anyext GR16:$src)), (i8 8)))),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32rr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_16bit)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+ GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+ GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[Not64BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[Not64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[Not64BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+ (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_16bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+ sub_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+
+
+// (shl x, 1) ==> (add x, x)
+// Note that if x is undef (immediate or otherwise), we could theoretically
+// end up with the two uses of x getting different values, producing a result
+// where the least significant bit is not 0. However, the probability of this
+// happening is considered low enough that this is officially not a
+// "real problem".
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// Helper imms that check if a mask doesn't change significant shift bits.
+def immShift32 : ImmLeaf<i8, [{
+ return countTrailingOnes<uint64_t>(Imm) >= 5;
+}]>;
+def immShift64 : ImmLeaf<i8, [{
+ return countTrailingOnes<uint64_t>(Imm) >= 6;
+}]>;
+
+// Shift amount in CL is implicitly masked, so a covering (and CL, m) is dropped.
+multiclass MaskedShiftAmountPats<SDNode frag, string name> {
+ // (shift x (and y, m)) ==> (shift x, y) when m has >= 5 trailing ones
+ def : Pat<(frag GR8:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
+ def : Pat<(frag GR16:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
+ def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
+ def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "8mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "16mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "32mCL") addr:$dst)>;
+
+ // (shift x (and y, m)) ==> (shift x, y) when m has >= 6 trailing ones
+ def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+ (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
+ def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
+ (!cast<Instruction>(name # "64mCL") addr:$dst)>;
+}
+
+defm : MaskedShiftAmountPats<shl, "SHL">;
+defm : MaskedShiftAmountPats<srl, "SHR">;
+defm : MaskedShiftAmountPats<sra, "SAR">;
+defm : MaskedShiftAmountPats<rotl, "ROL">;
+defm : MaskedShiftAmountPats<rotr, "ROR">;
+
+// Double shift amount in CL is implicitly masked; a covering AND mask is dropped.
+multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
+ // (shiftd x, z, (and y, m)) ==> (shiftd x, z, y); m has >= 5 trailing ones
+ def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)),
+ (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
+ def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)),
+ (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;
+
+ // (shiftd x, z, (and y, m)) ==> (shiftd x, z, y); m has >= 6 trailing ones
+ def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)),
+ (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
+}
+
+defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
+defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
+
+// (anyext (setcc_carry)) -> (setcc_carry)
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// add reg, reg
+def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// add reg, mem
+def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
+ (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
+ (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+
+// add reg, imm
+def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>;
+def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(add GR16:$src1, i16immSExt8:$src2),
+ (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(add GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub reg, reg
+def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// sub reg, mem
+def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
+ (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
+ (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+
+// sub reg, imm
+def : Pat<(sub GR8:$src1, imm:$src2),
+ (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, imm:$src2),
+ (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub GR32:$src1, imm:$src2),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
+ (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub 0, reg
+def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
+def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
+def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
+def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+
+// mul reg, reg
+def : Pat<(mul GR16:$src1, GR16:$src2),
+ (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(mul GR32:$src1, GR32:$src2),
+ (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// mul reg, mem
+def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
+ (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
+ (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// mul reg, imm
+def : Pat<(mul GR16:$src1, imm:$src2),
+ (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(mul GR32:$src1, imm:$src2),
+ (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
+ (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
+ (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// reg = mul mem, imm
+def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+ (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
+ (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
+ (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
+ (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Patterns for nodes that do not produce flags, for instructions that do.
+
+// addition
+def : Pat<(add GR64:$src1, GR64:$src2),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+
+// subtraction
+def : Pat<(sub GR64:$src1, GR64:$src2),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Multiply
+def : Pat<(mul GR64:$src1, GR64:$src2),
+ (IMUL64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+ (IMUL64rm GR64:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+ (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+ (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
+ (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
+ (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// Increment/Decrement reg.
+// Do not make INC/DEC if it is slow
+let Predicates = [NotSlowIncDec] in {
+ def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>;
+ def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>;
+ def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>;
+ def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
+ def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>;
+ def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
+ def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
+ def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
+}
+
+// or reg/reg.
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
+
+// or reg/mem
+def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
+ (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
+ (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
+ (OR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
+ (OR64rm GR64:$src1, addr:$src2)>;
+
+// or reg/imm
+def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, i16immSExt8:$src2),
+ (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or GR32:$src1, i32immSExt8:$src2),
+ (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt8:$src2),
+ (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt32:$src2),
+ (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// xor reg/reg
+def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
+
+// xor reg/mem
+def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
+ (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
+ (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
+ (XOR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
+ (XOR64rm GR64:$src1, addr:$src2)>;
+
+// xor reg/imm
+def : Pat<(xor GR8:$src1, imm:$src2),
+ (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, imm:$src2),
+ (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(xor GR32:$src1, imm:$src2),
+ (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
+ (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
+ (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
+ (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
+ (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// and reg/reg
+def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
+
+// and reg/mem
+def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
+ (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
+ (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
+ (AND32rm GR32:$src1, addr:$src2)>;
+def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
+ (AND64rm GR64:$src1, addr:$src2)>;
+
+// and reg/imm
+def : Pat<(and GR8:$src1, imm:$src2),
+ (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, imm:$src2),
+ (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(and GR32:$src1, imm:$src2),
+ (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, i16immSExt8:$src2),
+ (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(and GR32:$src1, i32immSExt8:$src2),
+ (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt8:$src2),
+ (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt32:$src2),
+ (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Bit scan instruction patterns to match explicit zero-undef behavior.
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
+
+// When HasMOVBE is enabled it is possible to get a non-legalized
+// register-register 16 bit bswap. This maps it to a ROL instruction.
+let Predicates = [HasMOVBE] in {
+ def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
new file mode 100644
index 000000000000..4ea223e82be9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -0,0 +1,358 @@
+//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 jump, return, call, and related instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
+ def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret{l}", [], IIC_RET>, OpSize32,
+ Requires<[Not64BitMode]>;
+ def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret{q}", [], IIC_RET>, OpSize32,
+ Requires<[In64BitMode]>;
+ def RETW : I <0xC3, RawFrm, (outs), (ins),
+ "ret{w}",
+ [], IIC_RET>, OpSize16;
+ def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret{l}\t$amt",
+ [], IIC_RET_IMM>, OpSize32,
+ Requires<[Not64BitMode]>;
+ def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret{q}\t$amt",
+ [], IIC_RET_IMM>, OpSize32,
+ Requires<[In64BitMode]>;
+ def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+ "ret{w}\t$amt",
+ [], IIC_RET_IMM>, OpSize16;
+ def LRETL : I <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{l|f}", [], IIC_RET>, OpSize32;
+ def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
+ def LRETW : I <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{w|f}", [], IIC_RET>, OpSize16;
+ def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
+ def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
+ def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
+
+ // The machine return-from-interrupt instructions. Sometimes we need to
+ // perform a post-epilogue stack adjustment, so codegen emits the pseudo
+ // form, which expands to include an SP adjustment if necessary.
+ def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
+ OpSize16;
+ def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
+ IIC_IRET>, OpSize32;
+ def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [],
+ IIC_IRET>, Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in
+ def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
+ def RET : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
+}
+
+// Unconditional branches.
+let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+ def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+ "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
+ "jmp\t$dst", [], IIC_JMP_REL>, OpSize16;
+ def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
+ "jmp\t$dst", [], IIC_JMP_REL>, OpSize32;
+ }
+}
+
+// Conditional branches. Only _1 (brtarget8) has a pattern; _2/_4 have none.
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
+ multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+ def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
+ [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>;
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
+ [], IIC_Jcc>, OpSize16, TB;
+ def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
+ [], IIC_Jcc>, TB, OpSize32;
+ }
+ }
+}
+
+defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
+defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
+defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
+defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
+defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
+defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
+defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
+defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
+defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
+defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
+defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
+defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
+defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
+defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
+defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
+defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+
+// jcx/jecx/jrcx instructions.
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
+ // These are the 32-bit versions of this instruction for the asmparser. In
+ // 32-bit mode, the address-size-prefixed version is jcxz and the unprefixed
+ // version is jecxz.
+ let Uses = [CX] in
+ def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jcxz\t$dst", [], IIC_JCXZ>, AdSize16,
+ Requires<[Not64BitMode]>;
+ let Uses = [ECX] in
+ def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jecxz\t$dst", [], IIC_JCXZ>, AdSize32;
+
+ let Uses = [RCX] in
+ def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64,
+ Requires<[In64BitMode]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
+ [(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJump]>;
+ def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
+ [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;
+
+ def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+ [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJump]>;
+ def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+ [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;
+
+ def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+ [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>;
+ def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+ [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
+
+ let Predicates = [Not64BitMode] in {
+ def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
+ (ins i16imm:$off, i16imm:$seg),
+ "ljmp{w}\t$seg, $off", [],
+ IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+ def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
+ (ins i32imm:$off, i16imm:$seg),
+ "ljmp{l}\t$seg, $off", [],
+ IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+ }
+ def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+ "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+ Sched<[WriteJump]>;
+
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+ "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
+ Sched<[WriteJumpLd]>;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+ "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
+ Sched<[WriteJumpLd]>;
+}
+
+
+// Loop instructions
+let SchedRW = [WriteJump] in {
+def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
+def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
+}
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. ESP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Uses = [ESP] in {
+ def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i32imm_pcrel:$dst),
+ "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
+ Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ let hasSideEffects = 0 in
+ def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+ (outs), (ins i16imm_pcrel:$dst),
+ "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
+ Sched<[WriteJump]>;
+ def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
+ "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
+ OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
+ "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
+ IIC_CALL_MEM>, OpSize16,
+ Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
+ "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
+ OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
+ "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
+ IIC_CALL_MEM>, OpSize32,
+ Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
+
+ let Predicates = [Not64BitMode] in {
+ def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
+ (ins i16imm:$off, i16imm:$seg),
+ "lcall{w}\t$seg, $off", [],
+ IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+ def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
+ (ins i32imm:$off, i16imm:$seg),
+ "lcall{l}\t$seg, $off", [],
+ IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+ }
+
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+ "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
+ Sched<[WriteJumpLd]>;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+ "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
+ Sched<[WriteJumpLd]>;
+ }
+
+
+// Tail call stuff.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ let Uses = [ESP] in {
+ def TCRETURNdi : PseudoI<(outs),
+ (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
+ def TCRETURNri : PseudoI<(outs),
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+ let mayLoad = 1 in
+ def TCRETURNmi : PseudoI<(outs),
+ (ins i32mem_TC:$dst, i32imm:$offset), []>;
+
+ // FIXME: These should be pseudo instructions that are lowered when going to
+ // mcinst.
+ def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
+ (ins i32imm_pcrel:$dst),
+ "jmp\t$dst",
+ [], IIC_JMP_REL>;
+
+ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+ "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
+ let mayLoad = 1 in
+ def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
+ "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
+}
+
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ let Uses = [ESP, EFLAGS] in {
+ def TCRETURNdicc : PseudoI<(outs),
+ (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
+
+ // This gets substituted to a conditional jump instruction in MC lowering.
+ def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
+ (ins i32imm_pcrel:$dst, i32imm:$cond),
+ "",
+ [], IIC_JMP_REL>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Call Instructions...
+//
+
+// RSP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
+ // NOTE: this pattern doesn't match "X86call imm", because we do not know
+ // that the offset between an arbitrary immediate and the call will fit in
+ // the 32-bit pcrel field that we have.
+ def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i64i32imm_pcrel:$dst),
+ "call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
+ Requires<[In64BitMode]>;
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
+ "call{q}\t{*}$dst", [(X86call GR64:$dst)],
+ IIC_CALL_RI>,
+ Requires<[In64BitMode]>;
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
+ "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
+ IIC_CALL_MEM>,
+ Requires<[In64BitMode,FavorMemIndirectCall]>;
+
+ def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+ "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
+ SchedRW = [WriteJump] in {
+ def TCRETURNdi64 : PseudoI<(outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+ []>;
+ def TCRETURNri64 : PseudoI<(outs),
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+ let mayLoad = 1 in
+ def TCRETURNmi64 : PseudoI<(outs),
+ (ins i64mem_TC:$dst, i32imm:$offset), []>;
+
+ def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
+ "jmp\t$dst", [], IIC_JMP_REL>;
+
+ def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+ "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+ let mayLoad = 1 in
+ def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+ "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+ // Win64 wants indirect jumps leaving the function to have a REX_W prefix.
+ let hasREX_WPrefix = 1 in {
+ def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+ "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+ let mayLoad = 1 in
+ def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+ "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ }
+}
+
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ let Uses = [RSP, EFLAGS] in {
+ def TCRETURNdi64cc : PseudoI<(outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset,
+ i32imm:$cond), []>;
+
+ // This gets substituted to a conditional jump instruction in MC lowering.
+ def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$cond),
+ "",
+ [], IIC_JMP_REL>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
new file mode 100644
index 000000000000..af43d9f53325
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
@@ -0,0 +1,186 @@
+//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the sign and zero extension operations.
+//
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+ let Defs = [AX], Uses = [AL] in
+ def CBW : I<0x98, RawFrm, (outs), (ins),
+ "{cbtw|cbw}", [], IIC_CBW>, OpSize16; // AX = signext(AL)
+ let Defs = [EAX], Uses = [AX] in
+ def CWDE : I<0x98, RawFrm, (outs), (ins),
+ "{cwtl|cwde}", [], IIC_CBW>, OpSize32; // EAX = signext(AX)
+
+ let Defs = [AX,DX], Uses = [AX] in
+ def CWD : I<0x99, RawFrm, (outs), (ins),
+ "{cwtd|cwd}", [], IIC_CBW>, OpSize16; // DX:AX = signext(AX)
+ let Defs = [EAX,EDX], Uses = [EAX] in
+ def CDQ : I<0x99, RawFrm, (outs), (ins),
+ "{cltd|cdq}", [], IIC_CBW>, OpSize32; // EDX:EAX = signext(EAX)
+
+
+ let Defs = [RAX], Uses = [EAX] in
+ def CDQE : RI<0x98, RawFrm, (outs), (ins),
+ "{cltq|cdqe}", [], IIC_CBW>; // RAX = signext(EAX)
+
+ let Defs = [RAX,RDX], Uses = [RAX] in
+ def CQO : RI<0x99, RawFrm, (outs), (ins),
+ "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX)
+} // hasSideEffects = 0
+
+
+
+// Sign/Zero extenders
+let hasSideEffects = 0 in {
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
+ TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
+ TB, OpSize16, Sched<[WriteALULd]>;
+} // hasSideEffects = 0
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
+ OpSize32, Sched<[WriteALULd]>;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
+ OpSize32, TB, Sched<[WriteALULd]>;
+
+let hasSideEffects = 0 in {
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
+ TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
+ TB, OpSize16, Sched<[WriteALULd]>;
+} // hasSideEffects = 0
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
+ OpSize32, Sched<[WriteALULd]>;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
+ TB, OpSize32, Sched<[WriteALULd]>;
+
+// Same as the regular MOVZX32rr8/rm8 and MOVSX32rr8/rm8, except that they use
+// the _NOREX operand classes (GR32_NOREX, GR8_NOREX, i8mem_NOREX) instead of
+// the plain ones. This allows them to operate on h registers on x86-64.
+let hasSideEffects = 0, isCodeGenOnly = 1 in {
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALULd]>;
+
+def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALULd]>;
+} // hasSideEffects = 0, isCodeGenOnly = 1
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+ TB, Sched<[WriteALULd]>;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+ TB, Sched<[WriteALULd]>;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+ Sched<[WriteALU]>, Requires<[In64BitMode]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+ Sched<[WriteALULd]>, Requires<[In64BitMode]>;
+
+// movzbq and movzwq encodings for the disassembler
+let hasSideEffects = 0 in {
+def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALULd]>;
+def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALULd]>;
+} // hasSideEffects = 0
+
+// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a
+// 32-bit register.
+def : Pat<(i64 (zext GR8:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>;
+def : Pat<(zextloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+def : Pat<(i64 (zext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>;
+def : Pat<(zextloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+
+// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a
+// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible
+// when the 32-bit value is defined by a truncate or is copied from something
+// where the high bits aren't necessarily all zero. In such cases, we fall back
+// to these explicit zext instructions.
+def : Pat<(i64 (zext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>;
+def : Pat<(i64 (zextloadi64i32 addr:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
new file mode 100644
index 000000000000..4b19f801dae1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
@@ -0,0 +1,443 @@
+//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes FMA (Fused Multiply-Add) instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FMA3 - Intel 3 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+// For all FMA opcodes declared in fma3p_rm and fma3s_rm multiclasses defined
+// below, both the register and memory variants are commutable.
+// For the register form the commutable operands are 1, 2 and 3.
+// For the memory variant the folded operand must be in 3. Thus,
+// in that case, only the operands 1 and 2 can be swapped.
+// Commuting some of the operands may require an opcode change.
+// FMA*213*:
+// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes);
+// operands 1 and 3 (register forms only): *213* --> *231*;
+// operands 2 and 3 (register forms only): *213* --> *132*.
+// FMA*132*:
+// operands 1 and 2 (memory & register forms): *132* --> *231*;
+// operands 1 and 3 (register forms only): *132* --> *132*(no changes);
+// operands 2 and 3 (register forms only): *132* --> *213*.
+// FMA*231*:
+// operands 1 and 2 (memory & register forms): *231* --> *132*;
+// operands 1 and 3 (register forms only): *231* --> *213*;
+// operands 2 and 3 (register forms only): *231* --> *231*(no changes).
+
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
+multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ ValueType OpVT128, ValueType OpVT256,
+ SDPatternOperator Op = null_frag> {
+ def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
+ VR128:$src1, VR128:$src3)))]>;
+
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
+ (MemFrag128 addr:$src3))))]>;
+
+ def Yr : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
+ VR256:$src3)))]>, VEX_L;
+
+ let mayLoad = 1 in
+ def Ym : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst,
+ (OpVT256 (Op VR256:$src2, VR256:$src1,
+ (MemFrag256 addr:$src3))))]>, VEX_L;
+}
+
+multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpcodeStr, string PackTy, string Suff,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+ defm NAME#213#Suff : fma3p_rm<opc213,
+ !strconcat(OpcodeStr, "213", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+ defm NAME#132#Suff : fma3p_rm<opc132,
+ !strconcat(OpcodeStr, "132", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
+ defm NAME#231#Suff : fma3p_rm<opc231,
+ !strconcat(OpcodeStr, "231", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
+}
+
+// Fused Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32>;
+ defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>;
+ defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmaddsub,
+ v4f32, v8f32>;
+ defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmsubadd,
+ v4f32, v8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmadd, v2f64,
+ v4f64>, VEX_W;
+ defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmsub, v2f64,
+ v4f64>, VEX_W;
+ defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmaddsub,
+ v2f64, v4f64>, VEX_W;
+ defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmsubadd,
+ v2f64, v4f64>, VEX_W;
+}
+
+// Fused Negative Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
+ loadv8f32, X86Fnmadd, v4f32, v8f32>;
+ defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
+ loadv8f32, X86Fnmsub, v4f32, v8f32>;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
+ loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+ defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
+ loadv4f64, X86Fnmsub, v2f64, v4f64>, VEX_W;
+}
+
+// All source register operands of FMA opcodes defined in fma3s_rm multiclass
+// can be commuted. In many cases such commute transformation requires an opcode
+// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form
+// would require an opcode change to FMA*231:
+// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
+// -->
+// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
+// Please see more detailed comment at the very beginning of the section
+// defining FMA3 opcodes above.
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ SDPatternOperator OpNode = null_frag> {
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>;
+
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
+}
+
+// These FMA*_Int instructions are defined specially for being used when
+// the scalar FMA intrinsics are lowered to machine instructions, and in that
+// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
+// instructions.
+//
+// All of the FMA*_Int opcodes are defined as commutable here.
+// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial
+// and the corresponding optimizations have been developed.
+// Commuting the 1st operand of FMA*_Int requires some additional analysis,
+// the commute optimization is legal only if all users of FMA*_Int use only
+// the lowest element of the FMA*_Int instruction. Even though such analysis
+// may not be implemented yet, we allow the routines doing the actual commute
+// transformation to decide if one or another instruction is commutable or not.
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0 in
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+ Operand memopr, RegisterClass RC> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+
+ let mayLoad = 1 in
+ def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memopr:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+}
+
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy, string Suff,
+ SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop> {
+ defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
+ x86memop, RC>;
+ defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+ x86memop, RC, OpNode>;
+ defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
+ x86memop, RC>;
+}
+
+// The FMA 213 form is created for lowering of scalar FMA intrinsics
+// to machine instructions.
+// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd
+// operands of the FMA 213 form.
+// The FMA 231 form can only be obtained by commuting the 1st operand of the
+// 213 or 132 forms, which is possible only after special analysis of all uses
+// of the initial instruction. Such analysis does not exist yet, so the 231
+// form of the FMA*_Int instructions is introduced on the optimistic assumption
+// that such analysis will be implemented eventually.
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy, string Suff,
+ RegisterClass RC, Operand memop> {
+ defm NAME#132#Suff : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+ memop, RC>;
+ defm NAME#213#Suff : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+ memop, RC>;
+ defm NAME#231#Suff : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+ memop, RC>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+ SDNode OpNode> {
+ let ExeDomain = SSEPackedSingle in
+ defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
+ FR32, f32mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", "SS",
+ VR128, ssmem>;
+
+ let ExeDomain = SSEPackedDouble in
+ defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "SD", OpNode,
+ FR64, f64mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", "SD",
+ VR128, sdmem>, VEX_W;
+
+ // These patterns use the 123 ordering, instead of 213, even though
+ // they match the intrinsic to the 213 version of the instruction.
+ // This is because src1 is tied to dest, and the scalar intrinsics
+ // require the pass-through values to come from the first source
+ // operand, not the second.
+ let Predicates = [HasFMA] in {
+ def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
+ (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SSr_Int")
+ $src1, $src2, $src3), VR128)>;
+
+ def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
+ (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SDr_Int")
+ $src1, $src2, $src3), VR128)>;
+ } // Predicates = [HasFMA]
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+ int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+ int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+ int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+ int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+
+
+//===----------------------------------------------------------------------===//
+// FMA4 - AMD 4 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+ PatFrag mem_frag> {
+ let isCommutable = 1 in
+ def rr : FMA4<opc, MRMSrcRegOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG;
+ def rm : FMA4<opc, MRMSrcMemOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG;
+ def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_LIG;
+}
+
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+ ComplexPattern mem_cpat, Intrinsic Int> {
+let isCodeGenOnly = 1 in {
+ def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG;
+ def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+ mem_cpat:$src3))]>, VEX_W, VEX_LIG;
+ def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, memop:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+} // isCodeGenOnly = 1
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT128, ValueType OpVT256,
+ PatFrag ld_frag128, PatFrag ld_frag256> {
+ let isCommutable = 1 in
+ def rr : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+ VEX_W;
+ def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
+ (ld_frag128 addr:$src3)))]>, VEX_W;
+ def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+ let isCommutable = 1 in
+ def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+ VEX_W, VEX_L;
+ def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
+ (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L;
+ def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (OpNode VR256:$src1,
+ (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+ def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_L;
+} // isCodeGenOnly = 1
+}
+
+let ExeDomain = SSEPackedSingle in {
+ // Scalar Instructions
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+ fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfmadd_ss>;
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+ fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfmsub_ss>;
+ defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+ X86Fnmadd, loadf32>,
+ fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmadd_ss>;
+ defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+ X86Fnmsub, loadf32>,
+ fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmsub_ss>;
+ // Packed Instructions
+ defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ // Scalar Instructions
+ defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+ fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmadd_sd>;
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+ fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmsub_sd>;
+ defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+ X86Fnmadd, loadf64>,
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmadd_sd>;
+ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+ X86Fnmsub, loadf64>,
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmsub_sd>;
+ // Packed Instructions
+ defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
new file mode 100644
index 000000000000..db83497ee69d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -0,0 +1,285 @@
+//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrFMA3Info.h"
+#include "X86InstrInfo.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Threading.h"
+using namespace llvm;
+
+/// Once-flag handed to llvm::call_once() in initGroupsOnce() below, so that the
+/// initialization of the map 'OpcodeToGroup' is thread safe.
+LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag);
+
+/// Holder of the X86InstrFMA3Info object handed out by getX86InstrFMA3Info().
+static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
+
+/// Returns the address of the object held by X86InstrFMA3InfoObj.
+X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
+  X86InstrFMA3Info &Instance = *X86InstrFMA3InfoObj;
+  return &Instance;
+}
+
+/// Creates one group holding the three register opcodes \p RegOpcodes and the
+/// three memory opcodes \p MemOpcodes, tagged with the attributes \p Attr, and
+/// maps each of the six opcodes to that group in OpcodeToGroup.
+void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes,
+                                   const uint16_t *MemOpcodes, unsigned Attr) {
+  // All six opcodes share this single heap-allocated group object.
+  const X86InstrFMA3Group *Group =
+      new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr);
+
+  // None of the opcodes may already be mapped to a group.
+  for (unsigned Form = 0; Form != 3; ++Form)
+    assert(!OpcodeToGroup.lookup(RegOpcodes[Form]) &&
+           !OpcodeToGroup.lookup(MemOpcodes[Form]) &&
+           "Duplication or rewrite of elements in OpcodeToGroup.");
+
+  // Enter the register forms first and the memory forms second.
+  for (unsigned Form = 0; Form != 3; ++Form)
+    OpcodeToGroup[RegOpcodes[Form]] = Group;
+  for (unsigned Form = 0; Form != 3; ++Form)
+    OpcodeToGroup[MemOpcodes[Form]] = Group;
+}
+
+/// Creates one group holding only the three register opcodes \p RegOpcodes,
+/// tagged with the attributes \p Attr, and maps each of those opcodes to the
+/// group in OpcodeToGroup.
+void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) {
+  // The group has no memory forms, hence the nullptr for the memory opcodes.
+  const X86InstrFMA3Group *Group =
+      new X86InstrFMA3Group(RegOpcodes, nullptr, Attr);
+
+  // None of the opcodes may already be mapped to a group.
+  for (unsigned Form = 0; Form != 3; ++Form)
+    assert(!OpcodeToGroup.lookup(RegOpcodes[Form]) &&
+           "Duplication or rewrite of elements in OpcodeToGroup.");
+
+  for (unsigned Form = 0; Form != 3; ++Form)
+    OpcodeToGroup[RegOpcodes[Form]] = Group;
+}
+
+/// Creates one group holding only the three memory opcodes \p MemOpcodes,
+/// tagged with the attributes \p Attr, and maps each of those opcodes to the
+/// group in OpcodeToGroup.
+void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) {
+  // The group has no register forms, hence the nullptr for the register opcodes.
+  const X86InstrFMA3Group *Group =
+      new X86InstrFMA3Group(nullptr, MemOpcodes, Attr);
+
+  // None of the opcodes may already be mapped to a group.
+  for (unsigned Form = 0; Form != 3; ++Form)
+    assert(!OpcodeToGroup.lookup(MemOpcodes[Form]) &&
+           "Duplication or rewrite of elements in OpcodeToGroup.");
+
+  for (unsigned Form = 0; Form != 3; ++Form)
+    OpcodeToGroup[MemOpcodes[Form]] = Group;
+}
+
+// Each FMA3* macro below expands to static opcode tables in {132, 213, 231}
+// order followed by the matching init*Group() call. The table names are built
+// from the first opcode argument. The variants without an 'A' suffix pass an
+// attribute mask of 0, which is the default of the init*Group() methods.
+
+// Register and memory forms, no attributes.
+#define FMA3RM(R132, R213, R231, M132, M213, M231)                             \
+  FMA3RMA(R132, R213, R231, M132, M213, M231, 0)
+
+// Register and memory forms, attribute mask Attrs.
+#define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs)                     \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initRMGroup(Reg##R132, Mem##R132, (Attrs));
+
+// Register forms only, no attributes.
+#define FMA3R(R132, R213, R231)                                                \
+  FMA3RA(R132, R213, R231, 0)
+
+// Register forms only, attribute mask Attrs.
+#define FMA3RA(R132, R213, R231, Attrs)                                        \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  initRGroup(Reg##R132, (Attrs));
+
+// Memory forms only, no attributes.
+#define FMA3M(M132, M213, M231)                                                \
+  FMA3MA(M132, M213, M231, 0)
+
+// Memory forms only, attribute mask Attrs.
+#define FMA3MA(M132, M213, M231, Attrs)                                        \
+  static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initMGroup(Mem##M132, (Attrs));
+
+// Helper: one register+memory group made of the opcodes
+// Name{132,213,231}<Ty>r and Name{132,213,231}<Ty>m.
+#define FMA3_AVX2_RM(Name, Ty)                                                 \
+  FMA3RM(Name##132##Ty##r, Name##213##Ty##r, Name##231##Ty##r,                 \
+         Name##132##Ty##m, Name##213##Ty##m, Name##231##Ty##m)
+
+// Helper: same as FMA3_AVX2_RM, but for the opcodes carrying the _Int suffix;
+// the group is marked with X86FMA3Intrinsic.
+#define FMA3_AVX2_RM_INT(Name, Ty)                                             \
+  FMA3RMA(Name##132##Ty##r_Int, Name##213##Ty##r_Int, Name##231##Ty##r_Int,    \
+          Name##132##Ty##m_Int, Name##213##Ty##m_Int, Name##231##Ty##m_Int,    \
+          X86InstrFMA3Group::X86FMA3Intrinsic)
+
+// Groups for the PS, PD, PSY and PDY opcodes of Name.
+#define FMA3_AVX2_VECTOR_GROUP(Name)                                           \
+  FMA3_AVX2_RM(Name, PS);                                                      \
+  FMA3_AVX2_RM(Name, PD);                                                      \
+  FMA3_AVX2_RM(Name, PSY);                                                     \
+  FMA3_AVX2_RM(Name, PDY);
+
+// Groups for the SS and SD opcodes of Name, followed by their _Int variants.
+#define FMA3_AVX2_SCALAR_GROUP(Name)                                           \
+  FMA3_AVX2_RM(Name, SS);                                                      \
+  FMA3_AVX2_RM(Name, SD);                                                      \
+  FMA3_AVX2_RM_INT(Name, SS);                                                  \
+  FMA3_AVX2_RM_INT(Name, SD);
+
+// Vector groups first, then scalar groups.
+#define FMA3_AVX2_FULL_GROUP(Name)                                             \
+  FMA3_AVX2_VECTOR_GROUP(Name);                                                \
+  FMA3_AVX2_SCALAR_GROUP(Name);
+
+// Helper: register+memory groups for the six packed variants, always in the
+// order PSZ128, PDZ128, PSZ256, PDZ256, PSZ, PDZ. RSuf and MSuf are the
+// suffixes completing the register and memory opcode names (e.g. rk and mk);
+// Attrs is the attribute mask given to every group.
+#define FMA3_AVX512_VECTOR_RM(Name, RSuf, MSuf, Attrs)                         \
+  FMA3RMA(Name##132PSZ128##RSuf, Name##213PSZ128##RSuf, Name##231PSZ128##RSuf, \
+          Name##132PSZ128##MSuf, Name##213PSZ128##MSuf, Name##231PSZ128##MSuf, \
+          Attrs);                                                              \
+  FMA3RMA(Name##132PDZ128##RSuf, Name##213PDZ128##RSuf, Name##231PDZ128##RSuf, \
+          Name##132PDZ128##MSuf, Name##213PDZ128##MSuf, Name##231PDZ128##MSuf, \
+          Attrs);                                                              \
+  FMA3RMA(Name##132PSZ256##RSuf, Name##213PSZ256##RSuf, Name##231PSZ256##RSuf, \
+          Name##132PSZ256##MSuf, Name##213PSZ256##MSuf, Name##231PSZ256##MSuf, \
+          Attrs);                                                              \
+  FMA3RMA(Name##132PDZ256##RSuf, Name##213PDZ256##RSuf, Name##231PDZ256##RSuf, \
+          Name##132PDZ256##MSuf, Name##213PDZ256##MSuf, Name##231PDZ256##MSuf, \
+          Attrs);                                                              \
+  FMA3RMA(Name##132PSZ##RSuf, Name##213PSZ##RSuf, Name##231PSZ##RSuf,          \
+          Name##132PSZ##MSuf, Name##213PSZ##MSuf, Name##231PSZ##MSuf,          \
+          Attrs);                                                              \
+  FMA3RMA(Name##132PDZ##RSuf, Name##213PDZ##RSuf, Name##231PDZ##RSuf,          \
+          Name##132PDZ##MSuf, Name##213PDZ##MSuf, Name##231PDZ##MSuf,          \
+          Attrs);
+
+// Helper: register-only groups for the PSZ and PDZ variants (in that order)
+// whose opcode names end in RSuf.
+#define FMA3_AVX512_VECTOR_R(Name, RSuf, Attrs)                                \
+  FMA3RA(Name##132PSZ##RSuf, Name##213PSZ##RSuf, Name##231PSZ##RSuf, Attrs);   \
+  FMA3RA(Name##132PDZ##RSuf, Name##213PDZ##RSuf, Name##231PDZ##RSuf, Attrs);
+
+// Helper: memory-only groups for the six packed variants (same order as in
+// FMA3_AVX512_VECTOR_RM) whose opcode names end in MSuf.
+#define FMA3_AVX512_VECTOR_M(Name, MSuf, Attrs)                                \
+  FMA3MA(Name##132PSZ128##MSuf, Name##213PSZ128##MSuf, Name##231PSZ128##MSuf,  \
+         Attrs);                                                               \
+  FMA3MA(Name##132PDZ128##MSuf, Name##213PDZ128##MSuf, Name##231PDZ128##MSuf,  \
+         Attrs);                                                               \
+  FMA3MA(Name##132PSZ256##MSuf, Name##213PSZ256##MSuf, Name##231PSZ256##MSuf,  \
+         Attrs);                                                               \
+  FMA3MA(Name##132PDZ256##MSuf, Name##213PDZ256##MSuf, Name##231PDZ256##MSuf,  \
+         Attrs);                                                               \
+  FMA3MA(Name##132PSZ##MSuf, Name##213PSZ##MSuf, Name##231PSZ##MSuf, Attrs);   \
+  FMA3MA(Name##132PDZ##MSuf, Name##213PDZ##MSuf, Name##231PDZ##MSuf, Attrs);
+
+// All packed AVX-512 groups of Name. The r/m groups come first, then the
+// register-only rb groups, then the memory-only mb groups; within each kind
+// the unmasked groups (mask 0) precede the k (X86FMA3KMergeMasked) and the
+// kz (X86FMA3KZeroMasked) ones.
+#define FMA3_AVX512_VECTOR_GROUP(Name)                                         \
+  FMA3_AVX512_VECTOR_RM(Name, r, m, 0);                                        \
+  FMA3_AVX512_VECTOR_RM(Name, rk, mk,                                          \
+                        X86InstrFMA3Group::X86FMA3KMergeMasked);               \
+  FMA3_AVX512_VECTOR_RM(Name, rkz, mkz,                                        \
+                        X86InstrFMA3Group::X86FMA3KZeroMasked);                \
+  FMA3_AVX512_VECTOR_R(Name, rb, 0);                                           \
+  FMA3_AVX512_VECTOR_R(Name, rbk, X86InstrFMA3Group::X86FMA3KMergeMasked);     \
+  FMA3_AVX512_VECTOR_R(Name, rbkz, X86InstrFMA3Group::X86FMA3KZeroMasked);     \
+  FMA3_AVX512_VECTOR_M(Name, mb, 0);                                           \
+  FMA3_AVX512_VECTOR_M(Name, mbk, X86InstrFMA3Group::X86FMA3KMergeMasked);     \
+  FMA3_AVX512_VECTOR_M(Name, mbkz, X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_SCALAR_GROUP(Name) \
+ FMA3RM(Name##132SSZr, Name##213SSZr, Name##231SSZr, \
+ Name##132SSZm, Name##213SSZm, Name##231SSZm); \
+ FMA3RM(Name##132SDZr, Name##213SDZr, Name##231SDZr, \
+ Name##132SDZm, Name##213SDZm, Name##231SDZm); \
+ FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int, \
+ Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic); \
+ FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int, \
+ Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic); \
+ FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk, \
+ Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk, \
+ Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz, \
+ Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz, \
+ Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic); \
+ FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic); \
+ FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_FULL_GROUP(Name) \
+ FMA3_AVX512_VECTOR_GROUP(Name); \
+ FMA3_AVX512_SCALAR_GROUP(Name);
+
+void X86InstrFMA3Info::initGroupsOnceImpl() { // fills OpcodeToGroup via the FMA3_* macros above
+ FMA3_AVX2_FULL_GROUP(VFMADD); // FULL = vector groups + scalar groups
+ FMA3_AVX2_FULL_GROUP(VFMSUB);
+ FMA3_AVX2_FULL_GROUP(VFNMADD);
+ FMA3_AVX2_FULL_GROUP(VFNMSUB);
+
+ FMA3_AVX2_VECTOR_GROUP(VFMADDSUB); // vector groups only for ADDSUB/SUBADD
+ FMA3_AVX2_VECTOR_GROUP(VFMSUBADD);
+
+ FMA3_AVX512_FULL_GROUP(VFMADD); // same operations again for the AVX-512 opcodes
+ FMA3_AVX512_FULL_GROUP(VFMSUB);
+ FMA3_AVX512_FULL_GROUP(VFNMADD);
+ FMA3_AVX512_FULL_GROUP(VFNMSUB);
+
+ FMA3_AVX512_VECTOR_GROUP(VFMADDSUB); // vector groups only for ADDSUB/SUBADD
+ FMA3_AVX512_VECTOR_GROUP(VFMSUBADD);
+}
+
+/// Runs initGroupsOnceImpl() on the object returned by getX86InstrFMA3Info().
+/// The call is routed through llvm::call_once() with InitGroupsOnceFlag.
+void X86InstrFMA3Info::initGroupsOnce() {
+  auto PopulateGroups = []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); };
+  llvm::call_once(InitGroupsOnceFlag, PopulateGroups);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
new file mode 100644
index 000000000000..025cee3b2b90
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
@@ -0,0 +1,315 @@
+//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+
+#include "X86.h"
+#include "llvm/ADT/DenseMap.h"
+#include <cassert>
+#include <set>
+
+namespace llvm {
+/// Bundles the {132, 213, 231} forms of one FMA3 operation into a single group.
+/// A group refers to an array of 3 register opcodes, an array of 3 memory
+/// opcodes, or both (6 opcodes), and carries an attributes bitfield describing
+/// the opcodes in it.
+class X86InstrFMA3Group {
+private:
+  /// Points to the 3 register form opcodes of the group, or is nullptr when
+  /// the group has no register form opcodes.
+  const uint16_t *RegOpcodes;
+
+  /// Points to the 3 memory form opcodes of the group, or is nullptr when
+  /// the group has no memory form opcodes.
+  const uint16_t *MemOpcodes;
+
+  /// Bitwise OR of the X86FMA3* attribute bits declared below.
+  unsigned Attributes;
+
+  /// Position of each form inside RegOpcodes and MemOpcodes.
+  static const unsigned Form132 = 0;
+  static const unsigned Form213 = 1;
+  static const unsigned Form231 = 2;
+
+  /// Number of forms stored in each opcode array.
+  static const unsigned NumForms = 3;
+
+  /// Returns the position of \p Opcode in \p Opcodes, or NumForms when
+  /// \p Opcodes is nullptr or does not contain \p Opcode.
+  static unsigned findForm(const uint16_t *Opcodes, unsigned Opcode) {
+    if (!Opcodes)
+      return NumForms;
+    unsigned Form = 0;
+    while (Form != NumForms && Opcodes[Form] != Opcode)
+      ++Form;
+    return Form;
+  }
+
+  /// Returns the register opcode stored at position \p Form.
+  unsigned regOpcodeAt(unsigned Form) const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form];
+  }
+
+  /// Returns the memory opcode stored at position \p Form.
+  unsigned memOpcodeAt(unsigned Form) const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form];
+  }
+
+  /// Returns true iff at least one bit of \p Mask is set in 'Attributes'.
+  bool hasAnyAttribute(unsigned Mask) const {
+    return (Attributes & Mask) != 0;
+  }
+
+public:
+  /// Attribute bit: the group consists of FMA intrinsic opcodes.
+  static const unsigned X86FMA3Intrinsic = 0x1;
+
+  /// Attribute bit: the group consists of AVX512 opcodes accepting a k-mask
+  /// and passing the elements from the 1st operand to the result of the
+  /// operation when the corresponding bits in the k-mask are unset.
+  static const unsigned X86FMA3KMergeMasked = 0x2;
+
+  /// Attribute bit: the group consists of AVX512 opcodes accepting a
+  /// k-zeromask.
+  static const unsigned X86FMA3KZeroMasked = 0x4;
+
+  /// Constructor. Creates a new group of FMA opcodes from the three register
+  /// form opcodes \p RegOpcodes and the three memory form opcodes
+  /// \p MemOpcodes. Either pointer (but not both) may be nullptr, meaning
+  /// that the group has no opcodes of that kind. \p Attr holds the attributes
+  /// describing the created group.
+  X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes,
+                    unsigned Attr)
+      : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) {
+    assert((RegOpcodes || MemOpcodes) &&
+           "Cannot create a group not having any opcodes.");
+  }
+
+  /// Returns the memory form opcode that is the equivalent of the register
+  /// form opcode \p RegOpcode. 0 is returned if the group lacks either the
+  /// register or the memory opcodes, or if \p RegOpcode is not one of the
+  /// register opcodes of the group.
+  unsigned getMemOpcode(unsigned RegOpcode) const {
+    if (!RegOpcodes || !MemOpcodes)
+      return 0;
+    const unsigned Form = findForm(RegOpcodes, RegOpcode);
+    return Form == NumForms ? 0 : MemOpcodes[Form];
+  }
+
+  /// Returns the 132 form of FMA register opcode.
+  unsigned getReg132Opcode() const { return regOpcodeAt(Form132); }
+
+  /// Returns the 213 form of FMA register opcode.
+  unsigned getReg213Opcode() const { return regOpcodeAt(Form213); }
+
+  /// Returns the 231 form of FMA register opcode.
+  unsigned getReg231Opcode() const { return regOpcodeAt(Form231); }
+
+  /// Returns the 132 form of FMA memory opcode.
+  unsigned getMem132Opcode() const { return memOpcodeAt(Form132); }
+
+  /// Returns the 213 form of FMA memory opcode.
+  unsigned getMem213Opcode() const { return memOpcodeAt(Form213); }
+
+  /// Returns the 231 form of FMA memory opcode.
+  unsigned getMem231Opcode() const { return memOpcodeAt(Form231); }
+
+  /// Returns true iff the group of FMA opcodes holds intrinsic opcodes.
+  bool isIntrinsic() const { return hasAnyAttribute(X86FMA3Intrinsic); }
+
+  /// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes.
+  bool isKMergeMasked() const { return hasAnyAttribute(X86FMA3KMergeMasked); }
+
+  /// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes.
+  bool isKZeroMasked() const { return hasAnyAttribute(X86FMA3KZeroMasked); }
+
+  /// Returns true iff the group of FMA opcodes holds any of k-masked opcodes.
+  bool isKMasked() const {
+    return hasAnyAttribute(X86FMA3KMergeMasked | X86FMA3KZeroMasked);
+  }
+
+  /// Returns true iff the given \p Opcode is one of the register opcodes of
+  /// this group.
+  bool isRegOpcodeFromGroup(unsigned Opcode) const {
+    return findForm(RegOpcodes, Opcode) != NumForms;
+  }
+
+  /// Returns true iff the given \p Opcode is one of the memory opcodes of
+  /// this group.
+  bool isMemOpcodeFromGroup(unsigned Opcode) const {
+    return findForm(MemOpcodes, Opcode) != NumForms;
+  }
+};
+
+/// This class provides information about all existing FMA3 opcodes. It maps
+/// every known FMA3 opcode to the X86InstrFMA3Group holding it.
+class X86InstrFMA3Info {
+private:
+  /// A map that is used to find the group of FMA opcodes using any FMA opcode
+  /// from the group. All opcodes of one group refer to the same group object;
+  /// the destructor deletes these objects.
+  DenseMap<unsigned, const X86InstrFMA3Group *> OpcodeToGroup;
+
+  /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+  /// This method can be called many times, but the actual initialization is
+  /// called only once.
+  static void initGroupsOnce();
+
+  /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+  /// This method must be called ONLY from initGroupsOnce(). Otherwise, such
+  /// call is not thread safe.
+  void initGroupsOnceImpl();
+
+  /// Creates one group of FMA opcodes having the register opcodes
+  /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr
+  /// specifies the attributes describing the created group.
+  void initRMGroup(const uint16_t *RegOpcodes,
+                   const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+  /// Creates one group of FMA opcodes having only the register opcodes
+  /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing
+  /// the created group.
+  void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0);
+
+  /// Creates one group of FMA opcodes having only the memory opcodes
+  /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing
+  /// the created group.
+  void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+public:
+  /// Returns the reference to an object of this class. It is assumed that
+  /// only one object may exist.
+  static X86InstrFMA3Info *getX86InstrFMA3Info();
+
+  /// Constructor. Just creates an object of the class.
+  X86InstrFMA3Info() {}
+
+  /// Not copyable: the destructor deletes the groups referenced by
+  /// OpcodeToGroup, so a copy would delete the same groups a second time.
+  X86InstrFMA3Info(const X86InstrFMA3Info &) = delete;
+  X86InstrFMA3Info &operator=(const X86InstrFMA3Info &) = delete;
+
+  /// Destructor. Deallocates the memory used for FMA3 Groups.
+  ~X86InstrFMA3Info() {
+    // Several opcodes map to the same group, so remember which groups were
+    // already seen and delete each of them only once.
+    std::set<const X86InstrFMA3Group *> DeletedGroups;
+    for (const auto &Entry : OpcodeToGroup)
+      if (DeletedGroups.insert(Entry.second).second)
+        delete Entry.second;
+  }
+
+  /// Returns a reference to a group of FMA3 opcodes to where the given
+  /// \p Opcode is included. If the given \p Opcode is not recognized as FMA3
+  /// and not included into any FMA3 group, then nullptr is returned.
+  static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) {
+    // Ensure that the groups of opcodes are initialized.
+    initGroupsOnce();
+
+    // lookup() yields a default-constructed value, i.e. nullptr, for opcodes
+    // that are not in the map.
+    return getX86InstrFMA3Info()->OpcodeToGroup.lookup(Opcode);
+  }
+
+  /// Returns true iff the given \p Opcode is recognized as FMA3 by this class.
+  static bool isFMA3(unsigned Opcode) {
+    return getFMA3Group(Opcode) != nullptr;
+  }
+
+  /// Iterator that is used to walk on FMA register opcodes having memory
+  /// form equivalents.
+  class rm_iterator {
+  private:
+    /// Iterator associated with the OpcodeToGroup map. It must always be
+    /// initialized with an entry from OpcodeToGroup for which I->first
+    /// points to a register FMA opcode and I->second points to a group of
+    /// FMA opcodes having memory form equivalent of I->first.
+    DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I;
+
+  public:
+    /// Constructor. Creates rm_iterator. The parameter \p I must be an
+    /// iterator to OpcodeToGroup map entry having I->first pointing to
+    /// register form FMA opcode and I->second pointing to a group of FMA
+    /// opcodes holding memory form equivalent for I->first.
+    rm_iterator(DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I)
+        : I(I) {}
+
+    /// Returns the register form FMA opcode.
+    unsigned getRegOpcode() const { return I->first; }
+
+    /// Returns the memory form equivalent opcode for FMA register opcode
+    /// referenced by I->first.
+    unsigned getMemOpcode() const {
+      unsigned Opcode = I->first;
+      const X86InstrFMA3Group *Group = I->second;
+      return Group->getMemOpcode(Opcode);
+    }
+
+    /// Returns a reference to a group of FMA opcodes.
+    const X86InstrFMA3Group *getGroup() const { return I->second; }
+
+    bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; }
+    bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; }
+
+    /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry
+    /// having I->first pointing to register form FMA and I->second pointing
+    /// to a group of FMA opcodes holding memory form equivalent for I->first.
+    /// Stops at the end of the map if there is no such entry.
+    rm_iterator &operator++() {
+      auto E = getX86InstrFMA3Info()->OpcodeToGroup.end();
+      for (++I; I != E; ++I) {
+        unsigned RegOpcode = I->first;
+        const X86InstrFMA3Group *Group = I->second;
+        // getMemOpcode() returns 0 unless RegOpcode is a register opcode of a
+        // group that also has memory opcodes.
+        if (Group->getMemOpcode(RegOpcode) != 0)
+          break;
+      }
+      return *this;
+    }
+  };
+
+  /// Returns rm_iterator pointing to the first entry of OpcodeToGroup map
+  /// with a register FMA opcode having memory form opcode equivalent.
+  static rm_iterator rm_begin() {
+    initGroupsOnce();
+    const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
+    auto I = FMA3Info->OpcodeToGroup.begin();
+    auto E = FMA3Info->OpcodeToGroup.end();
+    // Skip the entries that are not register opcodes with a memory equivalent.
+    while (I != E && I->second->getMemOpcode(I->first) == 0)
+      ++I;
+    return rm_iterator(I);
+  }
+
+  /// Returns the last rm_iterator.
+  static rm_iterator rm_end() {
+    initGroupsOnce();
+    return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
+  }
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 000000000000..10f3839ea8ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,729 @@
+//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Type profiles for the x87 SelectionDAG nodes declared below. The first two
+// SDTypeProfile arguments are the result count and the operand count; the
+// constraint indices number the results first and then the operands.
+// Two results, no operands; both results are f80.
+def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
+ SDTCisVT<1, f80>]>;
+// One FP result; operands are a pointer and a value-type operand.
+def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+// No results; operands are an FP value, a pointer and a value-type operand.
+def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+// Same shape as SDTX86Fld: FP result, pointer operand, value-type operand.
+def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+// One i16 result computed from one i16 operand.
+def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+// No results; operands are an FP value and a pointer.
+def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+// No results; the single operand is a pointer.
+def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+// SelectionDAG nodes for the X86ISD x87 opcodes. Each def names the X86ISD
+// opcode, the type profile it must satisfy and, where present, its node
+// properties (chain, glue, load/store and memory-operand flags).
+def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
+ [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+ SDNPMemOperand]>;
+def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+// Same type profile as X86fild, but additionally produces glue.
+def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
+ SDNPMemOperand]>;
+// Declared without any node properties.
+def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
+// FP-to-integer stores; the three nodes share one type profile and differ
+// only in the X86ISD opcode (16-, 32- or 64-bit integer in memory).
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+// Marked as a store with side effects; selected to FNSTCW16m further down.
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect,
+ SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Pattern leaves that match one specific floating-point immediate each. The
+// predicate bodies are C++ fragments evaluated on the matched node N.
+// Matches an fpimm that is exactly +0.0.
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+// Matches an fpimm that is exactly -0.0.
+def fpimmneg0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+
+// Matches an fpimm that is exactly +1.0.
+def fpimm1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+1.0);
+}]>;
+
+// Matches an fpimm that is exactly -1.0.
+def fpimmneg1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+// One pseudo per (source register class, integer width) pair: RFP32, RFP64
+// or RFP80 stored to an i16, i32 or i64 memory operand. Each one matches the
+// X86fp_to_i<N>mem node of the same width and produces no register result.
+let usesCustomInserter = 1 in { // Expanded after instruction selection.
+ def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
+ [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
+ [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src),
+ [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+ def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src),
+ [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src),
+ [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src),
+ [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+ def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src),
+ [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src),
+ [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
+ [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+// All FP Stack operations are represented with four instructions here. The
+// first three instructions, generated by the instruction selector, use "RFP32"
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values. These sizes apply to the values,
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information. These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The fourth instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+// FpIf32: an FpI_ pseudo that additionally requires the FPStackf32 predicate.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+// FpIf64: an FpI_ pseudo that additionally requires the FPStackf64 predicate.
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Factoring for arithmetic.
+// FPBinary_rr - register/register pseudo forms of a binary FP operation.
+//   OpNode - the SelectionDAG operator matched by each pattern.
+// Defines NAME_Fp32, NAME_Fp64 and NAME_Fp80, one per pseudo register class;
+// all three use the TwoArgFP format.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+ [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+ [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+ [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+// FPBinary - register/memory forms of a binary FP operation.
+//   OpNode    - the SelectionDAG operator matched by the pseudo patterns.
+//   fp        - encoding Format of the real memory-operand instructions
+//               (_F32m, _F64m, _FI16m, _FI32m).
+//   asmstring - mnemonic stem; the real instructions print "f" or "fi",
+//               then this stem, then a size suffix.
+//   Forward   - when 1 the register is the left operand of OpNode and the
+//               value read from memory is the right one; when 0 the two
+//               operands are swapped in every pattern.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring,
+ bit Forward = 1> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m : FpIf32<(outs RFP32:$dst),
+ (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (loadf32 addr:$src2))),
+ (set RFP32:$dst,
+ (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>;
+def _Fp64m : FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (loadf64 addr:$src2))),
+ (set RFP64:$dst,
+ (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>;
+// The memory operand is narrower than the register; the pattern matches an
+// extending load.
+def _Fp64m32: FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))),
+ (set RFP64:$dst,
+ (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))),
+ (set RFP80:$dst,
+ (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))),
+ (set RFP80:$dst,
+ (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>;
+// Real instructions with an FP memory operand (opcode bytes 0xD8 and 0xDC).
+let mayLoad = 1 in
+def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
+ !strconcat("f", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
+def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
+ !strconcat("f", asmstring, "{l}\t$src")>;
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (X86fild addr:$src2, i16))),
+ (set RFP32:$dst,
+ (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (X86fild addr:$src2, i32))),
+ (set RFP32:$dst,
+ (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (X86fild addr:$src2, i16))),
+ (set RFP64:$dst,
+ (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (X86fild addr:$src2, i32))),
+ (set RFP64:$dst,
+ (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (X86fild addr:$src2, i16))),
+ (set RFP80:$dst,
+ (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (X86fild addr:$src2, i32))),
+ (set RFP80:$dst,
+ (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>;
+// Real instructions with an integer memory operand (opcode bytes 0xDE and
+// 0xDA).
+let mayLoad = 1 in
+def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
+ !strconcat("fi", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
+def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
+ !strconcat("fi", asmstring, "{l}\t$src")>;
+}
+
+// Instantiations of the arithmetic multiclasses; every record defined in
+// this scope lists FPSW in its Defs.
+let Defs = [FPSW] in {
+// FPBinary_rr just defines pseudo-instructions, no need to set scheduling
+// resources.
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+// Sets the scheduling resources for the actual NAME#_F<size>m definitions.
+let SchedRW = [WriteFAddLd] in {
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+// SUBR reuses fsub with Forward = 0, i.e. with swapped pattern operands.
+defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
+}
+let SchedRW = [WriteFMulLd] in {
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+}
+let SchedRW = [WriteFDivLd] in {
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+// DIVR reuses fdiv with Forward = 0, i.e. with swapped pattern operands.
+defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>;
+}
+}
+
+// Templates for the real register-operand arithmetic instructions. All three
+// take a single RST operand and differ only in the opcode byte.
+// Opcode byte 0xD8.
+class FPST0rInst<Format fp, string asm>
+ : FPI<0xD8, fp, (outs), (ins RST:$op), asm>;
+// Opcode byte 0xDC.
+class FPrST0Inst<Format fp, string asm>
+ : FPI<0xDC, fp, (outs), (ins RST:$op), asm>;
+// Opcode byte 0xDE; used below for the mnemonics carrying the 'p' suffix.
+class FPrST0PInst<Format fp, string asm>
+ : FPI<0xDE, fp, (outs), (ins RST:$op), asm>;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
+// we have to put some 'r's in and take them out of weird places.
+// In the asm strings below, braces select dialect-specific text: "{a|b}"
+// prints a for the first (AT&T) variant and b for the second (Intel) one, so
+// "{r}" adds the 'r' only in AT&T output and "{|r}" only in Intel output.
+let SchedRW = [WriteFAdd] in {
+def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t$op">;
+def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">;
+def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t$op">;
+def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t$op">;
+def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
+def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
+def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">;
+def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+} // SchedRW
+let SchedRW = [WriteFMul] in {
+def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">;
+def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">;
+def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t$op">;
+} // SchedRW
+let SchedRW = [WriteFDiv] in {
+def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t$op">;
+def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
+def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">;
+def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t$op">;
+def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
+} // SchedRW
+
+// Register compares "fcom" and "fcomp", built on the 0xD8 template; they are
+// defined outside any SchedRW scope.
+def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
+def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
+
+// Unary operations.
+// FPUnary - pseudo and real forms of a one-operand FP operation.
+//   OpNode    - the SelectionDAG operator matched by the pseudo patterns.
+//   fp        - encoding Format of the real instruction NAME_F.
+//   asmstring - complete assembly text of the real instruction.
+// The pseudos use the OneArgFPRW format; the real instruction uses opcode
+// byte 0xD9 and has no explicit operands.
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
+}
+
+// Unary operations and the "ftst" test; every record in this scope lists
+// FPSW in its Defs. Only SQRT is given a SchedRW here.
+let Defs = [FPSW] in {
+defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
+defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
+let SchedRW = [WriteFSqrt] in {
+defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
+}
+defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
+defm COS : FPUnary<fcos, MRM_FF, "fcos">;
+
+// The TST pseudos carry no selection pattern and produce no register result.
+let hasSideEffects = 0 in {
+def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
+} // Defs = [FPSW]
+
+// Versions of FP instructions that take a single memory operand. Added for the
+// disassembler; remove as they are included with patterns elsewhere.
+// None of the records below has a selection pattern; they are grouped by
+// opcode byte (0xD8, 0xD9, 0xDA, 0xDC, 0xDD, 0xDE, 0xDF).
+def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
+
+// The environment load/store records are declared with an f32mem operand.
+def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+
+def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+// Note that FRSTORm names its memory operand $dst, like the two stores that
+// follow it.
+def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">;
+def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">;
+def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
+
+def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
+def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
+
+// Floating point cmovs.
+// Like FpIf32 / FpIf64, but the predicate list also contains HasCMov.
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+
+// FPCMov - conditional-move pseudos for one condition code.
+//   cc - the condition-code leaf passed to X86cmov in every pattern.
+// Defines NAME_Fp32, NAME_Fp64 and NAME_Fp80 with the CondMovFP format; each
+// pattern selects between $src1 and $src2 based on cc and EFLAGS. The f80
+// form has no FPStack predicate and only requires HasCMov.
+multiclass FPCMov<PatLeaf cc> {
+ def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+ CondMovFP,
+ [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+ cc, EFLAGS))]>;
+ def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+ CondMovFP,
+ [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+ cc, EFLAGS))]>;
+ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+ CondMovFP,
+ [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+ cc, EFLAGS))]>,
+ Requires<[HasCMov]>;
+}
+
+let Defs = [FPSW] in {
+// Conditional-move pseudos, one FPCMov instantiation per condition code. All
+// of them read EFLAGS and tie $src1 to $dst. Note that the NB and NBE names
+// are instantiated with the X86_COND_AE and X86_COND_A leaves.
+let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
+defm CMOVB : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE : FPCMov<X86_COND_E>;
+defm CMOVP : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+} // Uses = [EFLAGS], Constraints = "$src1 = $dst"
+
+// Real fcmov instructions. The first four use opcode byte 0xDA and the
+// negated conditions use 0xDB, with matching MRM0r..MRM3r forms. The P / NP
+// records print the "fcmovu" / "fcmovnu" mnemonics.
+let Predicates = [HasCMov] in {
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op),
+ "fcmovb\t{$op, %st(0)|st(0), $op}">;
+def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op),
+ "fcmovbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op),
+ "fcmove\t{$op, %st(0)|st(0), $op}">;
+def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op),
+ "fcmovu\t{$op, %st(0)|st(0), $op}">;
+def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op),
+ "fcmovnb\t{$op, %st(0)|st(0), $op}">;
+def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op),
+ "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op),
+ "fcmovne\t{$op, %st(0)|st(0), $op}">;
+def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op),
+ "fcmovnu\t{$op, %st(0)|st(0), $op}">;
+} // Predicates = [HasCMov]
+
+// Floating point loads & stores.
+// Load pseudos. Naming: LD_Fp<mem>m[<reg>] loads an FP value of <mem> bits
+// into a register of <reg> bits (same width when <reg> is omitted);
+// ILD_Fp<mem>m<reg> loads an integer of <mem> bits through X86fild.
+// Only the three same-width loads are marked canFoldAsLoad, and of those
+// only LD_Fp64m is also marked isReMaterializable.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1 in
+ def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+// Extending FP loads.
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+// Integer loads; the last X86fild operand names the integer memory type.
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+// Store pseudos. ST_Fp<reg>m[<mem>] stores a register of <reg> bits to an FP
+// memory operand of <mem> bits; a narrower memory width is matched as a
+// truncating store.
+def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+ [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+ [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+ [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+// The ST_FpP* pseudos below have no selection pattern, so mayStore and
+// hasSideEffects are set explicitly.
+let mayStore = 1, hasSideEffects = 0 in {
+def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+// The only 80-bit store pseudo; unlike the other ST_FpP* records it carries
+// a store pattern.
+def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+ [(store RFP80:$src, addr:$op)]>;
+// Integer store pseudos IST_Fp<mem>m<reg>; none has a selection pattern.
+let mayStore = 1, hasSideEffects = 0 in {
+def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+// Real load instructions (fld / fild) with their encodings and itinerary
+// classes. There is no pattern here; mayLoad is set explicitly.
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
+ IIC_FLD>;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
+ IIC_FLD>;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
+ IIC_FLD80>;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
+ IIC_FILD>;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
+ IIC_FILD>;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
+ IIC_FILD>;
+}
+// Real store instructions (fst / fstp / fist / fistp). The 80-bit FP store
+// and the 64-bit integer store are defined only in their 'p'-suffixed form.
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
+ IIC_FST>;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
+ IIC_FST>;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
+ IIC_FST>;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
+ IIC_FST>;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
+ IIC_FST80>;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
+ IIC_FIST>;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
+ IIC_FIST>;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
+ IIC_FIST>;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
+ IIC_FIST>;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
+ IIC_FIST>;
+}
+
+// FISTTP requires SSE3 even though it's a FPStack op.
+// FISTTP pseudos, one per (integer width, source register class) pair. They
+// match the same X86fp_to_i<N>mem nodes as the FP*_TO_INT*_IN_MEM pseudos but
+// are only available under HasSSE3. All of them derive from FpI_ directly, so
+// no FPStackf32 / FPStackf64 predicate is attached.
+let Predicates = [HasSSE3] in {
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
+} // Predicates = [HasSSE3]
+
+// Real fisttp instructions. These sit outside the HasSSE3 scope above and
+// are tagged with the IIC_FST itinerary class.
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
+ IIC_FST>;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
+ IIC_FST>;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
+ "fisttp{ll}\t$dst", IIC_FST>;
+}
+
+// FP Stack manipulation instructions.
+// Real register-to-register stack instructions (fld, fst, fstp, fxch); each
+// takes a single RST operand and has no selection pattern.
+let SchedRW = [WriteMove] in {
+def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>;
+def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>;
+def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>;
+def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>;
+}
+
+// Floating point constant loads.
+// Constant-load pseudos. LD_Fp0<N> matches the fpimm0 leaf and LD_Fp1<N> the
+// fpimm1 leaf, for N = 32, 64 and 80. All are rematerializable.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm1)]>;
+}
+
+// Real "fldz" / "fld1" instructions.
+// NOTE(review): LD_F1 is tagged with IIC_FIST whereas LD_F0 uses IIC_FLDZ;
+// confirm that this itinerary class is intended for fld1.
+let SchedRW = [WriteZero] in {
+def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>;
+def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>;
+}
+
+// Floating point compares.
+// Compare pseudos whose result is written to FPSW: each pattern truncates
+// the X86cmp result into FPSW.
+let SchedRW = [WriteFAdd] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // SchedRW
+} // Defs = [FPSW]
+
+// Compares that write EFLAGS, followed by the real fucom* / fcomi*
+// instructions. Everything in this scope uses the WriteFAdd SchedRW.
+let SchedRW = [WriteFAdd] in {
+// CC = ST(0) cmp ST(i)
+// Pseudos: the X86cmp result is set directly into EFLAGS.
+let Defs = [EFLAGS, FPSW] in {
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+}
+
+// Real instructions that define only FPSW and implicitly use ST0.
+let Defs = [FPSW], Uses = [ST0] in {
+def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>;
+def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>;
+def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop
+ (outs), (ins), "fucompp", IIC_FUCOM>;
+}
+
+// Real instructions that define EFLAGS as well and implicitly use ST0.
+let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
+def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>;
+def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>;
+}
+
+// fcomi / fcompi; unlike the group above, no Uses list is given here.
+let Defs = [EFLAGS, FPSW] in {
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg),
+ "fcomi\t$reg", IIC_FCOMI>;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg),
+ "fcompi\t$reg", IIC_FCOMI>;
+}
+} // SchedRW
+
+// Floating point flag ops.
+// Status-word and control-word instructions. FNSTSW16r and FNSTCW16m carry
+// selection patterns; FLDCW16m has none and is marked mayLoad explicitly.
+let SchedRW = [WriteALU] in {
+let Defs = [AX], Uses = [FPSW] in
+def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
+ (outs), (ins), "fnstsw\t{%ax|ax}",
+ [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>;
+
+def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
+ (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+ [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
+} // SchedRW
+let mayLoad = 1 in
+def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+ Sched<[WriteLoad]>;
+
+// FPU control instructions
+// FPU control instructions without selection patterns. Each
+// "let Defs = [FPSW] in" below has no braces and therefore applies only to
+// the single def that follows it; FFREE does not list FPSW.
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW] in
+def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
+ "ffree\t$reg", IIC_FFREE>;
+// Clear exceptions
+
+let Defs = [FPSW] in
+def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>;
+} // SchedRW
+
+// Operandless floating-point instructions for the disassembler.
+// Operandless instructions with empty pattern lists, followed by the
+// FXSAVE / FXRSTOR family. Everything in this scope uses WriteMicrocoded.
+let SchedRW = [WriteMicrocoded] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
+
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>;
+def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>;
+def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>;
+def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>;
+def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>;
+def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>;
+def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>;
+def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>;
+def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>;
+def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>;
+def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>;
+def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>;
+def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>;
+def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>;
+def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>;
+def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>;
+def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>;
+def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>;
+def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
+
+// State save/restore on a 512-bit opaque memory operand, selected from the
+// int_x86_fx* intrinsics. The *64 forms use RI and also require
+// In64BitMode.
+let Predicates = [HasFXSR] in {
+ def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
+ def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+ "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
+ IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
+ def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB;
+ def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
+ IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
+} // Predicates = [HasFXSR]
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+// The last X86fld operand selects the load pseudo of the matching width.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALL which return f32 / f64 / f80 values.
+// The store pseudo is picked from the register class of $src together with
+// the memory type named by the last X86fst operand.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
+ RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
+ RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
+ RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+ RFP80:$src)>;
+
+// Floating point constant -0.0 and -1.0
+// Built by applying CHS (sign change) to the +0.0 / +1.0 constant loads. The
+// f80 patterns carry no FPStack predicate.
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to convert i64 to f64 since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+ Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack. We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+ Requires<[FPStackf64]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 000000000000..610756aa37da
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,957 @@
+//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+// Each Format record wraps a 7-bit form number (0-127). X86Inst copies it into
+// FormBits and from there into TSFlags{6-0}.
+class Format<bits<7> val> {
+ bits<7> Value = val;
+}
+
+// Values 0-8: Pseudo, the Raw* forms and AddRegFrm.
+def Pseudo : Format<0>;
+def RawFrm : Format<1>;
+def AddRegFrm : Format<2>;
+def RawFrmMemOffs : Format<3>;
+def RawFrmSrc : Format<4>;
+def RawFrmDst : Format<5>;
+def RawFrmDstSrc : Format<6>;
+def RawFrmImm8 : Format<7>;
+def RawFrmImm16 : Format<8>;
+// Values 32-47: the *Mem* / MRM*m forms. Each one has a *Reg* / MRM*r
+// counterpart at value + 16 (48-63) further down.
+def MRMDestMem : Format<32>;
+def MRMSrcMem : Format<33>;
+def MRMSrcMem4VOp3 : Format<34>;
+def MRMSrcMemOp4 : Format<35>;
+def MRMXm : Format<39>;
+def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>;
+def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>;
+def MRM6m : Format<46>; def MRM7m : Format<47>;
+def MRMDestReg : Format<48>;
+def MRMSrcReg : Format<49>;
+def MRMSrcReg4VOp3 : Format<50>;
+def MRMSrcRegOp4 : Format<51>;
+def MRMXr : Format<55>;
+def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>;
+def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>;
+def MRM6r : Format<62>; def MRM7r : Format<63>;
+// MRM_XX forms: the value equals the hex suffix XX minus 0x80
+// (MRM_C0 = 64 ... MRM_FF = 127).
+def MRM_C0 : Format<64>; def MRM_C1 : Format<65>; def MRM_C2 : Format<66>;
+def MRM_C3 : Format<67>; def MRM_C4 : Format<68>; def MRM_C5 : Format<69>;
+def MRM_C6 : Format<70>; def MRM_C7 : Format<71>; def MRM_C8 : Format<72>;
+def MRM_C9 : Format<73>; def MRM_CA : Format<74>; def MRM_CB : Format<75>;
+def MRM_CC : Format<76>; def MRM_CD : Format<77>; def MRM_CE : Format<78>;
+def MRM_CF : Format<79>; def MRM_D0 : Format<80>; def MRM_D1 : Format<81>;
+def MRM_D2 : Format<82>; def MRM_D3 : Format<83>; def MRM_D4 : Format<84>;
+def MRM_D5 : Format<85>; def MRM_D6 : Format<86>; def MRM_D7 : Format<87>;
+def MRM_D8 : Format<88>; def MRM_D9 : Format<89>; def MRM_DA : Format<90>;
+def MRM_DB : Format<91>; def MRM_DC : Format<92>; def MRM_DD : Format<93>;
+def MRM_DE : Format<94>; def MRM_DF : Format<95>; def MRM_E0 : Format<96>;
+def MRM_E1 : Format<97>; def MRM_E2 : Format<98>; def MRM_E3 : Format<99>;
+def MRM_E4 : Format<100>; def MRM_E5 : Format<101>; def MRM_E6 : Format<102>;
+def MRM_E7 : Format<103>; def MRM_E8 : Format<104>; def MRM_E9 : Format<105>;
+def MRM_EA : Format<106>; def MRM_EB : Format<107>; def MRM_EC : Format<108>;
+def MRM_ED : Format<109>; def MRM_EE : Format<110>; def MRM_EF : Format<111>;
+def MRM_F0 : Format<112>; def MRM_F1 : Format<113>; def MRM_F2 : Format<114>;
+def MRM_F3 : Format<115>; def MRM_F4 : Format<116>; def MRM_F5 : Format<117>;
+def MRM_F6 : Format<118>; def MRM_F7 : Format<119>; def MRM_F8 : Format<120>;
+def MRM_F9 : Format<121>; def MRM_FA : Format<122>; def MRM_FB : Format<123>;
+def MRM_FC : Format<124>; def MRM_FD : Format<125>; def MRM_FE : Format<126>;
+def MRM_FF : Format<127>;
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+// The 4-bit Value is what X86Inst stores in TSFlags{21-18}.
+class ImmType<bits<4> val> {
+ bits<4> Value = val;
+}
+def NoImm : ImmType<0>;
+def Imm8 : ImmType<1>;
+def Imm8PCRel : ImmType<2>;
+def Imm8Reg : ImmType<3>; // Register encoded in [7:4].
+def Imm16 : ImmType<4>;
+def Imm16PCRel : ImmType<5>;
+def Imm32 : ImmType<6>;
+def Imm32PCRel : ImmType<7>;
+def Imm32S : ImmType<8>;
+def Imm64 : ImmType<9>;
+
+// FPFormat - This specifies what form this FP instruction has. This is used by
+// the Floating-Point stackifier pass.
+// The 3-bit Value is what X86Inst stores in TSFlags{24-22}; all eight values
+// (0-7) are assigned below.
+class FPFormat<bits<3> val> {
+ bits<3> Value = val;
+}
+def NotFP : FPFormat<0>;
+def ZeroArgFP : FPFormat<1>;
+def OneArgFP : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP : FPFormat<4>;
+def CompareFP : FPFormat<5>;
+def CondMovFP : FPFormat<6>;
+def SpecialFP : FPFormat<7>;
+
+// Class specifying the SSE execution domain, used by the SSEDomainFix pass.
+// Keep in sync with tables in X86InstrInfo.cpp.
+// The 2-bit Value is what X86Inst stores in TSFlags{28-27} (via ExeDomain).
+class Domain<bits<2> val> {
+ bits<2> Value = val;
+}
+def GenericDomain : Domain<0>;
+def SSEPackedSingle : Domain<1>;
+def SSEPackedDouble : Domain<2>;
+def SSEPackedInt : Domain<3>;
+
+// Class specifying the vector form of the decompressed
+// displacement of 8-bit.
+// Bit 2 of Value separates the tuple forms (4-7) from the vector-fraction
+// forms (0-3); X86Inst's CD8_Scale computation tests CD8_Form{2} to pick
+// between the two cases and uses CD8_Form{1-0} as a shift amount.
+class CD8VForm<bits<3> val> {
+ bits<3> Value = val;
+}
+def CD8VF : CD8VForm<0>; // v := VL
+def CD8VH : CD8VForm<1>; // v := VL/2
+def CD8VQ : CD8VForm<2>; // v := VL/4
+def CD8VO : CD8VForm<3>; // v := VL/8
+// The tuple (subvector) forms.
+def CD8VT1 : CD8VForm<4>; // v := 1
+def CD8VT2 : CD8VForm<5>; // v := 2
+def CD8VT4 : CD8VForm<6>; // v := 4
+def CD8VT8 : CD8VForm<7>; // v := 8
+
+// Class specifying the prefix used as an opcode extension.
+// The 3-bit Value reaches TSFlags{13-11} through X86Inst's OpPrefixBits.
+class Prefix<bits<3> val> {
+ bits<3> Value = val;
+}
+def NoPrfx : Prefix<0>;
+def PS : Prefix<1>;
+def PD : Prefix<2>;
+def XS : Prefix<3>;
+def XD : Prefix<4>;
+
+// Class specifying the opcode map.
+// The 3-bit Value reaches TSFlags{16-14} through X86Inst's OpMapBits.
+class Map<bits<3> val> {
+ bits<3> Value = val;
+}
+def OB : Map<0>;
+def TB : Map<1>;
+def T8 : Map<2>;
+def TA : Map<3>;
+def XOP8 : Map<4>;
+def XOP9 : Map<5>;
+def XOPA : Map<6>;
+
+// Class specifying the encoding
+// The 2-bit Value reaches TSFlags{30-29} through X86Inst's OpEncBits; all four
+// possible values are assigned below.
+class Encoding<bits<2> val> {
+ bits<2> Value = val;
+}
+def EncNormal : Encoding<0>;
+def EncVEX : Encoding<1>;
+def EncXOP : Encoding<2>;
+def EncEVEX : Encoding<3>;
+
+// Operand size for encodings that change based on mode.
+// The 2-bit Value reaches TSFlags{8-7} through X86Inst's OpSizeBits. Only
+// values 0-2 are assigned; 3 is unused here.
+class OperandSize<bits<2> val> {
+ bits<2> Value = val;
+}
+def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
+def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
+def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
+
+// Address size for encodings that change based on mode.
+// The 2-bit Value reaches TSFlags{10-9} through X86Inst's AdSizeBits.
+class AddressSize<bits<2> val> {
+ bits<2> Value = val;
+}
+def AdSizeX : AddressSize<0>; // Address size determined using addr operand.
+def AdSize16 : AddressSize<1>; // Encodes a 16-bit address.
+def AdSize32 : AddressSize<2>; // Encodes a 32-bit address.
+def AdSize64 : AddressSize<3>; // Encodes a 64-bit address.
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+// Each class below sets one or two fields that share their names with fields
+// declared in X86Inst.
+// Operand-size, address-size and REX.W/LOCK/REP selectors.
+class OpSize16 { OperandSize OpSize = OpSize16; }
+class OpSize32 { OperandSize OpSize = OpSize32; }
+class AdSize16 { AddressSize AdSize = AdSize16; }
+class AdSize32 { AddressSize AdSize = AdSize32; }
+class AdSize64 { AddressSize AdSize = AdSize64; }
+class REX_W { bit hasREX_WPrefix = 1; }
+class LOCK { bit hasLockPrefix = 1; }
+class REP { bit hasREPPrefix = 1; }
+// Opcode-map selectors. The XOP8/XOP9/XOPA variants also set OpPrefix to PS.
+class TB { Map OpMap = TB; }
+class T8 { Map OpMap = T8; }
+class TA { Map OpMap = TA; }
+class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; }
+class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; }
+class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; }
+// Prefix selectors. OBXS sets only the XS prefix and does not touch OpMap;
+// PS/PD/XD/XS derive from TB, and the T8*/TA* variants derive from T8/TA, so
+// they select a map and a prefix together.
+class OBXS { Prefix OpPrefix = XS; }
+class PS : TB { Prefix OpPrefix = PS; }
+class PD : TB { Prefix OpPrefix = PD; }
+class XD : TB { Prefix OpPrefix = XD; }
+class XS : TB { Prefix OpPrefix = XS; }
+class T8PS : T8 { Prefix OpPrefix = PS; }
+class T8PD : T8 { Prefix OpPrefix = PD; }
+class T8XD : T8 { Prefix OpPrefix = XD; }
+class T8XS : T8 { Prefix OpPrefix = XS; }
+class TAPS : TA { Prefix OpPrefix = PS; }
+class TAPD : TA { Prefix OpPrefix = PD; }
+class TAXD : TA { Prefix OpPrefix = XD; }
+// VEX/EVEX encoding selectors and flag bits. EVEX and EVEX_4V derive from the
+// VEX classes but declare OpEnc again as EncEVEX.
+class VEX { Encoding OpEnc = EncVEX; }
+class VEX_W { bit hasVEX_WPrefix = 1; }
+class VEX_4V : VEX { bit hasVEX_4V = 1; }
+class VEX_L { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
+class EVEX : VEX { Encoding OpEnc = EncEVEX; }
+class EVEX_4V : VEX_4V { Encoding OpEnc = EncEVEX; }
+class EVEX_K { bit hasEVEX_K = 1; }
+class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
+class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_RC { bit hasEVEX_RC = 1; }
+// Vector-length selectors, as (hasEVEX_L2, hasVEX_L) pairs:
+// V512 = (1, 0), V256 = (0, 1), V128 = (0, 0).
+class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
+class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
+class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; }
+
+// Specify AVX512 8-bit compressed displacement encoding based on the vector
+// element size in bits (8, 16, 32, 64) and the CDisp8 form.
+class EVEX_CD8<int esize, CD8VForm form> {
+ // esize is given in bits; shifting right by 3 stores the size in bytes.
+ int CD8_EltSize = !srl(esize, 3);
+ // Only the 3-bit numeric value of the form record is kept.
+ bits<3> CD8_Form = form.Value;
+}
+
+// Has3DNow0F0FOpcode sets the has3DNow0F0FOpcode flag. XOP selects the EncXOP
+// encoding, and XOP_4V additionally sets hasVEX_4V.
+class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
+class XOP { Encoding OpEnc = EncXOP; }
+class XOP_4V : XOP { bit hasVEX_4V = 1; }
+
+// X86Inst - Base class for X86 instruction records. It takes the opcode byte,
+// the encoding Format, the immediate type, the operand lists, the asm string,
+// the itinerary and (optionally) the SSE execution domain, and packs the
+// encoding-related attributes into TSFlags bits 0-54 (layout at the end of
+// the class).
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+ string AsmStr,
+ InstrItinClass itin,
+ Domain d = GenericDomain>
+ : Instruction {
+ let Namespace = "X86";
+
+ bits<8> Opcode = opcod;
+ Format Form = f;
+ bits<7> FormBits = Form.Value;
+ ImmType ImmT = i;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ string AsmString = AsmStr;
+
+ // If this is a pseudo instruction, mark it isCodeGenOnly.
+ // The test compares the name of the Format record against "Pseudo".
+ let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+ let Itinerary = itin;
+
+ //
+ // Attributes specific to X86 instructions...
+ //
+ bit ForceDisassemble = 0; // Force instruction to disassemble even though it's
+ // isCodeGenOnly. Needed to hide an ambiguous
+ // AsmString from the parser, but still disassemble.
+
+ OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change
+ // based on operand size of the mode?
+ bits<2> OpSizeBits = OpSize.Value;
+ AddressSize AdSize = AdSizeX; // Does this instruction's encoding change
+ // based on address size of the mode?
+ bits<2> AdSizeBits = AdSize.Value;
+
+ Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have?
+ bits<3> OpPrefixBits = OpPrefix.Value;
+ Map OpMap = OB; // Which opcode map does this inst have?
+ bits<3> OpMapBits = OpMap.Value;
+ bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix?
+ FPFormat FPForm = NotFP; // What flavor of FP instruction is this?
+ bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
+ Domain ExeDomain = d;
+ bit hasREPPrefix = 0; // Does this inst have a REP prefix?
+ Encoding OpEnc = EncNormal; // Encoding used by this instruction
+ bits<2> OpEncBits = OpEnc.Value;
+ bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field?
+ bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
+ bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
+ bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit?
+ bit hasEVEX_K = 0; // Does this inst require masking?
+ bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field?
+ bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field?
+ bit hasEVEX_B = 0; // Does this inst set the EVEX_B field?
+ bits<3> CD8_Form = 0; // Compressed disp8 form - vector-width.
+ // Declare it int rather than bits<4> so that all bits are defined when
+ // assigning to bits<7>.
+ int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes.
+ bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
+ bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction.
+
+ // EVEX_LL is the 2-bit pair {hasEVEX_L2, hasVEX_L}.
+ bits<2> EVEX_LL;
+ let EVEX_LL{0} = hasVEX_L;
+ let EVEX_LL{1} = hasEVEX_L2;
+ // Vector size in bytes.
+ // 16 << EVEX_LL: 16, 32 or 64 for EVEX_LL values 0, 1 and 2.
+ bits<7> VectSize = !shl(16, EVEX_LL);
+
+ // The scaling factor for AVX512's compressed displacement is either
+ // - the size of a power-of-two number of elements or
+ // - the size of a single element for broadcasts or
+ // - the total vector size divided by a power-of-two number.
+ // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64.
+ bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value),
+ !if (CD8_Form{2},
+ !shl(CD8_EltSize, CD8_Form{1-0}),
+ !if (hasEVEX_B,
+ CD8_EltSize,
+ !srl(VectSize, CD8_Form{1-0}))), 0);
+
+ // TSFlags layout should be kept in sync with X86BaseInfo.h.
+ // Note that ForceDisassemble and ignoresVEX_L are not placed in TSFlags.
+ let TSFlags{6-0} = FormBits;
+ let TSFlags{8-7} = OpSizeBits;
+ let TSFlags{10-9} = AdSizeBits;
+ let TSFlags{13-11} = OpPrefixBits;
+ let TSFlags{16-14} = OpMapBits;
+ let TSFlags{17} = hasREX_WPrefix;
+ let TSFlags{21-18} = ImmT.Value;
+ let TSFlags{24-22} = FPForm.Value;
+ let TSFlags{25} = hasLockPrefix;
+ let TSFlags{26} = hasREPPrefix;
+ let TSFlags{28-27} = ExeDomain.Value;
+ let TSFlags{30-29} = OpEncBits;
+ let TSFlags{38-31} = Opcode;
+ let TSFlags{39} = hasVEX_WPrefix;
+ let TSFlags{40} = hasVEX_4V;
+ let TSFlags{41} = hasVEX_L;
+ let TSFlags{42} = hasEVEX_K;
+ let TSFlags{43} = hasEVEX_Z;
+ let TSFlags{44} = hasEVEX_L2;
+ let TSFlags{45} = hasEVEX_B;
+ // If we run out of TSFlags bits, it's possible to encode this in 3 bits.
+ let TSFlags{52-46} = CD8_Scale;
+ let TSFlags{53} = has3DNow0F0FOpcode;
+ let TSFlags{54} = hasEVEX_RC;
+}
+
+// PseudoI - Pseudo instruction template: opcode 0, Pseudo format, no
+// immediate, empty asm string and no itinerary; only the operand lists and
+// the selection pattern are supplied by the user.
+class PseudoI<dag oops, dag iops, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> {
+ let Pattern = pattern;
+}
+
+// Basic instruction templates. Each one forwards its arguments to X86Inst
+// with a fixed ImmType, installs the selection pattern and sets CodeSize to 3.
+// Only I, Ii8 and Ii8Reg accept an execution Domain argument; the others
+// leave X86Inst's default (GenericDomain) in place.
+
+// I - No immediate (NoImm).
+class I<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+// Ii8 - ImmType Imm8.
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+// Ii8Reg - ImmType Imm8Reg.
+class Ii8Reg<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8Reg, outs, ins, asm, itin, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+// Ii8PCRel - ImmType Imm8PCRel.
+class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+// Ii16 - ImmType Imm16.
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+// Ii32 - ImmType Imm32.
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+// Ii32S - ImmType Imm32S.
+class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32S, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// Ii16PCRel - ImmType Imm16PCRel.
+class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// Ii32PCRel - ImmType Imm32PCRel.
+class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+// FPI takes no pattern argument: it always passes an empty pattern list to I.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, [], itin> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+// Built directly on X86Inst with opcode 0, Pseudo format, no immediate and an
+// empty asm string; the FPFormat argument is stored in FPForm.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
+ let FPForm = fp;
+ let Pattern = pattern;
+}
+
+// Templates for instructions that use a 16- or 32-bit segmented address as
+// their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+// Iseg16 - 16-bit segment selector, 16-bit offset
+// Iseg32 - 16-bit segment selector, 32-bit offset
+
+// Iseg16 uses ImmType Imm16 (matching its 16-bit offset).
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// Iseg32 uses ImmType Imm32 (matching its 32-bit offset).
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// SI - SSE 1 & 2 scalar instructions
+// Predicates are derived from the record's OpEnc and OpPrefix fields:
+// EVEX -> HasAVX512, VEX -> UseAVX, otherwise XS -> UseSSE1,
+// XD or PD -> UseSSE2, and any other prefix -> UseSSE1.
+class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])))));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ // (applied for both the EVEX and the VEX encodings).
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// SI_Int - SSE 1 & 2 scalar intrinsics - vex form available on AVX512
+// Same predicate selection as SI except that the VEX encoding maps to HasAVX
+// instead of UseAVX.
+class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])))));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+// SIi8 - SSE 1 & 2 scalar instructions - vex form available on AVX512
+// Built on Ii8 without a Domain argument. Predicates: EVEX -> HasAVX512,
+// VEX -> HasAVX, otherwise XS -> UseSSE1 and any other prefix -> UseSSE2.
+class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ [UseSSE2])));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// PI - SSE 1 & 2 packed instructions
+// itin and d have no defaults here and must be supplied. Predicates:
+// EVEX -> HasAVX512, VEX -> HasAVX, otherwise PD -> UseSSE2 and any other
+// prefix -> UseSSE1.
+class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin, Domain d>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+// Only the prefix is examined: PD -> HasSSE2, anything else -> HasSSE1. The
+// asm string is passed through unchanged (no 'v' prefix handling).
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin, Domain d>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasSSE2],
+ [HasSSE1]);
+}
+
+// PIi8 - SSE 1 & 2 packed instructions with immediate
+// Same predicate and mnemonic handling as PI, but built on Ii8 (ImmType Imm8).
+class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// PSI - SSE1 instructions with PS prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix.
+// VSSI - SSE1 instructions with XS prefix in AVX form.
+// VPSI - SSE1 instructions with PS prefix in AVX form, packed single.
+
+// SSI/SSIi8 use the default execution domain; the PS-prefixed templates pass
+// SSEPackedSingle. The V* forms prepend "v" to the mnemonic and require HasAVX.
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+// SSIi8 - SSE1 instructions with ImmT == Imm8 and XS prefix.
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ Requires<[UseSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ Requires<[UseSSE1]>;
+class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ Requires<[HasAVX]>;
+class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, PS,
+ Requires<[HasAVX]>;
+
+// SSE2 Instruction Templates:
+//
+// SDI - SSE2 instructions with XD prefix.
+// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// S2SI - SSE2 instructions with XS prefix.
+// S2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+// PDI - SSE2 instructions with PD prefix, packed double domain.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and PD prefix.
+// VSDI - SSE2 scalar instructions with XD prefix in AVX form.
+// VPDI - SSE2 vector instructions with PD prefix in AVX form,
+// packed double domain.
+// VS2I - SSE2 scalar instructions with PD prefix in AVX form.
+// S2I - SSE2 scalar instructions with PD prefix.
+// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+// MMX operands.
+// MMXS2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+// MMX operands.
+
+// SDI/SDIi8 select the XD prefix and S2SI the XS prefix; all three require
+// UseSSE2 and use the default execution domain.
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+// S2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix. The itinerary
+// argument is forwarded to Ii8, as in the sibling SDIi8 template.
+class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+// PDI/PDIi8/VPDI pass SSEPackedDouble as the execution domain; the remaining
+// templates in this group use the default domain. The V* forms prepend "v"
+// to the mnemonic.
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ Requires<[UseSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ Requires<[UseSSE2]>;
+class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
+ Requires<[UseAVX]>;
+// VS2SI - SSE2 instructions with XS prefix in AVX form (requires HasAVX).
+class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ Requires<[HasAVX]>;
+class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>,
+ PD, Requires<[HasAVX]>;
+class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, PD,
+ Requires<[UseAVX]>;
+class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[UseSSE2]>;
+// The MMX variant requires HasSSE2 rather than UseSSE2.
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+// MMXS2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+// MMX operands. The itinerary argument is forwarded to Ii8, as in the sibling
+// MMXSDIi8 template.
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+//
+// S3I - SSE3 instructions with PD prefixes.
+// S3SI - SSE3 instructions with XS prefix.
+// S3DI - SSE3 instructions with XD prefix.
+
+// All three require UseSSE3. S3SI uses the SSEPackedSingle domain; S3DI and
+// S3I use SSEPackedDouble.
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
+ Requires<[UseSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
+ Requires<[UseSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ Requires<[UseSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+//
+// SS38I - SSSE3 instructions with T8 prefix.
+// SS3AI - SSSE3 instructions with TA prefix.
+// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
+// uses the MMX registers. The 64-bit versions are grouped with the MMX
+// classes. They need to be enabled even if AVX is enabled.
+
+// SS38I/SS3AI combine the T8/TA map with the PD prefix and require UseSSSE3;
+// the MMX variants combine the same maps with the PS prefix and require
+// HasSSSE3. All four use the SSEPackedInt domain; the *3AI forms are built on
+// Ii8 (ImmType Imm8).
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[UseSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PS,
+ Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPS,
+ Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+//
+// SS48I - SSE 4.1 instructions with T8 prefix.
+// SS4AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+// Both use the PD prefix (T8PD/TAPD), the SSEPackedInt domain and UseSSE41.
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[UseSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[UseSSE41]>;
+
+// SSE4.2 Instruction Templates:
+//
+// SS428I - SSE 4.2 instructions with T8 prefix.
+// (T8PD, SSEPackedInt domain, requires UseSSE42.)
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[UseSSE42]>;
+
+// SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
+// Uses the default execution domain.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+
+// SS42AI = SSE 4.2 instructions with TA prefix
+// (TAPD, ImmType Imm8 via Ii8, SSEPackedInt domain, requires UseSSE42.)
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[UseSSE42]>;
+
+// AVX Instruction Templates:
+// Instructions introduced in AVX (no SSE equivalent forms)
+//
+// AVX8I - AVX instructions with T8PD prefix.
+// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8.
+// Both require HasAVX and use the SSEPackedInt domain. Unlike the VSSI/VPSI
+// style templates, the asm string is passed through without adding a "v".
+class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[HasAVX]>;
+class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasAVX]>;
+
+// AVX2 Instruction Templates:
+// Instructions introduced in AVX2 (no SSE equivalent forms)
+//
+// AVX28I - AVX2 instructions with T8PD prefix.
+// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8.
+// Same shape as AVX8I/AVXAIi8, but predicated on HasAVX2.
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[HasAVX2]>;
+class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasAVX2]>;
+
+
+// AVX-512 Instruction Templates:
+// Instructions introduced in AVX-512 (no SSE equivalent forms)
+//
+// AVX5128I - AVX-512 instructions with T8PD prefix.
+// AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8.
+// AVX512PDI - AVX-512 instructions with PD, double packed.
+// AVX512PSI - AVX-512 instructions with PS, single packed.
+// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
+// AVX512XSI - AVX-512 instructions with XS prefix, generic domain.
+// AVX512BI - AVX-512 instructions with PD, int packed domain.
+// AVX512SI - AVX-512 scalar instructions with PD prefix.
+
+// Templates taking <o, F, outs, ins, asm, pattern, ...> define complete
+// instructions predicated on HasAVX512. The parameterless *Base classes only
+// set prefix/map, ExeDomain and/or ImmT fields and carry no predicate.
+class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[HasAVX512]>;
+// AVX5128IBase - T8PD prefix, int packed domain.
+class AVX5128IBase : T8PD {
+ Domain ExeDomain = SSEPackedInt;
+}
+class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
+ Requires<[HasAVX512]>;
+class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS,
+ Requires<[HasAVX512]>;
+// AVX512XDI - AVX-512 instructions with XD prefix, int packed domain.
+class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD,
+ Requires<[HasAVX512]>;
+class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+ Requires<[HasAVX512]>;
+// AVX512BIBase - PD prefix, int packed domain.
+class AVX512BIBase : PD {
+ Domain ExeDomain = SSEPackedInt;
+}
+// AVX512BIi8 - AVX512BI with ImmT = Imm8.
+class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+ Requires<[HasAVX512]>;
+// The following *Ii8Base classes each pair one prefix with an execution
+// domain and set ImmT to Imm8.
+class AVX512BIi8Base : PD {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512XSIi8Base : XS {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512XDIi8Base : XD {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512PSIi8Base : PS {
+ Domain ExeDomain = SSEPackedSingle;
+ ImmType ImmT = Imm8;
+}
+class AVX512PDIi8Base : PD {
+ Domain ExeDomain = SSEPackedDouble;
+ ImmType ImmT = Imm8;
+}
+class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasAVX512]>;
+// AVX512AIi8Base - TAPD prefix and ImmT = Imm8; ExeDomain is not set here.
+class AVX512AIi8Base : TAPD {
+ ImmType ImmT = Imm8;
+}
+// AVX512Ii8 - ImmT = Imm8, int packed domain, no prefix/map selected here.
+class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+ Requires<[HasAVX512]>;
+class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ Requires<[HasAVX512]>;
+class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ Requires<[HasAVX512]>;
+// AVX512PIi8 / AVX512PI - the execution domain is a required argument that
+// comes before itin; no prefix/map is selected here.
+class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+// AVX512FMA3 - T8PD prefix with the EVEX_4V encoding, default domain.
+class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ EVEX_4V, Requires<[HasAVX512]>;
+class AVX512FMA3Base : T8PD, EVEX_4V;
+
+// AVX512 - Only adds the HasAVX512 predicate on top of I.
+class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>;
+
+// AES Instruction Templates:
+//
+// AES8I
+// These use the same encoding as the SSE4.2 T8 and TA encodings.
+class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = IIC_AES>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[HasAES]>;
+
+class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasAES]>;
+
+// PCLMUL Instruction Templates
+class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasPCLMUL]>;
+
+class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
+
+// FMA3 Instruction Templates
+class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ VEX_4V, FMASC, Requires<[HasFMA, NoVLX]>;
+
+// FMA4 Instruction Templates
+class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ VEX_4V, FMASC, Requires<[HasFMA4]>;
+
+// XOP 2, 3 and 4 Operand Instruction Template
+class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ XOP9, Requires<[HasXOP]>;
+
+// XOP 2 and 3 Operand Instruction Templates with imm byte
+class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ XOP8, Requires<[HasXOP]>;
+// XOP 4 Operand Instruction Templates with imm byte
+class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ XOP8, Requires<[HasXOP]>;
+
+// XOP 5 operand instruction (VEX encoding!)
+class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ VEX_4V, Requires<[HasXOP]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii16<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii32S<o, F, outs, ins, asm, pattern, itin>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm64, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI - MMX instructions with TB prefix.
+// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMX2I - MMX / SSE2 instructions with PD prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
+// MMXRI - MMX instructions with PS prefix and REX_W.
+// MMXID - MMX instructions with XD prefix.
+// MMXIS - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
new file mode 100644
index 000000000000..c5689d7c698c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -0,0 +1,1099 @@
+//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides pattern fragments useful for SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+ [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+// GPR to low word of MMX.
+def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
+ [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>;
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
+def load_mvmmx : PatFrag<(ops node:$ptr),
+ (x86mmx (MMX_X86movw2d (load node:$ptr)))>;
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
+ SDTCisFP<1>, SDTCisVT<3, i8>,
+ SDTCisVec<1>]>;
+def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
+ SDTCisSameAs<1, 2>, SDTCisInt<3>]>;
+
+def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
+def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+
+// Commutative and Associative FMIN and FMAX.
+def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+
+def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>;
+def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
+def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
+def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>;
+def X86frcp14s : SDNode<"X86ISD::FRCPS", SDTFPBinOp>;
+def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
+def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
+def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
+def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
+def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
+def X86pshufb : SDNode<"X86ISD::PSHUFB",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86psadbw : SDNode<"X86ISD::PSADBW",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>, [SDNPCommutative]>;
+def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisInt<3>]>>;
+def X86andnp : SDNode<"X86ISD::ANDNP",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86multishift : SDNode<"X86ISD::MULTISHIFT",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>]>>;
+def X86pextrb : SDNode<"X86ISD::PEXTRB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>,
+ SDTCisPtrTy<2>]>>;
+def X86pextrw : SDNode<"X86ISD::PEXTRW",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>,
+ SDTCisPtrTy<2>]>>;
+def X86pinsrb : SDNode<"X86ISD::PINSRB",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw : SDNode<"X86ISD::PINSRW",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insertps : SDNode<"X86ISD::INSERTPS",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>;
+def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86vzext : SDNode<"X86ISD::VZEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def X86vsext : SDNode<"X86ISD::VSEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<0, 1>]>;
+
+def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>;
+def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>;
+def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>;
+
+def X86vfpext : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameSizeAs<0, 1>]>>;
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisSameSizeAs<0, 1>]>>;
+
+def X86froundRnd: SDNode<"X86ISD::VFPROUNDS_RND",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f64>,
+ SDTCisSameSizeAs<0, 2>,
+ SDTCisVT<3, i32>]>>;
+
+def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f32>,
+ SDTCisSameSizeAs<0, 2>,
+ SDTCisVT<3, i32>]>>;
+
+def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
+def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
+def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
+def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+
+def X86IntCmpMask : SDTypeProfile<1, 2,
+ [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisSameAs<1, 2>, SDTCisInt<1>,
+ SDTCisSameNumEltsAs<0, 1>]>;
+def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
+def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
+
+def X86CmpMaskCC :
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
+def X86CmpMaskCCRound :
+ SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>,
+ SDTCisVT<4, i32>]>;
+def X86CmpMaskCCScalar :
+ SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def X86CmpMaskCCScalarRound :
+ SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>,
+ SDTCisVT<4, i32>]>;
+
+def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
+def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
+def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
+def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
+
+def X86vshl : SDNode<"X86ISD::VSHL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+def X86vsrl : SDNode<"X86ISD::VSRL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+def X86vsra : SDNode<"X86ISD::VSRA",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+
+def X86vsrav : SDNode<"X86ISD::VSRAV" ,
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+
+def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
+def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
+def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+
+def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>;
+def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>;
+
+def X86vprot : SDNode<"X86ISD::VPROT",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86vproti : SDNode<"X86ISD::VPROTI",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i8>]>>;
+
+def X86vpshl : SDNode<"X86ISD::VPSHL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86vpsha : SDNode<"X86ISD::VPSHA",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+
+def X86vpcom : SDNode<"X86ISD::VPCOM",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>]>>;
+def X86vpcomu : SDNode<"X86ISD::VPCOMU",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>]>>;
+def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
+ SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameSizeAs<0,3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, i8>]>>;
+def X86vpperm : SDNode<"X86ISD::VPPERM",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+
+def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisVec<1>,
+ SDTCisSameAs<2, 1>]>;
+
+def SDTX86Testm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>,
+ SDTCisSameNumEltsAs<0, 1>]>;
+
+def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>;
+def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
+def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>;
+def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>;
+def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>;
+def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>;
+def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
+def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
+def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
+def X86testm : SDNode<"X86ISD::TESTM", SDTX86Testm, [SDNPCommutative]>;
+def X86testnm : SDNode<"X86ISD::TESTNM", SDTX86Testm, [SDNPCommutative]>;
+
+def X86movmsk : SDNode<"X86ISD::MOVMSK",
+ SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>;
+
+def X86select : SDNode<"X86ISD::SELECT",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<1, i1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>,
+ SDTCisSameNumEltsAs<0, 1>]>>;
+
+def X86selects : SDNode<"X86ISD::SELECTS",
+ SDTypeProfile<1, 3, [SDTCisVT<1, i1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>]>>;
+
+def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i32>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
+def X86pmuldq : SDNode<"X86ISD::PMULDQ",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i32>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
+
+def X86extrqi : SDNode<"X86ISD::EXTRQI",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>;
+def X86insertqi : SDNode<"X86ISD::INSERTQI",
+ SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisVT<3, i8>,
+ SDTCisVT<4, i8>]>>;
+
+// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
+// translated into one of the target nodes below during lowering.
+// Note: this is a work in progress...
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>;
+
+def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameSizeAs<0,2>,
+ SDTCisSameNumEltsAs<0,2>]>;
+def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
+def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
+def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisInt<3>,
+ SDTCisSameSizeAs<0, 3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, i32>,
+ SDTCisVT<5, i32>]>;
+def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
+ SDTCisInt<0>, SDTCisInt<1>]>;
+
+def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
+
+def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisSameAs<0,3>,
+ SDTCisVT<4, i8>]>;
+
+def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisVT<3, i32>]>;
+
+def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc.
+ SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisVT<2, i32>]>;
+
+def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
+ SDTCisVT<4, i32>]>;
+
+def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
+def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
+
+def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>;
+def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
+
+def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
+def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
+def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;
+
+def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
+def X86Shuf128 : SDNode<"X86ISD::SHUF128", SDTShuff3OpI>;
+
+def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
+def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
+def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
+
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
+def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
+
+def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
+def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>;
+def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
+def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
+
+def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
+def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
+
+def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
+def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack, [SDNPCommutative]>;
+
+def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
+def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
+def X86VPermv : SDNode<"X86ISD::VPERMV",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisSameNumEltsAs<0,1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
+def X86VPermt2 : SDNode<"X86ISD::VPERMV3",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>,
+ SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>,
+ SDTCisSameSizeAs<0,2>,
+ SDTCisSameAs<0,3>]>, []>;
+
+// Even though the index operand should be integer, we need to make it match the
+// destination type so that we can pattern match the masked version where the
+// index is also the passthru operand.
+def X86VPermi2X : SDNode<"X86ISD::VPERMIV3",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>, []>;
+
+def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
+
+def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
+
+def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>;
+def X86VFixupimmScalar : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>;
+def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>;
+def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>;
+def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
+def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>;
+def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisFP<1>,
+ SDTCisSameNumEltsAs<0,1>,
+ SDTCisVT<2, i32>]>, []>;
+def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i1>,
+ SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>;
+
+def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSubVecOfVec<1, 0>]>, []>;
+
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
+def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
+ SDTCisPtrTy<3>]>, []>;
+def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
+ [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
+ SDTCisPtrTy<2>]>, []>;
+
+def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
+
+def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
+
+def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
+def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
+def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
+def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
+def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
+def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>;
+def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
+def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
+def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
+def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
+def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>;
+
+def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
+def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
+def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
+def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
+
+def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>;
+def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>;
+def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound>;
+def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>;
+def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>;
+def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>;
+
+// Scalar FMA intrinsics with passthru bits in operand 1.
+def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>;
+def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>;
+def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>;
+def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>;
+
+// Scalar FMA intrinsics with passthru bits in operand 3.
+def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound>;
+def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>;
+def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>;
+def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>;
+
+def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>;
+def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>;
+
+def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
+def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
+def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>;
+
+def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>;
+def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>;
+def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>;
+def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImmRound>;
+def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImmRound>;
+
+def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
+ SDTCisVT<4, i8>]>;
+def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
+ SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
+ SDTCisVT<6, i8>]>;
+
+def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
+def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
+
+def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
+def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
+
+def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>,
+ SDTCisVT<3, i32>]>;
+
+def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisFP<1>]>;
+def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisFP<1>,
+ SDTCisVT<2, i32>]>;
+def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>,
+ SDTCisVec<1>, SDTCisVT<2, i32>]>;
+
+def SDTVintToFP: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisInt<1>]>;
+def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>;
+
+// Scalar
+def X86SintToFpRnd : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND", SDTintToFPRound>;
+def X86UintToFpRnd : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND", SDTintToFPRound>;
+
+def X86cvtts2IntRnd : SDNode<"X86ISD::CVTTS2SI_RND", SDTSFloatToIntRnd>;
+def X86cvtts2UIntRnd : SDNode<"X86ISD::CVTTS2UI_RND", SDTSFloatToIntRnd>;
+
+def X86cvts2si : SDNode<"X86ISD::CVTS2SI_RND", SDTSFloatToIntRnd>;
+def X86cvts2usi : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>;
+
+// Vector with rounding mode
+
+// cvtt fp-to-int stuff
+def X86cvttp2siRnd : SDNode<"X86ISD::CVTTP2SI_RND", SDTFloatToIntRnd>;
+def X86cvttp2uiRnd : SDNode<"X86ISD::CVTTP2UI_RND", SDTFloatToIntRnd>;
+
+def X86VSintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTVintToFPRound>;
+def X86VUintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTVintToFPRound>;
+
+// cvt fp-to-int stuff
+def X86cvtp2IntRnd : SDNode<"X86ISD::CVTP2SI_RND", SDTFloatToIntRnd>;
+def X86cvtp2UIntRnd : SDNode<"X86ISD::CVTP2UI_RND", SDTFloatToIntRnd>;
+
+// Vector without rounding mode
+
+// cvtt fp-to-int stuff
+def X86cvttp2si : SDNode<"X86ISD::CVTTP2SI", SDTFloatToInt>;
+def X86cvttp2ui : SDNode<"X86ISD::CVTTP2UI", SDTFloatToInt>;
+
+def X86VSintToFP : SDNode<"X86ISD::CVTSI2P", SDTVintToFP>;
+def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>;
+
+// cvt fp-to-int stuff
+def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>;
+def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>;
+
+def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>,
+ SDTCisVT<2, i32>]> >;
+
+def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisVT<2, i32>]> >;
+def X86vfpextRnd : SDNode<"X86ISD::VFPEXT_RND",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisOpSmallerThanOp<1, 0>,
+ SDTCisVT<2, i32>]>>;
+def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisOpSmallerThanOp<0, 1>,
+ SDTCisVT<2, i32>]>>;
+
+def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements. These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+ SDNPWantRoot]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+ SDNPWantRoot]>;
+
+def ssmem : Operand<v4f32> {
+ let PrintMethod = "printf32mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86Mem32AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+def sdmem : Operand<v2f64> {
+ let PrintMethod = "printf64mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86Mem64AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+// 128-bit load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// 256-bit load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+
+// 512-bit load pattern fragments
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+
+// 128-/256-/512-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
+
+// Like 'store', but always requires 128-bit vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'store', but always requires 256-bit vector alignment.
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'store', but always requires 512-bit vector alignment.
+def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+// Like 'load', but always requires 128-bit vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires 256-bit vector alignment.
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'load', but always requires 512-bit vector alignment.
+def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+// 128-bit aligned load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def alignedloadv4f32 : PatFrag<(ops node:$ptr),
+ (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr),
+ (v2f64 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr),
+ (v2i64 (alignedload node:$ptr))>;
+
+// 256-bit aligned load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def alignedloadv8f32 : PatFrag<(ops node:$ptr),
+ (v8f32 (alignedload256 node:$ptr))>;
+def alignedloadv4f64 : PatFrag<(ops node:$ptr),
+ (v4f64 (alignedload256 node:$ptr))>;
+def alignedloadv4i64 : PatFrag<(ops node:$ptr),
+ (v4i64 (alignedload256 node:$ptr))>;
+
+// 512-bit aligned load pattern fragments
+def alignedloadv16f32 : PatFrag<(ops node:$ptr),
+ (v16f32 (alignedload512 node:$ptr))>;
+def alignedloadv8f64 : PatFrag<(ops node:$ptr),
+ (v8f64 (alignedload512 node:$ptr))>;
+def alignedloadv8i64 : PatFrag<(ops node:$ptr),
+ (v8i64 (alignedload512 node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others. If the subtarget
+// allows unaligned accesses, match any load, though this may require
+// setting a feature bit in the processor (on startup, for example).
+// Opteron 10h and later implement such a feature.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return Subtarget->hasSSEUnalignedMem()
+ || cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// 128-bit memop pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+
+// These are needed to match a scalar memop that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+ (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+ (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>;
+
+
+// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
+// 16-byte boundary.
+// FIXME: 8 byte alignment for mmx reads is not required
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>;
+
+def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v4i32);
+ return false;
+}]>;
+
+def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v8i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v8i32);
+ return false;
+}]>;
+
+def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v2i64);
+ return false;
+}]>;
+def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v4i64);
+ return false;
+}]>;
+def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v8i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v8i64);
+ return false;
+}]>;
+def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v16i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v16i32);
+ return false;
+}]>;
+
+def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v2i64 ||
+ Sc->getBasePtr().getValueType() == MVT::v2i64);
+ return false;
+}]>;
+
+def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v4i32 ||
+ Sc->getBasePtr().getValueType() == MVT::v4i32);
+ return false;
+}]>;
+
+def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v4i64 ||
+ Sc->getBasePtr().getValueType() == MVT::v4i64);
+ return false;
+}]>;
+
+def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v8i32 ||
+ Sc->getBasePtr().getValueType() == MVT::v8i32);
+ return false;
+}]>;
+
+def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v8i64 ||
+ Sc->getBasePtr().getValueType() == MVT::v8i64);
+ return false;
+}]>;
+def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v16i32 ||
+ Sc->getBasePtr().getValueType() == MVT::v16i32);
+ return false;
+}]>;
+
+// 128-bit bitconvert pattern fragments
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+// 256-bit bitconvert pattern fragments
+def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>;
+def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
+def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
+def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
+def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
+
+// 512-bit bitconvert pattern fragments
+def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>;
+def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
+def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
+def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
+
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fp64imm0 : PatLeaf<(f64 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def I8Imm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 8 bits.
+ return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
+}]>;
+
+def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>; // NOTE(review): bare literal; 8 presumably mirrors _MM_FROUND_NO_EXC (0x08) - confirm.
+def FROUND_CURRENT : ImmLeaf<i32, [{
+ return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION;
+}]>;
+
+// BYTE_imm - Transform bit immediates into byte immediates.
+def BYTE_imm : SDNodeXForm<imm, [{
+ // Transformation function: imm >> 3
+ return getI32Imm(N->getZExtValue() >> 3, SDLoc(N));
+}]>;
+
+// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128/VEXTRACTI128 imm.
+def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
+ return getI8Imm(X86::getExtractVEXTRACT128Immediate(N), SDLoc(N));
+}]>;
+
+// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to
+// VINSERTF128/VINSERTI128 imm.
+def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
+ return getI8Imm(X86::getInsertVINSERT128Immediate(N), SDLoc(N));
+}]>;
+
+// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
+// to VEXTRACTF64x4 imm.
+def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
+ return getI8Imm(X86::getExtractVEXTRACT256Immediate(N), SDLoc(N));
+}]>;
+
+// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to
+// VINSERTF64x4 imm.
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
+ return getI8Imm(X86::getInsertVINSERT256Immediate(N), SDLoc(N));
+}]>;
+
+def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
+ (extract_subvector node:$bigvec,
+ node:$index), [{
+ return X86::isVEXTRACT128Index(N);
+}], EXTRACT_get_vextract128_imm>;
+
+def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+ node:$index),
+ (insert_subvector node:$bigvec, node:$smallvec,
+ node:$index), [{
+ return X86::isVINSERT128Index(N);
+}], INSERT_get_vinsert128_imm>;
+
+
+def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
+ (extract_subvector node:$bigvec,
+ node:$index), [{
+ return X86::isVEXTRACT256Index(N);
+}], EXTRACT_get_vextract256_imm>;
+
+def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+ node:$index),
+ (insert_subvector node:$bigvec, node:$smallvec,
+ node:$index), [{
+ return X86::isVINSERT256Index(N);
+}], INSERT_get_vinsert256_imm>;
+
+def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_load node:$src1, node:$src2, node:$src3), [{
+ return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mload node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mload node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mload node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_load node:$src1, node:$src2, node:$src3), [{
+ return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_load node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->isExpandingLoad();
+}]>;
+
+// Masked store fragments.
+// X86mstore can't be implemented in core DAG files because some targets
+// do not support vector types (llvm-tblgen will fail).
+def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
+ (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+}]>;
+
+def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
+ (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+}]>;
+
+def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isCompressingStore();
+}]>;
+
+// masked truncstore fragments
+// X86mtruncstore can't be implemented in core DAG files because some targets
+// do not support vector types (llvm-tblgen will fail).
+def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def masked_truncstorevi8 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def masked_truncstorevi16 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def masked_truncstorevi32 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTMaskedStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTMaskedStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncSStore node:$val, node:$ptr), [{
+ return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncUSStore node:$val, node:$ptr), [{
+ return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncSStore node:$val, node:$ptr), [{
+ return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncUSStore node:$val, node:$ptr), [{
+ return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncSStore node:$val, node:$ptr), [{
+ return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncUSStore node:$val, node:$ptr), [{
+ return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def assertzext_i1 :
+ PatFrag<(ops node:$src), (assertzext node:$src), [{
+ return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
+}]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 000000000000..579359794fbd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,9731 @@
+//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-instr-info"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "X86GenInstrInfo.inc"
+
+static cl::opt<bool>
+NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"));
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+ cl::desc("Print instructions that the allocator wants to"
+ " fuse, but the X86 backend currently can't"),
+ cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+ cl::desc("Re-materialize load from stub in PIC mode"),
+ cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+PartialRegUpdateClearance("partial-reg-update-clearance",
+ cl::desc("Clearance between two register writes "
+ "for inserting XOR to avoid partial "
+ "register update"),
+ cl::init(64), cl::Hidden);
+static cl::opt<unsigned>
+UndefRegClearance("undef-reg-clearance",
+ cl::desc("How many idle instructions we would like before "
+ "certain undef register reads"),
+ cl::init(128), cl::Hidden);
+
+enum {
+ // Select which memory operand is being unfolded.
+ // (stored in bits 0 - 3)
+ TB_INDEX_0 = 0,
+ TB_INDEX_1 = 1,
+ TB_INDEX_2 = 2,
+ TB_INDEX_3 = 3,
+ TB_INDEX_4 = 4,
+ TB_INDEX_MASK = 0xf, // Mask covering bits 0 - 3.
+
+ // Do not insert the reverse map (MemOp -> RegOp) into the table.
+ // Needed when several RegOps share one MemOp (e.g. ADD16ri and ADD16ri_DB both map to ADD16mi).
+ TB_NO_REVERSE = 1 << 4,
+
+ // Do not insert the forward map (RegOp -> MemOp) into the table.
+ // This is needed for Native Client, which prohibits branch
+ // instructions from using a memory operand.
+ TB_NO_FORWARD = 1 << 5,
+
+ TB_FOLDED_LOAD = 1 << 6, // Bit 6; going by the name, the folded memory operand is read.
+ TB_FOLDED_STORE = 1 << 7, // Bit 7; going by the name, the folded memory operand is written.
+
+ // Minimum alignment required for load/store.
+ // Used for RegOp->MemOp conversion.
+ // (stored in bits 8 - 15)
+ TB_ALIGN_SHIFT = 8,
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, // No minimum alignment.
+ TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, // Field holds the value 16 (presumably bytes - confirm).
+ TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT // Mask covering bits 8 - 15.
+};
+
+struct X86MemoryFoldTableEntry {
+ uint16_t RegOp; // Opcode of the register form (first column of the fold tables).
+ uint16_t MemOp; // Opcode of the corresponding memory form (second column).
+ uint16_t Flags; // Bitwise OR of TB_* values; 0 when the entry needs none.
+};
+
+// Pin the vtable to this file.
+void X86InstrInfo::anchor() {}
+
+X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
+ : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
+ : X86::ADJCALLSTACKDOWN32),
+ (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
+ : X86::ADJCALLSTACKUP32),
+ X86::CATCHRET,
+ (STI.is64Bit() ? X86::RETQ : X86::RETL)),
+ Subtarget(STI), RI(STI.getTargetTriple()) {
+
+ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
+ { X86::ADC32ri, X86::ADC32mi, 0 },
+ { X86::ADC32ri8, X86::ADC32mi8, 0 },
+ { X86::ADC32rr, X86::ADC32mr, 0 },
+ { X86::ADC64ri32, X86::ADC64mi32, 0 },
+ { X86::ADC64ri8, X86::ADC64mi8, 0 },
+ { X86::ADC64rr, X86::ADC64mr, 0 },
+ { X86::ADD16ri, X86::ADD16mi, 0 },
+ { X86::ADD16ri8, X86::ADD16mi8, 0 },
+ { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
+ { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
+ { X86::ADD16rr, X86::ADD16mr, 0 },
+ { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
+ { X86::ADD32ri, X86::ADD32mi, 0 },
+ { X86::ADD32ri8, X86::ADD32mi8, 0 },
+ { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
+ { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32mr, 0 },
+ { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
+ { X86::ADD64ri32, X86::ADD64mi32, 0 },
+ { X86::ADD64ri8, X86::ADD64mi8, 0 },
+ { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
+ { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64mr, 0 },
+ { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
+ { X86::ADD8ri, X86::ADD8mi, 0 },
+ { X86::ADD8rr, X86::ADD8mr, 0 },
+ { X86::AND16ri, X86::AND16mi, 0 },
+ { X86::AND16ri8, X86::AND16mi8, 0 },
+ { X86::AND16rr, X86::AND16mr, 0 },
+ { X86::AND32ri, X86::AND32mi, 0 },
+ { X86::AND32ri8, X86::AND32mi8, 0 },
+ { X86::AND32rr, X86::AND32mr, 0 },
+ { X86::AND64ri32, X86::AND64mi32, 0 },
+ { X86::AND64ri8, X86::AND64mi8, 0 },
+ { X86::AND64rr, X86::AND64mr, 0 },
+ { X86::AND8ri, X86::AND8mi, 0 },
+ { X86::AND8rr, X86::AND8mr, 0 },
+ { X86::DEC16r, X86::DEC16m, 0 },
+ { X86::DEC32r, X86::DEC32m, 0 },
+ { X86::DEC64r, X86::DEC64m, 0 },
+ { X86::DEC8r, X86::DEC8m, 0 },
+ { X86::INC16r, X86::INC16m, 0 },
+ { X86::INC32r, X86::INC32m, 0 },
+ { X86::INC64r, X86::INC64m, 0 },
+ { X86::INC8r, X86::INC8m, 0 },
+ { X86::NEG16r, X86::NEG16m, 0 },
+ { X86::NEG32r, X86::NEG32m, 0 },
+ { X86::NEG64r, X86::NEG64m, 0 },
+ { X86::NEG8r, X86::NEG8m, 0 },
+ { X86::NOT16r, X86::NOT16m, 0 },
+ { X86::NOT32r, X86::NOT32m, 0 },
+ { X86::NOT64r, X86::NOT64m, 0 },
+ { X86::NOT8r, X86::NOT8m, 0 },
+ { X86::OR16ri, X86::OR16mi, 0 },
+ { X86::OR16ri8, X86::OR16mi8, 0 },
+ { X86::OR16rr, X86::OR16mr, 0 },
+ { X86::OR32ri, X86::OR32mi, 0 },
+ { X86::OR32ri8, X86::OR32mi8, 0 },
+ { X86::OR32rr, X86::OR32mr, 0 },
+ { X86::OR64ri32, X86::OR64mi32, 0 },
+ { X86::OR64ri8, X86::OR64mi8, 0 },
+ { X86::OR64rr, X86::OR64mr, 0 },
+ { X86::OR8ri, X86::OR8mi, 0 },
+ { X86::OR8rr, X86::OR8mr, 0 },
+ { X86::ROL16r1, X86::ROL16m1, 0 },
+ { X86::ROL16rCL, X86::ROL16mCL, 0 },
+ { X86::ROL16ri, X86::ROL16mi, 0 },
+ { X86::ROL32r1, X86::ROL32m1, 0 },
+ { X86::ROL32rCL, X86::ROL32mCL, 0 },
+ { X86::ROL32ri, X86::ROL32mi, 0 },
+ { X86::ROL64r1, X86::ROL64m1, 0 },
+ { X86::ROL64rCL, X86::ROL64mCL, 0 },
+ { X86::ROL64ri, X86::ROL64mi, 0 },
+ { X86::ROL8r1, X86::ROL8m1, 0 },
+ { X86::ROL8rCL, X86::ROL8mCL, 0 },
+ { X86::ROL8ri, X86::ROL8mi, 0 },
+ { X86::ROR16r1, X86::ROR16m1, 0 },
+ { X86::ROR16rCL, X86::ROR16mCL, 0 },
+ { X86::ROR16ri, X86::ROR16mi, 0 },
+ { X86::ROR32r1, X86::ROR32m1, 0 },
+ { X86::ROR32rCL, X86::ROR32mCL, 0 },
+ { X86::ROR32ri, X86::ROR32mi, 0 },
+ { X86::ROR64r1, X86::ROR64m1, 0 },
+ { X86::ROR64rCL, X86::ROR64mCL, 0 },
+ { X86::ROR64ri, X86::ROR64mi, 0 },
+ { X86::ROR8r1, X86::ROR8m1, 0 },
+ { X86::ROR8rCL, X86::ROR8mCL, 0 },
+ { X86::ROR8ri, X86::ROR8mi, 0 },
+ { X86::SAR16r1, X86::SAR16m1, 0 },
+ { X86::SAR16rCL, X86::SAR16mCL, 0 },
+ { X86::SAR16ri, X86::SAR16mi, 0 },
+ { X86::SAR32r1, X86::SAR32m1, 0 },
+ { X86::SAR32rCL, X86::SAR32mCL, 0 },
+ { X86::SAR32ri, X86::SAR32mi, 0 },
+ { X86::SAR64r1, X86::SAR64m1, 0 },
+ { X86::SAR64rCL, X86::SAR64mCL, 0 },
+ { X86::SAR64ri, X86::SAR64mi, 0 },
+ { X86::SAR8r1, X86::SAR8m1, 0 },
+ { X86::SAR8rCL, X86::SAR8mCL, 0 },
+ { X86::SAR8ri, X86::SAR8mi, 0 },
+ { X86::SBB32ri, X86::SBB32mi, 0 },
+ { X86::SBB32ri8, X86::SBB32mi8, 0 },
+ { X86::SBB32rr, X86::SBB32mr, 0 },
+ { X86::SBB64ri32, X86::SBB64mi32, 0 },
+ { X86::SBB64ri8, X86::SBB64mi8, 0 },
+ { X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SHL16r1, X86::SHL16m1, 0 },
+ { X86::SHL16rCL, X86::SHL16mCL, 0 },
+ { X86::SHL16ri, X86::SHL16mi, 0 },
+ { X86::SHL32r1, X86::SHL32m1, 0 },
+ { X86::SHL32rCL, X86::SHL32mCL, 0 },
+ { X86::SHL32ri, X86::SHL32mi, 0 },
+ { X86::SHL64r1, X86::SHL64m1, 0 },
+ { X86::SHL64rCL, X86::SHL64mCL, 0 },
+ { X86::SHL64ri, X86::SHL64mi, 0 },
+ { X86::SHL8r1, X86::SHL8m1, 0 },
+ { X86::SHL8rCL, X86::SHL8mCL, 0 },
+ { X86::SHL8ri, X86::SHL8mi, 0 },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
+ { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
+ { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
+ { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
+ { X86::SHR16r1, X86::SHR16m1, 0 },
+ { X86::SHR16rCL, X86::SHR16mCL, 0 },
+ { X86::SHR16ri, X86::SHR16mi, 0 },
+ { X86::SHR32r1, X86::SHR32m1, 0 },
+ { X86::SHR32rCL, X86::SHR32mCL, 0 },
+ { X86::SHR32ri, X86::SHR32mi, 0 },
+ { X86::SHR64r1, X86::SHR64m1, 0 },
+ { X86::SHR64rCL, X86::SHR64mCL, 0 },
+ { X86::SHR64ri, X86::SHR64mi, 0 },
+ { X86::SHR8r1, X86::SHR8m1, 0 },
+ { X86::SHR8rCL, X86::SHR8mCL, 0 },
+ { X86::SHR8ri, X86::SHR8mi, 0 },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
+ { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
+ { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
+ { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
+ { X86::SUB16ri, X86::SUB16mi, 0 },
+ { X86::SUB16ri8, X86::SUB16mi8, 0 },
+ { X86::SUB16rr, X86::SUB16mr, 0 },
+ { X86::SUB32ri, X86::SUB32mi, 0 },
+ { X86::SUB32ri8, X86::SUB32mi8, 0 },
+ { X86::SUB32rr, X86::SUB32mr, 0 },
+ { X86::SUB64ri32, X86::SUB64mi32, 0 },
+ { X86::SUB64ri8, X86::SUB64mi8, 0 },
+ { X86::SUB64rr, X86::SUB64mr, 0 },
+ { X86::SUB8ri, X86::SUB8mi, 0 },
+ { X86::SUB8rr, X86::SUB8mr, 0 },
+ { X86::XOR16ri, X86::XOR16mi, 0 },
+ { X86::XOR16ri8, X86::XOR16mi8, 0 },
+ { X86::XOR16rr, X86::XOR16mr, 0 },
+ { X86::XOR32ri, X86::XOR32mi, 0 },
+ { X86::XOR32ri8, X86::XOR32mi8, 0 },
+ { X86::XOR32rr, X86::XOR32mr, 0 },
+ { X86::XOR64ri32, X86::XOR64mi32, 0 },
+ { X86::XOR64ri8, X86::XOR64mi8, 0 },
+ { X86::XOR64rr, X86::XOR64mr, 0 },
+ { X86::XOR8ri, X86::XOR8mi, 0 },
+ { X86::XOR8rr, X86::XOR8mr, 0 }
+ };
+
+ // Hand every two-address entry to AddTableEntry along with
+ // RegOp2MemOpTable2Addr and MemOp2RegOpTable. The folded operand is index 0
+ // and is both loaded and stored; no alignment flag is added here.
+ for (const X86MemoryFoldTableEntry &FoldEntry : MemoryFoldTable2Addr) {
+   const auto FoldFlags =
+       FoldEntry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE;
+   AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
+                 FoldEntry.RegOp, FoldEntry.MemOp, FoldFlags);
+ }
+
+ // Fold table for instructions whose operand 0 can be replaced by a memory
+ // reference (the loop following this table registers each row with
+ // TB_INDEX_0). Each row is { register-form opcode, memory-form opcode, flags }.
+ // Every row states whether the memory form is a folded load (TB_FOLDED_LOAD)
+ // or a folded store (TB_FOLDED_STORE); rows for the aligned move forms and
+ // some 128-bit extracts additionally carry a TB_ALIGN_16/32/64 flag.
+ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
+ { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
+ { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
+ { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
+ { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
+ { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
+ { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
+ { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
+ { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
+ { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
+ { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
+ { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
+ { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
+ { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
+ { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
+ { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
+ { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
+ { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
+ { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
+ { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
+ { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
+ { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
+ { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
+ { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
+ { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
+ { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
+ { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
+ { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
+ { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
+ { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
+ { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
+ { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
+ { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
+ { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
+ { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
+ { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+ { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
+ { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+ { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
+ { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
+ { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
+ { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
+ { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
+ { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
+ { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
+ { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
+ { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
+ { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
+ { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
+ { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
+ { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
+ { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
+ { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
+ { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
+ { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
+ { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
+ { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
+ { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
+ { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
+ { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
+ { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
+ { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
+ { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
+ { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
+ { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
+ { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
+ { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
+ { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
+ { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
+ { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
+ { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
+
+ // AVX 128-bit versions of foldable instructions
+ { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
+ { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
+ { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
+
+ // AVX 256-bit foldable instructions
+ { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
+
+ // AVX-512 foldable instructions
+ { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
+ { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
+
+ // AVX-512 foldable instructions (256-bit versions)
+ { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
+
+ // AVX-512 foldable instructions (128-bit versions)
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
+
+ // F16C foldable instructions
+ { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
+ { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
+ };
+
+ // Hand every operand-0 entry to AddTableEntry along with RegOp2MemOpTable0
+ // and MemOp2RegOpTable; TB_INDEX_0 is OR'ed into the per-entry flags.
+ for (const X86MemoryFoldTableEntry &FoldEntry : MemoryFoldTable0) {
+   const auto FoldFlags = TB_INDEX_0 | FoldEntry.Flags;
+   AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
+                 FoldEntry.RegOp, FoldEntry.MemOp, FoldFlags);
+ }
+
+ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
+ { X86::BSF16rr, X86::BSF16rm, 0 },
+ { X86::BSF32rr, X86::BSF32rm, 0 },
+ { X86::BSF64rr, X86::BSF64rm, 0 },
+ { X86::BSR16rr, X86::BSR16rm, 0 },
+ { X86::BSR32rr, X86::BSR32rm, 0 },
+ { X86::BSR64rr, X86::BSR64rm, 0 },
+ { X86::CMP16rr, X86::CMP16rm, 0 },
+ { X86::CMP32rr, X86::CMP32rm, 0 },
+ { X86::CMP64rr, X86::CMP64rm, 0 },
+ { X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
+ { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
+ { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
+ { X86::IMUL16rri, X86::IMUL16rmi, 0 },
+ { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
+ { X86::IMUL32rri, X86::IMUL32rmi, 0 },
+ { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE },
+ { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE },
+ { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE },
+ { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
+ { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
+ { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
+ { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
+ { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE },
+ { X86::MOV16rr, X86::MOV16rm, 0 },
+ { X86::MOV32rr, X86::MOV32rm, 0 },
+ { X86::MOV64rr, X86::MOV64rm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV8rr, X86::MOV8rm, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUrm, 0 },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, 0 },
+ { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
+ { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
+ { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
+ { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
+ { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
+ { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
+ { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
+ { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
+ { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
+ { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
+ { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
+ { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
+ { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
+ { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
+ { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
+ { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
+ { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
+ { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
+ { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
+ { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
+ { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
+ { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
+ { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
+ { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
+ { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
+ { X86::RCPSSr, X86::RCPSSm, 0 },
+ { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
+ { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
+ { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
+ { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
+ { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
+ { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
+ { X86::SQRTSDr, X86::SQRTSDm, 0 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
+ { X86::SQRTSSr, X86::SQRTSSm, 0 },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
+ { X86::TEST16rr, X86::TEST16rm, 0 },
+ { X86::TEST32rr, X86::TEST32rm, 0 },
+ { X86::TEST64rr, X86::TEST64rm, 0 },
+ { X86::TEST8rr, X86::TEST8rm, 0 },
+ // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+ { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
+ { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
+
+ // MMX version of foldable instructions
+ { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 },
+ { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
+ { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 },
+ { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 },
+ { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 },
+ { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
+ { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 },
+ { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 },
+ { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 },
+ { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
+
+ // 3DNow! version of foldable instructions
+ { X86::PF2IDrr, X86::PF2IDrm, 0 },
+ { X86::PF2IWrr, X86::PF2IWrm, 0 },
+ { X86::PFRCPrr, X86::PFRCPrm, 0 },
+ { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
+ { X86::PI2FDrr, X86::PI2FDrm, 0 },
+ { X86::PI2FWrr, X86::PI2FWrm, 0 },
+ { X86::PSWAPDrr, X86::PSWAPDrm, 0 },
+
+ // AVX 128-bit versions of foldable instructions
+ { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE },
+ { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE },
+ { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE },
+ { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE },
+ { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
+ { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
+ { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE },
+ { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
+ { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE },
+ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
+ { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
+ { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
+ { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
+ { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
+ { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
+ { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
+ { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
+ { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
+ { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
+ { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 },
+ { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
+ { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
+ { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
+ { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
+ { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
+ { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
+ { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
+ { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::VPABSBrr, X86::VPABSBrm, 0 },
+ { X86::VPABSDrr, X86::VPABSDrm, 0 },
+ { X86::VPABSWrr, X86::VPABSWrm, 0 },
+ { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
+ { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
+ { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
+ { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
+ { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
+ { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
+ { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
+ { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
+ { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
+ { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
+ { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
+ { X86::VPTESTrr, X86::VPTESTrm, 0 },
+ { X86::VRCPPSr, X86::VRCPPSm, 0 },
+ { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
+ { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
+ { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
+ { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
+ { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
+ { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
+ { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
+ { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
+ { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
+
+ // AVX 256-bit foldable instructions
+ { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
+ { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
+ { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
+ { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
+ { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
+ { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
+ { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
+ { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
+ { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
+ { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
+ { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
+ { X86::VPTESTYrr, X86::VPTESTYrm, 0 },
+ { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
+ { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
+ { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
+ { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
+ { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
+ { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
+ { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
+ { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
+
+ // AVX2 foldable instructions
+
+ // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
+ // VBROADCASTS{SD}rm memory instructions were available from AVX1.
+ // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
+ // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions,
+ // so that AVX1 concern does not apply to them; the VPBROADCAST entries below
+ // are nevertheless marked TB_NO_REVERSE.
+ // NOTE(review): presumably for a separate reason - confirm.
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
+ { X86::VPABSBYrr, X86::VPABSBYrm, 0 },
+ { X86::VPABSDYrr, X86::VPABSDYrm, 0 },
+ { X86::VPABSWYrr, X86::VPABSWYrm, 0 },
+ { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
+ { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
+ { X86::VPERMQYri, X86::VPERMQYmi, 0 },
+ { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
+ { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
+ { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
+ { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
+ { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
+ { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
+ { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
+ { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
+ { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
+ { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
+
+ // XOP foldable instructions
+ { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
+ { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 },
+ { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
+ { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 },
+ { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 },
+ { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 },
+ { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
+ { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
+ { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
+ { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
+ { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
+ { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
+ { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
+ { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
+ { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
+ { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
+ { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
+ { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
+ { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
+ { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
+ { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
+ { X86::VPROTBri, X86::VPROTBmi, 0 },
+ { X86::VPROTBrr, X86::VPROTBmr, 0 },
+ { X86::VPROTDri, X86::VPROTDmi, 0 },
+ { X86::VPROTDrr, X86::VPROTDmr, 0 },
+ { X86::VPROTQri, X86::VPROTQmi, 0 },
+ { X86::VPROTQrr, X86::VPROTQmr, 0 },
+ { X86::VPROTWri, X86::VPROTWmi, 0 },
+ { X86::VPROTWrr, X86::VPROTWmr, 0 },
+ { X86::VPSHABrr, X86::VPSHABmr, 0 },
+ { X86::VPSHADrr, X86::VPSHADmr, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQmr, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWmr, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBmr, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDmr, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQmr, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWmr, 0 },
+
+ // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
+ { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
+ { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
+ { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
+ { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
+ { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
+ { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
+ { X86::BLCI32rr, X86::BLCI32rm, 0 },
+ { X86::BLCI64rr, X86::BLCI64rm, 0 },
+ { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
+ { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
+ { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
+ { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
+ { X86::BLCS32rr, X86::BLCS32rm, 0 },
+ { X86::BLCS64rr, X86::BLCS64rm, 0 },
+ { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
+ { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
+ { X86::BLSI32rr, X86::BLSI32rm, 0 },
+ { X86::BLSI64rr, X86::BLSI64rm, 0 },
+ { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
+ { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
+ { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
+ { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
+ { X86::BLSR32rr, X86::BLSR32rm, 0 },
+ { X86::BLSR64rr, X86::BLSR64rm, 0 },
+ { X86::BZHI32rr, X86::BZHI32rm, 0 },
+ { X86::BZHI64rr, X86::BZHI64rm, 0 },
+ { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
+ { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
+ { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
+ { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
+ { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
+ { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
+ { X86::RORX32ri, X86::RORX32mi, 0 },
+ { X86::RORX64ri, X86::RORX64mi, 0 },
+ { X86::SARX32rr, X86::SARX32rm, 0 },
+ { X86::SARX64rr, X86::SARX64rm, 0 },
+ { X86::SHRX32rr, X86::SHRX32rm, 0 },
+ { X86::SHRX64rr, X86::SHRX64rm, 0 },
+ { X86::SHLX32rr, X86::SHLX32rm, 0 },
+ { X86::SHLX64rr, X86::SHLX64rm, 0 },
+ { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
+ { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
+ { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
+ { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
+ { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
+ { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
+ { X86::TZMSK64rr, X86::TZMSK64rm, 0 },
+
+ // AVX-512 foldable instructions
+ { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
+ { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
+ { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
+ { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
+ { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
+ { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMQZri, X86::VPERMQZmi, 0 },
+ { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
+ { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
+ { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
+ { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
+ { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
+ { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
+ { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
+ { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
+ { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
+ { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
+ { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
+ { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
+ { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
+
+ // AVX-512 foldable instructions (256-bit versions)
+ { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
+ { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
+ { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
+ { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
+ { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
+ { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
+ { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
+ { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
+ { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
+ { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
+ { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
+ { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
+ { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
+ { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
+
+ // AVX-512 foldable instructions (128-bit versions)
+ { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
+ { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
+ { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
+ { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
+ { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
+ { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
+
+ // F16C foldable instructions
+ { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
+ { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
+
+ // AES foldable instructions
+ { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
+ { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
+ { X86::VAESIMCrr, X86::VAESIMCrm, 0 },
+ { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
+ };
+
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) { // Visit every row of MemoryFoldTable1 and hand its RegOp/MemOp pair to AddTableEntry together with RegOp2MemOpTable1 and MemOp2RegOpTable.
+ AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
+ Entry.RegOp, Entry.MemOp,
+ // Index 1, folded load: the row's own flags (e.g. TB_ALIGN_16, TB_ALIGN_32, TB_NO_REVERSE, or 0) are OR'ed with TB_INDEX_1 | TB_FOLDED_LOAD before being passed on. NOTE(review): AddTableEntry is defined outside this hunk; presumably it records the pair in both maps -- confirm against its definition.
+ Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+ }
+
+ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
+ { X86::ADC32rr, X86::ADC32rm, 0 },
+ { X86::ADC64rr, X86::ADC64rm, 0 },
+ { X86::ADD16rr, X86::ADD16rm, 0 },
+ { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32rm, 0 },
+ { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64rm, 0 },
+ { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
+ { X86::ADD8rr, X86::ADD8rm, 0 },
+ { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
+ { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
+ { X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
+ { X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
+ { X86::AND16rr, X86::AND16rm, 0 },
+ { X86::AND32rr, X86::AND32rm, 0 },
+ { X86::AND64rr, X86::AND64rm, 0 },
+ { X86::AND8rr, X86::AND8rm, 0 },
+ { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
+ { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
+ { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
+ { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
+ { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
+ { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
+ { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
+ { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
+ { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
+ { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
+ { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
+ { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
+ { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
+ { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
+ { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
+ { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
+ { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
+ { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
+ { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
+ { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
+ { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
+ { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
+ { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
+ { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
+ { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
+ { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
+ { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
+ { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
+ { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
+ { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
+ { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
+ { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
+ { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
+ { X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
+ { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
+ { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
+ { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
+ { X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
+ { X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
+ { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
+ { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
+ { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
+ { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
+ { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
+ { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
+ { X86::IMUL16rr, X86::IMUL16rm, 0 },
+ { X86::IMUL32rr, X86::IMUL32rm, 0 },
+ { X86::IMUL64rr, X86::IMUL64rm, 0 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE },
+ { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
+ { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
+ { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
+ { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
+ { X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXCSDrr, X86::MAXCSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
+ { X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MAXCSSrr, X86::MAXCSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
+ { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
+ { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
+ { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
+ { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
+ { X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINCSDrr, X86::MINCSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
+ { X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MINCSSrr, X86::MINCSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
+ { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
+ { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
+ { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
+ { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
+ { X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
+ { X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
+ { X86::OR16rr, X86::OR16rm, 0 },
+ { X86::OR32rr, X86::OR32rm, 0 },
+ { X86::OR64rr, X86::OR64rm, 0 },
+ { X86::OR8rr, X86::OR8rm, 0 },
+ { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
+ { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
+ { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
+ { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
+ { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
+ { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
+ { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
+ { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
+ { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
+ { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
+ { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
+ { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
+ { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
+ { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
+ { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
+ { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
+ { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
+ { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
+ { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
+ { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
+ { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
+ { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
+ { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
+ { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
+ { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
+ { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
+ { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
+ { X86::PINSRBrr, X86::PINSRBrm, 0 },
+ { X86::PINSRDrr, X86::PINSRDrm, 0 },
+ { X86::PINSRQrr, X86::PINSRQrm, 0 },
+ { X86::PINSRWrri, X86::PINSRWrmi, 0 },
+ { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
+ { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
+ { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
+ { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
+ { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
+ { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
+ { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
+ { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
+ { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
+ { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
+ { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
+ { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
+ { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
+ { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
+ { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
+ { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
+ { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
+ { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
+ { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
+ { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
+ { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
+ { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
+ { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
+ { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 },
+ { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 },
+ { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 },
+ { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
+ { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
+ { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
+ { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
+ { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
+ { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
+ { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
+ { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
+ { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
+ { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
+ { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
+ { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
+ { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
+ { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
+ { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
+ { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
+ { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
+ { X86::SBB32rr, X86::SBB32rm, 0 },
+ { X86::SBB64rr, X86::SBB64rm, 0 },
+ { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
+ { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
+ { X86::SUB16rr, X86::SUB16rm, 0 },
+ { X86::SUB32rr, X86::SUB32rm, 0 },
+ { X86::SUB64rr, X86::SUB64rm, 0 },
+ { X86::SUB8rr, X86::SUB8rm, 0 },
+ { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
+ { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
+ { X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
+ { X86::SUBSSrr, X86::SUBSSrm, 0 },
+ { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
+ // FIXME: TEST*rr -> swapped operand of TEST*mr.
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
+ { X86::XOR16rr, X86::XOR16rm, 0 },
+ { X86::XOR32rr, X86::XOR32rm, 0 },
+ { X86::XOR64rr, X86::XOR64rm, 0 },
+ { X86::XOR8rr, X86::XOR8rm, 0 },
+ { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
+ { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
+
+ // MMX version of foldable instructions
+ { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
+ { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
+ { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
+ { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
+ { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
+ { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
+ { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
+ { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
+ { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
+ { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
+ { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
+ { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
+ { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 },
+ { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
+ { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
+ { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
+ { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
+ { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
+ { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
+ { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
+ { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
+ { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
+ { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
+ { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 },
+ { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 },
+ { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 },
+ { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 },
+ { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 },
+ { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 },
+ { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 },
+ { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
+ { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
+ { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
+ { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
+ { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
+ { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
+ { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 },
+ { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
+ { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
+ { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
+ { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
+ { X86::MMX_PORirr, X86::MMX_PORirm, 0 },
+ { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
+ { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 },
+ { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 },
+ { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 },
+ { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 },
+ { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
+ { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
+ { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
+ { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
+ { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
+ { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
+ { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
+ { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
+ { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
+ { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
+ { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
+ { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
+ { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
+ { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
+ { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
+ { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
+ { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
+ { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
+ { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
+ { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 },
+ { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 },
+ { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 },
+ { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
+
+ // 3DNow! version of foldable instructions
+ { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
+ { X86::PFACCrr, X86::PFACCrm, 0 },
+ { X86::PFADDrr, X86::PFADDrm, 0 },
+ { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
+ { X86::PFCMPGErr, X86::PFCMPGErm, 0 },
+ { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
+ { X86::PFMAXrr, X86::PFMAXrm, 0 },
+ { X86::PFMINrr, X86::PFMINrm, 0 },
+ { X86::PFMULrr, X86::PFMULrm, 0 },
+ { X86::PFNACCrr, X86::PFNACCrm, 0 },
+ { X86::PFPNACCrr, X86::PFPNACCrm, 0 },
+ { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
+ { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
+ { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
+ { X86::PFSUBrr, X86::PFSUBrm, 0 },
+ { X86::PFSUBRrr, X86::PFSUBRrm, 0 },
+ { X86::PMULHRWrr, X86::PMULHRWrm, 0 },
+
+ // AVX 128-bit versions of foldable instructions
+ { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
+ { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE },
+ { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
+ { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
+ { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
+ { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 },
+ { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 },
+ { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
+ { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
+ { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
+ { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
+ { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE },
+ { X86::VADDPDrr, X86::VADDPDrm, 0 },
+ { X86::VADDPSrr, X86::VADDPSrm, 0 },
+ { X86::VADDSDrr, X86::VADDSDrm, 0 },
+ { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSrr, X86::VADDSSrm, 0 },
+ { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
+ { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
+ { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
+ { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
+ { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
+ { X86::VANDPDrr, X86::VANDPDrm, 0 },
+ { X86::VANDPSrr, X86::VANDPSrm, 0 },
+ { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
+ { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
+ { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
+ { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
+ { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
+ { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
+ { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
+ { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
+ { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
+ { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
+ { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
+ { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
+ { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
+ { X86::VDPPDrri, X86::VDPPDrmi, 0 },
+ { X86::VDPPSrri, X86::VDPPSrmi, 0 },
+ { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
+ { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
+ { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
+ { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
+ { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE },
+ { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE },
+ { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
+ { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
+ { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
+ { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
+ { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
+ { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
+ { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
+ { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
+ { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
+ { X86::VMINCPDrr, X86::VMINCPDrm, 0 },
+ { X86::VMINCPSrr, X86::VMINCPSrm, 0 },
+ { X86::VMINCSDrr, X86::VMINCSDrm, 0 },
+ { X86::VMINCSSrr, X86::VMINCSSrm, 0 },
+ { X86::VMINPDrr, X86::VMINPDrm, 0 },
+ { X86::VMINPSrr, X86::VMINPSrm, 0 },
+ { X86::VMINSDrr, X86::VMINSDrm, 0 },
+ { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSrr, X86::VMINSSrm, 0 },
+ { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
+ { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
+ { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
+ { X86::VMULPDrr, X86::VMULPDrm, 0 },
+ { X86::VMULPSrr, X86::VMULPSrm, 0 },
+ { X86::VMULSDrr, X86::VMULSDrm, 0 },
+ { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSrr, X86::VMULSSrm, 0 },
+ { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
+ { X86::VORPDrr, X86::VORPDrm, 0 },
+ { X86::VORPSrr, X86::VORPSrm, 0 },
+ { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
+ { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
+ { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
+ { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
+ { X86::VPADDBrr, X86::VPADDBrm, 0 },
+ { X86::VPADDDrr, X86::VPADDDrm, 0 },
+ { X86::VPADDQrr, X86::VPADDQrm, 0 },
+ { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
+ { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
+ { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
+ { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
+ { X86::VPADDWrr, X86::VPADDWrm, 0 },
+ { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
+ { X86::VPANDNrr, X86::VPANDNrm, 0 },
+ { X86::VPANDrr, X86::VPANDrm, 0 },
+ { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
+ { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
+ { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
+ { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
+ { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
+ { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
+ { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
+ { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
+ { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
+ { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
+ { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
+ { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
+ { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
+ { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
+ { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
+ { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
+ { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
+ { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
+ { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
+ { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
+ { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
+ { X86::VPINSRBrr, X86::VPINSRBrm, 0 },
+ { X86::VPINSRDrr, X86::VPINSRDrm, 0 },
+ { X86::VPINSRQrr, X86::VPINSRQrm, 0 },
+ { X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
+ { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
+ { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
+ { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
+ { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
+ { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
+ { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
+ { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
+ { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
+ { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
+ { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
+ { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
+ { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
+ { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
+ { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
+ { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
+ { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
+ { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
+ { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
+ { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
+ { X86::VPORrr, X86::VPORrm, 0 },
+ { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
+ { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
+ { X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 },
+ { X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 },
+ { X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 },
+ { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
+ { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
+ { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
+ { X86::VPSRADrr, X86::VPSRADrm, 0 },
+ { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
+ { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
+ { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
+ { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
+ { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
+ { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
+ { X86::VPSUBQrr, X86::VPSUBQrm, 0 },
+ { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
+ { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
+ { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
+ { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
+ { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
+ { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
+ { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
+ { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
+ { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
+ { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
+ { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
+ { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
+ { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
+ { X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VRCPSSr, X86::VRCPSSm, 0 },
+ { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
+ { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
+ { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
+ { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
+ { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
+ { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
+ { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
+ { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
+ { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
+ { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
+ { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
+ { X86::VXORPDrr, X86::VXORPDrm, 0 },
+ { X86::VXORPSrr, X86::VXORPSrm, 0 },
+
+ // AVX 256-bit foldable instructions
+ { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
+ { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
+ { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
+ { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
+ { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
+ { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
+ { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
+ { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
+ { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
+ { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
+ { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
+ { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
+ { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
+ { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
+ { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
+ { X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
+ { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
+ { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
+ { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
+ { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
+ { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
+ { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
+ { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
+ { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
+ { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
+ { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
+ { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
+ { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
+ { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
+ { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
+ { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
+ { X86::VORPDYrr, X86::VORPDYrm, 0 },
+ { X86::VORPSYrr, X86::VORPSYrm, 0 },
+ { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
+ { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
+ { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
+ { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
+ { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
+ { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
+ { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
+ { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
+ { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
+
+ // AVX2 foldable instructions
+ { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
+ { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
+ { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
+ { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
+ { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
+ { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
+ { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
+ { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
+ { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
+ { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
+ { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
+ { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
+ { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
+ { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
+ { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
+ { X86::VPANDYrr, X86::VPANDYrm, 0 },
+ { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
+ { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
+ { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
+ { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
+ { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
+ { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
+ { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
+ { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
+ { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
+ { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
+ { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
+ { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
+ { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
+ { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
+ { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
+ { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
+ { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
+ { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
+ { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
+ { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
+ { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
+ { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
+ { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
+ { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
+ { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
+ { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
+ { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
+ { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
+ { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
+ { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
+ { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
+ { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
+ { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
+ { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
+ { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
+ { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
+ { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
+ { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
+ { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
+ { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
+ { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
+ { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
+ { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
+ { X86::VPORYrr, X86::VPORYrm, 0 },
+ { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
+ { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
+ { X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 },
+ { X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 },
+ { X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 },
+ { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
+ { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
+ { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
+ { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
+ { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
+ { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
+ { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
+ { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
+ { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
+ { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
+ { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
+ { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
+ { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
+ { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
+ { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
+ { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
+ { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
+ { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
+ { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
+ { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
+ { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
+ { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
+ { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
+ { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
+ { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
+ { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
+ { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
+ { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
+ { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
+ { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
+ { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
+ { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
+ { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
+ { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
+ { X86::VPXORYrr, X86::VPXORYrm, 0 },
+
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE },
+
+ // XOP foldable instructions
+ { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
+ { X86::VPCMOVrrrY, X86::VPCMOVrmrY, 0 },
+ { X86::VPCOMBri, X86::VPCOMBmi, 0 },
+ { X86::VPCOMDri, X86::VPCOMDmi, 0 },
+ { X86::VPCOMQri, X86::VPCOMQmi, 0 },
+ { X86::VPCOMWri, X86::VPCOMWmi, 0 },
+ { X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
+ { X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
+ { X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
+ { X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
+ { X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
+ { X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 },
+ { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
+ { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
+ { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
+ { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
+ { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
+ { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
+ { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
+ { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
+ { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
+ { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
+ { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
+ { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrmr, 0 },
+ { X86::VPROTBrr, X86::VPROTBrm, 0 },
+ { X86::VPROTDrr, X86::VPROTDrm, 0 },
+ { X86::VPROTQrr, X86::VPROTQrm, 0 },
+ { X86::VPROTWrr, X86::VPROTWrm, 0 },
+ { X86::VPSHABrr, X86::VPSHABrm, 0 },
+ { X86::VPSHADrr, X86::VPSHADrm, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQrm, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWrm, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBrm, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDrm, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQrm, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWrm, 0 },
+
+ // BMI/BMI2 foldable instructions
+ { X86::ANDN32rr, X86::ANDN32rm, 0 },
+ { X86::ANDN64rr, X86::ANDN64rm, 0 },
+ { X86::MULX32rr, X86::MULX32rm, 0 },
+ { X86::MULX64rr, X86::MULX64rm, 0 },
+ { X86::PDEP32rr, X86::PDEP32rm, 0 },
+ { X86::PDEP64rr, X86::PDEP64rm, 0 },
+ { X86::PEXT32rr, X86::PEXT32rm, 0 },
+ { X86::PEXT64rr, X86::PEXT64rm, 0 },
+
+ // ADX foldable instructions
+ { X86::ADCX32rr, X86::ADCX32rm, 0 },
+ { X86::ADCX64rr, X86::ADCX64rm, 0 },
+ { X86::ADOX32rr, X86::ADOX32rm, 0 },
+ { X86::ADOX64rr, X86::ADOX64rm, 0 },
+
+ // AVX-512 foldable instructions
+ { X86::VADDPDZrr, X86::VADDPDZrm, 0 },
+ { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
+ { X86::VADDSDZrr, X86::VADDSDZrm, 0 },
+ { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
+ { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
+ { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
+ { X86::VANDPDZrr, X86::VANDPDZrm, 0 },
+ { X86::VANDPSZrr, X86::VANDPSZrm, 0 },
+ { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
+ { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
+ { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
+ { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
+ { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
+ { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
+ { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
+ { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
+ { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
+ { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
+ { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
+ { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
+ { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
+ { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
+ { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
+ { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
+ { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
+ { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
+ { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
+ { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
+ { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
+ { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
+ { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
+ { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
+ { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
+ { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
+ { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
+ { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
+ { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
+ { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
+ { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
+ { X86::VMINSDZrr, X86::VMINSDZrm, 0 },
+ { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSZrr, X86::VMINSSZrm, 0 },
+ { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
+ { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
+ { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
+ { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
+ { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
+ { X86::VORPDZrr, X86::VORPDZrm, 0 },
+ { X86::VORPSZrr, X86::VORPSZrm, 0 },
+ { X86::VPADDBZrr, X86::VPADDBZrm, 0 },
+ { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
+ { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
+ { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
+ { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
+ { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
+ { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
+ { X86::VPADDWZrr, X86::VPADDWZrm, 0 },
+ { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
+ { X86::VPANDDZrr, X86::VPANDDZrm, 0 },
+ { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
+ { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
+ { X86::VPANDQZrr, X86::VPANDQZrm, 0 },
+ { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
+ { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
+ { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
+ { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
+ { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
+ { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
+ { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
+ { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
+ { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
+ { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
+ { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
+ { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
+ { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
+ { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
+ { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
+ { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
+ { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
+ { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
+ { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
+ { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
+ { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
+ { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
+ { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
+ { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
+ { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
+ { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
+ { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
+ { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
+ { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
+ { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
+ { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
+ { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
+ { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
+ { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
+ { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
+ { X86::VPORDZrr, X86::VPORDZrm, 0 },
+ { X86::VPORQZrr, X86::VPORQZrm, 0 },
+ { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
+ { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
+ { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
+ { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
+ { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
+ { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
+ { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
+ { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
+ { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
+ { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
+ { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
+ { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
+ { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
+ { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
+ { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
+ { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
+ { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
+ { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
+ { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
+ { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
+ { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
+ { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
+ { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
+ { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
+ { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
+ { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
+ { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
+ { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
+ { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
+ { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
+ { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
+ { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
+ { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
+ { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
+ { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
+ { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
+
+ // AVX-512{F,VL} foldable instructions
+ { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
+ { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
+ { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
+ { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
+ { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
+ { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
+ { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
+ { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
+ { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
+ { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
+ { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
+ { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
+ { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
+ { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
+ { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
+ { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
+ { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
+ { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
+ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
+ { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
+ { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
+ { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 },
+ { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 },
+ { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 },
+ { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 },
+ { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
+ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
+ { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
+ { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
+ { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
+ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
+ { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
+ { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
+ { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
+ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
+ { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
+ { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
+ { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
+ { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
+ { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
+ { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
+ { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
+ { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
+ { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
+ { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
+ { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
+ { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
+ { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
+ { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
+ { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
+ { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
+ { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
+ { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
+ { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
+ { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
+ { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
+ { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
+ { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
+ { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
+ { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
+ { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
+ { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
+ { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
+ { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
+ { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
+ { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
+ { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
+ { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
+ { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
+ { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
+ { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
+ { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
+ { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
+ { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
+ { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
+ { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
+ { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
+ { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
+ { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
+ { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
+ { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
+ { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
+ { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
+ { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
+ { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
+ { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
+ { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
+ { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
+ { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
+ { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
+ { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
+ { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
+ { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
+ { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
+ { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
+ { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
+ { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
+ { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
+ { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
+ { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
+ { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
+ { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
+ { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
+ { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
+ { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
+ { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
+ { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
+ { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
+ { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
+ { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
+ { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
+ { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
+ { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
+ { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
+ { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
+ { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
+ { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
+ { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
+ { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
+ { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
+ { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
+ { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
+ { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
+ { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
+ { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
+ { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
+ { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
+ { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
+ { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
+ { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
+ { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
+ { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
+ { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
+ { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
+ { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
+ { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
+ { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
+ { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
+ { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
+ { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
+ { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
+ { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
+ { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
+ { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
+ { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
+ { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
+ { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
+ { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
+ { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
+ { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
+ { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
+ { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
+ { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
+ { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
+ { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
+ { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
+ { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
+
+ // AVX-512 masked foldable instructions
+ { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
+ { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
+ { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
+ { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
+ { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
+ { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
+ { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
+ { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
+ { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
+ { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
+ { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
+ { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
+ { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
+ { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
+ { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
+ { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
+ { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
+
+ // AVX-512VL 256-bit masked foldable instructions
+ { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
+ { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
+ { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
+ { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
+ { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
+ { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
+ { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
+ { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
+ { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
+ { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
+ { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
+ { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
+ { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
+
+ // AVX-512VL 128-bit masked foldable instructions
+ { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
+ { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
+ { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
+ { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
+ { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
+
+ // AES foldable instructions
+ { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
+ { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
+ { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
+ { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
+ { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
+ { X86::VAESDECrr, X86::VAESDECrm, 0 },
+ { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
+ { X86::VAESENCrr, X86::VAESENCrm, 0 },
+
+ // SHA foldable instructions
+ { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
+ { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
+ { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
+ { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
+ { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
+ { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
+ { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
+ };
+
+ // Hand each MemoryFoldTable2 entry to AddTableEntry along with
+ // RegOp2MemOpTable2 and MemOp2RegOpTable. The entry's own flags are
+ // extended with TB_INDEX_2 | TB_FOLDED_LOAD (index 2, folded load).
+ for (const X86MemoryFoldTableEntry &FoldEntry : MemoryFoldTable2) {
+ const auto FoldFlags = FoldEntry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD;
+ AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
+ FoldEntry.RegOp, FoldEntry.MemOp, FoldFlags);
+ }
+
+ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },
+
+ // XOP foldable instructions
+ { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
+ { X86::VPCMOVrrrY, X86::VPCMOVrrmY, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
+ { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
+ { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
+
+ // AVX-512 instructions with 3 source operands.
+ { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
+ { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
+ { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
+ { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
+ { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
+ { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
+ { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
+ { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
+ { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
+ { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
+ { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
+ { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
+ { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
+ { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
+ { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
+ { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
+ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
+
+ // AVX-512VL 256-bit instructions with 3 source operands.
+ { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
+ { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
+ { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
+ { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
+ { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
+ { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
+ { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
+ { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
+ { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
+ { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
+ { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
+ { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
+ { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
+ { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
+
+ // AVX-512VL 128-bit instructions with 3 source operands.
+ { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
+ { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
+ { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
+ { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
+ { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
+ { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
+ { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
+ { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
+ { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
+ { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
+ { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
+ { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
+ { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
+ { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
+
+ // AVX-512 masked instructions
+ { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
+ { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
+ { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
+ { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
+ { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
+ { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
+ { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
+ { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
+ { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
+ { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
+ { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
+ { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
+ { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
+ { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
+ { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
+ { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
+ { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
+ { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
+ { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
+ { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
+ { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
+ { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
+ { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
+ { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
+ { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+ { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
+ { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
+ { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
+ { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
+ { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
+ { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
+ { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
+ { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
+ { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
+ { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
+ { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
+ { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
+ { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
+ { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
+ { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
+ { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
+ { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
+ { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
+ { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
+ { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
+ { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
+ { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
+ { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
+ { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
+ { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
+ { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
+ { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
+ { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
+ { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
+ { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
+ { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
+ { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
+ { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
+ { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
+ { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
+ { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
+ { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
+ { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
+ { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
+ { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
+ { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
+ { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
+ { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
+ { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
+ { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
+ { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
+ { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
+ { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+ { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
+ { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
+ { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
+ { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
+ { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
+ { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
+
+ // AVX-512{F,VL} masked arithmetic instructions 256-bit
+ { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
+ { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
+ { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
+ { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
+ { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
+ { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
+ { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
+ { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
+ { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
+ { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
+ { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
+ { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
+ { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
+ { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
+ { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
+ { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
+ { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
+ { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
+ { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
+ { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
+ { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
+ { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
+ { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
+ { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
+ { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
+ { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+ { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
+ { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
+ { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
+ { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
+ { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
+ { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
+ { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
+ { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
+ { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
+ { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
+ { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
+ { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
+ { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
+ { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
+ { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
+ { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
+ { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
+ { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
+ { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
+ { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
+ { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
+ { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
+ { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
+ { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
+ { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
+ { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
+ { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
+ { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
+ { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
+ { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
+ { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
+ { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
+ { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
+ { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
+ { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
+ { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
+ { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
+ { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
+ { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
+ { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
+ { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
+ { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
+ { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
+ { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
+ { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
+ { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
+ { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
+ { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
+ { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
+ { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
+ { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
+ { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
+
+ // AVX-512{F,VL} masked arithmetic instructions 128-bit
+ { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
+ { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
+ { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
+ { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
+ { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
+ { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
+ { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
+ { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
+ { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
+ { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
+ { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
+ { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
+ { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
+ { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
+ { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
+ { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
+ { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
+ { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
+ { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
+ { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
+ { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
+ { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+ { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
+ { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
+ { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
+ { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
+ { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
+ { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
+ { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
+ { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
+ { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
+ { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
+ { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
+ { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
+ { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
+ { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
+ { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
+ { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
+ { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
+ { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
+ { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
+ { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
+ { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
+ { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
+ { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
+ { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
+ { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
+ { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
+ { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
+ { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
+ { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
+ { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
+ { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
+ { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
+ { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
+ { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
+ { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
+ { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
+ { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
+ { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
+ { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
+ { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
+ { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
+ { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
+ { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
+ { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
+ { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
+ { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
+ { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
+ { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
+
+ // AVX-512 masked foldable instructions
+ { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
+ { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
+ { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
+ { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
+ { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
+ { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
+ { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
+ { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
+ { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
+ { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
+ { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
+ { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
+ { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
+ { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
+ { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
+ { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
+ { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
+
+ // AVX-512VL 256-bit masked foldable instructions
+ { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
+ { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
+ { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
+ { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
+ { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
+ { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
+ { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
+ { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
+ { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
+ { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
+ { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
+ { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
+ { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
+
+ // AVX-512VL 128-bit masked foldable instructions
+ { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
+ { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
+ { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
+ { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
+ { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
+ };
+
+ // Hand each MemoryFoldTable3 pair to AddTableEntry along with the
+ // RegOp2MemOpTable3 and MemOp2RegOpTable maps.
+ for (const X86MemoryFoldTableEntry &Fold : MemoryFoldTable3) {
+ // Index 3, folded load
+ const auto FoldFlags = Fold.Flags | TB_INDEX_3 | TB_FOLDED_LOAD;
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ Fold.RegOp, Fold.MemOp, FoldFlags);
+ }
+ // Walk the FMA3 register/memory opcode pairs. I and E are reused by a later
+ // loop, so they stay declared at this scope.
+ auto I = X86InstrFMA3Info::rm_begin();
+ auto E = X86InstrFMA3Info::rm_end();
+ for (; I != E; ++I) {
+ // Groups that report isKMasked() are not registered by this loop.
+ if (I.getGroup()->isKMasked())
+ continue;
+
+ // Intrinsic forms need to pass TB_NO_REVERSE.
+ const bool IsIntrinsicForm = I.getGroup()->isIntrinsic();
+ const auto BaseFlags = TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD;
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ IsIntrinsicForm ? (BaseFlags | TB_NO_REVERSE) : BaseFlags);
+ }
+
+ // Table of { RegOp, MemOp, Flags } entries pairing a register-form opcode
+ // with its memory-form counterpart. The loop that follows this table passes
+ // each entry to AddTableEntry with TB_INDEX_4 | TB_FOLDED_LOAD added to Flags.
+ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
+ // AVX-512 foldable masked instructions
+ { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
+ { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
+ { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
+ { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
+ { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
+ { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
+ { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
+ { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
+ { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
+ { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
+ { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
+ { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
+ { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
+ { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
+ { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
+ { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
+ { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
+ { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
+ { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
+ { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
+ { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
+ { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
+ { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
+ { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
+ { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+ { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
+ { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
+ { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
+ { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
+ { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
+ { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
+ { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
+ { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
+ { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
+ { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
+ { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
+ { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
+ { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
+ { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
+ { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
+ { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
+ { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
+ { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
+ { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
+ { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
+ { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
+ { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
+ { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
+ { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
+ { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
+ { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
+ { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
+ { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
+ { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
+ { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
+ { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
+ { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
+ { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
+ { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
+ { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
+ { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
+ { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
+ { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
+ { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
+ { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
+ { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
+ { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
+ { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
+ { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
+ { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
+ { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
+ { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
+ { X86::VPSUBWZrrk, X86::VPSUBWZrmk, 0 },
+ { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
+ { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
+ { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
+ { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
+ { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
+ { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
+ { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
+ { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
+ { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
+ { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
+ { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
+ { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
+ { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
+ { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+ { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
+ { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
+ { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
+ { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
+ { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
+ { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
+
+ // AVX-512{F,VL} foldable masked instructions 256-bit
+ { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
+ { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
+ { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
+ { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
+ { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
+ { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
+ { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
+ { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
+ { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
+ { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
+ { X86::VINSERTF32x4Z256rrk, X86::VINSERTF32x4Z256rmk, 0 },
+ { X86::VINSERTF64x2Z256rrk, X86::VINSERTF64x2Z256rmk, 0 },
+ { X86::VINSERTI32x4Z256rrk, X86::VINSERTI32x4Z256rmk, 0 },
+ { X86::VINSERTI64x2Z256rrk, X86::VINSERTI64x2Z256rmk, 0 },
+ { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
+ { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
+ { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
+ { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
+ { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
+ { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
+ { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
+ { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
+ { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
+ { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
+ { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
+ { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+ { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
+ { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
+ { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
+ { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
+ { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
+ { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
+ { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
+ { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
+ { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
+ { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
+ { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
+ { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
+ { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
+ { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
+ { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
+ { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
+ { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
+ { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
+ { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
+ { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
+ { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
+ { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
+ { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
+ { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
+ { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
+ { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
+ { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
+ { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
+ { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
+ { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
+ { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
+ { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
+ { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
+ { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
+ { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
+ { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
+ { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
+ { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
+ { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
+ { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
+ { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
+ { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
+ { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
+ { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
+ { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
+ { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
+ { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
+ { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
+ { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
+ { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
+ { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
+ { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
+ { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
+ { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
+ { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
+ { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
+ { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
+ { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
+ { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
+ { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
+ { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
+ { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
+ { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
+ { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
+ { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
+ { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
+
+ // AVX-512{F,VL} foldable masked instructions 128-bit
+ { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
+ { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
+ { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
+ { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
+ { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
+ { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
+ { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
+ { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
+ { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
+ { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
+ { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
+ { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
+ { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
+ { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
+ { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
+ { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
+ { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
+ { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
+ { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
+ { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
+ { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
+ { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+ { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
+ { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
+ { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
+ { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
+ { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
+ { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
+ { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
+ { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
+ { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
+ { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
+ { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
+ { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
+ { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
+ { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
+ { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
+ { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
+ { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
+ { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
+ { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
+ { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
+ { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
+ { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
+ { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
+ { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
+ { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
+ { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
+ { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
+ { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
+ { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
+ { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
+ { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
+ { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
+ { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
+ { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
+ { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
+ { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
+ { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
+ { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
+ { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
+ { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
+ { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
+ { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
+ { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
+ { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
+ { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
+ { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
+ { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
+ { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
+ { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
+ { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
+ { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
+ { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
+ { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
+ { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
+ { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
+ { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
+ { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
+ { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
+ { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
+ { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
+ { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
+ { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
+
+ // 512-bit three source instructions with zero masking.
+ { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
+ { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
+ { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
+ { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
+ { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
+ { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
+ { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
+ { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
+ { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
+ { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
+ { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
+ { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
+ { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
+ { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
+
+ // 256-bit three source instructions with zero masking.
+ { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
+ { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
+ { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
+ { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
+ { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
+ { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
+ { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
+ { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
+ { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
+ { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
+ { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
+ { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
+ { X86::VPTERNLOGDZ256rrikz, X86::VPTERNLOGDZ256rmikz, 0 },
+ { X86::VPTERNLOGQZ256rrikz, X86::VPTERNLOGQZ256rmikz, 0 },
+
+ // 128-bit three source instructions with zero masking.
+ { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
+ { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
+ { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
+ { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
+ { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
+ { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
+ { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
+ { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
+ { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
+ { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
+ { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
+ { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
+ { X86::VPTERNLOGDZ128rrikz, X86::VPTERNLOGDZ128rmikz, 0 },
+ { X86::VPTERNLOGQZ128rrikz, X86::VPTERNLOGQZ128rmikz, 0 },
+ };
+
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
+ AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+ Entry.RegOp, Entry.MemOp,
+ // Index 4, folded load
+ Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
+ }
+ for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) {
+ if (I.getGroup()->isKMasked()) {
+ // Intrinsics need to pass TB_NO_REVERSE.
+ if (I.getGroup()->isIntrinsic()) {
+ AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE);
+ } else {
+ AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
+ }
+ }
+ }
+}
+
+ /// Record the (\p RegOp, \p MemOp) opcode pairing in the folding maps.
+ ///
+ /// \param R2MTable map keyed by \p RegOp; it receives (MemOp, Flags) unless
+ ///                 \p Flags contains TB_NO_FORWARD.
+ /// \param M2RTable map keyed by \p MemOp; it receives (RegOp, Flags) unless
+ ///                 \p Flags contains TB_NO_REVERSE.
+ /// \param Flags    stored next to the paired opcode in every entry written.
+ ///
+ /// Inserting a key that is already present in a map is asserted against.
+ void
+ X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                             MemOp2RegOpTableType &M2RTable,
+                             uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
+   const bool WantForward = !(Flags & TB_NO_FORWARD);
+   const bool WantReverse = !(Flags & TB_NO_REVERSE);
+
+   if (WantForward) {
+     assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+     R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+   }
+
+   if (WantReverse) {
+     assert(!M2RTable.count(MemOp) &&
+            "Duplicated entries in unfolding maps?");
+     M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+   }
+ }
+
+ /// Recognise the register-to-register MOVSX/MOVZX opcodes listed below.
+ ///
+ /// On success \p SrcReg is set to operand 1's register, \p DstReg to operand
+ /// 0's register and \p SubIdx to the sub-register index matching the source
+ /// width (sub_8bit, sub_16bit or sub_32bit); the function then returns true.
+ /// The out-parameters are left untouched whenever false is returned.
+ bool
+ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                     unsigned &SrcReg, unsigned &DstReg,
+                                     unsigned &SubIdx) const {
+   // First classify the opcode by the width of its source operand.
+   unsigned NarrowIdx;
+   switch (MI.getOpcode()) {
+   default:
+     return false;
+   case X86::MOVSX16rr8:
+   case X86::MOVZX16rr8:
+   case X86::MOVSX32rr8:
+   case X86::MOVZX32rr8:
+   case X86::MOVSX64rr8:
+     // It's not always legal to reference the low 8-bit of the larger
+     // register in 32-bit mode.
+     if (!Subtarget.is64Bit())
+       return false;
+     NarrowIdx = X86::sub_8bit;
+     break;
+   case X86::MOVSX32rr16:
+   case X86::MOVZX32rr16:
+   case X86::MOVSX64rr16:
+     NarrowIdx = X86::sub_16bit;
+     break;
+   case X86::MOVSX64rr32:
+     NarrowIdx = X86::sub_32bit;
+     break;
+   }
+
+   // Be conservative: give up if either operand already uses a sub-register.
+   if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+     return false;
+
+   SrcReg = MI.getOperand(1).getReg();
+   DstReg = MI.getOperand(0).getReg();
+   SubIdx = NarrowIdx;
+   return true;
+ }
+
+ /// Compute the stack-pointer adjustment attributed to \p MI.
+ ///
+ /// - Call-frame setup/destroy pseudos: operand 0 rounded up to the stack
+ ///   alignment, minus operand 1; negated for the destroy opcode.
+ /// - Calls: the negated operand 1 of the next call-frame destroy pseudo that
+ ///   follows in the same block, or 0 if another call or the end of the block
+ ///   is reached first.
+ /// - The PUSH opcodes listed at the bottom: 4 or 8.
+ /// - Anything else: 0.
+ int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
+   const MachineFunction *MF = MI.getParent()->getParent();
+   const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+   if (MI.getOpcode() == getCallFrameSetupOpcode() ||
+       MI.getOpcode() == getCallFrameDestroyOpcode()) {
+     unsigned StackAlign = TFI->getStackAlignment();
+     // Round the frame size up to a multiple of the stack alignment.
+     int SPAdj =
+         (MI.getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign;
+
+     SPAdj -= MI.getOperand(1).getImm();
+
+     if (MI.getOpcode() == getCallFrameSetupOpcode())
+       return SPAdj;
+     else
+       return -SPAdj;
+   }
+
+   // To know whether a call adjusts the stack, we need information
+   // that is bound to the following ADJCALLSTACKUP pseudo.
+   // Look for the next ADJCALLSTACKUP that follows the call.
+   if (MI.isCall()) {
+     const MachineBasicBlock *MBB = MI.getParent();
+     auto I = ++MachineBasicBlock::const_iterator(MI);
+     const auto E = MBB->end();
+     for (; I != E; ++I) {
+       if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+           I->isCall())
+         break;
+     }
+
+     // If we could not find a frame destroy opcode, then it has already
+     // been simplified, so we don't care. The scan may also have run off the
+     // end of the block; the end iterator must not be dereferenced.
+     if (I == E || I->getOpcode() != getCallFrameDestroyOpcode())
+       return 0;
+
+     return -(I->getOperand(1).getImm());
+   }
+
+   // Currently handle only PUSHes we can reasonably expect to see
+   // in call sequences
+   switch (MI.getOpcode()) {
+   default:
+     return 0;
+   case X86::PUSH32i8:
+   case X86::PUSH32r:
+   case X86::PUSH32rmm:
+   case X86::PUSH32rmr:
+   case X86::PUSHi32:
+     return 4;
+   case X86::PUSH64i8:
+   case X86::PUSH64r:
+   case X86::PUSH64rmm:
+   case X86::PUSH64rmr:
+   case X86::PUSH64i32:
+     return 8;
+   }
+ }
+
+ /// Test whether the address operands of \p MI starting at index \p Op are a
+ /// plain frame-index reference: a frame-index base, scale 1, no index
+ /// register and displacement 0.
+ ///
+ /// \returns true and stores the index in \p FrameIndex on a match; otherwise
+ /// returns false and leaves \p FrameIndex unchanged.
+ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
+                                   int &FrameIndex) const {
+   // Check operand kinds first, in the same order as they are laid out.
+   const MachineOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
+   if (!Base.isFI())
+     return false;
+   const MachineOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt);
+   if (!Scale.isImm())
+     return false;
+   const MachineOperand &Index = MI.getOperand(Op + X86::AddrIndexReg);
+   if (!Index.isReg())
+     return false;
+   const MachineOperand &Disp = MI.getOperand(Op + X86::AddrDisp);
+   if (!Disp.isImm())
+     return false;
+
+   // Kinds are right; now the values must describe exactly "[fi]".
+   if (Scale.getImm() != 1 || Index.getReg() != 0 || Disp.getImm() != 0)
+     return false;
+
+   FrameIndex = Base.getIndex();
+   return true;
+ }
+
+ /// Return true if \p Opcode is one of the load opcodes enumerated below, and
+ /// false for every other opcode. isLoadFromStackSlot() and
+ /// isLoadFromStackSlotPostFE() use this as their opcode filter.
+ static bool isFrameLoadOpcode(int Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPDYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQUYrm:
+ case X86::VMOVDQAYrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::VMOVSSZrm:
+ case X86::VMOVSDZrm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZrm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQU64Zrm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::KMOVBkm:
+ case X86::KMOVWkm:
+ case X86::KMOVDkm:
+ case X86::KMOVQkm:
+ return true;
+ }
+ }
+
+ /// Return true if \p Opcode is one of the store opcodes enumerated below, and
+ /// false for every other opcode. isStoreToStackSlot() and
+ /// isStoreToStackSlotPostFE() use this as their opcode filter.
+ static bool isFrameStoreOpcode(int Opcode) {
+ switch (Opcode) {
+ // Unlisted opcodes leave the switch and reach the final "return false".
+ default: break;
+ case X86::MOV8mr:
+ case X86::MOV16mr:
+ case X86::MOV32mr:
+ case X86::MOV64mr:
+ case X86::ST_FpP64m:
+ case X86::MOVSSmr:
+ case X86::MOVSDmr:
+ case X86::MOVAPSmr:
+ case X86::MOVUPSmr:
+ case X86::MOVAPDmr:
+ case X86::MOVUPDmr:
+ case X86::MOVDQAmr:
+ case X86::MOVDQUmr:
+ case X86::VMOVSSmr:
+ case X86::VMOVSDmr:
+ case X86::VMOVAPSmr:
+ case X86::VMOVUPSmr:
+ case X86::VMOVAPDmr:
+ case X86::VMOVUPDmr:
+ case X86::VMOVDQAmr:
+ case X86::VMOVDQUmr:
+ case X86::VMOVUPSYmr:
+ case X86::VMOVAPSYmr:
+ case X86::VMOVUPDYmr:
+ case X86::VMOVAPDYmr:
+ case X86::VMOVDQUYmr:
+ case X86::VMOVDQAYmr:
+ case X86::VMOVSSZmr:
+ case X86::VMOVSDZmr:
+ case X86::VMOVUPSZmr:
+ case X86::VMOVUPSZ128mr:
+ case X86::VMOVUPSZ256mr:
+ case X86::VMOVUPSZ128mr_NOVLX:
+ case X86::VMOVUPSZ256mr_NOVLX:
+ case X86::VMOVAPSZmr:
+ case X86::VMOVAPSZ128mr:
+ case X86::VMOVAPSZ256mr:
+ case X86::VMOVAPSZ128mr_NOVLX:
+ case X86::VMOVAPSZ256mr_NOVLX:
+ case X86::VMOVUPDZmr:
+ case X86::VMOVUPDZ128mr:
+ case X86::VMOVUPDZ256mr:
+ case X86::VMOVAPDZmr:
+ case X86::VMOVAPDZ128mr:
+ case X86::VMOVAPDZ256mr:
+ case X86::VMOVDQA32Zmr:
+ case X86::VMOVDQA32Z128mr:
+ case X86::VMOVDQA32Z256mr:
+ case X86::VMOVDQU32Zmr:
+ case X86::VMOVDQU32Z128mr:
+ case X86::VMOVDQU32Z256mr:
+ case X86::VMOVDQA64Zmr:
+ case X86::VMOVDQA64Z128mr:
+ case X86::VMOVDQA64Z256mr:
+ case X86::VMOVDQU64Zmr:
+ case X86::VMOVDQU64Z128mr:
+ case X86::VMOVDQU64Z256mr:
+ case X86::VMOVDQU8Zmr:
+ case X86::VMOVDQU8Z128mr:
+ case X86::VMOVDQU8Z256mr:
+ case X86::VMOVDQU16Zmr:
+ case X86::VMOVDQU16Z128mr:
+ case X86::VMOVDQU16Z256mr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
+ case X86::KMOVBmk:
+ case X86::KMOVWmk:
+ case X86::KMOVDmk:
+ case X86::KMOVQmk:
+ return true;
+ }
+ return false;
+ }
+
+ /// If \p MI is one of the opcodes accepted by isFrameLoadOpcode(), its
+ /// destination (operand 0) has no sub-register index, and the address
+ /// operands starting at index 1 satisfy isFrameOperand(), return the
+ /// destination register and set \p FrameIndex. Otherwise return 0.
+ unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                            int &FrameIndex) const {
+   if (!isFrameLoadOpcode(MI.getOpcode()))
+     return 0;
+
+   const MachineOperand &Dst = MI.getOperand(0);
+   if (Dst.getSubReg() != 0)
+     return 0;
+   if (!isFrameOperand(MI, 1, FrameIndex))
+     return 0;
+
+   return Dst.getReg();
+ }
+
+ /// Like isLoadFromStackSlot(), but when that check does not match, fall back
+ /// to hasLoadFromStackSlot() to cover instructions seen after frame-index
+ /// elimination. Returns 0 for opcodes rejected by isFrameLoadOpcode().
+ unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+                                                  int &FrameIndex) const {
+   if (!isFrameLoadOpcode(MI.getOpcode()))
+     return 0;
+
+   if (unsigned DirectReg = isLoadFromStackSlot(MI, FrameIndex))
+     return DirectReg;
+
+   // Check for post-frame index elimination operations
+   const MachineMemOperand *Unused;
+   return hasLoadFromStackSlot(MI, Unused, FrameIndex);
+ }
+
+ /// If \p MI is one of the opcodes accepted by isFrameStoreOpcode(), the
+ /// stored value (the operand at index X86::AddrNumOperands) has no
+ /// sub-register index, and the address operands starting at index 0 satisfy
+ /// isFrameOperand(), return the stored register and set \p FrameIndex.
+ /// Otherwise return 0.
+ unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                           int &FrameIndex) const {
+   if (!isFrameStoreOpcode(MI.getOpcode()))
+     return 0;
+
+   const MachineOperand &Stored = MI.getOperand(X86::AddrNumOperands);
+   if (Stored.getSubReg() != 0)
+     return 0;
+   if (!isFrameOperand(MI, 0, FrameIndex))
+     return 0;
+
+   return Stored.getReg();
+ }
+
+ /// Like isStoreToStackSlot(), but when that check does not match, fall back
+ /// to hasStoreToStackSlot() to cover instructions seen after frame-index
+ /// elimination. Returns 0 for opcodes rejected by isFrameStoreOpcode().
+ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
+                                                 int &FrameIndex) const {
+   if (!isFrameStoreOpcode(MI.getOpcode()))
+     return 0;
+
+   if (unsigned DirectReg = isStoreToStackSlot(MI, FrameIndex))
+     return DirectReg;
+
+   // Check for post-frame index elimination operations
+   const MachineMemOperand *Unused;
+   return hasStoreToStackSlot(MI, Unused, FrameIndex);
+ }
+
+ /// Return true if \p BaseReg is a virtual register that has at least one
+ /// defining instruction and every definition is an X86::MOVPC32r, i.e. the
+ /// register is the PIC base. Physical registers always yield false.
+ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+   // Don't waste compile time scanning use-def chains of physregs.
+   if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+     return false;
+
+   bool FoundPICDef = false;
+   auto DefIt = MRI.def_instr_begin(BaseReg);
+   const auto DefEnd = MRI.def_instr_end();
+   while (DefIt != DefEnd) {
+     // Any definition that is not MOVPC32r disqualifies the register.
+     if (DefIt->getOpcode() != X86::MOVPC32r)
+       return false;
+     assert(!FoundPICDef && "More than one PIC base?");
+     FoundPICDef = true;
+     ++DefIt;
+   }
+   return FoundPICDef;
+ }
+
+ /// Decide whether \p MI can be trivially rematerialized.
+ ///
+ /// - For the load opcodes listed first: requires a register base operand, an
+ ///   immediate scale, no index register, and
+ ///   MI.isDereferenceableInvariantLoad(AA). With those satisfied the answer
+ ///   is true when the base register is 0 or RIP; false when the displacement
+ ///   is a global and ReMatPICStubLoad is off; otherwise it is whatever
+ ///   regIsPICBase() reports for the base register.
+ /// - For LEA32r / LEA64r: requires an immediate scale, no index register and
+ ///   a non-register displacement. With those satisfied the answer is true
+ ///   when the base is not a register or is register 0; otherwise it is
+ ///   whatever regIsPICBase() reports for the base register.
+ /// - Every other opcode: true.
+ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const {
+ switch (MI.getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV8rm_NOREX:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVUPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ // AVX-512
+ case X86::VMOVSSZrm:
+ case X86::VMOVSDZrm:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQU64Zrm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
+ case X86::VMOVUPSZrm: {
+ // Loads from constant pools are trivially rematerializable.
+ // The address operands start at index 1 (operand 0 is read elsewhere in
+ // this file as the loaded register), hence the "1 +" offsets.
+ if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
+ MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
+ MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
+ MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+ MI.isDereferenceableInvariantLoad(AA)) {
+ unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+ // No base register, or RIP-relative: nothing else to check.
+ if (BaseReg == 0 || BaseReg == X86::RIP)
+ return true;
+ // Allow re-materialization of PIC load.
+ if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
+ return false;
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+
+ case X86::LEA32r:
+ case X86::LEA64r: {
+ if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
+ MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
+ MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+ !MI.getOperand(1 + X86::AddrDisp).isReg()) {
+ // lea fi#, lea GV, etc. are all rematerializable.
+ if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
+ return true;
+ unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of lea PICBase + x.
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+ }
+
+ // All other instructions marked M_REMATERIALIZABLE are always trivially
+ // rematerializable.
+ return true;
+ }
+
+ /// Conservatively decide whether EFLAGS may be clobbered just before \p I in
+ /// \p MBB.
+ ///
+ /// A forward scan of at most four instructions starting at \p I answers
+ /// false on the first EFLAGS use and true on the first instruction that
+ /// defines or (via a register mask) clobbers EFLAGS. If that scan reaches the
+ /// end of the block, the answer is true exactly when no successor has EFLAGS
+ /// live-in. Otherwise a backward scan of at most four instructions looks for
+ /// an EFLAGS def (answer: whether that def is dead), a kill or register-mask
+ /// clobber (answer: true), or the start of the block (answer: EFLAGS is not
+ /// live-in). If nothing is decided, false is returned.
+ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ MachineBasicBlock::iterator E = MBB.end();
+
+ // For compile time consideration, if we are not able to determine the
+ // safety after visiting 4 instructions in each direction, we will assume
+ // it's not safe.
+ MachineBasicBlock::iterator Iter = I;
+ for (unsigned i = 0; Iter != E && i < 4; ++i) {
+ bool SeenDef = false;
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
+ if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+ SeenDef = true;
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() == X86::EFLAGS) {
+ // A read of EFLAGS means the current value is still needed.
+ if (MO.isUse())
+ return false;
+ SeenDef = true;
+ }
+ }
+
+ if (SeenDef)
+ // This instruction defines EFLAGS, no need to look any further.
+ return true;
+ ++Iter;
+ // Skip over DBG_VALUE.
+ while (Iter != E && Iter->isDebugValue())
+ ++Iter;
+ }
+
+ // It is safe to clobber EFLAGS at the end of a block if no successor has it
+ // live in.
+ if (Iter == E) {
+ for (MachineBasicBlock *S : MBB.successors())
+ if (S->isLiveIn(X86::EFLAGS))
+ return false;
+ return true;
+ }
+
+ MachineBasicBlock::iterator B = MBB.begin();
+ Iter = I;
+ for (unsigned i = 0; i < 4; ++i) {
+ // If we make it to the beginning of the block, it's safe to clobber
+ // EFLAGS iff EFLAGS is not live-in.
+ if (Iter == B)
+ return !MBB.isLiveIn(X86::EFLAGS);
+
+ --Iter;
+ // Skip over DBG_VALUE.
+ while (Iter != B && Iter->isDebugValue())
+ --Iter;
+
+ bool SawKill = false;
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
+ // A register mask may clobber EFLAGS, but we should still look for a
+ // live EFLAGS def.
+ if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+ SawKill = true;
+ if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
+ // An earlier def decides the question: safe only if that def is dead.
+ if (MO.isDef()) return MO.isDead();
+ if (MO.isKill()) SawKill = true;
+ }
+ }
+
+ if (SawKill)
+ // This instruction kills EFLAGS and doesn't redefine it, so
+ // there's no need to look further.
+ return true;
+ }
+
+ // Conservative answer.
+ return false;
+ }
+
+ /// Re-create the value computed by \p Orig immediately before \p I in
+ /// \p MBB, then rewrite the new instruction so that Orig's operand-0 register
+ /// is replaced by \p DestReg with sub-register index \p SubIdx.
+ ///
+ /// Normally \p Orig is simply cloned. If \p Orig defines EFLAGS and
+ /// isSafeToClobberEFLAGS() reports that clobbering it at \p I is not safe, a
+ /// MOV32ri of the equivalent constant is emitted instead; only MOV32r0,
+ /// MOV32r1 and MOV32r_1 are expected in that situation.
+ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned DestReg, unsigned SubIdx,
+                                  const MachineInstr &Orig,
+                                  const TargetRegisterInfo &TRI) const {
+   // Does the original instruction write EFLAGS at all?
+   bool DefinesFlags = false;
+   for (const MachineOperand &Op : Orig.operands()) {
+     if (!Op.isReg() || !Op.isDef() || Op.getReg() != X86::EFLAGS)
+       continue;
+     DefinesFlags = true;
+     break;
+   }
+
+   const bool NeedFlagFreeForm =
+       DefinesFlags && !isSafeToClobberEFLAGS(MBB, I);
+
+   if (!NeedFlagFreeForm) {
+     MachineInstr *Clone = MBB.getParent()->CloneMachineInstr(&Orig);
+     MBB.insert(I, Clone);
+   } else {
+     // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid
+     // side effects.
+     int Imm;
+     switch (Orig.getOpcode()) {
+     case X86::MOV32r0:
+       Imm = 0;
+       break;
+     case X86::MOV32r1:
+       Imm = 1;
+       break;
+     case X86::MOV32r_1:
+       Imm = -1;
+       break;
+     default:
+       llvm_unreachable("Unexpected instruction!");
+     }
+
+     BuildMI(MBB, I, Orig.getDebugLoc(), get(X86::MOV32ri))
+         .addOperand(Orig.getOperand(0))
+         .addImm(Imm);
+   }
+
+   // Whichever form was emitted now sits directly in front of I.
+   MachineInstr &Inserted = *std::prev(I);
+   Inserted.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx,
+                               TRI);
+ }
+
+ /// Return true if any operand of \p MI is a register definition of EFLAGS
+ /// that is not marked dead.
+ bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
+   for (const MachineOperand &Op : MI.operands()) {
+     if (!Op.isReg() || !Op.isDef())
+       continue;
+     if (Op.getReg() != X86::EFLAGS)
+       continue;
+     if (!Op.isDead())
+       return true;
+   }
+   return false;
+ }
+
+ /// Return the immediate shift count held in operand \p ShiftAmtOperandIdx of
+ /// \p MI, masked to six bits when the instruction carries the REX.W flag and
+ /// to five bits otherwise.
+ inline static unsigned getTruncatedShiftCount(MachineInstr &MI,
+                                               unsigned ShiftAmtOperandIdx) {
+   // The shift count is six bits with the REX.W prefix and five bits without.
+   const bool HasRexW = (MI.getDesc().TSFlags & X86II::REX_W) != 0;
+   const unsigned CountMask = HasRexW ? 63 : 31;
+   const unsigned RawCount = MI.getOperand(ShiftAmtOperandIdx).getImm();
+   return RawCount & CountMask;
+ }
+
+ /// Return true if a left shift by \p ShAmt can be expressed through the scale
+ /// factor of a LEA instruction, i.e. when \p ShAmt is 1, 2 or 3.
+ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
+   // A LEA encodes its scale in the two-bit SIB.scale field, so only shift
+   // amounts below 4 are representable. A shift by 0 is rejected as well.
+   return ShAmt >= 1 && ShAmt <= 3;
+ }
+
+ /// Prepare source operand \p Src of \p MI for use as an address register of
+ /// the LEA opcode \p Opc.
+ ///
+ /// \param AllowSP    selects GR32/GR64 (true) or the _NOSP variants (false)
+ ///                   as the register class to constrain to / allocate from.
+ /// \param NewSrc     [out] register to feed into the LEA.
+ /// \param isKill     [out] kill flag to put on that register.
+ /// \param isUndef    [out] undef flag to put on that register.
+ /// \param ImplicitOp [out] for LEA64_32r with a physical source, a copy of
+ ///                   \p Src marked implicit; untouched otherwise.
+ /// \param LV         optional; when non-null its kill information for the
+ ///                   source register is moved to the inserted COPY.
+ /// \returns false only when a virtual source register cannot be constrained
+ ///          to the required class (non-LEA64_32r path); true otherwise.
+ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
+ unsigned Opc, bool AllowSP, unsigned &NewSrc,
+ bool &isKill, bool &isUndef,
+ MachineOperand &ImplicitOp,
+ LiveVariables *LV) const {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetRegisterClass *RC;
+ // Every opcode other than LEA32r (LEA64r, LEA64_32r) takes 64-bit address
+ // registers.
+ if (AllowSP) {
+ RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
+ } else {
+ RC = Opc != X86::LEA32r ?
+ &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+ }
+ unsigned SrcReg = Src.getReg();
+
+ // For both LEA64 and LEA32 the register already has essentially the right
+ // type (32-bit or 64-bit) we may just need to forbid SP.
+ if (Opc != X86::LEA64_32r) {
+ NewSrc = SrcReg;
+ isKill = Src.isKill();
+ isUndef = Src.isUndef();
+
+ if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
+ !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+ return false;
+
+ return true;
+ }
+
+ // This is for an LEA64_32r and incoming registers are 32-bit. One way or
+ // another we need to add 64-bit registers to the final MI.
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ // Keep the original 32-bit register as an implicit operand and address
+ // through its 64-bit super-register.
+ ImplicitOp = Src;
+ ImplicitOp.setImplicit();
+
+ NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
+ isKill = Src.isKill();
+ isUndef = Src.isUndef();
+ } else {
+ // Virtual register of the wrong class, we have to create a temporary 64-bit
+ // vreg to feed into the LEA.
+ NewSrc = MF.getRegInfo().createVirtualRegister(RC);
+ // The COPY is inserted before MI and writes only the sub_32bit part of the
+ // new register.
+ MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ get(TargetOpcode::COPY))
+ .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+ .addOperand(Src);
+
+ // Which is obviously going to be dead after we're done with it.
+ isKill = true;
+ isUndef = false;
+
+ if (LV)
+ LV->replaceKillInstruction(SrcReg, MI, *Copy);
+ }
+
+ // We've set all the parameters without issue.
+ return true;
+ }
+
+ /// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit
+ /// LEA to form 3-address code by promoting to a 32-bit superregister and then
+ /// truncating back down to a 16-bit subregister.
+ ///
+ /// The emitted sequence, inserted before \p MI, is: IMPLICIT_DEF of a wide
+ /// temporary, COPY of the 16-bit source into its sub_16bit part, the LEA
+ /// (LEA64_32r on 64-bit subtargets, LEA32r otherwise) and a final COPY of
+ /// the LEA result's sub_16bit part into the original destination.
+ ///
+ /// \param MIOpc opcode of \p MI; must be one of the 16-bit SHL/INC/DEC/ADD
+ ///              opcodes handled in the switch below.
+ /// \param MFI   iterator to the basic block that receives the new code.
+ /// \param LV    optional LiveVariables to keep up to date.
+ /// \returns the final COPY that defines the original destination register.
+ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
+ unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
+ LiveVariables *LV) const {
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+ bool isDead = MI.getOperand(0).isDead();
+ bool isKill = MI.getOperand(1).isKill();
+
+ MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+ unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ unsigned Opc, leaInReg;
+ if (Subtarget.is64Bit()) {
+ Opc = X86::LEA64_32r;
+ leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ } else {
+ Opc = X86::LEA32r;
+ leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ }
+
+ // Build and insert into an implicit UNDEF value. This is OK because
+ // we'll be shifting and then extracting the lower 16-bits.
+ // This has the potential to cause partial register stall. e.g.
+ // movw (%rbp,%rcx,2), %dx
+ // leal -65(%rdx), %esi
+ // But testing has shown this *does* help performance in 64-bit mode (at
+ // least on modern x86 machines).
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+ MachineInstr *InsMI =
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+ .addReg(Src, getKillRegState(isKill));
+
+ // Start the LEA; its address operands depend on the original opcode.
+ MachineInstrBuilder MIB =
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opc), leaOutReg);
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHL16ri: {
+ // NOTE(review): the raw immediate is used here, not the truncated count;
+ // presumably the caller has already validated it - confirm.
+ unsigned ShAmt = MI.getOperand(2).getImm();
+ MIB.addReg(0).addImm(1ULL << ShAmt)
+ .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
+ break;
+ }
+ case X86::INC16r:
+ addRegOffset(MIB, leaInReg, true, 1);
+ break;
+ case X86::DEC16r:
+ addRegOffset(MIB, leaInReg, true, -1);
+ break;
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ addRegOffset(MIB, leaInReg, true, MI.getOperand(2).getImm());
+ break;
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB: {
+ unsigned Src2 = MI.getOperand(2).getReg();
+ bool isKill2 = MI.getOperand(2).isKill();
+ unsigned leaInReg2 = 0;
+ MachineInstr *InsMI2 = nullptr;
+ if (Src == Src2) {
+ // ADD16rr %reg1028<kill>, %reg1028
+ // just a single insert_subreg.
+ addRegReg(MIB, leaInReg, true, leaInReg, false);
+ } else {
+ if (Subtarget.is64Bit())
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ else
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ // Build and insert into an implicit UNDEF value. This is OK because
+ // we'll be shifting and then extracting the lower 16-bits.
+ // These two instructions are inserted before the LEA itself (&*MIB).
+ BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
+ InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
+ .addReg(Src2, getKillRegState(isKill2));
+ addRegReg(MIB, leaInReg, true, leaInReg2, true);
+ }
+ if (LV && isKill2 && InsMI2)
+ LV->replaceKillInstruction(Src2, MI, *InsMI2);
+ break;
+ }
+ }
+
+ MachineInstr *NewMI = MIB;
+ // Extract the low 16 bits of the LEA result into the original destination.
+ MachineInstr *ExtMI =
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
+
+ if (LV) {
+ // Update live variables
+ LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+ LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+ if (isKill)
+ LV->replaceKillInstruction(Src, MI, *InsMI);
+ if (isDead)
+ LV->replaceKillInstruction(Dest, MI, *ExtMI);
+ }
+
+ return ExtMI;
+ }
+
+/// This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand. This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineInstr &MI, LiveVariables *LV) const {
+ // The following opcodes also sets the condition code register(s). Only
+ // convert them to equivalent lea if the condition code register def's
+ // are dead!
+ if (hasLiveCondCodeDef(MI))
+ return nullptr;
+
+ MachineFunction &MF = *MI.getParent()->getParent();
+ // All instructions input are two-addr instructions. Get the known operands.
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Src = MI.getOperand(1);
+
+ MachineInstr *NewMI = nullptr;
+ // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
+ // we have better subtarget support, enable the 16-bit LEA generation here.
+ // 16-bit LEA is also slow on Core2.
+ bool DisableLEA16 = true;
+ bool is64Bit = Subtarget.is64Bit();
+
+ unsigned MIOpc = MI.getOpcode();
+ switch (MIOpc) {
+ default: return nullptr;
+ case X86::SHL64ri: {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+ // LEA can't handle RSP.
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR64_NOSPRegClass))
+ return nullptr;
+
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest)
+ .addReg(0)
+ .addImm(1ULL << ShAmt)
+ .addOperand(Src)
+ .addImm(0)
+ .addReg(0);
+ break;
+ }
+ case X86::SHL32ri: {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ // LEA can't handle ESP.
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(0)
+ .addImm(1ULL << ShAmt)
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addImm(0)
+ .addReg(0);
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+ NewMI = MIB;
+
+ break;
+ }
+ case X86::SHL16ri: {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+ : nullptr;
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addReg(0)
+ .addImm(1ULL << ShAmt)
+ .addOperand(Src)
+ .addImm(0)
+ .addReg(0);
+ break;
+ }
+ case X86::INC64r:
+ case X86::INC32r: {
+ assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg,
+ getKillRegState(isKill) | getUndefRegState(isUndef));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, 1);
+ break;
+ }
+ case X86::INC16r:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+ : nullptr;
+ assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+ NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addOperand(Src),
+ 1);
+ break;
+ case X86::DEC64r:
+ case X86::DEC32r: {
+ assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) |
+ getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, -1);
+
+ break;
+ }
+ case X86::DEC16r:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+ : nullptr;
+ assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+ NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addOperand(Src),
+ -1);
+ break;
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB: {
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc;
+ if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
+ Opc = X86::LEA64r;
+ else
+ Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
+ return nullptr;
+
+ const MachineOperand &Src2 = MI.getOperand(2);
+ bool isKill2, isUndef2;
+ unsigned SrcReg2;
+ MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
+ SrcReg2, isKill2, isUndef2, ImplicitOp2, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc)).addOperand(Dest);
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+ if (ImplicitOp2.getReg() != 0)
+ MIB.addOperand(ImplicitOp2);
+
+ NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
+
+ // Preserve undefness of the operands.
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
+ if (LV && Src2.isKill())
+ LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
+ break;
+ }
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB: {
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+ : nullptr;
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Src2 = MI.getOperand(2).getReg();
+ bool isKill2 = MI.getOperand(2).isKill();
+ NewMI = addRegReg(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).addOperand(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
+
+ // Preserve undefness of the operands.
+ bool isUndef = MI.getOperand(1).isUndef();
+ bool isUndef2 = MI.getOperand(2).isUndef();
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, *NewMI);
+ break;
+ }
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8_DB:
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest)
+ .addOperand(Src),
+ MI.getOperand(2));
+ break;
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri_DB:
+ case X86::ADD32ri8_DB: {
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) |
+ getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, MI.getOperand(2));
+ break;
+ }
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+ : nullptr;
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addOperand(Src),
+ MI.getOperand(2));
+ break;
+ }
+
+ if (!NewMI) return nullptr;
+
+ if (LV) { // Update live variables
+ if (Src.isKill())
+ LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
+ if (Dest.isDead())
+ LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
+ }
+
+ MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
+ return NewMI;
+}
+
+ /// Classifies a commute request for an instruction with three vector sources,
+ /// taking a possible k-mask operand into account. Also prevents commuting a
+ /// passthru operand.
+ /// Result:
+ ///    0 - the first and second source operands may be commuted.
+ ///    1 - the first and third source operands may be commuted.
+ ///    2 - the second and third source operands may be commuted.
+ ///   -1 - the requested commute is not possible.
+ static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
+                                   unsigned SrcOpIdx2) {
+   // Order the two indices so that only (Lo, Hi) pairs have to be matched.
+   const unsigned Lo = SrcOpIdx1 > SrcOpIdx2 ? SrcOpIdx2 : SrcOpIdx1;
+   const unsigned Hi = SrcOpIdx1 > SrcOpIdx2 ? SrcOpIdx1 : SrcOpIdx2;
+
+   const bool HasKMask = X86II::isKMasked(TSFlags);
+   if (HasKMask) {
+     // Operand 2 is the k-mask; it never takes part in a commute.
+     if (Lo == 2)
+       return -1;
+
+     // Zero-masking leaves the first vector operand free to move.
+     // Merge-masking copies the elements of the first vector operand to the
+     // result wherever the corresponding k-mask bit is 0, so the conservative
+     // answer is to refuse the commute.
+     // TODO/FIXME: The commute still may be legal if it is known that the
+     // k-mask operand is set to either all ones or all zeroes.
+     // It is also Ok to commute the 1st operand if all users of MI use only
+     // the elements enabled by the k-mask operand. For example,
+     //   v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
+     //                                      //                : v1[i];
+     //   VMOVAPSZmrk <mem_addr>, k, v4;     // this is the ONLY user of v4 ->
+     //                                      // Ok, to commute v1 in FMADD213PSZrk.
+     if (X86II::isKMergeMasked(TSFlags) && Lo == 1)
+       return -1;
+   }
+
+   // With a k-mask present the second and third sources sit one slot later.
+   const unsigned FirstSrc = 1;
+   const unsigned SecondSrc = HasKMask ? 3 : 2;
+   const unsigned ThirdSrc = SecondSrc + 1;
+
+   if (Lo == FirstSrc) {
+     if (Hi == SecondSrc)
+       return 0;
+     if (Hi == ThirdSrc)
+       return 1;
+     return -1;
+   }
+   if (Lo == SecondSrc && Hi == ThirdSrc)
+     return 2;
+   return -1;
+ }
+
+ /// Returns the FMA3 opcode that preserves the semantics of MI once the
+ /// operands at SrcOpIdx1 and SrcOpIdx2 have been exchanged, or 0 when no such
+ /// opcode can be chosen (the commute must then be rejected by the caller).
+ ///
+ /// MI         - the FMA3 instruction to be commuted.
+ /// SrcOpIdx1,
+ /// SrcOpIdx2  - indices of the two operands to exchange (in either order).
+ /// FMA3Group  - opcode group supplying the 132/213/231 register and memory
+ ///              forms of MI's operation.
+ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+     const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
+     const X86InstrFMA3Group &FMA3Group) const {
+
+   unsigned Opc = MI.getOpcode();
+
+   // Put the lowest index to SrcOpIdx1 to simplify the checks below.
+   if (SrcOpIdx1 > SrcOpIdx2)
+     std::swap(SrcOpIdx1, SrcOpIdx2);
+
+   // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+   // analysis. The commute optimization is legal only if all users of FMA*_Int
+   // use only the lowest element of the FMA*_Int instruction. Such analysis is
+   // not implemented yet. So, just return 0 in that case.
+   // When such analysis is available this will be the right place for
+   // calling it.
+   if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
+     return 0;
+
+   // Determine which case this commute is or if it can't be done.
+   int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
+   if (Case < 0)
+     return 0;
+
+   // Define the FMA forms mapping array that helps to map input FMA form
+   // to output FMA form to preserve the operation semantics after
+   // commuting the operands.
+   const unsigned Form132Index = 0;
+   const unsigned Form213Index = 1;
+   const unsigned Form231Index = 2;
+   static const unsigned FormMapping[][3] = {
+     // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
+     // FMA132 A, C, b; ==> FMA231 C, A, b;
+     // FMA213 B, A, c; ==> FMA213 A, B, c;
+     // FMA231 C, A, b; ==> FMA132 A, C, b;
+     { Form231Index, Form213Index, Form132Index },
+     // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
+     // FMA132 A, c, B; ==> FMA132 B, c, A;
+     // FMA213 B, a, C; ==> FMA231 C, a, B;
+     // FMA231 C, a, B; ==> FMA213 B, a, C;
+     { Form132Index, Form231Index, Form213Index },
+     // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
+     // FMA132 a, C, B; ==> FMA213 a, B, C;
+     // FMA213 b, A, C; ==> FMA132 b, C, A;
+     // FMA231 c, A, B; ==> FMA231 c, B, A;
+     { Form213Index, Form132Index, Form231Index }
+   };
+
+   // Collect the three forms from the same (register or memory) family as Opc.
+   unsigned FMAForms[3];
+   if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
+     FMAForms[0] = FMA3Group.getReg132Opcode();
+     FMAForms[1] = FMA3Group.getReg213Opcode();
+     FMAForms[2] = FMA3Group.getReg231Opcode();
+   } else {
+     FMAForms[0] = FMA3Group.getMem132Opcode();
+     FMAForms[1] = FMA3Group.getMem213Opcode();
+     FMAForms[2] = FMA3Group.getMem231Opcode();
+   }
+
+   // Find which of the three forms MI currently uses.
+   unsigned FormIndex;
+   for (FormIndex = 0; FormIndex < 3; FormIndex++)
+     if (Opc == FMAForms[FormIndex])
+       break;
+
+   // The search leaves FormIndex == 3 when Opc matches none of the forms;
+   // using it as an index would read past the end of a FormMapping row.
+   // Treat that as "cannot commute" instead of indexing out of bounds.
+   assert(FormIndex < 3 && "Opcode is not one of the FMA3 group's forms!");
+   if (FormIndex >= 3)
+     return 0;
+
+   // Everything is ready, just adjust the FMA opcode and return it.
+   FormIndex = FormMapping[Case][FormIndex];
+   return FMAForms[FormIndex];
+ }
+
+ /// Rewrites the immediate (the last operand) of the VPTERNLOG instruction MI
+ /// so that it matches an exchange of the source operands SrcOpIdx1 and
+ /// SrcOpIdx2. Only the immediate is touched here; the operands themselves are
+ /// left for the caller to exchange.
+ /// Returns false, leaving MI unmodified, when getThreeSrcCommuteCase rejects
+ /// the requested pair; returns true otherwise.
+ static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
+                              unsigned SrcOpIdx2) {
+   // Determine which case this commute is or if it can't be done.
+   const int Case =
+       getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
+   if (Case < 0)
+     return false;
+
+   // Each commute case exchanges two pairs of immediate bits. A row holds the
+   // pairs as (first, partner, first, partner).
+   static const uint8_t SwapMasks[3][4] = {
+     { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
+     { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
+     { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
+   };
+   const uint8_t *Pairs = SwapMasks[Case];
+
+   MachineOperand &ImmOp = MI.getOperand(MI.getNumOperands() - 1);
+   const uint8_t OldImm = ImmOp.getImm();
+
+   // Start from the immediate with all four affected bits cleared ...
+   uint8_t NewImm = OldImm & ~(Pairs[0] | Pairs[1] | Pairs[2] | Pairs[3]);
+   // ... then, for every bit that was set, set its partner instead.
+   for (unsigned I = 0; I < 4; I += 2) {
+     if (OldImm & Pairs[I])
+       NewImm |= Pairs[I + 1];
+     if (OldImm & Pairs[I + 1])
+       NewImm |= Pairs[I];
+   }
+   ImmOp.setImm(NewImm);
+
+   return true;
+ }
+
+ // Returns true if Opcode is a VPERMI2 or VPERMT2 instruction that can be
+ // commuted. The accepted set is the unmasked and zero-masked (kz) register,
+ // memory and (for D/PD/PS/Q) broadcast-memory forms enumerated by the
+ // VPERM_CASES macros below; every other opcode yields false.
+ static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
+ #define VPERM_CASES(Suffix) \
+ case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
+ case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
+ case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
+ case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
+ case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
+ case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
+ case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
+ case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
+ case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
+ case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
+ case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
+ case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
+
+ #define VPERM_CASES_BROADCAST(Suffix) \
+ VPERM_CASES(Suffix) \
+ case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
+ case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
+ case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
+ case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
+ case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
+ case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
+
+ // The macros expand to bare case labels only, so every listed opcode falls
+ // through to the single `return true` at the end of the switch.
+ switch (Opcode) {
+ default: return false;
+ VPERM_CASES(B)
+ VPERM_CASES_BROADCAST(D)
+ VPERM_CASES_BROADCAST(PD)
+ VPERM_CASES_BROADCAST(PS)
+ VPERM_CASES_BROADCAST(Q)
+ VPERM_CASES(W)
+ return true;
+ }
+ #undef VPERM_CASES_BROADCAST
+ #undef VPERM_CASES
+ }
+
+ // Returns the commuted opcode for VPERMI2 and VPERMT2 instructions by
+ // switching from the I opcode to the T opcode and vice versa; the size and
+ // the rr/rm/rmb/kz suffix of the opcode are kept.
+ // Opcode must be one of the forms enumerated below: any other value reaches
+ // the llvm_unreachable at the end. The caller in commuteInstructionImpl
+ // checks isCommutableVPERMV3Instruction before calling this.
+ static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
+ #define VPERM_CASES(Orig, New) \
+ case X86::Orig##128rr: return X86::New##128rr; \
+ case X86::Orig##128rrkz: return X86::New##128rrkz; \
+ case X86::Orig##128rm: return X86::New##128rm; \
+ case X86::Orig##128rmkz: return X86::New##128rmkz; \
+ case X86::Orig##256rr: return X86::New##256rr; \
+ case X86::Orig##256rrkz: return X86::New##256rrkz; \
+ case X86::Orig##256rm: return X86::New##256rm; \
+ case X86::Orig##256rmkz: return X86::New##256rmkz; \
+ case X86::Orig##rr: return X86::New##rr; \
+ case X86::Orig##rrkz: return X86::New##rrkz; \
+ case X86::Orig##rm: return X86::New##rm; \
+ case X86::Orig##rmkz: return X86::New##rmkz;
+
+ #define VPERM_CASES_BROADCAST(Orig, New) \
+ VPERM_CASES(Orig, New) \
+ case X86::Orig##128rmb: return X86::New##128rmb; \
+ case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
+ case X86::Orig##256rmb: return X86::New##256rmb; \
+ case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
+ case X86::Orig##rmb: return X86::New##rmb; \
+ case X86::Orig##rmbkz: return X86::New##rmbkz;
+
+ // First six entries map I -> T, the last six map T -> I.
+ switch (Opcode) {
+ VPERM_CASES(VPERMI2B, VPERMT2B)
+ VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
+ VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
+ VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
+ VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
+ VPERM_CASES(VPERMI2W, VPERMT2W)
+ VPERM_CASES(VPERMT2B, VPERMI2B)
+ VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
+ VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
+ VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
+ VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
+ VPERM_CASES(VPERMT2W, VPERMI2W)
+ }
+
+ llvm_unreachable("Unreachable!");
+ #undef VPERM_CASES_BROADCAST
+ #undef VPERM_CASES
+ }
+
+ /// Commutes the operands OpIdx1 and OpIdx2 of MI. For the opcodes handled
+ /// below, the opcode and/or the immediate operand is first rewritten so that
+ /// the instruction keeps its meaning after the exchange; the exchange itself
+ /// is then delegated to TargetInstrInfo::commuteInstructionImpl.
+ /// When NewMI is true the rewrite is applied to a clone of MI (see
+ /// cloneIfNew) and the base implementation is asked to work in place on that
+ /// clone; otherwise MI itself is modified.
+ /// Returns nullptr in the cases below that reject the commute, otherwise
+ /// whatever the base implementation returns.
+ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ // Yields the instruction to rewrite: a fresh clone of MI when NewMI is set,
+ // MI itself otherwise.
+ auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
+ if (NewMI)
+ return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
+ return MI;
+ };
+
+ switch (MI.getOpcode()) {
+ case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+ case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+ case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+ case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+ case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+ case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+ unsigned Opc;
+ unsigned Size;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+ case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+ case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+ case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+ case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+ case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+ }
+ unsigned Amt = MI.getOperand(3).getImm();
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.getOperand(3).setImm(Size - Amt);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::BLENDPDrri:
+ case X86::BLENDPSrri:
+ case X86::PBLENDWrri:
+ case X86::VBLENDPDrri:
+ case X86::VBLENDPSrri:
+ case X86::VBLENDPDYrri:
+ case X86::VBLENDPSYrri:
+ case X86::VPBLENDDrri:
+ case X86::VPBLENDWrri:
+ case X86::VPBLENDDYrri:
+ case X86::VPBLENDWYrri:{
+ // Commuting a blend inverts the used bits of its immediate; Mask selects
+ // which immediate bits are used for each opcode.
+ unsigned Mask;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::BLENDPDrri: Mask = 0x03; break;
+ case X86::BLENDPSrri: Mask = 0x0F; break;
+ case X86::PBLENDWrri: Mask = 0xFF; break;
+ case X86::VBLENDPDrri: Mask = 0x03; break;
+ case X86::VBLENDPSrri: Mask = 0x0F; break;
+ case X86::VBLENDPDYrri: Mask = 0x0F; break;
+ case X86::VBLENDPSYrri: Mask = 0xFF; break;
+ case X86::VPBLENDDrri: Mask = 0x0F; break;
+ case X86::VPBLENDWrri: Mask = 0xFF; break;
+ case X86::VPBLENDDYrri: Mask = 0xFF; break;
+ case X86::VPBLENDWYrri: Mask = 0xFF; break;
+ }
+ // Only the least significant bits of Imm are used.
+ unsigned Imm = MI.getOperand(3).getImm() & Mask;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Mask ^ Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::MOVSDrr:
+ case X86::MOVSSrr:
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr:{
+ // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
+ if (!Subtarget.hasSSE41())
+ return nullptr;
+
+ unsigned Mask, Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
+ case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
+ case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
+ case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+ }
+
+ // MOVSD/MOVSS's 2nd operand is a FR64/FR32 reg class - we need to copy
+ // this over to a VR128 class like the 1st operand to use a BLENDPD/BLENDPS.
+ auto &MRI = MI.getParent()->getParent()->getRegInfo();
+ auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg());
+ unsigned VR128 = MRI.createVirtualRegister(VR128RC);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY),
+ VR128)
+ .addReg(MI.getOperand(2).getReg());
+
+ // Turn the move into a blend: new opcode, the copied register as the
+ // second source, and the blend mask appended as a new immediate operand.
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.getOperand(2).setReg(VR128);
+ WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::PCLMULQDQrr:
+ case X86::VPCLMULQDQrr:{
+ // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
+ // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
+ // Exchange the two selector bits to follow the exchanged sources.
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned Src1Hi = Imm & 0x01;
+ unsigned Src2Hi = Imm & 0x10;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::CMPSDrr:
+ case X86::CMPSSrr:
+ case X86::CMPPDrri:
+ case X86::CMPPSrri:
+ case X86::VCMPSDrr:
+ case X86::VCMPSSrr:
+ case X86::VCMPPDrri:
+ case X86::VCMPPSrri:
+ case X86::VCMPPDYrri:
+ case X86::VCMPPSYrri:
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri: {
+ // Float comparison can be safely commuted for
+ // Ordered/Unordered/Equal/NotEqual tests
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ default:
+ return nullptr;
+ }
+ }
+ case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
+ case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
+ case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
+ case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPWZrri: case X86::VPCMPUWZrri: {
+ // Flip comparison mode immediate (if necessary).
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x01: Imm = 0x06; break; // LT -> NLE
+ case 0x02: Imm = 0x05; break; // LE -> NLT
+ case 0x05: Imm = 0x02; break; // NLT -> LE
+ case 0x06: Imm = 0x01; break; // NLE -> LT
+ case 0x00: // EQ
+ case 0x03: // FALSE
+ case 0x04: // NE
+ case 0x07: // TRUE
+ break;
+ }
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::VPCOMBri: case X86::VPCOMUBri:
+ case X86::VPCOMDri: case X86::VPCOMUDri:
+ case X86::VPCOMQri: case X86::VPCOMUQri:
+ case X86::VPCOMWri: case X86::VPCOMUWri: {
+ // Flip comparison mode immediate (if necessary).
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x00: Imm = 0x02; break; // LT -> GT
+ case 0x01: Imm = 0x03; break; // LE -> GE
+ case 0x02: Imm = 0x00; break; // GT -> LT
+ case 0x03: Imm = 0x01; break; // GE -> LE
+ case 0x04: // EQ
+ case 0x05: // NE
+ case 0x06: // FALSE
+ case 0x07: // TRUE
+ break;
+ }
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr: {
+ // Flip permute source immediate.
+ // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
+ // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
+ unsigned Imm = MI.getOperand(3).getImm() & 0xFF;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::MOVHLPSrr:
+ case X86::UNPCKHPDrr: {
+ // MOVHLPS and UNPCKHPD are treated as each other's commuted form; the
+ // rewrite is only done when SSE2 is available.
+ if (!Subtarget.hasSSE2())
+ return nullptr;
+
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
+ case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ }
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
+ case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
+ case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
+ case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+ case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
+ case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
+ case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
+ case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+ case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+ case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
+ case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
+ case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+ case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
+ case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+ case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
+ case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
+ // Commuting a conditional move requires the opposite condition: each
+ // opcode is replaced by the CMOV with the inverted condition code of the
+ // same width.
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
+ case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
+ case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
+ case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
+ case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
+ case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
+ case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
+ case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
+ case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
+ case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
+ case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
+ case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
+ case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
+ case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
+ case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
+ case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
+ case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
+ case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
+ case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
+ case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
+ case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
+ case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
+ case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
+ case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
+ case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
+ case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
+ case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
+ case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
+ case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
+ case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
+ case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
+ case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
+ case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
+ case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
+ case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
+ case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
+ case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
+ case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
+ case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
+ case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
+ case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
+ case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+ case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
+ case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
+ case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
+ case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+ case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+ case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+ }
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
+ case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
+ case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
+ case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
+ case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
+ case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
+ // NOTE(review): when NewMI is set the clone is created before
+ // commuteVPTERNLOG can reject the commute; on that path nullptr is
+ // returned and the clone is not handed back to the caller - confirm that
+ // leaving it unattached is acceptable.
+ auto &WorkingMI = cloneIfNew(MI);
+ if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
+ return nullptr;
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ default: {
+ // VPERMI2/VPERMT2: commuting switches between the I and the T opcode.
+ if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
+ unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+
+ // FMA3: pick the 132/213/231 form that matches the exchanged operands;
+ // an opcode of 0 means the commute is not possible.
+ const X86InstrFMA3Group *FMA3Group =
+ X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ if (FMA3Group) {
+ unsigned Opc =
+ getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
+ if (Opc == 0)
+ return nullptr;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+
+ // Nothing X86-specific to adjust: defer entirely to the base class.
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ }
+ }
+ }
+
+ /// Finds a pair of commutable operands for the FMA3 instruction MI.
+ /// SrcOpIdx1 and SrcOpIdx2 are passed on to findThreeSrcCommutedOpIndices,
+ /// which validates them or fills them in. The pair is accepted only if the
+ /// group FMA3Group also provides an opcode that keeps the semantics after
+ /// the exchange.
+ bool X86InstrInfo::findFMA3CommutedOpIndices(
+     const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
+     const X86InstrFMA3Group &FMA3Group) const {
+   // A usable operand pair has to exist in the first place.
+   const bool HavePair = findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+   if (!HavePair)
+     return false;
+
+   // A result of 0 means no opcode preserves the semantics for this pair.
+   const unsigned CommutedOpc =
+       getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group);
+   return CommutedOpc != 0;
+ }
+
+ /// Validates or chooses a pair of commutable vector operands of the
+ /// three-source instruction MI.
+ /// SrcOpIdx1 / SrcOpIdx2 are in-out: each is either a fixed operand index,
+ /// which must lie in the commutable range, or CommuteAnyOperandIndex, in
+ /// which case this method picks an operand. The chosen pair always refers
+ /// to two different registers.
+ /// Returns true and reports the pair through SrcOpIdx1 / SrcOpIdx2 on
+ /// success, false if no suitable pair exists.
+ bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+                                                  unsigned &SrcOpIdx1,
+                                                  unsigned &SrcOpIdx2) const {
+   const uint64_t TSFlags = MI.getDesc().TSFlags;
+   const bool HasKMask = X86II::isKMasked(TSFlags);
+
+   // The k-mask operand has index 2 for masked and zero-masked operations;
+   // 0 stands for "no k-mask operand".
+   const unsigned KMaskOp = HasKMask ? 2 : 0;
+
+   // With merge-masking, operand 1 supplies the elements whose k-mask bit is
+   // 0, so the commutable range starts after the k-mask operand.
+   const unsigned FirstCommutableVecOp =
+       (HasKMask && X86II::isKMergeMasked(TSFlags)) ? 3 : 1;
+
+   // The k-mask operand shifts the last vector operand by one slot; a memory
+   // operand in the last slot is excluded from the range.
+   unsigned LastCommutableVecOp = HasKMask ? 4 : 3;
+   if (isMem(MI, LastCommutableVecOp))
+     --LastCommutableVecOp;
+
+   // A fixed index is unusable when it is outside the commutable range or
+   // names the k-mask operand. CommuteAnyOperandIndex is always acceptable.
+   auto IsUnusableFixedIdx = [&](unsigned Idx) {
+     return Idx != CommuteAnyOperandIndex &&
+            (Idx < FirstCommutableVecOp || Idx > LastCommutableVecOp ||
+             Idx == KMaskOp);
+   };
+   if (IsUnusableFixedIdx(SrcOpIdx1) || IsUnusableFixedIdx(SrcOpIdx2))
+     return false;
+
+   const bool Any1 = SrcOpIdx1 == CommuteAnyOperandIndex;
+   const bool Any2 = SrcOpIdx2 == CommuteAnyOperandIndex;
+
+   // Both operands were given by the caller and have been validated above.
+   if (!Any1 && !Any2)
+     return true;
+
+   // At least one operand is free to choose. Pin one side first: the fixed
+   // index if there is one, otherwise the last commutable operand.
+   unsigned PinnedIdx;
+   if (Any1 && Any2)
+     PinnedIdx = LastCommutableVecOp;
+   else if (Any2)
+     PinnedIdx = SrcOpIdx1;
+   else
+     PinnedIdx = SrcOpIdx2;
+
+   // Scan the commutable range from the back for a partner that holds a
+   // different register; commuting identical registers changes nothing.
+   const unsigned PinnedReg = MI.getOperand(PinnedIdx).getReg();
+   unsigned PartnerIdx = LastCommutableVecOp;
+   while (PartnerIdx >= FirstCommutableVecOp) {
+     // The k-mask operand is skipped without looking at its register.
+     if (PartnerIdx != KMaskOp &&
+         PinnedReg != MI.getOperand(PartnerIdx).getReg())
+       break;
+     --PartnerIdx;
+   }
+
+   // The scan ran off the front of the range: no partner exists.
+   if (PartnerIdx < FirstCommutableVecOp)
+     return false;
+
+   // Report the pair through SrcOpIdx1 / SrcOpIdx2.
+   return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, PartnerIdx, PinnedIdx);
+ }
+
+bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ const MCInstrDesc &Desc = MI.getDesc();
+ if (!Desc.isCommutable())
+ return false;
+
+ switch (MI.getOpcode()) {
+ case X86::CMPSDrr:
+ case X86::CMPSSrr:
+ case X86::CMPPDrri:
+ case X86::CMPPSrri:
+ case X86::VCMPSDrr:
+ case X86::VCMPSSrr:
+ case X86::VCMPPDrri:
+ case X86::VCMPPSrri:
+ case X86::VCMPPDYrri:
+ case X86::VCMPPSYrri:
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri: {
+ // Float comparison can be safely commuted for
+ // Ordered/Unordered/Equal/NotEqual tests
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ // The indices of the commutable operands are 1 and 2.
+ // Assign them to the returned operand indices here.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
+ }
+ return false;
+ }
+ case X86::MOVSDrr:
+ case X86::MOVSSrr:
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ if (Subtarget.hasSSE41())
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
+ }
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
+ case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
+ case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
+ case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
+ case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
+ case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ default:
+ const X86InstrFMA3Group *FMA3Group =
+ X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ if (FMA3Group)
+ return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
+
+ // Handle masked instructions since we need to skip over the mask input
+ // and the preserved input.
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // First assume that the first input is the mask operand and skip past it.
+ unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
+ unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
+ // Check if the first input is tied. If there isn't one then we only
+ // need to skip the mask operand which we did above.
+ if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
+ MCOI::TIED_TO) != -1)) {
+ // If this is zero masking instruction with a tied operand, we need to
+ // move the first index back to the first input since this must
+ // be a 3 input instruction and we want the first two non-mask inputs.
+ // Otherwise this is a 2 input instruction with a preserved input and
+ // mask, so we need to move the indices to skip one more input.
+ if (Desc.TSFlags & X86II::EVEX_Z)
+ --CommutableOpIdx1;
+ else {
+ ++CommutableOpIdx1;
+ ++CommutableOpIdx2;
+ }
+ }
+
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+
+ if (!MI.getOperand(SrcOpIdx1).isReg() ||
+ !MI.getOperand(SrcOpIdx2).isReg())
+ // No idea.
+ return false;
+ return true;
+ }
+
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ }
+ return false;
+}
+
+/// Map a one-byte-displacement conditional jump opcode (the *_1 Jcc forms)
+/// to the condition code it tests. Every other opcode, including JMP_1,
+/// yields X86::COND_INVALID.
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
+  switch (BrOpc) {
+  // Equality.
+  case X86::JE_1:
+    return X86::COND_E;
+  case X86::JNE_1:
+    return X86::COND_NE;
+  // Signed comparisons.
+  case X86::JL_1:
+    return X86::COND_L;
+  case X86::JLE_1:
+    return X86::COND_LE;
+  case X86::JG_1:
+    return X86::COND_G;
+  case X86::JGE_1:
+    return X86::COND_GE;
+  // Unsigned comparisons.
+  case X86::JB_1:
+    return X86::COND_B;
+  case X86::JBE_1:
+    return X86::COND_BE;
+  case X86::JA_1:
+    return X86::COND_A;
+  case X86::JAE_1:
+    return X86::COND_AE;
+  // Individual flag tests.
+  case X86::JS_1:
+    return X86::COND_S;
+  case X86::JNS_1:
+    return X86::COND_NS;
+  case X86::JP_1:
+    return X86::COND_P;
+  case X86::JNP_1:
+    return X86::COND_NP;
+  case X86::JO_1:
+    return X86::COND_O;
+  case X86::JNO_1:
+    return X86::COND_NO;
+  default:
+    break;
+  }
+  return X86::COND_INVALID;
+}
+
+/// Translate a SETcc opcode, in either its register (r) or memory (m) form,
+/// into the condition code it materializes. Opcodes that are not SETcc
+/// instructions map to X86::COND_INVALID.
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+  switch (Opc) {
+  case X86::SETAr:
+  case X86::SETAm:
+    return X86::COND_A;
+  case X86::SETAEr:
+  case X86::SETAEm:
+    return X86::COND_AE;
+  case X86::SETBr:
+  case X86::SETBm:
+    return X86::COND_B;
+  case X86::SETBEr:
+  case X86::SETBEm:
+    return X86::COND_BE;
+  case X86::SETEr:
+  case X86::SETEm:
+    return X86::COND_E;
+  case X86::SETGr:
+  case X86::SETGm:
+    return X86::COND_G;
+  case X86::SETGEr:
+  case X86::SETGEm:
+    return X86::COND_GE;
+  case X86::SETLr:
+  case X86::SETLm:
+    return X86::COND_L;
+  case X86::SETLEr:
+  case X86::SETLEm:
+    return X86::COND_LE;
+  case X86::SETNEr:
+  case X86::SETNEm:
+    return X86::COND_NE;
+  case X86::SETNOr:
+  case X86::SETNOm:
+    return X86::COND_NO;
+  case X86::SETNPr:
+  case X86::SETNPm:
+    return X86::COND_NP;
+  case X86::SETNSr:
+  case X86::SETNSm:
+    return X86::COND_NS;
+  case X86::SETOr:
+  case X86::SETOm:
+    return X86::COND_O;
+  case X86::SETPr:
+  case X86::SETPm:
+    return X86::COND_P;
+  case X86::SETSr:
+  case X86::SETSm:
+    return X86::COND_S;
+  default:
+    break;
+  }
+  return X86::COND_INVALID;
+}
+
+/// Translate a CMOVcc opcode into the condition code it tests. All 16-, 32-
+/// and 64-bit variants are recognized, with either a register (rr) or a
+/// memory (rm) source. Any other opcode yields X86::COND_INVALID.
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
+  switch (Opc) {
+  case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
+  case X86::CMOVA16rm: case X86::CMOVA32rm: case X86::CMOVA64rm:
+    return X86::COND_A;
+
+  case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
+  case X86::CMOVAE16rm: case X86::CMOVAE32rm: case X86::CMOVAE64rm:
+    return X86::COND_AE;
+
+  case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
+  case X86::CMOVB16rm: case X86::CMOVB32rm: case X86::CMOVB64rm:
+    return X86::COND_B;
+
+  case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
+  case X86::CMOVBE16rm: case X86::CMOVBE32rm: case X86::CMOVBE64rm:
+    return X86::COND_BE;
+
+  case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
+  case X86::CMOVE16rm: case X86::CMOVE32rm: case X86::CMOVE64rm:
+    return X86::COND_E;
+
+  case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
+  case X86::CMOVG16rm: case X86::CMOVG32rm: case X86::CMOVG64rm:
+    return X86::COND_G;
+
+  case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+  case X86::CMOVGE16rm: case X86::CMOVGE32rm: case X86::CMOVGE64rm:
+    return X86::COND_GE;
+
+  case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
+  case X86::CMOVL16rm: case X86::CMOVL32rm: case X86::CMOVL64rm:
+    return X86::COND_L;
+
+  case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+  case X86::CMOVLE16rm: case X86::CMOVLE32rm: case X86::CMOVLE64rm:
+    return X86::COND_LE;
+
+  case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+  case X86::CMOVNE16rm: case X86::CMOVNE32rm: case X86::CMOVNE64rm:
+    return X86::COND_NE;
+
+  case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr:
+  case X86::CMOVNO16rm: case X86::CMOVNO32rm: case X86::CMOVNO64rm:
+    return X86::COND_NO;
+
+  case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+  case X86::CMOVNP16rm: case X86::CMOVNP32rm: case X86::CMOVNP64rm:
+    return X86::COND_NP;
+
+  case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+  case X86::CMOVNS16rm: case X86::CMOVNS32rm: case X86::CMOVNS64rm:
+    return X86::COND_NS;
+
+  case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
+  case X86::CMOVO16rm: case X86::CMOVO32rm: case X86::CMOVO64rm:
+    return X86::COND_O;
+
+  case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
+  case X86::CMOVP16rm: case X86::CMOVP32rm: case X86::CMOVP64rm:
+    return X86::COND_P;
+
+  case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
+  case X86::CMOVS16rm: case X86::CMOVS32rm: case X86::CMOVS64rm:
+    return X86::COND_S;
+
+  default:
+    break;
+  }
+  return X86::COND_INVALID;
+}
+
+/// Return the Jcc_1 opcode that branches on condition \p CC. Only the sixteen
+/// condition codes listed below have a corresponding single branch; any
+/// other value is treated as unreachable.
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+  switch (CC) {
+  // Equality.
+  case X86::COND_E:
+    return X86::JE_1;
+  case X86::COND_NE:
+    return X86::JNE_1;
+  // Signed comparisons.
+  case X86::COND_L:
+    return X86::JL_1;
+  case X86::COND_LE:
+    return X86::JLE_1;
+  case X86::COND_G:
+    return X86::JG_1;
+  case X86::COND_GE:
+    return X86::JGE_1;
+  // Unsigned comparisons.
+  case X86::COND_B:
+    return X86::JB_1;
+  case X86::COND_BE:
+    return X86::JBE_1;
+  case X86::COND_A:
+    return X86::JA_1;
+  case X86::COND_AE:
+    return X86::JAE_1;
+  // Individual flag tests.
+  case X86::COND_S:
+    return X86::JS_1;
+  case X86::COND_NS:
+    return X86::JNS_1;
+  case X86::COND_P:
+    return X86::JP_1;
+  case X86::COND_NP:
+    return X86::JNP_1;
+  case X86::COND_O:
+    return X86::JO_1;
+  case X86::COND_NO:
+    return X86::JNO_1;
+  default:
+    llvm_unreachable("Illegal condition code!");
+  }
+}
+
+/// Return the logical negation of \p CC, e.g. COND_E becomes COND_NE. The
+/// two composite codes COND_NE_OR_P and COND_E_AND_NP are each other's
+/// inverse. Any condition code not listed is treated as unreachable.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+  switch (CC) {
+  // Equality.
+  case X86::COND_E:
+    return X86::COND_NE;
+  case X86::COND_NE:
+    return X86::COND_E;
+  // Signed comparisons.
+  case X86::COND_L:
+    return X86::COND_GE;
+  case X86::COND_GE:
+    return X86::COND_L;
+  case X86::COND_LE:
+    return X86::COND_G;
+  case X86::COND_G:
+    return X86::COND_LE;
+  // Unsigned comparisons.
+  case X86::COND_B:
+    return X86::COND_AE;
+  case X86::COND_AE:
+    return X86::COND_B;
+  case X86::COND_BE:
+    return X86::COND_A;
+  case X86::COND_A:
+    return X86::COND_BE;
+  // Individual flag tests.
+  case X86::COND_S:
+    return X86::COND_NS;
+  case X86::COND_NS:
+    return X86::COND_S;
+  case X86::COND_P:
+    return X86::COND_NP;
+  case X86::COND_NP:
+    return X86::COND_P;
+  case X86::COND_O:
+    return X86::COND_NO;
+  case X86::COND_NO:
+    return X86::COND_O;
+  // Composite conditions.
+  case X86::COND_NE_OR_P:
+    return X86::COND_E_AND_NP;
+  case X86::COND_E_AND_NP:
+    return X86::COND_NE_OR_P;
+  default:
+    llvm_unreachable("Illegal condition code!");
+  }
+}
+
+/// Given that the flags were produced by MI(a,b), return the condition code
+/// that tests the same relation once the instruction is rewritten as MI(b,a).
+/// Equality tests are unchanged and ordered comparisons are mirrored; every
+/// other condition code yields X86::COND_INVALID.
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+  switch (CC) {
+  // Symmetric in the operands.
+  case X86::COND_E:
+    return X86::COND_E;
+  case X86::COND_NE:
+    return X86::COND_NE;
+  // Signed comparisons.
+  case X86::COND_L:
+    return X86::COND_G;
+  case X86::COND_G:
+    return X86::COND_L;
+  case X86::COND_LE:
+    return X86::COND_GE;
+  case X86::COND_GE:
+    return X86::COND_LE;
+  // Unsigned comparisons.
+  case X86::COND_B:
+    return X86::COND_A;
+  case X86::COND_A:
+    return X86::COND_B;
+  case X86::COND_BE:
+    return X86::COND_AE;
+  case X86::COND_AE:
+    return X86::COND_BE;
+  default:
+    break;
+  }
+  return X86::COND_INVALID;
+}
+
+/// Look up the SETcc opcode for condition \p CC. The register form is
+/// returned unless \p HasMemoryOperand is set, in which case the memory form
+/// is returned. \p CC must not exceed LAST_VALID_COND.
+unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
+  // Rows are indexed by condition code; column 0 is the register form and
+  // column 1 the memory form.
+  static const uint16_t SETOpcodes[16][2] = {
+    { X86::SETAr,  X86::SETAm  },
+    { X86::SETAEr, X86::SETAEm },
+    { X86::SETBr,  X86::SETBm  },
+    { X86::SETBEr, X86::SETBEm },
+    { X86::SETEr,  X86::SETEm  },
+    { X86::SETGr,  X86::SETGm  },
+    { X86::SETGEr, X86::SETGEm },
+    { X86::SETLr,  X86::SETLm  },
+    { X86::SETLEr, X86::SETLEm },
+    { X86::SETNEr, X86::SETNEm },
+    { X86::SETNOr, X86::SETNOm },
+    { X86::SETNPr, X86::SETNPm },
+    { X86::SETNSr, X86::SETNSm },
+    { X86::SETOr,  X86::SETOm  },
+    { X86::SETPr,  X86::SETPm  },
+    { X86::SETSr,  X86::SETSm  }
+  };
+
+  assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
+  const unsigned Column = HasMemoryOperand ? 1 : 0;
+  return SETOpcodes[CC][Column];
+}
+
+/// Look up the CMOVcc opcode for condition \p CC operating on a register of
+/// \p RegBytes bytes (2, 4 or 8). The memory-source (rm) form is returned
+/// when \p HasMemoryOperand is set, otherwise the register-source (rr) form.
+/// \p CC must be one of the sixteen standard condition codes; any other
+/// register size is treated as unreachable.
+unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
+                              bool HasMemoryOperand) {
+  // Rows 0-15 hold the rr forms indexed by condition code and rows 16-31 the
+  // rm forms. Columns are the 16-, 32- and 64-bit variants.
+  static const uint16_t Opc[32][3] = {
+    // Register source.
+    { X86::CMOVA16rr,  X86::CMOVA32rr,  X86::CMOVA64rr  },
+    { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
+    { X86::CMOVB16rr,  X86::CMOVB32rr,  X86::CMOVB64rr  },
+    { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
+    { X86::CMOVE16rr,  X86::CMOVE32rr,  X86::CMOVE64rr  },
+    { X86::CMOVG16rr,  X86::CMOVG32rr,  X86::CMOVG64rr  },
+    { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
+    { X86::CMOVL16rr,  X86::CMOVL32rr,  X86::CMOVL64rr  },
+    { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
+    { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
+    { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
+    { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
+    { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
+    { X86::CMOVO16rr,  X86::CMOVO32rr,  X86::CMOVO64rr  },
+    { X86::CMOVP16rr,  X86::CMOVP32rr,  X86::CMOVP64rr  },
+    { X86::CMOVS16rr,  X86::CMOVS32rr,  X86::CMOVS64rr  },
+    // Memory source.
+    { X86::CMOVA16rm,  X86::CMOVA32rm,  X86::CMOVA64rm  },
+    { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
+    { X86::CMOVB16rm,  X86::CMOVB32rm,  X86::CMOVB64rm  },
+    { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
+    { X86::CMOVE16rm,  X86::CMOVE32rm,  X86::CMOVE64rm  },
+    { X86::CMOVG16rm,  X86::CMOVG32rm,  X86::CMOVG64rm  },
+    { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
+    { X86::CMOVL16rm,  X86::CMOVL32rm,  X86::CMOVL64rm  },
+    { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
+    { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
+    { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
+    { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
+    { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
+    { X86::CMOVO16rm,  X86::CMOVO32rm,  X86::CMOVO64rm  },
+    { X86::CMOVP16rm,  X86::CMOVP32rm,  X86::CMOVP64rm  },
+    { X86::CMOVS16rm,  X86::CMOVS32rm,  X86::CMOVS64rm  }
+  };
+
+  assert(CC < 16 && "Can only handle standard cond codes");
+
+  // Pick the column from the operand width.
+  unsigned Column;
+  switch (RegBytes) {
+  case 2:
+    Column = 0;
+    break;
+  case 4:
+    Column = 1;
+    break;
+  case 8:
+    Column = 2;
+    break;
+  default:
+    llvm_unreachable("Illegal register size!");
+  }
+
+  // The rm rows start sixteen entries after the rr rows.
+  const unsigned Row = HasMemoryOperand ? 16 + CC : CC;
+  return Opc[Row][Column];
+}
+
+/// Return true if \p MI is a terminator that is not controlled by a
+/// predicate. Non-terminators are never reported. A branch that is not a
+/// barrier (a conditional branch) is always reported, as is any terminator
+/// that is not predicable.
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+  if (!MI.isTerminator())
+    return false;
+
+  // Conditional branch is a special case.
+  const bool IsConditionalBranch = MI.isBranch() && !MI.isBarrier();
+  return IsConditionalBranch || !MI.isPredicable() || !isPredicated(MI);
+}
+
+/// Return true if \p MI is one of the TCRETURN pseudo opcodes (direct,
+/// register-indirect or memory-indirect, in either the 32-bit or the 64-bit
+/// flavour).
+bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
+  const unsigned Opcode = MI.getOpcode();
+  return Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNri ||
+         Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNdi64 ||
+         Opcode == X86::TCRETURNri64 || Opcode == X86::TCRETURNmi64;
+}
+
+/// Decide whether the conditional branch described by \p BranchCond can be
+/// folded together with \p TailCall into a single conditional tail call.
+///
+/// This requires a direct tail call (TCRETURNdi / TCRETURNdi64), a target
+/// other than Win64, a condition no greater than X86::LAST_VALID_COND, and
+/// no stack adjustment (zero return-address delta and a zero second operand
+/// on the tail call).
+bool X86InstrInfo::canMakeTailCallConditional(
+    SmallVectorImpl<MachineOperand> &BranchCond,
+    const MachineInstr &TailCall) const {
+  // Only direct calls can be done with a conditional branch.
+  const unsigned TailCallOpc = TailCall.getOpcode();
+  const bool IsDirectCall =
+      TailCallOpc == X86::TCRETURNdi || TailCallOpc == X86::TCRETURNdi64;
+  if (!IsDirectCall)
+    return false;
+
+  // Conditional tail calls confuse the Win64 unwinder.
+  // TODO: Allow them for "leaf" functions; PR30337.
+  if (Subtarget.isTargetWin64())
+    return false;
+
+  // Can't make a conditional tail call with a composite condition.
+  assert(BranchCond.size() == 1);
+  if (BranchCond[0].getImm() > X86::LAST_VALID_COND)
+    return false;
+
+  // A conditional tail call cannot do any stack adjustment.
+  const X86MachineFunctionInfo *X86FI =
+      TailCall.getParent()->getParent()->getInfo<X86MachineFunctionInfo>();
+  const bool NeedsStackAdjustment = X86FI->getTCReturnAddrDelta() != 0 ||
+                                    TailCall.getOperand(1).getImm() != 0;
+  return !NeedsStackAdjustment;
+}
+
+/// Replace the conditional branch in \p MBB whose condition equals
+/// \p BranchCond[0] with a conditional tail call (TCRETURNdicc or
+/// TCRETURNdi64cc) built from \p TailCall.
+///
+/// The caller must already have established that the transformation is
+/// legal; this is asserted through canMakeTailCallConditional(). The block is
+/// scanned from the bottom, skipping debug values and branches with a
+/// different condition. Running into a non-branch instruction before the
+/// matching branch is found is a violated invariant.
+void X86InstrInfo::replaceBranchWithTailCall(
+    MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
+    const MachineInstr &TailCall) const {
+  assert(canMakeTailCallConditional(BranchCond, TailCall));
+
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    // Use llvm_unreachable rather than assert(0): the latter compiles away
+    // in release builds, which would let the scan silently walk past the
+    // terminators and later erase an unrelated instruction.
+    if (!I->isBranch())
+      llvm_unreachable("Can't find the branch to replace!");
+
+    X86::CondCode CC = getCondFromBranchOpc(I->getOpcode());
+    assert(BranchCond.size() == 1);
+    if (CC != BranchCond[0].getImm())
+      continue;
+
+    break;
+  }
+
+  unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
+                                                         : X86::TCRETURNdi64cc;
+
+  // Build the conditional tail call in front of the branch it replaces.
+  auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
+  MIB->addOperand(TailCall.getOperand(0)); // Destination.
+  MIB.addImm(0);                           // Stack offset (not used).
+  MIB->addOperand(BranchCond[0]);          // Condition.
+  MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
+
+  I->eraseFromParent();
+}
+
+// Given a MBB and its TBB, recover the FBB that used to be the fallthrough
+// block (layout changes may mean it no longer physically follows MBB).
+//
+// Only non-EHPad successors are considered. If exactly one of them differs
+// from TBB, that block is returned. If none does, TBB is both the branch
+// target and the fallthrough and is returned itself. If more than one does,
+// the fallthrough cannot be identified and nullptr is returned.
+static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
+                                            MachineBasicBlock *TBB) {
+  MachineBasicBlock *Candidate = nullptr;
+  for (MachineBasicBlock *Succ : MBB->successors()) {
+    // Exception-handling pads are never the fallthrough.
+    if (Succ->isEHPad())
+      continue;
+    // TBB only counts while no other candidate has been recorded.
+    if (Succ == TBB && Candidate)
+      continue;
+    // A second successor other than TBB makes the answer ambiguous.
+    if (Candidate && Candidate != TBB)
+      return nullptr;
+    Candidate = Succ;
+  }
+  return Candidate;
+}
+
+/// Walk the terminators of \p MBB from the bottom up and describe its
+/// branching as (\p TBB, \p FBB, \p Cond).
+///
+/// Returns true when the terminators cannot be analyzed (a terminator that is
+/// not a branch, a branch whose opcode maps to COND_INVALID, or a sequence of
+/// conditional branches that is not one of the recognized idioms) and false
+/// on success. Every conditional branch that contributed to \p Cond is
+/// appended to \p CondBranches.
+///
+/// When \p AllowModify is set the block may be edited: instructions following
+/// a JMP are erased, a JMP to the layout successor is removed, and a
+/// "jCC L1; jmp L2" pair with L1 as the layout successor is rewritten using
+/// the inverted condition.
+bool X86InstrInfo::AnalyzeBranchImpl(
+ MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
+
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ // Remembers the most recently seen unconditional JMP, or MBB.end() if none.
+ MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator instruction, we're
+ // done.
+ if (!isUnpredicatedTerminator(*I))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled by this
+ // analysis.
+ if (!I->isBranch())
+ return true;
+
+ // Handle unconditional branches.
+ if (I->getOpcode() == X86::JMP_1) {
+ UnCondBrIter = I;
+
+ if (!AllowModify) {
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (std::next(I) != MBB.end())
+ std::next(I)->eraseFromParent();
+
+ // Anything recorded from the instructions just erased is now stale.
+ Cond.clear();
+ FBB = nullptr;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ TBB = nullptr;
+ I->eraseFromParent();
+ I = MBB.end();
+ UnCondBrIter = MBB.end();
+ continue;
+ }
+
+ // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches.
+ X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
+ if (BranchCode == X86::COND_INVALID)
+ return true; // Can't handle indirect branch.
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+ if (AllowModify && UnCondBrIter != MBB.end() &&
+ MBB.isLayoutSuccessor(TargetBB)) {
+ // If we can modify the code and it ends in something like:
+ //
+ // jCC L1
+ // jmp L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Then we can change this to:
+ //
+ // jnCC L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Which is a bit more efficient.
+ // We conditionally jump to the fall-through block.
+ BranchCode = GetOppositeBranchCondition(BranchCode);
+ unsigned JNCC = GetCondBranchFromCond(BranchCode);
+ MachineBasicBlock::iterator OldInst = I;
+
+ // Insert the inverted branch to the old JMP target followed by a JMP to
+ // the old conditional target, then drop the two original branches.
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+ .addMBB(UnCondBrIter->getOperand(0).getMBB());
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
+ .addMBB(TargetBB);
+
+ OldInst->eraseFromParent();
+ UnCondBrIter->eraseFromParent();
+
+ // Restart the analysis.
+ UnCondBrIter = MBB.end();
+ I = MBB.end();
+ continue;
+ }
+
+ // Whatever TBB held so far (set by a JMP below this branch, if any)
+ // becomes the false destination; this branch supplies the true one.
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ CondBranches.push_back(&*I);
+ continue;
+ }
+
+ // Handle subsequent conditional branches. Only handle the case where all
+ // conditional branches branch to the same destination and their condition
+ // opcodes fit one of the special multi-branch idioms.
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ // If the conditions are the same, we can leave them alone.
+ X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+ auto NewTBB = I->getOperand(0).getMBB();
+ if (OldBranchCode == BranchCode && TBB == NewTBB)
+ continue;
+
+ // If they differ, see if they fit one of the known patterns. Theoretically,
+ // we could handle more patterns here, but we shouldn't expect to see them
+ // if instruction selection has done a reasonable job.
+ if (TBB == NewTBB &&
+ ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
+ BranchCode = X86::COND_NE_OR_P;
+ } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
+ // The upper branch must target the false destination (the explicit FBB,
+ // or the recovered fallthrough block when FBB is null).
+ if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
+ return true;
+
+ // X86::COND_E_AND_NP usually has two different branch destinations.
+ //
+ // JP B1
+ // JE B2
+ // JMP B1
+ // B1:
+ // B2:
+ //
+ // Here this condition branches to B2 only if NP && E. It has another
+ // equivalent form:
+ //
+ // JNE B1
+ // JNP B2
+ // JMP B1
+ // B1:
+ // B2:
+ //
+ // Similarly it branches to B2 only if E && NP. That is why this condition
+ // is named with COND_E_AND_NP.
+ BranchCode = X86::COND_E_AND_NP;
+ } else
+ return true;
+
+ // Update the MachineOperand.
+ Cond[0].setImm(BranchCode);
+ CondBranches.push_back(&*I);
+ }
+
+ return false;
+}
+
+/// Analyze the branching at the end of \p MBB by forwarding to
+/// AnalyzeBranchImpl(). The list of conditional branch instructions that the
+/// implementation collects is not needed here and is discarded.
+bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  SmallVector<MachineInstr *, 4> UnusedCondBranches;
+  const bool CannotAnalyze = AnalyzeBranchImpl(MBB, TBB, FBB, Cond,
+                                               UnusedCondBranches, AllowModify);
+  return CannotAnalyze;
+}
+
+/// Try to describe the conditional branch that ends \p MBB as a comparison of
+/// a register against zero, recorded in \p MBP.
+///
+/// Returns false only when the block ends in a single-condition branch whose
+/// EFLAGS definition is a TEST64rr (on 64-bit subtargets) or TEST32rr of a
+/// register with itself and whose condition is COND_E or COND_NE. In that
+/// case MBP.LHS, MBP.RHS and MBP.Predicate are filled in. Returns true in
+/// every other case; note that the destination, ConditionDef and
+/// SingleUseCondition fields of \p MBP may already have been written by then.
+///
+/// \p AllowModify is forwarded to AnalyzeBranchImpl().
+bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
+                                          MachineBranchPredicate &MBP,
+                                          bool AllowModify) const {
+  SmallVector<MachineOperand, 4> Cond;
+  SmallVector<MachineInstr *, 4> CondBranches;
+  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
+                        AllowModify))
+    return true;
+
+  // Only a block ending in a conditional branch is of interest.
+  if (Cond.size() != 1)
+    return true;
+
+  assert(MBP.TrueDest && "expected!");
+
+  // A null false destination means the branch falls through to the next
+  // block in layout order.
+  if (!MBP.FalseDest)
+    MBP.FalseDest = MBB.getNextNode();
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  MachineInstr *ConditionDef = nullptr;
+  bool SingleUseCondition = true;
+
+  // Scan upwards, starting just above the last instruction, for the
+  // instruction that writes EFLAGS. Any EFLAGS reader met on the way means
+  // the flags have more than one user.
+  for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
+    if (I->modifiesRegister(X86::EFLAGS, TRI)) {
+      ConditionDef = &*I;
+      break;
+    }
+
+    if (I->readsRegister(X86::EFLAGS, TRI))
+      SingleUseCondition = false;
+  }
+
+  if (!ConditionDef)
+    return true;
+
+  // The flags are also used elsewhere if a successor has them live-in.
+  if (SingleUseCondition) {
+    for (auto *Succ : MBB.successors())
+      if (Succ->isLiveIn(X86::EFLAGS))
+        SingleUseCondition = false;
+  }
+
+  MBP.ConditionDef = ConditionDef;
+  MBP.SingleUseCondition = SingleUseCondition;
+
+  // Currently we only recognize the simple pattern:
+  //
+  //   test %reg, %reg
+  //   je %label
+  //
+  const unsigned TestOpcode =
+      Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
+
+  if (ConditionDef->getOpcode() == TestOpcode &&
+      ConditionDef->getNumOperands() == 3 &&
+      ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
+      (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
+    MBP.LHS = ConditionDef->getOperand(0);
+    MBP.RHS = MachineOperand::CreateImm(0);
+    MBP.Predicate = Cond[0].getImm() == X86::COND_NE
+                        ? MachineBranchPredicate::PRED_NE
+                        : MachineBranchPredicate::PRED_EQ;
+    return false;
+  }
+
+  return true;
+}
+
+/// Erase the JMP_1 and Jcc_1 instructions at the end of \p MBB and return how
+/// many were removed. Debug values are skipped; the scan stops at the first
+/// other instruction that is not such a branch. \p BytesRemoved must be
+/// null: code-size reporting is not implemented.
+unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                    int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  unsigned NumRemoved = 0;
+  for (MachineBasicBlock::iterator I = MBB.end(); I != MBB.begin();) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    const bool IsDirectBranch =
+        I->getOpcode() == X86::JMP_1 ||
+        getCondFromBranchOpc(I->getOpcode()) != X86::COND_INVALID;
+    if (!IsDirectBranch)
+      break;
+
+    // Remove the branch and restart from the (new) end of the block.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++NumRemoved;
+  }
+
+  return NumRemoved;
+}
+
+/// Append branch instructions to the end of \p MBB and return how many were
+/// inserted.
+///
+/// With an empty \p Cond a single JMP_1 to \p TBB is emitted. Otherwise the
+/// condition in \p Cond[0] is emitted as a branch to \p TBB (the composite
+/// codes COND_NE_OR_P and COND_E_AND_NP take two branches each), followed by
+/// a JMP_1 to \p FBB when the caller passed a non-null \p FBB. \p BytesAdded
+/// must be null: code-size reporting is not implemented.
+unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    ArrayRef<MachineOperand> Cond,
+                                    const DebugLoc &DL,
+                                    int *BytesAdded) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "X86 branch conditions have one component!");
+  assert(!BytesAdded && "code size not handled");
+
+  unsigned NumInserted = 0;
+  // Append one branch with the given opcode and destination, and count it.
+  auto EmitBranch = [&](unsigned Opcode, MachineBasicBlock *Dest) {
+    BuildMI(&MBB, DL, get(Opcode)).addMBB(Dest);
+    ++NumInserted;
+  };
+
+  // Unconditional branch?
+  if (Cond.empty()) {
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    EmitBranch(X86::JMP_1, TBB);
+    return NumInserted;
+  }
+
+  // If FBB is null, it is implied to be a fall-through block. Record this
+  // before the COND_E_AND_NP case below possibly fills FBB in.
+  const bool HasExplicitFalseDest = FBB != nullptr;
+
+  // Conditional branch.
+  const X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+  if (CC == X86::COND_NE_OR_P) {
+    // Synthesize NE_OR_P with two branches.
+    EmitBranch(X86::JNE_1, TBB);
+    EmitBranch(X86::JP_1, TBB);
+  } else if (CC == X86::COND_E_AND_NP) {
+    // Use the next block of MBB as FBB if it is null.
+    if (FBB == nullptr) {
+      FBB = getFallThroughMBB(&MBB, TBB);
+      assert(FBB && "MBB cannot be the last block in function when the false "
+                    "body is a fall-through.");
+    }
+    // Synthesize COND_E_AND_NP with two branches.
+    EmitBranch(X86::JNE_1, FBB);
+    EmitBranch(X86::JNP_1, TBB);
+  } else {
+    EmitBranch(GetCondBranchFromCond(CC), TBB);
+  }
+
+  // Two-way conditional branch. Insert the second branch.
+  if (HasExplicitFalseDest)
+    EmitBranch(X86::JMP_1, FBB);
+
+  return NumInserted;
+}
+
+/// Report whether a select between \p TrueReg and \p FalseReg under \p Cond
+/// can be lowered to a CMOV, and if so fill in the three cycle estimates.
+///
+/// This requires a subtarget with CMOV, a single non-composite condition, and
+/// a common register sub-class of the two inputs that lies within GR16, GR32
+/// or GR64. The cycle outputs are left untouched when false is returned.
+bool X86InstrInfo::
+canInsertSelect(const MachineBasicBlock &MBB,
+                ArrayRef<MachineOperand> Cond,
+                unsigned TrueReg, unsigned FalseReg,
+                int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+  // Not all subtargets have cmov instructions.
+  if (!Subtarget.hasCMov() || Cond.size() != 1)
+    return false;
+
+  // We cannot do the composite conditions, at least not in SSA form.
+  const X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+  if (CC > X86::COND_S)
+    return false;
+
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *CommonRC =
+      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!CommonRC)
+    return false;
+
+  // We have cmov instructions for 16, 32, and 64 bit general purpose
+  // registers. Can't do vectors.
+  const bool IsGeneralPurpose = X86::GR16RegClass.hasSubClassEq(CommonRC) ||
+                                X86::GR32RegClass.hasSubClassEq(CommonRC) ||
+                                X86::GR64RegClass.hasSubClassEq(CommonRC);
+  if (!IsGeneralPurpose)
+    return false;
+
+  // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+  // Bridge. Probably Ivy Bridge as well.
+  CondCycles = 2;
+  TrueCycles = 2;
+  FalseCycles = 2;
+  return true;
+}
+
+/// Insert, before \p I in \p MBB, a register-register CMOV that writes
+/// \p DstReg. The opcode is chosen from the condition in \p Cond[0] and the
+/// size of \p DstReg's register class; \p FalseReg is passed as the first
+/// source operand and \p TrueReg as the second.
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I,
+                                const DebugLoc &DL, unsigned DstReg,
+                                ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+                                unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  assert(Cond.size() == 1 && "Invalid Cond array");
+
+  const X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+  const unsigned RegBytes = MRI.getRegClass(DstReg)->getSize();
+  const unsigned CMovOpc =
+      getCMovFromCond(CC, RegBytes, false /*HasMemoryOperand*/);
+
+  BuildMI(MBB, I, DL, get(CMovOpc), DstReg)
+      .addReg(FalseReg)
+      .addReg(TrueReg);
+}
+
+/// Return true if \p Reg is a member of the GR8_ABCD_H register class, i.e.
+/// a physical h register.
+static bool isHReg(unsigned Reg) {
+  const bool IsHighByteReg = X86::GR8_ABCD_HRegClass.contains(Reg);
+  return IsHighByteReg;
+}
+
+// Select the opcode for a copy between registers of different kinds:
+// mask <-> general purpose, VR128X/VR64 <-> GR64 and FR32X <-> GR32.
+// Returns 0 when no such opcode applies.
+//
+// DestReg and SrcReg are in-out parameters: for a mask copy involving a GR16
+// or GR8 register, that register is replaced by the result of
+// getX86SubSuperRegister(Reg, 32) before the opcode is returned.
+static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
+                                        const X86Subtarget &Subtarget) {
+  const bool HasAVX = Subtarget.hasAVX();
+  const bool HasAVX512 = Subtarget.hasAVX512();
+
+  // Choose between the EVEX, VEX and legacy SSE encodings of a vector move.
+  auto PickVectorOpc = [&](unsigned EVEXOpc, unsigned VEXOpc,
+                           unsigned SSEOpc) -> unsigned {
+    if (HasAVX512)
+      return EVEXOpc;
+    if (HasAVX)
+      return VEXOpc;
+    return SSEOpc;
+  };
+
+  // Mask register source, general purpose destination (GR64/GR32/GR16/GR8).
+  // All KMASK RegClasses hold the same k registers, can be tested against
+  // anyone.
+  if (X86::VK16RegClass.contains(SrcReg)) {
+    if (X86::GR64RegClass.contains(DestReg)) {
+      assert(Subtarget.hasBWI());
+      return X86::KMOVQrk;
+    }
+    if (X86::GR32RegClass.contains(DestReg)) {
+      if (Subtarget.hasBWI())
+        return X86::KMOVDrk;
+      return X86::KMOVWrk;
+    }
+    if (X86::GR16RegClass.contains(DestReg)) {
+      DestReg = getX86SubSuperRegister(DestReg, 32);
+      return X86::KMOVWrk;
+    }
+    if (X86::GR8RegClass.contains(DestReg)) {
+      DestReg = getX86SubSuperRegister(DestReg, 32);
+      if (Subtarget.hasDQI())
+        return X86::KMOVBrk;
+      return X86::KMOVWrk;
+    }
+  }
+
+  // General purpose source (GR64/GR32/GR16/GR8), mask register destination.
+  // All KMASK RegClasses hold the same k registers, can be tested against
+  // anyone.
+  if (X86::VK16RegClass.contains(DestReg)) {
+    if (X86::GR64RegClass.contains(SrcReg)) {
+      assert(Subtarget.hasBWI());
+      return X86::KMOVQkr;
+    }
+    if (X86::GR32RegClass.contains(SrcReg)) {
+      if (Subtarget.hasBWI())
+        return X86::KMOVDkr;
+      return X86::KMOVWkr;
+    }
+    if (X86::GR16RegClass.contains(SrcReg)) {
+      SrcReg = getX86SubSuperRegister(SrcReg, 32);
+      return X86::KMOVWkr;
+    }
+    if (X86::GR8RegClass.contains(SrcReg)) {
+      SrcReg = getX86SubSuperRegister(SrcReg, 32);
+      if (Subtarget.hasDQI())
+        return X86::KMOVBkr;
+      return X86::KMOVWkr;
+    }
+  }
+
+  // VR128X/VR64 <-> GR64. The source side is only examined when the
+  // destination is not itself a GR64 register.
+  if (X86::GR64RegClass.contains(DestReg)) {
+    // Copy from a VR128 register to a GR64 register.
+    if (X86::VR128XRegClass.contains(SrcReg))
+      return PickVectorOpc(X86::VMOVPQIto64Zrr, X86::VMOVPQIto64rr,
+                           X86::MOVPQIto64rr);
+    // Copy from a VR64 register to a GR64 register.
+    if (X86::VR64RegClass.contains(SrcReg))
+      return X86::MMX_MOVD64from64rr;
+  } else if (X86::GR64RegClass.contains(SrcReg)) {
+    // Copy from a GR64 register to a VR128 register.
+    if (X86::VR128XRegClass.contains(DestReg))
+      return PickVectorOpc(X86::VMOV64toPQIZrr, X86::VMOV64toPQIrr,
+                           X86::MOV64toPQIrr);
+    // Copy from a GR64 register to a VR64 register.
+    if (X86::VR64RegClass.contains(DestReg))
+      return X86::MMX_MOVD64to64rr;
+  }
+
+  // Copy from a FR32 register to a GR32 register.
+  if (X86::GR32RegClass.contains(DestReg) &&
+      X86::FR32XRegClass.contains(SrcReg))
+    return PickVectorOpc(X86::VMOVSS2DIZrr, X86::VMOVSS2DIrr,
+                         X86::MOVSS2DIrr);
+
+  // Copy from a GR32 register to a FR32 register.
+  if (X86::FR32XRegClass.contains(DestReg) &&
+      X86::GR32RegClass.contains(SrcReg))
+    return PickVectorOpc(X86::VMOVDI2SSZrr, X86::VMOVDI2SSrr,
+                         X86::MOVDI2SSrr);
+
+  return 0;
+}
+
+void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ // First deal with the normal symmetric copies.
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasVLX = Subtarget.hasVLX();
+ unsigned Opc = 0;
+ if (X86::GR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV64rr;
+ else if (X86::GR32RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV32rr;
+ else if (X86::GR16RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV16rr;
+ else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+ Subtarget.is64Bit()) {
+ Opc = X86::MOV8rr_NOREX;
+ // Both operands must be encodable without an REX prefix.
+ assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+ "8-bit H register can not be copied outside GR8_NOREX");
+ } else
+ Opc = X86::MOV8rr;
+ }
+ else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MMX_MOVQ64rr;
+ else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
+ if (HasVLX)
+ Opc = X86::VMOVAPSZ128rr;
+ else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+ Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+ else {
+ // If this is an extended register and we don't have VLX we need to use a
+ // 512-bit move.
+ Opc = X86::VMOVAPSZrr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
+ &X86::VR512RegClass);
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
+ &X86::VR512RegClass);
+ }
+ } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
+ if (HasVLX)
+ Opc = X86::VMOVAPSZ256rr;
+ else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSYrr;
+ else {
+ // If this is an extended register and we don't have VLX we need to use a
+ // 512-bit move.
+ Opc = X86::VMOVAPSZrr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
+ &X86::VR512RegClass);
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
+ &X86::VR512RegClass);
+ }
+ } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSZrr;
+ // All KMASK RegClasses hold the same k registers, can be tested against anyone.
+ else if (X86::VK16RegClass.contains(DestReg, SrcReg))
+ Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
+ if (!Opc)
+ Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
+
+ if (Opc) {
+ BuildMI(MBB, MI, DL, get(Opc), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ bool FromEFLAGS = SrcReg == X86::EFLAGS;
+ bool ToEFLAGS = DestReg == X86::EFLAGS;
+ int Reg = FromEFLAGS ? DestReg : SrcReg;
+ bool is32 = X86::GR32RegClass.contains(Reg);
+ bool is64 = X86::GR64RegClass.contains(Reg);
+
+ if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+ int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+ int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+ int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
+ int Pop = is64 ? X86::POP64r : X86::POP32r;
+ int PopF = is64 ? X86::POPF64 : X86::POPF32;
+ int AX = is64 ? X86::RAX : X86::EAX;
+
+ if (!Subtarget.hasLAHFSAHF()) {
+ assert(Subtarget.is64Bit() &&
+ "Not having LAHF/SAHF only happens on 64-bit.");
+ // Moving EFLAGS to / from another register requires a push and a pop.
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index. See X86FrameLowering.cpp - usesTheStack.
+ if (FromEFLAGS) {
+ BuildMI(MBB, MI, DL, get(PushF));
+ BuildMI(MBB, MI, DL, get(Pop), DestReg);
+ }
+ if (ToEFLAGS) {
+ BuildMI(MBB, MI, DL, get(Push))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(PopF));
+ }
+ return;
+ }
+
+ // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+ // inefficient. Instead:
+ // - Save the overflow flag OF into AL using SETO, and restore it using a
+ // signed 8-bit addition of AL and INT8_MAX.
+ // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+ // using LAHF/SAHF.
+ // - When RAX/EAX is live and isn't the destination register, make sure it
+ // isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+ // the flags.
+ // This approach is ~2.25x faster than using PUSHF/POPF.
+ //
+ // This is still somewhat inefficient because we don't know which flags are
+ // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+ // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
+ //
+ // PUSHF/POPF is also potentially incorrect because it affects other flags
+ // such as TF/IF/DF, which LLVM doesn't model.
+ //
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index.
+ // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineBasicBlock::LivenessQueryResult LQR =
+ MBB.computeRegisterLiveness(TRI, AX, MI);
+ // We do not want to save and restore AX if we do not have to.
+ // Moreover, if we do so whereas AX is dead, we would need to set
+ // an undef flag on the use of AX, otherwise the verifier will
+ // complain that we read an undef value.
+ // We do not want to change the behavior of the machine verifier
+ // as this is usually wrong to read an undef value.
+ if (MachineBasicBlock::LQR_Unknown == LQR) {
+ LivePhysRegs LPR(TRI);
+ LPR.addLiveOuts(MBB);
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MI) {
+ --I;
+ LPR.stepBackward(*I);
+ }
+ // AX contains the top most register in the aliasing hierarchy.
+ // It may not be live, but one of its aliases may be.
+ for (MCRegAliasIterator AI(AX, TRI, true);
+ AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
+ LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
+ : MachineBasicBlock::LQR_Dead;
+ }
+ bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR);
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+ if (FromEFLAGS) {
+ BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+ BuildMI(MBB, MI, DL, get(X86::LAHF));
+ BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
+ }
+ if (ToEFLAGS) {
+ BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+ .addReg(X86::AL)
+ .addImm(INT8_MAX);
+ BuildMI(MBB, MI, DL, get(X86::SAHF));
+ }
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Pop), AX);
+ return;
+ }
+
+ DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
+ << " to " << RI.getName(DestReg) << '\n');
+ llvm_unreachable("Cannot emit physreg copy instruction");
+}
+
+ /// Pick the opcode that spills a register of class \p RC to memory or reloads
+ /// it from memory. The choice is keyed on the byte size of \p RC.
+ ///
+ /// \param Reg            register being moved; only consulted for 1-byte
+ ///                       classes, to recognise the H registers.
+ /// \param RC             register class of the value.
+ /// \param isStackAligned selects the aligned (MOVAPS-style) instead of the
+ ///                       unaligned (MOVUPS-style) vector moves.
+ /// \param STI            subtarget queried for 64-bit mode, AVX, AVX512, VLX.
+ /// \param load           true to get the load opcode, false for the store.
+ static unsigned getLoadStoreRegOpcode(unsigned Reg,
+                                       const TargetRegisterClass *RC,
+                                       bool isStackAligned,
+                                       const X86Subtarget &STI,
+                                       bool load) {
+   const bool HasAVX = STI.hasAVX();
+   const bool HasAVX512 = STI.hasAVX512();
+   const bool HasVLX = STI.hasVLX();
+
+   // Choose between the load and the store member of an opcode pair.
+   auto Pick = [load](unsigned LoadOpc, unsigned StoreOpc) {
+     return load ? LoadOpc : StoreOpc;
+   };
+
+   switch (RC->getSize()) {
+   case 1: {
+     assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
+     // On x86-64 a physical H register (or a class restricted to them) must
+     // be moved with the NOREX variant; everything else uses a plain move.
+     const bool NeedsNoREX =
+         STI.is64Bit() &&
+         (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC));
+     if (NeedsNoREX)
+       return Pick(X86::MOV8rm_NOREX, X86::MOV8mr_NOREX);
+     return Pick(X86::MOV8rm, X86::MOV8mr);
+   }
+   case 2:
+     if (X86::VK16RegClass.hasSubClassEq(RC))
+       return Pick(X86::KMOVWkm, X86::KMOVWmk);
+     assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+     return Pick(X86::MOV16rm, X86::MOV16mr);
+   case 4:
+     if (X86::GR32RegClass.hasSubClassEq(RC))
+       return Pick(X86::MOV32rm, X86::MOV32mr);
+     if (X86::FR32XRegClass.hasSubClassEq(RC)) {
+       if (HasAVX512)
+         return Pick(X86::VMOVSSZrm, X86::VMOVSSZmr);
+       if (HasAVX)
+         return Pick(X86::VMOVSSrm, X86::VMOVSSmr);
+       return Pick(X86::MOVSSrm, X86::MOVSSmr);
+     }
+     if (X86::RFP32RegClass.hasSubClassEq(RC))
+       return Pick(X86::LD_Fp32m, X86::ST_Fp32m);
+     if (X86::VK32RegClass.hasSubClassEq(RC))
+       return Pick(X86::KMOVDkm, X86::KMOVDmk);
+     llvm_unreachable("Unknown 4-byte regclass");
+   case 8:
+     if (X86::GR64RegClass.hasSubClassEq(RC))
+       return Pick(X86::MOV64rm, X86::MOV64mr);
+     if (X86::FR64XRegClass.hasSubClassEq(RC)) {
+       if (HasAVX512)
+         return Pick(X86::VMOVSDZrm, X86::VMOVSDZmr);
+       if (HasAVX)
+         return Pick(X86::VMOVSDrm, X86::VMOVSDmr);
+       return Pick(X86::MOVSDrm, X86::MOVSDmr);
+     }
+     if (X86::VR64RegClass.hasSubClassEq(RC))
+       return Pick(X86::MMX_MOVQ64rm, X86::MMX_MOVQ64mr);
+     if (X86::RFP64RegClass.hasSubClassEq(RC))
+       return Pick(X86::LD_Fp64m, X86::ST_Fp64m);
+     if (X86::VK64RegClass.hasSubClassEq(RC))
+       return Pick(X86::KMOVQkm, X86::KMOVQmk);
+     llvm_unreachable("Unknown 8-byte regclass");
+   case 10:
+     assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
+     return Pick(X86::LD_Fp80m, X86::ST_FpP80m);
+   case 16:
+     assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
+     // Aligned moves are only usable when the caller vouches for alignment.
+     if (isStackAligned) {
+       if (HasVLX)
+         return Pick(X86::VMOVAPSZ128rm, X86::VMOVAPSZ128mr);
+       if (HasAVX512)
+         return Pick(X86::VMOVAPSZ128rm_NOVLX, X86::VMOVAPSZ128mr_NOVLX);
+       if (HasAVX)
+         return Pick(X86::VMOVAPSrm, X86::VMOVAPSmr);
+       return Pick(X86::MOVAPSrm, X86::MOVAPSmr);
+     }
+     if (HasVLX)
+       return Pick(X86::VMOVUPSZ128rm, X86::VMOVUPSZ128mr);
+     if (HasAVX512)
+       return Pick(X86::VMOVUPSZ128rm_NOVLX, X86::VMOVUPSZ128mr_NOVLX);
+     if (HasAVX)
+       return Pick(X86::VMOVUPSrm, X86::VMOVUPSmr);
+     return Pick(X86::MOVUPSrm, X86::MOVUPSmr);
+   case 32:
+     assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
+     // Aligned moves are only usable when the caller vouches for alignment.
+     if (isStackAligned) {
+       if (HasVLX)
+         return Pick(X86::VMOVAPSZ256rm, X86::VMOVAPSZ256mr);
+       if (HasAVX512)
+         return Pick(X86::VMOVAPSZ256rm_NOVLX, X86::VMOVAPSZ256mr_NOVLX);
+       return Pick(X86::VMOVAPSYrm, X86::VMOVAPSYmr);
+     }
+     if (HasVLX)
+       return Pick(X86::VMOVUPSZ256rm, X86::VMOVUPSZ256mr);
+     if (HasAVX512)
+       return Pick(X86::VMOVUPSZ256rm_NOVLX, X86::VMOVUPSZ256mr_NOVLX);
+     return Pick(X86::VMOVUPSYrm, X86::VMOVUPSYmr);
+   case 64:
+     assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
+     assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
+     if (isStackAligned)
+       return Pick(X86::VMOVAPSZrm, X86::VMOVAPSZmr);
+     return Pick(X86::VMOVUPSZrm, X86::VMOVUPSZmr);
+   default:
+     llvm_unreachable("Unknown spill size");
+   }
+ }
+
+ /// Decompose the memory reference of \p MemOp into a base register plus an
+ /// immediate displacement.
+ ///
+ /// Succeeds only for the plain "base + disp" shape: the base operand is a
+ /// register, the scale is 1, there is no index register and the displacement
+ /// is an immediate. On success \p BaseReg and \p Offset are filled in and
+ /// true is returned. Note that \p BaseReg is written as soon as the base
+ /// operand is known to be a register, even if a later check fails.
+ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
+                                          int64_t &Offset,
+                                          const TargetRegisterInfo *TRI) const {
+   const MCInstrDesc &MCID = MemOp.getDesc();
+   const int MemOpNo = X86II::getMemoryOperandNo(MCID.TSFlags);
+   if (MemOpNo < 0)
+     return false; // No memory operand at all.
+
+   // Index of the first address operand inside MemOp's operand list.
+   const int AddrStart = MemOpNo + X86II::getOperandBias(MCID);
+
+   MachineOperand &Base = MemOp.getOperand(AddrStart + X86::AddrBaseReg);
+   if (!Base.isReg()) // Can be an MO_FrameIndex
+     return false;
+   BaseReg = Base.getReg();
+
+   // Reject scaled or indexed addressing.
+   if (MemOp.getOperand(AddrStart + X86::AddrScaleAmt).getImm() != 1 ||
+       MemOp.getOperand(AddrStart + X86::AddrIndexReg).getReg() !=
+           X86::NoRegister)
+     return false;
+
+   // The displacement may be symbolic, in which case there is no offset.
+   const MachineOperand &Disp = MemOp.getOperand(AddrStart + X86::AddrDisp);
+   if (!Disp.isImm())
+     return false;
+
+   Offset = Disp.getImm();
+   return true;
+ }
+
+ /// Store-side convenience wrapper around getLoadStoreRegOpcode: returns the
+ /// opcode that writes \p SrcReg (of class \p RC) to memory.
+ static unsigned getStoreRegOpcode(unsigned SrcReg,
+                                   const TargetRegisterClass *RC,
+                                   bool isStackAligned,
+                                   const X86Subtarget &STI) {
+   const bool IsLoad = false;
+   return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, IsLoad);
+ }
+
+
+ /// Load-side convenience wrapper around getLoadStoreRegOpcode: returns the
+ /// opcode that reads \p DestReg (of class \p RC) from memory.
+ static unsigned getLoadRegOpcode(unsigned DestReg,
+                                  const TargetRegisterClass *RC,
+                                  bool isStackAligned,
+                                  const X86Subtarget &STI) {
+   const bool IsLoad = true;
+   return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, IsLoad);
+ }
+
+ /// Emit, before \p MI in \p MBB, a store of \p SrcReg into stack slot
+ /// \p FrameIdx. The aligned store opcode is chosen when the stack alignment
+ /// already satisfies max(size of \p RC, 16) or the stack can be realigned.
+ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned SrcReg, bool isKill, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+   const MachineFunction &MF = *MBB.getParent();
+   assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() &&
+          "Stack slot too small for store");
+
+   const unsigned WantedAlign = std::max<uint32_t>(RC->getSize(), 16);
+   const bool SlotIsAligned =
+       Subtarget.getFrameLowering()->getStackAlignment() >= WantedAlign ||
+       RI.canRealignStack(MF);
+   const unsigned StoreOpc =
+       getStoreRegOpcode(SrcReg, RC, SlotIsAligned, Subtarget);
+
+   DebugLoc DL = MBB.findDebugLoc(MI);
+   MachineInstrBuilder Store = BuildMI(MBB, MI, DL, get(StoreOpc));
+   // Address operands first, then the value being stored.
+   addFrameReference(Store, FrameIdx).addReg(SrcReg, getKillRegState(isKill));
+ }
+
+ /// Build (without inserting into a block) a store of \p SrcReg to the address
+ /// described by the operands in \p Addr and append it to \p NewMIs. The
+ /// aligned opcode is used only when the first memory operand in
+ /// [MMOBegin, MMOEnd) reports an alignment of at least max(size of RC, 16).
+ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+                                   bool isKill,
+                                   SmallVectorImpl<MachineOperand> &Addr,
+                                   const TargetRegisterClass *RC,
+                                   MachineInstr::mmo_iterator MMOBegin,
+                                   MachineInstr::mmo_iterator MMOEnd,
+                                   SmallVectorImpl<MachineInstr*> &NewMIs) const {
+   const unsigned WantedAlign = std::max<uint32_t>(RC->getSize(), 16);
+   const bool KnownAligned =
+       MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= WantedAlign;
+   const unsigned StoreOpc =
+       getStoreRegOpcode(SrcReg, RC, KnownAligned, Subtarget);
+
+   DebugLoc DL;
+   MachineInstrBuilder Store = BuildMI(MF, DL, get(StoreOpc));
+   // Address operands come first, followed by the stored register.
+   for (const MachineOperand &AddrOp : Addr)
+     Store.addOperand(AddrOp);
+   Store.addReg(SrcReg, getKillRegState(isKill));
+   Store->setMemRefs(MMOBegin, MMOEnd);
+   NewMIs.push_back(Store);
+ }
+
+
+ /// Emit, before \p MI in \p MBB, a load of stack slot \p FrameIdx into
+ /// \p DestReg. The aligned load opcode is chosen when the stack alignment
+ /// already satisfies max(size of \p RC, 16) or the stack can be realigned.
+ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         unsigned DestReg, int FrameIdx,
+                                         const TargetRegisterClass *RC,
+                                         const TargetRegisterInfo *TRI) const {
+   const MachineFunction &MF = *MBB.getParent();
+
+   const unsigned WantedAlign = std::max<uint32_t>(RC->getSize(), 16);
+   const bool SlotIsAligned =
+       Subtarget.getFrameLowering()->getStackAlignment() >= WantedAlign ||
+       RI.canRealignStack(MF);
+   const unsigned LoadOpc =
+       getLoadRegOpcode(DestReg, RC, SlotIsAligned, Subtarget);
+
+   DebugLoc DL = MBB.findDebugLoc(MI);
+   MachineInstrBuilder Load = BuildMI(MBB, MI, DL, get(LoadOpc), DestReg);
+   addFrameReference(Load, FrameIdx);
+ }
+
+ /// Build (without inserting into a block) a load into \p DestReg from the
+ /// address described by the operands in \p Addr and append it to \p NewMIs.
+ /// The aligned opcode is used only when the first memory operand in
+ /// [MMOBegin, MMOEnd) reports an alignment of at least max(size of RC, 16).
+ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                                    SmallVectorImpl<MachineOperand> &Addr,
+                                    const TargetRegisterClass *RC,
+                                    MachineInstr::mmo_iterator MMOBegin,
+                                    MachineInstr::mmo_iterator MMOEnd,
+                                    SmallVectorImpl<MachineInstr*> &NewMIs) const {
+   const unsigned WantedAlign = std::max<uint32_t>(RC->getSize(), 16);
+   const bool KnownAligned =
+       MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= WantedAlign;
+   const unsigned LoadOpc =
+       getLoadRegOpcode(DestReg, RC, KnownAligned, Subtarget);
+
+   DebugLoc DL;
+   MachineInstrBuilder Load = BuildMI(MF, DL, get(LoadOpc), DestReg);
+   for (const MachineOperand &AddrOp : Addr)
+     Load.addOperand(AddrOp);
+   Load->setMemRefs(MMOBegin, MMOEnd);
+   NewMIs.push_back(Load);
+ }
+
+ /// Recognise \p MI as a comparison and describe it through the
+ /// out-parameters.
+ ///
+ /// Handles CMP (reg/imm and reg/reg), SUB (reg/mem, reg/reg, reg/imm) and
+ /// TEST of a register with itself. On success returns true with \p SrcReg set
+ /// to the first compared register, \p SrcReg2 to the second one (0 if there
+ /// is none), \p CmpMask to ~0 and \p CmpValue to the immediate (0 if there is
+ /// none). Returns false for anything else, including immediate forms whose
+ /// operand is not a plain immediate.
+ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                   unsigned &SrcReg2, int &CmpMask,
+                                   int &CmpValue) const {
+   // Publish a successful analysis through the out-parameters.
+   auto Report = [&](unsigned LHS, unsigned RHS, int Value) {
+     SrcReg = LHS;
+     SrcReg2 = RHS;
+     CmpMask = ~0;
+     CmpValue = Value;
+     return true;
+   };
+
+   switch (MI.getOpcode()) {
+   case X86::CMP64ri32:
+   case X86::CMP64ri8:
+   case X86::CMP32ri:
+   case X86::CMP32ri8:
+   case X86::CMP16ri:
+   case X86::CMP16ri8:
+   case X86::CMP8ri:
+     if (!MI.getOperand(1).isImm())
+       return false;
+     return Report(MI.getOperand(0).getReg(), 0, MI.getOperand(1).getImm());
+
+   // A SUB can be used to perform comparison.
+   case X86::SUB64rm:
+   case X86::SUB32rm:
+   case X86::SUB16rm:
+   case X86::SUB8rm:
+     return Report(MI.getOperand(1).getReg(), 0, 0);
+
+   case X86::SUB64rr:
+   case X86::SUB32rr:
+   case X86::SUB16rr:
+   case X86::SUB8rr:
+     return Report(MI.getOperand(1).getReg(), MI.getOperand(2).getReg(), 0);
+
+   case X86::SUB64ri32:
+   case X86::SUB64ri8:
+   case X86::SUB32ri:
+   case X86::SUB32ri8:
+   case X86::SUB16ri:
+   case X86::SUB16ri8:
+   case X86::SUB8ri:
+     if (!MI.getOperand(2).isImm())
+       return false;
+     return Report(MI.getOperand(1).getReg(), 0, MI.getOperand(2).getImm());
+
+   case X86::CMP64rr:
+   case X86::CMP32rr:
+   case X86::CMP16rr:
+   case X86::CMP8rr:
+     return Report(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), 0);
+
+   case X86::TEST8rr:
+   case X86::TEST16rr:
+   case X86::TEST32rr:
+   case X86::TEST64rr:
+     // Only "TEST r, r" is a compare against zero. SrcReg is written before
+     // the operands are known to match.
+     SrcReg = MI.getOperand(0).getReg();
+     if (MI.getOperand(1).getReg() != SrcReg)
+       return false;
+     return Report(SrcReg, 0, 0);
+
+   default:
+     return false;
+   }
+ }
+
+ /// Decide whether \p FlagI, an instruction that only updates flags, is made
+ /// redundant by the earlier instruction \p OI.
+ ///
+ /// A CMPrr is redundant after the SUBrr of the same width on the same two
+ /// registers (in either order); a CMPri is redundant after the matching SUBri
+ /// on the same register and immediate.
+ /// SrcReg, SrcReg2: register operands for FlagI.
+ /// ImmValue: immediate for FlagI if it takes an immediate.
+ inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
+                                         unsigned SrcReg2, int ImmValue,
+                                         MachineInstr &OI) {
+   // Map the compare to the one SUB opcode that could subsume it.
+   unsigned MatchingSub;
+   bool IsImmForm = false;
+   switch (FlagI.getOpcode()) {
+   default:
+     return false;
+   case X86::CMP64rr: MatchingSub = X86::SUB64rr; break;
+   case X86::CMP32rr: MatchingSub = X86::SUB32rr; break;
+   case X86::CMP16rr: MatchingSub = X86::SUB16rr; break;
+   case X86::CMP8rr:  MatchingSub = X86::SUB8rr;  break;
+   case X86::CMP64ri32: MatchingSub = X86::SUB64ri32; IsImmForm = true; break;
+   case X86::CMP64ri8:  MatchingSub = X86::SUB64ri8;  IsImmForm = true; break;
+   case X86::CMP32ri:   MatchingSub = X86::SUB32ri;   IsImmForm = true; break;
+   case X86::CMP32ri8:  MatchingSub = X86::SUB32ri8;  IsImmForm = true; break;
+   case X86::CMP16ri:   MatchingSub = X86::SUB16ri;   IsImmForm = true; break;
+   case X86::CMP16ri8:  MatchingSub = X86::SUB16ri8;  IsImmForm = true; break;
+   case X86::CMP8ri:    MatchingSub = X86::SUB8ri;    IsImmForm = true; break;
+   }
+
+   if (OI.getOpcode() != MatchingSub)
+     return false;
+
+   // Register/immediate form: same register and same immediate.
+   if (IsImmForm)
+     return OI.getOperand(1).getReg() == SrcReg &&
+            OI.getOperand(2).getImm() == ImmValue;
+
+   // Register/register form: same pair of registers, possibly swapped.
+   const unsigned SubLHS = OI.getOperand(1).getReg();
+   const unsigned SubRHS = OI.getOperand(2).getReg();
+   return (SubLHS == SrcReg && SubRHS == SrcReg2) ||
+          (SubLHS == SrcReg2 && SubRHS == SrcReg);
+ }
+
+ /// Return true if \p MI is one of the opcodes listed below. optimizeCompareInstr calls this on the
+ /// definition of the compared register to decide whether a comparison against zero can be removed.
+ inline static bool isDefConvertible(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default: return false;
+
+ // The shift instructions only modify ZF if their shift count is non-zero.
+ // N.B.: The processor truncates the shift count depending on the encoding.
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
+ return getTruncatedShiftCount(MI, 2) != 0;
+
+ // Some left shift instructions can be turned into LEA instructions but only
+ // if their flags aren't used. Avoid transforming such instructions.
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (isTruncatedShiftCountForLEA(ShAmt)) return false;
+ return ShAmt != 0;
+ }
+
+ case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
+ case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
+ return getTruncatedShiftCount(MI, 3) != 0;
+
+ case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
+ case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
+ case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
+ case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
+ case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
+ case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
+ case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
+ case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
+ case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
+ case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
+ case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
+ case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
+ case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+ case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
+ case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
+ case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
+ case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+ case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
+ case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
+ case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
+ case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
+ case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
+ case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
+ case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
+ case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
+ case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
+ case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
+ case X86::ADC32ri: case X86::ADC32ri8:
+ case X86::ADC32rr: case X86::ADC64ri32:
+ case X86::ADC64ri8: case X86::ADC64rr:
+ case X86::SBB32ri: case X86::SBB32ri8:
+ case X86::SBB32rr: case X86::SBB64ri32:
+ case X86::SBB64ri8: case X86::SBB64rr:
+ case X86::ANDN32rr: case X86::ANDN32rm:
+ case X86::ANDN64rr: case X86::ANDN64rm:
+ case X86::BEXTR32rr: case X86::BEXTR64rr:
+ case X86::BEXTR32rm: case X86::BEXTR64rm:
+ case X86::BLSI32rr: case X86::BLSI32rm:
+ case X86::BLSI64rr: case X86::BLSI64rm:
+ case X86::BLSMSK32rr:case X86::BLSMSK32rm:
+ case X86::BLSMSK64rr:case X86::BLSMSK64rm:
+ case X86::BLSR32rr: case X86::BLSR32rm:
+ case X86::BLSR64rr: case X86::BLSR64rm:
+ case X86::BZHI32rr: case X86::BZHI32rm:
+ case X86::BZHI64rr: case X86::BZHI64rm:
+ case X86::LZCNT16rr: case X86::LZCNT16rm:
+ case X86::LZCNT32rr: case X86::LZCNT32rm:
+ case X86::LZCNT64rr: case X86::LZCNT64rm:
+ case X86::POPCNT16rr:case X86::POPCNT16rm:
+ case X86::POPCNT32rr:case X86::POPCNT32rm:
+ case X86::POPCNT64rr:case X86::POPCNT64rm:
+ case X86::TZCNT16rr: case X86::TZCNT16rm:
+ case X86::TZCNT32rr: case X86::TZCNT32rm:
+ case X86::TZCNT64rr: case X86::TZCNT64rm:
+ return true;
+ }
+ }
+
+ /// Check whether the use can be converted to remove a comparison against zero.
+ ///
+ /// Returns the condition code that, evaluated on the EFLAGS written by \p MI,
+ /// holds exactly when MI's source register is zero, or X86::COND_INVALID if
+ /// MI is not such an instruction:
+ ///  - LZCNT / TZCNT set CF when the source is zero  -> X86::COND_B
+ ///  - POPCNT sets ZF when the source is zero        -> X86::COND_E
+ ///
+ /// Only the register-source (rr) forms are accepted. The caller identifies
+ /// the tested value by comparing operand 1 with the compared register; for
+ /// the memory (rm) forms operand 1 is the address base register, not the
+ /// value whose zero-ness the flags describe, so accepting them could remove
+ /// a compare of the pointer using flags computed from the loaded value.
+ static X86::CondCode isUseDefConvertible(MachineInstr &MI) {
+   switch (MI.getOpcode()) {
+   default:
+     return X86::COND_INVALID;
+   case X86::LZCNT16rr:
+   case X86::LZCNT32rr:
+   case X86::LZCNT64rr:
+   case X86::TZCNT16rr:
+   case X86::TZCNT32rr:
+   case X86::TZCNT64rr:
+     return X86::COND_B;
+   case X86::POPCNT16rr:
+   case X86::POPCNT32rr:
+   case X86::POPCNT64rr:
+     return X86::COND_E;
+   }
+ }
+
+ /// Try to erase \p CmpInstr by reusing the EFLAGS written by an earlier instruction that
+ /// operates on the same source operands and sets flags in the same way (or, for a compare against
+ /// zero, by the instruction defining or reading SrcReg). Returns true only after CmpInstr was erased.
+ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask,
+ int CmpValue,
+ const MachineRegisterInfo *MRI) const {
+ // Check whether we can replace SUB with CMP.
+ unsigned NewOpcode = 0;
+ switch (CmpInstr.getOpcode()) {
+ default: break;
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ case X86::SUB64rm:
+ case X86::SUB32rm:
+ case X86::SUB16rm:
+ case X86::SUB8rm:
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr: {
+ if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
+ return false;
+ // There is no use of the destination register, we can replace SUB with CMP.
+ switch (CmpInstr.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
+ case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
+ case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
+ case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
+ case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
+ case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
+ case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
+ case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
+ case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
+ case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
+ case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
+ case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
+ case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
+ case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
+ case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
+ }
+ CmpInstr.setDesc(get(NewOpcode));
+ CmpInstr.RemoveOperand(0);
+ // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+ if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+ NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+ return false; // NOTE: the SUB has already been rewritten to a CMP at this point.
+ }
+ }
+
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI) return false;
+
+ // I starts at CmpInstr and Def at the instruction defining SrcReg.
+ MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+ // If we are comparing against zero, check whether we can use MI to update
+ // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+ bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+ if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
+ return false;
+
+ // If we have a use of the source register between the def and our compare
+ // instruction we can eliminate the compare iff the use sets EFLAGS in the
+ // right way.
+ bool ShouldUpdateCC = false;
+ X86::CondCode NewCC = X86::COND_INVALID;
+ if (IsCmpZero && !isDefConvertible(*MI)) {
+ // Scan forward from the def of SrcReg until we hit a convertible use of
+ // SrcReg or the compare instruction.
+ for (MachineBasicBlock::iterator J = MI;; ++J) {
+ // Do we have a convertible instruction?
+ NewCC = isUseDefConvertible(*J);
+ if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
+ J->getOperand(1).getReg() == SrcReg) {
+ assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
+ ShouldUpdateCC = true; // Update CC later on.
+ // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
+ // with the new def.
+ Def = J;
+ MI = &*Def;
+ break;
+ }
+
+ if (J == I)
+ return false;
+ }
+ }
+
+ // We are searching for an earlier instruction that can make CmpInstr
+ // redundant and that instruction will be saved in Sub.
+ MachineInstr *Sub = nullptr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // We iterate backward, starting from the instruction before CmpInstr and
+ // stop when reaching the definition of a source register or done with the BB.
+ // RI points to the instruction before CmpInstr.
+ // If the definition is in this basic block, RE points to the definition;
+ // otherwise, RE is the rend of the basic block.
+ MachineBasicBlock::reverse_iterator
+ RI = ++I.getReverse(),
+ RE = CmpInstr.getParent() == MI->getParent()
+ ? Def.getReverse() /* points to MI */
+ : CmpInstr.getParent()->rend();
+ MachineInstr *Movr0Inst = nullptr;
+ for (; RI != RE; ++RI) {
+ MachineInstr &Instr = *RI;
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (!IsCmpZero &&
+ isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+ Sub = &Instr;
+ break;
+ }
+
+ if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
+ Instr.readsRegister(X86::EFLAGS, TRI)) {
+ // This instruction modifies or uses EFLAGS.
+
+ // MOV32r0 etc. are implemented with xor which clobbers condition code.
+ // They are safe to move up, if the definition to EFLAGS is dead and
+ // earlier instructions do not read or write EFLAGS.
+ if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
+ Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
+ Movr0Inst = &Instr;
+ continue;
+ }
+
+ // We can't remove CmpInstr.
+ return false;
+ }
+ }
+
+ // Return false if no candidates exist.
+ if (!IsCmpZero && !Sub)
+ return false;
+
+ bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg);
+
+ // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
+ // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
+ // If we are done with the basic block, we need to check whether EFLAGS is
+ // live-out.
+ bool IsSafe = false;
+ SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+ MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
+ for (++I; I != E; ++I) {
+ const MachineInstr &Instr = *I;
+ bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
+ bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
+ // We should check the usage if this instruction uses and updates EFLAGS.
+ if (!UseEFLAGS && ModifyEFLAGS) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again.
+ IsSafe = true;
+ break;
+ }
+ if (!UseEFLAGS && !ModifyEFLAGS)
+ continue;
+
+ // EFLAGS is used by this instruction.
+ X86::CondCode OldCC = X86::COND_INVALID;
+ bool OpcIsSET = false;
+ if (IsCmpZero || IsSwapped) {
+ // We decode the condition code from opcode.
+ if (Instr.isBranch())
+ OldCC = getCondFromBranchOpc(Instr.getOpcode());
+ else {
+ OldCC = getCondFromSETOpc(Instr.getOpcode());
+ if (OldCC != X86::COND_INVALID)
+ OpcIsSET = true;
+ else
+ OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
+ }
+ if (OldCC == X86::COND_INVALID) return false;
+ }
+ if (IsCmpZero) {
+ switch (OldCC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ // CF and OF are used, we can't perform this optimization.
+ return false;
+ }
+
+ // If we're updating the condition code check if we have to reverse the
+ // condition.
+ if (ShouldUpdateCC)
+ switch (OldCC) {
+ default:
+ return false;
+ case X86::COND_E:
+ break;
+ case X86::COND_NE:
+ NewCC = GetOppositeBranchCondition(NewCC);
+ break;
+ }
+ } else if (IsSwapped) {
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
+ // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ // We swap the condition code and synthesize the new opcode.
+ NewCC = getSwappedCondition(OldCC);
+ if (NewCC == X86::COND_INVALID) return false;
+ }
+
+ if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) {
+ // Synthesize the new opcode.
+ bool HasMemoryOperand = Instr.hasOneMemOperand();
+ unsigned NewOpc;
+ if (Instr.isBranch())
+ NewOpc = GetCondBranchFromCond(NewCC);
+ else if(OpcIsSET)
+ NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
+ else {
+ unsigned DstReg = Instr.getOperand(0).getReg();
+ NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
+ HasMemoryOperand);
+ }
+
+ // Push the MachineInstr to OpsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // instructions will be modified.
+ OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+ }
+ if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
+ IsSafe = true;
+ break;
+ }
+ }
+
+ // If EFLAGS is not killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if ((IsCmpZero || IsSwapped) && !IsSafe) {
+ MachineBasicBlock *MBB = CmpInstr.getParent();
+ for (MachineBasicBlock *Successor : MBB->successors())
+ if (Successor->isLiveIn(X86::EFLAGS))
+ return false;
+ }
+
+ // The instruction to be updated is either Sub or MI.
+ Sub = IsCmpZero ? MI : Sub;
+ // Move Movr0Inst to the appropriate place before Sub.
+ if (Movr0Inst) {
+ // Look backwards until we find a def that doesn't use the current EFLAGS.
+ Def = Sub;
+ MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
+ InsertE = Sub->getParent()->rend();
+ for (; InsertI != InsertE; ++InsertI) {
+ MachineInstr *Instr = &*InsertI;
+ if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
+ Instr->modifiesRegister(X86::EFLAGS, TRI)) {
+ Sub->getParent()->remove(Movr0Inst);
+ Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
+ Movr0Inst);
+ break;
+ }
+ }
+ if (InsertI == InsertE)
+ return false;
+ }
+
+ // Make sure Sub instruction defines EFLAGS and mark the def live.
+ unsigned i = 0, e = Sub->getNumOperands();
+ for (; i != e; ++i) {
+ MachineOperand &MO = Sub->getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+ MO.setIsDead(false);
+ break;
+ }
+ }
+ assert(i != e && "Unable to locate a def EFLAGS operand");
+
+ CmpInstr.eraseFromParent();
+
+ // Modify the condition code of instructions in OpsToUpdate.
+ for (auto &Op : OpsToUpdate)
+ Op.first->setDesc(get(Op.second));
+ return true;
+ }
+
+ /// Attempt to fold the load that defines \p FoldAsLoadDefReg into \p MI as a
+ /// memory operand.
+ ///
+ /// \p DefMI is set to the defining instruction of \p FoldAsLoadDefReg. The
+ /// fold is refused when that instruction is not safe to move, when MI does
+ /// not read the register, or when MI touches it through a sub-register or
+ /// as a def. On success \p FoldAsLoadDefReg is reset to 0 and the folded
+ /// instruction is returned; otherwise nullptr is returned.
+ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
+                                               const MachineRegisterInfo *MRI,
+                                               unsigned &FoldAsLoadDefReg,
+                                               MachineInstr *&DefMI) const {
+   // The load can only be folded if it may legally be moved down to MI.
+   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+   assert(DefMI);
+   bool SawStore = false;
+   if (!DefMI->isSafeToMove(nullptr, SawStore))
+     return nullptr;
+
+   // Gather the indices of every operand of MI that reads the loaded value.
+   SmallVector<unsigned, 1> FoldableOps;
+   for (unsigned OpNo = 0, NumOps = MI.getNumOperands(); OpNo != NumOps;
+        ++OpNo) {
+     MachineOperand &Operand = MI.getOperand(OpNo);
+     if (!Operand.isReg() || Operand.getReg() != FoldAsLoadDefReg)
+       continue;
+     // A sub-register use or a def of the register blocks the fold.
+     if (Operand.getSubReg() || Operand.isDef())
+       return nullptr;
+     FoldableOps.push_back(OpNo);
+   }
+   if (FoldableOps.empty())
+     return nullptr;
+
+   // Let the generic folding machinery try; clear the register on success.
+   MachineInstr *Folded = foldMemoryOperand(MI, FoldableOps, *DefMI);
+   if (Folded)
+     FoldAsLoadDefReg = 0;
+   return Folded;
+ }
+
+ /// Rewrite a single-def pseudo instruction in place as the two-address
+ /// instruction \p Desc, whose two source operands are undef reads of the
+ /// register the pseudo defines. Always returns true.
+ /// Example:
+ ///   %xmm4 = V_SET0
+ /// becomes:
+ ///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+ ///
+ static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
+                              const MCInstrDesc &Desc) {
+   assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+   const unsigned DefReg = MIB->getOperand(0).getReg();
+   MIB->setDesc(Desc);
+
+   // MachineInstr::addOperand() is expected to place explicit operands ahead
+   // of any implicit ones, so the two sources should land at indices 1 and 2.
+   MIB.addReg(DefReg, RegState::Undef);
+   MIB.addReg(DefReg, RegState::Undef);
+   // Verify that expectation rather than rely on it.
+   assert(MIB->getOperand(1).getReg() == Reg &&
+          MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
+   return true;
+ }
+
+/// Rewrite a single-def mask pseudo into the three-operand instruction
+/// described by \p Desc, using \p Reg (marked undef) for both sources. With
+/// \p Reg == %k0 this maps
+///   %k4 = K_SET1
+/// to
+///   %k4 = KXNORrr %k0, %k0
+///
+/// Always returns true.
+static bool Expand2AddrKreg(MachineInstrBuilder &MIB,
+                            const MCInstrDesc &Desc, unsigned Reg) {
+  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+  MIB->setDesc(Desc);
+
+  // Append the same undef source register twice.
+  for (unsigned Src = 0; Src != 2; ++Src)
+    MIB.addReg(Reg, RegState::Undef);
+  return true;
+}
+
+/// Expand a pseudo that sets a 32-bit register to 1 or -1.
+///
+/// An XOR32rr of the destination with itself (both reads undef) is inserted
+/// in front of the pseudo, and the pseudo itself is turned into INC32r, or
+/// DEC32r when \p MinusOne is set, operating on the same register.
+/// Always returns true.
+static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
+                          bool MinusOne) {
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+  const unsigned DstReg = MIB->getOperand(0).getReg();
+
+  // Zero the destination first.
+  MachineInstrBuilder Zero =
+      BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), DstReg);
+  Zero.addReg(DstReg, RegState::Undef);
+  Zero.addReg(DstReg, RegState::Undef);
+
+  // Then step it by one in the requested direction, reusing the pseudo.
+  unsigned StepOpc;
+  if (MinusOne)
+    StepOpc = X86::DEC32r;
+  else
+    StepOpc = X86::INC32r;
+  MIB->setDesc(TII.get(StepOpc));
+  MIB.addReg(DstReg);
+
+  return true;
+}
+
+/// Expand MOV32ImmSExti8 / MOV64ImmSExti8.
+///
+/// Normally a push of the immediate is inserted in front of the pseudo and
+/// the pseudo becomes the matching pop. In 64-bit mode the 64-bit push/pop
+/// forms are used and the destination is widened to its 64-bit register. If
+/// the function uses the red zone (64-bit only) the pseudo is instead turned
+/// into a plain MOV32ri / MOV64ri. When DWARF CFI is required and the function
+/// has no frame pointer, a pair of opposite CFA-offset adjustments is built
+/// at the pseudo and at the instruction following it.
+/// Always returns true.
+static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
+                               const TargetInstrInfo &TII,
+                               const X86Subtarget &Subtarget) {
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+  int64_t Imm = MIB->getOperand(1).getImm();
+  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
+  MachineBasicBlock::iterator InsertPos = MIB.getInstr();
+
+  int StackAdjustment;
+
+  if (!Subtarget.is64Bit()) {
+    assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
+    StackAdjustment = 4;
+    BuildMI(MBB, InsertPos, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
+    MIB->setDesc(TII.get(X86::POP32r));
+  } else {
+    assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
+           MIB->getOpcode() == X86::MOV32ImmSExti8);
+
+    // Can't use push/pop lowering if the function might write to the red
+    // zone; fall back to an ordinary move-immediate.
+    X86MachineFunctionInfo *FuncInfo =
+        MBB.getParent()->getInfo<X86MachineFunctionInfo>();
+    if (FuncInfo->getUsesRedZone()) {
+      const bool Is32BitPseudo = MIB->getOpcode() == X86::MOV32ImmSExti8;
+      MIB->setDesc(TII.get(Is32BitPseudo ? X86::MOV32ri : X86::MOV64ri));
+      return true;
+    }
+
+    // 64-bit mode has no 32-bit push/pop, so use the 64-bit operations and
+    // widen the destination register if necessary.
+    StackAdjustment = 8;
+    BuildMI(MBB, InsertPos, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
+    MIB->setDesc(TII.get(X86::POP64r));
+    MachineOperand &Dst = MIB->getOperand(0);
+    Dst.setReg(getX86SubSuperRegister(Dst.getReg(), 64));
+  }
+
+  // Decide whether the temporary stack adjustment needs CFI.
+  MachineFunction &MF = *MBB.getParent();
+  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
+  const bool UsesWindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  const bool WantsDwarfCFI =
+      !UsesWindowsCFI &&
+      (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+  if (TFL->hasFP(MF) || !WantsDwarfCFI)
+    return true;
+
+  TFL->BuildCFI(MBB, InsertPos, DL,
+                MCCFIInstruction::createAdjustCfaOffset(nullptr,
+                                                        StackAdjustment));
+  TFL->BuildCFI(MBB, std::next(InsertPos), DL,
+                MCCFIInstruction::createAdjustCfaOffset(nullptr,
+                                                        -StackAdjustment));
+  return true;
+}
+
+// Expand LOAD_STACK_GUARD into two MOV64rm instructions: the first, inserted
+// in front of the pseudo, loads from the guard global's GOT entry
+// (RIP-relative, MO_GOTPCREL) into the destination register; the pseudo
+// itself becomes a load through that register, which it kills.
+//
+// LoadStackGuard has so far only been implemented for 64-bit MachO. A
+// different code sequence is needed for other targets.
+static void expandLoadStackGuard(MachineInstrBuilder &MIB,
+                                 const TargetInstrInfo &TII) {
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+  const unsigned Reg = MIB->getOperand(0).getReg();
+
+  // The guard variable is taken from the pseudo's first memory operand.
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
+
+  // Memory operand describing the 8-byte, 8-aligned load from the GOT.
+  auto Flags = MachineMemOperand::MOLoad |
+               MachineMemOperand::MODereferenceable |
+               MachineMemOperand::MOInvariant;
+  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+      MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
+  MachineBasicBlock::iterator InsertPos = MIB.getInstr();
+
+  // First load: Reg = [RIP + GV@GOTPCREL].
+  MachineInstrBuilder GotLoad =
+      BuildMI(MBB, InsertPos, DL, TII.get(X86::MOV64rm), Reg);
+  GotLoad.addReg(X86::RIP).addImm(1).addReg(0);
+  GotLoad.addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0);
+  GotLoad.addMemOperand(MMO);
+
+  // Second load: reuse the pseudo as Reg = [Reg + 0].
+  MIB->setDebugLoc(DL);
+  MIB->setDesc(TII.get(X86::MOV64rm));
+  MIB.addReg(Reg, RegState::Kill);
+  MIB.addImm(1);
+  MIB.addReg(0);
+  MIB.addImm(0);
+  MIB.addReg(0);
+}
+
+// Expand a 128/256-bit reload pseudo used when AVX512 is available without
+// VLX. If the destination register's encoding is below 16 the pseudo becomes
+// the ordinary load \p LoadDesc. Otherwise it becomes \p BroadcastDesc and the
+// destination is replaced by the 512-bit super-register that contains it at
+// \p SubIdx, an instruction form that only needs AVX512F.
+// Always returns true.
+static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
+                            const TargetRegisterInfo *TRI,
+                            const MCInstrDesc &LoadDesc,
+                            const MCInstrDesc &BroadcastDesc,
+                            unsigned SubIdx) {
+  const unsigned DestReg = MIB->getOperand(0).getReg();
+
+  // Registers 0-15 can use a normal VEX encoded load.
+  if (TRI->getEncodingValue(DestReg) < 16) {
+    MIB->setDesc(LoadDesc);
+    return true;
+  }
+
+  // XMM16-31 / YMM16-31: use a 128/256-bit VBROADCAST into the enclosing
+  // 512-bit register instead.
+  MIB->setDesc(BroadcastDesc);
+  const unsigned WideReg =
+      TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
+  MIB->getOperand(0).setReg(WideReg);
+  return true;
+}
+
+// Expand a 128/256-bit spill pseudo used when AVX512 is available without
+// VLX. If the stored (source) register's encoding is below 16 the pseudo
+// becomes the ordinary store \p StoreDesc. Otherwise it becomes
+// \p ExtractDesc, the source is replaced by the 512-bit super-register that
+// contains it at \p SubIdx, and an immediate of 0 is appended so the low
+// 128/256 bits are extracted. That form only needs AVX512F.
+// Always returns true.
+static bool expandNOVLXStore(MachineInstrBuilder &MIB,
+                             const TargetRegisterInfo *TRI,
+                             const MCInstrDesc &StoreDesc,
+                             const MCInstrDesc &ExtractDesc,
+                             unsigned SubIdx) {
+  // The stored register follows the address operands.
+  MachineOperand &SrcOp = MIB->getOperand(X86::AddrNumOperands);
+  const unsigned SrcReg = SrcOp.getReg();
+
+  // Registers 0-15 can use a normal VEX encoded store.
+  if (TRI->getEncodingValue(SrcReg) < 16) {
+    MIB->setDesc(StoreDesc);
+    return true;
+  }
+
+  // XMM16-31 / YMM16-31: use a VEXTRACTF store from the enclosing 512-bit
+  // source register.
+  MIB->setDesc(ExtractDesc);
+  const unsigned WideReg =
+      TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
+  MIB->getOperand(X86::AddrNumOperands).setReg(WideReg);
+  MIB.addImm(0x0); // Extract from the lower bits.
+  return true;
+}
+/// Expand the pseudo instruction \p MI after register allocation.
+///
+/// Each opcode handled here is rewritten in place (some helpers also insert
+/// an extra instruction in front of it). Returns true when \p MI was one of
+/// the handled pseudos and false for every other opcode.
+bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  const bool UseAVX = Subtarget.hasAVX();
+  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  switch (MI.getOpcode()) {
+  default:
+    break;
+
+  // 32-bit constant materialization.
+  case X86::MOV32r0:
+    return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+  case X86::MOV32r1:
+    return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
+  case X86::MOV32r_1:
+    return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
+  case X86::MOV32ImmSExti8:
+  case X86::MOV64ImmSExti8:
+    return ExpandMOVImmSExti8(MIB, *this, Subtarget);
+
+  // SETB_C*: SBB of a register with itself.
+  case X86::SETB_C8r:
+    return Expand2AddrUndef(MIB, get(X86::SBB8rr));
+  case X86::SETB_C16r:
+    return Expand2AddrUndef(MIB, get(X86::SBB16rr));
+  case X86::SETB_C32r:
+    return Expand2AddrUndef(MIB, get(X86::SBB32rr));
+  case X86::SETB_C64r:
+    return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+
+  // Vector / scalar-FP zeroing.
+  case X86::V_SET0:
+  case X86::FsFLD0SS:
+  case X86::FsFLD0SD: {
+    const unsigned XorOpc = UseAVX ? X86::VXORPSrr : X86::XORPSrr;
+    return Expand2AddrUndef(MIB, get(XorOpc));
+  }
+  case X86::AVX_SET0:
+    assert(UseAVX && "AVX not supported");
+    return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+  case X86::AVX512_128_SET0:
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr));
+  case X86::AVX512_256_SET0:
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
+  case X86::AVX512_512_SET0:
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+  case X86::AVX512_FsFLD0SS:
+  case X86::AVX512_FsFLD0SD:
+    return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
+
+  // All-ones vectors.
+  case X86::V_SETALLONES: {
+    const unsigned CmpOpc = UseAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr;
+    return Expand2AddrUndef(MIB, get(CmpOpc));
+  }
+  case X86::AVX2_SETALLONES:
+    return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+  case X86::AVX512_512_SETALLONES: {
+    const unsigned Reg = MIB->getOperand(0).getReg();
+    MIB->setDesc(get(X86::VPTERNLOGDZrri));
+    // VPTERNLOGD needs 3 register inputs and an immediate.
+    // 0xff will return 1s for any input.
+    for (unsigned Src = 0; Src != 3; ++Src)
+      MIB.addReg(Reg, RegState::Undef);
+    MIB.addImm(0xff);
+    return true;
+  }
+
+  // 128/256-bit reloads with AVX512 but without VLX.
+  case X86::VMOVAPSZ128rm_NOVLX:
+    return expandNOVLXLoad(MIB, TRI, get(X86::VMOVAPSrm),
+                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+  case X86::VMOVUPSZ128rm_NOVLX:
+    return expandNOVLXLoad(MIB, TRI, get(X86::VMOVUPSrm),
+                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+  case X86::VMOVAPSZ256rm_NOVLX:
+    return expandNOVLXLoad(MIB, TRI, get(X86::VMOVAPSYrm),
+                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+  case X86::VMOVUPSZ256rm_NOVLX:
+    return expandNOVLXLoad(MIB, TRI, get(X86::VMOVUPSYrm),
+                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+
+  // 128/256-bit spills with AVX512 but without VLX.
+  case X86::VMOVAPSZ128mr_NOVLX:
+    return expandNOVLXStore(MIB, TRI, get(X86::VMOVAPSmr),
+                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+  case X86::VMOVUPSZ128mr_NOVLX:
+    return expandNOVLXStore(MIB, TRI, get(X86::VMOVUPSmr),
+                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+  case X86::VMOVAPSZ256mr_NOVLX:
+    return expandNOVLXStore(MIB, TRI, get(X86::VMOVAPSYmr),
+                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+  case X86::VMOVUPSZ256mr_NOVLX:
+    return expandNOVLXStore(MIB, TRI, get(X86::VMOVUPSYmr),
+                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+
+  // Pseudos that only need their opcode swapped.
+  case X86::TEST8ri_NOREX:
+    MI.setDesc(get(X86::TEST8ri));
+    return true;
+  case X86::MOV32ri64:
+    MI.setDesc(get(X86::MOV32ri));
+    return true;
+
+  // KNL does not recognize dependency-breaking idioms for mask registers,
+  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
+  // Using %k0 as the undef input register is a performance heuristic based
+  // on the assumption that %k0 is used less frequently than the other mask
+  // registers, since it is not usable as a write mask.
+  // FIXME: A more advanced approach would be to choose the best input mask
+  // register based on context.
+  case X86::KSET0B:
+  case X86::KSET0W:
+    return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
+  case X86::KSET0D:
+    return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
+  case X86::KSET0Q:
+    return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
+  case X86::KSET1B:
+  case X86::KSET1W:
+    return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
+  case X86::KSET1D:
+    return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
+  case X86::KSET1Q:
+    return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
+
+  case TargetOpcode::LOAD_STACK_GUARD:
+    expandLoadStackGuard(MIB, *this);
+    return true;
+  }
+
+  // Not a pseudo this hook knows how to expand.
+  return false;
+}
+
+/// Append the memory reference \p MOs to \p MIB, applying \p PtrOffset.
+///
+/// A list shorter than four operands is copied verbatim and followed by
+/// addOffset(MIB, PtrOffset) (the frame-index form). Otherwise the list must
+/// hold exactly five operands; they are copied in order, except that a
+/// non-zero \p PtrOffset is merged into the operand at index 3 via addDisp().
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+                        int PtrOffset = 0) {
+  const unsigned Count = MOs.size();
+
+  if (Count < 4) {
+    // FrameIndex only - an immediate offset is always added, zero or not.
+    for (const MachineOperand &MO : MOs)
+      MIB.addOperand(MO);
+    addOffset(MIB, PtrOffset);
+    return;
+  }
+
+  // General memory addressing - any offset is added to the existing one.
+  assert(MOs.size() == 5 && "Unexpected memory operand list length");
+  for (unsigned Idx = 0; Idx < Count; ++Idx) {
+    const bool AdjustDisp = Idx == 3 && PtrOffset != 0;
+    if (!AdjustDisp)
+      MIB.addOperand(MOs[Idx]);
+    else
+      MIB.addDisp(MOs[Idx], PtrOffset);
+  }
+}
+
+/// Build the memory form \p Opcode of the two-address instruction \p MI.
+///
+/// The new instruction starts with the memory reference \p MOs, which stands
+/// in for MI's first two (tied) register operands, followed by a copy of
+/// every remaining operand of \p MI from index 2 onwards. It is inserted
+/// before \p InsertPt and returned.
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+                                     ArrayRef<MachineOperand> MOs,
+                                     MachineBasicBlock::iterator InsertPt,
+                                     MachineInstr &MI,
+                                     const TargetInstrInfo &TII) {
+  // Create the instruction without implicit operands (something BuildMI
+  // can't do) and lead with the memory operand.
+  MachineInstr *NewMI =
+      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+  addOperands(MIB, MOs);
+
+  // Carry over everything after the two tied registers: first the remaining
+  // operands covered by the descriptor, then any trailing ones.
+  for (unsigned Idx = 2, End = MI.getNumOperands(); Idx < End; ++Idx)
+    MIB.addOperand(MI.getOperand(Idx));
+
+  InsertPt->getParent()->insert(InsertPt, NewMI);
+  return NewMI;
+}
+
+/// Build a copy of \p MI with opcode \p Opcode in which register operand
+/// \p OpNo is replaced by the memory reference \p MOs (adjusted by
+/// \p PtrOffset). All other operands are copied unchanged. The new
+/// instruction is inserted before \p InsertPt and returned.
+static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
+                              unsigned OpNo, ArrayRef<MachineOperand> MOs,
+                              MachineBasicBlock::iterator InsertPt,
+                              MachineInstr &MI, const TargetInstrInfo &TII,
+                              int PtrOffset = 0) {
+  // Create the instruction without implicit operands, something BuildMI
+  // can't do.
+  MachineInstr *NewMI =
+      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+
+  for (unsigned Idx = 0, End = MI.getNumOperands(); Idx < End; ++Idx) {
+    MachineOperand &MO = MI.getOperand(Idx);
+    if (Idx != OpNo) {
+      MIB.addOperand(MO);
+      continue;
+    }
+    // This is the operand being folded: emit the memory reference instead.
+    assert(MO.isReg() && "Expected to fold into reg operand!");
+    addOperands(MIB, MOs, PtrOffset);
+  }
+
+  InsertPt->getParent()->insert(InsertPt, NewMI);
+  return NewMI;
+}
+
+/// Build, before \p InsertPt, an instruction with opcode \p Opcode whose
+/// operands are the memory reference \p MOs followed by an immediate 0, using
+/// the debug location of \p MI. Returns the new instruction.
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+                                ArrayRef<MachineOperand> MOs,
+                                MachineBasicBlock::iterator InsertPt,
+                                MachineInstr &MI) {
+  MachineBasicBlock &MBB = *InsertPt->getParent();
+  MachineInstrBuilder MIB =
+      BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII.get(Opcode));
+  addOperands(MIB, MOs);
+  MIB.addImm(0);
+  return MIB;
+}
+
+/// Handle the folds that need more than an opcode swap.
+///
+/// Two families are recognised, both only when operand 2 is being folded:
+///  - (V)INSERTPS rr: the load of the inserted vector becomes a load of the
+///    single selected float, so the pointer is advanced by 4 * source index
+///    and the source-index bits are dropped from the immediate.
+///  - (V)MOVHLPS rr: folded as (V)MOVLPS from the pointer advanced by 8.
+/// Returns the new instruction, or nullptr when no custom fold applies.
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned Size, unsigned Align) const {
+  const unsigned Opc = MI.getOpcode();
+
+  switch (Opc) {
+  default:
+    break;
+
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+  case X86::VINSERTPSZrr: {
+    // Attempt to convert the load of inserted vector into a fold load
+    // of a single float.
+    if (OpNum != 2)
+      break;
+
+    const unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
+    const unsigned ZMask = Imm & 15;
+    const unsigned DstIdx = (Imm >> 4) & 3;
+    const unsigned SrcIdx = (Imm >> 6) & 3;
+
+    const unsigned RCSize =
+        getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+    if (Size > RCSize || Align < 4)
+      break;
+
+    int PtrOffset = SrcIdx * 4;
+    unsigned NewImm = (DstIdx << 4) | ZMask;
+    unsigned MemOpc;
+    if (Opc == X86::VINSERTPSZrr)
+      MemOpc = X86::VINSERTPSZrm;
+    else if (Opc == X86::VINSERTPSrr)
+      MemOpc = X86::VINSERTPSrm;
+    else
+      MemOpc = X86::INSERTPSrm;
+
+    MachineInstr *NewMI =
+        FuseInst(MF, MemOpc, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+    NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+    return NewMI;
+  }
+
+  case X86::MOVHLPSrr:
+  case X86::VMOVHLPSrr:
+  case X86::VMOVHLPSZrr: {
+    // Move the upper 64-bits of the second operand to the lower 64-bits.
+    // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
+    // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
+    if (OpNum != 2)
+      break;
+
+    const unsigned RCSize =
+        getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+    if (Size > RCSize || Align < 8)
+      break;
+
+    unsigned MemOpc;
+    if (Opc == X86::VMOVHLPSZrr)
+      MemOpc = X86::VMOVLPSZ128rm;
+    else if (Opc == X86::VMOVHLPSrr)
+      MemOpc = X86::VMOVLPSrm;
+    else
+      MemOpc = X86::MOVLPSrm;
+
+    return FuseInst(MF, MemOpc, OpNum, MOs, InsertPt, MI, *this, 8);
+  }
+  }
+
+  return nullptr;
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align, bool AllowCommute) const { // Tries to replace register operand OpNum of MI with the memory reference MOs; returns the new instruction (inserted at InsertPt) or nullptr if no fold was made.
+ const DenseMap<unsigned,
+ std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr;
+ bool isCallRegIndirect = Subtarget.callRegIndirect();
+ bool isTwoAddrFold = false;
+
+ // For CPUs that favor the register form of a call or push,
+ // do not fold loads into calls or pushes, unless optimizing for size
+ // aggressively.
+ if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
+ (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
+ MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
+ MI.getOpcode() == X86::PUSH64r))
+ return nullptr;
+
+ unsigned NumOps = MI.getDesc().getNumOperands();
+ bool isTwoAddr =
+ NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; // Operand 1 is tied to another operand.
+
+ // FIXME: AsmPrinter doesn't know how to handle
+ // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+ if (MI.getOpcode() == X86::ADD32ri &&
+ MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ return nullptr;
+
+ MachineInstr *NewMI = nullptr;
+
+ // Attempt to fold any custom cases we have; these take precedence over the
+ // table-driven folds below.
+ if (MachineInstr *CustomMI =
+ foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
+ return CustomMI;
+
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different than folding it other places. It requires
+ // replacing the *two* registers with the memory location.
+ if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ isTwoAddrFold = true;
+ } else if (OpNum == 0) {
+ if (MI.getOpcode() == X86::MOV32r0) { // Zeroing a register folds to a store of immediate 0.
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
+ if (NewMI)
+ return NewMI;
+ }
+
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (OpNum == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (OpNum == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ } else if (OpNum == 3) {
+ OpcodeTablePtr = &RegOp2MemOpTable3;
+ } else if (OpNum == 4) {
+ OpcodeTablePtr = &RegOp2MemOpTable4;
+ }
+
+ // If a fold table was selected, look MI's opcode up in it.
+ if (OpcodeTablePtr) {
+ // Find the memory-form opcode to fuse to.
+ auto I = OpcodeTablePtr->find(MI.getOpcode());
+ if (I != OpcodeTablePtr->end()) {
+ unsigned Opcode = I->second.first;
+ unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; // Minimum alignment stored in the table entry's flags.
+ if (Align < MinAlign)
+ return nullptr;
+ bool NarrowToMOV32rm = false;
+ if (Size) {
+ unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+ if (Size < RCSize) {
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return nullptr;
+ // If this is a 64-bit load, but the spill slot is 32, then we can do
+ // a 32-bit load which is implicitly zero-extended. This likely is
+ // due to live interval analysis remat'ing a load from stack slot.
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+ return nullptr;
+ Opcode = X86::MOV32rm;
+ NarrowToMOV32rm = true;
+ }
+ }
+
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
+ else
+ NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
+
+ if (NarrowToMOV32rm) {
+ // This is the special case where we use a MOV32rm to load a 32-bit
+ // value and zero-extend the top bits. Change the destination register
+ // to a 32-bit one.
+ unsigned DstReg = NewMI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
+ else
+ NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+ }
+ return NewMI;
+ }
+ }
+
+ // If the instruction and target operand are commutable, commute the
+ // instruction and try again.
+ if (AllowCommute) {
+ unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
+ if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+ bool HasDef = MI.getDesc().getNumDefs();
+ unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
+ unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+ unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
+ bool Tied1 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+ bool Tied2 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+ // If either of the commutable operands is tied to the destination
+ // then we cannot commute + fold.
+ if ((HasDef && Reg0 == Reg1 && Tied1) ||
+ (HasDef && Reg0 == Reg2 && Tied2))
+ return nullptr;
+
+ MachineInstr *CommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); // Commutes MI itself (second argument is false).
+ if (!CommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (CommutedMI != &MI) {
+ // New instruction. We can't fold from this.
+ CommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Attempt to fold with the commuted version of the instruction; the
+ // recursive call passes AllowCommute=false so it cannot commute again.
+ NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
+ Size, Align, /*AllowCommute=*/false);
+ if (NewMI)
+ return NewMI;
+
+ // Folding failed again - undo the commute before returning.
+ MachineInstr *UncommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!UncommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (UncommutedMI != &MI) {
+ // New instruction. It doesn't need to be kept.
+ UncommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Return here to prevent duplicate fuse failure report.
+ return nullptr;
+ }
+ }
+
+ // No fusion was possible; report it if requested (copies are not reported).
+ if (PrintFailedFusing && !MI.isCopy())
+ dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
+ return nullptr;
+}
+
+/// Return true for the opcodes listed below: instructions that only update
+/// the first 32 or 64-bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed, e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+/// movss (%rdi), %xmm0
+/// cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+/// cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags bit instead of an opcode list.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::CVTSI2SSrr: // CVTSI2SS / CVTSI2SD family.
+ case X86::CVTSI2SSrm:
+ case X86::CVTSI2SS64rr:
+ case X86::CVTSI2SS64rm:
+ case X86::CVTSI2SDrr:
+ case X86::CVTSI2SDrm:
+ case X86::CVTSI2SD64rr:
+ case X86::CVTSI2SD64rm:
+ case X86::CVTSD2SSrr: // CVTSD2SS / CVTSS2SD.
+ case X86::CVTSD2SSrm:
+ case X86::CVTSS2SDrr:
+ case X86::CVTSS2SDrm:
+ case X86::MOVHPDrm: // MOVHP* / MOVLP* loads.
+ case X86::MOVHPSrm:
+ case X86::MOVLPDrm:
+ case X86::MOVLPSrm:
+ case X86::RCPSSr: // RCPSS.
+ case X86::RCPSSm:
+ case X86::RCPSSr_Int:
+ case X86::RCPSSm_Int:
+ case X86::ROUNDSDr: // ROUNDSD / ROUNDSS.
+ case X86::ROUNDSDm:
+ case X86::ROUNDSSr:
+ case X86::ROUNDSSm:
+ case X86::RSQRTSSr: // RSQRTSS.
+ case X86::RSQRTSSm:
+ case X86::RSQRTSSr_Int:
+ case X86::RSQRTSSm_Int:
+ case X86::SQRTSSr: // SQRTSS / SQRTSD.
+ case X86::SQRTSSm:
+ case X86::SQRTSSr_Int:
+ case X86::SQRTSSm_Int:
+ case X86::SQRTSDr:
+ case X86::SQRTSDm:
+ case X86::SQRTSDr_Int:
+ case X86::SQRTSDm_Int:
+ return true;
+ }
+
+ return false; // Opcode is not in the list above.
+}
+
+/// Tell the ExeDepsFix pass how many idle instructions we would like in
+/// front of a partial register update.
+///
+/// Returns 0 unless \p OpNum is 0 and MI's opcode is in
+/// hasPartialRegUpdate(), and also returns 0 when \p MI reads the register
+/// it defines (the partial update is then wanted). Otherwise returns
+/// PartialRegUpdateClearance.
+unsigned X86InstrInfo::getPartialRegUpdateClearance(
+    const MachineInstr &MI, unsigned OpNum,
+    const TargetRegisterInfo *TRI) const {
+  if (OpNum != 0)
+    return 0;
+  if (!hasPartialRegUpdate(MI.getOpcode()))
+    return 0;
+
+  // If MI is marked as reading the destination register, the partial
+  // register update is wanted.
+  const MachineOperand &DefOp = MI.getOperand(0);
+  const unsigned DefReg = DefOp.getReg();
+  bool ReadsDef;
+  if (TargetRegisterInfo::isVirtualRegister(DefReg))
+    ReadsDef = DefOp.readsReg() || MI.readsVirtualRegister(DefReg);
+  else
+    ReadsDef = MI.readsRegister(DefReg, TRI);
+  if (ReadsDef)
+    return 0;
+
+  // If any instructions in the clearance range are reading the register,
+  // insert a dependency breaking instruction, which is inexpensive and is
+  // likely to be hidden in other instruction's cycles.
+  return PartialRegUpdateClearance;
+}
+
+// Return true for any instruction that copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::VCVTSI2SSrr: // V-prefixed forms that are not AVX-512.
+ case X86::VCVTSI2SSrm:
+ case X86::Int_VCVTSI2SSrr:
+ case X86::Int_VCVTSI2SSrm:
+ case X86::VCVTSI2SS64rr:
+ case X86::VCVTSI2SS64rm:
+ case X86::Int_VCVTSI2SS64rr:
+ case X86::Int_VCVTSI2SS64rm:
+ case X86::VCVTSI2SDrr:
+ case X86::VCVTSI2SDrm:
+ case X86::Int_VCVTSI2SDrr:
+ case X86::Int_VCVTSI2SDrm:
+ case X86::VCVTSI2SD64rr:
+ case X86::VCVTSI2SD64rm:
+ case X86::Int_VCVTSI2SD64rr:
+ case X86::Int_VCVTSI2SD64rm:
+ case X86::VCVTSD2SSrr:
+ case X86::VCVTSD2SSrm:
+ case X86::Int_VCVTSD2SSrr:
+ case X86::Int_VCVTSD2SSrm:
+ case X86::VCVTSS2SDrr:
+ case X86::VCVTSS2SDrm:
+ case X86::Int_VCVTSS2SDrr:
+ case X86::Int_VCVTSS2SDrm:
+ case X86::VRCPSSr:
+ case X86::VRCPSSr_Int:
+ case X86::VRCPSSm:
+ case X86::VRCPSSm_Int:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDm:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSDm_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSm:
+ case X86::VROUNDSSr_Int:
+ case X86::VROUNDSSm_Int:
+ case X86::VRSQRTSSr:
+ case X86::VRSQRTSSr_Int:
+ case X86::VRSQRTSSm:
+ case X86::VRSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ // AVX-512 forms.
+ case X86::VCVTSI2SSZrr:
+ case X86::VCVTSI2SSZrm:
+ case X86::VCVTSI2SSZrr_Int:
+ case X86::VCVTSI2SSZrrb_Int:
+ case X86::VCVTSI2SSZrm_Int:
+ case X86::VCVTSI642SSZrr:
+ case X86::VCVTSI642SSZrm:
+ case X86::VCVTSI642SSZrr_Int:
+ case X86::VCVTSI642SSZrrb_Int:
+ case X86::VCVTSI642SSZrm_Int:
+ case X86::VCVTSI2SDZrr:
+ case X86::VCVTSI2SDZrm:
+ case X86::VCVTSI2SDZrr_Int:
+ case X86::VCVTSI2SDZrrb_Int:
+ case X86::VCVTSI2SDZrm_Int:
+ case X86::VCVTSI642SDZrr:
+ case X86::VCVTSI642SDZrm:
+ case X86::VCVTSI642SDZrr_Int:
+ case X86::VCVTSI642SDZrrb_Int:
+ case X86::VCVTSI642SDZrm_Int:
+ case X86::VCVTUSI2SSZrr:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI2SSZrr_Int:
+ case X86::VCVTUSI2SSZrrb_Int:
+ case X86::VCVTUSI2SSZrm_Int:
+ case X86::VCVTUSI642SSZrr:
+ case X86::VCVTUSI642SSZrm:
+ case X86::VCVTUSI642SSZrr_Int:
+ case X86::VCVTUSI642SSZrrb_Int:
+ case X86::VCVTUSI642SSZrm_Int:
+ case X86::VCVTUSI2SDZrr:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI2SDZrr_Int:
+ case X86::VCVTUSI2SDZrm_Int:
+ case X86::VCVTUSI642SDZrr:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI642SDZrr_Int:
+ case X86::VCVTUSI642SDZrrb_Int:
+ case X86::VCVTUSI642SDZrm_Int:
+ case X86::VCVTSD2SSZrr:
+ case X86::VCVTSD2SSZrrb:
+ case X86::VCVTSD2SSZrm:
+ case X86::VCVTSS2SDZrr:
+ case X86::VCVTSS2SDZrrb:
+ case X86::VCVTSS2SDZrm:
+ case X86::VRNDSCALESDr:
+ case X86::VRNDSCALESDrb:
+ case X86::VRNDSCALESDm:
+ case X86::VRNDSCALESSr:
+ case X86::VRNDSCALESSrb:
+ case X86::VRNDSCALESSm:
+ case X86::VRCP14SSrr:
+ case X86::VRCP14SSrm:
+ case X86::VRSQRT14SSrr:
+ case X86::VRSQRT14SSrm:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZr_Int:
+ case X86::VSQRTSSZrb_Int:
+ case X86::VSQRTSSZm:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSDZr_Int:
+ case X86::VSQRTSDZrb_Int:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
+ return true;
+ }
+
+ return false; // Opcode is not in the list above.
+}
+
+/// Tell the ExeDepsFix pass how many idle instructions we would like in
+/// front of certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+///   vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should be careful *not* to catch VXOR idioms, which are presumably
+/// handled specially in the pipeline:
+///
+///   vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+///
+/// For opcodes in hasUndefRegUpdate(), \p OpNum is set to 1 (the first source
+/// operand) and UndefRegClearance is returned when that operand is an undef
+/// read of a physical register. In every other case 0 is returned.
+unsigned
+X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
+                                   const TargetRegisterInfo *TRI) const {
+  if (!hasUndefRegUpdate(MI.getOpcode()))
+    return 0;
+
+  // Report the first source operand back to the caller.
+  OpNum = 1;
+
+  const MachineOperand &SrcOp = MI.getOperand(OpNum);
+  if (!SrcOp.isUndef())
+    return 0;
+  if (!TargetRegisterInfo::isPhysicalRegister(SrcOp.getReg()))
+    return 0;
+  return UndefRegClearance;
+}
+
+/// Insert a zeroing XOR in front of \p MI so that operand \p OpNum no longer
+/// carries a false dependence on its register's previous value.
+///
+/// Nothing is done when \p MI already kills the register, or when the
+/// register belongs to neither VR128 nor VR256. After the XOR is inserted
+/// the register is marked as killed by \p MI.
+void X86InstrInfo::breakPartialRegDependency(
+    MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+  const unsigned Reg = MI.getOperand(OpNum).getReg();
+
+  // If MI kills this register, the false dependence is already broken.
+  if (MI.killsRegister(Reg, TRI))
+    return;
+
+  if (X86::VR128RegClass.contains(Reg)) {
+    // These instructions are all floating point domain, so xorps is the best
+    // choice.
+    const unsigned XorOpc =
+        Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
+    MachineInstrBuilder Xor =
+        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(XorOpc), Reg);
+    Xor.addReg(Reg, RegState::Undef);
+    Xor.addReg(Reg, RegState::Undef);
+    MI.addRegisterKilled(Reg, TRI, true);
+    return;
+  }
+
+  if (!X86::VR256RegClass.contains(Reg))
+    return;
+
+  // Use vxorps to clear the full ymm register.
+  // It wants to read and write the xmm sub-register.
+  const unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+  MachineInstrBuilder Xor =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg);
+  Xor.addReg(XReg, RegState::Undef);
+  Xor.addReg(XReg, RegState::Undef);
+  Xor.addReg(Reg, RegState::ImplicitDefine);
+  MI.addRegisterKilled(Reg, TRI, true);
+}
+
+/// Fold stack slot \p FrameIndex into the operands \p Ops of \p MI.
+///
+/// Returns nullptr when fusing is disabled (NoFusing), when \p MI has a
+/// partial register update and the function is not optimized for size, when
+/// any listed operand is a sub-register def or uses sub_8bit_hi, or when
+/// \p Ops is neither a single operand nor the pair {0, 1}. The pair {0, 1} is
+/// accepted only for TESTrr, which is first rewritten to "CMP r, 0" so that
+/// just operand 0 remains to be folded. The actual fold is delegated to the
+/// operand-index overload with commuting allowed.
+MachineInstr *
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                                    ArrayRef<unsigned> Ops,
+                                    MachineBasicBlock::iterator InsertPt,
+                                    int FrameIndex, LiveIntervals *LIS) const {
+  // Check switch flag
+  if (NoFusing)
+    return nullptr;
+
+  // Unless optimizing for size, don't fold to avoid partial
+  // register update stalls
+  const bool AvoidPartialUpdate = !MF.getFunction()->optForSize() &&
+                                  hasPartialRegUpdate(MI.getOpcode());
+  if (AvoidPartialUpdate)
+    return nullptr;
+
+  // Don't fold subreg spills, or reloads that use a high subreg.
+  for (unsigned OpIdx : Ops) {
+    const MachineOperand &MO = MI.getOperand(OpIdx);
+    const unsigned SubIdx = MO.getSubReg();
+    if (SubIdx == 0)
+      continue;
+    if (MO.isDef() || SubIdx == X86::sub_8bit_hi)
+      return nullptr;
+  }
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Size = MFI.getObjectSize(FrameIndex);
+  unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
+  // If the function stack isn't realigned we don't want to fold instructions
+  // that need increased alignment.
+  if (!RI.needsStackRealignment(MF)) {
+    const unsigned StackAlign =
+        Subtarget.getFrameLowering()->getStackAlignment();
+    Alignment = std::min(Alignment, StackAlign);
+  }
+
+  const unsigned NumFoldOps = Ops.size();
+  const bool FoldsBothTestOps =
+      NumFoldOps == 2 && Ops[0] == 0 && Ops[1] == 1;
+  if (!FoldsBothTestOps && NumFoldOps != 1)
+    return nullptr;
+
+  if (FoldsBothTestOps) {
+    unsigned CmpOpc = 0;
+    unsigned RegBytes = 0;
+    switch (MI.getOpcode()) {
+    case X86::TEST8rr:
+      CmpOpc = X86::CMP8ri;
+      RegBytes = 1;
+      break;
+    case X86::TEST16rr:
+      CmpOpc = X86::CMP16ri8;
+      RegBytes = 2;
+      break;
+    case X86::TEST32rr:
+      CmpOpc = X86::CMP32ri8;
+      RegBytes = 4;
+      break;
+    case X86::TEST64rr:
+      CmpOpc = X86::CMP64ri8;
+      RegBytes = 8;
+      break;
+    default:
+      return nullptr;
+    }
+    // Check if it's safe to fold the load. If the size of the object is
+    // narrower than the load width, then it's not.
+    if (Size < RegBytes)
+      return nullptr;
+    // Change to CMPXXri r, 0 first.
+    MI.setDesc(get(CmpOpc));
+    MI.getOperand(1).ChangeToImmediate(0);
+  }
+
+  return foldMemoryOperandImpl(MF, MI, Ops[0],
+                               MachineOperand::CreateFI(FrameIndex), InsertPt,
+                               Size, Alignment, /*AllowCommute=*/true);
+}
+
+/// Check if \p LoadMI is a partial register load that we can't fold into
+/// \p UserMI because the latter uses contents that wouldn't be defined in the
+/// folded version. For instance, this transformation isn't legal:
+/// movss (%rdi), %xmm0
+/// addps %xmm0, %xmm0
+/// ->
+/// addps (%rdi), %xmm0
+///
+/// But this one is:
+/// movss (%rdi), %xmm0
+/// addss %xmm0, %xmm0
+/// ->
+/// addss (%rdi), %xmm0
+///
+/// \param LoadMI The candidate load; operand 0 is its destination register.
+/// \param UserMI The instruction the load would be folded into.
+/// \param MF Function whose register info gives the destination's class size.
+/// \returns true if the fold must be rejected; false if this check has no
+/// objection (the load is not one of the scalar loads handled here, its
+/// destination is no wider than the loaded value, or \p UserMI is one of the
+/// listed scalar operations).
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+ const MachineInstr &UserMI,
+ const MachineFunction &MF) {
+ unsigned Opc = LoadMI.getOpcode();
+ unsigned UserOpc = UserMI.getOpcode();
+ // Size of the register class of the load's destination (operand 0).
+ unsigned RegSize =
+ MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
+
+ if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) &&
+ RegSize > 4) {
+ // These instructions only load 32 bits, we can't fold them if the
+ // destination register is wider than 32 bits (4 bytes), and its user
+ // instruction isn't scalar (SS).
+ switch (UserOpc) {
+ // Whitelist of single-precision scalar users for which folding is allowed.
+ case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
+ case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int:
+ case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
+ case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
+ case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
+ case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
+ case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
+ case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
+ case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
+ case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
+ case X86::VFMADD213SSr_Int: case X86::VFNMADD213SSr_Int:
+ case X86::VFMADD231SSr_Int: case X86::VFNMADD231SSr_Int:
+ case X86::VFMSUB132SSr_Int: case X86::VFNMSUB132SSr_Int:
+ case X86::VFMSUB213SSr_Int: case X86::VFNMSUB213SSr_Int:
+ case X86::VFMSUB231SSr_Int: case X86::VFNMSUB231SSr_Int:
+ case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int:
+ case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int:
+ case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int:
+ case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
+ case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
+ case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
+ return false;
+ default:
+ // Any other user is rejected.
+ return true;
+ }
+ }
+
+ if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) &&
+ RegSize > 8) {
+ // These instructions only load 64 bits, we can't fold them if the
+ // destination register is wider than 64 bits (8 bytes), and its user
+ // instruction isn't scalar (SD).
+ switch (UserOpc) {
+ // Whitelist of double-precision scalar users for which folding is allowed.
+ case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
+ case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int:
+ case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
+ case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
+ case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
+ case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
+ case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
+ case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
+ case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
+ case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:
+ case X86::VFMADD213SDr_Int: case X86::VFNMADD213SDr_Int:
+ case X86::VFMADD231SDr_Int: case X86::VFNMADD231SDr_Int:
+ case X86::VFMSUB132SDr_Int: case X86::VFNMSUB132SDr_Int:
+ case X86::VFMSUB213SDr_Int: case X86::VFNMSUB213SDr_Int:
+ case X86::VFMSUB231SDr_Int: case X86::VFNMSUB231SDr_Int:
+ case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int:
+ case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int:
+ case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int:
+ case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
+ case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
+ case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
+ return false;
+ default:
+ // Any other user is rejected.
+ return true;
+ }
+ }
+
+ // Not one of the partial scalar loads handled above.
+ return false;
+}
+
+/// Try to fold the load \p LoadMI into operand(s) \p Ops of \p MI.
+///
+/// Loads from a stack slot are forwarded to the FrameIndex overload. Otherwise
+/// the load's alignment is determined, a TEST rr on operands {0, 1} is first
+/// rewritten as CMP ri 0, and the address operands (copied from \p LoadMI, or
+/// built for a new constant-pool entry when \p LoadMI is one of the zero /
+/// all-ones pseudos listed below) are handed to the operand-level
+/// foldMemoryOperandImpl overload.
+///
+/// \returns the result of the delegated fold, or nullptr when any of the
+/// checks below rejects the fold.
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS) const {
+
+ // TODO: Support the case where LoadMI loads a wide register, but MI
+ // only uses a subreg.
+ for (auto Op : Ops) {
+ if (MI.getOperand(Op).getSubReg())
+ return nullptr;
+ }
+
+ // If loading from a FrameIndex, fold directly from the FrameIndex.
+ unsigned NumOps = LoadMI.getDesc().getNumOperands();
+ int FrameIndex;
+ if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
+ if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
+ return nullptr;
+ return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
+ }
+
+ // Check switch flag
+ if (NoFusing) return nullptr;
+
+ // Avoid partial register update stalls unless optimizing for size.
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ return nullptr;
+
+ // Determine the alignment of the load.
+ unsigned Alignment = 0;
+ if (LoadMI.hasOneMemOperand())
+ Alignment = (*LoadMI.memoperands_begin())->getAlignment();
+ else
+ // Without exactly one memory operand, only the constant pseudos below are
+ // accepted; each gets a fixed alignment value.
+ switch (LoadMI.getOpcode()) {
+ case X86::AVX512_512_SET0:
+ case X86::AVX512_512_SETALLONES:
+ Alignment = 64;
+ break;
+ case X86::AVX2_SETALLONES:
+ case X86::AVX_SET0:
+ case X86::AVX512_256_SET0:
+ Alignment = 32;
+ break;
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::AVX512_128_SET0:
+ Alignment = 16;
+ break;
+ case X86::FsFLD0SD:
+ case X86::AVX512_FsFLD0SD:
+ Alignment = 8;
+ break;
+ case X86::FsFLD0SS:
+ case X86::AVX512_FsFLD0SS:
+ Alignment = 4;
+ break;
+ default:
+ return nullptr;
+ }
+ // Folding into both operands 0 and 1 is only handled for TEST rr.
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ switch (MI.getOpcode()) {
+ default: return nullptr;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
+ }
+ // Change to CMPXXri r, 0 first.
+ // NOTE(review): MI is rewritten in place here, before the fold is known to
+ // succeed; the later `return nullptr` paths in this function do not undo
+ // it. Confirm callers tolerate that.
+ MI.setDesc(get(NewOpc));
+ MI.getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return nullptr;
+
+ // Make sure the subregisters match.
+ // Otherwise we risk changing the size of the load.
+ if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
+ return nullptr;
+
+ // Address operands that will replace the folded register operand.
+ SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
+ switch (LoadMI.getOpcode()) {
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::AVX2_SETALLONES:
+ case X86::AVX_SET0:
+ case X86::AVX512_128_SET0:
+ case X86::AVX512_256_SET0:
+ case X86::AVX512_512_SET0:
+ case X86::AVX512_512_SETALLONES:
+ case X86::FsFLD0SD:
+ case X86::AVX512_FsFLD0SD:
+ case X86::FsFLD0SS:
+ case X86::AVX512_FsFLD0SS: {
+ // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+ // Create a constant-pool entry and operands to load from it.
+
+ // Medium and large mode can't fold loads this way.
+ if (MF.getTarget().getCodeModel() != CodeModel::Small &&
+ MF.getTarget().getCodeModel() != CodeModel::Kernel)
+ return nullptr;
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (MF.getTarget().isPositionIndependent()) {
+ if (Subtarget.is64Bit())
+ PICBase = X86::RIP;
+ else
+ // FIXME: PICBase = getGlobalBaseReg(&MF);
+ // This doesn't work for several reasons.
+ // 1. GlobalBaseReg may have been spilled.
+ // 2. It may not be live at MI.
+ return nullptr;
+ }
+
+ // Create a constant-pool entry.
+ MachineConstantPool &MCP = *MF.getConstantPool();
+ // Pick the constant's type from the pseudo: float, double, or a vector of
+ // 16, 8 or 4 i32 elements.
+ Type *Ty;
+ unsigned Opc = LoadMI.getOpcode();
+ if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
+ Ty = Type::getFloatTy(MF.getFunction()->getContext());
+ else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
+ Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+ else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
+ else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
+ Opc == X86::AVX512_256_SET0)
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
+ else
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+
+ bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
+ Opc == X86::AVX512_512_SETALLONES);
+ const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+ Constant::getNullValue(Ty);
+ unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
+
+ // Create operands to load from the constant pool entry.
+ // Pushed in order: PICBase register, immediate 1, register 0, the
+ // constant-pool index, register 0.
+ MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+ MOs.push_back(MachineOperand::CreateImm(1));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ break;
+ }
+ default: {
+ if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
+ return nullptr;
+
+ // Folding a normal load. Just copy the load's address operands.
+ // These are the last X86::AddrNumOperands operands counted by the
+ // load's descriptor.
+ MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
+ LoadMI.operands_begin() + NumOps);
+ break;
+ }
+ }
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
+ /*Size=*/0, Alignment, /*AllowCommute=*/true);
+}
+
+/// Split the memory-folded instruction \p MI into separate instructions that
+/// work on register \p Reg: an optional load, the register-form operation
+/// looked up in MemOp2RegOpTable, and an optional store. The new instructions
+/// are appended to \p NewMIs in that order.
+///
+/// \param UnfoldLoad Request that the folded load be split out.
+/// \param UnfoldStore Request that the folded store be split out.
+/// \returns false if \p MI has no table entry, if a requested load/store is
+/// not actually folded in \p MI, or if the VR128 check below rejects the
+/// unfold; true once the new instructions have been created.
+bool X86InstrInfo::unfoldMemoryOperand(
+ MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
+ bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
+ auto I = MemOp2RegOpTable.find(MI.getOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ // Table entry: register-form opcode plus a flags word holding the operand
+ // index and the folded-load / folded-store bits.
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & TB_INDEX_MASK;
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ if (UnfoldLoad && !FoldedLoad)
+ return false;
+ UnfoldLoad &= FoldedLoad;
+ if (UnfoldStore && !FoldedStore)
+ return false;
+ UnfoldStore &= FoldedStore;
+
+ const MCInstrDesc &MCID = get(Opc);
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ // TODO: Check if 32-byte or greater accesses are slow too?
+ if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
+ Subtarget.isUnalignedMem16Slow())
+ // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
+ // conservatively assume the address is unaligned. That's bad for
+ // performance.
+ return false;
+ // Partition MI's operands: the X86::AddrNumOperands address operands
+ // starting at Index, implicit register operands, and the remaining
+ // operands before / after Index.
+ SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
+ SmallVector<MachineOperand,2> BeforeOps;
+ SmallVector<MachineOperand,2> AfterOps;
+ SmallVector<MachineOperand,4> ImpOps;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (i >= Index && i < Index + X86::AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (Op.isReg() && Op.isImplicit())
+ ImpOps.push_back(Op);
+ else if (i < Index)
+ BeforeOps.push_back(Op);
+ else if (i > Index)
+ AfterOps.push_back(Op);
+ }
+
+ // Emit the load instruction.
+ if (UnfoldLoad) {
+ std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
+ if (UnfoldStore) {
+ // Address operands cannot be marked isKill.
+ // (The store emitted further down reuses the same address operands.)
+ for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
+ MachineOperand &MO = NewMIs[0]->getOperand(i);
+ if (MO.isReg())
+ MO.setIsKill(false);
+ }
+ }
+ }
+
+ // Emit the data processing instruction.
+ MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, DataMI);
+
+ // Operand order: Reg as def (if a store was folded), the operands before
+ // Index, Reg as use (if a load was folded), the operands after, and finally
+ // the implicit operands with their original flags.
+ if (FoldedStore)
+ MIB.addReg(Reg, RegState::Define);
+ for (MachineOperand &BeforeOp : BeforeOps)
+ MIB.addOperand(BeforeOp);
+ if (FoldedLoad)
+ MIB.addReg(Reg);
+ for (MachineOperand &AfterOp : AfterOps)
+ MIB.addOperand(AfterOp);
+ for (MachineOperand &ImpOp : ImpOps) {
+ MIB.addReg(ImpOp.getReg(),
+ getDefRegState(ImpOp.isDef()) |
+ RegState::Implicit |
+ getKillRegState(ImpOp.isKill()) |
+ getDeadRegState(ImpOp.isDead()) |
+ getUndefRegState(ImpOp.isUndef()));
+ }
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri: {
+ MachineOperand &MO0 = DataMI->getOperand(0);
+ MachineOperand &MO1 = DataMI->getOperand(1);
+ // Only a compare against immediate zero is turned back into a TEST.
+ if (MO1.getImm() == 0) {
+ unsigned NewOpc;
+ switch (DataMI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMP64ri8:
+ case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+ case X86::CMP32ri8:
+ case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
+ case X86::CMP16ri8:
+ case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
+ case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
+ }
+ DataMI->setDesc(get(NewOpc));
+ MO1.ChangeToRegister(MO0.getReg(), false);
+ }
+ }
+ }
+ NewMIs.push_back(DataMI);
+
+ // Emit the store instruction.
+ if (UnfoldStore) {
+ // The stored value uses the register class of the new opcode's operand 0.
+ const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
+ std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
+ }
+
+ return true;
+}
+
+/// SelectionDAG variant of memory-operand unfolding: split the machine node
+/// \p N, whose opcode has an entry in MemOp2RegOpTable, into an optional load
+/// node, the register-form operation, and an optional store node. The new
+/// nodes are appended to \p NewNodes in that order.
+///
+/// \returns false if \p N is not a machine node, has no table entry, or if
+/// unfolding would introduce a slow unaligned VR128 load or store; true
+/// otherwise.
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                                  SmallVectorImpl<SDNode*> &NewNodes) const {
+  if (!N->isMachineOpcode())
+    return false;
+
+  auto I = MemOp2RegOpTable.find(N->getMachineOpcode());
+  if (I == MemOp2RegOpTable.end())
+    return false;
+  unsigned Opc = I->second.first;
+  unsigned Index = I->second.second & TB_INDEX_MASK;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  const MCInstrDesc &MCID = get(Opc);
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+  unsigned NumDefs = MCID.NumDefs;
+  std::vector<SDValue> AddrOps;
+  std::vector<SDValue> BeforeOps;
+  std::vector<SDValue> AfterOps;
+  SDLoc dl(N);
+  unsigned NumOps = N->getNumOperands();
+  // Partition every operand except the last one (the chain). SDNode operands
+  // do not include the defs, hence the Index-NumDefs adjustment.
+  for (unsigned i = 0; i != NumOps-1; ++i) {
+    SDValue Op = N->getOperand(i);
+    if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
+      AddrOps.push_back(Op);
+    else if (i < Index-NumDefs)
+      BeforeOps.push_back(Op);
+    else if (i > Index-NumDefs)
+      AfterOps.push_back(Op);
+  }
+  SDValue Chain = N->getOperand(NumOps-1);
+  AddrOps.push_back(Chain);
+
+  // Emit the load instruction.
+  SDNode *Load = nullptr;
+  if (FoldedLoad) {
+    EVT VT = *RC->vt_begin();
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                            cast<MachineSDNode>(N)->memoperands_end());
+    // Only look at the first extracted memory operand if the range is
+    // non-empty; dereferencing the begin iterator of an empty range reads
+    // past the end of the array.
+    const bool HasLoadMMO =
+        MMOs.first != MMOs.second && *MMOs.first != nullptr;
+    if (!HasLoadMMO &&
+        RC == &X86::VR128RegClass &&
+        Subtarget.isUnalignedMem16Slow())
+      // Do not introduce a slow unaligned load.
+      return false;
+    // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+    // memory access is slow above.
+    unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+    bool isAligned = HasLoadMMO &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
+    Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
+                              VT, MVT::Other, AddrOps);
+    NewNodes.push_back(Load);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  // Emit the data processing instruction.
+  std::vector<EVT> VTs;
+  const TargetRegisterClass *DstRC = nullptr;
+  if (MCID.getNumDefs() > 0) {
+    DstRC = getRegClass(MCID, 0, &RI, MF);
+    VTs.push_back(*DstRC->vt_begin());
+  }
+  // Carry over N's remaining result types, skipping the chain (MVT::Other)
+  // and the leading results already covered by the new opcode's defs.
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
+      VTs.push_back(VT);
+  }
+  if (Load)
+    BeforeOps.push_back(SDValue(Load, 0));
+  BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
+  SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
+  NewNodes.push_back(NewNode);
+
+  // Emit the store instruction.
+  if (FoldedStore) {
+    // Replace the trailing chain with (value to store, chain).
+    AddrOps.pop_back();
+    AddrOps.push_back(SDValue(NewNode, 0));
+    AddrOps.push_back(Chain);
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                             cast<MachineSDNode>(N)->memoperands_end());
+    // Same empty-range guard as for the load above.
+    const bool HasStoreMMO =
+        MMOs.first != MMOs.second && *MMOs.first != nullptr;
+    if (!HasStoreMMO &&
+        RC == &X86::VR128RegClass &&
+        Subtarget.isUnalignedMem16Slow())
+      // Do not introduce a slow unaligned store.
+      return false;
+    // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+    // memory access is slow above.
+    unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+    bool isAligned = HasStoreMMO &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
+    SDNode *Store =
+        DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
+                           dl, MVT::Other, AddrOps);
+    NewNodes.push_back(Store);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  return true;
+}
+
+/// Look up the register-form opcode that \p Opc unfolds to.
+///
+/// \param UnfoldLoad Require that \p Opc has a folded load.
+/// \param UnfoldStore Require that \p Opc has a folded store.
+/// \param LoadRegIndex If non-null, receives the operand index recorded in the
+/// table entry.
+/// \returns the unfolded opcode, or 0 if \p Opc has no table entry or lacks a
+/// requested folded load/store.
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                                  bool UnfoldLoad, bool UnfoldStore,
+                                                  unsigned *LoadRegIndex) const {
+  const auto Entry = MemOp2RegOpTable.find(Opc);
+  if (Entry == MemOp2RegOpTable.end())
+    return 0;
+
+  const auto Flags = Entry->second.second;
+  const bool HasFoldedLoad = Flags & TB_FOLDED_LOAD;
+  const bool HasFoldedStore = Flags & TB_FOLDED_STORE;
+
+  // The caller asked for something this entry cannot provide.
+  if ((UnfoldLoad && !HasFoldedLoad) || (UnfoldStore && !HasFoldedStore))
+    return 0;
+
+  if (LoadRegIndex)
+    *LoadRegIndex = Flags & TB_INDEX_MASK;
+  return Entry->second.first;
+}
+
+/// Return true if \p Opc is one of the load opcodes that
+/// X86InstrInfo::areLoadsFromSameBasePtr is willing to compare.
+static bool isBasePtrComparableLoadOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVUPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVUPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVUPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+  // AVX512 load instructions
+  case X86::VMOVSSZrm:
+  case X86::VMOVSDZrm:
+  case X86::VMOVAPSZ128rm:
+  case X86::VMOVUPSZ128rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVAPDZ128rm:
+  case X86::VMOVUPDZ128rm:
+  case X86::VMOVDQU8Z128rm:
+  case X86::VMOVDQU16Z128rm:
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQU64Z128rm:
+  case X86::VMOVAPSZ256rm:
+  case X86::VMOVUPSZ256rm:
+  case X86::VMOVAPSZ256rm_NOVLX:
+  case X86::VMOVUPSZ256rm_NOVLX:
+  case X86::VMOVAPDZ256rm:
+  case X86::VMOVUPDZ256rm:
+  case X86::VMOVDQU8Z256rm:
+  case X86::VMOVDQU16Z256rm:
+  case X86::VMOVDQA32Z256rm:
+  case X86::VMOVDQU32Z256rm:
+  case X86::VMOVDQA64Z256rm:
+  case X86::VMOVDQU64Z256rm:
+  case X86::VMOVAPSZrm:
+  case X86::VMOVUPSZrm:
+  case X86::VMOVAPDZrm:
+  case X86::VMOVUPDZrm:
+  case X86::VMOVDQU8Zrm:
+  case X86::VMOVDQU16Zrm:
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQU64Zrm:
+  case X86::KMOVBkm:
+  case X86::KMOVWkm:
+  case X86::KMOVDkm:
+  case X86::KMOVQkm:
+    return true;
+  }
+}
+
+/// Determine whether the machine load nodes \p Load1 and \p Load2 address
+/// memory through the same base, differing only in a constant displacement.
+///
+/// Both nodes must be machine nodes with one of the supported load opcodes.
+/// Operands 0, 4 and 5 must be identical, operands 1 (scale) and 2 (index)
+/// must be identical with the scale equal to 1, and both displacements
+/// (operand 3) must be constants.
+///
+/// \param Offset1 Set to the displacement of \p Load1 on success.
+/// \param Offset2 Set to the displacement of \p Load2 on success.
+/// \returns true on success; false otherwise, leaving the offsets untouched.
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                      int64_t &Offset1, int64_t &Offset2) const {
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+  // Both loads must come from the same list of supported opcodes.
+  if (!isBasePtrComparableLoadOpcode(Load1->getMachineOpcode()) ||
+      !isBasePtrComparableLoadOpcode(Load2->getMachineOpcode()))
+    return false;
+
+  // Check if chain operands and base addresses match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(5) != Load2->getOperand(5))
+    return false;
+  // Segment operands should match as well.
+  if (Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+  // Scale should be 1, Index should be Reg0.
+  // (The index operands are only compared for equality here.)
+  if (Load1->getOperand(1) == Load2->getOperand(1) &&
+      Load1->getOperand(2) == Load2->getOperand(2)) {
+    if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+      return false;
+
+    // Now let's examine the displacements.
+    if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+        isa<ConstantSDNode>(Load2->getOperand(3))) {
+      Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+      Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Heuristic deciding whether \p Load2 should be scheduled next to \p Load1.
+///
+/// Requires \p Offset2 > \p Offset1 (asserted). The loads are rejected when
+/// the offset distance divided by 8 exceeds 64, when their opcodes differ, or
+/// when the opcode is one of the x87 / MMX loads listed below. Otherwise the
+/// answer depends on \p NumLoads and the value type of \p Load1.
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                           int64_t Offset1, int64_t Offset2,
+                                           unsigned NumLoads) const {
+  assert(Offset2 > Offset1);
+  const int64_t Distance = Offset2 - Offset1;
+  if (Distance / 8 > 64)
+    return false;
+
+  const unsigned FirstOpc = Load1->getMachineOpcode();
+  const unsigned SecondOpc = Load2->getMachineOpcode();
+  if (FirstOpc != SecondOpc)
+    return false; // FIXME: overly conservative?
+
+  // These load opcodes are never scheduled near each other.
+  switch (FirstOpc) {
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    return false;
+  default:
+    break;
+  }
+
+  switch (Load1->getValueType(0).getSimpleVT().SimpleTy) {
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+  case MVT::f32:
+  case MVT::f64:
+    // Scalar types: only accepted while NumLoads is still zero.
+    return NumLoads == 0;
+  default:
+    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+    // have 16 of them to play with.
+    return Subtarget.is64Bit() ? NumLoads < 3 : NumLoads == 0;
+  }
+}
+
+/// Decide whether \p First and \p Second should be kept adjacent so that they
+/// can macro-fuse.
+///
+/// \p Second must be one of the conditional jumps listed below, which sorts it
+/// into a FuseKind. \p First must be a TEST/AND, CMP/ADD/SUB or INC/DEC form:
+/// TEST/AND pair with every listed jump, CMP/ADD/SUB with the FuseCmp and
+/// FuseInc jumps, and INC/DEC only with the FuseInc jumps.
+///
+/// \returns true if the pair is considered fusible, false otherwise (including
+/// when the subtarget lacks AVX).
+bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
+ const MachineInstr &Second) const {
+ // Check if this processor supports macro-fusion. Since this is a minor
+ // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+ // proxy for SandyBridge+.
+ if (!Subtarget.hasAVX())
+ return false;
+
+ // Classification of the jump in Second, named after the most restrictive
+ // kind of First instruction it can still pair with.
+ enum {
+ FuseTest,
+ FuseCmp,
+ FuseInc
+ } FuseKind;
+
+ switch (Second.getOpcode()) {
+ default:
+ return false;
+ case X86::JE_1:
+ case X86::JNE_1:
+ case X86::JL_1:
+ case X86::JLE_1:
+ case X86::JG_1:
+ case X86::JGE_1:
+ FuseKind = FuseInc;
+ break;
+ case X86::JB_1:
+ case X86::JBE_1:
+ case X86::JA_1:
+ case X86::JAE_1:
+ FuseKind = FuseCmp;
+ break;
+ case X86::JS_1:
+ case X86::JNS_1:
+ case X86::JP_1:
+ case X86::JNP_1:
+ case X86::JO_1:
+ case X86::JNO_1:
+ FuseKind = FuseTest;
+ break;
+ }
+ switch (First.getOpcode()) {
+ default:
+ return false;
+ // TEST and AND forms: accepted with any of the jumps classified above.
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ case X86::TEST8ri:
+ case X86::TEST16ri:
+ case X86::TEST32ri:
+ case X86::TEST32i32:
+ case X86::TEST64i32:
+ case X86::TEST64ri32:
+ case X86::TEST8rm:
+ case X86::TEST16rm:
+ case X86::TEST32rm:
+ case X86::TEST64rm:
+ case X86::TEST8ri_NOREX:
+ case X86::AND16i16:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND16rm:
+ case X86::AND16rr:
+ case X86::AND32i32:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND32rm:
+ case X86::AND32rr:
+ case X86::AND64i32:
+ case X86::AND64ri32:
+ case X86::AND64ri8:
+ case X86::AND64rm:
+ case X86::AND64rr:
+ case X86::AND8i8:
+ case X86::AND8ri:
+ case X86::AND8rm:
+ case X86::AND8rr:
+ return true;
+ // CMP, ADD and SUB forms: accepted with FuseCmp and FuseInc jumps only.
+ case X86::CMP16i16:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP16rm:
+ case X86::CMP16rr:
+ case X86::CMP32i32:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP32rm:
+ case X86::CMP32rr:
+ case X86::CMP64i32:
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP64rm:
+ case X86::CMP64rr:
+ case X86::CMP8i8:
+ case X86::CMP8ri:
+ case X86::CMP8rm:
+ case X86::CMP8rr:
+ case X86::ADD16i16:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri8_DB:
+ case X86::ADD16ri_DB:
+ case X86::ADD16rm:
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB:
+ case X86::ADD32i32:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri8_DB:
+ case X86::ADD32ri_DB:
+ case X86::ADD32rm:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB:
+ case X86::ADD64i32:
+ case X86::ADD64ri32:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8:
+ case X86::ADD64ri8_DB:
+ case X86::ADD64rm:
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD8i8:
+ case X86::ADD8mi:
+ case X86::ADD8mr:
+ case X86::ADD8ri:
+ case X86::ADD8rm:
+ case X86::ADD8rr:
+ case X86::SUB16i16:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB16rm:
+ case X86::SUB16rr:
+ case X86::SUB32i32:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB32rm:
+ case X86::SUB32rr:
+ case X86::SUB64i32:
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB64rm:
+ case X86::SUB64rr:
+ case X86::SUB8i8:
+ case X86::SUB8ri:
+ case X86::SUB8rm:
+ case X86::SUB8rr:
+ return FuseKind == FuseCmp || FuseKind == FuseInc;
+ // INC and DEC register forms: accepted with FuseInc jumps only.
+ case X86::INC16r:
+ case X86::INC32r:
+ case X86::INC64r:
+ case X86::INC8r:
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::DEC64r:
+ case X86::DEC8r:
+ return FuseKind == FuseInc;
+ }
+}
+
+/// Invert the single-operand X86 branch condition stored in \p Cond in place.
+///
+/// \returns false always.
+bool X86InstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+  MachineOperand &CondOp = Cond[0];
+  const auto Current = static_cast<X86::CondCode>(CondOp.getImm());
+  CondOp.setImm(GetOppositeBranchCondition(Current));
+  return false;
+}
+
+/// Report whether definitions of registers in class \p RC may be moved.
+///
+/// \returns false for the CCR class and the three RFP (x87) classes, true for
+/// every other register class.
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  // FIXME: Return false for x87 stack register classes for now. We can't
+  // allow any loads of these registers before FpGet_ST0_80.
+  return RC != &X86::CCRRegClass && RC != &X86::RFP32RegClass &&
+         RC != &X86::RFP64RegClass && RC != &X86::RFP80RegClass;
+}
+
+/// Return the virtual register that holds the global base register value for
+/// \p MF, creating and recording a new one on first use.
+///
+/// Must not be called for 64-bit subtargets (asserted).
+///
+/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  assert(!Subtarget.is64Bit() &&
+         "X86-64 PIC uses RIP relative addressing");
+
+  X86MachineFunctionInfo *FuncInfo = MF->getInfo<X86MachineFunctionInfo>();
+  if (unsigned Existing = FuncInfo->getGlobalBaseReg())
+    return Existing;
+
+  // Create the register. The code to initialize it is inserted
+  // later, by the CGBR pass (below).
+  const unsigned NewBaseReg =
+      MF->getRegInfo().createVirtualRegister(&X86::GR32_NOSPRegClass);
+  FuncInfo->setGlobalBaseReg(NewBaseReg);
+  return NewBaseReg;
+}
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
+//
+// Each row lists the same operation in three forms, one per column
+// (PackedSingle, PackedDouble, PackedInt). Rows that repeat the PackedSingle
+// opcode in the PackedDouble column have no distinct double form listed.
+static const uint16_t ReplaceableInstrs[][3] = {
+  //PackedSingle PackedDouble PackedInt
+  { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr },
+  { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm },
+  { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
+  { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
+  { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
+  { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
+  { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
+  { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
+  { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
+  { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
+  { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
+  { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
+  { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm },
+  { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr },
+  { X86::ORPSrm, X86::ORPDrm, X86::PORrm },
+  { X86::ORPSrr, X86::ORPDrr, X86::PORrr },
+  { X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
+  { X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
+  // AVX 128-bit support
+  { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr },
+  { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm },
+  { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
+  { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
+  { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
+  { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
+  { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
+  { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
+  { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
+  { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
+  { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
+  { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
+  { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm },
+  { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr },
+  { X86::VORPSrm, X86::VORPDrm, X86::VPORrm },
+  { X86::VORPSrr, X86::VORPDrr, X86::VPORrr },
+  { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
+  { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
+  // AVX 256-bit support
+  { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr },
+  { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm },
+  { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
+  { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
+  { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
+  { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
+  // AVX512 support
+  { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
+  // Non-temporal stores: one row per vector width (128, 256, 512). The 256-bit
+  // row previously repeated the 128-bit opcodes.
+  { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+  { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr },
+  { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr },
+  { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
+  { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
+  { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm },
+  { X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
+  { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
+  { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
+  { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
+  { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
+  { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr },
+  { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm },
+  { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
+  { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
+  { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
+  { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm },
+};
+
+// Additional table in the same three-column layout (PackedSingle,
+// PackedDouble, PackedInt), holding the 256-bit logic ops and the 128-bit lane
+// extract / insert / permute / broadcast opcodes. Rows that repeat the
+// PackedSingle opcode in the PackedDouble column list no distinct double form.
+// NOTE(review): going by the table name, the PackedInt column presumably needs
+// AVX2 - confirm against the code that consults this table.
+static const uint16_t ReplaceableInstrsAVX2[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm },
+ { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr },
+ { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm },
+ { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr },
+ { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm },
+ { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr },
+ { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm },
+ { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr },
+ { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+ { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
+ { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
+ { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm },
+ { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
+ { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
+ { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
+ { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
+ { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
+};
+
+static const uint16_t ReplaceableInstrsAVX512[][4] = { // AVX-512 vector move opcodes that are equivalent across execution domains; searched with lookupAVX512().
+ // Two integer columns for 64-bit and 32-bit elements: index 2 holds the "64" opcode, index 3 the "32" opcode.
+ // Columns, in order: PackedSingle, PackedDouble, PackedInt (64-bit elements), PackedInt (32-bit elements).
+ { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr },
+ { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr },
+ { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr },
+ { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm },
+ { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr },
+ { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr },
+ { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr },
+ { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm },
+ { X86::VMOVAPSZmr, X86::VMOVAPDZmr, X86::VMOVDQA64Zmr, X86::VMOVDQA32Zmr },
+ { X86::VMOVAPSZrm, X86::VMOVAPDZrm, X86::VMOVDQA64Zrm, X86::VMOVDQA32Zrm },
+ { X86::VMOVAPSZrr, X86::VMOVAPDZrr, X86::VMOVDQA64Zrr, X86::VMOVDQA32Zrr },
+ { X86::VMOVUPSZmr, X86::VMOVUPDZmr, X86::VMOVDQU64Zmr, X86::VMOVDQU32Zmr },
+ { X86::VMOVUPSZrm, X86::VMOVUPDZrm, X86::VMOVDQU64Zrm, X86::VMOVDQU32Zrm },
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQ[][4] = { // Unmasked AVX-512 logic opcodes; getExecutionDomain() reports only the integer domain as valid for these unless the subtarget has DQI.
+ // Two integer columns for 64-bit and 32-bit elements: index 2 holds the "Q" opcode, index 3 the "D" opcode.
+ // Columns, in order: PackedSingle, PackedDouble, PackedInt (Q form), PackedInt (D form).
+ { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
+ { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
+ { X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
+ { X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
+ { X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm },
+ { X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr },
+ { X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
+ { X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
+ { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
+ { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
+ { X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
+ { X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
+ { X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm },
+ { X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr },
+ { X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
+ { X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
+ { X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm },
+ { X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr },
+ { X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm },
+ { X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr },
+ { X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm },
+ { X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr },
+ { X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm },
+ { X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr },
+};
+
+// Masked and broadcast-load forms of the AVX-512 logic instructions.
+//
+// Column layout (index = execution domain - 1, plus one extra integer column):
+//   [0] PackedSingle, [1] PackedDouble,
+//   [2] PackedInt with 64-bit elements (Q), [3] PackedInt with 32-bit elements (D).
+//
+// getExecutionDomain() only pairs a row's PS opcode with its D opcode and its
+// PD opcode with its Q opcode (valid-domain masks 0xa and 0xc), so a domain
+// change never alters the element size that the mask or broadcast applies to.
+//
+// Each opcode appears in exactly one row: lookupAVX512() returns the first
+// matching row, so a repeated row would never be reached.
+static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
+  // Two integer columns for 64-bit and 32-bit elements.
+  //PackedSingle          PackedDouble
+  //PackedInt             PackedInt
+  // 128-bit masked (k) and zero-masked (kz) forms.
+  { X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk,
+    X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk },
+  { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
+    X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
+  { X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk,
+    X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk },
+  { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
+    X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
+  { X86::VANDPSZ128rmk, X86::VANDPDZ128rmk,
+    X86::VPANDQZ128rmk, X86::VPANDDZ128rmk },
+  { X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz,
+    X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz },
+  { X86::VANDPSZ128rrk, X86::VANDPDZ128rrk,
+    X86::VPANDQZ128rrk, X86::VPANDDZ128rrk },
+  { X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz,
+    X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz },
+  { X86::VORPSZ128rmk, X86::VORPDZ128rmk,
+    X86::VPORQZ128rmk, X86::VPORDZ128rmk },
+  { X86::VORPSZ128rmkz, X86::VORPDZ128rmkz,
+    X86::VPORQZ128rmkz, X86::VPORDZ128rmkz },
+  { X86::VORPSZ128rrk, X86::VORPDZ128rrk,
+    X86::VPORQZ128rrk, X86::VPORDZ128rrk },
+  { X86::VORPSZ128rrkz, X86::VORPDZ128rrkz,
+    X86::VPORQZ128rrkz, X86::VPORDZ128rrkz },
+  { X86::VXORPSZ128rmk, X86::VXORPDZ128rmk,
+    X86::VPXORQZ128rmk, X86::VPXORDZ128rmk },
+  { X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz,
+    X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz },
+  { X86::VXORPSZ128rrk, X86::VXORPDZ128rrk,
+    X86::VPXORQZ128rrk, X86::VPXORDZ128rrk },
+  { X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz,
+    X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz },
+  // 256-bit masked (k) and zero-masked (kz) forms.
+  { X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk,
+    X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk },
+  { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
+    X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
+  { X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk,
+    X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk },
+  { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
+    X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
+  { X86::VANDPSZ256rmk, X86::VANDPDZ256rmk,
+    X86::VPANDQZ256rmk, X86::VPANDDZ256rmk },
+  { X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz,
+    X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz },
+  { X86::VANDPSZ256rrk, X86::VANDPDZ256rrk,
+    X86::VPANDQZ256rrk, X86::VPANDDZ256rrk },
+  { X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz,
+    X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz },
+  { X86::VORPSZ256rmk, X86::VORPDZ256rmk,
+    X86::VPORQZ256rmk, X86::VPORDZ256rmk },
+  { X86::VORPSZ256rmkz, X86::VORPDZ256rmkz,
+    X86::VPORQZ256rmkz, X86::VPORDZ256rmkz },
+  { X86::VORPSZ256rrk, X86::VORPDZ256rrk,
+    X86::VPORQZ256rrk, X86::VPORDZ256rrk },
+  { X86::VORPSZ256rrkz, X86::VORPDZ256rrkz,
+    X86::VPORQZ256rrkz, X86::VPORDZ256rrkz },
+  { X86::VXORPSZ256rmk, X86::VXORPDZ256rmk,
+    X86::VPXORQZ256rmk, X86::VPXORDZ256rmk },
+  { X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz,
+    X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz },
+  { X86::VXORPSZ256rrk, X86::VXORPDZ256rrk,
+    X86::VPXORQZ256rrk, X86::VPXORDZ256rrk },
+  { X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz,
+    X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz },
+  // Z (no 128/256 suffix) masked (k) and zero-masked (kz) forms.
+  { X86::VANDNPSZrmk, X86::VANDNPDZrmk,
+    X86::VPANDNQZrmk, X86::VPANDNDZrmk },
+  { X86::VANDNPSZrmkz, X86::VANDNPDZrmkz,
+    X86::VPANDNQZrmkz, X86::VPANDNDZrmkz },
+  { X86::VANDNPSZrrk, X86::VANDNPDZrrk,
+    X86::VPANDNQZrrk, X86::VPANDNDZrrk },
+  { X86::VANDNPSZrrkz, X86::VANDNPDZrrkz,
+    X86::VPANDNQZrrkz, X86::VPANDNDZrrkz },
+  { X86::VANDPSZrmk, X86::VANDPDZrmk,
+    X86::VPANDQZrmk, X86::VPANDDZrmk },
+  { X86::VANDPSZrmkz, X86::VANDPDZrmkz,
+    X86::VPANDQZrmkz, X86::VPANDDZrmkz },
+  { X86::VANDPSZrrk, X86::VANDPDZrrk,
+    X86::VPANDQZrrk, X86::VPANDDZrrk },
+  { X86::VANDPSZrrkz, X86::VANDPDZrrkz,
+    X86::VPANDQZrrkz, X86::VPANDDZrrkz },
+  { X86::VORPSZrmk, X86::VORPDZrmk,
+    X86::VPORQZrmk, X86::VPORDZrmk },
+  { X86::VORPSZrmkz, X86::VORPDZrmkz,
+    X86::VPORQZrmkz, X86::VPORDZrmkz },
+  { X86::VORPSZrrk, X86::VORPDZrrk,
+    X86::VPORQZrrk, X86::VPORDZrrk },
+  { X86::VORPSZrrkz, X86::VORPDZrrkz,
+    X86::VPORQZrrkz, X86::VPORDZrrkz },
+  { X86::VXORPSZrmk, X86::VXORPDZrmk,
+    X86::VPXORQZrmk, X86::VPXORDZrmk },
+  { X86::VXORPSZrmkz, X86::VXORPDZrmkz,
+    X86::VPXORQZrmkz, X86::VPXORDZrmkz },
+  { X86::VXORPSZrrk, X86::VXORPDZrrk,
+    X86::VPXORQZrrk, X86::VPXORDZrrk },
+  { X86::VXORPSZrrkz, X86::VXORPDZrrkz,
+    X86::VPXORQZrrkz, X86::VPXORDZrrkz },
+  // Broadcast loads can be handled the same as masked operations to avoid
+  // changing element size.
+  { X86::VANDNPSZ128rmb, X86::VANDNPDZ128rmb,
+    X86::VPANDNQZ128rmb, X86::VPANDNDZ128rmb },
+  { X86::VANDPSZ128rmb, X86::VANDPDZ128rmb,
+    X86::VPANDQZ128rmb, X86::VPANDDZ128rmb },
+  { X86::VORPSZ128rmb, X86::VORPDZ128rmb,
+    X86::VPORQZ128rmb, X86::VPORDZ128rmb },
+  { X86::VXORPSZ128rmb, X86::VXORPDZ128rmb,
+    X86::VPXORQZ128rmb, X86::VPXORDZ128rmb },
+  { X86::VANDNPSZ256rmb, X86::VANDNPDZ256rmb,
+    X86::VPANDNQZ256rmb, X86::VPANDNDZ256rmb },
+  { X86::VANDPSZ256rmb, X86::VANDPDZ256rmb,
+    X86::VPANDQZ256rmb, X86::VPANDDZ256rmb },
+  { X86::VORPSZ256rmb, X86::VORPDZ256rmb,
+    X86::VPORQZ256rmb, X86::VPORDZ256rmb },
+  { X86::VXORPSZ256rmb, X86::VXORPDZ256rmb,
+    X86::VPXORQZ256rmb, X86::VPXORDZ256rmb },
+  { X86::VANDNPSZrmb, X86::VANDNPDZrmb,
+    X86::VPANDNQZrmb, X86::VPANDNDZrmb },
+  { X86::VANDPSZrmb, X86::VANDPDZrmb,
+    X86::VPANDQZrmb, X86::VPANDDZrmb },
+  { X86::VORPSZrmb, X86::VORPDZrmb,
+    X86::VPORQZrmb, X86::VPORDZrmb },
+  { X86::VXORPSZrmb, X86::VXORPDZrmb,
+    X86::VPXORQZrmb, X86::VPXORDZrmb },
+  // Masked broadcast loads.
+  { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk,
+    X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk },
+  { X86::VANDPSZ128rmbk, X86::VANDPDZ128rmbk,
+    X86::VPANDQZ128rmbk, X86::VPANDDZ128rmbk },
+  { X86::VORPSZ128rmbk, X86::VORPDZ128rmbk,
+    X86::VPORQZ128rmbk, X86::VPORDZ128rmbk },
+  { X86::VXORPSZ128rmbk, X86::VXORPDZ128rmbk,
+    X86::VPXORQZ128rmbk, X86::VPXORDZ128rmbk },
+  { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk,
+    X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk },
+  { X86::VANDPSZ256rmbk, X86::VANDPDZ256rmbk,
+    X86::VPANDQZ256rmbk, X86::VPANDDZ256rmbk },
+  { X86::VORPSZ256rmbk, X86::VORPDZ256rmbk,
+    X86::VPORQZ256rmbk, X86::VPORDZ256rmbk },
+  { X86::VXORPSZ256rmbk, X86::VXORPDZ256rmbk,
+    X86::VPXORQZ256rmbk, X86::VPXORDZ256rmbk },
+  { X86::VANDNPSZrmbk, X86::VANDNPDZrmbk,
+    X86::VPANDNQZrmbk, X86::VPANDNDZrmbk },
+  { X86::VANDPSZrmbk, X86::VANDPDZrmbk,
+    X86::VPANDQZrmbk, X86::VPANDDZrmbk },
+  { X86::VORPSZrmbk, X86::VORPDZrmbk,
+    X86::VPORQZrmbk, X86::VPORDZrmbk },
+  { X86::VXORPSZrmbk, X86::VXORPDZrmbk,
+    X86::VPXORQZrmbk, X86::VPXORDZrmbk },
+  // Zero-masked broadcast loads.
+  { X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz,
+    X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz},
+  { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz,
+    X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz },
+  { X86::VORPSZ128rmbkz, X86::VORPDZ128rmbkz,
+    X86::VPORQZ128rmbkz, X86::VPORDZ128rmbkz },
+  { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz,
+    X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz },
+  { X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz,
+    X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz},
+  { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz,
+    X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz },
+  { X86::VORPSZ256rmbkz, X86::VORPDZ256rmbkz,
+    X86::VPORQZ256rmbkz, X86::VPORDZ256rmbkz },
+  { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz,
+    X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz },
+  { X86::VANDNPSZrmbkz, X86::VANDNPDZrmbkz,
+    X86::VPANDNQZrmbkz, X86::VPANDNDZrmbkz },
+  { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz,
+    X86::VPANDQZrmbkz, X86::VPANDDZrmbkz },
+  { X86::VORPSZrmbkz, X86::VORPDZrmbkz,
+    X86::VPORQZrmbkz, X86::VPORDZrmbkz },
+  { X86::VXORPSZrmbkz, X86::VXORPDZrmbkz,
+    X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
+};
+
+// FIXME: Some shuffle and unpack instructions have equivalents in different
+// domains, but they require a bit more work than just switching opcodes.
+
+/// Search a three-column replacement table for \p opcode.
+///
+/// Only the column belonging to \p domain (1-based, so column domain - 1) is
+/// inspected. Returns a pointer to the first matching row, or nullptr when no
+/// row holds \p opcode in that column.
+static const uint16_t *lookup(unsigned opcode, unsigned domain,
+                              ArrayRef<uint16_t[3]> Table) {
+  const unsigned Column = domain - 1;
+  for (const uint16_t (&Entry)[3] : Table) {
+    if (Entry[Column] != opcode)
+      continue;
+    return Entry;
+  }
+  return nullptr;
+}
+
+/// Search a four-column AVX-512 replacement table for \p opcode.
+///
+/// The column belonging to \p domain (column domain - 1) is always checked.
+/// For the integer domain (3) the extra integer column at index 3 is checked
+/// as well. Returns the first matching row, or nullptr if there is none.
+static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
+                                    ArrayRef<uint16_t[4]> Table) {
+  const unsigned Column = domain - 1;
+  const bool IsIntDomain = domain == 3;
+  for (const uint16_t (&Entry)[4] : Table) {
+    const bool InDomainColumn = Entry[Column] == opcode;
+    const bool InExtraIntColumn = IsIntDomain && Entry[3] == opcode;
+    if (InDomainColumn || InExtraIntColumn)
+      return Entry;
+  }
+  return nullptr;
+}
+
+/// Report the execution domain \p MI currently belongs to and the set of
+/// domains it could be switched to.
+///
+/// \returns {current domain, valid-domain mask}. The current domain is the
+/// two-bit SSEDomain field of the instruction's TSFlags. The mask is 0 when
+/// the instruction has no domain or is not found in any replacement table;
+/// otherwise it is one of the constants below (the values suggest bit N
+/// stands for domain N, e.g. 0xe = domains 1, 2 and 3).
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+  const uint16_t CurDomain =
+      (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  const uint16_t NoDomains = 0;
+
+  // Instructions without a domain cannot be moved anywhere.
+  if (!CurDomain)
+    return std::make_pair(CurDomain, NoDomains);
+
+  const unsigned Opc = MI.getOpcode();
+
+  // The tables are consulted in a fixed order; the first hit decides.
+  if (lookup(Opc, CurDomain, ReplaceableInstrs))
+    return std::make_pair(CurDomain, uint16_t(0xe));
+
+  // Without AVX2 the integer column of this table is not available.
+  if (lookup(Opc, CurDomain, ReplaceableInstrsAVX2))
+    return std::make_pair(CurDomain,
+                          uint16_t(Subtarget.hasAVX2() ? 0xe : 0x6));
+
+  if (lookupAVX512(Opc, CurDomain, ReplaceableInstrsAVX512))
+    return std::make_pair(CurDomain, uint16_t(0xe));
+
+  // Without DQI only the integer domain is reported for the logic ops.
+  if (lookupAVX512(Opc, CurDomain, ReplaceableInstrsAVX512DQ))
+    return std::make_pair(CurDomain,
+                          uint16_t(Subtarget.hasDQI() ? 0xe : 0x8));
+
+  if (const uint16_t *Row =
+          lookupAVX512(Opc, CurDomain, ReplaceableInstrsAVX512DQMasked)) {
+    if (!Subtarget.hasDQI())
+      return std::make_pair(CurDomain, uint16_t(0x8));
+    // PS opcodes and the integer opcode in column 3 get 0xa (domains 1 and
+    // 3); everything else in the row gets 0xc (domains 2 and 3).
+    const bool UsesColumn3Pairing =
+        CurDomain == 1 || (CurDomain == 3 && Row[3] == Opc);
+    return std::make_pair(CurDomain,
+                          uint16_t(UsesColumn3Pairing ? 0xa : 0xc));
+  }
+
+  return std::make_pair(CurDomain, NoDomains);
+}
+
+/// Rewrite \p MI in place to the equivalent opcode for execution domain
+/// \p Domain (1, 2 or 3).
+///
+/// The replacement tables are searched in order (generic, AVX2, AVX512,
+/// AVX512DQ, AVX512DQMasked) using the instruction's current domain; the new
+/// opcode is read from the matching row. Asserts if \p MI has no domain or no
+/// table contains its opcode.
+void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
+  assert(Domain>0 && Domain<4 && "Invalid execution domain");
+  const uint16_t CurDomain =
+      (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  assert(CurDomain && "Not an SSE instruction");
+
+  const unsigned Opc = MI.getOpcode();
+  const bool WantInt = Domain == 3;
+  // Column of the matching row that supplies the replacement opcode. The
+  // four-column tables may redirect an integer request to column 3.
+  unsigned Column = Domain - 1;
+
+  const uint16_t *Row = lookup(Opc, CurDomain, ReplaceableInstrs);
+  if (!Row) { // try the other table
+    assert((Subtarget.hasAVX2() || Domain < 3) &&
+           "256-bit vector operations only available in AVX2");
+    Row = lookup(Opc, CurDomain, ReplaceableInstrsAVX2);
+  }
+  if (!Row) { // try the AVX512 table
+    assert(Subtarget.hasAVX512() && "Requires AVX-512");
+    Row = lookupAVX512(Opc, CurDomain, ReplaceableInstrsAVX512);
+    // An instruction that already is the column-3 integer opcode keeps that
+    // opcode instead of being switched to the column-2 one.
+    if (Row && WantInt && Row[3] == Opc)
+      Column = 3;
+  }
+  if (!Row) { // try the AVX512DQ table
+    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+    Row = lookupAVX512(Opc, CurDomain, ReplaceableInstrsAVX512DQ);
+    // Keep a column-3 integer opcode as it is, and also pick column 3 when
+    // the instruction we started from is in the PS domain.
+    if (Row && WantInt && (CurDomain == 1 || Row[3] == Opc))
+      Column = 3;
+  }
+  if (!Row) { // try the AVX512DQMasked table
+    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+    Row = lookupAVX512(Opc, CurDomain, ReplaceableInstrsAVX512DQMasked);
+    // Same column-3 rule as for the unmasked AVX512DQ table.
+    if (Row && WantInt && (CurDomain == 1 || Row[3] == Opc))
+      Column = 3;
+  }
+  assert(Row && "Cannot change domain");
+  MI.setDesc(get(Row[Column]));
+}
+
+/// Return the noop instruction to use for a noop: sets the opcode of \p NopInst to X86::NOOP and leaves the rest of \p NopInst untouched.
+void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+ NopInst.setOpcode(X86::NOOP);
+}
+
+bool X86InstrInfo::isHighLatencyDef(int opc) const { // Returns true iff \p opc is one of the opcodes listed in the switch below; hasHighOperandLatency() forwards to this.
+ switch (opc) {
+ default: return false; // Any opcode that is not listed is not treated as a high-latency def.
+ case X86::DIVPDrm: // Divide and square-root opcodes without the 'V' prefix.
+ case X86::DIVPDrr:
+ case X86::DIVPSrm:
+ case X86::DIVPSrr:
+ case X86::DIVSDrm:
+ case X86::DIVSDrm_Int:
+ case X86::DIVSDrr:
+ case X86::DIVSDrr_Int:
+ case X86::DIVSSrm:
+ case X86::DIVSSrm_Int:
+ case X86::DIVSSrr:
+ case X86::DIVSSrr_Int:
+ case X86::SQRTPDm:
+ case X86::SQRTPDr:
+ case X86::SQRTPSm:
+ case X86::SQRTPSr:
+ case X86::SQRTSDm:
+ case X86::SQRTSDm_Int:
+ case X86::SQRTSDr:
+ case X86::SQRTSDr_Int:
+ case X86::SQRTSSm:
+ case X86::SQRTSSm_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSr_Int:
+ // AVX instructions with high latency (V-prefixed divide and square root, including the Y forms)
+ case X86::VDIVPDrm:
+ case X86::VDIVPDrr:
+ case X86::VDIVPDYrm:
+ case X86::VDIVPDYrr:
+ case X86::VDIVPSrm:
+ case X86::VDIVPSrr:
+ case X86::VDIVPSYrm:
+ case X86::VDIVPSYrr:
+ case X86::VDIVSDrm:
+ case X86::VDIVSDrm_Int:
+ case X86::VDIVSDrr:
+ case X86::VDIVSDrr_Int:
+ case X86::VDIVSSrm:
+ case X86::VDIVSSrm_Int:
+ case X86::VDIVSSrr:
+ case X86::VDIVSSrr_Int:
+ case X86::VSQRTPDm:
+ case X86::VSQRTPDr:
+ case X86::VSQRTPDYm:
+ case X86::VSQRTPDYr:
+ case X86::VSQRTPSm:
+ case X86::VSQRTPSr:
+ case X86::VSQRTPSYm:
+ case X86::VSQRTPSYr:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
+ // AVX512 instructions with high latency (Z128, Z256 and Z divide and square root, with their k/kz and b-suffixed variants)
+ case X86::VDIVPDZ128rm:
+ case X86::VDIVPDZ128rmb:
+ case X86::VDIVPDZ128rmbk:
+ case X86::VDIVPDZ128rmbkz:
+ case X86::VDIVPDZ128rmk:
+ case X86::VDIVPDZ128rmkz:
+ case X86::VDIVPDZ128rr:
+ case X86::VDIVPDZ128rrk:
+ case X86::VDIVPDZ128rrkz:
+ case X86::VDIVPDZ256rm:
+ case X86::VDIVPDZ256rmb:
+ case X86::VDIVPDZ256rmbk:
+ case X86::VDIVPDZ256rmbkz:
+ case X86::VDIVPDZ256rmk:
+ case X86::VDIVPDZ256rmkz:
+ case X86::VDIVPDZ256rr:
+ case X86::VDIVPDZ256rrk:
+ case X86::VDIVPDZ256rrkz:
+ case X86::VDIVPDZrb:
+ case X86::VDIVPDZrbk:
+ case X86::VDIVPDZrbkz:
+ case X86::VDIVPDZrm:
+ case X86::VDIVPDZrmb:
+ case X86::VDIVPDZrmbk:
+ case X86::VDIVPDZrmbkz:
+ case X86::VDIVPDZrmk:
+ case X86::VDIVPDZrmkz:
+ case X86::VDIVPDZrr:
+ case X86::VDIVPDZrrk:
+ case X86::VDIVPDZrrkz:
+ case X86::VDIVPSZ128rm:
+ case X86::VDIVPSZ128rmb:
+ case X86::VDIVPSZ128rmbk:
+ case X86::VDIVPSZ128rmbkz:
+ case X86::VDIVPSZ128rmk:
+ case X86::VDIVPSZ128rmkz:
+ case X86::VDIVPSZ128rr:
+ case X86::VDIVPSZ128rrk:
+ case X86::VDIVPSZ128rrkz:
+ case X86::VDIVPSZ256rm:
+ case X86::VDIVPSZ256rmb:
+ case X86::VDIVPSZ256rmbk:
+ case X86::VDIVPSZ256rmbkz:
+ case X86::VDIVPSZ256rmk:
+ case X86::VDIVPSZ256rmkz:
+ case X86::VDIVPSZ256rr:
+ case X86::VDIVPSZ256rrk:
+ case X86::VDIVPSZ256rrkz:
+ case X86::VDIVPSZrb:
+ case X86::VDIVPSZrbk:
+ case X86::VDIVPSZrbkz:
+ case X86::VDIVPSZrm:
+ case X86::VDIVPSZrmb:
+ case X86::VDIVPSZrmbk:
+ case X86::VDIVPSZrmbkz:
+ case X86::VDIVPSZrmk:
+ case X86::VDIVPSZrmkz:
+ case X86::VDIVPSZrr:
+ case X86::VDIVPSZrrk:
+ case X86::VDIVPSZrrkz:
+ case X86::VDIVSDZrm:
+ case X86::VDIVSDZrr:
+ case X86::VDIVSDZrm_Int:
+ case X86::VDIVSDZrm_Intk:
+ case X86::VDIVSDZrm_Intkz:
+ case X86::VDIVSDZrr_Int:
+ case X86::VDIVSDZrr_Intk:
+ case X86::VDIVSDZrr_Intkz:
+ case X86::VDIVSDZrrb:
+ case X86::VDIVSDZrrbk:
+ case X86::VDIVSDZrrbkz:
+ case X86::VDIVSSZrm:
+ case X86::VDIVSSZrr:
+ case X86::VDIVSSZrm_Int:
+ case X86::VDIVSSZrm_Intk:
+ case X86::VDIVSSZrm_Intkz:
+ case X86::VDIVSSZrr_Int:
+ case X86::VDIVSSZrr_Intk:
+ case X86::VDIVSSZrr_Intkz:
+ case X86::VDIVSSZrrb:
+ case X86::VDIVSSZrrbk:
+ case X86::VDIVSSZrrbkz:
+ case X86::VSQRTPDZ128m:
+ case X86::VSQRTPDZ128mb:
+ case X86::VSQRTPDZ128mbk:
+ case X86::VSQRTPDZ128mbkz:
+ case X86::VSQRTPDZ128mk:
+ case X86::VSQRTPDZ128mkz:
+ case X86::VSQRTPDZ128r:
+ case X86::VSQRTPDZ128rk:
+ case X86::VSQRTPDZ128rkz:
+ case X86::VSQRTPDZ256m:
+ case X86::VSQRTPDZ256mb:
+ case X86::VSQRTPDZ256mbk:
+ case X86::VSQRTPDZ256mbkz:
+ case X86::VSQRTPDZ256mk:
+ case X86::VSQRTPDZ256mkz:
+ case X86::VSQRTPDZ256r:
+ case X86::VSQRTPDZ256rk:
+ case X86::VSQRTPDZ256rkz:
+ case X86::VSQRTPDZm:
+ case X86::VSQRTPDZmb:
+ case X86::VSQRTPDZmbk:
+ case X86::VSQRTPDZmbkz:
+ case X86::VSQRTPDZmk:
+ case X86::VSQRTPDZmkz:
+ case X86::VSQRTPDZr:
+ case X86::VSQRTPDZrb:
+ case X86::VSQRTPDZrbk:
+ case X86::VSQRTPDZrbkz:
+ case X86::VSQRTPDZrk:
+ case X86::VSQRTPDZrkz:
+ case X86::VSQRTPSZ128m:
+ case X86::VSQRTPSZ128mb:
+ case X86::VSQRTPSZ128mbk:
+ case X86::VSQRTPSZ128mbkz:
+ case X86::VSQRTPSZ128mk:
+ case X86::VSQRTPSZ128mkz:
+ case X86::VSQRTPSZ128r:
+ case X86::VSQRTPSZ128rk:
+ case X86::VSQRTPSZ128rkz:
+ case X86::VSQRTPSZ256m:
+ case X86::VSQRTPSZ256mb:
+ case X86::VSQRTPSZ256mbk:
+ case X86::VSQRTPSZ256mbkz:
+ case X86::VSQRTPSZ256mk:
+ case X86::VSQRTPSZ256mkz:
+ case X86::VSQRTPSZ256r:
+ case X86::VSQRTPSZ256rk:
+ case X86::VSQRTPSZ256rkz:
+ case X86::VSQRTPSZm:
+ case X86::VSQRTPSZmb:
+ case X86::VSQRTPSZmbk:
+ case X86::VSQRTPSZmbkz:
+ case X86::VSQRTPSZmk:
+ case X86::VSQRTPSZmkz:
+ case X86::VSQRTPSZr:
+ case X86::VSQRTPSZrb:
+ case X86::VSQRTPSZrbk:
+ case X86::VSQRTPSZrbkz:
+ case X86::VSQRTPSZrk:
+ case X86::VSQRTPSZrkz:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
+ case X86::VSQRTSDZm_Intk:
+ case X86::VSQRTSDZm_Intkz:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSDZr_Int:
+ case X86::VSQRTSDZr_Intk:
+ case X86::VSQRTSDZr_Intkz:
+ case X86::VSQRTSDZrb_Int:
+ case X86::VSQRTSDZrb_Intk:
+ case X86::VSQRTSDZrb_Intkz:
+ case X86::VSQRTSSZm:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSSZm_Intk:
+ case X86::VSQRTSSZm_Intkz:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZr_Int:
+ case X86::VSQRTSSZr_Intk:
+ case X86::VSQRTSSZr_Intkz:
+ case X86::VSQRTSSZrb_Int:
+ case X86::VSQRTSSZrb_Intk:
+ case X86::VSQRTSSZrb_Intkz:
+
+ case X86::VGATHERDPDYrm: // Gather, scatter and gather/scatter-prefetch opcodes.
+ case X86::VGATHERDPDZ128rm:
+ case X86::VGATHERDPDZ256rm:
+ case X86::VGATHERDPDZrm:
+ case X86::VGATHERDPDrm:
+ case X86::VGATHERDPSYrm:
+ case X86::VGATHERDPSZ128rm:
+ case X86::VGATHERDPSZ256rm:
+ case X86::VGATHERDPSZrm:
+ case X86::VGATHERDPSrm:
+ case X86::VGATHERPF0DPDm:
+ case X86::VGATHERPF0DPSm:
+ case X86::VGATHERPF0QPDm:
+ case X86::VGATHERPF0QPSm:
+ case X86::VGATHERPF1DPDm:
+ case X86::VGATHERPF1DPSm:
+ case X86::VGATHERPF1QPDm:
+ case X86::VGATHERPF1QPSm:
+ case X86::VGATHERQPDYrm:
+ case X86::VGATHERQPDZ128rm:
+ case X86::VGATHERQPDZ256rm:
+ case X86::VGATHERQPDZrm:
+ case X86::VGATHERQPDrm:
+ case X86::VGATHERQPSYrm:
+ case X86::VGATHERQPSZ128rm:
+ case X86::VGATHERQPSZ256rm:
+ case X86::VGATHERQPSZrm:
+ case X86::VGATHERQPSrm:
+ case X86::VPGATHERDDYrm:
+ case X86::VPGATHERDDZ128rm:
+ case X86::VPGATHERDDZ256rm:
+ case X86::VPGATHERDDZrm:
+ case X86::VPGATHERDDrm:
+ case X86::VPGATHERDQYrm:
+ case X86::VPGATHERDQZ128rm:
+ case X86::VPGATHERDQZ256rm:
+ case X86::VPGATHERDQZrm:
+ case X86::VPGATHERDQrm:
+ case X86::VPGATHERQDYrm:
+ case X86::VPGATHERQDZ128rm:
+ case X86::VPGATHERQDZ256rm:
+ case X86::VPGATHERQDZrm:
+ case X86::VPGATHERQDrm:
+ case X86::VPGATHERQQYrm:
+ case X86::VPGATHERQQZ128rm:
+ case X86::VPGATHERQQZ256rm:
+ case X86::VPGATHERQQZrm:
+ case X86::VPGATHERQQrm:
+ case X86::VSCATTERDPDZ128mr:
+ case X86::VSCATTERDPDZ256mr:
+ case X86::VSCATTERDPDZmr:
+ case X86::VSCATTERDPSZ128mr:
+ case X86::VSCATTERDPSZ256mr:
+ case X86::VSCATTERDPSZmr:
+ case X86::VSCATTERPF0DPDm:
+ case X86::VSCATTERPF0DPSm:
+ case X86::VSCATTERPF0QPDm:
+ case X86::VSCATTERPF0QPSm:
+ case X86::VSCATTERPF1DPDm:
+ case X86::VSCATTERPF1DPSm:
+ case X86::VSCATTERPF1QPDm:
+ case X86::VSCATTERPF1QPSm:
+ case X86::VSCATTERQPDZ128mr:
+ case X86::VSCATTERQPDZ256mr:
+ case X86::VSCATTERQPDZmr:
+ case X86::VSCATTERQPSZ128mr:
+ case X86::VSCATTERQPSZ256mr:
+ case X86::VSCATTERQPSZmr:
+ case X86::VPSCATTERDDZ128mr:
+ case X86::VPSCATTERDDZ256mr:
+ case X86::VPSCATTERDDZmr:
+ case X86::VPSCATTERDQZ128mr:
+ case X86::VPSCATTERDQZ256mr:
+ case X86::VPSCATTERDQZmr:
+ case X86::VPSCATTERQDZ128mr:
+ case X86::VPSCATTERQDZ256mr:
+ case X86::VPSCATTERQDZmr:
+ case X86::VPSCATTERQQZ128mr:
+ case X86::VPSCATTERQQZ256mr:
+ case X86::VPSCATTERQQZmr:
+ return true; // Shared result for every opcode listed above.
+ }
+}
+
+/// Report whether the value defined by \p DefMI should be treated as having
+/// high latency. Only the defining opcode is consulted: the scheduling model,
+/// register info, use instruction and both operand indices are ignored.
+bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
+                                         const MachineRegisterInfo *MRI,
+                                         const MachineInstr &DefMI,
+                                         unsigned DefIdx,
+                                         const MachineInstr &UseMI,
+                                         unsigned UseIdx) const {
+  const unsigned DefOpcode = DefMI.getOpcode();
+  return isHighLatencyDef(DefOpcode);
+}
+
+/// X86 filter in front of TargetInstrInfo::hasReassociableOperands.
+///
+/// \p Inst must have three or four operands. A four-operand instruction is
+/// rejected unless its last operand, which must be EFLAGS, is dead; all other
+/// cases are decided by the generic implementation.
+bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
+                                           const MachineBasicBlock *MBB) const {
+  const unsigned NumOps = Inst.getNumOperands();
+  assert((NumOps == 3 || NumOps == 4) &&
+         "Reassociation needs binary operators");
+
+  // A fourth operand on an integer binary math/logic instruction is the
+  // EFLAGS register. If that operand is live, other instructions may depend
+  // on the exact status flags (zero, sign, etc.) produced by this operation
+  // on these particular operands, so nothing may be rearranged. It has to be
+  // defined here and never used, i.e. dead.
+  if (NumOps == 4) {
+    const MachineOperand &FlagsOp = Inst.getOperand(3);
+    assert(FlagsOp.isReg() &&
+           FlagsOp.getReg() == X86::EFLAGS &&
+           "Unexpected operand in reassociable instruction");
+    if (!FlagsOp.isDead())
+      return false;
+  }
+
+  return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
+}
+
+// Classifies \p Inst by opcode only. TODO: There are many more machine instruction opcodes to match:
+// 1. Other data types (integer, vectors)
+// 2. Other math / logic operations (xor, or)
+// 3. Other forms of the same operation (intrinsics and other variants)
+bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ case X86::AND8rr: // First group: opcodes for which this returns true unconditionally.
+ case X86::AND16rr:
+ case X86::AND32rr:
+ case X86::AND64rr:
+ case X86::OR8rr:
+ case X86::OR16rr:
+ case X86::OR32rr:
+ case X86::OR64rr:
+ case X86::XOR8rr:
+ case X86::XOR16rr:
+ case X86::XOR32rr:
+ case X86::XOR64rr:
+ case X86::IMUL16rr:
+ case X86::IMUL32rr:
+ case X86::IMUL64rr:
+ case X86::PANDrr:
+ case X86::PORrr:
+ case X86::PXORrr:
+ case X86::ANDPDrr:
+ case X86::ANDPSrr:
+ case X86::ORPDrr:
+ case X86::ORPSrr:
+ case X86::XORPDrr:
+ case X86::XORPSrr:
+ case X86::PADDBrr:
+ case X86::PADDWrr:
+ case X86::PADDDrr:
+ case X86::PADDQrr:
+ case X86::VPANDrr:
+ case X86::VPANDYrr:
+ case X86::VPANDDZ128rr:
+ case X86::VPANDDZ256rr:
+ case X86::VPANDDZrr:
+ case X86::VPANDQZ128rr:
+ case X86::VPANDQZ256rr:
+ case X86::VPANDQZrr:
+ case X86::VPORrr:
+ case X86::VPORYrr:
+ case X86::VPORDZ128rr:
+ case X86::VPORDZ256rr:
+ case X86::VPORDZrr:
+ case X86::VPORQZ128rr:
+ case X86::VPORQZ256rr:
+ case X86::VPORQZrr:
+ case X86::VPXORrr:
+ case X86::VPXORYrr:
+ case X86::VPXORDZ128rr:
+ case X86::VPXORDZ256rr:
+ case X86::VPXORDZrr:
+ case X86::VPXORQZ128rr:
+ case X86::VPXORQZ256rr:
+ case X86::VPXORQZrr:
+ case X86::VANDPDrr:
+ case X86::VANDPSrr:
+ case X86::VANDPDYrr:
+ case X86::VANDPSYrr:
+ case X86::VANDPDZ128rr:
+ case X86::VANDPSZ128rr:
+ case X86::VANDPDZ256rr:
+ case X86::VANDPSZ256rr:
+ case X86::VANDPDZrr:
+ case X86::VANDPSZrr:
+ case X86::VORPDrr:
+ case X86::VORPSrr:
+ case X86::VORPDYrr:
+ case X86::VORPSYrr:
+ case X86::VORPDZ128rr:
+ case X86::VORPSZ128rr:
+ case X86::VORPDZ256rr:
+ case X86::VORPSZ256rr:
+ case X86::VORPDZrr:
+ case X86::VORPSZrr:
+ case X86::VXORPDrr:
+ case X86::VXORPSrr:
+ case X86::VXORPDYrr:
+ case X86::VXORPSYrr:
+ case X86::VXORPDZ128rr:
+ case X86::VXORPSZ128rr:
+ case X86::VXORPDZ256rr:
+ case X86::VXORPSZ256rr:
+ case X86::VXORPDZrr:
+ case X86::VXORPSZrr:
+ case X86::KADDBrr:
+ case X86::KADDWrr:
+ case X86::KADDDrr:
+ case X86::KADDQrr:
+ case X86::KANDBrr:
+ case X86::KANDWrr:
+ case X86::KANDDrr:
+ case X86::KANDQrr:
+ case X86::KORBrr:
+ case X86::KORWrr:
+ case X86::KORDrr:
+ case X86::KORQrr:
+ case X86::KXORBrr:
+ case X86::KXORWrr:
+ case X86::KXORDrr:
+ case X86::KXORQrr:
+ case X86::VPADDBrr:
+ case X86::VPADDWrr:
+ case X86::VPADDDrr:
+ case X86::VPADDQrr:
+ case X86::VPADDBYrr:
+ case X86::VPADDWYrr:
+ case X86::VPADDDYrr:
+ case X86::VPADDQYrr:
+ case X86::VPADDBZ128rr:
+ case X86::VPADDWZ128rr:
+ case X86::VPADDDZ128rr:
+ case X86::VPADDQZ128rr:
+ case X86::VPADDBZ256rr:
+ case X86::VPADDWZ256rr:
+ case X86::VPADDDZ256rr:
+ case X86::VPADDQZ256rr:
+ case X86::VPADDBZrr:
+ case X86::VPADDWZrr:
+ case X86::VPADDDZrr:
+ case X86::VPADDQZrr:
+ case X86::VPMULLWrr:
+ case X86::VPMULLWYrr:
+ case X86::VPMULLWZ128rr:
+ case X86::VPMULLWZ256rr:
+ case X86::VPMULLWZrr:
+ case X86::VPMULLDrr:
+ case X86::VPMULLDYrr:
+ case X86::VPMULLDZ128rr:
+ case X86::VPMULLDZ256rr:
+ case X86::VPMULLDZrr:
+ case X86::VPMULLQZ128rr:
+ case X86::VPMULLQZ256rr:
+ case X86::VPMULLQZrr:
+ // Normal min/max instructions are not commutative because of NaN and signed
+ // zero semantics, but these are. Thus, there's no need to check for global
+ // relaxed math; the instructions themselves have the properties we need.
+ case X86::MAXCPDrr:
+ case X86::MAXCPSrr:
+ case X86::MAXCSDrr:
+ case X86::MAXCSSrr:
+ case X86::MINCPDrr:
+ case X86::MINCPSrr:
+ case X86::MINCSDrr:
+ case X86::MINCSSrr:
+ case X86::VMAXCPDrr:
+ case X86::VMAXCPSrr:
+ case X86::VMAXCPDYrr:
+ case X86::VMAXCPSYrr:
+ case X86::VMAXCPDZ128rr:
+ case X86::VMAXCPSZ128rr:
+ case X86::VMAXCPDZ256rr:
+ case X86::VMAXCPSZ256rr:
+ case X86::VMAXCPDZrr:
+ case X86::VMAXCPSZrr:
+ case X86::VMAXCSDrr:
+ case X86::VMAXCSSrr:
+ case X86::VMAXCSDZrr:
+ case X86::VMAXCSSZrr:
+ case X86::VMINCPDrr:
+ case X86::VMINCPSrr:
+ case X86::VMINCPDYrr:
+ case X86::VMINCPSYrr:
+ case X86::VMINCPDZ128rr:
+ case X86::VMINCPSZ128rr:
+ case X86::VMINCPDZ256rr:
+ case X86::VMINCPSZ256rr:
+ case X86::VMINCPDZrr:
+ case X86::VMINCPSZrr:
+ case X86::VMINCSDrr:
+ case X86::VMINCSSrr:
+ case X86::VMINCSDZrr:
+ case X86::VMINCSSZrr:
+ return true;
+ case X86::ADDPDrr: // Second group: floating-point add and multiply; the answer depends on the UnsafeFPMath option read below.
+ case X86::ADDPSrr:
+ case X86::ADDSDrr:
+ case X86::ADDSSrr:
+ case X86::MULPDrr:
+ case X86::MULPSrr:
+ case X86::MULSDrr:
+ case X86::MULSSrr:
+ case X86::VADDPDrr:
+ case X86::VADDPSrr:
+ case X86::VADDPDYrr:
+ case X86::VADDPSYrr:
+ case X86::VADDPDZ128rr:
+ case X86::VADDPSZ128rr:
+ case X86::VADDPDZ256rr:
+ case X86::VADDPSZ256rr:
+ case X86::VADDPDZrr:
+ case X86::VADDPSZrr:
+ case X86::VADDSDrr:
+ case X86::VADDSSrr:
+ case X86::VADDSDZrr:
+ case X86::VADDSSZrr:
+ case X86::VMULPDrr:
+ case X86::VMULPSrr:
+ case X86::VMULPDYrr:
+ case X86::VMULPSYrr:
+ case X86::VMULPDZ128rr:
+ case X86::VMULPSZ128rr:
+ case X86::VMULPDZ256rr:
+ case X86::VMULPSZ256rr:
+ case X86::VMULPDZrr:
+ case X86::VMULPSZrr:
+ case X86::VMULSDrr:
+ case X86::VMULSSrr:
+ case X86::VMULSDZrr:
+ case X86::VMULSSZrr:
+ return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; // Walks from the instruction to its parent's parent to reach the target options.
+ default:
+ return false; // Every opcode not listed above is reported as not associative-and-commutative.
+ }
+}
+
+/// X86 hook used by reassociateOps: after two instructions have been
+/// reassociated, carry the "dead" marking of the EFLAGS operand over to the
+/// two replacement instructions.
+///
+/// Does nothing unless both original instructions have exactly four operands.
+void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
+                                         MachineInstr &OldMI2,
+                                         MachineInstr &NewMI1,
+                                         MachineInstr &NewMI2) const {
+  // Only integer instructions carry the implicit EFLAGS operand; it is the
+  // fourth operand overall (index 3).
+  const bool BothHaveFlagsOperand =
+      OldMI1.getNumOperands() == 4 && OldMI2.getNumOperands() == 4;
+  if (!BothHaveFlagsOperand)
+    return;
+
+  assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
+         "Unexpected instruction type for reassociation");
+
+  // The originals were only eligible for reassociation because their EFLAGS
+  // operands were dead.
+  assert(OldMI1.getOperand(3).isReg() &&
+         OldMI1.getOperand(3).getReg() == X86::EFLAGS &&
+         OldMI1.getOperand(3).isDead() &&
+         "Must have dead EFLAGS operand in reassociable instruction");
+  assert(OldMI2.getOperand(3).isReg() &&
+         OldMI2.getOperand(3).getReg() == X86::EFLAGS &&
+         OldMI2.getOperand(3).isDead() &&
+         "Must have dead EFLAGS operand in reassociable instruction");
+
+  MachineOperand &NewFlags1 = NewMI1.getOperand(3);
+  MachineOperand &NewFlags2 = NewMI2.getOperand(3);
+
+  assert(NewFlags1.isReg() && NewFlags1.getReg() == X86::EFLAGS &&
+         "Unexpected operand in reassociable instruction");
+  assert(NewFlags2.isReg() && NewFlags2.getReg() == X86::EFLAGS &&
+         "Unexpected operand in reassociable instruction");
+
+  // Since the old EFLAGS operands had to be dead, the new ones are dead too.
+  // Recording that helps later iterations of this pass and other passes.
+  NewFlags1.setIsDead();
+  NewFlags2.setIsDead();
+}
+
+std::pair<unsigned, unsigned>
+X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u); // The whole flag value is the first element; the second element is always 0.
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace X86II;
+ static const std::pair<unsigned, const char *> TargetFlags[] = { // Each entry pairs an X86II::MO_* flag value with its textual name.
+ {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
+ {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
+ {MO_GOT, "x86-got"},
+ {MO_GOTOFF, "x86-gotoff"},
+ {MO_GOTPCREL, "x86-gotpcrel"},
+ {MO_PLT, "x86-plt"},
+ {MO_TLSGD, "x86-tlsgd"},
+ {MO_TLSLD, "x86-tlsld"},
+ {MO_TLSLDM, "x86-tlsldm"},
+ {MO_GOTTPOFF, "x86-gottpoff"},
+ {MO_INDNTPOFF, "x86-indntpoff"},
+ {MO_TPOFF, "x86-tpoff"},
+ {MO_DTPOFF, "x86-dtpoff"},
+ {MO_NTPOFF, "x86-ntpoff"},
+ {MO_GOTNTPOFF, "x86-gotntpoff"},
+ {MO_DLLIMPORT, "x86-dllimport"},
+ {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
+ {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
+ {MO_TLVP, "x86-tlvp"},
+ {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
+ {MO_SECREL, "x86-secrel"}};
+ return makeArrayRef(TargetFlags); // The returned view refers to the function-local static table above.
+}
+
+bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) { // Classification depends on the opcode alone.
+ case X86::TCRETURNdi:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNmi64:
+ case X86::TCRETURNri64:
+ case X86::TAILJMPd:
+ case X86::TAILJMPm:
+ case X86::TAILJMPr:
+ case X86::TAILJMPd64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64_REX:
+ case X86::TAILJMPr64_REX:
+ return true; // Every TCRETURN*/TAILJMP* opcode listed above is a tail call.
+ default:
+ return false; // Any other opcode is not treated as a tail call.
+ }
+}
+
+namespace {
+ /// Create Global Base Reg pass. This initializes the PIC
+ /// global base register for x86-32.
+ struct CGBR : public MachineFunctionPass {
+ static char ID;
+ CGBR() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override { // Returns true only when instructions were inserted.
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF.getTarget());
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+
+ // Don't do anything if this is 64-bit as 64-bit PIC
+ // uses RIP relative addressing.
+ if (STI.is64Bit())
+ return false;
+
+ // Only emit a global base reg in PIC mode.
+ if (!TM->isPositionIndependent())
+ return false;
+
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+
+ // If we didn't need a GlobalBaseReg, don't insert code.
+ if (GlobalBaseReg == 0)
+ return false;
+
+ // Insert the set of GlobalBaseReg at the start of the first MBB of the function.
+ MachineBasicBlock &FirstMBB = MF.front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ const X86InstrInfo *TII = STI.getInstrInfo();
+
+ unsigned PC; // Destination of MOVPC32r: a fresh virtual register for GOT-style PIC, otherwise GlobalBaseReg itself.
+ if (STI.isPICStyleGOT())
+ PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ else
+ PC = GlobalBaseReg;
+
+ // Operand of MovePCtoStack is completely ignored by asm printer. It's
+ // only used in JIT code emission as displacement to pc.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+ // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+ // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+ if (STI.isPICStyleGOT()) {
+ // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+ .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+ X86II::MO_GOT_ABSOLUTE_ADDRESS);
+ }
+
+ return true;
+ }
+
+ StringRef getPassName() const override {
+ return "X86 PIC Global Base Reg Initialization";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG(); // Instructions are only inserted into the first block; no blocks or edges are changed.
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char CGBR::ID = 0; // Out-of-class definition of the static ID member declared in CGBR.
+FunctionPass*
+llvm::createX86GlobalBaseRegPass() { return new CGBR(); } // Factory: returns a newly allocated CGBR pass.
+
+namespace {
+ struct LDTLSCleanup : public MachineFunctionPass { // Replaces repeated TLS_base_addr instructions with copies from a virtual register set at the first one.
+ static char ID;
+ LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+ // No point folding accesses unless there are at least two.
+ return false;
+ }
+
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ return VisitNode(DT->getRootNode(), 0); // 0 means no TLS base address register has been created yet.
+ }
+
+ // Visit the dominator subtree rooted at Node in pre-order.
+ // If TLSBaseAddrReg is non-zero, then use that to replace any
+ // TLS_base_addr instructions. Otherwise, create the register
+ // when the first such instruction is seen, and then use it
+ // as we encounter more instructions. Returns true if anything changed.
+ bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+ MachineBasicBlock *BB = Node->getBlock();
+ bool Changed = false;
+
+ // Traverse the current block.
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ ++I) {
+ switch (I->getOpcode()) {
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ if (TLSBaseAddrReg)
+ I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg); // I now points at the inserted copy; the original instruction was erased.
+ else
+ I = SetRegister(*I, &TLSBaseAddrReg); // I now points at the copy inserted after the original instruction.
+ Changed = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Visit the children of this block in the dominator tree.
+ for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+ I != E; ++I) {
+ Changed |= VisitNode(*I, TLSBaseAddrReg); // Passed by value: a register created inside one child subtree is not seen by its siblings.
+ }
+
+ return Changed;
+ }
+
+ // Replace the TLS_base_addr instruction I with a copy from
+ // TLSBaseAddrReg, returning the new instruction.
+ MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
+ unsigned TLSBaseAddrReg) {
+ MachineFunction *MF = I.getParent()->getParent();
+ const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+ const bool is64Bit = STI.is64Bit();
+ const X86InstrInfo *TII = STI.getInstrInfo();
+
+ // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+ MachineInstr *Copy =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
+ .addReg(TLSBaseAddrReg);
+
+ // Erase the TLS_base_addr instruction.
+ I.eraseFromParent();
+
+ return Copy;
+ }
+
+ // Create a virtual register in *TLSBaseAddrReg, and populate it by
+ // inserting a copy instruction after I. Returns the new instruction.
+ MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I.getParent()->getParent();
+ const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+ const bool is64Bit = STI.is64Bit();
+ const X86InstrInfo *TII = STI.getInstrInfo();
+
+ // Create a virtual register for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+ ? &X86::GR64RegClass
+ : &X86::GR32RegClass);
+
+ // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+ MachineInstr *Next = I.getNextNode(); // NOTE(review): assumes I is never the last instruction of its block - confirm.
+ MachineInstr *Copy =
+ BuildMI(*I.getParent(), Next, I.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+ .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+ return Copy;
+ }
+
+ StringRef getPassName() const override {
+ return "Local Dynamic TLS Access Clean-up";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>(); // runOnMachineFunction walks the dominator tree.
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char LDTLSCleanup::ID = 0; // Out-of-class definition of the static ID member declared in LDTLSCleanup.
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } // Factory: returns a newly allocated LDTLSCleanup pass.
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 000000000000..8d746172dcbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,608 @@
+//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H
+#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrFMA3Info.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "X86GenInstrInfo.inc"
+
+namespace llvm {
+ class MachineInstrBuilder;
+ class X86RegisterInfo;
+ class X86Subtarget;
+
+namespace X86 {
+ // X86 specific condition code. These correspond to X86_*_COND in
+ // X86InstrInfo.td. They must be kept in sync.
+enum CondCode {
+ COND_A = 0,
+ COND_AE = 1,
+ COND_B = 2,
+ COND_BE = 3,
+ COND_E = 4,
+ COND_G = 5,
+ COND_GE = 6,
+ COND_L = 7,
+ COND_LE = 8,
+ COND_NE = 9,
+ COND_NO = 10,
+ COND_NP = 11,
+ COND_NS = 12,
+ COND_O = 13,
+ COND_P = 14,
+ COND_S = 15,
+ LAST_VALID_COND = COND_S, // Alias for the highest explicitly numbered condition code (15).
+
+ // Artificial condition codes. These are used by AnalyzeBranch
+ // to indicate a block terminated with two conditional branches that together
+ // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs and are inverses of one another.
+ COND_NE_OR_P,
+ COND_E_AND_NP,
+
+ COND_INVALID
+};
+
+// Turn condition code into conditional branch opcode.
+unsigned GetCondBranchFromCond(CondCode CC);
+
+/// \brief Return a set opcode for the given condition and whether it has
+/// a memory operand.
+unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+
+/// \brief Return a cmov opcode for the given condition, register size in
+/// bytes, and operand type.
+unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand = false);
+
+// Turn CMov opcode into condition code.
+CondCode getCondFromCMovOpc(unsigned Opc);
+
+/// GetOppositeBranchCondition - Return the inverse of the specified cond,
+/// e.g. turning COND_E to COND_NE.
+CondCode GetOppositeBranchCondition(CondCode CC);
+} // end namespace X86;
+
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_DLLIMPORT: // dllimport stub.
+ case X86II::MO_GOTPCREL: // rip-relative GOT reference.
+ case X86II::MO_GOT: // normal GOT reference.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
+ return true;
+ default:
+ return false; // Every flag not listed above is treated as a direct reference.
+ }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_GOTOFF: // isPICStyleGOT: local global.
+ case X86II::MO_GOT: // isPICStyleGOT: other global.
+ case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
+ case X86II::MO_TLVP: // NOTE(review): the original author was unsure this flag belongs here - confirm.
+ return true;
+ default:
+ return false; // Every flag not listed above is not PIC-base relative.
+ }
+}
+
+inline static bool isScale(const MachineOperand &MO) { // True iff MO is an immediate equal to 1, 2, 4 or 8.
+ return MO.isImm() &&
+ (MO.getImm() == 1 || MO.getImm() == 2 ||
+ MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr &MI, unsigned Op) { // Checks whether the operands starting at index Op look like an LEA-style address (no segment operand is examined).
+ if (MI.getOperand(Op).isFI())
+ return true; // A frame-index operand is accepted without inspecting the operands that follow.
+ return Op + X86::AddrSegmentReg <= MI.getNumOperands() &&
+ MI.getOperand(Op + X86::AddrBaseReg).isReg() &&
+ isScale(MI.getOperand(Op + X86::AddrScaleAmt)) &&
+ MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
+ (MI.getOperand(Op + X86::AddrDisp).isImm() || // The displacement may be an immediate, a global, a constant-pool index or a jump-table index.
+ MI.getOperand(Op + X86::AddrDisp).isGlobal() ||
+ MI.getOperand(Op + X86::AddrDisp).isCPI() ||
+ MI.getOperand(Op + X86::AddrDisp).isJTI());
+}
+
+inline static bool isMem(const MachineInstr &MI, unsigned Op) { // Like isLeaMem, but additionally requires a register segment operand and X86::AddrNumOperands operands from Op.
+ if (MI.getOperand(Op).isFI())
+ return true; // A frame-index operand is accepted without inspecting the operands that follow.
+ return Op + X86::AddrNumOperands <= MI.getNumOperands() &&
+ MI.getOperand(Op + X86::AddrSegmentReg).isReg() && isLeaMem(MI, Op);
+}
+
+class X86InstrInfo final : public X86GenInstrInfo {
+ X86Subtarget &Subtarget;
+ const X86RegisterInfo RI;
+
+ /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+ /// RegOp2MemOpTable2, RegOp2MemOpTable3, RegOp2MemOpTable4 - Load / store folding opcode maps.
+ ///
+ typedef DenseMap<unsigned,
+ std::pair<uint16_t, uint16_t> > RegOp2MemOpTableType;
+ RegOp2MemOpTableType RegOp2MemOpTable2Addr;
+ RegOp2MemOpTableType RegOp2MemOpTable0;
+ RegOp2MemOpTableType RegOp2MemOpTable1;
+ RegOp2MemOpTableType RegOp2MemOpTable2;
+ RegOp2MemOpTableType RegOp2MemOpTable3;
+ RegOp2MemOpTableType RegOp2MemOpTable4;
+
+ /// MemOp2RegOpTable - Load / store unfolding opcode map.
+ ///
+ typedef DenseMap<unsigned,
+ std::pair<uint16_t, uint16_t> > MemOp2RegOpTableType;
+ MemOp2RegOpTableType MemOp2RegOpTable;
+
+ static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+ MemOp2RegOpTableType &M2RTable,
+ uint16_t RegOp, uint16_t MemOp, uint16_t Flags);
+
+ virtual void anchor();
+
+ bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &CondBranches,
+ bool AllowModify) const;
+
+public:
+ explicit X86InstrInfo(X86Subtarget &STI);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// getSPAdjust - This returns the stack pointer adjustment made by
+ /// this instruction. For x86, we need to handle more complex call
+ /// sequences involving PUSHes.
+ int getSPAdjust(const MachineInstr &MI) const override;
+
+ /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
+ /// extension instruction. That is, it's like a copy where it's legal for the
+ /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
+ /// true, then it's expected the pre-extension value is available as a subreg
+ /// of the result register. This also returns the sub-register index in
+ /// SubIdx.
+ bool isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const override;
+
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const override;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const override;
+
+ /// Given an operand within a MachineInstr, insert preceding code to put it
+ /// into the right format for a particular kind of LEA instruction. This may
+ /// involve using an appropriate super-register instead (with an implicit use
+ /// of the original) or creating a new virtual register and inserting COPY
+ /// instructions to get the data into the right class.
+ ///
+ /// Reference parameters are set to indicate how caller should add this
+ /// operand to the LEA instruction.
+ bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
+ unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
+ bool &isKill, bool &isUndef,
+ MachineOperand &ImplicitOp, LiveVariables *LV) const;
+
+ /// convertToThreeAddress - This method must be implemented by targets that
+ /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+ /// may be able to convert a two-address instruction into a true
+ /// three-address instruction on demand. This allows the X86 target (for
+ /// example) to convert ADD and SHL instructions into LEA instructions if they
+ /// would require register copies due to two-addressness.
+ ///
+ /// This method returns a null pointer if the transformation cannot be
+ /// performed, otherwise it returns the new instruction.
+ ///
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineInstr &MI,
+ LiveVariables *LV) const override;
+
+ /// Returns true iff the routine could find two commutable operands in the
+ /// given machine instruction.
+ /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+ /// input values can be re-defined in this method only if the input values
+ /// are not pre-defined, which is designated by the special value
+ /// 'CommuteAnyOperandIndex' assigned to it.
+ /// If both of indices are pre-defined and refer to some operands, then the
+ /// method simply returns true if the corresponding operands are commutable
+ /// and returns false otherwise.
+ ///
+ /// For example, calling this method this way:
+ /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+ /// findCommutedOpIndices(MI, Op1, Op2);
+ /// can be interpreted as a query asking to find an operand that would be
+ /// commutable with the operand#1.
+ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
+
+ /// Returns true if the routine could find two commutable operands
+ /// in the given FMA instruction \p MI. Otherwise, returns false.
+ ///
+ /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
+ /// The output indices of the commuted operands are returned in these
+ /// arguments. Also, the input values of these arguments may be preset either
+ /// to indices of operands that must be commuted or be equal to a special
+ /// value 'CommuteAnyOperandIndex' which means that the corresponding
+ /// operand index is not set and this method is free to pick any of
+ /// available commutable operands.
+ /// The parameter \p FMA3Group keeps the reference to the group of relative
+ /// FMA3 opcodes including register/memory forms of 132/213/231 opcodes.
+ ///
+ /// For example, calling this method this way:
+ /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
+ /// findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
+ /// can be interpreted as a query asking if the operand #1 can be swapped
+ /// with any other available operand (e.g. operand #2, operand #3, etc.).
+ ///
+ /// The returned FMA opcode may differ from the opcode in the given MI.
+ /// For example, commuting the operands #1 and #3 in the following FMA
+ /// FMA213 #1, #2, #3
+ /// results into instruction with adjusted opcode:
+ /// FMA231 #3, #2, #1
+ bool findFMA3CommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const;
+
+ /// Returns an adjusted FMA opcode that must be used in FMA instruction that
+ /// performs the same computations as the given \p MI but which has the
+ /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+ /// It may return 0 if it is unsafe to commute the operands.
+ /// Note that a machine instruction (instead of its opcode) is passed as the
+ /// first parameter to make it possible to analyze the instruction's uses and
+ /// commute the first operand of FMA even when it seems unsafe when you look
+ /// at the opcode. For example, it is Ok to commute the first operand of
+ /// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used.
+ ///
+ /// The returned FMA opcode may differ from the opcode in the given \p MI.
+ /// For example, commuting the operands #1 and #3 in the following FMA
+ /// FMA213 #1, #2, #3
+ /// results into instruction with adjusted opcode:
+ /// FMA231 #3, #2, #1
+ unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI,
+ unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const;
+
+ // Branch analysis.
+ bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+ bool isUnconditionalTailCall(const MachineInstr &MI) const override;
+ bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
+ const MachineInstr &TailCall) const override;
+ void replaceBranchWithTailCall(MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ const MachineInstr &TailCall) const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
+ const TargetRegisterInfo *TRI) const override;
+ bool analyzeBranchPredicate(MachineBasicBlock &MBB,
+ TargetInstrInfo::MachineBranchPredicate &MBP,
+ bool AllowModify = false) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+ bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
+ unsigned, unsigned, int&, int&, int&) const override;
+ void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const override;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ /// Check whether the target can fold a load that feeds a subreg operand
+ /// (or a subreg operand that feeds a store).
+ bool isSubregFoldable() const override { return true; }
+
+ /// foldMemoryOperand - If this target supports it, fold a load or store of
+ /// the specified stack slot into the specified machine instruction for the
+ /// specified operand(s). Returns the instruction produced by folding
+ /// (presumably null when folding is not possible - TODO confirm). If it folds
+ /// the instruction, it is likely that the MachineInstruction the iterator
+ /// references has been changed.
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS = nullptr) const override;
+
+ /// foldMemoryOperand - Same as the previous version except it allows folding
+ /// of any load and store from / to any address, not just from a specific
+ /// stack slot.
+ MachineInstr *foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS = nullptr) const override;
+
+ /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+ /// a store or a load and a store into two or more instruction. If this is
+ /// possible, returns true as well as the new instructions by reference.
+ bool
+ unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg,
+ bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr *> &NewMIs) const override;
+
+ bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const override;
+
+ /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
+ /// instruction after load / store are unfolded from an instruction of the
+ /// specified opcode. It returns zero if the specified unfolding is not
+ /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+ /// index of the operand which will hold the register holding the loaded
+ /// value.
+ unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex = nullptr) const override;
+
+ /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+ /// to determine if two loads are loading from the same base address. It
+ /// should only return true if the base pointers are the same and the
+ /// only differences between the two addresses are the offset. It also returns
+ /// the offsets by reference.
+ bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+ int64_t &Offset2) const override;
+
+ /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+ /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+ /// be scheduled together. On some targets if two loads are loading from
+ /// addresses in the same cache line, it's better if they are scheduled
+ /// together. This function takes two integers that represent the load offsets
+ /// from the common base address. It returns true if it decides it's desirable
+ /// to schedule the two loads together. "NumLoads" is the number of loads that
+ /// have already been scheduled after Load1.
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const override;
+
+ bool shouldScheduleAdjacent(const MachineInstr &First,
+ const MachineInstr &Second) const override;
+
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+ /// instruction that defines the specified register class.
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
+
+ /// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction that
+ /// would clobber the EFLAGS condition register. Note the result may be
+ /// conservative. If it cannot definitely determine the safety after visiting
+ /// a few instructions in each direction it assumes it's not safe.
+ bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ /// True if MI has a condition code def, e.g. EFLAGS, that is
+ /// not marked dead.
+ bool hasLiveCondCodeDef(MachineInstr &MI) const;
+
+ /// getGlobalBaseReg - Return a virtual register initialized with
+ /// the global base register value. Output instructions required to
+ /// initialize the register in the function entry block, if necessary.
+ ///
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+ std::pair<uint16_t, uint16_t>
+ getExecutionDomain(const MachineInstr &MI) const override;
+
+ void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+
+ unsigned
+ getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const override;
+ unsigned getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const override;
+ void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const override;
+
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Alignment,
+ bool AllowCommute) const;
+
+ bool isHighLatencyDef(int opc) const override;
+
+ bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const override;
+
+ bool useMachineCombiner() const override { // Unconditionally returns true for this target.
+ return true;
+ }
+
+ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
+ bool hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) const override;
+
+ void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const override;
+
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2 if having two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
+
+ /// optimizeCompareInstr - Check if there exists an earlier instruction that
+ /// operates on the same source operands and sets flags in the same way as
+ /// Compare; remove Compare if possible.
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+
+ /// optimizeLoadInstr - Try to remove the load by folding it to a register
+ /// operand at the use. We fold the load instructions if and only if the
+ /// def and use are in the same BB. We only look at one load and see
+ /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
+ /// defined by the load we are trying to fold. DefMI returns the machine
+ /// instruction that defines FoldAsLoadDefReg, and the function returns
+ /// the machine instruction generated due to folding.
+ MachineInstr *optimizeLoadInstr(MachineInstr &MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const override;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+ bool isTailCall(const MachineInstr &Inst) const override;
+
+protected:
+ /// Commutes the operands in the given instruction by changing the operands
+ /// order and/or changing the instruction's opcode and/or the immediate value
+ /// operand.
+ ///
+ /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+ /// to be commuted.
+ ///
+ /// Do not call this method for a non-commutable instruction or
+ /// non-commutable operands.
+ /// Even though the instruction is commutable, the method may still
+ /// fail to commute the operands, null pointer is returned in such cases.
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned CommuteOpIdx1,
+ unsigned CommuteOpIdx2) const override;
+
+private:
+ MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineInstr &MI,
+ LiveVariables *LV) const;
+
+ /// Handles memory folding for special case instructions, for instance those
+ /// requiring custom manipulation of the address.
+ MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr &MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align) const;
+
+ /// isFrameOperand - Return true and the FrameIndex if the specified
+ /// operand and the following operands form a reference to the stack frame.
+ bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
+ int &FrameIndex) const;
+
+ /// Returns true iff the routine could find two commutable operands in the
+ /// given machine instruction with 3 vector inputs.
+ /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+ /// input values can be re-defined in this method only if the input values
+ /// are not pre-defined, which is designated by the special value
+ /// 'CommuteAnyOperandIndex' assigned to it.
+ /// If both of indices are pre-defined and refer to some operands, then the
+ /// method simply returns true if the corresponding operands are commutable
+ /// and returns false otherwise.
+ ///
+ /// For example, calling this method this way:
+ /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+ /// findThreeSrcCommutedOpIndices(MI, Op1, Op2);
+ /// can be interpreted as a query asking to find an operand that would be
+ /// commutable with the operand#1.
+ bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 000000000000..38036715a25a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,3119 @@
+//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
+
+def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def SDTX86Cmov : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
+ [SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<4, i32>]>;
+// RES1, RES2, FLAGS = op LHS, RHS
+def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86BrCond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i8>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86SetCC_C : SDTypeProfile<1, 2,
+ [SDTCisInt<0>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
+
+def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+ SDTCisVT<2, i8>]>;
+def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i64>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
+
+def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisInt<2>]>;
+
+def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+ SDTCisVT<1, iPTR>,
+ SDTCisVT<2, iPTR>]>;
+
+def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i8>,
+ SDTCisVT<4, i32>]>;
+
+def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86Void : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
+ [SDNPHasChain,SDNPSideEffect]>;
+def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
+ [SDNPHasChain]>;
+
+
+def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
+def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
+def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
+def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
+
+def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
+def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
+
+def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
+def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
+ [SDNPHasChain]>;
+def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+
+def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;
+
+def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG",
+ SDTX86caspairSaveEbx8,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG",
+ SDTX86caspairSaveRbx16,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+def X86vastart_save_xmm_regs :
+ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+ SDT_X86VASTART_SAVE_XMM_REGS,
+ [SDNPHasChain, SDNPVariadic]>;
+def X86vaarg64 :
+ SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+ SDNPMemOperand]>;
+def X86callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def X86callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad]>;
+
+def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+
+def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
+
+def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisInt<1>]>>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+ [SDNPHasChain]>;
+
+def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP",
+ SDTypeProfile<1, 1, [SDTCisInt<0>,
+ SDTCisPtrTy<1>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_setup_dispatch : SDNode<"X86ISD::EH_SJLJ_SETUP_DISPATCH",
+ SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>;
+def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>;
+
+def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>;
+def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>;
+def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+
+def X86lock_add : SDNode<"X86ISD::LADD", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_sub : SDNode<"X86ISD::LSUB", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_or : SDNode<"X86ISD::LOR", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_xor : SDNode<"X86ISD::LXOR", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+
+def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
+ [SDNPHasChain, SDNPOutGlue]>;
+
+def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
+ [SDNPHasChain]>;
+
+def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
+}
+let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
+ def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; }
+ def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; }
+ def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; }
+ def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; }
+ def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; }
+ def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; }
+ def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; }
+ def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; }
+ // Gather mem operands
+ def X86Mem64_RC128Operand : AsmOperandClass { let Name = "Mem64_RC128"; }
+ def X86Mem128_RC128Operand : AsmOperandClass { let Name = "Mem128_RC128"; }
+ def X86Mem256_RC128Operand : AsmOperandClass { let Name = "Mem256_RC128"; }
+ def X86Mem128_RC256Operand : AsmOperandClass { let Name = "Mem128_RC256"; }
+ def X86Mem256_RC256Operand : AsmOperandClass { let Name = "Mem256_RC256"; }
+
+ def X86Mem64_RC128XOperand : AsmOperandClass { let Name = "Mem64_RC128X"; }
+ def X86Mem128_RC128XOperand : AsmOperandClass { let Name = "Mem128_RC128X"; }
+ def X86Mem256_RC128XOperand : AsmOperandClass { let Name = "Mem256_RC128X"; }
+ def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; }
+ def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; }
+ def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
+ def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; }
+}
+
+def X86AbsMemAsmOperand : AsmOperandClass {
+ let Name = "AbsMem";
+ let SuperClasses = [X86MemAsmOperand];
+}
+
+class X86MemOperand<string printMethod,
+ AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> {
+ let PrintMethod = printMethod;
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
+ let ParserMatchClass = parserMatchClass;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// Gather mem operands
+class X86VMemOperand<RegisterClass RC, string printMethod,
+ AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG);
+}
+
+def anymem : X86MemOperand<"printanymem">;
+
+def opaque32mem : X86MemOperand<"printopaquemem">;
+def opaque48mem : X86MemOperand<"printopaquemem">;
+def opaque80mem : X86MemOperand<"printopaquemem">;
+def opaque512mem : X86MemOperand<"printopaquemem">;
+
+def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>;
+def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>;
+def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>;
+def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>;
+def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>;
+def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>;
+def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>;
+def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>;
+def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>;
+def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>;
+def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>;
+def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>;
+def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>;
+
+def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>;
+
+// Gather mem operands
+def vx64mem : X86VMemOperand<VR128, "printi64mem", X86Mem64_RC128Operand>;
+def vx128mem : X86VMemOperand<VR128, "printi128mem", X86Mem128_RC128Operand>;
+def vx256mem : X86VMemOperand<VR128, "printi256mem", X86Mem256_RC128Operand>;
+def vy128mem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256Operand>;
+def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>;
+
+def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>;
+def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
+def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
+def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>;
+def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
+def vy512mem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
+def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>;
+
+// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
+// of a plain GPR, so that it doesn't potentially require a REX prefix.
+def ptr_rc_norex : PointerLikeRegClass<2>;
+def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
+
+def i8mem_NOREX : Operand<iPTR> {
+ let PrintMethod = "printi8mem";
+ let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm,
+ SEGMENT_REG);
+ let ParserMatchClass = X86Mem8AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// GPRs available for tailcall.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
+def ptr_rc_tailcall : PointerLikeRegClass<4>;
+
+// Special i32mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved registers are popped.
+def i32mem_TC : Operand<i32> {
+ let PrintMethod = "printi32mem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+ i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86Mem32AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// Special i64mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved registers are popped.
+def i64mem_TC : Operand<i64> {
+ let PrintMethod = "printi64mem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
+ ptr_rc_tailcall, i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86Mem64AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+let OperandType = "OPERAND_PCREL",
+ ParserMatchClass = X86AbsMemAsmOperand,
+ PrintMethod = "printPCRelImm" in {
+def i32imm_pcrel : Operand<i32>;
+def i16imm_pcrel : Operand<i16>;
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def brtarget : Operand<OtherVT>;
+def brtarget8 : Operand<OtherVT>;
+
+}
+
+// Special parser to detect 16-bit mode to select 16-bit displacement.
+def X86AbsMem16AsmOperand : AsmOperandClass {
+ let Name = "AbsMem16";
+ let RenderMethod = "addAbsMemOperands";
+ let SuperClasses = [X86AbsMemAsmOperand];
+}
+
+// Branch targets have OtherVT type and print as pc-relative values.
+let OperandType = "OPERAND_PCREL",
+ PrintMethod = "printPCRelImm" in {
+let ParserMatchClass = X86AbsMem16AsmOperand in
+ def brtarget16 : Operand<OtherVT>;
+let ParserMatchClass = X86AbsMemAsmOperand in
+ def brtarget32 : Operand<OtherVT>;
+}
+
+let RenderMethod = "addSrcIdxOperands" in {
+ def X86SrcIdx8Operand : AsmOperandClass {
+ let Name = "SrcIdx8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86SrcIdx16Operand : AsmOperandClass {
+ let Name = "SrcIdx16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86SrcIdx32Operand : AsmOperandClass {
+ let Name = "SrcIdx32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86SrcIdx64Operand : AsmOperandClass {
+ let Name = "SrcIdx64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addSrcIdxOperands"
+
+let RenderMethod = "addDstIdxOperands" in {
+ def X86DstIdx8Operand : AsmOperandClass {
+ let Name = "DstIdx8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86DstIdx16Operand : AsmOperandClass {
+ let Name = "DstIdx16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86DstIdx32Operand : AsmOperandClass {
+ let Name = "DstIdx32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86DstIdx64Operand : AsmOperandClass {
+ let Name = "DstIdx64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addDstIdxOperands"
+
+let RenderMethod = "addMemOffsOperands" in {
+ def X86MemOffs16_8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16_8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86MemOffs16_16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16_16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86MemOffs16_32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16_32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86MemOffs32_8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86MemOffs32_16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86MemOffs32_32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86MemOffs32_64AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+ def X86MemOffs64_8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86MemOffs64_16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86MemOffs64_32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86MemOffs64_64AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addMemOffsOperands"
+
+class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops ptr_rc, SEGMENT_REG);
+}
+
+class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops ptr_rc);
+}
+
+def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>;
+def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>;
+def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>;
+def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>;
+def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>;
+def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>;
+def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>;
+def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>;
+
+class X86MemOffsOperand<Operand immOperand, string printMethod,
+ AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops immOperand, SEGMENT_REG);
+}
+
+def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8",
+ X86MemOffs16_8AsmOperand>;
+def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16",
+ X86MemOffs16_16AsmOperand>;
+def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32",
+ X86MemOffs16_32AsmOperand>;
+def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8",
+ X86MemOffs32_8AsmOperand>;
+def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16",
+ X86MemOffs32_16AsmOperand>;
+def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32",
+ X86MemOffs32_32AsmOperand>;
+def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64",
+ X86MemOffs32_64AsmOperand>;
+def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8",
+ X86MemOffs64_8AsmOperand>;
+def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16",
+ X86MemOffs64_16AsmOperand>;
+def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32",
+ X86MemOffs64_32AsmOperand>;
+def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64",
+ X86MemOffs64_64AsmOperand>;
+
+def SSECC : Operand<i8> {
+ let PrintMethod = "printSSEAVXCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def i8immZExt3 : ImmLeaf<i8, [{
+ return Imm >= 0 && Imm < 8;
+}]>;
+
+def AVXCC : Operand<i8> {
+ let PrintMethod = "printSSEAVXCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def i8immZExt5 : ImmLeaf<i8, [{
+ return Imm >= 0 && Imm < 32;
+}]>;
+
+def AVX512ICC : Operand<i8> {
+ let PrintMethod = "printSSEAVXCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def XOPCC : Operand<i8> {
+ let PrintMethod = "printXOPCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class ImmSExtAsmOperandClass : AsmOperandClass {
+ let SuperClasses = [ImmAsmOperand];
+ let RenderMethod = "addImmOperands";
+}
+
+def X86GR32orGR64AsmOperand : AsmOperandClass {
+ let Name = "GR32orGR64";
+}
+
+def GR32orGR64 : RegisterOperand<GR32> {
+ let ParserMatchClass = X86GR32orGR64AsmOperand;
+}
+def AVX512RCOperand : AsmOperandClass {
+ let Name = "AVX512RC";
+}
+def AVX512RC : Operand<i32> {
+ let PrintMethod = "printRoundingControl";
+ let OperandType = "OPERAND_IMMEDIATE";
+ let ParserMatchClass = AVX512RCOperand;
+}
+
+// Sign-extended immediate classes. We don't need to define the full lattice
+// here because there is no instruction with an ambiguity between ImmSExti64i32
+// and ImmSExti32i8.
+//
+// The strange ranges come from the fact that the assembler always works with
+// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
+// (which will be a -1ULL), and "0xFFFF" (-1 in 16-bits).
+
+// [0, 0x7FFFFFFF] |
+// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i32";
+}
+
+// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti16i8";
+ let SuperClasses = [ImmSExti64i32AsmOperand];
+}
+
+// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti32i8";
+}
+
+// [0, 0x0000007F] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i8";
+ let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
+ ImmSExti64i32AsmOperand];
+}
+
+// Unsigned immediate used by SSE/AVX instructions
+// [0, 0xFF] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmUnsignedi8AsmOperand : AsmOperandClass {
+ let Name = "ImmUnsignedi8";
+ let RenderMethod = "addImmOperands";
+}
+
+// A couple of more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm : Operand<i16> {
+ let ParserMatchClass = ImmSExti16i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant.
+def i32i8imm : Operand<i32> {
+ let ParserMatchClass = ImmSExti32i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant.
+def i64i32imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i32AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 8 bits are significant.
+def i64i8imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Unsigned 8-bit immediate used by SSE/AVX instructions.
+def u8imm : Operand<i8> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 32-bit immediate but only 8-bits are significant and they are unsigned.
+// Used by some SSE/AVX instructions that use intrinsics.
+def i32u8imm : Operand<i32> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant, and those bits are treated as being
+// pc relative.
+def i64i32imm_pcrel : Operand<i64> {
+ let PrintMethod = "printPCRelImm";
+ let ParserMatchClass = X86AbsMemAsmOperand;
+ let OperandType = "OPERAND_PCREL";
+}
+
+def lea64_32mem : Operand<i32> {
+ let PrintMethod = "printanymem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+// Memory operands that use 64-bit pointers in both ILP32 and LP64.
+def lea64mem : Operand<i64> {
+ let PrintMethod = "printanymem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86-specific addressing mode.
+def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>;
+def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex],
+ []>;
+// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
+def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr",
+ [add, sub, mul, X86mul_imm, shl, or,
+ frameindex, X86WrapperRIP],
+ []>;
+
+def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex,
+ X86WrapperRIP], []>;
+
+def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;
+
+// A relocatable immediate is either an immediate operand or an operand that can
+// be relocated by the linker to an immediate, such as a regular symbol in
+// non-PIC code.
+def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [],
+ 0>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def TruePredicate : Predicate<"true">;
+
+def HasCMov : Predicate<"Subtarget->hasCMov()">;
+def NoCMov : Predicate<"!Subtarget->hasCMov()">;
+
+def HasMMX : Predicate<"Subtarget->hasMMX()">;
+def Has3DNow : Predicate<"Subtarget->has3DNow()">;
+def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
+def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
+def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
+def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
+def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
+def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">;
+def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
+def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
+def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
+def HasAVX : Predicate<"Subtarget->hasAVX()">;
+def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+def HasAVX512 : Predicate<"Subtarget->hasAVX512()">,
+ AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">;
+def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
+def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
+def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
+def HasCDI : Predicate<"Subtarget->hasCDI()">,
+ AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
+def HasPFI : Predicate<"Subtarget->hasPFI()">,
+ AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
+def HasERI : Predicate<"Subtarget->hasERI()">,
+ AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">;
+def HasDQI : Predicate<"Subtarget->hasDQI()">,
+ AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">;
+def NoDQI : Predicate<"!Subtarget->hasDQI()">;
+def HasBWI : Predicate<"Subtarget->hasBWI()">,
+ AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">;
+def NoBWI : Predicate<"!Subtarget->hasBWI()">;
+def HasVLX : Predicate<"Subtarget->hasVLX()">,
+ AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">;
+def NoVLX : Predicate<"!Subtarget->hasVLX()">;
+def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
+def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
+def PKU : Predicate<"Subtarget->hasPKU()">;
+
+def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; // Per-feature predicates: each record wraps one subtarget query string.
+def HasAES : Predicate<"Subtarget->hasAES()">;
+def HasFXSR : Predicate<"Subtarget->hasFXSR()">;
+def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">;
+def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">;
+def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">;
+def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">;
+def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
+def HasFMA : Predicate<"Subtarget->hasFMA()">;
+def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; // FMA present and AVX-512 absent.
+def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
+def HasXOP : Predicate<"Subtarget->hasXOP()">;
+def HasTBM : Predicate<"Subtarget->hasTBM()">;
+def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
+def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
+def HasF16C : Predicate<"Subtarget->hasF16C()">;
+def NoF16C : Predicate<"!Subtarget->hasF16C()">; // Negation of HasF16C.
+def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
+def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
+def HasBMI : Predicate<"Subtarget->hasBMI()">;
+def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
+def HasVBMI : Predicate<"Subtarget->hasVBMI()">, // VBMI and IFMA also carry an assembler predicate ...
+ AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">; // ... (feature name plus description string).
+def HasIFMA : Predicate<"Subtarget->hasIFMA()">,
+ AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
+def HasRTM : Predicate<"Subtarget->hasRTM()">;
+def HasHLE : Predicate<"Subtarget->hasHLE()">;
+def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; // True when either RTM or HLE is present.
+def HasADX : Predicate<"Subtarget->hasADX()">;
+def HasSHA : Predicate<"Subtarget->hasSHA()">;
+def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; // Same condition string as HasPRFCHW above; the two records are interchangeable.
+def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
+def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
+def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; // True when SSE1 is absent.
+def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; // True when SSE2 is absent.
+def HasMPX : Predicate<"Subtarget->hasMPX()">;
+def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, // Execution-mode predicates: a subtarget query paired with ...
+ AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; // ... an assembler mode flag (negated here) and a description.
+def In64BitMode : Predicate<"Subtarget->is64Bit()">,
+ AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; // LP64 predicates have no assembler-side counterpart.
+def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
+def In16BitMode : Predicate<"Subtarget->is16Bit()">,
+ AssemblerPredicate<"Mode16Bit", "16-bit mode">;
+def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
+ AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">;
+def In32BitMode : Predicate<"Subtarget->is32Bit()">, // No negated 32-bit-mode record is defined in this group.
+ AssemblerPredicate<"Mode32Bit", "32-bit mode">;
+def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; // Target-platform predicates, each with its negation.
+def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
+def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" // Not Win64, or the function has a frame pointer. The two adjacent
+ "Subtarget->getFrameLowering()->hasFP(*MF)">; // string literals form one condition (TableGen concatenates them).
+def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
+def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
+def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
+def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; // Code-model and optimisation-goal predicates.
+def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" // Small or Kernel code model; the adjacent string literals
+ "TM.getCodeModel() == CodeModel::Kernel">; // form a single condition.
+def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
+def OptForSize : Predicate<"OptForSize">; // Condition is a bare identifier; presumably a variable in scope where the check is emitted - TODO confirm.
+def OptForMinSize : Predicate<"OptForMinSize">; // Likewise a bare identifier.
+def OptForSpeed : Predicate<"!OptForSize">; // Defined as the negation of OptForSize.
+def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; // Tuning predicates; several are negations of a "slow" subtarget query.
+def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
+def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; // True when callRegIndirect() is false.
+def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; // True when unaligned 32-bit memory access is not flagged slow.
+def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
+def HasMFence : Predicate<"Subtarget->hasMFence()">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+// X86-specific condition codes. These i8 leaf values correspond to CondCode in
+// X86InstrInfo.h; the two lists must be kept in sync.
+def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
+def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
+def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
+def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
+def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
+def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
+def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
+def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
+def X86_COND_NO : PatLeaf<(i8 10)>; // no alternate name listed
+def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS : PatLeaf<(i8 12)>; // no alternate name listed
+def X86_COND_O : PatLeaf<(i8 13)>; // no alternate name listed
+def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S : PatLeaf<(i8 15)>; // no alternate name listed
+
+def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; // Immediate leaves: i16 immediate accepted when isInt<8>(Imm) holds.
+def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; // i32 immediate accepted when isInt<8>(Imm) holds.
+def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; // i64 immediate accepted when isInt<8>(Imm) holds.
+def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; // i64 immediate accepted when isInt<32>(Imm) holds.
+
+// If we have multiple users of an immediate, it's much smaller to reuse
+// the register, rather than encode the immediate in every instruction.
+// This has the risk of increasing register pressure from stretched live
+// ranges, however, the immediates should be trivial to rematerialize by
+// the RA in the event of high register pressure.
+// TODO : This is currently enabled for stores and binary ops. There are more
+// cases for which this can be enabled, though this catches the bulk of the
+// issues.
+// TODO2 : This should really also be enabled under O2, but there's currently
+// an issue with RA where we don't pull the constants into their users
+// when we rematerialize them. I'll follow-up on enabling O2 after we fix that
+// issue.
+// TODO3 : This is currently limited to single basic blocks (DAG creation
+// pulls block immediates to the top and merges them if necessary).
+// Eventually, it would be nice to allow ConstantHoisting to merge constants
+// globally for potentially added savings.
+//
+def imm8_su : PatLeaf<(i8 relocImm), [{ // *_su leaves: the wrapped immediate matches only when ...
+ return !shouldAvoidImmediateInstFormsForSize(N); // ... this size heuristic does not ask to avoid immediate forms.
+}]>;
+def imm16_su : PatLeaf<(i16 relocImm), [{ // i16 relocImm under the same heuristic.
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm32_su : PatLeaf<(i32 relocImm), [{ // i32 relocImm under the same heuristic.
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt32_su : PatLeaf<(i64immSExt32), [{ // Wraps i64immSExt32 with the same heuristic.
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ // Wraps i16immSExt8 with the same heuristic.
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ // Wraps i32immSExt8 with the same heuristic.
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt8_su : PatLeaf<(i64immSExt8), [{ // Wraps i64immSExt8 with the same heuristic.
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+// unsigned field.
+def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
+
+def i64immZExt32SExt8 : ImmLeaf<i64, [{ // Stricter variant: must pass isUInt<32> and also ...
+ return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm)); // ... pass isInt<8> once the value is cast to int32_t.
+}]>;
+
+// Helper fragments for loads.
+// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ // Unindexed load producing i16, filtered by the code below.
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD) // Non-extending loads always match.
+ return true;
+ if (ExtType == ISD::EXTLOAD) // Any-extending loads match only when ...
+ return LD->getAlignment() >= 2 && !LD->isVolatile(); // ... alignment is at least 2 and the load is not volatile.
+ return false; // All other extension kinds are rejected.
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{ // Unindexed load producing i32; only any-extending loads can match.
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::EXTLOAD) // Requires alignment of at least 2 and a non-volatile load.
+ return LD->getAlignment() >= 2 && !LD->isVolatile(); // NOTE(review): the memory type is not checked in this fragment - confirm callers rely on that.
+ return false; // Non-extending, sign- and zero-extending loads are rejected.
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ // Unindexed load producing i32, filtered by the code below.
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD) // Non-extending loads always match.
+ return true;
+ if (ExtType == ISD::EXTLOAD) // Any-extending loads match only when ...
+ return LD->getAlignment() >= 4 && !LD->isVolatile(); // ... alignment is at least 4 and the load is not volatile.
+ return false; // All other extension kinds are rejected.
+}]>;
+
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; // Typed wrappers around 'load' with no extra predicate code: i8 result.
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; // i64 result.
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; // f32 result.
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; // f64 result.
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; // f80 result.
+def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; // f128 result.
+
+def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; // Naming: sextloadi<result>i<memory>. Wraps sextloadi8, i16 result.
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; // sextloadi8, i32 result.
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>; // sextloadi16, i32 result.
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>; // sextloadi8, i64 result.
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>; // sextloadi16, i64 result.
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>; // sextloadi32, i64 result.
+
+def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>; // Naming: zextloadi<result>i<memory>. Wraps zextloadi1, i8 result.
+def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>; // zextloadi1, i16 result.
+def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>; // zextloadi1, i32 result.
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>; // zextloadi8, i16 result.
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>; // zextloadi8, i32 result.
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>; // zextloadi16, i32 result.
+def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>; // zextloadi1, i64 result.
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>; // zextloadi8, i64 result.
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>; // zextloadi16, i64 result.
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>; // zextloadi32, i64 result.
+
+def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>; // Naming: extloadi<result>i<memory>. Wraps extloadi1, i8 result.
+def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>; // extloadi1, i16 result.
+def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>; // extloadi1, i32 result.
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>; // extloadi8, i16 result.
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>; // extloadi8, i32 result.
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>; // extloadi16, i32 result.
+def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>; // extloadi1, i64 result.
+def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; // extloadi8, i64 result.
+def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; // extloadi16, i64 result.
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; // extloadi32, i64 result.
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+ return N->hasOneUse(); // Match only when the matched node has exactly one use.
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+ return N->hasOneUse(); // Same single-use restriction as and_su.
+}]>;
+// A 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+ return N->hasOneUse(); // Same single-use restriction as and_su.
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list.
+//
+
+// Nop: the one-byte form plus two forms that take a memory operand.
+let hasSideEffects = 0, SchedRW = [WriteZero] in { // No side effects; scheduled with the WriteZero class.
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>; // Opcode 0x90, no operands, no selection pattern.
+ def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero), // Opcode 0x1f with a 16-bit memory operand ...
+ "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16; // ... marked TB and OpSize16.
+ def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero), // Same opcode with a 32-bit memory operand ...
+ "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32; // ... marked TB and OpSize32.
+}
+
+
+// Constructing a stack frame: 'enter' takes a 16-bit $len and an 8-bit $lvl immediate.
+def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), // Opcode 0xC8; no selection pattern.
+ "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>; // Scheduled as WriteMicrocoded.
+
+let SchedRW = [WriteALU] in { // Both 'leave' forms share opcode 0xC9 and the WriteALU scheduling class.
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in // 32-bit form: implicitly reads and writes EBP/ESP and may load.
+def LEAVE : I<0xC9, RawFrm,
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[Not64BitMode]>; // Only outside 64-bit mode.
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in // 64-bit form: same shape using RBP/RSP.
+def LEAVE64 : I<0xC9, RawFrm,
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In64BitMode]>; // Only in 64-bit mode.
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in // Pseudo marked as a barrier with side effects; expanded by a custom inserter.
+ def Int_eh_sjlj_setup_dispatch
+ : PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>; // No operands; selected from the X86eh_sjlj_setup_dispatch node.
+
+let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in { // 16/32-bit pop and push: every form implicitly reads and writes ESP.
+let mayLoad = 1, SchedRW = [WriteLoad] in { // Pops are modelled as loads.
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], // Short register form (opcode 0x58 plus register).
+ IIC_POP_REG16>, OpSize16; // NOTE(review): uses IIC_POP_REG16 while POP16rmr below uses IIC_POP_REG.
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; // 32-bit forms are restricted to non-64-bit mode.
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], // ModR/M register form (opcode 0x8F).
+ IIC_POP_REG>, OpSize16;
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [], // ModR/M memory form; destination is a memory operand.
+ IIC_POP_MEM>, OpSize16;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
+ IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, SchedRW
+
+let mayStore = 1, SchedRW = [WriteStore] in { // Register and immediate pushes are modelled as stores.
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[], // Short register form (opcode 0x50 plus register).
+ IIC_PUSH_REG>, OpSize16;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], // ModR/M register form (opcode 0xFF /6).
+ IIC_PUSH_REG>, OpSize16;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+
+def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), // 8-bit immediate form (opcode 0x6a), 16-bit operand size.
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), // 16-bit immediate form (opcode 0x68).
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+
+def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), // 8-bit immediate form, 32-bit operand size.
+ "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[Not64BitMode]>;
+def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), // 32-bit immediate form.
+ "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[Not64BitMode]>;
+} // mayStore, SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { // Pushing a memory operand both loads and stores.
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+ IIC_PUSH_MEM>, OpSize16;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+ IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, mayStore, SchedRW
+
+} // Defs = [ESP], Uses = [ESP], hasSideEffects
+
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, // Flag-read pseudos: may load and store, expanded by a custom inserter.
+ SchedRW = [WriteRMW], Defs = [ESP] in { // NOTE(review): Defs = [ESP] also applies to the 64-bit form, whose Uses is RSP - confirm intended.
+ let Uses = [ESP] in
+ def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins),
+ [(set GR32:$dst, (int_x86_flags_read_u32))]>, // Selected from the int_x86_flags_read_u32 intrinsic.
+ Requires<[Not64BitMode]>;
+
+ let Uses = [RSP] in
+ def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins),
+ [(set GR64:$dst, (int_x86_flags_read_u64))]>, // Selected from the int_x86_flags_read_u64 intrinsic.
+ Requires<[In64BitMode]>;
+}
+
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, // Flag-write pseudos: may load and store, expanded by a custom inserter.
+ SchedRW = [WriteRMW] in {
+ let Defs = [ESP, EFLAGS], Uses = [ESP] in // 32-bit form: writes ESP and EFLAGS, reads ESP.
+ def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src),
+ [(int_x86_flags_write_u32 GR32:$src)]>, // Selected from the int_x86_flags_write_u32 intrinsic.
+ Requires<[Not64BitMode]>;
+
+ let Defs = [RSP, EFLAGS], Uses = [RSP] in // 64-bit form: writes RSP and EFLAGS, reads RSP.
+ def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src),
+ [(int_x86_flags_write_u64 GR64:$src)]>, // Selected from the int_x86_flags_write_u64 intrinsic.
+ Requires<[In64BitMode]>;
+}
+
+let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, // popf: writes ESP and EFLAGS, reads ESP, may load.
+ SchedRW = [WriteLoad] in {
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, // Opcode 0x9D, 16-bit operand size.
+ OpSize16;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, // Same opcode, 32-bit operand size ...
+ OpSize32, Requires<[Not64BitMode]>; // ... restricted to non-64-bit mode.
+}
+
+let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0, // pushf: writes ESP, reads ESP and EFLAGS, may store.
+ SchedRW = [WriteStore] in {
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, // Opcode 0x9C, 16-bit operand size.
+ OpSize16;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>, // Same opcode, 32-bit operand size ...
+ OpSize32, Requires<[Not64BitMode]>; // ... restricted to non-64-bit mode.
+}
+
+let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in { // 64-bit pop and push: every form implicitly reads and writes RSP.
+let mayLoad = 1, SchedRW = [WriteLoad] in { // Pops are modelled as loads.
+def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], // Short register form; same opcode as POP32r.
+ IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; // Tagged OpSize32 and gated on 64-bit mode, like the rest of the group.
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], // ModR/M register form.
+ IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [], // ModR/M memory form.
+ IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, SchedRW
+let mayStore = 1, SchedRW = [WriteStore] in { // Register pushes are modelled as stores.
+def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], // Short register form; same opcode as PUSH32r.
+ IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], // ModR/M register form.
+ IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+} // mayStore, SchedRW
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { // Pushing a memory operand both loads and stores.
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
+ IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, mayStore, SchedRW
+} // Defs = [RSP], Uses = [RSP], hasSideEffects
+
+let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, // 64-bit immediate pushes: read and write RSP, may store.
+ SchedRW = [WriteStore] in {
+def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), // 8-bit immediate form (opcode 0x6a).
+ "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[In64BitMode]>;
+def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), // 32-bit immediate form (opcode 0x68), built on the Ii32S class.
+ "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[In64BitMode]>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in // popfq: writes RSP and EFLAGS, reads RSP, may load.
+def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, // Same opcode as POPF16/POPF32.
+ OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in // pushfq: writes RSP, reads RSP and EFLAGS, may store.
+def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, // Same opcode as PUSHF16/PUSHF32.
+ OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
+
+let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], // popa: writes the eight listed registers, reads ESP.
+ mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>, // Opcode 0x61, 32-bit operand size.
+ OpSize32, Requires<[Not64BitMode]>;
+def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>, // Same opcode, 16-bit operand size; Defs still list the 32-bit registers.
+ OpSize16, Requires<[Not64BitMode]>;
+}
+let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], // pusha: reads the eight listed registers, writes ESP.
+ mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>, // Opcode 0x60, 32-bit operand size.
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>, // Same opcode, 16-bit operand size.
+ OpSize16, Requires<[Not64BitMode]>;
+}
+
+let Constraints = "$src = $dst", SchedRW = [WriteALU] in { // Byte swap: source and destination are tied to the same register.
+// GR32 = bswap GR32
+def BSWAP32r : I<0xC8, AddRegFrm,
+ (outs GR32:$dst), (ins GR32:$src),
+ "bswap{l}\t$dst", // Only $dst is printed because of the tie.
+ [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB;
+
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), // 64-bit form, built on the RI class.
+ "bswap{q}\t$dst",
+ [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
+} // Constraints = "$src = $dst", SchedRW
+
+// Bit scan instructions: bsf (opcode 0xBC) and bsr (opcode 0xBD), each with register (rr) and memory (rm) sources.
+let Defs = [EFLAGS] in { // Every form implicitly writes EFLAGS; each pattern sets both $dst and EFLAGS.
+def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
+ IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>; // Register forms are scheduled as WriteShift.
+def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))], // Memory forms fold the load through the typed load fragment.
+ IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>; // Memory forms are scheduled as WriteShiftLd.
+def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
+ IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), // 64-bit forms use the RI class and carry no OpSize tag.
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
+ IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+
+def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), // bsr forms mirror the bsf forms above with X86bsr.
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
+ IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
+ IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))],
+ IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+} // Defs = [EFLAGS]
+
+let SchedRW = [WriteMicrocoded] in { // String instructions (movs/stos/scas/cmps); none has a selection pattern.
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI.
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in { // NOTE(review): MOVSQ also lists the 32-bit EDI/ESI here, unlike STOSQ (RDI) below.
+def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
+ "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
+ "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16;
+def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
+ "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32;
+def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
+ "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+}
+
+// These use the DF flag in the EFLAGS register to inc or dec the destination index (EDI, or RDI for STOSQ).
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in // The stored value comes from the accumulator named in Uses.
+def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst),
+ "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst),
+ "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst),
+ "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32;
+let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in // The only 64-bit string form in this group that lists a 64-bit index register.
+def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst),
+ "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>;
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI; they also write EFLAGS.
+let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in
+def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
+ "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>;
+let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in
+def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
+ "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16;
+let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in
+def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
+ "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32;
+let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in // NOTE(review): lists EDI while STOSQ lists RDI - confirm whether RDI was intended.
+def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
+ "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>;
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI; they also write EFLAGS.
+let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in { // NOTE(review): CMPSQ also lists the 32-bit EDI/ESI here.
+def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
+ "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
+ "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16;
+def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
+ "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32;
+def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
+ "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Move Instructions.
+//
+let SchedRW = [WriteMove] in { // Register-to-register and immediate-to-register moves, all scheduled as WriteMove.
+let hasSideEffects = 0 in { // Register copies: no selection pattern, so side effects are stated explicitly.
+def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in { // Immediate loads flagged rematerializable and as cheap as a move.
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, imm:$src)], IIC_MOV>; // 8- and 16-bit forms match 'imm' ...
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, relocImm:$src)], IIC_MOV>, OpSize32; // ... while the 32-bit form matches 'relocImm'.
+def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), // 64-bit destination with an immediate accepted by i64immSExt32.
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+}
+let isReMaterializable = 1 in { // Full 64-bit immediate: rematerializable but not flagged isAsCheapAsAMove.
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+ "movabs{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, relocImm:$src)], IIC_MOV>;
+}
+
+// Longer forms that use a ModR/M byte. Needed for disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { // No selection patterns; ForceDisassemble is set on these alternates.
+def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+}
+} // SchedRW
+
+let SchedRW = [WriteStore] in { // Store-immediate moves; each pattern uses a *_su leaf, so it is subject to the size heuristic.
+def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), // 64-bit store of an immediate accepted by i64immSExt32_su.
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+let hasSideEffects = 0 in { // Accumulator <-> memory-offset moves; none has a selection pattern.
+
+/// Memory offset versions of moves. The immediate is an address mode sized
+/// offset from the segment base.
+let SchedRW = [WriteALU] in { // Applies to the 32-bit and 16-bit offset forms only.
+let mayLoad = 1 in { // Loads: opcodes 0xA0/0xA1; the accumulator is an implicit Def.
+let Defs = [AL] in
+def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src),
+ "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+ AdSize32;
+let Defs = [AX] in
+def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src),
+ "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize32;
+let Defs = [EAX] in
+def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src),
+ "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ OpSize32, AdSize32;
+let Defs = [RAX] in
+def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src), // 64-bit accumulator with a 32-bit address size.
+ "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>,
+ AdSize32;
+
+let Defs = [AL] in
+def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src), // 16-bit address-size variants follow.
+ "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16;
+let Defs = [AX] in
+def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src),
+ "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize16;
+let Defs = [EAX] in
+def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
+ "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ AdSize16, OpSize32;
+}
+let mayStore = 1 in { // Stores: opcodes 0xA2/0xA3; the accumulator is an implicit Use.
+let Uses = [AL] in
+def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst),
+ "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32;
+let Uses = [AX] in
+def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst),
+ "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize32;
+let Uses = [EAX] in
+def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst),
+ "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ OpSize32, AdSize32;
+let Uses = [RAX] in
+def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst),
+ "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
+ AdSize32;
+
+let Uses = [AL] in
+def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst), // 16-bit address-size variants follow.
+ "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16;
+let Uses = [AX] in
+def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst),
+ "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize16;
+let Uses = [EAX] in
+def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst),
+ "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ OpSize32, AdSize16;
+}
+} // SchedRW = [WriteALU]
+
+// These forms all have full 64-bit absolute addresses in their instructions
+// and use the movabs mnemonic to indicate this specific form.
+let mayLoad = 1 in { // These sit outside the SchedRW scope above and pass no itinerary argument.
+let Defs = [AL] in
+def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src), // Sub-64-bit accumulators use the RIi64_NOREX class.
+ "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64;
+let Defs = [AX] in
+def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
+ "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64;
+let Defs = [EAX] in
+def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
+ "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32,
+ AdSize64;
+let Defs = [RAX] in
+def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src), // The RAX form uses the plain RIi64 class.
+ "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64;
+}
+
+let mayStore = 1 in { // Store counterparts of the movabs loads above.
+let Uses = [AL] in
+def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
+ "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64;
+let Uses = [AX] in
+def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
+ "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64;
+let Uses = [EAX] in
+def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
+ "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32,
+ AdSize64;
+let Uses = [RAX] in
+def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
+ "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64;
+}
+} // hasSideEffects = 0
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, // Alternate encodings of the register moves: MRMSrcReg form,
+ SchedRW = [WriteMove] in { // opcodes 0x8A/0x8B instead of 0x88/0x89; no selection patterns.
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { // Loads from memory into a register; flagged foldable and rematerializable.
+def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32;
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>; // Uses plain 'load' here; the narrower forms use the typed loadi* fragments.
+}
+
+let SchedRW = [WriteStore] in {
+def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32;
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
+let isCodeGenOnly = 1 in { // GR8_NOREX / i8mem_NOREX variants of MOV8rr, MOV8mr, MOV8rm
+let hasSideEffects = 0 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+ (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>,
+ Sched<[WriteMove]>;
+let mayStore = 1, hasSideEffects = 0 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+ (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>, Sched<[WriteStore]>;
+let mayLoad = 1, hasSideEffects = 0,
+ canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+ (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>, Sched<[WriteLoad]>;
+} // isCodeGenOnly = 1
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let SchedRW = [WriteALU] in {
+let Defs = [EFLAGS], Uses = [AH] in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
+ [(set EFLAGS, (X86sahf AH))], IIC_AHF>,
+ Requires<[HasLAHFSAHF]>;
+let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
+ IIC_AHF>, // AH = flags
+ Requires<[HasLAHFSAHF]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Bit test instructions: BT, BTS, BTR, BTC.
+
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteALU] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+ OpSize16, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>,
+ OpSize32, TB;
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
+} // SchedRW
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Make these instructions disassembly
+// only for now.
+
+let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { // bt mem, reg: no ISel patterns (kept commented out below)
+ def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi16 addr:$src1), GR16:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, OpSize16, TB, Requires<[FastBTMem]>;
+ def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi32 addr:$src1), GR32:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, OpSize32, TB, Requires<[FastBTMem]>;
+ def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi64 addr:$src1), GR64:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, TB; // NOTE(review): unlike BT16mr/BT32mr there is no Requires<[FastBTMem]> here; confirm this is intentional
+} // mayLoad = 1, hasSideEffects = 0, SchedRW
+
+let SchedRW = [WriteALU] in {
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
+ IIC_BT_RI>, OpSize16, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
+ IIC_BT_RI>, OpSize32, TB;
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
+ IIC_BT_RI>, TB;
+} // SchedRW
+
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+let SchedRW = [WriteALULd] in { // bt mem, imm8: operand is loaded, so use the load-folded ALU class (as BT*mr does)
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
+ ], IIC_BT_MI>, OpSize16, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
+ ], IIC_BT_MI>, OpSize32, TB;
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi64 addr:$src1),
+ i64immSExt8:$src2))], IIC_BT_MI>, TB;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+let SchedRW = [WriteALU] in {
+def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize16, TB;
+def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize32, TB;
+def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { // btc mem, reg (read-modify-write)
+def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize16, TB;
+def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize32, TB;
+def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+} // mayLoad = 1, mayStore = 1, SchedRW
+
+let SchedRW = [WriteALU] in {
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize16, TB;
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize32, TB;
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { // btc mem, imm8 (read-modify-write)
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize16, TB;
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize32, TB;
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+} // mayLoad = 1, mayStore = 1, SchedRW
+
+let SchedRW = [WriteALU] in { // btr reg, reg; every width uses the IIC_BTX_RR itinerary
+def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize16, TB;
+def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize32, TB;
+def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { // btr mem, reg (read-modify-write)
+def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize16, TB;
+def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize32, TB;
+def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+} // mayLoad = 1, mayStore = 1, SchedRW
+
+let SchedRW = [WriteALU] in {
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize16, TB;
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize32, TB;
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { // btr mem, imm8 (read-modify-write)
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize16, TB;
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize32, TB;
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+} // mayLoad = 1, mayStore = 1, SchedRW
+
+let SchedRW = [WriteALU] in {
+def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize16, TB;
+def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize32, TB;
+def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { // bts mem, reg (read-modify-write)
+def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize16, TB;
+def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize32, TB;
+def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+} // mayLoad = 1, mayStore = 1, SchedRW
+
+let SchedRW = [WriteALU] in {
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize16, TB;
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize32, TB;
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { // bts mem, imm8 (read-modify-write)
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize16, TB;
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize32, TB;
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+} // mayLoad = 1, mayStore = 1, SchedRW
+} // hasSideEffects = 0
+} // Defs = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions. But since a memory
+// operand is referenced, the atomicity is ensured.
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
+ InstrItinClass itin> { // opc8: 8-bit form opcode; opc: 16/32/64-bit opcode; frag: PatFrag name prefix
+ let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in { // $val is tied to $dst
+ def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin>;
+ def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize16;
+ def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>, OpSize32;
+ def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ } // Constraints, SchedRW
+} // multiclass ATOMIC_SWAP
+
+defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
+
+// Swap between registers.
+let SchedRW = [WriteALU] in { // register-register xchg forms
+let Constraints = "$val = $dst" in {
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
+ "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
+ "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+ OpSize16;
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
+ "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+ OpSize32;
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
+ "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+} // Constraints = "$val = $dst"
+
+// Swap between the accumulator (AX/EAX/RAX) and other registers.
+let Uses = [AX], Defs = [AX] in
+def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
+ "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16;
+let Uses = [EAX], Defs = [EAX] in
+def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
+ "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
+ OpSize32, Requires<[Not64BitMode]>;
+let Uses = [EAX], Defs = [EAX] in
+// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
+// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
+def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
+ "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
+ OpSize32, Requires<[In64BitMode]>;
+let Uses = [RAX], Defs = [RAX] in
+def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
+ "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
+} // SchedRW
+
+let SchedRW = [WriteALU] in {
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+ OpSize16;
+def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+ OpSize32;
+def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { // xadd with memory destination
+def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+ OpSize16;
+def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+ OpSize32;
+def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+
+} // mayLoad = 1, mayStore = 1, SchedRW
+
+let SchedRW = [WriteALU] in {
+def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG8>, TB;
+def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB, OpSize16;
+def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB, OpSize32;
+def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB;
+} // SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in { // cmpxchg with memory destination, plus cmpxchg8b/cmpxchg16b
+let mayLoad = 1, mayStore = 1 in {
+def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM8>, TB;
+def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB, OpSize16;
+def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB, OpSize32;
+def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB;
+} // mayLoad = 1, mayStore = 1
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
+def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
+ "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
+ "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+ TB, Requires<[HasCmpxchg16b]>;
+} // SchedRW
+
+
+// Lock instruction prefix
+def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
+
+// Rex64 instruction prefix
+def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
+ Requires<[In64BitMode]>;
+
+// Data16 instruction prefix
+def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
+
+// Repeat string operation instruction prefixes
+// These use the DF flag in the EFLAGS register to inc or dec ECX
+let Defs = [ECX], Uses = [ECX,EFLAGS] in {
+// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
+def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>;
+// Repeat while not equal (used with CMPS and SCAS)
+def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
+} // Defs = [ECX], Uses = [ECX,EFLAGS]
+
+
+// String manipulation instructions
+let SchedRW = [WriteMicrocoded] in {
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in
+def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
+ "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>;
+let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in
+def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
+ "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16;
+let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
+ "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32;
+let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
+ "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>;
+} // SchedRW
+
+let SchedRW = [WriteSystem] in { // string port I/O: OUTS* and INS*
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in {
+def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
+ "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
+ "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16;
+def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
+ "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32;
+} // Defs = [ESI], Uses = [DX,ESI,EFLAGS]
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in {
+def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst),
+ "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>;
+def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst),
+ "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize16;
+def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst),
+ "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32;
+} // Defs = [EDI], Uses = [DX,EDI,EFLAGS]
+} // SchedRW
+
+// Flag instructions
+let SchedRW = [WriteALU] in { // operand-less flag instructions; no ISel patterns
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
+
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
+} // SchedRW
+
+// Table lookup instructions
+let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
+ Sched<[WriteLoad]>;
+
+let SchedRW = [WriteMicrocoded] in {
+// ASCII Adjust After Addition
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+ Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX Before Division
+let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
+ "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX After Multiply
+let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
+ "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AL After Subtraction
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+ Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Addition
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+ Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Subtraction
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+ Requires<[Not64BitMode]>;
+} // SchedRW
+
+let SchedRW = [WriteSystem] in {
+// Check Array Index Against Bounds
+def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16,
+ Requires<[Not64BitMode]>;
+def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32,
+ Requires<[Not64BitMode]>;
+
+// Adjust RPL Field of Segment Selector
+def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
+ Requires<[Not64BitMode]>;
+def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
+ Requires<[Not64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVBE Instructions
+//
+let Predicates = [HasMOVBE] in {
+ let SchedRW = [WriteALULd] in { // load forms: bswap of a loaded value
+ def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+ OpSize16, T8PS;
+ def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+ OpSize32, T8PS;
+ def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+ T8PS;
+ } // SchedRW = [WriteALULd]
+ let SchedRW = [WriteStore] in { // store forms: store of a bswapped register
+ def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+ OpSize16, T8PS;
+ def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+ OpSize32, T8PS;
+ def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+ T8PS;
+ } // SchedRW = [WriteStore]
+} // HasMOVBE
+
+//===----------------------------------------------------------------------===//
+// RDRAND Instruction
+//
+let Predicates = [HasRDRAND], Defs = [EFLAGS] in { // each pattern sets both $dst and EFLAGS from X86rdrand
+ def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
+ "rdrand{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize16, TB;
+ def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
+ "rdrand{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdrand))]>, OpSize32, TB;
+ def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
+ "rdrand{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
+} // HasRDRAND, Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// RDSEED Instruction
+//
+let Predicates = [HasRDSEED], Defs = [EFLAGS] in { // each pattern sets both $dst and EFLAGS from X86rdseed
+ def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
+ "rdseed{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB;
+ def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+ "rdseed{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB;
+ def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
+ "rdseed{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
+} // HasRDSEED, Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// LZCNT Instruction
+//
+let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
+ def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
+ OpSize16;
+ def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize16;
+
+ def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS,
+ OpSize32;
+ def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize32;
+
+ def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// BMI Instructions
+//
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
+ OpSize16;
+ def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize16;
+
+ def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS,
+ OpSize32;
+ def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize32;
+
+ def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
+
+multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
+ RegisterClass RC, X86MemOperand x86memop> { // emits a register (rr) and a memory (rm) form, both without patterns
+let hasSideEffects = 0 in {
+ def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+ []>, T8PS, VEX_4V;
+ let mayLoad = 1 in
+ def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+ []>, T8PS, VEX_4V;
+} // hasSideEffects = 0
+} // multiclass bmi_bls
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in { // all share opcode 0xF3 (see bmi_bls); the MRM1/2/3 format selects the operation
+ defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>;
+ defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W;
+ defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>;
+ defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W;
+ defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>;
+ defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W;
+} // HasBMI, Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate BMI instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasBMI] in {
+ // FIXME: patterns for the load versions are not implemented
+ def : Pat<(and GR32:$src, (add GR32:$src, -1)),
+ (BLSR32rr GR32:$src)>; // x & (x - 1)
+ def : Pat<(and GR64:$src, (add GR64:$src, -1)),
+ (BLSR64rr GR64:$src)>;
+
+ def : Pat<(xor GR32:$src, (add GR32:$src, -1)),
+ (BLSMSK32rr GR32:$src)>; // x ^ (x - 1)
+ def : Pat<(xor GR64:$src, (add GR64:$src, -1)),
+ (BLSMSK64rr GR64:$src)>;
+
+ def : Pat<(and GR32:$src, (ineg GR32:$src)),
+ (BLSI32rr GR32:$src)>; // x & -x
+ def : Pat<(and GR64:$src, (ineg GR64:$src)),
+ (BLSI64rr GR64:$src)>;
+} // HasBMI
+
+
+multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int,
+ PatFrag ld_frag> { // Int: intrinsic matched by both forms; ld_frag: load fragment for the rm form
+ def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+ T8PS, VEX;
+ def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
+ (implicit EFLAGS)]>, T8PS, VEX;
+} // multiclass bmi_bextr_bzhi
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in { // bextr: opcode 0xF7, matched via int_x86_bmi_bextr_*
+ defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem,
+ int_x86_bmi_bextr_32, loadi32>;
+ defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem,
+ int_x86_bmi_bextr_64, loadi64>, VEX_W;
+} // HasBMI, Defs = [EFLAGS]
+
+let Predicates = [HasBMI2], Defs = [EFLAGS] in { // bzhi: opcode 0xF5, matched via int_x86_bmi_bzhi_*
+ defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
+ int_x86_bmi_bzhi_32, loadi32>;
+ defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
+ int_x86_bmi_bzhi_64, loadi64>, VEX_W;
+} // HasBMI2, Defs = [EFLAGS]
+
+
+def CountTrailingOnes : SDNodeXForm<imm, [{
+ // Count the trailing ones in the immediate.
+ return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N));
+}]>;
+
+def BZHIMask : ImmLeaf<i64, [{
+ return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32);
+}]>; // matches i64 immediates that pass isMask_64 and have more than 32 trailing one bits
+
+let Predicates = [HasBMI2] in {
+ def : Pat<(and GR64:$src, BZHIMask:$mask),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+
+ def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
+ (BZHI32rr GR32:$src,
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)),
+ (BZHI32rm addr:$src,
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+} // HasBMI2
+
+let Predicates = [HasBMI] in { // Select the X86bextr node to BEXTR; a loadi32/loadi64 first operand selects the rm form
+ def : Pat<(X86bextr GR32:$src1, GR32:$src2),
+ (BEXTR32rr GR32:$src1, GR32:$src2)>;
+ def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
+ (BEXTR32rm addr:$src1, GR32:$src2)>;
+ def : Pat<(X86bextr GR64:$src1, GR64:$src2),
+ (BEXTR64rr GR64:$src1, GR64:$src2)>;
+ def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
+ (BEXTR64rm addr:$src1, GR64:$src2)>;
+} // HasBMI
+
+multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int,
+ PatFrag ld_frag> { // Shared rr/rm definitions for PDEP and PEXT: opcode 0xF5, VEX_4V, matched through the intrinsic Int
+ def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
+ VEX_4V;
+ def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), // second source is read from memory via ld_frag
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V;
+}
+
+let Predicates = [HasBMI2] in { // PDEP and PEXT differ here only in prefix class (T8XD vs T8XS) and intrinsic; 64-bit forms add VEX_W
+ defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
+ int_x86_bmi_pdep_32, loadi32>, T8XD;
+ defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
+ int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
+ defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
+ int_x86_bmi_pext_32, loadi32>, T8XS;
+ defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
+ int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// TBM Instructions
+//
+let Predicates = [HasTBM], Defs = [EFLAGS] in {
+
+multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ Intrinsic Int, Operand immtype,
+ SDPatternOperator immoperator> { // Register (ri) and memory (mi) forms taking one source plus an immediate control operand $cntl
+ def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
+ !strconcat(OpcodeStr,
+ "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+ [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>,
+ XOP, XOPA;
+ def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, immtype:$cntl), // source is loaded through ld_frag in the pattern below
+ !strconcat(OpcodeStr,
+ "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+ [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>,
+ XOP, XOPA;
+}
+
+defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
+ int_x86_tbm_bextri_u32, i32imm, imm>; // yields BEXTRI32ri / BEXTRI32mi
+let ImmT = Imm32S in // the 64-bit form overrides ImmT to Imm32S and matches i64immSExt32 immediates
+defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
+ int_x86_tbm_bextri_u64, i64i32imm,
+ i64immSExt32>, VEX_W;
+
+multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
+ RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag> { // One-source TBM op in rr and rm forms; pattern lists are empty and ld_frag is not referenced in the body
+let hasSideEffects = 0 in {
+ def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ []>, XOP_4V, XOP9;
+ let mayLoad = 1 in // only the memory form is flagged as loading
+ def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ []>, XOP_4V, XOP9;
+}
+}
+
+multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+ Format FormReg, Format FormMem> { // Instantiates tbm_binary_rm twice: NAME#32 on GR32/i32mem and NAME#64 on GR64/i64mem (plus VEX_W)
+ defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
+ loadi32>;
+ defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
+ loadi64>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>; // each defm yields <NAME>32rr/rm and <NAME>64rr/rm
+defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>; // opcode 0x02 is used only by BLCI and BLCMSK; the rest use 0x01
+defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
+defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>; // same MRM1 format as BLCFILL, distinguished by opcode
+defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
+defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>; // same MRM6 format as BLCI, distinguished by opcode
+defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
+defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
+} // HasTBM, EFLAGS
+
+//===----------------------------------------------------------------------===//
+// MONITORX/MWAITX Instructions
+//
+let SchedRW = [ WriteSystem ] in {
+ let usesCustomInserter = 1 in { // pseudo matched from int_x86_monitorx; flagged usesCustomInserter (the expansion is not visible here)
+ def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+ [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>,
+ Requires<[ HasMWAITX ]>;
+ }
+
+ let Uses = [ EAX, ECX, EDX ] in { // real encoding: no explicit operands, no pattern; inputs are implicit register uses
+ def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], IIC_SSE_MONITORX>,
+ TB, Requires<[ HasMWAITX ]>;
+ }
+
+ let Uses = [ ECX, EAX, EBX ] in { // matched directly from int_x86_mwaitx on the fixed registers ECX, EAX, EBX
+ def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
+ [(int_x86_mwaitx ECX, EAX, EBX)], IIC_SSE_MWAITX>,
+ TB, Requires<[ HasMWAITX ]>;
+ }
+} // SchedRW
+
+def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>, // accept the implicit registers written out: 32-bit names outside 64-bit mode
+ Requires<[ Not64BitMode ]>;
+def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>, // 64-bit register names in 64-bit mode
+ Requires<[ In64BitMode ]>;
+
+def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, // same scheme for monitorx, which names edx/rdx instead of ebx/rbx
+ Requires<[ Not64BitMode ]>;
+def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
+ Requires<[ In64BitMode ]>;
+
+//===----------------------------------------------------------------------===//
+// CLZERO Instruction
+//
+let Uses = [EAX] in // implicit EAX use; no explicit operands, no selection pattern, and no Requires<> predicate on this def
+def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate TBM instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasTBM] in {
+ def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)), // X86bextr with an immediate control operand selects the TBM BEXTRI forms
+ (BEXTRI32ri GR32:$src1, imm:$src2)>;
+ def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
+ (BEXTRI32mi addr:$src1, imm:$src2)>;
+ def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
+ (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
+ def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
+ (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
+
+ // FIXME: patterns for the load versions are not implemented; every pattern below selects an rr form.
+ def : Pat<(and GR32:$src, (add GR32:$src, 1)), // blcfill: x & (x + 1)
+ (BLCFILL32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (add GR64:$src, 1)),
+ (BLCFILL64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (not (add GR32:$src, 1))), // blci: x | ~(x + 1)
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
+ (BLCI64rr GR64:$src)>;
+
+ // Extra patterns because opt can rewrite the "not (add x, 1)" operand above as "sub -2, x".
+ def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
+ (BLCI64rr GR64:$src)>;
+
+ def : Pat<(and (not GR32:$src), (add GR32:$src, 1)), // blcic: ~x & (x + 1)
+ (BLCIC32rr GR32:$src)>;
+ def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
+ (BLCIC64rr GR64:$src)>;
+
+ def : Pat<(xor GR32:$src, (add GR32:$src, 1)), // blcmsk: x ^ (x + 1)
+ (BLCMSK32rr GR32:$src)>;
+ def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
+ (BLCMSK64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (add GR32:$src, 1)), // blcs: x | (x + 1)
+ (BLCS32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (add GR64:$src, 1)),
+ (BLCS64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (add GR32:$src, -1)), // blsfill: x | (x - 1)
+ (BLSFILL32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (add GR64:$src, -1)),
+ (BLSFILL64rr GR64:$src)>;
+
+ def : Pat<(or (not GR32:$src), (add GR32:$src, -1)), // blsic: ~x | (x - 1)
+ (BLSIC32rr GR32:$src)>;
+ def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
+ (BLSIC64rr GR64:$src)>;
+
+ def : Pat<(or (not GR32:$src), (add GR32:$src, 1)), // t1mskc: ~x | (x + 1)
+ (T1MSKC32rr GR32:$src)>;
+ def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
+ (T1MSKC64rr GR64:$src)>;
+
+ def : Pat<(and (not GR32:$src), (add GR32:$src, -1)), // tzmsk: ~x & (x - 1)
+ (TZMSK32rr GR32:$src)>;
+ def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
+ (TZMSK64rr GR64:$src)>;
+} // HasTBM
+
+//===----------------------------------------------------------------------===//
+// Memory Instructions
+//
+
+def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD; // the only one of the three with a selection pattern
+def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD; // empty pattern list: not selected from any node here
+def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD; // no operands, empty pattern list
+
+
+//===----------------------------------------------------------------------===//
+// Subsystems.
+//===----------------------------------------------------------------------===//
+
+include "X86InstrArithmetic.td"
+include "X86InstrCMovSetCC.td"
+include "X86InstrExtension.td"
+include "X86InstrControl.td"
+include "X86InstrShiftRotate.td"
+
+// X87 Floating Point Stack.
+include "X86InstrFPStack.td"
+
+// SIMD support (SSE, MMX and AVX)
+include "X86InstrFragmentsSIMD.td"
+
+// FMA - Fused Multiply-Add support (requires FMA)
+include "X86InstrFMA.td"
+
+// XOP
+include "X86InstrXOP.td"
+
+// SSE, MMX and 3DNow! vector support.
+include "X86InstrSSE.td"
+include "X86InstrAVX512.td"
+include "X86InstrMMX.td"
+include "X86Instr3DNow.td"
+
+// MPX instructions
+include "X86InstrMPX.td"
+
+include "X86InstrVMX.td"
+include "X86InstrSVM.td"
+
+include "X86InstrTSX.td"
+include "X86InstrSGX.td"
+
+// System instructions.
+include "X86InstrSystem.td"
+
+// Compiler Pseudo Instructions and Pat Patterns
+include "X86InstrCompiler.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler Mnemonic Aliases
+//===----------------------------------------------------------------------===//
+
+def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>; // unsuffixed AT&T mnemonic -> the suffixed spelling for the current mode
+def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"cbw", "cbtw", "att">; // Intel-style sign-extension names -> their AT&T spellings, in every mode
+def : MnemonicAlias<"cwde", "cwtl", "att">;
+def : MnemonicAlias<"cwd", "cwtd", "att">;
+def : MnemonicAlias<"cdq", "cltd", "att">;
+def : MnemonicAlias<"cdqe", "cltq", "att">;
+def : MnemonicAlias<"cqo", "cqto", "att">;
+
+// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq.
+def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>;
+
+def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; // here the suffixed spelling is the alias and plain "leave" the target
+def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"loopz", "loope">; // no variant argument given for these two
+def : MnemonicAlias<"loopnz", "loopne">;
+
+def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popfd", "popfl", "att">; // no mode predicate on the "d" spelling
+
+// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in
+// all modes. However: "push (addr)" and "push $42" should default to
+// pushl/pushq depending on the current mode. Similar for "pop %bx"
+def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushfd", "pushfl", "att">; // no mode predicate on the "d" spelling
+
+def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>; // Intel-variant popa/pusha spellings; none is defined for 64-bit mode
+def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>; // the same popa/pusha mapping for the AT&T variant
+def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"repe", "rep">; // repe and repz both map to rep; repnz maps to repne
+def : MnemonicAlias<"repz", "rep">;
+def : MnemonicAlias<"repnz", "repne">;
+
+def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;
+
+// Apply 'ret' behavior to 'retn'; in the Intel variant retn simply becomes ret.
+def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"retn", "ret", "intel">;
+
+def : MnemonicAlias<"sal", "shl", "intel">; // sal is spelled as shl: unsuffixed for Intel, per-size suffix for AT&T
+def : MnemonicAlias<"salb", "shlb", "att">;
+def : MnemonicAlias<"salw", "shlw", "att">;
+def : MnemonicAlias<"sall", "shll", "att">;
+def : MnemonicAlias<"salq", "shlq", "att">;
+
+def : MnemonicAlias<"smovb", "movsb", "att">; // smov{b,w,l,q} -> movs{b,w,l,q}
+def : MnemonicAlias<"smovw", "movsw", "att">;
+def : MnemonicAlias<"smovl", "movsl", "att">;
+def : MnemonicAlias<"smovq", "movsq", "att">;
+
+def : MnemonicAlias<"ud2a", "ud2", "att">;
+def : MnemonicAlias<"verrw", "verr", "att">;
+
+// System instruction aliases. Unsuffixed AT&T spellings map to a suffixed form.
+def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>;
+def : MnemonicAlias<"sysret", "sysretl", "att">; // sysret/sysexit map to the "l" form with no mode predicate
+def : MnemonicAlias<"sysexit", "sysexitl", "att">;
+
+def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>; // descriptor-table load/store: w/l/q chosen by 16/32/64-bit mode
+def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;
+
+
+// Floating point stack aliases: alternate fcmov condition names and size-suffixed spellings.
+def : MnemonicAlias<"fcmovz", "fcmove", "att">;
+def : MnemonicAlias<"fcmova", "fcmovnbe", "att">;
+def : MnemonicAlias<"fcmovnae", "fcmovb", "att">;
+def : MnemonicAlias<"fcmovna", "fcmovbe", "att">;
+def : MnemonicAlias<"fcmovae", "fcmovnb", "att">;
+def : MnemonicAlias<"fcomip", "fcompi">; // no variant argument given
+def : MnemonicAlias<"fildq", "fildll", "att">;
+def : MnemonicAlias<"fistpq", "fistpll", "att">;
+def : MnemonicAlias<"fisttpq", "fisttpll", "att">;
+def : MnemonicAlias<"fldcww", "fldcw", "att">;
+def : MnemonicAlias<"fnstcww", "fnstcw", "att">;
+def : MnemonicAlias<"fnstsww", "fnstsw", "att">;
+def : MnemonicAlias<"fucomip", "fucompi">; // no variant argument given
+def : MnemonicAlias<"fwait", "wait">;
+
+def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; // a "q" suffix on the save/restore family maps to the "64" spelling
+def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">;
+def : MnemonicAlias<"xsaveq", "xsave64", "att">;
+def : MnemonicAlias<"xrstorq", "xrstor64", "att">;
+def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">;
+def : MnemonicAlias<"xrstorsq", "xrstors64", "att">;
+def : MnemonicAlias<"xsavecq", "xsavec64", "att">;
+def : MnemonicAlias<"xsavesq", "xsaves64", "att">;
+
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
+ string VariantName>
+ : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix), // alias from Prefix+OldCond+Suffix ...
+ !strconcat(Prefix, NewCond, Suffix), VariantName>; // ... to Prefix+NewCond+Suffix, for the given asm variant
+
+/// IntegerCondCodeMnemonicAlias - Defines fourteen CondCodeAlias records that
+/// rewrite a synonym condition code between Prefix and Suffix to its canonical
+/// spelling, e.g. "setz" -> "sete". V is forwarded as the alias's VariantName.
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
+ string V = ""> {
+ def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb
+ def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete
+ def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe
+ def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae
+ def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae
+ def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle
+ def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge
+ def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne
+ def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp
+ def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp
+
+ def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb
+ def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta
+ def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl
+ def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg
+}
+
+// Aliases for set<CC> (empty suffix, default variant "")
+defm : IntegerCondCodeMnemonicAlias<"set", "">;
+// Aliases for j<CC> (empty suffix, default variant "")
+defm : IntegerCondCodeMnemonicAlias<"j", "">;
+// Aliases for cmov<CC>{w,l,q}, AT&T variant only
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
+
+
+//===----------------------------------------------------------------------===//
+// Assembler Instruction Aliases
+//===----------------------------------------------------------------------===//
+
+// aad/aam default to base 10 if no operand is specified; both aliases require Not64BitMode.
+def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>;
+def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
+
+// Disambiguate the mem/imm form of bt-without-a-suffix as btl (the BT32mi8 form).
+// Likewise for btc/btr/bts, which map to their 32-bit mi8 forms.
+def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}",
+ (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}",
+ (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}",
+ (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}",
+ (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+
+// clr aliases: clr{b,w,l,q} $reg is accepted as the same-width xor of $reg with itself.
+def : InstAlias<"clrb\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
+
+// lods aliases. Accept the destination register being omitted (the suffixed
+// mnemonic implies it), the suffix being omitted (an explicit al/ax/eax/rax
+// implies it), or both (bare "lods $src"); the LODSQ forms need In64BitMode.
+def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+
+// stos aliases. Accept the source register being omitted (the suffixed
+// mnemonic implies it), the suffix being omitted (an explicit al/ax/eax/rax
+// implies it), or both (bare "stos $dst"); the STOSQ forms need In64BitMode.
+def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+
+// scas aliases. Accept the accumulator register being omitted (the suffixed
+// mnemonic implies it), the suffix being omitted (an explicit al/ax/eax/rax
+// implies it), or both (bare "scas $dst"); the SCASQ forms need In64BitMode.
+def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+// cmps aliases. Accept the mnemonic suffix being omitted; the same unsuffixed
+// string is listed once per operand width (the Q form only in 64-bit mode).
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+// movs aliases. Accept the mnemonic suffix being omitted; the same unsuffixed
+// string is listed once per operand width (the Q form only in 64-bit mode).
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+// div and idiv aliases accepting an explicitly written A register (al/ax/eax/rax) next to the register or memory source.
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
+
+
+
+// Various unary fpstack operations default to operating on ST1.
+// For example, "fxch" -> "fxch %st(1)"
+def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; // operand-less spelling: the ST1 operand is supplied by the alias
+def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; // bare "fadd" maps to the same ADD_FPrST0 record as "faddp"
+def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; // {a|b} picks the spelling per asm variant: first variant "fsubp", second "fsubrp"
+def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>;
+def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; // bare "fmul" maps to the same MUL_FPrST0 record as "fmulp"
+def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>;
+def : InstAlias<"fxch", (XCH_F ST1), 0>;
+def : InstAlias<"fcom", (COM_FST0r ST1), 0>;
+def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>;
+def : InstAlias<"fcomi", (COM_FIr ST1), 0>;
+def : InstAlias<"fcompi", (COM_FIPr ST1), 0>;
+def : InstAlias<"fucom", (UCOM_Fr ST1), 0>;
+def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>;
+def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>;
+def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>;
+
+// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
+// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate
+// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
+// gas. EmitAlias (default 1) is forwarded as InstAlias's third argument.
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
+ def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"),
+ (Inst RST:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"),
+ (Inst ST0), EmitAlias>;
+}
+
+defm : FpUnaryAlias<"fadd", ADD_FST0r>; // each defm adds the "$op, st(0)" and "st(0), st(0)" spellings for one mnemonic
+defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; // EmitAlias = 0 here and for fcomi/fucomi below; the others use the default 1
+defm : FpUnaryAlias<"fsub", SUB_FST0r>;
+defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>;
+defm : FpUnaryAlias<"fsubr", SUBR_FST0r>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>;
+defm : FpUnaryAlias<"fmul", MUL_FST0r>;
+defm : FpUnaryAlias<"fmulp", MUL_FPrST0>;
+defm : FpUnaryAlias<"fdiv", DIV_FST0r>;
+defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>;
+defm : FpUnaryAlias<"fdivr", DIVR_FST0r>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>;
+defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
+defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
+defm : FpUnaryAlias<"fcompi", COM_FIPr>;
+defm : FpUnaryAlias<"fucompi", UCOM_FIPr>;
+
+
+// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
+// commute. We also allow fdiv[r]p/fsub[r]p even though they don't commute,
+// solely because gas supports it. Only the faddp alias passes a trailing 0.
+def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
+def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
+
+// We accept "fnstsw %eax" even though it only writes %ax; "fnstsw %al" and operand-less "fnstsw" also map to FNSTSW16r.
+def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw" , (FNSTSW16r)>;
+
+// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
+// this is compatible with what GAS does. (Not16BitMode memory forms use FARCALL32m/FARJMP32m.)
+def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+
+def : InstAlias<"call\t{*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; // unsuffixed indirect call/jmp through memory: width follows the current mode
+def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"call\t{*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"call\t{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+
+
+// "imul <imm>, B" is an alias for "imul <imm>, B, B"; each width lists both the full-immediate and the 8-bit-immediate form.
+def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>;
+def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>;
+def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>;
+def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>;
+def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
+def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;
+
+// ins aliases. Accept the mnemonic suffix being omitted because it's implicit
+// in the destination; dx is written explicitly and B/W/L forms are listed.
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0>;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0>;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0>;
+
+// outs aliases. Accept the mnemonic suffix being omitted because it's implicit
+// in the source; dx is written explicitly and B/W/L forms are listed.
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0>;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0>;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0>;
+
+// inb %dx -> inb %al, %dx; likewise inw/inl, and the forms taking only an 8-bit immediate port.
+def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
+def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
+def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
+def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>;
+def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>;
+def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>;
+
+
+// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp. Unsuffixed forms follow the mode; w/l-suffixed forms require Not64BitMode.
+def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
+def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+
+// Force mov without a suffix with a segment and mem to prefer the 'l' form of
+// the move. All segment/mem forms are equivalent, this has the shortest
+// encoding. Both directions are covered (MOV32sm and MOV32ms).
+def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>;
+def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
+
+// Match 'movq <largeimm>, <reg>' as an alias for movabsq (the MOV64ri record with an i64imm operand).
+def : InstAlias<"movq\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
+
+// Match 'movq GR64, MMX' as an alias for movd, in both directions (GR64 -> VR64 and VR64 -> GR64).
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
+
+// movsx aliases: the unsuffixed mnemonic is listed once per source/destination width pair; the only memory-source form is i8mem -> GR16.
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
+
+// movzx aliases: same layout as movsx, minus the 32 -> 64 form.
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>;
+// Note: No GR32->GR64 movzx form.
+
+// outX %dx -> outX %al/%ax/%eax, %dx; "outX $port" likewise implies the accumulator.
+def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
+def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
+def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
+def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>;
+def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>;
+def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>;
+
+// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
+// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
+// errors, since its encoding is the most compact.
+def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>;
+
+// shld/shrd op, op -> shld/shrd op, op, CL (the CL shift count is implied)
+def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
+def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;
+
+def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
+def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;
+
+/* FIXME: This is disabled because the asm matcher is currently incapable of
+ * matching a fixed immediate like $1.
+// "shl X, $1" is an alias for "shl X".
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> {
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>;
+}
+
+defm : ShiftRotateByOneAlias<"rcl", "RCL">;
+defm : ShiftRotateByOneAlias<"rcr", "RCR">;
+defm : ShiftRotateByOneAlias<"rol", "ROL">;
+defm : ShiftRotateByOneAlias<"ror", "ROR">;
+FIXME */
+
+// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
+// One alias per operand width (b/w/l/q), each mapping onto the TESTnnrm record.
+def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm GR8:$val, i8mem:$mem), 0>;
+def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem), 0>;
+
+// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
+// One alias per operand width (b/w/l/q), each mapping onto the XCHGnnrm record.
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8:$val, i8mem:$mem), 0>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem), 0>;
+
+// xchg: We accept "xchgX <reg>, %ax/%eax/%rax" and "xchgX %ax/%eax/%rax, <reg>" as synonyms.
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+ (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+ (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
+
+// These aliases exist to get the parser to prioritize matching 8-bit
+// immediate encodings over matching the implicit ax/eax/rax encodings. By
+// explicitly mentioning the A register here, these entries will be ordered
+// first due to the more explicit immediate type.
+def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>;
+
+def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>;
+
+def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 000000000000..0bb106823983
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,675 @@
+//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+// All instructions that use MMX should be in this file, even if they also use
+// SSE.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+// Itinerary pairs for the MMX integer ALU and horizontal add/sub families.
+// Every record in this group carries Sched = WriteVecALU.
+let Sched = WriteVecALU in {
+  def MMX_INTALU_ITINS  : OpndItins<IIC_MMX_ALU_RR, IIC_MMX_ALU_RM>;
+  def MMX_INTALUQ_ITINS : OpndItins<IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM>;
+  def MMX_PHADDSUBW     : OpndItins<IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM>;
+  def MMX_PHADDSUBD     : OpndItins<IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM>;
+} // Sched = WriteVecALU
+
+// Same itinerary classes as MMX_INTALU_ITINS, but with Sched = WriteVecLogic.
+let Sched = WriteVecLogic in {
+  def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins<IIC_MMX_ALU_RR, IIC_MMX_ALU_RM>;
+}
+
+// Itineraries for MMX multiplies; the same class serves both operand forms.
+let Sched = WriteVecIMul in {
+  def MMX_PMUL_ITINS : OpndItins<IIC_MMX_PMUL, IIC_MMX_PMUL>;
+}
+
+// Itinerary pairs that share Sched = WriteVecIMul.
+let Sched = WriteVecIMul in {
+// PSADBW uses one itinerary class for both operand forms.
+def MMX_PSADBW_ITINS : OpndItins<
+ IIC_MMX_PSADBW, IIC_MMX_PSADBW
+>;
+
+// OpndItins arguments are given register form first and memory form second:
+// every sibling record passes *_RR then *_RM, and the instruction multiclasses
+// read itins.rr for register operands and itins.rm for memory operands. The
+// REG class therefore comes first and the MEM class second.
+def MMX_MISC_FUNC_ITINS : OpndItins<
+ IIC_MMX_MISC_FUNC_REG, IIC_MMX_MISC_FUNC_MEM
+>;
+}
+
+// Itineraries for the MMX shifts: register, memory and immediate count forms.
+def MMX_SHIFT_ITINS
+    : ShiftOpndItins<IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI>;
+
+// Itinerary pairs for unpack, pack and shuffle operations (Sched = WriteShuffle).
+let Sched = WriteShuffle in {
+  def MMX_UNPCK_H_ITINS : OpndItins<IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM>;
+  def MMX_UNPCK_L_ITINS : OpndItins<IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L>;
+  def MMX_PCK_ITINS     : OpndItins<IIC_MMX_PCK_RR, IIC_MMX_PCK_RM>;
+  def MMX_PSHUF_ITINS   : OpndItins<IIC_MMX_PSHUF, IIC_MMX_PSHUF>;
+} // Sched = WriteShuffle
+
+// Itinerary pairs for the packed-double and packed-single MMX conversions.
+let Sched = WriteCvtF2I in {
+  def MMX_CVT_PD_ITINS : OpndItins<IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM>;
+  def MMX_CVT_PS_ITINS : OpndItins<IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM>;
+} // Sched = WriteCvtF2I
+
+// Two-address multiclasses: the Constraints string ties $src1 to $dst for every
+// record defined inside this let.
+let Constraints = "$src1 = $dst" in {
+ // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
+ // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
+ //
+ // opc        - opcode value used by both the irr and irm records.
+ // OpcodeStr  - mnemonic; the operand string is appended to it.
+ // IntId      - intrinsic matched by the selection patterns.
+ // itins      - supplies itins.rr / itins.rm and the Sched class.
+ // Commutable - copied into isCommutable on the register form only.
+ multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ OpndItins itins, bit Commutable = 0> {
+ // Register-register form.
+ def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[itins.Sched]> {
+ let isCommutable = Commutable;
+ }
+ // Register-memory form: $src2 is loaded through load_mmx and bitconverted
+ // before being handed to the intrinsic.
+ def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+
+ // MMXI_binop_rmi_int - Binary operator with register, memory and immediate
+ // second-operand forms. rr/rm use opc and IntId; ri uses opc2 with the
+ // ImmForm format and matches IntId2 on an immediate. The Sched classes are
+ // fixed to WriteVecShift / WriteVecShiftLd rather than taken from itins.
+ multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, Intrinsic IntId,
+ Intrinsic IntId2, ShiftOpndItins itins> {
+ // Second operand in a VR64 register.
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[WriteVecShift]>;
+ // Second operand loaded from memory through load_mmx.
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ // Second operand is an i32u8imm immediate.
+ def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+ (ins VR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>,
+ Sched<[WriteVecShift]>;
+ }
+}
+
+/// Unary MMX instructions requiring SSSE3.
+/// Defines a register-source record (rr64) and a memory-source record (rm64)
+/// that both select on IntId64. This multiclass does not itself tie any
+/// operand to $dst.
+multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, OpndItins itins> {
+ // Register source; scheduled with itins.Sched.
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+
+ // Memory source, loaded through memopmmx and bitconverted for the intrinsic;
+ // scheduled with the folded-load variant of itins.Sched.
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64 (bitconvert (memopmmx addr:$src))))],
+ itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+/// Binary MMX instructions requiring SSSE3.
+/// Both records are defined with ImmT = NoImm and with $src1 tied to $dst.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, OpndItins itins,
+ bit Commutable = 0> {
+ // Register-register form. The unbraced let applies isCommutable to this
+ // record only.
+ let isCommutable = Commutable in
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ // Register-memory form: $src2 is loaded through memopmmx and bitconverted.
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopmmx addr:$src2))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// PALIGN MMX instructions (require SSSE3).
+/// Both records use opcode 0x0F and take two VR64/i64mem sources plus a u8imm,
+/// which the pattern passes to IntId as an i8 immediate. No itinerary argument
+/// is given, and the multiclass sets no Constraints of its own.
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
+ // Second source in a register.
+ def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>,
+ Sched<[WriteShuffle]>;
+ // Second source loaded from memory through load_mmx.
+ def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// sse12_cvt_pint - One-source conversion between register classes SrcRC and
+// DstRC, selected on intrinsic Int.
+//   x86memop / ld_frag - memory operand type and load fragment of the irm form.
+//   asm                - complete assembly string (nothing is appended to it).
+//   itins              - supplies itins.rr / itins.rm and the Sched class.
+//   d                  - execution domain passed through to MMXPI.
+multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, OpndItins itins, Domain d> {
+ // Register source.
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ // Memory source, loaded with ld_frag.
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+// sse12_cvt_pint_3addr - Conversion whose intrinsic takes a DstRC value
+// ($src1) in addition to the SrcRC/memory source ($src2). Both records use
+// NoItinerary and fixed Sched classes (WriteCvtI2F / WriteCvtI2FLd) rather than
+// an OpndItins parameter. The multiclass sets no Constraints of its own.
+multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm, Domain d> {
+ // $src2 in a register.
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
+ (ins DstRC:$src1, SrcRC:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ NoItinerary, d>, Sched<[WriteCvtI2F]>;
+ // $src2 loaded from memory with ld_frag.
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ NoItinerary, d>, Sched<[WriteCvtI2FLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS Instruction
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
+ [(int_x86_mmx_emms)], IIC_MMX_EMMS>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (scalar_to_vector GR32:$src)))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+
+let Predicates = [HasMMX] in {
+ let AddedComplexity = 15 in
+ def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
+ (MMX_MOVD64rr GR32:$src)>;
+ let AddedComplexity = 20 in
+ def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
+ (MMX_MOVD64rm addr:$src)>;
+}
+
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
+ Sched<[WriteStore]>;
+
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (MMX_X86movd2w (x86mmx VR64:$src)))],
+ IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
+
+let isBitcast = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (bitconvert GR64:$src))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst),
+ (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>;
+
+// The GR64 <-> VR64 moves are 64 bit moves, but since the OS X assembler
+// doesn't recognize a register-register movq, we write them as movd.
+// (MMX_MOVQ64rr in this group is VR64-to-VR64 and keeps the movq spelling.)
+let SchedRW = [WriteMove], isBitcast = 1 in {
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
+ (outs GR64:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
+let hasSideEffects = 0 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
+}
+} // SchedRW
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem,
+ (outs), (ins i64mem:$dst, VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>;
+
+let SchedRW = [WriteLoad] in {
+let canFoldAsLoad = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (load_mmx addr:$src))],
+ IIC_MMX_MOVQ_RM>;
+} // SchedRW
+let SchedRW = [WriteStore] in
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (x86mmx VR64:$src), addr:$dst)],
+ IIC_MMX_MOVQ_RM>;
+
+let SchedRW = [WriteMove] in {
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (bitconvert
+ (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))))))],
+ IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64
+ (scalar_to_vector
+ (i64 (bitconvert (x86mmx VR64:$src))))))],
+ IIC_MMX_MOVQ_RR>;
+
+let isCodeGenOnly = 1, hasSideEffects = 1 in {
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
+}
+} // SchedRW
+
+let Predicates = [HasSSE1] in
+def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movntq\t{$src, $dst|$dst, $src}",
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
+ IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
+
+let Predicates = [HasMMX] in {
+ let AddedComplexity = 15 in
+ // movd to MMX register zero-extends
+ def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
+ (MMX_MOVD64rr GR32:$src)>;
+ let AddedComplexity = 20 in
+ def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
+ (MMX_MOVD64rm addr:$src)>;
+}
+
+// Arithmetic Instructions
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
+ MMX_INTALU_ITINS>;
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+ MMX_INTALU_ITINS, 1>;
+let Predicates = [HasSSE2] in
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+ MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+ MMX_PHADDSUBW>;
+
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+ MMX_INTALU_ITINS>;
+let Predicates = [HasSSE2] in
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+ MMX_INTALUQ_ITINS>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+ MMX_INTALU_ITINS>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+ MMX_INTALU_ITINS>;
+
+defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+ MMX_PHADDSUBW>;
+
+// -- Multiplication
+defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+ MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w,
+ MMX_PMUL_ITINS, 1>;
+let Predicates = [HasSSE1] in
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
+ MMX_PMUL_ITINS, 1>;
+let Predicates = [HasSSE2] in
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
+ MMX_PMUL_ITINS, 1>;
+defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmul_hr_sw,
+ MMX_PMUL_ITINS, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
+ MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
+let Predicates = [HasSSE1] in {
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
+ MMX_PSADBW_ITINS, 1>;
+}
+
+defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
+ MMX_MISC_FUNC_ITINS>;
+let Constraints = "$src1 = $dst" in
+ defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
+ MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
+ MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
+ MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
+ MMX_INTALU_ITINS_VECLOGICSCHED>;
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
+ MMX_SHIFT_ITINS>;
+
+def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRLWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRLDrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRLQrm VR64:$src1, addr:$src2)>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
+ MMX_SHIFT_ITINS>;
+
+def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSLLWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSLLDrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSLLQrm VR64:$src1, addr:$src2)>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
+ MMX_SHIFT_ITINS>;
+
+def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRAWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRADrm VR64:$src1, addr:$src2)>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
+ MMX_INTALU_ITINS>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
+ MMX_INTALU_ITINS>;
+
+// -- Unpack Instructions
+defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
+ int_x86_mmx_punpckhbw,
+ MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
+ int_x86_mmx_punpckhwd,
+ MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
+ int_x86_mmx_punpckhdq,
+ MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
+ int_x86_mmx_punpcklbw,
+ MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
+ int_x86_mmx_punpcklwd,
+ MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
+ int_x86_mmx_punpckldq,
+ MMX_UNPCK_L_ITINS>;
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
+ MMX_PCK_ITINS>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
+ MMX_PCK_ITINS>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
+ MMX_PCK_ITINS>;
+
+// -- Shuffle Instructions
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
+ MMX_PSHUF_ITINS>;
+
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
+ IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+ (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (int_x86_sse_pshuf_w (load_mmx addr:$src1),
+ imm:$src2))],
+ IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
+
+
+
+
+// -- Conversion Instructions
+defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
+ f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
+ f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
+ f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
+ f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
+ i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+let Constraints = "$src1 = $dst" in {
+ defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
+ int_x86_sse_cvtpi2ps,
+ i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, PS;
+}
+
+// Extract / Insert
+let Predicates = [HasSSE1] in
+def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+ imm:$src2))],
+ IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+let Constraints = "$src1 = $dst" in {
+let Predicates = [HasSSE1] in {
+ def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
+ (outs VR64:$dst),
+ (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ GR32orGR64:$src2, imm:$src3))],
+ IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
+
+ def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
+ (outs VR64:$dst),
+ (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ (i32 (anyext (loadi16 addr:$src2))),
+ imm:$src3))],
+ IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+}
+
+// Mask creation
+let Predicates = [HasSSE1] in
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR64:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst,
+ (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+
+// Low word of XMM to MMX.
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+ [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
+ (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
+ (x86mmx (MMX_MOVQ64rm addr:$src))>;
+
+// Misc.
+// maskmovq: two records with the same opcode (0xF7) and assembly string, one
+// per addressing mode. Neither has an explicit output; the pattern passes the
+// implicitly used pointer register (EDI or RDI) as the intrinsic's third
+// operand.
+let SchedRW = [WriteShuffle] in {
+// Used outside 64-bit mode: implicit use of EDI.
+let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in
+def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+ IIC_MMX_MASKMOV>;
+// Used in 64-bit mode: implicit use of RDI.
+let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
+ IIC_MMX_MASKMOV>;
+}
+
+// 64-bit bit convert.
+let Predicates = [HasSSE2] in {
+def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
+ (MMX_MOVFR642Qrr FR64:$src)>;
+}
+
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
new file mode 100644
index 000000000000..309f601d1fce
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
@@ -0,0 +1,70 @@
+//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MPX instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { // memory-source forms that define a BNDR register
+ def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
+ Requires<[HasMPX, Not64BitMode]>; // variant selected outside 64-bit mode
+ def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
+ Requires<[HasMPX, In64BitMode]>; // variant selected in 64-bit mode (RI format)
+}
+
+defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
+
+multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { // no outputs: takes a BNDR plus a memory or GPR operand
+ def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[HasMPX, Not64BitMode]>; // memory operand, outside 64-bit mode
+ def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[HasMPX, In64BitMode]>; // memory operand, 64-bit mode
+ def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[HasMPX, Not64BitMode]>; // GR32 operand, outside 64-bit mode
+ def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[HasMPX, In64BitMode]>; // GR64 operand, 64-bit mode
+}
+defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS;
+defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD;
+defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
+
+def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX]>;
+def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, In64BitMode]>;
+
+def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX]>;
+def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, In64BitMode]>;
+
+def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+ "bndstx\t{$src, $dst|$dst, $src}", []>, PS,
+ Requires<[HasMPX]>;
+def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+ "bndldx\t{$src, $dst|$dst, $src}", []>, PS,
+ Requires<[HasMPX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
new file mode 100644
index 000000000000..84119ad5eb35
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
@@ -0,0 +1,24 @@
+//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel SGX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SGX instructions
+
+// ENCLS - Execute an Enclave System Function of Specified Leaf Number
+def ENCLS : I<0x01, MRM_CF, (outs), (ins),
+ "encls", []>, TB;
+
+// ENCLU - Execute an Enclave User Function of Specified Leaf Number
+def ENCLU : I<0x01, MRM_D7, (outs), (ins),
+ "enclu", []>, TB;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 000000000000..9d6a89363044
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,8711 @@
+//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> { // itinerary pair plus scheduling class for one operation
+ InstrItinClass rr = arg_rr; // itinerary used by the register-register form
+ InstrItinClass rm = arg_rm; // itinerary used by the register-memory form
+ // InstrSchedModel info; WriteFAdd unless overridden by 'let Sched = ... in'.
+ X86FoldableSchedWrite Sched = WriteFAdd;
+}
+
+class SizeItins<OpndItins arg_s, OpndItins arg_d> { // bundles the f32 and f64 OpndItins of one operation
+ OpndItins s = arg_s; // single-precision (F32) itineraries, e.g. SSE_ALU_F32S
+ OpndItins d = arg_d; // double-precision (F64) itineraries, e.g. SSE_ALU_F64S
+}
+
+
+class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+ InstrItinClass arg_ri> { // like OpndItins but with a third itinerary and no Sched field
+ InstrItinClass rr = arg_rr; // register-register form
+ InstrItinClass rm = arg_rm; // register-memory form
+ InstrItinClass ri = arg_ri; // presumably the register-immediate form -- TODO confirm against users
+}
+
+
+// scalar
+let Sched = WriteFAdd in {
+def SSE_ALU_F32S : OpndItins<
+ IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
+>;
+
+def SSE_ALU_F64S : OpndItins<
+ IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
+>;
+}
+
+def SSE_ALU_ITINS_S : SizeItins<
+ SSE_ALU_F32S, SSE_ALU_F64S
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32S : OpndItins< // f32 scalar multiply: rr and rm itineraries (Sched comes from the enclosing let)
+ IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
+>;
+
+def SSE_MUL_F64S : OpndItins<
+ IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
+>;
+}
+
+def SSE_MUL_ITINS_S : SizeItins<
+ SSE_MUL_F32S, SSE_MUL_F64S
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32S : OpndItins< // f32 scalar divide: rr and rm itineraries (Sched comes from the enclosing let)
+ IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
+>;
+
+def SSE_DIV_F64S : OpndItins<
+ IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
+>;
+}
+
+def SSE_DIV_ITINS_S : SizeItins<
+ SSE_DIV_F32S, SSE_DIV_F64S
+>;
+
+// parallel
+let Sched = WriteFAdd in {
+def SSE_ALU_F32P : OpndItins<
+ IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
+>;
+
+def SSE_ALU_F64P : OpndItins<
+ IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
+>;
+}
+
+def SSE_ALU_ITINS_P : SizeItins<
+ SSE_ALU_F32P, SSE_ALU_F64P
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32P : OpndItins< // f32 packed multiply: rr and rm itineraries (Sched comes from the enclosing let)
+ IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
+>;
+
+def SSE_MUL_F64P : OpndItins<
+ IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
+>;
+}
+
+def SSE_MUL_ITINS_P : SizeItins<
+ SSE_MUL_F32P, SSE_MUL_F64P
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32P : OpndItins< // f32 packed divide: rr and rm itineraries (Sched comes from the enclosing let)
+ IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
+>;
+
+def SSE_DIV_F64P : OpndItins<
+ IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
+>;
+}
+
+def SSE_DIV_ITINS_P : SizeItins<
+ SSE_DIV_F32P, SSE_DIV_F64P
+>;
+
+let Sched = WriteVecLogic in
+def SSE_VEC_BIT_ITINS_P : OpndItins<
+ IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+def SSE_BIT_ITINS_P : OpndItins<
+ IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+let Sched = WriteVecALU in {
+def SSE_INTALU_ITINS_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+def SSE_INTALUQ_ITINS_P : OpndItins<
+ IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
+>;
+}
+
+let Sched = WriteVecIMul in
+def SSE_INTMUL_ITINS_P : OpndItins<
+ IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
+>;
+
+def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
+ IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
+>;
+
+def SSE_MOVA_ITINS : OpndItins<
+ IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
+>;
+
+def SSE_MOVU_ITINS : OpndItins<
+ IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
+>;
+
+def SSE_DPPD_ITINS : OpndItins<
+ IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
+>;
+
+def SSE_DPPS_ITINS : OpndItins< // DPPS itineraries; both forms use the DPPS (not DPPD) classes
+ IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
+>;
+
+def DEFAULT_ITINS : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+def SSE_EXTRACT_ITINS : OpndItins<
+ IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
+>;
+
+def SSE_INSERT_ITINS : OpndItins<
+ IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
+>;
+
+let Sched = WriteMPSAD in
+def SSE_MPSADBW_ITINS : OpndItins<
+ IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
+>;
+
+let Sched = WriteVecIMul in
+def SSE_PMULLD_ITINS : OpndItins<
+ IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
+>;
+
+// Definitions for backward compatibility.
+// The instructions mapped onto these definitions use a different itinerary
+// than the actual scheduling model.
+let Sched = WriteShuffle in
+def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteVecIMul in
+def DEFAULT_ITINS_VECIMULSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteShuffle in
+def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+let Sched = WriteMPSAD in
+def DEFAULT_ITINS_MPSADSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def DEFAULT_ITINS_FBLENDSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteBlend in
+def DEFAULT_ITINS_BLENDSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteVarBlend in
+def DEFAULT_ITINS_VARBLENDSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+let Sched = WriteBlend in
+def SSE_INTALU_ITINS_BLEND_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 Instructions Classes
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
+multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, X86MemOperand x86memop,
+ Domain d, OpndItins itins, bit Is2Addr = 1> {
+ let isCommutable = 1 in {
+ def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ }
+ def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
+multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
+ SDPatternOperator Int, RegisterClass RC,
+ string asm, Operand memopr,
+ ComplexPattern mem_cpat, Domain d,
+ OpndItins itins, bit Is2Addr = 1> {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// sse12_fp_packed - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_frag,
+ Domain d, OpndItins itins, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+ itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed logical instructions class
+multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
+ string OpcodeStr, X86MemOperand x86memop,
+ list<dag> pat_rr, list<dag> pat_rm,
+ bit Is2Addr = 1> {
+ let isCommutable = 1, hasSideEffects = 0 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ pat_rr, NoItinerary, d>,
+ Sched<[WriteVecLogic]>;
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ pat_rm, NoItinerary, d>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+// A vector extract of the first f32/f64 position is a subregister copy
+def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+
+// A 128-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
+
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
+
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
+ (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
+ (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
+
+// A 128-bit subvector insert to the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+ (COPY_TO_REGCLASS FR32:$src, VR128)>;
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+ (COPY_TO_REGCLASS FR64:$src, VR128)>;
+
+// Bitcasts between 128-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
+def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
+
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+ def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
+ [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
+ def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
+ [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX & SSE - Zero/One Vectors
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor / xorp* for sse.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+}
+
+let Predicates = [NoVLX] in
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+
+
+// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support
+// packed-integer (PI) ops, and doesn't need them: on Sandy Bridge the register
+// is zeroed at the rename stage without using any execution unit, so AVX_SET0
+// can be used for vector integer instructions without penalty.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllZerosV))]>;
+}
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+ def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ let Predicates = [HasAVX2] in
+ def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; Register-to-register
+// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
+// that the insert be implementable in terms of a copy, and, as just mentioned,
+// we don't use movss/movsd for copies.
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string base_opc,
+ string asm_opr, Domain d = GenericDomain> {
+ let isCommutable = 1 in
+ def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, RC:$src2),
+ !strconcat(base_opc, asm_opr),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1,
+ (scalar_to_vector RC:$src2))))],
+ IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;
+
+ // For the disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, RC:$src2),
+ !strconcat(base_opc, asm_opr),
+ [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
+}
+
+multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string OpcodeStr,
+ Domain d = GenericDomain> {
+ // AVX
+ defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
+ VEX_4V, VEX_LIG;
+
+ def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
+ VEX, VEX_LIG, Sched<[WriteStore]>;
+ // SSE1 & 2
+ let Constraints = "$src1 = $dst" in {
+ defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}", d>;
+ }
+
+ def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
+ Sched<[WriteStore]>;
+}
+
+// Loading from memory automatically zeroes the upper bits.
+multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_pat, string OpcodeStr,
+ Domain d = GenericDomain> {
+ def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))],
+ IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+ def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))],
+ IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
+}
+
+defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
+ SSEPackedSingle>, XS;
+defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
+ SSEPackedDouble>, XD;
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
+ SSEPackedSingle>, XS;
+
+ let AddedComplexity = 20 in
+ defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
+ SSEPackedDouble>, XD;
+}
+
+// Patterns
+let Predicates = [UseAVX] in {
+ let AddedComplexity = 20 in {
+ // VMOVSSrm zeros the high parts of the register; represent this with a
+ // COPY_TO_REGCLASS to VR128. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+
+ // VMOVSDrm zeros the high parts of the register; represent this with a
+ // COPY_TO_REGCLASS to VR128. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+
+ // Represent the same patterns above but in the form they appear for
+ // 256-bit types
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
+
+ // Shuffle with VMOVSS
+ def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4i32 VR128:$src1),
+ (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4f32 VR128:$src1),
+ (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;
+
+ // 256-bit variants
+ def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ // Shuffle with VMOVSD
+ def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+ // 256-bit variants
+ def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ // FIXME: Instead of an X86Movlps there should be an X86Movsd here. The problem
+ // is during lowering, where it's not possible to recognize the fold because
+ // it has two uses through a bitcast. One use disappears at isel time and the
+ // fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+let Predicates = [UseSSE1] in {
+ let Predicates = [NoSSE41], AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended: zero a VR128, then do a
+ // MOVSS to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSSrm already zeros the high parts of the register.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
+
+ // Shuffle with MOVSS
+ def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+}
+
+let Predicates = [UseSSE2] in {
+ let Predicates = [NoSSE41], AddedComplexity = 15 in { // NOTE(review): this inner let appears to replace, not extend, the outer Predicates list - confirm intended
+ // Move scalar to XMM zero-extended: zero a VR128 with V_SET0, then do a
+ // MOVSD of the scalar to the lower bits.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+ } // Predicates = [NoSSE41], AddedComplexity = 15
+
+ let AddedComplexity = 20 in {
+ // MOVSDrm already zeros the high parts of the register.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ } // AddedComplexity = 20
+
+ // Shuffle with MOVSD: $src2 is copied to FR64 and fed to MOVSDrr.
+ def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+ // FIXME: Instead of an X86Movlps there should be an X86Movsd here, the problem
+ // is during lowering, where it's not possible to recognize the fold because
+ // it has two uses through a bitcast. One use disappears at isel time and the
+ // fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+} // Predicates = [UseSSE2]
+
+// Aliases to help the assembler pick two-byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>; // maps to the _REV form for VR128L $dst with VR128H $src2
+def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>; // maps to the _REV form for VR128L $dst with VR128H $src2
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, Domain d,
+ OpndItins itins> { // defines <NAME>rr (register move) and <NAME>rm (load through ld_frag)
+let hasSideEffects = 0 in // rr is declared with an empty pattern list
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
+ Sched<[WriteFShuffle]>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in // rm selects a plain ld_frag load of addr:$src
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
+ Sched<[WriteLoad]>;
+} // multiclass sse12_mov_packed
+
+let Predicates = [HasAVX, NoVLX] in { // VEX forms: VR128 first, then VR256 ("Y" suffix, VEX_L)
+defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ PS, VEX;
+defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ PD, VEX;
+defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ PS, VEX;
+defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
+ PD, VEX;
+
+defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ PS, VEX, VEX_L;
+defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ PD, VEX, VEX_L;
+defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ PS, VEX, VEX_L;
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
+ PD, VEX, VEX_L;
+} // Predicates = [HasAVX, NoVLX]
+
+let Predicates = [UseSSE1] in { // packed-single moves: aligned (0x28) and unaligned (0x10)
+defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ PS;
+defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ PS;
+} // Predicates = [UseSSE1]
+let Predicates = [UseSSE2] in { // packed-double moves: aligned (0x28) and unaligned (0x10)
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ PD;
+defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
+ PD;
+} // Predicates = [UseSSE2]
+
+let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { // 0x29 forms match alignedstore/alignedstore256; 0x11 forms match plain store
+def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v8f32 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v4f64 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+} // SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX]
+
+// For disassembler: MRMDestReg (0x29 / 0x11) register forms with no patterns.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteFShuffle] in {
+ def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
+ def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
+ def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
+ def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
+ def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+ def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, SchedRW
+
+// Aliases to help the assembler pick two-byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
+ (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>; // 128-bit: VR128L $dst, VR128H $src
+def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
+ (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
+ (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
+ (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
+ (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>; // 256-bit: VR256L $dst, VR256H $src
+def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
+ (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
+ (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
+ (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
+
+let SchedRW = [WriteStore] in { // 0x29 forms match alignedstore; 0x11 forms match plain store
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>;
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>;
+} // SchedRW = [WriteStore]
+
+// For disassembler: MRMDestReg (0x29 / 0x11) register forms with no patterns.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteFShuffle] in {
+ def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+ def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+ def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>;
+ def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>;
+} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, SchedRW
+
+// Use vmovaps/vmovups for AVX integer load/store.
+let Predicates = [HasAVX, NoVLX] in {
+ // 128-bit load/store
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (VMOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (VMOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+ // 256-bit load/store
+ def : Pat<(alignedloadv4i64 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv4i64 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+ // Special patterns for storing subvector extracts of the lower 128 bits.
+ // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
+ def : Pat<(alignedstore (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+} // Predicates = [HasAVX, NoVLX]
+
+let Predicates = [HasAVX, NoVLX] in {
+ // 128-bit stores of v8i16 / v16i8 (this block holds store patterns only)
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+ // 256-bit stores of v16i16 / v32i8
+ def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+} // Predicates = [HasAVX, NoVLX]
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [UseSSE1] in {
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (MOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>; // aligned stores of every 128-bit integer type use MOVAPSmr
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>; // plain (unaligned) stores use MOVUPSmr
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+} // Predicates = [UseSSE1]
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
+ string base_opc, string asm_opr,
+ InstrItinClass itin> { // defines PSrm / PDrm: combine VR128 $src1 with an f64 loaded from $src2
+ def PSrm : PI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "s", asm_opr),
+ [(set VR128:$dst,
+ (psnode VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
+ itin, SSEPackedSingle>, PS,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+ def PDrm : PI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "d", asm_opr),
+ [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))],
+ itin, SSEPackedDouble>, PD,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+} // multiclass sse12_mov_hilo_packed_base
+
+multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
+ string base_opc, InstrItinClass itin> { // instantiates the base multiclass twice: V#NAME and NAME
+ let Predicates = [UseAVX] in // V#NAME: three-operand asm string, VEX_4V
+ defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ itin>, VEX_4V;
+
+ let Constraints = "$src1 = $dst" in // NAME: two-operand asm string, $dst tied to $src1
+ defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ "\t{$src2, $dst|$dst, $src2}",
+ itin>;
+} // multiclass sse12_mov_hilo_packed
+
+let AddedComplexity = 20 in {
+ defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
+ IIC_SSE_MOV_LH>; // opcode 0x12; load forms matched on X86Movlps / X86Movlpd
+} // AddedComplexity = 20
+
+let SchedRW = [WriteStore] in { // each def stores element 0 of $src, viewed as f64, to $dst
+let Predicates = [UseAVX] in {
+def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>, VEX;
+def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>, VEX;
+}// UseAVX
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>;
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>;
+} // SchedRW = [WriteStore]
+
+let Predicates = [UseAVX] in {
+ // Shuffle with VMOVLPS
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+
+ // Shuffle with VMOVLPD
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns: the load and the store use the same address ($src1).
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+} // Predicates = [UseAVX]
+
+let Predicates = [UseSSE1] in {
+ // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+ def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
+ (iPTR 0))), addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+
+ // Shuffle with MOVLPS
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlps VR128:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns: the load and the store use the same address ($src1).
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+} // Predicates = [UseSSE1]
+
+let Predicates = [UseSSE2] in {
+ // Shuffle with MOVLPD
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns: the load and the store use the same address ($src1).
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+} // Predicates = [UseSSE2]
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Hi packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+ defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
+ IIC_SSE_MOV_LH>; // opcode 0x16; load forms matched on X86Movlhps / X86Movlhpd
+} // AddedComplexity = 20
+
+let SchedRW = [WriteStore] in {
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+let Predicates = [UseAVX] in {
+def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (bc_v2f64 (v4f32 VR128:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+} // UseAVX
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (bc_v2f64 (v4f32 VR128:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+} // SchedRW = [WriteStore]
+
+let Predicates = [UseAVX] in {
+ // VMOVHPS patterns
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), // NOTE(review): the UseSSE1 MOVHPS counterpart uses bc_v4f32 here - confirm
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+ // VMOVHPD patterns
+
+ // FIXME: Instead of X86Unpckl, there should be an X86Movlhpd here, the problem
+ // is during lowering, where it's not possible to recognize the load fold
+ // because it has two uses through a bitcast. One use disappears at isel time
+ // and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+ // Also handle an i64 load because that may get selected as a faster way to
+ // load the data.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDmr addr:$dst, VR128:$src)>; // store of element 0 after X86Movhlps of $src with itself
+
+ def : Pat<(store (f64 (extractelt
+ (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDmr addr:$dst, VR128:$src)>; // store of element 0 after X86VPermilpi with immediate 1
+} // Predicates = [UseAVX]
+
+let Predicates = [UseSSE1] in {
+ // MOVHPS patterns: fold an i64 load (scalar_to_vector or X86vzload) into MOVHPSrm.
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+} // Predicates = [UseSSE1]
+
+let Predicates = [UseSSE2] in {
+ // MOVHPD patterns
+
+ // FIXME: Instead of X86Unpckl, there should be an X86Movlhpd here, the problem
+ // is during lowering, where it's not possible to recognize the load fold
+ // because it has two uses through a bitcast. One use disappears at isel time
+ // and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+ // Also handle an i64 load because that may get selected as a faster way to
+ // load the data.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+ (iPTR 0))), addr:$dst),
+ (MOVHPDmr addr:$dst, VR128:$src)>; // store of element 0 after X86Movhlps of $src with itself
+
+ def : Pat<(store (f64 (extractelt
+ (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (MOVHPDmr addr:$dst, VR128:$src)>; // store of element 0 after X86Shufp with immediate 1
+} // Predicates = [UseSSE2]
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20, Predicates = [UseAVX] in { // three-operand VEX_4V register forms
+ def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>,
+ VEX_4V, Sched<[WriteFShuffle]>;
+ def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>,
+ VEX_4V, Sched<[WriteFShuffle]>;
+} // AddedComplexity = 20, Predicates = [UseAVX]
+let Constraints = "$src1 = $dst", AddedComplexity = 20 in { // two-operand forms with $dst tied to $src1
+ def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ let isCommutable = 1 in // braceless let: applies to MOVHLPSrr only
+ def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+} // Constraints = "$src1 = $dst", AddedComplexity = 20
+
+let Predicates = [UseAVX] in {
+ // MOVLHPS patterns for integer vector types (v4i32, v2i64)
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns for v4i32
+ def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+} // Predicates = [UseAVX]
+
+let Predicates = [UseSSE1] in {
+ // MOVLHPS patterns for integer vector types (v4i32, v2i64)
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns for v4i32
+ def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+} // Predicates = [UseSSE1]
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
+
+def SSE_CVT_PD : OpndItins< // the only def in this group without a 'let Sched' override
+ IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_PS : OpndItins<
+ IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_Scalar : OpndItins<
+ IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_32 : OpndItins<
+ IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_64 : OpndItins<
+ IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SD2SI : OpndItins<
+ IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
+>;
+
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false dependencies (see sse_fp_unop_s for details)
+multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, OpndItins itins> { // rr applies OpNode to SrcRC; rm applies it to an ld_frag load
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [(set DstRC:$dst, (OpNode SrcRC:$src))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
+ itins.rm>, Sched<[itins.Sched.Folded]>;
+} // multiclass sse12_cvt_s
+
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
+ ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
+ string asm, Domain d, OpndItins itins> { // packed sint_to_fp from SrcTy to DstTy: rr and rm forms
+let hasSideEffects = 0 in {
+ def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
+ [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))],
+ itins.rr, d>, Sched<[itins.Sched]>;
+ let mayLoad = 1 in // applies to the rm def only
+ def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
+ [(set RC:$dst, (DstTy (sint_to_fp
+ (SrcTy (bitconvert (ld_frag addr:$src))))))],
+ itins.rm, d>, Sched<[itins.Sched.Folded]>;
+} // hasSideEffects = 0
+} // multiclass sse12_cvt_p
+
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false dependencies (see sse_fp_unop_s for details)
+multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ X86MemOperand x86memop, string asm> { // pattern-less rr/rm defs with an extra DstRC $src1 input
+let hasSideEffects = 0, Predicates = [UseAVX] in {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[WriteCvtI2F]>;
+ let mayLoad = 1 in // applies to the rm def only
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[WriteCvtI2FLd, ReadAfterLd]>;
+} // hasSideEffects = 0, Predicates = [UseAVX]
+} // multiclass sse12_vcvt_avx
+
+let Predicates = [UseAVX] in { // fp_to_sint from FR32/FR64 into GR32/GR64 (opcode 0x2C); GR64 forms add VEX_W
+defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>,
+ XS, VEX, VEX_LIG;
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W, VEX_LIG;
+defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>,
+ XD, VEX, VEX_LIG;
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W, VEX_LIG;
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>; // {l} aliases map to the GR32 defs
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>; // {q} aliases map to the GR64 defs
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+} // Predicates = [UseAVX]
+// The assembler can recognize rr 64-bit instructions by seeing a rxx
+// register, but the same isn't true when only using memory operands.
+// Provide other assembly "l" and "q" forms to address this explicitly
+// where appropriate to do so.
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
+ XS, VEX_4V, VEX_LIG;
+defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
+ XS, VEX_4V, VEX_W, VEX_LIG;
+defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
+ XD, VEX_4V, VEX_LIG;
+defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
+ XD, VEX_4V, VEX_W, VEX_LIG;
+
+let Predicates = [UseAVX] in {
+ // Suffix-less vcvtsi2ss/vcvtsi2sd spellings with a memory operand are
+ // mapped to the i32mem rm forms.
+ // NOTE(review): the vcvtsi2ss alias names FR64 operands although VCVTSI2SS
+ // is instantiated with DstRC = FR32 -- confirm this is intentional.
+ def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+ def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+
+ // sint_to_fp of a loaded integer: select the rm form and feed the extra
+ // $src1 input an IMPLICIT_DEF of the result type.
+ def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+ // sint_to_fp of a GR32/GR64 register: same scheme with the rr form.
+ def : Pat<(f32 (sint_to_fp GR32:$src)),
+ (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f32 (sint_to_fp GR64:$src)),
+ (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+ def : Pat<(f64 (sint_to_fp GR32:$src)),
+ (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f64 (sint_to_fp GR64:$src)),
+ (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>, XS;
+defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>, XS, REX_W;
+defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>, XD;
+defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>, XD, REX_W;
+defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
+ "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XS;
+defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XS, REX_W;
+defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
+ "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XD;
+defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XD, REX_W;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
+
+// Conversion Instructions Intrinsics - Match intrinsics which expect MM
+// and/or XMM operand(s).
+
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false dependencies (see sse_fp_unop_s for details)
+// sse12_cvt_sint - two-operand conversion selected through the intrinsic Int.
+//   rr: applies Int to a SrcRC register.
+//   rm: applies Int to a memory operand of class memop, matched through the
+//       complex pattern mem_cpat.
+// The asm string is asm followed by the "$src, $dst" operand list; itins
+// supplies the rr/rm itineraries and the scheduling class (Folded for rm).
+multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+ string asm, OpndItins itins> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+// sse12_cvt_sint_3addr - intrinsic conversion taking two inputs: DstRC:$src1
+// and a second source that is either a SrcRC register (rr) or a value loaded
+// with ld_frag from x86memop (rm).
+// Is2Addr (default 1) only selects the asm string: when set, $src1 is omitted
+// from the printed operands; when clear, the three-operand form is printed.
+// Tying $src1 to $dst is not done here; instantiations that need it wrap the
+// defm in a Constraints let.
+multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm, OpndItins itins,
+ bit Is2Addr = 1> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [UseAVX] in {
+defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
+ int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
+ SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+ int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
+ SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
+}
+defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+ sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
+defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
+ sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+
+
+// Codegen-only cvtsi2ss/cvtsi2sd forms selected from the x86 intrinsics; the
+// destination and the pass-through $src1 operand are VR128.
+let isCodeGenOnly = 1 in {
+ // AVX variants: Is2Addr = 0, so the three-operand asm string is used.
+ let Predicates = [UseAVX] in {
+ defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V;
+ defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V,
+ VEX_W;
+ defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+ SSE_CVT_Scalar, 0>, XD, VEX_4V;
+ defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+ SSE_CVT_Scalar, 0>, XD,
+ VEX_4V, VEX_W;
+ }
+ // SSE variants: default Is2Addr (two-operand asm) with $src1 tied to $dst.
+ let Constraints = "$src1 = $dst" in {
+ defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32,
+ "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
+ defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64,
+ "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
+ defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32,
+ "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
+ defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64,
+ "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
+ }
+} // isCodeGenOnly = 1
+
+/// SSE 1 Only
+
+// Aliases for intrinsics
+let isCodeGenOnly = 1 in {
+let Predicates = [UseAVX] in {
+defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+ ssmem, sse_load_f32, "cvttss2si",
+ SSE_CVT_SS2SI_32>, XS, VEX;
+defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si", SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W;
+defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+ sdmem, sse_load_f64, "cvttsd2si",
+ SSE_CVT_SD2SI>, XD, VEX;
+defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si", SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W;
+}
+defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+ ssmem, sse_load_f32, "cvttss2si",
+ SSE_CVT_SS2SI_32>, XS;
+defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
+defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+ sdmem, sse_load_f64, "cvttsd2si",
+ SSE_CVT_SD2SI>, XD;
+defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+} // isCodeGenOnly = 1
+
+let Predicates = [UseAVX] in {
+defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
+}
+defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_32>, XS;
+defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_64>, XS, REX_W;
+
+defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ PS, VEX, Requires<[HasAVX, NoVLX]>;
+defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>;
+
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ PS, Requires<[UseSSE2]>;
+
+let Predicates = [UseAVX] in {
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+}
+
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+
+/// SSE 2 Only
+
+// Convert scalar double to scalar single
+// AVX three-operand cvtsd2ss on scalar FP register classes (FR64 inputs,
+// FR32 result). Both defs have an empty pattern list, so hasSideEffects and
+// mayLoad are given explicitly.
+// NOTE(review): the rr form writes the mnemonic as "cvtsd2ss" under VSDI
+// while the rm form writes "vcvtsd2ss" under I -- presumably VSDI prepends
+// the "v"; confirm against the VSDI class definition.
+let hasSideEffects = 0, Predicates = [UseAVX] in {
+def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2F]>;
+// The memory form additionally requires OptForSize.
+let mayLoad = 1 in
+def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
+ (ins FR64:$src1, f64mem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], IIC_SSE_CVT_Scalar_RM>,
+ XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+ Requires<[UseAVX]>;
+
+def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fpround FR64:$src))],
+ IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
+def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fpround (loadf64 addr:$src)))],
+ IIC_SSE_CVT_Scalar_RM>,
+ XD,
+ Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+// Codegen-only VR128 forms of cvtsd2ss selected from int_x86_sse2_cvtsd2ss.
+let isCodeGenOnly = 1 in {
+// AVX forms: three-operand asm, gated on HasAVX.
+def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2F]>;
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+ VR128:$src1, sse_load_f64:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+
+// SSE2 forms: $src1 is tied to $dst and is not printed in the asm string.
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2F]>;
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+ VR128:$src1, sse_load_f64:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+} // isCodeGenOnly = 1
+
+// Convert scalar single to scalar double
+// SSE2 instructions with XS prefix
+// AVX three-operand cvtss2sd on scalar FP register classes (FR32 inputs,
+// FR64 result). Both defs have an empty pattern list, so hasSideEffects and
+// mayLoad are given explicitly.
+let hasSideEffects = 0, Predicates = [UseAVX] in {
+def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], IIC_SSE_CVT_Scalar_RR>,
+ XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2F]>;
+// The memory form additionally requires OptForSize.
+let mayLoad = 1 in
+def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
+ (ins FR32:$src1, f32mem:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], IIC_SSE_CVT_Scalar_RM>,
+ XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// AVX selection patterns for f32 -> f64 extension.
+// Register source: $src is passed as both inputs of VCVTSS2SDrr.
+def : Pat<(f64 (fpextend FR32:$src)),
+ (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
+// fpextend of an explicit load: rm form with an IMPLICIT_DEF first input.
+def : Pat<(fpextend (loadf32 addr:$src)),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+
+// extloadf32: with OptForSize the load is folded into the rm form; with
+// OptForSpeed the value is loaded by VMOVSSrm and converted by the rr form.
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[UseAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+ Requires<[UseAVX, OptForSpeed]>;
+
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (fpextend FR32:$src))],
+ IIC_SSE_CVT_Scalar_RR>, XS,
+ Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (extloadf32 addr:$src))],
+ IIC_SSE_CVT_Scalar_RM>, XS,
+ Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+// extload f32 -> f64. This matches load+fpextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.
+// Since these loads aren't folded into the fpextend, we have to match it
+// explicitly here.
+def : Pat<(fpextend (loadf32 addr:$src)),
+ (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(extloadf32 addr:$src),
+ (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
+
+// Codegen-only VR128 forms of cvtss2sd selected from int_x86_sse2_cvtss2sd.
+let isCodeGenOnly = 1 in {
+// AVX forms: three-operand asm, gated on HasAVX.
+def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2F]>;
+def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+// SSE2 forms: $src1 is tied to $dst and is not printed in the asm string.
+let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2F]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+} // isCodeGenOnly = 1
+
+// Convert packed single/double fp to doubleword
+def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+
+// Convert Packed Double FP to Packed DW Integers
+let Predicates = [HasAVX, NoVLX] in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtF2I]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
+def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
+ Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
+
+// YMM only
+def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
+}
+
+def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+
+// Convert with truncation packed single/double fp to doubleword
+// SSE2 packed instructions with XS prefix
+let Predicates = [HasAVX, NoVLX] in {
+def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v8i32 (fp_to_sint (v8f32 VR256:$src))))],
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+ Sched<[WriteCvtF2ILd]>;
+}
+
+def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (memopv4f32 addr:$src))))],
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [HasAVX, NoVLX] in
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+
+// XMM only
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
+let Predicates = [HasAVX, NoVLX] in
+def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
+
+// YMM only
+// 256-bit source, 128-bit result. These forms are matched from the generic
+// fp_to_sint node (v4f64 -> v4i32), whereas the XMM forms of this instruction
+// are matched from X86cvttp2si. The rm asm string carries an optional {y}
+// suffix.
+let Predicates = [HasAVX, NoVLX] in {
+def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (v4f64 VR256:$src))))],
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+}
+def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
+
+let Predicates = [HasAVX, NoVLX] in {
+ // Select the plain 128-bit conversion when its v4i32 result is bitcast to
+ // v2i64 and wrapped in X86vzmovl.
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (VCVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (VCVTTPD2DQrr VR128:$src)>;
+ }
+} // Predicates = [HasAVX, NoVLX]
+
+def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [UseSSE2] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (CVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (CVTTPD2DQrr VR128:$src)>;
+ }
+} // Predicates = [UseSSE2]
+
+// Convert packed single to packed double
+let Predicates = [HasAVX, NoVLX] in {
+ // SSE2 instructions without OpSize prefix
+def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
+def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
+def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
+}
+
+// Convert Packed DW Integers to Packed Double FP
+// AVX vcvtdq2pd. The 128-bit forms are matched from X86VSintToFP
+// (v4i32 -> v2f64) and the memory form takes an i64mem operand; the 256-bit
+// forms are matched from the generic sint_to_fp (v4i32 -> v4f64) and the
+// memory form takes an i128mem operand.
+let Predicates = [HasAVX, NoVLX] in {
+// The brace-less let below covers VCVTDQ2PDrm only.
+let hasSideEffects = 0, mayLoad = 1 in
+def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+ VEX, Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtI2F]>;
+def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+ VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtI2F]>;
+}
+
+// SSE2 cvtdq2pd, matched from X86VSintToFP (v4i32 -> v2f64). The memory form
+// takes an i64mem operand. The brace-less let below covers CVTDQ2PDrm only.
+// Itinerary classes follow the form: _RM for the memory form, _RR for the
+// register form, as for the other rr/rm pairs in this file.
+let hasSideEffects = 0, mayLoad = 1 in
+def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
+def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (v4i32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;
+
+// AVX register conversion intrinsics
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (VCVTDQ2PDrm addr:$src)>;
+} // Predicates = [HasAVX, NoVLX]
+
+// SSE2 register conversion intrinsics
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (CVTDQ2PDrm addr:$src)>;
+} // Predicates = [UseSSE2]
+
+// Convert packed double to packed single
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+let Predicates = [HasAVX, NoVLX] in
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
+let Predicates = [HasAVX, NoVLX] in
+def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
+
+// YMM only
+// 256-bit source, 128-bit result. These forms are matched from the generic
+// fpround node, whereas the XMM forms of this instruction are matched from
+// X86vfpround. The rm asm string carries an optional {y} suffix.
+let Predicates = [HasAVX, NoVLX] in {
+def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (fpround VR256:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+}
+def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0>;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
+
+// AVX 256-bit register conversion intrinsics
+// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
+// whenever possible to avoid declaring two versions of each one.
+
+let Predicates = [HasAVX, NoVLX] in {
+ // Fold X86vzmovl of the bitconverted 128-bit X86vfpround result into a plain VCVTPD2PSrr.
+ let AddedComplexity = 15 in // presumably raises this pattern's selection priority -- confirm
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+ (VCVTPD2PSrr VR128:$src)>; // the X86vzmovl is dropped; presumably the convert already zeroes the upper half -- confirm
+} // Predicates = [HasAVX, NoVLX]
+
+let Predicates = [UseSSE2] in {
+ // Fold X86vzmovl of the bitconverted 128-bit X86vfpround result into a plain CVTPD2PSrr.
+ let AddedComplexity = 15 in // presumably raises this pattern's selection priority -- confirm
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+ (CVTPD2PSrr VR128:$src)>; // non-VEX counterpart of the HasAVX/NoVLX pattern
+} // Predicates = [UseSSE2]
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions (opcode 0xC2): rr/rm with ISel patterns plus rr_alt/rm_alt parser-only forms
+multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+ Operand CC, SDNode OpNode, ValueType VT,
+ PatFrag ld_frag, string asm, string asm_alt, // asm: condition-code mnemonic form; asm_alt: explicit-immediate form
+ OpndItins itins, ImmLeaf immLeaf> { // immLeaf constrains the $cc immediate matched by the patterns
+ let isCommutable = 1 in // applies to the rr form only
+ def rr : SIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm : SIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ [(set RC:$dst, (OpNode (VT RC:$src1),
+ (ld_frag addr:$src2), immLeaf:$cc))], // second operand folded from memory through ld_frag
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { // no patterns on these, so the flags are stated explicitly
+ def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
+ IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>; // NOTE(review): itinerary is hard-coded instead of itins.rr -- confirm for f64 uses
+ let mayLoad = 1 in
+ def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
+ IIC_SSE_ALU_F32S_RM>, // NOTE(review): hard-coded as well, instead of itins.rm
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, // VEX_4V scalar f32 compare on FR32
+ "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", // three-operand asm: separate $dst and $src1
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG; // patterns accept $cc through the i8immZExt5 leaf
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, // VEX_4V scalar f64 compare on FR64
+ "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
+ XD, VEX_4V, VEX_LIG;
+
+let Constraints = "$src1 = $dst" in { // two-operand forms: $dst is tied to $src1
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
+ "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", // asm omits $src1 because it is tied to $dst
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
+ i8immZExt3>, XS; // SSECC operand with the i8immZExt3 leaf (VEX forms use AVXCC / i8immZExt5)
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
+ "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SSE_ALU_F64S, i8immZExt3>, XD; // uses the F64S itineraries, whereas VCMPSD uses F32S
+} // Constraints = "$src1 = $dst"
+
+multiclass sse12_cmp_scalar_int<Operand memop, Operand CC, // scalar compares (0xC2) matched from an intrinsic, operating on VR128
+ Intrinsic Int, string asm, OpndItins itins,
+ ImmLeaf immLeaf, ComplexPattern mem_cpat> { // mem_cpat matches the memory operand of the rm form
+ def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src, CC:$cc), asm,
+ [(set VR128:$dst, (Int VR128:$src1,
+ VR128:$src, immLeaf:$cc))], // pattern is the intrinsic call itself
+ itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, memop:$src, CC:$cc), asm,
+ [(set VR128:$dst, (Int VR128:$src1,
+ mem_cpat:$src, immLeaf:$cc))],
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+} // no *_alt explicit-immediate forms here, unlike sse12_cmp_scalar
+
+let isCodeGenOnly = 1 in {
+ // Aliases to match intrinsics which expect XMM operand(s); same mnemonics as the FR32/FR64 compares.
+ defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
+ "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
+ XS, VEX_4V; // NOTE(review): no VEX_LIG here although VCMPSS carries it -- confirm intended
+ defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
+ "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
+ XD, VEX_4V; // NOTE(review): no VEX_LIG here although VCMPSD carries it -- confirm intended
+ let Constraints = "$src1 = $dst" in { // non-VEX forms: $dst tied to $src1
+ defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
+ defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F64S, i8immZExt3, sse_load_f64>,
+ XD;
+} // Constraints = "$src1 = $dst"
+} // isCodeGenOnly = 1
+
+
+// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS (rr and rm forms, no register outputs)
+multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
+ ValueType vt, X86MemOperand x86memop,
+ PatFrag ld_frag, string OpcodeStr> { // OpcodeStr is the bare mnemonic; the operand text is appended below
+ def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], // the only result of the pattern is EFLAGS
+ IIC_SSE_COMIS_RR>,
+ Sched<[WriteFAdd]>;
+ def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1),
+ (ld_frag addr:$src2)))], // second operand folded from memory through ld_frag
+ IIC_SSE_COMIS_RM>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Defs = [EFLAGS] in { // every compare instantiated below is declared to define EFLAGS
+ defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+ "ucomiss">, PS, VEX, VEX_LIG; // scalar FR32 form selected for X86cmp
+ defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+ "ucomisd">, PD, VEX, VEX_LIG;
+ let Pattern = []<dag> in { // patterns cleared: OpNode is passed as undef, so these carry no ISel pattern
+ defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
+ "comiss">, PS, VEX, VEX_LIG;
+ defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
+ "comisd">, PD, VEX, VEX_LIG;
+ }
+
+ let isCodeGenOnly = 1 in { // VR128 variants matched from X86ucomi / X86comi, reusing the same mnemonics
+ defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, PS, VEX;
+ defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, PD, VEX;
+
+ defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
+ load, "comiss">, PS, VEX;
+ defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
+ load, "comisd">, PD, VEX;
+ }
+ defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+ "ucomiss">, PS; // non-VEX counterparts start here
+ defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+ "ucomisd">, PD;
+
+ let Pattern = []<dag> in { // as above: COMISS/COMISD get no ISel pattern
+ defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
+ "comiss">, PS;
+ defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
+ "comisd">, PD;
+ }
+
+ let isCodeGenOnly = 1 in { // non-VEX VR128 variants for X86ucomi / X86comi
+ defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, PS;
+ defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, PD;
+
+ defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
+ "comiss">, PS;
+ defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
+ "comisd">, PD;
+ }
+} // Defs = [EFLAGS]
+
+// sse12_cmp_packed - sse 1 & 2 compare packed instructions (opcode 0xC2): rri/rmi matched from an intrinsic plus parser-only *_alt forms
+multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
+ Operand CC, Intrinsic Int, string asm,
+ string asm_alt, Domain d, ImmLeaf immLeaf, // asm_alt: explicit-immediate syntax; d: execution domain passed to PIi8
+ PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
+ let isCommutable = 1 in // applies to the rri form only
+ def rri : PIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
+ itins.rr, d>,
+ Sched<[WriteFAdd]>; // NOTE(review): scheduling class is fixed here, not taken from itins.Sched
+ def rmi : PIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
+ itins.rm, d>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { // no patterns on these, so the flags are stated explicitly
+ def rri_alt : PIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
+ asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
+ let mayLoad = 1 in
+ def rmi_alt : PIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
+ asm_alt, [], itins.rm, d>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+ }
+}
+
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, // 128-bit packed single, VEX_4V
+ "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, // 128-bit packed double, VEX_4V
+ "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, // 256-bit packed single, VEX_4V + VEX_L
+ "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, // 256-bit packed double, VEX_4V + VEX_L
+ "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L; // all four rely on the default itins (SSE_ALU_F32P)
+let Constraints = "$src1 = $dst" in { // two-operand forms: $dst is tied to $src1
+ defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
+ "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+ "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS; // NOTE(review): i8immZExt5 with SSECC; the scalar SSE compares use i8immZExt3 -- confirm
+ defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
+ "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+ "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD; // NOTE(review): same i8immZExt5 question as CMPPS
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX] in { // select the X86cmpp node (not the intrinsic) to the VEX packed compares
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
+ (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
+ (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), // 256-bit variants follow
+ (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
+ (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+ (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
+ (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+} // Predicates = [HasAVX]; $cc is matched with plain imm, not an ImmLeaf
+
+let Predicates = [UseSSE1] in { // v4f32 X86cmpp -> non-VEX CMPPS
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)), // memory operand via memopv4f32
+ (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+} // Predicates = [UseSSE1]
+
+let Predicates = [UseSSE2] in { // v2f64 X86cmpp -> non-VEX CMPPD
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)), // memory operand via memopv2f64
+ (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+} // Predicates = [UseSSE2]
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Shuffle Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_shuffle - sse 1 & 2 fp shuffle instructions (opcode 0xC6); defines rmi and rri, both matching X86Shufp
+multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
+ ValueType vt, string asm, PatFrag mem_frag, // mem_frag: load fragment used by the rmi pattern
+ Domain d> {
+ def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+ (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, // $src3 is the i8 immediate operand of X86Shufp
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+ (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+ Sched<[WriteFShuffle]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in { // VEX_4V shuffles: 128-bit and 256-bit (VEX_L), single and double
+ defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+ "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv4f32, SSEPackedSingle>, PS, VEX_4V;
+ defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+ "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
+ defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv2f64, SSEPackedDouble>, PD, VEX_4V;
+ defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
+} // Predicates = [HasAVX, NoVLX]
+let Constraints = "$src1 = $dst" in { // non-VEX shuffles: $dst tied to $src1, memory operands via memop fragments
+ defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ memopv4f32, SSEPackedSingle>, PS;
+ defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ memopv2f64, SSEPackedDouble>, PD;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX, NoVLX] in { // integer-typed X86Shufp is selected to the FP shuffle instructions
+ def : Pat<(v4i32 (X86Shufp VR128:$src1,
+ (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), // load is matched as v2i64 and bitcast to v4i32
+ (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+ def : Pat<(v2i64 (X86Shufp VR128:$src1,
+ (loadv2i64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+ // 256-bit patterns
+ def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v8i32 (X86Shufp VR256:$src1,
+ (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), // load is matched as v4i64 and bitcast to v8i32
+ (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+ def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v4i64 (X86Shufp VR256:$src1,
+ (loadv4i64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+} // Predicates = [HasAVX, NoVLX]
+
+let Predicates = [UseSSE1] in { // v4i32 X86Shufp -> non-VEX SHUFPS
+ def : Pat<(v4i32 (X86Shufp VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), // memop matched as v2i64 and bitcast to v4i32
+ (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+} // Predicates = [UseSSE1]
+
+let Predicates = [UseSSE2] in {
+ // Generic SHUFPD patterns: v2i64 X86Shufp is selected to the non-VEX SHUFPD forms
+ def : Pat<(v2i64 (X86Shufp VR128:$src1,
+ (memopv2i64 addr:$src2), (i8 imm:$imm))),
+ (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+} // Predicates = [UseSSE2]
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Unpack FP Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave; defines rr and rm forms matching OpNode
+multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
+ PatFrag mem_frag, RegisterClass RC,
+ X86MemOperand x86memop, string asm,
+ Domain d, bit IsCommutable = 0> { // IsCommutable defaults to 0 and is forwarded to rr only
+ let isCommutable = IsCommutable in
+ def rr : PI<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1, RC:$src2)))],
+ IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
+ def rm : PI<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1,
+ (mem_frag addr:$src2))))], // second operand folded from memory through mem_frag
+ IIC_SSE_UNPCK, d>,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in { // VEX_4V unpacks: opcode 0x15 = X86Unpckh, 0x14 = X86Unpckl
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
+ VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, PS, VEX_4V;
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
+ VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, PD, VEX_4V;
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
+ VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, PS, VEX_4V;
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
+ VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, PD, VEX_4V;
+
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, // 256-bit (VEX_L) variants follow
+ VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
+ VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, PD, VEX_4V, VEX_L;
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
+ VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
+ VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, PD, VEX_4V, VEX_L;
+}// Predicates = [HasAVX, NoVLX]
+let Constraints = "$src1 = $dst" in { // non-VEX unpacks: $dst tied to $src1, memory operands via memop fragments
+ defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+ VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, PS;
+ defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+ VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ SSEPackedDouble, 1>, PD; // NOTE(review): the only unpack here passing IsCommutable = 1 -- presumably relies on custom commute handling; confirm
+ defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+ VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, PS;
+ defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+ VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ SSEPackedDouble>, PD;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX1Only] in { // 256-bit integer unpacks are selected to the FP unpack instructions
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), // load matched as v4i64, bitcast to v8i32
+ (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+ (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), // 64-bit elements use the PD forms
+ (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
+ (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+} // Predicates = [HasAVX1Only]
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Extract Floating-Point Sign mask
+//===----------------------------------------------------------------------===//
+
+/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask (opcode 0x50, register form only)
+multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
+ string asm, Domain d> { // asm is the bare mnemonic; the operand text is appended below
+ def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>, // X86movmsk result lands in a GR32orGR64 register
+ Sched<[WriteVecLogic]>;
+}
+
+let Predicates = [HasAVX] in { // VEX forms for 128-bit and 256-bit (VEX_L) sources
+ defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
+ SSEPackedSingle>, PS, VEX;
+ defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
+ SSEPackedDouble>, PD, VEX;
+ defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
+ SSEPackedSingle>, PS, VEX, VEX_L;
+ defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
+ SSEPackedDouble>, PD, VEX, VEX_L;
+} // Predicates = [HasAVX]
+
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", // non-VEX v4f32 form
+ SSEPackedSingle>, PS;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", // non-VEX v2f64 form
+ SSEPackedDouble>, PD;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm - Simple SSE2 binary operator; rr and rm forms whose asm string is chosen by Is2Addr.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, OpndItins itins,
+ bit IsCommutable, bit Is2Addr> { // IsCommutable is applied to rr only
+ let isCommutable = IsCommutable in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr, // Is2Addr: two-operand asm text, otherwise three-operand
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)))))], // loaded value is bitconverted before OpNode is applied
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, // expands one op into V<NAME>, <NAME> and V<NAME>Y
+ ValueType OpVT128, ValueType OpVT256,
+ OpndItins itins, bit IsCommutable = 0, Predicate prd> { // prd is added to both VEX predicate lists
+let Predicates = [HasAVX, prd] in
+ defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, // 128-bit VEX form, "v"-prefixed mnemonic
+ VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, // non-VEX form, Is2Addr = 1, $dst tied to $src1
+ memopv2i64, i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2, prd] in
+ defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, // 256-bit VEX form (VEX_L), requires HasAVX2
+ OpVT256, VR256, loadv4i64, i256mem, itins,
+ IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
+// These are ordered here for pattern ordering requirements with the fp versions; do not move them relative to the FP logical ops below.
+
+defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 1, NoVLX>; // commutable
+defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 1, NoVLX>; // commutable
+defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 1, NoVLX>; // commutable
+defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 0, NoVLX>; // IsCommutable = 0 for the and-not form
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Logical Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
+/// Expands to V<NAME>PSY/PDY/PS/PD (HasAVX, NoVLX) and <NAME>PS/PD; all patterns apply OpNode to integer-bitcast operands.
+multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasAVX, NoVLX] in {
+ defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f256mem,
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), // rr pattern: both inputs bitcast to v4i64
+ (bc_v4i64 (v8f32 VR256:$src2))))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), // rm pattern: second input is a v4i64 load
+ (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L; // trailing 0: presumably the helper's two-address flag -- confirm
+
+ defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f256mem,
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+ (bc_v4i64 (v4f64 VR256:$src2))))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+ (loadv4i64 addr:$src2)))], 0>,
+ PD, VEX_4V, VEX_L;
+
+ defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), // 128-bit forms bitcast to v2i64
+ (bc_v2i64 (v4f32 VR128:$src2))))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
+
+ defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (loadv2i64 addr:$src2)))], 0>,
+ PD, VEX_4V;
+ }
+
+ let Constraints = "$src1 = $dst" in { // non-VEX forms: $dst tied to $src1, loads via memopv2i64
+ defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (bc_v2i64 (v4f32 VR128:$src2))))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>, PS; // final helper argument left at its default here
+
+ defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>, PD;
+ }
+}
+
+defm AND : sse12_fp_packed_logical<0x54, "and", and>; // yields ANDPS/ANDPD and the V-prefixed variants
+defm OR : sse12_fp_packed_logical<0x56, "or", or>;
+defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
+let isCommutable = 0 in // the and-not forms are explicitly marked non-commutable
+ defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
+
+// If only AVX1 is supported, we need to handle integer operations with
+// floating point instructions since the integer versions aren't available.
+let Predicates = [HasAVX1Only] in { // v4i64 and/or/xor/andnp -> 256-bit V*PSY instructions
+ def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
+ (VANDPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
+ (VORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
+ (VXORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
+ (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), // memory forms: result type is not spelled out, the load is v4i64
+ (VANDPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
+ (VORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
+ (VXORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
+ (VANDNPSYrm VR256:$src1, addr:$src2)>;
+} // Predicates = [HasAVX1Only]
+
+let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
+ // Use packed logical operations for scalar ops: copy both scalars to VR128, apply the packed op, copy the result back.
+ def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VANDPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VXORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VANDNPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+
+ def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), // f32 variants use the PS instructions and FR32
+ (COPY_TO_REGCLASS (VANDPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VXORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VANDNPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+} // Predicates = [HasAVX, NoVLX_Or_NoDQI]
+
+let Predicates = [UseSSE1] in {
+ // Use packed logical operations for scalar ops: f32 values go through VR128 and the non-VEX PS instructions.
+ def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (ANDPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (ORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (XORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (ANDNPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+} // Predicates = [UseSSE1]
+
+let Predicates = [UseSSE2] in {
+ // Use packed logical operations for scalar ops: f64 values go through VR128 and the non-VEX PD instructions.
+ def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (ANDPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (ORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (XORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (ANDNPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+} // Predicates = [UseSSE2]
+
+// Patterns for packed operations when we don't have integer type available (v4f32 X86f* nodes -> non-VEX *PS).
+def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)), // NOTE(review): none of these patterns sits under a Predicates wrapper -- confirm intended
+ (ANDPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
+ (ORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
+ (XORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
+ (ANDNPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)), // memory forms: result type not spelled out, operand via memopv4f32
+ (ANDPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
+ (ORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
+ (XORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
+ (ANDNPSrm VR128:$src1, addr:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
+/// vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem.
+///
+
+/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
+/// classes below
+multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, // packed forms: V<NAME>PS/PD(/Y) and <NAME>PS/PD
+ SDNode OpNode, SizeItins itins> { // itins.s feeds the "ps" forms, itins.d the "pd" forms
+ let Predicates = [HasAVX, NoVLX] in {
+ defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ VR128, v4f32, f128mem, loadv4f32,
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V; // trailing 0: presumably the helper's two-address flag -- confirm
+ defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ VR128, v2f64, f128mem, loadv2f64,
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
+
+ defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), // 256-bit (VEX_L) variants
+ OpNode, VR256, v8f32, f256mem, loadv8f32,
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
+ defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, VR256, v4f64, f256mem, loadv4f64,
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
+ }
+
+ let Constraints = "$src1 = $dst" in { // non-VEX forms: $dst tied to $src1, loads via memop fragments
+ defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+ v4f32, f128mem, memopv4f32, SSEPackedSingle,
+ itins.s>, PS;
+ defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+ v2f64, f128mem, memopv2f64, SSEPackedDouble,
+ itins.d>, PD;
+ }
+}
+
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, // scalar forms on FR32/FR64: V<NAME>SS/SD and <NAME>SS/SD
+ SizeItins itins> {
+ defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+ OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
+ XS, VEX_4V, VEX_LIG; // NOTE(review): the VEX scalar forms carry no Predicates wrapper here -- confirm intended
+ defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+ OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
+ XD, VEX_4V, VEX_LIG;
+
+ let Constraints = "$src1 = $dst" in { // non-VEX forms: $dst tied to $src1
+ defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+ OpNode, FR32, f32mem, SSEPackedSingle,
+ itins.s>, XS;
+ defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+ OpNode, FR64, f64mem, SSEPackedDouble,
+ itins.d>, XD;
+ }
+}
+
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+ SDPatternOperator IntSS,
+ SDPatternOperator IntSD,
+ SizeItins itins> { // intrinsic scalar forms: operate on whole VR128 values, built on sse12_fp_scalar_int
+ defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
+ SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
+ defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
+ SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;
+ // Legacy SSE forms: $src1 is tied to $dst.
+ let Constraints = "$src1 = $dst" in {
+ defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
+ SSEPackedSingle, itins.s>, XS;
+ defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
+ SSEPackedDouble, itins.d>, XD;
+ }
+}
+
+// Binary Arithmetic instructions: each defm combines the packed, scalar and scalar-intrinsic multiclasses.
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag,
+ SSE_ALU_ITINS_S>; // add/mul/sub/div pass null_frag for both intrinsic operators
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
+ basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag,
+ SSE_MUL_ITINS_S>;
+let isCommutable = 0 in { // sub, div, max and min are explicitly marked non-commutable
+ defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag,
+ SSE_ALU_ITINS_S>;
+ defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag,
+ SSE_DIV_ITINS_S>;
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss,
+ int_x86_sse2_max_sd, SSE_ALU_ITINS_S>; // max/min pass the SSE/SSE2 scalar intrinsics
+ defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss,
+ int_x86_sse2_min_sd, SSE_ALU_ITINS_S>;
+}
+
+let isCodeGenOnly = 1 in { // X86fmaxc/X86fminc variants: same opcodes and mnemonics as MAX/MIN above
+ defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>; // packed + scalar only, no _Int forms
+ defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
+}
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// either:
+//
+// (1) a scalar fp operation followed by a blend
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// A[0] += B[0];
+// return A;
+// }
+//
+// Previously we generated:
+// addss %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// __m128 C = A + B;
+// return (__m128) {C[0], A[1], A[2], A[3]};
+// }
+//
+// Previously we generated:
+// addps %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+
+// TODO: Some canonicalization in lowering would simplify the number of
+// patterns we have to try to match.
+multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { // folds "Op on lane 0 + insert" into <OpcPrefix>SSrr_Int
+ let Predicates = [UseSSE1] in {
+ // extracted scalar math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // vector math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+ }
+
+ // With SSE 4.1, blendi is preferred to movss, so match that too.
+ let Predicates = [UseSSE41] in {
+ // extracted scalar math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // vector math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>;
+
+ }
+
+ // Repeat everything for AVX, selecting the "V"-prefixed instruction instead.
+ let Predicates = [UseAVX] in {
+ // extracted scalar math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // vector math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+ }
+}
+
+defm : scalar_math_f32_patterns<fadd, "ADD">; // instantiated for add, sub, mul and div only
+defm : scalar_math_f32_patterns<fsub, "SUB">;
+defm : scalar_math_f32_patterns<fmul, "MUL">;
+defm : scalar_math_f32_patterns<fdiv, "DIV">;
+
+multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { // folds "Op on lane 0 + insert" into <OpcPrefix>SDrr_Int
+ let Predicates = [UseSSE2] in {
+ // extracted scalar math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // vector math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ }
+
+ // With SSE 4.1, blendi is preferred to movsd, so match those too.
+ let Predicates = [UseSSE41] in {
+ // extracted scalar math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // vector math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ }
+
+ // Repeat everything for AVX, selecting the "V"-prefixed instruction instead.
+ let Predicates = [UseAVX] in {
+ // extracted scalar math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // vector math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ }
+}
+
+defm : scalar_math_f64_patterns<fadd, "ADD">; // instantiated for add, sub, mul and div only
+defm : scalar_math_f64_patterns<fsub, "SUB">;
+defm : scalar_math_f64_patterns<fmul, "MUL">;
+defm : scalar_math_f64_patterns<fdiv, "DIV">;
+
+
+/// Unop Arithmetic
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+
+let Sched = WriteFSqrt in { // sqrt itineraries: each OpndItins takes an _RR and an _RM itinerary class
+def SSE_SQRTPS : OpndItins<
+ IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
+>;
+
+def SSE_SQRTSS : OpndItins<
+ IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
+>;
+
+def SSE_SQRTPD : OpndItins<
+ IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
+>;
+
+def SSE_SQRTSD : OpndItins<
+ IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
+>;
+}
+
+let Sched = WriteFRsqrt in { // reciprocal square root: single-precision packed and scalar only
+def SSE_RSQRTPS : OpndItins<
+ IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
+>;
+
+def SSE_RSQRTSS : OpndItins<
+ IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
+>;
+}
+
+let Sched = WriteFRcp in { // reciprocal: one packed (RCPP) and one scalar (RCPS) record
+def SSE_RCPP : OpndItins<
+ IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
+>;
+
+def SSE_RCPS : OpndItins<
+ IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
+>;
+}
+
+/// sse_fp_unop_s - SSE1 unops in scalar form
+/// For the non-AVX defs, we need $src1 to be tied to $dst because
+/// the HW instructions are 2 operand / destructive.
+///
+/// Records produced:
+///   r, m         - plain scalar forms on RC, selected from OpNode. The memory
+///                  form additionally requires OptForSize.
+///   r_Int, m_Int - codegen-only forms on VR128 with $src1 tied to $dst. They
+///                  carry no pattern of their own and are selected by the
+///                  Pat<> records at the end of this multiclass.
+/// Parameters:
+///   opc/OpcodeStr - opcode byte and full mnemonic (suffix already appended).
+///   RC, vt, ScalarVT, x86memop - scalar register class, vector type, scalar
+///                  type and memory operand used by the defs and patterns.
+///   Intr/OpNode   - intrinsic matched by the _Int patterns / node matched by
+///                  the plain forms.
+///   d, itins      - execution domain and itinerary/scheduling info.
+///   target        - predicate guarding the plain defs and the patterns.
+///   Suffix        - pasted after NAME to look up the r_Int/m_Int records.
+multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType vt, ValueType ScalarVT,
+ X86MemOperand x86memop,
+ Intrinsic Intr,
+ SDNode OpNode, Domain d, OpndItins itins,
+ Predicate target, string Suffix> {
+ let hasSideEffects = 0 in {
+ def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+ [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>,
+ Requires<[target]>;
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+ [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>,
+ Requires<[target, OptForSize]>;
+
+ let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
+ // Register-register form: no load is folded, so it uses the plain
+ // scheduling class (as 'r' does) rather than Sched.Folded/ReadAfterLd.
+ def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ []>, Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+ }
+
+ let Predicates = [target] in {
+ // These are unary operations, but they are modeled as having 2 source operands
+ // because the high elements of the destination are unchanged in SSE.
+ def : Pat<(Intr VR128:$src),
+ (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
+ }
+ // We don't want to fold scalar loads into these instructions unless
+ // optimizing for size. This is because the folded instruction will have a
+ // partial register update, while the unfolded sequence will not, e.g.
+ // movss mem, %xmm0
+ // rcpss %xmm0, %xmm0
+ // which has a clobber before the rcp, vs.
+ // rcpss mem, %xmm0
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
+ (!cast<Instruction>(NAME#Suffix##m_Int)
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
+ }
+}
+
+/// avx_fp_unop_s - AVX (three-operand) counterpart of the scalar unops.
+///
+/// Records produced:
+///   r, m         - scalar forms on RC taking an extra $src1 register. They
+///                  carry no pattern; the Pat<> records below select them and
+///                  pass IMPLICIT_DEF for $src1.
+///   r_Int, m_Int - codegen-only forms on VR128, selected from Intr by the
+///                  Pat<> records below.
+/// Unlike the SSE multiclass, no operand is tied to $dst here.
+/// Suffix is pasted after "V"#NAME to look up the records in the patterns.
+multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType vt, ValueType ScalarVT,
+ X86MemOperand x86memop,
+ Intrinsic Intr, SDNode OpNode, Domain d,
+ OpndItins itins, string Suffix> {
+ let hasSideEffects = 0 in {
+ def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [], itins.rr, d>, Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ let isCodeGenOnly = 1 in {
+ // Register-register form: no load is folded, so it uses the plain
+ // scheduling class (as 'r' does) rather than Sched.Folded.
+ def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+ }
+
+ // We don't want to fold scalar loads into these instructions unless
+ // optimizing for size. This is because the folded instruction will have a
+ // partial register update, while the unfolded sequence will not, e.g.
+ // vmovss mem, %xmm0
+ // vrcpss %xmm0, %xmm0, %xmm0
+ // which has a clobber before the rcp, vs.
+ // vrcpss mem, %xmm0, %xmm0
+ // TODO: In theory, we could fold the load, and avoid the stall caused by
+ // the partial register store, either in ExeDepFix or with smarter RA.
+ let Predicates = [UseAVX] in {
+ def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
+ (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
+ }
+ let Predicates = [HasAVX] in {
+ def : Pat<(Intr VR128:$src),
+ (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
+ VR128:$src)>;
+ }
+ let Predicates = [HasAVX, OptForSize] in {
+ def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
+ (!cast<Instruction>("V"#NAME#Suffix##m_Int)
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
+ }
+ let Predicates = [UseAVX, OptForSize] in {
+ def : Pat<(ScalarVT (OpNode (load addr:$src))),
+ (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
+ addr:$src)>;
+ }
+}
+
+/// sse1_fp_unop_p - SSE1 unops in packed form: VEX 128/256-bit defs guarded by `prds`, then legacy SSE defs.
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, list<Predicate> prds> {
+let Predicates = prds in { // VEX forms; the mnemonic is "v" + OpcodeStr + "ps"
+ def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
+ itins.rr>, VEX, Sched<[itins.Sched]>;
+ def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+ // Legacy SSE forms: defined outside the `prds` scope; the memory form matches memopv4f32.
+ def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+/// sse2_fp_unop_p - SSE2 unops in vector forms: VEX 128/256-bit defs under [HasAVX], then legacy SSE defs.
+multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, OpndItins itins> {
+let Predicates = [HasAVX] in { // VEX forms; the mnemonic is "v" + OpcodeStr + "pd"
+ def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
+ itins.rr>, VEX, Sched<[itins.Sched]>;
+ def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+ // Legacy SSE forms: defined outside the [HasAVX] scope; the memory form matches memopv2f64.
+ def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> { // single-precision scalar unop: SSE (SS) and AVX (V<NAME>SS) instantiations
+ defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, // intrinsic looked up by name from OpcodeStr
+ SSEPackedSingle, itins, UseSSE1, "SS">, XS;
+ defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
+ f32mem,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
+ SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
+}
+
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> { // double-precision scalar unop: SSE2 (SD) and AVX (V<NAME>SD) instantiations
+ defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
+ !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), // intrinsic looked up by name from OpcodeStr
+ OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
+ defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
+ f64mem,
+ !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
+ OpNode, SSEPackedDouble, itins, "SD">,
+ XD, VEX_4V, VEX_LIG;
+}
+
+// Square root. Scalar and packed forms for both f32 (sse1_*) and f64 (sse2_*).
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
+ sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision. Only the f32 (sse1_*) multiclasses are instantiated.
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
+
+// There is no f64 version of the reciprocal approximation instructions.
+
+// TODO: We should add *scalar* op patterns for these just like we have for
+// the binops above. If the binop and unop patterns could all be unified
+// that would be even better.
+
+multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
+ SDNode Move, ValueType VT,
+ Predicate BasePredicate> { // folds (insert-low $dst, (Intr $src)) into <OpcPrefix>r_Int $dst, $src
+ let Predicates = [BasePredicate] in {
+ def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
+ (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+
+ // With SSE 4.1, blendi is preferred to movs*, so match that too.
+ let Predicates = [UseSSE41] in {
+ def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
+ (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [HasAVX] in {
+ def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
+ (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ // Same fold when the insert is expressed as a blend with immediate 1.
+ def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+}
+
+defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
+ v4f32, UseSSE1>; // f32 intrinsics pair with X86Movss / UseSSE1
+defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
+ v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
+ v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
+ v2f64, UseSSE2>; // the only f64 instantiation: X86Movsd / UseSSE2
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Non-temporal stores
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let SchedRW = [WriteStore] in {
+let Predicates = [HasAVX, NoVLX] in { // VEX-encoded stores, only with AVX and without VLX
+def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
+def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2f64 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
+ // Integer-domain 128-bit store; matches v2i64 only (other integer types are handled by Pat<> below).
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
+ // 256-bit (VEX_L) counterparts on VR256.
+def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v8f32 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_L;
+def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f64 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_L;
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4i64 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_L;
+}
+ // Legacy SSE encodings of the same three vector stores.
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
+
+let ExeDomain = SSEPackedInt in
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
+
+// There is no AVX form for instructions below this point (general-purpose register stores, SSE2 only)
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movnti{l}\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i32 GR32:$src), addr:$dst)],
+ IIC_SSE_MOVNT>,
+ PS, Requires<[HasSSE2]>;
+def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movnti{q}\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i64 GR64:$src), addr:$dst)],
+ IIC_SSE_MOVNT>,
+ PS, Requires<[HasSSE2]>;
+} // SchedRW = [WriteStore]
+ // The remaining integer vector types reuse the VMOVNTDQ stores defined above.
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+
+ def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
+}
+ // Same 128-bit integer types for the legacy SSE2 MOVNTDQ store.
+let Predicates = [UseSSE2] in {
+ def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+}
+
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Prefetch and memory fence
+//===----------------------------------------------------------------------===//
+
+// Prefetch intrinsic. The third prefetch operand picks the instruction (3: t0, 2: t1, 1: t2, 0: nta); the fourth is always 1.
+let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
+def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+}
+
+// FIXME: How should flush instruction be modeled? (It currently borrows WriteLoad and the prefetch itinerary.)
+let SchedRW = [WriteLoad] in {
+// Flush cache: selected for int_x86_sse2_clflush, requires SSE2.
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
+ IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
+}
+
+let SchedRW = [WriteNop] in { // scheduled as a no-op
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins),
+ "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
+ OBXS, Requires<[HasSSE2]>; // selected only for int_x86_sse2_pause, and only with HasSSE2
+}
+
+let SchedRW = [WriteFence] in {
+// Load, store, and memory fence
+// TODO: As with mfence, we may want to ease the availability of sfence/lfence
+// to include any 64-bit target.
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
+ "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
+ PS, Requires<[HasSSE1]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+ "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
+ TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+ "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
+ TB, Requires<[HasMFence]>; // guarded by HasMFence rather than an SSE level
+} // SchedRW
+
+def : Pat<(X86MFence), (MFENCE)>; // the X86MFence node also selects MFENCE
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Load/Store XCSR register
+//===----------------------------------------------------------------------===//
+
+def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), // VEX form: load MXCSR from a 32-bit memory operand
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
+def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), // VEX form: store MXCSR to a 32-bit memory operand
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
+ // Legacy encodings of the same two intrinsics, guarded by UseSSE1.
+let Predicates = [UseSSE1] in {
+def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
+def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+let hasSideEffects = 0, SchedRW = [WriteMove] in {
+def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+ VEX;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+ VEX, VEX_L;
+def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+ VEX;
+def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+ VEX, VEX_L;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteMove] in {
+def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>,
+ VEX;
+def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>,
+ VEX;
+def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+ hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+ VEX;
+def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+ VEX, VEX_L;
+let Predicates = [HasAVX] in {
+ def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX;
+ def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX, VEX_L;
+}
+}
+
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+ VEX;
+def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+ VEX, VEX_L;
+let Predicates = [HasAVX] in {
+def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+ XS, VEX;
+def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+ XS, VEX, VEX_L;
+}
+}
+
+let SchedRW = [WriteMove] in {
+let hasSideEffects = 0 in {
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
+} // SchedRW
+
+// Non-VEX 128-bit load forms of movdqa/movdqu (opcode 0x6F, MRMSrcMem).
+// Both are flagged foldable-as-load and rematerializable and are scheduled as
+// WriteLoad. Their selection patterns are commented out, so the pattern lists
+// are effectively empty.
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+ hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
+ IIC_SSE_MOVA_P_RM>;
+def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
+ IIC_SSE_MOVU_P_RM>,
+ XS, Requires<[UseSSE2]>;
+}
+
+// Non-VEX 128-bit store forms of movdqa/movdqu (opcode 0x7F, MRMDestMem),
+// flagged mayStore and scheduled as WriteStore. Their selection patterns are
+// commented out, so the pattern lists are effectively empty.
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
+ IIC_SSE_MOVA_P_MR>;
+def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/],
+ IIC_SSE_MOVU_P_MR>,
+ XS, Requires<[UseSSE2]>;
+}
+
+} // ExeDomain = SSEPackedInt
+
+// Aliases to help the assembler pick two byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+// Each alias maps to the corresponding _REV instruction and only applies when
+// the destination is in the "L" register class and the source is in the "H"
+// register class (VR128L/VR128H, VR256L/VR256H).
+// NOTE(review): the trailing 0 is presumably the InstAlias emit flag (alias is
+// accepted when parsing but not used when printing) -- confirm against the
+// InstAlias class definition.
+def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
+ (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
+ (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
+ (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
+ (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Arithmetic Instructions
+//===---------------------------------------------------------------------===//
+
+// Itinerary bundle for the pmaddwd family: both OpndItins slots are
+// IIC_SSE_PMADD, and the scheduling class is WriteVecIMul.
+let Sched = WriteVecIMul in
+def SSE_PMADD : OpndItins<
+ IIC_SSE_PMADD, IIC_SSE_PMADD
+>;
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
+///
+/// Produces two records:
+///   rr - both sources in registers of class RC (marked isCommutable).
+///   rm - second source loaded through memop_frag from an x86memop operand
+///        and bitconverted.
+/// OpNode is applied to sources typed SrcVT and the result is typed DstVT.
+/// Is2Addr selects the two-operand asm string (default) or the three-operand
+/// asm string. Scheduling comes from itins.Sched (rr) and itins.Sched.Folded
+/// plus ReadAfterLd (rm).
+multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+ Sched<[itins.Sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+// Packed integer add / saturating add / multiply / subtract / min / max /
+// average, all instantiated through PDI_binop_all (defined outside this
+// section). Arguments as passed here: opcode, mnemonic, SDNode, 128-bit value
+// type, 256-bit value type, itinerary bundle, a 0/1 flag, and a predicate.
+// Byte and word element types use NoVLX_Or_NoBWI; dword and qword element
+// types use NoVLX.
+// NOTE(review): the 0/1 flag looks like a commutativity bit (1 for
+// add/mul/min/max/avg, 0 for every subtract) -- confirm against the
+// PDI_binop_all definition.
+defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 1, NoVLX>;
+defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
+ SSE_INTALUQ_ITINS_P, 1, NoVLX>;
+defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 0, NoVLX>;
+defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
+ SSE_INTALUQ_ITINS_P, 0, NoVLX>;
+defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+
+// Operators whose result element type differs from the source element type,
+// instantiated through PDI_binop_rm2. Each mnemonic gets three variants:
+//   V<name>  - 128-bit, three-operand asm (Is2Addr = 0), VEX_4V, HasAVX.
+//   V<name>Y - 256-bit, three-operand asm, VEX_4V + VEX_L, HasAVX2.
+//   <name>   - 128-bit, two-operand asm with $src1 tied to $dst, memop load.
+
+// pmaddwd: v8i16 sources -> v4i32 result (v16i16 -> v8i32 for the Y form).
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+ loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V;
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
+ VR256, loadv4i64, i256mem, SSE_PMADD,
+ 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PMADD>;
+
+// psadbw: v16i8 sources -> v2i64 result (v32i8 -> v4i64 for the Y form).
+// NOTE(review): the VEX forms use SSE_INTMUL_ITINS_P while the SSE form uses
+// SSE_INTALU_ITINS_P -- confirm whether the difference is intentional.
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
+ VEX_4V;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
+ loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
+ VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
+ memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
+
+// pmuludq: v4i32 sources -> v2i64 result (v8i32 -> v4i64 for the Y form).
+let Predicates = [HasAVX, NoVLX] in
+defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
+ VEX_4V;
+let Predicates = [HasAVX2, NoVLX] in
+defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
+ VR256, loadv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
+ memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+/// PDI_binop_rmi - Binary operator with register, memory and immediate forms
+/// for the second operand.
+///
+///   rr - second operand in a VR128 register; uses opc and OpNode.
+///   rm - second operand loaded from i128mem through ld_frag; uses opc and
+///        OpNode.
+///   ri - second operand is a u8imm; uses opc2 with format ImmForm and
+///        OpNode2.
+/// The first operand and result use register class RC, the result is typed
+/// DstVT, and the register/memory second operand is typed SrcVT. Is2Addr
+/// selects the two-operand (default) or three-operand asm string.
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, RegisterClass RC,
+ ValueType DstVT, ValueType SrcVT,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ // src2 is always 128-bit
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
+ SSE_INTSHIFT_ITINS_P.rr>, Sched<[WriteVecShift]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1,
+ (SrcVT (bitconvert (ld_frag addr:$src2))))))],
+ SSE_INTSHIFT_ITINS_P.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
+ (ins RC:$src1, u8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))],
+ SSE_INTSHIFT_ITINS_P.ri>, Sched<[WriteVecShift]>;
+}
+
+/// PDI_binop_rmi_all - Instantiates PDI_binop_rmi three times for one
+/// mnemonic:
+///   V<NAME>  - VR128, three-operand asm, VEX_4V, requires HasAVX and prd.
+///   V<NAME>Y - VR256, three-operand asm, VEX_4V + VEX_L, requires HasAVX2
+///              and prd.
+///   <NAME>   - VR128, two-operand asm with $src1 tied to $dst, memopv2i64.
+/// The Y variant still passes loadv2i64 because PDI_binop_rmi always takes its
+/// second operand as 128 bits.
+multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, ValueType DstVT128,
+ ValueType DstVT256, ValueType SrcVT,
+ Predicate prd> {
+let Predicates = [HasAVX, prd] in
+ defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+ OpNode, OpNode2, VR128, DstVT128, SrcVT,
+ loadv2i64, 0>, VEX_4V;
+let Predicates = [HasAVX2, prd] in
+ defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+ OpNode, OpNode2, VR256, DstVT256, SrcVT,
+ loadv2i64, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
+ VR128, DstVT128, SrcVT, memopv2i64>;
+}
+
+/// PDI_binop_ri - Immediate-only binary operator: a single "ri" record taking
+/// a register of class RC and a u8imm, with the result typed VT. Is2Addr
+/// selects the two-operand (default) or three-operand asm string.
+multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
+ SDNode OpNode, RegisterClass RC, ValueType VT,
+ bit Is2Addr = 1> {
+ def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))],
+ IIC_SSE_INTSHDQ_P_RI>, Sched<[WriteVecShift]>;
+}
+
+/// PDI_binop_ri_all - Instantiates PDI_binop_ri three times for one mnemonic:
+///   V<NAME>  - VR128 / v16i8, three-operand asm, VEX_4V, HasAVX.
+///   V<NAME>Y - VR256 / v32i8, three-operand asm, VEX_4V + VEX_L, HasAVX2.
+///   <NAME>   - VR128 / v16i8, two-operand asm with $src1 tied to $dst.
+/// Both VEX variants additionally require NoVLX_Or_NoBWI.
+multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
+ SDNode OpNode> {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+ defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+ VR128, v16i8, 0>, VEX_4V;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+ defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+ VR256, v32i8, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
+}
+
+// Packed shift instantiations. The word/dword/qword shifts go through
+// PDI_binop_rmi_all: the first opcode is the register/memory-count form, the
+// second opcode plus the MRMnr format is the immediate-count form. The
+// whole-register byte shifts (pslldq/psrldq) are immediate-only and go through
+// PDI_binop_ri_all.
+let ExeDomain = SSEPackedInt in {
+ // Shift left.
+ defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+ v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+ v4i32, v8i32, v4i32, NoVLX>;
+ defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+ v2i64, v4i64, v2i64, NoVLX>;
+
+ // Shift right (X86vsrl / X86vsrli).
+ defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+ v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+ v4i32, v8i32, v4i32, NoVLX>;
+ defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+ v2i64, v4i64, v2i64, NoVLX>;
+
+ // Shift right (X86vsra / X86vsrai); no qword variant is defined here.
+ defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+ v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+ v4i32, v8i32, v4i32, NoVLX>;
+
+ // Byte shifts of the full register, immediate count only.
+ defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq>;
+ defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq>;
+ // PSRADQri doesn't exist in SSE[1-3].
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Comparison Instructions
+//===---------------------------------------------------------------------===//
+
+// Packed integer compares for byte, word and dword elements, instantiated
+// through PDI_binop_all (defined outside this section) with TruePredicate as
+// the extra predicate. The equality compares pass 1 and the greater-than
+// compares pass 0 for the flag argument.
+// NOTE(review): that flag is presumably the commutativity bit -- confirm
+// against the PDI_binop_all definition.
+defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, TruePredicate>;
+defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, TruePredicate>;
+defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 1, TruePredicate>;
+defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0, TruePredicate>;
+defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0, TruePredicate>;
+defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 0, TruePredicate>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Shuffle Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+/// sse2_pshuffle - Immediate-controlled shuffle (opcode 0x70) taking one
+/// vector source and a u8imm. Produces six records per mnemonic:
+///   V<NAME>ri / V<NAME>mi   - 128-bit VEX forms, require HasAVX and prd.
+///   V<NAME>Yri / V<NAME>Ymi - 256-bit VEX + VEX_L forms, require HasAVX2 and
+///                             prd.
+///   <NAME>ri / <NAME>mi     - non-VEX 128-bit forms, require UseSSE2.
+/// The "ri" forms read the source from a register; the "mi" forms load it
+/// from memory (loadv2i64 / loadv4i64 for VEX, memopv2i64 for SSE). vt128 and
+/// vt256 are the result types of the 128-bit and 256-bit patterns.
+multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
+ SDNode OpNode, Predicate prd> {
+let Predicates = [HasAVX, prd] in {
+ def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
+ def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
+ Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX2, prd] in {
+ def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
+ def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
+ Sched<[WriteShuffleLd]>;
+}
+
+// NOTE(review): the SSE "mi" form lists ReadAfterLd while the VEX "mi" forms
+// above do not -- confirm whether the difference is intentional.
+let Predicates = [UseSSE2] in {
+ def ri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
+ def mi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+}
+} // ExeDomain = SSEPackedInt
+
+// The three shuffles share opcode 0x70 (fixed inside sse2_pshuffle) and are
+// distinguished by the prefix class appended here: PD, XS and XD.
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, NoVLX>, PD;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
+ NoVLX_Or_NoBWI>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
+ NoVLX_Or_NoBWI>, XD;
+
+// Select X86PShufd on v4f32 operands to the integer PSHUFD instructions:
+// the VEX forms when HasAVX holds, the SSE forms under UseSSE2. Each group
+// covers a folded-load source and a register source.
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
+ (VPSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (VPSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+ (PSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (PSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Packed Integer Pack Instructions (SSE & AVX)
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+/// sse2_pack - 128-bit pack operator built on the PDI class. Sources are typed
+/// ArgVT and the result is typed OutVT.
+///   rr - both sources in VR128.
+///   rm - second source loaded from i128mem through ld_frag and bitconverted.
+/// Is2Addr selects the two-operand (default) or three-operand asm string.
+multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+ def rr : PDI<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : PDI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1),
+ (bitconvert (ld_frag addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+/// sse2_pack_y - 256-bit counterpart of the PDI-based pack operator. Always
+/// uses the three-operand asm string; the memory form loads through
+/// loadv4i64 from i256mem. Records are named Yrr and Yrm.
+multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode> {
+ def Yrr : PDI<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : PDI<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1),
+ (bitconvert (loadv4i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+/// sse4_pack - Same shape as the PDI-based 128-bit pack multiclass, but the
+/// records derive from SS48I instead of PDI.
+///   rr - both sources in VR128.
+///   rm - second source loaded from i128mem through ld_frag and bitconverted.
+/// Is2Addr selects the two-operand (default) or three-operand asm string.
+multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+ def rr : SS48I<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS48I<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1),
+ (bitconvert (ld_frag addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+/// sse4_pack_y - 256-bit SS48I-based pack operator. Always uses the
+/// three-operand asm string; the memory form loads through loadv4i64 from
+/// i256mem. Records are named Yrr and Yrm.
+multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode> {
+ def Yrr : SS48I<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : SS48I<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1),
+ (bitconvert (loadv4i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// 128-bit VEX pack instructions (three-operand asm, VEX_4V). X86Packss is
+// used for the "ss" mnemonics and X86Packus for the "us" mnemonics; only
+// vpackusdw goes through the SS48I-based sse4_pack multiclass.
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
+ loadv2i64, 0>, VEX_4V;
+ defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
+ loadv2i64, 0>, VEX_4V;
+
+ defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
+ loadv2i64, 0>, VEX_4V;
+ defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
+ loadv2i64, 0>, VEX_4V;
+}
+
+// 256-bit VEX pack instructions (VEX_4V + VEX_L). They reuse the 128-bit
+// defm names; the *_y multiclasses name their records Yrr/Yrm, so the
+// resulting record names do not collide with the 128-bit rr/rm records.
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
+ VEX_4V, VEX_L;
+ defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
+ VEX_4V, VEX_L;
+
+ defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
+ VEX_4V, VEX_L;
+ defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
+ VEX_4V, VEX_L;
+}
+
+// Non-VEX pack instructions: two-operand asm (Is2Addr left at its default)
+// with $src1 tied to $dst, memory forms loading through memopv2i64.
+// packusdw is the only one built on the SS48I-based sse4_pack multiclass.
+let Constraints = "$src1 = $dst" in {
+ defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
+ memopv2i64>;
+ defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
+ memopv2i64>;
+
+ defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
+ memopv2i64>;
+
+ defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
+ memopv2i64>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Unpack Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+/// sse2_unpack - 128-bit unpack operator with source and result typed vt.
+///   rr - both sources in VR128.
+///   rm - second source loaded from i128mem through ld_frag and bitconverted.
+/// Is2Addr selects the two-operand (default) or three-operand asm string.
+/// Both records use the IIC_SSE_UNPCK itinerary.
+multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, PatFrag ld_frag, bit Is2Addr = 1> {
+ def rr : PDI<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
+ IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
+ def rm : PDI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1,
+ (bitconvert (ld_frag addr:$src2)))))],
+ IIC_SSE_UNPCK>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+/// sse2_unpack_y - 256-bit unpack operator. Always uses the three-operand asm
+/// string; the memory form loads through loadv4i64 from i256mem. Records are
+/// named Yrr and Yrm, and no itinerary is passed.
+multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode> {
+ def Yrr : PDI<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : PDI<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (vt (OpNode VR256:$src1,
+ (bitconvert (loadv4i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+
+// 128-bit VEX unpack instructions (three-operand asm, VEX_4V). The byte and
+// word element variants are gated on NoVLX_Or_NoBWI; the dword and qword
+// element variants are gated on NoVLX.
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
+ loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
+ loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
+ loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
+ loadv2i64, 0>, VEX_4V;
+}
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
+ loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
+ loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
+ loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
+ loadv2i64, 0>, VEX_4V;
+}
+
+// 256-bit VEX unpack instructions (VEX_4V + VEX_L). They reuse the 128-bit
+// defm names; sse2_unpack_y names its records Yrr/Yrm. Byte and word element
+// variants are gated on NoVLX_Or_NoBWI, dword and qword variants on NoVLX.
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
+ VEX_4V, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
+ VEX_4V, VEX_L;
+}
+
+// Non-VEX unpack instructions: two-operand asm (Is2Addr left at its default)
+// with $src1 tied to $dst, memory forms loading through memopv2i64.
+// X86Unpckl is used for the "l" mnemonics and X86Unpckh for the "h" ones.
+let Constraints = "$src1 = $dst" in {
+ defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
+ memopv2i64>;
+ defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
+ memopv2i64>;
+ defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
+ memopv2i64>;
+ defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
+ memopv2i64>;
+
+ defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
+ memopv2i64>;
+ defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
+ memopv2i64>;
+ defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
+ memopv2i64>;
+ defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
+ memopv2i64>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Extract and Insert
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+/// sse2_pinsrw - Word insert (opcode 0xC4) into a VR128 register at the
+/// position given by a u8imm.
+///   rri - the inserted value comes from a GR32orGR64 register.
+///   rmi - the inserted value is an extloadi16 from an i16mem operand.
+/// Unlike the other multiclasses in this file section, the mnemonic is
+/// hard-coded: Is2Addr = 1 yields "pinsrw" with two-operand asm, Is2Addr = 0
+/// yields "vpinsrw" with three-operand asm.
+multiclass sse2_pinsrw<bit Is2Addr = 1> {
+ def rri : Ii8<0xC4, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ GR32orGR64:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
+ IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
+ def rmi : Ii8<0xC4, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ i16mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+ imm:$src3))], IIC_SSE_PINSRW>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Extract
+// VEX word extract (opcode 0xC5): selects a word of a v8i16 VR128 source via
+// the u8imm and writes it to a GR32orGR64 register. Register-only form,
+// scheduled as WriteShuffle; requires HasAVX and NoBWI.
+let Predicates = [HasAVX, NoBWI] in
+def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
+ "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))]>, PD, VEX,
+ Sched<[WriteShuffle]>;
+// Non-VEX word extract (opcode 0xC5): selects a word of a v8i16 VR128 source
+// via the u8imm and writes it to a GR32orGR64 register.
+// This is a register-to-register form (MRMSrcReg, no memory operand), so it is
+// scheduled as a plain WriteShuffle, matching VPEXTRWri, rather than with the
+// load-folded WriteShuffleLd/ReadAfterLd pair.
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))], IIC_SSE_PEXTRW>,
+ Sched<[WriteShuffle]>;
+
+// Insert
+// VEX form: Is2Addr = 0 selects the "vpinsrw" three-operand asm string.
+let Predicates = [HasAVX, NoBWI] in
+defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
+
+// SSE form: default Is2Addr = 1 selects "pinsrw"; $src1 is tied to $dst.
+let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
+defm PINSRW : sse2_pinsrw, PD;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Mask Creation
+//===---------------------------------------------------------------------===//
+
+// pmovmskb (opcode 0xD7): selects X86movmsk of a byte vector into a
+// GR32orGR64 register. Three variants: 128-bit VEX, 256-bit VEX (VEX_L,
+// HasAVX2) and 128-bit SSE. All are scheduled as WriteVecLogic.
+// NOTE(review): the VPDI-based defs spell the mnemonic "pmovmskb" without the
+// "v"; presumably VPDI prepends it -- confirm against the VPDI class
+// definition.
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+
+def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
+ IIC_SSE_MOVMSK>, VEX;
+
+let Predicates = [HasAVX2] in {
+def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR256:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
+ VEX, VEX_L;
+}
+
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
+ IIC_SSE_MOVMSK>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Conditional Store
+//===---------------------------------------------------------------------===//
+
+// maskmovdqu (opcode 0xF7): selected from int_x86_sse2_maskmov_dqu with a
+// data register, a mask register and an implicit address register. There are
+// four variants, split along two axes:
+//   - VEX (HasAVX) vs. non-VEX (UseSSE2);
+//   - Not64BitMode, which implicitly uses EDI, vs. In64BitMode, which
+//     implicitly uses RDI.
+// All are scheduled as WriteStore and have no explicit outputs.
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+
+let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
+def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
+ (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+ IIC_SSE_MASKMOV>, VEX;
+let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
+def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
+ (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+ IIC_SSE_MASKMOV>, VEX;
+
+let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+ IIC_SSE_MASKMOV>;
+let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+ IIC_SSE_MASKMOV>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Doubleword/Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Packed Double Int
+//
+// Moves from a general-purpose register or memory into a vector/FP register
+// (opcode 0x6E), in VEX and non-VEX flavours:
+//   xxDI2PDIrr / rm - GR32 or an i32 load, selected from scalar_to_vector
+//                     producing v4i32.
+//   xx64toPQIrr     - GR64, selected from scalar_to_vector producing v2i64.
+//   xx64toPQIrm     - i64mem load with no pattern; isCodeGenOnly +
+//                     ForceDisassemble.
+//   xx64toSDrr      - GR64 bitconverted into FR64; isCodeGenOnly.
+// The non-VEX 64-bit forms use the "mov{d|q}" asm string, i.e. a different
+// mnemonic per assembler syntax variant.
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteMove]>;
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteLoad]>;
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
+let isCodeGenOnly = 1 in
+def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ Sched<[WriteMove]>;
+def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+let isCodeGenOnly = 1 in
+def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Single Scalar
+//
+// movd (opcode 0x6E) used to bitconvert a 32-bit integer into an FR32
+// register, from GR32 (rr, WriteMove) or from an i32 load (rm, WriteLoad), in
+// VEX and non-VEX flavours. All four records are isCodeGenOnly.
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+ def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+ def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteLoad]>;
+ def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+ def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int to Doubleword Int
+//
+// Extract element 0 of a v4i32 (opcode 0x7E). The rr forms write the element
+// to a GR32; the mr forms store it to an i32 memory operand. The VEX and
+// non-VEX variants use identical patterns.
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
+ Sched<[WriteMove]>;
+def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteStore]>;
+def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+ Sched<[WriteMove]>;
+def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
+
+// Inserting a GPR into element 0 of an all-zeros or undef 256-bit integer
+// vector: emit the 128-bit VEX GPR->XMM move and wrap it in SUBREG_TO_REG so
+// the result is placed in sub_xmm of the wider register.
+def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+//===---------------------------------------------------------------------===//
+// Move Packed Quadword Int first element to Quadword Int
+//
+// Extract element 0 of a v2i64 into a GR64 (opcode 0x7E, MRMDestReg). The two
+// register forms get their scheduling class from the enclosing SchedRW.
+let ExeDomain = SSEPackedInt in {
+let SchedRW = [WriteMove] in {
+def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>,
+ VEX;
+
+def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>;
+} //SchedRW
+
+// Store forms carry no pattern, so mayStore and hasSideEffects are set by hand;
+// both are marked isCodeGenOnly with ForceDisassemble.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// Bitcast FR64 <-> GR64
+//
+// Codegen-only bitcasts between FR64 and 64-bit integers (opcode 0x7E):
+//  *64toSDrm - load an i64 and reinterpret it as FR64
+//  *SDto64rr - reinterpret FR64 into a GR64
+//  *SDto64mr - store the FR64 bits to an i64 memory operand
+// The braceless "let Predicates = [UseAVX] in" below binds only to the single
+// def that follows it (VMOV64toSDrm).
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+ let Predicates = [UseAVX] in
+ def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ VEX, Sched<[WriteLoad]>;
+ def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+
+ def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+//===---------------------------------------------------------------------===//
+// Move Scalar Single to Doubleword Int
+//
+// Codegen-only FR32 -> i32 bitcasts (opcode 0x7E). The rr forms write the bits
+// to a GR32; the mr forms store them to an i32 memory operand. VEX and non-VEX
+// variants share the same patterns.
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+ def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+ def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+ def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// AVX selection patterns mapping zero-extending scalar moves (X86vzmovl /
+// X86vzload) onto the VEX movd/movq forms. GPR-source patterns use
+// AddedComplexity 15; the patterns that fold a load use 20. 256-bit results
+// are formed by wrapping the 128-bit instruction in SUBREG_TO_REG.
+let Predicates = [UseAVX] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIrr GR32:$src)>;
+
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIrr GR64:$src)>;
+
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
+ }
+ // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
+ // These instructions also write zeros in the high part of a 256-bit register.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+ // NOTE(review): this pattern passes (i64 0) to SUBREG_TO_REG while the
+ // other v8i32 patterns in this block pass (i32 0) - presumably harmless,
+ // but worth making consistent.
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+ }
+ // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
+}
+
+// SSE2 counterparts of the AVX zero-extending move patterns, restricted to
+// 128-bit results: GPR sources (AddedComplexity 15) select the rr moves, and
+// load sources (AddedComplexity 20) select MOVDI2PDIrm.
+let Predicates = [UseSSE2] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (MOVDI2PDIrr GR32:$src)>;
+
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (MOV64toPQIrr GR64:$src)>;
+ }
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (MOVDI2PDIrm addr:$src)>;
+ }
+}
+
+// These are the correct encodings of the instructions so that we know how to
+// read correct assembly, even though we continue to emit the wrong ones for
+// compatibility with Darwin's buggy assembler.
+// The aliases accept "movq" for the GR64 <-> VR128 moves whose own asm string
+// is "mov{d|q}".
+// NOTE(review): the trailing 0 presumably marks each alias as parse-only (not
+// used when printing) - confirm against the InstAlias class definition.
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Quadword Int to Packed Quadword Int
+//
+
+// movq load forms (opcode 0x7E with the XS prefix, MRMSrcMem): an i64 load
+// wrapped in scalar_to_vector yields a v2i64 in VR128. Both defs derive from
+// the plain I class and attach their prefix and predicate explicitly
+// (XS + Requires<...>); the VEX def therefore spells "vmovq" itself.
+let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
+def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ VEX, Requires<[UseAVX]>;
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
+ IIC_SSE_MOVDQ>, XS,
+ Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+} // ExeDomain, SchedRW
+
+//===---------------------------------------------------------------------===//
+// Move Packed Quadword Int to Quadword Int
+//
+// movq store forms (opcode 0xD6, MRMDestMem): store element 0 of a v2i64 to an
+// i64 memory operand. Scheduling comes from the enclosing SchedRW.
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX;
+def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>;
+} // ExeDomain, SchedRW
+
+// For disassembler only
+// Register-to-register encodings of opcode 0xD6 (MRMDestReg). They have no
+// selection pattern, so hasSideEffects is cleared explicitly.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteVecLogic] in {
+def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
+def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
+}
+
+// Aliases to help the assembler pick two byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+// The alias maps "vmovq" onto the MRMDestReg form (VMOVPQI2QIrr) and constrains
+// the operands to the VR128L / VR128H register classes.
+def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
+ (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
+
+// AVX: zero-extending 64-bit loads (X86vzmovl of a load, or X86vzload) select
+// VMOVQI2PQIrm. The v4i64 results wrap the 128-bit instruction in
+// SUBREG_TO_REG to place it in sub_xmm of the 256-bit register.
+let Predicates = [UseAVX], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VMOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (VMOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVQI2PQIrm addr:$src)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
+}
+
+// SSE2: the same zero-extending 64-bit load shapes select MOVQI2PQIrm
+// (128-bit results only).
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
+// IA32 document. movq xmm1, xmm2 does clear the high bits.
+//
+// Register-to-register movq (opcode 0x7E, XS prefix, MRMSrcReg) selected for
+// X86vzmovl of a v2i64 register. Each def carries AddedComplexity 15 through
+// its own braceless "let".
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+ IIC_SSE_MOVQ_RR>,
+ XS, VEX, Requires<[UseAVX]>;
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+ IIC_SSE_MOVQ_RR>,
+ XS, Requires<[UseSSE2]>;
+} // ExeDomain, SchedRW
+
+// Reuse the integer-domain movq register move for X86vzmovl on v2f64 values,
+// with one pattern per predicate set (AVX and SSE2).
+let AddedComplexity = 20 in {
+ let Predicates = [UseAVX] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (VMOVZPQILo2PQIrr VR128:$src)>;
+ }
+ let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>;
+ }
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+// sse3_replicate_sfp - register (rr) and memory (rm) forms of a unary
+// single-precision replicate instruction.
+//  op        - opcode byte
+//  OpNode    - SelectionDAG node matched by both patterns
+//  OpcodeStr - mnemonic; the operand string is appended via !strconcat
+//  vt        - result value type (applied as a cast in the rr pattern only)
+//  RC        - register class of source and destination
+//  mem_frag  - load fragment used by the rm pattern
+//  x86memop  - memory operand type of the rm form
+multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+ ValueType vt, RegisterClass RC, PatFrag mem_frag,
+ X86MemOperand x86memop> {
+def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (vt (OpNode RC:$src)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+}
+
+// Instantiate MOVSHDUP (opcode 0x16) and MOVSLDUP (opcode 0x12). The VEX forms
+// exist in 128-bit and 256-bit (Y suffix, VEX_L) variants and use the load*
+// fragments; the non-VEX forms use the memop* fragments instead.
+let Predicates = [HasAVX, NoVLX] in {
+ defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v4f32, VR128, loadv4f32, f128mem>, VEX;
+ defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v4f32, VR128, loadv4f32, f128mem>, VEX;
+ defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+ defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+}
+defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
+ memopv4f32, f128mem>;
+defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
+ memopv4f32, f128mem>;
+
+// AVX: select the VMOVSHDUP/VMOVSLDUP instructions for the same nodes on
+// integer vector types (v4i32 / v8i32). The memory patterns match loads that
+// are typed v2i64 / v4i64 and bitcast to the 32-bit element type.
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (VMOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (VMOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVSLDUPrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movshdup VR256:$src)),
+ (VMOVSHDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
+ (VMOVSHDUPYrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movsldup VR256:$src)),
+ (VMOVSLDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
+ (VMOVSLDUPYrm addr:$src)>;
+}
+
+// SSE3: the v4i32 versions of the same patterns select the non-VEX
+// instructions; the memory patterns use the memopv2i64 fragment.
+let Predicates = [UseSSE3] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (MOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (MOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSLDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Double FP - MOVDDUP
+//===---------------------------------------------------------------------===//
+
+// sse3_replicate_dfp - 128-bit MOVDDUP forms (opcode 0x12). The rr form matches
+// X86Movddup on a VR128 register; the rm form takes an f64mem operand and
+// matches X86Movddup of a scalar f64 load wrapped in scalar_to_vector.
+//  OpcodeStr - mnemonic; the operand string is appended via !strconcat
+multiclass sse3_replicate_dfp<string OpcodeStr> {
+def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (v2f64 (X86Movddup
+ (scalar_to_vector (loadf64 addr:$src)))))],
+ IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+}
+
+// FIXME: Merge with the above class once there are patterns for the ymm version
+// sse3_replicate_dfp_y - 256-bit MOVDDUP forms (opcode 0x12). Unlike the
+// 128-bit multiclass, the rm form takes a full f256mem operand and matches a
+// v4f64 vector load; no itinerary class is passed to either def.
+//  OpcodeStr - mnemonic; the operand string is appended via !strconcat
+multiclass sse3_replicate_dfp_y<string OpcodeStr> {
+def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
+ Sched<[WriteFShuffle]>;
+def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
+ Sched<[WriteLoad]>;
+}
+
+// Instantiate MOVDDUP: VEX 128-bit and 256-bit (VEX_L) forms under
+// [HasAVX, NoVLX], plus the non-VEX 128-bit form.
+let Predicates = [HasAVX, NoVLX] in {
+ defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
+}
+
+defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+
+
+// AVX: extra X86Movddup patterns - a full v2f64 vector load selects the rm
+// form, and the 256-bit instructions are also used for v4i64 operands.
+// NOTE(review): Requires<[HasAVX]> repeats a predicate the enclosing
+// "let Predicates" already supplies; it looks redundant - confirm how the two
+// interact before removing it.
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+ // 256-bit version
+ def : Pat<(X86Movddup (loadv4i64 addr:$src)),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (v4i64 VR256:$src)),
+ (VMOVDDUPYrr VR256:$src)>;
+}
+
+// AVX: X86Movddup of a load that reaches v2f64 through a bitcast (from v4f32,
+// v2i64, or a scalar i64 load wrapped in scalar_to_vector) also selects
+// VMOVDDUPrm.
+// NOTE(review): each Requires<[HasAVX]> duplicates the enclosing
+// "let Predicates = [HasAVX]" and appears redundant - confirm.
+let Predicates = [HasAVX] in {
+ def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+}
+
+// A 128-bit broadcast of a 64-bit scalar load is selected as VMOVDDUPrm: for
+// v2f64 under [HasAVX, NoVLX], and for v2i64 only under [HasAVX1Only].
+let Predicates = [HasAVX, NoVLX] in
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
+let Predicates = [HasAVX1Only] in
+def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
+
+// SSE3: X86Movddup of a memop vector load (directly v2f64, or bitcast from
+// v4f32 / v2i64) or of a scalar i64 load selects the non-VEX MOVDDUPrm.
+let Predicates = [UseSSE3] in {
+ def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (MOVDDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+
+// LDDQU (opcode 0xF0, MRMSrcMem): load forms selected only through the
+// corresponding intrinsics. The VEX 128-bit and 256-bit defs are guarded by
+// HasAVX; only the non-VEX def passes an itinerary class (IIC_SSE_LDDQU).
+let SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX] in {
+ def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+ def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+ VEX, VEX_L;
+}
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "lddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
+ IIC_SSE_LDDQU>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Arithmetic
+//===---------------------------------------------------------------------===//
+
+// sse3_addsub - register (rr) and memory (rm) forms of an ADDSUB instruction
+// (opcode 0xD0), selected through an intrinsic.
+//  Int       - intrinsic matched by both patterns
+//  OpcodeStr - mnemonic; the operand string is appended via !strconcat
+//  RC        - register class of the destination and register sources
+//  x86memop  - memory operand type of the rm form
+//  itins     - itinerary pair (rr / rm) plus the scheduling class
+//  ld_frag   - load fragment folded into the rm pattern
+//  Is2Addr   - 1 prints the two-operand asm string, 0 the three-operand one
+multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, OpndItins itins,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ def rr : I<0xD0, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ // The memory form uses the memory itinerary (itins.rm), matching how
+ // SS3I_binop_rm pairs itins.rr / itins.rm with its rr / rm defs.
+ def rm : I<0xD0, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+// VEX ADDSUB instantiations: 128-bit and 256-bit (Y suffix, VEX_L) forms for
+// packed single (XD prefix) and packed double (PD prefix). Is2Addr is 0, so
+// the three-operand asm string is used.
+let Predicates = [HasAVX] in {
+ let ExeDomain = SSEPackedSingle in {
+ defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
+ f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
+ defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
+ f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
+ f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
+ defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
+ f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
+ }
+}
+// Non-VEX ADDSUB instantiations: two-operand forms (Is2Addr left at its
+// default of 1) with $src1 tied to $dst, using the memop* load fragments.
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
+ let ExeDomain = SSEPackedSingle in
+ defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
+ f128mem, SSE_ALU_F32P, memopv4f32>, XD;
+ let ExeDomain = SSEPackedDouble in
+ defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
+ f128mem, SSE_ALU_F64P, memopv2f64>, PD;
+}
+
+// Patterns used to select 'addsub' instructions.
+// The instruction defs themselves match only the intrinsics; these patterns
+// additionally map the X86Addsub node onto them. AVX: 128-bit and 256-bit,
+// register and load-folding forms.
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
+ (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
+ (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
+ def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
+ (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
+ (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
+
+ def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
+ (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
+ def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
+ (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
+ def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
+ (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+ def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
+ (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
+}
+
+// SSE3: the X86Addsub node selects the non-VEX 128-bit instructions; the
+// load-folding patterns use the memop* fragments.
+let Predicates = [UseSSE3] in {
+ def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
+ (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
+ (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
+ def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
+ (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
+ (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+// Horizontal ops
+// S3D_Int - rr / rm forms of a binary horizontal FP op built on the S3DI
+// instruction class.
+//  o         - opcode byte
+//  OpcodeStr - mnemonic; the operand string is appended via !strconcat
+//  vt        - result value type
+//  RC        - register class of the destination and register sources
+//  x86memop  - memory operand type of the rm form
+//  OpNode    - SelectionDAG node matched by both patterns
+//  ld_frag   - load fragment folded into the rm pattern
+//  Is2Addr   - 1 prints the two-operand asm string, 0 the three-operand one
+multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+ def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+ Sched<[WriteFAdd]>;
+
+ def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+// S3_Int - same shape and parameters as S3D_Int, but the defs derive from the
+// S3I instruction class instead of S3DI.
+multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+ def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+ Sched<[WriteFAdd]>;
+
+ def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+// VEX horizontal add (opcode 0x7C) and subtract (opcode 0x7D): 128-bit and
+// 256-bit (Y suffix, VEX_L) forms. The packed-single variants instantiate
+// S3D_Int and the packed-double variants S3_Int; all pass Is2Addr = 0.
+let Predicates = [HasAVX] in {
+ let ExeDomain = SSEPackedSingle in {
+ defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
+ X86fhadd, loadv4f32, 0>, VEX_4V;
+ defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
+ X86fhsub, loadv4f32, 0>, VEX_4V;
+ defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
+ X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
+ defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
+ X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
+ X86fhadd, loadv2f64, 0>, VEX_4V;
+ defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
+ X86fhsub, loadv2f64, 0>, VEX_4V;
+ defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
+ X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
+ defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
+ X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
+ }
+}
+
+// Non-VEX horizontal add / subtract: two-operand forms with $src1 tied to
+// $dst, using the memop* load fragments. No Predicates are set in this block.
+let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in {
+ defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
+ memopv4f32>;
+ defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
+ memopv4f32>;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
+ memopv2f64>;
+ defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
+ memopv2f64>;
+ }
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Absolute Instructions
+//===---------------------------------------------------------------------===//
+
+
+/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+///  opc       - opcode byte
+///  OpcodeStr - mnemonic; the operand string is appended via !strconcat
+///  vt        - result value type
+///  OpNode    - SelectionDAG node matched by both patterns
+///  ld_frag   - load fragment for the rm form; its result is bitconverted
+///              before being passed to OpNode
+multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, PatFrag ld_frag> {
+ def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (vt (OpNode VR128:$src)))],
+ IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;
+
+ def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
+ IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
+}
+
+/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+/// 256-bit variant: defines Yrr / Yrm on VR256, always folds a loadv4i64 in
+/// the memory form, and passes no itinerary class.
+multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode> {
+ def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
+ Sched<[WriteVecALU]>;
+
+ def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
+ Sched<[WriteVecALULd]>;
+}
+
+// Helper fragments to match sext vXi1 to vXiY.
+// Each leaf matches one way of computing a per-element sign mask of $src: for
+// 8-bit elements a signed compare "0 > $src" (X86pcmpgt against all-zeros), and
+// for 16- / 32-bit elements an arithmetic right shift by element width - 1.
+def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
+ VR128:$src))>;
+def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
+def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
+def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
+ VR256:$src))>;
+def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
+def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
+
+// 128-bit VEX packed-absolute-value instructions (opcodes 0x1C / 0x1D / 0x1E).
+// The byte and word forms are guarded by NoVLX_Or_NoBWI, the dword form by
+// NoVLX.
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, X86Abs, loadv2i64>, VEX;
+ defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, X86Abs, loadv2i64>, VEX;
+}
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
+}
+
+// Recognize the expanded absolute-value idiom (x + mask) ^ mask, where mask is
+// one of the sign-mask PatLeafs, and select the 128-bit VPABS* instruction.
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(xor
+ (bc_v2i64 (v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+ (VPABSBrr VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+ (VPABSWrr VR128:$src)>;
+}
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(xor
+ (bc_v2i64 (v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+ (VPABSDrr VR128:$src)>;
+}
+
+// 256-bit VEX packed-absolute-value instructions (VEX_L), requiring AVX2. The
+// multiclass names its defs Yrr / Yrm, so these produce VPABS*Yrr / VPABS*Yrm.
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, X86Abs>, VEX, VEX_L;
+ defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, X86Abs>, VEX, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
+}
+
+// 256-bit versions of the (x + mask) ^ mask absolute-value idiom, selecting
+// the VPABS*Yrr instructions.
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ def : Pat<(xor
+ (bc_v4i64 (v32i1sextv32i8)),
+ (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
+ (VPABSBYrr VR256:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (v16i1sextv16i16)),
+ (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
+ (VPABSWYrr VR256:$src)>;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(xor
+ (bc_v4i64 (v8i1sextv8i32)),
+ (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
+ (VPABSDYrr VR256:$src)>;
+}
+
+// Non-VEX packed-absolute-value instructions; the memory forms use the
+// memopv2i64 fragment. No Predicates are attached at this point.
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, X86Abs, memopv2i64>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, X86Abs, memopv2i64>;
+
+// SSSE3: the (x + mask) ^ mask absolute-value idiom selects the non-VEX
+// PABS*rr instructions.
+let Predicates = [UseSSSE3] in {
+ def : Pat<(xor
+ (bc_v2i64 (v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+ (PABSBrr VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+ (PABSWrr VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+ (PABSDrr VR128:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Binary Operator Instructions
+//===---------------------------------------------------------------------===//
+
+// Itinerary bundles for the SSSE3 binary ops below. Each OpndItins takes two
+// itinerary classes (by the naming here, register form then memory form), and
+// the surrounding "let Sched = ..." sets its scheduling class.
+// SSE_PMULHRSW passes the same itinerary class for both arguments.
+let Sched = WriteVecALU in {
+def SSE_PHADDSUBD : OpndItins<
+ IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
+>;
+def SSE_PHADDSUBSW : OpndItins<
+ IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
+>;
+def SSE_PHADDSUBW : OpndItins<
+ IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
+>;
+}
+let Sched = WriteShuffle in
+def SSE_PSHUFB : OpndItins<
+ IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
+>;
+let Sched = WriteVecALU in
+def SSE_PSIGN : OpndItins<
+ IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
+>;
+let Sched = WriteVecIMul in
+def SSE_PMULHRSW : OpndItins<
+ IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
+>;
+
+/// SS3I_binop_rm - Simple SSSE3 bin op
+///  opc        - opcode byte
+///  OpcodeStr  - mnemonic; the operand string is appended via !strconcat
+///  OpNode     - SelectionDAG node matched by both patterns
+///  DstVT      - result value type
+///  OpVT       - value type applied to the first source operand
+///  RC         - register class of the destination and register sources
+///  memop_frag - load fragment for the rm form (bitconverted before use)
+///  x86memop   - memory operand type of the rm form
+///  itins      - itinerary pair: itins.rr for rr, itins.rm for rm
+///  Is2Addr    - 1 prints the two-operand asm string, 0 the three-operand one
+/// The rr def is marked commutable by default.
+multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType OpVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (DstVT (OpNode (OpVT RC:$src1),
+ (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
+/// Intrinsic-based 128-bit variant: defines rr128 / rm128 on VR128 and matches
+/// IntId128. Only itins.Sched is used here; no itinerary class is passed to
+/// the defs.
+///  ld_frag - load fragment for rm128 (bitconverted before use)
+///  Is2Addr - 1 prints the two-operand asm string, 0 the three-operand one
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, OpndItins itins,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ Sched<[itins.Sched]>;
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (ld_frag addr:$src2))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS3I_binop_rm_int_y - 256-bit counterpart of SS3I_binop_rm_int.
+/// Defines rr256 and rm256 on VR256 / i256mem, matching IntId256. The asm
+/// string is always the three-operand form and the memory form always loads
+/// through loadv4i64. The scheduling class is passed directly as Sched
+/// rather than through an OpndItins bundle.
+multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId256,
+ X86FoldableSchedWrite Sched> {
+ // This brace-less `let` covers the rr256 record only.
+ let isCommutable = 1 in
+ def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+ Sched<[Sched]>;
+ def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>;
+}
+
+// 128-bit VEX forms that are predicated on [HasAVX, NoVLX_Or_NoBWI].
+// All use the three-operand asm string (Is2Addr = 0) and loadv2i64 for the
+// folded load. VPSHUFB and VPMADDUBSW sit under `let isCommutable = 0`;
+// VPMULHRSW is outside that scope.
+let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+ defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
+ VR128, loadv2i64, i128mem,
+ SSE_PSHUFB, 0>, VEX_4V;
+ defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
+ v16i8, VR128, loadv2i64, i128mem,
+ SSE_PMADD, 0>, VEX_4V;
+}
+defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
+ VR128, loadv2i64, i128mem,
+ SSE_PMULHRSW, 0>, VEX_4V;
+}
+
+// 128-bit VEX forms predicated on [HasAVX] only.
+// The horizontal add/sub records are SDNode-based (X86hadd / X86hsub); the
+// psign and saturating horizontal records are intrinsic-based. Word-sized
+// operations use SSE_PHADDSUBW and dword-sized ones use SSE_PHADDSUBD.
+let ImmT = NoImm, Predicates = [HasAVX] in {
+let isCommutable = 0 in {
+ defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
+ loadv2i64, i128mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
+ loadv2i64, i128mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
+ loadv2i64, i128mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
+ loadv2i64, i128mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
+ int_x86_ssse3_psign_b_128,
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
+ int_x86_ssse3_psign_w_128,
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
+ int_x86_ssse3_psign_d_128,
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
+ int_x86_ssse3_phadd_sw_128,
+ SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+ defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
+ int_x86_ssse3_phsub_sw_128,
+ SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+}
+}
+
+// 256-bit VEX forms predicated on [HasAVX2, NoVLX_Or_NoBWI]. These mirror
+// the 128-bit VPSHUFB / VPMADDUBSW / VPMULHRSW instantiations with VR256,
+// i256mem, loadv4i64 and the extra VEX_L marker.
+let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+ defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
+ VR256, loadv4i64, i256mem,
+ SSE_PSHUFB, 0>, VEX_4V, VEX_L;
+ defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
+ v32i8, VR256, loadv4i64, i256mem,
+ SSE_PMADD, 0>, VEX_4V, VEX_L;
+}
+defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
+ VR256, loadv4i64, i256mem,
+ SSE_PMULHRSW, 0>, VEX_4V, VEX_L;
+}
+
+// 256-bit VEX forms predicated on [HasAVX2] only.
+// The horizontal add/sub records are SDNode-based (X86hadd / X86hsub) and
+// use the itinerary bundle that matches their element width, the same way
+// the 128-bit VPHADDD/VPHSUBD and legacy PHADDD/PHSUBD records do: word
+// operations take SSE_PHADDSUBW, dword operations take SSE_PHADDSUBD.
+// The psign and saturating horizontal records are intrinsic-based and pass
+// WriteVecALU directly. VPHADDSW / VPHSUBSW reuse the 128-bit defm names;
+// the records stay distinct because SS3I_binop_rm_int_y names them
+// rr256 / rm256.
+let ImmT = NoImm, Predicates = [HasAVX2] in {
+let isCommutable = 0 in {
+ defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
+ VR256, loadv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
+ loadv4i64, i256mem,
+ SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
+ defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
+ VR256, loadv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
+ loadv4i64, i256mem,
+ SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
+ defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
+ WriteVecALU>, VEX_4V, VEX_L;
+ defm VPSIGNWY : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
+ WriteVecALU>, VEX_4V, VEX_L;
+ defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
+ WriteVecALU>, VEX_4V, VEX_L;
+ defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
+ int_x86_avx2_phadd_sw,
+ WriteVecALU>, VEX_4V, VEX_L;
+ defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
+ int_x86_avx2_phsub_sw,
+ WriteVecALU>, VEX_4V, VEX_L;
+}
+}
+
+// None of these have i8 immediate fields.
+// Legacy (non-VEX) SSSE3 forms. $src1 is tied to $dst, Is2Addr is left at
+// its default of 1 (two-operand asm string), and folded loads go through
+// memopv2i64 rather than loadv2i64. No Predicates are set in this scope.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+let isCommutable = 0 in {
+ defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBW>;
+ defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBD>;
+ defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBW>;
+ defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBD>;
+ defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
+ SSE_PSIGN, memopv2i64>;
+ defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
+ SSE_PSIGN, memopv2i64>;
+ defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
+ SSE_PSIGN, memopv2i64>;
+ defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
+ memopv2i64, i128mem, SSE_PSHUFB>;
+ defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
+ int_x86_ssse3_phadd_sw_128,
+ SSE_PHADDSUBSW, memopv2i64>;
+ defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
+ int_x86_ssse3_phsub_sw_128,
+ SSE_PHADDSUBSW, memopv2i64>;
+ defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
+ v16i8, VR128, memopv2i64, i128mem,
+ SSE_PMADD>;
+}
+defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
+ VR128, memopv2i64, i128mem, SSE_PMULHRSW>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Align Instruction Patterns
+//===---------------------------------------------------------------------===//
+
+/// ssse3_palignr - 128-bit PALIGNR, opcode 0x0F.
+/// Defines rri (register source) and rmi (i128mem source), each with a
+/// u8imm third operand. Both records carry an empty pattern list, so
+/// selection relies on separate Pat definitions. Is2Addr picks between the
+/// two-operand and three-operand asm strings.
+multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
+ let hasSideEffects = 0 in {
+ def rri : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
+ // mayLoad is spelled out because the empty pattern cannot imply it.
+ let mayLoad = 1 in
+ def rmi : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ }
+}
+
+/// ssse3_palignr_y - 256-bit PALIGNR, opcode 0x0F.
+/// Defines Yrri and Yrmi on VR256 / i256mem with empty pattern lists and no
+/// itinerary class. The asm string is always the three-operand form; the
+/// Is2Addr parameter is accepted but not referenced in the body.
+multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
+ let hasSideEffects = 0 in {
+ def Yrri : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteShuffle]>;
+ let mayLoad = 1 in
+ def Yrmi : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ }
+}
+
+// Instantiate PALIGNR: VEX 128-bit (VPALIGNRrri/rmi), VEX 256-bit
+// (VPALIGNRYrri/Yrmi, same defm name with the Y-prefixed record suffixes),
+// and the legacy two-operand form with $src1 tied to $dst.
+let Predicates = [HasAVX] in
+ defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+let Predicates = [HasAVX2] in
+ defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
+ defm PALIGNR : ssse3_palignr<"palignr">;
+
+// Selection patterns for X86PAlignr. The PALIGNR records themselves have
+// empty pattern lists, so each supported vector type is mapped explicitly
+// onto the register-register record. No memory-form (rmi) patterns are
+// defined in this group.
+
+// 256-bit vectors -> VPALIGNRYrri.
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+}
+
+// 128-bit vectors, VEX encoding -> VPALIGNRrri.
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+// 128-bit vectors, legacy encoding -> PALIGNRrri.
+let Predicates = [UseSSSE3] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Thread synchronization
+//===---------------------------------------------------------------------===//
+
+// MONITOR / MWAIT. Every record here requires HasSSE3 and is scheduled as
+// WriteSystem.
+let SchedRW = [WriteSystem] in {
+// Pseudo that carries the int_x86_sse3_monitor pattern with explicit
+// operands; it is expanded by a custom inserter (usesCustomInserter = 1).
+let usesCustomInserter = 1 in {
+def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+ [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
+ Requires<[HasSSE3]>;
+}
+
+// Encodable form: no explicit operands, implicit uses of EAX, ECX, EDX.
+let Uses = [EAX, ECX, EDX] in
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
+ TB, Requires<[HasSSE3]>;
+
+// MWAIT matches the intrinsic directly on the implicit ECX / EAX registers.
+let Uses = [ECX, EAX] in
+def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
+ [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
+ TB, Requires<[HasSSE3]>;
+} // SchedRW
+
+// Assembler aliases that accept the registers written out explicitly and
+// map them onto the operand-less records, one spelling per CPU mode.
+def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
+ Requires<[Not64BitMode]>;
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Move with Sign/Zero Extend
+//===----------------------------------------------------------------------===//
+
+/// SS41I_pmovx_rrrm - One PMOVSX/PMOVZX instruction in register (rr) and
+/// memory (rm) form. InRC and OutRC may differ (the 256-bit variants read
+/// from a 128-bit register class). Both records have empty pattern lists;
+/// selection patterns are supplied separately.
+multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
+ RegisterClass OutRC, RegisterClass InRC,
+ OpndItins itins> {
+ def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [], itins.rr>,
+ Sched<[itins.Sched]>;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [],
+ itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+/// SS41I_pmovx_rm_all - Expands one extend operation into three encodings:
+///   NAME       - legacy form, VR128 -> VR128, no Predicates set here.
+///   V#NAME     - VEX 128-bit form under [HasAVX, prd], "v"-prefixed mnemonic.
+///   V#NAME#Y   - VEX 256-bit form under [HasAVX2, prd], VR128 -> VR256,
+///                using MemYOp for the memory operand.
+multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
+ X86MemOperand MemOp, X86MemOperand MemYOp,
+ OpndItins SSEItins, OpndItins AVXItins,
+ OpndItins AVX2Itins, Predicate prd> {
+ defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
+ let Predicates = [HasAVX, prd] in
+ defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
+ VR128, VR128, AVXItins>, VEX;
+ let Predicates = [HasAVX2, prd] in
+ defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
+ VR256, VR128, AVX2Itins>, VEX, VEX_L;
+}
+
+/// SS41I_pmovx_rm - Emits the sign- and zero-extending pair for one element
+/// size combination. The defm name (e.g. BW) is appended to PMOVSX / PMOVZX
+/// to form the record names, and OpcodeStr is appended to "pmovsx" /
+/// "pmovzx" to form the mnemonics. The zero-extend opcode is the
+/// sign-extend opcode plus 0x10.
+multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
+ X86MemOperand MemYOp, Predicate prd> {
+ defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
+ MemOp, MemYOp,
+ SSE_INTALU_ITINS_SHUFF_P,
+ DEFAULT_ITINS_SHUFFLESCHED,
+ DEFAULT_ITINS_SHUFFLESCHED, prd>;
+ defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
+ !strconcat("pmovzx", OpcodeStr),
+ MemOp, MemYOp,
+ SSE_INTALU_ITINS_SHUFF_P,
+ DEFAULT_ITINS_SHUFFLESCHED,
+ DEFAULT_ITINS_SHUFFLESCHED, prd>;
+}
+
+// Instantiate every PMOVSX/PMOVZX element-size combination. The two memory
+// operands are for the 128-bit and 256-bit forms respectively; they shrink
+// as the extension ratio grows (2x: i64mem/i128mem, 4x: i32mem/i64mem,
+// 8x: i16mem/i32mem). Only BW is gated on NoVLX_Or_NoBWI; the rest use NoVLX.
+defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
+defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
+defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
+
+defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
+defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
+
+defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
+
+// AVX2 Patterns
+// Selection patterns for the 256-bit (Y-suffixed) PMOVSX/PMOVZX records.
+// OpcPrefix is the record-name prefix ("VPMOVSX" / "VPMOVZX"), ExtTy is the
+// extload prefix ("s" / "z") used to look up <ExtTy>extloadvi8/16/32, and
+// ExtOp is the extend node matched on already-loaded or register values.
+// NOTE(review): the Predicates below list HasAVX, while the Y records these
+// patterns select are defined under HasAVX2 in SS41I_pmovx_rm_all - confirm
+// that HasAVX (rather than HasAVX2) is intended here.
+multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
+ // Register-Register patterns
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
+
+ def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
+ (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
+ }
+
+ // Simple Register-Memory patterns
+ // These match the extending-load fragments directly.
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+ def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+ def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ }
+
+ // AVX2 Register-Memory patterns
+ // These fold a separate load (full vector load, scalar_to_vector of a
+ // scalar load, vzmovl or vzload) feeding ExtOp into the rm record.
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+ def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+ def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+ def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ }
+}
+
+defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
+defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
+
+// SSE4.1/AVX patterns.
+// Selection patterns for the 128-bit PMOVSX/PMOVZX records. Parameters are
+// as for the 256-bit pattern multiclass, plus ExtLoad16: the scalar load
+// fragment used for the byte-to-qword case, which reads 16 bits of source.
+// NOTE(review): this multiclass sets Predicates = [HasAVX, ...] internally
+// and is also instantiated below inside `let Predicates = [UseSSE41]`.
+// Which predicate list ends up on the PMOVSX/PMOVZX patterns depends on
+// TableGen's let-scoping order - confirm the legacy patterns are gated as
+// intended.
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
+ SDNode ExtOp, PatFrag ExtLoad16> {
+ // Register-register patterns.
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
+
+ def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
+ (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
+ }
+ // Extending-load fragments matched directly.
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+ def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+ def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ }
+ // Separate load (scalar_to_vector of a scalar load, vzmovl, vzload or a
+ // full vector load) feeding ExtOp, folded into the rm record.
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+
+ // The only use of ExtLoad16 in this multiclass.
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ }
+}
+
+// VEX-encoded instantiations.
+defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
+defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+
+// Legacy-encoded instantiations.
+let Predicates = [UseSSE41] in {
+ defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
+ defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Extract Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
+/// rr writes the X86pextrb result to a GR32orGR64 destination.
+/// mr stores the result to i8mem; its pattern matches a store of the
+/// truncated-to-i8, assertzext-wrapped X86pextrb value.
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
+ imm:$src2))]>,
+ Sched<[WriteShuffle]>;
+ let hasSideEffects = 0, mayStore = 1,
+ SchedRW = [WriteShuffleLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
+ imm:$src2)))), addr:$dst)]>;
+}
+
+// The VEX form is gated on [HasAVX, NoBWI]; the legacy form is instantiated
+// without an enclosing Predicates `let`.
+let Predicates = [HasAVX, NoBWI] in
+ defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
+
+defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+/// rr_REV is a pattern-less register form marked isCodeGenOnly with
+/// ForceDisassemble set. mr stores the truncated-to-i16,
+/// assertzext-wrapped X86pextrw value to i16mem.
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteShuffle]>;
+
+ let hasSideEffects = 0, mayStore = 1,
+ SchedRW = [WriteShuffleLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2)))), addr:$dst)]>;
+}
+
+// The VEX form is gated on [HasAVX, NoBWI]; the legacy form is instantiated
+// without an enclosing Predicates `let`.
+let Predicates = [HasAVX, NoBWI] in
+ defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
+
+defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+/// Both forms match extractelt on a v4i32 source: rr writes GR32, mr stores
+/// the element to i32mem.
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
+ Sched<[WriteShuffle]>;
+ let SchedRW = [WriteShuffleLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+ addr:$dst)]>;
+}
+
+// The VEX form is gated on [HasAVX, NoDQI]; the legacy form is instantiated
+// without an enclosing Predicates `let`.
+let Predicates = [HasAVX, NoDQI] in
+ defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
+
+defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
+
+/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
+/// Both forms match extractelt on a v2i64 source: rr writes GR64, mr stores
+/// the element to i64mem. Both records carry REX_W.
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR64:$dst,
+ (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
+ Sched<[WriteShuffle]>, REX_W;
+ let SchedRW = [WriteShuffleLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+ addr:$dst)]>, REX_W;
+}
+
+// Same opcode (0x16) as the 32-bit extract; the VEX form adds VEX_W.
+let Predicates = [HasAVX, NoDQI] in
+ defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
+
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
+
+/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
+/// destination
+/// The source is typed v4f32 and viewed as v4i32 (bc_v4i32) before the
+/// extractelt, so the extracted value is handled as an integer. rr writes
+/// GR32orGR64; mr stores to f32mem. itins defaults to DEFAULT_ITINS.
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
+ OpndItins itins = DEFAULT_ITINS> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32orGR64:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
+ itins.rr>, Sched<[WriteFBlend]>;
+ let SchedRW = [WriteFBlendLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+ addr:$dst)], itins.rm>;
+}
+
+// Only the VEX form is under a Predicates `let` (brace-less, so it covers
+// the next defm only); the legacy form passes SSE_EXTRACT_ITINS.
+let ExeDomain = SSEPackedSingle in {
+ let Predicates = [UseAVX] in
+ defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+ defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
+}
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+// The extracted i32 element is bitconverted back to f32 before the store;
+// both patterns fold that into the memory-destination record.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[HasAVX]>;
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[UseSSE41]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Insert Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_insert8 - SSE 4.1 byte insert matching X86pinsrb.
+/// rr takes the value from a GR32orGR64 register; rm takes it from i8mem
+/// through extloadi8. Is2Addr picks the two- or three-operand asm string.
+multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+ imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// VEX form: three-operand syntax. Legacy form: $src1 tied to $dst.
+let Predicates = [HasAVX, NoBWI] in
+ defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+ defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
+
+/// SS41I_insert32 - SSE 4.1 dword insert matching insertelt on v4i32.
+/// rr takes the value from GR32; rm takes it from i32mem through loadi32.
+/// Is2Addr picks the two- or three-operand asm string.
+multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+ imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// VEX form: three-operand syntax. Legacy form: $src1 tied to $dst.
+let Predicates = [HasAVX, NoDQI] in
+ defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+ defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+/// SS41I_insert64 - SSE 4.1 qword insert matching insertelt on v2i64.
+/// rr takes the value from GR64; rm takes it from i64mem through loadi64.
+/// Is2Addr picks the two- or three-operand asm string.
+multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR64:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+ imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Same opcode (0x22) as the dword insert; the VEX form adds VEX_W and the
+// legacy form adds REX_W at the defm site.
+let Predicates = [HasAVX, NoDQI] in
+ defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
+let Constraints = "$src1 = $dst" in
+ defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
+
+// insertps has a few different modes, there's the first two here below which
+// are optimized inserts that won't zero arbitrary elements in the destination
+// vector. The next one matches the intrinsic and could zero arbitrary elements
+// in the target vector.
+//
+// SS41I_insertf32 defines the rr and rm records matching X86insertps. The
+// memory form takes f32mem and matches a scalar f32 load promoted to v4f32
+// with scalar_to_vector. itins defaults to DEFAULT_ITINS.
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
+ Sched<[WriteFShuffle]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86insertps VR128:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ imm:$src3))], itins.rm>,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+// VEX form: three-operand syntax under [UseAVX], default itinerary.
+// Legacy form: $src1 tied to $dst, SSE_INSERT_ITINS.
+let ExeDomain = SSEPackedSingle in {
+ let Predicates = [UseAVX] in
+ defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+ let Constraints = "$src1 = $dst" in
+ defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
+}
+
+// SSE4.1-only load folding for insertps; both patterns select INSERTPSrm.
+let Predicates = [UseSSE41] in {
+ // If we're inserting an element from a load or a null pshuf of a load,
+ // fold the load into the insertps instruction.
+ // Case 1: scalar f32 load placed in a vector, then PSHUFD with immediate 0.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+ (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+ imm:$src3)),
+ (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+ // Case 2: full v4f32 load followed by PSHUFD with immediate 0. The selected
+ // instruction takes an f32mem operand, so presumably only the first element
+ // at addr:$src2 is read - TODO confirm.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+ (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+ (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+// AVX counterpart of the insertps load folding; both patterns select
+// VINSERTPSrm.
+let Predicates = [UseAVX] in {
+ // If we're inserting an element from a vbroadcast of a load, fold the
+ // load into the X86insertps instruction.
+ // Broadcast of a scalar f32 load.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+ (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+ (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+ // Broadcast of a v4f32 load.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+ (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+ (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Round Instructions
+//===----------------------------------------------------------------------===//
+
+// sse41_fp_unop_p - packed FP unary operation with an immediate, selected
+// from an intrinsic. Defines four instructions: PSr/PSm (packed single,
+// opcode opcps) and PDr/PDm (packed double, opcode opcpd).
+//   OpcodeStr  - mnemonic stem; "ps"/"pd" and the operand string are appended.
+//   x86memop   - memory operand class used by the memory forms.
+//   RC         - register class of the source and destination.
+//   mem_frag32 - load fragment for the PS memory form.
+//   mem_frag64 - load fragment for the PD memory form.
+//   V4F32Int   - intrinsic matched by the PS forms.
+//   V2F64Int   - intrinsic matched by the PD forms.
+multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ PatFrag mem_frag32, PatFrag mem_frag64,
+ Intrinsic V4F32Int, Intrinsic V2F64Int> {
+let ExeDomain = SSEPackedSingle in {
+ // Vector intrinsic operation, reg
+ def PSr : SS4AIi8<opcps, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
+ IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
+
+ // Vector intrinsic operation, mem
+ def PSm : SS4AIi8<opcps, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
+ IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+ // Vector intrinsic operation, reg
+ def PDr : SS4AIi8<opcpd, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
+ IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
+
+ // Vector intrinsic operation, mem
+ // Uses the memory itinerary, matching PSm; the register itinerary does not
+ // describe a load-folding form.
+ def PDm : SS4AIi8<opcpd, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
+ IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
+} // ExeDomain = SSEPackedDouble
+}
+
+// avx_fp_unop_rm - scalar SS/SD instructions on FR32/FR64 that take an extra
+// first source register ($src1) plus the rounded source and an immediate,
+// printed with three-operand syntax. The pattern lists are empty, so
+// hasSideEffects and mayLoad are stated explicitly; selection happens through
+// separate Pat<> records.
+multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+ // Single precision, register source.
+ def SSr : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAdd]>;
+
+ // Single precision, memory source.
+ let mayLoad = 1 in
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ // Double precision, register source.
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAdd]>;
+
+ // Double precision, memory source.
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+// sse41_fp_unop_s - scalar SS/SD instructions on FR32/FR64 with a single
+// source (register or memory) and an immediate, printed with two-operand
+// syntax. The pattern lists are empty, so hasSideEffects and mayLoad are
+// stated explicitly; selection happens through separate Pat<> records.
+multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+ // Single precision, register source.
+ def SSr : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAdd]>;
+
+ // Single precision, memory source.
+ let mayLoad = 1 in
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ // Double precision, register source.
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAdd]>;
+
+ // Double precision, memory source.
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+// sse41_fp_binop_s - intrinsic (_Int) scalar forms operating on whole VR128
+// registers: two vector sources plus an immediate, selected from F32Int
+// (SS forms) and F64Int (SD forms). Marked isCodeGenOnly. Is2Addr chooses
+// between the two-operand and three-operand asm strings.
+multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
+ // Single precision, register source.
+ def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[WriteFAdd]>;
+
+ // Single precision, memory source matched through sse_load_f32.
+ def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ // Double precision, register source.
+ def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[WriteFAdd]>;
+
+ // Double precision, memory source matched through sse_load_f64.
+ def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, isCodeGenOnly = 1
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
+// The VROUND prefix is instantiated several times; the multiclasses append
+// distinct suffixes (PSr/PSm/PDr/PDm, SSr_Int/..., SSr/...), so the resulting
+// record names do not collide. Opcodes 0x08/0x09 are the packed forms and
+// 0x0A/0x0B the scalar forms.
+let Predicates = [HasAVX] in {
+ // Intrinsic form
+ // 128-bit packed.
+ defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
+ loadv4f32, loadv2f64,
+ int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>, VEX;
+ // 256-bit packed.
+ defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
+ loadv8f32, loadv4f64,
+ int_x86_avx_round_ps_256,
+ int_x86_avx_round_pd_256>, VEX, VEX_L;
+ // Scalar intrinsic forms on VR128 (three-operand syntax, Is2Addr = 0).
+ defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
+ int_x86_sse41_round_ss,
+ int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+ // Scalar forms on FR32/FR64 without patterns.
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
+}
+
+// Select the generic scalar rounding nodes to VROUNDSS/VROUNDSD. The first
+// source operand is an IMPLICIT_DEF because only the rounded source matters
+// to these patterns. Immediates used: ffloor 0x9, fceil 0xA, ftrunc 0xB,
+// fnearbyint 0xC, frint 0x4 (see the ROUNDSS imm8 encoding in the Intel SDM
+// for the meaning of the individual bits).
+let Predicates = [UseAVX] in {
+ def : Pat<(ffloor FR32:$src),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+ def : Pat<(f32 (fnearbyint FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+ def : Pat<(f32 (frint FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+ def : Pat<(f64 (ftrunc FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+}
+
+// Select the generic vector rounding nodes to VROUNDPS/VROUNDPD, 128-bit and
+// 256-bit (Y). Immediates match the scalar patterns: ffloor 0x9,
+// fnearbyint 0xC, fceil 0xA, frint 0x4, ftrunc 0xB.
+let Predicates = [HasAVX] in {
+ // v4f32
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x9))>;
+ def : Pat<(v4f32 (fnearbyint VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0xA))>;
+ def : Pat<(v4f32 (frint VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0xB))>;
+
+ // v2f64
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x9))>;
+ def : Pat<(v2f64 (fnearbyint VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0xA))>;
+ def : Pat<(v2f64 (frint VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0xB))>;
+
+ // v8f32
+ def : Pat<(v8f32 (ffloor VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x9))>;
+ def : Pat<(v8f32 (fnearbyint VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0xC))>;
+ def : Pat<(v8f32 (fceil VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0xA))>;
+ def : Pat<(v8f32 (frint VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x4))>;
+ def : Pat<(v8f32 (ftrunc VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0xB))>;
+
+ // v4f64
+ def : Pat<(v4f64 (ffloor VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x9))>;
+ def : Pat<(v4f64 (fnearbyint VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0xC))>;
+ def : Pat<(v4f64 (fceil VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0xA))>;
+ def : Pat<(v4f64 (frint VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x4))>;
+ def : Pat<(v4f64 (ftrunc VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0xB))>;
+}
+
+// Non-VEX ROUND instantiations: packed forms (memop fragments instead of the
+// load fragments used by the VEX versions), pattern-less scalar forms on
+// FR32/FR64, and the scalar intrinsic forms with $dst tied to $src1.
+defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
+ memopv4f32, memopv2f64, int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>;
+
+defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;
+
+let Constraints = "$src1 = $dst" in
+defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round",
+ int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+
+// SSE4.1 selection of the generic rounding nodes to ROUNDSS/ROUNDSD and
+// ROUNDPS/ROUNDPD. Immediates: ffloor 0x9, fnearbyint 0xC, fceil 0xA,
+// frint 0x4, ftrunc 0xB. Unlike the AVX scalar patterns there is no extra
+// IMPLICIT_DEF operand because ROUNDSSr/ROUNDSDr take a single source.
+let Predicates = [UseSSE41] in {
+ // Scalar f32/f64
+ def : Pat<(ffloor FR32:$src),
+ (ROUNDSSr FR32:$src, (i32 0x9))>;
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0x9))>;
+ def : Pat<(f32 (fnearbyint FR32:$src)),
+ (ROUNDSSr FR32:$src, (i32 0xC))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil FR32:$src)),
+ (ROUNDSSr FR32:$src, (i32 0xA))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0xA))>;
+ def : Pat<(f32 (frint FR32:$src)),
+ (ROUNDSSr FR32:$src, (i32 0x4))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc FR32:$src)),
+ (ROUNDSSr FR32:$src, (i32 0xB))>;
+ def : Pat<(f64 (ftrunc FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0xB))>;
+
+ // v4f32
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x9))>;
+ def : Pat<(v4f32 (fnearbyint VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0xA))>;
+ def : Pat<(v4f32 (frint VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0xB))>;
+
+ // v2f64
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x9))>;
+ def : Pat<(v2f64 (fnearbyint VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0xA))>;
+ def : Pat<(v2f64 (frint VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0xB))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Bit Test
+//===----------------------------------------------------------------------===//
+
+// The ptest instruction. X86ISelLowering lowers to the X86ptest node,
+// primarily from the Intel intrinsic that corresponds to this instruction.
+// These are the VEX-encoded forms; they have no register output and only
+// define EFLAGS.
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+// 128-bit, register and memory second operand.
+def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+ Sched<[WriteVecLogic]>, VEX;
+def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+
+// 256-bit, register and memory second operand.
+def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
+ Sched<[WriteVecLogic]>, VEX, VEX_L;
+def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
+}
+
+// Non-VEX ptest: same X86ptest patterns as the VEX forms, but the memory
+// form uses the memopv2i64 fragment instead of loadv2i64.
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+ Sched<[WriteVecLogic]>;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "ptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+// The bit test instructions below are AVX only
+// avx_bittest - register (rr) and memory (rm) forms of a VEX-encoded test
+// selected from X86testp. Neither form has a register output; the result is
+// written to EFLAGS.
+//   RC       - register class of both operands.
+//   x86memop - memory operand class for the rm form.
+//   mem_frag - load fragment for the rm form.
+//   vt       - value type given to the second register operand in the pattern.
+multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
+ def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
+ Sched<[WriteVecLogic]>, VEX;
+ def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+}
+
+// VTESTPS (0x0E) and VTESTPD (0x0F), each in a 128-bit and a 256-bit (Y,
+// VEX_L) variant, placed in the matching packed-single/packed-double domain.
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedSingle in {
+defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
+ VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
+ VEX_L;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Misc Instructions
+//===----------------------------------------------------------------------===//
+
+// POPCNT for 16-, 32- and 64-bit general-purpose registers, each with a
+// register (rr) and a memory (rm) source. All forms are selected from ctpop
+// and also define EFLAGS (implicit EFLAGS in the pattern). The 64-bit forms
+// use the RI class instead of I plus an OpSize modifier.
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+ // 16-bit
+ def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
+ IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+ OpSize16, XS;
+ def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+ Sched<[WriteFAddLd]>, OpSize16, XS;
+
+ // 32-bit
+ def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
+ IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+ OpSize32, XS;
+
+ def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+ Sched<[WriteFAddLd]>, OpSize32, XS;
+
+ // 64-bit
+ def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
+ IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
+ def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+ Sched<[WriteFAddLd]>, XS;
+}
+
+
+
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+// Defines a register form (rr128) and a memory form (rm128), both selected
+// from the intrinsic IntId128. The memory form bitconverts the result of
+// ld_frag before passing it to the intrinsic and is scheduled with the
+// folded variant of Sched.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, PatFrag ld_frag,
+ X86FoldableSchedWrite Sched> {
+ def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ Sched<[Sched]>;
+ def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128 (bitconvert (ld_frag addr:$src))))]>,
+ Sched<[Sched.Folded]>;
+}
+
+// PHMIN has the same profile as PSAD, thus we use the same scheduling
+// model, although the naming is misleading.
+// That is why WriteVecIMul is passed below. Only the VEX form is guarded by
+// HasAVX; the `let` has no braces, so it applies to VPHMINPOSUW alone.
+let Predicates = [HasAVX] in
+defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
+ int_x86_sse41_phminposuw, loadv2i64,
+ WriteVecIMul>, VEX;
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+ int_x86_sse41_phminposuw, memopv2i64,
+ WriteVecIMul>;
+
+/// SS48I_binop_rm - Simple SSE41 binary operator.
+/// Defines rr and rm forms selected from OpNode with result type OpVT.
+///   RC         - register class of the operands and the result.
+///   memop_frag - load fragment for the rm form (bitconverted in the pattern).
+///   x86memop   - memory operand class for the rm form.
+///   Is2Addr    - 1 for the two-operand asm string, 0 for three operands.
+///   itins      - only its Sched member is used here.
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1,
+ OpndItins itins = SSE_INTALU_ITINS_P> {
+ // Only the register form is marked commutable.
+ let isCommutable = 1 in
+ def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[itins.Sched]>;
+ def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
+/// types.
+/// The result is typed DstVT while the first source is typed SrcVT in the
+/// patterns. Commutability of the rr form is controlled by IsCommutable
+/// (default 0); Is2Addr selects the two- or three-operand asm string.
+multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0, bit Is2Addr = 1> {
+ let isCommutable = IsCommutable in
+ def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+ Sched<[itins.Sched]>;
+ // Memory form: the loaded value is bitconverted to the second operand type.
+ def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+// 128-bit VEX min/max and pmuldq. The dword forms are guarded by NoVLX and
+// the byte/word forms by NoVLX_Or_NoBWI. All use three-operand syntax
+// (Is2Addr = 0).
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ // v4i32 sources, v2i64 result; commutable (IsCommutable = 1).
+ defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
+ VR128, loadv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+}
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+}
+
+// 256-bit (Y, VEX_L) counterparts of the VEX min/max and pmuldq definitions,
+// requiring AVX2 and using VR256/i256mem/loadv4i64.
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ // v8i32 sources, v4i64 result; commutable (IsCommutable = 1).
+ defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
+ VR256, loadv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+}
+
+// Non-VEX SSE4.1 min/max and pmuldq: destination tied to $src1, two-operand
+// syntax (Is2Addr = 1), memop fragments for the memory forms.
+let Constraints = "$src1 = $dst" in {
+ defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ // Is2Addr is left at its default of 1 here.
+ defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
+ VR128, memopv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1>;
+}
+
+// VEX-encoded pmulld (selected from mul) and pcmpeqq (selected from
+// X86pcmpeq), 128-bit and 256-bit.
+// NOTE(review): the 128-bit group is guarded by [HasAVX, NoVLX] but the
+// 256-bit group only by [HasAVX2], unlike the min/max groups earlier in this
+// file which add NoVLX to both widths - confirm whether that is intended.
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
+ loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
+ VEX_4V;
+ defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+}
+let Predicates = [HasAVX2] in {
+ defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
+ VEX_4V, VEX_L;
+ defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+}
+
+// Non-VEX pmulld and pcmpeqq with the destination tied to $src1.
+let Constraints = "$src1 = $dst" in {
+ defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
+ defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
+}
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+/// Intrinsic-selected variant: both forms match IntId applied to two sources
+/// and the immediate. The rri form is marked commutable here; instantiations
+/// may override that with an enclosing `let isCommutable = 0`.
+///   memop_frag/x86memop - load fragment and memory operand of the rmi form.
+///   Is2Addr             - 1 for two-operand asm syntax, 0 for three.
+///   itins               - supplies rr/rm itineraries and the Sched entry.
+multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (IntId RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
+/// SDNode-selected variant: both forms match OpNode with result type OpVT.
+/// Apart from matching an SDNode (with an explicit result type) rather than
+/// an intrinsic, the structure mirrors SS41I_binop_rmi_int.
+multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+// VEX-encoded immediate-controlled instructions: mpsadbw, blendps/pd,
+// pblendw and the dot products. All use three-operand syntax (Is2Addr = 0).
+let Predicates = [HasAVX] in {
+ // Overrides the isCommutable = 1 set inside the multiclass.
+ let isCommutable = 0 in {
+ defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
+ VR128, loadv2i64, i128mem, 0,
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
+ }
+
+ let ExeDomain = SSEPackedSingle in {
+ defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
+ VR128, loadv4f32, f128mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+ defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
+ VR256, loadv8f32, f256mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
+ VR128, loadv2f64, f128mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+ defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
+ VR256, loadv4f64, f256mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+ }
+ defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
+ VR128, loadv2i64, i128mem, 0,
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
+
+ let ExeDomain = SSEPackedSingle in
+ defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
+ VR128, loadv4f32, f128mem, 0,
+ SSE_DPPS_ITINS>, VEX_4V;
+ // Uses the DPPD itineraries, matching the non-VEX DPPD definition.
+ let ExeDomain = SSEPackedDouble in
+ defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
+ VR128, loadv2f64, f128mem, 0,
+ SSE_DPPD_ITINS>, VEX_4V;
+ let ExeDomain = SSEPackedSingle in
+ defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
+ VR256, loadv8f32, i256mem, 0,
+ SSE_DPPS_ITINS>, VEX_4V, VEX_L;
+}
+
+// 256-bit (AVX2) mpsadbw and pblendw; mpsadbw is explicitly non-commutable.
+let Predicates = [HasAVX2] in {
+ let isCommutable = 0 in {
+ defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
+ VR256, loadv4i64, i256mem, 0,
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
+ }
+ defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
+ VR256, loadv4i64, i256mem, 0,
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
+}
+
+// Non-VEX SSE4.1 mpsadbw, blends and dot products: destination tied to
+// $src1, two-operand syntax (Is2Addr = 1), memop fragments.
+let Constraints = "$src1 = $dst" in {
+ // Overrides the isCommutable = 1 set inside the multiclass.
+ let isCommutable = 0 in {
+ defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
+ VR128, memopv2i64, i128mem,
+ 1, SSE_MPSADBW_ITINS>;
+ }
+ let ExeDomain = SSEPackedSingle in
+ defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
+ VR128, memopv4f32, f128mem,
+ 1, SSE_INTALU_ITINS_FBLEND_P>;
+ let ExeDomain = SSEPackedDouble in
+ defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
+ VR128, memopv2f64, f128mem,
+ 1, SSE_INTALU_ITINS_FBLEND_P>;
+ defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
+ VR128, memopv2i64, i128mem,
+ 1, SSE_INTALU_ITINS_BLEND_P>;
+ let ExeDomain = SSEPackedSingle in
+ defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
+ VR128, memopv4f32, f128mem, 1,
+ SSE_DPPS_ITINS>;
+ let ExeDomain = SSEPackedDouble in
+ defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
+ VR128, memopv2f64, f128mem, 1,
+ SSE_DPPD_ITINS>;
+}
+
+/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
+/// (one destination and three sources). Both forms are selected from the
+/// intrinsic IntId; $src3 is a register operand in both, and only $src2 can
+/// come from memory. Built on Ii8Reg with NoItinerary.
+multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_frag, Intrinsic IntId,
+ X86FoldableSchedWrite Sched> {
+ // All-register form.
+ def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
+ Sched<[Sched]>;
+
+ // $src2 loaded from memory and bitconverted for the intrinsic.
+ def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+ RC:$src3))],
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
+ Sched<[Sched.Folded, ReadAfterLd]>;
+}
+
+// Variable blends: vblendvpd (0x4B), vblendvps (0x4A) and vpblendvb (0x4C),
+// each in 128-bit and 256-bit (Y, VEX_L) variants. The FP blends use
+// WriteFVarBlend and the byte blend uses WriteVarBlend. The 256-bit byte
+// blend requires AVX2.
+let Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedDouble in {
+defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
+ loadv2f64, int_x86_sse41_blendvpd,
+ WriteFVarBlend>;
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
+ loadv4f64, int_x86_avx_blendv_pd_256,
+ WriteFVarBlend>, VEX_L;
+} // ExeDomain = SSEPackedDouble
+let ExeDomain = SSEPackedSingle in {
+defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
+ loadv4f32, int_x86_sse41_blendvps,
+ WriteFVarBlend>;
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
+ loadv8f32, int_x86_avx_blendv_ps_256,
+ WriteFVarBlend>, VEX_L;
+} // ExeDomain = SSEPackedSingle
+defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
+ loadv2i64, int_x86_sse41_pblendvb,
+ WriteVarBlend>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
+ loadv4i64, int_x86_avx2_pblendvb,
+ WriteVarBlend>, VEX_L;
+}
+
+// Lower generic vselect to the VEX variable blends. The instruction operands
+// are given as ($src2, $src1, $mask), i.e. the two select inputs are swapped
+// relative to their order in the vselect node. Integer v4i32/v2i64 selects
+// (and the 256-bit v8i32/v4i64 ones) are matched to the PS/PD blends.
+let Predicates = [HasAVX] in {
+ def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
+ (v16i8 VR128:$src2))),
+ (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
+ (v4f32 VR128:$src2))),
+ (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
+ (v2f64 VR128:$src2))),
+ (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+ (v8i32 VR256:$src2))),
+ (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
+ (v8f32 VR256:$src2))),
+ (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+ (v4i64 VR256:$src2))),
+ (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
+ (v4f64 VR256:$src2))),
+ (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+}
+
+// The 256-bit byte-wise vselect is matched to VPBLENDVBYrr and is gated on
+// HasAVX2. The select inputs are passed to the instruction as ($src2, $src1).
+let Predicates = [HasAVX2] in {
+ def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
+ (v32i8 VR256:$src2))),
+ (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+}
+
+// Patterns
+// X86vzmovl selection for UseAVX targets: scalar sources are matched to
+// VMOVSS/VMOVSD against a zeroed register, vector sources to immediate blends
+// against a zeroed register (V_SET0 for 128-bit, AVX_SET0 for 256-bit).
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [UseAVX] in {
+ let AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVS{S,D} to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+ // NOTE(review): the v4i32 form uses a word blend with immediate 3,
+ // presumably to take the two low 16-bit words (one i32) from $src — confirm.
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+ (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+
+ // Move low f64 and clear high bits.
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+ (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
+ }
+
+ // These will incur an FP/int domain crossing penalty, but it may be the only
+ // way without AVX2. Do not add any complexity because we may be able to match
+ // more optimal patterns defined earlier in this file.
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+ (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+}
+
+// Non-VEX X86vzmovl selection for UseSSE41 targets: the 128-bit vector forms
+// are matched to BLENDPS/PBLENDW against a zeroed register (V_SET0).
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [UseSSE41], AddedComplexity = 15 in {
+ // With SSE41 we can use blends for these patterns.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+}
+
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+///
+/// Emits two defs per instantiation: rr0 (register source) and rm0 (memory
+/// source). XMM0 is an implicit use and is passed as the third operand of the
+/// intrinsic IntId; $src1 is tied to $dst, so only $src2 and $dst appear in
+/// the asm string.
+///   opc       - opcode byte.
+///   OpcodeStr - mnemonic.
+///   mem_frag  - load fragment for the rm0 form; its result is bitconverted.
+///   x86memop  - memory operand type of $src2 in the rm0 form.
+///   IntId     - intrinsic matched by both forms.
+///   itins     - itinerary/scheduling bundle (defaults to DEFAULT_ITINS).
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+ multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ X86MemOperand x86memop, Intrinsic IntId,
+ OpndItins itins = DEFAULT_ITINS> {
+ def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
+ itins.rr>, Sched<[itins.Sched]>;
+
+ def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src1,
+ (bitconvert (mem_frag addr:$src2)), XMM0))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+// Non-VEX variable blends built from SS41I_ternary_int. The two FP forms set
+// their execution domain and use DEFAULT_ITINS_FBLENDSCHED; pblendvb sets no
+// domain here and uses DEFAULT_ITINS_VARBLENDSCHED.
+let ExeDomain = SSEPackedDouble in
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
+ int_x86_sse41_blendvpd,
+ DEFAULT_ITINS_FBLENDSCHED>;
+let ExeDomain = SSEPackedSingle in
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
+ int_x86_sse41_blendvps,
+ DEFAULT_ITINS_FBLENDSCHED>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
+ int_x86_sse41_pblendvb,
+ DEFAULT_ITINS_VARBLENDSCHED>;
+
+// Aliases with the implicit xmm0 argument
+// Each alias spells xmm0 out as an explicit operand in the asm string and
+// maps to the corresponding rr0/rm0 def, which takes only $dst and $src2.
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+
+// Lower a 128-bit vselect whose mask is XMM0 to the implicit-XMM0 blends.
+// The select inputs are passed to the instruction swapped: ($src2, $src1).
+// Integer v4i32/v2i64 selects are matched to the PS/PD blends.
+let Predicates = [UseSSE41] in {
+ def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
+ (v16i8 VR128:$src2))),
+ (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
+ (v4f32 VR128:$src2))),
+ (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
+ (v2f64 VR128:$src2))),
+ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+}
+
+// (V)MOVNTDQA load definitions plus the patterns that select them for
+// alignednontemporalload. Everything in this region gets AddedComplexity 400.
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let SchedRW = [WriteLoad] in {
+// 128-bit VEX form.
+let Predicates = [HasAVX, NoVLX] in
+def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+ VEX;
+// 256-bit VEX form; requires HasAVX2.
+let Predicates = [HasAVX2, NoVLX] in
+def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
+ VEX, VEX_L;
+// Non-VEX form; no Predicates override is applied to it in this block.
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+} // SchedRW
+
+// 256-bit aligned non-temporal loads of v8f32/v4f64/v4i64.
+let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v8f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v4f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v4i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+}
+
+// 128-bit aligned non-temporal loads, VEX encoding.
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+ def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+ def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+}
+
+// 128-bit aligned non-temporal loads, non-VEX encoding.
+let Predicates = [UseSSE41] in {
+ def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+ def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+ def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+}
+
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS42I_binop_rm - Simple SSE 4.2 binary operator
+///
+/// Emits an rr (register/register) and an rm (register/memory) def that both
+/// match (OpVT (OpNode $src1, $src2)).
+///   opc        - opcode byte.
+///   OpcodeStr  - mnemonic.
+///   OpNode     - DAG node matched by the patterns.
+///   OpVT       - result value type of the matched node.
+///   RC         - register class of $dst, $src1 and the register $src2.
+///   memop_frag - load fragment used for $src2 in the rm form.
+///   x86memop   - memory operand type of $src2 in the rm form.
+///   Is2Addr    - when set (the default) the asm string prints only $src2 and
+///                $dst; when 0 it prints $src2, $src1 and $dst.
+multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1> {
+ def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
+ def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
+}
+
+// pcmpgtq instantiations of SS42I_binop_rm, all matching X86pcmpgt.
+// The VEX forms pass Is2Addr = 0 (three-operand asm string); the non-VEX form
+// keeps the default Is2Addr and ties $src1 to $dst.
+let Predicates = [HasAVX] in
+ defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
+ loadv2i64, i128mem, 0>, VEX_4V;
+
+let Predicates = [HasAVX2] in
+ defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in
+ defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
+ memopv2i64, i128mem>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - String/text Processing Instructions
+//===----------------------------------------------------------------------===//
+
+// Packed Compare Implicit Length Strings, Return Mask
+// pseudo_pcmpistrm emits REG and MEM pseudo instructions that carry the
+// selection pattern for int_x86_sse42_pcmpistrm128 and produce a VR128 $dst.
+// The `asm` parameter is not referenced in the multiclass body.
+multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
+ def REG : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
+ imm:$src3))]>;
+ def MEM : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+ (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
+}
+
+// The pseudos define EFLAGS and are marked usesCustomInserter.
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
+ Requires<[HasAVX]>;
+ defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
+ Requires<[UseSSE42]>;
+}
+
+// Real encodings (opcode 0x62). They have no explicit outputs and no
+// selection pattern.
+multiclass pcmpistrm_SS42AI<string asm> {
+ def rr : SS42AI<0x62, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrM]>;
+ let mayLoad = 1 in
+ def rm :SS42AI<0x62, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
+}
+
+// The real instructions implicitly define XMM0 and EFLAGS.
+let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+ defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ;
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+// pseudo_pcmpestrm emits REG and MEM pseudo instructions that carry the
+// selection pattern for int_x86_sse42_pcmpestrm128. EAX and EDX appear in the
+// pattern as the second and fourth intrinsic operands, not as explicit ins.
+// The `asm` parameter is not referenced in the multiclass body.
+multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
+ def REG : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
+ VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+ def MEM : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+ (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
+}
+
+// The pseudos define EFLAGS, use EAX/EDX and are marked usesCustomInserter.
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
+ Requires<[HasAVX]>;
+ defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
+ Requires<[UseSSE42]>;
+}
+
+// Real encodings (opcode 0x60). They have no explicit outputs and no
+// selection pattern.
+multiclass SS42AI_pcmpestrm<string asm> {
+ def rr : SS42AI<0x60, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrM]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x60, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
+}
+
+// The real instructions implicitly define XMM0/EFLAGS and use EAX/EDX.
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+// pseudo_pcmpistri emits REG and MEM pseudo instructions matching
+// X86pcmpistri; the pattern sets both the GR32 $dst and EFLAGS.
+// The `asm` parameter is not referenced in the multiclass body.
+multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+ (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
+}
+
+// The pseudos define EFLAGS and are marked usesCustomInserter.
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
+ Requires<[HasAVX]>;
+ defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
+ Requires<[UseSSE42]>;
+}
+
+// Real encodings (opcode 0x63). They have no explicit outputs and no
+// selection pattern.
+multiclass SS42AI_pcmpistri<string asm> {
+ def rr : SS42AI<0x63, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrI]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x63, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
+}
+
+// The real instructions implicitly define ECX and EFLAGS.
+let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+ defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+// pseudo_pcmpestri emits REG and MEM pseudo instructions matching
+// X86pcmpestri; the pattern sets both the GR32 $dst and EFLAGS, and names EAX
+// and EDX as the second and fourth node operands rather than as explicit ins.
+// The `asm` parameter is not referenced in the multiclass body.
+multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
+ imm:$src5))]>;
+}
+
+// The pseudos define EFLAGS, use EAX/EDX and are marked usesCustomInserter.
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
+ Requires<[HasAVX]>;
+ defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
+ Requires<[UseSSE42]>;
+}
+
+// Real encodings (opcode 0x61). They have no explicit outputs and no
+// selection pattern.
+multiclass SS42AI_pcmpestri<string asm> {
+ def rr : SS42AI<0x61, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrI]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x61, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
+}
+
+// The real instructions implicitly define ECX/EFLAGS and use EAX/EDX.
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+ defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - CRC Instructions
+//===----------------------------------------------------------------------===//
+
+// No CRC instructions have AVX equivalents
+
+// crc intrinsic instruction
+// This set of instructions is only rm; the only difference is the size
+// of r and m.
+// SS42I_crc32r is the register-source form (MRMSrcReg): $dst and $src1 use
+// the register class RCOut, $src2 uses RCIn, and the pattern applies Int to
+// ($src1, $src2).
+class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
+ RegisterClass RCIn, SDPatternOperator Int> :
+ SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
+ !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+ [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
+ Sched<[WriteFAdd]>;
+
+// Memory-source crc32 form (MRMSrcMem): $dst and $src1 use the register class
+// RCOut, $src2 is an x86memop memory operand, and the pattern applies Int to
+// ($src1, (load $src2)).
+class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
+ X86MemOperand x86memop, SDPatternOperator Int> :
+ SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
+ !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+ [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
+ IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+// Concrete crc32 instructions; $src1 is tied to $dst for all of them.
+// Opcode 0xF0 is used for the 8-bit source forms and 0xF1 for the 16/32/64-bit
+// source forms, which are distinguished by OpSize16 / OpSize32 / REX_W.
+let Constraints = "$src1 = $dst" in {
+ def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
+ int_x86_sse42_crc32_32_8>;
+ def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
+ int_x86_sse42_crc32_32_8>;
+ def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
+ int_x86_sse42_crc32_32_16>, OpSize16;
+ def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
+ int_x86_sse42_crc32_32_16>, OpSize16;
+ def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
+ int_x86_sse42_crc32_32_32>, OpSize32;
+ def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
+ int_x86_sse42_crc32_32_32>, OpSize32;
+ def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
+ int_x86_sse42_crc32_64_64>, REX_W;
+ def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
+ int_x86_sse42_crc32_64_64>, REX_W;
+ // The GR64 <- 8-bit forms are instantiated with null_frag instead of an
+ // intrinsic, so hasSideEffects/mayLoad are stated explicitly.
+ let hasSideEffects = 0 in {
+ let mayLoad = 1 in
+ def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
+ null_frag>, REX_W;
+ def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
+ null_frag>, REX_W;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SHA-NI Instructions
+//===----------------------------------------------------------------------===//
+
+// SHAI_binop - two-operand SHA instruction with rr and rm forms (T8 map).
+//   Opc       - opcode byte.
+//   OpcodeStr - mnemonic.
+//   IntId     - intrinsic matched by both forms.
+//   UsesXMM0  - when set, XMM0 is appended as a third operand of IntId in the
+//               pattern; otherwise IntId takes only ($src1, $src2).
+// In the rm form the memory operand is loaded with memopv2i64 and cast with
+// bc_v4i32.
+multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
+ bit UsesXMM0 = 0> {
+ def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [!if(UsesXMM0,
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
+
+ def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [!if(UsesXMM0,
+ (set VR128:$dst, (IntId VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+ (set VR128:$dst, (IntId VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
+}
+
+// SHA instruction definitions. All are gated on HasSHA and tie $src1 to $dst.
+let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
+ // sha1rnds4 takes an extra 8-bit immediate, so it is defined directly
+ // (Ii8, TA map) instead of through SHAI_binop.
+ def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
+ (i8 imm:$src3)))]>, TA;
+ def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_sha1rnds4 VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)),
+ (i8 imm:$src3)))]>, TA;
+
+ defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
+ defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
+ defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;
+
+ // sha256rnds2 is the only instantiation with UsesXMM0 = 1; XMM0 is declared
+ // as an implicit use to match.
+ let Uses=[XMM0] in
+ defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;
+
+ defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
+ defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
+}
+
+// Aliases with explicit %xmm0
+// The asm strings spell xmm0 out as an operand and map to SHA256RNDS2rr/rm,
+// which take only $dst and $src2.
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// AES-NI Instructions
+//===----------------------------------------------------------------------===//
+
+// AESI_binop_rm_int - binary AES instruction with rr and rm forms, both
+// matching the intrinsic IntId128 on ($src1, $src2).
+//   opc       - opcode byte.
+//   OpcodeStr - mnemonic.
+//   IntId128  - intrinsic matched by both forms.
+//   ld_frag   - load fragment used for $src2 in the rm form.
+//   Is2Addr   - when set (the default) the asm string prints only $src2 and
+//               $dst; when 0 it prints $src2, $src1 and $dst.
+multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ Sched<[WriteAESDecEnc]>;
+ def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
+ Sched<[WriteAESDecEncLd, ReadAfterLd]>;
+}
+
+// Perform One Round of an AES Encryption/Decryption Flow
+// VEX forms: require HasAVX and HasAES, use loadv2i64 for the memory operand
+// and pass Is2Addr = 0 (three-operand asm string).
+let Predicates = [HasAVX, HasAES] in {
+ defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
+ int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
+ defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
+ int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
+ defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
+ int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
+ defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
+ int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
+}
+
+// Non-VEX AES round instructions: same opcodes and intrinsics as the VEX
+// forms, but with $src1 tied to $dst, memopv2i64 for the memory operand and
+// the default two-operand asm string.
+let Constraints = "$src1 = $dst" in {
+ defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
+ int_x86_aesni_aesenc, memopv2i64>;
+ defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
+ int_x86_aesni_aesenclast, memopv2i64>;
+ defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
+ int_x86_aesni_aesdec, memopv2i64>;
+ defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
+ int_x86_aesni_aesdeclast, memopv2i64>;
+}
+
+// Perform the AES InvMixColumn Transformation
+// Unary operation matching int_x86_aesni_aesimc. The VEX forms require HasAVX
+// and HasAES and load with loadv2i64; the non-VEX forms load with memopv2i64
+// and have no Predicates override in this block.
+let Predicates = [HasAVX, HasAES] in {
+ def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1),
+ "vaesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
+ VEX;
+ def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1),
+ "vaesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
+ Sched<[WriteAESIMCLd]>, VEX;
+}
+def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
+def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+ Sched<[WriteAESIMCLd]>;
+
+// AES Round Key Generation Assist
+// Matches int_x86_aesni_aeskeygenassist on a vector source and an 8-bit
+// immediate. The VEX forms require HasAVX and HasAES and load with loadv2i64;
+// the non-VEX forms load with memopv2i64 and have no Predicates override in
+// this block.
+let Predicates = [HasAVX, HasAES] in {
+ def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ Sched<[WriteAESKeyGen]>, VEX;
+ def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
+ Sched<[WriteAESKeyGenLd]>, VEX;
+}
+def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ Sched<[WriteAESKeyGen]>;
+def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+ Sched<[WriteAESKeyGenLd]>;
+
+//===----------------------------------------------------------------------===//
+// PCLMUL Instructions
+//===----------------------------------------------------------------------===//
+
+// AVX carry-less Multiplication instructions
+// Three-operand VEX forms matching int_x86_pclmulqdq with an 8-bit immediate.
+// Only the rr form is marked isCommutable; the rm form loads $src2 with
+// loadv2i64.
+let isCommutable = 1 in
+def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[WriteCLMul]>;
+
+def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (loadv2i64 addr:$src2), imm:$src3))]>,
+ Sched<[WriteCLMulLd, ReadAfterLd]>;
+
+// Carry-less Multiplication instructions
+// Non-VEX forms: $src1 is tied to $dst, so the asm string prints only $src3,
+// $src2 and $dst. Only the rr form is marked isCommutable; the rm form loads
+// $src2 with memopv2i64.
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in
+def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
+ IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
+
+def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (memopv2i64 addr:$src2), imm:$src3))],
+ IIC_SSE_PCLMULQDQ_RM>,
+ Sched<[WriteCLMulLd, ReadAfterLd]>;
+} // Constraints = "$src1 = $dst"
+
+
+// pclmul_alias - immediate-free mnemonic aliases for (v)pclmulqdq.
+// The mnemonic is built as "pclmul" # asm # "dq" (and "vpclmul" # asm # "dq"),
+// and each alias maps to the rr/rm instruction with the immediate fixed to
+// immop. Four aliases are emitted per instantiation: non-VEX rr/rm and VEX
+// rr/rm.
+// NOTE(review): the trailing 0 on each InstAlias is presumably the flag that
+// keeps the alias from being used when printing — confirm against InstAlias.
+multiclass pclmul_alias<string asm, int immop> {
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+ (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
+
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+ (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
+
+ def : InstAlias<!strconcat("vpclmul", asm,
+ "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
+ 0>;
+
+ def : InstAlias<!strconcat("vpclmul", asm,
+ "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
+ 0>;
+}
+// Mnemonic infix -> fixed immediate.
+defm : pclmul_alias<"hqhq", 0x11>;
+defm : pclmul_alias<"hqlq", 0x01>;
+defm : pclmul_alias<"lqhq", 0x10>;
+defm : pclmul_alias<"lqlq", 0x00>;
+
+//===----------------------------------------------------------------------===//
+// SSE4A Instructions
+//===----------------------------------------------------------------------===//
+
+// Everything in this region is gated on HasSSE4A: the extrq/insertq bit-field
+// instructions and the scalar non-temporal stores movntss/movntsd.
+let Predicates = [HasSSE4A] in {
+
+let ExeDomain = SSEPackedInt in {
+// $src is tied to $dst for all four extrq/insertq definitions.
+let Constraints = "$src = $dst" in {
+// Immediate form: length and index are 8-bit immediates; matches X86extrqi.
+def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
+ (ins VR128:$src, u8imm:$len, u8imm:$idx),
+ "extrq\t{$idx, $len, $src|$src, $len, $idx}",
+ [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
+ imm:$idx))]>, PD;
+// Register form: the second operand is a VR128 $mask; matches the intrinsic.
+def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "extrq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
+ VR128:$mask))]>, PD;
+
+// Immediate form: takes a second vector plus length/index immediates; matches
+// X86insertqi.
+def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
+ "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
+ [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
+ imm:$len, imm:$idx))]>, XD;
+// Register form: the second operand is a VR128 $mask; matches the intrinsic.
+def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "insertq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
+ VR128:$mask))]>, XD;
+}
+} // ExeDomain = SSEPackedInt
+
+// Non-temporal (unaligned) scalar stores.
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+// The instructions carry no pattern of their own ([]); selection is done by
+// the two Pat definitions that follow them.
+let mayStore = 1, SchedRW = [WriteStore] in {
+def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+ "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;
+
+def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movntsd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
+} // SchedRW
+
+// The instructions take a VR128 source, so the FR32/FR64 value is first
+// copied into the VR128 register class.
+def : Pat<(nontemporalstore FR32:$src, addr:$dst),
+ (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+def : Pat<(nontemporalstore FR64:$src, addr:$dst),
+ (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+} // AddedComplexity
+} // HasSSE4A
+
+//===----------------------------------------------------------------------===//
+// AVX Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST - Load from memory and broadcast to all elements of the
+// destination operand
+//
+// avx_broadcast_rm - memory-source broadcast (MRMSrcMem, VEX).
+//   opc       - opcode byte.
+//   OpcodeStr - mnemonic.
+//   RC        - destination register class.
+//   x86memop  - memory operand type of $src.
+//   VT        - result vector type of the X86VBroadcast node.
+//   ld_frag   - load fragment whose result is broadcast.
+//   Sched     - scheduling write class.
+class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag ld_frag, SchedWrite Sched> :
+ AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+ Sched<[Sched]>, VEX;
+
+// AVX2 adds register forms
+// avx2_broadcast_rr - register-source broadcast (MRMSrcReg, VEX). The source
+// is always a VR128 register viewed as OpVT; the result has type ResVT in
+// register class RC.
+class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
+ AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
+ Sched<[Sched]>, VEX;
+
+// Scalar broadcast instantiations. The memory-source (rm) forms require
+// HasAVX, the register-source (rr) forms require HasAVX2; all of them also
+// require NoVLX. vbroadcastsd is only defined here with a VR256 destination.
+let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
+ def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
+ f32mem, v4f32, loadf32, WriteLoad>;
+ def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
+ f32mem, v8f32, loadf32,
+ WriteFShuffleLd>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
+def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
+ v4f64, loadf64, WriteFShuffleLd>, VEX_L;
+
+let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
+ def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
+ v4f32, v4f32, WriteFShuffle>;
+ def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
+ v8f32, v4f32, WriteFShuffle256>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
+def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
+ v4f64, v2f64, WriteFShuffle256>, VEX_L;
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
+// halves of a 256-bit vector.
+//
+// Both 128-bit broadcasts are memory-only and carry no selection pattern
+// ([]), so mayLoad/hasSideEffects are stated explicitly. The integer form
+// requires HasAVX2, the FP form HasAVX.
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
+def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
+ (ins i128mem:$src),
+ "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteLoad]>, VEX, VEX_L;
+
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
+def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
+ (ins f128mem:$src),
+ "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteFShuffleLd]>, VEX, VEX_L;
+
+// X86SubVBroadcast of a 128-bit load, selected to the 128-bit broadcasts.
+// Integer types: VBROADCASTI128 when HasAVX2 (and NoVLX).
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTI128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VBROADCASTI128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTI128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTI128 addr:$src)>;
+}
+
+// Floating-point types: VBROADCASTF128 when HasAVX (and NoVLX).
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
+ (VBROADCASTF128 addr:$src)>;
+}
+
+// Integer types under HasAVX1Only: the same integer patterns are selected to
+// the FP-domain VBROADCASTF128 instead.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTF128 addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VINSERTF128 - Insert packed floating-point values
+//
+// vinsertf128 register and memory forms: a VR256 $src1, a 128-bit $src2 and
+// an 8-bit immediate $src3. Neither def carries a selection pattern ([]), so
+// hasSideEffects (and mayLoad on the rm form) are stated explicitly.
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR128:$src2, u8imm:$src3),
+ "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
+ "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+// vinsert_lowering - selection patterns for vinsert128_insert.
+//   InstrStr   - instruction name prefix; "rr" or "rm" is appended and the
+//                result is cast to an Instruction.
+//   From       - type of the inserted 128-bit value ($src2).
+//   To         - type of the 256-bit vector being inserted into ($src1).
+//   memop_frag - load fragment for the memory form; its result is
+//                bitconverted to From.
+// In both patterns the instruction immediate is derived from the matched
+// node via INSERT_get_vinsert128_imm.
+multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
+ PatFrag memop_frag> {
+ def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
+ (iPTR imm)),
+ (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+ def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
+ (From (bitconvert (memop_frag addr:$src2))),
+ (iPTR imm)),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
+
+// Floating-point element types use VINSERTF128 whenever AVX is available and
+// VLX is not.
+let Predicates = [HasAVX, NoVLX] in {
+ defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
+ defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
+}
+
+// Under HasAVX1Only the integer element types are lowered to VINSERTF128 too.
+// Every memory form matches a loadv2i64, which the multiclass bitconverts to
+// the 128-bit element type.
+let Predicates = [HasAVX1Only] in {
+ defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTF128 - Extract packed floating-point values
+//
+// vextractf128 instruction definitions. The rr form writes a VR128 result
+// from a VR256 source and an 8-bit immediate; the mr form stores to an
+// f128mem destination instead. Both carry empty pattern lists, so
+// hasSideEffects and mayStore are stated explicitly; selection patterns are
+// provided separately through the vextract_lowering multiclass.
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
+let mayStore = 1 in
+def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
+ "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, Sched<[WriteStore]>, VEX, VEX_L;
+}
+
+// Selection patterns for extracting a 128-bit subvector from a 256-bit vector.
+//   InstrStr - instruction name prefix; "rr" / "mr" is appended and the
+//              result is resolved with !cast<Instruction>.
+//   From     - value type of the 256-bit source vector.
+//   To       - value type of the extracted 128-bit subvector.
+// In both patterns the instruction immediate is produced by applying
+// EXTRACT_get_vextract128_imm to the matched vextract128_extract node ($ext).
+multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
+ // Extract to a register.
+ def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (To (!cast<Instruction>(InstrStr#rr)
+ (From VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+ // Extract followed by a store: folded into the mr form.
+ def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
+// AVX1 patterns
+// Floating-point element types use VEXTRACTF128 whenever AVX is available and
+// VLX is not.
+let Predicates = [HasAVX, NoVLX] in {
+ defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
+ defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
+}
+
+// Under HasAVX1Only the integer element types are extracted with
+// VEXTRACTF128 as well.
+let Predicates = [HasAVX1Only] in {
+ defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
+ defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
+ defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VMASKMOV - Conditional SIMD Packed Loads and Stores
+//
+// Defines the four forms of an AVX masked move:
+//   rm / Yrm - load forms writing VR128 / VR256, selected from IntLd /
+//              IntLd256 called as (intrinsic addr:$src2, $src1).
+//   mr / Ymr - store forms writing f128mem / f256mem, selected from IntSt /
+//              IntSt256 called as (intrinsic addr:$dst, $src1, $src2).
+// opc_rm is the opcode shared by the load forms and opc_mr the opcode shared
+// by the store forms; OpcodeStr is the mnemonic used in the asm string.
+multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
+ Intrinsic IntLd, Intrinsic IntLd256,
+ Intrinsic IntSt, Intrinsic IntSt256> {
+ def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
+ VEX_4V;
+ def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L;
+ def mr : AVX8I<opc_mr, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+ def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+// Instantiate the masked moves for single precision (load opcode 0x2C, store
+// opcode 0x2E) and double precision (load opcode 0x2D, store opcode 0x2F),
+// each tagged with the matching execution domain and bound to the
+// int_x86_avx_maskload_* / int_x86_avx_maskstore_* intrinsics.
+let ExeDomain = SSEPackedSingle in
+defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
+ int_x86_avx_maskload_ps,
+ int_x86_avx_maskload_ps_256,
+ int_x86_avx_maskstore_ps,
+ int_x86_avx_maskstore_ps_256>;
+let ExeDomain = SSEPackedDouble in
+defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
+ int_x86_avx_maskload_pd,
+ int_x86_avx_maskload_pd_256,
+ int_x86_avx_maskstore_pd,
+ int_x86_avx_maskstore_pd_256>;
+
+//===----------------------------------------------------------------------===//
+// VPERMIL - Permute Single and Double Floating-Point Values
+//
+// Defines the four forms of a vpermilps/vpermilpd style permute, all
+// predicated on [HasAVX, NoVLX]:
+//   rr / rm - variable permute (X86VPermilpv); the control vector of type
+//             i_vt comes from a register or from an i_frag load of
+//             x86memop_i that is bitconverted to i_vt. Opcode opc_rm.
+//   ri / mi - immediate permute (X86VPermilpi) of a register or of a value
+//             loaded from x86memop_f, controlled by an i8 immediate.
+//             Opcode opc_rmi.
+// RC is the register class of the data operands and the result; f_vt is the
+// result value type.
+multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop_f,
+ X86MemOperand x86memop_i, PatFrag i_frag,
+ ValueType f_vt, ValueType i_vt> {
+ let Predicates = [HasAVX, NoVLX] in {
+ def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
+ Sched<[WriteFShuffle]>;
+ def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop_i:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
+ (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+ def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
+ Sched<[WriteFShuffle]>;
+ // The memory operand is the only data source here, so the schedule lists
+ // just the folded-load write.
+ def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop_f:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
+ Sched<[WriteFShuffleLd]>;
+ }// Predicates = [HasAVX, NoVLX]
+}
+
+// Instantiate the permutes for VR128 and VR256 (the latter with VEX_L) in
+// single precision (opcodes 0x0C / 0x04) and double precision (opcodes
+// 0x0D / 0x05). Control vectors are loaded as v2i64 / v4i64.
+let ExeDomain = SSEPackedSingle in {
+ defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
+ loadv2i64, v4f32, v4i32>;
+ defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
+ loadv4i64, v8f32, v8i32>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
+ loadv2i64, v2f64, v2i64>;
+ defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
+ loadv4i64, v4f64, v4i64>, VEX_L;
+}
+
+// Stand-alone VPERMIL selection patterns, in four groups: 256-bit variable
+// permutes, 256-bit immediate permutes producing integer vector types,
+// 128-bit variable permutes, and 128-bit immediate permutes producing v2i64.
+let Predicates = [HasAVX, NoVLX] in {
+// 256-bit variable permutes (register and folded-load control vector).
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
+ (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+ (VPERMILPSYrm VR256:$src1, addr:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
+ (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
+ (VPERMILPDYrm VR256:$src1, addr:$src2)>;
+
+// 256-bit immediate permutes whose result type is an integer vector.
+def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPDYri VR256:$src1, imm:$imm)>;
+def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
+ (i8 imm:$imm))),
+ (VPERMILPSYmi addr:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
+ (VPERMILPDYmi addr:$src1, imm:$imm)>;
+
+// 128-bit variable permutes (register and folded-load control vector).
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
+ (VPERMILPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
+ (VPERMILPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
+ (VPERMILPDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
+ (VPERMILPDrm VR128:$src1, addr:$src2)>;
+
+// 128-bit immediate permutes producing v2i64.
+def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
+ (VPERMILPDri VR128:$src1, imm:$imm)>;
+def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
+ (VPERMILPDmi addr:$src1, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
+// vperm2f128 instruction definitions. The register form is marked
+// commutable and matches a v8f32 X86VPerm2x128; the memory form folds a
+// loadv8f32 as the second source. Other element types are handled by
+// separate patterns.
+let ExeDomain = SSEPackedSingle in {
+let isCommutable = 1 in
+def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+ (i8 imm:$src3))))]>, VEX_4V, VEX_L,
+ Sched<[WriteFShuffle]>;
+def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
+ (i8 imm:$src3)))]>, VEX_4V, VEX_L,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+// v4f64 X86VPerm2x128 (register and folded loadv4f64) also selects
+// VPERM2F128 whenever AVX is available.
+let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+ (loadv4f64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+// Under HasAVX1Only the integer vector types use VPERM2F128 as well. The
+// first group covers register sources; the second folds a loadv4i64
+// (bitcast through bc_* for the narrower element types) into the rm form.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
+ (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
+ (loadv4i64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
+ (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+ (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VZERO - Zero YMM registers
+//
+// Both instructions are modelled as defining YMM0-YMM15. They share opcode
+// 0x77 and the RawFrm form, take no explicit operands, and are selected from
+// their intrinsics; the two definitions differ in that only VZEROALL carries
+// VEX_L.
+let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
+ // Zero All YMM registers
+ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
+ [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
+
+ // Zero Upper bits of YMM registers
+ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
+ [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+// vcvtph2ps (half -> single conversion) forms.
+//   RC       - register class of the result.
+//   x86memop - memory operand type of the rm form.
+//   Int      - intrinsic matched by the rr form on a VR128 source.
+// The rm form has an empty pattern list (so hasSideEffects / mayLoad are set
+// explicitly); any load-folding patterns must be supplied separately.
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+ def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}",
+ [(set RC:$dst, (Int VR128:$src))]>,
+ T8PD, VEX, Sched<[WriteCvtF2F]>;
+ let hasSideEffects = 0, mayLoad = 1 in
+ def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
+ Sched<[WriteCvtF2FLd]>;
+}
+
+// vcvtps2ph (single -> half conversion) forms.
+//   RC       - register class of the source.
+//   x86memop - memory operand type of the mr (store) form.
+//   Int      - intrinsic matched by the rr form; it receives the source and
+//              the immediate $src2.
+// The result of the rr form is always VR128. The mr form has an empty
+// pattern list (so hasSideEffects / mayStore are set explicitly); any
+// store-folding patterns must be supplied separately.
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+ def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+ (ins RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+ TAPD, VEX, Sched<[WriteCvtF2F]>;
+ let hasSideEffects = 0, mayStore = 1,
+ SchedRW = [WriteCvtF2FLd, WriteRMW] in
+ def mr : Ii8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ TAPD, VEX;
+}
+
+// F16C instruction instantiations plus the patterns that fold memory
+// operands into them.
+let Predicates = [HasF16C] in {
+ // The 128-bit forms use a 64-bit memory operand and the 256-bit forms a
+ // 128-bit one.
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+
+ // Pattern match vcvtph2ps of a scalar i64 load.
+ // Three shapes of the load are accepted: vzmovl_v2i64, vzload_v2i64, and a
+ // bitconverted scalar_to_vector of loadi64.
+ def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+ (VCVTPH2PSrm addr:$src)>;
+ def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+ (VCVTPH2PSrm addr:$src)>;
+ def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VCVTPH2PSrm addr:$src)>;
+
+ // Fold a store of element 0 of the 128-bit conversion result (viewed as
+ // f64 or as i64) into the memory-destination form.
+ def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
+ (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
+ addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
+ (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
+ addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ // Fold a store of the whole v8i16 result of the 256-bit conversion.
+ def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
+ addr:$dst),
+ (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+}
+
+// Patterns for matching conversions from float to half-float and vice versa.
+let Predicates = [HasF16C, NoVLX] in {
+ // Use MXCSR.RC for rounding instead of explicitly specifying the default
+ // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
+ // configurations we support (the default). However, falling back to MXCSR is
+ // more consistent with other instructions, which are always controlled by it.
+ // It's encoded as 0b100.
+ // fp_to_f16: copy the FR32 value into VR128, convert with immediate 4,
+ // move the result to a GPR with VMOVPDI2DIrr and take its sub_16bit part.
+ def : Pat<(fp_to_f16 FR32:$src),
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
+ (COPY_TO_REGCLASS FR32:$src, VR128), 4)), sub_16bit))>;
+
+ // f16_to_fp: widen the GR16 value with MOVSX32rr16, copy it into VR128,
+ // convert, and copy the result to FR32.
+ def : Pat<(f16_to_fp GR16:$src),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+ (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
+
+ // Round trip through half precision: both conversions are chained on
+ // VR128 values, with no move through a general-purpose register.
+ def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+ (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 4)), FR32)) >;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX2 Instructions
+//===----------------------------------------------------------------------===//
+
+/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
+///   opc        - opcode shared by both forms.
+///   OpNode     - SDNode matched with (src1, src2, imm) operands.
+///   OpVT       - value type of the result.
+///   RC         - register class of the sources and the result.
+///   memop_frag - load fragment folded by the rmi form (bitconverted).
+///   x86memop   - memory operand type of the rmi form.
+/// Only the register form (rri) is marked commutable.
+multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop> {
+ let isCommutable = 1 in
+ def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ Sched<[WriteBlend]>, VEX_4V;
+ def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+ Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
+}
+
+// vpblendd for 128-bit (v4i32) and 256-bit (v8i32, VEX_L) vectors, matching
+// X86Blendi with opcode 0x02.
+defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
+ VR128, loadv2i64, i128mem>;
+defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
+ VR256, loadv4i64, i256mem>, VEX_L;
+
+//===----------------------------------------------------------------------===//
+// VPBROADCAST - Load from memory and broadcast to all elements of the
+// destination operand
+//
+// Defines the four forms of an AVX2 integer broadcast plus one helper
+// pattern, all predicated on [HasAVX2, prd]:
+//   rr  / rm  - broadcast to a 128-bit result (OpVT128) from a VR128 source
+//               or from an ld_frag load of x86memop.
+//   Yrr / Yrm - broadcast to a 256-bit result (OpVT256) from a VR128 source
+//               or from an ld_frag load of x86memop.
+// prd is the extra predicate supplied by each instantiation.
+multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ ValueType OpVT128, ValueType OpVT256, Predicate prd> {
+ let Predicates = [HasAVX2, prd] in {
+ def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
+ Sched<[WriteShuffle]>, VEX;
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
+ Sched<[WriteLoad]>, VEX;
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
+ Sched<[WriteShuffle256]>, VEX, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
+ Sched<[WriteLoad]>, VEX, VEX_L;
+
+ // Provide aliases for broadcast from the same register class that
+ // automatically does the extract.
+ // A 256-bit source is narrowed to its sub_xmm subregister and fed to the
+ // Yrr form defined above (looked up by name through NAME).
+ def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
+ (!cast<Instruction>(NAME#"Yrr")
+ (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
+ }
+}
+
+// Byte and word broadcasts are additionally predicated on NoVLX_Or_NoBWI;
+// dword and qword broadcasts on NoVLX. HasAVX2 is added by the multiclass.
+defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
+ v16i8, v32i8, NoVLX_Or_NoBWI>;
+defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
+ v8i16, v16i16, NoVLX_Or_NoBWI>;
+defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
+ v4i32, v8i32, NoVLX>;
+defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
+ v2i64, v4i64, NoVLX>;
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+ // This means we'll encounter truncated i32 loads; match that here.
+ // Both a plain i32 load and a zextloadi16 producing i32, truncated back to
+ // i16, are folded into the memory forms of VPBROADCASTW.
+ def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+ // Provide aliases for broadcast from the same register class that
+ // automatically does the extract.
+ // The 256-bit FP source is narrowed to its sub_xmm subregister before being
+ // passed to VBROADCASTSSYrr / VBROADCASTSDYrr.
+ def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
+ (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
+ sub_xmm)))>;
+ def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
+ (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
+ sub_xmm)))>;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+ // Provide fallback in case the load node that is used in the patterns above
+ // is used by additional users, which prevents the pattern selection.
+ // The scalar FR32 / FR64 value is copied into VR128 and broadcast with the
+ // register forms.
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+}
+
+// Broadcasts of a GR8 / GR16 value. The narrow register is first placed in
+// the sub_8bit / sub_16bit part of an otherwise undefined i32
+// (INSERT_SUBREG on IMPLICIT_DEF), that i32 is copied into VR128, and the
+// register form of VPBROADCASTB / VPBROADCASTW is applied.
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBrr (COPY_TO_REGCLASS
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit)),
+ VR128))>;
+ def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBYrr (COPY_TO_REGCLASS
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit)),
+ VR128))>;
+
+ def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWrr (COPY_TO_REGCLASS
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit)),
+ VR128))>;
+ def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWYrr (COPY_TO_REGCLASS
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit)),
+ VR128))>;
+}
+// Broadcasts of a GR32 / GR64 value. The scalar is copied into VR128; the
+// v4i32, v8i32 and v4i64 results reuse the floating-point broadcast
+// instructions, while v2i64 uses VPBROADCASTQrr.
+let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+
+ // The patterns for VPBROADCASTD are not needed because they would match
+ // the exact same thing as the VBROADCASTSS patterns.
+
+ def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+ (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ // The v4i64 pattern is not needed because VBROADCASTSDYrr already matches it.
+}
+
+// AVX1 broadcast patterns
+// Under HasAVX1Only, integer broadcasts of a scalar i32 / i64 load are
+// selected to the memory forms of the floating-point broadcasts.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSYrm addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSrm addr:$src)>;
+}
+
+ // Provide fallback in case the load node that is used in the patterns above
+ // is used by additional users, which prevents the pattern selection.
+let Predicates = [HasAVX, NoVLX] in {
+ // 128bit broadcasts:
+ // A v2f64 broadcast of an f64 value is done with VMOVDDUPrr on the value
+ // copied into VR128.
+ def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+ (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+}
+
+// Register-source broadcast fallbacks for HasAVX1Only. The scalar is copied
+// into VR128 and shuffled with VPSHUFDri (immediate 0 for 32-bit elements,
+// 0x44 for 64-bit elements). For 256-bit results the shuffled value is placed
+// in the sub_xmm part of an undefined 256-bit register and inserted again
+// with VINSERTF128rr using immediate 1.
+// NOTE(review): immediate 1 presumably selects the upper 128-bit half, and
+// 0x44 presumably selects dwords 0,1,0,1 (duplicating the low 64 bits) -
+// confirm against the VINSERTF128 / PSHUFD immediate encodings.
+let Predicates = [HasAVX1Only] in {
+ // Floating-point scalars.
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+
+ // General-purpose register scalars.
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
+
+ // v2i64 broadcast of an i64 uses VMOVDDUPrr on the value copied into VR128.
+ def : Pat<(v2i64 (X86VBroadcast i64:$src)),
+ (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM - Permute instructions
+//
+
+// Register (Yrr) and memory (Yrm) forms of a 256-bit permute matched from
+// X86VPermv, predicated on [HasAVX2, NoVLX].
+//   mem_frag - load fragment folded by the Yrm form (bitconverted).
+//   OpVT     - value type of the result.
+//   Sched    - scheduling class; the Yrm form uses its Folded variant.
+// Note that the Sched parameter shares its name with the Sched<...> class
+// used in the definitions below.
+multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ ValueType OpVT, X86FoldableSchedWrite Sched> {
+ let Predicates = [HasAVX2, NoVLX] in {
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+ Sched<[Sched]>, VEX_4V, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1,
+ (bitconvert (mem_frag addr:$src2)))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
+ }
+}
+
+// vpermd (v8i32, opcode 0x36) and vpermps (v8f32, opcode 0x16); only vpermps
+// is given an explicit execution domain here.
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
+
+// Register (Yri) and memory (Ymi) forms of a 256-bit permute controlled by an
+// 8-bit immediate and matched from X86VPermi, predicated on
+// [HasAVX2, NoVLX].
+//   mem_frag - load fragment folded by the Ymi form.
+//   OpVT     - value type of the result.
+//   Sched    - scheduling class; the Ymi form uses its Folded variant.
+multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ ValueType OpVT, X86FoldableSchedWrite Sched> {
+ let Predicates = [HasAVX2, NoVLX] in {
+ def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+ Sched<[Sched]>, VEX, VEX_L;
+ def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi (mem_frag addr:$src1),
+ (i8 imm:$src2))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
+ }
+}
+
+// vpermq (v4i64, opcode 0x00) and vpermpd (v4f64, opcode 0x01), both with
+// VEX_W; only vpermpd is given an explicit execution domain here.
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
+ WriteShuffle256>, VEX_W;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
+ WriteFShuffle256>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// VPERM2I128 - Permute Integer Values in 128-bit chunks
+//
+// vperm2i128 instruction definitions. The register form is marked commutable
+// and matches a v4i64 X86VPerm2x128; the memory form folds a loadv4i64 as the
+// second source. Other integer element types are handled by separate
+// patterns.
+let isCommutable = 1 in
+def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+ (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
+ VEX_4V, VEX_L;
+// NOTE(review): the memory operand is declared as f256mem although the folded
+// load is an integer loadv4i64 - confirm whether i256mem was intended.
+def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
+ (i8 imm:$src3)))]>,
+ Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
+
+// With AVX2 the v8i32, v32i8 and v16i16 forms of X86VPerm2x128 select
+// VPERM2I128. The first group covers register sources; the second folds a
+// loadv4i64 that is bitcast through bc_* to the element type.
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
+ (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+ (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
+ (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// VINSERTI128 - Insert packed integer values
+//
+// vinserti128 instruction definitions. Both forms take a VR256 $src1, a
+// 128-bit $src2 (VR128 register or i128mem) and an 8-bit immediate $src3, and
+// write a VR256 result. Their pattern lists are empty, so hasSideEffects and
+// mayLoad are stated explicitly; selection patterns are provided separately
+// through the vinsert_lowering multiclass.
+let hasSideEffects = 0 in {
+def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR128:$src2, u8imm:$src3),
+ "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
+ "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+// With AVX2 and without VLX, all four integer element types are inserted
+// with VINSERTI128; every memory form matches a loadv2i64.
+let Predicates = [HasAVX2, NoVLX] in {
+ defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTI128 - Extract packed integer values
+//
+// vextracti128 instruction definitions. The rr form writes a VR128 result
+// from a VR256 source and an 8-bit immediate; the mr form stores to an
+// i128mem destination instead. Both carry empty pattern lists; selection
+// patterns are provided separately through the vextract_lowering multiclass.
+// NOTE(review): unlike VEXTRACTF128rr, the rr form here is not placed under
+// hasSideEffects = 0 - confirm whether that difference is intentional.
+def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[WriteShuffle256]>, VEX, VEX_L;
+let hasSideEffects = 0, mayStore = 1 in
+def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[WriteStore]>, VEX, VEX_L;
+
+// With AVX2 and without VLX, all four integer element types are extracted
+// with VEXTRACTI128.
+let Predicates = [HasAVX2, NoVLX] in {
+ defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
+ defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
+ defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
+//
+// Defines the four forms of an AVX2 integer masked move:
+//   rm / Yrm - load forms (opcode 0x8c) writing VR128 / VR256, selected from
+//              IntLd128 / IntLd256 called as (intrinsic addr:$src2, $src1).
+//   mr / Ymr - store forms (opcode 0x8e) writing i128mem / i256mem, selected
+//              from IntSt128 / IntSt256 called as
+//              (intrinsic addr:$dst, $src1, $src2).
+// OpcodeStr is the mnemonic used in the asm string.
+multiclass avx2_pmovmask<string OpcodeStr,
+ Intrinsic IntLd128, Intrinsic IntLd256,
+ Intrinsic IntSt128, Intrinsic IntSt256> {
+ def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
+ def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L;
+ def mr : AVX28I<0x8e, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+ def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+// Dword and qword instantiations bound to the int_x86_avx2_maskload_* /
+// int_x86_avx2_maskstore_* intrinsics; the qword variant adds VEX_W.
+defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
+ int_x86_avx2_maskload_d,
+ int_x86_avx2_maskload_d_256,
+ int_x86_avx2_maskstore_d,
+ int_x86_avx2_maskstore_d_256>;
+defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
+ int_x86_avx2_maskload_q,
+ int_x86_avx2_maskload_q_256,
+ int_x86_avx2_maskstore_q,
+ int_x86_avx2_maskstore_q_256>, VEX_W;
+
+// Selection patterns mapping X86mstore / X86mload onto masked-move
+// instructions.
+//   InstrStr - masked-move instruction name prefix; "mr" / "rm" is appended.
+//   RC       - register class of the data and mask operands.
+//   VT       - value type of the data being loaded or stored.
+//   MaskVT   - value type of the mask.
+//   BlendStr - blend instruction name prefix; "rr" is appended. Used only
+//              when the masked load has a register pass-through value.
+//   ZeroVT   - value type in which the all-zeros pass-through is matched
+//              before being bitconverted to VT.
+multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
+ ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
+ // masked store
+ def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
+ (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
+ // masked load
+ // Pass-through is undef: the masked load alone is enough.
+ def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
+ (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+ // Pass-through is all zeros: also selected to the masked load alone.
+ def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
+ (VT (bitconvert (ZeroVT immAllZerosV))))),
+ (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+ // Pass-through is a register value: do the masked load, then blend it with
+ // $src0 using the same mask.
+ def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
+ (!cast<Instruction>(BlendStr#"rr")
+ RC:$src0,
+ (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
+ RC:$mask)>;
+}
+// Floating-point masked loads/stores use VMASKMOVPS / VMASKMOVPD. The zero
+// pass-through is always matched as v4i32 (128-bit) or v8i32 (256-bit).
+let Predicates = [HasAVX] in {
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
+}
+let Predicates = [HasAVX1Only] in {
+ // Masked load/store of i32/i64 elements is not supported here; use the
+ // ps/pd versions instead.
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
+// With AVX2 the integer element types use VPMASKMOVD / VPMASKMOVQ for the
+// masked move itself; the blend is still VBLENDVPS / VBLENDVPD.
+let Predicates = [HasAVX2] in {
+ defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+ defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
+ defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+ defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
+
+//===----------------------------------------------------------------------===//
+// SubVector Broadcasts
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+
+// Register-source integer subvector broadcasts with AVX2: the 128-bit value
+// is placed in the sub_xmm part of an undefined 256-bit register and the
+// same value is inserted again with VINSERTI128rr using immediate 1.
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v2i64 VR128:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v16i8 VR128:$src), 1)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in { // floating-point vector types, selected to VINSERTF128rr
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v2f64 VR128:$src), 1)>; // $src is placed in sub_xmm, then inserted again at index 1
+def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v4f32 VR128:$src), 1)>;
+} // Predicates = [HasAVX, NoVLX]
+
+let Predicates = [HasAVX1Only] in { // integer vector types, selected to VINSERTF128rr
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v2i64 VR128:$src), 1)>; // $src is placed in sub_xmm, then inserted again at index 1
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v16i8 VR128:$src), 1)>;
+} // Predicates = [HasAVX1Only]
+
+//===----------------------------------------------------------------------===//
+// Variable Bit Shifts
+// avx2_var_shift - defines rr/rm (VR128, vt128) and Yrr/Yrm (VR256, vt256, VEX_L) forms of OpNode.
+multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128, ValueType vt256> {
+ def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), // 128-bit, both operands in registers
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
+ VEX_4V, Sched<[WriteVarVecShift]>;
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), // 128-bit, $src2 read from memory
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1,
+ (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, // loaded as v2i64, bitconverted to vt128
+ VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), // 256-bit, both operands in registers
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
+ VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), // 256-bit, $src2 read from memory
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1,
+ (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>, // loaded as v4i64, bitconverted to vt256
+ VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+} // multiclass avx2_var_shift
+
+let Predicates = [HasAVX2, NoVLX] in { // instantiations of avx2_var_shift for shl/srl/sra
+ defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
+ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
+ defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
+ defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
+ defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+
+ def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)), // X86vsrav nodes also select VPSRAVD
+ (VPSRAVDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86vsrav VR128:$src1,
+ (bitconvert (loadv2i64 addr:$src2)))), // folded-load form
+ (VPSRAVDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
+ (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86vsrav VR256:$src1,
+ (bitconvert (loadv4i64 addr:$src2)))), // folded-load form
+ (VPSRAVDYrm VR256:$src1, addr:$src2)>;
+} // Predicates = [HasAVX2, NoVLX]
+
+
+
+//===----------------------------------------------------------------------===//
+// VGATHER - GATHER Operations (rm: VR128 form; Yrm: VEX_L form on RC256; both also output $mask_wb)
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+ X86MemOperand memop128, X86MemOperand memop256> {
+ def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
+ (ins VR128:$src1, memop128:$src2, VR128:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX; // empty pattern list: no selection pattern is attached here
+ def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
+ (ins RC256:$src1, memop256:$src2, RC256:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX, VEX_L; // empty pattern list: no selection pattern is attached here
+} // multiclass avx2_gather
+
+let mayLoad = 1, hasSideEffects = 0, Constraints
+ = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" // outputs tied to $src1/$mask
+ in {
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>; // Yrm form uses VR128 here
+
+ let ExeDomain = SSEPackedDouble in {
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
+ } // ExeDomain = SSEPackedDouble
+
+ let ExeDomain = SSEPackedSingle in {
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>; // Yrm form uses VR128 here
+ } // ExeDomain = SSEPackedSingle
+} // mayLoad = 1, hasSideEffects = 0, Constraints
+
+//===----------------------------------------------------------------------===//
+// Extra selection patterns for FR128, f128, f128mem
+
+// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
+def : Pat<(store (f128 FR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;
+
+def : Pat<(loadf128 addr:$src),
+ (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;
+
+// andps is shorter than andpd or pand. andps is in SSE and andpd/pand are in SSE2.
+def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
+ (COPY_TO_REGCLASS
+ (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+ FR128)>; // operands are copied to VR128 and the result is copied back to FR128
+
+def : Pat<(X86fand FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(and FR128:$src1, FR128:$src2), // generic 'and' node, same output as X86fand above
+ (COPY_TO_REGCLASS
+ (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
+ (COPY_TO_REGCLASS
+ (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+ FR128)>;
+
+def : Pat<(X86for FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(or FR128:$src1, FR128:$src2), // generic 'or' node, same output as X86for above
+ (COPY_TO_REGCLASS
+ (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
+ (COPY_TO_REGCLASS
+ (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+ FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(xor FR128:$src1, FR128:$src2), // generic 'xor' node, same output as X86fxor above
+ (COPY_TO_REGCLASS
+ (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
new file mode 100644
index 000000000000..c847be7ec099
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
@@ -0,0 +1,62 @@
+//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD SVM instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVM instructions
+
+// 0F 01 D9 - VMMCALL (no operands, no selection pattern)
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
+
+// 0F 01 DC - STGI
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
+
+// 0F 01 DD - CLGI
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
+
+// 0F 01 DE - SKINIT, reads EAX implicitly
+let Uses = [EAX] in
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
+
+// 0F 01 D8 - VMRUN, implicit EAX outside 64-bit mode and RAX in 64-bit mode
+let Uses = [EAX] in
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
+ "vmrun\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
+ "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DA - VMLOAD, implicit EAX outside 64-bit mode and RAX in 64-bit mode
+let Uses = [EAX] in
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
+ "vmload\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
+ "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DB - VMSAVE, implicit EAX outside 64-bit mode and RAX in 64-bit mode
+let Uses = [EAX] in
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
+ "vmsave\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
+ "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DF - INVLPGA, implicit EAX (or RAX in 64-bit mode) and ECX
+let Uses = [EAX, ECX] in
+def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX, ECX] in
+def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
new file mode 100644
index 000000000000..e2be73532157
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -0,0 +1,970 @@
+//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the shift and rotate instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: Someone needs to smear multipattern goodness all over this file.
+
+let Defs = [EFLAGS] in {
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { // SHL, register forms ($dst tied to $src1)
+let Uses = [CL] in { // shift count comes from CL
+def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "shl{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
+} // Uses = [CL]
+
+def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "shl{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "shl{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
+ OpSize16;
+def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "shl{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>,
+ OpSize32;
+def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "shl{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+} // isConvertibleToThreeAddress = 1
+
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
+let hasSideEffects = 0 in { // needed because these defs carry no pattern ([])
+def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+ "shl{b}\t$dst", [], IIC_SR>;
+def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t$dst", [], IIC_SR>, OpSize16;
+def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t$dst", [], IIC_SR>, OpSize32;
+def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t$dst", [], IIC_SR>;
+} // hasSideEffects = 0
+} // Constraints = "$src1 = $dst", SchedRW = [WriteShift]
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in { // SHL, memory forms (load, shift, store back to $dst)
+// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern
+// using CL?
+let Uses = [CL] in {
+def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize16;
+def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize32;
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+} // Uses = [CL]
+def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "shl{b}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "shl{w}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "shl{l}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "shl{q}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Shift left by 1
+def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t$dst",
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t$dst",
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t$dst",
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t$dst",
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW = [WriteShiftLd, WriteRMW]
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { // SHR (srl node), register forms
+let Uses = [CL] in { // shift count comes from CL
+def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "shr{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
+} // Uses = [CL]
+
+def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2),
+ "shr{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "shr{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize16;
+def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "shr{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize32;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2),
+ "shr{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+// Shift right by 1 (unlike the SHL r1 forms, these carry selection patterns)
+def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+ "shr{b}\t$dst",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
+def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t$dst",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize16;
+def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t$dst",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>, OpSize32;
+def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t$dst",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
+} // Constraints = "$src1 = $dst", SchedRW = [WriteShift]
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in { // SHR (srl node), memory forms (load, shift, store back)
+let Uses = [CL] in { // shift count comes from CL
+def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize16;
+def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize32;
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+} // Uses = [CL]
+def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "shr{b}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "shr{w}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "shr{l}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "shr{q}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Shift right by 1
+def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t$dst",
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t$dst",
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t$dst",
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t$dst",
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW = [WriteShiftLd, WriteRMW]
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { // SAR (sra node), register forms
+let Uses = [CL] in { // shift count comes from CL
+def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (sra GR8:$src1, CL))],
+ IIC_SR>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (sra GR16:$src1, CL))],
+ IIC_SR>, OpSize16;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (sra GR32:$src1, CL))],
+ IIC_SR>, OpSize32;
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (sra GR64:$src1, CL))],
+ IIC_SR>;
+} // Uses = [CL]
+
+def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "sar{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "sar{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize16;
+def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "sar{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize32;
+def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "sar{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+
+// Arithmetic shift right by 1
+def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t$dst",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))],
+ IIC_SR>;
+def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t$dst",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize16;
+def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t$dst",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
+ IIC_SR>, OpSize32;
+def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t$dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
+ IIC_SR>;
+} // Constraints = "$src1 = $dst", SchedRW = [WriteShift]
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in { // SAR (sra node), memory forms (load, shift, store back)
+let Uses = [CL] in { // shift count comes from CL
+def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+} // Uses = [CL]
+def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "sar{b}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "sar{w}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "sar{l}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "sar{q}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Arithmetic shift right by 1
+def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t$dst",
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t$dst",
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t$dst",
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t$dst",
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW = [WriteShiftLd, WriteRMW]
+
+//===----------------------------------------------------------------------===//
+// Rotate instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in { // RCL/RCR: every def below has an empty pattern list ([])
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { // register forms ($dst tied to $src1)
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+let Uses = [CL] in
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+let Uses = [CL] in
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+
+
+def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+let Uses = [CL] in
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+let Uses = [CL] in
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+
+def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+} // Constraints = "$src1 = $dst", SchedRW = [WriteShift]
+
+let SchedRW = [WriteShiftLd, WriteRMW] in { // memory forms
+def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+let Uses = [CL] in { // count comes from CL
+def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+} // Uses = [CL]
+} // SchedRW = [WriteShiftLd, WriteRMW]
+} // hasSideEffects = 0
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { // ROL (rotl node), register forms
+// FIXME: provide shorter instructions when imm8 == 1
+let Uses = [CL] in { // rotate count comes from CL
+def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
+} // Uses = [CL]
+
+def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "rol{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "rol{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize16;
+def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "rol{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize32;
+def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "rol{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+
+// Rotate left by 1
+def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t$dst",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))],
+ IIC_SR>;
+def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t$dst",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize16;
+def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t$dst",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
+ IIC_SR>, OpSize32;
+def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t$dst",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
+ IIC_SR>;
+} // Constraints = "$src1 = $dst", SchedRW = [WriteShift]
+
+let SchedRW = [WriteShiftLd, WriteRMW] in { // ROL (rotl node), memory forms (load, rotate, store back)
+let Uses = [CL] in { // rotate count comes from CL
+def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+} // Uses = [CL]
+def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1),
+ "rol{b}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
+def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1),
+ "rol{w}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1),
+ "rol{l}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1),
+ "rol{q}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
+
+// Rotate left by 1
+def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t$dst",
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t$dst",
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t$dst",
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t$dst",
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW = [WriteShiftLd, WriteRMW]
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize16;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>, OpSize32;
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "ror{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))],
+ IIC_SR>;
+def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "ror{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))],
+ IIC_SR>, OpSize16;
+def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "ror{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))],
+ IIC_SR>, OpSize32;
+def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "ror{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))],
+ IIC_SR>;
+
+// Rotate right by 1, matched as the equivalent rotate left by (width - 1)
+def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t$dst",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))],
+ IIC_SR>;
+def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t$dst",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))],
+ IIC_SR>, OpSize16;
+def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t$dst",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))],
+ IIC_SR>, OpSize32;
+def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t$dst",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))],
+ IIC_SR>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+}
+def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "ror{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "ror{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "ror{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "ror{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Rotate by 1
+def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t$dst",
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t$dst",
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t$dst",
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t$dst",
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Double shift instructions (generalizations of rotate)
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+
+let Uses = [CL] in {
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
+ IIC_SHD16_REG_CL>,
+ TB, OpSize16;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
+ IIC_SHD16_REG_CL>,
+ TB, OpSize16;
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
+ IIC_SHD32_REG_CL>, TB, OpSize32;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
+ IIC_SHD32_REG_CL>, TB, OpSize32;
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
+ IIC_SHD64_REG_CL>,
+ TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
+ IIC_SHD64_REG_CL>,
+ TB;
+}
+
+let isCommutable = 1 in { // These instructions commute to each other.
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, u8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ TB, OpSize16;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, u8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ TB, OpSize16;
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, u8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ TB, OpSize32;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, u8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ TB, OpSize32;
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, u8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, u8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ TB;
+}
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+
+def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+}
+
+def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD16_MEM_IM>,
+ TB, OpSize16;
+def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD16_MEM_IM>,
+ TB, OpSize16;
+
+def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD32_MEM_IM>,
+ TB, OpSize32;
+def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD32_MEM_IM>,
+ TB, OpSize32;
+
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD64_MEM_IM>,
+ TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD64_MEM_IM>,
+ TB;
+} // SchedRW
+
+} // Defs = [EFLAGS]
+
+// Immediate transform used when a 32-bit rotate-left by a constant K is
+// selected as a rotate-right: the resulting i8 immediate is (32 - K).
+def ROT32L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
+ return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+// Immediate transform used when a 64-bit rotate-left by a constant K is
+// selected as a rotate-right: the resulting i8 immediate is (64 - K).
+def ROT64L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 64-bit integer.
+ return getI8Imm(64 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+// Immediate-count rotate forms taking a separate destination register.
+// Instantiating this multiclass yields two records per defm:
+//   <NAME>ri - source value in a register of class RC
+//   <NAME>mi - source value read from memory through x86memop
+// Neither record carries a selection pattern; any patterns are attached
+// separately by the user of the multiclass.
+multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
+ // Register-source form.
+ let hasSideEffects = 0 in
+ def ri : Ii8<0xF0, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ TAXD, VEX, Sched<[WriteShift]>;
+
+ // Memory-source form: same opcode, additionally marked as a load.
+ let hasSideEffects = 0, mayLoad = 1 in
+ def mi : Ii8<0xF0, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, u8imm:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ TAXD, VEX, Sched<[WriteShiftLd]>;
+}
+
+// Variable-count shift forms taking a separate destination register.
+// Instantiating this multiclass yields two records per defm:
+//   <NAME>rr - shifted value ($src1) in a register, count in register $src2
+//   <NAME>rm - shifted value ($src1) read from memory, count in register $src2
+// No selection patterns are attached to either record here.
+multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+ def rr : I<0xF7, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ VEX, Sched<[WriteShift]>;
+ let mayLoad = 1 in
+ def rm : I<0xF7, MRMSrcMem4VOp3,
+ (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ VEX, Sched<[WriteShiftLd,
+ // x86memop:$src1 (five read entries; presumably one per address
+ // component of the memory operand - TODO confirm)
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src2 (the register operand that follows the memory operand)
+ ReadAfterLd]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+ defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>;
+ defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W;
+ defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS;
+ defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W;
+ defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD;
+ defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
+ defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD;
+ defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, VEX_W;
+
+ // Prefer RORX which is non-destructive and doesn't update EFLAGS.
+ // A rotate-left by a constant is selected as RORX, with the count rewritten
+ // by ROT32L2R_imm8 / ROT64L2R_imm8. AddedComplexity = 10 gives these two
+ // patterns extra weight when several patterns match the same node.
+ let AddedComplexity = 10 in {
+ def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+ (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+ (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+ }
+
+ def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+ (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+ (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+ // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
+ // immediate shift, i.e. the following code is considered better
+ //
+ // mov %edi, %esi
+ // shl $imm, %esi
+ // ... %edi, ...
+ //
+ // than
+ //
+ // movb $imm, %sil
+ // shlx %sil, %edi, %esi
+ // ... %edi, ...
+ //
+ let AddedComplexity = 1 in {
+ def : Pat<(sra GR32:$src1, GR8:$src2),
+ (SARX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra GR64:$src1, GR8:$src2),
+ (SARX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl GR32:$src1, GR8:$src2),
+ (SHRX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl GR64:$src1, GR8:$src2),
+ (SHRX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl GR32:$src1, GR8:$src2),
+ (SHLX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl GR64:$src1, GR8:$src2),
+ (SHLX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
+
+ // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+ //
+ // mov (%ecx), %esi
+ // shl $imm, %esi
+ //
+ // over
+ //
+ // movb $imm, %al
+ // shlx %al, (%ecx), %esi
+ //
+ // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole
+ // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
new file mode 100644
index 000000000000..9265d64b3230
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -0,0 +1,622 @@
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instructions that are generally used in
+// privileged modes. These are not typically used by the compiler, but are
+// supported for the assembler and disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RAX, RDX] in
+ def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+ TB;
+
+let Defs = [RAX, RCX, RDX] in
+ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
+
+// CPU flow control instructions
+
+// Undefined-opcode instructions "ud2" and "ud2b". Both are flagged as
+// mayLoad with side effects and as not storing. Only TRAP has a selection
+// pattern (the `trap` node); UD2B has an empty pattern list.
+let mayLoad = 1, mayStore = 0, hasSideEffects = 1 in {
+ def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+ def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+}
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+
+// Interrupt and SysCall Instructions.
+let Uses = [EFLAGS] in
+ def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
+ [(int_x86_int (i8 3))], IIC_INT3>;
+} // SchedRW
+
+// The long form of "int $3" turns into int3 as a size optimization.
+// FIXME: This doesn't work because InstAlias can't match immediate constants.
+//def : InstAlias<"int\t$3", (INT3)>;
+
+let SchedRW = [WriteSystem] in {
+
+def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
+ [(int_x86_int imm:$trap)], IIC_INT>;
+
+
+def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
+def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB,
+ Requires<[In64BitMode]>;
+
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
+ IIC_SYS_ENTER_EXIT>, TB;
+
+def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
+ IIC_SYS_ENTER_EXIT>, TB;
+def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [],
+ IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>;
+} // SchedRW
+
+// Selection of the `debugtrap` node. The two patterns are mutually exclusive
+// by predicate: INT3 under NotPS4, and INT with immediate 0x41 under IsPS4.
+def : Pat<(debugtrap),
+ (INT3)>, Requires<[NotPS4]>;
+def : Pat<(debugtrap),
+ (INT (i8 0x41))>, Requires<[IsPS4]>;
+
+//===----------------------------------------------------------------------===//
+// Input/Output Instructions.
+//
+let SchedRW = [WriteSystem] in {
+let Defs = [AL], Uses = [DX] in
+def IN8rr : I<0xEC, RawFrm, (outs), (ins),
+ "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+ "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize16;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+ "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32;
+
+let Defs = [AL] in
+def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port),
+ "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
+ "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
+ "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32;
+
+let Uses = [DX, AL] in
+def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
+ "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize16;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32;
+
+let Uses = [AL] in
+def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port),
+ "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
+ "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
+ "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32;
+
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from debug registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ Requires<[In64BitMode]>;
+
+def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from control registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ Requires<[In64BitMode]>;
+
+def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segment override instruction prefixes
+
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
+
+
+//===----------------------------------------------------------------------===//
+// Moves to and from segment registers.
+//
+
+let SchedRW = [WriteMove] in {
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize16;
+def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32;
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+
+def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16;
+def MOV32ms : I<0x8C, MRMDestMem, (outs), (ins i32mem:$dst, SEGMENT_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32;
+def MOV64ms : RI<0x8C, MRMDestMem, (outs), (ins i64mem:$dst, SEGMENT_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
+def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16;
+def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32;
+def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segmentation support instructions.
+
+let SchedRW = [WriteSystem] in {
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+ OpSize16;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+ OpSize16;
+
+// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+ OpSize32;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+ OpSize32;
+// i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo.
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+
+def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+ OpSize16;
+def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+ OpSize16;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+ OpSize32;
+def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+ OpSize32;
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
+ [], IIC_INVLPG>, TB;
+
+def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
+ "str{w}\t$dst", [], IIC_STR>, TB, OpSize16;
+def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
+ "str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
+def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
+ "str{q}\t$dst", [], IIC_STR>, TB;
+def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst),
+ "str{w}\t$dst", [], IIC_STR>, TB;
+
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
+
+// "push %cs" in its 16-bit and 32-bit operand-size forms; both are restricted
+// to non-64-bit mode. Both forms use the IIC_PUSH_CS itinerary class so that
+// the two operand sizes of the same instruction are scheduled alike
+// (PUSHCS16 previously used the generic IIC_PUSH_SR class).
+def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
+ "push{w}\t{%cs|cs}", [], IIC_PUSH_CS>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
+ "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
+ "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
+ "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
+ "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
+ "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
+ "push{w}\t{%es|es}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
+ "push{l}\t{%es|es}", [], IIC_PUSH_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
+ "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize16, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
+ "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
+ "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize16, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
+ "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
+ "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
+ "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+// No "pop cs" instruction.
+def POPSS16 : I<0x17, RawFrm, (outs), (ins),
+ "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+ OpSize16, Requires<[Not64BitMode]>;
+def POPSS32 : I<0x17, RawFrm, (outs), (ins),
+ "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
+ "pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
+ "pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def POPES16 : I<0x07, RawFrm, (outs), (ins),
+ "pop{w}\t{%es|es}", [], IIC_POP_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def POPES32 : I<0x07, RawFrm, (outs), (ins),
+ "pop{l}\t{%es|es}", [], IIC_POP_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize16, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize16, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+ Requires<[Not64BitMode]>;
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+ Requires<[Not64BitMode]>;
+
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+ Requires<[Not64BitMode]>;
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+ Requires<[Not64BitMode]>;
+
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
+ "verr\t$seg", [], IIC_VERR>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+ "verr\t$seg", [], IIC_VERR>, TB;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
+ "verw\t$seg", [], IIC_VERW_REG>, TB; // register operand form
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
+ "verw\t$seg", [], IIC_VERW_MEM>, TB; // memory operand form
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Descriptor-table support instructions
+
+let SchedRW = [WriteSystem] in {
+def SGDT16m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
+ "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SGDT32m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
+ "sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SGDT64m : I<0x01, MRM0m, (outs), (ins opaque80mem:$dst),
+ "sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires <[In64BitMode]>;
+def SIDT16m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
+ "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SIDT32m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
+ "sidt{l}\t$dst", [], IIC_SIDT>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SIDT64m : I<0x01, MRM1m, (outs), (ins opaque80mem:$dst),
+ "sidt{q}\t$dst", [], IIC_SIDT>, TB, Requires <[In64BitMode]>;
+def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
+def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst),
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB;
+def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
+ "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB;
+
+// LLDT is not interpreted specially in 64-bit mode because there is no sign
+// extension.
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+def SLDT64m : RI<0x00, MRM0m, (outs), (ins i16mem:$dst),
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+
+def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+ "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LGDT32m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+ "lgdt{l}\t$src", [], IIC_LGDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LGDT64m : I<0x01, MRM2m, (outs), (ins opaque80mem:$src),
+ "lgdt{q}\t$src", [], IIC_LGDT>, TB, Requires<[In64BitMode]>;
+def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+ "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LIDT32m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+ "lidt{l}\t$src", [], IIC_LIDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src),
+ "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>;
+def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
+ "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
+ "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Specialized register support
+let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+let Defs = [EAX, EDX], Uses = [ECX] in
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+
+let Defs = [RAX, RDX], Uses = [ECX] in
+ def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>,
+ TB;
+
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
+ "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
+ "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB;
+// no m form encodable; use SMSW16m
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
+ "smsw{q}\t$dst", [], IIC_SMSW>, TB;
+
+// For memory operands, there is only a 16-bit form
+def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst),
+ "smsw{w}\t$dst", [], IIC_SMSW>, TB;
+
+def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
+ "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; // register operand form
+def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
+ "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB; // memory operand form
+
+let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
+ def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Cache instructions
+let SchedRW = [WriteSystem] in {
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// XSAVE instructions
+let SchedRW = [WriteSystem] in {
+let Predicates = [HasXSAVE] in {
+let Defs = [EDX, EAX], Uses = [ECX] in
+ def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
+
+let Uses = [EDX, EAX, ECX] in
+ def XSETBV : I<0x01, MRM_D1, (outs), (ins),
+ "xsetbv",
+ [(int_x86_xsetbv ECX, EDX, EAX)]>, TB;
+
+} // HasXSAVE
+
+let Uses = [EDX, EAX] in {
+let Predicates = [HasXSAVE] in {
+ def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsave\t$dst",
+ [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB;
+ def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsave64\t$dst",
+ [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+ def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor\t$dst",
+ [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB;
+ def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor64\t$dst",
+ [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVEOPT] in {
+ def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+ "xsaveopt\t$dst",
+ [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS;
+ def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+ "xsaveopt64\t$dst",
+ [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVEC] in {
+ def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsavec\t$dst",
+ [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB;
+ def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsavec64\t$dst",
+ [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVES] in {
+ def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xsaves\t$dst",
+ [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB;
+ def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xsaves64\t$dst",
+ [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+ def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+ "xrstors\t$dst",
+ [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB;
+ def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+ "xrstors64\t$dst",
+ [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+} // Uses
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// VIA PadLock crypto instructions
+let Defs = [RAX, RDI], Uses = [RDX, RDI] in
+ def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB;
+
+def : InstAlias<"xstorerng", (XSTORE)>;
+
+let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
+ def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB;
+ def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB;
+ def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB;
+ def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB;
+ def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB;
+}
+
+let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
+ def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB;
+ def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB;
+}
+let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
+ def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
+//===----------------------------------------------------------------------===//
+// PKU - enable protection key
+let usesCustomInserter = 1 in {
+ def WRPKRU : PseudoI<(outs), (ins GR32:$src),
+ [(int_x86_wrpkru GR32:$src)]>;
+ def RDPKRU : PseudoI<(outs GR32:$dst), (ins),
+ [(set GR32:$dst, (int_x86_rdpkru))]>;
+}
+
+let Defs = [EAX, EDX], Uses = [ECX] in
+ def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
+let Uses = [EAX, ECX, EDX] in
+ def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// FS/GS Base Instructions
+let Predicates = [HasFSGSBase, In64BitMode] in {
+ def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
+ "rdfsbase{l}\t$dst",
+ [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
+ def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
+ "rdfsbase{q}\t$dst",
+ [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
+ def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+ "rdgsbase{l}\t$dst",
+ [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
+ def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+ "rdgsbase{q}\t$dst",
+ [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
+ def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
+ "wrfsbase{l}\t$src",
+ [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
+ def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
+ "wrfsbase{q}\t$src",
+ [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
+ def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
+ "wrgsbase{l}\t$src",
+ [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
+ def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
+ "wrgsbase{q}\t$src",
+ [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// INVPCID Instruction
+def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
+def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SMAP Instruction
+let Defs = [EFLAGS] in {
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// SMX Instruction
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
+ def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 000000000000..7267d752653e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,50 @@
+//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TSX instructions
+
+def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+let usesCustomInserter = 1 in
+def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
+ "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
+ Requires<[HasRTM]>;
+
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in {
+def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst),
+ "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>;
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
+ "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>;
+}
+
+def XEND : I<0x01, MRM_D5, (outs), (ins),
+ "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+
+let Defs = [EFLAGS] in
+def XTEST : I<0x01, MRM_D6, (outs), (ins),
+ "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+
+def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
+ "xabort\t$imm",
+ [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+
+// HLE prefixes
+
+let isAsmParserOnly = 1 in {
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h b/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h
new file mode 100755
index 000000000000..5d2af829028a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h
@@ -0,0 +1,1148 @@
+//===-- X86InstrTablesInfo.h - X86 Instruction Tables Info ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains related X86 Instruction Information Tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
+#define LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
+
+using namespace llvm;
+
+struct X86EvexToVexCompressTableEntry { // one <EVEX opcode, VEX opcode> pair
+ uint16_t EvexOpcode; // opcode of the EVEX-encoded instruction
+ uint16_t VexOpcode; // opcode of the corresponding VEX-encoded instruction
+};
+
+
+
+// X86 EVEX encoded instructions that have a VEX 128 encoding
+// (table format: <EVEX opcode, VEX-128 opcode>).
+static const X86EvexToVexCompressTableEntry
+ X86EvexToVex128CompressTable[] = {
+ // EVEX scalar with corresponding VEX.
+ { X86::Int_VCOMISDZrm , X86::Int_VCOMISDrm },
+ { X86::Int_VCOMISDZrr , X86::Int_VCOMISDrr },
+ { X86::Int_VCOMISSZrm , X86::Int_VCOMISSrm },
+ { X86::Int_VCOMISSZrr , X86::Int_VCOMISSrr },
+ { X86::Int_VUCOMISDZrm , X86::Int_VUCOMISDrm },
+ { X86::Int_VUCOMISDZrr , X86::Int_VUCOMISDrr },
+ { X86::Int_VUCOMISSZrm , X86::Int_VUCOMISSrm },
+ { X86::Int_VUCOMISSZrr , X86::Int_VUCOMISSrr },
+ { X86::VADDSDZrm , X86::VADDSDrm },
+ { X86::VADDSDZrm_Int , X86::VADDSDrm_Int },
+ { X86::VADDSDZrr , X86::VADDSDrr },
+ { X86::VADDSDZrr_Int , X86::VADDSDrr_Int },
+ { X86::VADDSSZrm , X86::VADDSSrm },
+ { X86::VADDSSZrm_Int , X86::VADDSSrm_Int },
+ { X86::VADDSSZrr , X86::VADDSSrr },
+ { X86::VADDSSZrr_Int , X86::VADDSSrr_Int },
+ { X86::VCOMISDZrm , X86::VCOMISDrm },
+ { X86::VCOMISDZrr , X86::VCOMISDrr },
+ { X86::VCOMISSZrm , X86::VCOMISSrm },
+ { X86::VCOMISSZrr , X86::VCOMISSrr },
+ { X86::VCVTSD2SI64Zrm , X86::VCVTSD2SI64rm },
+ { X86::VCVTSD2SI64Zrr , X86::VCVTSD2SI64rr },
+ { X86::VCVTSD2SIZrm , X86::VCVTSD2SIrm },
+ { X86::VCVTSD2SIZrr , X86::VCVTSD2SIrr },
+ { X86::VCVTSD2SSZrm , X86::VCVTSD2SSrm },
+ { X86::VCVTSD2SSZrr , X86::VCVTSD2SSrr },
+ { X86::VCVTSI2SDZrm , X86::VCVTSI2SDrm },
+ { X86::VCVTSI2SDZrm_Int , X86::Int_VCVTSI2SDrm },
+ { X86::VCVTSI2SDZrr , X86::VCVTSI2SDrr },
+ { X86::VCVTSI2SDZrr_Int , X86::Int_VCVTSI2SDrr },
+ { X86::VCVTSI2SSZrm , X86::VCVTSI2SSrm },
+ { X86::VCVTSI2SSZrm_Int , X86::Int_VCVTSI2SSrm },
+ { X86::VCVTSI2SSZrr , X86::VCVTSI2SSrr },
+ { X86::VCVTSI2SSZrr_Int , X86::Int_VCVTSI2SSrr },
+ { X86::VCVTSS2SDZrm , X86::VCVTSS2SDrm },
+ { X86::VCVTSS2SDZrr , X86::VCVTSS2SDrr },
+ { X86::VCVTSS2SI64Zrm , X86::VCVTSS2SI64rm },
+ { X86::VCVTSS2SI64Zrr , X86::VCVTSS2SI64rr },
+ { X86::VCVTSS2SIZrm , X86::VCVTSS2SIrm },
+ { X86::VCVTSS2SIZrr , X86::VCVTSS2SIrr },
+ { X86::VCVTTSD2SI64Zrm , X86::VCVTTSD2SI64rm },
+ { X86::VCVTTSD2SI64Zrm_Int , X86::Int_VCVTTSD2SI64rm },
+ { X86::VCVTTSD2SI64Zrr , X86::VCVTTSD2SI64rr },
+ { X86::VCVTTSD2SI64Zrr_Int , X86::Int_VCVTTSD2SI64rr },
+ { X86::VCVTTSD2SIZrm , X86::VCVTTSD2SIrm },
+ { X86::VCVTTSD2SIZrm_Int , X86::Int_VCVTTSD2SIrm },
+ { X86::VCVTTSD2SIZrr , X86::VCVTTSD2SIrr },
+ { X86::VCVTTSD2SIZrr_Int , X86::Int_VCVTTSD2SIrr },
+ { X86::VCVTTSS2SI64Zrm , X86::VCVTTSS2SI64rm },
+ { X86::VCVTTSS2SI64Zrm_Int , X86::Int_VCVTTSS2SI64rm },
+ { X86::VCVTTSS2SI64Zrr , X86::VCVTTSS2SI64rr },
+ { X86::VCVTTSS2SI64Zrr_Int , X86::Int_VCVTTSS2SI64rr },
+ { X86::VCVTTSS2SIZrm , X86::VCVTTSS2SIrm },
+ { X86::VCVTTSS2SIZrm_Int , X86::Int_VCVTTSS2SIrm },
+ { X86::VCVTTSS2SIZrr , X86::VCVTTSS2SIrr },
+ { X86::VCVTTSS2SIZrr_Int , X86::Int_VCVTTSS2SIrr },
+ { X86::VDIVSDZrm , X86::VDIVSDrm },
+ { X86::VDIVSDZrm_Int , X86::VDIVSDrm_Int },
+ { X86::VDIVSDZrr , X86::VDIVSDrr },
+ { X86::VDIVSDZrr_Int , X86::VDIVSDrr_Int },
+ { X86::VDIVSSZrm , X86::VDIVSSrm },
+ { X86::VDIVSSZrm_Int , X86::VDIVSSrm_Int },
+ { X86::VDIVSSZrr , X86::VDIVSSrr },
+ { X86::VDIVSSZrr_Int , X86::VDIVSSrr_Int },
+ { X86::VFMADD132SDZm , X86::VFMADD132SDm },
+ { X86::VFMADD132SDZm_Int , X86::VFMADD132SDm_Int },
+ { X86::VFMADD132SDZr , X86::VFMADD132SDr },
+ { X86::VFMADD132SDZr_Int , X86::VFMADD132SDr_Int },
+ { X86::VFMADD132SSZm , X86::VFMADD132SSm },
+ { X86::VFMADD132SSZm_Int , X86::VFMADD132SSm_Int },
+ { X86::VFMADD132SSZr , X86::VFMADD132SSr },
+ { X86::VFMADD132SSZr_Int , X86::VFMADD132SSr_Int },
+ { X86::VFMADD213SDZm , X86::VFMADD213SDm },
+ { X86::VFMADD213SDZm_Int , X86::VFMADD213SDm_Int },
+ { X86::VFMADD213SDZr , X86::VFMADD213SDr },
+ { X86::VFMADD213SDZr_Int , X86::VFMADD213SDr_Int },
+ { X86::VFMADD213SSZm , X86::VFMADD213SSm },
+ { X86::VFMADD213SSZm_Int , X86::VFMADD213SSm_Int },
+ { X86::VFMADD213SSZr , X86::VFMADD213SSr },
+ { X86::VFMADD213SSZr_Int , X86::VFMADD213SSr_Int },
+ { X86::VFMADD231SDZm , X86::VFMADD231SDm },
+ { X86::VFMADD231SDZm_Int , X86::VFMADD231SDm_Int },
+ { X86::VFMADD231SDZr , X86::VFMADD231SDr },
+ { X86::VFMADD231SDZr_Int , X86::VFMADD231SDr_Int },
+ { X86::VFMADD231SSZm , X86::VFMADD231SSm },
+ { X86::VFMADD231SSZm_Int , X86::VFMADD231SSm_Int },
+ { X86::VFMADD231SSZr , X86::VFMADD231SSr },
+ { X86::VFMADD231SSZr_Int , X86::VFMADD231SSr_Int },
+ { X86::VFMSUB132SDZm , X86::VFMSUB132SDm },
+ { X86::VFMSUB132SDZm_Int , X86::VFMSUB132SDm_Int },
+ { X86::VFMSUB132SDZr , X86::VFMSUB132SDr },
+ { X86::VFMSUB132SDZr_Int , X86::VFMSUB132SDr_Int },
+ { X86::VFMSUB132SSZm , X86::VFMSUB132SSm },
+ { X86::VFMSUB132SSZm_Int , X86::VFMSUB132SSm_Int },
+ { X86::VFMSUB132SSZr , X86::VFMSUB132SSr },
+ { X86::VFMSUB132SSZr_Int , X86::VFMSUB132SSr_Int },
+ { X86::VFMSUB213SDZm , X86::VFMSUB213SDm },
+ { X86::VFMSUB213SDZm_Int , X86::VFMSUB213SDm_Int },
+ { X86::VFMSUB213SDZr , X86::VFMSUB213SDr },
+ { X86::VFMSUB213SDZr_Int , X86::VFMSUB213SDr_Int },
+ { X86::VFMSUB213SSZm , X86::VFMSUB213SSm },
+ { X86::VFMSUB213SSZm_Int , X86::VFMSUB213SSm_Int },
+ { X86::VFMSUB213SSZr , X86::VFMSUB213SSr },
+ { X86::VFMSUB213SSZr_Int , X86::VFMSUB213SSr_Int },
+ { X86::VFMSUB231SDZm , X86::VFMSUB231SDm },
+ { X86::VFMSUB231SDZm_Int , X86::VFMSUB231SDm_Int },
+ { X86::VFMSUB231SDZr , X86::VFMSUB231SDr },
+ { X86::VFMSUB231SDZr_Int , X86::VFMSUB231SDr_Int },
+ { X86::VFMSUB231SSZm , X86::VFMSUB231SSm },
+ { X86::VFMSUB231SSZm_Int , X86::VFMSUB231SSm_Int },
+ { X86::VFMSUB231SSZr , X86::VFMSUB231SSr },
+ { X86::VFMSUB231SSZr_Int , X86::VFMSUB231SSr_Int },
+ { X86::VFNMADD132SDZm , X86::VFNMADD132SDm },
+ { X86::VFNMADD132SDZm_Int , X86::VFNMADD132SDm_Int },
+ { X86::VFNMADD132SDZr , X86::VFNMADD132SDr },
+ { X86::VFNMADD132SDZr_Int , X86::VFNMADD132SDr_Int },
+ { X86::VFNMADD132SSZm , X86::VFNMADD132SSm },
+ { X86::VFNMADD132SSZm_Int , X86::VFNMADD132SSm_Int },
+ { X86::VFNMADD132SSZr , X86::VFNMADD132SSr },
+ { X86::VFNMADD132SSZr_Int , X86::VFNMADD132SSr_Int },
+ { X86::VFNMADD213SDZm , X86::VFNMADD213SDm },
+ { X86::VFNMADD213SDZm_Int , X86::VFNMADD213SDm_Int },
+ { X86::VFNMADD213SDZr , X86::VFNMADD213SDr },
+ { X86::VFNMADD213SDZr_Int , X86::VFNMADD213SDr_Int },
+ { X86::VFNMADD213SSZm , X86::VFNMADD213SSm },
+ { X86::VFNMADD213SSZm_Int , X86::VFNMADD213SSm_Int },
+ { X86::VFNMADD213SSZr , X86::VFNMADD213SSr },
+ { X86::VFNMADD213SSZr_Int , X86::VFNMADD213SSr_Int },
+ { X86::VFNMADD231SDZm , X86::VFNMADD231SDm },
+ { X86::VFNMADD231SDZm_Int , X86::VFNMADD231SDm_Int },
+ { X86::VFNMADD231SDZr , X86::VFNMADD231SDr },
+ { X86::VFNMADD231SDZr_Int , X86::VFNMADD231SDr_Int },
+ { X86::VFNMADD231SSZm , X86::VFNMADD231SSm },
+ { X86::VFNMADD231SSZm_Int , X86::VFNMADD231SSm_Int },
+ { X86::VFNMADD231SSZr , X86::VFNMADD231SSr },
+ { X86::VFNMADD231SSZr_Int , X86::VFNMADD231SSr_Int },
+ { X86::VFNMSUB132SDZm , X86::VFNMSUB132SDm },
+ { X86::VFNMSUB132SDZm_Int , X86::VFNMSUB132SDm_Int },
+ { X86::VFNMSUB132SDZr , X86::VFNMSUB132SDr },
+ { X86::VFNMSUB132SDZr_Int , X86::VFNMSUB132SDr_Int },
+ { X86::VFNMSUB132SSZm , X86::VFNMSUB132SSm },
+ { X86::VFNMSUB132SSZm_Int , X86::VFNMSUB132SSm_Int },
+ { X86::VFNMSUB132SSZr , X86::VFNMSUB132SSr },
+ { X86::VFNMSUB132SSZr_Int , X86::VFNMSUB132SSr_Int },
+ { X86::VFNMSUB213SDZm , X86::VFNMSUB213SDm },
+ { X86::VFNMSUB213SDZm_Int , X86::VFNMSUB213SDm_Int },
+ { X86::VFNMSUB213SDZr , X86::VFNMSUB213SDr },
+ { X86::VFNMSUB213SDZr_Int , X86::VFNMSUB213SDr_Int },
+ { X86::VFNMSUB213SSZm , X86::VFNMSUB213SSm },
+ { X86::VFNMSUB213SSZm_Int , X86::VFNMSUB213SSm_Int },
+ { X86::VFNMSUB213SSZr , X86::VFNMSUB213SSr },
+ { X86::VFNMSUB213SSZr_Int , X86::VFNMSUB213SSr_Int },
+ { X86::VFNMSUB231SDZm , X86::VFNMSUB231SDm },
+ { X86::VFNMSUB231SDZm_Int , X86::VFNMSUB231SDm_Int },
+ { X86::VFNMSUB231SDZr , X86::VFNMSUB231SDr },
+ { X86::VFNMSUB231SDZr_Int , X86::VFNMSUB231SDr_Int },
+ { X86::VFNMSUB231SSZm , X86::VFNMSUB231SSm },
+ { X86::VFNMSUB231SSZm_Int , X86::VFNMSUB231SSm_Int },
+ { X86::VFNMSUB231SSZr , X86::VFNMSUB231SSr },
+ { X86::VFNMSUB231SSZr_Int , X86::VFNMSUB231SSr_Int },
+ { X86::VMAXCSDZrm , X86::VMAXCSDrm },
+ { X86::VMAXCSDZrr , X86::VMAXCSDrr },
+ { X86::VMAXCSSZrm , X86::VMAXCSSrm },
+ { X86::VMAXCSSZrr , X86::VMAXCSSrr },
+ { X86::VMAXSDZrm , X86::VMAXSDrm },
+ { X86::VMAXSDZrm_Int , X86::VMAXSDrm_Int },
+ { X86::VMAXSDZrr , X86::VMAXSDrr },
+ { X86::VMAXSDZrr_Int , X86::VMAXSDrr_Int },
+ { X86::VMAXSSZrm , X86::VMAXSSrm },
+ { X86::VMAXSSZrm_Int , X86::VMAXSSrm_Int },
+ { X86::VMAXSSZrr , X86::VMAXSSrr },
+ { X86::VMAXSSZrr_Int , X86::VMAXSSrr_Int },
+ { X86::VMINCSDZrm , X86::VMINCSDrm },
+ { X86::VMINCSDZrr , X86::VMINCSDrr },
+ { X86::VMINCSSZrm , X86::VMINCSSrm },
+ { X86::VMINCSSZrr , X86::VMINCSSrr },
+ { X86::VMINSDZrm , X86::VMINSDrm },
+ { X86::VMINSDZrm_Int , X86::VMINSDrm_Int },
+ { X86::VMINSDZrr , X86::VMINSDrr },
+ { X86::VMINSDZrr_Int , X86::VMINSDrr_Int },
+ { X86::VMINSSZrm , X86::VMINSSrm },
+ { X86::VMINSSZrm_Int , X86::VMINSSrm_Int },
+ { X86::VMINSSZrr , X86::VMINSSrr },
+ { X86::VMINSSZrr_Int , X86::VMINSSrr_Int },
+ { X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
+ { X86::VMOVDI2SSZrm , X86::VMOVDI2SSrm },
+ { X86::VMOVDI2SSZrr , X86::VMOVDI2SSrr },
+ { X86::VMOVSDZmr , X86::VMOVSDmr },
+ { X86::VMOVSDZrm , X86::VMOVSDrm },
+ { X86::VMOVSDZrr , X86::VMOVSDrr },
+ { X86::VMOVSSZmr , X86::VMOVSSmr },
+ { X86::VMOVSSZrm , X86::VMOVSSrm },
+ { X86::VMOVSSZrr , X86::VMOVSSrr },
+ { X86::VMOVSSZrr_REV , X86::VMOVSSrr_REV },
+ { X86::VMULSDZrm , X86::VMULSDrm },
+ { X86::VMULSDZrm_Int , X86::VMULSDrm_Int },
+ { X86::VMULSDZrr , X86::VMULSDrr },
+ { X86::VMULSDZrr_Int , X86::VMULSDrr_Int },
+ { X86::VMULSSZrm , X86::VMULSSrm },
+ { X86::VMULSSZrm_Int , X86::VMULSSrm_Int },
+ { X86::VMULSSZrr , X86::VMULSSrr },
+ { X86::VMULSSZrr_Int , X86::VMULSSrr_Int },
+ { X86::VSQRTSDZm , X86::VSQRTSDm },
+ { X86::VSQRTSDZm_Int , X86::VSQRTSDm_Int },
+ { X86::VSQRTSDZr , X86::VSQRTSDr },
+ { X86::VSQRTSDZr_Int , X86::VSQRTSDr_Int },
+ { X86::VSQRTSSZm , X86::VSQRTSSm },
+ { X86::VSQRTSSZm_Int , X86::VSQRTSSm_Int },
+ { X86::VSQRTSSZr , X86::VSQRTSSr },
+ { X86::VSQRTSSZr_Int , X86::VSQRTSSr_Int },
+ { X86::VSUBSDZrm , X86::VSUBSDrm },
+ { X86::VSUBSDZrm_Int , X86::VSUBSDrm_Int },
+ { X86::VSUBSDZrr , X86::VSUBSDrr },
+ { X86::VSUBSDZrr_Int , X86::VSUBSDrr_Int },
+ { X86::VSUBSSZrm , X86::VSUBSSrm },
+ { X86::VSUBSSZrm_Int , X86::VSUBSSrm_Int },
+ { X86::VSUBSSZrr , X86::VSUBSSrr },
+ { X86::VSUBSSZrr_Int , X86::VSUBSSrr_Int },
+ { X86::VUCOMISDZrm , X86::VUCOMISDrm },
+ { X86::VUCOMISDZrr , X86::VUCOMISDrr },
+ { X86::VUCOMISSZrm , X86::VUCOMISSrm },
+ { X86::VUCOMISSZrr , X86::VUCOMISSrr },
+
+ { X86::VMOV64toPQIZrr , X86::VMOV64toPQIrr },
+ { X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
+ { X86::VMOVDI2PDIZrm , X86::VMOVDI2PDIrm },
+ { X86::VMOVDI2PDIZrr , X86::VMOVDI2PDIrr },
+ { X86::VMOVLHPSZrr , X86::VMOVLHPSrr },
+ { X86::VMOVHLPSZrr , X86::VMOVHLPSrr },
+ { X86::VMOVPDI2DIZmr , X86::VMOVPDI2DImr },
+ { X86::VMOVPDI2DIZrr , X86::VMOVPDI2DIrr },
+ { X86::VMOVPQI2QIZmr , X86::VMOVPQI2QImr },
+ { X86::VMOVPQIto64Zrr , X86::VMOVPQIto64rr },
+ { X86::VMOVQI2PQIZrm , X86::VMOVQI2PQIrm },
+ { X86::VMOVZPQILo2PQIZrr , X86::VMOVZPQILo2PQIrr },
+
+ { X86::VPEXTRBZmr , X86::VPEXTRBmr },
+ { X86::VPEXTRBZrr , X86::VPEXTRBrr },
+ { X86::VPEXTRDZmr , X86::VPEXTRDmr },
+ { X86::VPEXTRDZrr , X86::VPEXTRDrr },
+ { X86::VPEXTRQZmr , X86::VPEXTRQmr },
+ { X86::VPEXTRQZrr , X86::VPEXTRQrr },
+ { X86::VPEXTRWZmr , X86::VPEXTRWmr },
+ { X86::VPEXTRWZrr , X86::VPEXTRWri },
+
+ { X86::VPINSRBZrm , X86::VPINSRBrm },
+ { X86::VPINSRBZrr , X86::VPINSRBrr },
+ { X86::VPINSRDZrm , X86::VPINSRDrm },
+ { X86::VPINSRDZrr , X86::VPINSRDrr },
+ { X86::VPINSRQZrm , X86::VPINSRQrm },
+ { X86::VPINSRQZrr , X86::VPINSRQrr },
+ { X86::VPINSRWZrm , X86::VPINSRWrmi },
+ { X86::VPINSRWZrr , X86::VPINSRWrri },
+
+ // EVEX 128 with corresponding VEX.
+ { X86::VADDPDZ128rm , X86::VADDPDrm },
+ { X86::VADDPDZ128rr , X86::VADDPDrr },
+ { X86::VADDPSZ128rm , X86::VADDPSrm },
+ { X86::VADDPSZ128rr , X86::VADDPSrr },
+ { X86::VANDNPDZ128rm , X86::VANDNPDrm },
+ { X86::VANDNPDZ128rr , X86::VANDNPDrr },
+ { X86::VANDNPSZ128rm , X86::VANDNPSrm },
+ { X86::VANDNPSZ128rr , X86::VANDNPSrr },
+ { X86::VANDPDZ128rm , X86::VANDPDrm },
+ { X86::VANDPDZ128rr , X86::VANDPDrr },
+ { X86::VANDPSZ128rm , X86::VANDPSrm },
+ { X86::VANDPSZ128rr , X86::VANDPSrr },
+ { X86::VBROADCASTSSZ128m , X86::VBROADCASTSSrm },
+ { X86::VBROADCASTSSZ128r , X86::VBROADCASTSSrr },
+ { X86::VBROADCASTSSZ128r_s , X86::VBROADCASTSSrr },
+ { X86::VCVTDQ2PDZ128rm , X86::VCVTDQ2PDrm },
+ { X86::VCVTDQ2PDZ128rr , X86::VCVTDQ2PDrr },
+ { X86::VCVTDQ2PSZ128rm , X86::VCVTDQ2PSrm },
+ { X86::VCVTDQ2PSZ128rr , X86::VCVTDQ2PSrr },
+ { X86::VCVTPD2DQZ128rm , X86::VCVTPD2DQrm },
+ { X86::VCVTPD2DQZ128rr , X86::VCVTPD2DQrr },
+ { X86::VCVTPD2PSZ128rm , X86::VCVTPD2PSrm },
+ { X86::VCVTPD2PSZ128rr , X86::VCVTPD2PSrr },
+ { X86::VCVTPH2PSZ128rm , X86::VCVTPH2PSrm },
+ { X86::VCVTPH2PSZ128rr , X86::VCVTPH2PSrr },
+ { X86::VCVTPS2DQZ128rm , X86::VCVTPS2DQrm },
+ { X86::VCVTPS2DQZ128rr , X86::VCVTPS2DQrr },
+ { X86::VCVTPS2PDZ128rm , X86::VCVTPS2PDrm },
+ { X86::VCVTPS2PDZ128rr , X86::VCVTPS2PDrr },
+ { X86::VCVTPS2PHZ128mr , X86::VCVTPS2PHmr },
+ { X86::VCVTPS2PHZ128rr , X86::VCVTPS2PHrr },
+ { X86::VCVTTPD2DQZ128rm , X86::VCVTTPD2DQrm },
+ { X86::VCVTTPD2DQZ128rr , X86::VCVTTPD2DQrr },
+ { X86::VCVTTPS2DQZ128rm , X86::VCVTTPS2DQrm },
+ { X86::VCVTTPS2DQZ128rr , X86::VCVTTPS2DQrr },
+ { X86::VDIVPDZ128rm , X86::VDIVPDrm },
+ { X86::VDIVPDZ128rr , X86::VDIVPDrr },
+ { X86::VDIVPSZ128rm , X86::VDIVPSrm },
+ { X86::VDIVPSZ128rr , X86::VDIVPSrr },
+ { X86::VFMADD132PDZ128m , X86::VFMADD132PDm },
+ { X86::VFMADD132PDZ128r , X86::VFMADD132PDr },
+ { X86::VFMADD132PSZ128m , X86::VFMADD132PSm },
+ { X86::VFMADD132PSZ128r , X86::VFMADD132PSr },
+ { X86::VFMADD213PDZ128m , X86::VFMADD213PDm },
+ { X86::VFMADD213PDZ128r , X86::VFMADD213PDr },
+ { X86::VFMADD213PSZ128m , X86::VFMADD213PSm },
+ { X86::VFMADD213PSZ128r , X86::VFMADD213PSr },
+ { X86::VFMADD231PDZ128m , X86::VFMADD231PDm },
+ { X86::VFMADD231PDZ128r , X86::VFMADD231PDr },
+ { X86::VFMADD231PSZ128m , X86::VFMADD231PSm },
+ { X86::VFMADD231PSZ128r , X86::VFMADD231PSr },
+ { X86::VFMADDSUB132PDZ128m , X86::VFMADDSUB132PDm },
+ { X86::VFMADDSUB132PDZ128r , X86::VFMADDSUB132PDr },
+ { X86::VFMADDSUB132PSZ128m , X86::VFMADDSUB132PSm },
+ { X86::VFMADDSUB132PSZ128r , X86::VFMADDSUB132PSr },
+ { X86::VFMADDSUB213PDZ128m , X86::VFMADDSUB213PDm },
+ { X86::VFMADDSUB213PDZ128r , X86::VFMADDSUB213PDr },
+ { X86::VFMADDSUB213PSZ128m , X86::VFMADDSUB213PSm },
+ { X86::VFMADDSUB213PSZ128r , X86::VFMADDSUB213PSr },
+ { X86::VFMADDSUB231PDZ128m , X86::VFMADDSUB231PDm },
+ { X86::VFMADDSUB231PDZ128r , X86::VFMADDSUB231PDr },
+ { X86::VFMADDSUB231PSZ128m , X86::VFMADDSUB231PSm },
+ { X86::VFMADDSUB231PSZ128r , X86::VFMADDSUB231PSr },
+ { X86::VFMSUB132PDZ128m , X86::VFMSUB132PDm },
+ { X86::VFMSUB132PDZ128r , X86::VFMSUB132PDr },
+ { X86::VFMSUB132PSZ128m , X86::VFMSUB132PSm },
+ { X86::VFMSUB132PSZ128r , X86::VFMSUB132PSr },
+ { X86::VFMSUB213PDZ128m , X86::VFMSUB213PDm },
+ { X86::VFMSUB213PDZ128r , X86::VFMSUB213PDr },
+ { X86::VFMSUB213PSZ128m , X86::VFMSUB213PSm },
+ { X86::VFMSUB213PSZ128r , X86::VFMSUB213PSr },
+ { X86::VFMSUB231PDZ128m , X86::VFMSUB231PDm },
+ { X86::VFMSUB231PDZ128r , X86::VFMSUB231PDr },
+ { X86::VFMSUB231PSZ128m , X86::VFMSUB231PSm },
+ { X86::VFMSUB231PSZ128r , X86::VFMSUB231PSr },
+ { X86::VFMSUBADD132PDZ128m , X86::VFMSUBADD132PDm },
+ { X86::VFMSUBADD132PDZ128r , X86::VFMSUBADD132PDr },
+ { X86::VFMSUBADD132PSZ128m , X86::VFMSUBADD132PSm },
+ { X86::VFMSUBADD132PSZ128r , X86::VFMSUBADD132PSr },
+ { X86::VFMSUBADD213PDZ128m , X86::VFMSUBADD213PDm },
+ { X86::VFMSUBADD213PDZ128r , X86::VFMSUBADD213PDr },
+ { X86::VFMSUBADD213PSZ128m , X86::VFMSUBADD213PSm },
+ { X86::VFMSUBADD213PSZ128r , X86::VFMSUBADD213PSr },
+ { X86::VFMSUBADD231PDZ128m , X86::VFMSUBADD231PDm },
+ { X86::VFMSUBADD231PDZ128r , X86::VFMSUBADD231PDr },
+ { X86::VFMSUBADD231PSZ128m , X86::VFMSUBADD231PSm },
+ { X86::VFMSUBADD231PSZ128r , X86::VFMSUBADD231PSr },
+ { X86::VFNMADD132PDZ128m , X86::VFNMADD132PDm },
+ { X86::VFNMADD132PDZ128r , X86::VFNMADD132PDr },
+ { X86::VFNMADD132PSZ128m , X86::VFNMADD132PSm },
+ { X86::VFNMADD132PSZ128r , X86::VFNMADD132PSr },
+ { X86::VFNMADD213PDZ128m , X86::VFNMADD213PDm },
+ { X86::VFNMADD213PDZ128r , X86::VFNMADD213PDr },
+ { X86::VFNMADD213PSZ128m , X86::VFNMADD213PSm },
+ { X86::VFNMADD213PSZ128r , X86::VFNMADD213PSr },
+ { X86::VFNMADD231PDZ128m , X86::VFNMADD231PDm },
+ { X86::VFNMADD231PDZ128r , X86::VFNMADD231PDr },
+ { X86::VFNMADD231PSZ128m , X86::VFNMADD231PSm },
+ { X86::VFNMADD231PSZ128r , X86::VFNMADD231PSr },
+ { X86::VFNMSUB132PDZ128m , X86::VFNMSUB132PDm },
+ { X86::VFNMSUB132PDZ128r , X86::VFNMSUB132PDr },
+ { X86::VFNMSUB132PSZ128m , X86::VFNMSUB132PSm },
+ { X86::VFNMSUB132PSZ128r , X86::VFNMSUB132PSr },
+ { X86::VFNMSUB213PDZ128m , X86::VFNMSUB213PDm },
+ { X86::VFNMSUB213PDZ128r , X86::VFNMSUB213PDr },
+ { X86::VFNMSUB213PSZ128m , X86::VFNMSUB213PSm },
+ { X86::VFNMSUB213PSZ128r , X86::VFNMSUB213PSr },
+ { X86::VFNMSUB231PDZ128m , X86::VFNMSUB231PDm },
+ { X86::VFNMSUB231PDZ128r , X86::VFNMSUB231PDr },
+ { X86::VFNMSUB231PSZ128m , X86::VFNMSUB231PSm },
+ { X86::VFNMSUB231PSZ128r , X86::VFNMSUB231PSr },
+ { X86::VMAXCPDZ128rm , X86::VMAXCPDrm },
+ { X86::VMAXCPDZ128rr , X86::VMAXCPDrr },
+ { X86::VMAXCPSZ128rm , X86::VMAXCPSrm },
+ { X86::VMAXCPSZ128rr , X86::VMAXCPSrr },
+ { X86::VMAXPDZ128rm , X86::VMAXPDrm },
+ { X86::VMAXPDZ128rr , X86::VMAXPDrr },
+ { X86::VMAXPSZ128rm , X86::VMAXPSrm },
+ { X86::VMAXPSZ128rr , X86::VMAXPSrr },
+ { X86::VMINCPDZ128rm , X86::VMINCPDrm },
+ { X86::VMINCPDZ128rr , X86::VMINCPDrr },
+ { X86::VMINCPSZ128rm , X86::VMINCPSrm },
+ { X86::VMINCPSZ128rr , X86::VMINCPSrr },
+ { X86::VMINPDZ128rm , X86::VMINPDrm },
+ { X86::VMINPDZ128rr , X86::VMINPDrr },
+ { X86::VMINPSZ128rm , X86::VMINPSrm },
+ { X86::VMINPSZ128rr , X86::VMINPSrr },
+ { X86::VMOVAPDZ128mr , X86::VMOVAPDmr },
+ { X86::VMOVAPDZ128rm , X86::VMOVAPDrm },
+ { X86::VMOVAPDZ128rr , X86::VMOVAPDrr },
+ { X86::VMOVAPDZ128rr_REV , X86::VMOVAPDrr_REV },
+ { X86::VMOVAPSZ128mr , X86::VMOVAPSmr },
+ { X86::VMOVAPSZ128rm , X86::VMOVAPSrm },
+ { X86::VMOVAPSZ128rr , X86::VMOVAPSrr },
+ { X86::VMOVAPSZ128rr_REV , X86::VMOVAPSrr_REV },
+ { X86::VMOVDDUPZ128rm , X86::VMOVDDUPrm },
+ { X86::VMOVDDUPZ128rr , X86::VMOVDDUPrr },
+ { X86::VMOVDQA32Z128mr , X86::VMOVDQAmr },
+ { X86::VMOVDQA32Z128rm , X86::VMOVDQArm },
+ { X86::VMOVDQA32Z128rr , X86::VMOVDQArr },
+ { X86::VMOVDQA32Z128rr_REV , X86::VMOVDQArr_REV },
+ { X86::VMOVDQA64Z128mr , X86::VMOVDQAmr },
+ { X86::VMOVDQA64Z128rm , X86::VMOVDQArm },
+ { X86::VMOVDQA64Z128rr , X86::VMOVDQArr },
+ { X86::VMOVDQA64Z128rr_REV , X86::VMOVDQArr_REV },
+ { X86::VMOVDQU16Z128mr , X86::VMOVDQUmr },
+ { X86::VMOVDQU16Z128rm , X86::VMOVDQUrm },
+ { X86::VMOVDQU16Z128rr , X86::VMOVDQUrr },
+ { X86::VMOVDQU16Z128rr_REV , X86::VMOVDQUrr_REV },
+ { X86::VMOVDQU32Z128mr , X86::VMOVDQUmr },
+ { X86::VMOVDQU32Z128rm , X86::VMOVDQUrm },
+ { X86::VMOVDQU32Z128rr , X86::VMOVDQUrr },
+ { X86::VMOVDQU32Z128rr_REV , X86::VMOVDQUrr_REV },
+ { X86::VMOVDQU64Z128mr , X86::VMOVDQUmr },
+ { X86::VMOVDQU64Z128rm , X86::VMOVDQUrm },
+ { X86::VMOVDQU64Z128rr , X86::VMOVDQUrr },
+ { X86::VMOVDQU64Z128rr_REV , X86::VMOVDQUrr_REV },
+ { X86::VMOVDQU8Z128mr , X86::VMOVDQUmr },
+ { X86::VMOVDQU8Z128rm , X86::VMOVDQUrm },
+ { X86::VMOVDQU8Z128rr , X86::VMOVDQUrr },
+ { X86::VMOVDQU8Z128rr_REV , X86::VMOVDQUrr_REV },
+ { X86::VMOVHPDZ128mr , X86::VMOVHPDmr },
+ { X86::VMOVHPDZ128rm , X86::VMOVHPDrm },
+ { X86::VMOVHPSZ128mr , X86::VMOVHPSmr },
+ { X86::VMOVHPSZ128rm , X86::VMOVHPSrm },
+ { X86::VMOVLPDZ128mr , X86::VMOVLPDmr },
+ { X86::VMOVLPDZ128rm , X86::VMOVLPDrm },
+ { X86::VMOVLPSZ128mr , X86::VMOVLPSmr },
+ { X86::VMOVLPSZ128rm , X86::VMOVLPSrm },
+ { X86::VMOVNTDQAZ128rm , X86::VMOVNTDQArm },
+ { X86::VMOVNTDQZ128mr , X86::VMOVNTDQmr },
+ { X86::VMOVNTPDZ128mr , X86::VMOVNTPDmr },
+ { X86::VMOVNTPSZ128mr , X86::VMOVNTPSmr },
+ { X86::VMOVSHDUPZ128rm , X86::VMOVSHDUPrm },
+ { X86::VMOVSHDUPZ128rr , X86::VMOVSHDUPrr },
+ { X86::VMOVSLDUPZ128rm , X86::VMOVSLDUPrm },
+ { X86::VMOVSLDUPZ128rr , X86::VMOVSLDUPrr },
+ { X86::VMOVUPDZ128mr , X86::VMOVUPDmr },
+ { X86::VMOVUPDZ128rm , X86::VMOVUPDrm },
+ { X86::VMOVUPDZ128rr , X86::VMOVUPDrr },
+ { X86::VMOVUPDZ128rr_REV , X86::VMOVUPDrr_REV },
+ { X86::VMOVUPSZ128mr , X86::VMOVUPSmr },
+ { X86::VMOVUPSZ128rm , X86::VMOVUPSrm },
+ { X86::VMOVUPSZ128rr , X86::VMOVUPSrr },
+ { X86::VMOVUPSZ128rr_REV , X86::VMOVUPSrr_REV },
+ { X86::VMULPDZ128rm , X86::VMULPDrm },
+ { X86::VMULPDZ128rr , X86::VMULPDrr },
+ { X86::VMULPSZ128rm , X86::VMULPSrm },
+ { X86::VMULPSZ128rr , X86::VMULPSrr },
+ { X86::VORPDZ128rm , X86::VORPDrm },
+ { X86::VORPDZ128rr , X86::VORPDrr },
+ { X86::VORPSZ128rm , X86::VORPSrm },
+ { X86::VORPSZ128rr , X86::VORPSrr },
+ { X86::VPABSBZ128rm , X86::VPABSBrm },
+ { X86::VPABSBZ128rr , X86::VPABSBrr },
+ { X86::VPABSDZ128rm , X86::VPABSDrm },
+ { X86::VPABSDZ128rr , X86::VPABSDrr },
+ { X86::VPABSWZ128rm , X86::VPABSWrm },
+ { X86::VPABSWZ128rr , X86::VPABSWrr },
+ { X86::VPACKSSDWZ128rm , X86::VPACKSSDWrm },
+ { X86::VPACKSSDWZ128rr , X86::VPACKSSDWrr },
+ { X86::VPACKSSWBZ128rm , X86::VPACKSSWBrm },
+ { X86::VPACKSSWBZ128rr , X86::VPACKSSWBrr },
+ { X86::VPACKUSDWZ128rm , X86::VPACKUSDWrm },
+ { X86::VPACKUSDWZ128rr , X86::VPACKUSDWrr },
+ { X86::VPACKUSWBZ128rm , X86::VPACKUSWBrm },
+ { X86::VPACKUSWBZ128rr , X86::VPACKUSWBrr },
+ { X86::VPADDBZ128rm , X86::VPADDBrm },
+ { X86::VPADDBZ128rr , X86::VPADDBrr },
+ { X86::VPADDDZ128rm , X86::VPADDDrm },
+ { X86::VPADDDZ128rr , X86::VPADDDrr },
+ { X86::VPADDQZ128rm , X86::VPADDQrm },
+ { X86::VPADDQZ128rr , X86::VPADDQrr },
+ { X86::VPADDSBZ128rm , X86::VPADDSBrm },
+ { X86::VPADDSBZ128rr , X86::VPADDSBrr },
+ { X86::VPADDSWZ128rm , X86::VPADDSWrm },
+ { X86::VPADDSWZ128rr , X86::VPADDSWrr },
+ { X86::VPADDUSBZ128rm , X86::VPADDUSBrm },
+ { X86::VPADDUSBZ128rr , X86::VPADDUSBrr },
+ { X86::VPADDUSWZ128rm , X86::VPADDUSWrm },
+ { X86::VPADDUSWZ128rr , X86::VPADDUSWrr },
+ { X86::VPADDWZ128rm , X86::VPADDWrm },
+ { X86::VPADDWZ128rr , X86::VPADDWrr },
+ { X86::VPALIGNRZ128rmi , X86::VPALIGNRrmi },
+ { X86::VPALIGNRZ128rri , X86::VPALIGNRrri },
+ { X86::VPANDDZ128rm , X86::VPANDrm },
+ { X86::VPANDDZ128rr , X86::VPANDrr },
+ { X86::VPANDQZ128rm , X86::VPANDrm },
+ { X86::VPANDQZ128rr , X86::VPANDrr },
+ { X86::VPAVGBZ128rm , X86::VPAVGBrm },
+ { X86::VPAVGBZ128rr , X86::VPAVGBrr },
+ { X86::VPAVGWZ128rm , X86::VPAVGWrm },
+ { X86::VPAVGWZ128rr , X86::VPAVGWrr },
+ { X86::VPBROADCASTBZ128m , X86::VPBROADCASTBrm },
+ { X86::VPBROADCASTBZ128r , X86::VPBROADCASTBrr },
+ { X86::VPBROADCASTDZ128m , X86::VPBROADCASTDrm },
+ { X86::VPBROADCASTDZ128r , X86::VPBROADCASTDrr },
+ { X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm },
+ { X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr },
+ { X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm },
+ { X86::VPBROADCASTWZ128r , X86::VPBROADCASTWrr },
+ { X86::VPERMILPDZ128mi , X86::VPERMILPDmi },
+ { X86::VPERMILPDZ128ri , X86::VPERMILPDri },
+ { X86::VPERMILPDZ128rm , X86::VPERMILPDrm },
+ { X86::VPERMILPDZ128rr , X86::VPERMILPDrr },
+ { X86::VPERMILPSZ128mi , X86::VPERMILPSmi },
+ { X86::VPERMILPSZ128ri , X86::VPERMILPSri },
+ { X86::VPERMILPSZ128rm , X86::VPERMILPSrm },
+ { X86::VPERMILPSZ128rr , X86::VPERMILPSrr },
+ { X86::VPMADDUBSWZ128rm , X86::VPMADDUBSWrm },
+ { X86::VPMADDUBSWZ128rr , X86::VPMADDUBSWrr },
+ { X86::VPMADDWDZ128rm , X86::VPMADDWDrm },
+ { X86::VPMADDWDZ128rr , X86::VPMADDWDrr },
+ { X86::VPMAXSBZ128rm , X86::VPMAXSBrm },
+ { X86::VPMAXSBZ128rr , X86::VPMAXSBrr },
+ { X86::VPMAXSDZ128rm , X86::VPMAXSDrm },
+ { X86::VPMAXSDZ128rr , X86::VPMAXSDrr },
+ { X86::VPMAXSWZ128rm , X86::VPMAXSWrm },
+ { X86::VPMAXSWZ128rr , X86::VPMAXSWrr },
+ { X86::VPMAXUBZ128rm , X86::VPMAXUBrm },
+ { X86::VPMAXUBZ128rr , X86::VPMAXUBrr },
+ { X86::VPMAXUDZ128rm , X86::VPMAXUDrm },
+ { X86::VPMAXUDZ128rr , X86::VPMAXUDrr },
+ { X86::VPMAXUWZ128rm , X86::VPMAXUWrm },
+ { X86::VPMAXUWZ128rr , X86::VPMAXUWrr },
+ { X86::VPMINSBZ128rm , X86::VPMINSBrm },
+ { X86::VPMINSBZ128rr , X86::VPMINSBrr },
+ { X86::VPMINSDZ128rm , X86::VPMINSDrm },
+ { X86::VPMINSDZ128rr , X86::VPMINSDrr },
+ { X86::VPMINSWZ128rm , X86::VPMINSWrm },
+ { X86::VPMINSWZ128rr , X86::VPMINSWrr },
+ { X86::VPMINUBZ128rm , X86::VPMINUBrm },
+ { X86::VPMINUBZ128rr , X86::VPMINUBrr },
+ { X86::VPMINUDZ128rm , X86::VPMINUDrm },
+ { X86::VPMINUDZ128rr , X86::VPMINUDrr },
+ { X86::VPMINUWZ128rm , X86::VPMINUWrm },
+ { X86::VPMINUWZ128rr , X86::VPMINUWrr },
+ { X86::VPMOVSXBDZ128rm , X86::VPMOVSXBDrm },
+ { X86::VPMOVSXBDZ128rr , X86::VPMOVSXBDrr },
+ { X86::VPMOVSXBQZ128rm , X86::VPMOVSXBQrm },
+ { X86::VPMOVSXBQZ128rr , X86::VPMOVSXBQrr },
+ { X86::VPMOVSXBWZ128rm , X86::VPMOVSXBWrm },
+ { X86::VPMOVSXBWZ128rr , X86::VPMOVSXBWrr },
+ { X86::VPMOVSXDQZ128rm , X86::VPMOVSXDQrm },
+ { X86::VPMOVSXDQZ128rr , X86::VPMOVSXDQrr },
+ { X86::VPMOVSXWDZ128rm , X86::VPMOVSXWDrm },
+ { X86::VPMOVSXWDZ128rr , X86::VPMOVSXWDrr },
+ { X86::VPMOVSXWQZ128rm , X86::VPMOVSXWQrm },
+ { X86::VPMOVSXWQZ128rr , X86::VPMOVSXWQrr },
+ { X86::VPMOVZXBDZ128rm , X86::VPMOVZXBDrm },
+ { X86::VPMOVZXBDZ128rr , X86::VPMOVZXBDrr },
+ { X86::VPMOVZXBQZ128rm , X86::VPMOVZXBQrm },
+ { X86::VPMOVZXBQZ128rr , X86::VPMOVZXBQrr },
+ { X86::VPMOVZXBWZ128rm , X86::VPMOVZXBWrm },
+ { X86::VPMOVZXBWZ128rr , X86::VPMOVZXBWrr },
+ { X86::VPMOVZXDQZ128rm , X86::VPMOVZXDQrm },
+ { X86::VPMOVZXDQZ128rr , X86::VPMOVZXDQrr },
+ { X86::VPMOVZXWDZ128rm , X86::VPMOVZXWDrm },
+ { X86::VPMOVZXWDZ128rr , X86::VPMOVZXWDrr },
+ { X86::VPMOVZXWQZ128rm , X86::VPMOVZXWQrm },
+ { X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr },
+ { X86::VPMULDQZ128rm , X86::VPMULDQrm },
+ { X86::VPMULDQZ128rr , X86::VPMULDQrr },
+ { X86::VPMULHRSWZ128rm , X86::VPMULHRSWrm },
+ { X86::VPMULHRSWZ128rr , X86::VPMULHRSWrr },
+ { X86::VPMULHUWZ128rm , X86::VPMULHUWrm },
+ { X86::VPMULHUWZ128rr , X86::VPMULHUWrr },
+ { X86::VPMULHWZ128rm , X86::VPMULHWrm },
+ { X86::VPMULHWZ128rr , X86::VPMULHWrr },
+ { X86::VPMULLDZ128rm , X86::VPMULLDrm },
+ { X86::VPMULLDZ128rr , X86::VPMULLDrr },
+ { X86::VPMULLWZ128rm , X86::VPMULLWrm },
+ { X86::VPMULLWZ128rr , X86::VPMULLWrr },
+ { X86::VPMULUDQZ128rm , X86::VPMULUDQrm },
+ { X86::VPMULUDQZ128rr , X86::VPMULUDQrr },
+ { X86::VPORDZ128rm , X86::VPORrm },
+ { X86::VPORDZ128rr , X86::VPORrr },
+ { X86::VPORQZ128rm , X86::VPORrm },
+ { X86::VPORQZ128rr , X86::VPORrr },
+ { X86::VPSADBWZ128rm , X86::VPSADBWrm },
+ { X86::VPSADBWZ128rr , X86::VPSADBWrr },
+ { X86::VPSHUFBZ128rm , X86::VPSHUFBrm },
+ { X86::VPSHUFBZ128rr , X86::VPSHUFBrr },
+ { X86::VPSHUFDZ128mi , X86::VPSHUFDmi },
+ { X86::VPSHUFDZ128ri , X86::VPSHUFDri },
+ { X86::VPSHUFHWZ128mi , X86::VPSHUFHWmi },
+ { X86::VPSHUFHWZ128ri , X86::VPSHUFHWri },
+ { X86::VPSHUFLWZ128mi , X86::VPSHUFLWmi },
+ { X86::VPSHUFLWZ128ri , X86::VPSHUFLWri },
+ { X86::VPSLLDQZ128rr , X86::VPSLLDQri },
+ { X86::VPSLLDZ128ri , X86::VPSLLDri },
+ { X86::VPSLLDZ128rm , X86::VPSLLDrm },
+ { X86::VPSLLDZ128rr , X86::VPSLLDrr },
+ { X86::VPSLLQZ128ri , X86::VPSLLQri },
+ { X86::VPSLLQZ128rm , X86::VPSLLQrm },
+ { X86::VPSLLQZ128rr , X86::VPSLLQrr },
+ { X86::VPSLLVDZ128rm , X86::VPSLLVDrm },
+ { X86::VPSLLVDZ128rr , X86::VPSLLVDrr },
+ { X86::VPSLLVQZ128rm , X86::VPSLLVQrm },
+ { X86::VPSLLVQZ128rr , X86::VPSLLVQrr },
+ { X86::VPSLLWZ128ri , X86::VPSLLWri },
+ { X86::VPSLLWZ128rm , X86::VPSLLWrm },
+ { X86::VPSLLWZ128rr , X86::VPSLLWrr },
+ { X86::VPSRADZ128ri , X86::VPSRADri },
+ { X86::VPSRADZ128rm , X86::VPSRADrm },
+ { X86::VPSRADZ128rr , X86::VPSRADrr },
+ { X86::VPSRAVDZ128rm , X86::VPSRAVDrm },
+ { X86::VPSRAVDZ128rr , X86::VPSRAVDrr },
+ { X86::VPSRAWZ128ri , X86::VPSRAWri },
+ { X86::VPSRAWZ128rm , X86::VPSRAWrm },
+ { X86::VPSRAWZ128rr , X86::VPSRAWrr },
+ { X86::VPSRLDQZ128rr , X86::VPSRLDQri },
+ { X86::VPSRLDZ128ri , X86::VPSRLDri },
+ { X86::VPSRLDZ128rm , X86::VPSRLDrm },
+ { X86::VPSRLDZ128rr , X86::VPSRLDrr },
+ { X86::VPSRLQZ128ri , X86::VPSRLQri },
+ { X86::VPSRLQZ128rm , X86::VPSRLQrm },
+ { X86::VPSRLQZ128rr , X86::VPSRLQrr },
+ { X86::VPSRLVDZ128rm , X86::VPSRLVDrm },
+ { X86::VPSRLVDZ128rr , X86::VPSRLVDrr },
+ { X86::VPSRLVQZ128rm , X86::VPSRLVQrm },
+ { X86::VPSRLVQZ128rr , X86::VPSRLVQrr },
+ { X86::VPSRLWZ128ri , X86::VPSRLWri },
+ { X86::VPSRLWZ128rm , X86::VPSRLWrm },
+ { X86::VPSRLWZ128rr , X86::VPSRLWrr },
+ { X86::VPSUBBZ128rm , X86::VPSUBBrm },
+ { X86::VPSUBBZ128rr , X86::VPSUBBrr },
+ { X86::VPSUBDZ128rm , X86::VPSUBDrm },
+ { X86::VPSUBDZ128rr , X86::VPSUBDrr },
+ { X86::VPSUBQZ128rm , X86::VPSUBQrm },
+ { X86::VPSUBQZ128rr , X86::VPSUBQrr },
+ { X86::VPSUBSBZ128rm , X86::VPSUBSBrm },
+ { X86::VPSUBSBZ128rr , X86::VPSUBSBrr },
+ { X86::VPSUBSWZ128rm , X86::VPSUBSWrm },
+ { X86::VPSUBSWZ128rr , X86::VPSUBSWrr },
+ { X86::VPSUBUSBZ128rm , X86::VPSUBUSBrm },
+ { X86::VPSUBUSBZ128rr , X86::VPSUBUSBrr },
+ { X86::VPSUBUSWZ128rm , X86::VPSUBUSWrm },
+ { X86::VPSUBUSWZ128rr , X86::VPSUBUSWrr },
+ { X86::VPSUBWZ128rm , X86::VPSUBWrm },
+ { X86::VPSUBWZ128rr , X86::VPSUBWrr },
+ { X86::VPUNPCKHBWZ128rm , X86::VPUNPCKHBWrm },
+ { X86::VPUNPCKHBWZ128rr , X86::VPUNPCKHBWrr },
+ { X86::VPUNPCKHDQZ128rm , X86::VPUNPCKHDQrm },
+ { X86::VPUNPCKHDQZ128rr , X86::VPUNPCKHDQrr },
+ { X86::VPUNPCKHQDQZ128rm , X86::VPUNPCKHQDQrm },
+ { X86::VPUNPCKHQDQZ128rr , X86::VPUNPCKHQDQrr },
+ { X86::VPUNPCKHWDZ128rm , X86::VPUNPCKHWDrm },
+ { X86::VPUNPCKHWDZ128rr , X86::VPUNPCKHWDrr },
+ { X86::VPUNPCKLBWZ128rm , X86::VPUNPCKLBWrm },
+ { X86::VPUNPCKLBWZ128rr , X86::VPUNPCKLBWrr },
+ { X86::VPUNPCKLDQZ128rm , X86::VPUNPCKLDQrm },
+ { X86::VPUNPCKLDQZ128rr , X86::VPUNPCKLDQrr },
+ { X86::VPUNPCKLQDQZ128rm , X86::VPUNPCKLQDQrm },
+ { X86::VPUNPCKLQDQZ128rr , X86::VPUNPCKLQDQrr },
+ { X86::VPUNPCKLWDZ128rm , X86::VPUNPCKLWDrm },
+ { X86::VPUNPCKLWDZ128rr , X86::VPUNPCKLWDrr },
+ { X86::VPXORDZ128rm , X86::VPXORrm },
+ { X86::VPXORDZ128rr , X86::VPXORrr },
+ { X86::VPXORQZ128rm , X86::VPXORrm },
+ { X86::VPXORQZ128rr , X86::VPXORrr },
+ { X86::VSHUFPDZ128rmi , X86::VSHUFPDrmi },
+ { X86::VSHUFPDZ128rri , X86::VSHUFPDrri },
+ { X86::VSHUFPSZ128rmi , X86::VSHUFPSrmi },
+ { X86::VSHUFPSZ128rri , X86::VSHUFPSrri },
+ { X86::VSQRTPDZ128m , X86::VSQRTPDm },
+ { X86::VSQRTPDZ128r , X86::VSQRTPDr },
+ { X86::VSQRTPSZ128m , X86::VSQRTPSm },
+ { X86::VSQRTPSZ128r , X86::VSQRTPSr },
+ { X86::VSUBPDZ128rm , X86::VSUBPDrm },
+ { X86::VSUBPDZ128rr , X86::VSUBPDrr },
+ { X86::VSUBPSZ128rm , X86::VSUBPSrm },
+ { X86::VSUBPSZ128rr , X86::VSUBPSrr },
+ { X86::VUNPCKHPDZ128rm , X86::VUNPCKHPDrm },
+ { X86::VUNPCKHPDZ128rr , X86::VUNPCKHPDrr },
+ { X86::VUNPCKHPSZ128rm , X86::VUNPCKHPSrm },
+ { X86::VUNPCKHPSZ128rr , X86::VUNPCKHPSrr },
+ { X86::VUNPCKLPDZ128rm , X86::VUNPCKLPDrm },
+ { X86::VUNPCKLPDZ128rr , X86::VUNPCKLPDrr },
+ { X86::VUNPCKLPSZ128rm , X86::VUNPCKLPSrm },
+ { X86::VUNPCKLPSZ128rr , X86::VUNPCKLPSrr },
+ { X86::VXORPDZ128rm , X86::VXORPDrm },
+ { X86::VXORPDZ128rr , X86::VXORPDrr },
+ { X86::VXORPSZ128rm , X86::VXORPSrm },
+ { X86::VXORPSZ128rr , X86::VXORPSrr },
+};
+
+
+// X86 EVEX encoded instructions that have a VEX 256 encoding
+// (table format: <EVEX opcode, VEX-256 opcode>; VEX opcodes may repeat).
+ static const X86EvexToVexCompressTableEntry
+ X86EvexToVex256CompressTable[] = {
+ { X86::VADDPDZ256rm , X86::VADDPDYrm },
+ { X86::VADDPDZ256rr , X86::VADDPDYrr },
+ { X86::VADDPSZ256rm , X86::VADDPSYrm },
+ { X86::VADDPSZ256rr , X86::VADDPSYrr },
+ { X86::VANDNPDZ256rm , X86::VANDNPDYrm },
+ { X86::VANDNPDZ256rr , X86::VANDNPDYrr },
+ { X86::VANDNPSZ256rm , X86::VANDNPSYrm },
+ { X86::VANDNPSZ256rr , X86::VANDNPSYrr },
+ { X86::VANDPDZ256rm , X86::VANDPDYrm },
+ { X86::VANDPDZ256rr , X86::VANDPDYrr },
+ { X86::VANDPSZ256rm , X86::VANDPSYrm },
+ { X86::VANDPSZ256rr , X86::VANDPSYrr },
+ { X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm },
+ { X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr },
+ { X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr },
+ { X86::VBROADCASTSSZ256m , X86::VBROADCASTSSYrm },
+ { X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr },
+ { X86::VBROADCASTSSZ256r_s , X86::VBROADCASTSSYrr },
+ { X86::VCVTDQ2PDZ256rm , X86::VCVTDQ2PDYrm },
+ { X86::VCVTDQ2PDZ256rr , X86::VCVTDQ2PDYrr },
+ { X86::VCVTDQ2PSZ256rm , X86::VCVTDQ2PSYrm },
+ { X86::VCVTDQ2PSZ256rr , X86::VCVTDQ2PSYrr },
+ { X86::VCVTPD2DQZ256rm , X86::VCVTPD2DQYrm },
+ { X86::VCVTPD2DQZ256rr , X86::VCVTPD2DQYrr },
+ { X86::VCVTPD2PSZ256rm , X86::VCVTPD2PSYrm },
+ { X86::VCVTPD2PSZ256rr , X86::VCVTPD2PSYrr },
+ { X86::VCVTPH2PSZ256rm , X86::VCVTPH2PSYrm },
+ { X86::VCVTPH2PSZ256rr , X86::VCVTPH2PSYrr },
+ { X86::VCVTPS2DQZ256rm , X86::VCVTPS2DQYrm },
+ { X86::VCVTPS2DQZ256rr , X86::VCVTPS2DQYrr },
+ { X86::VCVTPS2PDZ256rm , X86::VCVTPS2PDYrm },
+ { X86::VCVTPS2PDZ256rr , X86::VCVTPS2PDYrr },
+ { X86::VCVTPS2PHZ256mr , X86::VCVTPS2PHYmr },
+ { X86::VCVTPS2PHZ256rr , X86::VCVTPS2PHYrr },
+ { X86::VCVTTPD2DQZ256rm , X86::VCVTTPD2DQYrm },
+ { X86::VCVTTPD2DQZ256rr , X86::VCVTTPD2DQYrr },
+ { X86::VCVTTPS2DQZ256rm , X86::VCVTTPS2DQYrm },
+ { X86::VCVTTPS2DQZ256rr , X86::VCVTTPS2DQYrr },
+ { X86::VDIVPDZ256rm , X86::VDIVPDYrm },
+ { X86::VDIVPDZ256rr , X86::VDIVPDYrr },
+ { X86::VDIVPSZ256rm , X86::VDIVPSYrm },
+ { X86::VDIVPSZ256rr , X86::VDIVPSYrr },
+ { X86::VFMADD132PDZ256m , X86::VFMADD132PDYm },
+ { X86::VFMADD132PDZ256r , X86::VFMADD132PDYr },
+ { X86::VFMADD132PSZ256m , X86::VFMADD132PSYm },
+ { X86::VFMADD132PSZ256r , X86::VFMADD132PSYr },
+ { X86::VFMADD213PDZ256m , X86::VFMADD213PDYm },
+ { X86::VFMADD213PDZ256r , X86::VFMADD213PDYr },
+ { X86::VFMADD213PSZ256m , X86::VFMADD213PSYm },
+ { X86::VFMADD213PSZ256r , X86::VFMADD213PSYr },
+ { X86::VFMADD231PDZ256m , X86::VFMADD231PDYm },
+ { X86::VFMADD231PDZ256r , X86::VFMADD231PDYr },
+ { X86::VFMADD231PSZ256m , X86::VFMADD231PSYm },
+ { X86::VFMADD231PSZ256r , X86::VFMADD231PSYr },
+ { X86::VFMADDSUB132PDZ256m , X86::VFMADDSUB132PDYm },
+ { X86::VFMADDSUB132PDZ256r , X86::VFMADDSUB132PDYr },
+ { X86::VFMADDSUB132PSZ256m , X86::VFMADDSUB132PSYm },
+ { X86::VFMADDSUB132PSZ256r , X86::VFMADDSUB132PSYr },
+ { X86::VFMADDSUB213PDZ256m , X86::VFMADDSUB213PDYm },
+ { X86::VFMADDSUB213PDZ256r , X86::VFMADDSUB213PDYr },
+ { X86::VFMADDSUB213PSZ256m , X86::VFMADDSUB213PSYm },
+ { X86::VFMADDSUB213PSZ256r , X86::VFMADDSUB213PSYr },
+ { X86::VFMADDSUB231PDZ256m , X86::VFMADDSUB231PDYm },
+ { X86::VFMADDSUB231PDZ256r , X86::VFMADDSUB231PDYr },
+ { X86::VFMADDSUB231PSZ256m , X86::VFMADDSUB231PSYm },
+ { X86::VFMADDSUB231PSZ256r , X86::VFMADDSUB231PSYr },
+ { X86::VFMSUB132PDZ256m , X86::VFMSUB132PDYm },
+ { X86::VFMSUB132PDZ256r , X86::VFMSUB132PDYr },
+ { X86::VFMSUB132PSZ256m , X86::VFMSUB132PSYm },
+ { X86::VFMSUB132PSZ256r , X86::VFMSUB132PSYr },
+ { X86::VFMSUB213PDZ256m , X86::VFMSUB213PDYm },
+ { X86::VFMSUB213PDZ256r , X86::VFMSUB213PDYr },
+ { X86::VFMSUB213PSZ256m , X86::VFMSUB213PSYm },
+ { X86::VFMSUB213PSZ256r , X86::VFMSUB213PSYr },
+ { X86::VFMSUB231PDZ256m , X86::VFMSUB231PDYm },
+ { X86::VFMSUB231PDZ256r , X86::VFMSUB231PDYr },
+ { X86::VFMSUB231PSZ256m , X86::VFMSUB231PSYm },
+ { X86::VFMSUB231PSZ256r , X86::VFMSUB231PSYr },
+ { X86::VFMSUBADD132PDZ256m , X86::VFMSUBADD132PDYm },
+ { X86::VFMSUBADD132PDZ256r , X86::VFMSUBADD132PDYr },
+ { X86::VFMSUBADD132PSZ256m , X86::VFMSUBADD132PSYm },
+ { X86::VFMSUBADD132PSZ256r , X86::VFMSUBADD132PSYr },
+ { X86::VFMSUBADD213PDZ256m , X86::VFMSUBADD213PDYm },
+ { X86::VFMSUBADD213PDZ256r , X86::VFMSUBADD213PDYr },
+ { X86::VFMSUBADD213PSZ256m , X86::VFMSUBADD213PSYm },
+ { X86::VFMSUBADD213PSZ256r , X86::VFMSUBADD213PSYr },
+ { X86::VFMSUBADD231PDZ256m , X86::VFMSUBADD231PDYm },
+ { X86::VFMSUBADD231PDZ256r , X86::VFMSUBADD231PDYr },
+ { X86::VFMSUBADD231PSZ256m , X86::VFMSUBADD231PSYm },
+ { X86::VFMSUBADD231PSZ256r , X86::VFMSUBADD231PSYr },
+ { X86::VFNMADD132PDZ256m , X86::VFNMADD132PDYm },
+ { X86::VFNMADD132PDZ256r , X86::VFNMADD132PDYr },
+ { X86::VFNMADD132PSZ256m , X86::VFNMADD132PSYm },
+ { X86::VFNMADD132PSZ256r , X86::VFNMADD132PSYr },
+ { X86::VFNMADD213PDZ256m , X86::VFNMADD213PDYm },
+ { X86::VFNMADD213PDZ256r , X86::VFNMADD213PDYr },
+ { X86::VFNMADD213PSZ256m , X86::VFNMADD213PSYm },
+ { X86::VFNMADD213PSZ256r , X86::VFNMADD213PSYr },
+ { X86::VFNMADD231PDZ256m , X86::VFNMADD231PDYm },
+ { X86::VFNMADD231PDZ256r , X86::VFNMADD231PDYr },
+ { X86::VFNMADD231PSZ256m , X86::VFNMADD231PSYm },
+ { X86::VFNMADD231PSZ256r , X86::VFNMADD231PSYr },
+ { X86::VFNMSUB132PDZ256m , X86::VFNMSUB132PDYm },
+ { X86::VFNMSUB132PDZ256r , X86::VFNMSUB132PDYr },
+ { X86::VFNMSUB132PSZ256m , X86::VFNMSUB132PSYm },
+ { X86::VFNMSUB132PSZ256r , X86::VFNMSUB132PSYr },
+ { X86::VFNMSUB213PDZ256m , X86::VFNMSUB213PDYm },
+ { X86::VFNMSUB213PDZ256r , X86::VFNMSUB213PDYr },
+ { X86::VFNMSUB213PSZ256m , X86::VFNMSUB213PSYm },
+ { X86::VFNMSUB213PSZ256r , X86::VFNMSUB213PSYr },
+ { X86::VFNMSUB231PDZ256m , X86::VFNMSUB231PDYm },
+ { X86::VFNMSUB231PDZ256r , X86::VFNMSUB231PDYr },
+ { X86::VFNMSUB231PSZ256m , X86::VFNMSUB231PSYm },
+ { X86::VFNMSUB231PSZ256r , X86::VFNMSUB231PSYr },
+ { X86::VMAXCPDZ256rm , X86::VMAXCPDYrm },
+ { X86::VMAXCPDZ256rr , X86::VMAXCPDYrr },
+ { X86::VMAXCPSZ256rm , X86::VMAXCPSYrm },
+ { X86::VMAXCPSZ256rr , X86::VMAXCPSYrr },
+ { X86::VMAXPDZ256rm , X86::VMAXPDYrm },
+ { X86::VMAXPDZ256rr , X86::VMAXPDYrr },
+ { X86::VMAXPSZ256rm , X86::VMAXPSYrm },
+ { X86::VMAXPSZ256rr , X86::VMAXPSYrr },
+ { X86::VMINCPDZ256rm , X86::VMINCPDYrm },
+ { X86::VMINCPDZ256rr , X86::VMINCPDYrr },
+ { X86::VMINCPSZ256rm , X86::VMINCPSYrm },
+ { X86::VMINCPSZ256rr , X86::VMINCPSYrr },
+ { X86::VMINPDZ256rm , X86::VMINPDYrm },
+ { X86::VMINPDZ256rr , X86::VMINPDYrr },
+ { X86::VMINPSZ256rm , X86::VMINPSYrm },
+ { X86::VMINPSZ256rr , X86::VMINPSYrr },
+ { X86::VMOVAPDZ256mr , X86::VMOVAPDYmr },
+ { X86::VMOVAPDZ256rm , X86::VMOVAPDYrm },
+ { X86::VMOVAPDZ256rr , X86::VMOVAPDYrr },
+ { X86::VMOVAPDZ256rr_REV , X86::VMOVAPDYrr_REV },
+ { X86::VMOVAPSZ256mr , X86::VMOVAPSYmr },
+ { X86::VMOVAPSZ256rm , X86::VMOVAPSYrm },
+ { X86::VMOVAPSZ256rr , X86::VMOVAPSYrr },
+ { X86::VMOVAPSZ256rr_REV , X86::VMOVAPSYrr_REV },
+ { X86::VMOVDDUPZ256rm , X86::VMOVDDUPYrm },
+ { X86::VMOVDDUPZ256rr , X86::VMOVDDUPYrr },
+ { X86::VMOVDQA32Z256mr , X86::VMOVDQAYmr },
+ { X86::VMOVDQA32Z256rm , X86::VMOVDQAYrm },
+ { X86::VMOVDQA32Z256rr , X86::VMOVDQAYrr },
+ { X86::VMOVDQA32Z256rr_REV , X86::VMOVDQAYrr_REV },
+ { X86::VMOVDQA64Z256mr , X86::VMOVDQAYmr },
+ { X86::VMOVDQA64Z256rm , X86::VMOVDQAYrm },
+ { X86::VMOVDQA64Z256rr , X86::VMOVDQAYrr },
+ { X86::VMOVDQA64Z256rr_REV , X86::VMOVDQAYrr_REV },
+ { X86::VMOVDQU16Z256mr , X86::VMOVDQUYmr },
+ { X86::VMOVDQU16Z256rm , X86::VMOVDQUYrm },
+ { X86::VMOVDQU16Z256rr , X86::VMOVDQUYrr },
+ { X86::VMOVDQU16Z256rr_REV , X86::VMOVDQUYrr_REV },
+ { X86::VMOVDQU32Z256mr , X86::VMOVDQUYmr },
+ { X86::VMOVDQU32Z256rm , X86::VMOVDQUYrm },
+ { X86::VMOVDQU32Z256rr , X86::VMOVDQUYrr },
+ { X86::VMOVDQU32Z256rr_REV , X86::VMOVDQUYrr_REV },
+ { X86::VMOVDQU64Z256mr , X86::VMOVDQUYmr },
+ { X86::VMOVDQU64Z256rm , X86::VMOVDQUYrm },
+ { X86::VMOVDQU64Z256rr , X86::VMOVDQUYrr },
+ { X86::VMOVDQU64Z256rr_REV , X86::VMOVDQUYrr_REV },
+ { X86::VMOVDQU8Z256mr , X86::VMOVDQUYmr },
+ { X86::VMOVDQU8Z256rm , X86::VMOVDQUYrm },
+ { X86::VMOVDQU8Z256rr , X86::VMOVDQUYrr },
+ { X86::VMOVDQU8Z256rr_REV , X86::VMOVDQUYrr_REV },
+ { X86::VMOVNTDQAZ256rm , X86::VMOVNTDQAYrm },
+ { X86::VMOVNTDQZ256mr , X86::VMOVNTDQYmr },
+ { X86::VMOVNTPDZ256mr , X86::VMOVNTPDYmr },
+ { X86::VMOVNTPSZ256mr , X86::VMOVNTPSYmr },
+ { X86::VMOVSHDUPZ256rm , X86::VMOVSHDUPYrm },
+ { X86::VMOVSHDUPZ256rr , X86::VMOVSHDUPYrr },
+ { X86::VMOVSLDUPZ256rm , X86::VMOVSLDUPYrm },
+ { X86::VMOVSLDUPZ256rr , X86::VMOVSLDUPYrr },
+ { X86::VMOVUPDZ256mr , X86::VMOVUPDYmr },
+ { X86::VMOVUPDZ256rm , X86::VMOVUPDYrm },
+ { X86::VMOVUPDZ256rr , X86::VMOVUPDYrr },
+ { X86::VMOVUPDZ256rr_REV , X86::VMOVUPDYrr_REV },
+ { X86::VMOVUPSZ256mr , X86::VMOVUPSYmr },
+ { X86::VMOVUPSZ256rm , X86::VMOVUPSYrm },
+ { X86::VMOVUPSZ256rr , X86::VMOVUPSYrr },
+ { X86::VMOVUPSZ256rr_REV , X86::VMOVUPSYrr_REV },
+ { X86::VMULPDZ256rm , X86::VMULPDYrm },
+ { X86::VMULPDZ256rr , X86::VMULPDYrr },
+ { X86::VMULPSZ256rm , X86::VMULPSYrm },
+ { X86::VMULPSZ256rr , X86::VMULPSYrr },
+ { X86::VORPDZ256rm , X86::VORPDYrm },
+ { X86::VORPDZ256rr , X86::VORPDYrr },
+ { X86::VORPSZ256rm , X86::VORPSYrm },
+ { X86::VORPSZ256rr , X86::VORPSYrr },
+ { X86::VPABSBZ256rm , X86::VPABSBYrm },
+ { X86::VPABSBZ256rr , X86::VPABSBYrr },
+ { X86::VPABSDZ256rm , X86::VPABSDYrm },
+ { X86::VPABSDZ256rr , X86::VPABSDYrr },
+ { X86::VPABSWZ256rm , X86::VPABSWYrm },
+ { X86::VPABSWZ256rr , X86::VPABSWYrr },
+ { X86::VPACKSSDWZ256rm , X86::VPACKSSDWYrm },
+ { X86::VPACKSSDWZ256rr , X86::VPACKSSDWYrr },
+ { X86::VPACKSSWBZ256rm , X86::VPACKSSWBYrm },
+ { X86::VPACKSSWBZ256rr , X86::VPACKSSWBYrr },
+ { X86::VPACKUSDWZ256rm , X86::VPACKUSDWYrm },
+ { X86::VPACKUSDWZ256rr , X86::VPACKUSDWYrr },
+ { X86::VPACKUSWBZ256rm , X86::VPACKUSWBYrm },
+ { X86::VPACKUSWBZ256rr , X86::VPACKUSWBYrr },
+ { X86::VPADDBZ256rm , X86::VPADDBYrm },
+ { X86::VPADDBZ256rr , X86::VPADDBYrr },
+ { X86::VPADDDZ256rm , X86::VPADDDYrm },
+ { X86::VPADDDZ256rr , X86::VPADDDYrr },
+ { X86::VPADDQZ256rm , X86::VPADDQYrm },
+ { X86::VPADDQZ256rr , X86::VPADDQYrr },
+ { X86::VPADDSBZ256rm , X86::VPADDSBYrm },
+ { X86::VPADDSBZ256rr , X86::VPADDSBYrr },
+ { X86::VPADDSWZ256rm , X86::VPADDSWYrm },
+ { X86::VPADDSWZ256rr , X86::VPADDSWYrr },
+ { X86::VPADDUSBZ256rm , X86::VPADDUSBYrm },
+ { X86::VPADDUSBZ256rr , X86::VPADDUSBYrr },
+ { X86::VPADDUSWZ256rm , X86::VPADDUSWYrm },
+ { X86::VPADDUSWZ256rr , X86::VPADDUSWYrr },
+ { X86::VPADDWZ256rm , X86::VPADDWYrm },
+ { X86::VPADDWZ256rr , X86::VPADDWYrr },
+ { X86::VPALIGNRZ256rmi , X86::VPALIGNRYrmi },
+ { X86::VPALIGNRZ256rri , X86::VPALIGNRYrri },
+ { X86::VPANDDZ256rm , X86::VPANDYrm },
+ { X86::VPANDDZ256rr , X86::VPANDYrr },
+ { X86::VPANDQZ256rm , X86::VPANDYrm },
+ { X86::VPANDQZ256rr , X86::VPANDYrr },
+ { X86::VPAVGBZ256rm , X86::VPAVGBYrm },
+ { X86::VPAVGBZ256rr , X86::VPAVGBYrr },
+ { X86::VPAVGWZ256rm , X86::VPAVGWYrm },
+ { X86::VPAVGWZ256rr , X86::VPAVGWYrr },
+ { X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm },
+ { X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr },
+ { X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm },
+ { X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr },
+ { X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm },
+ { X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr },
+ { X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm },
+ { X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr },
+ { X86::VPERMDZ256rm , X86::VPERMDYrm },
+ { X86::VPERMDZ256rr , X86::VPERMDYrr },
+ { X86::VPERMILPDZ256mi , X86::VPERMILPDYmi },
+ { X86::VPERMILPDZ256ri , X86::VPERMILPDYri },
+ { X86::VPERMILPDZ256rm , X86::VPERMILPDYrm },
+ { X86::VPERMILPDZ256rr , X86::VPERMILPDYrr },
+ { X86::VPERMILPSZ256mi , X86::VPERMILPSYmi },
+ { X86::VPERMILPSZ256ri , X86::VPERMILPSYri },
+ { X86::VPERMILPSZ256rm , X86::VPERMILPSYrm },
+ { X86::VPERMILPSZ256rr , X86::VPERMILPSYrr },
+ { X86::VPERMPDZ256mi , X86::VPERMPDYmi },
+ { X86::VPERMPDZ256ri , X86::VPERMPDYri },
+ { X86::VPERMPSZ256rm , X86::VPERMPSYrm },
+ { X86::VPERMPSZ256rr , X86::VPERMPSYrr },
+ { X86::VPERMQZ256mi , X86::VPERMQYmi },
+ { X86::VPERMQZ256ri , X86::VPERMQYri },
+ { X86::VPMADDUBSWZ256rm , X86::VPMADDUBSWYrm },
+ { X86::VPMADDUBSWZ256rr , X86::VPMADDUBSWYrr },
+ { X86::VPMADDWDZ256rm , X86::VPMADDWDYrm },
+ { X86::VPMADDWDZ256rr , X86::VPMADDWDYrr },
+ { X86::VPMAXSBZ256rm , X86::VPMAXSBYrm },
+ { X86::VPMAXSBZ256rr , X86::VPMAXSBYrr },
+ { X86::VPMAXSDZ256rm , X86::VPMAXSDYrm },
+ { X86::VPMAXSDZ256rr , X86::VPMAXSDYrr },
+ { X86::VPMAXSWZ256rm , X86::VPMAXSWYrm },
+ { X86::VPMAXSWZ256rr , X86::VPMAXSWYrr },
+ { X86::VPMAXUBZ256rm , X86::VPMAXUBYrm },
+ { X86::VPMAXUBZ256rr , X86::VPMAXUBYrr },
+ { X86::VPMAXUDZ256rm , X86::VPMAXUDYrm },
+ { X86::VPMAXUDZ256rr , X86::VPMAXUDYrr },
+ { X86::VPMAXUWZ256rm , X86::VPMAXUWYrm },
+ { X86::VPMAXUWZ256rr , X86::VPMAXUWYrr },
+ { X86::VPMINSBZ256rm , X86::VPMINSBYrm },
+ { X86::VPMINSBZ256rr , X86::VPMINSBYrr },
+ { X86::VPMINSDZ256rm , X86::VPMINSDYrm },
+ { X86::VPMINSDZ256rr , X86::VPMINSDYrr },
+ { X86::VPMINSWZ256rm , X86::VPMINSWYrm },
+ { X86::VPMINSWZ256rr , X86::VPMINSWYrr },
+ { X86::VPMINUBZ256rm , X86::VPMINUBYrm },
+ { X86::VPMINUBZ256rr , X86::VPMINUBYrr },
+ { X86::VPMINUDZ256rm , X86::VPMINUDYrm },
+ { X86::VPMINUDZ256rr , X86::VPMINUDYrr },
+ { X86::VPMINUWZ256rm , X86::VPMINUWYrm },
+ { X86::VPMINUWZ256rr , X86::VPMINUWYrr },
+ { X86::VPMOVSXBDZ256rm , X86::VPMOVSXBDYrm },
+ { X86::VPMOVSXBDZ256rr , X86::VPMOVSXBDYrr },
+ { X86::VPMOVSXBQZ256rm , X86::VPMOVSXBQYrm },
+ { X86::VPMOVSXBQZ256rr , X86::VPMOVSXBQYrr },
+ { X86::VPMOVSXBWZ256rm , X86::VPMOVSXBWYrm },
+ { X86::VPMOVSXBWZ256rr , X86::VPMOVSXBWYrr },
+ { X86::VPMOVSXDQZ256rm , X86::VPMOVSXDQYrm },
+ { X86::VPMOVSXDQZ256rr , X86::VPMOVSXDQYrr },
+ { X86::VPMOVSXWDZ256rm , X86::VPMOVSXWDYrm },
+ { X86::VPMOVSXWDZ256rr , X86::VPMOVSXWDYrr },
+ { X86::VPMOVSXWQZ256rm , X86::VPMOVSXWQYrm },
+ { X86::VPMOVSXWQZ256rr , X86::VPMOVSXWQYrr },
+ { X86::VPMOVZXBDZ256rm , X86::VPMOVZXBDYrm },
+ { X86::VPMOVZXBDZ256rr , X86::VPMOVZXBDYrr },
+ { X86::VPMOVZXBQZ256rm , X86::VPMOVZXBQYrm },
+ { X86::VPMOVZXBQZ256rr , X86::VPMOVZXBQYrr },
+ { X86::VPMOVZXBWZ256rm , X86::VPMOVZXBWYrm },
+ { X86::VPMOVZXBWZ256rr , X86::VPMOVZXBWYrr },
+ { X86::VPMOVZXDQZ256rm , X86::VPMOVZXDQYrm },
+ { X86::VPMOVZXDQZ256rr , X86::VPMOVZXDQYrr },
+ { X86::VPMOVZXWDZ256rm , X86::VPMOVZXWDYrm },
+ { X86::VPMOVZXWDZ256rr , X86::VPMOVZXWDYrr },
+ { X86::VPMOVZXWQZ256rm , X86::VPMOVZXWQYrm },
+ { X86::VPMOVZXWQZ256rr , X86::VPMOVZXWQYrr },
+ { X86::VPMULDQZ256rm , X86::VPMULDQYrm },
+ { X86::VPMULDQZ256rr , X86::VPMULDQYrr },
+ { X86::VPMULHRSWZ256rm , X86::VPMULHRSWYrm },
+ { X86::VPMULHRSWZ256rr , X86::VPMULHRSWYrr },
+ { X86::VPMULHUWZ256rm , X86::VPMULHUWYrm },
+ { X86::VPMULHUWZ256rr , X86::VPMULHUWYrr },
+ { X86::VPMULHWZ256rm , X86::VPMULHWYrm },
+ { X86::VPMULHWZ256rr , X86::VPMULHWYrr },
+ { X86::VPMULLDZ256rm , X86::VPMULLDYrm },
+ { X86::VPMULLDZ256rr , X86::VPMULLDYrr },
+ { X86::VPMULLWZ256rm , X86::VPMULLWYrm },
+ { X86::VPMULLWZ256rr , X86::VPMULLWYrr },
+ { X86::VPMULUDQZ256rm , X86::VPMULUDQYrm },
+ { X86::VPMULUDQZ256rr , X86::VPMULUDQYrr },
+ { X86::VPORDZ256rm , X86::VPORYrm },
+ { X86::VPORDZ256rr , X86::VPORYrr },
+ { X86::VPORQZ256rm , X86::VPORYrm },
+ { X86::VPORQZ256rr , X86::VPORYrr },
+ { X86::VPSADBWZ256rm , X86::VPSADBWYrm },
+ { X86::VPSADBWZ256rr , X86::VPSADBWYrr },
+ { X86::VPSHUFBZ256rm , X86::VPSHUFBYrm },
+ { X86::VPSHUFBZ256rr , X86::VPSHUFBYrr },
+ { X86::VPSHUFDZ256mi , X86::VPSHUFDYmi },
+ { X86::VPSHUFDZ256ri , X86::VPSHUFDYri },
+ { X86::VPSHUFHWZ256mi , X86::VPSHUFHWYmi },
+ { X86::VPSHUFHWZ256ri , X86::VPSHUFHWYri },
+ { X86::VPSHUFLWZ256mi , X86::VPSHUFLWYmi },
+ { X86::VPSHUFLWZ256ri , X86::VPSHUFLWYri },
+ { X86::VPSLLDQZ256rr , X86::VPSLLDQYri },
+ { X86::VPSLLDZ256ri , X86::VPSLLDYri },
+ { X86::VPSLLDZ256rm , X86::VPSLLDYrm },
+ { X86::VPSLLDZ256rr , X86::VPSLLDYrr },
+ { X86::VPSLLQZ256ri , X86::VPSLLQYri },
+ { X86::VPSLLQZ256rm , X86::VPSLLQYrm },
+ { X86::VPSLLQZ256rr , X86::VPSLLQYrr },
+ { X86::VPSLLVDZ256rm , X86::VPSLLVDYrm },
+ { X86::VPSLLVDZ256rr , X86::VPSLLVDYrr },
+ { X86::VPSLLVQZ256rm , X86::VPSLLVQYrm },
+ { X86::VPSLLVQZ256rr , X86::VPSLLVQYrr },
+ { X86::VPSLLWZ256ri , X86::VPSLLWYri },
+ { X86::VPSLLWZ256rm , X86::VPSLLWYrm },
+ { X86::VPSLLWZ256rr , X86::VPSLLWYrr },
+ { X86::VPSRADZ256ri , X86::VPSRADYri },
+ { X86::VPSRADZ256rm , X86::VPSRADYrm },
+ { X86::VPSRADZ256rr , X86::VPSRADYrr },
+ { X86::VPSRAVDZ256rm , X86::VPSRAVDYrm },
+ { X86::VPSRAVDZ256rr , X86::VPSRAVDYrr },
+ { X86::VPSRAWZ256ri , X86::VPSRAWYri },
+ { X86::VPSRAWZ256rm , X86::VPSRAWYrm },
+ { X86::VPSRAWZ256rr , X86::VPSRAWYrr },
+ { X86::VPSRLDQZ256rr , X86::VPSRLDQYri },
+ { X86::VPSRLDZ256ri , X86::VPSRLDYri },
+ { X86::VPSRLDZ256rm , X86::VPSRLDYrm },
+ { X86::VPSRLDZ256rr , X86::VPSRLDYrr },
+ { X86::VPSRLQZ256ri , X86::VPSRLQYri },
+ { X86::VPSRLQZ256rm , X86::VPSRLQYrm },
+ { X86::VPSRLQZ256rr , X86::VPSRLQYrr },
+ { X86::VPSRLVDZ256rm , X86::VPSRLVDYrm },
+ { X86::VPSRLVDZ256rr , X86::VPSRLVDYrr },
+ { X86::VPSRLVQZ256rm , X86::VPSRLVQYrm },
+ { X86::VPSRLVQZ256rr , X86::VPSRLVQYrr },
+ { X86::VPSRLWZ256ri , X86::VPSRLWYri },
+ { X86::VPSRLWZ256rm , X86::VPSRLWYrm },
+ { X86::VPSRLWZ256rr , X86::VPSRLWYrr },
+ { X86::VPSUBBZ256rm , X86::VPSUBBYrm },
+ { X86::VPSUBBZ256rr , X86::VPSUBBYrr },
+ { X86::VPSUBDZ256rm , X86::VPSUBDYrm },
+ { X86::VPSUBDZ256rr , X86::VPSUBDYrr },
+ { X86::VPSUBQZ256rm , X86::VPSUBQYrm },
+ { X86::VPSUBQZ256rr , X86::VPSUBQYrr },
+ { X86::VPSUBSBZ256rm , X86::VPSUBSBYrm },
+ { X86::VPSUBSBZ256rr , X86::VPSUBSBYrr },
+ { X86::VPSUBSWZ256rm , X86::VPSUBSWYrm },
+ { X86::VPSUBSWZ256rr , X86::VPSUBSWYrr },
+ { X86::VPSUBUSBZ256rm , X86::VPSUBUSBYrm },
+ { X86::VPSUBUSBZ256rr , X86::VPSUBUSBYrr },
+ { X86::VPSUBUSWZ256rm , X86::VPSUBUSWYrm },
+ { X86::VPSUBUSWZ256rr , X86::VPSUBUSWYrr },
+ { X86::VPSUBWZ256rm , X86::VPSUBWYrm },
+ { X86::VPSUBWZ256rr , X86::VPSUBWYrr },
+ { X86::VPUNPCKHBWZ256rm , X86::VPUNPCKHBWYrm },
+ { X86::VPUNPCKHBWZ256rr , X86::VPUNPCKHBWYrr },
+ { X86::VPUNPCKHDQZ256rm , X86::VPUNPCKHDQYrm },
+ { X86::VPUNPCKHDQZ256rr , X86::VPUNPCKHDQYrr },
+ { X86::VPUNPCKHQDQZ256rm , X86::VPUNPCKHQDQYrm },
+ { X86::VPUNPCKHQDQZ256rr , X86::VPUNPCKHQDQYrr },
+ { X86::VPUNPCKHWDZ256rm , X86::VPUNPCKHWDYrm },
+ { X86::VPUNPCKHWDZ256rr , X86::VPUNPCKHWDYrr },
+ { X86::VPUNPCKLBWZ256rm , X86::VPUNPCKLBWYrm },
+ { X86::VPUNPCKLBWZ256rr , X86::VPUNPCKLBWYrr },
+ { X86::VPUNPCKLDQZ256rm , X86::VPUNPCKLDQYrm },
+ { X86::VPUNPCKLDQZ256rr , X86::VPUNPCKLDQYrr },
+ { X86::VPUNPCKLQDQZ256rm , X86::VPUNPCKLQDQYrm },
+ { X86::VPUNPCKLQDQZ256rr , X86::VPUNPCKLQDQYrr },
+ { X86::VPUNPCKLWDZ256rm , X86::VPUNPCKLWDYrm },
+ { X86::VPUNPCKLWDZ256rr , X86::VPUNPCKLWDYrr },
+ { X86::VPXORDZ256rm , X86::VPXORYrm },
+ { X86::VPXORDZ256rr , X86::VPXORYrr },
+ { X86::VPXORQZ256rm , X86::VPXORYrm },
+ { X86::VPXORQZ256rr , X86::VPXORYrr },
+ { X86::VSHUFPDZ256rmi , X86::VSHUFPDYrmi },
+ { X86::VSHUFPDZ256rri , X86::VSHUFPDYrri },
+ { X86::VSHUFPSZ256rmi , X86::VSHUFPSYrmi },
+ { X86::VSHUFPSZ256rri , X86::VSHUFPSYrri },
+ { X86::VSQRTPDZ256m , X86::VSQRTPDYm },
+ { X86::VSQRTPDZ256r , X86::VSQRTPDYr },
+ { X86::VSQRTPSZ256m , X86::VSQRTPSYm },
+ { X86::VSQRTPSZ256r , X86::VSQRTPSYr },
+ { X86::VSUBPDZ256rm , X86::VSUBPDYrm },
+ { X86::VSUBPDZ256rr , X86::VSUBPDYrr },
+ { X86::VSUBPSZ256rm , X86::VSUBPSYrm },
+ { X86::VSUBPSZ256rr , X86::VSUBPSYrr },
+ { X86::VUNPCKHPDZ256rm , X86::VUNPCKHPDYrm },
+ { X86::VUNPCKHPDZ256rr , X86::VUNPCKHPDYrr },
+ { X86::VUNPCKHPSZ256rm , X86::VUNPCKHPSYrm },
+ { X86::VUNPCKHPSZ256rr , X86::VUNPCKHPSYrr },
+ { X86::VUNPCKLPDZ256rm , X86::VUNPCKLPDYrm },
+ { X86::VUNPCKLPDZ256rr , X86::VUNPCKLPDYrr },
+ { X86::VUNPCKLPSZ256rm , X86::VUNPCKLPSYrm },
+ { X86::VUNPCKLPSZ256rr , X86::VUNPCKLPSYrr },
+ { X86::VXORPDZ256rm , X86::VXORPDYrm },
+ { X86::VXORPDZ256rr , X86::VXORPDYrr },
+ { X86::VXORPSZ256rm , X86::VXORPSYrm },
+ { X86::VXORPSZ256rr , X86::VXORPSYrr },
+};
+
+#endif \ No newline at end of file
diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
new file mode 100644
index 000000000000..2ea27a934b47
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
@@ -0,0 +1,66 @@
+//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel VMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VMX instructions
+
+// All VMX defs below have an empty selection pattern ([]), so instruction
+// selection never produces them from patterns. Where a register operand is
+// involved, the 32-bit variant requires Not64BitMode and the 64-bit variant
+// requires In64BitMode.
+
+// 66 0F 38 80
+def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
+def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode]>;
+// 66 0F 38 81
+def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
+def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode]>;
+// 0F 01 C1
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
+def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmclear\t$vmcs", []>, PD;
+// 0F 01 D4
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
+// 0F 01 C2
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
+// 0F 01 C3
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
+def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmptrld\t$vmcs", []>, PS;
+def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
+ "vmptrst\t$vmcs", []>, TB;
+// VMREAD: opcode 0x78, destination is the r/m operand (MRMDestMem/MRMDestReg).
+def VMREAD64rm : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD32rm : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+// VMWRITE: opcode 0x79, source is the r/m operand (MRMSrcMem/MRMSrcReg).
+// NOTE(review): the register operand is modeled as an output ($dst) here,
+// while the Intel SDM describes it as an input naming the VMCS field to
+// write -- confirm whether it should be listed under (ins) instead.
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+// 0F 01 C4
+def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
+def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
+ "vmxon\t$vmxon", []>, XS;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
new file mode 100644
index 000000000000..2b296e1e5b85
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
@@ -0,0 +1,427 @@
+//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes XOP (eXtended OPerations)
+//
+//===----------------------------------------------------------------------===//
+
+// One-source, one-destination XOP instruction on 128-bit registers, selected
+// from the intrinsic Int.
+//   rr - source in a VR128 register.
+//   rm - source loaded from i128mem through memop and bitconverted before
+//        being passed to the intrinsic.
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+}
+
+// vphsub*/vphadd* instantiations of xop2op. All are in the packed-integer
+// execution domain and all use a v2i64 load for the memory form.
+let ExeDomain = SSEPackedInt in {
+ defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
+ defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
+ defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
+ defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
+ defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
+ defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
+ defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
+ defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
+ defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
+ defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
+ defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
+ defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
+ defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
+ defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
+ defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
+}
+
+// Scalar load 2 addr operand instructions
+// Same shape as the other one-source XOP multiclasses, but the memory form
+// takes its operand class (memop) and the complex pattern that matches the
+// load (mem_cpat) as parameters instead of a PatFrag on addr.
+multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ Operand memop, ComplexPattern mem_cpat> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP;
+}
+
+// 128-bit one-source XOP instruction selected from the intrinsic Int.
+// Differs from xop2op only in using f128mem for the memory operand.
+multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ PatFrag memop> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+}
+
+// 256-bit counterpart of xop2op128: operates on VR256/f256mem, adds VEX_L,
+// and names the records rrY/rmY so they can share a defm prefix with the
+// 128-bit forms.
+multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ PatFrag memop> {
+ def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L;
+ def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L;
+}
+
+// vfrcz* single-precision forms: scalar (ss), 128-bit packed and 256-bit
+// packed. The two VFRCZPS defms share a prefix; their records do not collide
+// because the 256-bit multiclass uses the rrY/rmY suffixes.
+let ExeDomain = SSEPackedSingle in {
+ defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+ ssmem, sse_load_f32>;
+ defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
+ defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
+}
+
+// vfrcz* double-precision forms: scalar (sd), 128-bit packed and 256-bit
+// packed, mirroring the single-precision group.
+let ExeDomain = SSEPackedDouble in {
+ defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+ sdmem, sse_load_f64>;
+ defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
+ defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
+}
+
+// Two-source XOP instruction on 128-bit vectors of type vt128, selected from
+// the SDNode OpNode. Either source may come from memory:
+//   rr     - both sources in registers (MRMSrcReg4VOp3).
+//   rm     - second source from memory (MRMSrcMem, XOP_4V, VEX_W).
+//   mr     - first source from memory (MRMSrcMem4VOp3).
+//   rr_REV - register form with the rm encoding attributes and no pattern;
+//            it is codegen-only and forced into the disassembler tables.
+// Memory operands are loaded as v2i64 and bitconverted to vt128.
+multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128> {
+ def rr : IXOP<opc, MRMSrcReg4VOp3, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>,
+ XOP, Sched<[WriteVarVecShift]>;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+ XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>;
+ def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
+ (ins i128mem:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
+ (vt128 VR128:$src2))))]>,
+ XOP, Sched<[WriteVarVecShift, ReadAfterLd]>;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>,
+ XOP_4V, VEX_W, Sched<[WriteVarVecShift]>;
+}
+
+// vprot*/vpsha*/vpshl* with a vector second operand, for byte, word, dword
+// and qword element types. Each family maps to one SDNode (X86vprot,
+// X86vpsha, X86vpshl) and differs only in opcode and value type.
+let ExeDomain = SSEPackedInt in {
+ defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>;
+ defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>;
+ defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>;
+ defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>;
+ defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
+ defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
+ defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
+ defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>;
+ defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>;
+ defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>;
+ defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>;
+ defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>;
+}
+
+// XOP instruction with one vector source and an 8-bit immediate, selected
+// from the SDNode OpNode on type vt128.
+//   ri - vector source in a register.
+//   mi - vector source loaded from memory as v2i64 and bitconverted.
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128> {
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP;
+}
+
+// Immediate-count vprot* forms (X86vproti). They reuse the VPROT* prefixes of
+// the vector-count forms; the ri/mi record suffixes keep the names distinct.
+let ExeDomain = SSEPackedInt in {
+ defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>;
+ defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>;
+}
+
+// Instruction where second source can be memory, but third must be register
+// Three-source 128-bit XOP instruction selected from the intrinsic Int.
+//   rr - all sources in registers; marked isCommutable.
+//   rm - second source loaded from memory as v2i64 and bitconverted.
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ let isCommutable = 1 in
+ def rr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V;
+ def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+ VR128:$src3))]>, XOP_4V;
+}
+
+// vpmac*/vpmadc* instantiations of xop4opm2, one per XOP intrinsic.
+let ExeDomain = SSEPackedInt in {
+ defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+ defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+ defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+ defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+ defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+ defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+ defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+ defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+ defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+ defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+ defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+ defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+}
+
+// Instruction where second source can be memory, third must be imm8
+// vpcom* comparison on type vt128, selected from the SDNode OpNode.
+//   ri / mi         - the immediate is an XOPCC condition code that is
+//                     printed as part of the mnemonic ("vpcom${cc}" + Suffix)
+//                     and matched as i8immZExt3 in the pattern.
+//   ri_alt / mi_alt - asm-parser-only forms with no pattern that take the
+//                     immediate as an explicit u8imm operand.
+multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> {
+ let isCommutable = 1 in
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
+ !strconcat("vpcom${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ i8immZExt3:$cc)))]>,
+ XOP_4V;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
+ !strconcat("vpcom${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (loadv2i64 addr:$src2))),
+ i8immZExt3:$cc)))]>,
+ XOP_4V;
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat("vpcom", Suffix,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V;
+ // No pattern to infer it from, so the load is declared explicitly.
+ let mayLoad = 1 in
+ def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat("vpcom", Suffix,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V;
+ }
+}
+
+// vpcom* for byte/word/dword/qword elements. The first four use X86vpcom and
+// the "u"-suffixed four use X86vpcomu.
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+ defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>;
+ defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>;
+ defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>;
+ defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>;
+ defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>;
+ defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>;
+ defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>;
+ defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>;
+}
+
+// Three-source XOP instruction on 128-bit vectors of type vt128, selected
+// from the SDNode OpNode. Either the second or the third source may come from
+// memory:
+//   rrr     - all sources in registers.
+//   rrm     - third source from memory (MRMSrcMemOp4, VEX_W).
+//   rmr     - second source from memory (MRMSrcMem).
+//   rrr_REV - register form with the VEX_W encoding and no pattern; it is
+//             codegen-only and forced into the disassembler tables.
+// Memory operands are loaded as v2i64 and bitconverted to vt128. Every
+// pattern uses vt128 for its result type so the multiclass stays correct for
+// any value type it is instantiated with.
+multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128> {
+ def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (vt128 VR128:$src3))))]>,
+ XOP_4V;
+ def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
+ XOP_4V, VEX_W;
+ def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
+ (vt128 VR128:$src3))))]>,
+ XOP_4V;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, VEX_W;
+}
+
+// vpperm: the only instantiation of xop4op, on v16i8 with X86vpperm.
+let ExeDomain = SSEPackedInt in {
+ defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8>;
+}
+
+// Instruction where either second or third source can be memory
+// Intrinsic-selected three-source XOP instruction in 128-bit (Int128) and
+// 256-bit (Int256) flavours. For each width:
+//   rrr / rrrY         - all sources in registers.
+//   rrm / rrmY         - third source from memory (MRMSrcMemOp4, VEX_W).
+//   rmr / rmrY         - second source from memory (MRMSrcMem).
+//   rrr_REV / rrrY_REV - register form with the VEX_W encoding and no
+//                        pattern; codegen-only, forced into the disassembler
+//                        tables.
+// The 256-bit records additionally carry VEX_L. All memory forms use the
+// integer memory operand classes (i128mem/i256mem) and load as v2i64/v4i64.
+multiclass xop4op_int<bits<8> opc, string OpcodeStr,
+ Intrinsic Int128, Intrinsic Int256> {
+ // 128-bit Instruction
+ def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>,
+ XOP_4V;
+ def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, VR128:$src2,
+ (bitconvert (loadv2i64 addr:$src3))))]>,
+ XOP_4V, VEX_W;
+ def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+ VR128:$src3))]>,
+ XOP_4V;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, VEX_W;
+
+ // 256-bit Instruction
+ def rrrY : IXOPi8Reg<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>,
+ XOP_4V, VEX_L;
+ def rrmY : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, VR256:$src2,
+ (bitconvert (loadv4i64 addr:$src3))))]>,
+ XOP_4V, VEX_W, VEX_L;
+ def rmrY : IXOPi8Reg<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
+ VR256:$src3))]>,
+ XOP_4V, VEX_L;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrrY_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, VEX_W, VEX_L;
+}
+
+// vpcmov: 128-bit and 256-bit forms driven by the two XOP vpcmov intrinsics.
+let ExeDomain = SSEPackedInt in {
+ defm VPCMOV : xop4op_int<0xA2, "vpcmov",
+ int_x86_xop_vpcmov, int_x86_xop_vpcmov_256>;
+}
+
+// With XOP available, select VPCMOV for the DAG
+//   (or (and src3, src1), (X86andnp src3, src2))
+// on v2i64 and v4i64, with src3 becoming VPCMOV's third (selector) operand.
+// NOTE(review): this reads as a bitwise select, assuming X86andnp computes
+// (~src3 & src2) -- confirm against the X86andnp node definition.
+let Predicates = [HasXOP] in {
+ def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, VR128:$src2))),
+ (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+ def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, VR256:$src2))),
+ (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+}
+
+// Three vector sources plus an 8-bit immediate, in 128-bit and 256-bit
+// flavours, selected from the SDNode OpNode.
+//   vt128 / vt256   - type of the result and of the first two sources.
+//   id128 / id256   - type of the third source.
+//   ld_128 / ld_256 - load fragments used when the second source is in memory.
+// Forms per width:
+//   rr / rrY         - all sources in registers.
+//   rm / rmY         - third source from memory (MRMSrcMemOp4, VEX_W); loaded
+//                      as v2i64/v4i64 and bitconverted to id128/id256.
+//   mr / mrY         - second source from memory (MRMSrcMem); loaded through
+//                      ld_128/ld_256.
+//   rr_REV / rrY_REV - register form with the VEX_W encoding and no pattern;
+//                      codegen-only, forced into the disassembler tables.
+// The 256-bit records additionally carry VEX_L.
+multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128, ValueType vt256,
+ ValueType id128, ValueType id256,
+ PatFrag ld_128, PatFrag ld_256> {
+ def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (id128 VR128:$src3), (i8 imm:$src4))))]>;
+ def rm : IXOP5<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (id128 (bitconvert (loadv2i64 addr:$src3))),
+ (i8 imm:$src4))))]>,
+ VEX_W;
+ def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (ld_128 addr:$src2))),
+ (id128 VR128:$src3), (i8 imm:$src4))))]>;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ []>, VEX_W;
+
+ def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
+ (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
+ def rmY : IXOP5<opc, MRMSrcMemOp4, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
+ (id256 (bitconvert (loadv4i64 addr:$src3))),
+ (i8 imm:$src4))))]>, VEX_W, VEX_L;
+ def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode (vt256 VR256:$src1),
+ (vt256 (bitconvert (ld_256 addr:$src2))),
+ (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrY_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ []>, VEX_W, VEX_L;
+}
+
+// vpermil2pd: f64 data vectors (v2f64/v4f64) with i64 third-source vectors
+// (v2i64/v4i64).
+let ExeDomain = SSEPackedDouble in
+ defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64,
+ v2i64, v4i64, loadv2f64, loadv4f64>;
+
+// vpermil2ps: f32 data vectors (v4f32/v8f32) with i32 third-source vectors
+// (v4i32/v8i32).
+let ExeDomain = SSEPackedSingle in
+ defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32,
+ v4i32, v8i32, loadv4f32, loadv8f32>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
new file mode 100644
index 000000000000..d9edf4676faf
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -0,0 +1,221 @@
+//===--------- X86InterleavedAccess.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the X86 implementation of the interleaved accesses
+/// optimization generating X86-specific instructions/intrinsics for
+/// interleaved access groups.
+///
+//===--------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "X86TargetMachine.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+/// \brief This class holds necessary information to represent an interleaved
+/// access group and supports utilities to lower the group into
+/// X86-specific instructions/intrinsics.
+/// E.g. A group of interleaving access loads (Factor = 2; accessing every
+/// other element)
+/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
+/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
+class X86InterleavedAccessGroup {
+  /// \brief Reference to the wide-load instruction of an interleaved access
+  /// group.
+  Instruction *const Inst;
+
+  /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
+  ArrayRef<ShuffleVectorInst *> Shuffles;
+
+  /// \brief Reference to the starting index of each user-shuffle.
+  ArrayRef<unsigned> Indices;
+
+  /// \brief The interleaving stride in terms of elements.
+  const unsigned Factor;
+
+  /// \brief Reference to the underlying target.
+  const X86Subtarget &Subtarget;
+
+  /// \brief Data layout of the module that contains 'Inst'; used for all
+  /// type-size queries.
+  const DataLayout &DL;
+
+  /// \brief Builder through which all replacement instructions are created.
+  IRBuilder<> &Builder;
+
+  /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
+  /// sub vectors of type \p T. Returns true and the sub-vectors in
+  /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
+  bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+                 SmallVectorImpl<Instruction *> &DecomposedVectors);
+
+  /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
+  /// returns the transposed-vectors in \p TransposedVectors.
+  /// E.g.
+  /// InputVectors:
+  ///   In-V0 = p1, p2, p3, p4
+  ///   In-V1 = q1, q2, q3, q4
+  ///   In-V2 = r1, r2, r3, r4
+  ///   In-V3 = s1, s2, s3, s4
+  /// OutputVectors:
+  ///   Out-V0 = p1, q1, r1, s1
+  ///   Out-V1 = p2, q2, r2, s2
+  ///   Out-V2 = p3, q3, r3, s3
+  ///   Out-V3 = p4, q4, r4, s4
+  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
+                     SmallVectorImpl<Value *> &TransposedVectors);
+
+public:
+  /// In order to form an interleaved access group X86InterleavedAccessGroup
+  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
+  /// \p Shuffs, reference to the first indices of each interleaved-vector
+  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
+  /// X86-specific instructions/intrinsics it also requires the underlying
+  /// target information \p STarget. New instructions are emitted through the
+  /// builder \p B.
+  ///
+  /// \p Shuffs and \p Ind are kept as non-owning ArrayRefs and \p STarget and
+  /// \p B as references, so all of them must outlive this object.
+  explicit X86InterleavedAccessGroup(Instruction *I,
+                                     ArrayRef<ShuffleVectorInst *> Shuffs,
+                                     ArrayRef<unsigned> Ind,
+                                     const unsigned F,
+                                     const X86Subtarget &STarget,
+                                     IRBuilder<> &B)
+      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
+        DL(Inst->getModule()->getDataLayout()), Builder(B) {}
+
+  /// \brief Returns true if this interleaved access group can be lowered into
+  /// x86-specific instructions/intrinsics, false otherwise.
+  bool isSupported() const;
+
+  /// \brief Lowers this interleaved access group into X86-specific
+  /// instructions/intrinsics.
+  bool lowerIntoOptimizedSequence();
+};
+
+/// Decides whether this group matches the single shape the lowering handles:
+/// four 256-bit result vectors of 64-bit elements on an AVX-capable target.
+bool X86InterleavedAccessGroup::isSupported() const {
+  VectorType *const ResultTy = Shuffles[0]->getType();
+  const uint64_t ResultBits = DL.getTypeSizeInBits(ResultTy);
+  Type *const ResultEltTy = ResultTy->getVectorElementType();
+
+  // The wide value must be at least as large as Factor result vectors.
+  const uint64_t WideBits = DL.getTypeSizeInBits(Inst->getType());
+  if (WideBits < Factor * ResultBits)
+    return false;
+
+  // Currently, lowering is supported for 64 bits on AVX.
+  return Subtarget.hasAVX() && ResultBits == 256 &&
+         DL.getTypeSizeInBits(ResultEltTy) == 64 && Factor == 4;
+}
+
+/// Splits the wide load \p VecInst into \p NumSubVectors consecutive loads of
+/// type \p SubVecTy and appends them, in address order, to
+/// \p DecomposedVectors.
+///
+/// \returns false, without emitting anything, when \p VecInst is not a load;
+/// true otherwise.
+bool X86InterleavedAccessGroup::decompose(
+    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
+    SmallVectorImpl<Instruction *> &DecomposedVectors) {
+  Type *VecTy = VecInst->getType();
+  (void)VecTy;
+  assert(VecTy->isVectorTy() &&
+         DL.getTypeSizeInBits(VecTy) >=
+             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
+         "Invalid Inst-size!!!");
+  assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() &&
+         "Element type mismatched!!!");
+
+  auto *LI = dyn_cast<LoadInst>(VecInst);
+  if (!LI)
+    return false;
+
+  Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
+
+  Value *VecBasePtr =
+      Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
+
+  // Distance in bytes between consecutive sub-vectors as addressed by the GEP
+  // below.
+  const uint64_t SubVecStride = DL.getTypeAllocSize(SubVecTy);
+  // 0 means no explicit alignment was given on the wide load; that is passed
+  // through unchanged so the sub-loads get the same "unspecified" treatment.
+  const unsigned WideAlign = LI->getAlignment();
+
+  // Generate N loads of T type
+  for (unsigned i = 0; i < NumSubVectors; i++) {
+    // TODO: Support inbounds GEP
+    Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
+    // Only the first sub-load is guaranteed to have the wide load's explicit
+    // alignment. A sub-load at byte offset i * SubVecStride is aligned to no
+    // more than what both that offset and the base alignment allow.
+    const unsigned SubAlign =
+        WideAlign
+            ? static_cast<unsigned>(MinAlign(WideAlign, i * SubVecStride))
+            : 0;
+    Instruction *NewLoad = Builder.CreateAlignedLoad(NewBasePtr, SubAlign);
+    DecomposedVectors.push_back(NewLoad);
+  }
+
+  return true;
+}
+
+/// Transposes the 4x4 matrix whose rows are \p Matrix using eight two-input
+/// shuffles, and stores the four resulting rows in \p TransposedMatrix.
+void X86InterleavedAccessGroup::transpose_4x4(
+    ArrayRef<Instruction *> Matrix,
+    SmallVectorImpl<Value *> &TransposedMatrix) {
+  assert(Matrix.size() == 4 && "Invalid matrix size");
+  TransposedMatrix.resize(4);
+
+  // Indices 0-3 select from the first shuffle input, 4-7 from the second.
+  const uint32_t LowPairs[] = {0, 1, 4, 5};  // src1[0,1],src2[0,1]
+  const uint32_t HighPairs[] = {2, 3, 6, 7}; // src1[2,3],src2[2,3]
+  const uint32_t EvenLanes[] = {0, 4, 2, 6}; // src1[0],src2[0],src1[2],src2[2]
+  const uint32_t OddLanes[] = {1, 5, 3, 7};  // src1[1],src2[1],src1[3],src2[3]
+
+  auto Shuffle = [this](Value *LHS, Value *RHS,
+                        ArrayRef<uint32_t> Mask) -> Value * {
+    return Builder.CreateShuffleVector(LHS, RHS, Mask);
+  };
+
+  // Stage 1: pair row 0 with row 2 and row 1 with row 3, taking first the low
+  // and then the high two elements of each.
+  Value *Rows02Low = Shuffle(Matrix[0], Matrix[2], LowPairs);
+  Value *Rows13Low = Shuffle(Matrix[1], Matrix[3], LowPairs);
+  Value *Rows02High = Shuffle(Matrix[0], Matrix[2], HighPairs);
+  Value *Rows13High = Shuffle(Matrix[1], Matrix[3], HighPairs);
+
+  // Stage 2: interleave the stage-1 results lane by lane. The shuffles are
+  // emitted in the order 0, 2, 1, 3.
+  TransposedMatrix[0] = Shuffle(Rows02Low, Rows13Low, EvenLanes);
+  TransposedMatrix[2] = Shuffle(Rows02High, Rows13High, EvenLanes);
+  TransposedMatrix[1] = Shuffle(Rows02Low, Rows13Low, OddLanes);
+  TransposedMatrix[3] = Shuffle(Rows02High, Rows13High, OddLanes);
+}
+
+// Replaces every shuffle of the group with the matching row of the transposed
+// sub-loads. Returns false, leaving the shuffles untouched, when the wide
+// instruction cannot be decomposed.
+bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
+  // Split the wide load into Factor loads, each the size of one shuffle
+  // result.
+  VectorType *ResultTy = Shuffles[0]->getType();
+  SmallVector<Instruction *, 4> SubLoads;
+  if (!decompose(Inst, Factor, ResultTy, SubLoads))
+    return false;
+
+  // Transposing the sub-loads yields the de-interleaved vectors.
+  SmallVector<Value *, 4> Deinterleaved;
+  transpose_4x4(SubLoads, Deinterleaved);
+
+  // Indices[Idx] names the transposed row that shuffle Idx extracts.
+  for (unsigned Idx = 0, NumShuffles = Shuffles.size(); Idx != NumShuffles;
+       ++Idx)
+    Shuffles[Idx]->replaceAllUsesWith(Deinterleaved[Indices[Idx]]);
+
+  return true;
+}
+
+// Target hook: tries to lower the interleaved load LI and the shuffles that
+// consume it into an X86-specific sequence. Whether a group is handled is
+// decided by X86InterleavedAccessGroup::isSupported(); at present that is
+// 4x64 bits with Factor = 4 on AVX.
+// Returns true if the group was lowered, false if it was left alone.
+bool X86TargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+  assert(!Shuffles.empty() && "Empty shufflevector input");
+  assert(Shuffles.size() == Indices.size() &&
+         "Unmatched number of shufflevectors and indices");
+
+  // New instructions are inserted at the position of the wide load.
+  IRBuilder<> Builder(LI);
+  X86InterleavedAccessGroup Group(LI, Shuffles, Indices, Factor, Subtarget,
+                                  Builder);
+
+  if (!Group.isSupported())
+    return false;
+  return Group.lowerIntoOptimizedSequence();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
new file mode 100644
index 000000000000..df47b4ad583d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -0,0 +1,1794 @@
+//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the details for lowering X86 intrinsics
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+
+namespace llvm {
+
+// Kind tag stored in IntrinsicData::Type for every intrinsic table entry.
+// The underlying type is uint16_t. Values are assigned positionally, so the
+// enumerator order below is significant; INTR_NO_TYPE is the zero value and
+// is used as a placeholder when building lookup keys.
+enum IntrinsicType : uint16_t {
+  INTR_NO_TYPE,
+  GATHER,
+  SCATTER,
+  PREFETCH,
+  RDSEED,
+  RDRAND,
+  RDPMC,
+  RDTSC,
+  XTEST,
+  XGETBV,
+  ADX,
+  FPCLASS,
+  FPCLASSS,
+  INTR_TYPE_1OP,
+  INTR_TYPE_2OP,
+  INTR_TYPE_3OP,
+  INTR_TYPE_4OP,
+  CMP_MASK,
+  CMP_MASK_CC,
+  CMP_MASK_SCALAR_CC,
+  VSHIFT,
+  COMI,
+  COMI_RM,
+  CVTPD2PS,
+  CVTPD2PS_MASK,
+  INTR_TYPE_1OP_MASK,
+  INTR_TYPE_1OP_MASK_RM,
+  INTR_TYPE_2OP_MASK,
+  INTR_TYPE_2OP_MASK_RM,
+  INTR_TYPE_2OP_IMM8_MASK,
+  INTR_TYPE_3OP_MASK,
+  INTR_TYPE_3OP_MASK_RM,
+  INTR_TYPE_3OP_IMM8_MASK,
+  FMA_OP_MASK,
+  FMA_OP_MASKZ,
+  FMA_OP_MASK3,
+  FMA_OP_SCALAR_MASK,
+  FMA_OP_SCALAR_MASKZ,
+  FMA_OP_SCALAR_MASK3,
+  VPERM_2OP_MASK,
+  VPERM_3OP_MASK,
+  VPERM_3OP_MASKZ,
+  INTR_TYPE_SCALAR_MASK,
+  INTR_TYPE_SCALAR_MASK_RM,
+  INTR_TYPE_3OP_SCALAR_MASK_RM,
+  COMPRESS_EXPAND_IN_REG,
+  COMPRESS_TO_MEM,
+  BRCST_SUBVEC_TO_VEC,
+  BRCST32x2_TO_VEC,
+  TRUNCATE_TO_MEM_VI8,
+  TRUNCATE_TO_MEM_VI16,
+  TRUNCATE_TO_MEM_VI32,
+  EXPAND_FROM_MEM,
+  INSERT_SUBVEC,
+  TERLOG_OP_MASK,
+  TERLOG_OP_MASKZ,
+  BROADCASTM,
+  KUNPCK,
+  FIXUPIMM,
+  FIXUPIMM_MASKZ,
+  FIXUPIMMS,
+  FIXUPIMMS_MASKZ,
+  CONVERT_MASK_TO_VEC,
+  CONVERT_TO_MASK
+};
+
+// One row of an intrinsic lookup table. Rows are aggregate-initialized (see
+// X86_INTRINSIC_DATA), so the member order below must not change.
+struct IntrinsicData {
+  uint16_t Id;        // Intrinsic ID; the only field used for comparisons.
+  IntrinsicType Type; // Kind tag for this intrinsic.
+  uint16_t Opc0;      // First opcode value supplied by the table row.
+  uint16_t Opc1;      // Second opcode value; rows pass 0 when they have none.
+
+  // Equality and ordering look at Id alone, which lets a key carrying only
+  // an ID be matched against table rows.
+  bool operator==(const IntrinsicData &Other) const { return Id == Other.Id; }
+  bool operator<(const IntrinsicData &Other) const { return Id < Other.Id; }
+};
+
+// Expands to one brace-enclosed IntrinsicData initializer. `id` is
+// token-pasted onto Intrinsic::x86_ to form the intrinsic ID; `type`, `op0`
+// and `op1` are passed through unchanged as the remaining fields.
+#define X86_INTRINSIC_DATA(id, type, op0, op1) \
+ { Intrinsic::x86_##id, type, op0, op1 }
+
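+/*
+ * IntrinsicsWithChain - the table must be kept sorted by intrinsic ID, i.e.
+ * in alphabetical order of the intrinsic names, because
+ * getIntrinsicWithChain() binary-searches it with std::lower_bound.
+ */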
+static const IntrinsicData IntrinsicsWithChain[] = {
+ X86_INTRINSIC_DATA(addcarry_u32, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(addcarry_u64, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
+
+ X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
+
+ X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
+ X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
+ X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH,
+ X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm),
+ X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH,
+ X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
+ X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
+ X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
+
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+
+ X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm,
+ X86::VSCATTERPF1DPDm),
+ X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm,
+ X86::VSCATTERPF1DPSm),
+ X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm,
+ X86::VSCATTERPF1QPDm),
+ X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm,
+ X86::VSCATTERPF1QPSm),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
+ X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0),
+ X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
+ X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
+ X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
+ X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0),
+ X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0),
+ X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0),
+ X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0),
+ X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0),
+
+ X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0),
+ X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0),
+ X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0),
+ X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0),
+};
+
+/*
+ * Look up the IntrinsicsWithChain row whose Id equals IntNo.
+ * Returns a pointer to that row, or nullptr when the table has no such ID.
+ * The search is a binary search, so it relies on the table being sorted by Id.
+ */
+static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
+  // Only Id takes part in IntrinsicData comparisons; the other fields of the
+  // key are placeholders.
+  const IntrinsicData Key = {IntNo, INTR_NO_TYPE, 0, 0};
+  const IntrinsicData *const TableEnd = std::end(IntrinsicsWithChain);
+  const IntrinsicData *Pos =
+      std::lower_bound(std::begin(IntrinsicsWithChain), TableEnd, Key);
+
+  // lower_bound yields the first row not less than Key, so the ID must still
+  // be checked for an exact match.
+  if (Pos == TableEnd || !(*Pos == Key))
+    return nullptr;
+  return Pos;
+}
+
+/*
+ * IntrinsicsWithoutChain - the table should be kept sorted by intrinsic ID,
+ * i.e. in alphabetical order of the intrinsic names.
+ */
+static const IntrinsicData IntrinsicsWithoutChain[] = {
+ X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
+ X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
+
+ X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
+ X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
+ X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, BRCST32x2_TO_VEC,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, BRCST32x2_TO_VEC,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, BRCST32x2_TO_VEC,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
+ X86ISD::CMPM_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM,
+ X86ISD::CMPM_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
+ X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
+ X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, CVTPD2PS_MASK,
+ ISD::FP_ROUND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_MASK,
+ ISD::FP_ROUND, X86ISD::VFPROUND_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPEXT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_EXTEND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_EXTEND, X86ISD::VFPEXT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTSI2P, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VFPROUNDS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VFPEXTS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTUI2P, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
+ X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
+ X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FDIV_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FDIV_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FGETEXPS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FGETEXPS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM,
+ X86ISD::VGETMANTS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
+ X86ISD::VGETMANTS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+ X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+ X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMAX_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMAX_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+ X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+ X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMIN_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMIN_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
+ X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
+ X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMUL_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMUL_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_di_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_di_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_hi_128, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_hi_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_hi_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_qi_128, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_qi_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_qi_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_sf_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_sf_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_si_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_si_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_128, INTR_TYPE_2OP_MASK,
+ X86ISD::MULTISHIFT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_256, INTR_TYPE_2OP_MASK,
+ X86ISD::MULTISHIFT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, INTR_TYPE_2OP_MASK,
+ X86ISD::MULTISHIFT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VRNDSCALES, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VRNDSCALES, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::SCALEFS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::SCALEFS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
+ X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
+ X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSQRTS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSQRTS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
+ X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
+ X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSUB_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSUB_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
+ X86ISD::CVTPS2PH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
+ X86ISD::CVTPS2PH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
+ X86ISD::CVTPS2PH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD,
+ X86ISD::FNMADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD,
+ X86ISD::FNMADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , FMA_OP_MASK,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , FMA_OP_MASK,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , FMA_OP_MASK,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , FMA_OP_MASK,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , FMA_OP_MASK,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , FMA_OP_MASK,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB,
+ X86ISD::FMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
+ X86ISD::FMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
+ X86ISD::FMSUBADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
+ X86ISD::FMSUBADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_512, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_128, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_256, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_512, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMMS_MASKZ,
+ X86ISD::VFIXUPIMMS, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
+ X86ISD::VFIXUPIMMS, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, FMA_OP_MASKZ,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, FMA_OP_MASKZ,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, FMA_OP_MASKZ,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, FMA_OP_MASKZ,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, FMA_OP_MASKZ,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_psll_q_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_psll_w_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psra_d_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_q_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_w_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_d_512, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_q_128, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_q_256, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_q_512, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_w_512, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_d_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_q_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_q_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_q_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_w_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_w_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_w_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_d_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_q_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_w_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_b_128, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_b_256, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_b_512, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_d_128, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_d_256, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_d_512, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_q_128, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_q_256, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_q_512, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_w_128, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_w_256, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_w_512, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_b_128, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_b_256, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_b_512, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_d_128, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_d_256, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_d_512, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_q_128, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_q_256, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_q_512, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+ X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+ X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(sse2_cvtpd2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(sse2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(sse2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(sse2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(sse2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(sse2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(sse2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(sse2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(sse2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(sse2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(sse2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
+ X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
+ X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0),
+ X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0)
+};
+
+/*
+ * Look up the table entry for an intrinsic without chain, keyed by its ID.
+ * Returns nullptr if the intrinsic is not present in IntrinsicsWithoutChain.
+ */
+static const IntrinsicData* getIntrinsicWithoutChain(uint16_t IntNo) {
+ IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 }; // search key; presumably only the ID takes part in comparisons -- confirm against IntrinsicData's operators
+ const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain), // binary search over the table
+ std::end(IntrinsicsWithoutChain),
+ IntrinsicToFind);
+ if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind) // lower_bound may stop on a non-equal entry, so confirm the match
+ return Data;
+ return nullptr;
+}
+
+static void verifyIntrinsicTables() { // assert-only sanity checks on both intrinsic tables
+ assert(std::is_sorted(std::begin(IntrinsicsWithoutChain), // the lower_bound lookup needs sorted input
+ std::end(IntrinsicsWithoutChain)) &&
+ std::is_sorted(std::begin(IntrinsicsWithChain),
+ std::end(IntrinsicsWithChain)) &&
+ "Intrinsic data tables should be sorted by Intrinsic ID");
+ assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain), // adjacent_find reports equal neighbours, i.e. a duplicated entry
+ std::end(IntrinsicsWithoutChain)) ==
+ std::end(IntrinsicsWithoutChain)) &&
+ (std::adjacent_find(std::begin(IntrinsicsWithChain),
+ std::end(IntrinsicsWithChain)) ==
+ std::end(IntrinsicsWithChain)) &&
+ "Intrinsic data tables should have unique entries");
+}
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
new file mode 100644
index 000000000000..2f69df064e7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -0,0 +1,1795 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "X86RegisterInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86InstComments.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+namespace {
+
+/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
+class X86MCInstLower {
+ MCContext &Ctx;
+ const MachineFunction &MF;
+ const TargetMachine &TM;
+ const MCAsmInfo &MAI; // initialised from TM in the constructor, so it must be declared after TM
+ X86AsmPrinter &AsmPrinter;
+public:
+ X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
+
+ Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI,
+ const MachineOperand &MO) const; // yields None for operands that are dropped (implicit registers, register masks)
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+ MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+} // end anonymous namespace
+
+// Emit a minimal sequence of nops spanning NumBytes bytes.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+ const MCSubtargetInfo &STI);
+
+void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, // adds Inst's encoded size to the current shadow while one is being tracked
+ const MCSubtargetInfo &STI,
+ MCCodeEmitter *CodeEmitter) {
+ if (InShadow) {
+ SmallString<256> Code; // scratch buffer; only its final size is used
+ SmallVector<MCFixup, 4> Fixups; // required by encodeInstruction, otherwise unused here
+ raw_svector_ostream VecOS(Code);
+ CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI);
+ CurrentShadowSize += Code.size();
+ if (CurrentShadowSize >= RequiredShadowSize)
+ InShadow = false; // The shadow is big enough. Stop counting.
+ }
+}
+
+void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( // fills an unfinished shadow with nops
+ MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
+ if (InShadow && CurrentShadowSize < RequiredShadowSize) {
+ InShadow = false; // the shadow is closed once the padding is emitted
+ EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize, // pad exactly the bytes still missing
+ MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
+ }
+}
+
+void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { // emit Inst and account for it in the stackmap shadow tracker
+ OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
+ SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get()); // no-op unless a shadow is currently being tracked
+}
+
+X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
+ X86AsmPrinter &asmprinter)
+ : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()), // MAI reads TM, which is declared (and so initialised) before it
+ AsmPrinter(asmprinter) {}
+
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { // Mach-O object-file info; used by GetSymbolFromOperand for GV stub entries
+ return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+/// GetSymbolFromOperand - Lower an MO_GlobalAddress, MO_ExternalSymbol or
+/// MO_MachineBasicBlock operand to an MCSymbol.
+MCSymbol *X86MCInstLower::
+GetSymbolFromOperand(const MachineOperand &MO) const {
+ const DataLayout &DL = MF.getDataLayout();
+ assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
+
+ MCSymbol *Sym = nullptr;
+ SmallString<128> Name;
+ StringRef Suffix;
+
+ switch (MO.getTargetFlags()) { // flags that decorate the symbol name with a prefix or suffix
+ case X86II::MO_DLLIMPORT:
+ // Handle dllimport linkage.
+ Name += "__imp_";
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ Suffix = "$non_lazy_ptr";
+ break;
+ }
+
+ if (!Suffix.empty()) // suffixed (stub) names start with the private-global prefix
+ Name += DL.getPrivateGlobalPrefix();
+
+ if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ AsmPrinter.getNameWithPrefix(Name, GV);
+ } else if (MO.isSymbol()) {
+ Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
+ } else if (MO.isMBB()) {
+ assert(Suffix.empty()); // block symbols are taken as-is and never get a suffix
+ Sym = MO.getMBB()->getSymbol();
+ }
+
+ Name += Suffix;
+ if (!Sym) // not an MBB: look the symbol up (or create it) by its built name
+ Sym = Ctx.getOrCreateSymbol(Name);
+
+ // For Darwin non-lazy-pointer references, make sure a GV stub entry exists
+ // for the symbol before we return it.
+ switch (MO.getTargetFlags()) {
+ default: break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getGVStubEntry(Sym);
+ if (!StubSym.getPointer()) { // only fill the entry in the first time it is seen
+ assert(MO.isGlobal() && "Extern symbol not handled yet");
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ }
+ break;
+ }
+ }
+
+ return Sym;
+}
+
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, // wraps Sym in an MCExpr chosen by MO's target flags
+ MCSymbol *Sym) const {
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ const MCExpr *Expr = nullptr; // stays null unless a case below builds a custom expression
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+ switch (MO.getTargetFlags()) {
+ default: llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ // These affect the name of the symbol, not any suffix.
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ break;
+
+ case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
+ case X86II::MO_TLVP_PIC_BASE:
+ Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::createSub(Expr,
+ MCSymbolRefExpr::create(MF.getPICBaseSymbol(),
+ Ctx),
+ Ctx);
+ break;
+ case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
+ case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
+ case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
+ case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
+ case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
+ case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
+ case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
+ case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
+ case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
+ case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
+ case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
+ case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
+ case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
+ case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ Expr = MCSymbolRefExpr::create(Sym, Ctx);
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::createSub(Expr,
+ MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
+ Ctx);
+ if (MO.isJTI()) {
+ assert(MAI.doesSetDirectiveSuppressReloc());
+ // If .set directive is supported, use it to reduce the number of
+ // relocations the assembler will generate for differences between
+ // local labels. This is only safe when the symbols are in the same
+ // section so we are restricting it to jumptable references.
+ MCSymbol *Label = Ctx.createTempSymbol();
+ AsmPrinter.OutStreamer->EmitAssignment(Label, Expr); // Label = Sym - picbase
+ Expr = MCSymbolRefExpr::create(Label, Ctx); // from here on refer to the temp label instead of the difference
+ }
+ break;
+ }
+
+ if (!Expr) // no custom expression was built: plain reference to Sym with the chosen RefKind
+ Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) // JTI and MBB operands are filtered out before getOffset() is queried
+ Expr = MCBinaryExpr::createAdd(Expr,
+ MCConstantExpr::create(MO.getOffset(), Ctx),
+ Ctx);
+ return MCOperand::createExpr(Expr);
+}
+
+
+/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instructions
+/// with a short fixed-register form. \p Opcode is the short-form opcode.
+static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
+ unsigned ImmOp = Inst.getNumOperands() - 1; // the immediate (or expression) is the last operand
+ assert(Inst.getOperand(0).isReg() &&
+ (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
+ ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
+ Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
+ Inst.getNumOperands() == 2) && "Unexpected instruction!");
+
+ // Check whether the destination register can be fixed.
+ unsigned Reg = Inst.getOperand(0).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(ImmOp); // copy before Inst is reset below
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved); // the short form keeps only the immediate operand
+}
+
+/// \brief If a movsx instruction has a shorter encoding for the registers it
+/// uses, rewrite the instruction to that form; otherwise leave it unchanged.
+static void SimplifyMOVSX(MCInst &Inst) {
+ unsigned NewOpcode = 0; // 0 means "no shorter form applies"
+ unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
+ if (Op0 == X86::AX && Op1 == X86::AL)
+ NewOpcode = X86::CBW;
+ break;
+ case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl
+ if (Op0 == X86::EAX && Op1 == X86::AX)
+ NewOpcode = X86::CWDE;
+ break;
+ case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
+ if (Op0 == X86::RAX && Op1 == X86::EAX)
+ NewOpcode = X86::CDQE;
+ break;
+ }
+
+ if (NewOpcode != 0) {
+ Inst = MCInst(); // the replacement is built with an opcode only, no operands
+ Inst.setOpcode(NewOpcode);
+ }
+}
+
+/// \brief Simplify things like MOV32rm to MOV32o32a. \p Opcode is the short form.
+static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
+ unsigned Opcode) {
+ // Don't make these simplifications in 64-bit mode; other assemblers don't
+ // perform them because they make the code larger.
+ if (Printer.getSubtarget().is64Bit())
+ return;
+
+ bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg(); // NOTE(review): when true, the register is operand 0 and the address starts at operand 1; confirm the name matches the load/store direction
+ unsigned AddrBase = IsStore; // index of the first address operand (0 or 1)
+ unsigned RegOp = IsStore ? 0 : 5; // the register operand sits on the other side of the five address operands
+ unsigned AddrOp = AddrBase + 3; // operand that must be an expression or immediate (see assert); presumably the displacement -- confirm
+ assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
+ (Inst.getOperand(AddrOp).isExpr() ||
+ Inst.getOperand(AddrOp).isImm()) &&
+ "Unexpected instruction!");
+
+ // Check whether the register operand is one of AL/AX/EAX/RAX.
+ unsigned Reg = Inst.getOperand(RegOp).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // Check whether this is an absolute address.
+ // FIXME: We know TLVP symbol refs aren't, but there should be a better way
+ // to do this here.
+ bool Absolute = true;
+ if (Inst.getOperand(AddrOp).isExpr()) {
+ const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
+ if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
+ Absolute = false;
+ }
+
+ if (Absolute && // bail out if a base register, index register or non-unit scale is present
+ (Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 ||
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 ||
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(AddrOp); // copy both operands before Inst is reset
+ MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved); // the short form carries just the address operand and the segment
+ Inst.addOperand(Seg);
+}
+
+static unsigned getRetOpcode(const X86Subtarget &Subtarget) { // return opcode for the subtarget's mode
+ return Subtarget.is64Bit() ? X86::RETQ : X86::RETL; // RETQ in 64-bit mode, RETL otherwise
+}
+
+Optional<MCOperand>
+X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, // MI is only used for the diagnostic dump in the default case
+ const MachineOperand &MO) const {
+ switch (MO.getType()) {
+ default:
+ MI->dump(); // print the offending instruction before aborting
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ return None;
+ return MCOperand::createReg(MO.getReg());
+ case MachineOperand::MO_Immediate:
+ return MCOperand::createImm(MO.getImm());
+ case MachineOperand::MO_MachineBasicBlock: // these three kinds resolve their symbol via GetSymbolFromOperand
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ return LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+ case MachineOperand::MO_MCSymbol:
+ return LowerSymbolOperand(MO, MO.getMCSymbol());
+ case MachineOperand::MO_JumpTableIndex:
+ return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+ case MachineOperand::MO_ConstantPoolIndex:
+ return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+ case MachineOperand::MO_BlockAddress:
+ return LowerSymbolOperand(
+ MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+ case MachineOperand::MO_RegisterMask:
+ // Ignore call clobbers.
+ return None;
+ }
+}
+
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (const MachineOperand &MO : MI->operands())
+ if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
+ OutMI.addOperand(MaybeMCOp.getValue());
+
+ // Handle a few special cases to eliminate operand modifiers.
+ReSimplify:
+ switch (OutMI.getOpcode()) {
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ case X86::LEA16r:
+ case X86::LEA32r:
+ // LEA should have a segment register, but it must be empty.
+ assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
+ "Unexpected # of LEA operands");
+ assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
+ "LEA has segment specified!");
+ break;
+
+ // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
+ // if one of the registers is extended, but other isn't.
+ case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVAPDrr:
+ case X86::VMOVAPDYrr:
+ case X86::VMOVAPSrr:
+ case X86::VMOVAPSYrr:
+ case X86::VMOVDQArr:
+ case X86::VMOVDQAYrr:
+ case X86::VMOVDQUrr:
+ case X86::VMOVDQUYrr:
+ case X86::VMOVUPDrr:
+ case X86::VMOVUPDYrr:
+ case X86::VMOVUPSrr:
+ case X86::VMOVUPSYrr: {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ }
+ break;
+ }
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ }
+ break;
+ }
+
+ // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
+ // inputs modeled as normal uses instead of implicit uses. As such, truncate
+ // off all but the first operand (the callee). FIXME: Change isel.
+ case X86::TAILJMPr64:
+ case X86::TAILJMPr64_REX:
+ case X86::CALL64r:
+ case X86::CALL64pcrel32: {
+ unsigned Opcode = OutMI.getOpcode();
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(Opcode);
+ OutMI.addOperand(Saved);
+ break;
+ }
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+ break;
+ }
+
+ case X86::CLEANUPRET: {
+ // Replace CLEANUPRET with the appropriate RET.
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Replace CATCHRET with the appropriate RET.
+ const X86Subtarget &Subtarget = AsmPrinter.getSubtarget();
+ unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(Subtarget));
+ OutMI.addOperand(MCOperand::createReg(ReturnReg));
+ break;
+ }
+
+ // TAILJMPr, TAILJMPd, TAILJMPd64, TAILJMPd_CC, TAILJMPd64_CC - Lower to the correct jump instruction.
+ { unsigned Opcode;
+ case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode;
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode;
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC:
+ Opcode = X86::GetCondBranchFromCond(
+ static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
+ goto SetTailJmpOpcode;
+
+ SetTailJmpOpcode:
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(Opcode);
+ OutMI.addOperand(Saved);
+ break;
+ }
+
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::INC16r:
+ case X86::INC32r:
+ // If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions.
+ if (!AsmPrinter.getSubtarget().is64Bit()) {
+ unsigned Opcode;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::DEC16r: Opcode = X86::DEC16r_alt; break;
+ case X86::DEC32r: Opcode = X86::DEC32r_alt; break;
+ case X86::INC16r: Opcode = X86::INC16r_alt; break;
+ case X86::INC32r: Opcode = X86::INC32r_alt; break;
+ }
+ OutMI.setOpcode(Opcode);
+ }
+ break;
+
+ // These are pseudo-ops for OR to help with the OR->ADD transformation. We do
+ // this with an ugly goto in case the resultant OR uses EAX and needs the
+ // short form.
+ case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
+ case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
+ case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
+ case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
+ case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
+ case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
+ case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
+ case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
+ case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
+
+ // Atomic load and store require a separate pseudo-inst because Acquire
+ // implies mayStore and Release implies mayLoad; fix these to regular MOV
+ // instructions here
+ case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+ case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+ case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+ case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+ case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+ case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
+ case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
+ case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
+ case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
+ case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
+ case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
+ case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
+ case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
+ case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
+ case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
+ case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
+ case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
+ case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
+ case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
+ case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
+ case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
+ case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
+ case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
+ case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
+ case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
+ case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
+ case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
+ case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
+ case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
+ case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
+ case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
+ case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
+ case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
+ case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
+ case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
+ case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
+ case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify;
+ case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
+ case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
+ case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
+ case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify;
+
+ // We don't currently select the correct instruction form for instructions
+ // which have a short %eax, etc. form. Handle this by custom lowering, for
+ // now.
+ //
+ // Note, we are currently not handling the following instructions:
+ // MOV64ao8, MOV64o8a
+ // XCHG16ar, XCHG32ar, XCHG64ar
+ case X86::MOV8mr_NOREX:
+ case X86::MOV8mr:
+ case X86::MOV8rm_NOREX:
+ case X86::MOV8rm:
+ case X86::MOV16mr:
+ case X86::MOV16rm:
+ case X86::MOV32mr:
+ case X86::MOV32rm: {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::MOV8mr_NOREX:
+ case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
+ case X86::MOV8rm_NOREX:
+ case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
+ case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
+ case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
+ case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
+ case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
+ }
+ SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
+ break;
+ }
+
+ case X86::ADC8ri: case X86::ADC16ri: case X86::ADC32ri: case X86::ADC64ri32:
+ case X86::ADD8ri: case X86::ADD16ri: case X86::ADD32ri: case X86::ADD64ri32:
+ case X86::AND8ri: case X86::AND16ri: case X86::AND32ri: case X86::AND64ri32:
+ case X86::CMP8ri: case X86::CMP16ri: case X86::CMP32ri: case X86::CMP64ri32:
+ case X86::OR8ri: case X86::OR16ri: case X86::OR32ri: case X86::OR64ri32:
+ case X86::SBB8ri: case X86::SBB16ri: case X86::SBB32ri: case X86::SBB64ri32:
+ case X86::SUB8ri: case X86::SUB16ri: case X86::SUB32ri: case X86::SUB64ri32:
+ case X86::TEST8ri:case X86::TEST16ri:case X86::TEST32ri:case X86::TEST64ri32:
+ case X86::XOR8ri: case X86::XOR16ri: case X86::XOR32ri: case X86::XOR64ri32: {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::ADC8ri: NewOpc = X86::ADC8i8; break;
+ case X86::ADC16ri: NewOpc = X86::ADC16i16; break;
+ case X86::ADC32ri: NewOpc = X86::ADC32i32; break;
+ case X86::ADC64ri32: NewOpc = X86::ADC64i32; break;
+ case X86::ADD8ri: NewOpc = X86::ADD8i8; break;
+ case X86::ADD16ri: NewOpc = X86::ADD16i16; break;
+ case X86::ADD32ri: NewOpc = X86::ADD32i32; break;
+ case X86::ADD64ri32: NewOpc = X86::ADD64i32; break;
+ case X86::AND8ri: NewOpc = X86::AND8i8; break;
+ case X86::AND16ri: NewOpc = X86::AND16i16; break;
+ case X86::AND32ri: NewOpc = X86::AND32i32; break;
+ case X86::AND64ri32: NewOpc = X86::AND64i32; break;
+ case X86::CMP8ri: NewOpc = X86::CMP8i8; break;
+ case X86::CMP16ri: NewOpc = X86::CMP16i16; break;
+ case X86::CMP32ri: NewOpc = X86::CMP32i32; break;
+ case X86::CMP64ri32: NewOpc = X86::CMP64i32; break;
+ case X86::OR8ri: NewOpc = X86::OR8i8; break;
+ case X86::OR16ri: NewOpc = X86::OR16i16; break;
+ case X86::OR32ri: NewOpc = X86::OR32i32; break;
+ case X86::OR64ri32: NewOpc = X86::OR64i32; break;
+ case X86::SBB8ri: NewOpc = X86::SBB8i8; break;
+ case X86::SBB16ri: NewOpc = X86::SBB16i16; break;
+ case X86::SBB32ri: NewOpc = X86::SBB32i32; break;
+ case X86::SBB64ri32: NewOpc = X86::SBB64i32; break;
+ case X86::SUB8ri: NewOpc = X86::SUB8i8; break;
+ case X86::SUB16ri: NewOpc = X86::SUB16i16; break;
+ case X86::SUB32ri: NewOpc = X86::SUB32i32; break;
+ case X86::SUB64ri32: NewOpc = X86::SUB64i32; break;
+ case X86::TEST8ri: NewOpc = X86::TEST8i8; break;
+ case X86::TEST16ri: NewOpc = X86::TEST16i16; break;
+ case X86::TEST32ri: NewOpc = X86::TEST32i32; break;
+ case X86::TEST64ri32: NewOpc = X86::TEST64i32; break;
+ case X86::XOR8ri: NewOpc = X86::XOR8i8; break;
+ case X86::XOR16ri: NewOpc = X86::XOR16i16; break;
+ case X86::XOR32ri: NewOpc = X86::XOR32i32; break;
+ case X86::XOR64ri32: NewOpc = X86::XOR64i32; break;
+ }
+ SimplifyShortImmForm(OutMI, NewOpc);
+ break;
+ }
+
+ // Try to shrink some forms of movsx.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr16:
+ case X86::MOVSX64rr32:
+ SimplifyMOVSX(OutMI);
+ break;
+ }
+}
+/// Lower a TLS_addr32/64 or TLS_base_addr32/64 pseudo into a LEA of the TLS symbol followed by a PLT call to the tls_get_addr helper.
+void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
+ const MachineInstr &MI) {
+ // MI's opcode selects the 32- or 64-bit sequence and the relocation variant; operand 3 supplies the symbol.
+ bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+ MI.getOpcode() == X86::TLS_base_addr64;
+
+ bool needsPadding = MI.getOpcode() == X86::TLS_addr64; // only this form gets the extra prefixes
+
+ MCContext &context = OutStreamer->getContext();
+
+ if (needsPadding)
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ // Map the pseudo opcode to the symbol-reference variant used as the LEA displacement.
+ MCSymbolRefExpr::VariantKind SRVK;
+ switch (MI.getOpcode()) {
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86::TLS_base_addr32:
+ SRVK = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86::TLS_base_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
+ }
+
+ MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
+ const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context);
+ // Build the LEA: RIP-relative into RDI for 64-bit; otherwise into EAX with EBX as base (TLSLDM) or as index (TLSGD).
+ MCInst LEA;
+ if (is64Bits) {
+ LEA.setOpcode(X86::LEA64r);
+ LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest
+ LEA.addOperand(MCOperand::createReg(X86::RIP)); // base
+ LEA.addOperand(MCOperand::createImm(1)); // scale
+ LEA.addOperand(MCOperand::createReg(0)); // index
+ LEA.addOperand(MCOperand::createExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::createReg(0)); // seg
+ } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
+ LEA.setOpcode(X86::LEA32r);
+ LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
+ LEA.addOperand(MCOperand::createReg(X86::EBX)); // base
+ LEA.addOperand(MCOperand::createImm(1)); // scale
+ LEA.addOperand(MCOperand::createReg(0)); // index
+ LEA.addOperand(MCOperand::createExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::createReg(0)); // seg
+ } else {
+ LEA.setOpcode(X86::LEA32r);
+ LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
+ LEA.addOperand(MCOperand::createReg(0)); // base
+ LEA.addOperand(MCOperand::createImm(1)); // scale
+ LEA.addOperand(MCOperand::createReg(X86::EBX)); // index
+ LEA.addOperand(MCOperand::createExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::createReg(0)); // seg
+ }
+ EmitAndCountInstruction(LEA);
+
+ if (needsPadding) { // NOTE(review): presumably pads the sequence to the fixed size expected for linker TLS relaxation - confirm
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ }
+ // The 64-bit helper is "__tls_get_addr"; the 32-bit one has three leading underscores.
+ StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
+ MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
+ const MCSymbolRefExpr *tlsRef =
+ MCSymbolRefExpr::create(tlsGetAddr,
+ MCSymbolRefExpr::VK_PLT,
+ context);
+
+ EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
+ : X86::CALLpcrel32)
+ .addExpr(tlsRef));
+}
+
+/// \brief Emit the largest nop instruction smaller than or equal to \p NumBytes
+/// bytes (at most 15: a 10-byte form plus five 0x66 prefixes). Return the size of nop emitted.
+static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+ const MCSubtargetInfo &STI) {
+ // This works only for 64bit. For 32bit we have to do additional checking if
+ // the CPU supports multi-byte nops.
+ assert(Is64Bit && "EmitNops only supports X86-64");
+ // Operands for the memory forms (NOOPL/NOOPW); the defaults describe (%rax) with no index, displacement or segment.
+ unsigned NopSize;
+ unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
+ Opc = IndexReg = Displacement = SegmentReg = 0;
+ BaseReg = X86::RAX;
+ ScaleVal = 1;
+ switch (NumBytes) { // pick the base encoding: its size, opcode and any non-default memory operands
+ case 0: llvm_unreachable("Zero nops?"); break;
+ case 1: NopSize = 1; Opc = X86::NOOP; break;
+ case 2: NopSize = 2; Opc = X86::XCHG16ar; break;
+ case 3: NopSize = 3; Opc = X86::NOOPL; break;
+ case 4: NopSize = 4; Opc = X86::NOOPL; Displacement = 8; break;
+ case 5: NopSize = 5; Opc = X86::NOOPL; Displacement = 8;
+ IndexReg = X86::RAX; break;
+ case 6: NopSize = 6; Opc = X86::NOOPW; Displacement = 8;
+ IndexReg = X86::RAX; break;
+ case 7: NopSize = 7; Opc = X86::NOOPL; Displacement = 512; break;
+ case 8: NopSize = 8; Opc = X86::NOOPL; Displacement = 512;
+ IndexReg = X86::RAX; break;
+ case 9: NopSize = 9; Opc = X86::NOOPW; Displacement = 512;
+ IndexReg = X86::RAX; break;
+ default: NopSize = 10; Opc = X86::NOOPW; Displacement = 512;
+ IndexReg = X86::RAX; SegmentReg = X86::CS; break;
+ }
+ // Make up as much of the remainder as possible with 0x66 prefix bytes, capped at five.
+ unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
+ NopSize += NumPrefixes;
+ for (unsigned i = 0; i != NumPrefixes; ++i)
+ OS.EmitBytes("\x66");
+ // Emit the nop itself with the operand list its opcode expects.
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ break;
+ case X86::NOOP:
+ OS.EmitInstruction(MCInstBuilder(Opc), STI);
+ break;
+ case X86::XCHG16ar:
+ OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
+ break;
+ case X86::NOOPL:
+ case X86::NOOPW:
+ OS.EmitInstruction(MCInstBuilder(Opc)
+ .addReg(BaseReg)
+ .addImm(ScaleVal)
+ .addReg(IndexReg)
+ .addImm(Displacement)
+ .addReg(SegmentReg),
+ STI);
+ break;
+ }
+ assert(NopSize <= NumBytes && "We overemitted?");
+ return NopSize; // base encoding size plus the prefixes added above
+}
+
+/// \brief Emit \p NumBytes bytes of nops by calling EmitNop repeatedly; each
+/// call reports how many of the outstanding bytes it covered.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+                     const MCSubtargetInfo &STI) {
+  const unsigned Requested = NumBytes;
+  (void)Requested; // Only read by the assert below.
+  for (unsigned Remaining = NumBytes; Remaining != 0;) {
+    const unsigned Emitted = EmitNop(OS, Remaining, Is64Bit, STI);
+    Remaining -= Emitted;
+    assert(Requested >= Remaining && "Emitted more than I asked for!");
+  }
+}
+/// Lower a STATEPOINT: emit either its patch bytes as nops or the call itself, then record the statepoint.
+void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
+ // A non-zero patch-byte count means no call is emitted here, only that many bytes of nops.
+ StatepointOpers SOpers(&MI);
+ if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
+ EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
+ getSubtargetInfo());
+ } else {
+ // Lower call target and choose correct opcode
+ const MachineOperand &CallTarget = SOpers.getCallTarget();
+ MCOperand CallTargetMCOp;
+ unsigned CallOpcode;
+ switch (CallTarget.getType()) {
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ CallTargetMCOp = MCIL.LowerSymbolOperand(
+ CallTarget, MCIL.GetSymbolFromOperand(CallTarget));
+ CallOpcode = X86::CALL64pcrel32;
+ // Currently, we only support relative addressing with statepoints.
+ // Otherwise, we'll need a scratch register to hold the target
+ // address. You'll fail asserts during load & relocation if this
+ // symbol is too far away. (TODO: support non-relative addressing)
+ break;
+ case MachineOperand::MO_Immediate:
+ CallTargetMCOp = MCOperand::createImm(CallTarget.getImm());
+ CallOpcode = X86::CALL64pcrel32;
+ // Currently, we only support relative addressing with statepoints.
+ // Otherwise, we'll need a scratch register to hold the target
+ // immediate. You'll fail asserts during load & relocation if this
+ // address is too far away. (TODO: support non-relative addressing)
+ break;
+ case MachineOperand::MO_Register:
+ CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
+ CallOpcode = X86::CALL64r; // register targets use the indirect call form
+ break;
+ default:
+ llvm_unreachable("Unsupported operand type in statepoint call target");
+ break;
+ }
+
+ // Emit call
+ MCInst CallInst;
+ CallInst.setOpcode(CallOpcode);
+ CallInst.addOperand(CallTargetMCOp);
+ OutStreamer->EmitInstruction(CallInst, getSubtargetInfo());
+ }
+
+ // Record our statepoint node in the same section used by STACKMAP
+ // and PATCHPOINT
+ SM.recordStatepoint(MI); // done for both the nop and the call path
+}
+
+void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
+                                          X86MCInstLower &MCIL) {
+  // Operand layout of the pseudo:
+  //   FAULTING_LOAD_OP <def>, <MBB handler>, <load opcode>, <load operands>
+  const unsigned DefReg = MI.getOperand(0).getReg();
+  MCSymbol *const FaultHandler = MI.getOperand(1).getMBB()->getSymbol();
+  const unsigned Opc = MI.getOperand(2).getImm();
+  const unsigned FirstLoadOperandIdx = 3;
+
+  // Register the handler label with the fault map before emitting the load.
+  FM.recordFaultingOp(FaultMaps::FaultingLoad, FaultHandler);
+
+  // Rebuild the real load: opcode, optional def register, then every
+  // remaining operand that lowers to an MC operand.
+  MCInst Load;
+  Load.setOpcode(Opc);
+  if (DefReg != X86::NoRegister)
+    Load.addOperand(MCOperand::createReg(DefReg));
+
+  for (auto &MO : make_range(MI.operands_begin() + FirstLoadOperandIdx,
+                             MI.operands_end())) {
+    if (auto Lowered = MCIL.LowerMachineOperand(&MI, MO))
+      Load.addOperand(Lowered.getValue());
+  }
+
+  OutStreamer->EmitInstruction(Load, getSubtargetInfo());
+}
+/// Lower PATCHABLE_OP: emit the wrapped instruction, compensating first when its encoding is shorter than the requested minimum.
+void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ // PATCHABLE_OP minsize, opcode, operands
+
+ unsigned MinSize = MI.getOperand(0).getImm();
+ unsigned Opcode = MI.getOperand(1).getImm();
+ // Rebuild the wrapped instruction from the opcode and the operands that follow it.
+ MCInst MCI;
+ MCI.setOpcode(Opcode);
+ for (auto &MO : make_range(MI.operands_begin() + 2, MI.operands_end()))
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+ MCI.addOperand(MaybeOperand.getValue());
+ // Encode into a scratch buffer purely to measure the instruction's size.
+ SmallString<256> Code;
+ SmallVector<MCFixup, 4> Fixups;
+ raw_svector_ostream VecOS(Code);
+ CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());
+
+ if (Code.size() < MinSize) {
+ if (MinSize == 2 && Opcode == X86::PUSH64r) {
+ // This is an optimization that lets us get away without emitting a nop in
+ // many cases.
+ //
+ // NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %R9) takes two
+ // bytes too, so the check on MinSize is important.
+ MCI.setOpcode(X86::PUSH64rmr);
+ } else {
+ unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(), // a nop of the full MinSize goes ahead of the instruction
+ getSubtargetInfo());
+ assert(NopSize == MinSize && "Could not implement MinSize!");
+ (void) NopSize;
+ }
+ }
+
+ OutStreamer->EmitInstruction(MCI, getSubtargetInfo());
+}
+
+// Lower a STACKMAP pseudo. Its operands are:
+//   <id>, <shadowBytes>, ...
+void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
+  // Shadow padding (if any) is emitted before this stackmap is recorded.
+  SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+  SM.recordStackMap(MI);
+
+  // Operand 1 is the shadow size the tracker is reset to.
+  const MachineOperand &ShadowBytesOp = MI.getOperand(1);
+  const unsigned ShadowSize = ShadowBytesOp.getImm();
+  SMShadowTracker.reset(ShadowSize);
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64");
+
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+
+ SM.recordPatchPoint(MI);
+ // EncodedBytes counts the bytes taken by the MOV+CALL pair below; it stays 0 when no call is emitted.
+ PatchPointOpers opers(&MI);
+ unsigned ScratchIdx = opers.getNextScratchIdx();
+ unsigned EncodedBytes = 0;
+ const MachineOperand &CalleeMO = opers.getCallTarget();
+
+ // Check for null target. If target is non-null (i.e. is non-zero or is
+ // symbolic) then emit a call.
+ if (!(CalleeMO.isImm() && !CalleeMO.getImm())) {
+ MCOperand CalleeMCOp;
+ switch (CalleeMO.getType()) {
+ default:
+ /// FIXME: Add a verifier check for bad callee types.
+ llvm_unreachable("Unrecognized callee operand type.");
+ case MachineOperand::MO_Immediate:
+ if (CalleeMO.getImm()) // always true here: a zero immediate was excluded by the enclosing if
+ CalleeMCOp = MCOperand::createImm(CalleeMO.getImm());
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_GlobalAddress:
+ CalleeMCOp =
+ MCIL.LowerSymbolOperand(CalleeMO,
+ MCIL.GetSymbolFromOperand(CalleeMO));
+ break;
+ }
+
+ // Emit MOV to materialize the target address and the CALL to target.
+ // This is encoded with 12-13 bytes, depending on which register is used.
+ unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
+ if (X86II::isX86_64ExtendedReg(ScratchReg))
+ EncodedBytes = 13;
+ else
+ EncodedBytes = 12;
+
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
+ }
+
+ // Emit padding.
+ unsigned NumBytes = opers.getNumPatchBytes();
+ assert(NumBytes >= EncodedBytes &&
+ "Patchpoint can't request size less than the length of a call.");
+ // Whatever the call sequence did not use of the requested size is filled with nops.
+ EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
+ getSubtargetInfo());
+}
+/// Emit the XRay function-entry sled: a 2-byte-aligned label, a two-byte jmp, nine bytes of nops, and a sled record.
+void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ // We want to emit the following pattern:
+ //
+ // .p2align 1, ...
+ // .Lxray_sled_N:
+ // jmp .tmpN
+ // # 9 bytes worth of noops
+ // .tmpN
+ //
+ // We need the 9 bytes because at runtime, we'd be patching over the full 11
+ // bytes with the following pattern:
+ //
+ // mov %r10, <function id, 32-bit> // 6 bytes
+ // call <relative offset, 32-bits> // 5 bytes
+ //
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitCodeAlignment(2);
+ OutStreamer->EmitLabel(CurSled);
+ auto Target = OutContext.createTempSymbol();
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+ // FIXME: Find another less hacky way to force the relative jump.
+ OutStreamer->EmitBytes("\xeb\x09"); // displacement 9 == size of the nop run emitted next
+ EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
+ OutStreamer->EmitLabel(Target); // marks the end of the sled; the raw jmp bytes do not reference it
+ recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
+}
+/// Emit the XRay function-exit sled: the wrapped return instruction followed by ten bytes of nops, plus a sled record.
+void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ // Since PATCHABLE_RET takes the opcode of the return statement as an
+ // argument, we use that to emit the correct form of the RET that we want.
+ // i.e. when we see this:
+ //
+ // PATCHABLE_RET X86::RET ...
+ //
+ // We should emit the RET followed by sleds.
+ //
+ // .p2align 1, ...
+ // .Lxray_sled_N:
+ // ret # or equivalent instruction
+ // # 10 bytes worth of noops
+ //
+ // This just makes sure that the alignment for the next instruction is 2.
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitCodeAlignment(2);
+ OutStreamer->EmitLabel(CurSled);
+ unsigned OpCode = MI.getOperand(0).getImm(); // operand 0 carries the opcode of the real return
+ MCInst Ret;
+ Ret.setOpcode(OpCode);
+ for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end())) // the remaining operands belong to the return itself
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+ Ret.addOperand(MaybeOperand.getValue());
+ OutStreamer->EmitInstruction(Ret, getSubtargetInfo());
+ EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
+ recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
+}
+/// Emit an XRay tail-call sled (two-byte jmp plus nine bytes of nops) and then the wrapped tail-call instruction.
+void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) {
+ // Like PATCHABLE_RET, we have the actual instruction in the operands to this
+ // instruction so we lower that particular instruction and its operands.
+ // Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
+ // we do it for PATCHABLE_FUNCTION_ENTER. The sled should be very similar to
+ // the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
+ // tail call much like how we have it in PATCHABLE_RET.
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitCodeAlignment(2);
+ OutStreamer->EmitLabel(CurSled);
+ auto Target = OutContext.createTempSymbol();
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+ // FIXME: Find another less hacky way to force the relative jump.
+ OutStreamer->EmitBytes("\xeb\x09"); // displacement 9 == size of the nop run emitted next
+ EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
+ OutStreamer->EmitLabel(Target); // marks the end of the sled; the raw jmp bytes do not reference it
+ recordSled(CurSled, MI, SledKind::TAIL_CALL);
+ // The sled is complete; what follows rebuilds the real tail call from operand 0 (opcode) and the operands after it.
+ unsigned OpCode = MI.getOperand(0).getImm();
+ MCInst TC;
+ TC.setOpcode(OpCode);
+
+ // Before emitting the instruction, add a comment to indicate that this is
+ // indeed a tail call.
+ OutStreamer->AddComment("TAILCALL");
+ for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+ TC.addOperand(MaybeOperand.getValue());
+ OutStreamer->EmitInstruction(TC, getSubtargetInfo());
+}
+/// Write one xray_instr_map entry per recorded sled, then clear the sled list; does nothing when no sleds were recorded.
+void X86AsmPrinter::EmitXRayTable() {
+ if (Sleds.empty())
+ return;
+ // Choose the xray_instr_map section for the object format; only ELF and Mach-O are handled.
+ auto PrevSection = OutStreamer->getCurrentSectionOnly();
+ auto Fn = MF->getFunction();
+ MCSection *Section = nullptr;
+ if (Subtarget->isTargetELF()) {
+ if (Fn->hasComdat()) { // a comdat function gets a section placed in the same comdat group
+ Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+ Fn->getComdat()->getName());
+ } else {
+ Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC);
+ }
+ } else if (Subtarget->isTargetMachO()) {
+ Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+ SectionKind::getReadOnlyWithRel());
+ } else {
+ llvm_unreachable("Unsupported target");
+ }
+
+ // Before we switch over, we force a reference to a label inside the
+ // xray_instr_map section. Since EmitXRayTable() is always called just
+ // before the function's end, we assume that this is happening after the
+ // last return instruction.
+ //
+ // We then align the reference to 16 byte boundaries, which we determined
+ // experimentally to be beneficial to avoid causing decoder stalls.
+ MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
+ OutStreamer->EmitCodeAlignment(16);
+ OutStreamer->EmitSymbolValue(Tmp, 8, false); // emitted in the current (pre-switch) section
+ OutStreamer->SwitchSection(Section);
+ OutStreamer->EmitLabel(Tmp);
+ for (const auto &Sled : Sleds) { // entry: sled address, function symbol, kind byte, always-instrument byte, zero padding
+ OutStreamer->EmitSymbolValue(Sled.Sled, 8);
+ OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
+ auto Kind = static_cast<uint8_t>(Sled.Kind);
+ OutStreamer->EmitBytes(
+ StringRef(reinterpret_cast<const char *>(&Kind), 1));
+ OutStreamer->EmitBytes(
+ StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
+ OutStreamer->EmitZeros(14); // 8 + 8 + 1 + 1 + 14 = 32 bytes requested per entry
+ }
+ OutStreamer->SwitchSection(PrevSection); // return to the section that was current on entry
+ // Every recorded sled has been written out; start from an empty list next time.
+ Sleds.clear();
+}
+
+// Step back one instruction from MBBI, crossing into preceding basic blocks
+// (and skipping empty ones) as needed. When nothing precedes MBBI in the
+// function, a default-constructed (null) iterator is returned.
+static MachineBasicBlock::const_iterator
+PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
+  const MachineBasicBlock *Block = MBBI->getParent();
+  for (;;) {
+    if (MBBI != Block->begin()) {
+      --MBBI;
+      return MBBI;
+    }
+
+    // MBBI sits at the top of Block: continue from the end of the block
+    // before it, unless Block is the function's first block.
+    const bool IsFirstBlock = (Block == &Block->getParent()->front());
+    if (IsFirstBlock)
+      return MachineBasicBlock::const_iterator();
+    Block = Block->getPrevNode();
+    MBBI = Block->end();
+  }
+}
+
+/// Return the IR constant that constant-pool operand \p Op of \p MI refers to.
+/// Yields nullptr when \p Op is not a constant-pool index or when the pool
+/// entry is a machine constant pool entry.
+static const Constant *getConstantFromPool(const MachineInstr &MI,
+                                           const MachineOperand &Op) {
+  if (!Op.isCPI())
+    return nullptr;
+
+  // The pool belongs to the function that contains MI.
+  const auto *Pool = MI.getParent()->getParent()->getConstantPool();
+  ArrayRef<MachineConstantPoolEntry> Entries = Pool->getConstants();
+  const MachineConstantPoolEntry &Entry = Entries[Op.getIndex()];
+
+  // Nothing useful can be dug out of a machine constant pool entry.
+  if (Entry.isMachineConstantPoolEntry())
+    return nullptr;
+
+  const auto *Result = dyn_cast<Constant>(Entry.Val.ConstVal);
+  assert((Result == nullptr || Entry.getType() == Result->getType()) &&
+         "Expected a constant of the same type!");
+  return Result;
+}
+/// Build the verbose-asm comment for a shuffle from its decoded mask, e.g. "xmm0 = xmm1[0,1],zero,xmm2[3]".
+static std::string getShuffleComment(const MachineInstr *MI,
+ unsigned SrcOp1Idx,
+ unsigned SrcOp2Idx,
+ ArrayRef<int> Mask) {
+ std::string Comment;
+
+ // Compute the name for a register. This is really goofy because we have
+ // multiple instruction printers that could (in theory) use different
+ // names. Fortunately most people use the ATT style (outside of Windows)
+ // and they actually agree on register naming here. Ultimately, this is
+ // a comment, and so its OK if it isn't perfect.
+ auto GetRegisterName = [](unsigned RegNum) -> StringRef {
+ return X86ATTInstPrinter::getRegisterName(RegNum);
+ };
+ // Operand 0 is the destination; the two source operand positions are supplied by the caller.
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp1 = MI->getOperand(SrcOp1Idx);
+ const MachineOperand &SrcOp2 = MI->getOperand(SrcOp2Idx);
+ // Any non-register operand is printed as "mem".
+ StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
+ StringRef Src1Name =
+ SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem";
+ StringRef Src2Name =
+ SrcOp2.isReg() ? GetRegisterName(SrcOp2.getReg()) : "mem";
+
+ // One source operand, fix the mask to print all elements in one span.
+ SmallVector<int, 8> ShuffleMask(Mask.begin(), Mask.end());
+ if (Src1Name == Src2Name)
+ for (int i = 0, e = ShuffleMask.size(); i != e; ++i)
+ if (ShuffleMask[i] >= e) // indices >= e refer to the second source; fold them onto the first
+ ShuffleMask[i] -= e;
+
+ raw_string_ostream CS(Comment);
+ CS << DstName;
+
+ // Handle AVX512 MASK/MASKZ write mask comments.
+ // MASK: zmmX {%kY}
+ // MASKZ: zmmX {%kY} {z}
+ if (SrcOp1Idx > 1) {
+ assert((SrcOp1Idx == 2 || SrcOp1Idx == 3) && "Unexpected writemask");
+ // The write mask is the operand immediately before the first source.
+ const MachineOperand &WriteMaskOp = MI->getOperand(SrcOp1Idx - 1);
+ if (WriteMaskOp.isReg()) {
+ CS << " {%" << GetRegisterName(WriteMaskOp.getReg()) << "}";
+
+ if (SrcOp1Idx == 2) { // first source at index 2 is treated as the zeroing (MASKZ) form
+ CS << " {z}";
+ }
+ }
+ }
+
+ CS << " = ";
+ // Walk the mask, printing each maximal run of elements taken from the same source as "name[i,j,...]".
+ for (int i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if (i != 0)
+ CS << ",";
+ if (ShuffleMask[i] == SM_SentinelZero) {
+ CS << "zero";
+ continue;
+ }
+
+ // Otherwise, it must come from src1 or src2. Print the span of elements
+ // that comes from this src.
+ bool isSrc1 = ShuffleMask[i] < (int)e;
+ CS << (isSrc1 ? Src1Name : Src2Name) << '[';
+
+ bool IsFirst = true;
+ while (i != e && ShuffleMask[i] != SM_SentinelZero &&
+ (ShuffleMask[i] < (int)e) == isSrc1) {
+ if (!IsFirst)
+ CS << ',';
+ else
+ IsFirst = false;
+ if (ShuffleMask[i] == SM_SentinelUndef)
+ CS << "u";
+ else
+ CS << ShuffleMask[i] % (int)e; // index within the chosen source
+ ++i;
+ }
+ CS << ']';
+ --i; // For loop increments element #.
+ }
+ CS.flush();
+
+ return Comment;
+}
+
+void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ X86MCInstLower MCInstLowering(*MF, *this);
+ const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+
+ // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
+ // are compressed from EVEX encoding to VEX encoding.
+ if (TM.Options.MCOptions.ShowMCEncoding) {
+ if (MI->getAsmPrinterFlags() & AC_EVEX_2_VEX)
+ OutStreamer->AddComment("EVEX TO VEX Compression ", false);
+ }
+
+ switch (MI->getOpcode()) {
+ case TargetOpcode::DBG_VALUE:
+ llvm_unreachable("Should be handled target independently");
+
+ // Emit nothing here but a comment if we can.
+ case X86::Int_MemBarrier:
+ OutStreamer->emitRawComment("MEMBARRIER");
+ return;
+
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ // Lower these as normal, but add some comments.
+ unsigned Reg = MI->getOperand(0).getReg();
+ OutStreamer->AddComment(StringRef("eh_return, addr: %") +
+ X86ATTInstPrinter::getRegisterName(Reg));
+ break;
+ }
+ case X86::CLEANUPRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CLEANUPRET");
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CATCHRET");
+ break;
+ }
+
+ case X86::TAILJMPr:
+ case X86::TAILJMPm:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPd64:
+ case X86::TAILJMPd64_CC:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("TAILCALL");
+ break;
+
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ return LowerTlsAddr(MCInstLowering, *MI);
+
+ case X86::MOVPC32r: {
+ // This is a pseudo op for a two instruction sequence with a label, which
+ // looks like:
+ // call "L1$pb"
+ // "L1$pb":
+ // popl %esi
+
+ // Emit the call.
+ MCSymbol *PICBase = MF->getPICBaseSymbol();
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
+ const X86FrameLowering* FrameLowering =
+ MF->getSubtarget<X86Subtarget>().getFrameLowering();
+ bool hasFP = FrameLowering->hasFP(*MF);
+
+ // TODO: This is needed only if we require precise CFA.
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
+ !OutStreamer->getDwarfFrameInfos().back().End;
+
+ int stackGrowth = -RI->getSlotSize();
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
+ }
+
+ // Emit the label.
+ OutStreamer->EmitLabel(PICBase);
+
+ // popl $reg
+ EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
+ .addReg(MI->getOperand(0).getReg()));
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
+ }
+ return;
+ }
+
+ case X86::ADD32ri: {
+ // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+ if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ break;
+
+ // Okay, we have something like:
+ // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+ // For this, we want to print something like:
+ // MYGLOBAL + (. - PICBASE)
+ // However, we can't generate a ".", so just emit a new label here and refer
+ // to it.
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(DotSym);
+
+ // Now that we have emitted the label, lower the complex operand expression.
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+ const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+ const MCExpr *PICBase =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+ DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
+
+ DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext),
+ DotExpr, OutContext);
+
+ EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(DotExpr));
+ return;
+ }
+ case TargetOpcode::STATEPOINT:
+ return LowerSTATEPOINT(*MI, MCInstLowering);
+
+ case TargetOpcode::FAULTING_LOAD_OP:
+ return LowerFAULTING_LOAD_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_OP:
+ return LowerPATCHABLE_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(*MI);
+
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+ return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_RET:
+ return LowerPATCHABLE_RET(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_TAIL_CALL:
+ return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
+
+ case X86::MORESTACK_RET:
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ return;
+
+ case X86::MORESTACK_RET_RESTORE_R10:
+ // Return, then restore R10.
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
+ .addReg(X86::R10)
+ .addReg(X86::RAX));
+ return;
+
+ case X86::SEH_PushReg:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
+ return;
+
+ case X86::SEH_SaveReg:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_SaveXMM:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_StackAlloc:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
+ return;
+
+ case X86::SEH_SetFrame:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_PushFrame:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
+ return;
+
+ case X86::SEH_EndPrologue:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ OutStreamer->EmitWinCFIEndProlog();
+ return;
+
+ case X86::SEH_Epilogue: {
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ MachineBasicBlock::const_iterator MBBI(MI);
+ // Check if preceded by a call and emit nop if so.
+ for (MBBI = PrevCrossBBInst(MBBI);
+ MBBI != MachineBasicBlock::const_iterator();
+ MBBI = PrevCrossBBInst(MBBI)) {
+ // Conservatively assume that pseudo instructions don't emit code and keep
+ // looking for a call. We may emit an unnecessary nop in some cases.
+ if (!MBBI->isPseudo()) {
+ if (MBBI->isCall())
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ break;
+ }
+ }
+ return;
+ }
+
+ // Lower PSHUFB and VPERMILP normally but add a comment if we can find
+ // a constant shuffle mask. We won't be able to do this at the MC layer
+ // because the mask isn't an immediate.
+ case X86::PSHUFBrm:
+ case X86::VPSHUFBrm:
+ case X86::VPSHUFBYrm:
+ case X86::VPSHUFBZ128rm:
+ case X86::VPSHUFBZ128rmk:
+ case X86::VPSHUFBZ128rmkz:
+ case X86::VPSHUFBZ256rm:
+ case X86::VPSHUFBZ256rmk:
+ case X86::VPSHUFBZ256rmkz:
+ case X86::VPSHUFBZrm:
+ case X86::VPSHUFBZrmk:
+ case X86::VPSHUFBZrmkz: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ unsigned SrcIdx, MaskIdx;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::PSHUFBrm:
+ case X86::VPSHUFBrm:
+ case X86::VPSHUFBYrm:
+ case X86::VPSHUFBZ128rm:
+ case X86::VPSHUFBZ256rm:
+ case X86::VPSHUFBZrm:
+ SrcIdx = 1; MaskIdx = 5; break;
+ case X86::VPSHUFBZ128rmkz:
+ case X86::VPSHUFBZ256rmkz:
+ case X86::VPSHUFBZrmkz:
+ SrcIdx = 2; MaskIdx = 6; break;
+ case X86::VPSHUFBZ128rmk:
+ case X86::VPSHUFBZ256rmk:
+ case X86::VPSHUFBZrmk:
+ SrcIdx = 3; MaskIdx = 7; break;
+ }
+
+ assert(MI->getNumOperands() >= 6 &&
+ "We should always have at least 6 operands!");
+
+ const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ SmallVector<int, 64> Mask;
+ DecodePSHUFBMask(C, Mask);
+ if (!Mask.empty())
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ }
+ break;
+ }
+
+ case X86::VPERMILPSrm:
+ case X86::VPERMILPSYrm:
+ case X86::VPERMILPSZ128rm:
+ case X86::VPERMILPSZ128rmk:
+ case X86::VPERMILPSZ128rmkz:
+ case X86::VPERMILPSZ256rm:
+ case X86::VPERMILPSZ256rmk:
+ case X86::VPERMILPSZ256rmkz:
+ case X86::VPERMILPSZrm:
+ case X86::VPERMILPSZrmk:
+ case X86::VPERMILPSZrmkz:
+ case X86::VPERMILPDrm:
+ case X86::VPERMILPDYrm:
+ case X86::VPERMILPDZ128rm:
+ case X86::VPERMILPDZ128rmk:
+ case X86::VPERMILPDZ128rmkz:
+ case X86::VPERMILPDZ256rm:
+ case X86::VPERMILPDZ256rmk:
+ case X86::VPERMILPDZ256rmkz:
+ case X86::VPERMILPDZrm:
+ case X86::VPERMILPDZrmk:
+ case X86::VPERMILPDZrmkz: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ unsigned SrcIdx, MaskIdx;
+ unsigned ElSize;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VPERMILPSrm:
+ case X86::VPERMILPSYrm:
+ case X86::VPERMILPSZ128rm:
+ case X86::VPERMILPSZ256rm:
+ case X86::VPERMILPSZrm:
+ SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
+ case X86::VPERMILPSZ128rmkz:
+ case X86::VPERMILPSZ256rmkz:
+ case X86::VPERMILPSZrmkz:
+ SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
+ case X86::VPERMILPSZ128rmk:
+ case X86::VPERMILPSZ256rmk:
+ case X86::VPERMILPSZrmk:
+ SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
+ case X86::VPERMILPDrm:
+ case X86::VPERMILPDYrm:
+ case X86::VPERMILPDZ128rm:
+ case X86::VPERMILPDZ256rm:
+ case X86::VPERMILPDZrm:
+ SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
+ case X86::VPERMILPDZ128rmkz:
+ case X86::VPERMILPDZ256rmkz:
+ case X86::VPERMILPDZrmkz:
+ SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
+ case X86::VPERMILPDZ128rmk:
+ case X86::VPERMILPDZ256rmk:
+ case X86::VPERMILPDZrmk:
+ SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
+ }
+
+ assert(MI->getNumOperands() >= 6 &&
+ "We should always have at least 6 operands!");
+
+ const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ SmallVector<int, 16> Mask;
+ DecodeVPERMILPMask(C, ElSize, Mask);
+ if (!Mask.empty())
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ }
+ break;
+ }
+
+ case X86::VPERMIL2PDrm:
+ case X86::VPERMIL2PSrm:
+ case X86::VPERMIL2PDrmY:
+ case X86::VPERMIL2PSrmY: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ assert(MI->getNumOperands() >= 8 &&
+ "We should always have at least 8 operands!");
+
+ const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
+ if (!CtrlOp.isImm())
+ break;
+
+ unsigned ElSize;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VPERMIL2PSrm: case X86::VPERMIL2PSrmY: ElSize = 32; break;
+ case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break;
+ }
+
+ const MachineOperand &MaskOp = MI->getOperand(6);
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ SmallVector<int, 16> Mask;
+ DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
+ if (!Mask.empty())
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ }
+ break;
+ }
+
+ case X86::VPPERMrrm: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ assert(MI->getNumOperands() >= 7 &&
+ "We should always have at least 7 operands!");
+
+ const MachineOperand &MaskOp = MI->getOperand(6);
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ SmallVector<int, 16> Mask;
+ DecodeVPPERMMask(C, Mask);
+ if (!Mask.empty())
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ }
+ break;
+ }
+
+#define MOV_CASE(Prefix, Suffix) \
+ case X86::Prefix##MOVAPD##Suffix##rm: \
+ case X86::Prefix##MOVAPS##Suffix##rm: \
+ case X86::Prefix##MOVUPD##Suffix##rm: \
+ case X86::Prefix##MOVUPS##Suffix##rm: \
+ case X86::Prefix##MOVDQA##Suffix##rm: \
+ case X86::Prefix##MOVDQU##Suffix##rm:
+
+#define MOV_AVX512_CASE(Suffix) \
+ case X86::VMOVDQA64##Suffix##rm: \
+ case X86::VMOVDQA32##Suffix##rm: \
+ case X86::VMOVDQU64##Suffix##rm: \
+ case X86::VMOVDQU32##Suffix##rm: \
+ case X86::VMOVDQU16##Suffix##rm: \
+ case X86::VMOVDQU8##Suffix##rm: \
+ case X86::VMOVAPS##Suffix##rm: \
+ case X86::VMOVAPD##Suffix##rm: \
+ case X86::VMOVUPS##Suffix##rm: \
+ case X86::VMOVUPD##Suffix##rm:
+
+#define CASE_ALL_MOV_RM() \
+ MOV_CASE(, ) /* SSE */ \
+ MOV_CASE(V, ) /* AVX-128 */ \
+ MOV_CASE(V, Y) /* AVX-256 */ \
+ MOV_AVX512_CASE(Z) \
+ MOV_AVX512_CASE(Z256) \
+ MOV_AVX512_CASE(Z128)
+
+ // For loads from a constant pool to a vector register, print the constant
+ // loaded.
+ CASE_ALL_MOV_RM()
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ if (MI->getNumOperands() <= 4)
+ break;
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ const MachineOperand &DstOp = MI->getOperand(0);
+ CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+ if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+ CS << "[";
+ for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
+ if (i != 0)
+ CS << ",";
+ if (CDS->getElementType()->isIntegerTy())
+ CS << CDS->getElementAsInteger(i);
+ else if (CDS->getElementType()->isFloatTy())
+ CS << CDS->getElementAsFloat(i);
+ else if (CDS->getElementType()->isDoubleTy())
+ CS << CDS->getElementAsDouble(i);
+ else
+ CS << "?";
+ }
+ CS << "]";
+ OutStreamer->AddComment(CS.str());
+ } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+ CS << "<";
+ for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
+ if (i != 0)
+ CS << ",";
+ Constant *COp = CV->getOperand(i);
+ if (isa<UndefValue>(COp)) {
+ CS << "u";
+ } else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
+ if (CI->getBitWidth() <= 64) {
+ CS << CI->getZExtValue();
+ } else {
+ // print multi-word constant as (w0,w1)
+ const auto &Val = CI->getValue();
+ CS << "(";
+ for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
+ if (i > 0)
+ CS << ",";
+ CS << Val.getRawData()[i];
+ }
+ CS << ")";
+ }
+ } else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
+ SmallString<32> Str;
+ CF->getValueAPF().toString(Str);
+ CS << Str;
+ } else {
+ CS << "?";
+ }
+ }
+ CS << ">";
+ OutStreamer->AddComment(CS.str());
+ }
+ }
+ break;
+ }
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+
+ // Stackmap shadows cannot include branch targets, so we can count the bytes
+ // in a call towards the shadow, but must ensure that no thread returns
+ // into the stackmap shadow. The only way to achieve this is if the call
+ // is at the end of the shadow.
+ if (MI->isCall()) {
+ // Count the size of the call towards the shadow.
+ SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
+ // Then flush the shadow so that we fill with nops before the call, not
+ // after it.
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+ // Then emit the call
+ OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo());
+ return;
+ }
+
+ EmitAndCountInstruction(TmpInst);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..c9e636f1eb00
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -0,0 +1,33 @@
+//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+ // Out-of-line definition of the class's virtual anchor() method; the body is
+ // intentionally empty.
+ // NOTE(review): presumably this exists only to give the class a single home
+ // translation unit for its vtable -- confirm against the project conventions.
+ void X86MachineFunctionInfo::anchor() { }
+
+ /// Compute RestoreBasePointerOffset for \p MF if it has not been computed yet.
+ ///
+ /// Starting from zero, one slot size is subtracted for every callee-saved
+ /// register of \p MF that belongs to the GR64 or GR32 register class. A
+ /// non-zero offset that is already stored is left untouched.
+ void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
+   // Nothing to do when the offset has already been established.
+   if (RestoreBasePointerOffset)
+     return;
+
+   const auto *X86RI = static_cast<const X86RegisterInfo *>(
+       MF->getSubtarget().getRegisterInfo());
+   const unsigned BytesPerSlot = X86RI->getSlotSize();
+
+   // The callee-saved register list is terminated by a zero entry.
+   const MCPhysReg *SavedReg = X86RI->X86RegisterInfo::getCalleeSavedRegs(MF);
+   for (; *SavedReg; ++SavedReg) {
+     const unsigned Reg = *SavedReg;
+     const bool IsGPR =
+         X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg);
+     if (IsGPR)
+       RestoreBasePointerOffset -= BytesPerSlot;
+   }
+ }
+
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 000000000000..d517d82537a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,185 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineValueType.h"
+
+namespace llvm {
+
+ /// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo and
+ /// contains private X86 target-specific information for each MachineFunction.
+ /// Every field has an in-class default, and each is exposed through the
+ /// trivial accessors in the public section below.
+ class X86MachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+ /// ForceFramePointer - True if the function is required to use a frame
+ /// pointer for reasons other than it containing dynamic allocation or
+ /// that FP elimination is turned off. For example, Cygwin main function
+ /// contains stack pointer re-alignment code which requires FP.
+ bool ForceFramePointer = false;
+
+ /// RestoreBasePointerOffset - Non-zero if the function has base pointer
+ /// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a
+ /// displacement from the frame pointer to a slot where the base pointer
+ /// is stashed.
+ /// Note that the storage type is a signed char, so only small displacements
+ /// are representable.
+ signed char RestoreBasePointerOffset = 0;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize = 0;
+
+ /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
+ /// to the space used by the return address).
+ /// Used on windows platform for stdcall & fastcall name decoration
+ unsigned BytesToPopOnReturn = 0;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex = 0;
+
+ /// FrameAddrIndex - FrameIndex read and written through getFAIndex() /
+ /// setFAIndex().
+ /// NOTE(review): the previous comment was a copy of ReturnAddrIndex's
+ /// ("FrameIndex for return slot"); presumably this is the frame index of the
+ /// frame address slot -- confirm.
+ int FrameAddrIndex = 0;
+
+ /// TailCallReturnAddrDelta - The number of bytes by which return address
+ /// stack slot is moved as the result of tail call optimization.
+ int TailCallReturnAddrDelta = 0;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg = 0;
+
+ /// GlobalBaseReg - keeps track of the virtual register initialized for
+ /// use as the global base register. This is used for PIC in some PIC
+ /// relocation models.
+ unsigned GlobalBaseReg = 0;
+
+ /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+ int VarArgsFrameIndex = 0;
+ /// RegSaveFrameIndex - X86-64 vararg func register save area.
+ int RegSaveFrameIndex = 0;
+ /// VarArgsGPOffset - X86-64 vararg func int reg offset.
+ unsigned VarArgsGPOffset = 0;
+ /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
+ unsigned VarArgsFPOffset = 0;
+ /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
+ /// being passed on the stack.
+ unsigned ArgumentStackSize = 0;
+ /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+ unsigned NumLocalDynamics = 0;
+ /// HasPushSequences - Keeps track of whether this function uses sequences
+ /// of pushes to pass function parameters.
+ bool HasPushSequences = false;
+
+ /// True if the function recovers from an SEH exception, and therefore needs
+ /// to spill and restore the frame pointer.
+ bool HasSEHFramePtrSave = false;
+
+ /// The frame index of a stack object containing the original frame pointer
+ /// used to address arguments in a function using a base pointer.
+ int SEHFramePtrSaveIndex = 0;
+
+ /// True if this function has a subset of CSRs that is handled explicitly via
+ /// copies.
+ bool IsSplitCSR = false;
+
+ /// True if this function uses the red zone.
+ bool UsesRedZone = false;
+
+ /// True if this function has WIN_ALLOCA instructions.
+ bool HasWinAlloca = false;
+
+ // The members above are already private (class default access); this label
+ // only makes that explicit for the member that follows.
+ private:
+ /// ForwardedMustTailRegParms - A list of virtual and physical registers
+ /// that must be forwarded to every musttail call.
+ SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
+
+ public:
+ /// Default construction leaves every field at its in-class default.
+ X86MachineFunctionInfo() = default;
+
+ /// \p MF is not used; every field keeps its in-class default.
+ explicit X86MachineFunctionInfo(MachineFunction &MF) {}
+
+ bool getForceFramePointer() const { return ForceFramePointer;}
+ void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+ bool getHasPushSequences() const { return HasPushSequences; }
+ void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
+ /// True once a non-zero RestoreBasePointerOffset has been recorded.
+ bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
+ /// Defined out of line in X86MachineFunctionInfo.cpp.
+ void setRestoreBasePointer(const MachineFunction *MF);
+ /// Returns the stored signed char offset widened to int.
+ int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+ void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+ int getRAIndex() const { return ReturnAddrIndex; }
+ void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+ int getFAIndex() const { return FrameAddrIndex; }
+ void setFAIndex(int Index) { FrameAddrIndex = Index; }
+
+ int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+ void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+
+ int getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+ void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; }
+
+ unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; }
+ void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; }
+
+ unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
+ void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+
+ unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+ void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+ unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+ /// Increments the local-dynamic TLS access counter by one.
+ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
+ bool getHasSEHFramePtrSave() const { return HasSEHFramePtrSave; }
+ void setHasSEHFramePtrSave(bool V) { HasSEHFramePtrSave = V; }
+
+ int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; }
+ void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; }
+
+ /// Returns a mutable reference to the forwarded-register list, so callers
+ /// can both read and append to it.
+ SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
+ return ForwardedMustTailRegParms;
+ }
+
+ bool isSplitCSR() const { return IsSplitCSR; }
+ void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+ bool getUsesRedZone() const { return UsesRedZone; }
+ void setUsesRedZone(bool V) { UsesRedZone = V; }
+
+ bool hasWinAlloca() const { return HasWinAlloca; }
+ void setHasWinAlloca(bool v) { HasWinAlloca = v; }
+ };
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
new file mode 100644
index 000000000000..e1447006cd18
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -0,0 +1,645 @@
+//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that performs some optimizations with LEA
+// instructions in order to improve performance and code size.
+// Currently, it does two things:
+// 1) If there are two LEA instructions calculating addresses which only differ
+// by displacement inside a basic block, one of them is removed.
+// 2) Address calculations in load and store instructions are replaced by
+// existing LEA def registers where possible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-optimize-LEAs"
+
+ // Hidden boolean command-line option "disable-x86-lea-opt"; defaults to false.
+ // NOTE(review): the code that consults this flag is not visible in this part
+ // of the file -- presumably the pass entry point checks it; confirm.
+ static cl::opt<bool>
+ DisableX86LEAOpt("disable-x86-lea-opt", cl::Hidden,
+ cl::desc("X86: Disable LEA optimizations."),
+ cl::init(false));
+
+ // Statistic counters for this pass, registered under DEBUG_TYPE.
+ STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
+ STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed");
+
+ // The three helpers below are only declared here; their definitions follow
+ // later in this file. The declarations are needed because MemOpKey and
+ // getMemOpKey use them before they are defined.
+
+ /// \brief Returns true if two machine operands are identical and they are not
+ /// physical registers.
+ static inline bool isIdenticalOp(const MachineOperand &MO1,
+ const MachineOperand &MO2);
+
+ /// \brief Returns true if two address displacement operands are of the same
+ /// type and use the same symbol/index/address regardless of the offset.
+ static bool isSimilarDispOp(const MachineOperand &MO1,
+ const MachineOperand &MO2);
+
+ /// \brief Returns true if the instruction is LEA.
+ static inline bool isLEA(const MachineInstr &MI);
+
+ namespace {
+ /// Hash-map key describing the address computed by an instruction's memory
+ /// operands. The key stores pointers to the operands; it does not own them.
+ class MemOpKey {
+ public:
+   MemOpKey(const MachineOperand *Base, const MachineOperand *Scale,
+            const MachineOperand *Index, const MachineOperand *Segment,
+            const MachineOperand *Disp)
+       : Operands{Base, Scale, Index, Segment}, Disp(Disp) {}
+
+   /// Two keys are equal when base, scale, index and segment are identical and
+   /// the displacements are "similar".
+   bool operator==(const MemOpKey &Other) const {
+     // Base, scale, index and segment have to match exactly.
+     for (unsigned Idx = 0; Idx != 4; ++Idx) {
+       const MachineOperand &Mine = *Operands[Idx];
+       const MachineOperand &Theirs = *Other.Operands[Idx];
+       if (!isIdenticalOp(Mine, Theirs))
+         return false;
+     }
+
+     // Displacements only need to name the same symbol/index/address; any
+     // difference in immediates or offsets is compensated for when the
+     // instruction is substituted.
+     return isSimilarDispOp(*Disp, *Other.Disp);
+   }
+
+   // Base, scale, index and segment operands of the address, in that order.
+   const MachineOperand *Operands[4];
+
+   // Displacement operand of the address.
+   const MachineOperand *Disp;
+ };
+ } // end anonymous namespace
+
+ /// DenseMapInfo specialization that lets MemOpKey be used as a DenseMap key.
+ namespace llvm {
+ template <> struct DenseMapInfo<MemOpKey> {
+   using PtrInfo = DenseMapInfo<const MachineOperand *>;
+
+   /// The empty key has every operand pointer set to the pointer empty key.
+   static inline MemOpKey getEmptyKey() {
+     const MachineOperand *const Empty = PtrInfo::getEmptyKey();
+     return MemOpKey(Empty, Empty, Empty, Empty, Empty);
+   }
+
+   /// The tombstone key has every operand pointer set to the pointer tombstone.
+   static inline MemOpKey getTombstoneKey() {
+     const MachineOperand *const Tombstone = PtrInfo::getTombstoneKey();
+     return MemOpKey(Tombstone, Tombstone, Tombstone, Tombstone, Tombstone);
+   }
+
+   /// Hash a real (non-sentinel) key.
+   static unsigned getHashValue(const MemOpKey &Val) {
+     // All fields of a sentinel key hold the same sentinel pointer, so looking
+     // at Disp alone is sufficient to recognise one.
+     assert(Val.Disp != PtrInfo::getEmptyKey() && "Cannot hash the empty key");
+     assert(Val.Disp != PtrInfo::getTombstoneKey() &&
+            "Cannot hash the tombstone key");
+
+     hash_code Hash = hash_combine(*Val.Operands[0], *Val.Operands[1],
+                                   *Val.Operands[2], *Val.Operands[3]);
+
+     // An immediate displacement must not contribute to the hash, so that
+     // memory operands which differ only by an immediate displacement hash
+     // alike. For every other kind of displacement the referenced
+     // symbol/index/address is mixed in.
+     const MachineOperand &DispOp = *Val.Disp;
+     switch (DispOp.getType()) {
+     case MachineOperand::MO_Immediate:
+       break;
+     case MachineOperand::MO_ConstantPoolIndex:
+     case MachineOperand::MO_JumpTableIndex:
+       Hash = hash_combine(Hash, DispOp.getIndex());
+       break;
+     case MachineOperand::MO_ExternalSymbol:
+       Hash = hash_combine(Hash, DispOp.getSymbolName());
+       break;
+     case MachineOperand::MO_GlobalAddress:
+       Hash = hash_combine(Hash, DispOp.getGlobal());
+       break;
+     case MachineOperand::MO_BlockAddress:
+       Hash = hash_combine(Hash, DispOp.getBlockAddress());
+       break;
+     case MachineOperand::MO_MCSymbol:
+       Hash = hash_combine(Hash, DispOp.getMCSymbol());
+       break;
+     case MachineOperand::MO_MachineBasicBlock:
+       Hash = hash_combine(Hash, DispOp.getMBB());
+       break;
+     default:
+       llvm_unreachable("Invalid address displacement operand");
+     }
+
+     return (unsigned)Hash;
+   }
+
+   /// Compare two keys, treating the sentinel keys specially so that their
+   /// operand pointers are never dereferenced when RHS is a sentinel.
+   static bool isEqual(const MemOpKey &LHS, const MemOpKey &RHS) {
+     // When RHS is a sentinel, LHS matches only if it is the same sentinel;
+     // comparing the Disp pointers is enough to decide that.
+     const bool RHSIsSentinel = RHS.Disp == PtrInfo::getEmptyKey() ||
+                                RHS.Disp == PtrInfo::getTombstoneKey();
+     if (RHSIsSentinel)
+       return LHS.Disp == RHS.Disp;
+     return LHS == RHS;
+   }
+ };
+ }
+
+ /// \brief Builds the hash table key for the memory operands of \p MI.
+ /// \p N is the index of the first memory operand of \p MI; the individual
+ /// address components are found at fixed offsets from it.
+ static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
+   assert((isLEA(MI) || MI.mayLoadOrStore()) &&
+          "The instruction must be a LEA, a load or a store");
+   const MachineOperand *BaseOp = &MI.getOperand(N + X86::AddrBaseReg);
+   const MachineOperand *ScaleOp = &MI.getOperand(N + X86::AddrScaleAmt);
+   const MachineOperand *IndexOp = &MI.getOperand(N + X86::AddrIndexReg);
+   const MachineOperand *SegmentOp = &MI.getOperand(N + X86::AddrSegmentReg);
+   const MachineOperand *DispOp = &MI.getOperand(N + X86::AddrDisp);
+   return MemOpKey(BaseOp, ScaleOp, IndexOp, SegmentOp, DispOp);
+ }
+
+ // Operands count as identical when isIdenticalTo() agrees and, if they are
+ // registers, the register is not a physical one.
+ static inline bool isIdenticalOp(const MachineOperand &MO1,
+                                  const MachineOperand &MO2) {
+   if (!MO1.isIdenticalTo(MO2))
+     return false;
+   if (!MO1.isReg())
+     return true;
+   return !TargetRegisterInfo::isPhysicalRegister(MO1.getReg());
+ }
+
+ #ifndef NDEBUG
+ // Assertion-only helper: true when \p MO is one of the operand kinds accepted
+ // as an address displacement by this pass.
+ static bool isValidDispOp(const MachineOperand &MO) {
+   if (MO.isImm() || MO.isCPI() || MO.isJTI() || MO.isSymbol())
+     return true;
+   if (MO.isGlobal() || MO.isBlockAddress() || MO.isMCSymbol())
+     return true;
+   return MO.isMBB();
+ }
+ #endif
+
+ // Two displacement operands are "similar" when they are of the same kind and,
+ // for non-immediates, reference the same index/symbol/global/block. Immediate
+ // values and offsets are deliberately not compared.
+ static bool isSimilarDispOp(const MachineOperand &MO1,
+                             const MachineOperand &MO2) {
+   assert(isValidDispOp(MO1) && isValidDispOp(MO2) &&
+          "Address displacement operand is not valid");
+
+   // Any two immediates are similar, whatever their values.
+   if (MO1.isImm() && MO2.isImm())
+     return true;
+
+   // Constant-pool and jump-table entries are matched by index.
+   if (MO1.isCPI() && MO2.isCPI() && MO1.getIndex() == MO2.getIndex())
+     return true;
+   if (MO1.isJTI() && MO2.isJTI() && MO1.getIndex() == MO2.getIndex())
+     return true;
+
+   // Everything else is matched by the entity it refers to.
+   if (MO1.isSymbol() && MO2.isSymbol() &&
+       MO1.getSymbolName() == MO2.getSymbolName())
+     return true;
+   if (MO1.isGlobal() && MO2.isGlobal() && MO1.getGlobal() == MO2.getGlobal())
+     return true;
+   if (MO1.isBlockAddress() && MO2.isBlockAddress() &&
+       MO1.getBlockAddress() == MO2.getBlockAddress())
+     return true;
+   if (MO1.isMCSymbol() && MO2.isMCSymbol() &&
+       MO1.getMCSymbol() == MO2.getMCSymbol())
+     return true;
+
+   return MO1.isMBB() && MO2.isMBB() && MO1.getMBB() == MO2.getMBB();
+ }
+
+ // True for the four LEA opcodes handled by this pass.
+ static inline bool isLEA(const MachineInstr &MI) {
+   switch (MI.getOpcode()) {
+   case X86::LEA16r:
+   case X86::LEA32r:
+   case X86::LEA64r:
+   case X86::LEA64_32r:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ namespace {
+ /// Machine function pass that reuses addresses already computed by LEA
+ /// instructions and removes LEAs that have become redundant.
+ class OptimizeLEAPass : public MachineFunctionPass {
+ public:
+   OptimizeLEAPass() : MachineFunctionPass(ID) {}
+
+   StringRef getPassName() const override { return "X86 LEA Optimize"; }
+
+   /// \brief Loop over all of the basic blocks, replacing address
+   /// calculations in load and store instructions, if it's already
+   /// been calculated by LEA. Also, remove redundant LEAs.
+   bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+   /// Maps an address (see MemOpKey) to the LEA instructions computing it.
+   typedef DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>> MemOpMap;
+
+   /// \brief Returns a distance between two instructions inside one basic block.
+   /// Negative result means, that instructions occur in reverse order.
+   int calcInstrDist(const MachineInstr &First, const MachineInstr &Last);
+
+   /// \brief Choose the best \p LEA instruction from the \p List to replace
+   /// address calculation in \p MI instruction. Return the address displacement
+   /// and the distance between \p MI and the chosen \p BestLEA in
+   /// \p AddrDispShift and \p Dist.
+   bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
+                      const MachineInstr &MI, MachineInstr *&BestLEA,
+                      int64_t &AddrDispShift, int &Dist);
+
+   /// \brief Returns the difference between addresses' displacements of \p MI1
+   /// and \p MI2. The numbers of the first memory operands for the instructions
+   /// are specified through \p N1 and \p N2.
+   int64_t getAddrDispShift(const MachineInstr &MI1, unsigned N1,
+                            const MachineInstr &MI2, unsigned N2) const;
+
+   /// \brief Returns true if the \p Last LEA instruction can be replaced by the
+   /// \p First. The difference between displacements of the addresses calculated
+   /// by these LEAs is returned in \p AddrDispShift. It'll be used for proper
+   /// replacement of the \p Last LEA's uses with the \p First's def register.
+   bool isReplaceable(const MachineInstr &First, const MachineInstr &Last,
+                      int64_t &AddrDispShift) const;
+
+   /// \brief Find all LEA instructions in the basic block. Also, assign position
+   /// numbers to all instructions in the basic block to speed up calculation of
+   /// distance between them.
+   void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs);
+
+   /// \brief Removes redundant address calculations.
+   bool removeRedundantAddrCalc(MemOpMap &LEAs);
+
+   /// \brief Removes LEAs which calculate similar addresses.
+   bool removeRedundantLEAs(MemOpMap &LEAs);
+
+   /// Position number of each instruction, filled in by findLEAs().
+   DenseMap<const MachineInstr *, unsigned> InstrPos;
+
+   // Per-function context. Initialised to null so that a use before the pass
+   // has set them up is a null dereference rather than a read of an
+   // indeterminate pointer.
+   MachineRegisterInfo *MRI = nullptr;
+   const X86InstrInfo *TII = nullptr;
+   const X86RegisterInfo *TRI = nullptr;
+
+   static char ID;
+ };
+ char OptimizeLEAPass::ID = 0;
+ }
+
+ // Factory for the LEA optimization pass: returns a newly heap-allocated
+ // OptimizeLEAPass.
+ // NOTE(review): presumably the caller (pass manager) takes ownership of the
+ // returned object -- confirm.
+ FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }
+
+ // Distance from First to Last in recorded position numbers; negative when
+ // Last precedes First.
+ int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
+                                    const MachineInstr &Last) {
+   // The instructions have to share a basic block and both need a recorded
+   // position in InstrPos.
+   assert(Last.getParent() == First.getParent() &&
+          "Instructions are in different basic blocks");
+   assert(InstrPos.find(&First) != InstrPos.end() &&
+          InstrPos.find(&Last) != InstrPos.end() &&
+          "Instructions' positions are undefined");
+
+   const unsigned LastPos = InstrPos[&Last];
+   const unsigned FirstPos = InstrPos[&First];
+   return LastPos - FirstPos;
+ }
+
+ // Find the best LEA instruction in the List to replace address recalculation in
+ // MI. Such LEA must meet these requirements:
+ // 1) The address calculated by the LEA differs only by the displacement from
+ // the address used in MI.
+ // 2) The register class of the definition of the LEA is compatible with the
+ // register class of the address base register of MI.
+ // 3) Displacement of the new memory operand should fit in 1 byte if possible.
+ // 4) The LEA should be as close to MI as possible, and prior to it if
+ // possible.
+ //
+ // Returns true when a suitable LEA was found. In that case BestLEA,
+ // AddrDispShift and Dist describe the chosen LEA. On failure BestLEA is
+ // nullptr and AddrDispShift / Dist are left unwritten.
+ // The memory operand number of MI is computed here without checking for a
+ // negative result from getMemoryOperandNo(), so MI must have a memory operand.
+ bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
+ const MachineInstr &MI,
+ MachineInstr *&BestLEA,
+ int64_t &AddrDispShift, int &Dist) {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) +
+ X86II::getOperandBias(Desc);
+
+ BestLEA = nullptr;
+
+ // Loop over all LEA instructions.
+ for (auto DefMI : List) {
+ // Get new address displacement.
+ // The LEA's memory operands are taken to start at operand 1 (operand 0 is
+ // used below as its def register).
+ int64_t AddrDispShiftTemp = getAddrDispShift(MI, MemOpNo, *DefMI, 1);
+
+ // Make sure address displacement fits 4 bytes.
+ if (!isInt<32>(AddrDispShiftTemp))
+ continue;
+
+ // Check that LEA def register can be used as MI address base. Some
+ // instructions can use a limited set of registers as address base, for
+ // example MOV8mr_NOREX. We could constrain the register class of the LEA
+ // def to suit MI, however since this case is very rare and hard to
+ // reproduce in a test it's just more reliable to skip the LEA.
+ if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) !=
+ MRI->getRegClass(DefMI->getOperand(0).getReg()))
+ continue;
+
+ // Choose the closest LEA instruction from the list, prior to MI if
+ // possible. Note that we took into account resulting address displacement
+ // as well. Also note that the list is sorted by the order in which the LEAs
+ // occur, so the break condition is pretty simple.
+ int DistTemp = calcInstrDist(*DefMI, MI);
+ assert(DistTemp != 0 &&
+ "The distance between two different instructions cannot be zero");
+ // A candidate before MI (positive distance) always competes; a candidate
+ // after MI is only considered when nothing has been chosen yet.
+ if (DistTemp > 0 || BestLEA == nullptr) {
+ // Do not update return LEA, if the current one provides a displacement
+ // which fits in 1 byte, while the new candidate does not.
+ // AddrDispShift is only read here once BestLEA is set, i.e. after it has
+ // been assigned below on an earlier iteration.
+ if (BestLEA != nullptr && !isInt<8>(AddrDispShiftTemp) &&
+ isInt<8>(AddrDispShift))
+ continue;
+
+ BestLEA = DefMI;
+ AddrDispShift = AddrDispShiftTemp;
+ Dist = DistTemp;
+ }
+
+ // FIXME: Maybe we should not always stop at the first LEA after MI.
+ if (DistTemp < 0)
+ break;
+ }
+
+ return BestLEA != nullptr;
+ }
+
+ // Get the difference between the addresses' displacements of the two
+ // instructions \p MI1 and \p MI2. The numbers of the first memory operands are
+ // passed through \p N1 and \p N2.
+ //
+ // Returns (displacement of MI1) - (displacement of MI2). Both displacement
+ // operands must be "similar" in the sense of isSimilarDispOp(); this is
+ // asserted.
+ int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1,
+                                           const MachineInstr &MI2,
+                                           unsigned N2) const {
+   const MachineOperand &Op1 = MI1.getOperand(N1 + X86::AddrDisp);
+   const MachineOperand &Op2 = MI2.getOperand(N2 + X86::AddrDisp);
+
+   assert(isSimilarDispOp(Op1, Op2) &&
+          "Address displacement operands are not compatible");
+
+   // After the assert above we can be sure that both operands are of the same
+   // valid type and use the same symbol/index/address, thus displacement shift
+   // calculation is rather simple.
+   if (Op1.isImm())
+     return Op1.getImm() - Op2.getImm();
+
+   // Jump-table and basic-block operands have no offset component, so two
+   // similar operands of these kinds always denote the same displacement.
+   // getOffset() must not be called on them.
+   if (Op1.isJTI() || Op1.isMBB())
+     return 0;
+
+   return Op1.getOffset() - Op2.getOffset();
+ }
+
+ // Decide whether the Last LEA can be replaced by the First LEA. This requires:
+ // 1) The addresses computed by both LEAs differ only by displacement.
+ // 2) The def registers of both LEAs have the same register class.
+ // 3) Every use of the Last LEA's def register is replaceable, i.e. the
+ //    register is used only as an address base.
+ // AddrDispShift is written before any of the checks, so it is set even when
+ // the function returns false.
+ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
+                                     const MachineInstr &Last,
+                                     int64_t &AddrDispShift) const {
+   assert(isLEA(First) && isLEA(Last) &&
+          "The function works only with LEA instructions");
+
+   // Displacement difference between the two LEAs.
+   AddrDispShift = getAddrDispShift(Last, 1, First, 1);
+
+   // Both defs must share a register class. Some instructions (such as
+   // MOV8mr_NOREX) accept only a restricted set of registers, so swapping one
+   // LEA's def for the other must not change the class.
+   const unsigned FirstDef = First.getOperand(0).getReg();
+   const unsigned LastDef = Last.getOperand(0).getReg();
+   if (MRI->getRegClass(FirstDef) != MRI->getRegClass(LastDef))
+     return false;
+
+   // Inspect every use of the Last LEA's def: it has to be the address base of
+   // a memory access and nothing else.
+   for (auto &UseOp : MRI->use_operands(LastDef)) {
+     MachineInstr &UseMI = *UseOp.getParent();
+     const MCInstrDesc &UseDesc = UseMI.getDesc();
+
+     // A user without a memory operand rules the replacement out.
+     int FirstMemOp = X86II::getMemoryOperandNo(UseDesc.TSFlags);
+     if (FirstMemOp < 0)
+       return false;
+     FirstMemOp += X86II::getOperandBias(UseDesc);
+
+     // The address base of the user must be the LEA def register.
+     const unsigned BaseIdx = (unsigned)(FirstMemOp + X86::AddrBaseReg);
+     if (!isIdenticalOp(UseMI.getOperand(BaseIdx), UseOp))
+       return false;
+
+     // The def register must not appear in any other operand of the user.
+     for (unsigned OpIdx = 0, NumOps = UseMI.getNumOperands(); OpIdx != NumOps;
+          ++OpIdx) {
+       if (OpIdx == BaseIdx)
+         continue;
+       if (isIdenticalOp(UseMI.getOperand(OpIdx), UseOp))
+         return false;
+     }
+
+     // An immediate displacement must still fit in 4 bytes after the shift.
+     const MachineOperand &DispOp =
+         UseMI.getOperand(FirstMemOp + X86::AddrDisp);
+     if (DispOp.isImm() && !isInt<32>(DispOp.getImm() + AddrDispShift))
+       return false;
+   }
+
+   return true;
+ }
+
+ // Record a position number for every instruction of MBB in InstrPos and
+ // collect the block's LEA instructions into LEAs, keyed by their address.
+ void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) {
+   unsigned NextPos = 0;
+   for (const MachineInstr &Instr : MBB) {
+     // Positions advance in steps of two. Instructions may be moved later on,
+     // but never two of them in front of the same instruction, so the unused
+     // number in between leaves room for a moved instruction without having to
+     // renumber anything.
+     NextPos += 2;
+     InstrPos[&Instr] = NextPos;
+
+     if (!isLEA(Instr))
+       continue;
+
+     MachineInstr *LEA = const_cast<MachineInstr *>(&Instr);
+     LEAs[getMemOpKey(Instr, 1)].push_back(LEA);
+   }
+ }
+
+// Find load and store instructions that recalculate an address already
+// calculated by some LEA in LEAs and rewrite their memory operands to use that
+// LEA's def register. Returns true if at least one instruction was rewritten.
+bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
+ bool Changed = false;
+
+ assert(!LEAs.empty()); // Required: the block is taken from the first listed LEA.
+ MachineBasicBlock *MBB = (*LEAs.begin()->second.begin())->getParent();
+
+ // Process all instructions in basic block.
+ for (auto I = MBB->begin(), E = MBB->end(); I != E;) {
+ MachineInstr &MI = *I++;
+
+ // Instruction must be load or store.
+ if (!MI.mayLoadOrStore())
+ continue;
+
+ // Get the number of the first memory operand.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags);
+
+ // If instruction has no memory operand - skip it.
+ if (MemOpNo < 0)
+ continue;
+
+ MemOpNo += X86II::getOperandBias(Desc);
+
+ // Get the best LEA instruction to replace address calculation. DefMI, AddrDispShift and Dist are filled in by chooseBestLEA.
+ MachineInstr *DefMI;
+ int64_t AddrDispShift;
+ int Dist;
+ if (!chooseBestLEA(LEAs[getMemOpKey(MI, MemOpNo)], MI, DefMI, AddrDispShift,
+ Dist))
+ continue;
+
+ // If the LEA occurs before the current instruction, the instruction can be
+ // rewritten right away. If the LEA occurs after it (Dist < 0), the LEA is
+ // first lifted above the instruction so that the rewrite becomes possible.
+ // Since the LEA and the instruction have similar memory operands (thus, the
+ // same def instructions for these operands), this can always be done
+ // without worrying about using registers before their defs.
+ if (Dist < 0) {
+ DefMI->removeFromParent();
+ MBB->insert(MachineBasicBlock::iterator(&MI), DefMI);
+ InstrPos[DefMI] = InstrPos[&MI] - 1; // The position number right before MI.
+
+ // Make sure the instructions' position numbers are sane.
+ assert(((InstrPos[DefMI] == 1 &&
+ MachineBasicBlock::iterator(DefMI) == MBB->begin()) ||
+ InstrPos[DefMI] >
+ InstrPos[&*std::prev(MachineBasicBlock::iterator(DefMI))]) &&
+ "Instruction positioning is broken");
+ }
+
+ // Since we can possibly extend register lifetime, clear kill flags.
+ MRI->clearKillFlags(DefMI->getOperand(0).getReg());
+
+ ++NumSubstLEAs;
+ DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););
+
+ // Change instruction operands so that the address becomes [LEA def register + AddrDispShift].
+ MI.getOperand(MemOpNo + X86::AddrBaseReg)
+ .ChangeToRegister(DefMI->getOperand(0).getReg(), false); // Base: the LEA's def register.
+ MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1); // Scale: 1.
+ MI.getOperand(MemOpNo + X86::AddrIndexReg)
+ .ChangeToRegister(X86::NoRegister, false); // No index register.
+ MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift);
+ MI.getOperand(MemOpNo + X86::AddrSegmentReg)
+ .ChangeToRegister(X86::NoRegister, false); // No segment register.
+
+ DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+// Try to find similar LEAs in each list of LEAs and replace later ones with earlier ones. Returns true if any LEA was removed.
+bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
+ bool Changed = false;
+
+ // Loop over all entries in the table.
+ for (auto &E : LEAs) {
+ auto &List = E.second;
+
+ // Loop over all LEA pairs (First, Last), where Last comes after First in the list.
+ auto I1 = List.begin();
+ while (I1 != List.end()) {
+ MachineInstr &First = **I1;
+ auto I2 = std::next(I1);
+ while (I2 != List.end()) {
+ MachineInstr &Last = **I2;
+ int64_t AddrDispShift; // Passed to isReplaceable below and then added to the displacement of each use.
+
+ // LEAs should be in occurrence order in the list, so we can freely
+ // replace later LEAs with earlier ones.
+ assert(calcInstrDist(First, Last) > 0 &&
+ "LEAs must be in occurrence order in the list");
+
+ // Check that the Last LEA instruction can be replaced by the First.
+ if (!isReplaceable(First, Last, AddrDispShift)) {
+ ++I2;
+ continue;
+ }
+
+ // Loop over all uses of the Last LEA and update their operands. Note
+ // that the correctness of this has already been checked in the
+ // isReplaceable function.
+ for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()),
+ UE = MRI->use_end();
+ UI != UE;) {
+ MachineOperand &MO = *UI++; // Step the iterator before MO is modified below.
+ MachineInstr &MI = *MO.getParent();
+
+ // Get the number of the first memory operand (isReplaceable has already rejected uses without one).
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo =
+ X86II::getMemoryOperandNo(Desc.TSFlags) +
+ X86II::getOperandBias(Desc);
+
+ // Update address base.
+ MO.setReg(First.getOperand(0).getReg());
+
+ // Update address disp.
+ MachineOperand &Op = MI.getOperand(MemOpNo + X86::AddrDisp);
+ if (Op.isImm())
+ Op.setImm(Op.getImm() + AddrDispShift);
+ else if (!Op.isJTI()) // A jump-table index displacement is left unchanged.
+ Op.setOffset(Op.getOffset() + AddrDispShift);
+ }
+
+ // Since we can possibly extend register lifetime, clear kill flags.
+ MRI->clearKillFlags(First.getOperand(0).getReg());
+
+ ++NumRedundantLEAs;
+ DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump(););
+
+ // By this moment, all of the Last LEA's uses must be replaced. So we
+ // can freely remove it.
+ assert(MRI->use_empty(Last.getOperand(0).getReg()) &&
+ "The LEA's def register must have no uses");
+ Last.eraseFromParent();
+
+ // Erase removed LEA from the list; the result of erase() is the next candidate.
+ I2 = List.erase(I2);
+
+ Changed = true;
+ }
+ ++I1;
+ }
+ }
+
+ return Changed;
+}
+
+bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
+  // Nothing to do when the pass is disabled or the function must be skipped.
+  if (DisableX86LEAOpt || skipFunction(*MF.getFunction()))
+    return false;
+
+  // Cache the per-function objects used by the rest of the pass.
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  MRI = &MF.getRegInfo();
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
+
+  bool AnyChange = false;
+  for (MachineBasicBlock &Block : MF) {
+    // The LEA table and the position numbers are rebuilt for every block.
+    InstrPos.clear();
+    MemOpMap BlockLEAs;
+    findLEAs(Block, BlockLEAs);
+    // A basic block without LEAs offers nothing to optimize.
+    if (BlockLEAs.empty())
+      continue;
+
+    // First remove the redundant LEA instructions.
+    if (removeRedundantLEAs(BlockLEAs))
+      AnyChange = true;
+
+    // Removing redundant address calculations is expected to pay off only in
+    // code size, so that part of the pass runs only for -Os/-Oz functions.
+    if (MF.getFunction()->optForSize() && removeRedundantAddrCalc(BlockLEAs))
+      AnyChange = true;
+  }
+
+  return AnyChange;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
new file mode 100644
index 000000000000..3069d1fd3497
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -0,0 +1,219 @@
+//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will pad short functions to prevent
+// a stall if a function returns before the return address is ready. This
+// is needed for some Intel Atom processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-pad-short-functions"
+
+STATISTIC(NumBBsPadded, "Number of basic blocks padded"); // Incremented once for every block that receives padding.
+
+namespace {
+ struct VisitedBBInfo {
+ // HasReturn - Whether the BB contains a return instruction (a return that is also a call does not count)
+ bool HasReturn;
+
+ // Cycles - Number of cycles from the start of the BB until the return if
+ // HasReturn is true, otherwise number of cycles until end of the BB
+ unsigned int Cycles;
+
+ VisitedBBInfo() : HasReturn(false), Cycles(0) {}
+ VisitedBBInfo(bool HasReturn, unsigned int Cycles)
+ : HasReturn(HasReturn), Cycles(Cycles) {}
+ };
+
+ struct PadShortFunc : public MachineFunctionPass {
+ static char ID;
+ PadShortFunc() : MachineFunctionPass(ID)
+ , Threshold(4), STI(nullptr), TII(nullptr) {} // Threshold is fixed at 4 cycles.
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override { // The pass requires the NoVRegs property.
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Atom pad short functions";
+ }
+
+ private:
+ void findReturns(MachineBasicBlock *MBB,
+ unsigned int Cycles = 0); // Cycles: cycles already spent on the way to MBB.
+
+ bool cyclesUntilReturn(MachineBasicBlock *MBB,
+ unsigned int &Cycles); // Cycles is incremented in place.
+
+ void addPadding(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned int NOOPsToAdd);
+
+ const unsigned int Threshold; // Returns reached in fewer cycles than this get padded.
+
+ // ReturnBBs - Maps basic blocks that return to the number of cycles until the return, starting from the entry block.
+ // NOTE(review): originally documented as the minimum, but findReturns() merges paths with std::max - confirm the intent.
+ DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;
+
+ // VisitedBBs - Cache of previously visited BBs.
+ DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
+
+ const X86Subtarget *STI; // Set in runOnMachineFunction.
+ const TargetInstrInfo *TII; // Set in runOnMachineFunction.
+ };
+
+ char PadShortFunc::ID = 0;
+}
+
+FunctionPass *llvm::createX86PadShortFunctions() {
+ return new PadShortFunc(); // Factory: returns a newly allocated PadShortFunc pass.
+}
+
+/// runOnMachineFunction - Find the basic blocks whose return is reached in
+/// fewer than Threshold cycles and insert NOOPs in front of those returns.
+bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+  // Skipped functions and functions optimized for size are left untouched.
+  if (skipFunction(*MF.getFunction()) || MF.getFunction()->optForSize())
+    return false;
+
+  // Only subtargets that ask for short functions to be padded are handled.
+  STI = &MF.getSubtarget<X86Subtarget>();
+  if (!STI->padShortFunctions())
+    return false;
+
+  TII = STI->getInstrInfo();
+
+  // Reset the per-function state, then search through the basic blocks,
+  // starting at the entry block, and mark the ones that have early
+  // returns.
+  ReturnBBs.clear();
+  VisitedBBs.clear();
+  findReturns(&MF.front());
+
+  bool Padded = false;
+
+  // Pad the identified basic blocks with NOOPs.
+  for (const auto &Entry : ReturnBBs) {
+    MachineBasicBlock *const Block = Entry.first;
+    const unsigned int CyclesToReturn = Entry.second;
+
+    // A return that is at least Threshold cycles away needs no padding.
+    if (CyclesToReturn >= Threshold)
+      continue;
+
+    assert(!Block->empty() &&
+           "Basic block should contain at least a RET but is empty");
+
+    // The block ends in a return. Start at the last instruction and step
+    // back over any DBG_VALUE instructions trailing the terminator.
+    MachineBasicBlock::iterator RetPos = Block->end();
+    --RetPos;
+    while (RetPos->isDebugValue())
+      --RetPos;
+    assert(RetPos->isReturn() && !RetPos->isCall() &&
+           "Basic block does not end with RET");
+
+    // Make up for the missing cycles right in front of the return.
+    addPadding(Block, RetPos, Threshold - CyclesToReturn);
+    NumBBsPadded++;
+    Padded = true;
+  }
+
+  return Padded;
+}
+
+/// findReturns - Starting at MBB, follow control flow and record in ReturnBBs
+/// every block with a return that is reached in fewer than Threshold cycles.
+void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
+  // Adds the cycles spent in this block to Cycles and tells whether the block
+  // contains a return.
+  const bool BlockReturns = cyclesUntilReturn(MBB, Cycles);
+  if (Cycles >= Threshold)
+    return; // Already at or past the threshold on this path.
+  if (!BlockReturns) {
+    // Keep searching in the successors, skipping a branch back to MBB itself.
+    for (MachineBasicBlock::succ_iterator Succ = MBB->succ_begin(),
+                                          End = MBB->succ_end();
+         Succ != End; ++Succ)
+      if (*Succ != MBB)
+        findReturns(*Succ, Cycles);
+    return;
+  }
+  // NOTE(review): paths are merged with std::max, although the ReturnBBs
+  // member comment speaks of the minimum number of cycles - confirm intent.
+  unsigned int &Recorded = ReturnBBs[MBB]; Recorded = std::max(Recorded, Cycles);
+}
+
+/// cyclesUntilReturn - Return true if MBB has a return instruction, and
+/// false otherwise.
+/// Cycles is incremented by the number of cycles taken to reach the return
+/// or the end of the BB, whichever occurs first.
+bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
+                                     unsigned int &Cycles) {
+  // A block that was visited before is answered from the VisitedBBs cache
+  // without looking at its instructions again.
+  const auto Cached = VisitedBBs.find(MBB);
+  if (Cached != VisitedBBs.end()) {
+    Cycles += Cached->second.Cycles;
+    return Cached->second.HasReturn;
+  }
+
+  unsigned int BlockCycles = 0;
+  bool FoundReturn = false;
+
+  for (MachineInstr &Instr : *MBB) {
+    // Stop at the first return instruction. Calls to other functions do not
+    // count because the called function will be padded, if necessary.
+    if (Instr.isReturn() && !Instr.isCall()) {
+      FoundReturn = true;
+      break;
+    }
+
+    // Every instruction in front of the return contributes its latency.
+    BlockCycles += TII->getInstrLatency(STI->getInstrItineraryData(), Instr);
+  }
+
+  // Cache the outcome for later visits, then report it to the caller.
+  VisitedBBs[MBB] = VisitedBBInfo(FoundReturn, BlockCycles);
+  Cycles += BlockCycles;
+  return FoundReturn;
+}
+
+/// addPadding - Insert NOOPsToAdd pairs of NOOP instructions into MBB just
+/// prior to the return at MBBI.
+void PadShortFunc::addPadding(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned int NOOPsToAdd) {
+  const DebugLoc Loc = MBBI->getDebugLoc(); // NOOPs reuse the return's location.
+  // Two NOOPs are emitted for every requested unit of padding.
+  for (unsigned int Left = NOOPsToAdd; Left != 0; --Left) {
+    BuildMI(*MBB, MBBI, Loc, TII->get(X86::NOOP));
+    BuildMI(*MBB, MBBI, Loc, TII->get(X86::NOOP));
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 000000000000..65f438f94b04
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,758 @@
+//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterInfo.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "X86GenRegisterInfo.inc"
+
+static cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+ cl::desc("Enable use of a base pointer for complex stack frames")); // On by default; when false, hasBasePointer() returns false.
+
+X86RegisterInfo::X86RegisterInfo(const Triple &TT)
+    : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP),
+                         X86_MC::getDwarfRegFlavour(TT, false),
+                         X86_MC::getDwarfRegFlavour(TT, true),
+                         (TT.isArch64Bit() ? X86::RIP : X86::EIP)) {
+  X86_MC::initLLVMToSEHAndCVRegMapping(this);
+  // Cache some information about the target triple.
+  Is64Bit = TT.isArch64Bit();
+  IsWin64 = Is64Bit && TT.isOSWindows();
+
+  // The base pointer is a callee-saved register, which must not conflict
+  // with any ABI requirements. For example, in 32-bit mode PIC requires the
+  // GOT pointer in EBX before function calls via PLT.
+  if (!Is64Bit) {
+    SlotSize = 4;
+    StackPtr = X86::ESP;
+    FramePtr = X86::EBP;
+    BasePtr = X86::ESI;
+    return;
+  }
+
+  // The GNUX32 environment gets the 32-bit registers; this matches the
+  // simplified 32-bit pointer code in the data layout computation.
+  // FIXME: Should use the data layout?
+  const bool IsX32 = TT.getEnvironment() == Triple::GNUX32;
+  SlotSize = 8;
+  StackPtr = IsX32 ? X86::ESP : X86::RSP;
+  FramePtr = IsX32 ? X86::EBP : X86::RBP;
+  BasePtr = IsX32 ? X86::EBX : X86::RBX;
+}
+
+bool
+X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ // ExeDepsFixer and PostRAScheduler require liveness, so this returns true unconditionally (MF is not consulted).
+ return true;
+}
+
+int
+X86RegisterInfo::getSEHRegNum(unsigned i) const {
+ return getEncodingValue(i); // The SEH register number is the encoding value of register i.
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+                                       unsigned Idx) const {
+  // In 32-bit mode the sub_8bit sub-register index is more constrained and
+  // behaves just like sub_8bit_hi, so the query is made with that index
+  // instead. Every other query is passed through unchanged.
+  const bool UseHiIdx = !Is64Bit && Idx == X86::sub_8bit;
+  const unsigned QueryIdx = UseHiIdx ? static_cast<unsigned>(X86::sub_8bit_hi) : Idx;
+  // TableGen's default version does the actual lookup.
+  return X86GenRegisterInfo::getSubClassWithSubReg(RC, QueryIdx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                          const TargetRegisterClass *B,
+                                          unsigned SubIdx) const {
+  // Only sub_8bit in 32-bit mode is more constrained; otherwise defer directly.
+  if (Is64Bit || SubIdx != X86::sub_8bit)
+    return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
+  A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
+  if (!A)
+    return nullptr; // A has no sub-class with a sub_8bit_hi sub-register.
+  return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const {
+ // Don't allow super-classes of GR8_NOREX. This class is only used after
+ // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied
+ // to the full GR8 register class in 64-bit mode, so we cannot allow the
+ // register class inflation.
+ //
+ // The GR8_NOREX class is always used in a way that won't be constrained to a
+ // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
+ // full GR8 class.
+ if (RC == &X86::GR8_NOREXRegClass)
+ return RC;
+
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+
+ const TargetRegisterClass *Super = RC; // RC itself is examined first.
+ TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+ do {
+ switch (Super->getID()) {
+ case X86::FR32RegClassID:
+ case X86::FR64RegClassID:
+ // If AVX-512 isn't supported we should only inflate to these classes.
+ if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
+ return Super;
+ break;
+ case X86::VR128RegClassID:
+ case X86::VR256RegClassID:
+ // If VLX isn't supported we should only inflate to these classes.
+ if (!Subtarget.hasVLX() && Super->getSize() == RC->getSize())
+ return Super;
+ break;
+ case X86::VR128XRegClassID:
+ case X86::VR256XRegClassID:
+ // If VLX isn't supported we shouldn't inflate to these classes.
+ if (Subtarget.hasVLX() && Super->getSize() == RC->getSize())
+ return Super;
+ break;
+ case X86::FR32XRegClassID:
+ case X86::FR64XRegClassID:
+ // If AVX-512 isn't supported we shouldn't inflate to these classes.
+ if (Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
+ return Super;
+ break;
+ case X86::GR8RegClassID:
+ case X86::GR16RegClassID:
+ case X86::GR32RegClassID:
+ case X86::GR64RegClassID:
+ case X86::RFP32RegClassID:
+ case X86::RFP64RegClassID:
+ case X86::RFP80RegClassID:
+ case X86::VR512RegClassID:
+ // Don't return a super-class that would shrink the spill size.
+ // That can happen with the vector and float classes.
+ if (Super->getSize() == RC->getSize())
+ return Super;
+ }
+ Super = *I++; // Move on to the next super-class; the loop stops at a null entry.
+ } while (Super);
+ return RC; // No acceptable super-class was found; keep RC.
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                    unsigned Kind) const {
+  // Kind selects the flavour of pointer register class. The 64-bit classes
+  // are returned only when the subtarget reports isTarget64BitLP64().
+  const bool IsLP64 = MF.getSubtarget<X86Subtarget>().isTarget64BitLP64();
+
+  switch (Kind) {
+  case 0: { // Normal GPRs.
+    if (IsLP64)
+      return &X86::GR64RegClass;
+    if (!Is64Bit)
+      return &X86::GR32RegClass;
+
+    // The target is 64-bit but we have been told to use 32-bit addresses. A
+    // 64-bit register can still be used as long as we know the high bits
+    // are zeros, which the returned register class reflects. When the
+    // target also allows a 64-bit frame pointer and we do have a frame, it
+    // is fine to use it for the address accesses as well.
+    const X86FrameLowering *TFI = getFrameLowering(MF);
+    if (TFI->hasFP(MF) && TFI->Uses64BitFramePtr)
+      return &X86::LOW32_ADDR_ACCESS_RBPRegClass;
+    return &X86::LOW32_ADDR_ACCESSRegClass;
+  }
+  case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+    // NOSP does not contain RIP, so no special case here.
+    return IsLP64 ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+  case 2: // NOREX GPRs.
+    return IsLP64 ? &X86::GR64_NOREXRegClass : &X86::GR32_NOREXRegClass;
+  case 3: // NOREX GPRs except the stack pointer (for encoding reasons).
+    // NOSP does not contain RIP, so no special case here.
+    return IsLP64 ? &X86::GR64_NOREX_NOSPRegClass
+                  : &X86::GR32_NOREX_NOSPRegClass;
+  case 4: // Available for tailcall (not callee-saved GPRs).
+    return getGPRsForTailCall(MF);
+  default:
+    // Any other Kind is a programming error.
+    llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+  }
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
+  const Function *Fn = MF.getFunction();
+  // GR64_TCW64 is used on Win64 targets and for functions with the
+  // X86_64_Win64 calling convention; other 64-bit code uses GR64_TC.
+  if (IsWin64 || (Fn && Fn->getCallingConv() == CallingConv::X86_64_Win64))
+    return &X86::GR64_TCW64RegClass;
+  if (Is64Bit)
+    return &X86::GR64_TCRegClass;
+  // 32-bit: HiPE functions get all of GR32, everything else GR32_TC.
+  const bool IsHiPE = Fn && Fn->getCallingConv() == CallingConv::HiPE;
+  return IsHiPE ? &X86::GR32RegClass : &X86::GR32_TCRegClass;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  // Every class other than CCR is returned as is.
+  if (RC != &X86::CCRRegClass)
+    return RC;
+
+  // For CCR the answer is the general purpose class matching the target's
+  // bitness: GR64 in 64-bit mode, GR32 otherwise.
+  return Is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass;
+}
+
+unsigned
+X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                     MachineFunction &MF) const {
+  // The GR32 and GR64 limits shrink by one when MF has a frame pointer.
+  const unsigned FPDiff = getFrameLowering(MF)->hasFP(MF) ? 1 : 0;
+
+  const unsigned ClassID = RC->getID();
+  if (ClassID == X86::GR32RegClassID)
+    return 4 - FPDiff;
+  if (ClassID == X86::GR64RegClassID)
+    return 12 - FPDiff;
+  if (ClassID == X86::VR128RegClassID)
+    return Is64Bit ? 10 : 4;
+  if (ClassID == X86::VR64RegClassID)
+    return 4;
+
+  // Every other register class reports 0.
+  return 0;
+}
+
+const MCPhysReg *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { // Picks the callee-saved register list for MF from its calling convention, the target mode and subtarget features.
+ assert(MF && "MachineFunction required");
+
+ const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>();
+ bool HasSSE = Subtarget.hasSSE1();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+ bool CallsEHReturn = MF->callsEHReturn();
+
+ switch (MF->getFunction()->getCallingConv()) { // Conventions with their own save lists; a 'break' leads to the defaults after the switch.
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
+ return CSR_NoRegs_SaveList;
+ case CallingConv::AnyReg:
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_SaveList;
+ return CSR_64_AllRegs_SaveList;
+ case CallingConv::PreserveMost:
+ return CSR_64_RT_MostRegs_SaveList;
+ case CallingConv::PreserveAll:
+ if (HasAVX)
+ return CSR_64_RT_AllRegs_AVX_SaveList;
+ return CSR_64_RT_AllRegs_SaveList;
+ case CallingConv::CXX_FAST_TLS:
+ if (Is64Bit)
+ return MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR() ?
+ CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList;
+ break; // Not 64-bit: use the defaults below.
+ case CallingConv::Intel_OCL_BI: {
+ if (HasAVX512 && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
+ if (HasAVX512 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX512_SaveList;
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_SaveList;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_SaveList;
+ break; // No combination matched: use the defaults below.
+ }
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_SaveList;
+ case CallingConv::X86_RegCall:
+ if (Is64Bit) {
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_SaveList :
+ CSR_Win64_RegCall_NoSSE_SaveList);
+ } else {
+ return (HasSSE ? CSR_SysV64_RegCall_SaveList :
+ CSR_SysV64_RegCall_NoSSE_SaveList);
+ }
+ } else {
+ return (HasSSE ? CSR_32_RegCall_SaveList :
+ CSR_32_RegCall_NoSSE_SaveList);
+ }
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_64_MostRegs_SaveList;
+ break; // Not 64-bit: use the defaults below.
+ case CallingConv::X86_64_Win64:
+ if (!HasSSE)
+ return CSR_Win64_NoSSE_SaveList;
+ return CSR_Win64_SaveList;
+ case CallingConv::X86_64_SysV:
+ if (CallsEHReturn)
+ return CSR_64EHRet_SaveList;
+ return CSR_64_SaveList;
+ case CallingConv::X86_INTR:
+ if (Is64Bit) {
+ if (HasAVX512)
+ return CSR_64_AllRegs_AVX512_SaveList;
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_SaveList;
+ return CSR_64_AllRegs_SaveList;
+ } else {
+ if (HasAVX512)
+ return CSR_32_AllRegs_AVX512_SaveList;
+ if (HasAVX)
+ return CSR_32_AllRegs_AVX_SaveList;
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_SaveList;
+ return CSR_32_AllRegs_SaveList;
+ }
+ default:
+ break;
+ }
+
+ if (Is64Bit) { // Defaults for every convention that did not return above.
+ if (IsWin64) {
+ if (!HasSSE)
+ return CSR_Win64_NoSSE_SaveList;
+ return CSR_Win64_SaveList;
+ }
+ if (CallsEHReturn)
+ return CSR_64EHRet_SaveList;
+ if (Subtarget.getTargetLowering()->supportSwiftError() &&
+ MF->getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_64_SwiftError_SaveList;
+ return CSR_64_SaveList;
+ }
+ if (CallsEHReturn)
+ return CSR_32EHRet_SaveList;
+ return CSR_32_SaveList;
+}
+
+const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy(
+    const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS ||
+      !MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR())
+    return nullptr; // Only CXX_FAST_TLS functions with split CSR have a list.
+  return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList;
+}
+
+const uint32_t *
+X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const { // Picks the call-preserved register mask for calling convention CC.
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ bool HasSSE = Subtarget.hasSSE1();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+
+ switch (CC) { // Conventions with their own masks; a 'break' leads to the defaults after the switch.
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
+ return CSR_NoRegs_RegMask;
+ case CallingConv::AnyReg:
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ return CSR_64_AllRegs_RegMask;
+ case CallingConv::PreserveMost:
+ return CSR_64_RT_MostRegs_RegMask;
+ case CallingConv::PreserveAll:
+ if (HasAVX)
+ return CSR_64_RT_AllRegs_AVX_RegMask;
+ return CSR_64_RT_AllRegs_RegMask;
+ case CallingConv::CXX_FAST_TLS:
+ if (Is64Bit)
+ return CSR_64_TLS_Darwin_RegMask;
+ break; // Not 64-bit: use the defaults below.
+ case CallingConv::Intel_OCL_BI: {
+ if (HasAVX512 && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
+ if (HasAVX512 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX512_RegMask;
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_RegMask;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_RegMask;
+ break; // No combination matched: use the defaults below.
+ }
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_RegMask;
+ case CallingConv::X86_RegCall:
+ if (Is64Bit) {
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_RegMask :
+ CSR_Win64_RegCall_NoSSE_RegMask);
+ } else {
+ return (HasSSE ? CSR_SysV64_RegCall_RegMask :
+ CSR_SysV64_RegCall_NoSSE_RegMask);
+ }
+ } else {
+ return (HasSSE ? CSR_32_RegCall_RegMask :
+ CSR_32_RegCall_NoSSE_RegMask);
+ }
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_64_MostRegs_RegMask;
+ break; // Not 64-bit: use the defaults below.
+ case CallingConv::X86_64_Win64:
+ return CSR_Win64_RegMask;
+ case CallingConv::X86_64_SysV:
+ return CSR_64_RegMask;
+ case CallingConv::X86_INTR:
+ if (Is64Bit) {
+ if (HasAVX512)
+ return CSR_64_AllRegs_AVX512_RegMask;
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ return CSR_64_AllRegs_RegMask;
+ } else {
+ if (HasAVX512)
+ return CSR_32_AllRegs_AVX512_RegMask;
+ if (HasAVX)
+ return CSR_32_AllRegs_AVX_RegMask;
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_RegMask;
+ return CSR_32_AllRegs_RegMask;
+ }
+ default:
+ break;
+ }
+
+ // Defaults for the remaining conventions. Unlike getCalleeSavedRegs(), we
+ // don't have MMI so we can't check callsEHReturn().
+ if (Is64Bit) {
+ if (IsWin64)
+ return CSR_Win64_RegMask;
+ if (Subtarget.getTargetLowering()->supportSwiftError() &&
+ MF.getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_64_SwiftError_RegMask;
+ return CSR_64_RegMask;
+ }
+ return CSR_32_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask; // The same mask getCallPreservedMask() returns for GHC and HiPE.
+}
+
+const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const {
+ return CSR_64_TLS_Darwin_RegMask; // The same mask getCallPreservedMask() returns for 64-bit CXX_FAST_TLS.
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs()); // One bit per register; set = reserved.
+  const X86FrameLowering *TFI = getFrameLowering(MF);
+
+  // Set the stack-pointer register and its aliases as reserved.
+  for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
+       ++I)
+    Reserved.set(*I);
+
+  // Set the instruction pointer register and its aliases as reserved.
+  for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
+       ++I)
+    Reserved.set(*I);
+
+  // Set the frame-pointer register and its aliases as reserved if needed.
+  if (TFI->hasFP(MF)) {
+    for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
+         ++I)
+      Reserved.set(*I);
+  }
+
+  // Set the base-pointer register and its aliases as reserved if needed.
+  if (hasBasePointer(MF)) {
+    CallingConv::ID CC = MF.getFunction()->getCallingConv();
+    const uint32_t *RegMask = getCallPreservedMask(MF, CC);
+    if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+      report_fatal_error( // The call-preserved mask clobbers the base register.
+          "Stack realignment in presence of dynamic allocas is not supported with"
+          " this calling convention."); // Leading space joins the two literals.
+
+    unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
+    for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
+         I.isValid(); ++I)
+      Reserved.set(*I);
+  }
+
+  // Mark the segment registers as reserved.
+  Reserved.set(X86::CS);
+  Reserved.set(X86::SS);
+  Reserved.set(X86::DS);
+  Reserved.set(X86::ES);
+  Reserved.set(X86::FS);
+  Reserved.set(X86::GS);
+
+  // Mark the floating point stack registers as reserved.
+  for (unsigned n = 0; n != 8; ++n)
+    Reserved.set(X86::ST0 + n);
+
+  // Reserve the registers that only exist in 64-bit mode.
+  if (!Is64Bit) {
+    // These 8-bit registers are part of the x86-64 extension even though their
+    // super-registers are old 32-bits.
+    Reserved.set(X86::SIL);
+    Reserved.set(X86::DIL);
+    Reserved.set(X86::BPL);
+    Reserved.set(X86::SPL);
+
+    for (unsigned n = 0; n != 8; ++n) {
+      // R8, R9, ...
+      for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+
+      // XMM8, XMM9, ...
+      for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+  if (!Is64Bit || !MF.getSubtarget<X86Subtarget>().hasAVX512()) {
+    for (unsigned n = 16; n != 32; ++n) { // XMM16 ... XMM31 and their aliases.
+      for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+
+  assert(checkAllSuperRegsMarked(Reserved,
+                                 {X86::SIL, X86::DIL, X86::BPL, X86::SPL}));
+  return Reserved;
+}
+
+void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
+  // EFLAGS should never be marked live-out, because the calling convention
+  // defines it as NOT preserved. Unfortunately it does show up as live-out
+  // after branch folding, so the assert tracks the problem while the loop
+  // below clears the bit to avoid unnecessary crashes during release builds.
+  assert(!(Mask[X86::EFLAGS / 32] & (1U << (X86::EFLAGS % 32))) &&
+         "EFLAGS are not live-out from a patchpoint.");
+
+  // Clear EFLAGS and the other registers that don't need preserving (IP).
+  for (unsigned Reg : {X86::EFLAGS, X86::RIP, X86::EIP, X86::IP}) {
+    // Each register owns one bit: word Reg / 32, bit Reg % 32.
+    uint32_t &Word = Mask[Reg / 32];
+    Word &= ~(1U << (Reg % 32));
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+static bool CantUseSP(const MachineFrameInfo &MFI) {
+ return MFI.hasVarSizedObjects() || MFI.hasOpaqueSPAdjustment(); // True if the frame has variable-sized objects or an opaque SP adjustment.
+}
+
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  // The base pointer can be switched off with -x86-use-base-pointer=false.
+  if (!EnableBasePointer)
+    return false;
+
+  // A separate base pointer register is needed only when neither the FP nor
+  // the SP can be used. The FP cannot address the stack when the stack needs
+  // realignment. The SP cannot address variables when there are dynamic
+  // allocas or stack-adjusting inline asm (MS inline asm can reference
+  // locals while also adjusting the stack pointer).
+  if (!needsStackRealignment(MF))
+    return false;
+  return CantUseSP(MF.getFrameInfo());
+}
+
+bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  if (!TargetRegisterInfo::canRealignStack(MF))
+    return false;
+
+  const MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Realigning the stack requires a frame pointer. If register allocation
+  // already started with frame pointer elimination, it is too late now.
+  if (!RegInfo.canReserveReg(FramePtr))
+    return false;
+
+  // When the stack pointer is unusable a base pointer is necessary as well,
+  // so check that it isn't too late to reserve it either.
+  const bool NeedsBasePtr = CantUseSP(MF.getFrameInfo());
+  return !NeedsBasePtr || RegInfo.canReserveReg(BasePtr);
+}
+
+bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+ unsigned Reg, int &FrameIdx) const {
+ // X86 defines assignCalleeSavedSpillSlots, which always returns true, so
+ // this function is neither used nor tested.
+ llvm_unreachable("Unused function on X86. Otherwise need a test case.");
+}
+
+/// Helper that tries to replace a LEA instruction of the form
+/// 'lea (%esp), %ebx' with 'mov %esp, %ebx'. Returns true (after erasing the
+/// LEA) when the replacement was made, and false when the instruction does
+/// not have that form.
+/// TODO: In this case we should really be trying first to entirely eliminate
+/// this instruction, which is a plain copy.
+static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
+  MachineInstr &LEA = *II;
+  const unsigned Opcode = LEA.getOpcode();
+
+  // Only the LEA opcodes are candidates.
+  const bool IsLEA = Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+                     Opcode == X86::LEA64_32r;
+  if (!IsLEA)
+    return false;
+
+  // The address must be a bare base register: operand 2 equal to 1, no
+  // register in operands 3 and 5, and a zero immediate in operand 4.
+  if (LEA.getOperand(2).getImm() != 1)
+    return false;
+  if (LEA.getOperand(3).getReg() != X86::NoRegister)
+    return false;
+  if (LEA.getOperand(4).getImm() != 0)
+    return false;
+  if (LEA.getOperand(5).getReg() != X86::NoRegister)
+    return false;
+
+  unsigned SrcReg = LEA.getOperand(1).getReg();
+  // In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will
+  // be replaced with a 32-bit operand MOV which will zero extend the upper
+  // 32-bits of the super register.
+  if (Opcode == X86::LEA64_32r)
+    SrcReg = getX86SubSuperRegister(SrcReg, 32);
+
+  const unsigned DstReg = LEA.getOperand(0).getReg();
+  MachineBasicBlock &MBB = *LEA.getParent();
+  const X86InstrInfo *TII =
+      MBB.getParent()->getSubtarget<X86Subtarget>().getInstrInfo();
+  TII->copyPhysReg(MBB, II, LEA.getDebugLoc(), DstReg, SrcReg,
+                   LEA.getOperand(1).isKill());
+  LEA.eraseFromParent();
+  return true;
+}
+
+void
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const { // RS is not used by this implementation.
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ unsigned BasePtr; // Local base register for this access; shadows the BasePtr member.
+
+ unsigned Opc = MI.getOpcode();
+ bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm ||
+ Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64;
+
+ if (hasBasePointer(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
+ else if (needsStackRealignment(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+ else if (AfterFPPop)
+ BasePtr = StackPtr;
+ else
+ BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
+
+ // LOCAL_ESCAPE uses a single offset, with no register. It only works in the
+ // simple FP case, and doesn't work with stack realignment. On 32-bit, the
+ // offset is from the traditional base pointer location. On 64-bit, the
+ // offset is from the SP at the end of the prologue, not the FP location. This
+ // matches the behavior of llvm.frameaddress.
+ unsigned IgnoredFrameReg;
+ if (Opc == TargetOpcode::LOCAL_ESCAPE) {
+ MachineOperand &FI = MI.getOperand(FIOperandNum);
+ int Offset;
+ Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
+ FI.ChangeToImmediate(Offset);
+ return;
+ }
+
+ // For LEA64_32r when BasePtr is 32-bits (X32) we can use the full-size 64-bit
+ // register as the source operand; the semantics are the same and the
+ // destination is 32-bits. It saves one byte per lea since the 0x67 prefix is
+ // avoided. Don't change BasePtr since it is used later for stack adjustment.
+ unsigned MachineBasePtr = BasePtr;
+ if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
+ MachineBasePtr = getX86SubSuperRegister(BasePtr, 64);
+
+ // Replace the FrameIndex operand with the base register. The frame object's
+ // offset is folded into the instruction's offset operand further below.
+ MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false);
+
+ // Now compute the frame object offset that gets added to the existing offset.
+ int FIOffset;
+ if (AfterFPPop) {
+ // Tail call jmp happens after FP is popped.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ FIOffset = MFI.getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
+ } else
+ FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
+
+ if (BasePtr == StackPtr) // SP-relative access: include the pending SP adjustment.
+ FIOffset += SPAdj;
+
+ // The frame index format for stackmaps and patchpoints is different from the
+ // X86 format. It only has a FI and an offset.
+ if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
+ assert(BasePtr == FramePtr && "Expected the FP as base register");
+ int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset;
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ if (MI.getOperand(FIOperandNum+3).isImm()) {
+ // Offset is a 32-bit integer.
+ int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
+ int Offset = FIOffset + Imm;
+ assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
+ "Requesting 64-bit offset in 32-bit immediate!");
+ if (Offset != 0 || !tryOptimizeLEAtoMOV(II))
+ MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
+ } else {
+ // Offset is symbolic. This is extremely rare.
+ uint64_t Offset = FIOffset +
+ (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
+ MI.getOperand(FIOperandNum + 3).setOffset(Offset);
+ }
+}
+
+unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  // Use the frame pointer when the function has one, otherwise the stack
+  // pointer.
+  if (getFrameLowering(MF)->hasFP(MF))
+    return FramePtr;
+  return StackPtr;
+}
+
+unsigned
+X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
+  const unsigned FullReg = getFrameRegister(MF);
+  // On 64-bit ILP32 targets, return the 32-bit form of the frame register.
+  if (!MF.getSubtarget<X86Subtarget>().isTarget64BitILP32())
+    return FullReg;
+  return getX86SubSuperRegister(FullReg, 32);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 000000000000..58fa31e94fba
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,142 @@
+//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "X86GenRegisterInfo.inc"
+
+namespace llvm {
+ class Triple;
+
+class X86RegisterInfo final : public X86GenRegisterInfo {
+private:
+ /// Is64Bit - Is the target 64-bits.
+ ///
+ bool Is64Bit;
+
+ /// IsWin64 - Is the target one of the win64 flavours.
+ ///
+ bool IsWin64;
+
+ /// SlotSize - Stack slot size in bytes.
+ ///
+ unsigned SlotSize;
+
+ /// StackPtr - X86 physical register used as stack ptr.
+ ///
+ unsigned StackPtr;
+
+ /// FramePtr - X86 physical register used as frame ptr.
+ ///
+ unsigned FramePtr;
+
+ /// BasePtr - X86 physical register used as a base ptr in complex stack
+ /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+ /// variable size stack objects.
+ unsigned BasePtr;
+
+public:
+ X86RegisterInfo(const Triple &TT);
+
+ // FIXME: This should be tablegen'd like getDwarfRegNum is
+ int getSEHRegNum(unsigned i) const;
+
+ /// Code Generation virtual methods...
+ ///
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ /// getMatchingSuperRegClass - Return a subclass of the specified register
+ /// class A so that each register in it has a sub-register of the
+ /// specified sub-register index which is in the specified register class B.
+ const TargetRegisterClass *
+ getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned Idx) const override;
+
+ const TargetRegisterClass *
+ getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const override;
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const override;
+
+ /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+ /// values.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+
+ /// getCrossCopyRegClass - Returns a legal register class to copy a register
+ /// in the specified class to or from. Returns NULL if it is possible to copy
+ /// between two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+ /// getGPRsForTailCall - Returns a register class with registers that can be
+ /// used in forming tail calls.
+ const TargetRegisterClass *
+ getGPRsForTailCall(const MachineFunction &MF) const;
+
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
+
+ /// getCalleeSavedRegs - Return a null-terminated list of all of the
+ /// callee-save registers on this target.
+ const MCPhysReg *
+ getCalleeSavedRegs(const MachineFunction* MF) const override;
+ const MCPhysReg *
+ getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+ const uint32_t *getNoPreservedMask() const override;
+
+ // Calls involved in thread-local variable lookup save more registers than
+ // normal calls, so they need a different mask to represent this.
+ const uint32_t *getDarwinTLSCallPreservedMask() const;
+
+ /// getReservedRegs - Returns a bitset indexed by physical register number
+ /// indicating if a register is a special register that has particular uses and
+ /// should be considered unavailable at all times, e.g. SP, RA. This is used by
+ /// register scavenger to determine what registers are free.
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
+
+ bool hasBasePointer(const MachineFunction &MF) const;
+
+ bool canRealignStack(const MachineFunction &MF) const override;
+
+ bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+ unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
+ unsigned getStackRegister() const { return StackPtr; }
+ unsigned getBaseRegister() const { return BasePtr; }
+ // FIXME: Move to FrameInfo
+ unsigned getSlotSize() const { return SlotSize; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 000000000000..372a15aff15a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,530 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> {
+ let Namespace = "X86"; // Every X86Reg is placed in the X86 namespace.
+ let HWEncoding = Enc; // Encoding value supplied by each register def.
+ let SubRegs = subregs; // Sub-registers supplied by each def (empty by default).
+}
+
+// Subregister indices.
+let Namespace = "X86" in {
+ def sub_8bit : SubRegIndex<8>;
+ def sub_8bit_hi : SubRegIndex<8, 8>;
+ def sub_16bit : SubRegIndex<16>;
+ def sub_32bit : SubRegIndex<32>;
+ def sub_xmm : SubRegIndex<128>;
+ def sub_ymm : SubRegIndex<256>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+
+// In the register alias definitions below, we define which registers alias
+// which others. We only specify which registers the small registers alias,
+// because the register file generator is smart enough to figure out that
+// AL aliases AX if we tell it that AX aliases AL (for example).
+
+// Dwarf numbering is different for 32-bit and 64-bit, and there are
+// variations by target as well. Currently the first entry is for X86-64,
+// second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux
+// and debug information on X86-32/Darwin)
+
+// 8-bit registers
+// Low registers
+def AL : X86Reg<"al", 0>;
+def DL : X86Reg<"dl", 2>;
+def CL : X86Reg<"cl", 1>;
+def BL : X86Reg<"bl", 3>;
+
+// High registers. On x86-64, these cannot be used in any instruction
+// with a REX prefix.
+def AH : X86Reg<"ah", 4>;
+def DH : X86Reg<"dh", 6>;
+def CH : X86Reg<"ch", 5>;
+def BH : X86Reg<"bh", 7>;
+
+// X86-64 only, requires REX.
+let CostPerUse = 1 in {
+def SIL : X86Reg<"sil", 6>;
+def DIL : X86Reg<"dil", 7>;
+def BPL : X86Reg<"bpl", 5>;
+def SPL : X86Reg<"spl", 4>;
+def R8B : X86Reg<"r8b", 8>;
+def R9B : X86Reg<"r9b", 9>;
+def R10B : X86Reg<"r10b", 10>;
+def R11B : X86Reg<"r11b", 11>;
+def R12B : X86Reg<"r12b", 12>;
+def R13B : X86Reg<"r13b", 13>;
+def R14B : X86Reg<"r14b", 14>;
+def R15B : X86Reg<"r15b", 15>;
+}
+
+// 16-bit registers
+let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
+def AX : X86Reg<"ax", 0, [AL,AH]>;
+def DX : X86Reg<"dx", 2, [DL,DH]>;
+def CX : X86Reg<"cx", 1, [CL,CH]>;
+def BX : X86Reg<"bx", 3, [BL,BH]>;
+}
+let SubRegIndices = [sub_8bit] in {
+def SI : X86Reg<"si", 6, [SIL]>;
+def DI : X86Reg<"di", 7, [DIL]>;
+def BP : X86Reg<"bp", 5, [BPL]>;
+def SP : X86Reg<"sp", 4, [SPL]>;
+}
+def IP : X86Reg<"ip", 0>;
+
+// X86-64 only, requires REX.
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
+def R8W : X86Reg<"r8w", 8, [R8B]>;
+def R9W : X86Reg<"r9w", 9, [R9B]>;
+def R10W : X86Reg<"r10w", 10, [R10B]>;
+def R11W : X86Reg<"r11w", 11, [R11B]>;
+def R12W : X86Reg<"r12w", 12, [R12B]>;
+def R13W : X86Reg<"r13w", 13, [R13B]>;
+def R14W : X86Reg<"r14w", 14, [R14B]>;
+def R15W : X86Reg<"r15w", 15, [R15B]>;
+}
+
+// 32-bit registers
+let SubRegIndices = [sub_16bit] in {
+def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+
+// X86-64 only, requires REX
+let CostPerUse = 1 in {
+def R8D : X86Reg<"r8d", 8, [R8W]>;
+def R9D : X86Reg<"r9d", 9, [R9W]>;
+def R10D : X86Reg<"r10d", 10, [R10W]>;
+def R11D : X86Reg<"r11d", 11, [R11W]>;
+def R12D : X86Reg<"r12d", 12, [R12W]>;
+def R13D : X86Reg<"r13d", 13, [R13W]>;
+def R14D : X86Reg<"r14d", 14, [R14W]>;
+def R15D : X86Reg<"r15d", 15, [R15W]>;
+}}
+
+// 64-bit registers, X86-64 only
+let SubRegIndices = [sub_32bit] in {
+def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>;
+def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>;
+def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>;
+def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>;
+def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>;
+def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>;
+def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
+def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+// These also require REX.
+let CostPerUse = 1 in {
+def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>;
+def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>;
+def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
+def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>;
+def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>;
+def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
+def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
+def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
+def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>;
+}}
+
+// MMX Registers. These are actually aliased to ST0 .. ST7
+def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
+def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>;
+def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>;
+def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>;
+def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>;
+def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>;
+def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>;
+def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>;
+
+// Pseudo Floating Point registers
+def FP0 : X86Reg<"fp0", 0>;
+def FP1 : X86Reg<"fp1", 0>;
+def FP2 : X86Reg<"fp2", 0>;
+def FP3 : X86Reg<"fp3", 0>;
+def FP4 : X86Reg<"fp4", 0>;
+def FP5 : X86Reg<"fp5", 0>;
+def FP6 : X86Reg<"fp6", 0>;
+def FP7 : X86Reg<"fp7", 0>;
+
+// XMM Registers, used by the various SSE instruction set extensions.
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
+def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>;
+def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>;
+def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>;
+def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>;
+def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>;
+def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
+def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
+
+// X86-64 only
+let CostPerUse = 1 in {
+def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>;
+def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>;
+def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
+def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>;
+def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
+def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
+def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
+def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+
+def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
+def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
+def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
+def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
+def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
+def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
+def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
+def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
+def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
+def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
+def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
+def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
+def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
+def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
+def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
+def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+
+} // CostPerUse
+
+// YMM0-15 registers, used by AVX instructions and
+// YMM16-31 registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_xmm] in {
+ foreach Index = 0-31 in {
+ def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
+}
+
+// ZMM Registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_ymm] in {
+ foreach Index = 0-31 in {
+ def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
+}
+
+// Mask Registers, used by AVX-512 instructions.
+def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, 93, 93]>;
+def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, 94, 94]>;
+def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, 95, 95]>;
+def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, 96, 96]>;
+def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, 97, 97]>;
+def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, 98, 98]>;
+def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, 99, 99]>;
+def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, 100, 100]>;
+
+// Floating point stack registers. These don't map one-to-one to the FP
+// pseudo registers, but we still mark them as aliasing FP registers. That
+// way both kinds can be live without exceeding the stack depth. ST registers
+// are only live around inline assembly.
+def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
+
+// Floating-point status word
+def FPSW : X86Reg<"fpsw", 0>;
+
+// Status flags register
+def EFLAGS : X86Reg<"flags", 0>;
+
+// Segment registers
+def CS : X86Reg<"cs", 1>;
+def DS : X86Reg<"ds", 3>;
+def SS : X86Reg<"ss", 2>;
+def ES : X86Reg<"es", 0>;
+def FS : X86Reg<"fs", 4>;
+def GS : X86Reg<"gs", 5>;
+
+// Debug registers
+def DR0 : X86Reg<"dr0", 0>;
+def DR1 : X86Reg<"dr1", 1>;
+def DR2 : X86Reg<"dr2", 2>;
+def DR3 : X86Reg<"dr3", 3>;
+def DR4 : X86Reg<"dr4", 4>;
+def DR5 : X86Reg<"dr5", 5>;
+def DR6 : X86Reg<"dr6", 6>;
+def DR7 : X86Reg<"dr7", 7>;
+def DR8 : X86Reg<"dr8", 8>;
+def DR9 : X86Reg<"dr9", 9>;
+def DR10 : X86Reg<"dr10", 10>;
+def DR11 : X86Reg<"dr11", 11>;
+def DR12 : X86Reg<"dr12", 12>;
+def DR13 : X86Reg<"dr13", 13>;
+def DR14 : X86Reg<"dr14", 14>;
+def DR15 : X86Reg<"dr15", 15>;
+
+// Control registers
+def CR0 : X86Reg<"cr0", 0>;
+def CR1 : X86Reg<"cr1", 1>;
+def CR2 : X86Reg<"cr2", 2>;
+def CR3 : X86Reg<"cr3", 3>;
+def CR4 : X86Reg<"cr4", 4>;
+def CR5 : X86Reg<"cr5", 5>;
+def CR6 : X86Reg<"cr6", 6>;
+def CR7 : X86Reg<"cr7", 7>;
+def CR8 : X86Reg<"cr8", 8>;
+def CR9 : X86Reg<"cr9", 9>;
+def CR10 : X86Reg<"cr10", 10>;
+def CR11 : X86Reg<"cr11", 11>;
+def CR12 : X86Reg<"cr12", 12>;
+def CR13 : X86Reg<"cr13", 13>;
+def CR14 : X86Reg<"cr14", 14>;
+def CR15 : X86Reg<"cr15", 15>;
+
+// Pseudo index registers
+def EIZ : X86Reg<"eiz", 4>;
+def RIZ : X86Reg<"riz", 4>;
+
+// Bound registers, used in MPX instructions
+def BND0 : X86Reg<"bnd0", 0>;
+def BND1 : X86Reg<"bnd1", 1>;
+def BND2 : X86Reg<"bnd2", 2>;
+def BND3 : X86Reg<"bnd3", 3>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes. The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-bit mode, there are 12 additional i8 registers: SIL, DIL, BPL, SPL,
+// and R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8], 8,
+ (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+ R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
+ let AltOrders = [(sub GR8, AH, BH, CH, DH)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<X86Subtarget>().is64Bit();
+ }];
+}
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+ (add AX, CX, DX, SI, DI, BX, BP, SP,
+ R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
+
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
+// FIXME: it *does* cause trouble - CheckBaseRegAndIndexReg() has extra
+// tests because of the inclusion of RIP in this register class.
+def GR64 : RegisterClass<"X86", [i64], 64,
+ (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
+
+// Segment registers for use by MOV instructions (and others) that have a
+// segment register as one operand. Always contain a 16-bit segment
+// descriptor.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>;
+
+// Debug registers.
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>;
+
+// Control registers.
+def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
+def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
+def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
+ R8, R9, R11, RIP)>;
+def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
+ R8, R9, R10, R11, RIP)>;
+
+// GR8_NOREX - GR8 registers which do not require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+ (add AL, CL, DL, AH, CH, DH, BL, BH)> {
+ let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<X86Subtarget>().is64Bit();
+ }];
+}
+// GR16_NOREX - GR16 registers which do not require a REX prefix.
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+ (add AX, CX, DX, SI, DI, BX, BP, SP)>;
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>;
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+ (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
+
+// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
+// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
+// to clear upper 32-bits of RAX so is not a NOP.
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
+
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>;
+
+// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
+// ESP.
+def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
+ (and GR32_NOREX, GR32_NOSP)>;
+
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+ (and GR64_NOREX, GR64_NOSP)>;
+
+// Register classes used for ABIs that use 32-bit address accesses,
+// while using the whole x86_64 ISA.
+
+// In such cases, it is fine to use RIP as we are sure the 32 high
+// bits are not set. We do not need variants for NOSP as RIP is not
+// allowed there.
+// RIP is not spilled anywhere for now, so stick to 32-bit alignment
+// to save on memory space.
+// FIXME: We could allow all 64bit registers, but we would need
+// something to check that the 32 high bits are not set,
+// which we do not have right now.
+def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
+
+// When RBP is used as a base pointer in a 32-bit address environment,
+// it is also safe to use the full register to access addresses.
+// Since RBP will never be spilled, stick to 32-bit alignment to save
+// on memory consumption.
+def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32,
+ (add LOW32_ADDR_ACCESS, RBP)>;
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
+
+def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+
+def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>;
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values. This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware. In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>;
+def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>;
+def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
+ let isAllocatable = 0;
+}
+
+// Generic vector registers: VR64 and VR128.
+// Ensure that float types are declared first - only float is legal on SSE1.
+def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
+def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+ 128, (add FR32)>;
+def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 0, 15)>;
+
+// Special classes that help the assembly parser choose some alternate
+// instructions to favor 2-byte VEX encodings.
+def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+ 128, (sequence "XMM%u", 0, 7)>;
+def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+ 128, (sequence "XMM%u", 8, 15)>;
+def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 0, 7)>;
+def VR256H : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 8, 15)>;
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+
+// AVX-512 vector/mask registers.
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+ 512, (sequence "ZMM%u", 0, 31)>;
+
+// Scalar AVX-512 floating point registers.
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+
+// Extended VR128 and VR256 for AVX-512 instructions
+def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+ 128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 0, 31)>;
+
+// Mask registers
+def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
+def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
+def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
+def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
+def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
+def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
+
+def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
+def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
+def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
+def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
+def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
+def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
+def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
+
+// Bound registers
+def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
new file mode 100644
index 000000000000..677e82459766
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -0,0 +1,2147 @@
+//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Haswell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+// Top-level machine model record for Haswell. The resource and latency
+// definitions in this file attach to it via "let SchedModel = HaswellModel".
+def HaswellModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+ // instructions per cycle.
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
+ // Same 4 cycles this file uses for WriteLoad and for ReadAfterLd.
+ let LoadLatency = 4;
+ let MispredictPenalty = 16;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = HaswellModel in {
+
+// Haswell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def HWPort0 : ProcResource<1>;
+def HWPort1 : ProcResource<1>;
+def HWPort2 : ProcResource<1>;
+def HWPort3 : ProcResource<1>;
+def HWPort4 : ProcResource<1>;
+def HWPort5 : ProcResource<1>;
+def HWPort6 : ProcResource<1>;
+def HWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def HWPort01 : ProcResGroup<[HWPort0, HWPort1]>;
+def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>;
+def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
+def HWPort04 : ProcResGroup<[HWPort0, HWPort4]>;
+def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>;
+def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>;
+def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>;
+def HWPort16 : ProcResGroup<[HWPort1, HWPort6]>;
+def HWPort56 : ProcResGroup<[HWPort5, HWPort6]>;
+def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
+def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>;
+def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
+
+// 60 Entry Unified Scheduler
+def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
+ HWPort5, HWPort6, HWPort7]> {
+ let BufferSize=60;
+}
+
+// Integer division issued on port 0.
+def HWDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort, int Lat> {
+ // Register form: occupies ExePort for one cycle; result is ready after Lat
+ // cycles.
+ def : WriteRes<SchedRW, [ExePort]> {
+ let Latency = Lat;
+ }
+
+ // Folded-load form: additionally occupies a load port (2/3) for one cycle
+ // and pays four more cycles of latency for the load.
+ def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> { let Latency = !add(Lat, 4); }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort4]>;
+
+// Store_addr on 237.
+// Store_data on 4.
+def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove, [HWPort0156]>;
+def : WriteRes<WriteZero, []>;
+
+defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
+defm : HWWriteResPair<WriteIMul, HWPort1, 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : HWWriteResPair<WriteShift, HWPort06, 1>;
+defm : HWWriteResPair<WriteJump, HWPort06, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [HWPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+// ResourceCycles entries line up positionally with the resource list: one
+// cycle on port 0 and ten on the divider.
+def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+// Folded-load form: one extra cycle on the load ports (2/3) and 4 more cycles
+// of latency than the register form.
+def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
+defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
+defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
+defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
+defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
+defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
+defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
+defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
+defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
+defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>;
+defm : HWWriteResPair<WriteFBlend, HWPort015, 1>;
+defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>;
+
+def : WriteRes<WriteFVarBlend, [HWPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+// Vector integer operations.
+defm : HWWriteResPair<WriteVecShift, HWPort0, 1>;
+defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
+defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
+defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>;
+defm : HWWriteResPair<WriteShuffle, HWPort5, 1>;
+defm : HWWriteResPair<WriteBlend, HWPort15, 1>;
+defm : HWWriteResPair<WriteShuffle256, HWPort5, 3>;
+
+def : WriteRes<WriteVarBlend, [HWPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1, 1];
+}
+
+def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 2];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [HWPort0]> {
+ let Latency = 10;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort16, HWPort5]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 2, 4];
+}
+def : WriteRes<WritePCmpEStrMLd, [HWPort05, HWPort16, HWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [6, 2, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [HWPort0]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [HWPort05, HWPort16]> {
+ let Latency = 11;
+ let ResourceCycles = [6, 2];
+}
+def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort16, HWPort5, HWPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 2, 2, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [HWPort5]> {
+ let Latency = 7;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteAESIMC, [HWPort5]> {
+ let Latency = 14;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [HWPort0, HWPort5]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 8];
+}
+def : WriteRes<WriteAESKeyGenLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 7, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1, 1];
+}
+
+def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteFence, [HWPort23, HWPort4]>;
+def : WriteRes<WriteNop, []>;
+
+//================ Exceptions ================//
+
+//-- Specific Scheduling Models --//
+
+// Starting with P0.
+def WriteP0 : SchedWriteRes<[HWPort0]>;
+
+def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+
+def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+
+def WriteP01 : SchedWriteRes<[HWPort01]>;
+
+def Write2P01 : SchedWriteRes<[HWPort01]> {
+ let NumMicroOps = 2;
+}
+def Write3P01 : SchedWriteRes<[HWPort01]> {
+ let NumMicroOps = 3;
+}
+
+def WriteP015 : SchedWriteRes<[HWPort015]>;
+
+def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> {
+ let NumMicroOps = 2;
+}
+def WriteP06 : SchedWriteRes<[HWPort06]>;
+
+def Write2P06 : SchedWriteRes<[HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+
+def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+
+def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+ let NumMicroOps = 2;
+}
+
+def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+
+def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+def Write5P0156 : SchedWriteRes<[HWPort0156]> {
+ let NumMicroOps = 5;
+ let ResourceCycles = [5];
+}
+
+def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 2, 1];
+}
+
+def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+ let Latency = 1;
+ let ResourceCycles = [2, 2, 1];
+}
+
+def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+ let Latency = 1;
+ let ResourceCycles = [3, 2, 1];
+}
+
+// Starting with P1.
+// Single-port (port 1) writes, with and without a folded load on port 2/3.
+def WriteP1 : SchedWriteRes<[HWPort1]>;
+
+def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> { let NumMicroOps = 2; }
+def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> { let Latency = 3; }
+def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> { let Latency = 7; }
+
+// Two cycles on port 1, optionally followed by one cycle on a load port.
+def Write2P1 : SchedWriteRes<[HWPort1]> {
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+
+// Either of ports 1/5, with and without a folded load.
+def WriteP15 : SchedWriteRes<[HWPort15]>;
+def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> { let Latency = 4; }
+
+def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+
+def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+
+def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+
+def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+
+// Starting with P2.
+def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> {
+ let Latency = 1;
+ let ResourceCycles = [2, 1];
+}
+
+// Starting with P5.
+def WriteP5 : SchedWriteRes<[HWPort5]>;
+def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+
+// Notation:
+// - r: register.
+// - mm: 64 bit mmx register.
+// - x: 128 bit xmm register.
+// - (x)mm: mmx or xmm register.
+// - y: 256 bit ymm register.
+// - v: any vector register.
+// - m: memory.
+
+//=== Integer Instructions ===//
+//-- Move instructions --//
+
+// MOV.
+// r16,m.
+def : InstRW<[WriteALULd], (instregex "MOV16rm")>;
+
+// MOVSX, MOVZX.
+// r,m.
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
+
+// CMOVcc.
+// r,r.
+def : InstRW<[Write2P0156_Lat2],
+ (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd],
+ (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
+
+// XCHG.
+// r,r.
+def WriteXCHG : SchedWriteRes<[HWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [3];
+}
+
+def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+
+// r,m.
+def WriteXCHGrm : SchedWriteRes<[]> {
+ let Latency = 21;
+ let NumMicroOps = 8;
+}
+def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>;
+
+// XLAT.
+def WriteXLAT : SchedWriteRes<[]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteXLAT], (instregex "XLAT")>;
+
+// PUSH.
+// m.
+def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>;
+
+// PUSHF.
+def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> {
+ let NumMicroOps = 4;
+}
+def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>;
+
+// PUSHA.
+def WritePushA : SchedWriteRes<[]> {
+ let NumMicroOps = 19;
+}
+def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>;
+
+// POP.
+// m.
+def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>;
+
+// POPF.
+def WritePopF : SchedWriteRes<[]> {
+ let NumMicroOps = 9;
+}
+def : InstRW<[WritePopF], (instregex "POPF(16|32)")>;
+
+// POPA.
+def WritePopA : SchedWriteRes<[]> {
+ let NumMicroOps = 18;
+}
+def : InstRW<[WritePopA], (instregex "POPA(16|32)")>;
+
+// LAHF SAHF.
+def : InstRW<[WriteP06], (instregex "(S|L)AHF")>;
+
+// BSWAP.
+// r32.
+def WriteBSwap32 : SchedWriteRes<[HWPort15]>;
+def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>;
+
+// r64.
+def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>;
+
+// MOVBE.
+// r16,m16 / r64,m64.
+def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>;
+
+// r32, m32.
+def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>;
+
+// m16,r16.
+def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>;
+
+// m32,r32.
+def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>;
+
+// m64,r64.
+def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> {
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>;
+
+//-- Arithmetic instructions --//
+
+// ADD SUB.
+// m,r/i.
+def : InstRW<[Write2P0156_2P237_P4],
+ (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
+ "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>;
+
+// ADC SBB.
+// r,r/i.
+def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
+ "(ADC|SBB)(16|32|64)ri8",
+ "(ADC|SBB)64ri32",
+ "(ADC|SBB)(8|16|32|64)rr_REV")>;
+
+// r,m.
+def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>;
+
+// m,r/i.
+def : InstRW<[Write3P0156_2P237_P4],
+ (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
+ "(ADC|SBB)(16|32|64)mi8",
+ "(ADC|SBB)64mi32")>;
+
+// INC DEC NOT NEG.
+// m.
+def : InstRW<[WriteP0156_2P237_P4],
+ (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
+ "(INC|DEC)64(16|32)m")>;
+
+// MUL IMUL.
+// r16.
+def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>;
+
+// m16.
+def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>;
+
+// r32.
+def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>;
+
+// m32.
+def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>;
+
+// r64.
+def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>;
+
+// m64.
+def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>;
+
+// r16,r16.
+def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
+
+// r16,m16.
+def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>;
+
+// MULX.
+// r32,r32,r32.
+def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteMulX32], (instregex "MULX32rr")>;
+
+// r32,r32,m32.
+def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>;
+
+// r64,r64,r64.
+def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteMulX64], (instregex "MULX64rr")>;
+
+// r64,r64,m64.
+def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>;
+
+// DIV.
+// r8.
+def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 22;
+ let NumMicroOps = 9;
+}
+def : InstRW<[WriteDiv8], (instregex "DIV8r")>;
+
+// r16.
+def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 23;
+ let NumMicroOps = 10;
+}
+def : InstRW<[WriteDiv16], (instregex "DIV16r")>;
+
+// r32.
+def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 22;
+ let NumMicroOps = 10;
+}
+def : InstRW<[WriteDiv32], (instregex "DIV32r")>;
+
+// r64.
+def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 32;
+ let NumMicroOps = 36;
+}
+def : InstRW<[WriteDiv64], (instregex "DIV64r")>;
+
+// IDIV.
+// r8.
+def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 23;
+ let NumMicroOps = 9;
+}
+def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>;
+
+// r16.
+def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 23;
+ let NumMicroOps = 10;
+}
+def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>;
+
+// r32.
+def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 22;
+ let NumMicroOps = 9;
+}
+def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>;
+
+// r64.
+def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 39;
+ let NumMicroOps = 59;
+}
+def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>;
+
+//-- Logic instructions --//
+
+// AND OR XOR.
+// m,r/i.
+def : InstRW<[Write2P0156_2P237_P4],
+ (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
+ "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
+
+// SHR SHL SAR.
+// m,i.
+def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
+
+// r,cl.
+def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>;
+
+// m,cl.
+def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> {
+ let NumMicroOps = 6;
+ let ResourceCycles = [3, 2, 1];
+}
+def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>;
+
+// ROR ROL.
+// r,1.
+def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>;
+
+// m,i.
+def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 2, 1];
+}
+def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>;
+
+// r,cl.
+def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>;
+
+// m,cl.
+def WriteRotateRMWCL : SchedWriteRes<[]> {
+ let NumMicroOps = 6;
+}
+def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>;
+
+// RCR RCL.
+// r,1.
+def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>;
+
+// m,1.
+def WriteRCm1 : SchedWriteRes<[]> {
+ let NumMicroOps = 6;
+}
+def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>;
+
+// r,i.
+def WriteRCri : SchedWriteRes<[HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 8;
+}
+def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
+
+// m,i.
+def WriteRCmi : SchedWriteRes<[]> {
+ let NumMicroOps = 11;
+}
+def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
+
+// SHRD SHLD.
+// r,r,i.
+def WriteShDrr : SchedWriteRes<[HWPort1]> {
+ let Latency = 3;
+}
+def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>;
+
+// m,r,i.
+def WriteShDmr : SchedWriteRes<[]> {
+ let NumMicroOps = 5;
+}
+def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>;
+
+// r,r,cl.
+def WriteShlDCL : SchedWriteRes<[HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>;
+
+// r,r,cl.
+def WriteShrDCL : SchedWriteRes<[HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>;
+
+// m,r,cl.
+def WriteShDmrCL : SchedWriteRes<[]> {
+ let NumMicroOps = 7;
+}
+def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>;
+
+// BT.
+// r,r/i.
+def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
+
+// m,r.
+def WriteBTmr : SchedWriteRes<[]> {
+ let NumMicroOps = 10;
+}
+def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>;
+
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
+
+// BTR BTS BTC.
+// r,r/i.
+def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
+
+// m,r.
+def WriteBTRSCmr : SchedWriteRes<[]> {
+ let NumMicroOps = 11;
+}
+def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
+
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>;
+
+// BSF BSR.
+// r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>;
+
+// SETcc.
+// r.
+def : InstRW<[WriteShift],
+ (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
+// m.
+def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteSetCCm],
+ (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
+
+// CLD STD.
+def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>;
+
+// LZCNT TZCNT.
+// Both mnemonics share the "ZCNT" suffix, so the alternation must cover only
+// the leading letter; "(L|TZCNT)" would match "L..." or "TZCNT..." and leave
+// every LZCNT opcode on the default model.
+// r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "(L|T)ZCNT(16|32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|T)ZCNT(16|32|64)rm")>;
+
+// ANDN.
+// r,r.
+def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>;
+
+// BLSI BLSMSK BLSR.
+// r,r.
+def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>;
+
+// BEXTR.
+// r,r,r.
+def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>;
+// r,m,r.
+def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>;
+
+// BZHI.
+// r,r,r.
+def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>;
+
+// PDEP PEXT.
+// r,r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
+
+//-- Control transfer instructions --//
+
+// J(E|R)CXZ.
+def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
+
+// LOOP.
+def WriteLOOP : SchedWriteRes<[]> {
+ let NumMicroOps = 7;
+}
+def : InstRW<[WriteLOOP], (instregex "LOOP")>;
+
+// LOOP(N)E
+def WriteLOOPE : SchedWriteRes<[]> {
+ let NumMicroOps = 11;
+}
+def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>;
+
+// CALL.
+// r.
+def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>;
+
+// m.
+def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>;
+
+// RET.
+def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>;
+
+// i.
+def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
+
+// BOUND.
+// r,m.
+def WriteBOUND : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>;
+
+// INTO.
+def WriteINTO : SchedWriteRes<[]> {
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteINTO], (instregex "INTO")>;
+
+//-- String instructions --//
+
+// LODSB/W.
+def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>;
+
+// STOS.
+def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>;
+
+// MOVS.
+def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 1, 2];
+}
+def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>;
+
+// SCAS.
+def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>;
+
+// CMPS.
+def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 3];
+}
+def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
+
+//-- Synchronization instructions --//
+
+// XADD.
+def WriteXADD : SchedWriteRes<[]> {
+ let NumMicroOps = 5;
+}
+def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>;
+
+// CMPXCHG.
+def WriteCMPXCHG : SchedWriteRes<[]> {
+ let NumMicroOps = 6;
+}
+def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
+
+// CMPXCHG8B.
+def WriteCMPXCHG8B : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
+
+// CMPXCHG16B.
+def WriteCMPXCHG16B : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>;
+
+//-- Other --//
+
+// PAUSE.
+def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> {
+ let NumMicroOps = 5;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[WritePAUSE], (instregex "PAUSE")>;
+
+// LEAVE.
+def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>;
+
+// XGETBV.
+def WriteXGETBV : SchedWriteRes<[]> {
+ let NumMicroOps = 8;
+}
+def : InstRW<[WriteXGETBV], (instregex "XGETBV")>;
+
+// RDTSC.
+def WriteRDTSC : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteRDTSC], (instregex "RDTSC")>;
+
+// RDPMC.
+def WriteRDPMC : SchedWriteRes<[]> {
+ let NumMicroOps = 34;
+}
+def : InstRW<[WriteRDPMC], (instregex "RDPMC")>;
+
+// RDRAND.
+def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
+ let NumMicroOps = 17;
+ let ResourceCycles = [1, 16];
+}
+def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+// FLD.
+// r.
+def : InstRW<[WriteP01], (instregex "LD_Frr")>;
+
+// m80.
+// Two cycles on ports 0/1 and two on the load ports (2/3), 4 micro-ops total.
+def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 2];
+}
+def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>;
+
+// FBLD.
+// m80.
+def WriteFBLD : SchedWriteRes<[]> {
+ let Latency = 47;
+ let NumMicroOps = 43;
+}
+def : InstRW<[WriteFBLD], (instregex "FBLDm")>;
+
+// FST(P).
+// r.
+def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>;
+
+// m80.
+def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> {
+ let NumMicroOps = 7;
+ let ResourceCycles = [3, 2, 2];
+}
+def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>;
+
+// FBSTP.
+// m80.
+def WriteFBSTP : SchedWriteRes<[]> {
+ let NumMicroOps = 226;
+}
+def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>;
+
+// FXCHG.
+def : InstRW<[WriteNop], (instregex "XCH_F")>;
+
+// FILD.
+def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>;
+
+// FIST(P) FISTTP.
+// One micro-op each on port 1, a load/store-address port (2/3) and the
+// store-data port (4).
+def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+// The non-popping FIST only has 16/32-bit memory forms. The popping FISTP and
+// the truncating FISTTP (opcode prefix "ISTT_FP") also have a 64-bit form, so
+// they are listed separately instead of sharing the "(16|32)" suffix.
+// NOTE(review): opcode names IST_FP64m / ISTT_FP(16|32|64)m should be
+// confirmed against X86InstrFPStack.td for this LLVM revision.
+def : InstRW<[WriteFIST], (instregex "IST_F(16|32)m",
+ "IST_FP(16|32|64)m",
+ "ISTT_FP(16|32|64)m")>;
+
+// FLDZ.
+def : InstRW<[WriteP01], (instregex "LD_F0")>;
+
+// FLD1.
+def : InstRW<[Write2P01], (instregex "LD_F1")>;
+
+// FLDPI FLDL2E etc.
+// Each pattern must be its own comma-separated operand: TableGen concatenates
+// adjacent string literals, which would fuse two patterns into a single regex
+// that matches no opcode.
+def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)", "FLDL(G|N)2")>;
+
+// FCMOVcc.
+// Two cycles on port 0 plus one on port 5, 3 micro-ops, 2-cycle latency.
+def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+// Covers all eight x87 conditions; "E" (FCMOVE) is included so that CMOVE_F
+// does not fall back to the default model.
+def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|E|P|NB|NBE|NE|NP)_F")>;
+
+// FNSTSW.
+// AX.
+def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>;
+
+// m16.
+def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>;
+
+// FLDCW.
+def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>;
+
+// FNSTCW.
+def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>;
+
+// FINCSTP FDECSTP.
+def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>;
+
+// FFREE.
+def : InstRW<[WriteP01], (instregex "FFREE")>;
+
+// FNSAVE.
+def WriteFNSAVE : SchedWriteRes<[]> {
+ let NumMicroOps = 147;
+}
+def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>;
+
+// FRSTOR.
+def WriteFRSTOR : SchedWriteRes<[]> {
+ let NumMicroOps = 90;
+}
+def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>;
+
+//-- Arithmetic instructions --//
+
+// FABS.
+def : InstRW<[WriteP0], (instregex "ABS_F")>;
+
+// FCHS.
+def : InstRW<[WriteP0], (instregex "CHS_F")>;
+
+// FCOM(P) FUCOM(P).
+// r.
+def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
+ "UCOM_FPr")>;
+// m.
+def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
+
+// FCOMI(P) FUCOMI(P).
+// r.
+def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
+ "UCOM_FIPr")>;
+
+// FICOM(P).
+def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
+
+// FTST.
+def : InstRW<[WriteP1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[Write2P1], (instregex "FXAM")>;
+
+// FPREM.
+def WriteFPREM : SchedWriteRes<[]> {
+ let Latency = 19;
+ let NumMicroOps = 28;
+}
+def : InstRW<[WriteFPREM], (instregex "FPREM")>;
+
+// FPREM1.
+def WriteFPREM1 : SchedWriteRes<[]> {
+ let Latency = 27;
+ let NumMicroOps = 41;
+}
+def : InstRW<[WriteFPREM1], (instregex "FPREM1")>;
+
+// FRNDINT.
+def WriteFRNDINT : SchedWriteRes<[]> {
+ let Latency = 11;
+ let NumMicroOps = 17;
+}
+def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>;
+
+//-- Math instructions --//
+
+// FSCALE.
+def WriteFSCALE : SchedWriteRes<[]> {
+ let Latency = 75; // 49-125
+ let NumMicroOps = 50; // 25-75
+}
+def : InstRW<[WriteFSCALE], (instregex "FSCALE")>;
+
+// FXTRACT.
+def WriteFXTRACT : SchedWriteRes<[]> {
+ let Latency = 15;
+ let NumMicroOps = 17;
+}
+def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>;
+
+//-- Other instructions --//
+
+// FNOP.
+def : InstRW<[WriteP01], (instregex "FNOP")>;
+
+// WAIT.
+def : InstRW<[Write2P01], (instregex "WAIT")>;
+
+// FNCLEX.
+def : InstRW<[Write5P0156], (instregex "FNCLEX")>;
+
+// FNINIT.
+def WriteFNINIT : SchedWriteRes<[]> {
+ let NumMicroOps = 26;
+}
+def : InstRW<[WriteFNINIT], (instregex "FNINIT")>;
+
+//=== Integer MMX and XMM Instructions ===//
+//-- Move instructions --//
+
+// MOVD.
+// r32/64 <- (x)mm.
+def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
+ "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
+
+// (x)mm <- r32/64.
+def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
+ "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
+
+// MOVQ.
+// r64 <- (x)mm.
+def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>;
+
+// (x)mm <- r64.
+def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
+
+// (x)mm <- (x)mm.
+def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>;
+
+// (V)MOVDQA/U.
+// x <- x.
+def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
+ "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV",
+ "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
+
+// MOVDQ2Q.
+def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>;
+
+// MOVQ2DQ.
+def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>;
+
+
+// PACKSSWB/DW.
+// mm <- mm.
+def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
+
+// mm <- m64.
+def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+
+// VPMOVSX/ZX BW BD BQ DW DQ.
+// y <- x.
+def WriteVPMOVSX : SchedWriteRes<[HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+}
+def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>;
+
+// PBLENDW.
+// x,x,i / v,v,v,i
+def WritePBLENDWr : SchedWriteRes<[HWPort5]>;
+def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>;
+
+// Memory-source forms: x,m,i / v,v,m,i.
+def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> {
+ let Latency = 4;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>;
+
+// VPBLENDD.
+// v,v,v,i.
+def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>;
+def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>;
+
+// Memory-source form: v,v,m,i.
+def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> {
+ let Latency = 4;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>;
+
+// MASKMOVQ.
+def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 2];
+}
+def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>;
+
+// MASKMOVDQU.
+def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4, 2, 4];
+}
+def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>;
+
+// VPMASKMOV D/Q.
+// v,v,m.
+def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVPMASKMOVr, ReadAfterLd],
+ (instregex "VPMASKMOV(D|Q)(Y?)rm")>;
+
+// m, v,v.
+def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+
+// PMOVMSKB.
+def WritePMOVMSKB : SchedWriteRes<[HWPort0]> {
+ let Latency = 3;
+}
+def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>;
+
+// PEXTR B/W/D/Q.
+// r32,x,i.
+def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
+
+// m8,x,i.
+def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> {
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
+
+// VPBROADCAST B/W.
+// x, m8/16.
+def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd],
+ (instregex "VPBROADCAST(B|W)rm")>;
+
+// y, m8/16
+def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd],
+ (instregex "VPBROADCAST(B|W)Yrm")>;
+
+// VPGATHERDD.
+// x.
+def WriteVPGATHERDD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>;
+
+// y.
+def WriteVPGATHERDD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 34;
+}
+def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>;
+
+// VPGATHERQD.
+// x.
+def WriteVPGATHERQD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>;
+
+// y.
+def WriteVPGATHERQD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>;
+
+// VPGATHERDQ.
+// x.
+def WriteVPGATHERDQ128 : SchedWriteRes<[]> {
+ let NumMicroOps = 12;
+}
+def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>;
+
+// y.
+def WriteVPGATHERDQ256 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>;
+
+// VPGATHERQQ.
+// x.
+def WriteVPGATHERQQ128 : SchedWriteRes<[]> {
+ let NumMicroOps = 14;
+}
+def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>;
+
+// y.
+def WriteVPGATHERQQ256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>;
+
+//-- Arithmetic instructions --//
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64",
+ "MMX_PHADDSWrr64",
+ "MMX_PHSUB(W|D)rr64",
+ "MMX_PHSUBSWrr64",
+ "(V?)PH(ADD|SUB)(W|D)(Y?)rr",
+ "(V?)PH(ADD|SUB)SWrr(256)?")>;
+
+// v <- v,m.
+def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WritePHADDSUBm, ReadAfterLd],
+ (instregex "MMX_PHADD(W?)rm64",
+ "MMX_PHADDSWrm64",
+ "MMX_PHSUB(W|D)rm64",
+ "MMX_PHSUBSWrm64",
+ "(V?)PH(ADD|SUB)(W|D)(Y?)rm",
+ "(V?)PH(ADD|SUB)SWrm(128|256)?")>;
+
+// PCMPGTQ.
+// v <- v,v.
+def WritePCMPGTQr : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+
+// v <- v,m.
+def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>;
+
+// PMULLD.
+// x,x / y,y,y.
+def WritePMULLDr : SchedWriteRes<[HWPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>;
+
+// x,m / y,y,m.
+def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>;
+
+//-- Logic instructions --//
+
+// PTEST.
+// v,v.
+def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
+
+// v,m. Folded-load form: the register variant's ports plus one HWPort23 cycle.
+def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WritePTESTm], (instregex "(V?)PTEST(Y?)rm")>;
+
+// PSLL,PSRL,PSRA W/D/Q.
+// x,x / v,v,x.
+def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>;
+
+// PSLL,PSRL DQ.
+def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>;
+
+//-- Other --//
+
+// EMMS.
+def WriteEMMS : SchedWriteRes<[]> {
+ let Latency = 13;
+ let NumMicroOps = 31;
+}
+def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+//-- Move instructions --//
+
+// MOVMSKP S/D.
+// r32 <- x.
+def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> {
+ let Latency = 3;
+}
+def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>;
+
+// r32 <- y.
+def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> {
+ let Latency = 2;
+}
+def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>;
+
+// VPERM2F128.
+def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>;
+def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>;
+
+// BLENDVP S/D.
+def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>;
+def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
+
+// VBROADCASTF128.
+def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>;
+
+// EXTRACTPS.
+// r32,x,i.
+def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+
+// m32,x,i.
+def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+
+// VEXTRACTF128.
+// x,y,i.
+def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>;
+
+// m128,y,i.
+def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>;
+
+// VINSERTF128.
+// y,y,x,i.
+def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>;
+
+// y,y,m128,i. Folded-load form: one HWPort015 cycle plus one HWPort23 cycle.
+def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVINSERTF128m, ReadAfterLd], (instregex "VINSERTF128rm")>;
+
+// VMASKMOVP S/D.
+// v,v,m.
+def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>;
+
+// m128,x,x.
+def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>;
+
+// m256,y,y.
+def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
+
+// VGATHERDPS.
+// x.
+def WriteVGATHERDPS128 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>;
+
+// y.
+def WriteVGATHERDPS256 : SchedWriteRes<[]> {
+ let NumMicroOps = 34;
+}
+def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>;
+
+// VGATHERQPS.
+// x.
+def WriteVGATHERQPS128 : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>;
+
+// y.
+def WriteVGATHERQPS256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>;
+
+// VGATHERDPD.
+// x.
+def WriteVGATHERDPD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 12;
+}
+def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>;
+
+// y.
+def WriteVGATHERDPD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>;
+
+// VGATHERQPD.
+// x.
+def WriteVGATHERQPD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 14;
+}
+def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>;
+
+// y.
+def WriteVGATHERQPD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>;
+
+//-- Conversion instructions --//
+
+// CVTPD2PS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>;
+
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+
+// x,y.
+def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+
+// x,m256.
+def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+
+// CVTSD2SS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+
+// x,m64.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+
+// CVTPS2PD.
+// x,x.
+def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+
+// x,m64.
+// y,m128.
+def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+
+// y,x.
+def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+
+// CVTSS2SD.
+// x,x.
+def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+
+// x,m32.
+def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
+
+// y,x.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
+
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// x,y.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
+
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+// CVTSI2SS.
+// x,r32.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
+
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+
+// CVTSI2SD (integer to scalar double; the SS forms are modeled separately).
+// x,r32/64.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SD(64)?rr")>;
+
+// CVTSD2SI.
+// r32/64
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32/64,m64.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
+// m,v,i.
+def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
+
+// VCVTPH2PS.
+// v,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+
+// x,m / v,v,m.
+def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+
+// MUL SS/SD PS/PD.
+// x,x / v,v,v.
+def WriteMULr : SchedWriteRes<[HWPort01]> {
+ let Latency = 5;
+}
+def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+
+// x,m / v,v,m.
+def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+
+// VDIVPS.
+// y,y,y.
+def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 19; // 18-21 cycles.
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+
+// y,y,m256.
+def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 23; // 18-21 + 4 cycles.
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+
+// VDIVPD.
+// y,y,y.
+def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 27; // 19-35 cycles.
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>;
+
+// y,y,m256.
+def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 31; // 19-35 + 4 cycles.
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>;
+
+// VRCPPS.
+// y,y.
+def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+
+// y,m256.
+def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>;
+
+// ROUND SS/SD PS/PD.
+// v,v,i.
+def WriteROUNDr : SchedWriteRes<[HWPort1]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+
+// v,m,i.
+def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>;
+
+// x,m,i / v,v,m,i.
+def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> {
+ let Latency = 18;
+ let NumMicroOps = 6;
+ let ResourceCycles = [2, 1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>;
+
+// DPPD.
+// x,x,i.
+def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>;
+
+// x,m,i.
+def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>;
+
+// VFMADD.
+// v,v,v.
+def WriteFMADDr : SchedWriteRes<[HWPort01]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+def : InstRW<[WriteFMADDr],
+ (instregex
+ // 3p forms.
+ "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?",
+ // 3s forms.
+ "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r",
+ // 4s/4s_int forms.
+ "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
+ // 4p forms.
+ "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
+
+// v,v,m.
+def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteFMADDm],
+ (instregex
+ // 3p forms.
+ "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?",
+ // 3s forms.
+ "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m",
+ // 4s/4s_int forms.
+ "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
+ // 4p forms.
+ "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
+
+//-- Math instructions --//
+
+// VSQRTPS.
+// y,y.
+def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 19;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
+
+// y,m256.
+def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 23;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>;
+
+// VSQRTPD.
+// y,y.
+def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 28;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
+
+// y,m256.
+def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 32;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>;
+
+// RSQRT SS/PS.
+// x,x.
+def WriteRSQRTr : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+}
+def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>;
+
+// x,m128.
+def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>;
+
+// RSQRTPS 256.
+// y,y.
+def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+
+// y,m256.
+def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>;
+
+//-- Logic instructions --//
+
+// AND, ANDN, OR, XOR PS/PD.
+// x,x / v,v,v.
+def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
+// x,m / v,v,m.
+def : InstRW<[WriteP5Ld, ReadAfterLd],
+ (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
+
+//-- Other instructions --//
+
+// VZEROUPPER.
+def WriteVZEROUPPER : SchedWriteRes<[]> {
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>;
+
+// VZEROALL.
+def WriteVZEROALL : SchedWriteRes<[]> {
+ let NumMicroOps = 12;
+}
+def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>;
+
+// LDMXCSR.
+def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>;
+
+// STMXCSR.
+def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>;
+
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
new file mode 100644
index 000000000000..eca65c2892b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -0,0 +1,250 @@
+//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Sandy Bridge to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SandyBridgeModel : SchedMachineModel {
+ // FIXME: SSE4 and AVX are unimplemented. Marking the model incomplete lets
+ // the scheduler assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+
+ // Every x86 instruction is treated as one micro-op; SB decodes 4 per cycle.
+ // FIXME: Identify instructions that aren't a single fused micro-op.
+ let IssueWidth = 4;
+ let LoadLatency = 4;
+ let MispredictPenalty = 16;
+ let MicroOpBufferSize = 168; // Based on the reorder buffer.
+
+ // Sized after the LSD (loop-stream detector) queue.
+ let LoopMicroOpBufferSize = 28;
+
+}
+
+let SchedModel = SandyBridgeModel in {
+
+// Sandy Bridge can issue micro-ops to 6 different ports in one cycle.
+
+// Ports 0, 1, and 5 handle all computation.
+def SBPort0 : ProcResource<1>;
+def SBPort1 : ProcResource<1>;
+def SBPort5 : ProcResource<1>;
+
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores.
+def SBPort23 : ProcResource<2>;
+
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+def SBPort4 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>;
+def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>;
+def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
+
+// 54 Entry Unified Scheduler
+def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
+ let BufferSize=54;
+}
+
+// Integer division issued on port 0.
+def SBDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant: SchedRW uses a single cycle on ExePort, latency Lat.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant: SchedRW.Folded also uses a cycle on port 2/3 (SBPort23)
+ // and its latency is Lat + 4.
+ def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
+ let Latency = !add(Lat, 4);
+ }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove, [SBPort015]>;
+def : WriteRes<WriteZero, []>;
+
+defm : SBWriteResPair<WriteALU, SBPort015, 1>;
+defm : SBWriteResPair<WriteIMul, SBPort1, 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : SBWriteResPair<WriteShift, SBPort05, 1>;
+defm : SBWriteResPair<WriteJump, SBPort5, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [SBPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
+defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
+defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
+defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
+defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
+defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
+defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
+defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
+defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
+defm : SBWriteResPair<WriteFShuffle, SBPort5, 1>;
+defm : SBWriteResPair<WriteFBlend, SBPort05, 1>;
+def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1];
+}
+
+// Vector integer operations.
+defm : SBWriteResPair<WriteVecShift, SBPort05, 1>;
+defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>;
+defm : SBWriteResPair<WriteVecALU, SBPort15, 1>;
+defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>;
+defm : SBWriteResPair<WriteShuffle, SBPort15, 1>;
+defm : SBWriteResPair<WriteBlend, SBPort15, 1>;
+def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SBPort015]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SBPort015, SBPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SBPort015]> {
+ let Latency = 11;
+ let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [7, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SBPort015]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> {
+ let Latency = 3;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SBPort015]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
+ let Latency = 4;
+ let ResourceCycles = [7, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> {
+ let Latency = 8;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESIMC, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> {
+ let Latency = 8;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [11];
+}
+def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> {
+ let Latency = 8;
+ let ResourceCycles = [10, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SBPort015]> {
+ let Latency = 14;
+ let ResourceCycles = [18];
+}
+def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [17, 1];
+}
+
+
+def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteFence, [SBPort23, SBPort4]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX2 is not supported on that architecture, but we should define the basic
+// scheduling resources anyway.
+defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>;
+defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>;
+defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td
new file mode 100644
index 000000000000..35257f89100c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Schedule.td
@@ -0,0 +1,661 @@
+//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// InstrSchedModel annotations for out-of-order CPUs.
+//
+// These annotations are independent of the itinerary classes defined below.
+
+// Instructions with folded loads need to read the memory operand immediately,
+// but other register operands don't have to be read until the load is ready.
+// These operands are marked with ReadAfterLd.
+def ReadAfterLd : SchedRead;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+def WriteRMW : SchedWrite;
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: With and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+ // The SchedWrite to use when a load is folded into the instruction.
+ SchedWrite Folded;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair {
+ // Register-Memory operation: the folded-load variant, looked up as NAME#"Ld".
+ def Ld : SchedWrite;
+ // Register-Register operation: the record that carries the defm's own name.
+ def NAME : X86FoldableSchedWrite {
+ let Folded = !cast<SchedWrite>(NAME#"Ld"); // Link to the Ld record above.
+ }
+}
+
+// Arithmetic.
+defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part.
+defm WriteIDiv : X86SchedWritePair; // Integer division.
+def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteMove : SchedWrite;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
+defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
+defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+defm WriteShuffle : X86SchedWritePair; // Vector shuffles.
+defm WriteBlend : X86SchedWritePair; // Vector blends.
+defm WriteVarBlend : X86SchedWritePair; // Vector variable blends.
+defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+
+// Conversion between integer and float.
+defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm WritePCmpIStrM : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Mask
+defm WritePCmpEStrM : X86SchedWritePair;
+// Packed Compare Implicit Length Strings, Return Index
+defm WritePCmpIStrI : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Index
+defm WritePCmpEStrI : X86SchedWritePair;
+
+// AES instructions.
+defm WriteAESDecEnc : X86SchedWritePair; // Decryption, encryption.
+defm WriteAESIMC : X86SchedWritePair; // InvMixColumn.
+defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm WriteCLMul : X86SchedWritePair;
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// AVX2.
+defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+
+// Old microcoded instructions that nobody uses.
+def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def WriteFence : SchedWrite;
+
+// Nop, not very useful except that it provides a model for nops!
+def WriteNop : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for X86
+def IIC_ALU_MEM : InstrItinClass;
+def IIC_ALU_NONMEM : InstrItinClass;
+def IIC_LEA : InstrItinClass;
+def IIC_LEA_16 : InstrItinClass;
+def IIC_MUL8 : InstrItinClass;
+def IIC_MUL16_MEM : InstrItinClass;
+def IIC_MUL16_REG : InstrItinClass;
+def IIC_MUL32_MEM : InstrItinClass;
+def IIC_MUL32_REG : InstrItinClass;
+def IIC_MUL64 : InstrItinClass;
+// imul by al, ax, eax, rax
+def IIC_IMUL8 : InstrItinClass;
+def IIC_IMUL16_MEM : InstrItinClass;
+def IIC_IMUL16_REG : InstrItinClass;
+def IIC_IMUL32_MEM : InstrItinClass;
+def IIC_IMUL32_REG : InstrItinClass;
+def IIC_IMUL64 : InstrItinClass;
+// imul reg by reg|mem
+def IIC_IMUL16_RM : InstrItinClass;
+def IIC_IMUL16_RR : InstrItinClass;
+def IIC_IMUL32_RM : InstrItinClass;
+def IIC_IMUL32_RR : InstrItinClass;
+def IIC_IMUL64_RM : InstrItinClass;
+def IIC_IMUL64_RR : InstrItinClass;
+// imul reg = reg/mem * imm
+def IIC_IMUL16_RMI : InstrItinClass;
+def IIC_IMUL16_RRI : InstrItinClass;
+def IIC_IMUL32_RMI : InstrItinClass;
+def IIC_IMUL32_RRI : InstrItinClass;
+def IIC_IMUL64_RMI : InstrItinClass;
+def IIC_IMUL64_RRI : InstrItinClass;
+// div
+def IIC_DIV8_MEM : InstrItinClass;
+def IIC_DIV8_REG : InstrItinClass;
+def IIC_DIV16 : InstrItinClass;
+def IIC_DIV32 : InstrItinClass;
+def IIC_DIV64 : InstrItinClass;
+// idiv
+def IIC_IDIV8 : InstrItinClass;
+def IIC_IDIV16 : InstrItinClass;
+def IIC_IDIV32 : InstrItinClass;
+def IIC_IDIV64 : InstrItinClass;
+// neg/not/inc/dec
+def IIC_UNARY_REG : InstrItinClass;
+def IIC_UNARY_MEM : InstrItinClass;
+// add/sub/and/or/xor/sbc/cmp/test
+def IIC_BIN_MEM : InstrItinClass;
+def IIC_BIN_NONMEM : InstrItinClass;
+// adc/sbb
+def IIC_BIN_CARRY_MEM : InstrItinClass;
+def IIC_BIN_CARRY_NONMEM : InstrItinClass;
+// shift/rotate
+def IIC_SR : InstrItinClass;
+// shift double
+def IIC_SHD16_REG_IM : InstrItinClass;
+def IIC_SHD16_REG_CL : InstrItinClass;
+def IIC_SHD16_MEM_IM : InstrItinClass;
+def IIC_SHD16_MEM_CL : InstrItinClass;
+def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass;
+def IIC_SHD32_MEM_IM : InstrItinClass;
+def IIC_SHD32_MEM_CL : InstrItinClass;
+def IIC_SHD64_REG_IM : InstrItinClass;
+def IIC_SHD64_REG_CL : InstrItinClass;
+def IIC_SHD64_MEM_IM : InstrItinClass;
+def IIC_SHD64_MEM_CL : InstrItinClass;
+// cmov
+def IIC_CMOV16_RM : InstrItinClass;
+def IIC_CMOV16_RR : InstrItinClass;
+def IIC_CMOV32_RM : InstrItinClass;
+def IIC_CMOV32_RR : InstrItinClass;
+def IIC_CMOV64_RM : InstrItinClass;
+def IIC_CMOV64_RR : InstrItinClass;
+// set
+def IIC_SET_R : InstrItinClass;
+def IIC_SET_M : InstrItinClass;
+// jmp/jcc/jcxz
+def IIC_Jcc : InstrItinClass;
+def IIC_JCXZ : InstrItinClass;
+def IIC_JMP_REL : InstrItinClass;
+def IIC_JMP_REG : InstrItinClass;
+def IIC_JMP_MEM : InstrItinClass;
+def IIC_JMP_FAR_MEM : InstrItinClass;
+def IIC_JMP_FAR_PTR : InstrItinClass;
+// loop
+def IIC_LOOP : InstrItinClass;
+def IIC_LOOPE : InstrItinClass;
+def IIC_LOOPNE : InstrItinClass;
+// call
+def IIC_CALL_RI : InstrItinClass;
+def IIC_CALL_MEM : InstrItinClass;
+def IIC_CALL_FAR_MEM : InstrItinClass;
+def IIC_CALL_FAR_PTR : InstrItinClass;
+// ret
+def IIC_RET : InstrItinClass;
+def IIC_RET_IMM : InstrItinClass;
+//sign extension movs
+def IIC_MOVSX : InstrItinClass;
+def IIC_MOVSX_R16_R8 : InstrItinClass;
+def IIC_MOVSX_R16_M8 : InstrItinClass;
+def IIC_MOVSX_R16_R16 : InstrItinClass;
+def IIC_MOVSX_R32_R32 : InstrItinClass;
+//zero extension movs
+def IIC_MOVZX : InstrItinClass;
+def IIC_MOVZX_R16_R8 : InstrItinClass;
+def IIC_MOVZX_R16_M8 : InstrItinClass;
+
+def IIC_REP_MOVS : InstrItinClass;
+def IIC_REP_STOS : InstrItinClass;
+
+// SSE scalar/parallel binary operations
+def IIC_SSE_ALU_F32S_RR : InstrItinClass;
+def IIC_SSE_ALU_F32S_RM : InstrItinClass;
+def IIC_SSE_ALU_F64S_RR : InstrItinClass;
+def IIC_SSE_ALU_F64S_RM : InstrItinClass;
+def IIC_SSE_MUL_F32S_RR : InstrItinClass;
+def IIC_SSE_MUL_F32S_RM : InstrItinClass;
+def IIC_SSE_MUL_F64S_RR : InstrItinClass;
+def IIC_SSE_MUL_F64S_RM : InstrItinClass;
+def IIC_SSE_DIV_F32S_RR : InstrItinClass;
+def IIC_SSE_DIV_F32S_RM : InstrItinClass;
+def IIC_SSE_DIV_F64S_RR : InstrItinClass;
+def IIC_SSE_DIV_F64S_RM : InstrItinClass;
+def IIC_SSE_ALU_F32P_RR : InstrItinClass;
+def IIC_SSE_ALU_F32P_RM : InstrItinClass;
+def IIC_SSE_ALU_F64P_RR : InstrItinClass;
+def IIC_SSE_ALU_F64P_RM : InstrItinClass;
+def IIC_SSE_MUL_F32P_RR : InstrItinClass;
+def IIC_SSE_MUL_F32P_RM : InstrItinClass;
+def IIC_SSE_MUL_F64P_RR : InstrItinClass;
+def IIC_SSE_MUL_F64P_RM : InstrItinClass;
+def IIC_SSE_DIV_F32P_RR : InstrItinClass;
+def IIC_SSE_DIV_F32P_RM : InstrItinClass;
+def IIC_SSE_DIV_F64P_RR : InstrItinClass;
+def IIC_SSE_DIV_F64P_RM : InstrItinClass;
+
+def IIC_SSE_COMIS_RR : InstrItinClass;
+def IIC_SSE_COMIS_RM : InstrItinClass;
+
+def IIC_SSE_HADDSUB_RR : InstrItinClass;
+def IIC_SSE_HADDSUB_RM : InstrItinClass;
+
+def IIC_SSE_BIT_P_RR : InstrItinClass;
+def IIC_SSE_BIT_P_RM : InstrItinClass;
+
+def IIC_SSE_INTALU_P_RR : InstrItinClass;
+def IIC_SSE_INTALU_P_RM : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RR : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RM : InstrItinClass;
+
+def IIC_SSE_INTMUL_P_RR : InstrItinClass;
+def IIC_SSE_INTMUL_P_RM : InstrItinClass;
+
+def IIC_SSE_INTSH_P_RR : InstrItinClass;
+def IIC_SSE_INTSH_P_RM : InstrItinClass;
+def IIC_SSE_INTSH_P_RI : InstrItinClass;
+
+def IIC_SSE_INTSHDQ_P_RI : InstrItinClass;
+
+def IIC_SSE_SHUFP : InstrItinClass;
+def IIC_SSE_PSHUF_RI : InstrItinClass;
+def IIC_SSE_PSHUF_MI : InstrItinClass;
+
+def IIC_SSE_UNPCK : InstrItinClass;
+
+def IIC_SSE_MOVMSK : InstrItinClass;
+def IIC_SSE_MASKMOV : InstrItinClass;
+
+def IIC_SSE_PEXTRW : InstrItinClass;
+def IIC_SSE_PINSRW : InstrItinClass;
+
+def IIC_SSE_PABS_RR : InstrItinClass;
+def IIC_SSE_PABS_RM : InstrItinClass;
+
+def IIC_SSE_SQRTPS_RR : InstrItinClass;
+def IIC_SSE_SQRTPS_RM : InstrItinClass;
+def IIC_SSE_SQRTSS_RR : InstrItinClass;
+def IIC_SSE_SQRTSS_RM : InstrItinClass;
+def IIC_SSE_SQRTPD_RR : InstrItinClass;
+def IIC_SSE_SQRTPD_RM : InstrItinClass;
+def IIC_SSE_SQRTSD_RR : InstrItinClass;
+def IIC_SSE_SQRTSD_RM : InstrItinClass;
+
+def IIC_SSE_RSQRTPS_RR : InstrItinClass;
+def IIC_SSE_RSQRTPS_RM : InstrItinClass;
+def IIC_SSE_RSQRTSS_RR : InstrItinClass;
+def IIC_SSE_RSQRTSS_RM : InstrItinClass;
+
+def IIC_SSE_RCPP_RR : InstrItinClass;
+def IIC_SSE_RCPP_RM : InstrItinClass;
+def IIC_SSE_RCPS_RR : InstrItinClass;
+def IIC_SSE_RCPS_RM : InstrItinClass;
+
+def IIC_SSE_MOV_S_RR : InstrItinClass;
+def IIC_SSE_MOV_S_RM : InstrItinClass;
+def IIC_SSE_MOV_S_MR : InstrItinClass;
+
+def IIC_SSE_MOVA_P_RR : InstrItinClass;
+def IIC_SSE_MOVA_P_RM : InstrItinClass;
+def IIC_SSE_MOVA_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVU_P_RR : InstrItinClass;
+def IIC_SSE_MOVU_P_RM : InstrItinClass;
+def IIC_SSE_MOVU_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVDQ : InstrItinClass;
+def IIC_SSE_MOVD_ToGP : InstrItinClass;
+def IIC_SSE_MOVQ_RR : InstrItinClass;
+
+def IIC_SSE_MOV_LH : InstrItinClass;
+
+def IIC_SSE_LDDQU : InstrItinClass;
+
+def IIC_SSE_MOVNT : InstrItinClass;
+
+def IIC_SSE_PHADDSUBD_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBD_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RM : InstrItinClass;
+def IIC_SSE_PSHUFB_RR : InstrItinClass;
+def IIC_SSE_PSHUFB_RM : InstrItinClass;
+def IIC_SSE_PSIGN_RR : InstrItinClass;
+def IIC_SSE_PSIGN_RM : InstrItinClass;
+
+def IIC_SSE_PMADD : InstrItinClass;
+def IIC_SSE_PMULHRSW : InstrItinClass;
+def IIC_SSE_PALIGNRR : InstrItinClass;
+def IIC_SSE_PALIGNRM : InstrItinClass;
+def IIC_SSE_MWAIT : InstrItinClass;
+def IIC_SSE_MONITOR : InstrItinClass;
+def IIC_SSE_MWAITX : InstrItinClass;
+def IIC_SSE_MONITORX : InstrItinClass;
+
+def IIC_SSE_PREFETCH : InstrItinClass;
+def IIC_SSE_PAUSE : InstrItinClass;
+def IIC_SSE_LFENCE : InstrItinClass;
+def IIC_SSE_MFENCE : InstrItinClass;
+def IIC_SSE_SFENCE : InstrItinClass;
+def IIC_SSE_LDMXCSR : InstrItinClass;
+def IIC_SSE_STMXCSR : InstrItinClass;
+
+def IIC_SSE_CVT_PD_RR : InstrItinClass;
+def IIC_SSE_CVT_PD_RM : InstrItinClass;
+def IIC_SSE_CVT_PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PS_RM : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RM : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+
+// MMX
+def IIC_MMX_MOV_MM_RM : InstrItinClass;
+def IIC_MMX_MOV_REG_MM : InstrItinClass;
+def IIC_MMX_MOVQ_RM : InstrItinClass;
+def IIC_MMX_MOVQ_RR : InstrItinClass;
+
+def IIC_MMX_ALU_RM : InstrItinClass;
+def IIC_MMX_ALU_RR : InstrItinClass;
+def IIC_MMX_ALUQ_RM : InstrItinClass;
+def IIC_MMX_ALUQ_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
+def IIC_MMX_PMUL : InstrItinClass;
+def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
+def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
+def IIC_MMX_PSADBW : InstrItinClass;
+def IIC_MMX_SHIFT_RI : InstrItinClass;
+def IIC_MMX_SHIFT_RM : InstrItinClass;
+def IIC_MMX_SHIFT_RR : InstrItinClass;
+def IIC_MMX_UNPCK_H_RM : InstrItinClass;
+def IIC_MMX_UNPCK_H_RR : InstrItinClass;
+def IIC_MMX_UNPCK_L : InstrItinClass;
+def IIC_MMX_PCK_RM : InstrItinClass;
+def IIC_MMX_PCK_RR : InstrItinClass;
+def IIC_MMX_PSHUF : InstrItinClass;
+def IIC_MMX_PEXTR : InstrItinClass;
+def IIC_MMX_PINSRW : InstrItinClass;
+def IIC_MMX_MASKMOV : InstrItinClass;
+
+def IIC_MMX_CVT_PD_RR : InstrItinClass;
+def IIC_MMX_CVT_PD_RM : InstrItinClass;
+def IIC_MMX_CVT_PS_RR : InstrItinClass;
+def IIC_MMX_CVT_PS_RM : InstrItinClass;
+
+def IIC_CMPX_LOCK : InstrItinClass;
+def IIC_CMPX_LOCK_8 : InstrItinClass;
+def IIC_CMPX_LOCK_8B : InstrItinClass;
+def IIC_CMPX_LOCK_16B : InstrItinClass;
+
+def IIC_XADD_LOCK_MEM : InstrItinClass;
+def IIC_XADD_LOCK_MEM8 : InstrItinClass;
+
+def IIC_FILD : InstrItinClass;
+def IIC_FLD : InstrItinClass;
+def IIC_FLD80 : InstrItinClass;
+def IIC_FST : InstrItinClass;
+def IIC_FST80 : InstrItinClass;
+def IIC_FIST : InstrItinClass;
+def IIC_FLDZ : InstrItinClass;
+def IIC_FUCOM : InstrItinClass;
+def IIC_FUCOMI : InstrItinClass;
+def IIC_FCOMI : InstrItinClass;
+def IIC_FNSTSW : InstrItinClass;
+def IIC_FNSTCW : InstrItinClass;
+def IIC_FLDCW : InstrItinClass;
+def IIC_FNINIT : InstrItinClass;
+def IIC_FFREE : InstrItinClass;
+def IIC_FNCLEX : InstrItinClass;
+def IIC_WAIT : InstrItinClass;
+def IIC_FXAM : InstrItinClass;
+def IIC_FNOP : InstrItinClass;
+def IIC_FLDL : InstrItinClass;
+def IIC_F2XM1 : InstrItinClass;
+def IIC_FYL2X : InstrItinClass;
+def IIC_FPTAN : InstrItinClass;
+def IIC_FPATAN : InstrItinClass;
+def IIC_FXTRACT : InstrItinClass;
+def IIC_FPREM1 : InstrItinClass;
+def IIC_FPSTP : InstrItinClass;
+def IIC_FPREM : InstrItinClass;
+def IIC_FYL2XP1 : InstrItinClass;
+def IIC_FSINCOS : InstrItinClass;
+def IIC_FRNDINT : InstrItinClass;
+def IIC_FSCALE : InstrItinClass;
+def IIC_FCOMPP : InstrItinClass;
+def IIC_FXSAVE : InstrItinClass;
+def IIC_FXRSTOR : InstrItinClass;
+
+def IIC_FXCH : InstrItinClass;
+
+// System instructions
+def IIC_CPUID : InstrItinClass;
+def IIC_INT : InstrItinClass;
+def IIC_INT3 : InstrItinClass;
+def IIC_INVD : InstrItinClass;
+def IIC_INVLPG : InstrItinClass;
+def IIC_IRET : InstrItinClass;
+def IIC_HLT : InstrItinClass;
+def IIC_LXS : InstrItinClass;
+def IIC_LTR : InstrItinClass;
+def IIC_RDTSC : InstrItinClass;
+def IIC_RSM : InstrItinClass;
+def IIC_SIDT : InstrItinClass;
+def IIC_SGDT : InstrItinClass;
+def IIC_SLDT : InstrItinClass;
+def IIC_STR : InstrItinClass;
+def IIC_SWAPGS : InstrItinClass;
+def IIC_SYSCALL : InstrItinClass;
+def IIC_SYS_ENTER_EXIT : InstrItinClass;
+def IIC_IN_RR : InstrItinClass;
+def IIC_IN_RI : InstrItinClass;
+def IIC_OUT_RR : InstrItinClass;
+def IIC_OUT_IR : InstrItinClass;
+def IIC_INS : InstrItinClass;
+def IIC_MOV_REG_DR : InstrItinClass;
+def IIC_MOV_DR_REG : InstrItinClass;
+def IIC_MOV_REG_CR : InstrItinClass;
+def IIC_MOV_CR_REG : InstrItinClass;
+def IIC_MOV_REG_SR : InstrItinClass;
+def IIC_MOV_MEM_SR : InstrItinClass;
+def IIC_MOV_SR_REG : InstrItinClass;
+def IIC_MOV_SR_MEM : InstrItinClass;
+def IIC_LAR_RM : InstrItinClass;
+def IIC_LAR_RR : InstrItinClass;
+def IIC_LSL_RM : InstrItinClass;
+def IIC_LSL_RR : InstrItinClass;
+def IIC_LGDT : InstrItinClass;
+def IIC_LIDT : InstrItinClass;
+def IIC_LLDT_REG : InstrItinClass;
+def IIC_LLDT_MEM : InstrItinClass;
+def IIC_PUSH_CS : InstrItinClass;
+def IIC_PUSH_SR : InstrItinClass;
+def IIC_POP_SR : InstrItinClass;
+def IIC_POP_SR_SS : InstrItinClass;
+def IIC_VERR : InstrItinClass;
+def IIC_VERW_REG : InstrItinClass;
+def IIC_VERW_MEM : InstrItinClass;
+def IIC_WRMSR : InstrItinClass;
+def IIC_RDMSR : InstrItinClass;
+def IIC_RDPMC : InstrItinClass;
+def IIC_SMSW : InstrItinClass;
+def IIC_LMSW_REG : InstrItinClass;
+def IIC_LMSW_MEM : InstrItinClass;
+def IIC_ENTER : InstrItinClass;
+def IIC_LEAVE : InstrItinClass;
+def IIC_POP_MEM : InstrItinClass;
+def IIC_POP_REG16 : InstrItinClass;
+def IIC_POP_REG : InstrItinClass;
+def IIC_POP_F : InstrItinClass;
+def IIC_POP_FD : InstrItinClass;
+def IIC_POP_A : InstrItinClass;
+def IIC_PUSH_IMM : InstrItinClass;
+def IIC_PUSH_MEM : InstrItinClass;
+def IIC_PUSH_REG : InstrItinClass;
+def IIC_PUSH_F : InstrItinClass;
+def IIC_PUSH_A : InstrItinClass;
+def IIC_BSWAP : InstrItinClass;
+def IIC_BIT_SCAN_MEM : InstrItinClass;
+def IIC_BIT_SCAN_REG : InstrItinClass;
+def IIC_MOVS : InstrItinClass;
+def IIC_STOS : InstrItinClass;
+def IIC_SCAS : InstrItinClass;
+def IIC_CMPS : InstrItinClass;
+def IIC_MOV : InstrItinClass;
+def IIC_MOV_MEM : InstrItinClass;
+def IIC_AHF : InstrItinClass;
+def IIC_BT_MI : InstrItinClass;
+def IIC_BT_MR : InstrItinClass;
+def IIC_BT_RI : InstrItinClass;
+def IIC_BT_RR : InstrItinClass;
+def IIC_BTX_MI : InstrItinClass;
+def IIC_BTX_MR : InstrItinClass;
+def IIC_BTX_RI : InstrItinClass;
+def IIC_BTX_RR : InstrItinClass;
+def IIC_XCHG_REG : InstrItinClass;
+def IIC_XCHG_MEM : InstrItinClass;
+def IIC_XADD_REG : InstrItinClass;
+def IIC_XADD_MEM : InstrItinClass;
+def IIC_CMPXCHG_MEM : InstrItinClass;
+def IIC_CMPXCHG_REG : InstrItinClass;
+def IIC_CMPXCHG_MEM8 : InstrItinClass;
+def IIC_CMPXCHG_REG8 : InstrItinClass;
+def IIC_CMPXCHG_8B : InstrItinClass;
+def IIC_CMPXCHG_16B : InstrItinClass;
+def IIC_LODS : InstrItinClass;
+def IIC_OUTS : InstrItinClass;
+def IIC_CLC : InstrItinClass;
+def IIC_CLD : InstrItinClass;
+def IIC_CLI : InstrItinClass;
+def IIC_CMC : InstrItinClass;
+def IIC_CLTS : InstrItinClass;
+def IIC_STC : InstrItinClass;
+def IIC_STI : InstrItinClass;
+def IIC_STD : InstrItinClass;
+def IIC_XLAT : InstrItinClass;
+def IIC_AAA : InstrItinClass;
+def IIC_AAD : InstrItinClass;
+def IIC_AAM : InstrItinClass;
+def IIC_AAS : InstrItinClass;
+def IIC_DAA : InstrItinClass;
+def IIC_DAS : InstrItinClass;
+def IIC_BOUND : InstrItinClass;
+def IIC_ARPL_REG : InstrItinClass;
+def IIC_ARPL_MEM : InstrItinClass;
+def IIC_MOVBE : InstrItinClass;
+def IIC_AES : InstrItinClass;
+def IIC_BLEND_MEM : InstrItinClass;
+def IIC_BLEND_NOMEM : InstrItinClass;
+def IIC_CBW : InstrItinClass;
+def IIC_CRC32_REG : InstrItinClass;
+def IIC_CRC32_MEM : InstrItinClass;
+def IIC_SSE_DPPD_RR : InstrItinClass;
+def IIC_SSE_DPPD_RM : InstrItinClass;
+def IIC_SSE_DPPS_RR : InstrItinClass;
+def IIC_SSE_DPPS_RM : InstrItinClass;
+def IIC_MMX_EMMS : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RR : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RM : InstrItinClass;
+def IIC_SSE_INSERTPS_RR : InstrItinClass;
+def IIC_SSE_INSERTPS_RM : InstrItinClass;
+def IIC_SSE_MPSADBW_RR : InstrItinClass;
+def IIC_SSE_MPSADBW_RM : InstrItinClass;
+def IIC_SSE_PMULLD_RR : InstrItinClass;
+def IIC_SSE_PMULLD_RM : InstrItinClass;
+def IIC_SSE_ROUNDPS_REG : InstrItinClass;
+def IIC_SSE_ROUNDPS_MEM : InstrItinClass;
+def IIC_SSE_ROUNDPD_REG : InstrItinClass;
+def IIC_SSE_ROUNDPD_MEM : InstrItinClass;
+def IIC_SSE_POPCNT_RR : InstrItinClass;
+def IIC_SSE_POPCNT_RM : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RR : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RM : InstrItinClass;
+
+def IIC_NOP : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+// IssueWidth is analogous to the number of decode units. Core and its
+// descendants, including Nehalem and SandyBridge, have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are buffered
+// so adjacent micro-ops don't directly compete.
+//
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a reasonably arbitrary
+// number of in-flight instructions.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericX86Model contains no instruction itineraries
+// and disables PostRAScheduler.
+class GenericX86Model : SchedMachineModel {
+ let IssueWidth = 4; // Analogous to the 4 decode units described above.
+ let MicroOpBufferSize = 32; // > 1; a reasonably arbitrary in-flight count.
+ let LoadLatency = 4;
+ let HighLatency = 10; // Optimistic; see the note on isHighLatencyDef above.
+ let PostRAScheduler = 0; // Disabled here; GenericPostRAModel sets it to 1.
+ let CompleteModel = 0; // NOTE(review): presumably marks the model as partial - confirm.
+}
+
+def GenericModel : GenericX86Model;
+
+// Define a model with the PostRAScheduler enabled.
+def GenericPostRAModel : GenericX86Model {
+ let PostRAScheduler = 1;
+}
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
+include "X86ScheduleSLM.td"
+include "X86ScheduleBtVer2.td"
+
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 000000000000..a5b440182aa9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,550 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom
+// in-order (Saltwell-32nm/Bonnell-45nm) processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+// Functional Units
+// Port 0
+def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store
+ // SIMD/FP: SIMD ALU, Shuffle, SIMD/FP multiply, divide
+def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA
+ // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomItineraries : ProcessorItineraries<
+ [ Port0, Port1 ],
+ [], [
+ // P0 only
+ // InstrItinData<class, [InstrStage<N, [P0]>] >,
+ // P0 or P1
+ // InstrItinData<class, [InstrStage<N, [P0, P1]>] >,
+ // P0 and P1
+ // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >,
+ //
+ // Default is 1 cycle, port0 or port1
+ InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >,
+ // mul
+ InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >,
+ // imul by al, ax, eax, rax
+ InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >,
+ // imul reg by reg|mem
+ InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >,
+ // imul reg = reg/mem * imm
+ InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >,
+ // idiv
+ InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >,
+ // div
+ InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >,
+ // neg/not/inc/dec
+ InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
+ // add/sub/and/or/xor/cmp/test
+ InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+ // adc/sbc
+ InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >,
+ // shift/rotate
+ InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
+ // shift double
+ InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >,
+ // cmov
+ InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
+ // set
+ InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
+ // jcc
+ InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
+ // jcxz/jecxz/jrcxz
+ InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >,
+ // jmp rel
+ InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >,
+ // jmp indirect
+ InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ // jmp far
+ InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >,
+ // loop/loope/loopne
+ InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >,
+ // call - all but reg/imm
+ InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >,
+ InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >,
+ //ret
+ InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >,
+ InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+ //sign extension movs
+ InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >,
+ InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >,
+ //zero extension movs
+ InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >,
+ InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >,
+
+ // SSE binary operations
+ // arithmetic fp scalar
+ InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+ // arithmetic fp parallel
+ InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >,
+
+ // bitwise parallel
+ InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >,
+
+ // arithmetic int parallel
+ InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+
+ // multiply int parallel
+ InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >,
+
+ // shift parallel
+ InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
+
+ // conversions
+ // to/from PD ...
+ InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+ // to/from PS except to/from PD and PS2PI
+ InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+ // MMX MOVs
+ InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+ // other MMX
+ InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
+ // conversions
+ // from/to PD
+ InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+ // from/to PI
+ InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>]>,
+
+ InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<2, [Port0, Port1]>] >, // was a duplicate IIC_XADD_LOCK_MEM entry; NOTE(review): confirm MEM8 class is declared in X86Schedule.td
+ InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+ InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
+ InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+ // System instructions
+ InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
+ InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+ InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
+ InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >,
+ InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
+ InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
+ // worst case for mov REG_CRx
+ InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
+ // LAR
+ InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >,
+ // LSL
+ InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
+ // push control register, segment registers
+ InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
+ // pop control register, segment registers
+ InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
+ // VERR, VERW
+ InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
+ // WRMSR, RDMSR
+ InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
+ // SMSW, LMSW
+ InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
+ InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
+ InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
+ ]>;
+
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+ let IssueWidth = 2; // Allows 2 instructions per scheduling group.
+ let MicroOpBufferSize = 0; // In-order execution, always hide latency.
+ let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
+ let HighLatency = 30;// Expected, may be overridden by OperandCycles.
+
+ // On the Atom, the throughput for taken branches is 2 cycles. For small
+ // simple loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+ let PostRAScheduler = 1; // Post-RA scheduling is enabled for Atom.
+ let CompleteModel = 0;
+
+ let Itineraries = AtomItineraries; // Per-class port/cycle table defined above.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
new file mode 100644
index 000000000000..ce1ece34e431
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -0,0 +1,341 @@
+//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AMD btver2 (Jaguar) to support
+// instruction scheduling and other instruction cost heuristics. It is based on
+// the AMD Software Optimization Guide for AMD Family 16h Processors and its
+// Instruction Latency appendix.
+//
+//===----------------------------------------------------------------------===//
+
+// Top-level machine description for AMD btver2 (Jaguar).
+def BtVer2Model : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and btver2 can
+ // decode 2 instructions per cycle.
+ let IssueWidth = 2;
+ let MicroOpBufferSize = 64; // Retire Control Unit
+ let LoadLatency = 5; // FPU load latency (worst case; integer loads take 3 cycles)
+ let HighLatency = 25;
+ let MispredictPenalty = 14; // Minimum branch misdirection penalty
+ let PostRAScheduler = 1;
+
+ // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = BtVer2Model in {
+
+// Jaguar can issue up to 6 micro-ops in one cycle
+def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam)
+def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
+def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
+def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
+def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
+def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
+
+// Any pipe - FIXME we need this until we can discriminate between int/fpu load/store/moves properly
+def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>;
+
+// Integer Pipe Scheduler
+def JALU01 : ProcResGroup<[JALU0, JALU1]> {
+ let BufferSize=20;
+}
+
+// AGU Pipe Scheduler
+def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
+ let BufferSize=12;
+}
+
+// Fpu Pipe Scheduler
+def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
+ let BufferSize=18;
+}
+
+def JDiv : ProcResource<1>; // integer division
+def JMul : ProcResource<1>; // integer multiplication
+def JVALU0 : ProcResource<1>; // vector integer
+def JVALU1 : ProcResource<1>; // vector integer
+def JVIMUL : ProcResource<1>; // vector integer multiplication
+def JSTC : ProcResource<1>; // vector store/convert
+def JFPM : ProcResource<1>; // FP multiplication
+def JFPA : ProcResource<1>; // FP addition
+
+// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+// Emits the two WriteRes records for an integer SchedWrite pair.
+//   SchedRW - the foldable write; SchedRW.Folded is its load-folded form.
+//   ExePort - resource (or resource group) the operation executes on.
+//   Lat     - latency of the register form, in cycles.
+multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
+ let Latency = !add(Lat, 3);
+ }
+}
+
+// Emits the two WriteRes records for an FPU/vector SchedWrite pair. Same
+// shape as JWriteResIntPair, but the folded load adds 5 cycles instead of 3.
+//   SchedRW - the foldable write; SchedRW.Folded is its load-folded form.
+//   ExePort - resource (or resource group) the operation executes on.
+//   Lat     - latency of the register form, in cycles.
+multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
+ let Latency = !add(Lat, 5);
+ }
+}
+
+// A folded store needs a cycle on the SAGU for the store data.
+def : WriteRes<WriteRMW, [JSAGU]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteALU, JALU01, 1>;
+defm : JWriteResIntPair<WriteIMul, JALU1, 3>;
+
+def : WriteRes<WriteIMulH, [JALU1]> {
+ let Latency = 6;
+ let ResourceCycles = [4];
+}
+
+// FIXME 8/16 bit divisions
+def : WriteRes<WriteIDiv, [JALU1, JDiv]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> {
+ let Latency = 41;
+ let ResourceCycles = [1, 1, 25];
+}
+
+// This is for simple LEAs with one or two input operands.
+// FIXME: SAGU 3-operand LEA
+def : WriteRes<WriteLEA, [JALU01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteShift, JALU01, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+// FIXME: Split x86 and SSE load/store/moves
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteStore, [JSAGU]>;
+def : WriteRes<WriteMove, [JAny]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteJump, JALU01, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
+// FIXME: Double precision latencies
+// FIXME: SS vs PS latencies
+// FIXME: ymm latencies
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
+defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
+
+// Square root and division issue on JFPU1 and keep the FP multiplier (JFPM)
+// busy for their whole latency. Only the folded-load (Ld) forms reserve a
+// cycle on the load AGU (JLAGU) and pay the extra 5-cycle FPU load latency,
+// following the same rule as JWriteResFpuPair. The register forms have no
+// memory operand and therefore must not occupy JLAGU.
+def : WriteRes<WriteFSqrt, [JFPU1, JFPM]> {
+ let Latency = 21;
+ let ResourceCycles = [1, 21];
+}
+def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> {
+ let Latency = 26;
+ let ResourceCycles = [1, 1, 21];
+}
+
+def : WriteRes<WriteFDiv, [JFPU1, JFPM]> {
+ let Latency = 19;
+ let ResourceCycles = [1, 19];
+}
+def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> {
+ let Latency = 24;
+ let ResourceCycles = [1, 1, 19];
+}
+
+// FIXME: integer pipes
+defm : JWriteResFpuPair<WriteCvtF2I, JFPU1, 3>; // Float -> Integer.
+defm : JWriteResFpuPair<WriteCvtI2F, JFPU1, 3>; // Integer -> Float.
+defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conversion.
+
+def : WriteRes<WriteFVarBlend, [JFPU01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 2];
+}
+
+// Vector integer operations.
+defm : JWriteResFpuPair<WriteVecALU, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteVecShift, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteVecIMul, JFPU0, 2>;
+defm : JWriteResFpuPair<WriteShuffle, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteBlend, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteVecLogic, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>;
+
+def : WriteRes<WriteVarBlend, [JFPU01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 2];
+}
+
+// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2?
+def : WriteRes<WriteVarVecShift, [JFPU01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteMPSAD, [JFPU0]> {
+ let Latency = 3;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+// FIXME: approximate latencies + pipe dependencies
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WritePCmpIStrM, [JFPU01]> {
+ let Latency = 7;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> {
+ let Latency = 12;
+ let ResourceCycles = [1, 2];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [JFPU01]> {
+ let Latency = 13;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> {
+ let Latency = 18;
+ let ResourceCycles = [1, 5];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> {
+ let Latency = 11;
+ let ResourceCycles = [1, 2];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [JFPU01]> {
+ let Latency = 13;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> {
+ let Latency = 18;
+ let ResourceCycles = [1, 5];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// AES Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 1, 1];
+}
+
+def : WriteRes<WriteAESIMC, [JVIMUL]> {
+ let Latency = 2;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [JVIMUL]> {
+ let Latency = 2;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteCLMul, [JVIMUL]> {
+ let Latency = 2;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+// FIXME: pipe for system/microcode?
+def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; }
+def : WriteRes<WriteFence, [JSAGU]>;
+def : WriteRes<WriteNop, []>;
+} // SchedModel
+
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
new file mode 100644
index 000000000000..f95d4fa04177
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -0,0 +1,233 @@
+//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Intel Silvermont to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+// Top-level machine description for Intel Silvermont (SLM).
+def SLMModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and SLM can decode 2
+ // instructions per cycle.
+ let IssueWidth = 2;
+ let MicroOpBufferSize = 32; // Based on the reorder buffer.
+ let LoadLatency = 3;
+ let MispredictPenalty = 10;
+ let PostRAScheduler = 1;
+
+ // For small loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+
+ // FIXME: SSE4 is unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = SLMModel in {
+
+// Silvermont has 5 reservation stations for micro-ops
+
+def IEC_RSV0 : ProcResource<1>;
+def IEC_RSV1 : ProcResource<1>;
+def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
+def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
+def MEC_RSV : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def IEC_RSV01 : ProcResGroup<[IEC_RSV0, IEC_RSV1]>;
+def FPC_RSV01 : ProcResGroup<[FPC_RSV0, FPC_RSV1]>;
+
+def SMDivider : ProcResource<1>;
+def SMFPMultiplier : ProcResource<1>;
+def SMFPDivider : ProcResource<1>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+// Emits the two WriteRes records for a Silvermont SchedWrite pair.
+//   SchedRW - the foldable write; SchedRW.Folded is its load-folded form.
+//   ExePort - resource (or resource group) the operation executes on.
+//   Lat     - latency of the register form, in cycles.
+multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
+ let Latency = !add(Lat, 3);
+ }
+}
+
+// A folded store needs a cycle on MEC_RSV for the store data, but it does not
+// need an extra port cycle to recompute the address.
+def : WriteRes<WriteRMW, [MEC_RSV]>;
+
+def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteMove, [IEC_RSV01]>;
+def : WriteRes<WriteZero, []>;
+
+defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>;
+defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>;
+defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>;
+defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [IEC_RSV1]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 25];
+}
+
+// Scalar and vector floating point.
+defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
+defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
+defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>;
+
+// This is quite rough, latency depends on precision
+def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> {
+ let Latency = 5;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 1, 2];
+}
+
+def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> {
+ let Latency = 34;
+ let ResourceCycles = [1, 34];
+}
+def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
+ let Latency = 37;
+ let ResourceCycles = [1, 1, 34];
+}
+
+// Vector integer operations.
+defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
+defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>;
+defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>;
+defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>;
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
+ let Latency = 13;
+ let ResourceCycles = [13];
+}
+def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 13;
+ let ResourceCycles = [13, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 17;
+ let ResourceCycles = [17, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 17;
+ let ResourceCycles = [17, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> {
+ let Latency = 21;
+ let ResourceCycles = [21];
+}
+def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 21;
+ let ResourceCycles = [21, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> {
+ let Latency = 8;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 8;
+ let ResourceCycles = [5, 1];
+}
+
+def : WriteRes<WriteAESIMC, [FPC_RSV0]> {
+ let Latency = 8;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 8;
+ let ResourceCycles = [5, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> {
+ let Latency = 8;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 8;
+ let ResourceCycles = [5, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [FPC_RSV0]> {
+ let Latency = 10;
+ let ResourceCycles = [10];
+}
+def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 10;
+ let ResourceCycles = [10, 1];
+}
+
+
+def : WriteRes<WriteSystem, [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteFence, [MEC_RSV]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX is not supported on that architecture, but we should define the basic
+// scheduling resources anyway.
+def : WriteRes<WriteIMulH, [FPC_RSV0]>;
+defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
new file mode 100644
index 000000000000..f031a281e5dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -0,0 +1,283 @@
+//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86SelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-selectiondag-info"
+
+bool X86SelectionDAGInfo::isBaseRegConflictPossible(
+    SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
+  // TRI->hasBasePointer() is not usable until *after* every basic block has
+  // been selected, because legalization may still introduce stack temporaries
+  // with large alignment requirements. So be conservative: whenever the
+  // function has dynamic stack adjustments (hopefully rare), report a conflict
+  // if the base pointer register appears in the clobber set.
+  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  const bool HasDynamicStackAdjust =
+      FrameInfo.hasVarSizedObjects() || FrameInfo.hasOpaqueSPAdjustment();
+  if (!HasDynamicStackAdjust)
+    return false;
+
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
+  const unsigned BasePtr = RegInfo->getBaseRegister();
+
+  // Linear scan; stops at the first clobbered register equal to the base
+  // pointer.
+  bool Conflicts = false;
+  for (auto It = ClobberSet.begin(), End = ClobberSet.end();
+       It != End && !Conflicts; ++It)
+    Conflicts = (BasePtr == *It);
+  return Conflicts;
+}
+
+/// Lower a memset to REP STOS when the size is a compile-time constant within
+/// the subtarget's inline threshold and the destination is DWORD aligned.
+///
+/// Returns the output chain of the emitted sequence. Returns an empty SDValue
+/// to request the target-independent lowering when the destination is in a
+/// segment-relative address space, or when the size/alignment conditions are
+/// not met and no specialized zeroing entry point applies. When such an entry
+/// point exists (Subtarget.getBZeroEntry()) and the stored value is a constant
+/// zero, a call to it is emitted instead.
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+
+#ifndef NDEBUG
+  // The base register must not conflict with the physical registers this
+  // lowering writes. This is only verified in builds with assertions enabled;
+  // there is no bail-out here.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  assert(!isBaseRegConflictPossible(DAG, ClobberSet));
+#endif
+
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If not DWORD aligned or size is more than the threshold, call the library.
+  // The libc version is likely to be faster for these cases. It can use the
+  // address value and run time information about the CPU.
+  if ((Align & 3) != 0 || !ConstantSize ||
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
+    // Check to see if there is a specialized entry-point for memory zeroing.
+    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+    if (const char *bzeroEntry = V &&
+          V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
+      Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+      TargetLowering::ArgListTy Args;
+      TargetLowering::ArgListEntry Entry;
+      Entry.Node = Dst;
+      Entry.Ty = IntPtrTy;
+      Args.push_back(Entry);
+      Entry.Node = Size;
+      Args.push_back(Entry);
+
+      TargetLowering::CallLoweringInfo CLI(DAG);
+      CLI.setDebugLoc(dl).setChain(Chain)
+        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
+        .setDiscardResult();
+
+      std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+      return CallResult.second;
+    }
+
+    // Otherwise have the target-independent code call memset.
+    return SDValue();
+  }
+
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  SDValue InFlag;
+  EVT AVT;
+  SDValue Count;
+  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+  unsigned BytesLeft = 0;
+  if (ValC) {
+    unsigned ValReg;
+    uint64_t Val = ValC->getZExtValue() & 255;
+
+    // If the value is a constant, then we can potentially use larger sets.
+    // Note: the early return above already rejected (Align & 3) != 0, so only
+    // the DWORD/QWORD case is taken here; the other cases are kept for
+    // completeness.
+    switch (Align & 3) {
+    case 2:   // WORD aligned
+      AVT = MVT::i16;
+      ValReg = X86::AX;
+      Val = (Val << 8) | Val;
+      break;
+    case 0:  // DWORD aligned
+      AVT = MVT::i32;
+      ValReg = X86::EAX;
+      Val = (Val << 8)  | Val;
+      Val = (Val << 16) | Val;
+      if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
+        AVT = MVT::i64;
+        ValReg = X86::RAX;
+        Val = (Val << 32) | Val;
+      }
+      break;
+    default:  // Byte aligned
+      AVT = MVT::i8;
+      ValReg = X86::AL;
+      Count = DAG.getIntPtrConstant(SizeVal, dl);
+      break;
+    }
+
+    // For multi-byte store units, store SizeVal / UBytes units with REP STOS
+    // and leave the remainder for the trailing memset below.
+    if (AVT.bitsGT(MVT::i8)) {
+      unsigned UBytes = AVT.getSizeInBits() / 8;
+      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
+      BytesLeft = SizeVal % UBytes;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  } else {
+    // Non-constant fill value: store byte by byte from AL.
+    AVT = MVT::i8;
+    Count  = DAG.getIntPtrConstant(SizeVal, dl);
+    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Element count goes in (R|E)CX and the destination in (R|E)DI; the copies
+  // are glued together so they stay adjacent to the REP STOS node.
+  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
+                           Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
+                           Dst, InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+
+  if (BytesLeft) {
+    // Handle the last 1 - 7 bytes.
+    unsigned Offset = SizeVal - BytesLeft;
+    EVT AddrVT = Dst.getValueType();
+    EVT SizeVT = Size.getValueType();
+
+    Chain = DAG.getMemset(Chain, dl,
+                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+                                      DAG.getConstant(Offset, dl, AddrVT)),
+                          Src,
+                          DAG.getConstant(BytesLeft, dl, SizeVT),
+                          Align, isVolatile, false,
+                          DstPtrInfo.getWithOffset(Offset));
+  }
+
+  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
+  return Chain;
+}
+
+/// Lower a constant-size memcpy to REP MOVS, with any remaining tail bytes
+/// copied by a nested memcpy. Returns a TokenFactor joining the resulting
+/// chains, or an empty SDValue to request the default lowering.
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  // Only copies whose size is a known constant are handled here, preferably
+  // ones within the subtarget-specific inline limit.
+  ConstantSDNode *SizeNode = dyn_cast<ConstantSDNode>(Size);
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  if (!SizeNode)
+    return SDValue();
+  const uint64_t SizeVal = SizeNode->getZExtValue();
+
+  if (!AlwaysInline) {
+    // Too large to be worth expanding inline.
+    if (SizeVal > Subtarget.getMaxInlineSizeThreshold())
+      return SDValue();
+    // Without DWORD alignment the library call is more efficient. When the
+    // call is not permitted (AlwaysInline) we carry on regardless, since this
+    // sequence still beats the long load/store expansion.
+    if ((Align & 3) != 0)
+      return SDValue();
+  }
+
+  // Segment-relative address spaces use the default lowering.
+  const bool DstIsSegment = DstPtrInfo.getAddrSpace() >= 256;
+  if (DstIsSegment || SrcPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // Give up if the base register could collide with a register we write.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
+                                  X86::ECX, X86::ESI, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
+    return SDValue();
+
+  // Choose the widest move unit the alignment permits: byte, word, dword, or
+  // qword (the latter on 64-bit subtargets only).
+  MVT AVT = MVT::i32;
+  if (Align & 1)
+    AVT = MVT::i8;
+  else if (Align & 2)
+    AVT = MVT::i16;
+  else if (!(Align & 4) && Subtarget.is64Bit())
+    AVT = MVT::i64;
+
+  const unsigned UnitBytes = AVT.getSizeInBits() / 8;
+  const unsigned UnitCount = SizeVal / UnitBytes;
+  SDValue Count = DAG.getIntPtrConstant(UnitCount, dl);
+  const unsigned TailBytes = SizeVal % UnitBytes;
+
+  // Load count, destination and source into (R|E)CX, (R|E)DI and (R|E)SI,
+  // gluing each copy to the previous one.
+  const bool Is64 = Subtarget.is64Bit();
+  SDValue Glue;
+  Chain = DAG.getCopyToReg(Chain, dl, Is64 ? X86::RCX : X86::ECX, Count, Glue);
+  Glue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, Is64 ? X86::RDI : X86::EDI, Dst, Glue);
+  Glue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, Is64 ? X86::RSI : X86::ESI, Src, Glue);
+  Glue = Chain.getValue(1);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue RepOps[] = { Chain, DAG.getValueType(AVT), Glue };
+  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, NodeTys, RepOps);
+
+  SmallVector<SDValue, 4> OutChains;
+  OutChains.push_back(RepMovs);
+  if (TailBytes != 0) {
+    // Copy the remaining 1 - 7 bytes, starting right after the REP MOVS part.
+    const unsigned Offset = SizeVal - TailBytes;
+    EVT DstVT = Dst.getValueType();
+    EVT SrcVT = Src.getValueType();
+    EVT SizeVT = Size.getValueType();
+    OutChains.push_back(DAG.getMemcpy(Chain, dl,
+                                      DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+                                                  DAG.getConstant(Offset, dl,
+                                                                  DstVT)),
+                                      DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+                                                  DAG.getConstant(Offset, dl,
+                                                                  SrcVT)),
+                                      DAG.getConstant(TailBytes, dl, SizeVT),
+                                      Align, isVolatile, AlwaysInline, false,
+                                      DstPtrInfo.getWithOffset(Offset),
+                                      SrcPtrInfo.getWithOffset(Offset)));
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
new file mode 100644
index 000000000000..f4a285a5f916
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -0,0 +1,50 @@
+//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+
+class X86TargetLowering;
+class X86TargetMachine;
+class X86Subtarget;
+
+/// X86 implementation of the SelectionDAGTargetInfo hooks for lowering
+/// memset and memcpy.
+class X86SelectionDAGInfo : public SelectionDAGTargetInfo {
+ /// Returns true if it is possible for the base register to conflict with the
+ /// given set of clobbers for a memory intrinsic.
+ bool isBaseRegConflictPossible(SelectionDAG &DAG,
+ ArrayRef<MCPhysReg> ClobberSet) const;
+
+public:
+ explicit X86SelectionDAGInfo() = default;
+
+ /// Emits X86-specific code for a memset. Returns the resulting chain, or an
+ /// empty SDValue when the target-independent lowering should be used.
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
+
+ /// Emits X86-specific code for a memcpy. Returns the resulting chain, or an
+ /// empty SDValue when the target-independent lowering should be used.
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
new file mode 100644
index 000000000000..11115524c810
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -0,0 +1,333 @@
+//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecodeConstantPool.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/IR/Constants.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// Reinterpret the integer vector constant \p C as a sequence of elements that
+/// are each \p MaskEltSizeInBits wide. On success \p RawMask holds one
+/// zero-extended value per element and \p UndefElts has a bit set for every
+/// element whose bits are all undef (such elements get a raw value of 0).
+///
+/// Returns false if \p C is not a vector, if its element type is not an
+/// integer type, or if any element is neither an UndefValue nor a ConstantInt.
+/// The total bit size of \p C must be a multiple of \p MaskEltSizeInBits
+/// (asserted).
+static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
+ SmallBitVector &UndefElts,
+ SmallVectorImpl<uint64_t> &RawMask) {
+ // It is not an error for shuffle masks to not be a vector of
+ // MaskEltSizeInBits because the constant pool uniques constants by their
+ // bit representation.
+ // e.g. the following take up the same space in the constant pool:
+ // i128 -170141183420855150465331762880109871104
+ //
+ // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+ //
+ // <4 x i32> <i32 -2147483648, i32 -2147483648,
+ // i32 -2147483648, i32 -2147483648>
+ Type *CstTy = C->getType();
+ if (!CstTy->isVectorTy())
+ return false;
+
+ Type *CstEltTy = CstTy->getVectorElementType();
+ if (!CstEltTy->isIntegerTy())
+ return false;
+
+ unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+ unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumCstElts = CstTy->getVectorNumElements();
+
+ // Extract all the undef/constant element data and pack into single bitsets.
+ // Both bitsets are as wide as the whole constant; element i occupies the
+ // bits starting at i * CstEltSizeInBits.
+ APInt UndefBits(CstSizeInBits, 0);
+ APInt MaskBits(CstSizeInBits, 0);
+ for (unsigned i = 0; i != NumCstElts; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return false;
+
+ if (isa<UndefValue>(COp)) {
+ // Mark every bit of this source element as undef.
+ APInt EltUndef = APInt::getLowBitsSet(CstSizeInBits, CstEltSizeInBits);
+ UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+ continue;
+ }
+
+ APInt EltBits = cast<ConstantInt>(COp)->getValue();
+ EltBits = EltBits.zextOrTrunc(CstSizeInBits);
+ MaskBits |= EltBits.shl(i * CstEltSizeInBits);
+ }
+
+ // Now extract the undef/constant bit data into the raw shuffle masks.
+ assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
+ "Unaligned shuffle mask size");
+
+ unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
+ UndefElts = SmallBitVector(NumMaskElts, false);
+ RawMask.resize(NumMaskElts, 0);
+
+ // Re-slice the packed bits at the requested mask element width.
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ APInt EltUndef = UndefBits.lshr(i * MaskEltSizeInBits);
+ EltUndef = EltUndef.zextOrTrunc(MaskEltSizeInBits);
+
+ // Only treat the element as UNDEF if all bits are UNDEF, otherwise
+ // treat it as zero.
+ if (EltUndef.isAllOnesValue()) {
+ UndefElts[i] = true;
+ RawMask[i] = 0;
+ continue;
+ }
+
+ APInt EltBits = MaskBits.lshr(i * MaskEltSizeInBits);
+ EltBits = EltBits.zextOrTrunc(MaskEltSizeInBits);
+ RawMask[i] = EltBits.getZExtValue();
+ }
+
+ return true;
+}
+
+/// Decode the PSHUFB control constant \p C, appending one entry per mask byte
+/// to \p ShuffleMask: SM_SentinelUndef for undef bytes, SM_SentinelZero for
+/// bytes with bit 7 set, and otherwise the index of the selected byte. Nothing
+/// is appended when \p C cannot be read as a byte mask.
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned MaskTySize = C->getType()->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+         "Unexpected vector size.");
+
+  // PSHUFB is driven by a vector of control bytes.
+  SmallBitVector UndefBytes;
+  SmallVector<uint64_t, 32> MaskBytes;
+  if (!extractConstantMask(C, 8, UndefBytes, MaskBytes))
+    return;
+
+  const unsigned NumBytes = MaskBytes.size();
+  assert((NumBytes == 16 || NumBytes == 32 || NumBytes == 64) &&
+         "Unexpected number of vector elements.");
+
+  for (unsigned Pos = 0; Pos < NumBytes; ++Pos) {
+    if (UndefBytes[Pos]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    const uint64_t Ctrl = MaskBytes[Pos];
+
+    // A set high bit (bit 7) zeroes the destination byte.
+    if (Ctrl & 0x80) {
+      ShuffleMask.push_back(SM_SentinelZero);
+      continue;
+    }
+
+    // The shuffle never crosses a 16-byte lane: the low 4 bits of the control
+    // byte select a byte within the lane that contains position Pos.
+    const unsigned LaneBase = Pos & ~0xf;
+    ShuffleMask.push_back(static_cast<int>(LaneBase + (Ctrl & 0xf)));
+  }
+}
+
+/// Decode the variable VPERMILP control constant \p C for elements of
+/// \p ElSize bits (32 or 64), appending one entry per element to
+/// \p ShuffleMask. Undef elements become SM_SentinelUndef. Nothing is appended
+/// when \p C cannot be read as a mask of \p ElSize bit elements.
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+                        SmallVectorImpl<int> &ShuffleMask) {
+  unsigned MaskTySize = C->getType()->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+         "Unexpected vector size.");
+  assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
+
+  // The control elements have the same width as the shuffled elements.
+  SmallBitVector UndefSelectors;
+  SmallVector<uint64_t, 8> Selectors;
+  if (!extractConstantMask(C, ElSize, UndefSelectors, Selectors))
+    return;
+
+  const unsigned Count = Selectors.size();
+  const unsigned EltsPerLane = 128 / ElSize;
+  assert((Count == 2 || Count == 4 || Count == 8 || Count == 16) &&
+         "Unexpected number of vector elements.");
+
+  for (unsigned Pos = 0; Pos < Count; ++Pos) {
+    if (UndefSelectors[Pos]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    // First element of the 128-bit lane that holds position Pos.
+    const unsigned LaneBase = Pos & ~(EltsPerLane - 1);
+
+    // 64-bit elements use bit 1 of the selector, 32-bit elements use bits 1:0.
+    const uint64_t Sel = Selectors[Pos];
+    const uint64_t InLane = (ElSize == 64) ? ((Sel >> 1) & 0x1) : (Sel & 0x3);
+
+    ShuffleMask.push_back(static_cast<int>(LaneBase + InLane));
+  }
+}
+
+/// Decode the VPERMIL2PD/VPERMIL2PS selector constant \p C for elements of
+/// \p ElSize bits, appending one entry per element to \p ShuffleMask.
+///
+/// \param C       Constant holding the per-element selectors (128 or 256 bits).
+/// \param M2Z     Two-bit match/zero control; see the table in the body.
+/// \param ElSize  Element width in bits; must be 32 or 64.
+///
+/// Undef selectors yield SM_SentinelUndef and zeroed elements yield
+/// SM_SentinelZero. Indices of NumElts or above refer to the second source.
+/// Nothing is appended when \p C cannot be read as a mask of \p ElSize bit
+/// elements.
+void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+                         SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
+  // Same precondition as DecodeVPERMILPMask: without it any other width would
+  // silently be decoded with the 32-bit selector layout below.
+  assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
+
+  // The shuffle mask requires elements the same size as the target.
+  SmallBitVector UndefElts;
+  SmallVector<uint64_t, 8> RawMask;
+  if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
+    return;
+
+  unsigned NumElts = RawMask.size();
+  unsigned NumEltsPerLane = 128 / ElSize;
+  assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
+         "Unexpected number of vector elements.");
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    // VPERMIL2 Operation.
+    // Bits[3] - Match Bit.
+    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+    uint64_t Selector = RawMask[i];
+    unsigned MatchBit = (Selector >> 3) & 0x1;
+
+    // M2Z[0:1]    MatchBit
+    //   0Xb           X        Source selected by Selector index.
+    //   10b           0        Source selected by Selector index.
+    //   10b           1        Zero.
+    //   11b           0        Zero.
+    //   11b           1        Source selected by Selector index.
+    if ((M2Z & 0x2) != 0u && MatchBit != (M2Z & 0x1)) {
+      ShuffleMask.push_back(SM_SentinelZero);
+      continue;
+    }
+
+    // Start from the first element of the 128-bit lane containing i, then add
+    // the in-lane element chosen by the selector.
+    int Index = i & ~(NumEltsPerLane - 1);
+    if (ElSize == 64)
+      Index += (Selector >> 1) & 0x1;
+    else
+      Index += Selector & 0x3;
+
+    // Selector bit 2 picks the source operand; the second source is numbered
+    // after the NumElts elements of the first.
+    int Src = (Selector >> 2) & 0x1;
+    Index += Src * NumElts;
+    ShuffleMask.push_back(Index);
+  }
+}
+
+/// Decode the 128-bit VPPERM control constant \p C, appending one entry per
+/// control byte to \p ShuffleMask. Only the plain-copy and zero-fill permute
+/// operations can be expressed; on any other operation \p ShuffleMask is
+/// cleared and decoding stops. Nothing is appended when \p C cannot be read as
+/// a byte mask.
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  assert(C->getType()->getPrimitiveSizeInBits() == 128 &&
+         "Unexpected vector size.");
+
+  // VPPERM is driven by a vector of control bytes.
+  SmallBitVector UndefBytes;
+  SmallVector<uint64_t, 32> MaskBytes;
+  if (!extractConstantMask(C, 8, UndefBytes, MaskBytes))
+    return;
+
+  const unsigned NumBytes = MaskBytes.size();
+  assert(NumBytes == 16 && "Unexpected number of vector elements.");
+
+  for (unsigned Pos = 0; Pos < NumBytes; ++Pos) {
+    if (UndefBytes[Pos]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    // Control byte layout:
+    //   Bits[4:0] - Byte Index (0 - 31)
+    //   Bits[7:5] - Permute Operation
+    //
+    // Permute Operation:
+    //   0 - Source byte (no logical operation).
+    //   1 - Invert source byte.
+    //   2 - Bit reverse of source byte.
+    //   3 - Bit reverse of inverted source byte.
+    //   4 - 00h (zero - fill).
+    //   5 - FFh (ones - fill).
+    //   6 - Most significant bit of source byte replicated in all bit
+    //       positions.
+    //   7 - Invert most significant bit of source byte and replicate in all
+    //       bit positions.
+    const uint64_t Ctrl = MaskBytes[Pos];
+    switch ((Ctrl >> 5) & 0x7) {
+    case 0:
+      ShuffleMask.push_back(static_cast<int>(Ctrl & 0x1F));
+      break;
+    case 4:
+      ShuffleMask.push_back(SM_SentinelZero);
+      break;
+    default:
+      // Not representable as a shuffle mask entry: give up entirely.
+      ShuffleMask.clear();
+      return;
+    }
+  }
+}
+
+/// Decode the variable VPERM index constant \p C for elements of \p ElSize
+/// bits, appending one entry per element to \p ShuffleMask. Each index is
+/// reduced to the range [0, NumElts). Undef elements become SM_SentinelUndef.
+/// Nothing is appended when \p C cannot be read as a mask of \p ElSize bit
+/// elements.
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  unsigned MaskTySize = C->getType()->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+         "Unexpected vector size.");
+  assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
+         "Unexpected vector element size.");
+
+  // The index elements have the same width as the shuffled elements.
+  SmallBitVector UndefIndices;
+  SmallVector<uint64_t, 8> Indices;
+  if (!extractConstantMask(C, ElSize, UndefIndices, Indices))
+    return;
+
+  const unsigned Count = Indices.size();
+  // Only the low bits needed to address Count elements are significant.
+  const unsigned IndexMask = Count - 1;
+
+  for (unsigned Pos = 0; Pos < Count; ++Pos) {
+    if (UndefIndices[Pos])
+      ShuffleMask.push_back(SM_SentinelUndef);
+    else
+      ShuffleMask.push_back(static_cast<int>(Indices[Pos] & IndexMask));
+  }
+}
+
+/// Decode the two-source variable permute index constant \p C for elements of
+/// \p ElSize bits, appending one entry per element to \p ShuffleMask. Each
+/// index is reduced to the range [0, 2 * NumElts). Undef elements become
+/// SM_SentinelUndef. Nothing is appended when \p C cannot be read as a mask of
+/// \p ElSize bit elements.
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned MaskTySize = C->getType()->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+         "Unexpected vector size.");
+  assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
+         "Unexpected vector element size.");
+
+  // The index elements have the same width as the shuffled elements.
+  SmallBitVector UndefIndices;
+  SmallVector<uint64_t, 8> Indices;
+  if (!extractConstantMask(C, ElSize, UndefIndices, Indices))
+    return;
+
+  const unsigned Count = Indices.size();
+  // One extra index bit compared to the single-source permute.
+  const unsigned IndexMask = Count * 2 - 1;
+
+  for (unsigned Pos = 0; Pos < Count; ++Pos) {
+    if (UndefIndices[Pos])
+      ShuffleMask.push_back(SM_SentinelUndef);
+    else
+      ShuffleMask.push_back(static_cast<int>(Indices[Pos] & IndexMask));
+  }
+}
+} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
new file mode 100644
index 000000000000..b703cbbd2b29
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -0,0 +1,52 @@
+//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class Constant;
+class MVT;
+
+/// Decode a PSHUFB mask from an IR-level vector constant.
+/// Decoded entries are appended to \p ShuffleMask.
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMILP variable mask from an IR-level vector constant.
+/// \p ElSize is the element width in bits.
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMIL2P variable mask from an IR-level vector constant.
+/// \p MatchImm is the match/zero control and \p ElSize is the element width in
+/// bits.
+void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPPERM variable mask from an IR-level vector constant.
+/// Decoded entries are appended to \p ShuffleMask.
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
+/// \p ElSize is the element width in bits.
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
+/// \p ElSize is the element width in bits.
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask);
+
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 000000000000..727ff70c3ff6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,365 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Subtarget.h"
+#include "X86InstrInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "X86GenSubtargetInfo.inc"
+
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+// Hidden command-line flag "-x86-early-ifcvt"; it is read by
+// X86Subtarget::enableEarlyIfConversion().
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
+ cl::desc("Enable early if-conversion on X86"));
+
+
+/// Classify a blockaddress reference for the current subtarget according to how
+/// we should reference it in a non-pcrel context.
+/// This forwards to classifyLocalReference() with a null GlobalValue.
+unsigned char X86Subtarget::classifyBlockAddressReference() const {
+ return classifyLocalReference(nullptr);
+}
+
+/// Classify a global variable reference for the current subtarget according to
+/// how we should reference it in a non-pcrel context.
+/// Convenience overload that uses the module owning \p GV. \p GV is
+/// dereferenced unconditionally, so it must not be null.
+unsigned char
+X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const {
+ return classifyGlobalReference(GV, *GV->getParent());
+}
+
+/// Classify a reference to a symbol that is known to be local to the DSO and
+/// return the X86II operand flag to use for it. \p GV may be null.
+unsigned char
+X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
+  // No operand flag is needed in any of these cases:
+  //  - 64-bit code can use %rip addressing for anything local;
+  //  - for a position dependent executable the static linker can figure it
+  //    out;
+  //  - the COFF dynamic linker just patches the executable sections.
+  if (is64Bit() || !isPositionIndependent() || isTargetCOFF())
+    return X86II::MO_NO_FLAG;
+
+  // 32-bit PIC on everything except Darwin.
+  if (!isTargetDarwin())
+    return X86II::MO_GOTOFF;
+
+  // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
+  // the section that is being relocated. This means we have to use a load even
+  // for GVs that are known to be local to the dso.
+  const bool NeedsNonLazyLoad =
+      GV && (GV->isDeclarationForLinker() || GV->hasCommonLinkage());
+  return NeedsNonLazyLoad ? X86II::MO_DARWIN_NONLAZY_PIC_BASE
+                          : X86II::MO_PIC_BASE_OFFSET;
+}
+
+/// Classify a reference to \p GV made from module \p M and return the X86II
+/// operand flag to use for it. \p GV may be null.
+unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
+                                                    const Module &M) const {
+  // The large code model never uses stubs, and absolute symbols can be
+  // referenced directly.
+  const bool IsDirect = TM.getCodeModel() == CodeModel::Large ||
+                        (GV && GV->isAbsoluteSymbolRef());
+  if (IsDirect)
+    return X86II::MO_NO_FLAG;
+
+  // Symbols assumed to be DSO-local use the local classification.
+  if (TM.shouldAssumeDSOLocal(M, GV))
+    return classifyLocalReference(GV);
+
+  // Everything below handles symbols that are not assumed to be DSO-local.
+  if (isTargetCOFF())
+    return X86II::MO_DLLIMPORT;
+
+  if (is64Bit())
+    return X86II::MO_GOTPCREL;
+
+  if (!isTargetDarwin())
+    return X86II::MO_GOT;
+
+  // 32-bit Darwin: the non-lazy flag depends on whether we are in PIC mode.
+  return isPositionIndependent() ? X86II::MO_DARWIN_NONLAZY_PIC_BASE
+                                 : X86II::MO_DARWIN_NONLAZY;
+}
+
+/// Classify a function reference using the module that owns \p GV. \p GV is
+/// dereferenced unconditionally, so it must not be null.
+unsigned char
+X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const {
+ return classifyGlobalFunctionReference(GV, *GV->getParent());
+}
+
+/// Classify a reference to the function \p GV made from module \p M and return
+/// the X86II operand flag to use for it. \p GV may be null.
+unsigned char
+X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
+                                              const Module &M) const {
+  if (TM.shouldAssumeDSOLocal(M, GV))
+    return X86II::MO_NO_FLAG;
+
+  assert(!isTargetCOFF());
+
+  if (isTargetELF())
+    return X86II::MO_PLT;
+
+  if (is64Bit()) {
+    // If the function is marked as non-lazy, generate an indirect call which
+    // loads from the GOT directly. This avoids runtime overhead at the cost of
+    // eager binding (and one extra byte of encoding).
+    auto *Fn = dyn_cast_or_null<Function>(GV);
+    const bool IsNonLazy = Fn && Fn->hasFnAttribute(Attribute::NonLazyBind);
+    if (IsNonLazy)
+      return X86II::MO_GOTPCREL;
+  }
+
+  return X86II::MO_NO_FLAG;
+}
+
+/// Returns the name of a function with an interface like the non-standard
+/// bzero function, when such a function exists on the current subtarget and is
+/// considered preferable over memset with zero passed as the second argument.
+/// Returns null otherwise.
+const char *X86Subtarget::getBZeroEntry() const {
+  const Triple &TT = getTargetTriple();
+
+  // Darwin 10 has a __bzero entry point for this purpose.
+  const bool HasBZero = TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6);
+  return HasBZero ? "__bzero" : nullptr;
+}
+
+/// Returns true only for 64-bit targets whose triple is Mac OS X and not older
+/// than version 10.9.
+bool X86Subtarget::hasSinCos() const {
+  const Triple &TT = getTargetTriple();
+  if (!TT.isMacOSX() || TT.isMacOSXVersionLT(10, 9))
+    return false;
+  return is64Bit();
+}
+
+/// Return true if the subtarget allows calls to immediate address.
+bool X86Subtarget::isLegalToCallImmediateAddr() const {
+  // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
+  // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does,
+  // the Win32 part of the following check should be removed.
+  if (!In64BitMode && !isTargetWin32())
+    return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+
+  // Never legal in 64-bit mode or when targeting Win32.
+  return false;
+}
+
+/// Build the effective feature string from \p CPU and \p FS, parse it, and
+/// then derive the settings that depend on the parsed features: unaligned
+/// 16-byte access speed, instruction itineraries, the mode feature bit and the
+/// stack alignment.
+void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+ // An empty CPU name falls back to "generic".
+ std::string CPUName = CPU;
+ if (CPUName.empty())
+ CPUName = "generic";
+
+ // Make sure 64-bit features are available in 64-bit mode. (But make sure
+ // SSE2 can be turned off explicitly.)
+ // The implied features are prepended, so the entries of FS follow them.
+ std::string FullFS = FS;
+ if (In64BitMode) {
+ if (!FullFS.empty())
+ FullFS = "+64bit,+sse2," + FullFS;
+ else
+ FullFS = "+64bit,+sse2";
+ }
+
+ // LAHF/SAHF are always supported in non-64-bit mode.
+ if (!In64BitMode) {
+ if (!FullFS.empty())
+ FullFS = "+sahf," + FullFS;
+ else
+ FullFS = "+sahf";
+ }
+
+
+ // Parse features string and set the CPU.
+ ParseSubtargetFeatures(CPUName, FullFS);
+
+ // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of
+ // 16-bytes and under that are reasonably fast. These features were
+ // introduced with Intel's Nehalem/Silvermont and AMD's Family10h
+ // micro-architectures respectively.
+ if (hasSSE42() || hasSSE4A())
+ IsUAMem16Slow = false;
+
+ InstrItins = getInstrItineraryForCPU(CPUName);
+
+ // It's important to keep the MCSubtargetInfo feature bits in sync with
+ // target data structure which is shared with MC code emitter, etc.
+ if (In64BitMode)
+ ToggleFeature(X86::Mode64Bit);
+ else if (In32BitMode)
+ ToggleFeature(X86::Mode32Bit);
+ else if (In16BitMode)
+ ToggleFeature(X86::Mode16Bit);
+ else
+ llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");
+
+ DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel
+ << ", 64bit " << HasX86_64 << "\n");
+ assert((!In64BitMode || HasX86_64) &&
+ "64-bit code requested on a subtarget that doesn't support it!");
+
+ // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both
+ // 32 and 64 bit) and for all 64-bit targets.
+ // A non-zero StackAlignOverride takes precedence; if neither branch is
+ // taken, stackAlignment keeps the value it had on entry.
+ if (StackAlignOverride)
+ stackAlignment = StackAlignOverride;
+ else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
+ isTargetKFreeBSD() || In64BitMode)
+ stackAlignment = 16;
+
+ assert((!isPMULLDSlow() || hasSSE41()) &&
+ "Feature Slow PMULLD can only be set on a subtarget with SSE4.1");
+}
+
+/// Reset every feature and tuning field of the subtarget to its baseline value
+/// before the feature string is parsed. Every feature flag declared in the
+/// class must be assigned here; a flag that is skipped holds an indeterminate
+/// value unless the parsed feature string happens to set it.
+void X86Subtarget::initializeEnvironment() {
+  X86SSELevel = NoSSE;
+  X863DNowLevel = NoThreeDNow;
+  HasX87 = false;
+  HasCMov = false;
+  HasX86_64 = false;
+  HasPOPCNT = false;
+  HasSSE4A = false;
+  HasAES = false;
+  HasFXSR = false;
+  HasXSAVE = false;
+  HasXSAVEOPT = false;
+  HasXSAVEC = false;
+  HasXSAVES = false;
+  HasPCLMUL = false;
+  HasFMA = false;
+  HasFMA4 = false;
+  HasXOP = false;
+  HasTBM = false;
+  HasMOVBE = false;
+  HasRDRAND = false;
+  HasF16C = false;
+  HasFSGSBase = false;
+  HasLZCNT = false;
+  HasBMI = false;
+  HasBMI2 = false;
+  HasVBMI = false;
+  HasIFMA = false;
+  HasRTM = false;
+  HasHLE = false;
+  HasERI = false;
+  HasCDI = false;
+  HasPFI = false;
+  HasDQI = false;
+  HasBWI = false;
+  HasVLX = false;
+  HasADX = false;
+  HasPKU = false;
+  HasSHA = false;
+  HasPRFCHW = false;
+  HasRDSEED = false;
+  HasLAHFSAHF = false;
+  HasMWAITX = false;
+  HasMPX = false;
+  // The following flags are declared in the class but were previously never
+  // reset here.
+  HasPFPREFETCHWT1 = false;
+  HasInvPCId = false;
+  HasVMFUNC = false;
+  HasSMAP = false;
+  HasSGX = false;
+  HasCLFLUSHOPT = false;
+  HasPCOMMIT = false;
+  HasCLWB = false;
+  IsBTMemSlow = false;
+  IsPMULLDSlow = false;
+  IsSHLDSlow = false;
+  IsUAMem16Slow = false;
+  IsUAMem32Slow = false;
+  HasSSEUnalignedMem = false;
+  HasCmpxchg16b = false;
+  UseLeaForSP = false;
+  HasFastPartialYMMWrite = false;
+  HasFastScalarFSQRT = false;
+  HasFastVectorFSQRT = false;
+  HasFastLZCNT = false;
+  HasSlowDivide32 = false;
+  HasSlowDivide64 = false;
+  PadShortFunctions = false;
+  CallRegIndirect = false;
+  LEAUsesAG = false;
+  SlowLEA = false;
+  SlowIncDec = false;
+  stackAlignment = 4;
+  // FIXME: this is a known good value for Yonah. How about others?
+  MaxInlineSizeThreshold = 128;
+  UseSoftFloat = false;
+}
+
+/// Reset all feature fields to their defaults and then apply \p CPU and \p FS.
+/// Returns *this so the call can be used inside a member initializer.
+X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ initializeEnvironment();
+ initSubtargetFeatures(CPU, FS);
+ return *this;
+}
+
+/// Construct the subtarget for triple \p TT. The 16/32/64-bit mode flags are
+/// derived from the triple's architecture and environment.
+/// initializeSubtargetDependencies() is called from the InstrInfo initializer,
+/// so the feature fields are set before TLInfo and FrameLowering are
+/// constructed.
+X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const X86TargetMachine &TM,
+ unsigned StackAlignOverride)
+ : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
+ PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
+ StackAlignOverride(StackAlignOverride),
+ In64BitMode(TargetTriple.getArch() == Triple::x86_64),
+ In32BitMode(TargetTriple.getArch() == Triple::x86 &&
+ TargetTriple.getEnvironment() != Triple::CODE16),
+ In16BitMode(TargetTriple.getArch() == Triple::x86 &&
+ TargetTriple.getEnvironment() == Triple::CODE16),
+ TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
+ // Determine the PICStyle based on the target selected.
+ // If no branch below applies, PICStyle keeps the PICStyles::None value set
+ // in the initializer list.
+ if (!isPositionIndependent())
+ setPICStyle(PICStyles::None);
+ else if (is64Bit())
+ setPICStyle(PICStyles::RIPRel);
+ else if (isTargetCOFF())
+ setPICStyle(PICStyles::None);
+ else if (isTargetDarwin())
+ setPICStyle(PICStyles::StubPIC);
+ else if (isTargetELF())
+ setPICStyle(PICStyles::GOT);
+}
+
+/// Returns the CallLowering of the installed GISelAccessor. Asserts that an
+/// accessor has been set.
+const CallLowering *X86Subtarget::getCallLowering() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getCallLowering();
+}
+
+/// Returns the InstructionSelector of the installed GISelAccessor. Asserts
+/// that an accessor has been set.
+const InstructionSelector *X86Subtarget::getInstructionSelector() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getInstructionSelector();
+}
+
+/// Returns the LegalizerInfo of the installed GISelAccessor. Asserts that an
+/// accessor has been set.
+const LegalizerInfo *X86Subtarget::getLegalizerInfo() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getLegalizerInfo();
+}
+
+/// Returns the RegisterBankInfo of the installed GISelAccessor. Asserts that
+/// an accessor has been set.
+const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getRegBankInfo();
+}
+
+/// Early if-conversion is enabled only when the subtarget has CMOV and the
+/// -x86-early-ifcvt option is set.
+bool X86Subtarget::enableEarlyIfConversion() const {
+ return hasCMov() && X86EarlyIfConv;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 000000000000..92c16214aa4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,633 @@
+//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
+#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H
+
+#include "X86FrameLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86SelectionDAGInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "X86GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetMachine;
+
+/// The X86 backend supports a number of different styles of PIC.
+///
+namespace PICStyles {
+enum Style {
+ StubPIC, // Used on i386-darwin in pic mode.
+ GOT, // Used on 32 bit elf when in pic mode.
+ RIPRel, // Used on X86-64 when in pic mode.
+ None // Set when not in pic mode.
+};
+}
+
+class X86Subtarget final : public X86GenSubtargetInfo {
+
+protected:
+ enum X86SSEEnum {
+ NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
+ };
+
+ enum X863DNowEnum {
+ NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
+ };
+
+ enum X86ProcFamilyEnum {
+ Others, IntelAtom, IntelSLM
+ };
+
+ /// X86 processor family: Intel Atom, and others
+ X86ProcFamilyEnum X86ProcFamily;
+
+ /// Which PIC style to use
+ PICStyles::Style PICStyle;
+
+ const TargetMachine &TM;
+
+ /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F, or none
+ /// supported.
+ X86SSEEnum X86SSELevel;
+
+ /// MMX, 3DNow, 3DNow Athlon, or none supported.
+ X863DNowEnum X863DNowLevel;
+
+ /// True if the processor supports X87 instructions.
+ bool HasX87;
+
+ /// True if this processor has conditional move instructions
+ /// (generally pentium pro+).
+ bool HasCMov;
+
+ /// True if the processor supports X86-64 instructions.
+ bool HasX86_64;
+
+ /// True if the processor supports POPCNT.
+ bool HasPOPCNT;
+
+ /// True if the processor supports SSE4A instructions.
+ bool HasSSE4A;
+
+ /// Target has AES instructions
+ bool HasAES;
+
+ /// Target has FXSAVE/FXRESTOR instructions
+ bool HasFXSR;
+
+ /// Target has XSAVE instructions
+ bool HasXSAVE;
+ /// Target has XSAVEOPT instructions
+ bool HasXSAVEOPT;
+ /// Target has XSAVEC instructions
+ bool HasXSAVEC;
+ /// Target has XSAVES instructions
+ bool HasXSAVES;
+
+ /// Target has carry-less multiplication
+ bool HasPCLMUL;
+
+ /// Target has 3-operand fused multiply-add
+ bool HasFMA;
+
+ /// Target has 4-operand fused multiply-add
+ bool HasFMA4;
+
+ /// Target has XOP instructions
+ bool HasXOP;
+
+ /// Target has TBM instructions.
+ bool HasTBM;
+
+ /// True if the processor has the MOVBE instruction.
+ bool HasMOVBE;
+
+ /// True if the processor has the RDRAND instruction.
+ bool HasRDRAND;
+
+ /// Processor has 16-bit floating point conversion instructions.
+ bool HasF16C;
+
+ /// Processor has FS/GS base instructions.
+ bool HasFSGSBase;
+
+ /// Processor has LZCNT instruction.
+ bool HasLZCNT;
+
+ /// Processor has BMI1 instructions.
+ bool HasBMI;
+
+ /// Processor has BMI2 instructions.
+ bool HasBMI2;
+
+ /// Processor has VBMI instructions.
+ bool HasVBMI;
+
+ /// Processor has Integer Fused Multiply Add
+ bool HasIFMA;
+
+ /// Processor has RTM instructions.
+ bool HasRTM;
+
+ /// Processor has HLE.
+ bool HasHLE;
+
+ /// Processor has ADX instructions.
+ bool HasADX;
+
+ /// Processor has SHA instructions.
+ bool HasSHA;
+
+ /// Processor has PRFCHW instructions.
+ bool HasPRFCHW;
+
+ /// Processor has RDSEED instructions.
+ bool HasRDSEED;
+
+ /// Processor has LAHF/SAHF instructions.
+ bool HasLAHFSAHF;
+
+ /// Processor has MONITORX/MWAITX instructions.
+ bool HasMWAITX;
+
+ /// Processor has Prefetch with intent to Write instruction
+ bool HasPFPREFETCHWT1;
+
+ /// True if BT (bit test) of memory instructions are slow.
+ bool IsBTMemSlow;
+
+ /// True if SHLD instructions are slow.
+ bool IsSHLDSlow;
+
+ /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
+ /// PMULUDQ.
+ bool IsPMULLDSlow;
+
+ /// True if unaligned memory accesses of 16-bytes are slow.
+ bool IsUAMem16Slow;
+
+ /// True if unaligned memory accesses of 32-bytes are slow.
+ bool IsUAMem32Slow;
+
+ /// True if SSE operations can have unaligned memory operands.
+ /// This may require setting a configuration bit in the processor.
+ bool HasSSEUnalignedMem;
+
+ /// True if this processor has the CMPXCHG16B instruction;
+ /// this is true for most x86-64 chips, but not the first AMD chips.
+ bool HasCmpxchg16b;
+
+ /// True if the LEA instruction should be used for adjusting
+ /// the stack pointer. This is an optimization for Intel Atom processors.
+ bool UseLeaForSP;
+
+ /// True if there is no performance penalty to writing only the lower parts
+ /// of a YMM register without clearing the upper part.
+ bool HasFastPartialYMMWrite;
+
+ /// True if hardware SQRTSS instruction is at least as fast (latency) as
+ /// RSQRTSS followed by a Newton-Raphson iteration.
+ bool HasFastScalarFSQRT;
+
+ /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+ /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+ bool HasFastVectorFSQRT;
+
+ /// True if 8-bit divisions are significantly faster than
+ /// 32-bit divisions and should be used when possible.
+ bool HasSlowDivide32;
+
+ /// True if 16-bit divides are significantly faster than
+ /// 64-bit divisions and should be used when possible.
+ bool HasSlowDivide64;
+
+ /// True if LZCNT instruction is fast.
+ bool HasFastLZCNT;
+
+ /// True if the short functions should be padded to prevent
+ /// a stall when returning too early.
+ bool PadShortFunctions;
+
+ /// True if the Calls with memory reference should be converted
+ /// to a register-based indirect call.
+ bool CallRegIndirect;
+
+ /// True if the LEA instruction inputs have to be ready at address generation
+ /// (AG) time.
+ bool LEAUsesAG;
+
+ /// True if the LEA instruction with certain arguments is slow
+ bool SlowLEA;
+
+ /// True if INC and DEC instructions are slow when writing to flags
+ bool SlowIncDec;
+
+ /// Processor has AVX-512 PreFetch Instructions
+ bool HasPFI;
+
+ /// Processor has AVX-512 Exponential and Reciprocal Instructions
+ bool HasERI;
+
+ /// Processor has AVX-512 Conflict Detection Instructions
+ bool HasCDI;
+
+ /// Processor has AVX-512 Doubleword and Quadword instructions
+ bool HasDQI;
+
+ /// Processor has AVX-512 Byte and Word instructions
+ bool HasBWI;
+
+ /// Processor has AVX-512 Vector Length eXtensions
+ bool HasVLX;
+
+ /// Processor has PKU extensions
+ bool HasPKU;
+
+ /// Processor supports MPX - Memory Protection Extensions
+ bool HasMPX;
+
+ /// Processor supports Invalidate Process-Context Identifier
+ bool HasInvPCId;
+
+ /// Processor has VM Functions
+ bool HasVMFUNC;
+
+ /// Processor has Supervisor Mode Access Protection
+ bool HasSMAP;
+
+ /// Processor has Software Guard Extensions
+ bool HasSGX;
+
+ /// Processor supports Flush Cache Line instruction
+ bool HasCLFLUSHOPT;
+
+ /// Processor has Persistent Commit feature
+ bool HasPCOMMIT;
+
+ /// Processor supports Cache Line Write Back instruction
+ bool HasCLWB;
+
+ /// Use software floating point for code generation.
+ bool UseSoftFloat;
+
+ /// The minimum alignment known to hold of the stack frame on
+ /// entry to the function and which must be maintained by every function.
+ unsigned stackAlignment;
+
+ /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+ ///
+ unsigned MaxInlineSizeThreshold;
+
+ /// What processor and OS we're targeting.
+ Triple TargetTriple;
+
+ /// Instruction itineraries for scheduling
+ InstrItineraryData InstrItins;
+
+ /// Gather the accessor points to GlobalISel-related APIs.
+ /// This is used to avoid ifndefs spreading around while GISel is
+ /// an optional library.
+ std::unique_ptr<GISelAccessor> GISel;
+private:
+
+ /// Override the stack alignment.
+ unsigned StackAlignOverride;
+
+ /// True if compiling for 64-bit, false for 16-bit or 32-bit.
+ bool In64BitMode;
+
+ /// True if compiling for 32-bit, false for 16-bit or 64-bit.
+ bool In32BitMode;
+
+ /// True if compiling for 16-bit, false for 32-bit or 64-bit.
+ bool In16BitMode;
+
+ X86SelectionDAGInfo TSInfo;
+ // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
+ // X86TargetLowering needs.
+ X86InstrInfo InstrInfo;
+ X86TargetLowering TLInfo;
+ X86FrameLowering FrameLowering;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ ///
+ X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const X86TargetMachine &TM, unsigned StackAlignOverride);
+
+ /// This object will take ownership of \p GISelAccessor.
+ /// The address of \p GISel is stored in the owning std::unique_ptr member,
+ /// so the referenced object must have been allocated with `new` and must
+ /// not be owned by anything else.
+ void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); }
+
+ const X86TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const X86FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const X86RegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
+ }
+
+ /// Returns the minimum alignment known to hold of the
+ /// stack frame on entry to the function and which must be maintained by every
+ /// function for this subtarget.
+ unsigned getStackAlignment() const { return stackAlignment; }
+
+ /// Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ /// Methods used by Global ISel
+ const CallLowering *getCallLowering() const override;
+ const InstructionSelector *getInstructionSelector() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
+private:
+ /// Initialize the full set of dependencies so we can use an initializer
+ /// list for X86Subtarget.
+ X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+ void initializeEnvironment();
+ void initSubtargetFeatures(StringRef CPU, StringRef FS);
+public:
+ /// Is this x86_64? (disregarding specific ABI / programming model)
+ bool is64Bit() const {
+ return In64BitMode;
+ }
+
+ bool is32Bit() const {
+ return In32BitMode;
+ }
+
+ bool is16Bit() const {
+ return In16BitMode;
+ }
+
+ /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
+ /// True only in 64-bit mode, and then only for the GNU x32 environment or
+ /// for NaCl.
+ bool isTarget64BitILP32() const {
+   if (!In64BitMode)
+     return false;
+   const bool IsGNUX32 = TargetTriple.getEnvironment() == Triple::GNUX32;
+   return IsGNUX32 || TargetTriple.isOSNaCl();
+ }
+
+ /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
+ /// True in 64-bit mode whenever the target is neither the GNU x32
+ /// environment nor NaCl.
+ bool isTarget64BitLP64() const {
+   if (!In64BitMode)
+     return false;
+   const bool IsILP32Flavour =
+       TargetTriple.getEnvironment() == Triple::GNUX32 ||
+       TargetTriple.isOSNaCl();
+   return !IsILP32Flavour;
+ }
+
+ PICStyles::Style getPICStyle() const { return PICStyle; }
+ void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
+
+ bool hasX87() const { return HasX87; }
+ bool hasCMov() const { return HasCMov; }
+ bool hasSSE1() const { return X86SSELevel >= SSE1; }
+ bool hasSSE2() const { return X86SSELevel >= SSE2; }
+ bool hasSSE3() const { return X86SSELevel >= SSE3; }
+ bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+ bool hasSSE41() const { return X86SSELevel >= SSE41; }
+ bool hasSSE42() const { return X86SSELevel >= SSE42; }
+ bool hasAVX() const { return X86SSELevel >= AVX; }
+ bool hasAVX2() const { return X86SSELevel >= AVX2; }
+ bool hasAVX512() const { return X86SSELevel >= AVX512F; }
+ bool hasFp256() const { return hasAVX(); }
+ bool hasInt256() const { return hasAVX2(); }
+ bool hasSSE4A() const { return HasSSE4A; }
+ bool hasMMX() const { return X863DNowLevel >= MMX; }
+ bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+ bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+ bool hasPOPCNT() const { return HasPOPCNT; }
+ bool hasAES() const { return HasAES; }
+ bool hasFXSR() const { return HasFXSR; }
+ bool hasXSAVE() const { return HasXSAVE; }
+ bool hasXSAVEOPT() const { return HasXSAVEOPT; }
+ bool hasXSAVEC() const { return HasXSAVEC; }
+ bool hasXSAVES() const { return HasXSAVES; }
+ bool hasPCLMUL() const { return HasPCLMUL; }
+ // Prefer FMA4 to FMA - it's better for commutation/memory folding and
+ // has equal or better performance on all supported targets.
+ bool hasFMA() const { return HasFMA && !HasFMA4; }
+ bool hasFMA4() const { return HasFMA4; }
+ bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); }
+ bool hasXOP() const { return HasXOP; }
+ bool hasTBM() const { return HasTBM; }
+ bool hasMOVBE() const { return HasMOVBE; }
+ bool hasRDRAND() const { return HasRDRAND; }
+ bool hasF16C() const { return HasF16C; }
+ bool hasFSGSBase() const { return HasFSGSBase; }
+ bool hasLZCNT() const { return HasLZCNT; }
+ bool hasBMI() const { return HasBMI; }
+ bool hasBMI2() const { return HasBMI2; }
+ bool hasVBMI() const { return HasVBMI; }
+ bool hasIFMA() const { return HasIFMA; }
+ bool hasRTM() const { return HasRTM; }
+ bool hasHLE() const { return HasHLE; }
+ bool hasADX() const { return HasADX; }
+ bool hasSHA() const { return HasSHA; }
+ bool hasPRFCHW() const { return HasPRFCHW; }
+ bool hasRDSEED() const { return HasRDSEED; }
+ bool hasLAHFSAHF() const { return HasLAHFSAHF; }
+ bool hasMWAITX() const { return HasMWAITX; }
+ bool isBTMemSlow() const { return IsBTMemSlow; }
+ bool isSHLDSlow() const { return IsSHLDSlow; }
+ bool isPMULLDSlow() const { return IsPMULLDSlow; }
+ bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
+ bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
+ bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
+ bool hasCmpxchg16b() const { return HasCmpxchg16b; }
+ bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+ bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
+ bool hasFastLZCNT() const { return HasFastLZCNT; }
+ bool hasSlowDivide32() const { return HasSlowDivide32; }
+ bool hasSlowDivide64() const { return HasSlowDivide64; }
+ bool padShortFunctions() const { return PadShortFunctions; }
+ bool callRegIndirect() const { return CallRegIndirect; }
+ bool LEAusesAG() const { return LEAUsesAG; }
+ bool slowLEA() const { return SlowLEA; }
+ bool slowIncDec() const { return SlowIncDec; }
+ bool hasCDI() const { return HasCDI; }
+ bool hasPFI() const { return HasPFI; }
+ bool hasERI() const { return HasERI; }
+ bool hasDQI() const { return HasDQI; }
+ bool hasBWI() const { return HasBWI; }
+ bool hasVLX() const { return HasVLX; }
+ bool hasPKU() const { return HasPKU; }
+ bool hasMPX() const { return HasMPX; }
+
+ /// XRay is reported as supported only when compiling for 64-bit mode.
+ bool isXRaySupported() const override { return is64Bit(); }
+
+ bool isAtom() const { return X86ProcFamily == IntelAtom; }
+ bool isSLM() const { return X86ProcFamily == IntelSLM; }
+ bool useSoftFloat() const { return UseSoftFloat; }
+
+ /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+ /// no-sse2). There isn't any reason to disable it if the target processor
+ /// supports it.
+ bool hasMFence() const { return hasSSE2() || is64Bit(); }
+
+ const Triple &getTargetTriple() const { return TargetTriple; }
+
+ bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+ bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
+ bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
+ bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
+ bool isTargetPS4() const { return TargetTriple.isPS4CPU(); }
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+ bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); }
+ bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); }
+ bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+ bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
+ bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
+ bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
+
+ bool isTargetWindowsMSVC() const {
+ return TargetTriple.isWindowsMSVCEnvironment();
+ }
+
+ bool isTargetKnownWindowsMSVC() const {
+ return TargetTriple.isKnownWindowsMSVCEnvironment();
+ }
+
+ bool isTargetWindowsCoreCLR() const {
+ return TargetTriple.isWindowsCoreCLREnvironment();
+ }
+
+ bool isTargetWindowsCygwin() const {
+ return TargetTriple.isWindowsCygwinEnvironment();
+ }
+
+ bool isTargetWindowsGNU() const {
+ return TargetTriple.isWindowsGNUEnvironment();
+ }
+
+ bool isTargetWindowsItanium() const {
+ return TargetTriple.isWindowsItaniumEnvironment();
+ }
+
+ bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
+
+ bool isOSWindows() const { return TargetTriple.isOSWindows(); }
+
+ bool isTargetWin64() const {
+ return In64BitMode && TargetTriple.isOSWindows();
+ }
+
+ /// True for a non-64-bit target that is either Cygwin/MinGW or a known
+ /// Windows MSVC environment.
+ bool isTargetWin32() const {
+   if (In64BitMode)
+     return false;
+   return isTargetCygMing() || isTargetKnownWindowsMSVC();
+ }
+
+ bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+ bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+
+ bool isPICStyleStubPIC() const {
+ return PICStyle == PICStyles::StubPIC;
+ }
+
+ bool isPositionIndependent() const { return TM.isPositionIndependent(); }
+
+ /// Return true if a call using convention \p CC follows the Win64 ABI on
+ /// this subtarget.
+ bool isCallingConvWin64(CallingConv::ID CC) const {
+   switch (CC) {
+   // This convention requests the Win64 ABI regardless of the target.
+   case CallingConv::X86_64_Win64:
+     return true;
+   // On Win64 these conventions all collapse to the default convention, so
+   // they are Win64 exactly when the target itself is Win64.
+   case CallingConv::C:
+   case CallingConv::Fast:
+   case CallingConv::X86_FastCall:
+   case CallingConv::X86_StdCall:
+   case CallingConv::X86_ThisCall:
+   case CallingConv::X86_VectorCall:
+   case CallingConv::Intel_OCL_BI:
+     return isTargetWin64();
+   // X86_64_SysV selects the SysV convention even on Windows targets, and
+   // any convention not listed here is not treated as Win64.
+   case CallingConv::X86_64_SysV:
+   default:
+     return false;
+   }
+ }
+
+ /// Classify a global variable reference for the current subtarget according
+ /// to how we should reference it in a non-pcrel context.
+ unsigned char classifyLocalReference(const GlobalValue *GV) const;
+
+ unsigned char classifyGlobalReference(const GlobalValue *GV,
+ const Module &M) const;
+ unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+
+ /// Classify a global function reference for the current subtarget.
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
+ const Module &M) const;
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;
+
+ /// Classify a blockaddress reference for the current subtarget according to
+ /// how we should reference it in a non-pcrel context.
+ unsigned char classifyBlockAddressReference() const;
+
+ /// Return true if the subtarget allows calls to immediate address.
+ bool isLegalToCallImmediateAddr() const;
+
+ /// This function returns the name of a function which has an interface
+ /// like the non-standard bzero function, if such a function exists on
+ /// the current subtarget and it is considered preferable over
+ /// memset with zero passed as the second argument. Otherwise it
+ /// returns null.
+ const char *getBZeroEntry() const;
+
+ /// This function returns true if the target has sincos() routine in its
+ /// compiler runtime or math libraries.
+ bool hasSinCos() const;
+
+ /// Enable the MachineScheduler pass for all X86 subtargets.
+ bool enableMachineScheduler() const override { return true; }
+
+ bool enableEarlyIfConversion() const override;
+
+ /// Return the instruction itineraries based on the subtarget selection.
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ AntiDepBreakMode getAntiDepBreakMode() const override {
+ return TargetSubtargetInfo::ANTIDEP_CRITICAL;
+ }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 000000000000..aa5cfc64e9eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,405 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "X86CallLowering.h"
+#include "X86TargetObjectFile.h"
+#include "X86TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
+
+namespace llvm {
+void initializeWinEHStatePassPass(PassRegistry &);
+}
+
+extern "C" void LLVMInitializeX86Target() {
+  // Register X86TargetMachine for both the 32-bit and the 64-bit target.
+  RegisterTargetMachine<X86TargetMachine> RegisterX86_32(getTheX86_32Target());
+  RegisterTargetMachine<X86TargetMachine> RegisterX86_64(getTheX86_64Target());
+
+  // Run the initializers against the global pass registry.
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+  initializeGlobalISel(Registry);
+  initializeWinEHStatePassPass(Registry);
+  initializeFixupBWInstPassPass(Registry);
+  initializeEvexToVexInstPassPass(Registry);
+}
+
+/// Create the TargetLoweringObjectFile implementation matching triple \p TT.
+/// The checks are ordered from most to least specific: the OS-specific
+/// tests run before the generic ELF test, and the Windows MSVC/CoreCLR test
+/// runs before the generic COFF test. A triple matching none of them is
+/// treated as unreachable.
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO()) {
+ // Only x86-64 Mach-O gets the X86-specific lowering; other Mach-O targets
+ // use the generic implementation.
+ if (TT.getArch() == Triple::x86_64)
+ return make_unique<X86_64MachoTargetObjectFile>();
+ return make_unique<TargetLoweringObjectFileMachO>();
+ }
+
+ if (TT.isOSFreeBSD())
+ return make_unique<X86FreeBSDTargetObjectFile>();
+ if (TT.isOSLinux() || TT.isOSNaCl())
+ return make_unique<X86LinuxNaClTargetObjectFile>();
+ if (TT.isOSFuchsia())
+ return make_unique<X86FuchsiaTargetObjectFile>();
+ // Any remaining ELF target.
+ if (TT.isOSBinFormatELF())
+ return make_unique<X86ELFTargetObjectFile>();
+ if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
+ return make_unique<X86WindowsTargetObjectFile>();
+ // Any remaining COFF target.
+ if (TT.isOSBinFormatCOFF())
+ return make_unique<TargetLoweringObjectFileCOFF>();
+ llvm_unreachable("unknown subtarget type");
+}
+
+/// Build the data layout string for triple \p TT by appending one component
+/// per ABI property (endianness, mangling, pointer size, type alignments,
+/// native integer widths, stack alignment).
+static std::string computeDataLayout(const Triple &TT) {
+  const bool Is64Bit = TT.isArch64Bit();
+  const bool IsNaCl = TT.isOSNaCl();
+  const bool IsIAMCU = TT.isOSIAMCU();
+
+  // X86 is little endian.
+  std::string Layout = "e";
+  Layout += DataLayout::getManglingComponent(TT);
+
+  // 32-bit x86 has 32 bit pointers, and so do the x32 and NaCl flavours of
+  // x86-64.
+  if (!Is64Bit || TT.getEnvironment() == Triple::GNUX32 || IsNaCl)
+    Layout += "-p:32:32";
+
+  // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+  if (Is64Bit || TT.isOSWindows() || IsNaCl)
+    Layout += "-i64:64";
+  else
+    Layout += IsIAMCU ? "-i64:32-f64:32" : "-f64:32:64";
+
+  // Some ABIs align long double to 128 bits, others to 32. NaCl and IAMCU
+  // get no f80 component at all.
+  if (!IsNaCl && !IsIAMCU)
+    Layout += (Is64Bit || TT.isOSDarwin()) ? "-f80:128" : "-f80:32";
+
+  if (IsIAMCU)
+    Layout += "-f128:32";
+
+  // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+  Layout += Is64Bit ? "-n8:16:32:64" : "-n8:16:32";
+
+  // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+  const bool Has32BitStack = (!Is64Bit && TT.isOSWindows()) || IsIAMCU;
+  Layout += Has32BitStack ? "-a:0:32-S32" : "-S128";
+
+  return Layout;
+}
+
+/// Resolve the relocation model to use for triple \p TT: choose a default
+/// when \p RM is unset, otherwise adjust requests the target cannot honour.
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  const bool Is64Bit = TT.getArch() == Triple::x86_64;
+  const bool IsDarwin = TT.isOSDarwin();
+
+  if (!RM.hasValue()) {
+    // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit
+    // mode.
+    if (IsDarwin)
+      return Is64Bit ? Reloc::PIC_ : Reloc::DynamicNoPIC;
+    // Win64 requires rip-rel addressing, thus we force it to PIC. Everything
+    // else uses the static relocation model by default.
+    return (Is64Bit && TT.isOSWindows()) ? Reloc::PIC_ : Reloc::Static;
+  }
+
+  const Reloc::Model Requested = *RM;
+  switch (Requested) {
+  case Reloc::DynamicNoPIC:
+    // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
+    // is defined as a model for code which may be used in static or dynamic
+    // executables but not necessarily a shared library. On X86-32 we just
+    // compile in -static mode, in x86-64 we use PIC. Only 32-bit Darwin
+    // keeps the request as is.
+    if (Is64Bit)
+      return Reloc::PIC_;
+    if (!IsDarwin)
+      return Reloc::Static;
+    break;
+  case Reloc::Static:
+    // Disallow the static relocation model on Darwin in X86-64 mode, since
+    // the Mach-O file format doesn't support it.
+    if (IsDarwin && Is64Bit)
+      return Reloc::PIC_;
+    break;
+  default:
+    break;
+  }
+
+  return Requested;
+}
+
+/// Create an X86 target machine for triple \p TT. The data layout and the
+/// effective relocation model are derived from the triple before being
+/// handed to the LLVMTargetMachine base.
+X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Optional<Reloc::Model> RM,
+                                   CodeModel::Model CM, CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(TT, RM), CM, OL),
+      TLOF(createTLOF(getTargetTriple())) {
+  // Windows stack unwinder gets confused when execution flow "falls through"
+  // after a call to 'noreturn' function. To prevent that, we emit a trap for
+  // 'unreachable' IR instructions (which on X86 happens to be the 'ud2'
+  // instruction).
+  // The check for 64-bit windows is a bit icky, but as we're unlikely to
+  // ever want to mix 32 and 64-bit windows code in a single module this
+  // should be fine.
+  const bool IsWin64 = TT.isOSWindows() && TT.getArch() == Triple::x86_64;
+
+  // On PS4, the "return address" of a 'noreturn' call must still be within
+  // the calling function, and TrapUnreachable is an easy way to get that.
+  if (IsWin64 || TT.isPS4())
+    this->Options.TrapUnreachable = true;
+
+  initAsmInfo();
+}
+
+// Defined out of line; the members clean up after themselves.
+X86TargetMachine::~X86TargetMachine() = default;
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+/// GISelAccessor used when GlobalISel is built in. It owns a CallLowering
+/// object and returns it from getCallLowering(); the other three accessors
+/// are not implemented yet and return null.
+struct X86GISelActualAccessor : public GISelAccessor {
+ // Owned call lowering implementation.
+ std::unique_ptr<CallLowering> CL;
+ // Takes ownership of \p CL by storing it in the unique_ptr member.
+ X86GISelActualAccessor(CallLowering* CL): CL(CL) {}
+ const CallLowering *getCallLowering() const override {
+ return CL.get();
+ }
+ const InstructionSelector *getInstructionSelector() const override {
+ //TODO: Implement
+ return nullptr;
+ }
+ const LegalizerInfo *getLegalizerInfo() const override {
+ //TODO: Implement
+ return nullptr;
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ //TODO: Implement
+ return nullptr;
+ }
+};
+} // End anonymous namespace.
+#endif
+/// Return the subtarget to use for function \p F. Subtargets are created on
+/// first use and cached in SubtargetMap under a key made of the CPU name
+/// followed by the feature string (with a soft-float feature appended when
+/// the function requests it).
+const X86Subtarget *
+X86TargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ // A function-level attribute takes precedence over the target machine's
+ // own CPU and feature strings.
+ StringRef CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString()
+ : (StringRef)TargetCPU;
+ StringRef FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString()
+ : (StringRef)TargetFS;
+
+ // The key is the plain concatenation of CPU and FS, with no separator.
+ SmallString<512> Key;
+ Key.reserve(CPU.size() + FS.size());
+ Key += CPU;
+ Key += FS;
+
+ // FIXME: This is related to the code below to reset the target options,
+ // we need to know whether or not the soft float flag is set on the
+ // function before we can generate a subtarget. We also need to use
+ // it as a key for the subtarget since that can be the only difference
+ // between two functions.
+ bool SoftFloat =
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ // If the soft float attribute is set on the function turn on the soft float
+ // subtarget feature.
+ if (SoftFloat)
+ Key += FS.empty() ? "+soft-float" : ",+soft-float";
+
+ // Re-point FS at everything in Key after the CPU name, so it also covers
+ // the soft-float feature appended above.
+ FS = Key.substr(CPU.size());
+
+ // operator[] creates an empty slot on a miss; it is filled in below.
+ auto &I = SubtargetMap[Key];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
+ Options.StackAlignmentOverride);
+ // Without GlobalISel support the subtarget gets a plain GISelAccessor;
+ // with it, an accessor carrying the X86 call lowering.
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ X86GISelActualAccessor *GISel = new X86GISelActualAccessor(
+ new X86CallLowering(*I->getTargetLowering()));
+#endif
+ // Hands the accessor allocated above to the subtarget.
+ I->setGISelAccessor(*GISel);
+ }
+ return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
+ cl::desc("Minimize AVX to SSE transition penalty"),
+ cl::init(true));
+
+//===----------------------------------------------------------------------===//
+// X86 TTI query.
+//===----------------------------------------------------------------------===//
+
+/// Return a TargetIRAnalysis whose callback builds a TargetTransformInfo
+/// from an X86TTIImpl for the function being queried. The callback captures
+/// `this`.
+TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(X86TTIImpl(this, F));
+ });
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Pass pipeline configuration for the X86 code generator.
+class X86PassConfig : public TargetPassConfig {
+public:
+  X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  /// The target machine, viewed as its concrete X86 type.
+  X86TargetMachine &getX86TargetMachine() const {
+    return getTM<X86TargetMachine>();
+  }
+
+  /// Create the generic live scheduler and attach the macro-fusion DAG
+  /// mutation to it.
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
+    ScheduleDAGMILive *Scheduler = createGenericSchedLive(C);
+    Scheduler->addMutation(createMacroFusionDAGMutation(Scheduler->TII));
+    return Scheduler;
+  }
+
+  // Pipeline hooks overridden for X86; all are defined out of line.
+  void addIRPasses() override;
+  bool addPreISel() override;
+  bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+  bool addIRTranslator() override;
+  bool addLegalizeMachineIR() override;
+  bool addRegBankSelect() override;
+  bool addGlobalInstructionSelect() override;
+#endif
+  bool addILPOpts() override;
+  void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new X86PassConfig(this, PM);
+}
+
+/// Add the IR-level passes: atomic expansion first, then the common IR
+/// passes, then (only when optimizing) the interleaved access pass.
+void X86PassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getX86TargetMachine()));
+
+  TargetPassConfig::addIRPasses();
+
+  if (TM->getOptLevel() == CodeGenOpt::None)
+    return;
+
+  addPass(createInterleavedAccessPass(TM));
+}
+
+/// Install the X86 DAG instruction selector and the passes that directly
+/// follow it. Always returns false.
+bool X86PassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+  // For ELF, cleanup any local-dynamic TLS accesses (only when optimizing).
+  const bool IsELF = TM->getTargetTriple().isOSBinFormatELF();
+  if (IsELF && getOptLevel() != CodeGenOpt::None)
+    addPass(createCleanupLocalDynamicTLSPass());
+
+  addPass(createX86GlobalBaseRegPass());
+  return false;
+}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool X86PassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+
+bool X86PassConfig::addLegalizeMachineIR() {
+ //TODO: Implement
+ return false;
+}
+
+bool X86PassConfig::addRegBankSelect() {
+ //TODO: Implement
+ return false;
+}
+
+bool X86PassConfig::addGlobalInstructionSelect() {
+ //TODO: Implement
+ return false;
+}
+#endif
+
+bool X86PassConfig::addILPOpts() {
+ addPass(&EarlyIfConverterID);
+ if (EnableMachineCombinerPass)
+ addPass(&MachineCombinerID);
+ return true;
+}
+
+/// Add the pre-instruction-selection passes. Always returns true.
+bool X86PassConfig::addPreISel() {
+  const Triple &TT = TM->getTargetTriple();
+
+  // The WinEH state pass is only added for 32-bit x86 Windows.
+  const bool IsWin32 = TT.isOSWindows() && TT.getArch() == Triple::x86;
+  if (IsWin32)
+    addPass(createX86WinEHStatePass());
+
+  return true;
+}
+
+/// Add the passes that run before register allocation. The first three are
+/// skipped at CodeGenOpt::None; the WinAlloca expander is always added.
+void X86PassConfig::addPreRegAlloc() {
+  const bool Optimizing = getOptLevel() != CodeGenOpt::None;
+  if (Optimizing) {
+    addPass(createX86FixupSetCC());
+    addPass(createX86OptimizeLEAs());
+    addPass(createX86CallFrameOptimization());
+  }
+
+  addPass(createX86WinAllocaExpander());
+}
+
+void X86PassConfig::addPostRegAlloc() {
+ addPass(createX86FloatingPointStackifierPass());
+}
+
+void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
+
+/// Add the passes that run just before code emission. The vzeroupper pass
+/// is controlled by the x86-use-vzeroupper option; the others are only
+/// added when optimizing.
+void X86PassConfig::addPreEmitPass() {
+  const bool Optimizing = getOptLevel() != CodeGenOpt::None;
+
+  if (Optimizing)
+    addPass(createExecutionDependencyFixPass(&X86::VR128XRegClass));
+
+  if (UseVZeroUpper)
+    addPass(createX86IssueVZeroUpperPass());
+
+  if (!Optimizing)
+    return;
+
+  addPass(createX86FixupBWInsts());
+  addPass(createX86PadShortFunctions());
+  addPass(createX86FixupLEAs());
+  addPass(createX86EvexToVexInsts());
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 000000000000..d756d07926dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,48 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class StringRef;
+
+/// The X86 specific subclass of LLVMTargetMachine. It owns the object file
+/// lowering for its triple and a map of subtargets.
+class X86TargetMachine final : public LLVMTargetMachine {
+ // Object file lowering owned by this target machine.
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ // Owned subtargets, looked up by string key. Mutable so that the const
+ // getSubtargetImpl() can add entries.
+ mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
+
+public:
+ X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+ ~X86TargetMachine() override;
+ // Returns the subtarget to use for function \p F.
+ const X86Subtarget *getSubtargetImpl(const Function &F) const override;
+
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+ // Set up the pass pipeline.
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ // Non-owning pointer to the object file lowering held in TLOF.
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 000000000000..7f70829cb6c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,184 @@
+//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+using namespace dwarf;
+
+/// Build the expression used to reference \p GV. Indirect pc-relative
+/// encodings become `sym@GOTPCREL+4`; every other encoding is delegated to
+/// the generic Mach-O implementation.
+const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  const bool IsIndirectPCRel =
+      (Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel);
+  if (!IsIndirectPCRel)
+    return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+        GV, Encoding, TM, MMI, Streamer);
+
+  // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4,
+  // which is an indirect pc-relative reference.
+  const MCExpr *GOTRef = MCSymbolRefExpr::create(
+      TM.getSymbol(GV), MCSymbolRefExpr::VK_GOTPCREL, getContext());
+  return MCBinaryExpr::createAdd(
+      GOTRef, MCConstantExpr::create(4, getContext()), getContext());
+}
+
+MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
+ const GlobalValue *GV, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ return TM.getSymbol(GV);
+}
+
+/// Build the expression `Sym@GOTPCREL + <offset>` used to reach the GOT
+/// entry of \p Sym from a data section. The offset is \p Offset plus the
+/// constant part of \p MV plus 4.
+const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
+    const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry
+  // from a data section. In case there's an additional offset, then use
+  // foo@GOTPCREL+4+<offset>.
+  //
+  // Keep the sum in 64 bits: the operands are 64-bit and the result is
+  // passed on as a 64-bit constant, so an `unsigned` temporary would
+  // truncate it and turn a negative total into a large positive one.
+  const int64_t FinalOff = Offset + MV.getConstant() + 4;
+  const MCExpr *Res =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+  const MCExpr *Off = MCConstantExpr::create(FinalOff, getContext());
+  return MCBinaryExpr::createAdd(Res, Off, getContext());
+}
+
+const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
+ const MCSymbol *Sym) const {
+ return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+}
+
+void
+X86FreeBSDTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+void
+X86FuchsiaTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+void
+X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+/// Lower the difference `LHS - RHS` to an image-relative reference to
+/// \p LHS when \p RHS is the special __ImageBase symbol. Returns null when
+/// the pattern does not match.
+const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
+    const GlobalValue *LHS, const GlobalValue *RHS,
+    const TargetMachine &TM) const {
+  // Our symbols should exist in address space zero, cowardly no-op if
+  // otherwise.
+  if (LHS->getType()->getPointerAddressSpace() != 0)
+    return nullptr;
+  if (RHS->getType()->getPointerAddressSpace() != 0)
+    return nullptr;
+
+  // Both operands must wrap global objects:
+  // - Only global objects are eligible for image relative relocations.
+  // - The subtrahend refers to the special symbol __ImageBase, a
+  //   GlobalVariable.
+  const auto *ImageBase = dyn_cast<GlobalVariable>(RHS);
+  if (!isa<GlobalObject>(LHS) || !ImageBase)
+    return nullptr;
+
+  if (LHS->isThreadLocal() || RHS->isThreadLocal())
+    return nullptr;
+
+  // We expect __ImageBase to be a global variable without a section,
+  // externally defined. It should look something like this:
+  //   @__ImageBase = external constant i8
+  if (RHS->getName() != "__ImageBase" || !RHS->hasExternalLinkage())
+    return nullptr;
+  if (ImageBase->hasInitializer() || RHS->hasSection())
+    return nullptr;
+
+  const MCSymbol *LHSSym = TM.getSymbol(LHS);
+  return MCSymbolRefExpr::create(LHSSym, MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                 getContext());
+}
+
+/// Render \p AI as a lower-case hexadecimal string, zero-padded on the left
+/// to two digits per byte of the value's bit width.
+static std::string APIntToHexString(const APInt &AI) {
+  unsigned Width = (AI.getBitWidth() / 8) * 2;
+  // Format the whole value. Going through AI.getLimitedValue() would
+  // saturate anything wider than 64 bits, so distinct wide constants would
+  // all produce the same string.
+  std::string HexString = AI.toString(16, /*Signed=*/false);
+  // APInt prints upper-case digits; this helper returns lower case.
+  for (char &Digit : HexString)
+    if (Digit >= 'A' && Digit <= 'F')
+      Digit += 'a' - 'A';
+  unsigned Size = HexString.size();
+  assert(Width >= Size && "hex string is too large!");
+  HexString.insert(HexString.begin(), Width - Size, '0');
+
+  return HexString;
+}
+
+/// Produce the hexadecimal spelling of constant \p C. Undef is printed as
+/// zero, scalars are printed directly, and vector/array constants are
+/// printed element by element from the last element to the first.
+static std::string scalarConstantToHexString(const Constant *C) {
+  Type *Ty = C->getType();
+
+  if (isa<UndefValue>(C))
+    return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits()));
+
+  if (const auto *FPConst = dyn_cast<ConstantFP>(C))
+    return APIntToHexString(FPConst->getValueAPF().bitcastToAPInt());
+
+  if (const auto *IntConst = dyn_cast<ConstantInt>(C))
+    return APIntToHexString(IntConst->getValue());
+
+  // Anything else is handled as a vector or an array.
+  const unsigned NumElements = isa<VectorType>(Ty)
+                                   ? Ty->getVectorNumElements()
+                                   : Ty->getArrayNumElements();
+  std::string Result;
+  int Idx = NumElements;
+  while (Idx-- > 0)
+    Result += scalarConstantToHexString(C->getAggregateElement(Idx));
+  return Result;
+}
+
+/// Choose the section for constant \p C. Mergeable constants of 4, 8, 16 or
+/// 32 bytes whose requested alignment does not exceed their size go into a
+/// COMDAT .rdata section named after the constant's value, and \p Align is
+/// raised to that size. Everything else is delegated to the generic
+/// implementation.
+MCSection *X86WindowsTargetObjectFile::getSectionForConstant(
+    const DataLayout &DL, SectionKind Kind, const Constant *C,
+    unsigned &Align) const {
+  if (!Kind.isMergeableConst() || !C)
+    return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C, Align);
+
+  // Map the mergeable kind to its COMDAT symbol prefix and constant size.
+  const char *Prefix = nullptr;
+  unsigned ConstSize = 0;
+  if (Kind.isMergeableConst4()) {
+    Prefix = "__real@";
+    ConstSize = 4;
+  } else if (Kind.isMergeableConst8()) {
+    Prefix = "__real@";
+    ConstSize = 8;
+  } else if (Kind.isMergeableConst16()) {
+    Prefix = "__xmm@";
+    ConstSize = 16;
+  } else if (Kind.isMergeableConst32()) {
+    Prefix = "__ymm@";
+    ConstSize = 32;
+  }
+
+  // Unrecognised kinds and over-aligned constants take the generic path.
+  if (!Prefix || Align > ConstSize)
+    return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C, Align);
+
+  const std::string COMDATSymName = Prefix + scalarConstantToHexString(C);
+  Align = ConstSize;
+
+  const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                   COFF::IMAGE_SCN_MEM_READ |
+                                   COFF::IMAGE_SCN_LNK_COMDAT;
+  return getContext().getCOFFSection(".rdata", Characteristics, Kind,
+                                     COMDATSymName,
+                                     COFF::IMAGE_COMDAT_SELECT_ANY);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 000000000000..39d2e84e5ed7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,84 @@
+//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+ /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
+ /// x86-64.
+ class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+ public:
+ const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+ unsigned Encoding,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+
+ // getCFIPersonalitySymbol - The symbol that gets passed to
+ // .cfi_personality.
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI) const override;
+
+ const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+ const MCValue &MV, int64_t Offset,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+ };
+
+ /// \brief This implementation is used for X86 ELF targets that don't
+ /// have a further specialization.
+ class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ public:
+ X86ELFTargetObjectFile() {
+ PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT; // Not declared in this class; presumably an inherited member - TODO confirm.
+ }
+
+ /// \brief Describe a TLS variable address within debug info.
+ const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+ };
+
+ /// X86FreeBSDTargetObjectFile - This implementation is used for FreeBSD
+ /// on x86 and x86-64.
+ class X86FreeBSDTargetObjectFile : public X86ELFTargetObjectFile {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override; // No access specifier, so private in this class.
+ };
+
+ /// \brief This implementation is used for Fuchsia on x86-64.
+ class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override; // No access specifier, so private in this class.
+ };
+
+ /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and
+ /// Native Client on x86 and x86-64.
+ class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override; // No access specifier, so private in this class.
+ };
+
+ /// \brief This implementation is used for Windows targets on x86 and x86-64.
+ class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
+ const MCExpr * // Both overrides below have no access specifier, so they are private in this class.
+ lowerRelativeReference(const GlobalValue *LHS, const GlobalValue *RHS,
+ const TargetMachine &TM) const override;
+
+ /// \brief Given a mergeable constant with the specified size and relocation
+ /// information, return a section that it should be placed in.
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const override;
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
new file mode 100644
index 000000000000..2b0e672d56f2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -0,0 +1,2250 @@
+//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+/// A note about the cost model numbers used below: they correspond to a
+/// "generic" X86 CPU rather than to any concrete CPU model. Usually the
+/// numbers correspond to the CPU on which the feature first
+/// appeared. For example, if we do Subtarget.hasSSE42() in
+/// the lookups below the cost is based on Nehalem as that was the first CPU
+/// to support that feature level and thus has most likely the worst case cost.
+/// Some examples of other technologies/CPUs:
+/// SSE 3 - Pentium4 / Athlon64
+/// SSE 4.1 - Penryn
+/// SSE 4.2 - Nehalem
+/// AVX - Sandy Bridge
+/// AVX2 - Haswell
+/// AVX-512 - Xeon Phi / Skylake
+/// And some examples of instruction target dependent costs (latency)
+/// divss sqrtss rsqrtss
+/// AMD K7 11-16 19 3
+/// Piledriver 9-24 13-15 5
+/// Jaguar 14 16 2
+/// Pentium II,III 18 30 2
+/// Nehalem 7-14 7-18 3
+/// Haswell 10-13 11 5
+/// TODO: Develop and implement the target dependent cost model and
+/// specialize cost numbers for different Cost Model Targets such as throughput,
+/// code size, latency and uop count.
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86tti"
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+TargetTransformInfo::PopcntSupportKind
+X86TTIImpl::getPopcntSupport(unsigned TyWidth) { // Fast hardware popcount is reported only when the subtarget has POPCNT.
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); // TyWidth is only validated; it does not change the answer.
+ // TODO: Currently the __builtin_popcount() implementation using SSE3
+ // instructions is inefficient. Once the problem is fixed, we should
+ // call ST->hasSSE3() instead of ST->hasPOPCNT().
+ return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
+}
+
+unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { // Vector selects the vector-register answer.
+ if (Vector && !ST->hasSSE1()) // Without SSE1 no vector registers are reported.
+ return 0;
+ // 64-bit mode: 32 for vectors with AVX-512, otherwise 16. 32-bit mode: 8.
+ if (ST->is64Bit()) {
+ if (Vector && ST->hasAVX512())
+ return 32;
+ return 16;
+ }
+ return 8;
+}
+
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { // Register width in bits for the vector or the scalar case.
+ if (Vector) { // Checked from the widest feature down; 0 means no SSE1.
+ if (ST->hasAVX512()) return 512;
+ if (ST->hasAVX()) return 256;
+ if (ST->hasSSE1()) return 128;
+ return 0;
+ }
+ // Scalar case: the width follows the 64-bit/32-bit mode of the subtarget.
+ if (ST->is64Bit())
+ return 64;
+
+ return 32;
+}
+
+unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ // If the loop will not be vectorized (VF == 1), don't interleave it.
+ // Let the regular unroller handle the loop, which saves the overflow
+ // check and memory check cost.
+ if (VF == 1)
+ return 1;
+
+ if (ST->isAtom()) // Atom subtargets are likewise limited to a factor of 1.
+ return 1;
+
+ // Sandybridge and Haswell have multiple execution ports and pipelined
+ // vector units.
+ if (ST->hasAVX()) // Keyed on the AVX feature bit rather than on a CPU model.
+ return 4;
+
+ return 2;
+}
+
+int X86TTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+ TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) { // Consults the cost tables below in the order written; the first match wins, else BaseT is used.
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ if (ISD == ISD::SDIV &&
+ Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+ // On X86, vector signed divisions by a power-of-two constant are
+ // normally expanded to the sequence SRA + SRL + ADD + SRA.
+ // The OperandValue properties may not be the same as those of the previous
+ // operation; conservatively assume OP_None.
+ int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
+ Op2Info, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ return Cost;
+ }
+
+ static const CostTblEntry AVX512BWUniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
+ { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasBWI()) {
+ if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512UniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+ { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX512()) {
+ if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX2UniformConstCostTable[] = {
+ { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
+
+ { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX2()) {
+ if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2UniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasSSE2()) {
+ // pmuldq sequence.
+ if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
+ return LT.first * 30;
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 15;
+
+ if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512DQCostTable[] = {
+ { ISD::MUL, MVT::v2i64, 1 },
+ { ISD::MUL, MVT::v4i64, 1 },
+ { ISD::MUL, MVT::v8i64, 1 }
+ };
+
+ // Look for AVX512DQ lowering tricks for custom cases.
+ if (ST->hasDQI()) {
+ if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512BWCostTable[] = {
+ { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v64i8, 64*20 },
+ { ISD::SDIV, MVT::v32i16, 32*20 },
+ { ISD::SDIV, MVT::v16i32, 16*20 },
+ { ISD::SDIV, MVT::v8i64, 8*20 },
+ { ISD::UDIV, MVT::v64i8, 64*20 },
+ { ISD::UDIV, MVT::v32i16, 32*20 },
+ { ISD::UDIV, MVT::v16i32, 16*20 },
+ { ISD::UDIV, MVT::v8i64, 8*20 },
+ };
+
+ // Look for AVX512BW lowering tricks for custom cases.
+ if (ST->hasBWI()) {
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512CostTable[] = {
+ { ISD::SHL, MVT::v16i32, 1 },
+ { ISD::SRL, MVT::v16i32, 1 },
+ { ISD::SRA, MVT::v16i32, 1 },
+ { ISD::SHL, MVT::v8i64, 1 },
+ { ISD::SRL, MVT::v8i64, 1 },
+ { ISD::SRA, MVT::v8i64, 1 },
+
+ { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
+ };
+
+ if (ST->hasAVX512()) {
+ if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX2CostTable[] = {
+ // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare to
+ // customize them to detect the cases where shift amount is a scalar one.
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 1 },
+ { ISD::SRA, MVT::v4i32, 1 },
+ { ISD::SHL, MVT::v8i32, 1 },
+ { ISD::SRL, MVT::v8i32, 1 },
+ { ISD::SRA, MVT::v8i32, 1 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 1 },
+ { ISD::SHL, MVT::v4i64, 1 },
+ { ISD::SRL, MVT::v4i64, 1 },
+ };
+
+ // Look for AVX2 lowering tricks.
+ if (ST->hasAVX2()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX2, a packed v16i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return LT.first;
+
+ if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry XOPCostTable[] = {
+ // 128bit shifts take 1cy, but right shifts require negation beforehand.
+ { ISD::SHL, MVT::v16i8, 1 },
+ { ISD::SRL, MVT::v16i8, 2 },
+ { ISD::SRA, MVT::v16i8, 2 },
+ { ISD::SHL, MVT::v8i16, 1 },
+ { ISD::SRL, MVT::v8i16, 2 },
+ { ISD::SRA, MVT::v8i16, 2 },
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 2 },
+ { ISD::SRA, MVT::v4i32, 2 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 2 },
+ { ISD::SRA, MVT::v2i64, 2 },
+ // 256bit shifts require splitting if AVX2 didn't catch them above.
+ { ISD::SHL, MVT::v32i8, 2 },
+ { ISD::SRL, MVT::v32i8, 4 },
+ { ISD::SRA, MVT::v32i8, 4 },
+ { ISD::SHL, MVT::v16i16, 2 },
+ { ISD::SRL, MVT::v16i16, 4 },
+ { ISD::SRA, MVT::v16i16, 4 },
+ { ISD::SHL, MVT::v8i32, 2 },
+ { ISD::SRL, MVT::v8i32, 4 },
+ { ISD::SRA, MVT::v8i32, 4 },
+ { ISD::SHL, MVT::v4i64, 2 },
+ { ISD::SRL, MVT::v4i64, 4 },
+ { ISD::SRA, MVT::v4i64, 4 },
+ };
+
+ // Look for XOP lowering tricks.
+ if (ST->hasXOP()) {
+ if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX2CustomCostTable[] = {
+ { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+
+ { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
+
+ { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
+ };
+
+ // Look for AVX2 lowering tricks for custom cases.
+ if (ST->hasAVX2()) {
+ if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVXCustomCostTable[] = {
+ { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
+
+ { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v32i8, 32*20 },
+ { ISD::SDIV, MVT::v16i16, 16*20 },
+ { ISD::SDIV, MVT::v8i32, 8*20 },
+ { ISD::SDIV, MVT::v4i64, 4*20 },
+ { ISD::UDIV, MVT::v32i8, 32*20 },
+ { ISD::UDIV, MVT::v16i16, 16*20 },
+ { ISD::UDIV, MVT::v8i32, 8*20 },
+ { ISD::UDIV, MVT::v4i64, 4*20 },
+ };
+
+ // Look for AVX lowering tricks for custom cases.
+ if (ST->hasAVX()) {
+ if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE42FloatCostTable[] = {
+ { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
+ };
+
+ if (ST->hasSSE42()) {
+ if (const auto *Entry = CostTableLookup(SSE42FloatCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry
+ SSE2UniformCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v16i8, 1 }, // psllw.
+ { ISD::SHL, MVT::v32i8, 2 }, // psllw.
+ { ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ { ISD::SHL, MVT::v16i16, 2 }, // psllw.
+ { ISD::SHL, MVT::v4i32, 1 }, // pslld
+ { ISD::SHL, MVT::v8i32, 2 }, // pslld
+ { ISD::SHL, MVT::v2i64, 1 }, // psllq.
+ { ISD::SHL, MVT::v4i64, 2 }, // psllq.
+
+ { ISD::SRL, MVT::v16i8, 1 }, // psrlw.
+ { ISD::SRL, MVT::v32i8, 2 }, // psrlw.
+ { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
+ { ISD::SRL, MVT::v16i16, 2 }, // psrlw.
+ { ISD::SRL, MVT::v4i32, 1 }, // psrld.
+ { ISD::SRL, MVT::v8i32, 2 }, // psrld.
+ { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
+ { ISD::SRL, MVT::v4i64, 2 }, // psrlq.
+
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+ { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb.
+ { ISD::SRA, MVT::v8i16, 1 }, // psraw.
+ { ISD::SRA, MVT::v16i16, 2 }, // psraw.
+ { ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ { ISD::SRA, MVT::v8i32, 2 }, // psrad.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
+ { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
+ };
+
+ if (ST->hasSSE2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ if (ISD == ISD::SHL &&
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+ MVT VT = LT.second;
+ // Vector shift left by non uniform constant can be lowered
+ // into vector multiply (pmullw/pmulld).
+ if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
+ (VT == MVT::v4i32 && ST->hasSSE41()))
+ return LT.first;
+
+ // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+ // sequence of extract + two vector multiply + insert.
+ if ((VT == MVT::v8i32 || VT == MVT::v16i16) &&
+ (ST->hasAVX() && !ST->hasAVX2()))
+ ISD = ISD::MUL;
+
+ // A vector shift left by non uniform constant is converted
+ // into a vector multiply; the new multiply is eventually
+ // lowered into a sequence of shuffles and 2 x pmuludq.
+ if (VT == MVT::v4i32 && ST->hasSSE2())
+ ISD = ISD::MUL;
+ }
+
+ static const CostTblEntry SSE41CostTable[] = {
+ { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
+
+ { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend.
+
+ { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend.
+ };
+
+ if (ST->hasSSE41()) {
+ if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2CostTable[] = {
+ // We don't correctly identify costs of casts because they are marked as
+ // custom.
+ { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
+
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
+
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
+
+ { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+
+ { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+
+ // It is not a good idea to vectorize division. We have to scalarize it and
+ // in the process we will often end up having to spill regular
+ // registers. The overhead of division is going to dominate most kernels
+ // anyways so try hard to prevent vectorization of division - it is
+ // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+ // to hide "20 cycles" for each lane.
+ { ISD::SDIV, MVT::v16i8, 16*20 },
+ { ISD::SDIV, MVT::v8i16, 8*20 },
+ { ISD::SDIV, MVT::v4i32, 4*20 },
+ { ISD::SDIV, MVT::v2i64, 2*20 },
+ { ISD::UDIV, MVT::v16i8, 16*20 },
+ { ISD::UDIV, MVT::v8i16, 8*20 },
+ { ISD::UDIV, MVT::v4i32, 4*20 },
+ { ISD::UDIV, MVT::v2i64, 2*20 },
+ };
+
+ if (ST->hasSSE2()) {
+ if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX1CostTable[] = {
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v16i16, 4 },
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v32i8, 4 },
+ { ISD::ADD, MVT::v32i8, 4 },
+ { ISD::SUB, MVT::v16i16, 4 },
+ { ISD::ADD, MVT::v16i16, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+ // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+ // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
+ // Because we believe v4i64 to be a legal type, we must also include the
+ // split factor of two in the cost table. Therefore, the cost here is 16
+ // instead of 8.
+ { ISD::MUL, MVT::v4i64, 16 },
+ };
+
+ // Look for AVX1 lowering tricks.
+ if (ST->hasAVX() && !ST->hasAVX2()) {
+ MVT VT = LT.second;
+
+ if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT))
+ return LT.first * Entry->Cost;
+ }
+
+ // Custom lowering of vectors.
+ static const CostTblEntry CustomLowered[] = {
+ // A v2i64/v4i64/v8i64 multiply is custom lowered as a series of long
+ // multiplies(3), shifts(3) and adds(2).
+ { ISD::MUL, MVT::v2i64, 8 },
+ { ISD::MUL, MVT::v4i64, 8 },
+ { ISD::MUL, MVT::v8i64, 8 }
+ };
+ if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+ // 2x pmuludq, 2x shuffle.
+ if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+ !ST->hasSSE41())
+ return LT.first * 6;
+
+ static const CostTblEntry SSE1FloatCostTable[] = {
+ { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
+ };
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1FloatCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ // Fall back to the default implementation. NOTE(review): Opd1PropInfo and Opd2PropInfo are not forwarded here - confirm this is intended.
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+}
+
+int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+
+ if (Kind == TTI::SK_Reverse) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+ { ISD::VECTOR_SHUFFLE, MVT::v64i8, 1 }, // vpermb
+ { ISD::VECTOR_SHUFFLE, MVT::v32i8, 1 } // vpermb
+ };
+
+ if (ST->hasVBMI())
+ if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX512BWShuffleTbl[] = {
+ { ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpermw
+ { ISD::VECTOR_SHUFFLE, MVT::v16i16, 1 }, // vpermw
+ { ISD::VECTOR_SHUFFLE, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
+ // + 2*pshufb + vinserti64x4
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX512ShuffleTbl[] = {
+ { ISD::VECTOR_SHUFFLE, MVT::v8f64, 1 }, // vpermpd
+ { ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vpermps
+ { ISD::VECTOR_SHUFFLE, MVT::v8i64, 1 }, // vpermq
+ { ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }, // vpermd
+ };
+
+ if (ST->hasAVX512())
+ if (const auto *Entry =
+ CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX2ShuffleTbl[] = {
+ { ISD::VECTOR_SHUFFLE, MVT::v4f64, 1 }, // vpermpd
+ { ISD::VECTOR_SHUFFLE, MVT::v8f32, 1 }, // vpermps
+ { ISD::VECTOR_SHUFFLE, MVT::v4i64, 1 }, // vpermq
+ { ISD::VECTOR_SHUFFLE, MVT::v8i32, 1 }, // vpermd
+ { ISD::VECTOR_SHUFFLE, MVT::v16i16, 2 }, // vperm2i128 + pshufb
+ { ISD::VECTOR_SHUFFLE, MVT::v32i8, 2 } // vperm2i128 + pshufb
+ };
+
+ if (ST->hasAVX2())
+ if (const auto *Entry =
+ CostTableLookup(AVX2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX1ShuffleTbl[] = {
+ { ISD::VECTOR_SHUFFLE, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
+ { ISD::VECTOR_SHUFFLE, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
+ { ISD::VECTOR_SHUFFLE, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
+ { ISD::VECTOR_SHUFFLE, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
+ { ISD::VECTOR_SHUFFLE, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
+ // + vinsertf128
+ { ISD::VECTOR_SHUFFLE, MVT::v32i8, 4 } // vextractf128 + 2*pshufb
+ // + vinsertf128
+ };
+
+ if (ST->hasAVX())
+ if (const auto *Entry =
+ CostTableLookup(AVX1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSSE3ShuffleTbl[] = {
+ { ISD::VECTOR_SHUFFLE, MVT::v8i16, 1 }, // pshufb
+ { ISD::VECTOR_SHUFFLE, MVT::v16i8, 1 } // pshufb
+ };
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry =
+ CostTableLookup(SSSE3ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE2ShuffleTbl[] = {
+ { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, // shufpd
+ { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, // pshufd
+ { ISD::VECTOR_SHUFFLE, MVT::v4i32, 1 }, // pshufd
+ { ISD::VECTOR_SHUFFLE, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
+ { ISD::VECTOR_SHUFFLE, MVT::v16i8, 9 } // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + packus
+ };
+
+ if (ST->hasSSE2())
+ if (const auto *Entry =
+ CostTableLookup(SSE2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE1ShuffleTbl[] = {
+ { ISD::VECTOR_SHUFFLE, MVT::v4f32, 1 }, // shufps
+ };
+
+ if (ST->hasSSE1())
+ if (const auto *Entry =
+ CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ } else if (Kind == TTI::SK_Alternate) {
+ // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+ // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ // The backend knows how to generate a single VEX.256 version of
+ // instruction VPBLENDW if the target supports AVX2.
+ if (ST->hasAVX2() && LT.second == MVT::v16i16)
+ return LT.first;
+
+ static const CostTblEntry AVXAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps
+ {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps
+
+ // This shuffle is custom lowered into a sequence of:
+ // 2x vextractf128 , 2x vpblendw , 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+
+ // This shuffle is custom lowered into a long sequence of:
+ // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
+ };
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE41AltShuffleTbl[] = {
+ // These are lowered into movsd.
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+ // packed float vectors with four elements are lowered into BLENDI dag
+ // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+
+ // This shuffle generates a single pshufw.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+
+ // There is no instruction that matches a v16i8 alternate shuffle.
+ // The backend will expand it into the sequence 'pshufb + pshufb + or'.
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
+ };
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE,
+ LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSSE3AltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+
+ // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
+ // the sequence 'shufps + pshufd'
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or
+ };
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSEAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
+
+ // This is expanded into a long sequence of four extract + four insert.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
+
+ // 8 x (pinsrw + pextrw + and + movb + movzb + or)
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
+ };
+
+ // Fall-back (SSE3 and SSE2).
+ if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ } else if (Kind == TTI::SK_PermuteTwoSrc) {
+ // We assume that source and destination have the same vector type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ int NumOfDests = LT.first;
+ int NumOfShufflesPerDest = LT.first * 2 - 1;
+ int NumOfShuffles = NumOfDests * NumOfShufflesPerDest;
+
+ static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // vpermt2b
+ };
+
+ if (ST->hasVBMI())
+ if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return NumOfShuffles * Entry->Cost;
+
+ static const CostTblEntry AVX512BWShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w
+ {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermt2w
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}, // zext + vpermt2w + trunc
+ {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // zext + vpermt2w + trunc
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return NumOfShuffles * Entry->Cost;
+
+ static const CostTblEntry AVX512ShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermt2pd
+ {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps
+ {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermt2q
+ {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d
+ {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermt2pd
+ {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermt2ps
+ {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermt2q
+ {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermt2d
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermt2pd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermt2ps
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermt2q
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1} // vpermt2d
+ };
+
+ if (ST->hasAVX512())
+ if (const auto *Entry =
+ CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return NumOfShuffles * Entry->Cost;
+
+ } else if (Kind == TTI::SK_PermuteSingleSrc) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ if (LT.first == 1) {
+
+ static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1} // vpermb
+ };
+
+ if (ST->hasVBMI())
+ if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return Entry->Cost;
+
+ static const CostTblEntry AVX512BWShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw
+ {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermw
+ {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8}, // extend to v32i16
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3} // vpermw + zext/trunc
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return Entry->Cost;
+
+ static const CostTblEntry AVX512ShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermpd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermpd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermpd
+ {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps
+ {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermps
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermps
+ {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermq
+ {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermq
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermq
+ {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd
+ {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermd
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, // vpermd
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // pshufb
+ };
+
+ if (ST->hasAVX512())
+ if (const auto *Entry =
+ CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return Entry->Cost;
+
+ } else {
+ // We are going to permute multiple sources and the result will be in
+ // multiple destinations. Providing an accurate cost only for splits where
+ // the element type remains the same.
+
+ MVT LegalVT = LT.second;
+ if (LegalVT.getVectorElementType().getSizeInBits() ==
+ Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+
+ unsigned VecTySize = DL.getTypeStoreSize(Tp);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ // Number of source vectors after legalization:
+ unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+ // Number of destination vectors after legalization:
+ unsigned NumOfDests = LT.first;
+
+ Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+
+ unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+ return NumOfShuffles *
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+ }
+ }
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+ /// Estimate the cost of the cast instruction \p Opcode converting \p Src to
+ /// \p Dst.
+ ///
+ /// On SSE2 targets without AVX the SSE2 table is first consulted with the
+ /// *legalized* source/destination types, and a hit is scaled by the number of
+ /// legalized source parts. Otherwise, provided both types are simple value
+ /// types, the per-feature tables are consulted with the unlegalized types in
+ /// the order AVX512DQ, AVX512F, AVX2, AVX, SSE4.1, SSE2. When nothing matches
+ /// the query is forwarded to the base implementation.
+ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+   int ISD = TLI->InstructionOpcodeToISD(Opcode);
+   assert(ISD && "Invalid opcode");
+
+   // FIXME: Need a better design of the cost table to handle non-simple types of
+   // potential massive combinations (elem_num x src_type x dst_type).
+
+   static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
+     { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
+
+     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
+     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
+
+     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
+     { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
+     { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
+     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+     { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
+     { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
+
+     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+     { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
+     { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
+     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+     { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
+     { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
+   };
+
+   // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
+   // 256-bit wide vectors.
+
+   static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
+     { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
+     { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
+     { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
+
+     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 },
+     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 },
+     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 },
+     { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
+
+     // v16i1 -> v16i32 - load + broadcast
+     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
+     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
+     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
+     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
+     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+
+     { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
+     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+     { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+     { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+     { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
+
+     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
+     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
+     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 },
+     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
+
+     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
+     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+     { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
+     { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
+   };
+
+   static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
+     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
+     { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
+     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
+     { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
+
+     { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
+     { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
+
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
+   };
+
+   static const TypeConversionCostTblEntry AVXConversionTbl[] = {
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+
+     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
+     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
+     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
+     { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+     { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
+
+     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
+     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
+     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
+     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
+     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
+     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
+     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
+     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
+     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
+     // The generic code to compute the scalar overhead is currently broken.
+     // Work around this limitation by estimating the scalarization overhead
+     // here. We have roughly 10 instructions per scalar element.
+     // Multiply that by the vector width.
+     // FIXME: remove that when PR19268 is fixed.
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 },
+     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 },
+     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
+
+     { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+     { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
+     // This node is expanded into scalarized operations but BasicTTI is overly
+     // optimistic estimating its cost. It computes 3 per element (one
+     // vector-extract, one scalar conversion and one vector-insert). The
+     // problem is that the inserts form a read-modify-write chain so latency
+     // should be factored in too. Inflating the cost per element by 1.
+     { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
+     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
+
+     { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
+     { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
+   };
+
+   static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+
+     { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+     { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
+     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+
+     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
+     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
+     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
+     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
+     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
+     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
+     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+
+   };
+
+   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
+     // These are somewhat magic numbers justified by looking at the output of
+     // Intel's IACA, running some kernels and making sure when we take
+     // legalization into account the throughput will be overestimated.
+     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
+     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+
+     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
+
+     { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+     { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
+     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
+     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
+     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
+     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
+     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
+     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
+     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
+     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+
+     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
+     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
+     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
+     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
+     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
+     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
+   };
+
+   std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
+   std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
+
+   // SSE2-only targets: match on the legalized types and charge the table cost
+   // once per legalized source part.
+   if (ST->hasSSE2() && !ST->hasAVX()) {
+     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+                                                    LTDest.second, LTSrc.second))
+       return LTSrc.first * Entry->Cost;
+   }
+
+   EVT SrcTy = TLI->getValueType(DL, Src);
+   EVT DstTy = TLI->getValueType(DL, Dst);
+
+   // The function getSimpleVT only handles simple value types.
+   if (!SrcTy.isSimple() || !DstTy.isSimple())
+     return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+   // Both types are simple from here on; compute the MVTs once instead of at
+   // every table lookup.
+   const MVT SimpleSrcTy = SrcTy.getSimpleVT();
+   const MVT SimpleDstTy = DstTy.getSimpleVT();
+
+   if (ST->hasDQI())
+     if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
+                                                    SimpleDstTy, SimpleSrcTy))
+       return Entry->Cost;
+
+   if (ST->hasAVX512())
+     if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+                                                    SimpleDstTy, SimpleSrcTy))
+       return Entry->Cost;
+
+   if (ST->hasAVX2())
+     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+                                                    SimpleDstTy, SimpleSrcTy))
+       return Entry->Cost;
+
+   if (ST->hasAVX())
+     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
+                                                    SimpleDstTy, SimpleSrcTy))
+       return Entry->Cost;
+
+   if (ST->hasSSE41())
+     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
+                                                    SimpleDstTy, SimpleSrcTy))
+       return Entry->Cost;
+
+   if (ST->hasSSE2())
+     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+                                                    SimpleDstTy, SimpleSrcTy))
+       return Entry->Cost;
+
+   return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ }
+
+ /// Estimate the cost of a compare/select instruction \p Opcode operating on
+ /// values of type \p ValTy.
+ ///
+ /// \p ValTy is legalized first; the tables below (which only carry SETCC
+ /// entries) are then probed from the newest enabled feature level down to
+ /// SSE2. A hit is multiplied by the number of legalized parts. Anything not
+ /// found is delegated to the base implementation.
+ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+   // Tables are declared in the same order in which they are consulted.
+   static const CostTblEntry AVX512CostTbl[] = {
+     { ISD::SETCC, MVT::v8i64, 1 },
+     { ISD::SETCC, MVT::v16i32, 1 },
+     { ISD::SETCC, MVT::v8f64, 1 },
+     { ISD::SETCC, MVT::v16f32, 1 },
+   };
+
+   static const CostTblEntry AVX2CostTbl[] = {
+     { ISD::SETCC, MVT::v4i64, 1 },
+     { ISD::SETCC, MVT::v8i32, 1 },
+     { ISD::SETCC, MVT::v16i16, 1 },
+     { ISD::SETCC, MVT::v32i8, 1 },
+   };
+
+   static const CostTblEntry AVX1CostTbl[] = {
+     { ISD::SETCC, MVT::v4f64, 1 },
+     { ISD::SETCC, MVT::v8f32, 1 },
+     // AVX1 does not support 8-wide integer compare.
+     { ISD::SETCC, MVT::v4i64, 4 },
+     { ISD::SETCC, MVT::v8i32, 4 },
+     { ISD::SETCC, MVT::v16i16, 4 },
+     { ISD::SETCC, MVT::v32i8, 4 },
+   };
+
+   static const CostTblEntry SSE42CostTbl[] = {
+     { ISD::SETCC, MVT::v2f64, 1 },
+     { ISD::SETCC, MVT::v4f32, 1 },
+     { ISD::SETCC, MVT::v2i64, 1 },
+   };
+
+   static const CostTblEntry SSE2CostTbl[] = {
+     { ISD::SETCC, MVT::v2i64, 8 },
+     { ISD::SETCC, MVT::v4i32, 1 },
+     { ISD::SETCC, MVT::v8i16, 1 },
+     { ISD::SETCC, MVT::v16i8, 1 },
+   };
+
+   // Split the value type into legal pieces: a part count and the legal MVT.
+   std::pair<int, MVT> Legal = TLI->getTypeLegalizationCost(DL, ValTy);
+   const int NumParts = Legal.first;
+   const MVT LegalTy = Legal.second;
+
+   int ISD = TLI->InstructionOpcodeToISD(Opcode);
+   assert(ISD && "Invalid opcode");
+
+   if (ST->hasAVX512()) {
+     if (const auto *Hit = CostTableLookup(AVX512CostTbl, ISD, LegalTy))
+       return NumParts * Hit->Cost;
+   }
+
+   if (ST->hasAVX2()) {
+     if (const auto *Hit = CostTableLookup(AVX2CostTbl, ISD, LegalTy))
+       return NumParts * Hit->Cost;
+   }
+
+   if (ST->hasAVX()) {
+     if (const auto *Hit = CostTableLookup(AVX1CostTbl, ISD, LegalTy))
+       return NumParts * Hit->Cost;
+   }
+
+   if (ST->hasSSE42()) {
+     if (const auto *Hit = CostTableLookup(SSE42CostTbl, ISD, LegalTy))
+       return NumParts * Hit->Cost;
+   }
+
+   if (ST->hasSSE2()) {
+     if (const auto *Hit = CostTableLookup(SSE2CostTbl, ISD, LegalTy))
+       return NumParts * Hit->Cost;
+   }
+
+   // No X86-specific entry: use the generic estimate.
+   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ }
+
+ /// Estimate the cost of calling intrinsic \p IID with return type \p RetTy.
+ ///
+ /// Only bitreverse, bswap, ctlz, ctpop, cttz and sqrt have X86-specific
+ /// costs. For those, \p RetTy is legalized and the per-feature tables are
+ /// probed in the order XOP, AVX2, AVX, SSE4.2, SSSE3, SSE2, SSE1; a hit is
+ /// multiplied by the number of legalized parts. Every other intrinsic, and
+ /// every type without a table entry, is forwarded to the base implementation
+ /// together with \p Tys and \p FMF.
+ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                       ArrayRef<Type *> Tys, FastMathFlags FMF) {
+   // Costs should match the codegen from:
+   // BITREVERSE: llvm/test/CodeGen/X86/vector-bitreverse.ll
+   // BSWAP: llvm/test/CodeGen/X86/bswap-vector.ll
+   // CTLZ: llvm/test/CodeGen/X86/vector-lzcnt-*.ll
+   // CTPOP: llvm/test/CodeGen/X86/vector-popcnt-*.ll
+   // CTTZ: llvm/test/CodeGen/X86/vector-tzcnt-*.ll
+   static const CostTblEntry XOPCostTbl[] = {
+     { ISD::BITREVERSE, MVT::v4i64, 4 },
+     { ISD::BITREVERSE, MVT::v8i32, 4 },
+     { ISD::BITREVERSE, MVT::v16i16, 4 },
+     { ISD::BITREVERSE, MVT::v32i8, 4 },
+     { ISD::BITREVERSE, MVT::v2i64, 1 },
+     { ISD::BITREVERSE, MVT::v4i32, 1 },
+     { ISD::BITREVERSE, MVT::v8i16, 1 },
+     { ISD::BITREVERSE, MVT::v16i8, 1 },
+     { ISD::BITREVERSE, MVT::i64, 3 },
+     { ISD::BITREVERSE, MVT::i32, 3 },
+     { ISD::BITREVERSE, MVT::i16, 3 },
+     { ISD::BITREVERSE, MVT::i8, 3 }
+   };
+   static const CostTblEntry AVX2CostTbl[] = {
+     { ISD::BITREVERSE, MVT::v4i64, 5 },
+     { ISD::BITREVERSE, MVT::v8i32, 5 },
+     { ISD::BITREVERSE, MVT::v16i16, 5 },
+     { ISD::BITREVERSE, MVT::v32i8, 5 },
+     { ISD::BSWAP, MVT::v4i64, 1 },
+     { ISD::BSWAP, MVT::v8i32, 1 },
+     { ISD::BSWAP, MVT::v16i16, 1 },
+     { ISD::CTLZ, MVT::v4i64, 23 },
+     { ISD::CTLZ, MVT::v8i32, 18 },
+     { ISD::CTLZ, MVT::v16i16, 14 },
+     { ISD::CTLZ, MVT::v32i8, 9 },
+     { ISD::CTPOP, MVT::v4i64, 7 },
+     { ISD::CTPOP, MVT::v8i32, 11 },
+     { ISD::CTPOP, MVT::v16i16, 9 },
+     { ISD::CTPOP, MVT::v32i8, 6 },
+     { ISD::CTTZ, MVT::v4i64, 10 },
+     { ISD::CTTZ, MVT::v8i32, 14 },
+     { ISD::CTTZ, MVT::v16i16, 12 },
+     { ISD::CTTZ, MVT::v32i8, 9 },
+     { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
+     { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
+     { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
+     { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
+     { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
+     { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
+   };
+   static const CostTblEntry AVX1CostTbl[] = {
+     { ISD::BITREVERSE, MVT::v4i64, 10 },
+     { ISD::BITREVERSE, MVT::v8i32, 10 },
+     { ISD::BITREVERSE, MVT::v16i16, 10 },
+     { ISD::BITREVERSE, MVT::v32i8, 10 },
+     { ISD::BSWAP, MVT::v4i64, 4 },
+     { ISD::BSWAP, MVT::v8i32, 4 },
+     { ISD::BSWAP, MVT::v16i16, 4 },
+     { ISD::CTLZ, MVT::v4i64, 46 },
+     { ISD::CTLZ, MVT::v8i32, 36 },
+     { ISD::CTLZ, MVT::v16i16, 28 },
+     { ISD::CTLZ, MVT::v32i8, 18 },
+     { ISD::CTPOP, MVT::v4i64, 14 },
+     { ISD::CTPOP, MVT::v8i32, 22 },
+     { ISD::CTPOP, MVT::v16i16, 18 },
+     { ISD::CTPOP, MVT::v32i8, 12 },
+     { ISD::CTTZ, MVT::v4i64, 20 },
+     { ISD::CTTZ, MVT::v8i32, 28 },
+     { ISD::CTTZ, MVT::v16i16, 24 },
+     { ISD::CTTZ, MVT::v32i8, 18 },
+     { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
+     { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
+     { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
+     { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
+     { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
+     { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
+   };
+   static const CostTblEntry SSE42CostTbl[] = {
+     { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
+     { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
+   };
+   static const CostTblEntry SSSE3CostTbl[] = {
+     { ISD::BITREVERSE, MVT::v2i64, 5 },
+     { ISD::BITREVERSE, MVT::v4i32, 5 },
+     { ISD::BITREVERSE, MVT::v8i16, 5 },
+     { ISD::BITREVERSE, MVT::v16i8, 5 },
+     { ISD::BSWAP, MVT::v2i64, 1 },
+     { ISD::BSWAP, MVT::v4i32, 1 },
+     { ISD::BSWAP, MVT::v8i16, 1 },
+     { ISD::CTLZ, MVT::v2i64, 23 },
+     { ISD::CTLZ, MVT::v4i32, 18 },
+     { ISD::CTLZ, MVT::v8i16, 14 },
+     { ISD::CTLZ, MVT::v16i8, 9 },
+     { ISD::CTPOP, MVT::v2i64, 7 },
+     { ISD::CTPOP, MVT::v4i32, 11 },
+     { ISD::CTPOP, MVT::v8i16, 9 },
+     { ISD::CTPOP, MVT::v16i8, 6 },
+     { ISD::CTTZ, MVT::v2i64, 10 },
+     { ISD::CTTZ, MVT::v4i32, 14 },
+     { ISD::CTTZ, MVT::v8i16, 12 },
+     { ISD::CTTZ, MVT::v16i8, 9 }
+   };
+   static const CostTblEntry SSE2CostTbl[] = {
+     { ISD::BSWAP, MVT::v2i64, 7 },
+     { ISD::BSWAP, MVT::v4i32, 7 },
+     { ISD::BSWAP, MVT::v8i16, 7 },
+     { ISD::CTLZ, MVT::v2i64, 25 },
+     { ISD::CTLZ, MVT::v4i32, 26 },
+     { ISD::CTLZ, MVT::v8i16, 20 },
+     { ISD::CTLZ, MVT::v16i8, 17 },
+     { ISD::CTPOP, MVT::v2i64, 12 },
+     { ISD::CTPOP, MVT::v4i32, 15 },
+     { ISD::CTPOP, MVT::v8i16, 13 },
+     { ISD::CTPOP, MVT::v16i8, 10 },
+     { ISD::CTTZ, MVT::v2i64, 14 },
+     { ISD::CTTZ, MVT::v4i32, 18 },
+     { ISD::CTTZ, MVT::v8i16, 16 },
+     { ISD::CTTZ, MVT::v16i8, 13 },
+     { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
+     { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
+   };
+   static const CostTblEntry SSE1CostTbl[] = {
+     { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
+     { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+   };
+
+   // Map the intrinsic onto the ISD opcode used as the table key.
+   // DELETED_NODE marks "no X86-specific cost available".
+   unsigned ISD = ISD::DELETED_NODE;
+   switch (IID) {
+   default:
+     break;
+   case Intrinsic::bitreverse:
+     ISD = ISD::BITREVERSE;
+     break;
+   case Intrinsic::bswap:
+     ISD = ISD::BSWAP;
+     break;
+   case Intrinsic::ctlz:
+     ISD = ISD::CTLZ;
+     break;
+   case Intrinsic::ctpop:
+     ISD = ISD::CTPOP;
+     break;
+   case Intrinsic::cttz:
+     ISD = ISD::CTTZ;
+     break;
+   case Intrinsic::sqrt:
+     ISD = ISD::FSQRT;
+     break;
+   }
+
+   // No table carries a DELETED_NODE entry, so for unmapped intrinsics there is
+   // nothing to find: skip the type legalization and the table probes.
+   if (ISD != ISD::DELETED_NODE) {
+     // Legalize the type.
+     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+     MVT MTy = LT.second;
+
+     // Attempt to lookup cost.
+     if (ST->hasXOP())
+       if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
+         return LT.first * Entry->Cost;
+
+     if (ST->hasAVX2())
+       if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+         return LT.first * Entry->Cost;
+
+     if (ST->hasAVX())
+       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+         return LT.first * Entry->Cost;
+
+     if (ST->hasSSE42())
+       if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+         return LT.first * Entry->Cost;
+
+     if (ST->hasSSSE3())
+       if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
+         return LT.first * Entry->Cost;
+
+     if (ST->hasSSE2())
+       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+         return LT.first * Entry->Cost;
+
+     if (ST->hasSSE1())
+       if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+         return LT.first * Entry->Cost;
+   }
+
+   return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+ }
+
+ // Overload that receives the call's argument values (Args) instead of its
+ // parameter types. No X86-specific costing is applied here: the query is
+ // forwarded unchanged to the base implementation.
+ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+ ArrayRef<Value *> Args, FastMathFlags FMF) {
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+ }
+
+ /// Estimate the cost of inserting into / extracting from lane \p Index of the
+ /// vector type \p Val. An \p Index of -1U means the lane is not known.
+ ///
+ /// With a known lane the result is 0 when the vector legalizes to a scalar,
+ /// or when a floating-point element sits in lane 0 of its legalized vector.
+ /// Otherwise the base cost is used, plus 1 when a pointer element is
+ /// extracted.
+ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+   assert(Val->isVectorTy() && "This must be a vector type");
+
+   Type *EltTy = Val->getScalarType();
+   const bool LaneIsKnown = Index != -1U;
+
+   if (LaneIsKnown) {
+     // Work on the legalized form of the vector.
+     std::pair<int, MVT> Legal = TLI->getTypeLegalizationCost(DL, Val);
+     const MVT LegalTy = Legal.second;
+
+     // Legalization turned the vector into a scalar: nothing to move.
+     if (!LegalTy.isVector())
+       return 0;
+
+     // The vector may have been split, so re-express the lane relative to the
+     // legalized vector width.
+     Index %= LegalTy.getVectorNumElements();
+
+     // A floating-point scalar already lives in lane 0.
+     if (Index == 0 && EltTy->isFloatingPointTy())
+       return 0;
+   }
+
+   // An extracted pointer is destined for the integer register file, so
+   // account for that extra move on top of the base cost.
+   const bool ExtractsPointer =
+       Opcode == Instruction::ExtractElement && EltTy->isPointerTy();
+   const int RegisterFileMoveCost = ExtractsPointer ? 1 : 0;
+
+   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
+ }
+
+ /// Sum the per-lane insert and/or extract costs for every element of the
+ /// vector type \p Ty. \p Insert and \p Extract select which of the two
+ /// operations are charged for each lane.
+ int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+   assert (Ty->isVectorTy() && "Can only scalarize vectors");
+
+   const int NumLanes = Ty->getVectorNumElements();
+   int Overhead = 0;
+
+   for (int Lane = 0; Lane < NumLanes; ++Lane) {
+     if (Insert) {
+       Overhead += getVectorInstrCost(Instruction::InsertElement, Ty, Lane);
+     }
+     if (Extract) {
+       Overhead += getVectorInstrCost(Instruction::ExtractElement, Ty, Lane);
+     }
+   }
+
+   return Overhead;
+ }
+
+ /// Estimate the cost of a load or store (\p Opcode) of type \p Src.
+ ///
+ /// Three-element vectors with 32- or 64-bit elements cost a fixed 3. Any
+ /// other vector whose element count is not a power of two is costed as
+ /// scalarized: one base memory op per element plus the insert (load) or
+ /// extract (store) overhead. Everything else costs one unit per legalized
+ /// part, doubled for 32-byte parts when unaligned 32-byte accesses are slow.
+ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) {
+   // Handle non-power-of-two vectors such as <3 x float>
+   if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+     const unsigned NumLanes = VecTy->getVectorNumElements();
+
+     // Two common three-lane shapes get a hand-picked cost:
+     //   <3 x float>:  64 bit store + extract + 32 bit store.
+     //   <3 x double>: 128 bit store + unpack + 64 bit store.
+     if (NumLanes == 3) {
+       const unsigned LaneBits = VecTy->getScalarSizeInBits();
+       if (LaneBits == 32 || LaneBits == 64)
+         return 3;
+     }
+
+     // Assume that all other non-power-of-two numbers are scalarized.
+     if (!isPowerOf2_32(NumLanes)) {
+       const int PerLaneCost = BaseT::getMemoryOpCost(
+           Opcode, VecTy->getScalarType(), Alignment, AddressSpace);
+       const bool IsLoad = Opcode == Instruction::Load;
+       const bool IsStore = Opcode == Instruction::Store;
+       const int SplitCost = getScalarizationOverhead(Src, IsLoad, IsStore);
+       return NumLanes * PerLaneCost + SplitCost;
+     }
+   }
+
+   // Legalize the type.
+   std::pair<int, MVT> Legal = TLI->getTypeLegalizationCost(DL, Src);
+   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+          "Invalid Opcode");
+
+   // Each load/store unit costs 1.
+   const int UnitCost = Legal.first * 1;
+
+   // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
+   // proxy for a double-pumped AVX memory interface such as on Sandybridge.
+   const bool DoublePumped =
+       Legal.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow();
+
+   return DoublePumped ? UnitCost * 2 : UnitCost;
+ }
+
+int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+ if (!SrcVTy)
+ // For a scalar type, take the regular memory-op cost; no mask is involved.
+ return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
+
+ unsigned NumElem = SrcVTy->getVectorNumElements();
+ VectorType *MaskTy =
+ VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); // Mask is modelled as one i8 per element.
+ if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
+ (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
+ !isPowerOf2_32(NumElem)) {
+ // Scalarization: per-element memops + value insert/extract + mask extract + a compare-and-branch per element.
+ int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+ int ScalarCompareCost = getCmpSelInstrCost(
+ Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
+ int BranchCost = getCFInstrCost(Instruction::Br);
+ int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+
+ int ValueSplitCost = getScalarizationOverhead(
+ SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
+ int MemopCost =
+ NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+ return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
+ }
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ auto VT = TLI->getValueType(DL, SrcVTy);
+ int Cost = 0;
+ if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
+ LT.second.getVectorNumElements() == NumElem)
+ // Promotion (same element count, different legal type) requires expand/truncate for data and a shuffle for mask.
+ Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
+ getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
+
+ else if (LT.second.getVectorNumElements() > NumElem) {
+ VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
+ LT.second.getVectorNumElements());
+ // Expanding requires filling the widened mask with zeroes.
+ Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
+ }
+ if (!ST->hasAVX512())
+ return Cost + LT.first*4; // Each maskmov costs 4
+
+ // AVX-512 masked load/store is cheaper: cost 1 per legalized operation.
+ return Cost+LT.first;
+}
+
+int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+ // Address computations in vectorized code with non-consecutive addresses will
+ // likely result in more instructions compared to scalar code where the
+ // computation can more often be merged into the index mode. The resulting
+ // extra micro-ops can significantly decrease throughput.
+ unsigned NumVectorInstToHideOverhead = 10; // Fixed penalty returned for complex vector address computations.
+
+ if (Ty->isVectorTy() && IsComplex)
+ return NumVectorInstToHideOverhead;
+
+ return BaseT::getAddressComputationCost(Ty, IsComplex); // Everything else uses the generic cost.
+}
+
+int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
+ bool IsPairwise) {
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second; // Legalized type used as the table lookup key.
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
+ // and use it as the cost.
+
+ static const CostTblEntry SSE42CostTblPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 2 },
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+ { ISD::ADD, MVT::v8i16, 5 },
+ };
+
+ static const CostTblEntry AVX1CostTblPairWise[] = {
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::FADD, MVT::v4f64, 5 },
+ { ISD::FADD, MVT::v8f32, 7 },
+ { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+ { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
+ { ISD::ADD, MVT::v8i16, 5 },
+ { ISD::ADD, MVT::v8i32, 5 },
+ };
+
+ static const CostTblEntry SSE42CostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 2 },
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
+ { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
+ };
+
+ static const CostTblEntry AVX1CostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v4f32, 3 },
+ { ISD::FADD, MVT::v4f64, 3 },
+ { ISD::FADD, MVT::v8f32, 4 },
+ { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
+ { ISD::ADD, MVT::v4i64, 3 },
+ { ISD::ADD, MVT::v8i16, 4 },
+ { ISD::ADD, MVT::v8i32, 5 },
+ };
+
+ if (IsPairwise) { // In both branches the AVX table is tried first, then SSE4.2.
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+ } else {
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
+ return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); // No table entry matched.
+}
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction (one 64-bit chunk) of a larger
+/// immediate. Therefore it is valid to return a cost of ZERO.
+int X86TTIImpl::getIntImmCost(int64_t Val) {
+ if (Val == 0)
+ return TTI::TCC_Free;
+
+ if (isInt<32>(Val)) // Fits in a signed 32-bit immediate.
+ return TTI::TCC_Basic;
+
+ return 2 * TTI::TCC_Basic; // Anything wider is charged twice the basic cost.
+}
+
+int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U; // NOTE(review): ~0U converts to -1 in the int return type; confirm callers expect this.
+
+ // Never hoist constants larger than 128bit, because this might lead to
+ // incorrect code generation or assertions in codegen.
+ // FIXME: Create a cost model for types larger than i128 once the codegen
+ // issues have been fixed.
+ if (BitSize > 128)
+ return TTI::TCC_Free;
+
+ if (Imm == 0)
+ return TTI::TCC_Free;
+
+ // Sign-extend all constants to a multiple of 64-bit.
+ APInt ImmVal = Imm;
+ if (BitSize & 0x3f)
+ ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+ // Split the constant into 64-bit chunks and sum the cost of each
+ // chunk.
+ int Cost = 0;
+ for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+ APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+ int64_t Val = Tmp.getSExtValue();
+ Cost += getIntImmCost(Val);
+ }
+ // We need at least one instruction to materialize the constant.
+ return std::max(1, Cost);
+}
+
+int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TTI::TCC_Free;
+
+ unsigned ImmIdx = ~0U; // Operand index that gets the cheap-immediate treatment below; ~0U means none.
+ switch (Opcode) {
+ default:
+ return TTI::TCC_Free;
+ case Instruction::GetElementPtr:
+ // Always hoist the base address of a GetElementPtr. This prevents the
+ // creation of new constants for every base constant that gets constant
+ // folded with the offset.
+ if (Idx == 0)
+ return 2 * TTI::TCC_Basic;
+ return TTI::TCC_Free;
+ case Instruction::Store:
+ ImmIdx = 0;
+ break;
+ case Instruction::ICmp:
+ // This is an imperfect hack to prevent constant hoisting of
+ // compares that might be trying to check if a 64-bit value fits in
+ // 32-bits. The backend can optimize these cases using a right shift by 32.
+ // Ideally we would check the compare predicate here. There are also other
+ // similar immediates the backend can use shifts for.
+ if (Idx == 1 && Imm.getBitWidth() == 64) {
+ uint64_t ImmVal = Imm.getZExtValue();
+ if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
+ return TTI::TCC_Free;
+ }
+ ImmIdx = 1;
+ break;
+ case Instruction::And:
+ // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
+ // by using a 32-bit operation with implicit zero extension. Detect such
+ // immediates here as the normal path expects bit 31 to be sign extended.
+ if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
+ return TTI::TCC_Free;
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::Or:
+ case Instruction::Xor:
+ ImmIdx = 1;
+ break;
+ // Always return TCC_Free for the shift value of a shift instruction.
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Idx == 1)
+ return TTI::TCC_Free;
+ break;
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ case Instruction::BitCast:
+ case Instruction::PHI:
+ case Instruction::Call:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ break;
+ }
+
+ if (Idx == ImmIdx) { // Free unless the constant costs more than one basic op per 64-bit chunk.
+ int NumConstants = (BitSize + 63) / 64;
+ int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
+ return (Cost <= NumConstants * TTI::TCC_Basic)
+ ? static_cast<int>(TTI::TCC_Free)
+ : Cost;
+ }
+
+ return X86TTIImpl::getIntImmCost(Imm, Ty); // Any other operand pays the full materialization cost.
+}
+
+int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TTI::TCC_Free;
+
+ switch (IID) {
+ default:
+ return TTI::TCC_Free; // Intrinsics not listed below never have their constants hoisted.
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) // Second operand fits a signed 32-bit immediate.
+ return TTI::TCC_Free;
+ break;
+ case Intrinsic::experimental_stackmap:
+ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) // First two operands, or any constant of at most 64 bits.
+ return TTI::TCC_Free;
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) // First four operands, or any constant of at most 64 bits.
+ return TTI::TCC_Free;
+ break;
+ }
+ return X86TTIImpl::getIntImmCost(Imm, Ty); // Otherwise charge the materialization cost.
+}
+
+// Return an average cost of a Gather / Scatter instruction; may be improved later.
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
+ unsigned Alignment, unsigned AddressSpace) {
+
+ assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
+ unsigned VF = SrcVTy->getVectorNumElements();
+
+ // Try to reduce index size from 64 bit (default for GEP)
+ // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
+ // operation will use 16 x 64 indices which do not fit in a zmm and needs
+ // to split. Also check that the base pointer is the same for all lanes,
+ // and that there's at most one variable index.
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
+ unsigned IndexSize = DL.getPointerSizeInBits();
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (IndexSize < 64 || !GEP)
+ return IndexSize;
+
+ unsigned NumOfVarIndices = 0;
+ Value *Ptrs = GEP->getPointerOperand();
+ if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
+ return IndexSize;
+ for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
+ if (isa<Constant>(GEP->getOperand(i)))
+ continue;
+ Type *IndxTy = GEP->getOperand(i)->getType();
+ if (IndxTy->isVectorTy())
+ IndxTy = IndxTy->getVectorElementType();
+ if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
+ !isa<SExtInst>(GEP->getOperand(i))) ||
+ ++NumOfVarIndices > 1)
+ return IndexSize; // 64
+ }
+ return (unsigned)32;
+ };
+
+
+ // Try to reduce IndexSize to 32 bits only when VF is 16 or more.
+ // By default the IndexSize is equal to pointer size.
+ unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
+ DL.getPointerSizeInBits();
+
+ Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
+ IndexSize), VF);
+ std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
+ std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+ if (SplitFactor > 1) {
+ // Handle splitting of vector of pointers: cost one part recursively and multiply by the split factor.
+ Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+ return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
+ AddressSpace);
+ }
+
+ // The gather / scatter cost is given by Intel architects. It is a rough
+ // number since we are looking at one instruction at a time.
+ const int GSOverhead = 2;
+ return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+}
+
+/// Return the cost of full scalarization of a gather / scatter operation.
+///
+/// Opcode - Load or Store instruction.
+/// SrcVTy - The type of the data vector that should be gathered or scattered.
+/// VariableMask - The mask is non-constant at compile time.
+/// Alignment - Alignment for one element.
+/// AddressSpace - pointer[s] address space.
+/// \returns scalar memory-op cost + mask-unpack cost + insert/extract cost.
+int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+ bool VariableMask, unsigned Alignment,
+ unsigned AddressSpace) {
+ unsigned VF = SrcVTy->getVectorNumElements();
+
+ int MaskUnpackCost = 0; // Stays zero when the mask is not variable.
+ if (VariableMask) {
+ VectorType *MaskTy =
+ VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
+ MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+ int ScalarCompareCost =
+ getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
+ nullptr);
+ int BranchCost = getCFInstrCost(Instruction::Br);
+ MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
+ }
+
+ // The cost of the scalar loads/stores, one per vector element.
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+
+ int InsertExtractCost = 0;
+ if (Opcode == Instruction::Load)
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of inserting each scalar load into the vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+ else
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of extracting each element out of the data vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+ return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of a Gather / Scatter operation, using either the scalarized or the vector-instruction estimate.
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+ Value *Ptr, bool VariableMask,
+ unsigned Alignment) {
+ assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+ unsigned VF = SrcVTy->getVectorNumElements();
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ if (!PtrTy && Ptr->getType()->isVectorTy()) // Ptr may be a vector of pointers; look at its element type.
+ PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+ assert(PtrTy && "Unexpected type for Ptr argument");
+ unsigned AddressSpace = PtrTy->getAddressSpace();
+
+ bool Scalarize = false;
+ if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
+ (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+ Scalarize = true;
+ // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+ // Vector-4 of gather/scatter instruction does not exist on KNL.
+ // We can extend it to 8 elements, but zeroing upper bits of
+ // the mask vector will add more instructions. Right now we give the scalar
+ // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
+ // is better in the VariableMask case.
+ if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+ Scalarize = true;
+
+ if (Scalarize)
+ return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
+ AddressSpace);
+
+ return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ? // Pointers use the data layout's pointer width.
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+ return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) || // 32/64-bit elements need AVX.
+ ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI()); // 8/16-bit elements need BWI.
+}
+
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
+ return isLegalMaskedLoad(DataType); // Same legality rule as masked loads.
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+ // This function is currently called in two cases: from the Loop Vectorizer
+ // and from the Scalarizer.
+ // When the Loop Vectorizer asks about legality of the feature,
+ // the vectorization factor is not calculated yet. The Loop Vectorizer
+ // sends a scalar type and the decision is based on the width of the
+ // scalar element.
+ // Later on, the cost model will estimate the usage of this intrinsic based on
+ // the vector type.
+ // The Scalarizer asks again about legality. It sends a vector type.
+ // In this case we can reject non-power-of-2 vectors.
+ if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
+ return false;
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ?
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+ // AVX-512 allows gather and scatter of 32-bit and 64-bit elements.
+ return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
+}
+
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+ return isLegalMaskedGather(DataType); // Same legality rule as masked gathers.
+}
+
+bool X86TTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ // Treat this as a subset check: every callee subtarget feature must also be set for the caller.
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ // FIXME: This is likely too limiting as it will include subtarget features
+ // that we might not care about for inlining, but it is conservatively
+ // correct.
+ return (CallerBits & CalleeBits) == CalleeBits;
+}
+
+bool X86TTIImpl::enableInterleavedAccessVectorization() {
+ // TODO: We expect this to be beneficial regardless of arch,
+ // but there are currently some unexplained performance artifacts on Atom.
+ // As a temporary solution, disable on Atom and SLM.
+ return !(ST->isAtom() || ST->isSLM());
+}
+
+// Get an estimate for interleaved load/store operations and strided loads.
+// \p Indices contains indices for strided load.
+// \p Factor - the factor of interleaving.
+// AVX-512 provides 3-src shuffles that significantly reduce the cost.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+
+ // Calculate the number of memory operations (NumOfMemOps) required
+ // to load/store the VecTy (store size of VecTy divided by the legal type's, rounded up).
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ if (Opcode == Instruction::Load) {
+ // Kind of shuffle depends on number of loaded values.
+ // If we load the entire data in one register, we can use a 1-src shuffle.
+ // Otherwise, we'll merge 2 sources in each operation.
+ TTI::ShuffleKind ShuffleKind =
+ (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
+
+ unsigned ShuffleCost =
+ getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+
+ unsigned NumOfLoadsInInterleaveGrp =
+ Indices.size() ? Indices.size() : Factor; // An empty Indices list means all Factor members are loaded.
+ Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
+ VecTy->getVectorNumElements() / Factor);
+ unsigned NumOfResults =
+ getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
+ NumOfLoadsInInterleaveGrp;
+
+ // About a half of the loads may be folded in shuffles when we have only
+ // one result. If we have more than one result, we do not fold loads at all.
+ unsigned NumOfUnfoldedLoads =
+ NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
+
+ // Get a number of shuffle operations per result.
+ unsigned NumOfShufflesPerResult =
+ std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
+
+ // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
+ // When we have more than one destination, we need additional instructions
+ // to keep sources.
+ unsigned NumOfMoves = 0;
+ if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
+ NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
+
+ int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+ NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+
+ return Cost;
+ }
+
+ // Store.
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+
+ // There are no strided stores at the moment, and a store can't be folded
+ // into a shuffle.
+ unsigned NumOfSources = Factor; // The number of values to be merged.
+ unsigned ShuffleCost =
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+ unsigned NumOfShufflesPerStore = NumOfSources - 1;
+
+ // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
+ // We need additional instructions to keep sources.
+ unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
+ int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+ NumOfMoves;
+ return Cost;
+}
+
+int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) { // Classifies by element type; sets RequiresBW for i8/i16.
+ RequiresBW = false;
+ Type *EltTy = VecTy->getVectorElementType();
+ if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+ EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+ return true;
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
+ RequiresBW = true;
+ return true;
+ }
+ return false; // Any other element type has no AVX-512 estimate here.
+ };
+ bool RequiresBW;
+ bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
+ if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI())) // BWI is only required for i8/i16 elements.
+ return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, // Otherwise use the generic estimate.
+ Alignment, AddressSpace);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
new file mode 100644
index 000000000000..f6bcb9f569e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -0,0 +1,116 @@
+//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfo::Concept conforming object specific to the
+/// X86 target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
+ typedef BasicTTIImplBase<X86TTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ const X86Subtarget *ST; // Subtarget of the function this object was created for.
+ const X86TargetLowering *TLI; // Target lowering obtained from ST.
+
+ int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); // Sum of per-element insert and/or extract costs.
+
+ const X86Subtarget *getST() const { return ST; }
+ const X86TargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ /// \name Scalar TTI Implementations
+ /// @{
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ unsigned getNumberOfRegisters(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector);
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ bool VariableMask, unsigned Alignment);
+ int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+
+ int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+ ArrayRef<Type *> Tys, FastMathFlags FMF);
+ int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+ ArrayRef<Value *> Args, FastMathFlags FMF);
+
+ int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
+ int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
+
+ int getIntImmCost(int64_t); // Cost of one 64-bit chunk of an immediate.
+
+ int getIntImmCost(const APInt &Imm, Type *Ty); // Cost of materializing Imm as a whole.
+
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); // Cost of Imm as operand Idx of an instruction.
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty); // Cost of Imm as operand Idx of an intrinsic call.
+ bool isLegalMaskedLoad(Type *DataType);
+ bool isLegalMaskedStore(Type *DataType);
+ bool isLegalMaskedGather(Type *DataType);
+ bool isLegalMaskedScatter(Type *DataType);
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
+
+ bool enableInterleavedAccessVectorization();
+private:
+ int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+ unsigned Alignment, unsigned AddressSpace); // Gather/scatter cost when fully scalarized.
+ int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ unsigned Alignment, unsigned AddressSpace); // Gather/scatter cost using the vector instruction.
+
+ /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 000000000000..9766b84be652
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,328 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids transition latency
+// penalty when transferring control between AVX encoded instructions and old
+// SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+// Debug category string for this file. NOTE(review): presumably what the
+// DEBUG() statements further down are filtered by -- confirm against
+// llvm/Support/Debug.h.
+#define DEBUG_TYPE "x86-vzeroupper"
+
+// Counter incremented once for every VZEROUPPER emitted by
+// VZeroUpperInserter::insertVZeroUpper().
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+
+  /// Machine-function pass that places VZEROUPPER in front of calls and
+  /// returns that may be reached while the upper YMM halves are dirty.
+  class VZeroUpperInserter : public MachineFunctionPass {
+  public:
+    VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    /// Declares the NoVRegs machine-function property as a prerequisite.
+    MachineFunctionProperties getRequiredProperties() const override {
+      MachineFunctionProperties Required;
+      Required.set(MachineFunctionProperties::Property::NoVRegs);
+      return Required;
+    }
+
+    StringRef getPassName() const override { return "X86 vzeroupper inserter"; }
+
+  private:
+    void processBasicBlock(MachineBasicBlock &MBB);
+    void insertVZeroUpper(MachineBasicBlock::iterator I,
+                          MachineBasicBlock &MBB);
+    void addDirtySuccessor(MachineBasicBlock &MBB);
+
+    /// How a block leaves the YMM state on exit:
+    ///  - PASS_THROUGH: the block contains neither YMM-dirtying instructions
+    ///                  nor vzeroupper instructions.
+    ///  - EXITS_CLEAN:  the block contains (or will contain) a vzeroupper
+    ///                  that guarantees YMM is clean on exit.
+    ///  - EXITS_DIRTY:  some instruction dirties YMM and no later vzeroupper
+    ///                  in the block clears it.
+    enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
+    static const char* getBlockExitStateName(BlockExitState ST);
+
+    /// Per-block bookkeeping of the core algorithm.
+    struct BlockState {
+      BlockState() = default;
+
+      /// Exit classification computed by processBasicBlock().
+      BlockExitState ExitState = PASS_THROUGH;
+
+      /// Raised once the block has been pushed on the DirtySuccessors list so
+      /// that it is never queued twice.
+      bool AddedToDirtySuccessors = false;
+
+      /// Position of the first call in the block that may still need a
+      /// guarding vzeroupper. Whether it really does is only known once a
+      /// predecessor that is EXITS_DIRTY has been discovered.
+      MachineBasicBlock::iterator FirstUnguardedCall;
+    };
+    using BlockStateMap = SmallVector<BlockState, 8>;
+    using DirtySuccessorsWorkList = SmallVector<MachineBasicBlock*, 8>;
+
+    BlockStateMap BlockStates;               // Indexed by MBB number.
+    DirtySuccessorsWorkList DirtySuccessors; // Worklist of blocks to revisit.
+    bool EverMadeChange;                     // Result of runOnMachineFunction.
+    bool IsX86INTR;                          // Function uses CallingConv::X86_INTR.
+    const TargetInstrInfo *TII;
+
+    static char ID;
+  };
+
+  char VZeroUpperInserter::ID = 0;
+}
+
+/// Factory: allocate a fresh VZeroUpperInserter; the caller receives the raw
+/// pointer.
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+  FunctionPass *Inserter = new VZeroUpperInserter();
+  return Inserter;
+}
+
+/// Map a BlockExitState to the human-readable label used in debug output.
+const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
+  if (ST == PASS_THROUGH)
+    return "Pass-through";
+  if (ST == EXITS_DIRTY)
+    return "Exits-dirty";
+  if (ST == EXITS_CLEAN)
+    return "Exits-clean";
+  llvm_unreachable("Invalid block exit state.");
+}
+
+/// True when \p Reg lies in the inclusive range X86::YMM0 .. X86::YMM15.
+static bool isYmmReg(unsigned Reg) {
+  if (Reg < X86::YMM0)
+    return false;
+  return Reg <= X86::YMM15;
+}
+
+/// Report whether any function live-in recorded in \p MRI is a YMM register.
+static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+  MachineRegisterInfo::livein_iterator LI = MRI.livein_begin();
+  const MachineRegisterInfo::livein_iterator LE = MRI.livein_end();
+  while (LI != LE) {
+    if (isYmmReg(LI->first))
+      return true;
+    ++LI;
+  }
+  return false;
+}
+
+/// True when operand \p MO clobbers every register from YMM0 through YMM15.
+static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+  unsigned YmmReg = X86::YMM0;
+  while (YmmReg <= X86::YMM15) {
+    // A single preserved YMM register is enough to answer "no".
+    if (!MO.clobbersPhysReg(YmmReg))
+      return false;
+    ++YmmReg;
+  }
+  return true;
+}
+
+/// Decide whether \p MI counts as a YMM-using instruction: either it names a
+/// YMM register in a non-debug register operand, or it is a call whose
+/// register mask does not clobber all YMM registers.
+static bool hasYmmReg(MachineInstr &MI) {
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isRegMask()) {
+      // Only the register masks of calls are inspected.
+      if (MI.isCall() && !clobbersAllYmmRegs(Op))
+        return true;
+      continue;
+    }
+    if (Op.isReg() && !Op.isDebug() && isYmmReg(Op.getReg()))
+      return true;
+  }
+  return false;
+}
+
+/// Return true if some register-mask operand of the call \p MI clobbers at
+/// least one of YMM0..YMM15. Must only be invoked on call instructions.
+static bool callClobbersAnyYmmReg(MachineInstr &MI) {
+  assert(MI.isCall() && "Can only be called on call instructions.");
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isRegMask()) {
+      unsigned YmmReg = X86::YMM0;
+      while (YmmReg <= X86::YMM15) {
+        if (Op.clobbersPhysReg(YmmReg))
+          return true;
+        ++YmmReg;
+      }
+    }
+  }
+  return false;
+}
+
+/// Emit a VZEROUPPER immediately before \p I in \p MBB, reusing the debug
+/// location of the instruction at \p I, and record that the function changed.
+void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
+                                          MachineBasicBlock &MBB) {
+  const DebugLoc Loc = I->getDebugLoc();
+  const MCInstrDesc &Desc = TII->get(X86::VZEROUPPER);
+  BuildMI(MBB, I, Loc, Desc);
+
+  EverMadeChange = true;
+  ++NumVZU;
+}
+
+/// Queue \p MBB on the DirtySuccessors worklist unless it has been queued
+/// before; the per-block flag guarantees at most one insertion.
+void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
+  BlockState &State = BlockStates[MBB.getNumber()];
+  if (State.AddedToDirtySuccessors)
+    return;
+
+  DirtySuccessors.push_back(&MBB);
+  State.AddedToDirtySuccessors = true;
+}
+
+/// Loop over all of the instructions in the basic block, inserting vzeroupper
+/// instructions before function calls.
+///
+/// Walks \p MBB once, tracking a running exit state. On return the block's
+/// BlockState holds its final ExitState and the position of the first call
+/// seen while the state was still PASS_THROUGH (or MBB.end() if none). If the
+/// block ends EXITS_DIRTY, all of its successors are queued on the
+/// DirtySuccessors worklist.
+void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
+
+ // Start by assuming that the block is PASS_THROUGH which implies no unguarded
+ // calls.
+ BlockExitState CurState = PASS_THROUGH;
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
+
+ for (MachineInstr &MI : MBB) {
+ // No need for vzeroupper before iret in interrupt handler function,
+ // epilogue will restore YMM registers if needed.
+ bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
+ bool IsControlFlow = MI.isCall() || MI.isReturn();
+
+ // An existing VZERO* instruction resets the state.
+ if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
+ CurState = EXITS_CLEAN;
+ continue;
+ }
+
+ // Shortcut: don't need to check regular instructions in dirty state.
+ if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
+ continue;
+
+ if (hasYmmReg(MI)) {
+ // We found a ymm-using instruction; this could be an AVX instruction,
+ // or it could be control flow.
+ CurState = EXITS_DIRTY;
+ continue;
+ }
+
+ // Check for control-flow out of the current function (which might
+ // indirectly execute SSE instructions).
+ if (!IsControlFlow || IsReturnFromX86INTR)
+ continue;
+
+ // If the call won't clobber any YMM register, skip it as well. It usually
+ // happens on helper function calls (such as '_chkstk', '_ftol2') where
+ // standard calling convention is not used (RegMask is not used to mark
+ // register clobbered and register usage (def/imp-def/use) is well-defined
+ // and explicitly specified).
+ if (MI.isCall() && !callClobbersAnyYmmReg(MI))
+ continue;
+
+ // The VZEROUPPER instruction resets the upper 128 bits of all AVX
+ // registers. In addition, the processor changes back to Clean state, after
+ // which execution of SSE instructions or AVX instructions has no transition
+ // penalty. Add the VZEROUPPER instruction before any function call/return
+ // that might execute SSE code.
+ // FIXME: In some cases, we may want to move the VZEROUPPER into a
+ // predecessor block.
+ if (CurState == EXITS_DIRTY) {
+ // After the inserted VZEROUPPER the state becomes clean again, but
+ // other YMM may appear before other subsequent calls or even before
+ // the end of the BB.
+ insertVZeroUpper(MI, MBB);
+ CurState = EXITS_CLEAN;
+ } else if (CurState == PASS_THROUGH) {
+ // If this block is currently in pass-through state and we encounter a
+ // call then whether we need a vzeroupper or not depends on whether this
+ // block is a successor of a block that exits dirty. Record the location
+ // of the call, and set the state to EXITS_CLEAN, but do not insert the
+ // vzeroupper yet. It will be inserted later if necessary.
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = MI;
+ CurState = EXITS_CLEAN;
+ }
+ // Note: when CurState is already EXITS_CLEAN nothing is done for this call
+ // or return.
+ }
+
+ DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
+ << getBlockExitStateName(CurState) << '\n');
+
+ // A dirty exit taints every successor; queue them for the second phase in
+ // runOnMachineFunction().
+ if (CurState == EXITS_DIRTY)
+ for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end();
+ SI != SE; ++SI)
+ addDirtySuccessor(**SI);
+
+ BlockStates[MBB.getNumber()].ExitState = CurState;
+}
+
+/// Loop over all of the basic blocks, inserting vzeroupper instructions before
+/// function calls.
+///
+/// Phase 1 classifies every block with processBasicBlock(). Phase 2 drains
+/// the DirtySuccessors worklist, guarding each queued block's first unguarded
+/// call and propagating dirtiness through PASS_THROUGH blocks.
+/// Returns true if at least one VZEROUPPER was inserted.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ // Bail out when the subtarget lacks AVX, or when it has AVX-512 or reports
+ // fast partial YMM writes.
+ if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
+ return false;
+ TII = ST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ EverMadeChange = false;
+ IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
+
+ bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
+ // Fast check: if the function doesn't use any ymm registers, we don't need
+ // to insert any VZEROUPPER instructions. This is constant-time, so it is
+ // cheap in the common case of no ymm use.
+ bool YMMUsed = FnHasLiveInYmm;
+ if (!YMMUsed) {
+ const TargetRegisterClass *RC = &X86::VR256RegClass;
+ for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+ i++) {
+ if (!MRI.reg_nodbg_empty(*i)) {
+ YMMUsed = true;
+ break;
+ }
+ }
+ }
+ if (!YMMUsed) {
+ return false;
+ }
+
+ assert(BlockStates.empty() && DirtySuccessors.empty() &&
+ "X86VZeroUpper state should be clear");
+ // One BlockState slot per block ID; blocks are looked up by getNumber().
+ BlockStates.resize(MF.getNumBlockIDs());
+
+ // Process all blocks. This will compute block exit states, record the first
+ // unguarded call in each block, and add successors of dirty blocks to the
+ // DirtySuccessors list.
+ for (MachineBasicBlock &MBB : MF)
+ processBasicBlock(MBB);
+
+ // If any YMM regs are live-in to this function, add the entry block to the
+ // DirtySuccessors list
+ if (FnHasLiveInYmm)
+ addDirtySuccessor(MF.front());
+
+ // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
+ // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
+ // through PASS_THROUGH blocks.
+ while (!DirtySuccessors.empty()) {
+ MachineBasicBlock &MBB = *DirtySuccessors.back();
+ DirtySuccessors.pop_back();
+ BlockState &BBState = BlockStates[MBB.getNumber()];
+
+ // MBB is a successor of a dirty block, so its first call needs to be
+ // guarded.
+ if (BBState.FirstUnguardedCall != MBB.end())
+ insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
+
+ // If this successor was a pass-through block, then it is now dirty. Its
+ // successors need to be added to the worklist (if they haven't been
+ // already).
+ if (BBState.ExitState == PASS_THROUGH) {
+ DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+ << " was Pass-through, is now Dirty-out.\n");
+ for (MachineBasicBlock *Succ : MBB.successors())
+ addDirtySuccessor(*Succ);
+ }
+ }
+
+ // The worklist loop above has already emptied DirtySuccessors; clearing
+ // BlockStates restores the "state should be clear" precondition asserted at
+ // the top for the next function.
+ BlockStates.clear();
+ return EverMadeChange;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
new file mode 100644
index 000000000000..fc08f1582ad7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -0,0 +1,295 @@
+//===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that expands WinAlloca pseudo-instructions.
+//
+// It performs a conservative analysis to determine whether each allocation
+// falls within a region of the stack that is safe to use, or whether stack
+// probes must be emitted.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Machine-function pass that replaces WinAlloca pseudo-instructions with a
+/// push, a stack-pointer subtraction, or a stack probe.
+class X86WinAllocaExpander : public MachineFunctionPass {
+public:
+  X86WinAllocaExpander() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  /// The ways a single WinAlloca can be lowered.
+  enum Lowering { TouchAndSub, Sub, Probe };
+
+  /// WinAlloca instruction -> chosen lowering, iterated in insertion order.
+  using LoweringMap = MapVector<MachineInstr*, Lowering>;
+
+  /// Fill \p Lowerings with one entry per WinAlloca instruction in \p MF.
+  void computeLowerings(MachineFunction &MF, LoweringMap& Lowerings);
+
+  /// Pick a lowering from the current stack offset and the allocation size.
+  Lowering getLowering(int64_t CurrentOffset, int64_t AllocaAmount);
+
+  /// Expand one WinAlloca instruction using lowering \p L.
+  void lower(MachineInstr* MI, Lowering L);
+
+  // Cached per-function context, refreshed by runOnMachineFunction().
+  MachineRegisterInfo *MRI;
+  const X86Subtarget *STI;
+  const TargetInstrInfo *TII;
+  const X86RegisterInfo *TRI;
+  unsigned StackPtr;
+  unsigned SlotSize;
+  int64_t StackProbeSize;
+
+  StringRef getPassName() const override { return "X86 WinAlloca Expander"; }
+  static char ID;
+};
+
+char X86WinAllocaExpander::ID = 0;
+
+} // end anonymous namespace
+
+/// Factory: allocate a fresh X86WinAllocaExpander; the caller receives the
+/// raw pointer.
+FunctionPass *llvm::createX86WinAllocaExpander() {
+  FunctionPass *Expander = new X86WinAllocaExpander();
+  return Expander;
+}
+
+/// Determine the constant size requested by the WinAlloca \p MI.
+///
+/// The size register's unique definition is followed through register-to-
+/// register copies; if the chain ends in a MOV32ri/MOV64ri with an immediate
+/// operand, that immediate is returned. In every other case the result is -1,
+/// meaning "unknown".
+static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
+  assert(MI->getOpcode() == X86::WIN_ALLOCA_32 ||
+         MI->getOpcode() == X86::WIN_ALLOCA_64);
+  assert(MI->getOperand(0).isReg());
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(MI->getOperand(0).getReg());
+
+  // Skip over any chain of copies feeding the amount register.
+  for (; Def && Def->isCopy() && Def->getOperand(1).isReg();
+       Def = MRI->getUniqueVRegDef(Def->getOperand(1).getReg()))
+    ;
+
+  if (!Def)
+    return -1;
+
+  const unsigned Opc = Def->getOpcode();
+  if (Opc != X86::MOV32ri && Opc != X86::MOV64ri)
+    return -1;
+
+  const MachineOperand &Src = Def->getOperand(1);
+  if (!Src.isImm())
+    return -1;
+  return Src.getImm();
+}
+
+/// Choose how to lower an allocation of \p AllocaAmount bytes when the stack
+/// pointer is \p CurrentOffset bytes past the last touched stack location.
+X86WinAllocaExpander::Lowering
+X86WinAllocaExpander::getLowering(int64_t CurrentOffset,
+                                  int64_t AllocaAmount) {
+  // Unknown (negative) or oversized amounts always need a real probe.
+  const bool AmountUnknown = AllocaAmount < 0;
+  if (AmountUnknown || AllocaAmount > StackProbeSize)
+    return Probe;
+
+  // Within the safe region a plain subtraction suffices; otherwise the
+  // current stack tip is touched first.
+  const bool StaysInSafeRegion =
+      CurrentOffset + AllocaAmount <= StackProbeSize;
+  return StaysInSafeRegion ? Sub : TouchAndSub;
+}
+
+/// True if \p MI is one of the listed 32/64-bit PUSH or POP opcodes.
+static bool isPushPop(const MachineInstr &MI) {
+  bool Matches = false;
+  switch (MI.getOpcode()) {
+  // 32-bit pushes.
+  case X86::PUSH32i8:
+  case X86::PUSH32r:
+  case X86::PUSH32rmm:
+  case X86::PUSH32rmr:
+  case X86::PUSHi32:
+  // 64-bit pushes.
+  case X86::PUSH64i8:
+  case X86::PUSH64r:
+  case X86::PUSH64rmm:
+  case X86::PUSH64rmr:
+  case X86::PUSH64i32:
+  // Pops.
+  case X86::POP32r:
+  case X86::POP64r:
+    Matches = true;
+    break;
+  default:
+    break;
+  }
+  return Matches;
+}
+
+/// Decide, for every WinAlloca instruction in \p MF, which lowering to use and
+/// record the decision in \p Lowerings.
+///
+/// "Offset" below is the conservative distance between the stack pointer and
+/// the lowest stack location known to have been touched; INT32_MAX stands for
+/// "unknown / assume the worst".
+void X86WinAllocaExpander::computeLowerings(MachineFunction &MF,
+ LoweringMap &Lowerings) {
+ // Do a one-pass reverse post-order walk of the CFG to conservatively estimate
+ // the offset between the stack pointer and the lowest touched part of the
+ // stack, and use that to decide how to lower each WinAlloca instruction.
+
+ // Initialize OutOffset[B], the stack offset at exit from B, to something big.
+ DenseMap<MachineBasicBlock *, int64_t> OutOffset;
+ for (MachineBasicBlock &MBB : MF)
+ OutOffset[&MBB] = INT32_MAX;
+
+ // Note: we don't know the offset at the start of the entry block since the
+ // prologue hasn't been inserted yet, and how much that will adjust the stack
+ // pointer depends on register spills, which have not been computed yet.
+
+ // Compute the reverse post-order.
+ ReversePostOrderTraversal<MachineFunction*> RPO(&MF);
+
+ for (MachineBasicBlock *MBB : RPO) {
+ // Entry offset is the largest exit offset over all predecessors.
+ // Predecessors not yet visited in this single pass still hold the initial
+ // INT32_MAX. -1 is a sentinel for "no predecessor", mapped to INT32_MAX.
+ int64_t Offset = -1;
+ for (MachineBasicBlock *Pred : MBB->predecessors())
+ Offset = std::max(Offset, OutOffset[Pred]);
+ if (Offset == -1) Offset = INT32_MAX;
+
+ for (MachineInstr &MI : *MBB) {
+ if (MI.getOpcode() == X86::WIN_ALLOCA_32 ||
+ MI.getOpcode() == X86::WIN_ALLOCA_64) {
+ // A WinAlloca moves StackPtr, and potentially touches it.
+ int64_t Amount = getWinAllocaAmount(&MI, MRI);
+ Lowering L = getLowering(Offset, Amount);
+ Lowerings[&MI] = L;
+ switch (L) {
+ case Sub:
+ // Untouched region grows by the allocated amount.
+ Offset += Amount;
+ break;
+ case TouchAndSub:
+ // The tip is touched first, then Amount bytes are allocated below it.
+ Offset = Amount;
+ break;
+ case Probe:
+ // Tracking restarts from zero after a probe.
+ Offset = 0;
+ break;
+ }
+ } else if (MI.isCall() || isPushPop(MI)) {
+ // Calls, pushes and pops touch the tip of the stack.
+ Offset = 0;
+ } else if (MI.getOpcode() == X86::ADJCALLSTACKUP32 ||
+ MI.getOpcode() == X86::ADJCALLSTACKUP64) {
+ Offset -= MI.getOperand(0).getImm();
+ } else if (MI.getOpcode() == X86::ADJCALLSTACKDOWN32 ||
+ MI.getOpcode() == X86::ADJCALLSTACKDOWN64) {
+ Offset += MI.getOperand(0).getImm();
+ } else if (MI.modifiesRegister(StackPtr, TRI)) {
+ // Any other modification of SP means we've lost track of it.
+ Offset = INT32_MAX;
+ }
+ }
+
+ OutOffset[MBB] = Offset;
+ }
+}
+
+/// Select the SUB-immediate opcode for the given pointer width, preferring
+/// the 8-bit-immediate form whenever \p Amount fits in a signed byte.
+static unsigned getSubOpcode(bool Is64Bit, int64_t Amount) {
+  const bool FitsInImm8 = isInt<8>(Amount);
+  if (!Is64Bit)
+    return FitsInImm8 ? X86::SUB32ri8 : X86::SUB32ri;
+  return FitsInImm8 ? X86::SUB64ri8 : X86::SUB64ri32;
+}
+
+/// Expand the WinAlloca pseudo-instruction \p MI using lowering \p L, then
+/// erase \p MI and, if nothing else uses it, the instruction defining its
+/// amount register.
+///
+///  - TouchAndSub: push RAX/EAX (undef) to touch the stack tip, then fall
+///    through to Sub for whatever remains of the amount.
+///  - Sub: a push when exactly one slot is requested, otherwise a SUB of the
+///    immediate amount from the stack pointer.
+///  - Probe: copy the amount into RAX/EAX and let the frame lowering emit the
+///    stack probe.
+///
+/// A WinAlloca whose amount is the constant 0 is simply deleted.
+void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineBasicBlock::iterator I = *MI;
+
+  int64_t Amount = getWinAllocaAmount(MI, MRI);
+  if (Amount == 0) {
+    MI->eraseFromParent();
+    return;
+  }
+
+  bool Is64Bit = STI->is64Bit();
+  assert(SlotSize == 4 || SlotSize == 8);
+  unsigned RegA = (SlotSize == 8) ? X86::RAX : X86::EAX;
+
+  switch (L) {
+  case TouchAndSub:
+    assert(Amount >= SlotSize);
+
+    // Use a push to touch the top of the stack.
+    BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+        .addReg(RegA, RegState::Undef);
+    Amount -= SlotSize;
+    if (!Amount)
+      break;
+
+    // Fall through to make any remaining adjustment.
+    LLVM_FALLTHROUGH;
+  case Sub:
+    assert(Amount > 0);
+    if (Amount == SlotSize) {
+      // Use push to save size.
+      BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+          .addReg(RegA, RegState::Undef);
+    } else {
+      // Sub.
+      BuildMI(*MBB, I, DL, TII->get(getSubOpcode(Is64Bit, Amount)), StackPtr)
+          .addReg(StackPtr)
+          .addImm(Amount);
+    }
+    break;
+  case Probe:
+    // The probe lowering expects the amount in RAX/EAX.
+    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
+        .addReg(MI->getOperand(0).getReg());
+
+    // Do the probe.
+    STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
+                                            /*InPrologue=*/false);
+    break;
+  }
+
+  unsigned AmountReg = MI->getOperand(0).getReg();
+  MI->eraseFromParent();
+
+  // Delete the definition of AmountReg if the WinAlloca was its last user.
+  // The previous loop claimed to walk a chain of copies, but it stored the
+  // boolean result of isReg() into AmountReg and then broke out
+  // unconditionally, so it only ever erased this one definition. Keep exactly
+  // that effect without the misleading, wrong-register code.
+  if (MRI->use_empty(AmountReg))
+    if (MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg))
+      AmountDef->eraseFromParent();
+}
+
+/// Pass entry point. Does nothing (and returns false) unless the function's
+/// X86MachineFunctionInfo reports a WinAlloca; otherwise caches the target
+/// context, reads the probe size, and lowers every WinAlloca instruction.
+bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
+  const X86MachineFunctionInfo *FuncInfo =
+      MF.getInfo<X86MachineFunctionInfo>();
+  if (!FuncInfo->hasWinAlloca())
+    return false;
+
+  // Cache the per-function target hooks used by the helpers.
+  MRI = &MF.getRegInfo();
+  STI = &MF.getSubtarget<X86Subtarget>();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  StackPtr = TRI->getStackRegister();
+  SlotSize = TRI->getSlotSize();
+
+  // Default probe size, optionally overridden by a function attribute.
+  StackProbeSize = 4096;
+  const Function *Fn = MF.getFunction();
+  if (Fn->hasFnAttribute("stack-probe-size")) {
+    StringRef SizeText =
+        Fn->getFnAttribute("stack-probe-size").getValueAsString();
+    SizeText.getAsInteger(0, StackProbeSize);
+  }
+
+  // First decide all lowerings, then apply them in map order.
+  LoweringMap Decisions;
+  computeLowerings(MF, Decisions);
+  for (auto &Entry : Decisions)
+    lower(Entry.first, Entry.second);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
new file mode 100644
index 000000000000..bc14630584e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -0,0 +1,796 @@
+//===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// All functions using an MSVC EH personality use an explicitly updated state
+// number stored in an exception registration stack object. The registration
+// object is linked into a thread-local chain of registrations stored at fs:00.
+// This pass adds the registration object and EH state updates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include <deque>
+
+using namespace llvm;
+
+// Debug category string for this file. NOTE(review): presumably the tag used
+// to filter debug output -- confirm against llvm/Support/Debug.h.
+#define DEBUG_TYPE "winehstate"
+
+// Forward declaration of the registration hook that the WinEHStatePass
+// constructor calls.
+namespace llvm {
+void initializeWinEHStatePassPass(PassRegistry &);
+}
+
+namespace {
+// Sentinel state value. NOTE(review): its users lie outside this excerpt;
+// the name suggests a lattice "overdefined" marker -- confirm.
+const int OverdefinedState = INT_MIN;
+
+/// IR-level function pass that adds the exception registration record and
+/// the EH state-number stores for functions using an MSVC EH personality.
+class WinEHStatePass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ WinEHStatePass() : FunctionPass(ID) {
+ initializeWinEHStatePassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &Fn) override;
+
+ // Records the module in TheModule.
+ bool doInitialization(Module &M) override;
+
+ // Drops the cached module pointer, types and runtime function handles.
+ bool doFinalization(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ StringRef getPassName() const override {
+ return "Windows 32-bit x86 EH state insertion";
+ }
+
+private:
+ // Allocates RegNode in the entry block, initializes it, links it into the
+ // registration chain and unlinks it before every return.
+ void emitExceptionRegistrationRecord(Function *F);
+
+ void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler);
+ void unlinkExceptionRegistration(IRBuilder<> &Builder);
+ void addStateStores(Function &F, WinEHFuncInfo &FuncInfo);
+ void insertStateNumberStore(Instruction *IP, int State);
+
+ // Emits a call to llvm.x86.seh.lsda on F cast to i8*.
+ Value *emitEHLSDA(IRBuilder<> &Builder, Function *F);
+
+ Function *generateLSDAInEAXThunk(Function *ParentFunc);
+
+ bool isStateStoreNeeded(EHPersonality Personality, CallSite CS);
+ void rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F, CallSite CS,
+ Value *State);
+ int getBaseStateForBB(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ WinEHFuncInfo &FuncInfo, BasicBlock *BB);
+ int getStateForCallSite(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ WinEHFuncInfo &FuncInfo, CallSite CS);
+
+ // Module-level type getters.
+ // Each lazily creates its struct type on first use and caches it in the
+ // matching member below.
+ Type *getEHLinkRegistrationType();
+ Type *getSEHRegistrationType();
+ Type *getCXXEHRegistrationType();
+
+ // Per-module data.
+ Module *TheModule = nullptr;
+ StructType *EHLinkRegistrationTy = nullptr;
+ StructType *CXXEHRegistrationTy = nullptr;
+ StructType *SEHRegistrationTy = nullptr;
+ Constant *SetJmp3 = nullptr;
+ Constant *CxxLongjmpUnwind = nullptr;
+
+ // Per-function state
+ EHPersonality Personality = EHPersonality::Unknown;
+ Function *PersonalityFn = nullptr;
+ // True when the SEH personality function is named "_except_handler4".
+ bool UseStackGuard = false;
+ // Initial TryLevel stored into RegNode: -1 for MSVC C++ EH; for SEH it is
+ // -2 when UseStackGuard is set and -1 otherwise. Not initialized here; it is
+ // assigned in emitExceptionRegistrationRecord().
+ int ParentBaseState;
+ Constant *SehLongjmpUnwind = nullptr;
+ Constant *Cookie = nullptr;
+
+ /// The stack allocation containing all EH data, including the link in the
+ /// fs:00 chain and the current state.
+ AllocaInst *RegNode = nullptr;
+
+ // The allocation containing the EH security guard.
+ AllocaInst *EHGuardNode = nullptr;
+
+ /// The index of the state field of RegNode.
+ /// Set to 2 for MSVC C++ EH and 4 for SEH by
+ /// emitExceptionRegistrationRecord(); ~0U until then.
+ int StateFieldIndex = ~0U;
+
+ /// The linked list node subobject inside of RegNode.
+ Value *Link = nullptr;
+};
+}
+
+/// Factory: allocate a fresh WinEHStatePass; the caller receives the raw
+/// pointer.
+FunctionPass *llvm::createX86WinEHStatePass() {
+  FunctionPass *StatePass = new WinEHStatePass();
+  return StatePass;
+}
+
+// Storage for the pass identifier handed to the FunctionPass constructor.
+char WinEHStatePass::ID = 0;
+
+// Registers the pass under the argument name "x86-winehstate".
+// NOTE(review): the two trailing flags are presumably "CFG only" and
+// "is analysis" -- confirm against llvm/PassSupport.h.
+INITIALIZE_PASS(WinEHStatePass, "x86-winehstate",
+ "Insert stores for EH state numbers", false, false)
+
+/// Remember the module being processed. Returns false.
+bool WinEHStatePass::doInitialization(Module &M) {
+  Module *const Current = &M;
+  TheModule = Current;
+  return false;
+}
+
+/// Forget everything cached for module \p M, which must be the module seen by
+/// doInitialization(). Returns false.
+bool WinEHStatePass::doFinalization(Module &M) {
+  assert(TheModule == &M);
+
+  // Cached struct types.
+  EHLinkRegistrationTy = nullptr;
+  CXXEHRegistrationTy = nullptr;
+  SEHRegistrationTy = nullptr;
+
+  // Cached runtime function / global handles.
+  SetJmp3 = nullptr;
+  CxxLongjmpUnwind = nullptr;
+  SehLongjmpUnwind = nullptr;
+  Cookie = nullptr;
+
+  TheModule = nullptr;
+  return false;
+}
+
+/// Declare that the CFG is preserved.
+void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const {
+  // The only IR this pass is expected to add is a stack allocation, memory
+  // accesses, and localrecovers.
+  AU.setPreservesCFG();
+}
+
+/// Pass entry point. For a function whose personality is a funclet-based EH
+/// personality and which contains at least one EH pad: declare _setjmp3,
+/// force a frame pointer, emit the registration record and add the state
+/// stores. Returns true in that case and false if the function was skipped.
+bool WinEHStatePass::runOnFunction(Function &F) {
+  // Only functions with a directly known, funclet-using personality qualify.
+  if (!F.hasPersonalityFn())
+    return false;
+  PersonalityFn =
+      dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
+  if (!PersonalityFn)
+    return false;
+  Personality = classifyEHPersonality(PersonalityFn);
+  if (!isFuncletEHPersonality(Personality))
+    return false;
+
+  // Skip this function if there are no EH pads and we aren't using IR-level
+  // outlining.
+  bool HasPads = false;
+  for (Function::iterator BBI = F.begin(), BBE = F.end();
+       BBI != BBE && !HasPads; ++BBI)
+    HasPads = BBI->isEHPad();
+  if (!HasPads)
+    return false;
+
+  // Declare: i32 _setjmp3(i8*, i32, ...).
+  LLVMContext &Ctx = TheModule->getContext();
+  Type *Int8PtrType = Type::getInt8PtrTy(Ctx);
+  Type *Int32Type = Type::getInt32Ty(Ctx);
+  FunctionType *SetJmp3Ty = FunctionType::get(
+      Int32Type, {Int8PtrType, Int32Type}, /*isVarArg=*/true);
+  SetJmp3 = TheModule->getOrInsertFunction("_setjmp3", SetJmp3Ty);
+
+  // Disable frame pointer elimination in this function.
+  // FIXME: Do the nested handlers need to keep the parent ebp in ebp, or can we
+  // use an arbitrary register?
+  F.addFnAttr("no-frame-pointer-elim", "true");
+
+  emitExceptionRegistrationRecord(&F);
+
+  // The state numbers calculated here in IR must agree with what we calculate
+  // later on for the MachineFunction. In particular, if an IR pass deletes an
+  // unreachable EH pad after this point before machine CFG construction, we
+  // will be in trouble. If this assumption is ever broken, we should turn the
+  // numbers into an immutable analysis pass.
+  WinEHFuncInfo FuncInfo;
+  addStateStores(F, FuncInfo);
+
+  // Reset per-function state.
+  RegNode = nullptr;
+  EHGuardNode = nullptr;
+  UseStackGuard = false;
+  Personality = EHPersonality::Unknown;
+  PersonalityFn = nullptr;
+
+  return true;
+}
+
+/// Lazily build and cache the struct type shared by all registration
+/// records:
+///   typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)(
+///       _EXCEPTION_RECORD *, void *, _CONTEXT *, void *);
+///   struct EHRegistrationNode {
+///     EHRegistrationNode *Next;
+///     PEXCEPTION_ROUTINE Handler;
+///   };
+Type *WinEHStatePass::getEHLinkRegistrationType() {
+  if (!EHLinkRegistrationTy) {
+    LLVMContext &Ctx = TheModule->getContext();
+    // Create the named struct first so that Next can point back at it.
+    EHLinkRegistrationTy = StructType::create(Ctx, "EHRegistrationNode");
+    Type *Members[] = {
+        EHLinkRegistrationTy->getPointerTo(0), // EHRegistrationNode *Next
+        Type::getInt8PtrTy(Ctx) // EXCEPTION_DISPOSITION (*Handler)(...)
+    };
+    EHLinkRegistrationTy->setBody(Members, false);
+  }
+  return EHLinkRegistrationTy;
+}
+
+/// Lazily build and cache the __CxxFrameHandler3 registration node type:
+///   struct CXXExceptionRegistration {
+///     void *SavedESP;
+///     EHRegistrationNode SubRecord;
+///     int32_t TryLevel;
+///   };
+Type *WinEHStatePass::getCXXEHRegistrationType() {
+  if (!CXXEHRegistrationTy) {
+    LLVMContext &Ctx = TheModule->getContext();
+    Type *Members[] = {
+        Type::getInt8PtrTy(Ctx),     // void *SavedESP
+        getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
+        Type::getInt32Ty(Ctx)        // int32_t TryLevel
+    };
+    CXXEHRegistrationTy =
+        StructType::create(Members, "CXXExceptionRegistration");
+  }
+  return CXXEHRegistrationTy;
+}
+
+/// Lazily build and cache the _except_handler3/4 registration node type:
+///   struct EH4ExceptionRegistration {
+///     void *SavedESP;
+///     _EXCEPTION_POINTERS *ExceptionPointers;
+///     EHRegistrationNode SubRecord;
+///     int32_t EncodedScopeTable;
+///     int32_t TryLevel;
+///   };
+Type *WinEHStatePass::getSEHRegistrationType() {
+  if (!SEHRegistrationTy) {
+    LLVMContext &Ctx = TheModule->getContext();
+    Type *Members[] = {
+        Type::getInt8PtrTy(Ctx),     // void *SavedESP
+        Type::getInt8PtrTy(Ctx),     // void *ExceptionPointers
+        getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
+        Type::getInt32Ty(Ctx),       // int32_t EncodedScopeTable
+        Type::getInt32Ty(Ctx)        // int32_t TryLevel
+    };
+    SEHRegistrationTy =
+        StructType::create(Members, "SEHExceptionRegistration");
+  }
+  return SEHRegistrationTy;
+}
+
+// Emit an exception registration record. These are stack allocations with the
+// common subobject of two pointers: the previous registration record (the old
+// fs:00) and the personality function for the current frame. The data before
+// and after that is personality function specific.
+//
+// On return RegNode, Link, StateFieldIndex and ParentBaseState are set (plus
+// EHGuardNode and Cookie when UseStackGuard is set), the record has been
+// linked in at the top of the entry block, and an unlink has been inserted
+// before every return instruction of F.
+void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
+ assert(Personality == EHPersonality::MSVC_CXX ||
+ Personality == EHPersonality::MSVC_X86SEH);
+
+ // Struct type of RegNode. Used for GEPing.
+ Type *RegNodeTy;
+
+ // All setup code is inserted at the very start of the entry block.
+ IRBuilder<> Builder(&F->getEntryBlock(), F->getEntryBlock().begin());
+ Type *Int8PtrType = Builder.getInt8PtrTy();
+ Type *Int32Ty = Builder.getInt32Ty();
+ Type *VoidTy = Builder.getVoidTy();
+
+ if (Personality == EHPersonality::MSVC_CXX) {
+ RegNodeTy = getCXXEHRegistrationType();
+ RegNode = Builder.CreateAlloca(RegNodeTy);
+ // SavedESP = llvm.stacksave()
+ Value *SP = Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
+ Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+ // TryLevel = -1
+ StateFieldIndex = 2;
+ ParentBaseState = -1;
+ insertStateNumberStore(&*Builder.GetInsertPoint(), ParentBaseState);
+ // Handler = __ehhandler$F
+ Function *Trampoline = generateLSDAInEAXThunk(F);
+ Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
+ linkExceptionRegistration(Builder, Trampoline);
+
+ // Declare void __CxxLongjmpUnwind(i8*) with the stdcall convention.
+ CxxLongjmpUnwind = TheModule->getOrInsertFunction(
+ "__CxxLongjmpUnwind",
+ FunctionType::get(VoidTy, Int8PtrType, /*isVarArg=*/false));
+ cast<Function>(CxxLongjmpUnwind->stripPointerCasts())
+ ->setCallingConv(CallingConv::X86_StdCall);
+ } else if (Personality == EHPersonality::MSVC_X86SEH) {
+ // If _except_handler4 is in use, some additional guard checks and prologue
+ // stuff is required.
+ StringRef PersonalityName = PersonalityFn->getName();
+ UseStackGuard = (PersonalityName == "_except_handler4");
+
+ // Allocate local structures.
+ RegNodeTy = getSEHRegistrationType();
+ RegNode = Builder.CreateAlloca(RegNodeTy);
+ if (UseStackGuard)
+ EHGuardNode = Builder.CreateAlloca(Int32Ty);
+
+ // SavedESP = llvm.stacksave()
+ Value *SP = Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
+ Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+ // TryLevel = -2 / -1
+ StateFieldIndex = 4;
+ ParentBaseState = UseStackGuard ? -2 : -1;
+ insertStateNumberStore(&*Builder.GetInsertPoint(), ParentBaseState);
+ // ScopeTable = llvm.x86.seh.lsda(F)
+ Value *LSDA = emitEHLSDA(Builder, F);
+ LSDA = Builder.CreatePtrToInt(LSDA, Int32Ty);
+ // If using _except_handler4, xor the address of the table with
+ // __security_cookie.
+ if (UseStackGuard) {
+ Cookie = TheModule->getOrInsertGlobal("__security_cookie", Int32Ty);
+ Value *Val = Builder.CreateLoad(Int32Ty, Cookie, "cookie");
+ LSDA = Builder.CreateXor(LSDA, Val);
+ }
+ // Field 3 is EncodedScopeTable in the SEH registration struct.
+ Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 3));
+
+ // If using _except_handler4, the EHGuard contains: FramePtr xor Cookie.
+ if (UseStackGuard) {
+ Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
+ Value *FrameAddr = Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress),
+ Builder.getInt32(0), "frameaddr");
+ Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty);
+ FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val);
+ Builder.CreateStore(FrameAddrI32, EHGuardNode);
+ }
+
+ // Register the exception handler.
+ Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 2);
+ linkExceptionRegistration(Builder, PersonalityFn);
+
+ // Declare the matching stdcall longjmp-unwind helper; the "4" variant is
+ // used together with _except_handler4.
+ SehLongjmpUnwind = TheModule->getOrInsertFunction(
+ UseStackGuard ? "_seh_longjmp_unwind4" : "_seh_longjmp_unwind",
+ FunctionType::get(Type::getVoidTy(TheModule->getContext()), Int8PtrType,
+ /*isVarArg=*/false));
+ cast<Function>(SehLongjmpUnwind->stripPointerCasts())
+ ->setCallingConv(CallingConv::X86_StdCall);
+ } else {
+ llvm_unreachable("unexpected personality function");
+ }
+
+ // Insert an unlink before all returns.
+ for (BasicBlock &BB : *F) {
+ TerminatorInst *T = BB.getTerminator();
+ if (!isa<ReturnInst>(T))
+ continue;
+ Builder.SetInsertPoint(T);
+ unlinkExceptionRegistration(Builder);
+ }
+}
+
+// Emits, at Builder's insertion point, a bitcast of F to i8* followed by a
+// call to the llvm.x86.seh.lsda intrinsic on that pointer. Returns the call.
+Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) {
+  Type *BytePtrTy = Type::getInt8PtrTy(F->getContext());
+  Value *FnAsBytePtr = Builder.CreateBitCast(F, BytePtrTy);
+  Function *LSDAIntrinsic =
+      Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda);
+  return Builder.CreateCall(LSDAIntrinsic, FnAsBytePtr);
+}
+
+/// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls
+/// PersonalityFn, forwarding the parameters passed to PEXCEPTION_ROUTINE:
+/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)(
+/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *);
+/// We essentially want this code:
+/// movl $lsda, %eax
+/// jmpl ___CxxFrameHandler3
+Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { // Returns the newly created thunk function.
+ LLVMContext &Context = ParentFunc->getContext();
+ Type *Int32Ty = Type::getInt32Ty(Context);
+ Type *Int8PtrType = Type::getInt8PtrTy(Context);
+ Type *ArgTys[5] = {Int8PtrType, Int8PtrType, Int8PtrType, Int8PtrType, // Five i8* slots shared by both function types below.
+ Int8PtrType};
+ FunctionType *TrampolineTy = // Thunk signature: i32 (i8*, i8*, i8*, i8*).
+ FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 4),
+ /*isVarArg=*/false);
+ FunctionType *TargetFuncTy = // Signature PersonalityFn is cast to: the same four plus a leading LSDA pointer.
+ FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5),
+ /*isVarArg=*/false);
+ Function *Trampoline = // Internal-linkage function named "__ehhandler$" + the parent's real linkage name.
+ Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
+ Twine("__ehhandler$") + GlobalValue::getRealLinkageName(
+ ParentFunc->getName()),
+ TheModule);
+ BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
+ IRBuilder<> Builder(EntryBB);
+ Value *LSDA = emitEHLSDA(Builder, ParentFunc);
+ Value *CastPersonality =
+ Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo());
+ auto AI = Trampoline->arg_begin();
+ Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++}; // Braced-init elements are evaluated left to right, so the thunk's four parameters are forwarded in order after the LSDA.
+ CallInst *Call = Builder.CreateCall(CastPersonality, Args);
+ // Can't use musttail due to prototype mismatch, but we can use tail.
+ Call->setTailCall(true);
+ // Set inreg so we pass it in EAX.
+ Call->addAttribute(1, Attribute::InReg); // NOTE(review): index 1 presumably denotes the first call argument (the LSDA) - confirm against the attribute indexing of this LLVM version.
+ Builder.CreateRet(Call);
+ return Trampoline;
+}
+/* Links the registration record addressed by the member Link at the head of
+ * the chain rooted at [fs:00], emitting the code at Builder's insertion point.
+ *
+ * Builder - receives the bitcast, GEPs, load and stores emitted below.
+ * Handler - function stored (as i8*) into field 1 of the record; it is also
+ *           tagged with the "safeseh" function attribute.
+ *
+ * Field 0 of the record receives the previous [fs:00] value, after which
+ * [fs:00] is overwritten with Link.
+ */
+void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder,
+ Function *Handler) {
+ // Emit the .safeseh directive for this function.
+ Handler->addFnAttr("safeseh");
+
+ Type *LinkTy = getEHLinkRegistrationType();
+ // Handler = Handler
+ Value *HandlerI8 = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy());
+ Builder.CreateStore(HandlerI8, Builder.CreateStructGEP(LinkTy, Link, 1));
+ // Next = [fs:00]
+ Constant *FSZero = // Null pointer in address space 257; it stands for the [fs:00] slot.
+ Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
+ Value *Next = Builder.CreateLoad(FSZero);
+ Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0));
+ // [fs:00] = Link
+ Builder.CreateStore(Link, FSZero);
+}
+/* Emits, at Builder's insertion point, the code that pops this function's
+ * registration record: field 0 of the record addressed by Link is loaded and
+ * stored to [fs:00].
+ *
+ * Side effect: when Link is a GetElementPtrInst, a clone of it is inserted at
+ * the insertion point and the member Link is re-pointed at that clone, so
+ * later calls start from the most recent clone.
+ */
+void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
+ // Clone Link into the current BB for better address mode folding.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Link)) {
+ GEP = cast<GetElementPtrInst>(GEP->clone());
+ Builder.Insert(GEP);
+ Link = GEP;
+ }
+ Type *LinkTy = getEHLinkRegistrationType();
+ // [fs:00] = Link->Next
+ Value *Next =
+ Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0));
+ Constant *FSZero = // Null pointer in address space 257; it stands for the [fs:00] slot.
+ Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
+ Builder.CreateStore(Next, FSZero);
+}
+
+// Calls to setjmp(p) are lowered to _setjmp3(p, 0) by the frontend.
+// The idea behind _setjmp3 is that it takes an optional number of personality
+// specific parameters to indicate how to restore the personality-specific frame
+// state when longjmp is initiated. Typically, the current TryLevel is saved.
+void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
+ CallSite CS, Value *State) { // State: value passed as the saved state number; CS is replaced and erased.
+ // Don't rewrite calls with a weird number of arguments.
+ if (CS.getNumArgOperands() != 2)
+ return;
+
+ Instruction *Inst = CS.getInstruction();
+
+ SmallVector<OperandBundleDef, 1> OpBundles; // Operand bundles are carried over to the replacement call.
+ CS.getOperandBundlesAsDefs(OpBundles);
+
+ SmallVector<Value *, 3> OptionalArgs; // Personality-specific trailing arguments.
+ if (Personality == EHPersonality::MSVC_CXX) {
+ OptionalArgs.push_back(CxxLongjmpUnwind);
+ OptionalArgs.push_back(State);
+ OptionalArgs.push_back(emitEHLSDA(Builder, &F));
+ } else if (Personality == EHPersonality::MSVC_X86SEH) {
+ OptionalArgs.push_back(SehLongjmpUnwind);
+ OptionalArgs.push_back(State);
+ if (UseStackGuard)
+ OptionalArgs.push_back(Cookie);
+ } else {
+ llvm_unreachable("unhandled personality!");
+ }
+
+ SmallVector<Value *, 5> Args; // Final argument list: buffer as i8*, optional-arg count, then the optional args.
+ Args.push_back(
+ Builder.CreateBitCast(CS.getArgOperand(0), Builder.getInt8PtrTy()));
+ Args.push_back(Builder.getInt32(OptionalArgs.size()));
+ Args.append(OptionalArgs.begin(), OptionalArgs.end());
+
+ CallSite NewCS;
+ if (CS.isCall()) {
+ auto *CI = cast<CallInst>(Inst);
+ CallInst *NewCI = Builder.CreateCall(SetJmp3, Args, OpBundles);
+ NewCI->setTailCallKind(CI->getTailCallKind());
+ NewCS = NewCI;
+ } else {
+ auto *II = cast<InvokeInst>(Inst);
+ NewCS = Builder.CreateInvoke(
+ SetJmp3, II->getNormalDest(), II->getUnwindDest(), Args, OpBundles);
+ }
+ NewCS.setCallingConv(CS.getCallingConv()); // Calling convention, attributes and debug location are copied from the original site.
+ NewCS.setAttributes(CS.getAttributes());
+ NewCS->setDebugLoc(CS->getDebugLoc());
+
+ Instruction *NewInst = NewCS.getInstruction();
+ NewInst->takeName(Inst); // The replacement takes over the old name and uses, then the old instruction is erased.
+ Inst->replaceAllUsesWith(NewInst);
+ Inst->eraseFromParent();
+}
+
+// Returns the state that plain (non-invoke) call sites in BB belong to: the
+// base state recorded in FuncInfo.FuncletBaseStateMap for the funclet pad that
+// colors BB, or ParentBaseState when BB's color is not a funclet pad or the
+// pad has no recorded base state.
+int WinEHStatePass::getBaseStateForBB(
+    DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
+    BasicBlock *BB) {
+  auto &BBColors = BlockColors[BB];
+  assert(BBColors.size() == 1 && "multi-color BB not removed by preparation");
+
+  // The block's single color is the entry block of its funclet.
+  Instruction *FirstNonPHI = BBColors.front()->getFirstNonPHI();
+  auto *Pad = dyn_cast<FuncletPadInst>(FirstNonPHI);
+  if (!Pad)
+    return ParentBaseState;
+
+  auto Found = FuncInfo.FuncletBaseStateMap.find(Pad);
+  if (Found == FuncInfo.FuncletBaseStateMap.end())
+    return ParentBaseState;
+  return Found->second;
+}
+
+// Returns the EH state number for the call site CS. Invokes use the state
+// recorded for them in FuncInfo.InvokeStateMap; every other call site gets the
+// base state of the block that contains it.
+int WinEHStatePass::getStateForCallSite(
+    DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
+    CallSite CS) {
+  auto *II = dyn_cast<InvokeInst>(CS.getInstruction());
+  if (!II) {
+    // A plain call has no actions to take after an unwind, so it simply
+    // inherits the base state of its block.
+    return getBaseStateForBB(BlockColors, FuncInfo, CS.getParent());
+  }
+
+  // The invoke's state is that of the EH pad it unwinds to.
+  assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
+  return FuncInfo.InvokeStateMap[II];
+}
+
+// Calculate the intersection of all the FinalStates for a BasicBlock's
+// predecessors.
+static int getPredState(DenseMap<BasicBlock *, int> &FinalStates, Function &F,
+ int ParentBaseState, BasicBlock *BB) { // Returns the common state, or OverdefinedState when none can be established.
+ // The entry block has no predecessors but we know that the prologue always
+ // sets us up with a fixed state.
+ if (&F.getEntryBlock() == BB)
+ return ParentBaseState;
+
+ // This is an EH Pad, conservatively report this basic block as overdefined.
+ if (BB->isEHPad())
+ return OverdefinedState;
+
+ int CommonState = OverdefinedState; // Doubles as "no predecessor seen yet"; a block with no predecessors therefore yields OverdefinedState.
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // We didn't manage to get a state for one of these predecessors,
+ // conservatively report this basic block as overdefined.
+ auto PredEndState = FinalStates.find(PredBB);
+ if (PredEndState == FinalStates.end())
+ return OverdefinedState;
+
+ // This code is reachable via exceptional control flow,
+ // conservatively report this basic block as overdefined.
+ if (isa<CatchReturnInst>(PredBB->getTerminator()))
+ return OverdefinedState;
+
+ int PredState = PredEndState->second;
+ assert(PredState != OverdefinedState &&
+ "overdefined BBs shouldn't be in FinalStates");
+ if (CommonState == OverdefinedState) // First predecessor: adopt its state as the candidate.
+ CommonState = PredState;
+
+ // At least two predecessors have different FinalStates,
+ // conservatively report this basic block as overdefined.
+ if (CommonState != PredState)
+ return OverdefinedState;
+ }
+
+ return CommonState;
+}
+
+// Calculate the intersection of all the InitialStates for a BasicBlock's
+// successors.
+//
+// Returns the single InitialState shared by every successor of BB, or
+// OverdefinedState when that cannot be established: BB ends in a catchret, BB
+// has no successors, some successor has no recorded InitialState or is an EH
+// pad, or two successors disagree. F and ParentBaseState are not used here.
+static int getSuccState(DenseMap<BasicBlock *, int> &InitialStates, Function &F,
+                        int ParentBaseState, BasicBlock *BB) {
+  // This block rejoins normal control flow,
+  // conservatively report this basic block as overdefined.
+  if (isa<CatchReturnInst>(BB->getTerminator()))
+    return OverdefinedState;
+
+  // OverdefinedState doubles as "no successor seen yet".
+  int CommonState = OverdefinedState;
+  for (BasicBlock *SuccBB : successors(BB)) {
+    // We didn't manage to get a state for one of these successors,
+    // conservatively report this basic block as overdefined.
+    auto SuccStartState = InitialStates.find(SuccBB);
+    if (SuccStartState == InitialStates.end())
+      return OverdefinedState;
+
+    // This is an EH Pad, conservatively report this basic block as overdefined.
+    if (SuccBB->isEHPad())
+      return OverdefinedState;
+
+    int SuccState = SuccStartState->second;
+    assert(SuccState != OverdefinedState &&
+           "overdefined BBs shouldn't be in InitialStates");
+    // First successor: adopt its state as the candidate.
+    if (CommonState == OverdefinedState)
+      CommonState = SuccState;
+
+    // At least two successors have different InitialStates,
+    // conservatively report this basic block as overdefined.
+    if (CommonState != SuccState)
+      return OverdefinedState;
+  }
+
+  return CommonState;
+}
+
+// Decides whether the state number must be current when CS executes.
+// A null call site never needs one. Under an asynchronous personality any
+// call site that may access memory needs one; otherwise only call sites that
+// may throw do.
+bool WinEHStatePass::isStateStoreNeeded(EHPersonality Personality,
+                                        CallSite CS) {
+  if (!CS)
+    return false;
+
+  const bool IsHarmless = isAsynchronousEHPersonality(Personality)
+                              ? CS.doesNotAccessMemory()
+                              : CS.doesNotThrow();
+  return !IsHarmless;
+}
+/* Inserts the stores that keep the registration node's state field current.
+ *
+ * Steps, in order:
+ *  1. Emit llvm.x86.seh.ehregnode on RegNode (and llvm.x86.seh.ehguard on
+ *     EHGuardNode when one exists) right after the respective alloca.
+ *  2. Compute state numbers with calculateSEHStateNumbers or
+ *     calculateWinCXXEHStateNumbers, depending on the personality.
+ *  3. Walk the blocks in reverse post-order and record the state of each
+ *     block's first and last state-relevant call site (InitialStates and
+ *     FinalStates). Blocks without such call sites are queued and later
+ *     inherit a state from their predecessors where getPredState finds one.
+ *  4. Record a FinalState taken from the common InitialState of a block's
+ *     successors where getSuccState finds one.
+ *  5. Insert a state store before every call site whose state differs from
+ *     the state in effect, and before the terminator when the recorded
+ *     FinalState differs. Blocks colored by a cleanuppad are skipped.
+ *  6. Rewrite every call to SetJmp3 via rewriteSetJmpCallSite, passing the
+ *     statically known state or, inside a cleanuppad, a load of the state
+ *     field.
+ */
+void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
+ // Mark the registration node. The backend needs to know which alloca it is so
+ // that it can recover the original frame pointer.
+ IRBuilder<> Builder(RegNode->getNextNode());
+ Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy());
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode),
+ {RegNodeI8});
+
+ if (EHGuardNode) {
+ IRBuilder<> Builder(EHGuardNode->getNextNode()); // Shadows the outer Builder on purpose: inserts right after the guard alloca.
+ Value *EHGuardNodeI8 =
+ Builder.CreateBitCast(EHGuardNode, Builder.getInt8PtrTy());
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehguard),
+ {EHGuardNodeI8});
+ }
+
+ // Calculate state numbers.
+ if (isAsynchronousEHPersonality(Personality))
+ calculateSEHStateNumbers(&F, FuncInfo);
+ else
+ calculateWinCXXEHStateNumbers(&F, FuncInfo);
+
+ // Iterate all the instructions and emit state number stores.
+ DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F);
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ // InitialStates yields the state of the first call-site for a BasicBlock.
+ DenseMap<BasicBlock *, int> InitialStates;
+ // FinalStates yields the state of the last call-site for a BasicBlock.
+ DenseMap<BasicBlock *, int> FinalStates;
+ // Worklist used to revisit BasicBlocks with indeterminate
+ // Initial/Final-States.
+ std::deque<BasicBlock *> Worklist;
+ // Fill in InitialStates and FinalStates for BasicBlocks with call-sites.
+ for (BasicBlock *BB : RPOT) {
+ int InitialState = OverdefinedState;
+ int FinalState; // Only read below once InitialState has been set, at which point FinalState has been set too.
+ if (&F.getEntryBlock() == BB)
+ InitialState = FinalState = ParentBaseState;
+ for (Instruction &I : *BB) {
+ CallSite CS(&I);
+ if (!isStateStoreNeeded(Personality, CS))
+ continue;
+
+ int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+ if (InitialState == OverdefinedState)
+ InitialState = State;
+ FinalState = State;
+ }
+ // No call-sites in this basic block? That's OK, we will come back to these
+ // in a later pass.
+ if (InitialState == OverdefinedState) {
+ Worklist.push_back(BB);
+ continue;
+ }
+ DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " InitialState=" << InitialState << '\n');
+ DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " FinalState=" << FinalState << '\n');
+ InitialStates.insert({BB, InitialState});
+ FinalStates.insert({BB, FinalState});
+ }
+
+ // Try to fill-in InitialStates and FinalStates which have no call-sites.
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.front();
+ Worklist.pop_front();
+ // This BasicBlock has already been figured out, nothing more we can do.
+ if (InitialStates.count(BB) != 0)
+ continue;
+
+ int PredState = getPredState(FinalStates, F, ParentBaseState, BB);
+ if (PredState == OverdefinedState)
+ continue;
+
+ // We successfully inferred this BasicBlock's state via its predecessors;
+ // enqueue its successors to see if we can infer their states.
+ InitialStates.insert({BB, PredState});
+ FinalStates.insert({BB, PredState});
+ for (BasicBlock *SuccBB : successors(BB))
+ Worklist.push_back(SuccBB);
+ }
+
+ // Try to hoist stores from successors.
+ for (BasicBlock *BB : RPOT) {
+ int SuccState = getSuccState(InitialStates, F, ParentBaseState, BB);
+ if (SuccState == OverdefinedState)
+ continue;
+
+ // Update our FinalState to reflect the common InitialState of our
+ // successors.
+ FinalStates.insert({BB, SuccState}); // NOTE(review): insert() presumably keeps an entry already present for BB, so this may only affect blocks without one - confirm this matches the "Update" wording.
+ }
+
+ // Finally, insert state stores before call-sites which transition us to a new
+ // state.
+ for (BasicBlock *BB : RPOT) {
+ auto &BBColors = BlockColors[BB];
+ BasicBlock *FuncletEntryBB = BBColors.front();
+ if (isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI()))
+ continue;
+
+ int PrevState = getPredState(FinalStates, F, ParentBaseState, BB);
+ DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " PrevState=" << PrevState << '\n');
+
+ for (Instruction &I : *BB) {
+ CallSite CS(&I);
+ if (!isStateStoreNeeded(Personality, CS))
+ continue;
+
+ int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+ if (State != PrevState)
+ insertStateNumberStore(&I, State);
+ PrevState = State;
+ }
+
+ // We might have hoisted a state store into this block, emit it now.
+ auto EndState = FinalStates.find(BB);
+ if (EndState != FinalStates.end())
+ if (EndState->second != PrevState)
+ insertStateNumberStore(BB->getTerminator(), EndState->second);
+ }
+
+ SmallVector<CallSite, 1> SetJmp3CallSites; // Collected first, rewritten afterwards, because rewriting erases the original instruction.
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : *BB) {
+ CallSite CS(&I);
+ if (!CS)
+ continue;
+ if (CS.getCalledValue()->stripPointerCasts() !=
+ SetJmp3->stripPointerCasts())
+ continue;
+
+ SetJmp3CallSites.push_back(CS);
+ }
+ }
+
+ for (CallSite CS : SetJmp3CallSites) {
+ auto &BBColors = BlockColors[CS->getParent()];
+ BasicBlock *FuncletEntryBB = BBColors.front();
+ bool InCleanup = isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI());
+
+ IRBuilder<> Builder(CS.getInstruction());
+ Value *State;
+ if (InCleanup) { // No state stores were inserted for cleanuppad blocks above, so read the state field at run time.
+ Value *StateField =
+ Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
+ State = Builder.CreateLoad(StateField);
+ } else {
+ State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS));
+ }
+ rewriteSetJmpCallSite(Builder, F, CS, State);
+ }
+}
+
+// Emits, immediately before IP, a store of the 32-bit constant State into the
+// state field (index StateFieldIndex) of the registration node RegNode.
+void WinEHStatePass::insertStateNumberStore(Instruction *IP, int State) {
+  IRBuilder<> Builder(IP);
+  Value *StateSlot =
+      Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
+  Value *StateValue = Builder.getInt32(State);
+  Builder.CreateStore(StateValue, StateSlot);
+}
diff --git a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
new file mode 100644
index 000000000000..059b75ef482a
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -0,0 +1,785 @@
+//===- XCoreDisassembler.cpp - Disassembler for XCore -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file is part of the XCore Disassembler.
+///
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreRegisterInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-disassembler"
+
+// Shorthand for the status type returned by every decoder in this file.
+using DecodeStatus = MCDisassembler::DecodeStatus;
+
+namespace { // Anonymous namespace: XCoreDisassembler is local to this translation unit.
+
+/// \brief A disassembler class for XCore.
+class XCoreDisassembler : public MCDisassembler {
+public:
+ XCoreDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : // Forwards both arguments to the MCDisassembler base; adds no state of its own.
+ MCDisassembler(STI, Ctx) {}
+
+ DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, // Overrides the MCDisassembler entry point; the definition is not part of this excerpt.
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream,
+ raw_ostream &CStream) const override;
+};
+} // end anonymous namespace
+
+// Reads a little-endian 16-bit instruction word from the start of Bytes into
+// Insn. Returns true on success. When fewer than two bytes are available,
+// sets Size to 0, leaves Insn untouched and returns false. Address is unused.
+static bool readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              uint64_t &Size, uint16_t &Insn) {
+  const bool HaveWord = Bytes.size() >= 2;
+  if (HaveWord) {
+    // Byte 0 is the low half, byte 1 the high half.
+    const unsigned Low = Bytes[0];
+    const unsigned High = Bytes[1];
+    Insn = Low | (High << 8);
+  } else {
+    Size = 0;
+  }
+  return HaveWord;
+}
+
+// Reads a little-endian 32-bit instruction word from the start of Bytes into
+// Insn. Returns true on success. When fewer than four bytes are available,
+// sets Size to 0, leaves Insn untouched and returns false. Address is unused.
+static bool readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              uint64_t &Size, uint32_t &Insn) {
+  // We want to read exactly 4 Bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return false;
+  }
+  // Encoded as a little-endian 32-bit word in the stream.
+  // Widen each byte to uint32_t before shifting: a uint8_t operand is promoted
+  // to (signed) int, so shifting a byte >= 0x80 left by 24 would shift into the
+  // sign bit instead of producing a well-defined unsigned result.
+  Insn = (static_cast<uint32_t>(Bytes[0]) << 0) |
+         (static_cast<uint32_t>(Bytes[1]) << 8) |
+         (static_cast<uint32_t>(Bytes[2]) << 16) |
+         (static_cast<uint32_t>(Bytes[3]) << 24);
+  return true;
+}
+
+// Maps the RegNo-th entry of register class RC to its register number, using
+// the register info reachable through the disassembler passed as D.
+// No bounds check is done here; callers validate RegNo first.
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const auto *Disassembler = static_cast<const XCoreDisassembler *>(D);
+  const MCRegisterInfo *MRI = Disassembler->getContext().getRegisterInfo();
+  const auto &RegClass = MRI->getRegClass(RC);
+  return RegClass.begin()[RegNo];
+}
+/* Forward declarations of the operand and instruction decoders.
+ *
+ * They are declared ahead of the #include of XCoreGenDisassemblerTables.inc
+ * below, and the definitions visible in this file come after that include.
+ * Presumably the generated tables refer to these functions by name, which
+ * would be why the declarations must precede the include - confirm against
+ * the generated file.
+ *
+ * All of them share one shape: the MCInst to append operands to, the raw
+ * field value or instruction bits, the instruction address, and an opaque
+ * Decoder pointer.
+ */
+static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus Decode2RInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus Decode2RImmInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeR2RInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeRUSInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeRUSSrcDstBitpInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeL2RInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeLR2RInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus Decode3RInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus Decode3RImmInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus Decode2RUSInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeL3RInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeL6RInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeL5RInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+#include "XCoreGenDisassemblerTables.inc"
+
+// Appends the GRRegs register with encoding RegNo to Inst.
+// Encodings above 11 are rejected with Fail and leave Inst unchanged.
+static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo <= 11) {
+    const unsigned PhysReg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
+    Inst.addOperand(MCOperand::createReg(PhysReg));
+    return MCDisassembler::Success;
+  }
+  return MCDisassembler::Fail;
+}
+
+// Appends the RRegs register with encoding RegNo to Inst.
+// Encodings above 15 are rejected with Fail and leave Inst unchanged.
+static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo <= 15) {
+    const unsigned PhysReg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
+    Inst.addOperand(MCOperand::createReg(PhysReg));
+    return MCDisassembler::Success;
+  }
+  return MCDisassembler::Fail;
+}
+
+// Translates a 'bitp' encoding (0..11) into the bit count it stands for and
+// appends that count to Inst as an immediate. Any other encoding yields Fail.
+static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder) {
+  // Indexed by encoding; entry 0 stands for the full word width.
+  static const unsigned Values[] = {
+    32 /*bpw*/, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24, 32
+  };
+  const unsigned NumEncodings = sizeof(Values) / sizeof(Values[0]);
+  if (Val >= NumEncodings)
+    return MCDisassembler::Fail;
+
+  const unsigned BitCount = Values[Val];
+  Inst.addOperand(MCOperand::createImm(BitCount));
+  return MCDisassembler::Success;
+}
+
+// Appends the negation of Val to Inst as an immediate. Val is widened to a
+// signed 64-bit value before being negated. Always succeeds.
+static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  const int64_t Negated = -static_cast<int64_t>(Val);
+  Inst.addOperand(MCOperand::createImm(Negated));
+  return MCDisassembler::Success;
+}
+
+// Unpacks the two operand fields of a two-operand encoding.
+//
+// The field read with fieldFromInstruction(Insn, 6, 5) must be at least 27;
+// after subtracting 27 (and adding 5 when the field read with
+// fieldFromInstruction(Insn, 5, 1) is set) it is split by % 3 and / 3 into
+// the high parts of Op1 and Op2, each of which is combined with a 2-bit low
+// part. On Fail, Op1 and Op2 are left untouched.
+static DecodeStatus
+Decode2OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2) {
+  const unsigned Packed = fieldFromInstruction(Insn, 6, 5);
+  const bool Extended = fieldFromInstruction(Insn, 5, 1) != 0;
+
+  // Values below 27 are not two-operand encodings, and 31 cannot be combined
+  // with the extension bit.
+  if (Packed < 27 || (Extended && Packed == 31))
+    return MCDisassembler::Fail;
+
+  const unsigned Index = Packed - 27 + (Extended ? 5 : 0);
+  Op1 = ((Index % 3) << 2) | fieldFromInstruction(Insn, 2, 2);
+  Op2 = ((Index / 3) << 2) | fieldFromInstruction(Insn, 0, 2);
+  return MCDisassembler::Success;
+}
+
+// Unpacks the three operand fields of a three-operand encoding.
+//
+// The field read with fieldFromInstruction(Insn, 6, 5) must be below 27; its
+// three base-3 digits (least significant first) are the high parts of Op1,
+// Op2 and Op3, each combined with a 2-bit low part. On Fail the outputs are
+// left untouched.
+static DecodeStatus
+Decode3OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2,
+                     unsigned &Op3) {
+  const unsigned Packed = fieldFromInstruction(Insn, 6, 5);
+  if (Packed >= 27)
+    return MCDisassembler::Fail;
+
+  const unsigned Digit0 = Packed % 3;
+  const unsigned Digit1 = (Packed / 3) % 3;
+  const unsigned Digit2 = Packed / 9;
+  Op1 = (Digit0 << 2) | fieldFromInstruction(Insn, 4, 2);
+  Op2 = (Digit1 << 2) | fieldFromInstruction(Insn, 2, 2);
+  Op3 = (Digit2 << 2) | fieldFromInstruction(Insn, 0, 2);
+  return MCDisassembler::Success;
+}
+/* Fallback used by the two-operand decoders in this file when
+ * Decode2OpInstruction rejects an encoding.
+ *
+ * Reads an opcode with fieldFromInstruction(Insn, 11, 5), overrides the
+ * opcode of Inst accordingly and hands off to the matching 3R / 2RUS style
+ * decoder, whose status is returned. Opcodes without a case yield Fail, and
+ * Inst is then left unmodified by this function.
+ */
+static DecodeStatus
+Decode2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ // Try and decode as a 3R / 2RUS instruction.
+ unsigned Opcode = fieldFromInstruction(Insn, 11, 5);
+ switch (Opcode) {
+ case 0x0:
+ Inst.setOpcode(XCore::STW_2rus);
+ return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+ case 0x1:
+ Inst.setOpcode(XCore::LDW_2rus);
+ return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+ case 0x2:
+ Inst.setOpcode(XCore::ADD_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x3:
+ Inst.setOpcode(XCore::SUB_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x4:
+ Inst.setOpcode(XCore::SHL_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x5:
+ Inst.setOpcode(XCore::SHR_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x6:
+ Inst.setOpcode(XCore::EQ_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x7:
+ Inst.setOpcode(XCore::AND_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x8:
+ Inst.setOpcode(XCore::OR_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x9:
+ Inst.setOpcode(XCore::LDW_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x10: // Labels are hexadecimal: 0x10..0x19 are 16..25, so 10..15 and 26..31 have no case.
+ Inst.setOpcode(XCore::LD16S_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x11:
+ Inst.setOpcode(XCore::LD8U_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x12:
+ Inst.setOpcode(XCore::ADD_2rus);
+ return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+ case 0x13:
+ Inst.setOpcode(XCore::SUB_2rus);
+ return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+ case 0x14:
+ Inst.setOpcode(XCore::SHL_2rus);
+ return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+ case 0x15:
+ Inst.setOpcode(XCore::SHR_2rus);
+ return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+ case 0x16:
+ Inst.setOpcode(XCore::EQ_2rus);
+ return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+ case 0x17:
+ Inst.setOpcode(XCore::TSETR_3r);
+ return Decode3RImmInstruction(Inst, Insn, Address, Decoder);
+ case 0x18:
+ Inst.setOpcode(XCore::LSS_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x19:
+ Inst.setOpcode(XCore::LSU_3r);
+ return Decode3RInstruction(Inst, Insn, Address, Decoder);
+ }
+ return MCDisassembler::Fail;
+}
+
+// Decodes a 2R instruction: two GRRegs operands, appended in encoding order.
+// Encodings that are not valid two-operand forms are retried through
+// Decode2OpInstructionFail.
+static DecodeStatus
+Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                    const void *Decoder) {
+  unsigned First, Second;
+  const DecodeStatus Status = Decode2OpInstruction(Insn, First, Second);
+  if (Status == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, First, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Second, Address, Decoder);
+    return Status;
+  }
+  return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+}
+
+// Decodes a two-operand instruction whose first operand is a raw immediate
+// and whose second is a GRRegs register. Encodings that are not valid
+// two-operand forms are retried through Decode2OpInstructionFail.
+static DecodeStatus
+Decode2RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned ImmField, RegField;
+  const DecodeStatus Status = Decode2OpInstruction(Insn, ImmField, RegField);
+  if (Status == MCDisassembler::Success) {
+    Inst.addOperand(MCOperand::createImm(ImmField));
+    DecodeGRRegsRegisterClass(Inst, RegField, Address, Decoder);
+    return Status;
+  }
+  return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+}
+
+// Decodes an R2R instruction: two GRRegs operands appended in the reverse of
+// their encoding order (second encoded field first). Encodings that are not
+// valid two-operand forms are retried through Decode2OpInstructionFail.
+static DecodeStatus
+DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned EncodedFirst, EncodedSecond;
+  const DecodeStatus Status =
+      Decode2OpInstruction(Insn, EncodedFirst, EncodedSecond);
+  if (Status == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, EncodedSecond, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, EncodedFirst, Address, Decoder);
+    return Status;
+  }
+  return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+}
+
+// Decodes a 2R instruction whose first register is both destination and
+// source: that register is appended twice, followed by the second register.
+// Encodings that are not valid two-operand forms are retried through
+// Decode2OpInstructionFail.
+static DecodeStatus
+Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                          const void *Decoder) {
+  unsigned SrcDst, Other;
+  const DecodeStatus Status = Decode2OpInstruction(Insn, SrcDst, Other);
+  if (Status == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, SrcDst, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, SrcDst, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Other, Address, Decoder);
+    return Status;
+  }
+  return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+}
+
+// Decodes an RUS instruction: a GRRegs register followed by a raw immediate.
+// Encodings that are not valid two-operand forms are retried through
+// Decode2OpInstructionFail.
+static DecodeStatus
+DecodeRUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned RegField, ImmField;
+  const DecodeStatus Status = Decode2OpInstruction(Insn, RegField, ImmField);
+  if (Status == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, RegField, Address, Decoder);
+    Inst.addOperand(MCOperand::createImm(ImmField));
+    return Status;
+  }
+  return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+}
+
+// Decodes an RUS instruction whose immediate is a 'bitp' encoding: a GRRegs
+// register followed by the translated bit count. Encodings that are not valid
+// two-operand forms are retried through Decode2OpInstructionFail.
+static DecodeStatus
+DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                         const void *Decoder) {
+  unsigned RegField, BitpField;
+  const DecodeStatus Status = Decode2OpInstruction(Insn, RegField, BitpField);
+  if (Status == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, RegField, Address, Decoder);
+    DecodeBitpOperand(Inst, BitpField, Address, Decoder);
+    return Status;
+  }
+  return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+}
+
+// Decodes an RUS 'bitp' instruction whose register is both destination and
+// source: the register is appended twice, followed by the translated bit
+// count. Encodings that are not valid two-operand forms are retried through
+// Decode2OpInstructionFail.
+static DecodeStatus
+DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned SrcDst, BitpField;
+  const DecodeStatus Status = Decode2OpInstruction(Insn, SrcDst, BitpField);
+  if (Status == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, SrcDst, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, SrcDst, Address, Decoder);
+    DecodeBitpOperand(Inst, BitpField, Address, Decoder);
+    return Status;
+  }
+  return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+}
+/* Fallback used by the long two-operand decoders in this file when the low
+ * 16 bits of Insn are not a valid two-operand encoding.
+ *
+ * Builds an opcode from two fields of the 32-bit instruction, overrides the
+ * opcode of Inst accordingly and hands off to the matching L3R / L2RUS style
+ * decoder, whose status is returned. Opcodes without a case yield Fail, and
+ * Inst is then left unmodified by this function.
+ */
+static DecodeStatus
+DecodeL2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ // Try and decode as a L3R / L2RUS instruction.
+ unsigned Opcode = fieldFromInstruction(Insn, 16, 4) | // '<<' binds tighter than '|': the (Insn, 16, 4) field is the low nibble,
+ fieldFromInstruction(Insn, 27, 5) << 4; // and the (Insn, 27, 5) field sits above it.
+ switch (Opcode) {
+ case 0x0c:
+ Inst.setOpcode(XCore::STW_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x1c:
+ Inst.setOpcode(XCore::XOR_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x2c:
+ Inst.setOpcode(XCore::ASHR_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x3c:
+ Inst.setOpcode(XCore::LDAWF_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x4c:
+ Inst.setOpcode(XCore::LDAWB_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x5c:
+ Inst.setOpcode(XCore::LDA16F_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x6c:
+ Inst.setOpcode(XCore::LDA16B_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x7c:
+ Inst.setOpcode(XCore::MUL_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x8c:
+ Inst.setOpcode(XCore::DIVS_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x9c:
+ Inst.setOpcode(XCore::DIVU_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x10c:
+ Inst.setOpcode(XCore::ST16_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x11c:
+ Inst.setOpcode(XCore::ST8_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x12c:
+ Inst.setOpcode(XCore::ASHR_l2rus);
+ return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+ case 0x12d:
+ Inst.setOpcode(XCore::OUTPW_l2rus);
+ return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+ case 0x12e:
+ Inst.setOpcode(XCore::INPW_l2rus);
+ return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+ case 0x13c:
+ Inst.setOpcode(XCore::LDAWF_l2rus);
+ return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder);
+ case 0x14c:
+ Inst.setOpcode(XCore::LDAWB_l2rus);
+ return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder);
+ case 0x15c:
+ Inst.setOpcode(XCore::CRC_l3r);
+ return DecodeL3RSrcDstInstruction(Inst, Insn, Address, Decoder);
+ case 0x18c:
+ Inst.setOpcode(XCore::REMS_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ case 0x19c:
+ Inst.setOpcode(XCore::REMU_l3r);
+ return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+ }
+ return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+DecodeL2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned First, Second;
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16);
+  const DecodeStatus Status = Decode2OpInstruction(LowHalf, First, Second);
+  if (Status != MCDisassembler::Success) // retry as an L3R / L2RUS form
+    return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder);
+  // Both operands are registers, added in the order they were decoded.
+  DecodeGRRegsRegisterClass(Inst, First, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Second, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                      const void *Decoder) {
+  unsigned First, Second;
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16);
+  const DecodeStatus Status = Decode2OpInstruction(LowHalf, First, Second);
+  if (Status != MCDisassembler::Success) // retry as an L3R / L2RUS form
+    return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder);
+  // Operands are added in the reverse of their decoded order.
+  DecodeGRRegsRegisterClass(Inst, Second, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, First, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+Decode3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                    const void *Decoder) {
+  unsigned RegA, RegB, RegC; // all three fields are register numbers
+  const DecodeStatus Status = Decode3OpInstruction(Insn, RegA, RegB, RegC);
+  if (Status != MCDisassembler::Success)
+    return Status; // nothing is added to Inst on failure
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegC, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+Decode3RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Imm, RegA, RegB; // an immediate followed by two registers
+  const DecodeStatus Status = Decode3OpInstruction(Insn, Imm, RegA, RegB);
+  if (Status != MCDisassembler::Success)
+    return Status; // nothing is added to Inst on failure
+  Inst.addOperand(MCOperand::createImm(Imm));
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+Decode2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                      const void *Decoder) {
+  unsigned RegA, RegB, Imm; // two registers followed by an immediate
+  const DecodeStatus Status = Decode3OpInstruction(Insn, RegA, RegB, Imm);
+  if (Status != MCDisassembler::Success)
+    return Status; // nothing is added to Inst on failure
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return Status;
+}
+
+static DecodeStatus
+Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                          const void *Decoder) {
+  unsigned RegA, RegB, Bitp; // third field is handed to DecodeBitpOperand
+  const DecodeStatus Status = Decode3OpInstruction(Insn, RegA, RegB, Bitp);
+  if (Status != MCDisassembler::Success)
+    return Status; // nothing is added to Inst on failure
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  DecodeBitpOperand(Inst, Bitp, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+DecodeL3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned RegA, RegB, RegC;
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16); // operand bits
+  const DecodeStatus Status = Decode3OpInstruction(LowHalf, RegA, RegB, RegC);
+  if (Status != MCDisassembler::Success)
+    return Status; // nothing is added to Inst on failure
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegC, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned RegA, RegB, RegC;
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16); // operand bits
+  const DecodeStatus Status = Decode3OpInstruction(LowHalf, RegA, RegB, RegC);
+  if (Status != MCDisassembler::Success)
+    return Status; // nothing is added to Inst on failure
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder); // RegA again
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegC, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned RegA, RegB, Imm; // two registers followed by an immediate
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16);
+  const DecodeStatus Status = Decode3OpInstruction(LowHalf, RegA, RegB, Imm);
+  if (Status != MCDisassembler::Success)
+    return Status; // nothing is added to Inst on failure
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return Status;
+}
+
+static DecodeStatus
+DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned RegA, RegB, Bitp; // third field is handed to DecodeBitpOperand
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16);
+  const DecodeStatus Status = Decode3OpInstruction(LowHalf, RegA, RegB, Bitp);
+  if (Status != MCDisassembler::Success)
+    return Status; // nothing is added to Inst on failure
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  DecodeBitpOperand(Inst, Bitp, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+DecodeL6RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned LoA, LoB, LoC, HiA, HiB, HiC; // three fields per 16-bit half
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16);
+  const unsigned HighHalf = fieldFromInstruction(Insn, 16, 16);
+  DecodeStatus Status = Decode3OpInstruction(LowHalf, LoA, LoB, LoC);
+  if (Status == MCDisassembler::Success)
+    Status = Decode3OpInstruction(HighHalf, HiA, HiB, HiC);
+  if (Status != MCDisassembler::Success)
+    return Status; // either half failing leaves Inst without operands
+  DecodeGRRegsRegisterClass(Inst, LoA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, HiA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, LoB, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, LoC, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, HiB, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, HiC, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+DecodeL5RInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                         const void *Decoder) {
+  // Fallback for words rejected by the L5R decode: the only alternative
+  // tried is the L6R form, chosen when the 5-bit field at bit 27 is zero.
+  Inst.clear(); // start again from an empty operand list
+
+  if (fieldFromInstruction(Insn, 27, 5) != 0x00)
+    return MCDisassembler::Fail;
+
+  Inst.setOpcode(XCore::LMUL_l6r);
+  return DecodeL6RInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeL5RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned LoA, LoB, LoC, HiA, HiB;
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16);
+  const unsigned HighHalf = fieldFromInstruction(Insn, 16, 16);
+  // Low half holds three register fields, high half the remaining two.
+  DecodeStatus Status = Decode3OpInstruction(LowHalf, LoA, LoB, LoC);
+  if (Status == MCDisassembler::Success)
+    Status = Decode2OpInstruction(HighHalf, HiA, HiB);
+  if (Status != MCDisassembler::Success) // fall back to the L6R decode
+    return DecodeL5RInstructionFail(Inst, Insn, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, LoA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, HiA, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, LoB, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, LoC, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, HiB, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned RegA, RegB, RegC;
+  const unsigned RegD = fieldFromInstruction(Insn, 16, 4); // fourth register
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16);
+  DecodeStatus Status = Decode3OpInstruction(LowHalf, RegA, RegB, RegC);
+  if (Status != MCDisassembler::Success)
+    return Status;
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  Status = DecodeGRRegsRegisterClass(Inst, RegD, Address, Decoder);
+  if (Status != MCDisassembler::Success) // only RegD's status is checked
+    return Status;
+  DecodeGRRegsRegisterClass(Inst, RegD, Address, Decoder); // RegD added twice
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegC, Address, Decoder);
+  return Status;
+}
+
+static DecodeStatus
+DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                 const void *Decoder) {
+  unsigned RegA, RegB, RegC;
+  const unsigned RegD = fieldFromInstruction(Insn, 16, 4); // fourth register
+  const unsigned LowHalf = fieldFromInstruction(Insn, 0, 16);
+  DecodeStatus Status = Decode3OpInstruction(LowHalf, RegA, RegB, RegC);
+  if (Status != MCDisassembler::Success)
+    return Status;
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder);
+  Status = DecodeGRRegsRegisterClass(Inst, RegD, Address, Decoder);
+  if (Status != MCDisassembler::Success) // only RegD's status is checked
+    return Status;
+  DecodeGRRegsRegisterClass(Inst, RegA, Address, Decoder); // RegA and RegD
+  DecodeGRRegsRegisterClass(Inst, RegD, Address, Decoder); // are added again
+  DecodeGRRegsRegisterClass(Inst, RegB, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, RegC, Address, Decoder);
+  return Status;
+}
+
+MCDisassembler::DecodeStatus XCoreDisassembler::getInstruction(
+    MCInst &instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream &vStream, raw_ostream &cStream) const {
+  // The 16-bit decoder table is tried first; the 32-bit table is consulted
+  // only when that fails. On a match Size is set to 2 or 4 respectively.
+
+  // --- 16-bit attempt ---
+  uint16_t Word16;
+  if (!readInstruction16(Bytes, Address, Size, Word16))
+    return Fail;
+
+  // Calling the auto-generated decoder function.
+  const DecodeStatus Result16 =
+      decodeInstruction(DecoderTable16, instr, Word16, Address, this, STI);
+  if (Result16 != Fail) {
+    Size = 2;
+    return Result16;
+  }
+
+  // --- 32-bit attempt ---
+  uint32_t Word32;
+  if (!readInstruction32(Bytes, Address, Size, Word32))
+    return Fail;
+
+  // Calling the auto-generated decoder function.
+  const DecodeStatus Result32 =
+      decodeInstruction(DecoderTable32, instr, Word32, Address, this, STI);
+  if (Result32 == Fail)
+    return Fail;
+  Size = 4;
+  return Result32;
+}
+
+namespace llvm {
+ Target &getTheXCoreTarget(); // declaration only; used by the registration below
+} // end namespace llvm
+
+static MCDisassembler *createXCoreDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new XCoreDisassembler(STI, Ctx); // T is not needed to build it
+}
+
+extern "C" void LLVMInitializeXCoreDisassembler() {
+  // Register createXCoreDisassembler as the XCore target's disassembler.
+  Target &TheTarget = getTheXCoreTarget();
+  TargetRegistry::RegisterMCDisassembler(TheTarget, createXCoreDisassembler);
+}
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
new file mode 100644
index 000000000000..500c84d2a418
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -0,0 +1,88 @@
+//===-- XCoreInstPrinter.cpp - Convert XCore MCInst to assembly syntax ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an XCore MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreInstPrinter.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "XCoreGenAsmWriter.inc"
+
+void XCoreInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << StringRef(getRegisterName(RegNo)).lower(); // names print in lower case
+}
+
+void XCoreInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ printInstruction(MI, O); // instruction text first
+ printAnnotation(O, Annot); // then the annotation; STI is not used here
+}
+
+void XCoreInstPrinter::
+printInlineJT(const MCInst *MI, int opNum, raw_ostream &O) {
+ report_fatal_error("can't handle InlineJT"); // unconditional; arguments unused
+}
+
+void XCoreInstPrinter::
+printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O) {
+ report_fatal_error("can't handle InlineJT32"); // unconditional; arguments unused
+}
+
+static void printExpr(const MCExpr *Expr, const MCAsmInfo *MAI,
+                      raw_ostream &OS) {
+  // Accepts a bare symbol reference, or a binary expression with a symbol on
+  // the left and a constant on the right; prints the symbol and any offset.
+  const MCSymbolRefExpr *Sym = nullptr;
+  int Addend = 0;
+  const MCBinaryExpr *Binary = dyn_cast<MCBinaryExpr>(Expr);
+  if (!Binary) {
+    Sym = dyn_cast<MCSymbolRefExpr>(Expr);
+    assert(Sym && "Unexpected MCExpr type.");
+  } else {
+    Sym = dyn_cast<MCSymbolRefExpr>(Binary->getLHS());
+    const MCConstantExpr *Imm = dyn_cast<MCConstantExpr>(Binary->getRHS());
+    assert(Sym && Imm && "Binary expression must be sym+const.");
+    Addend = Imm->getValue();
+  }
+  assert(Sym->getKind() == MCSymbolRefExpr::VK_None);
+  Sym->getSymbol().print(OS, MAI);
+  if (Addend == 0)
+    return; // a zero offset is not printed at all
+  if (Addend > 0)
+    OS << '+'; // negative values already carry their own sign
+  OS << Addend;
+}
+
+void XCoreInstPrinter::
+printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+  // Registers print by name and immediates as plain integers; anything
+  // else must be an expression and is delegated to printExpr.
+  const MCOperand &Operand = MI->getOperand(OpNo);
+
+  if (Operand.isReg()) {
+    printRegName(O, Operand.getReg());
+  } else if (Operand.isImm()) {
+    O << Operand.getImm();
+  } else {
+    // Only the expression kind is left.
+    assert(Operand.isExpr() && "unknown operand kind in printOperand");
+    printExpr(Operand.getExpr(), &MAI, O);
+  }
+}
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
new file mode 100644
index 000000000000..dc513f7b225b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -0,0 +1,43 @@
+//== XCoreInstPrinter.h - Convert XCore MCInst to assembly syntax -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declaration of the XCoreInstPrinter class,
+/// which is used to print XCore MCInst to a .s file.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+#define LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class XCoreInstPrinter : public MCInstPrinter { // prints XCore MCInsts as text
+public:
+ XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {} // all three are forwarded to the base
+
+ // Autogenerated by tblgen (presumably via the XCoreGenAsmWriter.inc include).
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+private:
+ void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O); // always fatal
+ void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O); // always fatal
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O); // NOTE(review): no definition in XCoreInstPrinter.cpp - confirm still needed
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
new file mode 100644
index 000000000000..3178a4edbb3b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -0,0 +1,33 @@
+//===-- XCoreMCAsmInfo.cpp - XCore asm properties -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCAsmInfo.h"
+using namespace llvm;
+
+void XCoreMCAsmInfo::anchor() { } // intentionally empty out-of-line definition
+
+XCoreMCAsmInfo::XCoreMCAsmInfo(const Triple &TT) { // TT is not consulted
+ SupportsDebugInformation = true;
+ Data16bitsDirective = "\t.short\t";
+ Data32bitsDirective = "\t.long\t";
+ Data64bitsDirective = nullptr; // no directive is provided for 64-bit data
+ ZeroDirective = "\t.space\t";
+ CommentString = "#";
+
+ AscizDirective = ".asciiz";
+
+ HiddenVisibilityAttr = MCSA_Invalid; // all three visibility kinds: MCSA_Invalid
+ HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+ ProtectedVisibilityAttr = MCSA_Invalid;
+
+ // Debug / exception-handling settings.
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+ DwarfRegNumForCFI = true;
+}
+
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
new file mode 100644
index 000000000000..39581e424e8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- XCoreMCAsmInfo.h - XCore asm properties ----------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the XCoreMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCASMINFO_H
+#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple; // forward declaration; only used by reference below
+
+class XCoreMCAsmInfo : public MCAsmInfoELF { // XCore settings on top of MCAsmInfoELF
+ void anchor() override; // defined out of line in XCoreMCAsmInfo.cpp
+
+public:
+ explicit XCoreMCAsmInfo(const Triple &TT);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
new file mode 100644
index 000000000000..c5859b7786f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -0,0 +1,151 @@
+//===-- XCoreMCTargetDesc.cpp - XCore Target Descriptions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides XCore specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCTargetDesc.h"
+#include "InstPrinter/XCoreInstPrinter.h"
+#include "XCoreMCAsmInfo.h"
+#include "XCoreTargetStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "XCoreGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "XCoreGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "XCoreGenRegisterInfo.inc"
+
+static MCInstrInfo *createXCoreMCInstrInfo() {
+  MCInstrInfo *Info = new MCInstrInfo(); // heap-allocated, handed to caller
+  InitXCoreMCInstrInfo(Info);
+  return Info;
+}
+
+static MCRegisterInfo *createXCoreMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo(); // TT is not consulted
+  InitXCoreMCRegisterInfo(RegInfo, XCore::LR);
+  return RegInfo;
+}
+
+static MCSubtargetInfo *
+createXCoreMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+ return createXCoreMCSubtargetInfoImpl(TT, CPU, FS); // arguments forwarded unchanged
+}
+
+static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
+                                       const Triple &TT) {
+  // MRI is unused: the initial frame state is built from XCore::SP directly.
+  MCAsmInfo *AsmInfo = new XCoreMCAsmInfo(TT);
+
+  // Initial state of the frame pointer is SP.
+  AsmInfo->addInitialFrameState(
+      MCCFIInstruction::createDefCfa(nullptr, XCore::SP, 0));
+  return AsmInfo;
+}
+
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small; // an unspecified model becomes Small
+  const bool Supported = CM == CodeModel::Small || CM == CodeModel::Large;
+  if (!Supported)
+    report_fatal_error("Target only supports CodeModel Small or Large");
+}
+
+static MCInstPrinter *createXCoreMCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ return new XCoreInstPrinter(MAI, MII, MRI); // T and SyntaxVariant are ignored
+}
+
+XCoreTargetStreamer::XCoreTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+XCoreTargetStreamer::~XCoreTargetStreamer() {} // empty, defined out of line
+
+namespace {
+
+// Target streamer for textual output: writes the .cc_top / .cc_bottom
+// directives straight to the formatted assembly stream.
+class XCoreTargetAsmStreamer : public XCoreTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  // S is forwarded to the base streamer; OS receives the directive text.
+  XCoreTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
+      : XCoreTargetStreamer(S), OS(OS) {}
+
+  // .cc_top lines name "<Name>.<kind>" and then repeat Name after a comma.
+  void emitCCTopData(StringRef Name) override {
+    OS << "\t.cc_top " << Name << ".data," << Name << '\n';
+  }
+
+  void emitCCTopFunction(StringRef Name) override {
+    OS << "\t.cc_top " << Name << ".function," << Name << '\n';
+  }
+
+  // .cc_bottom lines carry only "<Name>.<kind>".
+  void emitCCBottomData(StringRef Name) override {
+    OS << "\t.cc_bottom " << Name << ".data\n";
+  }
+
+  void emitCCBottomFunction(StringRef Name) override {
+    OS << "\t.cc_bottom " << Name << ".function\n";
+  }
+};
+
+} // end anonymous namespace
+
+static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new XCoreTargetAsmStreamer(S, OS); // InstPrint and isVerboseAsm are unused
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTargetMC() {
+  // Every hook below is attached to the same XCore Target object.
+  Target &TheTarget = getTheXCoreTarget();
+
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheTarget, createXCoreMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::registerMCAdjustCodeGenOpts(TheTarget, adjustCodeGenOpts);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheTarget, createXCoreMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheTarget, createXCoreMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheTarget,
+                                          createXCoreMCSubtargetInfo);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheTarget, createXCoreMCInstPrinter);
+
+  // Register the target streamer used for textual assembly output.
+  TargetRegistry::RegisterAsmTargetStreamer(TheTarget,
+                                            createTargetAsmStreamer);
+}
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
new file mode 100644
index 000000000000..ac0f3fefbae7
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -0,0 +1,39 @@
+//===-- XCoreMCTargetDesc.h - XCore Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides XCore specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
+#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class Target; // forward declaration; only a reference is returned below
+Target &getTheXCoreTarget(); // defined in TargetInfo/XCoreTargetInfo.cpp
+
+} // End llvm namespace
+
+// Defines symbolic names for XCore registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "XCoreGenRegisterInfo.inc"
+
+// Defines symbolic names for the XCore instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "XCoreGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "XCoreGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
new file mode 100644
index 000000000000..df5774c7e8ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -0,0 +1,22 @@
+//===-- XCoreTargetInfo.cpp - XCore Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheXCoreTarget() {
+ static Target TheXCoreTarget; // function-local static: one instance for all callers
+ return TheXCoreTarget;
+}
+
+extern "C" void LLVMInitializeXCoreTargetInfo() {
+ RegisterTarget<Triple::xcore> X(getTheXCoreTarget(), "xcore", "XCore"); // registers the target under the name "xcore"
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCore.h b/contrib/llvm/lib/Target/XCore/XCore.h
new file mode 100644
index 000000000000..ba6ca843671e
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCore.h
@@ -0,0 +1,37 @@
+//===-- XCore.h - Top-level interface for XCore representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// XCore back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORE_H
+#define LLVM_LIB_TARGET_XCORE_XCORE_H
+
+#include "MCTargetDesc/XCoreMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class FunctionPass; // forward declarations used by the prototypes below
+ class ModulePass;
+ class TargetMachine;
+ class XCoreTargetMachine;
+ class formatted_raw_ostream;
+
+ void initializeXCoreLowerThreadLocalPass(PassRegistry &p);
+
+ FunctionPass *createXCoreFrameToArgsOffsetEliminationPass(); // pass factories; definitions are not in this header
+ FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+ ModulePass *createXCoreLowerThreadLocalPass();
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCore.td b/contrib/llvm/lib/Target/XCore/XCore.td
new file mode 100644
index 000000000000..04a1dd5e95be
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCore.td
@@ -0,0 +1,47 @@
+//===-- XCore.td - Describe the XCore Target Machine -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Descriptions
+//===----------------------------------------------------------------------===//
+
+include "XCoreRegisterInfo.td"
+include "XCoreInstrInfo.td"
+include "XCoreCallingConv.td"
+
+def XCoreInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// XCore processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>; // every Proc uses NoItineraries
+
+def : Proc<"generic", []>; // both processors are declared with an empty feature list
+def : Proc<"xs1b-generic", []>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def XCore : Target {
+ // Pull in Instruction Info: the XCoreInstrInfo record defined above.
+ let InstructionSet = XCoreInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
new file mode 100644
index 000000000000..b35aa0b95821
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -0,0 +1,300 @@
+//===-- XCoreAsmPrinter.cpp - XCore LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the XAS-format XCore assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "InstPrinter/XCoreInstPrinter.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMCInstLower.h"
+#include "XCoreSubtarget.h"
+#include "XCoreTargetMachine.h"
+#include "XCoreTargetStreamer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+ class XCoreAsmPrinter : public AsmPrinter {
+ XCoreMCInstLower MCInstLowering; // used by EmitInstruction to build MCInsts
+ XCoreTargetStreamer &getTargetStreamer();
+
+ public:
+ explicit XCoreAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(*this) {}
+
+ StringRef getPassName() const override { return "XCore Assembly Printer"; }
+
+ void printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const std::string &directive = ".jmptable");
+ void printInlineJT32(const MachineInstr *MI, int opNum, raw_ostream &O) {
+ printInlineJT(MI, opNum, O, ".jmptable32"); // same list, other directive
+ }
+ void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+
+ void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV);
+ void EmitGlobalVariable(const GlobalVariable *GV) override;
+
+ void EmitFunctionEntryLabel() override;
+ void EmitInstruction(const MachineInstr *MI) override;
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+ };
+} // end of anonymous namespace
+
+XCoreTargetStreamer &XCoreAsmPrinter::getTargetStreamer() {
+ return static_cast<XCoreTargetStreamer&>(*OutStreamer->getTargetStreamer()); // unchecked downcast
+}
+
+void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
+ assert( ( GV->hasExternalLinkage() || GV->hasWeakLinkage() ||
+ GV->hasLinkOnceLinkage() || GV->hasCommonLinkage() ) &&
+ "Unexpected linkage");
+ if (ArrayType *ATy = dyn_cast<ArrayType>(GV->getValueType())) { // arrays only
+
+ MCSymbol *SymGlob = OutContext.getOrCreateSymbol(
+ Twine(Sym->getName() + StringRef(".globound"))); // "<name>.globound"
+ OutStreamer->EmitSymbolAttribute(SymGlob, MCSA_Global);
+ OutStreamer->EmitAssignment(SymGlob,
+ MCConstantExpr::create(ATy->getNumElements(),
+ OutContext)); // value = number of array elements
+ if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+ GV->hasCommonLinkage()) {
+ OutStreamer->EmitSymbolAttribute(SymGlob, MCSA_Weak); // bound symbol is weak too
+ }
+ }
+}
+
+void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+ // Skip globals without an initializer and special LLVM globals (emitted here).
+ if (!GV->hasInitializer() ||
+ EmitSpecialLLVMGlobal(GV))
+ return;
+
+ const DataLayout &DL = getDataLayout();
+ OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(GV, TM));
+
+ MCSymbol *GVSym = getSymbol(GV);
+ const Constant *C = GV->getInitializer();
+ unsigned Align = (unsigned)DL.getPreferredTypeAlignmentShift(C->getType());
+
+ // Mark the start of the global
+ getTargetStreamer().emitCCTopData(GVSym->getName());
+
+ switch (GV->getLinkage()) {
+ case GlobalValue::AppendingLinkage:
+ report_fatal_error("AppendingLinkage is not supported by this target!");
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::ExternalLinkage:
+ case GlobalValue::CommonLinkage:
+ emitArrayBound(GVSym, GV);
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
+
+ if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+ GV->hasCommonLinkage())
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Weak);
+ LLVM_FALLTHROUGH;
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ break; // no symbol attributes for internal/private globals
+ default:
+ llvm_unreachable("Unknown linkage type!");
+ }
+
+ EmitAlignment(Align > 2 ? Align : 2, GV); // Align is an alignment shift; use at least 2
+
+ if (GV->isThreadLocal()) {
+ report_fatal_error("TLS is not supported by this target!");
+ }
+ unsigned Size = DL.getTypeAllocSize(C->getType());
+ if (MAI->hasDotTypeDotSizeDirective()) {
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
+ OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
+ }
+ OutStreamer->EmitLabel(GVSym);
+
+ EmitGlobalConstant(DL, C);
+ // The ABI requires that unsigned scalar types smaller than 32 bits
+ // are padded to 32 bits; any global below 4 bytes is zero-padded here.
+ if (Size < 4)
+ OutStreamer->EmitZeros(4 - Size);
+
+ // Mark the end of the global
+ getTargetStreamer().emitCCBottomData(GVSym->getName());
+}
+
+void XCoreAsmPrinter::EmitFunctionBodyStart() {
+ MCInstLowering.Initialize(&MF->getContext()); // give the lowering this function's context
+}
+
+/// EmitFunctionBodyEnd - Emitted after the last basic block in the function:
+/// asks the target streamer for the 'bottom function' marker of CurrentFnSym.
+void XCoreAsmPrinter::EmitFunctionBodyEnd() {
+ // Emit function end directives
+ getTargetStreamer().emitCCBottomFunction(CurrentFnSym->getName());
+}
+
+void XCoreAsmPrinter::EmitFunctionEntryLabel() {
+ // Mark the start of the function, then emit the function's own label.
+ getTargetStreamer().emitCCTopFunction(CurrentFnSym->getName());
+ OutStreamer->EmitLabel(CurrentFnSym);
+}
+
+void XCoreAsmPrinter::
+printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const std::string &directive) {
+ unsigned JTI = MI->getOperand(opNum).getIndex(); // jump table index operand
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+ O << "\t" << directive << " ";
+ for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { // comma-separated block symbols
+ MachineBasicBlock *MBB = JTBBs[i];
+ if (i > 0)
+ O << ",";
+ MBB->getSymbol()->print(O, MAI);
+ }
+}
+
+void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O) {
+ const DataLayout &DL = getDataLayout();
+ const MachineOperand &MO = MI->getOperand(opNum);
+ switch (MO.getType()) { // print according to the operand kind
+ case MachineOperand::MO_Register:
+ O << XCoreInstPrinter::getRegisterName(MO.getReg());
+ break;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MO.getMBB()->getSymbol()->print(O, MAI);
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ getSymbol(MO.getGlobal())->print(O, MAI);
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex(); // <private prefix>CPI<function number>_<index>
+ break;
+ case MachineOperand::MO_BlockAddress:
+ GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
+ break;
+ default:
+ llvm_unreachable("not implemented"); // any other operand kind is unsupported
+ }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+/// Without a modifier the operand is printed directly and false is returned.
+bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,const char *ExtraCode,
+ raw_ostream &O) {
+ // Print the operand if there is no operand modifier.
+ if (!ExtraCode || !ExtraCode[0]) {
+ printOperand(MI, OpNo, O);
+ return false;
+ }
+
+ // Otherwise fall back on the default implementation and return its result.
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+}
+
+bool XCoreAsmPrinter::
+PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ return true; // Unknown modifier: no modifiers are accepted here.
+ }
+ printOperand(MI, OpNum, O); // operand OpNum, then operand OpNum + 1 in brackets
+ O << '[';
+ printOperand(MI, OpNum + 1, O);
+ O << ']';
+ return false;
+}
+
+void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ SmallString<128> Str;
+ raw_svector_ostream O(Str); // buffer for the raw-text special cases below
+
+ switch (MI->getOpcode()) {
+ case XCore::DBG_VALUE:
+ llvm_unreachable("Should be handled target independently");
+ case XCore::ADD_2rus:
+ if (MI->getOperand(2).getImm() == 0) { // add with immediate 0 is printed as mov
+ O << "\tmov "
+ << XCoreInstPrinter::getRegisterName(MI->getOperand(0).getReg()) << ", "
+ << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg());
+ OutStreamer->EmitRawText(O.str());
+ return;
+ }
+ break; // non-zero immediate: lowered like any other instruction
+ case XCore::BR_JT:
+ case XCore::BR_JT32:
+ O << "\tbru "
+ << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg()) << '\n';
+ if (MI->getOpcode() == XCore::BR_JT)
+ printInlineJT(MI, 0, O); // jump table printed right after the bru
+ else
+ printInlineJT32(MI, 0, O);
+ O << '\n';
+ OutStreamer->EmitRawText(O.str());
+ return;
+ }
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst); // everything else goes through MCInst lowering
+
+ EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+// Force static initialization: registers XCoreAsmPrinter for the XCore target.
+extern "C" void LLVMInitializeXCoreAsmPrinter() {
+ RegisterAsmPrinter<XCoreAsmPrinter> X(getTheXCoreTarget());
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td b/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td
new file mode 100644
index 000000000000..e149e6d9ec20
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td
@@ -0,0 +1,40 @@
+//===- XCoreCallingConv.td - Calling Conventions for XCore -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for XCore architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XCore Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_XCore : CallingConv<[
+ // i32 values are returned in registers R0, R1, R2, R3.
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+ // Otherwise i32 values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// XCore Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_XCore : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R11.
+ CCIfNest<CCAssignToReg<[R11]>>,
+
+ // i32 arguments are passed in registers R0, R1, R2, R3.
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+ // Otherwise i32 values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
new file mode 100644
index 000000000000..e0e2e0319964
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -0,0 +1,592 @@
+//===-- XCoreFrameLowering.cpp - Frame info for XCore Target --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreFrameLowering.h"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm> // std::sort
+
+using namespace llvm;
+
+static const unsigned FramePtr = XCore::R10;
+static const int MaxImmU16 = (1<<16) - 1;
+
+// helper functions. FIXME: Eliminate.
+static inline bool isImmU6(unsigned val) {
+ return val < (1 << 6); // true when val fits in 6 unsigned bits (0..63)
+}
+
+static inline bool isImmU16(unsigned val) {
+ return val < (1 << 16); // true when val fits in 16 unsigned bits (0..65535)
+}
+
+// Describes one spill slot: its frame index, its frame offset and a register.
+namespace {
+struct StackSlotInfo {
+ int FI; // frame index of the slot
+ int Offset; // frame object offset of the slot (as returned by getObjectOffset)
+ unsigned Reg; // register saved to / restored from the slot
+ StackSlotInfo(int f, int o, int r) : FI(f), Offset(o), Reg(r){};
+};
+} // end anonymous namespace
+
+static bool CompareSSIOffset(const StackSlotInfo& a, const StackSlotInfo& b) {
+ return a.Offset < b.Offset; // orders slots by ascending Offset
+}
+
+static void EmitDefCfaRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &dl, const TargetInstrInfo &TII,
+ MachineFunction &MF, unsigned DRegNum) {
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(nullptr, DRegNum)); // record the CFI
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex); // CFI_INSTRUCTION inserted before MBBI
+}
+
+static void EmitDefCfaOffset(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &dl, const TargetInstrInfo &TII,
+ int Offset) {
+ MachineFunction &MF = *MBB.getParent();
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -Offset)); // note: Offset is negated
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex); // CFI_INSTRUCTION inserted before MBBI
+}
+
+static void EmitCfiOffset(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &dl,
+ const TargetInstrInfo &TII, unsigned DRegNum,
+ int Offset) {
+ MachineFunction &MF = *MBB.getParent();
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DRegNum, Offset)); // Offset passed through unchanged
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex); // CFI_INSTRUCTION inserted before MBBI
+}
+
+/// The SP register is moved in steps of 'MaxImmU16' towards the bottom of the
+/// frame. During these steps, it may be necessary to spill registers.
+/// IfNeededExtSP emits EXTSP instructions until 'Adjusted' is at least
+/// 'OffsetFromTop', never extending beyond 'FrameSize' (all counted in words).
+/// \param OffsetFromTop the spill offset from the top of the frame.
+/// \param [in,out] Adjusted the current SP offset from the top of the frame.
+static void IfNeededExtSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &dl,
+ const TargetInstrInfo &TII, int OffsetFromTop,
+ int &Adjusted, int FrameSize, bool emitFrameMoves) {
+ while (OffsetFromTop > Adjusted) {
+ assert(Adjusted < FrameSize && "OffsetFromTop is beyond FrameSize");
+ int remaining = FrameSize - Adjusted;
+ int OpImm = (remaining > MaxImmU16) ? MaxImmU16 : remaining; // one step, capped at MaxImmU16
+ int Opcode = isImmU6(OpImm) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(OpImm);
+ Adjusted += OpImm;
+ if (emitFrameMoves)
+ EmitDefCfaOffset(MBB, MBBI, dl, TII, Adjusted*4); // words -> bytes for the CFA
+ }
+}
+
+/// The SP register is moved in steps of 'MaxImmU16' towards the top of the
+/// frame. During these steps, it may be necessary to re-load registers.
+/// IfNeededLDAWSP emits the necessary LDAWSP instructions to move the SP only
+/// until 'OffsetFromTop' is within 'MaxImmU16' words of the SP.
+/// \param OffsetFromTop the spill offset from the top of the frame.
+/// \param [in,out] RemainingAdj the current SP offset from the top of the
+/// frame, in words.
+static void IfNeededLDAWSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &dl,
+ const TargetInstrInfo &TII, int OffsetFromTop,
+ int &RemainingAdj) {
+ while (OffsetFromTop < RemainingAdj - MaxImmU16) {
+ assert(RemainingAdj && "OffsetFromTop is beyond FrameSize");
+ int OpImm = (RemainingAdj > MaxImmU16) ? MaxImmU16 : RemainingAdj; // one step, capped at MaxImmU16
+ int Opcode = isImmU6(OpImm) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(OpImm);
+ RemainingAdj -= OpImm;
+ }
+}
+
+/// Creates an ordered list of registers that are spilled
+/// during the emitPrologue/emitEpilogue.
+/// Registers are ordered according to their frame offset.
+/// The sort is ascending, so the most negative offset comes first.
+static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
+ MachineFrameInfo &MFI, XCoreFunctionInfo *XFI,
+ bool fetchLR, bool fetchFP) {
+ if (fetchLR) { // caller wants the LR spill slot in the list
+ int Offset = MFI.getObjectOffset(XFI->getLRSpillSlot());
+ SpillList.push_back(StackSlotInfo(XFI->getLRSpillSlot(),
+ Offset,
+ XCore::LR));
+ }
+ if (fetchFP) { // caller wants the FP spill slot in the list
+ int Offset = MFI.getObjectOffset(XFI->getFPSpillSlot());
+ SpillList.push_back(StackSlotInfo(XFI->getFPSpillSlot(),
+ Offset,
+ FramePtr));
+ }
+ std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+}
+
+/// Creates an ordered list of EH info register 'spills'.
+/// These slots are only used by the unwinder and calls to llvm.eh.return().
+/// Each entry pairs one EH spill slot (frame index and offset) with a register.
+/// The sort is ascending, so the most negative offset comes first.
+static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
+ MachineFrameInfo &MFI, XCoreFunctionInfo *XFI,
+ const Constant *PersonalityFn,
+ const TargetLowering *TL) {
+ assert(XFI->hasEHSpillSlot() && "There are no EH register spill slots");
+ const int *EHSlot = XFI->getEHSpillSlot();
+ SpillList.push_back(
+ StackSlotInfo(EHSlot[0], MFI.getObjectOffset(EHSlot[0]),
+ TL->getExceptionPointerRegister(PersonalityFn)));
+ SpillList.push_back( // frame index and offset must both describe EHSlot[1]
+ StackSlotInfo(EHSlot[1], MFI.getObjectOffset(EHSlot[1]),
+ TL->getExceptionSelectorRegister(PersonalityFn)));
+ std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+}
+
+static MachineMemOperand *getFrameIndexMMO(MachineBasicBlock &MBB,
+ int FrameIndex,
+ MachineMemOperand::Flags flags) {
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FrameIndex), flags,
+ MFI.getObjectSize(FrameIndex), MFI.getObjectAlignment(FrameIndex)); // size/alignment of that frame object
+ return MMO;
+}
+
+
+/// Restore clobbered registers with their spill slot value.
+/// The SP will be adjusted at the same time, thus the SpillList must be ordered
+/// with the most negative offsets first (the order GetSpillList produces).
+static void RestoreSpillList(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &dl, const TargetInstrInfo &TII,
+ int &RemainingAdj,
+ SmallVectorImpl<StackSlotInfo> &SpillList) {
+ for (unsigned i = 0, e = SpillList.size(); i != e; ++i) {
+ assert(SpillList[i].Offset % 4 == 0 && "Misaligned stack offset");
+ assert(SpillList[i].Offset <= 0 && "Unexpected positive stack offset");
+ int OffsetFromTop = - SpillList[i].Offset/4; // bytes -> words
+ IfNeededLDAWSP(MBB, MBBI, dl, TII, OffsetFromTop, RemainingAdj);
+ int Offset = RemainingAdj - OffsetFromTop; // immediate for the LDWSP below
+ int Opcode = isImmU6(Offset) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode), SpillList[i].Reg)
+ .addImm(Offset)
+ .addMemOperand(getFrameIndexMMO(MBB, SpillList[i].FI,
+ MachineMemOperand::MOLoad));
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// XCoreFrameLowering:
+//===----------------------------------------------------------------------===//
+
+XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) { // presumably stack alignment 4, local area offset 0 - confirm against TargetFrameLowering
+ // Do nothing; the subtarget argument is not used here.
+}
+
+bool XCoreFrameLowering::hasFP(const MachineFunction &MF) const {
+ return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ MF.getFrameInfo().hasVarSizedObjects(); // or the frame has variable sized objects
+}
+
+void XCoreFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineModuleInfo *MMI = &MF.getMMI();
+ const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
+ const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc dl;
+
+ if (MFI.getMaxAlignment() > getStackAlignment())
+ report_fatal_error("emitPrologue unsupported alignment: "
+ + Twine(MFI.getMaxAlignment()));
+
+ const AttributeSet &PAL = MF.getFunction()->getAttributes();
+ if (PAL.hasAttrSomewhere(Attribute::Nest))
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::LDWSP_ru6), XCore::R11).addImm(0);
+ // FIX: Needs addMemOperand() but can't use getFixedStack() or getStack().
+
+ // Work out frame sizes (counted in 4-byte words).
+ // We will adjust the SP in stages towards the final FrameSize.
+ assert(MFI.getStackSize()%4 == 0 && "Misaligned frame size");
+ const int FrameSize = MFI.getStackSize() / 4;
+ int Adjusted = 0; // words the SP has been extended by so far
+
+ bool saveLR = XFI->hasLRSpillSlot();
+ bool UseENTSP = saveLR && FrameSize
+ && (MFI.getObjectOffset(XFI->getLRSpillSlot()) == 0);
+ if (UseENTSP)
+ saveLR = false; // ENTSP saves LR itself, so no separate store is needed
+ bool FP = hasFP(MF);
+ bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(MF);
+
+ if (UseENTSP) {
+ // Allocate space on the stack at the same time as saving LR.
+ Adjusted = (FrameSize > MaxImmU16) ? MaxImmU16 : FrameSize;
+ int Opcode = isImmU6(Adjusted) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
+ MBB.addLiveIn(XCore::LR);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode));
+ MIB.addImm(Adjusted);
+ MIB->addRegisterKilled(XCore::LR, MF.getSubtarget().getRegisterInfo(),
+ true);
+ if (emitFrameMoves) {
+ EmitDefCfaOffset(MBB, MBBI, dl, TII, Adjusted*4);
+ unsigned DRegNum = MRI->getDwarfRegNum(XCore::LR, true);
+ EmitCfiOffset(MBB, MBBI, dl, TII, DRegNum, 0);
+ }
+ }
+
+ // If necessary, save LR and FP to the stack, as we EXTSP.
+ SmallVector<StackSlotInfo,2> SpillList;
+ GetSpillList(SpillList, MFI, XFI, saveLR, FP);
+ // We want the nearest (negative) offsets first, so reverse list.
+ std::reverse(SpillList.begin(), SpillList.end());
+ for (unsigned i = 0, e = SpillList.size(); i != e; ++i) {
+ assert(SpillList[i].Offset % 4 == 0 && "Misaligned stack offset");
+ assert(SpillList[i].Offset <= 0 && "Unexpected positive stack offset");
+ int OffsetFromTop = - SpillList[i].Offset/4; // bytes -> words
+ IfNeededExtSP(MBB, MBBI, dl, TII, OffsetFromTop, Adjusted, FrameSize,
+ emitFrameMoves);
+ int Offset = Adjusted - OffsetFromTop; // immediate for the STWSP below
+ int Opcode = isImmU6(Offset) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+ MBB.addLiveIn(SpillList[i].Reg);
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode))
+ .addReg(SpillList[i].Reg, RegState::Kill)
+ .addImm(Offset)
+ .addMemOperand(getFrameIndexMMO(MBB, SpillList[i].FI,
+ MachineMemOperand::MOStore));
+ if (emitFrameMoves) {
+ unsigned DRegNum = MRI->getDwarfRegNum(SpillList[i].Reg, true);
+ EmitCfiOffset(MBB, MBBI, dl, TII, DRegNum, SpillList[i].Offset);
+ }
+ }
+
+ // Complete any remaining Stack adjustment.
+ IfNeededExtSP(MBB, MBBI, dl, TII, FrameSize, Adjusted, FrameSize,
+ emitFrameMoves);
+ assert(Adjusted==FrameSize && "IfNeededExtSP has not completed adjustment");
+
+ if (FP) {
+ // Set the FP from the SP.
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr).addImm(0);
+ if (emitFrameMoves)
+ EmitDefCfaRegister(MBB, MBBI, dl, TII, MF,
+ MRI->getDwarfRegNum(FramePtr, true));
+ }
+
+ if (emitFrameMoves) {
+ // Frame moves for callee saved.
+ for (const auto &SpillLabel : XFI->getSpillLabels()) {
+ MachineBasicBlock::iterator Pos = SpillLabel.first;
+ ++Pos; // place the CFI after the recorded spill instruction
+ const CalleeSavedInfo &CSI = SpillLabel.second;
+ int Offset = MFI.getObjectOffset(CSI.getFrameIdx());
+ unsigned DRegNum = MRI->getDwarfRegNum(CSI.getReg(), true);
+ EmitCfiOffset(MBB, Pos, dl, TII, DRegNum, Offset);
+ }
+ if (XFI->hasEHSpillSlot()) {
+ // The unwinder requires stack slot & CFI offsets for the exception info.
+ // We do not save/spill these registers.
+ const Function *Fn = MF.getFunction();
+ const Constant *PersonalityFn =
+ Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr;
+ SmallVector<StackSlotInfo, 2> SpillList;
+ GetEHSpillList(SpillList, MFI, XFI, PersonalityFn,
+ MF.getSubtarget().getTargetLowering());
+ assert(SpillList.size()==2 && "Unexpected SpillList size");
+ EmitCfiOffset(MBB, MBBI, dl, TII,
+ MRI->getDwarfRegNum(SpillList[0].Reg, true),
+ SpillList[0].Offset);
+ EmitCfiOffset(MBB, MBBI, dl, TII,
+ MRI->getDwarfRegNum(SpillList[1].Reg, true),
+ SpillList[1].Offset);
+ }
+ }
+}
+
+void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ DebugLoc dl = MBBI->getDebugLoc();
+ unsigned RetOpcode = MBBI->getOpcode();
+
+ // Work out frame sizes.
+ // RemainingAdj is the part of the frame, in words, still to be released.
+ int RemainingAdj = MFI.getStackSize();
+ assert(RemainingAdj%4 == 0 && "Misaligned frame size");
+ RemainingAdj /= 4;
+
+ if (RetOpcode == XCore::EH_RETURN) {
+ // 'Restore' the exception info the unwinder has placed into the stack
+ // slots.
+ const Function *Fn = MF.getFunction();
+ const Constant *PersonalityFn =
+ Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr;
+ SmallVector<StackSlotInfo, 2> SpillList;
+ GetEHSpillList(SpillList, MFI, XFI, PersonalityFn,
+ MF.getSubtarget().getTargetLowering());
+ RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
+
+ // Return to the landing pad.
+ unsigned EhStackReg = MBBI->getOperand(0).getReg();
+ unsigned EhHandlerReg = MBBI->getOperand(1).getReg();
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(EhStackReg);
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::BAU_1r)).addReg(EhHandlerReg);
+ MBB.erase(MBBI); // Erase the previous return instruction.
+ return;
+ }
+
+ bool restoreLR = XFI->hasLRSpillSlot();
+ bool UseRETSP = restoreLR && RemainingAdj
+ && (MFI.getObjectOffset(XFI->getLRSpillSlot()) == 0);
+ if (UseRETSP)
+ restoreLR = false; // no separate LR reload when the RETSP path is used
+ bool FP = hasFP(MF);
+
+ if (FP) // Restore the stack pointer.
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(FramePtr);
+
+ // If necessary, restore LR and FP from the stack, as we LDAWSP.
+ SmallVector<StackSlotInfo,2> SpillList;
+ GetSpillList(SpillList, MFI, XFI, restoreLR, FP);
+ RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
+
+ if (RemainingAdj) {
+ // Complete all but one of the remaining Stack adjustments.
+ IfNeededLDAWSP(MBB, MBBI, dl, TII, 0, RemainingAdj);
+ if (UseRETSP) {
+ // Fold epilogue into return instruction
+ assert(RetOpcode == XCore::RETSP_u6
+ || RetOpcode == XCore::RETSP_lu6);
+ int Opcode = isImmU6(RemainingAdj) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode))
+ .addImm(RemainingAdj);
+ for (unsigned i = 3, e = MBBI->getNumOperands(); i < e; ++i)
+ MIB->addOperand(MBBI->getOperand(i)); // copy any variadic operands
+ MBB.erase(MBBI); // Erase the previous return instruction.
+ } else {
+ int Opcode = isImmU6(RemainingAdj) ? XCore::LDAWSP_ru6 :
+ XCore::LDAWSP_lru6;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(RemainingAdj);
+ // Don't erase the return instruction.
+ }
+ } // else Don't erase the return instruction.
+}
+
+bool XCoreFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return true; // nothing to spill
+
+ MachineFunction *MF = MBB.getParent();
+ const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+ XCoreFunctionInfo *XFI = MF->getInfo<XCoreFunctionInfo>();
+ bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
+
+ DebugLoc DL;
+ if (MI != MBB.end() && !MI->isDebugValue())
+ DL = MI->getDebugLoc();
+
+ for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+ it != CSI.end(); ++it) {
+ unsigned Reg = it->getReg();
+ assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
+ "LR & FP are always handled in emitPrologue");
+
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, it->getFrameIdx(), RC, TRI);
+ if (emitFrameMoves) {
+ auto Store = MI;
+ --Store; // the instruction just before MI, taken to be the new store
+ XFI->getSpillLabels().push_back(std::make_pair(Store, *it)); // read back in emitPrologue
+ }
+ }
+ return true;
+}
+
+bool XCoreFrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const{
+ MachineFunction *MF = MBB.getParent();
+ const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+ bool AtStart = MI == MBB.begin();
+ MachineBasicBlock::iterator BeforeI = MI;
+ if (!AtStart)
+ --BeforeI; // remember the instruction preceding the insertion point
+ for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+ it != CSI.end(); ++it) {
+ unsigned Reg = it->getReg();
+ assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
+ "LR & FP are always handled in emitEpilogue");
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, it->getFrameIdx(), RC, TRI);
+ assert(MI != MBB.begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert multiple
+ // instructions, so MI is reset to the first of the code inserted so far.
+ if (AtStart)
+ MI = MBB.begin();
+ else {
+ MI = BeforeI;
+ ++MI;
+ }
+ }
+ return true;
+}
+
+// Replaces ADJCALLSTACKDOWN / ADJCALLSTACKUP pseudo instructions with real SP
+// adjustments, or simply erases them when no adjustment has to be emitted.
+MachineBasicBlock::iterator XCoreFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
+ if (!hasReservedCallFrame(MF)) {
+ // Turn the adjcallstackdown instruction into 'extsp <amt>' and the
+ // adjcallstackup instruction into 'ldaw sp, sp[<amt>]'
+ MachineInstr &Old = *I;
+ uint64_t Amount = Old.getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned Align = getStackAlignment();
+ Amount = (Amount+Align-1)/Align*Align;
+
+ assert(Amount%4 == 0);
+ Amount /= 4; // bytes -> words, the unit of the immediates below
+
+ bool isU6 = isImmU6(Amount);
+ if (!isU6 && !isImmU16(Amount)) {
+ // FIX could emit multiple instructions in this case.
+ // The amount comes from the function being compiled, so this must be
+ // diagnosed in release builds too; llvm_unreachable() would be
+ // undefined behaviour there.
+ report_fatal_error("eliminateCallFramePseudoInstr size too big: " +
+ Twine(Amount));
+ }
+
+ MachineInstr *New;
+ if (Old.getOpcode() == XCore::ADJCALLSTACKDOWN) {
+ int Opcode = isU6 ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+ New = BuildMI(MF, Old.getDebugLoc(), TII.get(Opcode)).addImm(Amount);
+ } else {
+ assert(Old.getOpcode() == XCore::ADJCALLSTACKUP);
+ int Opcode = isU6 ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+ New = BuildMI(MF, Old.getDebugLoc(), TII.get(Opcode), XCore::SP)
+ .addImm(Amount);
+ }
+
+ // Replace the pseudo instruction with a new instruction...
+ MBB.insert(I, New);
+ }
+ }
+
+ return MBB.erase(I); // the pseudo itself is always removed
+}
+
+void XCoreFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool LRUsed = MRI.isPhysRegModified(XCore::LR);
+
+ if (!LRUsed && !MF.getFunction()->isVarArg() &&
+ MF.getFrameInfo().estimateStackSize(MF))
+ // If we need to extend the stack it is more efficient to use entsp / retsp.
+ // We force the LR to be saved so these instructions are used.
+ LRUsed = true;
+
+ if (MF.callsUnwindInit() || MF.callsEHReturn()) {
+ // The unwinder expects to find spill slots for the exception info regs R0
+ // & R1. These are used during llvm.eh.return() to 'restore' the exception
+ // info. N.B. we do not spill or restore R0, R1 during normal operation.
+ XFI->createEHSpillSlot(MF);
+ // As we will have a stack, we force the LR to be saved.
+ LRUsed = true;
+ }
+
+ if (LRUsed) {
+ // We will handle the LR in the prologue/epilogue
+ // and allocate space on the stack ourselves.
+ SavedRegs.reset(XCore::LR);
+ XFI->createLRSpillSlot(MF);
+ }
+
+ if (hasFP(MF))
+ // A callee save register is used to hold the FP.
+ // This needs saving / restoring in the prologue / epilogue.
+ XFI->createFPSpillSlot(MF);
+}
+
+void XCoreFrameLowering::
+processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const {
+ assert(RS && "requiresRegisterScavenging failed");
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ // Reserve slots close to SP or frame pointer for Scavenging spills.
+ // When using SP for small frames, we don't need any scratch registers.
+ // When using SP for large frames, we may need 2 scratch registers.
+ // When using FP, for large or small frames, we may need 1 scratch register.
+ if (XFI->isLargeFrame(MF) || hasFP(MF)) // first scavenging slot
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
+ RC->getAlignment(),
+ false));
+ if (XFI->isLargeFrame(MF) && !hasFP(MF)) // second slot: large frame without FP
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
+ RC->getAlignment(),
+ false));
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
new file mode 100644
index 000000000000..8729d2208bb2
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
@@ -0,0 +1,63 @@
+//===-- XCoreFrameLowering.h - Frame info for XCore Target ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
+#define LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class XCoreSubtarget;
+
+  /// Frame-lowering hooks for the XCore target.
+  class XCoreFrameLowering : public TargetFrameLowering {
+  public:
+    XCoreFrameLowering(const XCoreSubtarget &STI);
+
+    /// Insert the prologue code for \p MF into \p MBB.
+    void emitPrologue(MachineFunction &MF,
+                      MachineBasicBlock &MBB) const override;
+
+    /// Insert the epilogue code for \p MF into \p MBB.
+    void emitEpilogue(MachineFunction &MF,
+                      MachineBasicBlock &MBB) const override;
+
+    bool
+    spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const std::vector<CalleeSavedInfo> &CSI,
+                              const TargetRegisterInfo *TRI) const override;
+
+    bool
+    restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI,
+                                const TargetRegisterInfo *TRI) const override;
+
+    MachineBasicBlock::iterator
+    eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
+
+    bool hasFP(const MachineFunction &MF) const override;
+
+    void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                              RegScavenger *RS = nullptr) const override;
+
+    void
+    processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                        RegScavenger *RS = nullptr) const override;
+
+    /// Size of one stack slot, in bytes.
+    static int stackSlotSize() { return 4; }
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
new file mode 100644
index 000000000000..4b10e71be03d
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -0,0 +1,66 @@
+//===-- XCoreFrameToArgsOffsetElim.cpp ----------------------------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Replace Pseudo FRAME_TO_ARGS_OFFSET with the appropriate real offset.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+namespace {
+  /// Machine-function pass that replaces each FRAME_TO_ARGS_OFFSET pseudo
+  /// instruction with an immediate load of the function's stack size.
+  struct XCoreFTAOElim : public MachineFunctionPass {
+    static char ID;
+
+    XCoreFTAOElim() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    /// This pass declares NoVRegs as a required function property.
+    MachineFunctionProperties getRequiredProperties() const override {
+      MachineFunctionProperties Required;
+      Required.set(MachineFunctionProperties::Property::NoVRegs);
+      return Required;
+    }
+
+    StringRef getPassName() const override {
+      return "XCore FRAME_TO_ARGS_OFFSET Elimination";
+    }
+  };
+
+  char XCoreFTAOElim::ID = 0;
+}
+
+/// Factory for the FRAME_TO_ARGS_OFFSET elimination pass.
+///
+/// \returns a newly allocated XCoreFTAOElim instance.
+FunctionPass *llvm::createXCoreFrameToArgsOffsetEliminationPass() {
+  FunctionPass *Pass = new XCoreFTAOElim();
+  return Pass;
+}
+
+/// Walk every instruction of \p MF and replace each FRAME_TO_ARGS_OFFSET
+/// pseudo with an immediate load of the stack size into the pseudo's
+/// destination register.
+///
+/// \returns true unconditionally.
+bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) {
+  const auto *InstrInfo =
+      static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const XCoreInstrInfo &TII = *InstrInfo;
+  unsigned FrameBytes = MF.getFrameInfo().getStackSize();
+
+  for (MachineBasicBlock &Block : MF) {
+    for (MachineBasicBlock::iterator It = Block.begin(), End = Block.end();
+         It != End; ++It) {
+      if (It->getOpcode() != XCore::FRAME_TO_ARGS_OFFSET)
+        continue;
+      MachineInstr &Pseudo = *It;
+      unsigned DestReg = Pseudo.getOperand(0).getReg();
+      // Continue iterating from whatever loadImmediate hands back, then drop
+      // the pseudo it replaces.
+      It = TII.loadImmediate(Block, It, DestReg, FrameBytes);
+      Pseudo.eraseFromParent();
+    }
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
new file mode 100644
index 000000000000..086d1d544f69
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -0,0 +1,282 @@
+//===-- XCoreISelDAGToDAG.cpp - A dag to dag inst selector for XCore ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+/// XCoreDAGToDAGISel - instruction selector that turns SelectionDAG
+/// operations into XCore machine instructions.
+///
+namespace {
+  class XCoreDAGToDAGISel : public SelectionDAGISel {
+
+  public:
+    XCoreDAGToDAGISel(XCoreTargetMachine &TM, CodeGenOpt::Level OptLevel)
+        : SelectionDAGISel(TM, OptLevel) {}
+
+    void Select(SDNode *N) override;
+    bool tryBRIND(SDNode *N);
+
+    /// getI32Imm - Build an i32 target constant holding \p Imm.
+    inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
+      return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+    }
+
+    /// immMskBitp - True when the constant node \p inN, truncated to 32 bits,
+    /// passes isMask_32 and its width (32 minus the leading-zero count) is
+    /// one of 1..8, 16, 24 or 32.
+    inline bool immMskBitp(SDNode *inN) const {
+      ConstantSDNode *Const = cast<ConstantSDNode>(inN);
+      const uint32_t Bits = (uint32_t)Const->getZExtValue();
+      if (!isMask_32(Bits))
+        return false;
+      const int Width = 32 - countLeadingZeros(Bits);
+      if (Width >= 1 && Width <= 8)
+        return true;
+      return Width == 16 || Width == 24 || Width == 32;
+    }
+
+    // Complex Pattern Selectors.
+    bool SelectADDRspii(SDValue Addr, SDValue &Base, SDValue &Offset);
+
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                      unsigned ConstraintID,
+                                      std::vector<SDValue> &OutOps) override;
+
+    StringRef getPassName() const override {
+      return "XCore DAG->DAG Pattern Instruction Selection";
+    }
+
+    // Pull in the selector pieces generated from the target description.
+  #include "XCoreGenDAGISel.inc"
+  };
+} // end anonymous namespace
+
+/// createXCoreISelDag - Factory for the pass that converts a legalized DAG
+/// into an XCore-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM,
+                                       CodeGenOpt::Level OptLevel) {
+  FunctionPass *Selector = new XCoreDAGToDAGISel(TM, OptLevel);
+  return Selector;
+}
+
+/// Match an address that is either a bare frame index or a frame index plus a
+/// non-negative constant that is a multiple of 4.
+///
+/// \param Addr   address expression to match.
+/// \param Base   on success, the target frame index.
+/// \param Offset on success, the i32 target constant offset (0 for a bare
+///               frame index).
+/// \returns true when \p Addr matched and the outputs were written.
+bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) {
+  // Case 1: the address is the frame index itself.
+  if (FrameIndexSDNode *Slot = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(Slot->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+    return true;
+  }
+
+  // Case 2: (add frameindex, constant).
+  if (Addr.getOpcode() != ISD::ADD)
+    return false;
+  FrameIndexSDNode *Slot = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0));
+  if (!Slot)
+    return false;
+  ConstantSDNode *Disp = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+  if (!Disp)
+    return false;
+  if (Disp->getSExtValue() % 4 != 0 || Disp->getSExtValue() < 0)
+    return false;
+
+  // Constant positive word offset from frame index
+  Base = CurDAG->getTargetFrameIndex(Slot->getIndex(), MVT::i32);
+  Offset = CurDAG->getTargetConstant(Disp->getSExtValue(), SDLoc(Addr),
+                                     MVT::i32);
+  return true;
+}
+
+/// Select the operands for an inline-asm memory constraint.
+///
+/// Only the 'm' constraint applied to a CP- or DP-relative wrapper is
+/// handled: the matching base register and the wrapped operand are appended
+/// to \p OutOps.
+///
+/// \returns false when the operands were appended, true when the constraint
+///          or operand kind is not handled here.
+bool XCoreDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  if (ConstraintID != InlineAsm::Constraint_m) // Memory.
+    return true;
+
+  SDValue BaseReg;
+  const unsigned Opc = Op.getOpcode();
+  if (Opc == XCoreISD::CPRelativeWrapper)
+    BaseReg = CurDAG->getRegister(XCore::CP, MVT::i32);
+  else if (Opc == XCoreISD::DPRelativeWrapper)
+    BaseReg = CurDAG->getRegister(XCore::DP, MVT::i32);
+  else
+    return true;
+
+  OutOps.push_back(BaseReg);
+  OutOps.push_back(Op.getOperand(0));
+  return false;
+}
+
+/// Select a machine instruction for \p N.
+///
+/// Constants, the XCoreISD long-arithmetic / MAC / CRC8 nodes and BRIND are
+/// handled by hand below; every node that is not replaced here is passed to
+/// the generated SelectCode().
+void XCoreDAGToDAGISel::Select(SDNode *N) {
+ SDLoc dl(N);
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue();
+ if (immMskBitp(N)) {
+ // Transformation function: get the size of a mask
+ // Look for the first non-zero bit
+ SDValue MskSize = getI32Imm(32 - countLeadingZeros((uint32_t)Val), dl);
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
+ MVT::i32, MskSize));
+ return;
+ }
+ else if (!isUInt<16>(Val)) {
+ // The value does not fit in 16 unsigned bits: place it in the constant
+ // pool and select an LDWCP_lru6 load of that entry.
+ SDValue CPIdx = CurDAG->getTargetConstantPool(
+ ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
+ getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
+ SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
+ MVT::Other, CPIdx,
+ CurDAG->getEntryNode());
+ // Attach a constant-pool load memory operand (size 4, alignment 4).
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] =
+ MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+ MachineMemOperand::MOLoad, 4, 4);
+ cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1);
+ ReplaceNode(N, node);
+ return;
+ }
+ // Any other constant is left to SelectCode().
+ break;
+ }
+ // The cases below forward the node's operands unchanged to the matching
+ // machine opcode, which is created with two i32 results.
+ case XCoreISD::LADD: {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2) };
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
+ }
+ case XCoreISD::LSUB: {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2) };
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
+ }
+ case XCoreISD::MACCU: {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3) };
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
+ }
+ case XCoreISD::MACCS: {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3) };
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
+ }
+ case XCoreISD::LMUL: {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3) };
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
+ }
+ case XCoreISD::CRC8: {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
+ }
+ case ISD::BRIND:
+ // tryBRIND() returns true only when it has already selected the node.
+ if (tryBRIND(N))
+ return;
+ break;
+ // Other cases are autogenerated.
+ }
+ SelectCode(N);
+}
+
+/// Build a chain equal to \p Chain with every occurrence of \p Old swapped
+/// for \p New.
+///
+/// Only two shapes are accepted: \p Chain is \p Old itself, or \p Chain is a
+/// TokenFactor that has \p Old among its operands. For anything else an empty
+/// SDValue is returned.
+static SDValue
+replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New)
+{
+  if (Chain == Old)
+    return New;
+  if (Chain->getOpcode() != ISD::TokenFactor)
+    return SDValue();
+
+  SmallVector<SDValue, 8> NewOperands;
+  bool Replaced = false;
+  for (unsigned Idx = 0, Count = Chain->getNumOperands(); Idx != Count; ++Idx) {
+    const bool IsOld = Chain->getOperand(Idx) == Old;
+    if (IsOld)
+      Replaced = true;
+    NewOperands.push_back(IsOld ? New : Chain->getOperand(Idx));
+  }
+
+  // A TokenFactor that never mentioned Old is not something we can rewrite.
+  if (!Replaced)
+    return SDValue();
+  return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
+                         NewOperands);
+}
+
+/// Try to select an indirect branch whose address operand is the
+/// xcore_checkevent intrinsic.
+///
+/// On a match the branch is rewritten into SETSR_branch_u6 / CLRSR_branch_u6
+/// (glued together) followed by BRFU_lu6 or BAU_1r to the intrinsic's address
+/// operand.
+///
+/// \returns true when \p N was selected here; false when the pattern does not
+///          match or the intrinsic's chain result cannot be bypassed.
+bool XCoreDAGToDAGISel::tryBRIND(SDNode *N) {
+ SDLoc dl(N);
+ // (brind (int_xcore_checkevent (addr)))
+ SDValue Chain = N->getOperand(0);
+ SDValue Addr = N->getOperand(1);
+ if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+ // Operand 1 of the intrinsic node holds the intrinsic ID.
+ unsigned IntNo = cast<ConstantSDNode>(Addr->getOperand(1))->getZExtValue();
+ if (IntNo != Intrinsic::xcore_checkevent)
+ return false;
+ SDValue nextAddr = Addr->getOperand(2);
+ // Result 1 of the intrinsic node is its outgoing chain.
+ SDValue CheckEventChainOut(Addr.getNode(), 1);
+ if (!CheckEventChainOut.use_empty()) {
+ // If the chain out of the checkevent intrinsic is an operand of the
+ // indirect branch or used in a TokenFactor which is the operand of the
+ // indirect branch then build a new chain which uses the chain coming into
+ // the checkevent intrinsic instead.
+ SDValue CheckEventChainIn = Addr->getOperand(0);
+ SDValue NewChain = replaceInChain(CurDAG, Chain, CheckEventChainOut,
+ CheckEventChainIn);
+ if (!NewChain.getNode())
+ return false;
+ Chain = NewChain;
+ }
+ // Enable events on the thread using setsr 1 and then disable them immediately
+ // after with clrsr 1. If any resources owned by the thread are ready an event
+ // will be taken. If no resource is ready we branch to the address which was
+ // the operand to the checkevent intrinsic.
+ SDValue constOne = getI32Imm(1, dl);
+ SDValue Glue =
+ SDValue(CurDAG->getMachineNode(XCore::SETSR_branch_u6, dl, MVT::Glue,
+ constOne, Chain), 0);
+ Glue =
+ SDValue(CurDAG->getMachineNode(XCore::CLRSR_branch_u6, dl, MVT::Glue,
+ constOne, Glue), 0);
+ // A PC-relative wrapper around a target block address is branched to with
+ // BRFU_lu6 on the wrapped operand; any other address uses BAU_1r.
+ if (nextAddr->getOpcode() == XCoreISD::PCRelativeWrapper &&
+ nextAddr->getOperand(0)->getOpcode() == ISD::TargetBlockAddress) {
+ CurDAG->SelectNodeTo(N, XCore::BRFU_lu6, MVT::Other,
+ nextAddr->getOperand(0), Glue);
+ return true;
+ }
+ CurDAG->SelectNodeTo(N, XCore::BAU_1r, MVT::Other, nextAddr, Glue);
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
new file mode 100644
index 000000000000..9244d594460f
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -0,0 +1,1948 @@
+//===-- XCoreISelLowering.cpp - XCore DAG Lowering Implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreISelLowering.h"
+#include "XCore.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
+#include "XCoreTargetMachine.h"
+#include "XCoreTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-lower"
+
+/// Map an XCoreISD opcode to its printable name.
+///
+/// \returns the node name, or nullptr for FIRST_NUMBER and for any opcode not
+///          listed below.
+const char *XCoreTargetLowering::
+getTargetNodeName(unsigned Opcode) const
+{
+  switch ((XCoreISD::NodeType)Opcode)
+  {
+  case XCoreISD::FIRST_NUMBER:
+    break;
+  case XCoreISD::BL:
+    return "XCoreISD::BL";
+  case XCoreISD::PCRelativeWrapper:
+    return "XCoreISD::PCRelativeWrapper";
+  case XCoreISD::DPRelativeWrapper:
+    return "XCoreISD::DPRelativeWrapper";
+  case XCoreISD::CPRelativeWrapper:
+    return "XCoreISD::CPRelativeWrapper";
+  case XCoreISD::LDWSP:
+    return "XCoreISD::LDWSP";
+  case XCoreISD::STWSP:
+    return "XCoreISD::STWSP";
+  case XCoreISD::RETSP:
+    return "XCoreISD::RETSP";
+  case XCoreISD::LADD:
+    return "XCoreISD::LADD";
+  case XCoreISD::LSUB:
+    return "XCoreISD::LSUB";
+  case XCoreISD::LMUL:
+    return "XCoreISD::LMUL";
+  case XCoreISD::MACCU:
+    return "XCoreISD::MACCU";
+  case XCoreISD::MACCS:
+    return "XCoreISD::MACCS";
+  case XCoreISD::CRC8:
+    return "XCoreISD::CRC8";
+  case XCoreISD::BR_JT:
+    return "XCoreISD::BR_JT";
+  case XCoreISD::BR_JT32:
+    return "XCoreISD::BR_JT32";
+  case XCoreISD::FRAME_TO_ARGS_OFFSET:
+    return "XCoreISD::FRAME_TO_ARGS_OFFSET";
+  case XCoreISD::EH_RETURN:
+    return "XCoreISD::EH_RETURN";
+  case XCoreISD::MEMBARRIER:
+    return "XCoreISD::MEMBARRIER";
+  }
+  return nullptr;
+}
+
+/// Configure the lowering rules for the XCore target.
+///
+/// Registers i32 in the GRRegs class, then records for each ISD operation
+/// whether it is Legal, Expand, Promote or Custom, sets the memset / memcpy /
+/// memmove store limits, registers the nodes with target DAG combines, and
+/// sets the function alignment.
+///
+/// \param TM        target machine, stored in the TM member.
+/// \param Subtarget subtarget, stored in the Subtarget member and used to
+///                  obtain the register info.
+XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
+ const XCoreSubtarget &Subtarget)
+ : TargetLowering(TM), TM(TM), Subtarget(Subtarget) {
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &XCore::GRRegsRegClass);
+
+ // Compute derived properties from the register classes
+ computeRegisterProperties(Subtarget.getRegisterInfo());
+
+ setStackPointerRegisterToSaveRestore(XCore::SP);
+
+ setSchedulingPreference(Sched::Source);
+
+ // Use i32 for setcc operations results (slt, sgt, ...).
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+ // XCore does not have the NodeTypes below.
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::ADDC, MVT::i32, Expand);
+ setOperationAction(ISD::ADDE, MVT::i32, Expand);
+ setOperationAction(ISD::SUBC, MVT::i32, Expand);
+ setOperationAction(ISD::SUBE, MVT::i32, Expand);
+
+ // 64bit
+ setOperationAction(ISD::ADD, MVT::i64, Custom);
+ setOperationAction(ISD::SUB, MVT::i64, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+ // Bit Manipulation
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL , MVT::i32, Expand);
+ setOperationAction(ISD::ROTR , MVT::i32, Expand);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // Jump tables.
+ setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32 , Custom);
+
+ // Conversion of i64 -> double produces constantpool nodes
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+
+ // Loads
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Expand);
+ }
+
+ // Custom expand misaligned loads / stores.
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+
+ // Varargs
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+
+ // Dynamic stack
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+ // Exception handling
+ setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+
+ // Atomic operations
+ // We request a fence for ATOMIC_* instructions, to reduce them to Monotonic.
+ // As we are always Sequential Consistent, an ATOMIC_FENCE becomes a no OP.
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+
+ // TRAMPOLINE is custom lowered.
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ // Limits on the number of stores used for inline memset (4) and for
+ // memmove / memcpy (2), with the same values when optimizing for size.
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 4;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize
+ = MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 2;
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+
+ // NOTE(review): the units of these alignment values (log2 vs. bytes) are
+ // not visible here - confirm against TargetLoweringBase.
+ setMinFunctionAlignment(1);
+ setPrefFunctionAlignment(2);
+}
+
+/// Report whether zero-extending \p Val to \p VT2 is free.
+///
+/// \returns true only when \p Val is a LOAD, both its type and \p VT2 are
+///          simple integer types, and the loaded type is i8.
+bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  EVT VT1 = Val.getValueType();
+  const bool BothSimpleIntegers = VT1.isSimple() && VT1.isInteger() &&
+                                  VT2.isSimple() && VT2.isInteger();
+  if (!BothSimpleIntegers)
+    return false;
+
+  return VT1.getSimpleVT().SimpleTy == MVT::i8;
+}
+
+/// Dispatch a custom-lowered operation to its dedicated Lower* routine.
+///
+/// Reaching this function with an opcode that has no case below hits
+/// llvm_unreachable.
+SDValue XCoreTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode())
+  {
+  case ISD::EH_RETURN:
+    return LowerEH_RETURN(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:
+    return LowerBlockAddress(Op, DAG);
+  case ISD::ConstantPool:
+    return LowerConstantPool(Op, DAG);
+  case ISD::BR_JT:
+    return LowerBR_JT(Op, DAG);
+  case ISD::LOAD:
+    return LowerLOAD(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+  case ISD::SMUL_LOHI:
+    return LowerSMUL_LOHI(Op, DAG);
+  case ISD::UMUL_LOHI:
+    return LowerUMUL_LOHI(Op, DAG);
+  // FIXME: Remove these when LegalizeDAGTypes lands.
+  case ISD::ADD:
+  case ISD::SUB:
+    return ExpandADDSUB(Op.getNode(), DAG);
+  case ISD::FRAMEADDR:
+    return LowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:
+    return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAME_TO_ARGS_OFFSET:
+    return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:
+    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:
+    return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::ATOMIC_FENCE:
+    return LowerATOMIC_FENCE(Op, DAG);
+  case ISD::ATOMIC_LOAD:
+    return LowerATOMIC_LOAD(Op, DAG);
+  case ISD::ATOMIC_STORE:
+    return LowerATOMIC_STORE(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+/// ReplaceNodeResults - Provide custom-built replacement values for a node
+/// whose result type is illegal. Only ADD and SUB are supported; they are
+/// expanded through ExpandADDSUB and the result is appended to \p Results.
+void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>&Results,
+                                             SelectionDAG &DAG) const {
+  const unsigned Opc = N->getOpcode();
+  if (Opc != ISD::ADD && Opc != ISD::SUB)
+    llvm_unreachable("Don't know how to custom expand this!");
+
+  Results.push_back(ExpandADDSUB(N, DAG));
+}
+
+//===----------------------------------------------------------------------===//
+// Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+
+/// Wrap the target global address \p GA in the relative-addressing node that
+/// suits \p GV:
+///   - functions                              -> PCRelativeWrapper
+///   - globals in a ".cp." section, or
+///     constant variables with local linkage  -> CPRelativeWrapper
+///   - everything else                        -> DPRelativeWrapper
+SDValue XCoreTargetLowering::getGlobalAddressWrapper(SDValue GA,
+                                                     const GlobalValue *GV,
+                                                     SelectionDAG &DAG) const {
+  // FIXME there is no actual debug info here
+  SDLoc dl(GA);
+
+  unsigned WrapperOpc = XCoreISD::DPRelativeWrapper;
+  if (GV->getValueType()->isFunctionTy()) {
+    WrapperOpc = XCoreISD::PCRelativeWrapper;
+  } else {
+    const auto *GVar = dyn_cast<GlobalVariable>(GV);
+    const bool InCPSection =
+        GV->hasSection() && GV->getSection().startswith(".cp.");
+    if (InCPSection ||
+        (GVar && GVar->isConstant() && GV->hasLocalLinkage()))
+      WrapperOpc = XCoreISD::CPRelativeWrapper;
+  }
+
+  return DAG.getNode(WrapperOpc, dl, MVT::i32, GA);
+}
+
+/// Decide whether \p GV counts as a "small" object.
+///
+/// Under the Small code model every global qualifies. Otherwise the global's
+/// value type must be sized, and its allocation size must be non-zero and
+/// below CodeModelLargeSize.
+static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL) {
+  const bool SmallModel =
+      XTL.getTargetMachine().getCodeModel() == CodeModel::Small;
+  if (SmallModel)
+    return true;
+
+  Type *ValueTy = GV->getValueType();
+  if (!ValueTy->isSized())
+    return false;
+
+  auto &Layout = GV->getParent()->getDataLayout();
+  unsigned AllocBytes = Layout.getTypeAllocSize(ValueTy);
+  return AllocBytes != 0 && AllocBytes < CodeModelLargeSize;
+}
+
+/// Lower a GlobalAddress node.
+///
+/// Small objects become a wrapped target global address, with as much of the
+/// offset folded in as allowed and the remainder added explicitly. Any other
+/// object is addressed by loading a constant-pool entry that holds the i8 GEP
+/// of the global at the requested offset.
+SDValue XCoreTargetLowering::
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GN->getGlobal();
+  SDLoc DL(GN);
+  int64_t Offset = GN->getOffset();
+
+  if (!IsSmallObject(GV, *this)) {
+    // Ideally we would not fold in offset with an index <= 11.
+    Type *BytePtrTy = Type::getInt8PtrTy(*DAG.getContext());
+    Constant *AsBytePtr =
+        ConstantExpr::getBitCast(const_cast<GlobalValue*>(GV), BytePtrTy);
+    Type *IndexTy = Type::getInt32Ty(*DAG.getContext());
+    Constant *Index = ConstantInt::get(IndexTy, Offset);
+    Constant *Element = ConstantExpr::getGetElementPtr(
+        Type::getInt8Ty(*DAG.getContext()), AsBytePtr, Index);
+    SDValue PoolEntry = DAG.getConstantPool(Element, MVT::i32);
+    return DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL,
+                       DAG.getEntryNode(), PoolEntry, MachinePointerInfo());
+  }
+
+  // We can only fold positive offsets that are a multiple of the word size.
+  int64_t Folded = std::max(Offset & ~3, (int64_t)0);
+  SDValue Wrapped = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Folded);
+  Wrapped = getGlobalAddressWrapper(Wrapped, GV, DAG);
+  if (Offset == Folded)
+    return Wrapped;
+
+  // Handle the rest of the offset.
+  SDValue Leftover = DAG.getConstant(Offset - Folded, DL, MVT::i32);
+  return DAG.getNode(ISD::ADD, DL, MVT::i32, Wrapped, Leftover);
+}
+
+/// Lower a BlockAddress node to a PC-relative wrapper around the matching
+/// target block address.
+SDValue XCoreTargetLowering::
+LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  SDLoc Loc(Op);
+  auto PointerVT = getPointerTy(DAG.getDataLayout());
+  BlockAddressSDNode *Node = cast<BlockAddressSDNode>(Op);
+  const BlockAddress *Target = Node->getBlockAddress();
+  SDValue TargetAddr = DAG.getTargetBlockAddress(Target, PointerVT);
+
+  return DAG.getNode(XCoreISD::PCRelativeWrapper, Loc, PointerVT, TargetAddr);
+}
+
+/// Lower a ConstantPool node to a CP-relative wrapper around the matching
+/// target constant-pool node, carrying over the entry's alignment and offset.
+SDValue XCoreTargetLowering::
+LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  // FIXME there isn't really debug info here
+  SDLoc dl(CP);
+  EVT PtrVT = Op.getValueType();
+
+  // Machine constant-pool values and plain IR constants are stored
+  // differently, so each needs its own getTargetConstantPool overload.
+  SDValue Entry =
+      CP->isMachineConstantPoolEntry()
+          ? DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                      CP->getAlignment(), CP->getOffset())
+          : DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                      CP->getAlignment(), CP->getOffset());
+
+  return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, Entry);
+}
+
+/// Jump tables on XCore use the EK_Inline encoding.
+unsigned XCoreTargetLowering::getJumpTableEncoding() const {
+  const unsigned Encoding = MachineJumpTableInfo::EK_Inline;
+  return Encoding;
+}
+
+/// Lower a jump-table branch.
+///
+/// Tables with at most 32 entries become XCoreISD::BR_JT on the raw index.
+/// Larger tables become XCoreISD::BR_JT32 with the index shifted left by one.
+SDValue XCoreTargetLowering::
+LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  SDLoc dl(Op);
+
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  unsigned TableIdx = JT->getIndex();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const MachineJumpTableInfo *TableInfo = MF.getJumpTableInfo();
+  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
+
+  unsigned EntryCount = TableInfo->getJumpTables()[TableIdx].MBBs.size();
+  if (EntryCount > 32) {
+    assert((EntryCount >> 31) == 0);
+    SDValue Doubled = DAG.getNode(ISD::SHL, dl, MVT::i32, Index,
+                                  DAG.getConstant(1, dl, MVT::i32));
+    return DAG.getNode(XCoreISD::BR_JT32, dl, MVT::Other, Chain, TargetJT,
+                       Doubled);
+  }
+
+  return DAG.getNode(XCoreISD::BR_JT, dl, MVT::Other, Chain, TargetJT, Index);
+}
+
+/// Load one word from \p Base + \p Offset.
+///
+/// When \p Offset is a multiple of 4 a single load from \p Base is emitted.
+/// Otherwise the two words at the offsets rounded down and up to a multiple
+/// of 4 are loaded, shifted and OR-ed together, and the result is returned
+/// merged with a TokenFactor of both load chains.
+SDValue XCoreTargetLowering::lowerLoadWordFromAlignedBasePlusOffset(
+    const SDLoc &DL, SDValue Chain, SDValue Base, int64_t Offset,
+    SelectionDAG &DAG) const {
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  const bool OffsetIsWordMultiple = (Offset & 0x3) == 0;
+  if (OffsetIsWordMultiple)
+    return DAG.getLoad(PtrVT, DL, Chain, Base, MachinePointerInfo());
+
+  // Lower to pair of consecutive word aligned loads plus some bit shifting.
+  int32_t UpperOffset = alignTo(Offset, 4);
+  int32_t LowerOffset = UpperOffset - 4;
+
+  // Global bases get the offset folded into a fresh global address; any other
+  // base gets an explicit ADD.
+  GlobalAddressSDNode *GlobalBase =
+      dyn_cast<GlobalAddressSDNode>(Base.getNode());
+  auto AddressAt = [&](int32_t Off) -> SDValue {
+    if (GlobalBase)
+      return DAG.getGlobalAddress(GlobalBase->getGlobal(), DL,
+                                  Base.getValueType(), Off);
+    return DAG.getNode(ISD::ADD, DL, MVT::i32, Base,
+                       DAG.getConstant(Off, DL, MVT::i32));
+  };
+  SDValue LowerAddr = AddressAt(LowerOffset);
+  SDValue UpperAddr = AddressAt(UpperOffset);
+
+  SDValue LowerShift =
+      DAG.getConstant((Offset - LowerOffset) * 8, DL, MVT::i32);
+  SDValue UpperShift =
+      DAG.getConstant((UpperOffset - Offset) * 8, DL, MVT::i32);
+
+  SDValue LowerWord =
+      DAG.getLoad(PtrVT, DL, Chain, LowerAddr, MachinePointerInfo());
+  SDValue UpperWord =
+      DAG.getLoad(PtrVT, DL, Chain, UpperAddr, MachinePointerInfo());
+  SDValue LowerPart =
+      DAG.getNode(ISD::SRL, DL, MVT::i32, LowerWord, LowerShift);
+  SDValue UpperPart =
+      DAG.getNode(ISD::SHL, DL, MVT::i32, UpperWord, UpperShift);
+  SDValue Combined = DAG.getNode(ISD::OR, DL, MVT::i32, LowerPart, UpperPart);
+
+  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LowerWord.getValue(1),
+                      UpperWord.getValue(1));
+  SDValue Merged[] = { Combined, Chain };
+  return DAG.getMergeValues(Merged, DL);
+}
+
+/// True when the known-bits analysis proves that the two lowest bits of
+/// \p Value are zero.
+static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
+{
+  APInt Zeros, Ones;
+  DAG.computeKnownBits(Value, Zeros, Ones);
+  const unsigned KnownLowZeroBits = Zeros.countTrailingOnes();
+  return KnownLowZeroBits >= 2;
+}
+
+/// Custom-lower an i32 load whose alignment is below the ABI alignment.
+///
+/// Returns an empty SDValue (leaving the load as it is) when misaligned
+/// accesses are allowed for it or when it is already ABI-aligned. Otherwise
+/// one of three strategies is used, tried in this order:
+///  1. Non-volatile load from a word-aligned base (or a 4-byte-aligned
+///     global) plus a constant offset: lowered through
+///     lowerLoadWordFromAlignedBasePlusOffset().
+///  2. Alignment of exactly 2: two 16-bit loads combined with shift and OR.
+///  3. Anything else: a call to __misaligned_load(BasePtr).
+///
+/// \returns the merged {value, chain} pair of the replacement, or an empty
+///          SDValue when no custom lowering is needed.
+SDValue XCoreTargetLowering::
+LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Unexpected extension type");
+ assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
+ if (allowsMisalignedMemoryAccesses(LD->getMemoryVT(),
+ LD->getAddressSpace(),
+ LD->getAlignment()))
+ return SDValue();
+
+ auto &TD = DAG.getDataLayout();
+ unsigned ABIAlignment = TD.getABITypeAlignment(
+ LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+ // Leave aligned load alone.
+ if (LD->getAlignment() >= ABIAlignment)
+ return SDValue();
+
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ SDLoc DL(Op);
+
+ // Volatile loads skip the aligned-base-plus-offset strategy.
+ if (!LD->isVolatile()) {
+ const GlobalValue *GV;
+ int64_t Offset = 0;
+ // Base known to be word aligned, plus a constant offset.
+ if (DAG.isBaseWithConstantOffset(BasePtr) &&
+ isWordAligned(BasePtr->getOperand(0), DAG)) {
+ SDValue NewBasePtr = BasePtr->getOperand(0);
+ Offset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
+ return lowerLoadWordFromAlignedBasePlusOffset(DL, Chain, NewBasePtr,
+ Offset, DAG);
+ }
+ // Global plus offset, where MinAlign of the global's alignment and 4 is 4.
+ if (TLI.isGAPlusOffset(BasePtr.getNode(), GV, Offset) &&
+ MinAlign(GV->getAlignment(), 4) == 4) {
+ SDValue NewBasePtr = DAG.getGlobalAddress(GV, DL,
+ BasePtr->getValueType(0));
+ return lowerLoadWordFromAlignedBasePlusOffset(DL, Chain, NewBasePtr,
+ Offset, DAG);
+ }
+ }
+
+ // Halfword-aligned: load the two 16-bit halves separately. The low half is
+ // zero-extended; the high half is shifted left by 16 before the OR.
+ if (LD->getAlignment() == 2) {
+ SDValue Low =
+ DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, BasePtr,
+ LD->getPointerInfo(), MVT::i16,
+ /* Alignment = */ 2, LD->getMemOperand()->getFlags());
+ SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue High =
+ DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, HighAddr,
+ LD->getPointerInfo().getWithOffset(2), MVT::i16,
+ /* Alignment = */ 2, LD->getMemOperand()->getFlags());
+ SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High,
+ DAG.getConstant(16, DL, MVT::i32));
+ SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Low, HighShifted);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
+ High.getValue(1));
+ SDValue Ops[] = { Result, Chain };
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ // Lower to a call to __misaligned_load(BasePtr).
+ Type *IntPtrTy = TD.getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+
+ Entry.Ty = IntPtrTy;
+ Entry.Node = BasePtr;
+ Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(Chain).setCallee(
+ CallingConv::C, IntPtrTy,
+ DAG.getExternalSymbol("__misaligned_load",
+ getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+
+ // LowerCallTo yields {return value, chain}; forward both as merged values.
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ SDValue Ops[] = { CallResult.first, CallResult.second };
+ return DAG.getMergeValues(Ops, DL);
+}
+
+/// Custom lowering for a non-truncating i32 store (both properties are
+/// asserted). Returns an empty SDValue, leaving the node untouched, when
+/// allowsMisalignedMemoryAccesses() accepts the access or when the store's
+/// alignment already meets the ABI alignment of its memory type. Otherwise
+/// the store is split into two 16-bit stores (alignment 2) or replaced by a
+/// call to __misaligned_store.
+SDValue XCoreTargetLowering::
+LowerSTORE(SDValue Op, SelectionDAG &DAG) const
+{
+ StoreSDNode *ST = cast<StoreSDNode>(Op);
+ assert(!ST->isTruncatingStore() && "Unexpected store type");
+ assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
+ // Nothing to do if the target hook accepts this access as-is.
+ if (allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
+ ST->getAddressSpace(),
+ ST->getAlignment())) {
+ return SDValue();
+ }
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
+ ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+ // Leave aligned store alone.
+ if (ST->getAlignment() >= ABIAlignment) {
+ return SDValue();
+ }
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ SDValue Value = ST->getValue();
+ SDLoc dl(Op);
+
+ // 2-byte aligned: truncating i16 store of the value at BasePtr, and a
+ // truncating i16 store of (value >> 16) at BasePtr + 2. Both stores use the
+ // incoming chain and are joined with a TokenFactor.
+ if (ST->getAlignment() == 2) {
+ SDValue Low = Value;
+ SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
+ DAG.getConstant(16, dl, MVT::i32));
+ SDValue StoreLow = DAG.getTruncStore(
+ Chain, dl, Low, BasePtr, ST->getPointerInfo(), MVT::i16,
+ /* Alignment = */ 2, ST->getMemOperand()->getFlags());
+ SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+ DAG.getConstant(2, dl, MVT::i32));
+ SDValue StoreHigh = DAG.getTruncStore(
+ Chain, dl, High, HighAddr, ST->getPointerInfo().getWithOffset(2),
+ MVT::i16, /* Alignment = */ 2, ST->getMemOperand()->getFlags());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh);
+ }
+
+ // Lower to a call to __misaligned_store(BasePtr, Value).
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+
+ Entry.Ty = IntPtrTy;
+ Entry.Node = BasePtr;
+ Args.push_back(Entry);
+
+ // Entry.Ty is left as IntPtrTy for the stored value as well.
+ Entry.Node = Value;
+ Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(Chain).setCallee(
+ CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__misaligned_store",
+ getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+
+ // The callee returns void, so only the output chain is of interest.
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+/// Lower a 32-bit ISD::SMUL_LOHI to one XCoreISD::MACCS node whose first two
+/// operands are the constant zero and whose last two are the multiply
+/// operands. Result 0 of that node is used as the high half and result 1 as
+/// the low half; the merged value is ordered {Lo, Hi}.
+SDValue XCoreTargetLowering::
+LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::SMUL_LOHI &&
+         "Unexpected operand to lower!");
+  SDLoc Loc(Op);
+  SDValue Multiplicand = Op.getOperand(0);
+  SDValue Multiplier = Op.getOperand(1);
+  SDValue ZeroConst = DAG.getConstant(0, Loc, MVT::i32);
+  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::i32);
+  SDValue HighHalf = DAG.getNode(XCoreISD::MACCS, Loc, ResultTys, ZeroConst,
+                                 ZeroConst, Multiplicand, Multiplier);
+  SDValue LowHalf = HighHalf.getValue(1);
+  SDValue Parts[] = { LowHalf, HighHalf };
+  return DAG.getMergeValues(Parts, Loc);
+}
+
+/// Lower a 32-bit ISD::UMUL_LOHI to one XCoreISD::LMUL node whose first two
+/// operands are the multiply operands and whose last two are the constant
+/// zero. Result 0 of that node is used as the high half and result 1 as the
+/// low half; the merged value is ordered {Lo, Hi}.
+SDValue XCoreTargetLowering::
+LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::UMUL_LOHI &&
+         "Unexpected operand to lower!");
+  SDLoc Loc(Op);
+  SDValue Multiplicand = Op.getOperand(0);
+  SDValue Multiplier = Op.getOperand(1);
+  SDValue ZeroConst = DAG.getConstant(0, Loc, MVT::i32);
+  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::i32);
+  SDValue HighHalf = DAG.getNode(XCoreISD::LMUL, Loc, ResultTys, Multiplicand,
+                                 Multiplier, ZeroConst, ZeroConst);
+  SDValue LowHalf = HighHalf.getValue(1);
+  SDValue Parts[] = { LowHalf, HighHalf };
+  return DAG.getMergeValues(Parts, Loc);
+}
+
+/// isADDADDMUL - Test whether Op has a shape equivalent to
+/// add(add(mul(x,y),a),b), accepting the operands in any of the orders
+/// add(add(a,b),mul), add(add(mul,a),b) and add(add(a,mul),b).
+/// When requireIntermediatesHaveOneUse is set, the inner add and the mul
+/// must each have exactly one use. On success the multiply operands are
+/// written to Mul0/Mul1 and the two addends to Addend0/Addend1; on failure
+/// the output parameters are left untouched.
+static bool
+isADDADDMUL(SDValue Op, SDValue &Mul0, SDValue &Mul1, SDValue &Addend0,
+            SDValue &Addend1, bool requireIntermediatesHaveOneUse)
+{
+  if (Op.getOpcode() != ISD::ADD)
+    return false;
+
+  // Pick the operand of the outer add that is itself an add; operand 0 wins
+  // when both qualify.
+  const bool InnerIsFirst = Op.getOperand(0).getOpcode() == ISD::ADD;
+  if (!InnerIsFirst && Op.getOperand(1).getOpcode() != ISD::ADD)
+    return false;
+  SDValue Inner = Op.getOperand(InnerIsFirst ? 0 : 1);
+  SDValue Outer = Op.getOperand(InnerIsFirst ? 1 : 0);
+
+  // An intermediate value is acceptable if single-use is not required or it
+  // really has one use.
+  auto Acceptable = [requireIntermediatesHaveOneUse](SDValue V) {
+    return !requireIntermediatesHaveOneUse || V.hasOneUse();
+  };
+
+  if (!Acceptable(Inner))
+    return false;
+
+  if (Outer.getOpcode() == ISD::MUL) {
+    // add(add(a,b),mul(x,y))
+    if (!Acceptable(Outer))
+      return false;
+    Mul0 = Outer.getOperand(0);
+    Mul1 = Outer.getOperand(1);
+    Addend0 = Inner.getOperand(0);
+    Addend1 = Inner.getOperand(1);
+    return true;
+  }
+
+  // add(add(mul(x,y),a),b) or add(add(a,mul(x,y)),b): the first operand of
+  // the inner add that is a mul decides the outcome.
+  for (unsigned MulIdx = 0; MulIdx != 2; ++MulIdx) {
+    SDValue Candidate = Inner.getOperand(MulIdx);
+    if (Candidate.getOpcode() != ISD::MUL)
+      continue;
+    if (!Acceptable(Candidate))
+      return false;
+    Mul0 = Candidate.getOperand(0);
+    Mul1 = Candidate.getOperand(1);
+    Addend0 = Inner.getOperand(1 - MulIdx);
+    Addend1 = Outer;
+    return true;
+  }
+  return false;
+}
+
+/// Try to expand add(mul(a,b), c) (in either operand order) using the
+/// XCore multiply-accumulate nodes. Returns an empty SDValue when neither
+/// operand of N is an ISD::MUL. Otherwise a BUILD_PAIR of two i32 halves is
+/// returned:
+///  - MACCU alone when the top 32 of 64 bits of both multiply operands are
+///    known zero;
+///  - MACCS alone when both multiply operands have more than 32 sign bits;
+///  - MACCU on the low halves plus the two 32-bit cross products added into
+///    the high half otherwise.
+SDValue XCoreTargetLowering::
+TryExpandADDWithMul(SDNode *N, SelectionDAG &DAG) const
+{
+  // Locate the multiply; operand 0 is preferred when both are multiplies.
+  unsigned MulIdx;
+  if (N->getOperand(0).getOpcode() == ISD::MUL)
+    MulIdx = 0;
+  else if (N->getOperand(1).getOpcode() == ISD::MUL)
+    MulIdx = 1;
+  else
+    return SDValue();
+  SDValue Product = N->getOperand(MulIdx);
+  SDValue Addend = N->getOperand(1 - MulIdx);
+  SDValue MulLHS = Product.getOperand(0);
+  SDValue MulRHS = Product.getOperand(1);
+
+  SDLoc Loc(N);
+  // Extract 32-bit element Idx (0 or 1) of V.
+  auto Half = [&DAG, &Loc](SDValue V, unsigned Idx) {
+    return DAG.getNode(ISD::EXTRACT_ELEMENT, Loc, MVT::i32, V,
+                       DAG.getConstant(Idx, Loc, MVT::i32));
+  };
+
+  SDValue LoL = Half(MulLHS, 0);
+  SDValue LoR = Half(MulRHS, 0);
+  SDValue AddLo = Half(Addend, 0);
+  SDValue AddHi = Half(Addend, 1);
+
+  // Multiply-accumulate of the low halves onto the addend; result 0 is used
+  // as the high word and result 1 as the low word.
+  auto MulAcc = [&](unsigned Opcode) {
+    return DAG.getNode(Opcode, Loc, DAG.getVTList(MVT::i32, MVT::i32), AddHi,
+                       AddLo, LoL, LoR);
+  };
+  auto Pair = [&DAG, &Loc](SDValue LoWord, SDValue HiWord) {
+    return DAG.getNode(ISD::BUILD_PAIR, Loc, MVT::i64, LoWord, HiWord);
+  };
+
+  APInt Upper32 = APInt::getHighBitsSet(64, 32);
+  unsigned SignBitsL = DAG.ComputeNumSignBits(MulLHS);
+  unsigned SignBitsR = DAG.ComputeNumSignBits(MulRHS);
+  if (DAG.MaskedValueIsZero(MulLHS, Upper32) &&
+      DAG.MaskedValueIsZero(MulRHS, Upper32)) {
+    // The inputs are both zero-extended.
+    SDValue Acc = MulAcc(XCoreISD::MACCU);
+    return Pair(Acc.getValue(1), Acc);
+  }
+  if (SignBitsL > 32 && SignBitsR > 32) {
+    // The inputs are both sign-extended.
+    SDValue Acc = MulAcc(XCoreISD::MACCS);
+    return Pair(Acc.getValue(1), Acc);
+  }
+
+  // General case: add both cross products into the high word.
+  SDValue HiL = Half(MulLHS, 1);
+  SDValue HiR = Half(MulRHS, 1);
+  SDValue Acc = MulAcc(XCoreISD::MACCU);
+  SDValue ResLo = Acc.getValue(1);
+  SDValue CrossLR = DAG.getNode(ISD::MUL, Loc, MVT::i32, LoL, HiR);
+  SDValue CrossRL = DAG.getNode(ISD::MUL, Loc, MVT::i32, HiL, LoR);
+  SDValue ResHi = DAG.getNode(ISD::ADD, Loc, MVT::i32, Acc, CrossLR);
+  ResHi = DAG.getNode(ISD::ADD, Loc, MVT::i32, ResHi, CrossRL);
+  return Pair(ResLo, ResHi);
+}
+
+/// Expand a 64-bit ISD::ADD or ISD::SUB into two chained 32-bit
+/// XCoreISD::LADD / XCoreISD::LSUB nodes. For ADD, TryExpandADDWithMul is
+/// consulted first and its result is returned if non-empty.
+///
+/// \param N   the i64 ADD or SUB node (asserted).
+/// \param DAG the DAG in which new nodes are created.
+/// \returns a BUILD_PAIR of the low and high 32-bit results.
+SDValue XCoreTargetLowering::
+ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const
+{
+  assert(N->getValueType(0) == MVT::i64 &&
+         (N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+         "Unknown operand to lower!");
+
+  if (N->getOpcode() == ISD::ADD)
+    if (SDValue Result = TryExpandADDWithMul(N, DAG))
+      return Result;
+
+  SDLoc dl(N);
+
+  // Extract components
+  SDValue LHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(0),
+                             DAG.getConstant(0, dl, MVT::i32));
+  SDValue LHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(0),
+                             DAG.getConstant(1, dl, MVT::i32));
+  SDValue RHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1),
+                             DAG.getConstant(0, dl, MVT::i32));
+  SDValue RHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1),
+                             DAG.getConstant(1, dl, MVT::i32));
+
+  // Expand: the low halves are combined with a zero third operand; result 1
+  // of that node is fed as the third operand of the high-half operation.
+  unsigned Opcode = (N->getOpcode() == ISD::ADD) ? XCoreISD::LADD :
+                                                   XCoreISD::LSUB;
+  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+  SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                           LHSL, RHSL, Zero);
+  SDValue Carry(Lo.getNode(), 1);
+
+  // Result 1 of the high-half node is deliberately not used.
+  SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                           LHSH, RHSH, Carry);
+  // Merge the pieces
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+/// Lower a va_arg node: load the current argument pointer from the va_list,
+/// store back the pointer advanced by the argument's size in bytes, then
+/// load the argument itself through the original pointer.
+SDValue XCoreTargetLowering::
+LowerVAARG(SDValue Op, SelectionDAG &DAG) const
+{
+  // Whilst llvm does not support aggregate varargs we can ignore
+  // the possibility of the ValueType being an implicit byVal vararg.
+  SDNode *VAArgNode = Op.getNode();
+  SDLoc Loc(VAArgNode);
+  EVT ArgVT = VAArgNode->getValueType(0); // not an aggregate
+  SDValue ListAddr = VAArgNode->getOperand(1);
+  EVT ListVT = ListAddr.getValueType();
+  const Value *SrcVal =
+      cast<SrcValueSDNode>(VAArgNode->getOperand(2))->getValue();
+
+  // Fetch the pointer to the current vararg.
+  SDValue CurArg = DAG.getLoad(ListVT, Loc, VAArgNode->getOperand(0), ListAddr,
+                               MachinePointerInfo(SrcVal));
+  // Advance it past this argument.
+  SDValue ArgBytes = DAG.getIntPtrConstant(ArgVT.getSizeInBits() / 8, Loc);
+  SDValue NextArg = DAG.getNode(ISD::ADD, Loc, ListVT, CurArg, ArgBytes);
+  // Write the advanced pointer back to the va_list.
+  SDValue StoreChain = DAG.getStore(CurArg.getValue(1), Loc, NextArg, ListAddr,
+                                    MachinePointerInfo(SrcVal));
+  // Read the argument from where the va_list pointed before the update.
+  return DAG.getLoad(ArgVT, Loc, StoreChain, CurArg, MachinePointerInfo());
+}
+
+/// Lower va_start: store the address of the function's VarArgsFrameIndex
+/// slot into the memory location given by operand 1, chained on operand 0.
+SDValue XCoreTargetLowering::
+LowerVASTART(SDValue Op, SelectionDAG &DAG) const
+{
+  SDLoc Loc(Op);
+  XCoreFunctionInfo *FuncInfo =
+      DAG.getMachineFunction().getInfo<XCoreFunctionInfo>();
+  const int VarArgsFI = FuncInfo->getVarArgsFrameIndex();
+  SDValue SlotAddr = DAG.getFrameIndex(VarArgsFI, MVT::i32);
+  SDValue InChain = Op.getOperand(0);
+  SDValue Dest = Op.getOperand(1);
+  return DAG.getStore(InChain, Loc, SlotAddr, Dest, MachinePointerInfo());
+}
+
+/// Lower llvm.frameaddress. Operand 0 is the frame depth: zero means the
+/// current function's frame, one its parent's, and so on. Only depth zero
+/// is handled; any other depth yields an empty SDValue.
+SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  const uint64_t Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  // Depths > 0 not supported yet!
+  if (Depth != 0)
+    return SDValue();
+
+  MachineFunction &Fn = DAG.getMachineFunction();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const unsigned FrameReg = TRI->getFrameRegister(Fn);
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FrameReg,
+                            MVT::i32);
+}
+
+/// Lower llvm.returnaddress. Operand 0 is the frame depth: zero means the
+/// current function's return address, one its parent's, and so on. Only
+/// depth zero is handled; any other depth yields an empty SDValue. The
+/// address is loaded from the slot returned by createLRSpillSlot().
+SDValue XCoreTargetLowering::
+LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
+  const uint64_t Depth =
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  // Depths > 0 not supported yet!
+  if (Depth != 0)
+    return SDValue();
+
+  MachineFunction &Fn = DAG.getMachineFunction();
+  XCoreFunctionInfo *FuncInfo = Fn.getInfo<XCoreFunctionInfo>();
+  const int SpillFI = FuncInfo->createLRSpillSlot(Fn);
+  SDValue SpillAddr = DAG.getFrameIndex(SpillFI, MVT::i32);
+  return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op),
+                     DAG.getEntryNode(), SpillAddr,
+                     MachinePointerInfo::getFixedStack(Fn, SpillFI));
+}
+
+/// Emit an XCoreISD::FRAME_TO_ARGS_OFFSET placeholder node of type i32.
+SDValue XCoreTargetLowering::
+LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const {
+  // The node stands for the offset from the frame pointer to the first
+  // on-stack argument, which is needed for correct stack adjustment during
+  // unwind. That offset is not known until after the frame has been
+  // finalised, so it is resolved later by the XCoreFTAOElim pass.
+  SDLoc Loc(Op);
+  return DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, Loc, MVT::i32);
+}
+
+/// Lower OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER), the node for the
+/// 'eh_return' gcc dwarf builtin used to return from an exception: adjust
+/// the stack by OFFSET and pass execution to HANDLER. The new stack pointer
+/// and the handler are copied into R2 and R3 and handed to an
+/// XCoreISD::EH_RETURN node.
+SDValue XCoreTargetLowering::
+LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  MachineFunction &Fn = DAG.getMachineFunction();
+  SDValue InChain = Op.getOperand(0);
+  SDValue SPAdjust = Op.getOperand(1);
+  SDValue Landing = Op.getOperand(2);
+
+  // Absolute SP = (FP + FrameToArgs) + Offset
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  SDValue FramePtr = DAG.getCopyFromReg(DAG.getEntryNode(), Loc,
+                                        TRI->getFrameRegister(Fn), MVT::i32);
+  SDValue ArgsDelta =
+      DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, Loc, MVT::i32);
+  SDValue NewSP = DAG.getNode(ISD::ADD, Loc, MVT::i32, FramePtr, ArgsDelta);
+  NewSP = DAG.getNode(ISD::ADD, Loc, MVT::i32, NewSP, SPAdjust);
+
+  // R0=ExceptionPointerRegister R1=ExceptionSelectorRegister
+  // which leaves 2 caller saved registers, R2 & R3 for us to use.
+  const unsigned SPReg = XCore::R2;
+  const unsigned TargetReg = XCore::R3;
+
+  // Both register copies start from the incoming chain and are then joined.
+  SDValue RegCopies[] = {
+    DAG.getCopyToReg(InChain, Loc, SPReg, NewSP),
+    DAG.getCopyToReg(InChain, Loc, TargetReg, Landing)
+  };
+  SDValue Joined = DAG.getNode(ISD::TokenFactor, Loc, MVT::Other, RegCopies);
+
+  return DAG.getNode(XCoreISD::EH_RETURN, Loc, MVT::Other, Joined,
+                     DAG.getRegister(SPReg, MVT::i32),
+                     DAG.getRegister(TargetReg, MVT::i32));
+}
+
+/// Lower ADJUST_TRAMPOLINE by returning its first operand unchanged.
+SDValue XCoreTargetLowering::
+LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue FirstOperand = Op.getOperand(0);
+  return FirstOperand;
+}
+
+/// Lower INIT_TRAMPOLINE by storing five 32-bit words at byte offsets 0, 4,
+/// 8, 12 and 16 from the trampoline address: three constant words followed
+/// by the 'nest' value and the nested function pointer. All five stores are
+/// chained on the incoming chain and joined with a TokenFactor, which is the
+/// returned value.
+SDValue XCoreTargetLowering::
+LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Trmp = Op.getOperand(1); // trampoline
+ SDValue FPtr = Op.getOperand(2); // nested function
+ SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+
+ const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+ // NOTE(review): the three constant words stored below presumably encode
+ // the instruction sequence listed here - confirm against the XCore
+ // instruction encoding before changing either.
+ // .align 4
+ // LDAPF_u10 r11, nest
+ // LDW_2rus r11, r11[0]
+ // STWSP_ru6 r11, sp[0]
+ // LDAPF_u10 r11, fptr
+ // LDW_2rus r11, r11[0]
+ // BAU_1r r11
+ // nest:
+ // .word nest
+ // fptr:
+ // .word fptr
+ SDValue OutChains[5];
+
+ SDValue Addr = Trmp;
+
+ SDLoc dl(Op);
+ // Word 0 (offset 0): first constant word.
+ OutChains[0] =
+ DAG.getStore(Chain, dl, DAG.getConstant(0x0a3cd805, dl, MVT::i32), Addr,
+ MachinePointerInfo(TrmpAddr));
+
+ // Word 1 (offset 4): second constant word.
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(4, dl, MVT::i32));
+ OutChains[1] =
+ DAG.getStore(Chain, dl, DAG.getConstant(0xd80456c0, dl, MVT::i32), Addr,
+ MachinePointerInfo(TrmpAddr, 4));
+
+ // Word 2 (offset 8): third constant word.
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(8, dl, MVT::i32));
+ OutChains[2] =
+ DAG.getStore(Chain, dl, DAG.getConstant(0x27fb0a3c, dl, MVT::i32), Addr,
+ MachinePointerInfo(TrmpAddr, 8));
+
+ // Word 3 (offset 12): the 'nest' parameter value.
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(12, dl, MVT::i32));
+ OutChains[3] =
+ DAG.getStore(Chain, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12));
+
+ // Word 4 (offset 16): the nested function pointer.
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(16, dl, MVT::i32));
+ OutChains[4] =
+ DAG.getStore(Chain, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 16));
+
+ // The stores are independent of each other; merge their chains.
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+/// Custom lowering for chainless intrinsics. Operand 0 holds the intrinsic
+/// ID. Only Intrinsic::xcore_crc8 is handled: it becomes an XCoreISD::CRC8
+/// node with two results of the intrinsic's value type, fed by operands 1-3.
+/// The merged value returned is {result 1, result 0} of that node.
+///
+/// \returns the merged results for xcore_crc8, or an empty SDValue for any
+///          other intrinsic.
+SDValue XCoreTargetLowering::
+LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  case Intrinsic::xcore_crc8: {
+    // The braces give the locals their own scope, so further case labels
+    // can be added after this one without jumping past an initialization.
+    EVT VT = Op.getValueType();
+    SDValue Data =
+        DAG.getNode(XCoreISD::CRC8, DL, DAG.getVTList(VT, VT),
+                    Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+    SDValue Crc(Data.getNode(), 1);
+    SDValue Results[] = { Crc, Data };
+    return DAG.getMergeValues(Results, DL);
+  }
+  default:
+    break;
+  }
+  return SDValue();
+}
+
+/// Lower an atomic fence to an XCoreISD::MEMBARRIER node on the same chain.
+SDValue XCoreTargetLowering::
+LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+  SDValue InChain = Op.getOperand(0);
+  return DAG.getNode(XCoreISD::MEMBARRIER, Loc, MVT::Other, InChain);
+}
+
+/// Lower an unordered/monotonic ISD::ATOMIC_LOAD to an ordinary load.
+/// i32 becomes a plain load and must be at least 4-byte aligned; i16 and i8
+/// become any-extending loads to i32, with i16 requiring at least 2-byte
+/// alignment. Insufficient alignment is a fatal error. Any other memory type
+/// yields an empty SDValue.
+SDValue XCoreTargetLowering::
+LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op);
+  assert(Atomic->getOpcode() == ISD::ATOMIC_LOAD && "Bad Atomic OP");
+  assert((Atomic->getOrdering() == AtomicOrdering::Unordered ||
+          Atomic->getOrdering() == AtomicOrdering::Monotonic) &&
+         "setInsertFencesForAtomic(true) expects unordered / monotonic");
+
+  const EVT MemVT = Atomic->getMemoryVT();
+
+  // Any-extending load of a narrower memory type into an i32 result.
+  auto AnyExtLoadOf = [&](EVT NarrowVT) {
+    return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32,
+                          Atomic->getChain(), Atomic->getBasePtr(),
+                          Atomic->getPointerInfo(), NarrowVT,
+                          Atomic->getAlignment(),
+                          Atomic->getMemOperand()->getFlags(),
+                          Atomic->getAAInfo());
+  };
+
+  if (MemVT == MVT::i32) {
+    if (Atomic->getAlignment() < 4)
+      report_fatal_error("atomic load must be aligned");
+    return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op),
+                       Atomic->getChain(), Atomic->getBasePtr(),
+                       Atomic->getPointerInfo(), Atomic->getAlignment(),
+                       Atomic->getMemOperand()->getFlags(),
+                       Atomic->getAAInfo(), Atomic->getRanges());
+  }
+
+  if (MemVT == MVT::i16) {
+    if (Atomic->getAlignment() < 2)
+      report_fatal_error("atomic load must be aligned");
+    return AnyExtLoadOf(MVT::i16);
+  }
+
+  // Byte loads carry no alignment requirement.
+  if (MemVT == MVT::i8)
+    return AnyExtLoadOf(MVT::i8);
+
+  return SDValue();
+}
+
+/// Lower an unordered/monotonic ISD::ATOMIC_STORE to an ordinary store.
+/// i32 becomes a plain store and must be at least 4-byte aligned; i16 and i8
+/// become truncating stores, with i16 requiring at least 2-byte alignment.
+/// Insufficient alignment is a fatal error. Any other memory type yields an
+/// empty SDValue.
+SDValue XCoreTargetLowering::
+LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op);
+  assert(Atomic->getOpcode() == ISD::ATOMIC_STORE && "Bad Atomic OP");
+  assert((Atomic->getOrdering() == AtomicOrdering::Unordered ||
+          Atomic->getOrdering() == AtomicOrdering::Monotonic) &&
+         "setInsertFencesForAtomic(true) expects unordered / monotonic");
+
+  const EVT MemVT = Atomic->getMemoryVT();
+
+  // Truncating store of the value to a narrower memory type.
+  auto TruncStoreAs = [&](EVT NarrowVT) {
+    return DAG.getTruncStore(Atomic->getChain(), SDLoc(Op), Atomic->getVal(),
+                             Atomic->getBasePtr(), Atomic->getPointerInfo(),
+                             NarrowVT, Atomic->getAlignment(),
+                             Atomic->getMemOperand()->getFlags(),
+                             Atomic->getAAInfo());
+  };
+
+  if (MemVT == MVT::i32) {
+    if (Atomic->getAlignment() < 4)
+      report_fatal_error("atomic store must be aligned");
+    return DAG.getStore(Atomic->getChain(), SDLoc(Op), Atomic->getVal(),
+                        Atomic->getBasePtr(), Atomic->getPointerInfo(),
+                        Atomic->getAlignment(),
+                        Atomic->getMemOperand()->getFlags(),
+                        Atomic->getAAInfo());
+  }
+
+  if (MemVT == MVT::i16) {
+    if (Atomic->getAlignment() < 2)
+      report_fatal_error("atomic store must be aligned");
+    return TruncStoreAs(MVT::i16);
+  }
+
+  // Byte stores carry no alignment requirement.
+  if (MemVT == MVT::i8)
+    return TruncStoreAs(MVT::i8);
+
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "XCoreGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// XCore call implementation. Clears the tail-call flag in \p CLI and
+/// forwards to LowerCCCCallTo for the C and Fast calling conventions; any
+/// other convention is unreachable.
+SDValue
+XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                               SmallVectorImpl<SDValue> &InVals) const {
+  // XCore target does not yet support tail call optimization.
+  CLI.IsTailCall = false;
+
+  // For now, only CallingConv::C implemented
+  const CallingConv::ID CC = CLI.CallConv;
+  if (CC != CallingConv::Fast && CC != CallingConv::C)
+    llvm_unreachable("Unsupported calling convention");
+
+  return LowerCCCCallTo(CLI.Chain, CLI.Callee, CC, CLI.IsVarArg,
+                        CLI.IsTailCall, CLI.Outs, CLI.OutVals, CLI.Ins, CLI.DL,
+                        CLI.DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers / memory
+/// locations. One entry is appended to InVals per location in RVLocs, in
+/// order; the returned value is the final chain.
+static SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                               const SmallVectorImpl<CCValAssign> &RVLocs,
+                               const SDLoc &dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) {
+  // (stack offset, index into InVals) for every result returned in memory.
+  SmallVector<std::pair<int, unsigned>, 4> PendingMemResults;
+
+  // Copy results out of physical registers.
+  for (const CCValAssign &Loc : RVLocs) {
+    if (!Loc.isRegLoc()) {
+      assert(Loc.isMemLoc());
+      PendingMemResults.push_back(std::make_pair(Loc.getLocMemOffset(),
+                                                 InVals.size()));
+      // Reserve space for this result.
+      InVals.push_back(SDValue());
+      continue;
+    }
+    // Value 0 is the copied result, value 1 the chain, value 2 the glue.
+    SDValue Copy = DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(),
+                                      Loc.getValVT(), InFlag);
+    Chain = Copy.getValue(1);
+    InFlag = Copy.getValue(2);
+    InVals.push_back(Copy.getValue(0));
+  }
+
+  // Copy results out of memory.
+  SmallVector<SDValue, 4> LoadChains;
+  for (const std::pair<int, unsigned> &Pending : PendingMemResults) {
+    const int ByteOffset = Pending.first;
+    const unsigned Slot = Pending.second;
+    SDVTList LoadTys = DAG.getVTList(MVT::i32, MVT::Other);
+    // The LDWSP operand is the byte offset divided by four.
+    SDValue LoadOps[] = { Chain,
+                          DAG.getConstant(ByteOffset / 4, dl, MVT::i32) };
+    SDValue Loaded = DAG.getNode(XCoreISD::LDWSP, dl, LoadTys, LoadOps);
+    InVals[Slot] = Loaded;
+    LoadChains.push_back(Loaded.getValue(1));
+  }
+
+  // Transform all loads nodes into one single node because
+  // all load nodes are independent of each other.
+  if (LoadChains.empty())
+    return Chain;
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual
+/// regs to (physical regs)/(stack frame), CALLSEQ_START and
+/// CALLSEQ_END are emitted.
+///
+/// Stack space is sized from the outgoing arguments (after reserving one
+/// 4-byte slot) plus any results returned in memory. Stack arguments are
+/// written with XCoreISD::STWSP, register arguments with glued CopyToReg
+/// nodes, and the call itself is an XCoreISD::BL node. The call's results
+/// are appended to InVals by LowerCallResult, whose chain is returned.
+/// TODO: isTailCall, sret.
+SDValue XCoreTargetLowering::LowerCCCCallTo(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ // The ABI dictates there should be one stack slot available to the callee
+ // on function entry (for saving lr).
+ CCInfo.AllocateStack(4, 4);
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_XCore);
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ // Analyze return values to determine the number of bytes of stack required.
+ CCState RetCCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ // Start the return-value allocation after the space already taken by the
+ // outgoing arguments.
+ RetCCInfo.AllocateStack(CCInfo.getNextStackOffset(), 4);
+ RetCCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = RetCCInfo.getNextStackOffset();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+ SmallVector<SDValue, 12> MemOpChains;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[i];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ // Arguments that can be passed on register must be kept at
+ // RegsToPass vector
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ int Offset = VA.getLocMemOffset();
+
+ // The STWSP operand is the byte offset divided by four.
+ MemOpChains.push_back(DAG.getNode(XCoreISD::STWSP, dl, MVT::Other,
+ Chain, Arg,
+ DAG.getConstant(Offset/4, dl,
+ MVT::i32)));
+ }
+ }
+
+ // Transform all store nodes into one single node because
+ // all store nodes are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token
+ // chain and flag operands which copy the outgoing args into registers.
+ // The InFlag is necessary since all emitted instructions must be
+ // stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32);
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+ // XCoreBranchLink = #chain, #target_address, #opt_in_flags...
+ // = Chain, Callee, Reg#1, Reg#2, ...
+ //
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // The glue operand is only present when at least one register was copied.
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(XCoreISD::BL, dl, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true),
+ DAG.getConstant(0, dl, PtrVT, true), InFlag, dl);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, RVLocs, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+// Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// Pairs an incoming argument's DAG value with the flags of that argument.
+  struct ArgDataPair {
+    SDValue SDV;            // DAG value of the argument.
+    ISD::ArgFlagsTy Flags;  // Flags of the argument.
+  };
+}
+
+/// XCore formal arguments implementation. Forwards to LowerCCCArguments for
+/// the C and Fast calling conventions; any other convention is unreachable.
+SDValue XCoreTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  const bool Supported =
+      CallConv == CallingConv::C || CallConv == CallingConv::Fast;
+  if (!Supported)
+    llvm_unreachable("Unsupported calling convention");
+
+  return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+}
+
+/// LowerCCCArguments - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments places on the stack.
+/// TODO: sret
+SDValue XCoreTargetLowering::LowerCCCArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
+
+ unsigned StackSlotSize = XCoreFrameLowering::stackSlotSize();
+
+ unsigned LRSaveSize = StackSlotSize;
+
+ if (!isVarArg)
+ XFI->setReturnStackOffset(CCInfo.getNextStackOffset() + LRSaveSize);
+
+ // All getCopyFromReg ops must precede any getMemcpys to prevent the
+ // scheduler clobbering a register before it has been copied.
+ // The stages are:
+ // 1. CopyFromReg (and load) arg & vararg registers.
+ // 2. Chain CopyFromReg nodes into a TokenFactor.
+ // 3. Memcpy 'byVal' args & push final InVals.
+ // 4. Chain mem ops nodes into a TokenFactor.
+ SmallVector<SDValue, 4> CFRegNode;
+ SmallVector<ArgDataPair, 4> ArgData;
+ SmallVector<SDValue, 4> MemOps;
+
+ // 1a. CopyFromReg (and load) arg registers.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+
+ CCValAssign &VA = ArgLocs[i];
+ SDValue ArgIn;
+
+ if (VA.isRegLoc()) {
+ // Arguments passed in registers
+ EVT RegVT = VA.getLocVT();
+ switch (RegVT.getSimpleVT().SimpleTy) {
+ default:
+ {
+#ifndef NDEBUG
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << RegVT.getEVTString() << "\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+ case MVT::i32:
+ unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ ArgIn = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+ CFRegNode.push_back(ArgIn.getValue(ArgIn->getNumValues() - 1));
+ }
+ } else {
+ // sanity check
+ assert(VA.isMemLoc());
+ // Load the argument to a virtual register
+ unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+ if (ObjSize > StackSlotSize) {
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << EVT(VA.getLocVT()).getEVTString()
+ << "\n";
+ }
+ // Create the frame index object for this incoming parameter...
+ int FI = MFI.CreateFixedObject(ObjSize,
+ LRSaveSize + VA.getLocMemOffset(),
+ true);
+
+ // Create the SelectionDAG nodes corresponding to a load
+ //from this parameter
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ ArgIn = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ }
+ const ArgDataPair ADP = { ArgIn, Ins[i].Flags };
+ ArgData.push_back(ADP);
+ }
+
+ // 1b. CopyFromReg vararg registers.
+ if (isVarArg) {
+ // Argument registers
+ static const MCPhysReg ArgRegs[] = {
+ XCore::R0, XCore::R1, XCore::R2, XCore::R3
+ };
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ unsigned FirstVAReg = CCInfo.getFirstUnallocated(ArgRegs);
+ if (FirstVAReg < array_lengthof(ArgRegs)) {
+ int offset = 0;
+ // Save remaining registers, storing higher register numbers at a higher
+ // address
+ for (int i = array_lengthof(ArgRegs) - 1; i >= (int)FirstVAReg; --i) {
+ // Create a stack slot
+ int FI = MFI.CreateFixedObject(4, offset, true);
+ if (i == (int)FirstVAReg) {
+ XFI->setVarArgsFrameIndex(FI);
+ }
+ offset -= StackSlotSize;
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ // Move argument from phys reg -> virt reg
+ unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+ RegInfo.addLiveIn(ArgRegs[i], VReg);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+ CFRegNode.push_back(Val.getValue(Val->getNumValues() - 1));
+ // Move argument from virt reg -> stack
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+ MemOps.push_back(Store);
+ }
+ } else {
+ // This will point to the next argument passed via stack.
+ XFI->setVarArgsFrameIndex(
+ MFI.CreateFixedObject(4, LRSaveSize + CCInfo.getNextStackOffset(),
+ true));
+ }
+ }
+
+ // 2. chain CopyFromReg nodes into a TokenFactor.
+ if (!CFRegNode.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, CFRegNode);
+
+ // 3. Memcpy 'byVal' args & push final InVals.
+ // Aggregates passed "byVal" need to be copied by the callee.
+ // The callee will use a pointer to this copy, rather than the original
+ // pointer.
+ for (SmallVectorImpl<ArgDataPair>::const_iterator ArgDI = ArgData.begin(),
+ ArgDE = ArgData.end();
+ ArgDI != ArgDE; ++ArgDI) {
+ if (ArgDI->Flags.isByVal() && ArgDI->Flags.getByValSize()) {
+ unsigned Size = ArgDI->Flags.getByValSize();
+ unsigned Align = std::max(StackSlotSize, ArgDI->Flags.getByValAlign());
+ // Create a new object on the stack and copy the pointee into it.
+ int FI = MFI.CreateStackObject(Size, Align, false);
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ InVals.push_back(FIN);
+ MemOps.push_back(DAG.getMemcpy(Chain, dl, FIN, ArgDI->SDV,
+ DAG.getConstant(Size, dl, MVT::i32),
+ Align, false, false, false,
+ MachinePointerInfo(),
+ MachinePointerInfo()));
+ } else {
+ InVals.push_back(ArgDI->SDV);
+ }
+ }
+
+ // 4. chain mem ops nodes into a TokenFactor.
+ if (!MemOps.empty()) {
+ MemOps.push_back(Chain);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool XCoreTargetLowering::
+CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const { // Returns true if Outs can be lowered as return values.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ if (!CCInfo.CheckReturn(Outs, RetCC_XCore)) // RetCC_XCore rejected Outs
+ return false;
+ if (CCInfo.getNextStackOffset() != 0 && isVarArg) // vararg + stack use: LowerReturn refuses to return values from a vararg function in memory
+ return false;
+ return true;
+}
+
+SDValue
+XCoreTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const { // Lowers a return to an XCoreISD::RETSP node: memory values are stored first, then register values are copied.
+
+ XCoreFunctionInfo *XFI =
+ DAG.getMachineFunction().getInfo<XCoreFunctionInfo>();
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+ // RVLocs receives one CCValAssign per return value, describing
+ // the location (register or memory) assigned to that value.
+ SmallVector<CCValAssign, 16> RVLocs;
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+
+ // Analyze return values; non-vararg functions first reserve XFI->getReturnStackOffset() of stack.
+ if (!isVarArg)
+ CCInfo.AllocateStack(XFI->getReturnStackOffset(), 4);
+
+ CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
+
+ SDValue Flag;
+ SmallVector<SDValue, 4> RetOps(1, Chain); // operand 0 is the chain; it is refreshed below once all copies exist
+
+ // Return on XCore is always a "retsp 0"
+ RetOps.push_back(DAG.getConstant(0, dl, MVT::i32));
+
+ SmallVector<SDValue, 4> MemOpChains;
+ // Handle return values that must be copied to memory.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ if (isVarArg) {
+ report_fatal_error("Can't return value from vararg function in memory");
+ }
+
+ int Offset = VA.getLocMemOffset();
+ unsigned ObjSize = VA.getLocVT().getSizeInBits() / 8; // size in bytes
+ // Create the frame index object for the memory location.
+ int FI = MFI.CreateFixedObject(ObjSize, Offset, false);
+
+ // Create a SelectionDAG node corresponding to a store
+ // to this memory location.
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ MemOpChains.push_back(DAG.getStore(
+ Chain, dl, OutVals[i], FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+ }
+
+ // Transform all store nodes into one single node because
+ // all stores are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ // Now handle return values copied to registers.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ if (!VA.isRegLoc())
+ continue;
+ // Copy the result values into the output registers.
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
+
+ // Feed each copy's Flag result (value 1) into the next copy so that
+ // all emitted copies stay stuck together.
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const { // Expands the SELECT_CC pseudo into explicit control flow and returns sinkMBB.
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+ assert((MI.getOpcode() == XCore::SELECT_CC) &&
+ "Unexpected instr type to insert");
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between; the branch emitted here is always XCore::BRFT_lru6.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // BRFT_lru6 cond, sinkMBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ BuildMI(BB, dl, TII.get(XCore::BRFT_lru6))
+ .addReg(MI.getOperand(1).getReg()) // operand 1: the condition register
+ .addMBB(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(*BB, BB->begin(), dl, TII.get(XCore::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(3).getReg()) // operand 3: value when arriving from copy0MBB
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(2).getReg()) // operand 2: value when arriving from thisMBB
+ .addMBB(thisMBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const { // XCore-specific DAG combines; returns an empty SDValue when no fold applies.
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::INTRINSIC_VOID:
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::xcore_outt:
+ case Intrinsic::xcore_outct:
+ case Intrinsic::xcore_chkct: {
+ SDValue OutVal = N->getOperand(3);
+ // These instructions ignore the high bits, so only the low 8 bits are demanded.
+ if (OutVal.hasOneUse()) {
+ unsigned BitWidth = OutVal.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) ||
+ TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne,
+ TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ break;
+ }
+ case Intrinsic::xcore_setpt: {
+ SDValue Time = N->getOperand(3);
+ // This instruction ignores the high bits, so only the low 16 bits are demanded.
+ if (Time.hasOneUse()) {
+ unsigned BitWidth = Time.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Time, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne,
+ TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ break;
+ }
+ }
+ break;
+ case XCoreISD::LADD: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ EVT VT = N0.getValueType();
+
+ // canonicalize constant to RHS
+ if (N0C && !N1C)
+ return DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N1, N0, N2);
+
+ // fold (ladd 0, 0, x) -> 0, x & 1
+ if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+ SDValue Carry = DAG.getConstant(0, dl, VT);
+ SDValue Result = DAG.getNode(ISD::AND, dl, VT, N2,
+ DAG.getConstant(1, dl, VT));
+ SDValue Ops[] = { Result, Carry }; // value 0 is the sum, value 1 is the carry
+ return DAG.getMergeValues(Ops, dl);
+ }
+
+ // fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the
+ // low bit set
+ if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+ APInt KnownZero, KnownOne;
+ APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+ VT.getSizeInBits() - 1); // every bit except bit 0
+ DAG.computeKnownBits(N2, KnownZero, KnownOne);
+ if ((KnownZero & Mask) == Mask) {
+ SDValue Carry = DAG.getConstant(0, dl, VT);
+ SDValue Result = DAG.getNode(ISD::ADD, dl, VT, N0, N2);
+ SDValue Ops[] = { Result, Carry };
+ return DAG.getMergeValues(Ops, dl);
+ }
+ }
+ }
+ break;
+ case XCoreISD::LSUB: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ EVT VT = N0.getValueType();
+
+ // fold (lsub 0, 0, x) -> x, -x iff x has only the low bit set
+ if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+ APInt KnownZero, KnownOne;
+ APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+ VT.getSizeInBits() - 1); // every bit except bit 0
+ DAG.computeKnownBits(N2, KnownZero, KnownOne);
+ if ((KnownZero & Mask) == Mask) {
+ SDValue Borrow = N2;
+ SDValue Result = DAG.getNode(ISD::SUB, dl, VT,
+ DAG.getConstant(0, dl, VT), N2);
+ SDValue Ops[] = { Result, Borrow }; // value 0 is the difference, value 1 is the borrow
+ return DAG.getMergeValues(Ops, dl);
+ }
+ }
+
+ // fold (lsub x, 0, y) -> 0, sub x, y iff borrow is unused and y has only the
+ // low bit set
+ if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+ APInt KnownZero, KnownOne;
+ APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+ VT.getSizeInBits() - 1);
+ DAG.computeKnownBits(N2, KnownZero, KnownOne);
+ if ((KnownZero & Mask) == Mask) {
+ SDValue Borrow = DAG.getConstant(0, dl, VT);
+ SDValue Result = DAG.getNode(ISD::SUB, dl, VT, N0, N2);
+ SDValue Ops[] = { Result, Borrow };
+ return DAG.getMergeValues(Ops, dl);
+ }
+ }
+ }
+ break;
+ case XCoreISD::LMUL: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDValue N3 = N->getOperand(3);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ EVT VT = N0.getValueType();
+ // Canonicalize multiplicative constant to RHS. If both multiplicative
+ // operands are constant canonicalize smallest to RHS.
+ if ((N0C && !N1C) ||
+ (N0C && N1C && N0C->getZExtValue() < N1C->getZExtValue()))
+ return DAG.getNode(XCoreISD::LMUL, dl, DAG.getVTList(VT, VT),
+ N1, N0, N2, N3);
+
+ // lmul(x, 0, a, b)
+ if (N1C && N1C->isNullValue()) {
+ // If the high result is unused fold to add(a, b)
+ if (N->hasNUsesOfValue(0, 0)) {
+ SDValue Lo = DAG.getNode(ISD::ADD, dl, VT, N2, N3);
+ SDValue Ops[] = { Lo, Lo }; // value 0 (high) has no uses, so Lo only fills the slot there
+ return DAG.getMergeValues(Ops, dl);
+ }
+ // Otherwise fold to ladd(a, b, 0)
+ SDValue Result =
+ DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N2, N3, N1);
+ SDValue Carry(Result.getNode(), 1);
+ SDValue Ops[] = { Carry, Result }; // lmul value 0 <- ladd carry, lmul value 1 <- ladd sum
+ return DAG.getMergeValues(Ops, dl);
+ }
+ }
+ break;
+ case ISD::ADD: {
+ // Fold 32 bit expressions such as add(add(mul(x,y),a),b) ->
+ // lmul(x, y, a, b). The high result of lmul will be ignored.
+ // This is only profitable if the intermediate results are unused
+ // elsewhere.
+ SDValue Mul0, Mul1, Addend0, Addend1;
+ if (N->getValueType(0) == MVT::i32 &&
+ isADDADDMUL(SDValue(N, 0), Mul0, Mul1, Addend0, Addend1, true)) {
+ SDValue Ignored = DAG.getNode(XCoreISD::LMUL, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), Mul0,
+ Mul1, Addend0, Addend1);
+ SDValue Result(Ignored.getNode(), 1); // value 1 is the low word; value 0 (high) is ignored
+ return Result;
+ }
+ APInt HighMask = APInt::getHighBitsSet(64, 32);
+ // Fold 64 bit expression such as add(add(mul(x,y),a),b) ->
+ // lmul(x, y, a, b) if all operands are zero-extended. We do this
+ // before type legalization as it is messy to match the operands after
+ // that.
+ if (N->getValueType(0) == MVT::i64 &&
+ isADDADDMUL(SDValue(N, 0), Mul0, Mul1, Addend0, Addend1, false) &&
+ DAG.MaskedValueIsZero(Mul0, HighMask) &&
+ DAG.MaskedValueIsZero(Mul1, HighMask) &&
+ DAG.MaskedValueIsZero(Addend0, HighMask) &&
+ DAG.MaskedValueIsZero(Addend1, HighMask)) {
+ SDValue Mul0L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Mul0, DAG.getConstant(0, dl, MVT::i32));
+ SDValue Mul1L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Mul1, DAG.getConstant(0, dl, MVT::i32));
+ SDValue Addend0L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Addend0, DAG.getConstant(0, dl, MVT::i32));
+ SDValue Addend1L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Addend1, DAG.getConstant(0, dl, MVT::i32));
+ SDValue Hi = DAG.getNode(XCoreISD::LMUL, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), Mul0L, Mul1L,
+ Addend0L, Addend1L);
+ SDValue Lo(Hi.getNode(), 1);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+ }
+ }
+ break;
+ case ISD::STORE: {
+ // Replace unaligned store of unaligned load with memmove.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (!DCI.isBeforeLegalize() ||
+ allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
+ ST->getAddressSpace(),
+ ST->getAlignment()) ||
+ ST->isVolatile() || ST->isIndexed()) {
+ break;
+ }
+ SDValue Chain = ST->getChain();
+
+ unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits();
+ assert((StoreBits % 8) == 0 &&
+ "Store size in bits must be a multiple of 8");
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
+ ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
+ unsigned Alignment = ST->getAlignment();
+ if (Alignment >= ABIAlignment) { // store already meets the ABI alignment: nothing to do
+ break;
+ }
+
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(ST->getValue())) {
+ if (LD->hasNUsesOfValue(1, 0) && ST->getMemoryVT() == LD->getMemoryVT() &&
+ LD->getAlignment() == Alignment &&
+ !LD->isVolatile() && !LD->isIndexed() &&
+ Chain.reachesChainWithoutSideEffects(SDValue(LD, 1))) {
+ bool isTail = isInTailCallPosition(DAG, ST, Chain);
+ return DAG.getMemmove(Chain, dl, ST->getBasePtr(),
+ LD->getBasePtr(),
+ DAG.getConstant(StoreBits/8, dl, MVT::i32),
+ Alignment, false, isTail, ST->getPointerInfo(),
+ LD->getPointerInfo());
+ }
+ }
+ break;
+ }
+ }
+ return SDValue();
+}
+
+void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const { // Reports known-zero high bits for XCore nodes and intrinsics; KnownOne is always left all-zero.
+ KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0); // start with nothing known
+ switch (Op.getOpcode()) {
+ default: break;
+ case XCoreISD::LADD:
+ case XCoreISD::LSUB:
+ if (Op.getResNo() == 1) {
+ // Top bits of carry / borrow are clear: only bit 0 may be set.
+ KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+ KnownZero.getBitWidth() - 1);
+ }
+ break;
+ case ISD::INTRINSIC_W_CHAIN:
+ {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ case Intrinsic::xcore_getts:
+ // High bits are known to be zero: only the low 16 bits may be set.
+ KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+ KnownZero.getBitWidth() - 16);
+ break;
+ case Intrinsic::xcore_int:
+ case Intrinsic::xcore_inct:
+ // High bits are known to be zero: only the low 8 bits may be set.
+ KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+ KnownZero.getBitWidth() - 8);
+ break;
+ case Intrinsic::xcore_testct:
+ // Result is either 0 or 1.
+ KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+ KnownZero.getBitWidth() - 1);
+ break;
+ case Intrinsic::xcore_testwct:
+ // Result is in the range 0 - 4, so only the low 3 bits may be set.
+ KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+ KnownZero.getBitWidth() - 3);
+ break;
+ }
+ }
+ break;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Addressing mode description hooks
+//===----------------------------------------------------------------------===//
+
+static inline bool isImmUs(int64_t val) // True if val lies in [0, 11].
+{
+ return (val >= 0 && val <= 11);
+}
+
+static inline bool isImmUs2(int64_t val) // True if val is even and val/2 satisfies isImmUs, i.e. an even value in [0, 22].
+{
+ return (val%2 == 0 && isImmUs(val/2));
+}
+
+static inline bool isImmUs4(int64_t val) // True if val is a multiple of 4 and val/4 satisfies isImmUs, i.e. a multiple of 4 in [0, 44].
+{
+ return (val%4 == 0 && isImmUs(val/4));
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool XCoreTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
+ if (Ty->getTypeID() == Type::VoidTyID) // void type: unscaled, with an offset passing both isImmUs and isImmUs4 (0, 4 or 8)
+ return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
+
+ unsigned Size = DL.getTypeAllocSize(Ty);
+ if (AM.BaseGV) { // global base: needs Size >= 4, no base register, no scale, offset a multiple of 4
+ return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
+ AM.BaseOffs%4 == 0;
+ }
+
+ switch (Size) {
+ case 1:
+ // reg + imm
+ if (AM.Scale == 0) {
+ return isImmUs(AM.BaseOffs);
+ }
+ // reg + reg
+ return AM.Scale == 1 && AM.BaseOffs == 0;
+ case 2:
+ case 3:
+ // reg + imm (imm must be even, see isImmUs2)
+ if (AM.Scale == 0) {
+ return isImmUs2(AM.BaseOffs);
+ }
+ // reg + reg<<1
+ return AM.Scale == 2 && AM.BaseOffs == 0;
+ default: // every other alloc size
+ // reg + imm (imm must be a multiple of 4, see isImmUs4)
+ if (AM.Scale == 0) {
+ return isImmUs4(AM.BaseOffs);
+ }
+ // reg + reg<<2
+ return AM.Scale == 4 && AM.BaseOffs == 0;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// XCore Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+std::pair<unsigned, const TargetRegisterClass *>
+XCoreTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const { // Maps the single-letter 'r' constraint to GRRegs; everything else goes to the base class.
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default : break;
+ case 'r': // register number 0U presumably means "no specific register" - confirm against TargetLowering
+ return std::make_pair(0U, &XCore::GRRegsRegClass);
+ }
+ }
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
new file mode 100644
index 000000000000..41813bbb8156
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
@@ -0,0 +1,234 @@
+//===-- XCoreISelLowering.h - XCore DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that XCore uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREISELLOWERING_H
+#define LLVM_LIB_TARGET_XCORE_XCOREISELLOWERING_H
+
+#include "XCore.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+ // Forward declarations
+ class XCoreSubtarget;
+ class XCoreTargetMachine;
+
+ namespace XCoreISD {
+ enum NodeType : unsigned {
+ // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ // Branch and link (call)
+ BL,
+
+ // pc relative address
+ PCRelativeWrapper,
+
+ // dp relative address
+ DPRelativeWrapper,
+
+ // cp relative address
+ CPRelativeWrapper,
+
+ // Load word from stack
+ LDWSP,
+
+ // Store word to stack
+ STWSP,
+
+ // Corresponds to retsp instruction
+ RETSP,
+
+ // Corresponds to LADD instruction
+ LADD,
+
+ // Corresponds to LSUB instruction
+ LSUB,
+
+ // Corresponds to LMUL instruction
+ LMUL,
+
+ // Corresponds to MACCU instruction
+ MACCU,
+
+ // Corresponds to MACCS instruction
+ MACCS,
+
+ // Corresponds to CRC8 instruction
+ CRC8,
+
+ // Jumptable branch.
+ BR_JT,
+
+ // Jumptable branch using long branches for each entry.
+ BR_JT32,
+
+ // Offset from frame pointer to the first (possible) on-stack argument
+ FRAME_TO_ARGS_OFFSET,
+
+ // Exception handler return. The stack is restored to the first argument,
+ // followed by a jump to the second argument.
+ EH_RETURN,
+
+ // Memory barrier.
+ MEMBARRIER
+ };
+ }
+
+ //===--------------------------------------------------------------------===//
+ // TargetLowering Implementation
+ //===--------------------------------------------------------------------===//
+ class XCoreTargetLowering : public TargetLowering
+ {
+ public:
+ explicit XCoreTargetLowering(const TargetMachine &TM,
+ const XCoreSubtarget &Subtarget);
+
+ using TargetLowering::isZExtFree; // keep the base-class overloads visible next to the override below
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+
+ unsigned getJumpTableEncoding() const override;
+ MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override {
+ return MVT::i32; // shift amounts are always i32, whatever the shifted type
+ }
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// ReplaceNodeResults - Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ /// DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return XCore::R0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return XCore::R1;
+ }
+
+ private:
+ const TargetMachine &TM;
+ const XCoreSubtarget &Subtarget;
+
+ // Lower Operand helpers
+ SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+ SDValue getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
+ SelectionDAG &DAG) const;
+ SDValue lowerLoadWordFromAlignedBasePlusOffset(const SDLoc &DL,
+ SDValue Chain, SDValue Base,
+ int64_t Offset,
+ SelectionDAG &DAG) const;
+
+ // Lower Operand specifics (one custom lowering routine per operation)
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+
+ // Inline asm support
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ // Expand specifics
+ SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
+ SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG) const;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ bool
+ CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
+ LLVMContext &Context) const override;
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return true; // unconditional: the answer does not depend on I
+ }
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td b/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td
new file mode 100644
index 000000000000..379cc39aa617
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td
@@ -0,0 +1,277 @@
+//===-- XCoreInstrFormats.td - XCore Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+class InstXCore<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<32> Inst; // encoding bits; the 2-byte formats below only assign bits 15-0
+
+ let Namespace = "XCore";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Size = sz; // formats in this file pass 0 (pseudo), 2 or 4
+ field bits<32> SoftFail = 0;
+}
+
+// XCore pseudo instruction format: no encoding bits are assigned.
+class PseudoInstXCore<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<0, outs, ins, asmstr, pattern> { // Size is 0
+ let isPseudo = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+
+class _F3R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<2, outs, ins, asmstr, pattern> { // 2-byte encoding
+ let Inst{15-11} = opc; // only the 5 opcode bits are assigned in this class
+ let DecoderMethod = "Decode3RInstruction";
+}
+
+// 3R with first operand as an immediate. Used for TSETR where the first
+// operand is treated as an immediate since it refers to a register number in
+// another thread.
+class _F3RImm<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : _F3R<opc, outs, ins, asmstr, pattern> { // same bits as _F3R
+ let DecoderMethod = "Decode3RImmInstruction"; // only the decoder hook differs
+}
+
+class _FL3R<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<4, outs, ins, asmstr, pattern> { // 4-byte encoding
+ let Inst{31-27} = opc{8-4}; // upper 5 of the 9 opcode bits
+ let Inst{26-20} = 0b1111110; // fixed pattern
+ let Inst{19-16} = opc{3-0}; // lower 4 opcode bits
+
+ let Inst{15-11} = 0b11111; // fixed pattern in the low half-word
+ let DecoderMethod = "DecodeL3RInstruction";
+}
+
+// L3R with first operand as both a source and a destination.
+class _FL3RSrcDst<bits<9> opc, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : _FL3R<opc, outs, ins, asmstr, pattern> { // same bits as _FL3R
+ let DecoderMethod = "DecodeL3RSrcDstInstruction"; // only the decoder hook differs
+}
+
+class _F2RUS<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<2, outs, ins, asmstr, pattern> { // 2-byte encoding
+ let Inst{15-11} = opc; // only the 5 opcode bits are assigned in this class
+ let DecoderMethod = "Decode2RUSInstruction";
+}
+
+// 2RUS with bitp operand
+class _F2RUSBitp<bits<5> opc, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : _F2RUS<opc, outs, ins, asmstr, pattern> { // same bits as _F2RUS
+ let DecoderMethod = "Decode2RUSBitpInstruction"; // only the decoder hook differs
+}
+
+class _FL2RUS<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<4, outs, ins, asmstr, pattern> { // 4-byte encoding
+ let Inst{31-27} = opc{8-4}; // upper 5 of the 9 opcode bits
+ let Inst{26-20} = 0b1111110; // fixed pattern
+ let Inst{19-16} = opc{3-0}; // lower 4 opcode bits
+
+ let Inst{15-11} = 0b11111; // fixed pattern in the low half-word
+ let DecoderMethod = "DecodeL2RUSInstruction";
+}
+
+// L2RUS with bitp operand
+class _FL2RUSBitp<bits<9> opc, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : _FL2RUS<opc, outs, ins, asmstr, pattern> { // same bits as _FL2RUS
+ let DecoderMethod = "DecodeL2RUSBitpInstruction"; // only the decoder hook differs
+}
+
+class _FRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<2, outs, ins, asmstr, pattern> { // 2-byte encoding
+ bits<4> a; // 4-bit operand field
+ bits<6> b; // 6-bit operand field
+
+ let Inst{15-10} = opc;
+ let Inst{9-6} = a;
+ let Inst{5-0} = b;
+}
+
+class _FLRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<4, outs, ins, asmstr, pattern> { // 4-byte encoding
+ bits<4> a; // 4-bit operand field
+ bits<16> b; // 16-bit operand field, split across both half-words below
+
+ let Inst{31-26} = opc;
+ let Inst{25-22} = a;
+ let Inst{21-16} = b{5-0}; // low 6 bits of b
+ let Inst{15-10} = 0b111100; // fixed pattern in the low half-word
+ let Inst{9-0} = b{15-6}; // high 10 bits of b
+}
+
+class _FU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<2, outs, ins, asmstr, pattern> { // 2-byte encoding
+ bits<6> a; // 6-bit operand field
+
+ let Inst{15-6} = opc; // 10-bit opcode
+ let Inst{5-0} = a;
+}
+
+class _FLU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<4, outs, ins, asmstr, pattern> { // 4-byte encoding
+ bits<16> a; // 16-bit operand field, split across both half-words below
+
+ let Inst{31-22} = opc; // 10-bit opcode
+ let Inst{21-16} = a{5-0}; // low 6 bits of a
+ let Inst{15-10} = 0b111100; // fixed pattern in the low half-word
+ let Inst{9-0} = a{15-6}; // high 10 bits of a
+}
+
+class _FU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<2, outs, ins, asmstr, pattern> { // 2-byte encoding
+ bits<10> a; // 10-bit operand field
+
+ let Inst{15-10} = opc;
+ let Inst{9-0} = a;
+}
+
+class _FLU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<4, outs, ins, asmstr, pattern> { // 4-byte encoding
+ bits<20> a; // 20-bit operand field, split across both half-words below
+
+ let Inst{31-26} = opc;
+ let Inst{25-16} = a{9-0}; // low 10 bits of a
+ let Inst{15-10} = 0b111100; // fixed pattern in the low half-word
+ let Inst{9-0} = a{19-10}; // high 10 bits of a
+}
+
+class _F2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<2, outs, ins, asmstr, pattern> { // 2-byte encoding
+ let Inst{15-11} = opc{5-1}; // upper 5 of the 6 opcode bits
+ let Inst{4} = opc{0}; // lowest opcode bit lives in bit 4
+ let DecoderMethod = "Decode2RInstruction";
+}
+
+// 2R with first operand as an immediate. Used for TSETMR where the first
+// operand is treated as an immediate since it refers to a register number in
+// another thread.
+class _F2RImm<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : _F2R<opc, outs, ins, asmstr, pattern> { // same bits as _F2R
+ let DecoderMethod = "Decode2RImmInstruction"; // only the decoder hook differs
+}
+
+// 2R with first operand as both a source and a destination.
+class _F2RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : _F2R<opc, outs, ins, asmstr, pattern> { // same bits as _F2R
+ let DecoderMethod = "Decode2RSrcDstInstruction"; // only the decoder hook differs
+}
+
+// Same as 2R with last two operands swapped
+class _FR2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : _F2R<opc, outs, ins, asmstr, pattern> { // same bits as _F2R
+ let DecoderMethod = "DecodeR2RInstruction"; // only the decoder hook differs
+}
+
+class _FRUS<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<2, outs, ins, asmstr, pattern> { // 2-byte encoding
+ let Inst{15-11} = opc{5-1}; // upper 5 of the 6 opcode bits
+ let Inst{4} = opc{0}; // lowest opcode bit lives in bit 4
+ let DecoderMethod = "DecodeRUSInstruction";
+}
+
+// RUS with bitp operand
+class _FRUSBitp<bits<6> opc, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : _FRUS<opc, outs, ins, asmstr, pattern> { // same bits as _FRUS
+ let DecoderMethod = "DecodeRUSBitpInstruction"; // only the decoder hook differs
+}
+
+// RUS with first operand as both a source and a destination and a bitp second
+// operand
+class _FRUSSrcDstBitp<bits<6> opc, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : _FRUS<opc, outs, ins, asmstr, pattern> { // same bits as _FRUS
+ let DecoderMethod = "DecodeRUSSrcDstBitpInstruction"; // only the decoder hook differs
+}
+
+class _FL2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<4, outs, ins, asmstr, pattern> { // 4-byte encoding
+ let Inst{31-27} = opc{9-5}; // upper 5 of the 10 opcode bits
+ let Inst{26-20} = 0b1111110; // fixed pattern
+ let Inst{19-16} = opc{4-1}; // middle 4 opcode bits
+
+ let Inst{15-11} = 0b11111; // fixed pattern in the low half-word
+ let Inst{4} = opc{0}; // lowest opcode bit lives in bit 4
+ let DecoderMethod = "DecodeL2RInstruction";
+}
+
+// Same as L2R with last two operands swapped
+class _FLR2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : _FL2R<opc, outs, ins, asmstr, pattern> { // same bits as _FL2R
+ let DecoderMethod = "DecodeLR2RInstruction"; // only the decoder hook differs
+}
+
+class _F1R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<2, outs, ins, asmstr, pattern> { // 2-byte encoding
+ bits<4> a; // 4-bit operand field
+
+ let Inst{15-11} = opc{5-1}; // upper 5 of the 6 opcode bits
+ let Inst{10-5} = 0b111111; // fixed pattern
+ let Inst{4} = opc{0}; // lowest opcode bit
+ let Inst{3-0} = a;
+}
+
+class _F0R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<2, outs, ins, asmstr, pattern> { // 2-byte encoding, no operand fields
+ let Inst{15-11} = opc{9-5}; // upper 5 of the 10 opcode bits
+ let Inst{10-5} = 0b111111; // fixed pattern
+ let Inst{4-0} = opc{4-0}; // lower 5 opcode bits
+}
+
+class _FL4R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<4, outs, ins, asmstr, pattern> { // 4-byte encoding; no DecoderMethod set here
+ bits<4> d; // 4-bit operand field
+
+ let Inst{31-27} = opc{5-1}; // upper 5 of the 6 opcode bits
+ let Inst{26-21} = 0b111111; // fixed pattern
+ let Inst{20} = opc{0}; // lowest opcode bit
+ let Inst{19-16} = d;
+ let Inst{15-11} = 0b11111; // fixed pattern in the low half-word
+}
+
+// L4R with 4th operand as both a source and a destination.
+class _FL4RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : _FL4R<opc, outs, ins, asmstr, pattern> { // same bits as _FL4R
+ let DecoderMethod = "DecodeL4RSrcDstInstruction"; // adds a decoder hook
+}
+
+// L4R with 1st and 4th operand as both a source and a destination.
+class _FL4RSrcDstSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : _FL4R<opc, outs, ins, asmstr, pattern> { // same bits as _FL4R
+ let DecoderMethod = "DecodeL4RSrcDstSrcDstInstruction"; // adds a decoder hook
+}
+
+class _FL5R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<4, outs, ins, asmstr, pattern> { // 4-byte encoding
+ let Inst{31-27} = opc{5-1}; // upper 5 of the 6 opcode bits
+ let Inst{20} = opc{0}; // lowest opcode bit
+ let Inst{15-11} = 0b11111; // fixed pattern in the low half-word
+
+ let DecoderMethod = "DecodeL5RInstruction";
+}
+
+class _FL6R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<4, outs, ins, asmstr, pattern> { // 4-byte encoding
+ let Inst{31-27} = opc; // whole 5-bit opcode
+ let Inst{15-11} = 0b11111; // fixed pattern in the low half-word
+
+ let DecoderMethod = "DecodeL6RInstruction";
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
new file mode 100644
index 000000000000..7a9c6fc93f8a
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -0,0 +1,451 @@
+//===-- XCoreInstrInfo.cpp - XCore Instruction Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreInstrInfo.h"
+#include "XCore.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "XCoreGenInstrInfo.inc"
+
+namespace llvm {
+namespace XCore {
+
+ // XCore condition codes, as produced by GetCondFromBranchOpc in this file.
+ enum CondCode {
+ COND_TRUE, // opcode accepted by IsBRT
+ COND_FALSE, // opcode accepted by IsBRF
+ COND_INVALID // any other opcode (not a conditional branch)
+ };
+}
+}
+
+// Pin the vtable to this file.
+void XCoreInstrInfo::anchor() {} // intentionally empty out-of-line definition
+
+XCoreInstrInfo::XCoreInstrInfo()
+ : XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP), // call-stack adjust opcodes passed to the generated base
+ RI() { // register info is default-constructed
+}
+
+static bool isZeroImm(const MachineOperand &op) { // true only for an immediate operand equal to 0
+ return op.isImm() && op.getImm() == 0; // isImm() guards the getImm() call
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ int Opcode = MI.getOpcode();
+ if (Opcode == XCore::LDWFI) // LDWFI is the only opcode recognised here
+ {
+ if ((MI.getOperand(1).isFI()) && // operand 1 is a frame index (stack slot)
+ (MI.getOperand(2).isImm()) && // operand 2 is an immediate (isZeroImm checks this again)
+ (isZeroImm(MI.getOperand(2)))) { // ... and that immediate is zero
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg(); // operand 0 is the destination register
+ }
+ }
+ return 0; // no match; FrameIndex is left unmodified
+}
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source reg along with the FrameIndex of the stored-to stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+unsigned XCoreInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ int Opcode = MI.getOpcode();
+ if (Opcode == XCore::STWFI) // STWFI is the only opcode recognised here
+ {
+ if ((MI.getOperand(1).isFI()) && // operand 1 is a frame index (stack slot)
+ (MI.getOperand(2).isImm()) && // operand 2 is an immediate (isZeroImm checks this again)
+ (isZeroImm(MI.getOperand(2)))) { // ... and that immediate is zero
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg(); // operand 0 is the register being stored
+ }
+ }
+ return 0; // no match; FrameIndex is left unmodified
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+static inline bool IsBRU(unsigned BrOpc) { // true for the four BR?U opcodes listed below
+ return BrOpc == XCore::BRFU_u6
+ || BrOpc == XCore::BRFU_lu6
+ || BrOpc == XCore::BRBU_u6
+ || BrOpc == XCore::BRBU_lu6;
+}
+
+static inline bool IsBRT(unsigned BrOpc) { // true for the four BR?T opcodes listed below
+ return BrOpc == XCore::BRFT_ru6
+ || BrOpc == XCore::BRFT_lru6
+ || BrOpc == XCore::BRBT_ru6
+ || BrOpc == XCore::BRBT_lru6;
+}
+
+static inline bool IsBRF(unsigned BrOpc) { // true for the four BR?F opcodes listed below
+ return BrOpc == XCore::BRFF_ru6
+ || BrOpc == XCore::BRFF_lru6
+ || BrOpc == XCore::BRBF_ru6
+ || BrOpc == XCore::BRBF_lru6;
+}
+
+static inline bool IsCondBranch(unsigned BrOpc) { // any opcode accepted by IsBRF or IsBRT
+ return IsBRF(BrOpc) || IsBRT(BrOpc);
+}
+
+static inline bool IsBR_JT(unsigned BrOpc) { // true for either branch-table opcode
+ return BrOpc == XCore::BR_JT
+ || BrOpc == XCore::BR_JT32;
+}
+
+/// GetCondFromBranchOpc - Return the XCore condition code that matches the
+/// given branch opcode, or COND_INVALID if it is not a conditional branch.
+static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc)
+{
+ if (IsBRT(BrOpc)) {
+ return XCore::COND_TRUE;
+ } else if (IsBRF(BrOpc)) {
+ return XCore::COND_FALSE;
+ } else {
+ return XCore::COND_INVALID; // unconditional branches and all non-branches land here
+ }
+}
+
+/// GetCondBranchFromCond - Return the branch instruction opcode that matches
+/// the cc. Only the BRF?_lru6 opcodes are ever returned.
+static inline unsigned GetCondBranchFromCond(XCore::CondCode CC)
+{
+ switch (CC) {
+ default: llvm_unreachable("Illegal condition code!"); // COND_INVALID is not accepted
+ case XCore::COND_TRUE : return XCore::BRFT_lru6;
+ case XCore::COND_FALSE : return XCore::BRFF_lru6;
+ }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified
+/// condition, i.e. COND_TRUE becomes COND_FALSE and vice versa.
+static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC)
+{
+ switch (CC) {
+ default: llvm_unreachable("Illegal condition code!"); // COND_INVALID has no inverse
+ case XCore::COND_TRUE : return XCore::COND_FALSE;
+ case XCore::COND_FALSE : return XCore::COND_TRUE;
+ }
+}
+
+/// analyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target). Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+/// just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+/// the destination block.
+/// 3. If this block ends with a conditional branch and it falls through to
+/// a successor block, it sets TBB to be the branch destination block and a
+/// list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+/// 4. If this block ends with a conditional branch and an unconditional
+/// branch, it returns the 'true' destination in TBB, the 'false' destination
+/// in FBB, and a list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+///
+/// Note that removeBranch and insertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool XCoreInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return false;
+
+ if (!isUnpredicatedTerminator(*I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = &*I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ if (IsBRU(LastInst->getOpcode())) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ XCore::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+ if (BranchCode == XCore::COND_INVALID)
+ return true; // Can't handle indirect branch.
+
+ // Conditional branch
+ // Block ends with fall-through condbranch.
+
+ TBB = LastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode)); // Cond[0]: condition code
+ Cond.push_back(LastInst->getOperand(0)); // Cond[1]: the branch's operand 0
+ return false;
+ }
+
+ // Get the instruction before it if it's a terminator.
+ MachineInstr *SecondLastInst = &*I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (I != MBB.begin() && isUnpredicatedTerminator(*--I)) // SecondLastInst is &*I, never null
+ return true;
+
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+ XCore::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc);
+
+ // If the block ends with conditional branch followed by unconditional,
+ // handle it.
+ if (BranchCode != XCore::COND_INVALID
+ && IsBRU(LastInst->getOpcode())) {
+
+ TBB = SecondLastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode)); // Cond[0]: condition code
+ Cond.push_back(SecondLastInst->getOperand(0)); // Cond[1]: the branch's operand 0
+
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed, so remove it.
+ if (IsBRU(SecondLastInst->getOpcode()) &&
+ IsBRU(LastInst->getOpcode())) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Likewise if it ends with a branch table followed by an unconditional branch.
+ if (IsBR_JT(SecondLastInst->getOpcode()) && IsBRU(LastInst->getOpcode())) {
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return true; // the trailing branch may be erased, but the block is still reported as not analyzable
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned XCoreInstrInfo::insertBranch(MachineBasicBlock &MBB, // returns the number of instructions appended (1 or 2)
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "Unexpected number of components!");
+ assert(!BytesAdded && "code size not handled"); // byte counting is unsupported; callers must pass null
+
+ if (!FBB) { // One way branch.
+ if (Cond.empty()) {
+ // Unconditional branch
+ BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(TBB);
+ } else {
+ // Conditional branch.
+ unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm()); // Cond[0] holds the condition code
+ BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg()) // Cond[1] holds the register operand
+ .addMBB(TBB);
+ }
+ return 1;
+ }
+
+ // Two-way Conditional branch.
+ assert(Cond.size() == 2 && "Unexpected number of components!");
+ unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+ BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg()) // conditional branch to TBB first ...
+ .addMBB(TBB);
+ BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(FBB); // ... then unconditional branch to FBB
+ return 2;
+}
+
+unsigned // number of branch instructions erased: 0, 1 or 2
+XCoreInstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const {
+ assert(!BytesRemoved && "code size not handled"); // byte counting is unsupported; callers must pass null
+
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return 0;
+
+ if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode()))
+ return 0; // last instruction is neither an unconditional nor a conditional branch
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end(); // NOTE(review): unlike the first lookup, this one does not skip debug instructions - confirm intended
+
+ if (I == MBB.begin()) return 1; // block is now empty
+ --I;
+ if (!IsCondBranch(I->getOpcode()))
+ return 1; // only a preceding conditional branch is removed as well
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // emits one instruction before I that copies SrcReg into DestReg
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ bool GRDest = XCore::GRRegsRegClass.contains(DestReg);
+ bool GRSrc = XCore::GRRegsRegClass.contains(SrcReg);
+
+ if (GRDest && GRSrc) { // GRRegs -> GRRegs: ADD_2rus with immediate 0
+ BuildMI(MBB, I, DL, get(XCore::ADD_2rus), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+ return;
+ }
+
+ if (GRDest && SrcReg == XCore::SP) { // SP -> GRRegs: LDAWSP_ru6 with immediate 0
+ BuildMI(MBB, I, DL, get(XCore::LDAWSP_ru6), DestReg).addImm(0);
+ return;
+ }
+
+ if (DestReg == XCore::SP && GRSrc) { // GRRegs -> SP: SETSP_1r
+ BuildMI(MBB, I, DL, get(XCore::SETSP_1r))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ llvm_unreachable("Impossible reg-to-reg copy"); // no other register combination is supported
+}
+
+void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // emits an STWFI before I
+ MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC, // not referenced in the body
+ const TargetRegisterInfo *TRI) const // not referenced in the body
+{
+ DebugLoc DL;
+ if (I != MBB.end() && !I->isDebugValue()) // borrow the debug location of the insertion point when it has one
+ DL = I->getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand( // store mem operand sized/aligned from the frame object
+ MachinePointerInfo::getFixedStack(*MF, FrameIndex),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
+ BuildMI(MBB, I, DL, get(XCore::STWFI))
+ .addReg(SrcReg, getKillRegState(isKill)) // operand 0: register to store
+ .addFrameIndex(FrameIndex) // operand 1: stack slot
+ .addImm(0) // operand 2: zero immediate, the shape isStoreToStackSlot matches
+ .addMemOperand(MMO);
+}
+
+void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, // emits an LDWFI before I
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC, // not referenced in the body
+ const TargetRegisterInfo *TRI) const // not referenced in the body
+{
+ DebugLoc DL;
+ if (I != MBB.end() && !I->isDebugValue()) // borrow the debug location of the insertion point when it has one
+ DL = I->getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand( // load mem operand sized/aligned from the frame object
+ MachinePointerInfo::getFixedStack(*MF, FrameIndex),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
+ BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg) // operand 0: destination register
+ .addFrameIndex(FrameIndex) // operand 1: stack slot
+ .addImm(0) // operand 2: zero immediate, the shape isLoadFromStackSlot matches
+ .addMemOperand(MMO);
+}
+
+bool XCoreInstrInfo:: // always returns false after flipping the condition in place
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert((Cond.size() == 2) &&
+ "Invalid XCore branch condition!");
+ Cond[0].setImm(GetOppositeBranchCondition((XCore::CondCode)Cond[0].getImm())); // Cond[0] carries the condition code; Cond[1] is untouched
+ return false;
+}
+
+static inline bool isImmU6(unsigned val) { // does val fit in 6 unsigned bits (0..63)?
+ return (val >> 6) == 0; // nothing set above bit 5
+}
+
+static inline bool isImmU16(unsigned val) { // does val fit in 16 unsigned bits (0..65535)?
+ return (val >> 16) == 0; // nothing set above bit 15
+}
+
+static bool isImmMskBitp(unsigned val) { // is val a mask whose width N is one of 1..8, 16, 24 or 32?
+ if (!isMask_32(val)) {
+ return false;
+ }
+ int N = Log2_32(val) + 1; // presumably the mask width in bits - depends on isMask_32/Log2_32 semantics
+ return (N >= 1 && N <= 8) || N == 16 || N == 24 || N == 32;
+}
+
+MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate( // inserts one instruction before MI that sets Reg to Value; returns it
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned Reg, uint64_t Value) const {
+ DebugLoc dl;
+ if (MI != MBB.end() && !MI->isDebugValue()) // borrow the debug location of the insertion point when it has one
+ dl = MI->getDebugLoc();
+ if (isImmMskBitp(Value)) { // NOTE(review): the helpers take `unsigned`, so the uint64_t Value is narrowed here
+ int N = Log2_32(Value) + 1;
+ return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg) // first choice: MKMSK with operand N
+ .addImm(N)
+ .getInstr();
+ }
+ if (isImmU16(Value)) { // second choice: LDC, short form when the value also passes isImmU6
+ int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
+ return BuildMI(MBB, MI, dl, get(Opcode), Reg).addImm(Value).getInstr();
+ }
+ MachineConstantPool *ConstantPool = MBB.getParent()->getConstantPool(); // fallback: load from the constant pool
+ const Constant *C = ConstantInt::get( // the constant is created with a 32-bit integer type
+ Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Value);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); // second argument 4: presumably the alignment - confirm
+ return BuildMI(MBB, MI, dl, get(XCore::LDWCP_lru6), Reg)
+ .addConstantPoolIndex(Idx)
+ .getInstr();
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
new file mode 100644
index 000000000000..a377784caf4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -0,0 +1,94 @@
+//===-- XCoreInstrInfo.h - XCore Instruction Information --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREINSTRINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCOREINSTRINFO_H
+
+#include "XCoreRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "XCoreGenInstrInfo.inc"
+
+namespace llvm {
+
+class XCoreInstrInfo : public XCoreGenInstrInfo {
+ const XCoreRegisterInfo RI; // returned by getRegisterInfo()
+ virtual void anchor();
+public:
+ XCoreInstrInfo();
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of register info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source reg along with the FrameIndex of the stored-to stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override; // defaults to no byte counting
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override; // defaults to no byte counting
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ // Emit code before MI to load immediate value into physical register Reg.
+ // Returns an iterator to the new instruction.
+ MachineBasicBlock::iterator loadImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned Reg, uint64_t Value) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
new file mode 100644
index 000000000000..f1d52d5a191f
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
@@ -0,0 +1,1312 @@
+//===-- XCoreInstrInfo.td - Target Description for XCore ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the XCore instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Uses of CP, DP are not currently reflected in the patterns, since
+// having a physical register as an operand prevents loop hoisting and
+// since the value of these registers never changes during the life of the
+// function.
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass.
+//===----------------------------------------------------------------------===//
+
+include "XCoreInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// XCore specific DAG Nodes.
+//
+
+ // Call: XCoreISD::BL is chained, variadic, takes optional input glue and produces output glue.
+ def SDT_XCoreBranchLink : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; // no results, one pointer-typed operand
+ def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
+ def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind, // reuses the SDTBrind type profile, which is defined outside this file
+ [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad, SDNPVariadic]>;
+
+ def SDT_XCoreEhRet : SDTypeProfile<0, 2, // no results, two operands
+ [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; // both operands share one pointer type
+ def XCoreEhRet : SDNode<"XCoreISD::EH_RETURN", SDT_XCoreEhRet,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+ def SDT_XCoreBR_JT : SDTypeProfile<0, 2, // no results, two operands; shared by BR_JT and BR_JT32
+ [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; // both operands are i32
+
+ def XCoreBR_JT : SDNode<"XCoreISD::BR_JT", SDT_XCoreBR_JT,
+ [SDNPHasChain]>;
+
+ def XCoreBR_JT32 : SDNode<"XCoreISD::BR_JT32", SDT_XCoreBR_JT,
+ [SDNPHasChain]>;
+
+ def SDT_XCoreAddress : SDTypeProfile<1, 1, // one result, one operand; shared by the three *relwrapper nodes
+ [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; // result and operand have the same pointer type
+
+ def pcrelwrapper : SDNode<"XCoreISD::PCRelativeWrapper", SDT_XCoreAddress,
+ []>;
+
+ def dprelwrapper : SDNode<"XCoreISD::DPRelativeWrapper", SDT_XCoreAddress,
+ []>;
+
+ def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
+ []>;
+
+ def frametoargsoffset : SDNode<"XCoreISD::FRAME_TO_ARGS_OFFSET", SDTIntLeaf, // SDTIntLeaf is defined outside this file
+ []>;
+
+ def SDT_XCoreStwsp : SDTypeProfile<0, 2, [SDTCisInt<1>]>; // no results, two operands; the second operand is an integer
+ def XCoreStwsp : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
+ [SDNPHasChain, SDNPMayStore]>;
+
+ def SDT_XCoreLdwsp : SDTypeProfile<1, 1, [SDTCisInt<1>]>; // one result, one operand; index 1 is the operand (index 0 is the result)
+ def XCoreLdwsp : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp,
+ [SDNPHasChain, SDNPMayLoad]>;
+
+ // These are target-independent nodes (ISD::CALLSEQ_START / ISD::CALLSEQ_END), but have target-specific formats.
+ def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; // single i32 operand
+ def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>; // two i32 operands
+
+ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_XCoreCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+ def SDT_XCoreMEMBARRIER : SDTypeProfile<0, 0, []>; // no results and no operands: the node only takes part in the chain
+
+ def XCoreMemBarrier : SDNode<"XCoreISD::MEMBARRIER", SDT_XCoreMEMBARRIER,
+ [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+ def div4_xform : SDNodeXForm<imm, [{
+ // Transformation function: imm/4 (asserts that imm is a multiple of 4)
+ assert(N->getZExtValue() % 4 == 0);
+ return getI32Imm(N->getZExtValue()/4, SDLoc(N));
+ }]>;
+
+ def msksize_xform : SDNodeXForm<imm, [{
+ // Transformation function: get the size (bit width) of a mask; asserts isMask_32 on the input
+ assert(isMask_32(N->getZExtValue()));
+ // 32 minus the leading-zero count of the low 32 bits, i.e. the position just above the highest set bit
+ return getI32Imm(32 - countLeadingZeros((uint32_t)N->getZExtValue()),
+ SDLoc(N));
+ }]>;
+
+ def neg_xform : SDNodeXForm<imm, [{
+ // Transformation function: -imm (negated as uint32_t, so the result wraps modulo 2^32)
+ uint32_t value = N->getZExtValue();
+ return getI32Imm(-value, SDLoc(N));
+ }]>;
+
+ def bpwsub_xform : SDNodeXForm<imm, [{
+ // Transformation function: 32-imm (computed in uint32_t after truncating imm to 32 bits)
+ uint32_t value = N->getZExtValue();
+ return getI32Imm(32 - value, SDLoc(N));
+ }]>;
+
+ def div4neg_xform : SDNodeXForm<imm, [{
+ // Transformation function: (-imm)/4 -- negation happens first, in uint32_t, then an unsigned divide; asserts (-imm) is a multiple of 4
+ uint32_t value = N->getZExtValue();
+ assert(-value % 4 == 0);
+ return getI32Imm(-value/4, SDLoc(N));
+ }]>;
+
+ def immUs4Neg : PatLeaf<(imm), [{ // matches imm whose 32-bit negation is a multiple of 4 in the range 0..44
+ uint32_t value = (uint32_t)N->getZExtValue();
+ return (-value)%4 == 0 && (-value)/4 <= 11;
+ }]>;
+
+ def immUs4 : PatLeaf<(imm), [{ // matches multiples of 4 in the range 0..44 (i.e. imm/4 is 0..11)
+ uint32_t value = (uint32_t)N->getZExtValue();
+ return value%4 == 0 && value/4 <= 11;
+ }]>;
+
+ def immUsNeg : PatLeaf<(imm), [{ // matches imm whose 32-bit negation is in the range 0..11
+ return -((uint32_t)N->getZExtValue()) <= 11;
+ }]>;
+
+ def immUs : PatLeaf<(imm), [{ // matches immediates whose low 32 bits are in the range 0..11
+ return (uint32_t)N->getZExtValue() <= 11;
+ }]>;
+
+ def immU6 : PatLeaf<(imm), [{ // matches unsigned 6-bit immediates, 0..63
+ return (uint32_t)N->getZExtValue() < (1 << 6);
+ }]>;
+
+ def immU16 : PatLeaf<(imm), [{ // matches unsigned 16-bit immediates, 0..65535
+ return (uint32_t)N->getZExtValue() < (1 << 16);
+ }]>;
+
+ def immMskBitp : PatLeaf<(imm), [{ return immMskBitp(N); }]>; // delegates to a C++ helper of the same name, defined outside this file
+
+ def immBitp : PatLeaf<(imm), [{ // matches 1..8, 16, 24 and 32
+ uint32_t value = (uint32_t)N->getZExtValue();
+ return (value >= 1 && value <= 8)
+ || value == 16
+ || value == 24
+ || value == 32;
+ }]>;
+
+ def immBpwSubBitp : PatLeaf<(imm), [{ // matches 24..31, 16, 8 and 0, i.e. every value v for which 32 - v is one of 1..8, 16, 24, 32
+ uint32_t value = (uint32_t)N->getZExtValue();
+ return (value >= 24 && value <= 31)
+ || value == 16
+ || value == 8
+ || value == 0;
+ }]>;
+
+ def lda16f : PatFrag<(ops node:$addr, node:$offset),
+ (add node:$addr, (shl node:$offset, 1))>; // addr + offset*2 (forward)
+ def lda16b : PatFrag<(ops node:$addr, node:$offset),
+ (sub node:$addr, (shl node:$offset, 1))>; // addr - offset*2 (backward)
+ def ldawf : PatFrag<(ops node:$addr, node:$offset),
+ (add node:$addr, (shl node:$offset, 2))>; // addr + offset*4 (forward)
+ def ldawb : PatFrag<(ops node:$addr, node:$offset),
+ (sub node:$addr, (shl node:$offset, 2))>; // addr - offset*4 (backward)
+
+ // Instruction operand types. The *_neg variants differ only in using the DecodeNegImmOperand decoder method.
+ def pcrel_imm : Operand<i32>;
+ def pcrel_imm_neg : Operand<i32> {
+ let DecoderMethod = "DecodeNegImmOperand";
+ }
+ def brtarget : Operand<OtherVT>;
+ def brtarget_neg : Operand<OtherVT> {
+ let DecoderMethod = "DecodeNegImmOperand";
+ }
+
+ // Addressing modes: an i32 address split into 2 operands by the C++ selector SelectADDRspii, tried on add and frameindex roots.
+ def ADDRspii : ComplexPattern<i32, 2, "SelectADDRspii", [add, frameindex], []>;
+
+ // Address operands: MEMii expands to two i32 immediate machine operands.
+ def MEMii : Operand<i32> {
+ let MIOperandInfo = (ops i32imm, i32imm);
+ }
+
+ // Jump tables. The two operands differ only in the printer method used to emit the table.
+ def InlineJT : Operand<i32> {
+ let PrintMethod = "printInlineJT";
+ }
+
+ def InlineJT32 : Operand<i32> {
+ let PrintMethod = "printInlineJT32";
+ }
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+
+ multiclass F3R_2RUS<bits<5> opc1, bits<5> opc2, string OpcStr, SDNode OpNode> { // defines a _3r (reg, reg) and a _2rus (reg, immUs immediate) form, both selecting OpNode
+ def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+ def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+ }
+
+ multiclass F3R_2RUS_np<bits<5> opc1, bits<5> opc2, string OpcStr> { // same two forms as F3R_2RUS but with empty pattern lists ("np" = no pattern)
+ def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"), []>;
+ def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"), []>;
+ }
+
+ multiclass F3R_2RBITP<bits<5> opc1, bits<5> opc2, string OpcStr, // like F3R_2RUS, but the immediate form uses the _F2RUSBitp format
+ SDNode OpNode> {
+ def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+ def _2rus : _F2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>; // immediate restricted by the immBitp predicate
+ }
+
+ class F3R<bits<5> opc, string OpcStr, SDNode OpNode> : // three-register short form: $dst = OpNode($b, $c)
+ _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+ class F3R_np<bits<5> opc, string OpcStr> : // three-register short form with an empty pattern list
+ _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"), []>;
+// Three operand long
+
+ /// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot (long-format counterpart of F3R_2RUS, with 9-bit opcodes).
+ multiclass FL3R_L2RUS<bits<9> opc1, bits<9> opc2, string OpcStr,
+ SDNode OpNode> {
+ def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+ def _l2rus : _FL2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+ }
+
+ /// FL3R_L2RBITP multiclass - Define a normal FL3R/FL2RUSBitp pattern in one shot.
+ multiclass FL3R_L2RBITP<bits<9> opc1, bits<9> opc2, string OpcStr,
+ SDNode OpNode> {
+ def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+ def _l2rus : _FL2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>; // immediate restricted by the immBitp predicate
+ }
+
+ class FL3R<bits<9> opc, string OpcStr, SDNode OpNode> : // three-register long form: $dst = OpNode($b, $c)
+ _FL3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+// Register - U6
+// Operand register - U6
+ multiclass FRU6_LRU6_branch<bits<6> opc, string OpcStr> { // register + brtarget in a short (_ru6) and a long (_lru6) encoding; no patterns
+ def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+ !strconcat(OpcStr, " $a, $b"), []>;
+ def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+ !strconcat(OpcStr, " $a, $b"), []>;
+ }
+
+ multiclass FRU6_LRU6_backwards_branch<bits<6> opc, string OpcStr> { // same as FRU6_LRU6_branch except the target operand is brtarget_neg
+ def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget_neg:$b),
+ !strconcat(OpcStr, " $a, $b"), []>;
+ def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget_neg:$b),
+ !strconcat(OpcStr, " $a, $b"), []>;
+ }
+
+
+ // U6: single-immediate instructions in a short (_u6) and a long (_lu6) encoding
+ multiclass FU6_LU6<bits<10> opc, string OpcStr, SDNode OpNode> {
+ def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+ [(OpNode immU6:$a)]>; // short form matches immU6 immediates
+ def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+ [(OpNode immU16:$a)]>; // long form matches immU16 immediates
+ }
+
+ multiclass FU6_LU6_int<bits<10> opc, string OpcStr, Intrinsic Int> { // same shape as FU6_LU6, but the pattern root is an Intrinsic instead of an SDNode
+ def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+ [(Int immU6:$a)]>;
+ def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+ [(Int immU16:$a)]>;
+ }
+
+ multiclass FU6_LU6_np<bits<10> opc, string OpcStr> { // same two encodings with empty pattern lists
+ def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>;
+ def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>;
+ }
+
+// Two operand short
+
+ class F2R_np<bits<6> opc, string OpcStr> : // two-register short form ($dst, $b) with an empty pattern list
+ _F2R<opc, (outs GRRegs:$dst), (ins GRRegs:$b),
+ !strconcat(OpcStr, " $dst, $b"), []>;
+
+// Two operand long
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+ let Defs = [SP], Uses = [SP] in { // both pseudos are modelled as reading and writing SP
+ def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
+ "# ADJCALLSTACKDOWN $amt",
+ [(callseq_start timm:$amt)]>;
+ def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "# ADJCALLSTACKUP $amt1", // only $amt1 is printed; $amt2 is matched but does not appear in the asm string
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+ }
+
+ let isReMaterializable = 1 in // no inputs: the value is produced purely from the frametoargsoffset leaf node
+ def FRAME_TO_ARGS_OFFSET : PseudoInstXCore<(outs GRRegs:$dst), (ins),
+ "# FRAME_TO_ARGS_OFFSET $dst",
+ [(set GRRegs:$dst, (frametoargsoffset))]>;
+
+ let isReturn = 1, isTerminator = 1, isBarrier = 1 in // selects XCoreEhRet; control does not fall through
+ def EH_RETURN : PseudoInstXCore<(outs), (ins GRRegs:$s, GRRegs:$handler),
+ "# EH_RETURN $s, $handler",
+ [(XCoreEhRet GRRegs:$s, GRRegs:$handler)]>;
+
+ def LDWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr), // load from an ADDRspii address
+ "# LDWFI $dst, $addr",
+ [(set GRRegs:$dst, (load ADDRspii:$addr))]>;
+
+ def LDAWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr), // materialize the ADDRspii address itself (no memory access in the pattern)
+ "# LDAWFI $dst, $addr",
+ [(set GRRegs:$dst, ADDRspii:$addr)]>;
+
+ def STWFI : PseudoInstXCore<(outs), (ins GRRegs:$src, MEMii:$addr), // store to an ADDRspii address
+ "# STWFI $src, $addr",
+ [(store GRRegs:$src, ADDRspii:$addr)]>;
+
+ // SELECT_CC - Used to implement the select DAG operation on GRRegs. Expanded after
+ // instruction selection into a branch sequence (via the custom inserter hook).
+ let usesCustomInserter = 1 in {
+ def SELECT_CC : PseudoInstXCore<(outs GRRegs:$dst),
+ (ins GRRegs:$cond, GRRegs:$T, GRRegs:$F),
+ "# SELECT_CC PSEUDO!",
+ [(set GRRegs:$dst,
+ (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>;
+ }
+
+ let hasSideEffects = 1 in // no operands at all, so the side-effect flag is the only modelled effect of this pseudo
+ def Int_MemBarrier : PseudoInstXCore<(outs), (ins), "#MEMBARRIER",
+ [(XCoreMemBarrier)]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+ // Three operand short
+ defm ADD : F3R_2RUS<0b00010, 0b10010, "add", add>;
+ defm SUB : F3R_2RUS<0b00011, 0b10011, "sub", sub>;
+ let hasSideEffects = 0 in { // EQ/LSS/LSU use the pattern-less (_np) templates, so the flag is spelled out here
+ defm EQ : F3R_2RUS_np<0b00110, 0b10110, "eq">;
+ def LSS_3r : F3R_np<0b11000, "lss">;
+ def LSU_3r : F3R_np<0b11001, "lsu">;
+ }
+ def AND_3r : F3R<0b00111, "and", and>;
+ def OR_3r : F3R<0b01000, "or", or>;
+
+ let mayLoad=1 in { // none of these loads carries a selection pattern -- NOTE(review): presumably selected from C++ code; confirm
+ def LDW_3r : _F3R<0b01001, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, GRRegs:$offset),
+ "ldw $dst, $addr[$offset]", []>;
+
+ def LDW_2rus : _F2RUS<0b00001, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, i32imm:$offset),
+ "ldw $dst, $addr[$offset]", []>;
+
+ def LD16S_3r : _F3R<0b10000, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, GRRegs:$offset),
+ "ld16s $dst, $addr[$offset]", []>;
+
+ def LD8U_3r : _F3R<0b10001, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, GRRegs:$offset),
+ "ld8u $dst, $addr[$offset]", []>;
+ }
+
+ let mayStore=1 in { // pattern-less word stores: register-offset form uses the long _FL3R format, immediate-offset form the short _F2RUS format
+ def STW_l3r : _FL3R<0b000001100, (outs),
+ (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+ "stw $val, $addr[$offset]", []>;
+
+ def STW_2rus : _F2RUS<0b00000, (outs),
+ (ins GRRegs:$val, GRRegs:$addr, i32imm:$offset),
+ "stw $val, $addr[$offset]", []>;
+ }
+
+ defm SHL : F3R_2RBITP<0b00100, 0b10100, "shl", shl>; // immediate form only matches immBitp shift amounts
+ defm SHR : F3R_2RBITP<0b00101, 0b10101, "shr", srl>; // logical right shift (srl)
+
+ // The first operand is treated as an immediate since it refers to a register
+ // number in another thread.
+ def TSETR_3r : _F3RImm<0b10111, (outs), (ins i32imm:$a, GRRegs:$b, GRRegs:$c),
+ "set t[$c]:r$a, $b", []>;
+
+ // Three operand long: address computations. F = forward (add), B = backward (subtract, printed as [-$offset]).
+ def LDAWF_l3r : _FL3R<0b000111100, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, GRRegs:$offset),
+ "ldaw $dst, $addr[$offset]",
+ [(set GRRegs:$dst,
+ (ldawf GRRegs:$addr, GRRegs:$offset))]>;
+
+ let hasSideEffects = 0 in // immediate-offset form has no pattern, so the flag is spelled out
+ def LDAWF_l2rus : _FL2RUS<0b100111100, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, i32imm:$offset),
+ "ldaw $dst, $addr[$offset]", []>;
+
+ def LDAWB_l3r : _FL3R<0b001001100, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, GRRegs:$offset),
+ "ldaw $dst, $addr[-$offset]",
+ [(set GRRegs:$dst,
+ (ldawb GRRegs:$addr, GRRegs:$offset))]>;
+
+ let hasSideEffects = 0 in // immediate-offset form has no pattern, so the flag is spelled out
+ def LDAWB_l2rus : _FL2RUS<0b101001100, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, i32imm:$offset),
+ "ldaw $dst, $addr[-$offset]", []>;
+
+ def LDA16F_l3r : _FL3R<0b001011100, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, GRRegs:$offset),
+ "lda16 $dst, $addr[$offset]",
+ [(set GRRegs:$dst,
+ (lda16f GRRegs:$addr, GRRegs:$offset))]>;
+
+ def LDA16B_l3r : _FL3R<0b001101100, (outs GRRegs:$dst),
+ (ins GRRegs:$addr, GRRegs:$offset),
+ "lda16 $dst, $addr[-$offset]",
+ [(set GRRegs:$dst,
+ (lda16b GRRegs:$addr, GRRegs:$offset))]>;
+
+ def MUL_l3r : FL3R<0b001111100, "mul", mul>;
+ // Instructions which may trap are marked as side effecting (applies to the four divide/remainder defs below).
+ let hasSideEffects = 1 in {
+ def DIVS_l3r : FL3R<0b010001100, "divs", sdiv>;
+ def DIVU_l3r : FL3R<0b010011100, "divu", udiv>;
+ def REMS_l3r : FL3R<0b110001100, "rems", srem>;
+ def REMU_l3r : FL3R<0b110011100, "remu", urem>;
+ }
+ def XOR_l3r : FL3R<0b000011100, "xor", xor>;
+ defm ASHR : FL3R_L2RBITP<0b000101100, 0b100101100, "ashr", sra>; // arithmetic right shift; immediate form matches immBitp only
+
+ let Constraints = "$src1 = $dst" in // $src1 is tied to $dst, which is why the asm string prints only $dst, $src2, $src3
+ def CRC_l3r : _FL3RSrcDst<0b101011100, (outs GRRegs:$dst),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+ "crc32 $dst, $src2, $src3",
+ [(set GRRegs:$dst,
+ (int_xcore_crc32 GRRegs:$src1, GRRegs:$src2,
+ GRRegs:$src3))]>;
+
+ let mayStore=1 in { // pattern-less st16 / st8 stores with a register offset
+ def ST16_l3r : _FL3R<0b100001100, (outs),
+ (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+ "st16 $val, $addr[$offset]", []>;
+
+ def ST8_l3r : _FL3R<0b100011100, (outs),
+ (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+ "st8 $val, $addr[$offset]", []>;
+ }
+
+ def INPW_l2rus : _FL2RUSBitp<0b100101110, (outs GRRegs:$a), // pattern-less; $b is printed as a resource, res[$b]
+ (ins GRRegs:$b, i32imm:$c), "inpw $a, res[$b], $c",
+ []>;
+
+ def OUTPW_l2rus : _FL2RUSBitp<0b100101101, (outs), // pattern-less; note the asm prints res[$b] before $a
+ (ins GRRegs:$a, GRRegs:$b, i32imm:$c),
+ "outpw res[$b], $a, $c", []>;
+
+ // Four operand long
+ let Constraints = "$e = $a,$f = $b" in { // both outputs are tied to inputs: $a/$b are read ($e/$f) and written
+ def MACCU_l4r : _FL4RSrcDstSrcDst<
+ 0b000001, (outs GRRegs:$a, GRRegs:$b),
+ (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccu $a, $b, $c, $d", []>;
+
+ def MACCS_l4r : _FL4RSrcDstSrcDst<
+ 0b000010, (outs GRRegs:$a, GRRegs:$b),
+ (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccs $a, $b, $c, $d", []>;
+ }
+
+ let Constraints = "$e = $b" in // only $b is tied to an input; the asm prints $b first, then $a
+ def CRC8_l4r : _FL4RSrcDst<0b000000, (outs GRRegs:$a, GRRegs:$b),
+ (ins GRRegs:$e, GRRegs:$c, GRRegs:$d),
+ "crc8 $b, $a, $c, $d", []>;
+
+ // Five operand long. Note that the printed operand order differs from the (outs)/(ins) order in each def.
+
+ def LADD_l5r : _FL5R<0b000001, (outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+ "ladd $dst2, $dst1, $src1, $src2, $src3", // $dst2 is printed before $dst1
+ []>;
+
+ def LSUB_l5r : _FL5R<0b000010, (outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+ "lsub $dst2, $dst1, $src1, $src2, $src3", []>; // $dst2 is printed before $dst1
+
+ def LDIVU_l5r : _FL5R<0b000000, (outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+ "ldivu $dst1, $dst2, $src3, $src1, $src2", []>; // $src3 is printed before $src1 and $src2
+
+// Six operand long
+
+def LMUL_l6r : _FL6R<
+ 0b00000, (outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3, GRRegs:$src4),
+ "lmul $dst1, $dst2, $src1, $src2, $src3, $src4", []>;
+
+ // Register - U6. In this group each brace-less `let ... in` applies only to the single def that follows it.
+
+ //let Uses = [DP] in ...
+ let hasSideEffects = 0, isReMaterializable = 1 in
+ def LDAWDP_ru6: _FRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
+ "ldaw $a, dp[$b]", []>;
+
+ let isReMaterializable = 1 in
+ def LDAWDP_lru6: _FLRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
+ "ldaw $a, dp[$b]",
+ [(set RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
+
+ let mayLoad=1 in // covers LDWDP_ru6 only; NOTE(review): LDWDP_lru6 presumably gets mayLoad from its load pattern -- confirm
+ def LDWDP_ru6: _FRU6<0b010110, (outs RRegs:$a), (ins i32imm:$b),
+ "ldw $a, dp[$b]", []>;
+
+ def LDWDP_lru6: _FLRU6<0b010110, (outs RRegs:$a), (ins i32imm:$b),
+ "ldw $a, dp[$b]",
+ [(set RRegs:$a, (load (dprelwrapper tglobaladdr:$b)))]>;
+
+ let mayStore=1 in // covers STWDP_ru6 only; NOTE(review): STWDP_lru6 presumably gets mayStore from its store pattern -- confirm
+ def STWDP_ru6 : _FRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
+ "stw $a, dp[$b]", []>;
+
+ def STWDP_lru6 : _FLRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
+ "stw $a, dp[$b]",
+ [(store RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
+
+ //let Uses = [CP] in ..
+ let mayLoad = 1, isReMaterializable = 1, hasSideEffects = 0 in { // cp-relative loads are marked rematerializable
+ def LDWCP_ru6 : _FRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
+ "ldw $a, cp[$b]", []>;
+ def LDWCP_lru6: _FLRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
+ "ldw $a, cp[$b]",
+ [(set RRegs:$a, (load (cprelwrapper tglobaladdr:$b)))]>;
+ }
+
+ let Uses = [SP] in { // every sp-relative instruction in this block reads SP
+ let mayStore=1 in {
+ def STWSP_ru6 : _FRU6<0b010101, (outs), (ins RRegs:$a, i32imm:$b),
+ "stw $a, sp[$b]",
+ [(XCoreStwsp RRegs:$a, immU6:$b)]>; // short encoding: immU6 offsets
+
+ def STWSP_lru6 : _FLRU6<0b010101, (outs), (ins RRegs:$a, i32imm:$b),
+ "stw $a, sp[$b]",
+ [(XCoreStwsp RRegs:$a, immU16:$b)]>; // long encoding: immU16 offsets
+ }
+
+ let mayLoad=1 in {
+ def LDWSP_ru6 : _FRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
+ "ldw $a, sp[$b]",
+ [(set RRegs:$a, (XCoreLdwsp immU6:$b))]>;
+
+ def LDWSP_lru6 : _FLRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
+ "ldw $a, sp[$b]",
+ [(set RRegs:$a, (XCoreLdwsp immU16:$b))]>;
+ }
+
+ let hasSideEffects = 0 in { // address-of forms: no patterns and no memory flags
+ def LDAWSP_ru6 : _FRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b),
+ "ldaw $a, sp[$b]", []>;
+
+ def LDAWSP_lru6 : _FLRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b),
+ "ldaw $a, sp[$b]", []>;
+ }
+ }
+
+ let isReMaterializable = 1 in { // constant loads: short form for immU6, long form for immU16
+ def LDC_ru6 : _FRU6<0b011010, (outs RRegs:$a), (ins i32imm:$b),
+ "ldc $a, $b", [(set RRegs:$a, immU6:$b)]>;
+
+ def LDC_lru6 : _FLRU6<0b011010, (outs RRegs:$a), (ins i32imm:$b),
+ "ldc $a, $b", [(set RRegs:$a, immU16:$b)]>;
+ }
+
+ def SETC_ru6 : _FRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b), // int_xcore_setc with an immU6 operand
+ "setc res[$a], $b",
+ [(int_xcore_setc GRRegs:$a, immU6:$b)]>;
+
+ def SETC_lru6 : _FLRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b), // int_xcore_setc with an immU16 operand
+ "setc res[$a], $b",
+ [(int_xcore_setc GRRegs:$a, immU16:$b)]>;
+
+ // Operand register - U6: conditional branches. BRF* use the forward template, BRB* the backwards one; each pair shares a mnemonic.
+ let isBranch = 1, isTerminator = 1 in {
+ defm BRFT: FRU6_LRU6_branch<0b011100, "bt">;
+ defm BRBT: FRU6_LRU6_backwards_branch<0b011101, "bt">;
+ defm BRFF: FRU6_LRU6_branch<0b011110, "bf">;
+ defm BRBF: FRU6_LRU6_backwards_branch<0b011111, "bf">;
+ }
+
+ // U6: instructions in this block are modelled as both reading and writing SP
+ let Defs = [SP], Uses = [SP] in {
+ let hasSideEffects = 0 in
+ defm EXTSP : FU6_LU6_np<0b0111011110, "extsp">;
+
+ let mayStore = 1 in
+ defm ENTSP : FU6_LU6_np<0b0111011101, "entsp">;
+
+ let isReturn = 1, isTerminator = 1, mayLoad = 1, isBarrier = 1 in {
+ defm RETSP : FU6_LU6<0b0111011111, "retsp", XCoreRetsp>; // the only def in this block with a selection pattern
+ }
+ }
+
+ let hasSideEffects = 0 in // pattern-less
+ defm EXTDP : FU6_LU6_np<0b0111001110, "extdp">;
+
+ let Uses = [R11], isCall=1 in // a call that reads R11; unlike the other call defs in this file it lists no Defs
+ defm BLAT : FU6_LU6_np<0b0111001101, "blat">;
+
+ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { // unconditional branches: BRB* take brtarget_neg, BRF* take brtarget
+ def BRBU_u6 : _FU6<0b0111011100, (outs), (ins brtarget_neg:$a), "bu $a", []>;
+
+ def BRBU_lu6 : _FLU6<0b0111011100, (outs), (ins brtarget_neg:$a), "bu $a", []>;
+
+ def BRFU_u6 : _FU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
+
+ def BRFU_lu6 : _FLU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
+ }
+
+ //let Uses = [CP] in ...
+ let Defs = [R11], hasSideEffects = 0, isReMaterializable = 1 in // result goes to the implicit register R11, so (outs) is empty
+ def LDAWCP_u6: _FU6<0b0111111101, (outs), (ins i32imm:$a), "ldaw r11, cp[$a]",
+ []>;
+
+ let Defs = [R11], isReMaterializable = 1 in
+ def LDAWCP_lu6: _FLU6<0b0111111101, (outs), (ins i32imm:$a), "ldaw r11, cp[$a]",
+ [(set R11, (cprelwrapper tglobaladdr:$a))]>;
+
+ let Defs = [R11] in // the mnemonic string already contains "r11,"; the template appends " $a"
+ defm GETSR : FU6_LU6_np<0b0111111100, "getsr r11,">;
+
+ defm SETSR : FU6_LU6_int<0b0111101101, "setsr", int_xcore_setsr>;
+
+ defm CLRSR : FU6_LU6_int<0b0111101100, "clrsr", int_xcore_clrsr>;
+
+ // setsr may cause a branch if it is used to enable events. clrsr may
+ // branch if it is executed while events are enabled. The *_branch defs reuse the SETSR/CLRSR opcodes and are codegen-only.
+ let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
+ isCodeGenOnly = 1 in {
+ defm SETSR_branch : FU6_LU6_np<0b0111101101, "setsr">;
+ defm CLRSR_branch : FU6_LU6_np<0b0111101100, "clrsr">;
+ }
+
+defm KCALL : FU6_LU6_np<0b0111001111, "kcall">;
+
+let Uses = [SP], Defs = [SP], mayStore = 1 in
+defm KENTSP : FU6_LU6_np<0b0111101110, "kentsp">;
+
+let Uses = [SP], Defs = [SP], mayLoad = 1 in
+defm KRESTSP : FU6_LU6_np<0b0111101111, "krestsp">;
+
+ // U10: pc-relative address loads into the implicit register R11 (F = pcrel_imm operand, B = pcrel_imm_neg operand)
+
+ let Defs = [R11], isReMaterializable = 1 in {
+ let hasSideEffects = 0 in
+ def LDAPF_u10 : _FU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a", []>;
+
+ def LDAPF_lu10 : _FLU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a",
+ [(set R11, (pcrelwrapper tglobaladdr:$a))]>;
+
+ let hasSideEffects = 0 in
+ def LDAPB_u10 : _FU10<0b110111, (outs), (ins pcrel_imm_neg:$a), "ldap r11, $a",
+ []>;
+
+ let hasSideEffects = 0 in
+ def LDAPB_lu10 : _FLU10<0b110111, (outs), (ins pcrel_imm_neg:$a),
+ "ldap r11, $a",
+ [(set R11, (pcrelwrapper tglobaladdr:$a))]>; // NOTE(review): same pattern as LDAPF_lu10 -- confirm which one instruction selection is meant to pick
+
+ let isCodeGenOnly = 1 in // same encoding as LDAPF_lu10, but matches a block address instead of a global address
+ def LDAPF_lu10_ba : _FLU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a",
+ [(set R11, (pcrelwrapper tblockaddress:$a))]>;
+ }
+
+ let isCall=1,
+ // All calls clobber the link register and the non-callee-saved registers (R0-R3 and R11, per the Defs list):
+ Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
+ def BLACP_u10 : _FU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
+
+ def BLACP_lu10 : _FLU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
+
+ def BLRF_u10 : _FU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
+ []>;
+
+ def BLRF_lu10 : _FLU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
+ [(XCoreBranchLink tglobaladdr:$a)]>; // the only def in this block with a selection pattern
+
+ def BLRB_u10 : _FU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
+
+ def BLRB_lu10 : _FLU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
+ }
+
+let Defs = [R11], mayLoad = 1, isReMaterializable = 1,
+ hasSideEffects = 0 in {
+def LDWCP_u10 : _FU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]", []>;
+
+def LDWCP_lu10 : _FLU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]",
+ []>;
+}
+
+// Two operand short
+def NOT : _F2R<0b100010, (outs GRRegs:$dst), (ins GRRegs:$b),
+ "not $dst, $b", [(set GRRegs:$dst, (not GRRegs:$b))]>;
+
+def NEG : _F2R<0b100100, (outs GRRegs:$dst), (ins GRRegs:$b),
+ "neg $dst, $b", [(set GRRegs:$dst, (ineg GRRegs:$b))]>;
+
+let Constraints = "$src1 = $dst" in {
+def SEXT_rus :
+ _FRUSSrcDstBitp<0b001101, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+ "sext $dst, $src2",
+ [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1,
+ immBitp:$src2))]>;
+
+def SEXT_2r :
+ _F2RSrcDst<0b001100, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+ "sext $dst, $src2",
+ [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1, GRRegs:$src2))]>;
+
+def ZEXT_rus :
+ _FRUSSrcDstBitp<0b010001, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+ "zext $dst, $src2",
+ [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1,
+ immBitp:$src2))]>;
+
+def ZEXT_2r :
+ _F2RSrcDst<0b010000, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+ "zext $dst, $src2",
+ [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1, GRRegs:$src2))]>;
+
+def ANDNOT_2r :
+ _F2RSrcDst<0b001010, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+ "andnot $dst, $src2",
+ [(set GRRegs:$dst, (and GRRegs:$src1, (not GRRegs:$src2)))]>;
+}
+
+ let isReMaterializable = 1, hasSideEffects = 0 in // brace-less let: applies to MKMSK_rus only
+ def MKMSK_rus : _FRUSBitp<0b101001, (outs GRRegs:$dst), (ins i32imm:$size),
+ "mkmsk $dst, $size", []>;
+
+ def MKMSK_2r : _F2R<0b101000, (outs GRRegs:$dst), (ins GRRegs:$size),
+ "mkmsk $dst, $size",
+ [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), -1))]>; // matches (1 << size) - 1
+
+def GETR_rus : _FRUS<0b100000, (outs GRRegs:$dst), (ins i32imm:$type),
+ "getr $dst, $type",
+ [(set GRRegs:$dst, (int_xcore_getr immUs:$type))]>;
+
+def GETTS_2r : _F2R<0b001110, (outs GRRegs:$dst), (ins GRRegs:$r),
+ "getts $dst, res[$r]",
+ [(set GRRegs:$dst, (int_xcore_getts GRRegs:$r))]>;
+
+def SETPT_2r : _FR2R<0b001111, (outs), (ins GRRegs:$r, GRRegs:$val),
+ "setpt res[$r], $val",
+ [(int_xcore_setpt GRRegs:$r, GRRegs:$val)]>;
+
+def OUTCT_2r : _F2R<0b010010, (outs), (ins GRRegs:$r, GRRegs:$val),
+ "outct res[$r], $val",
+ [(int_xcore_outct GRRegs:$r, GRRegs:$val)]>;
+
+def OUTCT_rus : _FRUS<0b010011, (outs), (ins GRRegs:$r, i32imm:$val),
+ "outct res[$r], $val",
+ [(int_xcore_outct GRRegs:$r, immUs:$val)]>;
+
+def OUTT_2r : _FR2R<0b000011, (outs), (ins GRRegs:$r, GRRegs:$val),
+ "outt res[$r], $val",
+ [(int_xcore_outt GRRegs:$r, GRRegs:$val)]>;
+
+def OUT_2r : _FR2R<0b101010, (outs), (ins GRRegs:$r, GRRegs:$val),
+ "out res[$r], $val",
+ [(int_xcore_out GRRegs:$r, GRRegs:$val)]>;
+
+let Constraints = "$src = $dst" in
+def OUTSHR_2r :
+ _F2RSrcDst<0b101011, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r),
+ "outshr res[$r], $src",
+ [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r, GRRegs:$src))]>;
+
+def INCT_2r : _F2R<0b100001, (outs GRRegs:$dst), (ins GRRegs:$r),
+ "inct $dst, res[$r]",
+ [(set GRRegs:$dst, (int_xcore_inct GRRegs:$r))]>;
+
+def INT_2r : _F2R<0b100011, (outs GRRegs:$dst), (ins GRRegs:$r),
+ "int $dst, res[$r]",
+ [(set GRRegs:$dst, (int_xcore_int GRRegs:$r))]>;
+
+def IN_2r : _F2R<0b101100, (outs GRRegs:$dst), (ins GRRegs:$r),
+ "in $dst, res[$r]",
+ [(set GRRegs:$dst, (int_xcore_in GRRegs:$r))]>;
+
+let Constraints = "$src = $dst" in
+def INSHR_2r :
+ _F2RSrcDst<0b101101, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r),
+ "inshr $dst, res[$r]",
+ [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r, GRRegs:$src))]>;
+
+ def CHKCT_2r : _F2R<0b110010, (outs), (ins GRRegs:$r, GRRegs:$val),
+ "chkct res[$r], $val",
+ [(int_xcore_chkct GRRegs:$r, GRRegs:$val)]>;
+
+ def CHKCT_rus : _FRUSBitp<0b110011, (outs), (ins GRRegs:$r, i32imm:$val), // NOTE(review): Bitp instruction format, but the pattern below uses immUs rather than immBitp (OUTCT_rus uses _FRUS with immUs) -- confirm this is intended
+ "chkct res[$r], $val",
+ [(int_xcore_chkct GRRegs:$r, immUs:$val)]>;
+
+def TESTCT_2r : _F2R<0b101111, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "testct $dst, res[$src]",
+ [(set GRRegs:$dst, (int_xcore_testct GRRegs:$src))]>;
+
+def TESTWCT_2r : _F2R<0b110001, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "testwct $dst, res[$src]",
+ [(set GRRegs:$dst, (int_xcore_testwct GRRegs:$src))]>;
+
+def SETD_2r : _FR2R<0b000101, (outs), (ins GRRegs:$r, GRRegs:$val),
+ "setd res[$r], $val",
+ [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>;
+
+def SETPSC_2r : _FR2R<0b110000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+ "setpsc res[$src1], $src2",
+ [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>;
+
+def GETST_2r : _F2R<0b000001, (outs GRRegs:$dst), (ins GRRegs:$r),
+ "getst $dst, res[$r]",
+ [(set GRRegs:$dst, (int_xcore_getst GRRegs:$r))]>;
+
+ def INITSP_2r : _F2R<0b000100, (outs), (ins GRRegs:$src, GRRegs:$t), // machine operand order is ($src, $t); the intrinsic takes ($t, $src)
+ "init t[$t]:sp, $src",
+ [(int_xcore_initsp GRRegs:$t, GRRegs:$src)]>;
+
+ def INITPC_2r : _F2R<0b000000, (outs), (ins GRRegs:$src, GRRegs:$t), // same operand swap as INITSP_2r
+ "init t[$t]:pc, $src",
+ [(int_xcore_initpc GRRegs:$t, GRRegs:$src)]>;
+
+ def INITCP_2r : _F2R<0b000110, (outs), (ins GRRegs:$src, GRRegs:$t), // same operand swap as INITSP_2r
+ "init t[$t]:cp, $src",
+ [(int_xcore_initcp GRRegs:$t, GRRegs:$src)]>;
+
+ def INITDP_2r : _F2R<0b000010, (outs), (ins GRRegs:$src, GRRegs:$t), // same operand swap as INITSP_2r
+ "init t[$t]:dp, $src",
+ [(int_xcore_initdp GRRegs:$t, GRRegs:$src)]>;
+
+def PEEK_2r : _F2R<0b101110, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "peek $dst, res[$src]",
+ [(set GRRegs:$dst, (int_xcore_peek GRRegs:$src))]>;
+
+def ENDIN_2r : _F2R<0b100101, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "endin $dst, res[$src]",
+ [(set GRRegs:$dst, (int_xcore_endin GRRegs:$src))]>;
+
+def EEF_2r : _F2R<0b001011, (outs), (ins GRRegs:$a, GRRegs:$b),
+ "eef $a, res[$b]", []>;
+
+def EET_2r : _F2R<0b001001, (outs), (ins GRRegs:$a, GRRegs:$b),
+ "eet $a, res[$b]", []>;
+
+def TSETMR_2r : _F2RImm<0b000111, (outs), (ins i32imm:$a, GRRegs:$b),
+ "tsetmr r$a, $b", []>;
+
+// Two operand long
+
+// Register-to-register bit operations. These are selected from the
+// xcore.bitrev intrinsic and the generic bswap / ctlz nodes.
+def BITREV_l2r : _FL2R<0b0000011000, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "bitrev $dst, $src",
+ [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>;
+
+def BYTEREV_l2r : _FL2R<0b0000011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "byterev $dst, $src",
+ [(set GRRegs:$dst, (bswap GRRegs:$src))]>;
+
+def CLZ_l2r : _FL2R<0b0000111000, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "clz $dst, $src",
+ [(set GRRegs:$dst, (ctlz GRRegs:$src))]>;
+
+// Resource and processor-state accessors. Records with an empty pattern list
+// have no selection pattern attached here; the others map one-to-one onto the
+// xcore.* intrinsic named in their pattern.
+def GETD_l2r : _FL2R<0b0001111001, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "getd $dst, res[$src]", []>;
+
+def GETN_l2r : _FL2R<0b0011011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "getn $dst, res[$src]", []>;
+
+def SETC_l2r : _FL2R<0b0010111001, (outs), (ins GRRegs:$r, GRRegs:$val),
+ "setc res[$r], $val",
+ [(int_xcore_setc GRRegs:$r, GRRegs:$val)]>;
+
+def SETTW_l2r : _FLR2R<0b0010011001, (outs), (ins GRRegs:$r, GRRegs:$val),
+ "settw res[$r], $val",
+ [(int_xcore_settw GRRegs:$r, GRRegs:$val)]>;
+
+def GETPS_l2r : _FL2R<0b0001011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "get $dst, ps[$src]",
+ [(set GRRegs:$dst, (int_xcore_getps GRRegs:$src))]>;
+
+def SETPS_l2r : _FLR2R<0b0001111000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+ "set ps[$src1], $src2",
+ [(int_xcore_setps GRRegs:$src1, GRRegs:$src2)]>;
+
+// Note the operand order: the (ins) list has $src first, whereas both the
+// assembly string and the intrinsic pattern name the thread ($t) first.
+def INITLR_l2r : _FL2R<0b0001011000, (outs), (ins GRRegs:$src, GRRegs:$t),
+ "init t[$t]:lr, $src",
+ [(int_xcore_initlr GRRegs:$t, GRRegs:$src)]>;
+
+def SETCLK_l2r : _FLR2R<0b0000111001, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+ "setclk res[$src1], $src2",
+ [(int_xcore_setclk GRRegs:$src1, GRRegs:$src2)]>;
+
+def SETN_l2r : _FLR2R<0b0011011000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+ "setn res[$src1], $src2", []>;
+
+def SETRDY_l2r : _FLR2R<0b0010111000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+ "setrdy res[$src1], $src2",
+ [(int_xcore_setrdy GRRegs:$src1, GRRegs:$src2)]>;
+
+def TESTLCL_l2r : _FL2R<0b0010011000, (outs GRRegs:$dst), (ins GRRegs:$src),
+ "testlcl $dst, res[$src]", []>;
+
+// One operand short
+// NOTE(review): record suffixes in this section are inconsistently cased
+// (_1r for most, _1R for WAITEF/WAITET/TSTART/CLRPT). Renaming would affect
+// every reference to the generated opcode names, so it is only noted here.
+def MSYNC_1r : _F1R<0b000111, (outs), (ins GRRegs:$a),
+ "msync res[$a]",
+ [(int_xcore_msync GRRegs:$a)]>;
+def MJOIN_1r : _F1R<0b000101, (outs), (ins GRRegs:$a),
+ "mjoin res[$a]",
+ [(int_xcore_mjoin GRRegs:$a)]>;
+
+// Indirect branches. All four records below are flagged as branch, indirect
+// branch, terminator and barrier.
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BAU_1r : _F1R<0b001001, (outs), (ins GRRegs:$a),
+ "bau $a",
+ [(brind GRRegs:$a)]>;
+
+// Jump-table pseudos: printed as a "bru" on the index register followed, on
+// the next line, by the inline jump table operand $t.
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BR_JT : PseudoInstXCore<(outs), (ins InlineJT:$t, GRRegs:$i),
+ "bru $i\n$t",
+ [(XCoreBR_JT tjumptable:$t, GRRegs:$i)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BR_JT32 : PseudoInstXCore<(outs), (ins InlineJT32:$t, GRRegs:$i),
+ "bru $i\n$t",
+ [(XCoreBR_JT32 tjumptable:$t, GRRegs:$i)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BRU_1r : _F1R<0b001010, (outs), (ins GRRegs:$a), "bru $a", []>;
+
+// Writes to sp / dp / cp. Only the sp write is modelled as an implicit
+// definition (Defs=[SP]); all three are marked as having no side effects.
+let Defs=[SP], hasSideEffects=0 in
+def SETSP_1r : _F1R<0b001011, (outs), (ins GRRegs:$a), "set sp, $a", []>;
+
+let hasSideEffects=0 in
+def SETDP_1r : _F1R<0b001100, (outs), (ins GRRegs:$a), "set dp, $a", []>;
+
+let hasSideEffects=0 in
+def SETCP_1r : _F1R<0b001101, (outs), (ins GRRegs:$a), "set cp, $a", []>;
+
+let hasCtrlDep = 1 in
+def ECALLT_1r : _F1R<0b010011, (outs), (ins GRRegs:$a),
+ "ecallt $a",
+ []>;
+
+let hasCtrlDep = 1 in
+def ECALLF_1r : _F1R<0b010010, (outs), (ins GRRegs:$a),
+ "ecallf $a",
+ []>;
+
+let isCall=1,
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
+def BLA_1r : _F1R<0b001000, (outs), (ins GRRegs:$a),
+ "bla $a",
+ [(XCoreBranchLink GRRegs:$a)]>;
+}
+
+def SYNCR_1r : _F1R<0b100001, (outs), (ins GRRegs:$a),
+ "syncr res[$a]",
+ [(int_xcore_syncr GRRegs:$a)]>;
+
+def FREER_1r : _F1R<0b000100, (outs), (ins GRRegs:$a),
+ "freer res[$a]",
+ [(int_xcore_freer GRRegs:$a)]>;
+
+// setv / setev take their second value implicitly in r11, so the intrinsic's
+// second operand is matched against the physical register R11.
+let Uses=[R11] in {
+def SETV_1r : _F1R<0b010001, (outs), (ins GRRegs:$a),
+ "setv res[$a], r11",
+ [(int_xcore_setv GRRegs:$a, R11)]>;
+
+def SETEV_1r : _F1R<0b001111, (outs), (ins GRRegs:$a),
+ "setev res[$a], r11",
+ [(int_xcore_setev GRRegs:$a, R11)]>;
+}
+
+def DGETREG_1r : _F1R<0b001110, (outs GRRegs:$a), (ins), "dgetreg $a", []>;
+
+def EDU_1r : _F1R<0b000000, (outs), (ins GRRegs:$a), "edu res[$a]",
+ [(int_xcore_edu GRRegs:$a)]>;
+
+def EEU_1r : _F1R<0b000001, (outs), (ins GRRegs:$a),
+ "eeu res[$a]",
+ [(int_xcore_eeu GRRegs:$a)]>;
+
+def KCALL_1r : _F1R<0b010000, (outs), (ins GRRegs:$a), "kcall $a", []>;
+
+def WAITEF_1R : _F1R<0b000011, (outs), (ins GRRegs:$a), "waitef $a", []>;
+
+def WAITET_1R : _F1R<0b000010, (outs), (ins GRRegs:$a), "waitet $a", []>;
+
+def TSTART_1R : _F1R<0b000110, (outs), (ins GRRegs:$a), "start t[$a]", []>;
+
+def CLRPT_1R : _F1R<0b100000, (outs), (ins GRRegs:$a), "clrpt res[$a]",
+ [(int_xcore_clrpt GRRegs:$a)]>;
+
+// Zero operand short
+// None of these records has explicit operands; any register traffic is
+// expressed through implicit Defs / Uses lists.
+
+def CLRE_0R : _F0R<0b0000001101, (outs), (ins), "clre", [(int_xcore_clre)]>;
+
+def DCALL_0R : _F0R<0b0000011100, (outs), (ins), "dcall", []>;
+
+let Defs = [SP], Uses = [SP] in
+def DENTSP_0R : _F0R<0b0001001100, (outs), (ins), "dentsp", []>;
+
+let Defs = [SP] in
+def DRESTSP_0R : _F0R<0b0001001101, (outs), (ins), "drestsp", []>;
+
+def DRET_0R : _F0R<0b0000011110, (outs), (ins), "dret", []>;
+
+def FREET_0R : _F0R<0b0000001111, (outs), (ins), "freet", []>;
+
+// "get r11, <state>" forms: the result is delivered in r11, modelled as an
+// implicit definition. Where an intrinsic exists, the pattern sets R11 directly.
+let Defs = [R11] in {
+def GETID_0R : _F0R<0b0001001110, (outs), (ins),
+ "get r11, id",
+ [(set R11, (int_xcore_getid))]>;
+
+def GETED_0R : _F0R<0b0000111110, (outs), (ins),
+ "get r11, ed",
+ [(set R11, (int_xcore_geted))]>;
+
+def GETET_0R : _F0R<0b0000111111, (outs), (ins),
+ "get r11, et",
+ [(set R11, (int_xcore_getet))]>;
+
+def GETKEP_0R : _F0R<0b0001001111, (outs), (ins),
+ "get r11, kep", []>;
+
+def GETKSP_0R : _F0R<0b0001011100, (outs), (ins),
+ "get r11, ksp", []>;
+}
+
+let Defs = [SP] in
+def KRET_0R : _F0R<0b0000011101, (outs), (ins), "kret", []>;
+
+// Loads of et / sed / spc / ssr from the fixed stack words sp[4], sp[3],
+// sp[1] and sp[2] respectively (as spelled in the assembly strings).
+let Uses = [SP], mayLoad = 1 in {
+def LDET_0R : _F0R<0b0001011110, (outs), (ins), "ldw et, sp[4]", []>;
+
+def LDSED_0R : _F0R<0b0001011101, (outs), (ins), "ldw sed, sp[3]", []>;
+
+def LDSPC_0R : _F0R<0b0000101100, (outs), (ins), "ldw spc, sp[1]", []>;
+
+def LDSSR_0R : _F0R<0b0000101110, (outs), (ins), "ldw ssr, sp[2]", []>;
+}
+
+let Uses=[R11] in
+def SETKEP_0R : _F0R<0b0000011111, (outs), (ins), "set kep, r11", []>;
+
+def SSYNC_0r : _F0R<0b0000001110, (outs), (ins),
+ "ssync",
+ [(int_xcore_ssync)]>;
+
+// Stores mirroring the loads above: the same four names and stack words.
+let Uses = [SP], mayStore = 1 in {
+def STET_0R : _F0R<0b0000111101, (outs), (ins), "stw et, sp[4]", []>;
+
+def STSED_0R : _F0R<0b0000111100, (outs), (ins), "stw sed, sp[3]", []>;
+
+def STSPC_0R : _F0R<0b0000101101, (outs), (ins), "stw spc, sp[1]", []>;
+
+def STSSR_0R : _F0R<0b0000101111, (outs), (ins), "stw ssr, sp[2]", []>;
+}
+
+// waiteu is modelled as an indirect-branch terminator with side effects; it
+// is selected for an indirect branch whose target is the xcore.waitevent
+// intrinsic.
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
+ hasSideEffects = 1 in
+def WAITEU_0R : _F0R<0b0000001100, (outs), (ins),
+ "waiteu",
+ [(brind (int_xcore_waitevent))]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Calls to external symbols use the BLRF_lu10 form.
+def : Pat<(XCoreBranchLink texternalsym:$addr), (BLRF_lu10 texternalsym:$addr)>;
+
+/// sext_inreg
+// The immediate passed to SEXT_rus is the width in bits of the value being
+// sign extended.
+def : Pat<(sext_inreg GRRegs:$b, i1), (SEXT_rus GRRegs:$b, 1)>;
+def : Pat<(sext_inreg GRRegs:$b, i8), (SEXT_rus GRRegs:$b, 8)>;
+def : Pat<(sext_inreg GRRegs:$b, i16), (SEXT_rus GRRegs:$b, 16)>;
+
+/// loads
+// Each access size matches its own address node: plain add for bytes, lda16f
+// for 16-bit and ldawf for word accesses (presumably these scale the offset
+// by the access size - confirm against their definitions). When the address
+// is a bare register, a zero offset is materialised with LDC_ru6 for the
+// three-register forms, or passed as the immediate 0 for LDW_2rus.
+def : Pat<(zextloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+ (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(zextloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(sextloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+ (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(sextloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(load (ldawf GRRegs:$addr, GRRegs:$offset)),
+ (LDW_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(load (add GRRegs:$addr, immUs4:$offset)),
+ (LDW_2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(load GRRegs:$addr), (LDW_2rus GRRegs:$addr, 0)>;
+
+/// any-extending loads
+// The upper bits are unspecified, so these reuse the zero-extending byte load
+// and the sign-extending 16-bit load.
+def : Pat<(extloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+ (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+def : Pat<(extloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+ (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+/// stores
+// Same addressing scheme as the loads above.
+def : Pat<(truncstorei8 GRRegs:$val, (add GRRegs:$addr, GRRegs:$offset)),
+ (ST8_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei8 GRRegs:$val, GRRegs:$addr),
+ (ST8_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(truncstorei16 GRRegs:$val, (lda16f GRRegs:$addr, GRRegs:$offset)),
+ (ST16_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei16 GRRegs:$val, GRRegs:$addr),
+ (ST16_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(store GRRegs:$val, (ldawf GRRegs:$addr, GRRegs:$offset)),
+ (STW_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(store GRRegs:$val, (add GRRegs:$addr, immUs4:$offset)),
+ (STW_2rus GRRegs:$val, GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(store GRRegs:$val, GRRegs:$addr),
+ (STW_2rus GRRegs:$val, GRRegs:$addr, 0)>;
+
+/// cttz
+// Count trailing zeros as count-leading-zeros of the bit-reversed value.
+def : Pat<(cttz GRRegs:$src), (CLZ_l2r (BITREV_l2r GRRegs:$src))>;
+
+/// trap
+// Lowered to ecallf on a register holding the constant 0.
+def : Pat<(trap), (ECALLF_1r (LDC_ru6 0))>;
+
+///
+/// branch patterns
+///
+
+// unconditional branch
+def : Pat<(br bb:$addr), (BRFU_lu6 bb:$addr)>;
+
+// direct match equal/notequal zero brcond
+// BRFT is used for "branch if the register is non-zero" and BRFF for "branch
+// if it is zero".
+def : Pat<(brcond (setne GRRegs:$lhs, 0), bb:$dst),
+ (BRFT_lru6 GRRegs:$lhs, bb:$dst)>;
+def : Pat<(brcond (seteq GRRegs:$lhs, 0), bb:$dst),
+ (BRFF_lru6 GRRegs:$lhs, bb:$dst)>;
+
+// Conditions with no direct compare are branched on through the inverse
+// compare plus BRFF: lhs <= rhs is !(rhs < lhs), lhs >= rhs is !(lhs < rhs)
+// and lhs != rhs is !(lhs == rhs).
+def : Pat<(brcond (setle GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (LSS_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setule GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (LSU_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (LSS_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setuge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (LSU_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (EQ_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, immUs:$rhs), bb:$dst),
+ (BRFF_lru6 (EQ_2rus GRRegs:$lhs, immUs:$rhs), bb:$dst)>;
+
+// generic brcond pattern
+def : Pat<(brcond GRRegs:$cond, bb:$addr), (BRFT_lru6 GRRegs:$cond, bb:$addr)>;
+
+
+///
+/// Select patterns
+///
+
+// direct match equal/notequal zero select
+// SELECT_CC cond, A, B yields A when cond is non-zero. The equal-to-zero form
+// is therefore expressed by swapping the two value operands.
+def : Pat<(select (setne GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC GRRegs:$lhs, GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (seteq GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC GRRegs:$lhs, GRRegs:$F, GRRegs:$T)>;
+
+// As with the branch patterns, <=, >= and != are computed through the inverse
+// compare (<, <, ==); the T/F operands are swapped to compensate.
+def : Pat<(select (setle GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (LSS_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setule GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (LSU_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (LSS_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setuge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (LSU_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (EQ_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, immUs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (EQ_2rus GRRegs:$lhs, immUs:$rhs), GRRegs:$F, GRRegs:$T)>;
+
+///
+/// setcc patterns, only matched when none of the above brcond or select
+/// patterns match
+///
+
+// setcc 2 register operands
+// Only "less than" (LSS signed / LSU unsigned) and "equal" (EQ) compares are
+// used. Greater-than swaps the operands; <=, >= and != compute the inverse
+// condition and then negate it with "EQ_2rus x, 0".
+def : Pat<(setle GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (LSS_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+def : Pat<(setule GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (LSU_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+
+def : Pat<(setgt GRRegs:$lhs, GRRegs:$rhs),
+ (LSS_3r GRRegs:$rhs, GRRegs:$lhs)>;
+def : Pat<(setugt GRRegs:$lhs, GRRegs:$rhs),
+ (LSU_3r GRRegs:$rhs, GRRegs:$lhs)>;
+
+def : Pat<(setge GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (LSS_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+def : Pat<(setuge GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (LSU_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(setlt GRRegs:$lhs, GRRegs:$rhs),
+ (LSS_3r GRRegs:$lhs, GRRegs:$rhs)>;
+def : Pat<(setult GRRegs:$lhs, GRRegs:$rhs),
+ (LSU_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+def : Pat<(setne GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (EQ_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(seteq GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+// setcc reg/imm operands
+def : Pat<(seteq GRRegs:$lhs, immUs:$rhs),
+ (EQ_2rus GRRegs:$lhs, immUs:$rhs)>;
+def : Pat<(setne GRRegs:$lhs, immUs:$rhs),
+ (EQ_2rus (EQ_2rus GRRegs:$lhs, immUs:$rhs), 0)>;
+
+// misc
+// Adding or subtracting an immUs4 constant is done with the word-address
+// instructions LDAWF / LDAWB; div4_xform converts the matched immediate into
+// the operand they take (presumably a word count - confirm against the
+// transform's definition).
+def : Pat<(add GRRegs:$addr, immUs4:$offset),
+ (LDAWF_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(sub GRRegs:$addr, immUs4:$offset),
+ (LDAWB_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+// AND with an immMskBitp mask becomes a zero extension; msksize_xform turns
+// the mask into the operand ZEXT_rus expects.
+def : Pat<(and GRRegs:$val, immMskBitp:$mask),
+ (ZEXT_rus GRRegs:$val, (msksize_xform immMskBitp:$mask))>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
+def : Pat<(add GRRegs:$src1, immUsNeg:$src2),
+ (SUB_2rus GRRegs:$src1, (neg_xform immUsNeg:$src2))>;
+
+def : Pat<(add GRRegs:$src1, immUs4Neg:$src2),
+ (LDAWB_l2rus GRRegs:$src1, (div4neg_xform immUs4Neg:$src2))>;
+
+///
+/// Some peepholes
+///
+
+// Multiplications by 3, 5 and -3 are done with the address-computation
+// instructions, passing $src as both base and offset. This relies on LDA16F
+// computing base + 2*offset, LDAWF base + 4*offset and LDAWB base - 4*offset.
+def : Pat<(mul GRRegs:$src, 3),
+ (LDA16F_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, 5),
+ (LDAWF_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, -3),
+ (LDAWB_l3r GRRegs:$src, GRRegs:$src)>;
+
+// ashr X, 32 is equivalent to ashr X, 31 on the XCore.
+def : Pat<(sra GRRegs:$src, 31),
+ (ASHR_l2rus GRRegs:$src, 32)>;
+
+// Sign tests against zero: "ashr X, 32" is used as the condition, being
+// non-zero exactly when X is negative. BRFT / the unswapped SELECT_CC handle
+// X < 0; BRFF / the swapped SELECT_CC / EQ ..., 0 handle X >= 0.
+def : Pat<(brcond (setlt GRRegs:$lhs, 0), bb:$dst),
+ (BRFT_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+// setge X, 0 is canonicalized to setgt X, -1
+def : Pat<(brcond (setgt GRRegs:$lhs, -1), bb:$dst),
+ (BRFF_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+def : Pat<(select (setlt GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (setgt GRRegs:$lhs, -1), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(setgt GRRegs:$lhs, -1),
+ (EQ_2rus (ASHR_l2rus GRRegs:$lhs, 32), 0)>;
+
+// A left shift followed by an arithmetic right shift by the same amount is a
+// sign extension; bpwsub_xform converts the shift amount into the SEXT_rus
+// operand (presumably bits-per-word minus the amount - confirm).
+def : Pat<(sra (shl GRRegs:$src, immBpwSubBitp:$imm), immBpwSubBitp:$imm),
+ (SEXT_rus GRRegs:$src, (bpwsub_xform immBpwSubBitp:$imm))>;
+
+// Constant pool: a load of a constant-pool entry uses LDWCP directly, while
+// taking the entry's address uses LDAWCP.
+def : Pat<(load (cprelwrapper tconstpool:$b)),
+ (LDWCP_lru6 tconstpool:$b)>;
+
+def : Pat<(cprelwrapper tconstpool:$b),
+ (LDAWCP_lu6 tconstpool:$b)>;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
new file mode 100644
index 000000000000..5cc51cd7a992
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -0,0 +1,234 @@
+//===-- XCoreLowerThreadLocal - Lower thread local variables --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains a pass that lowers thread local variables on the
+/// XCore.
+///
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "xcore-lower-thread-local"
+
+using namespace llvm;
+
+// Hidden, optional command-line option (-xcore-max-threads, default 8). Its
+// value is used below as the element count of the array that replaces each
+// thread-local variable and of that array's initializer.
+static cl::opt<unsigned> MaxThreads(
+ "xcore-max-threads", cl::Optional,
+ cl::desc("Maximum number of threads (for emulation thread-local storage)"),
+ cl::Hidden, cl::value_desc("number"), cl::init(8));
+
+namespace {
+ /// Lowers thread local variables on the XCore. Each thread local variable is
+ /// expanded to an array of n elements indexed by the thread ID, where n is
+ /// the value of the -xcore-max-threads option (8 by default).
+ struct XCoreLowerThreadLocal : public ModulePass {
+ static char ID;
+
+ XCoreLowerThreadLocal() : ModulePass(ID) {
+ initializeXCoreLowerThreadLocalPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// Replace one thread-local global by its per-thread array. Returns true
+ /// if the global was lowered, false if it was left untouched.
+ bool lowerGlobal(GlobalVariable *GV);
+
+ /// Lower every thread-local global in \p M; returns true if any changed.
+ bool runOnModule(Module &M) override;
+ };
+}
+
+char XCoreLowerThreadLocal::ID = 0;
+
+INITIALIZE_PASS(XCoreLowerThreadLocal, "xcore-lower-thread-local",
+ "Lower thread local variables", false, false)
+
+/// Factory for the thread-local lowering pass. Returns a newly allocated pass
+/// object; the caller receives the raw pointer.
+ModulePass *llvm::createXCoreLowerThreadLocalPass() {
+ return new XCoreLowerThreadLocal();
+}
+
+/// Build the type that replaces a thread-local of type \p OriginalType: an
+/// array of MaxThreads elements of that type.
+static ArrayType *createLoweredType(Type *OriginalType) {
+  const unsigned NumCopies = MaxThreads;
+  return ArrayType::get(OriginalType, NumCopies);
+}
+
+/// Build the initializer for the lowered array: MaxThreads copies of
+/// \p OriginalInitializer, typed as \p NewType.
+static Constant *
+createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
+  // Every per-thread slot starts out with the original initial value.
+  SmallVector<Constant *, 8> PerThreadInit(MaxThreads, OriginalInitializer);
+  return ConstantArray::get(NewType, PerThreadInit);
+}
+
+/// Materialize the constant expression \p CE as an equivalent instruction
+/// inserted immediately before \p Instr, and return that instruction.
+///
+/// Supported are GEPs, the binary arithmetic/logic operators and the cast
+/// operators; any other opcode hits llvm_unreachable. The builder uses
+/// NoFolder so the operation is emitted as an instruction rather than being
+/// folded straight back into a constant expression.
+static Instruction *
+createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
+  IRBuilder<NoFolder> Builder(Instr);
+  unsigned OpCode = CE->getOpcode();
+  switch (OpCode) {
+    case Instruction::GetElementPtr: {
+      SmallVector<Value *,4> CEOpVec(CE->op_begin(), CE->op_end());
+      ArrayRef<Value *> CEOps(CEOpVec);
+      GEPOperator *GEP = cast<GEPOperator>(CE);
+      Type *SrcElemTy = GEP->getSourceElementType();
+      // Carry the inbounds flag over from the constant expression instead of
+      // asserting it unconditionally: marking a GEP inbounds that was not
+      // inbounds before would change the meaning of the program.
+      Value *NewGEP =
+          GEP->isInBounds()
+              ? Builder.CreateInBoundsGEP(SrcElemTy, CEOps[0], CEOps.slice(1))
+              : Builder.CreateGEP(SrcElemTy, CEOps[0], CEOps.slice(1));
+      return dyn_cast<Instruction>(NewGEP);
+    }
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      return dyn_cast<Instruction>(
+                  Builder.CreateBinOp((Instruction::BinaryOps)OpCode,
+                                      CE->getOperand(0), CE->getOperand(1),
+                                      CE->getName()));
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+    case Instruction::AddrSpaceCast:
+      return dyn_cast<Instruction>(
+                  Builder.CreateCast((Instruction::CastOps)OpCode,
+                                     CE->getOperand(0), CE->getType(),
+                                     CE->getName()));
+    default:
+      llvm_unreachable("Unhandled constant expression!\n");
+  }
+}
+
+/// Rewrite every use of the constant expression \p CE so that it is computed
+/// by an instruction instead, then destroy \p CE.
+///
+/// Users that are themselves constant expressions are handled by recursion.
+/// Returns false as soon as a user is found that is neither an instruction
+/// nor a constant expression (or whose own rewrite fails); \p CE is not
+/// destroyed in that case. \p P is only forwarded to the recursive call.
+static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
+ do {
+ // Work on a sorted, de-duplicated snapshot of the users. Weak handles are
+ // used so that entries whose value goes away while earlier users are
+ // processed become null and are skipped by the test below.
+ SmallVector<WeakVH,8> WUsers(CE->user_begin(), CE->user_end());
+ std::sort(WUsers.begin(), WUsers.end());
+ WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
+ while (!WUsers.empty())
+ if (WeakVH WU = WUsers.pop_back_val()) {
+ if (PHINode *PN = dyn_cast<PHINode>(WU)) {
+ // For a PHI the replacement goes at the end of the incoming block
+ // rather than next to the PHI itself.
+ for (int I = 0, E = PN->getNumIncomingValues(); I < E; ++I)
+ if (PN->getIncomingValue(I) == CE) {
+ BasicBlock *PredBB = PN->getIncomingBlock(I);
+ // If the predecessor has several successors, split the edge and
+ // insert into the block SplitEdge returns instead.
+ if (PredBB->getTerminator()->getNumSuccessors() > 1)
+ PredBB = SplitEdge(PredBB, PN->getParent());
+ Instruction *InsertPos = PredBB->getTerminator();
+ Instruction *NewInst = createReplacementInstr(CE, InsertPos);
+ PN->setOperand(I, NewInst);
+ }
+ } else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
+ // Ordinary instruction: compute the value right before it.
+ Instruction *NewInst = createReplacementInstr(CE, Instr);
+ Instr->replaceUsesOfWith(CE, NewInst);
+ } else {
+ // Anything else must be another constant expression built on CE.
+ ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
+ if (!CExpr || !replaceConstantExprOp(CExpr, P))
+ return false;
+ }
+ }
+ } while (CE->hasNUsesOrMore(1)); // We need to check because a recursive
+ // sibling may have used 'CE' when createReplacementInstr was called.
+ CE->destroyConstant();
+ return true;
+}
+
+/// Turn every non-instruction user of \p GV into instructions. Each such user
+/// must be a constant expression that replaceConstantExprOp() can rewrite;
+/// otherwise false is returned. \p P is passed through unchanged.
+static bool rewriteNonInstructionUses(GlobalVariable *GV, Pass *P) {
+  // Gather the users to rewrite behind weak handles before touching any.
+  SmallVector<WeakVH,8> Pending;
+  for (User *U : GV->users()) {
+    if (isa<Instruction>(U))
+      continue;
+    Pending.push_back(WeakVH(U));
+  }
+
+  while (!Pending.empty()) {
+    WeakVH Handle = Pending.pop_back_val();
+    // Skip handles that no longer refer to a value.
+    if (!Handle)
+      continue;
+    ConstantExpr *CE = dyn_cast<ConstantExpr>(Handle);
+    if (!CE)
+      return false;
+    if (!replaceConstantExprOp(CE, P))
+      return false;
+  }
+  return true;
+}
+
+/// Returns true iff \p Ty is an array type with no elements.
+static bool isZeroLengthArray(Type *Ty) {
+  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty))
+    return ArrTy->getNumElements() == 0;
+  return false;
+}
+
+/// Replace the thread-local global \p GV by a non-thread-local array holding
+/// MaxThreads copies of it, and redirect every use to the element selected by
+/// the xcore.getid intrinsic.
+///
+/// Returns true if \p GV was lowered (and erased). Returns false if it is not
+/// thread local, if its value type cannot be lowered, or if one of its
+/// non-instruction users could not be rewritten.
+bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
+  Module *M = GV->getParent();
+  if (!GV->isThreadLocal())
+    return false;
+
+  // Skip globals that we can't lower and leave it for the backend to error.
+  // These checks must look at the value type: GV->getType() is the pointer
+  // type of the global, which is always sized and never an array. They are
+  // done before rewriting any use so that a skipped global is left untouched.
+  Type *ValueTy = GV->getValueType();
+  if (!ValueTy->isSized() || isZeroLengthArray(ValueTy))
+    return false;
+  if (!rewriteNonInstructionUses(GV, this))
+    return false;
+
+  // Create replacement global.
+  ArrayType *NewType = createLoweredType(ValueTy);
+  Constant *NewInitializer = nullptr;
+  if (GV->hasInitializer())
+    NewInitializer = createLoweredInitializer(NewType,
+                                              GV->getInitializer());
+  GlobalVariable *NewGV =
+    new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(),
+                       NewInitializer, "", nullptr,
+                       GlobalVariable::NotThreadLocal,
+                       GV->getType()->getAddressSpace(),
+                       GV->isExternallyInitialized());
+
+  // Update uses. Iterate over a snapshot because replacing a use edits GV's
+  // use list. After the rewrite above every remaining user is an instruction.
+  SmallVector<User *, 16> Users(GV->user_begin(), GV->user_end());
+  for (User *U : Users) {
+    Instruction *Inst = cast<Instruction>(U);
+    IRBuilder<> Builder(Inst);
+    Function *GetID = Intrinsic::getDeclaration(M, Intrinsic::xcore_getid);
+    // Address of this thread's slot: &NewGV[0][getid()].
+    Value *ThreadID = Builder.CreateCall(GetID, {});
+    Value *Addr = Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV,
+                                            {Builder.getInt64(0), ThreadID});
+    U->replaceUsesOfWith(GV, Addr);
+  }
+
+  // Remove old global.
+  NewGV->takeName(GV);
+  GV->eraseFromParent();
+  return true;
+}
+
+/// Lower all thread-local globals of \p M. Returns true if at least one
+/// global was lowered.
+bool XCoreLowerThreadLocal::runOnModule(Module &M) {
+  // Collect the candidates up front; lowerGlobal() adds and erases globals,
+  // so the module's global list must not be walked while lowering.
+  SmallVector<GlobalVariable *, 16> Worklist;
+  for (GlobalVariable &G : M.globals()) {
+    if (!G.isThreadLocal())
+      continue;
+    Worklist.push_back(&G);
+  }
+
+  bool Changed = false;
+  for (GlobalVariable *G : Worklist)
+    Changed |= lowerGlobal(G);
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
new file mode 100644
index 000000000000..7763ccc8f4af
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -0,0 +1,114 @@
+//===-- XCoreMCInstLower.cpp - Convert XCore MachineInstr to MCInst -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains code to lower XCore MachineInstrs to their
+/// corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+#include "XCoreMCInstLower.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+/// Bind the lowering helper to \p asmprinter. The MCContext is supplied later
+/// through Initialize(); until then Ctx is null rather than indeterminate, so
+/// use before initialization fails predictably.
+XCoreMCInstLower::XCoreMCInstLower(class AsmPrinter &asmprinter)
+    : Ctx(nullptr), Printer(asmprinter) {}
+
+/// Record the MCContext used for the symbol and constant expressions that
+/// LowerSymbolOperand() creates. Must be called before lowering symbols.
+void XCoreMCInstLower::Initialize(MCContext *C) { Ctx = C; }
+
+/// Lower a symbolic machine operand (basic block, global, block address,
+/// external symbol, jump table or constant pool entry) to an MC expression
+/// operand of the form "symbol" or "symbol + offset".
+///
+/// \p Offset is an extra displacement supplied by the caller. The operand's
+/// own offset is added to it for every kind except basic blocks and jump
+/// table indices.
+MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                               MachineOperandType MOTy,
+                                               unsigned Offset) const {
+  const MCSymbol *Sym = nullptr;
+  bool AddOperandOffset = true;
+
+  switch (MOTy) {
+  case MachineOperand::MO_MachineBasicBlock:
+    Sym = MO.getMBB()->getSymbol();
+    AddOperandOffset = false;
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    Sym = Printer.getSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_BlockAddress:
+    Sym = Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    Sym = Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    Sym = Printer.GetJTISymbol(MO.getIndex());
+    AddOperandOffset = false;
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    Sym = Printer.GetCPISymbol(MO.getIndex());
+    break;
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+
+  if (AddOperandOffset)
+    Offset += MO.getOffset();
+
+  const MCSymbolRefExpr *SymRef =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, *Ctx);
+
+  // No displacement: the bare symbol reference is the whole operand.
+  if (Offset == 0)
+    return MCOperand::createExpr(SymRef);
+
+  // Assume offset is never negative.
+  assert(Offset > 0);
+
+  const MCConstantExpr *Displacement = MCConstantExpr::create(Offset, *Ctx);
+  return MCOperand::createExpr(
+      MCBinaryExpr::createAdd(SymRef, Displacement, *Ctx));
+}
+
+/// Lower one machine operand to an MCOperand.
+///
+/// Explicit registers and immediates map directly (with \p offset added to
+/// immediates); symbolic operands go through LowerSymbolOperand(). Implicit
+/// register operands and register masks yield a default-constructed
+/// (invalid) MCOperand, which the caller is expected to drop.
+MCOperand XCoreMCInstLower::LowerOperand(const MachineOperand &MO,
+                                         unsigned offset) const {
+  const MachineOperandType Kind = MO.getType();
+
+  switch (Kind) {
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit())
+      return MCOperand();
+    return MCOperand::createReg(MO.getReg());
+
+  case MachineOperand::MO_Immediate:
+    return MCOperand::createImm(MO.getImm() + offset);
+
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_BlockAddress:
+    return LowerSymbolOperand(MO, Kind, offset);
+
+  case MachineOperand::MO_RegisterMask:
+    return MCOperand();
+
+  default:
+    llvm_unreachable("unknown operand type");
+  }
+}
+
+/// Fill \p OutMI from \p MI: copy the opcode, then append every operand that
+/// lowers to a valid MCOperand, in the original operand order.
+void XCoreMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (const MachineOperand &Operand : MI->operands()) {
+    const MCOperand Lowered = LowerOperand(Operand);
+    // Operands that lower to nothing come back invalid; leave them out.
+    if (!Lowered.isValid())
+      continue;
+    OutMI.addOperand(Lowered);
+  }
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
new file mode 100644
index 000000000000..8fb1593cc6e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
@@ -0,0 +1,41 @@
+//===-- XCoreMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREMCINSTLOWER_H
+#define LLVM_LIB_TARGET_XCORE_XCOREMCINSTLOWER_H
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+ class MCContext;
+ class MCInst;
+ class MCOperand;
+ class MachineInstr;
+ class MachineFunction;
+ class Mangler;
+ class AsmPrinter;
+
+/// \brief This class is used to lower a MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY XCoreMCInstLower {
+ typedef MachineOperand::MachineOperandType MachineOperandType;
+ // Context used to create MC expressions; set by Initialize().
+ MCContext *Ctx;
+ // Printer that supplies the symbols for symbolic operands.
+ AsmPrinter &Printer;
+public:
+ XCoreMCInstLower(class AsmPrinter &asmprinter);
+ /// Set the MCContext; call before lowering any symbolic operand.
+ void Initialize(MCContext *C);
+ /// Lower the opcode and operands of \p MI into \p OutMI.
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+ /// Lower a single operand; \p offset is added to immediates and symbol
+ /// displacements.
+ MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
+
+private:
+ /// Lower a symbolic operand of kind \p MOTy to a symbol (+ offset) expression.
+ MCOperand LowerSymbolOperand(const MachineOperand &MO,
+ MachineOperandType MOTy, unsigned Offset) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..e91536ca1e83
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
@@ -0,0 +1,72 @@
+//===-- XCoreMachineFunctionInfo.cpp - XCore machine function info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreInstrInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+// Out-of-line definition of the class's virtual anchor() method; it does
+// nothing (presumably present only to give the class a key function in this
+// file - confirm).
+void XCoreFunctionInfo::anchor() { }
+
+/// Returns true if the estimated stack size of \p MF exceeds 0xf000 bytes.
+/// The estimate is computed on the first call and cached afterwards.
+bool XCoreFunctionInfo::isLargeFrame(const MachineFunction &MF) const {
+  // isLargeFrame() is used when deciding if spill slots should be added to
+  // allow eliminateFrameIndex() to scavenge registers.
+  // This is only required when there is no FP and offsets are greater than
+  // ~256KB (~64Kwords). Thus only for code run on the emulator!
+  //
+  // The arbitrary value of 0xf000 allows frames of up to ~240KB before spill
+  // slots are added for the use of eliminateFrameIndex() register scavenging.
+  // For frames less than 240KB, it is assumed that there will be less than
+  // 16KB of function arguments.
+  const int LargeFrameThreshold = 0xf000;
+
+  // -1 marks "not estimated yet".
+  const bool NeedsEstimate = (CachedEStackSize == -1);
+  if (NeedsEstimate)
+    CachedEStackSize = MF.getFrameInfo().estimateStackSize(MF);
+
+  return CachedEStackSize > LargeFrameThreshold;
+}
+
+/// Create the spill slot for the link register on first use and return its
+/// frame index; later calls return the same index.
+int XCoreFunctionInfo::createLRSpillSlot(MachineFunction &MF) {
+  if (!LRSpillSlotSet) {
+    MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+    const TargetRegisterClass *GRClass = &XCore::GRRegsRegClass;
+    if (MF.getFunction()->isVarArg()) {
+      LRSpillSlot = FrameInfo.CreateStackObject(GRClass->getSize(),
+                                                GRClass->getAlignment(), true);
+    } else {
+      // A fixed offset of 0 allows us to save / restore LR using entsp / retsp.
+      LRSpillSlot = FrameInfo.CreateFixedObject(GRClass->getSize(), 0, true);
+    }
+    LRSpillSlotSet = true;
+  }
+  return LRSpillSlot;
+}
+
+/// Create the spill slot for the frame pointer on first use and return its
+/// frame index; later calls return the same index.
+int XCoreFunctionInfo::createFPSpillSlot(MachineFunction &MF) {
+  if (!FPSpillSlotSet) {
+    MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+    const TargetRegisterClass *GRClass = &XCore::GRRegsRegClass;
+    FPSpillSlot = FrameInfo.CreateStackObject(GRClass->getSize(),
+                                              GRClass->getAlignment(), true);
+    FPSpillSlotSet = true;
+  }
+  return FPSpillSlot;
+}
+
+/// Create the two exception-handling spill slots on first use and return a
+/// pointer to the two-element array of their frame indices; later calls
+/// return the same array.
+const int* XCoreFunctionInfo::createEHSpillSlot(MachineFunction &MF) {
+  if (!EHSpillSlotSet) {
+    MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+    const TargetRegisterClass *GRClass = &XCore::GRRegsRegClass;
+    // Slots are created in index order: element 0 first, then element 1.
+    for (int &Slot : EHSpillSlot)
+      Slot = FrameInfo.CreateStackObject(GRClass->getSize(),
+                                         GRClass->getAlignment(), true);
+    EHSpillSlotSet = true;
+  }
+  return EHSpillSlot;
+}
+
+
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
new file mode 100644
index 000000000000..cdcc52fdc32d
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -0,0 +1,106 @@
+//===- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares XCore-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include <vector>
+
+namespace llvm {
+
+// Forward declarations
+class Function;
+
+/// XCoreFunctionInfo - This class is derived from MachineFunctionInfo and
+/// holds private XCore target-specific information for each MachineFunction.
+///
+/// Every "slot" value is paired with a *Set flag. The getters assert that the
+/// flag is set, and the create*SpillSlot() methods allocate the slot on first
+/// use. All members have in-class initializers so that nothing is ever left
+/// indeterminate, even when assertions are compiled out.
+class XCoreFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+  bool LRSpillSlotSet = false;
+  int LRSpillSlot = 0;
+  bool FPSpillSlotSet = false;
+  int FPSpillSlot = 0;
+  bool EHSpillSlotSet = false;
+  int EHSpillSlot[2] = {0, 0};
+  unsigned ReturnStackOffset = 0;
+  bool ReturnStackOffsetSet = false;
+  int VarArgsFrameIndex = 0;
+  // Cached result of estimateStackSize(); -1 means "not computed yet".
+  mutable int CachedEStackSize = -1;
+  std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>>
+  SpillLabels;
+
+public:
+  XCoreFunctionInfo() = default;
+
+  // The MachineFunction argument is accepted for interface compatibility but
+  // not used; the state starts out the same as with the default constructor.
+  explicit XCoreFunctionInfo(MachineFunction &MF) {}
+
+  ~XCoreFunctionInfo() {}
+
+  void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; }
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+
+  /// Create (once) and return the frame index of the LR spill slot.
+  int createLRSpillSlot(MachineFunction &MF);
+  bool hasLRSpillSlot() const { return LRSpillSlotSet; }
+  int getLRSpillSlot() const {
+    assert(LRSpillSlotSet && "LR Spill slot not set");
+    return LRSpillSlot;
+  }
+
+  /// Create (once) and return the frame index of the FP spill slot.
+  int createFPSpillSlot(MachineFunction &MF);
+  bool hasFPSpillSlot() const { return FPSpillSlotSet; }
+  int getFPSpillSlot() const {
+    assert(FPSpillSlotSet && "FP Spill slot not set");
+    return FPSpillSlot;
+  }
+
+  /// Create (once) the two EH spill slots and return the array of their two
+  /// frame indices.
+  const int* createEHSpillSlot(MachineFunction &MF);
+  bool hasEHSpillSlot() const { return EHSpillSlotSet; }
+  const int* getEHSpillSlot() const {
+    assert(EHSpillSlotSet && "EH Spill slot not set");
+    return EHSpillSlot;
+  }
+
+  /// Record the return stack offset; may only be called once.
+  void setReturnStackOffset(unsigned value) {
+    assert(!ReturnStackOffsetSet && "Return stack offset set twice");
+    ReturnStackOffset = value;
+    ReturnStackOffsetSet = true;
+  }
+
+  unsigned getReturnStackOffset() const {
+    assert(ReturnStackOffsetSet && "Return stack offset not set");
+    return ReturnStackOffset;
+  }
+
+  /// True if the (cached) estimated stack size exceeds the large-frame
+  /// threshold; see the definition for the rationale.
+  bool isLargeFrame(const MachineFunction &MF) const;
+
+  std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>> &
+  getSpillLabels() {
+    return SpillLabels;
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
new file mode 100644
index 000000000000..d34e928b14f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -0,0 +1,330 @@
+//===-- XCoreRegisterInfo.cpp - XCore Register Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreRegisterInfo.h"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "XCoreGenRegisterInfo.inc"
+
+XCoreRegisterInfo::XCoreRegisterInfo()
+ : XCoreGenRegisterInfo(XCore::LR) { // NOTE(review): LR is presumably the return-address register expected by the generated base -- confirm.
+}
+
+// helper functions
+static inline bool isImmUs(unsigned val) { // True when val is in the range 0..11.
+ return val <= 11;
+}
+
+static inline bool isImmU6(unsigned val) { // True when val fits in 6 unsigned bits (0..63).
+ return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) { // True when val fits in 16 unsigned bits (0..65535).
+ return val < (1 << 16);
+}
+
+
+static void InsertFPImmInst(MachineBasicBlock::iterator II, // Emits, before II, the FrameReg + immediate Offset form of the pseudo at II.
+ const XCoreInstrInfo &TII,
+ unsigned Reg, unsigned FrameReg, int Offset ) {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+ // Map each frame-index pseudo opcode to its register + immediate instruction.
+ switch (MI.getOpcode()) {
+ case XCore::LDWFI: // Reg is the destination; the memory operand is carried over.
+ BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
+ .addReg(FrameReg)
+ .addImm(Offset)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::STWFI: // Reg is a source; its kill flag is copied from operand 0 of the pseudo.
+ BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
+ .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+ .addReg(FrameReg)
+ .addImm(Offset)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::LDAWFI: // Reg is the destination; no memory operand is attached.
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
+ .addReg(FrameReg)
+ .addImm(Offset);
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode");
+ }
+}
+
+static void InsertFPConstInst(MachineBasicBlock::iterator II, // Like InsertFPImmInst, but Offset is first loaded into a scavenged register.
+ const XCoreInstrInfo &TII,
+ unsigned Reg, unsigned FrameReg,
+ int Offset, RegScavenger *RS ) {
+ assert(RS && "requiresRegisterScavenging failed"); // A scavenger is mandatory on this path.
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+ unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0); // Scratch register that will hold Offset.
+ RS->setRegUsed(ScratchOffset);
+ TII.loadImmediate(MBB, II, ScratchOffset, Offset);
+ // Emit the three-register form; the scratch register is killed by its only use.
+ switch (MI.getOpcode()) {
+ case XCore::LDWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+ .addReg(FrameReg)
+ .addReg(ScratchOffset, RegState::Kill)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::STWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
+ .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+ .addReg(FrameReg)
+ .addReg(ScratchOffset, RegState::Kill)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::LDAWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+ .addReg(FrameReg)
+ .addReg(ScratchOffset, RegState::Kill);
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode");
+ }
+}
+
+static void InsertSPImmInst(MachineBasicBlock::iterator II, // Emits, before II, the SP-relative immediate form of the pseudo at II.
+ const XCoreInstrInfo &TII,
+ unsigned Reg, int Offset) {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+ bool isU6 = isImmU6(Offset); // Offsets below 64 select the _ru6 opcode, larger ones the _lru6 opcode.
+
+ switch (MI.getOpcode()) {
+ int NewOpcode; // Assigned in every case before it is read.
+ case XCore::LDWFI:
+ NewOpcode = (isU6) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+ BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+ .addImm(Offset)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::STWFI:
+ NewOpcode = (isU6) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+ BuildMI(MBB, II, dl, TII.get(NewOpcode))
+ .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+ .addImm(Offset)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::LDAWFI:
+ NewOpcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+ BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+ .addImm(Offset);
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode");
+ }
+}
+
+static void InsertSPConstInst(MachineBasicBlock::iterator II, // SP-relative form for offsets that fail isImmU16: base and offset both go in registers.
+ const XCoreInstrInfo &TII,
+ unsigned Reg, int Offset, RegScavenger *RS ) {
+ assert(RS && "requiresRegisterScavenging failed"); // A scavenger is mandatory on this path.
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+ unsigned OpCode = MI.getOpcode();
+ // A store still needs Reg as its source, so it gets a separate scavenged base register.
+ unsigned ScratchBase;
+ if (OpCode==XCore::STWFI) {
+ ScratchBase = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+ RS->setRegUsed(ScratchBase);
+ } else
+ ScratchBase = Reg; // Loads define Reg anyway, so it doubles as the base register.
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWSP_ru6), ScratchBase).addImm(0); // NOTE(review): presumably materializes the SP address (offset 0) -- confirm.
+ unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0); // Scratch register that will hold Offset.
+ RS->setRegUsed(ScratchOffset);
+ TII.loadImmediate(MBB, II, ScratchOffset, Offset);
+ // Emit the three-register form; both scratch operands are killed by their only use.
+ switch (OpCode) {
+ case XCore::LDWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+ .addReg(ScratchBase, RegState::Kill)
+ .addReg(ScratchOffset, RegState::Kill)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::STWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
+ .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+ .addReg(ScratchBase, RegState::Kill)
+ .addReg(ScratchOffset, RegState::Kill)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::LDAWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+ .addReg(ScratchBase, RegState::Kill)
+ .addReg(ScratchOffset, RegState::Kill);
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode");
+ }
+}
+
+bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) { // True when MF has debug info or its function needs an unwind table entry.
+ return MF.getMMI().hasDebugInfo() ||
+ MF.getFunction()->needsUnwindTableEntry();
+}
+
+const MCPhysReg *
+XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  // LR and FP never appear in these lists: emitPrologue, emitEpilogue and
+  // related functions save and restore them explicitly.
+  //
+  // Zero-terminated list used when the function has no frame pointer.
+  static const MCPhysReg SavedWithoutFP[] = {
+      XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+      XCore::R8, XCore::R9, XCore::R10, 0};
+  // Zero-terminated list used when the function has a frame pointer; R10 is
+  // left out.
+  static const MCPhysReg SavedWithFP[] = {
+      XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+      XCore::R8, XCore::R9, 0};
+
+  const bool UsesFramePointer = getFrameLowering(*MF)->hasFP(*MF);
+  return UsesFramePointer ? SavedWithFP : SavedWithoutFP;
+}
+
+BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Returns a bit vector with one bit set per reserved register.
+ BitVector Reserved(getNumRegs());
+ const XCoreFrameLowering *TFI = getFrameLowering(MF);
+ // CP, DP, SP and LR are always reserved.
+ Reserved.set(XCore::CP);
+ Reserved.set(XCore::DP);
+ Reserved.set(XCore::SP);
+ Reserved.set(XCore::LR);
+ if (TFI->hasFP(MF)) { // R10 is additionally reserved when the function has a frame pointer.
+ Reserved.set(XCore::R10);
+ }
+ return Reserved;
+}
+
+bool
+XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { // Always true; the Insert*ConstInst helpers assert that a scavenger exists.
+ return true;
+}
+
+bool
+XCoreRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { // Always true, for every function.
+ return true;
+}
+
+bool
+XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { // Always false, for every function.
+ return false;
+}
+
+void
+XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Replaces the frame-index pseudo at II with real FP- or SP-relative code.
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected"); // A non-zero SP adjustment is not handled here.
+ MachineInstr &MI = *II;
+ MachineOperand &FrameOp = MI.getOperand(FIOperandNum);
+ int FrameIndex = FrameOp.getIndex();
+
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const XCoreInstrInfo &TII =
+ *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ const XCoreFrameLowering *TFI = getFrameLowering(MF);
+ int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+ int StackSize = MF.getFrameInfo().getStackSize();
+
+ #ifndef NDEBUG
+ DEBUG(errs() << "\nFunction : "
+ << MF.getName() << "\n");
+ DEBUG(errs() << "<--------->\n");
+ DEBUG(MI.print(errs()));
+ DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n");
+ DEBUG(errs() << "FrameOffset : " << Offset << "\n");
+ DEBUG(errs() << "StackSize : " << StackSize << "\n");
+ #endif
+
+ Offset += StackSize; // Rebase the object offset by the full stack size.
+
+ unsigned FrameReg = getFrameRegister(MF);
+
+ // Special handling of DBG_VALUE instructions.
+ if (MI.isDebugValue()) { // Rewritten in place as register + immediate; nothing is erased.
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ // fold constant into offset.
+ Offset += MI.getOperand(FIOperandNum + 1).getImm();
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+
+ assert(Offset%4 == 0 && "Misaligned stack offset");
+ DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+ Offset/=4; // From here on Offset counts 4-byte words, not bytes.
+
+ unsigned Reg = MI.getOperand(0).getReg();
+ assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand");
+
+ if (TFI->hasFP(MF)) { // FP-relative: immediate form only for word offsets 0..11.
+ if (isImmUs(Offset))
+ InsertFPImmInst(II, TII, Reg, FrameReg, Offset);
+ else
+ InsertFPConstInst(II, TII, Reg, FrameReg, Offset, RS);
+ } else { // SP-relative: immediate form for word offsets below 2^16.
+ if (isImmU16(Offset))
+ InsertSPImmInst(II, TII, Reg, Offset);
+ else
+ InsertSPConstInst(II, TII, Reg, Offset, RS);
+ }
+ // Erase old instruction.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MBB.erase(II);
+}
+
+
+unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  // The frame register is R10 when the function has a frame pointer and SP
+  // otherwise.
+  if (getFrameLowering(MF)->hasFP(MF))
+    return XCore::R10;
+  return XCore::SP;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
new file mode 100644
index 000000000000..010fccd797a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
@@ -0,0 +1,55 @@
+//===-- XCoreRegisterInfo.h - XCore Register Information Impl ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREREGISTERINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCOREREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "XCoreGenRegisterInfo.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+
+struct XCoreRegisterInfo : public XCoreGenRegisterInfo { // XCore register information layered on the generated XCoreGenRegisterInfo.
+public:
+ XCoreRegisterInfo();
+
+ /// Code Generation virtual methods...
+ /// Zero-terminated callee-saved register list for MF.
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ /// Bit vector of the registers reserved in MF.
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+ /// Rewrites the instruction at II, whose operand FIOperandNum is a frame index.
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ //! Return whether to emit frame moves
+ static bool needsFrameMoves(const MachineFunction &MF);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td
new file mode 100644
index 000000000000..6694b2882aca
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td
@@ -0,0 +1,59 @@
+//===-- XCoreRegisterInfo.td - XCore Register defs ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the XCore register file
+//===----------------------------------------------------------------------===//
+
+class XCoreReg<string n> : Register<n> { // Base class for the XCore registers; n is forwarded to Register.
+ field bits<4> Num; // 4-bit register ID number.
+ let Namespace = "XCore";
+}
+
+// Registers are identified with 4-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<4> num, string n> : XCoreReg<n> { // num is the 4-bit ID, n the register name.
+ let Num = num; // Fill in the Num field declared by XCoreReg.
+}
+
+// CPU registers: each def gives the 4-bit ID, the name string and the DWARF register number.
+def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
+def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>;
+def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
+def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>;
+def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>;
+def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
+def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>;
+def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>;
+def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>;
+def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
+def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+def CP : Ri<12, "cp">, DwarfRegNum<[12]>;
+def DP : Ri<13, "dp">, DwarfRegNum<[13]>;
+def SP : Ri<14, "sp">, DwarfRegNum<[14]>;
+def LR : Ri<15, "lr">, DwarfRegNum<[15]>;
+
+// Register classes.
+// GRRegs holds R0-R11 with value type i32; CP, DP, SP and LR are not members.
+def GRRegs : RegisterClass<"XCore", [i32], 32,
+ // Return values and arguments
+ (add R0, R1, R2, R3,
+ // Callee save
+ R4, R5, R6, R7, R8, R9, R10,
+ // Not preserved across procedure calls
+ R11)>;
+
+// Reserved: all sixteen registers, including CP, DP, SP and LR; never allocatable.
+def RRegs : RegisterClass<"XCore", [i32], 32,
+ (add R0, R1, R2, R3,
+ R4, R5, R6, R7, R8, R9, R10,
+ R11, CP, DP, SP, LR)> {
+ let isAllocatable = 0;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..c03b0afceba3
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -0,0 +1,51 @@
+//===-- XCoreSelectionDAGInfo.cpp - XCore SelectionDAG Info ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-selectiondag-info"
+
+SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy( // Lowers a word-aligned memcpy to a __memcpy_4 call; otherwise returns an empty SDValue.
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ unsigned SizeBitWidth = Size.getValueSizeInBits();
+ // Call __memcpy_4 if the src, dst and size are all 4 byte aligned.
+ if (!AlwaysInline && (Align & 3) == 0 &&
+ DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) { // The low two bits of Size must be known zero.
+ const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); // All three arguments share the pointer-sized integer type.
+ Entry.Node = Dst; Args.push_back(Entry);
+ Entry.Node = Src; Args.push_back(Entry);
+ Entry.Node = Size; Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__memcpy_4",
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args))
+ .setDiscardResult();
+
+ std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+ return CallResult.second; // NOTE(review): .second is presumably the output chain of the call -- confirm.
+ }
+
+ // Otherwise have the target-independent code call memcpy.
+ return SDValue();
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
new file mode 100644
index 000000000000..7cd0d8216e91
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -0,0 +1,35 @@
+//===-- XCoreSelectionDAGInfo.h - XCore SelectionDAG Info -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the XCore subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORESELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCORESELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class XCoreTargetMachine;
+
+class XCoreSelectionDAGInfo : public SelectionDAGTargetInfo { // XCore override of the memcpy lowering hook.
+public:
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, // Op1/Op2/Op3 are named Dst/Src/Size in the definition.
+ SDValue Chain, SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
new file mode 100644
index 000000000000..99ad2c88504f
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -0,0 +1,31 @@
+//===-- XCoreSubtarget.cpp - XCore Subtarget Information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCore specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreSubtarget.h"
+#include "XCore.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "XCoreGenSubtargetInfo.inc"
+
+void XCoreSubtarget::anchor() { } // NOTE(review): empty out-of-line virtual, presumably to pin the vtable to this file -- confirm.
+
+XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU, // Forwards TT/CPU/FS to the generated base and builds the per-subtarget helper objects.
+ const std::string &FS, const TargetMachine &TM)
+ : XCoreGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
+ TLInfo(TM, *this), TSInfo() {}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
new file mode 100644
index 000000000000..f01fb6714d86
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
@@ -0,0 +1,66 @@
+//===-- XCoreSubtarget.h - Define Subtarget for the XCore -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORESUBTARGET_H
+#define LLVM_LIB_TARGET_XCORE_XCORESUBTARGET_H
+
+#include "XCoreFrameLowering.h"
+#include "XCoreISelLowering.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "XCoreGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class XCoreSubtarget : public XCoreGenSubtargetInfo { // Owns the XCore instr info, frame lowering, target lowering and SelectionDAG info objects.
+ virtual void anchor();
+ XCoreInstrInfo InstrInfo;
+ XCoreFrameLowering FrameLowering;
+ XCoreTargetLowering TLInfo;
+ XCoreSelectionDAGInfo TSInfo;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ ///
+ XCoreSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM);
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ // The accessors below return pointers to members owned by this subtarget.
+ const XCoreInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const XCoreFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const XCoreTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const XCoreSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const TargetRegisterInfo *getRegisterInfo() const override { // The register info is obtained from InstrInfo rather than stored here.
+ return &InstrInfo.getRegisterInfo();
+ }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
new file mode 100644
index 000000000000..bf3138f2164a
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -0,0 +1,99 @@
+//===-- XCoreTargetMachine.cpp - Define TargetMachine for XCore -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file implements the XCore specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetMachine.h"
+#include "XCoreTargetObjectFile.h"
+#include "XCoreTargetTransformInfo.h"
+#include "XCore.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  // Fall back to the static relocation model when the caller supplied none.
+  const bool HasExplicitModel = RM.hasValue();
+  return HasExplicitModel ? *RM : Reloc::Static;
+}
+
+/// Create an ILP32 architecture model
+/// The data layout string selects little-endian ("e"), 32-bit pointers ("p:32:32") and 32-bit native integers ("n32").
+XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(
+ T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32",
+ TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM, OL), // A missing relocation model becomes Reloc::Static.
+ TLOF(make_unique<XCoreTargetObjectFile>()),
+ Subtarget(TT, CPU, FS, *this) {
+ initAsmInfo();
+}
+
+XCoreTargetMachine::~XCoreTargetMachine() {} // Empty body, defined out of line in this file.
+
+namespace {
+/// XCore Code Generator Pass Configuration Options: overrides the hooks that add XCore-specific passes.
+class XCorePassConfig : public TargetPassConfig {
+public:
+ XCorePassConfig(XCoreTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ XCoreTargetMachine &getXCoreTargetMachine() const { // The target machine, typed as XCoreTargetMachine.
+ return getTM<XCoreTargetMachine>();
+ }
+
+ void addIRPasses() override;
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) { // Returns a heap-allocated XCorePassConfig bound to this target machine and PM.
+ return new XCorePassConfig(this, PM);
+}
+
+void XCorePassConfig::addIRPasses() { // Adds the atomic-expand pass ahead of the default IR passes.
+ addPass(createAtomicExpandPass(&getXCoreTargetMachine()));
+
+ TargetPassConfig::addIRPasses();
+}
+
+bool XCorePassConfig::addPreISel() { // Adds the XCore thread-local lowering pass; always returns false.
+ addPass(createXCoreLowerThreadLocalPass());
+ return false;
+}
+
+bool XCorePassConfig::addInstSelector() { // Adds the XCore instruction selector at the current opt level; always returns false.
+ addPass(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel()));
+ return false;
+}
+
+void XCorePassConfig::addPreEmitPass() { // Adds the frame-to-args-offset elimination pass before emission.
+ addPass(createXCoreFrameToArgsOffsetEliminationPass(), false); // NOTE(review): false presumably disables verification after this pass -- confirm.
+}
+
+// Force static initialization: registers XCoreTargetMachine for the XCore target.
+extern "C" void LLVMInitializeXCoreTarget() {
+ RegisterTargetMachine<XCoreTargetMachine> X(getTheXCoreTarget());
+}
+
+TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() { // Builds an analysis that produces an XCoreTTIImpl-backed TargetTransformInfo per function.
+ return TargetIRAnalysis([this](const Function &F) { // The callback captures this target machine.
+ return TargetTransformInfo(XCoreTTIImpl(this, F));
+ });
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
new file mode 100644
index 000000000000..4bd25bc8776c
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
@@ -0,0 +1,48 @@
+//===-- XCoreTargetMachine.h - Define TargetMachine for XCore ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
+
+#include "XCoreSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class XCoreTargetMachine : public LLVMTargetMachine { // Owns one object-file lowering and one XCoreSubtarget.
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ XCoreSubtarget Subtarget;
+public:
+ XCoreTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+ ~XCoreTargetMachine() override;
+ // Both overloads return the same single subtarget; the Function argument is ignored.
+ const XCoreSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ const XCoreSubtarget *getSubtargetImpl(const Function &) const override {
+ return &Subtarget;
+ }
+
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetLoweringObjectFile *getObjFileLowering() const override { // Non-owning pointer; TLOF keeps ownership.
+ return TLOF.get();
+ }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
new file mode 100644
index 000000000000..ad8693fd325e
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -0,0 +1,156 @@
+//===-- XCoreTargetObjectFile.cpp - XCore object files --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetObjectFile.h"
+#include "XCoreSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+
+void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){ // Runs the ELF defaults, then installs the XCore .dp.* and .cp.* sections.
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ // Writable data sections, flagged XCORE_SHF_DP_SECTION.
+ BSSSection = Ctx.getELFSection(".dp.bss", ELF::SHT_NOBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::XCORE_SHF_DP_SECTION);
+ BSSSectionLarge = Ctx.getELFSection(".dp.bss.large", ELF::SHT_NOBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::XCORE_SHF_DP_SECTION);
+ DataSection = Ctx.getELFSection(".dp.data", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::XCORE_SHF_DP_SECTION);
+ DataSectionLarge = Ctx.getELFSection(".dp.data.large", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::XCORE_SHF_DP_SECTION);
+ DataRelROSection = Ctx.getELFSection(".dp.rodata", ELF::SHT_PROGBITS, // Note: created with SHF_WRITE despite the rodata name.
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::XCORE_SHF_DP_SECTION);
+ DataRelROSectionLarge = Ctx.getELFSection(
+ ".dp.rodata.large", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::XCORE_SHF_DP_SECTION);
+ ReadOnlySection = // Read-only sections from here on, flagged XCORE_SHF_CP_SECTION.
+ Ctx.getELFSection(".cp.rodata", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::XCORE_SHF_CP_SECTION);
+ ReadOnlySectionLarge =
+ Ctx.getELFSection(".cp.rodata.large", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::XCORE_SHF_CP_SECTION);
+ MergeableConst4Section = Ctx.getELFSection(
+ ".cp.rodata.cst4", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 4, "");
+ MergeableConst8Section = Ctx.getELFSection(
+ ".cp.rodata.cst8", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 8, "");
+ MergeableConst16Section = Ctx.getELFSection(
+ ".cp.rodata.cst16", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 16, "");
+ CStringSection =
+ Ctx.getELFSection(".cp.rodata.string", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS |
+ ELF::XCORE_SHF_CP_SECTION);
+ // TextSection - see MCObjectFileInfo.cpp
+ // StaticCtorSection - see MCObjectFileInfo.cpp
+ // StaticDtorSection - see MCObjectFileInfo.cpp
+ }
+
+static unsigned getXCoreSectionType(SectionKind K) {
+  // BSS kinds map to SHT_NOBITS; every other kind maps to SHT_PROGBITS.
+  const bool IsBSS = K.isBSS();
+  return IsBSS ? ELF::SHT_NOBITS : ELF::SHT_PROGBITS;
+}
+
+static unsigned getXCoreSectionFlags(SectionKind K, bool IsCPRel) { // Builds the ELF section flags for kind K; IsCPRel picks the CP flag over the DP flag.
+ unsigned Flags = 0;
+
+ if (!K.isMetadata())
+ Flags |= ELF::SHF_ALLOC;
+ // Exactly one of EXECINSTR, CP_SECTION or DP_SECTION is set.
+ if (K.isText())
+ Flags |= ELF::SHF_EXECINSTR;
+ else if (IsCPRel)
+ Flags |= ELF::XCORE_SHF_CP_SECTION;
+ else
+ Flags |= ELF::XCORE_SHF_DP_SECTION;
+
+ if (K.isWriteable())
+ Flags |= ELF::SHF_WRITE;
+
+ if (K.isMergeableCString() || K.isMergeableConst4() ||
+ K.isMergeableConst8() || K.isMergeableConst16())
+ Flags |= ELF::SHF_MERGE;
+
+ if (K.isMergeableCString()) // Mergeable C strings get SHF_STRINGS on top of SHF_MERGE.
+ Flags |= ELF::SHF_STRINGS;
+
+ return Flags;
+}
+
+MCSection *XCoreTargetObjectFile::getExplicitSectionGlobal( // Returns the ELF section named by GO's explicit section attribute.
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ StringRef SectionName = GO->getSection();
+ // Infer section flags from the section name if we can.
+ bool IsCPRel = SectionName.startswith(".cp.");
+ if (IsCPRel && !Kind.isReadOnly()) // Only read-only kinds are accepted in a .cp. section.
+ report_fatal_error("Using .cp. section for writeable object.");
+ return getContext().getELFSection(SectionName, getXCoreSectionType(Kind),
+ getXCoreSectionFlags(Kind, IsCPRel));
+}
+
+MCSection *XCoreTargetObjectFile::SelectSectionForGlobal( // Picks a default section for GO from its kind, linkage, size and the code model.
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+
+ bool UseCPRel = GO->hasLocalLinkage(); // Only globals with local linkage are placed in the .cp.* sections.
+
+ if (Kind.isText()) return TextSection;
+ if (UseCPRel) {
+ if (Kind.isMergeable1ByteCString()) return CStringSection;
+ if (Kind.isMergeableConst4()) return MergeableConst4Section;
+ if (Kind.isMergeableConst8()) return MergeableConst8Section;
+ if (Kind.isMergeableConst16()) return MergeableConst16Section;
+ }
+ Type *ObjType = GO->getValueType();
+ auto &DL = GO->getParent()->getDataLayout();
+ if (TM.getCodeModel() == CodeModel::Small || !ObjType->isSized() || // Regular sections: small code model, unsized type, or size below the threshold.
+ DL.getTypeAllocSize(ObjType) < CodeModelLargeSize) {
+ if (Kind.isReadOnly()) return UseCPRel? ReadOnlySection
+ : DataRelROSection;
+ if (Kind.isBSS() || Kind.isCommon())return BSSSection;
+ if (Kind.isData())
+ return DataSection;
+ if (Kind.isReadOnlyWithRel()) return DataRelROSection;
+ } else { // Otherwise the same mapping onto the *.large sections.
+ if (Kind.isReadOnly()) return UseCPRel? ReadOnlySectionLarge
+ : DataRelROSectionLarge;
+ if (Kind.isBSS() || Kind.isCommon())return BSSSectionLarge;
+ if (Kind.isData())
+ return DataSectionLarge;
+ if (Kind.isReadOnlyWithRel()) return DataRelROSectionLarge;
+ }
+ // Any kind that reaches this point is rejected with a fatal error.
+ assert((Kind.isThreadLocal() || Kind.isCommon()) && "Unknown section kind");
+ report_fatal_error("Target does not support TLS or Common sections");
+}
+
+MCSection *XCoreTargetObjectFile::getSectionForConstant(const DataLayout &DL, // Mergeable 4/8/16-byte constants get their cstN section; all others go to ReadOnlySection.
+ SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const {
+ if (Kind.isMergeableConst4()) return MergeableConst4Section;
+ if (Kind.isMergeableConst8()) return MergeableConst8Section;
+ if (Kind.isMergeableConst16()) return MergeableConst16Section;
+ assert((Kind.isReadOnly() || Kind.isReadOnlyWithRel()) &&
+ "Unknown section kind");
+ // We assume the size of the object is never greater than CodeModelLargeSize.
+ // To handle CodeModelLargeSize changes to AsmPrinter would be required.
+ return ReadOnlySection;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
new file mode 100644
index 000000000000..5eb423a7435e
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -0,0 +1,40 @@
+//===-- XCoreTargetObjectFile.h - XCore Object Info -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+static const unsigned CodeModelLargeSize = 256; // Threshold for SelectSectionForGlobal: sized objects with alloc size >= this use the *Large sections unless the code model is Small.
+
+ class XCoreTargetObjectFile : public TargetLoweringObjectFileELF { // XCore section selection layered on the generic ELF lowering.
+ MCSection *BSSSectionLarge; // The four *Large members are the sections SelectSectionForGlobal
+ MCSection *DataSectionLarge; // returns when an object fails its small-object check
+ MCSection *ReadOnlySectionLarge; // (code model, sized type, alloc size vs. CodeModelLargeSize).
+ MCSection *DataRelROSectionLarge; // Private: `class` members before `public:` default to private.
+
+ public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override; // Defined out of line; presumably creates the sections above -- confirm in the .cpp.
+
+ MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
+
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, // Chooses among the regular and *Large sections.
+ const TargetMachine &TM) const override;
+
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, // Decides from Kind alone; never returns a *Large section.
+ const Constant *C,
+ unsigned &Align) const override;
+ };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h b/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h
new file mode 100644
index 000000000000..3563dbc5cb7b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h
@@ -0,0 +1,27 @@
+//===-- XCoreTargetStreamer.h - XCore Target Streamer ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETSTREAMER_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class XCoreTargetStreamer : public MCTargetStreamer { // Abstract interface for XCore-specific streamer hooks.
+public:
+ XCoreTargetStreamer(MCStreamer &S); // Declared here only; defined out of line.
+ ~XCoreTargetStreamer() override;
+ virtual void emitCCTopData(StringRef Name) = 0; // All four emitCC* hooks are pure virtual, so every
+ virtual void emitCCTopFunction(StringRef Name) = 0; // concrete streamer has to implement each one.
+ virtual void emitCCBottomData(StringRef Name) = 0; // NOTE(review): presumably Top/Bottom bracket a named data or
+ virtual void emitCCBottomFunction(StringRef Name) = 0; // function region -- confirm against an implementation.
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
new file mode 100644
index 000000000000..9617796f4861
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -0,0 +1,54 @@
+//===-- XCoreTargetTransformInfo.h - XCore specific TTI ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfo::Concept conforming object specific to the
+/// XCore target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETTRANSFORMINFO_H
+
+#include "XCore.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> { // CRTP: the base template is instantiated with this class.
+ typedef BasicTTIImplBase<XCoreTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT; // Grants the base access to the private accessors below.
+
+ const XCoreSubtarget *ST;
+ const XCoreTargetLowering *TLI; // Initialized from ST, so it must stay declared after ST.
+
+ const XCoreSubtarget *getST() const { return ST; }
+ const XCoreTargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit XCoreTTIImpl(const XCoreTargetMachine *TM, const Function &F) // TM is dereferenced below, so it must be non-null.
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
+ TLI(ST->getTargetLowering()) {}
+
+ unsigned getNumberOfRegisters(bool Vector) { // Reports 0 for vector queries and 12 otherwise.
+ if (Vector) {
+ return 0;
+ }
+ return 12; // NOTE(review): presumably the number of general-purpose registers -- confirm.
+ }
+};
+
+} // end namespace llvm
+
+#endif